summaryrefslogtreecommitdiff
path: root/scripts/const_structs.checkpatch
blob: 6eb94fddc338a211a2aac9cb6e4cd9d15c4b4a6c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
acpi_dock_ops
address_space_operations
backlight_ops
bin_attribute
block_device_operations
bus_type
clk_ops
comedi_lrange
component_ops
ctl_table
dentry_operations
dev_pm_ops
device_type
dma_map_ops
driver_info
drm_connector_funcs
drm_encoder_funcs
drm_encoder_helper_funcs
dvb_frontend_ops
dvb_tuner_ops
ethtool_ops
extent_io_ops
fb_ops
file_lock_operations
file_operations
hv_ops
hwmon_ops
ib_device_ops
ide_dma_ops
ide_port_ops
ieee80211_ops
iio_buffer_setup_ops
inode_operations
intel_dvo_dev_ops
irq_domain_ops
item_operations
iwl_cfg
iwl_ops
kernel_param_ops
kgdb_arch
kgdb_io
kobj_type
kset_uevent_ops
lcd_ops
lock_manager_operations
machine_desc
microcode_ops
mlxsw_reg_info
mtd_ooblayout_ops
mtrr_ops
nand_controller_ops
neigh_ops
net_device_ops
nft_expr_ops
nlmsvc_binding
nvkm_device_chip
of_device_id
pci_raw_ops
phy_ops
pinconf_ops
pinctrl_ops
pinmux_ops
pipe_buf_operations
platform_hibernation_ops
platform_suspend_ops
proc_ops
proto_ops
pwm_ops
reg_default
reg_field
reg_sequence
regmap_access_table
regmap_bus
regmap_config
regmap_irq
regmap_irq_chip
regmap_irq_sub_irq_map
regmap_range
regmap_range_cfg
regulator_ops
reset_control_ops
rpc_pipe_ops
rtc_class_ops
sd_desc
sdhci_ops
seq_operations
sirfsoc_padmux
snd_ac97_build_ops
snd_pcm_ops
snd_rawmidi_ops
snd_soc_component_driver
snd_soc_dai_ops
snd_soc_ops
snd_soc_tplg_ops
soc_pcmcia_socket_ops
stacktrace_ops
sysfs_ops
tty_operations
uart_ops
usb_mon_operations
v4l2_ctrl_ops
v4l2_ioctl_ops
v4l2_subdev_core_ops
v4l2_subdev_internal_ops
v4l2_subdev_ops
v4l2_subdev_pad_ops
v4l2_subdev_video_ops
vb2_ops
vm_operations_struct
wacom_features
watchdog_ops
wd_ops
xattr_handler
'mode'>-rw-r--r--kernel/Kconfig.kexec182
-rw-r--r--kernel/Kconfig.locks40
-rw-r--r--kernel/Kconfig.preempt139
-rw-r--r--kernel/Makefile244
-rw-r--r--kernel/acct.c301
-rw-r--r--kernel/async.c230
-rw-r--r--kernel/audit.c2050
-rw-r--r--kernel/audit.h209
-rw-r--r--kernel/audit_fsnotify.c194
-rw-r--r--kernel/audit_tree.c657
-rw-r--r--kernel/audit_watch.c202
-rw-r--r--kernel/auditfilter.c546
-rw-r--r--kernel/auditsc.c1932
-rw-r--r--kernel/backtracetest.c36
-rw-r--r--kernel/bounds.c15
-rw-r--r--kernel/bpf/Kconfig104
-rw-r--r--kernel/bpf/Makefile65
-rw-r--r--kernel/bpf/arena.c665
-rw-r--r--kernel/bpf/arraymap.c1407
-rw-r--r--kernel/bpf/bloom_filter.c219
-rw-r--r--kernel/bpf/bpf_cgrp_storage.c238
-rw-r--r--kernel/bpf/bpf_inode_storage.c227
-rw-r--r--kernel/bpf/bpf_insn_array.c304
-rw-r--r--kernel/bpf/bpf_iter.c827
-rw-r--r--kernel/bpf/bpf_local_storage.c825
-rw-r--r--kernel/bpf/bpf_lru_list.c695
-rw-r--r--kernel/bpf/bpf_lru_list.h80
-rw-r--r--kernel/bpf/bpf_lsm.c448
-rw-r--r--kernel/bpf/bpf_struct_ops.c1404
-rw-r--r--kernel/bpf/bpf_task_storage.c373
-rw-r--r--kernel/bpf/btf.c9579
-rw-r--r--kernel/bpf/btf_iter.c2
-rw-r--r--kernel/bpf/btf_relocate.c2
-rw-r--r--kernel/bpf/cgroup.c2761
-rw-r--r--kernel/bpf/cgroup_iter.c359
-rw-r--r--kernel/bpf/core.c3370
-rw-r--r--kernel/bpf/cpumap.c813
-rw-r--r--kernel/bpf/cpumask.c534
-rw-r--r--kernel/bpf/crypto.c393
-rw-r--r--kernel/bpf/devmap.c1170
-rw-r--r--kernel/bpf/disasm.c393
-rw-r--r--kernel/bpf/disasm.h40
-rw-r--r--kernel/bpf/dispatcher.c171
-rw-r--r--kernel/bpf/dmabuf_iter.c150
-rw-r--r--kernel/bpf/hashtab.c2559
-rw-r--r--kernel/bpf/helpers.c4619
-rw-r--r--kernel/bpf/inode.c1105
-rw-r--r--kernel/bpf/kmem_cache_iter.c238
-rw-r--r--kernel/bpf/link_iter.c106
-rw-r--r--kernel/bpf/liveness.c753
-rw-r--r--kernel/bpf/local_storage.c607
-rw-r--r--kernel/bpf/log.c865
-rw-r--r--kernel/bpf/lpm_trie.c789
-rw-r--r--kernel/bpf/map_in_map.c134
-rw-r--r--kernel/bpf/map_in_map.h19
-rw-r--r--kernel/bpf/map_iter.c229
-rw-r--r--kernel/bpf/memalloc.c1016
-rw-r--r--kernel/bpf/mmap_unlock_work.h65
-rw-r--r--kernel/bpf/mprog.c452
-rw-r--r--kernel/bpf/net_namespace.c565
-rw-r--r--kernel/bpf/offload.c878
-rw-r--r--kernel/bpf/percpu_freelist.c137
-rw-r--r--kernel/bpf/percpu_freelist.h33
-rw-r--r--kernel/bpf/preload/.gitignore2
-rw-r--r--kernel/bpf/preload/Kconfig21
-rw-r--r--kernel/bpf/preload/Makefile7
-rw-r--r--kernel/bpf/preload/bpf_preload.h16
-rw-r--r--kernel/bpf/preload/bpf_preload_kern.c94
-rw-r--r--kernel/bpf/preload/iterators/.gitignore2
-rw-r--r--kernel/bpf/preload/iterators/Makefile67
-rw-r--r--kernel/bpf/preload/iterators/README7
-rw-r--r--kernel/bpf/preload/iterators/iterators.bpf.c118
-rw-r--r--kernel/bpf/preload/iterators/iterators.lskel-big-endian.h437
-rw-r--r--kernel/bpf/preload/iterators/iterators.lskel-little-endian.h435
-rw-r--r--kernel/bpf/prog_iter.c106
-rw-r--r--kernel/bpf/queue_stack_maps.c288
-rw-r--r--kernel/bpf/range_tree.c261
-rw-r--r--kernel/bpf/range_tree.h21
-rw-r--r--kernel/bpf/relo_core.c2
-rw-r--r--kernel/bpf/reuseport_array.c353
-rw-r--r--kernel/bpf/ringbuf.c879
-rw-r--r--kernel/bpf/rqspinlock.c762
-rw-r--r--kernel/bpf/rqspinlock.h48
-rw-r--r--kernel/bpf/stackmap.c792
-rw-r--r--kernel/bpf/stream.c384
-rw-r--r--kernel/bpf/syscall.c6463
-rw-r--r--kernel/bpf/sysfs_btf.c69
-rw-r--r--kernel/bpf/task_iter.c1070
-rw-r--r--kernel/bpf/tcx.c346
-rw-r--r--kernel/bpf/tnum.c255
-rw-r--r--kernel/bpf/token.c261
-rw-r--r--kernel/bpf/trampoline.c1185
-rw-r--r--kernel/bpf/verifier.c25526
-rw-r--r--kernel/capability.c256
-rw-r--r--kernel/cfi.c115
-rw-r--r--kernel/cgroup.c5603
-rw-r--r--kernel/cgroup/Makefile11
-rw-r--r--kernel/cgroup/cgroup-internal.h302
-rw-r--r--kernel/cgroup/cgroup-v1.c1372
-rw-r--r--kernel/cgroup/cgroup.c7448
-rw-r--r--kernel/cgroup/cpuset-internal.h308
-rw-r--r--kernel/cgroup/cpuset-v1.c607
-rw-r--r--kernel/cgroup/cpuset.c4543
-rw-r--r--kernel/cgroup/debug.c377
-rw-r--r--kernel/cgroup/dmem.c830
-rw-r--r--kernel/cgroup/freezer.c326
-rw-r--r--kernel/cgroup/legacy_freezer.c (renamed from kernel/cgroup_freezer.c)71
-rw-r--r--kernel/cgroup/misc.c478
-rw-r--r--kernel/cgroup/namespace.c144
-rw-r--r--kernel/cgroup/pids.c460
-rw-r--r--kernel/cgroup/rdma.c612
-rw-r--r--kernel/cgroup/rstat.c759
-rw-r--r--kernel/compat.c1047
-rw-r--r--kernel/configs.c69
-rw-r--r--kernel/configs/debug.config119
-rw-r--r--kernel/configs/hardening.config113
-rw-r--r--kernel/configs/kvm_guest.config35
-rw-r--r--kernel/configs/nopm.config17
-rw-r--r--kernel/configs/rust.config2
-rw-r--r--kernel/configs/tiny-base.config1
-rw-r--r--kernel/configs/tiny.config5
-rw-r--r--kernel/configs/x86_debug.config18
-rw-r--r--kernel/configs/xen.config52
-rw-r--r--kernel/context_tracking.c734
-rw-r--r--kernel/cpu.c3264
-rw-r--r--kernel/cpu_pm.c124
-rw-r--r--kernel/cpuset.c2700
-rw-r--r--kernel/crash_core.c693
-rw-r--r--kernel/crash_core_test.c343
-rw-r--r--kernel/crash_dump_dm_crypt.c464
-rw-r--r--kernel/crash_reserve.c540
-rw-r--r--kernel/cred.c515
-rw-r--r--kernel/debug/Makefile1
-rw-r--r--kernel/debug/debug_core.c369
-rw-r--r--kernel/debug/debug_core.h4
-rw-r--r--kernel/debug/gdbstub.c76
-rw-r--r--kernel/debug/kdb/.gitignore1
-rw-r--r--kernel/debug/kdb/Makefile1
-rw-r--r--kernel/debug/kdb/kdb_bp.c99
-rw-r--r--kernel/debug/kdb/kdb_bt.c129
-rw-r--r--kernel/debug/kdb/kdb_debugger.c10
-rw-r--r--kernel/debug/kdb/kdb_io.c579
-rw-r--r--kernel/debug/kdb/kdb_keyboard.c43
-rw-r--r--kernel/debug/kdb/kdb_main.c1334
-rw-r--r--kernel/debug/kdb/kdb_private.h55
-rw-r--r--kernel/debug/kdb/kdb_support.c582
-rw-r--r--kernel/delayacct.c265
-rw-r--r--kernel/dma.c15
-rw-r--r--kernel/dma/Kconfig270
-rw-r--r--kernel/dma/Makefile12
-rw-r--r--kernel/dma/coherent.c411
-rw-r--r--kernel/dma/contiguous.c510
-rw-r--r--kernel/dma/debug.c1608
-rw-r--r--kernel/dma/debug.h127
-rw-r--r--kernel/dma/direct.c668
-rw-r--r--kernel/dma/direct.h140
-rw-r--r--kernel/dma/dummy.c58
-rw-r--r--kernel/dma/map_benchmark.c381
-rw-r--r--kernel/dma/mapping.c1010
-rw-r--r--kernel/dma/ops_helpers.c103
-rw-r--r--kernel/dma/pool.c295
-rw-r--r--kernel/dma/remap.c72
-rw-r--r--kernel/dma/swiotlb.c1881
-rw-r--r--kernel/elfcore.c24
-rw-r--r--kernel/elfcorehdr.c (renamed from kernel/crash_dump.c)7
-rw-r--r--kernel/entry/Makefile17
-rw-r--r--kernel/entry/common.c266
-rw-r--r--kernel/entry/common.h7
-rw-r--r--kernel/entry/syscall-common.c104
-rw-r--r--kernel/entry/syscall_user_dispatch.c174
-rw-r--r--kernel/entry/virt.c46
-rw-r--r--kernel/events/Makefile7
-rw-r--r--kernel/events/callchain.c209
-rw-r--r--kernel/events/core.c11758
-rw-r--r--kernel/events/hw_breakpoint.c839
-rw-r--r--kernel/events/hw_breakpoint_test.c332
-rw-r--r--kernel/events/internal.h131
-rw-r--r--kernel/events/ring_buffer.c570
-rw-r--r--kernel/events/uprobes.c2197
-rw-r--r--kernel/exec_domain.c16
-rw-r--r--kernel/exit.c1392
-rw-r--r--kernel/exit.h30
-rw-r--r--kernel/extable.c140
-rw-r--r--kernel/fail_function.c334
-rw-r--r--kernel/fork.c2583
-rw-r--r--kernel/freezer.c162
-rw-r--r--kernel/futex.c3053
-rw-r--r--kernel/futex/Makefile3
-rw-r--r--kernel/futex/core.c2012
-rw-r--r--kernel/futex/futex.h450
-rw-r--r--kernel/futex/pi.c1294
-rw-r--r--kernel/futex/requeue.c913
-rw-r--r--kernel/futex/syscalls.c518
-rw-r--r--kernel/futex/waitwake.c752
-rw-r--r--kernel/futex_compat.c201
-rw-r--r--kernel/gcov/Kconfig39
-rw-r--r--kernel/gcov/Makefile9
-rw-r--r--kernel/gcov/base.c125
-rw-r--r--kernel/gcov/clang.c393
-rw-r--r--kernel/gcov/fs.c139
-rw-r--r--kernel/gcov/gcc_3_4.c562
-rw-r--r--kernel/gcov/gcc_4_7.c225
-rw-r--r--kernel/gcov/gcc_base.c86
-rw-r--r--kernel/gcov/gcov.h20
-rwxr-xr-xkernel/gen_kheaders.sh50
-rw-r--r--kernel/groups.c112
-rw-r--r--kernel/hung_task.c400
-rw-r--r--kernel/iomem.c165
-rw-r--r--kernel/irq/Kconfig123
-rw-r--r--kernel/irq/Makefile15
-rw-r--r--kernel/irq/affinity.c127
-rw-r--r--kernel/irq/autoprobe.c39
-rw-r--r--kernel/irq/chip.c1390
-rw-r--r--kernel/irq/cpuhotplug.c245
-rw-r--r--kernel/irq/debug.h20
-rw-r--r--kernel/irq/debugfs.c256
-rw-r--r--kernel/irq/devres.c324
-rw-r--r--kernel/irq/dummychip.c4
-rw-r--r--kernel/irq/generic-chip.c327
-rw-r--r--kernel/irq/handle.c152
-rw-r--r--kernel/irq/internals.h413
-rw-r--r--kernel/irq/ipi-mux.c206
-rw-r--r--kernel/irq/ipi.c345
-rw-r--r--kernel/irq/irq_sim.c297
-rw-r--r--kernel/irq/irq_test.c236
-rw-r--r--kernel/irq/irqdesc.c878
-rw-r--r--kernel/irq/irqdomain.c1906
-rw-r--r--kernel/irq/kexec.c36
-rw-r--r--kernel/irq/manage.c2393
-rw-r--r--kernel/irq/matrix.c519
-rw-r--r--kernel/irq/migration.c87
-rw-r--r--kernel/irq/msi.c1668
-rw-r--r--kernel/irq/pm.c121
-rw-r--r--kernel/irq/proc.c338
-rw-r--r--kernel/irq/resend.c185
-rw-r--r--kernel/irq/settings.h38
-rw-r--r--kernel/irq/spurious.c196
-rw-r--r--kernel/irq/timings.c959
-rw-r--r--kernel/irq_work.c275
-rw-r--r--kernel/jump_label.c825
-rw-r--r--kernel/kallsyms.c618
-rw-r--r--kernel/kallsyms_internal.h19
-rw-r--r--kernel/kallsyms_selftest.c446
-rw-r--r--kernel/kallsyms_selftest.h13
-rw-r--r--kernel/kcmp.c95
-rw-r--r--kernel/kcov.c1132
-rw-r--r--kernel/kcsan/.kunitconfig24
-rw-r--r--kernel/kcsan/Makefile21
-rw-r--r--kernel/kcsan/core.c1371
-rw-r--r--kernel/kcsan/debugfs.c272
-rw-r--r--kernel/kcsan/encoding.h102
-rw-r--r--kernel/kcsan/kcsan.h142
-rw-r--r--kernel/kcsan/kcsan_test.c1625
-rw-r--r--kernel/kcsan/permissive.h94
-rw-r--r--kernel/kcsan/report.c715
-rw-r--r--kernel/kcsan/selftest.c261
-rw-r--r--kernel/kexec.c2728
-rw-r--r--kernel/kexec_core.c1368
-rw-r--r--kernel/kexec_elf.c430
-rw-r--r--kernel/kexec_file.c1244
-rw-r--r--kernel/kexec_internal.h58
-rw-r--r--kernel/kheaders.c53
-rw-r--r--kernel/kprobes.c2579
-rw-r--r--kernel/kstack_erase.c177
-rw-r--r--kernel/ksyms_common.c43
-rw-r--r--kernel/ksysfs.c152
-rw-r--r--kernel/kthread.c1374
-rw-r--r--kernel/latencytop.c99
-rw-r--r--kernel/livepatch/Kconfig16
-rw-r--r--kernel/livepatch/Makefile3
-rw-r--r--kernel/livepatch/core.c1597
-rw-r--r--kernel/livepatch/core.h59
-rw-r--r--kernel/livepatch/patch.c289
-rw-r--r--kernel/livepatch/patch.h35
-rw-r--r--kernel/livepatch/shadow.c299
-rw-r--r--kernel/livepatch/state.c119
-rw-r--r--kernel/livepatch/state.h9
-rw-r--r--kernel/livepatch/transition.c731
-rw-r--r--kernel/livepatch/transition.h16
-rw-r--r--kernel/liveupdate/Kconfig75
-rw-r--r--kernel/liveupdate/Makefile12
-rw-r--r--kernel/liveupdate/kexec_handover.c1594
-rw-r--r--kernel/liveupdate/kexec_handover_debug.c25
-rw-r--r--kernel/liveupdate/kexec_handover_debugfs.c221
-rw-r--r--kernel/liveupdate/kexec_handover_internal.h55
-rw-r--r--kernel/liveupdate/luo_core.c450
-rw-r--r--kernel/liveupdate/luo_file.c889
-rw-r--r--kernel/liveupdate/luo_internal.h110
-rw-r--r--kernel/liveupdate/luo_session.c646
-rw-r--r--kernel/locking/Makefile26
-rw-r--r--kernel/locking/irqflag-debug.c13
-rw-r--r--kernel/locking/lglock.c89
-rw-r--r--kernel/locking/lock_events.c179
-rw-r--r--kernel/locking/lock_events.h64
-rw-r--r--kernel/locking/lock_events_list.h102
-rw-r--r--kernel/locking/lockdep.c5658
-rw-r--r--kernel/locking/lockdep_internals.h169
-rw-r--r--kernel/locking/lockdep_proc.c217
-rw-r--r--kernel/locking/lockdep_states.h1
-rw-r--r--kernel/locking/locktorture.c1048
-rw-r--r--kernel/locking/mcs_spinlock.h30
-rw-r--r--kernel/locking/mutex-debug.c63
-rw-r--r--kernel/locking/mutex-debug.h55
-rw-r--r--kernel/locking/mutex.c1184
-rw-r--r--kernel/locking/mutex.h94
-rw-r--r--kernel/locking/osq_lock.c97
-rw-r--r--kernel/locking/percpu-rwsem.c343
-rw-r--r--kernel/locking/qrwlock.c126
-rw-r--r--kernel/locking/qspinlock.c410
-rw-r--r--kernel/locking/qspinlock.h201
-rw-r--r--kernel/locking/qspinlock_paravirt.h557
-rw-r--r--kernel/locking/qspinlock_stat.h142
-rw-r--r--kernel/locking/rtmutex-debug.c184
-rw-r--r--kernel/locking/rtmutex-debug.h39
-rw-r--r--kernel/locking/rtmutex-tester.c420
-rw-r--r--kernel/locking/rtmutex.c1759
-rw-r--r--kernel/locking/rtmutex.h36
-rw-r--r--kernel/locking/rtmutex_api.c656
-rw-r--r--kernel/locking/rtmutex_common.h237
-rw-r--r--kernel/locking/rwbase_rt.c303
-rw-r--r--kernel/locking/rwsem-spinlock.c303
-rw-r--r--kernel/locking/rwsem-xadd.c531
-rw-r--r--kernel/locking/rwsem.c1639
-rw-r--r--kernel/locking/rwsem.h20
-rw-r--r--kernel/locking/semaphore.c114
-rw-r--r--kernel/locking/spinlock.c130
-rw-r--r--kernel/locking/spinlock_debug.c140
-rw-r--r--kernel/locking/spinlock_rt.c287
-rw-r--r--kernel/locking/test-ww_mutex.c699
-rw-r--r--kernel/locking/ww_mutex.h594
-rw-r--r--kernel/locking/ww_rt_mutex.c101
-rw-r--r--kernel/module-internal.h12
-rw-r--r--kernel/module.c3917
-rw-r--r--kernel/module/Kconfig465
-rw-r--r--kernel/module/Makefile25
-rw-r--r--kernel/module/debug_kmemleak.c21
-rw-r--r--kernel/module/decompress.c368
-rw-r--r--kernel/module/dups.c247
-rw-r--r--kernel/module/internal.h425
-rw-r--r--kernel/module/kallsyms.c501
-rw-r--r--kernel/module/kdb.c63
-rw-r--r--kernel/module/kmod.c179
-rw-r--r--kernel/module/livepatch.c74
-rw-r--r--kernel/module/main.c3923
-rw-r--r--kernel/module/procfs.c152
-rw-r--r--kernel/module/signing.c125
-rw-r--r--kernel/module/stats.c432
-rw-r--r--kernel/module/strict_rwx.c151
-rw-r--r--kernel/module/sysfs.c440
-rw-r--r--kernel/module/tracking.c127
-rw-r--r--kernel/module/tree_lookup.c112
-rw-r--r--kernel/module/version.c146
-rw-r--r--kernel/module_signature.c46
-rw-r--r--kernel/module_signing.c250
-rw-r--r--kernel/notifier.c274
-rw-r--r--kernel/nscommon.c311
-rw-r--r--kernel/nsproxy.c474
-rw-r--r--kernel/nstree.c813
-rw-r--r--kernel/padata.c1156
-rw-r--r--kernel/panic.c863
-rw-r--r--kernel/params.c483
-rw-r--r--kernel/pid.c904
-rw-r--r--kernel/pid_namespace.c379
-rw-r--r--kernel/pid_sysctl.h53
-rw-r--r--kernel/power/Kconfig190
-rw-r--r--kernel/power/Makefile17
-rw-r--r--kernel/power/autosleep.c6
-rw-r--r--kernel/power/block_io.c103
-rw-r--r--kernel/power/console.c24
-rw-r--r--kernel/power/em_netlink.c308
-rw-r--r--kernel/power/em_netlink.h39
-rw-r--r--kernel/power/em_netlink_autogen.c48
-rw-r--r--kernel/power/em_netlink_autogen.h23
-rw-r--r--kernel/power/energy_model.c1046
-rw-r--r--kernel/power/hibernate.c810
-rw-r--r--kernel/power/main.c749
-rw-r--r--kernel/power/power.h109
-rw-r--r--kernel/power/poweroff.c7
-rw-r--r--kernel/power/process.c82
-rw-r--r--kernel/power/qos.c950
-rw-r--r--kernel/power/snapshot.c1478
-rw-r--r--kernel/power/suspend.c374
-rw-r--r--kernel/power/suspend_test.c31
-rw-r--r--kernel/power/swap.c947
-rw-r--r--kernel/power/user.c209
-rw-r--r--kernel/power/wakelock.c64
-rw-r--r--kernel/printk/.kunitconfig3
-rw-r--r--kernel/printk/Makefile9
-rw-r--r--kernel/printk/braille.c31
-rw-r--r--kernel/printk/braille.h14
-rw-r--r--kernel/printk/console_cmdline.h3
-rw-r--r--kernel/printk/index.c194
-rw-r--r--kernel/printk/internal.h304
-rw-r--r--kernel/printk/nbcon.c1985
-rw-r--r--kernel/printk/printk.c5258
-rw-r--r--kernel/printk/printk_ringbuffer.c2415
-rw-r--r--kernel/printk/printk_ringbuffer.h437
-rw-r--r--kernel/printk/printk_ringbuffer_kunit_test.c327
-rw-r--r--kernel/printk/printk_safe.c84
-rw-r--r--kernel/printk/sysctl.c84
-rw-r--r--kernel/profile.c479
-rw-r--r--kernel/ptrace.c808
-rw-r--r--kernel/range.c4
-rw-r--r--kernel/rcu/Kconfig379
-rw-r--r--kernel/rcu/Kconfig.debug231
-rw-r--r--kernel/rcu/Makefile19
-rw-r--r--kernel/rcu/rcu.h684
-rw-r--r--kernel/rcu/rcu_segcblist.c622
-rw-r--r--kernel/rcu/rcu_segcblist.h145
-rw-r--r--kernel/rcu/rcuscale.c1219
-rw-r--r--kernel/rcu/rcutorture.c4197
-rw-r--r--kernel/rcu/refscale.c1616
-rw-r--r--kernel/rcu/srcu.c676
-rw-r--r--kernel/rcu/srcutiny.c316
-rw-r--r--kernel/rcu/srcutree.c2208
-rw-r--r--kernel/rcu/sync.c190
-rw-r--r--kernel/rcu/tasks.h2283
-rw-r--r--kernel/rcu/tiny.c327
-rw-r--r--kernel/rcu/tiny_plugin.h173
-rw-r--r--kernel/rcu/tree.c6561
-rw-r--r--kernel/rcu/tree.h728
-rw-r--r--kernel/rcu/tree_exp.h1113
-rw-r--r--kernel/rcu/tree_nocb.h1705
-rw-r--r--kernel/rcu/tree_plugin.h3412
-rw-r--r--kernel/rcu/tree_stall.h1185
-rw-r--r--kernel/rcu/tree_trace.c505
-rw-r--r--kernel/rcu/update.c871
-rw-r--r--kernel/reboot.c932
-rw-r--r--kernel/regset.c76
-rw-r--r--kernel/relay.c734
-rw-r--r--kernel/resource.c1338
-rw-r--r--kernel/resource_kunit.c306
-rw-r--r--kernel/rseq.c478
-rw-r--r--kernel/scftorture.c701
-rw-r--r--kernel/sched/Makefile40
-rw-r--r--kernel/sched/autogroup.c (renamed from kernel/sched/auto_group.c)106
-rw-r--r--kernel/sched/autogroup.h (renamed from kernel/sched/auto_group.h)26
-rw-r--r--kernel/sched/build_policy.c66
-rw-r--r--kernel/sched/build_utility.c106
-rw-r--r--kernel/sched/clock.c387
-rw-r--r--kernel/sched/completion.c113
-rw-r--r--kernel/sched/core.c13838
-rw-r--r--kernel/sched/core_sched.c302
-rw-r--r--kernel/sched/cpuacct.c278
-rw-r--r--kernel/sched/cpuacct.h17
-rw-r--r--kernel/sched/cpudeadline.c254
-rw-r--r--kernel/sched/cpudeadline.h38
-rw-r--r--kernel/sched/cpufreq.c75
-rw-r--r--kernel/sched/cpufreq_schedutil.c937
-rw-r--r--kernel/sched/cpupri.c219
-rw-r--r--kernel/sched/cpupri.h39
-rw-r--r--kernel/sched/cputime.c1100
-rw-r--r--kernel/sched/deadline.c3383
-rw-r--r--kernel/sched/debug.c1206
-rw-r--r--kernel/sched/ext.c7310
-rw-r--r--kernel/sched/ext.h95
-rw-r--r--kernel/sched/ext_idle.c1435
-rw-r--r--kernel/sched/ext_idle.h24
-rw-r--r--kernel/sched/ext_internal.h1101
-rw-r--r--kernel/sched/fair.c13115
-rw-r--r--kernel/sched/features.h108
-rw-r--r--kernel/sched/idle.c616
-rw-r--r--kernel/sched/idle_task.c109
-rw-r--r--kernel/sched/isolation.c276
-rw-r--r--kernel/sched/loadavg.c399
-rw-r--r--kernel/sched/membarrier.c679
-rw-r--r--kernel/sched/pelt.c490
-rw-r--r--kernel/sched/pelt.h189
-rw-r--r--kernel/sched/proc.c584
-rw-r--r--kernel/sched/psi.c1682
-rw-r--r--kernel/sched/rq-offsets.c12
-rw-r--r--kernel/sched/rt.c2039
-rw-r--r--kernel/sched/sched-pelt.h15
-rw-r--r--kernel/sched/sched.h3884
-rw-r--r--kernel/sched/smp.h22
-rw-r--r--kernel/sched/stats.c146
-rw-r--r--kernel/sched/stats.h417
-rw-r--r--kernel/sched/stop_task.c100
-rw-r--r--kernel/sched/swait.c145
-rw-r--r--kernel/sched/syscalls.c1570
-rw-r--r--kernel/sched/topology.c2942
-rw-r--r--kernel/sched/wait.c707
-rw-r--r--kernel/sched/wait_bit.c278
-rw-r--r--kernel/scs.c168
-rw-r--r--kernel/seccomp.c2199
-rw-r--r--kernel/signal.c3234
-rw-r--r--kernel/smp.c1039
-rw-r--r--kernel/smpboot.c198
-rw-r--r--kernel/smpboot.h7
-rw-r--r--kernel/softirq.c935
-rw-r--r--kernel/stacktrace.c377
-rw-r--r--kernel/static_call.c8
-rw-r--r--kernel/static_call_inline.c566
-rw-r--r--kernel/stop_machine.c447
-rw-r--r--kernel/sys.c1570
-rw-r--r--kernel/sys_ni.c584
-rw-r--r--kernel/sysctl-test.c392
-rw-r--r--kernel/sysctl.c2946
-rw-r--r--kernel/sysctl_binary.c1494
-rw-r--r--kernel/system_certificates.S20
-rw-r--r--kernel/system_keyring.c106
-rw-r--r--kernel/task_work.c192
-rw-r--r--kernel/taskstats.c183
-rw-r--r--kernel/test_kprobes.c389
-rw-r--r--kernel/time/Kconfig195
-rw-r--r--kernel/time/Makefile43
-rw-r--r--kernel/time/alarmtimer.c706
-rw-r--r--kernel/time/clockevents.c254
-rw-r--r--kernel/time/clocksource-wdtest.c204
-rw-r--r--kernel/time/clocksource.c859
-rw-r--r--kernel/time/hrtimer.c2365
-rw-r--r--kernel/time/itimer.c317
-rw-r--r--kernel/time/jiffies.c224
-rw-r--r--kernel/time/namespace.c489
-rw-r--r--kernel/time/ntp.c1167
-rw-r--r--kernel/time/ntp_internal.h21
-rw-r--r--kernel/time/posix-clock.c262
-rw-r--r--kernel/time/posix-cpu-timers.c1933
-rw-r--r--kernel/time/posix-stubs.c209
-rw-r--r--kernel/time/posix-timers.c1886
-rw-r--r--kernel/time/posix-timers.h53
-rw-r--r--kernel/time/sched_clock.c164
-rw-r--r--kernel/time/sleep_timeout.c377
-rw-r--r--kernel/time/test_udelay.c34
-rw-r--r--kernel/time/tick-broadcast-hrtimer.c102
-rw-r--r--kernel/time/tick-broadcast.c679
-rw-r--r--kernel/time/tick-common.c220
-rw-r--r--kernel/time/tick-internal.h102
-rw-r--r--kernel/time/tick-legacy.c37
-rw-r--r--kernel/time/tick-oneshot.c66
-rw-r--r--kernel/time/tick-sched.c1799
-rw-r--r--kernel/time/tick-sched.h102
-rw-r--r--kernel/time/time.c914
-rw-r--r--kernel/time/time_test.c100
-rw-r--r--kernel/time/timeconst.bc13
-rw-r--r--kernel/time/timeconv.c148
-rw-r--r--kernel/time/timecounter.c27
-rw-r--r--kernel/time/timekeeping.c2709
-rw-r--r--kernel/time/timekeeping.h27
-rw-r--r--kernel/time/timekeeping_debug.c64
-rw-r--r--kernel/time/timekeeping_internal.h50
-rw-r--r--kernel/time/timer.c2777
-rw-r--r--kernel/time/timer_list.c206
-rw-r--r--kernel/time/timer_migration.c2023
-rw-r--r--kernel/time/timer_migration.h146
-rw-r--r--kernel/time/timer_stats.c425
-rw-r--r--kernel/time/vsyscall.c215
-rw-r--r--kernel/torture.c563
-rw-r--r--kernel/trace/Kconfig980
-rw-r--r--kernel/trace/Makefile68
-rw-r--r--kernel/trace/blktrace.c1575
-rw-r--r--kernel/trace/bpf_trace.c3567
-rw-r--r--kernel/trace/bpf_trace.h34
-rw-r--r--kernel/trace/error_report-traces.c11
-rw-r--r--kernel/trace/fgraph.c1459
-rw-r--r--kernel/trace/fprobe.c972
-rw-r--r--kernel/trace/ftrace.c6970
-rw-r--r--kernel/trace/ftrace_internal.h68
-rw-r--r--kernel/trace/kprobe_event_gen_test.c276
-rw-r--r--kernel/trace/pid_list.c504
-rw-r--r--kernel/trace/pid_list.h89
-rw-r--r--kernel/trace/power-traces.c2
-rw-r--r--kernel/trace/preemptirq_delay_test.c222
-rw-r--r--kernel/trace/rethook.c337
-rw-r--r--kernel/trace/ring_buffer.c5435
-rw-r--r--kernel/trace/ring_buffer_benchmark.c172
-rw-r--r--kernel/trace/rpm-traces.c1
-rw-r--r--kernel/trace/rv/Kconfig95
-rw-r--r--kernel/trace/rv/Makefile23
-rw-r--r--kernel/trace/rv/monitors/nrp/Kconfig16
-rw-r--r--kernel/trace/rv/monitors/nrp/nrp.c138
-rw-r--r--kernel/trace/rv/monitors/nrp/nrp.h75
-rw-r--r--kernel/trace/rv/monitors/nrp/nrp_trace.h15
-rw-r--r--kernel/trace/rv/monitors/opid/Kconfig19
-rw-r--r--kernel/trace/rv/monitors/opid/opid.c168
-rw-r--r--kernel/trace/rv/monitors/opid/opid.h104
-rw-r--r--kernel/trace/rv/monitors/opid/opid_trace.h15
-rw-r--r--kernel/trace/rv/monitors/pagefault/Kconfig21
-rw-r--r--kernel/trace/rv/monitors/pagefault/pagefault.c88
-rw-r--r--kernel/trace/rv/monitors/pagefault/pagefault.h64
-rw-r--r--kernel/trace/rv/monitors/pagefault/pagefault_trace.h14
-rw-r--r--kernel/trace/rv/monitors/rtapp/Kconfig11
-rw-r--r--kernel/trace/rv/monitors/rtapp/rtapp.c33
-rw-r--r--kernel/trace/rv/monitors/rtapp/rtapp.h3
-rw-r--r--kernel/trace/rv/monitors/sched/Kconfig12
-rw-r--r--kernel/trace/rv/monitors/sched/sched.c37
-rw-r--r--kernel/trace/rv/monitors/sched/sched.h3
-rw-r--r--kernel/trace/rv/monitors/sco/Kconfig14
-rw-r--r--kernel/trace/rv/monitors/sco/sco.c87
-rw-r--r--kernel/trace/rv/monitors/sco/sco.h47
-rw-r--r--kernel/trace/rv/monitors/sco/sco_trace.h15
-rw-r--r--kernel/trace/rv/monitors/scpd/Kconfig15
-rw-r--r--kernel/trace/rv/monitors/scpd/scpd.c95
-rw-r--r--kernel/trace/rv/monitors/scpd/scpd.h49
-rw-r--r--kernel/trace/rv/monitors/scpd/scpd_trace.h15
-rw-r--r--kernel/trace/rv/monitors/sleep/Kconfig22
-rw-r--r--kernel/trace/rv/monitors/sleep/sleep.c241
-rw-r--r--kernel/trace/rv/monitors/sleep/sleep.h257
-rw-r--r--kernel/trace/rv/monitors/sleep/sleep_trace.h14
-rw-r--r--kernel/trace/rv/monitors/snep/Kconfig15
-rw-r--r--kernel/trace/rv/monitors/snep/snep.c95
-rw-r--r--kernel/trace/rv/monitors/snep/snep.h59
-rw-r--r--kernel/trace/rv/monitors/snep/snep_trace.h15
-rw-r--r--kernel/trace/rv/monitors/snroc/Kconfig14
-rw-r--r--kernel/trace/rv/monitors/snroc/snroc.c84
-rw-r--r--kernel/trace/rv/monitors/snroc/snroc.h47
-rw-r--r--kernel/trace/rv/monitors/snroc/snroc_trace.h15
-rw-r--r--kernel/trace/rv/monitors/sssw/Kconfig15
-rw-r--r--kernel/trace/rv/monitors/sssw/sssw.c116
-rw-r--r--kernel/trace/rv/monitors/sssw/sssw.h105
-rw-r--r--kernel/trace/rv/monitors/sssw/sssw_trace.h15
-rw-r--r--kernel/trace/rv/monitors/sts/Kconfig19
-rw-r--r--kernel/trace/rv/monitors/sts/sts.c156
-rw-r--r--kernel/trace/rv/monitors/sts/sts.h117
-rw-r--r--kernel/trace/rv/monitors/sts/sts_trace.h15
-rw-r--r--kernel/trace/rv/monitors/wip/Kconfig14
-rw-r--r--kernel/trace/rv/monitors/wip/wip.c87
-rw-r--r--kernel/trace/rv/monitors/wip/wip.h47
-rw-r--r--kernel/trace/rv/monitors/wip/wip_trace.h15
-rw-r--r--kernel/trace/rv/monitors/wwnr/Kconfig13
-rw-r--r--kernel/trace/rv/monitors/wwnr/wwnr.c86
-rw-r--r--kernel/trace/rv/monitors/wwnr/wwnr.h47
-rw-r--r--kernel/trace/rv/monitors/wwnr/wwnr_trace.h16
-rw-r--r--kernel/trace/rv/reactor_panic.c42
-rw-r--r--kernel/trace/rv/reactor_printk.c41
-rw-r--r--kernel/trace/rv/rv.c854
-rw-r--r--kernel/trace/rv/rv.h47
-rw-r--r--kernel/trace/rv/rv_reactors.c481
-rw-r--r--kernel/trace/rv/rv_trace.h212
-rw-r--r--kernel/trace/synth_event_gen_test.c536
-rw-r--r--kernel/trace/trace.c8684
-rw-r--r--kernel/trace/trace.h1884
-rw-r--r--kernel/trace/trace_benchmark.c54
-rw-r--r--kernel/trace/trace_benchmark.h11
-rw-r--r--kernel/trace/trace_boot.c671
-rw-r--r--kernel/trace/trace_branch.c146
-rw-r--r--kernel/trace/trace_btf.c122
-rw-r--r--kernel/trace/trace_btf.h11
-rw-r--r--kernel/trace/trace_clock.c55
-rw-r--r--kernel/trace/trace_dynevent.c499
-rw-r--r--kernel/trace/trace_dynevent.h155
-rw-r--r--kernel/trace/trace_entries.h271
-rw-r--r--kernel/trace/trace_eprobe.c987
-rw-r--r--kernel/trace/trace_event_perf.c297
-rw-r--r--kernel/trace/trace_events.c3274
-rw-r--r--kernel/trace/trace_events_filter.c3376
-rw-r--r--kernel/trace/trace_events_filter_test.h1
-rw-r--r--kernel/trace/trace_events_hist.c7018
-rw-r--r--kernel/trace/trace_events_inject.c335
-rw-r--r--kernel/trace/trace_events_synth.c2357
-rw-r--r--kernel/trace/trace_events_trigger.c1630
-rw-r--r--kernel/trace/trace_events_user.c2933
-rw-r--r--kernel/trace/trace_export.c152
-rw-r--r--kernel/trace/trace_fprobe.c1589
-rw-r--r--kernel/trace/trace_functions.c633
-rw-r--r--kernel/trace/trace_functions_graph.c1079
-rw-r--r--kernel/trace/trace_hwlat.c888
-rw-r--r--kernel/trace/trace_irqsoff.c469
-rw-r--r--kernel/trace/trace_kdb.c94
-rw-r--r--kernel/trace/trace_kprobe.c2179
-rw-r--r--kernel/trace/trace_kprobe_selftest.c13
-rw-r--r--kernel/trace/trace_kprobe_selftest.h7
-rw-r--r--kernel/trace/trace_mmiotrace.c54
-rw-r--r--kernel/trace/trace_nop.c5
-rw-r--r--kernel/trace/trace_osnoise.c3120
-rw-r--r--kernel/trace/trace_output.c1152
-rw-r--r--kernel/trace/trace_output.h30
-rw-r--r--kernel/trace/trace_preemptirq.c129
-rw-r--r--kernel/trace/trace_printk.c62
-rw-r--r--kernel/trace/trace_probe.c2418
-rw-r--r--kernel/trace/trace_probe.h672
-rw-r--r--kernel/trace/trace_probe_kernel.h119
-rw-r--r--kernel/trace/trace_probe_tmpl.h275
-rw-r--r--kernel/trace/trace_recursion_record.c233
-rw-r--r--kernel/trace/trace_sched_switch.c597
-rw-r--r--kernel/trace/trace_sched_wakeup.c492
-rw-r--r--kernel/trace/trace_selftest.c500
-rw-r--r--kernel/trace/trace_selftest_dynamic.c6
-rw-r--r--kernel/trace/trace_seq.c81
-rw-r--r--kernel/trace/trace_stack.c392
-rw-r--r--kernel/trace/trace_stat.c61
-rw-r--r--kernel/trace/trace_stat.h3
-rw-r--r--kernel/trace/trace_synth.h41
-rw-r--r--kernel/trace/trace_syscalls.c1247
-rw-r--r--kernel/trace/trace_uprobe.c1530
-rw-r--r--kernel/trace/tracing_map.c1135
-rw-r--r--kernel/trace/tracing_map.h284
-rw-r--r--kernel/tracepoint.c474
-rw-r--r--kernel/tsacct.c113
-rw-r--r--kernel/ucount.c362
-rw-r--r--kernel/uid16.c34
-rw-r--r--kernel/uid16.h14
-rw-r--r--kernel/umh.c (renamed from kernel/kmod.c)394
-rw-r--r--kernel/unwind/Makefile1
-rw-r--r--kernel/unwind/deferred.c366
-rw-r--r--kernel/unwind/user.c165
-rw-r--r--kernel/up.c63
-rw-r--r--kernel/user-return-notifier.c1
-rw-r--r--kernel/user.c119
-rw-r--r--kernel/user_namespace.c668
-rw-r--r--kernel/utsname.c97
-rw-r--r--kernel/utsname_sysctl.c78
-rw-r--r--kernel/vhost_task.c165
-rw-r--r--kernel/vmcore_info.c250
-rw-r--r--kernel/watch_queue.c706
-rw-r--r--kernel/watchdog.c1632
-rw-r--r--kernel/watchdog_buddy.c110
-rw-r--r--kernel/watchdog_perf.c308
-rw-r--r--kernel/workqueue.c6370
-rw-r--r--kernel/workqueue_internal.h35
713 files changed, 398445 insertions, 105860 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore
index 790d83c7d160..a501bfc80694 100644
--- a/kernel/.gitignore
+++ b/kernel/.gitignore
@@ -1,8 +1,5 @@
-#
-# Generated files
-#
-config_data.h
-config_data.gz
-timeconst.h
-hz.bc
-x509_certificate_list
+# SPDX-License-Identifier: GPL-2.0-only
+/config_data
+/kheaders.md5
+/kheaders-objlist
+/kheaders-srclist
diff --git a/kernel/Kconfig.freezer b/kernel/Kconfig.freezer
index a3bb4cb52539..68646feefb3d 100644
--- a/kernel/Kconfig.freezer
+++ b/kernel/Kconfig.freezer
@@ -1,2 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
config FREEZER
def_bool PM_SLEEP || CGROUP_FREEZER
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 2a202a846757..ce1435cb08b1 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# Timer Interrupt Frequency Configuration
#
@@ -29,7 +30,7 @@ choice
250 Hz is a good compromise choice allowing server performance
while also showing good interactive responsiveness even
on SMP and NUMA systems. If you are going to be using NTSC video
- or multimedia, selected 300Hz instead.
+ or multimedia, select 300Hz instead.
config HZ_300
bool "300 HZ"
diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec
new file mode 100644
index 000000000000..15632358bcf7
--- /dev/null
+++ b/kernel/Kconfig.kexec
@@ -0,0 +1,182 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+menu "Kexec and crash features"
+
+config CRASH_RESERVE
+ bool
+
+config VMCORE_INFO
+ bool
+
+config KEXEC_CORE
+ bool
+
+config KEXEC_ELF
+ bool
+
+config HAVE_IMA_KEXEC
+ bool
+
+config KEXEC
+ bool "Enable kexec system call"
+ depends on ARCH_SUPPORTS_KEXEC
+ select KEXEC_CORE
+ help
+ kexec is a system call that implements the ability to shutdown your
+ current kernel, and to start another kernel. It is like a reboot
+ but it is independent of the system firmware. And like a reboot
+ you can start any kernel with it, not just Linux.
+
+ The name comes from the similarity to the exec system call.
+
+ It is an ongoing process to be certain the hardware in a machine
+ is properly shutdown, so do not be surprised if this code does not
+ initially work for you. As of this writing the exact hardware
+ interface is strongly in flux, so no good recommendation can be
+ made.
+
+config KEXEC_FILE
+ bool "Enable kexec file based system call"
+ depends on ARCH_SUPPORTS_KEXEC_FILE
+ select CRYPTO_LIB_SHA256
+ select KEXEC_CORE
+ help
+ This is new version of kexec system call. This system call is
+ file based and takes file descriptors as system call argument
+ for kernel and initramfs as opposed to list of segments as
+ accepted by kexec system call.
+
+config KEXEC_SIG
+ bool "Verify kernel signature during kexec_file_load() syscall"
+ depends on ARCH_SUPPORTS_KEXEC_SIG
+ depends on KEXEC_FILE
+ help
+ This option makes the kexec_file_load() syscall check for a valid
+ signature of the kernel image. The image can still be loaded without
+ a valid signature unless you also enable KEXEC_SIG_FORCE, though if
+ there's a signature that we can check, then it must be valid.
+
+ In addition to this option, you need to enable signature
+ verification for the corresponding kernel image type being
+ loaded in order for this to work.
+
+config KEXEC_SIG_FORCE
+ bool "Require a valid signature in kexec_file_load() syscall"
+ depends on ARCH_SUPPORTS_KEXEC_SIG_FORCE
+ depends on KEXEC_SIG
+ help
+ This option makes kernel signature verification mandatory for
+ the kexec_file_load() syscall.
+
+config KEXEC_IMAGE_VERIFY_SIG
+ bool "Enable Image signature verification support (ARM)"
+ default ARCH_DEFAULT_KEXEC_IMAGE_VERIFY_SIG
+ depends on ARCH_SUPPORTS_KEXEC_IMAGE_VERIFY_SIG
+ depends on KEXEC_SIG
+ depends on EFI && SIGNED_PE_FILE_VERIFICATION
+ help
+ Enable Image signature verification support.
+
+config KEXEC_BZIMAGE_VERIFY_SIG
+ bool "Enable bzImage signature verification support"
+ depends on ARCH_SUPPORTS_KEXEC_BZIMAGE_VERIFY_SIG
+ depends on KEXEC_SIG
+ depends on SIGNED_PE_FILE_VERIFICATION
+ select SYSTEM_TRUSTED_KEYRING
+ help
+ Enable bzImage signature verification support.
+
+config KEXEC_JUMP
+ bool "kexec jump"
+ depends on ARCH_SUPPORTS_KEXEC_JUMP
+ depends on KEXEC && HIBERNATION
+ help
+ Jump between original kernel and kexeced kernel and invoke
+ code in physical address mode via KEXEC
+
+config CRASH_DUMP
+ bool "kernel crash dumps"
+ default ARCH_DEFAULT_CRASH_DUMP
+ depends on ARCH_SUPPORTS_CRASH_DUMP
+ depends on KEXEC_CORE
+ select VMCORE_INFO
+ select CRASH_RESERVE
+ help
+ Generate crash dump after being started by kexec.
+ This should be normally only set in special crash dump kernels
+ which are loaded in the main kernel with kexec-tools into
+ a specially reserved region and then later executed after
+ a crash by kdump/kexec. The crash dump kernel must be compiled
+ to a memory address not used by the main kernel or BIOS using
+ PHYSICAL_START, or it must be built as a relocatable image
+ (CONFIG_RELOCATABLE=y).
+ For more details see Documentation/admin-guide/kdump/kdump.rst
+
+ For s390, this option also enables zfcpdump.
+ See also <file:Documentation/arch/s390/zfcpdump.rst>
+
+config CRASH_DM_CRYPT
+ bool "Support saving crash dump to dm-crypt encrypted volume"
+ depends on KEXEC_FILE
+ depends on CRASH_DUMP
+ depends on DM_CRYPT
+ depends on KEYS
+ help
+ With this option enabled, user space can intereact with
+ /sys/kernel/config/crash_dm_crypt_keys to make the dm crypt keys
+ persistent for the dump-capture kernel.
+
+config CRASH_DM_CRYPT_CONFIGS
+ def_tristate CRASH_DM_CRYPT
+ select CONFIGFS_FS
+ help
+ CRASH_DM_CRYPT cannot directly select CONFIGFS_FS, because that
+ is required to be built-in.
+
+config CRASH_DUMP_KUNIT_TEST
+ tristate "Unit Tests for kernel crash dumps" if !KUNIT_ALL_TESTS
+ depends on CRASH_DUMP && KUNIT
+ default KUNIT_ALL_TESTS
+ help
+ This option builds KUnit unit tests for kernel crash dumps. The unit
+ tests will be used to verify the correctness of covered functions and
+ also prevent any regression.
+
+ If unsure, say N.
+
+config CRASH_HOTPLUG
+ bool "Update the crash elfcorehdr on system configuration changes"
+ default y
+ depends on CRASH_DUMP && (HOTPLUG_CPU || MEMORY_HOTPLUG)
+ depends on ARCH_SUPPORTS_CRASH_HOTPLUG
+ help
+ Enable direct update to the crash elfcorehdr (which contains
+ the list of CPUs and memory regions to be dumped upon a crash)
+ in response to hot plug/unplug or online/offline of CPUs or
+ memory. This is a much more advanced approach than userspace
+ attempting that.
+
+ If unsure, say Y.
+
+config CRASH_MAX_MEMORY_RANGES
+ int "Specify the maximum number of memory regions for the elfcorehdr"
+ default 8192
+ depends on CRASH_HOTPLUG
+ help
+ For the kexec_file_load() syscall path, specify the maximum number of
+ memory regions that the elfcorehdr buffer/segment can accommodate.
+ These regions are obtained via walk_system_ram_res(); eg. the
+ 'System RAM' entries in /proc/iomem.
+ This value is combined with NR_CPUS_DEFAULT and multiplied by
+ sizeof(Elf64_Phdr) to determine the final elfcorehdr memory buffer/
+ segment size.
+ The value 8192, for example, covers a (sparsely populated) 1TiB system
+ consisting of 128MiB memblocks, while resulting in an elfcorehdr
+ memory buffer/segment size under 1MiB. This represents a sane choice
+ to accommodate both baremetal and virtual machine configurations.
+
+ For the kexec_load() syscall path, CRASH_MAX_MEMORY_RANGES is part of
+ the computation behind the value provided through the
+ /sys/kernel/crash_elfcorehdr_size attribute.
+
+endmenu
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 08561f1acd13..4198f0273ecd 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# The ARCH_INLINE foo is necessary because select ignores "depends on"
#
@@ -100,7 +101,7 @@ config UNINLINE_SPIN_UNLOCK
# unlock and unlock_irq functions are inlined when:
# - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y
# or
-# - DEBUG_SPINLOCK=n and PREEMPT=n
+# - DEBUG_SPINLOCK=n and PREEMPTION=n
#
# unlock_bh and unlock_irqrestore functions are inlined when:
# - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y
@@ -138,7 +139,7 @@ config INLINE_SPIN_UNLOCK_BH
config INLINE_SPIN_UNLOCK_IRQ
def_bool y
- depends on !PREEMPT || ARCH_INLINE_SPIN_UNLOCK_IRQ
+ depends on !PREEMPTION || ARCH_INLINE_SPIN_UNLOCK_IRQ
config INLINE_SPIN_UNLOCK_IRQRESTORE
def_bool y
@@ -167,7 +168,7 @@ config INLINE_READ_LOCK_IRQSAVE
config INLINE_READ_UNLOCK
def_bool y
- depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK
+ depends on !PREEMPTION || ARCH_INLINE_READ_UNLOCK
config INLINE_READ_UNLOCK_BH
def_bool y
@@ -175,7 +176,7 @@ config INLINE_READ_UNLOCK_BH
config INLINE_READ_UNLOCK_IRQ
def_bool y
- depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK_IRQ
+ depends on !PREEMPTION || ARCH_INLINE_READ_UNLOCK_IRQ
config INLINE_READ_UNLOCK_IRQRESTORE
def_bool y
@@ -204,7 +205,7 @@ config INLINE_WRITE_LOCK_IRQSAVE
config INLINE_WRITE_UNLOCK
def_bool y
- depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK
+ depends on !PREEMPTION || ARCH_INLINE_WRITE_UNLOCK
config INLINE_WRITE_UNLOCK_BH
def_bool y
@@ -212,7 +213,7 @@ config INLINE_WRITE_UNLOCK_BH
config INLINE_WRITE_UNLOCK_IRQ
def_bool y
- depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK_IRQ
+ depends on !PREEMPTION || ARCH_INLINE_WRITE_UNLOCK_IRQ
config INLINE_WRITE_UNLOCK_IRQRESTORE
def_bool y
@@ -225,19 +226,36 @@ config ARCH_SUPPORTS_ATOMIC_RMW
config MUTEX_SPIN_ON_OWNER
def_bool y
- depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW
+ depends on SMP && ARCH_SUPPORTS_ATOMIC_RMW
config RWSEM_SPIN_ON_OWNER
def_bool y
- depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW
+ depends on SMP && ARCH_SUPPORTS_ATOMIC_RMW
config LOCK_SPIN_ON_OWNER
def_bool y
depends on MUTEX_SPIN_ON_OWNER || RWSEM_SPIN_ON_OWNER
-config ARCH_USE_QUEUE_RWLOCK
+config ARCH_USE_QUEUED_SPINLOCKS
bool
-config QUEUE_RWLOCK
- def_bool y if ARCH_USE_QUEUE_RWLOCK
+config QUEUED_SPINLOCKS
+ def_bool y if ARCH_USE_QUEUED_SPINLOCKS
+ depends on SMP
+
+config BPF_ARCH_SPINLOCK
+ bool
+
+config ARCH_USE_QUEUED_RWLOCKS
+ bool
+
+config QUEUED_RWLOCKS
+ def_bool y if ARCH_USE_QUEUED_RWLOCKS
+ depends on SMP && !PREEMPT_RT
+
+config ARCH_HAS_MMIOWB
+ bool
+
+config MMIOWB
+ def_bool y if ARCH_HAS_MMIOWB
depends on SMP
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 3f9c97419f02..da326800c1c9 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -1,3 +1,18 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+config PREEMPT_NONE_BUILD
+ bool
+
+config PREEMPT_VOLUNTARY_BUILD
+ bool
+
+config PREEMPT_BUILD
+ bool
+ select PREEMPTION
+ select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
+
+config ARCH_HAS_PREEMPT_LAZY
+ bool
choice
prompt "Preemption Model"
@@ -5,6 +20,8 @@ choice
config PREEMPT_NONE
bool "No Forced Preemption (Server)"
+ depends on !PREEMPT_RT
+ select PREEMPT_NONE_BUILD if !PREEMPT_DYNAMIC
help
This is the traditional Linux preemption model, geared towards
throughput. It will still provide good latencies most of the
@@ -18,6 +35,9 @@ config PREEMPT_NONE
config PREEMPT_VOLUNTARY
bool "Voluntary Kernel Preemption (Desktop)"
+ depends on !ARCH_NO_PREEMPT
+ depends on !PREEMPT_RT
+ select PREEMPT_VOLUNTARY_BUILD if !PREEMPT_DYNAMIC
help
This option reduces the latency of the kernel by adding more
"explicit preemption points" to the kernel code. These new
@@ -35,8 +55,8 @@ config PREEMPT_VOLUNTARY
config PREEMPT
bool "Preemptible Kernel (Low-Latency Desktop)"
- select PREEMPT_COUNT
- select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
+ depends on !ARCH_NO_PREEMPT
+ select PREEMPT_BUILD if !PREEMPT_DYNAMIC
help
This option reduces the latency of the kernel by making
all kernel code (that is not executing in a critical section)
@@ -52,7 +72,120 @@ config PREEMPT
embedded system with latency requirements in the milliseconds
range.
+config PREEMPT_LAZY
+ bool "Scheduler controlled preemption model"
+ depends on !ARCH_NO_PREEMPT
+ depends on ARCH_HAS_PREEMPT_LAZY
+ select PREEMPT_BUILD if !PREEMPT_DYNAMIC
+ help
+ This option provides a scheduler driven preemption model that
+ is fundamentally similar to full preemption, but is less
+ eager to preempt SCHED_NORMAL tasks in an attempt to
+ reduce lock holder preemption and recover some of the performance
+ gains seen from using Voluntary preemption.
+
endchoice
+config PREEMPT_RT
+ bool "Fully Preemptible Kernel (Real-Time)"
+ depends on EXPERT && ARCH_SUPPORTS_RT && !COMPILE_TEST
+ select PREEMPTION
+ help
+ This option turns the kernel into a real-time kernel by replacing
+ various locking primitives (spinlocks, rwlocks, etc.) with
+ preemptible priority-inheritance aware variants, enforcing
+ interrupt threading and introducing mechanisms to break up long
+ non-preemptible sections. This makes the kernel, except for very
+ low level and critical code paths (entry code, scheduler, low
+ level interrupt handling) fully preemptible and brings most
+ execution contexts under scheduler control.
+
+ Select this if you are building a kernel for systems which
+ require real-time guarantees.
+
+config PREEMPT_RT_NEEDS_BH_LOCK
+ bool "Enforce softirq synchronisation on PREEMPT_RT"
+ depends on PREEMPT_RT
+ help
+ Enforce synchronisation across the softirqs context. On PREEMPT_RT
+ the softirq is preemptible. This enforces the same per-CPU BLK
+ semantic non-PREEMPT_RT builds have. This should not be needed
+ because per-CPU locks were added to avoid the per-CPU BKL.
+
+ This switch provides the old behaviour for testing reasons. Select
+ this if you suspect an error with preemptible softirq and want test
+ the old synchronized behaviour.
+
config PREEMPT_COUNT
- bool \ No newline at end of file
+ bool
+
+config PREEMPTION
+ bool
+ select PREEMPT_COUNT
+
+config PREEMPT_DYNAMIC
+ bool "Preemption behaviour defined on boot"
+ depends on HAVE_PREEMPT_DYNAMIC
+ select JUMP_LABEL if HAVE_PREEMPT_DYNAMIC_KEY
+ select PREEMPT_BUILD
+ default y if HAVE_PREEMPT_DYNAMIC_CALL
+ help
+ This option allows to define the preemption model on the kernel
+ command line parameter and thus override the default preemption
+ model defined during compile time.
+
+ The feature is primarily interesting for Linux distributions which
+ provide a pre-built kernel binary to reduce the number of kernel
+ flavors they offer while still offering different usecases.
+
+ The runtime overhead is negligible with HAVE_STATIC_CALL_INLINE enabled
+ but if runtime patching is not available for the specific architecture
+ then the potential overhead should be considered.
+
+ Interesting if you want the same pre-built kernel should be used for
+ both Server and Desktop workloads.
+
+config SCHED_CORE
+ bool "Core Scheduling for SMT"
+ depends on SCHED_SMT
+ help
+ This option permits Core Scheduling, a means of coordinated task
+ selection across SMT siblings. When enabled -- see
+ prctl(PR_SCHED_CORE) -- task selection ensures that all SMT siblings
+ will execute a task from the same 'core group', forcing idle when no
+ matching task is found.
+
+ Use of this feature includes:
+ - mitigation of some (not all) SMT side channels;
+ - limiting SMT interference to improve determinism and/or performance.
+
+ SCHED_CORE is default disabled. When it is enabled and unused,
+ which is the likely usage by Linux distributions, there should
+ be no measurable impact on performance.
+
+config SCHED_CLASS_EXT
+ bool "Extensible Scheduling Class"
+ depends on BPF_SYSCALL && BPF_JIT && DEBUG_INFO_BTF
+ select STACKTRACE if STACKTRACE_SUPPORT
+ help
+ This option enables a new scheduler class sched_ext (SCX), which
+ allows scheduling policies to be implemented as BPF programs to
+ achieve the following:
+
+ - Ease of experimentation and exploration: Enabling rapid
+ iteration of new scheduling policies.
+ - Customization: Building application-specific schedulers which
+ implement policies that are not applicable to general-purpose
+ schedulers.
+ - Rapid scheduler deployments: Non-disruptive swap outs of
+ scheduling policies in production environments.
+
+ sched_ext leverages BPF struct_ops feature to define a structure
+ which exports function callbacks and flags to BPF programs that
+ wish to implement scheduling policies. The struct_ops structure
+ exported by sched_ext is struct sched_ext_ops, and is conceptually
+ similar to struct sched_class.
+
+ For more information:
+ Documentation/scheduler/sched-ext.rst
+ https://github.com/sched-ext/scx
diff --git a/kernel/Makefile b/kernel/Makefile
index 60c302cfb4d3..e83669841b8c 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -1,26 +1,49 @@
+# SPDX-License-Identifier: GPL-2.0
#
# Makefile for the linux kernel.
#
obj-y = fork.o exec_domain.o panic.o \
cpu.o exit.o softirq.o resource.o \
- sysctl.o sysctl_binary.o capability.o ptrace.o user.o \
- signal.o sys.o kmod.o workqueue.o pid.o task_work.o \
+ sysctl.o capability.o ptrace.o user.o \
+ signal.o sys.o umh.o workqueue.o pid.o task_work.o \
extable.o params.o \
- kthread.o sys_ni.o nsproxy.o \
+ kthread.o sys_ni.o nsproxy.o nstree.o nscommon.o \
notifier.o ksysfs.o cred.o reboot.o \
- async.o range.o smpboot.o
+ async.o range.o smpboot.o ucount.o regset.o ksyms_common.o
obj-$(CONFIG_MULTIUSER) += groups.o
+obj-$(CONFIG_VHOST_TASK) += vhost_task.o
ifdef CONFIG_FUNCTION_TRACER
-# Do not trace debug files and internal ftrace files
-CFLAGS_REMOVE_cgroup-debug.o = $(CC_FLAGS_FTRACE)
+# Do not trace internal ftrace files
CFLAGS_REMOVE_irq_work.o = $(CC_FLAGS_FTRACE)
endif
-# cond_syscall is currently not LTO compatible
-CFLAGS_sys_ni.o = $(DISABLE_LTO)
+# Branch profiling isn't noinstr-safe
+ifdef CONFIG_TRACE_BRANCH_PROFILING
+CFLAGS_context_tracking.o += -DDISABLE_BRANCH_PROFILING
+endif
+
+# Prevents flicker of uninteresting __do_softirq()/__local_bh_disable_ip()
+# in coverage traces.
+KCOV_INSTRUMENT_softirq.o := n
+# Avoid KCSAN instrumentation in softirq ("No shared variables, all the data
+# are CPU local" => assume no data races), to reduce overhead in interrupts.
+KCSAN_SANITIZE_softirq.o = n
+# These are called from save_stack_trace() on slub debug path,
+# and produce insane amounts of uninteresting coverage.
+KCOV_INSTRUMENT_extable.o := n
+KCOV_INSTRUMENT_stacktrace.o := n
+# Don't self-instrument.
+KCOV_INSTRUMENT_kcov.o := n
+# If sanitizers detect any issues in kcov, it may lead to recursion
+# via printk, etc.
+KASAN_SANITIZE_kcov.o := n
+KCSAN_SANITIZE_kcov.o := n
+UBSAN_SANITIZE_kcov.o := n
+KMSAN_SANITIZE_kcov.o := n
+CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack) -fno-stack-protector
obj-y += sched/
obj-y += locking/
@@ -29,48 +52,57 @@ obj-y += printk/
obj-y += irq/
obj-y += rcu/
obj-y += livepatch/
+obj-y += liveupdate/
+obj-y += dma/
+obj-y += entry/
+obj-y += unwind/
+obj-$(CONFIG_MODULES) += module/
-obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o
+obj-$(CONFIG_KCMP) += kcmp.o
obj-$(CONFIG_FREEZER) += freezer.o
obj-$(CONFIG_PROFILING) += profile.o
obj-$(CONFIG_STACKTRACE) += stacktrace.o
obj-y += time/
-obj-$(CONFIG_FUTEX) += futex.o
-ifeq ($(CONFIG_COMPAT),y)
-obj-$(CONFIG_FUTEX) += futex_compat.o
-endif
+obj-$(CONFIG_FUTEX) += futex/
obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
obj-$(CONFIG_SMP) += smp.o
ifneq ($(CONFIG_SMP),y)
obj-y += up.o
endif
obj-$(CONFIG_UID16) += uid16.o
-obj-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += system_keyring.o system_certificates.o
-obj-$(CONFIG_MODULES) += module.o
-obj-$(CONFIG_MODULE_SIG) += module_signing.o
+obj-$(CONFIG_MODULE_SIG_FORMAT) += module_signature.o
obj-$(CONFIG_KALLSYMS) += kallsyms.o
+obj-$(CONFIG_KALLSYMS_SELFTEST) += kallsyms_selftest.o
obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
+obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o elfcorehdr.o
+obj-$(CONFIG_CRASH_RESERVE) += crash_reserve.o
+obj-$(CONFIG_KEXEC_CORE) += kexec_core.o
+obj-$(CONFIG_CRASH_DUMP) += crash_core.o
+obj-$(CONFIG_CRASH_DM_CRYPT) += crash_dump_dm_crypt.o
+obj-$(CONFIG_CRASH_DUMP_KUNIT_TEST) += crash_core_test.o
obj-$(CONFIG_KEXEC) += kexec.o
+obj-$(CONFIG_KEXEC_FILE) += kexec_file.o
+obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o
obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
obj-$(CONFIG_COMPAT) += compat.o
-obj-$(CONFIG_CGROUPS) += cgroup.o
-obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
-obj-$(CONFIG_CPUSETS) += cpuset.o
+obj-$(CONFIG_CGROUPS) += cgroup/
obj-$(CONFIG_UTS_NS) += utsname.o
obj-$(CONFIG_USER_NS) += user_namespace.o
obj-$(CONFIG_PID_NS) += pid_namespace.o
obj-$(CONFIG_IKCONFIG) += configs.o
+obj-$(CONFIG_IKHEADERS) += kheaders.o
obj-$(CONFIG_SMP) += stop_machine.o
-obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
-obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
-obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o
-obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
+obj-$(CONFIG_AUDITSYSCALL) += auditsc.o audit_watch.o audit_fsnotify.o audit_tree.o
obj-$(CONFIG_GCOV_KERNEL) += gcov/
+obj-$(CONFIG_KCOV) += kcov.o
obj-$(CONFIG_KPROBES) += kprobes.o
+obj-$(CONFIG_FAIL_FUNCTION) += fail_function.o
obj-$(CONFIG_KGDB) += debug/
obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
+obj-$(CONFIG_HARDLOCKUP_DETECTOR_BUDDY) += watchdog_buddy.o
+obj-$(CONFIG_HARDLOCKUP_DETECTOR_PERF) += watchdog_perf.o
obj-$(CONFIG_SECCOMP) += seccomp.o
obj-$(CONFIG_RELAY) += relay.o
obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
@@ -78,132 +110,98 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
obj-$(CONFIG_TRACEPOINTS) += tracepoint.o
obj-$(CONFIG_LATENCYTOP) += latencytop.o
-obj-$(CONFIG_BINFMT_ELF) += elfcore.o
-obj-$(CONFIG_COMPAT_BINFMT_ELF) += elfcore.o
-obj-$(CONFIG_BINFMT_ELF_FDPIC) += elfcore.o
obj-$(CONFIG_FUNCTION_TRACER) += trace/
obj-$(CONFIG_TRACING) += trace/
obj-$(CONFIG_TRACE_CLOCK) += trace/
obj-$(CONFIG_RING_BUFFER) += trace/
obj-$(CONFIG_TRACEPOINTS) += trace/
+obj-$(CONFIG_RETHOOK) += trace/
obj-$(CONFIG_IRQ_WORK) += irq_work.o
obj-$(CONFIG_CPU_PM) += cpu_pm.o
obj-$(CONFIG_BPF) += bpf/
+obj-$(CONFIG_KCSAN) += kcsan/
+obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o
+obj-$(CONFIG_HAVE_STATIC_CALL) += static_call.o
+obj-$(CONFIG_HAVE_STATIC_CALL_INLINE) += static_call_inline.o
+obj-$(CONFIG_CFI) += cfi.o
obj-$(CONFIG_PERF_EVENTS) += events/
obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
obj-$(CONFIG_PADATA) += padata.o
-obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
obj-$(CONFIG_JUMP_LABEL) += jump_label.o
obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
obj-$(CONFIG_TORTURE_TEST) += torture.o
-$(obj)/configs.o: $(obj)/config_data.h
+obj-$(CONFIG_HAS_IOMEM) += iomem.o
+obj-$(CONFIG_RSEQ) += rseq.o
+obj-$(CONFIG_WATCH_QUEUE) += watch_queue.o
-# config_data.h contains the same information as ikconfig.h but gzipped.
-# Info from config_data can be extracted from /proc/config*
-targets += config_data.gz
-$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
- $(call if_changed,gzip)
+obj-$(CONFIG_RESOURCE_KUNIT_TEST) += resource_kunit.o
+obj-$(CONFIG_SYSCTL_KUNIT_TEST) += sysctl-test.o
- filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/basic/bin2c; echo "MAGIC_END;")
-targets += config_data.h
-$(obj)/config_data.h: $(obj)/config_data.gz FORCE
- $(call filechk,ikconfiggz)
+CFLAGS_kstack_erase.o += $(DISABLE_KSTACK_ERASE)
+CFLAGS_kstack_erase.o += $(call cc-option,-mgeneral-regs-only)
+obj-$(CONFIG_KSTACK_ERASE) += kstack_erase.o
+KASAN_SANITIZE_kstack_erase.o := n
+KCSAN_SANITIZE_kstack_erase.o := n
+KCOV_INSTRUMENT_kstack_erase.o := n
-###############################################################################
-#
-# Roll all the X.509 certificates that we can find together and pull them into
-# the kernel so that they get loaded into the system trusted keyring during
-# boot.
-#
-# We look in the source root and the build root for all files whose name ends
-# in ".x509". Unfortunately, this will generate duplicate filenames, so we
-# have make canonicalise the pathnames and then sort them to discard the
-# duplicates.
-#
-###############################################################################
-ifeq ($(CONFIG_SYSTEM_TRUSTED_KEYRING),y)
-X509_CERTIFICATES-y := $(wildcard *.x509) $(wildcard $(srctree)/*.x509)
-X509_CERTIFICATES-$(CONFIG_MODULE_SIG) += $(objtree)/signing_key.x509
-X509_CERTIFICATES-raw := $(sort $(foreach CERT,$(X509_CERTIFICATES-y), \
- $(or $(realpath $(CERT)),$(CERT))))
-X509_CERTIFICATES := $(subst $(realpath $(objtree))/,,$(X509_CERTIFICATES-raw))
-
-ifeq ($(X509_CERTIFICATES),)
-$(warning *** No X.509 certificates found ***)
-endif
+obj-$(CONFIG_SCF_TORTURE_TEST) += scftorture.o
-ifneq ($(wildcard $(obj)/.x509.list),)
-ifneq ($(shell cat $(obj)/.x509.list),$(X509_CERTIFICATES))
-$(info X.509 certificate list changed)
-$(shell rm $(obj)/.x509.list)
-endif
-endif
+$(obj)/configs.o: $(obj)/config_data.gz
+
+targets += config_data config_data.gz
+$(obj)/config_data.gz: $(obj)/config_data FORCE
+ $(call if_changed,gzip)
-kernel/system_certificates.o: $(obj)/x509_certificate_list
+filechk_cat = cat $<
-quiet_cmd_x509certs = CERTS $@
- cmd_x509certs = cat $(X509_CERTIFICATES) /dev/null >$@ $(foreach X509,$(X509_CERTIFICATES),; $(kecho) " - Including cert $(X509)")
+$(obj)/config_data: $(KCONFIG_CONFIG) FORCE
+ $(call filechk,cat)
-targets += $(obj)/x509_certificate_list
-$(obj)/x509_certificate_list: $(X509_CERTIFICATES) $(obj)/.x509.list
- $(call if_changed,x509certs)
+# kheaders_data.tar.xz
+$(obj)/kheaders.o: $(obj)/kheaders_data.tar.xz
-targets += $(obj)/.x509.list
-$(obj)/.x509.list:
- @echo $(X509_CERTIFICATES) >$@
-endif
+quiet_cmd_kheaders_data = GEN $@
+ cmd_kheaders_data = "$<" "$@" "$(obj)/kheaders-srclist" "$(obj)/kheaders-objlist" "$(KBUILD_BUILD_TIMESTAMP)"
+ cmd_kheaders_data_dep = cat $(depfile) >> $(dot-target).cmd; rm -f $(depfile)
-clean-files := x509_certificate_list .x509.list
+define rule_kheaders_data
+ $(call cmd_and_savecmd,kheaders_data)
+ $(call cmd,kheaders_data_dep)
+endef
-ifeq ($(CONFIG_MODULE_SIG),y)
-###############################################################################
-#
-# If module signing is requested, say by allyesconfig, but a key has not been
-# supplied, then one will need to be generated to make sure the build does not
-# fail and that the kernel may be used afterwards.
+targets += kheaders_data.tar.xz
+$(obj)/kheaders_data.tar.xz: $(src)/gen_kheaders.sh $(obj)/kheaders-srclist $(obj)/kheaders-objlist $(obj)/kheaders.md5 FORCE
+ $(call if_changed_rule,kheaders_data)
+
+# generated headers in objtree
#
-###############################################################################
-ifndef CONFIG_MODULE_SIG_HASH
-$(error Could not determine digest type to use from kernel config)
-endif
+# include/generated/utsversion.h is ignored because it is generated
+# after gen_kheaders.sh is executed. (utsversion.h is unneeded for kheaders)
+filechk_kheaders_objlist = \
+ for d in include "arch/$(SRCARCH)/include"; do \
+ find "$${d}/generated" ! -path "include/generated/utsversion.h" -a -name "*.h" -print; \
+ done
-signing_key.priv signing_key.x509: x509.genkey
- @echo "###"
- @echo "### Now generating an X.509 key pair to be used for signing modules."
- @echo "###"
- @echo "### If this takes a long time, you might wish to run rngd in the"
- @echo "### background to keep the supply of entropy topped up. It"
- @echo "### needs to be run as root, and uses a hardware random"
- @echo "### number generator if one is available."
- @echo "###"
- openssl req -new -nodes -utf8 -$(CONFIG_MODULE_SIG_HASH) -days 36500 \
- -batch -x509 -config x509.genkey \
- -outform DER -out signing_key.x509 \
- -keyout signing_key.priv 2>&1
- @echo "###"
- @echo "### Key pair generated."
- @echo "###"
-
-x509.genkey:
- @echo Generating X.509 key generation config
- @echo >x509.genkey "[ req ]"
- @echo >>x509.genkey "default_bits = 4096"
- @echo >>x509.genkey "distinguished_name = req_distinguished_name"
- @echo >>x509.genkey "prompt = no"
- @echo >>x509.genkey "string_mask = utf8only"
- @echo >>x509.genkey "x509_extensions = myexts"
- @echo >>x509.genkey
- @echo >>x509.genkey "[ req_distinguished_name ]"
- @echo >>x509.genkey "#O = Unspecified company"
- @echo >>x509.genkey "CN = Build time autogenerated kernel key"
- @echo >>x509.genkey "#emailAddress = unspecified.user@unspecified.company"
- @echo >>x509.genkey
- @echo >>x509.genkey "[ myexts ]"
- @echo >>x509.genkey "basicConstraints=critical,CA:FALSE"
- @echo >>x509.genkey "keyUsage=digitalSignature"
- @echo >>x509.genkey "subjectKeyIdentifier=hash"
- @echo >>x509.genkey "authorityKeyIdentifier=keyid"
-endif
+$(obj)/kheaders-objlist: FORCE
+ $(call filechk,kheaders_objlist)
+
+# non-generated headers in srctree
+filechk_kheaders_srclist = \
+ for d in include "arch/$(SRCARCH)/include"; do \
+ find "$(srctree)/$${d}" -path "$(srctree)/$${d}/generated" -prune -o -name "*.h" -print; \
+ done
+
+$(obj)/kheaders-srclist: FORCE
+ $(call filechk,kheaders_srclist)
+
+# Some files are symlinks. If symlinks are changed, kheaders_data.tar.xz should
+# be rebuilt.
+filechk_kheaders_md5sum = xargs -r -a $< stat -c %N | md5sum
+
+$(obj)/kheaders.md5: $(obj)/kheaders-srclist FORCE
+ $(call filechk,kheaders_md5sum)
+
+clean-files := kheaders.md5 kheaders-srclist kheaders-objlist
diff --git a/kernel/acct.c b/kernel/acct.c
index 74963d192c5d..2a2b3c874acd 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/kernel/acct.c
*
@@ -24,7 +25,7 @@
* Now we silently close acct_file on attempt to reopen. Cleaned sys_acct().
* XTerms and EMACS are manifestations of pure evil. 21/10/98, AV.
*
- * Fixed a nasty interaction with with sys_umount(). If the accointing
+ * Fixed a nasty interaction with sys_umount(). If the accounting
* was suspeneded we failed to stop it on umount(). Messy.
* Another one: remount to readonly didn't stop accounting.
* Question: what should we do if we have CAP_SYS_ADMIN but not
@@ -39,25 +40,21 @@
* is one more bug... 10/11/98, AV.
*
* Oh, fsck... Oopsable SMP race in do_process_acct() - we must hold
- * ->mmap_sem to walk the vma list of current->mm. Nasty, since it leaks
+ * ->mmap_lock to walk the vma list of current->mm. Nasty, since it leaks
* a struct file opened for write. Fixed. 2/6/2000, AV.
*/
-#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/acct.h>
#include <linux/capability.h>
-#include <linux/file.h>
#include <linux/tty.h>
-#include <linux/security.h>
-#include <linux/vfs.h>
+#include <linux/statfs.h>
#include <linux/jiffies.h>
-#include <linux/times.h>
#include <linux/syscalls.h>
-#include <linux/mount.h>
-#include <linux/uaccess.h>
+#include <linux/namei.h>
+#include <linux/sched/cputime.h>
+
#include <asm/div64.h>
-#include <linux/blkdev.h> /* sector_div */
#include <linux/pid_namespace.h>
#include <linux/fs_pin.h>
@@ -68,11 +65,30 @@
* Turned into sysctl-controllable parameters. AV, 12/11/98
*/
-int acct_parm[3] = {4, 2, 30};
+static int acct_parm[3] = {4, 2, 30};
#define RESUME (acct_parm[0]) /* >foo% free space - resume */
#define SUSPEND (acct_parm[1]) /* <foo% free space - suspend */
#define ACCT_TIMEOUT (acct_parm[2]) /* foo second timeout between checks */
+#ifdef CONFIG_SYSCTL
+static const struct ctl_table kern_acct_table[] = {
+ {
+ .procname = "acct",
+ .data = &acct_parm,
+ .maxlen = 3*sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+};
+
+static __init int kernel_acct_sysctls_init(void)
+{
+ register_sysctl_init("kernel", kern_acct_table);
+ return 0;
+}
+late_initcall(kernel_acct_sysctls_init);
+#endif /* CONFIG_SYSCTL */
+
/*
* External references and all of the globals.
*/
@@ -82,48 +98,50 @@ struct bsd_acct_struct {
atomic_long_t count;
struct rcu_head rcu;
struct mutex lock;
- int active;
+ bool active;
+ bool check_space;
unsigned long needcheck;
struct file *file;
struct pid_namespace *ns;
struct work_struct work;
struct completion done;
+ acct_t ac;
};
-static void do_acct_process(struct bsd_acct_struct *acct);
+static void fill_ac(struct bsd_acct_struct *acct);
+static void acct_write_process(struct bsd_acct_struct *acct);
/*
* Check the amount of free space and suspend/resume accordingly.
*/
-static int check_free_space(struct bsd_acct_struct *acct)
+static bool check_free_space(struct bsd_acct_struct *acct)
{
struct kstatfs sbuf;
- if (time_is_before_jiffies(acct->needcheck))
- goto out;
+ if (!acct->check_space)
+ return acct->active;
/* May block */
if (vfs_statfs(&acct->file->f_path, &sbuf))
- goto out;
+ return acct->active;
if (acct->active) {
u64 suspend = sbuf.f_blocks * SUSPEND;
do_div(suspend, 100);
if (sbuf.f_bavail <= suspend) {
- acct->active = 0;
+ acct->active = false;
pr_info("Process accounting paused\n");
}
} else {
u64 resume = sbuf.f_blocks * RESUME;
do_div(resume, 100);
if (sbuf.f_bavail >= resume) {
- acct->active = 1;
+ acct->active = true;
pr_info("Process accounting resumed\n");
}
}
acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
-out:
return acct->active;
}
@@ -144,7 +162,7 @@ static struct bsd_acct_struct *acct_get(struct pid_namespace *ns)
again:
smp_rmb();
rcu_read_lock();
- res = to_acct(ACCESS_ONCE(ns->bacct));
+ res = to_acct(READ_ONCE(ns->bacct));
if (!res) {
rcu_read_unlock();
return NULL;
@@ -156,7 +174,7 @@ again:
}
rcu_read_unlock();
mutex_lock(&res->lock);
- if (res != to_acct(ACCESS_ONCE(ns->bacct))) {
+ if (res != to_acct(READ_ONCE(ns->bacct))) {
mutex_unlock(&res->lock);
acct_put(res);
goto again;
@@ -168,7 +186,11 @@ static void acct_pin_kill(struct fs_pin *pin)
{
struct bsd_acct_struct *acct = to_acct(pin);
mutex_lock(&acct->lock);
- do_acct_process(acct);
+ /*
+ * Fill the accounting struct with the exiting task's info
+ * before punting to the workqueue.
+ */
+ fill_ac(acct);
schedule_work(&acct->work);
wait_for_completion(&acct->done);
cmpxchg(&acct->ns->bacct, pin, NULL);
@@ -181,76 +203,79 @@ static void close_work(struct work_struct *work)
{
struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work);
struct file *file = acct->file;
+
+ /* We were fired by acct_pin_kill() which holds acct->lock. */
+ acct_write_process(acct);
if (file->f_op->flush)
file->f_op->flush(file, NULL);
__fput_sync(file);
complete(&acct->done);
}
-static int acct_on(struct filename *pathname)
+DEFINE_FREE(fput_sync, struct file *, if (!IS_ERR_OR_NULL(_T)) __fput_sync(_T))
+static int acct_on(const char __user *name)
{
- struct file *file;
- struct vfsmount *mnt, *internal;
+ /* Difference from BSD - they don't do O_APPEND */
+ const int open_flags = O_WRONLY|O_APPEND|O_LARGEFILE;
struct pid_namespace *ns = task_active_pid_ns(current);
+ struct filename *pathname __free(putname) = getname(name);
+ struct file *original_file __free(fput) = NULL; // in that order
+ struct path internal __free(path_put) = {}; // in that order
+ struct file *file __free(fput_sync) = NULL; // in that order
struct bsd_acct_struct *acct;
+ struct vfsmount *mnt;
struct fs_pin *old;
- int err;
- acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL);
- if (!acct)
- return -ENOMEM;
+ if (IS_ERR(pathname))
+ return PTR_ERR(pathname);
+ original_file = file_open_name(pathname, open_flags, 0);
+ if (IS_ERR(original_file))
+ return PTR_ERR(original_file);
- /* Difference from BSD - they don't do O_APPEND */
- file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
- if (IS_ERR(file)) {
- kfree(acct);
+ mnt = mnt_clone_internal(&original_file->f_path);
+ if (IS_ERR(mnt))
+ return PTR_ERR(mnt);
+
+ internal.mnt = mnt;
+ internal.dentry = dget(mnt->mnt_root);
+
+ file = dentry_open(&internal, open_flags, current_cred());
+ if (IS_ERR(file))
return PTR_ERR(file);
- }
- if (!S_ISREG(file_inode(file)->i_mode)) {
- kfree(acct);
- filp_close(file, NULL);
+ if (!S_ISREG(file_inode(file)->i_mode))
return -EACCES;
- }
- if (!(file->f_mode & FMODE_CAN_WRITE)) {
- kfree(acct);
- filp_close(file, NULL);
+ /* Exclude kernel kernel internal filesystems. */
+ if (file_inode(file)->i_sb->s_flags & (SB_NOUSER | SB_KERNMOUNT))
+ return -EINVAL;
+
+ /* Exclude procfs and sysfs. */
+ if (file_inode(file)->i_sb->s_iflags & SB_I_USERNS_VISIBLE)
+ return -EINVAL;
+
+ if (!(file->f_mode & FMODE_CAN_WRITE))
return -EIO;
- }
- internal = mnt_clone_internal(&file->f_path);
- if (IS_ERR(internal)) {
- kfree(acct);
- filp_close(file, NULL);
- return PTR_ERR(internal);
- }
- err = mnt_want_write(internal);
- if (err) {
- mntput(internal);
- kfree(acct);
- filp_close(file, NULL);
- return err;
- }
- mnt = file->f_path.mnt;
- file->f_path.mnt = internal;
+
+ acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL);
+ if (!acct)
+ return -ENOMEM;
atomic_long_set(&acct->count, 1);
init_fs_pin(&acct->pin, acct_pin_kill);
- acct->file = file;
+ acct->file = no_free_ptr(file);
acct->needcheck = jiffies;
acct->ns = ns;
mutex_init(&acct->lock);
INIT_WORK(&acct->work, close_work);
init_completion(&acct->done);
mutex_lock_nested(&acct->lock, 1); /* nobody has seen it yet */
- pin_insert(&acct->pin, mnt);
+ pin_insert(&acct->pin, original_file->f_path.mnt);
rcu_read_lock();
old = xchg(&ns->bacct, &acct->pin);
mutex_unlock(&acct->lock);
pin_kill(old);
- mnt_drop_write(mnt);
- mntput(mnt);
return 0;
}
@@ -260,12 +285,12 @@ static DEFINE_MUTEX(acct_on_mutex);
* sys_acct - enable/disable process accounting
* @name: file name for accounting records or NULL to shutdown accounting
*
- * Returns 0 for success or negative errno values for failure.
- *
* sys_acct() is the only system call needed to implement process
* accounting. It takes the name of the file where accounting records
* should be written. If the filename is NULL, accounting will be
* shutdown.
+ *
+ * Returns: 0 for success or negative errno values for failure.
*/
SYSCALL_DEFINE1(acct, const char __user *, name)
{
@@ -275,14 +300,9 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
return -EPERM;
if (name) {
- struct filename *tmp = getname(name);
-
- if (IS_ERR(tmp))
- return PTR_ERR(tmp);
mutex_lock(&acct_on_mutex);
- error = acct_on(tmp);
+ error = acct_on(name);
mutex_unlock(&acct_on_mutex);
- putname(tmp);
} else {
rcu_read_lock();
pin_kill(task_active_pid_ns(current)->bacct);
@@ -298,7 +318,7 @@ void acct_exit_ns(struct pid_namespace *ns)
}
/*
- * encode an unsigned long into a comp_t
+ * encode an u64 into a comp_t
*
* This routine has been adopted from the encode_comp_t() function in
* the kern_acct.c file of the FreeBSD operating system. The encoding
@@ -309,7 +329,7 @@ void acct_exit_ns(struct pid_namespace *ns)
#define EXPSIZE 3 /* Base 8 (3 bit) exponent. */
#define MAXFRACT ((1 << MANTSIZE) - 1) /* Maximum fractional value. */
-static comp_t encode_comp_t(unsigned long value)
+static comp_t encode_comp_t(u64 value)
{
int exp, rnd;
@@ -328,6 +348,8 @@ static comp_t encode_comp_t(unsigned long value)
exp++;
}
+ if (exp > (((comp_t) ~0U) >> MANTSIZE))
+ return (comp_t) ~0U;
/*
* Clean it up and polish it off.
*/
@@ -378,9 +400,7 @@ static comp2_t encode_comp2_t(u64 value)
return (value & (MAXFRACT2>>1)) | (exp << (MANTSIZE2-1));
}
}
-#endif
-
-#if ACCT_VERSION == 3
+#elif ACCT_VERSION == 3
/*
* encode an u64 into a 32 bit IEEE float
*/
@@ -409,12 +429,27 @@ static u32 encode_float(u64 value)
* do_exit() or when switching to a different output file.
*/
-static void fill_ac(acct_t *ac)
+static void fill_ac(struct bsd_acct_struct *acct)
{
struct pacct_struct *pacct = &current->signal->pacct;
+ struct file *file = acct->file;
+ acct_t *ac = &acct->ac;
u64 elapsed, run_time;
+ time64_t btime;
struct tty_struct *tty;
+ lockdep_assert_held(&acct->lock);
+
+ if (time_is_after_jiffies(acct->needcheck)) {
+ acct->check_space = false;
+
+ /* Don't fill in @ac if nothing will be written. */
+ if (!acct->active)
+ return;
+ } else {
+ acct->check_space = true;
+ }
+
/*
* Fill the accounting struct with the needed info as recorded
* by the different kernel functions.
@@ -422,7 +457,7 @@ static void fill_ac(acct_t *ac)
memset(ac, 0, sizeof(acct_t));
ac->ac_version = ACCT_VERSION | ACCT_BYTEORDER;
- strlcpy(ac->ac_comm, current->comm, sizeof(ac->ac_comm));
+ strscpy(ac->ac_comm, current->comm, sizeof(ac->ac_comm));
/* calculate run_time in nsec*/
run_time = ktime_get_ns();
@@ -445,81 +480,75 @@ static void fill_ac(acct_t *ac)
}
#endif
do_div(elapsed, AHZ);
- ac->ac_btime = get_seconds() - elapsed;
-#if ACCT_VERSION==2
+ btime = ktime_get_real_seconds() - elapsed;
+ ac->ac_btime = clamp_t(time64_t, btime, 0, U32_MAX);
+#if ACCT_VERSION == 2
ac->ac_ahz = AHZ;
#endif
spin_lock_irq(&current->sighand->siglock);
tty = current->signal->tty; /* Safe as we hold the siglock */
ac->ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
- ac->ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
- ac->ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
+ ac->ac_utime = encode_comp_t(nsec_to_AHZ(pacct->ac_utime));
+ ac->ac_stime = encode_comp_t(nsec_to_AHZ(pacct->ac_stime));
ac->ac_flag = pacct->ac_flag;
ac->ac_mem = encode_comp_t(pacct->ac_mem);
ac->ac_minflt = encode_comp_t(pacct->ac_minflt);
ac->ac_majflt = encode_comp_t(pacct->ac_majflt);
ac->ac_exitcode = pacct->ac_exitcode;
spin_unlock_irq(&current->sighand->siglock);
-}
-/*
- * do_acct_process does all actual work. Caller holds the reference to file.
- */
-static void do_acct_process(struct bsd_acct_struct *acct)
-{
- acct_t ac;
- unsigned long flim;
- const struct cred *orig_cred;
- struct file *file = acct->file;
- /*
- * Accounting records are not subject to resource limits.
- */
- flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
- current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
- /* Perform file operations on behalf of whoever enabled accounting */
- orig_cred = override_creds(file->f_cred);
-
- /*
- * First check to see if there is enough free_space to continue
- * the process accounting system.
- */
- if (!check_free_space(acct))
- goto out;
-
- fill_ac(&ac);
/* we really need to bite the bullet and change layout */
- ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid);
- ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid);
+ ac->ac_uid = from_kuid_munged(file->f_cred->user_ns, current_uid());
+ ac->ac_gid = from_kgid_munged(file->f_cred->user_ns, current_gid());
#if ACCT_VERSION == 1 || ACCT_VERSION == 2
/* backward-compatible 16 bit fields */
- ac.ac_uid16 = ac.ac_uid;
- ac.ac_gid16 = ac.ac_gid;
-#endif
-#if ACCT_VERSION == 3
+ ac->ac_uid16 = ac->ac_uid;
+ ac->ac_gid16 = ac->ac_gid;
+#elif ACCT_VERSION == 3
{
struct pid_namespace *ns = acct->ns;
- ac.ac_pid = task_tgid_nr_ns(current, ns);
+ ac->ac_pid = task_tgid_nr_ns(current, ns);
rcu_read_lock();
- ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent),
- ns);
+ ac->ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns);
rcu_read_unlock();
}
#endif
- /*
- * Get freeze protection. If the fs is frozen, just skip the write
- * as we could deadlock the system otherwise.
- */
- if (file_start_write_trylock(file)) {
- /* it's been opened O_APPEND, so position is irrelevant */
- loff_t pos = 0;
- __kernel_write(file, (char *)&ac, sizeof(acct_t), &pos);
- file_end_write(file);
+}
+
+static void acct_write_process(struct bsd_acct_struct *acct)
+{
+ struct file *file = acct->file;
+ acct_t *ac = &acct->ac;
+
+ /* Perform file operations on behalf of whoever enabled accounting */
+ scoped_with_creds(file->f_cred) {
+ /*
+ * First check to see if there is enough free_space to continue
+ * the process accounting system. Then get freeze protection. If
+ * the fs is frozen, just skip the write as we could deadlock
+ * the system otherwise.
+ */
+ if (check_free_space(acct) && file_start_write_trylock(file)) {
+ /* it's been opened O_APPEND, so position is irrelevant */
+ loff_t pos = 0;
+ __kernel_write(file, ac, sizeof(acct_t), &pos);
+ file_end_write(file);
+ }
}
-out:
+}
+
+static void do_acct_process(struct bsd_acct_struct *acct)
+{
+ unsigned long flim;
+
+ /* Accounting records are not subject to resource limits. */
+ flim = rlimit(RLIMIT_FSIZE);
+ current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
+ fill_ac(acct);
+ acct_write_process(acct);
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
- revert_creds(orig_cred);
}
/**
@@ -530,19 +559,18 @@ out:
void acct_collect(long exitcode, int group_dead)
{
struct pacct_struct *pacct = &current->signal->pacct;
- cputime_t utime, stime;
+ u64 utime, stime;
unsigned long vsize = 0;
if (group_dead && current->mm) {
+ struct mm_struct *mm = current->mm;
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;
- down_read(&current->mm->mmap_sem);
- vma = current->mm->mmap;
- while (vma) {
+ mmap_read_lock(mm);
+ for_each_vma(vmi, vma)
vsize += vma->vm_end - vma->vm_start;
- vma = vma->vm_next;
- }
- up_read(&current->mm->mmap_sem);
+ mmap_read_unlock(mm);
}
spin_lock_irq(&current->sighand->siglock);
@@ -559,6 +587,7 @@ void acct_collect(long exitcode, int group_dead)
pacct->ac_flag |= ACORE;
if (current->flags & PF_SIGNALED)
pacct->ac_flag |= AXSIG;
+
task_cputime(current, &utime, &stime);
pacct->ac_utime += utime;
pacct->ac_stime += stime;
@@ -580,9 +609,7 @@ static void slow_acct_process(struct pid_namespace *ns)
}
/**
- * acct_process
- *
- * handles process accounting for an exiting task
+ * acct_process - handles process accounting for an exiting task
*/
void acct_process(void)
{
diff --git a/kernel/async.c b/kernel/async.c
index 4c3773c0bf63..4c3e6a44595f 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -1,13 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* async.c: Asynchronous function calls for boot performance
*
* (C) Copyright 2009 Intel Corporation
* Author: Arjan van de Ven <arjan@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
*/
@@ -50,11 +46,12 @@ asynchronous and synchronous parts of the kernel.
#include <linux/async.h>
#include <linux/atomic.h>
-#include <linux/ktime.h>
#include <linux/export.h>
-#include <linux/wait.h>
+#include <linux/ktime.h>
+#include <linux/pid.h>
#include <linux/sched.h>
#include <linux/slab.h>
+#include <linux/wait.h>
#include <linux/workqueue.h>
#include "workqueue_internal.h"
@@ -67,6 +64,7 @@ static async_cookie_t next_cookie = 1;
static LIST_HEAD(async_global_pending); /* pending from all registered doms */
static ASYNC_DOMAIN(async_dfl_domain);
static DEFINE_SPINLOCK(async_lock);
+static struct workqueue_struct *async_wq;
struct async_entry {
struct list_head domain_list;
@@ -82,22 +80,32 @@ static DECLARE_WAIT_QUEUE_HEAD(async_done);
static atomic_t entry_count;
+static long long microseconds_since(ktime_t start)
+{
+ ktime_t now = ktime_get();
+ return ktime_to_ns(ktime_sub(now, start)) >> 10;
+}
+
static async_cookie_t lowest_in_progress(struct async_domain *domain)
{
- struct list_head *pending;
+ struct async_entry *first = NULL;
async_cookie_t ret = ASYNC_COOKIE_MAX;
unsigned long flags;
spin_lock_irqsave(&async_lock, flags);
- if (domain)
- pending = &domain->pending;
- else
- pending = &async_global_pending;
+ if (domain) {
+ if (!list_empty(&domain->pending))
+ first = list_first_entry(&domain->pending,
+ struct async_entry, domain_list);
+ } else {
+ if (!list_empty(&async_global_pending))
+ first = list_first_entry(&async_global_pending,
+ struct async_entry, global_list);
+ }
- if (!list_empty(pending))
- ret = list_first_entry(pending, struct async_entry,
- domain_list)->cookie;
+ if (first)
+ ret = first->cookie;
spin_unlock_irqrestore(&async_lock, flags);
return ret;
@@ -111,24 +119,18 @@ static void async_run_entry_fn(struct work_struct *work)
struct async_entry *entry =
container_of(work, struct async_entry, work);
unsigned long flags;
- ktime_t uninitialized_var(calltime), delta, rettime;
+ ktime_t calltime;
/* 1) run (and print duration) */
- if (initcall_debug && system_state == SYSTEM_BOOTING) {
- pr_debug("calling %lli_%pF @ %i\n",
- (long long)entry->cookie,
- entry->func, task_pid_nr(current));
- calltime = ktime_get();
- }
+ pr_debug("calling %lli_%pS @ %i\n", (long long)entry->cookie,
+ entry->func, task_pid_nr(current));
+ calltime = ktime_get();
+
entry->func(entry->data, entry->cookie);
- if (initcall_debug && system_state == SYSTEM_BOOTING) {
- rettime = ktime_get();
- delta = ktime_sub(rettime, calltime);
- pr_debug("initcall %lli_%pF returned 0 after %lld usecs\n",
- (long long)entry->cookie,
- entry->func,
- (long long)ktime_to_ns(delta) >> 10);
- }
+
+ pr_debug("initcall %lli_%pS returned after %lld usecs\n",
+ (long long)entry->cookie, entry->func,
+ microseconds_since(calltime));
/* 2) remove self from the pending queues */
spin_lock_irqsave(&async_lock, flags);
@@ -145,29 +147,14 @@ static void async_run_entry_fn(struct work_struct *work)
wake_up(&async_done);
}
-static async_cookie_t __async_schedule(async_func_t func, void *data, struct async_domain *domain)
+static async_cookie_t __async_schedule_node_domain(async_func_t func,
+ void *data, int node,
+ struct async_domain *domain,
+ struct async_entry *entry)
{
- struct async_entry *entry;
- unsigned long flags;
async_cookie_t newcookie;
+ unsigned long flags;
- /* allow irq-off callers */
- entry = kzalloc(sizeof(struct async_entry), GFP_ATOMIC);
-
- /*
- * If we're out of memory or if there's too much work
- * pending already, we execute synchronously.
- */
- if (!entry || atomic_read(&entry_count) > MAX_WORK) {
- kfree(entry);
- spin_lock_irqsave(&async_lock, flags);
- newcookie = next_cookie++;
- spin_unlock_irqrestore(&async_lock, flags);
-
- /* low on memory.. run synchronously */
- func(data, newcookie);
- return newcookie;
- }
INIT_LIST_HEAD(&entry->domain_list);
INIT_LIST_HEAD(&entry->global_list);
INIT_WORK(&entry->work, async_run_entry_fn);
@@ -187,76 +174,116 @@ static async_cookie_t __async_schedule(async_func_t func, void *data, struct asy
atomic_inc(&entry_count);
spin_unlock_irqrestore(&async_lock, flags);
- /* mark that this task has queued an async job, used by module init */
- current->flags |= PF_USED_ASYNC;
-
/* schedule for execution */
- queue_work(system_unbound_wq, &entry->work);
+ queue_work_node(node, async_wq, &entry->work);
return newcookie;
}
/**
- * async_schedule - schedule a function for asynchronous execution
+ * async_schedule_node_domain - NUMA specific version of async_schedule_domain
* @func: function to execute asynchronously
* @data: data pointer to pass to the function
+ * @node: NUMA node that we want to schedule this on or close to
+ * @domain: the domain
*
* Returns an async_cookie_t that may be used for checkpointing later.
+ * @domain may be used in the async_synchronize_*_domain() functions to
+ * wait within a certain synchronization domain rather than globally.
+ *
* Note: This function may be called from atomic or non-atomic contexts.
+ *
+ * The node requested will be honored on a best effort basis. If the node
+ * has no CPUs associated with it then the work is distributed among all
+ * available CPUs.
*/
-async_cookie_t async_schedule(async_func_t func, void *data)
+async_cookie_t async_schedule_node_domain(async_func_t func, void *data,
+ int node, struct async_domain *domain)
{
- return __async_schedule(func, data, &async_dfl_domain);
+ struct async_entry *entry;
+ unsigned long flags;
+ async_cookie_t newcookie;
+
+ /* allow irq-off callers */
+ entry = kzalloc(sizeof(struct async_entry), GFP_ATOMIC);
+
+ /*
+ * If we're out of memory or if there's too much work
+ * pending already, we execute synchronously.
+ */
+ if (!entry || atomic_read(&entry_count) > MAX_WORK) {
+ kfree(entry);
+ spin_lock_irqsave(&async_lock, flags);
+ newcookie = next_cookie++;
+ spin_unlock_irqrestore(&async_lock, flags);
+
+ /* low on memory.. run synchronously */
+ func(data, newcookie);
+ return newcookie;
+ }
+
+ return __async_schedule_node_domain(func, data, node, domain, entry);
}
-EXPORT_SYMBOL_GPL(async_schedule);
+EXPORT_SYMBOL_GPL(async_schedule_node_domain);
/**
- * async_schedule_domain - schedule a function for asynchronous execution within a certain domain
+ * async_schedule_node - NUMA specific version of async_schedule
* @func: function to execute asynchronously
* @data: data pointer to pass to the function
- * @domain: the domain
+ * @node: NUMA node that we want to schedule this on or close to
*
* Returns an async_cookie_t that may be used for checkpointing later.
- * @domain may be used in the async_synchronize_*_domain() functions to
- * wait within a certain synchronization domain rather than globally. A
- * synchronization domain is specified via @domain. Note: This function
- * may be called from atomic or non-atomic contexts.
+ * Note: This function may be called from atomic or non-atomic contexts.
+ *
+ * The node requested will be honored on a best effort basis. If the node
+ * has no CPUs associated with it then the work is distributed among all
+ * available CPUs.
*/
-async_cookie_t async_schedule_domain(async_func_t func, void *data,
- struct async_domain *domain)
+async_cookie_t async_schedule_node(async_func_t func, void *data, int node)
{
- return __async_schedule(func, data, domain);
+ return async_schedule_node_domain(func, data, node, &async_dfl_domain);
}
-EXPORT_SYMBOL_GPL(async_schedule_domain);
+EXPORT_SYMBOL_GPL(async_schedule_node);
/**
- * async_synchronize_full - synchronize all asynchronous function calls
+ * async_schedule_dev_nocall - A simplified variant of async_schedule_dev()
+ * @func: function to execute asynchronously
+ * @dev: device argument to be passed to function
*
- * This function waits until all asynchronous function calls have been done.
+ * @dev is used as both the argument for the function and to provide NUMA
+ * context for where to run the function.
+ *
+ * If the asynchronous execution of @func is scheduled successfully, return
+ * true. Otherwise, do nothing and return false, unlike async_schedule_dev()
+ * that will run the function synchronously then.
*/
-void async_synchronize_full(void)
+bool async_schedule_dev_nocall(async_func_t func, struct device *dev)
{
- async_synchronize_full_domain(NULL);
+ struct async_entry *entry;
+
+ entry = kzalloc(sizeof(struct async_entry), GFP_KERNEL);
+
+ /* Give up if there is no memory or too much work. */
+ if (!entry || atomic_read(&entry_count) > MAX_WORK) {
+ kfree(entry);
+ return false;
+ }
+
+ __async_schedule_node_domain(func, dev, dev_to_node(dev),
+ &async_dfl_domain, entry);
+ return true;
}
-EXPORT_SYMBOL_GPL(async_synchronize_full);
/**
- * async_unregister_domain - ensure no more anonymous waiters on this domain
- * @domain: idle domain to flush out of any async_synchronize_full instances
- *
- * async_synchronize_{cookie|full}_domain() are not flushed since callers
- * of these routines should know the lifetime of @domain
+ * async_synchronize_full - synchronize all asynchronous function calls
*
- * Prefer ASYNC_DOMAIN_EXCLUSIVE() declarations over flushing
+ * This function waits until all asynchronous function calls have been done.
*/
-void async_unregister_domain(struct async_domain *domain)
+void async_synchronize_full(void)
{
- spin_lock_irq(&async_lock);
- WARN_ON(!domain->registered || !list_empty(&domain->pending));
- domain->registered = 0;
- spin_unlock_irq(&async_lock);
+ async_synchronize_full_domain(NULL);
}
-EXPORT_SYMBOL_GPL(async_unregister_domain);
+EXPORT_SYMBOL_GPL(async_synchronize_full);
/**
* async_synchronize_full_domain - synchronize all asynchronous function within a certain domain
@@ -282,23 +309,15 @@ EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
*/
void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *domain)
{
- ktime_t uninitialized_var(starttime), delta, endtime;
+ ktime_t starttime;
- if (initcall_debug && system_state == SYSTEM_BOOTING) {
- pr_debug("async_waiting @ %i\n", task_pid_nr(current));
- starttime = ktime_get();
- }
+ pr_debug("async_waiting @ %i\n", task_pid_nr(current));
+ starttime = ktime_get();
wait_event(async_done, lowest_in_progress(domain) >= cookie);
- if (initcall_debug && system_state == SYSTEM_BOOTING) {
- endtime = ktime_get();
- delta = ktime_sub(endtime, starttime);
-
- pr_debug("async_continuing @ %i after %lli usec\n",
- task_pid_nr(current),
- (long long)ktime_to_ns(delta) >> 10);
- }
+ pr_debug("async_continuing @ %i after %lli usec\n", task_pid_nr(current),
+ microseconds_since(starttime));
}
EXPORT_SYMBOL_GPL(async_synchronize_cookie_domain);
@@ -326,3 +345,18 @@ bool current_is_async(void)
return worker && worker->current_func == async_run_entry_fn;
}
+EXPORT_SYMBOL_GPL(current_is_async);
+
+void __init async_init(void)
+{
+ /*
+ * Async can schedule a number of interdependent work items. However,
+ * unbound workqueues can handle only upto min_active interdependent
+ * work items. The default min_active of 8 isn't sufficient for async
+ * and can lead to stalls. Let's use a dedicated workqueue with raised
+ * min_active.
+ */
+ async_wq = alloc_workqueue("async", WQ_UNBOUND, 0);
+ BUG_ON(!async_wq);
+ workqueue_set_min_active(async_wq, WQ_DFL_ACTIVE);
+}
diff --git a/kernel/audit.c b/kernel/audit.c
index 1c13e4267de6..26a332ffb1b8 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* audit.c -- Auditing support
* Gateway between the kernel (e.g., selinux) and the user-space audit daemon.
* System-call specific features have moved to auditsc.c
@@ -5,20 +6,6 @@
* Copyright 2003-2007 Red Hat Inc., Durham, North Carolina.
* All Rights Reserved.
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
* Written by Rickard E. (Rik) Faith <faith@redhat.com>
*
* Goals: 1) Integrate fully with Security Modules.
@@ -38,7 +25,8 @@
* 6) Support low-overhead kernel-based filtering to minimize the
* information that must be passed to user-space.
*
- * Example user-space utilities: http://people.redhat.com/sgrubb/audit/
+ * Audit userspace, documentation, tests, and bug/issue trackers:
+ * https://github.com/linux-audit
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -54,17 +42,20 @@
#include <linux/kthread.h>
#include <linux/kernel.h>
#include <linux/syscalls.h>
+#include <linux/spinlock.h>
+#include <linux/rcupdate.h>
+#include <linux/mutex.h>
+#include <linux/gfp.h>
+#include <linux/pid.h>
#include <linux/audit.h>
#include <net/sock.h>
#include <net/netlink.h>
#include <linux/skbuff.h>
-#ifdef CONFIG_SECURITY
#include <linux/security.h>
-#endif
+#include <linux/lsm_hooks.h>
#include <linux/freezer.h>
-#include <linux/tty.h>
#include <linux/pid_namespace.h>
#include <net/netns/generic.h>
@@ -75,29 +66,56 @@
#define AUDIT_DISABLED -1
#define AUDIT_UNINITIALIZED 0
#define AUDIT_INITIALIZED 1
-static int audit_initialized;
+static int audit_initialized = AUDIT_UNINITIALIZED;
-#define AUDIT_OFF 0
-#define AUDIT_ON 1
-#define AUDIT_LOCKED 2
-u32 audit_enabled;
-u32 audit_ever_enabled;
+u32 audit_enabled = AUDIT_OFF;
+bool audit_ever_enabled = !!AUDIT_OFF;
EXPORT_SYMBOL_GPL(audit_enabled);
/* Default state when kernel boots without any parameters. */
-static u32 audit_default;
+static u32 audit_default = AUDIT_OFF;
/* If auditing cannot proceed, audit_failure selects what happens. */
static u32 audit_failure = AUDIT_FAIL_PRINTK;
-/*
- * If audit records are to be written to the netlink socket, audit_pid
- * contains the pid of the auditd process and audit_nlk_portid contains
- * the portid to use to send netlink messages to that process.
+/* private audit network namespace index */
+static unsigned int audit_net_id;
+
+/* Number of modules that provide a security context.
+ List of lsms that provide a security context */
+static u32 audit_subj_secctx_cnt;
+static u32 audit_obj_secctx_cnt;
+static const struct lsm_id *audit_subj_lsms[MAX_LSM_COUNT];
+static const struct lsm_id *audit_obj_lsms[MAX_LSM_COUNT];
+
+/**
+ * struct audit_net - audit private network namespace data
+ * @sk: communication socket
*/
-int audit_pid;
-static __u32 audit_nlk_portid;
+struct audit_net {
+ struct sock *sk;
+};
+
+/**
+ * struct auditd_connection - kernel/auditd connection state
+ * @pid: auditd PID
+ * @portid: netlink portid
+ * @net: the associated network namespace
+ * @rcu: RCU head
+ *
+ * Description:
+ * This struct is RCU protected; you must either hold the RCU lock for reading
+ * or the associated spinlock for writing.
+ */
+struct auditd_connection {
+ struct pid *pid;
+ u32 portid;
+ struct net *net;
+ struct rcu_head rcu;
+};
+static struct auditd_connection __rcu *auditd_conn;
+static DEFINE_SPINLOCK(auditd_conn_lock);
/* If audit_rate_limit is non-zero, limit the rate of sending audit records
* to that number per second. This prevents DoS attacks, but results in
@@ -108,14 +126,12 @@ static u32 audit_rate_limit;
* When set to zero, this means unlimited. */
static u32 audit_backlog_limit = 64;
#define AUDIT_BACKLOG_WAIT_TIME (60 * HZ)
-static u32 audit_backlog_wait_time_master = AUDIT_BACKLOG_WAIT_TIME;
static u32 audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME;
-static u32 audit_backlog_wait_overflow = 0;
/* The identity of the user shutting down the audit system. */
-kuid_t audit_sig_uid = INVALID_UID;
-pid_t audit_sig_pid = -1;
-u32 audit_sig_sid = 0;
+static kuid_t audit_sig_uid = INVALID_UID;
+static pid_t audit_sig_pid = -1;
+static struct lsm_prop audit_sig_lsm;
/* Records can be lost in several ways:
0) [suppressed in audit_alloc]
@@ -124,27 +140,30 @@ u32 audit_sig_sid = 0;
3) suppressed due to audit_rate_limit
4) suppressed due to audit_backlog_limit
*/
-static atomic_t audit_lost = ATOMIC_INIT(0);
+static atomic_t audit_lost = ATOMIC_INIT(0);
-/* The netlink socket. */
-static struct sock *audit_sock;
-static int audit_net_id;
+/* Monotonically increasing sum of time the kernel has spent
+ * waiting while the backlog limit is exceeded.
+ */
+static atomic_t audit_backlog_wait_time_actual = ATOMIC_INIT(0);
/* Hash for inode-based rules */
struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
-/* The audit_freelist is a list of pre-allocated audit buffers (if more
- * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of
- * being placed on the freelist). */
-static DEFINE_SPINLOCK(audit_freelist_lock);
-static int audit_freelist_count;
-static LIST_HEAD(audit_freelist);
+static struct kmem_cache *audit_buffer_cache;
-static struct sk_buff_head audit_skb_queue;
-/* queue of skbs to send to auditd when/if it comes back */
-static struct sk_buff_head audit_skb_hold_queue;
+/* queue msgs to send via kauditd_task */
+static struct sk_buff_head audit_queue;
+/* queue msgs due to temporary unicast send problems */
+static struct sk_buff_head audit_retry_queue;
+/* queue msgs waiting for new auditd connection */
+static struct sk_buff_head audit_hold_queue;
+
+/* queue servicing thread */
static struct task_struct *kauditd_task;
static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait);
+
+/* waitqueue for callers who are blocked on the audit backlog */
static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait);
static struct audit_features af = {.vers = AUDIT_FEATURE_VERSION,
@@ -157,28 +176,37 @@ static char *audit_feature_names[2] = {
"loginuid_immutable",
};
-
-/* Serialize requests from userspace. */
-DEFINE_MUTEX(audit_cmd_mutex);
+/**
+ * struct audit_ctl_mutex - serialize requests from userspace
+ * @lock: the mutex used for locking
+ * @owner: the task which owns the lock
+ *
+ * Description:
+ * This is the lock struct used to ensure we only process userspace requests
+ * in an orderly fashion. We can't simply use a mutex/lock here because we
+ * need to track lock ownership so we don't end up blocking the lock owner in
+ * audit_log_start() or similar.
+ */
+static struct audit_ctl_mutex {
+ struct mutex lock;
+ void *owner;
+} audit_cmd_mutex;
/* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting
* audit records. Since printk uses a 1024 byte buffer, this buffer
* should be at least that large. */
#define AUDIT_BUFSIZ 1024
-/* AUDIT_MAXFREE is the number of empty audit_buffers we keep on the
- * audit_freelist. Doing so eliminates many kmalloc/kfree calls. */
-#define AUDIT_MAXFREE (2*NR_CPUS)
-
/* The audit_buffer is used when formatting an audit record. The caller
* locks briefly to get the record off the freelist or to allocate the
* buffer, and locks briefly to send the buffer to the netlink layer or
* to place it on a transmit queue. Multiple audit_buffers can be in
* use simultaneously. */
struct audit_buffer {
- struct list_head list;
- struct sk_buff *skb; /* formatted skb ready to send */
+ struct sk_buff *skb; /* the skb for audit_log functions */
+ struct sk_buff_head skb_list; /* formatted skbs, ready to send */
struct audit_context *ctx; /* NULL or associated context */
+ struct audit_stamp stamp; /* audit stamp for these records */
gfp_t gfp_mask;
};
@@ -188,12 +216,122 @@ struct audit_reply {
struct sk_buff *skb;
};
-static void audit_set_portid(struct audit_buffer *ab, __u32 portid)
+/**
+ * auditd_test_task - Check to see if a given task is an audit daemon
+ * @task: the task to check
+ *
+ * Description:
+ * Return 1 if the task is a registered audit daemon, 0 otherwise.
+ */
+int auditd_test_task(struct task_struct *task)
{
- if (ab) {
- struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
- nlh->nlmsg_pid = portid;
+ int rc;
+ struct auditd_connection *ac;
+
+ rcu_read_lock();
+ ac = rcu_dereference(auditd_conn);
+ rc = (ac && ac->pid == task_tgid(task) ? 1 : 0);
+ rcu_read_unlock();
+
+ return rc;
+}
+
+/**
+ * audit_ctl_lock - Take the audit control lock
+ */
+void audit_ctl_lock(void)
+{
+ mutex_lock(&audit_cmd_mutex.lock);
+ audit_cmd_mutex.owner = current;
+}
+
+/**
+ * audit_ctl_unlock - Drop the audit control lock
+ */
+void audit_ctl_unlock(void)
+{
+ audit_cmd_mutex.owner = NULL;
+ mutex_unlock(&audit_cmd_mutex.lock);
+}
+
+/**
+ * audit_ctl_owner_current - Test to see if the current task owns the lock
+ *
+ * Description:
+ * Return true if the current task owns the audit control lock, false if it
+ * doesn't own the lock.
+ */
+static bool audit_ctl_owner_current(void)
+{
+ return (current == audit_cmd_mutex.owner);
+}
+
+/**
+ * auditd_pid_vnr - Return the auditd PID relative to the namespace
+ *
+ * Description:
+ * Returns the PID in relation to the namespace, 0 on failure.
+ */
+static pid_t auditd_pid_vnr(void)
+{
+ pid_t pid;
+ const struct auditd_connection *ac;
+
+ rcu_read_lock();
+ ac = rcu_dereference(auditd_conn);
+ if (!ac || !ac->pid)
+ pid = 0;
+ else
+ pid = pid_vnr(ac->pid);
+ rcu_read_unlock();
+
+ return pid;
+}
+
+/**
+ * audit_cfg_lsm - Identify a security module as providing a secctx.
+ * @lsmid: LSM identity
+ * @flags: which contexts are provided
+ *
+ * Description:
+ * Increments the count of the security modules providing a secctx.
+ * If the LSM id is already in the list leave it alone.
+ */
+void audit_cfg_lsm(const struct lsm_id *lsmid, int flags)
+{
+ int i;
+
+ if (flags & AUDIT_CFG_LSM_SECCTX_SUBJECT) {
+ for (i = 0 ; i < audit_subj_secctx_cnt; i++)
+ if (audit_subj_lsms[i] == lsmid)
+ return;
+ audit_subj_lsms[audit_subj_secctx_cnt++] = lsmid;
}
+ if (flags & AUDIT_CFG_LSM_SECCTX_OBJECT) {
+ for (i = 0 ; i < audit_obj_secctx_cnt; i++)
+ if (audit_obj_lsms[i] == lsmid)
+ return;
+ audit_obj_lsms[audit_obj_secctx_cnt++] = lsmid;
+ }
+}
+
+/**
+ * audit_get_sk - Return the audit socket for the given network namespace
+ * @net: the destination network namespace
+ *
+ * Description:
+ * Returns the sock pointer if valid, NULL otherwise. The caller must ensure
+ * that a reference is held for the network namespace while the sock is in use.
+ */
+static struct sock *audit_get_sk(const struct net *net)
+{
+ struct audit_net *aunet;
+
+ if (!net)
+ return NULL;
+
+ aunet = net_generic(net, audit_net_id);
+ return aunet->sk;
}
void audit_panic(const char *message)
@@ -206,9 +344,7 @@ void audit_panic(const char *message)
pr_err("%s\n", message);
break;
case AUDIT_FAIL_PANIC:
- /* test audit_pid since printk is always losey, why bother? */
- if (audit_pid)
- panic("audit: %s\n", message);
+ panic("audit: %s\n", message);
break;
}
}
@@ -220,18 +356,17 @@ static inline int audit_rate_check(void)
static DEFINE_SPINLOCK(lock);
unsigned long flags;
unsigned long now;
- unsigned long elapsed;
int retval = 0;
- if (!audit_rate_limit) return 1;
+ if (!audit_rate_limit)
+ return 1;
spin_lock_irqsave(&lock, flags);
if (++messages < audit_rate_limit) {
retval = 1;
} else {
- now = jiffies;
- elapsed = now - last_check;
- if (elapsed > HZ) {
+ now = jiffies;
+ if (time_after(now, last_check + HZ)) {
last_check = now;
messages = 0;
retval = 1;
@@ -265,7 +400,7 @@ void audit_log_lost(const char *message)
if (!print) {
spin_lock_irqsave(&lock, flags);
now = jiffies;
- if (now - last_msg > HZ) {
+ if (time_after(now, last_msg + HZ)) {
print = 1;
last_msg = now;
}
@@ -288,10 +423,10 @@ static int audit_log_config_change(char *function_name, u32 new, u32 old,
struct audit_buffer *ab;
int rc = 0;
- ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
+ ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_CONFIG_CHANGE);
if (unlikely(!ab))
return rc;
- audit_log_format(ab, "%s=%u old=%u", function_name, new, old);
+ audit_log_format(ab, "op=set %s=%u old=%u ", function_name, new, old);
audit_log_session_info(ab);
rc = audit_log_task_context(ab);
if (rc)
@@ -340,7 +475,7 @@ static int audit_set_backlog_limit(u32 limit)
static int audit_set_backlog_wait_time(u32 timeout)
{
return audit_do_config_change("audit_backlog_wait_time",
- &audit_backlog_wait_time_master, timeout);
+ &audit_backlog_wait_time, timeout);
}
static int audit_set_enabled(u32 state)
@@ -366,76 +501,353 @@ static int audit_set_failure(u32 state)
return audit_do_config_change("audit_failure", &audit_failure, state);
}
-/*
- * Queue skbs to be sent to auditd when/if it comes back. These skbs should
- * already have been sent via prink/syslog and so if these messages are dropped
- * it is not a huge concern since we already passed the audit_log_lost()
- * notification and stuff. This is just nice to get audit messages during
- * boot before auditd is running or messages generated while auditd is stopped.
- * This only holds messages is audit_default is set, aka booting with audit=1
- * or building your kernel that way.
+/**
+ * auditd_conn_free - RCU helper to release an auditd connection struct
+ * @rcu: RCU head
+ *
+ * Description:
+ * Drop any references inside the auditd connection tracking struct and free
+ * the memory.
*/
-static void audit_hold_skb(struct sk_buff *skb)
+static void auditd_conn_free(struct rcu_head *rcu)
{
- if (audit_default &&
- (!audit_backlog_limit ||
- skb_queue_len(&audit_skb_hold_queue) < audit_backlog_limit))
- skb_queue_tail(&audit_skb_hold_queue, skb);
- else
- kfree_skb(skb);
+ struct auditd_connection *ac;
+
+ ac = container_of(rcu, struct auditd_connection, rcu);
+ put_pid(ac->pid);
+ put_net(ac->net);
+ kfree(ac);
}
-/*
- * For one reason or another this nlh isn't getting delivered to the userspace
- * audit daemon, just send it to printk.
+/**
+ * auditd_set - Set/Reset the auditd connection state
+ * @pid: auditd PID
+ * @portid: auditd netlink portid
+ * @net: auditd network namespace pointer
+ * @skb: the netlink command from the audit daemon
+ * @ack: netlink ack flag, cleared if ack'd here
+ *
+ * Description:
+ * This function will obtain and drop network namespace references as
+ * necessary. Returns zero on success, negative values on failure.
+ */
+static int auditd_set(struct pid *pid, u32 portid, struct net *net,
+ struct sk_buff *skb, bool *ack)
+{
+ unsigned long flags;
+ struct auditd_connection *ac_old, *ac_new;
+ struct nlmsghdr *nlh;
+
+ if (!pid || !net)
+ return -EINVAL;
+
+ ac_new = kzalloc(sizeof(*ac_new), GFP_KERNEL);
+ if (!ac_new)
+ return -ENOMEM;
+ ac_new->pid = get_pid(pid);
+ ac_new->portid = portid;
+ ac_new->net = get_net(net);
+
+ /* send the ack now to avoid a race with the queue backlog */
+ if (*ack) {
+ nlh = nlmsg_hdr(skb);
+ netlink_ack(skb, nlh, 0, NULL);
+ *ack = false;
+ }
+
+ spin_lock_irqsave(&auditd_conn_lock, flags);
+ ac_old = rcu_dereference_protected(auditd_conn,
+ lockdep_is_held(&auditd_conn_lock));
+ rcu_assign_pointer(auditd_conn, ac_new);
+ spin_unlock_irqrestore(&auditd_conn_lock, flags);
+
+ if (ac_old)
+ call_rcu(&ac_old->rcu, auditd_conn_free);
+
+ return 0;
+}
+
+/**
+ * kauditd_printk_skb - Print the audit record to the ring buffer
+ * @skb: audit record
+ *
+ * Whatever the reason, this packet may not make it to the auditd connection
+ * so write it via printk so the information isn't completely lost.
*/
-static void audit_printk_skb(struct sk_buff *skb)
+static void kauditd_printk_skb(struct sk_buff *skb)
{
struct nlmsghdr *nlh = nlmsg_hdr(skb);
char *data = nlmsg_data(nlh);
- if (nlh->nlmsg_type != AUDIT_EOE) {
- if (printk_ratelimit())
- pr_notice("type=%d %s\n", nlh->nlmsg_type, data);
- else
- audit_log_lost("printk limit exceeded");
+ if (nlh->nlmsg_type != AUDIT_EOE && printk_ratelimit())
+ pr_notice("type=%d %s\n", nlh->nlmsg_type, data);
+}
+
+/**
+ * kauditd_rehold_skb - Handle a audit record send failure in the hold queue
+ * @skb: audit record
+ * @error: error code (unused)
+ *
+ * Description:
+ * This should only be used by the kauditd_thread when it fails to flush the
+ * hold queue.
+ */
+static void kauditd_rehold_skb(struct sk_buff *skb, __always_unused int error)
+{
+ /* put the record back in the queue */
+ skb_queue_tail(&audit_hold_queue, skb);
+}
+
+/**
+ * kauditd_hold_skb - Queue an audit record, waiting for auditd
+ * @skb: audit record
+ * @error: error code
+ *
+ * Description:
+ * Queue the audit record, waiting for an instance of auditd. When this
+ * function is called we haven't given up yet on sending the record, but things
+ * are not looking good. The first thing we want to do is try to write the
+ * record via printk and then see if we want to try and hold on to the record
+ * and queue it, if we have room. If we want to hold on to the record, but we
+ * don't have room, record a record lost message.
+ */
+static void kauditd_hold_skb(struct sk_buff *skb, int error)
+{
+ /* at this point it is uncertain if we will ever send this to auditd so
+ * try to send the message via printk before we go any further */
+ kauditd_printk_skb(skb);
+
+ /* can we just silently drop the message? */
+ if (!audit_default)
+ goto drop;
+
+ /* the hold queue is only for when the daemon goes away completely,
+ * not -EAGAIN failures; if we are in a -EAGAIN state requeue the
+ * record on the retry queue unless it's full, in which case drop it
+ */
+ if (error == -EAGAIN) {
+ if (!audit_backlog_limit ||
+ skb_queue_len(&audit_retry_queue) < audit_backlog_limit) {
+ skb_queue_tail(&audit_retry_queue, skb);
+ return;
+ }
+ audit_log_lost("kauditd retry queue overflow");
+ goto drop;
+ }
+
+ /* if we have room in the hold queue, queue the message */
+ if (!audit_backlog_limit ||
+ skb_queue_len(&audit_hold_queue) < audit_backlog_limit) {
+ skb_queue_tail(&audit_hold_queue, skb);
+ return;
}
- audit_hold_skb(skb);
+ /* we have no other options - drop the message */
+ audit_log_lost("kauditd hold queue overflow");
+drop:
+ kfree_skb(skb);
}
-static void kauditd_send_skb(struct sk_buff *skb)
+/**
+ * kauditd_retry_skb - Queue an audit record, attempt to send again to auditd
+ * @skb: audit record
+ * @error: error code (unused)
+ *
+ * Description:
+ * Not as serious as kauditd_hold_skb() as we still have a connected auditd,
+ * but for some reason we are having problems sending it audit records so
+ * queue the given record and attempt to resend.
+ */
+static void kauditd_retry_skb(struct sk_buff *skb, __always_unused int error)
{
- int err;
- /* take a reference in case we can't send it and we want to hold it */
- skb_get(skb);
- err = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0);
- if (err < 0) {
- BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */
- if (audit_pid) {
- pr_err("*NO* daemon at audit_pid=%d\n", audit_pid);
- audit_log_lost("auditd disappeared");
- audit_pid = 0;
- audit_sock = NULL;
+ if (!audit_backlog_limit ||
+ skb_queue_len(&audit_retry_queue) < audit_backlog_limit) {
+ skb_queue_tail(&audit_retry_queue, skb);
+ return;
+ }
+
+ /* we have to drop the record, send it via printk as a last effort */
+ kauditd_printk_skb(skb);
+ audit_log_lost("kauditd retry queue overflow");
+ kfree_skb(skb);
+}
+
+/**
+ * auditd_reset - Disconnect the auditd connection
+ * @ac: auditd connection state
+ *
+ * Description:
+ * Break the auditd/kauditd connection and move all the queued records into the
+ * hold queue in case auditd reconnects. It is important to note that the @ac
+ * pointer should never be dereferenced inside this function as it may be NULL
+ * or invalid, you can only compare the memory address! If @ac is NULL then
+ * the connection will always be reset.
+ */
+static void auditd_reset(const struct auditd_connection *ac)
+{
+ unsigned long flags;
+ struct sk_buff *skb;
+ struct auditd_connection *ac_old;
+
+ /* if it isn't already broken, break the connection */
+ spin_lock_irqsave(&auditd_conn_lock, flags);
+ ac_old = rcu_dereference_protected(auditd_conn,
+ lockdep_is_held(&auditd_conn_lock));
+ if (ac && ac != ac_old) {
+ /* someone already registered a new auditd connection */
+ spin_unlock_irqrestore(&auditd_conn_lock, flags);
+ return;
+ }
+ rcu_assign_pointer(auditd_conn, NULL);
+ spin_unlock_irqrestore(&auditd_conn_lock, flags);
+
+ if (ac_old)
+ call_rcu(&ac_old->rcu, auditd_conn_free);
+
+ /* flush the retry queue to the hold queue, but don't touch the main
+ * queue since we need to process that normally for multicast */
+ while ((skb = skb_dequeue(&audit_retry_queue)))
+ kauditd_hold_skb(skb, -ECONNREFUSED);
+}
+
+/**
+ * auditd_send_unicast_skb - Send a record via unicast to auditd
+ * @skb: audit record
+ *
+ * Description:
+ * Send a skb to the audit daemon, returns positive/zero values on success and
+ * negative values on failure; in all cases the skb will be consumed by this
+ * function. If the send results in -ECONNREFUSED the connection with auditd
+ * will be reset. This function may sleep so callers should not hold any locks
+ * where this would cause a problem.
+ */
+static int auditd_send_unicast_skb(struct sk_buff *skb)
+{
+ int rc;
+ u32 portid;
+ struct net *net;
+ struct sock *sk;
+ struct auditd_connection *ac;
+
+ /* NOTE: we can't call netlink_unicast while in the RCU section so
+ * take a reference to the network namespace and grab local
+ * copies of the namespace, the sock, and the portid; the
+ * namespace and sock aren't going to go away while we hold a
+ * reference and if the portid does become invalid after the RCU
+ * section netlink_unicast() should safely return an error */
+
+ rcu_read_lock();
+ ac = rcu_dereference(auditd_conn);
+ if (!ac) {
+ rcu_read_unlock();
+ kfree_skb(skb);
+ rc = -ECONNREFUSED;
+ goto err;
+ }
+ net = get_net(ac->net);
+ sk = audit_get_sk(net);
+ portid = ac->portid;
+ rcu_read_unlock();
+
+ rc = netlink_unicast(sk, skb, portid, 0);
+ put_net(net);
+ if (rc < 0)
+ goto err;
+
+ return rc;
+
+err:
+ if (ac && rc == -ECONNREFUSED)
+ auditd_reset(ac);
+ return rc;
+}
+
+/**
+ * kauditd_send_queue - Helper for kauditd_thread to flush skb queues
+ * @sk: the sending sock
+ * @portid: the netlink destination
+ * @queue: the skb queue to process
+ * @retry_limit: limit on number of netlink unicast failures
+ * @skb_hook: per-skb hook for additional processing
+ * @err_hook: hook called if the skb fails the netlink unicast send
+ *
+ * Description:
+ * Run through the given queue and attempt to send the audit records to auditd,
+ * returns zero on success, negative values on failure. It is up to the caller
+ * to ensure that the @sk is valid for the duration of this function.
+ *
+ */
+static int kauditd_send_queue(struct sock *sk, u32 portid,
+ struct sk_buff_head *queue,
+ unsigned int retry_limit,
+ void (*skb_hook)(struct sk_buff *skb),
+ void (*err_hook)(struct sk_buff *skb, int error))
+{
+ int rc = 0;
+ struct sk_buff *skb = NULL;
+ struct sk_buff *skb_tail;
+ unsigned int failed = 0;
+
+ /* NOTE: kauditd_thread takes care of all our locking, we just use
+ * the netlink info passed to us (e.g. sk and portid) */
+
+ skb_tail = skb_peek_tail(queue);
+ while ((skb != skb_tail) && (skb = skb_dequeue(queue))) {
+ /* call the skb_hook for each skb we touch */
+ if (skb_hook)
+ (*skb_hook)(skb);
+
+ /* can we send to anyone via unicast? */
+ if (!sk) {
+ if (err_hook)
+ (*err_hook)(skb, -ECONNREFUSED);
+ continue;
}
- /* we might get lucky and get this in the next auditd */
- audit_hold_skb(skb);
- } else
- /* drop the extra reference if sent ok */
- consume_skb(skb);
+
+retry:
+ /* grab an extra skb reference in case of error */
+ skb_get(skb);
+ rc = netlink_unicast(sk, skb, portid, 0);
+ if (rc < 0) {
+ /* send failed - try a few times unless fatal error */
+ if (++failed >= retry_limit ||
+ rc == -ECONNREFUSED || rc == -EPERM) {
+ sk = NULL;
+ if (err_hook)
+ (*err_hook)(skb, rc);
+ if (rc == -EAGAIN)
+ rc = 0;
+ /* continue to drain the queue */
+ continue;
+ } else
+ goto retry;
+ } else {
+ /* skb sent - drop the extra reference and continue */
+ consume_skb(skb);
+ failed = 0;
+ }
+ }
+
+ return (rc >= 0 ? 0 : rc);
}
/*
- * kauditd_send_multicast_skb - send the skb to multicast userspace listeners
+ * kauditd_send_multicast_skb - Send a record to any multicast listeners
+ * @skb: audit record
*
- * This function doesn't consume an skb as might be expected since it has to
- * copy it anyways.
+ * Description:
+ * Write a multicast message to anyone listening in the initial network
+ * namespace. This function doesn't consume an skb as might be expected since
+ * it has to copy it anyways.
*/
-static void kauditd_send_multicast_skb(struct sk_buff *skb, gfp_t gfp_mask)
+static void kauditd_send_multicast_skb(struct sk_buff *skb)
{
- struct sk_buff *copy;
- struct audit_net *aunet = net_generic(&init_net, audit_net_id);
- struct sock *sock = aunet->nlsk;
+ struct sk_buff *copy;
+ struct sock *sock = audit_get_sk(&init_net);
+ struct nlmsghdr *nlh;
+
+ /* NOTE: we are not taking an additional reference for init_net since
+ * we don't have to worry about it going away */
if (!netlink_has_listeners(sock, AUDIT_NLGRP_READLOG))
return;
@@ -450,98 +862,116 @@ static void kauditd_send_multicast_skb(struct sk_buff *skb, gfp_t gfp_mask)
* no reason for new multicast clients to continue with this
* non-compliance.
*/
- copy = skb_copy(skb, gfp_mask);
+ copy = skb_copy(skb, GFP_KERNEL);
if (!copy)
return;
+ nlh = nlmsg_hdr(copy);
+ nlh->nlmsg_len = skb->len;
- nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, gfp_mask);
+ nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, GFP_KERNEL);
}
-/*
- * flush_hold_queue - empty the hold queue if auditd appears
- *
- * If auditd just started, drain the queue of messages already
- * sent to syslog/printk. Remember loss here is ok. We already
- * called audit_log_lost() if it didn't go out normally. so the
- * race between the skb_dequeue and the next check for audit_pid
- * doesn't matter.
- *
- * If you ever find kauditd to be too slow we can get a perf win
- * by doing our own locking and keeping better track if there
- * are messages in this queue. I don't see the need now, but
- * in 5 years when I want to play with this again I'll see this
- * note and still have no friggin idea what i'm thinking today.
+/**
+ * kauditd_thread - Worker thread to send audit records to userspace
+ * @dummy: unused
*/
-static void flush_hold_queue(void)
+static int kauditd_thread(void *dummy)
{
- struct sk_buff *skb;
-
- if (!audit_default || !audit_pid)
- return;
-
- skb = skb_dequeue(&audit_skb_hold_queue);
- if (likely(!skb))
- return;
-
- while (skb && audit_pid) {
- kauditd_send_skb(skb);
- skb = skb_dequeue(&audit_skb_hold_queue);
- }
+ int rc;
+ u32 portid = 0;
+ struct net *net = NULL;
+ struct sock *sk = NULL;
+ struct auditd_connection *ac;
- /*
- * if auditd just disappeared but we
- * dequeued an skb we need to drop ref
- */
- if (skb)
- consume_skb(skb);
-}
+#define UNICAST_RETRIES 5
-static int kauditd_thread(void *dummy)
-{
set_freezable();
while (!kthread_should_stop()) {
- struct sk_buff *skb;
-
- flush_hold_queue();
+ /* NOTE: see the lock comments in auditd_send_unicast_skb() */
+ rcu_read_lock();
+ ac = rcu_dereference(auditd_conn);
+ if (!ac) {
+ rcu_read_unlock();
+ goto main_queue;
+ }
+ net = get_net(ac->net);
+ sk = audit_get_sk(net);
+ portid = ac->portid;
+ rcu_read_unlock();
+
+ /* attempt to flush the hold queue */
+ rc = kauditd_send_queue(sk, portid,
+ &audit_hold_queue, UNICAST_RETRIES,
+ NULL, kauditd_rehold_skb);
+ if (rc < 0) {
+ sk = NULL;
+ auditd_reset(ac);
+ goto main_queue;
+ }
- skb = skb_dequeue(&audit_skb_queue);
+ /* attempt to flush the retry queue */
+ rc = kauditd_send_queue(sk, portid,
+ &audit_retry_queue, UNICAST_RETRIES,
+ NULL, kauditd_hold_skb);
+ if (rc < 0) {
+ sk = NULL;
+ auditd_reset(ac);
+ goto main_queue;
+ }
- if (skb) {
- if (skb_queue_len(&audit_skb_queue) <= audit_backlog_limit)
- wake_up(&audit_backlog_wait);
- if (audit_pid)
- kauditd_send_skb(skb);
- else
- audit_printk_skb(skb);
- continue;
+main_queue:
+ /* process the main queue - do the multicast send and attempt
+ * unicast, dump failed record sends to the retry queue; if
+ * sk == NULL due to previous failures we will just do the
+ * multicast send and move the record to the hold queue */
+ rc = kauditd_send_queue(sk, portid, &audit_queue, 1,
+ kauditd_send_multicast_skb,
+ (sk ?
+ kauditd_retry_skb : kauditd_hold_skb));
+ if (ac && rc < 0)
+ auditd_reset(ac);
+ sk = NULL;
+
+ /* drop our netns reference, no auditd sends past this line */
+ if (net) {
+ put_net(net);
+ net = NULL;
}
- wait_event_freezable(kauditd_wait, skb_queue_len(&audit_skb_queue));
+ /* we have processed all the queues so wake everyone */
+ wake_up(&audit_backlog_wait);
+
+ /* NOTE: we want to wake up if there is anything on the queue,
+ * regardless of if an auditd is connected, as we need to
+ * do the multicast send and rotate records from the
+ * main queue to the retry/hold queues */
+ wait_event_freezable(kauditd_wait,
+ (skb_queue_len(&audit_queue) ? 1 : 0));
}
+
return 0;
}
-int audit_send_list(void *_dest)
+int audit_send_list_thread(void *_dest)
{
struct audit_netlink_list *dest = _dest;
struct sk_buff *skb;
- struct net *net = dest->net;
- struct audit_net *aunet = net_generic(net, audit_net_id);
+ struct sock *sk = audit_get_sk(dest->net);
/* wait for parent to finish and send an ACK */
- mutex_lock(&audit_cmd_mutex);
- mutex_unlock(&audit_cmd_mutex);
+ audit_ctl_lock();
+ audit_ctl_unlock();
while ((skb = __skb_dequeue(&dest->q)) != NULL)
- netlink_unicast(aunet->nlsk, skb, dest->portid, 0);
+ netlink_unicast(sk, skb, dest->portid, 0);
- put_net(net);
+ put_net(dest->net);
kfree(dest);
return 0;
}
-struct sk_buff *audit_make_reply(__u32 portid, int seq, int type, int done,
+struct sk_buff *audit_make_reply(int seq, int type, int done,
int multi, const void *payload, int size)
{
struct sk_buff *skb;
@@ -554,7 +984,7 @@ struct sk_buff *audit_make_reply(__u32 portid, int seq, int type, int done,
if (!skb)
return NULL;
- nlh = nlmsg_put(skb, portid, seq, t, size, flags);
+ nlh = nlmsg_put(skb, 0, seq, t, size, flags);
if (!nlh)
goto out_kfree_skb;
data = nlmsg_data(nlh);
@@ -566,22 +996,32 @@ out_kfree_skb:
return NULL;
}
+static void audit_free_reply(struct audit_reply *reply)
+{
+ if (!reply)
+ return;
+
+ kfree_skb(reply->skb);
+ if (reply->net)
+ put_net(reply->net);
+ kfree(reply);
+}
+
static int audit_send_reply_thread(void *arg)
{
struct audit_reply *reply = (struct audit_reply *)arg;
- struct net *net = reply->net;
- struct audit_net *aunet = net_generic(net, audit_net_id);
- mutex_lock(&audit_cmd_mutex);
- mutex_unlock(&audit_cmd_mutex);
+ audit_ctl_lock();
+ audit_ctl_unlock();
/* Ignore failure. It'll only happen if the sender goes away,
because our timeout is set to infinite. */
- netlink_unicast(aunet->nlsk , reply->skb, reply->portid, 0);
- put_net(net);
- kfree(reply);
+ netlink_unicast(audit_get_sk(reply->net), reply->skb, reply->portid, 0);
+ reply->skb = NULL;
+ audit_free_reply(reply);
return 0;
}
+
/**
* audit_send_reply - send an audit reply message via netlink
* @request_skb: skb of request we are replying to (used to target the reply)
@@ -592,36 +1032,32 @@ static int audit_send_reply_thread(void *arg)
* @payload: payload data
* @size: payload size
*
- * Allocates an skb, builds the netlink message, and sends it to the port id.
- * No failure notifications.
+ * Allocates a skb, builds the netlink message, and sends it to the port id.
*/
static void audit_send_reply(struct sk_buff *request_skb, int seq, int type, int done,
int multi, const void *payload, int size)
{
- u32 portid = NETLINK_CB(request_skb).portid;
- struct net *net = sock_net(NETLINK_CB(request_skb).sk);
- struct sk_buff *skb;
struct task_struct *tsk;
- struct audit_reply *reply = kmalloc(sizeof(struct audit_reply),
- GFP_KERNEL);
+ struct audit_reply *reply;
+ reply = kzalloc(sizeof(*reply), GFP_KERNEL);
if (!reply)
return;
- skb = audit_make_reply(portid, seq, type, done, multi, payload, size);
- if (!skb)
- goto out;
-
- reply->net = get_net(net);
- reply->portid = portid;
- reply->skb = skb;
+ reply->skb = audit_make_reply(seq, type, done, multi, payload, size);
+ if (!reply->skb)
+ goto err;
+ reply->net = get_net(sock_net(NETLINK_CB(request_skb).sk));
+ reply->portid = NETLINK_CB(request_skb).portid;
tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply");
- if (!IS_ERR(tsk))
- return;
- kfree_skb(skb);
-out:
- kfree(reply);
+ if (IS_ERR(tsk))
+ goto err;
+
+ return;
+
+err:
+ audit_free_reply(reply);
}
/*
@@ -684,33 +1120,36 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
return err;
}
-static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type)
+static void audit_log_common_recv_msg(struct audit_context *context,
+ struct audit_buffer **ab, u16 msg_type)
{
- int rc = 0;
uid_t uid = from_kuid(&init_user_ns, current_uid());
pid_t pid = task_tgid_nr(current);
if (!audit_enabled && msg_type != AUDIT_USER_AVC) {
*ab = NULL;
- return rc;
+ return;
}
- *ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
+ *ab = audit_log_start(context, GFP_KERNEL, msg_type);
if (unlikely(!*ab))
- return rc;
- audit_log_format(*ab, "pid=%d uid=%u", pid, uid);
+ return;
+ audit_log_format(*ab, "pid=%d uid=%u ", pid, uid);
audit_log_session_info(*ab);
audit_log_task_context(*ab);
+}
- return rc;
+static inline void audit_log_user_recv_msg(struct audit_buffer **ab,
+ u16 msg_type)
+{
+ audit_log_common_recv_msg(NULL, ab, msg_type);
}
-int is_audit_feature_set(int i)
+static int is_audit_feature_set(int i)
{
return af.features & AUDIT_FEATURE_TO_MASK(i);
}
-
static int audit_get_feature(struct sk_buff *skb)
{
u32 seq;
@@ -730,21 +1169,21 @@ static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature
if (audit_enabled == AUDIT_OFF)
return;
- ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_FEATURE_CHANGE);
- audit_log_task_info(ab, current);
+ ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_FEATURE_CHANGE);
+ if (!ab)
+ return;
+ audit_log_task_info(ab);
audit_log_format(ab, " feature=%s old=%u new=%u old_lock=%u new_lock=%u res=%d",
audit_feature_names[which], !!old_feature, !!new_feature,
!!old_lock, !!new_lock, res);
audit_log_end(ab);
}
-static int audit_set_feature(struct sk_buff *skb)
+static int audit_set_feature(struct audit_features *uaf)
{
- struct audit_features *uaf;
int i;
BUILD_BUG_ON(AUDIT_LAST_FEATURE + 1 > ARRAY_SIZE(audit_feature_names));
- uaf = nlmsg_data(nlmsg_hdr(skb));
/* if there is ever a version 2 we should handle that here */
@@ -796,47 +1235,54 @@ static int audit_set_feature(struct sk_buff *skb)
return 0;
}
-static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+static int audit_replace(struct pid *pid)
+{
+ pid_t pvnr;
+ struct sk_buff *skb;
+
+ pvnr = pid_vnr(pid);
+ skb = audit_make_reply(0, AUDIT_REPLACE, 0, 0, &pvnr, sizeof(pvnr));
+ if (!skb)
+ return -ENOMEM;
+ return auditd_send_unicast_skb(skb);
+}
+
+static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
+ bool *ack)
{
u32 seq;
void *data;
+ int data_len;
int err;
struct audit_buffer *ab;
u16 msg_type = nlh->nlmsg_type;
struct audit_sig_info *sig_data;
- char *ctx = NULL;
- u32 len;
+ struct lsm_context lsmctx = { NULL, 0, 0 };
err = audit_netlink_ok(skb, msg_type);
if (err)
return err;
- /* As soon as there's any sign of userspace auditd,
- * start kauditd to talk to it */
- if (!kauditd_task) {
- kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd");
- if (IS_ERR(kauditd_task)) {
- err = PTR_ERR(kauditd_task);
- kauditd_task = NULL;
- return err;
- }
- }
seq = nlh->nlmsg_seq;
data = nlmsg_data(nlh);
+ data_len = nlmsg_len(nlh);
switch (msg_type) {
case AUDIT_GET: {
struct audit_status s;
memset(&s, 0, sizeof(s));
- s.enabled = audit_enabled;
- s.failure = audit_failure;
- s.pid = audit_pid;
- s.rate_limit = audit_rate_limit;
- s.backlog_limit = audit_backlog_limit;
- s.lost = atomic_read(&audit_lost);
- s.backlog = skb_queue_len(&audit_skb_queue);
- s.feature_bitmap = AUDIT_FEATURE_BITMAP_ALL;
- s.backlog_wait_time = audit_backlog_wait_time_master;
+ s.enabled = audit_enabled;
+ s.failure = audit_failure;
+ /* NOTE: use pid_vnr() so the PID is relative to the current
+ * namespace */
+ s.pid = auditd_pid_vnr();
+ s.rate_limit = audit_rate_limit;
+ s.backlog_limit = audit_backlog_limit;
+ s.lost = atomic_read(&audit_lost);
+ s.backlog = skb_queue_len(&audit_queue);
+ s.feature_bitmap = AUDIT_FEATURE_BITMAP_ALL;
+ s.backlog_wait_time = audit_backlog_wait_time;
+ s.backlog_wait_time_actual = atomic_read(&audit_backlog_wait_time_actual);
audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s));
break;
}
@@ -844,7 +1290,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
struct audit_status s;
memset(&s, 0, sizeof(s));
/* guard against past and future API changes */
- memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh)));
+ memcpy(&s, data, min_t(size_t, sizeof(s), data_len));
if (s.mask & AUDIT_STATUS_ENABLED) {
err = audit_set_enabled(s.enabled);
if (err < 0)
@@ -856,15 +1302,65 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
return err;
}
if (s.mask & AUDIT_STATUS_PID) {
- int new_pid = s.pid;
-
- if ((!new_pid) && (task_tgid_vnr(current) != audit_pid))
- return -EACCES;
- if (audit_enabled != AUDIT_OFF)
- audit_log_config_change("audit_pid", new_pid, audit_pid, 1);
- audit_pid = new_pid;
- audit_nlk_portid = NETLINK_CB(skb).portid;
- audit_sock = skb->sk;
+ /* NOTE: we are using the vnr PID functions below
+ * because the s.pid value is relative to the
+ * namespace of the caller; at present this
+ * doesn't matter much since you can really only
+ * run auditd from the initial pid namespace, but
+ * something to keep in mind if this changes */
+ pid_t new_pid = s.pid;
+ pid_t auditd_pid;
+ struct pid *req_pid = task_tgid(current);
+
+ /* Sanity check - PID values must match. Setting
+ * pid to 0 is how auditd ends auditing. */
+ if (new_pid && (new_pid != pid_vnr(req_pid)))
+ return -EINVAL;
+
+ /* test the auditd connection */
+ audit_replace(req_pid);
+
+ auditd_pid = auditd_pid_vnr();
+ if (auditd_pid) {
+ /* replacing a healthy auditd is not allowed */
+ if (new_pid) {
+ audit_log_config_change("audit_pid",
+ new_pid, auditd_pid, 0);
+ return -EEXIST;
+ }
+ /* only current auditd can unregister itself */
+ if (pid_vnr(req_pid) != auditd_pid) {
+ audit_log_config_change("audit_pid",
+ new_pid, auditd_pid, 0);
+ return -EACCES;
+ }
+ }
+
+ if (new_pid) {
+ /* register a new auditd connection */
+ err = auditd_set(req_pid,
+ NETLINK_CB(skb).portid,
+ sock_net(NETLINK_CB(skb).sk),
+ skb, ack);
+ if (audit_enabled != AUDIT_OFF)
+ audit_log_config_change("audit_pid",
+ new_pid,
+ auditd_pid,
+ err ? 0 : 1);
+ if (err)
+ return err;
+
+ /* try to process any backlog */
+ wake_up_interruptible(&kauditd_wait);
+ } else {
+ if (audit_enabled != AUDIT_OFF)
+ audit_log_config_change("audit_pid",
+ new_pid,
+ auditd_pid, 1);
+
+ /* unregister the auditd connection */
+ auditd_reset(NULL);
+ }
}
if (s.mask & AUDIT_STATUS_RATE_LIMIT) {
err = audit_set_rate_limit(s.rate_limit);
@@ -885,6 +1381,18 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
if (err < 0)
return err;
}
+ if (s.mask == AUDIT_STATUS_LOST) {
+ u32 lost = atomic_xchg(&audit_lost, 0);
+
+ audit_log_config_change("lost", 0, lost, 1);
+ return lost;
+ }
+ if (s.mask == AUDIT_STATUS_BACKLOG_WAIT_TIME_ACTUAL) {
+ u32 actual = atomic_xchg(&audit_backlog_wait_time_actual, 0);
+
+ audit_log_config_change("backlog_wait_time_actual", 0, actual, 1);
+ return actual;
+ }
break;
}
case AUDIT_GET_FEATURE:
@@ -893,7 +1401,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
return err;
break;
case AUDIT_SET_FEATURE:
- err = audit_set_feature(skb);
+ if (data_len < sizeof(struct audit_features))
+ return -EINVAL;
+ err = audit_set_feature(data);
if (err)
return err;
break;
@@ -902,62 +1412,66 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2:
if (!audit_enabled && msg_type != AUDIT_USER_AVC)
return 0;
+ /* exit early if there isn't at least one character to print */
+ if (data_len < 2)
+ return -EINVAL;
- err = audit_filter_user(msg_type);
+ err = audit_filter(msg_type, AUDIT_FILTER_USER);
if (err == 1) { /* match or error */
+ char *str = data;
+
err = 0;
if (msg_type == AUDIT_USER_TTY) {
- err = tty_audit_push_current();
+ err = tty_audit_push();
if (err)
break;
}
- mutex_unlock(&audit_cmd_mutex);
- audit_log_common_recv_msg(&ab, msg_type);
- if (msg_type != AUDIT_USER_TTY)
+ audit_log_user_recv_msg(&ab, msg_type);
+ if (msg_type != AUDIT_USER_TTY) {
+ /* ensure NULL termination */
+ str[data_len - 1] = '\0';
audit_log_format(ab, " msg='%.*s'",
AUDIT_MESSAGE_TEXT_MAX,
- (char *)data);
- else {
- int size;
-
+ str);
+ } else {
audit_log_format(ab, " data=");
- size = nlmsg_len(nlh);
- if (size > 0 &&
- ((unsigned char *)data)[size - 1] == '\0')
- size--;
- audit_log_n_untrustedstring(ab, data, size);
+ if (str[data_len - 1] == '\0')
+ data_len--;
+ audit_log_n_untrustedstring(ab, str, data_len);
}
- audit_set_portid(ab, NETLINK_CB(skb).portid);
audit_log_end(ab);
- mutex_lock(&audit_cmd_mutex);
}
break;
case AUDIT_ADD_RULE:
case AUDIT_DEL_RULE:
- if (nlmsg_len(nlh) < sizeof(struct audit_rule_data))
+ if (data_len < sizeof(struct audit_rule_data))
return -EINVAL;
if (audit_enabled == AUDIT_LOCKED) {
- audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE);
- audit_log_format(ab, " audit_enabled=%d res=0", audit_enabled);
+ audit_log_common_recv_msg(audit_context(), &ab,
+ AUDIT_CONFIG_CHANGE);
+ audit_log_format(ab, " op=%s audit_enabled=%d res=0",
+ msg_type == AUDIT_ADD_RULE ?
+ "add_rule" : "remove_rule",
+ audit_enabled);
audit_log_end(ab);
return -EPERM;
}
- err = audit_rule_change(msg_type, NETLINK_CB(skb).portid,
- seq, data, nlmsg_len(nlh));
+ err = audit_rule_change(msg_type, seq, data, data_len);
break;
case AUDIT_LIST_RULES:
err = audit_list_rules_send(skb, seq);
break;
case AUDIT_TRIM:
audit_trim_trees();
- audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE);
+ audit_log_common_recv_msg(audit_context(), &ab,
+ AUDIT_CONFIG_CHANGE);
audit_log_format(ab, " op=trim res=1");
audit_log_end(ab);
break;
case AUDIT_MAKE_EQUIV: {
void *bufp = data;
u32 sizes[2];
- size_t msglen = nlmsg_len(nlh);
+ size_t msglen = data_len;
char *old, *new;
err = -EINVAL;
@@ -980,8 +1494,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
/* OK, here comes... */
err = audit_tag_tree(old, new);
- audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE);
-
+ audit_log_common_recv_msg(audit_context(), &ab,
+ AUDIT_CONFIG_CHANGE);
audit_log_format(ab, " op=make_equiv old=");
audit_log_untrustedstring(ab, old);
audit_log_format(ab, " new=");
@@ -993,63 +1507,65 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
break;
}
case AUDIT_SIGNAL_INFO:
- len = 0;
- if (audit_sig_sid) {
- err = security_secid_to_secctx(audit_sig_sid, &ctx, &len);
- if (err)
+ if (lsmprop_is_set(&audit_sig_lsm)) {
+ err = security_lsmprop_to_secctx(&audit_sig_lsm,
+ &lsmctx, LSM_ID_UNDEF);
+ if (err < 0)
return err;
}
- sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL);
+ sig_data = kmalloc(struct_size(sig_data, ctx, lsmctx.len),
+ GFP_KERNEL);
if (!sig_data) {
- if (audit_sig_sid)
- security_release_secctx(ctx, len);
+ if (lsmprop_is_set(&audit_sig_lsm))
+ security_release_secctx(&lsmctx);
return -ENOMEM;
}
sig_data->uid = from_kuid(&init_user_ns, audit_sig_uid);
sig_data->pid = audit_sig_pid;
- if (audit_sig_sid) {
- memcpy(sig_data->ctx, ctx, len);
- security_release_secctx(ctx, len);
+ if (lsmprop_is_set(&audit_sig_lsm)) {
+ memcpy(sig_data->ctx, lsmctx.context, lsmctx.len);
+ security_release_secctx(&lsmctx);
}
audit_send_reply(skb, seq, AUDIT_SIGNAL_INFO, 0, 0,
- sig_data, sizeof(*sig_data) + len);
+ sig_data, struct_size(sig_data, ctx,
+ lsmctx.len));
kfree(sig_data);
break;
case AUDIT_TTY_GET: {
struct audit_tty_status s;
- struct task_struct *tsk = current;
+ unsigned int t;
- spin_lock(&tsk->sighand->siglock);
- s.enabled = tsk->signal->audit_tty;
- s.log_passwd = tsk->signal->audit_tty_log_passwd;
- spin_unlock(&tsk->sighand->siglock);
+ t = READ_ONCE(current->signal->audit_tty);
+ s.enabled = t & AUDIT_TTY_ENABLE;
+ s.log_passwd = !!(t & AUDIT_TTY_LOG_PASSWD);
audit_send_reply(skb, seq, AUDIT_TTY_GET, 0, 0, &s, sizeof(s));
break;
}
case AUDIT_TTY_SET: {
struct audit_tty_status s, old;
- struct task_struct *tsk = current;
struct audit_buffer *ab;
+ unsigned int t;
memset(&s, 0, sizeof(s));
/* guard against past and future API changes */
- memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh)));
+ memcpy(&s, data, min_t(size_t, sizeof(s), data_len));
/* check if new data is valid */
if ((s.enabled != 0 && s.enabled != 1) ||
(s.log_passwd != 0 && s.log_passwd != 1))
err = -EINVAL;
- spin_lock(&tsk->sighand->siglock);
- old.enabled = tsk->signal->audit_tty;
- old.log_passwd = tsk->signal->audit_tty_log_passwd;
- if (!err) {
- tsk->signal->audit_tty = s.enabled;
- tsk->signal->audit_tty_log_passwd = s.log_passwd;
+ if (err)
+ t = READ_ONCE(current->signal->audit_tty);
+ else {
+ t = s.enabled | (-s.log_passwd & AUDIT_TTY_LOG_PASSWD);
+ t = xchg(&current->signal->audit_tty, t);
}
- spin_unlock(&tsk->sighand->siglock);
+ old.enabled = t & AUDIT_TTY_ENABLE;
+ old.log_passwd = !!(t & AUDIT_TTY_LOG_PASSWD);
- audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE);
+ audit_log_common_recv_msg(audit_context(), &ab,
+ AUDIT_CONFIG_CHANGE);
audit_log_format(ab, " op=tty_set old-enabled=%d new-enabled=%d"
" old-log_passwd=%d new-log_passwd=%d res=%d",
old.enabled, s.enabled, old.log_passwd,
@@ -1065,13 +1581,17 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
return err < 0 ? err : 0;
}
-/*
- * Get message from skb. Each message is processed by audit_receive_msg.
- * Malformed skbs with wrong length are discarded silently.
+/**
+ * audit_receive - receive messages from a netlink control socket
+ * @skb: the message buffer
+ *
+ * Parse the provided skb and deal with any messages that may be present,
+ * malformed skbs are discarded.
*/
-static void audit_receive_skb(struct sk_buff *skb)
+static void audit_receive(struct sk_buff *skb)
{
struct nlmsghdr *nlh;
+ bool ack;
/*
* len MUST be signed for nlmsg_next to be able to dec it below 0
* if the nlmsg_len was not aligned
@@ -1082,65 +1602,117 @@ static void audit_receive_skb(struct sk_buff *skb)
nlh = nlmsg_hdr(skb);
len = skb->len;
+ audit_ctl_lock();
while (nlmsg_ok(nlh, len)) {
- err = audit_receive_msg(skb, nlh);
- /* if err or if this message says it wants a response */
- if (err || (nlh->nlmsg_flags & NLM_F_ACK))
- netlink_ack(skb, nlh, err);
+ ack = nlh->nlmsg_flags & NLM_F_ACK;
+ err = audit_receive_msg(skb, nlh, &ack);
+
+ /* send an ack if the user asked for one and audit_receive_msg
+ * didn't already do it, or if there was an error. */
+ if (ack || err)
+ netlink_ack(skb, nlh, err, NULL);
nlh = nlmsg_next(nlh, &len);
}
+ audit_ctl_unlock();
+
+ /* can't block with the ctrl lock, so penalize the sender now */
+ if (audit_backlog_limit &&
+ (skb_queue_len(&audit_queue) > audit_backlog_limit)) {
+ DECLARE_WAITQUEUE(wait, current);
+
+ /* wake kauditd to try and flush the queue */
+ wake_up_interruptible(&kauditd_wait);
+
+ add_wait_queue_exclusive(&audit_backlog_wait, &wait);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(audit_backlog_wait_time);
+ remove_wait_queue(&audit_backlog_wait, &wait);
+ }
}
-/* Receive messages from netlink socket. */
-static void audit_receive(struct sk_buff *skb)
+/* Log information about who is connecting to the audit multicast socket */
+static void audit_log_multicast(int group, const char *op, int err)
{
- mutex_lock(&audit_cmd_mutex);
- audit_receive_skb(skb);
- mutex_unlock(&audit_cmd_mutex);
+ const struct cred *cred;
+ struct tty_struct *tty;
+ char comm[sizeof(current->comm)];
+ struct audit_buffer *ab;
+
+ if (!audit_enabled)
+ return;
+
+ ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_EVENT_LISTENER);
+ if (!ab)
+ return;
+
+ cred = current_cred();
+ tty = audit_get_tty();
+ audit_log_format(ab, "pid=%u uid=%u auid=%u tty=%s ses=%u",
+ task_tgid_nr(current),
+ from_kuid(&init_user_ns, cred->uid),
+ from_kuid(&init_user_ns, audit_get_loginuid(current)),
+ tty ? tty_name(tty) : "(none)",
+ audit_get_sessionid(current));
+ audit_put_tty(tty);
+ audit_log_task_context(ab); /* subj= */
+ audit_log_format(ab, " comm=");
+ audit_log_untrustedstring(ab, get_task_comm(comm, current));
+ audit_log_d_path_exe(ab, current->mm); /* exe= */
+ audit_log_format(ab, " nl-mcgrp=%d op=%s res=%d", group, op, !err);
+ audit_log_end(ab);
}
/* Run custom bind function on netlink socket group connect or bind requests. */
-static int audit_bind(struct net *net, int group)
+static int audit_multicast_bind(struct net *net, int group)
{
+ int err = 0;
+
if (!capable(CAP_AUDIT_READ))
- return -EPERM;
+ err = -EPERM;
+ audit_log_multicast(group, "connect", err);
+ return err;
+}
- return 0;
+static void audit_multicast_unbind(struct net *net, int group)
+{
+ audit_log_multicast(group, "disconnect", 0);
}
static int __net_init audit_net_init(struct net *net)
{
struct netlink_kernel_cfg cfg = {
.input = audit_receive,
- .bind = audit_bind,
+ .bind = audit_multicast_bind,
+ .unbind = audit_multicast_unbind,
.flags = NL_CFG_F_NONROOT_RECV,
.groups = AUDIT_NLGRP_MAX,
};
struct audit_net *aunet = net_generic(net, audit_net_id);
- aunet->nlsk = netlink_kernel_create(net, NETLINK_AUDIT, &cfg);
- if (aunet->nlsk == NULL) {
+ aunet->sk = netlink_kernel_create(net, NETLINK_AUDIT, &cfg);
+ if (aunet->sk == NULL) {
audit_panic("cannot initialize netlink socket in namespace");
return -ENOMEM;
}
- aunet->nlsk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
+ /* limit the timeout in case auditd is blocked/stopped */
+ aunet->sk->sk_sndtimeo = HZ / 10;
+
return 0;
}
static void __net_exit audit_net_exit(struct net *net)
{
struct audit_net *aunet = net_generic(net, audit_net_id);
- struct sock *sock = aunet->nlsk;
- if (sock == audit_sock) {
- audit_pid = 0;
- audit_sock = NULL;
- }
- RCU_INIT_POINTER(aunet->nlsk, NULL);
- synchronize_net();
- netlink_kernel_release(sock);
+ /* NOTE: you would think that we would want to check the auditd
+ * connection and potentially reset it here if it lives in this
+ * namespace, but since the auditd connection tracking struct holds a
+ * reference to this namespace (see auditd_set()) we are only ever
+ * going to get here after that connection has been released */
+
+ netlink_kernel_release(aunet->sk);
}
static struct pernet_operations audit_net_ops __net_initdata = {
@@ -1158,31 +1730,58 @@ static int __init audit_init(void)
if (audit_initialized == AUDIT_DISABLED)
return 0;
+ audit_buffer_cache = KMEM_CACHE(audit_buffer, SLAB_PANIC);
+
+ skb_queue_head_init(&audit_queue);
+ skb_queue_head_init(&audit_retry_queue);
+ skb_queue_head_init(&audit_hold_queue);
+
+ for (i = 0; i < AUDIT_INODE_BUCKETS; i++)
+ INIT_LIST_HEAD(&audit_inode_hash[i]);
+
+ mutex_init(&audit_cmd_mutex.lock);
+ audit_cmd_mutex.owner = NULL;
+
pr_info("initializing netlink subsys (%s)\n",
- audit_default ? "enabled" : "disabled");
+ str_enabled_disabled(audit_default));
register_pernet_subsys(&audit_net_ops);
- skb_queue_head_init(&audit_skb_queue);
- skb_queue_head_init(&audit_skb_hold_queue);
audit_initialized = AUDIT_INITIALIZED;
- audit_enabled = audit_default;
- audit_ever_enabled |= !!audit_default;
- audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized");
+ kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd");
+ if (IS_ERR(kauditd_task)) {
+ int err = PTR_ERR(kauditd_task);
+ panic("audit: failed to start the kauditd thread (%d)\n", err);
+ }
- for (i = 0; i < AUDIT_INODE_BUCKETS; i++)
- INIT_LIST_HEAD(&audit_inode_hash[i]);
+ audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL,
+ "state=initialized audit_enabled=%u res=1",
+ audit_enabled);
return 0;
}
-__initcall(audit_init);
+postcore_initcall(audit_init);
-/* Process kernel command-line parameter at boot time. audit=0 or audit=1. */
+/*
+ * Process kernel command-line parameter at boot time.
+ * audit={0|off} or audit={1|on}.
+ */
static int __init audit_enable(char *str)
{
- audit_default = !!simple_strtol(str, NULL, 0);
- if (!audit_default)
+ if (!strcasecmp(str, "off") || !strcmp(str, "0"))
+ audit_default = AUDIT_OFF;
+ else if (!strcasecmp(str, "on") || !strcmp(str, "1"))
+ audit_default = AUDIT_ON;
+ else {
+ pr_err("audit: invalid 'audit' parameter value (%s)\n", str);
+ audit_default = AUDIT_ON;
+ }
+
+ if (audit_default == AUDIT_OFF)
audit_initialized = AUDIT_DISABLED;
+ if (audit_set_enabled(audit_default))
+ pr_err("audit: error setting audit state (%d)\n",
+ audit_default);
pr_info("%s\n", audit_default ?
"enabled (after initialization)" : "disabled (until reboot)");
@@ -1213,62 +1812,41 @@ __setup("audit_backlog_limit=", audit_backlog_limit_set);
static void audit_buffer_free(struct audit_buffer *ab)
{
- unsigned long flags;
+ struct sk_buff *skb;
if (!ab)
return;
- if (ab->skb)
- kfree_skb(ab->skb);
-
- spin_lock_irqsave(&audit_freelist_lock, flags);
- if (audit_freelist_count > AUDIT_MAXFREE)
- kfree(ab);
- else {
- audit_freelist_count++;
- list_add(&ab->list, &audit_freelist);
- }
- spin_unlock_irqrestore(&audit_freelist_lock, flags);
+ while ((skb = skb_dequeue(&ab->skb_list)))
+ kfree_skb(skb);
+ kmem_cache_free(audit_buffer_cache, ab);
}
-static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx,
- gfp_t gfp_mask, int type)
+static struct audit_buffer *audit_buffer_alloc(struct audit_context *ctx,
+ gfp_t gfp_mask, int type)
{
- unsigned long flags;
- struct audit_buffer *ab = NULL;
- struct nlmsghdr *nlh;
+ struct audit_buffer *ab;
- spin_lock_irqsave(&audit_freelist_lock, flags);
- if (!list_empty(&audit_freelist)) {
- ab = list_entry(audit_freelist.next,
- struct audit_buffer, list);
- list_del(&ab->list);
- --audit_freelist_count;
- }
- spin_unlock_irqrestore(&audit_freelist_lock, flags);
+ ab = kmem_cache_alloc(audit_buffer_cache, gfp_mask);
+ if (!ab)
+ return NULL;
- if (!ab) {
- ab = kmalloc(sizeof(*ab), gfp_mask);
- if (!ab)
- goto err;
- }
-
- ab->ctx = ctx;
- ab->gfp_mask = gfp_mask;
+ skb_queue_head_init(&ab->skb_list);
ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask);
if (!ab->skb)
goto err;
- nlh = nlmsg_put(ab->skb, 0, 0, type, 0, 0);
- if (!nlh)
- goto out_kfree_skb;
+ skb_queue_tail(&ab->skb_list, ab->skb);
+
+ if (!nlmsg_put(ab->skb, 0, 0, type, 0, 0))
+ goto err;
+
+ ab->ctx = ctx;
+ ab->gfp_mask = gfp_mask;
return ab;
-out_kfree_skb:
- kfree_skb(ab->skb);
- ab->skb = NULL;
err:
audit_buffer_free(ab);
return NULL;
@@ -1295,37 +1873,18 @@ unsigned int audit_serial(void)
{
static atomic_t serial = ATOMIC_INIT(0);
- return atomic_add_return(1, &serial);
+ return atomic_inc_return(&serial);
}
static inline void audit_get_stamp(struct audit_context *ctx,
- struct timespec *t, unsigned int *serial)
+ struct audit_stamp *stamp)
{
- if (!ctx || !auditsc_get_stamp(ctx, t, serial)) {
- *t = CURRENT_TIME;
- *serial = audit_serial();
+ if (!ctx || !auditsc_get_stamp(ctx, stamp)) {
+ ktime_get_coarse_real_ts64(&stamp->ctime);
+ stamp->serial = audit_serial();
}
}
-/*
- * Wait for auditd to drain the queue a little
- */
-static long wait_for_auditd(long sleep_time)
-{
- DECLARE_WAITQUEUE(wait, current);
- set_current_state(TASK_UNINTERRUPTIBLE);
- add_wait_queue_exclusive(&audit_backlog_wait, &wait);
-
- if (audit_backlog_limit &&
- skb_queue_len(&audit_skb_queue) > audit_backlog_limit)
- sleep_time = schedule_timeout(sleep_time);
-
- __set_current_state(TASK_RUNNING);
- remove_wait_queue(&audit_backlog_wait, &wait);
-
- return sleep_time;
-}
-
/**
* audit_log_start - obtain an audit buffer
* @ctx: audit_context (may be NULL)
@@ -1344,61 +1903,70 @@ static long wait_for_auditd(long sleep_time)
struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
int type)
{
- struct audit_buffer *ab = NULL;
- struct timespec t;
- unsigned int uninitialized_var(serial);
- int reserve = 5; /* Allow atomic callers to go up to five
- entries over the normal backlog limit */
- unsigned long timeout_start = jiffies;
+ struct audit_buffer *ab;
if (audit_initialized != AUDIT_INITIALIZED)
return NULL;
- if (unlikely(audit_filter_type(type)))
+ if (unlikely(!audit_filter(type, AUDIT_FILTER_EXCLUDE)))
return NULL;
- if (gfp_mask & __GFP_WAIT) {
- if (audit_pid && audit_pid == current->pid)
- gfp_mask &= ~__GFP_WAIT;
- else
- reserve = 0;
- }
+ /* NOTE: don't ever fail/sleep on these two conditions:
+ * 1. auditd generated record - since we need auditd to drain the
+ * queue; also, when we are checking for auditd, compare PIDs using
+ * task_tgid_vnr() since auditd_pid is set in audit_receive_msg()
+ * using a PID anchored in the caller's namespace
+ * 2. generator holding the audit_cmd_mutex - we don't want to block
+ * while holding the mutex, although we do penalize the sender
+ * later in audit_receive() when it is safe to block
+ */
+ if (!(auditd_test_task(current) || audit_ctl_owner_current())) {
+ long stime = audit_backlog_wait_time;
- while (audit_backlog_limit
- && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) {
- if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time) {
- long sleep_time;
+ while (audit_backlog_limit &&
+ (skb_queue_len(&audit_queue) > audit_backlog_limit)) {
+ /* wake kauditd to try and flush the queue */
+ wake_up_interruptible(&kauditd_wait);
- sleep_time = timeout_start + audit_backlog_wait_time - jiffies;
- if (sleep_time > 0) {
- sleep_time = wait_for_auditd(sleep_time);
- if (sleep_time > 0)
- continue;
+ /* sleep if we are allowed and we haven't exhausted our
+ * backlog wait limit */
+ if (gfpflags_allow_blocking(gfp_mask) && (stime > 0)) {
+ long rtime = stime;
+
+ DECLARE_WAITQUEUE(wait, current);
+
+ add_wait_queue_exclusive(&audit_backlog_wait,
+ &wait);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ stime = schedule_timeout(rtime);
+ atomic_add(rtime - stime, &audit_backlog_wait_time_actual);
+ remove_wait_queue(&audit_backlog_wait, &wait);
+ } else {
+ if (audit_rate_check() && printk_ratelimit())
+ pr_warn("audit_backlog=%d > audit_backlog_limit=%d\n",
+ skb_queue_len(&audit_queue),
+ audit_backlog_limit);
+ audit_log_lost("backlog limit exceeded");
+ return NULL;
}
}
- if (audit_rate_check() && printk_ratelimit())
- pr_warn("audit_backlog=%d > audit_backlog_limit=%d\n",
- skb_queue_len(&audit_skb_queue),
- audit_backlog_limit);
- audit_log_lost("backlog limit exceeded");
- audit_backlog_wait_time = audit_backlog_wait_overflow;
- wake_up(&audit_backlog_wait);
- return NULL;
}
- if (!reserve)
- audit_backlog_wait_time = audit_backlog_wait_time_master;
-
ab = audit_buffer_alloc(ctx, gfp_mask, type);
if (!ab) {
audit_log_lost("out of memory in audit_log_start");
return NULL;
}
- audit_get_stamp(ab->ctx, &t, &serial);
+ audit_get_stamp(ab->ctx, &ab->stamp);
+ /* cancel dummy context to enable supporting records */
+ if (ctx)
+ ctx->dummy = 0;
+ audit_log_format(ab, "audit(%llu.%03lu:%u): ",
+ (unsigned long long)ab->stamp.ctime.tv_sec,
+ ab->stamp.ctime.tv_nsec/1000000,
+ ab->stamp.serial);
- audit_log_format(ab, "audit(%lu.%03lu:%u): ",
- t.tv_sec, t.tv_nsec/1000000, serial);
return ab;
}
@@ -1432,8 +2000,8 @@ static inline int audit_expand(struct audit_buffer *ab, int extra)
* will be called a second time. Currently, we assume that a printk
* can't format message larger than 1024 bytes, so we don't either.
*/
-static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
- va_list args)
+static __printf(2, 0)
+void audit_log_vformat(struct audit_buffer *ab, const char *fmt, va_list args)
{
int len, avail;
struct sk_buff *skb;
@@ -1490,7 +2058,7 @@ void audit_log_format(struct audit_buffer *ab, const char *fmt, ...)
}
/**
- * audit_log_hex - convert a buffer to hex and append it to the audit skb
+ * audit_log_n_hex - convert a buffer to hex and append it to the audit skb
* @ab: the audit_buffer
* @buf: buffer to convert to hex
* @len: length of @buf to be converted
@@ -1566,21 +2134,21 @@ void audit_log_n_string(struct audit_buffer *ab, const char *string,
* @string: string to be checked
* @len: max length of the string to check
*/
-int audit_string_contains_control(const char *string, size_t len)
+bool audit_string_contains_control(const char *string, size_t len)
{
const unsigned char *p;
for (p = string; p < (const unsigned char *)string + len; p++) {
if (*p == '"' || *p < 0x21 || *p > 0x7e)
- return 1;
+ return true;
}
- return 0;
+ return false;
}
/**
* audit_log_n_untrustedstring - log a string that may contain random characters
* @ab: audit_buffer
- * @len: length of string (not including trailing null)
* @string: string to be logged
+ * @len: length of string (not including trailing null)
*
* This code will escape a string that is passed to it if the string
* contains a control character, unprintable character, double quote mark,
@@ -1624,13 +2192,13 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
/* We will allow 11 spaces for ' (deleted)' to be appended */
pathname = kmalloc(PATH_MAX+11, ab->gfp_mask);
if (!pathname) {
- audit_log_string(ab, "<no_memory>");
+ audit_log_format(ab, "\"<no_memory>\"");
return;
}
p = d_path(path, pathname, PATH_MAX+11);
if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */
/* FIXME: can we save some information here? */
- audit_log_string(ab, "<too_long>");
+ audit_log_format(ab, "\"<too_long>\"");
} else
audit_log_untrustedstring(ab, p);
kfree(pathname);
@@ -1641,7 +2209,7 @@ void audit_log_session_info(struct audit_buffer *ab)
unsigned int sessionid = audit_get_sessionid(current);
uid_t auid = from_kuid(&init_user_ns, audit_get_loginuid(current));
- audit_log_format(ab, " auid=%u ses=%u", auid, sessionid);
+ audit_log_format(ab, "auid=%u ses=%u", auid, sessionid);
}
void audit_log_key(struct audit_buffer *ab, char *key)
@@ -1653,191 +2221,178 @@ void audit_log_key(struct audit_buffer *ab, char *key)
audit_log_format(ab, "(null)");
}
-void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap)
-{
- int i;
-
- audit_log_format(ab, " %s=", prefix);
- CAP_FOR_EACH_U32(i) {
- audit_log_format(ab, "%08x",
- cap->cap[CAP_LAST_U32 - i]);
- }
-}
-
-static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
-{
- kernel_cap_t *perm = &name->fcap.permitted;
- kernel_cap_t *inh = &name->fcap.inheritable;
- int log = 0;
-
- if (!cap_isclear(*perm)) {
- audit_log_cap(ab, "cap_fp", perm);
- log = 1;
- }
- if (!cap_isclear(*inh)) {
- audit_log_cap(ab, "cap_fi", inh);
- log = 1;
- }
-
- if (log)
- audit_log_format(ab, " cap_fe=%d cap_fver=%x",
- name->fcap.fE, name->fcap_ver);
-}
-
-static inline int audit_copy_fcaps(struct audit_names *name,
- const struct dentry *dentry)
+/**
+ * audit_buffer_aux_new - Add an aux record buffer to the skb list
+ * @ab: audit_buffer
+ * @type: message type
+ *
+ * Aux records are allocated and added to the skb list of
+ * the "main" record. The ab->skb is reset to point to the
+ * aux record on its creation. When the aux record in complete
+ * ab->skb has to be reset to point to the "main" record.
+ * This allows the audit_log_ functions to be ignorant of
+ * which kind of record it is logging to. It also avoids adding
+ * special data for aux records.
+ *
+ * On success ab->skb will point to the new aux record.
+ * Returns 0 on success, -ENOMEM should allocation fail.
+ */
+static int audit_buffer_aux_new(struct audit_buffer *ab, int type)
{
- struct cpu_vfs_cap_data caps;
- int rc;
+ WARN_ON(ab->skb != skb_peek(&ab->skb_list));
- if (!dentry)
- return 0;
-
- rc = get_vfs_caps_from_disk(dentry, &caps);
- if (rc)
- return rc;
+ ab->skb = nlmsg_new(AUDIT_BUFSIZ, ab->gfp_mask);
+ if (!ab->skb)
+ goto err;
+ if (!nlmsg_put(ab->skb, 0, 0, type, 0, 0))
+ goto err;
+ skb_queue_tail(&ab->skb_list, ab->skb);
- name->fcap.permitted = caps.permitted;
- name->fcap.inheritable = caps.inheritable;
- name->fcap.fE = !!(caps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE);
- name->fcap_ver = (caps.magic_etc & VFS_CAP_REVISION_MASK) >>
- VFS_CAP_REVISION_SHIFT;
+ audit_log_format(ab, "audit(%llu.%03lu:%u): ",
+ (unsigned long long)ab->stamp.ctime.tv_sec,
+ ab->stamp.ctime.tv_nsec/1000000,
+ ab->stamp.serial);
return 0;
+
+err:
+ kfree_skb(ab->skb);
+ ab->skb = skb_peek(&ab->skb_list);
+ return -ENOMEM;
}
-/* Copy inode data into an audit_names. */
-void audit_copy_inode(struct audit_names *name, const struct dentry *dentry,
- const struct inode *inode)
+/**
+ * audit_buffer_aux_end - Switch back to the "main" record from an aux record
+ * @ab: audit_buffer
+ *
+ * Restores the "main" audit record to ab->skb.
+ */
+static void audit_buffer_aux_end(struct audit_buffer *ab)
{
- name->ino = inode->i_ino;
- name->dev = inode->i_sb->s_dev;
- name->mode = inode->i_mode;
- name->uid = inode->i_uid;
- name->gid = inode->i_gid;
- name->rdev = inode->i_rdev;
- security_inode_getsecid(inode, &name->osid);
- audit_copy_fcaps(name, dentry);
+ ab->skb = skb_peek(&ab->skb_list);
}
/**
- * audit_log_name - produce AUDIT_PATH record from struct audit_names
- * @context: audit_context for the task
- * @n: audit_names structure with reportable details
- * @path: optional path to report instead of audit_names->name
- * @record_num: record number to report when handling a list of names
- * @call_panic: optional pointer to int that will be updated if secid fails
+ * audit_log_subj_ctx - Add LSM subject information
+ * @ab: audit_buffer
+ * @prop: LSM subject properties.
+ *
+ * Add a subj= field and, if necessary, a AUDIT_MAC_TASK_CONTEXTS record.
*/
-void audit_log_name(struct audit_context *context, struct audit_names *n,
- struct path *path, int record_num, int *call_panic)
+int audit_log_subj_ctx(struct audit_buffer *ab, struct lsm_prop *prop)
{
- struct audit_buffer *ab;
- ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
- if (!ab)
- return;
+ struct lsm_context ctx;
+ char *space = "";
+ int error;
+ int i;
- audit_log_format(ab, "item=%d", record_num);
+ security_current_getlsmprop_subj(prop);
+ if (!lsmprop_is_set(prop))
+ return 0;
- if (path)
- audit_log_d_path(ab, " name=", path);
- else if (n->name) {
- switch (n->name_len) {
- case AUDIT_NAME_FULL:
- /* log the full path */
- audit_log_format(ab, " name=");
- audit_log_untrustedstring(ab, n->name->name);
- break;
- case 0:
- /* name was specified as a relative path and the
- * directory component is the cwd */
- audit_log_d_path(ab, " name=", &context->pwd);
- break;
- default:
- /* log the name's directory component */
- audit_log_format(ab, " name=");
- audit_log_n_untrustedstring(ab, n->name->name,
- n->name_len);
+ if (audit_subj_secctx_cnt < 2) {
+ error = security_lsmprop_to_secctx(prop, &ctx, LSM_ID_UNDEF);
+ if (error < 0) {
+ if (error != -EINVAL)
+ goto error_path;
+ return 0;
}
- } else
- audit_log_format(ab, " name=(null)");
-
- if (n->ino != (unsigned long)-1)
- audit_log_format(ab, " inode=%lu"
- " dev=%02x:%02x mode=%#ho"
- " ouid=%u ogid=%u rdev=%02x:%02x",
- n->ino,
- MAJOR(n->dev),
- MINOR(n->dev),
- n->mode,
- from_kuid(&init_user_ns, n->uid),
- from_kgid(&init_user_ns, n->gid),
- MAJOR(n->rdev),
- MINOR(n->rdev));
- if (n->osid != 0) {
- char *ctx = NULL;
- u32 len;
- if (security_secid_to_secctx(
- n->osid, &ctx, &len)) {
- audit_log_format(ab, " osid=%u", n->osid);
- if (call_panic)
- *call_panic = 2;
+ audit_log_format(ab, " subj=%s", ctx.context);
+ security_release_secctx(&ctx);
+ return 0;
+ }
+ /* Multiple LSMs provide contexts. Include an aux record. */
+ audit_log_format(ab, " subj=?");
+ error = audit_buffer_aux_new(ab, AUDIT_MAC_TASK_CONTEXTS);
+ if (error)
+ goto error_path;
+
+ for (i = 0; i < audit_subj_secctx_cnt; i++) {
+ error = security_lsmprop_to_secctx(prop, &ctx,
+ audit_subj_lsms[i]->id);
+ if (error < 0) {
+ /*
+ * Don't print anything. An LSM like BPF could
+ * claim to support contexts, but only do so under
+ * certain conditions.
+ */
+ if (error == -EOPNOTSUPP)
+ continue;
+ if (error != -EINVAL)
+ audit_panic("error in audit_log_subj_ctx");
} else {
- audit_log_format(ab, " obj=%s", ctx);
- security_release_secctx(ctx, len);
+ audit_log_format(ab, "%ssubj_%s=%s", space,
+ audit_subj_lsms[i]->name, ctx.context);
+ space = " ";
+ security_release_secctx(&ctx);
}
}
+ audit_buffer_aux_end(ab);
+ return 0;
- /* log the audit_names record type */
- audit_log_format(ab, " nametype=");
- switch(n->type) {
- case AUDIT_TYPE_NORMAL:
- audit_log_format(ab, "NORMAL");
- break;
- case AUDIT_TYPE_PARENT:
- audit_log_format(ab, "PARENT");
- break;
- case AUDIT_TYPE_CHILD_DELETE:
- audit_log_format(ab, "DELETE");
- break;
- case AUDIT_TYPE_CHILD_CREATE:
- audit_log_format(ab, "CREATE");
- break;
- default:
- audit_log_format(ab, "UNKNOWN");
- break;
- }
-
- audit_log_fcaps(ab, n);
- audit_log_end(ab);
+error_path:
+ audit_panic("error in audit_log_subj_ctx");
+ return error;
}
+EXPORT_SYMBOL(audit_log_subj_ctx);
int audit_log_task_context(struct audit_buffer *ab)
{
- char *ctx = NULL;
- unsigned len;
- int error;
- u32 sid;
+ struct lsm_prop prop;
- security_task_getsecid(current, &sid);
- if (!sid)
- return 0;
+ security_current_getlsmprop_subj(&prop);
+ return audit_log_subj_ctx(ab, &prop);
+}
+EXPORT_SYMBOL(audit_log_task_context);
- error = security_secid_to_secctx(sid, &ctx, &len);
- if (error) {
- if (error != -EINVAL)
- goto error_path;
+int audit_log_obj_ctx(struct audit_buffer *ab, struct lsm_prop *prop)
+{
+ int i;
+ int rc;
+ int error = 0;
+ char *space = "";
+ struct lsm_context ctx;
+
+ if (audit_obj_secctx_cnt < 2) {
+ error = security_lsmprop_to_secctx(prop, &ctx, LSM_ID_UNDEF);
+ if (error < 0) {
+ if (error != -EINVAL)
+ goto error_path;
+ return error;
+ }
+ audit_log_format(ab, " obj=%s", ctx.context);
+ security_release_secctx(&ctx);
return 0;
}
+ audit_log_format(ab, " obj=?");
+ error = audit_buffer_aux_new(ab, AUDIT_MAC_OBJ_CONTEXTS);
+ if (error)
+ goto error_path;
+
+ for (i = 0; i < audit_obj_secctx_cnt; i++) {
+ rc = security_lsmprop_to_secctx(prop, &ctx,
+ audit_obj_lsms[i]->id);
+ if (rc < 0) {
+ audit_log_format(ab, "%sobj_%s=?", space,
+ audit_obj_lsms[i]->name);
+ if (rc != -EINVAL)
+ audit_panic("error in audit_log_obj_ctx");
+ error = rc;
+ } else {
+ audit_log_format(ab, "%sobj_%s=%s", space,
+ audit_obj_lsms[i]->name, ctx.context);
+ security_release_secctx(&ctx);
+ }
+ space = " ";
+ }
- audit_log_format(ab, " subj=%s", ctx);
- security_release_secctx(ctx, len);
- return 0;
+ audit_buffer_aux_end(ab);
+ return error;
error_path:
- audit_panic("error in audit_log_task_context");
+ audit_panic("error in audit_log_obj_ctx");
return error;
}
-EXPORT_SYMBOL(audit_log_task_context);
void audit_log_d_path_exe(struct audit_buffer *ab,
struct mm_struct *mm)
@@ -1858,32 +2413,41 @@ out_null:
audit_log_format(ab, " exe=(null)");
}
-void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
+struct tty_struct *audit_get_tty(void)
+{
+ struct tty_struct *tty = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&current->sighand->siglock, flags);
+ if (current->signal)
+ tty = tty_kref_get(current->signal->tty);
+ spin_unlock_irqrestore(&current->sighand->siglock, flags);
+ return tty;
+}
+
+void audit_put_tty(struct tty_struct *tty)
+{
+ tty_kref_put(tty);
+}
+
+void audit_log_task_info(struct audit_buffer *ab)
{
const struct cred *cred;
- char comm[sizeof(tsk->comm)];
- char *tty;
+ char comm[sizeof(current->comm)];
+ struct tty_struct *tty;
if (!ab)
return;
- /* tsk == current */
cred = current_cred();
-
- spin_lock_irq(&tsk->sighand->siglock);
- if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name)
- tty = tsk->signal->tty->name;
- else
- tty = "(none)";
- spin_unlock_irq(&tsk->sighand->siglock);
-
+ tty = audit_get_tty();
audit_log_format(ab,
" ppid=%d pid=%d auid=%u uid=%u gid=%u"
" euid=%u suid=%u fsuid=%u"
" egid=%u sgid=%u fsgid=%u tty=%s ses=%u",
- task_ppid_nr(tsk),
- task_pid_nr(tsk),
- from_kuid(&init_user_ns, audit_get_loginuid(tsk)),
+ task_ppid_nr(current),
+ task_tgid_nr(current),
+ from_kuid(&init_user_ns, audit_get_loginuid(current)),
from_kuid(&init_user_ns, cred->uid),
from_kgid(&init_user_ns, cred->gid),
from_kuid(&init_user_ns, cred->euid),
@@ -1892,88 +2456,194 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
from_kgid(&init_user_ns, cred->egid),
from_kgid(&init_user_ns, cred->sgid),
from_kgid(&init_user_ns, cred->fsgid),
- tty, audit_get_sessionid(tsk));
-
+ tty ? tty_name(tty) : "(none)",
+ audit_get_sessionid(current));
+ audit_put_tty(tty);
audit_log_format(ab, " comm=");
- audit_log_untrustedstring(ab, get_task_comm(comm, tsk));
-
- audit_log_d_path_exe(ab, tsk->mm);
+ audit_log_untrustedstring(ab, get_task_comm(comm, current));
+ audit_log_d_path_exe(ab, current->mm);
audit_log_task_context(ab);
}
EXPORT_SYMBOL(audit_log_task_info);
/**
- * audit_log_link_denied - report a link restriction denial
- * @operation: specific link opreation
- * @link: the path that triggered the restriction
+ * audit_log_path_denied - report a path restriction denial
+ * @type: audit message type (AUDIT_ANOM_LINK, AUDIT_ANOM_CREAT, etc)
+ * @operation: specific operation name
*/
-void audit_log_link_denied(const char *operation, struct path *link)
+void audit_log_path_denied(int type, const char *operation)
{
struct audit_buffer *ab;
- struct audit_names *name;
- name = kzalloc(sizeof(*name), GFP_NOFS);
- if (!name)
+ if (!audit_enabled)
return;
- /* Generate AUDIT_ANOM_LINK with subject, operation, outcome. */
- ab = audit_log_start(current->audit_context, GFP_KERNEL,
- AUDIT_ANOM_LINK);
+ /* Generate log with subject, operation, outcome. */
+ ab = audit_log_start(audit_context(), GFP_KERNEL, type);
if (!ab)
- goto out;
+ return;
audit_log_format(ab, "op=%s", operation);
- audit_log_task_info(ab, current);
+ audit_log_task_info(ab);
audit_log_format(ab, " res=0");
audit_log_end(ab);
+}
+
+/* global counter which is incremented every time something logs in */
+static atomic_t session_id = ATOMIC_INIT(0);
+
+static int audit_set_loginuid_perm(kuid_t loginuid)
+{
+ /* if we are unset, we don't need privs */
+ if (!audit_loginuid_set(current))
+ return 0;
+ /* if AUDIT_FEATURE_LOGINUID_IMMUTABLE means never ever allow a change*/
+ if (is_audit_feature_set(AUDIT_FEATURE_LOGINUID_IMMUTABLE))
+ return -EPERM;
+ /* it is set, you need permission */
+ if (!capable(CAP_AUDIT_CONTROL))
+ return -EPERM;
+ /* reject if this is not an unset and we don't allow that */
+ if (is_audit_feature_set(AUDIT_FEATURE_ONLY_UNSET_LOGINUID)
+ && uid_valid(loginuid))
+ return -EPERM;
+ return 0;
+}
- /* Generate AUDIT_PATH record with object. */
- name->type = AUDIT_TYPE_NORMAL;
- audit_copy_inode(name, link->dentry, d_backing_inode(link->dentry));
- audit_log_name(current->audit_context, name, link, 0, NULL);
+static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,
+ unsigned int oldsessionid,
+ unsigned int sessionid, int rc)
+{
+ struct audit_buffer *ab;
+ uid_t uid, oldloginuid, loginuid;
+ struct tty_struct *tty;
+
+ if (!audit_enabled)
+ return;
+
+ ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_LOGIN);
+ if (!ab)
+ return;
+
+ uid = from_kuid(&init_user_ns, task_uid(current));
+ oldloginuid = from_kuid(&init_user_ns, koldloginuid);
+ loginuid = from_kuid(&init_user_ns, kloginuid);
+ tty = audit_get_tty();
+
+ audit_log_format(ab, "pid=%d uid=%u", task_tgid_nr(current), uid);
+ audit_log_task_context(ab);
+ audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d",
+ oldloginuid, loginuid, tty ? tty_name(tty) : "(none)",
+ oldsessionid, sessionid, !rc);
+ audit_put_tty(tty);
+ audit_log_end(ab);
+}
+
+/**
+ * audit_set_loginuid - set current task's loginuid
+ * @loginuid: loginuid value
+ *
+ * Returns 0.
+ *
+ * Called (set) from fs/proc/base.c::proc_loginuid_write().
+ */
+int audit_set_loginuid(kuid_t loginuid)
+{
+ unsigned int oldsessionid, sessionid = AUDIT_SID_UNSET;
+ kuid_t oldloginuid;
+ int rc;
+
+ oldloginuid = audit_get_loginuid(current);
+ oldsessionid = audit_get_sessionid(current);
+
+ rc = audit_set_loginuid_perm(loginuid);
+ if (rc)
+ goto out;
+
+ /* are we setting or clearing? */
+ if (uid_valid(loginuid)) {
+ sessionid = (unsigned int)atomic_inc_return(&session_id);
+ if (unlikely(sessionid == AUDIT_SID_UNSET))
+ sessionid = (unsigned int)atomic_inc_return(&session_id);
+ }
+
+ current->sessionid = sessionid;
+ current->loginuid = loginuid;
out:
- kfree(name);
+ audit_log_set_loginuid(oldloginuid, loginuid, oldsessionid, sessionid, rc);
+ return rc;
+}
+
+/**
+ * audit_signal_info - record signal info for shutting down audit subsystem
+ * @sig: signal value
+ * @t: task being signaled
+ *
+ * If the audit subsystem is being terminated, record the task (pid)
+ * and uid that is doing that.
+ */
+int audit_signal_info(int sig, struct task_struct *t)
+{
+ kuid_t uid = current_uid(), auid;
+
+ if (auditd_test_task(t) &&
+ (sig == SIGTERM || sig == SIGHUP ||
+ sig == SIGUSR1 || sig == SIGUSR2)) {
+ audit_sig_pid = task_tgid_nr(current);
+ auid = audit_get_loginuid(current);
+ if (uid_valid(auid))
+ audit_sig_uid = auid;
+ else
+ audit_sig_uid = uid;
+ security_current_getlsmprop_subj(&audit_sig_lsm);
+ }
+
+ return audit_signal_info_syscall(t);
+}
+
+/**
+ * __audit_log_end - enqueue one audit record
+ * @skb: the buffer to send
+ */
+static void __audit_log_end(struct sk_buff *skb)
+{
+ struct nlmsghdr *nlh;
+
+ if (audit_rate_check()) {
+ /* setup the netlink header, see the comments in
+ * kauditd_send_multicast_skb() for length quirks */
+ nlh = nlmsg_hdr(skb);
+ nlh->nlmsg_len = skb->len - NLMSG_HDRLEN;
+
+ /* queue the netlink packet */
+ skb_queue_tail(&audit_queue, skb);
+ } else {
+ audit_log_lost("rate limit exceeded");
+ kfree_skb(skb);
+ }
}
/**
* audit_log_end - end one audit record
* @ab: the audit_buffer
*
- * netlink_unicast() cannot be called inside an irq context because it blocks
- * (last arg, flags, is not set to MSG_DONTWAIT), so the audit buffer is placed
- * on a queue and a tasklet is scheduled to remove them from the queue outside
- * the irq context. May be called in any context.
+ * We can not do a netlink send inside an irq context because it blocks (last
+ * arg, flags, is not set to MSG_DONTWAIT), so the audit buffer is placed on a
+ * queue and a kthread is scheduled to remove them from the queue outside the
+ * irq context. May be called in any context.
*/
void audit_log_end(struct audit_buffer *ab)
{
+ struct sk_buff *skb;
+
if (!ab)
return;
- if (!audit_rate_check()) {
- audit_log_lost("rate limit exceeded");
- } else {
- struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
-
- nlh->nlmsg_len = ab->skb->len;
- kauditd_send_multicast_skb(ab->skb, ab->gfp_mask);
-
- /*
- * The original kaudit unicast socket sends up messages with
- * nlmsg_len set to the payload length rather than the entire
- * message length. This breaks the standard set by netlink.
- * The existing auditd daemon assumes this breakage. Fixing
- * this would require co-ordinating a change in the established
- * protocol between the kaudit kernel subsystem and the auditd
- * userspace code.
- */
- nlh->nlmsg_len -= NLMSG_HDRLEN;
-
- if (audit_pid) {
- skb_queue_tail(&audit_skb_queue, ab->skb);
- wake_up_interruptible(&kauditd_wait);
- } else {
- audit_printk_skb(ab->skb);
- }
- ab->skb = NULL;
- }
+
+ while ((skb = skb_dequeue(&ab->skb_list)))
+ __audit_log_end(skb);
+
+ /* poke the kauditd thread */
+ wake_up_interruptible(&kauditd_wait);
+
audit_buffer_free(ab);
}
@@ -2004,32 +2674,6 @@ void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
}
}
-#ifdef CONFIG_SECURITY
-/**
- * audit_log_secctx - Converts and logs SELinux context
- * @ab: audit_buffer
- * @secid: security number
- *
- * This is a helper function that calls security_secid_to_secctx to convert
- * secid to secctx and then adds the (converted) SELinux context to the audit
- * log by calling audit_log_format, thus also preventing leak of internal secid
- * to userspace. If secid cannot be converted audit_panic is called.
- */
-void audit_log_secctx(struct audit_buffer *ab, u32 secid)
-{
- u32 len;
- char *secctx;
-
- if (security_secid_to_secctx(secid, &secctx, &len)) {
- audit_panic("Cannot convert secid to context");
- } else {
- audit_log_format(ab, " obj=%s", secctx);
- security_release_secctx(secctx, len);
- }
-}
-EXPORT_SYMBOL(audit_log_secctx);
-#endif
-
EXPORT_SYMBOL(audit_log_start);
EXPORT_SYMBOL(audit_log_end);
EXPORT_SYMBOL(audit_log_format);
diff --git a/kernel/audit.h b/kernel/audit.h
index d641f9bb3ed0..7c401729e21b 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -1,28 +1,21 @@
-/* audit -- definition of audit_context structure and supporting types
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* audit -- definition of audit_context structure and supporting types
*
* Copyright 2003-2004 Red Hat, Inc.
* Copyright 2005 Hewlett-Packard Development Company, L.P.
* Copyright 2005 IBM Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
+#ifndef _KERNEL_AUDIT_H_
+#define _KERNEL_AUDIT_H_
+
#include <linux/fs.h>
#include <linux/audit.h>
+#include <linux/security.h>
#include <linux/skbuff.h>
#include <uapi/linux/mqueue.h>
+#include <linux/tty.h>
+#include <uapi/linux/openat2.h> // struct open_how
/* AUDIT_NAMES is the number of slots we reserve in the audit_context
* for saving names from getname(). If we get more names we will allocate
@@ -33,16 +26,16 @@
a per-task filter. At syscall entry, the audit_state is augmented by
the syscall filter. */
enum audit_state {
- AUDIT_DISABLED, /* Do not create per-task audit_context.
+ AUDIT_STATE_DISABLED, /* Do not create per-task audit_context.
* No syscall-specific audit records can
* be generated. */
- AUDIT_BUILD_CONTEXT, /* Create the per-task audit_context,
+ AUDIT_STATE_BUILD, /* Create the per-task audit_context,
* and fill it in at syscall
* entry time. This makes a full
* syscall record available if some
* other part of the kernel decides it
* should be recorded. */
- AUDIT_RECORD_CONTEXT /* Create the per-task audit_context,
+ AUDIT_STATE_RECORD /* Create the per-task audit_context,
* always fill it in at syscall entry
* time, and always write out the audit
* record at syscall exit time. */
@@ -50,6 +43,7 @@ enum audit_state {
/* Rule lists */
struct audit_watch;
+struct audit_fsnotify_mark;
struct audit_tree;
struct audit_chunk;
@@ -66,6 +60,8 @@ struct audit_cap_data {
unsigned int fE; /* effective bit of file cap */
kernel_cap_t effective; /* effective set of process */
};
+ kernel_cap_t ambient;
+ kuid_t rootid;
};
/* When fs/namei.c:getname() is called, we store the pointer in name and bump
@@ -86,7 +82,7 @@ struct audit_names {
kuid_t uid;
kgid_t gid;
dev_t rdev;
- u32 osid;
+ struct lsm_prop oprop;
struct audit_cap_data fcap;
unsigned int fcap_ver;
unsigned char type; /* record type */
@@ -103,14 +99,24 @@ struct audit_proctitle {
char *value; /* the cmdline field */
};
+/* A timestamp/serial pair to identify an event */
+struct audit_stamp {
+ struct timespec64 ctime; /* time of syscall entry */
+ unsigned int serial; /* serial number for record */
+};
+
/* The per-task audit context. */
struct audit_context {
int dummy; /* must be the first element */
- int in_syscall; /* 1 if task is in a syscall */
+ enum {
+ AUDIT_CTX_UNUSED, /* audit_context is currently unused */
+ AUDIT_CTX_SYSCALL, /* in use by syscall */
+ AUDIT_CTX_URING, /* in use by io_uring */
+ } context;
enum audit_state state, current_state;
- unsigned int serial; /* serial number for record */
+ struct audit_stamp stamp; /* event identifier */
int major; /* syscall number */
- struct timespec ctime; /* time of syscall entry */
+ int uring_op; /* uring operation */
unsigned long argv[4]; /* syscall arguments */
long return_code;/* syscall return code */
u64 prio;
@@ -132,8 +138,8 @@ struct audit_context {
struct audit_aux_data *aux_pids;
struct sockaddr_storage *sockaddr;
size_t sockaddr_len;
- /* Save things to print about task_struct */
- pid_t pid, ppid;
+ /* Save things to print about task_struct */
+ pid_t ppid;
kuid_t uid, euid, suid, fsuid;
kgid_t gid, egid, sgid, fsgid;
unsigned long personality;
@@ -143,7 +149,7 @@ struct audit_context {
kuid_t target_auid;
kuid_t target_uid;
unsigned int target_sessionid;
- u32 target_sid;
+ struct lsm_prop target_ref;
char target_comm[TASK_COMM_LEN];
struct audit_tree_refs *trees, *first_trees;
@@ -160,7 +166,7 @@ struct audit_context {
kuid_t uid;
kgid_t gid;
umode_t mode;
- u32 osid;
+ struct lsm_prop oprop;
int has_perm;
uid_t perm_uid;
gid_t perm_gid;
@@ -179,7 +185,7 @@ struct audit_context {
mqd_t mqdes;
size_t msg_len;
unsigned int msg_prio;
- struct timespec abs_timeout;
+ struct timespec64 abs_timeout;
} mq_sendrecv;
struct {
int oflag;
@@ -194,26 +200,27 @@ struct audit_context {
int fd;
int flags;
} mmap;
+ struct open_how openat2;
struct {
int argc;
} execve;
+ struct {
+ const char *name;
+ } module;
+ struct {
+ struct audit_ntp_data ntp_data;
+ struct timespec64 tk_injoffset;
+ } time;
};
int fds[2];
struct audit_proctitle proctitle;
};
-extern u32 audit_ever_enabled;
+extern bool audit_ever_enabled;
-extern void audit_copy_inode(struct audit_names *name,
- const struct dentry *dentry,
- const struct inode *inode);
-extern void audit_log_cap(struct audit_buffer *ab, char *prefix,
- kernel_cap_t *cap);
-extern void audit_log_name(struct audit_context *context,
- struct audit_names *n, struct path *path,
- int record_num, int *call_panic);
+extern void audit_log_session_info(struct audit_buffer *ab);
-extern int audit_pid;
+extern int auditd_test_task(struct task_struct *task);
#define AUDIT_INODE_BUCKETS 32
extern struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
@@ -231,9 +238,8 @@ extern int audit_comparator(const u32 left, const u32 op, const u32 right);
extern int audit_uid_comparator(kuid_t left, u32 op, kuid_t right);
extern int audit_gid_comparator(kgid_t left, u32 op, kgid_t right);
extern int parent_len(const char *path);
-extern int audit_compare_dname_path(const char *dname, const char *path, int plen);
-extern struct sk_buff *audit_make_reply(__u32 portid, int seq, int type,
- int done, int multi,
+extern int audit_compare_dname_path(const struct qstr *dname, const char *path, int plen);
+extern struct sk_buff *audit_make_reply(int seq, int type, int done, int multi,
const void *payload, int size);
extern void audit_panic(const char *message);
@@ -243,16 +249,11 @@ struct audit_netlink_list {
struct sk_buff_head q;
};
-int audit_send_list(void *);
-
-struct audit_net {
- struct sock *nlsk;
-};
-
-extern int selinux_audit_rule_update(void);
+int audit_send_list_thread(void *_dest);
extern struct mutex audit_filter_mutex;
-extern void audit_free_rule_rcu(struct rcu_head *);
+extern int audit_del_rule(struct audit_entry *entry);
+extern void audit_free_rule_rcu(struct rcu_head *head);
extern struct list_head audit_filter_list[];
extern struct audit_entry *audit_dupe_rule(struct audit_krule *old);
@@ -260,69 +261,93 @@ extern struct audit_entry *audit_dupe_rule(struct audit_krule *old);
extern void audit_log_d_path_exe(struct audit_buffer *ab,
struct mm_struct *mm);
-/* audit watch functions */
-#ifdef CONFIG_AUDIT_WATCH
+extern struct tty_struct *audit_get_tty(void);
+extern void audit_put_tty(struct tty_struct *tty);
+
+/* audit watch/mark/tree functions */
+extern unsigned int audit_serial(void);
+#ifdef CONFIG_AUDITSYSCALL
+extern int auditsc_get_stamp(struct audit_context *ctx,
+ struct audit_stamp *stamp);
+
extern void audit_put_watch(struct audit_watch *watch);
extern void audit_get_watch(struct audit_watch *watch);
-extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op);
+extern int audit_to_watch(struct audit_krule *krule, char *path, int len,
+ u32 op);
extern int audit_add_watch(struct audit_krule *krule, struct list_head **list);
extern void audit_remove_watch_rule(struct audit_krule *krule);
extern char *audit_watch_path(struct audit_watch *watch);
-extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev);
-#else
-#define audit_put_watch(w) {}
-#define audit_get_watch(w) {}
+extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino,
+ dev_t dev);
+
+extern struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule,
+ char *pathname, int len);
+extern char *audit_mark_path(struct audit_fsnotify_mark *mark);
+extern void audit_remove_mark(struct audit_fsnotify_mark *audit_mark);
+extern void audit_remove_mark_rule(struct audit_krule *krule);
+extern int audit_mark_compare(struct audit_fsnotify_mark *mark,
+ unsigned long ino, dev_t dev);
+extern int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old);
+extern int audit_exe_compare(struct task_struct *tsk,
+ struct audit_fsnotify_mark *mark);
+
+extern struct audit_chunk *audit_tree_lookup(const struct inode *inode);
+extern void audit_put_chunk(struct audit_chunk *chunk);
+extern bool audit_tree_match(struct audit_chunk *chunk,
+ struct audit_tree *tree);
+extern int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op);
+extern int audit_add_tree_rule(struct audit_krule *rule);
+extern int audit_remove_tree_rule(struct audit_krule *rule);
+extern void audit_trim_trees(void);
+extern int audit_tag_tree(char *old, char *new);
+extern const char *audit_tree_path(struct audit_tree *tree);
+extern void audit_put_tree(struct audit_tree *tree);
+extern void audit_kill_trees(struct audit_context *context);
+
+extern int audit_signal_info_syscall(struct task_struct *t);
+extern void audit_filter_inodes(struct task_struct *tsk,
+ struct audit_context *ctx);
+extern struct list_head *audit_killed_trees(void);
+#else /* CONFIG_AUDITSYSCALL */
+#define auditsc_get_stamp(c, s) 0
+#define audit_put_watch(w) do { } while (0)
+#define audit_get_watch(w) do { } while (0)
#define audit_to_watch(k, p, l, o) (-EINVAL)
#define audit_add_watch(k, l) (-EINVAL)
#define audit_remove_watch_rule(k) BUG()
#define audit_watch_path(w) ""
#define audit_watch_compare(w, i, d) 0
-#endif /* CONFIG_AUDIT_WATCH */
+#define audit_alloc_mark(k, p, l) (ERR_PTR(-EINVAL))
+#define audit_mark_path(m) ""
+#define audit_remove_mark(m) do { } while (0)
+#define audit_remove_mark_rule(k) do { } while (0)
+#define audit_mark_compare(m, i, d) 0
+#define audit_exe_compare(t, m) (-EINVAL)
+#define audit_dupe_exe(n, o) (-EINVAL)
-#ifdef CONFIG_AUDIT_TREE
-extern struct audit_chunk *audit_tree_lookup(const struct inode *);
-extern void audit_put_chunk(struct audit_chunk *);
-extern int audit_tree_match(struct audit_chunk *, struct audit_tree *);
-extern int audit_make_tree(struct audit_krule *, char *, u32);
-extern int audit_add_tree_rule(struct audit_krule *);
-extern int audit_remove_tree_rule(struct audit_krule *);
-extern void audit_trim_trees(void);
-extern int audit_tag_tree(char *old, char *new);
-extern const char *audit_tree_path(struct audit_tree *);
-extern void audit_put_tree(struct audit_tree *);
-extern void audit_kill_trees(struct list_head *);
-#else
#define audit_remove_tree_rule(rule) BUG()
#define audit_add_tree_rule(rule) -EINVAL
#define audit_make_tree(rule, str, op) -EINVAL
-#define audit_trim_trees() (void)0
-#define audit_put_tree(tree) (void)0
+#define audit_trim_trees() do { } while (0)
+#define audit_put_tree(tree) do { } while (0)
#define audit_tag_tree(old, new) -EINVAL
#define audit_tree_path(rule) "" /* never called */
-#define audit_kill_trees(list) BUG()
-#endif
-
-extern char *audit_unpack_string(void **, size_t *, size_t);
+#define audit_kill_trees(context) BUG()
-extern pid_t audit_sig_pid;
-extern kuid_t audit_sig_uid;
-extern u32 audit_sig_sid;
-
-#ifdef CONFIG_AUDITSYSCALL
-extern int __audit_signal_info(int sig, struct task_struct *t);
-static inline int audit_signal_info(int sig, struct task_struct *t)
+static inline int audit_signal_info_syscall(struct task_struct *t)
{
- if (unlikely((audit_pid && t->tgid == audit_pid) ||
- (audit_signals && !audit_dummy_context())))
- return __audit_signal_info(sig, t);
return 0;
}
-extern void audit_filter_inodes(struct task_struct *, struct audit_context *);
-extern struct list_head *audit_killed_trees(void);
-#else
-#define audit_signal_info(s,t) AUDIT_DISABLED
-#define audit_filter_inodes(t,c) AUDIT_DISABLED
-#endif
-extern struct mutex audit_cmd_mutex;
+#define audit_filter_inodes(t, c) do { } while (0)
+#endif /* CONFIG_AUDITSYSCALL */
+
+extern char *audit_unpack_string(void **bufp, size_t *remain, size_t len);
+
+extern int audit_filter(int msgtype, unsigned int listtype);
+
+extern void audit_ctl_lock(void);
+extern void audit_ctl_unlock(void);
+
+#endif
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
new file mode 100644
index 000000000000..b92805b317a2
--- /dev/null
+++ b/kernel/audit_fsnotify.c
@@ -0,0 +1,194 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* audit_fsnotify.c -- tracking inodes
+ *
+ * Copyright 2003-2009,2014-2015 Red Hat, Inc.
+ * Copyright 2005 Hewlett-Packard Development Company, L.P.
+ * Copyright 2005 IBM Corporation
+ */
+
+#include <linux/kernel.h>
+#include <linux/audit.h>
+#include <linux/kthread.h>
+#include <linux/mutex.h>
+#include <linux/fs.h>
+#include <linux/fsnotify_backend.h>
+#include <linux/namei.h>
+#include <linux/netlink.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/security.h>
+#include "audit.h"
+
+/*
+ * this mark lives on the parent directory of the inode in question.
+ * but dev, ino, and path are about the child
+ */
+struct audit_fsnotify_mark {
+ dev_t dev; /* associated superblock device */
+ unsigned long ino; /* associated inode number */
+ char *path; /* insertion path */
+ struct fsnotify_mark mark; /* fsnotify mark on the inode */
+ struct audit_krule *rule;
+};
+
+/* fsnotify handle. */
+static struct fsnotify_group *audit_fsnotify_group;
+
+/* fsnotify events we care about. */
+#define AUDIT_FS_EVENTS (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\
+ FS_MOVE_SELF)
+
+static void audit_fsnotify_mark_free(struct audit_fsnotify_mark *audit_mark)
+{
+ kfree(audit_mark->path);
+ kfree(audit_mark);
+}
+
+static void audit_fsnotify_free_mark(struct fsnotify_mark *mark)
+{
+ struct audit_fsnotify_mark *audit_mark;
+
+ audit_mark = container_of(mark, struct audit_fsnotify_mark, mark);
+ audit_fsnotify_mark_free(audit_mark);
+}
+
+char *audit_mark_path(struct audit_fsnotify_mark *mark)
+{
+ return mark->path;
+}
+
+int audit_mark_compare(struct audit_fsnotify_mark *mark, unsigned long ino, dev_t dev)
+{
+ if (mark->ino == AUDIT_INO_UNSET)
+ return 0;
+ return (mark->ino == ino) && (mark->dev == dev);
+}
+
+static void audit_update_mark(struct audit_fsnotify_mark *audit_mark,
+ const struct inode *inode)
+{
+ audit_mark->dev = inode ? inode->i_sb->s_dev : AUDIT_DEV_UNSET;
+ audit_mark->ino = inode ? inode->i_ino : AUDIT_INO_UNSET;
+}
+
+struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pathname, int len)
+{
+ struct audit_fsnotify_mark *audit_mark;
+ struct path path;
+ struct dentry *dentry;
+ int ret;
+
+ if (pathname[0] != '/' || pathname[len-1] == '/')
+ return ERR_PTR(-EINVAL);
+
+ dentry = kern_path_parent(pathname, &path);
+ if (IS_ERR(dentry))
+ return ERR_CAST(dentry); /* returning an error */
+ if (d_really_is_negative(dentry)) {
+ audit_mark = ERR_PTR(-ENOENT);
+ goto out;
+ }
+
+ audit_mark = kzalloc(sizeof(*audit_mark), GFP_KERNEL);
+ if (unlikely(!audit_mark)) {
+ audit_mark = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+
+ fsnotify_init_mark(&audit_mark->mark, audit_fsnotify_group);
+ audit_mark->mark.mask = AUDIT_FS_EVENTS;
+ audit_mark->path = pathname;
+ audit_update_mark(audit_mark, dentry->d_inode);
+ audit_mark->rule = krule;
+
+ ret = fsnotify_add_inode_mark(&audit_mark->mark, path.dentry->d_inode, 0);
+ if (ret < 0) {
+ audit_mark->path = NULL;
+ fsnotify_put_mark(&audit_mark->mark);
+ audit_mark = ERR_PTR(ret);
+ }
+out:
+ dput(dentry);
+ path_put(&path);
+ return audit_mark;
+}
+
+static void audit_mark_log_rule_change(struct audit_fsnotify_mark *audit_mark, char *op)
+{
+ struct audit_buffer *ab;
+ struct audit_krule *rule = audit_mark->rule;
+
+ if (!audit_enabled)
+ return;
+ ab = audit_log_start(audit_context(), GFP_NOFS, AUDIT_CONFIG_CHANGE);
+ if (unlikely(!ab))
+ return;
+ audit_log_session_info(ab);
+ audit_log_format(ab, " op=%s path=", op);
+ audit_log_untrustedstring(ab, audit_mark->path);
+ audit_log_key(ab, rule->filterkey);
+ audit_log_format(ab, " list=%d res=1", rule->listnr);
+ audit_log_end(ab);
+}
+
+void audit_remove_mark(struct audit_fsnotify_mark *audit_mark)
+{
+ fsnotify_destroy_mark(&audit_mark->mark, audit_fsnotify_group);
+ fsnotify_put_mark(&audit_mark->mark);
+}
+
+void audit_remove_mark_rule(struct audit_krule *krule)
+{
+ struct audit_fsnotify_mark *mark = krule->exe;
+
+ audit_remove_mark(mark);
+}
+
+static void audit_autoremove_mark_rule(struct audit_fsnotify_mark *audit_mark)
+{
+ struct audit_krule *rule = audit_mark->rule;
+ struct audit_entry *entry = container_of(rule, struct audit_entry, rule);
+
+ audit_mark_log_rule_change(audit_mark, "autoremove_rule");
+ audit_del_rule(entry);
+}
+
+/* Update mark data in audit rules based on fsnotify events. */
+static int audit_mark_handle_event(struct fsnotify_mark *inode_mark, u32 mask,
+ struct inode *inode, struct inode *dir,
+ const struct qstr *dname, u32 cookie)
+{
+ struct audit_fsnotify_mark *audit_mark;
+
+ audit_mark = container_of(inode_mark, struct audit_fsnotify_mark, mark);
+
+ if (WARN_ON_ONCE(inode_mark->group != audit_fsnotify_group))
+ return 0;
+
+ if (mask & (FS_CREATE|FS_MOVED_TO|FS_DELETE|FS_MOVED_FROM)) {
+ if (audit_compare_dname_path(dname, audit_mark->path, AUDIT_NAME_FULL))
+ return 0;
+ audit_update_mark(audit_mark, inode);
+ } else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF)) {
+ audit_autoremove_mark_rule(audit_mark);
+ }
+
+ return 0;
+}
+
+static const struct fsnotify_ops audit_mark_fsnotify_ops = {
+ .handle_inode_event = audit_mark_handle_event,
+ .free_mark = audit_fsnotify_free_mark,
+};
+
+static int __init audit_fsnotify_init(void)
+{
+ audit_fsnotify_group = fsnotify_alloc_group(&audit_mark_fsnotify_ops,
+ FSNOTIFY_GROUP_DUPS);
+ if (IS_ERR(audit_fsnotify_group)) {
+ audit_fsnotify_group = NULL;
+ audit_panic("cannot create audit fsnotify group");
+ }
+ return 0;
+}
+device_initcall(audit_fsnotify_init);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index b0f9877273fc..fda6beb041e0 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -1,15 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0
#include "audit.h"
#include <linux/fsnotify_backend.h>
#include <linux/namei.h>
#include <linux/mount.h>
#include <linux/kthread.h>
+#include <linux/refcount.h>
#include <linux/slab.h>
struct audit_tree;
struct audit_chunk;
struct audit_tree {
- atomic_t count;
+ refcount_t count;
int goner;
struct audit_chunk *root;
struct list_head chunks;
@@ -22,17 +24,22 @@ struct audit_tree {
struct audit_chunk {
struct list_head hash;
- struct fsnotify_mark mark;
+ unsigned long key;
+ struct fsnotify_mark *mark;
struct list_head trees; /* with root here */
- int dead;
int count;
atomic_long_t refs;
struct rcu_head head;
- struct node {
+ struct audit_node {
struct list_head list;
struct audit_tree *owner;
unsigned index; /* index; upper bit indicates 'will prune' */
- } owners[];
+ } owners[] __counted_by(count);
+};
+
+struct audit_tree_mark {
+ struct fsnotify_mark mark;
+ struct audit_chunk *chunk;
};
static LIST_HEAD(tree_list);
@@ -40,8 +47,15 @@ static LIST_HEAD(prune_list);
static struct task_struct *prune_thread;
/*
- * One struct chunk is attached to each inode of interest.
- * We replace struct chunk on tagging/untagging.
+ * One struct chunk is attached to each inode of interest through
+ * audit_tree_mark (fsnotify mark). We replace struct chunk on tagging /
+ * untagging, the mark is stable as long as there is chunk attached. The
+ * association between mark and chunk is protected by hash_lock and
+ * audit_tree_group->mark_mutex. Thus as long as we hold
+ * audit_tree_group->mark_mutex and check that the mark is alive by
+ * FSNOTIFY_MARK_FLAG_ATTACHED flag check, we are sure the mark points to
+ * the current chunk.
+ *
* Rules have pointer to struct audit_tree.
* Rules have struct list_head rlist forming a list of rules over
* the same tree.
@@ -60,8 +74,12 @@ static struct task_struct *prune_thread;
* tree is refcounted; one reference for "some rules on rules_list refer to
* it", one for each chunk with pointer to it.
*
- * chunk is refcounted by embedded fsnotify_mark + .refs (non-zero refcount
- * of watch contributes 1 to .refs).
+ * chunk is refcounted by embedded .refs. Mark associated with the chunk holds
+ * one chunk reference. This reference is dropped either when a mark is going
+ * to be freed (corresponding inode goes away) or when chunk attached to the
+ * mark gets replaced. This reference must be dropped using
+ * audit_mark_put_chunk() to make sure the reference is dropped only after RCU
+ * grace period as it protects RCU readers of the hash table.
*
* node.index allows to get from node.list to containing chunk.
* MSB of that sucker is stolen to mark taggings that we might have to
@@ -69,34 +87,37 @@ static struct task_struct *prune_thread;
* that makes a difference. Some.
*/
-static struct fsnotify_group *audit_tree_group;
+static struct fsnotify_group *audit_tree_group __ro_after_init;
+static struct kmem_cache *audit_tree_mark_cachep __ro_after_init;
static struct audit_tree *alloc_tree(const char *s)
{
struct audit_tree *tree;
+ size_t sz;
- tree = kmalloc(sizeof(struct audit_tree) + strlen(s) + 1, GFP_KERNEL);
+ sz = strlen(s) + 1;
+ tree = kmalloc(struct_size(tree, pathname, sz), GFP_KERNEL);
if (tree) {
- atomic_set(&tree->count, 1);
+ refcount_set(&tree->count, 1);
tree->goner = 0;
INIT_LIST_HEAD(&tree->chunks);
INIT_LIST_HEAD(&tree->rules);
INIT_LIST_HEAD(&tree->list);
INIT_LIST_HEAD(&tree->same_root);
tree->root = NULL;
- strcpy(tree->pathname, s);
+ strscpy(tree->pathname, s, sz);
}
return tree;
}
static inline void get_tree(struct audit_tree *tree)
{
- atomic_inc(&tree->count);
+ refcount_inc(&tree->count);
}
static inline void put_tree(struct audit_tree *tree)
{
- if (atomic_dec_and_test(&tree->count))
+ if (refcount_dec_and_test(&tree->count))
kfree_rcu(tree, head);
}
@@ -129,20 +150,49 @@ static void __put_chunk(struct rcu_head *rcu)
audit_put_chunk(chunk);
}
-static void audit_tree_destroy_watch(struct fsnotify_mark *entry)
+/*
+ * Drop reference to the chunk that was held by the mark. This is the reference
+ * that gets dropped after we've removed the chunk from the hash table and we
+ * use it to make sure chunk cannot be freed before RCU grace period expires.
+ */
+static void audit_mark_put_chunk(struct audit_chunk *chunk)
{
- struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark);
call_rcu(&chunk->head, __put_chunk);
}
+static inline struct audit_tree_mark *audit_mark(struct fsnotify_mark *mark)
+{
+ return container_of(mark, struct audit_tree_mark, mark);
+}
+
+static struct audit_chunk *mark_chunk(struct fsnotify_mark *mark)
+{
+ return audit_mark(mark)->chunk;
+}
+
+static void audit_tree_destroy_watch(struct fsnotify_mark *mark)
+{
+ kmem_cache_free(audit_tree_mark_cachep, audit_mark(mark));
+}
+
+static struct fsnotify_mark *alloc_mark(void)
+{
+ struct audit_tree_mark *amark;
+
+ amark = kmem_cache_zalloc(audit_tree_mark_cachep, GFP_KERNEL);
+ if (!amark)
+ return NULL;
+ fsnotify_init_mark(&amark->mark, audit_tree_group);
+ amark->mark.mask = FS_IN_IGNORED;
+ return &amark->mark;
+}
+
static struct audit_chunk *alloc_chunk(int count)
{
struct audit_chunk *chunk;
- size_t size;
int i;
- size = offsetof(struct audit_chunk, owners) + count * sizeof(struct node);
- chunk = kzalloc(size, GFP_KERNEL);
+ chunk = kzalloc(struct_size(chunk, owners, count), GFP_KERNEL);
if (!chunk)
return NULL;
@@ -154,8 +204,6 @@ static struct audit_chunk *alloc_chunk(int count)
INIT_LIST_HEAD(&chunk->owners[i].list);
chunk->owners[i].index = i;
}
- fsnotify_init_mark(&chunk->mark, audit_tree_destroy_watch);
- chunk->mark.mask = FS_IN_IGNORED;
return chunk;
}
@@ -163,33 +211,48 @@ enum {HASH_SIZE = 128};
static struct list_head chunk_hash_heads[HASH_SIZE];
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(hash_lock);
-static inline struct list_head *chunk_hash(const struct inode *inode)
+/* Function to return search key in our hash from inode. */
+static unsigned long inode_to_key(const struct inode *inode)
{
- unsigned long n = (unsigned long)inode / L1_CACHE_BYTES;
+ /* Use address pointed to by connector->obj as the key */
+ return (unsigned long)&inode->i_fsnotify_marks;
+}
+
+static inline struct list_head *chunk_hash(unsigned long key)
+{
+ unsigned long n = key / L1_CACHE_BYTES;
return chunk_hash_heads + n % HASH_SIZE;
}
-/* hash_lock & entry->lock is held by caller */
+/* hash_lock & mark->group->mark_mutex is held by caller */
static void insert_hash(struct audit_chunk *chunk)
{
- struct fsnotify_mark *entry = &chunk->mark;
struct list_head *list;
- if (!entry->inode)
- return;
- list = chunk_hash(entry->inode);
+ /*
+ * Make sure chunk is fully initialized before making it visible in the
+ * hash. Pairs with a data dependency barrier in READ_ONCE() in
+ * audit_tree_lookup().
+ */
+ smp_wmb();
+ WARN_ON_ONCE(!chunk->key);
+ list = chunk_hash(chunk->key);
list_add_rcu(&chunk->hash, list);
}
/* called under rcu_read_lock */
struct audit_chunk *audit_tree_lookup(const struct inode *inode)
{
- struct list_head *list = chunk_hash(inode);
+ unsigned long key = inode_to_key(inode);
+ struct list_head *list = chunk_hash(key);
struct audit_chunk *p;
list_for_each_entry_rcu(p, list, hash) {
- /* mark.inode may have gone NULL, but who cares? */
- if (p->mark.inode == inode) {
+ /*
+ * We use a data dependency barrier in READ_ONCE() to make sure
+ * the chunk we see is fully initialized.
+ */
+ if (READ_ONCE(p->key) == key) {
atomic_long_inc(&p->refs);
return p;
}
@@ -197,146 +260,177 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode)
return NULL;
}
-int audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree)
+bool audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree)
{
int n;
for (n = 0; n < chunk->count; n++)
if (chunk->owners[n].owner == tree)
- return 1;
- return 0;
+ return true;
+ return false;
}
/* tagging and untagging inodes with trees */
-static struct audit_chunk *find_chunk(struct node *p)
+static struct audit_chunk *find_chunk(struct audit_node *p)
{
int index = p->index & ~(1U<<31);
p -= index;
return container_of(p, struct audit_chunk, owners[0]);
}
-static void untag_chunk(struct node *p)
+static void replace_mark_chunk(struct fsnotify_mark *mark,
+ struct audit_chunk *chunk)
+{
+ struct audit_chunk *old;
+
+ assert_spin_locked(&hash_lock);
+ old = mark_chunk(mark);
+ audit_mark(mark)->chunk = chunk;
+ if (chunk)
+ chunk->mark = mark;
+ if (old)
+ old->mark = NULL;
+}
+
+static void replace_chunk(struct audit_chunk *new, struct audit_chunk *old)
{
- struct audit_chunk *chunk = find_chunk(p);
- struct fsnotify_mark *entry = &chunk->mark;
- struct audit_chunk *new = NULL;
struct audit_tree *owner;
- int size = chunk->count - 1;
int i, j;
- fsnotify_get_mark(entry);
-
- spin_unlock(&hash_lock);
+ new->key = old->key;
+ list_splice_init(&old->trees, &new->trees);
+ list_for_each_entry(owner, &new->trees, same_root)
+ owner->root = new;
+ for (i = j = 0; j < old->count; i++, j++) {
+ if (!old->owners[j].owner) {
+ i--;
+ continue;
+ }
+ owner = old->owners[j].owner;
+ new->owners[i].owner = owner;
+ new->owners[i].index = old->owners[j].index - j + i;
+ if (!owner) /* result of earlier fallback */
+ continue;
+ get_tree(owner);
+ list_replace_init(&old->owners[j].list, &new->owners[i].list);
+ }
+ replace_mark_chunk(old->mark, new);
+ /*
+ * Make sure chunk is fully initialized before making it visible in the
+ * hash. Pairs with a data dependency barrier in READ_ONCE() in
+ * audit_tree_lookup().
+ */
+ smp_wmb();
+ list_replace_rcu(&old->hash, &new->hash);
+}
- if (size)
- new = alloc_chunk(size);
+static void remove_chunk_node(struct audit_chunk *chunk, struct audit_node *p)
+{
+ struct audit_tree *owner = p->owner;
- spin_lock(&entry->lock);
- if (chunk->dead || !entry->inode) {
- spin_unlock(&entry->lock);
- if (new)
- free_chunk(new);
- goto out;
+ if (owner->root == chunk) {
+ list_del_init(&owner->same_root);
+ owner->root = NULL;
}
+ list_del_init(&p->list);
+ p->owner = NULL;
+ put_tree(owner);
+}
+
+static int chunk_count_trees(struct audit_chunk *chunk)
+{
+ int i;
+ int ret = 0;
+
+ for (i = 0; i < chunk->count; i++)
+ if (chunk->owners[i].owner)
+ ret++;
+ return ret;
+}
- owner = p->owner;
+static void untag_chunk(struct audit_chunk *chunk, struct fsnotify_mark *mark)
+{
+ struct audit_chunk *new;
+ int size;
+ fsnotify_group_lock(audit_tree_group);
+ /*
+ * mark_mutex stabilizes chunk attached to the mark so we can check
+ * whether it didn't change while we've dropped hash_lock.
+ */
+ if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) ||
+ mark_chunk(mark) != chunk)
+ goto out_mutex;
+
+ size = chunk_count_trees(chunk);
if (!size) {
- chunk->dead = 1;
spin_lock(&hash_lock);
list_del_init(&chunk->trees);
- if (owner->root == chunk)
- owner->root = NULL;
- list_del_init(&p->list);
list_del_rcu(&chunk->hash);
+ replace_mark_chunk(mark, NULL);
spin_unlock(&hash_lock);
- spin_unlock(&entry->lock);
- fsnotify_destroy_mark(entry, audit_tree_group);
- goto out;
+ fsnotify_detach_mark(mark);
+ fsnotify_group_unlock(audit_tree_group);
+ audit_mark_put_chunk(chunk);
+ fsnotify_free_mark(mark);
+ return;
}
+ new = alloc_chunk(size);
if (!new)
- goto Fallback;
+ goto out_mutex;
- fsnotify_duplicate_mark(&new->mark, entry);
- if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.inode, NULL, 1)) {
- fsnotify_put_mark(&new->mark);
- goto Fallback;
- }
-
- chunk->dead = 1;
spin_lock(&hash_lock);
- list_replace_init(&chunk->trees, &new->trees);
- if (owner->root == chunk) {
- list_del_init(&owner->same_root);
- owner->root = NULL;
- }
-
- for (i = j = 0; j <= size; i++, j++) {
- struct audit_tree *s;
- if (&chunk->owners[j] == p) {
- list_del_init(&p->list);
- i--;
- continue;
- }
- s = chunk->owners[j].owner;
- new->owners[i].owner = s;
- new->owners[i].index = chunk->owners[j].index - j + i;
- if (!s) /* result of earlier fallback */
- continue;
- get_tree(s);
- list_replace_init(&chunk->owners[j].list, &new->owners[i].list);
- }
-
- list_replace_rcu(&chunk->hash, &new->hash);
- list_for_each_entry(owner, &new->trees, same_root)
- owner->root = new;
+ /*
+ * This has to go last when updating chunk as once replace_chunk() is
+ * called, new RCU readers can see the new chunk.
+ */
+ replace_chunk(new, chunk);
spin_unlock(&hash_lock);
- spin_unlock(&entry->lock);
- fsnotify_destroy_mark(entry, audit_tree_group);
- fsnotify_put_mark(&new->mark); /* drop initial reference */
- goto out;
+ fsnotify_group_unlock(audit_tree_group);
+ audit_mark_put_chunk(chunk);
+ return;
-Fallback:
- // do the best we can
- spin_lock(&hash_lock);
- if (owner->root == chunk) {
- list_del_init(&owner->same_root);
- owner->root = NULL;
- }
- list_del_init(&p->list);
- p->owner = NULL;
- put_tree(owner);
- spin_unlock(&hash_lock);
- spin_unlock(&entry->lock);
-out:
- fsnotify_put_mark(entry);
- spin_lock(&hash_lock);
+out_mutex:
+ fsnotify_group_unlock(audit_tree_group);
}
+/* Call with group->mark_mutex held, releases it */
static int create_chunk(struct inode *inode, struct audit_tree *tree)
{
- struct fsnotify_mark *entry;
+ struct fsnotify_mark *mark;
struct audit_chunk *chunk = alloc_chunk(1);
- if (!chunk)
+
+ if (!chunk) {
+ fsnotify_group_unlock(audit_tree_group);
return -ENOMEM;
+ }
+
+ mark = alloc_mark();
+ if (!mark) {
+ fsnotify_group_unlock(audit_tree_group);
+ kfree(chunk);
+ return -ENOMEM;
+ }
- entry = &chunk->mark;
- if (fsnotify_add_mark(entry, audit_tree_group, inode, NULL, 0)) {
- fsnotify_put_mark(entry);
+ if (fsnotify_add_inode_mark_locked(mark, inode, 0)) {
+ fsnotify_group_unlock(audit_tree_group);
+ fsnotify_put_mark(mark);
+ kfree(chunk);
return -ENOSPC;
}
- spin_lock(&entry->lock);
spin_lock(&hash_lock);
if (tree->goner) {
spin_unlock(&hash_lock);
- chunk->dead = 1;
- spin_unlock(&entry->lock);
- fsnotify_destroy_mark(entry, audit_tree_group);
- fsnotify_put_mark(entry);
+ fsnotify_detach_mark(mark);
+ fsnotify_group_unlock(audit_tree_group);
+ fsnotify_free_mark(mark);
+ fsnotify_put_mark(mark);
+ kfree(chunk);
return 0;
}
+ replace_mark_chunk(mark, chunk);
chunk->owners[0].index = (1U << 31);
chunk->owners[0].owner = tree;
get_tree(tree);
@@ -345,34 +439,49 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
tree->root = chunk;
list_add(&tree->same_root, &chunk->trees);
}
+ chunk->key = inode_to_key(inode);
+ /*
+ * Inserting into the hash table has to go last as once we do that RCU
+ * readers can see the chunk.
+ */
insert_hash(chunk);
spin_unlock(&hash_lock);
- spin_unlock(&entry->lock);
- fsnotify_put_mark(entry); /* drop initial reference */
+ fsnotify_group_unlock(audit_tree_group);
+ /*
+ * Drop our initial reference. When mark we point to is getting freed,
+ * we get notification through ->freeing_mark callback and cleanup
+ * chunk pointing to this mark.
+ */
+ fsnotify_put_mark(mark);
return 0;
}
/* the first tagged inode becomes root of tree */
static int tag_chunk(struct inode *inode, struct audit_tree *tree)
{
- struct fsnotify_mark *old_entry, *chunk_entry;
- struct audit_tree *owner;
+ struct fsnotify_mark *mark;
struct audit_chunk *chunk, *old;
- struct node *p;
+ struct audit_node *p;
int n;
- old_entry = fsnotify_find_inode_mark(audit_tree_group, inode);
- if (!old_entry)
+ fsnotify_group_lock(audit_tree_group);
+ mark = fsnotify_find_inode_mark(inode, audit_tree_group);
+ if (!mark)
return create_chunk(inode, tree);
- old = container_of(old_entry, struct audit_chunk, mark);
-
+ /*
+ * Found mark is guaranteed to be attached and mark_mutex protects mark
+ * from getting detached and thus it makes sure there is chunk attached
+ * to the mark.
+ */
/* are we already there? */
spin_lock(&hash_lock);
+ old = mark_chunk(mark);
for (n = 0; n < old->count; n++) {
if (old->owners[n].owner == tree) {
spin_unlock(&hash_lock);
- fsnotify_put_mark(old_entry);
+ fsnotify_group_unlock(audit_tree_group);
+ fsnotify_put_mark(mark);
return 0;
}
}
@@ -380,94 +489,59 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
chunk = alloc_chunk(old->count + 1);
if (!chunk) {
- fsnotify_put_mark(old_entry);
+ fsnotify_group_unlock(audit_tree_group);
+ fsnotify_put_mark(mark);
return -ENOMEM;
}
- chunk_entry = &chunk->mark;
-
- spin_lock(&old_entry->lock);
- if (!old_entry->inode) {
- /* old_entry is being shot, lets just lie */
- spin_unlock(&old_entry->lock);
- fsnotify_put_mark(old_entry);
- free_chunk(chunk);
- return -ENOENT;
- }
-
- fsnotify_duplicate_mark(chunk_entry, old_entry);
- if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->inode, NULL, 1)) {
- spin_unlock(&old_entry->lock);
- fsnotify_put_mark(chunk_entry);
- fsnotify_put_mark(old_entry);
- return -ENOSPC;
- }
-
- /* even though we hold old_entry->lock, this is safe since chunk_entry->lock could NEVER have been grabbed before */
- spin_lock(&chunk_entry->lock);
spin_lock(&hash_lock);
-
- /* we now hold old_entry->lock, chunk_entry->lock, and hash_lock */
if (tree->goner) {
spin_unlock(&hash_lock);
- chunk->dead = 1;
- spin_unlock(&chunk_entry->lock);
- spin_unlock(&old_entry->lock);
-
- fsnotify_destroy_mark(chunk_entry, audit_tree_group);
-
- fsnotify_put_mark(chunk_entry);
- fsnotify_put_mark(old_entry);
+ fsnotify_group_unlock(audit_tree_group);
+ fsnotify_put_mark(mark);
+ kfree(chunk);
return 0;
}
- list_replace_init(&old->trees, &chunk->trees);
- for (n = 0, p = chunk->owners; n < old->count; n++, p++) {
- struct audit_tree *s = old->owners[n].owner;
- p->owner = s;
- p->index = old->owners[n].index;
- if (!s) /* result of fallback in untag */
- continue;
- get_tree(s);
- list_replace_init(&old->owners[n].list, &p->list);
- }
+ p = &chunk->owners[chunk->count - 1];
p->index = (chunk->count - 1) | (1U<<31);
p->owner = tree;
get_tree(tree);
list_add(&p->list, &tree->chunks);
- list_replace_rcu(&old->hash, &chunk->hash);
- list_for_each_entry(owner, &chunk->trees, same_root)
- owner->root = chunk;
- old->dead = 1;
if (!tree->root) {
tree->root = chunk;
list_add(&tree->same_root, &chunk->trees);
}
+ /*
+ * This has to go last when updating chunk as once replace_chunk() is
+ * called, new RCU readers can see the new chunk.
+ */
+ replace_chunk(chunk, old);
spin_unlock(&hash_lock);
- spin_unlock(&chunk_entry->lock);
- spin_unlock(&old_entry->lock);
- fsnotify_destroy_mark(old_entry, audit_tree_group);
- fsnotify_put_mark(chunk_entry); /* drop initial reference */
- fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */
+ fsnotify_group_unlock(audit_tree_group);
+ fsnotify_put_mark(mark); /* pair to fsnotify_find_mark */
+ audit_mark_put_chunk(old);
+
return 0;
}
-static void audit_tree_log_remove_rule(struct audit_krule *rule)
+static void audit_tree_log_remove_rule(struct audit_context *context,
+ struct audit_krule *rule)
{
struct audit_buffer *ab;
- ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
+ if (!audit_enabled)
+ return;
+ ab = audit_log_start(context, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
if (unlikely(!ab))
return;
- audit_log_format(ab, "op=");
- audit_log_string(ab, "remove_rule");
- audit_log_format(ab, " dir=");
+ audit_log_format(ab, "op=remove_rule dir=");
audit_log_untrustedstring(ab, rule->tree->pathname);
audit_log_key(ab, rule->filterkey);
audit_log_format(ab, " list=%d res=1", rule->listnr);
audit_log_end(ab);
}
-static void kill_rules(struct audit_tree *tree)
+static void kill_rules(struct audit_context *context, struct audit_tree *tree)
{
struct audit_krule *rule, *next;
struct audit_entry *entry;
@@ -478,7 +552,9 @@ static void kill_rules(struct audit_tree *tree)
list_del_init(&rule->rlist);
if (rule->tree) {
/* not a half-baked one */
- audit_tree_log_remove_rule(rule);
+ audit_tree_log_remove_rule(context, rule);
+ if (entry->rule.exe)
+ audit_remove_mark(entry->rule.exe);
rule->tree = NULL;
list_del_rcu(&entry->list);
list_del(&entry->rule.list);
@@ -488,19 +564,45 @@ static void kill_rules(struct audit_tree *tree)
}
/*
- * finish killing struct audit_tree
+ * Remove tree from chunks. If 'tagged' is set, remove tree only from tagged
+ * chunks. The function expects tagged chunks are all at the beginning of the
+ * chunks list.
*/
-static void prune_one(struct audit_tree *victim)
+static void prune_tree_chunks(struct audit_tree *victim, bool tagged)
{
spin_lock(&hash_lock);
while (!list_empty(&victim->chunks)) {
- struct node *p;
+ struct audit_node *p;
+ struct audit_chunk *chunk;
+ struct fsnotify_mark *mark;
- p = list_entry(victim->chunks.next, struct node, list);
+ p = list_first_entry(&victim->chunks, struct audit_node, list);
+ /* have we run out of marked? */
+ if (tagged && !(p->index & (1U<<31)))
+ break;
+ chunk = find_chunk(p);
+ mark = chunk->mark;
+ remove_chunk_node(chunk, p);
+ /* Racing with audit_tree_freeing_mark()? */
+ if (!mark)
+ continue;
+ fsnotify_get_mark(mark);
+ spin_unlock(&hash_lock);
- untag_chunk(p);
+ untag_chunk(chunk, mark);
+ fsnotify_put_mark(mark);
+
+ spin_lock(&hash_lock);
}
spin_unlock(&hash_lock);
+}
+
+/*
+ * finish killing struct audit_tree
+ */
+static void prune_one(struct audit_tree *victim)
+{
+ prune_tree_chunks(victim, false);
put_tree(victim);
}
@@ -516,30 +618,23 @@ static void trim_marked(struct audit_tree *tree)
}
/* reorder */
for (p = tree->chunks.next; p != &tree->chunks; p = q) {
- struct node *node = list_entry(p, struct node, list);
+ struct audit_node *node = list_entry(p, struct audit_node, list);
q = p->next;
if (node->index & (1U<<31)) {
list_del_init(p);
list_add(p, &tree->chunks);
}
}
+ spin_unlock(&hash_lock);
- while (!list_empty(&tree->chunks)) {
- struct node *node;
-
- node = list_entry(tree->chunks.next, struct node, list);
+ prune_tree_chunks(tree, true);
- /* have we run out of marked? */
- if (!(node->index & (1U<<31)))
- break;
-
- untag_chunk(node);
- }
+ spin_lock(&hash_lock);
if (!tree->root && !tree->goner) {
tree->goner = 1;
spin_unlock(&hash_lock);
mutex_lock(&audit_filter_mutex);
- kill_rules(tree);
+ kill_rules(audit_context(), tree);
list_del_init(&tree->list);
mutex_unlock(&audit_filter_mutex);
prune_one(tree);
@@ -575,11 +670,6 @@ int audit_remove_tree_rule(struct audit_krule *rule)
return 0;
}
-static int compare_root(struct vfsmount *mnt, void *arg)
-{
- return d_backing_inode(mnt->mnt_root) == arg;
-}
-
void audit_trim_trees(void)
{
struct list_head cursor;
@@ -589,37 +679,41 @@ void audit_trim_trees(void)
while (cursor.next != &tree_list) {
struct audit_tree *tree;
struct path path;
- struct vfsmount *root_mnt;
- struct node *node;
+ struct audit_node *node;
+ const struct path *paths;
+ struct path array[16];
int err;
tree = container_of(cursor.next, struct audit_tree, list);
get_tree(tree);
- list_del(&cursor);
- list_add(&cursor, &tree->list);
+ list_move(&cursor, &tree->list);
mutex_unlock(&audit_filter_mutex);
err = kern_path(tree->pathname, 0, &path);
if (err)
goto skip_it;
- root_mnt = collect_mounts(&path);
+ paths = collect_paths(&path, array, 16);
path_put(&path);
- if (IS_ERR(root_mnt))
+ if (IS_ERR(paths))
goto skip_it;
spin_lock(&hash_lock);
list_for_each_entry(node, &tree->chunks, list) {
struct audit_chunk *chunk = find_chunk(node);
/* this could be NULL if the watch is dying else where... */
- struct inode *inode = chunk->mark.inode;
node->index |= 1U<<31;
- if (iterate_mounts(compare_root, inode, root_mnt))
- node->index &= ~(1U<<31);
+ for (const struct path *p = paths; p->dentry; p++) {
+ struct inode *inode = p->dentry->d_inode;
+ if (inode_to_key(inode) == chunk->key) {
+ node->index &= ~(1U<<31);
+ break;
+ }
+ }
}
spin_unlock(&hash_lock);
trim_marked(tree);
- drop_collected_mounts(root_mnt);
+ drop_collected_paths(paths, array);
skip_it:
put_tree(tree);
mutex_lock(&audit_filter_mutex);
@@ -632,7 +726,8 @@ int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op)
{
if (pathname[0] != '/' ||
- rule->listnr != AUDIT_FILTER_EXIT ||
+ (rule->listnr != AUDIT_FILTER_EXIT &&
+ rule->listnr != AUDIT_FILTER_URING_EXIT) ||
op != Audit_equal ||
rule->inode_f || rule->watch || rule->tree)
return -EINVAL;
@@ -647,9 +742,14 @@ void audit_put_tree(struct audit_tree *tree)
put_tree(tree);
}
-static int tag_mount(struct vfsmount *mnt, void *arg)
+static int tag_mounts(const struct path *paths, struct audit_tree *tree)
{
- return tag_chunk(d_backing_inode(mnt->mnt_root), arg);
+ for (const struct path *p = paths; p->dentry; p++) {
+ int err = tag_chunk(p->dentry->d_inode, tree);
+ if (err)
+ return err;
+ }
+ return 0;
}
/*
@@ -659,12 +759,12 @@ static int tag_mount(struct vfsmount *mnt, void *arg)
static int prune_tree_thread(void *unused)
{
for (;;) {
- set_current_state(TASK_INTERRUPTIBLE);
- if (list_empty(&prune_list))
+ if (list_empty(&prune_list)) {
+ set_current_state(TASK_INTERRUPTIBLE);
schedule();
- __set_current_state(TASK_RUNNING);
+ }
- mutex_lock(&audit_cmd_mutex);
+ audit_ctl_lock();
mutex_lock(&audit_filter_mutex);
while (!list_empty(&prune_list)) {
@@ -682,7 +782,7 @@ static int prune_tree_thread(void *unused)
}
mutex_unlock(&audit_filter_mutex);
- mutex_unlock(&audit_cmd_mutex);
+ audit_ctl_unlock();
}
return 0;
}
@@ -691,16 +791,14 @@ static int audit_launch_prune(void)
{
if (prune_thread)
return 0;
- prune_thread = kthread_create(prune_tree_thread, NULL,
+ prune_thread = kthread_run(prune_tree_thread, NULL,
"audit_prune_tree");
if (IS_ERR(prune_thread)) {
pr_err("cannot start thread audit_prune_tree");
prune_thread = NULL;
return -ENOMEM;
- } else {
- wake_up_process(prune_thread);
- return 0;
}
+ return 0;
}
/* called with audit_filter_mutex */
@@ -708,7 +806,8 @@ int audit_add_tree_rule(struct audit_krule *rule)
{
struct audit_tree *seed = rule->tree, *tree;
struct path path;
- struct vfsmount *mnt;
+ struct path array[16];
+ const struct path *paths;
int err;
rule->tree = NULL;
@@ -735,19 +834,19 @@ int audit_add_tree_rule(struct audit_krule *rule)
err = kern_path(tree->pathname, 0, &path);
if (err)
goto Err;
- mnt = collect_mounts(&path);
+ paths = collect_paths(&path, array, 16);
path_put(&path);
- if (IS_ERR(mnt)) {
- err = PTR_ERR(mnt);
+ if (IS_ERR(paths)) {
+ err = PTR_ERR(paths);
goto Err;
}
get_tree(tree);
- err = iterate_mounts(tag_mount, tree, mnt);
- drop_collected_mounts(mnt);
+ err = tag_mounts(paths, tree);
+ drop_collected_paths(paths, array);
if (!err) {
- struct node *node;
+ struct audit_node *node;
spin_lock(&hash_lock);
list_for_each_entry(node, &tree->chunks, list)
node->index &= ~(1U<<31);
@@ -779,20 +878,21 @@ int audit_tag_tree(char *old, char *new)
struct list_head cursor, barrier;
int failed = 0;
struct path path1, path2;
- struct vfsmount *tagged;
+ struct path array[16];
+ const struct path *paths;
int err;
err = kern_path(new, 0, &path2);
if (err)
return err;
- tagged = collect_mounts(&path2);
+ paths = collect_paths(&path2, array, 16);
path_put(&path2);
- if (IS_ERR(tagged))
- return PTR_ERR(tagged);
+ if (IS_ERR(paths))
+ return PTR_ERR(paths);
err = kern_path(old, 0, &path1);
if (err) {
- drop_collected_mounts(tagged);
+ drop_collected_paths(paths, array);
return err;
}
@@ -806,8 +906,7 @@ int audit_tag_tree(char *old, char *new)
tree = container_of(cursor.next, struct audit_tree, list);
get_tree(tree);
- list_del(&cursor);
- list_add(&cursor, &tree->list);
+ list_move(&cursor, &tree->list);
mutex_unlock(&audit_filter_mutex);
err = kern_path(tree->pathname, 0, &path2);
@@ -822,7 +921,7 @@ int audit_tag_tree(char *old, char *new)
continue;
}
- failed = iterate_mounts(tag_mount, tree, tagged);
+ failed = tag_mounts(paths, tree);
if (failed) {
put_tree(tree);
mutex_lock(&audit_filter_mutex);
@@ -832,8 +931,7 @@ int audit_tag_tree(char *old, char *new)
mutex_lock(&audit_filter_mutex);
spin_lock(&hash_lock);
if (!tree->goner) {
- list_del(&tree->list);
- list_add(&tree->list, &tree_list);
+ list_move(&tree->list, &tree_list);
}
spin_unlock(&hash_lock);
put_tree(tree);
@@ -844,12 +942,11 @@ int audit_tag_tree(char *old, char *new)
tree = container_of(barrier.prev, struct audit_tree, list);
get_tree(tree);
- list_del(&tree->list);
- list_add(&tree->list, &barrier);
+ list_move(&tree->list, &barrier);
mutex_unlock(&audit_filter_mutex);
if (!failed) {
- struct node *node;
+ struct audit_node *node;
spin_lock(&hash_lock);
list_for_each_entry(node, &tree->chunks, list)
node->index &= ~(1U<<31);
@@ -865,7 +962,7 @@ int audit_tag_tree(char *old, char *new)
list_del(&cursor);
mutex_unlock(&audit_filter_mutex);
path_put(&path1);
- drop_collected_mounts(tagged);
+ drop_collected_paths(paths, array);
return failed;
}
@@ -879,16 +976,18 @@ static void audit_schedule_prune(void)
* ... and that one is done if evict_chunk() decides to delay until the end
* of syscall. Runs synchronously.
*/
-void audit_kill_trees(struct list_head *list)
+void audit_kill_trees(struct audit_context *context)
{
- mutex_lock(&audit_cmd_mutex);
+ struct list_head *list = &context->killed_trees;
+
+ audit_ctl_lock();
mutex_lock(&audit_filter_mutex);
while (!list_empty(list)) {
struct audit_tree *victim;
victim = list_entry(list->next, struct audit_tree, list);
- kill_rules(victim);
+ kill_rules(context, victim);
list_del_init(&victim->list);
mutex_unlock(&audit_filter_mutex);
@@ -899,7 +998,7 @@ void audit_kill_trees(struct list_head *list)
}
mutex_unlock(&audit_filter_mutex);
- mutex_unlock(&audit_cmd_mutex);
+ audit_ctl_unlock();
}
/*
@@ -913,10 +1012,6 @@ static void evict_chunk(struct audit_chunk *chunk)
int need_prune = 0;
int n;
- if (chunk->dead)
- return;
-
- chunk->dead = 1;
mutex_lock(&audit_filter_mutex);
spin_lock(&hash_lock);
while (!list_empty(&chunk->trees)) {
@@ -927,7 +1022,7 @@ static void evict_chunk(struct audit_chunk *chunk)
list_del_init(&owner->same_root);
spin_unlock(&hash_lock);
if (!postponed) {
- kill_rules(owner);
+ kill_rules(audit_context(), owner);
list_move(&owner->list, &prune_list);
need_prune = 1;
} else {
@@ -944,39 +1039,49 @@ static void evict_chunk(struct audit_chunk *chunk)
audit_schedule_prune();
}
-static int audit_tree_handle_event(struct fsnotify_group *group,
- struct inode *to_tell,
- struct fsnotify_mark *inode_mark,
- struct fsnotify_mark *vfsmount_mark,
- u32 mask, void *data, int data_type,
- const unsigned char *file_name, u32 cookie)
+static int audit_tree_handle_event(struct fsnotify_mark *mark, u32 mask,
+ struct inode *inode, struct inode *dir,
+ const struct qstr *file_name, u32 cookie)
{
return 0;
}
-static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify_group *group)
+static void audit_tree_freeing_mark(struct fsnotify_mark *mark,
+ struct fsnotify_group *group)
{
- struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark);
+ struct audit_chunk *chunk;
- evict_chunk(chunk);
+ fsnotify_group_lock(mark->group);
+ spin_lock(&hash_lock);
+ chunk = mark_chunk(mark);
+ replace_mark_chunk(mark, NULL);
+ spin_unlock(&hash_lock);
+ fsnotify_group_unlock(mark->group);
+ if (chunk) {
+ evict_chunk(chunk);
+ audit_mark_put_chunk(chunk);
+ }
/*
* We are guaranteed to have at least one reference to the mark from
* either the inode or the caller of fsnotify_destroy_mark().
*/
- BUG_ON(atomic_read(&entry->refcnt) < 1);
+ BUG_ON(refcount_read(&mark->refcnt) < 1);
}
static const struct fsnotify_ops audit_tree_ops = {
- .handle_event = audit_tree_handle_event,
+ .handle_inode_event = audit_tree_handle_event,
.freeing_mark = audit_tree_freeing_mark,
+ .free_mark = audit_tree_destroy_watch,
};
static int __init audit_tree_init(void)
{
int i;
- audit_tree_group = fsnotify_alloc_group(&audit_tree_ops);
+ audit_tree_mark_cachep = KMEM_CACHE(audit_tree_mark, SLAB_PANIC);
+
+ audit_tree_group = fsnotify_alloc_group(&audit_tree_ops, 0);
if (IS_ERR(audit_tree_group))
audit_panic("cannot initialize fsnotify group for rectree watches");
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 6e30024d9aac..a700e3c8925f 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -1,24 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* audit_watch.c -- watching inodes
*
* Copyright 2003-2009 Red Hat, Inc.
* Copyright 2005 Hewlett-Packard Development Company, L.P.
* Copyright 2005 IBM Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
+#include <linux/file.h>
#include <linux/kernel.h>
#include <linux/audit.h>
#include <linux/kthread.h>
@@ -27,6 +15,7 @@
#include <linux/fsnotify_backend.h>
#include <linux/namei.h>
#include <linux/netlink.h>
+#include <linux/refcount.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/security.h>
@@ -45,7 +34,7 @@
*/
struct audit_watch {
- atomic_t count; /* reference count */
+ refcount_t count; /* reference count */
dev_t dev; /* associated superblock device */
char *path; /* insertion path */
unsigned long ino; /* associated inode number */
@@ -64,7 +53,7 @@ static struct fsnotify_group *audit_watch_group;
/* fsnotify events we care about. */
#define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\
- FS_MOVE_SELF | FS_EVENT_ON_CHILD)
+ FS_MOVE_SELF | FS_UNMOUNT)
static void audit_free_parent(struct audit_parent *parent)
{
@@ -101,7 +90,7 @@ static inline struct audit_parent *audit_find_parent(struct inode *inode)
struct audit_parent *parent = NULL;
struct fsnotify_mark *entry;
- entry = fsnotify_find_inode_mark(audit_watch_group, inode);
+ entry = fsnotify_find_inode_mark(inode, audit_watch_group);
if (entry)
parent = container_of(entry, struct audit_parent, mark);
@@ -110,12 +99,12 @@ static inline struct audit_parent *audit_find_parent(struct inode *inode)
void audit_get_watch(struct audit_watch *watch)
{
- atomic_inc(&watch->count);
+ refcount_inc(&watch->count);
}
void audit_put_watch(struct audit_watch *watch)
{
- if (atomic_dec_and_test(&watch->count)) {
+ if (refcount_dec_and_test(&watch->count)) {
WARN_ON(watch->parent);
WARN_ON(!list_empty(&watch->rules));
kfree(watch->path);
@@ -138,13 +127,13 @@ char *audit_watch_path(struct audit_watch *watch)
int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev)
{
- return (watch->ino != (unsigned long)-1) &&
+ return (watch->ino != AUDIT_INO_UNSET) &&
(watch->ino == ino) &&
(watch->dev == dev);
}
/* Initialize a parent watch entry. */
-static struct audit_parent *audit_init_parent(struct path *path)
+static struct audit_parent *audit_init_parent(const struct path *path)
{
struct inode *inode = d_backing_inode(path->dentry);
struct audit_parent *parent;
@@ -156,9 +145,9 @@ static struct audit_parent *audit_init_parent(struct path *path)
INIT_LIST_HEAD(&parent->watches);
- fsnotify_init_mark(&parent->mark, audit_watch_free_mark);
+ fsnotify_init_mark(&parent->mark, audit_watch_group);
parent->mark.mask = AUDIT_FS_WATCH;
- ret = fsnotify_add_mark(&parent->mark, audit_watch_group, inode, NULL, 0);
+ ret = fsnotify_add_inode_mark(&parent->mark, inode, 0);
if (ret < 0) {
audit_free_parent(parent);
return ERR_PTR(ret);
@@ -177,15 +166,15 @@ static struct audit_watch *audit_init_watch(char *path)
return ERR_PTR(-ENOMEM);
INIT_LIST_HEAD(&watch->rules);
- atomic_set(&watch->count, 1);
+ refcount_set(&watch->count, 1);
watch->path = path;
- watch->dev = (dev_t)-1;
- watch->ino = (unsigned long)-1;
+ watch->dev = AUDIT_DEV_UNSET;
+ watch->ino = AUDIT_INO_UNSET;
return watch;
}
-/* Translate a watch string to kernel respresentation. */
+/* Translate a watch string to kernel representation. */
int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op)
{
struct audit_watch *watch;
@@ -194,7 +183,8 @@ int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op)
return -EOPNOTSUPP;
if (path[0] != '/' || path[len-1] == '/' ||
- krule->listnr != AUDIT_FILTER_EXIT ||
+ (krule->listnr != AUDIT_FILTER_EXIT &&
+ krule->listnr != AUDIT_FILTER_URING_EXIT) ||
op != Audit_equal ||
krule->inode_f || krule->watch || krule->tree)
return -EINVAL;
@@ -203,7 +193,6 @@ int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op)
if (IS_ERR(watch))
return PTR_ERR(watch);
- audit_get_watch(watch);
krule->watch = watch;
return 0;
@@ -237,26 +226,24 @@ out:
static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watch *w, char *op)
{
- if (audit_enabled) {
- struct audit_buffer *ab;
- ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
- if (unlikely(!ab))
- return;
- audit_log_format(ab, "auid=%u ses=%u op=",
- from_kuid(&init_user_ns, audit_get_loginuid(current)),
- audit_get_sessionid(current));
- audit_log_string(ab, op);
- audit_log_format(ab, " path=");
- audit_log_untrustedstring(ab, w->path);
- audit_log_key(ab, r->filterkey);
- audit_log_format(ab, " list=%d res=1", r->listnr);
- audit_log_end(ab);
- }
+ struct audit_buffer *ab;
+
+ if (!audit_enabled)
+ return;
+ ab = audit_log_start(audit_context(), GFP_NOFS, AUDIT_CONFIG_CHANGE);
+ if (!ab)
+ return;
+ audit_log_session_info(ab);
+ audit_log_format(ab, "op=%s path=", op);
+ audit_log_untrustedstring(ab, w->path);
+ audit_log_key(ab, r->filterkey);
+ audit_log_format(ab, " list=%d res=1", r->listnr);
+ audit_log_end(ab);
}
/* Update inode info in audit rules based on filesystem event. */
static void audit_update_watch(struct audit_parent *parent,
- const char *dname, dev_t dev,
+ const struct qstr *dname, dev_t dev,
unsigned long ino, unsigned invalidating)
{
struct audit_watch *owatch, *nwatch, *nextw;
@@ -274,7 +261,7 @@ static void audit_update_watch(struct audit_parent *parent,
/* If the update involves invalidating rules, do the inode-based
* filtering now, so we don't omit records. */
if (invalidating && !audit_dummy_context())
- audit_filter_inodes(current, current->audit_context);
+ audit_filter_inodes(current, audit_context());
/* updating ino will likely change which audit_hash_list we
* are on so we need a new watch for the new list */
@@ -313,8 +300,8 @@ static void audit_update_watch(struct audit_parent *parent,
list_replace(&oentry->rule.list,
&nentry->rule.list);
}
-
- audit_watch_log_rule_change(r, owatch, "updated_rules");
+ if (oentry->rule.exe)
+ audit_remove_mark(oentry->rule.exe);
call_rcu(&oentry->rcu, audit_free_rule_rcu);
}
@@ -343,6 +330,8 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
e = container_of(r, struct audit_entry, rule);
audit_watch_log_rule_change(r, w, "remove_rule");
+ if (e->rule.exe)
+ audit_remove_mark(e->rule.exe);
list_del(&r->rlist);
list_del(&r->list);
list_del_rcu(&e->list);
@@ -358,15 +347,18 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
/* Get path information necessary for adding watches. */
static int audit_get_nd(struct audit_watch *watch, struct path *parent)
{
- struct dentry *d = kern_path_locked(watch->path, parent);
+ struct dentry *d;
+
+ d = kern_path_parent(watch->path, parent);
if (IS_ERR(d))
return PTR_ERR(d);
- mutex_unlock(&d_backing_inode(parent->dentry)->i_mutex);
+
if (d_is_positive(d)) {
/* update watch filter fields */
- watch->dev = d_backing_inode(d)->i_sb->s_dev;
+ watch->dev = d->d_sb->s_dev;
watch->ino = d_backing_inode(d)->i_ino;
}
+
dput(d);
return 0;
}
@@ -387,19 +379,20 @@ static void audit_add_to_parent(struct audit_krule *krule,
watch_found = 1;
- /* put krule's and initial refs to temporary watch */
- audit_put_watch(watch);
+ /* put krule's ref to temporary watch */
audit_put_watch(watch);
audit_get_watch(w);
krule->watch = watch = w;
+
+ audit_put_parent(parent);
break;
}
if (!watch_found) {
- audit_get_parent(parent);
watch->parent = parent;
+ audit_get_watch(watch);
list_add(&watch->wlist, &parent->watches);
}
list_add(&krule->rlist, &watch->rules);
@@ -414,6 +407,13 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list)
struct path parent_path;
int h, ret = 0;
+ /*
+ * When we will be calling audit_add_to_parent, krule->watch might have
+ * been updated and watch might have been freed.
+ * So we need to keep a reference of watch.
+ */
+ audit_get_watch(watch);
+
mutex_unlock(&audit_filter_mutex);
/* Avoid calling path_lookup under audit_filter_mutex. */
@@ -422,8 +422,10 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list)
/* caller expects mutex locked */
mutex_lock(&audit_filter_mutex);
- if (ret)
+ if (ret) {
+ audit_put_watch(watch);
return ret;
+ }
/* either find an old parent or attach a new one */
parent = audit_find_parent(d_backing_inode(parent_path.dentry));
@@ -437,13 +439,11 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list)
audit_add_to_parent(krule, parent);
- /* match get in audit_find_parent or audit_init_parent */
- audit_put_parent(parent);
-
h = audit_hash_ino((u32)watch->ino);
*list = &audit_inode_hash[h];
error:
path_put(&parent_path);
+ audit_put_watch(watch);
return ret;
}
@@ -455,48 +455,34 @@ void audit_remove_watch_rule(struct audit_krule *krule)
list_del(&krule->rlist);
if (list_empty(&watch->rules)) {
+ /*
+ * audit_remove_watch() drops our reference to 'parent' which
+ * can get freed. Grab our own reference to be safe.
+ */
+ audit_get_parent(parent);
audit_remove_watch(watch);
-
- if (list_empty(&parent->watches)) {
- audit_get_parent(parent);
+ if (list_empty(&parent->watches))
fsnotify_destroy_mark(&parent->mark, audit_watch_group);
- audit_put_parent(parent);
- }
+ audit_put_parent(parent);
}
}
/* Update watch data in audit rules based on fsnotify events. */
-static int audit_watch_handle_event(struct fsnotify_group *group,
- struct inode *to_tell,
- struct fsnotify_mark *inode_mark,
- struct fsnotify_mark *vfsmount_mark,
- u32 mask, void *data, int data_type,
- const unsigned char *dname, u32 cookie)
+static int audit_watch_handle_event(struct fsnotify_mark *inode_mark, u32 mask,
+ struct inode *inode, struct inode *dir,
+ const struct qstr *dname, u32 cookie)
{
- struct inode *inode;
struct audit_parent *parent;
parent = container_of(inode_mark, struct audit_parent, mark);
- BUG_ON(group != audit_watch_group);
-
- switch (data_type) {
- case (FSNOTIFY_EVENT_PATH):
- inode = d_backing_inode(((struct path *)data)->dentry);
- break;
- case (FSNOTIFY_EVENT_INODE):
- inode = (struct inode *)data;
- break;
- default:
- BUG();
- inode = NULL;
- break;
- };
+ if (WARN_ON_ONCE(inode_mark->group != audit_watch_group))
+ return 0;
if (mask & (FS_CREATE|FS_MOVED_TO) && inode)
audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0);
else if (mask & (FS_DELETE|FS_MOVED_FROM))
- audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1);
+ audit_update_watch(parent, dname, AUDIT_DEV_UNSET, AUDIT_INO_UNSET, 1);
else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF))
audit_remove_parent_watches(parent);
@@ -504,12 +490,13 @@ static int audit_watch_handle_event(struct fsnotify_group *group,
}
static const struct fsnotify_ops audit_watch_fsnotify_ops = {
- .handle_event = audit_watch_handle_event,
+ .handle_inode_event = audit_watch_handle_event,
+ .free_mark = audit_watch_free_mark,
};
static int __init audit_watch_init(void)
{
- audit_watch_group = fsnotify_alloc_group(&audit_watch_fsnotify_ops);
+ audit_watch_group = fsnotify_alloc_group(&audit_watch_fsnotify_ops, 0);
if (IS_ERR(audit_watch_group)) {
audit_watch_group = NULL;
audit_panic("cannot create audit fsnotify group");
@@ -517,3 +504,44 @@ static int __init audit_watch_init(void)
return 0;
}
device_initcall(audit_watch_init);
+
+int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old)
+{
+ struct audit_fsnotify_mark *audit_mark;
+ char *pathname;
+
+ pathname = kstrdup(audit_mark_path(old->exe), GFP_KERNEL);
+ if (!pathname)
+ return -ENOMEM;
+
+ audit_mark = audit_alloc_mark(new, pathname, strlen(pathname));
+ if (IS_ERR(audit_mark)) {
+ kfree(pathname);
+ return PTR_ERR(audit_mark);
+ }
+ new->exe = audit_mark;
+
+ return 0;
+}
+
+int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark)
+{
+ struct file *exe_file;
+ unsigned long ino;
+ dev_t dev;
+
+ /* only do exe filtering if we are recording @current events/records */
+ if (tsk != current)
+ return 0;
+
+ if (!current->mm)
+ return 0;
+ exe_file = get_mm_exe_file(current->mm);
+ if (!exe_file)
+ return 0;
+ ino = file_inode(exe_file)->i_ino;
+ dev = file_inode(exe_file)->i_sb->s_dev;
+ fput(exe_file);
+
+ return audit_mark_compare(mark, ino, dev);
+}
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 72e1660a79a3..6a86c0683b67 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1,22 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* auditfilter.c -- filtering of audit events
*
* Copyright 2003-2004 Red Hat, Inc.
* Copyright 2005 Hewlett-Packard Development Company, L.P.
* Copyright 2005 IBM Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -39,13 +26,13 @@
* Locking model:
*
* audit_filter_mutex:
- * Synchronizes writes and blocking reads of audit's filterlist
- * data. Rcu is used to traverse the filterlist and access
- * contents of structs audit_entry, audit_watch and opaque
- * LSM rules during filtering. If modified, these structures
- * must be copied and replace their counterparts in the filterlist.
- * An audit_parent struct is not accessed during filtering, so may
- * be written directly provided audit_filter_mutex is held.
+ * Synchronizes writes and blocking reads of audit's filterlist
+ * data. Rcu is used to traverse the filterlist and access
+ * contents of structs audit_entry, audit_watch and opaque
+ * LSM rules during filtering. If modified, these structures
+ * must be copied and replace their counterparts in the filterlist.
+ * An audit_parent struct is not accessed during filtering, so may
+ * be written directly provided audit_filter_mutex is held.
*/
/* Audit filter lists, defined in <linux/audit.h> */
@@ -56,7 +43,9 @@ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
LIST_HEAD_INIT(audit_filter_list[3]),
LIST_HEAD_INIT(audit_filter_list[4]),
LIST_HEAD_INIT(audit_filter_list[5]),
-#if AUDIT_NR_FILTERS != 6
+ LIST_HEAD_INIT(audit_filter_list[6]),
+ LIST_HEAD_INIT(audit_filter_list[7]),
+#if AUDIT_NR_FILTERS != 8
#error Fix audit_filter_list initialiser
#endif
};
@@ -67,6 +56,8 @@ static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = {
LIST_HEAD_INIT(audit_rules_list[3]),
LIST_HEAD_INIT(audit_rules_list[4]),
LIST_HEAD_INIT(audit_rules_list[5]),
+ LIST_HEAD_INIT(audit_rules_list[6]),
+ LIST_HEAD_INIT(audit_rules_list[7]),
};
DEFINE_MUTEX(audit_filter_mutex);
@@ -158,11 +149,12 @@ char *audit_unpack_string(void **bufp, size_t *remain, size_t len)
return str;
}
-/* Translate an inode field to kernel respresentation. */
+/* Translate an inode field to kernel representation. */
static inline int audit_to_inode(struct audit_krule *krule,
struct audit_field *f)
{
- if (krule->listnr != AUDIT_FILTER_EXIT ||
+ if ((krule->listnr != AUDIT_FILTER_EXIT &&
+ krule->listnr != AUDIT_FILTER_URING_EXIT) ||
krule->inode_f || krule->watch || krule->tree ||
(f->op != Audit_equal && f->op != Audit_not_equal))
return -EINVAL;
@@ -229,7 +221,7 @@ static int audit_match_signal(struct audit_entry *entry)
entry->rule.mask));
}
- switch(audit_classify_arch(arch->val)) {
+ switch (audit_classify_arch(arch->val)) {
case 0: /* native */
return (audit_match_class_bits(AUDIT_CLASS_SIGNAL,
entry->rule.mask));
@@ -251,18 +243,20 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule_data *
err = -EINVAL;
listnr = rule->flags & ~AUDIT_FILTER_PREPEND;
- switch(listnr) {
+ switch (listnr) {
default:
goto exit_err;
#ifdef CONFIG_AUDITSYSCALL
case AUDIT_FILTER_ENTRY:
- if (rule->action == AUDIT_ALWAYS)
- goto exit_err;
+ pr_err("AUDIT_FILTER_ENTRY is deprecated\n");
+ goto exit_err;
case AUDIT_FILTER_EXIT:
+ case AUDIT_FILTER_URING_EXIT:
case AUDIT_FILTER_TASK:
#endif
case AUDIT_FILTER_USER:
- case AUDIT_FILTER_TYPE:
+ case AUDIT_FILTER_EXCLUDE:
+ case AUDIT_FILTER_FS:
;
}
if (unlikely(rule->action == AUDIT_POSSIBLE)) {
@@ -332,17 +326,43 @@ static u32 audit_to_op(u32 op)
/* check if an audit field is valid */
static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
{
- switch(f->type) {
+ switch (f->type) {
case AUDIT_MSGTYPE:
- if (entry->rule.listnr != AUDIT_FILTER_TYPE &&
+ if (entry->rule.listnr != AUDIT_FILTER_EXCLUDE &&
entry->rule.listnr != AUDIT_FILTER_USER)
return -EINVAL;
break;
- };
+ case AUDIT_FSTYPE:
+ if (entry->rule.listnr != AUDIT_FILTER_FS)
+ return -EINVAL;
+ break;
+ case AUDIT_PERM:
+ if (entry->rule.listnr == AUDIT_FILTER_URING_EXIT)
+ return -EINVAL;
+ break;
+ }
- switch(f->type) {
- default:
- return -EINVAL;
+ switch (entry->rule.listnr) {
+ case AUDIT_FILTER_FS:
+ switch (f->type) {
+ case AUDIT_FSTYPE:
+ case AUDIT_FILTERKEY:
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+
+ /* Check for valid field type and op */
+ switch (f->type) {
+ case AUDIT_ARG0:
+ case AUDIT_ARG1:
+ case AUDIT_ARG2:
+ case AUDIT_ARG3:
+ case AUDIT_PERS: /* <uapi/linux/personality.h> */
+ case AUDIT_DEVMINOR:
+ /* all ops are valid */
+ break;
case AUDIT_UID:
case AUDIT_EUID:
case AUDIT_SUID:
@@ -355,44 +375,53 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
case AUDIT_FSGID:
case AUDIT_OBJ_GID:
case AUDIT_PID:
- case AUDIT_PERS:
case AUDIT_MSGTYPE:
case AUDIT_PPID:
case AUDIT_DEVMAJOR:
- case AUDIT_DEVMINOR:
case AUDIT_EXIT:
case AUDIT_SUCCESS:
case AUDIT_INODE:
+ case AUDIT_SESSIONID:
+ case AUDIT_SUBJ_SEN:
+ case AUDIT_SUBJ_CLR:
+ case AUDIT_OBJ_LEV_LOW:
+ case AUDIT_OBJ_LEV_HIGH:
+ case AUDIT_SADDR_FAM:
/* bit ops are only useful on syscall args */
if (f->op == Audit_bitmask || f->op == Audit_bittest)
return -EINVAL;
break;
- case AUDIT_ARG0:
- case AUDIT_ARG1:
- case AUDIT_ARG2:
- case AUDIT_ARG3:
case AUDIT_SUBJ_USER:
case AUDIT_SUBJ_ROLE:
case AUDIT_SUBJ_TYPE:
- case AUDIT_SUBJ_SEN:
- case AUDIT_SUBJ_CLR:
case AUDIT_OBJ_USER:
case AUDIT_OBJ_ROLE:
case AUDIT_OBJ_TYPE:
- case AUDIT_OBJ_LEV_LOW:
- case AUDIT_OBJ_LEV_HIGH:
case AUDIT_WATCH:
case AUDIT_DIR:
case AUDIT_FILTERKEY:
- break;
case AUDIT_LOGINUID_SET:
- if ((f->val != 0) && (f->val != 1))
- return -EINVAL;
- /* FALL THROUGH */
case AUDIT_ARCH:
+ case AUDIT_FSTYPE:
+ case AUDIT_PERM:
+ case AUDIT_FILETYPE:
+ case AUDIT_FIELD_COMPARE:
+ case AUDIT_EXE:
+ /* only equal and not equal valid ops */
if (f->op != Audit_not_equal && f->op != Audit_equal)
return -EINVAL;
break;
+ default:
+ /* field not recognized */
+ return -EINVAL;
+ }
+
+ /* Check for select valid field values */
+ switch (f->type) {
+ case AUDIT_LOGINUID_SET:
+ if ((f->val != 0) && (f->val != 1))
+ return -EINVAL;
+ break;
case AUDIT_PERM:
if (f->val & ~15)
return -EINVAL;
@@ -405,11 +434,18 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
if (f->val > AUDIT_MAX_FIELD_COMPARE)
return -EINVAL;
break;
- };
+ case AUDIT_SADDR_FAM:
+ if (f->val >= AF_MAX)
+ return -EINVAL;
+ break;
+ default:
+ break;
+ }
+
return 0;
}
-/* Translate struct audit_rule_data to kernel's rule respresentation. */
+/* Translate struct audit_rule_data to kernel's rule representation. */
static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
size_t datasz)
{
@@ -419,6 +455,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
size_t remain = datasz - sizeof(struct audit_rule_data);
int i;
char *str;
+ struct audit_fsnotify_mark *audit_mark;
entry = audit_to_entry_common(data);
if (IS_ERR(entry))
@@ -427,6 +464,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
bufp = data->buf;
for (i = 0; i < data->field_count; i++) {
struct audit_field *f = &entry->rule.fields[i];
+ u32 f_val;
err = -EINVAL;
@@ -435,12 +473,12 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
goto exit_free;
f->type = data->fields[i];
- f->val = data->values[i];
+ f_val = data->values[i];
/* Support legacy tests for a valid loginuid */
- if ((f->type == AUDIT_LOGINUID) && (f->val == AUDIT_UID_UNSET)) {
+ if ((f->type == AUDIT_LOGINUID) && (f_val == AUDIT_UID_UNSET)) {
f->type = AUDIT_LOGINUID_SET;
- f->val = 0;
+ f_val = 0;
entry->rule.pflags |= AUDIT_LOGINUID_LEGACY;
}
@@ -456,7 +494,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
case AUDIT_SUID:
case AUDIT_FSUID:
case AUDIT_OBJ_UID:
- f->uid = make_kuid(current_user_ns(), f->val);
+ f->uid = make_kuid(current_user_ns(), f_val);
if (!uid_valid(f->uid))
goto exit_free;
break;
@@ -465,11 +503,12 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
case AUDIT_SGID:
case AUDIT_FSGID:
case AUDIT_OBJ_GID:
- f->gid = make_kgid(current_user_ns(), f->val);
+ f->gid = make_kgid(current_user_ns(), f_val);
if (!gid_valid(f->gid))
goto exit_free;
break;
case AUDIT_ARCH:
+ f->val = f_val;
entry->rule.arch_f = f;
break;
case AUDIT_SUBJ_USER:
@@ -482,63 +521,87 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
case AUDIT_OBJ_TYPE:
case AUDIT_OBJ_LEV_LOW:
case AUDIT_OBJ_LEV_HIGH:
- str = audit_unpack_string(&bufp, &remain, f->val);
- if (IS_ERR(str))
+ str = audit_unpack_string(&bufp, &remain, f_val);
+ if (IS_ERR(str)) {
+ err = PTR_ERR(str);
goto exit_free;
- entry->rule.buflen += f->val;
-
+ }
+ entry->rule.buflen += f_val;
+ f->lsm_str = str;
err = security_audit_rule_init(f->type, f->op, str,
- (void **)&f->lsm_rule);
+ (void **)&f->lsm_rule,
+ GFP_KERNEL);
/* Keep currently invalid fields around in case they
* become valid after a policy reload. */
if (err == -EINVAL) {
pr_warn("audit rule for LSM \'%s\' is invalid\n",
str);
err = 0;
- }
- if (err) {
- kfree(str);
+ } else if (err)
goto exit_free;
- } else
- f->lsm_str = str;
break;
case AUDIT_WATCH:
- str = audit_unpack_string(&bufp, &remain, f->val);
- if (IS_ERR(str))
+ str = audit_unpack_string(&bufp, &remain, f_val);
+ if (IS_ERR(str)) {
+ err = PTR_ERR(str);
goto exit_free;
- entry->rule.buflen += f->val;
-
- err = audit_to_watch(&entry->rule, str, f->val, f->op);
+ }
+ err = audit_to_watch(&entry->rule, str, f_val, f->op);
if (err) {
kfree(str);
goto exit_free;
}
+ entry->rule.buflen += f_val;
break;
case AUDIT_DIR:
- str = audit_unpack_string(&bufp, &remain, f->val);
- if (IS_ERR(str))
+ str = audit_unpack_string(&bufp, &remain, f_val);
+ if (IS_ERR(str)) {
+ err = PTR_ERR(str);
goto exit_free;
- entry->rule.buflen += f->val;
-
+ }
err = audit_make_tree(&entry->rule, str, f->op);
kfree(str);
if (err)
goto exit_free;
+ entry->rule.buflen += f_val;
break;
case AUDIT_INODE:
+ f->val = f_val;
err = audit_to_inode(&entry->rule, f);
if (err)
goto exit_free;
break;
case AUDIT_FILTERKEY:
- if (entry->rule.filterkey || f->val > AUDIT_MAX_KEY_LEN)
+ if (entry->rule.filterkey || f_val > AUDIT_MAX_KEY_LEN)
goto exit_free;
- str = audit_unpack_string(&bufp, &remain, f->val);
- if (IS_ERR(str))
+ str = audit_unpack_string(&bufp, &remain, f_val);
+ if (IS_ERR(str)) {
+ err = PTR_ERR(str);
goto exit_free;
- entry->rule.buflen += f->val;
+ }
+ entry->rule.buflen += f_val;
entry->rule.filterkey = str;
break;
+ case AUDIT_EXE:
+ if (entry->rule.exe || f_val > PATH_MAX)
+ goto exit_free;
+ str = audit_unpack_string(&bufp, &remain, f_val);
+ if (IS_ERR(str)) {
+ err = PTR_ERR(str);
+ goto exit_free;
+ }
+ audit_mark = audit_alloc_mark(&entry->rule, str, f_val);
+ if (IS_ERR(audit_mark)) {
+ kfree(str);
+ err = PTR_ERR(audit_mark);
+ goto exit_free;
+ }
+ entry->rule.buflen += f_val;
+ entry->rule.exe = audit_mark;
+ break;
+ default:
+ f->val = f_val;
+ break;
}
}
@@ -549,10 +612,10 @@ exit_nofree:
return entry;
exit_free:
- if (entry->rule.watch)
- audit_put_watch(entry->rule.watch); /* matches initial get */
if (entry->rule.tree)
audit_put_tree(entry->rule.tree); /* that's the temporary one */
+ if (entry->rule.exe)
+ audit_remove_mark(entry->rule.exe); /* that's the template one */
audit_free_rule(entry);
return ERR_PTR(err);
}
@@ -568,17 +631,16 @@ static inline size_t audit_pack_string(void **bufp, const char *str)
return len;
}
-/* Translate kernel rule respresentation to struct audit_rule_data. */
+/* Translate kernel rule representation to struct audit_rule_data. */
static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
{
struct audit_rule_data *data;
void *bufp;
int i;
- data = kmalloc(sizeof(*data) + krule->buflen, GFP_KERNEL);
+ data = kzalloc(struct_size(data, buf, krule->buflen), GFP_KERNEL);
if (unlikely(!data))
return NULL;
- memset(data, 0, sizeof(*data));
data->flags = krule->flags | krule->listnr;
data->action = krule->action;
@@ -589,7 +651,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
data->fields[i] = f->type;
data->fieldflags[i] = audit_ops[f->op];
- switch(f->type) {
+ switch (f->type) {
case AUDIT_SUBJ_USER:
case AUDIT_SUBJ_ROLE:
case AUDIT_SUBJ_TYPE:
@@ -617,18 +679,23 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
data->buflen += data->values[i] =
audit_pack_string(&bufp, krule->filterkey);
break;
+ case AUDIT_EXE:
+ data->buflen += data->values[i] =
+ audit_pack_string(&bufp, audit_mark_path(krule->exe));
+ break;
case AUDIT_LOGINUID_SET:
if (krule->pflags & AUDIT_LOGINUID_LEGACY && !f->val) {
data->fields[i] = AUDIT_LOGINUID;
data->values[i] = AUDIT_UID_UNSET;
break;
}
- /* fallthrough if set */
+ fallthrough; /* if set */
default:
data->values[i] = f->val;
}
}
- for (i = 0; i < AUDIT_BITMASK_SIZE; i++) data->mask[i] = krule->mask[i];
+ for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
+ data->mask[i] = krule->mask[i];
return data;
}
@@ -651,7 +718,7 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
a->fields[i].op != b->fields[i].op)
return 1;
- switch(a->fields[i].type) {
+ switch (a->fields[i].type) {
case AUDIT_SUBJ_USER:
case AUDIT_SUBJ_ROLE:
case AUDIT_SUBJ_TYPE:
@@ -680,6 +747,12 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
if (strcmp(a->filterkey, b->filterkey))
return 1;
break;
+ case AUDIT_EXE:
+ /* both paths exist based on above type compare */
+ if (strcmp(audit_mark_path(a->exe),
+ audit_mark_path(b->exe)))
+ return 1;
+ break;
case AUDIT_UID:
case AUDIT_EUID:
case AUDIT_SUID:
@@ -715,7 +788,7 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
static inline int audit_dupe_lsm_field(struct audit_field *df,
struct audit_field *sf)
{
- int ret = 0;
+ int ret;
char *lsm_str;
/* our own copy of lsm_str */
@@ -726,7 +799,7 @@ static inline int audit_dupe_lsm_field(struct audit_field *df,
/* our own (refreshed) copy of lsm_rule */
ret = security_audit_rule_init(df->type, df->op, df->lsm_str,
- (void **)&df->lsm_rule);
+ (void **)&df->lsm_rule, GFP_KERNEL);
/* Keep currently invalid fields around in case they
* become valid after a policy reload. */
if (ret == -EINVAL) {
@@ -801,8 +874,14 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old)
err = -ENOMEM;
else
new->filterkey = fk;
+ break;
+ case AUDIT_EXE:
+ err = audit_dupe_exe(new, old);
+ break;
}
if (err) {
+ if (new->exe)
+ audit_remove_mark(new->exe);
audit_free_rule(entry);
return ERR_PTR(err);
}
@@ -863,14 +942,17 @@ static inline int audit_add_rule(struct audit_entry *entry)
struct audit_watch *watch = entry->rule.watch;
struct audit_tree *tree = entry->rule.tree;
struct list_head *list;
- int err;
+ int err = 0;
#ifdef CONFIG_AUDITSYSCALL
int dont_count = 0;
- /* If either of these, don't count towards total */
- if (entry->rule.listnr == AUDIT_FILTER_USER ||
- entry->rule.listnr == AUDIT_FILTER_TYPE)
+ /* If any of these, don't count towards total */
+ switch (entry->rule.listnr) {
+ case AUDIT_FILTER_USER:
+ case AUDIT_FILTER_EXCLUDE:
+ case AUDIT_FILTER_FS:
dont_count = 1;
+ }
#endif
mutex_lock(&audit_filter_mutex);
@@ -881,7 +963,7 @@ static inline int audit_add_rule(struct audit_entry *entry)
/* normally audit_add_tree_rule() will free it on failure */
if (tree)
audit_put_tree(tree);
- goto error;
+ return err;
}
if (watch) {
@@ -895,19 +977,20 @@ static inline int audit_add_rule(struct audit_entry *entry)
*/
if (tree)
audit_put_tree(tree);
- goto error;
+ return err;
}
}
if (tree) {
err = audit_add_tree_rule(&entry->rule);
if (err) {
mutex_unlock(&audit_filter_mutex);
- goto error;
+ return err;
}
}
entry->rule.prio = ~0ULL;
- if (entry->rule.listnr == AUDIT_FILTER_EXIT) {
+ if (entry->rule.listnr == AUDIT_FILTER_EXIT ||
+ entry->rule.listnr == AUDIT_FILTER_URING_EXIT) {
if (entry->rule.flags & AUDIT_FILTER_PREPEND)
entry->rule.prio = ++prio_high;
else
@@ -933,35 +1016,31 @@ static inline int audit_add_rule(struct audit_entry *entry)
#endif
mutex_unlock(&audit_filter_mutex);
- return 0;
-
-error:
- if (watch)
- audit_put_watch(watch); /* tmp watch, matches initial get */
return err;
}
/* Remove an existing rule from filterlist. */
-static inline int audit_del_rule(struct audit_entry *entry)
+int audit_del_rule(struct audit_entry *entry)
{
struct audit_entry *e;
- struct audit_watch *watch = entry->rule.watch;
struct audit_tree *tree = entry->rule.tree;
struct list_head *list;
int ret = 0;
#ifdef CONFIG_AUDITSYSCALL
int dont_count = 0;
- /* If either of these, don't count towards total */
- if (entry->rule.listnr == AUDIT_FILTER_USER ||
- entry->rule.listnr == AUDIT_FILTER_TYPE)
+ /* If any of these, don't count towards total */
+ switch (entry->rule.listnr) {
+ case AUDIT_FILTER_USER:
+ case AUDIT_FILTER_EXCLUDE:
+ case AUDIT_FILTER_FS:
dont_count = 1;
+ }
#endif
mutex_lock(&audit_filter_mutex);
e = audit_find_rule(entry, &list);
if (!e) {
- mutex_unlock(&audit_filter_mutex);
ret = -ENOENT;
goto out;
}
@@ -972,9 +1051,8 @@ static inline int audit_del_rule(struct audit_entry *entry)
if (e->rule.tree)
audit_remove_tree_rule(&e->rule);
- list_del_rcu(&e->list);
- list_del(&e->rule.list);
- call_rcu(&e->rcu, audit_free_rule_rcu);
+ if (e->rule.exe)
+ audit_remove_mark_rule(&e->rule);
#ifdef CONFIG_AUDITSYSCALL
if (!dont_count)
@@ -983,11 +1061,14 @@ static inline int audit_del_rule(struct audit_entry *entry)
if (!audit_match_signal(entry))
audit_signals--;
#endif
- mutex_unlock(&audit_filter_mutex);
+
+ list_del_rcu(&e->list);
+ list_del(&e->rule.list);
+ call_rcu(&e->rcu, audit_free_rule_rcu);
out:
- if (watch)
- audit_put_watch(watch); /* match initial get */
+ mutex_unlock(&audit_filter_mutex);
+
if (tree)
audit_put_tree(tree); /* that's the temporary one */
@@ -995,7 +1076,7 @@ out:
}
/* List rules using struct audit_rule_data. */
-static void audit_list_rules(__u32 portid, int seq, struct sk_buff_head *q)
+static void audit_list_rules(int seq, struct sk_buff_head *q)
{
struct sk_buff *skb;
struct audit_krule *r;
@@ -1003,22 +1084,22 @@ static void audit_list_rules(__u32 portid, int seq, struct sk_buff_head *q)
/* This is a blocking read, so use audit_filter_mutex instead of rcu
* iterator to sync with list writers. */
- for (i=0; i<AUDIT_NR_FILTERS; i++) {
+ for (i = 0; i < AUDIT_NR_FILTERS; i++) {
list_for_each_entry(r, &audit_rules_list[i], list) {
struct audit_rule_data *data;
data = audit_krule_to_data(r);
if (unlikely(!data))
break;
- skb = audit_make_reply(portid, seq, AUDIT_LIST_RULES,
- 0, 1, data,
- sizeof(*data) + data->buflen);
+ skb = audit_make_reply(seq, AUDIT_LIST_RULES, 0, 1,
+ data,
+ struct_size(data, buf, data->buflen));
if (skb)
skb_queue_tail(q, skb);
kfree(data);
}
}
- skb = audit_make_reply(portid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0);
+ skb = audit_make_reply(seq, AUDIT_LIST_RULES, 1, 1, NULL, 0);
if (skb)
skb_queue_tail(q, skb);
}
@@ -1027,19 +1108,16 @@ static void audit_list_rules(__u32 portid, int seq, struct sk_buff_head *q)
static void audit_log_rule_change(char *action, struct audit_krule *rule, int res)
{
struct audit_buffer *ab;
- uid_t loginuid = from_kuid(&init_user_ns, audit_get_loginuid(current));
- unsigned int sessionid = audit_get_sessionid(current);
if (!audit_enabled)
return;
- ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
+ ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_CONFIG_CHANGE);
if (!ab)
return;
- audit_log_format(ab, "auid=%u ses=%u" ,loginuid, sessionid);
+ audit_log_session_info(ab);
audit_log_task_context(ab);
- audit_log_format(ab, " op=");
- audit_log_string(ab, action);
+ audit_log_format(ab, " op=%s", action);
audit_log_key(ab, rule->filterkey);
audit_log_format(ab, " list=%d res=%d", rule->listnr, res);
audit_log_end(ab);
@@ -1048,37 +1126,40 @@ static void audit_log_rule_change(char *action, struct audit_krule *rule, int re
/**
* audit_rule_change - apply all rules to the specified message type
* @type: audit message type
- * @portid: target port id for netlink audit messages
* @seq: netlink audit message sequence (serial) number
* @data: payload data
* @datasz: size of payload data
*/
-int audit_rule_change(int type, __u32 portid, int seq, void *data,
- size_t datasz)
+int audit_rule_change(int type, int seq, void *data, size_t datasz)
{
int err = 0;
struct audit_entry *entry;
- entry = audit_data_to_entry(data, datasz);
- if (IS_ERR(entry))
- return PTR_ERR(entry);
-
switch (type) {
case AUDIT_ADD_RULE:
+ entry = audit_data_to_entry(data, datasz);
+ if (IS_ERR(entry))
+ return PTR_ERR(entry);
err = audit_add_rule(entry);
audit_log_rule_change("add_rule", &entry->rule, !err);
break;
case AUDIT_DEL_RULE:
+ entry = audit_data_to_entry(data, datasz);
+ if (IS_ERR(entry))
+ return PTR_ERR(entry);
err = audit_del_rule(entry);
audit_log_rule_change("remove_rule", &entry->rule, !err);
break;
default:
- err = -EINVAL;
WARN_ON(1);
+ return -EINVAL;
}
- if (err || type == AUDIT_DEL_RULE)
+ if (err || type == AUDIT_DEL_RULE) {
+ if (entry->rule.exe)
+ audit_remove_mark(entry->rule.exe);
audit_free_rule(entry);
+ }
return err;
}
@@ -1090,11 +1171,8 @@ int audit_rule_change(int type, __u32 portid, int seq, void *data,
*/
int audit_list_rules_send(struct sk_buff *request_skb, int seq)
{
- u32 portid = NETLINK_CB(request_skb).portid;
- struct net *net = sock_net(NETLINK_CB(request_skb).sk);
struct task_struct *tsk;
struct audit_netlink_list *dest;
- int err = 0;
/* We can't just spew out the rules here because we might fill
* the available socket buffer space and deadlock waiting for
@@ -1102,25 +1180,26 @@ int audit_list_rules_send(struct sk_buff *request_skb, int seq)
* happen if we're actually running in the context of auditctl
* trying to _send_ the stuff */
- dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL);
+ dest = kmalloc(sizeof(*dest), GFP_KERNEL);
if (!dest)
return -ENOMEM;
- dest->net = get_net(net);
- dest->portid = portid;
+ dest->net = get_net(sock_net(NETLINK_CB(request_skb).sk));
+ dest->portid = NETLINK_CB(request_skb).portid;
skb_queue_head_init(&dest->q);
mutex_lock(&audit_filter_mutex);
- audit_list_rules(portid, seq, &dest->q);
+ audit_list_rules(seq, &dest->q);
mutex_unlock(&audit_filter_mutex);
- tsk = kthread_run(audit_send_list, dest, "audit_send_list");
+ tsk = kthread_run(audit_send_list_thread, dest, "audit_send_list");
if (IS_ERR(tsk)) {
skb_queue_purge(&dest->q);
+ put_net(dest->net);
kfree(dest);
- err = PTR_ERR(tsk);
+ return PTR_ERR(tsk);
}
- return err;
+ return 0;
}
int audit_comparator(u32 left, u32 op, u32 right)
@@ -1143,7 +1222,6 @@ int audit_comparator(u32 left, u32 op, u32 right)
case Audit_bittest:
return ((left & right) == right);
default:
- BUG();
return 0;
}
}
@@ -1166,7 +1244,6 @@ int audit_uid_comparator(kuid_t left, u32 op, kuid_t right)
case Audit_bitmask:
case Audit_bittest:
default:
- BUG();
return 0;
}
}
@@ -1189,7 +1266,6 @@ int audit_gid_comparator(kgid_t left, u32 op, kgid_t right)
case Audit_bitmask:
case Audit_bittest:
default:
- BUG();
return 0;
}
}
@@ -1232,132 +1308,102 @@ int parent_len(const char *path)
* @parentlen: length of the parent if known. Passing in AUDIT_NAME_FULL
* here indicates that we must compute this value.
*/
-int audit_compare_dname_path(const char *dname, const char *path, int parentlen)
+int audit_compare_dname_path(const struct qstr *dname, const char *path, int parentlen)
{
int dlen, pathlen;
const char *p;
- dlen = strlen(dname);
+ dlen = dname->len;
pathlen = strlen(path);
if (pathlen < dlen)
return 1;
- parentlen = parentlen == AUDIT_NAME_FULL ? parent_len(path) : parentlen;
- if (pathlen - parentlen != dlen)
- return 1;
+ if (parentlen == AUDIT_NAME_FULL)
+ parentlen = parent_len(path);
p = path + parentlen;
- return strncmp(p, dname, dlen);
-}
-
-static int audit_filter_user_rules(struct audit_krule *rule, int type,
- enum audit_state *state)
-{
- int i;
-
- for (i = 0; i < rule->field_count; i++) {
- struct audit_field *f = &rule->fields[i];
- pid_t pid;
- int result = 0;
- u32 sid;
+ /* handle trailing slashes */
+ pathlen -= parentlen;
+ while (pathlen > 0 && p[pathlen - 1] == '/')
+ pathlen--;
- switch (f->type) {
- case AUDIT_PID:
- pid = task_pid_nr(current);
- result = audit_comparator(pid, f->op, f->val);
- break;
- case AUDIT_UID:
- result = audit_uid_comparator(current_uid(), f->op, f->uid);
- break;
- case AUDIT_GID:
- result = audit_gid_comparator(current_gid(), f->op, f->gid);
- break;
- case AUDIT_LOGINUID:
- result = audit_uid_comparator(audit_get_loginuid(current),
- f->op, f->uid);
- break;
- case AUDIT_LOGINUID_SET:
- result = audit_comparator(audit_loginuid_set(current),
- f->op, f->val);
- break;
- case AUDIT_MSGTYPE:
- result = audit_comparator(type, f->op, f->val);
- break;
- case AUDIT_SUBJ_USER:
- case AUDIT_SUBJ_ROLE:
- case AUDIT_SUBJ_TYPE:
- case AUDIT_SUBJ_SEN:
- case AUDIT_SUBJ_CLR:
- if (f->lsm_rule) {
- security_task_getsecid(current, &sid);
- result = security_audit_rule_match(sid,
- f->type,
- f->op,
- f->lsm_rule,
- NULL);
- }
- break;
- }
-
- if (!result)
- return 0;
- }
- switch (rule->action) {
- case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
- case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break;
- }
- return 1;
-}
-
-int audit_filter_user(int type)
-{
- enum audit_state state = AUDIT_DISABLED;
- struct audit_entry *e;
- int rc, ret;
-
- ret = 1; /* Audit by default */
-
- rcu_read_lock();
- list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) {
- rc = audit_filter_user_rules(&e->rule, type, &state);
- if (rc) {
- if (rc > 0 && state == AUDIT_DISABLED)
- ret = 0;
- break;
- }
- }
- rcu_read_unlock();
+ if (pathlen != dlen)
+ return 1;
- return ret;
+ return memcmp(p, dname->name, dlen);
}
-int audit_filter_type(int type)
+int audit_filter(int msgtype, unsigned int listtype)
{
struct audit_entry *e;
- int result = 0;
+ int ret = 1; /* Audit by default */
rcu_read_lock();
- if (list_empty(&audit_filter_list[AUDIT_FILTER_TYPE]))
- goto unlock_and_return;
+ list_for_each_entry_rcu(e, &audit_filter_list[listtype], list) {
+ int i, result = 0;
- list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TYPE],
- list) {
- int i;
for (i = 0; i < e->rule.field_count; i++) {
struct audit_field *f = &e->rule.fields[i];
- if (f->type == AUDIT_MSGTYPE) {
- result = audit_comparator(type, f->op, f->val);
- if (!result)
- break;
+ struct lsm_prop prop = { };
+ pid_t pid;
+
+ switch (f->type) {
+ case AUDIT_PID:
+ pid = task_tgid_nr(current);
+ result = audit_comparator(pid, f->op, f->val);
+ break;
+ case AUDIT_UID:
+ result = audit_uid_comparator(current_uid(), f->op, f->uid);
+ break;
+ case AUDIT_GID:
+ result = audit_gid_comparator(current_gid(), f->op, f->gid);
+ break;
+ case AUDIT_LOGINUID:
+ result = audit_uid_comparator(audit_get_loginuid(current),
+ f->op, f->uid);
+ break;
+ case AUDIT_LOGINUID_SET:
+ result = audit_comparator(audit_loginuid_set(current),
+ f->op, f->val);
+ break;
+ case AUDIT_MSGTYPE:
+ result = audit_comparator(msgtype, f->op, f->val);
+ break;
+ case AUDIT_SUBJ_USER:
+ case AUDIT_SUBJ_ROLE:
+ case AUDIT_SUBJ_TYPE:
+ case AUDIT_SUBJ_SEN:
+ case AUDIT_SUBJ_CLR:
+ if (f->lsm_rule) {
+ security_current_getlsmprop_subj(&prop);
+ result = security_audit_rule_match(
+ &prop, f->type, f->op,
+ f->lsm_rule);
+ }
+ break;
+ case AUDIT_EXE:
+ result = audit_exe_compare(current, e->rule.exe);
+ if (f->op == Audit_not_equal)
+ result = !result;
+ break;
+ default:
+ goto unlock_and_return;
}
+ if (result < 0) /* error */
+ goto unlock_and_return;
+ if (!result)
+ break;
+ }
+ if (result > 0) {
+ if (e->rule.action == AUDIT_NEVER || listtype == AUDIT_FILTER_EXCLUDE)
+ ret = 0;
+ break;
}
- if (result)
- goto unlock_and_return;
}
unlock_and_return:
rcu_read_unlock();
- return result;
+ return ret;
}
static int update_lsm_rule(struct audit_krule *r)
@@ -1370,6 +1416,8 @@ static int update_lsm_rule(struct audit_krule *r)
return 0;
nentry = audit_dupe_rule(r);
+ if (entry->rule.exe)
+ audit_remove_mark(entry->rule.exe);
if (IS_ERR(nentry)) {
/* save the first error encountered for the
* return value */
@@ -1391,7 +1439,7 @@ static int update_lsm_rule(struct audit_krule *r)
}
/* This function will re-initialize the lsm_rule field of all applicable rules.
- * It will traverse the filter lists serarching for rules that contain LSM
+ * It will traverse the filter lists searching for rules that contain LSM
* specific filter fields. When such a rule is found, it is copied, the
* LSM field is re-initialized, and the old rule is replaced with the
* updated rule. */
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 9fb9d1cb83ce..dd0563a8e0be 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* auditsc.c -- System-call auditing support
* Handles all system-call specific auditing features.
*
@@ -6,20 +7,6 @@
* Copyright (C) 2005, 2006 IBM Corporation
* All Rights Reserved.
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
* Written by Rickard E. (Rik) Faith <faith@redhat.com>
*
* Many of the ideas implemented here are from Stephen C. Tweedie,
@@ -63,7 +50,6 @@
#include <asm/unistd.h>
#include <linux/security.h>
#include <linux/list.h>
-#include <linux/tty.h>
#include <linux/binfmts.h>
#include <linux/highmem.h>
#include <linux/syscalls.h>
@@ -73,7 +59,12 @@
#include <linux/compat.h>
#include <linux/ctype.h>
#include <linux/string.h>
+#include <linux/uaccess.h>
+#include <linux/fsnotify_backend.h>
#include <uapi/linux/limits.h>
+#include <uapi/linux/netfilter/nf_tables.h>
+#include <uapi/linux/openat2.h> // struct open_how
+#include <uapi/linux/fanotify.h>
#include "audit.h"
@@ -82,7 +73,8 @@
#define AUDITSC_SUCCESS 1
#define AUDITSC_FAILURE 2
-/* no execve audit message should be longer than this (userspace limits) */
+/* no execve audit message should be longer than this (userspace limits),
+ * see the note near the top of audit_log_execve_info() about this value */
#define MAX_EXECVE_AUDIT_LEN 7500
/* max length to print of cmdline/proctitle value during audit */
@@ -99,8 +91,6 @@ struct audit_aux_data {
int type;
};
-#define AUDIT_AUX_IPCPERM 0
-
/* Number of target pids per aux struct. */
#define AUDIT_AUX_PIDS 16
@@ -110,7 +100,7 @@ struct audit_aux_data_pids {
kuid_t target_auid[AUDIT_AUX_PIDS];
kuid_t target_uid[AUDIT_AUX_PIDS];
unsigned int target_sessionid[AUDIT_AUX_PIDS];
- u32 target_sid[AUDIT_AUX_PIDS];
+ struct lsm_prop target_ref[AUDIT_AUX_PIDS];
char target_comm[AUDIT_AUX_PIDS][TASK_COMM_LEN];
int pid_count;
};
@@ -128,15 +118,46 @@ struct audit_tree_refs {
struct audit_chunk *c[31];
};
+struct audit_nfcfgop_tab {
+ enum audit_nfcfgop op;
+ const char *s;
+};
+
+static const struct audit_nfcfgop_tab audit_nfcfgs[] = {
+ { AUDIT_XT_OP_REGISTER, "xt_register" },
+ { AUDIT_XT_OP_REPLACE, "xt_replace" },
+ { AUDIT_XT_OP_UNREGISTER, "xt_unregister" },
+ { AUDIT_NFT_OP_TABLE_REGISTER, "nft_register_table" },
+ { AUDIT_NFT_OP_TABLE_UNREGISTER, "nft_unregister_table" },
+ { AUDIT_NFT_OP_CHAIN_REGISTER, "nft_register_chain" },
+ { AUDIT_NFT_OP_CHAIN_UNREGISTER, "nft_unregister_chain" },
+ { AUDIT_NFT_OP_RULE_REGISTER, "nft_register_rule" },
+ { AUDIT_NFT_OP_RULE_UNREGISTER, "nft_unregister_rule" },
+ { AUDIT_NFT_OP_SET_REGISTER, "nft_register_set" },
+ { AUDIT_NFT_OP_SET_UNREGISTER, "nft_unregister_set" },
+ { AUDIT_NFT_OP_SETELEM_REGISTER, "nft_register_setelem" },
+ { AUDIT_NFT_OP_SETELEM_UNREGISTER, "nft_unregister_setelem" },
+ { AUDIT_NFT_OP_GEN_REGISTER, "nft_register_gen" },
+ { AUDIT_NFT_OP_OBJ_REGISTER, "nft_register_obj" },
+ { AUDIT_NFT_OP_OBJ_UNREGISTER, "nft_unregister_obj" },
+ { AUDIT_NFT_OP_OBJ_RESET, "nft_reset_obj" },
+ { AUDIT_NFT_OP_FLOWTABLE_REGISTER, "nft_register_flowtable" },
+ { AUDIT_NFT_OP_FLOWTABLE_UNREGISTER, "nft_unregister_flowtable" },
+ { AUDIT_NFT_OP_SETELEM_RESET, "nft_reset_setelem" },
+ { AUDIT_NFT_OP_RULE_RESET, "nft_reset_rule" },
+ { AUDIT_NFT_OP_INVALID, "nft_invalid" },
+};
+
static int audit_match_perm(struct audit_context *ctx, int mask)
{
unsigned n;
+
if (unlikely(!ctx))
return 0;
n = ctx->major;
switch (audit_classify_syscall(ctx->arch, n)) {
- case 0: /* native */
+ case AUDITSC_NATIVE:
if ((mask & AUDIT_PERM_WRITE) &&
audit_match_class(AUDIT_CLASS_WRITE, n))
return 1;
@@ -147,7 +168,7 @@ static int audit_match_perm(struct audit_context *ctx, int mask)
audit_match_class(AUDIT_CLASS_CHATTR, n))
return 1;
return 0;
- case 1: /* 32bit on biarch */
+ case AUDITSC_COMPAT: /* 32bit on biarch */
if ((mask & AUDIT_PERM_WRITE) &&
audit_match_class(AUDIT_CLASS_WRITE_32, n))
return 1;
@@ -158,14 +179,16 @@ static int audit_match_perm(struct audit_context *ctx, int mask)
audit_match_class(AUDIT_CLASS_CHATTR_32, n))
return 1;
return 0;
- case 2: /* open */
+ case AUDITSC_OPEN:
return mask & ACC_MODE(ctx->argv[1]);
- case 3: /* openat */
+ case AUDITSC_OPENAT:
return mask & ACC_MODE(ctx->argv[2]);
- case 4: /* socketcall */
+ case AUDITSC_SOCKETCALL:
return ((mask & AUDIT_PERM_WRITE) && ctx->argv[0] == SYS_BIND);
- case 5: /* execve */
+ case AUDITSC_EXECVE:
return mask & AUDIT_PERM_EXEC;
+ case AUDITSC_OPENAT2:
+ return mask & ACC_MODE((u32)ctx->openat2.flags);
default:
return 0;
}
@@ -180,7 +203,7 @@ static int audit_match_filetype(struct audit_context *ctx, int val)
return 0;
list_for_each_entry(n, &ctx->names_list, list) {
- if ((n->ino != -1) &&
+ if ((n->ino != AUDIT_INO_UNSET) &&
((n->mode & S_IFMT) == mode))
return 1;
}
@@ -198,12 +221,11 @@ static int audit_match_filetype(struct audit_context *ctx, int val)
* References in it _are_ dropped - at the same time we free/drop aux stuff.
*/
-#ifdef CONFIG_AUDIT_TREE
static void audit_set_auditable(struct audit_context *ctx)
{
if (!ctx->prio) {
ctx->prio = 1;
- ctx->current_state = AUDIT_RECORD_CONTEXT;
+ ctx->current_state = AUDIT_STATE_RECORD;
}
}
@@ -211,6 +233,7 @@ static int put_tree_ref(struct audit_context *ctx, struct audit_chunk *chunk)
{
struct audit_tree_refs *p = ctx->trees;
int left = ctx->tree_count;
+
if (likely(left)) {
p->c[--left] = chunk;
ctx->tree_count = left;
@@ -231,6 +254,7 @@ static int put_tree_ref(struct audit_context *ctx, struct audit_chunk *chunk)
static int grow_tree_refs(struct audit_context *ctx)
{
struct audit_tree_refs *p = ctx->trees;
+
ctx->trees = kzalloc(sizeof(struct audit_tree_refs), GFP_KERNEL);
if (!ctx->trees) {
ctx->trees = p;
@@ -243,14 +267,13 @@ static int grow_tree_refs(struct audit_context *ctx)
ctx->tree_count = 31;
return 1;
}
-#endif
static void unroll_tree_refs(struct audit_context *ctx,
struct audit_tree_refs *p, int count)
{
-#ifdef CONFIG_AUDIT_TREE
struct audit_tree_refs *q;
int n;
+
if (!p) {
/* we started with empty chain */
p = ctx->first_trees;
@@ -272,12 +295,12 @@ static void unroll_tree_refs(struct audit_context *ctx,
}
ctx->trees = p;
ctx->tree_count = count;
-#endif
}
static void free_tree_refs(struct audit_context *ctx)
{
struct audit_tree_refs *p, *q;
+
for (p = ctx->first_trees; p; p = q) {
q = p->next;
kfree(p);
@@ -286,9 +309,9 @@ static void free_tree_refs(struct audit_context *ctx)
static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree)
{
-#ifdef CONFIG_AUDIT_TREE
struct audit_tree_refs *p;
int n;
+
if (!tree)
return 0;
/* full ones */
@@ -303,7 +326,6 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree)
if (audit_tree_match(p->c[n], tree))
return 1;
}
-#endif
return 0;
}
@@ -314,13 +336,13 @@ static int audit_compare_uid(kuid_t uid,
{
struct audit_names *n;
int rc;
-
+
if (name) {
rc = audit_uid_comparator(uid, f->op, name->uid);
if (rc)
return rc;
}
-
+
if (ctx) {
list_for_each_entry(n, &ctx->names_list, list) {
rc = audit_uid_comparator(uid, f->op, n->uid);
@@ -338,13 +360,13 @@ static int audit_compare_gid(kgid_t gid,
{
struct audit_names *n;
int rc;
-
+
if (name) {
rc = audit_gid_comparator(gid, f->op, name->gid);
if (rc)
return rc;
}
-
+
if (ctx) {
list_for_each_entry(n, &ctx->names_list, list) {
rc = audit_gid_comparator(gid, f->op, n->gid);
@@ -372,7 +394,7 @@ static int audit_field_compare(struct task_struct *tsk,
case AUDIT_COMPARE_EGID_TO_OBJ_GID:
return audit_compare_gid(cred->egid, name, f, ctx);
case AUDIT_COMPARE_AUID_TO_OBJ_UID:
- return audit_compare_uid(tsk->loginuid, name, f, ctx);
+ return audit_compare_uid(audit_get_loginuid(tsk), name, f, ctx);
case AUDIT_COMPARE_SUID_TO_OBJ_UID:
return audit_compare_uid(cred->suid, name, f, ctx);
case AUDIT_COMPARE_SGID_TO_OBJ_GID:
@@ -383,7 +405,8 @@ static int audit_field_compare(struct task_struct *tsk,
return audit_compare_gid(cred->fsgid, name, f, ctx);
/* uid comparisons */
case AUDIT_COMPARE_UID_TO_AUID:
- return audit_uid_comparator(cred->uid, f->op, tsk->loginuid);
+ return audit_uid_comparator(cred->uid, f->op,
+ audit_get_loginuid(tsk));
case AUDIT_COMPARE_UID_TO_EUID:
return audit_uid_comparator(cred->uid, f->op, cred->euid);
case AUDIT_COMPARE_UID_TO_SUID:
@@ -392,11 +415,14 @@ static int audit_field_compare(struct task_struct *tsk,
return audit_uid_comparator(cred->uid, f->op, cred->fsuid);
/* auid comparisons */
case AUDIT_COMPARE_AUID_TO_EUID:
- return audit_uid_comparator(tsk->loginuid, f->op, cred->euid);
+ return audit_uid_comparator(audit_get_loginuid(tsk), f->op,
+ cred->euid);
case AUDIT_COMPARE_AUID_TO_SUID:
- return audit_uid_comparator(tsk->loginuid, f->op, cred->suid);
+ return audit_uid_comparator(audit_get_loginuid(tsk), f->op,
+ cred->suid);
case AUDIT_COMPARE_AUID_TO_FSUID:
- return audit_uid_comparator(tsk->loginuid, f->op, cred->fsuid);
+ return audit_uid_comparator(audit_get_loginuid(tsk), f->op,
+ cred->fsuid);
/* euid comparisons */
case AUDIT_COMPARE_EUID_TO_SUID:
return audit_uid_comparator(cred->euid, f->op, cred->suid);
@@ -444,7 +470,11 @@ static int audit_filter_rules(struct task_struct *tsk,
{
const struct cred *cred;
int i, need_sid = 1;
- u32 sid;
+ struct lsm_prop prop = { };
+ unsigned int sessionid;
+
+ if (ctx && rule->prio <= ctx->prio)
+ return 0;
cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation);
@@ -456,7 +486,7 @@ static int audit_filter_rules(struct task_struct *tsk,
switch (f->type) {
case AUDIT_PID:
- pid = task_pid_nr(tsk);
+ pid = task_tgid_nr(tsk);
result = audit_comparator(pid, f->op, f->val);
break;
case AUDIT_PPID:
@@ -466,6 +496,11 @@ static int audit_filter_rules(struct task_struct *tsk,
result = audit_comparator(ctx->ppid, f->op, f->val);
}
break;
+ case AUDIT_EXE:
+ result = audit_exe_compare(tsk, rule->exe);
+ if (f->op == Audit_not_equal)
+ result = !result;
+ break;
case AUDIT_UID:
result = audit_uid_comparator(cred->uid, f->op, f->uid);
break;
@@ -482,20 +517,20 @@ static int audit_filter_rules(struct task_struct *tsk,
result = audit_gid_comparator(cred->gid, f->op, f->gid);
if (f->op == Audit_equal) {
if (!result)
- result = in_group_p(f->gid);
+ result = groups_search(cred->group_info, f->gid);
} else if (f->op == Audit_not_equal) {
if (result)
- result = !in_group_p(f->gid);
+ result = !groups_search(cred->group_info, f->gid);
}
break;
case AUDIT_EGID:
result = audit_gid_comparator(cred->egid, f->op, f->gid);
if (f->op == Audit_equal) {
if (!result)
- result = in_egroup_p(f->gid);
+ result = groups_search(cred->group_info, f->gid);
} else if (f->op == Audit_not_equal) {
if (result)
- result = !in_egroup_p(f->gid);
+ result = !groups_search(cred->group_info, f->gid);
}
break;
case AUDIT_SGID:
@@ -504,6 +539,10 @@ static int audit_filter_rules(struct task_struct *tsk,
case AUDIT_FSGID:
result = audit_gid_comparator(cred->fsgid, f->op, f->gid);
break;
+ case AUDIT_SESSIONID:
+ sessionid = audit_get_sessionid(tsk);
+ result = audit_comparator(sessionid, f->op, f->val);
+ break;
case AUDIT_PERS:
result = audit_comparator(tsk->personality, f->op, f->val);
break;
@@ -513,11 +552,11 @@ static int audit_filter_rules(struct task_struct *tsk,
break;
case AUDIT_EXIT:
- if (ctx && ctx->return_valid)
+ if (ctx && ctx->return_valid != AUDITSC_INVALID)
result = audit_comparator(ctx->return_code, f->op, f->val);
break;
case AUDIT_SUCCESS:
- if (ctx && ctx->return_valid) {
+ if (ctx && ctx->return_valid != AUDITSC_INVALID) {
if (f->val)
result = audit_comparator(ctx->return_valid, f->op, AUDITSC_SUCCESS);
else
@@ -591,21 +630,33 @@ static int audit_filter_rules(struct task_struct *tsk,
}
break;
case AUDIT_WATCH:
- if (name)
- result = audit_watch_compare(rule->watch, name->ino, name->dev);
+ if (name) {
+ result = audit_watch_compare(rule->watch,
+ name->ino,
+ name->dev);
+ if (f->op == Audit_not_equal)
+ result = !result;
+ }
break;
case AUDIT_DIR:
- if (ctx)
+ if (ctx) {
result = match_tree_refs(ctx, rule->tree);
+ if (f->op == Audit_not_equal)
+ result = !result;
+ }
break;
case AUDIT_LOGINUID:
- result = 0;
- if (ctx)
- result = audit_uid_comparator(tsk->loginuid, f->op, f->uid);
+ result = audit_uid_comparator(audit_get_loginuid(tsk),
+ f->op, f->uid);
break;
case AUDIT_LOGINUID_SET:
result = audit_comparator(audit_loginuid_set(tsk), f->op, f->val);
break;
+ case AUDIT_SADDR_FAM:
+ if (ctx && ctx->sockaddr)
+ result = audit_comparator(ctx->sockaddr->ss_family,
+ f->op, f->val);
+ break;
case AUDIT_SUBJ_USER:
case AUDIT_SUBJ_ROLE:
case AUDIT_SUBJ_TYPE:
@@ -618,13 +669,23 @@ static int audit_filter_rules(struct task_struct *tsk,
logged upon error */
if (f->lsm_rule) {
if (need_sid) {
- security_task_getsecid(tsk, &sid);
+ /* @tsk should always be equal to
+ * @current with the exception of
+ * fork()/copy_process() in which case
+ * the new @tsk creds are still a dup
+ * of @current's creds so we can still
+ * use
+ * security_current_getlsmprop_subj()
+ * here even though it always refs
+ * @current's creds
+ */
+ security_current_getlsmprop_subj(&prop);
need_sid = 0;
}
- result = security_audit_rule_match(sid, f->type,
- f->op,
- f->lsm_rule,
- ctx);
+ result = security_audit_rule_match(&prop,
+ f->type,
+ f->op,
+ f->lsm_rule);
}
break;
case AUDIT_OBJ_USER:
@@ -638,13 +699,17 @@ static int audit_filter_rules(struct task_struct *tsk,
/* Find files that match */
if (name) {
result = security_audit_rule_match(
- name->osid, f->type, f->op,
- f->lsm_rule, ctx);
+ &name->oprop,
+ f->type,
+ f->op,
+ f->lsm_rule);
} else if (ctx) {
list_for_each_entry(n, &ctx->names_list, list) {
- if (security_audit_rule_match(n->osid, f->type,
- f->op, f->lsm_rule,
- ctx)) {
+ if (security_audit_rule_match(
+ &n->oprop,
+ f->type,
+ f->op,
+ f->lsm_rule)) {
++result;
break;
}
@@ -653,9 +718,9 @@ static int audit_filter_rules(struct task_struct *tsk,
/* Find ipc objects that match */
if (!ctx || ctx->type != AUDIT_IPC)
break;
- if (security_audit_rule_match(ctx->ipc.osid,
+ if (security_audit_rule_match(&ctx->ipc.oprop,
f->type, f->op,
- f->lsm_rule, ctx))
+ f->lsm_rule))
++result;
}
break;
@@ -672,9 +737,13 @@ static int audit_filter_rules(struct task_struct *tsk,
break;
case AUDIT_PERM:
result = audit_match_perm(ctx, f->val);
+ if (f->op == Audit_not_equal)
+ result = !result;
break;
case AUDIT_FILETYPE:
result = audit_match_filetype(ctx, f->val);
+ if (f->op == Audit_not_equal)
+ result = !result;
break;
case AUDIT_FIELD_COMPARE:
result = audit_field_compare(tsk, cred, f, ctx, name);
@@ -685,8 +754,6 @@ static int audit_filter_rules(struct task_struct *tsk,
}
if (ctx) {
- if (rule->prio <= ctx->prio)
- return 0;
if (rule->filterkey) {
kfree(ctx->filterkey);
ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC);
@@ -694,8 +761,12 @@ static int audit_filter_rules(struct task_struct *tsk,
ctx->prio = rule->prio;
}
switch (rule->action) {
- case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
- case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break;
+ case AUDIT_NEVER:
+ *state = AUDIT_STATE_DISABLED;
+ break;
+ case AUDIT_ALWAYS:
+ *state = AUDIT_STATE_RECORD;
+ break;
}
return 1;
}
@@ -713,14 +784,14 @@ static enum audit_state audit_filter_task(struct task_struct *tsk, char **key)
list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) {
if (audit_filter_rules(tsk, &e->rule, NULL, NULL,
&state, true)) {
- if (state == AUDIT_RECORD_CONTEXT)
+ if (state == AUDIT_STATE_RECORD)
*key = kstrdup(e->rule.filterkey, GFP_ATOMIC);
rcu_read_unlock();
return state;
}
}
rcu_read_unlock();
- return AUDIT_BUILD_CONTEXT;
+ return AUDIT_STATE_BUILD;
}
static int audit_in_mask(const struct audit_krule *rule, unsigned long val)
@@ -739,35 +810,72 @@ static int audit_in_mask(const struct audit_krule *rule, unsigned long val)
return rule->mask[word] & bit;
}
-/* At syscall entry and exit time, this filter is called if the
- * audit_state is not low enough that auditing cannot take place, but is
- * also not high enough that we already know we have to write an audit
- * record (i.e., the state is AUDIT_SETUP_CONTEXT or AUDIT_BUILD_CONTEXT).
+/**
+ * __audit_filter_op - common filter helper for operations (syscall/uring/etc)
+ * @tsk: associated task
+ * @ctx: audit context
+ * @list: audit filter list
+ * @name: audit_name (can be NULL)
+ * @op: current syscall/uring_op
+ *
+ * Run the udit filters specified in @list against @tsk using @ctx,
+ * @name, and @op, as necessary; the caller is responsible for ensuring
+ * that the call is made while the RCU read lock is held. The @name
+ * parameter can be NULL, but all others must be specified.
+ * Returns 1/true if the filter finds a match, 0/false if none are found.
*/
-static enum audit_state audit_filter_syscall(struct task_struct *tsk,
- struct audit_context *ctx,
- struct list_head *list)
+static int __audit_filter_op(struct task_struct *tsk,
+ struct audit_context *ctx,
+ struct list_head *list,
+ struct audit_names *name,
+ unsigned long op)
{
struct audit_entry *e;
enum audit_state state;
- if (audit_pid && tsk->tgid == audit_pid)
- return AUDIT_DISABLED;
-
- rcu_read_lock();
- if (!list_empty(list)) {
- list_for_each_entry_rcu(e, list, list) {
- if (audit_in_mask(&e->rule, ctx->major) &&
- audit_filter_rules(tsk, &e->rule, ctx, NULL,
- &state, false)) {
- rcu_read_unlock();
- ctx->current_state = state;
- return state;
- }
+ list_for_each_entry_rcu(e, list, list) {
+ if (audit_in_mask(&e->rule, op) &&
+ audit_filter_rules(tsk, &e->rule, ctx, name,
+ &state, false)) {
+ ctx->current_state = state;
+ return 1;
}
}
+ return 0;
+}
+
+/**
+ * audit_filter_uring - apply filters to an io_uring operation
+ * @tsk: associated task
+ * @ctx: audit context
+ */
+static void audit_filter_uring(struct task_struct *tsk,
+ struct audit_context *ctx)
+{
+ if (auditd_test_task(tsk))
+ return;
+
+ rcu_read_lock();
+ __audit_filter_op(tsk, ctx, &audit_filter_list[AUDIT_FILTER_URING_EXIT],
+ NULL, ctx->uring_op);
+ rcu_read_unlock();
+}
+
+/* At syscall exit time, this filter is called if the audit_state is
+ * not low enough that auditing cannot take place, but is also not
+ * high enough that we already know we have to write an audit record
+ * (i.e., the state is AUDIT_STATE_BUILD).
+ */
+static void audit_filter_syscall(struct task_struct *tsk,
+ struct audit_context *ctx)
+{
+ if (auditd_test_task(tsk))
+ return;
+
+ rcu_read_lock();
+ __audit_filter_op(tsk, ctx, &audit_filter_list[AUDIT_FILTER_EXIT],
+ NULL, ctx->major);
rcu_read_unlock();
- return AUDIT_BUILD_CONTEXT;
}
/*
@@ -776,24 +884,12 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
*/
static int audit_filter_inode_name(struct task_struct *tsk,
struct audit_names *n,
- struct audit_context *ctx) {
+ struct audit_context *ctx)
+{
int h = audit_hash_ino((u32)n->ino);
struct list_head *list = &audit_inode_hash[h];
- struct audit_entry *e;
- enum audit_state state;
- if (list_empty(list))
- return 0;
-
- list_for_each_entry_rcu(e, list, list) {
- if (audit_in_mask(&e->rule, ctx->major) &&
- audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) {
- ctx->current_state = state;
- return 1;
- }
- }
-
- return 0;
+ return __audit_filter_op(tsk, ctx, list, n, ctx->major);
}
/* At syscall exit time, this filter is called if any audit_names have been
@@ -805,7 +901,7 @@ void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx)
{
struct audit_names *n;
- if (audit_pid && tsk->tgid == audit_pid)
+ if (auditd_test_task(tsk))
return;
rcu_read_lock();
@@ -817,44 +913,6 @@ void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx)
rcu_read_unlock();
}
-/* Transfer the audit context pointer to the caller, clearing it in the tsk's struct */
-static inline struct audit_context *audit_take_context(struct task_struct *tsk,
- int return_valid,
- long return_code)
-{
- struct audit_context *context = tsk->audit_context;
-
- if (!context)
- return NULL;
- context->return_valid = return_valid;
-
- /*
- * we need to fix up the return code in the audit logs if the actual
- * return codes are later going to be fixed up by the arch specific
- * signal handlers
- *
- * This is actually a test for:
- * (rc == ERESTARTSYS ) || (rc == ERESTARTNOINTR) ||
- * (rc == ERESTARTNOHAND) || (rc == ERESTART_RESTARTBLOCK)
- *
- * but is faster than a bunch of ||
- */
- if (unlikely(return_code <= -ERESTARTSYS) &&
- (return_code >= -ERESTART_RESTARTBLOCK) &&
- (return_code != -ENOIOCTLCMD))
- context->return_code = -EINTR;
- else
- context->return_code = return_code;
-
- if (context->in_syscall && !context->dummy) {
- audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]);
- audit_filter_inodes(tsk, context);
- }
-
- tsk->audit_context = NULL;
- return context;
-}
-
static inline void audit_proctitle_free(struct audit_context *context)
{
kfree(context->proctitle.value);
@@ -862,6 +920,13 @@ static inline void audit_proctitle_free(struct audit_context *context)
context->proctitle.len = 0;
}
+static inline void audit_free_module(struct audit_context *context)
+{
+ if (context->type == AUDIT_KERN_MODULE) {
+ kfree(context->module.name);
+ context->module.name = NULL;
+ }
+}
static inline void audit_free_names(struct audit_context *context)
{
struct audit_names *n, *next;
@@ -887,10 +952,80 @@ static inline void audit_free_aux(struct audit_context *context)
context->aux = aux->next;
kfree(aux);
}
+ context->aux = NULL;
while ((aux = context->aux_pids)) {
context->aux_pids = aux->next;
kfree(aux);
}
+ context->aux_pids = NULL;
+}
+
+/**
+ * audit_reset_context - reset a audit_context structure
+ * @ctx: the audit_context to reset
+ *
+ * All fields in the audit_context will be reset to an initial state, all
+ * references held by fields will be dropped, and private memory will be
+ * released. When this function returns the audit_context will be suitable
+ * for reuse, so long as the passed context is not NULL or a dummy context.
+ */
+static void audit_reset_context(struct audit_context *ctx)
+{
+ if (!ctx)
+ return;
+
+ /* if ctx is non-null, reset the "ctx->context" regardless */
+ ctx->context = AUDIT_CTX_UNUSED;
+ if (ctx->dummy)
+ return;
+
+ /*
+ * NOTE: It shouldn't matter in what order we release the fields, so
+ * release them in the order in which they appear in the struct;
+ * this gives us some hope of quickly making sure we are
+ * resetting the audit_context properly.
+ *
+ * Other things worth mentioning:
+ * - we don't reset "dummy"
+ * - we don't reset "state", we do reset "current_state"
+ * - we preserve "filterkey" if "state" is AUDIT_STATE_RECORD
+ * - much of this is likely overkill, but play it safe for now
+ * - we really need to work on improving the audit_context struct
+ */
+
+ ctx->current_state = ctx->state;
+ ctx->stamp.serial = 0;
+ ctx->stamp.ctime = (struct timespec64){ .tv_sec = 0, .tv_nsec = 0 };
+ ctx->major = 0;
+ ctx->uring_op = 0;
+ memset(ctx->argv, 0, sizeof(ctx->argv));
+ ctx->return_code = 0;
+ ctx->prio = (ctx->state == AUDIT_STATE_RECORD ? ~0ULL : 0);
+ ctx->return_valid = AUDITSC_INVALID;
+ audit_free_names(ctx);
+ if (ctx->state != AUDIT_STATE_RECORD) {
+ kfree(ctx->filterkey);
+ ctx->filterkey = NULL;
+ }
+ audit_free_aux(ctx);
+ kfree(ctx->sockaddr);
+ ctx->sockaddr = NULL;
+ ctx->sockaddr_len = 0;
+ ctx->ppid = 0;
+ ctx->uid = ctx->euid = ctx->suid = ctx->fsuid = KUIDT_INIT(0);
+ ctx->gid = ctx->egid = ctx->sgid = ctx->fsgid = KGIDT_INIT(0);
+ ctx->personality = 0;
+ ctx->arch = 0;
+ ctx->target_pid = 0;
+ ctx->target_auid = ctx->target_uid = KUIDT_INIT(0);
+ ctx->target_sessionid = 0;
+ lsmprop_init(&ctx->target_ref);
+ ctx->target_comm[0] = '\0';
+ unroll_tree_refs(ctx, NULL, 0);
+ WARN_ON(!list_empty(&ctx->killed_trees));
+ audit_free_module(ctx);
+ ctx->fds[0] = -1;
+ ctx->type = 0; /* reset last for audit_free_*() */
}
static inline struct audit_context *audit_alloc_context(enum audit_state state)
@@ -900,10 +1035,13 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state)
context = kzalloc(sizeof(*context), GFP_KERNEL);
if (!context)
return NULL;
+ context->context = AUDIT_CTX_UNUSED;
context->state = state;
- context->prio = state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
+ context->prio = state == AUDIT_STATE_RECORD ? ~0ULL : 0;
INIT_LIST_HEAD(&context->killed_trees);
INIT_LIST_HEAD(&context->names_list);
+ context->fds[0] = -1;
+ context->return_valid = AUDITSC_INVALID;
return context;
}
@@ -923,45 +1061,43 @@ int audit_alloc(struct task_struct *tsk)
char *key = NULL;
if (likely(!audit_ever_enabled))
- return 0; /* Return if not auditing. */
+ return 0;
state = audit_filter_task(tsk, &key);
- if (state == AUDIT_DISABLED) {
- clear_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT);
+ if (state == AUDIT_STATE_DISABLED) {
+ clear_task_syscall_work(tsk, SYSCALL_AUDIT);
return 0;
}
- if (!(context = audit_alloc_context(state))) {
+ context = audit_alloc_context(state);
+ if (!context) {
kfree(key);
audit_log_lost("out of memory in audit_alloc");
return -ENOMEM;
}
context->filterkey = key;
- tsk->audit_context = context;
- set_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT);
+ audit_set_context(tsk, context);
+ set_task_syscall_work(tsk, SYSCALL_AUDIT);
return 0;
}
static inline void audit_free_context(struct audit_context *context)
{
- audit_free_names(context);
- unroll_tree_refs(context, NULL, 0);
+ /* resetting is extra work, but it is likely just noise */
+ audit_reset_context(context);
+ audit_proctitle_free(context);
free_tree_refs(context);
- audit_free_aux(context);
kfree(context->filterkey);
- kfree(context->sockaddr);
- audit_proctitle_free(context);
kfree(context);
}
static int audit_log_pid_context(struct audit_context *context, pid_t pid,
- kuid_t auid, kuid_t uid, unsigned int sessionid,
- u32 sid, char *comm)
+ kuid_t auid, kuid_t uid,
+ unsigned int sessionid, struct lsm_prop *prop,
+ char *comm)
{
struct audit_buffer *ab;
- char *ctx = NULL;
- u32 len;
int rc = 0;
ab = audit_log_start(context, GFP_KERNEL, AUDIT_OBJ_PID);
@@ -971,15 +1107,9 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,
audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid,
from_kuid(&init_user_ns, auid),
from_kuid(&init_user_ns, uid), sessionid);
- if (sid) {
- if (security_secid_to_secctx(sid, &ctx, &len)) {
- audit_log_format(ab, " obj=(none)");
- rc = 1;
- } else {
- audit_log_format(ab, " obj=%s", ctx);
- security_release_secctx(ctx, len);
- }
- }
+ if (lsmprop_is_set(prop) && audit_log_obj_ctx(ab, prop))
+ rc = 1;
+
audit_log_format(ab, " ocomm=");
audit_log_untrustedstring(ab, comm);
audit_log_end(ab);
@@ -987,185 +1117,248 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,
return rc;
}
-/*
- * to_send and len_sent accounting are very loose estimates. We aren't
- * really worried about a hard cap to MAX_EXECVE_AUDIT_LEN so much as being
- * within about 500 bytes (next page boundary)
- *
- * why snprintf? an int is up to 12 digits long. if we just assumed when
- * logging that a[%d]= was going to be 16 characters long we would be wasting
- * space in every audit message. In one 7500 byte message we can log up to
- * about 1000 min size arguments. That comes down to about 50% waste of space
- * if we didn't do the snprintf to find out how long arg_num_len was.
- */
-static int audit_log_single_execve_arg(struct audit_context *context,
- struct audit_buffer **ab,
- int arg_num,
- size_t *len_sent,
- const char __user *p,
- char *buf)
-{
- char arg_num_len_buf[12];
- const char __user *tmp_p = p;
- /* how many digits are in arg_num? 5 is the length of ' a=""' */
- size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 5;
- size_t len, len_left, to_send;
- size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN;
- unsigned int i, has_cntl = 0, too_long = 0;
- int ret;
-
- /* strnlen_user includes the null we don't want to send */
- len_left = len = strnlen_user(p, MAX_ARG_STRLEN) - 1;
-
- /*
- * We just created this mm, if we can't find the strings
- * we just copied into it something is _very_ wrong. Similar
- * for strings that are too long, we should not have created
- * any.
- */
- if (unlikely((len == -1) || len > MAX_ARG_STRLEN - 1)) {
- WARN_ON(1);
- send_sig(SIGKILL, current, 0);
- return -1;
+static void audit_log_execve_info(struct audit_context *context,
+ struct audit_buffer **ab)
+{
+ long len_max;
+ long len_rem;
+ long len_full;
+ long len_buf;
+ long len_abuf = 0;
+ long len_tmp;
+ bool require_data;
+ bool encode;
+ unsigned int iter;
+ unsigned int arg;
+ char *buf_head;
+ char *buf;
+ const char __user *p = (const char __user *)current->mm->arg_start;
+
+ /* NOTE: this buffer needs to be large enough to hold all the non-arg
+ * data we put in the audit record for this argument (see the
+ * code below) ... at this point in time 96 is plenty */
+ char abuf[96];
+
+ /* NOTE: we set MAX_EXECVE_AUDIT_LEN to a rather arbitrary limit, the
+ * current value of 7500 is not as important as the fact that it
+ * is less than 8k, a setting of 7500 gives us plenty of wiggle
+ * room if we go over a little bit in the logging below */
+ WARN_ON_ONCE(MAX_EXECVE_AUDIT_LEN > 7500);
+ len_max = MAX_EXECVE_AUDIT_LEN;
+
+ /* scratch buffer to hold the userspace args */
+ buf_head = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL);
+ if (!buf_head) {
+ audit_panic("out of memory for argv string");
+ return;
}
+ buf = buf_head;
+
+ audit_log_format(*ab, "argc=%d", context->execve.argc);
- /* walk the whole argument looking for non-ascii chars */
+ len_rem = len_max;
+ len_buf = 0;
+ len_full = 0;
+ require_data = true;
+ encode = false;
+ iter = 0;
+ arg = 0;
do {
- if (len_left > MAX_EXECVE_AUDIT_LEN)
- to_send = MAX_EXECVE_AUDIT_LEN;
- else
- to_send = len_left;
- ret = copy_from_user(buf, tmp_p, to_send);
- /*
- * There is no reason for this copy to be short. We just
- * copied them here, and the mm hasn't been exposed to user-
- * space yet.
- */
- if (ret) {
- WARN_ON(1);
- send_sig(SIGKILL, current, 0);
- return -1;
- }
- buf[to_send] = '\0';
- has_cntl = audit_string_contains_control(buf, to_send);
- if (has_cntl) {
- /*
- * hex messages get logged as 2 bytes, so we can only
- * send half as much in each message
- */
- max_execve_audit_len = MAX_EXECVE_AUDIT_LEN / 2;
- break;
- }
- len_left -= to_send;
- tmp_p += to_send;
- } while (len_left > 0);
-
- len_left = len;
-
- if (len > max_execve_audit_len)
- too_long = 1;
-
- /* rewalk the argument actually logging the message */
- for (i = 0; len_left > 0; i++) {
- int room_left;
-
- if (len_left > max_execve_audit_len)
- to_send = max_execve_audit_len;
- else
- to_send = len_left;
-
- /* do we have space left to send this argument in this ab? */
- room_left = MAX_EXECVE_AUDIT_LEN - arg_num_len - *len_sent;
- if (has_cntl)
- room_left -= (to_send * 2);
- else
- room_left -= to_send;
- if (room_left < 0) {
- *len_sent = 0;
- audit_log_end(*ab);
- *ab = audit_log_start(context, GFP_KERNEL, AUDIT_EXECVE);
- if (!*ab)
- return 0;
+ /* NOTE: we don't ever want to trust this value for anything
+ * serious, but the audit record format insists we
+ * provide an argument length for really long arguments,
+ * e.g. > MAX_EXECVE_AUDIT_LEN, so we have no choice but
+ * to use strncpy_from_user() to obtain this value for
+ * recording in the log, although we don't use it
+ * anywhere here to avoid a double-fetch problem */
+ if (len_full == 0)
+ len_full = strnlen_user(p, MAX_ARG_STRLEN) - 1;
+
+ /* read more data from userspace */
+ if (require_data) {
+ /* can we make more room in the buffer? */
+ if (buf != buf_head) {
+ memmove(buf_head, buf, len_buf);
+ buf = buf_head;
+ }
+
+ /* fetch as much as we can of the argument */
+ len_tmp = strncpy_from_user(&buf_head[len_buf], p,
+ len_max - len_buf);
+ if (len_tmp == -EFAULT) {
+ /* unable to copy from userspace */
+ send_sig(SIGKILL, current, 0);
+ goto out;
+ } else if (len_tmp == (len_max - len_buf)) {
+ /* buffer is not large enough */
+ require_data = true;
+ /* NOTE: if we are going to span multiple
+ * buffers force the encoding so we stand
+ * a chance at a sane len_full value and
+ * consistent record encoding */
+ encode = true;
+ len_full = len_full * 2;
+ p += len_tmp;
+ } else {
+ require_data = false;
+ if (!encode)
+ encode = audit_string_contains_control(
+ buf, len_tmp);
+ /* try to use a trusted value for len_full */
+ if (len_full < len_max)
+ len_full = (encode ?
+ len_tmp * 2 : len_tmp);
+ p += len_tmp + 1;
+ }
+ len_buf += len_tmp;
+ buf_head[len_buf] = '\0';
+
+ /* length of the buffer in the audit record? */
+ len_abuf = (encode ? len_buf * 2 : len_buf + 2);
}
- /*
- * first record needs to say how long the original string was
- * so we can be sure nothing was lost.
- */
- if ((i == 0) && (too_long))
- audit_log_format(*ab, " a%d_len=%zu", arg_num,
- has_cntl ? 2*len : len);
+ /* write as much as we can to the audit log */
+ if (len_buf >= 0) {
+ /* NOTE: some magic numbers here - basically if we
+ * can't fit a reasonable amount of data into the
+ * existing audit buffer, flush it and start with
+ * a new buffer */
+ if ((sizeof(abuf) + 8) > len_rem) {
+ len_rem = len_max;
+ audit_log_end(*ab);
+ *ab = audit_log_start(context,
+ GFP_KERNEL, AUDIT_EXECVE);
+ if (!*ab)
+ goto out;
+ }
- /*
- * normally arguments are small enough to fit and we already
- * filled buf above when we checked for control characters
- * so don't bother with another copy_from_user
- */
- if (len >= max_execve_audit_len)
- ret = copy_from_user(buf, p, to_send);
- else
- ret = 0;
- if (ret) {
- WARN_ON(1);
- send_sig(SIGKILL, current, 0);
- return -1;
+ /* create the non-arg portion of the arg record */
+ len_tmp = 0;
+ if (require_data || (iter > 0) ||
+ ((len_abuf + sizeof(abuf)) > len_rem)) {
+ if (iter == 0) {
+ len_tmp += snprintf(&abuf[len_tmp],
+ sizeof(abuf) - len_tmp,
+ " a%d_len=%lu",
+ arg, len_full);
+ }
+ len_tmp += snprintf(&abuf[len_tmp],
+ sizeof(abuf) - len_tmp,
+ " a%d[%d]=", arg, iter++);
+ } else
+ len_tmp += snprintf(&abuf[len_tmp],
+ sizeof(abuf) - len_tmp,
+ " a%d=", arg);
+ WARN_ON(len_tmp >= sizeof(abuf));
+ abuf[sizeof(abuf) - 1] = '\0';
+
+ /* log the arg in the audit record */
+ audit_log_format(*ab, "%s", abuf);
+ len_rem -= len_tmp;
+ len_tmp = len_buf;
+ if (encode) {
+ if (len_abuf > len_rem)
+ len_tmp = len_rem / 2; /* encoding */
+ audit_log_n_hex(*ab, buf, len_tmp);
+ len_rem -= len_tmp * 2;
+ len_abuf -= len_tmp * 2;
+ } else {
+ if (len_abuf > len_rem)
+ len_tmp = len_rem - 2; /* quotes */
+ audit_log_n_string(*ab, buf, len_tmp);
+ len_rem -= len_tmp + 2;
+ /* don't subtract the "2" because we still need
+ * to add quotes to the remaining string */
+ len_abuf -= len_tmp;
+ }
+ len_buf -= len_tmp;
+ buf += len_tmp;
}
- buf[to_send] = '\0';
-
- /* actually log it */
- audit_log_format(*ab, " a%d", arg_num);
- if (too_long)
- audit_log_format(*ab, "[%d]", i);
- audit_log_format(*ab, "=");
- if (has_cntl)
- audit_log_n_hex(*ab, buf, to_send);
- else
- audit_log_string(*ab, buf);
-
- p += to_send;
- len_left -= to_send;
- *len_sent += arg_num_len;
- if (has_cntl)
- *len_sent += to_send * 2;
- else
- *len_sent += to_send;
- }
- /* include the null we didn't log */
- return len + 1;
-}
-static void audit_log_execve_info(struct audit_context *context,
- struct audit_buffer **ab)
-{
- int i, len;
- size_t len_sent = 0;
- const char __user *p;
- char *buf;
+ /* ready to move to the next argument? */
+ if ((len_buf == 0) && !require_data) {
+ arg++;
+ iter = 0;
+ len_full = 0;
+ require_data = true;
+ encode = false;
+ }
+ } while (arg < context->execve.argc);
- p = (const char __user *)current->mm->arg_start;
+ /* NOTE: the caller handles the final audit_log_end() call */
- audit_log_format(*ab, "argc=%d", context->execve.argc);
+out:
+ kfree(buf_head);
+}
- /*
- * we need some kernel buffer to hold the userspace args. Just
- * allocate one big one rather than allocating one of the right size
- * for every single argument inside audit_log_single_execve_arg()
- * should be <8k allocation so should be pretty safe.
- */
- buf = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL);
- if (!buf) {
- audit_panic("out of memory for argv string");
+static void audit_log_cap(struct audit_buffer *ab, char *prefix,
+ kernel_cap_t *cap)
+{
+ if (cap_isclear(*cap)) {
+ audit_log_format(ab, " %s=0", prefix);
return;
}
+ audit_log_format(ab, " %s=%016llx", prefix, cap->val);
+}
- for (i = 0; i < context->execve.argc; i++) {
- len = audit_log_single_execve_arg(context, ab, i,
- &len_sent, p, buf);
- if (len <= 0)
- break;
- p += len;
+static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
+{
+ if (name->fcap_ver == -1) {
+ audit_log_format(ab, " cap_fe=? cap_fver=? cap_fp=? cap_fi=?");
+ return;
+ }
+ audit_log_cap(ab, "cap_fp", &name->fcap.permitted);
+ audit_log_cap(ab, "cap_fi", &name->fcap.inheritable);
+ audit_log_format(ab, " cap_fe=%d cap_fver=%x cap_frootid=%d",
+ name->fcap.fE, name->fcap_ver,
+ from_kuid(&init_user_ns, name->fcap.rootid));
+}
+
+static void audit_log_time(struct audit_context *context, struct audit_buffer **ab)
+{
+ const struct audit_ntp_data *ntp = &context->time.ntp_data;
+ const struct timespec64 *tk = &context->time.tk_injoffset;
+ static const char * const ntp_name[] = {
+ "offset",
+ "freq",
+ "status",
+ "tai",
+ "tick",
+ "adjust",
+ };
+ int type;
+
+ if (context->type == AUDIT_TIME_ADJNTPVAL) {
+ for (type = 0; type < AUDIT_NTP_NVALS; type++) {
+ if (ntp->vals[type].newval != ntp->vals[type].oldval) {
+ if (!*ab) {
+ *ab = audit_log_start(context,
+ GFP_KERNEL,
+ AUDIT_TIME_ADJNTPVAL);
+ if (!*ab)
+ return;
+ }
+ audit_log_format(*ab, "op=%s old=%lli new=%lli",
+ ntp_name[type],
+ ntp->vals[type].oldval,
+ ntp->vals[type].newval);
+ audit_log_end(*ab);
+ *ab = NULL;
+ }
+ }
+ }
+ if (tk->tv_sec != 0 || tk->tv_nsec != 0) {
+ if (!*ab) {
+ *ab = audit_log_start(context, GFP_KERNEL,
+ AUDIT_TIME_INJOFFSET);
+ if (!*ab)
+ return;
+ }
+ audit_log_format(*ab, "sec=%lli nsec=%li",
+ (long long)tk->tv_sec, tk->tv_nsec);
+ audit_log_end(*ab);
+ *ab = NULL;
}
- kfree(buf);
}
static void show_special(struct audit_context *context, int *call_panic)
@@ -1180,28 +1373,20 @@ static void show_special(struct audit_context *context, int *call_panic)
switch (context->type) {
case AUDIT_SOCKETCALL: {
int nargs = context->socketcall.nargs;
+
audit_log_format(ab, "nargs=%d", nargs);
for (i = 0; i < nargs; i++)
audit_log_format(ab, " a%d=%lx", i,
context->socketcall.args[i]);
break; }
- case AUDIT_IPC: {
- u32 osid = context->ipc.osid;
-
+ case AUDIT_IPC:
audit_log_format(ab, "ouid=%u ogid=%u mode=%#ho",
from_kuid(&init_user_ns, context->ipc.uid),
from_kgid(&init_user_ns, context->ipc.gid),
context->ipc.mode);
- if (osid) {
- char *ctx = NULL;
- u32 len;
- if (security_secid_to_secctx(osid, &ctx, &len)) {
- audit_log_format(ab, " osid=%u", osid);
+ if (lsmprop_is_set(&context->ipc.oprop)) {
+ if (audit_log_obj_ctx(ab, &context->ipc.oprop))
*call_panic = 1;
- } else {
- audit_log_format(ab, " obj=%s", ctx);
- security_release_secctx(ctx, len);
- }
}
if (context->ipc.has_perm) {
audit_log_end(ab);
@@ -1216,8 +1401,8 @@ static void show_special(struct audit_context *context, int *call_panic)
context->ipc.perm_gid,
context->ipc.perm_mode);
}
- break; }
- case AUDIT_MQ_OPEN: {
+ break;
+ case AUDIT_MQ_OPEN:
audit_log_format(ab,
"oflag=0x%x mode=%#ho mq_flags=0x%lx mq_maxmsg=%ld "
"mq_msgsize=%ld mq_curmsgs=%ld",
@@ -1226,24 +1411,25 @@ static void show_special(struct audit_context *context, int *call_panic)
context->mq_open.attr.mq_maxmsg,
context->mq_open.attr.mq_msgsize,
context->mq_open.attr.mq_curmsgs);
- break; }
- case AUDIT_MQ_SENDRECV: {
+ break;
+ case AUDIT_MQ_SENDRECV:
audit_log_format(ab,
"mqdes=%d msg_len=%zd msg_prio=%u "
- "abs_timeout_sec=%ld abs_timeout_nsec=%ld",
+ "abs_timeout_sec=%lld abs_timeout_nsec=%ld",
context->mq_sendrecv.mqdes,
context->mq_sendrecv.msg_len,
context->mq_sendrecv.msg_prio,
- context->mq_sendrecv.abs_timeout.tv_sec,
+ (long long) context->mq_sendrecv.abs_timeout.tv_sec,
context->mq_sendrecv.abs_timeout.tv_nsec);
- break; }
- case AUDIT_MQ_NOTIFY: {
+ break;
+ case AUDIT_MQ_NOTIFY:
audit_log_format(ab, "mqdes=%d sigev_signo=%d",
context->mq_notify.mqdes,
context->mq_notify.sigev_signo);
- break; }
+ break;
case AUDIT_MQ_GETSETATTR: {
struct mq_attr *attr = &context->mq_getsetattr.mqstat;
+
audit_log_format(ab,
"mqdes=%d mq_flags=0x%lx mq_maxmsg=%ld mq_msgsize=%ld "
"mq_curmsgs=%ld ",
@@ -1251,19 +1437,39 @@ static void show_special(struct audit_context *context, int *call_panic)
attr->mq_flags, attr->mq_maxmsg,
attr->mq_msgsize, attr->mq_curmsgs);
break; }
- case AUDIT_CAPSET: {
+ case AUDIT_CAPSET:
audit_log_format(ab, "pid=%d", context->capset.pid);
audit_log_cap(ab, "cap_pi", &context->capset.cap.inheritable);
audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted);
audit_log_cap(ab, "cap_pe", &context->capset.cap.effective);
- break; }
- case AUDIT_MMAP: {
+ audit_log_cap(ab, "cap_pa", &context->capset.cap.ambient);
+ break;
+ case AUDIT_MMAP:
audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd,
context->mmap.flags);
- break; }
- case AUDIT_EXECVE: {
+ break;
+ case AUDIT_OPENAT2:
+ audit_log_format(ab, "oflag=0%llo mode=0%llo resolve=0x%llx",
+ context->openat2.flags,
+ context->openat2.mode,
+ context->openat2.resolve);
+ break;
+ case AUDIT_EXECVE:
audit_log_execve_info(context, &ab);
- break; }
+ break;
+ case AUDIT_KERN_MODULE:
+ audit_log_format(ab, "name=");
+ if (context->module.name) {
+ audit_log_untrustedstring(ab, context->module.name);
+ } else
+ audit_log_format(ab, "(null)");
+
+ break;
+ case AUDIT_TIME_ADJNTPVAL:
+ case AUDIT_TIME_INJOFFSET:
+ /* this call deviates from the rest, eating the buffer */
+ audit_log_time(context, &ab);
+ break;
}
audit_log_end(ab);
}
@@ -1271,6 +1477,7 @@ static void show_special(struct audit_context *context, int *call_panic)
static inline int audit_proctitle_rtrim(char *proctitle, int len)
{
char *end = proctitle + len - 1;
+
while (end > proctitle && !isprint(*end))
end--;
@@ -1280,13 +1487,96 @@ static inline int audit_proctitle_rtrim(char *proctitle, int len)
return len;
}
-static void audit_log_proctitle(struct task_struct *tsk,
- struct audit_context *context)
+/*
+ * audit_log_name - produce AUDIT_PATH record from struct audit_names
+ * @context: audit_context for the task
+ * @n: audit_names structure with reportable details
+ * @path: optional path to report instead of audit_names->name
+ * @record_num: record number to report when handling a list of names
+ * @call_panic: optional pointer to int that will be updated if secid fails
+ */
+static void audit_log_name(struct audit_context *context, struct audit_names *n,
+ const struct path *path, int record_num, int *call_panic)
+{
+ struct audit_buffer *ab;
+
+ ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
+ if (!ab)
+ return;
+
+ audit_log_format(ab, "item=%d", record_num);
+
+ if (path)
+ audit_log_d_path(ab, " name=", path);
+ else if (n->name) {
+ switch (n->name_len) {
+ case AUDIT_NAME_FULL:
+ /* log the full path */
+ audit_log_format(ab, " name=");
+ audit_log_untrustedstring(ab, n->name->name);
+ break;
+ case 0:
+ /* name was specified as a relative path and the
+ * directory component is the cwd
+ */
+ if (context->pwd.dentry && context->pwd.mnt)
+ audit_log_d_path(ab, " name=", &context->pwd);
+ else
+ audit_log_format(ab, " name=(null)");
+ break;
+ default:
+ /* log the name's directory component */
+ audit_log_format(ab, " name=");
+ audit_log_n_untrustedstring(ab, n->name->name,
+ n->name_len);
+ }
+ } else
+ audit_log_format(ab, " name=(null)");
+
+ if (n->ino != AUDIT_INO_UNSET)
+ audit_log_format(ab, " inode=%lu dev=%02x:%02x mode=%#ho ouid=%u ogid=%u rdev=%02x:%02x",
+ n->ino,
+ MAJOR(n->dev),
+ MINOR(n->dev),
+ n->mode,
+ from_kuid(&init_user_ns, n->uid),
+ from_kgid(&init_user_ns, n->gid),
+ MAJOR(n->rdev),
+ MINOR(n->rdev));
+ if (lsmprop_is_set(&n->oprop) &&
+ audit_log_obj_ctx(ab, &n->oprop))
+ *call_panic = 2;
+
+ /* log the audit_names record type */
+ switch (n->type) {
+ case AUDIT_TYPE_NORMAL:
+ audit_log_format(ab, " nametype=NORMAL");
+ break;
+ case AUDIT_TYPE_PARENT:
+ audit_log_format(ab, " nametype=PARENT");
+ break;
+ case AUDIT_TYPE_CHILD_DELETE:
+ audit_log_format(ab, " nametype=DELETE");
+ break;
+ case AUDIT_TYPE_CHILD_CREATE:
+ audit_log_format(ab, " nametype=CREATE");
+ break;
+ default:
+ audit_log_format(ab, " nametype=UNKNOWN");
+ break;
+ }
+
+ audit_log_fcaps(ab, n);
+ audit_log_end(ab);
+}
+
+static void audit_log_proctitle(void)
{
int res;
char *buf;
char *msg = "(null)";
int len = strlen(msg);
+ struct audit_context *context = audit_context();
struct audit_buffer *ab;
ab = audit_log_start(context, GFP_KERNEL, AUDIT_PROCTITLE);
@@ -1301,7 +1591,7 @@ static void audit_log_proctitle(struct task_struct *tsk,
if (!buf)
goto out;
/* Historically called this from procfs naming */
- res = get_cmdline(tsk, buf, MAX_PROCTITLE_AUDIT_LEN);
+ res = get_cmdline(current, buf, MAX_PROCTITLE_AUDIT_LEN);
if (res == 0) {
kfree(buf);
goto out;
@@ -1321,39 +1611,86 @@ out:
audit_log_end(ab);
}
-static void audit_log_exit(struct audit_context *context, struct task_struct *tsk)
+/**
+ * audit_log_uring - generate a AUDIT_URINGOP record
+ * @ctx: the audit context
+ */
+static void audit_log_uring(struct audit_context *ctx)
{
- int i, call_panic = 0;
struct audit_buffer *ab;
- struct audit_aux_data *aux;
- struct audit_names *n;
-
- /* tsk == current */
- context->personality = tsk->personality;
+ const struct cred *cred;
- ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL);
+ ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_URINGOP);
if (!ab)
- return; /* audit_panic has been called */
- audit_log_format(ab, "arch=%x syscall=%d",
- context->arch, context->major);
- if (context->personality != PER_LINUX)
- audit_log_format(ab, " per=%lx", context->personality);
- if (context->return_valid)
+ return;
+ cred = current_cred();
+ audit_log_format(ab, "uring_op=%d", ctx->uring_op);
+ if (ctx->return_valid != AUDITSC_INVALID)
audit_log_format(ab, " success=%s exit=%ld",
- (context->return_valid==AUDITSC_SUCCESS)?"yes":"no",
- context->return_code);
-
+ str_yes_no(ctx->return_valid ==
+ AUDITSC_SUCCESS),
+ ctx->return_code);
audit_log_format(ab,
- " a0=%lx a1=%lx a2=%lx a3=%lx items=%d",
- context->argv[0],
- context->argv[1],
- context->argv[2],
- context->argv[3],
- context->name_count);
-
- audit_log_task_info(ab, tsk);
- audit_log_key(ab, context->filterkey);
+ " items=%d"
+ " ppid=%d pid=%d uid=%u gid=%u euid=%u suid=%u"
+ " fsuid=%u egid=%u sgid=%u fsgid=%u",
+ ctx->name_count,
+ task_ppid_nr(current), task_tgid_nr(current),
+ from_kuid(&init_user_ns, cred->uid),
+ from_kgid(&init_user_ns, cred->gid),
+ from_kuid(&init_user_ns, cred->euid),
+ from_kuid(&init_user_ns, cred->suid),
+ from_kuid(&init_user_ns, cred->fsuid),
+ from_kgid(&init_user_ns, cred->egid),
+ from_kgid(&init_user_ns, cred->sgid),
+ from_kgid(&init_user_ns, cred->fsgid));
+ audit_log_task_context(ab);
+ audit_log_key(ab, ctx->filterkey);
audit_log_end(ab);
+}
+
+static void audit_log_exit(void)
+{
+ int i, call_panic = 0;
+ struct audit_context *context = audit_context();
+ struct audit_buffer *ab;
+ struct audit_aux_data *aux;
+ struct audit_names *n;
+
+ context->personality = current->personality;
+
+ switch (context->context) {
+ case AUDIT_CTX_SYSCALL:
+ ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL);
+ if (!ab)
+ return;
+ audit_log_format(ab, "arch=%x syscall=%d",
+ context->arch, context->major);
+ if (context->personality != PER_LINUX)
+ audit_log_format(ab, " per=%lx", context->personality);
+ if (context->return_valid != AUDITSC_INVALID)
+ audit_log_format(ab, " success=%s exit=%ld",
+ str_yes_no(context->return_valid ==
+ AUDITSC_SUCCESS),
+ context->return_code);
+ audit_log_format(ab,
+ " a0=%lx a1=%lx a2=%lx a3=%lx items=%d",
+ context->argv[0],
+ context->argv[1],
+ context->argv[2],
+ context->argv[3],
+ context->name_count);
+ audit_log_task_info(ab);
+ audit_log_key(ab, context->filterkey);
+ audit_log_end(ab);
+ break;
+ case AUDIT_CTX_URING:
+ audit_log_uring(context);
+ break;
+ default:
+ BUG();
+ break;
+ }
for (aux = context->aux; aux; aux = aux->next) {
@@ -1365,6 +1702,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
case AUDIT_BPRM_FCAPS: {
struct audit_aux_data_bprm_fcaps *axs = (void *)aux;
+
audit_log_format(ab, "fver=%x", axs->fcap_ver);
audit_log_cap(ab, "fp", &axs->fcap.permitted);
audit_log_cap(ab, "fi", &axs->fcap.inheritable);
@@ -1372,9 +1710,14 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
audit_log_cap(ab, "old_pp", &axs->old_pcap.permitted);
audit_log_cap(ab, "old_pi", &axs->old_pcap.inheritable);
audit_log_cap(ab, "old_pe", &axs->old_pcap.effective);
- audit_log_cap(ab, "new_pp", &axs->new_pcap.permitted);
- audit_log_cap(ab, "new_pi", &axs->new_pcap.inheritable);
- audit_log_cap(ab, "new_pe", &axs->new_pcap.effective);
+ audit_log_cap(ab, "old_pa", &axs->old_pcap.ambient);
+ audit_log_cap(ab, "pp", &axs->new_pcap.permitted);
+ audit_log_cap(ab, "pi", &axs->new_pcap.inheritable);
+ audit_log_cap(ab, "pe", &axs->new_pcap.effective);
+ audit_log_cap(ab, "pa", &axs->new_pcap.ambient);
+ audit_log_format(ab, " frootid=%d",
+ from_kuid(&init_user_ns,
+ axs->fcap.rootid));
break; }
}
@@ -1411,7 +1754,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
axs->target_auid[i],
axs->target_uid[i],
axs->target_sessionid[i],
- axs->target_sid[i],
+ &axs->target_ref[i],
axs->target_comm[i]))
call_panic = 1;
}
@@ -1420,13 +1763,14 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
audit_log_pid_context(context, context->target_pid,
context->target_auid, context->target_uid,
context->target_sessionid,
- context->target_sid, context->target_comm))
- call_panic = 1;
+ &context->target_ref,
+ context->target_comm))
+ call_panic = 1;
if (context->pwd.dentry && context->pwd.mnt) {
ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD);
if (ab) {
- audit_log_d_path(ab, " cwd=", &context->pwd);
+ audit_log_d_path(ab, "cwd=", &context->pwd);
audit_log_end(ab);
}
}
@@ -1438,45 +1782,193 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
audit_log_name(context, n, NULL, i++, &call_panic);
}
- audit_log_proctitle(tsk, context);
+ if (context->context == AUDIT_CTX_SYSCALL)
+ audit_log_proctitle();
/* Send end of event record to help user space know we are finished */
ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE);
if (ab)
audit_log_end(ab);
if (call_panic)
- audit_panic("error converting sid to string");
+ audit_panic("error in audit_log_exit()");
}
/**
- * audit_free - free a per-task audit context
+ * __audit_free - free a per-task audit context
* @tsk: task whose audit context block to free
*
- * Called from copy_process and do_exit
+ * Called from copy_process, do_exit, and the io_uring code
*/
void __audit_free(struct task_struct *tsk)
{
- struct audit_context *context;
+ struct audit_context *context = tsk->audit_context;
- context = audit_take_context(tsk, 0, 0);
if (!context)
return;
- /* Check for system calls that do not go through the exit
- * function (e.g., exit_group), then free context block.
- * We use GFP_ATOMIC here because we might be doing this
- * in the context of the idle thread */
- /* that can happen only if we are called from do_exit() */
- if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT)
- audit_log_exit(context, tsk);
+ /* this may generate CONFIG_CHANGE records */
if (!list_empty(&context->killed_trees))
- audit_kill_trees(&context->killed_trees);
+ audit_kill_trees(context);
+ /* We are called either by do_exit() or the fork() error handling code;
+ * in the former case tsk == current and in the latter tsk is a
+ * random task_struct that doesn't have any meaningful data we
+ * need to log via audit_log_exit().
+ */
+ if (tsk == current && !context->dummy) {
+ context->return_valid = AUDITSC_INVALID;
+ context->return_code = 0;
+ if (context->context == AUDIT_CTX_SYSCALL) {
+ audit_filter_syscall(tsk, context);
+ audit_filter_inodes(tsk, context);
+ if (context->current_state == AUDIT_STATE_RECORD)
+ audit_log_exit();
+ } else if (context->context == AUDIT_CTX_URING) {
+ /* TODO: verify this case is real and valid */
+ audit_filter_uring(tsk, context);
+ audit_filter_inodes(tsk, context);
+ if (context->current_state == AUDIT_STATE_RECORD)
+ audit_log_uring(context);
+ }
+ }
+
+ audit_set_context(tsk, NULL);
audit_free_context(context);
}
/**
- * audit_syscall_entry - fill in an audit record at syscall entry
+ * audit_return_fixup - fixup the return codes in the audit_context
+ * @ctx: the audit_context
+ * @success: true/false value to indicate if the operation succeeded or not
+ * @code: operation return code
+ *
+ * We need to fixup the return code in the audit logs if the actual return
+ * codes are later going to be fixed by the arch specific signal handlers.
+ */
+static void audit_return_fixup(struct audit_context *ctx,
+ int success, long code)
+{
+ /*
+ * This is actually a test for:
+ * (rc == ERESTARTSYS ) || (rc == ERESTARTNOINTR) ||
+ * (rc == ERESTARTNOHAND) || (rc == ERESTART_RESTARTBLOCK)
+ *
+ * but is faster than a bunch of ||
+ */
+ if (unlikely(code <= -ERESTARTSYS) &&
+ (code >= -ERESTART_RESTARTBLOCK) &&
+ (code != -ENOIOCTLCMD))
+ ctx->return_code = -EINTR;
+ else
+ ctx->return_code = code;
+ ctx->return_valid = (success ? AUDITSC_SUCCESS : AUDITSC_FAILURE);
+}
+
+/**
+ * __audit_uring_entry - prepare the kernel task's audit context for io_uring
+ * @op: the io_uring opcode
+ *
+ * This is similar to audit_syscall_entry() but is intended for use by io_uring
+ * operations. This function should only ever be called from
+ * audit_uring_entry() as we rely on the audit context checking present in that
+ * function.
+ */
+void __audit_uring_entry(u8 op)
+{
+ struct audit_context *ctx = audit_context();
+
+ if (ctx->state == AUDIT_STATE_DISABLED)
+ return;
+
+ /*
+ * NOTE: It's possible that we can be called from the process' context
+ * before it returns to userspace, and before audit_syscall_exit()
+ * is called. In this case there is not much to do, just record
+ * the io_uring details and return.
+ */
+ ctx->uring_op = op;
+ if (ctx->context == AUDIT_CTX_SYSCALL)
+ return;
+
+ ctx->dummy = !audit_n_rules;
+ if (!ctx->dummy && ctx->state == AUDIT_STATE_BUILD)
+ ctx->prio = 0;
+
+ ctx->context = AUDIT_CTX_URING;
+ ctx->current_state = ctx->state;
+ ktime_get_coarse_real_ts64(&ctx->stamp.ctime);
+}
+
+/**
+ * __audit_uring_exit - wrap up the kernel task's audit context after io_uring
+ * @success: true/false value to indicate if the operation succeeded or not
+ * @code: operation return code
+ *
+ * This is similar to audit_syscall_exit() but is intended for use by io_uring
+ * operations. This function should only ever be called from
+ * audit_uring_exit() as we rely on the audit context checking present in that
+ * function.
+ */
+void __audit_uring_exit(int success, long code)
+{
+ struct audit_context *ctx = audit_context();
+
+ if (ctx->dummy) {
+ if (ctx->context != AUDIT_CTX_URING)
+ return;
+ goto out;
+ }
+
+ audit_return_fixup(ctx, success, code);
+ if (ctx->context == AUDIT_CTX_SYSCALL) {
+ /*
+ * NOTE: See the note in __audit_uring_entry() about the case
+ * where we may be called from process context before we
+ * return to userspace via audit_syscall_exit(). In this
+ * case we simply emit a URINGOP record and bail, the
+ * normal syscall exit handling will take care of
+ * everything else.
+ * It is also worth mentioning that when we are called,
+ * the current process creds may differ from the creds
+ * used during the normal syscall processing; keep that
+ * in mind if/when we move the record generation code.
+ */
+
+ /*
+ * We need to filter on the syscall info here to decide if we
+ * should emit a URINGOP record. I know it seems odd but this
+ * solves the problem where users have a filter to block *all*
+ * syscall records in the "exit" filter; we want to preserve
+ * the behavior here.
+ */
+ audit_filter_syscall(current, ctx);
+ if (ctx->current_state != AUDIT_STATE_RECORD)
+ audit_filter_uring(current, ctx);
+ audit_filter_inodes(current, ctx);
+ if (ctx->current_state != AUDIT_STATE_RECORD)
+ return;
+
+ audit_log_uring(ctx);
+ return;
+ }
+
+ /* this may generate CONFIG_CHANGE records */
+ if (!list_empty(&ctx->killed_trees))
+ audit_kill_trees(ctx);
+
+ /* run through both filters to ensure we set the filterkey properly */
+ audit_filter_uring(current, ctx);
+ audit_filter_inodes(current, ctx);
+ if (ctx->current_state != AUDIT_STATE_RECORD)
+ goto out;
+ audit_log_exit();
+
+out:
+ audit_reset_context(ctx);
+}
+
+/**
+ * __audit_syscall_entry - fill in an audit record at syscall entry
* @major: major syscall type (function)
* @a1: additional syscall register 1
* @a2: additional syscall register 2
@@ -1486,7 +1978,7 @@ void __audit_free(struct task_struct *tsk)
* Fill in audit context at syscall entry. This only happens if the
* audit context was created when the task was created and the state or
* filters demand the audit context be built. If the state from the
- * per-task filter or from the per-syscall filter is AUDIT_RECORD_CONTEXT,
+ * per-task filter or from the per-syscall filter is AUDIT_STATE_RECORD,
* then the record will be written at syscall exit time (otherwise, it
* will only be written if another part of the kernel requests that it
* be written).
@@ -1494,102 +1986,87 @@ void __audit_free(struct task_struct *tsk)
void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2,
unsigned long a3, unsigned long a4)
{
- struct task_struct *tsk = current;
- struct audit_context *context = tsk->audit_context;
+ struct audit_context *context = audit_context();
enum audit_state state;
- if (!context)
+ if (!audit_enabled || !context)
return;
- BUG_ON(context->in_syscall || context->name_count);
+ WARN_ON(context->context != AUDIT_CTX_UNUSED);
+ WARN_ON(context->name_count);
+ if (context->context != AUDIT_CTX_UNUSED || context->name_count) {
+ audit_panic("unrecoverable error in audit_syscall_entry()");
+ return;
+ }
- if (!audit_enabled)
+ state = context->state;
+ if (state == AUDIT_STATE_DISABLED)
return;
- context->arch = syscall_get_arch();
+ context->dummy = !audit_n_rules;
+ if (!context->dummy && state == AUDIT_STATE_BUILD) {
+ context->prio = 0;
+ if (auditd_test_task(current))
+ return;
+ }
+
+ context->arch = syscall_get_arch(current);
context->major = major;
context->argv[0] = a1;
context->argv[1] = a2;
context->argv[2] = a3;
context->argv[3] = a4;
-
- state = context->state;
- context->dummy = !audit_n_rules;
- if (!context->dummy && state == AUDIT_BUILD_CONTEXT) {
- context->prio = 0;
- state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]);
- }
- if (state == AUDIT_DISABLED)
- return;
-
- context->serial = 0;
- context->ctime = CURRENT_TIME;
- context->in_syscall = 1;
+ context->context = AUDIT_CTX_SYSCALL;
context->current_state = state;
- context->ppid = 0;
+ ktime_get_coarse_real_ts64(&context->stamp.ctime);
}
/**
- * audit_syscall_exit - deallocate audit context after a system call
+ * __audit_syscall_exit - deallocate audit context after a system call
* @success: success value of the syscall
* @return_code: return value of the syscall
*
* Tear down after system call. If the audit context has been marked as
- * auditable (either because of the AUDIT_RECORD_CONTEXT state from
+ * auditable (either because of the AUDIT_STATE_RECORD state from
* filtering, or because some other part of the kernel wrote an audit
* message), then write out the syscall information. In call cases,
* free the names stored from getname().
*/
void __audit_syscall_exit(int success, long return_code)
{
- struct task_struct *tsk = current;
- struct audit_context *context;
-
- if (success)
- success = AUDITSC_SUCCESS;
- else
- success = AUDITSC_FAILURE;
+ struct audit_context *context = audit_context();
- context = audit_take_context(tsk, success, return_code);
- if (!context)
- return;
+ if (!context || context->dummy ||
+ context->context != AUDIT_CTX_SYSCALL)
+ goto out;
- if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT)
- audit_log_exit(context, tsk);
+ /* this may generate CONFIG_CHANGE records */
+ if (!list_empty(&context->killed_trees))
+ audit_kill_trees(context);
- context->in_syscall = 0;
- context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
+ audit_return_fixup(context, success, return_code);
+ /* run through both filters to ensure we set the filterkey properly */
+ audit_filter_syscall(current, context);
+ audit_filter_inodes(current, context);
+ if (context->current_state != AUDIT_STATE_RECORD)
+ goto out;
- if (!list_empty(&context->killed_trees))
- audit_kill_trees(&context->killed_trees);
+ audit_log_exit();
- audit_free_names(context);
- unroll_tree_refs(context, NULL, 0);
- audit_free_aux(context);
- context->aux = NULL;
- context->aux_pids = NULL;
- context->target_pid = 0;
- context->target_sid = 0;
- context->sockaddr_len = 0;
- context->type = 0;
- context->fds[0] = -1;
- if (context->state != AUDIT_RECORD_CONTEXT) {
- kfree(context->filterkey);
- context->filterkey = NULL;
- }
- tsk->audit_context = context;
+out:
+ audit_reset_context(context);
}
static inline void handle_one(const struct inode *inode)
{
-#ifdef CONFIG_AUDIT_TREE
struct audit_context *context;
struct audit_tree_refs *p;
struct audit_chunk *chunk;
int count;
- if (likely(hlist_empty(&inode->i_fsnotify_marks)))
+
+ if (likely(!inode->i_fsnotify_marks))
return;
- context = current->audit_context;
+ context = audit_context();
p = context->trees;
count = context->tree_count;
rcu_read_lock();
@@ -1607,12 +2084,10 @@ static inline void handle_one(const struct inode *inode)
return;
}
put_tree_ref(context, chunk);
-#endif
}
static void handle_path(const struct dentry *dentry)
{
-#ifdef CONFIG_AUDIT_TREE
struct audit_context *context;
struct audit_tree_refs *p;
const struct dentry *d, *parent;
@@ -1620,7 +2095,7 @@ static void handle_path(const struct dentry *dentry)
unsigned long seq;
int count;
- context = current->audit_context;
+ context = audit_context();
p = context->trees;
count = context->tree_count;
retry:
@@ -1628,10 +2103,12 @@ retry:
d = dentry;
rcu_read_lock();
seq = read_seqbegin(&rename_lock);
- for(;;) {
+ for (;;) {
struct inode *inode = d_backing_inode(d);
- if (inode && unlikely(!hlist_empty(&inode->i_fsnotify_marks))) {
+
+ if (inode && unlikely(inode->i_fsnotify_marks)) {
struct audit_chunk *chunk;
+
chunk = audit_tree_lookup(inode);
if (chunk) {
if (unlikely(!put_tree_ref(context, chunk))) {
@@ -1665,7 +2142,6 @@ retry:
return;
}
rcu_read_unlock();
-#endif
}
static struct audit_names *audit_alloc_name(struct audit_context *context,
@@ -1683,16 +2159,18 @@ static struct audit_names *audit_alloc_name(struct audit_context *context,
aname->should_free = true;
}
- aname->ino = (unsigned long)-1;
+ aname->ino = AUDIT_INO_UNSET;
aname->type = type;
list_add_tail(&aname->list, &context->names_list);
context->name_count++;
+ if (!context->pwd.dentry)
+ get_fs_pwd(current->fs, &context->pwd);
return aname;
}
/**
- * audit_reusename - fill out filename with info from existing entry
+ * __audit_reusename - fill out filename with info from existing entry
* @uptr: userland ptr to pathname
*
* Search the audit_names list for the current audit context. If there is an
@@ -1702,22 +2180,20 @@ static struct audit_names *audit_alloc_name(struct audit_context *context,
struct filename *
__audit_reusename(const __user char *uptr)
{
- struct audit_context *context = current->audit_context;
+ struct audit_context *context = audit_context();
struct audit_names *n;
list_for_each_entry(n, &context->names_list, list) {
if (!n->name)
continue;
- if (n->name->uptr == uptr) {
- n->name->refcnt++;
- return n->name;
- }
+ if (n->name->uptr == uptr)
+ return refname(n->name);
}
return NULL;
}
/**
- * audit_getname - add a name to the list
+ * __audit_getname - add a name to the list
* @name: name to add
*
* Add a name to the list of audit names for this context.
@@ -1725,10 +2201,10 @@ __audit_reusename(const __user char *uptr)
*/
void __audit_getname(struct filename *name)
{
- struct audit_context *context = current->audit_context;
+ struct audit_context *context = audit_context();
struct audit_names *n;
- if (!context->in_syscall)
+ if (context->context == AUDIT_CTX_UNUSED)
return;
n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN);
@@ -1738,10 +2214,49 @@ void __audit_getname(struct filename *name)
n->name = name;
n->name_len = AUDIT_NAME_FULL;
name->aname = n;
- name->refcnt++;
+ refname(name);
+}
- if (!context->pwd.dentry)
- get_fs_pwd(current->fs, &context->pwd);
+static inline int audit_copy_fcaps(struct audit_names *name,
+ const struct dentry *dentry)
+{
+ struct cpu_vfs_cap_data caps;
+ int rc;
+
+ if (!dentry)
+ return 0;
+
+ rc = get_vfs_caps_from_disk(&nop_mnt_idmap, dentry, &caps);
+ if (rc)
+ return rc;
+
+ name->fcap.permitted = caps.permitted;
+ name->fcap.inheritable = caps.inheritable;
+ name->fcap.fE = !!(caps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE);
+ name->fcap.rootid = caps.rootid;
+ name->fcap_ver = (caps.magic_etc & VFS_CAP_REVISION_MASK) >>
+ VFS_CAP_REVISION_SHIFT;
+
+ return 0;
+}
+
+/* Copy inode data into an audit_names. */
+static void audit_copy_inode(struct audit_names *name,
+ const struct dentry *dentry,
+ struct inode *inode, unsigned int flags)
+{
+ name->ino = inode->i_ino;
+ name->dev = inode->i_sb->s_dev;
+ name->mode = inode->i_mode;
+ name->uid = inode->i_uid;
+ name->gid = inode->i_gid;
+ name->rdev = inode->i_rdev;
+ security_inode_getlsmprop(inode, &name->oprop);
+ if (flags & AUDIT_INODE_NOEVAL) {
+ name->fcap_ver = -1;
+ return;
+ }
+ audit_copy_fcaps(name, dentry);
}
/**
@@ -1753,14 +2268,33 @@ void __audit_getname(struct filename *name)
void __audit_inode(struct filename *name, const struct dentry *dentry,
unsigned int flags)
{
- struct audit_context *context = current->audit_context;
- const struct inode *inode = d_backing_inode(dentry);
+ struct audit_context *context = audit_context();
+ struct inode *inode = d_backing_inode(dentry);
struct audit_names *n;
bool parent = flags & AUDIT_INODE_PARENT;
+ struct audit_entry *e;
+ struct list_head *list = &audit_filter_list[AUDIT_FILTER_FS];
+ int i;
- if (!context->in_syscall)
+ if (context->context == AUDIT_CTX_UNUSED)
return;
+ rcu_read_lock();
+ list_for_each_entry_rcu(e, list, list) {
+ for (i = 0; i < e->rule.field_count; i++) {
+ struct audit_field *f = &e->rule.fields[i];
+
+ if (f->type == AUDIT_FSTYPE
+ && audit_comparator(inode->i_sb->s_magic,
+ f->op, f->val)
+ && e->rule.action == AUDIT_NEVER) {
+ rcu_read_unlock();
+ return;
+ }
+ }
+ }
+ rcu_read_unlock();
+
if (!name)
goto out_alloc;
@@ -1812,7 +2346,7 @@ out_alloc:
return;
if (name) {
n->name = name;
- name->refcnt++;
+ refname(name);
}
out:
@@ -1826,7 +2360,7 @@ out:
n->type = AUDIT_TYPE_NORMAL;
}
handle_path(dentry);
- audit_copy_inode(n, dentry, inode);
+ audit_copy_inode(n, dentry, inode, flags & AUDIT_INODE_NOEVAL);
}
void __audit_file(const struct file *file)
@@ -1848,54 +2382,70 @@ void __audit_file(const struct file *file)
* must be hooked prior, in order to capture the target inode during
* unsuccessful attempts.
*/
-void __audit_inode_child(const struct inode *parent,
+void __audit_inode_child(struct inode *parent,
const struct dentry *dentry,
const unsigned char type)
{
- struct audit_context *context = current->audit_context;
- const struct inode *inode = d_backing_inode(dentry);
- const char *dname = dentry->d_name.name;
+ struct audit_context *context = audit_context();
+ struct inode *inode = d_backing_inode(dentry);
+ const struct qstr *dname = &dentry->d_name;
struct audit_names *n, *found_parent = NULL, *found_child = NULL;
+ struct audit_entry *e;
+ struct list_head *list = &audit_filter_list[AUDIT_FILTER_FS];
+ int i;
- if (!context->in_syscall)
+ if (context->context == AUDIT_CTX_UNUSED)
return;
+ rcu_read_lock();
+ list_for_each_entry_rcu(e, list, list) {
+ for (i = 0; i < e->rule.field_count; i++) {
+ struct audit_field *f = &e->rule.fields[i];
+
+ if (f->type == AUDIT_FSTYPE
+ && audit_comparator(parent->i_sb->s_magic,
+ f->op, f->val)
+ && e->rule.action == AUDIT_NEVER) {
+ rcu_read_unlock();
+ return;
+ }
+ }
+ }
+ rcu_read_unlock();
+
if (inode)
handle_one(inode);
- /* look for a parent entry first */
list_for_each_entry(n, &context->names_list, list) {
- if (!n->name ||
- (n->type != AUDIT_TYPE_PARENT &&
- n->type != AUDIT_TYPE_UNKNOWN))
+ /* can only match entries that have a name */
+ if (!n->name)
continue;
- if (n->ino == parent->i_ino && n->dev == parent->i_sb->s_dev &&
- !audit_compare_dname_path(dname,
- n->name->name, n->name_len)) {
- if (n->type == AUDIT_TYPE_UNKNOWN)
- n->type = AUDIT_TYPE_PARENT;
+ /* look for a parent entry first */
+ if (!found_parent &&
+ (n->type == AUDIT_TYPE_PARENT || n->type == AUDIT_TYPE_UNKNOWN) &&
+ (n->ino == parent->i_ino && n->dev == parent->i_sb->s_dev &&
+ !audit_compare_dname_path(dname, n->name->name, n->name_len))) {
+ n->type = AUDIT_TYPE_PARENT;
found_parent = n;
- break;
- }
- }
-
- /* is there a matching child entry? */
- list_for_each_entry(n, &context->names_list, list) {
- /* can only match entries that have a name */
- if (!n->name ||
- (n->type != type && n->type != AUDIT_TYPE_UNKNOWN))
+ if (found_child)
+ break;
continue;
+ }
- if (!strcmp(dname, n->name->name) ||
- !audit_compare_dname_path(dname, n->name->name,
+ /* is there a matching child entry? */
+ if (!found_child &&
+ (n->type == type || n->type == AUDIT_TYPE_UNKNOWN) &&
+ (!strcmp(dname->name, n->name->name) ||
+ !audit_compare_dname_path(dname, n->name->name,
found_parent ?
found_parent->name_len :
- AUDIT_NAME_FULL)) {
+ AUDIT_NAME_FULL))) {
if (n->type == AUDIT_TYPE_UNKNOWN)
n->type = type;
found_child = n;
- break;
+ if (found_parent)
+ break;
}
}
@@ -1904,7 +2454,7 @@ void __audit_inode_child(const struct inode *parent,
n = audit_alloc_name(context, AUDIT_TYPE_PARENT);
if (!n)
return;
- audit_copy_inode(n, NULL, parent);
+ audit_copy_inode(n, NULL, parent, 0);
}
if (!found_child) {
@@ -1918,119 +2468,38 @@ void __audit_inode_child(const struct inode *parent,
if (found_parent) {
found_child->name = found_parent->name;
found_child->name_len = AUDIT_NAME_FULL;
- found_child->name->refcnt++;
+ refname(found_child->name);
}
}
if (inode)
- audit_copy_inode(found_child, dentry, inode);
+ audit_copy_inode(found_child, dentry, inode, 0);
else
- found_child->ino = (unsigned long)-1;
+ found_child->ino = AUDIT_INO_UNSET;
}
EXPORT_SYMBOL_GPL(__audit_inode_child);
/**
* auditsc_get_stamp - get local copies of audit_context values
* @ctx: audit_context for the task
- * @t: timespec to store time recorded in the audit_context
- * @serial: serial value that is recorded in the audit_context
+ * @stamp: timestamp to record
*
* Also sets the context as auditable.
*/
-int auditsc_get_stamp(struct audit_context *ctx,
- struct timespec *t, unsigned int *serial)
+int auditsc_get_stamp(struct audit_context *ctx, struct audit_stamp *stamp)
{
- if (!ctx->in_syscall)
+ if (ctx->context == AUDIT_CTX_UNUSED)
return 0;
- if (!ctx->serial)
- ctx->serial = audit_serial();
- t->tv_sec = ctx->ctime.tv_sec;
- t->tv_nsec = ctx->ctime.tv_nsec;
- *serial = ctx->serial;
+ if (!ctx->stamp.serial)
+ ctx->stamp.serial = audit_serial();
+ *stamp = ctx->stamp;
if (!ctx->prio) {
ctx->prio = 1;
- ctx->current_state = AUDIT_RECORD_CONTEXT;
+ ctx->current_state = AUDIT_STATE_RECORD;
}
return 1;
}
-/* global counter which is incremented every time something logs in */
-static atomic_t session_id = ATOMIC_INIT(0);
-
-static int audit_set_loginuid_perm(kuid_t loginuid)
-{
- /* if we are unset, we don't need privs */
- if (!audit_loginuid_set(current))
- return 0;
- /* if AUDIT_FEATURE_LOGINUID_IMMUTABLE means never ever allow a change*/
- if (is_audit_feature_set(AUDIT_FEATURE_LOGINUID_IMMUTABLE))
- return -EPERM;
- /* it is set, you need permission */
- if (!capable(CAP_AUDIT_CONTROL))
- return -EPERM;
- /* reject if this is not an unset and we don't allow that */
- if (is_audit_feature_set(AUDIT_FEATURE_ONLY_UNSET_LOGINUID) && uid_valid(loginuid))
- return -EPERM;
- return 0;
-}
-
-static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,
- unsigned int oldsessionid, unsigned int sessionid,
- int rc)
-{
- struct audit_buffer *ab;
- uid_t uid, oldloginuid, loginuid;
-
- if (!audit_enabled)
- return;
-
- uid = from_kuid(&init_user_ns, task_uid(current));
- oldloginuid = from_kuid(&init_user_ns, koldloginuid);
- loginuid = from_kuid(&init_user_ns, kloginuid),
-
- ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
- if (!ab)
- return;
- audit_log_format(ab, "pid=%d uid=%u", task_pid_nr(current), uid);
- audit_log_task_context(ab);
- audit_log_format(ab, " old-auid=%u auid=%u old-ses=%u ses=%u res=%d",
- oldloginuid, loginuid, oldsessionid, sessionid, !rc);
- audit_log_end(ab);
-}
-
-/**
- * audit_set_loginuid - set current task's audit_context loginuid
- * @loginuid: loginuid value
- *
- * Returns 0.
- *
- * Called (set) from fs/proc/base.c::proc_loginuid_write().
- */
-int audit_set_loginuid(kuid_t loginuid)
-{
- struct task_struct *task = current;
- unsigned int oldsessionid, sessionid = (unsigned int)-1;
- kuid_t oldloginuid;
- int rc;
-
- oldloginuid = audit_get_loginuid(current);
- oldsessionid = audit_get_sessionid(current);
-
- rc = audit_set_loginuid_perm(loginuid);
- if (rc)
- goto out;
-
- /* are we setting or clearing? */
- if (uid_valid(loginuid))
- sessionid = (unsigned int)atomic_inc_return(&session_id);
-
- task->sessionid = sessionid;
- task->loginuid = loginuid;
-out:
- audit_log_set_loginuid(oldloginuid, loginuid, oldsessionid, sessionid, rc);
- return rc;
-}
-
/**
* __audit_mq_open - record audit data for a POSIX MQ open
* @oflag: open flag
@@ -2040,7 +2509,7 @@ out:
*/
void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr)
{
- struct audit_context *context = current->audit_context;
+ struct audit_context *context = audit_context();
if (attr)
memcpy(&context->mq_open.attr, attr, sizeof(struct mq_attr));
@@ -2062,15 +2531,15 @@ void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr)
*
*/
void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio,
- const struct timespec *abs_timeout)
+ const struct timespec64 *abs_timeout)
{
- struct audit_context *context = current->audit_context;
- struct timespec *p = &context->mq_sendrecv.abs_timeout;
+ struct audit_context *context = audit_context();
+ struct timespec64 *p = &context->mq_sendrecv.abs_timeout;
if (abs_timeout)
- memcpy(p, abs_timeout, sizeof(struct timespec));
+ memcpy(p, abs_timeout, sizeof(*p));
else
- memset(p, 0, sizeof(struct timespec));
+ memset(p, 0, sizeof(*p));
context->mq_sendrecv.mqdes = mqdes;
context->mq_sendrecv.msg_len = msg_len;
@@ -2088,7 +2557,7 @@ void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio,
void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification)
{
- struct audit_context *context = current->audit_context;
+ struct audit_context *context = audit_context();
if (notification)
context->mq_notify.sigev_signo = notification->sigev_signo;
@@ -2107,30 +2576,32 @@ void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification)
*/
void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
{
- struct audit_context *context = current->audit_context;
+ struct audit_context *context = audit_context();
+
context->mq_getsetattr.mqdes = mqdes;
context->mq_getsetattr.mqstat = *mqstat;
context->type = AUDIT_MQ_GETSETATTR;
}
/**
- * audit_ipc_obj - record audit data for ipc object
+ * __audit_ipc_obj - record audit data for ipc object
* @ipcp: ipc permissions
*
*/
void __audit_ipc_obj(struct kern_ipc_perm *ipcp)
{
- struct audit_context *context = current->audit_context;
+ struct audit_context *context = audit_context();
+
context->ipc.uid = ipcp->uid;
context->ipc.gid = ipcp->gid;
context->ipc.mode = ipcp->mode;
context->ipc.has_perm = 0;
- security_ipc_getsecid(ipcp, &context->ipc.osid);
+ security_ipc_getlsmprop(ipcp, &context->ipc.oprop);
context->type = AUDIT_IPC;
}
/**
- * audit_ipc_set_perm - record audit data for new ipc permissions
+ * __audit_ipc_set_perm - record audit data for new ipc permissions
* @qbytes: msgq bytes
* @uid: msgq user id
* @gid: msgq group id
@@ -2140,7 +2611,7 @@ void __audit_ipc_obj(struct kern_ipc_perm *ipcp)
*/
void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode)
{
- struct audit_context *context = current->audit_context;
+ struct audit_context *context = audit_context();
context->ipc.qbytes = qbytes;
context->ipc.perm_uid = uid;
@@ -2151,7 +2622,7 @@ void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mo
void __audit_bprm(struct linux_binprm *bprm)
{
- struct audit_context *context = current->audit_context;
+ struct audit_context *context = audit_context();
context->type = AUDIT_EXECVE;
context->execve.argc = bprm->argc;
@@ -2159,14 +2630,14 @@ void __audit_bprm(struct linux_binprm *bprm)
/**
- * audit_socketcall - record audit data for sys_socketcall
+ * __audit_socketcall - record audit data for sys_socketcall
* @nargs: number of args, which should not be more than AUDITSC_ARGS.
* @args: args array
*
*/
int __audit_socketcall(int nargs, unsigned long *args)
{
- struct audit_context *context = current->audit_context;
+ struct audit_context *context = audit_context();
if (nargs <= 0 || nargs > AUDITSC_ARGS || !args)
return -EINVAL;
@@ -2184,13 +2655,14 @@ int __audit_socketcall(int nargs, unsigned long *args)
*/
void __audit_fd_pair(int fd1, int fd2)
{
- struct audit_context *context = current->audit_context;
+ struct audit_context *context = audit_context();
+
context->fds[0] = fd1;
context->fds[1] = fd2;
}
/**
- * audit_sockaddr - record audit data for sys_bind, sys_connect, sys_sendto
+ * __audit_sockaddr - record audit data for sys_bind, sys_connect, sys_sendto
* @len: data length in user space
* @a: data address in kernel space
*
@@ -2198,10 +2670,11 @@ void __audit_fd_pair(int fd1, int fd2)
*/
int __audit_sockaddr(int len, void *a)
{
- struct audit_context *context = current->audit_context;
+ struct audit_context *context = audit_context();
if (!context->sockaddr) {
void *p = kmalloc(sizeof(struct sockaddr_storage), GFP_KERNEL);
+
if (!p)
return -ENOMEM;
context->sockaddr = p;
@@ -2214,43 +2687,31 @@ int __audit_sockaddr(int len, void *a)
void __audit_ptrace(struct task_struct *t)
{
- struct audit_context *context = current->audit_context;
+ struct audit_context *context = audit_context();
- context->target_pid = task_pid_nr(t);
+ context->target_pid = task_tgid_nr(t);
context->target_auid = audit_get_loginuid(t);
context->target_uid = task_uid(t);
context->target_sessionid = audit_get_sessionid(t);
- security_task_getsecid(t, &context->target_sid);
- memcpy(context->target_comm, t->comm, TASK_COMM_LEN);
+ strscpy(context->target_comm, t->comm);
+ security_task_getlsmprop_obj(t, &context->target_ref);
}
/**
- * audit_signal_info - record signal info for shutting down audit subsystem
- * @sig: signal value
+ * audit_signal_info_syscall - record signal info for syscalls
* @t: task being signaled
*
* If the audit subsystem is being terminated, record the task (pid)
* and uid that is doing that.
*/
-int __audit_signal_info(int sig, struct task_struct *t)
+int audit_signal_info_syscall(struct task_struct *t)
{
struct audit_aux_data_pids *axp;
- struct task_struct *tsk = current;
- struct audit_context *ctx = tsk->audit_context;
- kuid_t uid = current_uid(), t_uid = task_uid(t);
-
- if (audit_pid && t->tgid == audit_pid) {
- if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) {
- audit_sig_pid = task_pid_nr(tsk);
- if (uid_valid(tsk->loginuid))
- audit_sig_uid = tsk->loginuid;
- else
- audit_sig_uid = uid;
- security_task_getsecid(tsk, &audit_sig_sid);
- }
- if (!audit_signals || audit_dummy_context())
- return 0;
- }
+ struct audit_context *ctx = audit_context();
+ kuid_t t_uid = task_uid(t);
+
+ if (!audit_signals || audit_dummy_context())
+ return 0;
/* optimize the common case by putting first signal recipient directly
* in audit_context */
@@ -2259,8 +2720,8 @@ int __audit_signal_info(int sig, struct task_struct *t)
ctx->target_auid = audit_get_loginuid(t);
ctx->target_uid = t_uid;
ctx->target_sessionid = audit_get_sessionid(t);
- security_task_getsecid(t, &ctx->target_sid);
- memcpy(ctx->target_comm, t->comm, TASK_COMM_LEN);
+ strscpy(ctx->target_comm, t->comm);
+ security_task_getlsmprop_obj(t, &ctx->target_ref);
return 0;
}
@@ -2280,8 +2741,8 @@ int __audit_signal_info(int sig, struct task_struct *t)
axp->target_auid[axp->pid_count] = audit_get_loginuid(t);
axp->target_uid[axp->pid_count] = t_uid;
axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t);
- security_task_getsecid(t, &axp->target_sid[axp->pid_count]);
- memcpy(axp->target_comm[axp->pid_count], t->comm, TASK_COMM_LEN);
+ security_task_getlsmprop_obj(t, &axp->target_ref[axp->pid_count]);
+ strscpy(axp->target_comm[axp->pid_count], t->comm);
axp->pid_count++;
return 0;
@@ -2302,7 +2763,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
const struct cred *new, const struct cred *old)
{
struct audit_aux_data_bprm_fcaps *ax;
- struct audit_context *context = current->audit_context;
+ struct audit_context *context = audit_context();
struct cpu_vfs_cap_data vcaps;
ax = kmalloc(sizeof(*ax), GFP_KERNEL);
@@ -2313,20 +2774,24 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
ax->d.next = context->aux;
context->aux = (void *)ax;
- get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps);
+ get_vfs_caps_from_disk(&nop_mnt_idmap,
+ bprm->file->f_path.dentry, &vcaps);
ax->fcap.permitted = vcaps.permitted;
ax->fcap.inheritable = vcaps.inheritable;
ax->fcap.fE = !!(vcaps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE);
+ ax->fcap.rootid = vcaps.rootid;
ax->fcap_ver = (vcaps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT;
ax->old_pcap.permitted = old->cap_permitted;
ax->old_pcap.inheritable = old->cap_inheritable;
ax->old_pcap.effective = old->cap_effective;
+ ax->old_pcap.ambient = old->cap_ambient;
ax->new_pcap.permitted = new->cap_permitted;
ax->new_pcap.inheritable = new->cap_inheritable;
ax->new_pcap.effective = new->cap_effective;
+ ax->new_pcap.ambient = new->cap_ambient;
return 0;
}
@@ -2340,22 +2805,106 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
*/
void __audit_log_capset(const struct cred *new, const struct cred *old)
{
- struct audit_context *context = current->audit_context;
- context->capset.pid = task_pid_nr(current);
+ struct audit_context *context = audit_context();
+
+ context->capset.pid = task_tgid_nr(current);
context->capset.cap.effective = new->cap_effective;
context->capset.cap.inheritable = new->cap_effective;
context->capset.cap.permitted = new->cap_permitted;
+ context->capset.cap.ambient = new->cap_ambient;
context->type = AUDIT_CAPSET;
}
void __audit_mmap_fd(int fd, int flags)
{
- struct audit_context *context = current->audit_context;
+ struct audit_context *context = audit_context();
+
context->mmap.fd = fd;
context->mmap.flags = flags;
context->type = AUDIT_MMAP;
}
+void __audit_openat2_how(struct open_how *how)
+{
+ struct audit_context *context = audit_context();
+
+ context->openat2.flags = how->flags;
+ context->openat2.mode = how->mode;
+ context->openat2.resolve = how->resolve;
+ context->type = AUDIT_OPENAT2;
+}
+
+void __audit_log_kern_module(const char *name)
+{
+ struct audit_context *context = audit_context();
+
+ context->module.name = kstrdup(name, GFP_KERNEL);
+ if (!context->module.name)
+ audit_log_lost("out of memory in __audit_log_kern_module");
+ context->type = AUDIT_KERN_MODULE;
+}
+
+void __audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar)
+{
+ /* {subj,obj}_trust values are {0,1,2}: no,yes,unknown */
+ switch (friar->hdr.type) {
+ case FAN_RESPONSE_INFO_NONE:
+ audit_log(audit_context(), GFP_KERNEL, AUDIT_FANOTIFY,
+ "resp=%u fan_type=%u fan_info=0 subj_trust=2 obj_trust=2",
+ response, FAN_RESPONSE_INFO_NONE);
+ break;
+ case FAN_RESPONSE_INFO_AUDIT_RULE:
+ audit_log(audit_context(), GFP_KERNEL, AUDIT_FANOTIFY,
+ "resp=%u fan_type=%u fan_info=%X subj_trust=%u obj_trust=%u",
+ response, friar->hdr.type, friar->rule_number,
+ friar->subj_trust, friar->obj_trust);
+ }
+}
+
+void __audit_tk_injoffset(struct timespec64 offset)
+{
+ struct audit_context *context = audit_context();
+
+ /* only set type if not already set by NTP */
+ if (!context->type)
+ context->type = AUDIT_TIME_INJOFFSET;
+ memcpy(&context->time.tk_injoffset, &offset, sizeof(offset));
+}
+
+void __audit_ntp_log(const struct audit_ntp_data *ad)
+{
+ struct audit_context *context = audit_context();
+ int type;
+
+ for (type = 0; type < AUDIT_NTP_NVALS; type++)
+ if (ad->vals[type].newval != ad->vals[type].oldval) {
+ /* unconditionally set type, overwriting TK */
+ context->type = AUDIT_TIME_ADJNTPVAL;
+ memcpy(&context->time.ntp_data, ad, sizeof(*ad));
+ break;
+ }
+}
+
+void __audit_log_nfcfg(const char *name, u8 af, unsigned int nentries,
+ enum audit_nfcfgop op, gfp_t gfp)
+{
+ struct audit_buffer *ab;
+ char comm[sizeof(current->comm)];
+
+ ab = audit_log_start(audit_context(), gfp, AUDIT_NETFILTER_CFG);
+ if (!ab)
+ return;
+ audit_log_format(ab, "table=%s family=%u entries=%u op=%s",
+ name, af, nentries, audit_nfcfgs[op].s);
+
+ audit_log_format(ab, " pid=%u", task_tgid_nr(current));
+ audit_log_task_context(ab); /* subj= */
+ audit_log_format(ab, " comm=");
+ audit_log_untrustedstring(ab, get_task_comm(comm, current));
+ audit_log_end(ab);
+}
+EXPORT_SYMBOL_GPL(__audit_log_nfcfg);
+
static void audit_log_task(struct audit_buffer *ab)
{
kuid_t auid, uid;
@@ -2373,7 +2922,7 @@ static void audit_log_task(struct audit_buffer *ab)
from_kgid(&init_user_ns, gid),
sessionid);
audit_log_task_context(ab);
- audit_log_format(ab, " pid=%d comm=", task_pid_nr(current));
+ audit_log_format(ab, " pid=%d comm=", task_tgid_nr(current));
audit_log_untrustedstring(ab, get_task_comm(comm, current));
audit_log_d_path_exe(ab, current->mm);
}
@@ -2395,32 +2944,63 @@ void audit_core_dumps(long signr)
if (signr == SIGQUIT) /* don't care for those */
return;
- ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
+ ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_ANOM_ABEND);
if (unlikely(!ab))
return;
audit_log_task(ab);
- audit_log_format(ab, " sig=%ld", signr);
+ audit_log_format(ab, " sig=%ld res=1", signr);
audit_log_end(ab);
}
-void __audit_seccomp(unsigned long syscall, long signr, int code)
+/**
+ * audit_seccomp - record information about a seccomp action
+ * @syscall: syscall number
+ * @signr: signal value
+ * @code: the seccomp action
+ *
+ * Record the information associated with a seccomp action. Event filtering for
+ * seccomp actions that are not to be logged is done in seccomp_log().
+ * Therefore, this function forces auditing independent of the audit_enabled
+ * and dummy context state because seccomp actions should be logged even when
+ * audit is not in use.
+ */
+void audit_seccomp(unsigned long syscall, long signr, int code)
{
struct audit_buffer *ab;
- ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_SECCOMP);
+ ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_SECCOMP);
if (unlikely(!ab))
return;
audit_log_task(ab);
audit_log_format(ab, " sig=%ld arch=%x syscall=%ld compat=%d ip=0x%lx code=0x%x",
- signr, syscall_get_arch(), syscall, is_compat_task(),
- KSTK_EIP(current), code);
+ signr, syscall_get_arch(current), syscall,
+ in_compat_syscall(), KSTK_EIP(current), code);
+ audit_log_end(ab);
+}
+
+void audit_seccomp_actions_logged(const char *names, const char *old_names,
+ int res)
+{
+ struct audit_buffer *ab;
+
+ if (!audit_enabled)
+ return;
+
+ ab = audit_log_start(audit_context(), GFP_KERNEL,
+ AUDIT_CONFIG_CHANGE);
+ if (unlikely(!ab))
+ return;
+
+ audit_log_format(ab,
+ "op=seccomp-logging actions=%s old-actions=%s res=%d",
+ names, old_names, res);
audit_log_end(ab);
}
struct list_head *audit_killed_trees(void)
{
- struct audit_context *ctx = current->audit_context;
- if (likely(!ctx || !ctx->in_syscall))
+ struct audit_context *ctx = audit_context();
+ if (likely(!ctx || ctx->context == AUDIT_CTX_UNUSED))
return NULL;
return &ctx->killed_trees;
}
diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c
index 1323360d90e3..2dfe66b9ed76 100644
--- a/kernel/backtracetest.c
+++ b/kernel/backtracetest.c
@@ -1,13 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Simple stack backtrace regression test module
*
* (C) Copyright 2008 Intel Corporation
* Author: Arjan van de Ven <arjan@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
*/
#include <linux/completion.h>
@@ -25,42 +21,33 @@ static void backtrace_test_normal(void)
dump_stack();
}
-static DECLARE_COMPLETION(backtrace_work);
-
-static void backtrace_test_irq_callback(unsigned long data)
+static void backtrace_test_bh_workfn(struct work_struct *work)
{
dump_stack();
- complete(&backtrace_work);
}
-static DECLARE_TASKLET(backtrace_tasklet, &backtrace_test_irq_callback, 0);
+static DECLARE_WORK(backtrace_bh_work, &backtrace_test_bh_workfn);
-static void backtrace_test_irq(void)
+static void backtrace_test_bh(void)
{
- pr_info("Testing a backtrace from irq context.\n");
+ pr_info("Testing a backtrace from BH context.\n");
pr_info("The following trace is a kernel self test and not a bug!\n");
- init_completion(&backtrace_work);
- tasklet_schedule(&backtrace_tasklet);
- wait_for_completion(&backtrace_work);
+ queue_work(system_bh_wq, &backtrace_bh_work);
+ flush_work(&backtrace_bh_work);
}
#ifdef CONFIG_STACKTRACE
static void backtrace_test_saved(void)
{
- struct stack_trace trace;
unsigned long entries[8];
+ unsigned int nr_entries;
pr_info("Testing a saved backtrace.\n");
pr_info("The following trace is a kernel self test and not a bug!\n");
- trace.nr_entries = 0;
- trace.max_entries = ARRAY_SIZE(entries);
- trace.entries = entries;
- trace.skip = 0;
-
- save_stack_trace(&trace);
- print_stack_trace(&trace, 0);
+ nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
+ stack_trace_print(entries, nr_entries, 0);
}
#else
static void backtrace_test_saved(void)
@@ -74,7 +61,7 @@ static int backtrace_regression_test(void)
pr_info("====[ backtrace testing ]===========\n");
backtrace_test_normal();
- backtrace_test_irq();
+ backtrace_test_bh();
backtrace_test_saved();
pr_info("====[ end of backtrace testing ]====\n");
@@ -87,5 +74,6 @@ static void exitf(void)
module_init(backtrace_regression_test);
module_exit(exitf);
+MODULE_DESCRIPTION("Simple stack backtrace regression test module");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
diff --git a/kernel/bounds.c b/kernel/bounds.c
index e1d1d1952bfa..02b619eb6106 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Generate definitions needed by the preprocessor.
* This code generates raw asm output which is post-processed
@@ -5,6 +6,7 @@
*/
#define __GENERATING_BOUNDS_H
+#define COMPILE_OFFSETS
/* Include headers that define the enum constants of interest */
#include <linux/page-flags.h>
#include <linux/mmzone.h>
@@ -12,14 +14,23 @@
#include <linux/log2.h>
#include <linux/spinlock_types.h>
-void foo(void)
+int main(void)
{
/* The enum constants to put into include/generated/bounds.h */
DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
#ifdef CONFIG_SMP
- DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
+ DEFINE(NR_CPUS_BITS, order_base_2(CONFIG_NR_CPUS));
#endif
DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
+#ifdef CONFIG_LRU_GEN
+ DEFINE(LRU_GEN_WIDTH, order_base_2(MAX_NR_GENS + 1));
+ DEFINE(__LRU_REFS_WIDTH, MAX_NR_TIERS - 2);
+#else
+ DEFINE(LRU_GEN_WIDTH, 0);
+ DEFINE(__LRU_REFS_WIDTH, 0);
+#endif
/* End of constants */
+
+ return 0;
}
diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig
new file mode 100644
index 000000000000..eb3de35734f0
--- /dev/null
+++ b/kernel/bpf/Kconfig
@@ -0,0 +1,104 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+# BPF interpreter that, for example, classic socket filters depend on.
+config BPF
+ bool
+ select CRYPTO_LIB_SHA256
+
+# Used by archs to tell that they support BPF JIT compiler plus which
+# flavour. Only one of the two can be selected for a specific arch since
+# eBPF JIT supersedes the cBPF JIT.
+
+# Classic BPF JIT (cBPF)
+config HAVE_CBPF_JIT
+ bool
+
+# Extended BPF JIT (eBPF)
+config HAVE_EBPF_JIT
+ bool
+
+# Used by archs to tell that they want the BPF JIT compiler enabled by
+# default for kernels that were compiled with BPF JIT support.
+config ARCH_WANT_DEFAULT_BPF_JIT
+ bool
+
+menu "BPF subsystem"
+
+config BPF_SYSCALL
+ bool "Enable bpf() system call"
+ select BPF
+ select IRQ_WORK
+ select NEED_TASKS_RCU
+ select TASKS_TRACE_RCU
+ select BINARY_PRINTF
+ select NET_SOCK_MSG if NET
+ select NET_XGRESS if NET
+ select PAGE_POOL if NET
+ default n
+ help
+ Enable the bpf() system call that allows to manipulate BPF programs
+ and maps via file descriptors.
+
+config BPF_JIT
+ bool "Enable BPF Just In Time compiler"
+ depends on BPF
+ depends on HAVE_CBPF_JIT || HAVE_EBPF_JIT
+ select EXECMEM
+ help
+ BPF programs are normally handled by a BPF interpreter. This option
+ allows the kernel to generate native code when a program is loaded
+ into the kernel. This will significantly speed-up processing of BPF
+ programs.
+
+ Note, an admin should enable this feature changing:
+ /proc/sys/net/core/bpf_jit_enable
+ /proc/sys/net/core/bpf_jit_harden (optional)
+ /proc/sys/net/core/bpf_jit_kallsyms (optional)
+
+config BPF_JIT_ALWAYS_ON
+ bool "Permanently enable BPF JIT and remove BPF interpreter"
+ depends on BPF_SYSCALL && HAVE_EBPF_JIT && BPF_JIT
+ help
+ Enables BPF JIT and removes BPF interpreter to avoid speculative
+ execution of BPF instructions by the interpreter.
+
+ When CONFIG_BPF_JIT_ALWAYS_ON is enabled, /proc/sys/net/core/bpf_jit_enable
+ is permanently set to 1 and setting any other value than that will
+ return failure.
+
+config BPF_JIT_DEFAULT_ON
+ def_bool ARCH_WANT_DEFAULT_BPF_JIT || BPF_JIT_ALWAYS_ON
+ depends on HAVE_EBPF_JIT && BPF_JIT
+
+config BPF_UNPRIV_DEFAULT_OFF
+ bool "Disable unprivileged BPF by default"
+ default y
+ depends on BPF_SYSCALL
+ help
+ Disables unprivileged BPF by default by setting the corresponding
+ /proc/sys/kernel/unprivileged_bpf_disabled knob to 2. An admin can
+ still reenable it by setting it to 0 later on, or permanently
+ disable it by setting it to 1 (from which no other transition to
+ 0 is possible anymore).
+
+ Unprivileged BPF could be used to exploit certain potential
+ speculative execution side-channel vulnerabilities on unmitigated
+ affected hardware.
+
+ If you are unsure how to answer this question, answer Y.
+
+source "kernel/bpf/preload/Kconfig"
+
+config BPF_LSM
+ bool "Enable BPF LSM Instrumentation"
+ depends on BPF_EVENTS
+ depends on BPF_SYSCALL
+ depends on SECURITY
+ depends on BPF_JIT
+ help
+ Enables instrumentation of the security hooks with BPF programs for
+ implementing dynamic MAC and Audit Policies.
+
+ If you are unsure how to answer this question, answer N.
+
+endmenu # "BPF subsystem"
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index e6983be12bd3..232cbc97434d 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,2 +1,65 @@
+# SPDX-License-Identifier: GPL-2.0
obj-y := core.o
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o arraymap.o helpers.o
+ifneq ($(CONFIG_BPF_JIT_ALWAYS_ON),y)
+# ___bpf_prog_run() needs GCSE disabled on x86; see 3193c0836f203 for details
+cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse
+endif
+CFLAGS_core.o += -Wno-override-init $(cflags-nogcse-yy)
+
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o liveness.o
+obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o
+obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o
+obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o bpf_insn_array.o
+obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
+obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o
+obj-$(CONFIG_BPF_SYSCALL) += disasm.o mprog.o
+obj-$(CONFIG_BPF_JIT) += trampoline.o
+obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o rqspinlock.o stream.o
+ifeq ($(CONFIG_MMU)$(CONFIG_64BIT),yy)
+obj-$(CONFIG_BPF_SYSCALL) += arena.o range_tree.o
+endif
+obj-$(CONFIG_BPF_JIT) += dispatcher.o
+ifeq ($(CONFIG_NET),y)
+obj-$(CONFIG_BPF_SYSCALL) += devmap.o
+obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
+obj-$(CONFIG_BPF_SYSCALL) += offload.o
+obj-$(CONFIG_BPF_SYSCALL) += net_namespace.o
+obj-$(CONFIG_BPF_SYSCALL) += tcx.o
+endif
+ifeq ($(CONFIG_PERF_EVENTS),y)
+obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
+endif
+ifeq ($(CONFIG_CGROUPS),y)
+obj-$(CONFIG_BPF_SYSCALL) += cgroup_iter.o bpf_cgrp_storage.o
+endif
+obj-$(CONFIG_CGROUP_BPF) += cgroup.o
+ifeq ($(CONFIG_INET),y)
+obj-$(CONFIG_BPF_SYSCALL) += reuseport_array.o
+endif
+ifeq ($(CONFIG_SYSFS),y)
+obj-$(CONFIG_DEBUG_INFO_BTF) += sysfs_btf.o
+endif
+ifeq ($(CONFIG_BPF_JIT),y)
+obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o
+obj-$(CONFIG_BPF_SYSCALL) += cpumask.o
+obj-${CONFIG_BPF_LSM} += bpf_lsm.o
+endif
+ifneq ($(CONFIG_CRYPTO),)
+obj-$(CONFIG_BPF_SYSCALL) += crypto.o
+endif
+obj-$(CONFIG_BPF_PRELOAD) += preload/
+
+obj-$(CONFIG_BPF_SYSCALL) += relo_core.o
+obj-$(CONFIG_BPF_SYSCALL) += btf_iter.o
+obj-$(CONFIG_BPF_SYSCALL) += btf_relocate.o
+obj-$(CONFIG_BPF_SYSCALL) += kmem_cache_iter.o
+ifeq ($(CONFIG_DMA_SHARED_BUFFER),y)
+obj-$(CONFIG_BPF_SYSCALL) += dmabuf_iter.o
+endif
+
+CFLAGS_REMOVE_percpu_freelist.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_bpf_lru_list.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_queue_stack_maps.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_lpm_trie.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_ringbuf.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_rqspinlock.o = $(CC_FLAGS_FTRACE)
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
new file mode 100644
index 000000000000..872dc0e41c65
--- /dev/null
+++ b/kernel/bpf/arena.c
@@ -0,0 +1,665 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/err.h>
+#include "linux/filter.h"
+#include <linux/btf_ids.h>
+#include <linux/vmalloc.h>
+#include <linux/pagemap.h>
+#include "range_tree.h"
+
+/*
+ * bpf_arena is a sparsely populated shared memory region between bpf program and
+ * user space process.
+ *
+ * For example on x86-64 the values could be:
+ * user_vm_start 7f7d26200000 // picked by mmap()
+ * kern_vm_start ffffc90001e69000 // picked by get_vm_area()
+ * For user space all pointers within the arena are normal 8-byte addresses.
+ * In this example 7f7d26200000 is the address of the first page (pgoff=0).
+ * The bpf program will access it as: kern_vm_start + lower_32bit_of_user_ptr
+ * (u32)7f7d26200000 -> 26200000
+ * hence
+ * ffffc90001e69000 + 26200000 == ffffc90028069000 is "pgoff=0" within 4Gb
+ * kernel memory region.
+ *
+ * BPF JITs generate the following code to access arena:
+ * mov eax, eax // eax has lower 32-bit of user pointer
+ * mov word ptr [rax + r12 + off], bx
+ * where r12 == kern_vm_start and off is s16.
+ * Hence allocate 4Gb + GUARD_SZ/2 on each side.
+ *
+ * Initially kernel vm_area and user vma are not populated.
+ * User space can fault-in any address which will insert the page
+ * into kernel and user vma.
+ * bpf program can allocate a page via bpf_arena_alloc_pages() kfunc
+ * which will insert it into kernel vm_area.
+ * The later fault-in from user space will populate that page into user vma.
+ */
+
+/* number of bytes addressable by LDX/STX insn with 16-bit 'off' field */
+#define GUARD_SZ round_up(1ull << sizeof_field(struct bpf_insn, off) * 8, PAGE_SIZE << 1)
+#define KERN_VM_SZ (SZ_4G + GUARD_SZ)
+
+struct bpf_arena {
+ struct bpf_map map;
+ u64 user_vm_start;
+ u64 user_vm_end;
+ struct vm_struct *kern_vm;
+ struct range_tree rt;
+ struct list_head vma_list;
+ struct mutex lock;
+};
+
+u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena)
+{
+ return arena ? (u64) (long) arena->kern_vm->addr + GUARD_SZ / 2 : 0;
+}
+
+u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena)
+{
+ return arena ? arena->user_vm_start : 0;
+}
+
+static long arena_map_peek_elem(struct bpf_map *map, void *value)
+{
+ return -EOPNOTSUPP;
+}
+
+static long arena_map_push_elem(struct bpf_map *map, void *value, u64 flags)
+{
+ return -EOPNOTSUPP;
+}
+
+static long arena_map_pop_elem(struct bpf_map *map, void *value)
+{
+ return -EOPNOTSUPP;
+}
+
+static long arena_map_delete_elem(struct bpf_map *map, void *value)
+{
+ return -EOPNOTSUPP;
+}
+
+static int arena_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ return -EOPNOTSUPP;
+}
+
+static long compute_pgoff(struct bpf_arena *arena, long uaddr)
+{
+ return (u32)(uaddr - (u32)arena->user_vm_start) >> PAGE_SHIFT;
+}
+
+static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
+{
+ struct vm_struct *kern_vm;
+ int numa_node = bpf_map_attr_numa_node(attr);
+ struct bpf_arena *arena;
+ u64 vm_range;
+ int err = -ENOMEM;
+
+ if (!bpf_jit_supports_arena())
+ return ERR_PTR(-EOPNOTSUPP);
+
+ if (attr->key_size || attr->value_size || attr->max_entries == 0 ||
+ /* BPF_F_MMAPABLE must be set */
+ !(attr->map_flags & BPF_F_MMAPABLE) ||
+ /* No unsupported flags present */
+ (attr->map_flags & ~(BPF_F_SEGV_ON_FAULT | BPF_F_MMAPABLE | BPF_F_NO_USER_CONV)))
+ return ERR_PTR(-EINVAL);
+
+ if (attr->map_extra & ~PAGE_MASK)
+ /* If non-zero the map_extra is an expected user VMA start address */
+ return ERR_PTR(-EINVAL);
+
+ vm_range = (u64)attr->max_entries * PAGE_SIZE;
+ if (vm_range > SZ_4G)
+ return ERR_PTR(-E2BIG);
+
+ if ((attr->map_extra >> 32) != ((attr->map_extra + vm_range - 1) >> 32))
+ /* user vma must not cross 32-bit boundary */
+ return ERR_PTR(-ERANGE);
+
+ kern_vm = get_vm_area(KERN_VM_SZ, VM_SPARSE | VM_USERMAP);
+ if (!kern_vm)
+ return ERR_PTR(-ENOMEM);
+
+ arena = bpf_map_area_alloc(sizeof(*arena), numa_node);
+ if (!arena)
+ goto err;
+
+ arena->kern_vm = kern_vm;
+ arena->user_vm_start = attr->map_extra;
+ if (arena->user_vm_start)
+ arena->user_vm_end = arena->user_vm_start + vm_range;
+
+ INIT_LIST_HEAD(&arena->vma_list);
+ bpf_map_init_from_attr(&arena->map, attr);
+ range_tree_init(&arena->rt);
+ err = range_tree_set(&arena->rt, 0, attr->max_entries);
+ if (err) {
+ bpf_map_area_free(arena);
+ goto err;
+ }
+ mutex_init(&arena->lock);
+
+ return &arena->map;
+err:
+ free_vm_area(kern_vm);
+ return ERR_PTR(err);
+}
+
+static int existing_page_cb(pte_t *ptep, unsigned long addr, void *data)
+{
+ struct page *page;
+ pte_t pte;
+
+ pte = ptep_get(ptep);
+ if (!pte_present(pte)) /* sanity check */
+ return 0;
+ page = pte_page(pte);
+ /*
+ * We do not update pte here:
+ * 1. Nobody should be accessing bpf_arena's range outside of a kernel bug
+ * 2. TLB flushing is batched or deferred. Even if we clear pte,
+ * the TLB entries can stick around and continue to permit access to
+ * the freed page. So it all relies on 1.
+ */
+ __free_page(page);
+ return 0;
+}
+
+static void arena_map_free(struct bpf_map *map)
+{
+ struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+
+ /*
+ * Check that user vma-s are not around when bpf map is freed.
+ * mmap() holds vm_file which holds bpf_map refcnt.
+ * munmap() must have happened on vma followed by arena_vm_close()
+ * which would clear arena->vma_list.
+ */
+ if (WARN_ON_ONCE(!list_empty(&arena->vma_list)))
+ return;
+
+ /*
+ * free_vm_area() calls remove_vm_area() that calls free_unmap_vmap_area().
+ * It unmaps everything from vmalloc area and clears pgtables.
+ * Call apply_to_existing_page_range() first to find populated ptes and
+ * free those pages.
+ */
+ apply_to_existing_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena),
+ KERN_VM_SZ - GUARD_SZ, existing_page_cb, NULL);
+ free_vm_area(arena->kern_vm);
+ range_tree_destroy(&arena->rt);
+ bpf_map_area_free(arena);
+}
+
+static void *arena_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ return ERR_PTR(-EINVAL);
+}
+
+static long arena_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
+{
+ return -EOPNOTSUPP;
+}
+
+static int arena_map_check_btf(const struct bpf_map *map, const struct btf *btf,
+ const struct btf_type *key_type, const struct btf_type *value_type)
+{
+ return 0;
+}
+
+static u64 arena_map_mem_usage(const struct bpf_map *map)
+{
+ return 0;
+}
+
+struct vma_list {
+ struct vm_area_struct *vma;
+ struct list_head head;
+ refcount_t mmap_count;
+};
+
+static int remember_vma(struct bpf_arena *arena, struct vm_area_struct *vma)
+{
+ struct vma_list *vml;
+
+ vml = kmalloc(sizeof(*vml), GFP_KERNEL);
+ if (!vml)
+ return -ENOMEM;
+ refcount_set(&vml->mmap_count, 1);
+ vma->vm_private_data = vml;
+ vml->vma = vma;
+ list_add(&vml->head, &arena->vma_list);
+ return 0;
+}
+
+static void arena_vm_open(struct vm_area_struct *vma)
+{
+ struct vma_list *vml = vma->vm_private_data;
+
+ refcount_inc(&vml->mmap_count);
+}
+
+static void arena_vm_close(struct vm_area_struct *vma)
+{
+ struct bpf_map *map = vma->vm_file->private_data;
+ struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+ struct vma_list *vml = vma->vm_private_data;
+
+ if (!refcount_dec_and_test(&vml->mmap_count))
+ return;
+ guard(mutex)(&arena->lock);
+ /* update link list under lock */
+ list_del(&vml->head);
+ vma->vm_private_data = NULL;
+ kfree(vml);
+}
+
+static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
+{
+ struct bpf_map *map = vmf->vma->vm_file->private_data;
+ struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+ struct page *page;
+ long kbase, kaddr;
+ int ret;
+
+ kbase = bpf_arena_get_kern_vm_start(arena);
+ kaddr = kbase + (u32)(vmf->address);
+
+ guard(mutex)(&arena->lock);
+ page = vmalloc_to_page((void *)kaddr);
+ if (page)
+ /* already have a page vmap-ed */
+ goto out;
+
+ if (arena->map.map_flags & BPF_F_SEGV_ON_FAULT)
+ /* User space requested to segfault when page is not allocated by bpf prog */
+ return VM_FAULT_SIGSEGV;
+
+ ret = range_tree_clear(&arena->rt, vmf->pgoff, 1);
+ if (ret)
+ return VM_FAULT_SIGSEGV;
+
+ /* Account into memcg of the process that created bpf_arena */
+ ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page);
+ if (ret) {
+ range_tree_set(&arena->rt, vmf->pgoff, 1);
+ return VM_FAULT_SIGSEGV;
+ }
+
+ ret = vm_area_map_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE, &page);
+ if (ret) {
+ range_tree_set(&arena->rt, vmf->pgoff, 1);
+ __free_page(page);
+ return VM_FAULT_SIGSEGV;
+ }
+out:
+ page_ref_add(page, 1);
+ vmf->page = page;
+ return 0;
+}
+
+static const struct vm_operations_struct arena_vm_ops = {
+ .open = arena_vm_open,
+ .close = arena_vm_close,
+ .fault = arena_vm_fault,
+};
+
+static unsigned long arena_get_unmapped_area(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags)
+{
+ struct bpf_map *map = filp->private_data;
+ struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+ long ret;
+
+ if (pgoff)
+ return -EINVAL;
+ if (len > SZ_4G)
+ return -E2BIG;
+
+ /* if user_vm_start was specified at arena creation time */
+ if (arena->user_vm_start) {
+ if (len > arena->user_vm_end - arena->user_vm_start)
+ return -E2BIG;
+ if (len != arena->user_vm_end - arena->user_vm_start)
+ return -EINVAL;
+ if (addr != arena->user_vm_start)
+ return -EINVAL;
+ }
+
+ ret = mm_get_unmapped_area(filp, addr, len * 2, 0, flags);
+ if (IS_ERR_VALUE(ret))
+ return ret;
+ if ((ret >> 32) == ((ret + len - 1) >> 32))
+ return ret;
+ if (WARN_ON_ONCE(arena->user_vm_start))
+ /* checks at map creation time should prevent this */
+ return -EFAULT;
+ return round_up(ret, SZ_4G);
+}
+
+static int arena_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
+{
+ struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+
+ guard(mutex)(&arena->lock);
+ if (arena->user_vm_start && arena->user_vm_start != vma->vm_start)
+ /*
+ * If map_extra was not specified at arena creation time then
+ * 1st user process can do mmap(NULL, ...) to pick user_vm_start
+ * 2nd user process must pass the same addr to mmap(addr, MAP_FIXED..);
+ * or
+ * specify addr in map_extra and
+ * use the same addr later with mmap(addr, MAP_FIXED..);
+ */
+ return -EBUSY;
+
+ if (arena->user_vm_end && arena->user_vm_end != vma->vm_end)
+ /* all user processes must have the same size of mmap-ed region */
+ return -EBUSY;
+
+ /* Earlier checks should prevent this */
+ if (WARN_ON_ONCE(vma->vm_end - vma->vm_start > SZ_4G || vma->vm_pgoff))
+ return -EFAULT;
+
+ if (remember_vma(arena, vma))
+ return -ENOMEM;
+
+ arena->user_vm_start = vma->vm_start;
+ arena->user_vm_end = vma->vm_end;
+ /*
+ * bpf_map_mmap() checks that it's being mmaped as VM_SHARED and
+ * clears VM_MAYEXEC. Set VM_DONTEXPAND as well to avoid
+ * potential change of user_vm_start.
+ */
+ vm_flags_set(vma, VM_DONTEXPAND);
+ vma->vm_ops = &arena_vm_ops;
+ return 0;
+}
+
+static int arena_map_direct_value_addr(const struct bpf_map *map, u64 *imm, u32 off)
+{
+ struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+
+ if ((u64)off > arena->user_vm_end - arena->user_vm_start)
+ return -ERANGE;
+ *imm = (unsigned long)arena->user_vm_start;
+ return 0;
+}
+
+BTF_ID_LIST_SINGLE(bpf_arena_map_btf_ids, struct, bpf_arena)
+const struct bpf_map_ops arena_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
+ .map_alloc = arena_map_alloc,
+ .map_free = arena_map_free,
+ .map_direct_value_addr = arena_map_direct_value_addr,
+ .map_mmap = arena_map_mmap,
+ .map_get_unmapped_area = arena_get_unmapped_area,
+ .map_get_next_key = arena_map_get_next_key,
+ .map_push_elem = arena_map_push_elem,
+ .map_peek_elem = arena_map_peek_elem,
+ .map_pop_elem = arena_map_pop_elem,
+ .map_lookup_elem = arena_map_lookup_elem,
+ .map_update_elem = arena_map_update_elem,
+ .map_delete_elem = arena_map_delete_elem,
+ .map_check_btf = arena_map_check_btf,
+ .map_mem_usage = arena_map_mem_usage,
+ .map_btf_id = &bpf_arena_map_btf_ids[0],
+};
+
+static u64 clear_lo32(u64 val)
+{
+ return val & ~(u64)~0U;
+}
+
+/*
+ * Allocate pages and vmap them into kernel vmalloc area.
+ * Later the pages will be mmaped into user space vma.
+ */
+static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt, int node_id)
+{
+ /* user_vm_end/start are fixed before bpf prog runs */
+ long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT;
+ u64 kern_vm_start = bpf_arena_get_kern_vm_start(arena);
+ struct page **pages;
+ long pgoff = 0;
+ u32 uaddr32;
+ int ret, i;
+
+ if (page_cnt > page_cnt_max)
+ return 0;
+
+ if (uaddr) {
+ if (uaddr & ~PAGE_MASK)
+ return 0;
+ pgoff = compute_pgoff(arena, uaddr);
+ if (pgoff > page_cnt_max - page_cnt)
+ /* requested address will be outside of user VMA */
+ return 0;
+ }
+
+ /* zeroing is needed, since alloc_pages_bulk() only fills in non-zero entries */
+ pages = kvcalloc(page_cnt, sizeof(struct page *), GFP_KERNEL);
+ if (!pages)
+ return 0;
+
+ guard(mutex)(&arena->lock);
+
+ if (uaddr) {
+ ret = is_range_tree_set(&arena->rt, pgoff, page_cnt);
+ if (ret)
+ goto out_free_pages;
+ ret = range_tree_clear(&arena->rt, pgoff, page_cnt);
+ } else {
+ ret = pgoff = range_tree_find(&arena->rt, page_cnt);
+ if (pgoff >= 0)
+ ret = range_tree_clear(&arena->rt, pgoff, page_cnt);
+ }
+ if (ret)
+ goto out_free_pages;
+
+ ret = bpf_map_alloc_pages(&arena->map, node_id, page_cnt, pages);
+ if (ret)
+ goto out;
+
+ uaddr32 = (u32)(arena->user_vm_start + pgoff * PAGE_SIZE);
+ /* Earlier checks made sure that uaddr32 + page_cnt * PAGE_SIZE - 1
+ * will not overflow 32-bit. Lower 32-bit need to represent
+ * contiguous user address range.
+ * Map these pages at kern_vm_start base.
+ * kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE - 1 can overflow
+ * lower 32-bit and it's ok.
+ */
+ ret = vm_area_map_pages(arena->kern_vm, kern_vm_start + uaddr32,
+ kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE, pages);
+ if (ret) {
+ for (i = 0; i < page_cnt; i++)
+ __free_page(pages[i]);
+ goto out;
+ }
+ kvfree(pages);
+ return clear_lo32(arena->user_vm_start) + uaddr32;
+out:
+ range_tree_set(&arena->rt, pgoff, page_cnt);
+out_free_pages:
+ kvfree(pages);
+ return 0;
+}
+
+/*
+ * If page is present in vmalloc area, unmap it from vmalloc area,
+ * unmap it from all user space vma-s,
+ * and free it.
+ */
+static void zap_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
+{
+ struct vma_list *vml;
+
+ list_for_each_entry(vml, &arena->vma_list, head)
+ zap_page_range_single(vml->vma, uaddr,
+ PAGE_SIZE * page_cnt, NULL);
+}
+
+static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
+{
+ u64 full_uaddr, uaddr_end;
+ long kaddr, pgoff, i;
+ struct page *page;
+
+ /* only aligned lower 32-bit are relevant */
+ uaddr = (u32)uaddr;
+ uaddr &= PAGE_MASK;
+ full_uaddr = clear_lo32(arena->user_vm_start) + uaddr;
+ uaddr_end = min(arena->user_vm_end, full_uaddr + (page_cnt << PAGE_SHIFT));
+ if (full_uaddr >= uaddr_end)
+ return;
+
+ page_cnt = (uaddr_end - full_uaddr) >> PAGE_SHIFT;
+
+ guard(mutex)(&arena->lock);
+
+ pgoff = compute_pgoff(arena, uaddr);
+ /* clear range */
+ range_tree_set(&arena->rt, pgoff, page_cnt);
+
+ if (page_cnt > 1)
+ /* bulk zap if multiple pages being freed */
+ zap_pages(arena, full_uaddr, page_cnt);
+
+ kaddr = bpf_arena_get_kern_vm_start(arena) + uaddr;
+ for (i = 0; i < page_cnt; i++, kaddr += PAGE_SIZE, full_uaddr += PAGE_SIZE) {
+ page = vmalloc_to_page((void *)kaddr);
+ if (!page)
+ continue;
+ if (page_cnt == 1 && page_mapped(page)) /* mapped by some user process */
+ /* Optimization for the common case of page_cnt==1:
+ * If page wasn't mapped into some user vma there
+ * is no need to call zap_pages which is slow. When
+ * page_cnt is big it's faster to do the batched zap.
+ */
+ zap_pages(arena, full_uaddr, 1);
+ vm_area_unmap_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE);
+ __free_page(page);
+ }
+}
+
+/*
+ * Reserve an arena virtual address range without populating it. This call stops
+ * bpf_arena_alloc_pages from adding pages to this range.
+ */
+static int arena_reserve_pages(struct bpf_arena *arena, long uaddr, u32 page_cnt)
+{
+ long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT;
+ long pgoff;
+ int ret;
+
+ if (uaddr & ~PAGE_MASK)
+ return 0;
+
+ pgoff = compute_pgoff(arena, uaddr);
+ if (pgoff + page_cnt > page_cnt_max)
+ return -EINVAL;
+
+ guard(mutex)(&arena->lock);
+
+ /* Cannot guard already allocated pages. */
+ ret = is_range_tree_set(&arena->rt, pgoff, page_cnt);
+ if (ret)
+ return -EBUSY;
+
+ /* "Allocate" the region to prevent it from being allocated. */
+ return range_tree_clear(&arena->rt, pgoff, page_cnt);
+}
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc void *bpf_arena_alloc_pages(void *p__map, void *addr__ign, u32 page_cnt,
+ int node_id, u64 flags)
+{
+ struct bpf_map *map = p__map;
+ struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+
+ if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt)
+ return NULL;
+
+ return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id);
+}
+
+__bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt)
+{
+ struct bpf_map *map = p__map;
+ struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+
+ if (map->map_type != BPF_MAP_TYPE_ARENA || !page_cnt || !ptr__ign)
+ return;
+ arena_free_pages(arena, (long)ptr__ign, page_cnt);
+}
+
+__bpf_kfunc int bpf_arena_reserve_pages(void *p__map, void *ptr__ign, u32 page_cnt)
+{
+ struct bpf_map *map = p__map;
+ struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+
+ if (map->map_type != BPF_MAP_TYPE_ARENA)
+ return -EINVAL;
+
+ if (!page_cnt)
+ return 0;
+
+ return arena_reserve_pages(arena, (long)ptr__ign, page_cnt);
+}
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(arena_kfuncs)
+BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_RET | KF_ARENA_ARG2)
+BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_ARG2)
+BTF_ID_FLAGS(func, bpf_arena_reserve_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_ARG2)
+BTF_KFUNCS_END(arena_kfuncs)
+
+static const struct btf_kfunc_id_set common_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &arena_kfuncs,
+};
+
+static int __init kfunc_init(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &common_kfunc_set);
+}
+late_initcall(kfunc_init);
+
+void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip)
+{
+ struct bpf_stream_stage ss;
+ struct bpf_prog *prog;
+ u64 user_vm_start;
+
+ /*
+ * The RCU read lock is held to safely traverse the latch tree, but we
+ * don't need its protection when accessing the prog, since it will not
+ * disappear while we are handling the fault.
+ */
+ rcu_read_lock();
+ prog = bpf_prog_ksym_find(fault_ip);
+ rcu_read_unlock();
+ if (!prog)
+ return;
+
+ /* Use main prog for stream access */
+ prog = prog->aux->main_prog_aux->prog;
+
+ user_vm_start = bpf_arena_get_user_vm_start(prog->aux->arena);
+ addr += clear_lo32(user_vm_start);
+
+ bpf_stream_stage(ss, prog, BPF_STDERR, ({
+ bpf_stream_printk(ss, "ERROR: Arena %s access at unmapped address 0x%lx\n",
+ write ? "WRITE" : "READ", addr);
+ bpf_stream_dump_stack(ss);
+ }));
+}
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 8a6616583f38..1eeb31c5b317 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -1,89 +1,351 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
+ * Copyright (c) 2016,2017 Facebook
*/
#include <linux/bpf.h>
+#include <linux/btf.h>
#include <linux/err.h>
-#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/mm.h>
+#include <linux/filter.h>
+#include <linux/perf_event.h>
+#include <uapi/linux/btf.h>
+#include <linux/rcupdate_trace.h>
+#include <linux/btf_ids.h>
+#include <crypto/sha2.h>
-struct bpf_array {
- struct bpf_map map;
- u32 elem_size;
- char value[0] __aligned(8);
-};
+#include "map_in_map.h"
+
+#define ARRAY_CREATE_FLAG_MASK \
+ (BPF_F_NUMA_NODE | BPF_F_MMAPABLE | BPF_F_ACCESS_MASK | \
+ BPF_F_PRESERVE_ELEMS | BPF_F_INNER_MAP)
+
+static void bpf_array_free_percpu(struct bpf_array *array)
+{
+ int i;
+
+ for (i = 0; i < array->map.max_entries; i++) {
+ free_percpu(array->pptrs[i]);
+ cond_resched();
+ }
+}
+
+static int bpf_array_alloc_percpu(struct bpf_array *array)
+{
+ void __percpu *ptr;
+ int i;
+
+ for (i = 0; i < array->map.max_entries; i++) {
+ ptr = bpf_map_alloc_percpu(&array->map, array->elem_size, 8,
+ GFP_USER | __GFP_NOWARN);
+ if (!ptr) {
+ bpf_array_free_percpu(array);
+ return -ENOMEM;
+ }
+ array->pptrs[i] = ptr;
+ cond_resched();
+ }
+
+ return 0;
+}
/* Called from syscall */
-static struct bpf_map *array_map_alloc(union bpf_attr *attr)
+int array_map_alloc_check(union bpf_attr *attr)
{
- struct bpf_array *array;
- u32 elem_size, array_size;
+ bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
+ int numa_node = bpf_map_attr_numa_node(attr);
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
- attr->value_size == 0)
- return ERR_PTR(-EINVAL);
+ attr->value_size == 0 ||
+ attr->map_flags & ~ARRAY_CREATE_FLAG_MASK ||
+ !bpf_map_flags_access_ok(attr->map_flags) ||
+ (percpu && numa_node != NUMA_NO_NODE))
+ return -EINVAL;
+
+ if (attr->map_type != BPF_MAP_TYPE_ARRAY &&
+ attr->map_flags & (BPF_F_MMAPABLE | BPF_F_INNER_MAP))
+ return -EINVAL;
+
+ if (attr->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY &&
+ attr->map_flags & BPF_F_PRESERVE_ELEMS)
+ return -EINVAL;
+
+ /* avoid overflow on round_up(map->value_size) */
+ if (attr->value_size > INT_MAX)
+ return -E2BIG;
+ /* percpu map value size is bound by PCPU_MIN_UNIT_SIZE */
+ if (percpu && round_up(attr->value_size, 8) > PCPU_MIN_UNIT_SIZE)
+ return -E2BIG;
+
+ return 0;
+}
+
+static struct bpf_map *array_map_alloc(union bpf_attr *attr)
+{
+ bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
+ int numa_node = bpf_map_attr_numa_node(attr);
+ u32 elem_size, index_mask, max_entries;
+ bool bypass_spec_v1 = bpf_bypass_spec_v1(NULL);
+ u64 array_size, mask64;
+ struct bpf_array *array;
elem_size = round_up(attr->value_size, 8);
- /* check round_up into zero and u32 overflow */
- if (elem_size == 0 ||
- attr->max_entries > (U32_MAX - sizeof(*array)) / elem_size)
- return ERR_PTR(-ENOMEM);
+ max_entries = attr->max_entries;
+
+ /* On 32 bit archs roundup_pow_of_two() with max_entries that has
+ * upper most bit set in u32 space is undefined behavior due to
+ * resulting 1U << 32, so do it manually here in u64 space.
+ */
+ mask64 = fls_long(max_entries - 1);
+ mask64 = 1ULL << mask64;
+ mask64 -= 1;
+
+ index_mask = mask64;
+ if (!bypass_spec_v1) {
+ /* round up array size to nearest power of 2,
+ * since cpu will speculate within index_mask limits
+ */
+ max_entries = index_mask + 1;
+ /* Check for overflows. */
+ if (max_entries < attr->max_entries)
+ return ERR_PTR(-E2BIG);
+ }
- array_size = sizeof(*array) + attr->max_entries * elem_size;
+ array_size = sizeof(*array);
+ if (percpu) {
+ array_size += (u64) max_entries * sizeof(void *);
+ } else {
+ /* rely on vmalloc() to return page-aligned memory and
+ * ensure array->value is exactly page-aligned
+ */
+ if (attr->map_flags & BPF_F_MMAPABLE) {
+ array_size = PAGE_ALIGN(array_size);
+ array_size += PAGE_ALIGN((u64) max_entries * elem_size);
+ } else {
+ array_size += (u64) max_entries * elem_size;
+ }
+ }
/* allocate all map elements and zero-initialize them */
- array = kzalloc(array_size, GFP_USER | __GFP_NOWARN);
- if (!array) {
- array = vzalloc(array_size);
- if (!array)
+ if (attr->map_flags & BPF_F_MMAPABLE) {
+ void *data;
+
+ /* kmalloc'ed memory can't be mmap'ed, use explicit vmalloc */
+ data = bpf_map_area_mmapable_alloc(array_size, numa_node);
+ if (!data)
return ERR_PTR(-ENOMEM);
+ array = data + PAGE_ALIGN(sizeof(struct bpf_array))
+ - offsetof(struct bpf_array, value);
+ } else {
+ array = bpf_map_area_alloc(array_size, numa_node);
}
+ if (!array)
+ return ERR_PTR(-ENOMEM);
+ array->index_mask = index_mask;
+ array->map.bypass_spec_v1 = bypass_spec_v1;
/* copy mandatory map attributes */
- array->map.key_size = attr->key_size;
- array->map.value_size = attr->value_size;
- array->map.max_entries = attr->max_entries;
-
+ bpf_map_init_from_attr(&array->map, attr);
array->elem_size = elem_size;
+ if (percpu && bpf_array_alloc_percpu(array)) {
+ bpf_map_area_free(array);
+ return ERR_PTR(-ENOMEM);
+ }
+
return &array->map;
}
+static void *array_map_elem_ptr(struct bpf_array* array, u32 index)
+{
+ return array->value + (u64)array->elem_size * index;
+}
+
/* Called from syscall or from eBPF program */
static void *array_map_lookup_elem(struct bpf_map *map, void *key)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
u32 index = *(u32 *)key;
- if (index >= array->map.max_entries)
+ if (unlikely(index >= array->map.max_entries))
return NULL;
- return array->value + array->elem_size * index;
+ return array->value + (u64)array->elem_size * (index & array->index_mask);
}
-/* Called from syscall */
-static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+static int array_map_get_hash(struct bpf_map *map, u32 hash_buf_size,
+ void *hash_buf)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+
+ sha256(array->value, (u64)array->elem_size * array->map.max_entries,
+ hash_buf);
+ memcpy(array->map.sha, hash_buf, sizeof(array->map.sha));
+ return 0;
+}
+
+static int array_map_direct_value_addr(const struct bpf_map *map, u64 *imm,
+ u32 off)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+
+ if (map->max_entries != 1)
+ return -ENOTSUPP;
+ if (off >= map->value_size)
+ return -EINVAL;
+
+ *imm = (unsigned long)array->value;
+ return 0;
+}
+
+static int array_map_direct_value_meta(const struct bpf_map *map, u64 imm,
+ u32 *off)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ u64 base = (unsigned long)array->value;
+ u64 range = array->elem_size;
+
+ if (map->max_entries != 1)
+ return -ENOTSUPP;
+ if (imm < base || imm >= base + range)
+ return -ENOENT;
+
+ *off = imm - base;
+ return 0;
+}
+
+/* emit BPF instructions equivalent to C code of array_map_lookup_elem() */
+static int array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ struct bpf_insn *insn = insn_buf;
+ u32 elem_size = array->elem_size;
+ const int ret = BPF_REG_0;
+ const int map_ptr = BPF_REG_1;
+ const int index = BPF_REG_2;
+
+ if (map->map_flags & BPF_F_INNER_MAP)
+ return -EOPNOTSUPP;
+
+ *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
+ *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
+ if (!map->bypass_spec_v1) {
+ *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 4);
+ *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask);
+ } else {
+ *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3);
+ }
+
+ if (is_power_of_2(elem_size)) {
+ *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));
+ } else {
+ *insn++ = BPF_ALU64_IMM(BPF_MUL, ret, elem_size);
+ }
+ *insn++ = BPF_ALU64_REG(BPF_ADD, ret, map_ptr);
+ *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
+ *insn++ = BPF_MOV64_IMM(ret, 0);
+ return insn - insn_buf;
+}
+
+/* Called from eBPF program */
+static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ u32 index = *(u32 *)key;
+
+ if (unlikely(index >= array->map.max_entries))
+ return NULL;
+
+ return this_cpu_ptr(array->pptrs[index & array->index_mask]);
+}
+
+/* emit BPF instructions equivalent to C code of percpu_array_map_lookup_elem() */
+static int percpu_array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ struct bpf_insn *insn = insn_buf;
+
+ if (!bpf_jit_supports_percpu_insn())
+ return -EOPNOTSUPP;
+
+ if (map->map_flags & BPF_F_INNER_MAP)
+ return -EOPNOTSUPP;
+
+ BUILD_BUG_ON(offsetof(struct bpf_array, map) != 0);
+ *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, offsetof(struct bpf_array, pptrs));
+
+ *insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_2, 0);
+ if (!map->bypass_spec_v1) {
+ *insn++ = BPF_JMP_IMM(BPF_JGE, BPF_REG_0, map->max_entries, 6);
+ *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_0, array->index_mask);
+ } else {
+ *insn++ = BPF_JMP_IMM(BPF_JGE, BPF_REG_0, map->max_entries, 5);
+ }
+
+ *insn++ = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3);
+ *insn++ = BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1);
+ *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0);
+ *insn++ = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0);
+ *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
+ *insn++ = BPF_MOV64_IMM(BPF_REG_0, 0);
+ return insn - insn_buf;
+}
+
+static void *percpu_array_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
u32 index = *(u32 *)key;
+
+ if (cpu >= nr_cpu_ids)
+ return NULL;
+
+ if (unlikely(index >= array->map.max_entries))
+ return NULL;
+
+ return per_cpu_ptr(array->pptrs[index & array->index_mask], cpu);
+}
+
+int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ u32 index = *(u32 *)key;
+ void __percpu *pptr;
+ int cpu, off = 0;
+ u32 size;
+
+ if (unlikely(index >= array->map.max_entries))
+ return -ENOENT;
+
+ /* per_cpu areas are zero-filled and bpf programs can only
+ * access 'value_size' of them, so copying rounded areas
+ * will not leak any kernel data
+ */
+ size = array->elem_size;
+ rcu_read_lock();
+ pptr = array->pptrs[index & array->index_mask];
+ for_each_possible_cpu(cpu) {
+ copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu));
+ check_and_init_map_value(map, value + off);
+ off += size;
+ }
+ rcu_read_unlock();
+ return 0;
+}
+
+/* Called from syscall */
+int bpf_array_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ u32 index = key ? *(u32 *)key : U32_MAX;
u32 *next = (u32 *)next_key;
- if (index >= array->map.max_entries) {
+ if (index >= map->max_entries) {
*next = 0;
return 0;
}
- if (index == array->map.max_entries - 1)
+ if (index == map->max_entries - 1)
return -ENOENT;
*next = index + 1;
@@ -91,66 +353,1091 @@ static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key
}
/* Called from syscall or from eBPF program */
-static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
- u64 map_flags)
+static long array_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
u32 index = *(u32 *)key;
+ char *val;
- if (map_flags > BPF_EXIST)
+ if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST))
/* unknown flags */
return -EINVAL;
- if (index >= array->map.max_entries)
+ if (unlikely(index >= array->map.max_entries))
/* all elements were pre-allocated, cannot insert a new one */
return -E2BIG;
- if (map_flags == BPF_NOEXIST)
+ if (unlikely(map_flags & BPF_NOEXIST))
/* all elements already exist */
return -EEXIST;
- memcpy(array->value + array->elem_size * index, value, array->elem_size);
+ if (unlikely((map_flags & BPF_F_LOCK) &&
+ !btf_record_has_field(map->record, BPF_SPIN_LOCK)))
+ return -EINVAL;
+
+ if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
+ val = this_cpu_ptr(array->pptrs[index & array->index_mask]);
+ copy_map_value(map, val, value);
+ bpf_obj_free_fields(array->map.record, val);
+ } else {
+ val = array->value +
+ (u64)array->elem_size * (index & array->index_mask);
+ if (map_flags & BPF_F_LOCK)
+ copy_map_value_locked(map, val, value, false);
+ else
+ copy_map_value(map, val, value);
+ bpf_obj_free_fields(array->map.record, val);
+ }
+ return 0;
+}
+
+int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ u32 index = *(u32 *)key;
+ void __percpu *pptr;
+ int cpu, off = 0;
+ u32 size;
+
+ if (unlikely(map_flags > BPF_EXIST))
+ /* unknown flags */
+ return -EINVAL;
+
+ if (unlikely(index >= array->map.max_entries))
+ /* all elements were pre-allocated, cannot insert a new one */
+ return -E2BIG;
+
+ if (unlikely(map_flags == BPF_NOEXIST))
+ /* all elements already exist */
+ return -EEXIST;
+
+ /* the user space will provide round_up(value_size, 8) bytes that
+ * will be copied into per-cpu area. bpf programs can only access
+ * value_size of it. During lookup the same extra bytes will be
+ * returned or zeros which were zero-filled by percpu_alloc,
+ * so no kernel data leaks possible
+ */
+ size = array->elem_size;
+ rcu_read_lock();
+ pptr = array->pptrs[index & array->index_mask];
+ for_each_possible_cpu(cpu) {
+ copy_map_value_long(map, per_cpu_ptr(pptr, cpu), value + off);
+ bpf_obj_free_fields(array->map.record, per_cpu_ptr(pptr, cpu));
+ off += size;
+ }
+ rcu_read_unlock();
return 0;
}
/* Called from syscall or from eBPF program */
-static int array_map_delete_elem(struct bpf_map *map, void *key)
+static long array_map_delete_elem(struct bpf_map *map, void *key)
{
return -EINVAL;
}
+static void *array_map_vmalloc_addr(struct bpf_array *array)
+{
+ return (void *)round_down((unsigned long)array, PAGE_SIZE);
+}
+
+static void array_map_free_internal_structs(struct bpf_map *map)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ int i;
+
+ /* We only free internal structs on uref dropping to zero */
+ if (!bpf_map_has_internal_structs(map))
+ return;
+
+ for (i = 0; i < array->map.max_entries; i++)
+ bpf_map_free_internal_structs(map, array_map_elem_ptr(array, i));
+}
+
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void array_map_free(struct bpf_map *map)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
+ int i;
+
+ if (!IS_ERR_OR_NULL(map->record)) {
+ if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
+ for (i = 0; i < array->map.max_entries; i++) {
+ void __percpu *pptr = array->pptrs[i & array->index_mask];
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ bpf_obj_free_fields(map->record, per_cpu_ptr(pptr, cpu));
+ cond_resched();
+ }
+ }
+ } else {
+ for (i = 0; i < array->map.max_entries; i++)
+ bpf_obj_free_fields(map->record, array_map_elem_ptr(array, i));
+ }
+ }
+
+ if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
+ bpf_array_free_percpu(array);
+
+ if (array->map.map_flags & BPF_F_MMAPABLE)
+ bpf_map_area_free(array_map_vmalloc_addr(array));
+ else
+ bpf_map_area_free(array);
+}
+
+static void array_map_seq_show_elem(struct bpf_map *map, void *key,
+ struct seq_file *m)
+{
+ void *value;
+
+ rcu_read_lock();
+
+ value = array_map_lookup_elem(map, key);
+ if (!value) {
+ rcu_read_unlock();
+ return;
+ }
+
+ if (map->btf_key_type_id)
+ seq_printf(m, "%u: ", *(u32 *)key);
+ btf_type_seq_show(map->btf, map->btf_value_type_id, value, m);
+ seq_putc(m, '\n');
+
+ rcu_read_unlock();
+}
+
+static void percpu_array_map_seq_show_elem(struct bpf_map *map, void *key,
+ struct seq_file *m)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ u32 index = *(u32 *)key;
+ void __percpu *pptr;
+ int cpu;
+
+ rcu_read_lock();
+
+ seq_printf(m, "%u: {\n", *(u32 *)key);
+ pptr = array->pptrs[index & array->index_mask];
+ for_each_possible_cpu(cpu) {
+ seq_printf(m, "\tcpu%d: ", cpu);
+ btf_type_seq_show(map->btf, map->btf_value_type_id,
+ per_cpu_ptr(pptr, cpu), m);
+ seq_putc(m, '\n');
+ }
+ seq_puts(m, "}\n");
+
+ rcu_read_unlock();
+}
+
+static int array_map_check_btf(const struct bpf_map *map,
+ const struct btf *btf,
+ const struct btf_type *key_type,
+ const struct btf_type *value_type)
+{
+ /* One exception for keyless BTF: .bss/.data/.rodata map */
+ if (btf_type_is_void(key_type)) {
+ if (map->map_type != BPF_MAP_TYPE_ARRAY ||
+ map->max_entries != 1)
+ return -EINVAL;
+
+ if (BTF_INFO_KIND(value_type->info) != BTF_KIND_DATASEC)
+ return -EINVAL;
- /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
- * so the programs (can be more than one that used this map) were
- * disconnected from events. Wait for outstanding programs to complete
- * and free the array
+ return 0;
+ }
+
+ /*
+ * Bpf array can only take a u32 key. This check makes sure
+ * that the btf matches the attr used during map_create.
*/
- synchronize_rcu();
+ if (!btf_type_is_i32(key_type))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ pgoff_t pgoff = PAGE_ALIGN(sizeof(*array)) >> PAGE_SHIFT;
+
+ if (!(map->map_flags & BPF_F_MMAPABLE))
+ return -EINVAL;
+
+ if (vma->vm_pgoff * PAGE_SIZE + (vma->vm_end - vma->vm_start) >
+ PAGE_ALIGN((u64)array->map.max_entries * array->elem_size))
+ return -EINVAL;
+
+ return remap_vmalloc_range(vma, array_map_vmalloc_addr(array),
+ vma->vm_pgoff + pgoff);
+}
+
+static bool array_map_meta_equal(const struct bpf_map *meta0,
+ const struct bpf_map *meta1)
+{
+ if (!bpf_map_meta_equal(meta0, meta1))
+ return false;
+ return meta0->map_flags & BPF_F_INNER_MAP ? true :
+ meta0->max_entries == meta1->max_entries;
+}
+
+struct bpf_iter_seq_array_map_info {
+ struct bpf_map *map;
+ void *percpu_value_buf;
+ u32 index;
+};
+
+static void *bpf_array_map_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct bpf_iter_seq_array_map_info *info = seq->private;
+ struct bpf_map *map = info->map;
+ struct bpf_array *array;
+ u32 index;
+
+ if (info->index >= map->max_entries)
+ return NULL;
+
+ if (*pos == 0)
+ ++*pos;
+ array = container_of(map, struct bpf_array, map);
+ index = info->index & array->index_mask;
+ if (info->percpu_value_buf)
+ return (void *)(uintptr_t)array->pptrs[index];
+ return array_map_elem_ptr(array, index);
+}
+
+static void *bpf_array_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct bpf_iter_seq_array_map_info *info = seq->private;
+ struct bpf_map *map = info->map;
+ struct bpf_array *array;
+ u32 index;
+
+ ++*pos;
+ ++info->index;
+ if (info->index >= map->max_entries)
+ return NULL;
- kvfree(array);
+ array = container_of(map, struct bpf_array, map);
+ index = info->index & array->index_mask;
+ if (info->percpu_value_buf)
+ return (void *)(uintptr_t)array->pptrs[index];
+ return array_map_elem_ptr(array, index);
}
-static const struct bpf_map_ops array_ops = {
+static int __bpf_array_map_seq_show(struct seq_file *seq, void *v)
+{
+ struct bpf_iter_seq_array_map_info *info = seq->private;
+ struct bpf_iter__bpf_map_elem ctx = {};
+ struct bpf_map *map = info->map;
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+ int off = 0, cpu = 0;
+ void __percpu *pptr;
+ u32 size;
+
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, v == NULL);
+ if (!prog)
+ return 0;
+
+ ctx.meta = &meta;
+ ctx.map = info->map;
+ if (v) {
+ ctx.key = &info->index;
+
+ if (!info->percpu_value_buf) {
+ ctx.value = v;
+ } else {
+ pptr = (void __percpu *)(uintptr_t)v;
+ size = array->elem_size;
+ for_each_possible_cpu(cpu) {
+ copy_map_value_long(map, info->percpu_value_buf + off,
+ per_cpu_ptr(pptr, cpu));
+ check_and_init_map_value(map, info->percpu_value_buf + off);
+ off += size;
+ }
+ ctx.value = info->percpu_value_buf;
+ }
+ }
+
+ return bpf_iter_run_prog(prog, &ctx);
+}
+
+static int bpf_array_map_seq_show(struct seq_file *seq, void *v)
+{
+ return __bpf_array_map_seq_show(seq, v);
+}
+
+static void bpf_array_map_seq_stop(struct seq_file *seq, void *v)
+{
+ if (!v)
+ (void)__bpf_array_map_seq_show(seq, NULL);
+}
+
+static int bpf_iter_init_array_map(void *priv_data,
+ struct bpf_iter_aux_info *aux)
+{
+ struct bpf_iter_seq_array_map_info *seq_info = priv_data;
+ struct bpf_map *map = aux->map;
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ void *value_buf;
+ u32 buf_size;
+
+ if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
+ buf_size = array->elem_size * num_possible_cpus();
+ value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN);
+ if (!value_buf)
+ return -ENOMEM;
+
+ seq_info->percpu_value_buf = value_buf;
+ }
+
+ /* bpf_iter_attach_map() acquires a map uref, and the uref may be
+ * released before or in the middle of iterating map elements, so
+ * acquire an extra map uref for iterator.
+ */
+ bpf_map_inc_with_uref(map);
+ seq_info->map = map;
+ return 0;
+}
+
+static void bpf_iter_fini_array_map(void *priv_data)
+{
+ struct bpf_iter_seq_array_map_info *seq_info = priv_data;
+
+ bpf_map_put_with_uref(seq_info->map);
+ kfree(seq_info->percpu_value_buf);
+}
+
+static const struct seq_operations bpf_array_map_seq_ops = {
+ .start = bpf_array_map_seq_start,
+ .next = bpf_array_map_seq_next,
+ .stop = bpf_array_map_seq_stop,
+ .show = bpf_array_map_seq_show,
+};
+
+static const struct bpf_iter_seq_info iter_seq_info = {
+ .seq_ops = &bpf_array_map_seq_ops,
+ .init_seq_private = bpf_iter_init_array_map,
+ .fini_seq_private = bpf_iter_fini_array_map,
+ .seq_priv_size = sizeof(struct bpf_iter_seq_array_map_info),
+};
+
+static long bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback_fn,
+ void *callback_ctx, u64 flags)
+{
+ u32 i, key, num_elems = 0;
+ struct bpf_array *array;
+ bool is_percpu;
+ u64 ret = 0;
+ void *val;
+
+ cant_migrate();
+
+ if (flags != 0)
+ return -EINVAL;
+
+ is_percpu = map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
+ array = container_of(map, struct bpf_array, map);
+ for (i = 0; i < map->max_entries; i++) {
+ if (is_percpu)
+ val = this_cpu_ptr(array->pptrs[i]);
+ else
+ val = array_map_elem_ptr(array, i);
+ num_elems++;
+ key = i;
+ ret = callback_fn((u64)(long)map, (u64)(long)&key,
+ (u64)(long)val, (u64)(long)callback_ctx, 0);
+ /* return value: 0 - continue, 1 - stop and return */
+ if (ret)
+ break;
+ }
+
+ return num_elems;
+}
+
+static u64 array_map_mem_usage(const struct bpf_map *map)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ bool percpu = map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
+ u32 elem_size = array->elem_size;
+ u64 entries = map->max_entries;
+ u64 usage = sizeof(*array);
+
+ if (percpu) {
+ usage += entries * sizeof(void *);
+ usage += entries * elem_size * num_possible_cpus();
+ } else {
+ if (map->map_flags & BPF_F_MMAPABLE) {
+ usage = PAGE_ALIGN(usage);
+ usage += PAGE_ALIGN(entries * elem_size);
+ } else {
+ usage += entries * elem_size;
+ }
+ }
+ return usage;
+}
+
+BTF_ID_LIST_SINGLE(array_map_btf_ids, struct, bpf_array)
+const struct bpf_map_ops array_map_ops = {
+ .map_meta_equal = array_map_meta_equal,
+ .map_alloc_check = array_map_alloc_check,
.map_alloc = array_map_alloc,
.map_free = array_map_free,
- .map_get_next_key = array_map_get_next_key,
+ .map_get_next_key = bpf_array_get_next_key,
+ .map_release_uref = array_map_free_internal_structs,
.map_lookup_elem = array_map_lookup_elem,
.map_update_elem = array_map_update_elem,
.map_delete_elem = array_map_delete_elem,
+ .map_gen_lookup = array_map_gen_lookup,
+ .map_direct_value_addr = array_map_direct_value_addr,
+ .map_direct_value_meta = array_map_direct_value_meta,
+ .map_mmap = array_map_mmap,
+ .map_seq_show_elem = array_map_seq_show_elem,
+ .map_check_btf = array_map_check_btf,
+ .map_lookup_batch = generic_map_lookup_batch,
+ .map_update_batch = generic_map_update_batch,
+ .map_set_for_each_callback_args = map_set_for_each_callback_args,
+ .map_for_each_callback = bpf_for_each_array_elem,
+ .map_mem_usage = array_map_mem_usage,
+ .map_btf_id = &array_map_btf_ids[0],
+ .iter_seq_info = &iter_seq_info,
+ .map_get_hash = &array_map_get_hash,
};
-static struct bpf_map_type_list array_type __read_mostly = {
- .ops = &array_ops,
- .type = BPF_MAP_TYPE_ARRAY,
+const struct bpf_map_ops percpu_array_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
+ .map_alloc_check = array_map_alloc_check,
+ .map_alloc = array_map_alloc,
+ .map_free = array_map_free,
+ .map_get_next_key = bpf_array_get_next_key,
+ .map_lookup_elem = percpu_array_map_lookup_elem,
+ .map_gen_lookup = percpu_array_map_gen_lookup,
+ .map_update_elem = array_map_update_elem,
+ .map_delete_elem = array_map_delete_elem,
+ .map_lookup_percpu_elem = percpu_array_map_lookup_percpu_elem,
+ .map_seq_show_elem = percpu_array_map_seq_show_elem,
+ .map_check_btf = array_map_check_btf,
+ .map_lookup_batch = generic_map_lookup_batch,
+ .map_update_batch = generic_map_update_batch,
+ .map_set_for_each_callback_args = map_set_for_each_callback_args,
+ .map_for_each_callback = bpf_for_each_array_elem,
+ .map_mem_usage = array_map_mem_usage,
+ .map_btf_id = &array_map_btf_ids[0],
+ .iter_seq_info = &iter_seq_info,
};
-static int __init register_array_map(void)
+static int fd_array_map_alloc_check(union bpf_attr *attr)
{
- bpf_register_map_type(&array_type);
+ /* only file descriptors can be stored in this type of map */
+ if (attr->value_size != sizeof(u32))
+ return -EINVAL;
+ /* Program read-only/write-only not supported for special maps yet. */
+ if (attr->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG))
+ return -EINVAL;
+ return array_map_alloc_check(attr);
+}
+
+static void fd_array_map_free(struct bpf_map *map)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ int i;
+
+ /* make sure it's empty */
+ for (i = 0; i < array->map.max_entries; i++)
+ BUG_ON(array->ptrs[i] != NULL);
+
+ bpf_map_area_free(array);
+}
+
+static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+/* only called from syscall */
+int bpf_fd_array_map_lookup_elem(struct bpf_map *map, void *key, u32 *value)
+{
+ void **elem, *ptr;
+ int ret = 0;
+
+ if (!map->ops->map_fd_sys_lookup_elem)
+ return -ENOTSUPP;
+
+ rcu_read_lock();
+ elem = array_map_lookup_elem(map, key);
+ if (elem && (ptr = READ_ONCE(*elem)))
+ *value = map->ops->map_fd_sys_lookup_elem(ptr);
+ else
+ ret = -ENOENT;
+ rcu_read_unlock();
+
+ return ret;
+}
+
+/* only called from syscall */
+int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
+ void *key, void *value, u64 map_flags)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ void *new_ptr, *old_ptr;
+ u32 index = *(u32 *)key, ufd;
+
+ if (map_flags != BPF_ANY)
+ return -EINVAL;
+
+ if (index >= array->map.max_entries)
+ return -E2BIG;
+
+ ufd = *(u32 *)value;
+ new_ptr = map->ops->map_fd_get_ptr(map, map_file, ufd);
+ if (IS_ERR(new_ptr))
+ return PTR_ERR(new_ptr);
+
+ if (map->ops->map_poke_run) {
+ mutex_lock(&array->aux->poke_mutex);
+ old_ptr = xchg(array->ptrs + index, new_ptr);
+ map->ops->map_poke_run(map, index, old_ptr, new_ptr);
+ mutex_unlock(&array->aux->poke_mutex);
+ } else {
+ old_ptr = xchg(array->ptrs + index, new_ptr);
+ }
+
+ if (old_ptr)
+ map->ops->map_fd_put_ptr(map, old_ptr, true);
return 0;
}
-late_initcall(register_array_map);
+
+static long __fd_array_map_delete_elem(struct bpf_map *map, void *key, bool need_defer)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ void *old_ptr;
+ u32 index = *(u32 *)key;
+
+ if (index >= array->map.max_entries)
+ return -E2BIG;
+
+ if (map->ops->map_poke_run) {
+ mutex_lock(&array->aux->poke_mutex);
+ old_ptr = xchg(array->ptrs + index, NULL);
+ map->ops->map_poke_run(map, index, old_ptr, NULL);
+ mutex_unlock(&array->aux->poke_mutex);
+ } else {
+ old_ptr = xchg(array->ptrs + index, NULL);
+ }
+
+ if (old_ptr) {
+ map->ops->map_fd_put_ptr(map, old_ptr, need_defer);
+ return 0;
+ } else {
+ return -ENOENT;
+ }
+}
+
+static long fd_array_map_delete_elem(struct bpf_map *map, void *key)
+{
+ return __fd_array_map_delete_elem(map, key, true);
+}
+
+static void *prog_fd_array_get_ptr(struct bpf_map *map,
+ struct file *map_file, int fd)
+{
+ struct bpf_prog *prog = bpf_prog_get(fd);
+ bool is_extended;
+
+ if (IS_ERR(prog))
+ return prog;
+
+ if (prog->type == BPF_PROG_TYPE_EXT ||
+ !bpf_prog_map_compatible(map, prog)) {
+ bpf_prog_put(prog);
+ return ERR_PTR(-EINVAL);
+ }
+
+ mutex_lock(&prog->aux->ext_mutex);
+ is_extended = prog->aux->is_extended;
+ if (!is_extended)
+ prog->aux->prog_array_member_cnt++;
+ mutex_unlock(&prog->aux->ext_mutex);
+ if (is_extended) {
+ /* Extended prog can not be tail callee. It's to prevent a
+ * potential infinite loop like:
+ * tail callee prog entry -> tail callee prog subprog ->
+ * freplace prog entry --tailcall-> tail callee prog entry.
+ */
+ bpf_prog_put(prog);
+ return ERR_PTR(-EBUSY);
+ }
+
+ return prog;
+}
+
+static void prog_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer)
+{
+ struct bpf_prog *prog = ptr;
+
+ mutex_lock(&prog->aux->ext_mutex);
+ prog->aux->prog_array_member_cnt--;
+ mutex_unlock(&prog->aux->ext_mutex);
+ /* bpf_prog is freed after one RCU or tasks trace grace period */
+ bpf_prog_put(prog);
+}
+
+static u32 prog_fd_array_sys_lookup_elem(void *ptr)
+{
+ return ((struct bpf_prog *)ptr)->aux->id;
+}
+
+/* decrement refcnt of all bpf_progs that are stored in this map */
+static void bpf_fd_array_map_clear(struct bpf_map *map, bool need_defer)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ int i;
+
+ for (i = 0; i < array->map.max_entries; i++)
+ __fd_array_map_delete_elem(map, &i, need_defer);
+}
+
+static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key,
+ struct seq_file *m)
+{
+ void **elem, *ptr;
+ u32 prog_id;
+
+ rcu_read_lock();
+
+ elem = array_map_lookup_elem(map, key);
+ if (elem) {
+ ptr = READ_ONCE(*elem);
+ if (ptr) {
+ seq_printf(m, "%u: ", *(u32 *)key);
+ prog_id = prog_fd_array_sys_lookup_elem(ptr);
+ btf_type_seq_show(map->btf, map->btf_value_type_id,
+ &prog_id, m);
+ seq_putc(m, '\n');
+ }
+ }
+
+ rcu_read_unlock();
+}
+
+struct prog_poke_elem {
+ struct list_head list;
+ struct bpf_prog_aux *aux;
+};
+
+static int prog_array_map_poke_track(struct bpf_map *map,
+ struct bpf_prog_aux *prog_aux)
+{
+ struct prog_poke_elem *elem;
+ struct bpf_array_aux *aux;
+ int ret = 0;
+
+ aux = container_of(map, struct bpf_array, map)->aux;
+ mutex_lock(&aux->poke_mutex);
+ list_for_each_entry(elem, &aux->poke_progs, list) {
+ if (elem->aux == prog_aux)
+ goto out;
+ }
+
+ elem = kmalloc(sizeof(*elem), GFP_KERNEL);
+ if (!elem) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ INIT_LIST_HEAD(&elem->list);
+ /* We must track the program's aux info at this point in time
+ * since the program pointer itself may not be stable yet, see
+ * also comment in prog_array_map_poke_run().
+ */
+ elem->aux = prog_aux;
+
+ list_add_tail(&elem->list, &aux->poke_progs);
+out:
+ mutex_unlock(&aux->poke_mutex);
+ return ret;
+}
+
+static void prog_array_map_poke_untrack(struct bpf_map *map,
+ struct bpf_prog_aux *prog_aux)
+{
+ struct prog_poke_elem *elem, *tmp;
+ struct bpf_array_aux *aux;
+
+ aux = container_of(map, struct bpf_array, map)->aux;
+ mutex_lock(&aux->poke_mutex);
+ list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) {
+ if (elem->aux == prog_aux) {
+ list_del_init(&elem->list);
+ kfree(elem);
+ break;
+ }
+ }
+ mutex_unlock(&aux->poke_mutex);
+}
+
+void __weak bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke,
+ struct bpf_prog *new, struct bpf_prog *old)
+{
+ WARN_ON_ONCE(1);
+}
+
+static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
+ struct bpf_prog *old,
+ struct bpf_prog *new)
+{
+ struct prog_poke_elem *elem;
+ struct bpf_array_aux *aux;
+
+ aux = container_of(map, struct bpf_array, map)->aux;
+ WARN_ON_ONCE(!mutex_is_locked(&aux->poke_mutex));
+
+ list_for_each_entry(elem, &aux->poke_progs, list) {
+ struct bpf_jit_poke_descriptor *poke;
+ int i;
+
+ for (i = 0; i < elem->aux->size_poke_tab; i++) {
+ poke = &elem->aux->poke_tab[i];
+
+ /* Few things to be aware of:
+ *
+ * 1) We can only ever access aux in this context, but
+ * not aux->prog since it might not be stable yet and
+ * there could be danger of use after free otherwise.
+ * 2) Initially when we start tracking aux, the program
+ * is not JITed yet and also does not have a kallsyms
+ * entry. We skip these as poke->tailcall_target_stable
+ * is not active yet. The JIT will do the final fixup
+ * before setting it stable. The various
+ * poke->tailcall_target_stable are successively
+ * activated, so tail call updates can arrive from here
+ * while JIT is still finishing its final fixup for
+ * non-activated poke entries.
+ * 3) Also programs reaching refcount of zero while patching
+ * is in progress is okay since we're protected under
+ * poke_mutex and untrack the programs before the JIT
+ * buffer is freed.
+ */
+ if (!READ_ONCE(poke->tailcall_target_stable))
+ continue;
+ if (poke->reason != BPF_POKE_REASON_TAIL_CALL)
+ continue;
+ if (poke->tail_call.map != map ||
+ poke->tail_call.key != key)
+ continue;
+
+ bpf_arch_poke_desc_update(poke, new, old);
+ }
+ }
+}
+
+static void prog_array_map_clear_deferred(struct work_struct *work)
+{
+ struct bpf_map *map = container_of(work, struct bpf_array_aux,
+ work)->map;
+ bpf_fd_array_map_clear(map, true);
+ bpf_map_put(map);
+}
+
+static void prog_array_map_clear(struct bpf_map *map)
+{
+ struct bpf_array_aux *aux = container_of(map, struct bpf_array,
+ map)->aux;
+ bpf_map_inc(map);
+ schedule_work(&aux->work);
+}
+
+static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_array_aux *aux;
+ struct bpf_map *map;
+
+ aux = kzalloc(sizeof(*aux), GFP_KERNEL_ACCOUNT);
+ if (!aux)
+ return ERR_PTR(-ENOMEM);
+
+ INIT_WORK(&aux->work, prog_array_map_clear_deferred);
+ INIT_LIST_HEAD(&aux->poke_progs);
+ mutex_init(&aux->poke_mutex);
+
+ map = array_map_alloc(attr);
+ if (IS_ERR(map)) {
+ kfree(aux);
+ return map;
+ }
+
+ container_of(map, struct bpf_array, map)->aux = aux;
+ aux->map = map;
+
+ return map;
+}
+
+static void prog_array_map_free(struct bpf_map *map)
+{
+ struct prog_poke_elem *elem, *tmp;
+ struct bpf_array_aux *aux;
+
+ aux = container_of(map, struct bpf_array, map)->aux;
+ list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) {
+ list_del_init(&elem->list);
+ kfree(elem);
+ }
+ kfree(aux);
+ fd_array_map_free(map);
+}
+
+/* prog_array->aux->{type,jited} is a runtime binding.
+ * Doing static check alone in the verifier is not enough.
+ * Thus, prog_array_map cannot be used as an inner_map
+ * and map_meta_equal is not implemented.
+ */
+const struct bpf_map_ops prog_array_map_ops = {
+ .map_alloc_check = fd_array_map_alloc_check,
+ .map_alloc = prog_array_map_alloc,
+ .map_free = prog_array_map_free,
+ .map_poke_track = prog_array_map_poke_track,
+ .map_poke_untrack = prog_array_map_poke_untrack,
+ .map_poke_run = prog_array_map_poke_run,
+ .map_get_next_key = bpf_array_get_next_key,
+ .map_lookup_elem = fd_array_map_lookup_elem,
+ .map_delete_elem = fd_array_map_delete_elem,
+ .map_fd_get_ptr = prog_fd_array_get_ptr,
+ .map_fd_put_ptr = prog_fd_array_put_ptr,
+ .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem,
+ .map_release_uref = prog_array_map_clear,
+ .map_seq_show_elem = prog_array_map_seq_show_elem,
+ .map_mem_usage = array_map_mem_usage,
+ .map_btf_id = &array_map_btf_ids[0],
+};
+
+static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
+ struct file *map_file)
+{
+ struct bpf_event_entry *ee;
+
+ ee = kzalloc(sizeof(*ee), GFP_KERNEL);
+ if (ee) {
+ ee->event = perf_file->private_data;
+ ee->perf_file = perf_file;
+ ee->map_file = map_file;
+ }
+
+ return ee;
+}
+
+static void __bpf_event_entry_free(struct rcu_head *rcu)
+{
+ struct bpf_event_entry *ee;
+
+ ee = container_of(rcu, struct bpf_event_entry, rcu);
+ fput(ee->perf_file);
+ kfree(ee);
+}
+
+static void bpf_event_entry_free_rcu(struct bpf_event_entry *ee)
+{
+ call_rcu(&ee->rcu, __bpf_event_entry_free);
+}
+
+static void *perf_event_fd_array_get_ptr(struct bpf_map *map,
+ struct file *map_file, int fd)
+{
+ struct bpf_event_entry *ee;
+ struct perf_event *event;
+ struct file *perf_file;
+ u64 value;
+
+ perf_file = perf_event_get(fd);
+ if (IS_ERR(perf_file))
+ return perf_file;
+
+ ee = ERR_PTR(-EOPNOTSUPP);
+ event = perf_file->private_data;
+ if (perf_event_read_local(event, &value, NULL, NULL) == -EOPNOTSUPP)
+ goto err_out;
+
+ ee = bpf_event_entry_gen(perf_file, map_file);
+ if (ee)
+ return ee;
+ ee = ERR_PTR(-ENOMEM);
+err_out:
+ fput(perf_file);
+ return ee;
+}
+
+static void perf_event_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer)
+{
+ /* bpf_perf_event is freed after one RCU grace period */
+ bpf_event_entry_free_rcu(ptr);
+}
+
+static void perf_event_fd_array_release(struct bpf_map *map,
+ struct file *map_file)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ struct bpf_event_entry *ee;
+ int i;
+
+ if (map->map_flags & BPF_F_PRESERVE_ELEMS)
+ return;
+
+ rcu_read_lock();
+ for (i = 0; i < array->map.max_entries; i++) {
+ ee = READ_ONCE(array->ptrs[i]);
+ if (ee && ee->map_file == map_file)
+ __fd_array_map_delete_elem(map, &i, true);
+ }
+ rcu_read_unlock();
+}
+
+static void perf_event_fd_array_map_free(struct bpf_map *map)
+{
+ if (map->map_flags & BPF_F_PRESERVE_ELEMS)
+ bpf_fd_array_map_clear(map, false);
+ fd_array_map_free(map);
+}
+
+const struct bpf_map_ops perf_event_array_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
+ .map_alloc_check = fd_array_map_alloc_check,
+ .map_alloc = array_map_alloc,
+ .map_free = perf_event_fd_array_map_free,
+ .map_get_next_key = bpf_array_get_next_key,
+ .map_lookup_elem = fd_array_map_lookup_elem,
+ .map_delete_elem = fd_array_map_delete_elem,
+ .map_fd_get_ptr = perf_event_fd_array_get_ptr,
+ .map_fd_put_ptr = perf_event_fd_array_put_ptr,
+ .map_release = perf_event_fd_array_release,
+ .map_check_btf = map_check_no_btf,
+ .map_mem_usage = array_map_mem_usage,
+ .map_btf_id = &array_map_btf_ids[0],
+};
+
+#ifdef CONFIG_CGROUPS
+static void *cgroup_fd_array_get_ptr(struct bpf_map *map,
+ struct file *map_file /* not used */,
+ int fd)
+{
+ return cgroup_get_from_fd(fd);
+}
+
+static void cgroup_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer)
+{
+ /* cgroup_put free cgrp after a rcu grace period */
+ cgroup_put(ptr);
+}
+
+static void cgroup_fd_array_free(struct bpf_map *map)
+{
+ bpf_fd_array_map_clear(map, false);
+ fd_array_map_free(map);
+}
+
+const struct bpf_map_ops cgroup_array_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
+ .map_alloc_check = fd_array_map_alloc_check,
+ .map_alloc = array_map_alloc,
+ .map_free = cgroup_fd_array_free,
+ .map_get_next_key = bpf_array_get_next_key,
+ .map_lookup_elem = fd_array_map_lookup_elem,
+ .map_delete_elem = fd_array_map_delete_elem,
+ .map_fd_get_ptr = cgroup_fd_array_get_ptr,
+ .map_fd_put_ptr = cgroup_fd_array_put_ptr,
+ .map_check_btf = map_check_no_btf,
+ .map_mem_usage = array_map_mem_usage,
+ .map_btf_id = &array_map_btf_ids[0],
+};
+#endif
+
+static struct bpf_map *array_of_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_map *map, *inner_map_meta;
+
+ inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd);
+ if (IS_ERR(inner_map_meta))
+ return inner_map_meta;
+
+ map = array_map_alloc(attr);
+ if (IS_ERR(map)) {
+ bpf_map_meta_free(inner_map_meta);
+ return map;
+ }
+
+ map->inner_map_meta = inner_map_meta;
+
+ return map;
+}
+
+static void array_of_map_free(struct bpf_map *map)
+{
+ /* map->inner_map_meta is only accessed by syscall which
+ * is protected by fdget/fdput.
+ */
+ bpf_map_meta_free(map->inner_map_meta);
+ bpf_fd_array_map_clear(map, false);
+ fd_array_map_free(map);
+}
+
+static void *array_of_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_map **inner_map = array_map_lookup_elem(map, key);
+
+ if (!inner_map)
+ return NULL;
+
+ return READ_ONCE(*inner_map);
+}
+
+static int array_of_map_gen_lookup(struct bpf_map *map,
+ struct bpf_insn *insn_buf)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ u32 elem_size = array->elem_size;
+ struct bpf_insn *insn = insn_buf;
+ const int ret = BPF_REG_0;
+ const int map_ptr = BPF_REG_1;
+ const int index = BPF_REG_2;
+
+ *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
+ *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
+ if (!map->bypass_spec_v1) {
+ *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 6);
+ *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask);
+ } else {
+ *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5);
+ }
+ if (is_power_of_2(elem_size))
+ *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));
+ else
+ *insn++ = BPF_ALU64_IMM(BPF_MUL, ret, elem_size);
+ *insn++ = BPF_ALU64_REG(BPF_ADD, ret, map_ptr);
+ *insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0);
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
+ *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
+ *insn++ = BPF_MOV64_IMM(ret, 0);
+
+ return insn - insn_buf;
+}
+
+const struct bpf_map_ops array_of_maps_map_ops = {
+ .map_alloc_check = fd_array_map_alloc_check,
+ .map_alloc = array_of_map_alloc,
+ .map_free = array_of_map_free,
+ .map_get_next_key = bpf_array_get_next_key,
+ .map_lookup_elem = array_of_map_lookup_elem,
+ .map_delete_elem = fd_array_map_delete_elem,
+ .map_fd_get_ptr = bpf_map_fd_get_ptr,
+ .map_fd_put_ptr = bpf_map_fd_put_ptr,
+ .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
+ .map_gen_lookup = array_of_map_gen_lookup,
+ .map_lookup_batch = generic_map_lookup_batch,
+ .map_update_batch = generic_map_update_batch,
+ .map_check_btf = map_check_no_btf,
+ .map_mem_usage = array_map_mem_usage,
+ .map_btf_id = &array_map_btf_ids[0],
+};
diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c
new file mode 100644
index 000000000000..35e1ddca74d2
--- /dev/null
+++ b/kernel/bpf/bloom_filter.c
@@ -0,0 +1,219 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+
+#include <linux/bitmap.h>
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/err.h>
+#include <linux/jhash.h>
+#include <linux/random.h>
+#include <linux/btf_ids.h>
+
+#define BLOOM_CREATE_FLAG_MASK \
+ (BPF_F_NUMA_NODE | BPF_F_ZERO_SEED | BPF_F_ACCESS_MASK)
+
+struct bpf_bloom_filter {
+ struct bpf_map map;
+ u32 bitset_mask;
+ u32 hash_seed;
+ u32 nr_hash_funcs;
+ unsigned long bitset[];
+};
+
+static u32 hash(struct bpf_bloom_filter *bloom, void *value,
+ u32 value_size, u32 index)
+{
+ u32 h;
+
+ if (likely(value_size % 4 == 0))
+ h = jhash2(value, value_size / 4, bloom->hash_seed + index);
+ else
+ h = jhash(value, value_size, bloom->hash_seed + index);
+
+ return h & bloom->bitset_mask;
+}
+
+static long bloom_map_peek_elem(struct bpf_map *map, void *value)
+{
+ struct bpf_bloom_filter *bloom =
+ container_of(map, struct bpf_bloom_filter, map);
+ u32 i, h;
+
+ for (i = 0; i < bloom->nr_hash_funcs; i++) {
+ h = hash(bloom, value, map->value_size, i);
+ if (!test_bit(h, bloom->bitset))
+ return -ENOENT;
+ }
+
+ return 0;
+}
+
+static long bloom_map_push_elem(struct bpf_map *map, void *value, u64 flags)
+{
+ struct bpf_bloom_filter *bloom =
+ container_of(map, struct bpf_bloom_filter, map);
+ u32 i, h;
+
+ if (flags != BPF_ANY)
+ return -EINVAL;
+
+ for (i = 0; i < bloom->nr_hash_funcs; i++) {
+ h = hash(bloom, value, map->value_size, i);
+ set_bit(h, bloom->bitset);
+ }
+
+ return 0;
+}
+
+static long bloom_map_pop_elem(struct bpf_map *map, void *value)
+{
+ return -EOPNOTSUPP;
+}
+
+static long bloom_map_delete_elem(struct bpf_map *map, void *value)
+{
+ return -EOPNOTSUPP;
+}
+
+static int bloom_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ return -EOPNOTSUPP;
+}
+
+/* Called from syscall */
+static int bloom_map_alloc_check(union bpf_attr *attr)
+{
+ if (attr->value_size > KMALLOC_MAX_SIZE)
+ /* if value_size is bigger, the user space won't be able to
+ * access the elements.
+ */
+ return -E2BIG;
+
+ return 0;
+}
+
+static struct bpf_map *bloom_map_alloc(union bpf_attr *attr)
+{
+ u32 bitset_bytes, bitset_mask, nr_hash_funcs, nr_bits;
+ int numa_node = bpf_map_attr_numa_node(attr);
+ struct bpf_bloom_filter *bloom;
+
+ if (attr->key_size != 0 || attr->value_size == 0 ||
+ attr->max_entries == 0 ||
+ attr->map_flags & ~BLOOM_CREATE_FLAG_MASK ||
+ !bpf_map_flags_access_ok(attr->map_flags) ||
+ /* The lower 4 bits of map_extra (0xF) specify the number
+ * of hash functions
+ */
+ (attr->map_extra & ~0xF))
+ return ERR_PTR(-EINVAL);
+
+ nr_hash_funcs = attr->map_extra;
+ if (nr_hash_funcs == 0)
+ /* Default to using 5 hash functions if unspecified */
+ nr_hash_funcs = 5;
+
+ /* For the bloom filter, the optimal bit array size that minimizes the
+ * false positive probability is n * k / ln(2) where n is the number of
+ * expected entries in the bloom filter and k is the number of hash
+ * functions. We use 7 / 5 to approximate 1 / ln(2).
+ *
+ * We round this up to the nearest power of two to enable more efficient
+ * hashing using bitmasks. The bitmask will be the bit array size - 1.
+ *
+ * If this overflows a u32, the bit array size will have 2^32 (4
+ * GB) bits.
+ */
+ if (check_mul_overflow(attr->max_entries, nr_hash_funcs, &nr_bits) ||
+ check_mul_overflow(nr_bits / 5, (u32)7, &nr_bits) ||
+ nr_bits > (1UL << 31)) {
+ /* The bit array size is 2^32 bits but to avoid overflowing the
+ * u32, we use U32_MAX, which will round up to the equivalent
+ * number of bytes
+ */
+ bitset_bytes = BITS_TO_BYTES(U32_MAX);
+ bitset_mask = U32_MAX;
+ } else {
+ if (nr_bits <= BITS_PER_LONG)
+ nr_bits = BITS_PER_LONG;
+ else
+ nr_bits = roundup_pow_of_two(nr_bits);
+ bitset_bytes = BITS_TO_BYTES(nr_bits);
+ bitset_mask = nr_bits - 1;
+ }
+
+ bitset_bytes = roundup(bitset_bytes, sizeof(unsigned long));
+ bloom = bpf_map_area_alloc(sizeof(*bloom) + bitset_bytes, numa_node);
+
+ if (!bloom)
+ return ERR_PTR(-ENOMEM);
+
+ bpf_map_init_from_attr(&bloom->map, attr);
+
+ bloom->nr_hash_funcs = nr_hash_funcs;
+ bloom->bitset_mask = bitset_mask;
+
+ if (!(attr->map_flags & BPF_F_ZERO_SEED))
+ bloom->hash_seed = get_random_u32();
+
+ return &bloom->map;
+}
+
+static void bloom_map_free(struct bpf_map *map)
+{
+ struct bpf_bloom_filter *bloom =
+ container_of(map, struct bpf_bloom_filter, map);
+
+ bpf_map_area_free(bloom);
+}
+
+static void *bloom_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ /* The eBPF program should use map_peek_elem instead */
+ return ERR_PTR(-EINVAL);
+}
+
+static long bloom_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
+{
+ /* The eBPF program should use map_push_elem instead */
+ return -EINVAL;
+}
+
+static int bloom_map_check_btf(const struct bpf_map *map,
+ const struct btf *btf,
+ const struct btf_type *key_type,
+ const struct btf_type *value_type)
+{
+ /* Bloom filter maps are keyless */
+ return btf_type_is_void(key_type) ? 0 : -EINVAL;
+}
+
+static u64 bloom_map_mem_usage(const struct bpf_map *map)
+{
+ struct bpf_bloom_filter *bloom;
+ u64 bitset_bytes;
+
+ bloom = container_of(map, struct bpf_bloom_filter, map);
+ bitset_bytes = BITS_TO_BYTES((u64)bloom->bitset_mask + 1);
+ bitset_bytes = roundup(bitset_bytes, sizeof(unsigned long));
+ return sizeof(*bloom) + bitset_bytes;
+}
+
+BTF_ID_LIST_SINGLE(bpf_bloom_map_btf_ids, struct, bpf_bloom_filter)
+const struct bpf_map_ops bloom_filter_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
+ .map_alloc_check = bloom_map_alloc_check,
+ .map_alloc = bloom_map_alloc,
+ .map_free = bloom_map_free,
+ .map_get_next_key = bloom_map_get_next_key,
+ .map_push_elem = bloom_map_push_elem,
+ .map_peek_elem = bloom_map_peek_elem,
+ .map_pop_elem = bloom_map_pop_elem,
+ .map_lookup_elem = bloom_map_lookup_elem,
+ .map_update_elem = bloom_map_update_elem,
+ .map_delete_elem = bloom_map_delete_elem,
+ .map_check_btf = bloom_map_check_btf,
+ .map_mem_usage = bloom_map_mem_usage,
+ .map_btf_id = &bpf_bloom_map_btf_ids[0],
+};
diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c
new file mode 100644
index 000000000000..0687a760974a
--- /dev/null
+++ b/kernel/bpf/bpf_cgrp_storage.c
@@ -0,0 +1,238 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ */
+
+#include <linux/types.h>
+#include <linux/bpf.h>
+#include <linux/bpf_local_storage.h>
+#include <uapi/linux/btf.h>
+#include <linux/btf_ids.h>
+
+DEFINE_BPF_STORAGE_CACHE(cgroup_cache);
+
+static DEFINE_PER_CPU(int, bpf_cgrp_storage_busy);
+
+static void bpf_cgrp_storage_lock(void)
+{
+ cant_migrate();
+ this_cpu_inc(bpf_cgrp_storage_busy);
+}
+
+static void bpf_cgrp_storage_unlock(void)
+{
+ this_cpu_dec(bpf_cgrp_storage_busy);
+}
+
+static bool bpf_cgrp_storage_trylock(void)
+{
+ cant_migrate();
+ if (unlikely(this_cpu_inc_return(bpf_cgrp_storage_busy) != 1)) {
+ this_cpu_dec(bpf_cgrp_storage_busy);
+ return false;
+ }
+ return true;
+}
+
+static struct bpf_local_storage __rcu **cgroup_storage_ptr(void *owner)
+{
+ struct cgroup *cg = owner;
+
+ return &cg->bpf_cgrp_storage;
+}
+
+void bpf_cgrp_storage_free(struct cgroup *cgroup)
+{
+ struct bpf_local_storage *local_storage;
+
+ rcu_read_lock_dont_migrate();
+ local_storage = rcu_dereference(cgroup->bpf_cgrp_storage);
+ if (!local_storage)
+ goto out;
+
+ bpf_cgrp_storage_lock();
+ bpf_local_storage_destroy(local_storage);
+ bpf_cgrp_storage_unlock();
+out:
+ rcu_read_unlock_migrate();
+}
+
+static struct bpf_local_storage_data *
+cgroup_storage_lookup(struct cgroup *cgroup, struct bpf_map *map, bool cacheit_lockit)
+{
+ struct bpf_local_storage *cgroup_storage;
+ struct bpf_local_storage_map *smap;
+
+ cgroup_storage = rcu_dereference_check(cgroup->bpf_cgrp_storage,
+ bpf_rcu_lock_held());
+ if (!cgroup_storage)
+ return NULL;
+
+ smap = (struct bpf_local_storage_map *)map;
+ return bpf_local_storage_lookup(cgroup_storage, smap, cacheit_lockit);
+}
+
+static void *bpf_cgrp_storage_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_local_storage_data *sdata;
+ struct cgroup *cgroup;
+ int fd;
+
+ fd = *(int *)key;
+ cgroup = cgroup_v1v2_get_from_fd(fd);
+ if (IS_ERR(cgroup))
+ return ERR_CAST(cgroup);
+
+ bpf_cgrp_storage_lock();
+ sdata = cgroup_storage_lookup(cgroup, map, true);
+ bpf_cgrp_storage_unlock();
+ cgroup_put(cgroup);
+ return sdata ? sdata->data : NULL;
+}
+
+static long bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
+{
+ struct bpf_local_storage_data *sdata;
+ struct cgroup *cgroup;
+ int fd;
+
+ fd = *(int *)key;
+ cgroup = cgroup_v1v2_get_from_fd(fd);
+ if (IS_ERR(cgroup))
+ return PTR_ERR(cgroup);
+
+ bpf_cgrp_storage_lock();
+ sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map,
+ value, map_flags, false, GFP_ATOMIC);
+ bpf_cgrp_storage_unlock();
+ cgroup_put(cgroup);
+ return PTR_ERR_OR_ZERO(sdata);
+}
+
+static int cgroup_storage_delete(struct cgroup *cgroup, struct bpf_map *map)
+{
+ struct bpf_local_storage_data *sdata;
+
+ sdata = cgroup_storage_lookup(cgroup, map, false);
+ if (!sdata)
+ return -ENOENT;
+
+ bpf_selem_unlink(SELEM(sdata), false);
+ return 0;
+}
+
+static long bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key)
+{
+ struct cgroup *cgroup;
+ int err, fd;
+
+ fd = *(int *)key;
+ cgroup = cgroup_v1v2_get_from_fd(fd);
+ if (IS_ERR(cgroup))
+ return PTR_ERR(cgroup);
+
+ bpf_cgrp_storage_lock();
+ err = cgroup_storage_delete(cgroup, map);
+ bpf_cgrp_storage_unlock();
+ cgroup_put(cgroup);
+ return err;
+}
+
+static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ return -ENOTSUPP;
+}
+
+static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
+{
+ return bpf_local_storage_map_alloc(attr, &cgroup_cache, true);
+}
+
+static void cgroup_storage_map_free(struct bpf_map *map)
+{
+ bpf_local_storage_map_free(map, &cgroup_cache, &bpf_cgrp_storage_busy);
+}
+
+/* *gfp_flags* is a hidden argument provided by the verifier */
+BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup,
+ void *, value, u64, flags, gfp_t, gfp_flags)
+{
+ struct bpf_local_storage_data *sdata;
+ bool nobusy;
+
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE))
+ return (unsigned long)NULL;
+
+ if (!cgroup)
+ return (unsigned long)NULL;
+
+ nobusy = bpf_cgrp_storage_trylock();
+
+ sdata = cgroup_storage_lookup(cgroup, map, nobusy);
+ if (sdata)
+ goto unlock;
+
+ /* only allocate new storage, when the cgroup is refcounted */
+ if (!percpu_ref_is_dying(&cgroup->self.refcnt) &&
+ (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) && nobusy)
+ sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map,
+ value, BPF_NOEXIST, false, gfp_flags);
+
+unlock:
+ if (nobusy)
+ bpf_cgrp_storage_unlock();
+ return IS_ERR_OR_NULL(sdata) ? (unsigned long)NULL : (unsigned long)sdata->data;
+}
+
+BPF_CALL_2(bpf_cgrp_storage_delete, struct bpf_map *, map, struct cgroup *, cgroup)
+{
+ int ret;
+
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (!cgroup)
+ return -EINVAL;
+
+ if (!bpf_cgrp_storage_trylock())
+ return -EBUSY;
+
+ ret = cgroup_storage_delete(cgroup, map);
+ bpf_cgrp_storage_unlock();
+ return ret;
+}
+
+const struct bpf_map_ops cgrp_storage_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
+ .map_alloc_check = bpf_local_storage_map_alloc_check,
+ .map_alloc = cgroup_storage_map_alloc,
+ .map_free = cgroup_storage_map_free,
+ .map_get_next_key = notsupp_get_next_key,
+ .map_lookup_elem = bpf_cgrp_storage_lookup_elem,
+ .map_update_elem = bpf_cgrp_storage_update_elem,
+ .map_delete_elem = bpf_cgrp_storage_delete_elem,
+ .map_check_btf = bpf_local_storage_map_check_btf,
+ .map_mem_usage = bpf_local_storage_map_mem_usage,
+ .map_btf_id = &bpf_local_storage_map_btf_id[0],
+ .map_owner_storage_ptr = cgroup_storage_ptr,
+};
+
+const struct bpf_func_proto bpf_cgrp_storage_get_proto = {
+ .func = bpf_cgrp_storage_get,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
+ .arg2_btf_id = &bpf_cgroup_btf_id[0],
+ .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg4_type = ARG_ANYTHING,
+};
+
+const struct bpf_func_proto bpf_cgrp_storage_delete_proto = {
+ .func = bpf_cgrp_storage_delete,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
+ .arg2_btf_id = &bpf_cgroup_btf_id[0],
+};
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
new file mode 100644
index 000000000000..e54cce2b9175
--- /dev/null
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -0,0 +1,227 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2019 Facebook
+ * Copyright 2020 Google LLC.
+ */
+
+#include <linux/rculist.h>
+#include <linux/list.h>
+#include <linux/hash.h>
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <linux/bpf.h>
+#include <linux/bpf_local_storage.h>
+#include <net/sock.h>
+#include <uapi/linux/sock_diag.h>
+#include <uapi/linux/btf.h>
+#include <linux/bpf_lsm.h>
+#include <linux/btf_ids.h>
+#include <linux/rcupdate_trace.h>
+
+DEFINE_BPF_STORAGE_CACHE(inode_cache);
+
+static struct bpf_local_storage __rcu **
+inode_storage_ptr(void *owner)
+{
+ struct inode *inode = owner;
+ struct bpf_storage_blob *bsb;
+
+ bsb = bpf_inode(inode);
+ if (!bsb)
+ return NULL;
+ return &bsb->storage;
+}
+
+static struct bpf_local_storage_data *inode_storage_lookup(struct inode *inode,
+ struct bpf_map *map,
+ bool cacheit_lockit)
+{
+ struct bpf_local_storage *inode_storage;
+ struct bpf_local_storage_map *smap;
+ struct bpf_storage_blob *bsb;
+
+ bsb = bpf_inode(inode);
+ if (!bsb)
+ return NULL;
+
+ inode_storage =
+ rcu_dereference_check(bsb->storage, bpf_rcu_lock_held());
+ if (!inode_storage)
+ return NULL;
+
+ smap = (struct bpf_local_storage_map *)map;
+ return bpf_local_storage_lookup(inode_storage, smap, cacheit_lockit);
+}
+
+void bpf_inode_storage_free(struct inode *inode)
+{
+ struct bpf_local_storage *local_storage;
+ struct bpf_storage_blob *bsb;
+
+ bsb = bpf_inode(inode);
+ if (!bsb)
+ return;
+
+ rcu_read_lock_dont_migrate();
+
+ local_storage = rcu_dereference(bsb->storage);
+ if (!local_storage)
+ goto out;
+
+ bpf_local_storage_destroy(local_storage);
+out:
+ rcu_read_unlock_migrate();
+}
+
+static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_local_storage_data *sdata;
+ CLASS(fd_raw, f)(*(int *)key);
+
+ if (fd_empty(f))
+ return ERR_PTR(-EBADF);
+
+ sdata = inode_storage_lookup(file_inode(fd_file(f)), map, true);
+ return sdata ? sdata->data : NULL;
+}
+
+static long bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
+{
+ struct bpf_local_storage_data *sdata;
+ CLASS(fd_raw, f)(*(int *)key);
+
+ if (fd_empty(f))
+ return -EBADF;
+ if (!inode_storage_ptr(file_inode(fd_file(f))))
+ return -EBADF;
+
+ sdata = bpf_local_storage_update(file_inode(fd_file(f)),
+ (struct bpf_local_storage_map *)map,
+ value, map_flags, false, GFP_ATOMIC);
+ return PTR_ERR_OR_ZERO(sdata);
+}
+
+static int inode_storage_delete(struct inode *inode, struct bpf_map *map)
+{
+ struct bpf_local_storage_data *sdata;
+
+ sdata = inode_storage_lookup(inode, map, false);
+ if (!sdata)
+ return -ENOENT;
+
+ bpf_selem_unlink(SELEM(sdata), false);
+
+ return 0;
+}
+
+static long bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key)
+{
+ CLASS(fd_raw, f)(*(int *)key);
+
+ if (fd_empty(f))
+ return -EBADF;
+ return inode_storage_delete(file_inode(fd_file(f)), map);
+}
+
+/* *gfp_flags* is a hidden argument provided by the verifier */
+BPF_CALL_5(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
+ void *, value, u64, flags, gfp_t, gfp_flags)
+{
+ struct bpf_local_storage_data *sdata;
+
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE))
+ return (unsigned long)NULL;
+
+ /* explicitly check that the inode_storage_ptr is not
+ * NULL as inode_storage_lookup returns NULL in this case and
+ * bpf_local_storage_update expects the owner to have a
+ * valid storage pointer.
+ */
+ if (!inode || !inode_storage_ptr(inode))
+ return (unsigned long)NULL;
+
+ sdata = inode_storage_lookup(inode, map, true);
+ if (sdata)
+ return (unsigned long)sdata->data;
+
+ /* This helper must only called from where the inode is guaranteed
+ * to have a refcount and cannot be freed.
+ */
+ if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) {
+ sdata = bpf_local_storage_update(
+ inode, (struct bpf_local_storage_map *)map, value,
+ BPF_NOEXIST, false, gfp_flags);
+ return IS_ERR(sdata) ? (unsigned long)NULL :
+ (unsigned long)sdata->data;
+ }
+
+ return (unsigned long)NULL;
+}
+
+BPF_CALL_2(bpf_inode_storage_delete,
+ struct bpf_map *, map, struct inode *, inode)
+{
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (!inode)
+ return -EINVAL;
+
+ /* This helper must only called from where the inode is guaranteed
+ * to have a refcount and cannot be freed.
+ */
+ return inode_storage_delete(inode, map);
+}
+
+static int notsupp_get_next_key(struct bpf_map *map, void *key,
+ void *next_key)
+{
+ return -ENOTSUPP;
+}
+
+static struct bpf_map *inode_storage_map_alloc(union bpf_attr *attr)
+{
+ return bpf_local_storage_map_alloc(attr, &inode_cache, false);
+}
+
+static void inode_storage_map_free(struct bpf_map *map)
+{
+ bpf_local_storage_map_free(map, &inode_cache, NULL);
+}
+
+const struct bpf_map_ops inode_storage_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
+ .map_alloc_check = bpf_local_storage_map_alloc_check,
+ .map_alloc = inode_storage_map_alloc,
+ .map_free = inode_storage_map_free,
+ .map_get_next_key = notsupp_get_next_key,
+ .map_lookup_elem = bpf_fd_inode_storage_lookup_elem,
+ .map_update_elem = bpf_fd_inode_storage_update_elem,
+ .map_delete_elem = bpf_fd_inode_storage_delete_elem,
+ .map_check_btf = bpf_local_storage_map_check_btf,
+ .map_mem_usage = bpf_local_storage_map_mem_usage,
+ .map_btf_id = &bpf_local_storage_map_btf_id[0],
+ .map_owner_storage_ptr = inode_storage_ptr,
+};
+
+BTF_ID_LIST_SINGLE(bpf_inode_storage_btf_ids, struct, inode)
+
+const struct bpf_func_proto bpf_inode_storage_get_proto = {
+ .func = bpf_inode_storage_get,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
+ .arg2_btf_id = &bpf_inode_storage_btf_ids[0],
+ .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg4_type = ARG_ANYTHING,
+};
+
+const struct bpf_func_proto bpf_inode_storage_delete_proto = {
+ .func = bpf_inode_storage_delete,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
+ .arg2_btf_id = &bpf_inode_storage_btf_ids[0],
+};
diff --git a/kernel/bpf/bpf_insn_array.c b/kernel/bpf/bpf_insn_array.c
new file mode 100644
index 000000000000..c96630cb75bf
--- /dev/null
+++ b/kernel/bpf/bpf_insn_array.c
@@ -0,0 +1,304 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2025 Isovalent */
+
+#include <linux/bpf.h>
+
+struct bpf_insn_array {
+ struct bpf_map map;
+ atomic_t used;
+ long *ips;
+ DECLARE_FLEX_ARRAY(struct bpf_insn_array_value, values);
+};
+
+#define cast_insn_array(MAP_PTR) \
+ container_of((MAP_PTR), struct bpf_insn_array, map)
+
+#define INSN_DELETED ((u32)-1)
+
+static inline u64 insn_array_alloc_size(u32 max_entries)
+{
+ const u64 base_size = sizeof(struct bpf_insn_array);
+ const u64 entry_size = sizeof(struct bpf_insn_array_value);
+
+ return base_size + max_entries * (entry_size + sizeof(long));
+}
+
+static int insn_array_alloc_check(union bpf_attr *attr)
+{
+ u32 value_size = sizeof(struct bpf_insn_array_value);
+
+ if (attr->max_entries == 0 || attr->key_size != 4 ||
+ attr->value_size != value_size || attr->map_flags != 0)
+ return -EINVAL;
+
+ return 0;
+}
+
+static void insn_array_free(struct bpf_map *map)
+{
+ struct bpf_insn_array *insn_array = cast_insn_array(map);
+
+ bpf_map_area_free(insn_array);
+}
+
+static struct bpf_map *insn_array_alloc(union bpf_attr *attr)
+{
+ u64 size = insn_array_alloc_size(attr->max_entries);
+ struct bpf_insn_array *insn_array;
+
+ insn_array = bpf_map_area_alloc(size, NUMA_NO_NODE);
+ if (!insn_array)
+ return ERR_PTR(-ENOMEM);
+
+ /* ips are allocated right after the insn_array->values[] array */
+ insn_array->ips = (void *)&insn_array->values[attr->max_entries];
+
+ bpf_map_init_from_attr(&insn_array->map, attr);
+
+ /* BPF programs aren't allowed to write to the map */
+ insn_array->map.map_flags |= BPF_F_RDONLY_PROG;
+
+ return &insn_array->map;
+}
+
+static void *insn_array_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_insn_array *insn_array = cast_insn_array(map);
+ u32 index = *(u32 *)key;
+
+ if (unlikely(index >= insn_array->map.max_entries))
+ return NULL;
+
+ return &insn_array->values[index];
+}
+
+static long insn_array_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags)
+{
+ struct bpf_insn_array *insn_array = cast_insn_array(map);
+ u32 index = *(u32 *)key;
+ struct bpf_insn_array_value val = {};
+
+ if (unlikely(index >= insn_array->map.max_entries))
+ return -E2BIG;
+
+ if (unlikely(map_flags & BPF_NOEXIST))
+ return -EEXIST;
+
+ copy_map_value(map, &val, value);
+ if (val.jitted_off || val.xlated_off)
+ return -EINVAL;
+
+ insn_array->values[index].orig_off = val.orig_off;
+
+ return 0;
+}
+
+static long insn_array_delete_elem(struct bpf_map *map, void *key)
+{
+ return -EINVAL;
+}
+
+static int insn_array_check_btf(const struct bpf_map *map,
+ const struct btf *btf,
+ const struct btf_type *key_type,
+ const struct btf_type *value_type)
+{
+ if (!btf_type_is_i32(key_type))
+ return -EINVAL;
+
+ if (!btf_type_is_i64(value_type))
+ return -EINVAL;
+
+ return 0;
+}
+
+static u64 insn_array_mem_usage(const struct bpf_map *map)
+{
+ return insn_array_alloc_size(map->max_entries);
+}
+
+static int insn_array_map_direct_value_addr(const struct bpf_map *map, u64 *imm, u32 off)
+{
+ struct bpf_insn_array *insn_array = cast_insn_array(map);
+
+ if ((off % sizeof(long)) != 0 ||
+ (off / sizeof(long)) >= map->max_entries)
+ return -EINVAL;
+
+ /* from BPF's point of view, this map is a jump table */
+ *imm = (unsigned long)insn_array->ips + off;
+
+ return 0;
+}
+
+BTF_ID_LIST_SINGLE(insn_array_btf_ids, struct, bpf_insn_array)
+
+const struct bpf_map_ops insn_array_map_ops = {
+ .map_alloc_check = insn_array_alloc_check,
+ .map_alloc = insn_array_alloc,
+ .map_free = insn_array_free,
+ .map_get_next_key = bpf_array_get_next_key,
+ .map_lookup_elem = insn_array_lookup_elem,
+ .map_update_elem = insn_array_update_elem,
+ .map_delete_elem = insn_array_delete_elem,
+ .map_check_btf = insn_array_check_btf,
+ .map_mem_usage = insn_array_mem_usage,
+ .map_direct_value_addr = insn_array_map_direct_value_addr,
+ .map_btf_id = &insn_array_btf_ids[0],
+};
+
+static inline bool is_frozen(struct bpf_map *map)
+{
+ guard(mutex)(&map->freeze_mutex);
+
+ return map->frozen;
+}
+
+static bool is_insn_array(const struct bpf_map *map)
+{
+ return map->map_type == BPF_MAP_TYPE_INSN_ARRAY;
+}
+
+static inline bool valid_offsets(const struct bpf_insn_array *insn_array,
+ const struct bpf_prog *prog)
+{
+ u32 off;
+ int i;
+
+ for (i = 0; i < insn_array->map.max_entries; i++) {
+ off = insn_array->values[i].orig_off;
+
+ if (off >= prog->len)
+ return false;
+
+ if (off > 0) {
+ if (prog->insnsi[off-1].code == (BPF_LD | BPF_DW | BPF_IMM))
+ return false;
+ }
+ }
+
+ return true;
+}
+
+int bpf_insn_array_init(struct bpf_map *map, const struct bpf_prog *prog)
+{
+ struct bpf_insn_array *insn_array = cast_insn_array(map);
+ struct bpf_insn_array_value *values = insn_array->values;
+ int i;
+
+ if (!is_frozen(map))
+ return -EINVAL;
+
+ if (!valid_offsets(insn_array, prog))
+ return -EINVAL;
+
+ /*
+ * There can be only one program using the map
+ */
+ if (atomic_xchg(&insn_array->used, 1))
+ return -EBUSY;
+
+ /*
+ * Reset all the map indexes to the original values. This is needed,
+ * e.g., when a replay of verification with different log level should
+ * be performed.
+ */
+ for (i = 0; i < map->max_entries; i++)
+ values[i].xlated_off = values[i].orig_off;
+
+ return 0;
+}
+
+int bpf_insn_array_ready(struct bpf_map *map)
+{
+ struct bpf_insn_array *insn_array = cast_insn_array(map);
+ int i;
+
+ for (i = 0; i < map->max_entries; i++) {
+ if (insn_array->values[i].xlated_off == INSN_DELETED)
+ continue;
+ if (!insn_array->ips[i])
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+void bpf_insn_array_release(struct bpf_map *map)
+{
+ struct bpf_insn_array *insn_array = cast_insn_array(map);
+
+ atomic_set(&insn_array->used, 0);
+}
+
+void bpf_insn_array_adjust(struct bpf_map *map, u32 off, u32 len)
+{
+ struct bpf_insn_array *insn_array = cast_insn_array(map);
+ int i;
+
+ if (len <= 1)
+ return;
+
+ for (i = 0; i < map->max_entries; i++) {
+ if (insn_array->values[i].xlated_off <= off)
+ continue;
+ if (insn_array->values[i].xlated_off == INSN_DELETED)
+ continue;
+ insn_array->values[i].xlated_off += len - 1;
+ }
+}
+
+void bpf_insn_array_adjust_after_remove(struct bpf_map *map, u32 off, u32 len)
+{
+ struct bpf_insn_array *insn_array = cast_insn_array(map);
+ int i;
+
+ for (i = 0; i < map->max_entries; i++) {
+ if (insn_array->values[i].xlated_off < off)
+ continue;
+ if (insn_array->values[i].xlated_off == INSN_DELETED)
+ continue;
+ if (insn_array->values[i].xlated_off < off + len)
+ insn_array->values[i].xlated_off = INSN_DELETED;
+ else
+ insn_array->values[i].xlated_off -= len;
+ }
+}
+
+/*
+ * This function is called by JITs. The image is the real program
+ * image, the offsets array set up the xlated -> jitted mapping.
+ * The offsets[xlated] offset should point to the beginning of
+ * the jitted instruction.
+ */
+void bpf_prog_update_insn_ptrs(struct bpf_prog *prog, u32 *offsets, void *image)
+{
+ struct bpf_insn_array *insn_array;
+ struct bpf_map *map;
+ u32 xlated_off;
+ int i, j;
+
+ if (!offsets || !image)
+ return;
+
+ for (i = 0; i < prog->aux->used_map_cnt; i++) {
+ map = prog->aux->used_maps[i];
+ if (!is_insn_array(map))
+ continue;
+
+ insn_array = cast_insn_array(map);
+ for (j = 0; j < map->max_entries; j++) {
+ xlated_off = insn_array->values[j].xlated_off;
+ if (xlated_off == INSN_DELETED)
+ continue;
+ if (xlated_off < prog->aux->subprog_start)
+ continue;
+ xlated_off -= prog->aux->subprog_start;
+ if (xlated_off >= prog->len)
+ continue;
+
+ insn_array->values[j].jitted_off = offsets[xlated_off];
+ insn_array->ips[j] = (long)(image + offsets[xlated_off]);
+ }
+ }
+}
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
new file mode 100644
index 000000000000..eec60b57bd3d
--- /dev/null
+++ b/kernel/bpf/bpf_iter.c
@@ -0,0 +1,827 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2020 Facebook */
+
+#include <linux/fs.h>
+#include <linux/anon_inodes.h>
+#include <linux/filter.h>
+#include <linux/bpf.h>
+#include <linux/rcupdate_trace.h>
+
+struct bpf_iter_target_info {
+ struct list_head list;
+ const struct bpf_iter_reg *reg_info;
+ u32 btf_id; /* cached value */
+};
+
+struct bpf_iter_link {
+ struct bpf_link link;
+ struct bpf_iter_aux_info aux;
+ struct bpf_iter_target_info *tinfo;
+};
+
+struct bpf_iter_priv_data {
+ struct bpf_iter_target_info *tinfo;
+ const struct bpf_iter_seq_info *seq_info;
+ struct bpf_prog *prog;
+ u64 session_id;
+ u64 seq_num;
+ bool done_stop;
+ u8 target_private[] __aligned(8);
+};
+
+static struct list_head targets = LIST_HEAD_INIT(targets);
+static DEFINE_MUTEX(targets_mutex);
+
+/* protect bpf_iter_link changes */
+static DEFINE_MUTEX(link_mutex);
+
+/* incremented on every opened seq_file */
+static atomic64_t session_id;
+
+static int prepare_seq_file(struct file *file, struct bpf_iter_link *link);
+
+static void bpf_iter_inc_seq_num(struct seq_file *seq)
+{
+ struct bpf_iter_priv_data *iter_priv;
+
+ iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
+ target_private);
+ iter_priv->seq_num++;
+}
+
+static void bpf_iter_dec_seq_num(struct seq_file *seq)
+{
+ struct bpf_iter_priv_data *iter_priv;
+
+ iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
+ target_private);
+ iter_priv->seq_num--;
+}
+
+static void bpf_iter_done_stop(struct seq_file *seq)
+{
+ struct bpf_iter_priv_data *iter_priv;
+
+ iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
+ target_private);
+ iter_priv->done_stop = true;
+}
+
+static inline bool bpf_iter_target_support_resched(const struct bpf_iter_target_info *tinfo)
+{
+ return tinfo->reg_info->feature & BPF_ITER_RESCHED;
+}
+
+static bool bpf_iter_support_resched(struct seq_file *seq)
+{
+ struct bpf_iter_priv_data *iter_priv;
+
+ iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
+ target_private);
+ return bpf_iter_target_support_resched(iter_priv->tinfo);
+}
+
+/* maximum visited objects before bailing out */
+#define MAX_ITER_OBJECTS 1000000
+
+/* bpf_seq_read, a customized and simpler version for bpf iterator.
+ * The following are differences from seq_read():
+ * . fixed buffer size (PAGE_SIZE)
+ * . assuming NULL ->llseek()
+ * . stop() may call bpf program, handling potential overflow there
+ */
+static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
+ loff_t *ppos)
+{
+ struct seq_file *seq = file->private_data;
+ size_t n, offs, copied = 0;
+ int err = 0, num_objs = 0;
+ bool can_resched;
+ void *p;
+
+ mutex_lock(&seq->lock);
+
+ if (!seq->buf) {
+ seq->size = PAGE_SIZE << 3;
+ seq->buf = kvmalloc(seq->size, GFP_KERNEL);
+ if (!seq->buf) {
+ err = -ENOMEM;
+ goto done;
+ }
+ }
+
+ if (seq->count) {
+ n = min(seq->count, size);
+ err = copy_to_user(buf, seq->buf + seq->from, n);
+ if (err) {
+ err = -EFAULT;
+ goto done;
+ }
+ seq->count -= n;
+ seq->from += n;
+ copied = n;
+ goto done;
+ }
+
+ seq->from = 0;
+ p = seq->op->start(seq, &seq->index);
+ if (!p)
+ goto stop;
+ if (IS_ERR(p)) {
+ err = PTR_ERR(p);
+ seq->op->stop(seq, p);
+ seq->count = 0;
+ goto done;
+ }
+
+ err = seq->op->show(seq, p);
+ if (err > 0) {
+ /* object is skipped, decrease seq_num, so next
+ * valid object can reuse the same seq_num.
+ */
+ bpf_iter_dec_seq_num(seq);
+ seq->count = 0;
+ } else if (err < 0 || seq_has_overflowed(seq)) {
+ if (!err)
+ err = -E2BIG;
+ seq->op->stop(seq, p);
+ seq->count = 0;
+ goto done;
+ }
+
+ can_resched = bpf_iter_support_resched(seq);
+ while (1) {
+ loff_t pos = seq->index;
+
+ num_objs++;
+ offs = seq->count;
+ p = seq->op->next(seq, p, &seq->index);
+ if (pos == seq->index) {
+ pr_info_ratelimited("buggy seq_file .next function %ps "
+ "did not updated position index\n",
+ seq->op->next);
+ seq->index++;
+ }
+
+ if (IS_ERR_OR_NULL(p))
+ break;
+
+ /* got a valid next object, increase seq_num */
+ bpf_iter_inc_seq_num(seq);
+
+ if (seq->count >= size)
+ break;
+
+ if (num_objs >= MAX_ITER_OBJECTS) {
+ if (offs == 0) {
+ err = -EAGAIN;
+ seq->op->stop(seq, p);
+ goto done;
+ }
+ break;
+ }
+
+ err = seq->op->show(seq, p);
+ if (err > 0) {
+ bpf_iter_dec_seq_num(seq);
+ seq->count = offs;
+ } else if (err < 0 || seq_has_overflowed(seq)) {
+ seq->count = offs;
+ if (offs == 0) {
+ if (!err)
+ err = -E2BIG;
+ seq->op->stop(seq, p);
+ goto done;
+ }
+ break;
+ }
+
+ if (can_resched)
+ cond_resched();
+ }
+stop:
+ offs = seq->count;
+ if (IS_ERR(p)) {
+ seq->op->stop(seq, NULL);
+ err = PTR_ERR(p);
+ goto done;
+ }
+ /* bpf program called if !p */
+ seq->op->stop(seq, p);
+ if (!p) {
+ if (!seq_has_overflowed(seq)) {
+ bpf_iter_done_stop(seq);
+ } else {
+ seq->count = offs;
+ if (offs == 0) {
+ err = -E2BIG;
+ goto done;
+ }
+ }
+ }
+
+ n = min(seq->count, size);
+ err = copy_to_user(buf, seq->buf, n);
+ if (err) {
+ err = -EFAULT;
+ goto done;
+ }
+ copied = n;
+ seq->count -= n;
+ seq->from = n;
+done:
+ if (!copied)
+ copied = err;
+ else
+ *ppos += copied;
+ mutex_unlock(&seq->lock);
+ return copied;
+}
+
+static const struct bpf_iter_seq_info *
+__get_seq_info(struct bpf_iter_link *link)
+{
+ const struct bpf_iter_seq_info *seq_info;
+
+ if (link->aux.map) {
+ seq_info = link->aux.map->ops->iter_seq_info;
+ if (seq_info)
+ return seq_info;
+ }
+
+ return link->tinfo->reg_info->seq_info;
+}
+
+static int iter_open(struct inode *inode, struct file *file)
+{
+ struct bpf_iter_link *link = inode->i_private;
+
+ return prepare_seq_file(file, link);
+}
+
+static int iter_release(struct inode *inode, struct file *file)
+{
+ struct bpf_iter_priv_data *iter_priv;
+ struct seq_file *seq;
+
+ seq = file->private_data;
+ if (!seq)
+ return 0;
+
+ iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
+ target_private);
+
+ if (iter_priv->seq_info->fini_seq_private)
+ iter_priv->seq_info->fini_seq_private(seq->private);
+
+ bpf_prog_put(iter_priv->prog);
+ seq->private = iter_priv;
+
+ return seq_release_private(inode, file);
+}
+
+const struct file_operations bpf_iter_fops = {
+ .open = iter_open,
+ .read = bpf_seq_read,
+ .release = iter_release,
+};
+
+/* The argument reg_info will be cached in bpf_iter_target_info.
+ * The common practice is to declare target reg_info as
+ * a const static variable and passed as an argument to
+ * bpf_iter_reg_target().
+ */
+int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info)
+{
+ struct bpf_iter_target_info *tinfo;
+
+ tinfo = kzalloc(sizeof(*tinfo), GFP_KERNEL);
+ if (!tinfo)
+ return -ENOMEM;
+
+ tinfo->reg_info = reg_info;
+ INIT_LIST_HEAD(&tinfo->list);
+
+ mutex_lock(&targets_mutex);
+ list_add(&tinfo->list, &targets);
+ mutex_unlock(&targets_mutex);
+
+ return 0;
+}
+
+void bpf_iter_unreg_target(const struct bpf_iter_reg *reg_info)
+{
+ struct bpf_iter_target_info *tinfo;
+ bool found = false;
+
+ mutex_lock(&targets_mutex);
+ list_for_each_entry(tinfo, &targets, list) {
+ if (reg_info == tinfo->reg_info) {
+ list_del(&tinfo->list);
+ kfree(tinfo);
+ found = true;
+ break;
+ }
+ }
+ mutex_unlock(&targets_mutex);
+
+ WARN_ON(found == false);
+}
+
+static void cache_btf_id(struct bpf_iter_target_info *tinfo,
+ struct bpf_prog *prog)
+{
+ tinfo->btf_id = prog->aux->attach_btf_id;
+}
+
+int bpf_iter_prog_supported(struct bpf_prog *prog)
+{
+ const char *attach_fname = prog->aux->attach_func_name;
+ struct bpf_iter_target_info *tinfo = NULL, *iter;
+ u32 prog_btf_id = prog->aux->attach_btf_id;
+ const char *prefix = BPF_ITER_FUNC_PREFIX;
+ int prefix_len = strlen(prefix);
+
+ if (strncmp(attach_fname, prefix, prefix_len))
+ return -EINVAL;
+
+ mutex_lock(&targets_mutex);
+ list_for_each_entry(iter, &targets, list) {
+ if (iter->btf_id && iter->btf_id == prog_btf_id) {
+ tinfo = iter;
+ break;
+ }
+ if (!strcmp(attach_fname + prefix_len, iter->reg_info->target)) {
+ cache_btf_id(iter, prog);
+ tinfo = iter;
+ break;
+ }
+ }
+ mutex_unlock(&targets_mutex);
+
+ if (!tinfo)
+ return -EINVAL;
+
+ return bpf_prog_ctx_arg_info_init(prog, tinfo->reg_info->ctx_arg_info,
+ tinfo->reg_info->ctx_arg_info_size);
+}
+
+const struct bpf_func_proto *
+bpf_iter_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ const struct bpf_iter_target_info *tinfo;
+ const struct bpf_func_proto *fn = NULL;
+
+ mutex_lock(&targets_mutex);
+ list_for_each_entry(tinfo, &targets, list) {
+ if (tinfo->btf_id == prog->aux->attach_btf_id) {
+ const struct bpf_iter_reg *reg_info;
+
+ reg_info = tinfo->reg_info;
+ if (reg_info->get_func_proto)
+ fn = reg_info->get_func_proto(func_id, prog);
+ break;
+ }
+ }
+ mutex_unlock(&targets_mutex);
+
+ return fn;
+}
+
+static void bpf_iter_link_release(struct bpf_link *link)
+{
+ struct bpf_iter_link *iter_link =
+ container_of(link, struct bpf_iter_link, link);
+
+ if (iter_link->tinfo->reg_info->detach_target)
+ iter_link->tinfo->reg_info->detach_target(&iter_link->aux);
+}
+
+static void bpf_iter_link_dealloc(struct bpf_link *link)
+{
+ struct bpf_iter_link *iter_link =
+ container_of(link, struct bpf_iter_link, link);
+
+ kfree(iter_link);
+}
+
+static int bpf_iter_link_replace(struct bpf_link *link,
+ struct bpf_prog *new_prog,
+ struct bpf_prog *old_prog)
+{
+ int ret = 0;
+
+ mutex_lock(&link_mutex);
+ if (old_prog && link->prog != old_prog) {
+ ret = -EPERM;
+ goto out_unlock;
+ }
+
+ if (link->prog->type != new_prog->type ||
+ link->prog->expected_attach_type != new_prog->expected_attach_type ||
+ link->prog->aux->attach_btf_id != new_prog->aux->attach_btf_id) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ old_prog = xchg(&link->prog, new_prog);
+ bpf_prog_put(old_prog);
+
+out_unlock:
+ mutex_unlock(&link_mutex);
+ return ret;
+}
+
+static void bpf_iter_link_show_fdinfo(const struct bpf_link *link,
+ struct seq_file *seq)
+{
+ struct bpf_iter_link *iter_link =
+ container_of(link, struct bpf_iter_link, link);
+ bpf_iter_show_fdinfo_t show_fdinfo;
+
+ seq_printf(seq,
+ "target_name:\t%s\n",
+ iter_link->tinfo->reg_info->target);
+
+ show_fdinfo = iter_link->tinfo->reg_info->show_fdinfo;
+ if (show_fdinfo)
+ show_fdinfo(&iter_link->aux, seq);
+}
+
+static int bpf_iter_link_fill_link_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ struct bpf_iter_link *iter_link =
+ container_of(link, struct bpf_iter_link, link);
+ char __user *ubuf = u64_to_user_ptr(info->iter.target_name);
+ bpf_iter_fill_link_info_t fill_link_info;
+ u32 ulen = info->iter.target_name_len;
+ const char *target_name;
+ u32 target_len;
+
+ if (!ulen ^ !ubuf)
+ return -EINVAL;
+
+ target_name = iter_link->tinfo->reg_info->target;
+ target_len = strlen(target_name);
+ info->iter.target_name_len = target_len + 1;
+
+ if (ubuf) {
+ if (ulen >= target_len + 1) {
+ if (copy_to_user(ubuf, target_name, target_len + 1))
+ return -EFAULT;
+ } else {
+ char zero = '\0';
+
+ if (copy_to_user(ubuf, target_name, ulen - 1))
+ return -EFAULT;
+ if (put_user(zero, ubuf + ulen - 1))
+ return -EFAULT;
+ return -ENOSPC;
+ }
+ }
+
+ fill_link_info = iter_link->tinfo->reg_info->fill_link_info;
+ if (fill_link_info)
+ return fill_link_info(&iter_link->aux, info);
+
+ return 0;
+}
+
+static const struct bpf_link_ops bpf_iter_link_lops = {
+ .release = bpf_iter_link_release,
+ .dealloc = bpf_iter_link_dealloc,
+ .update_prog = bpf_iter_link_replace,
+ .show_fdinfo = bpf_iter_link_show_fdinfo,
+ .fill_link_info = bpf_iter_link_fill_link_info,
+};
+
+bool bpf_link_is_iter(struct bpf_link *link)
+{
+ return link->ops == &bpf_iter_link_lops;
+}
+
+int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr,
+ struct bpf_prog *prog)
+{
+ struct bpf_iter_target_info *tinfo = NULL, *iter;
+ struct bpf_link_primer link_primer;
+ union bpf_iter_link_info linfo;
+ struct bpf_iter_link *link;
+ u32 prog_btf_id, linfo_len;
+ bpfptr_t ulinfo;
+ int err;
+
+ if (attr->link_create.target_fd || attr->link_create.flags)
+ return -EINVAL;
+
+ memset(&linfo, 0, sizeof(union bpf_iter_link_info));
+
+ ulinfo = make_bpfptr(attr->link_create.iter_info, uattr.is_kernel);
+ linfo_len = attr->link_create.iter_info_len;
+ if (bpfptr_is_null(ulinfo) ^ !linfo_len)
+ return -EINVAL;
+
+ if (!bpfptr_is_null(ulinfo)) {
+ err = bpf_check_uarg_tail_zero(ulinfo, sizeof(linfo),
+ linfo_len);
+ if (err)
+ return err;
+ linfo_len = min_t(u32, linfo_len, sizeof(linfo));
+ if (copy_from_bpfptr(&linfo, ulinfo, linfo_len))
+ return -EFAULT;
+ }
+
+ prog_btf_id = prog->aux->attach_btf_id;
+ mutex_lock(&targets_mutex);
+ list_for_each_entry(iter, &targets, list) {
+ if (iter->btf_id == prog_btf_id) {
+ tinfo = iter;
+ break;
+ }
+ }
+ mutex_unlock(&targets_mutex);
+ if (!tinfo)
+ return -ENOENT;
+
+ /* Only allow sleepable program for resched-able iterator */
+ if (prog->sleepable && !bpf_iter_target_support_resched(tinfo))
+ return -EINVAL;
+
+ link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN);
+ if (!link)
+ return -ENOMEM;
+
+ bpf_link_init(&link->link, BPF_LINK_TYPE_ITER, &bpf_iter_link_lops, prog,
+ attr->link_create.attach_type);
+ link->tinfo = tinfo;
+
+ err = bpf_link_prime(&link->link, &link_primer);
+ if (err) {
+ kfree(link);
+ return err;
+ }
+
+ if (tinfo->reg_info->attach_target) {
+ err = tinfo->reg_info->attach_target(prog, &linfo, &link->aux);
+ if (err) {
+ bpf_link_cleanup(&link_primer);
+ return err;
+ }
+ }
+
+ return bpf_link_settle(&link_primer);
+}
+
+static void init_seq_meta(struct bpf_iter_priv_data *priv_data,
+ struct bpf_iter_target_info *tinfo,
+ const struct bpf_iter_seq_info *seq_info,
+ struct bpf_prog *prog)
+{
+ priv_data->tinfo = tinfo;
+ priv_data->seq_info = seq_info;
+ priv_data->prog = prog;
+ priv_data->session_id = atomic64_inc_return(&session_id);
+ priv_data->seq_num = 0;
+ priv_data->done_stop = false;
+}
+
+static int prepare_seq_file(struct file *file, struct bpf_iter_link *link)
+{
+ const struct bpf_iter_seq_info *seq_info = __get_seq_info(link);
+ struct bpf_iter_priv_data *priv_data;
+ struct bpf_iter_target_info *tinfo;
+ struct bpf_prog *prog;
+ u32 total_priv_dsize;
+ struct seq_file *seq;
+ int err = 0;
+
+ mutex_lock(&link_mutex);
+ prog = link->link.prog;
+ bpf_prog_inc(prog);
+ mutex_unlock(&link_mutex);
+
+ tinfo = link->tinfo;
+ total_priv_dsize = offsetof(struct bpf_iter_priv_data, target_private) +
+ seq_info->seq_priv_size;
+ priv_data = __seq_open_private(file, seq_info->seq_ops,
+ total_priv_dsize);
+ if (!priv_data) {
+ err = -ENOMEM;
+ goto release_prog;
+ }
+
+ if (seq_info->init_seq_private) {
+ err = seq_info->init_seq_private(priv_data->target_private, &link->aux);
+ if (err)
+ goto release_seq_file;
+ }
+
+ init_seq_meta(priv_data, tinfo, seq_info, prog);
+ seq = file->private_data;
+ seq->private = priv_data->target_private;
+
+ return 0;
+
+release_seq_file:
+ seq_release_private(file->f_inode, file);
+ file->private_data = NULL;
+release_prog:
+ bpf_prog_put(prog);
+ return err;
+}
+
+int bpf_iter_new_fd(struct bpf_link *link)
+{
+ struct bpf_iter_link *iter_link;
+ unsigned int flags;
+ int err;
+
+ if (link->ops != &bpf_iter_link_lops)
+ return -EINVAL;
+
+ flags = O_RDONLY | O_CLOEXEC;
+
+ FD_PREPARE(fdf, flags, anon_inode_getfile("bpf_iter", &bpf_iter_fops, NULL, flags));
+ if (fdf.err)
+ return fdf.err;
+
+ iter_link = container_of(link, struct bpf_iter_link, link);
+ err = prepare_seq_file(fd_prepare_file(fdf), iter_link);
+ if (err)
+ return err; /* Automatic cleanup handles fput */
+
+ return fd_publish(fdf);
+}
+
+struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop)
+{
+ struct bpf_iter_priv_data *iter_priv;
+ struct seq_file *seq;
+ void *seq_priv;
+
+ seq = meta->seq;
+ if (seq->file->f_op != &bpf_iter_fops)
+ return NULL;
+
+ seq_priv = seq->private;
+ iter_priv = container_of(seq_priv, struct bpf_iter_priv_data,
+ target_private);
+
+ if (in_stop && iter_priv->done_stop)
+ return NULL;
+
+ meta->session_id = iter_priv->session_id;
+ meta->seq_num = iter_priv->seq_num;
+
+ return iter_priv->prog;
+}
+
+int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx)
+{
+ struct bpf_run_ctx run_ctx, *old_run_ctx;
+ int ret;
+
+ if (prog->sleepable) {
+ rcu_read_lock_trace();
+ migrate_disable();
+ might_fault();
+ old_run_ctx = bpf_set_run_ctx(&run_ctx);
+ ret = bpf_prog_run(prog, ctx);
+ bpf_reset_run_ctx(old_run_ctx);
+ migrate_enable();
+ rcu_read_unlock_trace();
+ } else {
+ rcu_read_lock_dont_migrate();
+ old_run_ctx = bpf_set_run_ctx(&run_ctx);
+ ret = bpf_prog_run(prog, ctx);
+ bpf_reset_run_ctx(old_run_ctx);
+ rcu_read_unlock_migrate();
+ }
+
+ /* bpf program can only return 0 or 1:
+ * 0 : okay
+ * 1 : retry the same object
+ * The bpf_iter_run_prog() return value
+ * will be seq_ops->show() return value.
+ */
+ return ret == 0 ? 0 : -EAGAIN;
+}
+
+BPF_CALL_4(bpf_for_each_map_elem, struct bpf_map *, map, void *, callback_fn,
+ void *, callback_ctx, u64, flags)
+{
+ return map->ops->map_for_each_callback(map, callback_fn, callback_ctx, flags);
+}
+
+const struct bpf_func_proto bpf_for_each_map_elem_proto = {
+ .func = bpf_for_each_map_elem,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_FUNC,
+ .arg3_type = ARG_PTR_TO_STACK_OR_NULL,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_loop, u32, nr_loops, void *, callback_fn, void *, callback_ctx,
+ u64, flags)
+{
+ bpf_callback_t callback = (bpf_callback_t)callback_fn;
+ u64 ret;
+ u32 i;
+
+ /* Note: these safety checks are also verified when bpf_loop
+ * is inlined, be careful to modify this code in sync. See
+ * function verifier.c:inline_bpf_loop.
+ */
+ if (flags)
+ return -EINVAL;
+ if (nr_loops > BPF_MAX_LOOPS)
+ return -E2BIG;
+
+ for (i = 0; i < nr_loops; i++) {
+ ret = callback((u64)i, (u64)(long)callback_ctx, 0, 0, 0);
+ /* return value: 0 - continue, 1 - stop and return */
+ if (ret)
+ return i + 1;
+ }
+
+ return i;
+}
+
+const struct bpf_func_proto bpf_loop_proto = {
+ .func = bpf_loop,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_ANYTHING,
+ .arg2_type = ARG_PTR_TO_FUNC,
+ .arg3_type = ARG_PTR_TO_STACK_OR_NULL,
+ .arg4_type = ARG_ANYTHING,
+};
+
+struct bpf_iter_num_kern {
+ int cur; /* current value, inclusive */
+ int end; /* final value, exclusive */
+} __aligned(8);
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc int bpf_iter_num_new(struct bpf_iter_num *it, int start, int end)
+{
+ struct bpf_iter_num_kern *s = (void *)it;
+
+ BUILD_BUG_ON(sizeof(struct bpf_iter_num_kern) != sizeof(struct bpf_iter_num));
+ BUILD_BUG_ON(__alignof__(struct bpf_iter_num_kern) != __alignof__(struct bpf_iter_num));
+
+ /* start == end is legit, it's an empty range and we'll just get NULL
+ * on first (and any subsequent) bpf_iter_num_next() call
+ */
+ if (start > end) {
+ s->cur = s->end = 0;
+ return -EINVAL;
+ }
+
+ /* avoid overflows, e.g., if start == INT_MIN and end == INT_MAX */
+ if ((s64)end - (s64)start > BPF_MAX_LOOPS) {
+ s->cur = s->end = 0;
+ return -E2BIG;
+ }
+
+ /* user will call bpf_iter_num_next() first,
+ * which will set s->cur to exactly start value;
+ * underflow shouldn't matter
+ */
+ s->cur = start - 1;
+ s->end = end;
+
+ return 0;
+}
+
+__bpf_kfunc int *bpf_iter_num_next(struct bpf_iter_num* it)
+{
+ struct bpf_iter_num_kern *s = (void *)it;
+
+ /* check failed initialization or if we are done (same behavior);
+ * need to be careful about overflow, so convert to s64 for checks,
+ * e.g., if s->cur == s->end == INT_MAX, we can't just do
+ * s->cur + 1 >= s->end
+ */
+ if ((s64)(s->cur + 1) >= s->end) {
+ s->cur = s->end = 0;
+ return NULL;
+ }
+
+ s->cur++;
+
+ return &s->cur;
+}
+
+__bpf_kfunc void bpf_iter_num_destroy(struct bpf_iter_num *it)
+{
+ struct bpf_iter_num_kern *s = (void *)it;
+
+ s->cur = s->end = 0;
+}
+
+__bpf_kfunc_end_defs();
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
new file mode 100644
index 000000000000..e2fe6c32822b
--- /dev/null
+++ b/kernel/bpf/bpf_local_storage.c
@@ -0,0 +1,825 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook */
+#include <linux/rculist.h>
+#include <linux/list.h>
+#include <linux/hash.h>
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <linux/bpf.h>
+#include <linux/btf_ids.h>
+#include <linux/bpf_local_storage.h>
+#include <net/sock.h>
+#include <uapi/linux/sock_diag.h>
+#include <uapi/linux/btf.h>
+#include <linux/rcupdate.h>
+#include <linux/rcupdate_trace.h>
+#include <linux/rcupdate_wait.h>
+
+#define BPF_LOCAL_STORAGE_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_CLONE)
+
+static struct bpf_local_storage_map_bucket *
+select_bucket(struct bpf_local_storage_map *smap,
+ struct bpf_local_storage_elem *selem)
+{
+ return &smap->buckets[hash_ptr(selem, smap->bucket_log)];
+}
+
+static int mem_charge(struct bpf_local_storage_map *smap, void *owner, u32 size)
+{
+ struct bpf_map *map = &smap->map;
+
+ if (!map->ops->map_local_storage_charge)
+ return 0;
+
+ return map->ops->map_local_storage_charge(smap, owner, size);
+}
+
+static void mem_uncharge(struct bpf_local_storage_map *smap, void *owner,
+ u32 size)
+{
+ struct bpf_map *map = &smap->map;
+
+ if (map->ops->map_local_storage_uncharge)
+ map->ops->map_local_storage_uncharge(smap, owner, size);
+}
+
+static struct bpf_local_storage __rcu **
+owner_storage(struct bpf_local_storage_map *smap, void *owner)
+{
+ struct bpf_map *map = &smap->map;
+
+ return map->ops->map_owner_storage_ptr(owner);
+}
+
+static bool selem_linked_to_storage_lockless(const struct bpf_local_storage_elem *selem)
+{
+ return !hlist_unhashed_lockless(&selem->snode);
+}
+
+static bool selem_linked_to_storage(const struct bpf_local_storage_elem *selem)
+{
+ return !hlist_unhashed(&selem->snode);
+}
+
+static bool selem_linked_to_map_lockless(const struct bpf_local_storage_elem *selem)
+{
+ return !hlist_unhashed_lockless(&selem->map_node);
+}
+
+static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem)
+{
+ return !hlist_unhashed(&selem->map_node);
+}
+
+struct bpf_local_storage_elem *
+bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
+ void *value, bool swap_uptrs, gfp_t gfp_flags)
+{
+ struct bpf_local_storage_elem *selem;
+
+ if (mem_charge(smap, owner, smap->elem_size))
+ return NULL;
+
+ if (smap->use_kmalloc_nolock) {
+ selem = bpf_map_kmalloc_nolock(&smap->map, smap->elem_size,
+ __GFP_ZERO, NUMA_NO_NODE);
+ } else {
+ selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
+ gfp_flags | __GFP_NOWARN);
+ }
+
+ if (selem) {
+ RCU_INIT_POINTER(SDATA(selem)->smap, smap);
+
+ if (value) {
+ /* No need to call check_and_init_map_value as memory is zero init */
+ copy_map_value(&smap->map, SDATA(selem)->data, value);
+ if (swap_uptrs)
+ bpf_obj_swap_uptrs(smap->map.record, SDATA(selem)->data, value);
+ }
+ return selem;
+ }
+
+ mem_uncharge(smap, owner, smap->elem_size);
+
+ return NULL;
+}
+
+/* rcu tasks trace callback for use_kmalloc_nolock == false */
+static void __bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
+{
+ struct bpf_local_storage *local_storage;
+
+ /* If RCU Tasks Trace grace period implies RCU grace period, do
+ * kfree(), else do kfree_rcu().
+ */
+ local_storage = container_of(rcu, struct bpf_local_storage, rcu);
+ if (rcu_trace_implies_rcu_gp())
+ kfree(local_storage);
+ else
+ kfree_rcu(local_storage, rcu);
+}
+
+/* Handle use_kmalloc_nolock == false */
+static void __bpf_local_storage_free(struct bpf_local_storage *local_storage,
+ bool vanilla_rcu)
+{
+ if (vanilla_rcu)
+ kfree_rcu(local_storage, rcu);
+ else
+ call_rcu_tasks_trace(&local_storage->rcu,
+ __bpf_local_storage_free_trace_rcu);
+}
+
+static void bpf_local_storage_free_rcu(struct rcu_head *rcu)
+{
+ struct bpf_local_storage *local_storage;
+
+ local_storage = container_of(rcu, struct bpf_local_storage, rcu);
+ kfree_nolock(local_storage);
+}
+
+static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
+{
+ if (rcu_trace_implies_rcu_gp())
+ bpf_local_storage_free_rcu(rcu);
+ else
+ call_rcu(rcu, bpf_local_storage_free_rcu);
+}
+
+static void bpf_local_storage_free(struct bpf_local_storage *local_storage,
+ bool reuse_now)
+{
+ if (!local_storage)
+ return;
+
+ if (!local_storage->use_kmalloc_nolock) {
+ __bpf_local_storage_free(local_storage, reuse_now);
+ return;
+ }
+
+ if (reuse_now) {
+ call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu);
+ return;
+ }
+
+ call_rcu_tasks_trace(&local_storage->rcu,
+ bpf_local_storage_free_trace_rcu);
+}
+
+/* rcu tasks trace callback for use_kmalloc_nolock == false */
+static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu)
+{
+ struct bpf_local_storage_elem *selem;
+
+ selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
+ if (rcu_trace_implies_rcu_gp())
+ kfree(selem);
+ else
+ kfree_rcu(selem, rcu);
+}
+
+/* Handle use_kmalloc_nolock == false */
+static void __bpf_selem_free(struct bpf_local_storage_elem *selem,
+ bool vanilla_rcu)
+{
+ if (vanilla_rcu)
+ kfree_rcu(selem, rcu);
+ else
+ call_rcu_tasks_trace(&selem->rcu, __bpf_selem_free_trace_rcu);
+}
+
+static void bpf_selem_free_rcu(struct rcu_head *rcu)
+{
+ struct bpf_local_storage_elem *selem;
+ struct bpf_local_storage_map *smap;
+
+ selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
+ /* The bpf_local_storage_map_free will wait for rcu_barrier */
+ smap = rcu_dereference_check(SDATA(selem)->smap, 1);
+
+ migrate_disable();
+ bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
+ migrate_enable();
+ kfree_nolock(selem);
+}
+
+static void bpf_selem_free_trace_rcu(struct rcu_head *rcu)
+{
+ if (rcu_trace_implies_rcu_gp())
+ bpf_selem_free_rcu(rcu);
+ else
+ call_rcu(rcu, bpf_selem_free_rcu);
+}
+
+void bpf_selem_free(struct bpf_local_storage_elem *selem,
+ bool reuse_now)
+{
+ struct bpf_local_storage_map *smap;
+
+ smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
+
+ if (!smap->use_kmalloc_nolock) {
+ /*
+ * No uptr will be unpin even when reuse_now == false since uptr
+ * is only supported in task local storage, where
+ * smap->use_kmalloc_nolock == true.
+ */
+ bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
+ __bpf_selem_free(selem, reuse_now);
+ return;
+ }
+
+ if (reuse_now) {
+ /*
+ * While it is okay to call bpf_obj_free_fields() that unpins uptr when
+ * reuse_now == true, keep it in bpf_selem_free_rcu() for simplicity.
+ */
+ call_rcu(&selem->rcu, bpf_selem_free_rcu);
+ return;
+ }
+
+ call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu);
+}
+
+static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now)
+{
+ struct bpf_local_storage_elem *selem;
+ struct hlist_node *n;
+
+ /* The "_safe" iteration is needed.
+ * The loop is not removing the selem from the list
+ * but bpf_selem_free will use the selem->rcu_head
+ * which is union-ized with the selem->free_node.
+ */
+ hlist_for_each_entry_safe(selem, n, list, free_node)
+ bpf_selem_free(selem, reuse_now);
+}
+
+/* local_storage->lock must be held and selem->local_storage == local_storage.
+ * The caller must ensure selem->smap is still valid to be
+ * dereferenced for its smap->elem_size and smap->cache_idx.
+ */
+static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
+ struct bpf_local_storage_elem *selem,
+ struct hlist_head *free_selem_list)
+{
+ struct bpf_local_storage_map *smap;
+ bool free_local_storage;
+ void *owner;
+
+ smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
+ owner = local_storage->owner;
+
+ /* All uncharging on the owner must be done first.
+ * The owner may be freed once the last selem is unlinked
+ * from local_storage.
+ */
+ mem_uncharge(smap, owner, smap->elem_size);
+
+ free_local_storage = hlist_is_singular_node(&selem->snode,
+ &local_storage->list);
+ if (free_local_storage) {
+ mem_uncharge(smap, owner, sizeof(struct bpf_local_storage));
+ local_storage->owner = NULL;
+
+ /* After this RCU_INIT, owner may be freed and cannot be used */
+ RCU_INIT_POINTER(*owner_storage(smap, owner), NULL);
+
+ /* local_storage is not freed now. local_storage->lock is
+ * still held and raw_spin_unlock_bh(&local_storage->lock)
+ * will be done by the caller.
+ *
+ * Although the unlock will be done under
+ * rcu_read_lock(), it is more intuitive to
+ * read if the freeing of the storage is done
+ * after the raw_spin_unlock_bh(&local_storage->lock).
+ *
+ * Hence, a "bool free_local_storage" is returned
+ * to the caller which then calls then frees the storage after
+ * all the RCU grace periods have expired.
+ */
+ }
+ hlist_del_init_rcu(&selem->snode);
+ if (rcu_access_pointer(local_storage->cache[smap->cache_idx]) ==
+ SDATA(selem))
+ RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);
+
+ hlist_add_head(&selem->free_node, free_selem_list);
+
+ if (rcu_access_pointer(local_storage->smap) == smap)
+ RCU_INIT_POINTER(local_storage->smap, NULL);
+
+ return free_local_storage;
+}
+
+static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
+ bool reuse_now)
+{
+ struct bpf_local_storage *local_storage;
+ bool free_local_storage = false;
+ HLIST_HEAD(selem_free_list);
+ unsigned long flags;
+
+ if (unlikely(!selem_linked_to_storage_lockless(selem)))
+ /* selem has already been unlinked from sk */
+ return;
+
+ local_storage = rcu_dereference_check(selem->local_storage,
+ bpf_rcu_lock_held());
+
+ raw_spin_lock_irqsave(&local_storage->lock, flags);
+ if (likely(selem_linked_to_storage(selem)))
+ free_local_storage = bpf_selem_unlink_storage_nolock(
+ local_storage, selem, &selem_free_list);
+ raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+
+ bpf_selem_free_list(&selem_free_list, reuse_now);
+
+ if (free_local_storage)
+ bpf_local_storage_free(local_storage, reuse_now);
+}
+
+void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
+ struct bpf_local_storage_elem *selem)
+{
+ RCU_INIT_POINTER(selem->local_storage, local_storage);
+ hlist_add_head_rcu(&selem->snode, &local_storage->list);
+}
+
+static void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
+{
+ struct bpf_local_storage_map *smap;
+ struct bpf_local_storage_map_bucket *b;
+ unsigned long flags;
+
+ if (unlikely(!selem_linked_to_map_lockless(selem)))
+ /* selem has already be unlinked from smap */
+ return;
+
+ smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
+ b = select_bucket(smap, selem);
+ raw_spin_lock_irqsave(&b->lock, flags);
+ if (likely(selem_linked_to_map(selem)))
+ hlist_del_init_rcu(&selem->map_node);
+ raw_spin_unlock_irqrestore(&b->lock, flags);
+}
+
+void bpf_selem_link_map(struct bpf_local_storage_map *smap,
+ struct bpf_local_storage_elem *selem)
+{
+ struct bpf_local_storage_map_bucket *b = select_bucket(smap, selem);
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&b->lock, flags);
+ hlist_add_head_rcu(&selem->map_node, &b->list);
+ raw_spin_unlock_irqrestore(&b->lock, flags);
+}
+
+void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now)
+{
+ /* Always unlink from map before unlinking from local_storage
+ * because selem will be freed after successfully unlinked from
+ * the local_storage.
+ */
+ bpf_selem_unlink_map(selem);
+ bpf_selem_unlink_storage(selem, reuse_now);
+}
+
+void __bpf_local_storage_insert_cache(struct bpf_local_storage *local_storage,
+ struct bpf_local_storage_map *smap,
+ struct bpf_local_storage_elem *selem)
+{
+ unsigned long flags;
+
+ /* spinlock is needed to avoid racing with the
+ * parallel delete. Otherwise, publishing an already
+ * deleted sdata to the cache will become a use-after-free
+ * problem in the next bpf_local_storage_lookup().
+ */
+ raw_spin_lock_irqsave(&local_storage->lock, flags);
+ if (selem_linked_to_storage(selem))
+ rcu_assign_pointer(local_storage->cache[smap->cache_idx], SDATA(selem));
+ raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+}
+
+static int check_flags(const struct bpf_local_storage_data *old_sdata,
+ u64 map_flags)
+{
+ if (old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST)
+ /* elem already exists */
+ return -EEXIST;
+
+ if (!old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_EXIST)
+ /* elem doesn't exist, cannot update it */
+ return -ENOENT;
+
+ return 0;
+}
+
+int bpf_local_storage_alloc(void *owner,
+ struct bpf_local_storage_map *smap,
+ struct bpf_local_storage_elem *first_selem,
+ gfp_t gfp_flags)
+{
+ struct bpf_local_storage *prev_storage, *storage;
+ struct bpf_local_storage **owner_storage_ptr;
+ int err;
+
+ err = mem_charge(smap, owner, sizeof(*storage));
+ if (err)
+ return err;
+
+ if (smap->use_kmalloc_nolock)
+ storage = bpf_map_kmalloc_nolock(&smap->map, sizeof(*storage),
+ __GFP_ZERO, NUMA_NO_NODE);
+ else
+ storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
+ gfp_flags | __GFP_NOWARN);
+ if (!storage) {
+ err = -ENOMEM;
+ goto uncharge;
+ }
+
+ RCU_INIT_POINTER(storage->smap, smap);
+ INIT_HLIST_HEAD(&storage->list);
+ raw_spin_lock_init(&storage->lock);
+ storage->owner = owner;
+ storage->use_kmalloc_nolock = smap->use_kmalloc_nolock;
+
+ bpf_selem_link_storage_nolock(storage, first_selem);
+ bpf_selem_link_map(smap, first_selem);
+
+ owner_storage_ptr =
+ (struct bpf_local_storage **)owner_storage(smap, owner);
+ /* Publish storage to the owner.
+ * Instead of using any lock of the kernel object (i.e. owner),
+ * cmpxchg will work with any kernel object regardless what
+ * the running context is, bh, irq...etc.
+ *
+ * From now on, the owner->storage pointer (e.g. sk->sk_bpf_storage)
+ * is protected by the storage->lock. Hence, when freeing
+ * the owner->storage, the storage->lock must be held before
+ * setting owner->storage ptr to NULL.
+ */
+ prev_storage = cmpxchg(owner_storage_ptr, NULL, storage);
+ if (unlikely(prev_storage)) {
+ bpf_selem_unlink_map(first_selem);
+ err = -EAGAIN;
+ goto uncharge;
+ }
+
+ return 0;
+
+uncharge:
+ bpf_local_storage_free(storage, true);
+ mem_uncharge(smap, owner, sizeof(*storage));
+ return err;
+}
+
+/* sk cannot be going away because it is linking new elem
+ * to sk->sk_bpf_storage. (i.e. sk->sk_refcnt cannot be 0).
+ * Otherwise, it will become a leak (and other memory issues
+ * during map destruction).
+ */
+struct bpf_local_storage_data *
+bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
+ void *value, u64 map_flags, bool swap_uptrs, gfp_t gfp_flags)
+{
+ struct bpf_local_storage_data *old_sdata = NULL;
+ struct bpf_local_storage_elem *alloc_selem, *selem = NULL;
+ struct bpf_local_storage *local_storage;
+ HLIST_HEAD(old_selem_free_list);
+ unsigned long flags;
+ int err;
+
+ /* BPF_EXIST and BPF_NOEXIST cannot be both set */
+ if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST) ||
+ /* BPF_F_LOCK can only be used in a value with spin_lock */
+ unlikely((map_flags & BPF_F_LOCK) &&
+ !btf_record_has_field(smap->map.record, BPF_SPIN_LOCK)))
+ return ERR_PTR(-EINVAL);
+
+ if (gfp_flags == GFP_KERNEL && (map_flags & ~BPF_F_LOCK) != BPF_NOEXIST)
+ return ERR_PTR(-EINVAL);
+
+ local_storage = rcu_dereference_check(*owner_storage(smap, owner),
+ bpf_rcu_lock_held());
+ if (!local_storage || hlist_empty(&local_storage->list)) {
+ /* Very first elem for the owner */
+ err = check_flags(NULL, map_flags);
+ if (err)
+ return ERR_PTR(err);
+
+ selem = bpf_selem_alloc(smap, owner, value, swap_uptrs, gfp_flags);
+ if (!selem)
+ return ERR_PTR(-ENOMEM);
+
+ err = bpf_local_storage_alloc(owner, smap, selem, gfp_flags);
+ if (err) {
+ bpf_selem_free(selem, true);
+ mem_uncharge(smap, owner, smap->elem_size);
+ return ERR_PTR(err);
+ }
+
+ return SDATA(selem);
+ }
+
+ if ((map_flags & BPF_F_LOCK) && !(map_flags & BPF_NOEXIST)) {
+ /* Hoping to find an old_sdata to do inline update
+ * such that it can avoid taking the local_storage->lock
+ * and changing the lists.
+ */
+ old_sdata =
+ bpf_local_storage_lookup(local_storage, smap, false);
+ err = check_flags(old_sdata, map_flags);
+ if (err)
+ return ERR_PTR(err);
+ if (old_sdata && selem_linked_to_storage_lockless(SELEM(old_sdata))) {
+ copy_map_value_locked(&smap->map, old_sdata->data,
+ value, false);
+ return old_sdata;
+ }
+ }
+
+ /* A lookup has just been done before and concluded a new selem is
+ * needed. The chance of an unnecessary alloc is unlikely.
+ */
+ alloc_selem = selem = bpf_selem_alloc(smap, owner, value, swap_uptrs, gfp_flags);
+ if (!alloc_selem)
+ return ERR_PTR(-ENOMEM);
+
+ raw_spin_lock_irqsave(&local_storage->lock, flags);
+
+ /* Recheck local_storage->list under local_storage->lock */
+ if (unlikely(hlist_empty(&local_storage->list))) {
+ /* A parallel del is happening and local_storage is going
+ * away. It has just been checked before, so very
+ * unlikely. Return instead of retry to keep things
+ * simple.
+ */
+ err = -EAGAIN;
+ goto unlock;
+ }
+
+ old_sdata = bpf_local_storage_lookup(local_storage, smap, false);
+ err = check_flags(old_sdata, map_flags);
+ if (err)
+ goto unlock;
+
+ if (old_sdata && (map_flags & BPF_F_LOCK)) {
+ copy_map_value_locked(&smap->map, old_sdata->data, value,
+ false);
+ selem = SELEM(old_sdata);
+ goto unlock;
+ }
+
+ alloc_selem = NULL;
+ /* First, link the new selem to the map */
+ bpf_selem_link_map(smap, selem);
+
+ /* Second, link (and publish) the new selem to local_storage */
+ bpf_selem_link_storage_nolock(local_storage, selem);
+
+ /* Third, remove old selem, SELEM(old_sdata) */
+ if (old_sdata) {
+ bpf_selem_unlink_map(SELEM(old_sdata));
+ bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata),
+ &old_selem_free_list);
+ }
+
+unlock:
+ raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+ bpf_selem_free_list(&old_selem_free_list, false);
+ if (alloc_selem) {
+ mem_uncharge(smap, owner, smap->elem_size);
+ bpf_selem_free(alloc_selem, true);
+ }
+ return err ? ERR_PTR(err) : SDATA(selem);
+}
+
+static u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache)
+{
+ u64 min_usage = U64_MAX;
+ u16 i, res = 0;
+
+ spin_lock(&cache->idx_lock);
+
+ for (i = 0; i < BPF_LOCAL_STORAGE_CACHE_SIZE; i++) {
+ if (cache->idx_usage_counts[i] < min_usage) {
+ min_usage = cache->idx_usage_counts[i];
+ res = i;
+
+ /* Found a free cache_idx */
+ if (!min_usage)
+ break;
+ }
+ }
+ cache->idx_usage_counts[res]++;
+
+ spin_unlock(&cache->idx_lock);
+
+ return res;
+}
+
+static void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache,
+ u16 idx)
+{
+ spin_lock(&cache->idx_lock);
+ cache->idx_usage_counts[idx]--;
+ spin_unlock(&cache->idx_lock);
+}
+
+int bpf_local_storage_map_alloc_check(union bpf_attr *attr)
+{
+ if (attr->map_flags & ~BPF_LOCAL_STORAGE_CREATE_FLAG_MASK ||
+ !(attr->map_flags & BPF_F_NO_PREALLOC) ||
+ attr->max_entries ||
+ attr->key_size != sizeof(int) || !attr->value_size ||
+ /* Enforce BTF for userspace sk dumping */
+ !attr->btf_key_type_id || !attr->btf_value_type_id)
+ return -EINVAL;
+
+ if (attr->value_size > BPF_LOCAL_STORAGE_MAX_VALUE_SIZE)
+ return -E2BIG;
+
+ return 0;
+}
+
+int bpf_local_storage_map_check_btf(const struct bpf_map *map,
+ const struct btf *btf,
+ const struct btf_type *key_type,
+ const struct btf_type *value_type)
+{
+ if (!btf_type_is_i32(key_type))
+ return -EINVAL;
+
+ return 0;
+}
+
+void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
+{
+ struct bpf_local_storage_elem *selem;
+ bool free_storage = false;
+ HLIST_HEAD(free_selem_list);
+ struct hlist_node *n;
+ unsigned long flags;
+
+ /* Neither the bpf_prog nor the bpf_map's syscall
+ * could be modifying the local_storage->list now.
+ * Thus, no elem can be added to or deleted from the
+ * local_storage->list by the bpf_prog or by the bpf_map's syscall.
+ *
+ * It is racing with bpf_local_storage_map_free() alone
+ * when unlinking elem from the local_storage->list and
+ * the map's bucket->list.
+ */
+ raw_spin_lock_irqsave(&local_storage->lock, flags);
+ hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) {
+ /* Always unlink from map before unlinking from
+ * local_storage.
+ */
+ bpf_selem_unlink_map(selem);
+ /* If local_storage list has only one element, the
+ * bpf_selem_unlink_storage_nolock() will return true.
+ * Otherwise, it will return false. The current loop iteration
+ * intends to remove all local storage. So the last iteration
+ * of the loop will set the free_cgroup_storage to true.
+ */
+ free_storage = bpf_selem_unlink_storage_nolock(
+ local_storage, selem, &free_selem_list);
+ }
+ raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+
+ bpf_selem_free_list(&free_selem_list, true);
+
+ if (free_storage)
+ bpf_local_storage_free(local_storage, true);
+}
+
+u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map)
+{
+ struct bpf_local_storage_map *smap = (struct bpf_local_storage_map *)map;
+ u64 usage = sizeof(*smap);
+
+ /* The dynamically callocated selems are not counted currently. */
+ usage += sizeof(*smap->buckets) * (1ULL << smap->bucket_log);
+ return usage;
+}
+
+struct bpf_map *
+bpf_local_storage_map_alloc(union bpf_attr *attr,
+ struct bpf_local_storage_cache *cache,
+ bool use_kmalloc_nolock)
+{
+ struct bpf_local_storage_map *smap;
+ unsigned int i;
+ u32 nbuckets;
+ int err;
+
+ smap = bpf_map_area_alloc(sizeof(*smap), NUMA_NO_NODE);
+ if (!smap)
+ return ERR_PTR(-ENOMEM);
+ bpf_map_init_from_attr(&smap->map, attr);
+
+ nbuckets = roundup_pow_of_two(num_possible_cpus());
+ /* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */
+ nbuckets = max_t(u32, 2, nbuckets);
+ smap->bucket_log = ilog2(nbuckets);
+
+ smap->buckets = bpf_map_kvcalloc(&smap->map, nbuckets,
+ sizeof(*smap->buckets), GFP_USER | __GFP_NOWARN);
+ if (!smap->buckets) {
+ err = -ENOMEM;
+ goto free_smap;
+ }
+
+ for (i = 0; i < nbuckets; i++) {
+ INIT_HLIST_HEAD(&smap->buckets[i].list);
+ raw_spin_lock_init(&smap->buckets[i].lock);
+ }
+
+ smap->elem_size = offsetof(struct bpf_local_storage_elem,
+ sdata.data[attr->value_size]);
+
+ /* In PREEMPT_RT, kmalloc(GFP_ATOMIC) is still not safe in non
+ * preemptible context. Thus, enforce all storages to use
+ * kmalloc_nolock() when CONFIG_PREEMPT_RT is enabled.
+ */
+ smap->use_kmalloc_nolock = IS_ENABLED(CONFIG_PREEMPT_RT) ? true : use_kmalloc_nolock;
+
+ smap->cache_idx = bpf_local_storage_cache_idx_get(cache);
+ return &smap->map;
+
+free_smap:
+ kvfree(smap->buckets);
+ bpf_map_area_free(smap);
+ return ERR_PTR(err);
+}
+
+void bpf_local_storage_map_free(struct bpf_map *map,
+ struct bpf_local_storage_cache *cache,
+ int __percpu *busy_counter)
+{
+ struct bpf_local_storage_map_bucket *b;
+ struct bpf_local_storage_elem *selem;
+ struct bpf_local_storage_map *smap;
+ unsigned int i;
+
+ smap = (struct bpf_local_storage_map *)map;
+ bpf_local_storage_cache_idx_free(cache, smap->cache_idx);
+
+ /* Note that this map might be concurrently cloned from
+ * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone
+ * RCU read section to finish before proceeding. New RCU
+ * read sections should be prevented via bpf_map_inc_not_zero.
+ */
+ synchronize_rcu();
+
+ /* bpf prog and the userspace can no longer access this map
+ * now. No new selem (of this map) can be added
+ * to the owner->storage or to the map bucket's list.
+ *
+ * The elem of this map can be cleaned up here
+ * or when the storage is freed e.g.
+ * by bpf_sk_storage_free() during __sk_destruct().
+ */
+ for (i = 0; i < (1U << smap->bucket_log); i++) {
+ b = &smap->buckets[i];
+
+ rcu_read_lock();
+ /* No one is adding to b->list now */
+ while ((selem = hlist_entry_safe(
+ rcu_dereference_raw(hlist_first_rcu(&b->list)),
+ struct bpf_local_storage_elem, map_node))) {
+ if (busy_counter)
+ this_cpu_inc(*busy_counter);
+ bpf_selem_unlink(selem, true);
+ if (busy_counter)
+ this_cpu_dec(*busy_counter);
+ cond_resched_rcu();
+ }
+ rcu_read_unlock();
+ }
+
+ /* While freeing the storage we may still need to access the map.
+ *
+ * e.g. when bpf_sk_storage_free() has unlinked selem from the map
+ * which then made the above while((selem = ...)) loop
+ * exit immediately.
+ *
+ * However, while freeing the storage one still needs to access the
+ * smap->elem_size to do the uncharging in
+ * bpf_selem_unlink_storage_nolock().
+ *
+ * Hence, wait another rcu grace period for the storage to be freed.
+ */
+ synchronize_rcu();
+
+ if (smap->use_kmalloc_nolock) {
+ rcu_barrier_tasks_trace();
+ rcu_barrier();
+ }
+ kvfree(smap->buckets);
+ bpf_map_area_free(smap);
+}
diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c
new file mode 100644
index 000000000000..e7a2fc60523f
--- /dev/null
+++ b/kernel/bpf/bpf_lru_list.c
@@ -0,0 +1,695 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2016 Facebook
+ */
+#include <linux/cpumask.h>
+#include <linux/spinlock.h>
+#include <linux/percpu.h>
+
+#include "bpf_lru_list.h"
+
+#define LOCAL_FREE_TARGET (128)
+#define LOCAL_NR_SCANS LOCAL_FREE_TARGET
+
+#define PERCPU_FREE_TARGET (4)
+#define PERCPU_NR_SCANS PERCPU_FREE_TARGET
+
+/* Helpers to get the local list index */
+#define LOCAL_LIST_IDX(t) ((t) - BPF_LOCAL_LIST_T_OFFSET)
+#define LOCAL_FREE_LIST_IDX LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_FREE)
+#define LOCAL_PENDING_LIST_IDX LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_PENDING)
+#define IS_LOCAL_LIST_TYPE(t) ((t) >= BPF_LOCAL_LIST_T_OFFSET)
+
+/* Local list helpers */
+static struct list_head *local_free_list(struct bpf_lru_locallist *loc_l)
+{
+ return &loc_l->lists[LOCAL_FREE_LIST_IDX];
+}
+
+static struct list_head *local_pending_list(struct bpf_lru_locallist *loc_l)
+{
+ return &loc_l->lists[LOCAL_PENDING_LIST_IDX];
+}
+
+/* bpf_lru_node helpers */
+static bool bpf_lru_node_is_ref(const struct bpf_lru_node *node)
+{
+ return READ_ONCE(node->ref);
+}
+
+static void bpf_lru_node_clear_ref(struct bpf_lru_node *node)
+{
+ WRITE_ONCE(node->ref, 0);
+}
+
+static void bpf_lru_list_count_inc(struct bpf_lru_list *l,
+ enum bpf_lru_list_type type)
+{
+ if (type < NR_BPF_LRU_LIST_COUNT)
+ l->counts[type]++;
+}
+
+static void bpf_lru_list_count_dec(struct bpf_lru_list *l,
+ enum bpf_lru_list_type type)
+{
+ if (type < NR_BPF_LRU_LIST_COUNT)
+ l->counts[type]--;
+}
+
+static void __bpf_lru_node_move_to_free(struct bpf_lru_list *l,
+ struct bpf_lru_node *node,
+ struct list_head *free_list,
+ enum bpf_lru_list_type tgt_free_type)
+{
+ if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type)))
+ return;
+
+ /* If the removing node is the next_inactive_rotation candidate,
+ * move the next_inactive_rotation pointer also.
+ */
+ if (&node->list == l->next_inactive_rotation)
+ l->next_inactive_rotation = l->next_inactive_rotation->prev;
+
+ bpf_lru_list_count_dec(l, node->type);
+
+ node->type = tgt_free_type;
+ list_move(&node->list, free_list);
+}
+
+/* Move nodes from local list to the LRU list */
+static void __bpf_lru_node_move_in(struct bpf_lru_list *l,
+ struct bpf_lru_node *node,
+ enum bpf_lru_list_type tgt_type)
+{
+ if (WARN_ON_ONCE(!IS_LOCAL_LIST_TYPE(node->type)) ||
+ WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(tgt_type)))
+ return;
+
+ bpf_lru_list_count_inc(l, tgt_type);
+ node->type = tgt_type;
+ bpf_lru_node_clear_ref(node);
+ list_move(&node->list, &l->lists[tgt_type]);
+}
+
+/* Move nodes between or within active and inactive list (like
+ * active to inactive, inactive to active or tail of active back to
+ * the head of active).
+ */
+static void __bpf_lru_node_move(struct bpf_lru_list *l,
+ struct bpf_lru_node *node,
+ enum bpf_lru_list_type tgt_type)
+{
+ if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type)) ||
+ WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(tgt_type)))
+ return;
+
+ if (node->type != tgt_type) {
+ bpf_lru_list_count_dec(l, node->type);
+ bpf_lru_list_count_inc(l, tgt_type);
+ node->type = tgt_type;
+ }
+ bpf_lru_node_clear_ref(node);
+
+ /* If the moving node is the next_inactive_rotation candidate,
+ * move the next_inactive_rotation pointer also.
+ */
+ if (&node->list == l->next_inactive_rotation)
+ l->next_inactive_rotation = l->next_inactive_rotation->prev;
+
+ list_move(&node->list, &l->lists[tgt_type]);
+}
+
+static bool bpf_lru_list_inactive_low(const struct bpf_lru_list *l)
+{
+ return l->counts[BPF_LRU_LIST_T_INACTIVE] <
+ l->counts[BPF_LRU_LIST_T_ACTIVE];
+}
+
+/* Rotate the active list:
+ * 1. Start from tail
+ * 2. If the node has the ref bit set, it will be rotated
+ * back to the head of active list with the ref bit cleared.
+ * Give this node one more chance to survive in the active list.
+ * 3. If the ref bit is not set, move it to the head of the
+ * inactive list.
+ * 4. It will at most scan nr_scans nodes
+ */
+static void __bpf_lru_list_rotate_active(struct bpf_lru *lru,
+ struct bpf_lru_list *l)
+{
+ struct list_head *active = &l->lists[BPF_LRU_LIST_T_ACTIVE];
+ struct bpf_lru_node *node, *tmp_node, *first_node;
+ unsigned int i = 0;
+
+ first_node = list_first_entry(active, struct bpf_lru_node, list);
+ list_for_each_entry_safe_reverse(node, tmp_node, active, list) {
+ if (bpf_lru_node_is_ref(node))
+ __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE);
+ else
+ __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE);
+
+ if (++i == lru->nr_scans || node == first_node)
+ break;
+ }
+}
+
+/* Rotate the inactive list. It starts from the next_inactive_rotation
+ * 1. If the node has ref bit set, it will be moved to the head
+ * of active list with the ref bit cleared.
+ * 2. If the node does not have ref bit set, it will leave it
+ * at its current location (i.e. do nothing) so that it can
+ * be considered during the next inactive_shrink.
+ * 3. It will at most scan nr_scans nodes
+ */
+static void __bpf_lru_list_rotate_inactive(struct bpf_lru *lru,
+ struct bpf_lru_list *l)
+{
+ struct list_head *inactive = &l->lists[BPF_LRU_LIST_T_INACTIVE];
+ struct list_head *cur, *last, *next = inactive;
+ struct bpf_lru_node *node;
+ unsigned int i = 0;
+
+ if (list_empty(inactive))
+ return;
+
+ last = l->next_inactive_rotation->next;
+ if (last == inactive)
+ last = last->next;
+
+ cur = l->next_inactive_rotation;
+ while (i < lru->nr_scans) {
+ if (cur == inactive) {
+ cur = cur->prev;
+ continue;
+ }
+
+ node = list_entry(cur, struct bpf_lru_node, list);
+ next = cur->prev;
+ if (bpf_lru_node_is_ref(node))
+ __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE);
+ if (cur == last)
+ break;
+ cur = next;
+ i++;
+ }
+
+ l->next_inactive_rotation = next;
+}
+
+/* Shrink the inactive list. It starts from the tail of the
+ * inactive list and only move the nodes without the ref bit
+ * set to the designated free list.
+ */
+static unsigned int
+__bpf_lru_list_shrink_inactive(struct bpf_lru *lru,
+ struct bpf_lru_list *l,
+ unsigned int tgt_nshrink,
+ struct list_head *free_list,
+ enum bpf_lru_list_type tgt_free_type)
+{
+ struct list_head *inactive = &l->lists[BPF_LRU_LIST_T_INACTIVE];
+ struct bpf_lru_node *node, *tmp_node;
+ unsigned int nshrinked = 0;
+ unsigned int i = 0;
+
+ list_for_each_entry_safe_reverse(node, tmp_node, inactive, list) {
+ if (bpf_lru_node_is_ref(node)) {
+ __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE);
+ } else if (lru->del_from_htab(lru->del_arg, node)) {
+ __bpf_lru_node_move_to_free(l, node, free_list,
+ tgt_free_type);
+ if (++nshrinked == tgt_nshrink)
+ break;
+ }
+
+ if (++i == lru->nr_scans)
+ break;
+ }
+
+ return nshrinked;
+}
+
+/* 1. Rotate the active list (if needed)
+ * 2. Always rotate the inactive list
+ */
+static void __bpf_lru_list_rotate(struct bpf_lru *lru, struct bpf_lru_list *l)
+{
+ if (bpf_lru_list_inactive_low(l))
+ __bpf_lru_list_rotate_active(lru, l);
+
+ __bpf_lru_list_rotate_inactive(lru, l);
+}
+
+/* Calls __bpf_lru_list_shrink_inactive() to shrink some
+ * ref-bit-cleared nodes and move them to the designated
+ * free list.
+ *
+ * If it cannot get a free node after calling
+ * __bpf_lru_list_shrink_inactive(). It will just remove
+ * one node from either inactive or active list without
+ * honoring the ref-bit. It prefers inactive list to active
+ * list in this situation.
+ */
+static unsigned int __bpf_lru_list_shrink(struct bpf_lru *lru,
+ struct bpf_lru_list *l,
+ unsigned int tgt_nshrink,
+ struct list_head *free_list,
+ enum bpf_lru_list_type tgt_free_type)
+
+{
+ struct bpf_lru_node *node, *tmp_node;
+ struct list_head *force_shrink_list;
+ unsigned int nshrinked;
+
+ nshrinked = __bpf_lru_list_shrink_inactive(lru, l, tgt_nshrink,
+ free_list, tgt_free_type);
+ if (nshrinked)
+ return nshrinked;
+
+ /* Do a force shrink by ignoring the reference bit */
+ if (!list_empty(&l->lists[BPF_LRU_LIST_T_INACTIVE]))
+ force_shrink_list = &l->lists[BPF_LRU_LIST_T_INACTIVE];
+ else
+ force_shrink_list = &l->lists[BPF_LRU_LIST_T_ACTIVE];
+
+ list_for_each_entry_safe_reverse(node, tmp_node, force_shrink_list,
+ list) {
+ if (lru->del_from_htab(lru->del_arg, node)) {
+ __bpf_lru_node_move_to_free(l, node, free_list,
+ tgt_free_type);
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+/* Flush the nodes from the local pending list to the LRU list */
+static void __local_list_flush(struct bpf_lru_list *l,
+ struct bpf_lru_locallist *loc_l)
+{
+ struct bpf_lru_node *node, *tmp_node;
+
+ list_for_each_entry_safe_reverse(node, tmp_node,
+ local_pending_list(loc_l), list) {
+ if (bpf_lru_node_is_ref(node))
+ __bpf_lru_node_move_in(l, node, BPF_LRU_LIST_T_ACTIVE);
+ else
+ __bpf_lru_node_move_in(l, node,
+ BPF_LRU_LIST_T_INACTIVE);
+ }
+}
+
+static void bpf_lru_list_push_free(struct bpf_lru_list *l,
+ struct bpf_lru_node *node)
+{
+ unsigned long flags;
+
+ if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type)))
+ return;
+
+ raw_spin_lock_irqsave(&l->lock, flags);
+ __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE);
+ raw_spin_unlock_irqrestore(&l->lock, flags);
+}
+
+static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru,
+ struct bpf_lru_locallist *loc_l)
+{
+ struct bpf_lru_list *l = &lru->common_lru.lru_list;
+ struct bpf_lru_node *node, *tmp_node;
+ unsigned int nfree = 0;
+
+ raw_spin_lock(&l->lock);
+
+ __local_list_flush(l, loc_l);
+
+ __bpf_lru_list_rotate(lru, l);
+
+ list_for_each_entry_safe(node, tmp_node, &l->lists[BPF_LRU_LIST_T_FREE],
+ list) {
+ __bpf_lru_node_move_to_free(l, node, local_free_list(loc_l),
+ BPF_LRU_LOCAL_LIST_T_FREE);
+ if (++nfree == lru->target_free)
+ break;
+ }
+
+ if (nfree < lru->target_free)
+ __bpf_lru_list_shrink(lru, l, lru->target_free - nfree,
+ local_free_list(loc_l),
+ BPF_LRU_LOCAL_LIST_T_FREE);
+
+ raw_spin_unlock(&l->lock);
+}
+
+static void __local_list_add_pending(struct bpf_lru *lru,
+ struct bpf_lru_locallist *loc_l,
+ int cpu,
+ struct bpf_lru_node *node,
+ u32 hash)
+{
+ *(u32 *)((void *)node + lru->hash_offset) = hash;
+ node->cpu = cpu;
+ node->type = BPF_LRU_LOCAL_LIST_T_PENDING;
+ bpf_lru_node_clear_ref(node);
+ list_add(&node->list, local_pending_list(loc_l));
+}
+
+static struct bpf_lru_node *
+__local_list_pop_free(struct bpf_lru_locallist *loc_l)
+{
+ struct bpf_lru_node *node;
+
+ node = list_first_entry_or_null(local_free_list(loc_l),
+ struct bpf_lru_node,
+ list);
+ if (node)
+ list_del(&node->list);
+
+ return node;
+}
+
+static struct bpf_lru_node *
+__local_list_pop_pending(struct bpf_lru *lru, struct bpf_lru_locallist *loc_l)
+{
+ struct bpf_lru_node *node;
+ bool force = false;
+
+ignore_ref:
+ /* Get from the tail (i.e. older element) of the pending list. */
+ list_for_each_entry_reverse(node, local_pending_list(loc_l),
+ list) {
+ if ((!bpf_lru_node_is_ref(node) || force) &&
+ lru->del_from_htab(lru->del_arg, node)) {
+ list_del(&node->list);
+ return node;
+ }
+ }
+
+ if (!force) {
+ force = true;
+ goto ignore_ref;
+ }
+
+ return NULL;
+}
+
+static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru,
+ u32 hash)
+{
+ struct list_head *free_list;
+ struct bpf_lru_node *node = NULL;
+ struct bpf_lru_list *l;
+ unsigned long flags;
+ int cpu = raw_smp_processor_id();
+
+ l = per_cpu_ptr(lru->percpu_lru, cpu);
+
+ raw_spin_lock_irqsave(&l->lock, flags);
+
+ __bpf_lru_list_rotate(lru, l);
+
+ free_list = &l->lists[BPF_LRU_LIST_T_FREE];
+ if (list_empty(free_list))
+ __bpf_lru_list_shrink(lru, l, PERCPU_FREE_TARGET, free_list,
+ BPF_LRU_LIST_T_FREE);
+
+ if (!list_empty(free_list)) {
+ node = list_first_entry(free_list, struct bpf_lru_node, list);
+ *(u32 *)((void *)node + lru->hash_offset) = hash;
+ bpf_lru_node_clear_ref(node);
+ __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE);
+ }
+
+ raw_spin_unlock_irqrestore(&l->lock, flags);
+
+ return node;
+}
+
+static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru,
+ u32 hash)
+{
+ struct bpf_lru_locallist *loc_l, *steal_loc_l;
+ struct bpf_common_lru *clru = &lru->common_lru;
+ struct bpf_lru_node *node;
+ int steal, first_steal;
+ unsigned long flags;
+ int cpu = raw_smp_processor_id();
+
+ loc_l = per_cpu_ptr(clru->local_list, cpu);
+
+ raw_spin_lock_irqsave(&loc_l->lock, flags);
+
+ node = __local_list_pop_free(loc_l);
+ if (!node) {
+ bpf_lru_list_pop_free_to_local(lru, loc_l);
+ node = __local_list_pop_free(loc_l);
+ }
+
+ if (node)
+ __local_list_add_pending(lru, loc_l, cpu, node, hash);
+
+ raw_spin_unlock_irqrestore(&loc_l->lock, flags);
+
+ if (node)
+ return node;
+
+ /* No free nodes found from the local free list and
+ * the global LRU list.
+ *
+ * Steal from the local free/pending list of the
+ * current CPU and remote CPU in RR. It starts
+ * with the loc_l->next_steal CPU.
+ */
+
+ first_steal = loc_l->next_steal;
+ steal = first_steal;
+ do {
+ steal_loc_l = per_cpu_ptr(clru->local_list, steal);
+
+ raw_spin_lock_irqsave(&steal_loc_l->lock, flags);
+
+ node = __local_list_pop_free(steal_loc_l);
+ if (!node)
+ node = __local_list_pop_pending(lru, steal_loc_l);
+
+ raw_spin_unlock_irqrestore(&steal_loc_l->lock, flags);
+
+ steal = cpumask_next_wrap(steal, cpu_possible_mask);
+ } while (!node && steal != first_steal);
+
+ loc_l->next_steal = steal;
+
+ if (node) {
+ raw_spin_lock_irqsave(&loc_l->lock, flags);
+ __local_list_add_pending(lru, loc_l, cpu, node, hash);
+ raw_spin_unlock_irqrestore(&loc_l->lock, flags);
+ }
+
+ return node;
+}
+
+struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash)
+{
+ if (lru->percpu)
+ return bpf_percpu_lru_pop_free(lru, hash);
+ else
+ return bpf_common_lru_pop_free(lru, hash);
+}
+
+static void bpf_common_lru_push_free(struct bpf_lru *lru,
+ struct bpf_lru_node *node)
+{
+ u8 node_type = READ_ONCE(node->type);
+ unsigned long flags;
+
+ if (WARN_ON_ONCE(node_type == BPF_LRU_LIST_T_FREE) ||
+ WARN_ON_ONCE(node_type == BPF_LRU_LOCAL_LIST_T_FREE))
+ return;
+
+ if (node_type == BPF_LRU_LOCAL_LIST_T_PENDING) {
+ struct bpf_lru_locallist *loc_l;
+
+ loc_l = per_cpu_ptr(lru->common_lru.local_list, node->cpu);
+
+ raw_spin_lock_irqsave(&loc_l->lock, flags);
+
+ if (unlikely(node->type != BPF_LRU_LOCAL_LIST_T_PENDING)) {
+ raw_spin_unlock_irqrestore(&loc_l->lock, flags);
+ goto check_lru_list;
+ }
+
+ node->type = BPF_LRU_LOCAL_LIST_T_FREE;
+ bpf_lru_node_clear_ref(node);
+ list_move(&node->list, local_free_list(loc_l));
+
+ raw_spin_unlock_irqrestore(&loc_l->lock, flags);
+ return;
+ }
+
+check_lru_list:
+ bpf_lru_list_push_free(&lru->common_lru.lru_list, node);
+}
+
+static void bpf_percpu_lru_push_free(struct bpf_lru *lru,
+ struct bpf_lru_node *node)
+{
+ struct bpf_lru_list *l;
+ unsigned long flags;
+
+ l = per_cpu_ptr(lru->percpu_lru, node->cpu);
+
+ raw_spin_lock_irqsave(&l->lock, flags);
+
+ __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE);
+
+ raw_spin_unlock_irqrestore(&l->lock, flags);
+}
+
+void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node)
+{
+ if (lru->percpu)
+ bpf_percpu_lru_push_free(lru, node);
+ else
+ bpf_common_lru_push_free(lru, node);
+}
+
+static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf,
+ u32 node_offset, u32 elem_size,
+ u32 nr_elems)
+{
+ struct bpf_lru_list *l = &lru->common_lru.lru_list;
+ u32 i;
+
+ for (i = 0; i < nr_elems; i++) {
+ struct bpf_lru_node *node;
+
+ node = (struct bpf_lru_node *)(buf + node_offset);
+ node->type = BPF_LRU_LIST_T_FREE;
+ bpf_lru_node_clear_ref(node);
+ list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]);
+ buf += elem_size;
+ }
+
+ lru->target_free = clamp((nr_elems / num_possible_cpus()) / 2,
+ 1, LOCAL_FREE_TARGET);
+}
+
+static void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf,
+ u32 node_offset, u32 elem_size,
+ u32 nr_elems)
+{
+ u32 i, pcpu_entries;
+ int cpu;
+ struct bpf_lru_list *l;
+
+ pcpu_entries = nr_elems / num_possible_cpus();
+
+ i = 0;
+
+ for_each_possible_cpu(cpu) {
+ struct bpf_lru_node *node;
+
+ l = per_cpu_ptr(lru->percpu_lru, cpu);
+again:
+ node = (struct bpf_lru_node *)(buf + node_offset);
+ node->cpu = cpu;
+ node->type = BPF_LRU_LIST_T_FREE;
+ bpf_lru_node_clear_ref(node);
+ list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]);
+ i++;
+ buf += elem_size;
+ if (i == nr_elems)
+ break;
+ if (i % pcpu_entries)
+ goto again;
+ }
+}
+
+void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset,
+ u32 elem_size, u32 nr_elems)
+{
+ if (lru->percpu)
+ bpf_percpu_lru_populate(lru, buf, node_offset, elem_size,
+ nr_elems);
+ else
+ bpf_common_lru_populate(lru, buf, node_offset, elem_size,
+ nr_elems);
+}
+
+static void bpf_lru_locallist_init(struct bpf_lru_locallist *loc_l, int cpu)
+{
+ int i;
+
+ for (i = 0; i < NR_BPF_LRU_LOCAL_LIST_T; i++)
+ INIT_LIST_HEAD(&loc_l->lists[i]);
+
+ loc_l->next_steal = cpu;
+
+ raw_spin_lock_init(&loc_l->lock);
+}
+
+static void bpf_lru_list_init(struct bpf_lru_list *l)
+{
+ int i;
+
+ for (i = 0; i < NR_BPF_LRU_LIST_T; i++)
+ INIT_LIST_HEAD(&l->lists[i]);
+
+ for (i = 0; i < NR_BPF_LRU_LIST_COUNT; i++)
+ l->counts[i] = 0;
+
+ l->next_inactive_rotation = &l->lists[BPF_LRU_LIST_T_INACTIVE];
+
+ raw_spin_lock_init(&l->lock);
+}
+
+int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset,
+ del_from_htab_func del_from_htab, void *del_arg)
+{
+ int cpu;
+
+ if (percpu) {
+ lru->percpu_lru = alloc_percpu(struct bpf_lru_list);
+ if (!lru->percpu_lru)
+ return -ENOMEM;
+
+ for_each_possible_cpu(cpu) {
+ struct bpf_lru_list *l;
+
+ l = per_cpu_ptr(lru->percpu_lru, cpu);
+ bpf_lru_list_init(l);
+ }
+ lru->nr_scans = PERCPU_NR_SCANS;
+ } else {
+ struct bpf_common_lru *clru = &lru->common_lru;
+
+ clru->local_list = alloc_percpu(struct bpf_lru_locallist);
+ if (!clru->local_list)
+ return -ENOMEM;
+
+ for_each_possible_cpu(cpu) {
+ struct bpf_lru_locallist *loc_l;
+
+ loc_l = per_cpu_ptr(clru->local_list, cpu);
+ bpf_lru_locallist_init(loc_l, cpu);
+ }
+
+ bpf_lru_list_init(&clru->lru_list);
+ lru->nr_scans = LOCAL_NR_SCANS;
+ }
+
+ lru->percpu = percpu;
+ lru->del_from_htab = del_from_htab;
+ lru->del_arg = del_arg;
+ lru->hash_offset = hash_offset;
+
+ return 0;
+}
+
+void bpf_lru_destroy(struct bpf_lru *lru)
+{
+ if (lru->percpu)
+ free_percpu(lru->percpu_lru);
+ else
+ free_percpu(lru->common_lru.local_list);
+}
diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h
new file mode 100644
index 000000000000..fe2661a58ea9
--- /dev/null
+++ b/kernel/bpf/bpf_lru_list.h
@@ -0,0 +1,80 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright (c) 2016 Facebook
+ */
+#ifndef __BPF_LRU_LIST_H_
+#define __BPF_LRU_LIST_H_
+
+#include <linux/cache.h>
+#include <linux/list.h>
+#include <linux/spinlock_types.h>
+
+#define NR_BPF_LRU_LIST_T (3)
+#define NR_BPF_LRU_LIST_COUNT (2)
+#define NR_BPF_LRU_LOCAL_LIST_T (2)
+#define BPF_LOCAL_LIST_T_OFFSET NR_BPF_LRU_LIST_T
+
+enum bpf_lru_list_type {
+ BPF_LRU_LIST_T_ACTIVE,
+ BPF_LRU_LIST_T_INACTIVE,
+ BPF_LRU_LIST_T_FREE,
+ BPF_LRU_LOCAL_LIST_T_FREE,
+ BPF_LRU_LOCAL_LIST_T_PENDING,
+};
+
+struct bpf_lru_node {
+ struct list_head list;
+ u16 cpu;
+ u8 type;
+ u8 ref;
+};
+
+struct bpf_lru_list {
+ struct list_head lists[NR_BPF_LRU_LIST_T];
+ unsigned int counts[NR_BPF_LRU_LIST_COUNT];
+ /* The next inactive list rotation starts from here */
+ struct list_head *next_inactive_rotation;
+
+ raw_spinlock_t lock ____cacheline_aligned_in_smp;
+};
+
+struct bpf_lru_locallist {
+ struct list_head lists[NR_BPF_LRU_LOCAL_LIST_T];
+ u16 next_steal;
+ raw_spinlock_t lock;
+};
+
+struct bpf_common_lru {
+ struct bpf_lru_list lru_list;
+ struct bpf_lru_locallist __percpu *local_list;
+};
+
+typedef bool (*del_from_htab_func)(void *arg, struct bpf_lru_node *node);
+
+struct bpf_lru {
+ union {
+ struct bpf_common_lru common_lru;
+ struct bpf_lru_list __percpu *percpu_lru;
+ };
+ del_from_htab_func del_from_htab;
+ void *del_arg;
+ unsigned int hash_offset;
+ unsigned int target_free;
+ unsigned int nr_scans;
+ bool percpu;
+};
+
+static inline void bpf_lru_node_set_ref(struct bpf_lru_node *node)
+{
+ if (!READ_ONCE(node->ref))
+ WRITE_ONCE(node->ref, 1);
+}
+
+int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset,
+ del_from_htab_func del_from_htab, void *delete_arg);
+void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset,
+ u32 elem_size, u32 nr_elems);
+void bpf_lru_destroy(struct bpf_lru *lru);
+struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash);
+void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node);
+
+#endif
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
new file mode 100644
index 000000000000..7cb6e8d4282c
--- /dev/null
+++ b/kernel/bpf/bpf_lsm.c
@@ -0,0 +1,448 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (C) 2020 Google LLC.
+ */
+
+#include <linux/filter.h>
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/binfmts.h>
+#include <linux/lsm_hooks.h>
+#include <linux/bpf_lsm.h>
+#include <linux/kallsyms.h>
+#include <net/bpf_sk_storage.h>
+#include <linux/bpf_local_storage.h>
+#include <linux/btf_ids.h>
+#include <linux/ima.h>
+#include <linux/bpf-cgroup.h>
+
+/* For every LSM hook that allows attachment of BPF programs, declare a nop
+ * function where a BPF program can be attached.
+ */
+#define LSM_HOOK(RET, DEFAULT, NAME, ...) \
+noinline RET bpf_lsm_##NAME(__VA_ARGS__) \
+{ \
+ return DEFAULT; \
+}
+
+#include <linux/lsm_hook_defs.h>
+#undef LSM_HOOK
+
+#define LSM_HOOK(RET, DEFAULT, NAME, ...) BTF_ID(func, bpf_lsm_##NAME)
+BTF_SET_START(bpf_lsm_hooks)
+#include <linux/lsm_hook_defs.h>
+#undef LSM_HOOK
+BTF_SET_END(bpf_lsm_hooks)
+
+BTF_SET_START(bpf_lsm_disabled_hooks)
+BTF_ID(func, bpf_lsm_vm_enough_memory)
+BTF_ID(func, bpf_lsm_inode_need_killpriv)
+BTF_ID(func, bpf_lsm_inode_getsecurity)
+BTF_ID(func, bpf_lsm_inode_listsecurity)
+BTF_ID(func, bpf_lsm_inode_copy_up_xattr)
+BTF_ID(func, bpf_lsm_getselfattr)
+BTF_ID(func, bpf_lsm_getprocattr)
+BTF_ID(func, bpf_lsm_setprocattr)
+#ifdef CONFIG_KEYS
+BTF_ID(func, bpf_lsm_key_getsecurity)
+#endif
+#ifdef CONFIG_AUDIT
+BTF_ID(func, bpf_lsm_audit_rule_match)
+#endif
+BTF_ID(func, bpf_lsm_ismaclabel)
+BTF_ID(func, bpf_lsm_file_alloc_security)
+BTF_SET_END(bpf_lsm_disabled_hooks)
+
+/* List of LSM hooks that should operate on 'current' cgroup regardless
+ * of function signature.
+ */
+BTF_SET_START(bpf_lsm_current_hooks)
+/* operate on freshly allocated sk without any cgroup association */
+#ifdef CONFIG_SECURITY_NETWORK
+BTF_ID(func, bpf_lsm_sk_alloc_security)
+BTF_ID(func, bpf_lsm_sk_free_security)
+#endif
+BTF_SET_END(bpf_lsm_current_hooks)
+
+/* List of LSM hooks that trigger while the socket is properly locked.
+ */
+BTF_SET_START(bpf_lsm_locked_sockopt_hooks)
+#ifdef CONFIG_SECURITY_NETWORK
+BTF_ID(func, bpf_lsm_sock_graft)
+BTF_ID(func, bpf_lsm_inet_csk_clone)
+BTF_ID(func, bpf_lsm_inet_conn_established)
+#endif
+BTF_SET_END(bpf_lsm_locked_sockopt_hooks)
+
+/* List of LSM hooks that trigger while the socket is _not_ locked,
+ * but it's ok to call bpf_{g,s}etsockopt because the socket is still
+ * in the early init phase.
+ */
+BTF_SET_START(bpf_lsm_unlocked_sockopt_hooks)
+#ifdef CONFIG_SECURITY_NETWORK
+BTF_ID(func, bpf_lsm_socket_post_create)
+BTF_ID(func, bpf_lsm_socket_socketpair)
+#endif
+BTF_SET_END(bpf_lsm_unlocked_sockopt_hooks)
+
+#ifdef CONFIG_CGROUP_BPF
+void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog,
+ bpf_func_t *bpf_func)
+{
+ const struct btf_param *args __maybe_unused;
+
+ if (btf_type_vlen(prog->aux->attach_func_proto) < 1 ||
+ btf_id_set_contains(&bpf_lsm_current_hooks,
+ prog->aux->attach_btf_id)) {
+ *bpf_func = __cgroup_bpf_run_lsm_current;
+ return;
+ }
+
+#ifdef CONFIG_NET
+ args = btf_params(prog->aux->attach_func_proto);
+
+ if (args[0].type == btf_sock_ids[BTF_SOCK_TYPE_SOCKET])
+ *bpf_func = __cgroup_bpf_run_lsm_socket;
+ else if (args[0].type == btf_sock_ids[BTF_SOCK_TYPE_SOCK])
+ *bpf_func = __cgroup_bpf_run_lsm_sock;
+ else
+#endif
+ *bpf_func = __cgroup_bpf_run_lsm_current;
+}
+#endif
+
+int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog,
+ const struct bpf_prog *prog)
+{
+ u32 btf_id = prog->aux->attach_btf_id;
+ const char *func_name = prog->aux->attach_func_name;
+
+ if (!prog->gpl_compatible) {
+ bpf_log(vlog,
+ "LSM programs must have a GPL compatible license\n");
+ return -EINVAL;
+ }
+
+ if (btf_id_set_contains(&bpf_lsm_disabled_hooks, btf_id)) {
+ bpf_log(vlog, "attach_btf_id %u points to disabled hook %s\n",
+ btf_id, func_name);
+ return -EINVAL;
+ }
+
+ if (!btf_id_set_contains(&bpf_lsm_hooks, btf_id)) {
+ bpf_log(vlog, "attach_btf_id %u points to wrong type name %s\n",
+ btf_id, func_name);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/* Mask for all the currently supported BPRM option flags */
+#define BPF_F_BRPM_OPTS_MASK BPF_F_BPRM_SECUREEXEC
+
+BPF_CALL_2(bpf_bprm_opts_set, struct linux_binprm *, bprm, u64, flags)
+{
+ if (flags & ~BPF_F_BRPM_OPTS_MASK)
+ return -EINVAL;
+
+ bprm->secureexec = (flags & BPF_F_BPRM_SECUREEXEC);
+ return 0;
+}
+
+BTF_ID_LIST_SINGLE(bpf_bprm_opts_set_btf_ids, struct, linux_binprm)
+
+static const struct bpf_func_proto bpf_bprm_opts_set_proto = {
+ .func = bpf_bprm_opts_set,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &bpf_bprm_opts_set_btf_ids[0],
+ .arg2_type = ARG_ANYTHING,
+};
+
+BPF_CALL_3(bpf_ima_inode_hash, struct inode *, inode, void *, dst, u32, size)
+{
+ return ima_inode_hash(inode, dst, size);
+}
+
+static bool bpf_ima_inode_hash_allowed(const struct bpf_prog *prog)
+{
+ return bpf_lsm_is_sleepable_hook(prog->aux->attach_btf_id);
+}
+
+BTF_ID_LIST_SINGLE(bpf_ima_inode_hash_btf_ids, struct, inode)
+
+static const struct bpf_func_proto bpf_ima_inode_hash_proto = {
+ .func = bpf_ima_inode_hash,
+ .gpl_only = false,
+ .might_sleep = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &bpf_ima_inode_hash_btf_ids[0],
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+ .allowed = bpf_ima_inode_hash_allowed,
+};
+
+BPF_CALL_3(bpf_ima_file_hash, struct file *, file, void *, dst, u32, size)
+{
+ return ima_file_hash(file, dst, size);
+}
+
+BTF_ID_LIST_SINGLE(bpf_ima_file_hash_btf_ids, struct, file)
+
+static const struct bpf_func_proto bpf_ima_file_hash_proto = {
+ .func = bpf_ima_file_hash,
+ .gpl_only = false,
+ .might_sleep = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &bpf_ima_file_hash_btf_ids[0],
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+ .allowed = bpf_ima_inode_hash_allowed,
+};
+
+BPF_CALL_1(bpf_get_attach_cookie, void *, ctx)
+{
+ struct bpf_trace_run_ctx *run_ctx;
+
+ run_ctx = container_of(current->bpf_ctx, struct bpf_trace_run_ctx, run_ctx);
+ return run_ctx->bpf_cookie;
+}
+
+static const struct bpf_func_proto bpf_get_attach_cookie_proto = {
+ .func = bpf_get_attach_cookie,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+static const struct bpf_func_proto *
+bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ const struct bpf_func_proto *func_proto;
+
+ if (prog->expected_attach_type == BPF_LSM_CGROUP) {
+ func_proto = cgroup_common_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+ }
+
+ switch (func_id) {
+ case BPF_FUNC_inode_storage_get:
+ return &bpf_inode_storage_get_proto;
+ case BPF_FUNC_inode_storage_delete:
+ return &bpf_inode_storage_delete_proto;
+#ifdef CONFIG_NET
+ case BPF_FUNC_sk_storage_get:
+ return &bpf_sk_storage_get_proto;
+ case BPF_FUNC_sk_storage_delete:
+ return &bpf_sk_storage_delete_proto;
+#endif /* CONFIG_NET */
+ case BPF_FUNC_spin_lock:
+ return &bpf_spin_lock_proto;
+ case BPF_FUNC_spin_unlock:
+ return &bpf_spin_unlock_proto;
+ case BPF_FUNC_bprm_opts_set:
+ return &bpf_bprm_opts_set_proto;
+ case BPF_FUNC_ima_inode_hash:
+ return &bpf_ima_inode_hash_proto;
+ case BPF_FUNC_ima_file_hash:
+ return &bpf_ima_file_hash_proto;
+ case BPF_FUNC_get_attach_cookie:
+ return bpf_prog_has_trampoline(prog) ? &bpf_get_attach_cookie_proto : NULL;
+#ifdef CONFIG_NET
+ case BPF_FUNC_setsockopt:
+ if (prog->expected_attach_type != BPF_LSM_CGROUP)
+ return NULL;
+ if (btf_id_set_contains(&bpf_lsm_locked_sockopt_hooks,
+ prog->aux->attach_btf_id))
+ return &bpf_sk_setsockopt_proto;
+ if (btf_id_set_contains(&bpf_lsm_unlocked_sockopt_hooks,
+ prog->aux->attach_btf_id))
+ return &bpf_unlocked_sk_setsockopt_proto;
+ return NULL;
+ case BPF_FUNC_getsockopt:
+ if (prog->expected_attach_type != BPF_LSM_CGROUP)
+ return NULL;
+ if (btf_id_set_contains(&bpf_lsm_locked_sockopt_hooks,
+ prog->aux->attach_btf_id))
+ return &bpf_sk_getsockopt_proto;
+ if (btf_id_set_contains(&bpf_lsm_unlocked_sockopt_hooks,
+ prog->aux->attach_btf_id))
+ return &bpf_unlocked_sk_getsockopt_proto;
+ return NULL;
+#endif
+ default:
+ return tracing_prog_func_proto(func_id, prog);
+ }
+}
+
+/* The set of hooks which are called without pagefaults disabled and are allowed
+ * to "sleep" and thus can be used for sleepable BPF programs.
+ */
+BTF_SET_START(sleepable_lsm_hooks)
+BTF_ID(func, bpf_lsm_bpf)
+BTF_ID(func, bpf_lsm_bpf_map)
+BTF_ID(func, bpf_lsm_bpf_map_create)
+BTF_ID(func, bpf_lsm_bpf_map_free)
+BTF_ID(func, bpf_lsm_bpf_prog)
+BTF_ID(func, bpf_lsm_bpf_prog_load)
+BTF_ID(func, bpf_lsm_bpf_prog_free)
+BTF_ID(func, bpf_lsm_bpf_token_create)
+BTF_ID(func, bpf_lsm_bpf_token_free)
+BTF_ID(func, bpf_lsm_bpf_token_cmd)
+BTF_ID(func, bpf_lsm_bpf_token_capable)
+BTF_ID(func, bpf_lsm_bprm_check_security)
+BTF_ID(func, bpf_lsm_bprm_committed_creds)
+BTF_ID(func, bpf_lsm_bprm_committing_creds)
+BTF_ID(func, bpf_lsm_bprm_creds_for_exec)
+BTF_ID(func, bpf_lsm_bprm_creds_from_file)
+BTF_ID(func, bpf_lsm_capget)
+BTF_ID(func, bpf_lsm_capset)
+BTF_ID(func, bpf_lsm_cred_prepare)
+BTF_ID(func, bpf_lsm_file_ioctl)
+BTF_ID(func, bpf_lsm_file_lock)
+BTF_ID(func, bpf_lsm_file_open)
+BTF_ID(func, bpf_lsm_file_post_open)
+BTF_ID(func, bpf_lsm_file_receive)
+
+BTF_ID(func, bpf_lsm_inode_create)
+BTF_ID(func, bpf_lsm_inode_free_security)
+BTF_ID(func, bpf_lsm_inode_getattr)
+BTF_ID(func, bpf_lsm_inode_getxattr)
+BTF_ID(func, bpf_lsm_inode_mknod)
+BTF_ID(func, bpf_lsm_inode_need_killpriv)
+BTF_ID(func, bpf_lsm_inode_post_setxattr)
+BTF_ID(func, bpf_lsm_inode_post_removexattr)
+BTF_ID(func, bpf_lsm_inode_readlink)
+BTF_ID(func, bpf_lsm_inode_removexattr)
+BTF_ID(func, bpf_lsm_inode_rename)
+BTF_ID(func, bpf_lsm_inode_rmdir)
+BTF_ID(func, bpf_lsm_inode_setattr)
+BTF_ID(func, bpf_lsm_inode_setxattr)
+BTF_ID(func, bpf_lsm_inode_symlink)
+BTF_ID(func, bpf_lsm_inode_unlink)
+BTF_ID(func, bpf_lsm_kernel_module_request)
+BTF_ID(func, bpf_lsm_kernel_read_file)
+BTF_ID(func, bpf_lsm_kernfs_init_security)
+
+#ifdef CONFIG_SECURITY_PATH
+BTF_ID(func, bpf_lsm_path_unlink)
+BTF_ID(func, bpf_lsm_path_mkdir)
+BTF_ID(func, bpf_lsm_path_rmdir)
+BTF_ID(func, bpf_lsm_path_truncate)
+BTF_ID(func, bpf_lsm_path_symlink)
+BTF_ID(func, bpf_lsm_path_link)
+BTF_ID(func, bpf_lsm_path_rename)
+BTF_ID(func, bpf_lsm_path_chmod)
+BTF_ID(func, bpf_lsm_path_chown)
+#endif /* CONFIG_SECURITY_PATH */
+
+BTF_ID(func, bpf_lsm_mmap_file)
+BTF_ID(func, bpf_lsm_netlink_send)
+BTF_ID(func, bpf_lsm_path_notify)
+BTF_ID(func, bpf_lsm_release_secctx)
+BTF_ID(func, bpf_lsm_sb_alloc_security)
+BTF_ID(func, bpf_lsm_sb_eat_lsm_opts)
+BTF_ID(func, bpf_lsm_sb_kern_mount)
+BTF_ID(func, bpf_lsm_sb_mount)
+BTF_ID(func, bpf_lsm_sb_remount)
+BTF_ID(func, bpf_lsm_sb_set_mnt_opts)
+BTF_ID(func, bpf_lsm_sb_show_options)
+BTF_ID(func, bpf_lsm_sb_statfs)
+BTF_ID(func, bpf_lsm_sb_umount)
+BTF_ID(func, bpf_lsm_settime)
+
+#ifdef CONFIG_SECURITY_NETWORK
+BTF_ID(func, bpf_lsm_inet_conn_established)
+
+BTF_ID(func, bpf_lsm_socket_accept)
+BTF_ID(func, bpf_lsm_socket_bind)
+BTF_ID(func, bpf_lsm_socket_connect)
+BTF_ID(func, bpf_lsm_socket_create)
+BTF_ID(func, bpf_lsm_socket_getpeername)
+BTF_ID(func, bpf_lsm_socket_getpeersec_dgram)
+BTF_ID(func, bpf_lsm_socket_getsockname)
+BTF_ID(func, bpf_lsm_socket_getsockopt)
+BTF_ID(func, bpf_lsm_socket_listen)
+BTF_ID(func, bpf_lsm_socket_post_create)
+BTF_ID(func, bpf_lsm_socket_recvmsg)
+BTF_ID(func, bpf_lsm_socket_sendmsg)
+BTF_ID(func, bpf_lsm_socket_shutdown)
+BTF_ID(func, bpf_lsm_socket_socketpair)
+#endif /* CONFIG_SECURITY_NETWORK */
+
+BTF_ID(func, bpf_lsm_syslog)
+BTF_ID(func, bpf_lsm_task_alloc)
+BTF_ID(func, bpf_lsm_task_prctl)
+BTF_ID(func, bpf_lsm_task_setscheduler)
+BTF_ID(func, bpf_lsm_task_to_inode)
+BTF_ID(func, bpf_lsm_userns_create)
+BTF_SET_END(sleepable_lsm_hooks)
+
+BTF_SET_START(untrusted_lsm_hooks)
+BTF_ID(func, bpf_lsm_bpf_map_free)
+BTF_ID(func, bpf_lsm_bpf_prog_free)
+BTF_ID(func, bpf_lsm_file_alloc_security)
+BTF_ID(func, bpf_lsm_file_free_security)
+#ifdef CONFIG_SECURITY_NETWORK
+BTF_ID(func, bpf_lsm_sk_alloc_security)
+BTF_ID(func, bpf_lsm_sk_free_security)
+#endif /* CONFIG_SECURITY_NETWORK */
+BTF_ID(func, bpf_lsm_task_free)
+BTF_SET_END(untrusted_lsm_hooks)
+
+bool bpf_lsm_is_sleepable_hook(u32 btf_id)
+{
+ return btf_id_set_contains(&sleepable_lsm_hooks, btf_id);
+}
+
+bool bpf_lsm_is_trusted(const struct bpf_prog *prog)
+{
+ return !btf_id_set_contains(&untrusted_lsm_hooks, prog->aux->attach_btf_id);
+}
+
+const struct bpf_prog_ops lsm_prog_ops = {
+};
+
+const struct bpf_verifier_ops lsm_verifier_ops = {
+ .get_func_proto = bpf_lsm_func_proto,
+ .is_valid_access = btf_ctx_access,
+};
+
+/* hooks return 0 or 1 */
+BTF_SET_START(bool_lsm_hooks)
+#ifdef CONFIG_SECURITY_NETWORK_XFRM
+BTF_ID(func, bpf_lsm_xfrm_state_pol_flow_match)
+#endif
+#ifdef CONFIG_AUDIT
+BTF_ID(func, bpf_lsm_audit_rule_known)
+#endif
+BTF_ID(func, bpf_lsm_inode_xattr_skipcap)
+BTF_SET_END(bool_lsm_hooks)
+
+int bpf_lsm_get_retval_range(const struct bpf_prog *prog,
+ struct bpf_retval_range *retval_range)
+{
+ /* no return value range for void hooks */
+ if (!prog->aux->attach_func_proto->type)
+ return -EINVAL;
+
+ if (btf_id_set_contains(&bool_lsm_hooks, prog->aux->attach_btf_id)) {
+ retval_range->minval = 0;
+ retval_range->maxval = 1;
+ } else {
+ /* All other available LSM hooks, except task_prctl, return 0
+ * on success and negative error code on failure.
+ * To keep things simple, we only allow bpf progs to return 0
+ * or negative errno for task_prctl too.
+ */
+ retval_range->minval = -MAX_ERRNO;
+ retval_range->maxval = 0;
+ }
+ return 0;
+}
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
new file mode 100644
index 000000000000..278490683d28
--- /dev/null
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -0,0 +1,1404 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2019 Facebook */
+
+#include <linux/bpf.h>
+#include <linux/bpf_verifier.h>
+#include <linux/btf.h>
+#include <linux/filter.h>
+#include <linux/slab.h>
+#include <linux/numa.h>
+#include <linux/seq_file.h>
+#include <linux/refcount.h>
+#include <linux/mutex.h>
+#include <linux/btf_ids.h>
+#include <linux/rcupdate_wait.h>
+#include <linux/poll.h>
+
+struct bpf_struct_ops_value {
+ struct bpf_struct_ops_common_value common;
+ char data[] ____cacheline_aligned_in_smp;
+};
+
+#define MAX_TRAMP_IMAGE_PAGES 8
+
+struct bpf_struct_ops_map {
+ struct bpf_map map;
+ const struct bpf_struct_ops_desc *st_ops_desc;
+ /* protect map_update */
+ struct mutex lock;
+ /* link has all the bpf_links that is populated
+ * to the func ptr of the kernel's struct
+ * (in kvalue.data).
+ */
+ struct bpf_link **links;
+ /* ksyms for bpf trampolines */
+ struct bpf_ksym **ksyms;
+ u32 funcs_cnt;
+ u32 image_pages_cnt;
+ /* image_pages is an array of pages that has all the trampolines
+ * that stores the func args before calling the bpf_prog.
+ */
+ void *image_pages[MAX_TRAMP_IMAGE_PAGES];
+ /* The owner moduler's btf. */
+ struct btf *btf;
+ /* uvalue->data stores the kernel struct
+ * (e.g. tcp_congestion_ops) that is more useful
+ * to userspace than the kvalue. For example,
+ * the bpf_prog's id is stored instead of the kernel
+ * address of a func ptr.
+ */
+ struct bpf_struct_ops_value *uvalue;
+ /* kvalue.data stores the actual kernel's struct
+ * (e.g. tcp_congestion_ops) that will be
+ * registered to the kernel subsystem.
+ */
+ struct bpf_struct_ops_value kvalue;
+};
+
+struct bpf_struct_ops_link {
+ struct bpf_link link;
+ struct bpf_map __rcu *map;
+ wait_queue_head_t wait_hup;
+};
+
+static DEFINE_MUTEX(update_mutex);
+
+#define VALUE_PREFIX "bpf_struct_ops_"
+#define VALUE_PREFIX_LEN (sizeof(VALUE_PREFIX) - 1)
+
+const struct bpf_verifier_ops bpf_struct_ops_verifier_ops = {
+};
+
+const struct bpf_prog_ops bpf_struct_ops_prog_ops = {
+#ifdef CONFIG_NET
+ .test_run = bpf_struct_ops_test_run,
+#endif
+};
+
+BTF_ID_LIST(st_ops_ids)
+BTF_ID(struct, module)
+BTF_ID(struct, bpf_struct_ops_common_value)
+
+enum {
+ IDX_MODULE_ID,
+ IDX_ST_OPS_COMMON_VALUE_ID,
+};
+
+extern struct btf *btf_vmlinux;
+
+static bool is_valid_value_type(struct btf *btf, s32 value_id,
+ const struct btf_type *type,
+ const char *value_name)
+{
+ const struct btf_type *common_value_type;
+ const struct btf_member *member;
+ const struct btf_type *vt, *mt;
+
+ vt = btf_type_by_id(btf, value_id);
+ if (btf_vlen(vt) != 2) {
+ pr_warn("The number of %s's members should be 2, but we get %d\n",
+ value_name, btf_vlen(vt));
+ return false;
+ }
+ member = btf_type_member(vt);
+ mt = btf_type_by_id(btf, member->type);
+ common_value_type = btf_type_by_id(btf_vmlinux,
+ st_ops_ids[IDX_ST_OPS_COMMON_VALUE_ID]);
+ if (mt != common_value_type) {
+ pr_warn("The first member of %s should be bpf_struct_ops_common_value\n",
+ value_name);
+ return false;
+ }
+ member++;
+ mt = btf_type_by_id(btf, member->type);
+ if (mt != type) {
+ pr_warn("The second member of %s should be %s\n",
+ value_name, btf_name_by_offset(btf, type->name_off));
+ return false;
+ }
+
+ return true;
+}
+
+static void *bpf_struct_ops_image_alloc(void)
+{
+ void *image;
+ int err;
+
+ err = bpf_jit_charge_modmem(PAGE_SIZE);
+ if (err)
+ return ERR_PTR(err);
+ image = arch_alloc_bpf_trampoline(PAGE_SIZE);
+ if (!image) {
+ bpf_jit_uncharge_modmem(PAGE_SIZE);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return image;
+}
+
+void bpf_struct_ops_image_free(void *image)
+{
+ if (image) {
+ arch_free_bpf_trampoline(image, PAGE_SIZE);
+ bpf_jit_uncharge_modmem(PAGE_SIZE);
+ }
+}
+
+#define MAYBE_NULL_SUFFIX "__nullable"
+#define REFCOUNTED_SUFFIX "__ref"
+
+/* Prepare argument info for every nullable argument of a member of a
+ * struct_ops type.
+ *
+ * Initialize a struct bpf_struct_ops_arg_info according to type info of
+ * the arguments of a stub function. (Check kCFI for more information about
+ * stub functions.)
+ *
+ * Each member in the struct_ops type has a struct bpf_struct_ops_arg_info
+ * to provide an array of struct bpf_ctx_arg_aux, which in turn provides
+ * the information that used by the verifier to check the arguments of the
+ * BPF struct_ops program assigned to the member. Here, we only care about
+ * the arguments that are marked as __nullable.
+ *
+ * The array of struct bpf_ctx_arg_aux is eventually assigned to
+ * prog->aux->ctx_arg_info of BPF struct_ops programs and passed to the
+ * verifier. (See check_struct_ops_btf_id())
+ *
+ * arg_info->info will be the list of struct bpf_ctx_arg_aux if success. If
+ * fails, it will be kept untouched.
+ */
+static int prepare_arg_info(struct btf *btf,
+ const char *st_ops_name,
+ const char *member_name,
+ const struct btf_type *func_proto, void *stub_func_addr,
+ struct bpf_struct_ops_arg_info *arg_info)
+{
+ const struct btf_type *stub_func_proto, *pointed_type;
+ bool is_nullable = false, is_refcounted = false;
+ const struct btf_param *stub_args, *args;
+ struct bpf_ctx_arg_aux *info, *info_buf;
+ u32 nargs, arg_no, info_cnt = 0;
+ char ksym[KSYM_SYMBOL_LEN];
+ const char *stub_fname;
+ const char *suffix;
+ s32 stub_func_id;
+ u32 arg_btf_id;
+ int offset;
+
+ stub_fname = kallsyms_lookup((unsigned long)stub_func_addr, NULL, NULL, NULL, ksym);
+ if (!stub_fname) {
+ pr_warn("Cannot find the stub function name for the %s in struct %s\n",
+ member_name, st_ops_name);
+ return -ENOENT;
+ }
+
+ stub_func_id = btf_find_by_name_kind(btf, stub_fname, BTF_KIND_FUNC);
+ if (stub_func_id < 0) {
+ pr_warn("Cannot find the stub function %s in btf\n", stub_fname);
+ return -ENOENT;
+ }
+
+ stub_func_proto = btf_type_by_id(btf, stub_func_id);
+ stub_func_proto = btf_type_by_id(btf, stub_func_proto->type);
+
+ /* Check if the number of arguments of the stub function is the same
+ * as the number of arguments of the function pointer.
+ */
+ nargs = btf_type_vlen(func_proto);
+ if (nargs != btf_type_vlen(stub_func_proto)) {
+ pr_warn("the number of arguments of the stub function %s does not match the number of arguments of the member %s of struct %s\n",
+ stub_fname, member_name, st_ops_name);
+ return -EINVAL;
+ }
+
+ if (!nargs)
+ return 0;
+
+ args = btf_params(func_proto);
+ stub_args = btf_params(stub_func_proto);
+
+ info_buf = kcalloc(nargs, sizeof(*info_buf), GFP_KERNEL);
+ if (!info_buf)
+ return -ENOMEM;
+
+ /* Prepare info for every nullable argument */
+ info = info_buf;
+ for (arg_no = 0; arg_no < nargs; arg_no++) {
+ /* Skip arguments that is not suffixed with
+ * "__nullable or __ref".
+ */
+ is_nullable = btf_param_match_suffix(btf, &stub_args[arg_no],
+ MAYBE_NULL_SUFFIX);
+ is_refcounted = btf_param_match_suffix(btf, &stub_args[arg_no],
+ REFCOUNTED_SUFFIX);
+
+ if (is_nullable)
+ suffix = MAYBE_NULL_SUFFIX;
+ else if (is_refcounted)
+ suffix = REFCOUNTED_SUFFIX;
+ else
+ continue;
+
+ /* Should be a pointer to struct */
+ pointed_type = btf_type_resolve_ptr(btf,
+ args[arg_no].type,
+ &arg_btf_id);
+ if (!pointed_type ||
+ !btf_type_is_struct(pointed_type)) {
+ pr_warn("stub function %s has %s tagging to an unsupported type\n",
+ stub_fname, suffix);
+ goto err_out;
+ }
+
+ offset = btf_ctx_arg_offset(btf, func_proto, arg_no);
+ if (offset < 0) {
+ pr_warn("stub function %s has an invalid trampoline ctx offset for arg#%u\n",
+ stub_fname, arg_no);
+ goto err_out;
+ }
+
+ if (args[arg_no].type != stub_args[arg_no].type) {
+ pr_warn("arg#%u type in stub function %s does not match with its original func_proto\n",
+ arg_no, stub_fname);
+ goto err_out;
+ }
+
+ /* Fill the information of the new argument */
+ info->btf_id = arg_btf_id;
+ info->btf = btf;
+ info->offset = offset;
+ if (is_nullable) {
+ info->reg_type = PTR_TRUSTED | PTR_TO_BTF_ID | PTR_MAYBE_NULL;
+ } else if (is_refcounted) {
+ info->reg_type = PTR_TRUSTED | PTR_TO_BTF_ID;
+ info->refcounted = true;
+ }
+
+ info++;
+ info_cnt++;
+ }
+
+ if (info_cnt) {
+ arg_info->info = info_buf;
+ arg_info->cnt = info_cnt;
+ } else {
+ kfree(info_buf);
+ }
+
+ return 0;
+
+err_out:
+ kfree(info_buf);
+
+ return -EINVAL;
+}
+
+/* Clean up the arg_info in a struct bpf_struct_ops_desc. */
+void bpf_struct_ops_desc_release(struct bpf_struct_ops_desc *st_ops_desc)
+{
+ struct bpf_struct_ops_arg_info *arg_info;
+ int i;
+
+ arg_info = st_ops_desc->arg_info;
+ for (i = 0; i < btf_type_vlen(st_ops_desc->type); i++)
+ kfree(arg_info[i].info);
+
+ kfree(arg_info);
+}
+
+static bool is_module_member(const struct btf *btf, u32 id)
+{
+ const struct btf_type *t;
+
+ t = btf_type_resolve_ptr(btf, id, NULL);
+ if (!t)
+ return false;
+
+ if (!__btf_type_is_struct(t) && !btf_type_is_fwd(t))
+ return false;
+
+ return !strcmp(btf_name_by_offset(btf, t->name_off), "module");
+}
+
+int bpf_struct_ops_supported(const struct bpf_struct_ops *st_ops, u32 moff)
+{
+ void *func_ptr = *(void **)(st_ops->cfi_stubs + moff);
+
+ return func_ptr ? 0 : -ENOTSUPP;
+}
+
+int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc,
+ struct btf *btf,
+ struct bpf_verifier_log *log)
+{
+ struct bpf_struct_ops *st_ops = st_ops_desc->st_ops;
+ struct bpf_struct_ops_arg_info *arg_info;
+ const struct btf_member *member;
+ const struct btf_type *t;
+ s32 type_id, value_id;
+ char value_name[128];
+ const char *mname;
+ int i, err;
+
+ if (strlen(st_ops->name) + VALUE_PREFIX_LEN >=
+ sizeof(value_name)) {
+ pr_warn("struct_ops name %s is too long\n",
+ st_ops->name);
+ return -EINVAL;
+ }
+ sprintf(value_name, "%s%s", VALUE_PREFIX, st_ops->name);
+
+ if (!st_ops->cfi_stubs) {
+ pr_warn("struct_ops for %s has no cfi_stubs\n", st_ops->name);
+ return -EINVAL;
+ }
+
+ type_id = btf_find_by_name_kind(btf, st_ops->name,
+ BTF_KIND_STRUCT);
+ if (type_id < 0) {
+ pr_warn("Cannot find struct %s in %s\n",
+ st_ops->name, btf_get_name(btf));
+ return -EINVAL;
+ }
+ t = btf_type_by_id(btf, type_id);
+ if (btf_type_vlen(t) > BPF_STRUCT_OPS_MAX_NR_MEMBERS) {
+ pr_warn("Cannot support #%u members in struct %s\n",
+ btf_type_vlen(t), st_ops->name);
+ return -EINVAL;
+ }
+
+ value_id = btf_find_by_name_kind(btf, value_name,
+ BTF_KIND_STRUCT);
+ if (value_id < 0) {
+ pr_warn("Cannot find struct %s in %s\n",
+ value_name, btf_get_name(btf));
+ return -EINVAL;
+ }
+ if (!is_valid_value_type(btf, value_id, t, value_name))
+ return -EINVAL;
+
+ arg_info = kcalloc(btf_type_vlen(t), sizeof(*arg_info),
+ GFP_KERNEL);
+ if (!arg_info)
+ return -ENOMEM;
+
+ st_ops_desc->arg_info = arg_info;
+ st_ops_desc->type = t;
+ st_ops_desc->type_id = type_id;
+ st_ops_desc->value_id = value_id;
+ st_ops_desc->value_type = btf_type_by_id(btf, value_id);
+
+ for_each_member(i, t, member) {
+ const struct btf_type *func_proto, *ret_type;
+ void **stub_func_addr;
+ u32 moff;
+
+ moff = __btf_member_bit_offset(t, member) / 8;
+ mname = btf_name_by_offset(btf, member->name_off);
+ if (!*mname) {
+ pr_warn("anon member in struct %s is not supported\n",
+ st_ops->name);
+ err = -EOPNOTSUPP;
+ goto errout;
+ }
+
+ if (__btf_member_bitfield_size(t, member)) {
+ pr_warn("bit field member %s in struct %s is not supported\n",
+ mname, st_ops->name);
+ err = -EOPNOTSUPP;
+ goto errout;
+ }
+
+ if (!st_ops_ids[IDX_MODULE_ID] && is_module_member(btf, member->type)) {
+ pr_warn("'struct module' btf id not found. Is CONFIG_MODULES enabled? bpf_struct_ops '%s' needs module support.\n",
+ st_ops->name);
+ err = -EOPNOTSUPP;
+ goto errout;
+ }
+
+ func_proto = btf_type_resolve_func_ptr(btf,
+ member->type,
+ NULL);
+
+ /* The member is not a function pointer or
+ * the function pointer is not supported.
+ */
+ if (!func_proto || bpf_struct_ops_supported(st_ops, moff))
+ continue;
+
+ if (func_proto->type) {
+ ret_type = btf_type_resolve_ptr(btf, func_proto->type, NULL);
+ if (ret_type && !__btf_type_is_struct(ret_type)) {
+ pr_warn("func ptr %s in struct %s returns non-struct pointer, which is not supported\n",
+ mname, st_ops->name);
+ err = -EOPNOTSUPP;
+ goto errout;
+ }
+ }
+
+ if (btf_distill_func_proto(log, btf,
+ func_proto, mname,
+ &st_ops->func_models[i])) {
+ pr_warn("Error in parsing func ptr %s in struct %s\n",
+ mname, st_ops->name);
+ err = -EINVAL;
+ goto errout;
+ }
+
+ stub_func_addr = *(void **)(st_ops->cfi_stubs + moff);
+ err = prepare_arg_info(btf, st_ops->name, mname,
+ func_proto, stub_func_addr,
+ arg_info + i);
+ if (err)
+ goto errout;
+ }
+
+ if (st_ops->init(btf)) {
+ pr_warn("Error in init bpf_struct_ops %s\n",
+ st_ops->name);
+ err = -EINVAL;
+ goto errout;
+ }
+
+ return 0;
+
+errout:
+ bpf_struct_ops_desc_release(st_ops_desc);
+
+ return err;
+}
+
+static int bpf_struct_ops_map_get_next_key(struct bpf_map *map, void *key,
+ void *next_key)
+{
+ if (key && *(u32 *)key == 0)
+ return -ENOENT;
+
+ *(u32 *)next_key = 0;
+ return 0;
+}
+
+int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
+ void *value)
+{
+ struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
+ struct bpf_struct_ops_value *uvalue, *kvalue;
+ enum bpf_struct_ops_state state;
+ s64 refcnt;
+
+ if (unlikely(*(u32 *)key != 0))
+ return -ENOENT;
+
+ kvalue = &st_map->kvalue;
+ /* Pair with smp_store_release() during map_update */
+ state = smp_load_acquire(&kvalue->common.state);
+ if (state == BPF_STRUCT_OPS_STATE_INIT) {
+ memset(value, 0, map->value_size);
+ return 0;
+ }
+
+ /* No lock is needed. state and refcnt do not need
+ * to be updated together under atomic context.
+ */
+ uvalue = value;
+ memcpy(uvalue, st_map->uvalue, map->value_size);
+ uvalue->common.state = state;
+
+ /* This value offers the user space a general estimate of how
+ * many sockets are still utilizing this struct_ops for TCP
+ * congestion control. The number might not be exact, but it
+ * should sufficiently meet our present goals.
+ */
+ refcnt = atomic64_read(&map->refcnt) - atomic64_read(&map->usercnt);
+ refcount_set(&uvalue->common.refcnt, max_t(s64, refcnt, 0));
+
+ return 0;
+}
+
+static void *bpf_struct_ops_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ return ERR_PTR(-EINVAL);
+}
+
+static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map)
+{
+ u32 i;
+
+ for (i = 0; i < st_map->funcs_cnt; i++) {
+ if (!st_map->links[i])
+ break;
+ bpf_link_put(st_map->links[i]);
+ st_map->links[i] = NULL;
+ }
+}
+
+static void bpf_struct_ops_map_free_image(struct bpf_struct_ops_map *st_map)
+{
+ int i;
+
+ for (i = 0; i < st_map->image_pages_cnt; i++)
+ bpf_struct_ops_image_free(st_map->image_pages[i]);
+ st_map->image_pages_cnt = 0;
+}
+
+static int check_zero_holes(const struct btf *btf, const struct btf_type *t, void *data)
+{
+ const struct btf_member *member;
+ u32 i, moff, msize, prev_mend = 0;
+ const struct btf_type *mtype;
+
+ for_each_member(i, t, member) {
+ moff = __btf_member_bit_offset(t, member) / 8;
+ if (moff > prev_mend &&
+ memchr_inv(data + prev_mend, 0, moff - prev_mend))
+ return -EINVAL;
+
+ mtype = btf_type_by_id(btf, member->type);
+ mtype = btf_resolve_size(btf, mtype, &msize);
+ if (IS_ERR(mtype))
+ return PTR_ERR(mtype);
+ prev_mend = moff + msize;
+ }
+
+ if (t->size > prev_mend &&
+ memchr_inv(data + prev_mend, 0, t->size - prev_mend))
+ return -EINVAL;
+
+ return 0;
+}
+
+static void bpf_struct_ops_link_release(struct bpf_link *link)
+{
+}
+
+static void bpf_struct_ops_link_dealloc(struct bpf_link *link)
+{
+ struct bpf_tramp_link *tlink = container_of(link, struct bpf_tramp_link, link);
+
+ kfree(tlink);
+}
+
+const struct bpf_link_ops bpf_struct_ops_link_lops = {
+ .release = bpf_struct_ops_link_release,
+ .dealloc = bpf_struct_ops_link_dealloc,
+};
+
+int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
+ struct bpf_tramp_link *link,
+ const struct btf_func_model *model,
+ void *stub_func,
+ void **_image, u32 *_image_off,
+ bool allow_alloc)
+{
+ u32 image_off = *_image_off, flags = BPF_TRAMP_F_INDIRECT;
+ void *image = *_image;
+ int size;
+
+ tlinks[BPF_TRAMP_FENTRY].links[0] = link;
+ tlinks[BPF_TRAMP_FENTRY].nr_links = 1;
+
+ if (model->ret_size > 0)
+ flags |= BPF_TRAMP_F_RET_FENTRY_RET;
+
+ size = arch_bpf_trampoline_size(model, flags, tlinks, stub_func);
+ if (size <= 0)
+ return size ? : -EFAULT;
+
+ /* Allocate image buffer if necessary */
+ if (!image || size > PAGE_SIZE - image_off) {
+ if (!allow_alloc)
+ return -E2BIG;
+
+ image = bpf_struct_ops_image_alloc();
+ if (IS_ERR(image))
+ return PTR_ERR(image);
+ image_off = 0;
+ }
+
+ size = arch_prepare_bpf_trampoline(NULL, image + image_off,
+ image + image_off + size,
+ model, flags, tlinks, stub_func);
+ if (size <= 0) {
+ if (image != *_image)
+ bpf_struct_ops_image_free(image);
+ return size ? : -EFAULT;
+ }
+
+ *_image = image;
+ *_image_off = image_off + size;
+ return 0;
+}
+
+static void bpf_struct_ops_ksym_init(const char *tname, const char *mname,
+ void *image, unsigned int size,
+ struct bpf_ksym *ksym)
+{
+ snprintf(ksym->name, KSYM_NAME_LEN, "bpf__%s_%s", tname, mname);
+ INIT_LIST_HEAD_RCU(&ksym->lnode);
+ bpf_image_ksym_init(image, size, ksym);
+}
+
+static void bpf_struct_ops_map_add_ksyms(struct bpf_struct_ops_map *st_map)
+{
+ u32 i;
+
+ for (i = 0; i < st_map->funcs_cnt; i++) {
+ if (!st_map->ksyms[i])
+ break;
+ bpf_image_ksym_add(st_map->ksyms[i]);
+ }
+}
+
+static void bpf_struct_ops_map_del_ksyms(struct bpf_struct_ops_map *st_map)
+{
+ u32 i;
+
+ for (i = 0; i < st_map->funcs_cnt; i++) {
+ if (!st_map->ksyms[i])
+ break;
+ bpf_image_ksym_del(st_map->ksyms[i]);
+ }
+}
+
+static void bpf_struct_ops_map_free_ksyms(struct bpf_struct_ops_map *st_map)
+{
+ u32 i;
+
+ for (i = 0; i < st_map->funcs_cnt; i++) {
+ if (!st_map->ksyms[i])
+ break;
+ kfree(st_map->ksyms[i]);
+ st_map->ksyms[i] = NULL;
+ }
+}
+
+static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
+{
+ struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
+ const struct bpf_struct_ops_desc *st_ops_desc = st_map->st_ops_desc;
+ const struct bpf_struct_ops *st_ops = st_ops_desc->st_ops;
+ struct bpf_struct_ops_value *uvalue, *kvalue;
+ const struct btf_type *module_type;
+ const struct btf_member *member;
+ const struct btf_type *t = st_ops_desc->type;
+ struct bpf_tramp_links *tlinks;
+ void *udata, *kdata;
+ int prog_fd, err;
+ u32 i, trampoline_start, image_off = 0;
+ void *cur_image = NULL, *image = NULL;
+ struct bpf_link **plink;
+ struct bpf_ksym **pksym;
+ const char *tname, *mname;
+
+ if (flags)
+ return -EINVAL;
+
+ if (*(u32 *)key != 0)
+ return -E2BIG;
+
+ err = check_zero_holes(st_map->btf, st_ops_desc->value_type, value);
+ if (err)
+ return err;
+
+ uvalue = value;
+ err = check_zero_holes(st_map->btf, t, uvalue->data);
+ if (err)
+ return err;
+
+ if (uvalue->common.state || refcount_read(&uvalue->common.refcnt))
+ return -EINVAL;
+
+ tlinks = kcalloc(BPF_TRAMP_MAX, sizeof(*tlinks), GFP_KERNEL);
+ if (!tlinks)
+ return -ENOMEM;
+
+ uvalue = (struct bpf_struct_ops_value *)st_map->uvalue;
+ kvalue = (struct bpf_struct_ops_value *)&st_map->kvalue;
+
+ mutex_lock(&st_map->lock);
+
+ if (kvalue->common.state != BPF_STRUCT_OPS_STATE_INIT) {
+ err = -EBUSY;
+ goto unlock;
+ }
+
+ memcpy(uvalue, value, map->value_size);
+
+ udata = &uvalue->data;
+ kdata = &kvalue->data;
+
+ plink = st_map->links;
+ pksym = st_map->ksyms;
+ tname = btf_name_by_offset(st_map->btf, t->name_off);
+ module_type = btf_type_by_id(btf_vmlinux, st_ops_ids[IDX_MODULE_ID]);
+ for_each_member(i, t, member) {
+ const struct btf_type *mtype, *ptype;
+ struct bpf_prog *prog;
+ struct bpf_tramp_link *link;
+ struct bpf_ksym *ksym;
+ u32 moff;
+
+ moff = __btf_member_bit_offset(t, member) / 8;
+ mname = btf_name_by_offset(st_map->btf, member->name_off);
+ ptype = btf_type_resolve_ptr(st_map->btf, member->type, NULL);
+ if (ptype == module_type) {
+ if (*(void **)(udata + moff))
+ goto reset_unlock;
+ *(void **)(kdata + moff) = BPF_MODULE_OWNER;
+ continue;
+ }
+
+ err = st_ops->init_member(t, member, kdata, udata);
+ if (err < 0)
+ goto reset_unlock;
+
+ /* The ->init_member() has handled this member */
+ if (err > 0)
+ continue;
+
+ /* If st_ops->init_member does not handle it,
+ * we will only handle func ptrs and zero-ed members
+ * here. Reject everything else.
+ */
+
+ /* All non func ptr member must be 0 */
+ if (!ptype || !btf_type_is_func_proto(ptype)) {
+ u32 msize;
+
+ mtype = btf_type_by_id(st_map->btf, member->type);
+ mtype = btf_resolve_size(st_map->btf, mtype, &msize);
+ if (IS_ERR(mtype)) {
+ err = PTR_ERR(mtype);
+ goto reset_unlock;
+ }
+
+ if (memchr_inv(udata + moff, 0, msize)) {
+ err = -EINVAL;
+ goto reset_unlock;
+ }
+
+ continue;
+ }
+
+ prog_fd = (int)(*(unsigned long *)(udata + moff));
+ /* Similar check as the attr->attach_prog_fd */
+ if (!prog_fd)
+ continue;
+
+ prog = bpf_prog_get(prog_fd);
+ if (IS_ERR(prog)) {
+ err = PTR_ERR(prog);
+ goto reset_unlock;
+ }
+
+ if (prog->type != BPF_PROG_TYPE_STRUCT_OPS ||
+ prog->aux->attach_btf_id != st_ops_desc->type_id ||
+ prog->expected_attach_type != i) {
+ bpf_prog_put(prog);
+ err = -EINVAL;
+ goto reset_unlock;
+ }
+
+ link = kzalloc(sizeof(*link), GFP_USER);
+ if (!link) {
+ bpf_prog_put(prog);
+ err = -ENOMEM;
+ goto reset_unlock;
+ }
+ bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS,
+ &bpf_struct_ops_link_lops, prog, prog->expected_attach_type);
+ *plink++ = &link->link;
+
+ ksym = kzalloc(sizeof(*ksym), GFP_USER);
+ if (!ksym) {
+ err = -ENOMEM;
+ goto reset_unlock;
+ }
+ *pksym++ = ksym;
+
+ trampoline_start = image_off;
+ err = bpf_struct_ops_prepare_trampoline(tlinks, link,
+ &st_ops->func_models[i],
+ *(void **)(st_ops->cfi_stubs + moff),
+ &image, &image_off,
+ st_map->image_pages_cnt < MAX_TRAMP_IMAGE_PAGES);
+ if (err)
+ goto reset_unlock;
+
+ if (cur_image != image) {
+ st_map->image_pages[st_map->image_pages_cnt++] = image;
+ cur_image = image;
+ trampoline_start = 0;
+ }
+
+ *(void **)(kdata + moff) = image + trampoline_start + cfi_get_offset();
+
+ /* put prog_id to udata */
+ *(unsigned long *)(udata + moff) = prog->aux->id;
+
+ /* init ksym for this trampoline */
+ bpf_struct_ops_ksym_init(tname, mname,
+ image + trampoline_start,
+ image_off - trampoline_start,
+ ksym);
+ }
+
+ if (st_ops->validate) {
+ err = st_ops->validate(kdata);
+ if (err)
+ goto reset_unlock;
+ }
+ for (i = 0; i < st_map->image_pages_cnt; i++) {
+ err = arch_protect_bpf_trampoline(st_map->image_pages[i],
+ PAGE_SIZE);
+ if (err)
+ goto reset_unlock;
+ }
+
+ if (st_map->map.map_flags & BPF_F_LINK) {
+ err = 0;
+ /* Let bpf_link handle registration & unregistration.
+ *
+ * Pair with smp_load_acquire() during lookup_elem().
+ */
+ smp_store_release(&kvalue->common.state, BPF_STRUCT_OPS_STATE_READY);
+ goto unlock;
+ }
+
+ err = st_ops->reg(kdata, NULL);
+ if (likely(!err)) {
+ /* This refcnt increment on the map here after
+ * 'st_ops->reg()' is secure since the state of the
+ * map must be set to INIT at this moment, and thus
+ * bpf_struct_ops_map_delete_elem() can't unregister
+ * or transition it to TOBEFREE concurrently.
+ */
+ bpf_map_inc(map);
+ /* Pair with smp_load_acquire() during lookup_elem().
+ * It ensures the above udata updates (e.g. prog->aux->id)
+ * can be seen once BPF_STRUCT_OPS_STATE_INUSE is set.
+ */
+ smp_store_release(&kvalue->common.state, BPF_STRUCT_OPS_STATE_INUSE);
+ goto unlock;
+ }
+
+ /* Error during st_ops->reg(). Can happen if this struct_ops needs to be
+ * verified as a whole, after all init_member() calls. Can also happen if
+ * there was a race in registering the struct_ops (under the same name) to
+ * a sub-system through different struct_ops's maps.
+ */
+
+reset_unlock:
+ bpf_struct_ops_map_free_ksyms(st_map);
+ bpf_struct_ops_map_free_image(st_map);
+ bpf_struct_ops_map_put_progs(st_map);
+ memset(uvalue, 0, map->value_size);
+ memset(kvalue, 0, map->value_size);
+unlock:
+ kfree(tlinks);
+ mutex_unlock(&st_map->lock);
+ if (!err)
+ bpf_struct_ops_map_add_ksyms(st_map);
+ return err;
+}
+
+static long bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
+{
+ enum bpf_struct_ops_state prev_state;
+ struct bpf_struct_ops_map *st_map;
+
+ st_map = (struct bpf_struct_ops_map *)map;
+ if (st_map->map.map_flags & BPF_F_LINK)
+ return -EOPNOTSUPP;
+
+ prev_state = cmpxchg(&st_map->kvalue.common.state,
+ BPF_STRUCT_OPS_STATE_INUSE,
+ BPF_STRUCT_OPS_STATE_TOBEFREE);
+ switch (prev_state) {
+ case BPF_STRUCT_OPS_STATE_INUSE:
+ st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data, NULL);
+ bpf_map_put(map);
+ return 0;
+ case BPF_STRUCT_OPS_STATE_TOBEFREE:
+ return -EINPROGRESS;
+ case BPF_STRUCT_OPS_STATE_INIT:
+ return -ENOENT;
+ default:
+ WARN_ON_ONCE(1);
+ /* Should never happen. Treat it as not found. */
+ return -ENOENT;
+ }
+}
+
+static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key,
+ struct seq_file *m)
+{
+ struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
+ void *value;
+ int err;
+
+ value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN);
+ if (!value)
+ return;
+
+ err = bpf_struct_ops_map_sys_lookup_elem(map, key, value);
+ if (!err) {
+ btf_type_seq_show(st_map->btf,
+ map->btf_vmlinux_value_type_id,
+ value, m);
+ seq_putc(m, '\n');
+ }
+
+ kfree(value);
+}
+
+static void __bpf_struct_ops_map_free(struct bpf_map *map)
+{
+ struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
+
+ if (st_map->links)
+ bpf_struct_ops_map_put_progs(st_map);
+ if (st_map->ksyms)
+ bpf_struct_ops_map_free_ksyms(st_map);
+ bpf_map_area_free(st_map->links);
+ bpf_map_area_free(st_map->ksyms);
+ bpf_struct_ops_map_free_image(st_map);
+ bpf_map_area_free(st_map->uvalue);
+ bpf_map_area_free(st_map);
+}
+
+static void bpf_struct_ops_map_free(struct bpf_map *map)
+{
+ struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
+
+ /* st_ops->owner was acquired during map_alloc to implicitly holds
+ * the btf's refcnt. The acquire was only done when btf_is_module()
+ * st_map->btf cannot be NULL here.
+ */
+ if (btf_is_module(st_map->btf))
+ module_put(st_map->st_ops_desc->st_ops->owner);
+
+ bpf_struct_ops_map_del_ksyms(st_map);
+
+ /* The struct_ops's function may switch to another struct_ops.
+ *
+ * For example, bpf_tcp_cc_x->init() may switch to
+ * another tcp_cc_y by calling
+ * setsockopt(TCP_CONGESTION, "tcp_cc_y").
+ * During the switch, bpf_struct_ops_put(tcp_cc_x) is called
+ * and its refcount may reach 0 which then free its
+ * trampoline image while tcp_cc_x is still running.
+ *
+ * A vanilla rcu gp is to wait for all bpf-tcp-cc prog
+ * to finish. bpf-tcp-cc prog is non sleepable.
+ * A rcu_tasks gp is to wait for the last few insn
+ * in the tramopline image to finish before releasing
+ * the trampoline image.
+ */
+ synchronize_rcu_mult(call_rcu, call_rcu_tasks);
+
+ __bpf_struct_ops_map_free(map);
+}
+
+static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
+{
+ if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 ||
+ (attr->map_flags & ~(BPF_F_LINK | BPF_F_VTYPE_BTF_OBJ_FD)) ||
+ !attr->btf_vmlinux_value_type_id)
+ return -EINVAL;
+ return 0;
+}
+
+static u32 count_func_ptrs(const struct btf *btf, const struct btf_type *t)
+{
+ int i;
+ u32 count;
+ const struct btf_member *member;
+
+ count = 0;
+ for_each_member(i, t, member)
+ if (btf_type_resolve_func_ptr(btf, member->type, NULL))
+ count++;
+ return count;
+}
+
+static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
+{
+ const struct bpf_struct_ops_desc *st_ops_desc;
+ size_t st_map_size;
+ struct bpf_struct_ops_map *st_map;
+ const struct btf_type *t, *vt;
+ struct module *mod = NULL;
+ struct bpf_map *map;
+ struct btf *btf;
+ int ret;
+
+ if (attr->map_flags & BPF_F_VTYPE_BTF_OBJ_FD) {
+ /* The map holds btf for its whole life time. */
+ btf = btf_get_by_fd(attr->value_type_btf_obj_fd);
+ if (IS_ERR(btf))
+ return ERR_CAST(btf);
+ if (!btf_is_module(btf)) {
+ btf_put(btf);
+ return ERR_PTR(-EINVAL);
+ }
+
+ mod = btf_try_get_module(btf);
+ /* mod holds a refcnt to btf. We don't need an extra refcnt
+ * here.
+ */
+ btf_put(btf);
+ if (!mod)
+ return ERR_PTR(-EINVAL);
+ } else {
+ btf = bpf_get_btf_vmlinux();
+ if (IS_ERR(btf))
+ return ERR_CAST(btf);
+ if (!btf)
+ return ERR_PTR(-ENOTSUPP);
+ }
+
+ st_ops_desc = bpf_struct_ops_find_value(btf, attr->btf_vmlinux_value_type_id);
+ if (!st_ops_desc) {
+ ret = -ENOTSUPP;
+ goto errout;
+ }
+
+ vt = st_ops_desc->value_type;
+ if (attr->value_size != vt->size) {
+ ret = -EINVAL;
+ goto errout;
+ }
+
+ t = st_ops_desc->type;
+
+ st_map_size = sizeof(*st_map) +
+ /* kvalue stores the
+ * struct bpf_struct_ops_tcp_congestions_ops
+ */
+ (vt->size - sizeof(struct bpf_struct_ops_value));
+
+ st_map = bpf_map_area_alloc(st_map_size, NUMA_NO_NODE);
+ if (!st_map) {
+ ret = -ENOMEM;
+ goto errout;
+ }
+
+ st_map->st_ops_desc = st_ops_desc;
+ map = &st_map->map;
+
+ st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE);
+ st_map->funcs_cnt = count_func_ptrs(btf, t);
+ st_map->links =
+ bpf_map_area_alloc(st_map->funcs_cnt * sizeof(struct bpf_link *),
+ NUMA_NO_NODE);
+
+ st_map->ksyms =
+ bpf_map_area_alloc(st_map->funcs_cnt * sizeof(struct bpf_ksym *),
+ NUMA_NO_NODE);
+ if (!st_map->uvalue || !st_map->links || !st_map->ksyms) {
+ ret = -ENOMEM;
+ goto errout_free;
+ }
+ st_map->btf = btf;
+
+ mutex_init(&st_map->lock);
+ bpf_map_init_from_attr(map, attr);
+
+ return map;
+
+errout_free:
+ __bpf_struct_ops_map_free(map);
+errout:
+ module_put(mod);
+
+ return ERR_PTR(ret);
+}
+
+static u64 bpf_struct_ops_map_mem_usage(const struct bpf_map *map)
+{
+ struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
+ const struct bpf_struct_ops_desc *st_ops_desc = st_map->st_ops_desc;
+ const struct btf_type *vt = st_ops_desc->value_type;
+ u64 usage;
+
+ usage = sizeof(*st_map) +
+ vt->size - sizeof(struct bpf_struct_ops_value);
+ usage += vt->size;
+ usage += st_map->funcs_cnt * sizeof(struct bpf_link *);
+ usage += st_map->funcs_cnt * sizeof(struct bpf_ksym *);
+ usage += PAGE_SIZE;
+ return usage;
+}
+
+BTF_ID_LIST_SINGLE(bpf_struct_ops_map_btf_ids, struct, bpf_struct_ops_map)
+const struct bpf_map_ops bpf_struct_ops_map_ops = {
+ .map_alloc_check = bpf_struct_ops_map_alloc_check,
+ .map_alloc = bpf_struct_ops_map_alloc,
+ .map_free = bpf_struct_ops_map_free,
+ .map_get_next_key = bpf_struct_ops_map_get_next_key,
+ .map_lookup_elem = bpf_struct_ops_map_lookup_elem,
+ .map_delete_elem = bpf_struct_ops_map_delete_elem,
+ .map_update_elem = bpf_struct_ops_map_update_elem,
+ .map_seq_show_elem = bpf_struct_ops_map_seq_show_elem,
+ .map_mem_usage = bpf_struct_ops_map_mem_usage,
+ .map_btf_id = &bpf_struct_ops_map_btf_ids[0],
+};
+
+/* "const void *" because some subsystem is
+ * passing a const (e.g. const struct tcp_congestion_ops *)
+ */
+bool bpf_struct_ops_get(const void *kdata)
+{
+ struct bpf_struct_ops_value *kvalue;
+ struct bpf_struct_ops_map *st_map;
+ struct bpf_map *map;
+
+ kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
+ st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);
+
+ map = __bpf_map_inc_not_zero(&st_map->map, false);
+ return !IS_ERR(map);
+}
+EXPORT_SYMBOL_GPL(bpf_struct_ops_get);
+
+void bpf_struct_ops_put(const void *kdata)
+{
+ struct bpf_struct_ops_value *kvalue;
+ struct bpf_struct_ops_map *st_map;
+
+ kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
+ st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);
+
+ bpf_map_put(&st_map->map);
+}
+EXPORT_SYMBOL_GPL(bpf_struct_ops_put);
+
+u32 bpf_struct_ops_id(const void *kdata)
+{
+ struct bpf_struct_ops_value *kvalue;
+ struct bpf_struct_ops_map *st_map;
+
+ kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
+ st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);
+
+ return st_map->map.id;
+}
+EXPORT_SYMBOL_GPL(bpf_struct_ops_id);
+
+static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map)
+{
+ struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
+
+ return map->map_type == BPF_MAP_TYPE_STRUCT_OPS &&
+ map->map_flags & BPF_F_LINK &&
+ /* Pair with smp_store_release() during map_update */
+ smp_load_acquire(&st_map->kvalue.common.state) == BPF_STRUCT_OPS_STATE_READY;
+}
+
+static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link)
+{
+ struct bpf_struct_ops_link *st_link;
+ struct bpf_struct_ops_map *st_map;
+
+ st_link = container_of(link, struct bpf_struct_ops_link, link);
+ st_map = (struct bpf_struct_ops_map *)
+ rcu_dereference_protected(st_link->map, true);
+ if (st_map) {
+ st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data, link);
+ bpf_map_put(&st_map->map);
+ }
+ kfree(st_link);
+}
+
+static void bpf_struct_ops_map_link_show_fdinfo(const struct bpf_link *link,
+ struct seq_file *seq)
+{
+ struct bpf_struct_ops_link *st_link;
+ struct bpf_map *map;
+
+ st_link = container_of(link, struct bpf_struct_ops_link, link);
+ rcu_read_lock();
+ map = rcu_dereference(st_link->map);
+ if (map)
+ seq_printf(seq, "map_id:\t%d\n", map->id);
+ rcu_read_unlock();
+}
+
+static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ struct bpf_struct_ops_link *st_link;
+ struct bpf_map *map;
+
+ st_link = container_of(link, struct bpf_struct_ops_link, link);
+ rcu_read_lock();
+ map = rcu_dereference(st_link->map);
+ if (map)
+ info->struct_ops.map_id = map->id;
+ rcu_read_unlock();
+ return 0;
+}
+
+static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map *new_map,
+ struct bpf_map *expected_old_map)
+{
+ struct bpf_struct_ops_map *st_map, *old_st_map;
+ struct bpf_map *old_map;
+ struct bpf_struct_ops_link *st_link;
+ int err;
+
+ st_link = container_of(link, struct bpf_struct_ops_link, link);
+ st_map = container_of(new_map, struct bpf_struct_ops_map, map);
+
+ if (!bpf_struct_ops_valid_to_reg(new_map))
+ return -EINVAL;
+
+ if (!st_map->st_ops_desc->st_ops->update)
+ return -EOPNOTSUPP;
+
+ mutex_lock(&update_mutex);
+
+ old_map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex));
+ if (!old_map) {
+ err = -ENOLINK;
+ goto err_out;
+ }
+ if (expected_old_map && old_map != expected_old_map) {
+ err = -EPERM;
+ goto err_out;
+ }
+
+ old_st_map = container_of(old_map, struct bpf_struct_ops_map, map);
+ /* The new and old struct_ops must be the same type. */
+ if (st_map->st_ops_desc != old_st_map->st_ops_desc) {
+ err = -EINVAL;
+ goto err_out;
+ }
+
+ err = st_map->st_ops_desc->st_ops->update(st_map->kvalue.data, old_st_map->kvalue.data, link);
+ if (err)
+ goto err_out;
+
+ bpf_map_inc(new_map);
+ rcu_assign_pointer(st_link->map, new_map);
+ bpf_map_put(old_map);
+
+err_out:
+ mutex_unlock(&update_mutex);
+
+ return err;
+}
+
+static int bpf_struct_ops_map_link_detach(struct bpf_link *link)
+{
+ struct bpf_struct_ops_link *st_link = container_of(link, struct bpf_struct_ops_link, link);
+ struct bpf_struct_ops_map *st_map;
+ struct bpf_map *map;
+
+ mutex_lock(&update_mutex);
+
+ map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex));
+ if (!map) {
+ mutex_unlock(&update_mutex);
+ return 0;
+ }
+ st_map = container_of(map, struct bpf_struct_ops_map, map);
+
+ st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data, link);
+
+ RCU_INIT_POINTER(st_link->map, NULL);
+ /* Pair with bpf_map_get() in bpf_struct_ops_link_create() or
+ * bpf_map_inc() in bpf_struct_ops_map_link_update().
+ */
+ bpf_map_put(&st_map->map);
+
+ mutex_unlock(&update_mutex);
+
+ wake_up_interruptible_poll(&st_link->wait_hup, EPOLLHUP);
+
+ return 0;
+}
+
+static __poll_t bpf_struct_ops_map_link_poll(struct file *file,
+ struct poll_table_struct *pts)
+{
+ struct bpf_struct_ops_link *st_link = file->private_data;
+
+ poll_wait(file, &st_link->wait_hup, pts);
+
+ return rcu_access_pointer(st_link->map) ? 0 : EPOLLHUP;
+}
+
+static const struct bpf_link_ops bpf_struct_ops_map_lops = {
+ .dealloc = bpf_struct_ops_map_link_dealloc,
+ .detach = bpf_struct_ops_map_link_detach,
+ .show_fdinfo = bpf_struct_ops_map_link_show_fdinfo,
+ .fill_link_info = bpf_struct_ops_map_link_fill_link_info,
+ .update_map = bpf_struct_ops_map_link_update,
+ .poll = bpf_struct_ops_map_link_poll,
+};
+
+int bpf_struct_ops_link_create(union bpf_attr *attr)
+{
+ struct bpf_struct_ops_link *link = NULL;
+ struct bpf_link_primer link_primer;
+ struct bpf_struct_ops_map *st_map;
+ struct bpf_map *map;
+ int err;
+
+ map = bpf_map_get(attr->link_create.map_fd);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ st_map = (struct bpf_struct_ops_map *)map;
+
+ if (!bpf_struct_ops_valid_to_reg(map)) {
+ err = -EINVAL;
+ goto err_out;
+ }
+
+ link = kzalloc(sizeof(*link), GFP_USER);
+ if (!link) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+ bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL,
+ attr->link_create.attach_type);
+
+ err = bpf_link_prime(&link->link, &link_primer);
+ if (err)
+ goto err_out;
+
+ init_waitqueue_head(&link->wait_hup);
+
+ /* Hold the update_mutex such that the subsystem cannot
+ * do link->ops->detach() before the link is fully initialized.
+ */
+ mutex_lock(&update_mutex);
+ err = st_map->st_ops_desc->st_ops->reg(st_map->kvalue.data, &link->link);
+ if (err) {
+ mutex_unlock(&update_mutex);
+ bpf_link_cleanup(&link_primer);
+ link = NULL;
+ goto err_out;
+ }
+ RCU_INIT_POINTER(link->map, map);
+ mutex_unlock(&update_mutex);
+
+ return bpf_link_settle(&link_primer);
+
+err_out:
+ bpf_map_put(map);
+ kfree(link);
+ return err;
+}
+
+void bpf_map_struct_ops_info_fill(struct bpf_map_info *info, struct bpf_map *map)
+{
+ struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
+
+ info->btf_vmlinux_id = btf_obj_id(st_map->btf);
+}
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
new file mode 100644
index 000000000000..a1dc1bf0848a
--- /dev/null
+++ b/kernel/bpf/bpf_task_storage.c
@@ -0,0 +1,373 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2020 Facebook
+ * Copyright 2020 Google LLC.
+ */
+
+#include <linux/pid.h>
+#include <linux/sched.h>
+#include <linux/rculist.h>
+#include <linux/list.h>
+#include <linux/hash.h>
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <linux/bpf.h>
+#include <linux/bpf_local_storage.h>
+#include <linux/filter.h>
+#include <uapi/linux/btf.h>
+#include <linux/btf_ids.h>
+#include <linux/rcupdate_trace.h>
+
+DEFINE_BPF_STORAGE_CACHE(task_cache);
+
+static DEFINE_PER_CPU(int, bpf_task_storage_busy);
+
+static void bpf_task_storage_lock(void)
+{
+ cant_migrate();
+ this_cpu_inc(bpf_task_storage_busy);
+}
+
+static void bpf_task_storage_unlock(void)
+{
+ this_cpu_dec(bpf_task_storage_busy);
+}
+
+static bool bpf_task_storage_trylock(void)
+{
+ cant_migrate();
+ if (unlikely(this_cpu_inc_return(bpf_task_storage_busy) != 1)) {
+ this_cpu_dec(bpf_task_storage_busy);
+ return false;
+ }
+ return true;
+}
+
+static struct bpf_local_storage __rcu **task_storage_ptr(void *owner)
+{
+ struct task_struct *task = owner;
+
+ return &task->bpf_storage;
+}
+
+static struct bpf_local_storage_data *
+task_storage_lookup(struct task_struct *task, struct bpf_map *map,
+ bool cacheit_lockit)
+{
+ struct bpf_local_storage *task_storage;
+ struct bpf_local_storage_map *smap;
+
+ task_storage =
+ rcu_dereference_check(task->bpf_storage, bpf_rcu_lock_held());
+ if (!task_storage)
+ return NULL;
+
+ smap = (struct bpf_local_storage_map *)map;
+ return bpf_local_storage_lookup(task_storage, smap, cacheit_lockit);
+}
+
+void bpf_task_storage_free(struct task_struct *task)
+{
+ struct bpf_local_storage *local_storage;
+
+ rcu_read_lock_dont_migrate();
+
+ local_storage = rcu_dereference(task->bpf_storage);
+ if (!local_storage)
+ goto out;
+
+ bpf_task_storage_lock();
+ bpf_local_storage_destroy(local_storage);
+ bpf_task_storage_unlock();
+out:
+ rcu_read_unlock_migrate();
+}
+
+static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_local_storage_data *sdata;
+ struct task_struct *task;
+ unsigned int f_flags;
+ struct pid *pid;
+ int fd, err;
+
+ fd = *(int *)key;
+ pid = pidfd_get_pid(fd, &f_flags);
+ if (IS_ERR(pid))
+ return ERR_CAST(pid);
+
+ /* We should be in an RCU read side critical section, it should be safe
+ * to call pid_task.
+ */
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ task = pid_task(pid, PIDTYPE_PID);
+ if (!task) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ bpf_task_storage_lock();
+ sdata = task_storage_lookup(task, map, true);
+ bpf_task_storage_unlock();
+ put_pid(pid);
+ return sdata ? sdata->data : NULL;
+out:
+ put_pid(pid);
+ return ERR_PTR(err);
+}
+
+static long bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
+{
+ struct bpf_local_storage_data *sdata;
+ struct task_struct *task;
+ unsigned int f_flags;
+ struct pid *pid;
+ int fd, err;
+
+ if ((map_flags & BPF_F_LOCK) && btf_record_has_field(map->record, BPF_UPTR))
+ return -EOPNOTSUPP;
+
+ fd = *(int *)key;
+ pid = pidfd_get_pid(fd, &f_flags);
+ if (IS_ERR(pid))
+ return PTR_ERR(pid);
+
+ /* We should be in an RCU read side critical section, it should be safe
+ * to call pid_task.
+ */
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ task = pid_task(pid, PIDTYPE_PID);
+ if (!task) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ bpf_task_storage_lock();
+ sdata = bpf_local_storage_update(
+ task, (struct bpf_local_storage_map *)map, value, map_flags,
+ true, GFP_ATOMIC);
+ bpf_task_storage_unlock();
+
+ err = PTR_ERR_OR_ZERO(sdata);
+out:
+ put_pid(pid);
+ return err;
+}
+
+static int task_storage_delete(struct task_struct *task, struct bpf_map *map,
+ bool nobusy)
+{
+ struct bpf_local_storage_data *sdata;
+
+ sdata = task_storage_lookup(task, map, false);
+ if (!sdata)
+ return -ENOENT;
+
+ if (!nobusy)
+ return -EBUSY;
+
+ bpf_selem_unlink(SELEM(sdata), false);
+
+ return 0;
+}
+
+static long bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key)
+{
+ struct task_struct *task;
+ unsigned int f_flags;
+ struct pid *pid;
+ int fd, err;
+
+ fd = *(int *)key;
+ pid = pidfd_get_pid(fd, &f_flags);
+ if (IS_ERR(pid))
+ return PTR_ERR(pid);
+
+ /* We should be in an RCU read side critical section, it should be safe
+ * to call pid_task.
+ */
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ task = pid_task(pid, PIDTYPE_PID);
+ if (!task) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ bpf_task_storage_lock();
+ err = task_storage_delete(task, map, true);
+ bpf_task_storage_unlock();
+out:
+ put_pid(pid);
+ return err;
+}
+
+/* Called by bpf_task_storage_get*() helpers */
+static void *__bpf_task_storage_get(struct bpf_map *map,
+ struct task_struct *task, void *value,
+ u64 flags, gfp_t gfp_flags, bool nobusy)
+{
+ struct bpf_local_storage_data *sdata;
+
+ sdata = task_storage_lookup(task, map, nobusy);
+ if (sdata)
+ return sdata->data;
+
+ /* only allocate new storage, when the task is refcounted */
+ if (refcount_read(&task->usage) &&
+ (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) && nobusy) {
+ sdata = bpf_local_storage_update(
+ task, (struct bpf_local_storage_map *)map, value,
+ BPF_NOEXIST, false, gfp_flags);
+ return IS_ERR(sdata) ? NULL : sdata->data;
+ }
+
+ return NULL;
+}
+
+/* *gfp_flags* is a hidden argument provided by the verifier */
+BPF_CALL_5(bpf_task_storage_get_recur, struct bpf_map *, map, struct task_struct *,
+ task, void *, value, u64, flags, gfp_t, gfp_flags)
+{
+ bool nobusy;
+ void *data;
+
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (flags & ~BPF_LOCAL_STORAGE_GET_F_CREATE || !task)
+ return (unsigned long)NULL;
+
+ nobusy = bpf_task_storage_trylock();
+ data = __bpf_task_storage_get(map, task, value, flags,
+ gfp_flags, nobusy);
+ if (nobusy)
+ bpf_task_storage_unlock();
+ return (unsigned long)data;
+}
+
+/* *gfp_flags* is a hidden argument provided by the verifier */
+BPF_CALL_5(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
+ task, void *, value, u64, flags, gfp_t, gfp_flags)
+{
+ void *data;
+
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (flags & ~BPF_LOCAL_STORAGE_GET_F_CREATE || !task)
+ return (unsigned long)NULL;
+
+ bpf_task_storage_lock();
+ data = __bpf_task_storage_get(map, task, value, flags,
+ gfp_flags, true);
+ bpf_task_storage_unlock();
+ return (unsigned long)data;
+}
+
+BPF_CALL_2(bpf_task_storage_delete_recur, struct bpf_map *, map, struct task_struct *,
+ task)
+{
+ bool nobusy;
+ int ret;
+
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (!task)
+ return -EINVAL;
+
+ nobusy = bpf_task_storage_trylock();
+ /* This helper must only be called from places where the lifetime of the task
+ * is guaranteed. Either by being refcounted or by being protected
+ * by an RCU read-side critical section.
+ */
+ ret = task_storage_delete(task, map, nobusy);
+ if (nobusy)
+ bpf_task_storage_unlock();
+ return ret;
+}
+
+BPF_CALL_2(bpf_task_storage_delete, struct bpf_map *, map, struct task_struct *,
+ task)
+{
+ int ret;
+
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (!task)
+ return -EINVAL;
+
+ bpf_task_storage_lock();
+ /* This helper must only be called from places where the lifetime of the task
+ * is guaranteed. Either by being refcounted or by being protected
+ * by an RCU read-side critical section.
+ */
+ ret = task_storage_delete(task, map, true);
+ bpf_task_storage_unlock();
+ return ret;
+}
+
+static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ return -ENOTSUPP;
+}
+
+static struct bpf_map *task_storage_map_alloc(union bpf_attr *attr)
+{
+ return bpf_local_storage_map_alloc(attr, &task_cache, true);
+}
+
+static void task_storage_map_free(struct bpf_map *map)
+{
+ bpf_local_storage_map_free(map, &task_cache, &bpf_task_storage_busy);
+}
+
+BTF_ID_LIST_GLOBAL_SINGLE(bpf_local_storage_map_btf_id, struct, bpf_local_storage_map)
+const struct bpf_map_ops task_storage_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
+ .map_alloc_check = bpf_local_storage_map_alloc_check,
+ .map_alloc = task_storage_map_alloc,
+ .map_free = task_storage_map_free,
+ .map_get_next_key = notsupp_get_next_key,
+ .map_lookup_elem = bpf_pid_task_storage_lookup_elem,
+ .map_update_elem = bpf_pid_task_storage_update_elem,
+ .map_delete_elem = bpf_pid_task_storage_delete_elem,
+ .map_check_btf = bpf_local_storage_map_check_btf,
+ .map_mem_usage = bpf_local_storage_map_mem_usage,
+ .map_btf_id = &bpf_local_storage_map_btf_id[0],
+ .map_owner_storage_ptr = task_storage_ptr,
+};
+
+const struct bpf_func_proto bpf_task_storage_get_recur_proto = {
+ .func = bpf_task_storage_get_recur,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
+ .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
+ .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg4_type = ARG_ANYTHING,
+};
+
+const struct bpf_func_proto bpf_task_storage_get_proto = {
+ .func = bpf_task_storage_get,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
+ .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
+ .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg4_type = ARG_ANYTHING,
+};
+
+const struct bpf_func_proto bpf_task_storage_delete_recur_proto = {
+ .func = bpf_task_storage_delete_recur,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
+ .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
+};
+
+const struct bpf_func_proto bpf_task_storage_delete_proto = {
+ .func = bpf_task_storage_delete,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
+ .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
+};
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
new file mode 100644
index 000000000000..0de8fc8a0e0b
--- /dev/null
+++ b/kernel/bpf/btf.c
@@ -0,0 +1,9579 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2018 Facebook */
+
+#include <uapi/linux/btf.h>
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/bpf_perf_event.h>
+#include <uapi/linux/types.h>
+#include <linux/seq_file.h>
+#include <linux/compiler.h>
+#include <linux/ctype.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/anon_inodes.h>
+#include <linux/file.h>
+#include <linux/uaccess.h>
+#include <linux/kernel.h>
+#include <linux/idr.h>
+#include <linux/sort.h>
+#include <linux/bpf_verifier.h>
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
+#include <linux/bpf.h>
+#include <linux/bpf_lsm.h>
+#include <linux/skmsg.h>
+#include <linux/perf_event.h>
+#include <linux/bsearch.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+#include <linux/overflow.h>
+
+#include <net/netfilter/nf_bpf_link.h>
+
+#include <net/sock.h>
+#include <net/xdp.h>
+#include "../tools/lib/bpf/relo_core.h"
+
+/* BTF (BPF Type Format) is the meta data format which describes
+ * the data types of BPF program/map. Hence, it basically focus
+ * on the C programming language which the modern BPF is primary
+ * using.
+ *
+ * ELF Section:
+ * ~~~~~~~~~~~
+ * The BTF data is stored under the ".BTF" ELF section
+ *
+ * struct btf_type:
+ * ~~~~~~~~~~~~~~~
+ * Each 'struct btf_type' object describes a C data type.
+ * Depending on the type it is describing, a 'struct btf_type'
+ * object may be followed by more data. F.e.
+ * To describe an array, 'struct btf_type' is followed by
+ * 'struct btf_array'.
+ *
+ * 'struct btf_type' and any extra data following it are
+ * 4 bytes aligned.
+ *
+ * Type section:
+ * ~~~~~~~~~~~~~
+ * The BTF type section contains a list of 'struct btf_type' objects.
+ * Each one describes a C type. Recall from the above section
+ * that a 'struct btf_type' object could be immediately followed by extra
+ * data in order to describe some particular C types.
+ *
+ * type_id:
+ * ~~~~~~~
+ * Each btf_type object is identified by a type_id. The type_id
+ * is implicitly implied by the location of the btf_type object in
+ * the BTF type section. The first one has type_id 1. The second
+ * one has type_id 2...etc. Hence, an earlier btf_type has
+ * a smaller type_id.
+ *
+ * A btf_type object may refer to another btf_type object by using
+ * type_id (i.e. the "type" in the "struct btf_type").
+ *
+ * NOTE that we cannot assume any reference-order.
+ * A btf_type object can refer to an earlier btf_type object
+ * but it can also refer to a later btf_type object.
+ *
+ * For example, to describe "const void *". A btf_type
+ * object describing "const" may refer to another btf_type
+ * object describing "void *". This type-reference is done
+ * by specifying type_id:
+ *
+ * [1] CONST (anon) type_id=2
+ * [2] PTR (anon) type_id=0
+ *
+ * The above is the btf_verifier debug log:
+ * - Each line started with "[?]" is a btf_type object
+ * - [?] is the type_id of the btf_type object.
+ * - CONST/PTR is the BTF_KIND_XXX
+ * - "(anon)" is the name of the type. It just
+ * happens that CONST and PTR has no name.
+ * - type_id=XXX is the 'u32 type' in btf_type
+ *
+ * NOTE: "void" has type_id 0
+ *
+ * String section:
+ * ~~~~~~~~~~~~~~
+ * The BTF string section contains the names used by the type section.
+ * Each string is referred by an "offset" from the beginning of the
+ * string section.
+ *
+ * Each string is '\0' terminated.
+ *
+ * The first character in the string section must be '\0'
+ * which is used to mean 'anonymous'. Some btf_type may not
+ * have a name.
+ */
+
+/* BTF verification:
+ *
+ * To verify BTF data, two passes are needed.
+ *
+ * Pass #1
+ * ~~~~~~~
+ * The first pass is to collect all btf_type objects to
+ * an array: "btf->types".
+ *
+ * Depending on the C type that a btf_type is describing,
+ * a btf_type may be followed by extra data. We don't know
+ * how many btf_type is there, and more importantly we don't
+ * know where each btf_type is located in the type section.
+ *
+ * Without knowing the location of each type_id, most verifications
+ * cannot be done. e.g. an earlier btf_type may refer to a later
+ * btf_type (recall the "const void *" above), so we cannot
+ * check this type-reference in the first pass.
+ *
+ * In the first pass, it still does some verifications (e.g.
+ * checking the name is a valid offset to the string section).
+ *
+ * Pass #2
+ * ~~~~~~~
+ * The main focus is to resolve a btf_type that is referring
+ * to another type.
+ *
+ * We have to ensure the referring type:
+ * 1) does exist in the BTF (i.e. in btf->types[])
+ * 2) does not cause a loop:
+ * struct A {
+ * struct B b;
+ * };
+ *
+ * struct B {
+ * struct A a;
+ * };
+ *
+ * btf_type_needs_resolve() decides if a btf_type needs
+ * to be resolved.
+ *
+ * The needs_resolve type implements the "resolve()" ops which
+ * essentially does a DFS and detects backedge.
+ *
+ * During resolve (or DFS), different C types have different
+ * "RESOLVED" conditions.
+ *
+ * When resolving a BTF_KIND_STRUCT, we need to resolve all its
+ * members because a member is always referring to another
+ * type. A struct's member can be treated as "RESOLVED" if
+ * it is referring to a BTF_KIND_PTR. Otherwise, the
+ * following valid C struct would be rejected:
+ *
+ * struct A {
+ * int m;
+ * struct A *a;
+ * };
+ *
+ * When resolving a BTF_KIND_PTR, it needs to keep resolving if
+ * it is referring to another BTF_KIND_PTR. Otherwise, we cannot
+ * detect a pointer loop, e.g.:
+ * BTF_KIND_CONST -> BTF_KIND_PTR -> BTF_KIND_CONST -> BTF_KIND_PTR +
+ * ^ |
+ * +-----------------------------------------+
+ *
+ */
+
+#define BITS_PER_U128 (sizeof(u64) * BITS_PER_BYTE * 2)
+#define BITS_PER_BYTE_MASK (BITS_PER_BYTE - 1)
+#define BITS_PER_BYTE_MASKED(bits) ((bits) & BITS_PER_BYTE_MASK)
+#define BITS_ROUNDDOWN_BYTES(bits) ((bits) >> 3)
+#define BITS_ROUNDUP_BYTES(bits) \
+ (BITS_ROUNDDOWN_BYTES(bits) + !!BITS_PER_BYTE_MASKED(bits))
+
+#define BTF_INFO_MASK 0x9f00ffff
+#define BTF_INT_MASK 0x0fffffff
+#define BTF_TYPE_ID_VALID(type_id) ((type_id) <= BTF_MAX_TYPE)
+#define BTF_STR_OFFSET_VALID(name_off) ((name_off) <= BTF_MAX_NAME_OFFSET)
+
+/* 16MB for 64k structs and each has 16 members and
+ * a few MB spaces for the string section.
+ * The hard limit is S32_MAX.
+ */
+#define BTF_MAX_SIZE (16 * 1024 * 1024)
+
+#define for_each_member_from(i, from, struct_type, member) \
+ for (i = from, member = btf_type_member(struct_type) + from; \
+ i < btf_type_vlen(struct_type); \
+ i++, member++)
+
+#define for_each_vsi_from(i, from, struct_type, member) \
+ for (i = from, member = btf_type_var_secinfo(struct_type) + from; \
+ i < btf_type_vlen(struct_type); \
+ i++, member++)
+
+DEFINE_IDR(btf_idr);
+DEFINE_SPINLOCK(btf_idr_lock);
+
+enum btf_kfunc_hook {
+ BTF_KFUNC_HOOK_COMMON,
+ BTF_KFUNC_HOOK_XDP,
+ BTF_KFUNC_HOOK_TC,
+ BTF_KFUNC_HOOK_STRUCT_OPS,
+ BTF_KFUNC_HOOK_TRACING,
+ BTF_KFUNC_HOOK_SYSCALL,
+ BTF_KFUNC_HOOK_FMODRET,
+ BTF_KFUNC_HOOK_CGROUP,
+ BTF_KFUNC_HOOK_SCHED_ACT,
+ BTF_KFUNC_HOOK_SK_SKB,
+ BTF_KFUNC_HOOK_SOCKET_FILTER,
+ BTF_KFUNC_HOOK_LWT,
+ BTF_KFUNC_HOOK_NETFILTER,
+ BTF_KFUNC_HOOK_KPROBE,
+ BTF_KFUNC_HOOK_MAX,
+};
+
+enum {
+ BTF_KFUNC_SET_MAX_CNT = 256,
+ BTF_DTOR_KFUNC_MAX_CNT = 256,
+ BTF_KFUNC_FILTER_MAX_CNT = 16,
+};
+
+struct btf_kfunc_hook_filter {
+ btf_kfunc_filter_t filters[BTF_KFUNC_FILTER_MAX_CNT];
+ u32 nr_filters;
+};
+
+struct btf_kfunc_set_tab {
+ struct btf_id_set8 *sets[BTF_KFUNC_HOOK_MAX];
+ struct btf_kfunc_hook_filter hook_filters[BTF_KFUNC_HOOK_MAX];
+};
+
+struct btf_id_dtor_kfunc_tab {
+ u32 cnt;
+ struct btf_id_dtor_kfunc dtors[];
+};
+
+struct btf_struct_ops_tab {
+ u32 cnt;
+ u32 capacity;
+ struct bpf_struct_ops_desc ops[];
+};
+
+struct btf {
+ void *data;
+ struct btf_type **types;
+ u32 *resolved_ids;
+ u32 *resolved_sizes;
+ const char *strings;
+ void *nohdr_data;
+ struct btf_header hdr;
+ u32 nr_types; /* includes VOID for base BTF */
+ u32 types_size;
+ u32 data_size;
+ refcount_t refcnt;
+ u32 id;
+ struct rcu_head rcu;
+ struct btf_kfunc_set_tab *kfunc_set_tab;
+ struct btf_id_dtor_kfunc_tab *dtor_kfunc_tab;
+ struct btf_struct_metas *struct_meta_tab;
+ struct btf_struct_ops_tab *struct_ops_tab;
+
+ /* split BTF support */
+ struct btf *base_btf;
+ u32 start_id; /* first type ID in this BTF (0 for base BTF) */
+ u32 start_str_off; /* first string offset (0 for base BTF) */
+ char name[MODULE_NAME_LEN];
+ bool kernel_btf;
+ __u32 *base_id_map; /* map from distilled base BTF -> vmlinux BTF ids */
+};
+
+enum verifier_phase {
+ CHECK_META,
+ CHECK_TYPE,
+};
+
+struct resolve_vertex {
+ const struct btf_type *t;
+ u32 type_id;
+ u16 next_member;
+};
+
+enum visit_state {
+ NOT_VISITED,
+ VISITED,
+ RESOLVED,
+};
+
+enum resolve_mode {
+ RESOLVE_TBD, /* To Be Determined */
+ RESOLVE_PTR, /* Resolving for Pointer */
+ RESOLVE_STRUCT_OR_ARRAY, /* Resolving for struct/union
+ * or array
+ */
+};
+
+#define MAX_RESOLVE_DEPTH 32
+
+struct btf_sec_info {
+ u32 off;
+ u32 len;
+};
+
+struct btf_verifier_env {
+ struct btf *btf;
+ u8 *visit_states;
+ struct resolve_vertex stack[MAX_RESOLVE_DEPTH];
+ struct bpf_verifier_log log;
+ u32 log_type_id;
+ u32 top_stack;
+ enum verifier_phase phase;
+ enum resolve_mode resolve_mode;
+};
+
+static const char * const btf_kind_str[NR_BTF_KINDS] = {
+ [BTF_KIND_UNKN] = "UNKNOWN",
+ [BTF_KIND_INT] = "INT",
+ [BTF_KIND_PTR] = "PTR",
+ [BTF_KIND_ARRAY] = "ARRAY",
+ [BTF_KIND_STRUCT] = "STRUCT",
+ [BTF_KIND_UNION] = "UNION",
+ [BTF_KIND_ENUM] = "ENUM",
+ [BTF_KIND_FWD] = "FWD",
+ [BTF_KIND_TYPEDEF] = "TYPEDEF",
+ [BTF_KIND_VOLATILE] = "VOLATILE",
+ [BTF_KIND_CONST] = "CONST",
+ [BTF_KIND_RESTRICT] = "RESTRICT",
+ [BTF_KIND_FUNC] = "FUNC",
+ [BTF_KIND_FUNC_PROTO] = "FUNC_PROTO",
+ [BTF_KIND_VAR] = "VAR",
+ [BTF_KIND_DATASEC] = "DATASEC",
+ [BTF_KIND_FLOAT] = "FLOAT",
+ [BTF_KIND_DECL_TAG] = "DECL_TAG",
+ [BTF_KIND_TYPE_TAG] = "TYPE_TAG",
+ [BTF_KIND_ENUM64] = "ENUM64",
+};
+
+const char *btf_type_str(const struct btf_type *t)
+{
+ return btf_kind_str[BTF_INFO_KIND(t->info)];
+}
+
+/* Chunk size we use in safe copy of data to be shown. */
+#define BTF_SHOW_OBJ_SAFE_SIZE 32
+
+/*
+ * This is the maximum size of a base type value (equivalent to a
+ * 128-bit int); if we are at the end of our safe buffer and have
+ * less than 16 bytes space we can't be assured of being able
+ * to copy the next type safely, so in such cases we will initiate
+ * a new copy.
+ */
+#define BTF_SHOW_OBJ_BASE_TYPE_SIZE 16
+
+/* Type name size */
+#define BTF_SHOW_NAME_SIZE 80
+
+/*
+ * The suffix of a type that indicates it cannot alias another type when
+ * comparing BTF IDs for kfunc invocations.
+ */
+#define NOCAST_ALIAS_SUFFIX "___init"
+
+/*
+ * Common data to all BTF show operations. Private show functions can add
+ * their own data to a structure containing a struct btf_show and consult it
+ * in the show callback. See btf_type_show() below.
+ *
+ * One challenge with showing nested data is we want to skip 0-valued
+ * data, but in order to figure out whether a nested object is all zeros
+ * we need to walk through it. As a result, we need to make two passes
+ * when handling structs, unions and arrays; the first path simply looks
+ * for nonzero data, while the second actually does the display. The first
+ * pass is signalled by show->state.depth_check being set, and if we
+ * encounter a non-zero value we set show->state.depth_to_show to
+ * the depth at which we encountered it. When we have completed the
+ * first pass, we will know if anything needs to be displayed if
+ * depth_to_show > depth. See btf_[struct,array]_show() for the
+ * implementation of this.
+ *
+ * Another problem is we want to ensure the data for display is safe to
+ * access. To support this, the anonymous "struct {} obj" tracks the data
+ * object and our safe copy of it. We copy portions of the data needed
+ * to the object "copy" buffer, but because its size is limited to
+ * BTF_SHOW_OBJ_COPY_LEN bytes, multiple copies may be required as we
+ * traverse larger objects for display.
+ *
+ * The various data type show functions all start with a call to
+ * btf_show_start_type() which returns a pointer to the safe copy
+ * of the data needed (or if BTF_SHOW_UNSAFE is specified, to the
+ * raw data itself). btf_show_obj_safe() is responsible for
+ * using copy_from_kernel_nofault() to update the safe data if necessary
+ * as we traverse the object's data. skbuff-like semantics are
+ * used:
+ *
+ * - obj.head points to the start of the toplevel object for display
+ * - obj.size is the size of the toplevel object
+ * - obj.data points to the current point in the original data at
+ * which our safe data starts. obj.data will advance as we copy
+ * portions of the data.
+ *
+ * In most cases a single copy will suffice, but larger data structures
+ * such as "struct task_struct" will require many copies. The logic in
+ * btf_show_obj_safe() handles the logic that determines if a new
+ * copy_from_kernel_nofault() is needed.
+ */
+struct btf_show {
+ u64 flags;
+ void *target; /* target of show operation (seq file, buffer) */
+ __printf(2, 0) void (*showfn)(struct btf_show *show, const char *fmt, va_list args);
+ const struct btf *btf;
+ /* below are used during iteration */
+ struct {
+ u8 depth;
+ u8 depth_to_show;
+ u8 depth_check;
+ u8 array_member:1,
+ array_terminated:1;
+ u16 array_encoding;
+ u32 type_id;
+ int status; /* non-zero for error */
+ const struct btf_type *type;
+ const struct btf_member *member;
+ char name[BTF_SHOW_NAME_SIZE]; /* space for member name/type */
+ } state;
+ struct {
+ u32 size;
+ void *head;
+ void *data;
+ u8 safe[BTF_SHOW_OBJ_SAFE_SIZE];
+ } obj;
+};
+
+struct btf_kind_operations {
+ s32 (*check_meta)(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left);
+ int (*resolve)(struct btf_verifier_env *env,
+ const struct resolve_vertex *v);
+ int (*check_member)(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const struct btf_type *member_type);
+ int (*check_kflag_member)(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const struct btf_type *member_type);
+ void (*log_details)(struct btf_verifier_env *env,
+ const struct btf_type *t);
+ void (*show)(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offsets,
+ struct btf_show *show);
+};
+
+static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS];
+static struct btf_type btf_void;
+
+static int btf_resolve(struct btf_verifier_env *env,
+ const struct btf_type *t, u32 type_id);
+
+static int btf_func_check(struct btf_verifier_env *env,
+ const struct btf_type *t);
+
+static bool btf_type_is_modifier(const struct btf_type *t)
+{
+ /* Some of them is not strictly a C modifier
+ * but they are grouped into the same bucket
+ * for BTF concern:
+ * A type (t) that refers to another
+ * type through t->type AND its size cannot
+ * be determined without following the t->type.
+ *
+ * ptr does not fall into this bucket
+ * because its size is always sizeof(void *).
+ */
+ switch (BTF_INFO_KIND(t->info)) {
+ case BTF_KIND_TYPEDEF:
+ case BTF_KIND_VOLATILE:
+ case BTF_KIND_CONST:
+ case BTF_KIND_RESTRICT:
+ case BTF_KIND_TYPE_TAG:
+ return true;
+ }
+
+ return false;
+}
+
+bool btf_type_is_void(const struct btf_type *t)
+{
+ return t == &btf_void;
+}
+
+static bool btf_type_is_datasec(const struct btf_type *t)
+{
+ return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC;
+}
+
+static bool btf_type_is_decl_tag(const struct btf_type *t)
+{
+ return BTF_INFO_KIND(t->info) == BTF_KIND_DECL_TAG;
+}
+
+static bool btf_type_nosize(const struct btf_type *t)
+{
+ return btf_type_is_void(t) || btf_type_is_fwd(t) ||
+ btf_type_is_func(t) || btf_type_is_func_proto(t) ||
+ btf_type_is_decl_tag(t);
+}
+
+static bool btf_type_nosize_or_null(const struct btf_type *t)
+{
+ return !t || btf_type_nosize(t);
+}
+
+static bool btf_type_is_decl_tag_target(const struct btf_type *t)
+{
+ return btf_type_is_func(t) || btf_type_is_struct(t) ||
+ btf_type_is_var(t) || btf_type_is_typedef(t);
+}
+
+bool btf_is_vmlinux(const struct btf *btf)
+{
+ return btf->kernel_btf && !btf->base_btf;
+}
+
+u32 btf_nr_types(const struct btf *btf)
+{
+ u32 total = 0;
+
+ while (btf) {
+ total += btf->nr_types;
+ btf = btf->base_btf;
+ }
+
+ return total;
+}
+
+s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind)
+{
+ const struct btf_type *t;
+ const char *tname;
+ u32 i, total;
+
+ total = btf_nr_types(btf);
+ for (i = 1; i < total; i++) {
+ t = btf_type_by_id(btf, i);
+ if (BTF_INFO_KIND(t->info) != kind)
+ continue;
+
+ tname = btf_name_by_offset(btf, t->name_off);
+ if (!strcmp(tname, name))
+ return i;
+ }
+
+ return -ENOENT;
+}
+
+s32 bpf_find_btf_id(const char *name, u32 kind, struct btf **btf_p)
+{
+ struct btf *btf;
+ s32 ret;
+ int id;
+
+ btf = bpf_get_btf_vmlinux();
+ if (IS_ERR(btf))
+ return PTR_ERR(btf);
+ if (!btf)
+ return -EINVAL;
+
+ ret = btf_find_by_name_kind(btf, name, kind);
+ /* ret is never zero, since btf_find_by_name_kind returns
+ * positive btf_id or negative error.
+ */
+ if (ret > 0) {
+ btf_get(btf);
+ *btf_p = btf;
+ return ret;
+ }
+
+ /* If name is not found in vmlinux's BTF then search in module's BTFs */
+ spin_lock_bh(&btf_idr_lock);
+ idr_for_each_entry(&btf_idr, btf, id) {
+ if (!btf_is_module(btf))
+ continue;
+ /* linear search could be slow hence unlock/lock
+ * the IDR to avoiding holding it for too long
+ */
+ btf_get(btf);
+ spin_unlock_bh(&btf_idr_lock);
+ ret = btf_find_by_name_kind(btf, name, kind);
+ if (ret > 0) {
+ *btf_p = btf;
+ return ret;
+ }
+ btf_put(btf);
+ spin_lock_bh(&btf_idr_lock);
+ }
+ spin_unlock_bh(&btf_idr_lock);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(bpf_find_btf_id);
+
+const struct btf_type *btf_type_skip_modifiers(const struct btf *btf,
+ u32 id, u32 *res_id)
+{
+ const struct btf_type *t = btf_type_by_id(btf, id);
+
+ while (btf_type_is_modifier(t)) {
+ id = t->type;
+ t = btf_type_by_id(btf, t->type);
+ }
+
+ if (res_id)
+ *res_id = id;
+
+ return t;
+}
+
+const struct btf_type *btf_type_resolve_ptr(const struct btf *btf,
+ u32 id, u32 *res_id)
+{
+ const struct btf_type *t;
+
+ t = btf_type_skip_modifiers(btf, id, NULL);
+ if (!btf_type_is_ptr(t))
+ return NULL;
+
+ return btf_type_skip_modifiers(btf, t->type, res_id);
+}
+
+const struct btf_type *btf_type_resolve_func_ptr(const struct btf *btf,
+ u32 id, u32 *res_id)
+{
+ const struct btf_type *ptype;
+
+ ptype = btf_type_resolve_ptr(btf, id, res_id);
+ if (ptype && btf_type_is_func_proto(ptype))
+ return ptype;
+
+ return NULL;
+}
+
+/* Types that act only as a source, not sink or intermediate
+ * type when resolving.
+ */
+static bool btf_type_is_resolve_source_only(const struct btf_type *t)
+{
+ return btf_type_is_var(t) ||
+ btf_type_is_decl_tag(t) ||
+ btf_type_is_datasec(t);
+}
+
+/* What types need to be resolved?
+ *
+ * btf_type_is_modifier() is an obvious one.
+ *
+ * btf_type_is_struct() because its member refers to
+ * another type (through member->type).
+ *
+ * btf_type_is_var() because the variable refers to
+ * another type. btf_type_is_datasec() holds multiple
+ * btf_type_is_var() types that need resolving.
+ *
+ * btf_type_is_array() because its element (array->type)
+ * refers to another type. Array can be thought of a
+ * special case of struct while array just has the same
+ * member-type repeated by array->nelems of times.
+ */
+static bool btf_type_needs_resolve(const struct btf_type *t)
+{
+ return btf_type_is_modifier(t) ||
+ btf_type_is_ptr(t) ||
+ btf_type_is_struct(t) ||
+ btf_type_is_array(t) ||
+ btf_type_is_var(t) ||
+ btf_type_is_func(t) ||
+ btf_type_is_decl_tag(t) ||
+ btf_type_is_datasec(t);
+}
+
+/* t->size can be used */
+static bool btf_type_has_size(const struct btf_type *t)
+{
+ switch (BTF_INFO_KIND(t->info)) {
+ case BTF_KIND_INT:
+ case BTF_KIND_STRUCT:
+ case BTF_KIND_UNION:
+ case BTF_KIND_ENUM:
+ case BTF_KIND_DATASEC:
+ case BTF_KIND_FLOAT:
+ case BTF_KIND_ENUM64:
+ return true;
+ }
+
+ return false;
+}
+
+static const char *btf_int_encoding_str(u8 encoding)
+{
+ if (encoding == 0)
+ return "(none)";
+ else if (encoding == BTF_INT_SIGNED)
+ return "SIGNED";
+ else if (encoding == BTF_INT_CHAR)
+ return "CHAR";
+ else if (encoding == BTF_INT_BOOL)
+ return "BOOL";
+ else
+ return "UNKN";
+}
+
+static u32 btf_type_int(const struct btf_type *t)
+{
+ return *(u32 *)(t + 1);
+}
+
+static const struct btf_array *btf_type_array(const struct btf_type *t)
+{
+ return (const struct btf_array *)(t + 1);
+}
+
+static const struct btf_enum *btf_type_enum(const struct btf_type *t)
+{
+ return (const struct btf_enum *)(t + 1);
+}
+
+static const struct btf_var *btf_type_var(const struct btf_type *t)
+{
+ return (const struct btf_var *)(t + 1);
+}
+
+static const struct btf_decl_tag *btf_type_decl_tag(const struct btf_type *t)
+{
+ return (const struct btf_decl_tag *)(t + 1);
+}
+
+static const struct btf_enum64 *btf_type_enum64(const struct btf_type *t)
+{
+ return (const struct btf_enum64 *)(t + 1);
+}
+
+static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t)
+{
+ return kind_ops[BTF_INFO_KIND(t->info)];
+}
+
+static bool btf_name_offset_valid(const struct btf *btf, u32 offset)
+{
+ if (!BTF_STR_OFFSET_VALID(offset))
+ return false;
+
+ while (offset < btf->start_str_off)
+ btf = btf->base_btf;
+
+ offset -= btf->start_str_off;
+ return offset < btf->hdr.str_len;
+}
+
+static bool __btf_name_char_ok(char c, bool first)
+{
+ if ((first ? !isalpha(c) :
+ !isalnum(c)) &&
+ c != '_' &&
+ c != '.')
+ return false;
+ return true;
+}
+
+const char *btf_str_by_offset(const struct btf *btf, u32 offset)
+{
+ while (offset < btf->start_str_off)
+ btf = btf->base_btf;
+
+ offset -= btf->start_str_off;
+ if (offset < btf->hdr.str_len)
+ return &btf->strings[offset];
+
+ return NULL;
+}
+
+static bool btf_name_valid_identifier(const struct btf *btf, u32 offset)
+{
+ /* offset must be valid */
+ const char *src = btf_str_by_offset(btf, offset);
+ const char *src_limit;
+
+ if (!__btf_name_char_ok(*src, true))
+ return false;
+
+ /* set a limit on identifier length */
+ src_limit = src + KSYM_NAME_LEN;
+ src++;
+ while (*src && src < src_limit) {
+ if (!__btf_name_char_ok(*src, false))
+ return false;
+ src++;
+ }
+
+ return !*src;
+}
+
+/* Allow any printable character in DATASEC names */
+static bool btf_name_valid_section(const struct btf *btf, u32 offset)
+{
+ /* offset must be valid */
+ const char *src = btf_str_by_offset(btf, offset);
+ const char *src_limit;
+
+ if (!*src)
+ return false;
+
+ /* set a limit on identifier length */
+ src_limit = src + KSYM_NAME_LEN;
+ while (*src && src < src_limit) {
+ if (!isprint(*src))
+ return false;
+ src++;
+ }
+
+ return !*src;
+}
+
+static const char *__btf_name_by_offset(const struct btf *btf, u32 offset)
+{
+ const char *name;
+
+ if (!offset)
+ return "(anon)";
+
+ name = btf_str_by_offset(btf, offset);
+ return name ?: "(invalid-name-offset)";
+}
+
+const char *btf_name_by_offset(const struct btf *btf, u32 offset)
+{
+ return btf_str_by_offset(btf, offset);
+}
+
+const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id)
+{
+ while (type_id < btf->start_id)
+ btf = btf->base_btf;
+
+ type_id -= btf->start_id;
+ if (type_id >= btf->nr_types)
+ return NULL;
+ return btf->types[type_id];
+}
+EXPORT_SYMBOL_GPL(btf_type_by_id);
+
+/*
+ * Check that the type @t is a regular int. This means that @t is not
+ * a bit field and it has the same size as either of u8/u16/u32/u64
+ * or __int128. If @expected_size is not zero, then size of @t should
+ * be the same. A caller should already have checked that the type @t
+ * is an integer.
+ */
+static bool __btf_type_int_is_regular(const struct btf_type *t, size_t expected_size)
+{
+ u32 int_data = btf_type_int(t);
+ u8 nr_bits = BTF_INT_BITS(int_data);
+ u8 nr_bytes = BITS_ROUNDUP_BYTES(nr_bits);
+
+ return BITS_PER_BYTE_MASKED(nr_bits) == 0 &&
+ BTF_INT_OFFSET(int_data) == 0 &&
+ (nr_bytes <= 16 && is_power_of_2(nr_bytes)) &&
+ (expected_size == 0 || nr_bytes == expected_size);
+}
+
+static bool btf_type_int_is_regular(const struct btf_type *t)
+{
+ return __btf_type_int_is_regular(t, 0);
+}
+
+bool btf_type_is_i32(const struct btf_type *t)
+{
+ return btf_type_is_int(t) && __btf_type_int_is_regular(t, 4);
+}
+
+bool btf_type_is_i64(const struct btf_type *t)
+{
+ return btf_type_is_int(t) && __btf_type_int_is_regular(t, 8);
+}
+
+bool btf_type_is_primitive(const struct btf_type *t)
+{
+ return (btf_type_is_int(t) && btf_type_int_is_regular(t)) ||
+ btf_is_any_enum(t);
+}
+
+/*
+ * Check that given struct member is a regular int with expected
+ * offset and size.
+ */
+bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s,
+ const struct btf_member *m,
+ u32 expected_offset, u32 expected_size)
+{
+ const struct btf_type *t;
+ u32 id, int_data;
+ u8 nr_bits;
+
+ id = m->type;
+ t = btf_type_id_size(btf, &id, NULL);
+ if (!t || !btf_type_is_int(t))
+ return false;
+
+ int_data = btf_type_int(t);
+ nr_bits = BTF_INT_BITS(int_data);
+ if (btf_type_kflag(s)) {
+ u32 bitfield_size = BTF_MEMBER_BITFIELD_SIZE(m->offset);
+ u32 bit_offset = BTF_MEMBER_BIT_OFFSET(m->offset);
+
+ /* if kflag set, int should be a regular int and
+ * bit offset should be at byte boundary.
+ */
+ return !bitfield_size &&
+ BITS_ROUNDUP_BYTES(bit_offset) == expected_offset &&
+ BITS_ROUNDUP_BYTES(nr_bits) == expected_size;
+ }
+
+ if (BTF_INT_OFFSET(int_data) ||
+ BITS_PER_BYTE_MASKED(m->offset) ||
+ BITS_ROUNDUP_BYTES(m->offset) != expected_offset ||
+ BITS_PER_BYTE_MASKED(nr_bits) ||
+ BITS_ROUNDUP_BYTES(nr_bits) != expected_size)
+ return false;
+
+ return true;
+}
+
+/* Similar to btf_type_skip_modifiers() but does not skip typedefs. */
+static const struct btf_type *btf_type_skip_qualifiers(const struct btf *btf,
+ u32 id)
+{
+ const struct btf_type *t = btf_type_by_id(btf, id);
+
+ while (btf_type_is_modifier(t) &&
+ BTF_INFO_KIND(t->info) != BTF_KIND_TYPEDEF) {
+ t = btf_type_by_id(btf, t->type);
+ }
+
+ return t;
+}
+
+#define BTF_SHOW_MAX_ITER 10
+
+#define BTF_KIND_BIT(kind) (1ULL << kind)
+
+/*
+ * Populate show->state.name with type name information.
+ * Format of type name is
+ *
+ * [.member_name = ] (type_name)
+ */
+static const char *btf_show_name(struct btf_show *show)
+{
+ /* BTF_MAX_ITER array suffixes "[]" */
+ const char *array_suffixes = "[][][][][][][][][][]";
+ const char *array_suffix = &array_suffixes[strlen(array_suffixes)];
+ /* BTF_MAX_ITER pointer suffixes "*" */
+ const char *ptr_suffixes = "**********";
+ const char *ptr_suffix = &ptr_suffixes[strlen(ptr_suffixes)];
+ const char *name = NULL, *prefix = "", *parens = "";
+ const struct btf_member *m = show->state.member;
+ const struct btf_type *t;
+ const struct btf_array *array;
+ u32 id = show->state.type_id;
+ const char *member = NULL;
+ bool show_member = false;
+ u64 kinds = 0;
+ int i;
+
+ show->state.name[0] = '\0';
+
+ /*
+ * Don't show type name if we're showing an array member;
+ * in that case we show the array type so don't need to repeat
+ * ourselves for each member.
+ */
+ if (show->state.array_member)
+ return "";
+
+ /* Retrieve member name, if any. */
+ if (m) {
+ member = btf_name_by_offset(show->btf, m->name_off);
+ show_member = strlen(member) > 0;
+ id = m->type;
+ }
+
+ /*
+ * Start with type_id, as we have resolved the struct btf_type *
+ * via btf_modifier_show() past the parent typedef to the child
+ * struct, int etc it is defined as. In such cases, the type_id
+ * still represents the starting type while the struct btf_type *
+ * in our show->state points at the resolved type of the typedef.
+ */
+ t = btf_type_by_id(show->btf, id);
+ if (!t)
+ return "";
+
+ /*
+ * The goal here is to build up the right number of pointer and
+ * array suffixes while ensuring the type name for a typedef
+ * is represented. Along the way we accumulate a list of
+ * BTF kinds we have encountered, since these will inform later
+ * display; for example, pointer types will not require an
+ * opening "{" for struct, we will just display the pointer value.
+ *
+ * We also want to accumulate the right number of pointer or array
+ * indices in the format string while iterating until we get to
+ * the typedef/pointee/array member target type.
+ *
+ * We start by pointing at the end of pointer and array suffix
+ * strings; as we accumulate pointers and arrays we move the pointer
+ * or array string backwards so it will show the expected number of
+ * '*' or '[]' for the type. BTF_SHOW_MAX_ITER of nesting of pointers
+ * and/or arrays and typedefs are supported as a precaution.
+ *
+ * We also want to get typedef name while proceeding to resolve
+ * type it points to so that we can add parentheses if it is a
+ * "typedef struct" etc.
+ */
+ for (i = 0; i < BTF_SHOW_MAX_ITER; i++) {
+
+ switch (BTF_INFO_KIND(t->info)) {
+ case BTF_KIND_TYPEDEF:
+ if (!name)
+ name = btf_name_by_offset(show->btf,
+ t->name_off);
+ kinds |= BTF_KIND_BIT(BTF_KIND_TYPEDEF);
+ id = t->type;
+ break;
+ case BTF_KIND_ARRAY:
+ kinds |= BTF_KIND_BIT(BTF_KIND_ARRAY);
+ parens = "[";
+ if (!t)
+ return "";
+ array = btf_type_array(t);
+ if (array_suffix > array_suffixes)
+ array_suffix -= 2;
+ id = array->type;
+ break;
+ case BTF_KIND_PTR:
+ kinds |= BTF_KIND_BIT(BTF_KIND_PTR);
+ if (ptr_suffix > ptr_suffixes)
+ ptr_suffix -= 1;
+ id = t->type;
+ break;
+ default:
+ id = 0;
+ break;
+ }
+ if (!id)
+ break;
+ t = btf_type_skip_qualifiers(show->btf, id);
+ }
+ /* We may not be able to represent this type; bail to be safe */
+ if (i == BTF_SHOW_MAX_ITER)
+ return "";
+
+ if (!name)
+ name = btf_name_by_offset(show->btf, t->name_off);
+
+ switch (BTF_INFO_KIND(t->info)) {
+ case BTF_KIND_STRUCT:
+ case BTF_KIND_UNION:
+ prefix = BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT ?
+ "struct" : "union";
+ /* if it's an array of struct/union, parens is already set */
+ if (!(kinds & (BTF_KIND_BIT(BTF_KIND_ARRAY))))
+ parens = "{";
+ break;
+ case BTF_KIND_ENUM:
+ case BTF_KIND_ENUM64:
+ prefix = "enum";
+ break;
+ default:
+ break;
+ }
+
+ /* pointer does not require parens */
+ if (kinds & BTF_KIND_BIT(BTF_KIND_PTR))
+ parens = "";
+ /* typedef does not require struct/union/enum prefix */
+ if (kinds & BTF_KIND_BIT(BTF_KIND_TYPEDEF))
+ prefix = "";
+
+ if (!name)
+ name = "";
+
+ /* Even if we don't want type name info, we want parentheses etc */
+ if (show->flags & BTF_SHOW_NONAME)
+ snprintf(show->state.name, sizeof(show->state.name), "%s",
+ parens);
+ else
+ snprintf(show->state.name, sizeof(show->state.name),
+ "%s%s%s(%s%s%s%s%s%s)%s",
+ /* first 3 strings comprise ".member = " */
+ show_member ? "." : "",
+ show_member ? member : "",
+ show_member ? " = " : "",
+ /* ...next is our prefix (struct, enum, etc) */
+ prefix,
+ strlen(prefix) > 0 && strlen(name) > 0 ? " " : "",
+ /* ...this is the type name itself */
+ name,
+ /* ...suffixed by the appropriate '*', '[]' suffixes */
+ strlen(ptr_suffix) > 0 ? " " : "", ptr_suffix,
+ array_suffix, parens);
+
+ return show->state.name;
+}
+
+static const char *__btf_show_indent(struct btf_show *show)
+{
+ const char *indents = " ";
+ const char *indent = &indents[strlen(indents)];
+
+ if ((indent - show->state.depth) >= indents)
+ return indent - show->state.depth;
+ return indents;
+}
+
+static const char *btf_show_indent(struct btf_show *show)
+{
+ return show->flags & BTF_SHOW_COMPACT ? "" : __btf_show_indent(show);
+}
+
+static const char *btf_show_newline(struct btf_show *show)
+{
+ return show->flags & BTF_SHOW_COMPACT ? "" : "\n";
+}
+
+static const char *btf_show_delim(struct btf_show *show)
+{
+ if (show->state.depth == 0)
+ return "";
+
+ if ((show->flags & BTF_SHOW_COMPACT) && show->state.type &&
+ BTF_INFO_KIND(show->state.type->info) == BTF_KIND_UNION)
+ return "|";
+
+ return ",";
+}
+
+__printf(2, 3) static void btf_show(struct btf_show *show, const char *fmt, ...)
+{
+ va_list args;
+
+ if (!show->state.depth_check) {
+ va_start(args, fmt);
+ show->showfn(show, fmt, args);
+ va_end(args);
+ }
+}
+
+/* Macros are used here as btf_show_type_value[s]() prepends and appends
+ * format specifiers to the format specifier passed in; these do the work of
+ * adding indentation, delimiters etc while the caller simply has to specify
+ * the type value(s) in the format specifier + value(s).
+ */
+#define btf_show_type_value(show, fmt, value) \
+ do { \
+ if ((value) != (__typeof__(value))0 || \
+ (show->flags & BTF_SHOW_ZERO) || \
+ show->state.depth == 0) { \
+ btf_show(show, "%s%s" fmt "%s%s", \
+ btf_show_indent(show), \
+ btf_show_name(show), \
+ value, btf_show_delim(show), \
+ btf_show_newline(show)); \
+ if (show->state.depth > show->state.depth_to_show) \
+ show->state.depth_to_show = show->state.depth; \
+ } \
+ } while (0)
+
+#define btf_show_type_values(show, fmt, ...) \
+ do { \
+ btf_show(show, "%s%s" fmt "%s%s", btf_show_indent(show), \
+ btf_show_name(show), \
+ __VA_ARGS__, btf_show_delim(show), \
+ btf_show_newline(show)); \
+ if (show->state.depth > show->state.depth_to_show) \
+ show->state.depth_to_show = show->state.depth; \
+ } while (0)
+
+/* How much is left to copy to safe buffer after @data? */
+static int btf_show_obj_size_left(struct btf_show *show, void *data)
+{
+ return show->obj.head + show->obj.size - data;
+}
+
+/* Is object pointed to by @data of @size already copied to our safe buffer? */
+static bool btf_show_obj_is_safe(struct btf_show *show, void *data, int size)
+{
+ return data >= show->obj.data &&
+ (data + size) < (show->obj.data + BTF_SHOW_OBJ_SAFE_SIZE);
+}
+
+/*
+ * If object pointed to by @data of @size falls within our safe buffer, return
+ * the equivalent pointer to the same safe data. Assumes
+ * copy_from_kernel_nofault() has already happened and our safe buffer is
+ * populated.
+ */
+static void *__btf_show_obj_safe(struct btf_show *show, void *data, int size)
+{
+ if (btf_show_obj_is_safe(show, data, size))
+ return show->obj.safe + (data - show->obj.data);
+ return NULL;
+}
+
+/*
+ * Return a safe-to-access version of data pointed to by @data.
+ * We do this by copying the relevant amount of information
+ * to the struct btf_show obj.safe buffer using copy_from_kernel_nofault().
+ *
+ * If BTF_SHOW_UNSAFE is specified, just return data as-is; no
+ * safe copy is needed.
+ *
+ * Otherwise we need to determine if we have the required amount
+ * of data (determined by the @data pointer and the size of the
+ * largest base type we can encounter (represented by
+ * BTF_SHOW_OBJ_BASE_TYPE_SIZE). Having that much data ensures
+ * that we will be able to print some of the current object,
+ * and if more is needed a copy will be triggered.
+ * Some objects such as structs will not fit into the buffer;
+ * in such cases additional copies when we iterate over their
+ * members may be needed.
+ *
+ * btf_show_obj_safe() is used to return a safe buffer for
+ * btf_show_start_type(); this ensures that as we recurse into
+ * nested types we always have safe data for the given type.
+ * This approach is somewhat wasteful; it's possible for example
+ * that when iterating over a large union we'll end up copying the
+ * same data repeatedly, but the goal is safety not performance.
+ * We use stack data as opposed to per-CPU buffers because the
+ * iteration over a type can take some time, and preemption handling
+ * would greatly complicate use of the safe buffer.
+ */
+static void *btf_show_obj_safe(struct btf_show *show,
+ const struct btf_type *t,
+ void *data)
+{
+ const struct btf_type *rt;
+ int size_left, size;
+ void *safe = NULL;
+
+ if (show->flags & BTF_SHOW_UNSAFE)
+ return data;
+
+ rt = btf_resolve_size(show->btf, t, &size);
+ if (IS_ERR(rt)) {
+ show->state.status = PTR_ERR(rt);
+ return NULL;
+ }
+
+ /*
+ * Is this toplevel object? If so, set total object size and
+ * initialize pointers. Otherwise check if we still fall within
+ * our safe object data.
+ */
+ if (show->state.depth == 0) {
+ show->obj.size = size;
+ show->obj.head = data;
+ } else {
+ /*
+ * If the size of the current object is > our remaining
+ * safe buffer we _may_ need to do a new copy. However
+ * consider the case of a nested struct; it's size pushes
+ * us over the safe buffer limit, but showing any individual
+ * struct members does not. In such cases, we don't need
+ * to initiate a fresh copy yet; however we definitely need
+ * at least BTF_SHOW_OBJ_BASE_TYPE_SIZE bytes left
+ * in our buffer, regardless of the current object size.
+ * The logic here is that as we resolve types we will
+ * hit a base type at some point, and we need to be sure
+ * the next chunk of data is safely available to display
+ * that type info safely. We cannot rely on the size of
+ * the current object here because it may be much larger
+ * than our current buffer (e.g. task_struct is 8k).
+ * All we want to do here is ensure that we can print the
+ * next basic type, which we can if either
+ * - the current type size is within the safe buffer; or
+ * - at least BTF_SHOW_OBJ_BASE_TYPE_SIZE bytes are left in
+ * the safe buffer.
+ */
+ safe = __btf_show_obj_safe(show, data,
+ min(size,
+ BTF_SHOW_OBJ_BASE_TYPE_SIZE));
+ }
+
+ /*
+ * We need a new copy to our safe object, either because we haven't
+ * yet copied and are initializing safe data, or because the data
+ * we want falls outside the boundaries of the safe object.
+ */
+ if (!safe) {
+ size_left = btf_show_obj_size_left(show, data);
+ if (size_left > BTF_SHOW_OBJ_SAFE_SIZE)
+ size_left = BTF_SHOW_OBJ_SAFE_SIZE;
+ show->state.status = copy_from_kernel_nofault(show->obj.safe,
+ data, size_left);
+ if (!show->state.status) {
+ show->obj.data = data;
+ safe = show->obj.safe;
+ }
+ }
+
+ return safe;
+}
+
+/*
+ * Set the type we are starting to show and return a safe data pointer
+ * to be used for showing the associated data.
+ */
+static void *btf_show_start_type(struct btf_show *show,
+ const struct btf_type *t,
+ u32 type_id, void *data)
+{
+ show->state.type = t;
+ show->state.type_id = type_id;
+ show->state.name[0] = '\0';
+
+ return btf_show_obj_safe(show, t, data);
+}
+
+static void btf_show_end_type(struct btf_show *show)
+{
+ show->state.type = NULL;
+ show->state.type_id = 0;
+ show->state.name[0] = '\0';
+}
+
+static void *btf_show_start_aggr_type(struct btf_show *show,
+ const struct btf_type *t,
+ u32 type_id, void *data)
+{
+ void *safe_data = btf_show_start_type(show, t, type_id, data);
+
+ if (!safe_data)
+ return safe_data;
+
+ btf_show(show, "%s%s%s", btf_show_indent(show),
+ btf_show_name(show),
+ btf_show_newline(show));
+ show->state.depth++;
+ return safe_data;
+}
+
+static void btf_show_end_aggr_type(struct btf_show *show,
+ const char *suffix)
+{
+ show->state.depth--;
+ btf_show(show, "%s%s%s%s", btf_show_indent(show), suffix,
+ btf_show_delim(show), btf_show_newline(show));
+ btf_show_end_type(show);
+}
+
+static void btf_show_start_member(struct btf_show *show,
+ const struct btf_member *m)
+{
+ show->state.member = m;
+}
+
+static void btf_show_start_array_member(struct btf_show *show)
+{
+ show->state.array_member = 1;
+ btf_show_start_member(show, NULL);
+}
+
+static void btf_show_end_member(struct btf_show *show)
+{
+ show->state.member = NULL;
+}
+
+static void btf_show_end_array_member(struct btf_show *show)
+{
+ show->state.array_member = 0;
+ btf_show_end_member(show);
+}
+
+static void *btf_show_start_array_type(struct btf_show *show,
+ const struct btf_type *t,
+ u32 type_id,
+ u16 array_encoding,
+ void *data)
+{
+ show->state.array_encoding = array_encoding;
+ show->state.array_terminated = 0;
+ return btf_show_start_aggr_type(show, t, type_id, data);
+}
+
+static void btf_show_end_array_type(struct btf_show *show)
+{
+ show->state.array_encoding = 0;
+ show->state.array_terminated = 0;
+ btf_show_end_aggr_type(show, "]");
+}
+
+static void *btf_show_start_struct_type(struct btf_show *show,
+ const struct btf_type *t,
+ u32 type_id,
+ void *data)
+{
+ return btf_show_start_aggr_type(show, t, type_id, data);
+}
+
+static void btf_show_end_struct_type(struct btf_show *show)
+{
+ btf_show_end_aggr_type(show, "}");
+}
+
+__printf(2, 3) static void __btf_verifier_log(struct bpf_verifier_log *log,
+ const char *fmt, ...)
+{
+ va_list args;
+
+ va_start(args, fmt);
+ bpf_verifier_vlog(log, fmt, args);
+ va_end(args);
+}
+
+__printf(2, 3) static void btf_verifier_log(struct btf_verifier_env *env,
+ const char *fmt, ...)
+{
+ struct bpf_verifier_log *log = &env->log;
+ va_list args;
+
+ if (!bpf_verifier_log_needed(log))
+ return;
+
+ va_start(args, fmt);
+ bpf_verifier_vlog(log, fmt, args);
+ va_end(args);
+}
+
+__printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ bool log_details,
+ const char *fmt, ...)
+{
+ struct bpf_verifier_log *log = &env->log;
+ struct btf *btf = env->btf;
+ va_list args;
+
+ if (!bpf_verifier_log_needed(log))
+ return;
+
+ if (log->level == BPF_LOG_KERNEL) {
+ /* btf verifier prints all types it is processing via
+ * btf_verifier_log_type(..., fmt = NULL).
+ * Skip those prints for in-kernel BTF verification.
+ */
+ if (!fmt)
+ return;
+
+ /* Skip logging when loading module BTF with mismatches permitted */
+ if (env->btf->base_btf && IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH))
+ return;
+ }
+
+ __btf_verifier_log(log, "[%u] %s %s%s",
+ env->log_type_id,
+ btf_type_str(t),
+ __btf_name_by_offset(btf, t->name_off),
+ log_details ? " " : "");
+
+ if (log_details)
+ btf_type_ops(t)->log_details(env, t);
+
+ if (fmt && *fmt) {
+ __btf_verifier_log(log, " ");
+ va_start(args, fmt);
+ bpf_verifier_vlog(log, fmt, args);
+ va_end(args);
+ }
+
+ __btf_verifier_log(log, "\n");
+}
+
+#define btf_verifier_log_type(env, t, ...) \
+ __btf_verifier_log_type((env), (t), true, __VA_ARGS__)
+#define btf_verifier_log_basic(env, t, ...) \
+ __btf_verifier_log_type((env), (t), false, __VA_ARGS__)
+
+__printf(4, 5)
+static void btf_verifier_log_member(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const char *fmt, ...)
+{
+ struct bpf_verifier_log *log = &env->log;
+ struct btf *btf = env->btf;
+ va_list args;
+
+ if (!bpf_verifier_log_needed(log))
+ return;
+
+ if (log->level == BPF_LOG_KERNEL) {
+ if (!fmt)
+ return;
+
+ /* Skip logging when loading module BTF with mismatches permitted */
+ if (env->btf->base_btf && IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH))
+ return;
+ }
+
+ /* The CHECK_META phase already did a btf dump.
+ *
+ * If member is logged again, it must hit an error in
+ * parsing this member. It is useful to print out which
+ * struct this member belongs to.
+ */
+ if (env->phase != CHECK_META)
+ btf_verifier_log_type(env, struct_type, NULL);
+
+ if (btf_type_kflag(struct_type))
+ __btf_verifier_log(log,
+ "\t%s type_id=%u bitfield_size=%u bits_offset=%u",
+ __btf_name_by_offset(btf, member->name_off),
+ member->type,
+ BTF_MEMBER_BITFIELD_SIZE(member->offset),
+ BTF_MEMBER_BIT_OFFSET(member->offset));
+ else
+ __btf_verifier_log(log, "\t%s type_id=%u bits_offset=%u",
+ __btf_name_by_offset(btf, member->name_off),
+ member->type, member->offset);
+
+ if (fmt && *fmt) {
+ __btf_verifier_log(log, " ");
+ va_start(args, fmt);
+ bpf_verifier_vlog(log, fmt, args);
+ va_end(args);
+ }
+
+ __btf_verifier_log(log, "\n");
+}
+
+__printf(4, 5)
+static void btf_verifier_log_vsi(struct btf_verifier_env *env,
+ const struct btf_type *datasec_type,
+ const struct btf_var_secinfo *vsi,
+ const char *fmt, ...)
+{
+ struct bpf_verifier_log *log = &env->log;
+ va_list args;
+
+ if (!bpf_verifier_log_needed(log))
+ return;
+ if (log->level == BPF_LOG_KERNEL && !fmt)
+ return;
+ if (env->phase != CHECK_META)
+ btf_verifier_log_type(env, datasec_type, NULL);
+
+ __btf_verifier_log(log, "\t type_id=%u offset=%u size=%u",
+ vsi->type, vsi->offset, vsi->size);
+ if (fmt && *fmt) {
+ __btf_verifier_log(log, " ");
+ va_start(args, fmt);
+ bpf_verifier_vlog(log, fmt, args);
+ va_end(args);
+ }
+
+ __btf_verifier_log(log, "\n");
+}
+
+static void btf_verifier_log_hdr(struct btf_verifier_env *env,
+ u32 btf_data_size)
+{
+ struct bpf_verifier_log *log = &env->log;
+ const struct btf *btf = env->btf;
+ const struct btf_header *hdr;
+
+ if (!bpf_verifier_log_needed(log))
+ return;
+
+ if (log->level == BPF_LOG_KERNEL)
+ return;
+ hdr = &btf->hdr;
+ __btf_verifier_log(log, "magic: 0x%x\n", hdr->magic);
+ __btf_verifier_log(log, "version: %u\n", hdr->version);
+ __btf_verifier_log(log, "flags: 0x%x\n", hdr->flags);
+ __btf_verifier_log(log, "hdr_len: %u\n", hdr->hdr_len);
+ __btf_verifier_log(log, "type_off: %u\n", hdr->type_off);
+ __btf_verifier_log(log, "type_len: %u\n", hdr->type_len);
+ __btf_verifier_log(log, "str_off: %u\n", hdr->str_off);
+ __btf_verifier_log(log, "str_len: %u\n", hdr->str_len);
+ __btf_verifier_log(log, "btf_total_size: %u\n", btf_data_size);
+}
+
+static int btf_add_type(struct btf_verifier_env *env, struct btf_type *t)
+{
+ struct btf *btf = env->btf;
+
+ if (btf->types_size == btf->nr_types) {
+ /* Expand 'types' array */
+
+ struct btf_type **new_types;
+ u32 expand_by, new_size;
+
+ if (btf->start_id + btf->types_size == BTF_MAX_TYPE) {
+ btf_verifier_log(env, "Exceeded max num of types");
+ return -E2BIG;
+ }
+
+ expand_by = max_t(u32, btf->types_size >> 2, 16);
+ new_size = min_t(u32, BTF_MAX_TYPE,
+ btf->types_size + expand_by);
+
+ new_types = kvcalloc(new_size, sizeof(*new_types),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!new_types)
+ return -ENOMEM;
+
+ if (btf->nr_types == 0) {
+ if (!btf->base_btf) {
+ /* lazily init VOID type */
+ new_types[0] = &btf_void;
+ btf->nr_types++;
+ }
+ } else {
+ memcpy(new_types, btf->types,
+ sizeof(*btf->types) * btf->nr_types);
+ }
+
+ kvfree(btf->types);
+ btf->types = new_types;
+ btf->types_size = new_size;
+ }
+
+ btf->types[btf->nr_types++] = t;
+
+ return 0;
+}
+
+static int btf_alloc_id(struct btf *btf)
+{
+ int id;
+
+ idr_preload(GFP_KERNEL);
+ spin_lock_bh(&btf_idr_lock);
+ id = idr_alloc_cyclic(&btf_idr, btf, 1, INT_MAX, GFP_ATOMIC);
+ if (id > 0)
+ btf->id = id;
+ spin_unlock_bh(&btf_idr_lock);
+ idr_preload_end();
+
+ if (WARN_ON_ONCE(!id))
+ return -ENOSPC;
+
+ return id > 0 ? 0 : id;
+}
+
+static void btf_free_id(struct btf *btf)
+{
+ unsigned long flags;
+
+ /*
+ * In map-in-map, calling map_delete_elem() on outer
+ * map will call bpf_map_put on the inner map.
+ * It will then eventually call btf_free_id()
+ * on the inner map. Some of the map_delete_elem()
+ * implementation may have irq disabled, so
+ * we need to use the _irqsave() version instead
+ * of the _bh() version.
+ */
+ spin_lock_irqsave(&btf_idr_lock, flags);
+ idr_remove(&btf_idr, btf->id);
+ spin_unlock_irqrestore(&btf_idr_lock, flags);
+}
+
+static void btf_free_kfunc_set_tab(struct btf *btf)
+{
+ struct btf_kfunc_set_tab *tab = btf->kfunc_set_tab;
+ int hook;
+
+ if (!tab)
+ return;
+ for (hook = 0; hook < ARRAY_SIZE(tab->sets); hook++)
+ kfree(tab->sets[hook]);
+ kfree(tab);
+ btf->kfunc_set_tab = NULL;
+}
+
+static void btf_free_dtor_kfunc_tab(struct btf *btf)
+{
+ struct btf_id_dtor_kfunc_tab *tab = btf->dtor_kfunc_tab;
+
+ if (!tab)
+ return;
+ kfree(tab);
+ btf->dtor_kfunc_tab = NULL;
+}
+
+static void btf_struct_metas_free(struct btf_struct_metas *tab)
+{
+ int i;
+
+ if (!tab)
+ return;
+ for (i = 0; i < tab->cnt; i++)
+ btf_record_free(tab->types[i].record);
+ kfree(tab);
+}
+
+static void btf_free_struct_meta_tab(struct btf *btf)
+{
+ struct btf_struct_metas *tab = btf->struct_meta_tab;
+
+ btf_struct_metas_free(tab);
+ btf->struct_meta_tab = NULL;
+}
+
+static void btf_free_struct_ops_tab(struct btf *btf)
+{
+ struct btf_struct_ops_tab *tab = btf->struct_ops_tab;
+ u32 i;
+
+ if (!tab)
+ return;
+
+ for (i = 0; i < tab->cnt; i++)
+ bpf_struct_ops_desc_release(&tab->ops[i]);
+
+ kfree(tab);
+ btf->struct_ops_tab = NULL;
+}
+
+static void btf_free(struct btf *btf)
+{
+ btf_free_struct_meta_tab(btf);
+ btf_free_dtor_kfunc_tab(btf);
+ btf_free_kfunc_set_tab(btf);
+ btf_free_struct_ops_tab(btf);
+ kvfree(btf->types);
+ kvfree(btf->resolved_sizes);
+ kvfree(btf->resolved_ids);
+ /* vmlinux does not allocate btf->data, it simply points it at
+ * __start_BTF.
+ */
+ if (!btf_is_vmlinux(btf))
+ kvfree(btf->data);
+ kvfree(btf->base_id_map);
+ kfree(btf);
+}
+
+static void btf_free_rcu(struct rcu_head *rcu)
+{
+ struct btf *btf = container_of(rcu, struct btf, rcu);
+
+ btf_free(btf);
+}
+
+const char *btf_get_name(const struct btf *btf)
+{
+ return btf->name;
+}
+
+void btf_get(struct btf *btf)
+{
+ refcount_inc(&btf->refcnt);
+}
+
+void btf_put(struct btf *btf)
+{
+ if (btf && refcount_dec_and_test(&btf->refcnt)) {
+ btf_free_id(btf);
+ call_rcu(&btf->rcu, btf_free_rcu);
+ }
+}
+
+struct btf *btf_base_btf(const struct btf *btf)
+{
+ return btf->base_btf;
+}
+
+const struct btf_header *btf_header(const struct btf *btf)
+{
+ return &btf->hdr;
+}
+
+void btf_set_base_btf(struct btf *btf, const struct btf *base_btf)
+{
+ btf->base_btf = (struct btf *)base_btf;
+ btf->start_id = btf_nr_types(base_btf);
+ btf->start_str_off = base_btf->hdr.str_len;
+}
+
+static int env_resolve_init(struct btf_verifier_env *env)
+{
+ struct btf *btf = env->btf;
+ u32 nr_types = btf->nr_types;
+ u32 *resolved_sizes = NULL;
+ u32 *resolved_ids = NULL;
+ u8 *visit_states = NULL;
+
+ resolved_sizes = kvcalloc(nr_types, sizeof(*resolved_sizes),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!resolved_sizes)
+ goto nomem;
+
+ resolved_ids = kvcalloc(nr_types, sizeof(*resolved_ids),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!resolved_ids)
+ goto nomem;
+
+ visit_states = kvcalloc(nr_types, sizeof(*visit_states),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!visit_states)
+ goto nomem;
+
+ btf->resolved_sizes = resolved_sizes;
+ btf->resolved_ids = resolved_ids;
+ env->visit_states = visit_states;
+
+ return 0;
+
+nomem:
+ kvfree(resolved_sizes);
+ kvfree(resolved_ids);
+ kvfree(visit_states);
+ return -ENOMEM;
+}
+
+static void btf_verifier_env_free(struct btf_verifier_env *env)
+{
+ kvfree(env->visit_states);
+ kfree(env);
+}
+
+static bool env_type_is_resolve_sink(const struct btf_verifier_env *env,
+ const struct btf_type *next_type)
+{
+ switch (env->resolve_mode) {
+ case RESOLVE_TBD:
+ /* int, enum or void is a sink */
+ return !btf_type_needs_resolve(next_type);
+ case RESOLVE_PTR:
+ /* int, enum, void, struct, array, func or func_proto is a sink
+ * for ptr
+ */
+ return !btf_type_is_modifier(next_type) &&
+ !btf_type_is_ptr(next_type);
+ case RESOLVE_STRUCT_OR_ARRAY:
+ /* int, enum, void, ptr, func or func_proto is a sink
+ * for struct and array
+ */
+ return !btf_type_is_modifier(next_type) &&
+ !btf_type_is_array(next_type) &&
+ !btf_type_is_struct(next_type);
+ default:
+ BUG();
+ }
+}
+
+static bool env_type_is_resolved(const struct btf_verifier_env *env,
+ u32 type_id)
+{
+ /* base BTF types should be resolved by now */
+ if (type_id < env->btf->start_id)
+ return true;
+
+ return env->visit_states[type_id - env->btf->start_id] == RESOLVED;
+}
+
+static int env_stack_push(struct btf_verifier_env *env,
+ const struct btf_type *t, u32 type_id)
+{
+ const struct btf *btf = env->btf;
+ struct resolve_vertex *v;
+
+ if (env->top_stack == MAX_RESOLVE_DEPTH)
+ return -E2BIG;
+
+ if (type_id < btf->start_id
+ || env->visit_states[type_id - btf->start_id] != NOT_VISITED)
+ return -EEXIST;
+
+ env->visit_states[type_id - btf->start_id] = VISITED;
+
+ v = &env->stack[env->top_stack++];
+ v->t = t;
+ v->type_id = type_id;
+ v->next_member = 0;
+
+ if (env->resolve_mode == RESOLVE_TBD) {
+ if (btf_type_is_ptr(t))
+ env->resolve_mode = RESOLVE_PTR;
+ else if (btf_type_is_struct(t) || btf_type_is_array(t))
+ env->resolve_mode = RESOLVE_STRUCT_OR_ARRAY;
+ }
+
+ return 0;
+}
+
+static void env_stack_set_next_member(struct btf_verifier_env *env,
+ u16 next_member)
+{
+ env->stack[env->top_stack - 1].next_member = next_member;
+}
+
+static void env_stack_pop_resolved(struct btf_verifier_env *env,
+ u32 resolved_type_id,
+ u32 resolved_size)
+{
+ u32 type_id = env->stack[--(env->top_stack)].type_id;
+ struct btf *btf = env->btf;
+
+ type_id -= btf->start_id; /* adjust to local type id */
+ btf->resolved_sizes[type_id] = resolved_size;
+ btf->resolved_ids[type_id] = resolved_type_id;
+ env->visit_states[type_id] = RESOLVED;
+}
+
+static const struct resolve_vertex *env_stack_peak(struct btf_verifier_env *env)
+{
+ return env->top_stack ? &env->stack[env->top_stack - 1] : NULL;
+}
+
+/* Resolve the size of a passed-in "type"
+ *
+ * type: is an array (e.g. u32 array[x][y])
+ * return type: type "u32[x][y]", i.e. BTF_KIND_ARRAY,
+ * *type_size: (x * y * sizeof(u32)). Hence, *type_size always
+ * corresponds to the return type.
+ * *elem_type: u32
+ * *elem_id: id of u32
+ * *total_nelems: (x * y). Hence, individual elem size is
+ * (*type_size / *total_nelems)
+ * *type_id: id of type if it's changed within the function, 0 if not
+ *
+ * type: is not an array (e.g. const struct X)
+ * return type: type "struct X"
+ * *type_size: sizeof(struct X)
+ * *elem_type: same as return type ("struct X")
+ * *elem_id: 0
+ * *total_nelems: 1
+ * *type_id: id of type if it's changed within the function, 0 if not
+ */
+static const struct btf_type *
+__btf_resolve_size(const struct btf *btf, const struct btf_type *type,
+ u32 *type_size, const struct btf_type **elem_type,
+ u32 *elem_id, u32 *total_nelems, u32 *type_id)
+{
+ const struct btf_type *array_type = NULL;
+ const struct btf_array *array = NULL;
+ u32 i, size, nelems = 1, id = 0;
+
+ for (i = 0; i < MAX_RESOLVE_DEPTH; i++) {
+ switch (BTF_INFO_KIND(type->info)) {
+ /* type->size can be used */
+ case BTF_KIND_INT:
+ case BTF_KIND_STRUCT:
+ case BTF_KIND_UNION:
+ case BTF_KIND_ENUM:
+ case BTF_KIND_FLOAT:
+ case BTF_KIND_ENUM64:
+ size = type->size;
+ goto resolved;
+
+ case BTF_KIND_PTR:
+ size = sizeof(void *);
+ goto resolved;
+
+ /* Modifiers */
+ case BTF_KIND_TYPEDEF:
+ case BTF_KIND_VOLATILE:
+ case BTF_KIND_CONST:
+ case BTF_KIND_RESTRICT:
+ case BTF_KIND_TYPE_TAG:
+ id = type->type;
+ type = btf_type_by_id(btf, type->type);
+ break;
+
+ case BTF_KIND_ARRAY:
+ if (!array_type)
+ array_type = type;
+ array = btf_type_array(type);
+ if (nelems && array->nelems > U32_MAX / nelems)
+ return ERR_PTR(-EINVAL);
+ nelems *= array->nelems;
+ type = btf_type_by_id(btf, array->type);
+ break;
+
+ /* type without size */
+ default:
+ return ERR_PTR(-EINVAL);
+ }
+ }
+
+ return ERR_PTR(-EINVAL);
+
+resolved:
+ if (nelems && size > U32_MAX / nelems)
+ return ERR_PTR(-EINVAL);
+
+ *type_size = nelems * size;
+ if (total_nelems)
+ *total_nelems = nelems;
+ if (elem_type)
+ *elem_type = type;
+ if (elem_id)
+ *elem_id = array ? array->type : 0;
+ if (type_id && id)
+ *type_id = id;
+
+ return array_type ? : type;
+}
+
+const struct btf_type *
+btf_resolve_size(const struct btf *btf, const struct btf_type *type,
+ u32 *type_size)
+{
+ return __btf_resolve_size(btf, type, type_size, NULL, NULL, NULL, NULL);
+}
+
+static u32 btf_resolved_type_id(const struct btf *btf, u32 type_id)
+{
+ while (type_id < btf->start_id)
+ btf = btf->base_btf;
+
+ return btf->resolved_ids[type_id - btf->start_id];
+}
+
+/* The input param "type_id" must point to a needs_resolve type */
+static const struct btf_type *btf_type_id_resolve(const struct btf *btf,
+ u32 *type_id)
+{
+ *type_id = btf_resolved_type_id(btf, *type_id);
+ return btf_type_by_id(btf, *type_id);
+}
+
+static u32 btf_resolved_type_size(const struct btf *btf, u32 type_id)
+{
+ while (type_id < btf->start_id)
+ btf = btf->base_btf;
+
+ return btf->resolved_sizes[type_id - btf->start_id];
+}
+
+const struct btf_type *btf_type_id_size(const struct btf *btf,
+ u32 *type_id, u32 *ret_size)
+{
+ const struct btf_type *size_type;
+ u32 size_type_id = *type_id;
+ u32 size = 0;
+
+ size_type = btf_type_by_id(btf, size_type_id);
+ if (btf_type_nosize_or_null(size_type))
+ return NULL;
+
+ if (btf_type_has_size(size_type)) {
+ size = size_type->size;
+ } else if (btf_type_is_array(size_type)) {
+ size = btf_resolved_type_size(btf, size_type_id);
+ } else if (btf_type_is_ptr(size_type)) {
+ size = sizeof(void *);
+ } else {
+ if (WARN_ON_ONCE(!btf_type_is_modifier(size_type) &&
+ !btf_type_is_var(size_type)))
+ return NULL;
+
+ size_type_id = btf_resolved_type_id(btf, size_type_id);
+ size_type = btf_type_by_id(btf, size_type_id);
+ if (btf_type_nosize_or_null(size_type))
+ return NULL;
+ else if (btf_type_has_size(size_type))
+ size = size_type->size;
+ else if (btf_type_is_array(size_type))
+ size = btf_resolved_type_size(btf, size_type_id);
+ else if (btf_type_is_ptr(size_type))
+ size = sizeof(void *);
+ else
+ return NULL;
+ }
+
+ *type_id = size_type_id;
+ if (ret_size)
+ *ret_size = size;
+
+ return size_type;
+}
+
+static int btf_df_check_member(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const struct btf_type *member_type)
+{
+ btf_verifier_log_basic(env, struct_type,
+ "Unsupported check_member");
+ return -EINVAL;
+}
+
+static int btf_df_check_kflag_member(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const struct btf_type *member_type)
+{
+ btf_verifier_log_basic(env, struct_type,
+ "Unsupported check_kflag_member");
+ return -EINVAL;
+}
+
+/* Used for ptr, array struct/union and float type members.
+ * int, enum and modifier types have their specific callback functions.
+ */
+static int btf_generic_check_kflag_member(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const struct btf_type *member_type)
+{
+ if (BTF_MEMBER_BITFIELD_SIZE(member->offset)) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Invalid member bitfield_size");
+ return -EINVAL;
+ }
+
+ /* bitfield size is 0, so member->offset represents bit offset only.
+ * It is safe to call non kflag check_member variants.
+ */
+ return btf_type_ops(member_type)->check_member(env, struct_type,
+ member,
+ member_type);
+}
+
+static int btf_df_resolve(struct btf_verifier_env *env,
+ const struct resolve_vertex *v)
+{
+ btf_verifier_log_basic(env, v->t, "Unsupported resolve");
+ return -EINVAL;
+}
+
+static void btf_df_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offsets,
+ struct btf_show *show)
+{
+ btf_show(show, "<unsupported kind:%u>", BTF_INFO_KIND(t->info));
+}
+
+static int btf_int_check_member(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const struct btf_type *member_type)
+{
+ u32 int_data = btf_type_int(member_type);
+ u32 struct_bits_off = member->offset;
+ u32 struct_size = struct_type->size;
+ u32 nr_copy_bits;
+ u32 bytes_offset;
+
+ if (U32_MAX - struct_bits_off < BTF_INT_OFFSET(int_data)) {
+ btf_verifier_log_member(env, struct_type, member,
+ "bits_offset exceeds U32_MAX");
+ return -EINVAL;
+ }
+
+ struct_bits_off += BTF_INT_OFFSET(int_data);
+ bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off);
+ nr_copy_bits = BTF_INT_BITS(int_data) +
+ BITS_PER_BYTE_MASKED(struct_bits_off);
+
+ if (nr_copy_bits > BITS_PER_U128) {
+ btf_verifier_log_member(env, struct_type, member,
+ "nr_copy_bits exceeds 128");
+ return -EINVAL;
+ }
+
+ if (struct_size < bytes_offset ||
+ struct_size - bytes_offset < BITS_ROUNDUP_BYTES(nr_copy_bits)) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Member exceeds struct_size");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int btf_int_check_kflag_member(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const struct btf_type *member_type)
+{
+ u32 struct_bits_off, nr_bits, nr_int_data_bits, bytes_offset;
+ u32 int_data = btf_type_int(member_type);
+ u32 struct_size = struct_type->size;
+ u32 nr_copy_bits;
+
+ /* a regular int type is required for the kflag int member */
+ if (!btf_type_int_is_regular(member_type)) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Invalid member base type");
+ return -EINVAL;
+ }
+
+ /* check sanity of bitfield size */
+ nr_bits = BTF_MEMBER_BITFIELD_SIZE(member->offset);
+ struct_bits_off = BTF_MEMBER_BIT_OFFSET(member->offset);
+ nr_int_data_bits = BTF_INT_BITS(int_data);
+ if (!nr_bits) {
+ /* Not a bitfield member, member offset must be at byte
+ * boundary.
+ */
+ if (BITS_PER_BYTE_MASKED(struct_bits_off)) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Invalid member offset");
+ return -EINVAL;
+ }
+
+ nr_bits = nr_int_data_bits;
+ } else if (nr_bits > nr_int_data_bits) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Invalid member bitfield_size");
+ return -EINVAL;
+ }
+
+ bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off);
+ nr_copy_bits = nr_bits + BITS_PER_BYTE_MASKED(struct_bits_off);
+ if (nr_copy_bits > BITS_PER_U128) {
+ btf_verifier_log_member(env, struct_type, member,
+ "nr_copy_bits exceeds 128");
+ return -EINVAL;
+ }
+
+ if (struct_size < bytes_offset ||
+ struct_size - bytes_offset < BITS_ROUNDUP_BYTES(nr_copy_bits)) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Member exceeds struct_size");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static s32 btf_int_check_meta(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left)
+{
+ u32 int_data, nr_bits, meta_needed = sizeof(int_data);
+ u16 encoding;
+
+ if (meta_left < meta_needed) {
+ btf_verifier_log_basic(env, t,
+ "meta_left:%u meta_needed:%u",
+ meta_left, meta_needed);
+ return -EINVAL;
+ }
+
+ if (btf_type_vlen(t)) {
+ btf_verifier_log_type(env, t, "vlen != 0");
+ return -EINVAL;
+ }
+
+ if (btf_type_kflag(t)) {
+ btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
+ return -EINVAL;
+ }
+
+ int_data = btf_type_int(t);
+ if (int_data & ~BTF_INT_MASK) {
+ btf_verifier_log_basic(env, t, "Invalid int_data:%x",
+ int_data);
+ return -EINVAL;
+ }
+
+ nr_bits = BTF_INT_BITS(int_data) + BTF_INT_OFFSET(int_data);
+
+ if (nr_bits > BITS_PER_U128) {
+ btf_verifier_log_type(env, t, "nr_bits exceeds %zu",
+ BITS_PER_U128);
+ return -EINVAL;
+ }
+
+ if (BITS_ROUNDUP_BYTES(nr_bits) > t->size) {
+ btf_verifier_log_type(env, t, "nr_bits exceeds type_size");
+ return -EINVAL;
+ }
+
+ /*
+ * Only one of the encoding bits is allowed and it
+ * should be sufficient for the pretty print purpose (i.e. decoding).
+ * Multiple bits can be allowed later if it is found
+ * to be insufficient.
+ */
+ encoding = BTF_INT_ENCODING(int_data);
+ if (encoding &&
+ encoding != BTF_INT_SIGNED &&
+ encoding != BTF_INT_CHAR &&
+ encoding != BTF_INT_BOOL) {
+ btf_verifier_log_type(env, t, "Unsupported encoding");
+ return -ENOTSUPP;
+ }
+
+ btf_verifier_log_type(env, t, NULL);
+
+ return meta_needed;
+}
+
+static void btf_int_log(struct btf_verifier_env *env,
+ const struct btf_type *t)
+{
+ int int_data = btf_type_int(t);
+
+ btf_verifier_log(env,
+ "size=%u bits_offset=%u nr_bits=%u encoding=%s",
+ t->size, BTF_INT_OFFSET(int_data),
+ BTF_INT_BITS(int_data),
+ btf_int_encoding_str(BTF_INT_ENCODING(int_data)));
+}
+
+static void btf_int128_print(struct btf_show *show, void *data)
+{
+ /* data points to a __int128 number.
+ * Suppose
+ * int128_num = *(__int128 *)data;
+ * The below formulas shows what upper_num and lower_num represents:
+ * upper_num = int128_num >> 64;
+ * lower_num = int128_num & 0xffffffffFFFFFFFFULL;
+ */
+ u64 upper_num, lower_num;
+
+#ifdef __BIG_ENDIAN_BITFIELD
+ upper_num = *(u64 *)data;
+ lower_num = *(u64 *)(data + 8);
+#else
+ upper_num = *(u64 *)(data + 8);
+ lower_num = *(u64 *)data;
+#endif
+ if (upper_num == 0)
+ btf_show_type_value(show, "0x%llx", lower_num);
+ else
+ btf_show_type_values(show, "0x%llx%016llx", upper_num,
+ lower_num);
+}
+
+static void btf_int128_shift(u64 *print_num, u16 left_shift_bits,
+ u16 right_shift_bits)
+{
+ u64 upper_num, lower_num;
+
+#ifdef __BIG_ENDIAN_BITFIELD
+ upper_num = print_num[0];
+ lower_num = print_num[1];
+#else
+ upper_num = print_num[1];
+ lower_num = print_num[0];
+#endif
+
+ /* shake out un-needed bits by shift/or operations */
+ if (left_shift_bits >= 64) {
+ upper_num = lower_num << (left_shift_bits - 64);
+ lower_num = 0;
+ } else {
+ upper_num = (upper_num << left_shift_bits) |
+ (lower_num >> (64 - left_shift_bits));
+ lower_num = lower_num << left_shift_bits;
+ }
+
+ if (right_shift_bits >= 64) {
+ lower_num = upper_num >> (right_shift_bits - 64);
+ upper_num = 0;
+ } else {
+ lower_num = (lower_num >> right_shift_bits) |
+ (upper_num << (64 - right_shift_bits));
+ upper_num = upper_num >> right_shift_bits;
+ }
+
+#ifdef __BIG_ENDIAN_BITFIELD
+ print_num[0] = upper_num;
+ print_num[1] = lower_num;
+#else
+ print_num[0] = lower_num;
+ print_num[1] = upper_num;
+#endif
+}
+
+static void btf_bitfield_show(void *data, u8 bits_offset,
+ u8 nr_bits, struct btf_show *show)
+{
+ u16 left_shift_bits, right_shift_bits;
+ u8 nr_copy_bytes;
+ u8 nr_copy_bits;
+ u64 print_num[2] = {};
+
+ nr_copy_bits = nr_bits + bits_offset;
+ nr_copy_bytes = BITS_ROUNDUP_BYTES(nr_copy_bits);
+
+ memcpy(print_num, data, nr_copy_bytes);
+
+#ifdef __BIG_ENDIAN_BITFIELD
+ left_shift_bits = bits_offset;
+#else
+ left_shift_bits = BITS_PER_U128 - nr_copy_bits;
+#endif
+ right_shift_bits = BITS_PER_U128 - nr_bits;
+
+ btf_int128_shift(print_num, left_shift_bits, right_shift_bits);
+ btf_int128_print(show, print_num);
+}
+
+
+static void btf_int_bits_show(const struct btf *btf,
+ const struct btf_type *t,
+ void *data, u8 bits_offset,
+ struct btf_show *show)
+{
+ u32 int_data = btf_type_int(t);
+ u8 nr_bits = BTF_INT_BITS(int_data);
+ u8 total_bits_offset;
+
+ /*
+ * bits_offset is at most 7.
+ * BTF_INT_OFFSET() cannot exceed 128 bits.
+ */
+ total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data);
+ data += BITS_ROUNDDOWN_BYTES(total_bits_offset);
+ bits_offset = BITS_PER_BYTE_MASKED(total_bits_offset);
+ btf_bitfield_show(data, bits_offset, nr_bits, show);
+}
+
+static void btf_int_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offset,
+ struct btf_show *show)
+{
+ u32 int_data = btf_type_int(t);
+ u8 encoding = BTF_INT_ENCODING(int_data);
+ bool sign = encoding & BTF_INT_SIGNED;
+ u8 nr_bits = BTF_INT_BITS(int_data);
+ void *safe_data;
+
+ safe_data = btf_show_start_type(show, t, type_id, data);
+ if (!safe_data)
+ return;
+
+ if (bits_offset || BTF_INT_OFFSET(int_data) ||
+ BITS_PER_BYTE_MASKED(nr_bits)) {
+ btf_int_bits_show(btf, t, safe_data, bits_offset, show);
+ goto out;
+ }
+
+ switch (nr_bits) {
+ case 128:
+ btf_int128_print(show, safe_data);
+ break;
+ case 64:
+ if (sign)
+ btf_show_type_value(show, "%lld", *(s64 *)safe_data);
+ else
+ btf_show_type_value(show, "%llu", *(u64 *)safe_data);
+ break;
+ case 32:
+ if (sign)
+ btf_show_type_value(show, "%d", *(s32 *)safe_data);
+ else
+ btf_show_type_value(show, "%u", *(u32 *)safe_data);
+ break;
+ case 16:
+ if (sign)
+ btf_show_type_value(show, "%d", *(s16 *)safe_data);
+ else
+ btf_show_type_value(show, "%u", *(u16 *)safe_data);
+ break;
+ case 8:
+ if (show->state.array_encoding == BTF_INT_CHAR) {
+ /* check for null terminator */
+ if (show->state.array_terminated)
+ break;
+ if (*(char *)data == '\0') {
+ show->state.array_terminated = 1;
+ break;
+ }
+ if (isprint(*(char *)data)) {
+ btf_show_type_value(show, "'%c'",
+ *(char *)safe_data);
+ break;
+ }
+ }
+ if (sign)
+ btf_show_type_value(show, "%d", *(s8 *)safe_data);
+ else
+ btf_show_type_value(show, "%u", *(u8 *)safe_data);
+ break;
+ default:
+ btf_int_bits_show(btf, t, safe_data, bits_offset, show);
+ break;
+ }
+out:
+ btf_show_end_type(show);
+}
+
+static const struct btf_kind_operations int_ops = {
+ .check_meta = btf_int_check_meta,
+ .resolve = btf_df_resolve,
+ .check_member = btf_int_check_member,
+ .check_kflag_member = btf_int_check_kflag_member,
+ .log_details = btf_int_log,
+ .show = btf_int_show,
+};
+
+static int btf_modifier_check_member(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const struct btf_type *member_type)
+{
+ const struct btf_type *resolved_type;
+ u32 resolved_type_id = member->type;
+ struct btf_member resolved_member;
+ struct btf *btf = env->btf;
+
+ resolved_type = btf_type_id_size(btf, &resolved_type_id, NULL);
+ if (!resolved_type) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Invalid member");
+ return -EINVAL;
+ }
+
+ resolved_member = *member;
+ resolved_member.type = resolved_type_id;
+
+ return btf_type_ops(resolved_type)->check_member(env, struct_type,
+ &resolved_member,
+ resolved_type);
+}
+
+static int btf_modifier_check_kflag_member(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const struct btf_type *member_type)
+{
+ const struct btf_type *resolved_type;
+ u32 resolved_type_id = member->type;
+ struct btf_member resolved_member;
+ struct btf *btf = env->btf;
+
+ resolved_type = btf_type_id_size(btf, &resolved_type_id, NULL);
+ if (!resolved_type) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Invalid member");
+ return -EINVAL;
+ }
+
+ resolved_member = *member;
+ resolved_member.type = resolved_type_id;
+
+ return btf_type_ops(resolved_type)->check_kflag_member(env, struct_type,
+ &resolved_member,
+ resolved_type);
+}
+
+static int btf_ptr_check_member(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const struct btf_type *member_type)
+{
+ u32 struct_size, struct_bits_off, bytes_offset;
+
+ struct_size = struct_type->size;
+ struct_bits_off = member->offset;
+ bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off);
+
+ if (BITS_PER_BYTE_MASKED(struct_bits_off)) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Member is not byte aligned");
+ return -EINVAL;
+ }
+
+ if (struct_size - bytes_offset < sizeof(void *)) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Member exceeds struct_size");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int btf_ref_type_check_meta(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left)
+{
+ const char *value;
+
+ if (btf_type_vlen(t)) {
+ btf_verifier_log_type(env, t, "vlen != 0");
+ return -EINVAL;
+ }
+
+ if (btf_type_kflag(t) && !btf_type_is_type_tag(t)) {
+ btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
+ return -EINVAL;
+ }
+
+ if (!BTF_TYPE_ID_VALID(t->type)) {
+ btf_verifier_log_type(env, t, "Invalid type_id");
+ return -EINVAL;
+ }
+
+ /* typedef/type_tag type must have a valid name, and other ref types,
+ * volatile, const, restrict, should have a null name.
+ */
+ if (BTF_INFO_KIND(t->info) == BTF_KIND_TYPEDEF) {
+ if (!t->name_off ||
+ !btf_name_valid_identifier(env->btf, t->name_off)) {
+ btf_verifier_log_type(env, t, "Invalid name");
+ return -EINVAL;
+ }
+ } else if (BTF_INFO_KIND(t->info) == BTF_KIND_TYPE_TAG) {
+ value = btf_name_by_offset(env->btf, t->name_off);
+ if (!value || !value[0]) {
+ btf_verifier_log_type(env, t, "Invalid name");
+ return -EINVAL;
+ }
+ } else {
+ if (t->name_off) {
+ btf_verifier_log_type(env, t, "Invalid name");
+ return -EINVAL;
+ }
+ }
+
+ btf_verifier_log_type(env, t, NULL);
+
+ return 0;
+}
+
+static int btf_modifier_resolve(struct btf_verifier_env *env,
+ const struct resolve_vertex *v)
+{
+ const struct btf_type *t = v->t;
+ const struct btf_type *next_type;
+ u32 next_type_id = t->type;
+ struct btf *btf = env->btf;
+
+ next_type = btf_type_by_id(btf, next_type_id);
+ if (!next_type || btf_type_is_resolve_source_only(next_type)) {
+ btf_verifier_log_type(env, v->t, "Invalid type_id");
+ return -EINVAL;
+ }
+
+ if (!env_type_is_resolve_sink(env, next_type) &&
+ !env_type_is_resolved(env, next_type_id))
+ return env_stack_push(env, next_type, next_type_id);
+
+ /* Figure out the resolved next_type_id with size.
+ * They will be stored in the current modifier's
+ * resolved_ids and resolved_sizes such that it can
+ * save us a few type-following when we use it later (e.g. in
+ * pretty print).
+ */
+ if (!btf_type_id_size(btf, &next_type_id, NULL)) {
+ if (env_type_is_resolved(env, next_type_id))
+ next_type = btf_type_id_resolve(btf, &next_type_id);
+
+ /* "typedef void new_void", "const void"...etc */
+ if (!btf_type_is_void(next_type) &&
+ !btf_type_is_fwd(next_type) &&
+ !btf_type_is_func_proto(next_type)) {
+ btf_verifier_log_type(env, v->t, "Invalid type_id");
+ return -EINVAL;
+ }
+ }
+
+ env_stack_pop_resolved(env, next_type_id, 0);
+
+ return 0;
+}
+
+static int btf_var_resolve(struct btf_verifier_env *env,
+ const struct resolve_vertex *v)
+{
+ const struct btf_type *next_type;
+ const struct btf_type *t = v->t;
+ u32 next_type_id = t->type;
+ struct btf *btf = env->btf;
+
+ next_type = btf_type_by_id(btf, next_type_id);
+ if (!next_type || btf_type_is_resolve_source_only(next_type)) {
+ btf_verifier_log_type(env, v->t, "Invalid type_id");
+ return -EINVAL;
+ }
+
+ if (!env_type_is_resolve_sink(env, next_type) &&
+ !env_type_is_resolved(env, next_type_id))
+ return env_stack_push(env, next_type, next_type_id);
+
+ if (btf_type_is_modifier(next_type)) {
+ const struct btf_type *resolved_type;
+ u32 resolved_type_id;
+
+ resolved_type_id = next_type_id;
+ resolved_type = btf_type_id_resolve(btf, &resolved_type_id);
+
+ if (btf_type_is_ptr(resolved_type) &&
+ !env_type_is_resolve_sink(env, resolved_type) &&
+ !env_type_is_resolved(env, resolved_type_id))
+ return env_stack_push(env, resolved_type,
+ resolved_type_id);
+ }
+
+ /* We must resolve to something concrete at this point, no
+ * forward types or similar that would resolve to size of
+ * zero is allowed.
+ */
+ if (!btf_type_id_size(btf, &next_type_id, NULL)) {
+ btf_verifier_log_type(env, v->t, "Invalid type_id");
+ return -EINVAL;
+ }
+
+ env_stack_pop_resolved(env, next_type_id, 0);
+
+ return 0;
+}
+
+static int btf_ptr_resolve(struct btf_verifier_env *env,
+ const struct resolve_vertex *v)
+{
+ const struct btf_type *next_type;
+ const struct btf_type *t = v->t;
+ u32 next_type_id = t->type;
+ struct btf *btf = env->btf;
+
+ next_type = btf_type_by_id(btf, next_type_id);
+ if (!next_type || btf_type_is_resolve_source_only(next_type)) {
+ btf_verifier_log_type(env, v->t, "Invalid type_id");
+ return -EINVAL;
+ }
+
+ if (!env_type_is_resolve_sink(env, next_type) &&
+ !env_type_is_resolved(env, next_type_id))
+ return env_stack_push(env, next_type, next_type_id);
+
+ /* If the modifier was RESOLVED during RESOLVE_STRUCT_OR_ARRAY,
+ * the modifier may have stopped resolving when it was resolved
+ * to a ptr (last-resolved-ptr).
+ *
+ * We now need to continue from the last-resolved-ptr to
+ * ensure the last-resolved-ptr will not referring back to
+ * the current ptr (t).
+ */
+ if (btf_type_is_modifier(next_type)) {
+ const struct btf_type *resolved_type;
+ u32 resolved_type_id;
+
+ resolved_type_id = next_type_id;
+ resolved_type = btf_type_id_resolve(btf, &resolved_type_id);
+
+ if (btf_type_is_ptr(resolved_type) &&
+ !env_type_is_resolve_sink(env, resolved_type) &&
+ !env_type_is_resolved(env, resolved_type_id))
+ return env_stack_push(env, resolved_type,
+ resolved_type_id);
+ }
+
+ if (!btf_type_id_size(btf, &next_type_id, NULL)) {
+ if (env_type_is_resolved(env, next_type_id))
+ next_type = btf_type_id_resolve(btf, &next_type_id);
+
+ if (!btf_type_is_void(next_type) &&
+ !btf_type_is_fwd(next_type) &&
+ !btf_type_is_func_proto(next_type)) {
+ btf_verifier_log_type(env, v->t, "Invalid type_id");
+ return -EINVAL;
+ }
+ }
+
+ env_stack_pop_resolved(env, next_type_id, 0);
+
+ return 0;
+}
+
+static void btf_modifier_show(const struct btf *btf,
+ const struct btf_type *t,
+ u32 type_id, void *data,
+ u8 bits_offset, struct btf_show *show)
+{
+ if (btf->resolved_ids)
+ t = btf_type_id_resolve(btf, &type_id);
+ else
+ t = btf_type_skip_modifiers(btf, type_id, NULL);
+
+ btf_type_ops(t)->show(btf, t, type_id, data, bits_offset, show);
+}
+
+static void btf_var_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offset,
+ struct btf_show *show)
+{
+ t = btf_type_id_resolve(btf, &type_id);
+
+ btf_type_ops(t)->show(btf, t, type_id, data, bits_offset, show);
+}
+
+static void btf_ptr_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offset,
+ struct btf_show *show)
+{
+ void *safe_data;
+
+ safe_data = btf_show_start_type(show, t, type_id, data);
+ if (!safe_data)
+ return;
+
+ /* It is a hashed value unless BTF_SHOW_PTR_RAW is specified */
+ if (show->flags & BTF_SHOW_PTR_RAW)
+ btf_show_type_value(show, "0x%px", *(void **)safe_data);
+ else
+ btf_show_type_value(show, "0x%p", *(void **)safe_data);
+ btf_show_end_type(show);
+}
+
+static void btf_ref_type_log(struct btf_verifier_env *env,
+ const struct btf_type *t)
+{
+ btf_verifier_log(env, "type_id=%u", t->type);
+}
+
+static const struct btf_kind_operations modifier_ops = {
+ .check_meta = btf_ref_type_check_meta,
+ .resolve = btf_modifier_resolve,
+ .check_member = btf_modifier_check_member,
+ .check_kflag_member = btf_modifier_check_kflag_member,
+ .log_details = btf_ref_type_log,
+ .show = btf_modifier_show,
+};
+
+static const struct btf_kind_operations ptr_ops = {
+ .check_meta = btf_ref_type_check_meta,
+ .resolve = btf_ptr_resolve,
+ .check_member = btf_ptr_check_member,
+ .check_kflag_member = btf_generic_check_kflag_member,
+ .log_details = btf_ref_type_log,
+ .show = btf_ptr_show,
+};
+
+static s32 btf_fwd_check_meta(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left)
+{
+ if (btf_type_vlen(t)) {
+ btf_verifier_log_type(env, t, "vlen != 0");
+ return -EINVAL;
+ }
+
+ if (t->type) {
+ btf_verifier_log_type(env, t, "type != 0");
+ return -EINVAL;
+ }
+
+ /* fwd type must have a valid name */
+ if (!t->name_off ||
+ !btf_name_valid_identifier(env->btf, t->name_off)) {
+ btf_verifier_log_type(env, t, "Invalid name");
+ return -EINVAL;
+ }
+
+ btf_verifier_log_type(env, t, NULL);
+
+ return 0;
+}
+
+static void btf_fwd_type_log(struct btf_verifier_env *env,
+ const struct btf_type *t)
+{
+ btf_verifier_log(env, "%s", btf_type_kflag(t) ? "union" : "struct");
+}
+
+static const struct btf_kind_operations fwd_ops = {
+ .check_meta = btf_fwd_check_meta,
+ .resolve = btf_df_resolve,
+ .check_member = btf_df_check_member,
+ .check_kflag_member = btf_df_check_kflag_member,
+ .log_details = btf_fwd_type_log,
+ .show = btf_df_show,
+};
+
+static int btf_array_check_member(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const struct btf_type *member_type)
+{
+ u32 struct_bits_off = member->offset;
+ u32 struct_size, bytes_offset;
+ u32 array_type_id, array_size;
+ struct btf *btf = env->btf;
+
+ if (BITS_PER_BYTE_MASKED(struct_bits_off)) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Member is not byte aligned");
+ return -EINVAL;
+ }
+
+ array_type_id = member->type;
+ btf_type_id_size(btf, &array_type_id, &array_size);
+ struct_size = struct_type->size;
+ bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off);
+ if (struct_size - bytes_offset < array_size) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Member exceeds struct_size");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static s32 btf_array_check_meta(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left)
+{
+ const struct btf_array *array = btf_type_array(t);
+ u32 meta_needed = sizeof(*array);
+
+ if (meta_left < meta_needed) {
+ btf_verifier_log_basic(env, t,
+ "meta_left:%u meta_needed:%u",
+ meta_left, meta_needed);
+ return -EINVAL;
+ }
+
+ /* array type should not have a name */
+ if (t->name_off) {
+ btf_verifier_log_type(env, t, "Invalid name");
+ return -EINVAL;
+ }
+
+ if (btf_type_vlen(t)) {
+ btf_verifier_log_type(env, t, "vlen != 0");
+ return -EINVAL;
+ }
+
+ if (btf_type_kflag(t)) {
+ btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
+ return -EINVAL;
+ }
+
+ if (t->size) {
+ btf_verifier_log_type(env, t, "size != 0");
+ return -EINVAL;
+ }
+
+ /* Array elem type and index type cannot be in type void,
+ * so !array->type and !array->index_type are not allowed.
+ */
+ if (!array->type || !BTF_TYPE_ID_VALID(array->type)) {
+ btf_verifier_log_type(env, t, "Invalid elem");
+ return -EINVAL;
+ }
+
+ if (!array->index_type || !BTF_TYPE_ID_VALID(array->index_type)) {
+ btf_verifier_log_type(env, t, "Invalid index");
+ return -EINVAL;
+ }
+
+ btf_verifier_log_type(env, t, NULL);
+
+ return meta_needed;
+}
+
+static int btf_array_resolve(struct btf_verifier_env *env,
+ const struct resolve_vertex *v)
+{
+ const struct btf_array *array = btf_type_array(v->t);
+ const struct btf_type *elem_type, *index_type;
+ u32 elem_type_id, index_type_id;
+ struct btf *btf = env->btf;
+ u32 elem_size;
+
+ /* Check array->index_type */
+ index_type_id = array->index_type;
+ index_type = btf_type_by_id(btf, index_type_id);
+ if (btf_type_nosize_or_null(index_type) ||
+ btf_type_is_resolve_source_only(index_type)) {
+ btf_verifier_log_type(env, v->t, "Invalid index");
+ return -EINVAL;
+ }
+
+ if (!env_type_is_resolve_sink(env, index_type) &&
+ !env_type_is_resolved(env, index_type_id))
+ return env_stack_push(env, index_type, index_type_id);
+
+ index_type = btf_type_id_size(btf, &index_type_id, NULL);
+ if (!index_type || !btf_type_is_int(index_type) ||
+ !btf_type_int_is_regular(index_type)) {
+ btf_verifier_log_type(env, v->t, "Invalid index");
+ return -EINVAL;
+ }
+
+ /* Check array->type */
+ elem_type_id = array->type;
+ elem_type = btf_type_by_id(btf, elem_type_id);
+ if (btf_type_nosize_or_null(elem_type) ||
+ btf_type_is_resolve_source_only(elem_type)) {
+ btf_verifier_log_type(env, v->t,
+ "Invalid elem");
+ return -EINVAL;
+ }
+
+ if (!env_type_is_resolve_sink(env, elem_type) &&
+ !env_type_is_resolved(env, elem_type_id))
+ return env_stack_push(env, elem_type, elem_type_id);
+
+ elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size);
+ if (!elem_type) {
+ btf_verifier_log_type(env, v->t, "Invalid elem");
+ return -EINVAL;
+ }
+
+ if (btf_type_is_int(elem_type) && !btf_type_int_is_regular(elem_type)) {
+ btf_verifier_log_type(env, v->t, "Invalid array of int");
+ return -EINVAL;
+ }
+
+ if (array->nelems && elem_size > U32_MAX / array->nelems) {
+ btf_verifier_log_type(env, v->t,
+ "Array size overflows U32_MAX");
+ return -EINVAL;
+ }
+
+ env_stack_pop_resolved(env, elem_type_id, elem_size * array->nelems);
+
+ return 0;
+}
+
+static void btf_array_log(struct btf_verifier_env *env,
+ const struct btf_type *t)
+{
+ const struct btf_array *array = btf_type_array(t);
+
+ btf_verifier_log(env, "type_id=%u index_type_id=%u nr_elems=%u",
+ array->type, array->index_type, array->nelems);
+}
+
+static void __btf_array_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offset,
+ struct btf_show *show)
+{
+ const struct btf_array *array = btf_type_array(t);
+ const struct btf_kind_operations *elem_ops;
+ const struct btf_type *elem_type;
+ u32 i, elem_size = 0, elem_type_id;
+ u16 encoding = 0;
+
+ elem_type_id = array->type;
+ elem_type = btf_type_skip_modifiers(btf, elem_type_id, NULL);
+ if (elem_type && btf_type_has_size(elem_type))
+ elem_size = elem_type->size;
+
+ if (elem_type && btf_type_is_int(elem_type)) {
+ u32 int_type = btf_type_int(elem_type);
+
+ encoding = BTF_INT_ENCODING(int_type);
+
+ /*
+ * BTF_INT_CHAR encoding never seems to be set for
+ * char arrays, so if size is 1 and element is
+ * printable as a char, we'll do that.
+ */
+ if (elem_size == 1)
+ encoding = BTF_INT_CHAR;
+ }
+
+ if (!btf_show_start_array_type(show, t, type_id, encoding, data))
+ return;
+
+ if (!elem_type)
+ goto out;
+ elem_ops = btf_type_ops(elem_type);
+
+ for (i = 0; i < array->nelems; i++) {
+
+ btf_show_start_array_member(show);
+
+ elem_ops->show(btf, elem_type, elem_type_id, data,
+ bits_offset, show);
+ data += elem_size;
+
+ btf_show_end_array_member(show);
+
+ if (show->state.array_terminated)
+ break;
+ }
+out:
+ btf_show_end_array_type(show);
+}
+
+static void btf_array_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offset,
+ struct btf_show *show)
+{
+ const struct btf_member *m = show->state.member;
+
+ /*
+ * First check if any members would be shown (are non-zero).
+ * See comments above "struct btf_show" definition for more
+ * details on how this works at a high-level.
+ */
+ if (show->state.depth > 0 && !(show->flags & BTF_SHOW_ZERO)) {
+ if (!show->state.depth_check) {
+ show->state.depth_check = show->state.depth + 1;
+ show->state.depth_to_show = 0;
+ }
+ __btf_array_show(btf, t, type_id, data, bits_offset, show);
+ show->state.member = m;
+
+ if (show->state.depth_check != show->state.depth + 1)
+ return;
+ show->state.depth_check = 0;
+
+ if (show->state.depth_to_show <= show->state.depth)
+ return;
+ /*
+ * Reaching here indicates we have recursed and found
+ * non-zero array member(s).
+ */
+ }
+ __btf_array_show(btf, t, type_id, data, bits_offset, show);
+}
+
+static const struct btf_kind_operations array_ops = {
+ .check_meta = btf_array_check_meta,
+ .resolve = btf_array_resolve,
+ .check_member = btf_array_check_member,
+ .check_kflag_member = btf_generic_check_kflag_member,
+ .log_details = btf_array_log,
+ .show = btf_array_show,
+};
+
+static int btf_struct_check_member(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const struct btf_type *member_type)
+{
+ u32 struct_bits_off = member->offset;
+ u32 struct_size, bytes_offset;
+
+ if (BITS_PER_BYTE_MASKED(struct_bits_off)) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Member is not byte aligned");
+ return -EINVAL;
+ }
+
+ struct_size = struct_type->size;
+ bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off);
+ if (struct_size - bytes_offset < member_type->size) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Member exceeds struct_size");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static s32 btf_struct_check_meta(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left)
+{
+ bool is_union = BTF_INFO_KIND(t->info) == BTF_KIND_UNION;
+ const struct btf_member *member;
+ u32 meta_needed, last_offset;
+ struct btf *btf = env->btf;
+ u32 struct_size = t->size;
+ u32 offset;
+ u16 i;
+
+ meta_needed = btf_type_vlen(t) * sizeof(*member);
+ if (meta_left < meta_needed) {
+ btf_verifier_log_basic(env, t,
+ "meta_left:%u meta_needed:%u",
+ meta_left, meta_needed);
+ return -EINVAL;
+ }
+
+ /* struct type either no name or a valid one */
+ if (t->name_off &&
+ !btf_name_valid_identifier(env->btf, t->name_off)) {
+ btf_verifier_log_type(env, t, "Invalid name");
+ return -EINVAL;
+ }
+
+ btf_verifier_log_type(env, t, NULL);
+
+ last_offset = 0;
+ for_each_member(i, t, member) {
+ if (!btf_name_offset_valid(btf, member->name_off)) {
+ btf_verifier_log_member(env, t, member,
+ "Invalid member name_offset:%u",
+ member->name_off);
+ return -EINVAL;
+ }
+
+ /* struct member either no name or a valid one */
+ if (member->name_off &&
+ !btf_name_valid_identifier(btf, member->name_off)) {
+ btf_verifier_log_member(env, t, member, "Invalid name");
+ return -EINVAL;
+ }
+ /* A member cannot be in type void */
+ if (!member->type || !BTF_TYPE_ID_VALID(member->type)) {
+ btf_verifier_log_member(env, t, member,
+ "Invalid type_id");
+ return -EINVAL;
+ }
+
+ offset = __btf_member_bit_offset(t, member);
+ if (is_union && offset) {
+ btf_verifier_log_member(env, t, member,
+ "Invalid member bits_offset");
+ return -EINVAL;
+ }
+
+ /*
+ * ">" instead of ">=" because the last member could be
+ * "char a[0];"
+ */
+ if (last_offset > offset) {
+ btf_verifier_log_member(env, t, member,
+ "Invalid member bits_offset");
+ return -EINVAL;
+ }
+
+ if (BITS_ROUNDUP_BYTES(offset) > struct_size) {
+ btf_verifier_log_member(env, t, member,
+ "Member bits_offset exceeds its struct size");
+ return -EINVAL;
+ }
+
+ btf_verifier_log_member(env, t, member, NULL);
+ last_offset = offset;
+ }
+
+ return meta_needed;
+}
+
+static int btf_struct_resolve(struct btf_verifier_env *env,
+ const struct resolve_vertex *v)
+{
+ const struct btf_member *member;
+ int err;
+ u16 i;
+
+ /* Before continue resolving the next_member,
+ * ensure the last member is indeed resolved to a
+ * type with size info.
+ */
+ if (v->next_member) {
+ const struct btf_type *last_member_type;
+ const struct btf_member *last_member;
+ u32 last_member_type_id;
+
+ last_member = btf_type_member(v->t) + v->next_member - 1;
+ last_member_type_id = last_member->type;
+ if (WARN_ON_ONCE(!env_type_is_resolved(env,
+ last_member_type_id)))
+ return -EINVAL;
+
+ last_member_type = btf_type_by_id(env->btf,
+ last_member_type_id);
+ if (btf_type_kflag(v->t))
+ err = btf_type_ops(last_member_type)->check_kflag_member(env, v->t,
+ last_member,
+ last_member_type);
+ else
+ err = btf_type_ops(last_member_type)->check_member(env, v->t,
+ last_member,
+ last_member_type);
+ if (err)
+ return err;
+ }
+
+ for_each_member_from(i, v->next_member, v->t, member) {
+ u32 member_type_id = member->type;
+ const struct btf_type *member_type = btf_type_by_id(env->btf,
+ member_type_id);
+
+ if (btf_type_nosize_or_null(member_type) ||
+ btf_type_is_resolve_source_only(member_type)) {
+ btf_verifier_log_member(env, v->t, member,
+ "Invalid member");
+ return -EINVAL;
+ }
+
+ if (!env_type_is_resolve_sink(env, member_type) &&
+ !env_type_is_resolved(env, member_type_id)) {
+ env_stack_set_next_member(env, i + 1);
+ return env_stack_push(env, member_type, member_type_id);
+ }
+
+ if (btf_type_kflag(v->t))
+ err = btf_type_ops(member_type)->check_kflag_member(env, v->t,
+ member,
+ member_type);
+ else
+ err = btf_type_ops(member_type)->check_member(env, v->t,
+ member,
+ member_type);
+ if (err)
+ return err;
+ }
+
+ env_stack_pop_resolved(env, 0, 0);
+
+ return 0;
+}
+
+static void btf_struct_log(struct btf_verifier_env *env,
+ const struct btf_type *t)
+{
+ btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
+}
+
+enum {
+ BTF_FIELD_IGNORE = 0,
+ BTF_FIELD_FOUND = 1,
+};
+
+struct btf_field_info {
+ enum btf_field_type type;
+ u32 off;
+ union {
+ struct {
+ u32 type_id;
+ } kptr;
+ struct {
+ const char *node_name;
+ u32 value_btf_id;
+ } graph_root;
+ };
+};
+
+static int btf_find_struct(const struct btf *btf, const struct btf_type *t,
+ u32 off, int sz, enum btf_field_type field_type,
+ struct btf_field_info *info)
+{
+ if (!__btf_type_is_struct(t))
+ return BTF_FIELD_IGNORE;
+ if (t->size != sz)
+ return BTF_FIELD_IGNORE;
+ info->type = field_type;
+ info->off = off;
+ return BTF_FIELD_FOUND;
+}
+
+static int btf_find_kptr(const struct btf *btf, const struct btf_type *t,
+ u32 off, int sz, struct btf_field_info *info, u32 field_mask)
+{
+ enum btf_field_type type;
+ const char *tag_value;
+ bool is_type_tag;
+ u32 res_id;
+
+ /* Permit modifiers on the pointer itself */
+ if (btf_type_is_volatile(t))
+ t = btf_type_by_id(btf, t->type);
+ /* For PTR, sz is always == 8 */
+ if (!btf_type_is_ptr(t))
+ return BTF_FIELD_IGNORE;
+ t = btf_type_by_id(btf, t->type);
+ is_type_tag = btf_type_is_type_tag(t) && !btf_type_kflag(t);
+ if (!is_type_tag)
+ return BTF_FIELD_IGNORE;
+ /* Reject extra tags */
+ if (btf_type_is_type_tag(btf_type_by_id(btf, t->type)))
+ return -EINVAL;
+ tag_value = __btf_name_by_offset(btf, t->name_off);
+ if (!strcmp("kptr_untrusted", tag_value))
+ type = BPF_KPTR_UNREF;
+ else if (!strcmp("kptr", tag_value))
+ type = BPF_KPTR_REF;
+ else if (!strcmp("percpu_kptr", tag_value))
+ type = BPF_KPTR_PERCPU;
+ else if (!strcmp("uptr", tag_value))
+ type = BPF_UPTR;
+ else
+ return -EINVAL;
+
+ if (!(type & field_mask))
+ return BTF_FIELD_IGNORE;
+
+ /* Get the base type */
+ t = btf_type_skip_modifiers(btf, t->type, &res_id);
+ /* Only pointer to struct is allowed */
+ if (!__btf_type_is_struct(t))
+ return -EINVAL;
+
+ info->type = type;
+ info->off = off;
+ info->kptr.type_id = res_id;
+ return BTF_FIELD_FOUND;
+}
+
+int btf_find_next_decl_tag(const struct btf *btf, const struct btf_type *pt,
+ int comp_idx, const char *tag_key, int last_id)
+{
+ int len = strlen(tag_key);
+ int i, n;
+
+ for (i = last_id + 1, n = btf_nr_types(btf); i < n; i++) {
+ const struct btf_type *t = btf_type_by_id(btf, i);
+
+ if (!btf_type_is_decl_tag(t))
+ continue;
+ if (pt != btf_type_by_id(btf, t->type))
+ continue;
+ if (btf_type_decl_tag(t)->component_idx != comp_idx)
+ continue;
+ if (strncmp(__btf_name_by_offset(btf, t->name_off), tag_key, len))
+ continue;
+ return i;
+ }
+ return -ENOENT;
+}
+
+const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt,
+ int comp_idx, const char *tag_key)
+{
+ const char *value = NULL;
+ const struct btf_type *t;
+ int len, id;
+
+ id = btf_find_next_decl_tag(btf, pt, comp_idx, tag_key, 0);
+ if (id < 0)
+ return ERR_PTR(id);
+
+ t = btf_type_by_id(btf, id);
+ len = strlen(tag_key);
+ value = __btf_name_by_offset(btf, t->name_off) + len;
+
+ /* Prevent duplicate entries for same type */
+ id = btf_find_next_decl_tag(btf, pt, comp_idx, tag_key, id);
+ if (id >= 0)
+ return ERR_PTR(-EEXIST);
+
+ return value;
+}
+
+static int
+btf_find_graph_root(const struct btf *btf, const struct btf_type *pt,
+ const struct btf_type *t, int comp_idx, u32 off,
+ int sz, struct btf_field_info *info,
+ enum btf_field_type head_type)
+{
+ const char *node_field_name;
+ const char *value_type;
+ s32 id;
+
+ if (!__btf_type_is_struct(t))
+ return BTF_FIELD_IGNORE;
+ if (t->size != sz)
+ return BTF_FIELD_IGNORE;
+ value_type = btf_find_decl_tag_value(btf, pt, comp_idx, "contains:");
+ if (IS_ERR(value_type))
+ return -EINVAL;
+ node_field_name = strstr(value_type, ":");
+ if (!node_field_name)
+ return -EINVAL;
+ value_type = kstrndup(value_type, node_field_name - value_type,
+ GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
+ if (!value_type)
+ return -ENOMEM;
+ id = btf_find_by_name_kind(btf, value_type, BTF_KIND_STRUCT);
+ kfree(value_type);
+ if (id < 0)
+ return id;
+ node_field_name++;
+ if (str_is_empty(node_field_name))
+ return -EINVAL;
+ info->type = head_type;
+ info->off = off;
+ info->graph_root.value_btf_id = id;
+ info->graph_root.node_name = node_field_name;
+ return BTF_FIELD_FOUND;
+}
+
+static int btf_get_field_type(const struct btf *btf, const struct btf_type *var_type,
+ u32 field_mask, u32 *seen_mask, int *align, int *sz)
+{
+ const struct {
+ enum btf_field_type type;
+ const char *const name;
+ const bool is_unique;
+ } field_types[] = {
+ { BPF_SPIN_LOCK, "bpf_spin_lock", true },
+ { BPF_RES_SPIN_LOCK, "bpf_res_spin_lock", true },
+ { BPF_TIMER, "bpf_timer", true },
+ { BPF_WORKQUEUE, "bpf_wq", true },
+ { BPF_TASK_WORK, "bpf_task_work", true },
+ { BPF_LIST_HEAD, "bpf_list_head", false },
+ { BPF_LIST_NODE, "bpf_list_node", false },
+ { BPF_RB_ROOT, "bpf_rb_root", false },
+ { BPF_RB_NODE, "bpf_rb_node", false },
+ { BPF_REFCOUNT, "bpf_refcount", false },
+ };
+ int type = 0, i;
+ const char *name = __btf_name_by_offset(btf, var_type->name_off);
+ const char *field_type_name;
+ enum btf_field_type field_type;
+ bool is_unique;
+
+ for (i = 0; i < ARRAY_SIZE(field_types); ++i) {
+ field_type = field_types[i].type;
+ field_type_name = field_types[i].name;
+ is_unique = field_types[i].is_unique;
+ if (!(field_mask & field_type) || strcmp(name, field_type_name))
+ continue;
+ if (is_unique) {
+ if (*seen_mask & field_type)
+ return -E2BIG;
+ *seen_mask |= field_type;
+ }
+ type = field_type;
+ goto end;
+ }
+
+ /* Only return BPF_KPTR when all other types with matchable names fail */
+ if (field_mask & (BPF_KPTR | BPF_UPTR) && !__btf_type_is_struct(var_type)) {
+ type = BPF_KPTR_REF;
+ goto end;
+ }
+ return 0;
+end:
+ *sz = btf_field_type_size(type);
+ *align = btf_field_type_align(type);
+ return type;
+}
+
+/* Repeat a number of fields for a specified number of times.
+ *
+ * Copy the fields starting from the first field and repeat them for
+ * repeat_cnt times. The fields are repeated by adding the offset of each
+ * field with
+ * (i + 1) * elem_size
+ * where i is the repeat index and elem_size is the size of an element.
+ */
+static int btf_repeat_fields(struct btf_field_info *info, int info_cnt,
+ u32 field_cnt, u32 repeat_cnt, u32 elem_size)
+{
+ u32 i, j;
+ u32 cur;
+
+ /* Ensure not repeating fields that should not be repeated. */
+ for (i = 0; i < field_cnt; i++) {
+ switch (info[i].type) {
+ case BPF_KPTR_UNREF:
+ case BPF_KPTR_REF:
+ case BPF_KPTR_PERCPU:
+ case BPF_UPTR:
+ case BPF_LIST_HEAD:
+ case BPF_RB_ROOT:
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+
+ /* The type of struct size or variable size is u32,
+ * so the multiplication will not overflow.
+ */
+ if (field_cnt * (repeat_cnt + 1) > info_cnt)
+ return -E2BIG;
+
+ cur = field_cnt;
+ for (i = 0; i < repeat_cnt; i++) {
+ memcpy(&info[cur], &info[0], field_cnt * sizeof(info[0]));
+ for (j = 0; j < field_cnt; j++)
+ info[cur++].off += (i + 1) * elem_size;
+ }
+
+ return 0;
+}
+
+static int btf_find_struct_field(const struct btf *btf,
+ const struct btf_type *t, u32 field_mask,
+ struct btf_field_info *info, int info_cnt,
+ u32 level);
+
+/* Find special fields in the struct type of a field.
+ *
+ * This function is used to find fields of special types that is not a
+ * global variable or a direct field of a struct type. It also handles the
+ * repetition if it is the element type of an array.
+ */
+static int btf_find_nested_struct(const struct btf *btf, const struct btf_type *t,
+ u32 off, u32 nelems,
+ u32 field_mask, struct btf_field_info *info,
+ int info_cnt, u32 level)
+{
+ int ret, err, i;
+
+ level++;
+ if (level >= MAX_RESOLVE_DEPTH)
+ return -E2BIG;
+
+ ret = btf_find_struct_field(btf, t, field_mask, info, info_cnt, level);
+
+ if (ret <= 0)
+ return ret;
+
+ /* Shift the offsets of the nested struct fields to the offsets
+ * related to the container.
+ */
+ for (i = 0; i < ret; i++)
+ info[i].off += off;
+
+ if (nelems > 1) {
+ err = btf_repeat_fields(info, info_cnt, ret, nelems - 1, t->size);
+ if (err == 0)
+ ret *= nelems;
+ else
+ ret = err;
+ }
+
+ return ret;
+}
+
+static int btf_find_field_one(const struct btf *btf,
+ const struct btf_type *var,
+ const struct btf_type *var_type,
+ int var_idx,
+ u32 off, u32 expected_size,
+ u32 field_mask, u32 *seen_mask,
+ struct btf_field_info *info, int info_cnt,
+ u32 level)
+{
+ int ret, align, sz, field_type;
+ struct btf_field_info tmp;
+ const struct btf_array *array;
+ u32 i, nelems = 1;
+
+ /* Walk into array types to find the element type and the number of
+ * elements in the (flattened) array.
+ */
+ for (i = 0; i < MAX_RESOLVE_DEPTH && btf_type_is_array(var_type); i++) {
+ array = btf_array(var_type);
+ nelems *= array->nelems;
+ var_type = btf_type_by_id(btf, array->type);
+ }
+ if (i == MAX_RESOLVE_DEPTH)
+ return -E2BIG;
+ if (nelems == 0)
+ return 0;
+
+ field_type = btf_get_field_type(btf, var_type,
+ field_mask, seen_mask, &align, &sz);
+ /* Look into variables of struct types */
+ if (!field_type && __btf_type_is_struct(var_type)) {
+ sz = var_type->size;
+ if (expected_size && expected_size != sz * nelems)
+ return 0;
+ ret = btf_find_nested_struct(btf, var_type, off, nelems, field_mask,
+ &info[0], info_cnt, level);
+ return ret;
+ }
+
+ if (field_type == 0)
+ return 0;
+ if (field_type < 0)
+ return field_type;
+
+ if (expected_size && expected_size != sz * nelems)
+ return 0;
+ if (off % align)
+ return 0;
+
+ switch (field_type) {
+ case BPF_SPIN_LOCK:
+ case BPF_RES_SPIN_LOCK:
+ case BPF_TIMER:
+ case BPF_WORKQUEUE:
+ case BPF_LIST_NODE:
+ case BPF_RB_NODE:
+ case BPF_REFCOUNT:
+ case BPF_TASK_WORK:
+ ret = btf_find_struct(btf, var_type, off, sz, field_type,
+ info_cnt ? &info[0] : &tmp);
+ if (ret < 0)
+ return ret;
+ break;
+ case BPF_KPTR_UNREF:
+ case BPF_KPTR_REF:
+ case BPF_KPTR_PERCPU:
+ case BPF_UPTR:
+ ret = btf_find_kptr(btf, var_type, off, sz,
+ info_cnt ? &info[0] : &tmp, field_mask);
+ if (ret < 0)
+ return ret;
+ break;
+ case BPF_LIST_HEAD:
+ case BPF_RB_ROOT:
+ ret = btf_find_graph_root(btf, var, var_type,
+ var_idx, off, sz,
+ info_cnt ? &info[0] : &tmp,
+ field_type);
+ if (ret < 0)
+ return ret;
+ break;
+ default:
+ return -EFAULT;
+ }
+
+ if (ret == BTF_FIELD_IGNORE)
+ return 0;
+ if (!info_cnt)
+ return -E2BIG;
+ if (nelems > 1) {
+ ret = btf_repeat_fields(info, info_cnt, 1, nelems - 1, sz);
+ if (ret < 0)
+ return ret;
+ }
+ return nelems;
+}
+
+static int btf_find_struct_field(const struct btf *btf,
+ const struct btf_type *t, u32 field_mask,
+ struct btf_field_info *info, int info_cnt,
+ u32 level)
+{
+ int ret, idx = 0;
+ const struct btf_member *member;
+ u32 i, off, seen_mask = 0;
+
+ for_each_member(i, t, member) {
+ const struct btf_type *member_type = btf_type_by_id(btf,
+ member->type);
+
+ off = __btf_member_bit_offset(t, member);
+ if (off % 8)
+ /* valid C code cannot generate such BTF */
+ return -EINVAL;
+ off /= 8;
+
+ ret = btf_find_field_one(btf, t, member_type, i,
+ off, 0,
+ field_mask, &seen_mask,
+ &info[idx], info_cnt - idx, level);
+ if (ret < 0)
+ return ret;
+ idx += ret;
+ }
+ return idx;
+}
+
+static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t,
+ u32 field_mask, struct btf_field_info *info,
+ int info_cnt, u32 level)
+{
+ int ret, idx = 0;
+ const struct btf_var_secinfo *vsi;
+ u32 i, off, seen_mask = 0;
+
+ for_each_vsi(i, t, vsi) {
+ const struct btf_type *var = btf_type_by_id(btf, vsi->type);
+ const struct btf_type *var_type = btf_type_by_id(btf, var->type);
+
+ off = vsi->offset;
+ ret = btf_find_field_one(btf, var, var_type, -1, off, vsi->size,
+ field_mask, &seen_mask,
+ &info[idx], info_cnt - idx,
+ level);
+ if (ret < 0)
+ return ret;
+ idx += ret;
+ }
+ return idx;
+}
+
+static int btf_find_field(const struct btf *btf, const struct btf_type *t,
+ u32 field_mask, struct btf_field_info *info,
+ int info_cnt)
+{
+ if (__btf_type_is_struct(t))
+ return btf_find_struct_field(btf, t, field_mask, info, info_cnt, 0);
+ else if (btf_type_is_datasec(t))
+ return btf_find_datasec_var(btf, t, field_mask, info, info_cnt, 0);
+ return -EINVAL;
+}
+
+/* Callers have to ensure the life cycle of btf if it is program BTF */
+static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
+ struct btf_field_info *info)
+{
+ struct module *mod = NULL;
+ const struct btf_type *t;
+ /* If a matching btf type is found in kernel or module BTFs, kptr_ref
+ * is that BTF, otherwise it's program BTF
+ */
+ struct btf *kptr_btf;
+ int ret;
+ s32 id;
+
+ /* Find type in map BTF, and use it to look up the matching type
+ * in vmlinux or module BTFs, by name and kind.
+ */
+ t = btf_type_by_id(btf, info->kptr.type_id);
+ id = bpf_find_btf_id(__btf_name_by_offset(btf, t->name_off), BTF_INFO_KIND(t->info),
+ &kptr_btf);
+ if (id == -ENOENT) {
+ /* btf_parse_kptr should only be called w/ btf = program BTF */
+ WARN_ON_ONCE(btf_is_kernel(btf));
+
+ /* Type exists only in program BTF. Assume that it's a MEM_ALLOC
+ * kptr allocated via bpf_obj_new
+ */
+ field->kptr.dtor = NULL;
+ id = info->kptr.type_id;
+ kptr_btf = (struct btf *)btf;
+ goto found_dtor;
+ }
+ if (id < 0)
+ return id;
+
+ /* Find and stash the function pointer for the destruction function that
+ * needs to be eventually invoked from the map free path.
+ */
+ if (info->type == BPF_KPTR_REF) {
+ const struct btf_type *dtor_func;
+ const char *dtor_func_name;
+ unsigned long addr;
+ s32 dtor_btf_id;
+
+ /* This call also serves as a whitelist of allowed objects that
+ * can be used as a referenced pointer and be stored in a map at
+ * the same time.
+ */
+ dtor_btf_id = btf_find_dtor_kfunc(kptr_btf, id);
+ if (dtor_btf_id < 0) {
+ ret = dtor_btf_id;
+ goto end_btf;
+ }
+
+ dtor_func = btf_type_by_id(kptr_btf, dtor_btf_id);
+ if (!dtor_func) {
+ ret = -ENOENT;
+ goto end_btf;
+ }
+
+ if (btf_is_module(kptr_btf)) {
+ mod = btf_try_get_module(kptr_btf);
+ if (!mod) {
+ ret = -ENXIO;
+ goto end_btf;
+ }
+ }
+
+ /* We already verified dtor_func to be btf_type_is_func
+ * in register_btf_id_dtor_kfuncs.
+ */
+ dtor_func_name = __btf_name_by_offset(kptr_btf, dtor_func->name_off);
+ addr = kallsyms_lookup_name(dtor_func_name);
+ if (!addr) {
+ ret = -EINVAL;
+ goto end_mod;
+ }
+ field->kptr.dtor = (void *)addr;
+ }
+
+found_dtor:
+ field->kptr.btf_id = id;
+ field->kptr.btf = kptr_btf;
+ field->kptr.module = mod;
+ return 0;
+end_mod:
+ module_put(mod);
+end_btf:
+ btf_put(kptr_btf);
+ return ret;
+}
+
+static int btf_parse_graph_root(const struct btf *btf,
+ struct btf_field *field,
+ struct btf_field_info *info,
+ const char *node_type_name,
+ size_t node_type_align)
+{
+ const struct btf_type *t, *n = NULL;
+ const struct btf_member *member;
+ u32 offset;
+ int i;
+
+ t = btf_type_by_id(btf, info->graph_root.value_btf_id);
+ /* We've already checked that value_btf_id is a struct type. We
+ * just need to figure out the offset of the list_node, and
+ * verify its type.
+ */
+ for_each_member(i, t, member) {
+ if (strcmp(info->graph_root.node_name,
+ __btf_name_by_offset(btf, member->name_off)))
+ continue;
+ /* Invalid BTF, two members with same name */
+ if (n)
+ return -EINVAL;
+ n = btf_type_by_id(btf, member->type);
+ if (!__btf_type_is_struct(n))
+ return -EINVAL;
+ if (strcmp(node_type_name, __btf_name_by_offset(btf, n->name_off)))
+ return -EINVAL;
+ offset = __btf_member_bit_offset(n, member);
+ if (offset % 8)
+ return -EINVAL;
+ offset /= 8;
+ if (offset % node_type_align)
+ return -EINVAL;
+
+ field->graph_root.btf = (struct btf *)btf;
+ field->graph_root.value_btf_id = info->graph_root.value_btf_id;
+ field->graph_root.node_offset = offset;
+ }
+ if (!n)
+ return -ENOENT;
+ return 0;
+}
+
+static int btf_parse_list_head(const struct btf *btf, struct btf_field *field,
+ struct btf_field_info *info)
+{
+ return btf_parse_graph_root(btf, field, info, "bpf_list_node",
+ __alignof__(struct bpf_list_node));
+}
+
+static int btf_parse_rb_root(const struct btf *btf, struct btf_field *field,
+ struct btf_field_info *info)
+{
+ return btf_parse_graph_root(btf, field, info, "bpf_rb_node",
+ __alignof__(struct bpf_rb_node));
+}
+
+static int btf_field_cmp(const void *_a, const void *_b, const void *priv)
+{
+ const struct btf_field *a = (const struct btf_field *)_a;
+ const struct btf_field *b = (const struct btf_field *)_b;
+
+ if (a->offset < b->offset)
+ return -1;
+ else if (a->offset > b->offset)
+ return 1;
+ return 0;
+}
+
+struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t,
+ u32 field_mask, u32 value_size)
+{
+ struct btf_field_info info_arr[BTF_FIELDS_MAX];
+ u32 next_off = 0, field_type_size;
+ struct btf_record *rec;
+ int ret, i, cnt;
+
+ ret = btf_find_field(btf, t, field_mask, info_arr, ARRAY_SIZE(info_arr));
+ if (ret < 0)
+ return ERR_PTR(ret);
+ if (!ret)
+ return NULL;
+
+ cnt = ret;
+ /* This needs to be kzalloc to zero out padding and unused fields, see
+ * comment in btf_record_equal.
+ */
+ rec = kzalloc(struct_size(rec, fields, cnt), GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
+ if (!rec)
+ return ERR_PTR(-ENOMEM);
+
+ rec->spin_lock_off = -EINVAL;
+ rec->res_spin_lock_off = -EINVAL;
+ rec->timer_off = -EINVAL;
+ rec->wq_off = -EINVAL;
+ rec->refcount_off = -EINVAL;
+ rec->task_work_off = -EINVAL;
+ for (i = 0; i < cnt; i++) {
+ field_type_size = btf_field_type_size(info_arr[i].type);
+ if (info_arr[i].off + field_type_size > value_size) {
+ WARN_ONCE(1, "verifier bug off %d size %d", info_arr[i].off, value_size);
+ ret = -EFAULT;
+ goto end;
+ }
+ if (info_arr[i].off < next_off) {
+ ret = -EEXIST;
+ goto end;
+ }
+ next_off = info_arr[i].off + field_type_size;
+
+ rec->field_mask |= info_arr[i].type;
+ rec->fields[i].offset = info_arr[i].off;
+ rec->fields[i].type = info_arr[i].type;
+ rec->fields[i].size = field_type_size;
+
+ switch (info_arr[i].type) {
+ case BPF_SPIN_LOCK:
+ WARN_ON_ONCE(rec->spin_lock_off >= 0);
+ /* Cache offset for faster lookup at runtime */
+ rec->spin_lock_off = rec->fields[i].offset;
+ break;
+ case BPF_RES_SPIN_LOCK:
+ WARN_ON_ONCE(rec->spin_lock_off >= 0);
+ /* Cache offset for faster lookup at runtime */
+ rec->res_spin_lock_off = rec->fields[i].offset;
+ break;
+ case BPF_TIMER:
+ WARN_ON_ONCE(rec->timer_off >= 0);
+ /* Cache offset for faster lookup at runtime */
+ rec->timer_off = rec->fields[i].offset;
+ break;
+ case BPF_WORKQUEUE:
+ WARN_ON_ONCE(rec->wq_off >= 0);
+ /* Cache offset for faster lookup at runtime */
+ rec->wq_off = rec->fields[i].offset;
+ break;
+ case BPF_TASK_WORK:
+ WARN_ON_ONCE(rec->task_work_off >= 0);
+ rec->task_work_off = rec->fields[i].offset;
+ break;
+ case BPF_REFCOUNT:
+ WARN_ON_ONCE(rec->refcount_off >= 0);
+ /* Cache offset for faster lookup at runtime */
+ rec->refcount_off = rec->fields[i].offset;
+ break;
+ case BPF_KPTR_UNREF:
+ case BPF_KPTR_REF:
+ case BPF_KPTR_PERCPU:
+ case BPF_UPTR:
+ ret = btf_parse_kptr(btf, &rec->fields[i], &info_arr[i]);
+ if (ret < 0)
+ goto end;
+ break;
+ case BPF_LIST_HEAD:
+ ret = btf_parse_list_head(btf, &rec->fields[i], &info_arr[i]);
+ if (ret < 0)
+ goto end;
+ break;
+ case BPF_RB_ROOT:
+ ret = btf_parse_rb_root(btf, &rec->fields[i], &info_arr[i]);
+ if (ret < 0)
+ goto end;
+ break;
+ case BPF_LIST_NODE:
+ case BPF_RB_NODE:
+ break;
+ default:
+ ret = -EFAULT;
+ goto end;
+ }
+ rec->cnt++;
+ }
+
+ if (rec->spin_lock_off >= 0 && rec->res_spin_lock_off >= 0) {
+ ret = -EINVAL;
+ goto end;
+ }
+
+ /* bpf_{list_head, rb_node} require bpf_spin_lock */
+ if ((btf_record_has_field(rec, BPF_LIST_HEAD) ||
+ btf_record_has_field(rec, BPF_RB_ROOT)) &&
+ (rec->spin_lock_off < 0 && rec->res_spin_lock_off < 0)) {
+ ret = -EINVAL;
+ goto end;
+ }
+
+ if (rec->refcount_off < 0 &&
+ btf_record_has_field(rec, BPF_LIST_NODE) &&
+ btf_record_has_field(rec, BPF_RB_NODE)) {
+ ret = -EINVAL;
+ goto end;
+ }
+
+ sort_r(rec->fields, rec->cnt, sizeof(struct btf_field), btf_field_cmp,
+ NULL, rec);
+
+ return rec;
+end:
+ btf_record_free(rec);
+ return ERR_PTR(ret);
+}
+
+int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec)
+{
+ int i;
+
+ /* There are three types that signify ownership of some other type:
+ * kptr_ref, bpf_list_head, bpf_rb_root.
+ * kptr_ref only supports storing kernel types, which can't store
+ * references to program allocated local types.
+ *
+ * Hence we only need to ensure that bpf_{list_head,rb_root} ownership
+ * does not form cycles.
+ */
+ if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & (BPF_GRAPH_ROOT | BPF_UPTR)))
+ return 0;
+ for (i = 0; i < rec->cnt; i++) {
+ struct btf_struct_meta *meta;
+ const struct btf_type *t;
+ u32 btf_id;
+
+ if (rec->fields[i].type == BPF_UPTR) {
+ /* The uptr only supports pinning one page and cannot
+ * point to a kernel struct
+ */
+ if (btf_is_kernel(rec->fields[i].kptr.btf))
+ return -EINVAL;
+ t = btf_type_by_id(rec->fields[i].kptr.btf,
+ rec->fields[i].kptr.btf_id);
+ if (!t->size)
+ return -EINVAL;
+ if (t->size > PAGE_SIZE)
+ return -E2BIG;
+ continue;
+ }
+
+ if (!(rec->fields[i].type & BPF_GRAPH_ROOT))
+ continue;
+ btf_id = rec->fields[i].graph_root.value_btf_id;
+ meta = btf_find_struct_meta(btf, btf_id);
+ if (!meta)
+ return -EFAULT;
+ rec->fields[i].graph_root.value_rec = meta->record;
+
+ /* We need to set value_rec for all root types, but no need
+ * to check ownership cycle for a type unless it's also a
+ * node type.
+ */
+ if (!(rec->field_mask & BPF_GRAPH_NODE))
+ continue;
+
+ /* We need to ensure ownership acyclicity among all types. The
+ * proper way to do it would be to topologically sort all BTF
+ * IDs based on the ownership edges, since there can be multiple
+ * bpf_{list_head,rb_node} in a type. Instead, we use the
+ * following resaoning:
+ *
+ * - A type can only be owned by another type in user BTF if it
+ * has a bpf_{list,rb}_node. Let's call these node types.
+ * - A type can only _own_ another type in user BTF if it has a
+ * bpf_{list_head,rb_root}. Let's call these root types.
+ *
+ * We ensure that if a type is both a root and node, its
+ * element types cannot be root types.
+ *
+ * To ensure acyclicity:
+ *
+ * When A is an root type but not a node, its ownership
+ * chain can be:
+ * A -> B -> C
+ * Where:
+ * - A is an root, e.g. has bpf_rb_root.
+ * - B is both a root and node, e.g. has bpf_rb_node and
+ * bpf_list_head.
+ * - C is only an root, e.g. has bpf_list_node
+ *
+ * When A is both a root and node, some other type already
+ * owns it in the BTF domain, hence it can not own
+ * another root type through any of the ownership edges.
+ * A -> B
+ * Where:
+ * - A is both an root and node.
+ * - B is only an node.
+ */
+ if (meta->record->field_mask & BPF_GRAPH_ROOT)
+ return -ELOOP;
+ }
+ return 0;
+}
+
+static void __btf_struct_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offset,
+ struct btf_show *show)
+{
+ const struct btf_member *member;
+ void *safe_data;
+ u32 i;
+
+ safe_data = btf_show_start_struct_type(show, t, type_id, data);
+ if (!safe_data)
+ return;
+
+ for_each_member(i, t, member) {
+ const struct btf_type *member_type = btf_type_by_id(btf,
+ member->type);
+ const struct btf_kind_operations *ops;
+ u32 member_offset, bitfield_size;
+ u32 bytes_offset;
+ u8 bits8_offset;
+
+ btf_show_start_member(show, member);
+
+ member_offset = __btf_member_bit_offset(t, member);
+ bitfield_size = __btf_member_bitfield_size(t, member);
+ bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset);
+ bits8_offset = BITS_PER_BYTE_MASKED(member_offset);
+ if (bitfield_size) {
+ safe_data = btf_show_start_type(show, member_type,
+ member->type,
+ data + bytes_offset);
+ if (safe_data)
+ btf_bitfield_show(safe_data,
+ bits8_offset,
+ bitfield_size, show);
+ btf_show_end_type(show);
+ } else {
+ ops = btf_type_ops(member_type);
+ ops->show(btf, member_type, member->type,
+ data + bytes_offset, bits8_offset, show);
+ }
+
+ btf_show_end_member(show);
+ }
+
+ btf_show_end_struct_type(show);
+}
+
+static void btf_struct_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offset,
+ struct btf_show *show)
+{
+ const struct btf_member *m = show->state.member;
+
+ /*
+ * First check if any members would be shown (are non-zero).
+ * See comments above "struct btf_show" definition for more
+ * details on how this works at a high-level.
+ */
+ if (show->state.depth > 0 && !(show->flags & BTF_SHOW_ZERO)) {
+ if (!show->state.depth_check) {
+ show->state.depth_check = show->state.depth + 1;
+ show->state.depth_to_show = 0;
+ }
+ __btf_struct_show(btf, t, type_id, data, bits_offset, show);
+ /* Restore saved member data here */
+ show->state.member = m;
+ if (show->state.depth_check != show->state.depth + 1)
+ return;
+ show->state.depth_check = 0;
+
+ if (show->state.depth_to_show <= show->state.depth)
+ return;
+ /*
+ * Reaching here indicates we have recursed and found
+ * non-zero child values.
+ */
+ }
+
+ __btf_struct_show(btf, t, type_id, data, bits_offset, show);
+}
+
+static const struct btf_kind_operations struct_ops = {
+ .check_meta = btf_struct_check_meta,
+ .resolve = btf_struct_resolve,
+ .check_member = btf_struct_check_member,
+ .check_kflag_member = btf_generic_check_kflag_member,
+ .log_details = btf_struct_log,
+ .show = btf_struct_show,
+};
+
+static int btf_enum_check_member(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const struct btf_type *member_type)
+{
+ u32 struct_bits_off = member->offset;
+ u32 struct_size, bytes_offset;
+
+ if (BITS_PER_BYTE_MASKED(struct_bits_off)) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Member is not byte aligned");
+ return -EINVAL;
+ }
+
+ struct_size = struct_type->size;
+ bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off);
+ if (struct_size - bytes_offset < member_type->size) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Member exceeds struct_size");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int btf_enum_check_kflag_member(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const struct btf_type *member_type)
+{
+ u32 struct_bits_off, nr_bits, bytes_end, struct_size;
+ u32 int_bitsize = sizeof(int) * BITS_PER_BYTE;
+
+ struct_bits_off = BTF_MEMBER_BIT_OFFSET(member->offset);
+ nr_bits = BTF_MEMBER_BITFIELD_SIZE(member->offset);
+ if (!nr_bits) {
+ if (BITS_PER_BYTE_MASKED(struct_bits_off)) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Member is not byte aligned");
+ return -EINVAL;
+ }
+
+ nr_bits = int_bitsize;
+ } else if (nr_bits > int_bitsize) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Invalid member bitfield_size");
+ return -EINVAL;
+ }
+
+ struct_size = struct_type->size;
+ bytes_end = BITS_ROUNDUP_BYTES(struct_bits_off + nr_bits);
+ if (struct_size < bytes_end) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Member exceeds struct_size");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static s32 btf_enum_check_meta(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left)
+{
+ const struct btf_enum *enums = btf_type_enum(t);
+ struct btf *btf = env->btf;
+ const char *fmt_str;
+ u16 i, nr_enums;
+ u32 meta_needed;
+
+ nr_enums = btf_type_vlen(t);
+ meta_needed = nr_enums * sizeof(*enums);
+
+ if (meta_left < meta_needed) {
+ btf_verifier_log_basic(env, t,
+ "meta_left:%u meta_needed:%u",
+ meta_left, meta_needed);
+ return -EINVAL;
+ }
+
+ if (t->size > 8 || !is_power_of_2(t->size)) {
+ btf_verifier_log_type(env, t, "Unexpected size");
+ return -EINVAL;
+ }
+
+ /* enum type either no name or a valid one */
+ if (t->name_off &&
+ !btf_name_valid_identifier(env->btf, t->name_off)) {
+ btf_verifier_log_type(env, t, "Invalid name");
+ return -EINVAL;
+ }
+
+ btf_verifier_log_type(env, t, NULL);
+
+ for (i = 0; i < nr_enums; i++) {
+ if (!btf_name_offset_valid(btf, enums[i].name_off)) {
+ btf_verifier_log(env, "\tInvalid name_offset:%u",
+ enums[i].name_off);
+ return -EINVAL;
+ }
+
+ /* enum member must have a valid name */
+ if (!enums[i].name_off ||
+ !btf_name_valid_identifier(btf, enums[i].name_off)) {
+ btf_verifier_log_type(env, t, "Invalid name");
+ return -EINVAL;
+ }
+
+ if (env->log.level == BPF_LOG_KERNEL)
+ continue;
+ fmt_str = btf_type_kflag(t) ? "\t%s val=%d\n" : "\t%s val=%u\n";
+ btf_verifier_log(env, fmt_str,
+ __btf_name_by_offset(btf, enums[i].name_off),
+ enums[i].val);
+ }
+
+ return meta_needed;
+}
+
+static void btf_enum_log(struct btf_verifier_env *env,
+ const struct btf_type *t)
+{
+ btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
+}
+
+static void btf_enum_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offset,
+ struct btf_show *show)
+{
+ const struct btf_enum *enums = btf_type_enum(t);
+ u32 i, nr_enums = btf_type_vlen(t);
+ void *safe_data;
+ int v;
+
+ safe_data = btf_show_start_type(show, t, type_id, data);
+ if (!safe_data)
+ return;
+
+ v = *(int *)safe_data;
+
+ for (i = 0; i < nr_enums; i++) {
+ if (v != enums[i].val)
+ continue;
+
+ btf_show_type_value(show, "%s",
+ __btf_name_by_offset(btf,
+ enums[i].name_off));
+
+ btf_show_end_type(show);
+ return;
+ }
+
+ if (btf_type_kflag(t))
+ btf_show_type_value(show, "%d", v);
+ else
+ btf_show_type_value(show, "%u", v);
+ btf_show_end_type(show);
+}
+
+static const struct btf_kind_operations enum_ops = {
+ .check_meta = btf_enum_check_meta,
+ .resolve = btf_df_resolve,
+ .check_member = btf_enum_check_member,
+ .check_kflag_member = btf_enum_check_kflag_member,
+ .log_details = btf_enum_log,
+ .show = btf_enum_show,
+};
+
+static s32 btf_enum64_check_meta(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left)
+{
+ const struct btf_enum64 *enums = btf_type_enum64(t);
+ struct btf *btf = env->btf;
+ const char *fmt_str;
+ u16 i, nr_enums;
+ u32 meta_needed;
+
+ nr_enums = btf_type_vlen(t);
+ meta_needed = nr_enums * sizeof(*enums);
+
+ if (meta_left < meta_needed) {
+ btf_verifier_log_basic(env, t,
+ "meta_left:%u meta_needed:%u",
+ meta_left, meta_needed);
+ return -EINVAL;
+ }
+
+ if (t->size > 8 || !is_power_of_2(t->size)) {
+ btf_verifier_log_type(env, t, "Unexpected size");
+ return -EINVAL;
+ }
+
+ /* enum type either no name or a valid one */
+ if (t->name_off &&
+ !btf_name_valid_identifier(env->btf, t->name_off)) {
+ btf_verifier_log_type(env, t, "Invalid name");
+ return -EINVAL;
+ }
+
+ btf_verifier_log_type(env, t, NULL);
+
+ for (i = 0; i < nr_enums; i++) {
+ if (!btf_name_offset_valid(btf, enums[i].name_off)) {
+ btf_verifier_log(env, "\tInvalid name_offset:%u",
+ enums[i].name_off);
+ return -EINVAL;
+ }
+
+ /* enum member must have a valid name */
+ if (!enums[i].name_off ||
+ !btf_name_valid_identifier(btf, enums[i].name_off)) {
+ btf_verifier_log_type(env, t, "Invalid name");
+ return -EINVAL;
+ }
+
+ if (env->log.level == BPF_LOG_KERNEL)
+ continue;
+
+ fmt_str = btf_type_kflag(t) ? "\t%s val=%lld\n" : "\t%s val=%llu\n";
+ btf_verifier_log(env, fmt_str,
+ __btf_name_by_offset(btf, enums[i].name_off),
+ btf_enum64_value(enums + i));
+ }
+
+ return meta_needed;
+}
+
+static void btf_enum64_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offset,
+ struct btf_show *show)
+{
+ const struct btf_enum64 *enums = btf_type_enum64(t);
+ u32 i, nr_enums = btf_type_vlen(t);
+ void *safe_data;
+ s64 v;
+
+ safe_data = btf_show_start_type(show, t, type_id, data);
+ if (!safe_data)
+ return;
+
+ v = *(u64 *)safe_data;
+
+ for (i = 0; i < nr_enums; i++) {
+ if (v != btf_enum64_value(enums + i))
+ continue;
+
+ btf_show_type_value(show, "%s",
+ __btf_name_by_offset(btf,
+ enums[i].name_off));
+
+ btf_show_end_type(show);
+ return;
+ }
+
+ if (btf_type_kflag(t))
+ btf_show_type_value(show, "%lld", v);
+ else
+ btf_show_type_value(show, "%llu", v);
+ btf_show_end_type(show);
+}
+
+static const struct btf_kind_operations enum64_ops = {
+ .check_meta = btf_enum64_check_meta,
+ .resolve = btf_df_resolve,
+ .check_member = btf_enum_check_member,
+ .check_kflag_member = btf_enum_check_kflag_member,
+ .log_details = btf_enum_log,
+ .show = btf_enum64_show,
+};
+
+static s32 btf_func_proto_check_meta(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left)
+{
+ u32 meta_needed = btf_type_vlen(t) * sizeof(struct btf_param);
+
+ if (meta_left < meta_needed) {
+ btf_verifier_log_basic(env, t,
+ "meta_left:%u meta_needed:%u",
+ meta_left, meta_needed);
+ return -EINVAL;
+ }
+
+ if (t->name_off) {
+ btf_verifier_log_type(env, t, "Invalid name");
+ return -EINVAL;
+ }
+
+ if (btf_type_kflag(t)) {
+ btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
+ return -EINVAL;
+ }
+
+ btf_verifier_log_type(env, t, NULL);
+
+ return meta_needed;
+}
+
+static void btf_func_proto_log(struct btf_verifier_env *env,
+ const struct btf_type *t)
+{
+ const struct btf_param *args = (const struct btf_param *)(t + 1);
+ u16 nr_args = btf_type_vlen(t), i;
+
+ btf_verifier_log(env, "return=%u args=(", t->type);
+ if (!nr_args) {
+ btf_verifier_log(env, "void");
+ goto done;
+ }
+
+ if (nr_args == 1 && !args[0].type) {
+ /* Only one vararg */
+ btf_verifier_log(env, "vararg");
+ goto done;
+ }
+
+ btf_verifier_log(env, "%u %s", args[0].type,
+ __btf_name_by_offset(env->btf,
+ args[0].name_off));
+ for (i = 1; i < nr_args - 1; i++)
+ btf_verifier_log(env, ", %u %s", args[i].type,
+ __btf_name_by_offset(env->btf,
+ args[i].name_off));
+
+ if (nr_args > 1) {
+ const struct btf_param *last_arg = &args[nr_args - 1];
+
+ if (last_arg->type)
+ btf_verifier_log(env, ", %u %s", last_arg->type,
+ __btf_name_by_offset(env->btf,
+ last_arg->name_off));
+ else
+ btf_verifier_log(env, ", vararg");
+ }
+
+done:
+ btf_verifier_log(env, ")");
+}
+
+static const struct btf_kind_operations func_proto_ops = {
+ .check_meta = btf_func_proto_check_meta,
+ .resolve = btf_df_resolve,
+ /*
+ * BTF_KIND_FUNC_PROTO cannot be directly referred by
+ * a struct's member.
+ *
+ * It should be a function pointer instead.
+ * (i.e. struct's member -> BTF_KIND_PTR -> BTF_KIND_FUNC_PROTO)
+ *
+ * Hence, there is no btf_func_check_member().
+ */
+ .check_member = btf_df_check_member,
+ .check_kflag_member = btf_df_check_kflag_member,
+ .log_details = btf_func_proto_log,
+ .show = btf_df_show,
+};
+
+static s32 btf_func_check_meta(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left)
+{
+ if (!t->name_off ||
+ !btf_name_valid_identifier(env->btf, t->name_off)) {
+ btf_verifier_log_type(env, t, "Invalid name");
+ return -EINVAL;
+ }
+
+ if (btf_type_vlen(t) > BTF_FUNC_GLOBAL) {
+ btf_verifier_log_type(env, t, "Invalid func linkage");
+ return -EINVAL;
+ }
+
+ if (btf_type_kflag(t)) {
+ btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
+ return -EINVAL;
+ }
+
+ btf_verifier_log_type(env, t, NULL);
+
+ return 0;
+}
+
+static int btf_func_resolve(struct btf_verifier_env *env,
+ const struct resolve_vertex *v)
+{
+ const struct btf_type *t = v->t;
+ u32 next_type_id = t->type;
+ int err;
+
+ err = btf_func_check(env, t);
+ if (err)
+ return err;
+
+ env_stack_pop_resolved(env, next_type_id, 0);
+ return 0;
+}
+
+static const struct btf_kind_operations func_ops = {
+ .check_meta = btf_func_check_meta,
+ .resolve = btf_func_resolve,
+ .check_member = btf_df_check_member,
+ .check_kflag_member = btf_df_check_kflag_member,
+ .log_details = btf_ref_type_log,
+ .show = btf_df_show,
+};
+
+static s32 btf_var_check_meta(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left)
+{
+ const struct btf_var *var;
+ u32 meta_needed = sizeof(*var);
+
+ if (meta_left < meta_needed) {
+ btf_verifier_log_basic(env, t,
+ "meta_left:%u meta_needed:%u",
+ meta_left, meta_needed);
+ return -EINVAL;
+ }
+
+ if (btf_type_vlen(t)) {
+ btf_verifier_log_type(env, t, "vlen != 0");
+ return -EINVAL;
+ }
+
+ if (btf_type_kflag(t)) {
+ btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
+ return -EINVAL;
+ }
+
+ if (!t->name_off ||
+ !btf_name_valid_identifier(env->btf, t->name_off)) {
+ btf_verifier_log_type(env, t, "Invalid name");
+ return -EINVAL;
+ }
+
+ /* A var cannot be in type void */
+ if (!t->type || !BTF_TYPE_ID_VALID(t->type)) {
+ btf_verifier_log_type(env, t, "Invalid type_id");
+ return -EINVAL;
+ }
+
+ var = btf_type_var(t);
+ if (var->linkage != BTF_VAR_STATIC &&
+ var->linkage != BTF_VAR_GLOBAL_ALLOCATED) {
+ btf_verifier_log_type(env, t, "Linkage not supported");
+ return -EINVAL;
+ }
+
+ btf_verifier_log_type(env, t, NULL);
+
+ return meta_needed;
+}
+
+static void btf_var_log(struct btf_verifier_env *env, const struct btf_type *t)
+{
+ const struct btf_var *var = btf_type_var(t);
+
+ btf_verifier_log(env, "type_id=%u linkage=%u", t->type, var->linkage);
+}
+
+static const struct btf_kind_operations var_ops = {
+ .check_meta = btf_var_check_meta,
+ .resolve = btf_var_resolve,
+ .check_member = btf_df_check_member,
+ .check_kflag_member = btf_df_check_kflag_member,
+ .log_details = btf_var_log,
+ .show = btf_var_show,
+};
+
+static s32 btf_datasec_check_meta(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left)
+{
+ const struct btf_var_secinfo *vsi;
+ u64 last_vsi_end_off = 0, sum = 0;
+ u32 i, meta_needed;
+
+ meta_needed = btf_type_vlen(t) * sizeof(*vsi);
+ if (meta_left < meta_needed) {
+ btf_verifier_log_basic(env, t,
+ "meta_left:%u meta_needed:%u",
+ meta_left, meta_needed);
+ return -EINVAL;
+ }
+
+ if (!t->size) {
+ btf_verifier_log_type(env, t, "size == 0");
+ return -EINVAL;
+ }
+
+ if (btf_type_kflag(t)) {
+ btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
+ return -EINVAL;
+ }
+
+ if (!t->name_off ||
+ !btf_name_valid_section(env->btf, t->name_off)) {
+ btf_verifier_log_type(env, t, "Invalid name");
+ return -EINVAL;
+ }
+
+ btf_verifier_log_type(env, t, NULL);
+
+ for_each_vsi(i, t, vsi) {
+ /* A var cannot be in type void */
+ if (!vsi->type || !BTF_TYPE_ID_VALID(vsi->type)) {
+ btf_verifier_log_vsi(env, t, vsi,
+ "Invalid type_id");
+ return -EINVAL;
+ }
+
+ if (vsi->offset < last_vsi_end_off || vsi->offset >= t->size) {
+ btf_verifier_log_vsi(env, t, vsi,
+ "Invalid offset");
+ return -EINVAL;
+ }
+
+ if (!vsi->size || vsi->size > t->size) {
+ btf_verifier_log_vsi(env, t, vsi,
+ "Invalid size");
+ return -EINVAL;
+ }
+
+ last_vsi_end_off = vsi->offset + vsi->size;
+ if (last_vsi_end_off > t->size) {
+ btf_verifier_log_vsi(env, t, vsi,
+ "Invalid offset+size");
+ return -EINVAL;
+ }
+
+ btf_verifier_log_vsi(env, t, vsi, NULL);
+ sum += vsi->size;
+ }
+
+ if (t->size < sum) {
+ btf_verifier_log_type(env, t, "Invalid btf_info size");
+ return -EINVAL;
+ }
+
+ return meta_needed;
+}
+
+static int btf_datasec_resolve(struct btf_verifier_env *env,
+ const struct resolve_vertex *v)
+{
+ const struct btf_var_secinfo *vsi;
+ struct btf *btf = env->btf;
+ u16 i;
+
+ env->resolve_mode = RESOLVE_TBD;
+ for_each_vsi_from(i, v->next_member, v->t, vsi) {
+ u32 var_type_id = vsi->type, type_id, type_size = 0;
+ const struct btf_type *var_type = btf_type_by_id(env->btf,
+ var_type_id);
+ if (!var_type || !btf_type_is_var(var_type)) {
+ btf_verifier_log_vsi(env, v->t, vsi,
+ "Not a VAR kind member");
+ return -EINVAL;
+ }
+
+ if (!env_type_is_resolve_sink(env, var_type) &&
+ !env_type_is_resolved(env, var_type_id)) {
+ env_stack_set_next_member(env, i + 1);
+ return env_stack_push(env, var_type, var_type_id);
+ }
+
+ type_id = var_type->type;
+ if (!btf_type_id_size(btf, &type_id, &type_size)) {
+ btf_verifier_log_vsi(env, v->t, vsi, "Invalid type");
+ return -EINVAL;
+ }
+
+ if (vsi->size < type_size) {
+ btf_verifier_log_vsi(env, v->t, vsi, "Invalid size");
+ return -EINVAL;
+ }
+ }
+
+ env_stack_pop_resolved(env, 0, 0);
+ return 0;
+}
+
+static void btf_datasec_log(struct btf_verifier_env *env,
+ const struct btf_type *t)
+{
+ btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
+}
+
+static void btf_datasec_show(const struct btf *btf,
+ const struct btf_type *t, u32 type_id,
+ void *data, u8 bits_offset,
+ struct btf_show *show)
+{
+ const struct btf_var_secinfo *vsi;
+ const struct btf_type *var;
+ u32 i;
+
+ if (!btf_show_start_type(show, t, type_id, data))
+ return;
+
+ btf_show_type_value(show, "section (\"%s\") = {",
+ __btf_name_by_offset(btf, t->name_off));
+ for_each_vsi(i, t, vsi) {
+ var = btf_type_by_id(btf, vsi->type);
+ if (i)
+ btf_show(show, ",");
+ btf_type_ops(var)->show(btf, var, vsi->type,
+ data + vsi->offset, bits_offset, show);
+ }
+ btf_show_end_type(show);
+}
+
+static const struct btf_kind_operations datasec_ops = {
+ .check_meta = btf_datasec_check_meta,
+ .resolve = btf_datasec_resolve,
+ .check_member = btf_df_check_member,
+ .check_kflag_member = btf_df_check_kflag_member,
+ .log_details = btf_datasec_log,
+ .show = btf_datasec_show,
+};
+
+static s32 btf_float_check_meta(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left)
+{
+ if (btf_type_vlen(t)) {
+ btf_verifier_log_type(env, t, "vlen != 0");
+ return -EINVAL;
+ }
+
+ if (btf_type_kflag(t)) {
+ btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
+ return -EINVAL;
+ }
+
+ if (t->size != 2 && t->size != 4 && t->size != 8 && t->size != 12 &&
+ t->size != 16) {
+ btf_verifier_log_type(env, t, "Invalid type_size");
+ return -EINVAL;
+ }
+
+ btf_verifier_log_type(env, t, NULL);
+
+ return 0;
+}
+
+static int btf_float_check_member(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const struct btf_type *member_type)
+{
+ u64 start_offset_bytes;
+ u64 end_offset_bytes;
+ u64 misalign_bits;
+ u64 align_bytes;
+ u64 align_bits;
+
+ /* Different architectures have different alignment requirements, so
+ * here we check only for the reasonable minimum. This way we ensure
+ * that types after CO-RE can pass the kernel BTF verifier.
+ */
+ align_bytes = min_t(u64, sizeof(void *), member_type->size);
+ align_bits = align_bytes * BITS_PER_BYTE;
+ div64_u64_rem(member->offset, align_bits, &misalign_bits);
+ if (misalign_bits) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Member is not properly aligned");
+ return -EINVAL;
+ }
+
+ start_offset_bytes = member->offset / BITS_PER_BYTE;
+ end_offset_bytes = start_offset_bytes + member_type->size;
+ if (end_offset_bytes > struct_type->size) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Member exceeds struct_size");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static void btf_float_log(struct btf_verifier_env *env,
+ const struct btf_type *t)
+{
+ btf_verifier_log(env, "size=%u", t->size);
+}
+
+static const struct btf_kind_operations float_ops = {
+ .check_meta = btf_float_check_meta,
+ .resolve = btf_df_resolve,
+ .check_member = btf_float_check_member,
+ .check_kflag_member = btf_generic_check_kflag_member,
+ .log_details = btf_float_log,
+ .show = btf_df_show,
+};
+
+static s32 btf_decl_tag_check_meta(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left)
+{
+ const struct btf_decl_tag *tag;
+ u32 meta_needed = sizeof(*tag);
+ s32 component_idx;
+ const char *value;
+
+ if (meta_left < meta_needed) {
+ btf_verifier_log_basic(env, t,
+ "meta_left:%u meta_needed:%u",
+ meta_left, meta_needed);
+ return -EINVAL;
+ }
+
+ value = btf_name_by_offset(env->btf, t->name_off);
+ if (!value || !value[0]) {
+ btf_verifier_log_type(env, t, "Invalid value");
+ return -EINVAL;
+ }
+
+ if (btf_type_vlen(t)) {
+ btf_verifier_log_type(env, t, "vlen != 0");
+ return -EINVAL;
+ }
+
+ component_idx = btf_type_decl_tag(t)->component_idx;
+ if (component_idx < -1) {
+ btf_verifier_log_type(env, t, "Invalid component_idx");
+ return -EINVAL;
+ }
+
+ btf_verifier_log_type(env, t, NULL);
+
+ return meta_needed;
+}
+
+static int btf_decl_tag_resolve(struct btf_verifier_env *env,
+ const struct resolve_vertex *v)
+{
+ const struct btf_type *next_type;
+ const struct btf_type *t = v->t;
+ u32 next_type_id = t->type;
+ struct btf *btf = env->btf;
+ s32 component_idx;
+ u32 vlen;
+
+ next_type = btf_type_by_id(btf, next_type_id);
+ if (!next_type || !btf_type_is_decl_tag_target(next_type)) {
+ btf_verifier_log_type(env, v->t, "Invalid type_id");
+ return -EINVAL;
+ }
+
+ if (!env_type_is_resolve_sink(env, next_type) &&
+ !env_type_is_resolved(env, next_type_id))
+ return env_stack_push(env, next_type, next_type_id);
+
+ component_idx = btf_type_decl_tag(t)->component_idx;
+ if (component_idx != -1) {
+ if (btf_type_is_var(next_type) || btf_type_is_typedef(next_type)) {
+ btf_verifier_log_type(env, v->t, "Invalid component_idx");
+ return -EINVAL;
+ }
+
+ if (btf_type_is_struct(next_type)) {
+ vlen = btf_type_vlen(next_type);
+ } else {
+ /* next_type should be a function */
+ next_type = btf_type_by_id(btf, next_type->type);
+ vlen = btf_type_vlen(next_type);
+ }
+
+ if ((u32)component_idx >= vlen) {
+ btf_verifier_log_type(env, v->t, "Invalid component_idx");
+ return -EINVAL;
+ }
+ }
+
+ env_stack_pop_resolved(env, next_type_id, 0);
+
+ return 0;
+}
+
+static void btf_decl_tag_log(struct btf_verifier_env *env, const struct btf_type *t)
+{
+ btf_verifier_log(env, "type=%u component_idx=%d", t->type,
+ btf_type_decl_tag(t)->component_idx);
+}
+
+static const struct btf_kind_operations decl_tag_ops = {
+ .check_meta = btf_decl_tag_check_meta,
+ .resolve = btf_decl_tag_resolve,
+ .check_member = btf_df_check_member,
+ .check_kflag_member = btf_df_check_kflag_member,
+ .log_details = btf_decl_tag_log,
+ .show = btf_df_show,
+};
+
+static int btf_func_proto_check(struct btf_verifier_env *env,
+ const struct btf_type *t)
+{
+ const struct btf_type *ret_type;
+ const struct btf_param *args;
+ const struct btf *btf;
+ u16 nr_args, i;
+ int err;
+
+ btf = env->btf;
+ args = (const struct btf_param *)(t + 1);
+ nr_args = btf_type_vlen(t);
+
+ /* Check func return type which could be "void" (t->type == 0) */
+ if (t->type) {
+ u32 ret_type_id = t->type;
+
+ ret_type = btf_type_by_id(btf, ret_type_id);
+ if (!ret_type) {
+ btf_verifier_log_type(env, t, "Invalid return type");
+ return -EINVAL;
+ }
+
+ if (btf_type_is_resolve_source_only(ret_type)) {
+ btf_verifier_log_type(env, t, "Invalid return type");
+ return -EINVAL;
+ }
+
+ if (btf_type_needs_resolve(ret_type) &&
+ !env_type_is_resolved(env, ret_type_id)) {
+ err = btf_resolve(env, ret_type, ret_type_id);
+ if (err)
+ return err;
+ }
+
+ /* Ensure the return type is a type that has a size */
+ if (!btf_type_id_size(btf, &ret_type_id, NULL)) {
+ btf_verifier_log_type(env, t, "Invalid return type");
+ return -EINVAL;
+ }
+ }
+
+ if (!nr_args)
+ return 0;
+
+ /* Last func arg type_id could be 0 if it is a vararg */
+ if (!args[nr_args - 1].type) {
+ if (args[nr_args - 1].name_off) {
+ btf_verifier_log_type(env, t, "Invalid arg#%u",
+ nr_args);
+ return -EINVAL;
+ }
+ nr_args--;
+ }
+
+ for (i = 0; i < nr_args; i++) {
+ const struct btf_type *arg_type;
+ u32 arg_type_id;
+
+ arg_type_id = args[i].type;
+ arg_type = btf_type_by_id(btf, arg_type_id);
+ if (!arg_type) {
+ btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1);
+ return -EINVAL;
+ }
+
+ if (btf_type_is_resolve_source_only(arg_type)) {
+ btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1);
+ return -EINVAL;
+ }
+
+ if (args[i].name_off &&
+ (!btf_name_offset_valid(btf, args[i].name_off) ||
+ !btf_name_valid_identifier(btf, args[i].name_off))) {
+ btf_verifier_log_type(env, t,
+ "Invalid arg#%u", i + 1);
+ return -EINVAL;
+ }
+
+ if (btf_type_needs_resolve(arg_type) &&
+ !env_type_is_resolved(env, arg_type_id)) {
+ err = btf_resolve(env, arg_type, arg_type_id);
+ if (err)
+ return err;
+ }
+
+ if (!btf_type_id_size(btf, &arg_type_id, NULL)) {
+ btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1);
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static int btf_func_check(struct btf_verifier_env *env,
+ const struct btf_type *t)
+{
+ const struct btf_type *proto_type;
+ const struct btf_param *args;
+ const struct btf *btf;
+ u16 nr_args, i;
+
+ btf = env->btf;
+ proto_type = btf_type_by_id(btf, t->type);
+
+ if (!proto_type || !btf_type_is_func_proto(proto_type)) {
+ btf_verifier_log_type(env, t, "Invalid type_id");
+ return -EINVAL;
+ }
+
+ args = (const struct btf_param *)(proto_type + 1);
+ nr_args = btf_type_vlen(proto_type);
+ for (i = 0; i < nr_args; i++) {
+ if (!args[i].name_off && args[i].type) {
+ btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1);
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = {
+ [BTF_KIND_INT] = &int_ops,
+ [BTF_KIND_PTR] = &ptr_ops,
+ [BTF_KIND_ARRAY] = &array_ops,
+ [BTF_KIND_STRUCT] = &struct_ops,
+ [BTF_KIND_UNION] = &struct_ops,
+ [BTF_KIND_ENUM] = &enum_ops,
+ [BTF_KIND_FWD] = &fwd_ops,
+ [BTF_KIND_TYPEDEF] = &modifier_ops,
+ [BTF_KIND_VOLATILE] = &modifier_ops,
+ [BTF_KIND_CONST] = &modifier_ops,
+ [BTF_KIND_RESTRICT] = &modifier_ops,
+ [BTF_KIND_FUNC] = &func_ops,
+ [BTF_KIND_FUNC_PROTO] = &func_proto_ops,
+ [BTF_KIND_VAR] = &var_ops,
+ [BTF_KIND_DATASEC] = &datasec_ops,
+ [BTF_KIND_FLOAT] = &float_ops,
+ [BTF_KIND_DECL_TAG] = &decl_tag_ops,
+ [BTF_KIND_TYPE_TAG] = &modifier_ops,
+ [BTF_KIND_ENUM64] = &enum64_ops,
+};
+
+static s32 btf_check_meta(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left)
+{
+ u32 saved_meta_left = meta_left;
+ s32 var_meta_size;
+
+ if (meta_left < sizeof(*t)) {
+ btf_verifier_log(env, "[%u] meta_left:%u meta_needed:%zu",
+ env->log_type_id, meta_left, sizeof(*t));
+ return -EINVAL;
+ }
+ meta_left -= sizeof(*t);
+
+ if (t->info & ~BTF_INFO_MASK) {
+ btf_verifier_log(env, "[%u] Invalid btf_info:%x",
+ env->log_type_id, t->info);
+ return -EINVAL;
+ }
+
+ if (BTF_INFO_KIND(t->info) > BTF_KIND_MAX ||
+ BTF_INFO_KIND(t->info) == BTF_KIND_UNKN) {
+ btf_verifier_log(env, "[%u] Invalid kind:%u",
+ env->log_type_id, BTF_INFO_KIND(t->info));
+ return -EINVAL;
+ }
+
+ if (!btf_name_offset_valid(env->btf, t->name_off)) {
+ btf_verifier_log(env, "[%u] Invalid name_offset:%u",
+ env->log_type_id, t->name_off);
+ return -EINVAL;
+ }
+
+ var_meta_size = btf_type_ops(t)->check_meta(env, t, meta_left);
+ if (var_meta_size < 0)
+ return var_meta_size;
+
+ meta_left -= var_meta_size;
+
+ return saved_meta_left - meta_left;
+}
+
+static int btf_check_all_metas(struct btf_verifier_env *env)
+{
+ struct btf *btf = env->btf;
+ struct btf_header *hdr;
+ void *cur, *end;
+
+ hdr = &btf->hdr;
+ cur = btf->nohdr_data + hdr->type_off;
+ end = cur + hdr->type_len;
+
+ env->log_type_id = btf->base_btf ? btf->start_id : 1;
+ while (cur < end) {
+ struct btf_type *t = cur;
+ s32 meta_size;
+
+ meta_size = btf_check_meta(env, t, end - cur);
+ if (meta_size < 0)
+ return meta_size;
+
+ btf_add_type(env, t);
+ cur += meta_size;
+ env->log_type_id++;
+ }
+
+ return 0;
+}
+
+static bool btf_resolve_valid(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 type_id)
+{
+ struct btf *btf = env->btf;
+
+ if (!env_type_is_resolved(env, type_id))
+ return false;
+
+ if (btf_type_is_struct(t) || btf_type_is_datasec(t))
+ return !btf_resolved_type_id(btf, type_id) &&
+ !btf_resolved_type_size(btf, type_id);
+
+ if (btf_type_is_decl_tag(t) || btf_type_is_func(t))
+ return btf_resolved_type_id(btf, type_id) &&
+ !btf_resolved_type_size(btf, type_id);
+
+ if (btf_type_is_modifier(t) || btf_type_is_ptr(t) ||
+ btf_type_is_var(t)) {
+ t = btf_type_id_resolve(btf, &type_id);
+ return t &&
+ !btf_type_is_modifier(t) &&
+ !btf_type_is_var(t) &&
+ !btf_type_is_datasec(t);
+ }
+
+ if (btf_type_is_array(t)) {
+ const struct btf_array *array = btf_type_array(t);
+ const struct btf_type *elem_type;
+ u32 elem_type_id = array->type;
+ u32 elem_size;
+
+ elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size);
+ return elem_type && !btf_type_is_modifier(elem_type) &&
+ (array->nelems * elem_size ==
+ btf_resolved_type_size(btf, type_id));
+ }
+
+ return false;
+}
+
+static int btf_resolve(struct btf_verifier_env *env,
+ const struct btf_type *t, u32 type_id)
+{
+ u32 save_log_type_id = env->log_type_id;
+ const struct resolve_vertex *v;
+ int err = 0;
+
+ env->resolve_mode = RESOLVE_TBD;
+ env_stack_push(env, t, type_id);
+ while (!err && (v = env_stack_peak(env))) {
+ env->log_type_id = v->type_id;
+ err = btf_type_ops(v->t)->resolve(env, v);
+ }
+
+ env->log_type_id = type_id;
+ if (err == -E2BIG) {
+ btf_verifier_log_type(env, t,
+ "Exceeded max resolving depth:%u",
+ MAX_RESOLVE_DEPTH);
+ } else if (err == -EEXIST) {
+ btf_verifier_log_type(env, t, "Loop detected");
+ }
+
+ /* Final sanity check */
+ if (!err && !btf_resolve_valid(env, t, type_id)) {
+ btf_verifier_log_type(env, t, "Invalid resolve state");
+ err = -EINVAL;
+ }
+
+ env->log_type_id = save_log_type_id;
+ return err;
+}
+
+static int btf_check_all_types(struct btf_verifier_env *env)
+{
+ struct btf *btf = env->btf;
+ const struct btf_type *t;
+ u32 type_id, i;
+ int err;
+
+ err = env_resolve_init(env);
+ if (err)
+ return err;
+
+ env->phase++;
+ for (i = btf->base_btf ? 0 : 1; i < btf->nr_types; i++) {
+ type_id = btf->start_id + i;
+ t = btf_type_by_id(btf, type_id);
+
+ env->log_type_id = type_id;
+ if (btf_type_needs_resolve(t) &&
+ !env_type_is_resolved(env, type_id)) {
+ err = btf_resolve(env, t, type_id);
+ if (err)
+ return err;
+ }
+
+ if (btf_type_is_func_proto(t)) {
+ err = btf_func_proto_check(env, t);
+ if (err)
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+static int btf_parse_type_sec(struct btf_verifier_env *env)
+{
+ const struct btf_header *hdr = &env->btf->hdr;
+ int err;
+
+ /* Type section must align to 4 bytes */
+ if (hdr->type_off & (sizeof(u32) - 1)) {
+ btf_verifier_log(env, "Unaligned type_off");
+ return -EINVAL;
+ }
+
+ if (!env->btf->base_btf && !hdr->type_len) {
+ btf_verifier_log(env, "No type found");
+ return -EINVAL;
+ }
+
+ err = btf_check_all_metas(env);
+ if (err)
+ return err;
+
+ return btf_check_all_types(env);
+}
+
+static int btf_parse_str_sec(struct btf_verifier_env *env)
+{
+ const struct btf_header *hdr;
+ struct btf *btf = env->btf;
+ const char *start, *end;
+
+ hdr = &btf->hdr;
+ start = btf->nohdr_data + hdr->str_off;
+ end = start + hdr->str_len;
+
+ if (end != btf->data + btf->data_size) {
+ btf_verifier_log(env, "String section is not at the end");
+ return -EINVAL;
+ }
+
+ btf->strings = start;
+
+ if (btf->base_btf && !hdr->str_len)
+ return 0;
+ if (!hdr->str_len || hdr->str_len - 1 > BTF_MAX_NAME_OFFSET || end[-1]) {
+ btf_verifier_log(env, "Invalid string section");
+ return -EINVAL;
+ }
+ if (!btf->base_btf && start[0]) {
+ btf_verifier_log(env, "Invalid string section");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static const size_t btf_sec_info_offset[] = {
+ offsetof(struct btf_header, type_off),
+ offsetof(struct btf_header, str_off),
+};
+
+static int btf_sec_info_cmp(const void *a, const void *b)
+{
+ const struct btf_sec_info *x = a;
+ const struct btf_sec_info *y = b;
+
+ return (int)(x->off - y->off) ? : (int)(x->len - y->len);
+}
+
+static int btf_check_sec_info(struct btf_verifier_env *env,
+ u32 btf_data_size)
+{
+ struct btf_sec_info secs[ARRAY_SIZE(btf_sec_info_offset)];
+ u32 total, expected_total, i;
+ const struct btf_header *hdr;
+ const struct btf *btf;
+
+ btf = env->btf;
+ hdr = &btf->hdr;
+
+ /* Populate the secs from hdr */
+ for (i = 0; i < ARRAY_SIZE(btf_sec_info_offset); i++)
+ secs[i] = *(struct btf_sec_info *)((void *)hdr +
+ btf_sec_info_offset[i]);
+
+ sort(secs, ARRAY_SIZE(btf_sec_info_offset),
+ sizeof(struct btf_sec_info), btf_sec_info_cmp, NULL);
+
+ /* Check for gaps and overlap among sections */
+ total = 0;
+ expected_total = btf_data_size - hdr->hdr_len;
+ for (i = 0; i < ARRAY_SIZE(btf_sec_info_offset); i++) {
+ if (expected_total < secs[i].off) {
+ btf_verifier_log(env, "Invalid section offset");
+ return -EINVAL;
+ }
+ if (total < secs[i].off) {
+ /* gap */
+ btf_verifier_log(env, "Unsupported section found");
+ return -EINVAL;
+ }
+ if (total > secs[i].off) {
+ btf_verifier_log(env, "Section overlap found");
+ return -EINVAL;
+ }
+ if (expected_total - total < secs[i].len) {
+ btf_verifier_log(env,
+ "Total section length too long");
+ return -EINVAL;
+ }
+ total += secs[i].len;
+ }
+
+ /* There is data other than hdr and known sections */
+ if (expected_total != total) {
+ btf_verifier_log(env, "Unsupported section found");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int btf_parse_hdr(struct btf_verifier_env *env)
+{
+ u32 hdr_len, hdr_copy, btf_data_size;
+ const struct btf_header *hdr;
+ struct btf *btf;
+
+ btf = env->btf;
+ btf_data_size = btf->data_size;
+
+ if (btf_data_size < offsetofend(struct btf_header, hdr_len)) {
+ btf_verifier_log(env, "hdr_len not found");
+ return -EINVAL;
+ }
+
+ hdr = btf->data;
+ hdr_len = hdr->hdr_len;
+ if (btf_data_size < hdr_len) {
+ btf_verifier_log(env, "btf_header not found");
+ return -EINVAL;
+ }
+
+ /* Ensure the unsupported header fields are zero */
+ if (hdr_len > sizeof(btf->hdr)) {
+ u8 *expected_zero = btf->data + sizeof(btf->hdr);
+ u8 *end = btf->data + hdr_len;
+
+ for (; expected_zero < end; expected_zero++) {
+ if (*expected_zero) {
+ btf_verifier_log(env, "Unsupported btf_header");
+ return -E2BIG;
+ }
+ }
+ }
+
+ hdr_copy = min_t(u32, hdr_len, sizeof(btf->hdr));
+ memcpy(&btf->hdr, btf->data, hdr_copy);
+
+ hdr = &btf->hdr;
+
+ btf_verifier_log_hdr(env, btf_data_size);
+
+ if (hdr->magic != BTF_MAGIC) {
+ btf_verifier_log(env, "Invalid magic");
+ return -EINVAL;
+ }
+
+ if (hdr->version != BTF_VERSION) {
+ btf_verifier_log(env, "Unsupported version");
+ return -ENOTSUPP;
+ }
+
+ if (hdr->flags) {
+ btf_verifier_log(env, "Unsupported flags");
+ return -ENOTSUPP;
+ }
+
+ if (!btf->base_btf && btf_data_size == hdr->hdr_len) {
+ btf_verifier_log(env, "No data");
+ return -EINVAL;
+ }
+
+ return btf_check_sec_info(env, btf_data_size);
+}
+
+static const char *alloc_obj_fields[] = {
+ "bpf_spin_lock",
+ "bpf_list_head",
+ "bpf_list_node",
+ "bpf_rb_root",
+ "bpf_rb_node",
+ "bpf_refcount",
+};
+
+static struct btf_struct_metas *
+btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)
+{
+ struct btf_struct_metas *tab = NULL;
+ struct btf_id_set *aof;
+ int i, n, id, ret;
+
+ BUILD_BUG_ON(offsetof(struct btf_id_set, cnt) != 0);
+ BUILD_BUG_ON(sizeof(struct btf_id_set) != sizeof(u32));
+
+ aof = kmalloc(sizeof(*aof), GFP_KERNEL | __GFP_NOWARN);
+ if (!aof)
+ return ERR_PTR(-ENOMEM);
+ aof->cnt = 0;
+
+ for (i = 0; i < ARRAY_SIZE(alloc_obj_fields); i++) {
+ /* Try to find whether this special type exists in user BTF, and
+ * if so remember its ID so we can easily find it among members
+ * of structs that we iterate in the next loop.
+ */
+ struct btf_id_set *new_aof;
+
+ id = btf_find_by_name_kind(btf, alloc_obj_fields[i], BTF_KIND_STRUCT);
+ if (id < 0)
+ continue;
+
+ new_aof = krealloc(aof, struct_size(new_aof, ids, aof->cnt + 1),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!new_aof) {
+ ret = -ENOMEM;
+ goto free_aof;
+ }
+ aof = new_aof;
+ aof->ids[aof->cnt++] = id;
+ }
+
+ n = btf_nr_types(btf);
+ for (i = 1; i < n; i++) {
+ /* Try to find if there are kptrs in user BTF and remember their ID */
+ struct btf_id_set *new_aof;
+ struct btf_field_info tmp;
+ const struct btf_type *t;
+
+ t = btf_type_by_id(btf, i);
+ if (!t) {
+ ret = -EINVAL;
+ goto free_aof;
+ }
+
+ ret = btf_find_kptr(btf, t, 0, 0, &tmp, BPF_KPTR);
+ if (ret != BTF_FIELD_FOUND)
+ continue;
+
+ new_aof = krealloc(aof, struct_size(new_aof, ids, aof->cnt + 1),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!new_aof) {
+ ret = -ENOMEM;
+ goto free_aof;
+ }
+ aof = new_aof;
+ aof->ids[aof->cnt++] = i;
+ }
+
+ if (!aof->cnt) {
+ kfree(aof);
+ return NULL;
+ }
+ sort(&aof->ids, aof->cnt, sizeof(aof->ids[0]), btf_id_cmp_func, NULL);
+
+ for (i = 1; i < n; i++) {
+ struct btf_struct_metas *new_tab;
+ const struct btf_member *member;
+ struct btf_struct_meta *type;
+ struct btf_record *record;
+ const struct btf_type *t;
+ int j, tab_cnt;
+
+ t = btf_type_by_id(btf, i);
+ if (!__btf_type_is_struct(t))
+ continue;
+
+ cond_resched();
+
+ for_each_member(j, t, member) {
+ if (btf_id_set_contains(aof, member->type))
+ goto parse;
+ }
+ continue;
+ parse:
+ tab_cnt = tab ? tab->cnt : 0;
+ new_tab = krealloc(tab, struct_size(new_tab, types, tab_cnt + 1),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!new_tab) {
+ ret = -ENOMEM;
+ goto free;
+ }
+ if (!tab)
+ new_tab->cnt = 0;
+ tab = new_tab;
+
+ type = &tab->types[tab->cnt];
+ type->btf_id = i;
+ record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE |
+ BPF_RB_ROOT | BPF_RB_NODE | BPF_REFCOUNT |
+ BPF_KPTR, t->size);
+ /* The record cannot be unset, treat it as an error if so */
+ if (IS_ERR_OR_NULL(record)) {
+ ret = PTR_ERR_OR_ZERO(record) ?: -EFAULT;
+ goto free;
+ }
+ type->record = record;
+ tab->cnt++;
+ }
+ kfree(aof);
+ return tab;
+free:
+ btf_struct_metas_free(tab);
+free_aof:
+ kfree(aof);
+ return ERR_PTR(ret);
+}
+
+struct btf_struct_meta *btf_find_struct_meta(const struct btf *btf, u32 btf_id)
+{
+ struct btf_struct_metas *tab;
+
+ BUILD_BUG_ON(offsetof(struct btf_struct_meta, btf_id) != 0);
+ tab = btf->struct_meta_tab;
+ if (!tab)
+ return NULL;
+ return bsearch(&btf_id, tab->types, tab->cnt, sizeof(tab->types[0]), btf_id_cmp_func);
+}
+
+static int btf_check_type_tags(struct btf_verifier_env *env,
+ struct btf *btf, int start_id)
+{
+ int i, n, good_id = start_id - 1;
+ bool in_tags;
+
+ n = btf_nr_types(btf);
+ for (i = start_id; i < n; i++) {
+ const struct btf_type *t;
+ int chain_limit = 32;
+ u32 cur_id = i;
+
+ t = btf_type_by_id(btf, i);
+ if (!t)
+ return -EINVAL;
+ if (!btf_type_is_modifier(t))
+ continue;
+
+ cond_resched();
+
+ in_tags = btf_type_is_type_tag(t);
+ while (btf_type_is_modifier(t)) {
+ if (!chain_limit--) {
+ btf_verifier_log(env, "Max chain length or cycle detected");
+ return -ELOOP;
+ }
+ if (btf_type_is_type_tag(t)) {
+ if (!in_tags) {
+ btf_verifier_log(env, "Type tags don't precede modifiers");
+ return -EINVAL;
+ }
+ } else if (in_tags) {
+ in_tags = false;
+ }
+ if (cur_id <= good_id)
+ break;
+ /* Move to next type */
+ cur_id = t->type;
+ t = btf_type_by_id(btf, cur_id);
+ if (!t)
+ return -EINVAL;
+ }
+ good_id = i;
+ }
+ return 0;
+}
+
+static int finalize_log(struct bpf_verifier_log *log, bpfptr_t uattr, u32 uattr_size)
+{
+ u32 log_true_size;
+ int err;
+
+ err = bpf_vlog_finalize(log, &log_true_size);
+
+ if (uattr_size >= offsetofend(union bpf_attr, btf_log_true_size) &&
+ copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, btf_log_true_size),
+ &log_true_size, sizeof(log_true_size)))
+ err = -EFAULT;
+
+ return err;
+}
+
+static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
+{
+ bpfptr_t btf_data = make_bpfptr(attr->btf, uattr.is_kernel);
+ char __user *log_ubuf = u64_to_user_ptr(attr->btf_log_buf);
+ struct btf_struct_metas *struct_meta_tab;
+ struct btf_verifier_env *env = NULL;
+ struct btf *btf = NULL;
+ u8 *data;
+ int err, ret;
+
+ if (attr->btf_size > BTF_MAX_SIZE)
+ return ERR_PTR(-E2BIG);
+
+ env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN);
+ if (!env)
+ return ERR_PTR(-ENOMEM);
+
+ /* user could have requested verbose verifier output
+ * and supplied buffer to store the verification trace
+ */
+ err = bpf_vlog_init(&env->log, attr->btf_log_level,
+ log_ubuf, attr->btf_log_size);
+ if (err)
+ goto errout_free;
+
+ btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN);
+ if (!btf) {
+ err = -ENOMEM;
+ goto errout;
+ }
+ env->btf = btf;
+
+ data = kvmalloc(attr->btf_size, GFP_KERNEL | __GFP_NOWARN);
+ if (!data) {
+ err = -ENOMEM;
+ goto errout;
+ }
+
+ btf->data = data;
+ btf->data_size = attr->btf_size;
+
+ if (copy_from_bpfptr(data, btf_data, attr->btf_size)) {
+ err = -EFAULT;
+ goto errout;
+ }
+
+ err = btf_parse_hdr(env);
+ if (err)
+ goto errout;
+
+ btf->nohdr_data = btf->data + btf->hdr.hdr_len;
+
+ err = btf_parse_str_sec(env);
+ if (err)
+ goto errout;
+
+ err = btf_parse_type_sec(env);
+ if (err)
+ goto errout;
+
+ err = btf_check_type_tags(env, btf, 1);
+ if (err)
+ goto errout;
+
+ struct_meta_tab = btf_parse_struct_metas(&env->log, btf);
+ if (IS_ERR(struct_meta_tab)) {
+ err = PTR_ERR(struct_meta_tab);
+ goto errout;
+ }
+ btf->struct_meta_tab = struct_meta_tab;
+
+ if (struct_meta_tab) {
+ int i;
+
+ for (i = 0; i < struct_meta_tab->cnt; i++) {
+ err = btf_check_and_fixup_fields(btf, struct_meta_tab->types[i].record);
+ if (err < 0)
+ goto errout_meta;
+ }
+ }
+
+ err = finalize_log(&env->log, uattr, uattr_size);
+ if (err)
+ goto errout_free;
+
+ btf_verifier_env_free(env);
+ refcount_set(&btf->refcnt, 1);
+ return btf;
+
+errout_meta:
+ btf_free_struct_meta_tab(btf);
+errout:
+ /* overwrite err with -ENOSPC or -EFAULT */
+ ret = finalize_log(&env->log, uattr, uattr_size);
+ if (ret)
+ err = ret;
+errout_free:
+ btf_verifier_env_free(env);
+ if (btf)
+ btf_free(btf);
+ return ERR_PTR(err);
+}
+
+extern char __start_BTF[];
+extern char __stop_BTF[];
+extern struct btf *btf_vmlinux;
+
+#define BPF_MAP_TYPE(_id, _ops)
+#define BPF_LINK_TYPE(_id, _name)
+static union {
+ struct bpf_ctx_convert {
+#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
+ prog_ctx_type _id##_prog; \
+ kern_ctx_type _id##_kern;
+#include <linux/bpf_types.h>
+#undef BPF_PROG_TYPE
+ } *__t;
+ /* 't' is written once under lock. Read many times. */
+ const struct btf_type *t;
+} bpf_ctx_convert;
+enum {
+#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
+ __ctx_convert##_id,
+#include <linux/bpf_types.h>
+#undef BPF_PROG_TYPE
+ __ctx_convert_unused, /* to avoid empty enum in extreme .config */
+};
+static u8 bpf_ctx_convert_map[] = {
+#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
+ [_id] = __ctx_convert##_id,
+#include <linux/bpf_types.h>
+#undef BPF_PROG_TYPE
+ 0, /* avoid empty array */
+};
+#undef BPF_MAP_TYPE
+#undef BPF_LINK_TYPE
+
+static const struct btf_type *find_canonical_prog_ctx_type(enum bpf_prog_type prog_type)
+{
+ const struct btf_type *conv_struct;
+ const struct btf_member *ctx_type;
+
+ conv_struct = bpf_ctx_convert.t;
+ if (!conv_struct)
+ return NULL;
+ /* prog_type is valid bpf program type. No need for bounds check. */
+ ctx_type = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2;
+ /* ctx_type is a pointer to prog_ctx_type in vmlinux.
+ * Like 'struct __sk_buff'
+ */
+ return btf_type_by_id(btf_vmlinux, ctx_type->type);
+}
+
+static int find_kern_ctx_type_id(enum bpf_prog_type prog_type)
+{
+ const struct btf_type *conv_struct;
+ const struct btf_member *ctx_type;
+
+ conv_struct = bpf_ctx_convert.t;
+ if (!conv_struct)
+ return -EFAULT;
+ /* prog_type is valid bpf program type. No need for bounds check. */
+ ctx_type = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2 + 1;
+ /* ctx_type is a pointer to prog_ctx_type in vmlinux.
+ * Like 'struct sk_buff'
+ */
+ return ctx_type->type;
+}
+
+bool btf_is_projection_of(const char *pname, const char *tname)
+{
+ if (strcmp(pname, "__sk_buff") == 0 && strcmp(tname, "sk_buff") == 0)
+ return true;
+ if (strcmp(pname, "xdp_md") == 0 && strcmp(tname, "xdp_buff") == 0)
+ return true;
+ return false;
+}
+
+bool btf_is_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
+ const struct btf_type *t, enum bpf_prog_type prog_type,
+ int arg)
+{
+ const struct btf_type *ctx_type;
+ const char *tname, *ctx_tname;
+
+ t = btf_type_by_id(btf, t->type);
+
+ /* KPROBE programs allow bpf_user_pt_regs_t typedef, which we need to
+ * check before we skip all the typedef below.
+ */
+ if (prog_type == BPF_PROG_TYPE_KPROBE) {
+ while (btf_type_is_modifier(t) && !btf_type_is_typedef(t))
+ t = btf_type_by_id(btf, t->type);
+
+ if (btf_type_is_typedef(t)) {
+ tname = btf_name_by_offset(btf, t->name_off);
+ if (tname && strcmp(tname, "bpf_user_pt_regs_t") == 0)
+ return true;
+ }
+ }
+
+ while (btf_type_is_modifier(t))
+ t = btf_type_by_id(btf, t->type);
+ if (!btf_type_is_struct(t)) {
+ /* Only pointer to struct is supported for now.
+ * That means that BPF_PROG_TYPE_TRACEPOINT with BTF
+ * is not supported yet.
+ * BPF_PROG_TYPE_RAW_TRACEPOINT is fine.
+ */
+ return false;
+ }
+ tname = btf_name_by_offset(btf, t->name_off);
+ if (!tname) {
+ bpf_log(log, "arg#%d struct doesn't have a name\n", arg);
+ return false;
+ }
+
+ ctx_type = find_canonical_prog_ctx_type(prog_type);
+ if (!ctx_type) {
+ bpf_log(log, "btf_vmlinux is malformed\n");
+ /* should not happen */
+ return false;
+ }
+again:
+ ctx_tname = btf_name_by_offset(btf_vmlinux, ctx_type->name_off);
+ if (!ctx_tname) {
+ /* should not happen */
+ bpf_log(log, "Please fix kernel include/linux/bpf_types.h\n");
+ return false;
+ }
+ /* program types without named context types work only with arg:ctx tag */
+ if (ctx_tname[0] == '\0')
+ return false;
+ /* only compare that prog's ctx type name is the same as
+ * kernel expects. No need to compare field by field.
+ * It's ok for bpf prog to do:
+ * struct __sk_buff {};
+ * int socket_filter_bpf_prog(struct __sk_buff *skb)
+ * { // no fields of skb are ever used }
+ */
+ if (btf_is_projection_of(ctx_tname, tname))
+ return true;
+ if (strcmp(ctx_tname, tname)) {
+ /* bpf_user_pt_regs_t is a typedef, so resolve it to
+ * underlying struct and check name again
+ */
+ if (!btf_type_is_modifier(ctx_type))
+ return false;
+ while (btf_type_is_modifier(ctx_type))
+ ctx_type = btf_type_by_id(btf_vmlinux, ctx_type->type);
+ goto again;
+ }
+ return true;
+}
+
+/* forward declarations for arch-specific underlying types of
+ * bpf_user_pt_regs_t; this avoids the need for arch-specific #ifdef
+ * compilation guards below for BPF_PROG_TYPE_PERF_EVENT checks, but still
+ * works correctly with __builtin_types_compatible_p() on respective
+ * architectures
+ */
+struct user_regs_struct;
+struct user_pt_regs;
+
+static int btf_validate_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
+ const struct btf_type *t, int arg,
+ enum bpf_prog_type prog_type,
+ enum bpf_attach_type attach_type)
+{
+ const struct btf_type *ctx_type;
+ const char *tname, *ctx_tname;
+
+ if (!btf_is_ptr(t)) {
+ bpf_log(log, "arg#%d type isn't a pointer\n", arg);
+ return -EINVAL;
+ }
+ t = btf_type_by_id(btf, t->type);
+
+ /* KPROBE and PERF_EVENT programs allow bpf_user_pt_regs_t typedef */
+ if (prog_type == BPF_PROG_TYPE_KPROBE || prog_type == BPF_PROG_TYPE_PERF_EVENT) {
+ while (btf_type_is_modifier(t) && !btf_type_is_typedef(t))
+ t = btf_type_by_id(btf, t->type);
+
+ if (btf_type_is_typedef(t)) {
+ tname = btf_name_by_offset(btf, t->name_off);
+ if (tname && strcmp(tname, "bpf_user_pt_regs_t") == 0)
+ return 0;
+ }
+ }
+
+ /* all other program types don't use typedefs for context type */
+ while (btf_type_is_modifier(t))
+ t = btf_type_by_id(btf, t->type);
+
+ /* `void *ctx __arg_ctx` is always valid */
+ if (btf_type_is_void(t))
+ return 0;
+
+ tname = btf_name_by_offset(btf, t->name_off);
+ if (str_is_empty(tname)) {
+ bpf_log(log, "arg#%d type doesn't have a name\n", arg);
+ return -EINVAL;
+ }
+
+ /* special cases */
+ switch (prog_type) {
+ case BPF_PROG_TYPE_KPROBE:
+ if (__btf_type_is_struct(t) && strcmp(tname, "pt_regs") == 0)
+ return 0;
+ break;
+ case BPF_PROG_TYPE_PERF_EVENT:
+ if (__builtin_types_compatible_p(bpf_user_pt_regs_t, struct pt_regs) &&
+ __btf_type_is_struct(t) && strcmp(tname, "pt_regs") == 0)
+ return 0;
+ if (__builtin_types_compatible_p(bpf_user_pt_regs_t, struct user_pt_regs) &&
+ __btf_type_is_struct(t) && strcmp(tname, "user_pt_regs") == 0)
+ return 0;
+ if (__builtin_types_compatible_p(bpf_user_pt_regs_t, struct user_regs_struct) &&
+ __btf_type_is_struct(t) && strcmp(tname, "user_regs_struct") == 0)
+ return 0;
+ break;
+ case BPF_PROG_TYPE_RAW_TRACEPOINT:
+ case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
+ /* allow u64* as ctx */
+ if (btf_is_int(t) && t->size == 8)
+ return 0;
+ break;
+ case BPF_PROG_TYPE_TRACING:
+ switch (attach_type) {
+ case BPF_TRACE_RAW_TP:
+ /* tp_btf program is TRACING, so need special case here */
+ if (__btf_type_is_struct(t) &&
+ strcmp(tname, "bpf_raw_tracepoint_args") == 0)
+ return 0;
+ /* allow u64* as ctx */
+ if (btf_is_int(t) && t->size == 8)
+ return 0;
+ break;
+ case BPF_TRACE_ITER:
+ /* allow struct bpf_iter__xxx types only */
+ if (__btf_type_is_struct(t) &&
+ strncmp(tname, "bpf_iter__", sizeof("bpf_iter__") - 1) == 0)
+ return 0;
+ break;
+ case BPF_TRACE_FENTRY:
+ case BPF_TRACE_FEXIT:
+ case BPF_MODIFY_RETURN:
+ /* allow u64* as ctx */
+ if (btf_is_int(t) && t->size == 8)
+ return 0;
+ break;
+ default:
+ break;
+ }
+ break;
+ case BPF_PROG_TYPE_LSM:
+ case BPF_PROG_TYPE_STRUCT_OPS:
+ /* allow u64* as ctx */
+ if (btf_is_int(t) && t->size == 8)
+ return 0;
+ break;
+ case BPF_PROG_TYPE_TRACEPOINT:
+ case BPF_PROG_TYPE_SYSCALL:
+ case BPF_PROG_TYPE_EXT:
+ return 0; /* anything goes */
+ default:
+ break;
+ }
+
+ ctx_type = find_canonical_prog_ctx_type(prog_type);
+ if (!ctx_type) {
+ /* should not happen */
+ bpf_log(log, "btf_vmlinux is malformed\n");
+ return -EINVAL;
+ }
+
+ /* resolve typedefs and check that underlying structs are matching as well */
+ while (btf_type_is_modifier(ctx_type))
+ ctx_type = btf_type_by_id(btf_vmlinux, ctx_type->type);
+
+ /* if program type doesn't have distinctly named struct type for
+ * context, then __arg_ctx argument can only be `void *`, which we
+ * already checked above
+ */
+ if (!__btf_type_is_struct(ctx_type)) {
+ bpf_log(log, "arg#%d should be void pointer\n", arg);
+ return -EINVAL;
+ }
+
+ ctx_tname = btf_name_by_offset(btf_vmlinux, ctx_type->name_off);
+ if (!__btf_type_is_struct(t) || strcmp(ctx_tname, tname) != 0) {
+ bpf_log(log, "arg#%d should be `struct %s *`\n", arg, ctx_tname);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int btf_translate_to_vmlinux(struct bpf_verifier_log *log,
+ struct btf *btf,
+ const struct btf_type *t,
+ enum bpf_prog_type prog_type,
+ int arg)
+{
+ if (!btf_is_prog_ctx_type(log, btf, t, prog_type, arg))
+ return -ENOENT;
+ return find_kern_ctx_type_id(prog_type);
+}
+
+int get_kern_ctx_btf_id(struct bpf_verifier_log *log, enum bpf_prog_type prog_type)
+{
+ const struct btf_member *kctx_member;
+ const struct btf_type *conv_struct;
+ const struct btf_type *kctx_type;
+ u32 kctx_type_id;
+
+ conv_struct = bpf_ctx_convert.t;
+ /* get member for kernel ctx type */
+ kctx_member = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2 + 1;
+ kctx_type_id = kctx_member->type;
+ kctx_type = btf_type_by_id(btf_vmlinux, kctx_type_id);
+ if (!btf_type_is_struct(kctx_type)) {
+ bpf_log(log, "kern ctx type id %u is not a struct\n", kctx_type_id);
+ return -EINVAL;
+ }
+
+ return kctx_type_id;
+}
+
+BTF_ID_LIST_SINGLE(bpf_ctx_convert_btf_id, struct, bpf_ctx_convert)
+
+static struct btf *btf_parse_base(struct btf_verifier_env *env, const char *name,
+ void *data, unsigned int data_size)
+{
+ struct btf *btf = NULL;
+ int err;
+
+ if (!IS_ENABLED(CONFIG_DEBUG_INFO_BTF))
+ return ERR_PTR(-ENOENT);
+
+ btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN);
+ if (!btf) {
+ err = -ENOMEM;
+ goto errout;
+ }
+ env->btf = btf;
+
+ btf->data = data;
+ btf->data_size = data_size;
+ btf->kernel_btf = true;
+ snprintf(btf->name, sizeof(btf->name), "%s", name);
+
+ err = btf_parse_hdr(env);
+ if (err)
+ goto errout;
+
+ btf->nohdr_data = btf->data + btf->hdr.hdr_len;
+
+ err = btf_parse_str_sec(env);
+ if (err)
+ goto errout;
+
+ err = btf_check_all_metas(env);
+ if (err)
+ goto errout;
+
+ err = btf_check_type_tags(env, btf, 1);
+ if (err)
+ goto errout;
+
+ refcount_set(&btf->refcnt, 1);
+
+ return btf;
+
+errout:
+ if (btf) {
+ kvfree(btf->types);
+ kfree(btf);
+ }
+ return ERR_PTR(err);
+}
+
+struct btf *btf_parse_vmlinux(void)
+{
+ struct btf_verifier_env *env = NULL;
+ struct bpf_verifier_log *log;
+ struct btf *btf;
+ int err;
+
+ env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN);
+ if (!env)
+ return ERR_PTR(-ENOMEM);
+
+ log = &env->log;
+ log->level = BPF_LOG_KERNEL;
+ btf = btf_parse_base(env, "vmlinux", __start_BTF, __stop_BTF - __start_BTF);
+ if (IS_ERR(btf))
+ goto err_out;
+
+ /* btf_parse_vmlinux() runs under bpf_verifier_lock */
+ bpf_ctx_convert.t = btf_type_by_id(btf, bpf_ctx_convert_btf_id[0]);
+ err = btf_alloc_id(btf);
+ if (err) {
+ btf_free(btf);
+ btf = ERR_PTR(err);
+ }
+err_out:
+ btf_verifier_env_free(env);
+ return btf;
+}
+
+/* If .BTF_ids section was created with distilled base BTF, both base and
+ * split BTF ids will need to be mapped to actual base/split ids for
+ * BTF now that it has been relocated.
+ */
+static __u32 btf_relocate_id(const struct btf *btf, __u32 id)
+{
+ if (!btf->base_btf || !btf->base_id_map)
+ return id;
+ return btf->base_id_map[id];
+}
+
+#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
+
+static struct btf *btf_parse_module(const char *module_name, const void *data,
+ unsigned int data_size, void *base_data,
+ unsigned int base_data_size)
+{
+ struct btf *btf = NULL, *vmlinux_btf, *base_btf = NULL;
+ struct btf_verifier_env *env = NULL;
+ struct bpf_verifier_log *log;
+ int err = 0;
+
+ vmlinux_btf = bpf_get_btf_vmlinux();
+ if (IS_ERR(vmlinux_btf))
+ return vmlinux_btf;
+ if (!vmlinux_btf)
+ return ERR_PTR(-EINVAL);
+
+ env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN);
+ if (!env)
+ return ERR_PTR(-ENOMEM);
+
+ log = &env->log;
+ log->level = BPF_LOG_KERNEL;
+
+ if (base_data) {
+ base_btf = btf_parse_base(env, ".BTF.base", base_data, base_data_size);
+ if (IS_ERR(base_btf)) {
+ err = PTR_ERR(base_btf);
+ goto errout;
+ }
+ } else {
+ base_btf = vmlinux_btf;
+ }
+
+ btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN);
+ if (!btf) {
+ err = -ENOMEM;
+ goto errout;
+ }
+ env->btf = btf;
+
+ btf->base_btf = base_btf;
+ btf->start_id = base_btf->nr_types;
+ btf->start_str_off = base_btf->hdr.str_len;
+ btf->kernel_btf = true;
+ snprintf(btf->name, sizeof(btf->name), "%s", module_name);
+
+ btf->data = kvmemdup(data, data_size, GFP_KERNEL | __GFP_NOWARN);
+ if (!btf->data) {
+ err = -ENOMEM;
+ goto errout;
+ }
+ btf->data_size = data_size;
+
+ err = btf_parse_hdr(env);
+ if (err)
+ goto errout;
+
+ btf->nohdr_data = btf->data + btf->hdr.hdr_len;
+
+ err = btf_parse_str_sec(env);
+ if (err)
+ goto errout;
+
+ err = btf_check_all_metas(env);
+ if (err)
+ goto errout;
+
+ err = btf_check_type_tags(env, btf, btf_nr_types(base_btf));
+ if (err)
+ goto errout;
+
+ if (base_btf != vmlinux_btf) {
+ err = btf_relocate(btf, vmlinux_btf, &btf->base_id_map);
+ if (err)
+ goto errout;
+ btf_free(base_btf);
+ base_btf = vmlinux_btf;
+ }
+
+ btf_verifier_env_free(env);
+ refcount_set(&btf->refcnt, 1);
+ return btf;
+
+errout:
+ btf_verifier_env_free(env);
+ if (!IS_ERR(base_btf) && base_btf != vmlinux_btf)
+ btf_free(base_btf);
+ if (btf) {
+ kvfree(btf->data);
+ kvfree(btf->types);
+ kfree(btf);
+ }
+ return ERR_PTR(err);
+}
+
+#endif /* CONFIG_DEBUG_INFO_BTF_MODULES */
+
+struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog)
+{
+ struct bpf_prog *tgt_prog = prog->aux->dst_prog;
+
+ if (tgt_prog)
+ return tgt_prog->aux->btf;
+ else
+ return prog->aux->attach_btf;
+}
+
+static bool is_void_or_int_ptr(struct btf *btf, const struct btf_type *t)
+{
+ /* skip modifiers */
+ t = btf_type_skip_modifiers(btf, t->type, NULL);
+ return btf_type_is_void(t) || btf_type_is_int(t);
+}
+
+u32 btf_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto,
+ int off)
+{
+ const struct btf_param *args;
+ const struct btf_type *t;
+ u32 offset = 0, nr_args;
+ int i;
+
+ if (!func_proto)
+ return off / 8;
+
+ nr_args = btf_type_vlen(func_proto);
+ args = (const struct btf_param *)(func_proto + 1);
+ for (i = 0; i < nr_args; i++) {
+ t = btf_type_skip_modifiers(btf, args[i].type, NULL);
+ offset += btf_type_is_ptr(t) ? 8 : roundup(t->size, 8);
+ if (off < offset)
+ return i;
+ }
+
+ t = btf_type_skip_modifiers(btf, func_proto->type, NULL);
+ offset += btf_type_is_ptr(t) ? 8 : roundup(t->size, 8);
+ if (off < offset)
+ return nr_args;
+
+ return nr_args + 1;
+}
+
+static bool prog_args_trusted(const struct bpf_prog *prog)
+{
+ enum bpf_attach_type atype = prog->expected_attach_type;
+
+ switch (prog->type) {
+ case BPF_PROG_TYPE_TRACING:
+ return atype == BPF_TRACE_RAW_TP || atype == BPF_TRACE_ITER;
+ case BPF_PROG_TYPE_LSM:
+ return bpf_lsm_is_trusted(prog);
+ case BPF_PROG_TYPE_STRUCT_OPS:
+ return true;
+ default:
+ return false;
+ }
+}
+
+int btf_ctx_arg_offset(const struct btf *btf, const struct btf_type *func_proto,
+ u32 arg_no)
+{
+ const struct btf_param *args;
+ const struct btf_type *t;
+ int off = 0, i;
+ u32 sz;
+
+ args = btf_params(func_proto);
+ for (i = 0; i < arg_no; i++) {
+ t = btf_type_by_id(btf, args[i].type);
+ t = btf_resolve_size(btf, t, &sz);
+ if (IS_ERR(t))
+ return PTR_ERR(t);
+ off += roundup(sz, 8);
+ }
+
+ return off;
+}
+
+struct bpf_raw_tp_null_args {
+ const char *func;
+ u64 mask;
+};
+
+static const struct bpf_raw_tp_null_args raw_tp_null_args[] = {
+ /* sched */
+ { "sched_pi_setprio", 0x10 },
+ /* ... from sched_numa_pair_template event class */
+ { "sched_stick_numa", 0x100 },
+ { "sched_swap_numa", 0x100 },
+ /* afs */
+ { "afs_make_fs_call", 0x10 },
+ { "afs_make_fs_calli", 0x10 },
+ { "afs_make_fs_call1", 0x10 },
+ { "afs_make_fs_call2", 0x10 },
+ { "afs_protocol_error", 0x1 },
+ { "afs_flock_ev", 0x10 },
+ /* cachefiles */
+ { "cachefiles_lookup", 0x1 | 0x200 },
+ { "cachefiles_unlink", 0x1 },
+ { "cachefiles_rename", 0x1 },
+ { "cachefiles_prep_read", 0x1 },
+ { "cachefiles_mark_active", 0x1 },
+ { "cachefiles_mark_failed", 0x1 },
+ { "cachefiles_mark_inactive", 0x1 },
+ { "cachefiles_vfs_error", 0x1 },
+ { "cachefiles_io_error", 0x1 },
+ { "cachefiles_ondemand_open", 0x1 },
+ { "cachefiles_ondemand_copen", 0x1 },
+ { "cachefiles_ondemand_close", 0x1 },
+ { "cachefiles_ondemand_read", 0x1 },
+ { "cachefiles_ondemand_cread", 0x1 },
+ { "cachefiles_ondemand_fd_write", 0x1 },
+ { "cachefiles_ondemand_fd_release", 0x1 },
+ /* ext4, from ext4__mballoc event class */
+ { "ext4_mballoc_discard", 0x10 },
+ { "ext4_mballoc_free", 0x10 },
+ /* fib */
+ { "fib_table_lookup", 0x100 },
+ /* filelock */
+ /* ... from filelock_lock event class */
+ { "posix_lock_inode", 0x10 },
+ { "fcntl_setlk", 0x10 },
+ { "locks_remove_posix", 0x10 },
+ { "flock_lock_inode", 0x10 },
+ /* ... from filelock_lease event class */
+ { "break_lease_noblock", 0x10 },
+ { "break_lease_block", 0x10 },
+ { "break_lease_unblock", 0x10 },
+ { "generic_delete_lease", 0x10 },
+ { "time_out_leases", 0x10 },
+ /* host1x */
+ { "host1x_cdma_push_gather", 0x10000 },
+ /* huge_memory */
+ { "mm_khugepaged_scan_pmd", 0x10 },
+ { "mm_collapse_huge_page_isolate", 0x1 },
+ { "mm_khugepaged_scan_file", 0x10 },
+ { "mm_khugepaged_collapse_file", 0x10 },
+ /* kmem */
+ { "mm_page_alloc", 0x1 },
+ { "mm_page_pcpu_drain", 0x1 },
+ /* .. from mm_page event class */
+ { "mm_page_alloc_zone_locked", 0x1 },
+ /* netfs */
+ { "netfs_failure", 0x10 },
+ /* power */
+ { "device_pm_callback_start", 0x10 },
+ /* qdisc */
+ { "qdisc_dequeue", 0x1000 },
+ /* rxrpc */
+ { "rxrpc_recvdata", 0x1 },
+ { "rxrpc_resend", 0x10 },
+ { "rxrpc_tq", 0x10 },
+ { "rxrpc_client", 0x1 },
+ /* skb */
+ {"kfree_skb", 0x1000},
+ /* sunrpc */
+ { "xs_stream_read_data", 0x1 },
+ /* ... from xprt_cong_event event class */
+ { "xprt_reserve_cong", 0x10 },
+ { "xprt_release_cong", 0x10 },
+ { "xprt_get_cong", 0x10 },
+ { "xprt_put_cong", 0x10 },
+ /* tcp */
+ { "tcp_send_reset", 0x11 },
+ { "tcp_sendmsg_locked", 0x100 },
+ /* tegra_apb_dma */
+ { "tegra_dma_tx_status", 0x100 },
+ /* timer_migration */
+ { "tmigr_update_events", 0x1 },
+ /* writeback, from writeback_folio_template event class */
+ { "writeback_dirty_folio", 0x10 },
+ { "folio_wait_writeback", 0x10 },
+ /* rdma */
+ { "mr_integ_alloc", 0x2000 },
+ /* bpf_testmod */
+ { "bpf_testmod_test_read", 0x0 },
+ /* amdgpu */
+ { "amdgpu_vm_bo_map", 0x1 },
+ { "amdgpu_vm_bo_unmap", 0x1 },
+ /* netfs */
+ { "netfs_folioq", 0x1 },
+ /* xfs from xfs_defer_pending_class */
+ { "xfs_defer_create_intent", 0x1 },
+ { "xfs_defer_cancel_list", 0x1 },
+ { "xfs_defer_pending_finish", 0x1 },
+ { "xfs_defer_pending_abort", 0x1 },
+ { "xfs_defer_relog_intent", 0x1 },
+ { "xfs_defer_isolate_paused", 0x1 },
+ { "xfs_defer_item_pause", 0x1 },
+ { "xfs_defer_item_unpause", 0x1 },
+ /* xfs from xfs_defer_pending_item_class */
+ { "xfs_defer_add_item", 0x1 },
+ { "xfs_defer_cancel_item", 0x1 },
+ { "xfs_defer_finish_item", 0x1 },
+ /* xfs from xfs_icwalk_class */
+ { "xfs_ioc_free_eofblocks", 0x10 },
+ { "xfs_blockgc_free_space", 0x10 },
+ /* xfs from xfs_btree_cur_class */
+ { "xfs_btree_updkeys", 0x100 },
+ { "xfs_btree_overlapped_query_range", 0x100 },
+ /* xfs from xfs_imap_class*/
+ { "xfs_map_blocks_found", 0x10000 },
+ { "xfs_map_blocks_alloc", 0x10000 },
+ { "xfs_iomap_alloc", 0x1000 },
+ { "xfs_iomap_found", 0x1000 },
+ /* xfs from xfs_fs_class */
+ { "xfs_inodegc_flush", 0x1 },
+ { "xfs_inodegc_push", 0x1 },
+ { "xfs_inodegc_start", 0x1 },
+ { "xfs_inodegc_stop", 0x1 },
+ { "xfs_inodegc_queue", 0x1 },
+ { "xfs_inodegc_throttle", 0x1 },
+ { "xfs_fs_sync_fs", 0x1 },
+ { "xfs_blockgc_start", 0x1 },
+ { "xfs_blockgc_stop", 0x1 },
+ { "xfs_blockgc_worker", 0x1 },
+ { "xfs_blockgc_flush_all", 0x1 },
+ /* xfs_scrub */
+ { "xchk_nlinks_live_update", 0x10 },
+ /* xfs_scrub from xchk_metapath_class */
+ { "xchk_metapath_lookup", 0x100 },
+ /* nfsd */
+ { "nfsd_dirent", 0x1 },
+ { "nfsd_file_acquire", 0x1001 },
+ { "nfsd_file_insert_err", 0x1 },
+ { "nfsd_file_cons_err", 0x1 },
+ /* nfs4 */
+ { "nfs4_setup_sequence", 0x1 },
+ { "pnfs_update_layout", 0x10000 },
+ { "nfs4_inode_callback_event", 0x200 },
+ { "nfs4_inode_stateid_callback_event", 0x200 },
+ /* nfs from pnfs_layout_event */
+ { "pnfs_mds_fallback_pg_init_read", 0x10000 },
+ { "pnfs_mds_fallback_pg_init_write", 0x10000 },
+ { "pnfs_mds_fallback_pg_get_mirror_count", 0x10000 },
+ { "pnfs_mds_fallback_read_done", 0x10000 },
+ { "pnfs_mds_fallback_write_done", 0x10000 },
+ { "pnfs_mds_fallback_read_pagelist", 0x10000 },
+ { "pnfs_mds_fallback_write_pagelist", 0x10000 },
+ /* coda */
+ { "coda_dec_pic_run", 0x10 },
+ { "coda_dec_pic_done", 0x10 },
+ /* cfg80211 */
+ { "cfg80211_scan_done", 0x11 },
+ { "rdev_set_coalesce", 0x10 },
+ { "cfg80211_report_wowlan_wakeup", 0x100 },
+ { "cfg80211_inform_bss_frame", 0x100 },
+ { "cfg80211_michael_mic_failure", 0x10000 },
+ /* cfg80211 from wiphy_work_event */
+ { "wiphy_work_queue", 0x10 },
+ { "wiphy_work_run", 0x10 },
+ { "wiphy_work_cancel", 0x10 },
+ { "wiphy_work_flush", 0x10 },
+ /* hugetlbfs */
+ { "hugetlbfs_alloc_inode", 0x10 },
+ /* spufs */
+ { "spufs_context", 0x10 },
+ /* kvm_hv */
+ { "kvm_page_fault_enter", 0x100 },
+ /* dpu */
+ { "dpu_crtc_setup_mixer", 0x100 },
+ /* binder */
+ { "binder_transaction", 0x100 },
+ /* bcachefs */
+ { "btree_path_free", 0x100 },
+ /* hfi1_tx */
+ { "hfi1_sdma_progress", 0x1000 },
+ /* iptfs */
+ { "iptfs_ingress_postq_event", 0x1000 },
+ /* neigh */
+ { "neigh_update", 0x10 },
+ /* snd_firewire_lib */
+ { "amdtp_packet", 0x100 },
+};
+
+bool btf_ctx_access(int off, int size, enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ const struct btf_type *t = prog->aux->attach_func_proto;
+ struct bpf_prog *tgt_prog = prog->aux->dst_prog;
+ struct btf *btf = bpf_prog_get_target_btf(prog);
+ const char *tname = prog->aux->attach_func_name;
+ struct bpf_verifier_log *log = info->log;
+ const struct btf_param *args;
+ bool ptr_err_raw_tp = false;
+ const char *tag_value;
+ u32 nr_args, arg;
+ int i, ret;
+
+ if (off % 8) {
+ bpf_log(log, "func '%s' offset %d is not multiple of 8\n",
+ tname, off);
+ return false;
+ }
+ arg = btf_ctx_arg_idx(btf, t, off);
+ args = (const struct btf_param *)(t + 1);
+ /* if (t == NULL) Fall back to default BPF prog with
+ * MAX_BPF_FUNC_REG_ARGS u64 arguments.
+ */
+ nr_args = t ? btf_type_vlen(t) : MAX_BPF_FUNC_REG_ARGS;
+ if (prog->aux->attach_btf_trace) {
+ /* skip first 'void *__data' argument in btf_trace_##name typedef */
+ args++;
+ nr_args--;
+ }
+
+ if (arg > nr_args) {
+ bpf_log(log, "func '%s' doesn't have %d-th argument\n",
+ tname, arg + 1);
+ return false;
+ }
+
+ if (arg == nr_args) {
+ switch (prog->expected_attach_type) {
+ case BPF_LSM_MAC:
+ /* mark we are accessing the return value */
+ info->is_retval = true;
+ fallthrough;
+ case BPF_LSM_CGROUP:
+ case BPF_TRACE_FEXIT:
+ /* When LSM programs are attached to void LSM hooks
+ * they use FEXIT trampolines and when attached to
+ * int LSM hooks, they use MODIFY_RETURN trampolines.
+ *
+ * While the LSM programs are BPF_MODIFY_RETURN-like
+ * the check:
+ *
+ * if (ret_type != 'int')
+ * return -EINVAL;
+ *
+ * is _not_ done here. This is still safe as LSM hooks
+ * have only void and int return types.
+ */
+ if (!t)
+ return true;
+ t = btf_type_by_id(btf, t->type);
+ break;
+ case BPF_MODIFY_RETURN:
+ /* For now the BPF_MODIFY_RETURN can only be attached to
+ * functions that return an int.
+ */
+ if (!t)
+ return false;
+
+ t = btf_type_skip_modifiers(btf, t->type, NULL);
+ if (!btf_type_is_small_int(t)) {
+ bpf_log(log,
+ "ret type %s not allowed for fmod_ret\n",
+ btf_type_str(t));
+ return false;
+ }
+ break;
+ default:
+ bpf_log(log, "func '%s' doesn't have %d-th argument\n",
+ tname, arg + 1);
+ return false;
+ }
+ } else {
+ if (!t)
+ /* Default prog with MAX_BPF_FUNC_REG_ARGS args */
+ return true;
+ t = btf_type_by_id(btf, args[arg].type);
+ }
+
+ /* skip modifiers */
+ while (btf_type_is_modifier(t))
+ t = btf_type_by_id(btf, t->type);
+ if (btf_type_is_small_int(t) || btf_is_any_enum(t) || btf_type_is_struct(t))
+ /* accessing a scalar */
+ return true;
+ if (!btf_type_is_ptr(t)) {
+ bpf_log(log,
+ "func '%s' arg%d '%s' has type %s. Only pointer access is allowed\n",
+ tname, arg,
+ __btf_name_by_offset(btf, t->name_off),
+ btf_type_str(t));
+ return false;
+ }
+
+ if (size != sizeof(u64)) {
+ bpf_log(log, "func '%s' size %d must be 8\n",
+ tname, size);
+ return false;
+ }
+
+ /* check for PTR_TO_RDONLY_BUF_OR_NULL or PTR_TO_RDWR_BUF_OR_NULL */
+ for (i = 0; i < prog->aux->ctx_arg_info_size; i++) {
+ const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i];
+ u32 type, flag;
+
+ type = base_type(ctx_arg_info->reg_type);
+ flag = type_flag(ctx_arg_info->reg_type);
+ if (ctx_arg_info->offset == off && type == PTR_TO_BUF &&
+ (flag & PTR_MAYBE_NULL)) {
+ info->reg_type = ctx_arg_info->reg_type;
+ return true;
+ }
+ }
+
+ /*
+ * If it's a pointer to void, it's the same as scalar from the verifier
+ * safety POV. Either way, no futher pointer walking is allowed.
+ */
+ if (is_void_or_int_ptr(btf, t))
+ return true;
+
+ /* this is a pointer to another type */
+ for (i = 0; i < prog->aux->ctx_arg_info_size; i++) {
+ const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i];
+
+ if (ctx_arg_info->offset == off) {
+ if (!ctx_arg_info->btf_id) {
+ bpf_log(log,"invalid btf_id for context argument offset %u\n", off);
+ return false;
+ }
+
+ info->reg_type = ctx_arg_info->reg_type;
+ info->btf = ctx_arg_info->btf ? : btf_vmlinux;
+ info->btf_id = ctx_arg_info->btf_id;
+ info->ref_obj_id = ctx_arg_info->ref_obj_id;
+ return true;
+ }
+ }
+
+ info->reg_type = PTR_TO_BTF_ID;
+ if (prog_args_trusted(prog))
+ info->reg_type |= PTR_TRUSTED;
+
+ if (btf_param_match_suffix(btf, &args[arg], "__nullable"))
+ info->reg_type |= PTR_MAYBE_NULL;
+
+ if (prog->expected_attach_type == BPF_TRACE_RAW_TP) {
+ struct btf *btf = prog->aux->attach_btf;
+ const struct btf_type *t;
+ const char *tname;
+
+ /* BTF lookups cannot fail, return false on error */
+ t = btf_type_by_id(btf, prog->aux->attach_btf_id);
+ if (!t)
+ return false;
+ tname = btf_name_by_offset(btf, t->name_off);
+ if (!tname)
+ return false;
+ /* Checked by bpf_check_attach_target */
+ tname += sizeof("btf_trace_") - 1;
+ for (i = 0; i < ARRAY_SIZE(raw_tp_null_args); i++) {
+ /* Is this a func with potential NULL args? */
+ if (strcmp(tname, raw_tp_null_args[i].func))
+ continue;
+ if (raw_tp_null_args[i].mask & (0x1ULL << (arg * 4)))
+ info->reg_type |= PTR_MAYBE_NULL;
+ /* Is the current arg IS_ERR? */
+ if (raw_tp_null_args[i].mask & (0x2ULL << (arg * 4)))
+ ptr_err_raw_tp = true;
+ break;
+ }
+ /* If we don't know NULL-ness specification and the tracepoint
+ * is coming from a loadable module, be conservative and mark
+ * argument as PTR_MAYBE_NULL.
+ */
+ if (i == ARRAY_SIZE(raw_tp_null_args) && btf_is_module(btf))
+ info->reg_type |= PTR_MAYBE_NULL;
+ }
+
+ if (tgt_prog) {
+ enum bpf_prog_type tgt_type;
+
+ if (tgt_prog->type == BPF_PROG_TYPE_EXT)
+ tgt_type = tgt_prog->aux->saved_dst_prog_type;
+ else
+ tgt_type = tgt_prog->type;
+
+ ret = btf_translate_to_vmlinux(log, btf, t, tgt_type, arg);
+ if (ret > 0) {
+ info->btf = btf_vmlinux;
+ info->btf_id = ret;
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ info->btf = btf;
+ info->btf_id = t->type;
+ t = btf_type_by_id(btf, t->type);
+
+ if (btf_type_is_type_tag(t) && !btf_type_kflag(t)) {
+ tag_value = __btf_name_by_offset(btf, t->name_off);
+ if (strcmp(tag_value, "user") == 0)
+ info->reg_type |= MEM_USER;
+ if (strcmp(tag_value, "percpu") == 0)
+ info->reg_type |= MEM_PERCPU;
+ }
+
+ /* skip modifiers */
+ while (btf_type_is_modifier(t)) {
+ info->btf_id = t->type;
+ t = btf_type_by_id(btf, t->type);
+ }
+ if (!btf_type_is_struct(t)) {
+ bpf_log(log,
+ "func '%s' arg%d type %s is not a struct\n",
+ tname, arg, btf_type_str(t));
+ return false;
+ }
+ bpf_log(log, "func '%s' arg%d has btf_id %d type %s '%s'\n",
+ tname, arg, info->btf_id, btf_type_str(t),
+ __btf_name_by_offset(btf, t->name_off));
+
+ /* Perform all checks on the validity of type for this argument, but if
+ * we know it can be IS_ERR at runtime, scrub pointer type and mark as
+ * scalar.
+ */
+ if (ptr_err_raw_tp) {
+ bpf_log(log, "marking pointer arg%d as scalar as it may encode error", arg);
+ info->reg_type = SCALAR_VALUE;
+ }
+ return true;
+}
+EXPORT_SYMBOL_GPL(btf_ctx_access);
+
+enum bpf_struct_walk_result {
+ /* < 0 error */
+ WALK_SCALAR = 0,
+ WALK_PTR,
+ WALK_PTR_UNTRUSTED,
+ WALK_STRUCT,
+};
+
+static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf,
+ const struct btf_type *t, int off, int size,
+ u32 *next_btf_id, enum bpf_type_flag *flag,
+ const char **field_name)
+{
+ u32 i, moff, mtrue_end, msize = 0, total_nelems = 0;
+ const struct btf_type *mtype, *elem_type = NULL;
+ const struct btf_member *member;
+ const char *tname, *mname, *tag_value;
+ u32 vlen, elem_id, mid;
+
+again:
+ if (btf_type_is_modifier(t))
+ t = btf_type_skip_modifiers(btf, t->type, NULL);
+ tname = __btf_name_by_offset(btf, t->name_off);
+ if (!btf_type_is_struct(t)) {
+ bpf_log(log, "Type '%s' is not a struct\n", tname);
+ return -EINVAL;
+ }
+
+ vlen = btf_type_vlen(t);
+ if (BTF_INFO_KIND(t->info) == BTF_KIND_UNION && vlen != 1 && !(*flag & PTR_UNTRUSTED))
+ /*
+ * walking unions yields untrusted pointers
+ * with exception of __bpf_md_ptr and other
+ * unions with a single member
+ */
+ *flag |= PTR_UNTRUSTED;
+
+ if (off + size > t->size) {
+ /* If the last element is a variable size array, we may
+ * need to relax the rule.
+ */
+ struct btf_array *array_elem;
+
+ if (vlen == 0)
+ goto error;
+
+ member = btf_type_member(t) + vlen - 1;
+ mtype = btf_type_skip_modifiers(btf, member->type,
+ NULL);
+ if (!btf_type_is_array(mtype))
+ goto error;
+
+ array_elem = (struct btf_array *)(mtype + 1);
+ if (array_elem->nelems != 0)
+ goto error;
+
+ moff = __btf_member_bit_offset(t, member) / 8;
+ if (off < moff)
+ goto error;
+
+ /* allow structure and integer */
+ t = btf_type_skip_modifiers(btf, array_elem->type,
+ NULL);
+
+ if (btf_type_is_int(t))
+ return WALK_SCALAR;
+
+ if (!btf_type_is_struct(t))
+ goto error;
+
+ off = (off - moff) % t->size;
+ goto again;
+
+error:
+ bpf_log(log, "access beyond struct %s at off %u size %u\n",
+ tname, off, size);
+ return -EACCES;
+ }
+
+ for_each_member(i, t, member) {
+ /* offset of the field in bytes */
+ moff = __btf_member_bit_offset(t, member) / 8;
+ if (off + size <= moff)
+ /* won't find anything, field is already too far */
+ break;
+
+ if (__btf_member_bitfield_size(t, member)) {
+ u32 end_bit = __btf_member_bit_offset(t, member) +
+ __btf_member_bitfield_size(t, member);
+
+ /* off <= moff instead of off == moff because clang
+ * does not generate a BTF member for anonymous
+ * bitfield like the ":16" here:
+ * struct {
+ * int :16;
+ * int x:8;
+ * };
+ */
+ if (off <= moff &&
+ BITS_ROUNDUP_BYTES(end_bit) <= off + size)
+ return WALK_SCALAR;
+
+ /* off may be accessing a following member
+ *
+ * or
+ *
+ * Doing partial access at either end of this
+ * bitfield. Continue on this case also to
+ * treat it as not accessing this bitfield
+ * and eventually error out as field not
+ * found to keep it simple.
+ * It could be relaxed if there was a legit
+ * partial access case later.
+ */
+ continue;
+ }
+
+ /* In case of "off" is pointing to holes of a struct */
+ if (off < moff)
+ break;
+
+ /* type of the field */
+ mid = member->type;
+ mtype = btf_type_by_id(btf, member->type);
+ mname = __btf_name_by_offset(btf, member->name_off);
+
+ mtype = __btf_resolve_size(btf, mtype, &msize,
+ &elem_type, &elem_id, &total_nelems,
+ &mid);
+ if (IS_ERR(mtype)) {
+ bpf_log(log, "field %s doesn't have size\n", mname);
+ return -EFAULT;
+ }
+
+ mtrue_end = moff + msize;
+ if (off >= mtrue_end)
+ /* no overlap with member, keep iterating */
+ continue;
+
+ if (btf_type_is_array(mtype)) {
+ u32 elem_idx;
+
+ /* __btf_resolve_size() above helps to
+ * linearize a multi-dimensional array.
+ *
+ * The logic here is treating an array
+ * in a struct as the following way:
+ *
+ * struct outer {
+ * struct inner array[2][2];
+ * };
+ *
+ * looks like:
+ *
+ * struct outer {
+ * struct inner array_elem0;
+ * struct inner array_elem1;
+ * struct inner array_elem2;
+ * struct inner array_elem3;
+ * };
+ *
+ * When accessing outer->array[1][0], it moves
+ * moff to "array_elem2", set mtype to
+ * "struct inner", and msize also becomes
+ * sizeof(struct inner). Then most of the
+ * remaining logic will fall through without
+ * caring the current member is an array or
+ * not.
+ *
+ * Unlike mtype/msize/moff, mtrue_end does not
+ * change. The naming difference ("_true") tells
+ * that it is not always corresponding to
+ * the current mtype/msize/moff.
+ * It is the true end of the current
+ * member (i.e. array in this case). That
+ * will allow an int array to be accessed like
+ * a scratch space,
+ * i.e. allow access beyond the size of
+ * the array's element as long as it is
+ * within the mtrue_end boundary.
+ */
+
+ /* skip empty array */
+ if (moff == mtrue_end)
+ continue;
+
+ msize /= total_nelems;
+ elem_idx = (off - moff) / msize;
+ moff += elem_idx * msize;
+ mtype = elem_type;
+ mid = elem_id;
+ }
+
+ /* the 'off' we're looking for is either equal to start
+ * of this field or inside of this struct
+ */
+ if (btf_type_is_struct(mtype)) {
+ /* our field must be inside that union or struct */
+ t = mtype;
+
+ /* return if the offset matches the member offset */
+ if (off == moff) {
+ *next_btf_id = mid;
+ return WALK_STRUCT;
+ }
+
+ /* adjust offset we're looking for */
+ off -= moff;
+ goto again;
+ }
+
+ if (btf_type_is_ptr(mtype)) {
+ const struct btf_type *stype, *t;
+ enum bpf_type_flag tmp_flag = 0;
+ u32 id;
+
+ if (msize != size || off != moff) {
+ bpf_log(log,
+ "cannot access ptr member %s with moff %u in struct %s with off %u size %u\n",
+ mname, moff, tname, off, size);
+ return -EACCES;
+ }
+
+ /* check type tag */
+ t = btf_type_by_id(btf, mtype->type);
+ if (btf_type_is_type_tag(t) && !btf_type_kflag(t)) {
+ tag_value = __btf_name_by_offset(btf, t->name_off);
+ /* check __user tag */
+ if (strcmp(tag_value, "user") == 0)
+ tmp_flag = MEM_USER;
+ /* check __percpu tag */
+ if (strcmp(tag_value, "percpu") == 0)
+ tmp_flag = MEM_PERCPU;
+ /* check __rcu tag */
+ if (strcmp(tag_value, "rcu") == 0)
+ tmp_flag = MEM_RCU;
+ }
+
+ stype = btf_type_skip_modifiers(btf, mtype->type, &id);
+ if (btf_type_is_struct(stype)) {
+ *next_btf_id = id;
+ *flag |= tmp_flag;
+ if (field_name)
+ *field_name = mname;
+ return WALK_PTR;
+ }
+
+ return WALK_PTR_UNTRUSTED;
+ }
+
+ /* Allow more flexible access within an int as long as
+ * it is within mtrue_end.
+ * Since mtrue_end could be the end of an array,
+ * that also allows using an array of int as a scratch
+ * space. e.g. skb->cb[].
+ */
+ if (off + size > mtrue_end && !(*flag & PTR_UNTRUSTED)) {
+ bpf_log(log,
+ "access beyond the end of member %s (mend:%u) in struct %s with off %u size %u\n",
+ mname, mtrue_end, tname, off, size);
+ return -EACCES;
+ }
+
+ return WALK_SCALAR;
+ }
+ bpf_log(log, "struct %s doesn't have field at offset %d\n", tname, off);
+ return -EINVAL;
+}
+
+int btf_struct_access(struct bpf_verifier_log *log,
+ const struct bpf_reg_state *reg,
+ int off, int size, enum bpf_access_type atype __maybe_unused,
+ u32 *next_btf_id, enum bpf_type_flag *flag,
+ const char **field_name)
+{
+ const struct btf *btf = reg->btf;
+ enum bpf_type_flag tmp_flag = 0;
+ const struct btf_type *t;
+ u32 id = reg->btf_id;
+ int err;
+
+ while (type_is_alloc(reg->type)) {
+ struct btf_struct_meta *meta;
+ struct btf_record *rec;
+ int i;
+
+ meta = btf_find_struct_meta(btf, id);
+ if (!meta)
+ break;
+ rec = meta->record;
+ for (i = 0; i < rec->cnt; i++) {
+ struct btf_field *field = &rec->fields[i];
+ u32 offset = field->offset;
+ if (off < offset + field->size && offset < off + size) {
+ bpf_log(log,
+ "direct access to %s is disallowed\n",
+ btf_field_type_name(field->type));
+ return -EACCES;
+ }
+ }
+ break;
+ }
+
+ t = btf_type_by_id(btf, id);
+ do {
+ err = btf_struct_walk(log, btf, t, off, size, &id, &tmp_flag, field_name);
+
+ switch (err) {
+ case WALK_PTR:
+ /* For local types, the destination register cannot
+ * become a pointer again.
+ */
+ if (type_is_alloc(reg->type))
+ return SCALAR_VALUE;
+ /* If we found the pointer or scalar on t+off,
+ * we're done.
+ */
+ *next_btf_id = id;
+ *flag = tmp_flag;
+ return PTR_TO_BTF_ID;
+ case WALK_PTR_UNTRUSTED:
+ *flag = MEM_RDONLY | PTR_UNTRUSTED;
+ return PTR_TO_MEM;
+ case WALK_SCALAR:
+ return SCALAR_VALUE;
+ case WALK_STRUCT:
+ /* We found nested struct, so continue the search
+ * by diving in it. At this point the offset is
+ * aligned with the new type, so set it to 0.
+ */
+ t = btf_type_by_id(btf, id);
+ off = 0;
+ break;
+ default:
+ /* It's either error or unknown return value..
+ * scream and leave.
+ */
+ if (WARN_ONCE(err > 0, "unknown btf_struct_walk return value"))
+ return -EINVAL;
+ return err;
+ }
+ } while (t);
+
+ return -EINVAL;
+}
+
+/* Check that two BTF types, each specified as an BTF object + id, are exactly
+ * the same. Trivial ID check is not enough due to module BTFs, because we can
+ * end up with two different module BTFs, but IDs point to the common type in
+ * vmlinux BTF.
+ */
+bool btf_types_are_same(const struct btf *btf1, u32 id1,
+ const struct btf *btf2, u32 id2)
+{
+ if (id1 != id2)
+ return false;
+ if (btf1 == btf2)
+ return true;
+ return btf_type_by_id(btf1, id1) == btf_type_by_id(btf2, id2);
+}
+
+bool btf_struct_ids_match(struct bpf_verifier_log *log,
+ const struct btf *btf, u32 id, int off,
+ const struct btf *need_btf, u32 need_type_id,
+ bool strict)
+{
+ const struct btf_type *type;
+ enum bpf_type_flag flag = 0;
+ int err;
+
+ /* Are we already done? */
+ if (off == 0 && btf_types_are_same(btf, id, need_btf, need_type_id))
+ return true;
+ /* In case of strict type match, we do not walk struct, the top level
+ * type match must succeed. When strict is true, off should have already
+ * been 0.
+ */
+ if (strict)
+ return false;
+again:
+ type = btf_type_by_id(btf, id);
+ if (!type)
+ return false;
+ err = btf_struct_walk(log, btf, type, off, 1, &id, &flag, NULL);
+ if (err != WALK_STRUCT)
+ return false;
+
+ /* We found nested struct object. If it matches
+ * the requested ID, we're done. Otherwise let's
+ * continue the search with offset 0 in the new
+ * type.
+ */
+ if (!btf_types_are_same(btf, id, need_btf, need_type_id)) {
+ off = 0;
+ goto again;
+ }
+
+ return true;
+}
+
+static int __get_type_size(struct btf *btf, u32 btf_id,
+ const struct btf_type **ret_type)
+{
+ const struct btf_type *t;
+
+ *ret_type = btf_type_by_id(btf, 0);
+ if (!btf_id)
+ /* void */
+ return 0;
+ t = btf_type_by_id(btf, btf_id);
+ while (t && btf_type_is_modifier(t))
+ t = btf_type_by_id(btf, t->type);
+ if (!t)
+ return -EINVAL;
+ *ret_type = t;
+ if (btf_type_is_ptr(t))
+ /* kernel size of pointer. Not BPF's size of pointer*/
+ return sizeof(void *);
+ if (btf_type_is_int(t) || btf_is_any_enum(t) || btf_type_is_struct(t))
+ return t->size;
+ return -EINVAL;
+}
+
+static u8 __get_type_fmodel_flags(const struct btf_type *t)
+{
+ u8 flags = 0;
+
+ if (btf_type_is_struct(t))
+ flags |= BTF_FMODEL_STRUCT_ARG;
+ if (btf_type_is_signed_int(t))
+ flags |= BTF_FMODEL_SIGNED_ARG;
+
+ return flags;
+}
+
+int btf_distill_func_proto(struct bpf_verifier_log *log,
+ struct btf *btf,
+ const struct btf_type *func,
+ const char *tname,
+ struct btf_func_model *m)
+{
+ const struct btf_param *args;
+ const struct btf_type *t;
+ u32 i, nargs;
+ int ret;
+
+ if (!func) {
+ /* BTF function prototype doesn't match the verifier types.
+ * Fall back to MAX_BPF_FUNC_REG_ARGS u64 args.
+ */
+ for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) {
+ m->arg_size[i] = 8;
+ m->arg_flags[i] = 0;
+ }
+ m->ret_size = 8;
+ m->ret_flags = 0;
+ m->nr_args = MAX_BPF_FUNC_REG_ARGS;
+ return 0;
+ }
+ args = (const struct btf_param *)(func + 1);
+ nargs = btf_type_vlen(func);
+ if (nargs > MAX_BPF_FUNC_ARGS) {
+ bpf_log(log,
+ "The function %s has %d arguments. Too many.\n",
+ tname, nargs);
+ return -EINVAL;
+ }
+ ret = __get_type_size(btf, func->type, &t);
+ if (ret < 0 || btf_type_is_struct(t)) {
+ bpf_log(log,
+ "The function %s return type %s is unsupported.\n",
+ tname, btf_type_str(t));
+ return -EINVAL;
+ }
+ m->ret_size = ret;
+ m->ret_flags = __get_type_fmodel_flags(t);
+
+ for (i = 0; i < nargs; i++) {
+ if (i == nargs - 1 && args[i].type == 0) {
+ bpf_log(log,
+ "The function %s with variable args is unsupported.\n",
+ tname);
+ return -EINVAL;
+ }
+ ret = __get_type_size(btf, args[i].type, &t);
+
+ /* No support of struct argument size greater than 16 bytes */
+ if (ret < 0 || ret > 16) {
+ bpf_log(log,
+ "The function %s arg%d type %s is unsupported.\n",
+ tname, i, btf_type_str(t));
+ return -EINVAL;
+ }
+ if (ret == 0) {
+ bpf_log(log,
+ "The function %s has malformed void argument.\n",
+ tname);
+ return -EINVAL;
+ }
+ m->arg_size[i] = ret;
+ m->arg_flags[i] = __get_type_fmodel_flags(t);
+ }
+ m->nr_args = nargs;
+ return 0;
+}
+
+/* Compare BTFs of two functions assuming only scalars and pointers to context.
+ * t1 points to BTF_KIND_FUNC in btf1
+ * t2 points to BTF_KIND_FUNC in btf2
+ * Returns:
+ * EINVAL - function prototype mismatch
+ * EFAULT - verifier bug
+ * 0 - 99% match. The last 1% is validated by the verifier.
+ */
+static int btf_check_func_type_match(struct bpf_verifier_log *log,
+ struct btf *btf1, const struct btf_type *t1,
+ struct btf *btf2, const struct btf_type *t2)
+{
+ const struct btf_param *args1, *args2;
+ const char *fn1, *fn2, *s1, *s2;
+ u32 nargs1, nargs2, i;
+
+ fn1 = btf_name_by_offset(btf1, t1->name_off);
+ fn2 = btf_name_by_offset(btf2, t2->name_off);
+
+ if (btf_func_linkage(t1) != BTF_FUNC_GLOBAL) {
+ bpf_log(log, "%s() is not a global function\n", fn1);
+ return -EINVAL;
+ }
+ if (btf_func_linkage(t2) != BTF_FUNC_GLOBAL) {
+ bpf_log(log, "%s() is not a global function\n", fn2);
+ return -EINVAL;
+ }
+
+ t1 = btf_type_by_id(btf1, t1->type);
+ if (!t1 || !btf_type_is_func_proto(t1))
+ return -EFAULT;
+ t2 = btf_type_by_id(btf2, t2->type);
+ if (!t2 || !btf_type_is_func_proto(t2))
+ return -EFAULT;
+
+ args1 = (const struct btf_param *)(t1 + 1);
+ nargs1 = btf_type_vlen(t1);
+ args2 = (const struct btf_param *)(t2 + 1);
+ nargs2 = btf_type_vlen(t2);
+
+ if (nargs1 != nargs2) {
+ bpf_log(log, "%s() has %d args while %s() has %d args\n",
+ fn1, nargs1, fn2, nargs2);
+ return -EINVAL;
+ }
+
+ t1 = btf_type_skip_modifiers(btf1, t1->type, NULL);
+ t2 = btf_type_skip_modifiers(btf2, t2->type, NULL);
+ if (t1->info != t2->info) {
+ bpf_log(log,
+ "Return type %s of %s() doesn't match type %s of %s()\n",
+ btf_type_str(t1), fn1,
+ btf_type_str(t2), fn2);
+ return -EINVAL;
+ }
+
+ for (i = 0; i < nargs1; i++) {
+ t1 = btf_type_skip_modifiers(btf1, args1[i].type, NULL);
+ t2 = btf_type_skip_modifiers(btf2, args2[i].type, NULL);
+
+ if (t1->info != t2->info) {
+ bpf_log(log, "arg%d in %s() is %s while %s() has %s\n",
+ i, fn1, btf_type_str(t1),
+ fn2, btf_type_str(t2));
+ return -EINVAL;
+ }
+ if (btf_type_has_size(t1) && t1->size != t2->size) {
+ bpf_log(log,
+ "arg%d in %s() has size %d while %s() has %d\n",
+ i, fn1, t1->size,
+ fn2, t2->size);
+ return -EINVAL;
+ }
+
+ /* global functions are validated with scalars and pointers
+ * to context only. And only global functions can be replaced.
+ * Hence type check only those types.
+ */
+ if (btf_type_is_int(t1) || btf_is_any_enum(t1))
+ continue;
+ if (!btf_type_is_ptr(t1)) {
+ bpf_log(log,
+ "arg%d in %s() has unrecognized type\n",
+ i, fn1);
+ return -EINVAL;
+ }
+ t1 = btf_type_skip_modifiers(btf1, t1->type, NULL);
+ t2 = btf_type_skip_modifiers(btf2, t2->type, NULL);
+ if (!btf_type_is_struct(t1)) {
+ bpf_log(log,
+ "arg%d in %s() is not a pointer to context\n",
+ i, fn1);
+ return -EINVAL;
+ }
+ if (!btf_type_is_struct(t2)) {
+ bpf_log(log,
+ "arg%d in %s() is not a pointer to context\n",
+ i, fn2);
+ return -EINVAL;
+ }
+ /* This is an optional check to make program writing easier.
+ * Compare names of structs and report an error to the user.
+ * btf_prepare_func_args() already checked that t2 struct
+ * is a context type. btf_prepare_func_args() will check
+ * later that t1 struct is a context type as well.
+ */
+ s1 = btf_name_by_offset(btf1, t1->name_off);
+ s2 = btf_name_by_offset(btf2, t2->name_off);
+ if (strcmp(s1, s2)) {
+ bpf_log(log,
+ "arg%d %s(struct %s *) doesn't match %s(struct %s *)\n",
+ i, fn1, s1, fn2, s2);
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+/* Compare BTFs of given program with BTF of target program */
+int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog,
+ struct btf *btf2, const struct btf_type *t2)
+{
+ struct btf *btf1 = prog->aux->btf;
+ const struct btf_type *t1;
+ u32 btf_id = 0;
+
+ if (!prog->aux->func_info) {
+ bpf_log(log, "Program extension requires BTF\n");
+ return -EINVAL;
+ }
+
+ btf_id = prog->aux->func_info[0].type_id;
+ if (!btf_id)
+ return -EFAULT;
+
+ t1 = btf_type_by_id(btf1, btf_id);
+ if (!t1 || !btf_type_is_func(t1))
+ return -EFAULT;
+
+ return btf_check_func_type_match(log, btf1, t1, btf2, t2);
+}
+
+static bool btf_is_dynptr_ptr(const struct btf *btf, const struct btf_type *t)
+{
+ const char *name;
+
+ t = btf_type_by_id(btf, t->type); /* skip PTR */
+
+ while (btf_type_is_modifier(t))
+ t = btf_type_by_id(btf, t->type);
+
+ /* allow either struct or struct forward declaration */
+ if (btf_type_is_struct(t) ||
+ (btf_type_is_fwd(t) && btf_type_kflag(t) == 0)) {
+ name = btf_str_by_offset(btf, t->name_off);
+ return name && strcmp(name, "bpf_dynptr") == 0;
+ }
+
+ return false;
+}
+
+struct bpf_cand_cache {
+ const char *name;
+ u32 name_len;
+ u16 kind;
+ u16 cnt;
+ struct {
+ const struct btf *btf;
+ u32 id;
+ } cands[];
+};
+
+static DEFINE_MUTEX(cand_cache_mutex);
+
+static struct bpf_cand_cache *
+bpf_core_find_cands(struct bpf_core_ctx *ctx, u32 local_type_id);
+
+static int btf_get_ptr_to_btf_id(struct bpf_verifier_log *log, int arg_idx,
+ const struct btf *btf, const struct btf_type *t)
+{
+ struct bpf_cand_cache *cc;
+ struct bpf_core_ctx ctx = {
+ .btf = btf,
+ .log = log,
+ };
+ u32 kern_type_id, type_id;
+ int err = 0;
+
+ /* skip PTR and modifiers */
+ type_id = t->type;
+ t = btf_type_by_id(btf, t->type);
+ while (btf_type_is_modifier(t)) {
+ type_id = t->type;
+ t = btf_type_by_id(btf, t->type);
+ }
+
+ mutex_lock(&cand_cache_mutex);
+ cc = bpf_core_find_cands(&ctx, type_id);
+ if (IS_ERR(cc)) {
+ err = PTR_ERR(cc);
+ bpf_log(log, "arg#%d reference type('%s %s') candidate matching error: %d\n",
+ arg_idx, btf_type_str(t), __btf_name_by_offset(btf, t->name_off),
+ err);
+ goto cand_cache_unlock;
+ }
+ if (cc->cnt != 1) {
+ bpf_log(log, "arg#%d reference type('%s %s') %s\n",
+ arg_idx, btf_type_str(t), __btf_name_by_offset(btf, t->name_off),
+ cc->cnt == 0 ? "has no matches" : "is ambiguous");
+ err = cc->cnt == 0 ? -ENOENT : -ESRCH;
+ goto cand_cache_unlock;
+ }
+ if (btf_is_module(cc->cands[0].btf)) {
+ bpf_log(log, "arg#%d reference type('%s %s') points to kernel module type (unsupported)\n",
+ arg_idx, btf_type_str(t), __btf_name_by_offset(btf, t->name_off));
+ err = -EOPNOTSUPP;
+ goto cand_cache_unlock;
+ }
+ kern_type_id = cc->cands[0].id;
+
+cand_cache_unlock:
+ mutex_unlock(&cand_cache_mutex);
+ if (err)
+ return err;
+
+ return kern_type_id;
+}
+
+enum btf_arg_tag {
+ ARG_TAG_CTX = BIT_ULL(0),
+ ARG_TAG_NONNULL = BIT_ULL(1),
+ ARG_TAG_TRUSTED = BIT_ULL(2),
+ ARG_TAG_UNTRUSTED = BIT_ULL(3),
+ ARG_TAG_NULLABLE = BIT_ULL(4),
+ ARG_TAG_ARENA = BIT_ULL(5),
+};
+
+/* Process BTF of a function to produce high-level expectation of function
+ * arguments (like ARG_PTR_TO_CTX, or ARG_PTR_TO_MEM, etc). This information
+ * is cached in subprog info for reuse.
+ * Returns:
+ * EFAULT - there is a verifier bug. Abort verification.
+ * EINVAL - cannot convert BTF.
+ * 0 - Successfully processed BTF and constructed argument expectations.
+ */
+int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
+{
+ bool is_global = subprog_aux(env, subprog)->linkage == BTF_FUNC_GLOBAL;
+ struct bpf_subprog_info *sub = subprog_info(env, subprog);
+ struct bpf_verifier_log *log = &env->log;
+ struct bpf_prog *prog = env->prog;
+ enum bpf_prog_type prog_type = prog->type;
+ struct btf *btf = prog->aux->btf;
+ const struct btf_param *args;
+ const struct btf_type *t, *ref_t, *fn_t;
+ u32 i, nargs, btf_id;
+ const char *tname;
+
+ if (sub->args_cached)
+ return 0;
+
+ if (!prog->aux->func_info) {
+ verifier_bug(env, "func_info undefined");
+ return -EFAULT;
+ }
+
+ btf_id = prog->aux->func_info[subprog].type_id;
+ if (!btf_id) {
+ if (!is_global) /* not fatal for static funcs */
+ return -EINVAL;
+ bpf_log(log, "Global functions need valid BTF\n");
+ return -EFAULT;
+ }
+
+ fn_t = btf_type_by_id(btf, btf_id);
+ if (!fn_t || !btf_type_is_func(fn_t)) {
+ /* These checks were already done by the verifier while loading
+ * struct bpf_func_info
+ */
+ bpf_log(log, "BTF of func#%d doesn't point to KIND_FUNC\n",
+ subprog);
+ return -EFAULT;
+ }
+ tname = btf_name_by_offset(btf, fn_t->name_off);
+
+ if (prog->aux->func_info_aux[subprog].unreliable) {
+ verifier_bug(env, "unreliable BTF for function %s()", tname);
+ return -EFAULT;
+ }
+ if (prog_type == BPF_PROG_TYPE_EXT)
+ prog_type = prog->aux->dst_prog->type;
+
+ t = btf_type_by_id(btf, fn_t->type);
+ if (!t || !btf_type_is_func_proto(t)) {
+ bpf_log(log, "Invalid type of function %s()\n", tname);
+ return -EFAULT;
+ }
+ args = (const struct btf_param *)(t + 1);
+ nargs = btf_type_vlen(t);
+ if (nargs > MAX_BPF_FUNC_REG_ARGS) {
+ if (!is_global)
+ return -EINVAL;
+ bpf_log(log, "Global function %s() with %d > %d args. Buggy compiler.\n",
+ tname, nargs, MAX_BPF_FUNC_REG_ARGS);
+ return -EINVAL;
+ }
+ /* check that function returns int, exception cb also requires this */
+ t = btf_type_by_id(btf, t->type);
+ while (btf_type_is_modifier(t))
+ t = btf_type_by_id(btf, t->type);
+ if (!btf_type_is_int(t) && !btf_is_any_enum(t)) {
+ if (!is_global)
+ return -EINVAL;
+ bpf_log(log,
+ "Global function %s() doesn't return scalar. Only those are supported.\n",
+ tname);
+ return -EINVAL;
+ }
+ /* Convert BTF function arguments into verifier types.
+ * Only PTR_TO_CTX and SCALAR are supported atm.
+ */
+ for (i = 0; i < nargs; i++) {
+ u32 tags = 0;
+ int id = 0;
+
+ /* 'arg:<tag>' decl_tag takes precedence over derivation of
+ * register type from BTF type itself
+ */
+ while ((id = btf_find_next_decl_tag(btf, fn_t, i, "arg:", id)) > 0) {
+ const struct btf_type *tag_t = btf_type_by_id(btf, id);
+ const char *tag = __btf_name_by_offset(btf, tag_t->name_off) + 4;
+
+ /* disallow arg tags in static subprogs */
+ if (!is_global) {
+ bpf_log(log, "arg#%d type tag is not supported in static functions\n", i);
+ return -EOPNOTSUPP;
+ }
+
+ if (strcmp(tag, "ctx") == 0) {
+ tags |= ARG_TAG_CTX;
+ } else if (strcmp(tag, "trusted") == 0) {
+ tags |= ARG_TAG_TRUSTED;
+ } else if (strcmp(tag, "untrusted") == 0) {
+ tags |= ARG_TAG_UNTRUSTED;
+ } else if (strcmp(tag, "nonnull") == 0) {
+ tags |= ARG_TAG_NONNULL;
+ } else if (strcmp(tag, "nullable") == 0) {
+ tags |= ARG_TAG_NULLABLE;
+ } else if (strcmp(tag, "arena") == 0) {
+ tags |= ARG_TAG_ARENA;
+ } else {
+ bpf_log(log, "arg#%d has unsupported set of tags\n", i);
+ return -EOPNOTSUPP;
+ }
+ }
+ if (id != -ENOENT) {
+ bpf_log(log, "arg#%d type tag fetching failure: %d\n", i, id);
+ return id;
+ }
+
+ t = btf_type_by_id(btf, args[i].type);
+ while (btf_type_is_modifier(t))
+ t = btf_type_by_id(btf, t->type);
+ if (!btf_type_is_ptr(t))
+ goto skip_pointer;
+
+ if ((tags & ARG_TAG_CTX) || btf_is_prog_ctx_type(log, btf, t, prog_type, i)) {
+ if (tags & ~ARG_TAG_CTX) {
+ bpf_log(log, "arg#%d has invalid combination of tags\n", i);
+ return -EINVAL;
+ }
+ if ((tags & ARG_TAG_CTX) &&
+ btf_validate_prog_ctx_type(log, btf, t, i, prog_type,
+ prog->expected_attach_type))
+ return -EINVAL;
+ sub->args[i].arg_type = ARG_PTR_TO_CTX;
+ continue;
+ }
+ if (btf_is_dynptr_ptr(btf, t)) {
+ if (tags) {
+ bpf_log(log, "arg#%d has invalid combination of tags\n", i);
+ return -EINVAL;
+ }
+ sub->args[i].arg_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY;
+ continue;
+ }
+ if (tags & ARG_TAG_TRUSTED) {
+ int kern_type_id;
+
+ if (tags & ARG_TAG_NONNULL) {
+ bpf_log(log, "arg#%d has invalid combination of tags\n", i);
+ return -EINVAL;
+ }
+
+ kern_type_id = btf_get_ptr_to_btf_id(log, i, btf, t);
+ if (kern_type_id < 0)
+ return kern_type_id;
+
+ sub->args[i].arg_type = ARG_PTR_TO_BTF_ID | PTR_TRUSTED;
+ if (tags & ARG_TAG_NULLABLE)
+ sub->args[i].arg_type |= PTR_MAYBE_NULL;
+ sub->args[i].btf_id = kern_type_id;
+ continue;
+ }
+ if (tags & ARG_TAG_UNTRUSTED) {
+ struct btf *vmlinux_btf;
+ int kern_type_id;
+
+ if (tags & ~ARG_TAG_UNTRUSTED) {
+ bpf_log(log, "arg#%d untrusted cannot be combined with any other tags\n", i);
+ return -EINVAL;
+ }
+
+ ref_t = btf_type_skip_modifiers(btf, t->type, NULL);
+ if (btf_type_is_void(ref_t) || btf_type_is_primitive(ref_t)) {
+ sub->args[i].arg_type = ARG_PTR_TO_MEM | MEM_RDONLY | PTR_UNTRUSTED;
+ sub->args[i].mem_size = 0;
+ continue;
+ }
+
+ kern_type_id = btf_get_ptr_to_btf_id(log, i, btf, t);
+ if (kern_type_id < 0)
+ return kern_type_id;
+
+ vmlinux_btf = bpf_get_btf_vmlinux();
+ ref_t = btf_type_by_id(vmlinux_btf, kern_type_id);
+ if (!btf_type_is_struct(ref_t)) {
+ tname = __btf_name_by_offset(vmlinux_btf, t->name_off);
+ bpf_log(log, "arg#%d has type %s '%s', but only struct or primitive types are allowed\n",
+ i, btf_type_str(ref_t), tname);
+ return -EINVAL;
+ }
+ sub->args[i].arg_type = ARG_PTR_TO_BTF_ID | PTR_UNTRUSTED;
+ sub->args[i].btf_id = kern_type_id;
+ continue;
+ }
+ if (tags & ARG_TAG_ARENA) {
+ if (tags & ~ARG_TAG_ARENA) {
+ bpf_log(log, "arg#%d arena cannot be combined with any other tags\n", i);
+ return -EINVAL;
+ }
+ sub->args[i].arg_type = ARG_PTR_TO_ARENA;
+ continue;
+ }
+ if (is_global) { /* generic user data pointer */
+ u32 mem_size;
+
+ if (tags & ARG_TAG_NULLABLE) {
+ bpf_log(log, "arg#%d has invalid combination of tags\n", i);
+ return -EINVAL;
+ }
+
+ t = btf_type_skip_modifiers(btf, t->type, NULL);
+ ref_t = btf_resolve_size(btf, t, &mem_size);
+ if (IS_ERR(ref_t)) {
+ bpf_log(log, "arg#%d reference type('%s %s') size cannot be determined: %ld\n",
+ i, btf_type_str(t), btf_name_by_offset(btf, t->name_off),
+ PTR_ERR(ref_t));
+ return -EINVAL;
+ }
+
+ sub->args[i].arg_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL;
+ if (tags & ARG_TAG_NONNULL)
+ sub->args[i].arg_type &= ~PTR_MAYBE_NULL;
+ sub->args[i].mem_size = mem_size;
+ continue;
+ }
+
+skip_pointer:
+ if (tags) {
+ bpf_log(log, "arg#%d has pointer tag, but is not a pointer type\n", i);
+ return -EINVAL;
+ }
+ if (btf_type_is_int(t) || btf_is_any_enum(t)) {
+ sub->args[i].arg_type = ARG_ANYTHING;
+ continue;
+ }
+ if (!is_global)
+ return -EINVAL;
+ bpf_log(log, "Arg#%d type %s in %s() is not supported yet.\n",
+ i, btf_type_str(t), tname);
+ return -EINVAL;
+ }
+
+ sub->arg_cnt = nargs;
+ sub->args_cached = true;
+
+ return 0;
+}
+
+static void btf_type_show(const struct btf *btf, u32 type_id, void *obj,
+ struct btf_show *show)
+{
+ const struct btf_type *t = btf_type_by_id(btf, type_id);
+
+ show->btf = btf;
+ memset(&show->state, 0, sizeof(show->state));
+ memset(&show->obj, 0, sizeof(show->obj));
+
+ btf_type_ops(t)->show(btf, t, type_id, obj, 0, show);
+}
+
+__printf(2, 0) static void btf_seq_show(struct btf_show *show, const char *fmt,
+ va_list args)
+{
+ seq_vprintf((struct seq_file *)show->target, fmt, args);
+}
+
+int btf_type_seq_show_flags(const struct btf *btf, u32 type_id,
+ void *obj, struct seq_file *m, u64 flags)
+{
+ struct btf_show sseq;
+
+ sseq.target = m;
+ sseq.showfn = btf_seq_show;
+ sseq.flags = flags;
+
+ btf_type_show(btf, type_id, obj, &sseq);
+
+ return sseq.state.status;
+}
+
+void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj,
+ struct seq_file *m)
+{
+ (void) btf_type_seq_show_flags(btf, type_id, obj, m,
+ BTF_SHOW_NONAME | BTF_SHOW_COMPACT |
+ BTF_SHOW_ZERO | BTF_SHOW_UNSAFE);
+}
+
+struct btf_show_snprintf {
+ struct btf_show show;
+ int len_left; /* space left in string */
+ int len; /* length we would have written */
+};
+
+__printf(2, 0) static void btf_snprintf_show(struct btf_show *show, const char *fmt,
+ va_list args)
+{
+ struct btf_show_snprintf *ssnprintf = (struct btf_show_snprintf *)show;
+ int len;
+
+ len = vsnprintf(show->target, ssnprintf->len_left, fmt, args);
+
+ if (len < 0) {
+ ssnprintf->len_left = 0;
+ ssnprintf->len = len;
+ } else if (len >= ssnprintf->len_left) {
+ /* no space, drive on to get length we would have written */
+ ssnprintf->len_left = 0;
+ ssnprintf->len += len;
+ } else {
+ ssnprintf->len_left -= len;
+ ssnprintf->len += len;
+ show->target += len;
+ }
+}
+
+int btf_type_snprintf_show(const struct btf *btf, u32 type_id, void *obj,
+ char *buf, int len, u64 flags)
+{
+ struct btf_show_snprintf ssnprintf;
+
+ ssnprintf.show.target = buf;
+ ssnprintf.show.flags = flags;
+ ssnprintf.show.showfn = btf_snprintf_show;
+ ssnprintf.len_left = len;
+ ssnprintf.len = 0;
+
+ btf_type_show(btf, type_id, obj, (struct btf_show *)&ssnprintf);
+
+ /* If we encountered an error, return it. */
+ if (ssnprintf.show.state.status)
+ return ssnprintf.show.state.status;
+
+ /* Otherwise return length we would have written */
+ return ssnprintf.len;
+}
+
+#ifdef CONFIG_PROC_FS
+static void bpf_btf_show_fdinfo(struct seq_file *m, struct file *filp)
+{
+ const struct btf *btf = filp->private_data;
+
+ seq_printf(m, "btf_id:\t%u\n", btf->id);
+}
+#endif
+
+static int btf_release(struct inode *inode, struct file *filp)
+{
+ btf_put(filp->private_data);
+ return 0;
+}
+
+const struct file_operations btf_fops = {
+#ifdef CONFIG_PROC_FS
+ .show_fdinfo = bpf_btf_show_fdinfo,
+#endif
+ .release = btf_release,
+};
+
+static int __btf_new_fd(struct btf *btf)
+{
+ return anon_inode_getfd("btf", &btf_fops, btf, O_RDONLY | O_CLOEXEC);
+}
+
+int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
+{
+ struct btf *btf;
+ int ret;
+
+ btf = btf_parse(attr, uattr, uattr_size);
+ if (IS_ERR(btf))
+ return PTR_ERR(btf);
+
+ ret = btf_alloc_id(btf);
+ if (ret) {
+ btf_free(btf);
+ return ret;
+ }
+
+ /*
+ * The BTF ID is published to the userspace.
+ * All BTF free must go through call_rcu() from
+ * now on (i.e. free by calling btf_put()).
+ */
+
+ ret = __btf_new_fd(btf);
+ if (ret < 0)
+ btf_put(btf);
+
+ return ret;
+}
+
+struct btf *btf_get_by_fd(int fd)
+{
+ struct btf *btf;
+ CLASS(fd, f)(fd);
+
+ btf = __btf_get_by_fd(f);
+ if (!IS_ERR(btf))
+ refcount_inc(&btf->refcnt);
+
+ return btf;
+}
+
+int btf_get_info_by_fd(const struct btf *btf,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ struct bpf_btf_info __user *uinfo;
+ struct bpf_btf_info info;
+ u32 info_copy, btf_copy;
+ void __user *ubtf;
+ char __user *uname;
+ u32 uinfo_len, uname_len, name_len;
+ int ret = 0;
+
+ uinfo = u64_to_user_ptr(attr->info.info);
+ uinfo_len = attr->info.info_len;
+
+ info_copy = min_t(u32, uinfo_len, sizeof(info));
+ memset(&info, 0, sizeof(info));
+ if (copy_from_user(&info, uinfo, info_copy))
+ return -EFAULT;
+
+ info.id = btf->id;
+ ubtf = u64_to_user_ptr(info.btf);
+ btf_copy = min_t(u32, btf->data_size, info.btf_size);
+ if (copy_to_user(ubtf, btf->data, btf_copy))
+ return -EFAULT;
+ info.btf_size = btf->data_size;
+
+ info.kernel_btf = btf->kernel_btf;
+
+ uname = u64_to_user_ptr(info.name);
+ uname_len = info.name_len;
+ if (!uname ^ !uname_len)
+ return -EINVAL;
+
+ name_len = strlen(btf->name);
+ info.name_len = name_len;
+
+ if (uname) {
+ if (uname_len >= name_len + 1) {
+ if (copy_to_user(uname, btf->name, name_len + 1))
+ return -EFAULT;
+ } else {
+ char zero = '\0';
+
+ if (copy_to_user(uname, btf->name, uname_len - 1))
+ return -EFAULT;
+ if (put_user(zero, uname + uname_len - 1))
+ return -EFAULT;
+ /* let user-space know about too short buffer */
+ ret = -ENOSPC;
+ }
+ }
+
+ if (copy_to_user(uinfo, &info, info_copy) ||
+ put_user(info_copy, &uattr->info.info_len))
+ return -EFAULT;
+
+ return ret;
+}
+
+int btf_get_fd_by_id(u32 id)
+{
+ struct btf *btf;
+ int fd;
+
+ rcu_read_lock();
+ btf = idr_find(&btf_idr, id);
+ if (!btf || !refcount_inc_not_zero(&btf->refcnt))
+ btf = ERR_PTR(-ENOENT);
+ rcu_read_unlock();
+
+ if (IS_ERR(btf))
+ return PTR_ERR(btf);
+
+ fd = __btf_new_fd(btf);
+ if (fd < 0)
+ btf_put(btf);
+
+ return fd;
+}
+
+u32 btf_obj_id(const struct btf *btf)
+{
+ return btf->id;
+}
+
+bool btf_is_kernel(const struct btf *btf)
+{
+ return btf->kernel_btf;
+}
+
+bool btf_is_module(const struct btf *btf)
+{
+ return btf->kernel_btf && strcmp(btf->name, "vmlinux") != 0;
+}
+
+enum {
+ BTF_MODULE_F_LIVE = (1 << 0),
+};
+
+#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
+struct btf_module {
+ struct list_head list;
+ struct module *module;
+ struct btf *btf;
+ struct bin_attribute *sysfs_attr;
+ int flags;
+};
+
+static LIST_HEAD(btf_modules);
+static DEFINE_MUTEX(btf_module_mutex);
+
+static void purge_cand_cache(struct btf *btf);
+
+static int btf_module_notify(struct notifier_block *nb, unsigned long op,
+ void *module)
+{
+ struct btf_module *btf_mod, *tmp;
+ struct module *mod = module;
+ struct btf *btf;
+ int err = 0;
+
+ if (mod->btf_data_size == 0 ||
+ (op != MODULE_STATE_COMING && op != MODULE_STATE_LIVE &&
+ op != MODULE_STATE_GOING))
+ goto out;
+
+ switch (op) {
+ case MODULE_STATE_COMING:
+ btf_mod = kzalloc(sizeof(*btf_mod), GFP_KERNEL);
+ if (!btf_mod) {
+ err = -ENOMEM;
+ goto out;
+ }
+ btf = btf_parse_module(mod->name, mod->btf_data, mod->btf_data_size,
+ mod->btf_base_data, mod->btf_base_data_size);
+ if (IS_ERR(btf)) {
+ kfree(btf_mod);
+ if (!IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH)) {
+ pr_warn("failed to validate module [%s] BTF: %ld\n",
+ mod->name, PTR_ERR(btf));
+ err = PTR_ERR(btf);
+ } else {
+ pr_warn_once("Kernel module BTF mismatch detected, BTF debug info may be unavailable for some modules\n");
+ }
+ goto out;
+ }
+ err = btf_alloc_id(btf);
+ if (err) {
+ btf_free(btf);
+ kfree(btf_mod);
+ goto out;
+ }
+
+ purge_cand_cache(NULL);
+ mutex_lock(&btf_module_mutex);
+ btf_mod->module = module;
+ btf_mod->btf = btf;
+ list_add(&btf_mod->list, &btf_modules);
+ mutex_unlock(&btf_module_mutex);
+
+ if (IS_ENABLED(CONFIG_SYSFS)) {
+ struct bin_attribute *attr;
+
+ attr = kzalloc(sizeof(*attr), GFP_KERNEL);
+ if (!attr)
+ goto out;
+
+ sysfs_bin_attr_init(attr);
+ attr->attr.name = btf->name;
+ attr->attr.mode = 0444;
+ attr->size = btf->data_size;
+ attr->private = btf->data;
+ attr->read = sysfs_bin_attr_simple_read;
+
+ err = sysfs_create_bin_file(btf_kobj, attr);
+ if (err) {
+ pr_warn("failed to register module [%s] BTF in sysfs: %d\n",
+ mod->name, err);
+ kfree(attr);
+ err = 0;
+ goto out;
+ }
+
+ btf_mod->sysfs_attr = attr;
+ }
+
+ break;
+ case MODULE_STATE_LIVE:
+ mutex_lock(&btf_module_mutex);
+ list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) {
+ if (btf_mod->module != module)
+ continue;
+
+ btf_mod->flags |= BTF_MODULE_F_LIVE;
+ break;
+ }
+ mutex_unlock(&btf_module_mutex);
+ break;
+ case MODULE_STATE_GOING:
+ mutex_lock(&btf_module_mutex);
+ list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) {
+ if (btf_mod->module != module)
+ continue;
+
+ list_del(&btf_mod->list);
+ if (btf_mod->sysfs_attr)
+ sysfs_remove_bin_file(btf_kobj, btf_mod->sysfs_attr);
+ purge_cand_cache(btf_mod->btf);
+ btf_put(btf_mod->btf);
+ kfree(btf_mod->sysfs_attr);
+ kfree(btf_mod);
+ break;
+ }
+ mutex_unlock(&btf_module_mutex);
+ break;
+ }
+out:
+ return notifier_from_errno(err);
+}
+
+static struct notifier_block btf_module_nb = {
+ .notifier_call = btf_module_notify,
+};
+
+static int __init btf_module_init(void)
+{
+ register_module_notifier(&btf_module_nb);
+ return 0;
+}
+
+fs_initcall(btf_module_init);
+#endif /* CONFIG_DEBUG_INFO_BTF_MODULES */
+
+struct module *btf_try_get_module(const struct btf *btf)
+{
+ struct module *res = NULL;
+#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
+ struct btf_module *btf_mod, *tmp;
+
+ mutex_lock(&btf_module_mutex);
+ list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) {
+ if (btf_mod->btf != btf)
+ continue;
+
+ /* We must only consider module whose __init routine has
+ * finished, hence we must check for BTF_MODULE_F_LIVE flag,
+ * which is set from the notifier callback for
+ * MODULE_STATE_LIVE.
+ */
+ if ((btf_mod->flags & BTF_MODULE_F_LIVE) && try_module_get(btf_mod->module))
+ res = btf_mod->module;
+
+ break;
+ }
+ mutex_unlock(&btf_module_mutex);
+#endif
+
+ return res;
+}
+
+/* Returns struct btf corresponding to the struct module.
+ * This function can return NULL or ERR_PTR.
+ */
+static struct btf *btf_get_module_btf(const struct module *module)
+{
+#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
+ struct btf_module *btf_mod, *tmp;
+#endif
+ struct btf *btf = NULL;
+
+ if (!module) {
+ btf = bpf_get_btf_vmlinux();
+ if (!IS_ERR_OR_NULL(btf))
+ btf_get(btf);
+ return btf;
+ }
+
+#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
+ mutex_lock(&btf_module_mutex);
+ list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) {
+ if (btf_mod->module != module)
+ continue;
+
+ btf_get(btf_mod->btf);
+ btf = btf_mod->btf;
+ break;
+ }
+ mutex_unlock(&btf_module_mutex);
+#endif
+
+ return btf;
+}
+
+static int check_btf_kconfigs(const struct module *module, const char *feature)
+{
+ if (!module && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) {
+ pr_err("missing vmlinux BTF, cannot register %s\n", feature);
+ return -ENOENT;
+ }
+ if (module && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES))
+ pr_warn("missing module BTF, cannot register %s\n", feature);
+ return 0;
+}
+
+BPF_CALL_4(bpf_btf_find_by_name_kind, char *, name, int, name_sz, u32, kind, int, flags)
+{
+ struct btf *btf = NULL;
+ int btf_obj_fd = 0;
+ long ret;
+
+ if (flags)
+ return -EINVAL;
+
+ if (name_sz <= 1 || name[name_sz - 1])
+ return -EINVAL;
+
+ ret = bpf_find_btf_id(name, kind, &btf);
+ if (ret > 0 && btf_is_module(btf)) {
+ btf_obj_fd = __btf_new_fd(btf);
+ if (btf_obj_fd < 0) {
+ btf_put(btf);
+ return btf_obj_fd;
+ }
+ return ret | (((u64)btf_obj_fd) << 32);
+ }
+ if (ret > 0)
+ btf_put(btf);
+ return ret;
+}
+
+const struct bpf_func_proto bpf_btf_find_by_name_kind_proto = {
+ .func = bpf_btf_find_by_name_kind,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg2_type = ARG_CONST_SIZE,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BTF_ID_LIST_GLOBAL(btf_tracing_ids, MAX_BTF_TRACING_TYPE)
+#define BTF_TRACING_TYPE(name, type) BTF_ID(struct, type)
+BTF_TRACING_TYPE_xxx
+#undef BTF_TRACING_TYPE
+
+/* Validate well-formedness of iter argument type.
+ * On success, return positive BTF ID of iter state's STRUCT type.
+ * On error, negative error is returned.
+ */
+int btf_check_iter_arg(struct btf *btf, const struct btf_type *func, int arg_idx)
+{
+ const struct btf_param *arg;
+ const struct btf_type *t;
+ const char *name;
+ int btf_id;
+
+ if (btf_type_vlen(func) <= arg_idx)
+ return -EINVAL;
+
+ arg = &btf_params(func)[arg_idx];
+ t = btf_type_skip_modifiers(btf, arg->type, NULL);
+ if (!t || !btf_type_is_ptr(t))
+ return -EINVAL;
+ t = btf_type_skip_modifiers(btf, t->type, &btf_id);
+ if (!t || !__btf_type_is_struct(t))
+ return -EINVAL;
+
+ name = btf_name_by_offset(btf, t->name_off);
+ if (!name || strncmp(name, ITER_PREFIX, sizeof(ITER_PREFIX) - 1))
+ return -EINVAL;
+
+ return btf_id;
+}
+
+static int btf_check_iter_kfuncs(struct btf *btf, const char *func_name,
+ const struct btf_type *func, u32 func_flags)
+{
+ u32 flags = func_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY);
+ const char *sfx, *iter_name;
+ const struct btf_type *t;
+ char exp_name[128];
+ u32 nr_args;
+ int btf_id;
+
+ /* exactly one of KF_ITER_{NEW,NEXT,DESTROY} can be set */
+ if (!flags || (flags & (flags - 1)))
+ return -EINVAL;
+
+ /* any BPF iter kfunc should have `struct bpf_iter_<type> *` first arg */
+ nr_args = btf_type_vlen(func);
+ if (nr_args < 1)
+ return -EINVAL;
+
+ btf_id = btf_check_iter_arg(btf, func, 0);
+ if (btf_id < 0)
+ return btf_id;
+
+ /* sizeof(struct bpf_iter_<type>) should be a multiple of 8 to
+ * fit nicely in stack slots
+ */
+ t = btf_type_by_id(btf, btf_id);
+ if (t->size == 0 || (t->size % 8))
+ return -EINVAL;
+
+ /* validate bpf_iter_<type>_{new,next,destroy}(struct bpf_iter_<type> *)
+ * naming pattern
+ */
+ iter_name = btf_name_by_offset(btf, t->name_off) + sizeof(ITER_PREFIX) - 1;
+ if (flags & KF_ITER_NEW)
+ sfx = "new";
+ else if (flags & KF_ITER_NEXT)
+ sfx = "next";
+ else /* (flags & KF_ITER_DESTROY) */
+ sfx = "destroy";
+
+ snprintf(exp_name, sizeof(exp_name), "bpf_iter_%s_%s", iter_name, sfx);
+ if (strcmp(func_name, exp_name))
+ return -EINVAL;
+
+ /* only iter constructor should have extra arguments */
+ if (!(flags & KF_ITER_NEW) && nr_args != 1)
+ return -EINVAL;
+
+ if (flags & KF_ITER_NEXT) {
+ /* bpf_iter_<type>_next() should return pointer */
+ t = btf_type_skip_modifiers(btf, func->type, NULL);
+ if (!t || !btf_type_is_ptr(t))
+ return -EINVAL;
+ }
+
+ if (flags & KF_ITER_DESTROY) {
+ /* bpf_iter_<type>_destroy() should return void */
+ t = btf_type_by_id(btf, func->type);
+ if (!t || !btf_type_is_void(t))
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int btf_check_kfunc_protos(struct btf *btf, u32 func_id, u32 func_flags)
+{
+ const struct btf_type *func;
+ const char *func_name;
+ int err;
+
+ /* any kfunc should be FUNC -> FUNC_PROTO */
+ func = btf_type_by_id(btf, func_id);
+ if (!func || !btf_type_is_func(func))
+ return -EINVAL;
+
+ /* sanity check kfunc name */
+ func_name = btf_name_by_offset(btf, func->name_off);
+ if (!func_name || !func_name[0])
+ return -EINVAL;
+
+ func = btf_type_by_id(btf, func->type);
+ if (!func || !btf_type_is_func_proto(func))
+ return -EINVAL;
+
+ if (func_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY)) {
+ err = btf_check_iter_kfuncs(btf, func_name, func, func_flags);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+/* Kernel Function (kfunc) BTF ID set registration API */
+
+static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
+ const struct btf_kfunc_id_set *kset)
+{
+ struct btf_kfunc_hook_filter *hook_filter;
+ struct btf_id_set8 *add_set = kset->set;
+ bool vmlinux_set = !btf_is_module(btf);
+ bool add_filter = !!kset->filter;
+ struct btf_kfunc_set_tab *tab;
+ struct btf_id_set8 *set;
+ u32 set_cnt, i;
+ int ret;
+
+ if (hook >= BTF_KFUNC_HOOK_MAX) {
+ ret = -EINVAL;
+ goto end;
+ }
+
+ if (!add_set->cnt)
+ return 0;
+
+ tab = btf->kfunc_set_tab;
+
+ if (tab && add_filter) {
+ u32 i;
+
+ hook_filter = &tab->hook_filters[hook];
+ for (i = 0; i < hook_filter->nr_filters; i++) {
+ if (hook_filter->filters[i] == kset->filter) {
+ add_filter = false;
+ break;
+ }
+ }
+
+ if (add_filter && hook_filter->nr_filters == BTF_KFUNC_FILTER_MAX_CNT) {
+ ret = -E2BIG;
+ goto end;
+ }
+ }
+
+ if (!tab) {
+ tab = kzalloc(sizeof(*tab), GFP_KERNEL | __GFP_NOWARN);
+ if (!tab)
+ return -ENOMEM;
+ btf->kfunc_set_tab = tab;
+ }
+
+ set = tab->sets[hook];
+ /* Warn when register_btf_kfunc_id_set is called twice for the same hook
+ * for module sets.
+ */
+ if (WARN_ON_ONCE(set && !vmlinux_set)) {
+ ret = -EINVAL;
+ goto end;
+ }
+
+ /* In case of vmlinux sets, there may be more than one set being
+ * registered per hook. To create a unified set, we allocate a new set
+ * and concatenate all individual sets being registered. While each set
+ * is individually sorted, they may become unsorted when concatenated,
+ * hence re-sorting the final set again is required to make binary
+ * searching the set using btf_id_set8_contains function work.
+ *
+ * For module sets, we need to allocate as we may need to relocate
+ * BTF ids.
+ */
+ set_cnt = set ? set->cnt : 0;
+
+ if (set_cnt > U32_MAX - add_set->cnt) {
+ ret = -EOVERFLOW;
+ goto end;
+ }
+
+ if (set_cnt + add_set->cnt > BTF_KFUNC_SET_MAX_CNT) {
+ ret = -E2BIG;
+ goto end;
+ }
+
+ /* Grow set */
+ set = krealloc(tab->sets[hook],
+ struct_size(set, pairs, set_cnt + add_set->cnt),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!set) {
+ ret = -ENOMEM;
+ goto end;
+ }
+
+ /* For newly allocated set, initialize set->cnt to 0 */
+ if (!tab->sets[hook])
+ set->cnt = 0;
+ tab->sets[hook] = set;
+
+ /* Concatenate the two sets */
+ memcpy(set->pairs + set->cnt, add_set->pairs, add_set->cnt * sizeof(set->pairs[0]));
+ /* Now that the set is copied, update with relocated BTF ids */
+ for (i = set->cnt; i < set->cnt + add_set->cnt; i++)
+ set->pairs[i].id = btf_relocate_id(btf, set->pairs[i].id);
+
+ set->cnt += add_set->cnt;
+
+ sort(set->pairs, set->cnt, sizeof(set->pairs[0]), btf_id_cmp_func, NULL);
+
+ if (add_filter) {
+ hook_filter = &tab->hook_filters[hook];
+ hook_filter->filters[hook_filter->nr_filters++] = kset->filter;
+ }
+ return 0;
+end:
+ btf_free_kfunc_set_tab(btf);
+ return ret;
+}
+
+static u32 *__btf_kfunc_id_set_contains(const struct btf *btf,
+ enum btf_kfunc_hook hook,
+ u32 kfunc_btf_id,
+ const struct bpf_prog *prog)
+{
+ struct btf_kfunc_hook_filter *hook_filter;
+ struct btf_id_set8 *set;
+ u32 *id, i;
+
+ if (hook >= BTF_KFUNC_HOOK_MAX)
+ return NULL;
+ if (!btf->kfunc_set_tab)
+ return NULL;
+ hook_filter = &btf->kfunc_set_tab->hook_filters[hook];
+ for (i = 0; i < hook_filter->nr_filters; i++) {
+ if (hook_filter->filters[i](prog, kfunc_btf_id))
+ return NULL;
+ }
+ set = btf->kfunc_set_tab->sets[hook];
+ if (!set)
+ return NULL;
+ id = btf_id_set8_contains(set, kfunc_btf_id);
+ if (!id)
+ return NULL;
+ /* The flags for BTF ID are located next to it */
+ return id + 1;
+}
+
+static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
+{
+ switch (prog_type) {
+ case BPF_PROG_TYPE_UNSPEC:
+ return BTF_KFUNC_HOOK_COMMON;
+ case BPF_PROG_TYPE_XDP:
+ return BTF_KFUNC_HOOK_XDP;
+ case BPF_PROG_TYPE_SCHED_CLS:
+ return BTF_KFUNC_HOOK_TC;
+ case BPF_PROG_TYPE_STRUCT_OPS:
+ return BTF_KFUNC_HOOK_STRUCT_OPS;
+ case BPF_PROG_TYPE_TRACING:
+ case BPF_PROG_TYPE_TRACEPOINT:
+ case BPF_PROG_TYPE_PERF_EVENT:
+ case BPF_PROG_TYPE_LSM:
+ return BTF_KFUNC_HOOK_TRACING;
+ case BPF_PROG_TYPE_SYSCALL:
+ return BTF_KFUNC_HOOK_SYSCALL;
+ case BPF_PROG_TYPE_CGROUP_SKB:
+ case BPF_PROG_TYPE_CGROUP_SOCK:
+ case BPF_PROG_TYPE_CGROUP_DEVICE:
+ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
+ case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+ case BPF_PROG_TYPE_CGROUP_SYSCTL:
+ case BPF_PROG_TYPE_SOCK_OPS:
+ return BTF_KFUNC_HOOK_CGROUP;
+ case BPF_PROG_TYPE_SCHED_ACT:
+ return BTF_KFUNC_HOOK_SCHED_ACT;
+ case BPF_PROG_TYPE_SK_SKB:
+ return BTF_KFUNC_HOOK_SK_SKB;
+ case BPF_PROG_TYPE_SOCKET_FILTER:
+ return BTF_KFUNC_HOOK_SOCKET_FILTER;
+ case BPF_PROG_TYPE_LWT_OUT:
+ case BPF_PROG_TYPE_LWT_IN:
+ case BPF_PROG_TYPE_LWT_XMIT:
+ case BPF_PROG_TYPE_LWT_SEG6LOCAL:
+ return BTF_KFUNC_HOOK_LWT;
+ case BPF_PROG_TYPE_NETFILTER:
+ return BTF_KFUNC_HOOK_NETFILTER;
+ case BPF_PROG_TYPE_KPROBE:
+ return BTF_KFUNC_HOOK_KPROBE;
+ default:
+ return BTF_KFUNC_HOOK_MAX;
+ }
+}
+
+/* Caution:
+ * Reference to the module (obtained using btf_try_get_module) corresponding to
+ * the struct btf *MUST* be held when calling this function from verifier
+ * context. This is usually true as we stash references in prog's kfunc_btf_tab;
+ * keeping the reference for the duration of the call provides the necessary
+ * protection for looking up a well-formed btf->kfunc_set_tab.
+ */
+u32 *btf_kfunc_id_set_contains(const struct btf *btf,
+ u32 kfunc_btf_id,
+ const struct bpf_prog *prog)
+{
+ enum bpf_prog_type prog_type = resolve_prog_type(prog);
+ enum btf_kfunc_hook hook;
+ u32 *kfunc_flags;
+
+ kfunc_flags = __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_COMMON, kfunc_btf_id, prog);
+ if (kfunc_flags)
+ return kfunc_flags;
+
+ hook = bpf_prog_type_to_kfunc_hook(prog_type);
+ return __btf_kfunc_id_set_contains(btf, hook, kfunc_btf_id, prog);
+}
+
+u32 *btf_kfunc_is_modify_return(const struct btf *btf, u32 kfunc_btf_id,
+ const struct bpf_prog *prog)
+{
+ return __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_FMODRET, kfunc_btf_id, prog);
+}
+
+static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook,
+ const struct btf_kfunc_id_set *kset)
+{
+ struct btf *btf;
+ int ret, i;
+
+ btf = btf_get_module_btf(kset->owner);
+ if (!btf)
+ return check_btf_kconfigs(kset->owner, "kfunc");
+ if (IS_ERR(btf))
+ return PTR_ERR(btf);
+
+ for (i = 0; i < kset->set->cnt; i++) {
+ ret = btf_check_kfunc_protos(btf, btf_relocate_id(btf, kset->set->pairs[i].id),
+ kset->set->pairs[i].flags);
+ if (ret)
+ goto err_out;
+ }
+
+ ret = btf_populate_kfunc_set(btf, hook, kset);
+
+err_out:
+ btf_put(btf);
+ return ret;
+}
+
+/* This function must be invoked only from initcalls/module init functions */
+int register_btf_kfunc_id_set(enum bpf_prog_type prog_type,
+ const struct btf_kfunc_id_set *kset)
+{
+ enum btf_kfunc_hook hook;
+
+ /* All kfuncs need to be tagged as such in BTF.
+ * WARN() for initcall registrations that do not check errors.
+ */
+ if (!(kset->set->flags & BTF_SET8_KFUNCS)) {
+ WARN_ON(!kset->owner);
+ return -EINVAL;
+ }
+
+ hook = bpf_prog_type_to_kfunc_hook(prog_type);
+ return __register_btf_kfunc_id_set(hook, kset);
+}
+EXPORT_SYMBOL_GPL(register_btf_kfunc_id_set);
+
+/* This function must be invoked only from initcalls/module init functions */
+int register_btf_fmodret_id_set(const struct btf_kfunc_id_set *kset)
+{
+ return __register_btf_kfunc_id_set(BTF_KFUNC_HOOK_FMODRET, kset);
+}
+EXPORT_SYMBOL_GPL(register_btf_fmodret_id_set);
+
+s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id)
+{
+ struct btf_id_dtor_kfunc_tab *tab = btf->dtor_kfunc_tab;
+ struct btf_id_dtor_kfunc *dtor;
+
+ if (!tab)
+ return -ENOENT;
+ /* Even though the size of tab->dtors[0] is > sizeof(u32), we only need
+ * to compare the first u32 with btf_id, so we can reuse btf_id_cmp_func.
+ */
+ BUILD_BUG_ON(offsetof(struct btf_id_dtor_kfunc, btf_id) != 0);
+ dtor = bsearch(&btf_id, tab->dtors, tab->cnt, sizeof(tab->dtors[0]), btf_id_cmp_func);
+ if (!dtor)
+ return -ENOENT;
+ return dtor->kfunc_btf_id;
+}
+
+static int btf_check_dtor_kfuncs(struct btf *btf, const struct btf_id_dtor_kfunc *dtors, u32 cnt)
+{
+ const struct btf_type *dtor_func, *dtor_func_proto, *t;
+ const struct btf_param *args;
+ s32 dtor_btf_id;
+ u32 nr_args, i;
+
+ for (i = 0; i < cnt; i++) {
+ dtor_btf_id = btf_relocate_id(btf, dtors[i].kfunc_btf_id);
+
+ dtor_func = btf_type_by_id(btf, dtor_btf_id);
+ if (!dtor_func || !btf_type_is_func(dtor_func))
+ return -EINVAL;
+
+ dtor_func_proto = btf_type_by_id(btf, dtor_func->type);
+ if (!dtor_func_proto || !btf_type_is_func_proto(dtor_func_proto))
+ return -EINVAL;
+
+ /* Make sure the prototype of the destructor kfunc is 'void func(type *)' */
+ t = btf_type_by_id(btf, dtor_func_proto->type);
+ if (!t || !btf_type_is_void(t))
+ return -EINVAL;
+
+ nr_args = btf_type_vlen(dtor_func_proto);
+ if (nr_args != 1)
+ return -EINVAL;
+ args = btf_params(dtor_func_proto);
+ t = btf_type_by_id(btf, args[0].type);
+ /* Allow any pointer type, as width on targets Linux supports
+ * will be same for all pointer types (i.e. sizeof(void *))
+ */
+ if (!t || !btf_type_is_ptr(t))
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/* This function must be invoked only from initcalls/module init functions */
+int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_cnt,
+ struct module *owner)
+{
+ struct btf_id_dtor_kfunc_tab *tab;
+ struct btf *btf;
+ u32 tab_cnt, i;
+ int ret;
+
+ btf = btf_get_module_btf(owner);
+ if (!btf)
+ return check_btf_kconfigs(owner, "dtor kfuncs");
+ if (IS_ERR(btf))
+ return PTR_ERR(btf);
+
+ if (add_cnt >= BTF_DTOR_KFUNC_MAX_CNT) {
+ pr_err("cannot register more than %d kfunc destructors\n", BTF_DTOR_KFUNC_MAX_CNT);
+ ret = -E2BIG;
+ goto end;
+ }
+
+ /* Ensure that the prototype of dtor kfuncs being registered is sane */
+ ret = btf_check_dtor_kfuncs(btf, dtors, add_cnt);
+ if (ret < 0)
+ goto end;
+
+ tab = btf->dtor_kfunc_tab;
+ /* Only one call allowed for modules */
+ if (WARN_ON_ONCE(tab && btf_is_module(btf))) {
+ ret = -EINVAL;
+ goto end;
+ }
+
+ tab_cnt = tab ? tab->cnt : 0;
+ if (tab_cnt > U32_MAX - add_cnt) {
+ ret = -EOVERFLOW;
+ goto end;
+ }
+ if (tab_cnt + add_cnt >= BTF_DTOR_KFUNC_MAX_CNT) {
+ pr_err("cannot register more than %d kfunc destructors\n", BTF_DTOR_KFUNC_MAX_CNT);
+ ret = -E2BIG;
+ goto end;
+ }
+
+ tab = krealloc(btf->dtor_kfunc_tab,
+ struct_size(tab, dtors, tab_cnt + add_cnt),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!tab) {
+ ret = -ENOMEM;
+ goto end;
+ }
+
+ if (!btf->dtor_kfunc_tab)
+ tab->cnt = 0;
+ btf->dtor_kfunc_tab = tab;
+
+ memcpy(tab->dtors + tab->cnt, dtors, add_cnt * sizeof(tab->dtors[0]));
+
+ /* remap BTF ids based on BTF relocation (if any) */
+ for (i = tab_cnt; i < tab_cnt + add_cnt; i++) {
+ tab->dtors[i].btf_id = btf_relocate_id(btf, tab->dtors[i].btf_id);
+ tab->dtors[i].kfunc_btf_id = btf_relocate_id(btf, tab->dtors[i].kfunc_btf_id);
+ }
+
+ tab->cnt += add_cnt;
+
+ sort(tab->dtors, tab->cnt, sizeof(tab->dtors[0]), btf_id_cmp_func, NULL);
+
+end:
+ if (ret)
+ btf_free_dtor_kfunc_tab(btf);
+ btf_put(btf);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(register_btf_id_dtor_kfuncs);
+
+#define MAX_TYPES_ARE_COMPAT_DEPTH 2
+
+/* Check local and target types for compatibility. This check is used for
+ * type-based CO-RE relocations and follow slightly different rules than
+ * field-based relocations. This function assumes that root types were already
+ * checked for name match. Beyond that initial root-level name check, names
+ * are completely ignored. Compatibility rules are as follows:
+ * - any two STRUCTs/UNIONs/FWDs/ENUMs/INTs/ENUM64s are considered compatible, but
+ * kind should match for local and target types (i.e., STRUCT is not
+ * compatible with UNION);
+ * - for ENUMs/ENUM64s, the size is ignored;
+ * - for INT, size and signedness are ignored;
+ * - for ARRAY, dimensionality is ignored, element types are checked for
+ * compatibility recursively;
+ * - CONST/VOLATILE/RESTRICT modifiers are ignored;
+ * - TYPEDEFs/PTRs are compatible if types they pointing to are compatible;
+ * - FUNC_PROTOs are compatible if they have compatible signature: same
+ * number of input args and compatible return and argument types.
+ * These rules are not set in stone and probably will be adjusted as we get
+ * more experience with using BPF CO-RE relocations.
+ */
+int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id,
+ const struct btf *targ_btf, __u32 targ_id)
+{
+ return __bpf_core_types_are_compat(local_btf, local_id, targ_btf, targ_id,
+ MAX_TYPES_ARE_COMPAT_DEPTH);
+}
+
+#define MAX_TYPES_MATCH_DEPTH 2
+
+int bpf_core_types_match(const struct btf *local_btf, u32 local_id,
+ const struct btf *targ_btf, u32 targ_id)
+{
+ return __bpf_core_types_match(local_btf, local_id, targ_btf, targ_id, false,
+ MAX_TYPES_MATCH_DEPTH);
+}
+
+static bool bpf_core_is_flavor_sep(const char *s)
+{
+ /* check X___Y name pattern, where X and Y are not underscores */
+ return s[0] != '_' && /* X */
+ s[1] == '_' && s[2] == '_' && s[3] == '_' && /* ___ */
+ s[4] != '_'; /* Y */
+}
+
+size_t bpf_core_essential_name_len(const char *name)
+{
+ size_t n = strlen(name);
+ int i;
+
+ for (i = n - 5; i >= 0; i--) {
+ if (bpf_core_is_flavor_sep(name + i))
+ return i + 1;
+ }
+ return n;
+}
+
+static void bpf_free_cands(struct bpf_cand_cache *cands)
+{
+ if (!cands->cnt)
+ /* empty candidate array was allocated on stack */
+ return;
+ kfree(cands);
+}
+
+static void bpf_free_cands_from_cache(struct bpf_cand_cache *cands)
+{
+ kfree(cands->name);
+ kfree(cands);
+}
+
+#define VMLINUX_CAND_CACHE_SIZE 31
+static struct bpf_cand_cache *vmlinux_cand_cache[VMLINUX_CAND_CACHE_SIZE];
+
+#define MODULE_CAND_CACHE_SIZE 31
+static struct bpf_cand_cache *module_cand_cache[MODULE_CAND_CACHE_SIZE];
+
+static void __print_cand_cache(struct bpf_verifier_log *log,
+ struct bpf_cand_cache **cache,
+ int cache_size)
+{
+ struct bpf_cand_cache *cc;
+ int i, j;
+
+ for (i = 0; i < cache_size; i++) {
+ cc = cache[i];
+ if (!cc)
+ continue;
+ bpf_log(log, "[%d]%s(", i, cc->name);
+ for (j = 0; j < cc->cnt; j++) {
+ bpf_log(log, "%d", cc->cands[j].id);
+ if (j < cc->cnt - 1)
+ bpf_log(log, " ");
+ }
+ bpf_log(log, "), ");
+ }
+}
+
+static void print_cand_cache(struct bpf_verifier_log *log)
+{
+ mutex_lock(&cand_cache_mutex);
+ bpf_log(log, "vmlinux_cand_cache:");
+ __print_cand_cache(log, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE);
+ bpf_log(log, "\nmodule_cand_cache:");
+ __print_cand_cache(log, module_cand_cache, MODULE_CAND_CACHE_SIZE);
+ bpf_log(log, "\n");
+ mutex_unlock(&cand_cache_mutex);
+}
+
+static u32 hash_cands(struct bpf_cand_cache *cands)
+{
+ return jhash(cands->name, cands->name_len, 0);
+}
+
+static struct bpf_cand_cache *check_cand_cache(struct bpf_cand_cache *cands,
+ struct bpf_cand_cache **cache,
+ int cache_size)
+{
+ struct bpf_cand_cache *cc = cache[hash_cands(cands) % cache_size];
+
+ if (cc && cc->name_len == cands->name_len &&
+ !strncmp(cc->name, cands->name, cands->name_len))
+ return cc;
+ return NULL;
+}
+
+static size_t sizeof_cands(int cnt)
+{
+ return offsetof(struct bpf_cand_cache, cands[cnt]);
+}
+
+static struct bpf_cand_cache *populate_cand_cache(struct bpf_cand_cache *cands,
+ struct bpf_cand_cache **cache,
+ int cache_size)
+{
+ struct bpf_cand_cache **cc = &cache[hash_cands(cands) % cache_size], *new_cands;
+
+ if (*cc) {
+ bpf_free_cands_from_cache(*cc);
+ *cc = NULL;
+ }
+ new_cands = kmemdup(cands, sizeof_cands(cands->cnt), GFP_KERNEL_ACCOUNT);
+ if (!new_cands) {
+ bpf_free_cands(cands);
+ return ERR_PTR(-ENOMEM);
+ }
+ /* strdup the name, since it will stay in cache.
+ * the cands->name points to strings in prog's BTF and the prog can be unloaded.
+ */
+ new_cands->name = kmemdup_nul(cands->name, cands->name_len, GFP_KERNEL_ACCOUNT);
+ bpf_free_cands(cands);
+ if (!new_cands->name) {
+ kfree(new_cands);
+ return ERR_PTR(-ENOMEM);
+ }
+ *cc = new_cands;
+ return new_cands;
+}
+
+#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
+static void __purge_cand_cache(struct btf *btf, struct bpf_cand_cache **cache,
+ int cache_size)
+{
+ struct bpf_cand_cache *cc;
+ int i, j;
+
+ for (i = 0; i < cache_size; i++) {
+ cc = cache[i];
+ if (!cc)
+ continue;
+ if (!btf) {
+ /* when new module is loaded purge all of module_cand_cache,
+ * since new module might have candidates with the name
+ * that matches cached cands.
+ */
+ bpf_free_cands_from_cache(cc);
+ cache[i] = NULL;
+ continue;
+ }
+ /* when module is unloaded purge cache entries
+ * that match module's btf
+ */
+ for (j = 0; j < cc->cnt; j++)
+ if (cc->cands[j].btf == btf) {
+ bpf_free_cands_from_cache(cc);
+ cache[i] = NULL;
+ break;
+ }
+ }
+
+}
+
+static void purge_cand_cache(struct btf *btf)
+{
+ mutex_lock(&cand_cache_mutex);
+ __purge_cand_cache(btf, module_cand_cache, MODULE_CAND_CACHE_SIZE);
+ mutex_unlock(&cand_cache_mutex);
+}
+#endif
+
+static struct bpf_cand_cache *
+bpf_core_add_cands(struct bpf_cand_cache *cands, const struct btf *targ_btf,
+ int targ_start_id)
+{
+ struct bpf_cand_cache *new_cands;
+ const struct btf_type *t;
+ const char *targ_name;
+ size_t targ_essent_len;
+ int n, i;
+
+ n = btf_nr_types(targ_btf);
+ for (i = targ_start_id; i < n; i++) {
+ t = btf_type_by_id(targ_btf, i);
+ if (btf_kind(t) != cands->kind)
+ continue;
+
+ targ_name = btf_name_by_offset(targ_btf, t->name_off);
+ if (!targ_name)
+ continue;
+
+ /* the resched point is before strncmp to make sure that search
+ * for non-existing name will have a chance to schedule().
+ */
+ cond_resched();
+
+ if (strncmp(cands->name, targ_name, cands->name_len) != 0)
+ continue;
+
+ targ_essent_len = bpf_core_essential_name_len(targ_name);
+ if (targ_essent_len != cands->name_len)
+ continue;
+
+ /* most of the time there is only one candidate for a given kind+name pair */
+ new_cands = kmalloc(sizeof_cands(cands->cnt + 1), GFP_KERNEL_ACCOUNT);
+ if (!new_cands) {
+ bpf_free_cands(cands);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ memcpy(new_cands, cands, sizeof_cands(cands->cnt));
+ bpf_free_cands(cands);
+ cands = new_cands;
+ cands->cands[cands->cnt].btf = targ_btf;
+ cands->cands[cands->cnt].id = i;
+ cands->cnt++;
+ }
+ return cands;
+}
+
+static struct bpf_cand_cache *
+bpf_core_find_cands(struct bpf_core_ctx *ctx, u32 local_type_id)
+{
+ struct bpf_cand_cache *cands, *cc, local_cand = {};
+ const struct btf *local_btf = ctx->btf;
+ const struct btf_type *local_type;
+ const struct btf *main_btf;
+ size_t local_essent_len;
+ struct btf *mod_btf;
+ const char *name;
+ int id;
+
+ main_btf = bpf_get_btf_vmlinux();
+ if (IS_ERR(main_btf))
+ return ERR_CAST(main_btf);
+ if (!main_btf)
+ return ERR_PTR(-EINVAL);
+
+ local_type = btf_type_by_id(local_btf, local_type_id);
+ if (!local_type)
+ return ERR_PTR(-EINVAL);
+
+ name = btf_name_by_offset(local_btf, local_type->name_off);
+ if (str_is_empty(name))
+ return ERR_PTR(-EINVAL);
+ local_essent_len = bpf_core_essential_name_len(name);
+
+ cands = &local_cand;
+ cands->name = name;
+ cands->kind = btf_kind(local_type);
+ cands->name_len = local_essent_len;
+
+ cc = check_cand_cache(cands, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE);
+ /* cands is a pointer to stack here */
+ if (cc) {
+ if (cc->cnt)
+ return cc;
+ goto check_modules;
+ }
+
+ /* Attempt to find target candidates in vmlinux BTF first */
+ cands = bpf_core_add_cands(cands, main_btf, 1);
+ if (IS_ERR(cands))
+ return ERR_CAST(cands);
+
+ /* cands is a pointer to kmalloced memory here if cands->cnt > 0 */
+
+ /* populate cache even when cands->cnt == 0 */
+ cc = populate_cand_cache(cands, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE);
+ if (IS_ERR(cc))
+ return ERR_CAST(cc);
+
+ /* if vmlinux BTF has any candidate, don't go for module BTFs */
+ if (cc->cnt)
+ return cc;
+
+check_modules:
+ /* cands is a pointer to stack here and cands->cnt == 0 */
+ cc = check_cand_cache(cands, module_cand_cache, MODULE_CAND_CACHE_SIZE);
+ if (cc)
+ /* if cache has it return it even if cc->cnt == 0 */
+ return cc;
+
+ /* If candidate is not found in vmlinux's BTF then search in module's BTFs */
+ spin_lock_bh(&btf_idr_lock);
+ idr_for_each_entry(&btf_idr, mod_btf, id) {
+ if (!btf_is_module(mod_btf))
+ continue;
+ /* linear search could be slow hence unlock/lock
+ * the IDR to avoiding holding it for too long
+ */
+ btf_get(mod_btf);
+ spin_unlock_bh(&btf_idr_lock);
+ cands = bpf_core_add_cands(cands, mod_btf, btf_nr_types(main_btf));
+ btf_put(mod_btf);
+ if (IS_ERR(cands))
+ return ERR_CAST(cands);
+ spin_lock_bh(&btf_idr_lock);
+ }
+ spin_unlock_bh(&btf_idr_lock);
+ /* cands is a pointer to kmalloced memory here if cands->cnt > 0
+ * or pointer to stack if cands->cnd == 0.
+ * Copy it into the cache even when cands->cnt == 0 and
+ * return the result.
+ */
+ return populate_cand_cache(cands, module_cand_cache, MODULE_CAND_CACHE_SIZE);
+}
+
+int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo,
+ int relo_idx, void *insn)
+{
+ bool need_cands = relo->kind != BPF_CORE_TYPE_ID_LOCAL;
+ struct bpf_core_cand_list cands = {};
+ struct bpf_core_relo_res targ_res;
+ struct bpf_core_spec *specs;
+ const struct btf_type *type;
+ int err;
+
+ /* ~4k of temp memory necessary to convert LLVM spec like "0:1:0:5"
+ * into arrays of btf_ids of struct fields and array indices.
+ */
+ specs = kcalloc(3, sizeof(*specs), GFP_KERNEL_ACCOUNT);
+ if (!specs)
+ return -ENOMEM;
+
+ type = btf_type_by_id(ctx->btf, relo->type_id);
+ if (!type) {
+ bpf_log(ctx->log, "relo #%u: bad type id %u\n",
+ relo_idx, relo->type_id);
+ kfree(specs);
+ return -EINVAL;
+ }
+
+ if (need_cands) {
+ struct bpf_cand_cache *cc;
+ int i;
+
+ mutex_lock(&cand_cache_mutex);
+ cc = bpf_core_find_cands(ctx, relo->type_id);
+ if (IS_ERR(cc)) {
+ bpf_log(ctx->log, "target candidate search failed for %d\n",
+ relo->type_id);
+ err = PTR_ERR(cc);
+ goto out;
+ }
+ if (cc->cnt) {
+ cands.cands = kcalloc(cc->cnt, sizeof(*cands.cands), GFP_KERNEL_ACCOUNT);
+ if (!cands.cands) {
+ err = -ENOMEM;
+ goto out;
+ }
+ }
+ for (i = 0; i < cc->cnt; i++) {
+ bpf_log(ctx->log,
+ "CO-RE relocating %s %s: found target candidate [%d]\n",
+ btf_kind_str[cc->kind], cc->name, cc->cands[i].id);
+ cands.cands[i].btf = cc->cands[i].btf;
+ cands.cands[i].id = cc->cands[i].id;
+ }
+ cands.len = cc->cnt;
+ /* cand_cache_mutex needs to span the cache lookup and
+ * copy of btf pointer into bpf_core_cand_list,
+ * since module can be unloaded while bpf_core_calc_relo_insn
+ * is working with module's btf.
+ */
+ }
+
+ err = bpf_core_calc_relo_insn((void *)ctx->log, relo, relo_idx, ctx->btf, &cands, specs,
+ &targ_res);
+ if (err)
+ goto out;
+
+ err = bpf_core_patch_insn((void *)ctx->log, insn, relo->insn_off / 8, relo, relo_idx,
+ &targ_res);
+
+out:
+ kfree(specs);
+ if (need_cands) {
+ kfree(cands.cands);
+ mutex_unlock(&cand_cache_mutex);
+ if (ctx->log->level & BPF_LOG_LEVEL2)
+ print_cand_cache(ctx->log);
+ }
+ return err;
+}
+
+bool btf_nested_type_is_trusted(struct bpf_verifier_log *log,
+ const struct bpf_reg_state *reg,
+ const char *field_name, u32 btf_id, const char *suffix)
+{
+ struct btf *btf = reg->btf;
+ const struct btf_type *walk_type, *safe_type;
+ const char *tname;
+ char safe_tname[64];
+ long ret, safe_id;
+ const struct btf_member *member;
+ u32 i;
+
+ walk_type = btf_type_by_id(btf, reg->btf_id);
+ if (!walk_type)
+ return false;
+
+ tname = btf_name_by_offset(btf, walk_type->name_off);
+
+ ret = snprintf(safe_tname, sizeof(safe_tname), "%s%s", tname, suffix);
+ if (ret >= sizeof(safe_tname))
+ return false;
+
+ safe_id = btf_find_by_name_kind(btf, safe_tname, BTF_INFO_KIND(walk_type->info));
+ if (safe_id < 0)
+ return false;
+
+ safe_type = btf_type_by_id(btf, safe_id);
+ if (!safe_type)
+ return false;
+
+ for_each_member(i, safe_type, member) {
+ const char *m_name = __btf_name_by_offset(btf, member->name_off);
+ const struct btf_type *mtype = btf_type_by_id(btf, member->type);
+ u32 id;
+
+ if (!btf_type_is_ptr(mtype))
+ continue;
+
+ btf_type_skip_modifiers(btf, mtype->type, &id);
+ /* If we match on both type and name, the field is considered trusted. */
+ if (btf_id == id && !strcmp(field_name, m_name))
+ return true;
+ }
+
+ return false;
+}
+
+bool btf_type_ids_nocast_alias(struct bpf_verifier_log *log,
+ const struct btf *reg_btf, u32 reg_id,
+ const struct btf *arg_btf, u32 arg_id)
+{
+ const char *reg_name, *arg_name, *search_needle;
+ const struct btf_type *reg_type, *arg_type;
+ int reg_len, arg_len, cmp_len;
+ size_t pattern_len = sizeof(NOCAST_ALIAS_SUFFIX) - sizeof(char);
+
+ reg_type = btf_type_by_id(reg_btf, reg_id);
+ if (!reg_type)
+ return false;
+
+ arg_type = btf_type_by_id(arg_btf, arg_id);
+ if (!arg_type)
+ return false;
+
+ reg_name = btf_name_by_offset(reg_btf, reg_type->name_off);
+ arg_name = btf_name_by_offset(arg_btf, arg_type->name_off);
+
+ reg_len = strlen(reg_name);
+ arg_len = strlen(arg_name);
+
+ /* Exactly one of the two type names may be suffixed with ___init, so
+ * if the strings are the same size, they can't possibly be no-cast
+ * aliases of one another. If you have two of the same type names, e.g.
+ * they're both nf_conn___init, it would be improper to return true
+ * because they are _not_ no-cast aliases, they are the same type.
+ */
+ if (reg_len == arg_len)
+ return false;
+
+ /* Either of the two names must be the other name, suffixed with ___init. */
+ if ((reg_len != arg_len + pattern_len) &&
+ (arg_len != reg_len + pattern_len))
+ return false;
+
+ if (reg_len < arg_len) {
+ search_needle = strstr(arg_name, NOCAST_ALIAS_SUFFIX);
+ cmp_len = reg_len;
+ } else {
+ search_needle = strstr(reg_name, NOCAST_ALIAS_SUFFIX);
+ cmp_len = arg_len;
+ }
+
+ if (!search_needle)
+ return false;
+
+ /* ___init suffix must come at the end of the name */
+ if (*(search_needle + pattern_len) != '\0')
+ return false;
+
+ return !strncmp(reg_name, arg_name, cmp_len);
+}
+
+#ifdef CONFIG_BPF_JIT
+static int
+btf_add_struct_ops(struct btf *btf, struct bpf_struct_ops *st_ops,
+ struct bpf_verifier_log *log)
+{
+ struct btf_struct_ops_tab *tab, *new_tab;
+ int i, err;
+
+ tab = btf->struct_ops_tab;
+ if (!tab) {
+ tab = kzalloc(struct_size(tab, ops, 4), GFP_KERNEL);
+ if (!tab)
+ return -ENOMEM;
+ tab->capacity = 4;
+ btf->struct_ops_tab = tab;
+ }
+
+ for (i = 0; i < tab->cnt; i++)
+ if (tab->ops[i].st_ops == st_ops)
+ return -EEXIST;
+
+ if (tab->cnt == tab->capacity) {
+ new_tab = krealloc(tab,
+ struct_size(tab, ops, tab->capacity * 2),
+ GFP_KERNEL);
+ if (!new_tab)
+ return -ENOMEM;
+ tab = new_tab;
+ tab->capacity *= 2;
+ btf->struct_ops_tab = tab;
+ }
+
+ tab->ops[btf->struct_ops_tab->cnt].st_ops = st_ops;
+
+ err = bpf_struct_ops_desc_init(&tab->ops[btf->struct_ops_tab->cnt], btf, log);
+ if (err)
+ return err;
+
+ btf->struct_ops_tab->cnt++;
+
+ return 0;
+}
+
+const struct bpf_struct_ops_desc *
+bpf_struct_ops_find_value(struct btf *btf, u32 value_id)
+{
+ const struct bpf_struct_ops_desc *st_ops_list;
+ unsigned int i;
+ u32 cnt;
+
+ if (!value_id)
+ return NULL;
+ if (!btf->struct_ops_tab)
+ return NULL;
+
+ cnt = btf->struct_ops_tab->cnt;
+ st_ops_list = btf->struct_ops_tab->ops;
+ for (i = 0; i < cnt; i++) {
+ if (st_ops_list[i].value_id == value_id)
+ return &st_ops_list[i];
+ }
+
+ return NULL;
+}
+
+const struct bpf_struct_ops_desc *
+bpf_struct_ops_find(struct btf *btf, u32 type_id)
+{
+ const struct bpf_struct_ops_desc *st_ops_list;
+ unsigned int i;
+ u32 cnt;
+
+ if (!type_id)
+ return NULL;
+ if (!btf->struct_ops_tab)
+ return NULL;
+
+ cnt = btf->struct_ops_tab->cnt;
+ st_ops_list = btf->struct_ops_tab->ops;
+ for (i = 0; i < cnt; i++) {
+ if (st_ops_list[i].type_id == type_id)
+ return &st_ops_list[i];
+ }
+
+ return NULL;
+}
+
+int __register_bpf_struct_ops(struct bpf_struct_ops *st_ops)
+{
+ struct bpf_verifier_log *log;
+ struct btf *btf;
+ int err = 0;
+
+ btf = btf_get_module_btf(st_ops->owner);
+ if (!btf)
+ return check_btf_kconfigs(st_ops->owner, "struct_ops");
+ if (IS_ERR(btf))
+ return PTR_ERR(btf);
+
+ log = kzalloc(sizeof(*log), GFP_KERNEL | __GFP_NOWARN);
+ if (!log) {
+ err = -ENOMEM;
+ goto errout;
+ }
+
+ log->level = BPF_LOG_KERNEL;
+
+ err = btf_add_struct_ops(btf, st_ops, log);
+
+errout:
+ kfree(log);
+ btf_put(btf);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(__register_bpf_struct_ops);
+#endif
+
+bool btf_param_match_suffix(const struct btf *btf,
+ const struct btf_param *arg,
+ const char *suffix)
+{
+ int suffix_len = strlen(suffix), len;
+ const char *param_name;
+
+ /* In the future, this can be ported to use BTF tagging */
+ param_name = btf_name_by_offset(btf, arg->name_off);
+ if (str_is_empty(param_name))
+ return false;
+ len = strlen(param_name);
+ if (len <= suffix_len)
+ return false;
+ param_name += len - suffix_len;
+ return !strncmp(param_name, suffix, suffix_len);
+}
diff --git a/kernel/bpf/btf_iter.c b/kernel/bpf/btf_iter.c
new file mode 100644
index 000000000000..0e2c66a52df9
--- /dev/null
+++ b/kernel/bpf/btf_iter.c
@@ -0,0 +1,2 @@
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+#include "../../tools/lib/bpf/btf_iter.c"
diff --git a/kernel/bpf/btf_relocate.c b/kernel/bpf/btf_relocate.c
new file mode 100644
index 000000000000..c12ccbf66507
--- /dev/null
+++ b/kernel/bpf/btf_relocate.c
@@ -0,0 +1,2 @@
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+#include "../../tools/lib/bpf/btf_relocate.c"
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
new file mode 100644
index 000000000000..69988af44b37
--- /dev/null
+++ b/kernel/bpf/cgroup.c
@@ -0,0 +1,2761 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Functions to manage eBPF programs attached to cgroups
+ *
+ * Copyright (c) 2016 Daniel Mack
+ */
+
+#include <linux/kernel.h>
+#include <linux/atomic.h>
+#include <linux/cgroup.h>
+#include <linux/filter.h>
+#include <linux/slab.h>
+#include <linux/sysctl.h>
+#include <linux/string.h>
+#include <linux/bpf.h>
+#include <linux/bpf-cgroup.h>
+#include <linux/bpf_lsm.h>
+#include <linux/bpf_verifier.h>
+#include <net/sock.h>
+#include <net/bpf_sk_storage.h>
+
+#include "../cgroup/cgroup-internal.h"
+
+DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE);
+EXPORT_SYMBOL(cgroup_bpf_enabled_key);
+
+/*
+ * cgroup bpf destruction makes heavy use of work items and there can be a lot
+ * of concurrent destructions. Use a separate workqueue so that cgroup bpf
+ * destruction work items don't end up filling up max_active of system_percpu_wq
+ * which may lead to deadlock.
+ */
+static struct workqueue_struct *cgroup_bpf_destroy_wq;
+
+static int __init cgroup_bpf_wq_init(void)
+{
+ cgroup_bpf_destroy_wq = alloc_workqueue("cgroup_bpf_destroy",
+ WQ_PERCPU, 1);
+ if (!cgroup_bpf_destroy_wq)
+ panic("Failed to alloc workqueue for cgroup bpf destroy.\n");
+ return 0;
+}
+core_initcall(cgroup_bpf_wq_init);
+
+static int cgroup_bpf_lifetime_notify(struct notifier_block *nb,
+ unsigned long action, void *data);
+
+static struct notifier_block cgroup_bpf_lifetime_nb = {
+ .notifier_call = cgroup_bpf_lifetime_notify,
+};
+
+void __init cgroup_bpf_lifetime_notifier_init(void)
+{
+ BUG_ON(blocking_notifier_chain_register(&cgroup_lifetime_notifier,
+ &cgroup_bpf_lifetime_nb));
+}
+
+/* __always_inline is necessary to prevent indirect call through run_prog
+ * function pointer.
+ */
+static __always_inline int
+bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
+ enum cgroup_bpf_attach_type atype,
+ const void *ctx, bpf_prog_run_fn run_prog,
+ int retval, u32 *ret_flags)
+{
+ const struct bpf_prog_array_item *item;
+ const struct bpf_prog *prog;
+ const struct bpf_prog_array *array;
+ struct bpf_run_ctx *old_run_ctx;
+ struct bpf_cg_run_ctx run_ctx;
+ u32 func_ret;
+
+ run_ctx.retval = retval;
+ rcu_read_lock_dont_migrate();
+ array = rcu_dereference(cgrp->effective[atype]);
+ item = &array->items[0];
+ old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
+ while ((prog = READ_ONCE(item->prog))) {
+ run_ctx.prog_item = item;
+ func_ret = run_prog(prog, ctx);
+ if (ret_flags) {
+ *(ret_flags) |= (func_ret >> 1);
+ func_ret &= 1;
+ }
+ if (!func_ret && !IS_ERR_VALUE((long)run_ctx.retval))
+ run_ctx.retval = -EPERM;
+ item++;
+ }
+ bpf_reset_run_ctx(old_run_ctx);
+ rcu_read_unlock_migrate();
+ return run_ctx.retval;
+}
+
+unsigned int __cgroup_bpf_run_lsm_sock(const void *ctx,
+ const struct bpf_insn *insn)
+{
+ const struct bpf_prog *shim_prog;
+ struct sock *sk;
+ struct cgroup *cgrp;
+ int ret = 0;
+ u64 *args;
+
+ args = (u64 *)ctx;
+ sk = (void *)(unsigned long)args[0];
+ /*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/
+ shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi));
+
+ cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+ if (likely(cgrp))
+ ret = bpf_prog_run_array_cg(&cgrp->bpf,
+ shim_prog->aux->cgroup_atype,
+ ctx, bpf_prog_run, 0, NULL);
+ return ret;
+}
+
+unsigned int __cgroup_bpf_run_lsm_socket(const void *ctx,
+ const struct bpf_insn *insn)
+{
+ const struct bpf_prog *shim_prog;
+ struct socket *sock;
+ struct cgroup *cgrp;
+ int ret = 0;
+ u64 *args;
+
+ args = (u64 *)ctx;
+ sock = (void *)(unsigned long)args[0];
+ /*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/
+ shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi));
+
+ cgrp = sock_cgroup_ptr(&sock->sk->sk_cgrp_data);
+ if (likely(cgrp))
+ ret = bpf_prog_run_array_cg(&cgrp->bpf,
+ shim_prog->aux->cgroup_atype,
+ ctx, bpf_prog_run, 0, NULL);
+ return ret;
+}
+
+unsigned int __cgroup_bpf_run_lsm_current(const void *ctx,
+ const struct bpf_insn *insn)
+{
+ const struct bpf_prog *shim_prog;
+ struct cgroup *cgrp;
+ int ret = 0;
+
+ /*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/
+ shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi));
+
+ /* We rely on trampoline's __bpf_prog_enter_lsm_cgroup to grab RCU read lock. */
+ cgrp = task_dfl_cgroup(current);
+ if (likely(cgrp))
+ ret = bpf_prog_run_array_cg(&cgrp->bpf,
+ shim_prog->aux->cgroup_atype,
+ ctx, bpf_prog_run, 0, NULL);
+ return ret;
+}
+
+#ifdef CONFIG_BPF_LSM
+struct cgroup_lsm_atype {
+ u32 attach_btf_id;
+ int refcnt;
+};
+
+static struct cgroup_lsm_atype cgroup_lsm_atype[CGROUP_LSM_NUM];
+
+static enum cgroup_bpf_attach_type
+bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id)
+{
+ int i;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (attach_type != BPF_LSM_CGROUP)
+ return to_cgroup_bpf_attach_type(attach_type);
+
+ for (i = 0; i < ARRAY_SIZE(cgroup_lsm_atype); i++)
+ if (cgroup_lsm_atype[i].attach_btf_id == attach_btf_id)
+ return CGROUP_LSM_START + i;
+
+ for (i = 0; i < ARRAY_SIZE(cgroup_lsm_atype); i++)
+ if (cgroup_lsm_atype[i].attach_btf_id == 0)
+ return CGROUP_LSM_START + i;
+
+ return -E2BIG;
+
+}
+
+void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype)
+{
+ int i = cgroup_atype - CGROUP_LSM_START;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ WARN_ON_ONCE(cgroup_lsm_atype[i].attach_btf_id &&
+ cgroup_lsm_atype[i].attach_btf_id != attach_btf_id);
+
+ cgroup_lsm_atype[i].attach_btf_id = attach_btf_id;
+ cgroup_lsm_atype[i].refcnt++;
+}
+
+void bpf_cgroup_atype_put(int cgroup_atype)
+{
+ int i = cgroup_atype - CGROUP_LSM_START;
+
+ cgroup_lock();
+ if (--cgroup_lsm_atype[i].refcnt <= 0)
+ cgroup_lsm_atype[i].attach_btf_id = 0;
+ WARN_ON_ONCE(cgroup_lsm_atype[i].refcnt < 0);
+ cgroup_unlock();
+}
+#else
+static enum cgroup_bpf_attach_type
+bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id)
+{
+ if (attach_type != BPF_LSM_CGROUP)
+ return to_cgroup_bpf_attach_type(attach_type);
+ return -EOPNOTSUPP;
+}
+#endif /* CONFIG_BPF_LSM */
+
+static void cgroup_bpf_offline(struct cgroup *cgrp)
+{
+ cgroup_get(cgrp);
+ percpu_ref_kill(&cgrp->bpf.refcnt);
+}
+
+static void bpf_cgroup_storages_free(struct bpf_cgroup_storage *storages[])
+{
+ enum bpf_cgroup_storage_type stype;
+
+ for_each_cgroup_storage_type(stype)
+ bpf_cgroup_storage_free(storages[stype]);
+}
+
+static int bpf_cgroup_storages_alloc(struct bpf_cgroup_storage *storages[],
+ struct bpf_cgroup_storage *new_storages[],
+ enum bpf_attach_type type,
+ struct bpf_prog *prog,
+ struct cgroup *cgrp)
+{
+ enum bpf_cgroup_storage_type stype;
+ struct bpf_cgroup_storage_key key;
+ struct bpf_map *map;
+
+ key.cgroup_inode_id = cgroup_id(cgrp);
+ key.attach_type = type;
+
+ for_each_cgroup_storage_type(stype) {
+ map = prog->aux->cgroup_storage[stype];
+ if (!map)
+ continue;
+
+ storages[stype] = cgroup_storage_lookup((void *)map, &key, false);
+ if (storages[stype])
+ continue;
+
+ storages[stype] = bpf_cgroup_storage_alloc(prog, stype);
+ if (IS_ERR(storages[stype])) {
+ bpf_cgroup_storages_free(new_storages);
+ return -ENOMEM;
+ }
+
+ new_storages[stype] = storages[stype];
+ }
+
+ return 0;
+}
+
+static void bpf_cgroup_storages_assign(struct bpf_cgroup_storage *dst[],
+ struct bpf_cgroup_storage *src[])
+{
+ enum bpf_cgroup_storage_type stype;
+
+ for_each_cgroup_storage_type(stype)
+ dst[stype] = src[stype];
+}
+
+static void bpf_cgroup_storages_link(struct bpf_cgroup_storage *storages[],
+ struct cgroup *cgrp,
+ enum bpf_attach_type attach_type)
+{
+ enum bpf_cgroup_storage_type stype;
+
+ for_each_cgroup_storage_type(stype)
+ bpf_cgroup_storage_link(storages[stype], cgrp, attach_type);
+}
+
+/* Called when bpf_cgroup_link is auto-detached from dying cgroup.
+ * It drops cgroup and bpf_prog refcounts, and marks bpf_link as defunct. It
+ * doesn't free link memory, which will eventually be done by bpf_link's
+ * release() callback, when its last FD is closed.
+ */
+static void bpf_cgroup_link_auto_detach(struct bpf_cgroup_link *link)
+{
+ cgroup_put(link->cgroup);
+ link->cgroup = NULL;
+}
+
+/**
+ * cgroup_bpf_release() - put references of all bpf programs and
+ * release all cgroup bpf data
+ * @work: work structure embedded into the cgroup to modify
+ */
+static void cgroup_bpf_release(struct work_struct *work)
+{
+ struct cgroup *p, *cgrp = container_of(work, struct cgroup,
+ bpf.release_work);
+ struct bpf_prog_array *old_array;
+ struct list_head *storages = &cgrp->bpf.storages;
+ struct bpf_cgroup_storage *storage, *stmp;
+
+ unsigned int atype;
+
+ cgroup_lock();
+
+ for (atype = 0; atype < ARRAY_SIZE(cgrp->bpf.progs); atype++) {
+ struct hlist_head *progs = &cgrp->bpf.progs[atype];
+ struct bpf_prog_list *pl;
+ struct hlist_node *pltmp;
+
+ hlist_for_each_entry_safe(pl, pltmp, progs, node) {
+ hlist_del(&pl->node);
+ if (pl->prog) {
+ if (pl->prog->expected_attach_type == BPF_LSM_CGROUP)
+ bpf_trampoline_unlink_cgroup_shim(pl->prog);
+ bpf_prog_put(pl->prog);
+ }
+ if (pl->link) {
+ if (pl->link->link.prog->expected_attach_type == BPF_LSM_CGROUP)
+ bpf_trampoline_unlink_cgroup_shim(pl->link->link.prog);
+ bpf_cgroup_link_auto_detach(pl->link);
+ }
+ kfree(pl);
+ static_branch_dec(&cgroup_bpf_enabled_key[atype]);
+ }
+ old_array = rcu_dereference_protected(
+ cgrp->bpf.effective[atype],
+ lockdep_is_held(&cgroup_mutex));
+ bpf_prog_array_free(old_array);
+ }
+
+ list_for_each_entry_safe(storage, stmp, storages, list_cg) {
+ bpf_cgroup_storage_unlink(storage);
+ bpf_cgroup_storage_free(storage);
+ }
+
+ cgroup_unlock();
+
+ for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
+ cgroup_bpf_put(p);
+
+ percpu_ref_exit(&cgrp->bpf.refcnt);
+ cgroup_put(cgrp);
+}
+
+/**
+ * cgroup_bpf_release_fn() - callback used to schedule releasing
+ * of bpf cgroup data
+ * @ref: percpu ref counter structure
+ */
+static void cgroup_bpf_release_fn(struct percpu_ref *ref)
+{
+ struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt);
+
+ INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release);
+ queue_work(cgroup_bpf_destroy_wq, &cgrp->bpf.release_work);
+}
+
+/* Get underlying bpf_prog of bpf_prog_list entry, regardless if it's through
+ * link or direct prog.
+ */
+static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl)
+{
+ if (pl->prog)
+ return pl->prog;
+ if (pl->link)
+ return pl->link->link.prog;
+ return NULL;
+}
+
+/* count number of elements in the list.
+ * it's slow but the list cannot be long
+ */
+static u32 prog_list_length(struct hlist_head *head, int *preorder_cnt)
+{
+ struct bpf_prog_list *pl;
+ u32 cnt = 0;
+
+ hlist_for_each_entry(pl, head, node) {
+ if (!prog_list_prog(pl))
+ continue;
+ if (preorder_cnt && (pl->flags & BPF_F_PREORDER))
+ (*preorder_cnt)++;
+ cnt++;
+ }
+ return cnt;
+}
+
+/* if parent has non-overridable prog attached,
+ * disallow attaching new programs to the descendent cgroup.
+ * if parent has overridable or multi-prog, allow attaching
+ */
+static bool hierarchy_allows_attach(struct cgroup *cgrp,
+ enum cgroup_bpf_attach_type atype)
+{
+ struct cgroup *p;
+
+ p = cgroup_parent(cgrp);
+ if (!p)
+ return true;
+ do {
+ u32 flags = p->bpf.flags[atype];
+ u32 cnt;
+
+ if (flags & BPF_F_ALLOW_MULTI)
+ return true;
+ cnt = prog_list_length(&p->bpf.progs[atype], NULL);
+ WARN_ON_ONCE(cnt > 1);
+ if (cnt == 1)
+ return !!(flags & BPF_F_ALLOW_OVERRIDE);
+ p = cgroup_parent(p);
+ } while (p);
+ return true;
+}
+
+/* compute a chain of effective programs for a given cgroup:
+ * start from the list of programs in this cgroup and add
+ * all parent programs.
+ * Note that parent's F_ALLOW_OVERRIDE-type program is yielding
+ * to programs in this cgroup
+ */
+static int compute_effective_progs(struct cgroup *cgrp,
+ enum cgroup_bpf_attach_type atype,
+ struct bpf_prog_array **array)
+{
+ struct bpf_prog_array_item *item;
+ struct bpf_prog_array *progs;
+ struct bpf_prog_list *pl;
+ struct cgroup *p = cgrp;
+ int i, j, cnt = 0, preorder_cnt = 0, fstart, bstart, init_bstart;
+
+ /* count number of effective programs by walking parents */
+ do {
+ if (cnt == 0 || (p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
+ cnt += prog_list_length(&p->bpf.progs[atype], &preorder_cnt);
+ p = cgroup_parent(p);
+ } while (p);
+
+ progs = bpf_prog_array_alloc(cnt, GFP_KERNEL);
+ if (!progs)
+ return -ENOMEM;
+
+ /* populate the array with effective progs */
+ cnt = 0;
+ p = cgrp;
+ fstart = preorder_cnt;
+ bstart = preorder_cnt - 1;
+ do {
+ if (cnt > 0 && !(p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
+ continue;
+
+ init_bstart = bstart;
+ hlist_for_each_entry(pl, &p->bpf.progs[atype], node) {
+ if (!prog_list_prog(pl))
+ continue;
+
+ if (pl->flags & BPF_F_PREORDER) {
+ item = &progs->items[bstart];
+ bstart--;
+ } else {
+ item = &progs->items[fstart];
+ fstart++;
+ }
+ item->prog = prog_list_prog(pl);
+ bpf_cgroup_storages_assign(item->cgroup_storage,
+ pl->storage);
+ cnt++;
+ }
+
+ /* reverse pre-ordering progs at this cgroup level */
+ for (i = bstart + 1, j = init_bstart; i < j; i++, j--)
+ swap(progs->items[i], progs->items[j]);
+
+ } while ((p = cgroup_parent(p)));
+
+ *array = progs;
+ return 0;
+}
+
+static void activate_effective_progs(struct cgroup *cgrp,
+ enum cgroup_bpf_attach_type atype,
+ struct bpf_prog_array *old_array)
+{
+ old_array = rcu_replace_pointer(cgrp->bpf.effective[atype], old_array,
+ lockdep_is_held(&cgroup_mutex));
+ /* free prog array after grace period, since __cgroup_bpf_run_*()
+ * might be still walking the array
+ */
+ bpf_prog_array_free(old_array);
+}
+
+/**
+ * cgroup_bpf_inherit() - inherit effective programs from parent
+ * @cgrp: the cgroup to modify
+ */
+static int cgroup_bpf_inherit(struct cgroup *cgrp)
+{
+/* has to use marco instead of const int, since compiler thinks
+ * that array below is variable length
+ */
+#define NR ARRAY_SIZE(cgrp->bpf.effective)
+ struct bpf_prog_array *arrays[NR] = {};
+ struct cgroup *p;
+ int ret, i;
+
+ ret = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0,
+ GFP_KERNEL);
+ if (ret)
+ return ret;
+
+ for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
+ cgroup_bpf_get(p);
+
+ for (i = 0; i < NR; i++)
+ INIT_HLIST_HEAD(&cgrp->bpf.progs[i]);
+
+ INIT_LIST_HEAD(&cgrp->bpf.storages);
+
+ for (i = 0; i < NR; i++)
+ if (compute_effective_progs(cgrp, i, &arrays[i]))
+ goto cleanup;
+
+ for (i = 0; i < NR; i++)
+ activate_effective_progs(cgrp, i, arrays[i]);
+
+ return 0;
+cleanup:
+ for (i = 0; i < NR; i++)
+ bpf_prog_array_free(arrays[i]);
+
+ for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
+ cgroup_bpf_put(p);
+
+ percpu_ref_exit(&cgrp->bpf.refcnt);
+
+ return -ENOMEM;
+}
+
+static int cgroup_bpf_lifetime_notify(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ struct cgroup *cgrp = data;
+ int ret = 0;
+
+ if (cgrp->root != &cgrp_dfl_root)
+ return NOTIFY_OK;
+
+ switch (action) {
+ case CGROUP_LIFETIME_ONLINE:
+ ret = cgroup_bpf_inherit(cgrp);
+ break;
+ case CGROUP_LIFETIME_OFFLINE:
+ cgroup_bpf_offline(cgrp);
+ break;
+ }
+
+ return notifier_from_errno(ret);
+}
+
+static int update_effective_progs(struct cgroup *cgrp,
+ enum cgroup_bpf_attach_type atype)
+{
+ struct cgroup_subsys_state *css;
+ int err;
+
+ /* allocate and recompute effective prog arrays */
+ css_for_each_descendant_pre(css, &cgrp->self) {
+ struct cgroup *desc = container_of(css, struct cgroup, self);
+
+ if (percpu_ref_is_zero(&desc->bpf.refcnt))
+ continue;
+
+ err = compute_effective_progs(desc, atype, &desc->bpf.inactive);
+ if (err)
+ goto cleanup;
+ }
+
+ /* all allocations were successful. Activate all prog arrays */
+ css_for_each_descendant_pre(css, &cgrp->self) {
+ struct cgroup *desc = container_of(css, struct cgroup, self);
+
+ if (percpu_ref_is_zero(&desc->bpf.refcnt)) {
+ if (unlikely(desc->bpf.inactive)) {
+ bpf_prog_array_free(desc->bpf.inactive);
+ desc->bpf.inactive = NULL;
+ }
+ continue;
+ }
+
+ activate_effective_progs(desc, atype, desc->bpf.inactive);
+ desc->bpf.inactive = NULL;
+ }
+
+ return 0;
+
+cleanup:
+ /* oom while computing effective. Free all computed effective arrays
+ * since they were not activated
+ */
+ css_for_each_descendant_pre(css, &cgrp->self) {
+ struct cgroup *desc = container_of(css, struct cgroup, self);
+
+ bpf_prog_array_free(desc->bpf.inactive);
+ desc->bpf.inactive = NULL;
+ }
+
+ return err;
+}
+
+#define BPF_CGROUP_MAX_PROGS 64
+
+static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs,
+ struct bpf_prog *prog,
+ struct bpf_cgroup_link *link,
+ struct bpf_prog *replace_prog,
+ bool allow_multi)
+{
+ struct bpf_prog_list *pl;
+
+ /* single-attach case */
+ if (!allow_multi) {
+ if (hlist_empty(progs))
+ return NULL;
+ return hlist_entry(progs->first, typeof(*pl), node);
+ }
+
+ hlist_for_each_entry(pl, progs, node) {
+ if (prog && pl->prog == prog && prog != replace_prog)
+ /* disallow attaching the same prog twice */
+ return ERR_PTR(-EINVAL);
+ if (link && pl->link == link)
+ /* disallow attaching the same link twice */
+ return ERR_PTR(-EINVAL);
+ }
+
+ /* direct prog multi-attach w/ replacement case */
+ if (replace_prog) {
+ hlist_for_each_entry(pl, progs, node) {
+ if (pl->prog == replace_prog)
+ /* a match found */
+ return pl;
+ }
+ /* prog to replace not found for cgroup */
+ return ERR_PTR(-ENOENT);
+ }
+
+ return NULL;
+}
+
+static struct bpf_link *bpf_get_anchor_link(u32 flags, u32 id_or_fd)
+{
+ struct bpf_link *link = ERR_PTR(-EINVAL);
+
+ if (flags & BPF_F_ID)
+ link = bpf_link_by_id(id_or_fd);
+ else if (id_or_fd)
+ link = bpf_link_get_from_fd(id_or_fd);
+ return link;
+}
+
+static struct bpf_prog *bpf_get_anchor_prog(u32 flags, u32 id_or_fd)
+{
+ struct bpf_prog *prog = ERR_PTR(-EINVAL);
+
+ if (flags & BPF_F_ID)
+ prog = bpf_prog_by_id(id_or_fd);
+ else if (id_or_fd)
+ prog = bpf_prog_get(id_or_fd);
+ return prog;
+}
+
+static struct bpf_prog_list *get_prog_list(struct hlist_head *progs, struct bpf_prog *prog,
+ struct bpf_cgroup_link *link, u32 flags, u32 id_or_fd)
+{
+ bool is_link = flags & BPF_F_LINK, is_id = flags & BPF_F_ID;
+ struct bpf_prog_list *pltmp, *pl = ERR_PTR(-EINVAL);
+ bool preorder = flags & BPF_F_PREORDER;
+ struct bpf_link *anchor_link = NULL;
+ struct bpf_prog *anchor_prog = NULL;
+ bool is_before, is_after;
+
+ is_before = flags & BPF_F_BEFORE;
+ is_after = flags & BPF_F_AFTER;
+ if (is_link || is_id || id_or_fd) {
+ /* flags must have either BPF_F_BEFORE or BPF_F_AFTER */
+ if (is_before == is_after)
+ return ERR_PTR(-EINVAL);
+ if ((is_link && !link) || (!is_link && !prog))
+ return ERR_PTR(-EINVAL);
+ } else if (!hlist_empty(progs)) {
+ /* flags cannot have both BPF_F_BEFORE and BPF_F_AFTER */
+ if (is_before && is_after)
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (is_link) {
+ anchor_link = bpf_get_anchor_link(flags, id_or_fd);
+ if (IS_ERR(anchor_link))
+ return ERR_CAST(anchor_link);
+ } else if (is_id || id_or_fd) {
+ anchor_prog = bpf_get_anchor_prog(flags, id_or_fd);
+ if (IS_ERR(anchor_prog))
+ return ERR_CAST(anchor_prog);
+ }
+
+ if (!anchor_prog && !anchor_link) {
+ /* if there is no anchor_prog/anchor_link, then BPF_F_PREORDER
+ * doesn't matter since either prepend or append to a combined
+ * list of progs will end up with correct result.
+ */
+ hlist_for_each_entry(pltmp, progs, node) {
+ if (is_before)
+ return pltmp;
+ if (pltmp->node.next)
+ continue;
+ return pltmp;
+ }
+ return NULL;
+ }
+
+ hlist_for_each_entry(pltmp, progs, node) {
+ if ((anchor_prog && anchor_prog == pltmp->prog) ||
+ (anchor_link && anchor_link == &pltmp->link->link)) {
+ if (!!(pltmp->flags & BPF_F_PREORDER) != preorder)
+ goto out;
+ pl = pltmp;
+ goto out;
+ }
+ }
+
+ pl = ERR_PTR(-ENOENT);
+out:
+ if (anchor_link)
+ bpf_link_put(anchor_link);
+ else
+ bpf_prog_put(anchor_prog);
+ return pl;
+}
+
+static int insert_pl_to_hlist(struct bpf_prog_list *pl, struct hlist_head *progs,
+ struct bpf_prog *prog, struct bpf_cgroup_link *link,
+ u32 flags, u32 id_or_fd)
+{
+ struct bpf_prog_list *pltmp;
+
+ pltmp = get_prog_list(progs, prog, link, flags, id_or_fd);
+ if (IS_ERR(pltmp))
+ return PTR_ERR(pltmp);
+
+ if (!pltmp)
+ hlist_add_head(&pl->node, progs);
+ else if (flags & BPF_F_BEFORE)
+ hlist_add_before(&pl->node, &pltmp->node);
+ else
+ hlist_add_behind(&pl->node, &pltmp->node);
+
+ return 0;
+}
+
+/**
+ * __cgroup_bpf_attach() - Attach the program or the link to a cgroup, and
+ * propagate the change to descendants
+ * @cgrp: The cgroup which descendants to traverse
+ * @prog: A program to attach
+ * @link: A link to attach
+ * @replace_prog: Previously attached program to replace if BPF_F_REPLACE is set
+ * @type: Type of attach operation
+ * @flags: Option flags
+ * @id_or_fd: Relative prog id or fd
+ * @revision: bpf_prog_list revision
+ *
+ * Exactly one of @prog or @link can be non-null.
+ * Must be called with cgroup_mutex held.
+ */
+static int __cgroup_bpf_attach(struct cgroup *cgrp,
+ struct bpf_prog *prog, struct bpf_prog *replace_prog,
+ struct bpf_cgroup_link *link,
+ enum bpf_attach_type type, u32 flags, u32 id_or_fd,
+ u64 revision)
+{
+ u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI));
+ struct bpf_prog *old_prog = NULL;
+ struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
+ struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
+ struct bpf_prog *new_prog = prog ? : link->link.prog;
+ enum cgroup_bpf_attach_type atype;
+ struct bpf_prog_list *pl;
+ struct hlist_head *progs;
+ int err;
+
+ if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) ||
+ ((flags & BPF_F_REPLACE) && !(flags & BPF_F_ALLOW_MULTI)))
+ /* invalid combination */
+ return -EINVAL;
+ if ((flags & BPF_F_REPLACE) && (flags & (BPF_F_BEFORE | BPF_F_AFTER)))
+ /* only either replace or insertion with before/after */
+ return -EINVAL;
+ if (link && (prog || replace_prog))
+ /* only either link or prog/replace_prog can be specified */
+ return -EINVAL;
+ if (!!replace_prog != !!(flags & BPF_F_REPLACE))
+ /* replace_prog implies BPF_F_REPLACE, and vice versa */
+ return -EINVAL;
+
+ atype = bpf_cgroup_atype_find(type, new_prog->aux->attach_btf_id);
+ if (atype < 0)
+ return -EINVAL;
+ if (revision && revision != cgrp->bpf.revisions[atype])
+ return -ESTALE;
+
+ progs = &cgrp->bpf.progs[atype];
+
+ if (!hierarchy_allows_attach(cgrp, atype))
+ return -EPERM;
+
+ if (!hlist_empty(progs) && cgrp->bpf.flags[atype] != saved_flags)
+ /* Disallow attaching non-overridable on top
+ * of existing overridable in this cgroup.
+ * Disallow attaching multi-prog if overridable or none
+ */
+ return -EPERM;
+
+ if (prog_list_length(progs, NULL) >= BPF_CGROUP_MAX_PROGS)
+ return -E2BIG;
+
+ pl = find_attach_entry(progs, prog, link, replace_prog,
+ flags & BPF_F_ALLOW_MULTI);
+ if (IS_ERR(pl))
+ return PTR_ERR(pl);
+
+ if (bpf_cgroup_storages_alloc(storage, new_storage, type,
+ prog ? : link->link.prog, cgrp))
+ return -ENOMEM;
+
+ if (pl) {
+ old_prog = pl->prog;
+ } else {
+ pl = kmalloc(sizeof(*pl), GFP_KERNEL);
+ if (!pl) {
+ bpf_cgroup_storages_free(new_storage);
+ return -ENOMEM;
+ }
+
+ err = insert_pl_to_hlist(pl, progs, prog, link, flags, id_or_fd);
+ if (err) {
+ kfree(pl);
+ bpf_cgroup_storages_free(new_storage);
+ return err;
+ }
+ }
+
+ pl->prog = prog;
+ pl->link = link;
+ pl->flags = flags;
+ bpf_cgroup_storages_assign(pl->storage, storage);
+ cgrp->bpf.flags[atype] = saved_flags;
+
+ if (type == BPF_LSM_CGROUP) {
+ err = bpf_trampoline_link_cgroup_shim(new_prog, atype, type);
+ if (err)
+ goto cleanup;
+ }
+
+ err = update_effective_progs(cgrp, atype);
+ if (err)
+ goto cleanup_trampoline;
+
+ cgrp->bpf.revisions[atype] += 1;
+ if (old_prog) {
+ if (type == BPF_LSM_CGROUP)
+ bpf_trampoline_unlink_cgroup_shim(old_prog);
+ bpf_prog_put(old_prog);
+ } else {
+ static_branch_inc(&cgroup_bpf_enabled_key[atype]);
+ }
+ bpf_cgroup_storages_link(new_storage, cgrp, type);
+ return 0;
+
+cleanup_trampoline:
+ if (type == BPF_LSM_CGROUP)
+ bpf_trampoline_unlink_cgroup_shim(new_prog);
+
+cleanup:
+ if (old_prog) {
+ pl->prog = old_prog;
+ pl->link = NULL;
+ }
+ bpf_cgroup_storages_free(new_storage);
+ if (!old_prog) {
+ hlist_del(&pl->node);
+ kfree(pl);
+ }
+ return err;
+}
+
+static int cgroup_bpf_attach(struct cgroup *cgrp,
+ struct bpf_prog *prog, struct bpf_prog *replace_prog,
+ struct bpf_cgroup_link *link,
+ enum bpf_attach_type type,
+ u32 flags, u32 id_or_fd, u64 revision)
+{
+ int ret;
+
+ cgroup_lock();
+ ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags,
+ id_or_fd, revision);
+ cgroup_unlock();
+ return ret;
+}
+
+/* Swap updated BPF program for given link in effective program arrays across
+ * all descendant cgroups. This function is guaranteed to succeed.
+ */
+static void replace_effective_prog(struct cgroup *cgrp,
+ enum cgroup_bpf_attach_type atype,
+ struct bpf_cgroup_link *link)
+{
+ struct bpf_prog_array_item *item;
+ struct cgroup_subsys_state *css;
+ struct bpf_prog_array *progs;
+ struct bpf_prog_list *pl;
+ struct hlist_head *head;
+ struct cgroup *cg;
+ int pos;
+
+ css_for_each_descendant_pre(css, &cgrp->self) {
+ struct cgroup *desc = container_of(css, struct cgroup, self);
+
+ if (percpu_ref_is_zero(&desc->bpf.refcnt))
+ continue;
+
+ /* find position of link in effective progs array */
+ for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) {
+ if (pos && !(cg->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
+ continue;
+
+ head = &cg->bpf.progs[atype];
+ hlist_for_each_entry(pl, head, node) {
+ if (!prog_list_prog(pl))
+ continue;
+ if (pl->link == link)
+ goto found;
+ pos++;
+ }
+ }
+found:
+ BUG_ON(!cg);
+ progs = rcu_dereference_protected(
+ desc->bpf.effective[atype],
+ lockdep_is_held(&cgroup_mutex));
+ item = &progs->items[pos];
+ WRITE_ONCE(item->prog, link->link.prog);
+ }
+}
+
+/**
+ * __cgroup_bpf_replace() - Replace link's program and propagate the change
+ * to descendants
+ * @cgrp: The cgroup which descendants to traverse
+ * @link: A link for which to replace BPF program
+ * @new_prog: &struct bpf_prog for the target BPF program with its refcnt
+ * incremented
+ *
+ * Must be called with cgroup_mutex held.
+ */
+static int __cgroup_bpf_replace(struct cgroup *cgrp,
+ struct bpf_cgroup_link *link,
+ struct bpf_prog *new_prog)
+{
+ enum cgroup_bpf_attach_type atype;
+ struct bpf_prog *old_prog;
+ struct bpf_prog_list *pl;
+ struct hlist_head *progs;
+ bool found = false;
+
+ atype = bpf_cgroup_atype_find(link->link.attach_type, new_prog->aux->attach_btf_id);
+ if (atype < 0)
+ return -EINVAL;
+
+ progs = &cgrp->bpf.progs[atype];
+
+ if (link->link.prog->type != new_prog->type)
+ return -EINVAL;
+
+ hlist_for_each_entry(pl, progs, node) {
+ if (pl->link == link) {
+ found = true;
+ break;
+ }
+ }
+ if (!found)
+ return -ENOENT;
+
+ cgrp->bpf.revisions[atype] += 1;
+ old_prog = xchg(&link->link.prog, new_prog);
+ replace_effective_prog(cgrp, atype, link);
+ bpf_prog_put(old_prog);
+ return 0;
+}
+
+static int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *new_prog,
+ struct bpf_prog *old_prog)
+{
+ struct bpf_cgroup_link *cg_link;
+ int ret;
+
+ cg_link = container_of(link, struct bpf_cgroup_link, link);
+
+ cgroup_lock();
+ /* link might have been auto-released by dying cgroup, so fail */
+ if (!cg_link->cgroup) {
+ ret = -ENOLINK;
+ goto out_unlock;
+ }
+ if (old_prog && link->prog != old_prog) {
+ ret = -EPERM;
+ goto out_unlock;
+ }
+ ret = __cgroup_bpf_replace(cg_link->cgroup, cg_link, new_prog);
+out_unlock:
+ cgroup_unlock();
+ return ret;
+}
+
+static struct bpf_prog_list *find_detach_entry(struct hlist_head *progs,
+ struct bpf_prog *prog,
+ struct bpf_cgroup_link *link,
+ bool allow_multi)
+{
+ struct bpf_prog_list *pl;
+
+ if (!allow_multi) {
+ if (hlist_empty(progs))
+ /* report error when trying to detach and nothing is attached */
+ return ERR_PTR(-ENOENT);
+
+ /* to maintain backward compatibility NONE and OVERRIDE cgroups
+ * allow detaching with invalid FD (prog==NULL) in legacy mode
+ */
+ return hlist_entry(progs->first, typeof(*pl), node);
+ }
+
+ if (!prog && !link)
+ /* to detach MULTI prog the user has to specify valid FD
+ * of the program or link to be detached
+ */
+ return ERR_PTR(-EINVAL);
+
+ /* find the prog or link and detach it */
+ hlist_for_each_entry(pl, progs, node) {
+ if (pl->prog == prog && pl->link == link)
+ return pl;
+ }
+ return ERR_PTR(-ENOENT);
+}
+
+/**
+ * purge_effective_progs() - After compute_effective_progs fails to alloc new
+ * cgrp->bpf.inactive table we can recover by
+ * recomputing the array in place.
+ *
+ * @cgrp: The cgroup which descendants to travers
+ * @prog: A program to detach or NULL
+ * @link: A link to detach or NULL
+ * @atype: Type of detach operation
+ */
+static void purge_effective_progs(struct cgroup *cgrp, struct bpf_prog *prog,
+ struct bpf_cgroup_link *link,
+ enum cgroup_bpf_attach_type atype)
+{
+ struct cgroup_subsys_state *css;
+ struct bpf_prog_array *progs;
+ struct bpf_prog_list *pl;
+ struct hlist_head *head;
+ struct cgroup *cg;
+ int pos;
+
+ /* recompute effective prog array in place */
+ css_for_each_descendant_pre(css, &cgrp->self) {
+ struct cgroup *desc = container_of(css, struct cgroup, self);
+
+ if (percpu_ref_is_zero(&desc->bpf.refcnt))
+ continue;
+
+ /* find position of link or prog in effective progs array */
+ for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) {
+ if (pos && !(cg->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
+ continue;
+
+ head = &cg->bpf.progs[atype];
+ hlist_for_each_entry(pl, head, node) {
+ if (!prog_list_prog(pl))
+ continue;
+ if (pl->prog == prog && pl->link == link)
+ goto found;
+ pos++;
+ }
+ }
+
+ /* no link or prog match, skip the cgroup of this layer */
+ continue;
+found:
+ progs = rcu_dereference_protected(
+ desc->bpf.effective[atype],
+ lockdep_is_held(&cgroup_mutex));
+
+ /* Remove the program from the array */
+ WARN_ONCE(bpf_prog_array_delete_safe_at(progs, pos),
+ "Failed to purge a prog from array at index %d", pos);
+ }
+}
+
+/**
+ * __cgroup_bpf_detach() - Detach the program or link from a cgroup, and
+ * propagate the change to descendants
+ * @cgrp: The cgroup which descendants to traverse
+ * @prog: A program to detach or NULL
+ * @link: A link to detach or NULL
+ * @type: Type of detach operation
+ * @revision: bpf_prog_list revision
+ *
+ * At most one of @prog or @link can be non-NULL.
+ * Must be called with cgroup_mutex held.
+ */
+static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
+ struct bpf_cgroup_link *link, enum bpf_attach_type type,
+ u64 revision)
+{
+ enum cgroup_bpf_attach_type atype;
+ struct bpf_prog *old_prog;
+ struct bpf_prog_list *pl;
+ struct hlist_head *progs;
+ u32 attach_btf_id = 0;
+ u32 flags;
+
+ if (prog)
+ attach_btf_id = prog->aux->attach_btf_id;
+ if (link)
+ attach_btf_id = link->link.prog->aux->attach_btf_id;
+
+ atype = bpf_cgroup_atype_find(type, attach_btf_id);
+ if (atype < 0)
+ return -EINVAL;
+
+ if (revision && revision != cgrp->bpf.revisions[atype])
+ return -ESTALE;
+
+ progs = &cgrp->bpf.progs[atype];
+ flags = cgrp->bpf.flags[atype];
+
+ if (prog && link)
+ /* only one of prog or link can be specified */
+ return -EINVAL;
+
+ pl = find_detach_entry(progs, prog, link, flags & BPF_F_ALLOW_MULTI);
+ if (IS_ERR(pl))
+ return PTR_ERR(pl);
+
+ /* mark it deleted, so it's ignored while recomputing effective */
+ old_prog = pl->prog;
+ pl->prog = NULL;
+ pl->link = NULL;
+
+ if (update_effective_progs(cgrp, atype)) {
+ /* if update effective array failed replace the prog with a dummy prog*/
+ pl->prog = old_prog;
+ pl->link = link;
+ purge_effective_progs(cgrp, old_prog, link, atype);
+ }
+
+ /* now can actually delete it from this cgroup list */
+ hlist_del(&pl->node);
+ cgrp->bpf.revisions[atype] += 1;
+
+ kfree(pl);
+ if (hlist_empty(progs))
+ /* last program was detached, reset flags to zero */
+ cgrp->bpf.flags[atype] = 0;
+ if (old_prog) {
+ if (type == BPF_LSM_CGROUP)
+ bpf_trampoline_unlink_cgroup_shim(old_prog);
+ bpf_prog_put(old_prog);
+ }
+ static_branch_dec(&cgroup_bpf_enabled_key[atype]);
+ return 0;
+}
+
+static int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
+ enum bpf_attach_type type, u64 revision)
+{
+ int ret;
+
+ cgroup_lock();
+ ret = __cgroup_bpf_detach(cgrp, prog, NULL, type, revision);
+ cgroup_unlock();
+ return ret;
+}
+
+/* Must be called with cgroup_mutex held to avoid races. */
+static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ __u32 __user *prog_attach_flags = u64_to_user_ptr(attr->query.prog_attach_flags);
+ bool effective_query = attr->query.query_flags & BPF_F_QUERY_EFFECTIVE;
+ __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
+ enum bpf_attach_type type = attr->query.attach_type;
+ enum cgroup_bpf_attach_type from_atype, to_atype;
+ enum cgroup_bpf_attach_type atype;
+ struct bpf_prog_array *effective;
+ int cnt, ret = 0, i;
+ int total_cnt = 0;
+ u64 revision = 0;
+ u32 flags;
+
+ if (effective_query && prog_attach_flags)
+ return -EINVAL;
+
+ if (type == BPF_LSM_CGROUP) {
+ if (!effective_query && attr->query.prog_cnt &&
+ prog_ids && !prog_attach_flags)
+ return -EINVAL;
+
+ from_atype = CGROUP_LSM_START;
+ to_atype = CGROUP_LSM_END;
+ flags = 0;
+ } else {
+ from_atype = to_cgroup_bpf_attach_type(type);
+ if (from_atype < 0)
+ return -EINVAL;
+ to_atype = from_atype;
+ flags = cgrp->bpf.flags[from_atype];
+ }
+
+ for (atype = from_atype; atype <= to_atype; atype++) {
+ if (effective_query) {
+ effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
+ lockdep_is_held(&cgroup_mutex));
+ total_cnt += bpf_prog_array_length(effective);
+ } else {
+ total_cnt += prog_list_length(&cgrp->bpf.progs[atype], NULL);
+ }
+ }
+
+ /* always output uattr->query.attach_flags as 0 during effective query */
+ flags = effective_query ? 0 : flags;
+ if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
+ return -EFAULT;
+ if (copy_to_user(&uattr->query.prog_cnt, &total_cnt, sizeof(total_cnt)))
+ return -EFAULT;
+ if (!effective_query && from_atype == to_atype)
+ revision = cgrp->bpf.revisions[from_atype];
+ if (copy_to_user(&uattr->query.revision, &revision, sizeof(revision)))
+ return -EFAULT;
+ if (attr->query.prog_cnt == 0 || !prog_ids || !total_cnt)
+ /* return early if user requested only program count + flags */
+ return 0;
+
+ if (attr->query.prog_cnt < total_cnt) {
+ total_cnt = attr->query.prog_cnt;
+ ret = -ENOSPC;
+ }
+
+ for (atype = from_atype; atype <= to_atype && total_cnt; atype++) {
+ if (effective_query) {
+ effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
+ lockdep_is_held(&cgroup_mutex));
+ cnt = min_t(int, bpf_prog_array_length(effective), total_cnt);
+ ret = bpf_prog_array_copy_to_user(effective, prog_ids, cnt);
+ } else {
+ struct hlist_head *progs;
+ struct bpf_prog_list *pl;
+ struct bpf_prog *prog;
+ u32 id;
+
+ progs = &cgrp->bpf.progs[atype];
+ cnt = min_t(int, prog_list_length(progs, NULL), total_cnt);
+ i = 0;
+ hlist_for_each_entry(pl, progs, node) {
+ prog = prog_list_prog(pl);
+ id = prog->aux->id;
+ if (copy_to_user(prog_ids + i, &id, sizeof(id)))
+ return -EFAULT;
+ if (++i == cnt)
+ break;
+ }
+
+ if (prog_attach_flags) {
+ flags = cgrp->bpf.flags[atype];
+
+ for (i = 0; i < cnt; i++)
+ if (copy_to_user(prog_attach_flags + i,
+ &flags, sizeof(flags)))
+ return -EFAULT;
+ prog_attach_flags += cnt;
+ }
+ }
+
+ prog_ids += cnt;
+ total_cnt -= cnt;
+ }
+ return ret;
+}
+
+static int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ int ret;
+
+ cgroup_lock();
+ ret = __cgroup_bpf_query(cgrp, attr, uattr);
+ cgroup_unlock();
+ return ret;
+}
+
+int cgroup_bpf_prog_attach(const union bpf_attr *attr,
+ enum bpf_prog_type ptype, struct bpf_prog *prog)
+{
+ struct bpf_prog *replace_prog = NULL;
+ struct cgroup *cgrp;
+ int ret;
+
+ cgrp = cgroup_get_from_fd(attr->target_fd);
+ if (IS_ERR(cgrp))
+ return PTR_ERR(cgrp);
+
+ if ((attr->attach_flags & BPF_F_ALLOW_MULTI) &&
+ (attr->attach_flags & BPF_F_REPLACE)) {
+ replace_prog = bpf_prog_get_type(attr->replace_bpf_fd, ptype);
+ if (IS_ERR(replace_prog)) {
+ cgroup_put(cgrp);
+ return PTR_ERR(replace_prog);
+ }
+ }
+
+ ret = cgroup_bpf_attach(cgrp, prog, replace_prog, NULL,
+ attr->attach_type, attr->attach_flags,
+ attr->relative_fd, attr->expected_revision);
+
+ if (replace_prog)
+ bpf_prog_put(replace_prog);
+ cgroup_put(cgrp);
+ return ret;
+}
+
+int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)
+{
+ struct bpf_prog *prog;
+ struct cgroup *cgrp;
+ int ret;
+
+ cgrp = cgroup_get_from_fd(attr->target_fd);
+ if (IS_ERR(cgrp))
+ return PTR_ERR(cgrp);
+
+ prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
+ if (IS_ERR(prog))
+ prog = NULL;
+
+ ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, attr->expected_revision);
+ if (prog)
+ bpf_prog_put(prog);
+
+ cgroup_put(cgrp);
+ return ret;
+}
+
+static void bpf_cgroup_link_release(struct bpf_link *link)
+{
+ struct bpf_cgroup_link *cg_link =
+ container_of(link, struct bpf_cgroup_link, link);
+ struct cgroup *cg;
+
+ /* link might have been auto-detached by dying cgroup already,
+ * in that case our work is done here
+ */
+ if (!cg_link->cgroup)
+ return;
+
+ cgroup_lock();
+
+ /* re-check cgroup under lock again */
+ if (!cg_link->cgroup) {
+ cgroup_unlock();
+ return;
+ }
+
+ WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link,
+ link->attach_type, 0));
+ if (link->attach_type == BPF_LSM_CGROUP)
+ bpf_trampoline_unlink_cgroup_shim(cg_link->link.prog);
+
+ cg = cg_link->cgroup;
+ cg_link->cgroup = NULL;
+
+ cgroup_unlock();
+
+ cgroup_put(cg);
+}
+
+static void bpf_cgroup_link_dealloc(struct bpf_link *link)
+{
+ struct bpf_cgroup_link *cg_link =
+ container_of(link, struct bpf_cgroup_link, link);
+
+ kfree(cg_link);
+}
+
+static int bpf_cgroup_link_detach(struct bpf_link *link)
+{
+ bpf_cgroup_link_release(link);
+
+ return 0;
+}
+
+static void bpf_cgroup_link_show_fdinfo(const struct bpf_link *link,
+ struct seq_file *seq)
+{
+ struct bpf_cgroup_link *cg_link =
+ container_of(link, struct bpf_cgroup_link, link);
+ u64 cg_id = 0;
+
+ cgroup_lock();
+ if (cg_link->cgroup)
+ cg_id = cgroup_id(cg_link->cgroup);
+ cgroup_unlock();
+
+ seq_printf(seq,
+ "cgroup_id:\t%llu\n"
+ "attach_type:\t%d\n",
+ cg_id,
+ link->attach_type);
+}
+
+static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ struct bpf_cgroup_link *cg_link =
+ container_of(link, struct bpf_cgroup_link, link);
+ u64 cg_id = 0;
+
+ cgroup_lock();
+ if (cg_link->cgroup)
+ cg_id = cgroup_id(cg_link->cgroup);
+ cgroup_unlock();
+
+ info->cgroup.cgroup_id = cg_id;
+ info->cgroup.attach_type = link->attach_type;
+ return 0;
+}
+
+static const struct bpf_link_ops bpf_cgroup_link_lops = {
+ .release = bpf_cgroup_link_release,
+ .dealloc = bpf_cgroup_link_dealloc,
+ .detach = bpf_cgroup_link_detach,
+ .update_prog = cgroup_bpf_replace,
+ .show_fdinfo = bpf_cgroup_link_show_fdinfo,
+ .fill_link_info = bpf_cgroup_link_fill_link_info,
+};
+
+#define BPF_F_LINK_ATTACH_MASK \
+ (BPF_F_ID | \
+ BPF_F_BEFORE | \
+ BPF_F_AFTER | \
+ BPF_F_PREORDER | \
+ BPF_F_LINK)
+
+int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ struct bpf_link_primer link_primer;
+ struct bpf_cgroup_link *link;
+ struct cgroup *cgrp;
+ int err;
+
+ if (attr->link_create.flags & (~BPF_F_LINK_ATTACH_MASK))
+ return -EINVAL;
+
+ cgrp = cgroup_get_from_fd(attr->link_create.target_fd);
+ if (IS_ERR(cgrp))
+ return PTR_ERR(cgrp);
+
+ link = kzalloc(sizeof(*link), GFP_USER);
+ if (!link) {
+ err = -ENOMEM;
+ goto out_put_cgroup;
+ }
+ bpf_link_init(&link->link, BPF_LINK_TYPE_CGROUP, &bpf_cgroup_link_lops,
+ prog, attr->link_create.attach_type);
+ link->cgroup = cgrp;
+
+ err = bpf_link_prime(&link->link, &link_primer);
+ if (err) {
+ kfree(link);
+ goto out_put_cgroup;
+ }
+
+ err = cgroup_bpf_attach(cgrp, NULL, NULL, link,
+ link->link.attach_type, BPF_F_ALLOW_MULTI | attr->link_create.flags,
+ attr->link_create.cgroup.relative_fd,
+ attr->link_create.cgroup.expected_revision);
+ if (err) {
+ bpf_link_cleanup(&link_primer);
+ goto out_put_cgroup;
+ }
+
+ return bpf_link_settle(&link_primer);
+
+out_put_cgroup:
+ cgroup_put(cgrp);
+ return err;
+}
+
+int cgroup_bpf_prog_query(const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ struct cgroup *cgrp;
+ int ret;
+
+ cgrp = cgroup_get_from_fd(attr->query.target_fd);
+ if (IS_ERR(cgrp))
+ return PTR_ERR(cgrp);
+
+ ret = cgroup_bpf_query(cgrp, attr, uattr);
+
+ cgroup_put(cgrp);
+ return ret;
+}
+
+/**
+ * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering
+ * @sk: The socket sending or receiving traffic
+ * @skb: The skb that is being sent or received
+ * @atype: The type of program to be executed
+ *
+ * If no socket is passed, or the socket is not of type INET or INET6,
+ * this function does nothing and returns 0.
+ *
+ * The program type passed in via @type must be suitable for network
+ * filtering. No further check is performed to assert that.
+ *
+ * For egress packets, this function can return:
+ * NET_XMIT_SUCCESS (0) - continue with packet output
+ * NET_XMIT_DROP (1) - drop packet and notify TCP to call cwr
+ * NET_XMIT_CN (2) - continue with packet output and notify TCP
+ * to call cwr
+ * -err - drop packet
+ *
+ * For ingress packets, this function will return -EPERM if any
+ * attached program was found and if it returned != 1 during execution.
+ * Otherwise 0 is returned.
+ */
+int __cgroup_bpf_run_filter_skb(struct sock *sk,
+ struct sk_buff *skb,
+ enum cgroup_bpf_attach_type atype)
+{
+ unsigned int offset = -skb_network_offset(skb);
+ struct sock *save_sk;
+ void *saved_data_end;
+ struct cgroup *cgrp;
+ int ret;
+
+ if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
+ return 0;
+
+ cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+ save_sk = skb->sk;
+ skb->sk = sk;
+ __skb_push(skb, offset);
+
+ /* compute pointers for the bpf prog */
+ bpf_compute_and_save_data_end(skb, &saved_data_end);
+
+ if (atype == CGROUP_INET_EGRESS) {
+ u32 flags = 0;
+ bool cn;
+
+ ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, skb,
+ __bpf_prog_run_save_cb, 0, &flags);
+
+ /* Return values of CGROUP EGRESS BPF programs are:
+ * 0: drop packet
+ * 1: keep packet
+ * 2: drop packet and cn
+ * 3: keep packet and cn
+ *
+ * The returned value is then converted to one of the NET_XMIT
+ * or an error code that is then interpreted as drop packet
+ * (and no cn):
+ * 0: NET_XMIT_SUCCESS skb should be transmitted
+ * 1: NET_XMIT_DROP skb should be dropped and cn
+ * 2: NET_XMIT_CN skb should be transmitted and cn
+ * 3: -err skb should be dropped
+ */
+
+ cn = flags & BPF_RET_SET_CN;
+ if (ret && !IS_ERR_VALUE((long)ret))
+ ret = -EFAULT;
+ if (!ret)
+ ret = (cn ? NET_XMIT_CN : NET_XMIT_SUCCESS);
+ else
+ ret = (cn ? NET_XMIT_DROP : ret);
+ } else {
+ ret = bpf_prog_run_array_cg(&cgrp->bpf, atype,
+ skb, __bpf_prog_run_save_cb, 0,
+ NULL);
+ if (ret && !IS_ERR_VALUE((long)ret))
+ ret = -EFAULT;
+ }
+ bpf_restore_data_end(skb, saved_data_end);
+ __skb_pull(skb, offset);
+ skb->sk = save_sk;
+
+ return ret;
+}
+EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
+
+/**
+ * __cgroup_bpf_run_filter_sk() - Run a program on a sock
+ * @sk: sock structure to manipulate
+ * @atype: The type of program to be executed
+ *
+ * socket is passed is expected to be of type INET or INET6.
+ *
+ * The program type passed in via @type must be suitable for sock
+ * filtering. No further check is performed to assert that.
+ *
+ * This function will return %-EPERM if any if an attached program was found
+ * and if it returned != 1 during execution. In all other cases, 0 is returned.
+ */
+int __cgroup_bpf_run_filter_sk(struct sock *sk,
+ enum cgroup_bpf_attach_type atype)
+{
+ struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+
+ return bpf_prog_run_array_cg(&cgrp->bpf, atype, sk, bpf_prog_run, 0,
+ NULL);
+}
+EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
+
+/**
+ * __cgroup_bpf_run_filter_sock_addr() - Run a program on a sock and
+ * provided by user sockaddr
+ * @sk: sock struct that will use sockaddr
+ * @uaddr: sockaddr struct provided by user
+ * @uaddrlen: Pointer to the size of the sockaddr struct provided by user. It is
+ * read-only for AF_INET[6] uaddr but can be modified for AF_UNIX
+ * uaddr.
+ * @atype: The type of program to be executed
+ * @t_ctx: Pointer to attach type specific context
+ * @flags: Pointer to u32 which contains higher bits of BPF program
+ * return value (OR'ed together).
+ *
+ * socket is expected to be of type INET, INET6 or UNIX.
+ *
+ * This function will return %-EPERM if an attached program is found and
+ * returned value != 1 during execution. In all other cases, 0 is returned.
+ */
+int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
+ struct sockaddr_unsized *uaddr,
+ int *uaddrlen,
+ enum cgroup_bpf_attach_type atype,
+ void *t_ctx,
+ u32 *flags)
+{
+ struct bpf_sock_addr_kern ctx = {
+ .sk = sk,
+ .uaddr = uaddr,
+ .t_ctx = t_ctx,
+ };
+ struct sockaddr_storage storage;
+ struct cgroup *cgrp;
+ int ret;
+
+ /* Check socket family since not all sockets represent network
+ * endpoint (e.g. AF_UNIX).
+ */
+ if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6 &&
+ sk->sk_family != AF_UNIX)
+ return 0;
+
+ if (!ctx.uaddr) {
+ memset(&storage, 0, sizeof(storage));
+ ctx.uaddr = (struct sockaddr_unsized *)&storage;
+ ctx.uaddrlen = 0;
+ } else {
+ ctx.uaddrlen = *uaddrlen;
+ }
+
+ cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+ ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run,
+ 0, flags);
+
+ if (!ret && uaddr)
+ *uaddrlen = ctx.uaddrlen;
+
+ return ret;
+}
+EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
+
+/**
+ * __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock
+ * @sk: socket to get cgroup from
+ * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains
+ * sk with connection information (IP addresses, etc.) May not contain
+ * cgroup info if it is a req sock.
+ * @atype: The type of program to be executed
+ *
+ * socket passed is expected to be of type INET or INET6.
+ *
+ * The program type passed in via @type must be suitable for sock_ops
+ * filtering. No further check is performed to assert that.
+ *
+ * This function will return %-EPERM if any if an attached program was found
+ * and if it returned != 1 during execution. In all other cases, 0 is returned.
+ */
+int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
+ struct bpf_sock_ops_kern *sock_ops,
+ enum cgroup_bpf_attach_type atype)
+{
+ struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+
+ return bpf_prog_run_array_cg(&cgrp->bpf, atype, sock_ops, bpf_prog_run,
+ 0, NULL);
+}
+EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
+
+int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
+ short access, enum cgroup_bpf_attach_type atype)
+{
+ struct cgroup *cgrp;
+ struct bpf_cgroup_dev_ctx ctx = {
+ .access_type = (access << 16) | dev_type,
+ .major = major,
+ .minor = minor,
+ };
+ int ret;
+
+ rcu_read_lock();
+ cgrp = task_dfl_cgroup(current);
+ ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0,
+ NULL);
+ rcu_read_unlock();
+
+ return ret;
+}
+
+BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
+{
+ /* flags argument is not used now,
+ * but provides an ability to extend the API.
+ * verifier checks that its value is correct.
+ */
+ enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
+ struct bpf_cgroup_storage *storage;
+ struct bpf_cg_run_ctx *ctx;
+ void *ptr;
+
+ /* get current cgroup storage from BPF run context */
+ ctx = container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
+ storage = ctx->prog_item->cgroup_storage[stype];
+
+ if (stype == BPF_CGROUP_STORAGE_SHARED)
+ ptr = &READ_ONCE(storage->buf)->data[0];
+ else
+ ptr = this_cpu_ptr(storage->percpu_buf);
+
+ return (unsigned long)ptr;
+}
+
+const struct bpf_func_proto bpf_get_local_storage_proto = {
+ .func = bpf_get_local_storage,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_MAP_VALUE,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_ANYTHING,
+};
+
+BPF_CALL_0(bpf_get_retval)
+{
+ struct bpf_cg_run_ctx *ctx =
+ container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
+
+ return ctx->retval;
+}
+
+const struct bpf_func_proto bpf_get_retval_proto = {
+ .func = bpf_get_retval,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
+BPF_CALL_1(bpf_set_retval, int, retval)
+{
+ struct bpf_cg_run_ctx *ctx =
+ container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
+
+ ctx->retval = retval;
+ return 0;
+}
+
+const struct bpf_func_proto bpf_set_retval_proto = {
+ .func = bpf_set_retval,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_ANYTHING,
+};
+
+static const struct bpf_func_proto *
+cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ const struct bpf_func_proto *func_proto;
+
+ func_proto = cgroup_common_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
+ switch (func_id) {
+ case BPF_FUNC_perf_event_output:
+ return &bpf_event_output_data_proto;
+ default:
+ return bpf_base_func_proto(func_id, prog);
+ }
+}
+
+static bool cgroup_dev_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ const int size_default = sizeof(__u32);
+
+ if (type == BPF_WRITE)
+ return false;
+
+ if (off < 0 || off + size > sizeof(struct bpf_cgroup_dev_ctx))
+ return false;
+ /* The verifier guarantees that size > 0. */
+ if (off % size != 0)
+ return false;
+
+ switch (off) {
+ case bpf_ctx_range(struct bpf_cgroup_dev_ctx, access_type):
+ bpf_ctx_record_field_size(info, size_default);
+ if (!bpf_ctx_narrow_access_ok(off, size, size_default))
+ return false;
+ break;
+ default:
+ if (size != size_default)
+ return false;
+ }
+
+ return true;
+}
+
+const struct bpf_prog_ops cg_dev_prog_ops = {
+};
+
+const struct bpf_verifier_ops cg_dev_verifier_ops = {
+ .get_func_proto = cgroup_dev_func_proto,
+ .is_valid_access = cgroup_dev_is_valid_access,
+};
+
+/**
+ * __cgroup_bpf_run_filter_sysctl - Run a program on sysctl
+ *
+ * @head: sysctl table header
+ * @table: sysctl table
+ * @write: sysctl is being read (= 0) or written (= 1)
+ * @buf: pointer to buffer (in and out)
+ * @pcount: value-result argument: value is size of buffer pointed to by @buf,
+ * result is size of @new_buf if program set new value, initial value
+ * otherwise
+ * @ppos: value-result argument: value is position at which read from or write
+ * to sysctl is happening, result is new position if program overrode it,
+ * initial value otherwise
+ * @atype: type of program to be executed
+ *
+ * Program is run when sysctl is being accessed, either read or written, and
+ * can allow or deny such access.
+ *
+ * This function will return %-EPERM if an attached program is found and
+ * returned value != 1 during execution. In all other cases 0 is returned.
+ */
+int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
+ const struct ctl_table *table, int write,
+ char **buf, size_t *pcount, loff_t *ppos,
+ enum cgroup_bpf_attach_type atype)
+{
+ struct bpf_sysctl_kern ctx = {
+ .head = head,
+ .table = table,
+ .write = write,
+ .ppos = ppos,
+ .cur_val = NULL,
+ .cur_len = PAGE_SIZE,
+ .new_val = NULL,
+ .new_len = 0,
+ .new_updated = 0,
+ };
+ struct cgroup *cgrp;
+ loff_t pos = 0;
+ int ret;
+
+ ctx.cur_val = kmalloc_track_caller(ctx.cur_len, GFP_KERNEL);
+ if (!ctx.cur_val ||
+ table->proc_handler(table, 0, ctx.cur_val, &ctx.cur_len, &pos)) {
+ /* Let BPF program decide how to proceed. */
+ ctx.cur_len = 0;
+ }
+
+ if (write && *buf && *pcount) {
+ /* BPF program should be able to override new value with a
+ * buffer bigger than provided by user.
+ */
+ ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL);
+ ctx.new_len = min_t(size_t, PAGE_SIZE, *pcount);
+ if (ctx.new_val) {
+ memcpy(ctx.new_val, *buf, ctx.new_len);
+ } else {
+ /* Let BPF program decide how to proceed. */
+ ctx.new_len = 0;
+ }
+ }
+
+ rcu_read_lock();
+ cgrp = task_dfl_cgroup(current);
+ ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0,
+ NULL);
+ rcu_read_unlock();
+
+ kfree(ctx.cur_val);
+
+ if (ret == 1 && ctx.new_updated) {
+ kfree(*buf);
+ *buf = ctx.new_val;
+ *pcount = ctx.new_len;
+ } else {
+ kfree(ctx.new_val);
+ }
+
+ return ret;
+}
+
+#ifdef CONFIG_NET
+static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen,
+ struct bpf_sockopt_buf *buf)
+{
+ if (unlikely(max_optlen < 0))
+ return -EINVAL;
+
+ if (unlikely(max_optlen > PAGE_SIZE)) {
+ /* We don't expose optvals that are greater than PAGE_SIZE
+ * to the BPF program.
+ */
+ max_optlen = PAGE_SIZE;
+ }
+
+ if (max_optlen <= sizeof(buf->data)) {
+ /* When the optval fits into BPF_SOCKOPT_KERN_BUF_SIZE
+ * bytes avoid the cost of kzalloc.
+ */
+ ctx->optval = buf->data;
+ ctx->optval_end = ctx->optval + max_optlen;
+ return max_optlen;
+ }
+
+ ctx->optval = kzalloc(max_optlen, GFP_USER);
+ if (!ctx->optval)
+ return -ENOMEM;
+
+ ctx->optval_end = ctx->optval + max_optlen;
+
+ return max_optlen;
+}
+
+static void sockopt_free_buf(struct bpf_sockopt_kern *ctx,
+ struct bpf_sockopt_buf *buf)
+{
+ if (ctx->optval == buf->data)
+ return;
+ kfree(ctx->optval);
+}
+
+static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx,
+ struct bpf_sockopt_buf *buf)
+{
+ return ctx->optval != buf->data;
+}
+
+int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
+ int *optname, sockptr_t optval,
+ int *optlen, char **kernel_optval)
+{
+ struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+ struct bpf_sockopt_buf buf = {};
+ struct bpf_sockopt_kern ctx = {
+ .sk = sk,
+ .level = *level,
+ .optname = *optname,
+ };
+ int ret, max_optlen;
+
+ /* Allocate a bit more than the initial user buffer for
+ * BPF program. The canonical use case is overriding
+ * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic).
+ */
+ max_optlen = max_t(int, 16, *optlen);
+ max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
+ if (max_optlen < 0)
+ return max_optlen;
+
+ ctx.optlen = *optlen;
+
+ if (copy_from_sockptr(ctx.optval, optval,
+ min(*optlen, max_optlen))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ lock_sock(sk);
+ ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_SETSOCKOPT,
+ &ctx, bpf_prog_run, 0, NULL);
+ release_sock(sk);
+
+ if (ret)
+ goto out;
+
+ if (ctx.optlen == -1) {
+ /* optlen set to -1, bypass kernel */
+ ret = 1;
+ } else if (ctx.optlen > max_optlen || ctx.optlen < -1) {
+ /* optlen is out of bounds */
+ if (*optlen > PAGE_SIZE && ctx.optlen >= 0) {
+ pr_info_once("bpf setsockopt: ignoring program buffer with optlen=%d (max_optlen=%d)\n",
+ ctx.optlen, max_optlen);
+ ret = 0;
+ goto out;
+ }
+ ret = -EFAULT;
+ } else {
+ /* optlen within bounds, run kernel handler */
+ ret = 0;
+
+ /* export any potential modifications */
+ *level = ctx.level;
+ *optname = ctx.optname;
+
+ /* optlen == 0 from BPF indicates that we should
+ * use original userspace data.
+ */
+ if (ctx.optlen != 0) {
+ *optlen = ctx.optlen;
+ /* We've used bpf_sockopt_kern->buf as an intermediary
+ * storage, but the BPF program indicates that we need
+ * to pass this data to the kernel setsockopt handler.
+ * No way to export on-stack buf, have to allocate a
+ * new buffer.
+ */
+ if (!sockopt_buf_allocated(&ctx, &buf)) {
+ void *p = kmalloc(ctx.optlen, GFP_USER);
+
+ if (!p) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ memcpy(p, ctx.optval, ctx.optlen);
+ *kernel_optval = p;
+ } else {
+ *kernel_optval = ctx.optval;
+ }
+ /* export and don't free sockopt buf */
+ return 0;
+ }
+ }
+
+out:
+ sockopt_free_buf(&ctx, &buf);
+ return ret;
+}
+
+int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
+ int optname, sockptr_t optval,
+ sockptr_t optlen, int max_optlen,
+ int retval)
+{
+ struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+ struct bpf_sockopt_buf buf = {};
+ struct bpf_sockopt_kern ctx = {
+ .sk = sk,
+ .level = level,
+ .optname = optname,
+ .current_task = current,
+ };
+ int orig_optlen;
+ int ret;
+
+ orig_optlen = max_optlen;
+ ctx.optlen = max_optlen;
+ max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
+ if (max_optlen < 0)
+ return max_optlen;
+
+ if (!retval) {
+ /* If kernel getsockopt finished successfully,
+ * copy whatever was returned to the user back
+ * into our temporary buffer. Set optlen to the
+ * one that kernel returned as well to let
+ * BPF programs inspect the value.
+ */
+ if (copy_from_sockptr(&ctx.optlen, optlen,
+ sizeof(ctx.optlen))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ if (ctx.optlen < 0) {
+ ret = -EFAULT;
+ goto out;
+ }
+ orig_optlen = ctx.optlen;
+
+ if (copy_from_sockptr(ctx.optval, optval,
+ min(ctx.optlen, max_optlen))) {
+ ret = -EFAULT;
+ goto out;
+ }
+ }
+
+ lock_sock(sk);
+ ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT,
+ &ctx, bpf_prog_run, retval, NULL);
+ release_sock(sk);
+
+ if (ret < 0)
+ goto out;
+
+ if (!sockptr_is_null(optval) &&
+ (ctx.optlen > max_optlen || ctx.optlen < 0)) {
+ if (orig_optlen > PAGE_SIZE && ctx.optlen >= 0) {
+ pr_info_once("bpf getsockopt: ignoring program buffer with optlen=%d (max_optlen=%d)\n",
+ ctx.optlen, max_optlen);
+ ret = retval;
+ goto out;
+ }
+ ret = -EFAULT;
+ goto out;
+ }
+
+ if (ctx.optlen != 0) {
+ if (!sockptr_is_null(optval) &&
+ copy_to_sockptr(optval, ctx.optval, ctx.optlen)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ if (copy_to_sockptr(optlen, &ctx.optlen, sizeof(ctx.optlen))) {
+ ret = -EFAULT;
+ goto out;
+ }
+ }
+
+out:
+ sockopt_free_buf(&ctx, &buf);
+ return ret;
+}
+
+int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
+ int optname, void *optval,
+ int *optlen, int retval)
+{
+ struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+ struct bpf_sockopt_kern ctx = {
+ .sk = sk,
+ .level = level,
+ .optname = optname,
+ .optlen = *optlen,
+ .optval = optval,
+ .optval_end = optval + *optlen,
+ .current_task = current,
+ };
+ int ret;
+
+ /* Note that __cgroup_bpf_run_filter_getsockopt doesn't copy
+ * user data back into BPF buffer when reval != 0. This is
+ * done as an optimization to avoid extra copy, assuming
+ * kernel won't populate the data in case of an error.
+ * Here we always pass the data and memset() should
+ * be called if that data shouldn't be "exported".
+ */
+
+ ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT,
+ &ctx, bpf_prog_run, retval, NULL);
+ if (ret < 0)
+ return ret;
+
+ if (ctx.optlen > *optlen)
+ return -EFAULT;
+
+ /* BPF programs can shrink the buffer, export the modifications.
+ */
+ if (ctx.optlen != 0)
+ *optlen = ctx.optlen;
+
+ return ret;
+}
+#endif
+
+static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp,
+ size_t *lenp)
+{
+ ssize_t tmp_ret = 0, ret;
+
+ if (dir->header.parent) {
+ tmp_ret = sysctl_cpy_dir(dir->header.parent, bufp, lenp);
+ if (tmp_ret < 0)
+ return tmp_ret;
+ }
+
+ ret = strscpy(*bufp, dir->header.ctl_table[0].procname, *lenp);
+ if (ret < 0)
+ return ret;
+ *bufp += ret;
+ *lenp -= ret;
+ ret += tmp_ret;
+
+ /* Avoid leading slash. */
+ if (!ret)
+ return ret;
+
+ tmp_ret = strscpy(*bufp, "/", *lenp);
+ if (tmp_ret < 0)
+ return tmp_ret;
+ *bufp += tmp_ret;
+ *lenp -= tmp_ret;
+
+ return ret + tmp_ret;
+}
+
+BPF_CALL_4(bpf_sysctl_get_name, struct bpf_sysctl_kern *, ctx, char *, buf,
+ size_t, buf_len, u64, flags)
+{
+ ssize_t tmp_ret = 0, ret;
+
+ if (!buf)
+ return -EINVAL;
+
+ if (!(flags & BPF_F_SYSCTL_BASE_NAME)) {
+ if (!ctx->head)
+ return -EINVAL;
+ tmp_ret = sysctl_cpy_dir(ctx->head->parent, &buf, &buf_len);
+ if (tmp_ret < 0)
+ return tmp_ret;
+ }
+
+ ret = strscpy(buf, ctx->table->procname, buf_len);
+
+ return ret < 0 ? ret : tmp_ret + ret;
+}
+
+static const struct bpf_func_proto bpf_sysctl_get_name_proto = {
+ .func = bpf_sysctl_get_name,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_WRITE,
+ .arg3_type = ARG_CONST_SIZE,
+ .arg4_type = ARG_ANYTHING,
+};
+
+static int copy_sysctl_value(char *dst, size_t dst_len, char *src,
+ size_t src_len)
+{
+ if (!dst)
+ return -EINVAL;
+
+ if (!dst_len)
+ return -E2BIG;
+
+ if (!src || !src_len) {
+ memset(dst, 0, dst_len);
+ return -EINVAL;
+ }
+
+ memcpy(dst, src, min(dst_len, src_len));
+
+ if (dst_len > src_len) {
+ memset(dst + src_len, '\0', dst_len - src_len);
+ return src_len;
+ }
+
+ dst[dst_len - 1] = '\0';
+
+ return -E2BIG;
+}
+
+BPF_CALL_3(bpf_sysctl_get_current_value, struct bpf_sysctl_kern *, ctx,
+ char *, buf, size_t, buf_len)
+{
+ return copy_sysctl_value(buf, buf_len, ctx->cur_val, ctx->cur_len);
+}
+
+static const struct bpf_func_proto bpf_sysctl_get_current_value_proto = {
+ .func = bpf_sysctl_get_current_value,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+};
+
+BPF_CALL_3(bpf_sysctl_get_new_value, struct bpf_sysctl_kern *, ctx, char *, buf,
+ size_t, buf_len)
+{
+ if (!ctx->write) {
+ if (buf && buf_len)
+ memset(buf, '\0', buf_len);
+ return -EINVAL;
+ }
+ return copy_sysctl_value(buf, buf_len, ctx->new_val, ctx->new_len);
+}
+
+static const struct bpf_func_proto bpf_sysctl_get_new_value_proto = {
+ .func = bpf_sysctl_get_new_value,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+};
+
+BPF_CALL_3(bpf_sysctl_set_new_value, struct bpf_sysctl_kern *, ctx,
+ const char *, buf, size_t, buf_len)
+{
+ if (!ctx->write || !ctx->new_val || !ctx->new_len || !buf || !buf_len)
+ return -EINVAL;
+
+ if (buf_len > PAGE_SIZE - 1)
+ return -E2BIG;
+
+ memcpy(ctx->new_val, buf, buf_len);
+ ctx->new_len = buf_len;
+ ctx->new_updated = 1;
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = {
+ .func = bpf_sysctl_set_new_value,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE,
+};
+
+static const struct bpf_func_proto *
+sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ const struct bpf_func_proto *func_proto;
+
+ func_proto = cgroup_common_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
+ switch (func_id) {
+ case BPF_FUNC_sysctl_get_name:
+ return &bpf_sysctl_get_name_proto;
+ case BPF_FUNC_sysctl_get_current_value:
+ return &bpf_sysctl_get_current_value_proto;
+ case BPF_FUNC_sysctl_get_new_value:
+ return &bpf_sysctl_get_new_value_proto;
+ case BPF_FUNC_sysctl_set_new_value:
+ return &bpf_sysctl_set_new_value_proto;
+ case BPF_FUNC_ktime_get_coarse_ns:
+ return &bpf_ktime_get_coarse_ns_proto;
+ case BPF_FUNC_perf_event_output:
+ return &bpf_event_output_data_proto;
+ default:
+ return bpf_base_func_proto(func_id, prog);
+ }
+}
+
+static bool sysctl_is_valid_access(int off, int size, enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ const int size_default = sizeof(__u32);
+
+ if (off < 0 || off + size > sizeof(struct bpf_sysctl) || off % size)
+ return false;
+
+ switch (off) {
+ case bpf_ctx_range(struct bpf_sysctl, write):
+ if (type != BPF_READ)
+ return false;
+ bpf_ctx_record_field_size(info, size_default);
+ return bpf_ctx_narrow_access_ok(off, size, size_default);
+ case bpf_ctx_range(struct bpf_sysctl, file_pos):
+ if (type == BPF_READ) {
+ bpf_ctx_record_field_size(info, size_default);
+ return bpf_ctx_narrow_access_ok(off, size, size_default);
+ } else {
+ return size == size_default;
+ }
+ default:
+ return false;
+ }
+}
+
+static u32 sysctl_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog, u32 *target_size)
+{
+ struct bpf_insn *insn = insn_buf;
+ u32 read_size;
+
+ switch (si->off) {
+ case offsetof(struct bpf_sysctl, write):
+ *insn++ = BPF_LDX_MEM(
+ BPF_SIZE(si->code), si->dst_reg, si->src_reg,
+ bpf_target_off(struct bpf_sysctl_kern, write,
+ sizeof_field(struct bpf_sysctl_kern,
+ write),
+ target_size));
+ break;
+ case offsetof(struct bpf_sysctl, file_pos):
+ /* ppos is a pointer so it should be accessed via indirect
+ * loads and stores. Also for stores additional temporary
+ * register is used since neither src_reg nor dst_reg can be
+ * overridden.
+ */
+ if (type == BPF_WRITE) {
+ int treg = BPF_REG_9;
+
+ if (si->src_reg == treg || si->dst_reg == treg)
+ --treg;
+ if (si->src_reg == treg || si->dst_reg == treg)
+ --treg;
+ *insn++ = BPF_STX_MEM(
+ BPF_DW, si->dst_reg, treg,
+ offsetof(struct bpf_sysctl_kern, tmp_reg));
+ *insn++ = BPF_LDX_MEM(
+ BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
+ treg, si->dst_reg,
+ offsetof(struct bpf_sysctl_kern, ppos));
+ *insn++ = BPF_RAW_INSN(
+ BPF_CLASS(si->code) | BPF_MEM | BPF_SIZEOF(u32),
+ treg, si->src_reg,
+ bpf_ctx_narrow_access_offset(
+ 0, sizeof(u32), sizeof(loff_t)),
+ si->imm);
+ *insn++ = BPF_LDX_MEM(
+ BPF_DW, treg, si->dst_reg,
+ offsetof(struct bpf_sysctl_kern, tmp_reg));
+ } else {
+ *insn++ = BPF_LDX_MEM(
+ BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sysctl_kern, ppos));
+ read_size = bpf_size_to_bytes(BPF_SIZE(si->code));
+ *insn++ = BPF_LDX_MEM(
+ BPF_SIZE(si->code), si->dst_reg, si->dst_reg,
+ bpf_ctx_narrow_access_offset(
+ 0, read_size, sizeof(loff_t)));
+ }
+ *target_size = sizeof(u32);
+ break;
+ }
+
+ return insn - insn_buf;
+}
+
+const struct bpf_verifier_ops cg_sysctl_verifier_ops = {
+ .get_func_proto = sysctl_func_proto,
+ .is_valid_access = sysctl_is_valid_access,
+ .convert_ctx_access = sysctl_convert_ctx_access,
+};
+
+const struct bpf_prog_ops cg_sysctl_prog_ops = {
+};
+
+#ifdef CONFIG_NET
+BPF_CALL_1(bpf_get_netns_cookie_sockopt, struct bpf_sockopt_kern *, ctx)
+{
+ const struct net *net = ctx ? sock_net(ctx->sk) : &init_net;
+
+ return net->net_cookie;
+}
+
+static const struct bpf_func_proto bpf_get_netns_cookie_sockopt_proto = {
+ .func = bpf_get_netns_cookie_sockopt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX_OR_NULL,
+};
+#endif
+
+static const struct bpf_func_proto *
+cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ const struct bpf_func_proto *func_proto;
+
+ func_proto = cgroup_common_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
+ switch (func_id) {
+#ifdef CONFIG_NET
+ case BPF_FUNC_get_netns_cookie:
+ return &bpf_get_netns_cookie_sockopt_proto;
+ case BPF_FUNC_sk_storage_get:
+ return &bpf_sk_storage_get_proto;
+ case BPF_FUNC_sk_storage_delete:
+ return &bpf_sk_storage_delete_proto;
+ case BPF_FUNC_setsockopt:
+ if (prog->expected_attach_type == BPF_CGROUP_SETSOCKOPT)
+ return &bpf_sk_setsockopt_proto;
+ return NULL;
+ case BPF_FUNC_getsockopt:
+ if (prog->expected_attach_type == BPF_CGROUP_SETSOCKOPT)
+ return &bpf_sk_getsockopt_proto;
+ return NULL;
+#endif
+#ifdef CONFIG_INET
+ case BPF_FUNC_tcp_sock:
+ return &bpf_tcp_sock_proto;
+#endif
+ case BPF_FUNC_perf_event_output:
+ return &bpf_event_output_data_proto;
+ default:
+ return bpf_base_func_proto(func_id, prog);
+ }
+}
+
+static bool cg_sockopt_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ const int size_default = sizeof(__u32);
+
+ if (off < 0 || off >= sizeof(struct bpf_sockopt))
+ return false;
+
+ if (off % size != 0)
+ return false;
+
+ if (type == BPF_WRITE) {
+ switch (off) {
+ case offsetof(struct bpf_sockopt, retval):
+ if (size != size_default)
+ return false;
+ return prog->expected_attach_type ==
+ BPF_CGROUP_GETSOCKOPT;
+ case offsetof(struct bpf_sockopt, optname):
+ fallthrough;
+ case offsetof(struct bpf_sockopt, level):
+ if (size != size_default)
+ return false;
+ return prog->expected_attach_type ==
+ BPF_CGROUP_SETSOCKOPT;
+ case offsetof(struct bpf_sockopt, optlen):
+ return size == size_default;
+ default:
+ return false;
+ }
+ }
+
+ switch (off) {
+ case bpf_ctx_range_ptr(struct bpf_sockopt, sk):
+ if (size != sizeof(__u64))
+ return false;
+ info->reg_type = PTR_TO_SOCKET;
+ break;
+ case bpf_ctx_range_ptr(struct bpf_sockopt, optval):
+ if (size != sizeof(__u64))
+ return false;
+ info->reg_type = PTR_TO_PACKET;
+ break;
+ case bpf_ctx_range_ptr(struct bpf_sockopt, optval_end):
+ if (size != sizeof(__u64))
+ return false;
+ info->reg_type = PTR_TO_PACKET_END;
+ break;
+ case bpf_ctx_range(struct bpf_sockopt, retval):
+ if (size != size_default)
+ return false;
+ return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT;
+ default:
+ if (size != size_default)
+ return false;
+ break;
+ }
+ return true;
+}
+
+#define CG_SOCKOPT_READ_FIELD(F) \
+ BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F), \
+ si->dst_reg, si->src_reg, \
+ offsetof(struct bpf_sockopt_kern, F))
+
+#define CG_SOCKOPT_WRITE_FIELD(F) \
+ BPF_RAW_INSN((BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F) | \
+ BPF_MEM | BPF_CLASS(si->code)), \
+ si->dst_reg, si->src_reg, \
+ offsetof(struct bpf_sockopt_kern, F), \
+ si->imm)
+
+static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog,
+ u32 *target_size)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ switch (si->off) {
+ case offsetof(struct bpf_sockopt, sk):
+ *insn++ = CG_SOCKOPT_READ_FIELD(sk);
+ break;
+ case offsetof(struct bpf_sockopt, level):
+ if (type == BPF_WRITE)
+ *insn++ = CG_SOCKOPT_WRITE_FIELD(level);
+ else
+ *insn++ = CG_SOCKOPT_READ_FIELD(level);
+ break;
+ case offsetof(struct bpf_sockopt, optname):
+ if (type == BPF_WRITE)
+ *insn++ = CG_SOCKOPT_WRITE_FIELD(optname);
+ else
+ *insn++ = CG_SOCKOPT_READ_FIELD(optname);
+ break;
+ case offsetof(struct bpf_sockopt, optlen):
+ if (type == BPF_WRITE)
+ *insn++ = CG_SOCKOPT_WRITE_FIELD(optlen);
+ else
+ *insn++ = CG_SOCKOPT_READ_FIELD(optlen);
+ break;
+ case offsetof(struct bpf_sockopt, retval):
+ BUILD_BUG_ON(offsetof(struct bpf_cg_run_ctx, run_ctx) != 0);
+
+ if (type == BPF_WRITE) {
+ int treg = BPF_REG_9;
+
+ if (si->src_reg == treg || si->dst_reg == treg)
+ --treg;
+ if (si->src_reg == treg || si->dst_reg == treg)
+ --treg;
+ *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, treg,
+ offsetof(struct bpf_sockopt_kern, tmp_reg));
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, current_task),
+ treg, si->dst_reg,
+ offsetof(struct bpf_sockopt_kern, current_task));
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx),
+ treg, treg,
+ offsetof(struct task_struct, bpf_ctx));
+ *insn++ = BPF_RAW_INSN(BPF_CLASS(si->code) | BPF_MEM |
+ BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval),
+ treg, si->src_reg,
+ offsetof(struct bpf_cg_run_ctx, retval),
+ si->imm);
+ *insn++ = BPF_LDX_MEM(BPF_DW, treg, si->dst_reg,
+ offsetof(struct bpf_sockopt_kern, tmp_reg));
+ } else {
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, current_task),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sockopt_kern, current_task));
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx),
+ si->dst_reg, si->dst_reg,
+ offsetof(struct task_struct, bpf_ctx));
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval),
+ si->dst_reg, si->dst_reg,
+ offsetof(struct bpf_cg_run_ctx, retval));
+ }
+ break;
+ case offsetof(struct bpf_sockopt, optval):
+ *insn++ = CG_SOCKOPT_READ_FIELD(optval);
+ break;
+ case offsetof(struct bpf_sockopt, optval_end):
+ *insn++ = CG_SOCKOPT_READ_FIELD(optval_end);
+ break;
+ }
+
+ return insn - insn_buf;
+}
+
+static int cg_sockopt_get_prologue(struct bpf_insn *insn_buf,
+ bool direct_write,
+ const struct bpf_prog *prog)
+{
+ /* Nothing to do for sockopt argument. The data is kzalloc'ated.
+ */
+ return 0;
+}
+
+const struct bpf_verifier_ops cg_sockopt_verifier_ops = {
+ .get_func_proto = cg_sockopt_func_proto,
+ .is_valid_access = cg_sockopt_is_valid_access,
+ .convert_ctx_access = cg_sockopt_convert_ctx_access,
+ .gen_prologue = cg_sockopt_get_prologue,
+};
+
+const struct bpf_prog_ops cg_sockopt_prog_ops = {
+};
+
+/* Common helpers for cgroup hooks. */
+const struct bpf_func_proto *
+cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_get_local_storage:
+ return &bpf_get_local_storage_proto;
+ case BPF_FUNC_get_retval:
+ switch (prog->expected_attach_type) {
+ case BPF_CGROUP_INET_INGRESS:
+ case BPF_CGROUP_INET_EGRESS:
+ case BPF_CGROUP_SOCK_OPS:
+ case BPF_CGROUP_UDP4_RECVMSG:
+ case BPF_CGROUP_UDP6_RECVMSG:
+ case BPF_CGROUP_UNIX_RECVMSG:
+ case BPF_CGROUP_INET4_GETPEERNAME:
+ case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_UNIX_GETPEERNAME:
+ case BPF_CGROUP_INET4_GETSOCKNAME:
+ case BPF_CGROUP_INET6_GETSOCKNAME:
+ case BPF_CGROUP_UNIX_GETSOCKNAME:
+ return NULL;
+ default:
+ return &bpf_get_retval_proto;
+ }
+ case BPF_FUNC_set_retval:
+ switch (prog->expected_attach_type) {
+ case BPF_CGROUP_INET_INGRESS:
+ case BPF_CGROUP_INET_EGRESS:
+ case BPF_CGROUP_SOCK_OPS:
+ case BPF_CGROUP_UDP4_RECVMSG:
+ case BPF_CGROUP_UDP6_RECVMSG:
+ case BPF_CGROUP_UNIX_RECVMSG:
+ case BPF_CGROUP_INET4_GETPEERNAME:
+ case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_UNIX_GETPEERNAME:
+ case BPF_CGROUP_INET4_GETSOCKNAME:
+ case BPF_CGROUP_INET6_GETSOCKNAME:
+ case BPF_CGROUP_UNIX_GETSOCKNAME:
+ return NULL;
+ default:
+ return &bpf_set_retval_proto;
+ }
+ default:
+ return NULL;
+ }
+}
diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c
new file mode 100644
index 000000000000..f04a468cf6a7
--- /dev/null
+++ b/kernel/bpf/cgroup_iter.c
@@ -0,0 +1,359 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2022 Google */
+#include <linux/bpf.h>
+#include <linux/btf_ids.h>
+#include <linux/cgroup.h>
+#include <linux/kernel.h>
+#include <linux/seq_file.h>
+
+#include "../cgroup/cgroup-internal.h" /* cgroup_mutex and cgroup_is_dead */
+
+/* cgroup_iter provides four modes of traversal to the cgroup hierarchy.
+ *
+ * 1. Walk the descendants of a cgroup in pre-order.
+ * 2. Walk the descendants of a cgroup in post-order.
+ * 3. Walk the ancestors of a cgroup.
+ * 4. Show the given cgroup only.
+ *
+ * For walking descendants, cgroup_iter can walk in either pre-order or
+ * post-order. For walking ancestors, the iter walks up from a cgroup to
+ * the root.
+ *
+ * The iter program can terminate the walk early by returning 1. Walk
+ * continues if prog returns 0.
+ *
+ * The prog can check (seq->num == 0) to determine whether this is
+ * the first element. The prog may also be passed a NULL cgroup,
+ * which means the walk has completed and the prog has a chance to
+ * do post-processing, such as outputting an epilogue.
+ *
+ * Note: the iter_prog is called with cgroup_mutex held.
+ *
+ * Currently only one session is supported, which means, depending on the
+ * volume of data bpf program intends to send to user space, the number
+ * of cgroups that can be walked is limited. For example, given the current
+ * buffer size is 8 * PAGE_SIZE, if the program sends 64B data for each
+ * cgroup, assuming PAGE_SIZE is 4kb, the total number of cgroups that can
+ * be walked is 512. This is a limitation of cgroup_iter. If the output data
+ * is larger than the kernel buffer size, after all data in the kernel buffer
+ * is consumed by user space, the subsequent read() syscall will signal
+ * EOPNOTSUPP. In order to work around, the user may have to update their
+ * program to reduce the volume of data sent to output. For example, skip
+ * some uninteresting cgroups.
+ */
+
+struct bpf_iter__cgroup {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct cgroup *, cgroup);
+};
+
+struct cgroup_iter_priv {
+ struct cgroup_subsys_state *start_css;
+ bool visited_all;
+ bool terminate;
+ int order;
+};
+
+static void *cgroup_iter_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct cgroup_iter_priv *p = seq->private;
+
+ cgroup_lock();
+
+ /* cgroup_iter doesn't support read across multiple sessions. */
+ if (*pos > 0) {
+ if (p->visited_all)
+ return NULL;
+
+ /* Haven't visited all, but because cgroup_mutex has dropped,
+ * return -EOPNOTSUPP to indicate incomplete iteration.
+ */
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+
+ ++*pos;
+ p->terminate = false;
+ p->visited_all = false;
+ if (p->order == BPF_CGROUP_ITER_DESCENDANTS_PRE)
+ return css_next_descendant_pre(NULL, p->start_css);
+ else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST)
+ return css_next_descendant_post(NULL, p->start_css);
+ else /* BPF_CGROUP_ITER_SELF_ONLY and BPF_CGROUP_ITER_ANCESTORS_UP */
+ return p->start_css;
+}
+
+static int __cgroup_iter_seq_show(struct seq_file *seq,
+ struct cgroup_subsys_state *css, int in_stop);
+
+static void cgroup_iter_seq_stop(struct seq_file *seq, void *v)
+{
+ struct cgroup_iter_priv *p = seq->private;
+
+ cgroup_unlock();
+
+ /* pass NULL to the prog for post-processing */
+ if (!v) {
+ __cgroup_iter_seq_show(seq, NULL, true);
+ p->visited_all = true;
+ }
+}
+
+static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct cgroup_subsys_state *curr = (struct cgroup_subsys_state *)v;
+ struct cgroup_iter_priv *p = seq->private;
+
+ ++*pos;
+ if (p->terminate)
+ return NULL;
+
+ if (p->order == BPF_CGROUP_ITER_DESCENDANTS_PRE)
+ return css_next_descendant_pre(curr, p->start_css);
+ else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST)
+ return css_next_descendant_post(curr, p->start_css);
+ else if (p->order == BPF_CGROUP_ITER_ANCESTORS_UP)
+ return curr->parent;
+ else /* BPF_CGROUP_ITER_SELF_ONLY */
+ return NULL;
+}
+
+static int __cgroup_iter_seq_show(struct seq_file *seq,
+ struct cgroup_subsys_state *css, int in_stop)
+{
+ struct cgroup_iter_priv *p = seq->private;
+ struct bpf_iter__cgroup ctx;
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+ int ret = 0;
+
+ /* cgroup is dead, skip this element */
+ if (css && cgroup_is_dead(css->cgroup))
+ return 0;
+
+ ctx.meta = &meta;
+ ctx.cgroup = css ? css->cgroup : NULL;
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, in_stop);
+ if (prog)
+ ret = bpf_iter_run_prog(prog, &ctx);
+
+ /* if prog returns > 0, terminate after this element. */
+ if (ret != 0)
+ p->terminate = true;
+
+ return 0;
+}
+
+static int cgroup_iter_seq_show(struct seq_file *seq, void *v)
+{
+ return __cgroup_iter_seq_show(seq, (struct cgroup_subsys_state *)v,
+ false);
+}
+
+static const struct seq_operations cgroup_iter_seq_ops = {
+ .start = cgroup_iter_seq_start,
+ .next = cgroup_iter_seq_next,
+ .stop = cgroup_iter_seq_stop,
+ .show = cgroup_iter_seq_show,
+};
+
+BTF_ID_LIST_GLOBAL_SINGLE(bpf_cgroup_btf_id, struct, cgroup)
+
+static int cgroup_iter_seq_init(void *priv, struct bpf_iter_aux_info *aux)
+{
+ struct cgroup_iter_priv *p = (struct cgroup_iter_priv *)priv;
+ struct cgroup *cgrp = aux->cgroup.start;
+
+ /* bpf_iter_attach_cgroup() has already acquired an extra reference
+ * for the start cgroup, but the reference may be released after
+ * cgroup_iter_seq_init(), so acquire another reference for the
+ * start cgroup.
+ */
+ p->start_css = &cgrp->self;
+ css_get(p->start_css);
+ p->terminate = false;
+ p->visited_all = false;
+ p->order = aux->cgroup.order;
+ return 0;
+}
+
+static void cgroup_iter_seq_fini(void *priv)
+{
+ struct cgroup_iter_priv *p = (struct cgroup_iter_priv *)priv;
+
+ css_put(p->start_css);
+}
+
+static const struct bpf_iter_seq_info cgroup_iter_seq_info = {
+ .seq_ops = &cgroup_iter_seq_ops,
+ .init_seq_private = cgroup_iter_seq_init,
+ .fini_seq_private = cgroup_iter_seq_fini,
+ .seq_priv_size = sizeof(struct cgroup_iter_priv),
+};
+
+static int bpf_iter_attach_cgroup(struct bpf_prog *prog,
+ union bpf_iter_link_info *linfo,
+ struct bpf_iter_aux_info *aux)
+{
+ int fd = linfo->cgroup.cgroup_fd;
+ u64 id = linfo->cgroup.cgroup_id;
+ int order = linfo->cgroup.order;
+ struct cgroup *cgrp;
+
+ if (order != BPF_CGROUP_ITER_DESCENDANTS_PRE &&
+ order != BPF_CGROUP_ITER_DESCENDANTS_POST &&
+ order != BPF_CGROUP_ITER_ANCESTORS_UP &&
+ order != BPF_CGROUP_ITER_SELF_ONLY)
+ return -EINVAL;
+
+ if (fd && id)
+ return -EINVAL;
+
+ if (fd)
+ cgrp = cgroup_v1v2_get_from_fd(fd);
+ else if (id)
+ cgrp = cgroup_get_from_id(id);
+ else /* walk the entire hierarchy by default. */
+ cgrp = cgroup_get_from_path("/");
+
+ if (IS_ERR(cgrp))
+ return PTR_ERR(cgrp);
+
+ aux->cgroup.start = cgrp;
+ aux->cgroup.order = order;
+ return 0;
+}
+
+static void bpf_iter_detach_cgroup(struct bpf_iter_aux_info *aux)
+{
+ cgroup_put(aux->cgroup.start);
+}
+
+static void bpf_iter_cgroup_show_fdinfo(const struct bpf_iter_aux_info *aux,
+ struct seq_file *seq)
+{
+ char *buf;
+
+ buf = kzalloc(PATH_MAX, GFP_KERNEL);
+ if (!buf) {
+ seq_puts(seq, "cgroup_path:\t<unknown>\n");
+ goto show_order;
+ }
+
+ /* If cgroup_path_ns() fails, buf will be an empty string, cgroup_path
+ * will print nothing.
+ *
+ * Path is in the calling process's cgroup namespace.
+ */
+ cgroup_path_ns(aux->cgroup.start, buf, PATH_MAX,
+ current->nsproxy->cgroup_ns);
+ seq_printf(seq, "cgroup_path:\t%s\n", buf);
+ kfree(buf);
+
+show_order:
+ if (aux->cgroup.order == BPF_CGROUP_ITER_DESCENDANTS_PRE)
+ seq_puts(seq, "order: descendants_pre\n");
+ else if (aux->cgroup.order == BPF_CGROUP_ITER_DESCENDANTS_POST)
+ seq_puts(seq, "order: descendants_post\n");
+ else if (aux->cgroup.order == BPF_CGROUP_ITER_ANCESTORS_UP)
+ seq_puts(seq, "order: ancestors_up\n");
+ else /* BPF_CGROUP_ITER_SELF_ONLY */
+ seq_puts(seq, "order: self_only\n");
+}
+
+static int bpf_iter_cgroup_fill_link_info(const struct bpf_iter_aux_info *aux,
+ struct bpf_link_info *info)
+{
+ info->iter.cgroup.order = aux->cgroup.order;
+ info->iter.cgroup.cgroup_id = cgroup_id(aux->cgroup.start);
+ return 0;
+}
+
+DEFINE_BPF_ITER_FUNC(cgroup, struct bpf_iter_meta *meta,
+ struct cgroup *cgroup)
+
+static struct bpf_iter_reg bpf_cgroup_reg_info = {
+ .target = "cgroup",
+ .feature = BPF_ITER_RESCHED,
+ .attach_target = bpf_iter_attach_cgroup,
+ .detach_target = bpf_iter_detach_cgroup,
+ .show_fdinfo = bpf_iter_cgroup_show_fdinfo,
+ .fill_link_info = bpf_iter_cgroup_fill_link_info,
+ .ctx_arg_info_size = 1,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__cgroup, cgroup),
+ PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
+ },
+ .seq_info = &cgroup_iter_seq_info,
+};
+
+static int __init bpf_cgroup_iter_init(void)
+{
+ bpf_cgroup_reg_info.ctx_arg_info[0].btf_id = bpf_cgroup_btf_id[0];
+ return bpf_iter_reg_target(&bpf_cgroup_reg_info);
+}
+
+late_initcall(bpf_cgroup_iter_init);
+
+struct bpf_iter_css {
+ __u64 __opaque[3];
+} __attribute__((aligned(8)));
+
+struct bpf_iter_css_kern {
+ struct cgroup_subsys_state *start;
+ struct cgroup_subsys_state *pos;
+ unsigned int flags;
+} __attribute__((aligned(8)));
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc int bpf_iter_css_new(struct bpf_iter_css *it,
+ struct cgroup_subsys_state *start, unsigned int flags)
+{
+ struct bpf_iter_css_kern *kit = (void *)it;
+
+ BUILD_BUG_ON(sizeof(struct bpf_iter_css_kern) > sizeof(struct bpf_iter_css));
+ BUILD_BUG_ON(__alignof__(struct bpf_iter_css_kern) != __alignof__(struct bpf_iter_css));
+
+ kit->start = NULL;
+ switch (flags) {
+ case BPF_CGROUP_ITER_DESCENDANTS_PRE:
+ case BPF_CGROUP_ITER_DESCENDANTS_POST:
+ case BPF_CGROUP_ITER_ANCESTORS_UP:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ kit->start = start;
+ kit->pos = NULL;
+ kit->flags = flags;
+ return 0;
+}
+
+__bpf_kfunc struct cgroup_subsys_state *bpf_iter_css_next(struct bpf_iter_css *it)
+{
+ struct bpf_iter_css_kern *kit = (void *)it;
+
+ if (!kit->start)
+ return NULL;
+
+ switch (kit->flags) {
+ case BPF_CGROUP_ITER_DESCENDANTS_PRE:
+ kit->pos = css_next_descendant_pre(kit->pos, kit->start);
+ break;
+ case BPF_CGROUP_ITER_DESCENDANTS_POST:
+ kit->pos = css_next_descendant_post(kit->pos, kit->start);
+ break;
+ case BPF_CGROUP_ITER_ANCESTORS_UP:
+ kit->pos = kit->pos ? kit->pos->parent : kit->start;
+ }
+
+ return kit->pos;
+}
+
+__bpf_kfunc void bpf_iter_css_destroy(struct bpf_iter_css *it)
+{
+}
+
+__bpf_kfunc_end_defs();
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 54f0e7fcd0e2..c8ae6ab31651 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Linux Socket Filter - Kernel level socket filtering
*
@@ -12,22 +13,36 @@
* Alexei Starovoitov <ast@plumgrid.com>
* Daniel Borkmann <dborkman@redhat.com>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Andi Kleen - Fix a few bad bugs and races.
* Kris Katterjohn - Added many additional checks in bpf_check_classic()
*/
+#include <uapi/linux/btf.h>
+#include <crypto/sha1.h>
#include <linux/filter.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
-#include <linux/random.h>
-#include <linux/moduleloader.h>
-#include <asm/unaligned.h>
+#include <linux/prandom.h>
#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/objtool.h>
+#include <linux/overflow.h>
+#include <linux/rbtree_latch.h>
+#include <linux/kallsyms.h>
+#include <linux/rcupdate.h>
+#include <linux/perf_event.h>
+#include <linux/extable.h>
+#include <linux/log2.h>
+#include <linux/bpf_verifier.h>
+#include <linux/nodemask.h>
+#include <linux/nospec.h>
+#include <linux/bpf_mem_alloc.h>
+#include <linux/memcontrol.h>
+#include <linux/execmem.h>
+#include <crypto/sha2.h>
+
+#include <asm/barrier.h>
+#include <linux/unaligned.h>
/* Registers */
#define BPF_R0 regs[BPF_REG_0]
@@ -46,10 +61,15 @@
#define DST regs[insn->dst_reg]
#define SRC regs[insn->src_reg]
#define FP regs[BPF_REG_FP]
+#define AX regs[BPF_REG_AX]
#define ARG1 regs[BPF_REG_ARG1]
#define CTX regs[BPF_REG_CTX]
+#define OFF insn->off
#define IMM insn->imm
+struct bpf_mem_alloc bpf_global_ma;
+bool bpf_global_ma_set;
+
/* No hurry in this branch
*
* Exported for the bpf jit load helper.
@@ -58,102 +78,1012 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns
{
u8 *ptr = NULL;
- if (k >= SKF_NET_OFF)
+ if (k >= SKF_NET_OFF) {
ptr = skb_network_header(skb) + k - SKF_NET_OFF;
- else if (k >= SKF_LL_OFF)
+ } else if (k >= SKF_LL_OFF) {
+ if (unlikely(!skb_mac_header_was_set(skb)))
+ return NULL;
ptr = skb_mac_header(skb) + k - SKF_LL_OFF;
+ }
if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb))
return ptr;
return NULL;
}
-struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
+/* tell bpf programs that include vmlinux.h kernel's PAGE_SIZE */
+enum page_size_enum {
+ __PAGE_SIZE = PAGE_SIZE
+};
+
+struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags)
{
- gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO |
- gfp_extra_flags;
+ gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);
struct bpf_prog_aux *aux;
struct bpf_prog *fp;
- size = round_up(size, PAGE_SIZE);
- fp = __vmalloc(size, gfp_flags, PAGE_KERNEL);
+ size = round_up(size, __PAGE_SIZE);
+ fp = __vmalloc(size, gfp_flags);
if (fp == NULL)
return NULL;
- aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags);
+ aux = kzalloc(sizeof(*aux), bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags));
if (aux == NULL) {
vfree(fp);
return NULL;
}
+ fp->active = alloc_percpu_gfp(int, bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags));
+ if (!fp->active) {
+ vfree(fp);
+ kfree(aux);
+ return NULL;
+ }
fp->pages = size / PAGE_SIZE;
fp->aux = aux;
+ fp->aux->main_prog_aux = aux;
+ fp->aux->prog = fp;
+ fp->jit_requested = ebpf_jit_enabled();
+ fp->blinding_requested = bpf_jit_blinding_enabled(fp);
+#ifdef CONFIG_CGROUP_BPF
+ aux->cgroup_atype = CGROUP_BPF_ATTACH_TYPE_INVALID;
+#endif
+
+ INIT_LIST_HEAD_RCU(&fp->aux->ksym.lnode);
+#ifdef CONFIG_FINEIBT
+ INIT_LIST_HEAD_RCU(&fp->aux->ksym_prefix.lnode);
+#endif
+ mutex_init(&fp->aux->used_maps_mutex);
+ mutex_init(&fp->aux->ext_mutex);
+ mutex_init(&fp->aux->dst_mutex);
+
+#ifdef CONFIG_BPF_SYSCALL
+ bpf_prog_stream_init(fp);
+#endif
return fp;
}
+
+struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
+{
+ gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);
+ struct bpf_prog *prog;
+ int cpu;
+
+ prog = bpf_prog_alloc_no_stats(size, gfp_extra_flags);
+ if (!prog)
+ return NULL;
+
+ prog->stats = alloc_percpu_gfp(struct bpf_prog_stats, gfp_flags);
+ if (!prog->stats) {
+ free_percpu(prog->active);
+ kfree(prog->aux);
+ vfree(prog);
+ return NULL;
+ }
+
+ for_each_possible_cpu(cpu) {
+ struct bpf_prog_stats *pstats;
+
+ pstats = per_cpu_ptr(prog->stats, cpu);
+ u64_stats_init(&pstats->syncp);
+ }
+ return prog;
+}
EXPORT_SYMBOL_GPL(bpf_prog_alloc);
+int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog)
+{
+ if (!prog->aux->nr_linfo || !prog->jit_requested)
+ return 0;
+
+ prog->aux->jited_linfo = kvcalloc(prog->aux->nr_linfo,
+ sizeof(*prog->aux->jited_linfo),
+ bpf_memcg_flags(GFP_KERNEL | __GFP_NOWARN));
+ if (!prog->aux->jited_linfo)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void bpf_prog_jit_attempt_done(struct bpf_prog *prog)
+{
+ if (prog->aux->jited_linfo &&
+ (!prog->jited || !prog->aux->jited_linfo[0])) {
+ kvfree(prog->aux->jited_linfo);
+ prog->aux->jited_linfo = NULL;
+ }
+
+ kfree(prog->aux->kfunc_tab);
+ prog->aux->kfunc_tab = NULL;
+}
+
+/* The jit engine is responsible to provide an array
+ * for insn_off to the jited_off mapping (insn_to_jit_off).
+ *
+ * The idx to this array is the insn_off. Hence, the insn_off
+ * here is relative to the prog itself instead of the main prog.
+ * This array has one entry for each xlated bpf insn.
+ *
+ * jited_off is the byte off to the end of the jited insn.
+ *
+ * Hence, with
+ * insn_start:
+ * The first bpf insn off of the prog. The insn off
+ * here is relative to the main prog.
+ * e.g. if prog is a subprog, insn_start > 0
+ * linfo_idx:
+ * The prog's idx to prog->aux->linfo and jited_linfo
+ *
+ * jited_linfo[linfo_idx] = prog->bpf_func
+ *
+ * For i > linfo_idx,
+ *
+ * jited_linfo[i] = prog->bpf_func +
+ * insn_to_jit_off[linfo[i].insn_off - insn_start - 1]
+ */
+void bpf_prog_fill_jited_linfo(struct bpf_prog *prog,
+ const u32 *insn_to_jit_off)
+{
+ u32 linfo_idx, insn_start, insn_end, nr_linfo, i;
+ const struct bpf_line_info *linfo;
+ void **jited_linfo;
+
+ if (!prog->aux->jited_linfo || prog->aux->func_idx > prog->aux->func_cnt)
+ /* Userspace did not provide linfo */
+ return;
+
+ linfo_idx = prog->aux->linfo_idx;
+ linfo = &prog->aux->linfo[linfo_idx];
+ insn_start = linfo[0].insn_off;
+ insn_end = insn_start + prog->len;
+
+ jited_linfo = &prog->aux->jited_linfo[linfo_idx];
+ jited_linfo[0] = prog->bpf_func;
+
+ nr_linfo = prog->aux->nr_linfo - linfo_idx;
+
+ for (i = 1; i < nr_linfo && linfo[i].insn_off < insn_end; i++)
+ /* The verifier ensures that linfo[i].insn_off is
+ * strictly increasing
+ */
+ jited_linfo[i] = prog->bpf_func +
+ insn_to_jit_off[linfo[i].insn_off - insn_start - 1];
+}
+
struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
gfp_t gfp_extra_flags)
{
- gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO |
- gfp_extra_flags;
+ gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);
struct bpf_prog *fp;
-
- BUG_ON(fp_old == NULL);
+ u32 pages;
size = round_up(size, PAGE_SIZE);
- if (size <= fp_old->pages * PAGE_SIZE)
+ pages = size / PAGE_SIZE;
+ if (pages <= fp_old->pages)
return fp_old;
- fp = __vmalloc(size, gfp_flags, PAGE_KERNEL);
- if (fp != NULL) {
+ fp = __vmalloc(size, gfp_flags);
+ if (fp) {
memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE);
- fp->pages = size / PAGE_SIZE;
+ fp->pages = pages;
+ fp->aux->prog = fp;
/* We keep fp->aux from fp_old around in the new
* reallocated structure.
*/
fp_old->aux = NULL;
+ fp_old->stats = NULL;
+ fp_old->active = NULL;
__bpf_prog_free(fp_old);
}
return fp;
}
-EXPORT_SYMBOL_GPL(bpf_prog_realloc);
void __bpf_prog_free(struct bpf_prog *fp)
{
- kfree(fp->aux);
+ if (fp->aux) {
+ mutex_destroy(&fp->aux->used_maps_mutex);
+ mutex_destroy(&fp->aux->dst_mutex);
+ kfree(fp->aux->poke_tab);
+ kfree(fp->aux);
+ }
+ free_percpu(fp->stats);
+ free_percpu(fp->active);
vfree(fp);
}
-EXPORT_SYMBOL_GPL(__bpf_prog_free);
+
+int bpf_prog_calc_tag(struct bpf_prog *fp)
+{
+ size_t size = bpf_prog_insn_size(fp);
+ struct bpf_insn *dst;
+ bool was_ld_map;
+ u32 i;
+
+ dst = vmalloc(size);
+ if (!dst)
+ return -ENOMEM;
+
+ /* We need to take out the map fd for the digest calculation
+ * since they are unstable from user space side.
+ */
+ for (i = 0, was_ld_map = false; i < fp->len; i++) {
+ dst[i] = fp->insnsi[i];
+ if (!was_ld_map &&
+ dst[i].code == (BPF_LD | BPF_IMM | BPF_DW) &&
+ (dst[i].src_reg == BPF_PSEUDO_MAP_FD ||
+ dst[i].src_reg == BPF_PSEUDO_MAP_VALUE)) {
+ was_ld_map = true;
+ dst[i].imm = 0;
+ } else if (was_ld_map &&
+ dst[i].code == 0 &&
+ dst[i].dst_reg == 0 &&
+ dst[i].src_reg == 0 &&
+ dst[i].off == 0) {
+ was_ld_map = false;
+ dst[i].imm = 0;
+ } else {
+ was_ld_map = false;
+ }
+ }
+ sha256((u8 *)dst, size, fp->digest);
+ vfree(dst);
+ return 0;
+}
+
+static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, s32 end_old,
+ s32 end_new, s32 curr, const bool probe_pass)
+{
+ const s64 imm_min = S32_MIN, imm_max = S32_MAX;
+ s32 delta = end_new - end_old;
+ s64 imm = insn->imm;
+
+ if (curr < pos && curr + imm + 1 >= end_old)
+ imm += delta;
+ else if (curr >= end_new && curr + imm + 1 < end_new)
+ imm -= delta;
+ if (imm < imm_min || imm > imm_max)
+ return -ERANGE;
+ if (!probe_pass)
+ insn->imm = imm;
+ return 0;
+}
+
+static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old,
+ s32 end_new, s32 curr, const bool probe_pass)
+{
+ s64 off_min, off_max, off;
+ s32 delta = end_new - end_old;
+
+ if (insn->code == (BPF_JMP32 | BPF_JA)) {
+ off = insn->imm;
+ off_min = S32_MIN;
+ off_max = S32_MAX;
+ } else {
+ off = insn->off;
+ off_min = S16_MIN;
+ off_max = S16_MAX;
+ }
+
+ if (curr < pos && curr + off + 1 >= end_old)
+ off += delta;
+ else if (curr >= end_new && curr + off + 1 < end_new)
+ off -= delta;
+ if (off < off_min || off > off_max)
+ return -ERANGE;
+ if (!probe_pass) {
+ if (insn->code == (BPF_JMP32 | BPF_JA))
+ insn->imm = off;
+ else
+ insn->off = off;
+ }
+ return 0;
+}
+
+static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, s32 end_old,
+ s32 end_new, const bool probe_pass)
+{
+ u32 i, insn_cnt = prog->len + (probe_pass ? end_new - end_old : 0);
+ struct bpf_insn *insn = prog->insnsi;
+ int ret = 0;
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ u8 code;
+
+ /* In the probing pass we still operate on the original,
+ * unpatched image in order to check overflows before we
+ * do any other adjustments. Therefore skip the patchlet.
+ */
+ if (probe_pass && i == pos) {
+ i = end_new;
+ insn = prog->insnsi + end_old;
+ }
+ if (bpf_pseudo_func(insn)) {
+ ret = bpf_adj_delta_to_imm(insn, pos, end_old,
+ end_new, i, probe_pass);
+ if (ret)
+ return ret;
+ continue;
+ }
+ code = insn->code;
+ if ((BPF_CLASS(code) != BPF_JMP &&
+ BPF_CLASS(code) != BPF_JMP32) ||
+ BPF_OP(code) == BPF_EXIT)
+ continue;
+ /* Adjust offset of jmps if we cross patch boundaries. */
+ if (BPF_OP(code) == BPF_CALL) {
+ if (insn->src_reg != BPF_PSEUDO_CALL)
+ continue;
+ ret = bpf_adj_delta_to_imm(insn, pos, end_old,
+ end_new, i, probe_pass);
+ } else {
+ ret = bpf_adj_delta_to_off(insn, pos, end_old,
+ end_new, i, probe_pass);
+ }
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+static void bpf_adj_linfo(struct bpf_prog *prog, u32 off, u32 delta)
+{
+ struct bpf_line_info *linfo;
+ u32 i, nr_linfo;
+
+ nr_linfo = prog->aux->nr_linfo;
+ if (!nr_linfo || !delta)
+ return;
+
+ linfo = prog->aux->linfo;
+
+ for (i = 0; i < nr_linfo; i++)
+ if (off < linfo[i].insn_off)
+ break;
+
+ /* Push all off < linfo[i].insn_off by delta */
+ for (; i < nr_linfo; i++)
+ linfo[i].insn_off += delta;
+}
+
+struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
+ const struct bpf_insn *patch, u32 len)
+{
+ u32 insn_adj_cnt, insn_rest, insn_delta = len - 1;
+ const u32 cnt_max = S16_MAX;
+ struct bpf_prog *prog_adj;
+ int err;
+
+ /* Since our patchlet doesn't expand the image, we're done. */
+ if (insn_delta == 0) {
+ memcpy(prog->insnsi + off, patch, sizeof(*patch));
+ return prog;
+ }
+
+ insn_adj_cnt = prog->len + insn_delta;
+
+ /* Reject anything that would potentially let the insn->off
+ * target overflow when we have excessive program expansions.
+ * We need to probe here before we do any reallocation where
+ * we afterwards may not fail anymore.
+ */
+ if (insn_adj_cnt > cnt_max &&
+ (err = bpf_adj_branches(prog, off, off + 1, off + len, true)))
+ return ERR_PTR(err);
+
+ /* Several new instructions need to be inserted. Make room
+ * for them. Likely, there's no need for a new allocation as
+ * last page could have large enough tailroom.
+ */
+ prog_adj = bpf_prog_realloc(prog, bpf_prog_size(insn_adj_cnt),
+ GFP_USER);
+ if (!prog_adj)
+ return ERR_PTR(-ENOMEM);
+
+ prog_adj->len = insn_adj_cnt;
+
+ /* Patching happens in 3 steps:
+ *
+ * 1) Move over tail of insnsi from next instruction onwards,
+ * so we can patch the single target insn with one or more
+ * new ones (patching is always from 1 to n insns, n > 0).
+ * 2) Inject new instructions at the target location.
+ * 3) Adjust branch offsets if necessary.
+ */
+ insn_rest = insn_adj_cnt - off - len;
+
+ memmove(prog_adj->insnsi + off + len, prog_adj->insnsi + off + 1,
+ sizeof(*patch) * insn_rest);
+ memcpy(prog_adj->insnsi + off, patch, sizeof(*patch) * len);
+
+ /* We are guaranteed to not fail at this point, otherwise
+ * the ship has sailed to reverse to the original state. An
+ * overflow cannot happen at this point.
+ */
+ BUG_ON(bpf_adj_branches(prog_adj, off, off + 1, off + len, false));
+
+ bpf_adj_linfo(prog_adj, off, insn_delta);
+
+ return prog_adj;
+}
+
+int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt)
+{
+ int err;
+
+ /* Branch offsets can't overflow when program is shrinking, no need
+ * to call bpf_adj_branches(..., true) here
+ */
+ memmove(prog->insnsi + off, prog->insnsi + off + cnt,
+ sizeof(struct bpf_insn) * (prog->len - off - cnt));
+ prog->len -= cnt;
+
+ err = bpf_adj_branches(prog, off, off + cnt, off, false);
+ WARN_ON_ONCE(err);
+ return err;
+}
+
+static void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp)
+{
+ int i;
+
+ for (i = 0; i < fp->aux->real_func_cnt; i++)
+ bpf_prog_kallsyms_del(fp->aux->func[i]);
+}
+
+void bpf_prog_kallsyms_del_all(struct bpf_prog *fp)
+{
+ bpf_prog_kallsyms_del_subprogs(fp);
+ bpf_prog_kallsyms_del(fp);
+}
#ifdef CONFIG_BPF_JIT
+/* All BPF JIT sysctl knobs here. */
+int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON);
+int bpf_jit_kallsyms __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON);
+int bpf_jit_harden __read_mostly;
+long bpf_jit_limit __read_mostly;
+long bpf_jit_limit_max __read_mostly;
+
+static void
+bpf_prog_ksym_set_addr(struct bpf_prog *prog)
+{
+ WARN_ON_ONCE(!bpf_prog_ebpf_jited(prog));
+
+ prog->aux->ksym.start = (unsigned long) prog->bpf_func;
+ prog->aux->ksym.end = prog->aux->ksym.start + prog->jited_len;
+}
+
+static void
+bpf_prog_ksym_set_name(struct bpf_prog *prog)
+{
+ char *sym = prog->aux->ksym.name;
+ const char *end = sym + KSYM_NAME_LEN;
+ const struct btf_type *type;
+ const char *func_name;
+
+ BUILD_BUG_ON(sizeof("bpf_prog_") +
+ sizeof(prog->tag) * 2 +
+ /* name has been null terminated.
+ * We should need +1 for the '_' preceding
+ * the name. However, the null character
+ * is double counted between the name and the
+ * sizeof("bpf_prog_") above, so we omit
+ * the +1 here.
+ */
+ sizeof(prog->aux->name) > KSYM_NAME_LEN);
+
+ sym += snprintf(sym, KSYM_NAME_LEN, "bpf_prog_");
+ sym = bin2hex(sym, prog->tag, sizeof(prog->tag));
+
+ /* prog->aux->name will be ignored if full btf name is available */
+ if (prog->aux->func_info_cnt && prog->aux->func_idx < prog->aux->func_info_cnt) {
+ type = btf_type_by_id(prog->aux->btf,
+ prog->aux->func_info[prog->aux->func_idx].type_id);
+ func_name = btf_name_by_offset(prog->aux->btf, type->name_off);
+ snprintf(sym, (size_t)(end - sym), "_%s", func_name);
+ return;
+ }
+
+ if (prog->aux->name[0])
+ snprintf(sym, (size_t)(end - sym), "_%s", prog->aux->name);
+ else
+ *sym = 0;
+}
+
+static unsigned long bpf_get_ksym_start(struct latch_tree_node *n)
+{
+ return container_of(n, struct bpf_ksym, tnode)->start;
+}
+
+static __always_inline bool bpf_tree_less(struct latch_tree_node *a,
+ struct latch_tree_node *b)
+{
+ return bpf_get_ksym_start(a) < bpf_get_ksym_start(b);
+}
+
+static __always_inline int bpf_tree_comp(void *key, struct latch_tree_node *n)
+{
+ unsigned long val = (unsigned long)key;
+ const struct bpf_ksym *ksym;
+
+ ksym = container_of(n, struct bpf_ksym, tnode);
+
+ if (val < ksym->start)
+ return -1;
+ /* Ensure that we detect return addresses as part of the program, when
+ * the final instruction is a call for a program part of the stack
+ * trace. Therefore, do val > ksym->end instead of val >= ksym->end.
+ */
+ if (val > ksym->end)
+ return 1;
+
+ return 0;
+}
+
+static const struct latch_tree_ops bpf_tree_ops = {
+ .less = bpf_tree_less,
+ .comp = bpf_tree_comp,
+};
+
+static DEFINE_SPINLOCK(bpf_lock);
+static LIST_HEAD(bpf_kallsyms);
+static struct latch_tree_root bpf_tree __cacheline_aligned;
+
+void bpf_ksym_add(struct bpf_ksym *ksym)
+{
+ spin_lock_bh(&bpf_lock);
+ WARN_ON_ONCE(!list_empty(&ksym->lnode));
+ list_add_tail_rcu(&ksym->lnode, &bpf_kallsyms);
+ latch_tree_insert(&ksym->tnode, &bpf_tree, &bpf_tree_ops);
+ spin_unlock_bh(&bpf_lock);
+}
+
+static void __bpf_ksym_del(struct bpf_ksym *ksym)
+{
+ if (list_empty(&ksym->lnode))
+ return;
+
+ latch_tree_erase(&ksym->tnode, &bpf_tree, &bpf_tree_ops);
+ list_del_rcu(&ksym->lnode);
+}
+
+void bpf_ksym_del(struct bpf_ksym *ksym)
+{
+ spin_lock_bh(&bpf_lock);
+ __bpf_ksym_del(ksym);
+ spin_unlock_bh(&bpf_lock);
+}
+
+static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp)
+{
+ return fp->jited && !bpf_prog_was_classic(fp);
+}
+
+void bpf_prog_kallsyms_add(struct bpf_prog *fp)
+{
+ if (!bpf_prog_kallsyms_candidate(fp) ||
+ !bpf_token_capable(fp->aux->token, CAP_BPF))
+ return;
+
+ bpf_prog_ksym_set_addr(fp);
+ bpf_prog_ksym_set_name(fp);
+ fp->aux->ksym.prog = true;
+
+ bpf_ksym_add(&fp->aux->ksym);
+
+#ifdef CONFIG_FINEIBT
+ /*
+ * When FineIBT, code in the __cfi_foo() symbols can get executed
+ * and hence unwinder needs help.
+ */
+ if (cfi_mode != CFI_FINEIBT)
+ return;
+
+ snprintf(fp->aux->ksym_prefix.name, KSYM_NAME_LEN,
+ "__cfi_%s", fp->aux->ksym.name);
+
+ fp->aux->ksym_prefix.start = (unsigned long) fp->bpf_func - 16;
+ fp->aux->ksym_prefix.end = (unsigned long) fp->bpf_func;
+
+ bpf_ksym_add(&fp->aux->ksym_prefix);
+#endif
+}
+
+void bpf_prog_kallsyms_del(struct bpf_prog *fp)
+{
+ if (!bpf_prog_kallsyms_candidate(fp))
+ return;
+
+ bpf_ksym_del(&fp->aux->ksym);
+#ifdef CONFIG_FINEIBT
+ if (cfi_mode != CFI_FINEIBT)
+ return;
+ bpf_ksym_del(&fp->aux->ksym_prefix);
+#endif
+}
+
+static struct bpf_ksym *bpf_ksym_find(unsigned long addr)
+{
+ struct latch_tree_node *n;
+
+ n = latch_tree_find((void *)addr, &bpf_tree, &bpf_tree_ops);
+ return n ? container_of(n, struct bpf_ksym, tnode) : NULL;
+}
+
+int __bpf_address_lookup(unsigned long addr, unsigned long *size,
+ unsigned long *off, char *sym)
+{
+ struct bpf_ksym *ksym;
+ int ret = 0;
+
+ rcu_read_lock();
+ ksym = bpf_ksym_find(addr);
+ if (ksym) {
+ unsigned long symbol_start = ksym->start;
+ unsigned long symbol_end = ksym->end;
+
+ ret = strscpy(sym, ksym->name, KSYM_NAME_LEN);
+
+ if (size)
+ *size = symbol_end - symbol_start;
+ if (off)
+ *off = addr - symbol_start;
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
+bool is_bpf_text_address(unsigned long addr)
+{
+ bool ret;
+
+ rcu_read_lock();
+ ret = bpf_ksym_find(addr) != NULL;
+ rcu_read_unlock();
+
+ return ret;
+}
+
+struct bpf_prog *bpf_prog_ksym_find(unsigned long addr)
+{
+ struct bpf_ksym *ksym;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ ksym = bpf_ksym_find(addr);
+
+ return ksym && ksym->prog ?
+ container_of(ksym, struct bpf_prog_aux, ksym)->prog :
+ NULL;
+}
+
+const struct exception_table_entry *search_bpf_extables(unsigned long addr)
+{
+ const struct exception_table_entry *e = NULL;
+ struct bpf_prog *prog;
+
+ rcu_read_lock();
+ prog = bpf_prog_ksym_find(addr);
+ if (!prog)
+ goto out;
+ if (!prog->aux->num_exentries)
+ goto out;
+
+ e = search_extable(prog->aux->extable, prog->aux->num_exentries, addr);
+out:
+ rcu_read_unlock();
+ return e;
+}
+
+int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
+ char *sym)
+{
+ struct bpf_ksym *ksym;
+ unsigned int it = 0;
+ int ret = -ERANGE;
+
+ if (!bpf_jit_kallsyms_enabled())
+ return ret;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ksym, &bpf_kallsyms, lnode) {
+ if (it++ != symnum)
+ continue;
+
+ strscpy(sym, ksym->name, KSYM_NAME_LEN);
+
+ *value = ksym->start;
+ *type = BPF_SYM_ELF_TYPE;
+
+ ret = 0;
+ break;
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
+int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
+ struct bpf_jit_poke_descriptor *poke)
+{
+ struct bpf_jit_poke_descriptor *tab = prog->aux->poke_tab;
+ static const u32 poke_tab_max = 1024;
+ u32 slot = prog->aux->size_poke_tab;
+ u32 size = slot + 1;
+
+ if (size > poke_tab_max)
+ return -ENOSPC;
+ if (poke->tailcall_target || poke->tailcall_target_stable ||
+ poke->tailcall_bypass || poke->adj_off || poke->bypass_addr)
+ return -EINVAL;
+
+ switch (poke->reason) {
+ case BPF_POKE_REASON_TAIL_CALL:
+ if (!poke->tail_call.map)
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ tab = krealloc_array(tab, size, sizeof(*poke), GFP_KERNEL);
+ if (!tab)
+ return -ENOMEM;
+
+ memcpy(&tab[slot], poke, sizeof(*poke));
+ prog->aux->size_poke_tab = size;
+ prog->aux->poke_tab = tab;
+
+ return slot;
+}
+
+/*
+ * BPF program pack allocator.
+ *
+ * Most BPF programs are pretty small. Allocating a hole page for each
+ * program is sometime a waste. Many small bpf program also adds pressure
+ * to instruction TLB. To solve this issue, we introduce a BPF program pack
+ * allocator. The prog_pack allocator uses HPAGE_PMD_SIZE page (2MB on x86)
+ * to host BPF programs.
+ */
+#define BPF_PROG_CHUNK_SHIFT 6
+#define BPF_PROG_CHUNK_SIZE (1 << BPF_PROG_CHUNK_SHIFT)
+#define BPF_PROG_CHUNK_MASK (~(BPF_PROG_CHUNK_SIZE - 1))
+
+struct bpf_prog_pack {
+ struct list_head list;
+ void *ptr;
+ unsigned long bitmap[];
+};
+
+void bpf_jit_fill_hole_with_zero(void *area, unsigned int size)
+{
+ memset(area, 0, size);
+}
+
+#define BPF_PROG_SIZE_TO_NBITS(size) (round_up(size, BPF_PROG_CHUNK_SIZE) / BPF_PROG_CHUNK_SIZE)
+
+static DEFINE_MUTEX(pack_mutex);
+static LIST_HEAD(pack_list);
+
+/* PMD_SIZE is not available in some special config, e.g. ARCH=arm with
+ * CONFIG_MMU=n. Use PAGE_SIZE in these cases.
+ */
+#ifdef PMD_SIZE
+/* PMD_SIZE is really big for some archs. It doesn't make sense to
+ * reserve too much memory in one allocation. Hardcode BPF_PROG_PACK_SIZE to
+ * 2MiB * num_possible_nodes(). On most architectures PMD_SIZE will be
+ * greater than or equal to 2MB.
+ */
+#define BPF_PROG_PACK_SIZE (SZ_2M * num_possible_nodes())
+#else
+#define BPF_PROG_PACK_SIZE PAGE_SIZE
+#endif
+
+#define BPF_PROG_CHUNK_COUNT (BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE)
+
+static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_insns)
+{
+ struct bpf_prog_pack *pack;
+ int err;
+
+ pack = kzalloc(struct_size(pack, bitmap, BITS_TO_LONGS(BPF_PROG_CHUNK_COUNT)),
+ GFP_KERNEL);
+ if (!pack)
+ return NULL;
+ pack->ptr = bpf_jit_alloc_exec(BPF_PROG_PACK_SIZE);
+ if (!pack->ptr)
+ goto out;
+ bpf_fill_ill_insns(pack->ptr, BPF_PROG_PACK_SIZE);
+ bitmap_zero(pack->bitmap, BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE);
+
+ set_vm_flush_reset_perms(pack->ptr);
+ err = set_memory_rox((unsigned long)pack->ptr,
+ BPF_PROG_PACK_SIZE / PAGE_SIZE);
+ if (err)
+ goto out;
+ list_add_tail(&pack->list, &pack_list);
+ return pack;
+
+out:
+ bpf_jit_free_exec(pack->ptr);
+ kfree(pack);
+ return NULL;
+}
+
+void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns)
+{
+ unsigned int nbits = BPF_PROG_SIZE_TO_NBITS(size);
+ struct bpf_prog_pack *pack;
+ unsigned long pos;
+ void *ptr = NULL;
+
+ mutex_lock(&pack_mutex);
+ if (size > BPF_PROG_PACK_SIZE) {
+ size = round_up(size, PAGE_SIZE);
+ ptr = bpf_jit_alloc_exec(size);
+ if (ptr) {
+ int err;
+
+ bpf_fill_ill_insns(ptr, size);
+ set_vm_flush_reset_perms(ptr);
+ err = set_memory_rox((unsigned long)ptr,
+ size / PAGE_SIZE);
+ if (err) {
+ bpf_jit_free_exec(ptr);
+ ptr = NULL;
+ }
+ }
+ goto out;
+ }
+ list_for_each_entry(pack, &pack_list, list) {
+ pos = bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0,
+ nbits, 0);
+ if (pos < BPF_PROG_CHUNK_COUNT)
+ goto found_free_area;
+ }
+
+ pack = alloc_new_pack(bpf_fill_ill_insns);
+ if (!pack)
+ goto out;
+
+ pos = 0;
+
+found_free_area:
+ bitmap_set(pack->bitmap, pos, nbits);
+ ptr = (void *)(pack->ptr) + (pos << BPF_PROG_CHUNK_SHIFT);
+
+out:
+ mutex_unlock(&pack_mutex);
+ return ptr;
+}
+
+void bpf_prog_pack_free(void *ptr, u32 size)
+{
+ struct bpf_prog_pack *pack = NULL, *tmp;
+ unsigned int nbits;
+ unsigned long pos;
+
+ mutex_lock(&pack_mutex);
+ if (size > BPF_PROG_PACK_SIZE) {
+ bpf_jit_free_exec(ptr);
+ goto out;
+ }
+
+ list_for_each_entry(tmp, &pack_list, list) {
+ if (ptr >= tmp->ptr && (tmp->ptr + BPF_PROG_PACK_SIZE) > ptr) {
+ pack = tmp;
+ break;
+ }
+ }
+
+ if (WARN_ONCE(!pack, "bpf_prog_pack bug\n"))
+ goto out;
+
+ nbits = BPF_PROG_SIZE_TO_NBITS(size);
+ pos = ((unsigned long)ptr - (unsigned long)pack->ptr) >> BPF_PROG_CHUNK_SHIFT;
+
+ WARN_ONCE(bpf_arch_text_invalidate(ptr, size),
+ "bpf_prog_pack bug: missing bpf_arch_text_invalidate?\n");
+
+ bitmap_clear(pack->bitmap, pos, nbits);
+ if (bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0,
+ BPF_PROG_CHUNK_COUNT, 0) == 0) {
+ list_del(&pack->list);
+ bpf_jit_free_exec(pack->ptr);
+ kfree(pack);
+ }
+out:
+ mutex_unlock(&pack_mutex);
+}
+
+static atomic_long_t bpf_jit_current;
+
+/* Can be overridden by an arch's JIT compiler if it has a custom,
+ * dedicated BPF backend memory area, or if neither of the two
+ * below apply.
+ */
+u64 __weak bpf_jit_alloc_exec_limit(void)
+{
+#if defined(MODULES_VADDR)
+ return MODULES_END - MODULES_VADDR;
+#else
+ return VMALLOC_END - VMALLOC_START;
+#endif
+}
+
+static int __init bpf_jit_charge_init(void)
+{
+ /* Only used as heuristic here to derive limit. */
+ bpf_jit_limit_max = bpf_jit_alloc_exec_limit();
+ bpf_jit_limit = min_t(u64, round_up(bpf_jit_limit_max >> 1,
+ PAGE_SIZE), LONG_MAX);
+ return 0;
+}
+pure_initcall(bpf_jit_charge_init);
+
+int bpf_jit_charge_modmem(u32 size)
+{
+ if (atomic_long_add_return(size, &bpf_jit_current) > READ_ONCE(bpf_jit_limit)) {
+ if (!bpf_capable()) {
+ atomic_long_sub(size, &bpf_jit_current);
+ return -EPERM;
+ }
+ }
+
+ return 0;
+}
+
+void bpf_jit_uncharge_modmem(u32 size)
+{
+ atomic_long_sub(size, &bpf_jit_current);
+}
+
+void *__weak bpf_jit_alloc_exec(unsigned long size)
+{
+ return execmem_alloc(EXECMEM_BPF, size);
+}
+
+void __weak bpf_jit_free_exec(void *addr)
+{
+ execmem_free(addr);
+}
+
struct bpf_binary_header *
bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
unsigned int alignment,
bpf_jit_fill_hole_t bpf_fill_ill_insns)
{
struct bpf_binary_header *hdr;
- unsigned int size, hole, start;
+ u32 size, hole, start;
+
+ WARN_ON_ONCE(!is_power_of_2(alignment) ||
+ alignment > BPF_IMAGE_ALIGNMENT);
/* Most of BPF filters are really small, but if some of them
* fill a page, allow at least 128 extra bytes to insert a
* random section of illegal instructions.
*/
size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE);
- hdr = module_alloc(size);
- if (hdr == NULL)
+
+ if (bpf_jit_charge_modmem(size))
+ return NULL;
+ hdr = bpf_jit_alloc_exec(size);
+ if (!hdr) {
+ bpf_jit_uncharge_modmem(size);
return NULL;
+ }
/* Fill space with illegal/arch-dep instructions. */
bpf_fill_ill_insns(hdr, size);
- hdr->pages = size / PAGE_SIZE;
+ hdr->size = size;
hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
PAGE_SIZE - sizeof(*hdr));
- start = (prandom_u32() % hole) & ~(alignment - 1);
+ start = get_random_u32_below(hole) & ~(alignment - 1);
/* Leave a random number of instructions before BPF code. */
*image_ptr = &hdr->image[start];
@@ -163,168 +1093,717 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
void bpf_jit_binary_free(struct bpf_binary_header *hdr)
{
- module_memfree(hdr);
+ u32 size = hdr->size;
+
+ bpf_jit_free_exec(hdr);
+ bpf_jit_uncharge_modmem(size);
+}
+
+/* Allocate jit binary from bpf_prog_pack allocator.
+ * Since the allocated memory is RO+X, the JIT engine cannot write directly
+ * to the memory. To solve this problem, a RW buffer is also allocated at
+ * as the same time. The JIT engine should calculate offsets based on the
+ * RO memory address, but write JITed program to the RW buffer. Once the
+ * JIT engine finishes, it calls bpf_jit_binary_pack_finalize, which copies
+ * the JITed program to the RO memory.
+ */
+struct bpf_binary_header *
+bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr,
+ unsigned int alignment,
+ struct bpf_binary_header **rw_header,
+ u8 **rw_image,
+ bpf_jit_fill_hole_t bpf_fill_ill_insns)
+{
+ struct bpf_binary_header *ro_header;
+ u32 size, hole, start;
+
+ WARN_ON_ONCE(!is_power_of_2(alignment) ||
+ alignment > BPF_IMAGE_ALIGNMENT);
+
+ /* add 16 bytes for a random section of illegal instructions */
+ size = round_up(proglen + sizeof(*ro_header) + 16, BPF_PROG_CHUNK_SIZE);
+
+ if (bpf_jit_charge_modmem(size))
+ return NULL;
+ ro_header = bpf_prog_pack_alloc(size, bpf_fill_ill_insns);
+ if (!ro_header) {
+ bpf_jit_uncharge_modmem(size);
+ return NULL;
+ }
+
+ *rw_header = kvmalloc(size, GFP_KERNEL);
+ if (!*rw_header) {
+ bpf_prog_pack_free(ro_header, size);
+ bpf_jit_uncharge_modmem(size);
+ return NULL;
+ }
+
+ /* Fill space with illegal/arch-dep instructions. */
+ bpf_fill_ill_insns(*rw_header, size);
+ (*rw_header)->size = size;
+
+ hole = min_t(unsigned int, size - (proglen + sizeof(*ro_header)),
+ BPF_PROG_CHUNK_SIZE - sizeof(*ro_header));
+ start = get_random_u32_below(hole) & ~(alignment - 1);
+
+ *image_ptr = &ro_header->image[start];
+ *rw_image = &(*rw_header)->image[start];
+
+ return ro_header;
+}
+
+/* Copy JITed text from rw_header to its final location, the ro_header. */
+int bpf_jit_binary_pack_finalize(struct bpf_binary_header *ro_header,
+ struct bpf_binary_header *rw_header)
+{
+ void *ptr;
+
+ ptr = bpf_arch_text_copy(ro_header, rw_header, rw_header->size);
+
+ kvfree(rw_header);
+
+ if (IS_ERR(ptr)) {
+ bpf_prog_pack_free(ro_header, ro_header->size);
+ return PTR_ERR(ptr);
+ }
+ return 0;
+}
+
+/* bpf_jit_binary_pack_free is called in two different scenarios:
+ * 1) when the program is freed after;
+ * 2) when the JIT engine fails (before bpf_jit_binary_pack_finalize).
+ * For case 2), we need to free both the RO memory and the RW buffer.
+ *
+ * bpf_jit_binary_pack_free requires proper ro_header->size. However,
+ * bpf_jit_binary_pack_alloc does not set it. Therefore, ro_header->size
+ * must be set with either bpf_jit_binary_pack_finalize (normal path) or
+ * bpf_arch_text_copy (when jit fails).
+ */
+void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header,
+ struct bpf_binary_header *rw_header)
+{
+ u32 size = ro_header->size;
+
+ bpf_prog_pack_free(ro_header, size);
+ kvfree(rw_header);
+ bpf_jit_uncharge_modmem(size);
+}
+
+struct bpf_binary_header *
+bpf_jit_binary_pack_hdr(const struct bpf_prog *fp)
+{
+ unsigned long real_start = (unsigned long)fp->bpf_func;
+ unsigned long addr;
+
+ addr = real_start & BPF_PROG_CHUNK_MASK;
+ return (void *)addr;
+}
+
+static inline struct bpf_binary_header *
+bpf_jit_binary_hdr(const struct bpf_prog *fp)
+{
+ unsigned long real_start = (unsigned long)fp->bpf_func;
+ unsigned long addr;
+
+ addr = real_start & PAGE_MASK;
+ return (void *)addr;
+}
+
+/* This symbol is only overridden by archs that have different
+ * requirements than the usual eBPF JITs, f.e. when they only
+ * implement cBPF JIT, do not set images read-only, etc.
+ */
+void __weak bpf_jit_free(struct bpf_prog *fp)
+{
+ if (fp->jited) {
+ struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp);
+
+ bpf_jit_binary_free(hdr);
+ WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp));
+ }
+
+ bpf_prog_unlock_free(fp);
+}
+
+int bpf_jit_get_func_addr(const struct bpf_prog *prog,
+ const struct bpf_insn *insn, bool extra_pass,
+ u64 *func_addr, bool *func_addr_fixed)
+{
+ s16 off = insn->off;
+ s32 imm = insn->imm;
+ u8 *addr;
+ int err;
+
+ *func_addr_fixed = insn->src_reg != BPF_PSEUDO_CALL;
+ if (!*func_addr_fixed) {
+ /* Place-holder address till the last pass has collected
+ * all addresses for JITed subprograms in which case we
+ * can pick them up from prog->aux.
+ */
+ if (!extra_pass)
+ addr = NULL;
+ else if (prog->aux->func &&
+ off >= 0 && off < prog->aux->real_func_cnt)
+ addr = (u8 *)prog->aux->func[off]->bpf_func;
+ else
+ return -EINVAL;
+ } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
+ bpf_jit_supports_far_kfunc_call()) {
+ err = bpf_get_kfunc_addr(prog, insn->imm, insn->off, &addr);
+ if (err)
+ return err;
+ } else {
+ /* Address of a BPF helper call. Since part of the core
+ * kernel, it's always at a fixed location. __bpf_call_base
+ * and the helper with imm relative to it are both in core
+ * kernel.
+ */
+ addr = (u8 *)__bpf_call_base + imm;
+ }
+
+ *func_addr = (unsigned long)addr;
+ return 0;
+}
+
+const char *bpf_jit_get_prog_name(struct bpf_prog *prog)
+{
+ if (prog->aux->ksym.prog)
+ return prog->aux->ksym.name;
+ return prog->aux->name;
+}
+
+static int bpf_jit_blind_insn(const struct bpf_insn *from,
+ const struct bpf_insn *aux,
+ struct bpf_insn *to_buff,
+ bool emit_zext)
+{
+ struct bpf_insn *to = to_buff;
+ u32 imm_rnd = get_random_u32();
+ s16 off;
+
+ BUILD_BUG_ON(BPF_REG_AX + 1 != MAX_BPF_JIT_REG);
+ BUILD_BUG_ON(MAX_BPF_REG + 1 != MAX_BPF_JIT_REG);
+
+ /* Constraints on AX register:
+ *
+ * AX register is inaccessible from user space. It is mapped in
+ * all JITs, and used here for constant blinding rewrites. It is
+ * typically "stateless" meaning its contents are only valid within
+ * the executed instruction, but not across several instructions.
+ * There are a few exceptions however which are further detailed
+ * below.
+ *
+ * Constant blinding is only used by JITs, not in the interpreter.
+ * The interpreter uses AX in some occasions as a local temporary
+ * register e.g. in DIV or MOD instructions.
+ *
+ * In restricted circumstances, the verifier can also use the AX
+ * register for rewrites as long as they do not interfere with
+ * the above cases!
+ */
+ if (from->dst_reg == BPF_REG_AX || from->src_reg == BPF_REG_AX)
+ goto out;
+
+ if (from->imm == 0 &&
+ (from->code == (BPF_ALU | BPF_MOV | BPF_K) ||
+ from->code == (BPF_ALU64 | BPF_MOV | BPF_K))) {
+ *to++ = BPF_ALU64_REG(BPF_XOR, from->dst_reg, from->dst_reg);
+ goto out;
+ }
+
+ switch (from->code) {
+ case BPF_ALU | BPF_ADD | BPF_K:
+ case BPF_ALU | BPF_SUB | BPF_K:
+ case BPF_ALU | BPF_AND | BPF_K:
+ case BPF_ALU | BPF_OR | BPF_K:
+ case BPF_ALU | BPF_XOR | BPF_K:
+ case BPF_ALU | BPF_MUL | BPF_K:
+ case BPF_ALU | BPF_MOV | BPF_K:
+ case BPF_ALU | BPF_DIV | BPF_K:
+ case BPF_ALU | BPF_MOD | BPF_K:
+ *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
+ *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+ *to++ = BPF_ALU32_REG_OFF(from->code, from->dst_reg, BPF_REG_AX, from->off);
+ break;
+
+ case BPF_ALU64 | BPF_ADD | BPF_K:
+ case BPF_ALU64 | BPF_SUB | BPF_K:
+ case BPF_ALU64 | BPF_AND | BPF_K:
+ case BPF_ALU64 | BPF_OR | BPF_K:
+ case BPF_ALU64 | BPF_XOR | BPF_K:
+ case BPF_ALU64 | BPF_MUL | BPF_K:
+ case BPF_ALU64 | BPF_MOV | BPF_K:
+ case BPF_ALU64 | BPF_DIV | BPF_K:
+ case BPF_ALU64 | BPF_MOD | BPF_K:
+ *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
+ *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+ *to++ = BPF_ALU64_REG_OFF(from->code, from->dst_reg, BPF_REG_AX, from->off);
+ break;
+
+ case BPF_JMP | BPF_JEQ | BPF_K:
+ case BPF_JMP | BPF_JNE | BPF_K:
+ case BPF_JMP | BPF_JGT | BPF_K:
+ case BPF_JMP | BPF_JLT | BPF_K:
+ case BPF_JMP | BPF_JGE | BPF_K:
+ case BPF_JMP | BPF_JLE | BPF_K:
+ case BPF_JMP | BPF_JSGT | BPF_K:
+ case BPF_JMP | BPF_JSLT | BPF_K:
+ case BPF_JMP | BPF_JSGE | BPF_K:
+ case BPF_JMP | BPF_JSLE | BPF_K:
+ case BPF_JMP | BPF_JSET | BPF_K:
+ /* Accommodate for extra offset in case of a backjump. */
+ off = from->off;
+ if (off < 0)
+ off -= 2;
+ *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
+ *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+ *to++ = BPF_JMP_REG(from->code, from->dst_reg, BPF_REG_AX, off);
+ break;
+
+ case BPF_JMP32 | BPF_JEQ | BPF_K:
+ case BPF_JMP32 | BPF_JNE | BPF_K:
+ case BPF_JMP32 | BPF_JGT | BPF_K:
+ case BPF_JMP32 | BPF_JLT | BPF_K:
+ case BPF_JMP32 | BPF_JGE | BPF_K:
+ case BPF_JMP32 | BPF_JLE | BPF_K:
+ case BPF_JMP32 | BPF_JSGT | BPF_K:
+ case BPF_JMP32 | BPF_JSLT | BPF_K:
+ case BPF_JMP32 | BPF_JSGE | BPF_K:
+ case BPF_JMP32 | BPF_JSLE | BPF_K:
+ case BPF_JMP32 | BPF_JSET | BPF_K:
+ /* Accommodate for extra offset in case of a backjump. */
+ off = from->off;
+ if (off < 0)
+ off -= 2;
+ *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
+ *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+ *to++ = BPF_JMP32_REG(from->code, from->dst_reg, BPF_REG_AX,
+ off);
+ break;
+
+ case BPF_LD | BPF_IMM | BPF_DW:
+ *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[1].imm);
+ *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+ *to++ = BPF_ALU64_IMM(BPF_LSH, BPF_REG_AX, 32);
+ *to++ = BPF_ALU64_REG(BPF_MOV, aux[0].dst_reg, BPF_REG_AX);
+ break;
+ case 0: /* Part 2 of BPF_LD | BPF_IMM | BPF_DW. */
+ *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[0].imm);
+ *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+ if (emit_zext)
+ *to++ = BPF_ZEXT_REG(BPF_REG_AX);
+ *to++ = BPF_ALU64_REG(BPF_OR, aux[0].dst_reg, BPF_REG_AX);
+ break;
+
+ case BPF_ST | BPF_MEM | BPF_DW:
+ case BPF_ST | BPF_MEM | BPF_W:
+ case BPF_ST | BPF_MEM | BPF_H:
+ case BPF_ST | BPF_MEM | BPF_B:
+ *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
+ *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+ *to++ = BPF_STX_MEM(from->code, from->dst_reg, BPF_REG_AX, from->off);
+ break;
+ }
+out:
+ return to - to_buff;
+}
+
+static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other,
+ gfp_t gfp_extra_flags)
+{
+ gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags;
+ struct bpf_prog *fp;
+
+ fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags);
+ if (fp != NULL) {
+ /* aux->prog still points to the fp_other one, so
+ * when promoting the clone to the real program,
+ * this still needs to be adapted.
+ */
+ memcpy(fp, fp_other, fp_other->pages * PAGE_SIZE);
+ }
+
+ return fp;
+}
+
+static void bpf_prog_clone_free(struct bpf_prog *fp)
+{
+ /* aux was stolen by the other clone, so we cannot free
+ * it from this path! It will be freed eventually by the
+ * other program on release.
+ *
+ * At this point, we don't need a deferred release since
+ * clone is guaranteed to not be locked.
+ */
+ fp->aux = NULL;
+ fp->stats = NULL;
+ fp->active = NULL;
+ __bpf_prog_free(fp);
+}
+
+void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other)
+{
+ /* We have to repoint aux->prog to self, as we don't
+ * know whether fp here is the clone or the original.
+ */
+ fp->aux->prog = fp;
+ bpf_prog_clone_free(fp_other);
+}
+
+static void adjust_insn_arrays(struct bpf_prog *prog, u32 off, u32 len)
+{
+#ifdef CONFIG_BPF_SYSCALL
+ struct bpf_map *map;
+ int i;
+
+ if (len <= 1)
+ return;
+
+ for (i = 0; i < prog->aux->used_map_cnt; i++) {
+ map = prog->aux->used_maps[i];
+ if (map->map_type == BPF_MAP_TYPE_INSN_ARRAY)
+ bpf_insn_array_adjust(map, off, len);
+ }
+#endif
+}
+
+struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
+{
+ struct bpf_insn insn_buff[16], aux[2];
+ struct bpf_prog *clone, *tmp;
+ int insn_delta, insn_cnt;
+ struct bpf_insn *insn;
+ int i, rewritten;
+
+ if (!prog->blinding_requested || prog->blinded)
+ return prog;
+
+ clone = bpf_prog_clone_create(prog, GFP_USER);
+ if (!clone)
+ return ERR_PTR(-ENOMEM);
+
+ insn_cnt = clone->len;
+ insn = clone->insnsi;
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ if (bpf_pseudo_func(insn)) {
+ /* ld_imm64 with an address of bpf subprog is not
+ * a user controlled constant. Don't randomize it,
+ * since it will conflict with jit_subprogs() logic.
+ */
+ insn++;
+ i++;
+ continue;
+ }
+
+ /* We temporarily need to hold the original ld64 insn
+ * so that we can still access the first part in the
+ * second blinding run.
+ */
+ if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW) &&
+ insn[1].code == 0)
+ memcpy(aux, insn, sizeof(aux));
+
+ rewritten = bpf_jit_blind_insn(insn, aux, insn_buff,
+ clone->aux->verifier_zext);
+ if (!rewritten)
+ continue;
+
+ tmp = bpf_patch_insn_single(clone, i, insn_buff, rewritten);
+ if (IS_ERR(tmp)) {
+ /* Patching may have repointed aux->prog during
+ * realloc from the original one, so we need to
+ * fix it up here on error.
+ */
+ bpf_jit_prog_release_other(prog, clone);
+ return tmp;
+ }
+
+ clone = tmp;
+ insn_delta = rewritten - 1;
+
+ /* Instructions arrays must be updated using absolute xlated offsets */
+ adjust_insn_arrays(clone, prog->aux->subprog_start + i, rewritten);
+
+ /* Walk new program and skip insns we just inserted. */
+ insn = clone->insnsi + i + insn_delta;
+ insn_cnt += insn_delta;
+ i += insn_delta;
+ }
+
+ clone->blinded = 1;
+ return clone;
}
#endif /* CONFIG_BPF_JIT */
/* Base function for offset calculation. Needs to go into .text section,
* therefore keeping it non-static as well; will also be used by JITs
- * anyway later on, so do not let the compiler omit it.
+ * anyway later on, so do not let the compiler omit it. This also needs
+ * to go into kallsyms for correlation from e.g. bpftool, so naming
+ * must not change.
*/
noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
{
return 0;
}
+EXPORT_SYMBOL_GPL(__bpf_call_base);
+
+/* All UAPI available opcodes. */
+#define BPF_INSN_MAP(INSN_2, INSN_3) \
+ /* 32 bit ALU operations. */ \
+ /* Register based. */ \
+ INSN_3(ALU, ADD, X), \
+ INSN_3(ALU, SUB, X), \
+ INSN_3(ALU, AND, X), \
+ INSN_3(ALU, OR, X), \
+ INSN_3(ALU, LSH, X), \
+ INSN_3(ALU, RSH, X), \
+ INSN_3(ALU, XOR, X), \
+ INSN_3(ALU, MUL, X), \
+ INSN_3(ALU, MOV, X), \
+ INSN_3(ALU, ARSH, X), \
+ INSN_3(ALU, DIV, X), \
+ INSN_3(ALU, MOD, X), \
+ INSN_2(ALU, NEG), \
+ INSN_3(ALU, END, TO_BE), \
+ INSN_3(ALU, END, TO_LE), \
+ /* Immediate based. */ \
+ INSN_3(ALU, ADD, K), \
+ INSN_3(ALU, SUB, K), \
+ INSN_3(ALU, AND, K), \
+ INSN_3(ALU, OR, K), \
+ INSN_3(ALU, LSH, K), \
+ INSN_3(ALU, RSH, K), \
+ INSN_3(ALU, XOR, K), \
+ INSN_3(ALU, MUL, K), \
+ INSN_3(ALU, MOV, K), \
+ INSN_3(ALU, ARSH, K), \
+ INSN_3(ALU, DIV, K), \
+ INSN_3(ALU, MOD, K), \
+ /* 64 bit ALU operations. */ \
+ /* Register based. */ \
+ INSN_3(ALU64, ADD, X), \
+ INSN_3(ALU64, SUB, X), \
+ INSN_3(ALU64, AND, X), \
+ INSN_3(ALU64, OR, X), \
+ INSN_3(ALU64, LSH, X), \
+ INSN_3(ALU64, RSH, X), \
+ INSN_3(ALU64, XOR, X), \
+ INSN_3(ALU64, MUL, X), \
+ INSN_3(ALU64, MOV, X), \
+ INSN_3(ALU64, ARSH, X), \
+ INSN_3(ALU64, DIV, X), \
+ INSN_3(ALU64, MOD, X), \
+ INSN_2(ALU64, NEG), \
+ INSN_3(ALU64, END, TO_LE), \
+ /* Immediate based. */ \
+ INSN_3(ALU64, ADD, K), \
+ INSN_3(ALU64, SUB, K), \
+ INSN_3(ALU64, AND, K), \
+ INSN_3(ALU64, OR, K), \
+ INSN_3(ALU64, LSH, K), \
+ INSN_3(ALU64, RSH, K), \
+ INSN_3(ALU64, XOR, K), \
+ INSN_3(ALU64, MUL, K), \
+ INSN_3(ALU64, MOV, K), \
+ INSN_3(ALU64, ARSH, K), \
+ INSN_3(ALU64, DIV, K), \
+ INSN_3(ALU64, MOD, K), \
+ /* Call instruction. */ \
+ INSN_2(JMP, CALL), \
+ /* Exit instruction. */ \
+ INSN_2(JMP, EXIT), \
+ /* 32-bit Jump instructions. */ \
+ /* Register based. */ \
+ INSN_3(JMP32, JEQ, X), \
+ INSN_3(JMP32, JNE, X), \
+ INSN_3(JMP32, JGT, X), \
+ INSN_3(JMP32, JLT, X), \
+ INSN_3(JMP32, JGE, X), \
+ INSN_3(JMP32, JLE, X), \
+ INSN_3(JMP32, JSGT, X), \
+ INSN_3(JMP32, JSLT, X), \
+ INSN_3(JMP32, JSGE, X), \
+ INSN_3(JMP32, JSLE, X), \
+ INSN_3(JMP32, JSET, X), \
+ /* Immediate based. */ \
+ INSN_3(JMP32, JEQ, K), \
+ INSN_3(JMP32, JNE, K), \
+ INSN_3(JMP32, JGT, K), \
+ INSN_3(JMP32, JLT, K), \
+ INSN_3(JMP32, JGE, K), \
+ INSN_3(JMP32, JLE, K), \
+ INSN_3(JMP32, JSGT, K), \
+ INSN_3(JMP32, JSLT, K), \
+ INSN_3(JMP32, JSGE, K), \
+ INSN_3(JMP32, JSLE, K), \
+ INSN_3(JMP32, JSET, K), \
+ /* Jump instructions. */ \
+ /* Register based. */ \
+ INSN_3(JMP, JEQ, X), \
+ INSN_3(JMP, JNE, X), \
+ INSN_3(JMP, JGT, X), \
+ INSN_3(JMP, JLT, X), \
+ INSN_3(JMP, JGE, X), \
+ INSN_3(JMP, JLE, X), \
+ INSN_3(JMP, JSGT, X), \
+ INSN_3(JMP, JSLT, X), \
+ INSN_3(JMP, JSGE, X), \
+ INSN_3(JMP, JSLE, X), \
+ INSN_3(JMP, JSET, X), \
+ /* Immediate based. */ \
+ INSN_3(JMP, JEQ, K), \
+ INSN_3(JMP, JNE, K), \
+ INSN_3(JMP, JGT, K), \
+ INSN_3(JMP, JLT, K), \
+ INSN_3(JMP, JGE, K), \
+ INSN_3(JMP, JLE, K), \
+ INSN_3(JMP, JSGT, K), \
+ INSN_3(JMP, JSLT, K), \
+ INSN_3(JMP, JSGE, K), \
+ INSN_3(JMP, JSLE, K), \
+ INSN_3(JMP, JSET, K), \
+ INSN_2(JMP, JA), \
+ INSN_2(JMP32, JA), \
+ /* Atomic operations. */ \
+ INSN_3(STX, ATOMIC, B), \
+ INSN_3(STX, ATOMIC, H), \
+ INSN_3(STX, ATOMIC, W), \
+ INSN_3(STX, ATOMIC, DW), \
+ /* Store instructions. */ \
+ /* Register based. */ \
+ INSN_3(STX, MEM, B), \
+ INSN_3(STX, MEM, H), \
+ INSN_3(STX, MEM, W), \
+ INSN_3(STX, MEM, DW), \
+ /* Immediate based. */ \
+ INSN_3(ST, MEM, B), \
+ INSN_3(ST, MEM, H), \
+ INSN_3(ST, MEM, W), \
+ INSN_3(ST, MEM, DW), \
+ /* Load instructions. */ \
+ /* Register based. */ \
+ INSN_3(LDX, MEM, B), \
+ INSN_3(LDX, MEM, H), \
+ INSN_3(LDX, MEM, W), \
+ INSN_3(LDX, MEM, DW), \
+ INSN_3(LDX, MEMSX, B), \
+ INSN_3(LDX, MEMSX, H), \
+ INSN_3(LDX, MEMSX, W), \
+ /* Immediate based. */ \
+ INSN_3(LD, IMM, DW)
+
+bool bpf_opcode_in_insntable(u8 code)
+{
+#define BPF_INSN_2_TBL(x, y) [BPF_##x | BPF_##y] = true
+#define BPF_INSN_3_TBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = true
+ static const bool public_insntable[256] = {
+ [0 ... 255] = false,
+ /* Now overwrite non-defaults ... */
+ BPF_INSN_MAP(BPF_INSN_2_TBL, BPF_INSN_3_TBL),
+ /* UAPI exposed, but rewritten opcodes. cBPF carry-over. */
+ [BPF_LD | BPF_ABS | BPF_B] = true,
+ [BPF_LD | BPF_ABS | BPF_H] = true,
+ [BPF_LD | BPF_ABS | BPF_W] = true,
+ [BPF_LD | BPF_IND | BPF_B] = true,
+ [BPF_LD | BPF_IND | BPF_H] = true,
+ [BPF_LD | BPF_IND | BPF_W] = true,
+ [BPF_JMP | BPF_JA | BPF_X] = true,
+ [BPF_JMP | BPF_JCOND] = true,
+ };
+#undef BPF_INSN_3_TBL
+#undef BPF_INSN_2_TBL
+ return public_insntable[code];
+}
+#ifndef CONFIG_BPF_JIT_ALWAYS_ON
/**
- * __bpf_prog_run - run eBPF program on a given context
- * @ctx: is the data we are operating on
+ * ___bpf_prog_run - run eBPF program on a given context
+ * @regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers
* @insn: is the array of eBPF instructions
*
* Decode and execute eBPF instructions.
+ *
+ * Return: whatever value is in %BPF_R0 at program exit
*/
-static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
+static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn)
{
- u64 stack[MAX_BPF_STACK / sizeof(u64)];
- u64 regs[MAX_BPF_REG], tmp;
- static const void *jumptable[256] = {
+#define BPF_INSN_2_LBL(x, y) [BPF_##x | BPF_##y] = &&x##_##y
+#define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z
+ static const void * const jumptable[256] __annotate_jump_table = {
[0 ... 255] = &&default_label,
/* Now overwrite non-defaults ... */
- /* 32 bit ALU operations */
- [BPF_ALU | BPF_ADD | BPF_X] = &&ALU_ADD_X,
- [BPF_ALU | BPF_ADD | BPF_K] = &&ALU_ADD_K,
- [BPF_ALU | BPF_SUB | BPF_X] = &&ALU_SUB_X,
- [BPF_ALU | BPF_SUB | BPF_K] = &&ALU_SUB_K,
- [BPF_ALU | BPF_AND | BPF_X] = &&ALU_AND_X,
- [BPF_ALU | BPF_AND | BPF_K] = &&ALU_AND_K,
- [BPF_ALU | BPF_OR | BPF_X] = &&ALU_OR_X,
- [BPF_ALU | BPF_OR | BPF_K] = &&ALU_OR_K,
- [BPF_ALU | BPF_LSH | BPF_X] = &&ALU_LSH_X,
- [BPF_ALU | BPF_LSH | BPF_K] = &&ALU_LSH_K,
- [BPF_ALU | BPF_RSH | BPF_X] = &&ALU_RSH_X,
- [BPF_ALU | BPF_RSH | BPF_K] = &&ALU_RSH_K,
- [BPF_ALU | BPF_XOR | BPF_X] = &&ALU_XOR_X,
- [BPF_ALU | BPF_XOR | BPF_K] = &&ALU_XOR_K,
- [BPF_ALU | BPF_MUL | BPF_X] = &&ALU_MUL_X,
- [BPF_ALU | BPF_MUL | BPF_K] = &&ALU_MUL_K,
- [BPF_ALU | BPF_MOV | BPF_X] = &&ALU_MOV_X,
- [BPF_ALU | BPF_MOV | BPF_K] = &&ALU_MOV_K,
- [BPF_ALU | BPF_DIV | BPF_X] = &&ALU_DIV_X,
- [BPF_ALU | BPF_DIV | BPF_K] = &&ALU_DIV_K,
- [BPF_ALU | BPF_MOD | BPF_X] = &&ALU_MOD_X,
- [BPF_ALU | BPF_MOD | BPF_K] = &&ALU_MOD_K,
- [BPF_ALU | BPF_NEG] = &&ALU_NEG,
- [BPF_ALU | BPF_END | BPF_TO_BE] = &&ALU_END_TO_BE,
- [BPF_ALU | BPF_END | BPF_TO_LE] = &&ALU_END_TO_LE,
- /* 64 bit ALU operations */
- [BPF_ALU64 | BPF_ADD | BPF_X] = &&ALU64_ADD_X,
- [BPF_ALU64 | BPF_ADD | BPF_K] = &&ALU64_ADD_K,
- [BPF_ALU64 | BPF_SUB | BPF_X] = &&ALU64_SUB_X,
- [BPF_ALU64 | BPF_SUB | BPF_K] = &&ALU64_SUB_K,
- [BPF_ALU64 | BPF_AND | BPF_X] = &&ALU64_AND_X,
- [BPF_ALU64 | BPF_AND | BPF_K] = &&ALU64_AND_K,
- [BPF_ALU64 | BPF_OR | BPF_X] = &&ALU64_OR_X,
- [BPF_ALU64 | BPF_OR | BPF_K] = &&ALU64_OR_K,
- [BPF_ALU64 | BPF_LSH | BPF_X] = &&ALU64_LSH_X,
- [BPF_ALU64 | BPF_LSH | BPF_K] = &&ALU64_LSH_K,
- [BPF_ALU64 | BPF_RSH | BPF_X] = &&ALU64_RSH_X,
- [BPF_ALU64 | BPF_RSH | BPF_K] = &&ALU64_RSH_K,
- [BPF_ALU64 | BPF_XOR | BPF_X] = &&ALU64_XOR_X,
- [BPF_ALU64 | BPF_XOR | BPF_K] = &&ALU64_XOR_K,
- [BPF_ALU64 | BPF_MUL | BPF_X] = &&ALU64_MUL_X,
- [BPF_ALU64 | BPF_MUL | BPF_K] = &&ALU64_MUL_K,
- [BPF_ALU64 | BPF_MOV | BPF_X] = &&ALU64_MOV_X,
- [BPF_ALU64 | BPF_MOV | BPF_K] = &&ALU64_MOV_K,
- [BPF_ALU64 | BPF_ARSH | BPF_X] = &&ALU64_ARSH_X,
- [BPF_ALU64 | BPF_ARSH | BPF_K] = &&ALU64_ARSH_K,
- [BPF_ALU64 | BPF_DIV | BPF_X] = &&ALU64_DIV_X,
- [BPF_ALU64 | BPF_DIV | BPF_K] = &&ALU64_DIV_K,
- [BPF_ALU64 | BPF_MOD | BPF_X] = &&ALU64_MOD_X,
- [BPF_ALU64 | BPF_MOD | BPF_K] = &&ALU64_MOD_K,
- [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG,
- /* Call instruction */
- [BPF_JMP | BPF_CALL] = &&JMP_CALL,
- /* Jumps */
- [BPF_JMP | BPF_JA] = &&JMP_JA,
- [BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X,
- [BPF_JMP | BPF_JEQ | BPF_K] = &&JMP_JEQ_K,
- [BPF_JMP | BPF_JNE | BPF_X] = &&JMP_JNE_X,
- [BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K,
- [BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X,
- [BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K,
- [BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X,
- [BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K,
- [BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X,
- [BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K,
- [BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X,
- [BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K,
- [BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X,
- [BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K,
- /* Program return */
- [BPF_JMP | BPF_EXIT] = &&JMP_EXIT,
- /* Store instructions */
- [BPF_STX | BPF_MEM | BPF_B] = &&STX_MEM_B,
- [BPF_STX | BPF_MEM | BPF_H] = &&STX_MEM_H,
- [BPF_STX | BPF_MEM | BPF_W] = &&STX_MEM_W,
- [BPF_STX | BPF_MEM | BPF_DW] = &&STX_MEM_DW,
- [BPF_STX | BPF_XADD | BPF_W] = &&STX_XADD_W,
- [BPF_STX | BPF_XADD | BPF_DW] = &&STX_XADD_DW,
- [BPF_ST | BPF_MEM | BPF_B] = &&ST_MEM_B,
- [BPF_ST | BPF_MEM | BPF_H] = &&ST_MEM_H,
- [BPF_ST | BPF_MEM | BPF_W] = &&ST_MEM_W,
- [BPF_ST | BPF_MEM | BPF_DW] = &&ST_MEM_DW,
- /* Load instructions */
- [BPF_LDX | BPF_MEM | BPF_B] = &&LDX_MEM_B,
- [BPF_LDX | BPF_MEM | BPF_H] = &&LDX_MEM_H,
- [BPF_LDX | BPF_MEM | BPF_W] = &&LDX_MEM_W,
- [BPF_LDX | BPF_MEM | BPF_DW] = &&LDX_MEM_DW,
- [BPF_LD | BPF_ABS | BPF_W] = &&LD_ABS_W,
- [BPF_LD | BPF_ABS | BPF_H] = &&LD_ABS_H,
- [BPF_LD | BPF_ABS | BPF_B] = &&LD_ABS_B,
- [BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W,
- [BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H,
- [BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B,
- [BPF_LD | BPF_IMM | BPF_DW] = &&LD_IMM_DW,
+ BPF_INSN_MAP(BPF_INSN_2_LBL, BPF_INSN_3_LBL),
+ /* Non-UAPI available opcodes. */
+ [BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS,
+ [BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL,
+ [BPF_ST | BPF_NOSPEC] = &&ST_NOSPEC,
+ [BPF_LDX | BPF_PROBE_MEM | BPF_B] = &&LDX_PROBE_MEM_B,
+ [BPF_LDX | BPF_PROBE_MEM | BPF_H] = &&LDX_PROBE_MEM_H,
+ [BPF_LDX | BPF_PROBE_MEM | BPF_W] = &&LDX_PROBE_MEM_W,
+ [BPF_LDX | BPF_PROBE_MEM | BPF_DW] = &&LDX_PROBE_MEM_DW,
+ [BPF_LDX | BPF_PROBE_MEMSX | BPF_B] = &&LDX_PROBE_MEMSX_B,
+ [BPF_LDX | BPF_PROBE_MEMSX | BPF_H] = &&LDX_PROBE_MEMSX_H,
+ [BPF_LDX | BPF_PROBE_MEMSX | BPF_W] = &&LDX_PROBE_MEMSX_W,
};
- void *ptr;
- int off;
+#undef BPF_INSN_3_LBL
+#undef BPF_INSN_2_LBL
+ u32 tail_call_cnt = 0;
#define CONT ({ insn++; goto select_insn; })
#define CONT_JMP ({ insn++; goto select_insn; })
- FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)];
- ARG1 = (u64) (unsigned long) ctx;
-
- /* Registers used in classic BPF programs need to be reset first. */
- regs[BPF_REG_A] = 0;
- regs[BPF_REG_X] = 0;
-
select_insn:
goto *jumptable[insn->code];
- /* ALU */
-#define ALU(OPCODE, OP) \
- ALU64_##OPCODE##_X: \
- DST = DST OP SRC; \
- CONT; \
- ALU_##OPCODE##_X: \
- DST = (u32) DST OP (u32) SRC; \
- CONT; \
- ALU64_##OPCODE##_K: \
- DST = DST OP IMM; \
- CONT; \
- ALU_##OPCODE##_K: \
- DST = (u32) DST OP (u32) IMM; \
+ /* Explicitly mask the register-based shift amounts with 63 or 31
+ * to avoid undefined behavior. Normally this won't affect the
+ * generated code, for example, in case of native 64 bit archs such
+ * as x86-64 or arm64, the compiler is optimizing the AND away for
+ * the interpreter. In case of JITs, each of the JIT backends compiles
+ * the BPF shift operations to machine instructions which produce
+ * implementation-defined results in such a case; the resulting
+ * contents of the register may be arbitrary, but program behaviour
+ * as a whole remains defined. In other words, in case of JIT backends,
+ * the AND must /not/ be added to the emitted LSH/RSH/ARSH translation.
+ */
+ /* ALU (shifts) */
+#define SHT(OPCODE, OP) \
+ ALU64_##OPCODE##_X: \
+ DST = DST OP (SRC & 63); \
+ CONT; \
+ ALU_##OPCODE##_X: \
+ DST = (u32) DST OP ((u32) SRC & 31); \
+ CONT; \
+ ALU64_##OPCODE##_K: \
+ DST = DST OP IMM; \
+ CONT; \
+ ALU_##OPCODE##_K: \
+ DST = (u32) DST OP (u32) IMM; \
+ CONT;
+ /* ALU (rest) */
+#define ALU(OPCODE, OP) \
+ ALU64_##OPCODE##_X: \
+ DST = DST OP SRC; \
+ CONT; \
+ ALU_##OPCODE##_X: \
+ DST = (u32) DST OP (u32) SRC; \
+ CONT; \
+ ALU64_##OPCODE##_K: \
+ DST = DST OP IMM; \
+ CONT; \
+ ALU_##OPCODE##_K: \
+ DST = (u32) DST OP (u32) IMM; \
CONT;
-
ALU(ADD, +)
ALU(SUB, -)
ALU(AND, &)
ALU(OR, |)
- ALU(LSH, <<)
- ALU(RSH, >>)
ALU(XOR, ^)
ALU(MUL, *)
+ SHT(LSH, <<)
+ SHT(RSH, >>)
+#undef SHT
#undef ALU
ALU_NEG:
DST = (u32) -DST;
@@ -333,13 +1812,36 @@ select_insn:
DST = -DST;
CONT;
ALU_MOV_X:
- DST = (u32) SRC;
+ switch (OFF) {
+ case 0:
+ DST = (u32) SRC;
+ break;
+ case 8:
+ DST = (u32)(s8) SRC;
+ break;
+ case 16:
+ DST = (u32)(s16) SRC;
+ break;
+ }
CONT;
ALU_MOV_K:
DST = (u32) IMM;
CONT;
ALU64_MOV_X:
- DST = SRC;
+ switch (OFF) {
+ case 0:
+ DST = SRC;
+ break;
+ case 8:
+ DST = (s8) SRC;
+ break;
+ case 16:
+ DST = (s16) SRC;
+ break;
+ case 32:
+ DST = (s32) SRC;
+ break;
+ }
CONT;
ALU64_MOV_K:
DST = IMM;
@@ -348,51 +1850,127 @@ select_insn:
DST = (u64) (u32) insn[0].imm | ((u64) (u32) insn[1].imm) << 32;
insn++;
CONT;
+ ALU_ARSH_X:
+ DST = (u64) (u32) (((s32) DST) >> (SRC & 31));
+ CONT;
+ ALU_ARSH_K:
+ DST = (u64) (u32) (((s32) DST) >> IMM);
+ CONT;
ALU64_ARSH_X:
- (*(s64 *) &DST) >>= SRC;
+ (*(s64 *) &DST) >>= (SRC & 63);
CONT;
ALU64_ARSH_K:
(*(s64 *) &DST) >>= IMM;
CONT;
ALU64_MOD_X:
- if (unlikely(SRC == 0))
- return 0;
- div64_u64_rem(DST, SRC, &tmp);
- DST = tmp;
+ switch (OFF) {
+ case 0:
+ div64_u64_rem(DST, SRC, &AX);
+ DST = AX;
+ break;
+ case 1:
+ AX = div64_s64(DST, SRC);
+ DST = DST - AX * SRC;
+ break;
+ }
CONT;
ALU_MOD_X:
- if (unlikely(SRC == 0))
- return 0;
- tmp = (u32) DST;
- DST = do_div(tmp, (u32) SRC);
+ switch (OFF) {
+ case 0:
+ AX = (u32) DST;
+ DST = do_div(AX, (u32) SRC);
+ break;
+ case 1:
+ AX = abs((s32)DST);
+ AX = do_div(AX, abs((s32)SRC));
+ if ((s32)DST < 0)
+ DST = (u32)-AX;
+ else
+ DST = (u32)AX;
+ break;
+ }
CONT;
ALU64_MOD_K:
- div64_u64_rem(DST, IMM, &tmp);
- DST = tmp;
+ switch (OFF) {
+ case 0:
+ div64_u64_rem(DST, IMM, &AX);
+ DST = AX;
+ break;
+ case 1:
+ AX = div64_s64(DST, IMM);
+ DST = DST - AX * IMM;
+ break;
+ }
CONT;
ALU_MOD_K:
- tmp = (u32) DST;
- DST = do_div(tmp, (u32) IMM);
+ switch (OFF) {
+ case 0:
+ AX = (u32) DST;
+ DST = do_div(AX, (u32) IMM);
+ break;
+ case 1:
+ AX = abs((s32)DST);
+ AX = do_div(AX, abs((s32)IMM));
+ if ((s32)DST < 0)
+ DST = (u32)-AX;
+ else
+ DST = (u32)AX;
+ break;
+ }
CONT;
ALU64_DIV_X:
- if (unlikely(SRC == 0))
- return 0;
- DST = div64_u64(DST, SRC);
+ switch (OFF) {
+ case 0:
+ DST = div64_u64(DST, SRC);
+ break;
+ case 1:
+ DST = div64_s64(DST, SRC);
+ break;
+ }
CONT;
ALU_DIV_X:
- if (unlikely(SRC == 0))
- return 0;
- tmp = (u32) DST;
- do_div(tmp, (u32) SRC);
- DST = (u32) tmp;
+ switch (OFF) {
+ case 0:
+ AX = (u32) DST;
+ do_div(AX, (u32) SRC);
+ DST = (u32) AX;
+ break;
+ case 1:
+ AX = abs((s32)DST);
+ do_div(AX, abs((s32)SRC));
+ if (((s32)DST < 0) == ((s32)SRC < 0))
+ DST = (u32)AX;
+ else
+ DST = (u32)-AX;
+ break;
+ }
CONT;
ALU64_DIV_K:
- DST = div64_u64(DST, IMM);
+ switch (OFF) {
+ case 0:
+ DST = div64_u64(DST, IMM);
+ break;
+ case 1:
+ DST = div64_s64(DST, IMM);
+ break;
+ }
CONT;
ALU_DIV_K:
- tmp = (u32) DST;
- do_div(tmp, (u32) IMM);
- DST = (u32) tmp;
+ switch (OFF) {
+ case 0:
+ AX = (u32) DST;
+ do_div(AX, (u32) IMM);
+ DST = (u32) AX;
+ break;
+ case 1:
+ AX = abs((s32)DST);
+ do_div(AX, abs((s32)IMM));
+ if (((s32)DST < 0) == ((s32)IMM < 0))
+ DST = (u32)AX;
+ else
+ DST = (u32)-AX;
+ break;
+ }
CONT;
ALU_END_TO_BE:
switch (IMM) {
@@ -420,6 +1998,19 @@ select_insn:
break;
}
CONT;
+ ALU64_END_TO_LE:
+ switch (IMM) {
+ case 16:
+ DST = (__force u16) __swab16(DST);
+ break;
+ case 32:
+ DST = (__force u32) __swab32(DST);
+ break;
+ case 64:
+ DST = (__force u64) __swab64(DST);
+ break;
+ }
+ CONT;
/* CALL */
JMP_CALL:
@@ -431,98 +2022,101 @@ select_insn:
BPF_R4, BPF_R5);
CONT;
- /* JMP */
- JMP_JA:
- insn += insn->off;
- CONT;
- JMP_JEQ_X:
- if (DST == SRC) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JEQ_K:
- if (DST == IMM) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JNE_X:
- if (DST != SRC) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JNE_K:
- if (DST != IMM) {
- insn += insn->off;
- CONT_JMP;
- }
+ JMP_CALL_ARGS:
+ BPF_R0 = (__bpf_call_base_args + insn->imm)(BPF_R1, BPF_R2,
+ BPF_R3, BPF_R4,
+ BPF_R5,
+ insn + insn->off + 1);
CONT;
- JMP_JGT_X:
- if (DST > SRC) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JGT_K:
- if (DST > IMM) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JGE_X:
- if (DST >= SRC) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JGE_K:
- if (DST >= IMM) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JSGT_X:
- if (((s64) DST) > ((s64) SRC)) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JSGT_K:
- if (((s64) DST) > ((s64) IMM)) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JSGE_X:
- if (((s64) DST) >= ((s64) SRC)) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JSGE_K:
- if (((s64) DST) >= ((s64) IMM)) {
- insn += insn->off;
- CONT_JMP;
- }
+
+ JMP_TAIL_CALL: {
+ struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2;
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ struct bpf_prog *prog;
+ u32 index = BPF_R3;
+
+ if (unlikely(index >= array->map.max_entries))
+ goto out;
+
+ if (unlikely(tail_call_cnt >= MAX_TAIL_CALL_CNT))
+ goto out;
+
+ tail_call_cnt++;
+
+ prog = READ_ONCE(array->ptrs[index]);
+ if (!prog)
+ goto out;
+
+ /* ARG1 at this point is guaranteed to point to CTX from
+ * the verifier side due to the fact that the tail call is
+ * handled like a helper, that is, bpf_tail_call_proto,
+ * where arg1_type is ARG_PTR_TO_CTX.
+ */
+ insn = prog->insnsi;
+ goto select_insn;
+out:
CONT;
- JMP_JSET_X:
- if (DST & SRC) {
- insn += insn->off;
- CONT_JMP;
- }
+ }
+ JMP_JA:
+ insn += insn->off;
CONT;
- JMP_JSET_K:
- if (DST & IMM) {
- insn += insn->off;
- CONT_JMP;
- }
+ JMP32_JA:
+ insn += insn->imm;
CONT;
JMP_EXIT:
return BPF_R0;
-
- /* STX and ST and LDX*/
+ /* JMP */
+#define COND_JMP(SIGN, OPCODE, CMP_OP) \
+ JMP_##OPCODE##_X: \
+ if ((SIGN##64) DST CMP_OP (SIGN##64) SRC) { \
+ insn += insn->off; \
+ CONT_JMP; \
+ } \
+ CONT; \
+ JMP32_##OPCODE##_X: \
+ if ((SIGN##32) DST CMP_OP (SIGN##32) SRC) { \
+ insn += insn->off; \
+ CONT_JMP; \
+ } \
+ CONT; \
+ JMP_##OPCODE##_K: \
+ if ((SIGN##64) DST CMP_OP (SIGN##64) IMM) { \
+ insn += insn->off; \
+ CONT_JMP; \
+ } \
+ CONT; \
+ JMP32_##OPCODE##_K: \
+ if ((SIGN##32) DST CMP_OP (SIGN##32) IMM) { \
+ insn += insn->off; \
+ CONT_JMP; \
+ } \
+ CONT;
+ COND_JMP(u, JEQ, ==)
+ COND_JMP(u, JNE, !=)
+ COND_JMP(u, JGT, >)
+ COND_JMP(u, JLT, <)
+ COND_JMP(u, JGE, >=)
+ COND_JMP(u, JLE, <=)
+ COND_JMP(u, JSET, &)
+ COND_JMP(s, JSGT, >)
+ COND_JMP(s, JSLT, <)
+ COND_JMP(s, JSGE, >=)
+ COND_JMP(s, JSLE, <=)
+#undef COND_JMP
+ /* ST, STX and LDX*/
+ ST_NOSPEC:
+ /* Speculation barrier for mitigating Speculative Store Bypass,
+ * Bounds-Check Bypass and Type Confusion. In case of arm64, we
+ * rely on the firmware mitigation as controlled via the ssbd
+ * kernel parameter. Whenever the mitigation is enabled, it
+ * works for all of the kernel code with no need to provide any
+ * additional instructions here. In case of x86, we use 'lfence'
+ * insn for mitigation. We reuse preexisting logic from Spectre
+ * v1 mitigation that happens to produce the required code on
+ * x86 for v4 as well.
+ */
+ barrier_nospec();
+ CONT;
#define LDST(SIZEOP, SIZE) \
STX_MEM_##SIZEOP: \
*(SIZE *)(unsigned long) (DST + insn->off) = SRC; \
@@ -532,6 +2126,11 @@ select_insn:
CONT; \
LDX_MEM_##SIZEOP: \
DST = *(SIZE *)(unsigned long) (SRC + insn->off); \
+ CONT; \
+ LDX_PROBE_MEM_##SIZEOP: \
+ bpf_probe_read_kernel_common(&DST, sizeof(SIZE), \
+ (const void *)(long) (SRC + insn->off)); \
+ DST = *((SIZE *)&DST); \
CONT;
LDST(B, u8)
@@ -539,130 +2138,1008 @@ select_insn:
LDST(W, u32)
LDST(DW, u64)
#undef LDST
- STX_XADD_W: /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */
- atomic_add((u32) SRC, (atomic_t *)(unsigned long)
- (DST + insn->off));
- CONT;
- STX_XADD_DW: /* lock xadd *(u64 *)(dst_reg + off16) += src_reg */
- atomic64_add((u64) SRC, (atomic64_t *)(unsigned long)
- (DST + insn->off));
- CONT;
- LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + imm32)) */
- off = IMM;
-load_word:
- /* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are
- * only appearing in the programs where ctx ==
- * skb. All programs keep 'ctx' in regs[BPF_REG_CTX]
- * == BPF_R6, bpf_convert_filter() saves it in BPF_R6,
- * internal BPF verifier will check that BPF_R6 ==
- * ctx.
- *
- * BPF_ABS and BPF_IND are wrappers of function calls,
- * so they scratch BPF_R1-BPF_R5 registers, preserve
- * BPF_R6-BPF_R9, and store return value into BPF_R0.
- *
- * Implicit input:
- * ctx == skb == BPF_R6 == CTX
- *
- * Explicit input:
- * SRC == any register
- * IMM == 32-bit immediate
- *
- * Output:
- * BPF_R0 - 8/16/32-bit skb data converted to cpu endianness
+
+#define LDSX(SIZEOP, SIZE) \
+ LDX_MEMSX_##SIZEOP: \
+ DST = *(SIZE *)(unsigned long) (SRC + insn->off); \
+ CONT; \
+ LDX_PROBE_MEMSX_##SIZEOP: \
+ bpf_probe_read_kernel_common(&DST, sizeof(SIZE), \
+ (const void *)(long) (SRC + insn->off)); \
+ DST = *((SIZE *)&DST); \
+ CONT;
+
+ LDSX(B, s8)
+ LDSX(H, s16)
+ LDSX(W, s32)
+#undef LDSX
+
+#define ATOMIC_ALU_OP(BOP, KOP) \
+ case BOP: \
+ if (BPF_SIZE(insn->code) == BPF_W) \
+ atomic_##KOP((u32) SRC, (atomic_t *)(unsigned long) \
+ (DST + insn->off)); \
+ else if (BPF_SIZE(insn->code) == BPF_DW) \
+ atomic64_##KOP((u64) SRC, (atomic64_t *)(unsigned long) \
+ (DST + insn->off)); \
+ else \
+ goto default_label; \
+ break; \
+ case BOP | BPF_FETCH: \
+ if (BPF_SIZE(insn->code) == BPF_W) \
+ SRC = (u32) atomic_fetch_##KOP( \
+ (u32) SRC, \
+ (atomic_t *)(unsigned long) (DST + insn->off)); \
+ else if (BPF_SIZE(insn->code) == BPF_DW) \
+ SRC = (u64) atomic64_fetch_##KOP( \
+ (u64) SRC, \
+ (atomic64_t *)(unsigned long) (DST + insn->off)); \
+ else \
+ goto default_label; \
+ break;
+
+ STX_ATOMIC_DW:
+ STX_ATOMIC_W:
+ STX_ATOMIC_H:
+ STX_ATOMIC_B:
+ switch (IMM) {
+ /* Atomic read-modify-write instructions support only W and DW
+ * size modifiers.
*/
+ ATOMIC_ALU_OP(BPF_ADD, add)
+ ATOMIC_ALU_OP(BPF_AND, and)
+ ATOMIC_ALU_OP(BPF_OR, or)
+ ATOMIC_ALU_OP(BPF_XOR, xor)
+#undef ATOMIC_ALU_OP
- ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 4, &tmp);
- if (likely(ptr != NULL)) {
- BPF_R0 = get_unaligned_be32(ptr);
- CONT;
- }
+ case BPF_XCHG:
+ if (BPF_SIZE(insn->code) == BPF_W)
+ SRC = (u32) atomic_xchg(
+ (atomic_t *)(unsigned long) (DST + insn->off),
+ (u32) SRC);
+ else if (BPF_SIZE(insn->code) == BPF_DW)
+ SRC = (u64) atomic64_xchg(
+ (atomic64_t *)(unsigned long) (DST + insn->off),
+ (u64) SRC);
+ else
+ goto default_label;
+ break;
+ case BPF_CMPXCHG:
+ if (BPF_SIZE(insn->code) == BPF_W)
+ BPF_R0 = (u32) atomic_cmpxchg(
+ (atomic_t *)(unsigned long) (DST + insn->off),
+ (u32) BPF_R0, (u32) SRC);
+ else if (BPF_SIZE(insn->code) == BPF_DW)
+ BPF_R0 = (u64) atomic64_cmpxchg(
+ (atomic64_t *)(unsigned long) (DST + insn->off),
+ (u64) BPF_R0, (u64) SRC);
+ else
+ goto default_label;
+ break;
+ /* Atomic load and store instructions support all size
+ * modifiers.
+ */
+ case BPF_LOAD_ACQ:
+ switch (BPF_SIZE(insn->code)) {
+#define LOAD_ACQUIRE(SIZEOP, SIZE) \
+ case BPF_##SIZEOP: \
+ DST = (SIZE)smp_load_acquire( \
+ (SIZE *)(unsigned long)(SRC + insn->off)); \
+ break;
+ LOAD_ACQUIRE(B, u8)
+ LOAD_ACQUIRE(H, u16)
+ LOAD_ACQUIRE(W, u32)
+#ifdef CONFIG_64BIT
+ LOAD_ACQUIRE(DW, u64)
+#endif
+#undef LOAD_ACQUIRE
+ default:
+ goto default_label;
+ }
+ break;
+ case BPF_STORE_REL:
+ switch (BPF_SIZE(insn->code)) {
+#define STORE_RELEASE(SIZEOP, SIZE) \
+ case BPF_##SIZEOP: \
+ smp_store_release( \
+ (SIZE *)(unsigned long)(DST + insn->off), (SIZE)SRC); \
+ break;
+ STORE_RELEASE(B, u8)
+ STORE_RELEASE(H, u16)
+ STORE_RELEASE(W, u32)
+#ifdef CONFIG_64BIT
+ STORE_RELEASE(DW, u64)
+#endif
+#undef STORE_RELEASE
+ default:
+ goto default_label;
+ }
+ break;
- return 0;
- LD_ABS_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + imm32)) */
- off = IMM;
-load_half:
- ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 2, &tmp);
- if (likely(ptr != NULL)) {
- BPF_R0 = get_unaligned_be16(ptr);
- CONT;
+ default:
+ goto default_label;
}
+ CONT;
+ default_label:
+ /* If we ever reach this, we have a bug somewhere. Die hard here
+ * instead of just returning 0; we could be somewhere in a subprog,
+ * so execution could continue otherwise which we do /not/ want.
+ *
+ * Note, verifier whitelists all opcodes in bpf_opcode_in_insntable().
+ */
+ pr_warn("BPF interpreter: unknown opcode %02x (imm: 0x%x)\n",
+ insn->code, insn->imm);
+ BUG_ON(1);
return 0;
- LD_ABS_B: /* BPF_R0 = *(u8 *) (skb->data + imm32) */
- off = IMM;
-load_byte:
- ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 1, &tmp);
- if (likely(ptr != NULL)) {
- BPF_R0 = *(u8 *)ptr;
- CONT;
+}
+
+#define PROG_NAME(stack_size) __bpf_prog_run##stack_size
+#define DEFINE_BPF_PROG_RUN(stack_size) \
+static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn *insn) \
+{ \
+ u64 stack[stack_size / sizeof(u64)]; \
+ u64 regs[MAX_BPF_EXT_REG] = {}; \
+\
+ kmsan_unpoison_memory(stack, sizeof(stack)); \
+ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \
+ ARG1 = (u64) (unsigned long) ctx; \
+ return ___bpf_prog_run(regs, insn); \
+}
+
+#define PROG_NAME_ARGS(stack_size) __bpf_prog_run_args##stack_size
+#define DEFINE_BPF_PROG_RUN_ARGS(stack_size) \
+static u64 PROG_NAME_ARGS(stack_size)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, \
+ const struct bpf_insn *insn) \
+{ \
+ u64 stack[stack_size / sizeof(u64)]; \
+ u64 regs[MAX_BPF_EXT_REG]; \
+\
+ kmsan_unpoison_memory(stack, sizeof(stack)); \
+ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \
+ BPF_R1 = r1; \
+ BPF_R2 = r2; \
+ BPF_R3 = r3; \
+ BPF_R4 = r4; \
+ BPF_R5 = r5; \
+ return ___bpf_prog_run(regs, insn); \
+}
+
+#define EVAL1(FN, X) FN(X)
+#define EVAL2(FN, X, Y...) FN(X) EVAL1(FN, Y)
+#define EVAL3(FN, X, Y...) FN(X) EVAL2(FN, Y)
+#define EVAL4(FN, X, Y...) FN(X) EVAL3(FN, Y)
+#define EVAL5(FN, X, Y...) FN(X) EVAL4(FN, Y)
+#define EVAL6(FN, X, Y...) FN(X) EVAL5(FN, Y)
+
+EVAL6(DEFINE_BPF_PROG_RUN, 32, 64, 96, 128, 160, 192);
+EVAL6(DEFINE_BPF_PROG_RUN, 224, 256, 288, 320, 352, 384);
+EVAL4(DEFINE_BPF_PROG_RUN, 416, 448, 480, 512);
+
+EVAL6(DEFINE_BPF_PROG_RUN_ARGS, 32, 64, 96, 128, 160, 192);
+EVAL6(DEFINE_BPF_PROG_RUN_ARGS, 224, 256, 288, 320, 352, 384);
+EVAL4(DEFINE_BPF_PROG_RUN_ARGS, 416, 448, 480, 512);
+
+#define PROG_NAME_LIST(stack_size) PROG_NAME(stack_size),
+
+static unsigned int (*interpreters[])(const void *ctx,
+ const struct bpf_insn *insn) = {
+EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192)
+EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384)
+EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
+};
+#undef PROG_NAME_LIST
+#define PROG_NAME_LIST(stack_size) PROG_NAME_ARGS(stack_size),
+static __maybe_unused
+u64 (*interpreters_args[])(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5,
+ const struct bpf_insn *insn) = {
+EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192)
+EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384)
+EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
+};
+#undef PROG_NAME_LIST
+
+#ifdef CONFIG_BPF_SYSCALL
+void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth)
+{
+ stack_depth = max_t(u32, stack_depth, 1);
+ insn->off = (s16) insn->imm;
+ insn->imm = interpreters_args[(round_up(stack_depth, 32) / 32) - 1] -
+ __bpf_call_base_args;
+ insn->code = BPF_JMP | BPF_CALL_ARGS;
+}
+#endif
+#endif
+
+static unsigned int __bpf_prog_ret0_warn(const void *ctx,
+ const struct bpf_insn *insn)
+{
+ /* If this handler ever gets executed, then BPF_JIT_ALWAYS_ON
+ * is not working properly, so warn about it!
+ */
+ WARN_ON_ONCE(1);
+ return 0;
+}
+
+static bool __bpf_prog_map_compatible(struct bpf_map *map,
+ const struct bpf_prog *fp)
+{
+ enum bpf_prog_type prog_type = resolve_prog_type(fp);
+ struct bpf_prog_aux *aux = fp->aux;
+ enum bpf_cgroup_storage_type i;
+ bool ret = false;
+ u64 cookie;
+
+ if (fp->kprobe_override)
+ return ret;
+
+ spin_lock(&map->owner_lock);
+ /* There's no owner yet where we could check for compatibility. */
+ if (!map->owner) {
+ map->owner = bpf_map_owner_alloc(map);
+ if (!map->owner)
+ goto err;
+ map->owner->type = prog_type;
+ map->owner->jited = fp->jited;
+ map->owner->xdp_has_frags = aux->xdp_has_frags;
+ map->owner->expected_attach_type = fp->expected_attach_type;
+ map->owner->attach_func_proto = aux->attach_func_proto;
+ for_each_cgroup_storage_type(i) {
+ map->owner->storage_cookie[i] =
+ aux->cgroup_storage[i] ?
+ aux->cgroup_storage[i]->cookie : 0;
+ }
+ ret = true;
+ } else {
+ ret = map->owner->type == prog_type &&
+ map->owner->jited == fp->jited &&
+ map->owner->xdp_has_frags == aux->xdp_has_frags;
+ if (ret &&
+ map->map_type == BPF_MAP_TYPE_PROG_ARRAY &&
+ map->owner->expected_attach_type != fp->expected_attach_type)
+ ret = false;
+ for_each_cgroup_storage_type(i) {
+ if (!ret)
+ break;
+ cookie = aux->cgroup_storage[i] ?
+ aux->cgroup_storage[i]->cookie : 0;
+ ret = map->owner->storage_cookie[i] == cookie ||
+ !cookie;
}
+ if (ret &&
+ map->owner->attach_func_proto != aux->attach_func_proto) {
+ switch (prog_type) {
+ case BPF_PROG_TYPE_TRACING:
+ case BPF_PROG_TYPE_LSM:
+ case BPF_PROG_TYPE_EXT:
+ case BPF_PROG_TYPE_STRUCT_OPS:
+ ret = false;
+ break;
+ default:
+ break;
+ }
+ }
+ }
+err:
+ spin_unlock(&map->owner_lock);
+ return ret;
+}
- return 0;
- LD_IND_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + src_reg + imm32)) */
- off = IMM + SRC;
- goto load_word;
- LD_IND_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + src_reg + imm32)) */
- off = IMM + SRC;
- goto load_half;
- LD_IND_B: /* BPF_R0 = *(u8 *) (skb->data + src_reg + imm32) */
- off = IMM + SRC;
- goto load_byte;
+bool bpf_prog_map_compatible(struct bpf_map *map, const struct bpf_prog *fp)
+{
+ /* XDP programs inserted into maps are not guaranteed to run on
+ * a particular netdev (and can run outside driver context entirely
+ * in the case of devmap and cpumap). Until device checks
+ * are implemented, prohibit adding dev-bound programs to program maps.
+ */
+ if (bpf_prog_is_dev_bound(fp->aux))
+ return false;
- default_label:
- /* If we ever reach this, we have a bug somewhere. */
- WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code);
- return 0;
+ return __bpf_prog_map_compatible(map, fp);
+}
+
+static int bpf_check_tail_call(const struct bpf_prog *fp)
+{
+ struct bpf_prog_aux *aux = fp->aux;
+ int i, ret = 0;
+
+ mutex_lock(&aux->used_maps_mutex);
+ for (i = 0; i < aux->used_map_cnt; i++) {
+ struct bpf_map *map = aux->used_maps[i];
+
+ if (!map_type_contains_progs(map))
+ continue;
+
+ if (!__bpf_prog_map_compatible(map, fp)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
+out:
+ mutex_unlock(&aux->used_maps_mutex);
+ return ret;
}
-void __weak bpf_int_jit_compile(struct bpf_prog *prog)
+static bool bpf_prog_select_interpreter(struct bpf_prog *fp)
{
+ bool select_interpreter = false;
+#ifndef CONFIG_BPF_JIT_ALWAYS_ON
+ u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1);
+ u32 idx = (round_up(stack_depth, 32) / 32) - 1;
+
+ /* may_goto may cause stack size > 512, leading to idx out-of-bounds.
+ * But for non-JITed programs, we don't need bpf_func, so no bounds
+ * check needed.
+ */
+ if (idx < ARRAY_SIZE(interpreters)) {
+ fp->bpf_func = interpreters[idx];
+ select_interpreter = true;
+ } else {
+ fp->bpf_func = __bpf_prog_ret0_warn;
+ }
+#else
+ fp->bpf_func = __bpf_prog_ret0_warn;
+#endif
+ return select_interpreter;
}
/**
- * bpf_prog_select_runtime - select execution runtime for BPF program
- * @fp: bpf_prog populated with internal BPF program
+ * bpf_prog_select_runtime - select exec runtime for BPF program
+ * @fp: bpf_prog populated with BPF program
+ * @err: pointer to error variable
*
- * try to JIT internal BPF program, if JIT is not available select interpreter
- * BPF program will be executed via BPF_PROG_RUN() macro
+ * Try to JIT eBPF program, if JIT is not available, use interpreter.
+ * The BPF program will be executed via bpf_prog_run() function.
+ *
+ * Return: the &fp argument along with &err set to 0 for success or
+ * a negative errno code on failure
*/
-void bpf_prog_select_runtime(struct bpf_prog *fp)
+struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
{
- fp->bpf_func = (void *) __bpf_prog_run;
+ /* In case of BPF to BPF calls, verifier did all the prep
+ * work with regards to JITing, etc.
+ */
+ bool jit_needed = false;
+
+ if (fp->bpf_func)
+ goto finalize;
+
+ if (IS_ENABLED(CONFIG_BPF_JIT_ALWAYS_ON) ||
+ bpf_prog_has_kfunc_call(fp))
+ jit_needed = true;
+
+ if (!bpf_prog_select_interpreter(fp))
+ jit_needed = true;
+
+ /* eBPF JITs can rewrite the program in case constant
+ * blinding is active. However, in case of error during
+ * blinding, bpf_int_jit_compile() must always return a
+ * valid program, which in this case would simply not
+ * be JITed, but falls back to the interpreter.
+ */
+ if (!bpf_prog_is_offloaded(fp->aux)) {
+ *err = bpf_prog_alloc_jited_linfo(fp);
+ if (*err)
+ return fp;
+
+ fp = bpf_int_jit_compile(fp);
+ bpf_prog_jit_attempt_done(fp);
+ if (!fp->jited && jit_needed) {
+ *err = -ENOTSUPP;
+ return fp;
+ }
+ } else {
+ *err = bpf_prog_offload_compile(fp);
+ if (*err)
+ return fp;
+ }
- /* Probe if internal BPF can be JITed */
- bpf_int_jit_compile(fp);
- /* Lock whole bpf_prog as read-only */
- bpf_prog_lock_ro(fp);
+finalize:
+ *err = bpf_prog_lock_ro(fp);
+ if (*err)
+ return fp;
+
+ /* The tail call compatibility check can only be done at
+ * this late stage as we need to determine, if we deal
+ * with JITed or non JITed program concatenations and not
+ * all eBPF JITs might immediately support all features.
+ */
+ *err = bpf_check_tail_call(fp);
+
+ return fp;
}
EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
+static unsigned int __bpf_prog_ret1(const void *ctx,
+ const struct bpf_insn *insn)
+{
+ return 1;
+}
+
+static struct bpf_prog_dummy {
+ struct bpf_prog prog;
+} dummy_bpf_prog = {
+ .prog = {
+ .bpf_func = __bpf_prog_ret1,
+ },
+};
+
+struct bpf_empty_prog_array bpf_empty_prog_array = {
+ .null_prog = NULL,
+};
+EXPORT_SYMBOL(bpf_empty_prog_array);
+
+struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
+{
+ struct bpf_prog_array *p;
+
+ if (prog_cnt)
+ p = kzalloc(struct_size(p, items, prog_cnt + 1), flags);
+ else
+ p = &bpf_empty_prog_array.hdr;
+
+ return p;
+}
+
+void bpf_prog_array_free(struct bpf_prog_array *progs)
+{
+ if (!progs || progs == &bpf_empty_prog_array.hdr)
+ return;
+ kfree_rcu(progs, rcu);
+}
+
+static void __bpf_prog_array_free_sleepable_cb(struct rcu_head *rcu)
+{
+ struct bpf_prog_array *progs;
+
+ /* If RCU Tasks Trace grace period implies RCU grace period, there is
+ * no need to call kfree_rcu(), just call kfree() directly.
+ */
+ progs = container_of(rcu, struct bpf_prog_array, rcu);
+ if (rcu_trace_implies_rcu_gp())
+ kfree(progs);
+ else
+ kfree_rcu(progs, rcu);
+}
+
+void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs)
+{
+ if (!progs || progs == &bpf_empty_prog_array.hdr)
+ return;
+ call_rcu_tasks_trace(&progs->rcu, __bpf_prog_array_free_sleepable_cb);
+}
+
+int bpf_prog_array_length(struct bpf_prog_array *array)
+{
+ struct bpf_prog_array_item *item;
+ u32 cnt = 0;
+
+ for (item = array->items; item->prog; item++)
+ if (item->prog != &dummy_bpf_prog.prog)
+ cnt++;
+ return cnt;
+}
+
+bool bpf_prog_array_is_empty(struct bpf_prog_array *array)
+{
+ struct bpf_prog_array_item *item;
+
+ for (item = array->items; item->prog; item++)
+ if (item->prog != &dummy_bpf_prog.prog)
+ return false;
+ return true;
+}
+
+static bool bpf_prog_array_copy_core(struct bpf_prog_array *array,
+ u32 *prog_ids,
+ u32 request_cnt)
+{
+ struct bpf_prog_array_item *item;
+ int i = 0;
+
+ for (item = array->items; item->prog; item++) {
+ if (item->prog == &dummy_bpf_prog.prog)
+ continue;
+ prog_ids[i] = item->prog->aux->id;
+ if (++i == request_cnt) {
+ item++;
+ break;
+ }
+ }
+
+ return !!(item->prog);
+}
+
+int bpf_prog_array_copy_to_user(struct bpf_prog_array *array,
+ __u32 __user *prog_ids, u32 cnt)
+{
+ unsigned long err = 0;
+ bool nospc;
+ u32 *ids;
+
+ /* users of this function are doing:
+ * cnt = bpf_prog_array_length();
+ * if (cnt > 0)
+ * bpf_prog_array_copy_to_user(..., cnt);
+ * so below kcalloc doesn't need extra cnt > 0 check.
+ */
+ ids = kcalloc(cnt, sizeof(u32), GFP_USER | __GFP_NOWARN);
+ if (!ids)
+ return -ENOMEM;
+ nospc = bpf_prog_array_copy_core(array, ids, cnt);
+ err = copy_to_user(prog_ids, ids, cnt * sizeof(u32));
+ kfree(ids);
+ if (err)
+ return -EFAULT;
+ if (nospc)
+ return -ENOSPC;
+ return 0;
+}
+
+void bpf_prog_array_delete_safe(struct bpf_prog_array *array,
+ struct bpf_prog *old_prog)
+{
+ struct bpf_prog_array_item *item;
+
+ for (item = array->items; item->prog; item++)
+ if (item->prog == old_prog) {
+ WRITE_ONCE(item->prog, &dummy_bpf_prog.prog);
+ break;
+ }
+}
+
+/**
+ * bpf_prog_array_delete_safe_at() - Replaces the program at the given
+ * index into the program array with
+ * a dummy no-op program.
+ * @array: a bpf_prog_array
+ * @index: the index of the program to replace
+ *
+ * Skips over dummy programs, by not counting them, when calculating
+ * the position of the program to replace.
+ *
+ * Return:
+ * * 0 - Success
+ * * -EINVAL - Invalid index value. Must be a non-negative integer.
+ * * -ENOENT - Index out of range
+ */
+int bpf_prog_array_delete_safe_at(struct bpf_prog_array *array, int index)
+{
+ return bpf_prog_array_update_at(array, index, &dummy_bpf_prog.prog);
+}
+
+/**
+ * bpf_prog_array_update_at() - Updates the program at the given index
+ * into the program array.
+ * @array: a bpf_prog_array
+ * @index: the index of the program to update
+ * @prog: the program to insert into the array
+ *
+ * Skips over dummy programs, by not counting them, when calculating
+ * the position of the program to update.
+ *
+ * Return:
+ * * 0 - Success
+ * * -EINVAL - Invalid index value. Must be a non-negative integer.
+ * * -ENOENT - Index out of range
+ */
+int bpf_prog_array_update_at(struct bpf_prog_array *array, int index,
+ struct bpf_prog *prog)
+{
+ struct bpf_prog_array_item *item;
+
+ if (unlikely(index < 0))
+ return -EINVAL;
+
+ for (item = array->items; item->prog; item++) {
+ if (item->prog == &dummy_bpf_prog.prog)
+ continue;
+ if (!index) {
+ WRITE_ONCE(item->prog, prog);
+ return 0;
+ }
+ index--;
+ }
+ return -ENOENT;
+}
+
+int bpf_prog_array_copy(struct bpf_prog_array *old_array,
+ struct bpf_prog *exclude_prog,
+ struct bpf_prog *include_prog,
+ u64 bpf_cookie,
+ struct bpf_prog_array **new_array)
+{
+ int new_prog_cnt, carry_prog_cnt = 0;
+ struct bpf_prog_array_item *existing, *new;
+ struct bpf_prog_array *array;
+ bool found_exclude = false;
+
+ /* Figure out how many existing progs we need to carry over to
+ * the new array.
+ */
+ if (old_array) {
+ existing = old_array->items;
+ for (; existing->prog; existing++) {
+ if (existing->prog == exclude_prog) {
+ found_exclude = true;
+ continue;
+ }
+ if (existing->prog != &dummy_bpf_prog.prog)
+ carry_prog_cnt++;
+ if (existing->prog == include_prog)
+ return -EEXIST;
+ }
+ }
+
+ if (exclude_prog && !found_exclude)
+ return -ENOENT;
+
+ /* How many progs (not NULL) will be in the new array? */
+ new_prog_cnt = carry_prog_cnt;
+ if (include_prog)
+ new_prog_cnt += 1;
+
+ /* Do we have any prog (not NULL) in the new array? */
+ if (!new_prog_cnt) {
+ *new_array = NULL;
+ return 0;
+ }
+
+ /* +1 as the end of prog_array is marked with NULL */
+ array = bpf_prog_array_alloc(new_prog_cnt + 1, GFP_KERNEL);
+ if (!array)
+ return -ENOMEM;
+ new = array->items;
+
+ /* Fill in the new prog array */
+ if (carry_prog_cnt) {
+ existing = old_array->items;
+ for (; existing->prog; existing++) {
+ if (existing->prog == exclude_prog ||
+ existing->prog == &dummy_bpf_prog.prog)
+ continue;
+
+ new->prog = existing->prog;
+ new->bpf_cookie = existing->bpf_cookie;
+ new++;
+ }
+ }
+ if (include_prog) {
+ new->prog = include_prog;
+ new->bpf_cookie = bpf_cookie;
+ new++;
+ }
+ new->prog = NULL;
+ *new_array = array;
+ return 0;
+}
+
+int bpf_prog_array_copy_info(struct bpf_prog_array *array,
+ u32 *prog_ids, u32 request_cnt,
+ u32 *prog_cnt)
+{
+ u32 cnt = 0;
+
+ if (array)
+ cnt = bpf_prog_array_length(array);
+
+ *prog_cnt = cnt;
+
+ /* return early if user requested only program count or nothing to copy */
+ if (!request_cnt || !cnt)
+ return 0;
+
+ /* this function is called under trace/bpf_trace.c: bpf_event_mutex */
+ return bpf_prog_array_copy_core(array, prog_ids, request_cnt) ? -ENOSPC
+ : 0;
+}
+
+void __bpf_free_used_maps(struct bpf_prog_aux *aux,
+ struct bpf_map **used_maps, u32 len)
+{
+ struct bpf_map *map;
+ bool sleepable;
+ u32 i;
+
+ sleepable = aux->prog->sleepable;
+ for (i = 0; i < len; i++) {
+ map = used_maps[i];
+ if (map->ops->map_poke_untrack)
+ map->ops->map_poke_untrack(map, aux);
+ if (sleepable)
+ atomic64_dec(&map->sleepable_refcnt);
+ bpf_map_put(map);
+ }
+}
+
+static void bpf_free_used_maps(struct bpf_prog_aux *aux)
+{
+ __bpf_free_used_maps(aux, aux->used_maps, aux->used_map_cnt);
+ kfree(aux->used_maps);
+}
+
+void __bpf_free_used_btfs(struct btf_mod_pair *used_btfs, u32 len)
+{
+#ifdef CONFIG_BPF_SYSCALL
+ struct btf_mod_pair *btf_mod;
+ u32 i;
+
+ for (i = 0; i < len; i++) {
+ btf_mod = &used_btfs[i];
+ if (btf_mod->module)
+ module_put(btf_mod->module);
+ btf_put(btf_mod->btf);
+ }
+#endif
+}
+
+static void bpf_free_used_btfs(struct bpf_prog_aux *aux)
+{
+ __bpf_free_used_btfs(aux->used_btfs, aux->used_btf_cnt);
+ kfree(aux->used_btfs);
+}
+
static void bpf_prog_free_deferred(struct work_struct *work)
{
struct bpf_prog_aux *aux;
+ int i;
aux = container_of(work, struct bpf_prog_aux, work);
- bpf_jit_free(aux->prog);
+#ifdef CONFIG_BPF_SYSCALL
+ bpf_free_kfunc_btf_tab(aux->kfunc_btf_tab);
+ bpf_prog_stream_free(aux->prog);
+#endif
+#ifdef CONFIG_CGROUP_BPF
+ if (aux->cgroup_atype != CGROUP_BPF_ATTACH_TYPE_INVALID)
+ bpf_cgroup_atype_put(aux->cgroup_atype);
+#endif
+ bpf_free_used_maps(aux);
+ bpf_free_used_btfs(aux);
+ if (bpf_prog_is_dev_bound(aux))
+ bpf_prog_dev_bound_destroy(aux->prog);
+#ifdef CONFIG_PERF_EVENTS
+ if (aux->prog->has_callchain_buf)
+ put_callchain_buffers();
+#endif
+ if (aux->dst_trampoline)
+ bpf_trampoline_put(aux->dst_trampoline);
+ for (i = 0; i < aux->real_func_cnt; i++) {
+ /* We can just unlink the subprog poke descriptor table as
+ * it was originally linked to the main program and is also
+ * released along with it.
+ */
+ aux->func[i]->aux->poke_tab = NULL;
+ bpf_jit_free(aux->func[i]);
+ }
+ if (aux->real_func_cnt) {
+ kfree(aux->func);
+ bpf_prog_unlock_free(aux->prog);
+ } else {
+ bpf_jit_free(aux->prog);
+ }
}
-/* Free internal BPF program */
void bpf_prog_free(struct bpf_prog *fp)
{
struct bpf_prog_aux *aux = fp->aux;
+ if (aux->dst_prog)
+ bpf_prog_put(aux->dst_prog);
+ bpf_token_put(aux->token);
INIT_WORK(&aux->work, bpf_prog_free_deferred);
- aux->prog = fp;
schedule_work(&aux->work);
}
EXPORT_SYMBOL_GPL(bpf_prog_free);
+/* RNG for unprivileged user space with separated state from prandom_u32(). */
+static DEFINE_PER_CPU(struct rnd_state, bpf_user_rnd_state);
+
+void bpf_user_rnd_init_once(void)
+{
+ prandom_init_once(&bpf_user_rnd_state);
+}
+
+BPF_CALL_0(bpf_user_rnd_u32)
+{
+ /* Should someone ever have the rather unwise idea to use some
+ * of the registers passed into this function, then note that
+ * this function is called from native eBPF and classic-to-eBPF
+ * transformations. Register assignments from both sides are
+ * different, f.e. classic always sets fn(ctx, A, X) here.
+ */
+ struct rnd_state *state;
+ u32 res;
+
+ state = &get_cpu_var(bpf_user_rnd_state);
+ res = prandom_u32_state(state);
+ put_cpu_var(bpf_user_rnd_state);
+
+ return res;
+}
+
+BPF_CALL_0(bpf_get_raw_cpu_id)
+{
+ return raw_smp_processor_id();
+}
+
/* Weak definitions of helper functions in case we don't have bpf syscall. */
const struct bpf_func_proto bpf_map_lookup_elem_proto __weak;
const struct bpf_func_proto bpf_map_update_elem_proto __weak;
const struct bpf_func_proto bpf_map_delete_elem_proto __weak;
+const struct bpf_func_proto bpf_map_push_elem_proto __weak;
+const struct bpf_func_proto bpf_map_pop_elem_proto __weak;
+const struct bpf_func_proto bpf_map_peek_elem_proto __weak;
+const struct bpf_func_proto bpf_map_lookup_percpu_elem_proto __weak;
+const struct bpf_func_proto bpf_spin_lock_proto __weak;
+const struct bpf_func_proto bpf_spin_unlock_proto __weak;
+const struct bpf_func_proto bpf_jiffies64_proto __weak;
const struct bpf_func_proto bpf_get_prandom_u32_proto __weak;
const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak;
+const struct bpf_func_proto bpf_get_numa_node_id_proto __weak;
+const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
+const struct bpf_func_proto bpf_ktime_get_boot_ns_proto __weak;
+const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto __weak;
+const struct bpf_func_proto bpf_ktime_get_tai_ns_proto __weak;
+
+const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
+const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
+const struct bpf_func_proto bpf_get_current_comm_proto __weak;
+const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak;
+const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto __weak;
+const struct bpf_func_proto bpf_get_local_storage_proto __weak;
+const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto __weak;
+const struct bpf_func_proto bpf_snprintf_btf_proto __weak;
+const struct bpf_func_proto bpf_seq_printf_btf_proto __weak;
+const struct bpf_func_proto bpf_set_retval_proto __weak;
+const struct bpf_func_proto bpf_get_retval_proto __weak;
+
+const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
+{
+ return NULL;
+}
+
+const struct bpf_func_proto * __weak bpf_get_trace_vprintk_proto(void)
+{
+ return NULL;
+}
+
+const struct bpf_func_proto * __weak bpf_get_perf_event_read_value_proto(void)
+{
+ return NULL;
+}
+
+u64 __weak
+bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
+ void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
+{
+ return -ENOTSUPP;
+}
+EXPORT_SYMBOL_GPL(bpf_event_output);
+
+/* Always built-in helper functions. */
+const struct bpf_func_proto bpf_tail_call_proto = {
+ /* func is unused for tail_call, we set it to pass the
+ * get_helper_proto check
+ */
+ .func = BPF_PTR_POISON,
+ .gpl_only = false,
+ .ret_type = RET_VOID,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+};
+
+/* Stub for JITs that only support cBPF. eBPF programs are interpreted.
+ * It is encouraged to implement bpf_int_jit_compile() instead, so that
+ * eBPF and implicitly also cBPF can get JITed!
+ */
+struct bpf_prog * __weak bpf_int_jit_compile(struct bpf_prog *prog)
+{
+ return prog;
+}
+
+/* Stub for JITs that support eBPF. All cBPF code gets transformed into
+ * eBPF by the kernel and is later compiled by bpf_int_jit_compile().
+ */
+void __weak bpf_jit_compile(struct bpf_prog *prog)
+{
+}
+
+bool __weak bpf_helper_changes_pkt_data(enum bpf_func_id func_id)
+{
+ return false;
+}
+
+/* Return TRUE if the JIT backend wants verifier to enable sub-register usage
+ * analysis code and wants explicit zero extension inserted by verifier.
+ * Otherwise, return FALSE.
+ *
+ * The verifier inserts an explicit zero extension after BPF_CMPXCHGs even if
+ * you don't override this. JITs that don't want these extra insns can detect
+ * them using insn_is_zext.
+ */
+bool __weak bpf_jit_needs_zext(void)
+{
+ return false;
+}
+
+/* By default, enable the verifier's mitigations against Spectre v1 and v4 for
+ * all archs. The value returned must not change at runtime as there is
+ * currently no support for reloading programs that were loaded without
+ * mitigations.
+ */
+bool __weak bpf_jit_bypass_spec_v1(void)
+{
+ return false;
+}
+
+bool __weak bpf_jit_bypass_spec_v4(void)
+{
+ return false;
+}
+
+/* Return true if the JIT inlines the call to the helper corresponding to
+ * the imm.
+ *
+ * The verifier will not patch the insn->imm for the call to the helper if
+ * this returns true.
+ */
+bool __weak bpf_jit_inlines_helper_call(s32 imm)
+{
+ return false;
+}
+
+/* Return TRUE if the JIT backend supports mixing bpf2bpf and tailcalls. */
+bool __weak bpf_jit_supports_subprog_tailcalls(void)
+{
+ return false;
+}
+
+bool __weak bpf_jit_supports_percpu_insn(void)
+{
+ return false;
+}
+
+bool __weak bpf_jit_supports_kfunc_call(void)
+{
+ return false;
+}
+
+bool __weak bpf_jit_supports_far_kfunc_call(void)
+{
+ return false;
+}
+
+bool __weak bpf_jit_supports_arena(void)
+{
+ return false;
+}
+
+bool __weak bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena)
+{
+ return false;
+}
+
+u64 __weak bpf_arch_uaddress_limit(void)
+{
+#if defined(CONFIG_64BIT) && defined(CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE)
+ return TASK_SIZE;
+#else
+ return 0;
+#endif
+}
+
+/* Return TRUE if the JIT backend satisfies the following two conditions:
+ * 1) JIT backend supports atomic_xchg() on pointer-sized words.
+ * 2) Under the specific arch, the implementation of xchg() is the same
+ * as atomic_xchg() on pointer-sized words.
+ */
+bool __weak bpf_jit_supports_ptr_xchg(void)
+{
+ return false;
+}
/* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
* skb_copy_bits(), so provide a weak definition of it for NET-less config.
@@ -672,3 +3149,192 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
{
return -EFAULT;
}
+
+int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type old_t,
+ enum bpf_text_poke_type new_t, void *old_addr,
+ void *new_addr)
+{
+ return -ENOTSUPP;
+}
+
+void * __weak bpf_arch_text_copy(void *dst, void *src, size_t len)
+{
+ return ERR_PTR(-ENOTSUPP);
+}
+
+int __weak bpf_arch_text_invalidate(void *dst, size_t len)
+{
+ return -ENOTSUPP;
+}
+
+bool __weak bpf_jit_supports_exceptions(void)
+{
+ return false;
+}
+
+bool __weak bpf_jit_supports_private_stack(void)
+{
+ return false;
+}
+
+void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie)
+{
+}
+
+bool __weak bpf_jit_supports_timed_may_goto(void)
+{
+ return false;
+}
+
+u64 __weak arch_bpf_timed_may_goto(void)
+{
+ return 0;
+}
+
+static noinline void bpf_prog_report_may_goto_violation(void)
+{
+#ifdef CONFIG_BPF_SYSCALL
+ struct bpf_stream_stage ss;
+ struct bpf_prog *prog;
+
+ prog = bpf_prog_find_from_stack();
+ if (!prog)
+ return;
+ bpf_stream_stage(ss, prog, BPF_STDERR, ({
+ bpf_stream_printk(ss, "ERROR: Timeout detected for may_goto instruction\n");
+ bpf_stream_dump_stack(ss);
+ }));
+#endif
+}
+
+u64 bpf_check_timed_may_goto(struct bpf_timed_may_goto *p)
+{
+ u64 time = ktime_get_mono_fast_ns();
+
+ /* Populate the timestamp for this stack frame, and refresh count. */
+ if (!p->timestamp) {
+ p->timestamp = time;
+ return BPF_MAX_TIMED_LOOPS;
+ }
+ /* Check if we've exhausted our time slice, and zero count. */
+ if (unlikely(time - p->timestamp >= (NSEC_PER_SEC / 4))) {
+ bpf_prog_report_may_goto_violation();
+ return 0;
+ }
+ /* Refresh the count for the stack frame. */
+ return BPF_MAX_TIMED_LOOPS;
+}
+
+/* for configs without MMU or 32-bit */
+__weak const struct bpf_map_ops arena_map_ops;
+__weak u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena)
+{
+ return 0;
+}
+__weak u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena)
+{
+ return 0;
+}
+
+#ifdef CONFIG_BPF_SYSCALL
+static int __init bpf_global_ma_init(void)
+{
+ int ret;
+
+ ret = bpf_mem_alloc_init(&bpf_global_ma, 0, false);
+ bpf_global_ma_set = !ret;
+ return ret;
+}
+late_initcall(bpf_global_ma_init);
+#endif
+
+DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
+EXPORT_SYMBOL(bpf_stats_enabled_key);
+
+/* All definitions of tracepoints related to BPF. */
+#define CREATE_TRACE_POINTS
+#include <linux/bpf_trace.h>
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception);
+EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_bulk_tx);
+
+#ifdef CONFIG_BPF_SYSCALL
+
+int bpf_prog_get_file_line(struct bpf_prog *prog, unsigned long ip, const char **filep,
+ const char **linep, int *nump)
+{
+ int idx = -1, insn_start, insn_end, len;
+ struct bpf_line_info *linfo;
+ void **jited_linfo;
+ struct btf *btf;
+ int nr_linfo;
+
+ btf = prog->aux->btf;
+ linfo = prog->aux->linfo;
+ jited_linfo = prog->aux->jited_linfo;
+
+ if (!btf || !linfo || !jited_linfo)
+ return -EINVAL;
+ len = prog->aux->func ? prog->aux->func[prog->aux->func_idx]->len : prog->len;
+
+ linfo = &prog->aux->linfo[prog->aux->linfo_idx];
+ jited_linfo = &prog->aux->jited_linfo[prog->aux->linfo_idx];
+
+ insn_start = linfo[0].insn_off;
+ insn_end = insn_start + len;
+ nr_linfo = prog->aux->nr_linfo - prog->aux->linfo_idx;
+
+ for (int i = 0; i < nr_linfo &&
+ linfo[i].insn_off >= insn_start && linfo[i].insn_off < insn_end; i++) {
+ if (jited_linfo[i] >= (void *)ip)
+ break;
+ idx = i;
+ }
+
+ if (idx == -1)
+ return -ENOENT;
+
+ /* Get base component of the file path. */
+ *filep = btf_name_by_offset(btf, linfo[idx].file_name_off);
+ *filep = kbasename(*filep);
+ /* Obtain the source line, and strip whitespace in prefix. */
+ *linep = btf_name_by_offset(btf, linfo[idx].line_off);
+ while (isspace(**linep))
+ *linep += 1;
+ *nump = BPF_LINE_INFO_LINE_NUM(linfo[idx].line_col);
+ return 0;
+}
+
+struct walk_stack_ctx {
+ struct bpf_prog *prog;
+};
+
+static bool find_from_stack_cb(void *cookie, u64 ip, u64 sp, u64 bp)
+{
+ struct walk_stack_ctx *ctxp = cookie;
+ struct bpf_prog *prog;
+
+ /*
+ * The RCU read lock is held to safely traverse the latch tree, but we
+ * don't need its protection when accessing the prog, since it has an
+ * active stack frame on the current stack trace, and won't disappear.
+ */
+ rcu_read_lock();
+ prog = bpf_prog_ksym_find(ip);
+ rcu_read_unlock();
+ if (!prog)
+ return true;
+ /* Make sure we return the main prog if we found a subprog */
+ ctxp->prog = prog->aux->main_prog_aux->prog;
+ return false;
+}
+
+struct bpf_prog *bpf_prog_find_from_stack(void)
+{
+ struct walk_stack_ctx ctx = {};
+
+ arch_bpf_stack_walk(find_from_stack_cb, &ctx);
+ return ctx.prog;
+}
+
+#endif
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
new file mode 100644
index 000000000000..703e5df1f4ef
--- /dev/null
+++ b/kernel/bpf/cpumap.c
@@ -0,0 +1,813 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* bpf/cpumap.c
+ *
+ * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
+ */
+
+/**
+ * DOC: cpu map
+ * The 'cpumap' is primarily used as a backend map for XDP BPF helper
+ * call bpf_redirect_map() and XDP_REDIRECT action, like 'devmap'.
+ *
+ * Unlike devmap which redirects XDP frames out to another NIC device,
+ * this map type redirects raw XDP frames to another CPU. The remote
+ * CPU will do SKB-allocation and call the normal network stack.
+ */
+/*
+ * This is a scalability and isolation mechanism, that allow
+ * separating the early driver network XDP layer, from the rest of the
+ * netstack, and assigning dedicated CPUs for this stage. This
+ * basically allows for 10G wirespeed pre-filtering via bpf.
+ */
+#include <linux/bitops.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/ptr_ring.h>
+#include <net/xdp.h>
+#include <net/hotdata.h>
+
+#include <linux/sched.h>
+#include <linux/workqueue.h>
+#include <linux/kthread.h>
+#include <linux/completion.h>
+#include <trace/events/xdp.h>
+#include <linux/btf_ids.h>
+
+#include <linux/netdevice.h>
+#include <net/gro.h>
+
+/* General idea: XDP packets getting XDP redirected to another CPU,
+ * will maximum be stored/queued for one driver ->poll() call. It is
+ * guaranteed that queueing the frame and the flush operation happen on
+ * same CPU. Thus, cpu_map_flush operation can deduct via this_cpu_ptr()
+ * which queue in bpf_cpu_map_entry contains packets.
+ */
+
+#define CPU_MAP_BULK_SIZE 8 /* 8 == one cacheline on 64-bit archs */
+struct bpf_cpu_map_entry;
+struct bpf_cpu_map;
+
+struct xdp_bulk_queue {
+ void *q[CPU_MAP_BULK_SIZE];
+ struct list_head flush_node;
+ struct bpf_cpu_map_entry *obj;
+ unsigned int count;
+};
+
+/* Struct for every remote "destination" CPU in map */
+struct bpf_cpu_map_entry {
+ u32 cpu; /* kthread CPU and map index */
+ int map_id; /* Back reference to map */
+
+ /* XDP can run multiple RX-ring queues, need __percpu enqueue store */
+ struct xdp_bulk_queue __percpu *bulkq;
+
+ /* Queue with potential multi-producers, and single-consumer kthread */
+ struct ptr_ring *queue;
+ struct task_struct *kthread;
+
+ struct bpf_cpumap_val value;
+ struct bpf_prog *prog;
+ struct gro_node gro;
+
+ struct completion kthread_running;
+ struct rcu_work free_work;
+};
+
+struct bpf_cpu_map {
+ struct bpf_map map;
+ /* Below members specific for map type */
+ struct bpf_cpu_map_entry __rcu **cpu_map;
+};
+
+static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
+{
+ u32 value_size = attr->value_size;
+ struct bpf_cpu_map *cmap;
+
+ /* check sanity of attributes */
+ if (attr->max_entries == 0 || attr->key_size != 4 ||
+ (value_size != offsetofend(struct bpf_cpumap_val, qsize) &&
+ value_size != offsetofend(struct bpf_cpumap_val, bpf_prog.fd)) ||
+ attr->map_flags & ~BPF_F_NUMA_NODE)
+ return ERR_PTR(-EINVAL);
+
+ /* Pre-limit array size based on NR_CPUS, not final CPU check */
+ if (attr->max_entries > NR_CPUS)
+ return ERR_PTR(-E2BIG);
+
+ cmap = bpf_map_area_alloc(sizeof(*cmap), NUMA_NO_NODE);
+ if (!cmap)
+ return ERR_PTR(-ENOMEM);
+
+ bpf_map_init_from_attr(&cmap->map, attr);
+
+ /* Alloc array for possible remote "destination" CPUs */
+ cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries *
+ sizeof(struct bpf_cpu_map_entry *),
+ cmap->map.numa_node);
+ if (!cmap->cpu_map) {
+ bpf_map_area_free(cmap);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return &cmap->map;
+}
+
+static void __cpu_map_ring_cleanup(struct ptr_ring *ring)
+{
+ /* The tear-down procedure should have made sure that queue is
+ * empty. See __cpu_map_entry_replace() and work-queue
+ * invoked cpu_map_kthread_stop(). Catch any broken behaviour
+ * gracefully and warn once.
+ */
+ void *ptr;
+
+ while ((ptr = ptr_ring_consume(ring))) {
+ WARN_ON_ONCE(1);
+ if (unlikely(__ptr_test_bit(0, &ptr))) {
+ __ptr_clear_bit(0, &ptr);
+ kfree_skb(ptr);
+ continue;
+ }
+ xdp_return_frame(ptr);
+ }
+}
+
+static u32 cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu,
+ void **skbs, u32 skb_n,
+ struct xdp_cpumap_stats *stats)
+{
+ struct xdp_buff xdp;
+ u32 act, pass = 0;
+ int err;
+
+ for (u32 i = 0; i < skb_n; i++) {
+ struct sk_buff *skb = skbs[i];
+
+ act = bpf_prog_run_generic_xdp(skb, &xdp, rcpu->prog);
+ switch (act) {
+ case XDP_PASS:
+ skbs[pass++] = skb;
+ break;
+ case XDP_REDIRECT:
+ err = xdp_do_generic_redirect(skb->dev, skb, &xdp,
+ rcpu->prog);
+ if (unlikely(err)) {
+ kfree_skb(skb);
+ stats->drop++;
+ } else {
+ stats->redirect++;
+ }
+ break;
+ default:
+ bpf_warn_invalid_xdp_action(NULL, rcpu->prog, act);
+ fallthrough;
+ case XDP_ABORTED:
+ trace_xdp_exception(skb->dev, rcpu->prog, act);
+ fallthrough;
+ case XDP_DROP:
+ napi_consume_skb(skb, true);
+ stats->drop++;
+ break;
+ }
+ }
+
+ stats->pass += pass;
+
+ return pass;
+}
+
+static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
+ void **frames, int n,
+ struct xdp_cpumap_stats *stats)
+{
+ struct xdp_rxq_info rxq = {};
+ struct xdp_buff xdp;
+ int i, nframes = 0;
+
+ xdp.rxq = &rxq;
+
+ for (i = 0; i < n; i++) {
+ struct xdp_frame *xdpf = frames[i];
+ u32 act;
+ int err;
+
+ rxq.dev = xdpf->dev_rx;
+ rxq.mem.type = xdpf->mem_type;
+ /* TODO: report queue_index to xdp_rxq_info */
+
+ xdp_convert_frame_to_buff(xdpf, &xdp);
+
+ act = bpf_prog_run_xdp(rcpu->prog, &xdp);
+ switch (act) {
+ case XDP_PASS:
+ err = xdp_update_frame_from_buff(&xdp, xdpf);
+ if (err < 0) {
+ xdp_return_frame(xdpf);
+ stats->drop++;
+ } else {
+ frames[nframes++] = xdpf;
+ }
+ break;
+ case XDP_REDIRECT:
+ err = xdp_do_redirect(xdpf->dev_rx, &xdp,
+ rcpu->prog);
+ if (unlikely(err)) {
+ xdp_return_frame(xdpf);
+ stats->drop++;
+ } else {
+ stats->redirect++;
+ }
+ break;
+ default:
+ bpf_warn_invalid_xdp_action(NULL, rcpu->prog, act);
+ fallthrough;
+ case XDP_DROP:
+ xdp_return_frame(xdpf);
+ stats->drop++;
+ break;
+ }
+ }
+
+ stats->pass += nframes;
+
+ return nframes;
+}
+
+#define CPUMAP_BATCH 8
+
+struct cpu_map_ret {
+ u32 xdp_n;
+ u32 skb_n;
+};
+
+static void cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames,
+ void **skbs, struct cpu_map_ret *ret,
+ struct xdp_cpumap_stats *stats)
+{
+ struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
+
+ if (!rcpu->prog)
+ goto out;
+
+ rcu_read_lock();
+ bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
+ xdp_set_return_frame_no_direct();
+
+ ret->xdp_n = cpu_map_bpf_prog_run_xdp(rcpu, frames, ret->xdp_n, stats);
+ if (unlikely(ret->skb_n))
+ ret->skb_n = cpu_map_bpf_prog_run_skb(rcpu, skbs, ret->skb_n,
+ stats);
+
+ if (stats->redirect)
+ xdp_do_flush();
+
+ xdp_clear_return_frame_no_direct();
+ bpf_net_ctx_clear(bpf_net_ctx);
+ rcu_read_unlock();
+
+out:
+ if (unlikely(ret->skb_n) && ret->xdp_n)
+ memmove(&skbs[ret->xdp_n], skbs, ret->skb_n * sizeof(*skbs));
+}
+
+static void cpu_map_gro_flush(struct bpf_cpu_map_entry *rcpu, bool empty)
+{
+ /*
+ * If the ring is not empty, there'll be a new iteration soon, and we
+ * only need to do a full flush if a tick is long (> 1 ms).
+ * If the ring is empty, to not hold GRO packets in the stack for too
+ * long, do a full flush.
+ * This is equivalent to how NAPI decides whether to perform a full
+ * flush.
+ */
+ gro_flush_normal(&rcpu->gro, !empty && HZ >= 1000);
+}
+
+static int cpu_map_kthread_run(void *data)
+{
+ struct bpf_cpu_map_entry *rcpu = data;
+ unsigned long last_qs = jiffies;
+ u32 packets = 0;
+
+ complete(&rcpu->kthread_running);
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ /* When kthread gives stop order, then rcpu have been disconnected
+ * from map, thus no new packets can enter. Remaining in-flight
+ * per CPU stored packets are flushed to this queue. Wait honoring
+ * kthread_stop signal until queue is empty.
+ */
+ while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) {
+ struct xdp_cpumap_stats stats = {}; /* zero stats */
+ unsigned int kmem_alloc_drops = 0, sched = 0;
+ struct cpu_map_ret ret = { };
+ void *frames[CPUMAP_BATCH];
+ void *skbs[CPUMAP_BATCH];
+ u32 i, n, m;
+ bool empty;
+
+ /* Release CPU reschedule checks */
+ if (__ptr_ring_empty(rcpu->queue)) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ /* Recheck to avoid lost wake-up */
+ if (__ptr_ring_empty(rcpu->queue)) {
+ schedule();
+ sched = 1;
+ last_qs = jiffies;
+ } else {
+ __set_current_state(TASK_RUNNING);
+ }
+ } else {
+ rcu_softirq_qs_periodic(last_qs);
+ sched = cond_resched();
+ }
+
+ /*
+ * The bpf_cpu_map_entry is single consumer, with this
+ * kthread CPU pinned. Lockless access to ptr_ring
+ * consume side valid as no-resize allowed of queue.
+ */
+ n = __ptr_ring_consume_batched(rcpu->queue, frames,
+ CPUMAP_BATCH);
+ for (i = 0; i < n; i++) {
+ void *f = frames[i];
+ struct page *page;
+
+ if (unlikely(__ptr_test_bit(0, &f))) {
+ struct sk_buff *skb = f;
+
+ __ptr_clear_bit(0, &skb);
+ skbs[ret.skb_n++] = skb;
+ continue;
+ }
+
+ frames[ret.xdp_n++] = f;
+ page = virt_to_page(f);
+
+ /* Bring struct page memory area to curr CPU. Read by
+ * build_skb_around via page_is_pfmemalloc(), and when
+ * freed written by page_frag_free call.
+ */
+ prefetchw(page);
+ }
+
+ local_bh_disable();
+
+ /* Support running another XDP prog on this CPU */
+ cpu_map_bpf_prog_run(rcpu, frames, skbs, &ret, &stats);
+ if (!ret.xdp_n)
+ goto stats;
+
+ m = napi_skb_cache_get_bulk(skbs, ret.xdp_n);
+ if (unlikely(m < ret.xdp_n)) {
+ for (i = m; i < ret.xdp_n; i++)
+ xdp_return_frame(frames[i]);
+
+ if (ret.skb_n)
+ memmove(&skbs[m], &skbs[ret.xdp_n],
+ ret.skb_n * sizeof(*skbs));
+
+ kmem_alloc_drops += ret.xdp_n - m;
+ ret.xdp_n = m;
+ }
+
+ for (i = 0; i < ret.xdp_n; i++) {
+ struct xdp_frame *xdpf = frames[i];
+
+ /* Can fail only when !skb -- already handled above */
+ __xdp_build_skb_from_frame(xdpf, skbs[i], xdpf->dev_rx);
+ }
+
+stats:
+ /* Feedback loop via tracepoint.
+ * NB: keep before recv to allow measuring enqueue/dequeue latency.
+ */
+ trace_xdp_cpumap_kthread(rcpu->map_id, n, kmem_alloc_drops,
+ sched, &stats);
+
+ for (i = 0; i < ret.xdp_n + ret.skb_n; i++)
+ gro_receive_skb(&rcpu->gro, skbs[i]);
+
+ /* Flush either every 64 packets or in case of empty ring */
+ packets += n;
+ empty = __ptr_ring_empty(rcpu->queue);
+ if (packets >= NAPI_POLL_WEIGHT || empty) {
+ cpu_map_gro_flush(rcpu, empty);
+ packets = 0;
+ }
+
+ local_bh_enable(); /* resched point, may call do_softirq() */
+ }
+ __set_current_state(TASK_RUNNING);
+
+ return 0;
+}
+
+static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu,
+ struct bpf_map *map, int fd)
+{
+ struct bpf_prog *prog;
+
+ prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ if (prog->expected_attach_type != BPF_XDP_CPUMAP ||
+ !bpf_prog_map_compatible(map, prog)) {
+ bpf_prog_put(prog);
+ return -EINVAL;
+ }
+
+ rcpu->value.bpf_prog.id = prog->aux->id;
+ rcpu->prog = prog;
+
+ return 0;
+}
+
+static struct bpf_cpu_map_entry *
+__cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,
+ u32 cpu)
+{
+ int numa, err, i, fd = value->bpf_prog.fd;
+ gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
+ struct bpf_cpu_map_entry *rcpu;
+ struct xdp_bulk_queue *bq;
+
+ /* Have map->numa_node, but choose node of redirect target CPU */
+ numa = cpu_to_node(cpu);
+
+ rcpu = bpf_map_kmalloc_node(map, sizeof(*rcpu), gfp | __GFP_ZERO, numa);
+ if (!rcpu)
+ return NULL;
+
+ /* Alloc percpu bulkq */
+ rcpu->bulkq = bpf_map_alloc_percpu(map, sizeof(*rcpu->bulkq),
+ sizeof(void *), gfp);
+ if (!rcpu->bulkq)
+ goto free_rcu;
+
+ for_each_possible_cpu(i) {
+ bq = per_cpu_ptr(rcpu->bulkq, i);
+ bq->obj = rcpu;
+ }
+
+ /* Alloc queue */
+ rcpu->queue = bpf_map_kmalloc_node(map, sizeof(*rcpu->queue), gfp,
+ numa);
+ if (!rcpu->queue)
+ goto free_bulkq;
+
+ err = ptr_ring_init(rcpu->queue, value->qsize, gfp);
+ if (err)
+ goto free_queue;
+
+ rcpu->cpu = cpu;
+ rcpu->map_id = map->id;
+ rcpu->value.qsize = value->qsize;
+ gro_init(&rcpu->gro);
+
+ if (fd > 0 && __cpu_map_load_bpf_program(rcpu, map, fd))
+ goto free_ptr_ring;
+
+ /* Setup kthread */
+ init_completion(&rcpu->kthread_running);
+ rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa,
+ "cpumap/%d/map:%d", cpu,
+ map->id);
+ if (IS_ERR(rcpu->kthread))
+ goto free_prog;
+
+ /* Make sure kthread runs on a single CPU */
+ kthread_bind(rcpu->kthread, cpu);
+ wake_up_process(rcpu->kthread);
+
+ /* Make sure kthread has been running, so kthread_stop() will not
+ * stop the kthread prematurely and all pending frames or skbs
+ * will be handled by the kthread before kthread_stop() returns.
+ */
+ wait_for_completion(&rcpu->kthread_running);
+
+ return rcpu;
+
+free_prog:
+ if (rcpu->prog)
+ bpf_prog_put(rcpu->prog);
+free_ptr_ring:
+ gro_cleanup(&rcpu->gro);
+ ptr_ring_cleanup(rcpu->queue, NULL);
+free_queue:
+ kfree(rcpu->queue);
+free_bulkq:
+ free_percpu(rcpu->bulkq);
+free_rcu:
+ kfree(rcpu);
+ return NULL;
+}
+
+static void __cpu_map_entry_free(struct work_struct *work)
+{
+ struct bpf_cpu_map_entry *rcpu;
+
+ /* This cpu_map_entry have been disconnected from map and one
+ * RCU grace-period have elapsed. Thus, XDP cannot queue any
+ * new packets and cannot change/set flush_needed that can
+ * find this entry.
+ */
+ rcpu = container_of(to_rcu_work(work), struct bpf_cpu_map_entry, free_work);
+
+ /* kthread_stop will wake_up_process and wait for it to complete.
+ * cpu_map_kthread_run() makes sure the pointer ring is empty
+ * before exiting.
+ */
+ kthread_stop(rcpu->kthread);
+
+ if (rcpu->prog)
+ bpf_prog_put(rcpu->prog);
+ gro_cleanup(&rcpu->gro);
+ /* The queue should be empty at this point */
+ __cpu_map_ring_cleanup(rcpu->queue);
+ ptr_ring_cleanup(rcpu->queue, NULL);
+ kfree(rcpu->queue);
+ free_percpu(rcpu->bulkq);
+ kfree(rcpu);
+}
+
+/* After the xchg of the bpf_cpu_map_entry pointer, we need to make sure the old
+ * entry is no longer in use before freeing. We use queue_rcu_work() to call
+ * __cpu_map_entry_free() in a separate workqueue after waiting for an RCU grace
+ * period. This means that (a) all pending enqueue and flush operations have
+ * completed (because of the RCU callback), and (b) we are in a workqueue
+ * context where we can stop the kthread and wait for it to exit before freeing
+ * everything.
+ */
+static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
+ u32 key_cpu, struct bpf_cpu_map_entry *rcpu)
+{
+ struct bpf_cpu_map_entry *old_rcpu;
+
+ old_rcpu = unrcu_pointer(xchg(&cmap->cpu_map[key_cpu], RCU_INITIALIZER(rcpu)));
+ if (old_rcpu) {
+ INIT_RCU_WORK(&old_rcpu->free_work, __cpu_map_entry_free);
+ queue_rcu_work(system_percpu_wq, &old_rcpu->free_work);
+ }
+}
+
+static long cpu_map_delete_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+ u32 key_cpu = *(u32 *)key;
+
+ if (key_cpu >= map->max_entries)
+ return -EINVAL;
+
+ /* notice caller map_delete_elem() uses rcu_read_lock() */
+ __cpu_map_entry_replace(cmap, key_cpu, NULL);
+ return 0;
+}
+
+static long cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
+{
+ struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+ struct bpf_cpumap_val cpumap_value = {};
+ struct bpf_cpu_map_entry *rcpu;
+ /* Array index key correspond to CPU number */
+ u32 key_cpu = *(u32 *)key;
+
+ memcpy(&cpumap_value, value, map->value_size);
+
+ if (unlikely(map_flags > BPF_EXIST))
+ return -EINVAL;
+ if (unlikely(key_cpu >= cmap->map.max_entries))
+ return -E2BIG;
+ if (unlikely(map_flags == BPF_NOEXIST))
+ return -EEXIST;
+ if (unlikely(cpumap_value.qsize > 16384)) /* sanity limit on qsize */
+ return -EOVERFLOW;
+
+ /* Make sure CPU is a valid possible cpu */
+ if (key_cpu >= nr_cpumask_bits || !cpu_possible(key_cpu))
+ return -ENODEV;
+
+ if (cpumap_value.qsize == 0) {
+ rcpu = NULL; /* Same as deleting */
+ } else {
+ /* Updating qsize cause re-allocation of bpf_cpu_map_entry */
+ rcpu = __cpu_map_entry_alloc(map, &cpumap_value, key_cpu);
+ if (!rcpu)
+ return -ENOMEM;
+ }
+ rcu_read_lock();
+ __cpu_map_entry_replace(cmap, key_cpu, rcpu);
+ rcu_read_unlock();
+ return 0;
+}
+
+static void cpu_map_free(struct bpf_map *map)
+{
+ struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+ u32 i;
+
+ /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
+ * so the bpf programs (can be more than one that used this map) were
+ * disconnected from events. Wait for outstanding critical sections in
+ * these programs to complete. synchronize_rcu() below not only
+ * guarantees no further "XDP/bpf-side" reads against
+ * bpf_cpu_map->cpu_map, but also ensure pending flush operations
+ * (if any) are completed.
+ */
+ synchronize_rcu();
+
+ /* The only possible user of bpf_cpu_map_entry is
+ * cpu_map_kthread_run().
+ */
+ for (i = 0; i < cmap->map.max_entries; i++) {
+ struct bpf_cpu_map_entry *rcpu;
+
+ rcpu = rcu_dereference_raw(cmap->cpu_map[i]);
+ if (!rcpu)
+ continue;
+
+ /* Stop kthread and cleanup entry directly */
+ __cpu_map_entry_free(&rcpu->free_work.work);
+ }
+ bpf_map_area_free(cmap->cpu_map);
+ bpf_map_area_free(cmap);
+}
+
+/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or
+ * by local_bh_disable() (from XDP calls inside NAPI). The
+ * rcu_read_lock_bh_held() below makes lockdep accept both.
+ */
+static void *__cpu_map_lookup_elem(struct bpf_map *map, u32 key)
+{
+ struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+ struct bpf_cpu_map_entry *rcpu;
+
+ if (key >= map->max_entries)
+ return NULL;
+
+ rcpu = rcu_dereference_check(cmap->cpu_map[key],
+ rcu_read_lock_bh_held());
+ return rcpu;
+}
+
+static void *cpu_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_cpu_map_entry *rcpu =
+ __cpu_map_lookup_elem(map, *(u32 *)key);
+
+ return rcpu ? &rcpu->value : NULL;
+}
+
+static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+ u32 index = key ? *(u32 *)key : U32_MAX;
+ u32 *next = next_key;
+
+ if (index >= cmap->map.max_entries) {
+ *next = 0;
+ return 0;
+ }
+
+ if (index == cmap->map.max_entries - 1)
+ return -ENOENT;
+ *next = index + 1;
+ return 0;
+}
+
+static long cpu_map_redirect(struct bpf_map *map, u64 index, u64 flags)
+{
+ return __bpf_xdp_redirect_map(map, index, flags, 0,
+ __cpu_map_lookup_elem);
+}
+
+static u64 cpu_map_mem_usage(const struct bpf_map *map)
+{
+ u64 usage = sizeof(struct bpf_cpu_map);
+
+ /* Currently the dynamically allocated elements are not counted */
+ usage += (u64)map->max_entries * sizeof(struct bpf_cpu_map_entry *);
+ return usage;
+}
+
+BTF_ID_LIST_SINGLE(cpu_map_btf_ids, struct, bpf_cpu_map)
+const struct bpf_map_ops cpu_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
+ .map_alloc = cpu_map_alloc,
+ .map_free = cpu_map_free,
+ .map_delete_elem = cpu_map_delete_elem,
+ .map_update_elem = cpu_map_update_elem,
+ .map_lookup_elem = cpu_map_lookup_elem,
+ .map_get_next_key = cpu_map_get_next_key,
+ .map_check_btf = map_check_no_btf,
+ .map_mem_usage = cpu_map_mem_usage,
+ .map_btf_id = &cpu_map_btf_ids[0],
+ .map_redirect = cpu_map_redirect,
+};
+
+static void bq_flush_to_queue(struct xdp_bulk_queue *bq)
+{
+ struct bpf_cpu_map_entry *rcpu = bq->obj;
+ unsigned int processed = 0, drops = 0;
+ const int to_cpu = rcpu->cpu;
+ struct ptr_ring *q;
+ int i;
+
+ if (unlikely(!bq->count))
+ return;
+
+ q = rcpu->queue;
+ spin_lock(&q->producer_lock);
+
+ for (i = 0; i < bq->count; i++) {
+ struct xdp_frame *xdpf = bq->q[i];
+ int err;
+
+ err = __ptr_ring_produce(q, xdpf);
+ if (err) {
+ drops++;
+ xdp_return_frame_rx_napi(xdpf);
+ }
+ processed++;
+ }
+ bq->count = 0;
+ spin_unlock(&q->producer_lock);
+
+ __list_del_clearprev(&bq->flush_node);
+
+ /* Feedback loop via tracepoints */
+ trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu);
+}
+
+/* Runs under RCU-read-side, plus in softirq under NAPI protection.
+ * Thus, safe percpu variable access.
+ */
+static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
+{
+ struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);
+
+ if (unlikely(bq->count == CPU_MAP_BULK_SIZE))
+ bq_flush_to_queue(bq);
+
+ /* Notice, xdp_buff/page MUST be queued here, long enough for
+ * driver to code invoking us to finished, due to driver
+ * (e.g. ixgbe) recycle tricks based on page-refcnt.
+ *
+ * Thus, incoming xdp_frame is always queued here (else we race
+ * with another CPU on page-refcnt and remaining driver code).
+ * Queue time is very short, as driver will invoke flush
+ * operation, when completing napi->poll call.
+ */
+ bq->q[bq->count++] = xdpf;
+
+ if (!bq->flush_node.prev) {
+ struct list_head *flush_list = bpf_net_ctx_get_cpu_map_flush_list();
+
+ list_add(&bq->flush_node, flush_list);
+ }
+}
+
+int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf,
+ struct net_device *dev_rx)
+{
+ /* Info needed when constructing SKB on remote CPU */
+ xdpf->dev_rx = dev_rx;
+
+ bq_enqueue(rcpu, xdpf);
+ return 0;
+}
+
+int cpu_map_generic_redirect(struct bpf_cpu_map_entry *rcpu,
+ struct sk_buff *skb)
+{
+ int ret;
+
+ __skb_pull(skb, skb->mac_len);
+ skb_set_redirected(skb, false);
+ __ptr_set_bit(0, &skb);
+
+ ret = ptr_ring_produce(rcpu->queue, skb);
+ if (ret < 0)
+ goto trace;
+
+ wake_up_process(rcpu->kthread);
+trace:
+ trace_xdp_cpumap_enqueue(rcpu->map_id, !ret, !!ret, rcpu->cpu);
+ return ret;
+}
+
+void __cpu_map_flush(struct list_head *flush_list)
+{
+ struct xdp_bulk_queue *bq, *tmp;
+
+ list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
+ bq_flush_to_queue(bq);
+
+ /* If already running, costs spin_lock_irqsave + smb_mb */
+ wake_up_process(bq->obj->kthread);
+ }
+}
diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c
new file mode 100644
index 000000000000..9876c5fe6c2a
--- /dev/null
+++ b/kernel/bpf/cpumask.c
@@ -0,0 +1,534 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2023 Meta, Inc */
+#include <linux/bpf.h>
+#include <linux/bpf_mem_alloc.h>
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
+#include <linux/cpumask.h>
+
+/**
+ * struct bpf_cpumask - refcounted BPF cpumask wrapper structure
+ * @cpumask: The actual cpumask embedded in the struct.
+ * @usage: Object reference counter. When the refcount goes to 0, the
+ * memory is released back to the BPF allocator, which provides
+ * RCU safety.
+ *
+ * Note that we explicitly embed a cpumask_t rather than a cpumask_var_t. This
+ * is done to avoid confusing the verifier due to the typedef of cpumask_var_t
+ * changing depending on whether CONFIG_CPUMASK_OFFSTACK is defined or not. See
+ * the details in <linux/cpumask.h>. The consequence is that this structure is
+ * likely a bit larger than it needs to be when CONFIG_CPUMASK_OFFSTACK is
+ * defined due to embedding the whole NR_CPUS-size bitmap, but the extra memory
+ * overhead is minimal. For the more typical case of CONFIG_CPUMASK_OFFSTACK
+ * not being defined, the structure is the same size regardless.
+ */
+struct bpf_cpumask {
+ cpumask_t cpumask;
+ refcount_t usage;
+};
+
+static struct bpf_mem_alloc bpf_cpumask_ma;
+
+static bool cpu_valid(u32 cpu)
+{
+ return cpu < nr_cpu_ids;
+}
+
+__bpf_kfunc_start_defs();
+
+/**
+ * bpf_cpumask_create() - Create a mutable BPF cpumask.
+ *
+ * Allocates a cpumask that can be queried, mutated, acquired, and released by
+ * a BPF program. The cpumask returned by this function must either be embedded
+ * in a map as a kptr, or freed with bpf_cpumask_release().
+ *
+ * bpf_cpumask_create() allocates memory using the BPF memory allocator, and
+ * will not block. It may return NULL if no memory is available.
+ *
+ * Return:
+ * * A pointer to a new struct bpf_cpumask instance on success.
+ * * NULL if the BPF memory allocator is out of memory.
+ */
+__bpf_kfunc struct bpf_cpumask *bpf_cpumask_create(void)
+{
+ struct bpf_cpumask *cpumask;
+
+ /* cpumask must be the first element so struct bpf_cpumask be cast to struct cpumask. */
+ BUILD_BUG_ON(offsetof(struct bpf_cpumask, cpumask) != 0);
+
+ cpumask = bpf_mem_cache_alloc(&bpf_cpumask_ma);
+ if (!cpumask)
+ return NULL;
+
+ memset(cpumask, 0, sizeof(*cpumask));
+ refcount_set(&cpumask->usage, 1);
+
+ return cpumask;
+}
+
+/**
+ * bpf_cpumask_acquire() - Acquire a reference to a BPF cpumask.
+ * @cpumask: The BPF cpumask being acquired. The cpumask must be a trusted
+ * pointer.
+ *
+ * Acquires a reference to a BPF cpumask. The cpumask returned by this function
+ * must either be embedded in a map as a kptr, or freed with
+ * bpf_cpumask_release().
+ *
+ * Return:
+ * * The struct bpf_cpumask pointer passed to the function.
+ *
+ */
+__bpf_kfunc struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask)
+{
+ refcount_inc(&cpumask->usage);
+ return cpumask;
+}
+
+/**
+ * bpf_cpumask_release() - Release a previously acquired BPF cpumask.
+ * @cpumask: The cpumask being released.
+ *
+ * Releases a previously acquired reference to a BPF cpumask. When the final
+ * reference of the BPF cpumask has been released, it is subsequently freed in
+ * an RCU callback in the BPF memory allocator.
+ */
+__bpf_kfunc void bpf_cpumask_release(struct bpf_cpumask *cpumask)
+{
+ if (!refcount_dec_and_test(&cpumask->usage))
+ return;
+
+ bpf_mem_cache_free_rcu(&bpf_cpumask_ma, cpumask);
+}
+
+__bpf_kfunc void bpf_cpumask_release_dtor(void *cpumask)
+{
+ bpf_cpumask_release(cpumask);
+}
+CFI_NOSEAL(bpf_cpumask_release_dtor);
+
+/**
+ * bpf_cpumask_first() - Get the index of the first nonzero bit in the cpumask.
+ * @cpumask: The cpumask being queried.
+ *
+ * Find the index of the first nonzero bit of the cpumask. A struct bpf_cpumask
+ * pointer may be safely passed to this function.
+ *
+ * Return:
+ * * The index of the first nonzero bit in the struct cpumask.
+ */
+__bpf_kfunc u32 bpf_cpumask_first(const struct cpumask *cpumask)
+{
+ return cpumask_first(cpumask);
+}
+
+/**
+ * bpf_cpumask_first_zero() - Get the index of the first unset bit in the
+ * cpumask.
+ * @cpumask: The cpumask being queried.
+ *
+ * Find the index of the first unset bit of the cpumask. A struct bpf_cpumask
+ * pointer may be safely passed to this function.
+ *
+ * Return:
+ * * The index of the first zero bit in the struct cpumask.
+ */
+__bpf_kfunc u32 bpf_cpumask_first_zero(const struct cpumask *cpumask)
+{
+ return cpumask_first_zero(cpumask);
+}
+
+/**
+ * bpf_cpumask_first_and() - Return the index of the first nonzero bit from the
+ * AND of two cpumasks.
+ * @src1: The first cpumask.
+ * @src2: The second cpumask.
+ *
+ * Find the index of the first nonzero bit of the AND of two cpumasks.
+ * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
+ *
+ * Return:
+ * * The index of the first bit that is nonzero in both cpumask instances.
+ */
+__bpf_kfunc u32 bpf_cpumask_first_and(const struct cpumask *src1,
+ const struct cpumask *src2)
+{
+ return cpumask_first_and(src1, src2);
+}
+
+/**
+ * bpf_cpumask_set_cpu() - Set a bit for a CPU in a BPF cpumask.
+ * @cpu: The CPU to be set in the cpumask.
+ * @cpumask: The BPF cpumask in which a bit is being set.
+ */
+__bpf_kfunc void bpf_cpumask_set_cpu(u32 cpu, struct bpf_cpumask *cpumask)
+{
+ if (!cpu_valid(cpu))
+ return;
+
+ cpumask_set_cpu(cpu, (struct cpumask *)cpumask);
+}
+
+/**
+ * bpf_cpumask_clear_cpu() - Clear a bit for a CPU in a BPF cpumask.
+ * @cpu: The CPU to be cleared from the cpumask.
+ * @cpumask: The BPF cpumask in which a bit is being cleared.
+ */
+__bpf_kfunc void bpf_cpumask_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask)
+{
+ if (!cpu_valid(cpu))
+ return;
+
+ cpumask_clear_cpu(cpu, (struct cpumask *)cpumask);
+}
+
+/**
+ * bpf_cpumask_test_cpu() - Test whether a CPU is set in a cpumask.
+ * @cpu: The CPU being queried for.
+ * @cpumask: The cpumask being queried for containing a CPU.
+ *
+ * Return:
+ * * true - @cpu is set in the cpumask
+ * * false - @cpu was not set in the cpumask, or @cpu is an invalid cpu.
+ */
+__bpf_kfunc bool bpf_cpumask_test_cpu(u32 cpu, const struct cpumask *cpumask)
+{
+ if (!cpu_valid(cpu))
+ return false;
+
+ return cpumask_test_cpu(cpu, (struct cpumask *)cpumask);
+}
+
+/**
+ * bpf_cpumask_test_and_set_cpu() - Atomically test and set a CPU in a BPF cpumask.
+ * @cpu: The CPU being set and queried for.
+ * @cpumask: The BPF cpumask being set and queried for containing a CPU.
+ *
+ * Return:
+ * * true - @cpu is set in the cpumask
+ * * false - @cpu was not set in the cpumask, or @cpu is invalid.
+ */
+__bpf_kfunc bool bpf_cpumask_test_and_set_cpu(u32 cpu, struct bpf_cpumask *cpumask)
+{
+ if (!cpu_valid(cpu))
+ return false;
+
+ return cpumask_test_and_set_cpu(cpu, (struct cpumask *)cpumask);
+}
+
+/**
+ * bpf_cpumask_test_and_clear_cpu() - Atomically test and clear a CPU in a BPF
+ * cpumask.
+ * @cpu: The CPU being cleared and queried for.
+ * @cpumask: The BPF cpumask being cleared and queried for containing a CPU.
+ *
+ * Return:
+ * * true - @cpu is set in the cpumask
+ * * false - @cpu was not set in the cpumask, or @cpu is invalid.
+ */
+__bpf_kfunc bool bpf_cpumask_test_and_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask)
+{
+ if (!cpu_valid(cpu))
+ return false;
+
+ return cpumask_test_and_clear_cpu(cpu, (struct cpumask *)cpumask);
+}
+
+/**
+ * bpf_cpumask_setall() - Set all of the bits in a BPF cpumask.
+ * @cpumask: The BPF cpumask having all of its bits set.
+ */
+__bpf_kfunc void bpf_cpumask_setall(struct bpf_cpumask *cpumask)
+{
+ cpumask_setall((struct cpumask *)cpumask);
+}
+
+/**
+ * bpf_cpumask_clear() - Clear all of the bits in a BPF cpumask.
+ * @cpumask: The BPF cpumask being cleared.
+ */
+__bpf_kfunc void bpf_cpumask_clear(struct bpf_cpumask *cpumask)
+{
+ cpumask_clear((struct cpumask *)cpumask);
+}
+
+/**
+ * bpf_cpumask_and() - AND two cpumasks and store the result.
+ * @dst: The BPF cpumask where the result is being stored.
+ * @src1: The first input.
+ * @src2: The second input.
+ *
+ * Return:
+ * * true - @dst has at least one bit set following the operation
+ * * false - @dst is empty following the operation
+ *
+ * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
+ */
+__bpf_kfunc bool bpf_cpumask_and(struct bpf_cpumask *dst,
+ const struct cpumask *src1,
+ const struct cpumask *src2)
+{
+ return cpumask_and((struct cpumask *)dst, src1, src2);
+}
+
+/**
+ * bpf_cpumask_or() - OR two cpumasks and store the result.
+ * @dst: The BPF cpumask where the result is being stored.
+ * @src1: The first input.
+ * @src2: The second input.
+ *
+ * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
+ */
+__bpf_kfunc void bpf_cpumask_or(struct bpf_cpumask *dst,
+ const struct cpumask *src1,
+ const struct cpumask *src2)
+{
+ cpumask_or((struct cpumask *)dst, src1, src2);
+}
+
+/**
+ * bpf_cpumask_xor() - XOR two cpumasks and store the result.
+ * @dst: The BPF cpumask where the result is being stored.
+ * @src1: The first input.
+ * @src2: The second input.
+ *
+ * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
+ */
+__bpf_kfunc void bpf_cpumask_xor(struct bpf_cpumask *dst,
+ const struct cpumask *src1,
+ const struct cpumask *src2)
+{
+ cpumask_xor((struct cpumask *)dst, src1, src2);
+}
+
+/**
+ * bpf_cpumask_equal() - Check two cpumasks for equality.
+ * @src1: The first input.
+ * @src2: The second input.
+ *
+ * Return:
+ * * true - @src1 and @src2 have the same bits set.
+ * * false - @src1 and @src2 differ in at least one bit.
+ *
+ * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
+ */
+__bpf_kfunc bool bpf_cpumask_equal(const struct cpumask *src1, const struct cpumask *src2)
+{
+ return cpumask_equal(src1, src2);
+}
+
+/**
+ * bpf_cpumask_intersects() - Check two cpumasks for overlap.
+ * @src1: The first input.
+ * @src2: The second input.
+ *
+ * Return:
+ * * true - @src1 and @src2 have at least one of the same bits set.
+ * * false - @src1 and @src2 don't have any of the same bits set.
+ *
+ * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
+ */
+__bpf_kfunc bool bpf_cpumask_intersects(const struct cpumask *src1, const struct cpumask *src2)
+{
+ return cpumask_intersects(src1, src2);
+}
+
+/**
+ * bpf_cpumask_subset() - Check if a cpumask is a subset of another.
+ * @src1: The first cpumask being checked as a subset.
+ * @src2: The second cpumask being checked as a superset.
+ *
+ * Return:
+ * * true - All of the bits of @src1 are set in @src2.
+ * * false - At least one bit in @src1 is not set in @src2.
+ *
+ * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
+ */
+__bpf_kfunc bool bpf_cpumask_subset(const struct cpumask *src1, const struct cpumask *src2)
+{
+ return cpumask_subset(src1, src2);
+}
+
+/**
+ * bpf_cpumask_empty() - Check if a cpumask is empty.
+ * @cpumask: The cpumask being checked.
+ *
+ * Return:
+ * * true - None of the bits in @cpumask are set.
+ * * false - At least one bit in @cpumask is set.
+ *
+ * A struct bpf_cpumask pointer may be safely passed to @cpumask.
+ */
+__bpf_kfunc bool bpf_cpumask_empty(const struct cpumask *cpumask)
+{
+ return cpumask_empty(cpumask);
+}
+
+/**
+ * bpf_cpumask_full() - Check if a cpumask has all bits set.
+ * @cpumask: The cpumask being checked.
+ *
+ * Return:
+ * * true - All of the bits in @cpumask are set.
+ * * false - At least one bit in @cpumask is cleared.
+ *
+ * A struct bpf_cpumask pointer may be safely passed to @cpumask.
+ */
+__bpf_kfunc bool bpf_cpumask_full(const struct cpumask *cpumask)
+{
+ return cpumask_full(cpumask);
+}
+
+/**
+ * bpf_cpumask_copy() - Copy the contents of a cpumask into a BPF cpumask.
+ * @dst: The BPF cpumask being copied into.
+ * @src: The cpumask being copied.
+ *
+ * A struct bpf_cpumask pointer may be safely passed to @src.
+ */
+__bpf_kfunc void bpf_cpumask_copy(struct bpf_cpumask *dst, const struct cpumask *src)
+{
+ cpumask_copy((struct cpumask *)dst, src);
+}
+
+/**
+ * bpf_cpumask_any_distribute() - Return a random set CPU from a cpumask.
+ * @cpumask: The cpumask being queried.
+ *
+ * Return:
+ * * A random set bit within [0, num_cpus) if at least one bit is set.
+ * * >= num_cpus if no bit is set.
+ *
+ * A struct bpf_cpumask pointer may be safely passed to @src.
+ */
+__bpf_kfunc u32 bpf_cpumask_any_distribute(const struct cpumask *cpumask)
+{
+ return cpumask_any_distribute(cpumask);
+}
+
+/**
+ * bpf_cpumask_any_and_distribute() - Return a random set CPU from the AND of
+ * two cpumasks.
+ * @src1: The first cpumask.
+ * @src2: The second cpumask.
+ *
+ * Return:
+ * * A random set bit within [0, num_cpus) from the AND of two cpumasks, if at
+ * least one bit is set.
+ * * >= num_cpus if no bit is set.
+ *
+ * struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
+ */
+__bpf_kfunc u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1,
+ const struct cpumask *src2)
+{
+ return cpumask_any_and_distribute(src1, src2);
+}
+
+/**
+ * bpf_cpumask_weight() - Return the number of bits in @cpumask.
+ * @cpumask: The cpumask being queried.
+ *
+ * Count the number of set bits in the given cpumask.
+ *
+ * Return:
+ * * The number of bits set in the mask.
+ */
+__bpf_kfunc u32 bpf_cpumask_weight(const struct cpumask *cpumask)
+{
+ return cpumask_weight(cpumask);
+}
+
+/**
+ * bpf_cpumask_populate() - Populate the CPU mask from the contents of
+ * a BPF memory region.
+ *
+ * @cpumask: The cpumask being populated.
+ * @src: The BPF memory holding the bit pattern.
+ * @src__sz: Length of the BPF memory region in bytes.
+ *
+ * Return:
+ * * 0 if the struct cpumask * instance was populated successfully.
+ * * -EACCES if the memory region is too small to populate the cpumask.
+ * * -EINVAL if the memory region is not aligned to the size of a long
+ * and the architecture does not support efficient unaligned accesses.
+ */
+__bpf_kfunc int bpf_cpumask_populate(struct cpumask *cpumask, void *src, size_t src__sz)
+{
+ unsigned long source = (unsigned long)src;
+
+ /* The memory region must be large enough to populate the entire CPU mask. */
+ if (src__sz < bitmap_size(nr_cpu_ids))
+ return -EACCES;
+
+ /* If avoiding unaligned accesses, the input region must be aligned to the nearest long. */
+ if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
+ !IS_ALIGNED(source, sizeof(long)))
+ return -EINVAL;
+
+ bitmap_copy(cpumask_bits(cpumask), src, nr_cpu_ids);
+
+ return 0;
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(cpumask_kfunc_btf_ids)
+BTF_ID_FLAGS(func, bpf_cpumask_create, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_cpumask_release, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_cpumask_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_first, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_first_zero, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_first_and, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_set_cpu, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_clear_cpu, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_test_cpu, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_test_and_set_cpu, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_test_and_clear_cpu, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_setall, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_clear, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_and, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_or, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_xor, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_equal, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_intersects, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_subset, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_empty, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_full, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_any_distribute, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_any_and_distribute, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_weight, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_populate, KF_RCU)
+BTF_KFUNCS_END(cpumask_kfunc_btf_ids)
+
+static const struct btf_kfunc_id_set cpumask_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &cpumask_kfunc_btf_ids,
+};
+
+BTF_ID_LIST(cpumask_dtor_ids)
+BTF_ID(struct, bpf_cpumask)
+BTF_ID(func, bpf_cpumask_release_dtor)
+
+static int __init cpumask_kfunc_init(void)
+{
+ int ret;
+ const struct btf_id_dtor_kfunc cpumask_dtors[] = {
+ {
+ .btf_id = cpumask_dtor_ids[0],
+ .kfunc_btf_id = cpumask_dtor_ids[1]
+ },
+ };
+
+ ret = bpf_mem_alloc_init(&bpf_cpumask_ma, sizeof(struct bpf_cpumask), false);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &cpumask_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &cpumask_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &cpumask_kfunc_set);
+ return ret ?: register_btf_id_dtor_kfuncs(cpumask_dtors,
+ ARRAY_SIZE(cpumask_dtors),
+ THIS_MODULE);
+}
+
+late_initcall(cpumask_kfunc_init);
diff --git a/kernel/bpf/crypto.c b/kernel/bpf/crypto.c
new file mode 100644
index 000000000000..83c4d9943084
--- /dev/null
+++ b/kernel/bpf/crypto.c
@@ -0,0 +1,393 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2024 Meta, Inc */
+#include <linux/bpf.h>
+#include <linux/bpf_crypto.h>
+#include <linux/bpf_mem_alloc.h>
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
+#include <linux/filter.h>
+#include <linux/scatterlist.h>
+#include <linux/skbuff.h>
+#include <crypto/skcipher.h>
+
+struct bpf_crypto_type_list {
+ const struct bpf_crypto_type *type;
+ struct list_head list;
+};
+
+/* BPF crypto initialization parameters struct */
+/**
+ * struct bpf_crypto_params - BPF crypto initialization parameters structure
+ * @type: The string of crypto operation type.
+ * @reserved: Reserved member, will be reused for more options in future
+ * Values:
+ * 0
+ * @algo: The string of algorithm to initialize.
+ * @key: The cipher key used to init crypto algorithm.
+ * @key_len: The length of cipher key.
+ * @authsize: The length of authentication tag used by algorithm.
+ */
+struct bpf_crypto_params {
+ char type[14];
+ u8 reserved[2];
+ char algo[128];
+ u8 key[256];
+ u32 key_len;
+ u32 authsize;
+};
+
+static LIST_HEAD(bpf_crypto_types);
+static DECLARE_RWSEM(bpf_crypto_types_sem);
+
+/**
+ * struct bpf_crypto_ctx - refcounted BPF crypto context structure
+ * @type: The pointer to bpf crypto type
+ * @tfm: The pointer to instance of crypto API struct.
+ * @siv_len: Size of IV and state storage for cipher
+ * @rcu: The RCU head used to free the crypto context with RCU safety.
+ * @usage: Object reference counter. When the refcount goes to 0, the
+ * memory is released back to the BPF allocator, which provides
+ * RCU safety.
+ */
+struct bpf_crypto_ctx {
+ const struct bpf_crypto_type *type;
+ void *tfm;
+ u32 siv_len;
+ struct rcu_head rcu;
+ refcount_t usage;
+};
+
+int bpf_crypto_register_type(const struct bpf_crypto_type *type)
+{
+ struct bpf_crypto_type_list *node;
+ int err = -EEXIST;
+
+ down_write(&bpf_crypto_types_sem);
+ list_for_each_entry(node, &bpf_crypto_types, list) {
+ if (!strcmp(node->type->name, type->name))
+ goto unlock;
+ }
+
+ node = kmalloc(sizeof(*node), GFP_KERNEL);
+ err = -ENOMEM;
+ if (!node)
+ goto unlock;
+
+ node->type = type;
+ list_add(&node->list, &bpf_crypto_types);
+ err = 0;
+
+unlock:
+ up_write(&bpf_crypto_types_sem);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(bpf_crypto_register_type);
+
+int bpf_crypto_unregister_type(const struct bpf_crypto_type *type)
+{
+ struct bpf_crypto_type_list *node;
+ int err = -ENOENT;
+
+ down_write(&bpf_crypto_types_sem);
+ list_for_each_entry(node, &bpf_crypto_types, list) {
+ if (strcmp(node->type->name, type->name))
+ continue;
+
+ list_del(&node->list);
+ kfree(node);
+ err = 0;
+ break;
+ }
+ up_write(&bpf_crypto_types_sem);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(bpf_crypto_unregister_type);
+
+static const struct bpf_crypto_type *bpf_crypto_get_type(const char *name)
+{
+ const struct bpf_crypto_type *type = ERR_PTR(-ENOENT);
+ struct bpf_crypto_type_list *node;
+
+ down_read(&bpf_crypto_types_sem);
+ list_for_each_entry(node, &bpf_crypto_types, list) {
+ if (strcmp(node->type->name, name))
+ continue;
+
+ if (try_module_get(node->type->owner))
+ type = node->type;
+ break;
+ }
+ up_read(&bpf_crypto_types_sem);
+
+ return type;
+}
+
+__bpf_kfunc_start_defs();
+
+/**
+ * bpf_crypto_ctx_create() - Create a mutable BPF crypto context.
+ *
+ * Allocates a crypto context that can be used, acquired, and released by
+ * a BPF program. The crypto context returned by this function must either
+ * be embedded in a map as a kptr, or freed with bpf_crypto_ctx_release().
+ * As crypto API functions use GFP_KERNEL allocations, this function can
+ * only be used in sleepable BPF programs.
+ *
+ * bpf_crypto_ctx_create() allocates memory for crypto context.
+ * It may return NULL if no memory is available.
+ * @params: pointer to struct bpf_crypto_params which contains all the
+ * details needed to initialise crypto context.
+ * @params__sz: size of steuct bpf_crypto_params usef by bpf program
+ * @err: integer to store error code when NULL is returned.
+ */
+__bpf_kfunc struct bpf_crypto_ctx *
+bpf_crypto_ctx_create(const struct bpf_crypto_params *params, u32 params__sz,
+ int *err)
+{
+ const struct bpf_crypto_type *type;
+ struct bpf_crypto_ctx *ctx;
+
+ if (!params || params->reserved[0] || params->reserved[1] ||
+ params__sz != sizeof(struct bpf_crypto_params)) {
+ *err = -EINVAL;
+ return NULL;
+ }
+
+ type = bpf_crypto_get_type(params->type);
+ if (IS_ERR(type)) {
+ *err = PTR_ERR(type);
+ return NULL;
+ }
+
+ if (!type->has_algo(params->algo)) {
+ *err = -EOPNOTSUPP;
+ goto err_module_put;
+ }
+
+ if (!!params->authsize ^ !!type->setauthsize) {
+ *err = -EOPNOTSUPP;
+ goto err_module_put;
+ }
+
+ if (!params->key_len || params->key_len > sizeof(params->key)) {
+ *err = -EINVAL;
+ goto err_module_put;
+ }
+
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx) {
+ *err = -ENOMEM;
+ goto err_module_put;
+ }
+
+ ctx->type = type;
+ ctx->tfm = type->alloc_tfm(params->algo);
+ if (IS_ERR(ctx->tfm)) {
+ *err = PTR_ERR(ctx->tfm);
+ goto err_free_ctx;
+ }
+
+ if (params->authsize) {
+ *err = type->setauthsize(ctx->tfm, params->authsize);
+ if (*err)
+ goto err_free_tfm;
+ }
+
+ *err = type->setkey(ctx->tfm, params->key, params->key_len);
+ if (*err)
+ goto err_free_tfm;
+
+ if (type->get_flags(ctx->tfm) & CRYPTO_TFM_NEED_KEY) {
+ *err = -EINVAL;
+ goto err_free_tfm;
+ }
+
+ ctx->siv_len = type->ivsize(ctx->tfm) + type->statesize(ctx->tfm);
+
+ refcount_set(&ctx->usage, 1);
+
+ return ctx;
+
+err_free_tfm:
+ type->free_tfm(ctx->tfm);
+err_free_ctx:
+ kfree(ctx);
+err_module_put:
+ module_put(type->owner);
+
+ return NULL;
+}
+
+static void crypto_free_cb(struct rcu_head *head)
+{
+ struct bpf_crypto_ctx *ctx;
+
+ ctx = container_of(head, struct bpf_crypto_ctx, rcu);
+ ctx->type->free_tfm(ctx->tfm);
+ module_put(ctx->type->owner);
+ kfree(ctx);
+}
+
+/**
+ * bpf_crypto_ctx_acquire() - Acquire a reference to a BPF crypto context.
+ * @ctx: The BPF crypto context being acquired. The ctx must be a trusted
+ * pointer.
+ *
+ * Acquires a reference to a BPF crypto context. The context returned by this function
+ * must either be embedded in a map as a kptr, or freed with
+ * bpf_crypto_ctx_release().
+ */
+__bpf_kfunc struct bpf_crypto_ctx *
+bpf_crypto_ctx_acquire(struct bpf_crypto_ctx *ctx)
+{
+ if (!refcount_inc_not_zero(&ctx->usage))
+ return NULL;
+ return ctx;
+}
+
+/**
+ * bpf_crypto_ctx_release() - Release a previously acquired BPF crypto context.
+ * @ctx: The crypto context being released.
+ *
+ * Releases a previously acquired reference to a BPF crypto context. When the final
+ * reference of the BPF crypto context has been released, its memory
+ * will be released.
+ */
+__bpf_kfunc void bpf_crypto_ctx_release(struct bpf_crypto_ctx *ctx)
+{
+ if (refcount_dec_and_test(&ctx->usage))
+ call_rcu(&ctx->rcu, crypto_free_cb);
+}
+
+static int bpf_crypto_crypt(const struct bpf_crypto_ctx *ctx,
+ const struct bpf_dynptr_kern *src,
+ const struct bpf_dynptr_kern *dst,
+ const struct bpf_dynptr_kern *siv,
+ bool decrypt)
+{
+ u32 src_len, dst_len, siv_len;
+ const u8 *psrc;
+ u8 *pdst, *piv;
+ int err;
+
+ if (__bpf_dynptr_is_rdonly(dst))
+ return -EINVAL;
+
+ siv_len = siv ? __bpf_dynptr_size(siv) : 0;
+ src_len = __bpf_dynptr_size(src);
+ dst_len = __bpf_dynptr_size(dst);
+ if (!src_len || !dst_len || src_len > dst_len)
+ return -EINVAL;
+
+ if (siv_len != ctx->siv_len)
+ return -EINVAL;
+
+ psrc = __bpf_dynptr_data(src, src_len);
+ if (!psrc)
+ return -EINVAL;
+ pdst = __bpf_dynptr_data_rw(dst, dst_len);
+ if (!pdst)
+ return -EINVAL;
+
+ piv = siv_len ? __bpf_dynptr_data_rw(siv, siv_len) : NULL;
+ if (siv_len && !piv)
+ return -EINVAL;
+
+ err = decrypt ? ctx->type->decrypt(ctx->tfm, psrc, pdst, src_len, piv)
+ : ctx->type->encrypt(ctx->tfm, psrc, pdst, src_len, piv);
+
+ return err;
+}
+
+/**
+ * bpf_crypto_decrypt() - Decrypt buffer using configured context and IV provided.
+ * @ctx: The crypto context being used. The ctx must be a trusted pointer.
+ * @src: bpf_dynptr to the encrypted data. Must be a trusted pointer.
+ * @dst: bpf_dynptr to the buffer where to store the result. Must be a trusted pointer.
+ * @siv__nullable: bpf_dynptr to IV data and state data to be used by decryptor. May be NULL.
+ *
+ * Decrypts provided buffer using IV data and the crypto context. Crypto context must be configured.
+ */
+__bpf_kfunc int bpf_crypto_decrypt(struct bpf_crypto_ctx *ctx,
+ const struct bpf_dynptr *src,
+ const struct bpf_dynptr *dst,
+ const struct bpf_dynptr *siv__nullable)
+{
+ const struct bpf_dynptr_kern *src_kern = (struct bpf_dynptr_kern *)src;
+ const struct bpf_dynptr_kern *dst_kern = (struct bpf_dynptr_kern *)dst;
+ const struct bpf_dynptr_kern *siv_kern = (struct bpf_dynptr_kern *)siv__nullable;
+
+ return bpf_crypto_crypt(ctx, src_kern, dst_kern, siv_kern, true);
+}
+
+/**
+ * bpf_crypto_encrypt() - Encrypt buffer using configured context and IV provided.
+ * @ctx: The crypto context being used. The ctx must be a trusted pointer.
+ * @src: bpf_dynptr to the plain data. Must be a trusted pointer.
+ * @dst: bpf_dynptr to the buffer where to store the result. Must be a trusted pointer.
+ * @siv__nullable: bpf_dynptr to IV data and state data to be used by decryptor. May be NULL.
+ *
+ * Encrypts provided buffer using IV data and the crypto context. Crypto context must be configured.
+ */
+__bpf_kfunc int bpf_crypto_encrypt(struct bpf_crypto_ctx *ctx,
+ const struct bpf_dynptr *src,
+ const struct bpf_dynptr *dst,
+ const struct bpf_dynptr *siv__nullable)
+{
+ const struct bpf_dynptr_kern *src_kern = (struct bpf_dynptr_kern *)src;
+ const struct bpf_dynptr_kern *dst_kern = (struct bpf_dynptr_kern *)dst;
+ const struct bpf_dynptr_kern *siv_kern = (struct bpf_dynptr_kern *)siv__nullable;
+
+ return bpf_crypto_crypt(ctx, src_kern, dst_kern, siv_kern, false);
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(crypt_init_kfunc_btf_ids)
+BTF_ID_FLAGS(func, bpf_crypto_ctx_create, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_crypto_ctx_release, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_crypto_ctx_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
+BTF_KFUNCS_END(crypt_init_kfunc_btf_ids)
+
+static const struct btf_kfunc_id_set crypt_init_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &crypt_init_kfunc_btf_ids,
+};
+
+BTF_KFUNCS_START(crypt_kfunc_btf_ids)
+BTF_ID_FLAGS(func, bpf_crypto_decrypt, KF_RCU)
+BTF_ID_FLAGS(func, bpf_crypto_encrypt, KF_RCU)
+BTF_KFUNCS_END(crypt_kfunc_btf_ids)
+
+static const struct btf_kfunc_id_set crypt_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &crypt_kfunc_btf_ids,
+};
+
+BTF_ID_LIST(bpf_crypto_dtor_ids)
+BTF_ID(struct, bpf_crypto_ctx)
+BTF_ID(func, bpf_crypto_ctx_release)
+
+static int __init crypto_kfunc_init(void)
+{
+ int ret;
+ const struct btf_id_dtor_kfunc bpf_crypto_dtors[] = {
+ {
+ .btf_id = bpf_crypto_dtor_ids[0],
+ .kfunc_btf_id = bpf_crypto_dtor_ids[1]
+ },
+ };
+
+ ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &crypt_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_ACT, &crypt_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &crypt_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL,
+ &crypt_init_kfunc_set);
+ return ret ?: register_btf_id_dtor_kfuncs(bpf_crypto_dtors,
+ ARRAY_SIZE(bpf_crypto_dtors),
+ THIS_MODULE);
+}
+
+late_initcall(crypto_kfunc_init);
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
new file mode 100644
index 000000000000..2625601de76e
--- /dev/null
+++ b/kernel/bpf/devmap.c
@@ -0,0 +1,1170 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
+ */
+
+/* Devmaps primary use is as a backend map for XDP BPF helper call
+ * bpf_redirect_map(). Because XDP is mostly concerned with performance we
+ * spent some effort to ensure the datapath with redirect maps does not use
+ * any locking. This is a quick note on the details.
+ *
+ * We have three possible paths to get into the devmap control plane bpf
+ * syscalls, bpf programs, and driver side xmit/flush operations. A bpf syscall
+ * will invoke an update, delete, or lookup operation. To ensure updates and
+ * deletes appear atomic from the datapath side xchg() is used to modify the
+ * netdev_map array. Then because the datapath does a lookup into the netdev_map
+ * array (read-only) from an RCU critical section we use call_rcu() to wait for
+ * an rcu grace period before free'ing the old data structures. This ensures the
+ * datapath always has a valid copy. However, the datapath does a "flush"
+ * operation that pushes any pending packets in the driver outside the RCU
+ * critical section. Each bpf_dtab_netdev tracks these pending operations using
+ * a per-cpu flush list. The bpf_dtab_netdev object will not be destroyed until
+ * this list is empty, indicating outstanding flush operations have completed.
+ *
+ * BPF syscalls may race with BPF program calls on any of the update, delete
+ * or lookup operations. As noted above the xchg() operation also keep the
+ * netdev_map consistent in this case. From the devmap side BPF programs
+ * calling into these operations are the same as multiple user space threads
+ * making system calls.
+ *
+ * Finally, any of the above may race with a netdev_unregister notifier. The
+ * unregister notifier must search for net devices in the map structure that
+ * contain a reference to the net device and remove them. This is a two step
+ * process (a) dereference the bpf_dtab_netdev object in netdev_map and (b)
+ * check to see if the ifindex is the same as the net_device being removed.
+ * When removing the dev a cmpxchg() is used to ensure the correct dev is
+ * removed, in the case of a concurrent update or delete operation it is
+ * possible that the initially referenced dev is no longer in the map. As the
+ * notifier hook walks the map we know that new dev references can not be
+ * added by the user because core infrastructure ensures dev_get_by_index()
+ * calls will fail at this point.
+ *
+ * The devmap_hash type is a map type which interprets keys as ifindexes and
+ * indexes these using a hashmap. This allows maps that use ifindex as key to be
+ * densely packed instead of having holes in the lookup array for unused
+ * ifindexes. The setup and packet enqueue/send code is shared between the two
+ * types of devmap; only the lookup and insertion is different.
+ */
+#include <linux/bpf.h>
+#include <net/xdp.h>
+#include <linux/filter.h>
+#include <trace/events/xdp.h>
+#include <linux/btf_ids.h>
+
+#define DEV_CREATE_FLAG_MASK \
+ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
+struct xdp_dev_bulk_queue {
+ struct xdp_frame *q[DEV_MAP_BULK_SIZE];
+ struct list_head flush_node;
+ struct net_device *dev;
+ struct net_device *dev_rx;
+ struct bpf_prog *xdp_prog;
+ unsigned int count;
+};
+
+struct bpf_dtab_netdev {
+ struct net_device *dev; /* must be first member, due to tracepoint */
+ struct hlist_node index_hlist;
+ struct bpf_prog *xdp_prog;
+ struct rcu_head rcu;
+ unsigned int idx;
+ struct bpf_devmap_val val;
+};
+
+struct bpf_dtab {
+ struct bpf_map map;
+ struct bpf_dtab_netdev __rcu **netdev_map; /* DEVMAP type only */
+ struct list_head list;
+
+ /* these are only used for DEVMAP_HASH type maps */
+ struct hlist_head *dev_index_head;
+ spinlock_t index_lock;
+ unsigned int items;
+ u32 n_buckets;
+};
+
+static DEFINE_SPINLOCK(dev_map_lock);
+static LIST_HEAD(dev_map_list);
+
+static struct hlist_head *dev_map_create_hash(unsigned int entries,
+ int numa_node)
+{
+ int i;
+ struct hlist_head *hash;
+
+ hash = bpf_map_area_alloc((u64) entries * sizeof(*hash), numa_node);
+ if (hash != NULL)
+ for (i = 0; i < entries; i++)
+ INIT_HLIST_HEAD(&hash[i]);
+
+ return hash;
+}
+
+static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab,
+ int idx)
+{
+ return &dtab->dev_index_head[idx & (dtab->n_buckets - 1)];
+}
+
+static int dev_map_alloc_check(union bpf_attr *attr)
+{
+ u32 valsize = attr->value_size;
+
+ /* check sanity of attributes. 2 value sizes supported:
+ * 4 bytes: ifindex
+ * 8 bytes: ifindex + prog fd
+ */
+ if (attr->max_entries == 0 || attr->key_size != 4 ||
+ (valsize != offsetofend(struct bpf_devmap_val, ifindex) &&
+ valsize != offsetofend(struct bpf_devmap_val, bpf_prog.fd)) ||
+ attr->map_flags & ~DEV_CREATE_FLAG_MASK)
+ return -EINVAL;
+
+ if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
+ /* Hash table size must be power of 2; roundup_pow_of_two()
+ * can overflow into UB on 32-bit arches
+ */
+ if (attr->max_entries > 1UL << 31)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
+{
+ /* Lookup returns a pointer straight to dev->ifindex, so make sure the
+ * verifier prevents writes from the BPF side
+ */
+ attr->map_flags |= BPF_F_RDONLY_PROG;
+ bpf_map_init_from_attr(&dtab->map, attr);
+
+ if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
+ /* Hash table size must be power of 2 */
+ dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries);
+ dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets,
+ dtab->map.numa_node);
+ if (!dtab->dev_index_head)
+ return -ENOMEM;
+
+ spin_lock_init(&dtab->index_lock);
+ } else {
+ dtab->netdev_map = bpf_map_area_alloc((u64) dtab->map.max_entries *
+ sizeof(struct bpf_dtab_netdev *),
+ dtab->map.numa_node);
+ if (!dtab->netdev_map)
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_dtab *dtab;
+ int err;
+
+ dtab = bpf_map_area_alloc(sizeof(*dtab), NUMA_NO_NODE);
+ if (!dtab)
+ return ERR_PTR(-ENOMEM);
+
+ err = dev_map_init_map(dtab, attr);
+ if (err) {
+ bpf_map_area_free(dtab);
+ return ERR_PTR(err);
+ }
+
+ spin_lock(&dev_map_lock);
+ list_add_tail_rcu(&dtab->list, &dev_map_list);
+ spin_unlock(&dev_map_lock);
+
+ return &dtab->map;
+}
+
+static void dev_map_free(struct bpf_map *map)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ u32 i;
+
+ /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
+ * so the programs (can be more than one that used this map) were
+ * disconnected from events. The following synchronize_rcu() guarantees
+ * both rcu read critical sections complete and waits for
+ * preempt-disable regions (NAPI being the relevant context here) so we
+ * are certain there will be no further reads against the netdev_map and
+ * all flush operations are complete. Flush operations can only be done
+ * from NAPI context for this reason.
+ */
+
+ spin_lock(&dev_map_lock);
+ list_del_rcu(&dtab->list);
+ spin_unlock(&dev_map_lock);
+
+ /* bpf_redirect_info->map is assigned in __bpf_xdp_redirect_map()
+ * during NAPI callback and cleared after the XDP redirect. There is no
+ * explicit RCU read section which protects bpf_redirect_info->map but
+ * local_bh_disable() also marks the beginning an RCU section. This
+ * makes the complete softirq callback RCU protected. Thus after
+ * following synchronize_rcu() there no bpf_redirect_info->map == map
+ * assignment.
+ */
+ synchronize_rcu();
+
+ /* Make sure prior __dev_map_entry_free() have completed. */
+ rcu_barrier();
+
+ if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
+ for (i = 0; i < dtab->n_buckets; i++) {
+ struct bpf_dtab_netdev *dev;
+ struct hlist_head *head;
+ struct hlist_node *next;
+
+ head = dev_map_index_hash(dtab, i);
+
+ hlist_for_each_entry_safe(dev, next, head, index_hlist) {
+ hlist_del_rcu(&dev->index_hlist);
+ if (dev->xdp_prog)
+ bpf_prog_put(dev->xdp_prog);
+ dev_put(dev->dev);
+ kfree(dev);
+ }
+ }
+
+ bpf_map_area_free(dtab->dev_index_head);
+ } else {
+ for (i = 0; i < dtab->map.max_entries; i++) {
+ struct bpf_dtab_netdev *dev;
+
+ dev = rcu_dereference_raw(dtab->netdev_map[i]);
+ if (!dev)
+ continue;
+
+ if (dev->xdp_prog)
+ bpf_prog_put(dev->xdp_prog);
+ dev_put(dev->dev);
+ kfree(dev);
+ }
+
+ bpf_map_area_free(dtab->netdev_map);
+ }
+
+ bpf_map_area_free(dtab);
+}
+
+static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ u32 index = key ? *(u32 *)key : U32_MAX;
+ u32 *next = next_key;
+
+ if (index >= dtab->map.max_entries) {
+ *next = 0;
+ return 0;
+ }
+
+ if (index == dtab->map.max_entries - 1)
+ return -ENOENT;
+ *next = index + 1;
+ return 0;
+}
+
+/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or
+ * by local_bh_disable() (from XDP calls inside NAPI). The
+ * rcu_read_lock_bh_held() below makes lockdep accept both.
+ */
+static void *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ struct hlist_head *head = dev_map_index_hash(dtab, key);
+ struct bpf_dtab_netdev *dev;
+
+ hlist_for_each_entry_rcu(dev, head, index_hlist,
+ lockdep_is_held(&dtab->index_lock))
+ if (dev->idx == key)
+ return dev;
+
+ return NULL;
+}
+
+static int dev_map_hash_get_next_key(struct bpf_map *map, void *key,
+ void *next_key)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ u32 idx, *next = next_key;
+ struct bpf_dtab_netdev *dev, *next_dev;
+ struct hlist_head *head;
+ int i = 0;
+
+ if (!key)
+ goto find_first;
+
+ idx = *(u32 *)key;
+
+ dev = __dev_map_hash_lookup_elem(map, idx);
+ if (!dev)
+ goto find_first;
+
+ next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&dev->index_hlist)),
+ struct bpf_dtab_netdev, index_hlist);
+
+ if (next_dev) {
+ *next = next_dev->idx;
+ return 0;
+ }
+
+ i = idx & (dtab->n_buckets - 1);
+ i++;
+
+ find_first:
+ for (; i < dtab->n_buckets; i++) {
+ head = dev_map_index_hash(dtab, i);
+
+ next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
+ struct bpf_dtab_netdev,
+ index_hlist);
+ if (next_dev) {
+ *next = next_dev->idx;
+ return 0;
+ }
+ }
+
+ return -ENOENT;
+}
+
+static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
+ struct xdp_frame **frames, int n,
+ struct net_device *tx_dev,
+ struct net_device *rx_dev)
+{
+ struct xdp_txq_info txq = { .dev = tx_dev };
+ struct xdp_rxq_info rxq = { .dev = rx_dev };
+ struct xdp_buff xdp;
+ int i, nframes = 0;
+
+ for (i = 0; i < n; i++) {
+ struct xdp_frame *xdpf = frames[i];
+ u32 act;
+ int err;
+
+ xdp_convert_frame_to_buff(xdpf, &xdp);
+ xdp.txq = &txq;
+ xdp.rxq = &rxq;
+
+ act = bpf_prog_run_xdp(xdp_prog, &xdp);
+ switch (act) {
+ case XDP_PASS:
+ err = xdp_update_frame_from_buff(&xdp, xdpf);
+ if (unlikely(err < 0))
+ xdp_return_frame_rx_napi(xdpf);
+ else
+ frames[nframes++] = xdpf;
+ break;
+ default:
+ bpf_warn_invalid_xdp_action(NULL, xdp_prog, act);
+ fallthrough;
+ case XDP_ABORTED:
+ trace_xdp_exception(tx_dev, xdp_prog, act);
+ fallthrough;
+ case XDP_DROP:
+ xdp_return_frame_rx_napi(xdpf);
+ break;
+ }
+ }
+ return nframes; /* sent frames count */
+}
+
+static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
+{
+ struct net_device *dev = bq->dev;
+ unsigned int cnt = bq->count;
+ int sent = 0, err = 0;
+ int to_send = cnt;
+ int i;
+
+ if (unlikely(!cnt))
+ return;
+
+ for (i = 0; i < cnt; i++) {
+ struct xdp_frame *xdpf = bq->q[i];
+
+ prefetch(xdpf);
+ }
+
+ if (bq->xdp_prog) {
+ to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev, bq->dev_rx);
+ if (!to_send)
+ goto out;
+ }
+
+ sent = dev->netdev_ops->ndo_xdp_xmit(dev, to_send, bq->q, flags);
+ if (sent < 0) {
+ /* If ndo_xdp_xmit fails with an errno, no frames have
+ * been xmit'ed.
+ */
+ err = sent;
+ sent = 0;
+ }
+
+ /* If not all frames have been transmitted, it is our
+ * responsibility to free them
+ */
+ for (i = sent; unlikely(i < to_send); i++)
+ xdp_return_frame_rx_napi(bq->q[i]);
+
+out:
+ bq->count = 0;
+ trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, cnt - sent, err);
+}
+
+/* __dev_flush is called from xdp_do_flush() which _must_ be signalled from the
+ * driver before returning from its napi->poll() routine. See the comment above
+ * xdp_do_flush() in filter.c.
+ */
+void __dev_flush(struct list_head *flush_list)
+{
+ struct xdp_dev_bulk_queue *bq, *tmp;
+
+ list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
+ bq_xmit_all(bq, XDP_XMIT_FLUSH);
+ bq->dev_rx = NULL;
+ bq->xdp_prog = NULL;
+ __list_del_clearprev(&bq->flush_node);
+ }
+}
+
+/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or
+ * by local_bh_disable() (from XDP calls inside NAPI). The
+ * rcu_read_lock_bh_held() below makes lockdep accept both.
+ */
+static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ struct bpf_dtab_netdev *obj;
+
+ if (key >= map->max_entries)
+ return NULL;
+
+ obj = rcu_dereference_check(dtab->netdev_map[key],
+ rcu_read_lock_bh_held());
+ return obj;
+}
+
+/* Runs in NAPI, i.e., softirq under local_bh_disable(). Thus, safe percpu
+ * variable access, and map elements stick around. See comment above
+ * xdp_do_flush() in filter.c.
+ */
+static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
+ struct net_device *dev_rx, struct bpf_prog *xdp_prog)
+{
+ struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq);
+
+ if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
+ bq_xmit_all(bq, 0);
+
+ /* Ingress dev_rx will be the same for all xdp_frame's in
+ * bulk_queue, because bq stored per-CPU and must be flushed
+ * from net_device drivers NAPI func end.
+ *
+ * Do the same with xdp_prog and flush_list since these fields
+ * are only ever modified together.
+ */
+ if (!bq->dev_rx) {
+ struct list_head *flush_list = bpf_net_ctx_get_dev_flush_list();
+
+ bq->dev_rx = dev_rx;
+ bq->xdp_prog = xdp_prog;
+ list_add(&bq->flush_node, flush_list);
+ }
+
+ bq->q[bq->count++] = xdpf;
+}
+
+static inline int __xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
+ struct net_device *dev_rx,
+ struct bpf_prog *xdp_prog)
+{
+ int err;
+
+ if (!(dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT))
+ return -EOPNOTSUPP;
+
+ if (unlikely(!(dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT_SG) &&
+ xdp_frame_has_frags(xdpf)))
+ return -EOPNOTSUPP;
+
+ err = xdp_ok_fwd_dev(dev, xdp_get_frame_len(xdpf));
+ if (unlikely(err))
+ return err;
+
+ bq_enqueue(dev, xdpf, dev_rx, xdp_prog);
+ return 0;
+}
+
+static u32 dev_map_bpf_prog_run_skb(struct sk_buff *skb, struct bpf_dtab_netdev *dst)
+{
+ struct xdp_txq_info txq = { .dev = dst->dev };
+ struct xdp_buff xdp;
+ u32 act;
+
+ if (!dst->xdp_prog)
+ return XDP_PASS;
+
+ __skb_pull(skb, skb->mac_len);
+ xdp.txq = &txq;
+
+ act = bpf_prog_run_generic_xdp(skb, &xdp, dst->xdp_prog);
+ switch (act) {
+ case XDP_PASS:
+ __skb_push(skb, skb->mac_len);
+ break;
+ default:
+ bpf_warn_invalid_xdp_action(NULL, dst->xdp_prog, act);
+ fallthrough;
+ case XDP_ABORTED:
+ trace_xdp_exception(dst->dev, dst->xdp_prog, act);
+ fallthrough;
+ case XDP_DROP:
+ kfree_skb(skb);
+ break;
+ }
+
+ return act;
+}
+
+int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
+ struct net_device *dev_rx)
+{
+ return __xdp_enqueue(dev, xdpf, dev_rx, NULL);
+}
+
+int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf,
+ struct net_device *dev_rx)
+{
+ struct net_device *dev = dst->dev;
+
+ return __xdp_enqueue(dev, xdpf, dev_rx, dst->xdp_prog);
+}
+
+static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf)
+{
+ if (!obj)
+ return false;
+
+ if (!(obj->dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT))
+ return false;
+
+ if (unlikely(!(obj->dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT_SG) &&
+ xdp_frame_has_frags(xdpf)))
+ return false;
+
+ if (xdp_ok_fwd_dev(obj->dev, xdp_get_frame_len(xdpf)))
+ return false;
+
+ return true;
+}
+
+static int dev_map_enqueue_clone(struct bpf_dtab_netdev *obj,
+ struct net_device *dev_rx,
+ struct xdp_frame *xdpf)
+{
+ struct xdp_frame *nxdpf;
+
+ nxdpf = xdpf_clone(xdpf);
+ if (!nxdpf)
+ return -ENOMEM;
+
+ bq_enqueue(obj->dev, nxdpf, dev_rx, obj->xdp_prog);
+
+ return 0;
+}
+
+static inline bool is_ifindex_excluded(int *excluded, int num_excluded, int ifindex)
+{
+ while (num_excluded--) {
+ if (ifindex == excluded[num_excluded])
+ return true;
+ }
+ return false;
+}
+
+/* Get ifindex of each upper device. 'indexes' must be able to hold at
+ * least MAX_NEST_DEV elements.
+ * Returns the number of ifindexes added.
+ */
+static int get_upper_ifindexes(struct net_device *dev, int *indexes)
+{
+ struct net_device *upper;
+ struct list_head *iter;
+ int n = 0;
+
+ netdev_for_each_upper_dev_rcu(dev, upper, iter) {
+ indexes[n++] = upper->ifindex;
+ }
+ return n;
+}
+
+int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx,
+ struct bpf_map *map, bool exclude_ingress)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ struct bpf_dtab_netdev *dst, *last_dst = NULL;
+ int excluded_devices[1+MAX_NEST_DEV];
+ struct hlist_head *head;
+ int num_excluded = 0;
+ unsigned int i;
+ int err;
+
+ if (exclude_ingress) {
+ num_excluded = get_upper_ifindexes(dev_rx, excluded_devices);
+ excluded_devices[num_excluded++] = dev_rx->ifindex;
+ }
+
+ if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
+ for (i = 0; i < map->max_entries; i++) {
+ dst = rcu_dereference_check(dtab->netdev_map[i],
+ rcu_read_lock_bh_held());
+ if (!is_valid_dst(dst, xdpf))
+ continue;
+
+ if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex))
+ continue;
+
+ /* we only need n-1 clones; last_dst enqueued below */
+ if (!last_dst) {
+ last_dst = dst;
+ continue;
+ }
+
+ err = dev_map_enqueue_clone(last_dst, dev_rx, xdpf);
+ if (err)
+ return err;
+
+ last_dst = dst;
+ }
+ } else { /* BPF_MAP_TYPE_DEVMAP_HASH */
+ for (i = 0; i < dtab->n_buckets; i++) {
+ head = dev_map_index_hash(dtab, i);
+ hlist_for_each_entry_rcu(dst, head, index_hlist,
+ lockdep_is_held(&dtab->index_lock)) {
+ if (!is_valid_dst(dst, xdpf))
+ continue;
+
+ if (is_ifindex_excluded(excluded_devices, num_excluded,
+ dst->dev->ifindex))
+ continue;
+
+ /* we only need n-1 clones; last_dst enqueued below */
+ if (!last_dst) {
+ last_dst = dst;
+ continue;
+ }
+
+ err = dev_map_enqueue_clone(last_dst, dev_rx, xdpf);
+ if (err)
+ return err;
+
+ last_dst = dst;
+ }
+ }
+ }
+
+ /* consume the last copy of the frame */
+ if (last_dst)
+ bq_enqueue(last_dst->dev, xdpf, dev_rx, last_dst->xdp_prog);
+ else
+ xdp_return_frame_rx_napi(xdpf); /* dtab is empty */
+
+ return 0;
+}
+
+int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
+ const struct bpf_prog *xdp_prog)
+{
+ int err;
+
+ err = xdp_ok_fwd_dev(dst->dev, skb->len);
+ if (unlikely(err))
+ return err;
+
+ /* Redirect has already succeeded semantically at this point, so we just
+ * return 0 even if packet is dropped. Helper below takes care of
+ * freeing skb.
+ */
+ if (dev_map_bpf_prog_run_skb(skb, dst) != XDP_PASS)
+ return 0;
+
+ skb->dev = dst->dev;
+ generic_xdp_tx(skb, xdp_prog);
+
+ return 0;
+}
+
+static int dev_map_redirect_clone(struct bpf_dtab_netdev *dst,
+ struct sk_buff *skb,
+ const struct bpf_prog *xdp_prog)
+{
+ struct sk_buff *nskb;
+ int err;
+
+ nskb = skb_clone(skb, GFP_ATOMIC);
+ if (!nskb)
+ return -ENOMEM;
+
+ err = dev_map_generic_redirect(dst, nskb, xdp_prog);
+ if (unlikely(err)) {
+ consume_skb(nskb);
+ return err;
+ }
+
+ return 0;
+}
+
+int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
+ const struct bpf_prog *xdp_prog,
+ struct bpf_map *map, bool exclude_ingress)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ struct bpf_dtab_netdev *dst, *last_dst = NULL;
+ int excluded_devices[1+MAX_NEST_DEV];
+ struct hlist_head *head;
+ struct hlist_node *next;
+ int num_excluded = 0;
+ unsigned int i;
+ int err;
+
+ if (exclude_ingress) {
+ num_excluded = get_upper_ifindexes(dev, excluded_devices);
+ excluded_devices[num_excluded++] = dev->ifindex;
+ }
+
+ if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
+ for (i = 0; i < map->max_entries; i++) {
+ dst = rcu_dereference_check(dtab->netdev_map[i],
+ rcu_read_lock_bh_held());
+ if (!dst)
+ continue;
+
+ if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex))
+ continue;
+
+ /* we only need n-1 clones; last_dst enqueued below */
+ if (!last_dst) {
+ last_dst = dst;
+ continue;
+ }
+
+ err = dev_map_redirect_clone(last_dst, skb, xdp_prog);
+ if (err)
+ return err;
+
+ last_dst = dst;
+
+ }
+ } else { /* BPF_MAP_TYPE_DEVMAP_HASH */
+ for (i = 0; i < dtab->n_buckets; i++) {
+ head = dev_map_index_hash(dtab, i);
+ hlist_for_each_entry_safe(dst, next, head, index_hlist) {
+ if (is_ifindex_excluded(excluded_devices, num_excluded,
+ dst->dev->ifindex))
+ continue;
+
+ /* we only need n-1 clones; last_dst enqueued below */
+ if (!last_dst) {
+ last_dst = dst;
+ continue;
+ }
+
+ err = dev_map_redirect_clone(last_dst, skb, xdp_prog);
+ if (err)
+ return err;
+
+ last_dst = dst;
+ }
+ }
+ }
+
+ /* consume the first skb and return */
+ if (last_dst)
+ return dev_map_generic_redirect(last_dst, skb, xdp_prog);
+
+ /* dtab is empty */
+ consume_skb(skb);
+ return 0;
+}
+
+static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key);
+
+ return obj ? &obj->val : NULL;
+}
+
+static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_dtab_netdev *obj = __dev_map_hash_lookup_elem(map,
+ *(u32 *)key);
+ return obj ? &obj->val : NULL;
+}
+
+static void __dev_map_entry_free(struct rcu_head *rcu)
+{
+ struct bpf_dtab_netdev *dev;
+
+ dev = container_of(rcu, struct bpf_dtab_netdev, rcu);
+ if (dev->xdp_prog)
+ bpf_prog_put(dev->xdp_prog);
+ dev_put(dev->dev);
+ kfree(dev);
+}
+
+static long dev_map_delete_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ struct bpf_dtab_netdev *old_dev;
+ u32 k = *(u32 *)key;
+
+ if (k >= map->max_entries)
+ return -EINVAL;
+
+ old_dev = unrcu_pointer(xchg(&dtab->netdev_map[k], NULL));
+ if (old_dev) {
+ call_rcu(&old_dev->rcu, __dev_map_entry_free);
+ atomic_dec((atomic_t *)&dtab->items);
+ }
+ return 0;
+}
+
+static long dev_map_hash_delete_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ struct bpf_dtab_netdev *old_dev;
+ u32 k = *(u32 *)key;
+ unsigned long flags;
+ int ret = -ENOENT;
+
+ spin_lock_irqsave(&dtab->index_lock, flags);
+
+ old_dev = __dev_map_hash_lookup_elem(map, k);
+ if (old_dev) {
+ dtab->items--;
+ hlist_del_init_rcu(&old_dev->index_hlist);
+ call_rcu(&old_dev->rcu, __dev_map_entry_free);
+ ret = 0;
+ }
+ spin_unlock_irqrestore(&dtab->index_lock, flags);
+
+ return ret;
+}
+
+static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
+ struct bpf_dtab *dtab,
+ struct bpf_devmap_val *val,
+ unsigned int idx)
+{
+ struct bpf_prog *prog = NULL;
+ struct bpf_dtab_netdev *dev;
+
+ dev = bpf_map_kmalloc_node(&dtab->map, sizeof(*dev),
+ GFP_NOWAIT,
+ dtab->map.numa_node);
+ if (!dev)
+ return ERR_PTR(-ENOMEM);
+
+ dev->dev = dev_get_by_index(net, val->ifindex);
+ if (!dev->dev)
+ goto err_out;
+
+ if (val->bpf_prog.fd > 0) {
+ prog = bpf_prog_get_type_dev(val->bpf_prog.fd,
+ BPF_PROG_TYPE_XDP, false);
+ if (IS_ERR(prog))
+ goto err_put_dev;
+ if (prog->expected_attach_type != BPF_XDP_DEVMAP ||
+ !bpf_prog_map_compatible(&dtab->map, prog))
+ goto err_put_prog;
+ }
+
+ dev->idx = idx;
+ if (prog) {
+ dev->xdp_prog = prog;
+ dev->val.bpf_prog.id = prog->aux->id;
+ } else {
+ dev->xdp_prog = NULL;
+ dev->val.bpf_prog.id = 0;
+ }
+ dev->val.ifindex = val->ifindex;
+
+ return dev;
+err_put_prog:
+ bpf_prog_put(prog);
+err_put_dev:
+ dev_put(dev->dev);
+err_out:
+ kfree(dev);
+ return ERR_PTR(-EINVAL);
+}
+
+static long __dev_map_update_elem(struct net *net, struct bpf_map *map,
+ void *key, void *value, u64 map_flags)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ struct bpf_dtab_netdev *dev, *old_dev;
+ struct bpf_devmap_val val = {};
+ u32 i = *(u32 *)key;
+
+ if (unlikely(map_flags > BPF_EXIST))
+ return -EINVAL;
+ if (unlikely(i >= dtab->map.max_entries))
+ return -E2BIG;
+ if (unlikely(map_flags == BPF_NOEXIST))
+ return -EEXIST;
+
+ /* already verified value_size <= sizeof val */
+ memcpy(&val, value, map->value_size);
+
+ if (!val.ifindex) {
+ dev = NULL;
+ /* can not specify fd if ifindex is 0 */
+ if (val.bpf_prog.fd > 0)
+ return -EINVAL;
+ } else {
+ dev = __dev_map_alloc_node(net, dtab, &val, i);
+ if (IS_ERR(dev))
+ return PTR_ERR(dev);
+ }
+
+ /* Use call_rcu() here to ensure rcu critical sections have completed
+ * Remembering the driver side flush operation will happen before the
+ * net device is removed.
+ */
+ old_dev = unrcu_pointer(xchg(&dtab->netdev_map[i], RCU_INITIALIZER(dev)));
+ if (old_dev)
+ call_rcu(&old_dev->rcu, __dev_map_entry_free);
+ else
+ atomic_inc((atomic_t *)&dtab->items);
+
+ return 0;
+}
+
+static long dev_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
+{
+ return __dev_map_update_elem(current->nsproxy->net_ns,
+ map, key, value, map_flags);
+}
+
+static long __dev_map_hash_update_elem(struct net *net, struct bpf_map *map,
+ void *key, void *value, u64 map_flags)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ struct bpf_dtab_netdev *dev, *old_dev;
+ struct bpf_devmap_val val = {};
+ u32 idx = *(u32 *)key;
+ unsigned long flags;
+ int err = -EEXIST;
+
+ /* already verified value_size <= sizeof val */
+ memcpy(&val, value, map->value_size);
+
+ if (unlikely(map_flags > BPF_EXIST || !val.ifindex))
+ return -EINVAL;
+
+ spin_lock_irqsave(&dtab->index_lock, flags);
+
+ old_dev = __dev_map_hash_lookup_elem(map, idx);
+ if (old_dev && (map_flags & BPF_NOEXIST))
+ goto out_err;
+
+ dev = __dev_map_alloc_node(net, dtab, &val, idx);
+ if (IS_ERR(dev)) {
+ err = PTR_ERR(dev);
+ goto out_err;
+ }
+
+ if (old_dev) {
+ hlist_del_rcu(&old_dev->index_hlist);
+ } else {
+ if (dtab->items >= dtab->map.max_entries) {
+ spin_unlock_irqrestore(&dtab->index_lock, flags);
+ call_rcu(&dev->rcu, __dev_map_entry_free);
+ return -E2BIG;
+ }
+ dtab->items++;
+ }
+
+ hlist_add_head_rcu(&dev->index_hlist,
+ dev_map_index_hash(dtab, idx));
+ spin_unlock_irqrestore(&dtab->index_lock, flags);
+
+ if (old_dev)
+ call_rcu(&old_dev->rcu, __dev_map_entry_free);
+
+ return 0;
+
+out_err:
+ spin_unlock_irqrestore(&dtab->index_lock, flags);
+ return err;
+}
+
+static long dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
+{
+ return __dev_map_hash_update_elem(current->nsproxy->net_ns,
+ map, key, value, map_flags);
+}
+
+static long dev_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags)
+{
+ return __bpf_xdp_redirect_map(map, ifindex, flags,
+ BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS,
+ __dev_map_lookup_elem);
+}
+
+static long dev_hash_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags)
+{
+ return __bpf_xdp_redirect_map(map, ifindex, flags,
+ BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS,
+ __dev_map_hash_lookup_elem);
+}
+
+static u64 dev_map_mem_usage(const struct bpf_map *map)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ u64 usage = sizeof(struct bpf_dtab);
+
+ if (map->map_type == BPF_MAP_TYPE_DEVMAP_HASH)
+ usage += (u64)dtab->n_buckets * sizeof(struct hlist_head);
+ else
+ usage += (u64)map->max_entries * sizeof(struct bpf_dtab_netdev *);
+ usage += atomic_read((atomic_t *)&dtab->items) *
+ (u64)sizeof(struct bpf_dtab_netdev);
+ return usage;
+}
+
+BTF_ID_LIST_SINGLE(dev_map_btf_ids, struct, bpf_dtab)
+const struct bpf_map_ops dev_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
+ .map_alloc_check = dev_map_alloc_check,
+ .map_alloc = dev_map_alloc,
+ .map_free = dev_map_free,
+ .map_get_next_key = dev_map_get_next_key,
+ .map_lookup_elem = dev_map_lookup_elem,
+ .map_update_elem = dev_map_update_elem,
+ .map_delete_elem = dev_map_delete_elem,
+ .map_check_btf = map_check_no_btf,
+ .map_mem_usage = dev_map_mem_usage,
+ .map_btf_id = &dev_map_btf_ids[0],
+ .map_redirect = dev_map_redirect,
+};
+
+const struct bpf_map_ops dev_map_hash_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
+ .map_alloc_check = dev_map_alloc_check,
+ .map_alloc = dev_map_alloc,
+ .map_free = dev_map_free,
+ .map_get_next_key = dev_map_hash_get_next_key,
+ .map_lookup_elem = dev_map_hash_lookup_elem,
+ .map_update_elem = dev_map_hash_update_elem,
+ .map_delete_elem = dev_map_hash_delete_elem,
+ .map_check_btf = map_check_no_btf,
+ .map_mem_usage = dev_map_mem_usage,
+ .map_btf_id = &dev_map_btf_ids[0],
+ .map_redirect = dev_hash_map_redirect,
+};
+
+static void dev_map_hash_remove_netdev(struct bpf_dtab *dtab,
+ struct net_device *netdev)
+{
+ unsigned long flags;
+ u32 i;
+
+ spin_lock_irqsave(&dtab->index_lock, flags);
+ for (i = 0; i < dtab->n_buckets; i++) {
+ struct bpf_dtab_netdev *dev;
+ struct hlist_head *head;
+ struct hlist_node *next;
+
+ head = dev_map_index_hash(dtab, i);
+
+ hlist_for_each_entry_safe(dev, next, head, index_hlist) {
+ if (netdev != dev->dev)
+ continue;
+
+ dtab->items--;
+ hlist_del_rcu(&dev->index_hlist);
+ call_rcu(&dev->rcu, __dev_map_entry_free);
+ }
+ }
+ spin_unlock_irqrestore(&dtab->index_lock, flags);
+}
+
+static int dev_map_notification(struct notifier_block *notifier,
+ ulong event, void *ptr)
+{
+ struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
+ struct bpf_dtab *dtab;
+ int i, cpu;
+
+ switch (event) {
+ case NETDEV_REGISTER:
+ if (!netdev->netdev_ops->ndo_xdp_xmit || netdev->xdp_bulkq)
+ break;
+
+ /* will be freed in free_netdev() */
+ netdev->xdp_bulkq = alloc_percpu(struct xdp_dev_bulk_queue);
+ if (!netdev->xdp_bulkq)
+ return NOTIFY_BAD;
+
+ for_each_possible_cpu(cpu)
+ per_cpu_ptr(netdev->xdp_bulkq, cpu)->dev = netdev;
+ break;
+ case NETDEV_UNREGISTER:
+ /* This rcu_read_lock/unlock pair is needed because
+ * dev_map_list is an RCU list AND to ensure a delete
+ * operation does not free a netdev_map entry while we
+ * are comparing it against the netdev being unregistered.
+ */
+ rcu_read_lock();
+ list_for_each_entry_rcu(dtab, &dev_map_list, list) {
+ if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
+ dev_map_hash_remove_netdev(dtab, netdev);
+ continue;
+ }
+
+ for (i = 0; i < dtab->map.max_entries; i++) {
+ struct bpf_dtab_netdev *dev, *odev;
+
+ dev = rcu_dereference(dtab->netdev_map[i]);
+ if (!dev || netdev != dev->dev)
+ continue;
+ odev = unrcu_pointer(cmpxchg(&dtab->netdev_map[i], RCU_INITIALIZER(dev), NULL));
+ if (dev == odev) {
+ call_rcu(&dev->rcu,
+ __dev_map_entry_free);
+ atomic_dec((atomic_t *)&dtab->items);
+ }
+ }
+ }
+ rcu_read_unlock();
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block dev_map_notifier = {
+ .notifier_call = dev_map_notification,
+};
+
+static int __init dev_map_init(void)
+{
+ /* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */
+ BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) !=
+ offsetof(struct _bpf_dtab_netdev, dev));
+ register_netdevice_notifier(&dev_map_notifier);
+
+ return 0;
+}
+
+subsys_initcall(dev_map_init);
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
new file mode 100644
index 000000000000..f8a3c7eb451e
--- /dev/null
+++ b/kernel/bpf/disasm.c
@@ -0,0 +1,393 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2016 Facebook
+ */
+
+#include <linux/bpf.h>
+
+#include "disasm.h"
+
+#define __BPF_FUNC_STR_FN(x) [BPF_FUNC_ ## x] = __stringify(bpf_ ## x)
+static const char * const func_id_str[] = {
+ __BPF_FUNC_MAPPER(__BPF_FUNC_STR_FN)
+};
+#undef __BPF_FUNC_STR_FN
+
+static const char *__func_get_name(const struct bpf_insn_cbs *cbs,
+ const struct bpf_insn *insn,
+ char *buff, size_t len)
+{
+ BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID);
+
+ if (!insn->src_reg &&
+ insn->imm >= 0 && insn->imm < __BPF_FUNC_MAX_ID &&
+ func_id_str[insn->imm])
+ return func_id_str[insn->imm];
+
+ if (cbs && cbs->cb_call) {
+ const char *res;
+
+ res = cbs->cb_call(cbs->private_data, insn);
+ if (res)
+ return res;
+ }
+
+ if (insn->src_reg == BPF_PSEUDO_CALL)
+ snprintf(buff, len, "%+d", insn->imm);
+ else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL)
+ snprintf(buff, len, "kernel-function");
+
+ return buff;
+}
+
+static const char *__func_imm_name(const struct bpf_insn_cbs *cbs,
+ const struct bpf_insn *insn,
+ u64 full_imm, char *buff, size_t len)
+{
+ if (cbs && cbs->cb_imm)
+ return cbs->cb_imm(cbs->private_data, insn, full_imm);
+
+ snprintf(buff, len, "0x%llx", (unsigned long long)full_imm);
+ return buff;
+}
+
+const char *func_id_name(int id)
+{
+ if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id])
+ return func_id_str[id];
+ else
+ return "unknown";
+}
+
+const char *const bpf_class_string[8] = {
+ [BPF_LD] = "ld",
+ [BPF_LDX] = "ldx",
+ [BPF_ST] = "st",
+ [BPF_STX] = "stx",
+ [BPF_ALU] = "alu",
+ [BPF_JMP] = "jmp",
+ [BPF_JMP32] = "jmp32",
+ [BPF_ALU64] = "alu64",
+};
+
+const char *const bpf_alu_string[16] = {
+ [BPF_ADD >> 4] = "+=",
+ [BPF_SUB >> 4] = "-=",
+ [BPF_MUL >> 4] = "*=",
+ [BPF_DIV >> 4] = "/=",
+ [BPF_OR >> 4] = "|=",
+ [BPF_AND >> 4] = "&=",
+ [BPF_LSH >> 4] = "<<=",
+ [BPF_RSH >> 4] = ">>=",
+ [BPF_NEG >> 4] = "neg",
+ [BPF_MOD >> 4] = "%=",
+ [BPF_XOR >> 4] = "^=",
+ [BPF_MOV >> 4] = "=",
+ [BPF_ARSH >> 4] = "s>>=",
+ [BPF_END >> 4] = "endian",
+};
+
+static const char *const bpf_alu_sign_string[16] = {
+ [BPF_DIV >> 4] = "s/=",
+ [BPF_MOD >> 4] = "s%=",
+};
+
+static const char *const bpf_movsx_string[4] = {
+ [0] = "(s8)",
+ [1] = "(s16)",
+ [3] = "(s32)",
+};
+
+static const char *const bpf_atomic_alu_string[16] = {
+ [BPF_ADD >> 4] = "add",
+ [BPF_AND >> 4] = "and",
+ [BPF_OR >> 4] = "or",
+ [BPF_XOR >> 4] = "xor",
+};
+
+static const char *const bpf_ldst_string[] = {
+ [BPF_W >> 3] = "u32",
+ [BPF_H >> 3] = "u16",
+ [BPF_B >> 3] = "u8",
+ [BPF_DW >> 3] = "u64",
+};
+
+static const char *const bpf_ldsx_string[] = {
+ [BPF_W >> 3] = "s32",
+ [BPF_H >> 3] = "s16",
+ [BPF_B >> 3] = "s8",
+};
+
+static const char *const bpf_jmp_string[16] = {
+ [BPF_JA >> 4] = "jmp",
+ [BPF_JEQ >> 4] = "==",
+ [BPF_JGT >> 4] = ">",
+ [BPF_JLT >> 4] = "<",
+ [BPF_JGE >> 4] = ">=",
+ [BPF_JLE >> 4] = "<=",
+ [BPF_JSET >> 4] = "&",
+ [BPF_JNE >> 4] = "!=",
+ [BPF_JSGT >> 4] = "s>",
+ [BPF_JSLT >> 4] = "s<",
+ [BPF_JSGE >> 4] = "s>=",
+ [BPF_JSLE >> 4] = "s<=",
+ [BPF_CALL >> 4] = "call",
+ [BPF_EXIT >> 4] = "exit",
+};
+
+static void print_bpf_end_insn(bpf_insn_print_t verbose,
+ void *private_data,
+ const struct bpf_insn *insn)
+{
+ verbose(private_data, "(%02x) r%d = %s%d r%d\n",
+ insn->code, insn->dst_reg,
+ BPF_SRC(insn->code) == BPF_TO_BE ? "be" : "le",
+ insn->imm, insn->dst_reg);
+}
+
+static void print_bpf_bswap_insn(bpf_insn_print_t verbose,
+ void *private_data,
+ const struct bpf_insn *insn)
+{
+ verbose(private_data, "(%02x) r%d = bswap%d r%d\n",
+ insn->code, insn->dst_reg,
+ insn->imm, insn->dst_reg);
+}
+
+static bool is_sdiv_smod(const struct bpf_insn *insn)
+{
+ return (BPF_OP(insn->code) == BPF_DIV || BPF_OP(insn->code) == BPF_MOD) &&
+ insn->off == 1;
+}
+
+static bool is_movsx(const struct bpf_insn *insn)
+{
+ return BPF_OP(insn->code) == BPF_MOV &&
+ (insn->off == 8 || insn->off == 16 || insn->off == 32);
+}
+
+static bool is_addr_space_cast(const struct bpf_insn *insn)
+{
+ return insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) &&
+ insn->off == BPF_ADDR_SPACE_CAST;
+}
+
+/* Special (internal-only) form of mov, used to resolve per-CPU addrs:
+ * dst_reg = src_reg + <percpu_base_off>
+ * BPF_ADDR_PERCPU is used as a special insn->off value.
+ */
+#define BPF_ADDR_PERCPU (-1)
+
+static inline bool is_mov_percpu_addr(const struct bpf_insn *insn)
+{
+ return insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) && insn->off == BPF_ADDR_PERCPU;
+}
+
+void print_bpf_insn(const struct bpf_insn_cbs *cbs,
+ const struct bpf_insn *insn,
+ bool allow_ptr_leaks)
+{
+ const bpf_insn_print_t verbose = cbs->cb_print;
+ u8 class = BPF_CLASS(insn->code);
+
+ if (class == BPF_ALU || class == BPF_ALU64) {
+ if (BPF_OP(insn->code) == BPF_END) {
+ if (class == BPF_ALU64)
+ print_bpf_bswap_insn(verbose, cbs->private_data, insn);
+ else
+ print_bpf_end_insn(verbose, cbs->private_data, insn);
+ } else if (BPF_OP(insn->code) == BPF_NEG) {
+ verbose(cbs->private_data, "(%02x) %c%d = -%c%d\n",
+ insn->code, class == BPF_ALU ? 'w' : 'r',
+ insn->dst_reg, class == BPF_ALU ? 'w' : 'r',
+ insn->dst_reg);
+ } else if (is_addr_space_cast(insn)) {
+ verbose(cbs->private_data, "(%02x) r%d = addr_space_cast(r%d, %u, %u)\n",
+ insn->code, insn->dst_reg,
+ insn->src_reg, ((u32)insn->imm) >> 16, (u16)insn->imm);
+ } else if (is_mov_percpu_addr(insn)) {
+ verbose(cbs->private_data, "(%02x) r%d = &(void __percpu *)(r%d)\n",
+ insn->code, insn->dst_reg, insn->src_reg);
+ } else if (BPF_SRC(insn->code) == BPF_X) {
+ verbose(cbs->private_data, "(%02x) %c%d %s %s%c%d\n",
+ insn->code, class == BPF_ALU ? 'w' : 'r',
+ insn->dst_reg,
+ is_sdiv_smod(insn) ? bpf_alu_sign_string[BPF_OP(insn->code) >> 4]
+ : bpf_alu_string[BPF_OP(insn->code) >> 4],
+ is_movsx(insn) ? bpf_movsx_string[(insn->off >> 3) - 1] : "",
+ class == BPF_ALU ? 'w' : 'r',
+ insn->src_reg);
+ } else {
+ verbose(cbs->private_data, "(%02x) %c%d %s %d\n",
+ insn->code, class == BPF_ALU ? 'w' : 'r',
+ insn->dst_reg,
+ is_sdiv_smod(insn) ? bpf_alu_sign_string[BPF_OP(insn->code) >> 4]
+ : bpf_alu_string[BPF_OP(insn->code) >> 4],
+ insn->imm);
+ }
+ } else if (class == BPF_STX) {
+ if (BPF_MODE(insn->code) == BPF_MEM)
+ verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = r%d\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg,
+ insn->off, insn->src_reg);
+ else if (BPF_MODE(insn->code) == BPF_ATOMIC &&
+ (insn->imm == BPF_ADD || insn->imm == BPF_AND ||
+ insn->imm == BPF_OR || insn->imm == BPF_XOR)) {
+ verbose(cbs->private_data, "(%02x) lock *(%s *)(r%d %+d) %s r%d\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg, insn->off,
+ bpf_alu_string[BPF_OP(insn->imm) >> 4],
+ insn->src_reg);
+ } else if (BPF_MODE(insn->code) == BPF_ATOMIC &&
+ (insn->imm == (BPF_ADD | BPF_FETCH) ||
+ insn->imm == (BPF_AND | BPF_FETCH) ||
+ insn->imm == (BPF_OR | BPF_FETCH) ||
+ insn->imm == (BPF_XOR | BPF_FETCH))) {
+ verbose(cbs->private_data, "(%02x) r%d = atomic%s_fetch_%s((%s *)(r%d %+d), r%d)\n",
+ insn->code, insn->src_reg,
+ BPF_SIZE(insn->code) == BPF_DW ? "64" : "",
+ bpf_atomic_alu_string[BPF_OP(insn->imm) >> 4],
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg, insn->off, insn->src_reg);
+ } else if (BPF_MODE(insn->code) == BPF_ATOMIC &&
+ insn->imm == BPF_CMPXCHG) {
+ verbose(cbs->private_data, "(%02x) r0 = atomic%s_cmpxchg((%s *)(r%d %+d), r0, r%d)\n",
+ insn->code,
+ BPF_SIZE(insn->code) == BPF_DW ? "64" : "",
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg, insn->off,
+ insn->src_reg);
+ } else if (BPF_MODE(insn->code) == BPF_ATOMIC &&
+ insn->imm == BPF_XCHG) {
+ verbose(cbs->private_data, "(%02x) r%d = atomic%s_xchg((%s *)(r%d %+d), r%d)\n",
+ insn->code, insn->src_reg,
+ BPF_SIZE(insn->code) == BPF_DW ? "64" : "",
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg, insn->off, insn->src_reg);
+ } else if (BPF_MODE(insn->code) == BPF_ATOMIC &&
+ insn->imm == BPF_LOAD_ACQ) {
+ verbose(cbs->private_data, "(%02x) r%d = load_acquire((%s *)(r%d %+d))\n",
+ insn->code, insn->dst_reg,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->src_reg, insn->off);
+ } else if (BPF_MODE(insn->code) == BPF_ATOMIC &&
+ insn->imm == BPF_STORE_REL) {
+ verbose(cbs->private_data, "(%02x) store_release((%s *)(r%d %+d), r%d)\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg, insn->off, insn->src_reg);
+ } else {
+ verbose(cbs->private_data, "BUG_%02x\n", insn->code);
+ }
+ } else if (class == BPF_ST) {
+ if (BPF_MODE(insn->code) == BPF_MEM) {
+ verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = %d\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg,
+ insn->off, insn->imm);
+ } else if (BPF_MODE(insn->code) == 0xc0 /* BPF_NOSPEC, no UAPI */) {
+ verbose(cbs->private_data, "(%02x) nospec\n", insn->code);
+ } else {
+ verbose(cbs->private_data, "BUG_st_%02x\n", insn->code);
+ }
+ } else if (class == BPF_LDX) {
+ if (BPF_MODE(insn->code) != BPF_MEM && BPF_MODE(insn->code) != BPF_MEMSX) {
+ verbose(cbs->private_data, "BUG_ldx_%02x\n", insn->code);
+ return;
+ }
+ verbose(cbs->private_data, "(%02x) r%d = *(%s *)(r%d %+d)\n",
+ insn->code, insn->dst_reg,
+ BPF_MODE(insn->code) == BPF_MEM ?
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3] :
+ bpf_ldsx_string[BPF_SIZE(insn->code) >> 3],
+ insn->src_reg, insn->off);
+ } else if (class == BPF_LD) {
+ if (BPF_MODE(insn->code) == BPF_ABS) {
+ verbose(cbs->private_data, "(%02x) r0 = *(%s *)skb[%d]\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->imm);
+ } else if (BPF_MODE(insn->code) == BPF_IND) {
+ verbose(cbs->private_data, "(%02x) r0 = *(%s *)skb[r%d + %d]\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->src_reg, insn->imm);
+ } else if (BPF_MODE(insn->code) == BPF_IMM &&
+ BPF_SIZE(insn->code) == BPF_DW) {
+ /* At this point, we already made sure that the second
+ * part of the ldimm64 insn is accessible.
+ */
+ u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
+ bool is_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD ||
+ insn->src_reg == BPF_PSEUDO_MAP_VALUE;
+ char tmp[64];
+
+ if (is_ptr && !allow_ptr_leaks)
+ imm = 0;
+
+ verbose(cbs->private_data, "(%02x) r%d = %s\n",
+ insn->code, insn->dst_reg,
+ __func_imm_name(cbs, insn, imm,
+ tmp, sizeof(tmp)));
+ } else {
+ verbose(cbs->private_data, "BUG_ld_%02x\n", insn->code);
+ return;
+ }
+ } else if (class == BPF_JMP32 || class == BPF_JMP) {
+ u8 opcode = BPF_OP(insn->code);
+
+ if (opcode == BPF_CALL) {
+ char tmp[64];
+
+ if (insn->src_reg == BPF_PSEUDO_CALL) {
+ verbose(cbs->private_data, "(%02x) call pc%s\n",
+ insn->code,
+ __func_get_name(cbs, insn,
+ tmp, sizeof(tmp)));
+ } else {
+ strcpy(tmp, "unknown");
+ verbose(cbs->private_data, "(%02x) call %s#%d\n", insn->code,
+ __func_get_name(cbs, insn,
+ tmp, sizeof(tmp)),
+ insn->imm);
+ }
+ } else if (insn->code == (BPF_JMP | BPF_JA)) {
+ verbose(cbs->private_data, "(%02x) goto pc%+d\n",
+ insn->code, insn->off);
+ } else if (insn->code == (BPF_JMP | BPF_JA | BPF_X)) {
+ verbose(cbs->private_data, "(%02x) gotox r%d\n",
+ insn->code, insn->dst_reg);
+ } else if (insn->code == (BPF_JMP | BPF_JCOND) &&
+ insn->src_reg == BPF_MAY_GOTO) {
+ verbose(cbs->private_data, "(%02x) may_goto pc%+d\n",
+ insn->code, insn->off);
+ } else if (insn->code == (BPF_JMP32 | BPF_JA)) {
+ verbose(cbs->private_data, "(%02x) gotol pc%+d\n",
+ insn->code, insn->imm);
+ } else if (insn->code == (BPF_JMP | BPF_EXIT)) {
+ verbose(cbs->private_data, "(%02x) exit\n", insn->code);
+ } else if (BPF_SRC(insn->code) == BPF_X) {
+ verbose(cbs->private_data,
+ "(%02x) if %c%d %s %c%d goto pc%+d\n",
+ insn->code, class == BPF_JMP32 ? 'w' : 'r',
+ insn->dst_reg,
+ bpf_jmp_string[BPF_OP(insn->code) >> 4],
+ class == BPF_JMP32 ? 'w' : 'r',
+ insn->src_reg, insn->off);
+ } else {
+ verbose(cbs->private_data,
+ "(%02x) if %c%d %s 0x%x goto pc%+d\n",
+ insn->code, class == BPF_JMP32 ? 'w' : 'r',
+ insn->dst_reg,
+ bpf_jmp_string[BPF_OP(insn->code) >> 4],
+ (u32)insn->imm, insn->off);
+ }
+ } else {
+ verbose(cbs->private_data, "(%02x) %s\n",
+ insn->code, bpf_class_string[class]);
+ }
+}
diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h
new file mode 100644
index 000000000000..a4b040793f44
--- /dev/null
+++ b/kernel/bpf/disasm.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2016 Facebook
+ */
+
+#ifndef __BPF_DISASM_H__
+#define __BPF_DISASM_H__
+
+#include <linux/bpf.h>
+#include <linux/kernel.h>
+#include <linux/stringify.h>
+#ifndef __KERNEL__
+#include <stdio.h>
+#include <string.h>
+#endif
+
+extern const char *const bpf_alu_string[16];
+extern const char *const bpf_class_string[8];
+
+const char *func_id_name(int id);
+
+typedef __printf(2, 3) void (*bpf_insn_print_t)(void *private_data,
+ const char *, ...);
+typedef const char *(*bpf_insn_revmap_call_t)(void *private_data,
+ const struct bpf_insn *insn);
+typedef const char *(*bpf_insn_print_imm_t)(void *private_data,
+ const struct bpf_insn *insn,
+ __u64 full_imm);
+
+struct bpf_insn_cbs {
+ bpf_insn_print_t cb_print;
+ bpf_insn_revmap_call_t cb_call;
+ bpf_insn_print_imm_t cb_imm;
+ void *private_data;
+};
+
+void print_bpf_insn(const struct bpf_insn_cbs *cbs,
+ const struct bpf_insn *insn,
+ bool allow_ptr_leaks);
+#endif
diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c
new file mode 100644
index 000000000000..b77db7413f8c
--- /dev/null
+++ b/kernel/bpf/dispatcher.c
@@ -0,0 +1,171 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2019 Intel Corporation. */
+
+#include <linux/hash.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/static_call.h>
+
+/* The BPF dispatcher is a multiway branch code generator. The
+ * dispatcher is a mechanism to avoid the performance penalty of an
+ * indirect call, which is expensive when retpolines are enabled. A
+ * dispatch client registers a BPF program into the dispatcher, and if
+ * there is available room in the dispatcher a direct call to the BPF
+ * program will be generated. All calls to the BPF programs called via
+ * the dispatcher will then be a direct call, instead of an
+ * indirect. The dispatcher hijacks a trampoline function it via the
+ * __fentry__ of the trampoline. The trampoline function has the
+ * following signature:
+ *
+ * unsigned int trampoline(const void *ctx, const struct bpf_insn *insnsi,
+ * unsigned int (*bpf_func)(const void *,
+ * const struct bpf_insn *));
+ */
+
+static struct bpf_dispatcher_prog *bpf_dispatcher_find_prog(
+ struct bpf_dispatcher *d, struct bpf_prog *prog)
+{
+ int i;
+
+ for (i = 0; i < BPF_DISPATCHER_MAX; i++) {
+ if (prog == d->progs[i].prog)
+ return &d->progs[i];
+ }
+ return NULL;
+}
+
+static struct bpf_dispatcher_prog *bpf_dispatcher_find_free(
+ struct bpf_dispatcher *d)
+{
+ return bpf_dispatcher_find_prog(d, NULL);
+}
+
+static bool bpf_dispatcher_add_prog(struct bpf_dispatcher *d,
+ struct bpf_prog *prog)
+{
+ struct bpf_dispatcher_prog *entry;
+
+ if (!prog)
+ return false;
+
+ entry = bpf_dispatcher_find_prog(d, prog);
+ if (entry) {
+ refcount_inc(&entry->users);
+ return false;
+ }
+
+ entry = bpf_dispatcher_find_free(d);
+ if (!entry)
+ return false;
+
+ bpf_prog_inc(prog);
+ entry->prog = prog;
+ refcount_set(&entry->users, 1);
+ d->num_progs++;
+ return true;
+}
+
+static bool bpf_dispatcher_remove_prog(struct bpf_dispatcher *d,
+ struct bpf_prog *prog)
+{
+ struct bpf_dispatcher_prog *entry;
+
+ if (!prog)
+ return false;
+
+ entry = bpf_dispatcher_find_prog(d, prog);
+ if (!entry)
+ return false;
+
+ if (refcount_dec_and_test(&entry->users)) {
+ entry->prog = NULL;
+ bpf_prog_put(prog);
+ d->num_progs--;
+ return true;
+ }
+ return false;
+}
+
+int __weak arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_funcs)
+{
+ return -ENOTSUPP;
+}
+
+static int bpf_dispatcher_prepare(struct bpf_dispatcher *d, void *image, void *buf)
+{
+ s64 ips[BPF_DISPATCHER_MAX] = {}, *ipsp = &ips[0];
+ int i;
+
+ for (i = 0; i < BPF_DISPATCHER_MAX; i++) {
+ if (d->progs[i].prog)
+ *ipsp++ = (s64)(uintptr_t)d->progs[i].prog->bpf_func;
+ }
+ return arch_prepare_bpf_dispatcher(image, buf, &ips[0], d->num_progs);
+}
+
+static void bpf_dispatcher_update(struct bpf_dispatcher *d, int prev_num_progs)
+{
+ void *new, *tmp;
+ u32 noff = 0;
+
+ if (prev_num_progs)
+ noff = d->image_off ^ (PAGE_SIZE / 2);
+
+ new = d->num_progs ? d->image + noff : NULL;
+ tmp = d->num_progs ? d->rw_image + noff : NULL;
+ if (new) {
+ /* Prepare the dispatcher in d->rw_image. Then use
+ * bpf_arch_text_copy to update d->image, which is RO+X.
+ */
+ if (bpf_dispatcher_prepare(d, new, tmp))
+ return;
+ if (IS_ERR(bpf_arch_text_copy(new, tmp, PAGE_SIZE / 2)))
+ return;
+ }
+
+ __BPF_DISPATCHER_UPDATE(d, new ?: (void *)&bpf_dispatcher_nop_func);
+
+ /* Make sure all the callers executing the previous/old half of the
+ * image leave it, so following update call can modify it safely.
+ */
+ synchronize_rcu();
+
+ if (new)
+ d->image_off = noff;
+}
+
+void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from,
+ struct bpf_prog *to)
+{
+ bool changed = false;
+ int prev_num_progs;
+
+ if (from == to)
+ return;
+
+ mutex_lock(&d->mutex);
+ if (!d->image) {
+ d->image = bpf_prog_pack_alloc(PAGE_SIZE, bpf_jit_fill_hole_with_zero);
+ if (!d->image)
+ goto out;
+ d->rw_image = bpf_jit_alloc_exec(PAGE_SIZE);
+ if (!d->rw_image) {
+ bpf_prog_pack_free(d->image, PAGE_SIZE);
+ d->image = NULL;
+ goto out;
+ }
+ bpf_image_ksym_init(d->image, PAGE_SIZE, &d->ksym);
+ bpf_image_ksym_add(&d->ksym);
+ }
+
+ prev_num_progs = d->num_progs;
+ changed |= bpf_dispatcher_remove_prog(d, from);
+ changed |= bpf_dispatcher_add_prog(d, to);
+
+ if (!changed)
+ goto out;
+
+ bpf_dispatcher_update(d, prev_num_progs);
+out:
+ mutex_unlock(&d->mutex);
+}
diff --git a/kernel/bpf/dmabuf_iter.c b/kernel/bpf/dmabuf_iter.c
new file mode 100644
index 000000000000..4dd7ef7c145c
--- /dev/null
+++ b/kernel/bpf/dmabuf_iter.c
@@ -0,0 +1,150 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2025 Google LLC */
+#include <linux/bpf.h>
+#include <linux/btf_ids.h>
+#include <linux/dma-buf.h>
+#include <linux/kernel.h>
+#include <linux/seq_file.h>
+
+static void *dmabuf_iter_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ if (*pos)
+ return NULL;
+
+ return dma_buf_iter_begin();
+}
+
+static void *dmabuf_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct dma_buf *dmabuf = v;
+
+ ++*pos;
+
+ return dma_buf_iter_next(dmabuf);
+}
+
+struct bpf_iter__dmabuf {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct dma_buf *, dmabuf);
+};
+
+static int __dmabuf_seq_show(struct seq_file *seq, void *v, bool in_stop)
+{
+ struct bpf_iter_meta meta = {
+ .seq = seq,
+ };
+ struct bpf_iter__dmabuf ctx = {
+ .meta = &meta,
+ .dmabuf = v,
+ };
+ struct bpf_prog *prog = bpf_iter_get_info(&meta, in_stop);
+
+ if (prog)
+ return bpf_iter_run_prog(prog, &ctx);
+
+ return 0;
+}
+
+static int dmabuf_iter_seq_show(struct seq_file *seq, void *v)
+{
+ return __dmabuf_seq_show(seq, v, false);
+}
+
+static void dmabuf_iter_seq_stop(struct seq_file *seq, void *v)
+{
+ struct dma_buf *dmabuf = v;
+
+ if (dmabuf)
+ dma_buf_put(dmabuf);
+}
+
+static const struct seq_operations dmabuf_iter_seq_ops = {
+ .start = dmabuf_iter_seq_start,
+ .next = dmabuf_iter_seq_next,
+ .stop = dmabuf_iter_seq_stop,
+ .show = dmabuf_iter_seq_show,
+};
+
+static void bpf_iter_dmabuf_show_fdinfo(const struct bpf_iter_aux_info *aux,
+ struct seq_file *seq)
+{
+ seq_puts(seq, "dmabuf iter\n");
+}
+
+static const struct bpf_iter_seq_info dmabuf_iter_seq_info = {
+ .seq_ops = &dmabuf_iter_seq_ops,
+ .init_seq_private = NULL,
+ .fini_seq_private = NULL,
+ .seq_priv_size = 0,
+};
+
+static struct bpf_iter_reg bpf_dmabuf_reg_info = {
+ .target = "dmabuf",
+ .feature = BPF_ITER_RESCHED,
+ .show_fdinfo = bpf_iter_dmabuf_show_fdinfo,
+ .ctx_arg_info_size = 1,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__dmabuf, dmabuf),
+ PTR_TO_BTF_ID_OR_NULL },
+ },
+ .seq_info = &dmabuf_iter_seq_info,
+};
+
+DEFINE_BPF_ITER_FUNC(dmabuf, struct bpf_iter_meta *meta, struct dma_buf *dmabuf)
+BTF_ID_LIST_SINGLE(bpf_dmabuf_btf_id, struct, dma_buf)
+
+static int __init dmabuf_iter_init(void)
+{
+ bpf_dmabuf_reg_info.ctx_arg_info[0].btf_id = bpf_dmabuf_btf_id[0];
+ return bpf_iter_reg_target(&bpf_dmabuf_reg_info);
+}
+
+late_initcall(dmabuf_iter_init);
+
+struct bpf_iter_dmabuf {
+ /*
+ * opaque iterator state; having __u64 here allows to preserve correct
+ * alignment requirements in vmlinux.h, generated from BTF
+ */
+ __u64 __opaque[1];
+} __aligned(8);
+
+/* Non-opaque version of bpf_iter_dmabuf */
+struct bpf_iter_dmabuf_kern {
+ struct dma_buf *dmabuf;
+} __aligned(8);
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc int bpf_iter_dmabuf_new(struct bpf_iter_dmabuf *it)
+{
+ struct bpf_iter_dmabuf_kern *kit = (void *)it;
+
+ BUILD_BUG_ON(sizeof(*kit) > sizeof(*it));
+ BUILD_BUG_ON(__alignof__(*kit) != __alignof__(*it));
+
+ kit->dmabuf = NULL;
+ return 0;
+}
+
+__bpf_kfunc struct dma_buf *bpf_iter_dmabuf_next(struct bpf_iter_dmabuf *it)
+{
+ struct bpf_iter_dmabuf_kern *kit = (void *)it;
+
+ if (kit->dmabuf)
+ kit->dmabuf = dma_buf_iter_next(kit->dmabuf);
+ else
+ kit->dmabuf = dma_buf_iter_begin();
+
+ return kit->dmabuf;
+}
+
+__bpf_kfunc void bpf_iter_dmabuf_destroy(struct bpf_iter_dmabuf *it)
+{
+ struct bpf_iter_dmabuf_kern *kit = (void *)it;
+
+ if (kit->dmabuf)
+ dma_buf_put(kit->dmabuf);
+}
+
+__bpf_kfunc_end_defs();
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 83c209d9b17a..c8a9b27f8663 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -1,174 +1,842 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
+ * Copyright (c) 2016 Facebook
*/
#include <linux/bpf.h>
+#include <linux/btf.h>
#include <linux/jhash.h>
#include <linux/filter.h>
-#include <linux/vmalloc.h>
+#include <linux/rculist_nulls.h>
+#include <linux/rcupdate_wait.h>
+#include <linux/random.h>
+#include <uapi/linux/btf.h>
+#include <linux/rcupdate_trace.h>
+#include <linux/btf_ids.h>
+#include "percpu_freelist.h"
+#include "bpf_lru_list.h"
+#include "map_in_map.h"
+#include <linux/bpf_mem_alloc.h>
+#include <asm/rqspinlock.h>
+
+#define HTAB_CREATE_FLAG_MASK \
+ (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \
+ BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED)
+
+#define BATCH_OPS(_name) \
+ .map_lookup_batch = \
+ _name##_map_lookup_batch, \
+ .map_lookup_and_delete_batch = \
+ _name##_map_lookup_and_delete_batch, \
+ .map_update_batch = \
+ generic_map_update_batch, \
+ .map_delete_batch = \
+ generic_map_delete_batch
+
+/*
+ * The bucket lock has two protection scopes:
+ *
+ * 1) Serializing concurrent operations from BPF programs on different
+ * CPUs
+ *
+ * 2) Serializing concurrent operations from BPF programs and sys_bpf()
+ *
+ * BPF programs can execute in any context including perf, kprobes and
+ * tracing. As there are almost no limits where perf, kprobes and tracing
+ * can be invoked from the lock operations need to be protected against
+ * deadlocks. Deadlocks can be caused by recursion and by an invocation in
+ * the lock held section when functions which acquire this lock are invoked
+ * from sys_bpf(). BPF recursion is prevented by incrementing the per CPU
+ * variable bpf_prog_active, which prevents BPF programs attached to perf
+ * events, kprobes and tracing to be invoked before the prior invocation
+ * from one of these contexts completed. sys_bpf() uses the same mechanism
+ * by pinning the task to the current CPU and incrementing the recursion
+ * protection across the map operation.
+ *
+ * This has subtle implications on PREEMPT_RT. PREEMPT_RT forbids certain
+ * operations like memory allocations (even with GFP_ATOMIC) from atomic
+ * contexts. This is required because even with GFP_ATOMIC the memory
+ * allocator calls into code paths which acquire locks with long held lock
+ * sections. To ensure the deterministic behaviour these locks are regular
+ * spinlocks, which are converted to 'sleepable' spinlocks on RT. The only
+ * true atomic contexts on an RT kernel are the low level hardware
+ * handling, scheduling, low level interrupt handling, NMIs etc. None of
+ * these contexts should ever do memory allocations.
+ *
+ * As regular device interrupt handlers and soft interrupts are forced into
+ * thread context, the existing code which does
+ * spin_lock*(); alloc(GFP_ATOMIC); spin_unlock*();
+ * just works.
+ *
+ * In theory the BPF locks could be converted to regular spinlocks as well,
+ * but the bucket locks and percpu_freelist locks can be taken from
+ * arbitrary contexts (perf, kprobes, tracepoints) which are required to be
+ * atomic contexts even on RT. Before the introduction of bpf_mem_alloc,
+ * it is only safe to use raw spinlock for preallocated hash map on a RT kernel,
+ * because there is no memory allocation within the lock held sections. However
+ * after hash map was fully converted to use bpf_mem_alloc, there will be
+ * non-synchronous memory allocation for non-preallocated hash map, so it is
+ * safe to always use raw spinlock for bucket lock.
+ */
+struct bucket {
+ struct hlist_nulls_head head;
+ rqspinlock_t raw_lock;
+};
+
+#define HASHTAB_MAP_LOCK_COUNT 8
+#define HASHTAB_MAP_LOCK_MASK (HASHTAB_MAP_LOCK_COUNT - 1)
struct bpf_htab {
struct bpf_map map;
- struct hlist_head *buckets;
- spinlock_t lock;
- u32 count; /* number of elements in this hashtable */
+ struct bpf_mem_alloc ma;
+ struct bpf_mem_alloc pcpu_ma;
+ struct bucket *buckets;
+ void *elems;
+ union {
+ struct pcpu_freelist freelist;
+ struct bpf_lru lru;
+ };
+ struct htab_elem *__percpu *extra_elems;
+ /* number of elements in non-preallocated hashtable are kept
+ * in either pcount or count
+ */
+ struct percpu_counter pcount;
+ atomic_t count;
+ bool use_percpu_counter;
u32 n_buckets; /* number of hash buckets */
u32 elem_size; /* size of each element in bytes */
+ u32 hashrnd;
};
/* each htab element is struct htab_elem + key + value */
struct htab_elem {
- struct hlist_node hash_node;
- struct rcu_head rcu;
+ union {
+ struct hlist_nulls_node hash_node;
+ struct {
+ void *padding;
+ union {
+ struct pcpu_freelist_node fnode;
+ struct htab_elem *batch_flink;
+ };
+ };
+ };
+ union {
+ /* pointer to per-cpu pointer */
+ void *ptr_to_pptr;
+ struct bpf_lru_node lru_node;
+ };
u32 hash;
- char key[0] __aligned(8);
+ char key[] __aligned(8);
};
+static inline bool htab_is_prealloc(const struct bpf_htab *htab)
+{
+ return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
+}
+
+static void htab_init_buckets(struct bpf_htab *htab)
+{
+ unsigned int i;
+
+ for (i = 0; i < htab->n_buckets; i++) {
+ INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
+ raw_res_spin_lock_init(&htab->buckets[i].raw_lock);
+ cond_resched();
+ }
+}
+
+static inline int htab_lock_bucket(struct bucket *b, unsigned long *pflags)
+{
+ unsigned long flags;
+ int ret;
+
+ ret = raw_res_spin_lock_irqsave(&b->raw_lock, flags);
+ if (ret)
+ return ret;
+ *pflags = flags;
+ return 0;
+}
+
+static inline void htab_unlock_bucket(struct bucket *b, unsigned long flags)
+{
+ raw_res_spin_unlock_irqrestore(&b->raw_lock, flags);
+}
+
+static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node);
+
+static bool htab_is_lru(const struct bpf_htab *htab)
+{
+ return htab->map.map_type == BPF_MAP_TYPE_LRU_HASH ||
+ htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
+}
+
+static bool htab_is_percpu(const struct bpf_htab *htab)
+{
+ return htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
+}
+
+static inline bool is_fd_htab(const struct bpf_htab *htab)
+{
+ return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS;
+}
+
+static inline void *htab_elem_value(struct htab_elem *l, u32 key_size)
+{
+ return l->key + round_up(key_size, 8);
+}
+
+static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
+ void __percpu *pptr)
+{
+ *(void __percpu **)htab_elem_value(l, key_size) = pptr;
+}
+
+static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size)
+{
+ return *(void __percpu **)htab_elem_value(l, key_size);
+}
+
+static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l)
+{
+ return *(void **)htab_elem_value(l, map->key_size);
+}
+
+static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
+{
+ return (struct htab_elem *) (htab->elems + i * (u64)htab->elem_size);
+}
+
+/* Both percpu and fd htab support in-place update, so no need for
+ * extra elem. LRU itself can remove the least used element, so
+ * there is no need for an extra elem during map_update.
+ */
+static bool htab_has_extra_elems(struct bpf_htab *htab)
+{
+ return !htab_is_percpu(htab) && !htab_is_lru(htab) && !is_fd_htab(htab);
+}
+
+static void htab_free_prealloced_internal_structs(struct bpf_htab *htab)
+{
+ u32 num_entries = htab->map.max_entries;
+ int i;
+
+ if (htab_has_extra_elems(htab))
+ num_entries += num_possible_cpus();
+
+ for (i = 0; i < num_entries; i++) {
+ struct htab_elem *elem;
+
+ elem = get_htab_elem(htab, i);
+ bpf_map_free_internal_structs(&htab->map,
+ htab_elem_value(elem, htab->map.key_size));
+ cond_resched();
+ }
+}
+
+static void htab_free_prealloced_fields(struct bpf_htab *htab)
+{
+ u32 num_entries = htab->map.max_entries;
+ int i;
+
+ if (IS_ERR_OR_NULL(htab->map.record))
+ return;
+ if (htab_has_extra_elems(htab))
+ num_entries += num_possible_cpus();
+ for (i = 0; i < num_entries; i++) {
+ struct htab_elem *elem;
+
+ elem = get_htab_elem(htab, i);
+ if (htab_is_percpu(htab)) {
+ void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size);
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu));
+ cond_resched();
+ }
+ } else {
+ bpf_obj_free_fields(htab->map.record,
+ htab_elem_value(elem, htab->map.key_size));
+ cond_resched();
+ }
+ cond_resched();
+ }
+}
+
+static void htab_free_elems(struct bpf_htab *htab)
+{
+ int i;
+
+ if (!htab_is_percpu(htab))
+ goto free_elems;
+
+ for (i = 0; i < htab->map.max_entries; i++) {
+ void __percpu *pptr;
+
+ pptr = htab_elem_get_ptr(get_htab_elem(htab, i),
+ htab->map.key_size);
+ free_percpu(pptr);
+ cond_resched();
+ }
+free_elems:
+ bpf_map_area_free(htab->elems);
+}
+
+/* The LRU list has a lock (lru_lock). Each htab bucket has a lock
+ * (bucket_lock). If both locks need to be acquired together, the lock
+ * order is always lru_lock -> bucket_lock and this only happens in
+ * bpf_lru_list.c logic. For example, certain code path of
+ * bpf_lru_pop_free(), which is called by function prealloc_lru_pop(),
+ * will acquire lru_lock first followed by acquiring bucket_lock.
+ *
+ * In hashtab.c, to avoid deadlock, lock acquisition of
+ * bucket_lock followed by lru_lock is not allowed. In such cases,
+ * bucket_lock needs to be released first before acquiring lru_lock.
+ */
+static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
+ u32 hash)
+{
+ struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru, hash);
+ struct htab_elem *l;
+
+ if (node) {
+ bpf_map_inc_elem_count(&htab->map);
+ l = container_of(node, struct htab_elem, lru_node);
+ memcpy(l->key, key, htab->map.key_size);
+ return l;
+ }
+
+ return NULL;
+}
+
+static int prealloc_init(struct bpf_htab *htab)
+{
+ u32 num_entries = htab->map.max_entries;
+ int err = -ENOMEM, i;
+
+ if (htab_has_extra_elems(htab))
+ num_entries += num_possible_cpus();
+
+ htab->elems = bpf_map_area_alloc((u64)htab->elem_size * num_entries,
+ htab->map.numa_node);
+ if (!htab->elems)
+ return -ENOMEM;
+
+ if (!htab_is_percpu(htab))
+ goto skip_percpu_elems;
+
+ for (i = 0; i < num_entries; i++) {
+ u32 size = round_up(htab->map.value_size, 8);
+ void __percpu *pptr;
+
+ pptr = bpf_map_alloc_percpu(&htab->map, size, 8,
+ GFP_USER | __GFP_NOWARN);
+ if (!pptr)
+ goto free_elems;
+ htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size,
+ pptr);
+ cond_resched();
+ }
+
+skip_percpu_elems:
+ if (htab_is_lru(htab))
+ err = bpf_lru_init(&htab->lru,
+ htab->map.map_flags & BPF_F_NO_COMMON_LRU,
+ offsetof(struct htab_elem, hash) -
+ offsetof(struct htab_elem, lru_node),
+ htab_lru_map_delete_node,
+ htab);
+ else
+ err = pcpu_freelist_init(&htab->freelist);
+
+ if (err)
+ goto free_elems;
+
+ if (htab_is_lru(htab))
+ bpf_lru_populate(&htab->lru, htab->elems,
+ offsetof(struct htab_elem, lru_node),
+ htab->elem_size, num_entries);
+ else
+ pcpu_freelist_populate(&htab->freelist,
+ htab->elems + offsetof(struct htab_elem, fnode),
+ htab->elem_size, num_entries);
+
+ return 0;
+
+free_elems:
+ htab_free_elems(htab);
+ return err;
+}
+
+static void prealloc_destroy(struct bpf_htab *htab)
+{
+ htab_free_elems(htab);
+
+ if (htab_is_lru(htab))
+ bpf_lru_destroy(&htab->lru);
+ else
+ pcpu_freelist_destroy(&htab->freelist);
+}
+
+static int alloc_extra_elems(struct bpf_htab *htab)
+{
+ struct htab_elem *__percpu *pptr, *l_new;
+ struct pcpu_freelist_node *l;
+ int cpu;
+
+ pptr = bpf_map_alloc_percpu(&htab->map, sizeof(struct htab_elem *), 8,
+ GFP_USER | __GFP_NOWARN);
+ if (!pptr)
+ return -ENOMEM;
+
+ for_each_possible_cpu(cpu) {
+ l = pcpu_freelist_pop(&htab->freelist);
+ /* pop will succeed, since prealloc_init()
+ * preallocated extra num_possible_cpus elements
+ */
+ l_new = container_of(l, struct htab_elem, fnode);
+ *per_cpu_ptr(pptr, cpu) = l_new;
+ }
+ htab->extra_elems = pptr;
+ return 0;
+}
+
/* Called from syscall */
+static int htab_map_alloc_check(union bpf_attr *attr)
+{
+ bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
+ bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
+ attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
+ /* percpu_lru means each cpu has its own LRU list.
+ * it is different from BPF_MAP_TYPE_PERCPU_HASH where
+ * the map's value itself is percpu. percpu_lru has
+ * nothing to do with the map's value.
+ */
+ bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
+ bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
+ bool zero_seed = (attr->map_flags & BPF_F_ZERO_SEED);
+ int numa_node = bpf_map_attr_numa_node(attr);
+
+ BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) !=
+ offsetof(struct htab_elem, hash_node.pprev));
+
+ if (zero_seed && !capable(CAP_SYS_ADMIN))
+ /* Guard against local DoS, and discourage production use. */
+ return -EPERM;
+
+ if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK ||
+ !bpf_map_flags_access_ok(attr->map_flags))
+ return -EINVAL;
+
+ if (!lru && percpu_lru)
+ return -EINVAL;
+
+ if (lru && !prealloc)
+ return -ENOTSUPP;
+
+ if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru))
+ return -EINVAL;
+
+ /* check sanity of attributes.
+ * value_size == 0 may be allowed in the future to use map as a set
+ */
+ if (attr->max_entries == 0 || attr->key_size == 0 ||
+ attr->value_size == 0)
+ return -EINVAL;
+
+ if ((u64)attr->key_size + attr->value_size >= KMALLOC_MAX_SIZE -
+ sizeof(struct htab_elem))
+ /* if key_size + value_size is bigger, the user space won't be
+ * able to access the elements via bpf syscall. This check
+ * also makes sure that the elem_size doesn't overflow and it's
+ * kmalloc-able later in htab_map_update_elem()
+ */
+ return -E2BIG;
+ /* percpu map value size is bound by PCPU_MIN_UNIT_SIZE */
+ if (percpu && round_up(attr->value_size, 8) > PCPU_MIN_UNIT_SIZE)
+ return -E2BIG;
+
+ return 0;
+}
+
static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
{
+ bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
+ /* percpu_lru means each cpu has its own LRU list.
+ * it is different from BPF_MAP_TYPE_PERCPU_HASH where
+ * the map's value itself is percpu. percpu_lru has
+ * nothing to do with the map's value.
+ */
+ bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
+ bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
struct bpf_htab *htab;
- int err, i;
+ int err;
- htab = kzalloc(sizeof(*htab), GFP_USER);
+ htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE);
if (!htab)
return ERR_PTR(-ENOMEM);
- /* mandatory map attributes */
- htab->map.key_size = attr->key_size;
- htab->map.value_size = attr->value_size;
- htab->map.max_entries = attr->max_entries;
+ bpf_map_init_from_attr(&htab->map, attr);
- /* check sanity of attributes.
- * value_size == 0 may be allowed in the future to use map as a set
+ if (percpu_lru) {
+ /* ensure each CPU's lru list has >=1 elements.
+ * since we are at it, make each lru list has the same
+ * number of elements.
+ */
+ htab->map.max_entries = roundup(attr->max_entries,
+ num_possible_cpus());
+ if (htab->map.max_entries < attr->max_entries)
+ htab->map.max_entries = rounddown(attr->max_entries,
+ num_possible_cpus());
+ }
+
+ /* hash table size must be power of 2; roundup_pow_of_two() can overflow
+ * into UB on 32-bit arches, so check that first
*/
- err = -EINVAL;
- if (htab->map.max_entries == 0 || htab->map.key_size == 0 ||
- htab->map.value_size == 0)
+ err = -E2BIG;
+ if (htab->map.max_entries > 1UL << 31)
goto free_htab;
- /* hash table size must be power of 2 */
htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
- err = -E2BIG;
- if (htab->map.key_size > MAX_BPF_STACK)
- /* eBPF programs initialize keys on stack, so they cannot be
- * larger than max stack size
- */
+ htab->elem_size = sizeof(struct htab_elem) +
+ round_up(htab->map.key_size, 8);
+ if (percpu)
+ htab->elem_size += sizeof(void *);
+ else
+ htab->elem_size += round_up(htab->map.value_size, 8);
+
+ /* check for u32 overflow */
+ if (htab->n_buckets > U32_MAX / sizeof(struct bucket))
goto free_htab;
- err = -ENOMEM;
- /* prevent zero size kmalloc and check for u32 overflow */
- if (htab->n_buckets == 0 ||
- htab->n_buckets > U32_MAX / sizeof(struct hlist_head))
+ err = bpf_map_init_elem_count(&htab->map);
+ if (err)
goto free_htab;
- htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct hlist_head),
- GFP_USER | __GFP_NOWARN);
-
- if (!htab->buckets) {
- htab->buckets = vmalloc(htab->n_buckets * sizeof(struct hlist_head));
- if (!htab->buckets)
- goto free_htab;
+ err = -ENOMEM;
+ htab->buckets = bpf_map_area_alloc(htab->n_buckets *
+ sizeof(struct bucket),
+ htab->map.numa_node);
+ if (!htab->buckets)
+ goto free_elem_count;
+
+ if (htab->map.map_flags & BPF_F_ZERO_SEED)
+ htab->hashrnd = 0;
+ else
+ htab->hashrnd = get_random_u32();
+
+ htab_init_buckets(htab);
+
+/* compute_batch_value() computes batch value as num_online_cpus() * 2
+ * and __percpu_counter_compare() needs
+ * htab->max_entries - cur_number_of_elems to be more than batch * num_online_cpus()
+ * for percpu_counter to be faster than atomic_t. In practice the average bpf
+ * hash map size is 10k, which means that a system with 64 cpus will fill
+ * hashmap to 20% of 10k before percpu_counter becomes ineffective. Therefore
+ * define our own batch count as 32 then 10k hash map can be filled up to 80%:
+ * 10k - 8k > 32 _batch_ * 64 _cpus_
+ * and __percpu_counter_compare() will still be fast. At that point hash map
+ * collisions will dominate its performance anyway. Assume that hash map filled
+ * to 50+% isn't going to be O(1) and use the following formula to choose
+ * between percpu_counter and atomic_t.
+ */
+#define PERCPU_COUNTER_BATCH 32
+ if (attr->max_entries / 2 > num_online_cpus() * PERCPU_COUNTER_BATCH)
+ htab->use_percpu_counter = true;
+
+ if (htab->use_percpu_counter) {
+ err = percpu_counter_init(&htab->pcount, 0, GFP_KERNEL);
+ if (err)
+ goto free_map_locked;
}
- for (i = 0; i < htab->n_buckets; i++)
- INIT_HLIST_HEAD(&htab->buckets[i]);
+ if (prealloc) {
+ err = prealloc_init(htab);
+ if (err)
+ goto free_map_locked;
- spin_lock_init(&htab->lock);
- htab->count = 0;
+ if (htab_has_extra_elems(htab)) {
+ err = alloc_extra_elems(htab);
+ if (err)
+ goto free_prealloc;
+ }
+ } else {
+ err = bpf_mem_alloc_init(&htab->ma, htab->elem_size, false);
+ if (err)
+ goto free_map_locked;
+ if (percpu) {
+ err = bpf_mem_alloc_init(&htab->pcpu_ma,
+ round_up(htab->map.value_size, 8), true);
+ if (err)
+ goto free_map_locked;
+ }
+ }
- htab->elem_size = sizeof(struct htab_elem) +
- round_up(htab->map.key_size, 8) +
- htab->map.value_size;
return &htab->map;
+free_prealloc:
+ prealloc_destroy(htab);
+free_map_locked:
+ if (htab->use_percpu_counter)
+ percpu_counter_destroy(&htab->pcount);
+ bpf_map_area_free(htab->buckets);
+ bpf_mem_alloc_destroy(&htab->pcpu_ma);
+ bpf_mem_alloc_destroy(&htab->ma);
+free_elem_count:
+ bpf_map_free_elem_count(&htab->map);
free_htab:
- kfree(htab);
+ bpf_map_area_free(htab);
return ERR_PTR(err);
}
-static inline u32 htab_map_hash(const void *key, u32 key_len)
+static inline u32 htab_map_hash(const void *key, u32 key_len, u32 hashrnd)
{
- return jhash(key, key_len, 0);
+ if (likely(key_len % 4 == 0))
+ return jhash2(key, key_len / 4, hashrnd);
+ return jhash(key, key_len, hashrnd);
}
-static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
+static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
{
return &htab->buckets[hash & (htab->n_buckets - 1)];
}
-static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
+static inline struct hlist_nulls_head *select_bucket(struct bpf_htab *htab, u32 hash)
+{
+ return &__select_bucket(htab, hash)->head;
+}
+
+/* this lookup function can only be called with bucket lock taken */
+static struct htab_elem *lookup_elem_raw(struct hlist_nulls_head *head, u32 hash,
void *key, u32 key_size)
{
+ struct hlist_nulls_node *n;
struct htab_elem *l;
- hlist_for_each_entry_rcu(l, head, hash_node)
+ hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
if (l->hash == hash && !memcmp(&l->key, key, key_size))
return l;
return NULL;
}
-/* Called from syscall or from eBPF program */
-static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
+/* can be called without bucket lock. it will repeat the loop in
+ * the unlikely event when elements moved from one bucket into another
+ * while link list is being walked
+ */
+static struct htab_elem *lookup_nulls_elem_raw(struct hlist_nulls_head *head,
+ u32 hash, void *key,
+ u32 key_size, u32 n_buckets)
+{
+ struct hlist_nulls_node *n;
+ struct htab_elem *l;
+
+again:
+ hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
+ if (l->hash == hash && !memcmp(&l->key, key, key_size))
+ return l;
+
+ if (unlikely(get_nulls_value(n) != (hash & (n_buckets - 1))))
+ goto again;
+
+ return NULL;
+}
+
+/* Called from syscall or from eBPF program directly, so
+ * arguments have to match bpf_map_lookup_elem() exactly.
+ * The return value is adjusted by BPF instructions
+ * in htab_map_gen_lookup().
+ */
+static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- struct hlist_head *head;
+ struct hlist_nulls_head *head;
struct htab_elem *l;
u32 hash, key_size;
- /* Must be called with rcu_read_lock. */
- WARN_ON_ONCE(!rcu_read_lock_held());
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
key_size = map->key_size;
- hash = htab_map_hash(key, key_size);
+ hash = htab_map_hash(key, key_size, htab->hashrnd);
head = select_bucket(htab, hash);
- l = lookup_elem_raw(head, hash, key, key_size);
+ l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);
+
+ return l;
+}
+
+static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct htab_elem *l = __htab_map_lookup_elem(map, key);
if (l)
- return l->key + round_up(map->key_size, 8);
+ return htab_elem_value(l, map->key_size);
+
+ return NULL;
+}
+
+/* inline bpf_map_lookup_elem() call.
+ * Instead of:
+ * bpf_prog
+ * bpf_map_lookup_elem
+ * map->ops->map_lookup_elem
+ * htab_map_lookup_elem
+ * __htab_map_lookup_elem
+ * do:
+ * bpf_prog
+ * __htab_map_lookup_elem
+ */
+static int htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
+{
+ struct bpf_insn *insn = insn_buf;
+ const int ret = BPF_REG_0;
+
+ BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
+ (void *(*)(struct bpf_map *map, void *key))NULL));
+ *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
+ *insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
+ offsetof(struct htab_elem, key) +
+ round_up(map->key_size, 8));
+ return insn - insn_buf;
+}
+
+static __always_inline void *__htab_lru_map_lookup_elem(struct bpf_map *map,
+ void *key, const bool mark)
+{
+ struct htab_elem *l = __htab_map_lookup_elem(map, key);
+
+ if (l) {
+ if (mark)
+ bpf_lru_node_set_ref(&l->lru_node);
+ return htab_elem_value(l, map->key_size);
+ }
return NULL;
}
+static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ return __htab_lru_map_lookup_elem(map, key, true);
+}
+
+static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key)
+{
+ return __htab_lru_map_lookup_elem(map, key, false);
+}
+
+static int htab_lru_map_gen_lookup(struct bpf_map *map,
+ struct bpf_insn *insn_buf)
+{
+ struct bpf_insn *insn = insn_buf;
+ const int ret = BPF_REG_0;
+ const int ref_reg = BPF_REG_1;
+
+ BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
+ (void *(*)(struct bpf_map *map, void *key))NULL));
+ *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 4);
+ *insn++ = BPF_LDX_MEM(BPF_B, ref_reg, ret,
+ offsetof(struct htab_elem, lru_node) +
+ offsetof(struct bpf_lru_node, ref));
+ *insn++ = BPF_JMP_IMM(BPF_JNE, ref_reg, 0, 1);
+ *insn++ = BPF_ST_MEM(BPF_B, ret,
+ offsetof(struct htab_elem, lru_node) +
+ offsetof(struct bpf_lru_node, ref),
+ 1);
+ *insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
+ offsetof(struct htab_elem, key) +
+ round_up(map->key_size, 8));
+ return insn - insn_buf;
+}
+
+static void check_and_free_fields(struct bpf_htab *htab,
+ struct htab_elem *elem)
+{
+ if (IS_ERR_OR_NULL(htab->map.record))
+ return;
+
+ if (htab_is_percpu(htab)) {
+ void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size);
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu));
+ } else {
+ void *map_value = htab_elem_value(elem, htab->map.key_size);
+
+ bpf_obj_free_fields(htab->map.record, map_value);
+ }
+}
+
+/* It is called from the bpf_lru_list when the LRU needs to delete
+ * older elements from the htab.
+ */
+static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
+{
+ struct bpf_htab *htab = arg;
+ struct htab_elem *l = NULL, *tgt_l;
+ struct hlist_nulls_head *head;
+ struct hlist_nulls_node *n;
+ unsigned long flags;
+ struct bucket *b;
+ int ret;
+
+ tgt_l = container_of(node, struct htab_elem, lru_node);
+ b = __select_bucket(htab, tgt_l->hash);
+ head = &b->head;
+
+ ret = htab_lock_bucket(b, &flags);
+ if (ret)
+ return false;
+
+ hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
+ if (l == tgt_l) {
+ hlist_nulls_del_rcu(&l->hash_node);
+ bpf_map_dec_elem_count(&htab->map);
+ break;
+ }
+
+ htab_unlock_bucket(b, flags);
+
+ if (l == tgt_l)
+ check_and_free_fields(htab, l);
+ return l == tgt_l;
+}
+
/* Called from syscall */
static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- struct hlist_head *head;
+ struct hlist_nulls_head *head;
struct htab_elem *l, *next_l;
u32 hash, key_size;
- int i;
+ int i = 0;
WARN_ON_ONCE(!rcu_read_lock_held());
key_size = map->key_size;
- hash = htab_map_hash(key, key_size);
+ if (!key)
+ goto find_first_elem;
+
+ hash = htab_map_hash(key, key_size, htab->hashrnd);
head = select_bucket(htab, hash);
/* lookup the key */
- l = lookup_elem_raw(head, hash, key, key_size);
+ l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);
- if (!l) {
- i = 0;
+ if (!l)
goto find_first_elem;
- }
/* key was found, get next key in the same bucket */
- next_l = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&l->hash_node)),
+ next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_next_rcu(&l->hash_node)),
struct htab_elem, hash_node);
if (next_l) {
@@ -187,7 +855,7 @@ find_first_elem:
head = select_bucket(htab, i);
/* pick first element in the bucket */
- next_l = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
+ next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_first_rcu(head)),
struct htab_elem, hash_node);
if (next_l) {
/* if it's not empty, just return it */
@@ -196,115 +864,604 @@ find_first_elem:
}
}
- /* itereated over all buckets and all elements */
+ /* iterated over all buckets and all elements */
return -ENOENT;
}
+static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
+{
+ check_and_free_fields(htab, l);
+
+ if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
+ bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr);
+ bpf_mem_cache_free(&htab->ma, l);
+}
+
+static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
+{
+ struct bpf_map *map = &htab->map;
+ void *ptr;
+
+ if (map->ops->map_fd_put_ptr) {
+ ptr = fd_htab_map_get_ptr(map, l);
+ map->ops->map_fd_put_ptr(map, ptr, true);
+ }
+}
+
+static bool is_map_full(struct bpf_htab *htab)
+{
+ if (htab->use_percpu_counter)
+ return __percpu_counter_compare(&htab->pcount, htab->map.max_entries,
+ PERCPU_COUNTER_BATCH) >= 0;
+ return atomic_read(&htab->count) >= htab->map.max_entries;
+}
+
+static void inc_elem_count(struct bpf_htab *htab)
+{
+ bpf_map_inc_elem_count(&htab->map);
+
+ if (htab->use_percpu_counter)
+ percpu_counter_add_batch(&htab->pcount, 1, PERCPU_COUNTER_BATCH);
+ else
+ atomic_inc(&htab->count);
+}
+
+static void dec_elem_count(struct bpf_htab *htab)
+{
+ bpf_map_dec_elem_count(&htab->map);
+
+ if (htab->use_percpu_counter)
+ percpu_counter_add_batch(&htab->pcount, -1, PERCPU_COUNTER_BATCH);
+ else
+ atomic_dec(&htab->count);
+}
+
+
+static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
+{
+ htab_put_fd_value(htab, l);
+
+ if (htab_is_prealloc(htab)) {
+ bpf_map_dec_elem_count(&htab->map);
+ check_and_free_fields(htab, l);
+ pcpu_freelist_push(&htab->freelist, &l->fnode);
+ } else {
+ dec_elem_count(htab);
+ htab_elem_free(htab, l);
+ }
+}
+
+static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
+ void *value, bool onallcpus)
+{
+ void *ptr;
+
+ if (!onallcpus) {
+ /* copy true value_size bytes */
+ ptr = this_cpu_ptr(pptr);
+ copy_map_value(&htab->map, ptr, value);
+ bpf_obj_free_fields(htab->map.record, ptr);
+ } else {
+ u32 size = round_up(htab->map.value_size, 8);
+ int off = 0, cpu;
+
+ for_each_possible_cpu(cpu) {
+ ptr = per_cpu_ptr(pptr, cpu);
+ copy_map_value_long(&htab->map, ptr, value + off);
+ bpf_obj_free_fields(htab->map.record, ptr);
+ off += size;
+ }
+ }
+}
+
+static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr,
+ void *value, bool onallcpus)
+{
+ /* When not setting the initial value on all cpus, zero-fill element
+ * values for other cpus. Otherwise, bpf program has no way to ensure
+ * known initial values for cpus other than current one
+ * (onallcpus=false always when coming from bpf prog).
+ */
+ if (!onallcpus) {
+ int current_cpu = raw_smp_processor_id();
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ if (cpu == current_cpu)
+ copy_map_value_long(&htab->map, per_cpu_ptr(pptr, cpu), value);
+ else /* Since elem is preallocated, we cannot touch special fields */
+ zero_map_value(&htab->map, per_cpu_ptr(pptr, cpu));
+ }
+ } else {
+ pcpu_copy_value(htab, pptr, value, onallcpus);
+ }
+}
+
+static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab)
+{
+ return is_fd_htab(htab) && BITS_PER_LONG == 64;
+}
+
+static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
+ void *value, u32 key_size, u32 hash,
+ bool percpu, bool onallcpus,
+ struct htab_elem *old_elem)
+{
+ u32 size = htab->map.value_size;
+ bool prealloc = htab_is_prealloc(htab);
+ struct htab_elem *l_new, **pl_new;
+ void __percpu *pptr;
+
+ if (prealloc) {
+ if (old_elem) {
+ /* if we're updating the existing element,
+ * use per-cpu extra elems to avoid freelist_pop/push
+ */
+ pl_new = this_cpu_ptr(htab->extra_elems);
+ l_new = *pl_new;
+ *pl_new = old_elem;
+ } else {
+ struct pcpu_freelist_node *l;
+
+ l = __pcpu_freelist_pop(&htab->freelist);
+ if (!l)
+ return ERR_PTR(-E2BIG);
+ l_new = container_of(l, struct htab_elem, fnode);
+ bpf_map_inc_elem_count(&htab->map);
+ }
+ } else {
+ if (is_map_full(htab))
+ if (!old_elem)
+ /* when map is full and update() is replacing
+ * old element, it's ok to allocate, since
+ * old element will be freed immediately.
+ * Otherwise return an error
+ */
+ return ERR_PTR(-E2BIG);
+ inc_elem_count(htab);
+ l_new = bpf_mem_cache_alloc(&htab->ma);
+ if (!l_new) {
+ l_new = ERR_PTR(-ENOMEM);
+ goto dec_count;
+ }
+ }
+
+ memcpy(l_new->key, key, key_size);
+ if (percpu) {
+ if (prealloc) {
+ pptr = htab_elem_get_ptr(l_new, key_size);
+ } else {
+ /* alloc_percpu zero-fills */
+ void *ptr = bpf_mem_cache_alloc(&htab->pcpu_ma);
+
+ if (!ptr) {
+ bpf_mem_cache_free(&htab->ma, l_new);
+ l_new = ERR_PTR(-ENOMEM);
+ goto dec_count;
+ }
+ l_new->ptr_to_pptr = ptr;
+ pptr = *(void __percpu **)ptr;
+ }
+
+ pcpu_init_value(htab, pptr, value, onallcpus);
+
+ if (!prealloc)
+ htab_elem_set_ptr(l_new, key_size, pptr);
+ } else if (fd_htab_map_needs_adjust(htab)) {
+ size = round_up(size, 8);
+ memcpy(htab_elem_value(l_new, key_size), value, size);
+ } else {
+ copy_map_value(&htab->map, htab_elem_value(l_new, key_size), value);
+ }
+
+ l_new->hash = hash;
+ return l_new;
+dec_count:
+ dec_elem_count(htab);
+ return l_new;
+}
+
+static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
+ u64 map_flags)
+{
+ if (l_old && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST)
+ /* elem already exists */
+ return -EEXIST;
+
+ if (!l_old && (map_flags & ~BPF_F_LOCK) == BPF_EXIST)
+ /* elem doesn't exist, cannot update it */
+ return -ENOENT;
+
+ return 0;
+}
+
/* Called from syscall or from eBPF program */
-static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
- u64 map_flags)
+static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct htab_elem *l_new, *l_old;
- struct hlist_head *head;
+ struct hlist_nulls_head *head;
unsigned long flags;
- u32 key_size;
+ struct bucket *b;
+ u32 key_size, hash;
int ret;
- if (map_flags > BPF_EXIST)
+ if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST))
/* unknown flags */
return -EINVAL;
- WARN_ON_ONCE(!rcu_read_lock_held());
-
- /* allocate new element outside of lock */
- l_new = kmalloc(htab->elem_size, GFP_ATOMIC);
- if (!l_new)
- return -ENOMEM;
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
key_size = map->key_size;
- memcpy(l_new->key, key, key_size);
- memcpy(l_new->key + round_up(key_size, 8), value, map->value_size);
-
- l_new->hash = htab_map_hash(l_new->key, key_size);
+ hash = htab_map_hash(key, key_size, htab->hashrnd);
+
+ b = __select_bucket(htab, hash);
+ head = &b->head;
+
+ if (unlikely(map_flags & BPF_F_LOCK)) {
+ if (unlikely(!btf_record_has_field(map->record, BPF_SPIN_LOCK)))
+ return -EINVAL;
+ /* find an element without taking the bucket lock */
+ l_old = lookup_nulls_elem_raw(head, hash, key, key_size,
+ htab->n_buckets);
+ ret = check_flags(htab, l_old, map_flags);
+ if (ret)
+ return ret;
+ if (l_old) {
+ /* grab the element lock and update value in place */
+ copy_map_value_locked(map,
+ htab_elem_value(l_old, key_size),
+ value, false);
+ return 0;
+ }
+ /* fall through, grab the bucket lock and lookup again.
+ * 99.9% chance that the element won't be found,
+ * but second lookup under lock has to be done.
+ */
+ }
- /* bpf_map_update_elem() can be called in_irq() */
- spin_lock_irqsave(&htab->lock, flags);
+ ret = htab_lock_bucket(b, &flags);
+ if (ret)
+ return ret;
- head = select_bucket(htab, l_new->hash);
+ l_old = lookup_elem_raw(head, hash, key, key_size);
- l_old = lookup_elem_raw(head, l_new->hash, key, key_size);
+ ret = check_flags(htab, l_old, map_flags);
+ if (ret)
+ goto err;
- if (!l_old && unlikely(htab->count >= map->max_entries)) {
- /* if elem with this 'key' doesn't exist and we've reached
- * max_entries limit, fail insertion of new elem
+ if (unlikely(l_old && (map_flags & BPF_F_LOCK))) {
+ /* first lookup without the bucket lock didn't find the element,
+ * but second lookup with the bucket lock found it.
+ * This case is highly unlikely, but has to be dealt with:
+ * grab the element lock in addition to the bucket lock
+ * and update element in place
*/
- ret = -E2BIG;
+ copy_map_value_locked(map,
+ htab_elem_value(l_old, key_size),
+ value, false);
+ ret = 0;
goto err;
}
- if (l_old && map_flags == BPF_NOEXIST) {
- /* elem already exists */
- ret = -EEXIST;
+ l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
+ l_old);
+ if (IS_ERR(l_new)) {
+ /* all pre-allocated elements are in use or memory exhausted */
+ ret = PTR_ERR(l_new);
goto err;
}
- if (!l_old && map_flags == BPF_EXIST) {
- /* elem doesn't exist, cannot update it */
- ret = -ENOENT;
- goto err;
+ /* add new element to the head of the list, so that
+ * concurrent search will find it before old elem
+ */
+ hlist_nulls_add_head_rcu(&l_new->hash_node, head);
+ if (l_old) {
+ hlist_nulls_del_rcu(&l_old->hash_node);
+
+ /* l_old has already been stashed in htab->extra_elems, free
+ * its special fields before it is available for reuse.
+ */
+ if (htab_is_prealloc(htab))
+ check_and_free_fields(htab, l_old);
}
+ htab_unlock_bucket(b, flags);
+ if (l_old && !htab_is_prealloc(htab))
+ free_htab_elem(htab, l_old);
+ return 0;
+err:
+ htab_unlock_bucket(b, flags);
+ return ret;
+}
+
+static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem)
+{
+ check_and_free_fields(htab, elem);
+ bpf_map_dec_elem_count(&htab->map);
+ bpf_lru_push_free(&htab->lru, &elem->lru_node);
+}
+
+static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct htab_elem *l_new, *l_old = NULL;
+ struct hlist_nulls_head *head;
+ unsigned long flags;
+ struct bucket *b;
+ u32 key_size, hash;
+ int ret;
+
+ if (unlikely(map_flags > BPF_EXIST))
+ /* unknown flags */
+ return -EINVAL;
+
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+
+ key_size = map->key_size;
+
+ hash = htab_map_hash(key, key_size, htab->hashrnd);
+
+ b = __select_bucket(htab, hash);
+ head = &b->head;
+
+ /* For LRU, we need to alloc before taking bucket's
+ * spinlock because getting free nodes from LRU may need
+ * to remove older elements from htab and this removal
+ * operation will need a bucket lock.
+ */
+ l_new = prealloc_lru_pop(htab, key, hash);
+ if (!l_new)
+ return -ENOMEM;
+ copy_map_value(&htab->map, htab_elem_value(l_new, map->key_size), value);
+
+ ret = htab_lock_bucket(b, &flags);
+ if (ret)
+ goto err_lock_bucket;
- /* add new element to the head of the list, so that concurrent
- * search will find it before old elem
+ l_old = lookup_elem_raw(head, hash, key, key_size);
+
+ ret = check_flags(htab, l_old, map_flags);
+ if (ret)
+ goto err;
+
+ /* add new element to the head of the list, so that
+ * concurrent search will find it before old elem
*/
- hlist_add_head_rcu(&l_new->hash_node, head);
+ hlist_nulls_add_head_rcu(&l_new->hash_node, head);
if (l_old) {
- hlist_del_rcu(&l_old->hash_node);
- kfree_rcu(l_old, rcu);
+ bpf_lru_node_set_ref(&l_new->lru_node);
+ hlist_nulls_del_rcu(&l_old->hash_node);
+ }
+ ret = 0;
+
+err:
+ htab_unlock_bucket(b, flags);
+
+err_lock_bucket:
+ if (ret)
+ htab_lru_push_free(htab, l_new);
+ else if (l_old)
+ htab_lru_push_free(htab, l_old);
+
+ return ret;
+}
+
+static long htab_map_update_elem_in_place(struct bpf_map *map, void *key,
+ void *value, u64 map_flags,
+ bool percpu, bool onallcpus)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct htab_elem *l_new, *l_old;
+ struct hlist_nulls_head *head;
+ void *old_map_ptr = NULL;
+ unsigned long flags;
+ struct bucket *b;
+ u32 key_size, hash;
+ int ret;
+
+ if (unlikely(map_flags > BPF_EXIST))
+ /* unknown flags */
+ return -EINVAL;
+
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+
+ key_size = map->key_size;
+
+ hash = htab_map_hash(key, key_size, htab->hashrnd);
+
+ b = __select_bucket(htab, hash);
+ head = &b->head;
+
+ ret = htab_lock_bucket(b, &flags);
+ if (ret)
+ return ret;
+
+ l_old = lookup_elem_raw(head, hash, key, key_size);
+
+ ret = check_flags(htab, l_old, map_flags);
+ if (ret)
+ goto err;
+
+ if (l_old) {
+ /* Update value in-place */
+ if (percpu) {
+ pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
+ value, onallcpus);
+ } else {
+ void **inner_map_pptr = htab_elem_value(l_old, key_size);
+
+ old_map_ptr = *inner_map_pptr;
+ WRITE_ONCE(*inner_map_pptr, *(void **)value);
+ }
} else {
- htab->count++;
+ l_new = alloc_htab_elem(htab, key, value, key_size,
+ hash, percpu, onallcpus, NULL);
+ if (IS_ERR(l_new)) {
+ ret = PTR_ERR(l_new);
+ goto err;
+ }
+ hlist_nulls_add_head_rcu(&l_new->hash_node, head);
}
- spin_unlock_irqrestore(&htab->lock, flags);
+err:
+ htab_unlock_bucket(b, flags);
+ if (old_map_ptr)
+ map->ops->map_fd_put_ptr(map, old_map_ptr, true);
+ return ret;
+}
- return 0;
+static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags,
+ bool onallcpus)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct htab_elem *l_new = NULL, *l_old;
+ struct hlist_nulls_head *head;
+ unsigned long flags;
+ struct bucket *b;
+ u32 key_size, hash;
+ int ret;
+
+ if (unlikely(map_flags > BPF_EXIST))
+ /* unknown flags */
+ return -EINVAL;
+
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+
+ key_size = map->key_size;
+
+ hash = htab_map_hash(key, key_size, htab->hashrnd);
+
+ b = __select_bucket(htab, hash);
+ head = &b->head;
+
+ /* For LRU, we need to alloc before taking bucket's
+ * spinlock because LRU's elem alloc may need
+ * to remove older elem from htab and this removal
+ * operation will need a bucket lock.
+ */
+ if (map_flags != BPF_EXIST) {
+ l_new = prealloc_lru_pop(htab, key, hash);
+ if (!l_new)
+ return -ENOMEM;
+ }
+
+ ret = htab_lock_bucket(b, &flags);
+ if (ret)
+ goto err_lock_bucket;
+
+ l_old = lookup_elem_raw(head, hash, key, key_size);
+
+ ret = check_flags(htab, l_old, map_flags);
+ if (ret)
+ goto err;
+
+ if (l_old) {
+ bpf_lru_node_set_ref(&l_old->lru_node);
+
+ /* per-cpu hash map can update value in-place */
+ pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
+ value, onallcpus);
+ } else {
+ pcpu_init_value(htab, htab_elem_get_ptr(l_new, key_size),
+ value, onallcpus);
+ hlist_nulls_add_head_rcu(&l_new->hash_node, head);
+ l_new = NULL;
+ }
+ ret = 0;
err:
- spin_unlock_irqrestore(&htab->lock, flags);
- kfree(l_new);
+ htab_unlock_bucket(b, flags);
+err_lock_bucket:
+ if (l_new) {
+ bpf_map_dec_elem_count(&htab->map);
+ bpf_lru_push_free(&htab->lru, &l_new->lru_node);
+ }
return ret;
}
+static long htab_percpu_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
+{
+ return htab_map_update_elem_in_place(map, key, value, map_flags, true, false);
+}
+
+static long htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
+{
+ return __htab_lru_percpu_map_update_elem(map, key, value, map_flags,
+ false);
+}
+
/* Called from syscall or from eBPF program */
-static int htab_map_delete_elem(struct bpf_map *map, void *key)
+static long htab_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- struct hlist_head *head;
+ struct hlist_nulls_head *head;
+ struct bucket *b;
struct htab_elem *l;
unsigned long flags;
u32 hash, key_size;
- int ret = -ENOENT;
+ int ret;
- WARN_ON_ONCE(!rcu_read_lock_held());
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
key_size = map->key_size;
- hash = htab_map_hash(key, key_size);
+ hash = htab_map_hash(key, key_size, htab->hashrnd);
+ b = __select_bucket(htab, hash);
+ head = &b->head;
- spin_lock_irqsave(&htab->lock, flags);
+ ret = htab_lock_bucket(b, &flags);
+ if (ret)
+ return ret;
- head = select_bucket(htab, hash);
+ l = lookup_elem_raw(head, hash, key, key_size);
+ if (l)
+ hlist_nulls_del_rcu(&l->hash_node);
+ else
+ ret = -ENOENT;
+
+ htab_unlock_bucket(b, flags);
+
+ if (l)
+ free_htab_elem(htab, l);
+ return ret;
+}
+
+static long htab_lru_map_delete_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct hlist_nulls_head *head;
+ struct bucket *b;
+ struct htab_elem *l;
+ unsigned long flags;
+ u32 hash, key_size;
+ int ret;
+
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+
+ key_size = map->key_size;
+
+ hash = htab_map_hash(key, key_size, htab->hashrnd);
+ b = __select_bucket(htab, hash);
+ head = &b->head;
+
+ ret = htab_lock_bucket(b, &flags);
+ if (ret)
+ return ret;
l = lookup_elem_raw(head, hash, key, key_size);
- if (l) {
- hlist_del_rcu(&l->hash_node);
- htab->count--;
- kfree_rcu(l, rcu);
- ret = 0;
- }
+ if (l)
+ hlist_nulls_del_rcu(&l->hash_node);
+ else
+ ret = -ENOENT;
- spin_unlock_irqrestore(&htab->lock, flags);
+ htab_unlock_bucket(b, flags);
+ if (l)
+ htab_lru_push_free(htab, l);
return ret;
}
@@ -312,17 +1469,54 @@ static void delete_all_elements(struct bpf_htab *htab)
{
int i;
+ /* It's called from a worker thread and migration has been disabled,
+ * therefore, it is OK to invoke bpf_mem_cache_free() directly.
+ */
+ for (i = 0; i < htab->n_buckets; i++) {
+ struct hlist_nulls_head *head = select_bucket(htab, i);
+ struct hlist_nulls_node *n;
+ struct htab_elem *l;
+
+ hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
+ hlist_nulls_del_rcu(&l->hash_node);
+ htab_elem_free(htab, l);
+ }
+ cond_resched();
+ }
+}
+
+static void htab_free_malloced_internal_structs(struct bpf_htab *htab)
+{
+ int i;
+
+ rcu_read_lock();
for (i = 0; i < htab->n_buckets; i++) {
- struct hlist_head *head = select_bucket(htab, i);
- struct hlist_node *n;
+ struct hlist_nulls_head *head = select_bucket(htab, i);
+ struct hlist_nulls_node *n;
struct htab_elem *l;
- hlist_for_each_entry_safe(l, n, head, hash_node) {
- hlist_del_rcu(&l->hash_node);
- htab->count--;
- kfree(l);
+ hlist_nulls_for_each_entry(l, n, head, hash_node) {
+ /* We only free internal structs on uref dropping to zero */
+ bpf_map_free_internal_structs(&htab->map,
+ htab_elem_value(l, htab->map.key_size));
}
+ cond_resched_rcu();
}
+ rcu_read_unlock();
+}
+
+static void htab_map_free_internal_structs(struct bpf_map *map)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+
+ /* We only free internal structs on uref dropping to zero */
+ if (!bpf_map_has_internal_structs(map))
+ return;
+
+ if (htab_is_prealloc(htab))
+ htab_free_prealloced_internal_structs(htab);
+ else
+ htab_free_malloced_internal_structs(htab);
}
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
@@ -330,38 +1524,1097 @@ static void htab_map_free(struct bpf_map *map)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
- * so the programs (can be more than one that used this map) were
- * disconnected from events. Wait for outstanding critical sections in
- * these programs to complete
+ /* bpf_free_used_maps() or close(map_fd) will trigger this map_free callback.
+ * bpf_free_used_maps() is called after bpf prog is no longer executing.
+ * There is no need to synchronize_rcu() here to protect map elements.
+ */
+
+ /* htab no longer uses call_rcu() directly. bpf_mem_alloc does it
+ * underneath and is responsible for waiting for callbacks to finish
+ * during bpf_mem_alloc_destroy().
+ */
+ if (!htab_is_prealloc(htab)) {
+ delete_all_elements(htab);
+ } else {
+ htab_free_prealloced_fields(htab);
+ prealloc_destroy(htab);
+ }
+
+ bpf_map_free_elem_count(map);
+ free_percpu(htab->extra_elems);
+ bpf_map_area_free(htab->buckets);
+ bpf_mem_alloc_destroy(&htab->pcpu_ma);
+ bpf_mem_alloc_destroy(&htab->ma);
+ if (htab->use_percpu_counter)
+ percpu_counter_destroy(&htab->pcount);
+ bpf_map_area_free(htab);
+}
+
+static void htab_map_seq_show_elem(struct bpf_map *map, void *key,
+ struct seq_file *m)
+{
+ void *value;
+
+ rcu_read_lock();
+
+ value = htab_map_lookup_elem(map, key);
+ if (!value) {
+ rcu_read_unlock();
+ return;
+ }
+
+ btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
+ seq_puts(m, ": ");
+ btf_type_seq_show(map->btf, map->btf_value_type_id, value, m);
+ seq_putc(m, '\n');
+
+ rcu_read_unlock();
+}
+
+static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
+ void *value, bool is_lru_map,
+ bool is_percpu, u64 flags)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct hlist_nulls_head *head;
+ unsigned long bflags;
+ struct htab_elem *l;
+ u32 hash, key_size;
+ struct bucket *b;
+ int ret;
+
+ key_size = map->key_size;
+
+ hash = htab_map_hash(key, key_size, htab->hashrnd);
+ b = __select_bucket(htab, hash);
+ head = &b->head;
+
+ ret = htab_lock_bucket(b, &bflags);
+ if (ret)
+ return ret;
+
+ l = lookup_elem_raw(head, hash, key, key_size);
+ if (!l) {
+ ret = -ENOENT;
+ goto out_unlock;
+ }
+
+ if (is_percpu) {
+ u32 roundup_value_size = round_up(map->value_size, 8);
+ void __percpu *pptr;
+ int off = 0, cpu;
+
+ pptr = htab_elem_get_ptr(l, key_size);
+ for_each_possible_cpu(cpu) {
+ copy_map_value_long(&htab->map, value + off, per_cpu_ptr(pptr, cpu));
+ check_and_init_map_value(&htab->map, value + off);
+ off += roundup_value_size;
+ }
+ } else {
+ void *src = htab_elem_value(l, map->key_size);
+
+ if (flags & BPF_F_LOCK)
+ copy_map_value_locked(map, value, src, true);
+ else
+ copy_map_value(map, value, src);
+ /* Zeroing special fields in the temp buffer */
+ check_and_init_map_value(map, value);
+ }
+ hlist_nulls_del_rcu(&l->hash_node);
+
+out_unlock:
+ htab_unlock_bucket(b, bflags);
+
+ if (l) {
+ if (is_lru_map)
+ htab_lru_push_free(htab, l);
+ else
+ free_htab_elem(htab, l);
+ }
+
+ return ret;
+}
+
+static int htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
+{
+ return __htab_map_lookup_and_delete_elem(map, key, value, false, false,
+ flags);
+}
+
+static int htab_percpu_map_lookup_and_delete_elem(struct bpf_map *map,
+ void *key, void *value,
+ u64 flags)
+{
+ return __htab_map_lookup_and_delete_elem(map, key, value, false, true,
+ flags);
+}
+
+static int htab_lru_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
+{
+ return __htab_map_lookup_and_delete_elem(map, key, value, true, false,
+ flags);
+}
+
+static int htab_lru_percpu_map_lookup_and_delete_elem(struct bpf_map *map,
+ void *key, void *value,
+ u64 flags)
+{
+ return __htab_map_lookup_and_delete_elem(map, key, value, true, true,
+ flags);
+}
+
+static int
+__htab_map_lookup_and_delete_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr,
+ bool do_delete, bool is_lru_map,
+ bool is_percpu)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val;
+ void __user *uvalues = u64_to_user_ptr(attr->batch.values);
+ void __user *ukeys = u64_to_user_ptr(attr->batch.keys);
+ void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch);
+ u32 batch, max_count, size, bucket_size, map_id;
+ u32 bucket_cnt, total, key_size, value_size;
+ struct htab_elem *node_to_free = NULL;
+ u64 elem_map_flags, map_flags;
+ struct hlist_nulls_head *head;
+ struct hlist_nulls_node *n;
+ unsigned long flags = 0;
+ bool locked = false;
+ struct htab_elem *l;
+ struct bucket *b;
+ int ret = 0;
+
+ elem_map_flags = attr->batch.elem_flags;
+ if ((elem_map_flags & ~BPF_F_LOCK) ||
+ ((elem_map_flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK)))
+ return -EINVAL;
+
+ map_flags = attr->batch.flags;
+ if (map_flags)
+ return -EINVAL;
+
+ max_count = attr->batch.count;
+ if (!max_count)
+ return 0;
+
+ if (put_user(0, &uattr->batch.count))
+ return -EFAULT;
+
+ batch = 0;
+ if (ubatch && copy_from_user(&batch, ubatch, sizeof(batch)))
+ return -EFAULT;
+
+ if (batch >= htab->n_buckets)
+ return -ENOENT;
+
+ key_size = htab->map.key_size;
+ value_size = htab->map.value_size;
+ size = round_up(value_size, 8);
+ if (is_percpu)
+ value_size = size * num_possible_cpus();
+ total = 0;
+ /* while experimenting with hash tables with sizes ranging from 10 to
+ * 1000, it was observed that a bucket can have up to 5 entries.
+ */
+ bucket_size = 5;
+
+alloc:
+ /* We cannot do copy_from_user or copy_to_user inside
+ * the rcu_read_lock. Allocate enough space here.
+ */
+ keys = kvmalloc_array(key_size, bucket_size, GFP_USER | __GFP_NOWARN);
+ values = kvmalloc_array(value_size, bucket_size, GFP_USER | __GFP_NOWARN);
+ if (!keys || !values) {
+ ret = -ENOMEM;
+ goto after_loop;
+ }
+
+again:
+ bpf_disable_instrumentation();
+ rcu_read_lock();
+again_nocopy:
+ dst_key = keys;
+ dst_val = values;
+ b = &htab->buckets[batch];
+ head = &b->head;
+ /* do not grab the lock unless need it (bucket_cnt > 0). */
+ if (locked) {
+ ret = htab_lock_bucket(b, &flags);
+ if (ret) {
+ rcu_read_unlock();
+ bpf_enable_instrumentation();
+ goto after_loop;
+ }
+ }
+
+ bucket_cnt = 0;
+ hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
+ bucket_cnt++;
+
+ if (bucket_cnt && !locked) {
+ locked = true;
+ goto again_nocopy;
+ }
+
+ if (bucket_cnt > (max_count - total)) {
+ if (total == 0)
+ ret = -ENOSPC;
+ /* Note that since bucket_cnt > 0 here, it is implicit
+ * that the locked was grabbed, so release it.
+ */
+ htab_unlock_bucket(b, flags);
+ rcu_read_unlock();
+ bpf_enable_instrumentation();
+ goto after_loop;
+ }
+
+ if (bucket_cnt > bucket_size) {
+ bucket_size = bucket_cnt;
+ /* Note that since bucket_cnt > 0 here, it is implicit
+ * that the locked was grabbed, so release it.
+ */
+ htab_unlock_bucket(b, flags);
+ rcu_read_unlock();
+ bpf_enable_instrumentation();
+ kvfree(keys);
+ kvfree(values);
+ goto alloc;
+ }
+
+ /* Next block is only safe to run if you have grabbed the lock */
+ if (!locked)
+ goto next_batch;
+
+ hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
+ memcpy(dst_key, l->key, key_size);
+
+ if (is_percpu) {
+ int off = 0, cpu;
+ void __percpu *pptr;
+
+ pptr = htab_elem_get_ptr(l, map->key_size);
+ for_each_possible_cpu(cpu) {
+ copy_map_value_long(&htab->map, dst_val + off, per_cpu_ptr(pptr, cpu));
+ check_and_init_map_value(&htab->map, dst_val + off);
+ off += size;
+ }
+ } else {
+ value = htab_elem_value(l, key_size);
+ if (is_fd_htab(htab)) {
+ struct bpf_map **inner_map = value;
+
+ /* Actual value is the id of the inner map */
+ map_id = map->ops->map_fd_sys_lookup_elem(*inner_map);
+ value = &map_id;
+ }
+
+ if (elem_map_flags & BPF_F_LOCK)
+ copy_map_value_locked(map, dst_val, value,
+ true);
+ else
+ copy_map_value(map, dst_val, value);
+ /* Zeroing special fields in the temp buffer */
+ check_and_init_map_value(map, dst_val);
+ }
+ if (do_delete) {
+ hlist_nulls_del_rcu(&l->hash_node);
+
+ /* bpf_lru_push_free() will acquire lru_lock, which
+ * may cause deadlock. See comments in function
+ * prealloc_lru_pop(). Let us do bpf_lru_push_free()
+ * after releasing the bucket lock.
+ *
+ * For htab of maps, htab_put_fd_value() in
+ * free_htab_elem() may acquire a spinlock with bucket
+ * lock being held and it violates the lock rule, so
+ * invoke free_htab_elem() after unlock as well.
+ */
+ l->batch_flink = node_to_free;
+ node_to_free = l;
+ }
+ dst_key += key_size;
+ dst_val += value_size;
+ }
+
+ htab_unlock_bucket(b, flags);
+ locked = false;
+
+ while (node_to_free) {
+ l = node_to_free;
+ node_to_free = node_to_free->batch_flink;
+ if (is_lru_map)
+ htab_lru_push_free(htab, l);
+ else
+ free_htab_elem(htab, l);
+ }
+
+next_batch:
+ /* If we are not copying data, we can go to next bucket and avoid
+ * unlocking the rcu.
*/
- synchronize_rcu();
+ if (!bucket_cnt && (batch + 1 < htab->n_buckets)) {
+ batch++;
+ goto again_nocopy;
+ }
+
+ rcu_read_unlock();
+ bpf_enable_instrumentation();
+ if (bucket_cnt && (copy_to_user(ukeys + total * key_size, keys,
+ key_size * bucket_cnt) ||
+ copy_to_user(uvalues + total * value_size, values,
+ value_size * bucket_cnt))) {
+ ret = -EFAULT;
+ goto after_loop;
+ }
+
+ total += bucket_cnt;
+ batch++;
+ if (batch >= htab->n_buckets) {
+ ret = -ENOENT;
+ goto after_loop;
+ }
+ goto again;
+
+after_loop:
+ if (ret == -EFAULT)
+ goto out;
+
+ /* copy # of entries and next batch */
+ ubatch = u64_to_user_ptr(attr->batch.out_batch);
+ if (copy_to_user(ubatch, &batch, sizeof(batch)) ||
+ put_user(total, &uattr->batch.count))
+ ret = -EFAULT;
+
+out:
+ kvfree(keys);
+ kvfree(values);
+ return ret;
+}
+
+static int
+htab_percpu_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
+ false, true);
+}
+
+static int
+htab_percpu_map_lookup_and_delete_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
+ false, true);
+}
+
+static int
+htab_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
+ false, false);
+}
+
+static int
+htab_map_lookup_and_delete_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
+ false, false);
+}
+
+static int
+htab_lru_percpu_map_lookup_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
+ true, true);
+}
+
+static int
+htab_lru_percpu_map_lookup_and_delete_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
+ true, true);
+}
+
+static int
+htab_lru_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
+ true, false);
+}
+
+static int
+htab_lru_map_lookup_and_delete_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
+ true, false);
+}
+
+struct bpf_iter_seq_hash_map_info {
+ struct bpf_map *map;
+ struct bpf_htab *htab;
+ void *percpu_value_buf; // non-zero means percpu hash
+ u32 bucket_id;
+ u32 skip_elems;
+};
+
+static struct htab_elem *
+bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info *info,
+ struct htab_elem *prev_elem)
+{
+ const struct bpf_htab *htab = info->htab;
+ u32 skip_elems = info->skip_elems;
+ u32 bucket_id = info->bucket_id;
+ struct hlist_nulls_head *head;
+ struct hlist_nulls_node *n;
+ struct htab_elem *elem;
+ struct bucket *b;
+ u32 i, count;
+
+ if (bucket_id >= htab->n_buckets)
+ return NULL;
+
+ /* try to find next elem in the same bucket */
+ if (prev_elem) {
+ /* no update/deletion on this bucket, prev_elem should be still valid
+ * and we won't skip elements.
+ */
+ n = rcu_dereference_raw(hlist_nulls_next_rcu(&prev_elem->hash_node));
+ elem = hlist_nulls_entry_safe(n, struct htab_elem, hash_node);
+ if (elem)
+ return elem;
+
+ /* not found, unlock and go to the next bucket */
+ b = &htab->buckets[bucket_id++];
+ rcu_read_unlock();
+ skip_elems = 0;
+ }
+
+ for (i = bucket_id; i < htab->n_buckets; i++) {
+ b = &htab->buckets[i];
+ rcu_read_lock();
+
+ count = 0;
+ head = &b->head;
+ hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) {
+ if (count >= skip_elems) {
+ info->bucket_id = i;
+ info->skip_elems = count;
+ return elem;
+ }
+ count++;
+ }
+
+ rcu_read_unlock();
+ skip_elems = 0;
+ }
+
+ info->bucket_id = i;
+ info->skip_elems = 0;
+ return NULL;
+}
+
+static void *bpf_hash_map_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct bpf_iter_seq_hash_map_info *info = seq->private;
+ struct htab_elem *elem;
+
+ elem = bpf_hash_map_seq_find_next(info, NULL);
+ if (!elem)
+ return NULL;
+
+ if (*pos == 0)
+ ++*pos;
+ return elem;
+}
+
+static void *bpf_hash_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct bpf_iter_seq_hash_map_info *info = seq->private;
+
+ ++*pos;
+ ++info->skip_elems;
+ return bpf_hash_map_seq_find_next(info, v);
+}
+
+static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem)
+{
+ struct bpf_iter_seq_hash_map_info *info = seq->private;
+ struct bpf_iter__bpf_map_elem ctx = {};
+ struct bpf_map *map = info->map;
+ struct bpf_iter_meta meta;
+ int ret = 0, off = 0, cpu;
+ u32 roundup_value_size;
+ struct bpf_prog *prog;
+ void __percpu *pptr;
+
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, elem == NULL);
+ if (prog) {
+ ctx.meta = &meta;
+ ctx.map = info->map;
+ if (elem) {
+ ctx.key = elem->key;
+ if (!info->percpu_value_buf) {
+ ctx.value = htab_elem_value(elem, map->key_size);
+ } else {
+ roundup_value_size = round_up(map->value_size, 8);
+ pptr = htab_elem_get_ptr(elem, map->key_size);
+ for_each_possible_cpu(cpu) {
+ copy_map_value_long(map, info->percpu_value_buf + off,
+ per_cpu_ptr(pptr, cpu));
+ check_and_init_map_value(map, info->percpu_value_buf + off);
+ off += roundup_value_size;
+ }
+ ctx.value = info->percpu_value_buf;
+ }
+ }
+ ret = bpf_iter_run_prog(prog, &ctx);
+ }
+
+ return ret;
+}
+
+static int bpf_hash_map_seq_show(struct seq_file *seq, void *v)
+{
+ return __bpf_hash_map_seq_show(seq, v);
+}
+
+static void bpf_hash_map_seq_stop(struct seq_file *seq, void *v)
+{
+ if (!v)
+ (void)__bpf_hash_map_seq_show(seq, NULL);
+ else
+ rcu_read_unlock();
+}
+
+static int bpf_iter_init_hash_map(void *priv_data,
+ struct bpf_iter_aux_info *aux)
+{
+ struct bpf_iter_seq_hash_map_info *seq_info = priv_data;
+ struct bpf_map *map = aux->map;
+ void *value_buf;
+ u32 buf_size;
+
+ if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
+ buf_size = round_up(map->value_size, 8) * num_possible_cpus();
+ value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN);
+ if (!value_buf)
+ return -ENOMEM;
+
+ seq_info->percpu_value_buf = value_buf;
+ }
+
+ bpf_map_inc_with_uref(map);
+ seq_info->map = map;
+ seq_info->htab = container_of(map, struct bpf_htab, map);
+ return 0;
+}
+
+static void bpf_iter_fini_hash_map(void *priv_data)
+{
+ struct bpf_iter_seq_hash_map_info *seq_info = priv_data;
+
+ bpf_map_put_with_uref(seq_info->map);
+ kfree(seq_info->percpu_value_buf);
+}
+
+static const struct seq_operations bpf_hash_map_seq_ops = {
+ .start = bpf_hash_map_seq_start,
+ .next = bpf_hash_map_seq_next,
+ .stop = bpf_hash_map_seq_stop,
+ .show = bpf_hash_map_seq_show,
+};
+
+static const struct bpf_iter_seq_info iter_seq_info = {
+ .seq_ops = &bpf_hash_map_seq_ops,
+ .init_seq_private = bpf_iter_init_hash_map,
+ .fini_seq_private = bpf_iter_fini_hash_map,
+ .seq_priv_size = sizeof(struct bpf_iter_seq_hash_map_info),
+};
+
+static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_fn,
+ void *callback_ctx, u64 flags)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct hlist_nulls_head *head;
+ struct hlist_nulls_node *n;
+ struct htab_elem *elem;
+ int i, num_elems = 0;
+ void __percpu *pptr;
+ struct bucket *b;
+ void *key, *val;
+ bool is_percpu;
+ u64 ret = 0;
+
+ cant_migrate();
+
+ if (flags != 0)
+ return -EINVAL;
+
+ is_percpu = htab_is_percpu(htab);
- /* some of kfree_rcu() callbacks for elements of this map may not have
- * executed. It's ok. Proceed to free residual elements and map itself
+ /* migration has been disabled, so percpu value prepared here will be
+ * the same as the one seen by the bpf program with
+ * bpf_map_lookup_elem().
*/
- delete_all_elements(htab);
- kvfree(htab->buckets);
- kfree(htab);
+ for (i = 0; i < htab->n_buckets; i++) {
+ b = &htab->buckets[i];
+ rcu_read_lock();
+ head = &b->head;
+ hlist_nulls_for_each_entry_safe(elem, n, head, hash_node) {
+ key = elem->key;
+ if (is_percpu) {
+ /* current cpu value for percpu map */
+ pptr = htab_elem_get_ptr(elem, map->key_size);
+ val = this_cpu_ptr(pptr);
+ } else {
+ val = htab_elem_value(elem, map->key_size);
+ }
+ num_elems++;
+ ret = callback_fn((u64)(long)map, (u64)(long)key,
+ (u64)(long)val, (u64)(long)callback_ctx, 0);
+ /* return value: 0 - continue, 1 - stop and return */
+ if (ret) {
+ rcu_read_unlock();
+ goto out;
+ }
+ }
+ rcu_read_unlock();
+ }
+out:
+ return num_elems;
}
-static const struct bpf_map_ops htab_ops = {
+static u64 htab_map_mem_usage(const struct bpf_map *map)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ u32 value_size = round_up(htab->map.value_size, 8);
+ bool prealloc = htab_is_prealloc(htab);
+ bool percpu = htab_is_percpu(htab);
+ bool lru = htab_is_lru(htab);
+ u64 num_entries;
+ u64 usage = sizeof(struct bpf_htab);
+
+ usage += sizeof(struct bucket) * htab->n_buckets;
+ usage += sizeof(int) * num_possible_cpus() * HASHTAB_MAP_LOCK_COUNT;
+ if (prealloc) {
+ num_entries = map->max_entries;
+ if (htab_has_extra_elems(htab))
+ num_entries += num_possible_cpus();
+
+ usage += htab->elem_size * num_entries;
+
+ if (percpu)
+ usage += value_size * num_possible_cpus() * num_entries;
+ else if (!lru)
+ usage += sizeof(struct htab_elem *) * num_possible_cpus();
+ } else {
+#define LLIST_NODE_SZ sizeof(struct llist_node)
+
+ num_entries = htab->use_percpu_counter ?
+ percpu_counter_sum(&htab->pcount) :
+ atomic_read(&htab->count);
+ usage += (htab->elem_size + LLIST_NODE_SZ) * num_entries;
+ if (percpu) {
+ usage += (LLIST_NODE_SZ + sizeof(void *)) * num_entries;
+ usage += value_size * num_possible_cpus() * num_entries;
+ }
+ }
+ return usage;
+}
+
+BTF_ID_LIST_SINGLE(htab_map_btf_ids, struct, bpf_htab)
+const struct bpf_map_ops htab_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
+ .map_alloc_check = htab_map_alloc_check,
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
+ .map_release_uref = htab_map_free_internal_structs,
.map_lookup_elem = htab_map_lookup_elem,
+ .map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem,
.map_update_elem = htab_map_update_elem,
.map_delete_elem = htab_map_delete_elem,
+ .map_gen_lookup = htab_map_gen_lookup,
+ .map_seq_show_elem = htab_map_seq_show_elem,
+ .map_set_for_each_callback_args = map_set_for_each_callback_args,
+ .map_for_each_callback = bpf_for_each_hash_elem,
+ .map_mem_usage = htab_map_mem_usage,
+ BATCH_OPS(htab),
+ .map_btf_id = &htab_map_btf_ids[0],
+ .iter_seq_info = &iter_seq_info,
};
-static struct bpf_map_type_list htab_type __read_mostly = {
- .ops = &htab_ops,
- .type = BPF_MAP_TYPE_HASH,
+const struct bpf_map_ops htab_lru_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
+ .map_alloc_check = htab_map_alloc_check,
+ .map_alloc = htab_map_alloc,
+ .map_free = htab_map_free,
+ .map_get_next_key = htab_map_get_next_key,
+ .map_release_uref = htab_map_free_internal_structs,
+ .map_lookup_elem = htab_lru_map_lookup_elem,
+ .map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem,
+ .map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys,
+ .map_update_elem = htab_lru_map_update_elem,
+ .map_delete_elem = htab_lru_map_delete_elem,
+ .map_gen_lookup = htab_lru_map_gen_lookup,
+ .map_seq_show_elem = htab_map_seq_show_elem,
+ .map_set_for_each_callback_args = map_set_for_each_callback_args,
+ .map_for_each_callback = bpf_for_each_hash_elem,
+ .map_mem_usage = htab_map_mem_usage,
+ BATCH_OPS(htab_lru),
+ .map_btf_id = &htab_map_btf_ids[0],
+ .iter_seq_info = &iter_seq_info,
};
-static int __init register_htab_map(void)
+/* Called from eBPF program */
+static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
{
- bpf_register_map_type(&htab_type);
- return 0;
+ struct htab_elem *l = __htab_map_lookup_elem(map, key);
+
+ if (l)
+ return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
+ else
+ return NULL;
+}
+
+/* inline bpf_map_lookup_elem() call for per-CPU hashmap */
+static int htab_percpu_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ if (!bpf_jit_supports_percpu_insn())
+ return -EOPNOTSUPP;
+
+ BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
+ (void *(*)(struct bpf_map *map, void *key))NULL));
+ *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3);
+ *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_0,
+ offsetof(struct htab_elem, key) + roundup(map->key_size, 8));
+ *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0);
+ *insn++ = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0);
+
+ return insn - insn_buf;
+}
+
+static void *htab_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu)
+{
+ struct htab_elem *l;
+
+ if (cpu >= nr_cpu_ids)
+ return NULL;
+
+ l = __htab_map_lookup_elem(map, key);
+ if (l)
+ return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu);
+ else
+ return NULL;
+}
+
+static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct htab_elem *l = __htab_map_lookup_elem(map, key);
+
+ if (l) {
+ bpf_lru_node_set_ref(&l->lru_node);
+ return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
+ }
+
+ return NULL;
+}
+
+static void *htab_lru_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu)
+{
+ struct htab_elem *l;
+
+ if (cpu >= nr_cpu_ids)
+ return NULL;
+
+ l = __htab_map_lookup_elem(map, key);
+ if (l) {
+ bpf_lru_node_set_ref(&l->lru_node);
+ return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu);
+ }
+
+ return NULL;
}
-late_initcall(register_htab_map);
+
+int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
+{
+ struct htab_elem *l;
+ void __percpu *pptr;
+ int ret = -ENOENT;
+ int cpu, off = 0;
+ u32 size;
+
+ /* per_cpu areas are zero-filled and bpf programs can only
+ * access 'value_size' of them, so copying rounded areas
+ * will not leak any kernel data
+ */
+ size = round_up(map->value_size, 8);
+ rcu_read_lock();
+ l = __htab_map_lookup_elem(map, key);
+ if (!l)
+ goto out;
+ /* We do not mark LRU map element here in order to not mess up
+ * eviction heuristics when user space does a map walk.
+ */
+ pptr = htab_elem_get_ptr(l, map->key_size);
+ for_each_possible_cpu(cpu) {
+ copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu));
+ check_and_init_map_value(map, value + off);
+ off += size;
+ }
+ ret = 0;
+out:
+ rcu_read_unlock();
+ return ret;
+}
+
+int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ int ret;
+
+ rcu_read_lock();
+ if (htab_is_lru(htab))
+ ret = __htab_lru_percpu_map_update_elem(map, key, value,
+ map_flags, true);
+ else
+ ret = htab_map_update_elem_in_place(map, key, value, map_flags,
+ true, true);
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key,
+ struct seq_file *m)
+{
+ struct htab_elem *l;
+ void __percpu *pptr;
+ int cpu;
+
+ rcu_read_lock();
+
+ l = __htab_map_lookup_elem(map, key);
+ if (!l) {
+ rcu_read_unlock();
+ return;
+ }
+
+ btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
+ seq_puts(m, ": {\n");
+ pptr = htab_elem_get_ptr(l, map->key_size);
+ for_each_possible_cpu(cpu) {
+ seq_printf(m, "\tcpu%d: ", cpu);
+ btf_type_seq_show(map->btf, map->btf_value_type_id,
+ per_cpu_ptr(pptr, cpu), m);
+ seq_putc(m, '\n');
+ }
+ seq_puts(m, "}\n");
+
+ rcu_read_unlock();
+}
+
+const struct bpf_map_ops htab_percpu_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
+ .map_alloc_check = htab_map_alloc_check,
+ .map_alloc = htab_map_alloc,
+ .map_free = htab_map_free,
+ .map_get_next_key = htab_map_get_next_key,
+ .map_lookup_elem = htab_percpu_map_lookup_elem,
+ .map_gen_lookup = htab_percpu_map_gen_lookup,
+ .map_lookup_and_delete_elem = htab_percpu_map_lookup_and_delete_elem,
+ .map_update_elem = htab_percpu_map_update_elem,
+ .map_delete_elem = htab_map_delete_elem,
+ .map_lookup_percpu_elem = htab_percpu_map_lookup_percpu_elem,
+ .map_seq_show_elem = htab_percpu_map_seq_show_elem,
+ .map_set_for_each_callback_args = map_set_for_each_callback_args,
+ .map_for_each_callback = bpf_for_each_hash_elem,
+ .map_mem_usage = htab_map_mem_usage,
+ BATCH_OPS(htab_percpu),
+ .map_btf_id = &htab_map_btf_ids[0],
+ .iter_seq_info = &iter_seq_info,
+};
+
+const struct bpf_map_ops htab_lru_percpu_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
+ .map_alloc_check = htab_map_alloc_check,
+ .map_alloc = htab_map_alloc,
+ .map_free = htab_map_free,
+ .map_get_next_key = htab_map_get_next_key,
+ .map_lookup_elem = htab_lru_percpu_map_lookup_elem,
+ .map_lookup_and_delete_elem = htab_lru_percpu_map_lookup_and_delete_elem,
+ .map_update_elem = htab_lru_percpu_map_update_elem,
+ .map_delete_elem = htab_lru_map_delete_elem,
+ .map_lookup_percpu_elem = htab_lru_percpu_map_lookup_percpu_elem,
+ .map_seq_show_elem = htab_percpu_map_seq_show_elem,
+ .map_set_for_each_callback_args = map_set_for_each_callback_args,
+ .map_for_each_callback = bpf_for_each_hash_elem,
+ .map_mem_usage = htab_map_mem_usage,
+ BATCH_OPS(htab_lru_percpu),
+ .map_btf_id = &htab_map_btf_ids[0],
+ .iter_seq_info = &iter_seq_info,
+};
+
+static int fd_htab_map_alloc_check(union bpf_attr *attr)
+{
+ if (attr->value_size != sizeof(u32))
+ return -EINVAL;
+ return htab_map_alloc_check(attr);
+}
+
+static void fd_htab_map_free(struct bpf_map *map)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct hlist_nulls_node *n;
+ struct hlist_nulls_head *head;
+ struct htab_elem *l;
+ int i;
+
+ for (i = 0; i < htab->n_buckets; i++) {
+ head = select_bucket(htab, i);
+
+ hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
+ void *ptr = fd_htab_map_get_ptr(map, l);
+
+ map->ops->map_fd_put_ptr(map, ptr, false);
+ }
+ }
+
+ htab_map_free(map);
+}
+
+/* only called from syscall */
+int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value)
+{
+ void **ptr;
+ int ret = 0;
+
+ if (!map->ops->map_fd_sys_lookup_elem)
+ return -ENOTSUPP;
+
+ rcu_read_lock();
+ ptr = htab_map_lookup_elem(map, key);
+ if (ptr)
+ *value = map->ops->map_fd_sys_lookup_elem(READ_ONCE(*ptr));
+ else
+ ret = -ENOENT;
+ rcu_read_unlock();
+
+ return ret;
+}
+
+/* Only called from syscall */
+int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
+ void *key, void *value, u64 map_flags)
+{
+ void *ptr;
+ int ret;
+
+ ptr = map->ops->map_fd_get_ptr(map, map_file, *(int *)value);
+ if (IS_ERR(ptr))
+ return PTR_ERR(ptr);
+
+ /* The htab bucket lock is always held during update operations in fd
+ * htab map, and the following rcu_read_lock() is only used to avoid
+ * the WARN_ON_ONCE in htab_map_update_elem_in_place().
+ */
+ rcu_read_lock();
+ ret = htab_map_update_elem_in_place(map, key, &ptr, map_flags, false, false);
+ rcu_read_unlock();
+ if (ret)
+ map->ops->map_fd_put_ptr(map, ptr, false);
+
+ return ret;
+}
+
+static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_map *map, *inner_map_meta;
+
+ inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd);
+ if (IS_ERR(inner_map_meta))
+ return inner_map_meta;
+
+ map = htab_map_alloc(attr);
+ if (IS_ERR(map)) {
+ bpf_map_meta_free(inner_map_meta);
+ return map;
+ }
+
+ map->inner_map_meta = inner_map_meta;
+
+ return map;
+}
+
+static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_map **inner_map = htab_map_lookup_elem(map, key);
+
+ if (!inner_map)
+ return NULL;
+
+ return READ_ONCE(*inner_map);
+}
+
+static int htab_of_map_gen_lookup(struct bpf_map *map,
+ struct bpf_insn *insn_buf)
+{
+ struct bpf_insn *insn = insn_buf;
+ const int ret = BPF_REG_0;
+
+ BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
+ (void *(*)(struct bpf_map *map, void *key))NULL));
+ *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2);
+ *insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
+ offsetof(struct htab_elem, key) +
+ round_up(map->key_size, 8));
+ *insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0);
+
+ return insn - insn_buf;
+}
+
+static void htab_of_map_free(struct bpf_map *map)
+{
+ bpf_map_meta_free(map->inner_map_meta);
+ fd_htab_map_free(map);
+}
+
+const struct bpf_map_ops htab_of_maps_map_ops = {
+ .map_alloc_check = fd_htab_map_alloc_check,
+ .map_alloc = htab_of_map_alloc,
+ .map_free = htab_of_map_free,
+ .map_get_next_key = htab_map_get_next_key,
+ .map_lookup_elem = htab_of_map_lookup_elem,
+ .map_delete_elem = htab_map_delete_elem,
+ .map_fd_get_ptr = bpf_map_fd_get_ptr,
+ .map_fd_put_ptr = bpf_map_fd_put_ptr,
+ .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
+ .map_gen_lookup = htab_of_map_gen_lookup,
+ .map_check_btf = map_check_no_btf,
+ .map_mem_usage = htab_map_mem_usage,
+ BATCH_OPS(htab),
+ .map_btf_id = &htab_map_btf_ids[0],
+};
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index bd7f5988ed9c..db72b96f9c8c 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1,18 +1,36 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
*/
#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/bpf-cgroup.h>
+#include <linux/cgroup.h>
#include <linux/rcupdate.h>
#include <linux/random.h>
#include <linux/smp.h>
+#include <linux/topology.h>
+#include <linux/ktime.h>
+#include <linux/sched.h>
+#include <linux/uidgid.h>
+#include <linux/filter.h>
+#include <linux/ctype.h>
+#include <linux/jiffies.h>
+#include <linux/pid_namespace.h>
+#include <linux/poison.h>
+#include <linux/proc_ns.h>
+#include <linux/sched/task.h>
+#include <linux/security.h>
+#include <linux/btf_ids.h>
+#include <linux/bpf_mem_alloc.h>
+#include <linux/kasan.h>
+#include <linux/bpf_verifier.h>
+#include <linux/uaccess.h>
+#include <linux/verification.h>
+#include <linux/task_work.h>
+#include <linux/irq_work.h>
+#include <linux/buildid.h>
+
+#include "../../lib/kstrtox.h"
/* If kernel subsystem is allowing eBPF programs to call this function,
* inside its own verifier_ops->get_func_proto() callback it should return
@@ -20,94 +38,4573 @@
*
* Different map implementations will rely on rcu in map methods
* lookup/update/delete, therefore eBPF programs must run under rcu lock
- * if program is allowed to access maps, so check rcu_read_lock_held in
- * all three functions.
+ * if program is allowed to access maps, so check rcu_read_lock_held() or
+ * rcu_read_lock_trace_held() in all three functions.
+ */
+BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key)
+{
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ return (unsigned long) map->ops->map_lookup_elem(map, key);
+}
+
+const struct bpf_func_proto bpf_map_lookup_elem_proto = {
+ .func = bpf_map_lookup_elem,
+ .gpl_only = false,
+ .pkt_access = true,
+ .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_MAP_KEY,
+};
+
+BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key,
+ void *, value, u64, flags)
+{
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ return map->ops->map_update_elem(map, key, value, flags);
+}
+
+const struct bpf_func_proto bpf_map_update_elem_proto = {
+ .func = bpf_map_update_elem,
+ .gpl_only = false,
+ .pkt_access = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_MAP_KEY,
+ .arg3_type = ARG_PTR_TO_MAP_VALUE,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key)
+{
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ return map->ops->map_delete_elem(map, key);
+}
+
+const struct bpf_func_proto bpf_map_delete_elem_proto = {
+ .func = bpf_map_delete_elem,
+ .gpl_only = false,
+ .pkt_access = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_MAP_KEY,
+};
+
+BPF_CALL_3(bpf_map_push_elem, struct bpf_map *, map, void *, value, u64, flags)
+{
+ return map->ops->map_push_elem(map, value, flags);
+}
+
+const struct bpf_func_proto bpf_map_push_elem_proto = {
+ .func = bpf_map_push_elem,
+ .gpl_only = false,
+ .pkt_access = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_MAP_VALUE,
+ .arg3_type = ARG_ANYTHING,
+};
+
+BPF_CALL_2(bpf_map_pop_elem, struct bpf_map *, map, void *, value)
+{
+ return map->ops->map_pop_elem(map, value);
+}
+
+const struct bpf_func_proto bpf_map_pop_elem_proto = {
+ .func = bpf_map_pop_elem,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT | MEM_WRITE,
+};
+
+BPF_CALL_2(bpf_map_peek_elem, struct bpf_map *, map, void *, value)
+{
+ return map->ops->map_peek_elem(map, value);
+}
+
+const struct bpf_func_proto bpf_map_peek_elem_proto = {
+ .func = bpf_map_peek_elem,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT | MEM_WRITE,
+};
+
+BPF_CALL_3(bpf_map_lookup_percpu_elem, struct bpf_map *, map, void *, key, u32, cpu)
+{
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ return (unsigned long) map->ops->map_lookup_percpu_elem(map, key, cpu);
+}
+
+const struct bpf_func_proto bpf_map_lookup_percpu_elem_proto = {
+ .func = bpf_map_lookup_percpu_elem,
+ .gpl_only = false,
+ .pkt_access = true,
+ .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_MAP_KEY,
+ .arg3_type = ARG_ANYTHING,
+};
+
+const struct bpf_func_proto bpf_get_prandom_u32_proto = {
+ .func = bpf_user_rnd_u32,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
+BPF_CALL_0(bpf_get_smp_processor_id)
+{
+ return smp_processor_id();
+}
+
+const struct bpf_func_proto bpf_get_smp_processor_id_proto = {
+ .func = bpf_get_smp_processor_id,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .allow_fastcall = true,
+};
+
+BPF_CALL_0(bpf_get_numa_node_id)
+{
+ return numa_node_id();
+}
+
+const struct bpf_func_proto bpf_get_numa_node_id_proto = {
+ .func = bpf_get_numa_node_id,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
+BPF_CALL_0(bpf_ktime_get_ns)
+{
+ /* NMI safe access to clock monotonic */
+ return ktime_get_mono_fast_ns();
+}
+
+const struct bpf_func_proto bpf_ktime_get_ns_proto = {
+ .func = bpf_ktime_get_ns,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
+BPF_CALL_0(bpf_ktime_get_boot_ns)
+{
+ /* NMI safe access to clock boottime */
+ return ktime_get_boot_fast_ns();
+}
+
+const struct bpf_func_proto bpf_ktime_get_boot_ns_proto = {
+ .func = bpf_ktime_get_boot_ns,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
+BPF_CALL_0(bpf_ktime_get_coarse_ns)
+{
+ return ktime_get_coarse_ns();
+}
+
+const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto = {
+ .func = bpf_ktime_get_coarse_ns,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
+BPF_CALL_0(bpf_ktime_get_tai_ns)
+{
+ /* NMI safe access to clock tai */
+ return ktime_get_tai_fast_ns();
+}
+
+const struct bpf_func_proto bpf_ktime_get_tai_ns_proto = {
+ .func = bpf_ktime_get_tai_ns,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
+BPF_CALL_0(bpf_get_current_pid_tgid)
+{
+ struct task_struct *task = current;
+
+ if (unlikely(!task))
+ return -EINVAL;
+
+ return (u64) task->tgid << 32 | task->pid;
+}
+
+const struct bpf_func_proto bpf_get_current_pid_tgid_proto = {
+ .func = bpf_get_current_pid_tgid,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
+BPF_CALL_0(bpf_get_current_uid_gid)
+{
+ struct task_struct *task = current;
+ kuid_t uid;
+ kgid_t gid;
+
+ if (unlikely(!task))
+ return -EINVAL;
+
+ current_uid_gid(&uid, &gid);
+ return (u64) from_kgid(&init_user_ns, gid) << 32 |
+ from_kuid(&init_user_ns, uid);
+}
+
+const struct bpf_func_proto bpf_get_current_uid_gid_proto = {
+ .func = bpf_get_current_uid_gid,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
+BPF_CALL_2(bpf_get_current_comm, char *, buf, u32, size)
+{
+ struct task_struct *task = current;
+
+ if (unlikely(!task))
+ goto err_clear;
+
+ /* Verifier guarantees that size > 0 */
+ strscpy_pad(buf, task->comm, size);
+ return 0;
+err_clear:
+ memset(buf, 0, size);
+ return -EINVAL;
+}
+
+const struct bpf_func_proto bpf_get_current_comm_proto = {
+ .func = bpf_get_current_comm,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg2_type = ARG_CONST_SIZE,
+};
+
+#if defined(CONFIG_QUEUED_SPINLOCKS) || defined(CONFIG_BPF_ARCH_SPINLOCK)
+
+static inline void __bpf_spin_lock(struct bpf_spin_lock *lock)
+{
+ arch_spinlock_t *l = (void *)lock;
+ union {
+ __u32 val;
+ arch_spinlock_t lock;
+ } u = { .lock = __ARCH_SPIN_LOCK_UNLOCKED };
+
+ compiletime_assert(u.val == 0, "__ARCH_SPIN_LOCK_UNLOCKED not 0");
+ BUILD_BUG_ON(sizeof(*l) != sizeof(__u32));
+ BUILD_BUG_ON(sizeof(*lock) != sizeof(__u32));
+ preempt_disable();
+ arch_spin_lock(l);
+}
+
+static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock)
+{
+ arch_spinlock_t *l = (void *)lock;
+
+ arch_spin_unlock(l);
+ preempt_enable();
+}
+
+#else
+
+static inline void __bpf_spin_lock(struct bpf_spin_lock *lock)
+{
+ atomic_t *l = (void *)lock;
+
+ BUILD_BUG_ON(sizeof(*l) != sizeof(*lock));
+ do {
+ atomic_cond_read_relaxed(l, !VAL);
+ } while (atomic_xchg(l, 1));
+}
+
+static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock)
+{
+ atomic_t *l = (void *)lock;
+
+ atomic_set_release(l, 0);
+}
+
+#endif
+
+static DEFINE_PER_CPU(unsigned long, irqsave_flags);
+
+static inline void __bpf_spin_lock_irqsave(struct bpf_spin_lock *lock)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __bpf_spin_lock(lock);
+ __this_cpu_write(irqsave_flags, flags);
+}
+
+NOTRACE_BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock)
+{
+ __bpf_spin_lock_irqsave(lock);
+ return 0;
+}
+
+const struct bpf_func_proto bpf_spin_lock_proto = {
+ .func = bpf_spin_lock,
+ .gpl_only = false,
+ .ret_type = RET_VOID,
+ .arg1_type = ARG_PTR_TO_SPIN_LOCK,
+ .arg1_btf_id = BPF_PTR_POISON,
+};
+
+static inline void __bpf_spin_unlock_irqrestore(struct bpf_spin_lock *lock)
+{
+ unsigned long flags;
+
+ flags = __this_cpu_read(irqsave_flags);
+ __bpf_spin_unlock(lock);
+ local_irq_restore(flags);
+}
+
+NOTRACE_BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock)
+{
+ __bpf_spin_unlock_irqrestore(lock);
+ return 0;
+}
+
+const struct bpf_func_proto bpf_spin_unlock_proto = {
+ .func = bpf_spin_unlock,
+ .gpl_only = false,
+ .ret_type = RET_VOID,
+ .arg1_type = ARG_PTR_TO_SPIN_LOCK,
+ .arg1_btf_id = BPF_PTR_POISON,
+};
+
+void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
+ bool lock_src)
+{
+ struct bpf_spin_lock *lock;
+
+ if (lock_src)
+ lock = src + map->record->spin_lock_off;
+ else
+ lock = dst + map->record->spin_lock_off;
+ preempt_disable();
+ __bpf_spin_lock_irqsave(lock);
+ copy_map_value(map, dst, src);
+ __bpf_spin_unlock_irqrestore(lock);
+ preempt_enable();
+}
+
+BPF_CALL_0(bpf_jiffies64)
+{
+ return get_jiffies_64();
+}
+
+const struct bpf_func_proto bpf_jiffies64_proto = {
+ .func = bpf_jiffies64,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
+#ifdef CONFIG_CGROUPS
+BPF_CALL_0(bpf_get_current_cgroup_id)
+{
+ struct cgroup *cgrp;
+ u64 cgrp_id;
+
+ rcu_read_lock();
+ cgrp = task_dfl_cgroup(current);
+ cgrp_id = cgroup_id(cgrp);
+ rcu_read_unlock();
+
+ return cgrp_id;
+}
+
+const struct bpf_func_proto bpf_get_current_cgroup_id_proto = {
+ .func = bpf_get_current_cgroup_id,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
+BPF_CALL_1(bpf_get_current_ancestor_cgroup_id, int, ancestor_level)
+{
+ struct cgroup *cgrp;
+ struct cgroup *ancestor;
+ u64 cgrp_id;
+
+ rcu_read_lock();
+ cgrp = task_dfl_cgroup(current);
+ ancestor = cgroup_ancestor(cgrp, ancestor_level);
+ cgrp_id = ancestor ? cgroup_id(ancestor) : 0;
+ rcu_read_unlock();
+
+ return cgrp_id;
+}
+
+const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = {
+ .func = bpf_get_current_ancestor_cgroup_id,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_ANYTHING,
+};
+#endif /* CONFIG_CGROUPS */
+
+#define BPF_STRTOX_BASE_MASK 0x1F
+
+static int __bpf_strtoull(const char *buf, size_t buf_len, u64 flags,
+ unsigned long long *res, bool *is_negative)
+{
+ unsigned int base = flags & BPF_STRTOX_BASE_MASK;
+ const char *cur_buf = buf;
+ size_t cur_len = buf_len;
+ unsigned int consumed;
+ size_t val_len;
+ char str[64];
+
+ if (!buf || !buf_len || !res || !is_negative)
+ return -EINVAL;
+
+ if (base != 0 && base != 8 && base != 10 && base != 16)
+ return -EINVAL;
+
+ if (flags & ~BPF_STRTOX_BASE_MASK)
+ return -EINVAL;
+
+ while (cur_buf < buf + buf_len && isspace(*cur_buf))
+ ++cur_buf;
+
+ *is_negative = (cur_buf < buf + buf_len && *cur_buf == '-');
+ if (*is_negative)
+ ++cur_buf;
+
+ consumed = cur_buf - buf;
+ cur_len -= consumed;
+ if (!cur_len)
+ return -EINVAL;
+
+ cur_len = min(cur_len, sizeof(str) - 1);
+ memcpy(str, cur_buf, cur_len);
+ str[cur_len] = '\0';
+ cur_buf = str;
+
+ cur_buf = _parse_integer_fixup_radix(cur_buf, &base);
+ val_len = _parse_integer(cur_buf, base, res);
+
+ if (val_len & KSTRTOX_OVERFLOW)
+ return -ERANGE;
+
+ if (val_len == 0)
+ return -EINVAL;
+
+ cur_buf += val_len;
+ consumed += cur_buf - str;
+
+ return consumed;
+}
+
+static int __bpf_strtoll(const char *buf, size_t buf_len, u64 flags,
+ long long *res)
+{
+ unsigned long long _res;
+ bool is_negative;
+ int err;
+
+ err = __bpf_strtoull(buf, buf_len, flags, &_res, &is_negative);
+ if (err < 0)
+ return err;
+ if (is_negative) {
+ if ((long long)-_res > 0)
+ return -ERANGE;
+ *res = -_res;
+ } else {
+ if ((long long)_res < 0)
+ return -ERANGE;
+ *res = _res;
+ }
+ return err;
+}
+
+BPF_CALL_4(bpf_strtol, const char *, buf, size_t, buf_len, u64, flags,
+ s64 *, res)
+{
+ long long _res;
+ int err;
+
+ *res = 0;
+ err = __bpf_strtoll(buf, buf_len, flags, &_res);
+ if (err < 0)
+ return err;
+ *res = _res;
+ return err;
+}
+
+const struct bpf_func_proto bpf_strtol_proto = {
+ .func = bpf_strtol,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg2_type = ARG_CONST_SIZE,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED,
+ .arg4_size = sizeof(s64),
+};
+
+BPF_CALL_4(bpf_strtoul, const char *, buf, size_t, buf_len, u64, flags,
+ u64 *, res)
+{
+ unsigned long long _res;
+ bool is_negative;
+ int err;
+
+ *res = 0;
+ err = __bpf_strtoull(buf, buf_len, flags, &_res, &is_negative);
+ if (err < 0)
+ return err;
+ if (is_negative)
+ return -EINVAL;
+ *res = _res;
+ return err;
+}
+
+const struct bpf_func_proto bpf_strtoul_proto = {
+ .func = bpf_strtoul,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg2_type = ARG_CONST_SIZE,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED,
+ .arg4_size = sizeof(u64),
+};
+
+BPF_CALL_3(bpf_strncmp, const char *, s1, u32, s1_sz, const char *, s2)
+{
+ return strncmp(s1, s2, s1_sz);
+}
+
+static const struct bpf_func_proto bpf_strncmp_proto = {
+ .func = bpf_strncmp,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg2_type = ARG_CONST_SIZE,
+ .arg3_type = ARG_PTR_TO_CONST_STR,
+};
+
+BPF_CALL_4(bpf_get_ns_current_pid_tgid, u64, dev, u64, ino,
+ struct bpf_pidns_info *, nsdata, u32, size)
+{
+ struct task_struct *task = current;
+ struct pid_namespace *pidns;
+ int err = -EINVAL;
+
+ if (unlikely(size != sizeof(struct bpf_pidns_info)))
+ goto clear;
+
+ if (unlikely((u64)(dev_t)dev != dev))
+ goto clear;
+
+ if (unlikely(!task))
+ goto clear;
+
+ pidns = task_active_pid_ns(task);
+ if (unlikely(!pidns)) {
+ err = -ENOENT;
+ goto clear;
+ }
+
+ if (!ns_match(&pidns->ns, (dev_t)dev, ino))
+ goto clear;
+
+ nsdata->pid = task_pid_nr_ns(task, pidns);
+ nsdata->tgid = task_tgid_nr_ns(task, pidns);
+ return 0;
+clear:
+ memset((void *)nsdata, 0, (size_t) size);
+ return err;
+}
+
+const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto = {
+ .func = bpf_get_ns_current_pid_tgid,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_ANYTHING,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg4_type = ARG_CONST_SIZE,
+};
+
+static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = {
+ .func = bpf_get_raw_cpu_id,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
+BPF_CALL_5(bpf_event_output_data, void *, ctx, struct bpf_map *, map,
+ u64, flags, void *, data, u64, size)
+{
+ if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
+ return -EINVAL;
+
+ return bpf_event_output(map, flags, data, size, NULL, 0, NULL);
+}
+
+const struct bpf_func_proto bpf_event_output_data_proto = {
+ .func = bpf_event_output_data,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg5_type = ARG_CONST_SIZE_OR_ZERO,
+};
+
+BPF_CALL_3(bpf_copy_from_user, void *, dst, u32, size,
+ const void __user *, user_ptr)
+{
+ int ret = copy_from_user(dst, user_ptr, size);
+
+ if (unlikely(ret)) {
+ memset(dst, 0, size);
+ ret = -EFAULT;
+ }
+
+ return ret;
+}
+
+const struct bpf_func_proto bpf_copy_from_user_proto = {
+ .func = bpf_copy_from_user,
+ .gpl_only = false,
+ .might_sleep = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg3_type = ARG_ANYTHING,
+};
+
+BPF_CALL_5(bpf_copy_from_user_task, void *, dst, u32, size,
+ const void __user *, user_ptr, struct task_struct *, tsk, u64, flags)
+{
+ int ret;
+
+ /* flags is not used yet */
+ if (unlikely(flags))
+ return -EINVAL;
+
+ if (unlikely(!size))
+ return 0;
+
+ ret = access_process_vm(tsk, (unsigned long)user_ptr, dst, size, 0);
+ if (ret == size)
+ return 0;
+
+ memset(dst, 0, size);
+ /* Return -EFAULT for partial read */
+ return ret < 0 ? ret : -EFAULT;
+}
+
+const struct bpf_func_proto bpf_copy_from_user_task_proto = {
+ .func = bpf_copy_from_user_task,
+ .gpl_only = true,
+ .might_sleep = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_BTF_ID,
+ .arg4_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
+ .arg5_type = ARG_ANYTHING
+};
+
+BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu)
+{
+ if (cpu >= nr_cpu_ids)
+ return (unsigned long)NULL;
+
+ return (unsigned long)per_cpu_ptr((const void __percpu *)(const uintptr_t)ptr, cpu);
+}
+
+const struct bpf_func_proto bpf_per_cpu_ptr_proto = {
+ .func = bpf_per_cpu_ptr,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_MEM_OR_BTF_ID | PTR_MAYBE_NULL | MEM_RDONLY,
+ .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID,
+ .arg2_type = ARG_ANYTHING,
+};
+
+BPF_CALL_1(bpf_this_cpu_ptr, const void *, percpu_ptr)
+{
+ return (unsigned long)this_cpu_ptr((const void __percpu *)(const uintptr_t)percpu_ptr);
+}
+
+const struct bpf_func_proto bpf_this_cpu_ptr_proto = {
+ .func = bpf_this_cpu_ptr,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_MEM_OR_BTF_ID | MEM_RDONLY,
+ .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID,
+};
+
+static int bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype,
+ size_t bufsz)
+{
+ void __user *user_ptr = (__force void __user *)unsafe_ptr;
+
+ buf[0] = 0;
+
+ switch (fmt_ptype) {
+ case 's':
+#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
+ if ((unsigned long)unsafe_ptr < TASK_SIZE)
+ return strncpy_from_user_nofault(buf, user_ptr, bufsz);
+ fallthrough;
+#endif
+ case 'k':
+ return strncpy_from_kernel_nofault(buf, unsafe_ptr, bufsz);
+ case 'u':
+ return strncpy_from_user_nofault(buf, user_ptr, bufsz);
+ }
+
+ return -EINVAL;
+}
+
+/* Support executing three nested bprintf helper calls on a given CPU */
+#define MAX_BPRINTF_NEST_LEVEL 3
+
+static DEFINE_PER_CPU(struct bpf_bprintf_buffers[MAX_BPRINTF_NEST_LEVEL], bpf_bprintf_bufs);
+static DEFINE_PER_CPU(int, bpf_bprintf_nest_level);
+
+int bpf_try_get_buffers(struct bpf_bprintf_buffers **bufs)
+{
+ int nest_level;
+
+ preempt_disable();
+ nest_level = this_cpu_inc_return(bpf_bprintf_nest_level);
+ if (WARN_ON_ONCE(nest_level > MAX_BPRINTF_NEST_LEVEL)) {
+ this_cpu_dec(bpf_bprintf_nest_level);
+ preempt_enable();
+ return -EBUSY;
+ }
+ *bufs = this_cpu_ptr(&bpf_bprintf_bufs[nest_level - 1]);
+
+ return 0;
+}
+
+void bpf_put_buffers(void)
+{
+ if (WARN_ON_ONCE(this_cpu_read(bpf_bprintf_nest_level) == 0))
+ return;
+ this_cpu_dec(bpf_bprintf_nest_level);
+ preempt_enable();
+}
+
+void bpf_bprintf_cleanup(struct bpf_bprintf_data *data)
+{
+ if (!data->bin_args && !data->buf)
+ return;
+ bpf_put_buffers();
+}
+
+/*
+ * bpf_bprintf_prepare - Generic pass on format strings for bprintf-like helpers
+ *
+ * Returns a negative value if fmt is an invalid format string or 0 otherwise.
+ *
+ * This can be used in two ways:
+ * - Format string verification only: when data->get_bin_args is false
+ * - Arguments preparation: in addition to the above verification, it writes in
+ * data->bin_args a binary representation of arguments usable by bstr_printf
+ * where pointers from BPF have been sanitized.
+ *
+ * In argument preparation mode, if 0 is returned, safe temporary buffers are
+ * allocated and bpf_bprintf_cleanup should be called to free them after use.
*/
-static u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+int bpf_bprintf_prepare(const char *fmt, u32 fmt_size, const u64 *raw_args,
+ u32 num_args, struct bpf_bprintf_data *data)
+{
+ bool get_buffers = (data->get_bin_args && num_args) || data->get_buf;
+ char *unsafe_ptr = NULL, *tmp_buf = NULL, *tmp_buf_end, *fmt_end;
+ struct bpf_bprintf_buffers *buffers = NULL;
+ size_t sizeof_cur_arg, sizeof_cur_ip;
+ int err, i, num_spec = 0;
+ u64 cur_arg;
+ char fmt_ptype, cur_ip[16], ip_spec[] = "%pXX";
+
+ fmt_end = strnchr(fmt, fmt_size, 0);
+ if (!fmt_end)
+ return -EINVAL;
+ fmt_size = fmt_end - fmt;
+
+ if (get_buffers && bpf_try_get_buffers(&buffers))
+ return -EBUSY;
+
+ if (data->get_bin_args) {
+ if (num_args)
+ tmp_buf = buffers->bin_args;
+ tmp_buf_end = tmp_buf + MAX_BPRINTF_BIN_ARGS;
+ data->bin_args = (u32 *)tmp_buf;
+ }
+
+ if (data->get_buf)
+ data->buf = buffers->buf;
+
+ for (i = 0; i < fmt_size; i++) {
+ if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (fmt[i] != '%')
+ continue;
+
+ if (fmt[i + 1] == '%') {
+ i++;
+ continue;
+ }
+
+ if (num_spec >= num_args) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* The string is zero-terminated so if fmt[i] != 0, we can
+ * always access fmt[i + 1], in the worst case it will be a 0
+ */
+ i++;
+
+ /* skip optional "[0 +-][num]" width formatting field */
+ while (fmt[i] == '0' || fmt[i] == '+' || fmt[i] == '-' ||
+ fmt[i] == ' ')
+ i++;
+ if (fmt[i] >= '1' && fmt[i] <= '9') {
+ i++;
+ while (fmt[i] >= '0' && fmt[i] <= '9')
+ i++;
+ }
+
+ if (fmt[i] == 'p') {
+ sizeof_cur_arg = sizeof(long);
+
+ if (fmt[i + 1] == 0 || isspace(fmt[i + 1]) ||
+ ispunct(fmt[i + 1])) {
+ if (tmp_buf)
+ cur_arg = raw_args[num_spec];
+ goto nocopy_fmt;
+ }
+
+ if ((fmt[i + 1] == 'k' || fmt[i + 1] == 'u') &&
+ fmt[i + 2] == 's') {
+ fmt_ptype = fmt[i + 1];
+ i += 2;
+ goto fmt_str;
+ }
+
+ if (fmt[i + 1] == 'K' ||
+ fmt[i + 1] == 'x' || fmt[i + 1] == 's' ||
+ fmt[i + 1] == 'S') {
+ if (tmp_buf)
+ cur_arg = raw_args[num_spec];
+ i++;
+ goto nocopy_fmt;
+ }
+
+ if (fmt[i + 1] == 'B') {
+ if (tmp_buf) {
+ err = snprintf(tmp_buf,
+ (tmp_buf_end - tmp_buf),
+ "%pB",
+ (void *)(long)raw_args[num_spec]);
+ tmp_buf += (err + 1);
+ }
+
+ i++;
+ num_spec++;
+ continue;
+ }
+
+ /* only support "%pI4", "%pi4", "%pI6" and "%pi6". */
+ if ((fmt[i + 1] != 'i' && fmt[i + 1] != 'I') ||
+ (fmt[i + 2] != '4' && fmt[i + 2] != '6')) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ i += 2;
+ if (!tmp_buf)
+ goto nocopy_fmt;
+
+ sizeof_cur_ip = (fmt[i] == '4') ? 4 : 16;
+ if (tmp_buf_end - tmp_buf < sizeof_cur_ip) {
+ err = -ENOSPC;
+ goto out;
+ }
+
+ unsafe_ptr = (char *)(long)raw_args[num_spec];
+ err = copy_from_kernel_nofault(cur_ip, unsafe_ptr,
+ sizeof_cur_ip);
+ if (err < 0)
+ memset(cur_ip, 0, sizeof_cur_ip);
+
+ /* hack: bstr_printf expects IP addresses to be
+ * pre-formatted as strings, ironically, the easiest way
+ * to do that is to call snprintf.
+ */
+ ip_spec[2] = fmt[i - 1];
+ ip_spec[3] = fmt[i];
+ err = snprintf(tmp_buf, tmp_buf_end - tmp_buf,
+ ip_spec, &cur_ip);
+
+ tmp_buf += err + 1;
+ num_spec++;
+
+ continue;
+ } else if (fmt[i] == 's') {
+ fmt_ptype = fmt[i];
+fmt_str:
+ if (fmt[i + 1] != 0 &&
+ !isspace(fmt[i + 1]) &&
+ !ispunct(fmt[i + 1])) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (!tmp_buf)
+ goto nocopy_fmt;
+
+ if (tmp_buf_end == tmp_buf) {
+ err = -ENOSPC;
+ goto out;
+ }
+
+ unsafe_ptr = (char *)(long)raw_args[num_spec];
+ err = bpf_trace_copy_string(tmp_buf, unsafe_ptr,
+ fmt_ptype,
+ tmp_buf_end - tmp_buf);
+ if (err < 0) {
+ tmp_buf[0] = '\0';
+ err = 1;
+ }
+
+ tmp_buf += err;
+ num_spec++;
+
+ continue;
+ } else if (fmt[i] == 'c') {
+ if (!tmp_buf)
+ goto nocopy_fmt;
+
+ if (tmp_buf_end == tmp_buf) {
+ err = -ENOSPC;
+ goto out;
+ }
+
+ *tmp_buf = raw_args[num_spec];
+ tmp_buf++;
+ num_spec++;
+
+ continue;
+ }
+
+ sizeof_cur_arg = sizeof(int);
+
+ if (fmt[i] == 'l') {
+ sizeof_cur_arg = sizeof(long);
+ i++;
+ }
+ if (fmt[i] == 'l') {
+ sizeof_cur_arg = sizeof(long long);
+ i++;
+ }
+
+ if (fmt[i] != 'i' && fmt[i] != 'd' && fmt[i] != 'u' &&
+ fmt[i] != 'x' && fmt[i] != 'X') {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (tmp_buf)
+ cur_arg = raw_args[num_spec];
+nocopy_fmt:
+ if (tmp_buf) {
+ tmp_buf = PTR_ALIGN(tmp_buf, sizeof(u32));
+ if (tmp_buf_end - tmp_buf < sizeof_cur_arg) {
+ err = -ENOSPC;
+ goto out;
+ }
+
+ if (sizeof_cur_arg == 8) {
+ *(u32 *)tmp_buf = *(u32 *)&cur_arg;
+ *(u32 *)(tmp_buf + 4) = *((u32 *)&cur_arg + 1);
+ } else {
+ *(u32 *)tmp_buf = (u32)(long)cur_arg;
+ }
+ tmp_buf += sizeof_cur_arg;
+ }
+ num_spec++;
+ }
+
+ err = 0;
+out:
+ if (err)
+ bpf_bprintf_cleanup(data);
+ return err;
+}
+
+BPF_CALL_5(bpf_snprintf, char *, str, u32, str_size, char *, fmt,
+ const void *, args, u32, data_len)
{
- /* verifier checked that R1 contains a valid pointer to bpf_map
- * and R2 points to a program stack and map->key_size bytes were
- * initialized
+ struct bpf_bprintf_data data = {
+ .get_bin_args = true,
+ };
+ int err, num_args;
+
+ if (data_len % 8 || data_len > MAX_BPRINTF_VARARGS * 8 ||
+ (data_len && !args))
+ return -EINVAL;
+ num_args = data_len / 8;
+
+ /* ARG_PTR_TO_CONST_STR guarantees that fmt is zero-terminated so we
+ * can safely give an unbounded size.
*/
- struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
- void *key = (void *) (unsigned long) r2;
+ err = bpf_bprintf_prepare(fmt, UINT_MAX, args, num_args, &data);
+ if (err < 0)
+ return err;
+
+ err = bstr_printf(str, str_size, fmt, data.bin_args);
+
+ bpf_bprintf_cleanup(&data);
+
+ return err + 1;
+}
+
+const struct bpf_func_proto bpf_snprintf_proto = {
+ .func = bpf_snprintf,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_MEM_OR_NULL,
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg3_type = ARG_PTR_TO_CONST_STR,
+ .arg4_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
+ .arg5_type = ARG_CONST_SIZE_OR_ZERO,
+};
+
+static void *map_key_from_value(struct bpf_map *map, void *value, u32 *arr_idx)
+{
+ if (map->map_type == BPF_MAP_TYPE_ARRAY) {
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+
+ *arr_idx = ((char *)value - array->value) / array->elem_size;
+ return arr_idx;
+ }
+ return (void *)value - round_up(map->key_size, 8);
+}
+
+struct bpf_async_cb {
+ struct bpf_map *map;
+ struct bpf_prog *prog;
+ void __rcu *callback_fn;
void *value;
+ union {
+ struct rcu_head rcu;
+ struct work_struct delete_work;
+ };
+ u64 flags;
+};
- WARN_ON_ONCE(!rcu_read_lock_held());
+/* BPF map elements can contain 'struct bpf_timer'.
+ * Such map owns all of its BPF timers.
+ * 'struct bpf_timer' is allocated as part of map element allocation
+ * and it's zero initialized.
+ * That space is used to keep 'struct bpf_async_kern'.
+ * bpf_timer_init() allocates 'struct bpf_hrtimer', inits hrtimer, and
+ * remembers 'struct bpf_map *' pointer it's part of.
+ * bpf_timer_set_callback() increments prog refcnt and assign bpf callback_fn.
+ * bpf_timer_start() arms the timer.
+ * If user space reference to a map goes to zero at this point
+ * ops->map_release_uref callback is responsible for cancelling the timers,
+ * freeing their memory, and decrementing prog's refcnts.
+ * bpf_timer_cancel() cancels the timer and decrements prog's refcnt.
+ * Inner maps can contain bpf timers as well. ops->map_release_uref is
+ * freeing the timers when inner map is replaced or deleted by user space.
+ */
+struct bpf_hrtimer {
+ struct bpf_async_cb cb;
+ struct hrtimer timer;
+ atomic_t cancelling;
+};
- value = map->ops->map_lookup_elem(map, key);
+struct bpf_work {
+ struct bpf_async_cb cb;
+ struct work_struct work;
+ struct work_struct delete_work;
+};
- /* lookup() returns either pointer to element value or NULL
- * which is the meaning of PTR_TO_MAP_VALUE_OR_NULL type
+/* the actual struct hidden inside uapi struct bpf_timer and bpf_wq */
+struct bpf_async_kern {
+ union {
+ struct bpf_async_cb *cb;
+ struct bpf_hrtimer *timer;
+ struct bpf_work *work;
+ };
+ /* bpf_spin_lock is used here instead of spinlock_t to make
+ * sure that it always fits into space reserved by struct bpf_timer
+ * regardless of LOCKDEP and spinlock debug flags.
*/
- return (unsigned long) value;
+ struct bpf_spin_lock lock;
+} __attribute__((aligned(8)));
+
+enum bpf_async_type {
+ BPF_ASYNC_TYPE_TIMER = 0,
+ BPF_ASYNC_TYPE_WQ,
+};
+
+static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running);
+
+static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer)
+{
+ struct bpf_hrtimer *t = container_of(hrtimer, struct bpf_hrtimer, timer);
+ struct bpf_map *map = t->cb.map;
+ void *value = t->cb.value;
+ bpf_callback_t callback_fn;
+ void *key;
+ u32 idx;
+
+ BTF_TYPE_EMIT(struct bpf_timer);
+ callback_fn = rcu_dereference_check(t->cb.callback_fn, rcu_read_lock_bh_held());
+ if (!callback_fn)
+ goto out;
+
+ /* bpf_timer_cb() runs in hrtimer_run_softirq. It doesn't migrate and
+ * cannot be preempted by another bpf_timer_cb() on the same cpu.
+ * Remember the timer this callback is servicing to prevent
+ * deadlock if callback_fn() calls bpf_timer_cancel() or
+ * bpf_map_delete_elem() on the same timer.
+ */
+ this_cpu_write(hrtimer_running, t);
+
+ key = map_key_from_value(map, value, &idx);
+
+ callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)value, 0, 0);
+ /* The verifier checked that return value is zero. */
+
+ this_cpu_write(hrtimer_running, NULL);
+out:
+ return HRTIMER_NORESTART;
}
-const struct bpf_func_proto bpf_map_lookup_elem_proto = {
- .func = bpf_map_lookup_elem,
- .gpl_only = false,
- .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
- .arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_MAP_KEY,
+static void bpf_wq_work(struct work_struct *work)
+{
+ struct bpf_work *w = container_of(work, struct bpf_work, work);
+ struct bpf_async_cb *cb = &w->cb;
+ struct bpf_map *map = cb->map;
+ bpf_callback_t callback_fn;
+ void *value = cb->value;
+ void *key;
+ u32 idx;
+
+ BTF_TYPE_EMIT(struct bpf_wq);
+
+ callback_fn = READ_ONCE(cb->callback_fn);
+ if (!callback_fn)
+ return;
+
+ key = map_key_from_value(map, value, &idx);
+
+ rcu_read_lock_trace();
+ migrate_disable();
+
+ callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)value, 0, 0);
+
+ migrate_enable();
+ rcu_read_unlock_trace();
+}
+
+static void bpf_async_cb_rcu_free(struct rcu_head *rcu)
+{
+ struct bpf_async_cb *cb = container_of(rcu, struct bpf_async_cb, rcu);
+
+ kfree_nolock(cb);
+}
+
+static void bpf_wq_delete_work(struct work_struct *work)
+{
+ struct bpf_work *w = container_of(work, struct bpf_work, delete_work);
+
+ cancel_work_sync(&w->work);
+
+ call_rcu(&w->cb.rcu, bpf_async_cb_rcu_free);
+}
+
+static void bpf_timer_delete_work(struct work_struct *work)
+{
+ struct bpf_hrtimer *t = container_of(work, struct bpf_hrtimer, cb.delete_work);
+
+ /* Cancel the timer and wait for callback to complete if it was running.
+ * If hrtimer_cancel() can be safely called it's safe to call
+ * call_rcu() right after for both preallocated and non-preallocated
+ * maps. The async->cb = NULL was already done and no code path can see
+ * address 't' anymore. Timer if armed for existing bpf_hrtimer before
+ * bpf_timer_cancel_and_free will have been cancelled.
+ */
+ hrtimer_cancel(&t->timer);
+ call_rcu(&t->cb.rcu, bpf_async_cb_rcu_free);
+}
+
+static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags,
+ enum bpf_async_type type)
+{
+ struct bpf_async_cb *cb;
+ struct bpf_hrtimer *t;
+ struct bpf_work *w;
+ clockid_t clockid;
+ size_t size;
+ int ret = 0;
+
+ if (in_nmi())
+ return -EOPNOTSUPP;
+
+ switch (type) {
+ case BPF_ASYNC_TYPE_TIMER:
+ size = sizeof(struct bpf_hrtimer);
+ break;
+ case BPF_ASYNC_TYPE_WQ:
+ size = sizeof(struct bpf_work);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ __bpf_spin_lock_irqsave(&async->lock);
+ t = async->timer;
+ if (t) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ cb = bpf_map_kmalloc_nolock(map, size, 0, map->numa_node);
+ if (!cb) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ switch (type) {
+ case BPF_ASYNC_TYPE_TIMER:
+ clockid = flags & (MAX_CLOCKS - 1);
+ t = (struct bpf_hrtimer *)cb;
+
+ atomic_set(&t->cancelling, 0);
+ INIT_WORK(&t->cb.delete_work, bpf_timer_delete_work);
+ hrtimer_setup(&t->timer, bpf_timer_cb, clockid, HRTIMER_MODE_REL_SOFT);
+ cb->value = (void *)async - map->record->timer_off;
+ break;
+ case BPF_ASYNC_TYPE_WQ:
+ w = (struct bpf_work *)cb;
+
+ INIT_WORK(&w->work, bpf_wq_work);
+ INIT_WORK(&w->delete_work, bpf_wq_delete_work);
+ cb->value = (void *)async - map->record->wq_off;
+ break;
+ }
+ cb->map = map;
+ cb->prog = NULL;
+ cb->flags = flags;
+ rcu_assign_pointer(cb->callback_fn, NULL);
+
+ WRITE_ONCE(async->cb, cb);
+ /* Guarantee the order between async->cb and map->usercnt. So
+ * when there are concurrent uref release and bpf timer init, either
+ * bpf_timer_cancel_and_free() called by uref release reads a no-NULL
+ * timer or atomic64_read() below returns a zero usercnt.
+ */
+ smp_mb();
+ if (!atomic64_read(&map->usercnt)) {
+ /* maps with timers must be either held by user space
+ * or pinned in bpffs.
+ */
+ WRITE_ONCE(async->cb, NULL);
+ kfree_nolock(cb);
+ ret = -EPERM;
+ }
+out:
+ __bpf_spin_unlock_irqrestore(&async->lock);
+ return ret;
+}
+
+BPF_CALL_3(bpf_timer_init, struct bpf_async_kern *, timer, struct bpf_map *, map,
+ u64, flags)
+{
+ clock_t clockid = flags & (MAX_CLOCKS - 1);
+
+ BUILD_BUG_ON(MAX_CLOCKS != 16);
+ BUILD_BUG_ON(sizeof(struct bpf_async_kern) > sizeof(struct bpf_timer));
+ BUILD_BUG_ON(__alignof__(struct bpf_async_kern) != __alignof__(struct bpf_timer));
+
+ if (flags >= MAX_CLOCKS ||
+ /* similar to timerfd except _ALARM variants are not supported */
+ (clockid != CLOCK_MONOTONIC &&
+ clockid != CLOCK_REALTIME &&
+ clockid != CLOCK_BOOTTIME))
+ return -EINVAL;
+
+ return __bpf_async_init(timer, map, flags, BPF_ASYNC_TYPE_TIMER);
+}
+
+static const struct bpf_func_proto bpf_timer_init_proto = {
+ .func = bpf_timer_init,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_TIMER,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
};
-static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+static int __bpf_async_set_callback(struct bpf_async_kern *async, void *callback_fn,
+ struct bpf_prog_aux *aux, unsigned int flags,
+ enum bpf_async_type type)
{
- struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
- void *key = (void *) (unsigned long) r2;
- void *value = (void *) (unsigned long) r3;
+ struct bpf_prog *prev, *prog = aux->prog;
+ struct bpf_async_cb *cb;
+ int ret = 0;
- WARN_ON_ONCE(!rcu_read_lock_held());
+ if (in_nmi())
+ return -EOPNOTSUPP;
+ __bpf_spin_lock_irqsave(&async->lock);
+ cb = async->cb;
+ if (!cb) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!atomic64_read(&cb->map->usercnt)) {
+ /* maps with timers must be either held by user space
+ * or pinned in bpffs. Otherwise timer might still be
+ * running even when bpf prog is detached and user space
+ * is gone, since map_release_uref won't ever be called.
+ */
+ ret = -EPERM;
+ goto out;
+ }
+ prev = cb->prog;
+ if (prev != prog) {
+ /* Bump prog refcnt once. Every bpf_timer_set_callback()
+ * can pick different callback_fn-s within the same prog.
+ */
+ prog = bpf_prog_inc_not_zero(prog);
+ if (IS_ERR(prog)) {
+ ret = PTR_ERR(prog);
+ goto out;
+ }
+ if (prev)
+ /* Drop prev prog refcnt when swapping with new prog */
+ bpf_prog_put(prev);
+ cb->prog = prog;
+ }
+ rcu_assign_pointer(cb->callback_fn, callback_fn);
+out:
+ __bpf_spin_unlock_irqrestore(&async->lock);
+ return ret;
+}
- return map->ops->map_update_elem(map, key, value, r4);
+BPF_CALL_3(bpf_timer_set_callback, struct bpf_async_kern *, timer, void *, callback_fn,
+ struct bpf_prog_aux *, aux)
+{
+ return __bpf_async_set_callback(timer, callback_fn, aux, 0, BPF_ASYNC_TYPE_TIMER);
}
-const struct bpf_func_proto bpf_map_update_elem_proto = {
- .func = bpf_map_update_elem,
- .gpl_only = false,
- .ret_type = RET_INTEGER,
- .arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_MAP_KEY,
- .arg3_type = ARG_PTR_TO_MAP_VALUE,
- .arg4_type = ARG_ANYTHING,
+static const struct bpf_func_proto bpf_timer_set_callback_proto = {
+ .func = bpf_timer_set_callback,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_TIMER,
+ .arg2_type = ARG_PTR_TO_FUNC,
};
-static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, timer, u64, nsecs, u64, flags)
{
- struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
- void *key = (void *) (unsigned long) r2;
+ struct bpf_hrtimer *t;
+ int ret = 0;
+ enum hrtimer_mode mode;
- WARN_ON_ONCE(!rcu_read_lock_held());
+ if (in_nmi())
+ return -EOPNOTSUPP;
+ if (flags & ~(BPF_F_TIMER_ABS | BPF_F_TIMER_CPU_PIN))
+ return -EINVAL;
+ __bpf_spin_lock_irqsave(&timer->lock);
+ t = timer->timer;
+ if (!t || !t->cb.prog) {
+ ret = -EINVAL;
+ goto out;
+ }
- return map->ops->map_delete_elem(map, key);
+ if (flags & BPF_F_TIMER_ABS)
+ mode = HRTIMER_MODE_ABS_SOFT;
+ else
+ mode = HRTIMER_MODE_REL_SOFT;
+
+ if (flags & BPF_F_TIMER_CPU_PIN)
+ mode |= HRTIMER_MODE_PINNED;
+
+ hrtimer_start(&t->timer, ns_to_ktime(nsecs), mode);
+out:
+ __bpf_spin_unlock_irqrestore(&timer->lock);
+ return ret;
}
-const struct bpf_func_proto bpf_map_delete_elem_proto = {
- .func = bpf_map_delete_elem,
- .gpl_only = false,
- .ret_type = RET_INTEGER,
- .arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_MAP_KEY,
+static const struct bpf_func_proto bpf_timer_start_proto = {
+ .func = bpf_timer_start,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_TIMER,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
};
-static u64 bpf_get_prandom_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+static void drop_prog_refcnt(struct bpf_async_cb *async)
{
- return prandom_u32();
+ struct bpf_prog *prog = async->prog;
+
+ if (prog) {
+ bpf_prog_put(prog);
+ async->prog = NULL;
+ rcu_assign_pointer(async->callback_fn, NULL);
+ }
}
-const struct bpf_func_proto bpf_get_prandom_u32_proto = {
- .func = bpf_get_prandom_u32,
+BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer)
+{
+ struct bpf_hrtimer *t, *cur_t;
+ bool inc = false;
+ int ret = 0;
+
+ if (in_nmi())
+ return -EOPNOTSUPP;
+ rcu_read_lock();
+ __bpf_spin_lock_irqsave(&timer->lock);
+ t = timer->timer;
+ if (!t) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ cur_t = this_cpu_read(hrtimer_running);
+ if (cur_t == t) {
+ /* If bpf callback_fn is trying to bpf_timer_cancel()
+ * its own timer the hrtimer_cancel() will deadlock
+ * since it waits for callback_fn to finish.
+ */
+ ret = -EDEADLK;
+ goto out;
+ }
+
+ /* Only account in-flight cancellations when invoked from a timer
+ * callback, since we want to avoid waiting only if other _callbacks_
+ * are waiting on us, to avoid introducing lockups. Non-callback paths
+ * are ok, since nobody would synchronously wait for their completion.
+ */
+ if (!cur_t)
+ goto drop;
+ atomic_inc(&t->cancelling);
+ /* Need full barrier after relaxed atomic_inc */
+ smp_mb__after_atomic();
+ inc = true;
+ if (atomic_read(&cur_t->cancelling)) {
+ /* We're cancelling timer t, while some other timer callback is
+ * attempting to cancel us. In such a case, it might be possible
+ * that timer t belongs to the other callback, or some other
+ * callback waiting upon it (creating transitive dependencies
+ * upon us), and we will enter a deadlock if we continue
+ * cancelling and waiting for it synchronously, since it might
+ * do the same. Bail!
+ */
+ ret = -EDEADLK;
+ goto out;
+ }
+drop:
+ drop_prog_refcnt(&t->cb);
+out:
+ __bpf_spin_unlock_irqrestore(&timer->lock);
+ /* Cancel the timer and wait for associated callback to finish
+ * if it was running.
+ */
+ ret = ret ?: hrtimer_cancel(&t->timer);
+ if (inc)
+ atomic_dec(&t->cancelling);
+ rcu_read_unlock();
+ return ret;
+}
+
+static const struct bpf_func_proto bpf_timer_cancel_proto = {
+ .func = bpf_timer_cancel,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_TIMER,
+};
+
+static struct bpf_async_cb *__bpf_async_cancel_and_free(struct bpf_async_kern *async)
+{
+ struct bpf_async_cb *cb;
+
+ /* Performance optimization: read async->cb without lock first. */
+ if (!READ_ONCE(async->cb))
+ return NULL;
+
+ __bpf_spin_lock_irqsave(&async->lock);
+ /* re-read it under lock */
+ cb = async->cb;
+ if (!cb)
+ goto out;
+ drop_prog_refcnt(cb);
+ /* The subsequent bpf_timer_start/cancel() helpers won't be able to use
+ * this timer, since it won't be initialized.
+ */
+ WRITE_ONCE(async->cb, NULL);
+out:
+ __bpf_spin_unlock_irqrestore(&async->lock);
+ return cb;
+}
+
+/* This function is called by map_delete/update_elem for individual element and
+ * by ops->map_release_uref when the user space reference to a map reaches zero.
+ */
+void bpf_timer_cancel_and_free(void *val)
+{
+ struct bpf_hrtimer *t;
+
+ t = (struct bpf_hrtimer *)__bpf_async_cancel_and_free(val);
+
+ if (!t)
+ return;
+ /* We check that bpf_map_delete/update_elem() was called from timer
+ * callback_fn. In such case we don't call hrtimer_cancel() (since it
+ * will deadlock) and don't call hrtimer_try_to_cancel() (since it will
+ * just return -1). Though callback_fn is still running on this cpu it's
+ * safe to do kfree(t) because bpf_timer_cb() read everything it needed
+ * from 't'. The bpf subprog callback_fn won't be able to access 't',
+ * since async->cb = NULL was already done. The timer will be
+ * effectively cancelled because bpf_timer_cb() will return
+ * HRTIMER_NORESTART.
+ *
+ * However, it is possible the timer callback_fn calling us armed the
+ * timer _before_ calling us, such that failing to cancel it here will
+ * cause it to possibly use struct hrtimer after freeing bpf_hrtimer.
+ * Therefore, we _need_ to cancel any outstanding timers before we do
+ * call_rcu, even though no more timers can be armed.
+ *
+ * Moreover, we need to schedule work even if timer does not belong to
+ * the calling callback_fn, as on two different CPUs, we can end up in a
+ * situation where both sides run in parallel, try to cancel one
+ * another, and we end up waiting on both sides in hrtimer_cancel
+ * without making forward progress, since timer1 depends on time2
+ * callback to finish, and vice versa.
+ *
+ * CPU 1 (timer1_cb) CPU 2 (timer2_cb)
+ * bpf_timer_cancel_and_free(timer2) bpf_timer_cancel_and_free(timer1)
+ *
+ * To avoid these issues, punt to workqueue context when we are in a
+ * timer callback.
+ */
+ if (this_cpu_read(hrtimer_running)) {
+ queue_work(system_dfl_wq, &t->cb.delete_work);
+ return;
+ }
+
+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ /* If the timer is running on other CPU, also use a kworker to
+ * wait for the completion of the timer instead of trying to
+ * acquire a sleepable lock in hrtimer_cancel() to wait for its
+ * completion.
+ */
+ if (hrtimer_try_to_cancel(&t->timer) >= 0)
+ call_rcu(&t->cb.rcu, bpf_async_cb_rcu_free);
+ else
+ queue_work(system_dfl_wq, &t->cb.delete_work);
+ } else {
+ bpf_timer_delete_work(&t->cb.delete_work);
+ }
+}
+
+/* This function is called by map_delete/update_elem for individual element and
+ * by ops->map_release_uref when the user space reference to a map reaches zero.
+ */
+void bpf_wq_cancel_and_free(void *val)
+{
+ struct bpf_work *work;
+
+ BTF_TYPE_EMIT(struct bpf_wq);
+
+ work = (struct bpf_work *)__bpf_async_cancel_and_free(val);
+ if (!work)
+ return;
+ /* Trigger cancel of the sleepable work, but *do not* wait for
+ * it to finish if it was running as we might not be in a
+ * sleepable context.
+ * kfree will be called once the work has finished.
+ */
+ schedule_work(&work->delete_work);
+}
+
+BPF_CALL_2(bpf_kptr_xchg, void *, dst, void *, ptr)
+{
+ unsigned long *kptr = dst;
+
+ /* This helper may be inlined by verifier. */
+ return xchg(kptr, (unsigned long)ptr);
+}
+
+/* Unlike other PTR_TO_BTF_ID helpers the btf_id in bpf_kptr_xchg()
+ * helper is determined dynamically by the verifier. Use BPF_PTR_POISON to
+ * denote type that verifier will determine.
+ */
+static const struct bpf_func_proto bpf_kptr_xchg_proto = {
+ .func = bpf_kptr_xchg,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
+ .ret_btf_id = BPF_PTR_POISON,
+ .arg1_type = ARG_KPTR_XCHG_DEST,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL | OBJ_RELEASE,
+ .arg2_btf_id = BPF_PTR_POISON,
+};
+
+struct bpf_dynptr_file_impl {
+ struct freader freader;
+ /* 64 bit offset and size overriding 32 bit ones in bpf_dynptr_kern */
+ u64 offset;
+ u64 size;
+};
+
+/* Since the upper 8 bits of dynptr->size is reserved, the
+ * maximum supported size is 2^24 - 1.
+ */
+#define DYNPTR_MAX_SIZE ((1UL << 24) - 1)
+#define DYNPTR_TYPE_SHIFT 28
+#define DYNPTR_SIZE_MASK 0xFFFFFF
+#define DYNPTR_RDONLY_BIT BIT(31)
+
+bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr)
+{
+ return ptr->size & DYNPTR_RDONLY_BIT;
+}
+
+void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr)
+{
+ ptr->size |= DYNPTR_RDONLY_BIT;
+}
+
+static void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_type type)
+{
+ ptr->size |= type << DYNPTR_TYPE_SHIFT;
+}
+
+static enum bpf_dynptr_type bpf_dynptr_get_type(const struct bpf_dynptr_kern *ptr)
+{
+ return (ptr->size & ~(DYNPTR_RDONLY_BIT)) >> DYNPTR_TYPE_SHIFT;
+}
+
+u64 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr)
+{
+ if (bpf_dynptr_get_type(ptr) == BPF_DYNPTR_TYPE_FILE) {
+ struct bpf_dynptr_file_impl *df = ptr->data;
+
+ return df->size;
+ }
+
+ return ptr->size & DYNPTR_SIZE_MASK;
+}
+
+static void bpf_dynptr_advance_offset(struct bpf_dynptr_kern *ptr, u64 off)
+{
+ if (bpf_dynptr_get_type(ptr) == BPF_DYNPTR_TYPE_FILE) {
+ struct bpf_dynptr_file_impl *df = ptr->data;
+
+ df->offset += off;
+ return;
+ }
+ ptr->offset += off;
+}
+
+static void bpf_dynptr_set_size(struct bpf_dynptr_kern *ptr, u64 new_size)
+{
+ u32 metadata = ptr->size & ~DYNPTR_SIZE_MASK;
+
+ if (bpf_dynptr_get_type(ptr) == BPF_DYNPTR_TYPE_FILE) {
+ struct bpf_dynptr_file_impl *df = ptr->data;
+
+ df->size = new_size;
+ return;
+ }
+ ptr->size = (u32)new_size | metadata;
+}
+
+int bpf_dynptr_check_size(u64 size)
+{
+ return size > DYNPTR_MAX_SIZE ? -E2BIG : 0;
+}
+
+static int bpf_file_fetch_bytes(struct bpf_dynptr_file_impl *df, u64 offset, void *buf, u64 len)
+{
+ const void *ptr;
+
+ if (!buf)
+ return -EINVAL;
+
+ df->freader.buf = buf;
+ df->freader.buf_sz = len;
+ ptr = freader_fetch(&df->freader, offset + df->offset, len);
+ if (!ptr)
+ return df->freader.err;
+
+ if (ptr != buf) /* Force copying into the buffer */
+ memcpy(buf, ptr, len);
+
+ return 0;
+}
+
+void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data,
+ enum bpf_dynptr_type type, u32 offset, u32 size)
+{
+ ptr->data = data;
+ ptr->offset = offset;
+ ptr->size = size;
+ bpf_dynptr_set_type(ptr, type);
+}
+
+void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr)
+{
+ memset(ptr, 0, sizeof(*ptr));
+}
+
+BPF_CALL_4(bpf_dynptr_from_mem, void *, data, u64, size, u64, flags, struct bpf_dynptr_kern *, ptr)
+{
+ int err;
+
+ BTF_TYPE_EMIT(struct bpf_dynptr);
+
+ err = bpf_dynptr_check_size(size);
+ if (err)
+ goto error;
+
+ /* flags is currently unsupported */
+ if (flags) {
+ err = -EINVAL;
+ goto error;
+ }
+
+ bpf_dynptr_init(ptr, data, BPF_DYNPTR_TYPE_LOCAL, 0, size);
+
+ return 0;
+
+error:
+ bpf_dynptr_set_null(ptr);
+ return err;
+}
+
+static const struct bpf_func_proto bpf_dynptr_from_mem_proto = {
+ .func = bpf_dynptr_from_mem,
.gpl_only = false,
.ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT | MEM_WRITE,
};
-static u64 bpf_get_smp_processor_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+static int __bpf_dynptr_read(void *dst, u64 len, const struct bpf_dynptr_kern *src,
+ u64 offset, u64 flags)
{
- return raw_smp_processor_id();
+ enum bpf_dynptr_type type;
+ int err;
+
+ if (!src->data || flags)
+ return -EINVAL;
+
+ err = bpf_dynptr_check_off_len(src, offset, len);
+ if (err)
+ return err;
+
+ type = bpf_dynptr_get_type(src);
+
+ switch (type) {
+ case BPF_DYNPTR_TYPE_LOCAL:
+ case BPF_DYNPTR_TYPE_RINGBUF:
+ /* Source and destination may possibly overlap, hence use memmove to
+ * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr
+ * pointing to overlapping PTR_TO_MAP_VALUE regions.
+ */
+ memmove(dst, src->data + src->offset + offset, len);
+ return 0;
+ case BPF_DYNPTR_TYPE_SKB:
+ return __bpf_skb_load_bytes(src->data, src->offset + offset, dst, len);
+ case BPF_DYNPTR_TYPE_XDP:
+ return __bpf_xdp_load_bytes(src->data, src->offset + offset, dst, len);
+ case BPF_DYNPTR_TYPE_SKB_META:
+ memmove(dst, bpf_skb_meta_pointer(src->data, src->offset + offset), len);
+ return 0;
+ case BPF_DYNPTR_TYPE_FILE:
+ return bpf_file_fetch_bytes(src->data, offset, dst, len);
+ default:
+ WARN_ONCE(true, "bpf_dynptr_read: unknown dynptr type %d\n", type);
+ return -EFAULT;
+ }
}
-const struct bpf_func_proto bpf_get_smp_processor_id_proto = {
- .func = bpf_get_smp_processor_id,
+BPF_CALL_5(bpf_dynptr_read, void *, dst, u64, len, const struct bpf_dynptr_kern *, src,
+ u64, offset, u64, flags)
+{
+ return __bpf_dynptr_read(dst, len, src, offset, flags);
+}
+
+static const struct bpf_func_proto bpf_dynptr_read_proto = {
+ .func = bpf_dynptr_read,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg3_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY,
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+};
+
+int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u64 offset, void *src,
+ u64 len, u64 flags)
+{
+ enum bpf_dynptr_type type;
+ int err;
+
+ if (!dst->data || __bpf_dynptr_is_rdonly(dst))
+ return -EINVAL;
+
+ err = bpf_dynptr_check_off_len(dst, offset, len);
+ if (err)
+ return err;
+
+ type = bpf_dynptr_get_type(dst);
+
+ switch (type) {
+ case BPF_DYNPTR_TYPE_LOCAL:
+ case BPF_DYNPTR_TYPE_RINGBUF:
+ if (flags)
+ return -EINVAL;
+ /* Source and destination may possibly overlap, hence use memmove to
+ * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr
+ * pointing to overlapping PTR_TO_MAP_VALUE regions.
+ */
+ memmove(dst->data + dst->offset + offset, src, len);
+ return 0;
+ case BPF_DYNPTR_TYPE_SKB:
+ return __bpf_skb_store_bytes(dst->data, dst->offset + offset, src, len,
+ flags);
+ case BPF_DYNPTR_TYPE_XDP:
+ if (flags)
+ return -EINVAL;
+ return __bpf_xdp_store_bytes(dst->data, dst->offset + offset, src, len);
+ case BPF_DYNPTR_TYPE_SKB_META:
+ return __bpf_skb_meta_store_bytes(dst->data, dst->offset + offset, src,
+ len, flags);
+ default:
+ WARN_ONCE(true, "bpf_dynptr_write: unknown dynptr type %d\n", type);
+ return -EFAULT;
+ }
+}
+
+BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u64, offset, void *, src,
+ u64, len, u64, flags)
+{
+ return __bpf_dynptr_write(dst, offset, src, len, flags);
+}
+
+static const struct bpf_func_proto bpf_dynptr_write_proto = {
+ .func = bpf_dynptr_write,
.gpl_only = false,
.ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg4_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg5_type = ARG_ANYTHING,
+};
+
+BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u64, offset, u64, len)
+{
+ enum bpf_dynptr_type type;
+ int err;
+
+ if (!ptr->data)
+ return 0;
+
+ err = bpf_dynptr_check_off_len(ptr, offset, len);
+ if (err)
+ return 0;
+
+ if (__bpf_dynptr_is_rdonly(ptr))
+ return 0;
+
+ type = bpf_dynptr_get_type(ptr);
+
+ switch (type) {
+ case BPF_DYNPTR_TYPE_LOCAL:
+ case BPF_DYNPTR_TYPE_RINGBUF:
+ return (unsigned long)(ptr->data + ptr->offset + offset);
+ case BPF_DYNPTR_TYPE_SKB:
+ case BPF_DYNPTR_TYPE_XDP:
+ case BPF_DYNPTR_TYPE_SKB_META:
+ /* skb and xdp dynptrs should use bpf_dynptr_slice / bpf_dynptr_slice_rdwr */
+ return 0;
+ default:
+ WARN_ONCE(true, "bpf_dynptr_data: unknown dynptr type %d\n", type);
+ return 0;
+ }
+}
+
+static const struct bpf_func_proto bpf_dynptr_data_proto = {
+ .func = bpf_dynptr_data,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_DYNPTR_MEM_OR_NULL,
+ .arg1_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_CONST_ALLOC_SIZE_OR_ZERO,
+};
+
+const struct bpf_func_proto bpf_get_current_task_proto __weak;
+const struct bpf_func_proto bpf_get_current_task_btf_proto __weak;
+const struct bpf_func_proto bpf_probe_read_user_proto __weak;
+const struct bpf_func_proto bpf_probe_read_user_str_proto __weak;
+const struct bpf_func_proto bpf_probe_read_kernel_proto __weak;
+const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak;
+const struct bpf_func_proto bpf_task_pt_regs_proto __weak;
+const struct bpf_func_proto bpf_perf_event_read_proto __weak;
+const struct bpf_func_proto bpf_send_signal_proto __weak;
+const struct bpf_func_proto bpf_send_signal_thread_proto __weak;
+const struct bpf_func_proto bpf_get_task_stack_sleepable_proto __weak;
+const struct bpf_func_proto bpf_get_task_stack_proto __weak;
+const struct bpf_func_proto bpf_get_branch_snapshot_proto __weak;
+
+const struct bpf_func_proto *
+bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_map_lookup_elem:
+ return &bpf_map_lookup_elem_proto;
+ case BPF_FUNC_map_update_elem:
+ return &bpf_map_update_elem_proto;
+ case BPF_FUNC_map_delete_elem:
+ return &bpf_map_delete_elem_proto;
+ case BPF_FUNC_map_push_elem:
+ return &bpf_map_push_elem_proto;
+ case BPF_FUNC_map_pop_elem:
+ return &bpf_map_pop_elem_proto;
+ case BPF_FUNC_map_peek_elem:
+ return &bpf_map_peek_elem_proto;
+ case BPF_FUNC_map_lookup_percpu_elem:
+ return &bpf_map_lookup_percpu_elem_proto;
+ case BPF_FUNC_get_prandom_u32:
+ return &bpf_get_prandom_u32_proto;
+ case BPF_FUNC_get_smp_processor_id:
+ return &bpf_get_raw_smp_processor_id_proto;
+ case BPF_FUNC_get_numa_node_id:
+ return &bpf_get_numa_node_id_proto;
+ case BPF_FUNC_tail_call:
+ return &bpf_tail_call_proto;
+ case BPF_FUNC_ktime_get_ns:
+ return &bpf_ktime_get_ns_proto;
+ case BPF_FUNC_ktime_get_boot_ns:
+ return &bpf_ktime_get_boot_ns_proto;
+ case BPF_FUNC_ktime_get_tai_ns:
+ return &bpf_ktime_get_tai_ns_proto;
+ case BPF_FUNC_ringbuf_output:
+ return &bpf_ringbuf_output_proto;
+ case BPF_FUNC_ringbuf_reserve:
+ return &bpf_ringbuf_reserve_proto;
+ case BPF_FUNC_ringbuf_submit:
+ return &bpf_ringbuf_submit_proto;
+ case BPF_FUNC_ringbuf_discard:
+ return &bpf_ringbuf_discard_proto;
+ case BPF_FUNC_ringbuf_query:
+ return &bpf_ringbuf_query_proto;
+ case BPF_FUNC_strncmp:
+ return &bpf_strncmp_proto;
+ case BPF_FUNC_strtol:
+ return &bpf_strtol_proto;
+ case BPF_FUNC_strtoul:
+ return &bpf_strtoul_proto;
+ case BPF_FUNC_get_current_pid_tgid:
+ return &bpf_get_current_pid_tgid_proto;
+ case BPF_FUNC_get_ns_current_pid_tgid:
+ return &bpf_get_ns_current_pid_tgid_proto;
+ case BPF_FUNC_get_current_uid_gid:
+ return &bpf_get_current_uid_gid_proto;
+ default:
+ break;
+ }
+
+ if (!bpf_token_capable(prog->aux->token, CAP_BPF))
+ return NULL;
+
+ switch (func_id) {
+ case BPF_FUNC_spin_lock:
+ return &bpf_spin_lock_proto;
+ case BPF_FUNC_spin_unlock:
+ return &bpf_spin_unlock_proto;
+ case BPF_FUNC_jiffies64:
+ return &bpf_jiffies64_proto;
+ case BPF_FUNC_per_cpu_ptr:
+ return &bpf_per_cpu_ptr_proto;
+ case BPF_FUNC_this_cpu_ptr:
+ return &bpf_this_cpu_ptr_proto;
+ case BPF_FUNC_timer_init:
+ return &bpf_timer_init_proto;
+ case BPF_FUNC_timer_set_callback:
+ return &bpf_timer_set_callback_proto;
+ case BPF_FUNC_timer_start:
+ return &bpf_timer_start_proto;
+ case BPF_FUNC_timer_cancel:
+ return &bpf_timer_cancel_proto;
+ case BPF_FUNC_kptr_xchg:
+ return &bpf_kptr_xchg_proto;
+ case BPF_FUNC_for_each_map_elem:
+ return &bpf_for_each_map_elem_proto;
+ case BPF_FUNC_loop:
+ return &bpf_loop_proto;
+ case BPF_FUNC_user_ringbuf_drain:
+ return &bpf_user_ringbuf_drain_proto;
+ case BPF_FUNC_ringbuf_reserve_dynptr:
+ return &bpf_ringbuf_reserve_dynptr_proto;
+ case BPF_FUNC_ringbuf_submit_dynptr:
+ return &bpf_ringbuf_submit_dynptr_proto;
+ case BPF_FUNC_ringbuf_discard_dynptr:
+ return &bpf_ringbuf_discard_dynptr_proto;
+ case BPF_FUNC_dynptr_from_mem:
+ return &bpf_dynptr_from_mem_proto;
+ case BPF_FUNC_dynptr_read:
+ return &bpf_dynptr_read_proto;
+ case BPF_FUNC_dynptr_write:
+ return &bpf_dynptr_write_proto;
+ case BPF_FUNC_dynptr_data:
+ return &bpf_dynptr_data_proto;
+#ifdef CONFIG_CGROUPS
+ case BPF_FUNC_cgrp_storage_get:
+ return &bpf_cgrp_storage_get_proto;
+ case BPF_FUNC_cgrp_storage_delete:
+ return &bpf_cgrp_storage_delete_proto;
+ case BPF_FUNC_get_current_cgroup_id:
+ return &bpf_get_current_cgroup_id_proto;
+ case BPF_FUNC_get_current_ancestor_cgroup_id:
+ return &bpf_get_current_ancestor_cgroup_id_proto;
+ case BPF_FUNC_current_task_under_cgroup:
+ return &bpf_current_task_under_cgroup_proto;
+#endif
+#ifdef CONFIG_CGROUP_NET_CLASSID
+ case BPF_FUNC_get_cgroup_classid:
+ return &bpf_get_cgroup_classid_curr_proto;
+#endif
+ case BPF_FUNC_task_storage_get:
+ if (bpf_prog_check_recur(prog))
+ return &bpf_task_storage_get_recur_proto;
+ return &bpf_task_storage_get_proto;
+ case BPF_FUNC_task_storage_delete:
+ if (bpf_prog_check_recur(prog))
+ return &bpf_task_storage_delete_recur_proto;
+ return &bpf_task_storage_delete_proto;
+ default:
+ break;
+ }
+
+ if (!bpf_token_capable(prog->aux->token, CAP_PERFMON))
+ return NULL;
+
+ switch (func_id) {
+ case BPF_FUNC_trace_printk:
+ return bpf_get_trace_printk_proto();
+ case BPF_FUNC_get_current_task:
+ return &bpf_get_current_task_proto;
+ case BPF_FUNC_get_current_task_btf:
+ return &bpf_get_current_task_btf_proto;
+ case BPF_FUNC_get_current_comm:
+ return &bpf_get_current_comm_proto;
+ case BPF_FUNC_probe_read_user:
+ return &bpf_probe_read_user_proto;
+ case BPF_FUNC_probe_read_kernel:
+ return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
+ NULL : &bpf_probe_read_kernel_proto;
+ case BPF_FUNC_probe_read_user_str:
+ return &bpf_probe_read_user_str_proto;
+ case BPF_FUNC_probe_read_kernel_str:
+ return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
+ NULL : &bpf_probe_read_kernel_str_proto;
+ case BPF_FUNC_copy_from_user:
+ return &bpf_copy_from_user_proto;
+ case BPF_FUNC_copy_from_user_task:
+ return &bpf_copy_from_user_task_proto;
+ case BPF_FUNC_snprintf_btf:
+ return &bpf_snprintf_btf_proto;
+ case BPF_FUNC_snprintf:
+ return &bpf_snprintf_proto;
+ case BPF_FUNC_task_pt_regs:
+ return &bpf_task_pt_regs_proto;
+ case BPF_FUNC_trace_vprintk:
+ return bpf_get_trace_vprintk_proto();
+ case BPF_FUNC_perf_event_read_value:
+ return bpf_get_perf_event_read_value_proto();
+ case BPF_FUNC_perf_event_read:
+ return &bpf_perf_event_read_proto;
+ case BPF_FUNC_send_signal:
+ return &bpf_send_signal_proto;
+ case BPF_FUNC_send_signal_thread:
+ return &bpf_send_signal_thread_proto;
+ case BPF_FUNC_get_task_stack:
+ return prog->sleepable ? &bpf_get_task_stack_sleepable_proto
+ : &bpf_get_task_stack_proto;
+ case BPF_FUNC_get_branch_snapshot:
+ return &bpf_get_branch_snapshot_proto;
+ case BPF_FUNC_find_vma:
+ return &bpf_find_vma_proto;
+ default:
+ return NULL;
+ }
+}
+EXPORT_SYMBOL_GPL(bpf_base_func_proto);
+
+void bpf_list_head_free(const struct btf_field *field, void *list_head,
+ struct bpf_spin_lock *spin_lock)
+{
+ struct list_head *head = list_head, *orig_head = list_head;
+
+ BUILD_BUG_ON(sizeof(struct list_head) > sizeof(struct bpf_list_head));
+ BUILD_BUG_ON(__alignof__(struct list_head) > __alignof__(struct bpf_list_head));
+
+ /* Do the actual list draining outside the lock to not hold the lock for
+ * too long, and also prevent deadlocks if tracing programs end up
+ * executing on entry/exit of functions called inside the critical
+ * section, and end up doing map ops that call bpf_list_head_free for
+ * the same map value again.
+ */
+ __bpf_spin_lock_irqsave(spin_lock);
+ if (!head->next || list_empty(head))
+ goto unlock;
+ head = head->next;
+unlock:
+ INIT_LIST_HEAD(orig_head);
+ __bpf_spin_unlock_irqrestore(spin_lock);
+
+ while (head != orig_head) {
+ void *obj = head;
+
+ obj -= field->graph_root.node_offset;
+ head = head->next;
+ /* The contained type can also have resources, including a
+ * bpf_list_head which needs to be freed.
+ */
+ __bpf_obj_drop_impl(obj, field->graph_root.value_rec, false);
+ }
+}
+
+/* Like rbtree_postorder_for_each_entry_safe, but 'pos' and 'n' are
+ * 'rb_node *', so field name of rb_node within containing struct is not
+ * needed.
+ *
+ * Since bpf_rb_tree's node type has a corresponding struct btf_field with
+ * graph_root.node_offset, it's not necessary to know field name
+ * or type of node struct
+ */
+#define bpf_rbtree_postorder_for_each_entry_safe(pos, n, root) \
+ for (pos = rb_first_postorder(root); \
+ pos && ({ n = rb_next_postorder(pos); 1; }); \
+ pos = n)
+
+void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
+ struct bpf_spin_lock *spin_lock)
+{
+ struct rb_root_cached orig_root, *root = rb_root;
+ struct rb_node *pos, *n;
+ void *obj;
+
+ BUILD_BUG_ON(sizeof(struct rb_root_cached) > sizeof(struct bpf_rb_root));
+ BUILD_BUG_ON(__alignof__(struct rb_root_cached) > __alignof__(struct bpf_rb_root));
+
+ __bpf_spin_lock_irqsave(spin_lock);
+ orig_root = *root;
+ *root = RB_ROOT_CACHED;
+ __bpf_spin_unlock_irqrestore(spin_lock);
+
+ bpf_rbtree_postorder_for_each_entry_safe(pos, n, &orig_root.rb_root) {
+ obj = pos;
+ obj -= field->graph_root.node_offset;
+
+
+ __bpf_obj_drop_impl(obj, field->graph_root.value_rec, false);
+ }
+}
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign)
+{
+ struct btf_struct_meta *meta = meta__ign;
+ u64 size = local_type_id__k;
+ void *p;
+
+ p = bpf_mem_alloc(&bpf_global_ma, size);
+ if (!p)
+ return NULL;
+ if (meta)
+ bpf_obj_init(meta->record, p);
+ return p;
+}
+
+__bpf_kfunc void *bpf_percpu_obj_new_impl(u64 local_type_id__k, void *meta__ign)
+{
+ u64 size = local_type_id__k;
+
+ /* The verifier has ensured that meta__ign must be NULL */
+ return bpf_mem_alloc(&bpf_global_percpu_ma, size);
+}
+
+/* Must be called under migrate_disable(), as required by bpf_mem_free */
+void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu)
+{
+ struct bpf_mem_alloc *ma;
+
+ if (rec && rec->refcount_off >= 0 &&
+ !refcount_dec_and_test((refcount_t *)(p + rec->refcount_off))) {
+ /* Object is refcounted and refcount_dec didn't result in 0
+ * refcount. Return without freeing the object
+ */
+ return;
+ }
+
+ if (rec)
+ bpf_obj_free_fields(rec, p);
+
+ if (percpu)
+ ma = &bpf_global_percpu_ma;
+ else
+ ma = &bpf_global_ma;
+ bpf_mem_free_rcu(ma, p);
+}
+
+__bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign)
+{
+ struct btf_struct_meta *meta = meta__ign;
+ void *p = p__alloc;
+
+ __bpf_obj_drop_impl(p, meta ? meta->record : NULL, false);
+}
+
+__bpf_kfunc void bpf_percpu_obj_drop_impl(void *p__alloc, void *meta__ign)
+{
+ /* The verifier has ensured that meta__ign must be NULL */
+ bpf_mem_free_rcu(&bpf_global_percpu_ma, p__alloc);
+}
+
+__bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta__ign)
+{
+ struct btf_struct_meta *meta = meta__ign;
+ struct bpf_refcount *ref;
+
+ /* Could just cast directly to refcount_t *, but need some code using
+ * bpf_refcount type so that it is emitted in vmlinux BTF
+ */
+ ref = (struct bpf_refcount *)(p__refcounted_kptr + meta->record->refcount_off);
+ if (!refcount_inc_not_zero((refcount_t *)ref))
+ return NULL;
+
+ /* Verifier strips KF_RET_NULL if input is owned ref, see is_kfunc_ret_null
+ * in verifier.c
+ */
+ return (void *)p__refcounted_kptr;
+}
+
+static int __bpf_list_add(struct bpf_list_node_kern *node,
+ struct bpf_list_head *head,
+ bool tail, struct btf_record *rec, u64 off)
+{
+ struct list_head *n = &node->list_head, *h = (void *)head;
+
+ /* If list_head was 0-initialized by map, bpf_obj_init_field wasn't
+ * called on its fields, so init here
+ */
+ if (unlikely(!h->next))
+ INIT_LIST_HEAD(h);
+
+ /* node->owner != NULL implies !list_empty(n), no need to separately
+ * check the latter
+ */
+ if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) {
+ /* Only called from BPF prog, no need to migrate_disable */
+ __bpf_obj_drop_impl((void *)n - off, rec, false);
+ return -EINVAL;
+ }
+
+ tail ? list_add_tail(n, h) : list_add(n, h);
+ WRITE_ONCE(node->owner, head);
+
+ return 0;
+}
+
+__bpf_kfunc int bpf_list_push_front_impl(struct bpf_list_head *head,
+ struct bpf_list_node *node,
+ void *meta__ign, u64 off)
+{
+ struct bpf_list_node_kern *n = (void *)node;
+ struct btf_struct_meta *meta = meta__ign;
+
+ return __bpf_list_add(n, head, false, meta ? meta->record : NULL, off);
+}
+
+__bpf_kfunc int bpf_list_push_back_impl(struct bpf_list_head *head,
+ struct bpf_list_node *node,
+ void *meta__ign, u64 off)
+{
+ struct bpf_list_node_kern *n = (void *)node;
+ struct btf_struct_meta *meta = meta__ign;
+
+ return __bpf_list_add(n, head, true, meta ? meta->record : NULL, off);
+}
+
+static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, bool tail)
+{
+ struct list_head *n, *h = (void *)head;
+ struct bpf_list_node_kern *node;
+
+ /* If list_head was 0-initialized by map, bpf_obj_init_field wasn't
+ * called on its fields, so init here
+ */
+ if (unlikely(!h->next))
+ INIT_LIST_HEAD(h);
+ if (list_empty(h))
+ return NULL;
+
+ n = tail ? h->prev : h->next;
+ node = container_of(n, struct bpf_list_node_kern, list_head);
+ if (WARN_ON_ONCE(READ_ONCE(node->owner) != head))
+ return NULL;
+
+ list_del_init(n);
+ WRITE_ONCE(node->owner, NULL);
+ return (struct bpf_list_node *)n;
+}
+
+__bpf_kfunc struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head)
+{
+ return __bpf_list_del(head, false);
+}
+
+__bpf_kfunc struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head)
+{
+ return __bpf_list_del(head, true);
+}
+
+__bpf_kfunc struct bpf_list_node *bpf_list_front(struct bpf_list_head *head)
+{
+ struct list_head *h = (struct list_head *)head;
+
+ if (list_empty(h) || unlikely(!h->next))
+ return NULL;
+
+ return (struct bpf_list_node *)h->next;
+}
+
+__bpf_kfunc struct bpf_list_node *bpf_list_back(struct bpf_list_head *head)
+{
+ struct list_head *h = (struct list_head *)head;
+
+ if (list_empty(h) || unlikely(!h->next))
+ return NULL;
+
+ return (struct bpf_list_node *)h->prev;
+}
+
+__bpf_kfunc struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root,
+ struct bpf_rb_node *node)
+{
+ struct bpf_rb_node_kern *node_internal = (struct bpf_rb_node_kern *)node;
+ struct rb_root_cached *r = (struct rb_root_cached *)root;
+ struct rb_node *n = &node_internal->rb_node;
+
+ /* node_internal->owner != root implies either RB_EMPTY_NODE(n) or
+ * n is owned by some other tree. No need to check RB_EMPTY_NODE(n)
+ */
+ if (READ_ONCE(node_internal->owner) != root)
+ return NULL;
+
+ rb_erase_cached(n, r);
+ RB_CLEAR_NODE(n);
+ WRITE_ONCE(node_internal->owner, NULL);
+ return (struct bpf_rb_node *)n;
+}
+
+/* Need to copy rbtree_add_cached's logic here because our 'less' is a BPF
+ * program
+ */
+static int __bpf_rbtree_add(struct bpf_rb_root *root,
+ struct bpf_rb_node_kern *node,
+ void *less, struct btf_record *rec, u64 off)
+{
+ struct rb_node **link = &((struct rb_root_cached *)root)->rb_root.rb_node;
+ struct rb_node *parent = NULL, *n = &node->rb_node;
+ bpf_callback_t cb = (bpf_callback_t)less;
+ bool leftmost = true;
+
+ /* node->owner != NULL implies !RB_EMPTY_NODE(n), no need to separately
+ * check the latter
+ */
+ if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) {
+ /* Only called from BPF prog, no need to migrate_disable */
+ __bpf_obj_drop_impl((void *)n - off, rec, false);
+ return -EINVAL;
+ }
+
+ while (*link) {
+ parent = *link;
+ if (cb((uintptr_t)node, (uintptr_t)parent, 0, 0, 0)) {
+ link = &parent->rb_left;
+ } else {
+ link = &parent->rb_right;
+ leftmost = false;
+ }
+ }
+
+ rb_link_node(n, parent, link);
+ rb_insert_color_cached(n, (struct rb_root_cached *)root, leftmost);
+ WRITE_ONCE(node->owner, root);
+ return 0;
+}
+
+__bpf_kfunc int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node,
+ bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b),
+ void *meta__ign, u64 off)
+{
+ struct btf_struct_meta *meta = meta__ign;
+ struct bpf_rb_node_kern *n = (void *)node;
+
+ return __bpf_rbtree_add(root, n, (void *)less, meta ? meta->record : NULL, off);
+}
+
+__bpf_kfunc struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root)
+{
+ struct rb_root_cached *r = (struct rb_root_cached *)root;
+
+ return (struct bpf_rb_node *)rb_first_cached(r);
+}
+
+__bpf_kfunc struct bpf_rb_node *bpf_rbtree_root(struct bpf_rb_root *root)
+{
+ struct rb_root_cached *r = (struct rb_root_cached *)root;
+
+ return (struct bpf_rb_node *)r->rb_root.rb_node;
+}
+
+__bpf_kfunc struct bpf_rb_node *bpf_rbtree_left(struct bpf_rb_root *root, struct bpf_rb_node *node)
+{
+ struct bpf_rb_node_kern *node_internal = (struct bpf_rb_node_kern *)node;
+
+ if (READ_ONCE(node_internal->owner) != root)
+ return NULL;
+
+ return (struct bpf_rb_node *)node_internal->rb_node.rb_left;
+}
+
+__bpf_kfunc struct bpf_rb_node *bpf_rbtree_right(struct bpf_rb_root *root, struct bpf_rb_node *node)
+{
+ struct bpf_rb_node_kern *node_internal = (struct bpf_rb_node_kern *)node;
+
+ if (READ_ONCE(node_internal->owner) != root)
+ return NULL;
+
+ return (struct bpf_rb_node *)node_internal->rb_node.rb_right;
+}
+
+/**
+ * bpf_task_acquire - Acquire a reference to a task. A task acquired by this
+ * kfunc which is not stored in a map as a kptr, must be released by calling
+ * bpf_task_release().
+ * @p: The task on which a reference is being acquired.
+ */
+__bpf_kfunc struct task_struct *bpf_task_acquire(struct task_struct *p)
+{
+ if (refcount_inc_not_zero(&p->rcu_users))
+ return p;
+ return NULL;
+}
+
+/**
+ * bpf_task_release - Release the reference acquired on a task.
+ * @p: The task on which a reference is being released.
+ */
+__bpf_kfunc void bpf_task_release(struct task_struct *p)
+{
+ put_task_struct_rcu_user(p);
+}
+
+__bpf_kfunc void bpf_task_release_dtor(void *p)
+{
+ put_task_struct_rcu_user(p);
+}
+CFI_NOSEAL(bpf_task_release_dtor);
+
+#ifdef CONFIG_CGROUPS
+/**
+ * bpf_cgroup_acquire - Acquire a reference to a cgroup. A cgroup acquired by
+ * this kfunc which is not stored in a map as a kptr, must be released by
+ * calling bpf_cgroup_release().
+ * @cgrp: The cgroup on which a reference is being acquired.
+ */
+__bpf_kfunc struct cgroup *bpf_cgroup_acquire(struct cgroup *cgrp)
+{
+ return cgroup_tryget(cgrp) ? cgrp : NULL;
+}
+
+/**
+ * bpf_cgroup_release - Release the reference acquired on a cgroup.
+ * If this kfunc is invoked in an RCU read region, the cgroup is guaranteed to
+ * not be freed until the current grace period has ended, even if its refcount
+ * drops to 0.
+ * @cgrp: The cgroup on which a reference is being released.
+ */
+__bpf_kfunc void bpf_cgroup_release(struct cgroup *cgrp)
+{
+ cgroup_put(cgrp);
+}
+
+__bpf_kfunc void bpf_cgroup_release_dtor(void *cgrp)
+{
+ cgroup_put(cgrp);
+}
+CFI_NOSEAL(bpf_cgroup_release_dtor);
+
+/**
+ * bpf_cgroup_ancestor - Perform a lookup on an entry in a cgroup's ancestor
+ * array. A cgroup returned by this kfunc which is not subsequently stored in a
+ * map, must be released by calling bpf_cgroup_release().
+ * @cgrp: The cgroup for which we're performing a lookup.
+ * @level: The level of ancestor to look up.
+ */
+__bpf_kfunc struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level)
+{
+ struct cgroup *ancestor;
+
+ if (level > cgrp->level || level < 0)
+ return NULL;
+
+ /* cgrp's refcnt could be 0 here, but ancestors can still be accessed */
+ ancestor = cgrp->ancestors[level];
+ if (!cgroup_tryget(ancestor))
+ return NULL;
+ return ancestor;
+}
+
+/**
+ * bpf_cgroup_from_id - Find a cgroup from its ID. A cgroup returned by this
+ * kfunc which is not subsequently stored in a map, must be released by calling
+ * bpf_cgroup_release().
+ * @cgid: cgroup id.
+ */
+__bpf_kfunc struct cgroup *bpf_cgroup_from_id(u64 cgid)
+{
+ struct cgroup *cgrp;
+
+ cgrp = __cgroup_get_from_id(cgid);
+ if (IS_ERR(cgrp))
+ return NULL;
+ return cgrp;
+}
+
+/**
+ * bpf_task_under_cgroup - wrap task_under_cgroup_hierarchy() as a kfunc, test
+ * task's membership of cgroup ancestry.
+ * @task: the task to be tested
+ * @ancestor: possible ancestor of @task's cgroup
+ *
+ * Tests whether @task's default cgroup hierarchy is a descendant of @ancestor.
+ * It follows all the same rules as cgroup_is_descendant, and only applies
+ * to the default hierarchy.
+ */
+__bpf_kfunc long bpf_task_under_cgroup(struct task_struct *task,
+ struct cgroup *ancestor)
+{
+ long ret;
+
+ rcu_read_lock();
+ ret = task_under_cgroup_hierarchy(task, ancestor);
+ rcu_read_unlock();
+ return ret;
+}
+
+BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ struct cgroup *cgrp;
+
+ if (unlikely(idx >= array->map.max_entries))
+ return -E2BIG;
+
+ cgrp = READ_ONCE(array->ptrs[idx]);
+ if (unlikely(!cgrp))
+ return -EAGAIN;
+
+ return task_under_cgroup_hierarchy(current, cgrp);
+}
+
+const struct bpf_func_proto bpf_current_task_under_cgroup_proto = {
+ .func = bpf_current_task_under_cgroup,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_ANYTHING,
+};
+
+/**
+ * bpf_task_get_cgroup1 - Acquires the associated cgroup of a task within a
+ * specific cgroup1 hierarchy. The cgroup1 hierarchy is identified by its
+ * hierarchy ID.
+ * @task: The target task
+ * @hierarchy_id: The ID of a cgroup1 hierarchy
+ *
+ * On success, the cgroup is returen. On failure, NULL is returned.
+ */
+__bpf_kfunc struct cgroup *
+bpf_task_get_cgroup1(struct task_struct *task, int hierarchy_id)
+{
+ struct cgroup *cgrp = task_get_cgroup1(task, hierarchy_id);
+
+ if (IS_ERR(cgrp))
+ return NULL;
+ return cgrp;
+}
+#endif /* CONFIG_CGROUPS */
+
+/**
+ * bpf_task_from_pid - Find a struct task_struct from its pid by looking it up
+ * in the root pid namespace idr. If a task is returned, it must either be
+ * stored in a map, or released with bpf_task_release().
+ * @pid: The pid of the task being looked up.
+ */
+__bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid)
+{
+ struct task_struct *p;
+
+ rcu_read_lock();
+ p = find_task_by_pid_ns(pid, &init_pid_ns);
+ if (p)
+ p = bpf_task_acquire(p);
+ rcu_read_unlock();
+
+ return p;
+}
+
+/**
+ * bpf_task_from_vpid - Find a struct task_struct from its vpid by looking it up
+ * in the pid namespace of the current task. If a task is returned, it must
+ * either be stored in a map, or released with bpf_task_release().
+ * @vpid: The vpid of the task being looked up.
+ */
+__bpf_kfunc struct task_struct *bpf_task_from_vpid(s32 vpid)
+{
+ struct task_struct *p;
+
+ rcu_read_lock();
+ p = find_task_by_vpid(vpid);
+ if (p)
+ p = bpf_task_acquire(p);
+ rcu_read_unlock();
+
+ return p;
+}
+
+/**
+ * bpf_dynptr_slice() - Obtain a read-only pointer to the dynptr data.
+ * @p: The dynptr whose data slice to retrieve
+ * @offset: Offset into the dynptr
+ * @buffer__opt: User-provided buffer to copy contents into. May be NULL
+ * @buffer__szk: Size (in bytes) of the buffer if present. This is the
+ * length of the requested slice. This must be a constant.
+ *
+ * For non-skb and non-xdp type dynptrs, there is no difference between
+ * bpf_dynptr_slice and bpf_dynptr_data.
+ *
+ * If buffer__opt is NULL, the call will fail if buffer_opt was needed.
+ *
+ * If the intention is to write to the data slice, please use
+ * bpf_dynptr_slice_rdwr.
+ *
+ * The user must check that the returned pointer is not null before using it.
+ *
+ * Please note that in the case of skb and xdp dynptrs, bpf_dynptr_slice
+ * does not change the underlying packet data pointers, so a call to
+ * bpf_dynptr_slice will not invalidate any ctx->data/data_end pointers in
+ * the bpf program.
+ *
+ * Return: NULL if the call failed (eg invalid dynptr), pointer to a read-only
+ * data slice (can be either direct pointer to the data or a pointer to the user
+ * provided buffer, with its contents containing the data, if unable to obtain
+ * direct pointer)
+ */
+__bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u64 offset,
+ void *buffer__opt, u64 buffer__szk)
+{
+ const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
+ enum bpf_dynptr_type type;
+ u64 len = buffer__szk;
+ int err;
+
+ if (!ptr->data)
+ return NULL;
+
+ err = bpf_dynptr_check_off_len(ptr, offset, len);
+ if (err)
+ return NULL;
+
+ type = bpf_dynptr_get_type(ptr);
+
+ switch (type) {
+ case BPF_DYNPTR_TYPE_LOCAL:
+ case BPF_DYNPTR_TYPE_RINGBUF:
+ return ptr->data + ptr->offset + offset;
+ case BPF_DYNPTR_TYPE_SKB:
+ if (buffer__opt)
+ return skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer__opt);
+ else
+ return skb_pointer_if_linear(ptr->data, ptr->offset + offset, len);
+ case BPF_DYNPTR_TYPE_XDP:
+ {
+ void *xdp_ptr = bpf_xdp_pointer(ptr->data, ptr->offset + offset, len);
+ if (!IS_ERR_OR_NULL(xdp_ptr))
+ return xdp_ptr;
+
+ if (!buffer__opt)
+ return NULL;
+ bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer__opt, len, false);
+ return buffer__opt;
+ }
+ case BPF_DYNPTR_TYPE_SKB_META:
+ return bpf_skb_meta_pointer(ptr->data, ptr->offset + offset);
+ case BPF_DYNPTR_TYPE_FILE:
+ err = bpf_file_fetch_bytes(ptr->data, offset, buffer__opt, buffer__szk);
+ return err ? NULL : buffer__opt;
+ default:
+ WARN_ONCE(true, "unknown dynptr type %d\n", type);
+ return NULL;
+ }
+}
+
+/**
+ * bpf_dynptr_slice_rdwr() - Obtain a writable pointer to the dynptr data.
+ * @p: The dynptr whose data slice to retrieve
+ * @offset: Offset into the dynptr
+ * @buffer__opt: User-provided buffer to copy contents into. May be NULL
+ * @buffer__szk: Size (in bytes) of the buffer if present. This is the
+ * length of the requested slice. This must be a constant.
+ *
+ * For non-skb and non-xdp type dynptrs, there is no difference between
+ * bpf_dynptr_slice and bpf_dynptr_data.
+ *
+ * If buffer__opt is NULL, the call will fail if buffer_opt was needed.
+ *
+ * The returned pointer is writable and may point to either directly the dynptr
+ * data at the requested offset or to the buffer if unable to obtain a direct
+ * data pointer to (example: the requested slice is to the paged area of an skb
+ * packet). In the case where the returned pointer is to the buffer, the user
+ * is responsible for persisting writes through calling bpf_dynptr_write(). This
+ * usually looks something like this pattern:
+ *
+ * struct eth_hdr *eth = bpf_dynptr_slice_rdwr(&dynptr, 0, buffer, sizeof(buffer));
+ * if (!eth)
+ * return TC_ACT_SHOT;
+ *
+ * // mutate eth header //
+ *
+ * if (eth == buffer)
+ * bpf_dynptr_write(&ptr, 0, buffer, sizeof(buffer), 0);
+ *
+ * Please note that, as in the example above, the user must check that the
+ * returned pointer is not null before using it.
+ *
+ * Please also note that in the case of skb and xdp dynptrs, bpf_dynptr_slice_rdwr
+ * does not change the underlying packet data pointers, so a call to
+ * bpf_dynptr_slice_rdwr will not invalidate any ctx->data/data_end pointers in
+ * the bpf program.
+ *
+ * Return: NULL if the call failed (eg invalid dynptr), pointer to a
+ * data slice (can be either direct pointer to the data or a pointer to the user
+ * provided buffer, with its contents containing the data, if unable to obtain
+ * direct pointer)
+ */
+__bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u64 offset,
+ void *buffer__opt, u64 buffer__szk)
+{
+ const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
+
+ if (!ptr->data || __bpf_dynptr_is_rdonly(ptr))
+ return NULL;
+
+ /* bpf_dynptr_slice_rdwr is the same logic as bpf_dynptr_slice.
+ *
+ * For skb-type dynptrs, it is safe to write into the returned pointer
+ * if the bpf program allows skb data writes. There are two possibilities
+ * that may occur when calling bpf_dynptr_slice_rdwr:
+ *
+ * 1) The requested slice is in the head of the skb. In this case, the
+ * returned pointer is directly to skb data, and if the skb is cloned, the
+ * verifier will have uncloned it (see bpf_unclone_prologue()) already.
+ * The pointer can be directly written into.
+ *
+ * 2) Some portion of the requested slice is in the paged buffer area.
+ * In this case, the requested data will be copied out into the buffer
+ * and the returned pointer will be a pointer to the buffer. The skb
+ * will not be pulled. To persist the write, the user will need to call
+ * bpf_dynptr_write(), which will pull the skb and commit the write.
+ *
+ * Similarly for xdp programs, if the requested slice is not across xdp
+ * fragments, then a direct pointer will be returned, otherwise the data
+ * will be copied out into the buffer and the user will need to call
+ * bpf_dynptr_write() to commit changes.
+ */
+ return bpf_dynptr_slice(p, offset, buffer__opt, buffer__szk);
+}
+
+__bpf_kfunc int bpf_dynptr_adjust(const struct bpf_dynptr *p, u64 start, u64 end)
+{
+ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
+ u64 size;
+
+ if (!ptr->data || start > end)
+ return -EINVAL;
+
+ size = __bpf_dynptr_size(ptr);
+
+ if (start > size || end > size)
+ return -ERANGE;
+
+ bpf_dynptr_advance_offset(ptr, start);
+ bpf_dynptr_set_size(ptr, end - start);
+
+ return 0;
+}
+
+__bpf_kfunc bool bpf_dynptr_is_null(const struct bpf_dynptr *p)
+{
+ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
+
+ return !ptr->data;
+}
+
+__bpf_kfunc bool bpf_dynptr_is_rdonly(const struct bpf_dynptr *p)
+{
+ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
+
+ if (!ptr->data)
+ return false;
+
+ return __bpf_dynptr_is_rdonly(ptr);
+}
+
+__bpf_kfunc u64 bpf_dynptr_size(const struct bpf_dynptr *p)
+{
+ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
+
+ if (!ptr->data)
+ return -EINVAL;
+
+ return __bpf_dynptr_size(ptr);
+}
+
+__bpf_kfunc int bpf_dynptr_clone(const struct bpf_dynptr *p,
+ struct bpf_dynptr *clone__uninit)
+{
+ struct bpf_dynptr_kern *clone = (struct bpf_dynptr_kern *)clone__uninit;
+ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
+
+ if (!ptr->data) {
+ bpf_dynptr_set_null(clone);
+ return -EINVAL;
+ }
+
+ *clone = *ptr;
+
+ return 0;
+}
+
+/**
+ * bpf_dynptr_copy() - Copy data from one dynptr to another.
+ * @dst_ptr: Destination dynptr - where data should be copied to
+ * @dst_off: Offset into the destination dynptr
+ * @src_ptr: Source dynptr - where data should be copied from
+ * @src_off: Offset into the source dynptr
+ * @size: Length of the data to copy from source to destination
+ *
+ * Copies data from source dynptr to destination dynptr.
+ * Returns 0 on success; negative error, otherwise.
+ */
+__bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u64 dst_off,
+ struct bpf_dynptr *src_ptr, u64 src_off, u64 size)
+{
+ struct bpf_dynptr_kern *dst = (struct bpf_dynptr_kern *)dst_ptr;
+ struct bpf_dynptr_kern *src = (struct bpf_dynptr_kern *)src_ptr;
+ void *src_slice, *dst_slice;
+ char buf[256];
+ u64 off;
+
+ src_slice = bpf_dynptr_slice(src_ptr, src_off, NULL, size);
+ dst_slice = bpf_dynptr_slice_rdwr(dst_ptr, dst_off, NULL, size);
+
+ if (src_slice && dst_slice) {
+ memmove(dst_slice, src_slice, size);
+ return 0;
+ }
+
+ if (src_slice)
+ return __bpf_dynptr_write(dst, dst_off, src_slice, size, 0);
+
+ if (dst_slice)
+ return __bpf_dynptr_read(dst_slice, size, src, src_off, 0);
+
+ if (bpf_dynptr_check_off_len(dst, dst_off, size) ||
+ bpf_dynptr_check_off_len(src, src_off, size))
+ return -E2BIG;
+
+ off = 0;
+ while (off < size) {
+ u64 chunk_sz = min_t(u64, sizeof(buf), size - off);
+ int err;
+
+ err = __bpf_dynptr_read(buf, chunk_sz, src, src_off + off, 0);
+ if (err)
+ return err;
+ err = __bpf_dynptr_write(dst, dst_off + off, buf, chunk_sz, 0);
+ if (err)
+ return err;
+
+ off += chunk_sz;
+ }
+ return 0;
+}
+
+/**
+ * bpf_dynptr_memset() - Fill dynptr memory with a constant byte.
+ * @p: Destination dynptr - where data will be filled
+ * @offset: Offset into the dynptr to start filling from
+ * @size: Number of bytes to fill
+ * @val: Constant byte to fill the memory with
+ *
+ * Fills the @size bytes of the memory area pointed to by @p
+ * at @offset with the constant byte @val.
+ * Returns 0 on success; negative error, otherwise.
+ */
+__bpf_kfunc int bpf_dynptr_memset(struct bpf_dynptr *p, u64 offset, u64 size, u8 val)
+{
+ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
+ u64 chunk_sz, write_off;
+ char buf[256];
+ void* slice;
+ int err;
+
+ slice = bpf_dynptr_slice_rdwr(p, offset, NULL, size);
+ if (likely(slice)) {
+ memset(slice, val, size);
+ return 0;
+ }
+
+ if (__bpf_dynptr_is_rdonly(ptr))
+ return -EINVAL;
+
+ err = bpf_dynptr_check_off_len(ptr, offset, size);
+ if (err)
+ return err;
+
+ /* Non-linear data under the dynptr, write from a local buffer */
+ chunk_sz = min_t(u64, sizeof(buf), size);
+ memset(buf, val, chunk_sz);
+
+ for (write_off = 0; write_off < size; write_off += chunk_sz) {
+ chunk_sz = min_t(u64, sizeof(buf), size - write_off);
+ err = __bpf_dynptr_write(ptr, offset + write_off, buf, chunk_sz, 0);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+__bpf_kfunc void *bpf_cast_to_kern_ctx(void *obj)
+{
+ return obj;
+}
+
+__bpf_kfunc void *bpf_rdonly_cast(const void *obj__ign, u32 btf_id__k)
+{
+ return (void *)obj__ign;
+}
+
+__bpf_kfunc void bpf_rcu_read_lock(void)
+{
+ rcu_read_lock();
+}
+
+__bpf_kfunc void bpf_rcu_read_unlock(void)
+{
+ rcu_read_unlock();
+}
+
+struct bpf_throw_ctx {
+ struct bpf_prog_aux *aux;
+ u64 sp;
+ u64 bp;
+ int cnt;
+};
+
+static bool bpf_stack_walker(void *cookie, u64 ip, u64 sp, u64 bp)
+{
+ struct bpf_throw_ctx *ctx = cookie;
+ struct bpf_prog *prog;
+
+ /*
+ * The RCU read lock is held to safely traverse the latch tree, but we
+ * don't need its protection when accessing the prog, since it has an
+ * active stack frame on the current stack trace, and won't disappear.
+ */
+ rcu_read_lock();
+ prog = bpf_prog_ksym_find(ip);
+ rcu_read_unlock();
+ if (!prog)
+ return !ctx->cnt;
+ ctx->cnt++;
+ if (bpf_is_subprog(prog))
+ return true;
+ ctx->aux = prog->aux;
+ ctx->sp = sp;
+ ctx->bp = bp;
+ return false;
+}
+
+__bpf_kfunc void bpf_throw(u64 cookie)
+{
+ struct bpf_throw_ctx ctx = {};
+
+ arch_bpf_stack_walk(bpf_stack_walker, &ctx);
+ WARN_ON_ONCE(!ctx.aux);
+ if (ctx.aux)
+ WARN_ON_ONCE(!ctx.aux->exception_boundary);
+ WARN_ON_ONCE(!ctx.bp);
+ WARN_ON_ONCE(!ctx.cnt);
+ /* Prevent KASAN false positives for CONFIG_KASAN_STACK by unpoisoning
+ * deeper stack depths than ctx.sp as we do not return from bpf_throw,
+ * which skips compiler generated instrumentation to do the same.
+ */
+ kasan_unpoison_task_stack_below((void *)(long)ctx.sp);
+ ctx.aux->bpf_exception_cb(cookie, ctx.sp, ctx.bp, 0, 0);
+ WARN(1, "A call to BPF exception callback should never return\n");
+}
+
+__bpf_kfunc int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags)
+{
+ struct bpf_async_kern *async = (struct bpf_async_kern *)wq;
+ struct bpf_map *map = p__map;
+
+ BUILD_BUG_ON(sizeof(struct bpf_async_kern) > sizeof(struct bpf_wq));
+ BUILD_BUG_ON(__alignof__(struct bpf_async_kern) != __alignof__(struct bpf_wq));
+
+ if (flags)
+ return -EINVAL;
+
+ return __bpf_async_init(async, map, flags, BPF_ASYNC_TYPE_WQ);
+}
+
+__bpf_kfunc int bpf_wq_start(struct bpf_wq *wq, unsigned int flags)
+{
+ struct bpf_async_kern *async = (struct bpf_async_kern *)wq;
+ struct bpf_work *w;
+
+ if (in_nmi())
+ return -EOPNOTSUPP;
+ if (flags)
+ return -EINVAL;
+ w = READ_ONCE(async->work);
+ if (!w || !READ_ONCE(w->cb.prog))
+ return -EINVAL;
+
+ schedule_work(&w->work);
+ return 0;
+}
+
+__bpf_kfunc int bpf_wq_set_callback_impl(struct bpf_wq *wq,
+ int (callback_fn)(void *map, int *key, void *value),
+ unsigned int flags,
+ void *aux__prog)
+{
+ struct bpf_prog_aux *aux = (struct bpf_prog_aux *)aux__prog;
+ struct bpf_async_kern *async = (struct bpf_async_kern *)wq;
+
+ if (flags)
+ return -EINVAL;
+
+ return __bpf_async_set_callback(async, callback_fn, aux, flags, BPF_ASYNC_TYPE_WQ);
+}
+
+__bpf_kfunc void bpf_preempt_disable(void)
+{
+ preempt_disable();
+}
+
+__bpf_kfunc void bpf_preempt_enable(void)
+{
+ preempt_enable();
+}
+
+struct bpf_iter_bits {
+ __u64 __opaque[2];
+} __aligned(8);
+
+#define BITS_ITER_NR_WORDS_MAX 511
+
+struct bpf_iter_bits_kern {
+ union {
+ __u64 *bits;
+ __u64 bits_copy;
+ };
+ int nr_bits;
+ int bit;
+} __aligned(8);
+
+/* On 64-bit hosts, unsigned long and u64 have the same size, so passing
+ * a u64 pointer and an unsigned long pointer to find_next_bit() will
+ * return the same result, as both point to the same 8-byte area.
+ *
+ * For 32-bit little-endian hosts, using a u64 pointer or unsigned long
+ * pointer also makes no difference. This is because the first iterated
+ * unsigned long is composed of bits 0-31 of the u64 and the second unsigned
+ * long is composed of bits 32-63 of the u64.
+ *
+ * However, for 32-bit big-endian hosts, this is not the case. The first
+ * iterated unsigned long will be bits 32-63 of the u64, so swap these two
+ * ulong values within the u64.
+ */
+static void swap_ulong_in_u64(u64 *bits, unsigned int nr)
+{
+#if (BITS_PER_LONG == 32) && defined(__BIG_ENDIAN)
+ unsigned int i;
+
+ for (i = 0; i < nr; i++)
+ bits[i] = (bits[i] >> 32) | ((u64)(u32)bits[i] << 32);
+#endif
+}
+
+/**
+ * bpf_iter_bits_new() - Initialize a new bits iterator for a given memory area
+ * @it: The new bpf_iter_bits to be created
+ * @unsafe_ptr__ign: A pointer pointing to a memory area to be iterated over
+ * @nr_words: The size of the specified memory area, measured in 8-byte units.
+ * The maximum value of @nr_words is @BITS_ITER_NR_WORDS_MAX. This limit may be
+ * further reduced by the BPF memory allocator implementation.
+ *
+ * This function initializes a new bpf_iter_bits structure for iterating over
+ * a memory area which is specified by the @unsafe_ptr__ign and @nr_words. It
+ * copies the data of the memory area to the newly created bpf_iter_bits @it for
+ * subsequent iteration operations.
+ *
+ * On success, 0 is returned. On failure, ERR is returned.
+ */
+__bpf_kfunc int
+bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_words)
+{
+ struct bpf_iter_bits_kern *kit = (void *)it;
+ u32 nr_bytes = nr_words * sizeof(u64);
+ u32 nr_bits = BYTES_TO_BITS(nr_bytes);
+ int err;
+
+ BUILD_BUG_ON(sizeof(struct bpf_iter_bits_kern) != sizeof(struct bpf_iter_bits));
+ BUILD_BUG_ON(__alignof__(struct bpf_iter_bits_kern) !=
+ __alignof__(struct bpf_iter_bits));
+
+ kit->nr_bits = 0;
+ kit->bits_copy = 0;
+ kit->bit = -1;
+
+ if (!unsafe_ptr__ign || !nr_words)
+ return -EINVAL;
+ if (nr_words > BITS_ITER_NR_WORDS_MAX)
+ return -E2BIG;
+
+ /* Optimization for u64 mask */
+ if (nr_bits == 64) {
+ err = bpf_probe_read_kernel_common(&kit->bits_copy, nr_bytes, unsafe_ptr__ign);
+ if (err)
+ return -EFAULT;
+
+ swap_ulong_in_u64(&kit->bits_copy, nr_words);
+
+ kit->nr_bits = nr_bits;
+ return 0;
+ }
+
+ if (bpf_mem_alloc_check_size(false, nr_bytes))
+ return -E2BIG;
+
+ /* Fallback to memalloc */
+ kit->bits = bpf_mem_alloc(&bpf_global_ma, nr_bytes);
+ if (!kit->bits)
+ return -ENOMEM;
+
+ err = bpf_probe_read_kernel_common(kit->bits, nr_bytes, unsafe_ptr__ign);
+ if (err) {
+ bpf_mem_free(&bpf_global_ma, kit->bits);
+ return err;
+ }
+
+ swap_ulong_in_u64(kit->bits, nr_words);
+
+ kit->nr_bits = nr_bits;
+ return 0;
+}
+
+/**
+ * bpf_iter_bits_next() - Get the next bit in a bpf_iter_bits
+ * @it: The bpf_iter_bits to be checked
+ *
+ * This function returns a pointer to a number representing the value of the
+ * next bit in the bits.
+ *
+ * If there are no further bits available, it returns NULL.
+ */
+__bpf_kfunc int *bpf_iter_bits_next(struct bpf_iter_bits *it)
+{
+ struct bpf_iter_bits_kern *kit = (void *)it;
+ int bit = kit->bit, nr_bits = kit->nr_bits;
+ const void *bits;
+
+ if (!nr_bits || bit >= nr_bits)
+ return NULL;
+
+ bits = nr_bits == 64 ? &kit->bits_copy : kit->bits;
+ bit = find_next_bit(bits, nr_bits, bit + 1);
+ if (bit >= nr_bits) {
+ kit->bit = bit;
+ return NULL;
+ }
+
+ kit->bit = bit;
+ return &kit->bit;
+}
+
+/**
+ * bpf_iter_bits_destroy() - Destroy a bpf_iter_bits
+ * @it: The bpf_iter_bits to be destroyed
+ *
+ * Destroy the resource associated with the bpf_iter_bits.
+ */
+__bpf_kfunc void bpf_iter_bits_destroy(struct bpf_iter_bits *it)
+{
+ struct bpf_iter_bits_kern *kit = (void *)it;
+
+ if (kit->nr_bits <= 64)
+ return;
+ bpf_mem_free(&bpf_global_ma, kit->bits);
+}
+
+/**
+ * bpf_copy_from_user_str() - Copy a string from an unsafe user address
+ * @dst: Destination address, in kernel space. This buffer must be
+ * at least @dst__sz bytes long.
+ * @dst__sz: Maximum number of bytes to copy, includes the trailing NUL.
+ * @unsafe_ptr__ign: Source address, in user space.
+ * @flags: The only supported flag is BPF_F_PAD_ZEROS
+ *
+ * Copies a NUL-terminated string from userspace to BPF space. If user string is
+ * too long this will still ensure zero termination in the dst buffer unless
+ * buffer size is 0.
+ *
+ * If BPF_F_PAD_ZEROS flag is set, memset the tail of @dst to 0 on success and
+ * memset all of @dst on failure.
+ */
+__bpf_kfunc int bpf_copy_from_user_str(void *dst, u32 dst__sz, const void __user *unsafe_ptr__ign, u64 flags)
+{
+ int ret;
+
+ if (unlikely(flags & ~BPF_F_PAD_ZEROS))
+ return -EINVAL;
+
+ if (unlikely(!dst__sz))
+ return 0;
+
+ ret = strncpy_from_user(dst, unsafe_ptr__ign, dst__sz - 1);
+ if (ret < 0) {
+ if (flags & BPF_F_PAD_ZEROS)
+ memset((char *)dst, 0, dst__sz);
+
+ return ret;
+ }
+
+ if (flags & BPF_F_PAD_ZEROS)
+ memset((char *)dst + ret, 0, dst__sz - ret);
+ else
+ ((char *)dst)[ret] = '\0';
+
+ return ret + 1;
+}
+
+/**
+ * bpf_copy_from_user_task_str() - Copy a string from an task's address space
+ * @dst: Destination address, in kernel space. This buffer must be
+ * at least @dst__sz bytes long.
+ * @dst__sz: Maximum number of bytes to copy, includes the trailing NUL.
+ * @unsafe_ptr__ign: Source address in the task's address space.
+ * @tsk: The task whose address space will be used
+ * @flags: The only supported flag is BPF_F_PAD_ZEROS
+ *
+ * Copies a NUL terminated string from a task's address space to @dst__sz
+ * buffer. If user string is too long this will still ensure zero termination
+ * in the @dst__sz buffer unless buffer size is 0.
+ *
+ * If BPF_F_PAD_ZEROS flag is set, memset the tail of @dst__sz to 0 on success
+ * and memset all of @dst__sz on failure.
+ *
+ * Return: The number of copied bytes on success including the NUL terminator.
+ * A negative error code on failure.
+ */
+__bpf_kfunc int bpf_copy_from_user_task_str(void *dst, u32 dst__sz,
+ const void __user *unsafe_ptr__ign,
+ struct task_struct *tsk, u64 flags)
+{
+ int ret;
+
+ if (unlikely(flags & ~BPF_F_PAD_ZEROS))
+ return -EINVAL;
+
+ if (unlikely(dst__sz == 0))
+ return 0;
+
+ ret = copy_remote_vm_str(tsk, (unsigned long)unsafe_ptr__ign, dst, dst__sz, 0);
+ if (ret < 0) {
+ if (flags & BPF_F_PAD_ZEROS)
+ memset(dst, 0, dst__sz);
+ return ret;
+ }
+
+ if (flags & BPF_F_PAD_ZEROS)
+ memset(dst + ret, 0, dst__sz - ret);
+
+ return ret + 1;
+}
+
+/* Keep unsinged long in prototype so that kfunc is usable when emitted to
+ * vmlinux.h in BPF programs directly, but note that while in BPF prog, the
+ * unsigned long always points to 8-byte region on stack, the kernel may only
+ * read and write the 4-bytes on 32-bit.
+ */
+__bpf_kfunc void bpf_local_irq_save(unsigned long *flags__irq_flag)
+{
+ local_irq_save(*flags__irq_flag);
+}
+
+__bpf_kfunc void bpf_local_irq_restore(unsigned long *flags__irq_flag)
+{
+ local_irq_restore(*flags__irq_flag);
+}
+
+__bpf_kfunc void __bpf_trap(void)
+{
+}
+
+/*
+ * Kfuncs for string operations.
+ *
+ * Since strings are not necessarily %NUL-terminated, we cannot directly call
+ * in-kernel implementations. Instead, we open-code the implementations using
+ * __get_kernel_nofault instead of plain dereference to make them safe.
+ */
+
+static int __bpf_strcasecmp(const char *s1, const char *s2, bool ignore_case)
+{
+ char c1, c2;
+ int i;
+
+ if (!copy_from_kernel_nofault_allowed(s1, 1) ||
+ !copy_from_kernel_nofault_allowed(s2, 1)) {
+ return -ERANGE;
+ }
+
+ guard(pagefault)();
+ for (i = 0; i < XATTR_SIZE_MAX; i++) {
+ __get_kernel_nofault(&c1, s1, char, err_out);
+ __get_kernel_nofault(&c2, s2, char, err_out);
+ if (ignore_case) {
+ c1 = tolower(c1);
+ c2 = tolower(c2);
+ }
+ if (c1 != c2)
+ return c1 < c2 ? -1 : 1;
+ if (c1 == '\0')
+ return 0;
+ s1++;
+ s2++;
+ }
+ return -E2BIG;
+err_out:
+ return -EFAULT;
+}
+
+/**
+ * bpf_strcmp - Compare two strings
+ * @s1__ign: One string
+ * @s2__ign: Another string
+ *
+ * Return:
+ * * %0 - Strings are equal
+ * * %-1 - @s1__ign is smaller
+ * * %1 - @s2__ign is smaller
+ * * %-EFAULT - Cannot read one of the strings
+ * * %-E2BIG - One of strings is too large
+ * * %-ERANGE - One of strings is outside of kernel address space
+ */
+__bpf_kfunc int bpf_strcmp(const char *s1__ign, const char *s2__ign)
+{
+ return __bpf_strcasecmp(s1__ign, s2__ign, false);
+}
+
+/**
+ * bpf_strcasecmp - Compare two strings, ignoring the case of the characters
+ * @s1__ign: One string
+ * @s2__ign: Another string
+ *
+ * Return:
+ * * %0 - Strings are equal
+ * * %-1 - @s1__ign is smaller
+ * * %1 - @s2__ign is smaller
+ * * %-EFAULT - Cannot read one of the strings
+ * * %-E2BIG - One of strings is too large
+ * * %-ERANGE - One of strings is outside of kernel address space
+ */
+__bpf_kfunc int bpf_strcasecmp(const char *s1__ign, const char *s2__ign)
+{
+ return __bpf_strcasecmp(s1__ign, s2__ign, true);
+}
+
+/**
+ * bpf_strnchr - Find a character in a length limited string
+ * @s__ign: The string to be searched
+ * @count: The number of characters to be searched
+ * @c: The character to search for
+ *
+ * Note that the %NUL-terminator is considered part of the string, and can
+ * be searched for.
+ *
+ * Return:
+ * * >=0 - Index of the first occurrence of @c within @s__ign
+ * * %-ENOENT - @c not found in the first @count characters of @s__ign
+ * * %-EFAULT - Cannot read @s__ign
+ * * %-E2BIG - @s__ign is too large
+ * * %-ERANGE - @s__ign is outside of kernel address space
+ */
+__bpf_kfunc int bpf_strnchr(const char *s__ign, size_t count, char c)
+{
+ char sc;
+ int i;
+
+ if (!copy_from_kernel_nofault_allowed(s__ign, 1))
+ return -ERANGE;
+
+ guard(pagefault)();
+ for (i = 0; i < count && i < XATTR_SIZE_MAX; i++) {
+ __get_kernel_nofault(&sc, s__ign, char, err_out);
+ if (sc == c)
+ return i;
+ if (sc == '\0')
+ return -ENOENT;
+ s__ign++;
+ }
+ return i == XATTR_SIZE_MAX ? -E2BIG : -ENOENT;
+err_out:
+ return -EFAULT;
+}
+
+/**
+ * bpf_strchr - Find the first occurrence of a character in a string
+ * @s__ign: The string to be searched
+ * @c: The character to search for
+ *
+ * Note that the %NUL-terminator is considered part of the string, and can
+ * be searched for.
+ *
+ * Return:
+ * * >=0 - The index of the first occurrence of @c within @s__ign
+ * * %-ENOENT - @c not found in @s__ign
+ * * %-EFAULT - Cannot read @s__ign
+ * * %-E2BIG - @s__ign is too large
+ * * %-ERANGE - @s__ign is outside of kernel address space
+ */
+__bpf_kfunc int bpf_strchr(const char *s__ign, char c)
+{
+ return bpf_strnchr(s__ign, XATTR_SIZE_MAX, c);
+}
+
+/**
+ * bpf_strchrnul - Find and return a character in a string, or end of string
+ * @s__ign: The string to be searched
+ * @c: The character to search for
+ *
+ * Return:
+ * * >=0 - Index of the first occurrence of @c within @s__ign or index of
+ * the null byte at the end of @s__ign when @c is not found
+ * * %-EFAULT - Cannot read @s__ign
+ * * %-E2BIG - @s__ign is too large
+ * * %-ERANGE - @s__ign is outside of kernel address space
+ */
+__bpf_kfunc int bpf_strchrnul(const char *s__ign, char c)
+{
+ char sc;
+ int i;
+
+ if (!copy_from_kernel_nofault_allowed(s__ign, 1))
+ return -ERANGE;
+
+ guard(pagefault)();
+ for (i = 0; i < XATTR_SIZE_MAX; i++) {
+ __get_kernel_nofault(&sc, s__ign, char, err_out);
+ if (sc == '\0' || sc == c)
+ return i;
+ s__ign++;
+ }
+ return -E2BIG;
+err_out:
+ return -EFAULT;
+}
+
+/**
+ * bpf_strrchr - Find the last occurrence of a character in a string
+ * @s__ign: The string to be searched
+ * @c: The character to search for
+ *
+ * Return:
+ * * >=0 - Index of the last occurrence of @c within @s__ign
+ * * %-ENOENT - @c not found in @s__ign
+ * * %-EFAULT - Cannot read @s__ign
+ * * %-E2BIG - @s__ign is too large
+ * * %-ERANGE - @s__ign is outside of kernel address space
+ */
+__bpf_kfunc int bpf_strrchr(const char *s__ign, int c)
+{
+ char sc;
+ int i, last = -ENOENT;
+
+ if (!copy_from_kernel_nofault_allowed(s__ign, 1))
+ return -ERANGE;
+
+ guard(pagefault)();
+ for (i = 0; i < XATTR_SIZE_MAX; i++) {
+ __get_kernel_nofault(&sc, s__ign, char, err_out);
+ if (sc == c)
+ last = i;
+ if (sc == '\0')
+ return last;
+ s__ign++;
+ }
+ return -E2BIG;
+err_out:
+ return -EFAULT;
+}
+
+/**
+ * bpf_strnlen - Calculate the length of a length-limited string
+ * @s__ign: The string
+ * @count: The maximum number of characters to count
+ *
+ * Return:
+ * * >=0 - The length of @s__ign
+ * * %-EFAULT - Cannot read @s__ign
+ * * %-E2BIG - @s__ign is too large
+ * * %-ERANGE - @s__ign is outside of kernel address space
+ */
+__bpf_kfunc int bpf_strnlen(const char *s__ign, size_t count)
+{
+ char c;
+ int i;
+
+ if (!copy_from_kernel_nofault_allowed(s__ign, 1))
+ return -ERANGE;
+
+ guard(pagefault)();
+ for (i = 0; i < count && i < XATTR_SIZE_MAX; i++) {
+ __get_kernel_nofault(&c, s__ign, char, err_out);
+ if (c == '\0')
+ return i;
+ s__ign++;
+ }
+ return i == XATTR_SIZE_MAX ? -E2BIG : i;
+err_out:
+ return -EFAULT;
+}
+
+/**
+ * bpf_strlen - Calculate the length of a string
+ * @s__ign: The string
+ *
+ * Return:
+ * * >=0 - The length of @s__ign
+ * * %-EFAULT - Cannot read @s__ign
+ * * %-E2BIG - @s__ign is too large
+ * * %-ERANGE - @s__ign is outside of kernel address space
+ */
+__bpf_kfunc int bpf_strlen(const char *s__ign)
+{
+ return bpf_strnlen(s__ign, XATTR_SIZE_MAX);
+}
+
+/**
+ * bpf_strspn - Calculate the length of the initial substring of @s__ign which
+ * only contains letters in @accept__ign
+ * @s__ign: The string to be searched
+ * @accept__ign: The string to search for
+ *
+ * Return:
+ * * >=0 - The length of the initial substring of @s__ign which only
+ * contains letters from @accept__ign
+ * * %-EFAULT - Cannot read one of the strings
+ * * %-E2BIG - One of the strings is too large
+ * * %-ERANGE - One of the strings is outside of kernel address space
+ */
+__bpf_kfunc int bpf_strspn(const char *s__ign, const char *accept__ign)
+{
+ char cs, ca;
+ int i, j;
+
+ if (!copy_from_kernel_nofault_allowed(s__ign, 1) ||
+ !copy_from_kernel_nofault_allowed(accept__ign, 1)) {
+ return -ERANGE;
+ }
+
+ guard(pagefault)();
+ for (i = 0; i < XATTR_SIZE_MAX; i++) {
+ __get_kernel_nofault(&cs, s__ign, char, err_out);
+ if (cs == '\0')
+ return i;
+ for (j = 0; j < XATTR_SIZE_MAX; j++) {
+ __get_kernel_nofault(&ca, accept__ign + j, char, err_out);
+ if (cs == ca || ca == '\0')
+ break;
+ }
+ if (j == XATTR_SIZE_MAX)
+ return -E2BIG;
+ if (ca == '\0')
+ return i;
+ s__ign++;
+ }
+ return -E2BIG;
+err_out:
+ return -EFAULT;
+}
+
+/**
+ * bpf_strcspn - Calculate the length of the initial substring of @s__ign which
+ * does not contain letters in @reject__ign
+ * @s__ign: The string to be searched
+ * @reject__ign: The string to search for
+ *
+ * Return:
+ * * >=0 - The length of the initial substring of @s__ign which does not
+ * contain letters from @reject__ign
+ * * %-EFAULT - Cannot read one of the strings
+ * * %-E2BIG - One of the strings is too large
+ * * %-ERANGE - One of the strings is outside of kernel address space
+ */
+__bpf_kfunc int bpf_strcspn(const char *s__ign, const char *reject__ign)
+{
+ char cs, cr;
+ int i, j;
+
+ if (!copy_from_kernel_nofault_allowed(s__ign, 1) ||
+ !copy_from_kernel_nofault_allowed(reject__ign, 1)) {
+ return -ERANGE;
+ }
+
+ guard(pagefault)();
+ for (i = 0; i < XATTR_SIZE_MAX; i++) {
+ __get_kernel_nofault(&cs, s__ign, char, err_out);
+ if (cs == '\0')
+ return i;
+ for (j = 0; j < XATTR_SIZE_MAX; j++) {
+ __get_kernel_nofault(&cr, reject__ign + j, char, err_out);
+ if (cs == cr || cr == '\0')
+ break;
+ }
+ if (j == XATTR_SIZE_MAX)
+ return -E2BIG;
+ if (cr != '\0')
+ return i;
+ s__ign++;
+ }
+ return -E2BIG;
+err_out:
+ return -EFAULT;
+}
+
+static int __bpf_strnstr(const char *s1, const char *s2, size_t len,
+ bool ignore_case)
+{
+ char c1, c2;
+ int i, j;
+
+ if (!copy_from_kernel_nofault_allowed(s1, 1) ||
+ !copy_from_kernel_nofault_allowed(s2, 1)) {
+ return -ERANGE;
+ }
+
+ guard(pagefault)();
+ for (i = 0; i < XATTR_SIZE_MAX; i++) {
+ for (j = 0; i + j <= len && j < XATTR_SIZE_MAX; j++) {
+ __get_kernel_nofault(&c2, s2 + j, char, err_out);
+ if (c2 == '\0')
+ return i;
+ /*
+ * We allow reading an extra byte from s2 (note the
+ * `i + j <= len` above) to cover the case when s2 is
+ * a suffix of the first len chars of s1.
+ */
+ if (i + j == len)
+ break;
+ __get_kernel_nofault(&c1, s1 + j, char, err_out);
+
+ if (ignore_case) {
+ c1 = tolower(c1);
+ c2 = tolower(c2);
+ }
+
+ if (c1 == '\0')
+ return -ENOENT;
+ if (c1 != c2)
+ break;
+ }
+ if (j == XATTR_SIZE_MAX)
+ return -E2BIG;
+ if (i + j == len)
+ return -ENOENT;
+ s1++;
+ }
+ return -E2BIG;
+err_out:
+ return -EFAULT;
+}
+
+/**
+ * bpf_strstr - Find the first substring in a string
+ * @s1__ign: The string to be searched
+ * @s2__ign: The string to search for
+ *
+ * Return:
+ * * >=0 - Index of the first character of the first occurrence of @s2__ign
+ * within @s1__ign
+ * * %-ENOENT - @s2__ign is not a substring of @s1__ign
+ * * %-EFAULT - Cannot read one of the strings
+ * * %-E2BIG - One of the strings is too large
+ * * %-ERANGE - One of the strings is outside of kernel address space
+ */
+__bpf_kfunc int bpf_strstr(const char *s1__ign, const char *s2__ign)
+{
+ return __bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX, false);
+}
+
+/**
+ * bpf_strcasestr - Find the first substring in a string, ignoring the case of
+ * the characters
+ * @s1__ign: The string to be searched
+ * @s2__ign: The string to search for
+ *
+ * Return:
+ * * >=0 - Index of the first character of the first occurrence of @s2__ign
+ * within @s1__ign
+ * * %-ENOENT - @s2__ign is not a substring of @s1__ign
+ * * %-EFAULT - Cannot read one of the strings
+ * * %-E2BIG - One of the strings is too large
+ * * %-ERANGE - One of the strings is outside of kernel address space
+ */
+__bpf_kfunc int bpf_strcasestr(const char *s1__ign, const char *s2__ign)
+{
+ return __bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX, true);
+}
+
+/**
+ * bpf_strnstr - Find the first substring in a length-limited string
+ * @s1__ign: The string to be searched
+ * @s2__ign: The string to search for
+ * @len: the maximum number of characters to search
+ *
+ * Return:
+ * * >=0 - Index of the first character of the first occurrence of @s2__ign
+ * within the first @len characters of @s1__ign
+ * * %-ENOENT - @s2__ign not found in the first @len characters of @s1__ign
+ * * %-EFAULT - Cannot read one of the strings
+ * * %-E2BIG - One of the strings is too large
+ * * %-ERANGE - One of the strings is outside of kernel address space
+ */
+__bpf_kfunc int bpf_strnstr(const char *s1__ign, const char *s2__ign,
+ size_t len)
+{
+ return __bpf_strnstr(s1__ign, s2__ign, len, false);
+}
+
+/**
+ * bpf_strncasestr - Find the first substring in a length-limited string,
+ * ignoring the case of the characters
+ * @s1__ign: The string to be searched
+ * @s2__ign: The string to search for
+ * @len: the maximum number of characters to search
+ *
+ * Return:
+ * * >=0 - Index of the first character of the first occurrence of @s2__ign
+ * within the first @len characters of @s1__ign
+ * * %-ENOENT - @s2__ign not found in the first @len characters of @s1__ign
+ * * %-EFAULT - Cannot read one of the strings
+ * * %-E2BIG - One of the strings is too large
+ * * %-ERANGE - One of the strings is outside of kernel address space
+ */
+__bpf_kfunc int bpf_strncasestr(const char *s1__ign, const char *s2__ign,
+ size_t len)
+{
+ return __bpf_strnstr(s1__ign, s2__ign, len, true);
+}
+
+#ifdef CONFIG_KEYS
+/**
+ * bpf_lookup_user_key - lookup a key by its serial
+ * @serial: key handle serial number
+ * @flags: lookup-specific flags
+ *
+ * Search a key with a given *serial* and the provided *flags*.
+ * If found, increment the reference count of the key by one, and
+ * return it in the bpf_key structure.
+ *
+ * The bpf_key structure must be passed to bpf_key_put() when done
+ * with it, so that the key reference count is decremented and the
+ * bpf_key structure is freed.
+ *
+ * Permission checks are deferred to the time the key is used by
+ * one of the available key-specific kfuncs.
+ *
+ * Set *flags* with KEY_LOOKUP_CREATE, to attempt creating a requested
+ * special keyring (e.g. session keyring), if it doesn't yet exist.
+ * Set *flags* with KEY_LOOKUP_PARTIAL, to lookup a key without waiting
+ * for the key construction, and to retrieve uninstantiated keys (keys
+ * without data attached to them).
+ *
+ * Return: a bpf_key pointer with a valid key pointer if the key is found, a
+ * NULL pointer otherwise.
+ */
+__bpf_kfunc struct bpf_key *bpf_lookup_user_key(s32 serial, u64 flags)
+{
+ key_ref_t key_ref;
+ struct bpf_key *bkey;
+
+ if (flags & ~KEY_LOOKUP_ALL)
+ return NULL;
+
+ /*
+ * Permission check is deferred until the key is used, as the
+ * intent of the caller is unknown here.
+ */
+ key_ref = lookup_user_key(serial, flags, KEY_DEFER_PERM_CHECK);
+ if (IS_ERR(key_ref))
+ return NULL;
+
+ bkey = kmalloc(sizeof(*bkey), GFP_KERNEL);
+ if (!bkey) {
+ key_put(key_ref_to_ptr(key_ref));
+ return NULL;
+ }
+
+ bkey->key = key_ref_to_ptr(key_ref);
+ bkey->has_ref = true;
+
+ return bkey;
+}
+
+/**
+ * bpf_lookup_system_key - lookup a key by a system-defined ID
+ * @id: key ID
+ *
+ * Obtain a bpf_key structure with a key pointer set to the passed key ID.
+ * The key pointer is marked as invalid, to prevent bpf_key_put() from
+ * attempting to decrement the key reference count on that pointer. The key
+ * pointer set in such way is currently understood only by
+ * verify_pkcs7_signature().
+ *
+ * Set *id* to one of the values defined in include/linux/verification.h:
+ * 0 for the primary keyring (immutable keyring of system keys);
+ * VERIFY_USE_SECONDARY_KEYRING for both the primary and secondary keyring
+ * (where keys can be added only if they are vouched for by existing keys
+ * in those keyrings); VERIFY_USE_PLATFORM_KEYRING for the platform
+ * keyring (primarily used by the integrity subsystem to verify a kexec'ed
+ * kerned image and, possibly, the initramfs signature).
+ *
+ * Return: a bpf_key pointer with an invalid key pointer set from the
+ * pre-determined ID on success, a NULL pointer otherwise
+ */
+__bpf_kfunc struct bpf_key *bpf_lookup_system_key(u64 id)
+{
+ struct bpf_key *bkey;
+
+ if (system_keyring_id_check(id) < 0)
+ return NULL;
+
+ bkey = kmalloc(sizeof(*bkey), GFP_ATOMIC);
+ if (!bkey)
+ return NULL;
+
+ bkey->key = (struct key *)(unsigned long)id;
+ bkey->has_ref = false;
+
+ return bkey;
+}
+
+/**
+ * bpf_key_put - decrement key reference count if key is valid and free bpf_key
+ * @bkey: bpf_key structure
+ *
+ * Decrement the reference count of the key inside *bkey*, if the pointer
+ * is valid, and free *bkey*.
+ */
+__bpf_kfunc void bpf_key_put(struct bpf_key *bkey)
+{
+ if (bkey->has_ref)
+ key_put(bkey->key);
+
+ kfree(bkey);
+}
+
+/**
+ * bpf_verify_pkcs7_signature - verify a PKCS#7 signature
+ * @data_p: data to verify
+ * @sig_p: signature of the data
+ * @trusted_keyring: keyring with keys trusted for signature verification
+ *
+ * Verify the PKCS#7 signature *sig_ptr* against the supplied *data_ptr*
+ * with keys in a keyring referenced by *trusted_keyring*.
+ *
+ * Return: 0 on success, a negative value on error.
+ */
+__bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p,
+ struct bpf_dynptr *sig_p,
+ struct bpf_key *trusted_keyring)
+{
+#ifdef CONFIG_SYSTEM_DATA_VERIFICATION
+ struct bpf_dynptr_kern *data_ptr = (struct bpf_dynptr_kern *)data_p;
+ struct bpf_dynptr_kern *sig_ptr = (struct bpf_dynptr_kern *)sig_p;
+ const void *data, *sig;
+ u32 data_len, sig_len;
+ int ret;
+
+ if (trusted_keyring->has_ref) {
+ /*
+ * Do the permission check deferred in bpf_lookup_user_key().
+ * See bpf_lookup_user_key() for more details.
+ *
+ * A call to key_task_permission() here would be redundant, as
+ * it is already done by keyring_search() called by
+ * find_asymmetric_key().
+ */
+ ret = key_validate(trusted_keyring->key);
+ if (ret < 0)
+ return ret;
+ }
+
+ data_len = __bpf_dynptr_size(data_ptr);
+ data = __bpf_dynptr_data(data_ptr, data_len);
+ sig_len = __bpf_dynptr_size(sig_ptr);
+ sig = __bpf_dynptr_data(sig_ptr, sig_len);
+
+ return verify_pkcs7_signature(data, data_len, sig, sig_len,
+ trusted_keyring->key,
+ VERIFYING_BPF_SIGNATURE, NULL,
+ NULL);
+#else
+ return -EOPNOTSUPP;
+#endif /* CONFIG_SYSTEM_DATA_VERIFICATION */
+}
+#endif /* CONFIG_KEYS */
+
+typedef int (*bpf_task_work_callback_t)(struct bpf_map *map, void *key, void *value);
+
+enum bpf_task_work_state {
+ /* bpf_task_work is ready to be used */
+ BPF_TW_STANDBY = 0,
+ /* irq work scheduling in progress */
+ BPF_TW_PENDING,
+ /* task work scheduling in progress */
+ BPF_TW_SCHEDULING,
+ /* task work is scheduled successfully */
+ BPF_TW_SCHEDULED,
+ /* callback is running */
+ BPF_TW_RUNNING,
+ /* associated BPF map value is deleted */
+ BPF_TW_FREED,
};
+
+struct bpf_task_work_ctx {
+ enum bpf_task_work_state state;
+ refcount_t refcnt;
+ struct callback_head work;
+ struct irq_work irq_work;
+ /* bpf_prog that schedules task work */
+ struct bpf_prog *prog;
+ /* task for which callback is scheduled */
+ struct task_struct *task;
+ /* the map and map value associated with this context */
+ struct bpf_map *map;
+ void *map_val;
+ enum task_work_notify_mode mode;
+ bpf_task_work_callback_t callback_fn;
+ struct rcu_head rcu;
+} __aligned(8);
+
+/* Actual type for struct bpf_task_work */
+struct bpf_task_work_kern {
+ struct bpf_task_work_ctx *ctx;
+};
+
+static void bpf_task_work_ctx_reset(struct bpf_task_work_ctx *ctx)
+{
+ if (ctx->prog) {
+ bpf_prog_put(ctx->prog);
+ ctx->prog = NULL;
+ }
+ if (ctx->task) {
+ bpf_task_release(ctx->task);
+ ctx->task = NULL;
+ }
+}
+
+static bool bpf_task_work_ctx_tryget(struct bpf_task_work_ctx *ctx)
+{
+ return refcount_inc_not_zero(&ctx->refcnt);
+}
+
+static void bpf_task_work_ctx_put(struct bpf_task_work_ctx *ctx)
+{
+ if (!refcount_dec_and_test(&ctx->refcnt))
+ return;
+
+ bpf_task_work_ctx_reset(ctx);
+
+ /* bpf_mem_free expects migration to be disabled */
+ migrate_disable();
+ bpf_mem_free(&bpf_global_ma, ctx);
+ migrate_enable();
+}
+
+static void bpf_task_work_cancel(struct bpf_task_work_ctx *ctx)
+{
+ /*
+ * Scheduled task_work callback holds ctx ref, so if we successfully
+ * cancelled, we put that ref on callback's behalf. If we couldn't
+ * cancel, callback will inevitably run or has already completed
+ * running, and it would have taken care of its ctx ref itself.
+ */
+ if (task_work_cancel(ctx->task, &ctx->work))
+ bpf_task_work_ctx_put(ctx);
+}
+
+static void bpf_task_work_callback(struct callback_head *cb)
+{
+ struct bpf_task_work_ctx *ctx = container_of(cb, struct bpf_task_work_ctx, work);
+ enum bpf_task_work_state state;
+ u32 idx;
+ void *key;
+
+ /* Read lock is needed to protect ctx and map key/value access */
+ guard(rcu_tasks_trace)();
+ /*
+ * This callback may start running before bpf_task_work_irq() switched to
+ * SCHEDULED state, so handle both transition variants SCHEDULING|SCHEDULED -> RUNNING.
+ */
+ state = cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_RUNNING);
+ if (state == BPF_TW_SCHEDULED)
+ state = cmpxchg(&ctx->state, BPF_TW_SCHEDULED, BPF_TW_RUNNING);
+ if (state == BPF_TW_FREED) {
+ bpf_task_work_ctx_put(ctx);
+ return;
+ }
+
+ key = (void *)map_key_from_value(ctx->map, ctx->map_val, &idx);
+
+ migrate_disable();
+ ctx->callback_fn(ctx->map, key, ctx->map_val);
+ migrate_enable();
+
+ bpf_task_work_ctx_reset(ctx);
+ (void)cmpxchg(&ctx->state, BPF_TW_RUNNING, BPF_TW_STANDBY);
+
+ bpf_task_work_ctx_put(ctx);
+}
+
+static void bpf_task_work_irq(struct irq_work *irq_work)
+{
+ struct bpf_task_work_ctx *ctx = container_of(irq_work, struct bpf_task_work_ctx, irq_work);
+ enum bpf_task_work_state state;
+ int err;
+
+ guard(rcu_tasks_trace)();
+
+ if (cmpxchg(&ctx->state, BPF_TW_PENDING, BPF_TW_SCHEDULING) != BPF_TW_PENDING) {
+ bpf_task_work_ctx_put(ctx);
+ return;
+ }
+
+ err = task_work_add(ctx->task, &ctx->work, ctx->mode);
+ if (err) {
+ bpf_task_work_ctx_reset(ctx);
+ /*
+ * try to switch back to STANDBY for another task_work reuse, but we might have
+ * gone to FREED already, which is fine as we already cleaned up after ourselves
+ */
+ (void)cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_STANDBY);
+ bpf_task_work_ctx_put(ctx);
+ return;
+ }
+
+ /*
+ * It's technically possible for just scheduled task_work callback to
+ * complete running by now, going SCHEDULING -> RUNNING and then
+ * dropping its ctx refcount. Instead of capturing extra ref just to
+ * protected below ctx->state access, we rely on RCU protection to
+ * perform below SCHEDULING -> SCHEDULED attempt.
+ */
+ state = cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_SCHEDULED);
+ if (state == BPF_TW_FREED)
+ bpf_task_work_cancel(ctx); /* clean up if we switched into FREED state */
+}
+
+static struct bpf_task_work_ctx *bpf_task_work_fetch_ctx(struct bpf_task_work *tw,
+ struct bpf_map *map)
+{
+ struct bpf_task_work_kern *twk = (void *)tw;
+ struct bpf_task_work_ctx *ctx, *old_ctx;
+
+ ctx = READ_ONCE(twk->ctx);
+ if (ctx)
+ return ctx;
+
+ ctx = bpf_mem_alloc(&bpf_global_ma, sizeof(struct bpf_task_work_ctx));
+ if (!ctx)
+ return ERR_PTR(-ENOMEM);
+
+ memset(ctx, 0, sizeof(*ctx));
+ refcount_set(&ctx->refcnt, 1); /* map's own ref */
+ ctx->state = BPF_TW_STANDBY;
+
+ old_ctx = cmpxchg(&twk->ctx, NULL, ctx);
+ if (old_ctx) {
+ /*
+ * tw->ctx is set by concurrent BPF program, release allocated
+ * memory and try to reuse already set context.
+ */
+ bpf_mem_free(&bpf_global_ma, ctx);
+ return old_ctx;
+ }
+
+ return ctx; /* Success */
+}
+
+static struct bpf_task_work_ctx *bpf_task_work_acquire_ctx(struct bpf_task_work *tw,
+ struct bpf_map *map)
+{
+ struct bpf_task_work_ctx *ctx;
+
+ ctx = bpf_task_work_fetch_ctx(tw, map);
+ if (IS_ERR(ctx))
+ return ctx;
+
+ /* try to get ref for task_work callback to hold */
+ if (!bpf_task_work_ctx_tryget(ctx))
+ return ERR_PTR(-EBUSY);
+
+ if (cmpxchg(&ctx->state, BPF_TW_STANDBY, BPF_TW_PENDING) != BPF_TW_STANDBY) {
+ /* lost acquiring race or map_release_uref() stole it from us, put ref and bail */
+ bpf_task_work_ctx_put(ctx);
+ return ERR_PTR(-EBUSY);
+ }
+
+ /*
+ * If no process or bpffs is holding a reference to the map, no new callbacks should be
+ * scheduled. This does not address any race or correctness issue, but rather is a policy
+ * choice: dropping user references should stop everything.
+ */
+ if (!atomic64_read(&map->usercnt)) {
+ /* drop ref we just got for task_work callback itself */
+ bpf_task_work_ctx_put(ctx);
+ /* transfer map's ref into cancel_and_free() */
+ bpf_task_work_cancel_and_free(tw);
+ return ERR_PTR(-EBUSY);
+ }
+
+ return ctx;
+}
+
+static int bpf_task_work_schedule(struct task_struct *task, struct bpf_task_work *tw,
+ struct bpf_map *map, bpf_task_work_callback_t callback_fn,
+ struct bpf_prog_aux *aux, enum task_work_notify_mode mode)
+{
+ struct bpf_prog *prog;
+ struct bpf_task_work_ctx *ctx;
+ int err;
+
+ BTF_TYPE_EMIT(struct bpf_task_work);
+
+ prog = bpf_prog_inc_not_zero(aux->prog);
+ if (IS_ERR(prog))
+ return -EBADF;
+ task = bpf_task_acquire(task);
+ if (!task) {
+ err = -EBADF;
+ goto release_prog;
+ }
+
+ ctx = bpf_task_work_acquire_ctx(tw, map);
+ if (IS_ERR(ctx)) {
+ err = PTR_ERR(ctx);
+ goto release_all;
+ }
+
+ ctx->task = task;
+ ctx->callback_fn = callback_fn;
+ ctx->prog = prog;
+ ctx->mode = mode;
+ ctx->map = map;
+ ctx->map_val = (void *)tw - map->record->task_work_off;
+ init_task_work(&ctx->work, bpf_task_work_callback);
+ init_irq_work(&ctx->irq_work, bpf_task_work_irq);
+
+ irq_work_queue(&ctx->irq_work);
+ return 0;
+
+release_all:
+ bpf_task_release(task);
+release_prog:
+ bpf_prog_put(prog);
+ return err;
+}
+
+/**
+ * bpf_task_work_schedule_signal_impl - Schedule BPF callback using task_work_add with TWA_SIGNAL
+ * mode
+ * @task: Task struct for which callback should be scheduled
+ * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping
+ * @map__map: bpf_map that embeds struct bpf_task_work in the values
+ * @callback: pointer to BPF subprogram to call
+ * @aux__prog: user should pass NULL
+ *
+ * Return: 0 if task work has been scheduled successfully, negative error code otherwise
+ */
+__bpf_kfunc int bpf_task_work_schedule_signal_impl(struct task_struct *task,
+ struct bpf_task_work *tw, void *map__map,
+ bpf_task_work_callback_t callback,
+ void *aux__prog)
+{
+ return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_SIGNAL);
+}
+
+/**
+ * bpf_task_work_schedule_resume_impl - Schedule BPF callback using task_work_add with TWA_RESUME
+ * mode
+ * @task: Task struct for which callback should be scheduled
+ * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping
+ * @map__map: bpf_map that embeds struct bpf_task_work in the values
+ * @callback: pointer to BPF subprogram to call
+ * @aux__prog: user should pass NULL
+ *
+ * Return: 0 if task work has been scheduled successfully, negative error code otherwise
+ */
+__bpf_kfunc int bpf_task_work_schedule_resume_impl(struct task_struct *task,
+ struct bpf_task_work *tw, void *map__map,
+ bpf_task_work_callback_t callback,
+ void *aux__prog)
+{
+ return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_RESUME);
+}
+
+static int make_file_dynptr(struct file *file, u32 flags, bool may_sleep,
+ struct bpf_dynptr_kern *ptr)
+{
+ struct bpf_dynptr_file_impl *state;
+
+ /* flags is currently unsupported */
+ if (flags) {
+ bpf_dynptr_set_null(ptr);
+ return -EINVAL;
+ }
+
+ state = bpf_mem_alloc(&bpf_global_ma, sizeof(struct bpf_dynptr_file_impl));
+ if (!state) {
+ bpf_dynptr_set_null(ptr);
+ return -ENOMEM;
+ }
+ state->offset = 0;
+ state->size = U64_MAX; /* Don't restrict size, as file may change anyways */
+ freader_init_from_file(&state->freader, NULL, 0, file, may_sleep);
+ bpf_dynptr_init(ptr, state, BPF_DYNPTR_TYPE_FILE, 0, 0);
+ bpf_dynptr_set_rdonly(ptr);
+ return 0;
+}
+
+__bpf_kfunc int bpf_dynptr_from_file(struct file *file, u32 flags, struct bpf_dynptr *ptr__uninit)
+{
+ return make_file_dynptr(file, flags, false, (struct bpf_dynptr_kern *)ptr__uninit);
+}
+
+int bpf_dynptr_from_file_sleepable(struct file *file, u32 flags, struct bpf_dynptr *ptr__uninit)
+{
+ return make_file_dynptr(file, flags, true, (struct bpf_dynptr_kern *)ptr__uninit);
+}
+
+__bpf_kfunc int bpf_dynptr_file_discard(struct bpf_dynptr *dynptr)
+{
+ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)dynptr;
+ struct bpf_dynptr_file_impl *df = ptr->data;
+
+ if (!df)
+ return 0;
+
+ freader_cleanup(&df->freader);
+ bpf_mem_free(&bpf_global_ma, df);
+ bpf_dynptr_set_null(ptr);
+ return 0;
+}
+
+__bpf_kfunc_end_defs();
+
+static void bpf_task_work_cancel_scheduled(struct irq_work *irq_work)
+{
+ struct bpf_task_work_ctx *ctx = container_of(irq_work, struct bpf_task_work_ctx, irq_work);
+
+ bpf_task_work_cancel(ctx); /* this might put task_work callback's ref */
+ bpf_task_work_ctx_put(ctx); /* and here we put map's own ref that was transferred to us */
+}
+
+void bpf_task_work_cancel_and_free(void *val)
+{
+ struct bpf_task_work_kern *twk = val;
+ struct bpf_task_work_ctx *ctx;
+ enum bpf_task_work_state state;
+
+ ctx = xchg(&twk->ctx, NULL);
+ if (!ctx)
+ return;
+
+ state = xchg(&ctx->state, BPF_TW_FREED);
+ if (state == BPF_TW_SCHEDULED) {
+ /* run in irq_work to avoid locks in NMI */
+ init_irq_work(&ctx->irq_work, bpf_task_work_cancel_scheduled);
+ irq_work_queue(&ctx->irq_work);
+ return;
+ }
+
+ bpf_task_work_ctx_put(ctx); /* put bpf map's ref */
+}
+
+BTF_KFUNCS_START(generic_btf_ids)
+#ifdef CONFIG_CRASH_DUMP
+BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE)
+#endif
+BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_percpu_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_obj_drop_impl, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_percpu_obj_drop_impl, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE | KF_RET_NULL | KF_RCU)
+BTF_ID_FLAGS(func, bpf_list_push_front_impl)
+BTF_ID_FLAGS(func, bpf_list_push_back_impl)
+BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_list_front, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_list_back, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_rbtree_remove, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_rbtree_add_impl)
+BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_rbtree_root, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_rbtree_left, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_rbtree_right, KF_RET_NULL)
+
+#ifdef CONFIG_CGROUPS
+BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_cgroup_release, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_cgroup_from_id, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_task_under_cgroup, KF_RCU)
+BTF_ID_FLAGS(func, bpf_task_get_cgroup1, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
+#endif
+BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_task_from_vpid, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_throw)
+#ifdef CONFIG_BPF_EVENTS
+BTF_ID_FLAGS(func, bpf_send_signal_task, KF_TRUSTED_ARGS)
+#endif
+#ifdef CONFIG_KEYS
+BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_lookup_system_key, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_key_put, KF_RELEASE)
+#ifdef CONFIG_SYSTEM_DATA_VERIFICATION
+BTF_ID_FLAGS(func, bpf_verify_pkcs7_signature, KF_SLEEPABLE)
+#endif
+#endif
+BTF_KFUNCS_END(generic_btf_ids)
+
+static const struct btf_kfunc_id_set generic_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &generic_btf_ids,
+};
+
+
+BTF_ID_LIST(generic_dtor_ids)
+BTF_ID(struct, task_struct)
+BTF_ID(func, bpf_task_release_dtor)
+#ifdef CONFIG_CGROUPS
+BTF_ID(struct, cgroup)
+BTF_ID(func, bpf_cgroup_release_dtor)
+#endif
+
+BTF_KFUNCS_START(common_btf_ids)
+BTF_ID_FLAGS(func, bpf_cast_to_kern_ctx, KF_FASTCALL)
+BTF_ID_FLAGS(func, bpf_rdonly_cast, KF_FASTCALL)
+BTF_ID_FLAGS(func, bpf_rcu_read_lock)
+BTF_ID_FLAGS(func, bpf_rcu_read_unlock)
+BTF_ID_FLAGS(func, bpf_dynptr_slice, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_num_new, KF_ITER_NEW)
+BTF_ID_FLAGS(func, bpf_iter_num_next, KF_ITER_NEXT | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY)
+BTF_ID_FLAGS(func, bpf_iter_task_vma_new, KF_ITER_NEW | KF_RCU)
+BTF_ID_FLAGS(func, bpf_iter_task_vma_next, KF_ITER_NEXT | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_task_vma_destroy, KF_ITER_DESTROY)
+#ifdef CONFIG_CGROUPS
+BTF_ID_FLAGS(func, bpf_iter_css_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_iter_css_task_next, KF_ITER_NEXT | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_css_task_destroy, KF_ITER_DESTROY)
+BTF_ID_FLAGS(func, bpf_iter_css_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED)
+BTF_ID_FLAGS(func, bpf_iter_css_next, KF_ITER_NEXT | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_css_destroy, KF_ITER_DESTROY)
+#endif
+BTF_ID_FLAGS(func, bpf_iter_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED)
+BTF_ID_FLAGS(func, bpf_iter_task_next, KF_ITER_NEXT | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_task_destroy, KF_ITER_DESTROY)
+BTF_ID_FLAGS(func, bpf_dynptr_adjust)
+BTF_ID_FLAGS(func, bpf_dynptr_is_null)
+BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
+BTF_ID_FLAGS(func, bpf_dynptr_size)
+BTF_ID_FLAGS(func, bpf_dynptr_clone)
+BTF_ID_FLAGS(func, bpf_dynptr_copy)
+BTF_ID_FLAGS(func, bpf_dynptr_memset)
+#ifdef CONFIG_NET
+BTF_ID_FLAGS(func, bpf_modify_return_test_tp)
+#endif
+BTF_ID_FLAGS(func, bpf_wq_init)
+BTF_ID_FLAGS(func, bpf_wq_set_callback_impl)
+BTF_ID_FLAGS(func, bpf_wq_start)
+BTF_ID_FLAGS(func, bpf_preempt_disable)
+BTF_ID_FLAGS(func, bpf_preempt_enable)
+BTF_ID_FLAGS(func, bpf_iter_bits_new, KF_ITER_NEW)
+BTF_ID_FLAGS(func, bpf_iter_bits_next, KF_ITER_NEXT | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_bits_destroy, KF_ITER_DESTROY)
+BTF_ID_FLAGS(func, bpf_copy_from_user_str, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_copy_from_user_task_str, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_get_kmem_cache)
+BTF_ID_FLAGS(func, bpf_iter_kmem_cache_new, KF_ITER_NEW | KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_iter_kmem_cache_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_iter_kmem_cache_destroy, KF_ITER_DESTROY | KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_local_irq_save)
+BTF_ID_FLAGS(func, bpf_local_irq_restore)
+#ifdef CONFIG_BPF_EVENTS
+BTF_ID_FLAGS(func, bpf_probe_read_user_dynptr)
+BTF_ID_FLAGS(func, bpf_probe_read_kernel_dynptr)
+BTF_ID_FLAGS(func, bpf_probe_read_user_str_dynptr)
+BTF_ID_FLAGS(func, bpf_probe_read_kernel_str_dynptr)
+BTF_ID_FLAGS(func, bpf_copy_from_user_dynptr, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_copy_from_user_str_dynptr, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_copy_from_user_task_dynptr, KF_SLEEPABLE | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_copy_from_user_task_str_dynptr, KF_SLEEPABLE | KF_TRUSTED_ARGS)
+#endif
+#ifdef CONFIG_DMA_SHARED_BUFFER
+BTF_ID_FLAGS(func, bpf_iter_dmabuf_new, KF_ITER_NEW | KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_iter_dmabuf_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_iter_dmabuf_destroy, KF_ITER_DESTROY | KF_SLEEPABLE)
+#endif
+BTF_ID_FLAGS(func, __bpf_trap)
+BTF_ID_FLAGS(func, bpf_strcmp);
+BTF_ID_FLAGS(func, bpf_strcasecmp);
+BTF_ID_FLAGS(func, bpf_strchr);
+BTF_ID_FLAGS(func, bpf_strchrnul);
+BTF_ID_FLAGS(func, bpf_strnchr);
+BTF_ID_FLAGS(func, bpf_strrchr);
+BTF_ID_FLAGS(func, bpf_strlen);
+BTF_ID_FLAGS(func, bpf_strnlen);
+BTF_ID_FLAGS(func, bpf_strspn);
+BTF_ID_FLAGS(func, bpf_strcspn);
+BTF_ID_FLAGS(func, bpf_strstr);
+BTF_ID_FLAGS(func, bpf_strcasestr);
+BTF_ID_FLAGS(func, bpf_strnstr);
+BTF_ID_FLAGS(func, bpf_strncasestr);
+#if defined(CONFIG_BPF_LSM) && defined(CONFIG_CGROUPS)
+BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU)
+#endif
+BTF_ID_FLAGS(func, bpf_stream_vprintk_impl, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_task_work_schedule_signal_impl, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_task_work_schedule_resume_impl, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_dynptr_from_file, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_dynptr_file_discard)
+BTF_KFUNCS_END(common_btf_ids)
+
+static const struct btf_kfunc_id_set common_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &common_btf_ids,
+};
+
+static int __init kfunc_init(void)
+{
+ int ret;
+ const struct btf_id_dtor_kfunc generic_dtors[] = {
+ {
+ .btf_id = generic_dtor_ids[0],
+ .kfunc_btf_id = generic_dtor_ids[1]
+ },
+#ifdef CONFIG_CGROUPS
+ {
+ .btf_id = generic_dtor_ids[2],
+ .kfunc_btf_id = generic_dtor_ids[3]
+ },
+#endif
+ };
+
+ ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &generic_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &generic_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &generic_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &generic_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &generic_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SKB, &generic_kfunc_set);
+ ret = ret ?: register_btf_id_dtor_kfuncs(generic_dtors,
+ ARRAY_SIZE(generic_dtors),
+ THIS_MODULE);
+ return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &common_kfunc_set);
+}
+
+late_initcall(kfunc_init);
+
+/* Get a pointer to dynptr data up to len bytes for read only access. If
+ * the dynptr doesn't have continuous data up to len bytes, return NULL.
+ */
+const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u64 len)
+{
+ const struct bpf_dynptr *p = (struct bpf_dynptr *)ptr;
+
+ return bpf_dynptr_slice(p, 0, NULL, len);
+}
+
+/* Get a pointer to dynptr data up to len bytes for read write access. If
+ * the dynptr doesn't have continuous data up to len bytes, or the dynptr
+ * is read only, return NULL.
+ */
+void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u64 len)
+{
+ if (__bpf_dynptr_is_rdonly(ptr))
+ return NULL;
+ return (void *)__bpf_dynptr_data(ptr, len);
+}
+
+void bpf_map_free_internal_structs(struct bpf_map *map, void *val)
+{
+ if (btf_record_has_field(map->record, BPF_TIMER))
+ bpf_obj_free_timer(map->record, val);
+ if (btf_record_has_field(map->record, BPF_WORKQUEUE))
+ bpf_obj_free_workqueue(map->record, val);
+ if (btf_record_has_field(map->record, BPF_TASK_WORK))
+ bpf_obj_free_task_work(map->record, val);
+}
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
new file mode 100644
index 000000000000..9f866a010dad
--- /dev/null
+++ b/kernel/bpf/inode.c
@@ -0,0 +1,1105 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Minimal file system backend for holding eBPF maps and programs,
+ * used by bpf(2) object pinning.
+ *
+ * Authors:
+ *
+ * Daniel Borkmann <daniel@iogearbox.net>
+ */
+
+#include <linux/init.h>
+#include <linux/magic.h>
+#include <linux/major.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/fs.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
+#include <linux/kdev_t.h>
+#include <linux/filter.h>
+#include <linux/bpf.h>
+#include <linux/bpf_trace.h>
+#include <linux/kstrtox.h>
+#include "preload/bpf_preload.h"
+
+enum bpf_type {
+ BPF_TYPE_UNSPEC = 0,
+ BPF_TYPE_PROG,
+ BPF_TYPE_MAP,
+ BPF_TYPE_LINK,
+};
+
+static void *bpf_any_get(void *raw, enum bpf_type type)
+{
+ switch (type) {
+ case BPF_TYPE_PROG:
+ bpf_prog_inc(raw);
+ break;
+ case BPF_TYPE_MAP:
+ bpf_map_inc_with_uref(raw);
+ break;
+ case BPF_TYPE_LINK:
+ bpf_link_inc(raw);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+
+ return raw;
+}
+
+static void bpf_any_put(void *raw, enum bpf_type type)
+{
+ switch (type) {
+ case BPF_TYPE_PROG:
+ bpf_prog_put(raw);
+ break;
+ case BPF_TYPE_MAP:
+ bpf_map_put_with_uref(raw);
+ break;
+ case BPF_TYPE_LINK:
+ bpf_link_put(raw);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+}
+
+static void *bpf_fd_probe_obj(u32 ufd, enum bpf_type *type)
+{
+ void *raw;
+
+ raw = bpf_map_get_with_uref(ufd);
+ if (!IS_ERR(raw)) {
+ *type = BPF_TYPE_MAP;
+ return raw;
+ }
+
+ raw = bpf_prog_get(ufd);
+ if (!IS_ERR(raw)) {
+ *type = BPF_TYPE_PROG;
+ return raw;
+ }
+
+ raw = bpf_link_get_from_fd(ufd);
+ if (!IS_ERR(raw)) {
+ *type = BPF_TYPE_LINK;
+ return raw;
+ }
+
+ return ERR_PTR(-EINVAL);
+}
+
+static const struct inode_operations bpf_dir_iops;
+
+static const struct inode_operations bpf_prog_iops = { };
+static const struct inode_operations bpf_map_iops = { };
+static const struct inode_operations bpf_link_iops = { };
+
+struct inode *bpf_get_inode(struct super_block *sb,
+ const struct inode *dir,
+ umode_t mode)
+{
+ struct inode *inode;
+
+ switch (mode & S_IFMT) {
+ case S_IFDIR:
+ case S_IFREG:
+ case S_IFLNK:
+ break;
+ default:
+ return ERR_PTR(-EINVAL);
+ }
+
+ inode = new_inode(sb);
+ if (!inode)
+ return ERR_PTR(-ENOSPC);
+
+ inode->i_ino = get_next_ino();
+ simple_inode_init_ts(inode);
+
+ inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
+
+ return inode;
+}
+
+static int bpf_inode_type(const struct inode *inode, enum bpf_type *type)
+{
+ *type = BPF_TYPE_UNSPEC;
+ if (inode->i_op == &bpf_prog_iops)
+ *type = BPF_TYPE_PROG;
+ else if (inode->i_op == &bpf_map_iops)
+ *type = BPF_TYPE_MAP;
+ else if (inode->i_op == &bpf_link_iops)
+ *type = BPF_TYPE_LINK;
+ else
+ return -EACCES;
+
+ return 0;
+}
+
+static void bpf_dentry_finalize(struct dentry *dentry, struct inode *inode,
+ struct inode *dir)
+{
+ d_make_persistent(dentry, inode);
+
+ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
+}
+
+static struct dentry *bpf_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
+{
+ struct inode *inode;
+
+ inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFDIR);
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
+
+ inode->i_op = &bpf_dir_iops;
+ inode->i_fop = &simple_dir_operations;
+
+ inc_nlink(inode);
+ inc_nlink(dir);
+
+ bpf_dentry_finalize(dentry, inode, dir);
+ return NULL;
+}
+
+struct map_iter {
+ void *key;
+ bool done;
+};
+
+static struct map_iter *map_iter(struct seq_file *m)
+{
+ return m->private;
+}
+
+static struct bpf_map *seq_file_to_map(struct seq_file *m)
+{
+ return file_inode(m->file)->i_private;
+}
+
+static void map_iter_free(struct map_iter *iter)
+{
+ if (iter) {
+ kfree(iter->key);
+ kfree(iter);
+ }
+}
+
+static struct map_iter *map_iter_alloc(struct bpf_map *map)
+{
+ struct map_iter *iter;
+
+ iter = kzalloc(sizeof(*iter), GFP_KERNEL | __GFP_NOWARN);
+ if (!iter)
+ goto error;
+
+ iter->key = kzalloc(map->key_size, GFP_KERNEL | __GFP_NOWARN);
+ if (!iter->key)
+ goto error;
+
+ return iter;
+
+error:
+ map_iter_free(iter);
+ return NULL;
+}
+
+static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct bpf_map *map = seq_file_to_map(m);
+ void *key = map_iter(m)->key;
+ void *prev_key;
+
+ (*pos)++;
+ if (map_iter(m)->done)
+ return NULL;
+
+ if (unlikely(v == SEQ_START_TOKEN))
+ prev_key = NULL;
+ else
+ prev_key = key;
+
+ rcu_read_lock();
+ if (map->ops->map_get_next_key(map, prev_key, key)) {
+ map_iter(m)->done = true;
+ key = NULL;
+ }
+ rcu_read_unlock();
+ return key;
+}
+
+static void *map_seq_start(struct seq_file *m, loff_t *pos)
+{
+ if (map_iter(m)->done)
+ return NULL;
+
+ return *pos ? map_iter(m)->key : SEQ_START_TOKEN;
+}
+
+static void map_seq_stop(struct seq_file *m, void *v)
+{
+}
+
+static int map_seq_show(struct seq_file *m, void *v)
+{
+ struct bpf_map *map = seq_file_to_map(m);
+ void *key = map_iter(m)->key;
+
+ if (unlikely(v == SEQ_START_TOKEN)) {
+ seq_puts(m, "# WARNING!! The output is for debug purpose only\n");
+ seq_puts(m, "# WARNING!! The output format will change\n");
+ } else {
+ map->ops->map_seq_show_elem(map, key, m);
+ }
+
+ return 0;
+}
+
+static const struct seq_operations bpffs_map_seq_ops = {
+ .start = map_seq_start,
+ .next = map_seq_next,
+ .show = map_seq_show,
+ .stop = map_seq_stop,
+};
+
+static int bpffs_map_open(struct inode *inode, struct file *file)
+{
+ struct bpf_map *map = inode->i_private;
+ struct map_iter *iter;
+ struct seq_file *m;
+ int err;
+
+ iter = map_iter_alloc(map);
+ if (!iter)
+ return -ENOMEM;
+
+ err = seq_open(file, &bpffs_map_seq_ops);
+ if (err) {
+ map_iter_free(iter);
+ return err;
+ }
+
+ m = file->private_data;
+ m->private = iter;
+
+ return 0;
+}
+
+static int bpffs_map_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *m = file->private_data;
+
+ map_iter_free(map_iter(m));
+
+ return seq_release(inode, file);
+}
+
+/* bpffs_map_fops should only implement the basic
+ * read operation for a BPF map. The purpose is to
+ * provide a simple user intuitive way to do
+ * "cat bpffs/pathto/a-pinned-map".
+ *
+ * Other operations (e.g. write, lookup...) should be realized by
+ * the userspace tools (e.g. bpftool) through the
+ * BPF_OBJ_GET_INFO_BY_FD and the map's lookup/update
+ * interface.
+ */
+static const struct file_operations bpffs_map_fops = {
+ .open = bpffs_map_open,
+ .read = seq_read,
+ .release = bpffs_map_release,
+};
+
+static int bpffs_obj_open(struct inode *inode, struct file *file)
+{
+ return -EIO;
+}
+
+static const struct file_operations bpffs_obj_fops = {
+ .open = bpffs_obj_open,
+};
+
+static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw,
+ const struct inode_operations *iops,
+ const struct file_operations *fops)
+{
+ struct inode *dir = dentry->d_parent->d_inode;
+ struct inode *inode = bpf_get_inode(dir->i_sb, dir, mode);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ inode->i_op = iops;
+ inode->i_fop = fops;
+ inode->i_private = raw;
+
+ bpf_dentry_finalize(dentry, inode, dir);
+ return 0;
+}
+
+static int bpf_mkprog(struct dentry *dentry, umode_t mode, void *arg)
+{
+ return bpf_mkobj_ops(dentry, mode, arg, &bpf_prog_iops,
+ &bpffs_obj_fops);
+}
+
+static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg)
+{
+ struct bpf_map *map = arg;
+
+ return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops,
+ bpf_map_support_seq_show(map) ?
+ &bpffs_map_fops : &bpffs_obj_fops);
+}
+
+static int bpf_mklink(struct dentry *dentry, umode_t mode, void *arg)
+{
+ struct bpf_link *link = arg;
+
+ return bpf_mkobj_ops(dentry, mode, arg, &bpf_link_iops,
+ bpf_link_is_iter(link) ?
+ &bpf_iter_fops : &bpffs_obj_fops);
+}
+
+static struct dentry *
+bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags)
+{
+ /* Dots in names (e.g. "/sys/fs/bpf/foo.bar") are reserved for future
+ * extensions. That allows popoulate_bpffs() create special files.
+ */
+ if ((dir->i_mode & S_IALLUGO) &&
+ strchr(dentry->d_name.name, '.'))
+ return ERR_PTR(-EPERM);
+
+ return simple_lookup(dir, dentry, flags);
+}
+
+static int bpf_symlink(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, const char *target)
+{
+ char *link = kstrdup(target, GFP_USER | __GFP_NOWARN);
+ struct inode *inode;
+
+ if (!link)
+ return -ENOMEM;
+
+ inode = bpf_get_inode(dir->i_sb, dir, S_IRWXUGO | S_IFLNK);
+ if (IS_ERR(inode)) {
+ kfree(link);
+ return PTR_ERR(inode);
+ }
+
+ inode->i_op = &simple_symlink_inode_operations;
+ inode->i_link = link;
+
+ bpf_dentry_finalize(dentry, inode, dir);
+ return 0;
+}
+
+static const struct inode_operations bpf_dir_iops = {
+ .lookup = bpf_lookup,
+ .mkdir = bpf_mkdir,
+ .symlink = bpf_symlink,
+ .rmdir = simple_rmdir,
+ .rename = simple_rename,
+ .link = simple_link,
+ .unlink = simple_unlink,
+};
+
+/* pin iterator link into bpffs */
+static int bpf_iter_link_pin_kernel(struct dentry *parent,
+ const char *name, struct bpf_link *link)
+{
+ umode_t mode = S_IFREG | S_IRUSR;
+ struct dentry *dentry;
+ int ret;
+
+ dentry = simple_start_creating(parent, name);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+ ret = bpf_mkobj_ops(dentry, mode, link, &bpf_link_iops,
+ &bpf_iter_fops);
+ simple_done_creating(dentry);
+ return ret;
+}
+
+static int bpf_obj_do_pin(int path_fd, const char __user *pathname, void *raw,
+ enum bpf_type type)
+{
+ struct dentry *dentry;
+ struct inode *dir;
+ struct path path;
+ umode_t mode;
+ int ret;
+
+ dentry = start_creating_user_path(path_fd, pathname, &path, 0);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+
+ dir = d_inode(path.dentry);
+ if (dir->i_op != &bpf_dir_iops) {
+ ret = -EPERM;
+ goto out;
+ }
+
+ mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask());
+ ret = security_path_mknod(&path, dentry, mode, 0);
+ if (ret)
+ goto out;
+
+ switch (type) {
+ case BPF_TYPE_PROG:
+ ret = vfs_mkobj(dentry, mode, bpf_mkprog, raw);
+ break;
+ case BPF_TYPE_MAP:
+ ret = vfs_mkobj(dentry, mode, bpf_mkmap, raw);
+ break;
+ case BPF_TYPE_LINK:
+ ret = vfs_mkobj(dentry, mode, bpf_mklink, raw);
+ break;
+ default:
+ ret = -EPERM;
+ }
+out:
+ end_creating_path(&path, dentry);
+ return ret;
+}
+
+int bpf_obj_pin_user(u32 ufd, int path_fd, const char __user *pathname)
+{
+ enum bpf_type type;
+ void *raw;
+ int ret;
+
+ raw = bpf_fd_probe_obj(ufd, &type);
+ if (IS_ERR(raw))
+ return PTR_ERR(raw);
+
+ ret = bpf_obj_do_pin(path_fd, pathname, raw, type);
+ if (ret != 0)
+ bpf_any_put(raw, type);
+
+ return ret;
+}
+
+static void *bpf_obj_do_get(int path_fd, const char __user *pathname,
+ enum bpf_type *type, int flags)
+{
+ struct inode *inode;
+ struct path path;
+ void *raw;
+ int ret;
+
+ ret = user_path_at(path_fd, pathname, LOOKUP_FOLLOW, &path);
+ if (ret)
+ return ERR_PTR(ret);
+
+ inode = d_backing_inode(path.dentry);
+ ret = path_permission(&path, ACC_MODE(flags));
+ if (ret)
+ goto out;
+
+ ret = bpf_inode_type(inode, type);
+ if (ret)
+ goto out;
+
+ raw = bpf_any_get(inode->i_private, *type);
+ if (!IS_ERR(raw))
+ touch_atime(&path);
+
+ path_put(&path);
+ return raw;
+out:
+ path_put(&path);
+ return ERR_PTR(ret);
+}
+
+int bpf_obj_get_user(int path_fd, const char __user *pathname, int flags)
+{
+ enum bpf_type type = BPF_TYPE_UNSPEC;
+ int f_flags;
+ void *raw;
+ int ret;
+
+ f_flags = bpf_get_file_flag(flags);
+ if (f_flags < 0)
+ return f_flags;
+
+ raw = bpf_obj_do_get(path_fd, pathname, &type, f_flags);
+ if (IS_ERR(raw))
+ return PTR_ERR(raw);
+
+ if (type == BPF_TYPE_PROG)
+ ret = bpf_prog_new_fd(raw);
+ else if (type == BPF_TYPE_MAP)
+ ret = bpf_map_new_fd(raw, f_flags);
+ else if (type == BPF_TYPE_LINK)
+ ret = (f_flags != O_RDWR) ? -EINVAL : bpf_link_new_fd(raw);
+ else
+ return -ENOENT;
+
+ if (ret < 0)
+ bpf_any_put(raw, type);
+ return ret;
+}
+
+static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type)
+{
+ struct bpf_prog *prog;
+ int ret = inode_permission(&nop_mnt_idmap, inode, MAY_READ);
+ if (ret)
+ return ERR_PTR(ret);
+
+ if (inode->i_op == &bpf_map_iops)
+ return ERR_PTR(-EINVAL);
+ if (inode->i_op == &bpf_link_iops)
+ return ERR_PTR(-EINVAL);
+ if (inode->i_op != &bpf_prog_iops)
+ return ERR_PTR(-EACCES);
+
+ prog = inode->i_private;
+
+ ret = security_bpf_prog(prog);
+ if (ret < 0)
+ return ERR_PTR(ret);
+
+ if (!bpf_prog_get_ok(prog, &type, false))
+ return ERR_PTR(-EINVAL);
+
+ bpf_prog_inc(prog);
+ return prog;
+}
+
+struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type)
+{
+ struct bpf_prog *prog;
+ struct path path;
+ int ret = kern_path(name, LOOKUP_FOLLOW, &path);
+ if (ret)
+ return ERR_PTR(ret);
+ prog = __get_prog_inode(d_backing_inode(path.dentry), type);
+ if (!IS_ERR(prog))
+ touch_atime(&path);
+ path_put(&path);
+ return prog;
+}
+EXPORT_SYMBOL(bpf_prog_get_type_path);
+
+struct bpffs_btf_enums {
+ const struct btf *btf;
+ const struct btf_type *cmd_t;
+ const struct btf_type *map_t;
+ const struct btf_type *prog_t;
+ const struct btf_type *attach_t;
+};
+
+static int find_bpffs_btf_enums(struct bpffs_btf_enums *info)
+{
+ const struct btf *btf;
+ const struct btf_type *t;
+ const char *name;
+ int i, n;
+
+ memset(info, 0, sizeof(*info));
+
+ btf = bpf_get_btf_vmlinux();
+ if (IS_ERR(btf))
+ return PTR_ERR(btf);
+ if (!btf)
+ return -ENOENT;
+
+ info->btf = btf;
+
+ for (i = 1, n = btf_nr_types(btf); i < n; i++) {
+ t = btf_type_by_id(btf, i);
+ if (!btf_type_is_enum(t))
+ continue;
+
+ name = btf_name_by_offset(btf, t->name_off);
+ if (!name)
+ continue;
+
+ if (strcmp(name, "bpf_cmd") == 0)
+ info->cmd_t = t;
+ else if (strcmp(name, "bpf_map_type") == 0)
+ info->map_t = t;
+ else if (strcmp(name, "bpf_prog_type") == 0)
+ info->prog_t = t;
+ else if (strcmp(name, "bpf_attach_type") == 0)
+ info->attach_t = t;
+ else
+ continue;
+
+ if (info->cmd_t && info->map_t && info->prog_t && info->attach_t)
+ return 0;
+ }
+
+ return -ESRCH;
+}
+
+static bool find_btf_enum_const(const struct btf *btf, const struct btf_type *enum_t,
+ const char *prefix, const char *str, int *value)
+{
+ const struct btf_enum *e;
+ const char *name;
+ int i, n, pfx_len = strlen(prefix);
+
+ *value = 0;
+
+ if (!btf || !enum_t)
+ return false;
+
+ for (i = 0, n = btf_vlen(enum_t); i < n; i++) {
+ e = &btf_enum(enum_t)[i];
+
+ name = btf_name_by_offset(btf, e->name_off);
+ if (!name || strncasecmp(name, prefix, pfx_len) != 0)
+ continue;
+
+ /* match symbolic name case insensitive and ignoring prefix */
+ if (strcasecmp(name + pfx_len, str) == 0) {
+ *value = e->val;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static void seq_print_delegate_opts(struct seq_file *m,
+ const char *opt_name,
+ const struct btf *btf,
+ const struct btf_type *enum_t,
+ const char *prefix,
+ u64 delegate_msk, u64 any_msk)
+{
+ const struct btf_enum *e;
+ bool first = true;
+ const char *name;
+ u64 msk;
+ int i, n, pfx_len = strlen(prefix);
+
+ delegate_msk &= any_msk; /* clear unknown bits */
+
+ if (delegate_msk == 0)
+ return;
+
+ seq_printf(m, ",%s", opt_name);
+ if (delegate_msk == any_msk) {
+ seq_printf(m, "=any");
+ return;
+ }
+
+ if (btf && enum_t) {
+ for (i = 0, n = btf_vlen(enum_t); i < n; i++) {
+ e = &btf_enum(enum_t)[i];
+ name = btf_name_by_offset(btf, e->name_off);
+ if (!name || strncasecmp(name, prefix, pfx_len) != 0)
+ continue;
+ msk = 1ULL << e->val;
+ if (delegate_msk & msk) {
+ /* emit lower-case name without prefix */
+ seq_putc(m, first ? '=' : ':');
+ name += pfx_len;
+ while (*name) {
+ seq_putc(m, tolower(*name));
+ name++;
+ }
+
+ delegate_msk &= ~msk;
+ first = false;
+ }
+ }
+ }
+ if (delegate_msk)
+ seq_printf(m, "%c0x%llx", first ? '=' : ':', delegate_msk);
+}
+
+/*
+ * Display the mount options in /proc/mounts.
+ */
+static int bpf_show_options(struct seq_file *m, struct dentry *root)
+{
+ struct inode *inode = d_inode(root);
+ umode_t mode = inode->i_mode & S_IALLUGO & ~S_ISVTX;
+ struct bpf_mount_opts *opts = root->d_sb->s_fs_info;
+ u64 mask;
+
+ if (!uid_eq(inode->i_uid, GLOBAL_ROOT_UID))
+ seq_printf(m, ",uid=%u",
+ from_kuid_munged(&init_user_ns, inode->i_uid));
+ if (!gid_eq(inode->i_gid, GLOBAL_ROOT_GID))
+ seq_printf(m, ",gid=%u",
+ from_kgid_munged(&init_user_ns, inode->i_gid));
+ if (mode != S_IRWXUGO)
+ seq_printf(m, ",mode=%o", mode);
+
+ if (opts->delegate_cmds || opts->delegate_maps ||
+ opts->delegate_progs || opts->delegate_attachs) {
+ struct bpffs_btf_enums info;
+
+ /* ignore errors, fallback to hex */
+ (void)find_bpffs_btf_enums(&info);
+
+ mask = (1ULL << __MAX_BPF_CMD) - 1;
+ seq_print_delegate_opts(m, "delegate_cmds",
+ info.btf, info.cmd_t, "BPF_",
+ opts->delegate_cmds, mask);
+
+ mask = (1ULL << __MAX_BPF_MAP_TYPE) - 1;
+ seq_print_delegate_opts(m, "delegate_maps",
+ info.btf, info.map_t, "BPF_MAP_TYPE_",
+ opts->delegate_maps, mask);
+
+ mask = (1ULL << __MAX_BPF_PROG_TYPE) - 1;
+ seq_print_delegate_opts(m, "delegate_progs",
+ info.btf, info.prog_t, "BPF_PROG_TYPE_",
+ opts->delegate_progs, mask);
+
+ mask = (1ULL << __MAX_BPF_ATTACH_TYPE) - 1;
+ seq_print_delegate_opts(m, "delegate_attachs",
+ info.btf, info.attach_t, "BPF_",
+ opts->delegate_attachs, mask);
+ }
+
+ return 0;
+}
+
+static void bpf_destroy_inode(struct inode *inode)
+{
+ enum bpf_type type;
+
+ if (S_ISLNK(inode->i_mode))
+ kfree(inode->i_link);
+ if (!bpf_inode_type(inode, &type))
+ bpf_any_put(inode->i_private, type);
+ free_inode_nonrcu(inode);
+}
+
+const struct super_operations bpf_super_ops = {
+ .statfs = simple_statfs,
+ .drop_inode = inode_just_drop,
+ .show_options = bpf_show_options,
+ .destroy_inode = bpf_destroy_inode,
+};
+
+enum {
+ OPT_UID,
+ OPT_GID,
+ OPT_MODE,
+ OPT_DELEGATE_CMDS,
+ OPT_DELEGATE_MAPS,
+ OPT_DELEGATE_PROGS,
+ OPT_DELEGATE_ATTACHS,
+};
+
+static const struct fs_parameter_spec bpf_fs_parameters[] = {
+ fsparam_u32 ("uid", OPT_UID),
+ fsparam_u32 ("gid", OPT_GID),
+ fsparam_u32oct ("mode", OPT_MODE),
+ fsparam_string ("delegate_cmds", OPT_DELEGATE_CMDS),
+ fsparam_string ("delegate_maps", OPT_DELEGATE_MAPS),
+ fsparam_string ("delegate_progs", OPT_DELEGATE_PROGS),
+ fsparam_string ("delegate_attachs", OPT_DELEGATE_ATTACHS),
+ {}
+};
+
+static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ struct bpf_mount_opts *opts = fc->s_fs_info;
+ struct fs_parse_result result;
+ kuid_t uid;
+ kgid_t gid;
+ int opt, err;
+
+ opt = fs_parse(fc, bpf_fs_parameters, param, &result);
+ if (opt < 0) {
+ /* We might like to report bad mount options here, but
+ * traditionally we've ignored all mount options, so we'd
+ * better continue to ignore non-existing options for bpf.
+ */
+ if (opt == -ENOPARAM) {
+ opt = vfs_parse_fs_param_source(fc, param);
+ if (opt != -ENOPARAM)
+ return opt;
+
+ return 0;
+ }
+
+ if (opt < 0)
+ return opt;
+ }
+
+ switch (opt) {
+ case OPT_UID:
+ uid = make_kuid(current_user_ns(), result.uint_32);
+ if (!uid_valid(uid))
+ goto bad_value;
+
+ /*
+ * The requested uid must be representable in the
+ * filesystem's idmapping.
+ */
+ if (!kuid_has_mapping(fc->user_ns, uid))
+ goto bad_value;
+
+ opts->uid = uid;
+ break;
+ case OPT_GID:
+ gid = make_kgid(current_user_ns(), result.uint_32);
+ if (!gid_valid(gid))
+ goto bad_value;
+
+ /*
+ * The requested gid must be representable in the
+ * filesystem's idmapping.
+ */
+ if (!kgid_has_mapping(fc->user_ns, gid))
+ goto bad_value;
+
+ opts->gid = gid;
+ break;
+ case OPT_MODE:
+ opts->mode = result.uint_32 & S_IALLUGO;
+ break;
+ case OPT_DELEGATE_CMDS:
+ case OPT_DELEGATE_MAPS:
+ case OPT_DELEGATE_PROGS:
+ case OPT_DELEGATE_ATTACHS: {
+ struct bpffs_btf_enums info;
+ const struct btf_type *enum_t;
+ const char *enum_pfx;
+ u64 *delegate_msk, msk = 0;
+ char *p, *str;
+ int val;
+
+ /* ignore errors, fallback to hex */
+ (void)find_bpffs_btf_enums(&info);
+
+ switch (opt) {
+ case OPT_DELEGATE_CMDS:
+ delegate_msk = &opts->delegate_cmds;
+ enum_t = info.cmd_t;
+ enum_pfx = "BPF_";
+ break;
+ case OPT_DELEGATE_MAPS:
+ delegate_msk = &opts->delegate_maps;
+ enum_t = info.map_t;
+ enum_pfx = "BPF_MAP_TYPE_";
+ break;
+ case OPT_DELEGATE_PROGS:
+ delegate_msk = &opts->delegate_progs;
+ enum_t = info.prog_t;
+ enum_pfx = "BPF_PROG_TYPE_";
+ break;
+ case OPT_DELEGATE_ATTACHS:
+ delegate_msk = &opts->delegate_attachs;
+ enum_t = info.attach_t;
+ enum_pfx = "BPF_";
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ str = param->string;
+ while ((p = strsep(&str, ":"))) {
+ if (strcmp(p, "any") == 0) {
+ msk |= ~0ULL;
+ } else if (find_btf_enum_const(info.btf, enum_t, enum_pfx, p, &val)) {
+ msk |= 1ULL << val;
+ } else {
+ err = kstrtou64(p, 0, &msk);
+ if (err)
+ return err;
+ }
+ }
+
+ /* Setting delegation mount options requires privileges */
+ if (msk && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ *delegate_msk |= msk;
+ break;
+ }
+ default:
+ /* ignore unknown mount options */
+ break;
+ }
+
+ return 0;
+bad_value:
+ return invalfc(fc, "Bad value for '%s'", param->key);
+}
+
+struct bpf_preload_ops *bpf_preload_ops;
+EXPORT_SYMBOL_GPL(bpf_preload_ops);
+
+static bool bpf_preload_mod_get(void)
+{
+ /* If bpf_preload.ko wasn't loaded earlier then load it now.
+ * When bpf_preload is built into vmlinux the module's __init
+ * function will populate it.
+ */
+ if (!bpf_preload_ops) {
+ request_module("bpf_preload");
+ if (!bpf_preload_ops)
+ return false;
+ }
+ /* And grab the reference, so the module doesn't disappear while the
+ * kernel is interacting with the kernel module and its UMD.
+ */
+ if (!try_module_get(bpf_preload_ops->owner)) {
+ pr_err("bpf_preload module get failed.\n");
+ return false;
+ }
+ return true;
+}
+
+static void bpf_preload_mod_put(void)
+{
+ if (bpf_preload_ops)
+ /* now user can "rmmod bpf_preload" if necessary */
+ module_put(bpf_preload_ops->owner);
+}
+
+static DEFINE_MUTEX(bpf_preload_lock);
+
+static int populate_bpffs(struct dentry *parent)
+{
+ struct bpf_preload_info objs[BPF_PRELOAD_LINKS] = {};
+ int err = 0, i;
+
+ /* grab the mutex to make sure the kernel interactions with bpf_preload
+ * are serialized
+ */
+ mutex_lock(&bpf_preload_lock);
+
+ /* if bpf_preload.ko wasn't built into vmlinux then load it */
+ if (!bpf_preload_mod_get())
+ goto out;
+
+ err = bpf_preload_ops->preload(objs);
+ if (err)
+ goto out_put;
+ for (i = 0; i < BPF_PRELOAD_LINKS; i++) {
+ bpf_link_inc(objs[i].link);
+ err = bpf_iter_link_pin_kernel(parent,
+ objs[i].link_name, objs[i].link);
+ if (err) {
+ bpf_link_put(objs[i].link);
+ goto out_put;
+ }
+ }
+out_put:
+ bpf_preload_mod_put();
+out:
+ mutex_unlock(&bpf_preload_lock);
+ return err;
+}
+
+static int bpf_fill_super(struct super_block *sb, struct fs_context *fc)
+{
+ static const struct tree_descr bpf_rfiles[] = { { "" } };
+ struct bpf_mount_opts *opts = sb->s_fs_info;
+ struct inode *inode;
+ int ret;
+
+ /* Mounting an instance of BPF FS requires privileges */
+ if (fc->user_ns != &init_user_ns && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles);
+ if (ret)
+ return ret;
+
+ sb->s_op = &bpf_super_ops;
+
+ inode = sb->s_root->d_inode;
+ inode->i_uid = opts->uid;
+ inode->i_gid = opts->gid;
+ inode->i_op = &bpf_dir_iops;
+ inode->i_mode &= ~S_IALLUGO;
+ populate_bpffs(sb->s_root);
+ inode->i_mode |= S_ISVTX | opts->mode;
+ return 0;
+}
+
+static int bpf_get_tree(struct fs_context *fc)
+{
+ return get_tree_nodev(fc, bpf_fill_super);
+}
+
+static void bpf_free_fc(struct fs_context *fc)
+{
+ kfree(fc->s_fs_info);
+}
+
+static const struct fs_context_operations bpf_context_ops = {
+ .free = bpf_free_fc,
+ .parse_param = bpf_parse_param,
+ .get_tree = bpf_get_tree,
+};
+
+/*
+ * Set up the filesystem mount context.
+ */
+static int bpf_init_fs_context(struct fs_context *fc)
+{
+ struct bpf_mount_opts *opts;
+
+ opts = kzalloc(sizeof(struct bpf_mount_opts), GFP_KERNEL);
+ if (!opts)
+ return -ENOMEM;
+
+ opts->mode = S_IRWXUGO;
+ opts->uid = current_fsuid();
+ opts->gid = current_fsgid();
+
+ /* start out with no BPF token delegation enabled */
+ opts->delegate_cmds = 0;
+ opts->delegate_maps = 0;
+ opts->delegate_progs = 0;
+ opts->delegate_attachs = 0;
+
+ fc->s_fs_info = opts;
+ fc->ops = &bpf_context_ops;
+ return 0;
+}
+
+static void bpf_kill_super(struct super_block *sb)
+{
+ struct bpf_mount_opts *opts = sb->s_fs_info;
+
+ kill_anon_super(sb);
+ kfree(opts);
+}
+
+static struct file_system_type bpf_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "bpf",
+ .init_fs_context = bpf_init_fs_context,
+ .parameters = bpf_fs_parameters,
+ .kill_sb = bpf_kill_super,
+ .fs_flags = FS_USERNS_MOUNT,
+};
+
+static int __init bpf_init(void)
+{
+ int ret;
+
+ ret = sysfs_create_mount_point(fs_kobj, "bpf");
+ if (ret)
+ return ret;
+
+ ret = register_filesystem(&bpf_fs_type);
+ if (ret)
+ sysfs_remove_mount_point(fs_kobj, "bpf");
+
+ return ret;
+}
+fs_initcall(bpf_init);
diff --git a/kernel/bpf/kmem_cache_iter.c b/kernel/bpf/kmem_cache_iter.c
new file mode 100644
index 000000000000..3ae2158d767f
--- /dev/null
+++ b/kernel/bpf/kmem_cache_iter.c
@@ -0,0 +1,238 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2024 Google */
+#include <linux/bpf.h>
+#include <linux/btf_ids.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/seq_file.h>
+
+#include "../../mm/slab.h" /* kmem_cache, slab_caches and slab_mutex */
+
+/* open-coded version */
+struct bpf_iter_kmem_cache {
+ __u64 __opaque[1];
+} __attribute__((aligned(8)));
+
+struct bpf_iter_kmem_cache_kern {
+ struct kmem_cache *pos;
+} __attribute__((aligned(8)));
+
+#define KMEM_CACHE_POS_START ((void *)1L)
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc int bpf_iter_kmem_cache_new(struct bpf_iter_kmem_cache *it)
+{
+ struct bpf_iter_kmem_cache_kern *kit = (void *)it;
+
+ BUILD_BUG_ON(sizeof(*kit) > sizeof(*it));
+ BUILD_BUG_ON(__alignof__(*kit) != __alignof__(*it));
+
+ kit->pos = KMEM_CACHE_POS_START;
+ return 0;
+}
+
+__bpf_kfunc struct kmem_cache *bpf_iter_kmem_cache_next(struct bpf_iter_kmem_cache *it)
+{
+ struct bpf_iter_kmem_cache_kern *kit = (void *)it;
+ struct kmem_cache *prev = kit->pos;
+ struct kmem_cache *next;
+ bool destroy = false;
+
+ if (!prev)
+ return NULL;
+
+ mutex_lock(&slab_mutex);
+
+ if (list_empty(&slab_caches)) {
+ mutex_unlock(&slab_mutex);
+ return NULL;
+ }
+
+ if (prev == KMEM_CACHE_POS_START)
+ next = list_first_entry(&slab_caches, struct kmem_cache, list);
+ else if (list_last_entry(&slab_caches, struct kmem_cache, list) == prev)
+ next = NULL;
+ else
+ next = list_next_entry(prev, list);
+
+ /* boot_caches have negative refcount, don't touch them */
+ if (next && next->refcount > 0)
+ next->refcount++;
+
+ /* Skip kmem_cache_destroy() for active entries */
+ if (prev && prev != KMEM_CACHE_POS_START) {
+ if (prev->refcount > 1)
+ prev->refcount--;
+ else if (prev->refcount == 1)
+ destroy = true;
+ }
+
+ mutex_unlock(&slab_mutex);
+
+ if (destroy)
+ kmem_cache_destroy(prev);
+
+ kit->pos = next;
+ return next;
+}
+
+__bpf_kfunc void bpf_iter_kmem_cache_destroy(struct bpf_iter_kmem_cache *it)
+{
+ struct bpf_iter_kmem_cache_kern *kit = (void *)it;
+ struct kmem_cache *s = kit->pos;
+ bool destroy = false;
+
+ if (s == NULL || s == KMEM_CACHE_POS_START)
+ return;
+
+ mutex_lock(&slab_mutex);
+
+ /* Skip kmem_cache_destroy() for active entries */
+ if (s->refcount > 1)
+ s->refcount--;
+ else if (s->refcount == 1)
+ destroy = true;
+
+ mutex_unlock(&slab_mutex);
+
+ if (destroy)
+ kmem_cache_destroy(s);
+}
+
+__bpf_kfunc_end_defs();
+
+struct bpf_iter__kmem_cache {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct kmem_cache *, s);
+};
+
+union kmem_cache_iter_priv {
+ struct bpf_iter_kmem_cache it;
+ struct bpf_iter_kmem_cache_kern kit;
+};
+
+static void *kmem_cache_iter_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ loff_t cnt = 0;
+ bool found = false;
+ struct kmem_cache *s;
+ union kmem_cache_iter_priv *p = seq->private;
+
+ mutex_lock(&slab_mutex);
+
+ /* Find an entry at the given position in the slab_caches list instead
+ * of keeping a reference (of the last visited entry, if any) out of
+ * slab_mutex. It might miss something if one is deleted in the middle
+ * while it releases the lock. But it should be rare and there's not
+ * much we can do about it.
+ */
+ list_for_each_entry(s, &slab_caches, list) {
+ if (cnt == *pos) {
+ /* Make sure this entry remains in the list by getting
+ * a new reference count. Note that boot_cache entries
+ * have a negative refcount, so don't touch them.
+ */
+ if (s->refcount > 0)
+ s->refcount++;
+ found = true;
+ break;
+ }
+ cnt++;
+ }
+ mutex_unlock(&slab_mutex);
+
+ if (!found)
+ s = NULL;
+
+ p->kit.pos = s;
+ return s;
+}
+
+static void kmem_cache_iter_seq_stop(struct seq_file *seq, void *v)
+{
+ struct bpf_iter_meta meta;
+ struct bpf_iter__kmem_cache ctx = {
+ .meta = &meta,
+ .s = v,
+ };
+ union kmem_cache_iter_priv *p = seq->private;
+ struct bpf_prog *prog;
+
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, true);
+ if (prog && !ctx.s)
+ bpf_iter_run_prog(prog, &ctx);
+
+ bpf_iter_kmem_cache_destroy(&p->it);
+}
+
+static void *kmem_cache_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ union kmem_cache_iter_priv *p = seq->private;
+
+ ++*pos;
+
+ return bpf_iter_kmem_cache_next(&p->it);
+}
+
+static int kmem_cache_iter_seq_show(struct seq_file *seq, void *v)
+{
+ struct bpf_iter_meta meta;
+ struct bpf_iter__kmem_cache ctx = {
+ .meta = &meta,
+ .s = v,
+ };
+ struct bpf_prog *prog;
+ int ret = 0;
+
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, false);
+ if (prog)
+ ret = bpf_iter_run_prog(prog, &ctx);
+
+ return ret;
+}
+
+static const struct seq_operations kmem_cache_iter_seq_ops = {
+ .start = kmem_cache_iter_seq_start,
+ .next = kmem_cache_iter_seq_next,
+ .stop = kmem_cache_iter_seq_stop,
+ .show = kmem_cache_iter_seq_show,
+};
+
+BTF_ID_LIST_GLOBAL_SINGLE(bpf_kmem_cache_btf_id, struct, kmem_cache)
+
+static const struct bpf_iter_seq_info kmem_cache_iter_seq_info = {
+ .seq_ops = &kmem_cache_iter_seq_ops,
+ .seq_priv_size = sizeof(union kmem_cache_iter_priv),
+};
+
+static void bpf_iter_kmem_cache_show_fdinfo(const struct bpf_iter_aux_info *aux,
+ struct seq_file *seq)
+{
+ seq_puts(seq, "kmem_cache iter\n");
+}
+
+DEFINE_BPF_ITER_FUNC(kmem_cache, struct bpf_iter_meta *meta,
+ struct kmem_cache *s)
+
+static struct bpf_iter_reg bpf_kmem_cache_reg_info = {
+ .target = "kmem_cache",
+ .feature = BPF_ITER_RESCHED,
+ .show_fdinfo = bpf_iter_kmem_cache_show_fdinfo,
+ .ctx_arg_info_size = 1,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__kmem_cache, s),
+ PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
+ },
+ .seq_info = &kmem_cache_iter_seq_info,
+};
+
+static int __init bpf_kmem_cache_iter_init(void)
+{
+ bpf_kmem_cache_reg_info.ctx_arg_info[0].btf_id = bpf_kmem_cache_btf_id[0];
+ return bpf_iter_reg_target(&bpf_kmem_cache_reg_info);
+}
+
+late_initcall(bpf_kmem_cache_iter_init);
diff --git a/kernel/bpf/link_iter.c b/kernel/bpf/link_iter.c
new file mode 100644
index 000000000000..8158e9c1af7b
--- /dev/null
+++ b/kernel/bpf/link_iter.c
@@ -0,0 +1,106 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2022 Red Hat, Inc. */
+#include <linux/bpf.h>
+#include <linux/fs.h>
+#include <linux/filter.h>
+#include <linux/kernel.h>
+#include <linux/btf_ids.h>
+
+struct bpf_iter_seq_link_info {
+ u32 link_id;
+};
+
+static void *bpf_link_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct bpf_iter_seq_link_info *info = seq->private;
+ struct bpf_link *link;
+
+ link = bpf_link_get_curr_or_next(&info->link_id);
+ if (!link)
+ return NULL;
+
+ if (*pos == 0)
+ ++*pos;
+ return link;
+}
+
+static void *bpf_link_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct bpf_iter_seq_link_info *info = seq->private;
+
+ ++*pos;
+ ++info->link_id;
+ bpf_link_put((struct bpf_link *)v);
+ return bpf_link_get_curr_or_next(&info->link_id);
+}
+
+struct bpf_iter__bpf_link {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct bpf_link *, link);
+};
+
+DEFINE_BPF_ITER_FUNC(bpf_link, struct bpf_iter_meta *meta, struct bpf_link *link)
+
+static int __bpf_link_seq_show(struct seq_file *seq, void *v, bool in_stop)
+{
+ struct bpf_iter__bpf_link ctx;
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+ int ret = 0;
+
+ ctx.meta = &meta;
+ ctx.link = v;
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, in_stop);
+ if (prog)
+ ret = bpf_iter_run_prog(prog, &ctx);
+
+ return ret;
+}
+
+static int bpf_link_seq_show(struct seq_file *seq, void *v)
+{
+ return __bpf_link_seq_show(seq, v, false);
+}
+
+static void bpf_link_seq_stop(struct seq_file *seq, void *v)
+{
+ if (!v)
+ (void)__bpf_link_seq_show(seq, v, true);
+ else
+ bpf_link_put((struct bpf_link *)v);
+}
+
+static const struct seq_operations bpf_link_seq_ops = {
+ .start = bpf_link_seq_start,
+ .next = bpf_link_seq_next,
+ .stop = bpf_link_seq_stop,
+ .show = bpf_link_seq_show,
+};
+
+BTF_ID_LIST_SINGLE(btf_bpf_link_id, struct, bpf_link)
+
+static const struct bpf_iter_seq_info bpf_link_seq_info = {
+ .seq_ops = &bpf_link_seq_ops,
+ .init_seq_private = NULL,
+ .fini_seq_private = NULL,
+ .seq_priv_size = sizeof(struct bpf_iter_seq_link_info),
+};
+
+static struct bpf_iter_reg bpf_link_reg_info = {
+ .target = "bpf_link",
+ .ctx_arg_info_size = 1,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__bpf_link, link),
+ PTR_TO_BTF_ID_OR_NULL },
+ },
+ .seq_info = &bpf_link_seq_info,
+};
+
+static int __init bpf_link_iter_init(void)
+{
+ bpf_link_reg_info.ctx_arg_info[0].btf_id = *btf_bpf_link_id;
+ return bpf_iter_reg_target(&bpf_link_reg_info);
+}
+
+late_initcall(bpf_link_iter_init);
diff --git a/kernel/bpf/liveness.c b/kernel/bpf/liveness.c
new file mode 100644
index 000000000000..60db5d655495
--- /dev/null
+++ b/kernel/bpf/liveness.c
@@ -0,0 +1,753 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
+
+#include <linux/bpf_verifier.h>
+#include <linux/hashtable.h>
+#include <linux/jhash.h>
+#include <linux/slab.h>
+
+/*
+ * This file implements live stack slots analysis. After accumulating
+ * stack usage data, the analysis answers queries about whether a
+ * particular stack slot may be read by an instruction or any of it's
+ * successors. This data is consumed by the verifier states caching
+ * mechanism to decide which stack slots are important when looking for a
+ * visited state corresponding to the current state.
+ *
+ * The analysis is call chain sensitive, meaning that data is collected
+ * and queried for tuples (call chain, subprogram instruction index).
+ * Such sensitivity allows identifying if some subprogram call always
+ * leads to writes in the caller's stack.
+ *
+ * The basic idea is as follows:
+ * - As the verifier accumulates a set of visited states, the analysis instance
+ * accumulates a conservative estimate of stack slots that can be read
+ * or must be written for each visited tuple (call chain, instruction index).
+ * - If several states happen to visit the same instruction with the same
+ * call chain, stack usage information for the corresponding tuple is joined:
+ * - "may_read" set represents a union of all possibly read slots
+ * (any slot in "may_read" set might be read at or after the instruction);
+ * - "must_write" set represents an intersection of all possibly written slots
+ * (any slot in "must_write" set is guaranteed to be written by the instruction).
+ * - The analysis is split into two phases:
+ * - read and write marks accumulation;
+ * - read and write marks propagation.
+ * - The propagation phase is a textbook live variable data flow analysis:
+ *
+ * state[cc, i].live_after = U [state[cc, s].live_before for s in bpf_insn_successors(i)]
+ * state[cc, i].live_before =
+ * (state[cc, i].live_after / state[cc, i].must_write) U state[i].may_read
+ *
+ * Where:
+ * - `U` stands for set union
+ * - `/` stands for set difference;
+ * - `cc` stands for a call chain;
+ * - `i` and `s` are instruction indexes;
+ *
+ * The above equations are computed for each call chain and instruction
+ * index until state stops changing.
+ * - Additionally, in order to transfer "must_write" information from a
+ * subprogram to call instructions invoking this subprogram,
+ * the "must_write_acc" set is tracked for each (cc, i) tuple.
+ * A set of stack slots that are guaranteed to be written by this
+ * instruction or any of its successors (within the subprogram).
+ * The equation for "must_write_acc" propagation looks as follows:
+ *
+ * state[cc, i].must_write_acc =
+ * ∩ [state[cc, s].must_write_acc for s in bpf_insn_successors(i)]
+ * U state[cc, i].must_write
+ *
+ * (An intersection of all "must_write_acc" for instruction successors
+ * plus all "must_write" slots for the instruction itself).
+ * - After the propagation phase completes for a subprogram, information from
+ * (cc, 0) tuple (subprogram entry) is transferred to the caller's call chain:
+ * - "must_write_acc" set is intersected with the call site's "must_write" set;
+ * - "may_read" set is added to the call site's "may_read" set.
+ * - Any live stack queries must be taken after the propagation phase.
+ * - Accumulation and propagation phases can be entered multiple times,
+ * at any point in time:
+ * - "may_read" set only grows;
+ * - "must_write" set only shrinks;
+ * - for each visited verifier state with zero branches, all relevant
+ * read and write marks are already recorded by the analysis instance.
+ *
+ * Technically, the analysis is facilitated by the following data structures:
+ * - Call chain: for given verifier state, the call chain is a tuple of call
+ * instruction indexes leading to the current subprogram plus the subprogram
+ * entry point index.
+ * - Function instance: for a given call chain, for each instruction in
+ * the current subprogram, a mapping between instruction index and a
+ * set of "may_read", "must_write" and other marks accumulated for this
+ * instruction.
+ * - A hash table mapping call chains to function instances.
+ */
+
+struct callchain {
+ u32 callsites[MAX_CALL_FRAMES]; /* instruction pointer for each frame */
+ /* cached subprog_info[*].start for functions owning the frames:
+ * - sp_starts[curframe] used to get insn relative index within current function;
+ * - sp_starts[0..current-1] used for fast callchain_frame_up().
+ */
+ u32 sp_starts[MAX_CALL_FRAMES];
+ u32 curframe; /* depth of callsites and sp_starts arrays */
+};
+
+struct per_frame_masks {
+ u64 may_read; /* stack slots that may be read by this instruction */
+ u64 must_write; /* stack slots written by this instruction */
+ u64 must_write_acc; /* stack slots written by this instruction and its successors */
+ u64 live_before; /* stack slots that may be read by this insn and its successors */
+};
+
+/*
+ * A function instance created for a specific callchain.
+ * Encapsulates read and write marks for each instruction in the function.
+ * Marks are tracked for each frame in the callchain.
+ */
+struct func_instance {
+ struct hlist_node hl_node;
+ struct callchain callchain;
+ u32 insn_cnt; /* cached number of insns in the function */
+ bool updated;
+ bool must_write_dropped;
+ /* Per frame, per instruction masks, frames allocated lazily. */
+ struct per_frame_masks *frames[MAX_CALL_FRAMES];
+ /* For each instruction a flag telling if "must_write" had been initialized for it. */
+ bool *must_write_set;
+};
+
+struct live_stack_query {
+ struct func_instance *instances[MAX_CALL_FRAMES]; /* valid in range [0..curframe] */
+ u32 curframe;
+ u32 insn_idx;
+};
+
+struct bpf_liveness {
+ DECLARE_HASHTABLE(func_instances, 8); /* maps callchain to func_instance */
+ struct live_stack_query live_stack_query; /* cache to avoid repetitive ht lookups */
+ /* Cached instance corresponding to env->cur_state, avoids per-instruction ht lookup */
+ struct func_instance *cur_instance;
+ /*
+ * Below fields are used to accumulate stack write marks for instruction at
+ * @write_insn_idx before submitting the marks to @cur_instance.
+ */
+ u64 write_masks_acc[MAX_CALL_FRAMES];
+ u32 write_insn_idx;
+};
+
+/* Compute callchain corresponding to state @st at depth @frameno */
+static void compute_callchain(struct bpf_verifier_env *env, struct bpf_verifier_state *st,
+ struct callchain *callchain, u32 frameno)
+{
+ struct bpf_subprog_info *subprog_info = env->subprog_info;
+ u32 i;
+
+ memset(callchain, 0, sizeof(*callchain));
+ for (i = 0; i <= frameno; i++) {
+ callchain->sp_starts[i] = subprog_info[st->frame[i]->subprogno].start;
+ if (i < st->curframe)
+ callchain->callsites[i] = st->frame[i + 1]->callsite;
+ }
+ callchain->curframe = frameno;
+ callchain->callsites[callchain->curframe] = callchain->sp_starts[callchain->curframe];
+}
+
+static u32 hash_callchain(struct callchain *callchain)
+{
+ return jhash2(callchain->callsites, callchain->curframe, 0);
+}
+
+static bool same_callsites(struct callchain *a, struct callchain *b)
+{
+ int i;
+
+ if (a->curframe != b->curframe)
+ return false;
+ for (i = a->curframe; i >= 0; i--)
+ if (a->callsites[i] != b->callsites[i])
+ return false;
+ return true;
+}
+
+/*
+ * Find existing or allocate new function instance corresponding to @callchain.
+ * Instances are accumulated in env->liveness->func_instances and persist
+ * until the end of the verification process.
+ */
+static struct func_instance *__lookup_instance(struct bpf_verifier_env *env,
+ struct callchain *callchain)
+{
+ struct bpf_liveness *liveness = env->liveness;
+ struct bpf_subprog_info *subprog;
+ struct func_instance *result;
+ u32 subprog_sz, size, key;
+
+ key = hash_callchain(callchain);
+ hash_for_each_possible(liveness->func_instances, result, hl_node, key)
+ if (same_callsites(&result->callchain, callchain))
+ return result;
+
+ subprog = bpf_find_containing_subprog(env, callchain->sp_starts[callchain->curframe]);
+ subprog_sz = (subprog + 1)->start - subprog->start;
+ size = sizeof(struct func_instance);
+ result = kvzalloc(size, GFP_KERNEL_ACCOUNT);
+ if (!result)
+ return ERR_PTR(-ENOMEM);
+ result->must_write_set = kvcalloc(subprog_sz, sizeof(*result->must_write_set),
+ GFP_KERNEL_ACCOUNT);
+ if (!result->must_write_set) {
+ kvfree(result);
+ return ERR_PTR(-ENOMEM);
+ }
+ memcpy(&result->callchain, callchain, sizeof(*callchain));
+ result->insn_cnt = subprog_sz;
+ hash_add(liveness->func_instances, &result->hl_node, key);
+ return result;
+}
+
+static struct func_instance *lookup_instance(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *st,
+ u32 frameno)
+{
+ struct callchain callchain;
+
+ compute_callchain(env, st, &callchain, frameno);
+ return __lookup_instance(env, &callchain);
+}
+
+int bpf_stack_liveness_init(struct bpf_verifier_env *env)
+{
+ env->liveness = kvzalloc(sizeof(*env->liveness), GFP_KERNEL_ACCOUNT);
+ if (!env->liveness)
+ return -ENOMEM;
+ hash_init(env->liveness->func_instances);
+ return 0;
+}
+
+void bpf_stack_liveness_free(struct bpf_verifier_env *env)
+{
+ struct func_instance *instance;
+ struct hlist_node *tmp;
+ int bkt, i;
+
+ if (!env->liveness)
+ return;
+ hash_for_each_safe(env->liveness->func_instances, bkt, tmp, instance, hl_node) {
+ for (i = 0; i <= instance->callchain.curframe; i++)
+ kvfree(instance->frames[i]);
+ kvfree(instance->must_write_set);
+ kvfree(instance);
+ }
+ kvfree(env->liveness);
+}
+
+/*
+ * Convert absolute instruction index @insn_idx to an index relative
+ * to start of the function corresponding to @instance.
+ */
+static int relative_idx(struct func_instance *instance, u32 insn_idx)
+{
+ return insn_idx - instance->callchain.sp_starts[instance->callchain.curframe];
+}
+
+static struct per_frame_masks *get_frame_masks(struct func_instance *instance,
+ u32 frame, u32 insn_idx)
+{
+ if (!instance->frames[frame])
+ return NULL;
+
+ return &instance->frames[frame][relative_idx(instance, insn_idx)];
+}
+
+static struct per_frame_masks *alloc_frame_masks(struct bpf_verifier_env *env,
+ struct func_instance *instance,
+ u32 frame, u32 insn_idx)
+{
+ struct per_frame_masks *arr;
+
+ if (!instance->frames[frame]) {
+ arr = kvcalloc(instance->insn_cnt, sizeof(*arr), GFP_KERNEL_ACCOUNT);
+ instance->frames[frame] = arr;
+ if (!arr)
+ return ERR_PTR(-ENOMEM);
+ }
+ return get_frame_masks(instance, frame, insn_idx);
+}
+
+void bpf_reset_live_stack_callchain(struct bpf_verifier_env *env)
+{
+ env->liveness->cur_instance = NULL;
+}
+
+/* If @env->liveness->cur_instance is null, set it to instance corresponding to @env->cur_state. */
+static int ensure_cur_instance(struct bpf_verifier_env *env)
+{
+ struct bpf_liveness *liveness = env->liveness;
+ struct func_instance *instance;
+
+ if (liveness->cur_instance)
+ return 0;
+
+ instance = lookup_instance(env, env->cur_state, env->cur_state->curframe);
+ if (IS_ERR(instance))
+ return PTR_ERR(instance);
+
+ liveness->cur_instance = instance;
+ return 0;
+}
+
+/* Accumulate may_read masks for @frame at @insn_idx */
+static int mark_stack_read(struct bpf_verifier_env *env,
+ struct func_instance *instance, u32 frame, u32 insn_idx, u64 mask)
+{
+ struct per_frame_masks *masks;
+ u64 new_may_read;
+
+ masks = alloc_frame_masks(env, instance, frame, insn_idx);
+ if (IS_ERR(masks))
+ return PTR_ERR(masks);
+ new_may_read = masks->may_read | mask;
+ if (new_may_read != masks->may_read &&
+ ((new_may_read | masks->live_before) != masks->live_before))
+ instance->updated = true;
+ masks->may_read |= mask;
+ return 0;
+}
+
+int bpf_mark_stack_read(struct bpf_verifier_env *env, u32 frame, u32 insn_idx, u64 mask)
+{
+ int err;
+
+ err = ensure_cur_instance(env);
+ err = err ?: mark_stack_read(env, env->liveness->cur_instance, frame, insn_idx, mask);
+ return err;
+}
+
+static void reset_stack_write_marks(struct bpf_verifier_env *env,
+ struct func_instance *instance, u32 insn_idx)
+{
+ struct bpf_liveness *liveness = env->liveness;
+ int i;
+
+ liveness->write_insn_idx = insn_idx;
+ for (i = 0; i <= instance->callchain.curframe; i++)
+ liveness->write_masks_acc[i] = 0;
+}
+
+int bpf_reset_stack_write_marks(struct bpf_verifier_env *env, u32 insn_idx)
+{
+ struct bpf_liveness *liveness = env->liveness;
+ int err;
+
+ err = ensure_cur_instance(env);
+ if (err)
+ return err;
+
+ reset_stack_write_marks(env, liveness->cur_instance, insn_idx);
+ return 0;
+}
+
+void bpf_mark_stack_write(struct bpf_verifier_env *env, u32 frame, u64 mask)
+{
+ env->liveness->write_masks_acc[frame] |= mask;
+}
+
+static int commit_stack_write_marks(struct bpf_verifier_env *env,
+ struct func_instance *instance)
+{
+ struct bpf_liveness *liveness = env->liveness;
+ u32 idx, frame, curframe, old_must_write;
+ struct per_frame_masks *masks;
+ u64 mask;
+
+ if (!instance)
+ return 0;
+
+ curframe = instance->callchain.curframe;
+ idx = relative_idx(instance, liveness->write_insn_idx);
+ for (frame = 0; frame <= curframe; frame++) {
+ mask = liveness->write_masks_acc[frame];
+ /* avoid allocating frames for zero masks */
+ if (mask == 0 && !instance->must_write_set[idx])
+ continue;
+ masks = alloc_frame_masks(env, instance, frame, liveness->write_insn_idx);
+ if (IS_ERR(masks))
+ return PTR_ERR(masks);
+ old_must_write = masks->must_write;
+ /*
+ * If instruction at this callchain is seen for a first time, set must_write equal
+ * to @mask. Otherwise take intersection with the previous value.
+ */
+ if (instance->must_write_set[idx])
+ mask &= old_must_write;
+ if (old_must_write != mask) {
+ masks->must_write = mask;
+ instance->updated = true;
+ }
+ if (old_must_write & ~mask)
+ instance->must_write_dropped = true;
+ }
+ instance->must_write_set[idx] = true;
+ liveness->write_insn_idx = 0;
+ return 0;
+}
+
+/*
+ * Merge stack writes marks in @env->liveness->write_masks_acc
+ * with information already in @env->liveness->cur_instance.
+ */
+int bpf_commit_stack_write_marks(struct bpf_verifier_env *env)
+{
+ return commit_stack_write_marks(env, env->liveness->cur_instance);
+}
+
+static char *fmt_callchain(struct bpf_verifier_env *env, struct callchain *callchain)
+{
+ char *buf_end = env->tmp_str_buf + sizeof(env->tmp_str_buf);
+ char *buf = env->tmp_str_buf;
+ int i;
+
+ buf += snprintf(buf, buf_end - buf, "(");
+ for (i = 0; i <= callchain->curframe; i++)
+ buf += snprintf(buf, buf_end - buf, "%s%d", i ? "," : "", callchain->callsites[i]);
+ snprintf(buf, buf_end - buf, ")");
+ return env->tmp_str_buf;
+}
+
+static void log_mask_change(struct bpf_verifier_env *env, struct callchain *callchain,
+ char *pfx, u32 frame, u32 insn_idx, u64 old, u64 new)
+{
+ u64 changed_bits = old ^ new;
+ u64 new_ones = new & changed_bits;
+ u64 new_zeros = ~new & changed_bits;
+
+ if (!changed_bits)
+ return;
+ bpf_log(&env->log, "%s frame %d insn %d ", fmt_callchain(env, callchain), frame, insn_idx);
+ if (new_ones) {
+ bpf_fmt_stack_mask(env->tmp_str_buf, sizeof(env->tmp_str_buf), new_ones);
+ bpf_log(&env->log, "+%s %s ", pfx, env->tmp_str_buf);
+ }
+ if (new_zeros) {
+ bpf_fmt_stack_mask(env->tmp_str_buf, sizeof(env->tmp_str_buf), new_zeros);
+ bpf_log(&env->log, "-%s %s", pfx, env->tmp_str_buf);
+ }
+ bpf_log(&env->log, "\n");
+}
+
+int bpf_jmp_offset(struct bpf_insn *insn)
+{
+ u8 code = insn->code;
+
+ if (code == (BPF_JMP32 | BPF_JA))
+ return insn->imm;
+ return insn->off;
+}
+
+__diag_push();
+__diag_ignore_all("-Woverride-init", "Allow field initialization overrides for opcode_info_tbl");
+
+/*
+ * Returns an array of instructions succ, with succ->items[0], ...,
+ * succ->items[n-1] with successor instructions, where n=succ->cnt
+ */
+inline struct bpf_iarray *
+bpf_insn_successors(struct bpf_verifier_env *env, u32 idx)
+{
+ static const struct opcode_info {
+ bool can_jump;
+ bool can_fallthrough;
+ } opcode_info_tbl[256] = {
+ [0 ... 255] = {.can_jump = false, .can_fallthrough = true},
+ #define _J(code, ...) \
+ [BPF_JMP | code] = __VA_ARGS__, \
+ [BPF_JMP32 | code] = __VA_ARGS__
+
+ _J(BPF_EXIT, {.can_jump = false, .can_fallthrough = false}),
+ _J(BPF_JA, {.can_jump = true, .can_fallthrough = false}),
+ _J(BPF_JEQ, {.can_jump = true, .can_fallthrough = true}),
+ _J(BPF_JNE, {.can_jump = true, .can_fallthrough = true}),
+ _J(BPF_JLT, {.can_jump = true, .can_fallthrough = true}),
+ _J(BPF_JLE, {.can_jump = true, .can_fallthrough = true}),
+ _J(BPF_JGT, {.can_jump = true, .can_fallthrough = true}),
+ _J(BPF_JGE, {.can_jump = true, .can_fallthrough = true}),
+ _J(BPF_JSGT, {.can_jump = true, .can_fallthrough = true}),
+ _J(BPF_JSGE, {.can_jump = true, .can_fallthrough = true}),
+ _J(BPF_JSLT, {.can_jump = true, .can_fallthrough = true}),
+ _J(BPF_JSLE, {.can_jump = true, .can_fallthrough = true}),
+ _J(BPF_JCOND, {.can_jump = true, .can_fallthrough = true}),
+ _J(BPF_JSET, {.can_jump = true, .can_fallthrough = true}),
+ #undef _J
+ };
+ struct bpf_prog *prog = env->prog;
+ struct bpf_insn *insn = &prog->insnsi[idx];
+ const struct opcode_info *opcode_info;
+ struct bpf_iarray *succ, *jt;
+ int insn_sz;
+
+ jt = env->insn_aux_data[idx].jt;
+ if (unlikely(jt))
+ return jt;
+
+ /* pre-allocated array of size up to 2; reset cnt, as it may have been used already */
+ succ = env->succ;
+ succ->cnt = 0;
+
+ opcode_info = &opcode_info_tbl[BPF_CLASS(insn->code) | BPF_OP(insn->code)];
+ insn_sz = bpf_is_ldimm64(insn) ? 2 : 1;
+ if (opcode_info->can_fallthrough)
+ succ->items[succ->cnt++] = idx + insn_sz;
+
+ if (opcode_info->can_jump)
+ succ->items[succ->cnt++] = idx + bpf_jmp_offset(insn) + 1;
+
+ return succ;
+}
+
+__diag_pop();
+
+static struct func_instance *get_outer_instance(struct bpf_verifier_env *env,
+ struct func_instance *instance)
+{
+ struct callchain callchain = instance->callchain;
+
+ /* Adjust @callchain to represent callchain one frame up */
+ callchain.callsites[callchain.curframe] = 0;
+ callchain.sp_starts[callchain.curframe] = 0;
+ callchain.curframe--;
+ callchain.callsites[callchain.curframe] = callchain.sp_starts[callchain.curframe];
+ return __lookup_instance(env, &callchain);
+}
+
+static u32 callchain_subprog_start(struct callchain *callchain)
+{
+ return callchain->sp_starts[callchain->curframe];
+}
+
+/*
+ * Transfer @may_read and @must_write_acc marks from the first instruction of @instance,
+ * to the call instruction in function instance calling @instance.
+ */
+static int propagate_to_outer_instance(struct bpf_verifier_env *env,
+ struct func_instance *instance)
+{
+ struct callchain *callchain = &instance->callchain;
+ u32 this_subprog_start, callsite, frame;
+ struct func_instance *outer_instance;
+ struct per_frame_masks *insn;
+ int err;
+
+ this_subprog_start = callchain_subprog_start(callchain);
+ outer_instance = get_outer_instance(env, instance);
+ if (IS_ERR(outer_instance))
+ return PTR_ERR(outer_instance);
+ callsite = callchain->callsites[callchain->curframe - 1];
+
+ reset_stack_write_marks(env, outer_instance, callsite);
+ for (frame = 0; frame < callchain->curframe; frame++) {
+ insn = get_frame_masks(instance, frame, this_subprog_start);
+ if (!insn)
+ continue;
+ bpf_mark_stack_write(env, frame, insn->must_write_acc);
+ err = mark_stack_read(env, outer_instance, frame, callsite, insn->live_before);
+ if (err)
+ return err;
+ }
+ commit_stack_write_marks(env, outer_instance);
+ return 0;
+}
+
+static inline bool update_insn(struct bpf_verifier_env *env,
+ struct func_instance *instance, u32 frame, u32 insn_idx)
+{
+ struct bpf_insn_aux_data *aux = env->insn_aux_data;
+ u64 new_before, new_after, must_write_acc;
+ struct per_frame_masks *insn, *succ_insn;
+ struct bpf_iarray *succ;
+ u32 s;
+ bool changed;
+
+ succ = bpf_insn_successors(env, insn_idx);
+ if (succ->cnt == 0)
+ return false;
+
+ changed = false;
+ insn = get_frame_masks(instance, frame, insn_idx);
+ new_before = 0;
+ new_after = 0;
+ /*
+ * New "must_write_acc" is an intersection of all "must_write_acc"
+ * of successors plus all "must_write" slots of instruction itself.
+ */
+ must_write_acc = U64_MAX;
+ for (s = 0; s < succ->cnt; ++s) {
+ succ_insn = get_frame_masks(instance, frame, succ->items[s]);
+ new_after |= succ_insn->live_before;
+ must_write_acc &= succ_insn->must_write_acc;
+ }
+ must_write_acc |= insn->must_write;
+ /*
+ * New "live_before" is a union of all "live_before" of successors
+ * minus slots written by instruction plus slots read by instruction.
+ */
+ new_before = (new_after & ~insn->must_write) | insn->may_read;
+ changed |= new_before != insn->live_before;
+ changed |= must_write_acc != insn->must_write_acc;
+ if (unlikely(env->log.level & BPF_LOG_LEVEL2) &&
+ (insn->may_read || insn->must_write ||
+ insn_idx == callchain_subprog_start(&instance->callchain) ||
+ aux[insn_idx].prune_point)) {
+ log_mask_change(env, &instance->callchain, "live",
+ frame, insn_idx, insn->live_before, new_before);
+ log_mask_change(env, &instance->callchain, "written",
+ frame, insn_idx, insn->must_write_acc, must_write_acc);
+ }
+ insn->live_before = new_before;
+ insn->must_write_acc = must_write_acc;
+ return changed;
+}
+
+/* Fixed-point computation of @live_before and @must_write_acc marks */
+static int update_instance(struct bpf_verifier_env *env, struct func_instance *instance)
+{
+ u32 i, frame, po_start, po_end, cnt, this_subprog_start;
+ struct callchain *callchain = &instance->callchain;
+ int *insn_postorder = env->cfg.insn_postorder;
+ struct bpf_subprog_info *subprog;
+ struct per_frame_masks *insn;
+ bool changed;
+ int err;
+
+ this_subprog_start = callchain_subprog_start(callchain);
+ /*
+ * If must_write marks were updated must_write_acc needs to be reset
+ * (to account for the case when new must_write sets became smaller).
+ */
+ if (instance->must_write_dropped) {
+ for (frame = 0; frame <= callchain->curframe; frame++) {
+ if (!instance->frames[frame])
+ continue;
+
+ for (i = 0; i < instance->insn_cnt; i++) {
+ insn = get_frame_masks(instance, frame, this_subprog_start + i);
+ insn->must_write_acc = 0;
+ }
+ }
+ }
+
+ subprog = bpf_find_containing_subprog(env, this_subprog_start);
+ po_start = subprog->postorder_start;
+ po_end = (subprog + 1)->postorder_start;
+ cnt = 0;
+ /* repeat until fixed point is reached */
+ do {
+ cnt++;
+ changed = false;
+ for (frame = 0; frame <= instance->callchain.curframe; frame++) {
+ if (!instance->frames[frame])
+ continue;
+
+ for (i = po_start; i < po_end; i++)
+ changed |= update_insn(env, instance, frame, insn_postorder[i]);
+ }
+ } while (changed);
+
+ if (env->log.level & BPF_LOG_LEVEL2)
+ bpf_log(&env->log, "%s live stack update done in %d iterations\n",
+ fmt_callchain(env, callchain), cnt);
+
+ /* transfer marks accumulated for outer frames to outer func instance (caller) */
+ if (callchain->curframe > 0) {
+ err = propagate_to_outer_instance(env, instance);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+/*
+ * Prepare all callchains within @env->cur_state for querying.
+ * This function should be called after each verifier.c:pop_stack()
+ * and whenever verifier.c:do_check_insn() processes subprogram exit.
+ * This would guarantee that visited verifier states with zero branches
+ * have their bpf_mark_stack_{read,write}() effects propagated in
+ * @env->liveness.
+ */
+int bpf_update_live_stack(struct bpf_verifier_env *env)
+{
+ struct func_instance *instance;
+ int err, frame;
+
+ bpf_reset_live_stack_callchain(env);
+ for (frame = env->cur_state->curframe; frame >= 0; --frame) {
+ instance = lookup_instance(env, env->cur_state, frame);
+ if (IS_ERR(instance))
+ return PTR_ERR(instance);
+
+ if (instance->updated) {
+ err = update_instance(env, instance);
+ if (err)
+ return err;
+ instance->updated = false;
+ instance->must_write_dropped = false;
+ }
+ }
+ return 0;
+}
+
+static bool is_live_before(struct func_instance *instance, u32 insn_idx, u32 frameno, u32 spi)
+{
+ struct per_frame_masks *masks;
+
+ masks = get_frame_masks(instance, frameno, insn_idx);
+ return masks && (masks->live_before & BIT(spi));
+}
+
+int bpf_live_stack_query_init(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
+{
+ struct live_stack_query *q = &env->liveness->live_stack_query;
+ struct func_instance *instance;
+ u32 frame;
+
+ memset(q, 0, sizeof(*q));
+ for (frame = 0; frame <= st->curframe; frame++) {
+ instance = lookup_instance(env, st, frame);
+ if (IS_ERR(instance))
+ return PTR_ERR(instance);
+ q->instances[frame] = instance;
+ }
+ q->curframe = st->curframe;
+ q->insn_idx = st->insn_idx;
+ return 0;
+}
+
+bool bpf_stack_slot_alive(struct bpf_verifier_env *env, u32 frameno, u32 spi)
+{
+ /*
+ * Slot is alive if it is read before q->st->insn_idx in current func instance,
+ * or if for some outer func instance:
+ * - alive before callsite if callsite calls callback, otherwise
+ * - alive after callsite
+ */
+ struct live_stack_query *q = &env->liveness->live_stack_query;
+ struct func_instance *instance, *curframe_instance;
+ u32 i, callsite;
+ bool alive;
+
+ curframe_instance = q->instances[q->curframe];
+ if (is_live_before(curframe_instance, q->insn_idx, frameno, spi))
+ return true;
+
+ for (i = frameno; i < q->curframe; i++) {
+ callsite = curframe_instance->callchain.callsites[i];
+ instance = q->instances[i];
+ alive = bpf_calls_callback(env, callsite)
+ ? is_live_before(instance, callsite, frameno, spi)
+ : is_live_before(instance, callsite + 1, frameno, spi);
+ if (alive)
+ return true;
+ }
+
+ return false;
+}
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
new file mode 100644
index 000000000000..c93a756e035c
--- /dev/null
+++ b/kernel/bpf/local_storage.c
@@ -0,0 +1,607 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf-cgroup.h>
+#include <linux/bpf.h>
+#include <linux/bpf_local_storage.h>
+#include <linux/btf.h>
+#include <linux/bug.h>
+#include <linux/filter.h>
+#include <linux/mm.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+#include <uapi/linux/btf.h>
+#include <linux/btf_ids.h>
+
+#ifdef CONFIG_CGROUP_BPF
+
+#include "../cgroup/cgroup-internal.h"
+
+#define LOCAL_STORAGE_CREATE_FLAG_MASK \
+ (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK)
+
+struct bpf_cgroup_storage_map {
+ struct bpf_map map;
+
+ spinlock_t lock;
+ struct rb_root root;
+ struct list_head list;
+};
+
+static struct bpf_cgroup_storage_map *map_to_storage(struct bpf_map *map)
+{
+ return container_of(map, struct bpf_cgroup_storage_map, map);
+}
+
+static bool attach_type_isolated(const struct bpf_map *map)
+{
+ return map->key_size == sizeof(struct bpf_cgroup_storage_key);
+}
+
+static int bpf_cgroup_storage_key_cmp(const struct bpf_cgroup_storage_map *map,
+ const void *_key1, const void *_key2)
+{
+ if (attach_type_isolated(&map->map)) {
+ const struct bpf_cgroup_storage_key *key1 = _key1;
+ const struct bpf_cgroup_storage_key *key2 = _key2;
+
+ if (key1->cgroup_inode_id < key2->cgroup_inode_id)
+ return -1;
+ else if (key1->cgroup_inode_id > key2->cgroup_inode_id)
+ return 1;
+ else if (key1->attach_type < key2->attach_type)
+ return -1;
+ else if (key1->attach_type > key2->attach_type)
+ return 1;
+ } else {
+ const __u64 *cgroup_inode_id1 = _key1;
+ const __u64 *cgroup_inode_id2 = _key2;
+
+ if (*cgroup_inode_id1 < *cgroup_inode_id2)
+ return -1;
+ else if (*cgroup_inode_id1 > *cgroup_inode_id2)
+ return 1;
+ }
+ return 0;
+}
+
+struct bpf_cgroup_storage *
+cgroup_storage_lookup(struct bpf_cgroup_storage_map *map,
+ void *key, bool locked)
+{
+ struct rb_root *root = &map->root;
+ struct rb_node *node;
+
+ if (!locked)
+ spin_lock_bh(&map->lock);
+
+ node = root->rb_node;
+ while (node) {
+ struct bpf_cgroup_storage *storage;
+
+ storage = container_of(node, struct bpf_cgroup_storage, node);
+
+ switch (bpf_cgroup_storage_key_cmp(map, key, &storage->key)) {
+ case -1:
+ node = node->rb_left;
+ break;
+ case 1:
+ node = node->rb_right;
+ break;
+ default:
+ if (!locked)
+ spin_unlock_bh(&map->lock);
+ return storage;
+ }
+ }
+
+ if (!locked)
+ spin_unlock_bh(&map->lock);
+
+ return NULL;
+}
+
+static int cgroup_storage_insert(struct bpf_cgroup_storage_map *map,
+ struct bpf_cgroup_storage *storage)
+{
+ struct rb_root *root = &map->root;
+ struct rb_node **new = &(root->rb_node), *parent = NULL;
+
+ while (*new) {
+ struct bpf_cgroup_storage *this;
+
+ this = container_of(*new, struct bpf_cgroup_storage, node);
+
+ parent = *new;
+ switch (bpf_cgroup_storage_key_cmp(map, &storage->key, &this->key)) {
+ case -1:
+ new = &((*new)->rb_left);
+ break;
+ case 1:
+ new = &((*new)->rb_right);
+ break;
+ default:
+ return -EEXIST;
+ }
+ }
+
+ rb_link_node(&storage->node, parent, new);
+ rb_insert_color(&storage->node, root);
+
+ return 0;
+}
+
+static void *cgroup_storage_lookup_elem(struct bpf_map *_map, void *key)
+{
+ struct bpf_cgroup_storage_map *map = map_to_storage(_map);
+ struct bpf_cgroup_storage *storage;
+
+ storage = cgroup_storage_lookup(map, key, false);
+ if (!storage)
+ return NULL;
+
+ return &READ_ONCE(storage->buf)->data[0];
+}
+
+static long cgroup_storage_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
+{
+ struct bpf_cgroup_storage *storage;
+ struct bpf_storage_buffer *new;
+
+ if (unlikely(flags & ~(BPF_F_LOCK | BPF_EXIST)))
+ return -EINVAL;
+
+ if (unlikely((flags & BPF_F_LOCK) &&
+ !btf_record_has_field(map->record, BPF_SPIN_LOCK)))
+ return -EINVAL;
+
+ storage = cgroup_storage_lookup((struct bpf_cgroup_storage_map *)map,
+ key, false);
+ if (!storage)
+ return -ENOENT;
+
+ if (flags & BPF_F_LOCK) {
+ copy_map_value_locked(map, storage->buf->data, value, false);
+ return 0;
+ }
+
+ new = bpf_map_kmalloc_node(map, struct_size(new, data, map->value_size),
+ __GFP_ZERO | GFP_NOWAIT,
+ map->numa_node);
+ if (!new)
+ return -ENOMEM;
+
+ memcpy(&new->data[0], value, map->value_size);
+ check_and_init_map_value(map, new->data);
+
+ new = xchg(&storage->buf, new);
+ kfree_rcu(new, rcu);
+
+ return 0;
+}
+
+int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *key,
+ void *value)
+{
+ struct bpf_cgroup_storage_map *map = map_to_storage(_map);
+ struct bpf_cgroup_storage *storage;
+ int cpu, off = 0;
+ u32 size;
+
+ rcu_read_lock();
+ storage = cgroup_storage_lookup(map, key, false);
+ if (!storage) {
+ rcu_read_unlock();
+ return -ENOENT;
+ }
+
+ /* per_cpu areas are zero-filled and bpf programs can only
+ * access 'value_size' of them, so copying rounded areas
+ * will not leak any kernel data
+ */
+ size = round_up(_map->value_size, 8);
+ for_each_possible_cpu(cpu) {
+ bpf_long_memcpy(value + off,
+ per_cpu_ptr(storage->percpu_buf, cpu), size);
+ off += size;
+ }
+ rcu_read_unlock();
+ return 0;
+}
+
+int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *key,
+ void *value, u64 map_flags)
+{
+ struct bpf_cgroup_storage_map *map = map_to_storage(_map);
+ struct bpf_cgroup_storage *storage;
+ int cpu, off = 0;
+ u32 size;
+
+ if (map_flags != BPF_ANY && map_flags != BPF_EXIST)
+ return -EINVAL;
+
+ rcu_read_lock();
+ storage = cgroup_storage_lookup(map, key, false);
+ if (!storage) {
+ rcu_read_unlock();
+ return -ENOENT;
+ }
+
+ /* the user space will provide round_up(value_size, 8) bytes that
+ * will be copied into per-cpu area. bpf programs can only access
+ * value_size of it. During lookup the same extra bytes will be
+ * returned or zeros which were zero-filled by percpu_alloc,
+ * so no kernel data leaks possible
+ */
+ size = round_up(_map->value_size, 8);
+ for_each_possible_cpu(cpu) {
+ bpf_long_memcpy(per_cpu_ptr(storage->percpu_buf, cpu),
+ value + off, size);
+ off += size;
+ }
+ rcu_read_unlock();
+ return 0;
+}
+
+static int cgroup_storage_get_next_key(struct bpf_map *_map, void *key,
+ void *_next_key)
+{
+ struct bpf_cgroup_storage_map *map = map_to_storage(_map);
+ struct bpf_cgroup_storage *storage;
+
+ spin_lock_bh(&map->lock);
+
+ if (list_empty(&map->list))
+ goto enoent;
+
+ if (key) {
+ storage = cgroup_storage_lookup(map, key, true);
+ if (!storage)
+ goto enoent;
+
+ storage = list_next_entry(storage, list_map);
+ if (!storage)
+ goto enoent;
+ } else {
+ storage = list_first_entry(&map->list,
+ struct bpf_cgroup_storage, list_map);
+ }
+
+ spin_unlock_bh(&map->lock);
+
+ if (attach_type_isolated(&map->map)) {
+ struct bpf_cgroup_storage_key *next = _next_key;
+ *next = storage->key;
+ } else {
+ __u64 *next = _next_key;
+ *next = storage->key.cgroup_inode_id;
+ }
+ return 0;
+
+enoent:
+ spin_unlock_bh(&map->lock);
+ return -ENOENT;
+}
+
+static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
+{
+ __u32 max_value_size = BPF_LOCAL_STORAGE_MAX_VALUE_SIZE;
+ int numa_node = bpf_map_attr_numa_node(attr);
+ struct bpf_cgroup_storage_map *map;
+
+ /* percpu is bound by PCPU_MIN_UNIT_SIZE, non-percu
+ * is the same as other local storages.
+ */
+ if (attr->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
+ max_value_size = min_t(__u32, max_value_size,
+ PCPU_MIN_UNIT_SIZE);
+
+ if (attr->key_size != sizeof(struct bpf_cgroup_storage_key) &&
+ attr->key_size != sizeof(__u64))
+ return ERR_PTR(-EINVAL);
+
+ if (attr->value_size == 0)
+ return ERR_PTR(-EINVAL);
+
+ if (attr->value_size > max_value_size)
+ return ERR_PTR(-E2BIG);
+
+ if (attr->map_flags & ~LOCAL_STORAGE_CREATE_FLAG_MASK ||
+ !bpf_map_flags_access_ok(attr->map_flags))
+ return ERR_PTR(-EINVAL);
+
+ if (attr->max_entries)
+ /* max_entries is not used and enforced to be 0 */
+ return ERR_PTR(-EINVAL);
+
+ map = bpf_map_area_alloc(sizeof(struct bpf_cgroup_storage_map), numa_node);
+ if (!map)
+ return ERR_PTR(-ENOMEM);
+
+ /* copy mandatory map attributes */
+ bpf_map_init_from_attr(&map->map, attr);
+
+ spin_lock_init(&map->lock);
+ map->root = RB_ROOT;
+ INIT_LIST_HEAD(&map->list);
+
+ return &map->map;
+}
+
+static void cgroup_storage_map_free(struct bpf_map *_map)
+{
+ struct bpf_cgroup_storage_map *map = map_to_storage(_map);
+ struct list_head *storages = &map->list;
+ struct bpf_cgroup_storage *storage, *stmp;
+
+ cgroup_lock();
+
+ list_for_each_entry_safe(storage, stmp, storages, list_map) {
+ bpf_cgroup_storage_unlink(storage);
+ bpf_cgroup_storage_free(storage);
+ }
+
+ cgroup_unlock();
+
+ WARN_ON(!RB_EMPTY_ROOT(&map->root));
+ WARN_ON(!list_empty(&map->list));
+
+ bpf_map_area_free(map);
+}
+
+static long cgroup_storage_delete_elem(struct bpf_map *map, void *key)
+{
+ return -EINVAL;
+}
+
+static int cgroup_storage_check_btf(const struct bpf_map *map,
+ const struct btf *btf,
+ const struct btf_type *key_type,
+ const struct btf_type *value_type)
+{
+ if (attach_type_isolated(map)) {
+ struct btf_member *m;
+ u32 offset, size;
+
+ /* Key is expected to be of struct bpf_cgroup_storage_key type,
+ * which is:
+ * struct bpf_cgroup_storage_key {
+ * __u64 cgroup_inode_id;
+ * __u32 attach_type;
+ * };
+ */
+
+ /*
+ * Key_type must be a structure with two fields.
+ */
+ if (BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT ||
+ BTF_INFO_VLEN(key_type->info) != 2)
+ return -EINVAL;
+
+ /*
+ * The first field must be a 64 bit integer at 0 offset.
+ */
+ m = (struct btf_member *)(key_type + 1);
+ size = sizeof_field(struct bpf_cgroup_storage_key, cgroup_inode_id);
+ if (!btf_member_is_reg_int(btf, key_type, m, 0, size))
+ return -EINVAL;
+
+ /*
+ * The second field must be a 32 bit integer at 64 bit offset.
+ */
+ m++;
+ offset = offsetof(struct bpf_cgroup_storage_key, attach_type);
+ size = sizeof_field(struct bpf_cgroup_storage_key, attach_type);
+ if (!btf_member_is_reg_int(btf, key_type, m, offset, size))
+ return -EINVAL;
+ } else {
+ /*
+ * Key is expected to be u64, which stores the cgroup_inode_id
+ */
+ if (!btf_type_is_i64(key_type))
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *key,
+ struct seq_file *m)
+{
+ enum bpf_cgroup_storage_type stype;
+ struct bpf_cgroup_storage *storage;
+ int cpu;
+
+ rcu_read_lock();
+ storage = cgroup_storage_lookup(map_to_storage(map), key, false);
+ if (!storage) {
+ rcu_read_unlock();
+ return;
+ }
+
+ btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
+ stype = cgroup_storage_type(map);
+ if (stype == BPF_CGROUP_STORAGE_SHARED) {
+ seq_puts(m, ": ");
+ btf_type_seq_show(map->btf, map->btf_value_type_id,
+ &READ_ONCE(storage->buf)->data[0], m);
+ seq_putc(m, '\n');
+ } else {
+ seq_puts(m, ": {\n");
+ for_each_possible_cpu(cpu) {
+ seq_printf(m, "\tcpu%d: ", cpu);
+ btf_type_seq_show(map->btf, map->btf_value_type_id,
+ per_cpu_ptr(storage->percpu_buf, cpu),
+ m);
+ seq_putc(m, '\n');
+ }
+ seq_puts(m, "}\n");
+ }
+ rcu_read_unlock();
+}
+
+static u64 cgroup_storage_map_usage(const struct bpf_map *map)
+{
+ /* Currently the dynamically allocated elements are not counted. */
+ return sizeof(struct bpf_cgroup_storage_map);
+}
+
+BTF_ID_LIST_SINGLE(cgroup_storage_map_btf_ids, struct,
+ bpf_cgroup_storage_map)
+const struct bpf_map_ops cgroup_storage_map_ops = {
+ .map_alloc = cgroup_storage_map_alloc,
+ .map_free = cgroup_storage_map_free,
+ .map_get_next_key = cgroup_storage_get_next_key,
+ .map_lookup_elem = cgroup_storage_lookup_elem,
+ .map_update_elem = cgroup_storage_update_elem,
+ .map_delete_elem = cgroup_storage_delete_elem,
+ .map_check_btf = cgroup_storage_check_btf,
+ .map_seq_show_elem = cgroup_storage_seq_show_elem,
+ .map_mem_usage = cgroup_storage_map_usage,
+ .map_btf_id = &cgroup_storage_map_btf_ids[0],
+};
+
+int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *_map)
+{
+ enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map);
+
+ if (aux->cgroup_storage[stype] &&
+ aux->cgroup_storage[stype] != _map)
+ return -EBUSY;
+
+ aux->cgroup_storage[stype] = _map;
+ return 0;
+}
+
+static size_t bpf_cgroup_storage_calculate_size(struct bpf_map *map, u32 *pages)
+{
+ size_t size;
+
+ if (cgroup_storage_type(map) == BPF_CGROUP_STORAGE_SHARED) {
+ size = sizeof(struct bpf_storage_buffer) + map->value_size;
+ *pages = round_up(sizeof(struct bpf_cgroup_storage) + size,
+ PAGE_SIZE) >> PAGE_SHIFT;
+ } else {
+ size = map->value_size;
+ *pages = round_up(round_up(size, 8) * num_possible_cpus(),
+ PAGE_SIZE) >> PAGE_SHIFT;
+ }
+
+ return size;
+}
+
+struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog,
+ enum bpf_cgroup_storage_type stype)
+{
+ const gfp_t gfp = __GFP_ZERO | GFP_USER;
+ struct bpf_cgroup_storage *storage;
+ struct bpf_map *map;
+ size_t size;
+ u32 pages;
+
+ map = prog->aux->cgroup_storage[stype];
+ if (!map)
+ return NULL;
+
+ size = bpf_cgroup_storage_calculate_size(map, &pages);
+
+ storage = bpf_map_kmalloc_node(map, sizeof(struct bpf_cgroup_storage),
+ gfp, map->numa_node);
+ if (!storage)
+ goto enomem;
+
+ if (stype == BPF_CGROUP_STORAGE_SHARED) {
+ storage->buf = bpf_map_kmalloc_node(map, size, gfp,
+ map->numa_node);
+ if (!storage->buf)
+ goto enomem;
+ check_and_init_map_value(map, storage->buf->data);
+ } else {
+ storage->percpu_buf = bpf_map_alloc_percpu(map, size, 8, gfp);
+ if (!storage->percpu_buf)
+ goto enomem;
+ }
+
+ storage->map = (struct bpf_cgroup_storage_map *)map;
+
+ return storage;
+
+enomem:
+ kfree(storage);
+ return ERR_PTR(-ENOMEM);
+}
+
+static void free_shared_cgroup_storage_rcu(struct rcu_head *rcu)
+{
+ struct bpf_cgroup_storage *storage =
+ container_of(rcu, struct bpf_cgroup_storage, rcu);
+
+ kfree(storage->buf);
+ kfree(storage);
+}
+
+static void free_percpu_cgroup_storage_rcu(struct rcu_head *rcu)
+{
+ struct bpf_cgroup_storage *storage =
+ container_of(rcu, struct bpf_cgroup_storage, rcu);
+
+ free_percpu(storage->percpu_buf);
+ kfree(storage);
+}
+
+void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage)
+{
+ enum bpf_cgroup_storage_type stype;
+ struct bpf_map *map;
+
+ if (!storage)
+ return;
+
+ map = &storage->map->map;
+ stype = cgroup_storage_type(map);
+ if (stype == BPF_CGROUP_STORAGE_SHARED)
+ call_rcu(&storage->rcu, free_shared_cgroup_storage_rcu);
+ else
+ call_rcu(&storage->rcu, free_percpu_cgroup_storage_rcu);
+}
+
+void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage,
+ struct cgroup *cgroup,
+ enum bpf_attach_type type)
+{
+ struct bpf_cgroup_storage_map *map;
+
+ if (!storage)
+ return;
+
+ storage->key.attach_type = type;
+ storage->key.cgroup_inode_id = cgroup_id(cgroup);
+
+ map = storage->map;
+
+ spin_lock_bh(&map->lock);
+ WARN_ON(cgroup_storage_insert(map, storage));
+ list_add(&storage->list_map, &map->list);
+ list_add(&storage->list_cg, &cgroup->bpf.storages);
+ spin_unlock_bh(&map->lock);
+}
+
+void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage)
+{
+ struct bpf_cgroup_storage_map *map;
+ struct rb_root *root;
+
+ if (!storage)
+ return;
+
+ map = storage->map;
+
+ spin_lock_bh(&map->lock);
+ root = &map->root;
+ rb_erase(&storage->node, root);
+
+ list_del(&storage->list_map);
+ list_del(&storage->list_cg);
+ spin_unlock_bh(&map->lock);
+}
+
+#endif
diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
new file mode 100644
index 000000000000..a0c3b35de2ce
--- /dev/null
+++ b/kernel/bpf/log.c
@@ -0,0 +1,865 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2016 Facebook
+ * Copyright (c) 2018 Covalent IO, Inc. http://covalent.io
+ */
+#include <uapi/linux/btf.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/bpf.h>
+#include <linux/bpf_verifier.h>
+#include <linux/math64.h>
+#include <linux/string.h>
+
+#define verbose(env, fmt, args...) bpf_verifier_log_write(env, fmt, ##args)
+
+static bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log)
+{
+ /* ubuf and len_total should both be specified (or not) together */
+ if (!!log->ubuf != !!log->len_total)
+ return false;
+ /* log buf without log_level is meaningless */
+ if (log->ubuf && log->level == 0)
+ return false;
+ if (log->level & ~BPF_LOG_MASK)
+ return false;
+ if (log->len_total > UINT_MAX >> 2)
+ return false;
+ return true;
+}
+
+int bpf_vlog_init(struct bpf_verifier_log *log, u32 log_level,
+ char __user *log_buf, u32 log_size)
+{
+ log->level = log_level;
+ log->ubuf = log_buf;
+ log->len_total = log_size;
+
+ /* log attributes have to be sane */
+ if (!bpf_verifier_log_attr_valid(log))
+ return -EINVAL;
+
+ return 0;
+}
+
+static void bpf_vlog_update_len_max(struct bpf_verifier_log *log, u32 add_len)
+{
+ /* add_len includes terminal \0, so no need for +1. */
+ u64 len = log->end_pos + add_len;
+
+ /* log->len_max could be larger than our current len due to
+ * bpf_vlog_reset() calls, so we maintain the max of any length at any
+ * previous point
+ */
+ if (len > UINT_MAX)
+ log->len_max = UINT_MAX;
+ else if (len > log->len_max)
+ log->len_max = len;
+}
+
+void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
+ va_list args)
+{
+ u64 cur_pos;
+ u32 new_n, n;
+
+ n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args);
+
+ if (log->level == BPF_LOG_KERNEL) {
+ bool newline = n > 0 && log->kbuf[n - 1] == '\n';
+
+ pr_err("BPF: %s%s", log->kbuf, newline ? "" : "\n");
+ return;
+ }
+
+ n += 1; /* include terminating zero */
+ bpf_vlog_update_len_max(log, n);
+
+ if (log->level & BPF_LOG_FIXED) {
+ /* check if we have at least something to put into user buf */
+ new_n = 0;
+ if (log->end_pos < log->len_total) {
+ new_n = min_t(u32, log->len_total - log->end_pos, n);
+ log->kbuf[new_n - 1] = '\0';
+ }
+
+ cur_pos = log->end_pos;
+ log->end_pos += n - 1; /* don't count terminating '\0' */
+
+ if (log->ubuf && new_n &&
+ copy_to_user(log->ubuf + cur_pos, log->kbuf, new_n))
+ goto fail;
+ } else {
+ u64 new_end, new_start;
+ u32 buf_start, buf_end;
+
+ new_end = log->end_pos + n;
+ if (new_end - log->start_pos >= log->len_total)
+ new_start = new_end - log->len_total;
+ else
+ new_start = log->start_pos;
+
+ log->start_pos = new_start;
+ log->end_pos = new_end - 1; /* don't count terminating '\0' */
+
+ if (!log->ubuf)
+ return;
+
+ new_n = min(n, log->len_total);
+ cur_pos = new_end - new_n;
+ div_u64_rem(cur_pos, log->len_total, &buf_start);
+ div_u64_rem(new_end, log->len_total, &buf_end);
+ /* new_end and buf_end are exclusive indices, so if buf_end is
+ * exactly zero, then it actually points right to the end of
+ * ubuf and there is no wrap around
+ */
+ if (buf_end == 0)
+ buf_end = log->len_total;
+
+ /* if buf_start > buf_end, we wrapped around;
+ * if buf_start == buf_end, then we fill ubuf completely; we
+ * can't have buf_start == buf_end to mean that there is
+ * nothing to write, because we always write at least
+ * something, even if terminal '\0'
+ */
+ if (buf_start < buf_end) {
+ /* message fits within contiguous chunk of ubuf */
+ if (copy_to_user(log->ubuf + buf_start,
+ log->kbuf + n - new_n,
+ buf_end - buf_start))
+ goto fail;
+ } else {
+ /* message wraps around the end of ubuf, copy in two chunks */
+ if (copy_to_user(log->ubuf + buf_start,
+ log->kbuf + n - new_n,
+ log->len_total - buf_start))
+ goto fail;
+ if (copy_to_user(log->ubuf,
+ log->kbuf + n - buf_end,
+ buf_end))
+ goto fail;
+ }
+ }
+
+ return;
+fail:
+ log->ubuf = NULL;
+}
+
+void bpf_vlog_reset(struct bpf_verifier_log *log, u64 new_pos)
+{
+ char zero = 0;
+ u32 pos;
+
+ if (WARN_ON_ONCE(new_pos > log->end_pos))
+ return;
+
+ if (!bpf_verifier_log_needed(log) || log->level == BPF_LOG_KERNEL)
+ return;
+
+ /* if position to which we reset is beyond current log window,
+ * then we didn't preserve any useful content and should adjust
+ * start_pos to end up with an empty log (start_pos == end_pos)
+ */
+ log->end_pos = new_pos;
+ if (log->end_pos < log->start_pos)
+ log->start_pos = log->end_pos;
+
+ if (!log->ubuf)
+ return;
+
+ if (log->level & BPF_LOG_FIXED)
+ pos = log->end_pos + 1;
+ else
+ div_u64_rem(new_pos, log->len_total, &pos);
+
+ if (pos < log->len_total && put_user(zero, log->ubuf + pos))
+ log->ubuf = NULL;
+}
+
+static void bpf_vlog_reverse_kbuf(char *buf, int len)
+{
+ int i, j;
+
+ for (i = 0, j = len - 1; i < j; i++, j--)
+ swap(buf[i], buf[j]);
+}
+
+static int bpf_vlog_reverse_ubuf(struct bpf_verifier_log *log, int start, int end)
+{
+ /* we split log->kbuf into two equal parts for both ends of array */
+ int n = sizeof(log->kbuf) / 2, nn;
+ char *lbuf = log->kbuf, *rbuf = log->kbuf + n;
+
+ /* Read ubuf's section [start, end) two chunks at a time, from left
+ * and right side; within each chunk, swap all the bytes; after that
+ * reverse the order of lbuf and rbuf and write result back to ubuf.
+ * This way we'll end up with swapped contents of specified
+ * [start, end) ubuf segment.
+ */
+ while (end - start > 1) {
+ nn = min(n, (end - start ) / 2);
+
+ if (copy_from_user(lbuf, log->ubuf + start, nn))
+ return -EFAULT;
+ if (copy_from_user(rbuf, log->ubuf + end - nn, nn))
+ return -EFAULT;
+
+ bpf_vlog_reverse_kbuf(lbuf, nn);
+ bpf_vlog_reverse_kbuf(rbuf, nn);
+
+ /* we write lbuf to the right end of ubuf, while rbuf to the
+ * left one to end up with properly reversed overall ubuf
+ */
+ if (copy_to_user(log->ubuf + start, rbuf, nn))
+ return -EFAULT;
+ if (copy_to_user(log->ubuf + end - nn, lbuf, nn))
+ return -EFAULT;
+
+ start += nn;
+ end -= nn;
+ }
+
+ return 0;
+}
+
+int bpf_vlog_finalize(struct bpf_verifier_log *log, u32 *log_size_actual)
+{
+ u32 sublen;
+ int err;
+
+ *log_size_actual = 0;
+ if (!log || log->level == 0 || log->level == BPF_LOG_KERNEL)
+ return 0;
+
+ if (!log->ubuf)
+ goto skip_log_rotate;
+ /* If we never truncated log, there is nothing to move around. */
+ if (log->start_pos == 0)
+ goto skip_log_rotate;
+
+ /* Otherwise we need to rotate log contents to make it start from the
+ * buffer beginning and be a continuous zero-terminated string. Note
+ * that if log->start_pos != 0 then we definitely filled up entire log
+ * buffer with no gaps, and we just need to shift buffer contents to
+ * the left by (log->start_pos % log->len_total) bytes.
+ *
+ * Unfortunately, user buffer could be huge and we don't want to
+ * allocate temporary kernel memory of the same size just to shift
+ * contents in a straightforward fashion. Instead, we'll be clever and
+ * do in-place array rotation. This is a leetcode-style problem, which
+ * could be solved by three rotations.
+ *
+ * Let's say we have log buffer that has to be shifted left by 7 bytes
+ * (spaces and vertical bar is just for demonstrative purposes):
+ * E F G H I J K | A B C D
+ *
+ * First, we reverse entire array:
+ * D C B A | K J I H G F E
+ *
+ * Then we rotate first 4 bytes (DCBA) and separately last 7 bytes
+ * (KJIHGFE), resulting in a properly rotated array:
+ * A B C D | E F G H I J K
+ *
+ * We'll utilize log->kbuf to read user memory chunk by chunk, swap
+ * bytes, and write them back. Doing it byte-by-byte would be
+ * unnecessarily inefficient. Altogether we are going to read and
+ * write each byte twice, for total 4 memory copies between kernel and
+ * user space.
+ */
+
+ /* length of the chopped off part that will be the beginning;
+ * len(ABCD) in the example above
+ */
+ div_u64_rem(log->start_pos, log->len_total, &sublen);
+ sublen = log->len_total - sublen;
+
+ err = bpf_vlog_reverse_ubuf(log, 0, log->len_total);
+ err = err ?: bpf_vlog_reverse_ubuf(log, 0, sublen);
+ err = err ?: bpf_vlog_reverse_ubuf(log, sublen, log->len_total);
+ if (err)
+ log->ubuf = NULL;
+
+skip_log_rotate:
+ *log_size_actual = log->len_max;
+
+ /* properly initialized log has either both ubuf!=NULL and len_total>0
+ * or ubuf==NULL and len_total==0, so if this condition doesn't hold,
+ * we got a fault somewhere along the way, so report it back
+ */
+ if (!!log->ubuf != !!log->len_total)
+ return -EFAULT;
+
+ /* did truncation actually happen? */
+ if (log->ubuf && log->len_max > log->len_total)
+ return -ENOSPC;
+
+ return 0;
+}
+
+/* log_level controls verbosity level of eBPF verifier.
+ * bpf_verifier_log_write() is used to dump the verification trace to the log,
+ * so the user can figure out what's wrong with the program
+ */
+__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env,
+ const char *fmt, ...)
+{
+ va_list args;
+
+ if (!bpf_verifier_log_needed(&env->log))
+ return;
+
+ va_start(args, fmt);
+ bpf_verifier_vlog(&env->log, fmt, args);
+ va_end(args);
+}
+EXPORT_SYMBOL_GPL(bpf_verifier_log_write);
+
+__printf(2, 3) void bpf_log(struct bpf_verifier_log *log,
+ const char *fmt, ...)
+{
+ va_list args;
+
+ if (!bpf_verifier_log_needed(log))
+ return;
+
+ va_start(args, fmt);
+ bpf_verifier_vlog(log, fmt, args);
+ va_end(args);
+}
+EXPORT_SYMBOL_GPL(bpf_log);
+
+static const struct bpf_line_info *
+find_linfo(const struct bpf_verifier_env *env, u32 insn_off)
+{
+ const struct bpf_line_info *linfo;
+ const struct bpf_prog *prog;
+ u32 nr_linfo;
+ int l, r, m;
+
+ prog = env->prog;
+ nr_linfo = prog->aux->nr_linfo;
+
+ if (!nr_linfo || insn_off >= prog->len)
+ return NULL;
+
+ linfo = prog->aux->linfo;
+ /* Loop invariant: linfo[l].insn_off <= insns_off.
+ * linfo[0].insn_off == 0 which always satisfies above condition.
+ * Binary search is searching for rightmost linfo entry that satisfies
+ * the above invariant, giving us the desired record that covers given
+ * instruction offset.
+ */
+ l = 0;
+ r = nr_linfo - 1;
+ while (l < r) {
+ /* (r - l + 1) / 2 means we break a tie to the right, so if:
+ * l=1, r=2, linfo[l].insn_off <= insn_off, linfo[r].insn_off > insn_off,
+ * then m=2, we see that linfo[m].insn_off > insn_off, and so
+ * r becomes 1 and we exit the loop with correct l==1.
+ * If the tie was broken to the left, m=1 would end us up in
+ * an endless loop where l and m stay at 1 and r stays at 2.
+ */
+ m = l + (r - l + 1) / 2;
+ if (linfo[m].insn_off <= insn_off)
+ l = m;
+ else
+ r = m - 1;
+ }
+
+ return &linfo[l];
+}
+
+static const char *ltrim(const char *s)
+{
+ while (isspace(*s))
+ s++;
+
+ return s;
+}
+
+__printf(3, 4) void verbose_linfo(struct bpf_verifier_env *env,
+ u32 insn_off,
+ const char *prefix_fmt, ...)
+{
+ const struct bpf_line_info *linfo, *prev_linfo;
+ const struct btf *btf;
+ const char *s, *fname;
+
+ if (!bpf_verifier_log_needed(&env->log))
+ return;
+
+ prev_linfo = env->prev_linfo;
+ linfo = find_linfo(env, insn_off);
+ if (!linfo || linfo == prev_linfo)
+ return;
+
+ /* It often happens that two separate linfo records point to the same
+ * source code line, but have differing column numbers. Given verifier
+ * log doesn't emit column information, from user perspective we just
+ * end up emitting the same source code line twice unnecessarily.
+ * So instead check that previous and current linfo record point to
+ * the same file (file_name_offs match) and the same line number, and
+ * avoid emitting duplicated source code line in such case.
+ */
+ if (prev_linfo && linfo->file_name_off == prev_linfo->file_name_off &&
+ BPF_LINE_INFO_LINE_NUM(linfo->line_col) == BPF_LINE_INFO_LINE_NUM(prev_linfo->line_col))
+ return;
+
+ if (prefix_fmt) {
+ va_list args;
+
+ va_start(args, prefix_fmt);
+ bpf_verifier_vlog(&env->log, prefix_fmt, args);
+ va_end(args);
+ }
+
+ btf = env->prog->aux->btf;
+ s = ltrim(btf_name_by_offset(btf, linfo->line_off));
+ verbose(env, "%s", s); /* source code line */
+
+ s = btf_name_by_offset(btf, linfo->file_name_off);
+ /* leave only file name */
+ fname = strrchr(s, '/');
+ fname = fname ? fname + 1 : s;
+ verbose(env, " @ %s:%u\n", fname, BPF_LINE_INFO_LINE_NUM(linfo->line_col));
+
+ env->prev_linfo = linfo;
+}
+
+static const char *btf_type_name(const struct btf *btf, u32 id)
+{
+ return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off);
+}
+
+/* string representation of 'enum bpf_reg_type'
+ *
+ * Note that reg_type_str() can not appear more than once in a single verbose()
+ * statement.
+ */
+const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type)
+{
+ char postfix[16] = {0}, prefix[64] = {0};
+ static const char * const str[] = {
+ [NOT_INIT] = "?",
+ [SCALAR_VALUE] = "scalar",
+ [PTR_TO_CTX] = "ctx",
+ [CONST_PTR_TO_MAP] = "map_ptr",
+ [PTR_TO_MAP_VALUE] = "map_value",
+ [PTR_TO_STACK] = "fp",
+ [PTR_TO_PACKET] = "pkt",
+ [PTR_TO_PACKET_META] = "pkt_meta",
+ [PTR_TO_PACKET_END] = "pkt_end",
+ [PTR_TO_FLOW_KEYS] = "flow_keys",
+ [PTR_TO_SOCKET] = "sock",
+ [PTR_TO_SOCK_COMMON] = "sock_common",
+ [PTR_TO_TCP_SOCK] = "tcp_sock",
+ [PTR_TO_TP_BUFFER] = "tp_buffer",
+ [PTR_TO_XDP_SOCK] = "xdp_sock",
+ [PTR_TO_BTF_ID] = "ptr_",
+ [PTR_TO_MEM] = "mem",
+ [PTR_TO_ARENA] = "arena",
+ [PTR_TO_BUF] = "buf",
+ [PTR_TO_FUNC] = "func",
+ [PTR_TO_INSN] = "insn",
+ [PTR_TO_MAP_KEY] = "map_key",
+ [CONST_PTR_TO_DYNPTR] = "dynptr_ptr",
+ };
+
+ if (type & PTR_MAYBE_NULL) {
+ if (base_type(type) == PTR_TO_BTF_ID)
+ strscpy(postfix, "or_null_");
+ else
+ strscpy(postfix, "_or_null");
+ }
+
+ snprintf(prefix, sizeof(prefix), "%s%s%s%s%s%s%s",
+ type & MEM_RDONLY ? "rdonly_" : "",
+ type & MEM_RINGBUF ? "ringbuf_" : "",
+ type & MEM_USER ? "user_" : "",
+ type & MEM_PERCPU ? "percpu_" : "",
+ type & MEM_RCU ? "rcu_" : "",
+ type & PTR_UNTRUSTED ? "untrusted_" : "",
+ type & PTR_TRUSTED ? "trusted_" : ""
+ );
+
+ snprintf(env->tmp_str_buf, TMP_STR_BUF_LEN, "%s%s%s",
+ prefix, str[base_type(type)], postfix);
+ return env->tmp_str_buf;
+}
+
+const char *dynptr_type_str(enum bpf_dynptr_type type)
+{
+ switch (type) {
+ case BPF_DYNPTR_TYPE_LOCAL:
+ return "local";
+ case BPF_DYNPTR_TYPE_RINGBUF:
+ return "ringbuf";
+ case BPF_DYNPTR_TYPE_SKB:
+ return "skb";
+ case BPF_DYNPTR_TYPE_XDP:
+ return "xdp";
+ case BPF_DYNPTR_TYPE_SKB_META:
+ return "skb_meta";
+ case BPF_DYNPTR_TYPE_FILE:
+ return "file";
+ case BPF_DYNPTR_TYPE_INVALID:
+ return "<invalid>";
+ default:
+ WARN_ONCE(1, "unknown dynptr type %d\n", type);
+ return "<unknown>";
+ }
+}
+
+const char *iter_type_str(const struct btf *btf, u32 btf_id)
+{
+ if (!btf || btf_id == 0)
+ return "<invalid>";
+
+ /* we already validated that type is valid and has conforming name */
+ return btf_type_name(btf, btf_id) + sizeof(ITER_PREFIX) - 1;
+}
+
+const char *iter_state_str(enum bpf_iter_state state)
+{
+ switch (state) {
+ case BPF_ITER_STATE_ACTIVE:
+ return "active";
+ case BPF_ITER_STATE_DRAINED:
+ return "drained";
+ case BPF_ITER_STATE_INVALID:
+ return "<invalid>";
+ default:
+ WARN_ONCE(1, "unknown iter state %d\n", state);
+ return "<unknown>";
+ }
+}
+
+static char slot_type_char[] = {
+ [STACK_INVALID] = '?',
+ [STACK_SPILL] = 'r',
+ [STACK_MISC] = 'm',
+ [STACK_ZERO] = '0',
+ [STACK_DYNPTR] = 'd',
+ [STACK_ITER] = 'i',
+ [STACK_IRQ_FLAG] = 'f'
+};
+
+#define UNUM_MAX_DECIMAL U16_MAX
+#define SNUM_MAX_DECIMAL S16_MAX
+#define SNUM_MIN_DECIMAL S16_MIN
+
+static bool is_unum_decimal(u64 num)
+{
+ return num <= UNUM_MAX_DECIMAL;
+}
+
+static bool is_snum_decimal(s64 num)
+{
+ return num >= SNUM_MIN_DECIMAL && num <= SNUM_MAX_DECIMAL;
+}
+
+static void verbose_unum(struct bpf_verifier_env *env, u64 num)
+{
+ if (is_unum_decimal(num))
+ verbose(env, "%llu", num);
+ else
+ verbose(env, "%#llx", num);
+}
+
+static void verbose_snum(struct bpf_verifier_env *env, s64 num)
+{
+ if (is_snum_decimal(num))
+ verbose(env, "%lld", num);
+ else
+ verbose(env, "%#llx", num);
+}
+
+int tnum_strn(char *str, size_t size, struct tnum a)
+{
+ /* print as a constant, if tnum is fully known */
+ if (a.mask == 0) {
+ if (is_unum_decimal(a.value))
+ return snprintf(str, size, "%llu", a.value);
+ else
+ return snprintf(str, size, "%#llx", a.value);
+ }
+ return snprintf(str, size, "(%#llx; %#llx)", a.value, a.mask);
+}
+EXPORT_SYMBOL_GPL(tnum_strn);
+
+static void print_scalar_ranges(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg,
+ const char **sep)
+{
+ /* For signed ranges, we want to unify 64-bit and 32-bit values in the
+ * output as much as possible, but there is a bit of a complication.
+ * If we choose to print values as decimals, this is natural to do,
+ * because negative 64-bit and 32-bit values >= -S32_MIN have the same
+ * representation due to sign extension. But if we choose to print
+ * them in hex format (see is_snum_decimal()), then sign extension is
+ * misleading.
+ * E.g., smin=-2 and smin32=-2 are exactly the same in decimal, but in
+ * hex they will be smin=0xfffffffffffffffe and smin32=0xfffffffe, two
+ * very different numbers.
+ * So we avoid sign extension if we choose to print values in hex.
+ */
+ struct {
+ const char *name;
+ u64 val;
+ bool omit;
+ } minmaxs[] = {
+ {"smin", reg->smin_value, reg->smin_value == S64_MIN},
+ {"smax", reg->smax_value, reg->smax_value == S64_MAX},
+ {"umin", reg->umin_value, reg->umin_value == 0},
+ {"umax", reg->umax_value, reg->umax_value == U64_MAX},
+ {"smin32",
+ is_snum_decimal((s64)reg->s32_min_value)
+ ? (s64)reg->s32_min_value
+ : (u32)reg->s32_min_value, reg->s32_min_value == S32_MIN},
+ {"smax32",
+ is_snum_decimal((s64)reg->s32_max_value)
+ ? (s64)reg->s32_max_value
+ : (u32)reg->s32_max_value, reg->s32_max_value == S32_MAX},
+ {"umin32", reg->u32_min_value, reg->u32_min_value == 0},
+ {"umax32", reg->u32_max_value, reg->u32_max_value == U32_MAX},
+ }, *m1, *m2, *mend = &minmaxs[ARRAY_SIZE(minmaxs)];
+ bool neg1, neg2;
+
+ for (m1 = &minmaxs[0]; m1 < mend; m1++) {
+ if (m1->omit)
+ continue;
+
+ neg1 = m1->name[0] == 's' && (s64)m1->val < 0;
+
+ verbose(env, "%s%s=", *sep, m1->name);
+ *sep = ",";
+
+ for (m2 = m1 + 2; m2 < mend; m2 += 2) {
+ if (m2->omit || m2->val != m1->val)
+ continue;
+ /* don't mix negatives with positives */
+ neg2 = m2->name[0] == 's' && (s64)m2->val < 0;
+ if (neg2 != neg1)
+ continue;
+ m2->omit = true;
+ verbose(env, "%s=", m2->name);
+ }
+
+ if (m1->name[0] == 's')
+ verbose_snum(env, m1->val);
+ else
+ verbose_unum(env, m1->val);
+ }
+}
+
+static bool type_is_map_ptr(enum bpf_reg_type t) {
+ switch (base_type(t)) {
+ case CONST_PTR_TO_MAP:
+ case PTR_TO_MAP_KEY:
+ case PTR_TO_MAP_VALUE:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/*
+ * _a stands for append, was shortened to avoid multiline statements below.
+ * This macro is used to output a comma separated list of attributes.
+ */
+#define verbose_a(fmt, ...) ({ verbose(env, "%s" fmt, sep, ##__VA_ARGS__); sep = ","; })
+
+static void print_reg_state(struct bpf_verifier_env *env,
+ const struct bpf_func_state *state,
+ const struct bpf_reg_state *reg)
+{
+ enum bpf_reg_type t;
+ const char *sep = "";
+
+ t = reg->type;
+ if (t == SCALAR_VALUE && reg->precise)
+ verbose(env, "P");
+ if (t == SCALAR_VALUE && tnum_is_const(reg->var_off)) {
+ verbose_snum(env, reg->var_off.value);
+ return;
+ }
+
+ verbose(env, "%s", reg_type_str(env, t));
+ if (t == PTR_TO_ARENA)
+ return;
+ if (t == PTR_TO_STACK) {
+ if (state->frameno != reg->frameno)
+ verbose(env, "[%d]", reg->frameno);
+ if (tnum_is_const(reg->var_off)) {
+ verbose_snum(env, reg->var_off.value + reg->off);
+ return;
+ }
+ }
+ if (base_type(t) == PTR_TO_BTF_ID)
+ verbose(env, "%s", btf_type_name(reg->btf, reg->btf_id));
+ verbose(env, "(");
+ if (reg->id)
+ verbose_a("id=%d", reg->id & ~BPF_ADD_CONST);
+ if (reg->id & BPF_ADD_CONST)
+ verbose(env, "%+d", reg->off);
+ if (reg->ref_obj_id)
+ verbose_a("ref_obj_id=%d", reg->ref_obj_id);
+ if (type_is_non_owning_ref(reg->type))
+ verbose_a("%s", "non_own_ref");
+ if (type_is_map_ptr(t)) {
+ if (reg->map_ptr->name[0])
+ verbose_a("map=%s", reg->map_ptr->name);
+ verbose_a("ks=%d,vs=%d",
+ reg->map_ptr->key_size,
+ reg->map_ptr->value_size);
+ }
+ if (t != SCALAR_VALUE && reg->off) {
+ verbose_a("off=");
+ verbose_snum(env, reg->off);
+ }
+ if (type_is_pkt_pointer(t)) {
+ verbose_a("r=");
+ verbose_unum(env, reg->range);
+ }
+ if (base_type(t) == PTR_TO_MEM) {
+ verbose_a("sz=");
+ verbose_unum(env, reg->mem_size);
+ }
+ if (t == CONST_PTR_TO_DYNPTR)
+ verbose_a("type=%s", dynptr_type_str(reg->dynptr.type));
+ if (tnum_is_const(reg->var_off)) {
+ /* a pointer register with fixed offset */
+ if (reg->var_off.value) {
+ verbose_a("imm=");
+ verbose_snum(env, reg->var_off.value);
+ }
+ } else {
+ print_scalar_ranges(env, reg, &sep);
+ if (!tnum_is_unknown(reg->var_off)) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose_a("var_off=%s", tn_buf);
+ }
+ }
+ verbose(env, ")");
+}
+
+void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifier_state *vstate,
+ u32 frameno, bool print_all)
+{
+ const struct bpf_func_state *state = vstate->frame[frameno];
+ const struct bpf_reg_state *reg;
+ int i;
+
+ if (state->frameno)
+ verbose(env, " frame%d:", state->frameno);
+ for (i = 0; i < MAX_BPF_REG; i++) {
+ reg = &state->regs[i];
+ if (reg->type == NOT_INIT)
+ continue;
+ if (!print_all && !reg_scratched(env, i))
+ continue;
+ verbose(env, " R%d", i);
+ verbose(env, "=");
+ print_reg_state(env, state, reg);
+ }
+ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+ char types_buf[BPF_REG_SIZE + 1];
+ const char *sep = "";
+ bool valid = false;
+ u8 slot_type;
+ int j;
+
+ if (!print_all && !stack_slot_scratched(env, i))
+ continue;
+
+ for (j = 0; j < BPF_REG_SIZE; j++) {
+ slot_type = state->stack[i].slot_type[j];
+ if (slot_type != STACK_INVALID)
+ valid = true;
+ types_buf[j] = slot_type_char[slot_type];
+ }
+ types_buf[BPF_REG_SIZE] = 0;
+ if (!valid)
+ continue;
+
+ reg = &state->stack[i].spilled_ptr;
+ switch (state->stack[i].slot_type[BPF_REG_SIZE - 1]) {
+ case STACK_SPILL:
+ /* print MISC/ZERO/INVALID slots above subreg spill */
+ for (j = 0; j < BPF_REG_SIZE; j++)
+ if (state->stack[i].slot_type[j] == STACK_SPILL)
+ break;
+ types_buf[j] = '\0';
+
+ verbose(env, " fp%d=%s", (-i - 1) * BPF_REG_SIZE, types_buf);
+ print_reg_state(env, state, reg);
+ break;
+ case STACK_DYNPTR:
+ /* skip to main dynptr slot */
+ i += BPF_DYNPTR_NR_SLOTS - 1;
+ reg = &state->stack[i].spilled_ptr;
+
+ verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
+ verbose(env, "=dynptr_%s(", dynptr_type_str(reg->dynptr.type));
+ if (reg->id)
+ verbose_a("id=%d", reg->id);
+ if (reg->ref_obj_id)
+ verbose_a("ref_id=%d", reg->ref_obj_id);
+ if (reg->dynptr_id)
+ verbose_a("dynptr_id=%d", reg->dynptr_id);
+ verbose(env, ")");
+ break;
+ case STACK_ITER:
+ /* only main slot has ref_obj_id set; skip others */
+ if (!reg->ref_obj_id)
+ continue;
+
+ verbose(env, " fp%d=iter_%s(ref_id=%d,state=%s,depth=%u)",
+ (-i - 1) * BPF_REG_SIZE,
+ iter_type_str(reg->iter.btf, reg->iter.btf_id),
+ reg->ref_obj_id, iter_state_str(reg->iter.state),
+ reg->iter.depth);
+ break;
+ case STACK_MISC:
+ case STACK_ZERO:
+ default:
+ verbose(env, " fp%d=%s", (-i - 1) * BPF_REG_SIZE, types_buf);
+ break;
+ }
+ }
+ if (vstate->acquired_refs && vstate->refs[0].id) {
+ verbose(env, " refs=%d", vstate->refs[0].id);
+ for (i = 1; i < vstate->acquired_refs; i++)
+ if (vstate->refs[i].id)
+ verbose(env, ",%d", vstate->refs[i].id);
+ }
+ if (state->in_callback_fn)
+ verbose(env, " cb");
+ if (state->in_async_callback_fn)
+ verbose(env, " async_cb");
+ verbose(env, "\n");
+ if (!print_all)
+ mark_verifier_state_clean(env);
+}
+
+static inline u32 vlog_alignment(u32 pos)
+{
+ return round_up(max(pos + BPF_LOG_MIN_ALIGNMENT / 2, BPF_LOG_ALIGNMENT),
+ BPF_LOG_MIN_ALIGNMENT) - pos - 1;
+}
+
+void print_insn_state(struct bpf_verifier_env *env, const struct bpf_verifier_state *vstate,
+ u32 frameno)
+{
+ if (env->prev_log_pos && env->prev_log_pos == env->log.end_pos) {
+ /* remove new line character */
+ bpf_vlog_reset(&env->log, env->prev_log_pos - 1);
+ verbose(env, "%*c;", vlog_alignment(env->prev_insn_print_pos), ' ');
+ } else {
+ verbose(env, "%d:", env->insn_idx);
+ }
+ print_verifier_state(env, vstate, frameno, false);
+}
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
new file mode 100644
index 000000000000..be66d7e520e0
--- /dev/null
+++ b/kernel/bpf/lpm_trie.c
@@ -0,0 +1,789 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Longest prefix match list implementation
+ *
+ * Copyright (c) 2016,2017 Daniel Mack
+ * Copyright (c) 2016 David Herrmann
+ */
+
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/vmalloc.h>
+#include <net/ipv6.h>
+#include <uapi/linux/btf.h>
+#include <linux/btf_ids.h>
+#include <asm/rqspinlock.h>
+#include <linux/bpf_mem_alloc.h>
+
+/* Intermediate node */
+#define LPM_TREE_NODE_FLAG_IM BIT(0)
+
+struct lpm_trie_node;
+
+struct lpm_trie_node {
+ struct lpm_trie_node __rcu *child[2];
+ u32 prefixlen;
+ u32 flags;
+ u8 data[];
+};
+
+struct lpm_trie {
+ struct bpf_map map;
+ struct lpm_trie_node __rcu *root;
+ struct bpf_mem_alloc ma;
+ size_t n_entries;
+ size_t max_prefixlen;
+ size_t data_size;
+ rqspinlock_t lock;
+};
+
+/* This trie implements a longest prefix match algorithm that can be used to
+ * match IP addresses to a stored set of ranges.
+ *
+ * Data stored in @data of struct bpf_lpm_key and struct lpm_trie_node is
+ * interpreted as big endian, so data[0] stores the most significant byte.
+ *
+ * Match ranges are internally stored in instances of struct lpm_trie_node
+ * which each contain their prefix length as well as two pointers that may
+ * lead to more nodes containing more specific matches. Each node also stores
+ * a value that is defined by and returned to userspace via the update_elem
+ * and lookup functions.
+ *
+ * For instance, let's start with a trie that was created with a prefix length
+ * of 32, so it can be used for IPv4 addresses, and one single element that
+ * matches 192.168.0.0/16. The data array would hence contain
+ * [0xc0, 0xa8, 0x00, 0x00] in big-endian notation. This documentation will
+ * stick to IP-address notation for readability though.
+ *
+ * As the trie is empty initially, the new node (1) will be places as root
+ * node, denoted as (R) in the example below. As there are no other node, both
+ * child pointers are %NULL.
+ *
+ * +----------------+
+ * | (1) (R) |
+ * | 192.168.0.0/16 |
+ * | value: 1 |
+ * | [0] [1] |
+ * +----------------+
+ *
+ * Next, let's add a new node (2) matching 192.168.0.0/24. As there is already
+ * a node with the same data and a smaller prefix (ie, a less specific one),
+ * node (2) will become a child of (1). In child index depends on the next bit
+ * that is outside of what (1) matches, and that bit is 0, so (2) will be
+ * child[0] of (1):
+ *
+ * +----------------+
+ * | (1) (R) |
+ * | 192.168.0.0/16 |
+ * | value: 1 |
+ * | [0] [1] |
+ * +----------------+
+ * |
+ * +----------------+
+ * | (2) |
+ * | 192.168.0.0/24 |
+ * | value: 2 |
+ * | [0] [1] |
+ * +----------------+
+ *
+ * The child[1] slot of (1) could be filled with another node which has bit #17
+ * (the next bit after the ones that (1) matches on) set to 1. For instance,
+ * 192.168.128.0/24:
+ *
+ * +----------------+
+ * | (1) (R) |
+ * | 192.168.0.0/16 |
+ * | value: 1 |
+ * | [0] [1] |
+ * +----------------+
+ * | |
+ * +----------------+ +------------------+
+ * | (2) | | (3) |
+ * | 192.168.0.0/24 | | 192.168.128.0/24 |
+ * | value: 2 | | value: 3 |
+ * | [0] [1] | | [0] [1] |
+ * +----------------+ +------------------+
+ *
+ * Let's add another node (4) to the game for 192.168.1.0/24. In order to place
+ * it, node (1) is looked at first, and because (4) of the semantics laid out
+ * above (bit #17 is 0), it would normally be attached to (1) as child[0].
+ * However, that slot is already allocated, so a new node is needed in between.
+ * That node does not have a value attached to it and it will never be
+ * returned to users as result of a lookup. It is only there to differentiate
+ * the traversal further. It will get a prefix as wide as necessary to
+ * distinguish its two children:
+ *
+ * +----------------+
+ * | (1) (R) |
+ * | 192.168.0.0/16 |
+ * | value: 1 |
+ * | [0] [1] |
+ * +----------------+
+ * | |
+ * +----------------+ +------------------+
+ * | (4) (I) | | (3) |
+ * | 192.168.0.0/23 | | 192.168.128.0/24 |
+ * | value: --- | | value: 3 |
+ * | [0] [1] | | [0] [1] |
+ * +----------------+ +------------------+
+ * | |
+ * +----------------+ +----------------+
+ * | (2) | | (5) |
+ * | 192.168.0.0/24 | | 192.168.1.0/24 |
+ * | value: 2 | | value: 5 |
+ * | [0] [1] | | [0] [1] |
+ * +----------------+ +----------------+
+ *
+ * 192.168.1.1/32 would be a child of (5) etc.
+ *
+ * An intermediate node will be turned into a 'real' node on demand. In the
+ * example above, (4) would be re-used if 192.168.0.0/23 is added to the trie.
+ *
+ * A fully populated trie would have a height of 32 nodes, as the trie was
+ * created with a prefix length of 32.
+ *
+ * The lookup starts at the root node. If the current node matches and if there
+ * is a child that can be used to become more specific, the trie is traversed
+ * downwards. The last node in the traversal that is a non-intermediate one is
+ * returned.
+ */
+
+static inline int extract_bit(const u8 *data, size_t index)
+{
+ return !!(data[index / 8] & (1 << (7 - (index % 8))));
+}
+
+/**
+ * __longest_prefix_match() - determine the longest prefix
+ * @trie: The trie to get internal sizes from
+ * @node: The node to operate on
+ * @key: The key to compare to @node
+ *
+ * Determine the longest prefix of @node that matches the bits in @key.
+ */
+static __always_inline
+size_t __longest_prefix_match(const struct lpm_trie *trie,
+ const struct lpm_trie_node *node,
+ const struct bpf_lpm_trie_key_u8 *key)
+{
+ u32 limit = min(node->prefixlen, key->prefixlen);
+ u32 prefixlen = 0, i = 0;
+
+ BUILD_BUG_ON(offsetof(struct lpm_trie_node, data) % sizeof(u32));
+ BUILD_BUG_ON(offsetof(struct bpf_lpm_trie_key_u8, data) % sizeof(u32));
+
+#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && defined(CONFIG_64BIT)
+
+ /* data_size >= 16 has very small probability.
+ * We do not use a loop for optimal code generation.
+ */
+ if (trie->data_size >= 8) {
+ u64 diff = be64_to_cpu(*(__be64 *)node->data ^
+ *(__be64 *)key->data);
+
+ prefixlen = 64 - fls64(diff);
+ if (prefixlen >= limit)
+ return limit;
+ if (diff)
+ return prefixlen;
+ i = 8;
+ }
+#endif
+
+ while (trie->data_size >= i + 4) {
+ u32 diff = be32_to_cpu(*(__be32 *)&node->data[i] ^
+ *(__be32 *)&key->data[i]);
+
+ prefixlen += 32 - fls(diff);
+ if (prefixlen >= limit)
+ return limit;
+ if (diff)
+ return prefixlen;
+ i += 4;
+ }
+
+ if (trie->data_size >= i + 2) {
+ u16 diff = be16_to_cpu(*(__be16 *)&node->data[i] ^
+ *(__be16 *)&key->data[i]);
+
+ prefixlen += 16 - fls(diff);
+ if (prefixlen >= limit)
+ return limit;
+ if (diff)
+ return prefixlen;
+ i += 2;
+ }
+
+ if (trie->data_size >= i + 1) {
+ prefixlen += 8 - fls(node->data[i] ^ key->data[i]);
+
+ if (prefixlen >= limit)
+ return limit;
+ }
+
+ return prefixlen;
+}
+
+static size_t longest_prefix_match(const struct lpm_trie *trie,
+ const struct lpm_trie_node *node,
+ const struct bpf_lpm_trie_key_u8 *key)
+{
+ return __longest_prefix_match(trie, node, key);
+}
+
+/* Called from syscall or from eBPF program */
+static void *trie_lookup_elem(struct bpf_map *map, void *_key)
+{
+ struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
+ struct lpm_trie_node *node, *found = NULL;
+ struct bpf_lpm_trie_key_u8 *key = _key;
+
+ if (key->prefixlen > trie->max_prefixlen)
+ return NULL;
+
+ /* Start walking the trie from the root node ... */
+
+ for (node = rcu_dereference_check(trie->root, rcu_read_lock_bh_held());
+ node;) {
+ unsigned int next_bit;
+ size_t matchlen;
+
+ /* Determine the longest prefix of @node that matches @key.
+ * If it's the maximum possible prefix for this trie, we have
+ * an exact match and can return it directly.
+ */
+ matchlen = __longest_prefix_match(trie, node, key);
+ if (matchlen == trie->max_prefixlen) {
+ found = node;
+ break;
+ }
+
+ /* If the number of bits that match is smaller than the prefix
+ * length of @node, bail out and return the node we have seen
+ * last in the traversal (ie, the parent).
+ */
+ if (matchlen < node->prefixlen)
+ break;
+
+ /* Consider this node as return candidate unless it is an
+ * artificially added intermediate one.
+ */
+ if (!(node->flags & LPM_TREE_NODE_FLAG_IM))
+ found = node;
+
+ /* If the node match is fully satisfied, let's see if we can
+ * become more specific. Determine the next bit in the key and
+ * traverse down.
+ */
+ next_bit = extract_bit(key->data, node->prefixlen);
+ node = rcu_dereference_check(node->child[next_bit],
+ rcu_read_lock_bh_held());
+ }
+
+ if (!found)
+ return NULL;
+
+ return found->data + trie->data_size;
+}
+
+static struct lpm_trie_node *lpm_trie_node_alloc(struct lpm_trie *trie,
+ const void *value)
+{
+ struct lpm_trie_node *node;
+
+ node = bpf_mem_cache_alloc(&trie->ma);
+
+ if (!node)
+ return NULL;
+
+ node->flags = 0;
+
+ if (value)
+ memcpy(node->data + trie->data_size, value,
+ trie->map.value_size);
+
+ return node;
+}
+
+static int trie_check_add_elem(struct lpm_trie *trie, u64 flags)
+{
+ if (flags == BPF_EXIST)
+ return -ENOENT;
+ if (trie->n_entries == trie->map.max_entries)
+ return -ENOSPC;
+ trie->n_entries++;
+ return 0;
+}
+
+/* Called from syscall or from eBPF program */
+static long trie_update_elem(struct bpf_map *map,
+ void *_key, void *value, u64 flags)
+{
+ struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
+ struct lpm_trie_node *node, *im_node, *new_node;
+ struct lpm_trie_node *free_node = NULL;
+ struct lpm_trie_node __rcu **slot;
+ struct bpf_lpm_trie_key_u8 *key = _key;
+ unsigned long irq_flags;
+ unsigned int next_bit;
+ size_t matchlen = 0;
+ int ret = 0;
+
+ if (unlikely(flags > BPF_EXIST))
+ return -EINVAL;
+
+ if (key->prefixlen > trie->max_prefixlen)
+ return -EINVAL;
+
+ /* Allocate and fill a new node */
+ new_node = lpm_trie_node_alloc(trie, value);
+ if (!new_node)
+ return -ENOMEM;
+
+ ret = raw_res_spin_lock_irqsave(&trie->lock, irq_flags);
+ if (ret)
+ goto out_free;
+
+ new_node->prefixlen = key->prefixlen;
+ RCU_INIT_POINTER(new_node->child[0], NULL);
+ RCU_INIT_POINTER(new_node->child[1], NULL);
+ memcpy(new_node->data, key->data, trie->data_size);
+
+ /* Now find a slot to attach the new node. To do that, walk the tree
+ * from the root and match as many bits as possible for each node until
+ * we either find an empty slot or a slot that needs to be replaced by
+ * an intermediate node.
+ */
+ slot = &trie->root;
+
+ while ((node = rcu_dereference(*slot))) {
+ matchlen = longest_prefix_match(trie, node, key);
+
+ if (node->prefixlen != matchlen ||
+ node->prefixlen == key->prefixlen)
+ break;
+
+ next_bit = extract_bit(key->data, node->prefixlen);
+ slot = &node->child[next_bit];
+ }
+
+ /* If the slot is empty (a free child pointer or an empty root),
+ * simply assign the @new_node to that slot and be done.
+ */
+ if (!node) {
+ ret = trie_check_add_elem(trie, flags);
+ if (ret)
+ goto out;
+
+ rcu_assign_pointer(*slot, new_node);
+ goto out;
+ }
+
+ /* If the slot we picked already exists, replace it with @new_node
+ * which already has the correct data array set.
+ */
+ if (node->prefixlen == matchlen) {
+ if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) {
+ if (flags == BPF_NOEXIST) {
+ ret = -EEXIST;
+ goto out;
+ }
+ } else {
+ ret = trie_check_add_elem(trie, flags);
+ if (ret)
+ goto out;
+ }
+
+ new_node->child[0] = node->child[0];
+ new_node->child[1] = node->child[1];
+
+ rcu_assign_pointer(*slot, new_node);
+ free_node = node;
+
+ goto out;
+ }
+
+ ret = trie_check_add_elem(trie, flags);
+ if (ret)
+ goto out;
+
+ /* If the new node matches the prefix completely, it must be inserted
+ * as an ancestor. Simply insert it between @node and *@slot.
+ */
+ if (matchlen == key->prefixlen) {
+ next_bit = extract_bit(node->data, matchlen);
+ rcu_assign_pointer(new_node->child[next_bit], node);
+ rcu_assign_pointer(*slot, new_node);
+ goto out;
+ }
+
+ im_node = lpm_trie_node_alloc(trie, NULL);
+ if (!im_node) {
+ trie->n_entries--;
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ im_node->prefixlen = matchlen;
+ im_node->flags |= LPM_TREE_NODE_FLAG_IM;
+ memcpy(im_node->data, node->data, trie->data_size);
+
+ /* Now determine which child to install in which slot */
+ if (extract_bit(key->data, matchlen)) {
+ rcu_assign_pointer(im_node->child[0], node);
+ rcu_assign_pointer(im_node->child[1], new_node);
+ } else {
+ rcu_assign_pointer(im_node->child[0], new_node);
+ rcu_assign_pointer(im_node->child[1], node);
+ }
+
+ /* Finally, assign the intermediate node to the determined slot */
+ rcu_assign_pointer(*slot, im_node);
+
+out:
+ raw_res_spin_unlock_irqrestore(&trie->lock, irq_flags);
+out_free:
+ if (ret)
+ bpf_mem_cache_free(&trie->ma, new_node);
+ bpf_mem_cache_free_rcu(&trie->ma, free_node);
+
+ return ret;
+}
+
+/* Called from syscall or from eBPF program */
+static long trie_delete_elem(struct bpf_map *map, void *_key)
+{
+ struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
+ struct lpm_trie_node *free_node = NULL, *free_parent = NULL;
+ struct bpf_lpm_trie_key_u8 *key = _key;
+ struct lpm_trie_node __rcu **trim, **trim2;
+ struct lpm_trie_node *node, *parent;
+ unsigned long irq_flags;
+ unsigned int next_bit;
+ size_t matchlen = 0;
+ int ret = 0;
+
+ if (key->prefixlen > trie->max_prefixlen)
+ return -EINVAL;
+
+ ret = raw_res_spin_lock_irqsave(&trie->lock, irq_flags);
+ if (ret)
+ return ret;
+
+ /* Walk the tree looking for an exact key/length match and keeping
+ * track of the path we traverse. We will need to know the node
+ * we wish to delete, and the slot that points to the node we want
+ * to delete. We may also need to know the nodes parent and the
+ * slot that contains it.
+ */
+ trim = &trie->root;
+ trim2 = trim;
+ parent = NULL;
+ while ((node = rcu_dereference(*trim))) {
+ matchlen = longest_prefix_match(trie, node, key);
+
+ if (node->prefixlen != matchlen ||
+ node->prefixlen == key->prefixlen)
+ break;
+
+ parent = node;
+ trim2 = trim;
+ next_bit = extract_bit(key->data, node->prefixlen);
+ trim = &node->child[next_bit];
+ }
+
+ if (!node || node->prefixlen != key->prefixlen ||
+ node->prefixlen != matchlen ||
+ (node->flags & LPM_TREE_NODE_FLAG_IM)) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ trie->n_entries--;
+
+ /* If the node we are removing has two children, simply mark it
+ * as intermediate and we are done.
+ */
+ if (rcu_access_pointer(node->child[0]) &&
+ rcu_access_pointer(node->child[1])) {
+ node->flags |= LPM_TREE_NODE_FLAG_IM;
+ goto out;
+ }
+
+ /* If the parent of the node we are about to delete is an intermediate
+ * node, and the deleted node doesn't have any children, we can delete
+ * the intermediate parent as well and promote its other child
+ * up the tree. Doing this maintains the invariant that all
+ * intermediate nodes have exactly 2 children and that there are no
+ * unnecessary intermediate nodes in the tree.
+ */
+ if (parent && (parent->flags & LPM_TREE_NODE_FLAG_IM) &&
+ !node->child[0] && !node->child[1]) {
+ if (node == rcu_access_pointer(parent->child[0]))
+ rcu_assign_pointer(
+ *trim2, rcu_access_pointer(parent->child[1]));
+ else
+ rcu_assign_pointer(
+ *trim2, rcu_access_pointer(parent->child[0]));
+ free_parent = parent;
+ free_node = node;
+ goto out;
+ }
+
+ /* The node we are removing has either zero or one child. If there
+ * is a child, move it into the removed node's slot then delete
+ * the node. Otherwise just clear the slot and delete the node.
+ */
+ if (node->child[0])
+ rcu_assign_pointer(*trim, rcu_access_pointer(node->child[0]));
+ else if (node->child[1])
+ rcu_assign_pointer(*trim, rcu_access_pointer(node->child[1]));
+ else
+ RCU_INIT_POINTER(*trim, NULL);
+ free_node = node;
+
+out:
+ raw_res_spin_unlock_irqrestore(&trie->lock, irq_flags);
+
+ bpf_mem_cache_free_rcu(&trie->ma, free_parent);
+ bpf_mem_cache_free_rcu(&trie->ma, free_node);
+
+ return ret;
+}
+
+#define LPM_DATA_SIZE_MAX 256
+#define LPM_DATA_SIZE_MIN 1
+
+#define LPM_VAL_SIZE_MAX (KMALLOC_MAX_SIZE - LPM_DATA_SIZE_MAX - \
+ sizeof(struct lpm_trie_node))
+#define LPM_VAL_SIZE_MIN 1
+
+#define LPM_KEY_SIZE(X) (sizeof(struct bpf_lpm_trie_key_u8) + (X))
+#define LPM_KEY_SIZE_MAX LPM_KEY_SIZE(LPM_DATA_SIZE_MAX)
+#define LPM_KEY_SIZE_MIN LPM_KEY_SIZE(LPM_DATA_SIZE_MIN)
+
+#define LPM_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE | \
+ BPF_F_ACCESS_MASK)
+
+static struct bpf_map *trie_alloc(union bpf_attr *attr)
+{
+ struct lpm_trie *trie;
+ size_t leaf_size;
+ int err;
+
+ /* check sanity of attributes */
+ if (attr->max_entries == 0 ||
+ !(attr->map_flags & BPF_F_NO_PREALLOC) ||
+ attr->map_flags & ~LPM_CREATE_FLAG_MASK ||
+ !bpf_map_flags_access_ok(attr->map_flags) ||
+ attr->key_size < LPM_KEY_SIZE_MIN ||
+ attr->key_size > LPM_KEY_SIZE_MAX ||
+ attr->value_size < LPM_VAL_SIZE_MIN ||
+ attr->value_size > LPM_VAL_SIZE_MAX)
+ return ERR_PTR(-EINVAL);
+
+ trie = bpf_map_area_alloc(sizeof(*trie), NUMA_NO_NODE);
+ if (!trie)
+ return ERR_PTR(-ENOMEM);
+
+ /* copy mandatory map attributes */
+ bpf_map_init_from_attr(&trie->map, attr);
+ trie->data_size = attr->key_size -
+ offsetof(struct bpf_lpm_trie_key_u8, data);
+ trie->max_prefixlen = trie->data_size * 8;
+
+ raw_res_spin_lock_init(&trie->lock);
+
+ /* Allocate intermediate and leaf nodes from the same allocator */
+ leaf_size = sizeof(struct lpm_trie_node) + trie->data_size +
+ trie->map.value_size;
+ err = bpf_mem_alloc_init(&trie->ma, leaf_size, false);
+ if (err)
+ goto free_out;
+ return &trie->map;
+
+free_out:
+ bpf_map_area_free(trie);
+ return ERR_PTR(err);
+}
+
+static void trie_free(struct bpf_map *map)
+{
+ struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
+ struct lpm_trie_node __rcu **slot;
+ struct lpm_trie_node *node;
+
+ /* Always start at the root and walk down to a node that has no
+ * children. Then free that node, nullify its reference in the parent
+ * and start over.
+ */
+
+ for (;;) {
+ slot = &trie->root;
+
+ for (;;) {
+ node = rcu_dereference_protected(*slot, 1);
+ if (!node)
+ goto out;
+
+ if (rcu_access_pointer(node->child[0])) {
+ slot = &node->child[0];
+ continue;
+ }
+
+ if (rcu_access_pointer(node->child[1])) {
+ slot = &node->child[1];
+ continue;
+ }
+
+ /* No bpf program may access the map, so freeing the
+ * node without waiting for the extra RCU GP.
+ */
+ bpf_mem_cache_raw_free(node);
+ RCU_INIT_POINTER(*slot, NULL);
+ break;
+ }
+ }
+
+out:
+ bpf_mem_alloc_destroy(&trie->ma);
+ bpf_map_area_free(trie);
+}
+
+static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key)
+{
+ struct lpm_trie_node *node, *next_node = NULL, *parent, *search_root;
+ struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
+ struct bpf_lpm_trie_key_u8 *key = _key, *next_key = _next_key;
+ struct lpm_trie_node **node_stack = NULL;
+ int err = 0, stack_ptr = -1;
+ unsigned int next_bit;
+ size_t matchlen = 0;
+
+ /* The get_next_key follows postorder. For the 4 node example in
+ * the top of this file, the trie_get_next_key() returns the following
+ * one after another:
+ * 192.168.0.0/24
+ * 192.168.1.0/24
+ * 192.168.128.0/24
+ * 192.168.0.0/16
+ *
+ * The idea is to return more specific keys before less specific ones.
+ */
+
+ /* Empty trie */
+ search_root = rcu_dereference(trie->root);
+ if (!search_root)
+ return -ENOENT;
+
+ /* For invalid key, find the leftmost node in the trie */
+ if (!key || key->prefixlen > trie->max_prefixlen)
+ goto find_leftmost;
+
+ node_stack = kmalloc_array(trie->max_prefixlen + 1,
+ sizeof(struct lpm_trie_node *),
+ GFP_ATOMIC | __GFP_NOWARN);
+ if (!node_stack)
+ return -ENOMEM;
+
+ /* Try to find the exact node for the given key */
+ for (node = search_root; node;) {
+ node_stack[++stack_ptr] = node;
+ matchlen = longest_prefix_match(trie, node, key);
+ if (node->prefixlen != matchlen ||
+ node->prefixlen == key->prefixlen)
+ break;
+
+ next_bit = extract_bit(key->data, node->prefixlen);
+ node = rcu_dereference(node->child[next_bit]);
+ }
+ if (!node || node->prefixlen != matchlen ||
+ (node->flags & LPM_TREE_NODE_FLAG_IM))
+ goto find_leftmost;
+
+ /* The node with the exactly-matching key has been found,
+ * find the first node in postorder after the matched node.
+ */
+ node = node_stack[stack_ptr];
+ while (stack_ptr > 0) {
+ parent = node_stack[stack_ptr - 1];
+ if (rcu_dereference(parent->child[0]) == node) {
+ search_root = rcu_dereference(parent->child[1]);
+ if (search_root)
+ goto find_leftmost;
+ }
+ if (!(parent->flags & LPM_TREE_NODE_FLAG_IM)) {
+ next_node = parent;
+ goto do_copy;
+ }
+
+ node = parent;
+ stack_ptr--;
+ }
+
+ /* did not find anything */
+ err = -ENOENT;
+ goto free_stack;
+
+find_leftmost:
+ /* Find the leftmost non-intermediate node, all intermediate nodes
+ * have exact two children, so this function will never return NULL.
+ */
+ for (node = search_root; node;) {
+ if (node->flags & LPM_TREE_NODE_FLAG_IM) {
+ node = rcu_dereference(node->child[0]);
+ } else {
+ next_node = node;
+ node = rcu_dereference(node->child[0]);
+ if (!node)
+ node = rcu_dereference(next_node->child[1]);
+ }
+ }
+do_copy:
+ next_key->prefixlen = next_node->prefixlen;
+ memcpy((void *)next_key + offsetof(struct bpf_lpm_trie_key_u8, data),
+ next_node->data, trie->data_size);
+free_stack:
+ kfree(node_stack);
+ return err;
+}
+
+static int trie_check_btf(const struct bpf_map *map,
+ const struct btf *btf,
+ const struct btf_type *key_type,
+ const struct btf_type *value_type)
+{
+ /* Keys must have struct bpf_lpm_trie_key_u8 embedded. */
+ return BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT ?
+ -EINVAL : 0;
+}
+
+static u64 trie_mem_usage(const struct bpf_map *map)
+{
+ struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
+ u64 elem_size;
+
+ elem_size = sizeof(struct lpm_trie_node) + trie->data_size +
+ trie->map.value_size;
+ return elem_size * READ_ONCE(trie->n_entries);
+}
+
+BTF_ID_LIST_SINGLE(trie_map_btf_ids, struct, lpm_trie)
+const struct bpf_map_ops trie_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
+ .map_alloc = trie_alloc,
+ .map_free = trie_free,
+ .map_get_next_key = trie_get_next_key,
+ .map_lookup_elem = trie_lookup_elem,
+ .map_update_elem = trie_update_elem,
+ .map_delete_elem = trie_delete_elem,
+ .map_lookup_batch = generic_map_lookup_batch,
+ .map_update_batch = generic_map_update_batch,
+ .map_delete_batch = generic_map_delete_batch,
+ .map_check_btf = trie_check_btf,
+ .map_mem_usage = trie_mem_usage,
+ .map_btf_id = &trie_map_btf_ids[0],
+};
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
new file mode 100644
index 000000000000..645bd30bc9a9
--- /dev/null
+++ b/kernel/bpf/map_in_map.c
@@ -0,0 +1,134 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2017 Facebook
+ */
+#include <linux/slab.h>
+#include <linux/bpf.h>
+#include <linux/btf.h>
+
+#include "map_in_map.h"
+
+struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
+{
+ struct bpf_map *inner_map, *inner_map_meta;
+ u32 inner_map_meta_size;
+ CLASS(fd, f)(inner_map_ufd);
+
+ inner_map = __bpf_map_get(f);
+ if (IS_ERR(inner_map))
+ return inner_map;
+
+ /* Does not support >1 level map-in-map */
+ if (inner_map->inner_map_meta)
+ return ERR_PTR(-EINVAL);
+
+ if (!inner_map->ops->map_meta_equal)
+ return ERR_PTR(-ENOTSUPP);
+
+ inner_map_meta_size = sizeof(*inner_map_meta);
+ /* In some cases verifier needs to access beyond just base map. */
+ if (inner_map->ops == &array_map_ops || inner_map->ops == &percpu_array_map_ops)
+ inner_map_meta_size = sizeof(struct bpf_array);
+
+ inner_map_meta = kzalloc(inner_map_meta_size, GFP_USER);
+ if (!inner_map_meta)
+ return ERR_PTR(-ENOMEM);
+
+ inner_map_meta->map_type = inner_map->map_type;
+ inner_map_meta->key_size = inner_map->key_size;
+ inner_map_meta->value_size = inner_map->value_size;
+ inner_map_meta->map_flags = inner_map->map_flags;
+ inner_map_meta->max_entries = inner_map->max_entries;
+
+ inner_map_meta->record = btf_record_dup(inner_map->record);
+ if (IS_ERR(inner_map_meta->record)) {
+ /* btf_record_dup returns NULL or valid pointer in case of
+ * invalid/empty/valid, but ERR_PTR in case of errors. During
+ * equality NULL or IS_ERR is equivalent.
+ */
+ struct bpf_map *ret = ERR_CAST(inner_map_meta->record);
+ kfree(inner_map_meta);
+ return ret;
+ }
+ /* Note: We must use the same BTF, as we also used btf_record_dup above
+ * which relies on BTF being same for both maps, as some members like
+ * record->fields.list_head have pointers like value_rec pointing into
+ * inner_map->btf.
+ */
+ if (inner_map->btf) {
+ btf_get(inner_map->btf);
+ inner_map_meta->btf = inner_map->btf;
+ }
+
+ /* Misc members not needed in bpf_map_meta_equal() check. */
+ inner_map_meta->ops = inner_map->ops;
+ if (inner_map->ops == &array_map_ops || inner_map->ops == &percpu_array_map_ops) {
+ struct bpf_array *inner_array_meta =
+ container_of(inner_map_meta, struct bpf_array, map);
+ struct bpf_array *inner_array = container_of(inner_map, struct bpf_array, map);
+
+ inner_array_meta->index_mask = inner_array->index_mask;
+ inner_array_meta->elem_size = inner_array->elem_size;
+ inner_map_meta->bypass_spec_v1 = inner_map->bypass_spec_v1;
+ }
+ return inner_map_meta;
+}
+
+void bpf_map_meta_free(struct bpf_map *map_meta)
+{
+ bpf_map_free_record(map_meta);
+ btf_put(map_meta->btf);
+ kfree(map_meta);
+}
+
+bool bpf_map_meta_equal(const struct bpf_map *meta0,
+ const struct bpf_map *meta1)
+{
+ /* No need to compare ops because it is covered by map_type */
+ return meta0->map_type == meta1->map_type &&
+ meta0->key_size == meta1->key_size &&
+ meta0->value_size == meta1->value_size &&
+ meta0->map_flags == meta1->map_flags &&
+ btf_record_equal(meta0->record, meta1->record);
+}
+
+void *bpf_map_fd_get_ptr(struct bpf_map *map,
+ struct file *map_file /* not used */,
+ int ufd)
+{
+ struct bpf_map *inner_map, *inner_map_meta;
+ CLASS(fd, f)(ufd);
+
+ inner_map = __bpf_map_get(f);
+ if (IS_ERR(inner_map))
+ return inner_map;
+
+ inner_map_meta = map->inner_map_meta;
+ if (inner_map_meta->ops->map_meta_equal(inner_map_meta, inner_map))
+ bpf_map_inc(inner_map);
+ else
+ inner_map = ERR_PTR(-EINVAL);
+
+ return inner_map;
+}
+
+void bpf_map_fd_put_ptr(struct bpf_map *map, void *ptr, bool need_defer)
+{
+ struct bpf_map *inner_map = ptr;
+
+ /* Defer the freeing of inner map according to the sleepable attribute
+ * of bpf program which owns the outer map, so unnecessary waiting for
+ * RCU tasks trace grace period can be avoided.
+ */
+ if (need_defer) {
+ if (atomic64_read(&map->sleepable_refcnt))
+ WRITE_ONCE(inner_map->free_after_mult_rcu_gp, true);
+ else
+ WRITE_ONCE(inner_map->free_after_rcu_gp, true);
+ }
+ bpf_map_put(inner_map);
+}
+
+u32 bpf_map_fd_sys_lookup_elem(void *ptr)
+{
+ return ((struct bpf_map *)ptr)->id;
+}
diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h
new file mode 100644
index 000000000000..7d61602354de
--- /dev/null
+++ b/kernel/bpf/map_in_map.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright (c) 2017 Facebook
+ */
+#ifndef __MAP_IN_MAP_H__
+#define __MAP_IN_MAP_H__
+
+#include <linux/types.h>
+
+struct file;
+struct bpf_map;
+
+struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd);
+void bpf_map_meta_free(struct bpf_map *map_meta);
+void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file,
+ int ufd);
+void bpf_map_fd_put_ptr(struct bpf_map *map, void *ptr, bool need_defer);
+u32 bpf_map_fd_sys_lookup_elem(void *ptr);
+
+#endif
diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c
new file mode 100644
index 000000000000..9575314f40a6
--- /dev/null
+++ b/kernel/bpf/map_iter.c
@@ -0,0 +1,229 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2020 Facebook */
+#include <linux/bpf.h>
+#include <linux/fs.h>
+#include <linux/filter.h>
+#include <linux/kernel.h>
+#include <linux/btf_ids.h>
+
+struct bpf_iter_seq_map_info {
+ u32 map_id;
+};
+
+static void *bpf_map_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct bpf_iter_seq_map_info *info = seq->private;
+ struct bpf_map *map;
+
+ map = bpf_map_get_curr_or_next(&info->map_id);
+ if (!map)
+ return NULL;
+
+ if (*pos == 0)
+ ++*pos;
+ return map;
+}
+
+static void *bpf_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct bpf_iter_seq_map_info *info = seq->private;
+
+ ++*pos;
+ ++info->map_id;
+ bpf_map_put((struct bpf_map *)v);
+ return bpf_map_get_curr_or_next(&info->map_id);
+}
+
+struct bpf_iter__bpf_map {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct bpf_map *, map);
+};
+
+DEFINE_BPF_ITER_FUNC(bpf_map, struct bpf_iter_meta *meta, struct bpf_map *map)
+
+static int __bpf_map_seq_show(struct seq_file *seq, void *v, bool in_stop)
+{
+ struct bpf_iter__bpf_map ctx;
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+ int ret = 0;
+
+ ctx.meta = &meta;
+ ctx.map = v;
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, in_stop);
+ if (prog)
+ ret = bpf_iter_run_prog(prog, &ctx);
+
+ return ret;
+}
+
+static int bpf_map_seq_show(struct seq_file *seq, void *v)
+{
+ return __bpf_map_seq_show(seq, v, false);
+}
+
+static void bpf_map_seq_stop(struct seq_file *seq, void *v)
+{
+ if (!v)
+ (void)__bpf_map_seq_show(seq, v, true);
+ else
+ bpf_map_put((struct bpf_map *)v);
+}
+
+static const struct seq_operations bpf_map_seq_ops = {
+ .start = bpf_map_seq_start,
+ .next = bpf_map_seq_next,
+ .stop = bpf_map_seq_stop,
+ .show = bpf_map_seq_show,
+};
+
+BTF_ID_LIST_GLOBAL_SINGLE(btf_bpf_map_id, struct, bpf_map)
+
+static const struct bpf_iter_seq_info bpf_map_seq_info = {
+ .seq_ops = &bpf_map_seq_ops,
+ .init_seq_private = NULL,
+ .fini_seq_private = NULL,
+ .seq_priv_size = sizeof(struct bpf_iter_seq_map_info),
+};
+
+static struct bpf_iter_reg bpf_map_reg_info = {
+ .target = "bpf_map",
+ .ctx_arg_info_size = 1,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__bpf_map, map),
+ PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
+ },
+ .seq_info = &bpf_map_seq_info,
+};
+
+static int bpf_iter_attach_map(struct bpf_prog *prog,
+ union bpf_iter_link_info *linfo,
+ struct bpf_iter_aux_info *aux)
+{
+ u32 key_acc_size, value_acc_size, key_size, value_size;
+ struct bpf_map *map;
+ bool is_percpu = false;
+ int err = -EINVAL;
+
+ if (!linfo->map.map_fd)
+ return -EBADF;
+
+ map = bpf_map_get_with_uref(linfo->map.map_fd);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
+ is_percpu = true;
+ else if (map->map_type != BPF_MAP_TYPE_HASH &&
+ map->map_type != BPF_MAP_TYPE_LRU_HASH &&
+ map->map_type != BPF_MAP_TYPE_ARRAY)
+ goto put_map;
+
+ key_acc_size = prog->aux->max_rdonly_access;
+ value_acc_size = prog->aux->max_rdwr_access;
+ key_size = map->key_size;
+ if (!is_percpu)
+ value_size = map->value_size;
+ else
+ value_size = round_up(map->value_size, 8) * num_possible_cpus();
+
+ if (key_acc_size > key_size || value_acc_size > value_size) {
+ err = -EACCES;
+ goto put_map;
+ }
+
+ aux->map = map;
+ return 0;
+
+put_map:
+ bpf_map_put_with_uref(map);
+ return err;
+}
+
+static void bpf_iter_detach_map(struct bpf_iter_aux_info *aux)
+{
+ bpf_map_put_with_uref(aux->map);
+}
+
+void bpf_iter_map_show_fdinfo(const struct bpf_iter_aux_info *aux,
+ struct seq_file *seq)
+{
+ seq_printf(seq, "map_id:\t%u\n", aux->map->id);
+}
+
+int bpf_iter_map_fill_link_info(const struct bpf_iter_aux_info *aux,
+ struct bpf_link_info *info)
+{
+ info->iter.map.map_id = aux->map->id;
+ return 0;
+}
+
+DEFINE_BPF_ITER_FUNC(bpf_map_elem, struct bpf_iter_meta *meta,
+ struct bpf_map *map, void *key, void *value)
+
+static const struct bpf_iter_reg bpf_map_elem_reg_info = {
+ .target = "bpf_map_elem",
+ .attach_target = bpf_iter_attach_map,
+ .detach_target = bpf_iter_detach_map,
+ .show_fdinfo = bpf_iter_map_show_fdinfo,
+ .fill_link_info = bpf_iter_map_fill_link_info,
+ .ctx_arg_info_size = 2,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__bpf_map_elem, key),
+ PTR_TO_BUF | PTR_MAYBE_NULL | MEM_RDONLY },
+ { offsetof(struct bpf_iter__bpf_map_elem, value),
+ PTR_TO_BUF | PTR_MAYBE_NULL },
+ },
+};
+
+static int __init bpf_map_iter_init(void)
+{
+ int ret;
+
+ bpf_map_reg_info.ctx_arg_info[0].btf_id = *btf_bpf_map_id;
+ ret = bpf_iter_reg_target(&bpf_map_reg_info);
+ if (ret)
+ return ret;
+
+ return bpf_iter_reg_target(&bpf_map_elem_reg_info);
+}
+
+late_initcall(bpf_map_iter_init);
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc s64 bpf_map_sum_elem_count(const struct bpf_map *map)
+{
+ s64 *pcount;
+ s64 ret = 0;
+ int cpu;
+
+ if (!map || !map->elem_count)
+ return 0;
+
+ for_each_possible_cpu(cpu) {
+ pcount = per_cpu_ptr(map->elem_count, cpu);
+ ret += READ_ONCE(*pcount);
+ }
+ return ret;
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(bpf_map_iter_kfunc_ids)
+BTF_ID_FLAGS(func, bpf_map_sum_elem_count, KF_TRUSTED_ARGS)
+BTF_KFUNCS_END(bpf_map_iter_kfunc_ids)
+
+static const struct btf_kfunc_id_set bpf_map_iter_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &bpf_map_iter_kfunc_ids,
+};
+
+static int init_subsystem(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &bpf_map_iter_kfunc_set);
+}
+late_initcall(init_subsystem);
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
new file mode 100644
index 000000000000..bd45dda9dc35
--- /dev/null
+++ b/kernel/bpf/memalloc.c
@@ -0,0 +1,1016 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
+#include <linux/mm.h>
+#include <linux/llist.h>
+#include <linux/bpf.h>
+#include <linux/irq_work.h>
+#include <linux/bpf_mem_alloc.h>
+#include <linux/memcontrol.h>
+#include <asm/local.h>
+
+/* Any context (including NMI) BPF specific memory allocator.
+ *
+ * Tracing BPF programs can attach to kprobe and fentry. Hence they
+ * run in unknown context where calling plain kmalloc() might not be safe.
+ *
+ * Front-end kmalloc() with per-cpu per-bucket cache of free elements.
+ * Refill this cache asynchronously from irq_work.
+ *
+ * CPU_0 buckets
+ * 16 32 64 96 128 196 256 512 1024 2048 4096
+ * ...
+ * CPU_N buckets
+ * 16 32 64 96 128 196 256 512 1024 2048 4096
+ *
+ * The buckets are prefilled at the start.
+ * BPF programs always run with migration disabled.
+ * It's safe to allocate from cache of the current cpu with irqs disabled.
+ * Free-ing is always done into bucket of the current cpu as well.
+ * irq_work trims extra free elements from buckets with kfree
+ * and refills them with kmalloc, so global kmalloc logic takes care
+ * of freeing objects allocated by one cpu and freed on another.
+ *
+ * Every allocated objected is padded with extra 8 bytes that contains
+ * struct llist_node.
+ */
+#define LLIST_NODE_SZ sizeof(struct llist_node)
+
+#define BPF_MEM_ALLOC_SIZE_MAX 4096
+
+/* similar to kmalloc, but sizeof == 8 bucket is gone */
+static u8 size_index[24] __ro_after_init = {
+ 3, /* 8 */
+ 3, /* 16 */
+ 4, /* 24 */
+ 4, /* 32 */
+ 5, /* 40 */
+ 5, /* 48 */
+ 5, /* 56 */
+ 5, /* 64 */
+ 1, /* 72 */
+ 1, /* 80 */
+ 1, /* 88 */
+ 1, /* 96 */
+ 6, /* 104 */
+ 6, /* 112 */
+ 6, /* 120 */
+ 6, /* 128 */
+ 2, /* 136 */
+ 2, /* 144 */
+ 2, /* 152 */
+ 2, /* 160 */
+ 2, /* 168 */
+ 2, /* 176 */
+ 2, /* 184 */
+ 2 /* 192 */
+};
+
+static int bpf_mem_cache_idx(size_t size)
+{
+ if (!size || size > BPF_MEM_ALLOC_SIZE_MAX)
+ return -1;
+
+ if (size <= 192)
+ return size_index[(size - 1) / 8] - 1;
+
+ return fls(size - 1) - 2;
+}
+
+#define NUM_CACHES 11
+
+struct bpf_mem_cache {
+ /* per-cpu list of free objects of size 'unit_size'.
+ * All accesses are done with interrupts disabled and 'active' counter
+ * protection with __llist_add() and __llist_del_first().
+ */
+ struct llist_head free_llist;
+ local_t active;
+
+ /* Operations on the free_list from unit_alloc/unit_free/bpf_mem_refill
+ * are sequenced by per-cpu 'active' counter. But unit_free() cannot
+ * fail. When 'active' is busy the unit_free() will add an object to
+ * free_llist_extra.
+ */
+ struct llist_head free_llist_extra;
+
+ struct irq_work refill_work;
+ struct obj_cgroup *objcg;
+ int unit_size;
+ /* count of objects in free_llist */
+ int free_cnt;
+ int low_watermark, high_watermark, batch;
+ int percpu_size;
+ bool draining;
+ struct bpf_mem_cache *tgt;
+
+ /* list of objects to be freed after RCU GP */
+ struct llist_head free_by_rcu;
+ struct llist_node *free_by_rcu_tail;
+ struct llist_head waiting_for_gp;
+ struct llist_node *waiting_for_gp_tail;
+ struct rcu_head rcu;
+ atomic_t call_rcu_in_progress;
+ struct llist_head free_llist_extra_rcu;
+
+ /* list of objects to be freed after RCU tasks trace GP */
+ struct llist_head free_by_rcu_ttrace;
+ struct llist_head waiting_for_gp_ttrace;
+ struct rcu_head rcu_ttrace;
+ atomic_t call_rcu_ttrace_in_progress;
+};
+
+struct bpf_mem_caches {
+ struct bpf_mem_cache cache[NUM_CACHES];
+};
+
+static const u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096};
+
+static struct llist_node notrace *__llist_del_first(struct llist_head *head)
+{
+ struct llist_node *entry, *next;
+
+ entry = head->first;
+ if (!entry)
+ return NULL;
+ next = entry->next;
+ head->first = next;
+ return entry;
+}
+
+static void *__alloc(struct bpf_mem_cache *c, int node, gfp_t flags)
+{
+ if (c->percpu_size) {
+ void __percpu **obj = kmalloc_node(c->percpu_size, flags, node);
+ void __percpu *pptr = __alloc_percpu_gfp(c->unit_size, 8, flags);
+
+ if (!obj || !pptr) {
+ free_percpu(pptr);
+ kfree(obj);
+ return NULL;
+ }
+ obj[1] = pptr;
+ return obj;
+ }
+
+ return kmalloc_node(c->unit_size, flags | __GFP_ZERO, node);
+}
+
+static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c)
+{
+#ifdef CONFIG_MEMCG
+ if (c->objcg)
+ return get_mem_cgroup_from_objcg(c->objcg);
+ return root_mem_cgroup;
+#else
+ return NULL;
+#endif
+}
+
+static void inc_active(struct bpf_mem_cache *c, unsigned long *flags)
+{
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ /* In RT irq_work runs in per-cpu kthread, so disable
+ * interrupts to avoid preemption and interrupts and
+ * reduce the chance of bpf prog executing on this cpu
+ * when active counter is busy.
+ */
+ local_irq_save(*flags);
+ /* alloc_bulk runs from irq_work which will not preempt a bpf
+ * program that does unit_alloc/unit_free since IRQs are
+ * disabled there. There is no race to increment 'active'
+ * counter. It protects free_llist from corruption in case NMI
+ * bpf prog preempted this loop.
+ */
+ WARN_ON_ONCE(local_inc_return(&c->active) != 1);
+}
+
+static void dec_active(struct bpf_mem_cache *c, unsigned long *flags)
+{
+ local_dec(&c->active);
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_restore(*flags);
+}
+
+static void add_obj_to_free_list(struct bpf_mem_cache *c, void *obj)
+{
+ unsigned long flags;
+
+ inc_active(c, &flags);
+ __llist_add(obj, &c->free_llist);
+ c->free_cnt++;
+ dec_active(c, &flags);
+}
+
+/* Mostly runs from irq_work except __init phase. */
+static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node, bool atomic)
+{
+ struct mem_cgroup *memcg = NULL, *old_memcg;
+ gfp_t gfp;
+ void *obj;
+ int i;
+
+ gfp = __GFP_NOWARN | __GFP_ACCOUNT;
+ gfp |= atomic ? GFP_NOWAIT : GFP_KERNEL;
+
+ for (i = 0; i < cnt; i++) {
+ /*
+ * For every 'c' llist_del_first(&c->free_by_rcu_ttrace); is
+ * done only by one CPU == current CPU. Other CPUs might
+ * llist_add() and llist_del_all() in parallel.
+ */
+ obj = llist_del_first(&c->free_by_rcu_ttrace);
+ if (!obj)
+ break;
+ add_obj_to_free_list(c, obj);
+ }
+ if (i >= cnt)
+ return;
+
+ for (; i < cnt; i++) {
+ obj = llist_del_first(&c->waiting_for_gp_ttrace);
+ if (!obj)
+ break;
+ add_obj_to_free_list(c, obj);
+ }
+ if (i >= cnt)
+ return;
+
+ memcg = get_memcg(c);
+ old_memcg = set_active_memcg(memcg);
+ for (; i < cnt; i++) {
+ /* Allocate, but don't deplete atomic reserves that typical
+ * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc
+ * will allocate from the current numa node which is what we
+ * want here.
+ */
+ obj = __alloc(c, node, gfp);
+ if (!obj)
+ break;
+ add_obj_to_free_list(c, obj);
+ }
+ set_active_memcg(old_memcg);
+ mem_cgroup_put(memcg);
+}
+
+static void free_one(void *obj, bool percpu)
+{
+ if (percpu)
+ free_percpu(((void __percpu **)obj)[1]);
+
+ kfree(obj);
+}
+
+static int free_all(struct llist_node *llnode, bool percpu)
+{
+ struct llist_node *pos, *t;
+ int cnt = 0;
+
+ llist_for_each_safe(pos, t, llnode) {
+ free_one(pos, percpu);
+ cnt++;
+ }
+ return cnt;
+}
+
+static void __free_rcu(struct rcu_head *head)
+{
+ struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu_ttrace);
+
+ free_all(llist_del_all(&c->waiting_for_gp_ttrace), !!c->percpu_size);
+ atomic_set(&c->call_rcu_ttrace_in_progress, 0);
+}
+
+static void __free_rcu_tasks_trace(struct rcu_head *head)
+{
+ /* If RCU Tasks Trace grace period implies RCU grace period,
+ * there is no need to invoke call_rcu().
+ */
+ if (rcu_trace_implies_rcu_gp())
+ __free_rcu(head);
+ else
+ call_rcu(head, __free_rcu);
+}
+
+static void enque_to_free(struct bpf_mem_cache *c, void *obj)
+{
+ struct llist_node *llnode = obj;
+
+ /* bpf_mem_cache is a per-cpu object. Freeing happens in irq_work.
+ * Nothing races to add to free_by_rcu_ttrace list.
+ */
+ llist_add(llnode, &c->free_by_rcu_ttrace);
+}
+
+static void do_call_rcu_ttrace(struct bpf_mem_cache *c)
+{
+ struct llist_node *llnode, *t;
+
+ if (atomic_xchg(&c->call_rcu_ttrace_in_progress, 1)) {
+ if (unlikely(READ_ONCE(c->draining))) {
+ llnode = llist_del_all(&c->free_by_rcu_ttrace);
+ free_all(llnode, !!c->percpu_size);
+ }
+ return;
+ }
+
+ WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp_ttrace));
+ llist_for_each_safe(llnode, t, llist_del_all(&c->free_by_rcu_ttrace))
+ llist_add(llnode, &c->waiting_for_gp_ttrace);
+
+ if (unlikely(READ_ONCE(c->draining))) {
+ __free_rcu(&c->rcu_ttrace);
+ return;
+ }
+
+ /* Use call_rcu_tasks_trace() to wait for sleepable progs to finish.
+ * If RCU Tasks Trace grace period implies RCU grace period, free
+ * these elements directly, else use call_rcu() to wait for normal
+ * progs to finish and finally do free_one() on each element.
+ */
+ call_rcu_tasks_trace(&c->rcu_ttrace, __free_rcu_tasks_trace);
+}
+
+static void free_bulk(struct bpf_mem_cache *c)
+{
+ struct bpf_mem_cache *tgt = c->tgt;
+ struct llist_node *llnode, *t;
+ unsigned long flags;
+ int cnt;
+
+ WARN_ON_ONCE(tgt->unit_size != c->unit_size);
+ WARN_ON_ONCE(tgt->percpu_size != c->percpu_size);
+
+ do {
+ inc_active(c, &flags);
+ llnode = __llist_del_first(&c->free_llist);
+ if (llnode)
+ cnt = --c->free_cnt;
+ else
+ cnt = 0;
+ dec_active(c, &flags);
+ if (llnode)
+ enque_to_free(tgt, llnode);
+ } while (cnt > (c->high_watermark + c->low_watermark) / 2);
+
+ /* and drain free_llist_extra */
+ llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra))
+ enque_to_free(tgt, llnode);
+ do_call_rcu_ttrace(tgt);
+}
+
+static void __free_by_rcu(struct rcu_head *head)
+{
+ struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
+ struct bpf_mem_cache *tgt = c->tgt;
+ struct llist_node *llnode;
+
+ WARN_ON_ONCE(tgt->unit_size != c->unit_size);
+ WARN_ON_ONCE(tgt->percpu_size != c->percpu_size);
+
+ llnode = llist_del_all(&c->waiting_for_gp);
+ if (!llnode)
+ goto out;
+
+ llist_add_batch(llnode, c->waiting_for_gp_tail, &tgt->free_by_rcu_ttrace);
+
+ /* Objects went through regular RCU GP. Send them to RCU tasks trace */
+ do_call_rcu_ttrace(tgt);
+out:
+ atomic_set(&c->call_rcu_in_progress, 0);
+}
+
+static void check_free_by_rcu(struct bpf_mem_cache *c)
+{
+ struct llist_node *llnode, *t;
+ unsigned long flags;
+
+ /* drain free_llist_extra_rcu */
+ if (unlikely(!llist_empty(&c->free_llist_extra_rcu))) {
+ inc_active(c, &flags);
+ llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra_rcu))
+ if (__llist_add(llnode, &c->free_by_rcu))
+ c->free_by_rcu_tail = llnode;
+ dec_active(c, &flags);
+ }
+
+ if (llist_empty(&c->free_by_rcu))
+ return;
+
+ if (atomic_xchg(&c->call_rcu_in_progress, 1)) {
+ /*
+ * Instead of kmalloc-ing new rcu_head and triggering 10k
+ * call_rcu() to hit rcutree.qhimark and force RCU to notice
+ * the overload just ask RCU to hurry up. There could be many
+ * objects in free_by_rcu list.
+ * This hint reduces memory consumption for an artificial
+ * benchmark from 2 Gbyte to 150 Mbyte.
+ */
+ rcu_request_urgent_qs_task(current);
+ return;
+ }
+
+ WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp));
+
+ inc_active(c, &flags);
+ WRITE_ONCE(c->waiting_for_gp.first, __llist_del_all(&c->free_by_rcu));
+ c->waiting_for_gp_tail = c->free_by_rcu_tail;
+ dec_active(c, &flags);
+
+ if (unlikely(READ_ONCE(c->draining))) {
+ free_all(llist_del_all(&c->waiting_for_gp), !!c->percpu_size);
+ atomic_set(&c->call_rcu_in_progress, 0);
+ } else {
+ call_rcu_hurry(&c->rcu, __free_by_rcu);
+ }
+}
+
+static void bpf_mem_refill(struct irq_work *work)
+{
+ struct bpf_mem_cache *c = container_of(work, struct bpf_mem_cache, refill_work);
+ int cnt;
+
+ /* Racy access to free_cnt. It doesn't need to be 100% accurate */
+ cnt = c->free_cnt;
+ if (cnt < c->low_watermark)
+ /* irq_work runs on this cpu and kmalloc will allocate
+ * from the current numa node which is what we want here.
+ */
+ alloc_bulk(c, c->batch, NUMA_NO_NODE, true);
+ else if (cnt > c->high_watermark)
+ free_bulk(c);
+
+ check_free_by_rcu(c);
+}
+
+static void notrace irq_work_raise(struct bpf_mem_cache *c)
+{
+ irq_work_queue(&c->refill_work);
+}
+
+/* For typical bpf map case that uses bpf_mem_cache_alloc and single bucket
+ * the freelist cache will be elem_size * 64 (or less) on each cpu.
+ *
+ * For bpf programs that don't have statically known allocation sizes and
+ * assuming (low_mark + high_mark) / 2 as an average number of elements per
+ * bucket and all buckets are used the total amount of memory in freelists
+ * on each cpu will be:
+ * 64*16 + 64*32 + 64*64 + 64*96 + 64*128 + 64*196 + 64*256 + 32*512 + 16*1024 + 8*2048 + 4*4096
+ * == ~ 116 Kbyte using below heuristic.
+ * Initialized, but unused bpf allocator (not bpf map specific one) will
+ * consume ~ 11 Kbyte per cpu.
+ * Typical case will be between 11K and 116K closer to 11K.
+ * bpf progs can and should share bpf_mem_cache when possible.
+ *
+ * Percpu allocation is typically rare. To avoid potential unnecessary large
+ * memory consumption, set low_mark = 1 and high_mark = 3, resulting in c->batch = 1.
+ */
+static void init_refill_work(struct bpf_mem_cache *c)
+{
+ init_irq_work(&c->refill_work, bpf_mem_refill);
+ if (c->percpu_size) {
+ c->low_watermark = 1;
+ c->high_watermark = 3;
+ } else if (c->unit_size <= 256) {
+ c->low_watermark = 32;
+ c->high_watermark = 96;
+ } else {
+ /* When page_size == 4k, order-0 cache will have low_mark == 2
+ * and high_mark == 6 with batch alloc of 3 individual pages at
+ * a time.
+ * 8k allocs and above low == 1, high == 3, batch == 1.
+ */
+ c->low_watermark = max(32 * 256 / c->unit_size, 1);
+ c->high_watermark = max(96 * 256 / c->unit_size, 3);
+ }
+ c->batch = max((c->high_watermark - c->low_watermark) / 4 * 3, 1);
+}
+
+static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
+{
+ int cnt = 1;
+
+ /* To avoid consuming memory, for non-percpu allocation, assume that
+ * 1st run of bpf prog won't be doing more than 4 map_update_elem from
+ * irq disabled region if unit size is less than or equal to 256.
+ * For all other cases, let us just do one allocation.
+ */
+ if (!c->percpu_size && c->unit_size <= 256)
+ cnt = 4;
+ alloc_bulk(c, cnt, cpu_to_node(cpu), false);
+}
+
+/* When size != 0 bpf_mem_cache for each cpu.
+ * This is typical bpf hash map use case when all elements have equal size.
+ *
+ * When size == 0 allocate 11 bpf_mem_cache-s for each cpu, then rely on
+ * kmalloc/kfree. Max allocation size is 4096 in this case.
+ * This is bpf_dynptr and bpf_kptr use case.
+ */
+int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
+{
+ struct bpf_mem_caches *cc; struct bpf_mem_caches __percpu *pcc;
+ struct bpf_mem_cache *c; struct bpf_mem_cache __percpu *pc;
+ struct obj_cgroup *objcg = NULL;
+ int cpu, i, unit_size, percpu_size = 0;
+
+ if (percpu && size == 0)
+ return -EINVAL;
+
+ /* room for llist_node and per-cpu pointer */
+ if (percpu)
+ percpu_size = LLIST_NODE_SZ + sizeof(void *);
+ ma->percpu = percpu;
+
+ if (size) {
+ pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL);
+ if (!pc)
+ return -ENOMEM;
+
+ if (!percpu)
+ size += LLIST_NODE_SZ; /* room for llist_node */
+ unit_size = size;
+
+#ifdef CONFIG_MEMCG
+ if (memcg_bpf_enabled())
+ objcg = get_obj_cgroup_from_current();
+#endif
+ ma->objcg = objcg;
+
+ for_each_possible_cpu(cpu) {
+ c = per_cpu_ptr(pc, cpu);
+ c->unit_size = unit_size;
+ c->objcg = objcg;
+ c->percpu_size = percpu_size;
+ c->tgt = c;
+ init_refill_work(c);
+ prefill_mem_cache(c, cpu);
+ }
+ ma->cache = pc;
+ return 0;
+ }
+
+ pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL);
+ if (!pcc)
+ return -ENOMEM;
+#ifdef CONFIG_MEMCG
+ objcg = get_obj_cgroup_from_current();
+#endif
+ ma->objcg = objcg;
+ for_each_possible_cpu(cpu) {
+ cc = per_cpu_ptr(pcc, cpu);
+ for (i = 0; i < NUM_CACHES; i++) {
+ c = &cc->cache[i];
+ c->unit_size = sizes[i];
+ c->objcg = objcg;
+ c->percpu_size = percpu_size;
+ c->tgt = c;
+
+ init_refill_work(c);
+ prefill_mem_cache(c, cpu);
+ }
+ }
+
+ ma->caches = pcc;
+ return 0;
+}
+
+int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg)
+{
+ struct bpf_mem_caches __percpu *pcc;
+
+ pcc = __alloc_percpu_gfp(sizeof(struct bpf_mem_caches), 8, GFP_KERNEL);
+ if (!pcc)
+ return -ENOMEM;
+
+ ma->caches = pcc;
+ ma->objcg = objcg;
+ ma->percpu = true;
+ return 0;
+}
+
+int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size)
+{
+ struct bpf_mem_caches *cc; struct bpf_mem_caches __percpu *pcc;
+ int cpu, i, unit_size, percpu_size;
+ struct obj_cgroup *objcg;
+ struct bpf_mem_cache *c;
+
+ i = bpf_mem_cache_idx(size);
+ if (i < 0)
+ return -EINVAL;
+
+ /* room for llist_node and per-cpu pointer */
+ percpu_size = LLIST_NODE_SZ + sizeof(void *);
+
+ unit_size = sizes[i];
+ objcg = ma->objcg;
+ pcc = ma->caches;
+
+ for_each_possible_cpu(cpu) {
+ cc = per_cpu_ptr(pcc, cpu);
+ c = &cc->cache[i];
+ if (c->unit_size)
+ break;
+
+ c->unit_size = unit_size;
+ c->objcg = objcg;
+ c->percpu_size = percpu_size;
+ c->tgt = c;
+
+ init_refill_work(c);
+ prefill_mem_cache(c, cpu);
+ }
+
+ return 0;
+}
+
+static void drain_mem_cache(struct bpf_mem_cache *c)
+{
+ bool percpu = !!c->percpu_size;
+
+ /* No progs are using this bpf_mem_cache, but htab_map_free() called
+ * bpf_mem_cache_free() for all remaining elements and they can be in
+ * free_by_rcu_ttrace or in waiting_for_gp_ttrace lists, so drain those lists now.
+ *
+ * Except for waiting_for_gp_ttrace list, there are no concurrent operations
+ * on these lists, so it is safe to use __llist_del_all().
+ */
+ free_all(llist_del_all(&c->free_by_rcu_ttrace), percpu);
+ free_all(llist_del_all(&c->waiting_for_gp_ttrace), percpu);
+ free_all(__llist_del_all(&c->free_llist), percpu);
+ free_all(__llist_del_all(&c->free_llist_extra), percpu);
+ free_all(__llist_del_all(&c->free_by_rcu), percpu);
+ free_all(__llist_del_all(&c->free_llist_extra_rcu), percpu);
+ free_all(llist_del_all(&c->waiting_for_gp), percpu);
+}
+
+static void check_mem_cache(struct bpf_mem_cache *c)
+{
+ WARN_ON_ONCE(!llist_empty(&c->free_by_rcu_ttrace));
+ WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp_ttrace));
+ WARN_ON_ONCE(!llist_empty(&c->free_llist));
+ WARN_ON_ONCE(!llist_empty(&c->free_llist_extra));
+ WARN_ON_ONCE(!llist_empty(&c->free_by_rcu));
+ WARN_ON_ONCE(!llist_empty(&c->free_llist_extra_rcu));
+ WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp));
+}
+
+static void check_leaked_objs(struct bpf_mem_alloc *ma)
+{
+ struct bpf_mem_caches *cc;
+ struct bpf_mem_cache *c;
+ int cpu, i;
+
+ if (ma->cache) {
+ for_each_possible_cpu(cpu) {
+ c = per_cpu_ptr(ma->cache, cpu);
+ check_mem_cache(c);
+ }
+ }
+ if (ma->caches) {
+ for_each_possible_cpu(cpu) {
+ cc = per_cpu_ptr(ma->caches, cpu);
+ for (i = 0; i < NUM_CACHES; i++) {
+ c = &cc->cache[i];
+ check_mem_cache(c);
+ }
+ }
+ }
+}
+
+static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
+{
+ check_leaked_objs(ma);
+ free_percpu(ma->cache);
+ free_percpu(ma->caches);
+ ma->cache = NULL;
+ ma->caches = NULL;
+}
+
+static void free_mem_alloc(struct bpf_mem_alloc *ma)
+{
+ /* waiting_for_gp[_ttrace] lists were drained, but RCU callbacks
+ * might still execute. Wait for them.
+ *
+ * rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(),
+ * but rcu_barrier_tasks_trace() and rcu_barrier() below are only used
+ * to wait for the pending __free_rcu_tasks_trace() and __free_rcu(),
+ * so if call_rcu(head, __free_rcu) is skipped due to
+ * rcu_trace_implies_rcu_gp(), it will be OK to skip rcu_barrier() by
+ * using rcu_trace_implies_rcu_gp() as well.
+ */
+ rcu_barrier(); /* wait for __free_by_rcu */
+ rcu_barrier_tasks_trace(); /* wait for __free_rcu */
+ if (!rcu_trace_implies_rcu_gp())
+ rcu_barrier();
+ free_mem_alloc_no_barrier(ma);
+}
+
+static void free_mem_alloc_deferred(struct work_struct *work)
+{
+ struct bpf_mem_alloc *ma = container_of(work, struct bpf_mem_alloc, work);
+
+ free_mem_alloc(ma);
+ kfree(ma);
+}
+
+static void destroy_mem_alloc(struct bpf_mem_alloc *ma, int rcu_in_progress)
+{
+ struct bpf_mem_alloc *copy;
+
+ if (!rcu_in_progress) {
+ /* Fast path. No callbacks are pending, hence no need to do
+ * rcu_barrier-s.
+ */
+ free_mem_alloc_no_barrier(ma);
+ return;
+ }
+
+ copy = kmemdup(ma, sizeof(*ma), GFP_KERNEL);
+ if (!copy) {
+ /* Slow path with inline barrier-s */
+ free_mem_alloc(ma);
+ return;
+ }
+
+ /* Defer barriers into worker to let the rest of map memory to be freed */
+ memset(ma, 0, sizeof(*ma));
+ INIT_WORK(&copy->work, free_mem_alloc_deferred);
+ queue_work(system_dfl_wq, &copy->work);
+}
+
+void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
+{
+ struct bpf_mem_caches *cc;
+ struct bpf_mem_cache *c;
+ int cpu, i, rcu_in_progress;
+
+ if (ma->cache) {
+ rcu_in_progress = 0;
+ for_each_possible_cpu(cpu) {
+ c = per_cpu_ptr(ma->cache, cpu);
+ WRITE_ONCE(c->draining, true);
+ irq_work_sync(&c->refill_work);
+ drain_mem_cache(c);
+ rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
+ rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
+ }
+ obj_cgroup_put(ma->objcg);
+ destroy_mem_alloc(ma, rcu_in_progress);
+ }
+ if (ma->caches) {
+ rcu_in_progress = 0;
+ for_each_possible_cpu(cpu) {
+ cc = per_cpu_ptr(ma->caches, cpu);
+ for (i = 0; i < NUM_CACHES; i++) {
+ c = &cc->cache[i];
+ WRITE_ONCE(c->draining, true);
+ irq_work_sync(&c->refill_work);
+ drain_mem_cache(c);
+ rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
+ rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
+ }
+ }
+ obj_cgroup_put(ma->objcg);
+ destroy_mem_alloc(ma, rcu_in_progress);
+ }
+}
+
+/* notrace is necessary here and in other functions to make sure
+ * bpf programs cannot attach to them and cause llist corruptions.
+ */
+static void notrace *unit_alloc(struct bpf_mem_cache *c)
+{
+ struct llist_node *llnode = NULL;
+ unsigned long flags;
+ int cnt = 0;
+
+ /* Disable irqs to prevent the following race for majority of prog types:
+ * prog_A
+ * bpf_mem_alloc
+ * preemption or irq -> prog_B
+ * bpf_mem_alloc
+ *
+ * but prog_B could be a perf_event NMI prog.
+ * Use per-cpu 'active' counter to order free_list access between
+ * unit_alloc/unit_free/bpf_mem_refill.
+ */
+ local_irq_save(flags);
+ if (local_inc_return(&c->active) == 1) {
+ llnode = __llist_del_first(&c->free_llist);
+ if (llnode) {
+ cnt = --c->free_cnt;
+ *(struct bpf_mem_cache **)llnode = c;
+ }
+ }
+ local_dec(&c->active);
+
+ WARN_ON(cnt < 0);
+
+ if (cnt < c->low_watermark)
+ irq_work_raise(c);
+ /* Enable IRQ after the enqueue of irq work completes, so irq work
+ * will run after IRQ is enabled and free_llist may be refilled by
+ * irq work before other task preempts current task.
+ */
+ local_irq_restore(flags);
+
+ return llnode;
+}
+
+/* Though 'ptr' object could have been allocated on a different cpu
+ * add it to the free_llist of the current cpu.
+ * Let kfree() logic deal with it when it's later called from irq_work.
+ */
+static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
+{
+ struct llist_node *llnode = ptr - LLIST_NODE_SZ;
+ unsigned long flags;
+ int cnt = 0;
+
+ BUILD_BUG_ON(LLIST_NODE_SZ > 8);
+
+ /*
+ * Remember bpf_mem_cache that allocated this object.
+ * The hint is not accurate.
+ */
+ c->tgt = *(struct bpf_mem_cache **)llnode;
+
+ local_irq_save(flags);
+ if (local_inc_return(&c->active) == 1) {
+ __llist_add(llnode, &c->free_llist);
+ cnt = ++c->free_cnt;
+ } else {
+ /* unit_free() cannot fail. Therefore add an object to atomic
+ * llist. free_bulk() will drain it. Though free_llist_extra is
+ * a per-cpu list we have to use atomic llist_add here, since
+ * it also can be interrupted by bpf nmi prog that does another
+ * unit_free() into the same free_llist_extra.
+ */
+ llist_add(llnode, &c->free_llist_extra);
+ }
+ local_dec(&c->active);
+
+ if (cnt > c->high_watermark)
+ /* free few objects from current cpu into global kmalloc pool */
+ irq_work_raise(c);
+ /* Enable IRQ after irq_work_raise() completes, otherwise when current
+ * task is preempted by task which does unit_alloc(), unit_alloc() may
+ * return NULL unexpectedly because irq work is already pending but can
+ * not been triggered and free_llist can not be refilled timely.
+ */
+ local_irq_restore(flags);
+}
+
+static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr)
+{
+ struct llist_node *llnode = ptr - LLIST_NODE_SZ;
+ unsigned long flags;
+
+ c->tgt = *(struct bpf_mem_cache **)llnode;
+
+ local_irq_save(flags);
+ if (local_inc_return(&c->active) == 1) {
+ if (__llist_add(llnode, &c->free_by_rcu))
+ c->free_by_rcu_tail = llnode;
+ } else {
+ llist_add(llnode, &c->free_llist_extra_rcu);
+ }
+ local_dec(&c->active);
+
+ if (!atomic_read(&c->call_rcu_in_progress))
+ irq_work_raise(c);
+ local_irq_restore(flags);
+}
+
+/* Called from BPF program or from sys_bpf syscall.
+ * In both cases migration is disabled.
+ */
+void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size)
+{
+ int idx;
+ void *ret;
+
+ if (!size)
+ return NULL;
+
+ if (!ma->percpu)
+ size += LLIST_NODE_SZ;
+ idx = bpf_mem_cache_idx(size);
+ if (idx < 0)
+ return NULL;
+
+ ret = unit_alloc(this_cpu_ptr(ma->caches)->cache + idx);
+ return !ret ? NULL : ret + LLIST_NODE_SZ;
+}
+
+void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr)
+{
+ struct bpf_mem_cache *c;
+ int idx;
+
+ if (!ptr)
+ return;
+
+ c = *(void **)(ptr - LLIST_NODE_SZ);
+ idx = bpf_mem_cache_idx(c->unit_size);
+ if (WARN_ON_ONCE(idx < 0))
+ return;
+
+ unit_free(this_cpu_ptr(ma->caches)->cache + idx, ptr);
+}
+
+void notrace bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
+{
+ struct bpf_mem_cache *c;
+ int idx;
+
+ if (!ptr)
+ return;
+
+ c = *(void **)(ptr - LLIST_NODE_SZ);
+ idx = bpf_mem_cache_idx(c->unit_size);
+ if (WARN_ON_ONCE(idx < 0))
+ return;
+
+ unit_free_rcu(this_cpu_ptr(ma->caches)->cache + idx, ptr);
+}
+
+void notrace *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma)
+{
+ void *ret;
+
+ ret = unit_alloc(this_cpu_ptr(ma->cache));
+ return !ret ? NULL : ret + LLIST_NODE_SZ;
+}
+
+void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr)
+{
+ if (!ptr)
+ return;
+
+ unit_free(this_cpu_ptr(ma->cache), ptr);
+}
+
+void notrace bpf_mem_cache_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
+{
+ if (!ptr)
+ return;
+
+ unit_free_rcu(this_cpu_ptr(ma->cache), ptr);
+}
+
+/* Directly does a kfree() without putting 'ptr' back to the free_llist
+ * for reuse and without waiting for a rcu_tasks_trace gp.
+ * The caller must first go through the rcu_tasks_trace gp for 'ptr'
+ * before calling bpf_mem_cache_raw_free().
+ * It could be used when the rcu_tasks_trace callback does not have
+ * a hold on the original bpf_mem_alloc object that allocated the
+ * 'ptr'. This should only be used in the uncommon code path.
+ * Otherwise, the bpf_mem_alloc's free_llist cannot be refilled
+ * and may affect performance.
+ */
+void bpf_mem_cache_raw_free(void *ptr)
+{
+ if (!ptr)
+ return;
+
+ kfree(ptr - LLIST_NODE_SZ);
+}
+
+/* When flags == GFP_KERNEL, it signals that the caller will not cause
+ * deadlock when using kmalloc. bpf_mem_cache_alloc_flags() will use
+ * kmalloc if the free_llist is empty.
+ */
+void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags)
+{
+ struct bpf_mem_cache *c;
+ void *ret;
+
+ c = this_cpu_ptr(ma->cache);
+
+ ret = unit_alloc(c);
+ if (!ret && flags == GFP_KERNEL) {
+ struct mem_cgroup *memcg, *old_memcg;
+
+ memcg = get_memcg(c);
+ old_memcg = set_active_memcg(memcg);
+ ret = __alloc(c, NUMA_NO_NODE, GFP_KERNEL | __GFP_NOWARN | __GFP_ACCOUNT);
+ if (ret)
+ *(struct bpf_mem_cache **)ret = c;
+ set_active_memcg(old_memcg);
+ mem_cgroup_put(memcg);
+ }
+
+ return !ret ? NULL : ret + LLIST_NODE_SZ;
+}
+
+int bpf_mem_alloc_check_size(bool percpu, size_t size)
+{
+ /* The size of percpu allocation doesn't have LLIST_NODE_SZ overhead */
+ if ((percpu && size > BPF_MEM_ALLOC_SIZE_MAX) ||
+ (!percpu && size > BPF_MEM_ALLOC_SIZE_MAX - LLIST_NODE_SZ))
+ return -E2BIG;
+
+ return 0;
+}
diff --git a/kernel/bpf/mmap_unlock_work.h b/kernel/bpf/mmap_unlock_work.h
new file mode 100644
index 000000000000..5d18d7d85bef
--- /dev/null
+++ b/kernel/bpf/mmap_unlock_work.h
@@ -0,0 +1,65 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright (c) 2021 Facebook
+ */
+
+#ifndef __MMAP_UNLOCK_WORK_H__
+#define __MMAP_UNLOCK_WORK_H__
+#include <linux/irq_work.h>
+
+/* irq_work to run mmap_read_unlock() in irq_work */
+struct mmap_unlock_irq_work {
+ struct irq_work irq_work;
+ struct mm_struct *mm;
+};
+
+DECLARE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work);
+
+/*
+ * We cannot do mmap_read_unlock() when the irq is disabled, because of
+ * risk to deadlock with rq_lock. To look up vma when the irqs are
+ * disabled, we need to run mmap_read_unlock() in irq_work. We use a
+ * percpu variable to do the irq_work. If the irq_work is already used
+ * by another lookup, we fall over.
+ */
+static inline bool bpf_mmap_unlock_get_irq_work(struct mmap_unlock_irq_work **work_ptr)
+{
+ struct mmap_unlock_irq_work *work = NULL;
+ bool irq_work_busy = false;
+
+ if (irqs_disabled()) {
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ work = this_cpu_ptr(&mmap_unlock_work);
+ if (irq_work_is_busy(&work->irq_work)) {
+ /* cannot queue more up_read, fallback */
+ irq_work_busy = true;
+ }
+ } else {
+ /*
+ * PREEMPT_RT does not allow to trylock mmap sem in
+ * interrupt disabled context. Force the fallback code.
+ */
+ irq_work_busy = true;
+ }
+ }
+
+ *work_ptr = work;
+ return irq_work_busy;
+}
+
+static inline void bpf_mmap_unlock_mm(struct mmap_unlock_irq_work *work, struct mm_struct *mm)
+{
+ if (!work) {
+ mmap_read_unlock(mm);
+ } else {
+ work->mm = mm;
+
+ /* The lock will be released once we're out of interrupt
+ * context. Tell lockdep that we've released it now so
+ * it doesn't complain that we forgot to release it.
+ */
+ rwsem_release(&mm->mmap_lock.dep_map, _RET_IP_);
+ irq_work_queue(&work->irq_work);
+ }
+}
+
+#endif /* __MMAP_UNLOCK_WORK_H__ */
diff --git a/kernel/bpf/mprog.c b/kernel/bpf/mprog.c
new file mode 100644
index 000000000000..1394168062e8
--- /dev/null
+++ b/kernel/bpf/mprog.c
@@ -0,0 +1,452 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2023 Isovalent */
+
+#include <linux/bpf.h>
+#include <linux/bpf_mprog.h>
+
+static int bpf_mprog_link(struct bpf_tuple *tuple,
+ u32 id_or_fd, u32 flags,
+ enum bpf_prog_type type)
+{
+ struct bpf_link *link = ERR_PTR(-EINVAL);
+ bool id = flags & BPF_F_ID;
+
+ if (id)
+ link = bpf_link_by_id(id_or_fd);
+ else if (id_or_fd)
+ link = bpf_link_get_from_fd(id_or_fd);
+ if (IS_ERR(link))
+ return PTR_ERR(link);
+ if (type && link->prog->type != type) {
+ bpf_link_put(link);
+ return -EINVAL;
+ }
+
+ tuple->link = link;
+ tuple->prog = link->prog;
+ return 0;
+}
+
+static int bpf_mprog_prog(struct bpf_tuple *tuple,
+ u32 id_or_fd, u32 flags,
+ enum bpf_prog_type type)
+{
+ struct bpf_prog *prog = ERR_PTR(-EINVAL);
+ bool id = flags & BPF_F_ID;
+
+ if (id)
+ prog = bpf_prog_by_id(id_or_fd);
+ else if (id_or_fd)
+ prog = bpf_prog_get(id_or_fd);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+ if (type && prog->type != type) {
+ bpf_prog_put(prog);
+ return -EINVAL;
+ }
+
+ tuple->link = NULL;
+ tuple->prog = prog;
+ return 0;
+}
+
+static int bpf_mprog_tuple_relative(struct bpf_tuple *tuple,
+ u32 id_or_fd, u32 flags,
+ enum bpf_prog_type type)
+{
+ bool link = flags & BPF_F_LINK;
+ bool id = flags & BPF_F_ID;
+
+ memset(tuple, 0, sizeof(*tuple));
+ if (link)
+ return bpf_mprog_link(tuple, id_or_fd, flags, type);
+ /* If no relevant flag is set and no id_or_fd was passed, then
+ * tuple link/prog is just NULLed. This is the case when before/
+ * after selects first/last position without passing fd.
+ */
+ if (!id && !id_or_fd)
+ return 0;
+ return bpf_mprog_prog(tuple, id_or_fd, flags, type);
+}
+
+static void bpf_mprog_tuple_put(struct bpf_tuple *tuple)
+{
+ if (tuple->link)
+ bpf_link_put(tuple->link);
+ else if (tuple->prog)
+ bpf_prog_put(tuple->prog);
+}
+
+/* The bpf_mprog_{replace,delete}() operate on exact idx position with the
+ * one exception that for deletion we support delete from front/back. In
+ * case of front idx is -1, in case of back idx is bpf_mprog_total(entry).
+ * Adjustment to first and last entry is trivial. The bpf_mprog_insert()
+ * we have to deal with the following cases:
+ *
+ * idx + before:
+ *
+ * Insert P4 before P3: idx for old array is 1, idx for new array is 2,
+ * hence we adjust target idx for the new array, so that memmove copies
+ * P1 and P2 to the new entry, and we insert P4 into idx 2. Inserting
+ * before P1 would have old idx -1 and new idx 0.
+ *
+ * +--+--+--+ +--+--+--+--+ +--+--+--+--+
+ * |P1|P2|P3| ==> |P1|P2| |P3| ==> |P1|P2|P4|P3|
+ * +--+--+--+ +--+--+--+--+ +--+--+--+--+
+ *
+ * idx + after:
+ *
+ * Insert P4 after P2: idx for old array is 2, idx for new array is 2.
+ * Again, memmove copies P1 and P2 to the new entry, and we insert P4
+ * into idx 2. Inserting after P3 would have both old/new idx at 4 aka
+ * bpf_mprog_total(entry).
+ *
+ * +--+--+--+ +--+--+--+--+ +--+--+--+--+
+ * |P1|P2|P3| ==> |P1|P2| |P3| ==> |P1|P2|P4|P3|
+ * +--+--+--+ +--+--+--+--+ +--+--+--+--+
+ */
+static int bpf_mprog_replace(struct bpf_mprog_entry *entry,
+ struct bpf_mprog_entry **entry_new,
+ struct bpf_tuple *ntuple, int idx)
+{
+ struct bpf_mprog_fp *fp;
+ struct bpf_mprog_cp *cp;
+ struct bpf_prog *oprog;
+
+ bpf_mprog_read(entry, idx, &fp, &cp);
+ oprog = READ_ONCE(fp->prog);
+ bpf_mprog_write(fp, cp, ntuple);
+ if (!ntuple->link) {
+ WARN_ON_ONCE(cp->link);
+ bpf_prog_put(oprog);
+ }
+ *entry_new = entry;
+ return 0;
+}
+
+static int bpf_mprog_insert(struct bpf_mprog_entry *entry,
+ struct bpf_mprog_entry **entry_new,
+ struct bpf_tuple *ntuple, int idx, u32 flags)
+{
+ int total = bpf_mprog_total(entry);
+ struct bpf_mprog_entry *peer;
+ struct bpf_mprog_fp *fp;
+ struct bpf_mprog_cp *cp;
+
+ peer = bpf_mprog_peer(entry);
+ bpf_mprog_entry_copy(peer, entry);
+ if (idx == total)
+ goto insert;
+ else if (flags & BPF_F_BEFORE)
+ idx += 1;
+ bpf_mprog_entry_grow(peer, idx);
+insert:
+ bpf_mprog_read(peer, idx, &fp, &cp);
+ bpf_mprog_write(fp, cp, ntuple);
+ bpf_mprog_inc(peer);
+ *entry_new = peer;
+ return 0;
+}
+
+static int bpf_mprog_delete(struct bpf_mprog_entry *entry,
+ struct bpf_mprog_entry **entry_new,
+ struct bpf_tuple *dtuple, int idx)
+{
+ int total = bpf_mprog_total(entry);
+ struct bpf_mprog_entry *peer;
+
+ peer = bpf_mprog_peer(entry);
+ bpf_mprog_entry_copy(peer, entry);
+ if (idx == -1)
+ idx = 0;
+ else if (idx == total)
+ idx = total - 1;
+ bpf_mprog_entry_shrink(peer, idx);
+ bpf_mprog_dec(peer);
+ bpf_mprog_mark_for_release(peer, dtuple);
+ *entry_new = peer;
+ return 0;
+}
+
+/* In bpf_mprog_pos_*() we evaluate the target position for the BPF
+ * program/link that needs to be replaced, inserted or deleted for
+ * each "rule" independently. If all rules agree on that position
+ * or existing element, then enact replacement, addition or deletion.
+ * If this is not the case, then the request cannot be satisfied and
+ * we bail out with an error.
+ */
+static int bpf_mprog_pos_exact(struct bpf_mprog_entry *entry,
+ struct bpf_tuple *tuple)
+{
+ struct bpf_mprog_fp *fp;
+ struct bpf_mprog_cp *cp;
+ int i;
+
+ for (i = 0; i < bpf_mprog_total(entry); i++) {
+ bpf_mprog_read(entry, i, &fp, &cp);
+ if (tuple->prog == READ_ONCE(fp->prog))
+ return tuple->link == cp->link ? i : -EBUSY;
+ }
+ return -ENOENT;
+}
+
+static int bpf_mprog_pos_before(struct bpf_mprog_entry *entry,
+ struct bpf_tuple *tuple)
+{
+ struct bpf_mprog_fp *fp;
+ struct bpf_mprog_cp *cp;
+ int i;
+
+ for (i = 0; i < bpf_mprog_total(entry); i++) {
+ bpf_mprog_read(entry, i, &fp, &cp);
+ if (tuple->prog == READ_ONCE(fp->prog) &&
+ (!tuple->link || tuple->link == cp->link))
+ return i - 1;
+ }
+ return tuple->prog ? -ENOENT : -1;
+}
+
+static int bpf_mprog_pos_after(struct bpf_mprog_entry *entry,
+ struct bpf_tuple *tuple)
+{
+ struct bpf_mprog_fp *fp;
+ struct bpf_mprog_cp *cp;
+ int i;
+
+ for (i = 0; i < bpf_mprog_total(entry); i++) {
+ bpf_mprog_read(entry, i, &fp, &cp);
+ if (tuple->prog == READ_ONCE(fp->prog) &&
+ (!tuple->link || tuple->link == cp->link))
+ return i + 1;
+ }
+ return tuple->prog ? -ENOENT : bpf_mprog_total(entry);
+}
+
+int bpf_mprog_attach(struct bpf_mprog_entry *entry,
+ struct bpf_mprog_entry **entry_new,
+ struct bpf_prog *prog_new, struct bpf_link *link,
+ struct bpf_prog *prog_old,
+ u32 flags, u32 id_or_fd, u64 revision)
+{
+ struct bpf_tuple rtuple, ntuple = {
+ .prog = prog_new,
+ .link = link,
+ }, otuple = {
+ .prog = prog_old,
+ .link = link,
+ };
+ int ret, idx = -ERANGE, tidx;
+
+ if (revision && revision != bpf_mprog_revision(entry))
+ return -ESTALE;
+ if (bpf_mprog_exists(entry, prog_new))
+ return -EEXIST;
+ ret = bpf_mprog_tuple_relative(&rtuple, id_or_fd,
+ flags & ~BPF_F_REPLACE,
+ prog_new->type);
+ if (ret)
+ return ret;
+ if (flags & BPF_F_REPLACE) {
+ tidx = bpf_mprog_pos_exact(entry, &otuple);
+ if (tidx < 0) {
+ ret = tidx;
+ goto out;
+ }
+ idx = tidx;
+ } else if (bpf_mprog_total(entry) == bpf_mprog_max()) {
+ ret = -ERANGE;
+ goto out;
+ }
+ if (flags & BPF_F_BEFORE) {
+ tidx = bpf_mprog_pos_before(entry, &rtuple);
+ if (tidx < -1 || (idx >= -1 && tidx != idx)) {
+ ret = tidx < -1 ? tidx : -ERANGE;
+ goto out;
+ }
+ idx = tidx;
+ }
+ if (flags & BPF_F_AFTER) {
+ tidx = bpf_mprog_pos_after(entry, &rtuple);
+ if (tidx < -1 || (idx >= -1 && tidx != idx)) {
+ ret = tidx < 0 ? tidx : -ERANGE;
+ goto out;
+ }
+ idx = tidx;
+ }
+ if (idx < -1) {
+ if (rtuple.prog || flags) {
+ ret = -EINVAL;
+ goto out;
+ }
+ idx = bpf_mprog_total(entry);
+ flags = BPF_F_AFTER;
+ }
+ if (idx >= bpf_mprog_max()) {
+ ret = -ERANGE;
+ goto out;
+ }
+ if (flags & BPF_F_REPLACE)
+ ret = bpf_mprog_replace(entry, entry_new, &ntuple, idx);
+ else
+ ret = bpf_mprog_insert(entry, entry_new, &ntuple, idx, flags);
+out:
+ bpf_mprog_tuple_put(&rtuple);
+ return ret;
+}
+
+static int bpf_mprog_fetch(struct bpf_mprog_entry *entry,
+ struct bpf_tuple *tuple, int idx)
+{
+ int total = bpf_mprog_total(entry);
+ struct bpf_mprog_cp *cp;
+ struct bpf_mprog_fp *fp;
+ struct bpf_prog *prog;
+ struct bpf_link *link;
+
+ if (idx == -1)
+ idx = 0;
+ else if (idx == total)
+ idx = total - 1;
+ bpf_mprog_read(entry, idx, &fp, &cp);
+ prog = READ_ONCE(fp->prog);
+ link = cp->link;
+ /* The deletion request can either be without filled tuple in which
+ * case it gets populated here based on idx, or with filled tuple
+ * where the only thing we end up doing is the WARN_ON_ONCE() assert.
+ * If we hit a BPF link at the given index, it must not be removed
+ * from opts path.
+ */
+ if (link && !tuple->link)
+ return -EBUSY;
+ WARN_ON_ONCE(tuple->prog && tuple->prog != prog);
+ WARN_ON_ONCE(tuple->link && tuple->link != link);
+ tuple->prog = prog;
+ tuple->link = link;
+ return 0;
+}
+
+int bpf_mprog_detach(struct bpf_mprog_entry *entry,
+ struct bpf_mprog_entry **entry_new,
+ struct bpf_prog *prog, struct bpf_link *link,
+ u32 flags, u32 id_or_fd, u64 revision)
+{
+ struct bpf_tuple rtuple, dtuple = {
+ .prog = prog,
+ .link = link,
+ };
+ int ret, idx = -ERANGE, tidx;
+
+ if (flags & BPF_F_REPLACE)
+ return -EINVAL;
+ if (revision && revision != bpf_mprog_revision(entry))
+ return -ESTALE;
+ if (!bpf_mprog_total(entry))
+ return -ENOENT;
+ ret = bpf_mprog_tuple_relative(&rtuple, id_or_fd, flags,
+ prog ? prog->type :
+ BPF_PROG_TYPE_UNSPEC);
+ if (ret)
+ return ret;
+ if (dtuple.prog) {
+ tidx = bpf_mprog_pos_exact(entry, &dtuple);
+ if (tidx < 0) {
+ ret = tidx;
+ goto out;
+ }
+ idx = tidx;
+ }
+ if (flags & BPF_F_BEFORE) {
+ tidx = bpf_mprog_pos_before(entry, &rtuple);
+ if (tidx < -1 || (idx >= -1 && tidx != idx)) {
+ ret = tidx < -1 ? tidx : -ERANGE;
+ goto out;
+ }
+ idx = tidx;
+ }
+ if (flags & BPF_F_AFTER) {
+ tidx = bpf_mprog_pos_after(entry, &rtuple);
+ if (tidx < -1 || (idx >= -1 && tidx != idx)) {
+ ret = tidx < 0 ? tidx : -ERANGE;
+ goto out;
+ }
+ idx = tidx;
+ }
+ if (idx < -1) {
+ if (rtuple.prog || flags) {
+ ret = -EINVAL;
+ goto out;
+ }
+ idx = bpf_mprog_total(entry);
+ flags = BPF_F_AFTER;
+ }
+ if (idx >= bpf_mprog_max()) {
+ ret = -ERANGE;
+ goto out;
+ }
+ ret = bpf_mprog_fetch(entry, &dtuple, idx);
+ if (ret)
+ goto out;
+ ret = bpf_mprog_delete(entry, entry_new, &dtuple, idx);
+out:
+ bpf_mprog_tuple_put(&rtuple);
+ return ret;
+}
+
+int bpf_mprog_query(const union bpf_attr *attr, union bpf_attr __user *uattr,
+ struct bpf_mprog_entry *entry)
+{
+ u32 __user *uprog_flags, *ulink_flags;
+ u32 __user *uprog_id, *ulink_id;
+ struct bpf_mprog_fp *fp;
+ struct bpf_mprog_cp *cp;
+ struct bpf_prog *prog;
+ const u32 flags = 0;
+ u32 id, count = 0;
+ u64 revision = 1;
+ int i, ret = 0;
+
+ if (attr->query.query_flags || attr->query.attach_flags)
+ return -EINVAL;
+ if (entry) {
+ revision = bpf_mprog_revision(entry);
+ count = bpf_mprog_total(entry);
+ }
+ if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
+ return -EFAULT;
+ if (copy_to_user(&uattr->query.revision, &revision, sizeof(revision)))
+ return -EFAULT;
+ if (copy_to_user(&uattr->query.count, &count, sizeof(count)))
+ return -EFAULT;
+ uprog_id = u64_to_user_ptr(attr->query.prog_ids);
+ uprog_flags = u64_to_user_ptr(attr->query.prog_attach_flags);
+ ulink_id = u64_to_user_ptr(attr->query.link_ids);
+ ulink_flags = u64_to_user_ptr(attr->query.link_attach_flags);
+ if (attr->query.count == 0 || !uprog_id || !count)
+ return 0;
+ if (attr->query.count < count) {
+ count = attr->query.count;
+ ret = -ENOSPC;
+ }
+ for (i = 0; i < bpf_mprog_max(); i++) {
+ bpf_mprog_read(entry, i, &fp, &cp);
+ prog = READ_ONCE(fp->prog);
+ if (!prog)
+ break;
+ id = prog->aux->id;
+ if (copy_to_user(uprog_id + i, &id, sizeof(id)))
+ return -EFAULT;
+ if (uprog_flags &&
+ copy_to_user(uprog_flags + i, &flags, sizeof(flags)))
+ return -EFAULT;
+ id = cp->link ? cp->link->id : 0;
+ if (ulink_id &&
+ copy_to_user(ulink_id + i, &id, sizeof(id)))
+ return -EFAULT;
+ if (ulink_flags &&
+ copy_to_user(ulink_flags + i, &flags, sizeof(flags)))
+ return -EFAULT;
+ if (i + 1 == count)
+ break;
+ }
+ return ret;
+}
diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c
new file mode 100644
index 000000000000..8e88201c98bf
--- /dev/null
+++ b/kernel/bpf/net_namespace.c
@@ -0,0 +1,565 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/bpf.h>
+#include <linux/bpf-netns.h>
+#include <linux/filter.h>
+#include <net/net_namespace.h>
+
+/*
+ * Functions to manage BPF programs attached to netns
+ */
+
+struct bpf_netns_link {
+ struct bpf_link link;
+
+ /* We don't hold a ref to net in order to auto-detach the link
+ * when netns is going away. Instead we rely on pernet
+ * pre_exit callback to clear this pointer. Must be accessed
+ * with netns_bpf_mutex held.
+ */
+ struct net *net;
+ struct list_head node; /* node in list of links attached to net */
+ enum netns_bpf_attach_type netns_type;
+};
+
+/* Protects updates to netns_bpf */
+DEFINE_MUTEX(netns_bpf_mutex);
+
+static void netns_bpf_attach_type_unneed(enum netns_bpf_attach_type type)
+{
+ switch (type) {
+#ifdef CONFIG_INET
+ case NETNS_BPF_SK_LOOKUP:
+ static_branch_dec(&bpf_sk_lookup_enabled);
+ break;
+#endif
+ default:
+ break;
+ }
+}
+
+static void netns_bpf_attach_type_need(enum netns_bpf_attach_type type)
+{
+ switch (type) {
+#ifdef CONFIG_INET
+ case NETNS_BPF_SK_LOOKUP:
+ static_branch_inc(&bpf_sk_lookup_enabled);
+ break;
+#endif
+ default:
+ break;
+ }
+}
+
+/* Must be called with netns_bpf_mutex held. */
+static void netns_bpf_run_array_detach(struct net *net,
+ enum netns_bpf_attach_type type)
+{
+ struct bpf_prog_array *run_array;
+
+ run_array = rcu_replace_pointer(net->bpf.run_array[type], NULL,
+ lockdep_is_held(&netns_bpf_mutex));
+ bpf_prog_array_free(run_array);
+}
+
+static int link_index(struct net *net, enum netns_bpf_attach_type type,
+ struct bpf_netns_link *link)
+{
+ struct bpf_netns_link *pos;
+ int i = 0;
+
+ list_for_each_entry(pos, &net->bpf.links[type], node) {
+ if (pos == link)
+ return i;
+ i++;
+ }
+ return -ENOENT;
+}
+
+static int link_count(struct net *net, enum netns_bpf_attach_type type)
+{
+ struct list_head *pos;
+ int i = 0;
+
+ list_for_each(pos, &net->bpf.links[type])
+ i++;
+ return i;
+}
+
+static void fill_prog_array(struct net *net, enum netns_bpf_attach_type type,
+ struct bpf_prog_array *prog_array)
+{
+ struct bpf_netns_link *pos;
+ unsigned int i = 0;
+
+ list_for_each_entry(pos, &net->bpf.links[type], node) {
+ prog_array->items[i].prog = pos->link.prog;
+ i++;
+ }
+}
+
+static void bpf_netns_link_release(struct bpf_link *link)
+{
+ struct bpf_netns_link *net_link =
+ container_of(link, struct bpf_netns_link, link);
+ enum netns_bpf_attach_type type = net_link->netns_type;
+ struct bpf_prog_array *old_array, *new_array;
+ struct net *net;
+ int cnt, idx;
+
+ mutex_lock(&netns_bpf_mutex);
+
+ /* We can race with cleanup_net, but if we see a non-NULL
+ * struct net pointer, pre_exit has not run yet and wait for
+ * netns_bpf_mutex.
+ */
+ net = net_link->net;
+ if (!net)
+ goto out_unlock;
+
+ /* Mark attach point as unused */
+ netns_bpf_attach_type_unneed(type);
+
+ /* Remember link position in case of safe delete */
+ idx = link_index(net, type, net_link);
+ list_del(&net_link->node);
+
+ cnt = link_count(net, type);
+ if (!cnt) {
+ netns_bpf_run_array_detach(net, type);
+ goto out_unlock;
+ }
+
+ old_array = rcu_dereference_protected(net->bpf.run_array[type],
+ lockdep_is_held(&netns_bpf_mutex));
+ new_array = bpf_prog_array_alloc(cnt, GFP_KERNEL);
+ if (!new_array) {
+ WARN_ON(bpf_prog_array_delete_safe_at(old_array, idx));
+ goto out_unlock;
+ }
+ fill_prog_array(net, type, new_array);
+ rcu_assign_pointer(net->bpf.run_array[type], new_array);
+ bpf_prog_array_free(old_array);
+
+out_unlock:
+ net_link->net = NULL;
+ mutex_unlock(&netns_bpf_mutex);
+}
+
+static int bpf_netns_link_detach(struct bpf_link *link)
+{
+ bpf_netns_link_release(link);
+ return 0;
+}
+
+static void bpf_netns_link_dealloc(struct bpf_link *link)
+{
+ struct bpf_netns_link *net_link =
+ container_of(link, struct bpf_netns_link, link);
+
+ kfree(net_link);
+}
+
+static int bpf_netns_link_update_prog(struct bpf_link *link,
+ struct bpf_prog *new_prog,
+ struct bpf_prog *old_prog)
+{
+ struct bpf_netns_link *net_link =
+ container_of(link, struct bpf_netns_link, link);
+ enum netns_bpf_attach_type type = net_link->netns_type;
+ struct bpf_prog_array *run_array;
+ struct net *net;
+ int idx, ret;
+
+ if (old_prog && old_prog != link->prog)
+ return -EPERM;
+ if (new_prog->type != link->prog->type)
+ return -EINVAL;
+
+ mutex_lock(&netns_bpf_mutex);
+
+ net = net_link->net;
+ if (!net || !check_net(net)) {
+ /* Link auto-detached or netns dying */
+ ret = -ENOLINK;
+ goto out_unlock;
+ }
+
+ run_array = rcu_dereference_protected(net->bpf.run_array[type],
+ lockdep_is_held(&netns_bpf_mutex));
+ idx = link_index(net, type, net_link);
+ ret = bpf_prog_array_update_at(run_array, idx, new_prog);
+ if (ret)
+ goto out_unlock;
+
+ old_prog = xchg(&link->prog, new_prog);
+ bpf_prog_put(old_prog);
+
+out_unlock:
+ mutex_unlock(&netns_bpf_mutex);
+ return ret;
+}
+
+static int bpf_netns_link_fill_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ const struct bpf_netns_link *net_link =
+ container_of(link, struct bpf_netns_link, link);
+ unsigned int inum = 0;
+ struct net *net;
+
+ mutex_lock(&netns_bpf_mutex);
+ net = net_link->net;
+ if (net && check_net(net))
+ inum = net->ns.inum;
+ mutex_unlock(&netns_bpf_mutex);
+
+ info->netns.netns_ino = inum;
+ info->netns.attach_type = link->attach_type;
+ return 0;
+}
+
+static void bpf_netns_link_show_fdinfo(const struct bpf_link *link,
+ struct seq_file *seq)
+{
+ struct bpf_link_info info = {};
+
+ bpf_netns_link_fill_info(link, &info);
+ seq_printf(seq,
+ "netns_ino:\t%u\n"
+ "attach_type:\t%u\n",
+ info.netns.netns_ino,
+ link->attach_type);
+}
+
+static const struct bpf_link_ops bpf_netns_link_ops = {
+ .release = bpf_netns_link_release,
+ .dealloc = bpf_netns_link_dealloc,
+ .detach = bpf_netns_link_detach,
+ .update_prog = bpf_netns_link_update_prog,
+ .fill_link_info = bpf_netns_link_fill_info,
+ .show_fdinfo = bpf_netns_link_show_fdinfo,
+};
+
+/* Must be called with netns_bpf_mutex held. */
+static int __netns_bpf_prog_query(const union bpf_attr *attr,
+ union bpf_attr __user *uattr,
+ struct net *net,
+ enum netns_bpf_attach_type type)
+{
+ __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
+ struct bpf_prog_array *run_array;
+ u32 prog_cnt = 0, flags = 0;
+
+ run_array = rcu_dereference_protected(net->bpf.run_array[type],
+ lockdep_is_held(&netns_bpf_mutex));
+ if (run_array)
+ prog_cnt = bpf_prog_array_length(run_array);
+
+ if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
+ return -EFAULT;
+ if (copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt)))
+ return -EFAULT;
+ if (!attr->query.prog_cnt || !prog_ids || !prog_cnt)
+ return 0;
+
+ return bpf_prog_array_copy_to_user(run_array, prog_ids,
+ attr->query.prog_cnt);
+}
+
+int netns_bpf_prog_query(const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ enum netns_bpf_attach_type type;
+ struct net *net;
+ int ret;
+
+ if (attr->query.query_flags)
+ return -EINVAL;
+
+ type = to_netns_bpf_attach_type(attr->query.attach_type);
+ if (type < 0)
+ return -EINVAL;
+
+ net = get_net_ns_by_fd(attr->query.target_fd);
+ if (IS_ERR(net))
+ return PTR_ERR(net);
+
+ mutex_lock(&netns_bpf_mutex);
+ ret = __netns_bpf_prog_query(attr, uattr, net, type);
+ mutex_unlock(&netns_bpf_mutex);
+
+ put_net(net);
+ return ret;
+}
+
+int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ struct bpf_prog_array *run_array;
+ enum netns_bpf_attach_type type;
+ struct bpf_prog *attached;
+ struct net *net;
+ int ret;
+
+ if (attr->target_fd || attr->attach_flags || attr->replace_bpf_fd)
+ return -EINVAL;
+
+ type = to_netns_bpf_attach_type(attr->attach_type);
+ if (type < 0)
+ return -EINVAL;
+
+ net = current->nsproxy->net_ns;
+ mutex_lock(&netns_bpf_mutex);
+
+ /* Attaching prog directly is not compatible with links */
+ if (!list_empty(&net->bpf.links[type])) {
+ ret = -EEXIST;
+ goto out_unlock;
+ }
+
+ switch (type) {
+ case NETNS_BPF_FLOW_DISSECTOR:
+ ret = flow_dissector_bpf_prog_attach_check(net, prog);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ if (ret)
+ goto out_unlock;
+
+ attached = net->bpf.progs[type];
+ if (attached == prog) {
+ /* The same program cannot be attached twice */
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ run_array = rcu_dereference_protected(net->bpf.run_array[type],
+ lockdep_is_held(&netns_bpf_mutex));
+ if (run_array) {
+ WRITE_ONCE(run_array->items[0].prog, prog);
+ } else {
+ run_array = bpf_prog_array_alloc(1, GFP_KERNEL);
+ if (!run_array) {
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
+ run_array->items[0].prog = prog;
+ rcu_assign_pointer(net->bpf.run_array[type], run_array);
+ }
+
+ net->bpf.progs[type] = prog;
+ if (attached)
+ bpf_prog_put(attached);
+
+out_unlock:
+ mutex_unlock(&netns_bpf_mutex);
+
+ return ret;
+}
+
+/* Must be called with netns_bpf_mutex held. */
+static int __netns_bpf_prog_detach(struct net *net,
+ enum netns_bpf_attach_type type,
+ struct bpf_prog *old)
+{
+ struct bpf_prog *attached;
+
+ /* Progs attached via links cannot be detached */
+ if (!list_empty(&net->bpf.links[type]))
+ return -EINVAL;
+
+ attached = net->bpf.progs[type];
+ if (!attached || attached != old)
+ return -ENOENT;
+ netns_bpf_run_array_detach(net, type);
+ net->bpf.progs[type] = NULL;
+ bpf_prog_put(attached);
+ return 0;
+}
+
+int netns_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)
+{
+ enum netns_bpf_attach_type type;
+ struct bpf_prog *prog;
+ int ret;
+
+ if (attr->target_fd)
+ return -EINVAL;
+
+ type = to_netns_bpf_attach_type(attr->attach_type);
+ if (type < 0)
+ return -EINVAL;
+
+ prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ mutex_lock(&netns_bpf_mutex);
+ ret = __netns_bpf_prog_detach(current->nsproxy->net_ns, type, prog);
+ mutex_unlock(&netns_bpf_mutex);
+
+ bpf_prog_put(prog);
+
+ return ret;
+}
+
+static int netns_bpf_max_progs(enum netns_bpf_attach_type type)
+{
+ switch (type) {
+ case NETNS_BPF_FLOW_DISSECTOR:
+ return 1;
+ case NETNS_BPF_SK_LOOKUP:
+ return 64;
+ default:
+ return 0;
+ }
+}
+
+static int netns_bpf_link_attach(struct net *net, struct bpf_link *link,
+ enum netns_bpf_attach_type type)
+{
+ struct bpf_netns_link *net_link =
+ container_of(link, struct bpf_netns_link, link);
+ struct bpf_prog_array *run_array;
+ int cnt, err;
+
+ mutex_lock(&netns_bpf_mutex);
+
+ cnt = link_count(net, type);
+ if (cnt >= netns_bpf_max_progs(type)) {
+ err = -E2BIG;
+ goto out_unlock;
+ }
+ /* Links are not compatible with attaching prog directly */
+ if (net->bpf.progs[type]) {
+ err = -EEXIST;
+ goto out_unlock;
+ }
+
+ switch (type) {
+ case NETNS_BPF_FLOW_DISSECTOR:
+ err = flow_dissector_bpf_prog_attach_check(net, link->prog);
+ break;
+ case NETNS_BPF_SK_LOOKUP:
+ err = 0; /* nothing to check */
+ break;
+ default:
+ err = -EINVAL;
+ break;
+ }
+ if (err)
+ goto out_unlock;
+
+ run_array = bpf_prog_array_alloc(cnt + 1, GFP_KERNEL);
+ if (!run_array) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+
+ list_add_tail(&net_link->node, &net->bpf.links[type]);
+
+ fill_prog_array(net, type, run_array);
+ run_array = rcu_replace_pointer(net->bpf.run_array[type], run_array,
+ lockdep_is_held(&netns_bpf_mutex));
+ bpf_prog_array_free(run_array);
+
+ /* Mark attach point as used */
+ netns_bpf_attach_type_need(type);
+
+out_unlock:
+ mutex_unlock(&netns_bpf_mutex);
+ return err;
+}
+
+int netns_bpf_link_create(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ enum netns_bpf_attach_type netns_type;
+ struct bpf_link_primer link_primer;
+ struct bpf_netns_link *net_link;
+ enum bpf_attach_type type;
+ struct net *net;
+ int err;
+
+ if (attr->link_create.flags)
+ return -EINVAL;
+
+ type = attr->link_create.attach_type;
+ netns_type = to_netns_bpf_attach_type(type);
+ if (netns_type < 0)
+ return -EINVAL;
+
+ net = get_net_ns_by_fd(attr->link_create.target_fd);
+ if (IS_ERR(net))
+ return PTR_ERR(net);
+
+ net_link = kzalloc(sizeof(*net_link), GFP_USER);
+ if (!net_link) {
+ err = -ENOMEM;
+ goto out_put_net;
+ }
+ bpf_link_init(&net_link->link, BPF_LINK_TYPE_NETNS,
+ &bpf_netns_link_ops, prog, type);
+ net_link->net = net;
+ net_link->netns_type = netns_type;
+
+ err = bpf_link_prime(&net_link->link, &link_primer);
+ if (err) {
+ kfree(net_link);
+ goto out_put_net;
+ }
+
+ err = netns_bpf_link_attach(net, &net_link->link, netns_type);
+ if (err) {
+ bpf_link_cleanup(&link_primer);
+ goto out_put_net;
+ }
+
+ put_net(net);
+ return bpf_link_settle(&link_primer);
+
+out_put_net:
+ put_net(net);
+ return err;
+}
+
+static int __net_init netns_bpf_pernet_init(struct net *net)
+{
+ int type;
+
+ for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++)
+ INIT_LIST_HEAD(&net->bpf.links[type]);
+
+ return 0;
+}
+
+static void __net_exit netns_bpf_pernet_pre_exit(struct net *net)
+{
+ enum netns_bpf_attach_type type;
+ struct bpf_netns_link *net_link;
+
+ mutex_lock(&netns_bpf_mutex);
+ for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) {
+ netns_bpf_run_array_detach(net, type);
+ list_for_each_entry(net_link, &net->bpf.links[type], node) {
+ net_link->net = NULL; /* auto-detach link */
+ netns_bpf_attach_type_unneed(type);
+ }
+ if (net->bpf.progs[type])
+ bpf_prog_put(net->bpf.progs[type]);
+ }
+ mutex_unlock(&netns_bpf_mutex);
+}
+
+static struct pernet_operations netns_bpf_pernet_ops __net_initdata = {
+ .init = netns_bpf_pernet_init,
+ .pre_exit = netns_bpf_pernet_pre_exit,
+};
+
+static int __init netns_bpf_init(void)
+{
+ return register_pernet_subsys(&netns_bpf_pernet_ops);
+}
+
+subsys_initcall(netns_bpf_init);
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
new file mode 100644
index 000000000000..42ae8d595c2c
--- /dev/null
+++ b/kernel/bpf/offload.c
@@ -0,0 +1,878 @@
+/*
+ * Copyright (C) 2017-2018 Netronome Systems, Inc.
+ *
+ * This software is licensed under the GNU General License Version 2,
+ * June 1991 as shown in the file COPYING in the top-level directory of this
+ * source tree.
+ *
+ * THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS"
+ * WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,
+ * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE
+ * OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME
+ * THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+ */
+
+#include <linux/bpf.h>
+#include <linux/bpf_verifier.h>
+#include <linux/bug.h>
+#include <linux/kdev_t.h>
+#include <linux/list.h>
+#include <linux/lockdep.h>
+#include <linux/netdevice.h>
+#include <linux/printk.h>
+#include <linux/proc_ns.h>
+#include <linux/rhashtable.h>
+#include <linux/rtnetlink.h>
+#include <linux/rwsem.h>
+#include <net/netdev_lock.h>
+#include <net/xdp.h>
+
+/* Protects offdevs, members of bpf_offload_netdev and offload members
+ * of all progs.
+ * RTNL lock cannot be taken when holding this lock.
+ */
+static DECLARE_RWSEM(bpf_devs_lock);
+
+struct bpf_offload_dev {
+ const struct bpf_prog_offload_ops *ops;
+ struct list_head netdevs;
+ void *priv;
+};
+
+struct bpf_offload_netdev {
+ struct rhash_head l;
+ struct net_device *netdev;
+ struct bpf_offload_dev *offdev; /* NULL when bound-only */
+ struct list_head progs;
+ struct list_head maps;
+ struct list_head offdev_netdevs;
+};
+
+static const struct rhashtable_params offdevs_params = {
+ .nelem_hint = 4,
+ .key_len = sizeof(struct net_device *),
+ .key_offset = offsetof(struct bpf_offload_netdev, netdev),
+ .head_offset = offsetof(struct bpf_offload_netdev, l),
+ .automatic_shrinking = true,
+};
+
+static struct rhashtable offdevs;
+
+static int bpf_dev_offload_check(struct net_device *netdev)
+{
+ if (!netdev)
+ return -EINVAL;
+ if (!netdev->netdev_ops->ndo_bpf)
+ return -EOPNOTSUPP;
+ return 0;
+}
+
+static struct bpf_offload_netdev *
+bpf_offload_find_netdev(struct net_device *netdev)
+{
+ lockdep_assert_held(&bpf_devs_lock);
+
+ return rhashtable_lookup_fast(&offdevs, &netdev, offdevs_params);
+}
+
+static int __bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev,
+ struct net_device *netdev)
+{
+ struct bpf_offload_netdev *ondev;
+ int err;
+
+ ondev = kzalloc(sizeof(*ondev), GFP_KERNEL);
+ if (!ondev)
+ return -ENOMEM;
+
+ ondev->netdev = netdev;
+ ondev->offdev = offdev;
+ INIT_LIST_HEAD(&ondev->progs);
+ INIT_LIST_HEAD(&ondev->maps);
+
+ err = rhashtable_insert_fast(&offdevs, &ondev->l, offdevs_params);
+ if (err) {
+ netdev_warn(netdev, "failed to register for BPF offload\n");
+ goto err_free;
+ }
+
+ if (offdev)
+ list_add(&ondev->offdev_netdevs, &offdev->netdevs);
+ return 0;
+
+err_free:
+ kfree(ondev);
+ return err;
+}
+
+static void __bpf_prog_offload_destroy(struct bpf_prog *prog)
+{
+ struct bpf_prog_offload *offload = prog->aux->offload;
+
+ if (offload->dev_state)
+ offload->offdev->ops->destroy(prog);
+
+ list_del_init(&offload->offloads);
+ kfree(offload);
+ prog->aux->offload = NULL;
+}
+
+static int bpf_map_offload_ndo(struct bpf_offloaded_map *offmap,
+ enum bpf_netdev_command cmd)
+{
+ struct netdev_bpf data = {};
+ struct net_device *netdev;
+
+ ASSERT_RTNL();
+
+ data.command = cmd;
+ data.offmap = offmap;
+ /* Caller must make sure netdev is valid */
+ netdev = offmap->netdev;
+
+ return netdev->netdev_ops->ndo_bpf(netdev, &data);
+}
+
+static void __bpf_map_offload_destroy(struct bpf_offloaded_map *offmap)
+{
+ WARN_ON(bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_FREE));
+ /* Make sure BPF_MAP_GET_NEXT_ID can't find this dead map */
+ bpf_map_free_id(&offmap->map);
+ list_del_init(&offmap->offloads);
+ offmap->netdev = NULL;
+}
+
+static void __bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev,
+ struct net_device *netdev)
+{
+ struct bpf_offload_netdev *ondev, *altdev = NULL;
+ struct bpf_offloaded_map *offmap, *mtmp;
+ struct bpf_prog_offload *offload, *ptmp;
+
+ ASSERT_RTNL();
+
+ ondev = rhashtable_lookup_fast(&offdevs, &netdev, offdevs_params);
+ if (WARN_ON(!ondev))
+ return;
+
+ WARN_ON(rhashtable_remove_fast(&offdevs, &ondev->l, offdevs_params));
+
+ /* Try to move the objects to another netdev of the device */
+ if (offdev) {
+ list_del(&ondev->offdev_netdevs);
+ altdev = list_first_entry_or_null(&offdev->netdevs,
+ struct bpf_offload_netdev,
+ offdev_netdevs);
+ }
+
+ if (altdev) {
+ list_for_each_entry(offload, &ondev->progs, offloads)
+ offload->netdev = altdev->netdev;
+ list_splice_init(&ondev->progs, &altdev->progs);
+
+ list_for_each_entry(offmap, &ondev->maps, offloads)
+ offmap->netdev = altdev->netdev;
+ list_splice_init(&ondev->maps, &altdev->maps);
+ } else {
+ list_for_each_entry_safe(offload, ptmp, &ondev->progs, offloads)
+ __bpf_prog_offload_destroy(offload->prog);
+ list_for_each_entry_safe(offmap, mtmp, &ondev->maps, offloads)
+ __bpf_map_offload_destroy(offmap);
+ }
+
+ WARN_ON(!list_empty(&ondev->progs));
+ WARN_ON(!list_empty(&ondev->maps));
+ kfree(ondev);
+}
+
+static int __bpf_prog_dev_bound_init(struct bpf_prog *prog, struct net_device *netdev)
+{
+ struct bpf_offload_netdev *ondev;
+ struct bpf_prog_offload *offload;
+ int err;
+
+ offload = kzalloc(sizeof(*offload), GFP_USER);
+ if (!offload)
+ return -ENOMEM;
+
+ offload->prog = prog;
+ offload->netdev = netdev;
+
+ ondev = bpf_offload_find_netdev(offload->netdev);
+ /* When program is offloaded require presence of "true"
+ * bpf_offload_netdev, avoid the one created for !ondev case below.
+ */
+ if (bpf_prog_is_offloaded(prog->aux) && (!ondev || !ondev->offdev)) {
+ err = -EINVAL;
+ goto err_free;
+ }
+ if (!ondev) {
+ /* When only binding to the device, explicitly
+ * create an entry in the hashtable.
+ */
+ err = __bpf_offload_dev_netdev_register(NULL, offload->netdev);
+ if (err)
+ goto err_free;
+ ondev = bpf_offload_find_netdev(offload->netdev);
+ }
+ offload->offdev = ondev->offdev;
+ prog->aux->offload = offload;
+ list_add_tail(&offload->offloads, &ondev->progs);
+
+ return 0;
+err_free:
+ kfree(offload);
+ return err;
+}
+
+int bpf_prog_dev_bound_init(struct bpf_prog *prog, union bpf_attr *attr)
+{
+ struct net_device *netdev;
+ int err;
+
+ if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS &&
+ attr->prog_type != BPF_PROG_TYPE_XDP)
+ return -EINVAL;
+
+ if (attr->prog_flags & ~(BPF_F_XDP_DEV_BOUND_ONLY | BPF_F_XDP_HAS_FRAGS))
+ return -EINVAL;
+
+ /* Frags are allowed only if program is dev-bound-only, but not
+ * if it is requesting bpf offload.
+ */
+ if (attr->prog_flags & BPF_F_XDP_HAS_FRAGS &&
+ !(attr->prog_flags & BPF_F_XDP_DEV_BOUND_ONLY))
+ return -EINVAL;
+
+ if (attr->prog_type == BPF_PROG_TYPE_SCHED_CLS &&
+ attr->prog_flags & BPF_F_XDP_DEV_BOUND_ONLY)
+ return -EINVAL;
+
+ netdev = dev_get_by_index(current->nsproxy->net_ns, attr->prog_ifindex);
+ if (!netdev)
+ return -EINVAL;
+
+ err = bpf_dev_offload_check(netdev);
+ if (err)
+ goto out;
+
+ prog->aux->offload_requested = !(attr->prog_flags & BPF_F_XDP_DEV_BOUND_ONLY);
+
+ down_write(&bpf_devs_lock);
+ err = __bpf_prog_dev_bound_init(prog, netdev);
+ up_write(&bpf_devs_lock);
+
+out:
+ dev_put(netdev);
+ return err;
+}
+
+int bpf_prog_dev_bound_inherit(struct bpf_prog *new_prog, struct bpf_prog *old_prog)
+{
+ int err;
+
+ if (!bpf_prog_is_dev_bound(old_prog->aux))
+ return 0;
+
+ if (bpf_prog_is_offloaded(old_prog->aux))
+ return -EINVAL;
+
+ new_prog->aux->dev_bound = old_prog->aux->dev_bound;
+ new_prog->aux->offload_requested = old_prog->aux->offload_requested;
+
+ down_write(&bpf_devs_lock);
+ if (!old_prog->aux->offload) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = __bpf_prog_dev_bound_init(new_prog, old_prog->aux->offload->netdev);
+
+out:
+ up_write(&bpf_devs_lock);
+ return err;
+}
+
+int bpf_prog_offload_verifier_prep(struct bpf_prog *prog)
+{
+ struct bpf_prog_offload *offload;
+ int ret = -ENODEV;
+
+ down_read(&bpf_devs_lock);
+ offload = prog->aux->offload;
+ if (offload) {
+ ret = offload->offdev->ops->prepare(prog);
+ offload->dev_state = !ret;
+ }
+ up_read(&bpf_devs_lock);
+
+ return ret;
+}
+
+int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env,
+ int insn_idx, int prev_insn_idx)
+{
+ struct bpf_prog_offload *offload;
+ int ret = -ENODEV;
+
+ down_read(&bpf_devs_lock);
+ offload = env->prog->aux->offload;
+ if (offload)
+ ret = offload->offdev->ops->insn_hook(env, insn_idx,
+ prev_insn_idx);
+ up_read(&bpf_devs_lock);
+
+ return ret;
+}
+
+int bpf_prog_offload_finalize(struct bpf_verifier_env *env)
+{
+ struct bpf_prog_offload *offload;
+ int ret = -ENODEV;
+
+ down_read(&bpf_devs_lock);
+ offload = env->prog->aux->offload;
+ if (offload) {
+ if (offload->offdev->ops->finalize)
+ ret = offload->offdev->ops->finalize(env);
+ else
+ ret = 0;
+ }
+ up_read(&bpf_devs_lock);
+
+ return ret;
+}
+
+void
+bpf_prog_offload_replace_insn(struct bpf_verifier_env *env, u32 off,
+ struct bpf_insn *insn)
+{
+ const struct bpf_prog_offload_ops *ops;
+ struct bpf_prog_offload *offload;
+ int ret = -EOPNOTSUPP;
+
+ down_read(&bpf_devs_lock);
+ offload = env->prog->aux->offload;
+ if (offload) {
+ ops = offload->offdev->ops;
+ if (!offload->opt_failed && ops->replace_insn)
+ ret = ops->replace_insn(env, off, insn);
+ offload->opt_failed |= ret;
+ }
+ up_read(&bpf_devs_lock);
+}
+
+void
+bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt)
+{
+ struct bpf_prog_offload *offload;
+ int ret = -EOPNOTSUPP;
+
+ down_read(&bpf_devs_lock);
+ offload = env->prog->aux->offload;
+ if (offload) {
+ if (!offload->opt_failed && offload->offdev->ops->remove_insns)
+ ret = offload->offdev->ops->remove_insns(env, off, cnt);
+ offload->opt_failed |= ret;
+ }
+ up_read(&bpf_devs_lock);
+}
+
+void bpf_prog_dev_bound_destroy(struct bpf_prog *prog)
+{
+ struct bpf_offload_netdev *ondev;
+ struct net_device *netdev;
+
+ rtnl_lock();
+ down_write(&bpf_devs_lock);
+ if (prog->aux->offload) {
+ list_del_init(&prog->aux->offload->offloads);
+
+ netdev = prog->aux->offload->netdev;
+ __bpf_prog_offload_destroy(prog);
+
+ ondev = bpf_offload_find_netdev(netdev);
+ if (!ondev->offdev && list_empty(&ondev->progs))
+ __bpf_offload_dev_netdev_unregister(NULL, netdev);
+ }
+ up_write(&bpf_devs_lock);
+ rtnl_unlock();
+}
+
+static int bpf_prog_offload_translate(struct bpf_prog *prog)
+{
+ struct bpf_prog_offload *offload;
+ int ret = -ENODEV;
+
+ down_read(&bpf_devs_lock);
+ offload = prog->aux->offload;
+ if (offload)
+ ret = offload->offdev->ops->translate(prog);
+ up_read(&bpf_devs_lock);
+
+ return ret;
+}
+
+static unsigned int bpf_prog_warn_on_exec(const void *ctx,
+ const struct bpf_insn *insn)
+{
+ WARN(1, "attempt to execute device eBPF program on the host!");
+ return 0;
+}
+
+int bpf_prog_offload_compile(struct bpf_prog *prog)
+{
+ prog->bpf_func = bpf_prog_warn_on_exec;
+
+ return bpf_prog_offload_translate(prog);
+}
+
+struct ns_get_path_bpf_prog_args {
+ struct bpf_prog *prog;
+ struct bpf_prog_info *info;
+};
+
+static struct ns_common *bpf_prog_offload_info_fill_ns(void *private_data)
+{
+ struct ns_get_path_bpf_prog_args *args = private_data;
+ struct bpf_prog_aux *aux = args->prog->aux;
+ struct ns_common *ns;
+ struct net *net;
+
+ rtnl_lock();
+ down_read(&bpf_devs_lock);
+
+ if (aux->offload) {
+ args->info->ifindex = aux->offload->netdev->ifindex;
+ net = dev_net(aux->offload->netdev);
+ get_net(net);
+ ns = &net->ns;
+ } else {
+ args->info->ifindex = 0;
+ ns = NULL;
+ }
+
+ up_read(&bpf_devs_lock);
+ rtnl_unlock();
+
+ return ns;
+}
+
+int bpf_prog_offload_info_fill(struct bpf_prog_info *info,
+ struct bpf_prog *prog)
+{
+ struct ns_get_path_bpf_prog_args args = {
+ .prog = prog,
+ .info = info,
+ };
+ struct bpf_prog_aux *aux = prog->aux;
+ struct inode *ns_inode;
+ struct path ns_path;
+ char __user *uinsns;
+ int res;
+ u32 ulen;
+
+ res = ns_get_path_cb(&ns_path, bpf_prog_offload_info_fill_ns, &args);
+ if (res) {
+ if (!info->ifindex)
+ return -ENODEV;
+ return res;
+ }
+
+ down_read(&bpf_devs_lock);
+
+ if (!aux->offload) {
+ up_read(&bpf_devs_lock);
+ return -ENODEV;
+ }
+
+ ulen = info->jited_prog_len;
+ info->jited_prog_len = aux->offload->jited_len;
+ if (info->jited_prog_len && ulen) {
+ uinsns = u64_to_user_ptr(info->jited_prog_insns);
+ ulen = min_t(u32, info->jited_prog_len, ulen);
+ if (copy_to_user(uinsns, aux->offload->jited_image, ulen)) {
+ up_read(&bpf_devs_lock);
+ return -EFAULT;
+ }
+ }
+
+ up_read(&bpf_devs_lock);
+
+ ns_inode = ns_path.dentry->d_inode;
+ info->netns_dev = new_encode_dev(ns_inode->i_sb->s_dev);
+ info->netns_ino = ns_inode->i_ino;
+ path_put(&ns_path);
+
+ return 0;
+}
+
+const struct bpf_prog_ops bpf_offload_prog_ops = {
+};
+
+struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr)
+{
+ struct net *net = current->nsproxy->net_ns;
+ struct bpf_offload_netdev *ondev;
+ struct bpf_offloaded_map *offmap;
+ int err;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+ if (attr->map_type != BPF_MAP_TYPE_ARRAY &&
+ attr->map_type != BPF_MAP_TYPE_HASH)
+ return ERR_PTR(-EINVAL);
+
+ offmap = bpf_map_area_alloc(sizeof(*offmap), NUMA_NO_NODE);
+ if (!offmap)
+ return ERR_PTR(-ENOMEM);
+
+ bpf_map_init_from_attr(&offmap->map, attr);
+ rtnl_lock();
+ offmap->netdev = __dev_get_by_index(net, attr->map_ifindex);
+ err = bpf_dev_offload_check(offmap->netdev);
+ if (err)
+ goto err_unlock_rtnl;
+
+ netdev_lock_ops(offmap->netdev);
+ down_write(&bpf_devs_lock);
+
+ ondev = bpf_offload_find_netdev(offmap->netdev);
+ if (!ondev) {
+ err = -EINVAL;
+ goto err_unlock;
+ }
+
+ err = bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_ALLOC);
+ if (err)
+ goto err_unlock;
+
+ list_add_tail(&offmap->offloads, &ondev->maps);
+ up_write(&bpf_devs_lock);
+ netdev_unlock_ops(offmap->netdev);
+ rtnl_unlock();
+
+ return &offmap->map;
+
+err_unlock:
+ up_write(&bpf_devs_lock);
+ netdev_unlock_ops(offmap->netdev);
+err_unlock_rtnl:
+ rtnl_unlock();
+ bpf_map_area_free(offmap);
+ return ERR_PTR(err);
+}
+
+void bpf_map_offload_map_free(struct bpf_map *map)
+{
+ struct bpf_offloaded_map *offmap = map_to_offmap(map);
+
+ rtnl_lock();
+ down_write(&bpf_devs_lock);
+ if (offmap->netdev)
+ __bpf_map_offload_destroy(offmap);
+ up_write(&bpf_devs_lock);
+ rtnl_unlock();
+
+ bpf_map_area_free(offmap);
+}
+
+u64 bpf_map_offload_map_mem_usage(const struct bpf_map *map)
+{
+ /* The memory dynamically allocated in netdev dev_ops is not counted */
+ return sizeof(struct bpf_offloaded_map);
+}
+
+int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value)
+{
+ struct bpf_offloaded_map *offmap = map_to_offmap(map);
+ int ret = -ENODEV;
+
+ down_read(&bpf_devs_lock);
+ if (offmap->netdev)
+ ret = offmap->dev_ops->map_lookup_elem(offmap, key, value);
+ up_read(&bpf_devs_lock);
+
+ return ret;
+}
+
+int bpf_map_offload_update_elem(struct bpf_map *map,
+ void *key, void *value, u64 flags)
+{
+ struct bpf_offloaded_map *offmap = map_to_offmap(map);
+ int ret = -ENODEV;
+
+ if (unlikely(flags > BPF_EXIST))
+ return -EINVAL;
+
+ down_read(&bpf_devs_lock);
+ if (offmap->netdev)
+ ret = offmap->dev_ops->map_update_elem(offmap, key, value,
+ flags);
+ up_read(&bpf_devs_lock);
+
+ return ret;
+}
+
+int bpf_map_offload_delete_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_offloaded_map *offmap = map_to_offmap(map);
+ int ret = -ENODEV;
+
+ down_read(&bpf_devs_lock);
+ if (offmap->netdev)
+ ret = offmap->dev_ops->map_delete_elem(offmap, key);
+ up_read(&bpf_devs_lock);
+
+ return ret;
+}
+
+int bpf_map_offload_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ struct bpf_offloaded_map *offmap = map_to_offmap(map);
+ int ret = -ENODEV;
+
+ down_read(&bpf_devs_lock);
+ if (offmap->netdev)
+ ret = offmap->dev_ops->map_get_next_key(offmap, key, next_key);
+ up_read(&bpf_devs_lock);
+
+ return ret;
+}
+
+struct ns_get_path_bpf_map_args {
+ struct bpf_offloaded_map *offmap;
+ struct bpf_map_info *info;
+};
+
+static struct ns_common *bpf_map_offload_info_fill_ns(void *private_data)
+{
+ struct ns_get_path_bpf_map_args *args = private_data;
+ struct ns_common *ns;
+ struct net *net;
+
+ rtnl_lock();
+ down_read(&bpf_devs_lock);
+
+ if (args->offmap->netdev) {
+ args->info->ifindex = args->offmap->netdev->ifindex;
+ net = dev_net(args->offmap->netdev);
+ get_net(net);
+ ns = &net->ns;
+ } else {
+ args->info->ifindex = 0;
+ ns = NULL;
+ }
+
+ up_read(&bpf_devs_lock);
+ rtnl_unlock();
+
+ return ns;
+}
+
+int bpf_map_offload_info_fill(struct bpf_map_info *info, struct bpf_map *map)
+{
+ struct ns_get_path_bpf_map_args args = {
+ .offmap = map_to_offmap(map),
+ .info = info,
+ };
+ struct inode *ns_inode;
+ struct path ns_path;
+ int res;
+
+ res = ns_get_path_cb(&ns_path, bpf_map_offload_info_fill_ns, &args);
+ if (res) {
+ if (!info->ifindex)
+ return -ENODEV;
+ return res;
+ }
+
+ ns_inode = ns_path.dentry->d_inode;
+ info->netns_dev = new_encode_dev(ns_inode->i_sb->s_dev);
+ info->netns_ino = ns_inode->i_ino;
+ path_put(&ns_path);
+
+ return 0;
+}
+
+static bool __bpf_offload_dev_match(struct bpf_prog *prog,
+ struct net_device *netdev)
+{
+ struct bpf_offload_netdev *ondev1, *ondev2;
+ struct bpf_prog_offload *offload;
+
+ if (!bpf_prog_is_dev_bound(prog->aux))
+ return false;
+
+ offload = prog->aux->offload;
+ if (!offload)
+ return false;
+ if (offload->netdev == netdev)
+ return true;
+
+ ondev1 = bpf_offload_find_netdev(offload->netdev);
+ ondev2 = bpf_offload_find_netdev(netdev);
+
+ return ondev1 && ondev2 && ondev1->offdev == ondev2->offdev;
+}
+
+bool bpf_offload_dev_match(struct bpf_prog *prog, struct net_device *netdev)
+{
+ bool ret;
+
+ down_read(&bpf_devs_lock);
+ ret = __bpf_offload_dev_match(prog, netdev);
+ up_read(&bpf_devs_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(bpf_offload_dev_match);
+
+bool bpf_prog_dev_bound_match(const struct bpf_prog *lhs, const struct bpf_prog *rhs)
+{
+ bool ret;
+
+ if (bpf_prog_is_offloaded(lhs->aux) != bpf_prog_is_offloaded(rhs->aux))
+ return false;
+
+ down_read(&bpf_devs_lock);
+ ret = lhs->aux->offload && rhs->aux->offload &&
+ lhs->aux->offload->netdev &&
+ lhs->aux->offload->netdev == rhs->aux->offload->netdev;
+ up_read(&bpf_devs_lock);
+
+ return ret;
+}
+
+bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map)
+{
+ struct bpf_offloaded_map *offmap;
+ bool ret;
+
+ if (!bpf_map_is_offloaded(map))
+ return bpf_map_offload_neutral(map);
+ offmap = map_to_offmap(map);
+
+ down_read(&bpf_devs_lock);
+ ret = __bpf_offload_dev_match(prog, offmap->netdev);
+ up_read(&bpf_devs_lock);
+
+ return ret;
+}
+
+int bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev,
+ struct net_device *netdev)
+{
+ int err;
+
+ down_write(&bpf_devs_lock);
+ err = __bpf_offload_dev_netdev_register(offdev, netdev);
+ up_write(&bpf_devs_lock);
+ return err;
+}
+EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_register);
+
+void bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev,
+ struct net_device *netdev)
+{
+ down_write(&bpf_devs_lock);
+ __bpf_offload_dev_netdev_unregister(offdev, netdev);
+ up_write(&bpf_devs_lock);
+}
+EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_unregister);
+
+struct bpf_offload_dev *
+bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops, void *priv)
+{
+ struct bpf_offload_dev *offdev;
+
+ offdev = kzalloc(sizeof(*offdev), GFP_KERNEL);
+ if (!offdev)
+ return ERR_PTR(-ENOMEM);
+
+ offdev->ops = ops;
+ offdev->priv = priv;
+ INIT_LIST_HEAD(&offdev->netdevs);
+
+ return offdev;
+}
+EXPORT_SYMBOL_GPL(bpf_offload_dev_create);
+
+void bpf_offload_dev_destroy(struct bpf_offload_dev *offdev)
+{
+ WARN_ON(!list_empty(&offdev->netdevs));
+ kfree(offdev);
+}
+EXPORT_SYMBOL_GPL(bpf_offload_dev_destroy);
+
+void *bpf_offload_dev_priv(struct bpf_offload_dev *offdev)
+{
+ return offdev->priv;
+}
+EXPORT_SYMBOL_GPL(bpf_offload_dev_priv);
+
+void bpf_dev_bound_netdev_unregister(struct net_device *dev)
+{
+ struct bpf_offload_netdev *ondev;
+
+ ASSERT_RTNL();
+
+ down_write(&bpf_devs_lock);
+ ondev = bpf_offload_find_netdev(dev);
+ if (ondev && !ondev->offdev)
+ __bpf_offload_dev_netdev_unregister(NULL, ondev->netdev);
+ up_write(&bpf_devs_lock);
+}
+
+int bpf_dev_bound_kfunc_check(struct bpf_verifier_log *log,
+ struct bpf_prog_aux *prog_aux)
+{
+ if (!bpf_prog_is_dev_bound(prog_aux)) {
+ bpf_log(log, "metadata kfuncs require device-bound program\n");
+ return -EINVAL;
+ }
+
+ if (bpf_prog_is_offloaded(prog_aux)) {
+ bpf_log(log, "metadata kfuncs can't be offloaded\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+void *bpf_dev_bound_resolve_kfunc(struct bpf_prog *prog, u32 func_id)
+{
+ const struct xdp_metadata_ops *ops;
+ void *p = NULL;
+
+ /* We don't hold bpf_devs_lock while resolving several
+ * kfuncs and can race with the unregister_netdevice().
+ * We rely on bpf_dev_bound_match() check at attach
+ * to render this program unusable.
+ */
+ down_read(&bpf_devs_lock);
+ if (!prog->aux->offload)
+ goto out;
+
+ ops = prog->aux->offload->netdev->xdp_metadata_ops;
+ if (!ops)
+ goto out;
+
+#define XDP_METADATA_KFUNC(name, _, __, xmo) \
+ if (func_id == bpf_xdp_metadata_kfunc_id(name)) p = ops->xmo;
+ XDP_METADATA_KFUNC_xxx
+#undef XDP_METADATA_KFUNC
+
+out:
+ up_read(&bpf_devs_lock);
+
+ return p;
+}
+
+static int __init bpf_offload_init(void)
+{
+ return rhashtable_init(&offdevs, &offdevs_params);
+}
+
+core_initcall(bpf_offload_init);
diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c
new file mode 100644
index 000000000000..632762b57299
--- /dev/null
+++ b/kernel/bpf/percpu_freelist.c
@@ -0,0 +1,137 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2016 Facebook
+ */
+#include "percpu_freelist.h"
+
+int pcpu_freelist_init(struct pcpu_freelist *s)
+{
+ int cpu;
+
+ s->freelist = alloc_percpu(struct pcpu_freelist_head);
+ if (!s->freelist)
+ return -ENOMEM;
+
+ for_each_possible_cpu(cpu) {
+ struct pcpu_freelist_head *head = per_cpu_ptr(s->freelist, cpu);
+
+ raw_res_spin_lock_init(&head->lock);
+ head->first = NULL;
+ }
+ return 0;
+}
+
+void pcpu_freelist_destroy(struct pcpu_freelist *s)
+{
+ free_percpu(s->freelist);
+}
+
+static inline void pcpu_freelist_push_node(struct pcpu_freelist_head *head,
+ struct pcpu_freelist_node *node)
+{
+ node->next = head->first;
+ WRITE_ONCE(head->first, node);
+}
+
+static inline bool ___pcpu_freelist_push(struct pcpu_freelist_head *head,
+ struct pcpu_freelist_node *node)
+{
+ if (raw_res_spin_lock(&head->lock))
+ return false;
+ pcpu_freelist_push_node(head, node);
+ raw_res_spin_unlock(&head->lock);
+ return true;
+}
+
+void __pcpu_freelist_push(struct pcpu_freelist *s,
+ struct pcpu_freelist_node *node)
+{
+ struct pcpu_freelist_head *head;
+ int cpu;
+
+ if (___pcpu_freelist_push(this_cpu_ptr(s->freelist), node))
+ return;
+
+ while (true) {
+ for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) {
+ if (cpu == raw_smp_processor_id())
+ continue;
+ head = per_cpu_ptr(s->freelist, cpu);
+ if (raw_res_spin_lock(&head->lock))
+ continue;
+ pcpu_freelist_push_node(head, node);
+ raw_res_spin_unlock(&head->lock);
+ return;
+ }
+ }
+}
+
+void pcpu_freelist_push(struct pcpu_freelist *s,
+ struct pcpu_freelist_node *node)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __pcpu_freelist_push(s, node);
+ local_irq_restore(flags);
+}
+
+void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
+ u32 nr_elems)
+{
+ struct pcpu_freelist_head *head;
+ unsigned int cpu, cpu_idx, i, j, n, m;
+
+ n = nr_elems / num_possible_cpus();
+ m = nr_elems % num_possible_cpus();
+
+ cpu_idx = 0;
+ for_each_possible_cpu(cpu) {
+ head = per_cpu_ptr(s->freelist, cpu);
+ j = n + (cpu_idx < m ? 1 : 0);
+ for (i = 0; i < j; i++) {
+ /* No locking required as this is not visible yet. */
+ pcpu_freelist_push_node(head, buf);
+ buf += elem_size;
+ }
+ cpu_idx++;
+ }
+}
+
+static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s)
+{
+ struct pcpu_freelist_node *node = NULL;
+ struct pcpu_freelist_head *head;
+ int cpu;
+
+ for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) {
+ head = per_cpu_ptr(s->freelist, cpu);
+ if (!READ_ONCE(head->first))
+ continue;
+ if (raw_res_spin_lock(&head->lock))
+ continue;
+ node = head->first;
+ if (node) {
+ WRITE_ONCE(head->first, node->next);
+ raw_res_spin_unlock(&head->lock);
+ return node;
+ }
+ raw_res_spin_unlock(&head->lock);
+ }
+ return node;
+}
+
+struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s)
+{
+ return ___pcpu_freelist_pop(s);
+}
+
+struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)
+{
+ struct pcpu_freelist_node *ret;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ ret = __pcpu_freelist_pop(s);
+ local_irq_restore(flags);
+ return ret;
+}
diff --git a/kernel/bpf/percpu_freelist.h b/kernel/bpf/percpu_freelist.h
new file mode 100644
index 000000000000..914798b74967
--- /dev/null
+++ b/kernel/bpf/percpu_freelist.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright (c) 2016 Facebook
+ */
+#ifndef __PERCPU_FREELIST_H__
+#define __PERCPU_FREELIST_H__
+#include <linux/spinlock.h>
+#include <linux/percpu.h>
+#include <asm/rqspinlock.h>
+
+struct pcpu_freelist_head {
+ struct pcpu_freelist_node *first;
+ rqspinlock_t lock;
+};
+
+struct pcpu_freelist {
+ struct pcpu_freelist_head __percpu *freelist;
+};
+
+struct pcpu_freelist_node {
+ struct pcpu_freelist_node *next;
+};
+
+/* pcpu_freelist_* do spin_lock_irqsave. */
+void pcpu_freelist_push(struct pcpu_freelist *, struct pcpu_freelist_node *);
+struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *);
+/* __pcpu_freelist_* do spin_lock only. caller must disable irqs. */
+void __pcpu_freelist_push(struct pcpu_freelist *, struct pcpu_freelist_node *);
+struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *);
+void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
+ u32 nr_elems);
+int pcpu_freelist_init(struct pcpu_freelist *);
+void pcpu_freelist_destroy(struct pcpu_freelist *s);
+#endif
diff --git a/kernel/bpf/preload/.gitignore b/kernel/bpf/preload/.gitignore
new file mode 100644
index 000000000000..9452322902a5
--- /dev/null
+++ b/kernel/bpf/preload/.gitignore
@@ -0,0 +1,2 @@
+/libbpf
+/bpf_preload_umd
diff --git a/kernel/bpf/preload/Kconfig b/kernel/bpf/preload/Kconfig
new file mode 100644
index 000000000000..aef7b0bc96d6
--- /dev/null
+++ b/kernel/bpf/preload/Kconfig
@@ -0,0 +1,21 @@
+# SPDX-License-Identifier: GPL-2.0-only
+menuconfig BPF_PRELOAD
+ bool "Preload BPF file system with kernel specific program and map iterators"
+ depends on BPF
+ depends on BPF_SYSCALL
+ # The dependency on !COMPILE_TEST prevents it from being enabled
+ # in allmodconfig or allyesconfig configurations
+ depends on !COMPILE_TEST
+ help
+ This builds kernel module with several embedded BPF programs that are
+ pinned into BPF FS mount point as human readable files that are
+ useful in debugging and introspection of BPF programs and maps.
+
+if BPF_PRELOAD
+config BPF_PRELOAD_UMD
+ tristate "bpf_preload kernel module"
+ default m
+ help
+ This builds bpf_preload kernel module with embedded BPF programs for
+ introspection in bpffs.
+endif
diff --git a/kernel/bpf/preload/Makefile b/kernel/bpf/preload/Makefile
new file mode 100644
index 000000000000..20f89cc0a0a6
--- /dev/null
+++ b/kernel/bpf/preload/Makefile
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0
+
+LIBBPF_INCLUDE = $(srctree)/tools/lib
+
+obj-$(CONFIG_BPF_PRELOAD_UMD) += bpf_preload.o
+CFLAGS_bpf_preload_kern.o += -I$(LIBBPF_INCLUDE)
+bpf_preload-objs += bpf_preload_kern.o
diff --git a/kernel/bpf/preload/bpf_preload.h b/kernel/bpf/preload/bpf_preload.h
new file mode 100644
index 000000000000..f065c91213a0
--- /dev/null
+++ b/kernel/bpf/preload/bpf_preload.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BPF_PRELOAD_H
+#define _BPF_PRELOAD_H
+
+struct bpf_preload_info {
+ char link_name[16];
+ struct bpf_link *link;
+};
+
+struct bpf_preload_ops {
+ int (*preload)(struct bpf_preload_info *);
+ struct module *owner;
+};
+extern struct bpf_preload_ops *bpf_preload_ops;
+#define BPF_PRELOAD_LINKS 2
+#endif
diff --git a/kernel/bpf/preload/bpf_preload_kern.c b/kernel/bpf/preload/bpf_preload_kern.c
new file mode 100644
index 000000000000..774e5a538811
--- /dev/null
+++ b/kernel/bpf/preload/bpf_preload_kern.c
@@ -0,0 +1,94 @@
+// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/init.h>
+#include <linux/module.h>
+#include "bpf_preload.h"
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#include "iterators/iterators.lskel-little-endian.h"
+#else
+#include "iterators/iterators.lskel-big-endian.h"
+#endif
+
+static struct bpf_link *maps_link, *progs_link;
+static struct iterators_bpf *skel;
+
+static void free_links_and_skel(void)
+{
+ if (!IS_ERR_OR_NULL(maps_link))
+ bpf_link_put(maps_link);
+ if (!IS_ERR_OR_NULL(progs_link))
+ bpf_link_put(progs_link);
+ iterators_bpf__destroy(skel);
+}
+
+static int preload(struct bpf_preload_info *obj)
+{
+ strscpy(obj[0].link_name, "maps.debug", sizeof(obj[0].link_name));
+ obj[0].link = maps_link;
+ strscpy(obj[1].link_name, "progs.debug", sizeof(obj[1].link_name));
+ obj[1].link = progs_link;
+ return 0;
+}
+
+static struct bpf_preload_ops ops = {
+ .preload = preload,
+ .owner = THIS_MODULE,
+};
+
+static int load_skel(void)
+{
+ int err;
+
+ skel = iterators_bpf__open();
+ if (!skel)
+ return -ENOMEM;
+ err = iterators_bpf__load(skel);
+ if (err)
+ goto out;
+ err = iterators_bpf__attach(skel);
+ if (err)
+ goto out;
+ maps_link = bpf_link_get_from_fd(skel->links.dump_bpf_map_fd);
+ if (IS_ERR(maps_link)) {
+ err = PTR_ERR(maps_link);
+ goto out;
+ }
+ progs_link = bpf_link_get_from_fd(skel->links.dump_bpf_prog_fd);
+ if (IS_ERR(progs_link)) {
+ err = PTR_ERR(progs_link);
+ goto out;
+ }
+ /* Avoid taking over stdin/stdout/stderr of init process. Zeroing out
+ * makes skel_closenz() a no-op later in iterators_bpf__destroy().
+ */
+ close_fd(skel->links.dump_bpf_map_fd);
+ skel->links.dump_bpf_map_fd = 0;
+ close_fd(skel->links.dump_bpf_prog_fd);
+ skel->links.dump_bpf_prog_fd = 0;
+ return 0;
+out:
+ free_links_and_skel();
+ return err;
+}
+
+static int __init load(void)
+{
+ int err;
+
+ err = load_skel();
+ if (err)
+ return err;
+ bpf_preload_ops = &ops;
+ return err;
+}
+
+static void __exit fini(void)
+{
+ bpf_preload_ops = NULL;
+ free_links_and_skel();
+}
+late_initcall(load);
+module_exit(fini);
+MODULE_IMPORT_NS("BPF_INTERNAL");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Embedded BPF programs for introspection in bpffs");
diff --git a/kernel/bpf/preload/iterators/.gitignore b/kernel/bpf/preload/iterators/.gitignore
new file mode 100644
index 000000000000..ffdb70230c8b
--- /dev/null
+++ b/kernel/bpf/preload/iterators/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+/.output
diff --git a/kernel/bpf/preload/iterators/Makefile b/kernel/bpf/preload/iterators/Makefile
new file mode 100644
index 000000000000..b83c2f5e9be1
--- /dev/null
+++ b/kernel/bpf/preload/iterators/Makefile
@@ -0,0 +1,67 @@
+# SPDX-License-Identifier: GPL-2.0
+OUTPUT := .output
+abs_out := $(abspath $(OUTPUT))
+
+CLANG ?= clang
+LLC ?= llc
+LLVM_STRIP ?= llvm-strip
+
+TOOLS_PATH := $(abspath ../../../../tools)
+BPFTOOL_SRC := $(TOOLS_PATH)/bpf/bpftool
+BPFTOOL_OUTPUT := $(abs_out)/bpftool
+DEFAULT_BPFTOOL := $(BPFTOOL_OUTPUT)/bootstrap/bpftool
+BPFTOOL ?= $(DEFAULT_BPFTOOL)
+
+LIBBPF_SRC := $(TOOLS_PATH)/lib/bpf
+LIBBPF_OUTPUT := $(abs_out)/libbpf
+LIBBPF_DESTDIR := $(LIBBPF_OUTPUT)
+LIBBPF_INCLUDE := $(LIBBPF_DESTDIR)/include
+BPFOBJ := $(LIBBPF_OUTPUT)/libbpf.a
+
+INCLUDES := -I$(OUTPUT) -I$(LIBBPF_INCLUDE) -I$(TOOLS_PATH)/include/uapi
+CFLAGS := -g -Wall
+
+ifeq ($(V),1)
+Q =
+msg =
+else
+Q = @
+msg = @printf ' %-8s %s%s\n' "$(1)" "$(notdir $(2))" "$(if $(3), $(3))";
+MAKEFLAGS += --no-print-directory
+submake_extras := feature_display=0
+endif
+
+.DELETE_ON_ERROR:
+
+.PHONY: all clean
+
+all: iterators.lskel-little-endian.h
+
+big: iterators.lskel-big-endian.h
+
+clean:
+ $(call msg,CLEAN)
+ $(Q)rm -rf $(OUTPUT) iterators
+
+iterators.lskel-%.h: $(OUTPUT)/%/iterators.bpf.o | $(BPFTOOL)
+ $(call msg,GEN-SKEL,$@)
+ $(Q)$(BPFTOOL) gen skeleton -L $< > $@
+
+$(OUTPUT)/%/iterators.bpf.o: iterators.bpf.c $(BPFOBJ) | $(OUTPUT)
+ $(call msg,BPF,$@)
+ $(Q)mkdir -p $(@D)
+ $(Q)$(CLANG) -g -O2 --target=bpf -m$* $(INCLUDES) \
+ -c $(filter %.c,$^) -o $@ && \
+ $(LLVM_STRIP) -g $@
+
+$(OUTPUT) $(LIBBPF_OUTPUT) $(BPFTOOL_OUTPUT):
+ $(call msg,MKDIR,$@)
+ $(Q)mkdir -p $@
+
+$(BPFOBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(LIBBPF_OUTPUT)
+ $(Q)$(MAKE) $(submake_extras) -C $(LIBBPF_SRC) \
+ OUTPUT=$(abspath $(dir $@))/ prefix= \
+ DESTDIR=$(LIBBPF_DESTDIR) $(abspath $@) install_headers
+
+$(DEFAULT_BPFTOOL): | $(BPFTOOL_OUTPUT)
+ $(Q)$(MAKE) $(submake_extras) -C $(BPFTOOL_SRC) OUTPUT=$(BPFTOOL_OUTPUT)/ bootstrap
diff --git a/kernel/bpf/preload/iterators/README b/kernel/bpf/preload/iterators/README
new file mode 100644
index 000000000000..98e7c90ea012
--- /dev/null
+++ b/kernel/bpf/preload/iterators/README
@@ -0,0 +1,7 @@
+WARNING:
+If you change "iterators.bpf.c" do "make -j" in this directory to
+rebuild "iterators.lskel-little-endian.h". Then, on a big-endian
+machine, do "make -j big" in this directory to rebuild
+"iterators.lskel-big-endian.h". Commit both resulting headers.
+Make sure to have clang 10 installed.
+See Documentation/bpf/bpf_devel_QA.rst
diff --git a/kernel/bpf/preload/iterators/iterators.bpf.c b/kernel/bpf/preload/iterators/iterators.bpf.c
new file mode 100644
index 000000000000..b78968b63fab
--- /dev/null
+++ b/kernel/bpf/preload/iterators/iterators.bpf.c
@@ -0,0 +1,118 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_core_read.h>
+
+#pragma clang attribute push (__attribute__((preserve_access_index)), apply_to = record)
+struct seq_file;
+struct bpf_iter_meta {
+ struct seq_file *seq;
+ __u64 session_id;
+ __u64 seq_num;
+};
+
+struct bpf_map {
+ __u32 id;
+ char name[16];
+ __u32 max_entries;
+};
+
+struct bpf_iter__bpf_map {
+ struct bpf_iter_meta *meta;
+ struct bpf_map *map;
+};
+
+struct btf_type {
+ __u32 name_off;
+};
+
+struct btf_header {
+ __u32 str_len;
+};
+
+struct btf {
+ const char *strings;
+ struct btf_type **types;
+ struct btf_header hdr;
+};
+
+struct bpf_prog_aux {
+ __u32 id;
+ char name[16];
+ const char *attach_func_name;
+ struct bpf_prog *dst_prog;
+ struct bpf_func_info *func_info;
+ struct btf *btf;
+};
+
+struct bpf_prog {
+ struct bpf_prog_aux *aux;
+};
+
+struct bpf_iter__bpf_prog {
+ struct bpf_iter_meta *meta;
+ struct bpf_prog *prog;
+};
+#pragma clang attribute pop
+
+static const char *get_name(struct btf *btf, long btf_id, const char *fallback)
+{
+ struct btf_type **types, *t;
+ unsigned int name_off;
+ const char *str;
+
+ if (!btf)
+ return fallback;
+ str = btf->strings;
+ types = btf->types;
+ bpf_probe_read_kernel(&t, sizeof(t), types + btf_id);
+ name_off = BPF_CORE_READ(t, name_off);
+ if (name_off >= btf->hdr.str_len)
+ return fallback;
+ return str + name_off;
+}
+
+__s64 bpf_map_sum_elem_count(struct bpf_map *map) __ksym;
+
+SEC("iter/bpf_map")
+int dump_bpf_map(struct bpf_iter__bpf_map *ctx)
+{
+ struct seq_file *seq = ctx->meta->seq;
+ __u64 seq_num = ctx->meta->seq_num;
+ struct bpf_map *map = ctx->map;
+
+ if (!map)
+ return 0;
+
+ if (seq_num == 0)
+ BPF_SEQ_PRINTF(seq, " id name max_entries cur_entries\n");
+
+ BPF_SEQ_PRINTF(seq, "%4u %-16s %10d %10lld\n",
+ map->id, map->name, map->max_entries,
+ bpf_map_sum_elem_count(map));
+
+ return 0;
+}
+
+SEC("iter/bpf_prog")
+int dump_bpf_prog(struct bpf_iter__bpf_prog *ctx)
+{
+ struct seq_file *seq = ctx->meta->seq;
+ __u64 seq_num = ctx->meta->seq_num;
+ struct bpf_prog *prog = ctx->prog;
+ struct bpf_prog_aux *aux;
+
+ if (!prog)
+ return 0;
+
+ aux = prog->aux;
+ if (seq_num == 0)
+ BPF_SEQ_PRINTF(seq, " id name attached\n");
+
+ BPF_SEQ_PRINTF(seq, "%4u %-16s %s %s\n", aux->id,
+ get_name(aux->btf, aux->func_info[0].type_id, aux->name),
+ aux->attach_func_name, aux->dst_prog->aux->name);
+ return 0;
+}
+char LICENSE[] SEC("license") = "GPL";
diff --git a/kernel/bpf/preload/iterators/iterators.lskel-big-endian.h b/kernel/bpf/preload/iterators/iterators.lskel-big-endian.h
new file mode 100644
index 000000000000..49b1d515a847
--- /dev/null
+++ b/kernel/bpf/preload/iterators/iterators.lskel-big-endian.h
@@ -0,0 +1,437 @@
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
+/* THIS FILE IS AUTOGENERATED BY BPFTOOL! */
+#ifndef __ITERATORS_BPF_SKEL_H__
+#define __ITERATORS_BPF_SKEL_H__
+
+#include <bpf/skel_internal.h>
+
+struct iterators_bpf {
+ struct bpf_loader_ctx ctx;
+ struct {
+ struct bpf_map_desc rodata;
+ } maps;
+ struct {
+ struct bpf_prog_desc dump_bpf_map;
+ struct bpf_prog_desc dump_bpf_prog;
+ } progs;
+ struct {
+ int dump_bpf_map_fd;
+ int dump_bpf_prog_fd;
+ } links;
+};
+
+static inline int
+iterators_bpf__dump_bpf_map__attach(struct iterators_bpf *skel)
+{
+ int prog_fd = skel->progs.dump_bpf_map.prog_fd;
+ int fd = skel_link_create(prog_fd, 0, BPF_TRACE_ITER);
+
+ if (fd > 0)
+ skel->links.dump_bpf_map_fd = fd;
+ return fd;
+}
+
+static inline int
+iterators_bpf__dump_bpf_prog__attach(struct iterators_bpf *skel)
+{
+ int prog_fd = skel->progs.dump_bpf_prog.prog_fd;
+ int fd = skel_link_create(prog_fd, 0, BPF_TRACE_ITER);
+
+ if (fd > 0)
+ skel->links.dump_bpf_prog_fd = fd;
+ return fd;
+}
+
+static inline int
+iterators_bpf__attach(struct iterators_bpf *skel)
+{
+ int ret = 0;
+
+ ret = ret < 0 ? ret : iterators_bpf__dump_bpf_map__attach(skel);
+ ret = ret < 0 ? ret : iterators_bpf__dump_bpf_prog__attach(skel);
+ return ret < 0 ? ret : 0;
+}
+
+static inline void
+iterators_bpf__detach(struct iterators_bpf *skel)
+{
+ skel_closenz(skel->links.dump_bpf_map_fd);
+ skel_closenz(skel->links.dump_bpf_prog_fd);
+}
+static void
+iterators_bpf__destroy(struct iterators_bpf *skel)
+{
+ if (!skel)
+ return;
+ iterators_bpf__detach(skel);
+ skel_closenz(skel->progs.dump_bpf_map.prog_fd);
+ skel_closenz(skel->progs.dump_bpf_prog.prog_fd);
+ skel_closenz(skel->maps.rodata.map_fd);
+ skel_free(skel);
+}
+static inline struct iterators_bpf *
+iterators_bpf__open(void)
+{
+ struct iterators_bpf *skel;
+
+ skel = skel_alloc(sizeof(*skel));
+ if (!skel)
+ goto cleanup;
+ skel->ctx.sz = (void *)&skel->links - (void *)skel;
+ return skel;
+cleanup:
+ iterators_bpf__destroy(skel);
+ return NULL;
+}
+
+static inline int
+iterators_bpf__load(struct iterators_bpf *skel)
+{
+ struct bpf_load_and_run_opts opts = {};
+ int err;
+ static const char opts_data[] __attribute__((__aligned__(8))) = "\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xeb\x9f\x01\0\
+\0\0\0\x18\0\0\0\0\0\0\x04\x80\0\0\x04\x80\0\0\x05\x44\0\0\0\0\x02\0\0\0\0\0\0\
+\x02\0\0\0\x01\x04\0\0\x02\0\0\0\x10\0\0\0\x13\0\0\0\x03\0\0\0\0\0\0\0\x18\0\0\
+\0\x04\0\0\0\x40\0\0\0\0\x02\0\0\0\0\0\0\x08\0\0\0\0\x02\0\0\0\0\0\0\x0d\0\0\0\
+\0\x0d\0\0\x01\0\0\0\x06\0\0\0\x1c\0\0\0\x01\0\0\0\x20\x01\0\0\0\0\0\0\x04\x01\
+\0\0\x20\0\0\0\x24\x0c\0\0\x01\0\0\0\x05\0\0\0\xc3\x04\0\0\x03\0\0\0\x18\0\0\0\
+\xd1\0\0\0\x09\0\0\0\0\0\0\0\xd5\0\0\0\x0b\0\0\0\x40\0\0\0\xe0\0\0\0\x0b\0\0\0\
+\x80\0\0\0\0\x02\0\0\0\0\0\0\x0a\0\0\0\xe8\x07\0\0\0\0\0\0\0\0\0\0\xf1\x08\0\0\
+\0\0\0\0\x0c\0\0\0\xf7\x01\0\0\0\0\0\0\x08\0\0\0\x40\0\0\x01\xc1\x04\0\0\x03\0\
+\0\0\x18\0\0\x01\xc9\0\0\0\x0e\0\0\0\0\0\0\x01\xcc\0\0\0\x11\0\0\0\x20\0\0\x01\
+\xd1\0\0\0\x0e\0\0\0\xa0\0\0\x01\xdd\x08\0\0\0\0\0\0\x0f\0\0\x01\xe3\x01\0\0\0\
+\0\0\0\x04\0\0\0\x20\0\0\x01\xf0\x01\0\0\0\0\0\0\x01\x01\0\0\x08\0\0\0\0\x03\0\
+\0\0\0\0\0\0\0\0\0\x10\0\0\0\x12\0\0\0\x10\0\0\x01\xf5\x01\0\0\0\0\0\0\x04\0\0\
+\0\x20\0\0\0\0\x0d\0\0\x01\0\0\0\x14\0\0\x05\x39\0\0\0\x04\0\0\x02\x3e\x08\0\0\
+\0\0\0\0\x15\0\0\x02\x44\x01\0\0\0\0\0\0\x08\x01\0\0\x40\0\0\x02\x4e\x0c\0\0\
+\x01\0\0\0\x13\0\0\0\0\x02\0\0\0\0\0\0\x18\0\0\x02\x65\x04\0\0\x02\0\0\0\x10\0\
+\0\0\x13\0\0\0\x03\0\0\0\0\0\0\x02\x78\0\0\0\x19\0\0\0\x40\0\0\0\0\x02\0\0\0\0\
+\0\0\x1c\0\0\0\0\x0d\0\0\x01\0\0\0\x06\0\0\0\x1c\0\0\0\x17\0\0\x02\x7d\x0c\0\0\
+\x01\0\0\0\x1a\0\0\x02\xc9\x04\0\0\x01\0\0\0\x08\0\0\x02\xd2\0\0\0\x1d\0\0\0\0\
+\0\0\0\0\x02\0\0\0\0\0\0\x1e\0\0\x03\x23\x04\0\0\x06\0\0\0\x38\0\0\x01\xc9\0\0\
+\0\x0e\0\0\0\0\0\0\x01\xcc\0\0\0\x11\0\0\0\x20\0\0\x03\x30\0\0\0\x1f\0\0\0\xc0\
+\0\0\x03\x41\0\0\0\x19\0\0\x01\0\0\0\x03\x4a\0\0\0\x21\0\0\x01\x40\0\0\x03\x54\
+\0\0\0\x22\0\0\x01\x80\0\0\0\0\x02\0\0\0\0\0\0\x20\0\0\0\0\x0a\0\0\0\0\0\0\x10\
+\0\0\0\0\x02\0\0\0\0\0\0\x23\0\0\0\0\x02\0\0\0\0\0\0\x24\0\0\x03\x9e\x04\0\0\
+\x02\0\0\0\x08\0\0\x03\xac\0\0\0\x0e\0\0\0\0\0\0\x03\xb5\0\0\0\x0e\0\0\0\x20\0\
+\0\x03\x54\x04\0\0\x03\0\0\0\x18\0\0\x03\xbf\0\0\0\x1f\0\0\0\0\0\0\x03\xc7\0\0\
+\0\x25\0\0\0\x40\0\0\x03\xcd\0\0\0\x27\0\0\0\x80\0\0\0\0\x02\0\0\0\0\0\0\x26\0\
+\0\0\0\x02\0\0\0\0\0\0\x28\0\0\x03\xd1\x04\0\0\x01\0\0\0\x04\0\0\x03\xdc\0\0\0\
+\x0e\0\0\0\0\0\0\x04\x45\x04\0\0\x01\0\0\0\x04\0\0\x04\x4e\0\0\0\x0e\0\0\0\0\0\
+\0\0\0\x03\0\0\0\0\0\0\0\0\0\0\x20\0\0\0\x12\0\0\0\x30\0\0\x04\xc4\x0e\0\0\0\0\
+\0\0\x29\0\0\0\0\0\0\0\0\x03\0\0\0\0\0\0\0\0\0\0\x20\0\0\0\x12\0\0\0\x1a\0\0\
+\x04\xd8\x0e\0\0\0\0\0\0\x2b\0\0\0\0\0\0\0\0\x03\0\0\0\0\0\0\0\0\0\0\x20\0\0\0\
+\x12\0\0\0\x20\0\0\x04\xee\x0e\0\0\0\0\0\0\x2d\0\0\0\0\0\0\0\0\x03\0\0\0\0\0\0\
+\0\0\0\0\x20\0\0\0\x12\0\0\0\x11\0\0\x05\x03\x0e\0\0\0\0\0\0\x2f\0\0\0\0\0\0\0\
+\0\x03\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\x12\0\0\0\x04\0\0\x05\x1a\x0e\0\0\0\0\0\0\
+\x31\0\0\0\x01\0\0\x05\x22\x0f\0\0\x01\0\0\0\x04\0\0\0\x36\0\0\0\0\0\0\0\x04\0\
+\0\x05\x29\x0f\0\0\x04\0\0\0\x7b\0\0\0\x2a\0\0\0\0\0\0\0\x30\0\0\0\x2c\0\0\0\
+\x30\0\0\0\x1a\0\0\0\x2e\0\0\0\x4a\0\0\0\x20\0\0\0\x30\0\0\0\x6a\0\0\0\x11\0\0\
+\x05\x31\x0f\0\0\x01\0\0\0\x04\0\0\0\x32\0\0\0\0\0\0\0\x04\0\0\x05\x39\x0e\0\0\
+\0\0\0\0\x06\0\0\0\x01\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\
+\x5f\x6d\x61\x70\0\x6d\x65\x74\x61\0\x6d\x61\x70\0\x63\x74\x78\0\x69\x6e\x74\0\
+\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x69\x74\x65\x72\x2f\x62\x70\
+\x66\x5f\x6d\x61\x70\0\x30\x3a\x30\0\x2f\x68\x6f\x6d\x65\x32\x2f\x69\x69\x69\
+\x2f\x6c\x69\x6e\x75\x78\x2d\x6b\x65\x72\x6e\x65\x6c\x2d\x74\x6f\x6f\x6c\x63\
+\x68\x61\x69\x6e\x2f\x73\x72\x63\x2f\x6c\x69\x6e\x75\x78\x2f\x6b\x65\x72\x6e\
+\x65\x6c\x2f\x62\x70\x66\x2f\x70\x72\x65\x6c\x6f\x61\x64\x2f\x69\x74\x65\x72\
+\x61\x74\x6f\x72\x73\x2f\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\
+\x2e\x63\0\x09\x73\x74\x72\x75\x63\x74\x20\x73\x65\x71\x5f\x66\x69\x6c\x65\x20\
+\x2a\x73\x65\x71\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\
+\x65\x71\x3b\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x6d\x65\x74\x61\0\x73\x65\
+\x71\0\x73\x65\x73\x73\x69\x6f\x6e\x5f\x69\x64\0\x73\x65\x71\x5f\x6e\x75\x6d\0\
+\x73\x65\x71\x5f\x66\x69\x6c\x65\0\x5f\x5f\x75\x36\x34\0\x75\x6e\x73\x69\x67\
+\x6e\x65\x64\x20\x6c\x6f\x6e\x67\x20\x6c\x6f\x6e\x67\0\x30\x3a\x31\0\x09\x73\
+\x74\x72\x75\x63\x74\x20\x62\x70\x66\x5f\x6d\x61\x70\x20\x2a\x6d\x61\x70\x20\
+\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x61\x70\x3b\0\x09\x69\x66\x20\x28\x21\x6d\x61\
+\x70\x29\0\x30\x3a\x32\0\x09\x5f\x5f\x75\x36\x34\x20\x73\x65\x71\x5f\x6e\x75\
+\x6d\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x5f\
+\x6e\x75\x6d\x3b\0\x09\x69\x66\x20\x28\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x3d\
+\x20\x30\x29\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\
+\x28\x73\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\
+\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\
+\x65\x73\x20\x20\x63\x75\x72\x5f\x65\x6e\x74\x72\x69\x65\x73\x5c\x6e\x22\x29\
+\x3b\0\x62\x70\x66\x5f\x6d\x61\x70\0\x69\x64\0\x6e\x61\x6d\x65\0\x6d\x61\x78\
+\x5f\x65\x6e\x74\x72\x69\x65\x73\0\x5f\x5f\x75\x33\x32\0\x75\x6e\x73\x69\x67\
+\x6e\x65\x64\x20\x69\x6e\x74\0\x63\x68\x61\x72\0\x5f\x5f\x41\x52\x52\x41\x59\
+\x5f\x53\x49\x5a\x45\x5f\x54\x59\x50\x45\x5f\x5f\0\x09\x42\x50\x46\x5f\x53\x45\
+\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\x34\x75\x20\
+\x25\x2d\x31\x36\x73\x20\x20\x25\x31\x30\x64\x20\x20\x20\x25\x31\x30\x6c\x6c\
+\x64\x5c\x6e\x22\x2c\0\x7d\0\x5f\x5f\x73\x36\x34\0\x6c\x6f\x6e\x67\x20\x6c\x6f\
+\x6e\x67\0\x62\x70\x66\x5f\x6d\x61\x70\x5f\x73\x75\x6d\x5f\x65\x6c\x65\x6d\x5f\
+\x63\x6f\x75\x6e\x74\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\
+\x70\x72\x6f\x67\0\x70\x72\x6f\x67\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\
+\x72\x6f\x67\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x09\x73\
+\x74\x72\x75\x63\x74\x20\x62\x70\x66\x5f\x70\x72\x6f\x67\x20\x2a\x70\x72\x6f\
+\x67\x20\x3d\x20\x63\x74\x78\x2d\x3e\x70\x72\x6f\x67\x3b\0\x09\x69\x66\x20\x28\
+\x21\x70\x72\x6f\x67\x29\0\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x61\x75\x78\0\x09\
+\x61\x75\x78\x20\x3d\x20\x70\x72\x6f\x67\x2d\x3e\x61\x75\x78\x3b\0\x09\x09\x42\
+\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\
+\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\
+\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\x64\x5c\x6e\x22\x29\x3b\0\x62\x70\
+\x66\x5f\x70\x72\x6f\x67\x5f\x61\x75\x78\0\x61\x74\x74\x61\x63\x68\x5f\x66\x75\
+\x6e\x63\x5f\x6e\x61\x6d\x65\0\x64\x73\x74\x5f\x70\x72\x6f\x67\0\x66\x75\x6e\
+\x63\x5f\x69\x6e\x66\x6f\0\x62\x74\x66\0\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\
+\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\x34\x75\x20\x25\x2d\
+\x31\x36\x73\x20\x25\x73\x20\x25\x73\x5c\x6e\x22\x2c\x20\x61\x75\x78\x2d\x3e\
+\x69\x64\x2c\0\x30\x3a\x34\0\x30\x3a\x35\0\x09\x69\x66\x20\x28\x21\x62\x74\x66\
+\x29\0\x62\x70\x66\x5f\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x69\x6e\x73\x6e\
+\x5f\x6f\x66\x66\0\x74\x79\x70\x65\x5f\x69\x64\0\x30\0\x73\x74\x72\x69\x6e\x67\
+\x73\0\x74\x79\x70\x65\x73\0\x68\x64\x72\0\x62\x74\x66\x5f\x68\x65\x61\x64\x65\
+\x72\0\x73\x74\x72\x5f\x6c\x65\x6e\0\x09\x74\x79\x70\x65\x73\x20\x3d\x20\x62\
+\x74\x66\x2d\x3e\x74\x79\x70\x65\x73\x3b\0\x09\x62\x70\x66\x5f\x70\x72\x6f\x62\
+\x65\x5f\x72\x65\x61\x64\x5f\x6b\x65\x72\x6e\x65\x6c\x28\x26\x74\x2c\x20\x73\
+\x69\x7a\x65\x6f\x66\x28\x74\x29\x2c\x20\x74\x79\x70\x65\x73\x20\x2b\x20\x62\
+\x74\x66\x5f\x69\x64\x29\x3b\0\x09\x73\x74\x72\x20\x3d\x20\x62\x74\x66\x2d\x3e\
+\x73\x74\x72\x69\x6e\x67\x73\x3b\0\x62\x74\x66\x5f\x74\x79\x70\x65\0\x6e\x61\
+\x6d\x65\x5f\x6f\x66\x66\0\x09\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3d\x20\x42\
+\x50\x46\x5f\x43\x4f\x52\x45\x5f\x52\x45\x41\x44\x28\x74\x2c\x20\x6e\x61\x6d\
+\x65\x5f\x6f\x66\x66\x29\x3b\0\x30\x3a\x32\x3a\x30\0\x09\x69\x66\x20\x28\x6e\
+\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3e\x3d\x20\x62\x74\x66\x2d\x3e\x68\x64\x72\
+\x2e\x73\x74\x72\x5f\x6c\x65\x6e\x29\0\x09\x72\x65\x74\x75\x72\x6e\x20\x73\x74\
+\x72\x20\x2b\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x3b\0\x30\x3a\x33\0\x64\x75\
+\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\
+\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x31\0\
+\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\
+\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\
+\x6d\x74\x2e\x32\0\x4c\x49\x43\x45\x4e\x53\x45\0\x2e\x6b\x73\x79\x6d\x73\0\x2e\
+\x72\x6f\x64\x61\x74\x61\0\x6c\x69\x63\x65\x6e\x73\x65\0\x64\x75\x6d\x6d\x79\
+\x5f\x6b\x73\x79\x6d\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x09\xdc\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\x02\0\0\0\x04\0\0\0\x7b\0\0\0\x01\0\0\0\x80\0\0\0\0\
+\0\0\0\0\x69\x74\x65\x72\x61\x74\x6f\x72\x2e\x72\x6f\x64\x61\x74\x61\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\x34\0\0\0\0\0\0\0\0\0\0\0\0\x20\x20\x69\x64\x20\x6e\x61\
+\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\
+\x65\x6e\x74\x72\x69\x65\x73\x20\x20\x63\x75\x72\x5f\x65\x6e\x74\x72\x69\x65\
+\x73\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x20\x25\x31\x30\x64\x20\x20\
+\x20\x25\x31\x30\x6c\x6c\x64\x0a\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\
+\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\x64\
+\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x0a\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\x79\x21\0\0\0\0\0\0\x79\x62\0\0\0\
+\0\0\0\x79\x71\0\x08\0\0\0\0\x15\x70\0\x1d\0\0\0\0\x79\x12\0\x10\0\0\0\0\x55\
+\x10\0\x08\0\0\0\0\xbf\x4a\0\0\0\0\0\0\x07\x40\0\0\xff\xff\xff\xe0\xbf\x16\0\0\
+\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xb4\x30\0\0\0\0\0\x30\xb4\x50\0\0\
+\0\0\0\0\x85\0\0\0\0\0\0\x7e\x61\x17\0\0\0\0\0\0\x7b\xa1\xff\xe0\0\0\0\0\xb7\
+\x10\0\0\0\0\0\x04\xbf\x27\0\0\0\0\0\0\x0f\x21\0\0\0\0\0\0\x7b\xa2\xff\xe8\0\0\
+\0\0\x61\x17\0\x14\0\0\0\0\x7b\xa1\xff\xf0\0\0\0\0\xbf\x17\0\0\0\0\0\0\x85\x02\
+\0\0\0\0\0\0\x7b\xa0\xff\xf8\0\0\0\0\xbf\x4a\0\0\0\0\0\0\x07\x40\0\0\xff\xff\
+\xff\xe0\xbf\x16\0\0\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\0\x30\xb4\x30\0\0\
+\0\0\0\x1a\xb4\x50\0\0\0\0\0\x20\x85\0\0\0\0\0\0\x7e\xb4\0\0\0\0\0\0\0\x95\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\x07\0\0\0\0\0\0\0\x42\0\0\0\x9b\0\x01\x44\x1e\0\0\0\
+\x01\0\0\0\x42\0\0\0\x9b\0\x01\x44\x24\0\0\0\x02\0\0\0\x42\0\0\x01\x0e\0\x01\
+\x4c\x1d\0\0\0\x03\0\0\0\x42\0\0\x01\x2f\0\x01\x54\x06\0\0\0\x04\0\0\0\x42\0\0\
+\x01\x3e\0\x01\x48\x1d\0\0\0\x05\0\0\0\x42\0\0\x01\x63\0\x01\x60\x0e\0\0\0\x08\
+\0\0\0\x42\0\0\x01\x76\0\x01\x64\x03\0\0\0\x0e\0\0\0\x42\0\0\x02\x09\0\x01\x6c\
+\x02\0\0\0\x21\0\0\0\x42\0\0\x02\x3c\0\x01\x80\x01\0\0\0\0\0\0\0\x02\0\0\0\x3e\
+\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\x02\0\0\x01\x0a\
+\0\0\0\0\0\0\0\x20\0\0\0\x08\0\0\x01\x3a\0\0\0\0\0\0\0\x70\0\0\0\x0d\0\0\0\x3e\
+\0\0\0\0\0\0\0\x80\0\0\0\x0d\0\0\x01\x0a\0\0\0\0\0\0\0\xa0\0\0\0\x0d\0\0\x01\
+\x3a\0\0\0\0\0\0\0\x1a\0\0\0\x23\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\
+\x70\0\0\0\0\0\0\0\0\0\0\0\x1c\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\
+\0\x10\0\0\0\0\0\0\0\0\0\0\0\x09\0\0\0\x01\0\0\0\0\0\0\0\x07\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x62\x70\
+\x66\x5f\x6d\x61\x70\0\0\0\0\0\0\0\0\x62\x70\x66\x5f\x6d\x61\x70\x5f\x73\x75\
+\x6d\x5f\x65\x6c\x65\x6d\x5f\x63\x6f\x75\x6e\x74\0\0\x47\x50\x4c\0\0\0\0\0\x79\
+\x21\0\0\0\0\0\0\x79\x62\0\0\0\0\0\0\x79\x11\0\x08\0\0\0\0\x15\x10\0\x3b\0\0\0\
+\0\x79\x71\0\0\0\0\0\0\x79\x12\0\x10\0\0\0\0\x55\x10\0\x08\0\0\0\0\xbf\x4a\0\0\
+\0\0\0\0\x07\x40\0\0\xff\xff\xff\xd0\xbf\x16\0\0\0\0\0\0\x18\x26\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\x4a\xb4\x30\0\0\0\0\0\x20\xb4\x50\0\0\0\0\0\0\x85\0\0\0\0\0\0\x7e\
+\x7b\xa6\xff\xc8\0\0\0\0\x61\x17\0\0\0\0\0\0\x7b\xa1\xff\xd0\0\0\0\0\xb7\x30\0\
+\0\0\0\0\x04\xbf\x97\0\0\0\0\0\0\x0f\x93\0\0\0\0\0\0\x79\x17\0\x28\0\0\0\0\x79\
+\x87\0\x30\0\0\0\0\x15\x80\0\x18\0\0\0\0\xb7\x20\0\0\0\0\0\0\x0f\x12\0\0\0\0\0\
+\0\x61\x11\0\x04\0\0\0\0\x79\x38\0\x08\0\0\0\0\x67\x10\0\0\0\0\0\x03\x0f\x31\0\
+\0\0\0\0\0\x79\x68\0\0\0\0\0\0\xbf\x1a\0\0\0\0\0\0\x07\x10\0\0\xff\xff\xff\xf8\
+\xb4\x20\0\0\0\0\0\x08\x85\0\0\0\0\0\0\x71\xb7\x10\0\0\0\0\0\0\x79\x3a\xff\xf8\
+\0\0\0\0\x0f\x31\0\0\0\0\0\0\xbf\x1a\0\0\0\0\0\0\x07\x10\0\0\xff\xff\xff\xf4\
+\xb4\x20\0\0\0\0\0\x04\x85\0\0\0\0\0\0\x71\xb7\x30\0\0\0\0\0\x04\x61\x1a\xff\
+\xf4\0\0\0\0\x61\x28\0\x10\0\0\0\0\x3e\x12\0\x02\0\0\0\0\x0f\x61\0\0\0\0\0\0\
+\xbf\x96\0\0\0\0\0\0\x7b\xa9\xff\xd8\0\0\0\0\x79\x17\0\x18\0\0\0\0\x7b\xa1\xff\
+\xe0\0\0\0\0\x79\x17\0\x20\0\0\0\0\x79\x11\0\0\0\0\0\0\x0f\x13\0\0\0\0\0\0\x7b\
+\xa1\xff\xe8\0\0\0\0\xbf\x4a\0\0\0\0\0\0\x07\x40\0\0\xff\xff\xff\xd0\x79\x1a\
+\xff\xc8\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\0\x6a\xb4\x30\0\0\0\0\0\x11\
+\xb4\x50\0\0\0\0\0\x20\x85\0\0\0\0\0\0\x7e\xb4\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\x1b\0\0\0\0\0\0\0\x42\0\0\0\x9b\0\x01\x94\x1e\0\0\0\x01\0\0\0\
+\x42\0\0\0\x9b\0\x01\x94\x24\0\0\0\x02\0\0\0\x42\0\0\x02\x99\0\x01\x9c\x1f\0\0\
+\0\x03\0\0\0\x42\0\0\x02\xbd\0\x01\xa8\x06\0\0\0\x04\0\0\0\x42\0\0\x02\xd6\0\
+\x01\xb4\x0e\0\0\0\x05\0\0\0\x42\0\0\x01\x3e\0\x01\x98\x1d\0\0\0\x06\0\0\0\x42\
+\0\0\x01\x63\0\x01\xb8\x0e\0\0\0\x09\0\0\0\x42\0\0\x02\xe8\0\x01\xbc\x03\0\0\0\
+\x10\0\0\0\x42\0\0\x03\x58\0\x01\xc4\x02\0\0\0\x17\0\0\0\x42\0\0\x03\x93\0\x01\
+\x04\x06\0\0\0\x1a\0\0\0\x42\0\0\x03\x58\0\x01\xc4\x02\0\0\0\x1b\0\0\0\x42\0\0\
+\x03\xe4\0\x01\x10\x0f\0\0\0\x1c\0\0\0\x42\0\0\x03\xf9\0\x01\x14\x2d\0\0\0\x1e\
+\0\0\0\x42\0\0\x04\x30\0\x01\x0c\x0d\0\0\0\x21\0\0\0\x42\0\0\x03\xf9\0\x01\x14\
+\x02\0\0\0\x24\0\0\0\x42\0\0\x04\x57\0\x01\x18\x0d\0\0\0\x2b\0\0\0\x42\0\0\x04\
+\x57\0\x01\x18\x0d\0\0\0\x2c\0\0\0\x42\0\0\x04\x85\0\x01\x1c\x1b\0\0\0\x2d\0\0\
+\0\x42\0\0\x04\x85\0\x01\x1c\x0f\0\0\0\x2e\0\0\0\x42\0\0\x04\xa8\0\x01\x24\x0d\
+\0\0\0\x30\0\0\0\x42\0\0\x03\x58\0\x01\xc4\x02\0\0\0\x3f\0\0\0\x42\0\0\x02\x3c\
+\0\x01\xd4\x01\0\0\0\0\0\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\
+\x3e\0\0\0\0\0\0\0\x10\0\0\0\x18\0\0\x01\x0a\0\0\0\0\0\0\0\x20\0\0\0\x1c\0\0\0\
+\x3e\0\0\0\0\0\0\0\x28\0\0\0\x08\0\0\x01\x3a\0\0\0\0\0\0\0\x80\0\0\0\x1e\0\0\0\
+\x3e\0\0\0\0\0\0\0\x90\0\0\0\x1e\0\0\x01\x0a\0\0\0\0\0\0\0\xa8\0\0\0\x1e\0\0\
+\x03\x8b\0\0\0\0\0\0\0\xb0\0\0\0\x1e\0\0\x03\x8f\0\0\0\0\0\0\0\xc0\0\0\0\x23\0\
+\0\x03\xbd\0\0\0\0\0\0\0\xd8\0\0\0\x24\0\0\x01\x0a\0\0\0\0\0\0\0\xf0\0\0\0\x24\
+\0\0\0\x3e\0\0\0\0\0\0\x01\x18\0\0\0\x28\0\0\0\x3e\0\0\0\0\0\0\x01\x50\0\0\0\
+\x1e\0\0\x01\x0a\0\0\0\0\0\0\x01\x60\0\0\0\x24\0\0\x04\x7f\0\0\0\0\0\0\x01\x88\
+\0\0\0\x1e\0\0\x01\x3a\0\0\0\0\0\0\x01\x98\0\0\0\x1e\0\0\x04\xc0\0\0\0\0\0\0\
+\x01\xa0\0\0\0\x1c\0\0\0\x3e\0\0\0\0\0\0\0\x1a\0\0\0\x41\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x64\x75\x6d\x70\x5f\
+\x62\x70\x66\x5f\x70\x72\x6f\x67\0\0\0\0\0\0\0\0\0\0\x1c\0\0\0\0\0\0\0\x08\0\0\
+\0\0\0\0\0\0\0\0\0\x01\0\0\0\x10\0\0\0\0\0\0\0\0\0\0\0\x16\0\0\0\x01\0\0\0\0\0\
+\0\0\x12\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\x62\x70\x66\x5f\x69\
+\x74\x65\x72\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\0\0\0\0\0\0";
+ static const char opts_insn[] __attribute__((__aligned__(8))) = "\
+\xbf\x61\0\0\0\0\0\0\xbf\x1a\0\0\0\0\0\0\x07\x10\0\0\xff\xff\xff\x78\xb7\x20\0\
+\0\0\0\0\x88\xb7\x30\0\0\0\0\0\0\x85\0\0\0\0\0\0\x71\x05\0\0\x14\0\0\0\0\x61\
+\x1a\xff\x78\0\0\0\0\xd5\x10\0\x01\0\0\0\0\x85\0\0\0\0\0\0\xa8\x61\x1a\xff\x7c\
+\0\0\0\0\xd5\x10\0\x01\0\0\0\0\x85\0\0\0\0\0\0\xa8\x61\x1a\xff\x80\0\0\0\0\xd5\
+\x10\0\x01\0\0\0\0\x85\0\0\0\0\0\0\xa8\x61\x1a\xff\x84\0\0\0\0\xd5\x10\0\x01\0\
+\0\0\0\x85\0\0\0\0\0\0\xa8\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x61\x10\0\0\0\0\
+\0\0\xd5\x10\0\x02\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa8\xbf\x07\0\0\
+\0\0\0\0\x95\0\0\0\0\0\0\0\x61\x06\0\x08\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\
+\0\x0e\xf8\x63\x10\0\0\0\0\0\0\x61\x06\0\x0c\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\
+\0\0\0\x0e\xf4\x63\x10\0\0\0\0\0\0\x79\x06\0\x10\0\0\0\0\x18\x16\0\0\0\0\0\0\0\
+\0\0\0\0\0\x0e\xe8\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x05\0\
+\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0e\xe0\x7b\x10\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\
+\x12\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x0e\xe0\xb7\x30\0\0\0\0\0\x1c\x85\0\0\0\0\
+\0\0\xa6\xbf\x70\0\0\0\0\0\0\xc5\x70\xff\xd4\0\0\0\0\x63\xa7\xff\x78\0\0\0\0\
+\x61\x0a\xff\x78\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x30\x63\x10\0\0\0\
+\0\0\0\x61\x06\0\x1c\0\0\0\0\x15\0\0\x03\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\
+\0\x0f\x0c\x63\x10\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\
+\0\0\x0f\0\xb7\x30\0\0\0\0\0\x48\x85\0\0\0\0\0\0\xa6\xbf\x70\0\0\0\0\0\0\xc5\
+\x70\xff\xc3\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x63\x17\0\0\0\0\0\0\
+\x79\x36\0\x20\0\0\0\0\x15\x30\0\x08\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\
+\x0f\x48\xb7\x20\0\0\0\0\0\x7b\x61\x06\0\x04\0\0\0\0\x45\0\0\x02\0\0\0\x01\x85\
+\0\0\0\0\0\0\x94\x05\0\0\x01\0\0\0\0\x85\0\0\0\0\0\0\x71\x18\x26\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\x61\x02\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0f\xd0\x63\
+\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x0f\xc8\x18\x16\0\0\0\0\0\0\0\
+\0\0\0\0\0\x0f\xd8\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x48\
+\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0f\xe0\x7b\x10\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\
+\x02\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x0f\xd0\xb7\x30\0\0\0\0\0\x20\x85\0\0\0\0\
+\0\0\xa6\xbf\x70\0\0\0\0\0\0\xc5\x70\xff\x9f\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\x61\x02\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0f\xf0\x63\x10\
+\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\x16\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x0f\xf0\xb7\
+\x30\0\0\0\0\0\x04\x85\0\0\0\0\0\0\xa6\xbf\x70\0\0\0\0\0\0\xc5\x70\xff\x92\0\0\
+\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x0f\xf8\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\
+\x12\x30\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x10\0\x18\x16\0\0\
+\0\0\0\0\0\0\0\0\0\0\x12\x28\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\
+\0\x11\x18\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x12\x70\x7b\x10\0\0\0\0\0\0\x18\x06\
+\0\0\0\0\0\0\0\0\0\0\0\0\x11\x20\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x12\x80\x7b\
+\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x11\xb0\x18\x16\0\0\0\0\0\0\0\
+\0\0\0\0\0\x12\xa0\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x18\
+\x16\0\0\0\0\0\0\0\0\0\0\0\0\x12\x98\x7b\x10\0\0\0\0\0\0\x61\x06\0\x08\0\0\0\0\
+\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x12\x38\x63\x10\0\0\0\0\0\0\x61\x06\0\x0c\0\0\
+\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x12\x3c\x63\x10\0\0\0\0\0\0\x79\x06\0\x10\
+\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x12\x40\x7b\x10\0\0\0\0\0\0\x61\x0a\
+\xff\x78\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x12\x68\x63\x10\0\0\0\0\0\0\
+\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x12\xb0\xb7\x20\0\0\0\0\0\x11\xb7\x30\0\0\0\0\
+\0\x0c\xb7\x40\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa7\xbf\x70\0\0\0\0\0\0\xc5\x70\xff\
+\x5c\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x12\x20\x63\x07\0\x6c\0\0\0\0\x77\
+\x70\0\0\0\0\0\x20\x63\x07\0\x70\0\0\0\0\x18\x86\0\0\0\0\0\0\0\0\0\0\0\0\x10\
+\xb8\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x12\xc8\xb7\x20\0\0\0\0\0\x17\xb7\x30\0\0\
+\0\0\0\x0c\xb7\x40\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa7\xbf\x70\0\0\0\0\0\0\xc5\x70\
+\xff\x4d\0\0\0\0\x75\x70\0\x03\0\0\0\0\x62\x80\0\x04\0\0\0\0\x6a\x80\0\x02\0\0\
+\0\0\x05\0\0\x0a\0\0\0\0\x63\x87\0\x04\0\0\0\0\xbf\x97\0\0\0\0\0\0\x77\x90\0\0\
+\0\0\0\x20\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\x63\x09\0\0\0\0\0\0\x55\x90\0\
+\x02\0\0\0\0\x6a\x80\0\x02\0\0\0\0\x05\0\0\x01\0\0\0\0\x6a\x80\0\x02\0\0\0\x40\
+\xb7\x10\0\0\0\0\0\x05\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x12\x20\xb7\x30\0\0\0\0\
+\0\x8c\x85\0\0\0\0\0\0\xa6\xbf\x70\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\
+\x01\0\x61\x10\0\0\0\0\0\0\xd5\x10\0\x02\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\
+\0\0\0\xa8\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x12\x90\x61\x10\0\0\0\0\0\0\xd5\x10\
+\0\x02\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa8\xc5\x70\xff\x2c\0\0\0\0\
+\x63\xa7\xff\x80\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x12\xe0\x18\x16\0\0\0\
+\0\0\0\0\0\0\0\0\0\x17\x88\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\
+\x12\xe8\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x80\x7b\x10\0\0\0\0\0\0\x18\x06\0\
+\0\0\0\0\0\0\0\0\0\0\0\x14\xf0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\xc8\x7b\x10\
+\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x14\xf8\x18\x16\0\0\0\0\0\0\0\0\0\
+\0\0\0\x17\xd8\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x16\x58\x18\
+\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\xf8\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\xf0\x7b\x10\0\0\0\0\0\0\x61\
+\x06\0\x08\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x90\x63\x10\0\0\0\0\0\0\
+\x61\x06\0\x0c\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x94\x63\x10\0\0\0\0\
+\0\0\x79\x06\0\x10\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x98\x7b\x10\0\0\
+\0\0\0\0\x61\x0a\xff\x78\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\xc0\x63\
+\x10\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x18\x08\xb7\x20\0\0\0\0\0\x12\
+\xb7\x30\0\0\0\0\0\x0c\xb7\x40\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa7\xbf\x70\0\0\0\0\
+\0\0\xc5\x70\xfe\xf5\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x17\x78\x63\x07\0\
+\x6c\0\0\0\0\x77\x70\0\0\0\0\0\x20\x63\x07\0\x70\0\0\0\0\xb7\x10\0\0\0\0\0\x05\
+\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x17\x78\xb7\x30\0\0\0\0\0\x8c\x85\0\0\0\0\0\0\
+\xa6\xbf\x70\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x17\xe8\x61\x10\0\0\0\
+\0\0\0\xd5\x10\0\x02\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa8\xc5\x70\
+\xfe\xe3\0\0\0\0\x63\xa7\xff\x84\0\0\0\0\x61\x1a\xff\x78\0\0\0\0\xd5\x10\0\x02\
+\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa8\x61\x0a\xff\x80\0\0\0\0\x63\
+\x60\0\x28\0\0\0\0\x61\x0a\xff\x84\0\0\0\0\x63\x60\0\x2c\0\0\0\0\x18\x16\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\x61\x01\0\0\0\0\0\0\x63\x60\0\x18\0\0\0\0\xb7\0\0\0\0\0\
+\0\0\x95\0\0\0\0\0\0\0";
+
+ opts.ctx = (struct bpf_loader_ctx *)skel;
+ opts.data_sz = sizeof(opts_data) - 1;
+ opts.data = (void *)opts_data;
+ opts.insns_sz = sizeof(opts_insn) - 1;
+ opts.insns = (void *)opts_insn;
+
+ err = bpf_load_and_run(&opts);
+ if (err < 0)
+ return err;
+ return 0;
+}
+
+static inline struct iterators_bpf *
+iterators_bpf__open_and_load(void)
+{
+ struct iterators_bpf *skel;
+
+ skel = iterators_bpf__open();
+ if (!skel)
+ return NULL;
+ if (iterators_bpf__load(skel)) {
+ iterators_bpf__destroy(skel);
+ return NULL;
+ }
+ return skel;
+}
+
+__attribute__((unused)) static void
+iterators_bpf__assert(struct iterators_bpf *s __attribute__((unused)))
+{
+#ifdef __cplusplus
+#define _Static_assert static_assert
+#endif
+#ifdef __cplusplus
+#undef _Static_assert
+#endif
+}
+
+#endif /* __ITERATORS_BPF_SKEL_H__ */
diff --git a/kernel/bpf/preload/iterators/iterators.lskel-little-endian.h b/kernel/bpf/preload/iterators/iterators.lskel-little-endian.h
new file mode 100644
index 000000000000..5b98ab02025e
--- /dev/null
+++ b/kernel/bpf/preload/iterators/iterators.lskel-little-endian.h
@@ -0,0 +1,435 @@
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
+/* THIS FILE IS AUTOGENERATED BY BPFTOOL! */
+#ifndef __ITERATORS_BPF_SKEL_H__
+#define __ITERATORS_BPF_SKEL_H__
+
+#include <bpf/skel_internal.h>
+
+struct iterators_bpf {
+ struct bpf_loader_ctx ctx;
+ struct {
+ struct bpf_map_desc rodata;
+ } maps;
+ struct {
+ struct bpf_prog_desc dump_bpf_map;
+ struct bpf_prog_desc dump_bpf_prog;
+ } progs;
+ struct {
+ int dump_bpf_map_fd;
+ int dump_bpf_prog_fd;
+ } links;
+};
+
+static inline int
+iterators_bpf__dump_bpf_map__attach(struct iterators_bpf *skel)
+{
+ int prog_fd = skel->progs.dump_bpf_map.prog_fd;
+ int fd = skel_link_create(prog_fd, 0, BPF_TRACE_ITER);
+
+ if (fd > 0)
+ skel->links.dump_bpf_map_fd = fd;
+ return fd;
+}
+
+static inline int
+iterators_bpf__dump_bpf_prog__attach(struct iterators_bpf *skel)
+{
+ int prog_fd = skel->progs.dump_bpf_prog.prog_fd;
+ int fd = skel_link_create(prog_fd, 0, BPF_TRACE_ITER);
+
+ if (fd > 0)
+ skel->links.dump_bpf_prog_fd = fd;
+ return fd;
+}
+
+static inline int
+iterators_bpf__attach(struct iterators_bpf *skel)
+{
+ int ret = 0;
+
+ ret = ret < 0 ? ret : iterators_bpf__dump_bpf_map__attach(skel);
+ ret = ret < 0 ? ret : iterators_bpf__dump_bpf_prog__attach(skel);
+ return ret < 0 ? ret : 0;
+}
+
+static inline void
+iterators_bpf__detach(struct iterators_bpf *skel)
+{
+ skel_closenz(skel->links.dump_bpf_map_fd);
+ skel_closenz(skel->links.dump_bpf_prog_fd);
+}
+static void
+iterators_bpf__destroy(struct iterators_bpf *skel)
+{
+ if (!skel)
+ return;
+ iterators_bpf__detach(skel);
+ skel_closenz(skel->progs.dump_bpf_map.prog_fd);
+ skel_closenz(skel->progs.dump_bpf_prog.prog_fd);
+ skel_closenz(skel->maps.rodata.map_fd);
+ skel_free(skel);
+}
+static inline struct iterators_bpf *
+iterators_bpf__open(void)
+{
+ struct iterators_bpf *skel;
+
+ skel = skel_alloc(sizeof(*skel));
+ if (!skel)
+ goto cleanup;
+ skel->ctx.sz = (void *)&skel->links - (void *)skel;
+ return skel;
+cleanup:
+ iterators_bpf__destroy(skel);
+ return NULL;
+}
+
+static inline int
+iterators_bpf__load(struct iterators_bpf *skel)
+{
+ struct bpf_load_and_run_opts opts = {};
+ int err;
+
+ opts.ctx = (struct bpf_loader_ctx *)skel;
+ opts.data_sz = 6208;
+ opts.data = (void *)"\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x9f\xeb\x01\0\
+\x18\0\0\0\0\0\0\0\x80\x04\0\0\x80\x04\0\0\x31\x05\0\0\0\0\0\0\0\0\0\x02\x02\0\
+\0\0\x01\0\0\0\x02\0\0\x04\x10\0\0\0\x13\0\0\0\x03\0\0\0\0\0\0\0\x18\0\0\0\x04\
+\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\x08\0\0\0\0\0\0\0\0\0\0\x02\x0d\0\0\0\0\0\0\
+\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x01\0\0\0\x20\0\0\0\0\0\0\x01\x04\0\0\0\x20\
+\0\0\x01\x24\0\0\0\x01\0\0\x0c\x05\0\0\0\xb0\0\0\0\x03\0\0\x04\x18\0\0\0\xbe\0\
+\0\0\x09\0\0\0\0\0\0\0\xc2\0\0\0\x0b\0\0\0\x40\0\0\0\xcd\0\0\0\x0b\0\0\0\x80\0\
+\0\0\0\0\0\0\0\0\0\x02\x0a\0\0\0\xd5\0\0\0\0\0\0\x07\0\0\0\0\xde\0\0\0\0\0\0\
+\x08\x0c\0\0\0\xe4\0\0\0\0\0\0\x01\x08\0\0\0\x40\0\0\0\xae\x01\0\0\x03\0\0\x04\
+\x18\0\0\0\xb6\x01\0\0\x0e\0\0\0\0\0\0\0\xb9\x01\0\0\x11\0\0\0\x20\0\0\0\xbe\
+\x01\0\0\x0e\0\0\0\xa0\0\0\0\xca\x01\0\0\0\0\0\x08\x0f\0\0\0\xd0\x01\0\0\0\0\0\
+\x01\x04\0\0\0\x20\0\0\0\xdd\x01\0\0\0\0\0\x01\x01\0\0\0\x08\0\0\x01\0\0\0\0\0\
+\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x10\0\0\0\xe2\x01\0\0\0\0\0\x01\x04\0\0\0\
+\x20\0\0\0\0\0\0\0\x01\0\0\x0d\x14\0\0\0\x26\x05\0\0\x04\0\0\0\x2b\x02\0\0\0\0\
+\0\x08\x15\0\0\0\x31\x02\0\0\0\0\0\x01\x08\0\0\0\x40\0\0\x01\x3b\x02\0\0\x01\0\
+\0\x0c\x13\0\0\0\0\0\0\0\0\0\0\x02\x18\0\0\0\x52\x02\0\0\x02\0\0\x04\x10\0\0\0\
+\x13\0\0\0\x03\0\0\0\0\0\0\0\x65\x02\0\0\x19\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\
+\x1c\0\0\0\0\0\0\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x17\0\0\0\x6a\x02\0\0\x01\0\
+\0\x0c\x1a\0\0\0\xb6\x02\0\0\x01\0\0\x04\x08\0\0\0\xbf\x02\0\0\x1d\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\x02\x1e\0\0\0\x10\x03\0\0\x06\0\0\x04\x38\0\0\0\xb6\x01\0\0\
+\x0e\0\0\0\0\0\0\0\xb9\x01\0\0\x11\0\0\0\x20\0\0\0\x1d\x03\0\0\x1f\0\0\0\xc0\0\
+\0\0\x2e\x03\0\0\x19\0\0\0\0\x01\0\0\x37\x03\0\0\x21\0\0\0\x40\x01\0\0\x41\x03\
+\0\0\x22\0\0\0\x80\x01\0\0\0\0\0\0\0\0\0\x02\x20\0\0\0\0\0\0\0\0\0\0\x0a\x10\0\
+\0\0\0\0\0\0\0\0\0\x02\x23\0\0\0\0\0\0\0\0\0\0\x02\x24\0\0\0\x8b\x03\0\0\x02\0\
+\0\x04\x08\0\0\0\x99\x03\0\0\x0e\0\0\0\0\0\0\0\xa2\x03\0\0\x0e\0\0\0\x20\0\0\0\
+\x41\x03\0\0\x03\0\0\x04\x18\0\0\0\xac\x03\0\0\x1f\0\0\0\0\0\0\0\xb4\x03\0\0\
+\x25\0\0\0\x40\0\0\0\xba\x03\0\0\x27\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\x02\x26\0\0\
+\0\0\0\0\0\0\0\0\x02\x28\0\0\0\xbe\x03\0\0\x01\0\0\x04\x04\0\0\0\xc9\x03\0\0\
+\x0e\0\0\0\0\0\0\0\x32\x04\0\0\x01\0\0\x04\x04\0\0\0\x3b\x04\0\0\x0e\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x20\0\0\0\x12\0\0\0\x30\0\0\0\xb1\x04\0\0\0\0\0\
+\x0e\x29\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x20\0\0\0\x12\0\0\0\x1a\0\0\0\
+\xc5\x04\0\0\0\0\0\x0e\x2b\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x20\0\0\0\
+\x12\0\0\0\x20\0\0\0\xdb\x04\0\0\0\0\0\x0e\x2d\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\
+\0\0\0\0\x20\0\0\0\x12\0\0\0\x11\0\0\0\xf0\x04\0\0\0\0\0\x0e\x2f\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x04\0\0\0\x07\x05\0\0\0\0\0\x0e\
+\x31\0\0\0\x01\0\0\0\x0f\x05\0\0\x01\0\0\x0f\x04\0\0\0\x36\0\0\0\0\0\0\0\x04\0\
+\0\0\x16\x05\0\0\x04\0\0\x0f\x7b\0\0\0\x2a\0\0\0\0\0\0\0\x30\0\0\0\x2c\0\0\0\
+\x30\0\0\0\x1a\0\0\0\x2e\0\0\0\x4a\0\0\0\x20\0\0\0\x30\0\0\0\x6a\0\0\0\x11\0\0\
+\0\x1e\x05\0\0\x01\0\0\x0f\x04\0\0\0\x32\0\0\0\0\0\0\0\x04\0\0\0\x26\x05\0\0\0\
+\0\0\x0e\x06\0\0\0\x01\0\0\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\
+\x66\x5f\x6d\x61\x70\0\x6d\x65\x74\x61\0\x6d\x61\x70\0\x63\x74\x78\0\x69\x6e\
+\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x69\x74\x65\x72\x2f\
+\x62\x70\x66\x5f\x6d\x61\x70\0\x30\x3a\x30\0\x2f\x68\x6f\x6d\x65\x2f\x61\x73\
+\x70\x73\x6b\x2f\x73\x72\x63\x2f\x62\x70\x66\x2d\x6e\x65\x78\x74\x2f\x6b\x65\
+\x72\x6e\x65\x6c\x2f\x62\x70\x66\x2f\x70\x72\x65\x6c\x6f\x61\x64\x2f\x69\x74\
+\x65\x72\x61\x74\x6f\x72\x73\x2f\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\
+\x70\x66\x2e\x63\0\x09\x73\x74\x72\x75\x63\x74\x20\x73\x65\x71\x5f\x66\x69\x6c\
+\x65\x20\x2a\x73\x65\x71\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\
+\x3e\x73\x65\x71\x3b\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x6d\x65\x74\x61\0\
+\x73\x65\x71\0\x73\x65\x73\x73\x69\x6f\x6e\x5f\x69\x64\0\x73\x65\x71\x5f\x6e\
+\x75\x6d\0\x73\x65\x71\x5f\x66\x69\x6c\x65\0\x5f\x5f\x75\x36\x34\0\x75\x6e\x73\
+\x69\x67\x6e\x65\x64\x20\x6c\x6f\x6e\x67\x20\x6c\x6f\x6e\x67\0\x30\x3a\x31\0\
+\x09\x73\x74\x72\x75\x63\x74\x20\x62\x70\x66\x5f\x6d\x61\x70\x20\x2a\x6d\x61\
+\x70\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x61\x70\x3b\0\x09\x69\x66\x20\x28\x21\
+\x6d\x61\x70\x29\0\x30\x3a\x32\0\x09\x5f\x5f\x75\x36\x34\x20\x73\x65\x71\x5f\
+\x6e\x75\x6d\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\
+\x71\x5f\x6e\x75\x6d\x3b\0\x09\x69\x66\x20\x28\x73\x65\x71\x5f\x6e\x75\x6d\x20\
+\x3d\x3d\x20\x30\x29\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\
+\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\
+\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\
+\x72\x69\x65\x73\x20\x20\x63\x75\x72\x5f\x65\x6e\x74\x72\x69\x65\x73\x5c\x6e\
+\x22\x29\x3b\0\x62\x70\x66\x5f\x6d\x61\x70\0\x69\x64\0\x6e\x61\x6d\x65\0\x6d\
+\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\0\x5f\x5f\x75\x33\x32\0\x75\x6e\x73\
+\x69\x67\x6e\x65\x64\x20\x69\x6e\x74\0\x63\x68\x61\x72\0\x5f\x5f\x41\x52\x52\
+\x41\x59\x5f\x53\x49\x5a\x45\x5f\x54\x59\x50\x45\x5f\x5f\0\x09\x42\x50\x46\x5f\
+\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\x34\
+\x75\x20\x25\x2d\x31\x36\x73\x20\x20\x25\x31\x30\x64\x20\x20\x20\x25\x31\x30\
+\x6c\x6c\x64\x5c\x6e\x22\x2c\0\x7d\0\x5f\x5f\x73\x36\x34\0\x6c\x6f\x6e\x67\x20\
+\x6c\x6f\x6e\x67\0\x62\x70\x66\x5f\x6d\x61\x70\x5f\x73\x75\x6d\x5f\x65\x6c\x65\
+\x6d\x5f\x63\x6f\x75\x6e\x74\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\
+\x66\x5f\x70\x72\x6f\x67\0\x70\x72\x6f\x67\0\x64\x75\x6d\x70\x5f\x62\x70\x66\
+\x5f\x70\x72\x6f\x67\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\
+\x09\x73\x74\x72\x75\x63\x74\x20\x62\x70\x66\x5f\x70\x72\x6f\x67\x20\x2a\x70\
+\x72\x6f\x67\x20\x3d\x20\x63\x74\x78\x2d\x3e\x70\x72\x6f\x67\x3b\0\x09\x69\x66\
+\x20\x28\x21\x70\x72\x6f\x67\x29\0\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x61\x75\
+\x78\0\x09\x61\x75\x78\x20\x3d\x20\x70\x72\x6f\x67\x2d\x3e\x61\x75\x78\x3b\0\
+\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\
+\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\
+\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\x64\x5c\x6e\x22\x29\
+\x3b\0\x62\x70\x66\x5f\x70\x72\x6f\x67\x5f\x61\x75\x78\0\x61\x74\x74\x61\x63\
+\x68\x5f\x66\x75\x6e\x63\x5f\x6e\x61\x6d\x65\0\x64\x73\x74\x5f\x70\x72\x6f\x67\
+\0\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x62\x74\x66\0\x09\x42\x50\x46\x5f\x53\
+\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\x34\x75\
+\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x5c\x6e\x22\x2c\x20\x61\x75\
+\x78\x2d\x3e\x69\x64\x2c\0\x30\x3a\x34\0\x30\x3a\x35\0\x09\x69\x66\x20\x28\x21\
+\x62\x74\x66\x29\0\x62\x70\x66\x5f\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x69\
+\x6e\x73\x6e\x5f\x6f\x66\x66\0\x74\x79\x70\x65\x5f\x69\x64\0\x30\0\x73\x74\x72\
+\x69\x6e\x67\x73\0\x74\x79\x70\x65\x73\0\x68\x64\x72\0\x62\x74\x66\x5f\x68\x65\
+\x61\x64\x65\x72\0\x73\x74\x72\x5f\x6c\x65\x6e\0\x09\x74\x79\x70\x65\x73\x20\
+\x3d\x20\x62\x74\x66\x2d\x3e\x74\x79\x70\x65\x73\x3b\0\x09\x62\x70\x66\x5f\x70\
+\x72\x6f\x62\x65\x5f\x72\x65\x61\x64\x5f\x6b\x65\x72\x6e\x65\x6c\x28\x26\x74\
+\x2c\x20\x73\x69\x7a\x65\x6f\x66\x28\x74\x29\x2c\x20\x74\x79\x70\x65\x73\x20\
+\x2b\x20\x62\x74\x66\x5f\x69\x64\x29\x3b\0\x09\x73\x74\x72\x20\x3d\x20\x62\x74\
+\x66\x2d\x3e\x73\x74\x72\x69\x6e\x67\x73\x3b\0\x62\x74\x66\x5f\x74\x79\x70\x65\
+\0\x6e\x61\x6d\x65\x5f\x6f\x66\x66\0\x09\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\
+\x3d\x20\x42\x50\x46\x5f\x43\x4f\x52\x45\x5f\x52\x45\x41\x44\x28\x74\x2c\x20\
+\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x29\x3b\0\x30\x3a\x32\x3a\x30\0\x09\x69\x66\
+\x20\x28\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3e\x3d\x20\x62\x74\x66\x2d\x3e\
+\x68\x64\x72\x2e\x73\x74\x72\x5f\x6c\x65\x6e\x29\0\x09\x72\x65\x74\x75\x72\x6e\
+\x20\x73\x74\x72\x20\x2b\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x3b\0\x30\x3a\x33\
+\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\
+\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\
+\x2e\x31\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\
+\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\
+\x5f\x66\x6d\x74\x2e\x32\0\x4c\x49\x43\x45\x4e\x53\x45\0\x2e\x6b\x73\x79\x6d\
+\x73\0\x2e\x72\x6f\x64\x61\x74\x61\0\x6c\x69\x63\x65\x6e\x73\x65\0\x64\x75\x6d\
+\x6d\x79\x5f\x6b\x73\x79\x6d\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\xc9\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x02\0\0\0\x04\0\0\0\x7b\0\0\0\x01\0\0\0\
+\x80\0\0\0\0\0\0\0\0\0\0\0\x69\x74\x65\x72\x61\x74\x6f\x72\x2e\x72\x6f\x64\x61\
+\x74\x61\0\0\0\0\0\0\0\0\0\0\0\0\0\x34\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x20\x20\
+\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\
+\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x20\x20\x63\x75\x72\x5f\x65\
+\x6e\x74\x72\x69\x65\x73\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x20\x25\
+\x31\x30\x64\x20\x20\x20\x25\x31\x30\x6c\x6c\x64\x0a\0\x20\x20\x69\x64\x20\x6e\
+\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\
+\x61\x63\x68\x65\x64\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\
+\x25\x73\x0a\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\x79\x12\0\0\0\
+\0\0\0\x79\x26\0\0\0\0\0\0\x79\x17\x08\0\0\0\0\0\x15\x07\x1d\0\0\0\0\0\x79\x21\
+\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xe0\xff\
+\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xb7\x03\0\0\
+\x30\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\0\x61\x71\0\0\0\0\0\0\x7b\x1a\
+\xe0\xff\0\0\0\0\xb7\x01\0\0\x04\0\0\0\xbf\x72\0\0\0\0\0\0\x0f\x12\0\0\0\0\0\0\
+\x7b\x2a\xe8\xff\0\0\0\0\x61\x71\x14\0\0\0\0\0\x7b\x1a\xf0\xff\0\0\0\0\xbf\x71\
+\0\0\0\0\0\0\x85\x20\0\0\0\0\0\0\x7b\x0a\xf8\xff\0\0\0\0\xbf\xa4\0\0\0\0\0\0\
+\x07\x04\0\0\xe0\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\
+\x30\0\0\0\xb7\x03\0\0\x1a\0\0\0\xb7\x05\0\0\x20\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\
+\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\0\0\0\0\x07\0\0\0\0\0\0\0\x42\0\0\0\x88\0\0\0\
+\x1e\x44\x01\0\x01\0\0\0\x42\0\0\0\x88\0\0\0\x24\x44\x01\0\x02\0\0\0\x42\0\0\0\
+\xfb\0\0\0\x1d\x4c\x01\0\x03\0\0\0\x42\0\0\0\x1c\x01\0\0\x06\x54\x01\0\x04\0\0\
+\0\x42\0\0\0\x2b\x01\0\0\x1d\x48\x01\0\x05\0\0\0\x42\0\0\0\x50\x01\0\0\x06\x60\
+\x01\0\x07\0\0\0\x42\0\0\0\x63\x01\0\0\x03\x64\x01\0\x0e\0\0\0\x42\0\0\0\xf6\
+\x01\0\0\x02\x6c\x01\0\x21\0\0\0\x42\0\0\0\x29\x02\0\0\x01\x80\x01\0\0\0\0\0\
+\x02\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\
+\x02\0\0\0\xf7\0\0\0\0\0\0\0\x20\0\0\0\x08\0\0\0\x27\x01\0\0\0\0\0\0\x70\0\0\0\
+\x0d\0\0\0\x3e\0\0\0\0\0\0\0\x80\0\0\0\x0d\0\0\0\xf7\0\0\0\0\0\0\0\xa0\0\0\0\
+\x0d\0\0\0\x27\x01\0\0\0\0\0\0\x1a\0\0\0\x23\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x64\x75\x6d\x70\x5f\x62\
+\x70\x66\x5f\x6d\x61\x70\0\0\0\0\0\0\0\0\x1c\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\
+\0\0\0\x01\0\0\0\x10\0\0\0\0\0\0\0\0\0\0\0\x09\0\0\0\x01\0\0\0\0\0\0\0\x07\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\x62\x70\x66\x5f\x69\x74\
+\x65\x72\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\0\0\0\0\0\0\0\x62\x70\x66\x5f\x6d\
+\x61\x70\x5f\x73\x75\x6d\x5f\x65\x6c\x65\x6d\x5f\x63\x6f\x75\x6e\x74\0\0\x47\
+\x50\x4c\0\0\0\0\0\x79\x12\0\0\0\0\0\0\x79\x26\0\0\0\0\0\0\x79\x11\x08\0\0\0\0\
+\0\x15\x01\x3b\0\0\0\0\0\x79\x17\0\0\0\0\0\0\x79\x21\x10\0\0\0\0\0\x55\x01\x08\
+\0\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xd0\xff\xff\xff\xbf\x61\0\0\0\0\0\0\
+\x18\x62\0\0\0\0\0\0\0\0\0\0\x4a\0\0\0\xb7\x03\0\0\x20\0\0\0\xb7\x05\0\0\0\0\0\
+\0\x85\0\0\0\x7e\0\0\0\x7b\x6a\xc8\xff\0\0\0\0\x61\x71\0\0\0\0\0\0\x7b\x1a\xd0\
+\xff\0\0\0\0\xb7\x03\0\0\x04\0\0\0\xbf\x79\0\0\0\0\0\0\x0f\x39\0\0\0\0\0\0\x79\
+\x71\x28\0\0\0\0\0\x79\x78\x30\0\0\0\0\0\x15\x08\x18\0\0\0\0\0\xb7\x02\0\0\0\0\
+\0\0\x0f\x21\0\0\0\0\0\0\x61\x11\x04\0\0\0\0\0\x79\x83\x08\0\0\0\0\0\x67\x01\0\
+\0\x03\0\0\0\x0f\x13\0\0\0\0\0\0\x79\x86\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\
+\x01\0\0\xf8\xff\xff\xff\xb7\x02\0\0\x08\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x01\0\0\
+\0\0\0\0\x79\xa3\xf8\xff\0\0\0\0\x0f\x13\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\
+\x01\0\0\xf4\xff\xff\xff\xb7\x02\0\0\x04\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x03\0\0\
+\x04\0\0\0\x61\xa1\xf4\xff\0\0\0\0\x61\x82\x10\0\0\0\0\0\x3d\x21\x02\0\0\0\0\0\
+\x0f\x16\0\0\0\0\0\0\xbf\x69\0\0\0\0\0\0\x7b\x9a\xd8\xff\0\0\0\0\x79\x71\x18\0\
+\0\0\0\0\x7b\x1a\xe0\xff\0\0\0\0\x79\x71\x20\0\0\0\0\0\x79\x11\0\0\0\0\0\0\x0f\
+\x31\0\0\0\0\0\0\x7b\x1a\xe8\xff\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xd0\
+\xff\xff\xff\x79\xa1\xc8\xff\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x6a\0\0\0\xb7\
+\x03\0\0\x11\0\0\0\xb7\x05\0\0\x20\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\
+\x95\0\0\0\0\0\0\0\0\0\0\0\x1b\0\0\0\0\0\0\0\x42\0\0\0\x88\0\0\0\x1e\x94\x01\0\
+\x01\0\0\0\x42\0\0\0\x88\0\0\0\x24\x94\x01\0\x02\0\0\0\x42\0\0\0\x86\x02\0\0\
+\x1f\x9c\x01\0\x03\0\0\0\x42\0\0\0\xaa\x02\0\0\x06\xa8\x01\0\x04\0\0\0\x42\0\0\
+\0\xc3\x02\0\0\x0e\xb4\x01\0\x05\0\0\0\x42\0\0\0\x2b\x01\0\0\x1d\x98\x01\0\x06\
+\0\0\0\x42\0\0\0\x50\x01\0\0\x06\xb8\x01\0\x08\0\0\0\x42\0\0\0\xd5\x02\0\0\x03\
+\xbc\x01\0\x10\0\0\0\x42\0\0\0\x45\x03\0\0\x02\xc4\x01\0\x17\0\0\0\x42\0\0\0\
+\x80\x03\0\0\x06\x04\x01\0\x1a\0\0\0\x42\0\0\0\x45\x03\0\0\x02\xc4\x01\0\x1b\0\
+\0\0\x42\0\0\0\xd1\x03\0\0\x0f\x10\x01\0\x1c\0\0\0\x42\0\0\0\xe6\x03\0\0\x2d\
+\x14\x01\0\x1e\0\0\0\x42\0\0\0\x1d\x04\0\0\x0d\x0c\x01\0\x20\0\0\0\x42\0\0\0\
+\x45\x03\0\0\x02\xc4\x01\0\x21\0\0\0\x42\0\0\0\xe6\x03\0\0\x02\x14\x01\0\x24\0\
+\0\0\x42\0\0\0\x44\x04\0\0\x0d\x18\x01\0\x27\0\0\0\x42\0\0\0\x45\x03\0\0\x02\
+\xc4\x01\0\x28\0\0\0\x42\0\0\0\x44\x04\0\0\x0d\x18\x01\0\x2b\0\0\0\x42\0\0\0\
+\x44\x04\0\0\x0d\x18\x01\0\x2c\0\0\0\x42\0\0\0\x72\x04\0\0\x1b\x1c\x01\0\x2d\0\
+\0\0\x42\0\0\0\x72\x04\0\0\x06\x1c\x01\0\x2e\0\0\0\x42\0\0\0\x95\x04\0\0\x0d\
+\x24\x01\0\x30\0\0\0\x42\0\0\0\x45\x03\0\0\x02\xc4\x01\0\x3f\0\0\0\x42\0\0\0\
+\x29\x02\0\0\x01\xd4\x01\0\0\0\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\
+\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\x18\0\0\0\xf7\0\0\0\0\0\0\0\x20\0\0\0\x1c\0\0\
+\0\x3e\0\0\0\0\0\0\0\x28\0\0\0\x08\0\0\0\x27\x01\0\0\0\0\0\0\x80\0\0\0\x1e\0\0\
+\0\x3e\0\0\0\0\0\0\0\x90\0\0\0\x1e\0\0\0\xf7\0\0\0\0\0\0\0\xa8\0\0\0\x1e\0\0\0\
+\x78\x03\0\0\0\0\0\0\xb0\0\0\0\x1e\0\0\0\x7c\x03\0\0\0\0\0\0\xc0\0\0\0\x23\0\0\
+\0\xaa\x03\0\0\0\0\0\0\xd8\0\0\0\x24\0\0\0\xf7\0\0\0\0\0\0\0\xf0\0\0\0\x24\0\0\
+\0\x3e\0\0\0\0\0\0\0\x18\x01\0\0\x28\0\0\0\x3e\0\0\0\0\0\0\0\x50\x01\0\0\x1e\0\
+\0\0\xf7\0\0\0\0\0\0\0\x60\x01\0\0\x24\0\0\0\x6c\x04\0\0\0\0\0\0\x88\x01\0\0\
+\x1e\0\0\0\x27\x01\0\0\0\0\0\0\x98\x01\0\0\x1e\0\0\0\xad\x04\0\0\0\0\0\0\xa0\
+\x01\0\0\x1c\0\0\0\x3e\0\0\0\0\0\0\0\x1a\0\0\0\x41\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x64\x75\x6d\x70\x5f\
+\x62\x70\x66\x5f\x70\x72\x6f\x67\0\0\0\0\0\0\0\x1c\0\0\0\0\0\0\0\x08\0\0\0\0\0\
+\0\0\0\0\0\0\x01\0\0\0\x10\0\0\0\0\0\0\0\0\0\0\0\x19\0\0\0\x01\0\0\0\0\0\0\0\
+\x12\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\x62\x70\x66\x5f\
+\x69\x74\x65\x72\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\0\0\0\0\0\0";
+ opts.insns_sz = 2456;
+ opts.insns = (void *)"\
+\xbf\x16\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\x78\xff\xff\xff\xb7\x02\0\
+\0\x88\0\0\0\xb7\x03\0\0\0\0\0\0\x85\0\0\0\x71\0\0\0\x05\0\x14\0\0\0\0\0\x61\
+\xa1\x78\xff\0\0\0\0\xd5\x01\x01\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa1\x7c\xff\
+\0\0\0\0\xd5\x01\x01\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa1\x80\xff\0\0\0\0\xd5\
+\x01\x01\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa1\x84\xff\0\0\0\0\xd5\x01\x01\0\0\
+\0\0\0\x85\0\0\0\xa8\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x61\x01\0\0\0\0\
+\0\0\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xbf\x70\0\0\
+\0\0\0\0\x95\0\0\0\0\0\0\0\x61\x60\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\
+\xe8\x0e\0\0\x63\x01\0\0\0\0\0\0\x61\x60\x0c\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\
+\0\0\xe4\x0e\0\0\x63\x01\0\0\0\0\0\0\x79\x60\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\
+\0\0\0\0\xd8\x0e\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x05\0\0\
+\x18\x61\0\0\0\0\0\0\0\0\0\0\xd0\x0e\0\0\x7b\x01\0\0\0\0\0\0\xb7\x01\0\0\x12\0\
+\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\xd0\x0e\0\0\xb7\x03\0\0\x1c\0\0\0\x85\0\0\0\
+\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\xd4\xff\0\0\0\0\x63\x7a\x78\xff\0\0\0\0\
+\x61\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x20\x0f\0\0\x63\x01\0\0\0\
+\0\0\0\x61\x60\x1c\0\0\0\0\0\x15\0\x03\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\
+\xfc\x0e\0\0\x63\x01\0\0\0\0\0\0\xb7\x01\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\
+\0\xf0\x0e\0\0\xb7\x03\0\0\x48\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\
+\xc5\x07\xc3\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x63\x71\0\0\0\0\0\
+\0\x79\x63\x20\0\0\0\0\0\x15\x03\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x38\
+\x0f\0\0\xb7\x02\0\0\x7b\0\0\0\x61\x60\x04\0\0\0\0\0\x45\0\x02\0\x01\0\0\0\x85\
+\0\0\0\x94\0\0\0\x05\0\x01\0\0\0\0\0\x85\0\0\0\x71\0\0\0\x18\x62\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\x61\x20\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xc0\x0f\0\0\x63\
+\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xb8\x0f\0\0\x18\x61\0\0\0\0\0\0\0\
+\0\0\0\xc8\x0f\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x38\x0f\0\0\
+\x18\x61\0\0\0\0\0\0\0\0\0\0\xd0\x0f\0\0\x7b\x01\0\0\0\0\0\0\xb7\x01\0\0\x02\0\
+\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\xc0\x0f\0\0\xb7\x03\0\0\x20\0\0\0\x85\0\0\0\
+\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\x9f\xff\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\x61\x20\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xe0\x0f\0\0\x63\
+\x01\0\0\0\0\0\0\xb7\x01\0\0\x16\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\xe0\x0f\0\0\
+\xb7\x03\0\0\x04\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\x92\xff\
+\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xe8\x0f\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\
+\x20\x12\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xf0\x0f\0\0\x18\
+\x61\0\0\0\0\0\0\0\0\0\0\x18\x12\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\
+\0\0\0\x08\x11\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x60\x12\0\0\x7b\x01\0\0\0\0\0\0\
+\x18\x60\0\0\0\0\0\0\0\0\0\0\x10\x11\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x70\x12\0\
+\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xa0\x11\0\0\x18\x61\0\0\0\0\
+\0\0\0\0\0\0\x90\x12\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x88\x12\0\0\x7b\x01\0\0\0\0\0\0\x61\x60\x08\0\0\
+\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x28\x12\0\0\x63\x01\0\0\0\0\0\0\x61\x60\x0c\
+\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x2c\x12\0\0\x63\x01\0\0\0\0\0\0\x79\x60\
+\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x30\x12\0\0\x7b\x01\0\0\0\0\0\0\x61\
+\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x58\x12\0\0\x63\x01\0\0\0\0\0\
+\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xa0\x12\0\0\xb7\x02\0\0\x11\0\0\0\xb7\x03\0\0\
+\x0c\0\0\0\xb7\x04\0\0\0\0\0\0\x85\0\0\0\xa7\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\
+\x5c\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x10\x12\0\0\x63\x70\x6c\0\0\0\0\0\
+\x77\x07\0\0\x20\0\0\0\x63\x70\x70\0\0\0\0\0\x18\x68\0\0\0\0\0\0\0\0\0\0\xa8\
+\x10\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb8\x12\0\0\xb7\x02\0\0\x17\0\0\0\xb7\x03\
+\0\0\x0c\0\0\0\xb7\x04\0\0\0\0\0\0\x85\0\0\0\xa7\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\
+\x07\x4d\xff\0\0\0\0\x75\x07\x03\0\0\0\0\0\x62\x08\x04\0\0\0\0\0\x6a\x08\x02\0\
+\0\0\0\0\x05\0\x0a\0\0\0\0\0\x63\x78\x04\0\0\0\0\0\xbf\x79\0\0\0\0\0\0\x77\x09\
+\0\0\x20\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\x63\x90\0\0\0\0\0\0\x55\
+\x09\x02\0\0\0\0\0\x6a\x08\x02\0\0\0\0\0\x05\0\x01\0\0\0\0\0\x6a\x08\x02\0\x40\
+\0\0\0\xb7\x01\0\0\x05\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x10\x12\0\0\xb7\x03\0\
+\0\x8c\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\
+\0\0\x01\0\0\x61\x01\0\0\0\0\0\0\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\
+\0\0\0\xa8\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x80\x12\0\0\x61\x01\0\0\0\0\0\0\
+\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xc5\x07\x2c\xff\
+\0\0\0\0\x63\x7a\x80\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xd0\x12\0\0\x18\
+\x61\0\0\0\0\0\0\0\0\0\0\xa8\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\
+\0\0\0\xd8\x12\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xa0\x17\0\0\x7b\x01\0\0\0\0\0\0\
+\x18\x60\0\0\0\0\0\0\0\0\0\0\xe0\x14\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xe8\x17\0\
+\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xe8\x14\0\0\x18\x61\0\0\0\0\
+\0\0\0\0\0\0\xf8\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x78\
+\x16\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x18\x18\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x10\x18\0\0\x7b\x01\0\0\
+\0\0\0\0\x61\x60\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb0\x17\0\0\x63\x01\
+\0\0\0\0\0\0\x61\x60\x0c\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb4\x17\0\0\x63\
+\x01\0\0\0\0\0\0\x79\x60\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb8\x17\0\0\
+\x7b\x01\0\0\0\0\0\0\x61\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xe0\
+\x17\0\0\x63\x01\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x28\x18\0\0\xb7\x02\0\
+\0\x12\0\0\0\xb7\x03\0\0\x0c\0\0\0\xb7\x04\0\0\0\0\0\0\x85\0\0\0\xa7\0\0\0\xbf\
+\x07\0\0\0\0\0\0\xc5\x07\xf5\xfe\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x98\x17\0\
+\0\x63\x70\x6c\0\0\0\0\0\x77\x07\0\0\x20\0\0\0\x63\x70\x70\0\0\0\0\0\xb7\x01\0\
+\0\x05\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x98\x17\0\0\xb7\x03\0\0\x8c\0\0\0\x85\
+\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x08\x18\0\0\
+\x61\x01\0\0\0\0\0\0\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\
+\0\0\xc5\x07\xe3\xfe\0\0\0\0\x63\x7a\x84\xff\0\0\0\0\x61\xa1\x78\xff\0\0\0\0\
+\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa0\x80\xff\
+\0\0\0\0\x63\x06\x28\0\0\0\0\0\x61\xa0\x84\xff\0\0\0\0\x63\x06\x2c\0\0\0\0\0\
+\x18\x61\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x61\x10\0\0\0\0\0\0\x63\x06\x18\0\0\0\0\0\
+\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0";
+ err = bpf_load_and_run(&opts);
+ if (err < 0)
+ return err;
+ return 0;
+}
+
+static inline struct iterators_bpf *
+iterators_bpf__open_and_load(void)
+{
+ struct iterators_bpf *skel;
+
+ skel = iterators_bpf__open();
+ if (!skel)
+ return NULL;
+ if (iterators_bpf__load(skel)) {
+ iterators_bpf__destroy(skel);
+ return NULL;
+ }
+ return skel;
+}
+
+__attribute__((unused)) static void
+iterators_bpf__assert(struct iterators_bpf *s __attribute__((unused)))
+{
+#ifdef __cplusplus
+#define _Static_assert static_assert
+#endif
+#ifdef __cplusplus
+#undef _Static_assert
+#endif
+}
+
+#endif /* __ITERATORS_BPF_SKEL_H__ */
diff --git a/kernel/bpf/prog_iter.c b/kernel/bpf/prog_iter.c
new file mode 100644
index 000000000000..85d8fcb56fb7
--- /dev/null
+++ b/kernel/bpf/prog_iter.c
@@ -0,0 +1,106 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2020 Facebook */
+#include <linux/bpf.h>
+#include <linux/fs.h>
+#include <linux/filter.h>
+#include <linux/kernel.h>
+#include <linux/btf_ids.h>
+
+struct bpf_iter_seq_prog_info {
+ u32 prog_id;
+};
+
+static void *bpf_prog_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct bpf_iter_seq_prog_info *info = seq->private;
+ struct bpf_prog *prog;
+
+ prog = bpf_prog_get_curr_or_next(&info->prog_id);
+ if (!prog)
+ return NULL;
+
+ if (*pos == 0)
+ ++*pos;
+ return prog;
+}
+
+static void *bpf_prog_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct bpf_iter_seq_prog_info *info = seq->private;
+
+ ++*pos;
+ ++info->prog_id;
+ bpf_prog_put((struct bpf_prog *)v);
+ return bpf_prog_get_curr_or_next(&info->prog_id);
+}
+
+struct bpf_iter__bpf_prog {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct bpf_prog *, prog);
+};
+
+DEFINE_BPF_ITER_FUNC(bpf_prog, struct bpf_iter_meta *meta, struct bpf_prog *prog)
+
+static int __bpf_prog_seq_show(struct seq_file *seq, void *v, bool in_stop)
+{
+ struct bpf_iter__bpf_prog ctx;
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+ int ret = 0;
+
+ ctx.meta = &meta;
+ ctx.prog = v;
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, in_stop);
+ if (prog)
+ ret = bpf_iter_run_prog(prog, &ctx);
+
+ return ret;
+}
+
+static int bpf_prog_seq_show(struct seq_file *seq, void *v)
+{
+ return __bpf_prog_seq_show(seq, v, false);
+}
+
+static void bpf_prog_seq_stop(struct seq_file *seq, void *v)
+{
+ if (!v)
+ (void)__bpf_prog_seq_show(seq, v, true);
+ else
+ bpf_prog_put((struct bpf_prog *)v);
+}
+
+static const struct seq_operations bpf_prog_seq_ops = {
+ .start = bpf_prog_seq_start,
+ .next = bpf_prog_seq_next,
+ .stop = bpf_prog_seq_stop,
+ .show = bpf_prog_seq_show,
+};
+
+BTF_ID_LIST_SINGLE(btf_bpf_prog_id, struct, bpf_prog)
+
+static const struct bpf_iter_seq_info bpf_prog_seq_info = {
+ .seq_ops = &bpf_prog_seq_ops,
+ .init_seq_private = NULL,
+ .fini_seq_private = NULL,
+ .seq_priv_size = sizeof(struct bpf_iter_seq_prog_info),
+};
+
+static struct bpf_iter_reg bpf_prog_reg_info = {
+ .target = "bpf_prog",
+ .ctx_arg_info_size = 1,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__bpf_prog, prog),
+ PTR_TO_BTF_ID_OR_NULL },
+ },
+ .seq_info = &bpf_prog_seq_info,
+};
+
+static int __init bpf_prog_iter_init(void)
+{
+ bpf_prog_reg_info.ctx_arg_info[0].btf_id = *btf_bpf_prog_id;
+ return bpf_iter_reg_target(&bpf_prog_reg_info);
+}
+
+late_initcall(bpf_prog_iter_init);
diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
new file mode 100644
index 000000000000..9a5f94371e50
--- /dev/null
+++ b/kernel/bpf/queue_stack_maps.c
@@ -0,0 +1,288 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * queue_stack_maps.c: BPF queue and stack maps
+ *
+ * Copyright (c) 2018 Politecnico di Torino
+ */
+#include <linux/bpf.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/btf_ids.h>
+#include "percpu_freelist.h"
+#include <asm/rqspinlock.h>
+
+#define QUEUE_STACK_CREATE_FLAG_MASK \
+ (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK)
+
+struct bpf_queue_stack {
+ struct bpf_map map;
+ rqspinlock_t lock;
+ u32 head, tail;
+ u32 size; /* max_entries + 1 */
+
+ char elements[] __aligned(8);
+};
+
+static struct bpf_queue_stack *bpf_queue_stack(struct bpf_map *map)
+{
+ return container_of(map, struct bpf_queue_stack, map);
+}
+
+static bool queue_stack_map_is_empty(struct bpf_queue_stack *qs)
+{
+ return qs->head == qs->tail;
+}
+
+static bool queue_stack_map_is_full(struct bpf_queue_stack *qs)
+{
+ u32 head = qs->head + 1;
+
+ if (unlikely(head >= qs->size))
+ head = 0;
+
+ return head == qs->tail;
+}
+
+/* Called from syscall */
+static int queue_stack_map_alloc_check(union bpf_attr *attr)
+{
+ /* check sanity of attributes */
+ if (attr->max_entries == 0 || attr->key_size != 0 ||
+ attr->value_size == 0 ||
+ attr->map_flags & ~QUEUE_STACK_CREATE_FLAG_MASK ||
+ !bpf_map_flags_access_ok(attr->map_flags))
+ return -EINVAL;
+
+ if (attr->value_size > KMALLOC_MAX_SIZE)
+ /* if value_size is bigger, the user space won't be able to
+ * access the elements.
+ */
+ return -E2BIG;
+
+ return 0;
+}
+
+static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr)
+{
+ int numa_node = bpf_map_attr_numa_node(attr);
+ struct bpf_queue_stack *qs;
+ u64 size, queue_size;
+
+ size = (u64) attr->max_entries + 1;
+ queue_size = sizeof(*qs) + size * attr->value_size;
+
+ qs = bpf_map_area_alloc(queue_size, numa_node);
+ if (!qs)
+ return ERR_PTR(-ENOMEM);
+
+ bpf_map_init_from_attr(&qs->map, attr);
+
+ qs->size = size;
+
+ raw_res_spin_lock_init(&qs->lock);
+
+ return &qs->map;
+}
+
+/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
+static void queue_stack_map_free(struct bpf_map *map)
+{
+ struct bpf_queue_stack *qs = bpf_queue_stack(map);
+
+ bpf_map_area_free(qs);
+}
+
+static long __queue_map_get(struct bpf_map *map, void *value, bool delete)
+{
+ struct bpf_queue_stack *qs = bpf_queue_stack(map);
+ unsigned long flags;
+ int err = 0;
+ void *ptr;
+
+ if (raw_res_spin_lock_irqsave(&qs->lock, flags))
+ return -EBUSY;
+
+ if (queue_stack_map_is_empty(qs)) {
+ memset(value, 0, qs->map.value_size);
+ err = -ENOENT;
+ goto out;
+ }
+
+ ptr = &qs->elements[qs->tail * qs->map.value_size];
+ memcpy(value, ptr, qs->map.value_size);
+
+ if (delete) {
+ if (unlikely(++qs->tail >= qs->size))
+ qs->tail = 0;
+ }
+
+out:
+ raw_res_spin_unlock_irqrestore(&qs->lock, flags);
+ return err;
+}
+
+
+static long __stack_map_get(struct bpf_map *map, void *value, bool delete)
+{
+ struct bpf_queue_stack *qs = bpf_queue_stack(map);
+ unsigned long flags;
+ int err = 0;
+ void *ptr;
+ u32 index;
+
+ if (raw_res_spin_lock_irqsave(&qs->lock, flags))
+ return -EBUSY;
+
+ if (queue_stack_map_is_empty(qs)) {
+ memset(value, 0, qs->map.value_size);
+ err = -ENOENT;
+ goto out;
+ }
+
+ index = qs->head - 1;
+ if (unlikely(index >= qs->size))
+ index = qs->size - 1;
+
+ ptr = &qs->elements[index * qs->map.value_size];
+ memcpy(value, ptr, qs->map.value_size);
+
+ if (delete)
+ qs->head = index;
+
+out:
+ raw_res_spin_unlock_irqrestore(&qs->lock, flags);
+ return err;
+}
+
+/* Called from syscall or from eBPF program */
+static long queue_map_peek_elem(struct bpf_map *map, void *value)
+{
+ return __queue_map_get(map, value, false);
+}
+
+/* Called from syscall or from eBPF program */
+static long stack_map_peek_elem(struct bpf_map *map, void *value)
+{
+ return __stack_map_get(map, value, false);
+}
+
+/* Called from syscall or from eBPF program */
+static long queue_map_pop_elem(struct bpf_map *map, void *value)
+{
+ return __queue_map_get(map, value, true);
+}
+
+/* Called from syscall or from eBPF program */
+static long stack_map_pop_elem(struct bpf_map *map, void *value)
+{
+ return __stack_map_get(map, value, true);
+}
+
+/* Called from syscall or from eBPF program */
+static long queue_stack_map_push_elem(struct bpf_map *map, void *value,
+ u64 flags)
+{
+ struct bpf_queue_stack *qs = bpf_queue_stack(map);
+ unsigned long irq_flags;
+ int err = 0;
+ void *dst;
+
+ /* BPF_EXIST is used to force making room for a new element in case the
+ * map is full
+ */
+ bool replace = (flags & BPF_EXIST);
+
+ /* Check supported flags for queue and stack maps */
+ if (flags & BPF_NOEXIST || flags > BPF_EXIST)
+ return -EINVAL;
+
+ if (raw_res_spin_lock_irqsave(&qs->lock, irq_flags))
+ return -EBUSY;
+
+ if (queue_stack_map_is_full(qs)) {
+ if (!replace) {
+ err = -E2BIG;
+ goto out;
+ }
+ /* advance tail pointer to overwrite oldest element */
+ if (unlikely(++qs->tail >= qs->size))
+ qs->tail = 0;
+ }
+
+ dst = &qs->elements[qs->head * qs->map.value_size];
+ memcpy(dst, value, qs->map.value_size);
+
+ if (unlikely(++qs->head >= qs->size))
+ qs->head = 0;
+
+out:
+ raw_res_spin_unlock_irqrestore(&qs->lock, irq_flags);
+ return err;
+}
+
+/* Called from syscall or from eBPF program */
+static void *queue_stack_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ return NULL;
+}
+
+/* Called from syscall or from eBPF program */
+static long queue_stack_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
+{
+ return -EINVAL;
+}
+
+/* Called from syscall or from eBPF program */
+static long queue_stack_map_delete_elem(struct bpf_map *map, void *key)
+{
+ return -EINVAL;
+}
+
+/* Called from syscall */
+static int queue_stack_map_get_next_key(struct bpf_map *map, void *key,
+ void *next_key)
+{
+ return -EINVAL;
+}
+
+static u64 queue_stack_map_mem_usage(const struct bpf_map *map)
+{
+ u64 usage = sizeof(struct bpf_queue_stack);
+
+ usage += ((u64)map->max_entries + 1) * map->value_size;
+ return usage;
+}
+
+BTF_ID_LIST_SINGLE(queue_map_btf_ids, struct, bpf_queue_stack)
+const struct bpf_map_ops queue_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
+ .map_alloc_check = queue_stack_map_alloc_check,
+ .map_alloc = queue_stack_map_alloc,
+ .map_free = queue_stack_map_free,
+ .map_lookup_elem = queue_stack_map_lookup_elem,
+ .map_update_elem = queue_stack_map_update_elem,
+ .map_delete_elem = queue_stack_map_delete_elem,
+ .map_push_elem = queue_stack_map_push_elem,
+ .map_pop_elem = queue_map_pop_elem,
+ .map_peek_elem = queue_map_peek_elem,
+ .map_get_next_key = queue_stack_map_get_next_key,
+ .map_mem_usage = queue_stack_map_mem_usage,
+ .map_btf_id = &queue_map_btf_ids[0],
+};
+
+const struct bpf_map_ops stack_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
+ .map_alloc_check = queue_stack_map_alloc_check,
+ .map_alloc = queue_stack_map_alloc,
+ .map_free = queue_stack_map_free,
+ .map_lookup_elem = queue_stack_map_lookup_elem,
+ .map_update_elem = queue_stack_map_update_elem,
+ .map_delete_elem = queue_stack_map_delete_elem,
+ .map_push_elem = queue_stack_map_push_elem,
+ .map_pop_elem = stack_map_pop_elem,
+ .map_peek_elem = stack_map_peek_elem,
+ .map_get_next_key = queue_stack_map_get_next_key,
+ .map_mem_usage = queue_stack_map_mem_usage,
+ .map_btf_id = &queue_map_btf_ids[0],
+};
diff --git a/kernel/bpf/range_tree.c b/kernel/bpf/range_tree.c
new file mode 100644
index 000000000000..99c63d982c5d
--- /dev/null
+++ b/kernel/bpf/range_tree.c
@@ -0,0 +1,261 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+#include <linux/interval_tree_generic.h>
+#include <linux/slab.h>
+#include <linux/bpf.h>
+#include "range_tree.h"
+
+/*
+ * struct range_tree is a data structure used to allocate contiguous memory
+ * ranges in bpf arena. It's a large bitmap. The contiguous sequence of bits is
+ * represented by struct range_node or 'rn' for short.
+ * rn->rn_rbnode links it into an interval tree while
+ * rn->rb_range_size links it into a second rbtree sorted by size of the range.
+ * __find_range() performs binary search and best fit algorithm to find the
+ * range less or equal requested size.
+ * range_tree_clear/set() clears or sets a range of bits in this bitmap. The
+ * adjacent ranges are merged or split at the same time.
+ *
+ * The split/merge logic is based/borrowed from XFS's xbitmap32 added
+ * in commit 6772fcc8890a ("xfs: convert xbitmap to interval tree").
+ *
+ * The implementation relies on external lock to protect rbtree-s.
+ * The alloc/free of range_node-s is done via kmalloc_nolock().
+ *
+ * bpf arena is using range_tree to represent unallocated slots.
+ * At init time:
+ * range_tree_set(rt, 0, max);
+ * Then:
+ * start = range_tree_find(rt, len);
+ * if (start >= 0)
+ * range_tree_clear(rt, start, len);
+ * to find free range and mark slots as allocated and later:
+ * range_tree_set(rt, start, len);
+ * to mark as unallocated after use.
+ */
+struct range_node {
+ struct rb_node rn_rbnode;
+ struct rb_node rb_range_size;
+ u32 rn_start;
+ u32 rn_last; /* inclusive */
+ u32 __rn_subtree_last;
+};
+
+static struct range_node *rb_to_range_node(struct rb_node *rb)
+{
+ return rb_entry(rb, struct range_node, rb_range_size);
+}
+
+static u32 rn_size(struct range_node *rn)
+{
+ return rn->rn_last - rn->rn_start + 1;
+}
+
+/* Find range that fits best to requested size */
+static inline struct range_node *__find_range(struct range_tree *rt, u32 len)
+{
+ struct rb_node *rb = rt->range_size_root.rb_root.rb_node;
+ struct range_node *best = NULL;
+
+ while (rb) {
+ struct range_node *rn = rb_to_range_node(rb);
+
+ if (len <= rn_size(rn)) {
+ best = rn;
+ rb = rb->rb_right;
+ } else {
+ rb = rb->rb_left;
+ }
+ }
+
+ return best;
+}
+
+s64 range_tree_find(struct range_tree *rt, u32 len)
+{
+ struct range_node *rn;
+
+ rn = __find_range(rt, len);
+ if (!rn)
+ return -ENOENT;
+ return rn->rn_start;
+}
+
+/* Insert the range into rbtree sorted by the range size */
+static inline void __range_size_insert(struct range_node *rn,
+ struct rb_root_cached *root)
+{
+ struct rb_node **link = &root->rb_root.rb_node, *rb = NULL;
+ u64 size = rn_size(rn);
+ bool leftmost = true;
+
+ while (*link) {
+ rb = *link;
+ if (size > rn_size(rb_to_range_node(rb))) {
+ link = &rb->rb_left;
+ } else {
+ link = &rb->rb_right;
+ leftmost = false;
+ }
+ }
+
+ rb_link_node(&rn->rb_range_size, rb, link);
+ rb_insert_color_cached(&rn->rb_range_size, root, leftmost);
+}
+
+#define START(node) ((node)->rn_start)
+#define LAST(node) ((node)->rn_last)
+
+INTERVAL_TREE_DEFINE(struct range_node, rn_rbnode, u32,
+ __rn_subtree_last, START, LAST,
+ static inline __maybe_unused,
+ __range_it)
+
+static inline __maybe_unused void
+range_it_insert(struct range_node *rn, struct range_tree *rt)
+{
+ __range_size_insert(rn, &rt->range_size_root);
+ __range_it_insert(rn, &rt->it_root);
+}
+
+static inline __maybe_unused void
+range_it_remove(struct range_node *rn, struct range_tree *rt)
+{
+ rb_erase_cached(&rn->rb_range_size, &rt->range_size_root);
+ RB_CLEAR_NODE(&rn->rb_range_size);
+ __range_it_remove(rn, &rt->it_root);
+}
+
+static inline __maybe_unused struct range_node *
+range_it_iter_first(struct range_tree *rt, u32 start, u32 last)
+{
+ return __range_it_iter_first(&rt->it_root, start, last);
+}
+
+/* Clear the range in this range tree */
+int range_tree_clear(struct range_tree *rt, u32 start, u32 len)
+{
+ u32 last = start + len - 1;
+ struct range_node *new_rn;
+ struct range_node *rn;
+
+ while ((rn = range_it_iter_first(rt, start, last))) {
+ if (rn->rn_start < start && rn->rn_last > last) {
+ u32 old_last = rn->rn_last;
+
+ /* Overlaps with the entire clearing range */
+ range_it_remove(rn, rt);
+ rn->rn_last = start - 1;
+ range_it_insert(rn, rt);
+
+ /* Add a range */
+ new_rn = kmalloc_nolock(sizeof(struct range_node), 0, NUMA_NO_NODE);
+ if (!new_rn)
+ return -ENOMEM;
+ new_rn->rn_start = last + 1;
+ new_rn->rn_last = old_last;
+ range_it_insert(new_rn, rt);
+ } else if (rn->rn_start < start) {
+ /* Overlaps with the left side of the clearing range */
+ range_it_remove(rn, rt);
+ rn->rn_last = start - 1;
+ range_it_insert(rn, rt);
+ } else if (rn->rn_last > last) {
+ /* Overlaps with the right side of the clearing range */
+ range_it_remove(rn, rt);
+ rn->rn_start = last + 1;
+ range_it_insert(rn, rt);
+ break;
+ } else {
+ /* in the middle of the clearing range */
+ range_it_remove(rn, rt);
+ kfree_nolock(rn);
+ }
+ }
+ return 0;
+}
+
+/* Is the whole range set ? */
+int is_range_tree_set(struct range_tree *rt, u32 start, u32 len)
+{
+ u32 last = start + len - 1;
+ struct range_node *left;
+
+ /* Is this whole range set ? */
+ left = range_it_iter_first(rt, start, last);
+ if (left && left->rn_start <= start && left->rn_last >= last)
+ return 0;
+ return -ESRCH;
+}
+
+/* Set the range in this range tree */
+int range_tree_set(struct range_tree *rt, u32 start, u32 len)
+{
+ u32 last = start + len - 1;
+ struct range_node *right;
+ struct range_node *left;
+ int err;
+
+ /* Is this whole range already set ? */
+ left = range_it_iter_first(rt, start, last);
+ if (left && left->rn_start <= start && left->rn_last >= last)
+ return 0;
+
+ /* Clear out everything in the range we want to set. */
+ err = range_tree_clear(rt, start, len);
+ if (err)
+ return err;
+
+ /* Do we have a left-adjacent range ? */
+ left = range_it_iter_first(rt, start - 1, start - 1);
+ if (left && left->rn_last + 1 != start)
+ return -EFAULT;
+
+ /* Do we have a right-adjacent range ? */
+ right = range_it_iter_first(rt, last + 1, last + 1);
+ if (right && right->rn_start != last + 1)
+ return -EFAULT;
+
+ if (left && right) {
+ /* Combine left and right adjacent ranges */
+ range_it_remove(left, rt);
+ range_it_remove(right, rt);
+ left->rn_last = right->rn_last;
+ range_it_insert(left, rt);
+ kfree_nolock(right);
+ } else if (left) {
+ /* Combine with the left range */
+ range_it_remove(left, rt);
+ left->rn_last = last;
+ range_it_insert(left, rt);
+ } else if (right) {
+ /* Combine with the right range */
+ range_it_remove(right, rt);
+ right->rn_start = start;
+ range_it_insert(right, rt);
+ } else {
+ left = kmalloc_nolock(sizeof(struct range_node), 0, NUMA_NO_NODE);
+ if (!left)
+ return -ENOMEM;
+ left->rn_start = start;
+ left->rn_last = last;
+ range_it_insert(left, rt);
+ }
+ return 0;
+}
+
+void range_tree_destroy(struct range_tree *rt)
+{
+ struct range_node *rn;
+
+ while ((rn = range_it_iter_first(rt, 0, -1U))) {
+ range_it_remove(rn, rt);
+ kfree_nolock(rn);
+ }
+}
+
+void range_tree_init(struct range_tree *rt)
+{
+ rt->it_root = RB_ROOT_CACHED;
+ rt->range_size_root = RB_ROOT_CACHED;
+}
diff --git a/kernel/bpf/range_tree.h b/kernel/bpf/range_tree.h
new file mode 100644
index 000000000000..ff0b9110eb71
--- /dev/null
+++ b/kernel/bpf/range_tree.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+#ifndef _RANGE_TREE_H
+#define _RANGE_TREE_H 1
+
+struct range_tree {
+ /* root of interval tree */
+ struct rb_root_cached it_root;
+ /* root of rbtree of interval sizes */
+ struct rb_root_cached range_size_root;
+};
+
+void range_tree_init(struct range_tree *rt);
+void range_tree_destroy(struct range_tree *rt);
+
+int range_tree_clear(struct range_tree *rt, u32 start, u32 len);
+int range_tree_set(struct range_tree *rt, u32 start, u32 len);
+int is_range_tree_set(struct range_tree *rt, u32 start, u32 len);
+s64 range_tree_find(struct range_tree *rt, u32 len);
+
+#endif
diff --git a/kernel/bpf/relo_core.c b/kernel/bpf/relo_core.c
new file mode 100644
index 000000000000..aa822c9fcfde
--- /dev/null
+++ b/kernel/bpf/relo_core.c
@@ -0,0 +1,2 @@
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+#include "../../tools/lib/bpf/relo_core.c"
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
new file mode 100644
index 000000000000..49b8e5a0c6b4
--- /dev/null
+++ b/kernel/bpf/reuseport_array.c
@@ -0,0 +1,353 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2018 Facebook
+ */
+#include <linux/bpf.h>
+#include <linux/err.h>
+#include <linux/sock_diag.h>
+#include <net/sock_reuseport.h>
+#include <linux/btf_ids.h>
+
+struct reuseport_array {
+ struct bpf_map map;
+ struct sock __rcu *ptrs[];
+};
+
+static struct reuseport_array *reuseport_array(struct bpf_map *map)
+{
+ return (struct reuseport_array *)map;
+}
+
+/* The caller must hold the reuseport_lock */
+void bpf_sk_reuseport_detach(struct sock *sk)
+{
+ struct sock __rcu **socks;
+
+ write_lock_bh(&sk->sk_callback_lock);
+ socks = __locked_read_sk_user_data_with_flags(sk, SK_USER_DATA_BPF);
+ if (socks) {
+ WRITE_ONCE(sk->sk_user_data, NULL);
+ /*
+ * Do not move this NULL assignment outside of
+ * sk->sk_callback_lock because there is
+ * a race with reuseport_array_free()
+ * which does not hold the reuseport_lock.
+ */
+ RCU_INIT_POINTER(*socks, NULL);
+ }
+ write_unlock_bh(&sk->sk_callback_lock);
+}
+
+static int reuseport_array_alloc_check(union bpf_attr *attr)
+{
+ if (attr->value_size != sizeof(u32) &&
+ attr->value_size != sizeof(u64))
+ return -EINVAL;
+
+ return array_map_alloc_check(attr);
+}
+
+static void *reuseport_array_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct reuseport_array *array = reuseport_array(map);
+ u32 index = *(u32 *)key;
+
+ if (unlikely(index >= array->map.max_entries))
+ return NULL;
+
+ return rcu_dereference(array->ptrs[index]);
+}
+
+/* Called from syscall only */
+static long reuseport_array_delete_elem(struct bpf_map *map, void *key)
+{
+ struct reuseport_array *array = reuseport_array(map);
+ u32 index = *(u32 *)key;
+ struct sock *sk;
+ int err;
+
+ if (index >= map->max_entries)
+ return -E2BIG;
+
+ if (!rcu_access_pointer(array->ptrs[index]))
+ return -ENOENT;
+
+ spin_lock_bh(&reuseport_lock);
+
+ sk = rcu_dereference_protected(array->ptrs[index],
+ lockdep_is_held(&reuseport_lock));
+ if (sk) {
+ write_lock_bh(&sk->sk_callback_lock);
+ WRITE_ONCE(sk->sk_user_data, NULL);
+ RCU_INIT_POINTER(array->ptrs[index], NULL);
+ write_unlock_bh(&sk->sk_callback_lock);
+ err = 0;
+ } else {
+ err = -ENOENT;
+ }
+
+ spin_unlock_bh(&reuseport_lock);
+
+ return err;
+}
+
+static void reuseport_array_free(struct bpf_map *map)
+{
+ struct reuseport_array *array = reuseport_array(map);
+ struct sock *sk;
+ u32 i;
+
+ /*
+ * ops->map_*_elem() will not be able to access this
+ * array now. Hence, this function only races with
+ * bpf_sk_reuseport_detach() which was triggered by
+ * close() or disconnect().
+ *
+ * This function and bpf_sk_reuseport_detach() are
+ * both removing sk from "array". Who removes it
+ * first does not matter.
+ *
+ * The only concern here is bpf_sk_reuseport_detach()
+ * may access "array" which is being freed here.
+ * bpf_sk_reuseport_detach() access this "array"
+ * through sk->sk_user_data _and_ with sk->sk_callback_lock
+ * held which is enough because this "array" is not freed
+ * until all sk->sk_user_data has stopped referencing this "array".
+ *
+ * Hence, due to the above, taking "reuseport_lock" is not
+ * needed here.
+ */
+
+ /*
+ * Since reuseport_lock is not taken, sk is accessed under
+ * rcu_read_lock()
+ */
+ rcu_read_lock();
+ for (i = 0; i < map->max_entries; i++) {
+ sk = rcu_dereference(array->ptrs[i]);
+ if (sk) {
+ write_lock_bh(&sk->sk_callback_lock);
+ /*
+ * No need for WRITE_ONCE(). At this point,
+ * no one is reading it without taking the
+ * sk->sk_callback_lock.
+ */
+ sk->sk_user_data = NULL;
+ write_unlock_bh(&sk->sk_callback_lock);
+ RCU_INIT_POINTER(array->ptrs[i], NULL);
+ }
+ }
+ rcu_read_unlock();
+
+ /*
+ * Once reaching here, all sk->sk_user_data is not
+ * referencing this "array". "array" can be freed now.
+ */
+ bpf_map_area_free(array);
+}
+
+static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr)
+{
+ int numa_node = bpf_map_attr_numa_node(attr);
+ struct reuseport_array *array;
+
+ /* allocate all map elements and zero-initialize them */
+ array = bpf_map_area_alloc(struct_size(array, ptrs, attr->max_entries), numa_node);
+ if (!array)
+ return ERR_PTR(-ENOMEM);
+
+ /* copy mandatory map attributes */
+ bpf_map_init_from_attr(&array->map, attr);
+
+ return &array->map;
+}
+
+int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key,
+ void *value)
+{
+ struct sock *sk;
+ int err;
+
+ if (map->value_size != sizeof(u64))
+ return -ENOSPC;
+
+ rcu_read_lock();
+ sk = reuseport_array_lookup_elem(map, key);
+ if (sk) {
+ *(u64 *)value = __sock_gen_cookie(sk);
+ err = 0;
+ } else {
+ err = -ENOENT;
+ }
+ rcu_read_unlock();
+
+ return err;
+}
+
+static int
+reuseport_array_update_check(const struct reuseport_array *array,
+ const struct sock *nsk,
+ const struct sock *osk,
+ const struct sock_reuseport *nsk_reuse,
+ u32 map_flags)
+{
+ if (osk && map_flags == BPF_NOEXIST)
+ return -EEXIST;
+
+ if (!osk && map_flags == BPF_EXIST)
+ return -ENOENT;
+
+ if (nsk->sk_protocol != IPPROTO_UDP && nsk->sk_protocol != IPPROTO_TCP)
+ return -ENOTSUPP;
+
+ if (nsk->sk_family != AF_INET && nsk->sk_family != AF_INET6)
+ return -ENOTSUPP;
+
+ if (nsk->sk_type != SOCK_STREAM && nsk->sk_type != SOCK_DGRAM)
+ return -ENOTSUPP;
+
+ /*
+ * sk must be hashed (i.e. listening in the TCP case or binded
+ * in the UDP case) and
+ * it must also be a SO_REUSEPORT sk (i.e. reuse cannot be NULL).
+ *
+ * Also, sk will be used in bpf helper that is protected by
+ * rcu_read_lock().
+ */
+ if (!sock_flag(nsk, SOCK_RCU_FREE) || !sk_hashed(nsk) || !nsk_reuse)
+ return -EINVAL;
+
+ /* READ_ONCE because the sk->sk_callback_lock may not be held here */
+ if (READ_ONCE(nsk->sk_user_data))
+ return -EBUSY;
+
+ return 0;
+}
+
+/*
+ * Called from syscall only.
+ * The "nsk" in the fd refcnt.
+ * The "osk" and "reuse" are protected by reuseport_lock.
+ */
+int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
+{
+ struct reuseport_array *array = reuseport_array(map);
+ struct sock *free_osk = NULL, *osk, *nsk;
+ struct sock_reuseport *reuse;
+ u32 index = *(u32 *)key;
+ uintptr_t sk_user_data;
+ struct socket *socket;
+ int err, fd;
+
+ if (map_flags > BPF_EXIST)
+ return -EINVAL;
+
+ if (index >= map->max_entries)
+ return -E2BIG;
+
+ if (map->value_size == sizeof(u64)) {
+ u64 fd64 = *(u64 *)value;
+
+ if (fd64 > S32_MAX)
+ return -EINVAL;
+ fd = fd64;
+ } else {
+ fd = *(int *)value;
+ }
+
+ socket = sockfd_lookup(fd, &err);
+ if (!socket)
+ return err;
+
+ nsk = socket->sk;
+ if (!nsk) {
+ err = -EINVAL;
+ goto put_file;
+ }
+
+ /* Quick checks before taking reuseport_lock */
+ err = reuseport_array_update_check(array, nsk,
+ rcu_access_pointer(array->ptrs[index]),
+ rcu_access_pointer(nsk->sk_reuseport_cb),
+ map_flags);
+ if (err)
+ goto put_file;
+
+ spin_lock_bh(&reuseport_lock);
+ /*
+ * Some of the checks only need reuseport_lock
+ * but it is done under sk_callback_lock also
+ * for simplicity reason.
+ */
+ write_lock_bh(&nsk->sk_callback_lock);
+
+ osk = rcu_dereference_protected(array->ptrs[index],
+ lockdep_is_held(&reuseport_lock));
+ reuse = rcu_dereference_protected(nsk->sk_reuseport_cb,
+ lockdep_is_held(&reuseport_lock));
+ err = reuseport_array_update_check(array, nsk, osk, reuse, map_flags);
+ if (err)
+ goto put_file_unlock;
+
+ sk_user_data = (uintptr_t)&array->ptrs[index] | SK_USER_DATA_NOCOPY |
+ SK_USER_DATA_BPF;
+ WRITE_ONCE(nsk->sk_user_data, (void *)sk_user_data);
+ rcu_assign_pointer(array->ptrs[index], nsk);
+ free_osk = osk;
+ err = 0;
+
+put_file_unlock:
+ write_unlock_bh(&nsk->sk_callback_lock);
+
+ if (free_osk) {
+ write_lock_bh(&free_osk->sk_callback_lock);
+ WRITE_ONCE(free_osk->sk_user_data, NULL);
+ write_unlock_bh(&free_osk->sk_callback_lock);
+ }
+
+ spin_unlock_bh(&reuseport_lock);
+put_file:
+ sockfd_put(socket);
+ return err;
+}
+
+/* Called from syscall */
+static int reuseport_array_get_next_key(struct bpf_map *map, void *key,
+ void *next_key)
+{
+ struct reuseport_array *array = reuseport_array(map);
+ u32 index = key ? *(u32 *)key : U32_MAX;
+ u32 *next = (u32 *)next_key;
+
+ if (index >= array->map.max_entries) {
+ *next = 0;
+ return 0;
+ }
+
+ if (index == array->map.max_entries - 1)
+ return -ENOENT;
+
+ *next = index + 1;
+ return 0;
+}
+
+static u64 reuseport_array_mem_usage(const struct bpf_map *map)
+{
+ struct reuseport_array *array;
+
+ return struct_size(array, ptrs, map->max_entries);
+}
+
+BTF_ID_LIST_SINGLE(reuseport_array_map_btf_ids, struct, reuseport_array)
+const struct bpf_map_ops reuseport_array_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
+ .map_alloc_check = reuseport_array_alloc_check,
+ .map_alloc = reuseport_array_alloc,
+ .map_free = reuseport_array_free,
+ .map_lookup_elem = reuseport_array_lookup_elem,
+ .map_get_next_key = reuseport_array_get_next_key,
+ .map_delete_elem = reuseport_array_delete_elem,
+ .map_mem_usage = reuseport_array_mem_usage,
+ .map_btf_id = &reuseport_array_map_btf_ids[0],
+};
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
new file mode 100644
index 000000000000..f6a075ffac63
--- /dev/null
+++ b/kernel/bpf/ringbuf.c
@@ -0,0 +1,879 @@
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/err.h>
+#include <linux/irq_work.h>
+#include <linux/slab.h>
+#include <linux/filter.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/wait.h>
+#include <linux/poll.h>
+#include <linux/kmemleak.h>
+#include <uapi/linux/btf.h>
+#include <linux/btf_ids.h>
+#include <asm/rqspinlock.h>
+
+#define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE | BPF_F_RB_OVERWRITE)
+
+/* non-mmap()'able part of bpf_ringbuf (everything up to consumer page) */
+#define RINGBUF_PGOFF \
+ (offsetof(struct bpf_ringbuf, consumer_pos) >> PAGE_SHIFT)
+/* consumer page and producer page */
+#define RINGBUF_POS_PAGES 2
+#define RINGBUF_NR_META_PAGES (RINGBUF_PGOFF + RINGBUF_POS_PAGES)
+
+#define RINGBUF_MAX_RECORD_SZ (UINT_MAX/4)
+
+struct bpf_ringbuf {
+ wait_queue_head_t waitq;
+ struct irq_work work;
+ u64 mask;
+ struct page **pages;
+ int nr_pages;
+ bool overwrite_mode;
+ rqspinlock_t spinlock ____cacheline_aligned_in_smp;
+ /* For user-space producer ring buffers, an atomic_t busy bit is used
+ * to synchronize access to the ring buffers in the kernel, rather than
+ * the spinlock that is used for kernel-producer ring buffers. This is
+ * done because the ring buffer must hold a lock across a BPF program's
+ * callback:
+ *
+ * __bpf_user_ringbuf_peek() // lock acquired
+ * -> program callback_fn()
+ * -> __bpf_user_ringbuf_sample_release() // lock released
+ *
+ * It is unsafe and incorrect to hold an IRQ spinlock across what could
+ * be a long execution window, so we instead simply disallow concurrent
+ * access to the ring buffer by kernel consumers, and return -EBUSY from
+ * __bpf_user_ringbuf_peek() if the busy bit is held by another task.
+ */
+ atomic_t busy ____cacheline_aligned_in_smp;
+ /* Consumer and producer counters are put into separate pages to
+ * allow each position to be mapped with different permissions.
+ * This prevents a user-space application from modifying the
+ * position and ruining in-kernel tracking. The permissions of the
+ * pages depend on who is producing samples: user-space or the
+ * kernel. Note that the pending counter is placed in the same
+ * page as the producer, so that it shares the same cache line.
+ *
+ * Kernel-producer
+ * ---------------
+ * The producer position and data pages are mapped as r/o in
+ * userspace. For this approach, bits in the header of samples are
+ * used to signal to user-space, and to other producers, whether a
+ * sample is currently being written.
+ *
+ * User-space producer
+ * -------------------
+ * Only the page containing the consumer position is mapped r/o in
+ * user-space. User-space producers also use bits of the header to
+ * communicate to the kernel, but the kernel must carefully check and
+ * validate each sample to ensure that they're correctly formatted, and
+ * fully contained within the ring buffer.
+ */
+ unsigned long consumer_pos __aligned(PAGE_SIZE);
+ unsigned long producer_pos __aligned(PAGE_SIZE);
+ unsigned long pending_pos;
+ unsigned long overwrite_pos; /* position after the last overwritten record */
+ char data[] __aligned(PAGE_SIZE);
+};
+
+struct bpf_ringbuf_map {
+ struct bpf_map map;
+ struct bpf_ringbuf *rb;
+};
+
+/* 8-byte ring buffer record header structure */
+struct bpf_ringbuf_hdr {
+ u32 len;
+ u32 pg_off;
+};
+
+static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node)
+{
+ const gfp_t flags = GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL |
+ __GFP_NOWARN | __GFP_ZERO;
+ int nr_meta_pages = RINGBUF_NR_META_PAGES;
+ int nr_data_pages = data_sz >> PAGE_SHIFT;
+ int nr_pages = nr_meta_pages + nr_data_pages;
+ struct page **pages, *page;
+ struct bpf_ringbuf *rb;
+ size_t array_size;
+ int i;
+
+ /* Each data page is mapped twice to allow "virtual"
+ * continuous read of samples wrapping around the end of ring
+ * buffer area:
+ * ------------------------------------------------------
+ * | meta pages | real data pages | same data pages |
+ * ------------------------------------------------------
+ * | | 1 2 3 4 5 6 7 8 9 | 1 2 3 4 5 6 7 8 9 |
+ * ------------------------------------------------------
+ * | | TA DA | TA DA |
+ * ------------------------------------------------------
+ * ^^^^^^^
+ * |
+ * Here, no need to worry about special handling of wrapped-around
+ * data due to double-mapped data pages. This works both in kernel and
+ * when mmap()'ed in user-space, simplifying both kernel and
+ * user-space implementations significantly.
+ */
+ array_size = (nr_meta_pages + 2 * nr_data_pages) * sizeof(*pages);
+ pages = bpf_map_area_alloc(array_size, numa_node);
+ if (!pages)
+ return NULL;
+
+ for (i = 0; i < nr_pages; i++) {
+ page = alloc_pages_node(numa_node, flags, 0);
+ if (!page) {
+ nr_pages = i;
+ goto err_free_pages;
+ }
+ pages[i] = page;
+ if (i >= nr_meta_pages)
+ pages[nr_data_pages + i] = page;
+ }
+
+ rb = vmap(pages, nr_meta_pages + 2 * nr_data_pages,
+ VM_MAP | VM_USERMAP, PAGE_KERNEL);
+ if (rb) {
+ kmemleak_not_leak(pages);
+ rb->pages = pages;
+ rb->nr_pages = nr_pages;
+ return rb;
+ }
+
+err_free_pages:
+ for (i = 0; i < nr_pages; i++)
+ __free_page(pages[i]);
+ bpf_map_area_free(pages);
+ return NULL;
+}
+
+static void bpf_ringbuf_notify(struct irq_work *work)
+{
+ struct bpf_ringbuf *rb = container_of(work, struct bpf_ringbuf, work);
+
+ wake_up_all(&rb->waitq);
+}
+
+/* Maximum size of ring buffer area is limited by 32-bit page offset within
+ * record header, counted in pages. Reserve 8 bits for extensibility, and
+ * take into account few extra pages for consumer/producer pages and
+ * non-mmap()'able parts, the current maximum size would be:
+ *
+ * (((1ULL << 24) - RINGBUF_POS_PAGES - RINGBUF_PGOFF) * PAGE_SIZE)
+ *
+ * This gives 64GB limit, which seems plenty for single ring buffer. Now
+ * considering that the maximum value of data_sz is (4GB - 1), there
+ * will be no overflow, so just note the size limit in the comments.
+ */
+static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node, bool overwrite_mode)
+{
+ struct bpf_ringbuf *rb;
+
+ rb = bpf_ringbuf_area_alloc(data_sz, numa_node);
+ if (!rb)
+ return NULL;
+
+ raw_res_spin_lock_init(&rb->spinlock);
+ atomic_set(&rb->busy, 0);
+ init_waitqueue_head(&rb->waitq);
+ init_irq_work(&rb->work, bpf_ringbuf_notify);
+
+ rb->mask = data_sz - 1;
+ rb->consumer_pos = 0;
+ rb->producer_pos = 0;
+ rb->pending_pos = 0;
+ rb->overwrite_mode = overwrite_mode;
+
+ return rb;
+}
+
+static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr)
+{
+ bool overwrite_mode = false;
+ struct bpf_ringbuf_map *rb_map;
+
+ if (attr->map_flags & ~RINGBUF_CREATE_FLAG_MASK)
+ return ERR_PTR(-EINVAL);
+
+ if (attr->map_flags & BPF_F_RB_OVERWRITE) {
+ if (attr->map_type != BPF_MAP_TYPE_RINGBUF)
+ return ERR_PTR(-EINVAL);
+ overwrite_mode = true;
+ }
+
+ if (attr->key_size || attr->value_size ||
+ !is_power_of_2(attr->max_entries) ||
+ !PAGE_ALIGNED(attr->max_entries))
+ return ERR_PTR(-EINVAL);
+
+ rb_map = bpf_map_area_alloc(sizeof(*rb_map), NUMA_NO_NODE);
+ if (!rb_map)
+ return ERR_PTR(-ENOMEM);
+
+ bpf_map_init_from_attr(&rb_map->map, attr);
+
+ rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node, overwrite_mode);
+ if (!rb_map->rb) {
+ bpf_map_area_free(rb_map);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return &rb_map->map;
+}
+
+static void bpf_ringbuf_free(struct bpf_ringbuf *rb)
+{
+ irq_work_sync(&rb->work);
+
+ /* copy pages pointer and nr_pages to local variable, as we are going
+ * to unmap rb itself with vunmap() below
+ */
+ struct page **pages = rb->pages;
+ int i, nr_pages = rb->nr_pages;
+
+ vunmap(rb);
+ for (i = 0; i < nr_pages; i++)
+ __free_page(pages[i]);
+ bpf_map_area_free(pages);
+}
+
+static void ringbuf_map_free(struct bpf_map *map)
+{
+ struct bpf_ringbuf_map *rb_map;
+
+ rb_map = container_of(map, struct bpf_ringbuf_map, map);
+ bpf_ringbuf_free(rb_map->rb);
+ bpf_map_area_free(rb_map);
+}
+
+static void *ringbuf_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ return ERR_PTR(-ENOTSUPP);
+}
+
+static long ringbuf_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 flags)
+{
+ return -ENOTSUPP;
+}
+
+static long ringbuf_map_delete_elem(struct bpf_map *map, void *key)
+{
+ return -ENOTSUPP;
+}
+
+static int ringbuf_map_get_next_key(struct bpf_map *map, void *key,
+ void *next_key)
+{
+ return -ENOTSUPP;
+}
+
+static int ringbuf_map_mmap_kern(struct bpf_map *map, struct vm_area_struct *vma)
+{
+ struct bpf_ringbuf_map *rb_map;
+
+ rb_map = container_of(map, struct bpf_ringbuf_map, map);
+
+ if (vma->vm_flags & VM_WRITE) {
+ /* allow writable mapping for the consumer_pos only */
+ if (vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != PAGE_SIZE)
+ return -EPERM;
+ }
+ /* remap_vmalloc_range() checks size and offset constraints */
+ return remap_vmalloc_range(vma, rb_map->rb,
+ vma->vm_pgoff + RINGBUF_PGOFF);
+}
+
+static int ringbuf_map_mmap_user(struct bpf_map *map, struct vm_area_struct *vma)
+{
+ struct bpf_ringbuf_map *rb_map;
+
+ rb_map = container_of(map, struct bpf_ringbuf_map, map);
+
+ if (vma->vm_flags & VM_WRITE) {
+ if (vma->vm_pgoff == 0)
+ /* Disallow writable mappings to the consumer pointer,
+ * and allow writable mappings to both the producer
+ * position, and the ring buffer data itself.
+ */
+ return -EPERM;
+ }
+ /* remap_vmalloc_range() checks size and offset constraints */
+ return remap_vmalloc_range(vma, rb_map->rb, vma->vm_pgoff + RINGBUF_PGOFF);
+}
+
+/*
+ * Return an estimate of the available data in the ring buffer.
+ * Note: the returned value can exceed the actual ring buffer size because the
+ * function is not synchronized with the producer. The producer acquires the
+ * ring buffer's spinlock, but this function does not.
+ */
+static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb)
+{
+ unsigned long cons_pos, prod_pos, over_pos;
+
+ cons_pos = smp_load_acquire(&rb->consumer_pos);
+
+ if (unlikely(rb->overwrite_mode)) {
+ over_pos = smp_load_acquire(&rb->overwrite_pos);
+ prod_pos = smp_load_acquire(&rb->producer_pos);
+ return prod_pos - max(cons_pos, over_pos);
+ } else {
+ prod_pos = smp_load_acquire(&rb->producer_pos);
+ return prod_pos - cons_pos;
+ }
+}
+
+static u32 ringbuf_total_data_sz(const struct bpf_ringbuf *rb)
+{
+ return rb->mask + 1;
+}
+
+static __poll_t ringbuf_map_poll_kern(struct bpf_map *map, struct file *filp,
+ struct poll_table_struct *pts)
+{
+ struct bpf_ringbuf_map *rb_map;
+
+ rb_map = container_of(map, struct bpf_ringbuf_map, map);
+ poll_wait(filp, &rb_map->rb->waitq, pts);
+
+ if (ringbuf_avail_data_sz(rb_map->rb))
+ return EPOLLIN | EPOLLRDNORM;
+ return 0;
+}
+
+static __poll_t ringbuf_map_poll_user(struct bpf_map *map, struct file *filp,
+ struct poll_table_struct *pts)
+{
+ struct bpf_ringbuf_map *rb_map;
+
+ rb_map = container_of(map, struct bpf_ringbuf_map, map);
+ poll_wait(filp, &rb_map->rb->waitq, pts);
+
+ if (ringbuf_avail_data_sz(rb_map->rb) < ringbuf_total_data_sz(rb_map->rb))
+ return EPOLLOUT | EPOLLWRNORM;
+ return 0;
+}
+
+static u64 ringbuf_map_mem_usage(const struct bpf_map *map)
+{
+ struct bpf_ringbuf *rb;
+ int nr_data_pages;
+ int nr_meta_pages;
+ u64 usage = sizeof(struct bpf_ringbuf_map);
+
+ rb = container_of(map, struct bpf_ringbuf_map, map)->rb;
+ usage += (u64)rb->nr_pages << PAGE_SHIFT;
+ nr_meta_pages = RINGBUF_NR_META_PAGES;
+ nr_data_pages = map->max_entries >> PAGE_SHIFT;
+ usage += (nr_meta_pages + 2 * nr_data_pages) * sizeof(struct page *);
+ return usage;
+}
+
+BTF_ID_LIST_SINGLE(ringbuf_map_btf_ids, struct, bpf_ringbuf_map)
+const struct bpf_map_ops ringbuf_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
+ .map_alloc = ringbuf_map_alloc,
+ .map_free = ringbuf_map_free,
+ .map_mmap = ringbuf_map_mmap_kern,
+ .map_poll = ringbuf_map_poll_kern,
+ .map_lookup_elem = ringbuf_map_lookup_elem,
+ .map_update_elem = ringbuf_map_update_elem,
+ .map_delete_elem = ringbuf_map_delete_elem,
+ .map_get_next_key = ringbuf_map_get_next_key,
+ .map_mem_usage = ringbuf_map_mem_usage,
+ .map_btf_id = &ringbuf_map_btf_ids[0],
+};
+
+BTF_ID_LIST_SINGLE(user_ringbuf_map_btf_ids, struct, bpf_ringbuf_map)
+const struct bpf_map_ops user_ringbuf_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
+ .map_alloc = ringbuf_map_alloc,
+ .map_free = ringbuf_map_free,
+ .map_mmap = ringbuf_map_mmap_user,
+ .map_poll = ringbuf_map_poll_user,
+ .map_lookup_elem = ringbuf_map_lookup_elem,
+ .map_update_elem = ringbuf_map_update_elem,
+ .map_delete_elem = ringbuf_map_delete_elem,
+ .map_get_next_key = ringbuf_map_get_next_key,
+ .map_mem_usage = ringbuf_map_mem_usage,
+ .map_btf_id = &user_ringbuf_map_btf_ids[0],
+};
+
+/* Given pointer to ring buffer record metadata and struct bpf_ringbuf itself,
+ * calculate offset from record metadata to ring buffer in pages, rounded
+ * down. This page offset is stored as part of record metadata and allows to
+ * restore struct bpf_ringbuf * from record pointer. This page offset is
+ * stored at offset 4 of record metadata header.
+ */
+static size_t bpf_ringbuf_rec_pg_off(struct bpf_ringbuf *rb,
+ struct bpf_ringbuf_hdr *hdr)
+{
+ return ((void *)hdr - (void *)rb) >> PAGE_SHIFT;
+}
+
+/* Given pointer to ring buffer record header, restore pointer to struct
+ * bpf_ringbuf itself by using page offset stored at offset 4
+ */
+static struct bpf_ringbuf *
+bpf_ringbuf_restore_from_rec(struct bpf_ringbuf_hdr *hdr)
+{
+ unsigned long addr = (unsigned long)(void *)hdr;
+ unsigned long off = (unsigned long)hdr->pg_off << PAGE_SHIFT;
+
+ return (void*)((addr & PAGE_MASK) - off);
+}
+
+static bool bpf_ringbuf_has_space(const struct bpf_ringbuf *rb,
+ unsigned long new_prod_pos,
+ unsigned long cons_pos,
+ unsigned long pend_pos)
+{
+ /*
+ * No space if oldest not yet committed record until the newest
+ * record span more than (ringbuf_size - 1).
+ */
+ if (new_prod_pos - pend_pos > rb->mask)
+ return false;
+
+ /* Ok, we have space in overwrite mode */
+ if (unlikely(rb->overwrite_mode))
+ return true;
+
+ /*
+ * No space if producer position advances more than (ringbuf_size - 1)
+ * ahead of consumer position when not in overwrite mode.
+ */
+ if (new_prod_pos - cons_pos > rb->mask)
+ return false;
+
+ return true;
+}
+
+static u32 bpf_ringbuf_round_up_hdr_len(u32 hdr_len)
+{
+ hdr_len &= ~BPF_RINGBUF_DISCARD_BIT;
+ return round_up(hdr_len + BPF_RINGBUF_HDR_SZ, 8);
+}
+
+static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size)
+{
+ unsigned long cons_pos, prod_pos, new_prod_pos, pend_pos, over_pos, flags;
+ struct bpf_ringbuf_hdr *hdr;
+ u32 len, pg_off, hdr_len;
+
+ if (unlikely(size > RINGBUF_MAX_RECORD_SZ))
+ return NULL;
+
+ len = round_up(size + BPF_RINGBUF_HDR_SZ, 8);
+ if (len > ringbuf_total_data_sz(rb))
+ return NULL;
+
+ cons_pos = smp_load_acquire(&rb->consumer_pos);
+
+ if (raw_res_spin_lock_irqsave(&rb->spinlock, flags))
+ return NULL;
+
+ pend_pos = rb->pending_pos;
+ prod_pos = rb->producer_pos;
+ new_prod_pos = prod_pos + len;
+
+ while (pend_pos < prod_pos) {
+ hdr = (void *)rb->data + (pend_pos & rb->mask);
+ hdr_len = READ_ONCE(hdr->len);
+ if (hdr_len & BPF_RINGBUF_BUSY_BIT)
+ break;
+ pend_pos += bpf_ringbuf_round_up_hdr_len(hdr_len);
+ }
+ rb->pending_pos = pend_pos;
+
+ if (!bpf_ringbuf_has_space(rb, new_prod_pos, cons_pos, pend_pos)) {
+ raw_res_spin_unlock_irqrestore(&rb->spinlock, flags);
+ return NULL;
+ }
+
+ /*
+ * In overwrite mode, advance overwrite_pos when the ring buffer is full.
+ * The key points are to stay on record boundaries and consume enough records
+ * to fit the new one.
+ */
+ if (unlikely(rb->overwrite_mode)) {
+ over_pos = rb->overwrite_pos;
+ while (new_prod_pos - over_pos > rb->mask) {
+ hdr = (void *)rb->data + (over_pos & rb->mask);
+ hdr_len = READ_ONCE(hdr->len);
+ /*
+ * The bpf_ringbuf_has_space() check above ensures we won’t
+ * step over a record currently being worked on by another
+ * producer.
+ */
+ over_pos += bpf_ringbuf_round_up_hdr_len(hdr_len);
+ }
+ /*
+ * smp_store_release(&rb->producer_pos, new_prod_pos) at
+ * the end of the function ensures that when consumer sees
+ * the updated rb->producer_pos, it always sees the updated
+ * rb->overwrite_pos, so when consumer reads overwrite_pos
+ * after smp_load_acquire(r->producer_pos), the overwrite_pos
+ * will always be valid.
+ */
+ WRITE_ONCE(rb->overwrite_pos, over_pos);
+ }
+
+ hdr = (void *)rb->data + (prod_pos & rb->mask);
+ pg_off = bpf_ringbuf_rec_pg_off(rb, hdr);
+ hdr->len = size | BPF_RINGBUF_BUSY_BIT;
+ hdr->pg_off = pg_off;
+
+ /* pairs with consumer's smp_load_acquire() */
+ smp_store_release(&rb->producer_pos, new_prod_pos);
+
+ raw_res_spin_unlock_irqrestore(&rb->spinlock, flags);
+
+ return (void *)hdr + BPF_RINGBUF_HDR_SZ;
+}
+
+BPF_CALL_3(bpf_ringbuf_reserve, struct bpf_map *, map, u64, size, u64, flags)
+{
+ struct bpf_ringbuf_map *rb_map;
+
+ if (unlikely(flags))
+ return 0;
+
+ rb_map = container_of(map, struct bpf_ringbuf_map, map);
+ return (unsigned long)__bpf_ringbuf_reserve(rb_map->rb, size);
+}
+
+const struct bpf_func_proto bpf_ringbuf_reserve_proto = {
+ .func = bpf_ringbuf_reserve,
+ .ret_type = RET_PTR_TO_RINGBUF_MEM_OR_NULL,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_CONST_ALLOC_SIZE_OR_ZERO,
+ .arg3_type = ARG_ANYTHING,
+};
+
+static void bpf_ringbuf_commit(void *sample, u64 flags, bool discard)
+{
+ unsigned long rec_pos, cons_pos;
+ struct bpf_ringbuf_hdr *hdr;
+ struct bpf_ringbuf *rb;
+ u32 new_len;
+
+ hdr = sample - BPF_RINGBUF_HDR_SZ;
+ rb = bpf_ringbuf_restore_from_rec(hdr);
+ new_len = hdr->len ^ BPF_RINGBUF_BUSY_BIT;
+ if (discard)
+ new_len |= BPF_RINGBUF_DISCARD_BIT;
+
+ /* update record header with correct final size prefix */
+ xchg(&hdr->len, new_len);
+
+ /* if consumer caught up and is waiting for our record, notify about
+ * new data availability
+ */
+ rec_pos = (void *)hdr - (void *)rb->data;
+ cons_pos = smp_load_acquire(&rb->consumer_pos) & rb->mask;
+
+ if (flags & BPF_RB_FORCE_WAKEUP)
+ irq_work_queue(&rb->work);
+ else if (cons_pos == rec_pos && !(flags & BPF_RB_NO_WAKEUP))
+ irq_work_queue(&rb->work);
+}
+
+BPF_CALL_2(bpf_ringbuf_submit, void *, sample, u64, flags)
+{
+ bpf_ringbuf_commit(sample, flags, false /* discard */);
+ return 0;
+}
+
+const struct bpf_func_proto bpf_ringbuf_submit_proto = {
+ .func = bpf_ringbuf_submit,
+ .ret_type = RET_VOID,
+ .arg1_type = ARG_PTR_TO_RINGBUF_MEM | OBJ_RELEASE,
+ .arg2_type = ARG_ANYTHING,
+};
+
+BPF_CALL_2(bpf_ringbuf_discard, void *, sample, u64, flags)
+{
+ bpf_ringbuf_commit(sample, flags, true /* discard */);
+ return 0;
+}
+
+const struct bpf_func_proto bpf_ringbuf_discard_proto = {
+ .func = bpf_ringbuf_discard,
+ .ret_type = RET_VOID,
+ .arg1_type = ARG_PTR_TO_RINGBUF_MEM | OBJ_RELEASE,
+ .arg2_type = ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_ringbuf_output, struct bpf_map *, map, void *, data, u64, size,
+ u64, flags)
+{
+ struct bpf_ringbuf_map *rb_map;
+ void *rec;
+
+ if (unlikely(flags & ~(BPF_RB_NO_WAKEUP | BPF_RB_FORCE_WAKEUP)))
+ return -EINVAL;
+
+ rb_map = container_of(map, struct bpf_ringbuf_map, map);
+ rec = __bpf_ringbuf_reserve(rb_map->rb, size);
+ if (!rec)
+ return -EAGAIN;
+
+ memcpy(rec, data, size);
+ bpf_ringbuf_commit(rec, flags, false /* discard */);
+ return 0;
+}
+
+const struct bpf_func_proto bpf_ringbuf_output_proto = {
+ .func = bpf_ringbuf_output,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_2(bpf_ringbuf_query, struct bpf_map *, map, u64, flags)
+{
+ struct bpf_ringbuf *rb;
+
+ rb = container_of(map, struct bpf_ringbuf_map, map)->rb;
+
+ switch (flags) {
+ case BPF_RB_AVAIL_DATA:
+ return ringbuf_avail_data_sz(rb);
+ case BPF_RB_RING_SIZE:
+ return ringbuf_total_data_sz(rb);
+ case BPF_RB_CONS_POS:
+ return smp_load_acquire(&rb->consumer_pos);
+ case BPF_RB_PROD_POS:
+ return smp_load_acquire(&rb->producer_pos);
+ case BPF_RB_OVERWRITE_POS:
+ return smp_load_acquire(&rb->overwrite_pos);
+ default:
+ return 0;
+ }
+}
+
+const struct bpf_func_proto bpf_ringbuf_query_proto = {
+ .func = bpf_ringbuf_query,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_ringbuf_reserve_dynptr, struct bpf_map *, map, u32, size, u64, flags,
+ struct bpf_dynptr_kern *, ptr)
+{
+ struct bpf_ringbuf_map *rb_map;
+ void *sample;
+ int err;
+
+ if (unlikely(flags)) {
+ bpf_dynptr_set_null(ptr);
+ return -EINVAL;
+ }
+
+ err = bpf_dynptr_check_size(size);
+ if (err) {
+ bpf_dynptr_set_null(ptr);
+ return err;
+ }
+
+ rb_map = container_of(map, struct bpf_ringbuf_map, map);
+
+ sample = __bpf_ringbuf_reserve(rb_map->rb, size);
+ if (!sample) {
+ bpf_dynptr_set_null(ptr);
+ return -EINVAL;
+ }
+
+ bpf_dynptr_init(ptr, sample, BPF_DYNPTR_TYPE_RINGBUF, 0, size);
+
+ return 0;
+}
+
+const struct bpf_func_proto bpf_ringbuf_reserve_dynptr_proto = {
+ .func = bpf_ringbuf_reserve_dynptr,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | MEM_UNINIT | MEM_WRITE,
+};
+
+BPF_CALL_2(bpf_ringbuf_submit_dynptr, struct bpf_dynptr_kern *, ptr, u64, flags)
+{
+ if (!ptr->data)
+ return 0;
+
+ bpf_ringbuf_commit(ptr->data, flags, false /* discard */);
+
+ bpf_dynptr_set_null(ptr);
+
+ return 0;
+}
+
+const struct bpf_func_proto bpf_ringbuf_submit_dynptr_proto = {
+ .func = bpf_ringbuf_submit_dynptr,
+ .ret_type = RET_VOID,
+ .arg1_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | OBJ_RELEASE,
+ .arg2_type = ARG_ANYTHING,
+};
+
+BPF_CALL_2(bpf_ringbuf_discard_dynptr, struct bpf_dynptr_kern *, ptr, u64, flags)
+{
+ if (!ptr->data)
+ return 0;
+
+ bpf_ringbuf_commit(ptr->data, flags, true /* discard */);
+
+ bpf_dynptr_set_null(ptr);
+
+ return 0;
+}
+
+const struct bpf_func_proto bpf_ringbuf_discard_dynptr_proto = {
+ .func = bpf_ringbuf_discard_dynptr,
+ .ret_type = RET_VOID,
+ .arg1_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | OBJ_RELEASE,
+ .arg2_type = ARG_ANYTHING,
+};
+
+static int __bpf_user_ringbuf_peek(struct bpf_ringbuf *rb, void **sample, u32 *size)
+{
+ int err;
+ u32 hdr_len, sample_len, total_len, flags, *hdr;
+ u64 cons_pos, prod_pos;
+
+ /* Synchronizes with smp_store_release() in user-space producer. */
+ prod_pos = smp_load_acquire(&rb->producer_pos);
+ if (prod_pos % 8)
+ return -EINVAL;
+
+ /* Synchronizes with smp_store_release() in __bpf_user_ringbuf_sample_release() */
+ cons_pos = smp_load_acquire(&rb->consumer_pos);
+ if (cons_pos >= prod_pos)
+ return -ENODATA;
+
+ hdr = (u32 *)((uintptr_t)rb->data + (uintptr_t)(cons_pos & rb->mask));
+ /* Synchronizes with smp_store_release() in user-space producer. */
+ hdr_len = smp_load_acquire(hdr);
+ flags = hdr_len & (BPF_RINGBUF_BUSY_BIT | BPF_RINGBUF_DISCARD_BIT);
+ sample_len = hdr_len & ~flags;
+ total_len = round_up(sample_len + BPF_RINGBUF_HDR_SZ, 8);
+
+ /* The sample must fit within the region advertised by the producer position. */
+ if (total_len > prod_pos - cons_pos)
+ return -EINVAL;
+
+ /* The sample must fit within the data region of the ring buffer. */
+ if (total_len > ringbuf_total_data_sz(rb))
+ return -E2BIG;
+
+ /* The sample must fit into a struct bpf_dynptr. */
+ err = bpf_dynptr_check_size(sample_len);
+ if (err)
+ return -E2BIG;
+
+ if (flags & BPF_RINGBUF_DISCARD_BIT) {
+ /* If the discard bit is set, the sample should be skipped.
+ *
+ * Update the consumer pos, and return -EAGAIN so the caller
+ * knows to skip this sample and try to read the next one.
+ */
+ smp_store_release(&rb->consumer_pos, cons_pos + total_len);
+ return -EAGAIN;
+ }
+
+ if (flags & BPF_RINGBUF_BUSY_BIT)
+ return -ENODATA;
+
+ *sample = (void *)((uintptr_t)rb->data +
+ (uintptr_t)((cons_pos + BPF_RINGBUF_HDR_SZ) & rb->mask));
+ *size = sample_len;
+ return 0;
+}
+
+static void __bpf_user_ringbuf_sample_release(struct bpf_ringbuf *rb, size_t size, u64 flags)
+{
+ u64 consumer_pos;
+ u32 rounded_size = round_up(size + BPF_RINGBUF_HDR_SZ, 8);
+
+ /* Using smp_load_acquire() is unnecessary here, as the busy-bit
+ * prevents another task from writing to consumer_pos after it was read
+ * by this task with smp_load_acquire() in __bpf_user_ringbuf_peek().
+ */
+ consumer_pos = rb->consumer_pos;
+ /* Synchronizes with smp_load_acquire() in user-space producer. */
+ smp_store_release(&rb->consumer_pos, consumer_pos + rounded_size);
+}
+
+BPF_CALL_4(bpf_user_ringbuf_drain, struct bpf_map *, map,
+ void *, callback_fn, void *, callback_ctx, u64, flags)
+{
+ struct bpf_ringbuf *rb;
+ long samples, discarded_samples = 0, ret = 0;
+ bpf_callback_t callback = (bpf_callback_t)callback_fn;
+ u64 wakeup_flags = BPF_RB_NO_WAKEUP | BPF_RB_FORCE_WAKEUP;
+ int busy = 0;
+
+ if (unlikely(flags & ~wakeup_flags))
+ return -EINVAL;
+
+ rb = container_of(map, struct bpf_ringbuf_map, map)->rb;
+
+ /* If another consumer is already consuming a sample, wait for them to finish. */
+ if (!atomic_try_cmpxchg(&rb->busy, &busy, 1))
+ return -EBUSY;
+
+ for (samples = 0; samples < BPF_MAX_USER_RINGBUF_SAMPLES && ret == 0; samples++) {
+ int err;
+ u32 size;
+ void *sample;
+ struct bpf_dynptr_kern dynptr;
+
+ err = __bpf_user_ringbuf_peek(rb, &sample, &size);
+ if (err) {
+ if (err == -ENODATA) {
+ break;
+ } else if (err == -EAGAIN) {
+ discarded_samples++;
+ continue;
+ } else {
+ ret = err;
+ goto schedule_work_return;
+ }
+ }
+
+ bpf_dynptr_init(&dynptr, sample, BPF_DYNPTR_TYPE_LOCAL, 0, size);
+ ret = callback((uintptr_t)&dynptr, (uintptr_t)callback_ctx, 0, 0, 0);
+ __bpf_user_ringbuf_sample_release(rb, size, flags);
+ }
+ ret = samples - discarded_samples;
+
+schedule_work_return:
+ /* Prevent the clearing of the busy-bit from being reordered before the
+ * storing of any rb consumer or producer positions.
+ */
+ atomic_set_release(&rb->busy, 0);
+
+ if (flags & BPF_RB_FORCE_WAKEUP)
+ irq_work_queue(&rb->work);
+ else if (!(flags & BPF_RB_NO_WAKEUP) && samples > 0)
+ irq_work_queue(&rb->work);
+ return ret;
+}
+
+const struct bpf_func_proto bpf_user_ringbuf_drain_proto = {
+ .func = bpf_user_ringbuf_drain,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_FUNC,
+ .arg3_type = ARG_PTR_TO_STACK_OR_NULL,
+ .arg4_type = ARG_ANYTHING,
+};
diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c
new file mode 100644
index 000000000000..f7d0c8d4644e
--- /dev/null
+++ b/kernel/bpf/rqspinlock.c
@@ -0,0 +1,762 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Resilient Queued Spin Lock
+ *
+ * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
+ * (C) Copyright 2013-2014,2018 Red Hat, Inc.
+ * (C) Copyright 2015 Intel Corp.
+ * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP
+ * (C) Copyright 2024-2025 Meta Platforms, Inc. and affiliates.
+ *
+ * Authors: Waiman Long <longman@redhat.com>
+ * Peter Zijlstra <peterz@infradead.org>
+ * Kumar Kartikeya Dwivedi <memxor@gmail.com>
+ */
+
+#include <linux/smp.h>
+#include <linux/bug.h>
+#include <linux/bpf.h>
+#include <linux/err.h>
+#include <linux/cpumask.h>
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <linux/mutex.h>
+#include <linux/prefetch.h>
+#include <asm/byteorder.h>
+#ifdef CONFIG_QUEUED_SPINLOCKS
+#include <asm/qspinlock.h>
+#endif
+#include <trace/events/lock.h>
+#include <asm/rqspinlock.h>
+#include <linux/timekeeping.h>
+
+/*
+ * Include queued spinlock definitions and statistics code
+ */
+#ifdef CONFIG_QUEUED_SPINLOCKS
+#include "../locking/qspinlock.h"
+#include "../locking/lock_events.h"
+#include "rqspinlock.h"
+#include "../locking/mcs_spinlock.h"
+#endif
+
+/*
+ * The basic principle of a queue-based spinlock can best be understood
+ * by studying a classic queue-based spinlock implementation called the
+ * MCS lock. A copy of the original MCS lock paper ("Algorithms for Scalable
+ * Synchronization on Shared-Memory Multiprocessors by Mellor-Crummey and
+ * Scott") is available at
+ *
+ * https://bugzilla.kernel.org/show_bug.cgi?id=206115
+ *
+ * This queued spinlock implementation is based on the MCS lock, however to
+ * make it fit the 4 bytes we assume spinlock_t to be, and preserve its
+ * existing API, we must modify it somehow.
+ *
+ * In particular; where the traditional MCS lock consists of a tail pointer
+ * (8 bytes) and needs the next pointer (another 8 bytes) of its own node to
+ * unlock the next pending (next->locked), we compress both these: {tail,
+ * next->locked} into a single u32 value.
+ *
+ * Since a spinlock disables recursion of its own context and there is a limit
+ * to the contexts that can nest; namely: task, softirq, hardirq, nmi. As there
+ * are at most 4 nesting levels, it can be encoded by a 2-bit number. Now
+ * we can encode the tail by combining the 2-bit nesting level with the cpu
+ * number. With one byte for the lock value and 3 bytes for the tail, only a
+ * 32-bit word is now needed. Even though we only need 1 bit for the lock,
+ * we extend it to a full byte to achieve better performance for architectures
+ * that support atomic byte write.
+ *
+ * We also change the first spinner to spin on the lock bit instead of its
+ * node; whereby avoiding the need to carry a node from lock to unlock, and
+ * preserving existing lock API. This also makes the unlock code simpler and
+ * faster.
+ *
+ * N.B. The current implementation only supports architectures that allow
+ * atomic operations on smaller 8-bit and 16-bit data types.
+ *
+ */
+
+struct rqspinlock_timeout {
+ u64 timeout_end;
+ u64 duration;
+ u64 cur;
+ u16 spin;
+};
+
+#define RES_TIMEOUT_VAL 2
+
+DEFINE_PER_CPU_ALIGNED(struct rqspinlock_held, rqspinlock_held_locks);
+EXPORT_SYMBOL_GPL(rqspinlock_held_locks);
+
+static bool is_lock_released(rqspinlock_t *lock, u32 mask)
+{
+ if (!(atomic_read_acquire(&lock->val) & (mask)))
+ return true;
+ return false;
+}
+
+static noinline int check_deadlock_AA(rqspinlock_t *lock)
+{
+ struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);
+ int cnt = min(RES_NR_HELD, rqh->cnt);
+
+ /*
+ * Return an error if we hold the lock we are attempting to acquire.
+ * We'll iterate over max 32 locks; no need to do is_lock_released.
+ */
+ for (int i = 0; i < cnt - 1; i++) {
+ if (rqh->locks[i] == lock)
+ return -EDEADLK;
+ }
+ return 0;
+}
+
+/*
+ * This focuses on the most common case of ABBA deadlocks (or ABBA involving
+ * more locks, which reduce to ABBA). This is not exhaustive, and we rely on
+ * timeouts as the final line of defense.
+ */
+static noinline int check_deadlock_ABBA(rqspinlock_t *lock, u32 mask)
+{
+ struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);
+ int rqh_cnt = min(RES_NR_HELD, rqh->cnt);
+ void *remote_lock;
+ int cpu;
+
+ /*
+ * Find the CPU holding the lock that we want to acquire. If there is a
+ * deadlock scenario, we will read a stable set on the remote CPU and
+ * find the target. This would be a constant time operation instead of
+ * O(NR_CPUS) if we could determine the owning CPU from a lock value, but
+ * that requires increasing the size of the lock word.
+ */
+ for_each_possible_cpu(cpu) {
+ struct rqspinlock_held *rqh_cpu = per_cpu_ptr(&rqspinlock_held_locks, cpu);
+ int real_cnt = READ_ONCE(rqh_cpu->cnt);
+ int cnt = min(RES_NR_HELD, real_cnt);
+
+ /*
+ * Let's ensure to break out of this loop if the lock is available for
+ * us to potentially acquire.
+ */
+ if (is_lock_released(lock, mask))
+ return 0;
+
+ /*
+ * Skip ourselves, and CPUs whose count is less than 2, as they need at
+ * least one held lock and one acquisition attempt (reflected as top
+ * most entry) to participate in an ABBA deadlock.
+ *
+ * If cnt is more than RES_NR_HELD, it means the current lock being
+ * acquired won't appear in the table, and other locks in the table are
+ * already held, so we can't determine ABBA.
+ */
+ if (cpu == smp_processor_id() || real_cnt < 2 || real_cnt > RES_NR_HELD)
+ continue;
+
+ /*
+ * Obtain the entry at the top, this corresponds to the lock the
+ * remote CPU is attempting to acquire in a deadlock situation,
+ * and would be one of the locks we hold on the current CPU.
+ */
+ remote_lock = READ_ONCE(rqh_cpu->locks[cnt - 1]);
+ /*
+ * If it is NULL, we've raced and cannot determine a deadlock
+ * conclusively, skip this CPU.
+ */
+ if (!remote_lock)
+ continue;
+ /*
+ * Find if the lock we're attempting to acquire is held by this CPU.
+ * Don't consider the topmost entry, as that must be the latest lock
+ * being held or acquired. For a deadlock, the target CPU must also
+ * attempt to acquire a lock we hold, so for this search only 'cnt - 1'
+ * entries are important.
+ */
+ for (int i = 0; i < cnt - 1; i++) {
+ if (READ_ONCE(rqh_cpu->locks[i]) != lock)
+ continue;
+ /*
+ * We found our lock as held on the remote CPU. Is the
+ * acquisition attempt on the remote CPU for a lock held
+ * by us? If so, we have a deadlock situation, and need
+ * to recover.
+ */
+ for (int i = 0; i < rqh_cnt - 1; i++) {
+ if (rqh->locks[i] == remote_lock)
+ return -EDEADLK;
+ }
+ /*
+ * Inconclusive; retry again later.
+ */
+ return 0;
+ }
+ }
+ return 0;
+}
+
+static noinline int check_timeout(rqspinlock_t *lock, u32 mask,
+ struct rqspinlock_timeout *ts)
+{
+ u64 prev = ts->cur;
+ u64 time;
+
+ if (!ts->timeout_end) {
+ if (check_deadlock_AA(lock))
+ return -EDEADLK;
+ ts->cur = ktime_get_mono_fast_ns();
+ ts->timeout_end = ts->cur + ts->duration;
+ return 0;
+ }
+
+ time = ktime_get_mono_fast_ns();
+ if (time > ts->timeout_end)
+ return -ETIMEDOUT;
+
+ /*
+ * A millisecond interval passed from last time? Trigger deadlock
+ * checks.
+ */
+ if (prev + NSEC_PER_MSEC < time) {
+ ts->cur = time;
+ return check_deadlock_ABBA(lock, mask);
+ }
+
+ return 0;
+}
+
+/*
+ * Do not amortize with spins when res_smp_cond_load_acquire is defined,
+ * as the macro does internal amortization for us.
+ */
+#ifndef res_smp_cond_load_acquire
+#define RES_CHECK_TIMEOUT(ts, ret, mask) \
+ ({ \
+ if (!(ts).spin++) \
+ (ret) = check_timeout((lock), (mask), &(ts)); \
+ (ret); \
+ })
+#else
+#define RES_CHECK_TIMEOUT(ts, ret, mask) \
+ ({ (ret) = check_timeout((lock), (mask), &(ts)); })
+#endif
+
+/*
+ * Initialize the 'spin' member.
+ * Set spin member to 0 to trigger AA/ABBA checks immediately.
+ */
+#define RES_INIT_TIMEOUT(ts) ({ (ts).spin = 0; })
+
+/*
+ * We only need to reset 'timeout_end', 'spin' will just wrap around as necessary.
+ * Duration is defined for each spin attempt, so set it here.
+ */
+#define RES_RESET_TIMEOUT(ts, _duration) ({ (ts).timeout_end = 0; (ts).duration = _duration; })
+
+/*
+ * Provide a test-and-set fallback for cases when queued spin lock support is
+ * absent from the architecture.
+ */
+int __lockfunc resilient_tas_spin_lock(rqspinlock_t *lock)
+{
+ struct rqspinlock_timeout ts;
+ int val, ret = 0;
+
+ RES_INIT_TIMEOUT(ts);
+ /*
+ * The fast path is not invoked for the TAS fallback, so we must grab
+ * the deadlock detection entry here.
+ */
+ grab_held_lock_entry(lock);
+
+ /*
+ * Since the waiting loop's time is dependent on the amount of
+ * contention, a short timeout unlike rqspinlock waiting loops
+ * isn't enough. Choose a second as the timeout value.
+ */
+ RES_RESET_TIMEOUT(ts, NSEC_PER_SEC);
+retry:
+ val = atomic_read(&lock->val);
+
+ if (val || !atomic_try_cmpxchg(&lock->val, &val, 1)) {
+ if (RES_CHECK_TIMEOUT(ts, ret, ~0u))
+ goto out;
+ cpu_relax();
+ goto retry;
+ }
+
+ return 0;
+out:
+ release_held_lock_entry();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(resilient_tas_spin_lock);
+
+#ifdef CONFIG_QUEUED_SPINLOCKS
+
+/*
+ * Per-CPU queue node structures; we can never have more than 4 nested
+ * contexts: task, softirq, hardirq, nmi.
+ *
+ * Exactly fits one 64-byte cacheline on a 64-bit architecture.
+ */
+static DEFINE_PER_CPU_ALIGNED(struct qnode, rqnodes[_Q_MAX_NODES]);
+
+#ifndef res_smp_cond_load_acquire
+#define res_smp_cond_load_acquire(v, c) smp_cond_load_acquire(v, c)
+#endif
+
+#define res_atomic_cond_read_acquire(v, c) res_smp_cond_load_acquire(&(v)->counter, (c))
+
+/**
+ * resilient_queued_spin_lock_slowpath - acquire the queued spinlock
+ * @lock: Pointer to queued spinlock structure
+ * @val: Current value of the queued spinlock 32-bit word
+ *
+ * Return:
+ * * 0 - Lock was acquired successfully.
+ * * -EDEADLK - Lock acquisition failed because of AA/ABBA deadlock.
+ * * -ETIMEDOUT - Lock acquisition failed because of timeout.
+ *
+ * (queue tail, pending bit, lock value)
+ *
+ * fast : slow : unlock
+ * : :
+ * uncontended (0,0,0) -:--> (0,0,1) ------------------------------:--> (*,*,0)
+ * : | ^--------.------. / :
+ * : v \ \ | :
+ * pending : (0,1,1) +--> (0,1,0) \ | :
+ * : | ^--' | | :
+ * : v | | :
+ * uncontended : (n,x,y) +--> (n,0,0) --' | :
+ * queue : | ^--' | :
+ * : v | :
+ * contended : (*,x,y) +--> (*,0,0) ---> (*,0,1) -' :
+ * queue : ^--' :
+ */
+int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val)
+{
+ struct mcs_spinlock *prev, *next, *node;
+ struct rqspinlock_timeout ts;
+ int idx, ret = 0;
+ u32 old, tail;
+
+ BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));
+
+ if (resilient_virt_spin_lock_enabled())
+ return resilient_virt_spin_lock(lock);
+
+ RES_INIT_TIMEOUT(ts);
+
+ /*
+ * Wait for in-progress pending->locked hand-overs with a bounded
+ * number of spins so that we guarantee forward progress.
+ *
+ * 0,1,0 -> 0,0,1
+ */
+ if (val == _Q_PENDING_VAL) {
+ int cnt = _Q_PENDING_LOOPS;
+ val = atomic_cond_read_relaxed(&lock->val,
+ (VAL != _Q_PENDING_VAL) || !cnt--);
+ }
+
+ /*
+ * If we observe any contention; queue.
+ */
+ if (val & ~_Q_LOCKED_MASK)
+ goto queue;
+
+ /*
+ * trylock || pending
+ *
+ * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock
+ */
+ val = queued_fetch_set_pending_acquire(lock);
+
+ /*
+ * If we observe contention, there is a concurrent locker.
+ *
+ * Undo and queue; our setting of PENDING might have made the
+ * n,0,0 -> 0,0,0 transition fail and it will now be waiting
+ * on @next to become !NULL.
+ */
+ if (unlikely(val & ~_Q_LOCKED_MASK)) {
+
+ /* Undo PENDING if we set it. */
+ if (!(val & _Q_PENDING_MASK))
+ clear_pending(lock);
+
+ goto queue;
+ }
+
+ /* Deadlock detection entry already held after failing fast path. */
+
+ /*
+ * We're pending, wait for the owner to go away.
+ *
+ * 0,1,1 -> *,1,0
+ *
+ * this wait loop must be a load-acquire such that we match the
+ * store-release that clears the locked bit and create lock
+ * sequentiality; this is because not all
+ * clear_pending_set_locked() implementations imply full
+ * barriers.
+ */
+ if (val & _Q_LOCKED_MASK) {
+ RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT);
+ res_smp_cond_load_acquire(&lock->locked, !VAL || RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_MASK));
+ }
+
+ if (ret) {
+ /*
+ * We waited for the locked bit to go back to 0, as the pending
+ * waiter, but timed out. We need to clear the pending bit since
+ * we own it. Once a stuck owner has been recovered, the lock
+ * must be restored to a valid state, hence removing the pending
+ * bit is necessary.
+ *
+ * *,1,* -> *,0,*
+ */
+ clear_pending(lock);
+ lockevent_inc(rqspinlock_lock_timeout);
+ goto err_release_entry;
+ }
+
+ /*
+ * take ownership and clear the pending bit.
+ *
+ * 0,1,0 -> 0,0,1
+ */
+ clear_pending_set_locked(lock);
+ lockevent_inc(lock_pending);
+ return 0;
+
+ /*
+ * End of pending bit optimistic spinning and beginning of MCS
+ * queuing.
+ */
+queue:
+ /*
+ * Do not queue if we're a waiter and someone is attempting this lock on
+ * the same CPU. In case of NMIs, this prevents long timeouts where we
+ * interrupt the pending waiter, and the owner, that will eventually
+ * signal the head of our queue, both of which are logically but not
+ * physically part of the queue, hence outside the scope of the idx > 0
+ * check above for the trylock fallback.
+ */
+ if (check_deadlock_AA(lock)) {
+ ret = -EDEADLK;
+ goto err_release_entry;
+ }
+
+ lockevent_inc(lock_slowpath);
+ /* Deadlock detection entry already held after failing fast path. */
+ node = this_cpu_ptr(&rqnodes[0].mcs);
+ idx = node->count++;
+ tail = encode_tail(smp_processor_id(), idx);
+
+ trace_contention_begin(lock, LCB_F_SPIN);
+
+ /*
+ * 4 nodes are allocated based on the assumption that there will
+ * not be nested NMIs taking spinlocks. That may not be true in
+ * some architectures even though the chance of needing more than
+ * 4 nodes will still be extremely unlikely. When that happens,
+ * we fall back to attempting a trylock operation without using
+ * any MCS node. Unlike qspinlock which cannot fail, we have the
+ * option of failing the slow path, and under contention, such a
+ * trylock spinning will likely be treated unfairly due to lack of
+ * queueing, hence do not spin.
+ */
+ if (unlikely(idx >= _Q_MAX_NODES || (in_nmi() && idx > 0))) {
+ lockevent_inc(lock_no_node);
+ if (!queued_spin_trylock(lock)) {
+ ret = -EDEADLK;
+ goto err_release_node;
+ }
+ goto release;
+ }
+
+ node = grab_mcs_node(node, idx);
+
+ /*
+ * Keep counts of non-zero index values:
+ */
+ lockevent_cond_inc(lock_use_node2 + idx - 1, idx);
+
+ /*
+ * Ensure that we increment the head node->count before initialising
+ * the actual node. If the compiler is kind enough to reorder these
+ * stores, then an IRQ could overwrite our assignments.
+ */
+ barrier();
+
+ node->locked = 0;
+ node->next = NULL;
+
+ /*
+ * We touched a (possibly) cold cacheline in the per-cpu queue node;
+ * attempt the trylock once more in the hope someone let go while we
+ * weren't watching.
+ */
+ if (queued_spin_trylock(lock))
+ goto release;
+
+ /*
+ * Ensure that the initialisation of @node is complete before we
+ * publish the updated tail via xchg_tail() and potentially link
+ * @node into the waitqueue via WRITE_ONCE(prev->next, node) below.
+ */
+ smp_wmb();
+
+ /*
+ * Publish the updated tail.
+ * We have already touched the queueing cacheline; don't bother with
+ * pending stuff.
+ *
+ * p,*,* -> n,*,*
+ */
+ old = xchg_tail(lock, tail);
+ next = NULL;
+
+ /*
+ * if there was a previous node; link it and wait until reaching the
+ * head of the waitqueue.
+ */
+ if (old & _Q_TAIL_MASK) {
+ int val;
+
+ prev = decode_tail(old, rqnodes);
+
+ /* Link @node into the waitqueue. */
+ WRITE_ONCE(prev->next, node);
+
+ val = arch_mcs_spin_lock_contended(&node->locked);
+ if (val == RES_TIMEOUT_VAL) {
+ ret = -ETIMEDOUT;
+ goto waitq_timeout;
+ }
+
+ /*
+ * While waiting for the MCS lock, the next pointer may have
+ * been set by another lock waiter. We optimistically load
+ * the next pointer & prefetch the cacheline for writing
+ * to reduce latency in the upcoming MCS unlock operation.
+ */
+ next = READ_ONCE(node->next);
+ if (next)
+ prefetchw(next);
+ }
+
+ /*
+ * we're at the head of the waitqueue, wait for the owner & pending to
+ * go away.
+ *
+ * *,x,y -> *,0,0
+ *
+ * this wait loop must use a load-acquire such that we match the
+ * store-release that clears the locked bit and create lock
+ * sequentiality; this is because the set_locked() function below
+ * does not imply a full barrier.
+ *
+ * We use RES_DEF_TIMEOUT * 2 as the duration, as RES_DEF_TIMEOUT is
+ * meant to span maximum allowed time per critical section, and we may
+ * have both the owner of the lock and the pending bit waiter ahead of
+ * us.
+ */
+ RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT * 2);
+ val = res_atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK) ||
+ RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_PENDING_MASK));
+
+ /* Disable queue destruction when we detect deadlocks. */
+ if (ret == -EDEADLK) {
+ if (!next)
+ next = smp_cond_load_relaxed(&node->next, (VAL));
+ arch_mcs_spin_unlock_contended(&next->locked);
+ goto err_release_node;
+ }
+
+waitq_timeout:
+ if (ret) {
+ /*
+ * If the tail is still pointing to us, then we are the final waiter,
+ * and are responsible for resetting the tail back to 0. Otherwise, if
+ * the cmpxchg operation fails, we signal the next waiter to take exit
+ * and try the same. For a waiter with tail node 'n':
+ *
+ * n,*,* -> 0,*,*
+ *
+ * When performing cmpxchg for the whole word (NR_CPUS > 16k), it is
+ * possible locked/pending bits keep changing and we see failures even
+ * when we remain the head of wait queue. However, eventually,
+ * pending bit owner will unset the pending bit, and new waiters
+ * will queue behind us. This will leave the lock owner in
+ * charge, and it will eventually either set locked bit to 0, or
+ * leave it as 1, allowing us to make progress.
+ *
+ * We terminate the whole wait queue for two reasons. Firstly,
+ * we eschew per-waiter timeouts with one applied at the head of
+ * the wait queue. This allows everyone to break out faster
+ * once we've seen the owner / pending waiter not responding for
+ * the timeout duration from the head. Secondly, it avoids
+ * complicated synchronization, because when not leaving in FIFO
+ * order, prev's next pointer needs to be fixed up etc.
+ */
+ if (!try_cmpxchg_tail(lock, tail, 0)) {
+ next = smp_cond_load_relaxed(&node->next, VAL);
+ WRITE_ONCE(next->locked, RES_TIMEOUT_VAL);
+ }
+ lockevent_inc(rqspinlock_lock_timeout);
+ goto err_release_node;
+ }
+
+ /*
+ * claim the lock:
+ *
+ * n,0,0 -> 0,0,1 : lock, uncontended
+ * *,*,0 -> *,*,1 : lock, contended
+ *
+ * If the queue head is the only one in the queue (lock value == tail)
+ * and nobody is pending, clear the tail code and grab the lock.
+ * Otherwise, we only need to grab the lock.
+ */
+
+ /*
+ * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the
+ * above wait condition, therefore any concurrent setting of
+ * PENDING will make the uncontended transition fail.
+ */
+ if ((val & _Q_TAIL_MASK) == tail) {
+ if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL))
+ goto release; /* No contention */
+ }
+
+ /*
+ * Either somebody is queued behind us or _Q_PENDING_VAL got set
+ * which will then detect the remaining tail and queue behind us
+ * ensuring we'll see a @next.
+ */
+ set_locked(lock);
+
+ /*
+ * contended path; wait for next if not observed yet, release.
+ */
+ if (!next)
+ next = smp_cond_load_relaxed(&node->next, (VAL));
+
+ arch_mcs_spin_unlock_contended(&next->locked);
+
+release:
+ trace_contention_end(lock, 0);
+
+ /*
+ * release the node
+ */
+ __this_cpu_dec(rqnodes[0].mcs.count);
+ return ret;
+err_release_node:
+ trace_contention_end(lock, ret);
+ __this_cpu_dec(rqnodes[0].mcs.count);
+err_release_entry:
+ release_held_lock_entry();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(resilient_queued_spin_lock_slowpath);
+
+#endif /* CONFIG_QUEUED_SPINLOCKS */
+
+__bpf_kfunc_start_defs();
+
+static void bpf_prog_report_rqspinlock_violation(const char *str, void *lock, bool irqsave)
+{
+ struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);
+ struct bpf_stream_stage ss;
+ struct bpf_prog *prog;
+
+ prog = bpf_prog_find_from_stack();
+ if (!prog)
+ return;
+ bpf_stream_stage(ss, prog, BPF_STDERR, ({
+ bpf_stream_printk(ss, "ERROR: %s for bpf_res_spin_lock%s\n", str, irqsave ? "_irqsave" : "");
+ bpf_stream_printk(ss, "Attempted lock = 0x%px\n", lock);
+ bpf_stream_printk(ss, "Total held locks = %d\n", rqh->cnt);
+ for (int i = 0; i < min(RES_NR_HELD, rqh->cnt); i++)
+ bpf_stream_printk(ss, "Held lock[%2d] = 0x%px\n", i, rqh->locks[i]);
+ bpf_stream_dump_stack(ss);
+ }));
+}
+
+#define REPORT_STR(ret) ({ (ret) == -ETIMEDOUT ? "Timeout detected" : "AA or ABBA deadlock detected"; })
+
+__bpf_kfunc int bpf_res_spin_lock(struct bpf_res_spin_lock *lock)
+{
+ int ret;
+
+ BUILD_BUG_ON(sizeof(rqspinlock_t) != sizeof(struct bpf_res_spin_lock));
+ BUILD_BUG_ON(__alignof__(rqspinlock_t) != __alignof__(struct bpf_res_spin_lock));
+
+ preempt_disable();
+ ret = res_spin_lock((rqspinlock_t *)lock);
+ if (unlikely(ret)) {
+ bpf_prog_report_rqspinlock_violation(REPORT_STR(ret), lock, false);
+ preempt_enable();
+ return ret;
+ }
+ return 0;
+}
+
+__bpf_kfunc void bpf_res_spin_unlock(struct bpf_res_spin_lock *lock)
+{
+ res_spin_unlock((rqspinlock_t *)lock);
+ preempt_enable();
+}
+
+__bpf_kfunc int bpf_res_spin_lock_irqsave(struct bpf_res_spin_lock *lock, unsigned long *flags__irq_flag)
+{
+ u64 *ptr = (u64 *)flags__irq_flag;
+ unsigned long flags;
+ int ret;
+
+ preempt_disable();
+ local_irq_save(flags);
+ ret = res_spin_lock((rqspinlock_t *)lock);
+ if (unlikely(ret)) {
+ bpf_prog_report_rqspinlock_violation(REPORT_STR(ret), lock, true);
+ local_irq_restore(flags);
+ preempt_enable();
+ return ret;
+ }
+ *ptr = flags;
+ return 0;
+}
+
+__bpf_kfunc void bpf_res_spin_unlock_irqrestore(struct bpf_res_spin_lock *lock, unsigned long *flags__irq_flag)
+{
+ u64 *ptr = (u64 *)flags__irq_flag;
+ unsigned long flags = *ptr;
+
+ res_spin_unlock((rqspinlock_t *)lock);
+ local_irq_restore(flags);
+ preempt_enable();
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(rqspinlock_kfunc_ids)
+BTF_ID_FLAGS(func, bpf_res_spin_lock, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_res_spin_unlock)
+BTF_ID_FLAGS(func, bpf_res_spin_lock_irqsave, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_res_spin_unlock_irqrestore)
+BTF_KFUNCS_END(rqspinlock_kfunc_ids)
+
+static const struct btf_kfunc_id_set rqspinlock_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &rqspinlock_kfunc_ids,
+};
+
+static __init int rqspinlock_register_kfuncs(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &rqspinlock_kfunc_set);
+}
+late_initcall(rqspinlock_register_kfuncs);
diff --git a/kernel/bpf/rqspinlock.h b/kernel/bpf/rqspinlock.h
new file mode 100644
index 000000000000..5d8cb1b1aab4
--- /dev/null
+++ b/kernel/bpf/rqspinlock.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Resilient Queued Spin Lock defines
+ *
+ * (C) Copyright 2024-2025 Meta Platforms, Inc. and affiliates.
+ *
+ * Authors: Kumar Kartikeya Dwivedi <memxor@gmail.com>
+ */
+#ifndef __LINUX_RQSPINLOCK_H
+#define __LINUX_RQSPINLOCK_H
+
+#include "../locking/qspinlock.h"
+
+/*
+ * try_cmpxchg_tail - Return result of cmpxchg of tail word with a new value
+ * @lock: Pointer to queued spinlock structure
+ * @tail: The tail to compare against
+ * @new_tail: The new queue tail code word
+ * Return: Bool to indicate whether the cmpxchg operation succeeded
+ *
+ * This is used by the head of the wait queue to clean up the queue.
+ * Provides relaxed ordering, since observers only rely on initialized
+ * state of the node which was made visible through the xchg_tail operation,
+ * i.e. through the smp_wmb preceding xchg_tail.
+ *
+ * We avoid using 16-bit cmpxchg, which is not available on all architectures.
+ */
+static __always_inline bool try_cmpxchg_tail(struct qspinlock *lock, u32 tail, u32 new_tail)
+{
+ u32 old, new;
+
+ old = atomic_read(&lock->val);
+ do {
+ /*
+ * Is the tail part we compare to already stale? Fail.
+ */
+ if ((old & _Q_TAIL_MASK) != tail)
+ return false;
+ /*
+ * Encode latest locked/pending state for new tail.
+ */
+ new = (old & _Q_LOCKED_PENDING_MASK) | new_tail;
+ } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new));
+
+ return true;
+}
+
+#endif /* __LINUX_RQSPINLOCK_H */
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
new file mode 100644
index 000000000000..da3d328f5c15
--- /dev/null
+++ b/kernel/bpf/stackmap.c
@@ -0,0 +1,792 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2016 Facebook
+ */
+#include <linux/bpf.h>
+#include <linux/jhash.h>
+#include <linux/filter.h>
+#include <linux/kernel.h>
+#include <linux/stacktrace.h>
+#include <linux/perf_event.h>
+#include <linux/btf_ids.h>
+#include <linux/buildid.h>
+#include "percpu_freelist.h"
+#include "mmap_unlock_work.h"
+
+#define STACK_CREATE_FLAG_MASK \
+ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY | \
+ BPF_F_STACK_BUILD_ID)
+
+struct stack_map_bucket {
+ struct pcpu_freelist_node fnode;
+ u32 hash;
+ u32 nr;
+ u64 data[];
+};
+
+struct bpf_stack_map {
+ struct bpf_map map;
+ void *elems;
+ struct pcpu_freelist freelist;
+ u32 n_buckets;
+ struct stack_map_bucket *buckets[] __counted_by(n_buckets);
+};
+
+static inline bool stack_map_use_build_id(struct bpf_map *map)
+{
+ return (map->map_flags & BPF_F_STACK_BUILD_ID);
+}
+
+static inline int stack_map_data_size(struct bpf_map *map)
+{
+ return stack_map_use_build_id(map) ?
+ sizeof(struct bpf_stack_build_id) : sizeof(u64);
+}
+
+/**
+ * stack_map_calculate_max_depth - Calculate maximum allowed stack trace depth
+ * @size: Size of the buffer/map value in bytes
+ * @elem_size: Size of each stack trace element
+ * @flags: BPF stack trace flags (BPF_F_USER_STACK, BPF_F_USER_BUILD_ID, ...)
+ *
+ * Return: Maximum number of stack trace entries that can be safely stored
+ */
+static u32 stack_map_calculate_max_depth(u32 size, u32 elem_size, u64 flags)
+{
+ u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
+ u32 max_depth;
+ u32 curr_sysctl_max_stack = READ_ONCE(sysctl_perf_event_max_stack);
+
+ max_depth = size / elem_size;
+ max_depth += skip;
+ if (max_depth > curr_sysctl_max_stack)
+ return curr_sysctl_max_stack;
+
+ return max_depth;
+}
+
+static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
+{
+ u64 elem_size = sizeof(struct stack_map_bucket) +
+ (u64)smap->map.value_size;
+ int err;
+
+ smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries,
+ smap->map.numa_node);
+ if (!smap->elems)
+ return -ENOMEM;
+
+ err = pcpu_freelist_init(&smap->freelist);
+ if (err)
+ goto free_elems;
+
+ pcpu_freelist_populate(&smap->freelist, smap->elems, elem_size,
+ smap->map.max_entries);
+ return 0;
+
+free_elems:
+ bpf_map_area_free(smap->elems);
+ return err;
+}
+
+/* Called from syscall */
+static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
+{
+ u32 value_size = attr->value_size;
+ struct bpf_stack_map *smap;
+ u64 cost, n_buckets;
+ int err;
+
+ if (attr->map_flags & ~STACK_CREATE_FLAG_MASK)
+ return ERR_PTR(-EINVAL);
+
+ /* check sanity of attributes */
+ if (attr->max_entries == 0 || attr->key_size != 4 ||
+ value_size < 8 || value_size % 8)
+ return ERR_PTR(-EINVAL);
+
+ BUILD_BUG_ON(sizeof(struct bpf_stack_build_id) % sizeof(u64));
+ if (attr->map_flags & BPF_F_STACK_BUILD_ID) {
+ if (value_size % sizeof(struct bpf_stack_build_id) ||
+ value_size / sizeof(struct bpf_stack_build_id)
+ > sysctl_perf_event_max_stack)
+ return ERR_PTR(-EINVAL);
+ } else if (value_size / 8 > sysctl_perf_event_max_stack)
+ return ERR_PTR(-EINVAL);
+
+ /* hash table size must be power of 2; roundup_pow_of_two() can overflow
+ * into UB on 32-bit arches, so check that first
+ */
+ if (attr->max_entries > 1UL << 31)
+ return ERR_PTR(-E2BIG);
+
+ n_buckets = roundup_pow_of_two(attr->max_entries);
+
+ cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap);
+ smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr));
+ if (!smap)
+ return ERR_PTR(-ENOMEM);
+
+ bpf_map_init_from_attr(&smap->map, attr);
+ smap->n_buckets = n_buckets;
+
+ err = get_callchain_buffers(sysctl_perf_event_max_stack);
+ if (err)
+ goto free_smap;
+
+ err = prealloc_elems_and_freelist(smap);
+ if (err)
+ goto put_buffers;
+
+ return &smap->map;
+
+put_buffers:
+ put_callchain_buffers();
+free_smap:
+ bpf_map_area_free(smap);
+ return ERR_PTR(err);
+}
+
+static int fetch_build_id(struct vm_area_struct *vma, unsigned char *build_id, bool may_fault)
+{
+ return may_fault ? build_id_parse(vma, build_id, NULL)
+ : build_id_parse_nofault(vma, build_id, NULL);
+}
+
+/*
+ * Expects all id_offs[i].ip values to be set to correct initial IPs.
+ * They will be subsequently:
+ * - either adjusted in place to a file offset, if build ID fetching
+ * succeeds; in this case id_offs[i].build_id is set to correct build ID,
+ * and id_offs[i].status is set to BPF_STACK_BUILD_ID_VALID;
+ * - or IP will be kept intact, if build ID fetching failed; in this case
+ * id_offs[i].build_id is zeroed out and id_offs[i].status is set to
+ * BPF_STACK_BUILD_ID_IP.
+ */
+static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
+ u32 trace_nr, bool user, bool may_fault)
+{
+ int i;
+ struct mmap_unlock_irq_work *work = NULL;
+ bool irq_work_busy = bpf_mmap_unlock_get_irq_work(&work);
+ struct vm_area_struct *vma, *prev_vma = NULL;
+ const char *prev_build_id;
+
+ /* If the irq_work is in use, fall back to report ips. Same
+ * fallback is used for kernel stack (!user) on a stackmap with
+ * build_id.
+ */
+ if (!user || !current || !current->mm || irq_work_busy ||
+ !mmap_read_trylock(current->mm)) {
+ /* cannot access current->mm, fall back to ips */
+ for (i = 0; i < trace_nr; i++) {
+ id_offs[i].status = BPF_STACK_BUILD_ID_IP;
+ memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX);
+ }
+ return;
+ }
+
+ for (i = 0; i < trace_nr; i++) {
+ u64 ip = READ_ONCE(id_offs[i].ip);
+
+ if (range_in_vma(prev_vma, ip, ip)) {
+ vma = prev_vma;
+ memcpy(id_offs[i].build_id, prev_build_id, BUILD_ID_SIZE_MAX);
+ goto build_id_valid;
+ }
+ vma = find_vma(current->mm, ip);
+ if (!vma || fetch_build_id(vma, id_offs[i].build_id, may_fault)) {
+ /* per entry fall back to ips */
+ id_offs[i].status = BPF_STACK_BUILD_ID_IP;
+ memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX);
+ continue;
+ }
+build_id_valid:
+ id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ip - vma->vm_start;
+ id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
+ prev_vma = vma;
+ prev_build_id = id_offs[i].build_id;
+ }
+ bpf_mmap_unlock_mm(work, current->mm);
+}
+
+static struct perf_callchain_entry *
+get_callchain_entry_for_task(struct task_struct *task, u32 max_depth)
+{
+#ifdef CONFIG_STACKTRACE
+ struct perf_callchain_entry *entry;
+ int rctx;
+
+ entry = get_callchain_entry(&rctx);
+
+ if (!entry)
+ return NULL;
+
+ entry->nr = stack_trace_save_tsk(task, (unsigned long *)entry->ip,
+ max_depth, 0);
+
+ /* stack_trace_save_tsk() works on unsigned long array, while
+ * perf_callchain_entry uses u64 array. For 32-bit systems, it is
+ * necessary to fix this mismatch.
+ */
+ if (__BITS_PER_LONG != 64) {
+ unsigned long *from = (unsigned long *) entry->ip;
+ u64 *to = entry->ip;
+ int i;
+
+ /* copy data from the end to avoid using extra buffer */
+ for (i = entry->nr - 1; i >= 0; i--)
+ to[i] = (u64)(from[i]);
+ }
+
+ put_callchain_entry(rctx);
+
+ return entry;
+#else /* CONFIG_STACKTRACE */
+ return NULL;
+#endif
+}
+
+static long __bpf_get_stackid(struct bpf_map *map,
+ struct perf_callchain_entry *trace, u64 flags)
+{
+ struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
+ struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
+ u32 hash, id, trace_nr, trace_len, i, max_depth;
+ u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
+ bool user = flags & BPF_F_USER_STACK;
+ u64 *ips;
+ bool hash_matches;
+
+ if (trace->nr <= skip)
+ /* skipping more than usable stack trace */
+ return -EFAULT;
+
+ max_depth = stack_map_calculate_max_depth(map->value_size, stack_map_data_size(map), flags);
+ trace_nr = min_t(u32, trace->nr - skip, max_depth - skip);
+ trace_len = trace_nr * sizeof(u64);
+ ips = trace->ip + skip;
+ hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0);
+ id = hash & (smap->n_buckets - 1);
+ bucket = READ_ONCE(smap->buckets[id]);
+
+ hash_matches = bucket && bucket->hash == hash;
+ /* fast cmp */
+ if (hash_matches && flags & BPF_F_FAST_STACK_CMP)
+ return id;
+
+ if (stack_map_use_build_id(map)) {
+ struct bpf_stack_build_id *id_offs;
+
+ /* for build_id+offset, pop a bucket before slow cmp */
+ new_bucket = (struct stack_map_bucket *)
+ pcpu_freelist_pop(&smap->freelist);
+ if (unlikely(!new_bucket))
+ return -ENOMEM;
+ new_bucket->nr = trace_nr;
+ id_offs = (struct bpf_stack_build_id *)new_bucket->data;
+ for (i = 0; i < trace_nr; i++)
+ id_offs[i].ip = ips[i];
+ stack_map_get_build_id_offset(id_offs, trace_nr, user, false /* !may_fault */);
+ trace_len = trace_nr * sizeof(struct bpf_stack_build_id);
+ if (hash_matches && bucket->nr == trace_nr &&
+ memcmp(bucket->data, new_bucket->data, trace_len) == 0) {
+ pcpu_freelist_push(&smap->freelist, &new_bucket->fnode);
+ return id;
+ }
+ if (bucket && !(flags & BPF_F_REUSE_STACKID)) {
+ pcpu_freelist_push(&smap->freelist, &new_bucket->fnode);
+ return -EEXIST;
+ }
+ } else {
+ if (hash_matches && bucket->nr == trace_nr &&
+ memcmp(bucket->data, ips, trace_len) == 0)
+ return id;
+ if (bucket && !(flags & BPF_F_REUSE_STACKID))
+ return -EEXIST;
+
+ new_bucket = (struct stack_map_bucket *)
+ pcpu_freelist_pop(&smap->freelist);
+ if (unlikely(!new_bucket))
+ return -ENOMEM;
+ memcpy(new_bucket->data, ips, trace_len);
+ }
+
+ new_bucket->hash = hash;
+ new_bucket->nr = trace_nr;
+
+ old_bucket = xchg(&smap->buckets[id], new_bucket);
+ if (old_bucket)
+ pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
+ return id;
+}
+
+BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
+ u64, flags)
+{
+ u32 elem_size = stack_map_data_size(map);
+ bool user = flags & BPF_F_USER_STACK;
+ struct perf_callchain_entry *trace;
+ bool kernel = !user;
+ u32 max_depth;
+
+ if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
+ BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
+ return -EINVAL;
+
+ max_depth = stack_map_calculate_max_depth(map->value_size, elem_size, flags);
+ trace = get_perf_callchain(regs, kernel, user, max_depth,
+ false, false, 0);
+
+ if (unlikely(!trace))
+ /* couldn't fetch the stack trace */
+ return -EFAULT;
+
+ return __bpf_get_stackid(map, trace, flags);
+}
+
+const struct bpf_func_proto bpf_get_stackid_proto = {
+ .func = bpf_get_stackid,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+};
+
+static __u64 count_kernel_ip(struct perf_callchain_entry *trace)
+{
+ __u64 nr_kernel = 0;
+
+ while (nr_kernel < trace->nr) {
+ if (trace->ip[nr_kernel] == PERF_CONTEXT_USER)
+ break;
+ nr_kernel++;
+ }
+ return nr_kernel;
+}
+
+BPF_CALL_3(bpf_get_stackid_pe, struct bpf_perf_event_data_kern *, ctx,
+ struct bpf_map *, map, u64, flags)
+{
+ struct perf_event *event = ctx->event;
+ struct perf_callchain_entry *trace;
+ bool kernel, user;
+ __u64 nr_kernel;
+ int ret;
+
+ /* perf_sample_data doesn't have callchain, use bpf_get_stackid */
+ if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN))
+ return bpf_get_stackid((unsigned long)(ctx->regs),
+ (unsigned long) map, flags, 0, 0);
+
+ if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
+ BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
+ return -EINVAL;
+
+ user = flags & BPF_F_USER_STACK;
+ kernel = !user;
+
+ trace = ctx->data->callchain;
+ if (unlikely(!trace))
+ return -EFAULT;
+
+ nr_kernel = count_kernel_ip(trace);
+ __u64 nr = trace->nr; /* save original */
+
+ if (kernel) {
+ trace->nr = nr_kernel;
+ ret = __bpf_get_stackid(map, trace, flags);
+ } else { /* user */
+ u64 skip = flags & BPF_F_SKIP_FIELD_MASK;
+
+ skip += nr_kernel;
+ if (skip > BPF_F_SKIP_FIELD_MASK)
+ return -EFAULT;
+
+ flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip;
+ ret = __bpf_get_stackid(map, trace, flags);
+ }
+
+ /* restore nr */
+ trace->nr = nr;
+
+ return ret;
+}
+
+const struct bpf_func_proto bpf_get_stackid_proto_pe = {
+ .func = bpf_get_stackid_pe,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+};
+
+static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
+ struct perf_callchain_entry *trace_in,
+ void *buf, u32 size, u64 flags, bool may_fault)
+{
+ u32 trace_nr, copy_len, elem_size, max_depth;
+ bool user_build_id = flags & BPF_F_USER_BUILD_ID;
+ bool crosstask = task && task != current;
+ u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
+ bool user = flags & BPF_F_USER_STACK;
+ struct perf_callchain_entry *trace;
+ bool kernel = !user;
+ int err = -EINVAL;
+ u64 *ips;
+
+ if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
+ BPF_F_USER_BUILD_ID)))
+ goto clear;
+ if (kernel && user_build_id)
+ goto clear;
+
+ elem_size = user_build_id ? sizeof(struct bpf_stack_build_id) : sizeof(u64);
+ if (unlikely(size % elem_size))
+ goto clear;
+
+ /* cannot get valid user stack for task without user_mode regs */
+ if (task && user && !user_mode(regs))
+ goto err_fault;
+
+ /* get_perf_callchain does not support crosstask user stack walking
+ * but returns an empty stack instead of NULL.
+ */
+ if (crosstask && user) {
+ err = -EOPNOTSUPP;
+ goto clear;
+ }
+
+ max_depth = stack_map_calculate_max_depth(size, elem_size, flags);
+
+ if (may_fault)
+ rcu_read_lock(); /* need RCU for perf's callchain below */
+
+ if (trace_in) {
+ trace = trace_in;
+ trace->nr = min_t(u32, trace->nr, max_depth);
+ } else if (kernel && task) {
+ trace = get_callchain_entry_for_task(task, max_depth);
+ } else {
+ trace = get_perf_callchain(regs, kernel, user, max_depth,
+ crosstask, false, 0);
+ }
+
+ if (unlikely(!trace) || trace->nr < skip) {
+ if (may_fault)
+ rcu_read_unlock();
+ goto err_fault;
+ }
+
+ trace_nr = trace->nr - skip;
+ copy_len = trace_nr * elem_size;
+
+ ips = trace->ip + skip;
+ if (user_build_id) {
+ struct bpf_stack_build_id *id_offs = buf;
+ u32 i;
+
+ for (i = 0; i < trace_nr; i++)
+ id_offs[i].ip = ips[i];
+ } else {
+ memcpy(buf, ips, copy_len);
+ }
+
+ /* trace/ips should not be dereferenced after this point */
+ if (may_fault)
+ rcu_read_unlock();
+
+ if (user_build_id)
+ stack_map_get_build_id_offset(buf, trace_nr, user, may_fault);
+
+ if (size > copy_len)
+ memset(buf + copy_len, 0, size - copy_len);
+ return copy_len;
+
+err_fault:
+ err = -EFAULT;
+clear:
+ memset(buf, 0, size);
+ return err;
+}
+
+BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
+ u64, flags)
+{
+ return __bpf_get_stack(regs, NULL, NULL, buf, size, flags, false /* !may_fault */);
+}
+
+const struct bpf_func_proto bpf_get_stack_proto = {
+ .func = bpf_get_stack,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_get_stack_sleepable, struct pt_regs *, regs, void *, buf, u32, size,
+ u64, flags)
+{
+ return __bpf_get_stack(regs, NULL, NULL, buf, size, flags, true /* may_fault */);
+}
+
+const struct bpf_func_proto bpf_get_stack_sleepable_proto = {
+ .func = bpf_get_stack_sleepable,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+};
+
+static long __bpf_get_task_stack(struct task_struct *task, void *buf, u32 size,
+ u64 flags, bool may_fault)
+{
+ struct pt_regs *regs;
+ long res = -EINVAL;
+
+ if (!try_get_task_stack(task))
+ return -EFAULT;
+
+ regs = task_pt_regs(task);
+ if (regs)
+ res = __bpf_get_stack(regs, task, NULL, buf, size, flags, may_fault);
+ put_task_stack(task);
+
+ return res;
+}
+
+BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,
+ u32, size, u64, flags)
+{
+ return __bpf_get_task_stack(task, buf, size, flags, false /* !may_fault */);
+}
+
+const struct bpf_func_proto bpf_get_task_stack_proto = {
+ .func = bpf_get_task_stack,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_get_task_stack_sleepable, struct task_struct *, task, void *, buf,
+ u32, size, u64, flags)
+{
+ return __bpf_get_task_stack(task, buf, size, flags, true /* !may_fault */);
+}
+
+const struct bpf_func_proto bpf_get_task_stack_sleepable_proto = {
+ .func = bpf_get_task_stack_sleepable,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx,
+ void *, buf, u32, size, u64, flags)
+{
+ struct pt_regs *regs = (struct pt_regs *)(ctx->regs);
+ struct perf_event *event = ctx->event;
+ struct perf_callchain_entry *trace;
+ bool kernel, user;
+ int err = -EINVAL;
+ __u64 nr_kernel;
+
+ if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN))
+ return __bpf_get_stack(regs, NULL, NULL, buf, size, flags, false /* !may_fault */);
+
+ if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
+ BPF_F_USER_BUILD_ID)))
+ goto clear;
+
+ user = flags & BPF_F_USER_STACK;
+ kernel = !user;
+
+ err = -EFAULT;
+ trace = ctx->data->callchain;
+ if (unlikely(!trace))
+ goto clear;
+
+ nr_kernel = count_kernel_ip(trace);
+
+ if (kernel) {
+ __u64 nr = trace->nr;
+
+ trace->nr = nr_kernel;
+ err = __bpf_get_stack(regs, NULL, trace, buf, size, flags, false /* !may_fault */);
+
+ /* restore nr */
+ trace->nr = nr;
+ } else { /* user */
+ u64 skip = flags & BPF_F_SKIP_FIELD_MASK;
+
+ skip += nr_kernel;
+ if (skip > BPF_F_SKIP_FIELD_MASK)
+ goto clear;
+
+ flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip;
+ err = __bpf_get_stack(regs, NULL, trace, buf, size, flags, false /* !may_fault */);
+ }
+ return err;
+
+clear:
+ memset(buf, 0, size);
+ return err;
+
+}
+
+const struct bpf_func_proto bpf_get_stack_proto_pe = {
+ .func = bpf_get_stack_pe,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+};
+
+/* Called from eBPF program */
+static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+/* Called from syscall */
+static int stack_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
+{
+ return bpf_stackmap_extract(map, key, value, true);
+}
+
+/* Called from syscall */
+int bpf_stackmap_extract(struct bpf_map *map, void *key, void *value,
+ bool delete)
+{
+ struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
+ struct stack_map_bucket *bucket, *old_bucket;
+ u32 id = *(u32 *)key, trace_len;
+
+ if (unlikely(id >= smap->n_buckets))
+ return -ENOENT;
+
+ bucket = xchg(&smap->buckets[id], NULL);
+ if (!bucket)
+ return -ENOENT;
+
+ trace_len = bucket->nr * stack_map_data_size(map);
+ memcpy(value, bucket->data, trace_len);
+ memset(value + trace_len, 0, map->value_size - trace_len);
+
+ if (delete)
+ old_bucket = bucket;
+ else
+ old_bucket = xchg(&smap->buckets[id], bucket);
+ if (old_bucket)
+ pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
+ return 0;
+}
+
+static int stack_map_get_next_key(struct bpf_map *map, void *key,
+ void *next_key)
+{
+ struct bpf_stack_map *smap = container_of(map,
+ struct bpf_stack_map, map);
+ u32 id;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ if (!key) {
+ id = 0;
+ } else {
+ id = *(u32 *)key;
+ if (id >= smap->n_buckets || !smap->buckets[id])
+ id = 0;
+ else
+ id++;
+ }
+
+ while (id < smap->n_buckets && !smap->buckets[id])
+ id++;
+
+ if (id >= smap->n_buckets)
+ return -ENOENT;
+
+ *(u32 *)next_key = id;
+ return 0;
+}
+
+static long stack_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
+{
+ return -EINVAL;
+}
+
+/* Called from syscall or from eBPF program */
+static long stack_map_delete_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
+ struct stack_map_bucket *old_bucket;
+ u32 id = *(u32 *)key;
+
+ if (unlikely(id >= smap->n_buckets))
+ return -E2BIG;
+
+ old_bucket = xchg(&smap->buckets[id], NULL);
+ if (old_bucket) {
+ pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
+ return 0;
+ } else {
+ return -ENOENT;
+ }
+}
+
+/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
+static void stack_map_free(struct bpf_map *map)
+{
+ struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
+
+ bpf_map_area_free(smap->elems);
+ pcpu_freelist_destroy(&smap->freelist);
+ bpf_map_area_free(smap);
+ put_callchain_buffers();
+}
+
+static u64 stack_map_mem_usage(const struct bpf_map *map)
+{
+ struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
+ u64 value_size = map->value_size;
+ u64 n_buckets = smap->n_buckets;
+ u64 enties = map->max_entries;
+ u64 usage = sizeof(*smap);
+
+ usage += n_buckets * sizeof(struct stack_map_bucket *);
+ usage += enties * (sizeof(struct stack_map_bucket) + value_size);
+ return usage;
+}
+
+BTF_ID_LIST_SINGLE(stack_trace_map_btf_ids, struct, bpf_stack_map)
+const struct bpf_map_ops stack_trace_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
+ .map_alloc = stack_map_alloc,
+ .map_free = stack_map_free,
+ .map_get_next_key = stack_map_get_next_key,
+ .map_lookup_elem = stack_map_lookup_elem,
+ .map_lookup_and_delete_elem = stack_map_lookup_and_delete_elem,
+ .map_update_elem = stack_map_update_elem,
+ .map_delete_elem = stack_map_delete_elem,
+ .map_check_btf = map_check_no_btf,
+ .map_mem_usage = stack_map_mem_usage,
+ .map_btf_id = &stack_trace_map_btf_ids[0],
+};
diff --git a/kernel/bpf/stream.c b/kernel/bpf/stream.c
new file mode 100644
index 000000000000..0b6bc3f30335
--- /dev/null
+++ b/kernel/bpf/stream.c
@@ -0,0 +1,384 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
+
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/bpf_mem_alloc.h>
+#include <linux/gfp.h>
+#include <linux/memory.h>
+#include <linux/mutex.h>
+
+static void bpf_stream_elem_init(struct bpf_stream_elem *elem, int len)
+{
+ init_llist_node(&elem->node);
+ elem->total_len = len;
+ elem->consumed_len = 0;
+}
+
+static struct bpf_stream_elem *bpf_stream_elem_alloc(int len)
+{
+ const int max_len = ARRAY_SIZE((struct bpf_bprintf_buffers){}.buf);
+ struct bpf_stream_elem *elem;
+ size_t alloc_size;
+
+ /*
+ * Length denotes the amount of data to be written as part of stream element,
+ * thus includes '\0' byte. We're capped by how much bpf_bprintf_buffers can
+ * accomodate, therefore deny allocations that won't fit into them.
+ */
+ if (len < 0 || len > max_len)
+ return NULL;
+
+ alloc_size = offsetof(struct bpf_stream_elem, str[len]);
+ elem = kmalloc_nolock(alloc_size, __GFP_ZERO, -1);
+ if (!elem)
+ return NULL;
+
+ bpf_stream_elem_init(elem, len);
+
+ return elem;
+}
+
+static int __bpf_stream_push_str(struct llist_head *log, const char *str, int len)
+{
+ struct bpf_stream_elem *elem = NULL;
+
+ /*
+ * Allocate a bpf_prog_stream_elem and push it to the bpf_prog_stream
+ * log, elements will be popped at once and reversed to print the log.
+ */
+ elem = bpf_stream_elem_alloc(len);
+ if (!elem)
+ return -ENOMEM;
+
+ memcpy(elem->str, str, len);
+ llist_add(&elem->node, log);
+
+ return 0;
+}
+
+static int bpf_stream_consume_capacity(struct bpf_stream *stream, int len)
+{
+ if (atomic_read(&stream->capacity) >= BPF_STREAM_MAX_CAPACITY)
+ return -ENOSPC;
+ if (atomic_add_return(len, &stream->capacity) >= BPF_STREAM_MAX_CAPACITY) {
+ atomic_sub(len, &stream->capacity);
+ return -ENOSPC;
+ }
+ return 0;
+}
+
+static void bpf_stream_release_capacity(struct bpf_stream *stream, struct bpf_stream_elem *elem)
+{
+ int len = elem->total_len;
+
+ atomic_sub(len, &stream->capacity);
+}
+
+static int bpf_stream_push_str(struct bpf_stream *stream, const char *str, int len)
+{
+ int ret = bpf_stream_consume_capacity(stream, len);
+
+ return ret ?: __bpf_stream_push_str(&stream->log, str, len);
+}
+
+static struct bpf_stream *bpf_stream_get(enum bpf_stream_id stream_id, struct bpf_prog_aux *aux)
+{
+ if (stream_id != BPF_STDOUT && stream_id != BPF_STDERR)
+ return NULL;
+ return &aux->stream[stream_id - 1];
+}
+
+static void bpf_stream_free_elem(struct bpf_stream_elem *elem)
+{
+ kfree_nolock(elem);
+}
+
+static void bpf_stream_free_list(struct llist_node *list)
+{
+ struct bpf_stream_elem *elem, *tmp;
+
+ llist_for_each_entry_safe(elem, tmp, list, node)
+ bpf_stream_free_elem(elem);
+}
+
+static struct llist_node *bpf_stream_backlog_peek(struct bpf_stream *stream)
+{
+ return stream->backlog_head;
+}
+
+static struct llist_node *bpf_stream_backlog_pop(struct bpf_stream *stream)
+{
+ struct llist_node *node;
+
+ node = stream->backlog_head;
+ if (stream->backlog_head == stream->backlog_tail)
+ stream->backlog_head = stream->backlog_tail = NULL;
+ else
+ stream->backlog_head = node->next;
+ return node;
+}
+
+static void bpf_stream_backlog_fill(struct bpf_stream *stream)
+{
+ struct llist_node *head, *tail;
+
+ if (llist_empty(&stream->log))
+ return;
+ tail = llist_del_all(&stream->log);
+ if (!tail)
+ return;
+ head = llist_reverse_order(tail);
+
+ if (!stream->backlog_head) {
+ stream->backlog_head = head;
+ stream->backlog_tail = tail;
+ } else {
+ stream->backlog_tail->next = head;
+ stream->backlog_tail = tail;
+ }
+
+ return;
+}
+
+static bool bpf_stream_consume_elem(struct bpf_stream_elem *elem, int *len)
+{
+ int rem = elem->total_len - elem->consumed_len;
+ int used = min(rem, *len);
+
+ elem->consumed_len += used;
+ *len -= used;
+
+ return elem->consumed_len == elem->total_len;
+}
+
+static int bpf_stream_read(struct bpf_stream *stream, void __user *buf, int len)
+{
+ int rem_len = len, cons_len, ret = 0;
+ struct bpf_stream_elem *elem = NULL;
+ struct llist_node *node;
+
+ mutex_lock(&stream->lock);
+
+ while (rem_len) {
+ int pos = len - rem_len;
+ bool cont;
+
+ node = bpf_stream_backlog_peek(stream);
+ if (!node) {
+ bpf_stream_backlog_fill(stream);
+ node = bpf_stream_backlog_peek(stream);
+ }
+ if (!node)
+ break;
+ elem = container_of(node, typeof(*elem), node);
+
+ cons_len = elem->consumed_len;
+ cont = bpf_stream_consume_elem(elem, &rem_len) == false;
+
+ ret = copy_to_user(buf + pos, elem->str + cons_len,
+ elem->consumed_len - cons_len);
+ /* Restore in case of error. */
+ if (ret) {
+ ret = -EFAULT;
+ elem->consumed_len = cons_len;
+ break;
+ }
+
+ if (cont)
+ continue;
+ bpf_stream_backlog_pop(stream);
+ bpf_stream_release_capacity(stream, elem);
+ bpf_stream_free_elem(elem);
+ }
+
+ mutex_unlock(&stream->lock);
+ return ret ? ret : len - rem_len;
+}
+
+int bpf_prog_stream_read(struct bpf_prog *prog, enum bpf_stream_id stream_id, void __user *buf, int len)
+{
+ struct bpf_stream *stream;
+
+ stream = bpf_stream_get(stream_id, prog->aux);
+ if (!stream)
+ return -ENOENT;
+ return bpf_stream_read(stream, buf, len);
+}
+
+__bpf_kfunc_start_defs();
+
+/*
+ * Avoid using enum bpf_stream_id so that kfunc users don't have to pull in the
+ * enum in headers.
+ */
+__bpf_kfunc int bpf_stream_vprintk_impl(int stream_id, const char *fmt__str, const void *args,
+ u32 len__sz, void *aux__prog)
+{
+ struct bpf_bprintf_data data = {
+ .get_bin_args = true,
+ .get_buf = true,
+ };
+ struct bpf_prog_aux *aux = aux__prog;
+ u32 fmt_size = strlen(fmt__str) + 1;
+ struct bpf_stream *stream;
+ u32 data_len = len__sz;
+ int ret, num_args;
+
+ stream = bpf_stream_get(stream_id, aux);
+ if (!stream)
+ return -ENOENT;
+
+ if (data_len & 7 || data_len > MAX_BPRINTF_VARARGS * 8 ||
+ (data_len && !args))
+ return -EINVAL;
+ num_args = data_len / 8;
+
+ ret = bpf_bprintf_prepare(fmt__str, fmt_size, args, num_args, &data);
+ if (ret < 0)
+ return ret;
+
+ ret = bstr_printf(data.buf, MAX_BPRINTF_BUF, fmt__str, data.bin_args);
+ /* Exclude NULL byte during push. */
+ ret = bpf_stream_push_str(stream, data.buf, ret);
+ bpf_bprintf_cleanup(&data);
+
+ return ret;
+}
+
+__bpf_kfunc_end_defs();
+
+/* Added kfunc to common_btf_ids */
+
+void bpf_prog_stream_init(struct bpf_prog *prog)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(prog->aux->stream); i++) {
+ atomic_set(&prog->aux->stream[i].capacity, 0);
+ init_llist_head(&prog->aux->stream[i].log);
+ mutex_init(&prog->aux->stream[i].lock);
+ prog->aux->stream[i].backlog_head = NULL;
+ prog->aux->stream[i].backlog_tail = NULL;
+ }
+}
+
+void bpf_prog_stream_free(struct bpf_prog *prog)
+{
+ struct llist_node *list;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(prog->aux->stream); i++) {
+ list = llist_del_all(&prog->aux->stream[i].log);
+ bpf_stream_free_list(list);
+ bpf_stream_free_list(prog->aux->stream[i].backlog_head);
+ }
+}
+
+void bpf_stream_stage_init(struct bpf_stream_stage *ss)
+{
+ init_llist_head(&ss->log);
+ ss->len = 0;
+}
+
+void bpf_stream_stage_free(struct bpf_stream_stage *ss)
+{
+ struct llist_node *node;
+
+ node = llist_del_all(&ss->log);
+ bpf_stream_free_list(node);
+}
+
+int bpf_stream_stage_printk(struct bpf_stream_stage *ss, const char *fmt, ...)
+{
+ struct bpf_bprintf_buffers *buf;
+ va_list args;
+ int ret;
+
+ if (bpf_try_get_buffers(&buf))
+ return -EBUSY;
+
+ va_start(args, fmt);
+ ret = vsnprintf(buf->buf, ARRAY_SIZE(buf->buf), fmt, args);
+ va_end(args);
+ ss->len += ret;
+ /* Exclude NULL byte during push. */
+ ret = __bpf_stream_push_str(&ss->log, buf->buf, ret);
+ bpf_put_buffers();
+ return ret;
+}
+
+int bpf_stream_stage_commit(struct bpf_stream_stage *ss, struct bpf_prog *prog,
+ enum bpf_stream_id stream_id)
+{
+ struct llist_node *list, *head, *tail;
+ struct bpf_stream *stream;
+ int ret;
+
+ stream = bpf_stream_get(stream_id, prog->aux);
+ if (!stream)
+ return -EINVAL;
+
+ ret = bpf_stream_consume_capacity(stream, ss->len);
+ if (ret)
+ return ret;
+
+ list = llist_del_all(&ss->log);
+ head = tail = list;
+
+ if (!list)
+ return 0;
+ while (llist_next(list)) {
+ tail = llist_next(list);
+ list = tail;
+ }
+ llist_add_batch(head, tail, &stream->log);
+ return 0;
+}
+
+struct dump_stack_ctx {
+ struct bpf_stream_stage *ss;
+ int err;
+};
+
+static bool dump_stack_cb(void *cookie, u64 ip, u64 sp, u64 bp)
+{
+ struct dump_stack_ctx *ctxp = cookie;
+ const char *file = "", *line = "";
+ struct bpf_prog *prog;
+ int num, ret;
+
+ rcu_read_lock();
+ prog = bpf_prog_ksym_find(ip);
+ rcu_read_unlock();
+ if (prog) {
+ ret = bpf_prog_get_file_line(prog, ip, &file, &line, &num);
+ if (ret < 0)
+ goto end;
+ ctxp->err = bpf_stream_stage_printk(ctxp->ss, "%pS\n %s @ %s:%d\n",
+ (void *)(long)ip, line, file, num);
+ return !ctxp->err;
+ }
+end:
+ ctxp->err = bpf_stream_stage_printk(ctxp->ss, "%pS\n", (void *)(long)ip);
+ return !ctxp->err;
+}
+
+int bpf_stream_stage_dump_stack(struct bpf_stream_stage *ss)
+{
+ struct dump_stack_ctx ctx = { .ss = ss };
+ int ret;
+
+ ret = bpf_stream_stage_printk(ss, "CPU: %d UID: %d PID: %d Comm: %s\n",
+ raw_smp_processor_id(), __kuid_val(current_real_cred()->euid),
+ current->pid, current->comm);
+ if (ret)
+ return ret;
+ ret = bpf_stream_stage_printk(ss, "Call trace:\n");
+ if (ret)
+ return ret;
+ arch_bpf_stack_walk(dump_stack_cb, &ctx);
+ if (ctx.err)
+ return ctx.err;
+ return bpf_stream_stage_printk(ss, "\n");
+}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 3bae6c591914..4ff82144f885 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1,47 +1,913 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
*/
+#include <crypto/sha2.h>
#include <linux/bpf.h>
+#include <linux/bpf-cgroup.h>
+#include <linux/bpf_trace.h>
+#include <linux/bpf_lirc.h>
+#include <linux/bpf_verifier.h>
+#include <linux/bsearch.h>
+#include <linux/btf.h>
#include <linux/syscalls.h>
#include <linux/slab.h>
+#include <linux/sched/signal.h>
+#include <linux/vmalloc.h>
+#include <linux/mmzone.h>
#include <linux/anon_inodes.h>
+#include <linux/fdtable.h>
#include <linux/file.h>
+#include <linux/fs.h>
#include <linux/license.h>
#include <linux/filter.h>
-#include <linux/version.h>
+#include <linux/kernel.h>
+#include <linux/idr.h>
+#include <linux/cred.h>
+#include <linux/timekeeping.h>
+#include <linux/ctype.h>
+#include <linux/nospec.h>
+#include <linux/audit.h>
+#include <uapi/linux/btf.h>
+#include <linux/pgtable.h>
+#include <linux/bpf_lsm.h>
+#include <linux/poll.h>
+#include <linux/sort.h>
+#include <linux/bpf-netns.h>
+#include <linux/rcupdate_trace.h>
+#include <linux/memcontrol.h>
+#include <linux/trace_events.h>
+#include <linux/tracepoint.h>
+#include <linux/overflow.h>
+#include <linux/cookie.h>
+#include <linux/verification.h>
+
+#include <net/netfilter/nf_bpf_link.h>
+#include <net/netkit.h>
+#include <net/tcx.h>
+
+#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
+ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
+ (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
+#define IS_FD_PROG_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY)
+#define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS)
+#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map) || \
+ IS_FD_HASH(map))
-static LIST_HEAD(bpf_map_types);
+#define BPF_OBJ_FLAG_MASK (BPF_F_RDONLY | BPF_F_WRONLY)
+
+DEFINE_PER_CPU(int, bpf_prog_active);
+DEFINE_COOKIE(bpf_map_cookie);
+static DEFINE_IDR(prog_idr);
+static DEFINE_SPINLOCK(prog_idr_lock);
+static DEFINE_IDR(map_idr);
+static DEFINE_SPINLOCK(map_idr_lock);
+static DEFINE_IDR(link_idr);
+static DEFINE_SPINLOCK(link_idr_lock);
+
+int sysctl_unprivileged_bpf_disabled __read_mostly =
+ IS_BUILTIN(CONFIG_BPF_UNPRIV_DEFAULT_OFF) ? 2 : 0;
+
+static const struct bpf_map_ops * const bpf_map_types[] = {
+#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
+#define BPF_MAP_TYPE(_id, _ops) \
+ [_id] = &_ops,
+#define BPF_LINK_TYPE(_id, _name)
+#include <linux/bpf_types.h>
+#undef BPF_PROG_TYPE
+#undef BPF_MAP_TYPE
+#undef BPF_LINK_TYPE
+};
-static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
+/*
+ * If we're handed a bigger struct than we know of, ensure all the unknown bits
+ * are 0 - i.e. new user-space does not rely on any kernel feature extensions
+ * we don't know about yet.
+ *
+ * There is a ToCToU between this function call and the following
+ * copy_from_user() call. However, this is not a concern since this function is
+ * meant to be a future-proofing of bits.
+ */
+int bpf_check_uarg_tail_zero(bpfptr_t uaddr,
+ size_t expected_size,
+ size_t actual_size)
{
- struct bpf_map_type_list *tl;
- struct bpf_map *map;
+ int res;
+
+ if (unlikely(actual_size > PAGE_SIZE)) /* silly large */
+ return -E2BIG;
+
+ if (actual_size <= expected_size)
+ return 0;
+
+ if (uaddr.is_kernel)
+ res = memchr_inv(uaddr.kernel + expected_size, 0,
+ actual_size - expected_size) == NULL;
+ else
+ res = check_zeroed_user(uaddr.user + expected_size,
+ actual_size - expected_size);
+ if (res < 0)
+ return res;
+ return res ? 0 : -E2BIG;
+}
+
+const struct bpf_map_ops bpf_map_offload_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
+ .map_alloc = bpf_map_offload_map_alloc,
+ .map_free = bpf_map_offload_map_free,
+ .map_check_btf = map_check_no_btf,
+ .map_mem_usage = bpf_map_offload_map_mem_usage,
+};
+
+static void bpf_map_write_active_inc(struct bpf_map *map)
+{
+ atomic64_inc(&map->writecnt);
+}
+
+static void bpf_map_write_active_dec(struct bpf_map *map)
+{
+ atomic64_dec(&map->writecnt);
+}
+
+bool bpf_map_write_active(const struct bpf_map *map)
+{
+ return atomic64_read(&map->writecnt) != 0;
+}
+
+static u32 bpf_map_value_size(const struct bpf_map *map)
+{
+ if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
+ map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
+ return round_up(map->value_size, 8) * num_possible_cpus();
+ else if (IS_FD_MAP(map))
+ return sizeof(u32);
+ else
+ return map->value_size;
+}
+
+static void maybe_wait_bpf_programs(struct bpf_map *map)
+{
+ /* Wait for any running non-sleepable BPF programs to complete so that
+ * userspace, when we return to it, knows that all non-sleepable
+ * programs that could be running use the new map value. For sleepable
+ * BPF programs, synchronize_rcu_tasks_trace() should be used to wait
+ * for the completions of these programs, but considering the waiting
+ * time can be very long and userspace may think it will hang forever,
+ * so don't handle sleepable BPF programs now.
+ */
+ if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS ||
+ map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
+ synchronize_rcu_expedited();
+}
+
+static void unpin_uptr_kaddr(void *kaddr)
+{
+ if (kaddr)
+ unpin_user_page(virt_to_page(kaddr));
+}
+
+static void __bpf_obj_unpin_uptrs(struct btf_record *rec, u32 cnt, void *obj)
+{
+ const struct btf_field *field;
+ void **uptr_addr;
+ int i;
+
+ for (i = 0, field = rec->fields; i < cnt; i++, field++) {
+ if (field->type != BPF_UPTR)
+ continue;
+
+ uptr_addr = obj + field->offset;
+ unpin_uptr_kaddr(*uptr_addr);
+ }
+}
+
+static void bpf_obj_unpin_uptrs(struct btf_record *rec, void *obj)
+{
+ if (!btf_record_has_field(rec, BPF_UPTR))
+ return;
+
+ __bpf_obj_unpin_uptrs(rec, rec->cnt, obj);
+}
+
+static int bpf_obj_pin_uptrs(struct btf_record *rec, void *obj)
+{
+ const struct btf_field *field;
+ const struct btf_type *t;
+ unsigned long start, end;
+ struct page *page;
+ void **uptr_addr;
+ int i, err;
+
+ if (!btf_record_has_field(rec, BPF_UPTR))
+ return 0;
+
+ for (i = 0, field = rec->fields; i < rec->cnt; i++, field++) {
+ if (field->type != BPF_UPTR)
+ continue;
+
+ uptr_addr = obj + field->offset;
+ start = *(unsigned long *)uptr_addr;
+ if (!start)
+ continue;
+
+ t = btf_type_by_id(field->kptr.btf, field->kptr.btf_id);
+ /* t->size was checked for zero before */
+ if (check_add_overflow(start, t->size - 1, &end)) {
+ err = -EFAULT;
+ goto unpin_all;
+ }
+
+ /* The uptr's struct cannot span across two pages */
+ if ((start & PAGE_MASK) != (end & PAGE_MASK)) {
+ err = -EOPNOTSUPP;
+ goto unpin_all;
+ }
+
+ err = pin_user_pages_fast(start, 1, FOLL_LONGTERM | FOLL_WRITE, &page);
+ if (err != 1)
+ goto unpin_all;
+
+ if (PageHighMem(page)) {
+ err = -EOPNOTSUPP;
+ unpin_user_page(page);
+ goto unpin_all;
+ }
+
+ *uptr_addr = page_address(page) + offset_in_page(start);
+ }
+
+ return 0;
+
+unpin_all:
+ __bpf_obj_unpin_uptrs(rec, i, obj);
+ return err;
+}
+
+static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
+ void *key, void *value, __u64 flags)
+{
+ int err;
+
+ /* Need to create a kthread, thus must support schedule */
+ if (bpf_map_is_offloaded(map)) {
+ return bpf_map_offload_update_elem(map, key, value, flags);
+ } else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
+ map->map_type == BPF_MAP_TYPE_ARENA ||
+ map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
+ return map->ops->map_update_elem(map, key, value, flags);
+ } else if (map->map_type == BPF_MAP_TYPE_SOCKHASH ||
+ map->map_type == BPF_MAP_TYPE_SOCKMAP) {
+ return sock_map_update_elem_sys(map, key, value, flags);
+ } else if (IS_FD_PROG_ARRAY(map)) {
+ return bpf_fd_array_map_update_elem(map, map_file, key, value,
+ flags);
+ }
+
+ bpf_disable_instrumentation();
+ if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
+ err = bpf_percpu_hash_update(map, key, value, flags);
+ } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
+ err = bpf_percpu_array_update(map, key, value, flags);
+ } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
+ err = bpf_percpu_cgroup_storage_update(map, key, value,
+ flags);
+ } else if (IS_FD_ARRAY(map)) {
+ err = bpf_fd_array_map_update_elem(map, map_file, key, value,
+ flags);
+ } else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
+ err = bpf_fd_htab_map_update_elem(map, map_file, key, value,
+ flags);
+ } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
+ /* rcu_read_lock() is not needed */
+ err = bpf_fd_reuseport_array_update_elem(map, key, value,
+ flags);
+ } else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
+ map->map_type == BPF_MAP_TYPE_STACK ||
+ map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
+ err = map->ops->map_push_elem(map, value, flags);
+ } else {
+ err = bpf_obj_pin_uptrs(map->record, value);
+ if (!err) {
+ rcu_read_lock();
+ err = map->ops->map_update_elem(map, key, value, flags);
+ rcu_read_unlock();
+ if (err)
+ bpf_obj_unpin_uptrs(map->record, value);
+ }
+ }
+ bpf_enable_instrumentation();
+
+ return err;
+}
+
+static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
+ __u64 flags)
+{
+ void *ptr;
+ int err;
+
+ if (bpf_map_is_offloaded(map))
+ return bpf_map_offload_lookup_elem(map, key, value);
+
+ bpf_disable_instrumentation();
+ if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
+ err = bpf_percpu_hash_copy(map, key, value);
+ } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
+ err = bpf_percpu_array_copy(map, key, value);
+ } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
+ err = bpf_percpu_cgroup_storage_copy(map, key, value);
+ } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
+ err = bpf_stackmap_extract(map, key, value, false);
+ } else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) {
+ err = bpf_fd_array_map_lookup_elem(map, key, value);
+ } else if (IS_FD_HASH(map)) {
+ err = bpf_fd_htab_map_lookup_elem(map, key, value);
+ } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
+ err = bpf_fd_reuseport_array_lookup_elem(map, key, value);
+ } else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
+ map->map_type == BPF_MAP_TYPE_STACK ||
+ map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
+ err = map->ops->map_peek_elem(map, value);
+ } else if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
+ /* struct_ops map requires directly updating "value" */
+ err = bpf_struct_ops_map_sys_lookup_elem(map, key, value);
+ } else {
+ rcu_read_lock();
+ if (map->ops->map_lookup_elem_sys_only)
+ ptr = map->ops->map_lookup_elem_sys_only(map, key);
+ else
+ ptr = map->ops->map_lookup_elem(map, key);
+ if (IS_ERR(ptr)) {
+ err = PTR_ERR(ptr);
+ } else if (!ptr) {
+ err = -ENOENT;
+ } else {
+ err = 0;
+ if (flags & BPF_F_LOCK)
+ /* lock 'ptr' and copy everything but lock */
+ copy_map_value_locked(map, value, ptr, true);
+ else
+ copy_map_value(map, value, ptr);
+ /* mask lock and timer, since value wasn't zero inited */
+ check_and_init_map_value(map, value);
+ }
+ rcu_read_unlock();
+ }
+
+ bpf_enable_instrumentation();
+
+ return err;
+}
+
+/* Please, do not use this function outside from the map creation path
+ * (e.g. in map update path) without taking care of setting the active
+ * memory cgroup (see at bpf_map_kmalloc_node() for example).
+ */
+static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable)
+{
+ /* We really just want to fail instead of triggering OOM killer
+ * under memory pressure, therefore we set __GFP_NORETRY to kmalloc,
+ * which is used for lower order allocation requests.
+ *
+ * It has been observed that higher order allocation requests done by
+ * vmalloc with __GFP_NORETRY being set might fail due to not trying
+ * to reclaim memory from the page cache, thus we set
+ * __GFP_RETRY_MAYFAIL to avoid such situations.
+ */
+
+ gfp_t gfp = bpf_memcg_flags(__GFP_NOWARN | __GFP_ZERO);
+ unsigned int flags = 0;
+ unsigned long align = 1;
+ void *area;
+
+ if (size >= SIZE_MAX)
+ return NULL;
+
+ /* kmalloc()'ed memory can't be mmap()'ed */
+ if (mmapable) {
+ BUG_ON(!PAGE_ALIGNED(size));
+ align = SHMLBA;
+ flags = VM_USERMAP;
+ } else if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
+ area = kmalloc_node(size, gfp | GFP_USER | __GFP_NORETRY,
+ numa_node);
+ if (area != NULL)
+ return area;
+ }
+
+ return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
+ gfp | GFP_KERNEL | __GFP_RETRY_MAYFAIL, PAGE_KERNEL,
+ flags, numa_node, __builtin_return_address(0));
+}
+
+void *bpf_map_area_alloc(u64 size, int numa_node)
+{
+ return __bpf_map_area_alloc(size, numa_node, false);
+}
+
+void *bpf_map_area_mmapable_alloc(u64 size, int numa_node)
+{
+ return __bpf_map_area_alloc(size, numa_node, true);
+}
+
+void bpf_map_area_free(void *area)
+{
+ kvfree(area);
+}
+
+static u32 bpf_map_flags_retain_permanent(u32 flags)
+{
+ /* Some map creation flags are not tied to the map object but
+ * rather to the map fd instead, so they have no meaning upon
+ * map object inspection since multiple file descriptors with
+ * different (access) properties can exist here. Thus, given
+ * this has zero meaning for the map itself, lets clear these
+ * from here.
+ */
+ return flags & ~(BPF_F_RDONLY | BPF_F_WRONLY);
+}
+
+void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr)
+{
+ map->map_type = attr->map_type;
+ map->key_size = attr->key_size;
+ map->value_size = attr->value_size;
+ map->max_entries = attr->max_entries;
+ map->map_flags = bpf_map_flags_retain_permanent(attr->map_flags);
+ map->numa_node = bpf_map_attr_numa_node(attr);
+ map->map_extra = attr->map_extra;
+}
+
+static int bpf_map_alloc_id(struct bpf_map *map)
+{
+ int id;
+
+ idr_preload(GFP_KERNEL);
+ spin_lock_bh(&map_idr_lock);
+ id = idr_alloc_cyclic(&map_idr, map, 1, INT_MAX, GFP_ATOMIC);
+ if (id > 0)
+ map->id = id;
+ spin_unlock_bh(&map_idr_lock);
+ idr_preload_end();
+
+ if (WARN_ON_ONCE(!id))
+ return -ENOSPC;
+
+ return id > 0 ? 0 : id;
+}
+
+void bpf_map_free_id(struct bpf_map *map)
+{
+ unsigned long flags;
+
+ /* Offloaded maps are removed from the IDR store when their device
+ * disappears - even if someone holds an fd to them they are unusable,
+ * the memory is gone, all ops will fail; they are simply waiting for
+ * refcnt to drop to be freed.
+ */
+ if (!map->id)
+ return;
+
+ spin_lock_irqsave(&map_idr_lock, flags);
+
+ idr_remove(&map_idr, map->id);
+ map->id = 0;
+
+ spin_unlock_irqrestore(&map_idr_lock, flags);
+}
+
+#ifdef CONFIG_MEMCG
+static void bpf_map_save_memcg(struct bpf_map *map)
+{
+ /* Currently if a map is created by a process belonging to the root
+ * memory cgroup, get_obj_cgroup_from_current() will return NULL.
+ * So we have to check map->objcg for being NULL each time it's
+ * being used.
+ */
+ if (memcg_bpf_enabled())
+ map->objcg = get_obj_cgroup_from_current();
+}
+
+static void bpf_map_release_memcg(struct bpf_map *map)
+{
+ if (map->objcg)
+ obj_cgroup_put(map->objcg);
+}
+
+static struct mem_cgroup *bpf_map_get_memcg(const struct bpf_map *map)
+{
+ if (map->objcg)
+ return get_mem_cgroup_from_objcg(map->objcg);
+
+ return root_mem_cgroup;
+}
+
+void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
+ int node)
+{
+ struct mem_cgroup *memcg, *old_memcg;
+ void *ptr;
+
+ memcg = bpf_map_get_memcg(map);
+ old_memcg = set_active_memcg(memcg);
+ ptr = kmalloc_node(size, flags | __GFP_ACCOUNT, node);
+ set_active_memcg(old_memcg);
+ mem_cgroup_put(memcg);
+
+ return ptr;
+}
+
+void *bpf_map_kmalloc_nolock(const struct bpf_map *map, size_t size, gfp_t flags,
+ int node)
+{
+ struct mem_cgroup *memcg, *old_memcg;
+ void *ptr;
+
+ memcg = bpf_map_get_memcg(map);
+ old_memcg = set_active_memcg(memcg);
+ ptr = kmalloc_nolock(size, flags | __GFP_ACCOUNT, node);
+ set_active_memcg(old_memcg);
+ mem_cgroup_put(memcg);
+
+ return ptr;
+}
+
+void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags)
+{
+ struct mem_cgroup *memcg, *old_memcg;
+ void *ptr;
+
+ memcg = bpf_map_get_memcg(map);
+ old_memcg = set_active_memcg(memcg);
+ ptr = kzalloc(size, flags | __GFP_ACCOUNT);
+ set_active_memcg(old_memcg);
+ mem_cgroup_put(memcg);
- list_for_each_entry(tl, &bpf_map_types, list_node) {
- if (tl->type == attr->map_type) {
- map = tl->ops->map_alloc(attr);
- if (IS_ERR(map))
- return map;
- map->ops = tl->ops;
- map->map_type = attr->map_type;
- return map;
+ return ptr;
+}
+
+void *bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size,
+ gfp_t flags)
+{
+ struct mem_cgroup *memcg, *old_memcg;
+ void *ptr;
+
+ memcg = bpf_map_get_memcg(map);
+ old_memcg = set_active_memcg(memcg);
+ ptr = kvcalloc(n, size, flags | __GFP_ACCOUNT);
+ set_active_memcg(old_memcg);
+ mem_cgroup_put(memcg);
+
+ return ptr;
+}
+
+void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size,
+ size_t align, gfp_t flags)
+{
+ struct mem_cgroup *memcg, *old_memcg;
+ void __percpu *ptr;
+
+ memcg = bpf_map_get_memcg(map);
+ old_memcg = set_active_memcg(memcg);
+ ptr = __alloc_percpu_gfp(size, align, flags | __GFP_ACCOUNT);
+ set_active_memcg(old_memcg);
+ mem_cgroup_put(memcg);
+
+ return ptr;
+}
+
+#else
+static void bpf_map_save_memcg(struct bpf_map *map)
+{
+}
+
+static void bpf_map_release_memcg(struct bpf_map *map)
+{
+}
+#endif
+
+static bool can_alloc_pages(void)
+{
+ return preempt_count() == 0 && !irqs_disabled() &&
+ !IS_ENABLED(CONFIG_PREEMPT_RT);
+}
+
+static struct page *__bpf_alloc_page(int nid)
+{
+ if (!can_alloc_pages())
+ return alloc_pages_nolock(__GFP_ACCOUNT, nid, 0);
+
+ return alloc_pages_node(nid,
+ GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT
+ | __GFP_NOWARN,
+ 0);
+}
+
+int bpf_map_alloc_pages(const struct bpf_map *map, int nid,
+ unsigned long nr_pages, struct page **pages)
+{
+ unsigned long i, j;
+ struct page *pg;
+ int ret = 0;
+#ifdef CONFIG_MEMCG
+ struct mem_cgroup *memcg, *old_memcg;
+
+ memcg = bpf_map_get_memcg(map);
+ old_memcg = set_active_memcg(memcg);
+#endif
+ for (i = 0; i < nr_pages; i++) {
+ pg = __bpf_alloc_page(nid);
+
+ if (pg) {
+ pages[i] = pg;
+ continue;
}
+ for (j = 0; j < i; j++)
+ free_pages_nolock(pages[j], 0);
+ ret = -ENOMEM;
+ break;
}
- return ERR_PTR(-EINVAL);
+
+#ifdef CONFIG_MEMCG
+ set_active_memcg(old_memcg);
+ mem_cgroup_put(memcg);
+#endif
+ return ret;
}
-/* boot time registration of different map implementations */
-void bpf_register_map_type(struct bpf_map_type_list *tl)
+
+static int btf_field_cmp(const void *a, const void *b)
{
- list_add(&tl->list_node, &bpf_map_types);
+ const struct btf_field *f1 = a, *f2 = b;
+
+ if (f1->offset < f2->offset)
+ return -1;
+ else if (f1->offset > f2->offset)
+ return 1;
+ return 0;
+}
+
+struct btf_field *btf_record_find(const struct btf_record *rec, u32 offset,
+ u32 field_mask)
+{
+ struct btf_field *field;
+
+ if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & field_mask))
+ return NULL;
+ field = bsearch(&offset, rec->fields, rec->cnt, sizeof(rec->fields[0]), btf_field_cmp);
+ if (!field || !(field->type & field_mask))
+ return NULL;
+ return field;
+}
+
+void btf_record_free(struct btf_record *rec)
+{
+ int i;
+
+ if (IS_ERR_OR_NULL(rec))
+ return;
+ for (i = 0; i < rec->cnt; i++) {
+ switch (rec->fields[i].type) {
+ case BPF_KPTR_UNREF:
+ case BPF_KPTR_REF:
+ case BPF_KPTR_PERCPU:
+ case BPF_UPTR:
+ if (rec->fields[i].kptr.module)
+ module_put(rec->fields[i].kptr.module);
+ if (btf_is_kernel(rec->fields[i].kptr.btf))
+ btf_put(rec->fields[i].kptr.btf);
+ break;
+ case BPF_LIST_HEAD:
+ case BPF_LIST_NODE:
+ case BPF_RB_ROOT:
+ case BPF_RB_NODE:
+ case BPF_SPIN_LOCK:
+ case BPF_RES_SPIN_LOCK:
+ case BPF_TIMER:
+ case BPF_REFCOUNT:
+ case BPF_WORKQUEUE:
+ case BPF_TASK_WORK:
+ /* Nothing to release */
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ continue;
+ }
+ }
+ kfree(rec);
+}
+
+void bpf_map_free_record(struct bpf_map *map)
+{
+ btf_record_free(map->record);
+ map->record = NULL;
+}
+
+struct btf_record *btf_record_dup(const struct btf_record *rec)
+{
+ const struct btf_field *fields;
+ struct btf_record *new_rec;
+ int ret, size, i;
+
+ if (IS_ERR_OR_NULL(rec))
+ return NULL;
+ size = struct_size(rec, fields, rec->cnt);
+ new_rec = kmemdup(rec, size, GFP_KERNEL | __GFP_NOWARN);
+ if (!new_rec)
+ return ERR_PTR(-ENOMEM);
+ /* Do a deep copy of the btf_record */
+ fields = rec->fields;
+ new_rec->cnt = 0;
+ for (i = 0; i < rec->cnt; i++) {
+ switch (fields[i].type) {
+ case BPF_KPTR_UNREF:
+ case BPF_KPTR_REF:
+ case BPF_KPTR_PERCPU:
+ case BPF_UPTR:
+ if (btf_is_kernel(fields[i].kptr.btf))
+ btf_get(fields[i].kptr.btf);
+ if (fields[i].kptr.module && !try_module_get(fields[i].kptr.module)) {
+ ret = -ENXIO;
+ goto free;
+ }
+ break;
+ case BPF_LIST_HEAD:
+ case BPF_LIST_NODE:
+ case BPF_RB_ROOT:
+ case BPF_RB_NODE:
+ case BPF_SPIN_LOCK:
+ case BPF_RES_SPIN_LOCK:
+ case BPF_TIMER:
+ case BPF_REFCOUNT:
+ case BPF_WORKQUEUE:
+ case BPF_TASK_WORK:
+ /* Nothing to acquire */
+ break;
+ default:
+ ret = -EFAULT;
+ WARN_ON_ONCE(1);
+ goto free;
+ }
+ new_rec->cnt++;
+ }
+ return new_rec;
+free:
+ btf_record_free(new_rec);
+ return ERR_PTR(ret);
+}
+
+bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b)
+{
+ bool a_has_fields = !IS_ERR_OR_NULL(rec_a), b_has_fields = !IS_ERR_OR_NULL(rec_b);
+ int size;
+
+ if (!a_has_fields && !b_has_fields)
+ return true;
+ if (a_has_fields != b_has_fields)
+ return false;
+ if (rec_a->cnt != rec_b->cnt)
+ return false;
+ size = struct_size(rec_a, fields, rec_a->cnt);
+ /* btf_parse_fields uses kzalloc to allocate a btf_record, so unused
+ * members are zeroed out. So memcmp is safe to do without worrying
+ * about padding/unused fields.
+ *
+ * While spin_lock, timer, and kptr have no relation to map BTF,
+ * list_head metadata is specific to map BTF, the btf and value_rec
+ * members in particular. btf is the map BTF, while value_rec points to
+ * btf_record in that map BTF.
+ *
+ * So while by default, we don't rely on the map BTF (which the records
+ * were parsed from) matching for both records, which is not backwards
+ * compatible, in case list_head is part of it, we implicitly rely on
+ * that by way of depending on memcmp succeeding for it.
+ */
+ return !memcmp(rec_a, rec_b, size);
+}
+
+void bpf_obj_free_timer(const struct btf_record *rec, void *obj)
+{
+ if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_TIMER)))
+ return;
+ bpf_timer_cancel_and_free(obj + rec->timer_off);
+}
+
+void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj)
+{
+ if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_WORKQUEUE)))
+ return;
+ bpf_wq_cancel_and_free(obj + rec->wq_off);
+}
+
+void bpf_obj_free_task_work(const struct btf_record *rec, void *obj)
+{
+ if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_TASK_WORK)))
+ return;
+ bpf_task_work_cancel_and_free(obj + rec->task_work_off);
+}
+
+void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
+{
+ const struct btf_field *fields;
+ int i;
+
+ if (IS_ERR_OR_NULL(rec))
+ return;
+ fields = rec->fields;
+ for (i = 0; i < rec->cnt; i++) {
+ struct btf_struct_meta *pointee_struct_meta;
+ const struct btf_field *field = &fields[i];
+ void *field_ptr = obj + field->offset;
+ void *xchgd_field;
+
+ switch (fields[i].type) {
+ case BPF_SPIN_LOCK:
+ case BPF_RES_SPIN_LOCK:
+ break;
+ case BPF_TIMER:
+ bpf_timer_cancel_and_free(field_ptr);
+ break;
+ case BPF_WORKQUEUE:
+ bpf_wq_cancel_and_free(field_ptr);
+ break;
+ case BPF_TASK_WORK:
+ bpf_task_work_cancel_and_free(field_ptr);
+ break;
+ case BPF_KPTR_UNREF:
+ WRITE_ONCE(*(u64 *)field_ptr, 0);
+ break;
+ case BPF_KPTR_REF:
+ case BPF_KPTR_PERCPU:
+ xchgd_field = (void *)xchg((unsigned long *)field_ptr, 0);
+ if (!xchgd_field)
+ break;
+
+ if (!btf_is_kernel(field->kptr.btf)) {
+ pointee_struct_meta = btf_find_struct_meta(field->kptr.btf,
+ field->kptr.btf_id);
+ __bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ?
+ pointee_struct_meta->record : NULL,
+ fields[i].type == BPF_KPTR_PERCPU);
+ } else {
+ field->kptr.dtor(xchgd_field);
+ }
+ break;
+ case BPF_UPTR:
+ /* The caller ensured that no one is using the uptr */
+ unpin_uptr_kaddr(*(void **)field_ptr);
+ break;
+ case BPF_LIST_HEAD:
+ if (WARN_ON_ONCE(rec->spin_lock_off < 0))
+ continue;
+ bpf_list_head_free(field, field_ptr, obj + rec->spin_lock_off);
+ break;
+ case BPF_RB_ROOT:
+ if (WARN_ON_ONCE(rec->spin_lock_off < 0))
+ continue;
+ bpf_rb_root_free(field, field_ptr, obj + rec->spin_lock_off);
+ break;
+ case BPF_LIST_NODE:
+ case BPF_RB_NODE:
+ case BPF_REFCOUNT:
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ continue;
+ }
+ }
+}
+
+static void bpf_map_free(struct bpf_map *map)
+{
+ struct btf_record *rec = map->record;
+ struct btf *btf = map->btf;
+
+ /* implementation dependent freeing. Disabling migration to simplify
+ * the free of values or special fields allocated from bpf memory
+ * allocator.
+ */
+ kfree(map->excl_prog_sha);
+ migrate_disable();
+ map->ops->map_free(map);
+ migrate_enable();
+
+ /* Delay freeing of btf_record for maps, as map_free
+ * callback usually needs access to them. It is better to do it here
+ * than require each callback to do the free itself manually.
+ *
+ * Note that the btf_record stashed in map->inner_map_meta->record was
+ * already freed using the map_free callback for map in map case which
+ * eventually calls bpf_map_free_meta, since inner_map_meta is only a
+ * template bpf_map struct used during verification.
+ */
+ btf_record_free(rec);
+ /* Delay freeing of btf for maps, as map_free callback may need
+ * struct_meta info which will be freed with btf_put().
+ */
+ btf_put(btf);
}
/* called from workqueue */
@@ -49,33 +915,294 @@ static void bpf_map_free_deferred(struct work_struct *work)
{
struct bpf_map *map = container_of(work, struct bpf_map, work);
- /* implementation dependent freeing */
- map->ops->map_free(map);
+ security_bpf_map_free(map);
+ bpf_map_release_memcg(map);
+ bpf_map_owner_free(map);
+ bpf_map_free(map);
+}
+
+static void bpf_map_put_uref(struct bpf_map *map)
+{
+ if (atomic64_dec_and_test(&map->usercnt)) {
+ if (map->ops->map_release_uref)
+ map->ops->map_release_uref(map);
+ }
+}
+
+static void bpf_map_free_in_work(struct bpf_map *map)
+{
+ INIT_WORK(&map->work, bpf_map_free_deferred);
+ /* Avoid spawning kworkers, since they all might contend
+ * for the same mutex like slab_mutex.
+ */
+ queue_work(system_dfl_wq, &map->work);
+}
+
+static void bpf_map_free_rcu_gp(struct rcu_head *rcu)
+{
+ bpf_map_free_in_work(container_of(rcu, struct bpf_map, rcu));
+}
+
+static void bpf_map_free_mult_rcu_gp(struct rcu_head *rcu)
+{
+ if (rcu_trace_implies_rcu_gp())
+ bpf_map_free_rcu_gp(rcu);
+ else
+ call_rcu(rcu, bpf_map_free_rcu_gp);
}
/* decrement map refcnt and schedule it for freeing via workqueue
- * (unrelying map implementation ops->map_free() might sleep)
+ * (underlying map implementation ops->map_free() might sleep)
*/
void bpf_map_put(struct bpf_map *map)
{
- if (atomic_dec_and_test(&map->refcnt)) {
- INIT_WORK(&map->work, bpf_map_free_deferred);
- schedule_work(&map->work);
+ if (atomic64_dec_and_test(&map->refcnt)) {
+ /* bpf_map_free_id() must be called first */
+ bpf_map_free_id(map);
+
+ WARN_ON_ONCE(atomic64_read(&map->sleepable_refcnt));
+ if (READ_ONCE(map->free_after_mult_rcu_gp))
+ call_rcu_tasks_trace(&map->rcu, bpf_map_free_mult_rcu_gp);
+ else if (READ_ONCE(map->free_after_rcu_gp))
+ call_rcu(&map->rcu, bpf_map_free_rcu_gp);
+ else
+ bpf_map_free_in_work(map);
}
}
+EXPORT_SYMBOL_GPL(bpf_map_put);
+
+void bpf_map_put_with_uref(struct bpf_map *map)
+{
+ bpf_map_put_uref(map);
+ bpf_map_put(map);
+}
static int bpf_map_release(struct inode *inode, struct file *filp)
{
struct bpf_map *map = filp->private_data;
- bpf_map_put(map);
+ if (map->ops->map_release)
+ map->ops->map_release(map, filp);
+
+ bpf_map_put_with_uref(map);
return 0;
}
-static const struct file_operations bpf_map_fops = {
- .release = bpf_map_release,
+static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f)
+{
+ fmode_t mode = fd_file(f)->f_mode;
+
+ /* Our file permissions may have been overridden by global
+ * map permissions facing syscall side.
+ */
+ if (READ_ONCE(map->frozen))
+ mode &= ~FMODE_CAN_WRITE;
+ return mode;
+}
+
+#ifdef CONFIG_PROC_FS
+/* Show the memory usage of a bpf map */
+static u64 bpf_map_memory_usage(const struct bpf_map *map)
+{
+ return map->ops->map_mem_usage(map);
+}
+
+static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
+{
+ struct bpf_map *map = filp->private_data;
+ u32 type = 0, jited = 0;
+
+ spin_lock(&map->owner_lock);
+ if (map->owner) {
+ type = map->owner->type;
+ jited = map->owner->jited;
+ }
+ spin_unlock(&map->owner_lock);
+
+ seq_printf(m,
+ "map_type:\t%u\n"
+ "key_size:\t%u\n"
+ "value_size:\t%u\n"
+ "max_entries:\t%u\n"
+ "map_flags:\t%#x\n"
+ "map_extra:\t%#llx\n"
+ "memlock:\t%llu\n"
+ "map_id:\t%u\n"
+ "frozen:\t%u\n",
+ map->map_type,
+ map->key_size,
+ map->value_size,
+ map->max_entries,
+ map->map_flags,
+ (unsigned long long)map->map_extra,
+ bpf_map_memory_usage(map),
+ map->id,
+ READ_ONCE(map->frozen));
+ if (type) {
+ seq_printf(m, "owner_prog_type:\t%u\n", type);
+ seq_printf(m, "owner_jited:\t%u\n", jited);
+ }
+}
+#endif
+
+static ssize_t bpf_dummy_read(struct file *filp, char __user *buf, size_t siz,
+ loff_t *ppos)
+{
+ /* We need this handler such that alloc_file() enables
+ * f_mode with FMODE_CAN_READ.
+ */
+ return -EINVAL;
+}
+
+static ssize_t bpf_dummy_write(struct file *filp, const char __user *buf,
+ size_t siz, loff_t *ppos)
+{
+ /* We need this handler such that alloc_file() enables
+ * f_mode with FMODE_CAN_WRITE.
+ */
+ return -EINVAL;
+}
+
+/* called for any extra memory-mapped regions (except initial) */
+static void bpf_map_mmap_open(struct vm_area_struct *vma)
+{
+ struct bpf_map *map = vma->vm_file->private_data;
+
+ if (vma->vm_flags & VM_MAYWRITE)
+ bpf_map_write_active_inc(map);
+}
+
+/* called for all unmapped memory region (including initial) */
+static void bpf_map_mmap_close(struct vm_area_struct *vma)
+{
+ struct bpf_map *map = vma->vm_file->private_data;
+
+ if (vma->vm_flags & VM_MAYWRITE)
+ bpf_map_write_active_dec(map);
+}
+
+static const struct vm_operations_struct bpf_map_default_vmops = {
+ .open = bpf_map_mmap_open,
+ .close = bpf_map_mmap_close,
};
+static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+ struct bpf_map *map = filp->private_data;
+ int err = 0;
+
+ if (!map->ops->map_mmap || !IS_ERR_OR_NULL(map->record))
+ return -ENOTSUPP;
+
+ if (!(vma->vm_flags & VM_SHARED))
+ return -EINVAL;
+
+ mutex_lock(&map->freeze_mutex);
+
+ if (vma->vm_flags & VM_WRITE) {
+ if (map->frozen) {
+ err = -EPERM;
+ goto out;
+ }
+ /* map is meant to be read-only, so do not allow mapping as
+ * writable, because it's possible to leak a writable page
+ * reference and allows user-space to still modify it after
+ * freezing, while verifier will assume contents do not change
+ */
+ if (map->map_flags & BPF_F_RDONLY_PROG) {
+ err = -EACCES;
+ goto out;
+ }
+ bpf_map_write_active_inc(map);
+ }
+out:
+ mutex_unlock(&map->freeze_mutex);
+ if (err)
+ return err;
+
+ /* set default open/close callbacks */
+ vma->vm_ops = &bpf_map_default_vmops;
+ vma->vm_private_data = map;
+ vm_flags_clear(vma, VM_MAYEXEC);
+ /* If mapping is read-only, then disallow potentially re-mapping with
+ * PROT_WRITE by dropping VM_MAYWRITE flag. This VM_MAYWRITE clearing
+ * means that as far as BPF map's memory-mapped VMAs are concerned,
+ * VM_WRITE and VM_MAYWRITE and equivalent, if one of them is set,
+ * both should be set, so we can forget about VM_MAYWRITE and always
+ * check just VM_WRITE
+ */
+ if (!(vma->vm_flags & VM_WRITE))
+ vm_flags_clear(vma, VM_MAYWRITE);
+
+ err = map->ops->map_mmap(map, vma);
+ if (err) {
+ if (vma->vm_flags & VM_WRITE)
+ bpf_map_write_active_dec(map);
+ }
+
+ return err;
+}
+
+static __poll_t bpf_map_poll(struct file *filp, struct poll_table_struct *pts)
+{
+ struct bpf_map *map = filp->private_data;
+
+ if (map->ops->map_poll)
+ return map->ops->map_poll(map, filp, pts);
+
+ return EPOLLERR;
+}
+
+static unsigned long bpf_get_unmapped_area(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags)
+{
+ struct bpf_map *map = filp->private_data;
+
+ if (map->ops->map_get_unmapped_area)
+ return map->ops->map_get_unmapped_area(filp, addr, len, pgoff, flags);
+#ifdef CONFIG_MMU
+ return mm_get_unmapped_area(filp, addr, len, pgoff, flags);
+#else
+ return addr;
+#endif
+}
+
+const struct file_operations bpf_map_fops = {
+#ifdef CONFIG_PROC_FS
+ .show_fdinfo = bpf_map_show_fdinfo,
+#endif
+ .release = bpf_map_release,
+ .read = bpf_dummy_read,
+ .write = bpf_dummy_write,
+ .mmap = bpf_map_mmap,
+ .poll = bpf_map_poll,
+ .get_unmapped_area = bpf_get_unmapped_area,
+};
+
+int bpf_map_new_fd(struct bpf_map *map, int flags)
+{
+ int ret;
+
+ ret = security_bpf_map(map, OPEN_FMODE(flags));
+ if (ret < 0)
+ return ret;
+
+ return anon_inode_getfd("bpf-map", &bpf_map_fops, map,
+ flags | O_CLOEXEC);
+}
+
+int bpf_get_file_flag(int flags)
+{
+ if ((flags & BPF_F_RDONLY) && (flags & BPF_F_WRONLY))
+ return -EINVAL;
+ if (flags & BPF_F_RDONLY)
+ return O_RDONLY;
+ if (flags & BPF_F_WRONLY)
+ return O_WRONLY;
+ return O_RDWR;
+}
+
/* helper macro to check that unused fields 'union bpf_attr' are zero */
#define CHECK_ATTR(CMD) \
memchr_inv((void *) &attr->CMD##_LAST_FIELD + \
@@ -84,182 +1211,628 @@ static const struct file_operations bpf_map_fops = {
offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
sizeof(attr->CMD##_LAST_FIELD)) != NULL
-#define BPF_MAP_CREATE_LAST_FIELD max_entries
+/* dst and src must have at least "size" number of bytes.
+ * Return strlen on success and < 0 on error.
+ */
+int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size)
+{
+ const char *end = src + size;
+ const char *orig_src = src;
+
+ memset(dst, 0, size);
+ /* Copy all isalnum(), '_' and '.' chars. */
+ while (src < end && *src) {
+ if (!isalnum(*src) &&
+ *src != '_' && *src != '.')
+ return -EINVAL;
+ *dst++ = *src++;
+ }
+
+ /* No '\0' found in "size" number of bytes */
+ if (src == end)
+ return -EINVAL;
+
+ return src - orig_src;
+}
+EXPORT_SYMBOL_GPL(bpf_obj_name_cpy);
+
+int map_check_no_btf(const struct bpf_map *map,
+ const struct btf *btf,
+ const struct btf_type *key_type,
+ const struct btf_type *value_type)
+{
+ return -ENOTSUPP;
+}
+
+static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
+ const struct btf *btf, u32 btf_key_id, u32 btf_value_id)
+{
+ const struct btf_type *key_type, *value_type;
+ u32 key_size, value_size;
+ int ret = 0;
+
+ /* Some maps allow key to be unspecified. */
+ if (btf_key_id) {
+ key_type = btf_type_id_size(btf, &btf_key_id, &key_size);
+ if (!key_type || key_size != map->key_size)
+ return -EINVAL;
+ } else {
+ key_type = btf_type_by_id(btf, 0);
+ if (!map->ops->map_check_btf)
+ return -EINVAL;
+ }
+
+ value_type = btf_type_id_size(btf, &btf_value_id, &value_size);
+ if (!value_type || value_size != map->value_size)
+ return -EINVAL;
+
+ map->record = btf_parse_fields(btf, value_type,
+ BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD |
+ BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE | BPF_UPTR |
+ BPF_TASK_WORK,
+ map->value_size);
+ if (!IS_ERR_OR_NULL(map->record)) {
+ int i;
+
+ if (!bpf_token_capable(token, CAP_BPF)) {
+ ret = -EPERM;
+ goto free_map_tab;
+ }
+ if (map->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) {
+ ret = -EACCES;
+ goto free_map_tab;
+ }
+ for (i = 0; i < sizeof(map->record->field_mask) * 8; i++) {
+ switch (map->record->field_mask & (1 << i)) {
+ case 0:
+ continue;
+ case BPF_SPIN_LOCK:
+ case BPF_RES_SPIN_LOCK:
+ if (map->map_type != BPF_MAP_TYPE_HASH &&
+ map->map_type != BPF_MAP_TYPE_ARRAY &&
+ map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
+ map->map_type != BPF_MAP_TYPE_SK_STORAGE &&
+ map->map_type != BPF_MAP_TYPE_INODE_STORAGE &&
+ map->map_type != BPF_MAP_TYPE_TASK_STORAGE &&
+ map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) {
+ ret = -EOPNOTSUPP;
+ goto free_map_tab;
+ }
+ break;
+ case BPF_TIMER:
+ case BPF_WORKQUEUE:
+ case BPF_TASK_WORK:
+ if (map->map_type != BPF_MAP_TYPE_HASH &&
+ map->map_type != BPF_MAP_TYPE_LRU_HASH &&
+ map->map_type != BPF_MAP_TYPE_ARRAY) {
+ ret = -EOPNOTSUPP;
+ goto free_map_tab;
+ }
+ break;
+ case BPF_KPTR_UNREF:
+ case BPF_KPTR_REF:
+ case BPF_KPTR_PERCPU:
+ case BPF_REFCOUNT:
+ if (map->map_type != BPF_MAP_TYPE_HASH &&
+ map->map_type != BPF_MAP_TYPE_PERCPU_HASH &&
+ map->map_type != BPF_MAP_TYPE_LRU_HASH &&
+ map->map_type != BPF_MAP_TYPE_LRU_PERCPU_HASH &&
+ map->map_type != BPF_MAP_TYPE_ARRAY &&
+ map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY &&
+ map->map_type != BPF_MAP_TYPE_SK_STORAGE &&
+ map->map_type != BPF_MAP_TYPE_INODE_STORAGE &&
+ map->map_type != BPF_MAP_TYPE_TASK_STORAGE &&
+ map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) {
+ ret = -EOPNOTSUPP;
+ goto free_map_tab;
+ }
+ break;
+ case BPF_UPTR:
+ if (map->map_type != BPF_MAP_TYPE_TASK_STORAGE) {
+ ret = -EOPNOTSUPP;
+ goto free_map_tab;
+ }
+ break;
+ case BPF_LIST_HEAD:
+ case BPF_RB_ROOT:
+ if (map->map_type != BPF_MAP_TYPE_HASH &&
+ map->map_type != BPF_MAP_TYPE_LRU_HASH &&
+ map->map_type != BPF_MAP_TYPE_ARRAY) {
+ ret = -EOPNOTSUPP;
+ goto free_map_tab;
+ }
+ break;
+ default:
+ /* Fail if map_type checks are missing for a field type */
+ ret = -EOPNOTSUPP;
+ goto free_map_tab;
+ }
+ }
+ }
+
+ ret = btf_check_and_fixup_fields(btf, map->record);
+ if (ret < 0)
+ goto free_map_tab;
+
+ if (map->ops->map_check_btf) {
+ ret = map->ops->map_check_btf(map, btf, key_type, value_type);
+ if (ret < 0)
+ goto free_map_tab;
+ }
+
+ return ret;
+free_map_tab:
+ bpf_map_free_record(map);
+ return ret;
+}
+
+static bool bpf_net_capable(void)
+{
+ return capable(CAP_NET_ADMIN) || capable(CAP_SYS_ADMIN);
+}
+
+#define BPF_MAP_CREATE_LAST_FIELD excl_prog_hash_size
/* called via syscall */
-static int map_create(union bpf_attr *attr)
+static int map_create(union bpf_attr *attr, bpfptr_t uattr)
{
+ const struct bpf_map_ops *ops;
+ struct bpf_token *token = NULL;
+ int numa_node = bpf_map_attr_numa_node(attr);
+ u32 map_type = attr->map_type;
struct bpf_map *map;
+ bool token_flag;
+ int f_flags;
int err;
err = CHECK_ATTR(BPF_MAP_CREATE);
if (err)
return -EINVAL;
+ /* check BPF_F_TOKEN_FD flag, remember if it's set, and then clear it
+ * to avoid per-map type checks tripping on unknown flag
+ */
+ token_flag = attr->map_flags & BPF_F_TOKEN_FD;
+ attr->map_flags &= ~BPF_F_TOKEN_FD;
+
+ if (attr->btf_vmlinux_value_type_id) {
+ if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS ||
+ attr->btf_key_type_id || attr->btf_value_type_id)
+ return -EINVAL;
+ } else if (attr->btf_key_type_id && !attr->btf_value_type_id) {
+ return -EINVAL;
+ }
+
+ if (attr->map_type != BPF_MAP_TYPE_BLOOM_FILTER &&
+ attr->map_type != BPF_MAP_TYPE_ARENA &&
+ attr->map_extra != 0)
+ return -EINVAL;
+
+ f_flags = bpf_get_file_flag(attr->map_flags);
+ if (f_flags < 0)
+ return f_flags;
+
+ if (numa_node != NUMA_NO_NODE &&
+ ((unsigned int)numa_node >= nr_node_ids ||
+ !node_online(numa_node)))
+ return -EINVAL;
+
/* find map type and init map: hashtable vs rbtree vs bloom vs ... */
- map = find_and_alloc_map(attr);
- if (IS_ERR(map))
- return PTR_ERR(map);
+ map_type = attr->map_type;
+ if (map_type >= ARRAY_SIZE(bpf_map_types))
+ return -EINVAL;
+ map_type = array_index_nospec(map_type, ARRAY_SIZE(bpf_map_types));
+ ops = bpf_map_types[map_type];
+ if (!ops)
+ return -EINVAL;
- atomic_set(&map->refcnt, 1);
+ if (ops->map_alloc_check) {
+ err = ops->map_alloc_check(attr);
+ if (err)
+ return err;
+ }
+ if (attr->map_ifindex)
+ ops = &bpf_map_offload_ops;
+ if (!ops->map_mem_usage)
+ return -EINVAL;
- err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC);
+ if (token_flag) {
+ token = bpf_token_get_from_fd(attr->map_token_fd);
+ if (IS_ERR(token))
+ return PTR_ERR(token);
+ /* if current token doesn't grant map creation permissions,
+ * then we can't use this token, so ignore it and rely on
+ * system-wide capabilities checks
+ */
+ if (!bpf_token_allow_cmd(token, BPF_MAP_CREATE) ||
+ !bpf_token_allow_map_type(token, attr->map_type)) {
+ bpf_token_put(token);
+ token = NULL;
+ }
+ }
+
+ err = -EPERM;
+
+ /* Intent here is for unprivileged_bpf_disabled to block BPF map
+ * creation for unprivileged users; other actions depend
+ * on fd availability and access to bpffs, so are dependent on
+ * object creation success. Even with unprivileged BPF disabled,
+ * capability checks are still carried out.
+ */
+ if (sysctl_unprivileged_bpf_disabled && !bpf_token_capable(token, CAP_BPF))
+ goto put_token;
+
+ /* check privileged map type permissions */
+ switch (map_type) {
+ case BPF_MAP_TYPE_ARRAY:
+ case BPF_MAP_TYPE_PERCPU_ARRAY:
+ case BPF_MAP_TYPE_PROG_ARRAY:
+ case BPF_MAP_TYPE_PERF_EVENT_ARRAY:
+ case BPF_MAP_TYPE_CGROUP_ARRAY:
+ case BPF_MAP_TYPE_ARRAY_OF_MAPS:
+ case BPF_MAP_TYPE_HASH:
+ case BPF_MAP_TYPE_PERCPU_HASH:
+ case BPF_MAP_TYPE_HASH_OF_MAPS:
+ case BPF_MAP_TYPE_RINGBUF:
+ case BPF_MAP_TYPE_USER_RINGBUF:
+ case BPF_MAP_TYPE_CGROUP_STORAGE:
+ case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE:
+ /* unprivileged */
+ break;
+ case BPF_MAP_TYPE_SK_STORAGE:
+ case BPF_MAP_TYPE_INODE_STORAGE:
+ case BPF_MAP_TYPE_TASK_STORAGE:
+ case BPF_MAP_TYPE_CGRP_STORAGE:
+ case BPF_MAP_TYPE_BLOOM_FILTER:
+ case BPF_MAP_TYPE_LPM_TRIE:
+ case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY:
+ case BPF_MAP_TYPE_STACK_TRACE:
+ case BPF_MAP_TYPE_QUEUE:
+ case BPF_MAP_TYPE_STACK:
+ case BPF_MAP_TYPE_LRU_HASH:
+ case BPF_MAP_TYPE_LRU_PERCPU_HASH:
+ case BPF_MAP_TYPE_STRUCT_OPS:
+ case BPF_MAP_TYPE_CPUMAP:
+ case BPF_MAP_TYPE_ARENA:
+ case BPF_MAP_TYPE_INSN_ARRAY:
+ if (!bpf_token_capable(token, CAP_BPF))
+ goto put_token;
+ break;
+ case BPF_MAP_TYPE_SOCKMAP:
+ case BPF_MAP_TYPE_SOCKHASH:
+ case BPF_MAP_TYPE_DEVMAP:
+ case BPF_MAP_TYPE_DEVMAP_HASH:
+ case BPF_MAP_TYPE_XSKMAP:
+ if (!bpf_token_capable(token, CAP_NET_ADMIN))
+ goto put_token;
+ break;
+ default:
+ WARN(1, "unsupported map type %d", map_type);
+ goto put_token;
+ }
+
+ map = ops->map_alloc(attr);
+ if (IS_ERR(map)) {
+ err = PTR_ERR(map);
+ goto put_token;
+ }
+ map->ops = ops;
+ map->map_type = map_type;
+
+ err = bpf_obj_name_cpy(map->name, attr->map_name,
+ sizeof(attr->map_name));
if (err < 0)
- /* failed to allocate fd */
goto free_map;
+ preempt_disable();
+ map->cookie = gen_cookie_next(&bpf_map_cookie);
+ preempt_enable();
+
+ atomic64_set(&map->refcnt, 1);
+ atomic64_set(&map->usercnt, 1);
+ mutex_init(&map->freeze_mutex);
+ spin_lock_init(&map->owner_lock);
+
+ if (attr->btf_key_type_id || attr->btf_value_type_id ||
+ /* Even the map's value is a kernel's struct,
+ * the bpf_prog.o must have BTF to begin with
+ * to figure out the corresponding kernel's
+ * counter part. Thus, attr->btf_fd has
+ * to be valid also.
+ */
+ attr->btf_vmlinux_value_type_id) {
+ struct btf *btf;
+
+ btf = btf_get_by_fd(attr->btf_fd);
+ if (IS_ERR(btf)) {
+ err = PTR_ERR(btf);
+ goto free_map;
+ }
+ if (btf_is_kernel(btf)) {
+ btf_put(btf);
+ err = -EACCES;
+ goto free_map;
+ }
+ map->btf = btf;
+
+ if (attr->btf_value_type_id) {
+ err = map_check_btf(map, token, btf, attr->btf_key_type_id,
+ attr->btf_value_type_id);
+ if (err)
+ goto free_map;
+ }
+
+ map->btf_key_type_id = attr->btf_key_type_id;
+ map->btf_value_type_id = attr->btf_value_type_id;
+ map->btf_vmlinux_value_type_id =
+ attr->btf_vmlinux_value_type_id;
+ }
+
+ if (attr->excl_prog_hash) {
+ bpfptr_t uprog_hash = make_bpfptr(attr->excl_prog_hash, uattr.is_kernel);
+
+ if (attr->excl_prog_hash_size != SHA256_DIGEST_SIZE) {
+ err = -EINVAL;
+ goto free_map;
+ }
+
+ map->excl_prog_sha = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL);
+ if (!map->excl_prog_sha) {
+ err = -ENOMEM;
+ goto free_map;
+ }
+
+ if (copy_from_bpfptr(map->excl_prog_sha, uprog_hash, SHA256_DIGEST_SIZE)) {
+ err = -EFAULT;
+ goto free_map;
+ }
+ } else if (attr->excl_prog_hash_size) {
+ err = -EINVAL;
+ goto free_map;
+ }
+
+ err = security_bpf_map_create(map, attr, token, uattr.is_kernel);
+ if (err)
+ goto free_map_sec;
+
+ err = bpf_map_alloc_id(map);
+ if (err)
+ goto free_map_sec;
+
+ bpf_map_save_memcg(map);
+ bpf_token_put(token);
+
+ err = bpf_map_new_fd(map, f_flags);
+ if (err < 0) {
+ /* failed to allocate fd.
+ * bpf_map_put_with_uref() is needed because the above
+ * bpf_map_alloc_id() has published the map
+ * to the userspace and the userspace may
+ * have refcnt-ed it through BPF_MAP_GET_FD_BY_ID.
+ */
+ bpf_map_put_with_uref(map);
+ return err;
+ }
+
return err;
+free_map_sec:
+ security_bpf_map_free(map);
free_map:
- map->ops->map_free(map);
+ bpf_map_free(map);
+put_token:
+ bpf_token_put(token);
return err;
}
-/* if error is returned, fd is released.
- * On success caller should complete fd access with matching fdput()
- */
-struct bpf_map *bpf_map_get(struct fd f)
+void bpf_map_inc(struct bpf_map *map)
{
- struct bpf_map *map;
+ atomic64_inc(&map->refcnt);
+}
+EXPORT_SYMBOL_GPL(bpf_map_inc);
- if (!f.file)
- return ERR_PTR(-EBADF);
+void bpf_map_inc_with_uref(struct bpf_map *map)
+{
+ atomic64_inc(&map->refcnt);
+ atomic64_inc(&map->usercnt);
+}
+EXPORT_SYMBOL_GPL(bpf_map_inc_with_uref);
- if (f.file->f_op != &bpf_map_fops) {
- fdput(f);
- return ERR_PTR(-EINVAL);
- }
+struct bpf_map *bpf_map_get(u32 ufd)
+{
+ CLASS(fd, f)(ufd);
+ struct bpf_map *map = __bpf_map_get(f);
+
+ if (!IS_ERR(map))
+ bpf_map_inc(map);
+
+ return map;
+}
+EXPORT_SYMBOL_NS(bpf_map_get, "BPF_INTERNAL");
+
+struct bpf_map *bpf_map_get_with_uref(u32 ufd)
+{
+ CLASS(fd, f)(ufd);
+ struct bpf_map *map = __bpf_map_get(f);
- map = f.file->private_data;
+ if (!IS_ERR(map))
+ bpf_map_inc_with_uref(map);
return map;
}
-/* helper to convert user pointers passed inside __aligned_u64 fields */
-static void __user *u64_to_ptr(__u64 val)
+/* map_idr_lock should have been held or the map should have been
+ * protected by rcu read lock.
+ */
+struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref)
{
- return (void __user *) (unsigned long) val;
+ int refold;
+
+ refold = atomic64_fetch_add_unless(&map->refcnt, 1, 0);
+ if (!refold)
+ return ERR_PTR(-ENOENT);
+ if (uref)
+ atomic64_inc(&map->usercnt);
+
+ return map;
+}
+
+struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map)
+{
+ lockdep_assert(rcu_read_lock_held());
+ return __bpf_map_inc_not_zero(map, false);
+}
+EXPORT_SYMBOL_GPL(bpf_map_inc_not_zero);
+
+int __weak bpf_stackmap_extract(struct bpf_map *map, void *key, void *value,
+ bool delete)
+{
+ return -ENOTSUPP;
+}
+
+static void *__bpf_copy_key(void __user *ukey, u64 key_size)
+{
+ if (key_size)
+ return vmemdup_user(ukey, key_size);
+
+ if (ukey)
+ return ERR_PTR(-EINVAL);
+
+ return NULL;
+}
+
+static void *___bpf_copy_key(bpfptr_t ukey, u64 key_size)
+{
+ if (key_size)
+ return kvmemdup_bpfptr(ukey, key_size);
+
+ if (!bpfptr_is_null(ukey))
+ return ERR_PTR(-EINVAL);
+
+ return NULL;
}
/* last field in 'union bpf_attr' used by this command */
-#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value
+#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD flags
static int map_lookup_elem(union bpf_attr *attr)
{
- void __user *ukey = u64_to_ptr(attr->key);
- void __user *uvalue = u64_to_ptr(attr->value);
- int ufd = attr->map_fd;
- struct fd f = fdget(ufd);
+ void __user *ukey = u64_to_user_ptr(attr->key);
+ void __user *uvalue = u64_to_user_ptr(attr->value);
struct bpf_map *map;
- void *key, *value, *ptr;
+ void *key, *value;
+ u32 value_size;
int err;
if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
return -EINVAL;
- map = bpf_map_get(f);
+ CLASS(fd, f)(attr->map_fd);
+ map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
+ if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ))
+ return -EPERM;
- err = -ENOMEM;
- key = kmalloc(map->key_size, GFP_USER);
- if (!key)
- goto err_put;
+ err = bpf_map_check_op_flags(map, attr->flags, BPF_F_LOCK);
+ if (err)
+ return err;
- err = -EFAULT;
- if (copy_from_user(key, ukey, map->key_size) != 0)
- goto free_key;
+ key = __bpf_copy_key(ukey, map->key_size);
+ if (IS_ERR(key))
+ return PTR_ERR(key);
+
+ value_size = bpf_map_value_size(map);
err = -ENOMEM;
- value = kmalloc(map->value_size, GFP_USER);
+ value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
if (!value)
goto free_key;
- rcu_read_lock();
- ptr = map->ops->map_lookup_elem(map, key);
- if (ptr)
- memcpy(value, ptr, map->value_size);
- rcu_read_unlock();
+ if (map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
+ if (copy_from_user(value, uvalue, value_size))
+ err = -EFAULT;
+ else
+ err = bpf_map_copy_value(map, key, value, attr->flags);
+ goto free_value;
+ }
- err = -ENOENT;
- if (!ptr)
+ err = bpf_map_copy_value(map, key, value, attr->flags);
+ if (err)
goto free_value;
err = -EFAULT;
- if (copy_to_user(uvalue, value, map->value_size) != 0)
+ if (copy_to_user(uvalue, value, value_size) != 0)
goto free_value;
err = 0;
free_value:
- kfree(value);
+ kvfree(value);
free_key:
- kfree(key);
-err_put:
- fdput(f);
+ kvfree(key);
return err;
}
+
#define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags
-static int map_update_elem(union bpf_attr *attr)
+static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
{
- void __user *ukey = u64_to_ptr(attr->key);
- void __user *uvalue = u64_to_ptr(attr->value);
- int ufd = attr->map_fd;
- struct fd f = fdget(ufd);
+ bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel);
+ bpfptr_t uvalue = make_bpfptr(attr->value, uattr.is_kernel);
struct bpf_map *map;
void *key, *value;
+ u32 value_size;
int err;
if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM))
return -EINVAL;
- map = bpf_map_get(f);
+ CLASS(fd, f)(attr->map_fd);
+ map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
+ bpf_map_write_active_inc(map);
+ if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
+ err = -EPERM;
+ goto err_put;
+ }
- err = -ENOMEM;
- key = kmalloc(map->key_size, GFP_USER);
- if (!key)
+ err = bpf_map_check_op_flags(map, attr->flags, ~0);
+ if (err)
goto err_put;
- err = -EFAULT;
- if (copy_from_user(key, ukey, map->key_size) != 0)
- goto free_key;
+ key = ___bpf_copy_key(ukey, map->key_size);
+ if (IS_ERR(key)) {
+ err = PTR_ERR(key);
+ goto err_put;
+ }
- err = -ENOMEM;
- value = kmalloc(map->value_size, GFP_USER);
- if (!value)
+ value_size = bpf_map_value_size(map);
+ value = kvmemdup_bpfptr(uvalue, value_size);
+ if (IS_ERR(value)) {
+ err = PTR_ERR(value);
goto free_key;
+ }
- err = -EFAULT;
- if (copy_from_user(value, uvalue, map->value_size) != 0)
- goto free_value;
-
- /* eBPF program that use maps are running under rcu_read_lock(),
- * therefore all map accessors rely on this fact, so do the same here
- */
- rcu_read_lock();
- err = map->ops->map_update_elem(map, key, value, attr->flags);
- rcu_read_unlock();
+ err = bpf_map_update_value(map, fd_file(f), key, value, attr->flags);
+ if (!err)
+ maybe_wait_bpf_programs(map);
-free_value:
- kfree(value);
+ kvfree(value);
free_key:
- kfree(key);
+ kvfree(key);
err_put:
- fdput(f);
+ bpf_map_write_active_dec(map);
return err;
}
#define BPF_MAP_DELETE_ELEM_LAST_FIELD key
-static int map_delete_elem(union bpf_attr *attr)
+static int map_delete_elem(union bpf_attr *attr, bpfptr_t uattr)
{
- void __user *ukey = u64_to_ptr(attr->key);
- int ufd = attr->map_fd;
- struct fd f = fdget(ufd);
+ bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel);
struct bpf_map *map;
void *key;
int err;
@@ -267,27 +1840,43 @@ static int map_delete_elem(union bpf_attr *attr)
if (CHECK_ATTR(BPF_MAP_DELETE_ELEM))
return -EINVAL;
- map = bpf_map_get(f);
+ CLASS(fd, f)(attr->map_fd);
+ map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
+ bpf_map_write_active_inc(map);
+ if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
+ err = -EPERM;
+ goto err_put;
+ }
- err = -ENOMEM;
- key = kmalloc(map->key_size, GFP_USER);
- if (!key)
+ key = ___bpf_copy_key(ukey, map->key_size);
+ if (IS_ERR(key)) {
+ err = PTR_ERR(key);
goto err_put;
+ }
- err = -EFAULT;
- if (copy_from_user(key, ukey, map->key_size) != 0)
- goto free_key;
+ if (bpf_map_is_offloaded(map)) {
+ err = bpf_map_offload_delete_elem(map, key);
+ goto out;
+ } else if (IS_FD_PROG_ARRAY(map) ||
+ map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
+ /* These maps require sleepable context */
+ err = map->ops->map_delete_elem(map, key);
+ goto out;
+ }
+ bpf_disable_instrumentation();
rcu_read_lock();
err = map->ops->map_delete_elem(map, key);
rcu_read_unlock();
-
-free_key:
- kfree(key);
+ bpf_enable_instrumentation();
+ if (!err)
+ maybe_wait_bpf_programs(map);
+out:
+ kvfree(key);
err_put:
- fdput(f);
+ bpf_map_write_active_dec(map);
return err;
}
@@ -296,10 +1885,8 @@ err_put:
static int map_get_next_key(union bpf_attr *attr)
{
- void __user *ukey = u64_to_ptr(attr->key);
- void __user *unext_key = u64_to_ptr(attr->next_key);
- int ufd = attr->map_fd;
- struct fd f = fdget(ufd);
+ void __user *ukey = u64_to_user_ptr(attr->key);
+ void __user *unext_key = u64_to_user_ptr(attr->next_key);
struct bpf_map *map;
void *key, *next_key;
int err;
@@ -307,27 +1894,35 @@ static int map_get_next_key(union bpf_attr *attr)
if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY))
return -EINVAL;
- map = bpf_map_get(f);
+ CLASS(fd, f)(attr->map_fd);
+ map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
+ if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ))
+ return -EPERM;
- err = -ENOMEM;
- key = kmalloc(map->key_size, GFP_USER);
- if (!key)
- goto err_put;
-
- err = -EFAULT;
- if (copy_from_user(key, ukey, map->key_size) != 0)
- goto free_key;
+ if (ukey) {
+ key = __bpf_copy_key(ukey, map->key_size);
+ if (IS_ERR(key))
+ return PTR_ERR(key);
+ } else {
+ key = NULL;
+ }
err = -ENOMEM;
- next_key = kmalloc(map->key_size, GFP_USER);
+ next_key = kvmalloc(map->key_size, GFP_USER);
if (!next_key)
goto free_key;
+ if (bpf_map_is_offloaded(map)) {
+ err = bpf_map_offload_get_next_key(map, key, next_key);
+ goto out;
+ }
+
rcu_read_lock();
err = map->ops->map_get_next_key(map, key, next_key);
rcu_read_unlock();
+out:
if (err)
goto free_next_key;
@@ -338,88 +1933,505 @@ static int map_get_next_key(union bpf_attr *attr)
err = 0;
free_next_key:
- kfree(next_key);
+ kvfree(next_key);
free_key:
- kfree(key);
-err_put:
- fdput(f);
+ kvfree(key);
return err;
}
-static LIST_HEAD(bpf_prog_types);
-
-static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
+int generic_map_delete_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
{
- struct bpf_prog_type_list *tl;
+ void __user *keys = u64_to_user_ptr(attr->batch.keys);
+ u32 cp, max_count;
+ int err = 0;
+ void *key;
- list_for_each_entry(tl, &bpf_prog_types, list_node) {
- if (tl->type == type) {
- prog->aux->ops = tl->ops;
- prog->type = type;
- return 0;
+ if (attr->batch.elem_flags & ~BPF_F_LOCK)
+ return -EINVAL;
+
+ if ((attr->batch.elem_flags & BPF_F_LOCK) &&
+ !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
+ return -EINVAL;
+ }
+
+ max_count = attr->batch.count;
+ if (!max_count)
+ return 0;
+
+ if (put_user(0, &uattr->batch.count))
+ return -EFAULT;
+
+ key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
+ if (!key)
+ return -ENOMEM;
+
+ for (cp = 0; cp < max_count; cp++) {
+ err = -EFAULT;
+ if (copy_from_user(key, keys + cp * map->key_size,
+ map->key_size))
+ break;
+
+ if (bpf_map_is_offloaded(map)) {
+ err = bpf_map_offload_delete_elem(map, key);
+ break;
}
+
+ bpf_disable_instrumentation();
+ rcu_read_lock();
+ err = map->ops->map_delete_elem(map, key);
+ rcu_read_unlock();
+ bpf_enable_instrumentation();
+ if (err)
+ break;
+ cond_resched();
}
+ if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp)))
+ err = -EFAULT;
- return -EINVAL;
+ kvfree(key);
+
+ return err;
}
-void bpf_register_prog_type(struct bpf_prog_type_list *tl)
+int generic_map_update_batch(struct bpf_map *map, struct file *map_file,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
{
- list_add(&tl->list_node, &bpf_prog_types);
+ void __user *values = u64_to_user_ptr(attr->batch.values);
+ void __user *keys = u64_to_user_ptr(attr->batch.keys);
+ u32 value_size, cp, max_count;
+ void *key, *value;
+ int err = 0;
+
+ err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK);
+ if (err)
+ return err;
+
+ value_size = bpf_map_value_size(map);
+
+ max_count = attr->batch.count;
+ if (!max_count)
+ return 0;
+
+ if (put_user(0, &uattr->batch.count))
+ return -EFAULT;
+
+ key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
+ if (!key)
+ return -ENOMEM;
+
+ value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
+ if (!value) {
+ kvfree(key);
+ return -ENOMEM;
+ }
+
+ for (cp = 0; cp < max_count; cp++) {
+ err = -EFAULT;
+ if (copy_from_user(key, keys + cp * map->key_size,
+ map->key_size) ||
+ copy_from_user(value, values + cp * value_size, value_size))
+ break;
+
+ err = bpf_map_update_value(map, map_file, key, value,
+ attr->batch.elem_flags);
+
+ if (err)
+ break;
+ cond_resched();
+ }
+
+ if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp)))
+ err = -EFAULT;
+
+ kvfree(value);
+ kvfree(key);
+
+ return err;
}
-/* fixup insn->imm field of bpf_call instructions:
- * if (insn->imm == BPF_FUNC_map_lookup_elem)
- * insn->imm = bpf_map_lookup_elem - __bpf_call_base;
- * else if (insn->imm == BPF_FUNC_map_update_elem)
- * insn->imm = bpf_map_update_elem - __bpf_call_base;
- * else ...
- *
- * this function is called after eBPF program passed verification
- */
-static void fixup_bpf_calls(struct bpf_prog *prog)
+int generic_map_lookup_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
{
- const struct bpf_func_proto *fn;
- int i;
+ void __user *uobatch = u64_to_user_ptr(attr->batch.out_batch);
+ void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch);
+ void __user *values = u64_to_user_ptr(attr->batch.values);
+ void __user *keys = u64_to_user_ptr(attr->batch.keys);
+ void *buf, *buf_prevkey, *prev_key, *key, *value;
+ u32 value_size, cp, max_count;
+ int err;
- for (i = 0; i < prog->len; i++) {
- struct bpf_insn *insn = &prog->insnsi[i];
+ err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK);
+ if (err)
+ return err;
- if (insn->code == (BPF_JMP | BPF_CALL)) {
- /* we reach here when program has bpf_call instructions
- * and it passed bpf_check(), means that
- * ops->get_func_proto must have been supplied, check it
- */
- BUG_ON(!prog->aux->ops->get_func_proto);
+ value_size = bpf_map_value_size(map);
- fn = prog->aux->ops->get_func_proto(insn->imm);
- /* all functions that have prototype and verifier allowed
- * programs to call them, must be real in-kernel functions
- */
- BUG_ON(!fn->func);
- insn->imm = fn->func - __bpf_call_base;
+ max_count = attr->batch.count;
+ if (!max_count)
+ return 0;
+
+ if (put_user(0, &uattr->batch.count))
+ return -EFAULT;
+
+ buf_prevkey = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
+ if (!buf_prevkey)
+ return -ENOMEM;
+
+ buf = kvmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN);
+ if (!buf) {
+ kvfree(buf_prevkey);
+ return -ENOMEM;
+ }
+
+ err = -EFAULT;
+ prev_key = NULL;
+ if (ubatch && copy_from_user(buf_prevkey, ubatch, map->key_size))
+ goto free_buf;
+ key = buf;
+ value = key + map->key_size;
+ if (ubatch)
+ prev_key = buf_prevkey;
+
+ for (cp = 0; cp < max_count;) {
+ rcu_read_lock();
+ err = map->ops->map_get_next_key(map, prev_key, key);
+ rcu_read_unlock();
+ if (err)
+ break;
+ err = bpf_map_copy_value(map, key, value,
+ attr->batch.elem_flags);
+
+ if (err == -ENOENT)
+ goto next_key;
+
+ if (err)
+ goto free_buf;
+
+ if (copy_to_user(keys + cp * map->key_size, key,
+ map->key_size)) {
+ err = -EFAULT;
+ goto free_buf;
+ }
+ if (copy_to_user(values + cp * value_size, value, value_size)) {
+ err = -EFAULT;
+ goto free_buf;
}
+
+ cp++;
+next_key:
+ if (!prev_key)
+ prev_key = buf_prevkey;
+
+ swap(prev_key, key);
+ cond_resched();
}
+
+ if (err == -EFAULT)
+ goto free_buf;
+
+ if ((copy_to_user(&uattr->batch.count, &cp, sizeof(cp)) ||
+ (cp && copy_to_user(uobatch, prev_key, map->key_size))))
+ err = -EFAULT;
+
+free_buf:
+ kvfree(buf_prevkey);
+ kvfree(buf);
+ return err;
}
-/* drop refcnt on maps used by eBPF program and free auxilary data */
-static void free_used_maps(struct bpf_prog_aux *aux)
+#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD flags
+
+static int map_lookup_and_delete_elem(union bpf_attr *attr)
{
- int i;
+ void __user *ukey = u64_to_user_ptr(attr->key);
+ void __user *uvalue = u64_to_user_ptr(attr->value);
+ struct bpf_map *map;
+ void *key, *value;
+ u32 value_size;
+ int err;
+
+ if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM))
+ return -EINVAL;
+
+ if (attr->flags & ~BPF_F_LOCK)
+ return -EINVAL;
+
+ CLASS(fd, f)(attr->map_fd);
+ map = __bpf_map_get(f);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+ bpf_map_write_active_inc(map);
+ if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ) ||
+ !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
+ err = -EPERM;
+ goto err_put;
+ }
+
+ if (attr->flags &&
+ (map->map_type == BPF_MAP_TYPE_QUEUE ||
+ map->map_type == BPF_MAP_TYPE_STACK)) {
+ err = -EINVAL;
+ goto err_put;
+ }
+
+ if ((attr->flags & BPF_F_LOCK) &&
+ !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
+ err = -EINVAL;
+ goto err_put;
+ }
+
+ key = __bpf_copy_key(ukey, map->key_size);
+ if (IS_ERR(key)) {
+ err = PTR_ERR(key);
+ goto err_put;
+ }
- for (i = 0; i < aux->used_map_cnt; i++)
- bpf_map_put(aux->used_maps[i]);
+ value_size = bpf_map_value_size(map);
- kfree(aux->used_maps);
+ err = -ENOMEM;
+ value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
+ if (!value)
+ goto free_key;
+
+ err = -ENOTSUPP;
+ if (map->map_type == BPF_MAP_TYPE_QUEUE ||
+ map->map_type == BPF_MAP_TYPE_STACK) {
+ err = map->ops->map_pop_elem(map, value);
+ } else if (map->map_type == BPF_MAP_TYPE_HASH ||
+ map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_LRU_HASH ||
+ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
+ if (!bpf_map_is_offloaded(map)) {
+ bpf_disable_instrumentation();
+ rcu_read_lock();
+ err = map->ops->map_lookup_and_delete_elem(map, key, value, attr->flags);
+ rcu_read_unlock();
+ bpf_enable_instrumentation();
+ }
+ }
+
+ if (err)
+ goto free_value;
+
+ if (copy_to_user(uvalue, value, value_size) != 0) {
+ err = -EFAULT;
+ goto free_value;
+ }
+
+ err = 0;
+
+free_value:
+ kvfree(value);
+free_key:
+ kvfree(key);
+err_put:
+ bpf_map_write_active_dec(map);
+ return err;
}
-void bpf_prog_put(struct bpf_prog *prog)
+#define BPF_MAP_FREEZE_LAST_FIELD map_fd
+
+static int map_freeze(const union bpf_attr *attr)
+{
+ int err = 0;
+ struct bpf_map *map;
+
+ if (CHECK_ATTR(BPF_MAP_FREEZE))
+ return -EINVAL;
+
+ CLASS(fd, f)(attr->map_fd);
+ map = __bpf_map_get(f);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS || !IS_ERR_OR_NULL(map->record))
+ return -ENOTSUPP;
+
+ if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE))
+ return -EPERM;
+
+ mutex_lock(&map->freeze_mutex);
+ if (bpf_map_write_active(map)) {
+ err = -EBUSY;
+ goto err_put;
+ }
+ if (READ_ONCE(map->frozen)) {
+ err = -EBUSY;
+ goto err_put;
+ }
+
+ WRITE_ONCE(map->frozen, true);
+err_put:
+ mutex_unlock(&map->freeze_mutex);
+ return err;
+}
+
+static const struct bpf_prog_ops * const bpf_prog_types[] = {
+#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
+ [_id] = & _name ## _prog_ops,
+#define BPF_MAP_TYPE(_id, _ops)
+#define BPF_LINK_TYPE(_id, _name)
+#include <linux/bpf_types.h>
+#undef BPF_PROG_TYPE
+#undef BPF_MAP_TYPE
+#undef BPF_LINK_TYPE
+};
+
+static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
+{
+ const struct bpf_prog_ops *ops;
+
+ if (type >= ARRAY_SIZE(bpf_prog_types))
+ return -EINVAL;
+ type = array_index_nospec(type, ARRAY_SIZE(bpf_prog_types));
+ ops = bpf_prog_types[type];
+ if (!ops)
+ return -EINVAL;
+
+ if (!bpf_prog_is_offloaded(prog->aux))
+ prog->aux->ops = ops;
+ else
+ prog->aux->ops = &bpf_offload_prog_ops;
+ prog->type = type;
+ return 0;
+}
+
+enum bpf_audit {
+ BPF_AUDIT_LOAD,
+ BPF_AUDIT_UNLOAD,
+ BPF_AUDIT_MAX,
+};
+
+static const char * const bpf_audit_str[BPF_AUDIT_MAX] = {
+ [BPF_AUDIT_LOAD] = "LOAD",
+ [BPF_AUDIT_UNLOAD] = "UNLOAD",
+};
+
+static void bpf_audit_prog(const struct bpf_prog *prog, unsigned int op)
+{
+ struct audit_context *ctx = NULL;
+ struct audit_buffer *ab;
+
+ if (WARN_ON_ONCE(op >= BPF_AUDIT_MAX))
+ return;
+ if (audit_enabled == AUDIT_OFF)
+ return;
+ if (!in_hardirq() && !irqs_disabled())
+ ctx = audit_context();
+ ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_BPF);
+ if (unlikely(!ab))
+ return;
+ audit_log_format(ab, "prog-id=%u op=%s",
+ prog->aux->id, bpf_audit_str[op]);
+ audit_log_end(ab);
+}
+
+static int bpf_prog_alloc_id(struct bpf_prog *prog)
+{
+ int id;
+
+ idr_preload(GFP_KERNEL);
+ spin_lock_bh(&prog_idr_lock);
+ id = idr_alloc_cyclic(&prog_idr, prog, 1, INT_MAX, GFP_ATOMIC);
+ if (id > 0)
+ prog->aux->id = id;
+ spin_unlock_bh(&prog_idr_lock);
+ idr_preload_end();
+
+ /* id is in [1, INT_MAX) */
+ if (WARN_ON_ONCE(!id))
+ return -ENOSPC;
+
+ return id > 0 ? 0 : id;
+}
+
+void bpf_prog_free_id(struct bpf_prog *prog)
+{
+ unsigned long flags;
+
+ /* cBPF to eBPF migrations are currently not in the idr store.
+ * Offloaded programs are removed from the store when their device
+ * disappears - even if someone grabs an fd to them they are unusable,
+ * simply waiting for refcnt to drop to be freed.
+ */
+ if (!prog->aux->id)
+ return;
+
+ spin_lock_irqsave(&prog_idr_lock, flags);
+ idr_remove(&prog_idr, prog->aux->id);
+ prog->aux->id = 0;
+ spin_unlock_irqrestore(&prog_idr_lock, flags);
+}
+
+static void __bpf_prog_put_rcu(struct rcu_head *rcu)
+{
+ struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
+
+ kvfree(aux->func_info);
+ kfree(aux->func_info_aux);
+ free_uid(aux->user);
+ security_bpf_prog_free(aux->prog);
+ bpf_prog_free(aux->prog);
+}
+
+static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred)
{
- if (atomic_dec_and_test(&prog->aux->refcnt)) {
- free_used_maps(prog->aux);
- bpf_prog_free(prog);
+ bpf_prog_kallsyms_del_all(prog);
+ btf_put(prog->aux->btf);
+ module_put(prog->aux->mod);
+ kvfree(prog->aux->jited_linfo);
+ kvfree(prog->aux->linfo);
+ kfree(prog->aux->kfunc_tab);
+ kfree(prog->aux->ctx_arg_info);
+ if (prog->aux->attach_btf)
+ btf_put(prog->aux->attach_btf);
+
+ if (deferred) {
+ if (prog->sleepable)
+ call_rcu_tasks_trace(&prog->aux->rcu, __bpf_prog_put_rcu);
+ else
+ call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
+ } else {
+ __bpf_prog_put_rcu(&prog->aux->rcu);
}
}
+
+static void bpf_prog_put_deferred(struct work_struct *work)
+{
+ struct bpf_prog_aux *aux;
+ struct bpf_prog *prog;
+
+ aux = container_of(work, struct bpf_prog_aux, work);
+ prog = aux->prog;
+ perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0);
+ bpf_audit_prog(prog, BPF_AUDIT_UNLOAD);
+ bpf_prog_free_id(prog);
+ __bpf_prog_put_noref(prog, true);
+}
+
+static void __bpf_prog_put(struct bpf_prog *prog)
+{
+ struct bpf_prog_aux *aux = prog->aux;
+
+ if (atomic64_dec_and_test(&aux->refcnt)) {
+ if (in_hardirq() || irqs_disabled()) {
+ INIT_WORK(&aux->work, bpf_prog_put_deferred);
+ schedule_work(&aux->work);
+ } else {
+ bpf_prog_put_deferred(&aux->work);
+ }
+ }
+}
+
+void bpf_prog_put(struct bpf_prog *prog)
+{
+ __bpf_prog_put(prog);
+}
EXPORT_SYMBOL_GPL(bpf_prog_put);
static int bpf_prog_release(struct inode *inode, struct file *filp)
@@ -430,187 +2442,3824 @@ static int bpf_prog_release(struct inode *inode, struct file *filp)
return 0;
}
-static const struct file_operations bpf_prog_fops = {
- .release = bpf_prog_release,
+struct bpf_prog_kstats {
+ u64 nsecs;
+ u64 cnt;
+ u64 misses;
};
-static struct bpf_prog *get_prog(struct fd f)
+void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog)
{
- struct bpf_prog *prog;
+ struct bpf_prog_stats *stats;
+ unsigned int flags;
- if (!f.file)
- return ERR_PTR(-EBADF);
+ if (unlikely(!prog->stats))
+ return;
- if (f.file->f_op != &bpf_prog_fops) {
- fdput(f);
- return ERR_PTR(-EINVAL);
+ stats = this_cpu_ptr(prog->stats);
+ flags = u64_stats_update_begin_irqsave(&stats->syncp);
+ u64_stats_inc(&stats->misses);
+ u64_stats_update_end_irqrestore(&stats->syncp, flags);
+}
+
+static void bpf_prog_get_stats(const struct bpf_prog *prog,
+ struct bpf_prog_kstats *stats)
+{
+ u64 nsecs = 0, cnt = 0, misses = 0;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ const struct bpf_prog_stats *st;
+ unsigned int start;
+ u64 tnsecs, tcnt, tmisses;
+
+ st = per_cpu_ptr(prog->stats, cpu);
+ do {
+ start = u64_stats_fetch_begin(&st->syncp);
+ tnsecs = u64_stats_read(&st->nsecs);
+ tcnt = u64_stats_read(&st->cnt);
+ tmisses = u64_stats_read(&st->misses);
+ } while (u64_stats_fetch_retry(&st->syncp, start));
+ nsecs += tnsecs;
+ cnt += tcnt;
+ misses += tmisses;
}
+ stats->nsecs = nsecs;
+ stats->cnt = cnt;
+ stats->misses = misses;
+}
+
+#ifdef CONFIG_PROC_FS
+static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
+{
+ const struct bpf_prog *prog = filp->private_data;
+ char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
+ struct bpf_prog_kstats stats;
+
+ bpf_prog_get_stats(prog, &stats);
+ bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
+ seq_printf(m,
+ "prog_type:\t%u\n"
+ "prog_jited:\t%u\n"
+ "prog_tag:\t%s\n"
+ "memlock:\t%llu\n"
+ "prog_id:\t%u\n"
+ "run_time_ns:\t%llu\n"
+ "run_cnt:\t%llu\n"
+ "recursion_misses:\t%llu\n"
+ "verified_insns:\t%u\n",
+ prog->type,
+ prog->jited,
+ prog_tag,
+ prog->pages * 1ULL << PAGE_SHIFT,
+ prog->aux->id,
+ stats.nsecs,
+ stats.cnt,
+ stats.misses,
+ prog->aux->verified_insns);
+}
+#endif
+
+const struct file_operations bpf_prog_fops = {
+#ifdef CONFIG_PROC_FS
+ .show_fdinfo = bpf_prog_show_fdinfo,
+#endif
+ .release = bpf_prog_release,
+ .read = bpf_dummy_read,
+ .write = bpf_dummy_write,
+};
+
+int bpf_prog_new_fd(struct bpf_prog *prog)
+{
+ int ret;
+
+ ret = security_bpf_prog(prog);
+ if (ret < 0)
+ return ret;
+
+ return anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog,
+ O_RDWR | O_CLOEXEC);
+}
+
+void bpf_prog_add(struct bpf_prog *prog, int i)
+{
+ atomic64_add(i, &prog->aux->refcnt);
+}
+EXPORT_SYMBOL_GPL(bpf_prog_add);
+
+void bpf_prog_sub(struct bpf_prog *prog, int i)
+{
+ /* Only to be used for undoing previous bpf_prog_add() in some
+ * error path. We still know that another entity in our call
+ * path holds a reference to the program, thus atomic_sub() can
+ * be safely used in such cases!
+ */
+ WARN_ON(atomic64_sub_return(i, &prog->aux->refcnt) == 0);
+}
+EXPORT_SYMBOL_GPL(bpf_prog_sub);
+
+void bpf_prog_inc(struct bpf_prog *prog)
+{
+ atomic64_inc(&prog->aux->refcnt);
+}
+EXPORT_SYMBOL_GPL(bpf_prog_inc);
- prog = f.file->private_data;
+/* prog_idr_lock should have been held */
+struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog)
+{
+ int refold;
+
+ refold = atomic64_fetch_add_unless(&prog->aux->refcnt, 1, 0);
+
+ if (!refold)
+ return ERR_PTR(-ENOENT);
return prog;
}
+EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero);
-/* called by sockets/tracing/seccomp before attaching program to an event
- * pairs with bpf_prog_put()
- */
-struct bpf_prog *bpf_prog_get(u32 ufd)
+bool bpf_prog_get_ok(struct bpf_prog *prog,
+ enum bpf_prog_type *attach_type, bool attach_drv)
+{
+ /* not an attachment, just a refcount inc, always allow */
+ if (!attach_type)
+ return true;
+
+ if (prog->type != *attach_type)
+ return false;
+ if (bpf_prog_is_offloaded(prog->aux) && !attach_drv)
+ return false;
+
+ return true;
+}
+
+static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type,
+ bool attach_drv)
{
- struct fd f = fdget(ufd);
+ CLASS(fd, f)(ufd);
struct bpf_prog *prog;
- prog = get_prog(f);
+ if (fd_empty(f))
+ return ERR_PTR(-EBADF);
+ if (fd_file(f)->f_op != &bpf_prog_fops)
+ return ERR_PTR(-EINVAL);
- if (IS_ERR(prog))
- return prog;
+ prog = fd_file(f)->private_data;
+ if (!bpf_prog_get_ok(prog, attach_type, attach_drv))
+ return ERR_PTR(-EINVAL);
- atomic_inc(&prog->aux->refcnt);
- fdput(f);
+ bpf_prog_inc(prog);
return prog;
}
-EXPORT_SYMBOL_GPL(bpf_prog_get);
+
+struct bpf_prog *bpf_prog_get(u32 ufd)
+{
+ return __bpf_prog_get(ufd, NULL, false);
+}
+
+struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type,
+ bool attach_drv)
+{
+ return __bpf_prog_get(ufd, &type, attach_drv);
+}
+EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev);
+
+/* Initially all BPF programs could be loaded w/o specifying
+ * expected_attach_type. Later for some of them specifying expected_attach_type
+ * at load time became required so that program could be validated properly.
+ * Programs of types that are allowed to be loaded both w/ and w/o (for
+ * backward compatibility) expected_attach_type, should have the default attach
+ * type assigned to expected_attach_type for the latter case, so that it can be
+ * validated later at attach time.
+ *
+ * bpf_prog_load_fixup_attach_type() sets expected_attach_type in @attr if
+ * prog type requires it but has some attach types that have to be backward
+ * compatible.
+ */
+static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr)
+{
+ switch (attr->prog_type) {
+ case BPF_PROG_TYPE_CGROUP_SOCK:
+ /* Unfortunately BPF_ATTACH_TYPE_UNSPEC enumeration doesn't
+ * exist so checking for non-zero is the way to go here.
+ */
+ if (!attr->expected_attach_type)
+ attr->expected_attach_type =
+ BPF_CGROUP_INET_SOCK_CREATE;
+ break;
+ case BPF_PROG_TYPE_SK_REUSEPORT:
+ if (!attr->expected_attach_type)
+ attr->expected_attach_type =
+ BPF_SK_REUSEPORT_SELECT;
+ break;
+ }
+}
+
+static int
+bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
+ enum bpf_attach_type expected_attach_type,
+ struct btf *attach_btf, u32 btf_id,
+ struct bpf_prog *dst_prog)
+{
+ if (btf_id) {
+ if (btf_id > BTF_MAX_TYPE)
+ return -EINVAL;
+
+ if (!attach_btf && !dst_prog)
+ return -EINVAL;
+
+ switch (prog_type) {
+ case BPF_PROG_TYPE_TRACING:
+ case BPF_PROG_TYPE_LSM:
+ case BPF_PROG_TYPE_STRUCT_OPS:
+ case BPF_PROG_TYPE_EXT:
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+
+ if (attach_btf && (!btf_id || dst_prog))
+ return -EINVAL;
+
+ if (dst_prog && prog_type != BPF_PROG_TYPE_TRACING &&
+ prog_type != BPF_PROG_TYPE_EXT)
+ return -EINVAL;
+
+ switch (prog_type) {
+ case BPF_PROG_TYPE_CGROUP_SOCK:
+ switch (expected_attach_type) {
+ case BPF_CGROUP_INET_SOCK_CREATE:
+ case BPF_CGROUP_INET_SOCK_RELEASE:
+ case BPF_CGROUP_INET4_POST_BIND:
+ case BPF_CGROUP_INET6_POST_BIND:
+ return 0;
+ default:
+ return -EINVAL;
+ }
+ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
+ switch (expected_attach_type) {
+ case BPF_CGROUP_INET4_BIND:
+ case BPF_CGROUP_INET6_BIND:
+ case BPF_CGROUP_INET4_CONNECT:
+ case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_UNIX_CONNECT:
+ case BPF_CGROUP_INET4_GETPEERNAME:
+ case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_UNIX_GETPEERNAME:
+ case BPF_CGROUP_INET4_GETSOCKNAME:
+ case BPF_CGROUP_INET6_GETSOCKNAME:
+ case BPF_CGROUP_UNIX_GETSOCKNAME:
+ case BPF_CGROUP_UDP4_SENDMSG:
+ case BPF_CGROUP_UDP6_SENDMSG:
+ case BPF_CGROUP_UNIX_SENDMSG:
+ case BPF_CGROUP_UDP4_RECVMSG:
+ case BPF_CGROUP_UDP6_RECVMSG:
+ case BPF_CGROUP_UNIX_RECVMSG:
+ return 0;
+ default:
+ return -EINVAL;
+ }
+ case BPF_PROG_TYPE_CGROUP_SKB:
+ switch (expected_attach_type) {
+ case BPF_CGROUP_INET_INGRESS:
+ case BPF_CGROUP_INET_EGRESS:
+ return 0;
+ default:
+ return -EINVAL;
+ }
+ case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+ switch (expected_attach_type) {
+ case BPF_CGROUP_SETSOCKOPT:
+ case BPF_CGROUP_GETSOCKOPT:
+ return 0;
+ default:
+ return -EINVAL;
+ }
+ case BPF_PROG_TYPE_SK_LOOKUP:
+ if (expected_attach_type == BPF_SK_LOOKUP)
+ return 0;
+ return -EINVAL;
+ case BPF_PROG_TYPE_SK_REUSEPORT:
+ switch (expected_attach_type) {
+ case BPF_SK_REUSEPORT_SELECT:
+ case BPF_SK_REUSEPORT_SELECT_OR_MIGRATE:
+ return 0;
+ default:
+ return -EINVAL;
+ }
+ case BPF_PROG_TYPE_NETFILTER:
+ if (expected_attach_type == BPF_NETFILTER)
+ return 0;
+ return -EINVAL;
+ case BPF_PROG_TYPE_SYSCALL:
+ case BPF_PROG_TYPE_EXT:
+ if (expected_attach_type)
+ return -EINVAL;
+ fallthrough;
+ default:
+ return 0;
+ }
+}
+
+static bool is_net_admin_prog_type(enum bpf_prog_type prog_type)
+{
+ switch (prog_type) {
+ case BPF_PROG_TYPE_SCHED_CLS:
+ case BPF_PROG_TYPE_SCHED_ACT:
+ case BPF_PROG_TYPE_XDP:
+ case BPF_PROG_TYPE_LWT_IN:
+ case BPF_PROG_TYPE_LWT_OUT:
+ case BPF_PROG_TYPE_LWT_XMIT:
+ case BPF_PROG_TYPE_LWT_SEG6LOCAL:
+ case BPF_PROG_TYPE_SK_SKB:
+ case BPF_PROG_TYPE_SK_MSG:
+ case BPF_PROG_TYPE_FLOW_DISSECTOR:
+ case BPF_PROG_TYPE_CGROUP_DEVICE:
+ case BPF_PROG_TYPE_CGROUP_SOCK:
+ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
+ case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+ case BPF_PROG_TYPE_CGROUP_SYSCTL:
+ case BPF_PROG_TYPE_SOCK_OPS:
+ case BPF_PROG_TYPE_EXT: /* extends any prog */
+ case BPF_PROG_TYPE_NETFILTER:
+ return true;
+ case BPF_PROG_TYPE_CGROUP_SKB:
+ /* always unpriv */
+ case BPF_PROG_TYPE_SK_REUSEPORT:
+ /* equivalent to SOCKET_FILTER. need CAP_BPF only */
+ default:
+ return false;
+ }
+}
+
+static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
+{
+ switch (prog_type) {
+ case BPF_PROG_TYPE_KPROBE:
+ case BPF_PROG_TYPE_TRACEPOINT:
+ case BPF_PROG_TYPE_PERF_EVENT:
+ case BPF_PROG_TYPE_RAW_TRACEPOINT:
+ case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
+ case BPF_PROG_TYPE_TRACING:
+ case BPF_PROG_TYPE_LSM:
+ case BPF_PROG_TYPE_STRUCT_OPS: /* has access to struct sock */
+ case BPF_PROG_TYPE_EXT: /* extends any prog */
+ return true;
+ default:
+ return false;
+ }
+}
+
+static int bpf_prog_verify_signature(struct bpf_prog *prog, union bpf_attr *attr,
+ bool is_kernel)
+{
+ bpfptr_t usig = make_bpfptr(attr->signature, is_kernel);
+ struct bpf_dynptr_kern sig_ptr, insns_ptr;
+ struct bpf_key *key = NULL;
+ void *sig;
+ int err = 0;
+
+ if (system_keyring_id_check(attr->keyring_id) == 0)
+ key = bpf_lookup_system_key(attr->keyring_id);
+ else
+ key = bpf_lookup_user_key(attr->keyring_id, 0);
+
+ if (!key)
+ return -EINVAL;
+
+ sig = kvmemdup_bpfptr(usig, attr->signature_size);
+ if (IS_ERR(sig)) {
+ bpf_key_put(key);
+ return -ENOMEM;
+ }
+
+ bpf_dynptr_init(&sig_ptr, sig, BPF_DYNPTR_TYPE_LOCAL, 0,
+ attr->signature_size);
+ bpf_dynptr_init(&insns_ptr, prog->insnsi, BPF_DYNPTR_TYPE_LOCAL, 0,
+ prog->len * sizeof(struct bpf_insn));
+
+ err = bpf_verify_pkcs7_signature((struct bpf_dynptr *)&insns_ptr,
+ (struct bpf_dynptr *)&sig_ptr, key);
+
+ bpf_key_put(key);
+ kvfree(sig);
+ return err;
+}
+
+static int bpf_prog_mark_insn_arrays_ready(struct bpf_prog *prog)
+{
+ int err;
+ int i;
+
+ for (i = 0; i < prog->aux->used_map_cnt; i++) {
+ if (prog->aux->used_maps[i]->map_type != BPF_MAP_TYPE_INSN_ARRAY)
+ continue;
+
+ err = bpf_insn_array_ready(prog->aux->used_maps[i]);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
/* last field in 'union bpf_attr' used by this command */
-#define BPF_PROG_LOAD_LAST_FIELD kern_version
+#define BPF_PROG_LOAD_LAST_FIELD keyring_id
-static int bpf_prog_load(union bpf_attr *attr)
+static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
{
enum bpf_prog_type type = attr->prog_type;
- struct bpf_prog *prog;
+ struct bpf_prog *prog, *dst_prog = NULL;
+ struct btf *attach_btf = NULL;
+ struct bpf_token *token = NULL;
+ bool bpf_cap;
int err;
char license[128];
- bool is_gpl;
if (CHECK_ATTR(BPF_PROG_LOAD))
return -EINVAL;
- /* copy eBPF program license from user space */
- if (strncpy_from_user(license, u64_to_ptr(attr->license),
- sizeof(license) - 1) < 0)
- return -EFAULT;
- license[sizeof(license) - 1] = 0;
+ if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT |
+ BPF_F_ANY_ALIGNMENT |
+ BPF_F_TEST_STATE_FREQ |
+ BPF_F_SLEEPABLE |
+ BPF_F_TEST_RND_HI32 |
+ BPF_F_XDP_HAS_FRAGS |
+ BPF_F_XDP_DEV_BOUND_ONLY |
+ BPF_F_TEST_REG_INVARIANTS |
+ BPF_F_TOKEN_FD))
+ return -EINVAL;
- /* eBPF programs must be GPL compatible to use GPL-ed functions */
- is_gpl = license_is_gpl_compatible(license);
+ bpf_prog_load_fixup_attach_type(attr);
- if (attr->insn_cnt >= BPF_MAXINSNS)
- return -EINVAL;
+ if (attr->prog_flags & BPF_F_TOKEN_FD) {
+ token = bpf_token_get_from_fd(attr->prog_token_fd);
+ if (IS_ERR(token))
+ return PTR_ERR(token);
+ /* if current token doesn't grant prog loading permissions,
+ * then we can't use this token, so ignore it and rely on
+ * system-wide capabilities checks
+ */
+ if (!bpf_token_allow_cmd(token, BPF_PROG_LOAD) ||
+ !bpf_token_allow_prog_type(token, attr->prog_type,
+ attr->expected_attach_type)) {
+ bpf_token_put(token);
+ token = NULL;
+ }
+ }
- if (type == BPF_PROG_TYPE_KPROBE &&
- attr->kern_version != LINUX_VERSION_CODE)
- return -EINVAL;
+ bpf_cap = bpf_token_capable(token, CAP_BPF);
+ err = -EPERM;
+
+ if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
+ (attr->prog_flags & BPF_F_ANY_ALIGNMENT) &&
+ !bpf_cap)
+ goto put_token;
+
+ /* Intent here is for unprivileged_bpf_disabled to block BPF program
+ * creation for unprivileged users; other actions depend
+ * on fd availability and access to bpffs, so are dependent on
+ * object creation success. Even with unprivileged BPF disabled,
+ * capability checks are still carried out for these
+ * and other operations.
+ */
+ if (sysctl_unprivileged_bpf_disabled && !bpf_cap)
+ goto put_token;
+
+ if (attr->insn_cnt == 0 ||
+ attr->insn_cnt > (bpf_cap ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) {
+ err = -E2BIG;
+ goto put_token;
+ }
+ if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
+ type != BPF_PROG_TYPE_CGROUP_SKB &&
+ !bpf_cap)
+ goto put_token;
+
+ if (is_net_admin_prog_type(type) && !bpf_token_capable(token, CAP_NET_ADMIN))
+ goto put_token;
+ if (is_perfmon_prog_type(type) && !bpf_token_capable(token, CAP_PERFMON))
+ goto put_token;
+
+ /* attach_prog_fd/attach_btf_obj_fd can specify fd of either bpf_prog
+ * or btf, we need to check which one it is
+ */
+ if (attr->attach_prog_fd) {
+ dst_prog = bpf_prog_get(attr->attach_prog_fd);
+ if (IS_ERR(dst_prog)) {
+ dst_prog = NULL;
+ attach_btf = btf_get_by_fd(attr->attach_btf_obj_fd);
+ if (IS_ERR(attach_btf)) {
+ err = -EINVAL;
+ goto put_token;
+ }
+ if (!btf_is_kernel(attach_btf)) {
+ /* attaching through specifying bpf_prog's BTF
+ * objects directly might be supported eventually
+ */
+ btf_put(attach_btf);
+ err = -ENOTSUPP;
+ goto put_token;
+ }
+ }
+ } else if (attr->attach_btf_id) {
+ /* fall back to vmlinux BTF, if BTF type ID is specified */
+ attach_btf = bpf_get_btf_vmlinux();
+ if (IS_ERR(attach_btf)) {
+ err = PTR_ERR(attach_btf);
+ goto put_token;
+ }
+ if (!attach_btf) {
+ err = -EINVAL;
+ goto put_token;
+ }
+ btf_get(attach_btf);
+ }
+
+ if (bpf_prog_load_check_attach(type, attr->expected_attach_type,
+ attach_btf, attr->attach_btf_id,
+ dst_prog)) {
+ if (dst_prog)
+ bpf_prog_put(dst_prog);
+ if (attach_btf)
+ btf_put(attach_btf);
+ err = -EINVAL;
+ goto put_token;
+ }
/* plain bpf_prog allocation */
prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
- if (!prog)
- return -ENOMEM;
+ if (!prog) {
+ if (dst_prog)
+ bpf_prog_put(dst_prog);
+ if (attach_btf)
+ btf_put(attach_btf);
+ err = -EINVAL;
+ goto put_token;
+ }
+
+ prog->expected_attach_type = attr->expected_attach_type;
+ prog->sleepable = !!(attr->prog_flags & BPF_F_SLEEPABLE);
+ prog->aux->attach_btf = attach_btf;
+ prog->aux->attach_btf_id = attr->attach_btf_id;
+ prog->aux->dst_prog = dst_prog;
+ prog->aux->dev_bound = !!attr->prog_ifindex;
+ prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS;
+ /* move token into prog->aux, reuse taken refcnt */
+ prog->aux->token = token;
+ token = NULL;
+
+ prog->aux->user = get_current_user();
prog->len = attr->insn_cnt;
err = -EFAULT;
- if (copy_from_user(prog->insns, u64_to_ptr(attr->insns),
- prog->len * sizeof(struct bpf_insn)) != 0)
+ if (copy_from_bpfptr(prog->insns,
+ make_bpfptr(attr->insns, uattr.is_kernel),
+ bpf_prog_insn_size(prog)) != 0)
+ goto free_prog;
+ /* copy eBPF program license from user space */
+ if (strncpy_from_bpfptr(license,
+ make_bpfptr(attr->license, uattr.is_kernel),
+ sizeof(license) - 1) < 0)
goto free_prog;
+ license[sizeof(license) - 1] = 0;
+
+ /* eBPF programs must be GPL compatible to use GPL-ed functions */
+ prog->gpl_compatible = license_is_gpl_compatible(license) ? 1 : 0;
+
+ if (attr->signature) {
+ err = bpf_prog_verify_signature(prog, attr, uattr.is_kernel);
+ if (err)
+ goto free_prog;
+ }
prog->orig_prog = NULL;
- prog->jited = false;
+ prog->jited = 0;
+
+ atomic64_set(&prog->aux->refcnt, 1);
+
+ if (bpf_prog_is_dev_bound(prog->aux)) {
+ err = bpf_prog_dev_bound_init(prog, attr);
+ if (err)
+ goto free_prog;
+ }
- atomic_set(&prog->aux->refcnt, 1);
- prog->gpl_compatible = is_gpl;
+ if (type == BPF_PROG_TYPE_EXT && dst_prog &&
+ bpf_prog_is_dev_bound(dst_prog->aux)) {
+ err = bpf_prog_dev_bound_inherit(prog, dst_prog);
+ if (err)
+ goto free_prog;
+ }
+
+ /*
+ * Bookkeeping for managing the program attachment chain.
+ *
+ * It might be tempting to set attach_tracing_prog flag at the attachment
+ * time, but this will not prevent from loading bunch of tracing prog
+ * first, then attach them one to another.
+ *
+ * The flag attach_tracing_prog is set for the whole program lifecycle, and
+ * doesn't have to be cleared in bpf_tracing_link_release, since tracing
+ * programs cannot change attachment target.
+ */
+ if (type == BPF_PROG_TYPE_TRACING && dst_prog &&
+ dst_prog->type == BPF_PROG_TYPE_TRACING) {
+ prog->aux->attach_tracing_prog = true;
+ }
/* find program type: socket_filter vs tracing_filter */
err = find_prog_type(type, prog);
if (err < 0)
goto free_prog;
+ prog->aux->load_time = ktime_get_boottime_ns();
+ err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name,
+ sizeof(attr->prog_name));
+ if (err < 0)
+ goto free_prog;
+
+ err = security_bpf_prog_load(prog, attr, token, uattr.is_kernel);
+ if (err)
+ goto free_prog_sec;
+
/* run eBPF verifier */
- err = bpf_check(&prog, attr);
+ err = bpf_check(&prog, attr, uattr, uattr_size);
if (err < 0)
goto free_used_maps;
- /* fixup BPF_CALL->imm field */
- fixup_bpf_calls(prog);
-
- /* eBPF program is ready to be JITed */
- bpf_prog_select_runtime(prog);
+ prog = bpf_prog_select_runtime(prog, &err);
+ if (err < 0)
+ goto free_used_maps;
- err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC);
+ err = bpf_prog_mark_insn_arrays_ready(prog);
if (err < 0)
- /* failed to allocate fd */
goto free_used_maps;
+ err = bpf_prog_alloc_id(prog);
+ if (err)
+ goto free_used_maps;
+
+ /* Upon success of bpf_prog_alloc_id(), the BPF prog is
+ * effectively publicly exposed. However, retrieving via
+ * bpf_prog_get_fd_by_id() will take another reference,
+ * therefore it cannot be gone underneath us.
+ *
+ * Only for the time /after/ successful bpf_prog_new_fd()
+ * and before returning to userspace, we might just hold
+ * one reference and any parallel close on that fd could
+ * rip everything out. Hence, below notifications must
+ * happen before bpf_prog_new_fd().
+ *
+ * Also, any failure handling from this point onwards must
+ * be using bpf_prog_put() given the program is exposed.
+ */
+ bpf_prog_kallsyms_add(prog);
+ perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0);
+ bpf_audit_prog(prog, BPF_AUDIT_LOAD);
+
+ err = bpf_prog_new_fd(prog);
+ if (err < 0)
+ bpf_prog_put(prog);
return err;
free_used_maps:
- free_used_maps(prog->aux);
+ /* In case we have subprogs, we need to wait for a grace
+ * period before we can tear down JIT memory since symbols
+ * are already exposed under kallsyms.
+ */
+ __bpf_prog_put_noref(prog, prog->aux->real_func_cnt);
+ return err;
+
+free_prog_sec:
+ security_bpf_prog_free(prog);
free_prog:
+ free_uid(prog->aux->user);
+ if (prog->aux->attach_btf)
+ btf_put(prog->aux->attach_btf);
bpf_prog_free(prog);
+put_token:
+ bpf_token_put(token);
return err;
}
-SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
+#define BPF_OBJ_LAST_FIELD path_fd
+
+static int bpf_obj_pin(const union bpf_attr *attr)
+{
+ int path_fd;
+
+ if (CHECK_ATTR(BPF_OBJ) || attr->file_flags & ~BPF_F_PATH_FD)
+ return -EINVAL;
+
+ /* path_fd has to be accompanied by BPF_F_PATH_FD flag */
+ if (!(attr->file_flags & BPF_F_PATH_FD) && attr->path_fd)
+ return -EINVAL;
+
+ path_fd = attr->file_flags & BPF_F_PATH_FD ? attr->path_fd : AT_FDCWD;
+ return bpf_obj_pin_user(attr->bpf_fd, path_fd,
+ u64_to_user_ptr(attr->pathname));
+}
+
+static int bpf_obj_get(const union bpf_attr *attr)
+{
+ int path_fd;
+
+ if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0 ||
+ attr->file_flags & ~(BPF_OBJ_FLAG_MASK | BPF_F_PATH_FD))
+ return -EINVAL;
+
+ /* path_fd has to be accompanied by BPF_F_PATH_FD flag */
+ if (!(attr->file_flags & BPF_F_PATH_FD) && attr->path_fd)
+ return -EINVAL;
+
+ path_fd = attr->file_flags & BPF_F_PATH_FD ? attr->path_fd : AT_FDCWD;
+ return bpf_obj_get_user(path_fd, u64_to_user_ptr(attr->pathname),
+ attr->file_flags);
+}
+
+/* bpf_link_init_sleepable() allows to specify whether BPF link itself has
+ * "sleepable" semantics, which normally would mean that BPF link's attach
+ * hook can dereference link or link's underlying program for some time after
+ * detachment due to RCU Tasks Trace-based lifetime protection scheme.
+ * BPF program itself can be non-sleepable, yet, because it's transitively
+ * reachable through BPF link, its freeing has to be delayed until after RCU
+ * Tasks Trace GP.
+ */
+void bpf_link_init_sleepable(struct bpf_link *link, enum bpf_link_type type,
+ const struct bpf_link_ops *ops, struct bpf_prog *prog,
+ enum bpf_attach_type attach_type, bool sleepable)
+{
+ WARN_ON(ops->dealloc && ops->dealloc_deferred);
+ atomic64_set(&link->refcnt, 1);
+ link->type = type;
+ link->sleepable = sleepable;
+ link->id = 0;
+ link->ops = ops;
+ link->prog = prog;
+ link->attach_type = attach_type;
+}
+
+void bpf_link_init(struct bpf_link *link, enum bpf_link_type type,
+ const struct bpf_link_ops *ops, struct bpf_prog *prog,
+ enum bpf_attach_type attach_type)
+{
+ bpf_link_init_sleepable(link, type, ops, prog, attach_type, false);
+}
+
+static void bpf_link_free_id(int id)
+{
+ if (!id)
+ return;
+
+ spin_lock_bh(&link_idr_lock);
+ idr_remove(&link_idr, id);
+ spin_unlock_bh(&link_idr_lock);
+}
+
+/* Clean up bpf_link and corresponding anon_inode file and FD. After
+ * anon_inode is created, bpf_link can't be just kfree()'d due to deferred
+ * anon_inode's release() call. This helper marks bpf_link as
+ * defunct, releases anon_inode file and puts reserved FD. bpf_prog's refcnt
+ * is not decremented, it's the responsibility of a calling code that failed
+ * to complete bpf_link initialization.
+ * This helper eventually calls link's dealloc callback, but does not call
+ * link's release callback.
+ */
+void bpf_link_cleanup(struct bpf_link_primer *primer)
+{
+ primer->link->prog = NULL;
+ bpf_link_free_id(primer->id);
+ fput(primer->file);
+ put_unused_fd(primer->fd);
+}
+
+void bpf_link_inc(struct bpf_link *link)
+{
+ atomic64_inc(&link->refcnt);
+}
+
+static void bpf_link_dealloc(struct bpf_link *link)
+{
+ /* now that we know that bpf_link itself can't be reached, put underlying BPF program */
+ if (link->prog)
+ bpf_prog_put(link->prog);
+
+ /* free bpf_link and its containing memory */
+ if (link->ops->dealloc_deferred)
+ link->ops->dealloc_deferred(link);
+ else
+ link->ops->dealloc(link);
+}
+
+static void bpf_link_defer_dealloc_rcu_gp(struct rcu_head *rcu)
{
- union bpf_attr attr = {};
+ struct bpf_link *link = container_of(rcu, struct bpf_link, rcu);
+
+ bpf_link_dealloc(link);
+}
+
+static void bpf_link_defer_dealloc_mult_rcu_gp(struct rcu_head *rcu)
+{
+ if (rcu_trace_implies_rcu_gp())
+ bpf_link_defer_dealloc_rcu_gp(rcu);
+ else
+ call_rcu(rcu, bpf_link_defer_dealloc_rcu_gp);
+}
+
+/* bpf_link_free is guaranteed to be called from process context */
+static void bpf_link_free(struct bpf_link *link)
+{
+ const struct bpf_link_ops *ops = link->ops;
+
+ bpf_link_free_id(link->id);
+ /* detach BPF program, clean up used resources */
+ if (link->prog)
+ ops->release(link);
+ if (ops->dealloc_deferred) {
+ /* Schedule BPF link deallocation, which will only then
+ * trigger putting BPF program refcount.
+ * If underlying BPF program is sleepable or BPF link's target
+ * attach hookpoint is sleepable or otherwise requires RCU GPs
+ * to ensure link and its underlying BPF program is not
+ * reachable anymore, we need to first wait for RCU tasks
+ * trace sync, and then go through "classic" RCU grace period
+ */
+ if (link->sleepable || (link->prog && link->prog->sleepable))
+ call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_mult_rcu_gp);
+ else
+ call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp);
+ } else if (ops->dealloc) {
+ bpf_link_dealloc(link);
+ }
+}
+
+static void bpf_link_put_deferred(struct work_struct *work)
+{
+ struct bpf_link *link = container_of(work, struct bpf_link, work);
+
+ bpf_link_free(link);
+}
+
+/* bpf_link_put might be called from atomic context. It needs to be called
+ * from sleepable context in order to acquire sleeping locks during the process.
+ */
+void bpf_link_put(struct bpf_link *link)
+{
+ if (!atomic64_dec_and_test(&link->refcnt))
+ return;
+
+ INIT_WORK(&link->work, bpf_link_put_deferred);
+ schedule_work(&link->work);
+}
+EXPORT_SYMBOL(bpf_link_put);
+
+static void bpf_link_put_direct(struct bpf_link *link)
+{
+ if (!atomic64_dec_and_test(&link->refcnt))
+ return;
+ bpf_link_free(link);
+}
+
+static int bpf_link_release(struct inode *inode, struct file *filp)
+{
+ struct bpf_link *link = filp->private_data;
+
+ bpf_link_put_direct(link);
+ return 0;
+}
+
+#ifdef CONFIG_PROC_FS
+#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
+#define BPF_MAP_TYPE(_id, _ops)
+#define BPF_LINK_TYPE(_id, _name) [_id] = #_name,
+static const char *bpf_link_type_strs[] = {
+ [BPF_LINK_TYPE_UNSPEC] = "<invalid>",
+#include <linux/bpf_types.h>
+};
+#undef BPF_PROG_TYPE
+#undef BPF_MAP_TYPE
+#undef BPF_LINK_TYPE
+
+static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp)
+{
+ const struct bpf_link *link = filp->private_data;
+ const struct bpf_prog *prog = link->prog;
+ enum bpf_link_type type = link->type;
+ char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
+
+ if (type < ARRAY_SIZE(bpf_link_type_strs) && bpf_link_type_strs[type]) {
+ if (link->type == BPF_LINK_TYPE_KPROBE_MULTI)
+ seq_printf(m, "link_type:\t%s\n", link->flags == BPF_F_KPROBE_MULTI_RETURN ?
+ "kretprobe_multi" : "kprobe_multi");
+ else if (link->type == BPF_LINK_TYPE_UPROBE_MULTI)
+ seq_printf(m, "link_type:\t%s\n", link->flags == BPF_F_UPROBE_MULTI_RETURN ?
+ "uretprobe_multi" : "uprobe_multi");
+ else
+ seq_printf(m, "link_type:\t%s\n", bpf_link_type_strs[type]);
+ } else {
+ WARN_ONCE(1, "missing BPF_LINK_TYPE(...) for link type %u\n", type);
+ seq_printf(m, "link_type:\t<%u>\n", type);
+ }
+ seq_printf(m, "link_id:\t%u\n", link->id);
+
+ if (prog) {
+ bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
+ seq_printf(m,
+ "prog_tag:\t%s\n"
+ "prog_id:\t%u\n",
+ prog_tag,
+ prog->aux->id);
+ }
+ if (link->ops->show_fdinfo)
+ link->ops->show_fdinfo(link, m);
+}
+#endif
+
+static __poll_t bpf_link_poll(struct file *file, struct poll_table_struct *pts)
+{
+ struct bpf_link *link = file->private_data;
+
+ return link->ops->poll(file, pts);
+}
+
+static const struct file_operations bpf_link_fops = {
+#ifdef CONFIG_PROC_FS
+ .show_fdinfo = bpf_link_show_fdinfo,
+#endif
+ .release = bpf_link_release,
+ .read = bpf_dummy_read,
+ .write = bpf_dummy_write,
+};
+
+static const struct file_operations bpf_link_fops_poll = {
+#ifdef CONFIG_PROC_FS
+ .show_fdinfo = bpf_link_show_fdinfo,
+#endif
+ .release = bpf_link_release,
+ .read = bpf_dummy_read,
+ .write = bpf_dummy_write,
+ .poll = bpf_link_poll,
+};
+
+static int bpf_link_alloc_id(struct bpf_link *link)
+{
+ int id;
+
+ idr_preload(GFP_KERNEL);
+ spin_lock_bh(&link_idr_lock);
+ id = idr_alloc_cyclic(&link_idr, link, 1, INT_MAX, GFP_ATOMIC);
+ spin_unlock_bh(&link_idr_lock);
+ idr_preload_end();
+
+ return id;
+}
+
+/* Prepare bpf_link to be exposed to user-space by allocating anon_inode file,
+ * reserving unused FD and allocating ID from link_idr. This is to be paired
+ * with bpf_link_settle() to install FD and ID and expose bpf_link to
+ * user-space, if bpf_link is successfully attached. If not, bpf_link and
+ * pre-allocated resources are to be freed with bpf_cleanup() call. All the
+ * transient state is passed around in struct bpf_link_primer.
+ * This is preferred way to create and initialize bpf_link, especially when
+ * there are complicated and expensive operations in between creating bpf_link
+ * itself and attaching it to BPF hook. By using bpf_link_prime() and
+ * bpf_link_settle() kernel code using bpf_link doesn't have to perform
+ * expensive (and potentially failing) roll back operations in a rare case
+ * that file, FD, or ID can't be allocated.
+ */
+int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer)
+{
+ struct file *file;
+ int fd, id;
+
+ fd = get_unused_fd_flags(O_CLOEXEC);
+ if (fd < 0)
+ return fd;
+
+
+ id = bpf_link_alloc_id(link);
+ if (id < 0) {
+ put_unused_fd(fd);
+ return id;
+ }
+
+ file = anon_inode_getfile("bpf_link",
+ link->ops->poll ? &bpf_link_fops_poll : &bpf_link_fops,
+ link, O_CLOEXEC);
+ if (IS_ERR(file)) {
+ bpf_link_free_id(id);
+ put_unused_fd(fd);
+ return PTR_ERR(file);
+ }
+
+ primer->link = link;
+ primer->file = file;
+ primer->fd = fd;
+ primer->id = id;
+ return 0;
+}
+
+int bpf_link_settle(struct bpf_link_primer *primer)
+{
+ /* make bpf_link fetchable by ID */
+ spin_lock_bh(&link_idr_lock);
+ primer->link->id = primer->id;
+ spin_unlock_bh(&link_idr_lock);
+ /* make bpf_link fetchable by FD */
+ fd_install(primer->fd, primer->file);
+ /* pass through installed FD */
+ return primer->fd;
+}
+
+int bpf_link_new_fd(struct bpf_link *link)
+{
+ return anon_inode_getfd("bpf-link",
+ link->ops->poll ? &bpf_link_fops_poll : &bpf_link_fops,
+ link, O_CLOEXEC);
+}
+
+struct bpf_link *bpf_link_get_from_fd(u32 ufd)
+{
+ CLASS(fd, f)(ufd);
+ struct bpf_link *link;
+
+ if (fd_empty(f))
+ return ERR_PTR(-EBADF);
+ if (fd_file(f)->f_op != &bpf_link_fops && fd_file(f)->f_op != &bpf_link_fops_poll)
+ return ERR_PTR(-EINVAL);
+
+ link = fd_file(f)->private_data;
+ bpf_link_inc(link);
+ return link;
+}
+EXPORT_SYMBOL_NS(bpf_link_get_from_fd, "BPF_INTERNAL");
+
+static void bpf_tracing_link_release(struct bpf_link *link)
+{
+ struct bpf_tracing_link *tr_link =
+ container_of(link, struct bpf_tracing_link, link.link);
+
+ WARN_ON_ONCE(bpf_trampoline_unlink_prog(&tr_link->link,
+ tr_link->trampoline,
+ tr_link->tgt_prog));
+
+ bpf_trampoline_put(tr_link->trampoline);
+
+ /* tgt_prog is NULL if target is a kernel function */
+ if (tr_link->tgt_prog)
+ bpf_prog_put(tr_link->tgt_prog);
+}
+
+static void bpf_tracing_link_dealloc(struct bpf_link *link)
+{
+ struct bpf_tracing_link *tr_link =
+ container_of(link, struct bpf_tracing_link, link.link);
+
+ kfree(tr_link);
+}
+
+static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link,
+ struct seq_file *seq)
+{
+ struct bpf_tracing_link *tr_link =
+ container_of(link, struct bpf_tracing_link, link.link);
+ u32 target_btf_id, target_obj_id;
+
+ bpf_trampoline_unpack_key(tr_link->trampoline->key,
+ &target_obj_id, &target_btf_id);
+ seq_printf(seq,
+ "attach_type:\t%d\n"
+ "target_obj_id:\t%u\n"
+ "target_btf_id:\t%u\n"
+ "cookie:\t%llu\n",
+ link->attach_type,
+ target_obj_id,
+ target_btf_id,
+ tr_link->link.cookie);
+}
+
+static int bpf_tracing_link_fill_link_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ struct bpf_tracing_link *tr_link =
+ container_of(link, struct bpf_tracing_link, link.link);
+
+ info->tracing.attach_type = link->attach_type;
+ info->tracing.cookie = tr_link->link.cookie;
+ bpf_trampoline_unpack_key(tr_link->trampoline->key,
+ &info->tracing.target_obj_id,
+ &info->tracing.target_btf_id);
+
+ return 0;
+}
+
+static const struct bpf_link_ops bpf_tracing_link_lops = {
+ .release = bpf_tracing_link_release,
+ .dealloc = bpf_tracing_link_dealloc,
+ .show_fdinfo = bpf_tracing_link_show_fdinfo,
+ .fill_link_info = bpf_tracing_link_fill_link_info,
+};
+
+static int bpf_tracing_prog_attach(struct bpf_prog *prog,
+ int tgt_prog_fd,
+ u32 btf_id,
+ u64 bpf_cookie,
+ enum bpf_attach_type attach_type)
+{
+ struct bpf_link_primer link_primer;
+ struct bpf_prog *tgt_prog = NULL;
+ struct bpf_trampoline *tr = NULL;
+ struct bpf_tracing_link *link;
+ u64 key = 0;
int err;
- /* the syscall is limited to root temporarily. This restriction will be
- * lifted when security audit is clean. Note that eBPF+tracing must have
- * this restriction, since it may pass kernel data to user space
+ switch (prog->type) {
+ case BPF_PROG_TYPE_TRACING:
+ if (prog->expected_attach_type != BPF_TRACE_FENTRY &&
+ prog->expected_attach_type != BPF_TRACE_FEXIT &&
+ prog->expected_attach_type != BPF_MODIFY_RETURN) {
+ err = -EINVAL;
+ goto out_put_prog;
+ }
+ break;
+ case BPF_PROG_TYPE_EXT:
+ if (prog->expected_attach_type != 0) {
+ err = -EINVAL;
+ goto out_put_prog;
+ }
+ break;
+ case BPF_PROG_TYPE_LSM:
+ if (prog->expected_attach_type != BPF_LSM_MAC) {
+ err = -EINVAL;
+ goto out_put_prog;
+ }
+ break;
+ default:
+ err = -EINVAL;
+ goto out_put_prog;
+ }
+
+ if (!!tgt_prog_fd != !!btf_id) {
+ err = -EINVAL;
+ goto out_put_prog;
+ }
+
+ if (tgt_prog_fd) {
+ /*
+ * For now we only allow new targets for BPF_PROG_TYPE_EXT. If this
+ * part would be changed to implement the same for
+ * BPF_PROG_TYPE_TRACING, do not forget to update the way how
+ * attach_tracing_prog flag is set.
+ */
+ if (prog->type != BPF_PROG_TYPE_EXT) {
+ err = -EINVAL;
+ goto out_put_prog;
+ }
+
+ tgt_prog = bpf_prog_get(tgt_prog_fd);
+ if (IS_ERR(tgt_prog)) {
+ err = PTR_ERR(tgt_prog);
+ tgt_prog = NULL;
+ goto out_put_prog;
+ }
+
+ key = bpf_trampoline_compute_key(tgt_prog, NULL, btf_id);
+ }
+
+ link = kzalloc(sizeof(*link), GFP_USER);
+ if (!link) {
+ err = -ENOMEM;
+ goto out_put_prog;
+ }
+ bpf_link_init(&link->link.link, BPF_LINK_TYPE_TRACING,
+ &bpf_tracing_link_lops, prog, attach_type);
+
+ link->link.cookie = bpf_cookie;
+
+ mutex_lock(&prog->aux->dst_mutex);
+
+ /* There are a few possible cases here:
+ *
+ * - if prog->aux->dst_trampoline is set, the program was just loaded
+ * and not yet attached to anything, so we can use the values stored
+ * in prog->aux
+ *
+ * - if prog->aux->dst_trampoline is NULL, the program has already been
+ * attached to a target and its initial target was cleared (below)
+ *
+ * - if tgt_prog != NULL, the caller specified tgt_prog_fd +
+ * target_btf_id using the link_create API.
+ *
+ * - if tgt_prog == NULL when this function was called using the old
+ * raw_tracepoint_open API, and we need a target from prog->aux
+ *
+ * - if prog->aux->dst_trampoline and tgt_prog is NULL, the program
+ * was detached and is going for re-attachment.
+ *
+ * - if prog->aux->dst_trampoline is NULL and tgt_prog and prog->aux->attach_btf
+ * are NULL, then program was already attached and user did not provide
+ * tgt_prog_fd so we have no way to find out or create trampoline
*/
+ if (!prog->aux->dst_trampoline && !tgt_prog) {
+ /*
+ * Allow re-attach for TRACING and LSM programs. If it's
+ * currently linked, bpf_trampoline_link_prog will fail.
+ * EXT programs need to specify tgt_prog_fd, so they
+ * re-attach in separate code path.
+ */
+ if (prog->type != BPF_PROG_TYPE_TRACING &&
+ prog->type != BPF_PROG_TYPE_LSM) {
+ err = -EINVAL;
+ goto out_unlock;
+ }
+ /* We can allow re-attach only if we have valid attach_btf. */
+ if (!prog->aux->attach_btf) {
+ err = -EINVAL;
+ goto out_unlock;
+ }
+ btf_id = prog->aux->attach_btf_id;
+ key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf, btf_id);
+ }
+
+ if (!prog->aux->dst_trampoline ||
+ (key && key != prog->aux->dst_trampoline->key)) {
+ /* If there is no saved target, or the specified target is
+ * different from the destination specified at load time, we
+ * need a new trampoline and a check for compatibility
+ */
+ struct bpf_attach_target_info tgt_info = {};
+
+ err = bpf_check_attach_target(NULL, prog, tgt_prog, btf_id,
+ &tgt_info);
+ if (err)
+ goto out_unlock;
+
+ if (tgt_info.tgt_mod) {
+ module_put(prog->aux->mod);
+ prog->aux->mod = tgt_info.tgt_mod;
+ }
+
+ tr = bpf_trampoline_get(key, &tgt_info);
+ if (!tr) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+ } else {
+ /* The caller didn't specify a target, or the target was the
+ * same as the destination supplied during program load. This
+ * means we can reuse the trampoline and reference from program
+ * load time, and there is no need to allocate a new one. This
+ * can only happen once for any program, as the saved values in
+ * prog->aux are cleared below.
+ */
+ tr = prog->aux->dst_trampoline;
+ tgt_prog = prog->aux->dst_prog;
+ }
+
+ err = bpf_link_prime(&link->link.link, &link_primer);
+ if (err)
+ goto out_unlock;
+
+ err = bpf_trampoline_link_prog(&link->link, tr, tgt_prog);
+ if (err) {
+ bpf_link_cleanup(&link_primer);
+ link = NULL;
+ goto out_unlock;
+ }
+
+ link->tgt_prog = tgt_prog;
+ link->trampoline = tr;
+
+ /* Always clear the trampoline and target prog from prog->aux to make
+ * sure the original attach destination is not kept alive after a
+ * program is (re-)attached to another target.
+ */
+ if (prog->aux->dst_prog &&
+ (tgt_prog_fd || tr != prog->aux->dst_trampoline))
+ /* got extra prog ref from syscall, or attaching to different prog */
+ bpf_prog_put(prog->aux->dst_prog);
+ if (prog->aux->dst_trampoline && tr != prog->aux->dst_trampoline)
+ /* we allocated a new trampoline, so free the old one */
+ bpf_trampoline_put(prog->aux->dst_trampoline);
+
+ prog->aux->dst_prog = NULL;
+ prog->aux->dst_trampoline = NULL;
+ mutex_unlock(&prog->aux->dst_mutex);
+
+ return bpf_link_settle(&link_primer);
+out_unlock:
+ if (tr && tr != prog->aux->dst_trampoline)
+ bpf_trampoline_put(tr);
+ mutex_unlock(&prog->aux->dst_mutex);
+ kfree(link);
+out_put_prog:
+ if (tgt_prog_fd && tgt_prog)
+ bpf_prog_put(tgt_prog);
+ return err;
+}
+
+static void bpf_raw_tp_link_release(struct bpf_link *link)
+{
+ struct bpf_raw_tp_link *raw_tp =
+ container_of(link, struct bpf_raw_tp_link, link);
+
+ bpf_probe_unregister(raw_tp->btp, raw_tp);
+ bpf_put_raw_tracepoint(raw_tp->btp);
+}
+
+static void bpf_raw_tp_link_dealloc(struct bpf_link *link)
+{
+ struct bpf_raw_tp_link *raw_tp =
+ container_of(link, struct bpf_raw_tp_link, link);
+
+ kfree(raw_tp);
+}
+
+static void bpf_raw_tp_link_show_fdinfo(const struct bpf_link *link,
+ struct seq_file *seq)
+{
+ struct bpf_raw_tp_link *raw_tp_link =
+ container_of(link, struct bpf_raw_tp_link, link);
+
+ seq_printf(seq,
+ "tp_name:\t%s\n"
+ "cookie:\t%llu\n",
+ raw_tp_link->btp->tp->name,
+ raw_tp_link->cookie);
+}
+
+static int bpf_copy_to_user(char __user *ubuf, const char *buf, u32 ulen,
+ u32 len)
+{
+ if (ulen >= len + 1) {
+ if (copy_to_user(ubuf, buf, len + 1))
+ return -EFAULT;
+ } else {
+ char zero = '\0';
+
+ if (copy_to_user(ubuf, buf, ulen - 1))
+ return -EFAULT;
+ if (put_user(zero, ubuf + ulen - 1))
+ return -EFAULT;
+ return -ENOSPC;
+ }
+
+ return 0;
+}
+
+static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ struct bpf_raw_tp_link *raw_tp_link =
+ container_of(link, struct bpf_raw_tp_link, link);
+ char __user *ubuf = u64_to_user_ptr(info->raw_tracepoint.tp_name);
+ const char *tp_name = raw_tp_link->btp->tp->name;
+ u32 ulen = info->raw_tracepoint.tp_name_len;
+ size_t tp_len = strlen(tp_name);
+
+ if (!ulen ^ !ubuf)
+ return -EINVAL;
+
+ info->raw_tracepoint.tp_name_len = tp_len + 1;
+ info->raw_tracepoint.cookie = raw_tp_link->cookie;
+
+ if (!ubuf)
+ return 0;
+
+ return bpf_copy_to_user(ubuf, tp_name, ulen, tp_len);
+}
+
+static const struct bpf_link_ops bpf_raw_tp_link_lops = {
+ .release = bpf_raw_tp_link_release,
+ .dealloc_deferred = bpf_raw_tp_link_dealloc,
+ .show_fdinfo = bpf_raw_tp_link_show_fdinfo,
+ .fill_link_info = bpf_raw_tp_link_fill_link_info,
+};
+
+#ifdef CONFIG_PERF_EVENTS
+struct bpf_perf_link {
+ struct bpf_link link;
+ struct file *perf_file;
+};
+
+static void bpf_perf_link_release(struct bpf_link *link)
+{
+ struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link);
+ struct perf_event *event = perf_link->perf_file->private_data;
+
+ perf_event_free_bpf_prog(event);
+ fput(perf_link->perf_file);
+}
+
+static void bpf_perf_link_dealloc(struct bpf_link *link)
+{
+ struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link);
+
+ kfree(perf_link);
+}
+
+static int bpf_perf_link_fill_common(const struct perf_event *event,
+ char __user *uname, u32 *ulenp,
+ u64 *probe_offset, u64 *probe_addr,
+ u32 *fd_type, unsigned long *missed)
+{
+ const char *buf;
+ u32 prog_id, ulen;
+ size_t len;
+ int err;
+
+ ulen = *ulenp;
+ if (!ulen ^ !uname)
+ return -EINVAL;
+
+ err = bpf_get_perf_event_info(event, &prog_id, fd_type, &buf,
+ probe_offset, probe_addr, missed);
+ if (err)
+ return err;
+
+ if (buf) {
+ len = strlen(buf);
+ *ulenp = len + 1;
+ } else {
+ *ulenp = 1;
+ }
+ if (!uname)
+ return 0;
+
+ if (buf) {
+ err = bpf_copy_to_user(uname, buf, ulen, len);
+ if (err)
+ return err;
+ } else {
+ char zero = '\0';
+
+ if (put_user(zero, uname))
+ return -EFAULT;
+ }
+ return 0;
+}
+
+#ifdef CONFIG_KPROBE_EVENTS
+static int bpf_perf_link_fill_kprobe(const struct perf_event *event,
+ struct bpf_link_info *info)
+{
+ unsigned long missed;
+ char __user *uname;
+ u64 addr, offset;
+ u32 ulen, type;
+ int err;
+
+ uname = u64_to_user_ptr(info->perf_event.kprobe.func_name);
+ ulen = info->perf_event.kprobe.name_len;
+ err = bpf_perf_link_fill_common(event, uname, &ulen, &offset, &addr,
+ &type, &missed);
+ if (err)
+ return err;
+ if (type == BPF_FD_TYPE_KRETPROBE)
+ info->perf_event.type = BPF_PERF_EVENT_KRETPROBE;
+ else
+ info->perf_event.type = BPF_PERF_EVENT_KPROBE;
+ info->perf_event.kprobe.name_len = ulen;
+ info->perf_event.kprobe.offset = offset;
+ info->perf_event.kprobe.missed = missed;
+ if (!kallsyms_show_value(current_cred()))
+ addr = 0;
+ info->perf_event.kprobe.addr = addr;
+ info->perf_event.kprobe.cookie = event->bpf_cookie;
+ return 0;
+}
+
+static void bpf_perf_link_fdinfo_kprobe(const struct perf_event *event,
+ struct seq_file *seq)
+{
+ const char *name;
+ int err;
+ u32 prog_id, type;
+ u64 offset, addr;
+ unsigned long missed;
+
+ err = bpf_get_perf_event_info(event, &prog_id, &type, &name,
+ &offset, &addr, &missed);
+ if (err)
+ return;
+
+ seq_printf(seq,
+ "name:\t%s\n"
+ "offset:\t%#llx\n"
+ "missed:\t%lu\n"
+ "addr:\t%#llx\n"
+ "event_type:\t%s\n"
+ "cookie:\t%llu\n",
+ name, offset, missed, addr,
+ type == BPF_FD_TYPE_KRETPROBE ? "kretprobe" : "kprobe",
+ event->bpf_cookie);
+}
+#endif
+
+#ifdef CONFIG_UPROBE_EVENTS
+static int bpf_perf_link_fill_uprobe(const struct perf_event *event,
+ struct bpf_link_info *info)
+{
+ u64 ref_ctr_offset, offset;
+ char __user *uname;
+ u32 ulen, type;
+ int err;
+
+ uname = u64_to_user_ptr(info->perf_event.uprobe.file_name);
+ ulen = info->perf_event.uprobe.name_len;
+ err = bpf_perf_link_fill_common(event, uname, &ulen, &offset, &ref_ctr_offset,
+ &type, NULL);
+ if (err)
+ return err;
+
+ if (type == BPF_FD_TYPE_URETPROBE)
+ info->perf_event.type = BPF_PERF_EVENT_URETPROBE;
+ else
+ info->perf_event.type = BPF_PERF_EVENT_UPROBE;
+ info->perf_event.uprobe.name_len = ulen;
+ info->perf_event.uprobe.offset = offset;
+ info->perf_event.uprobe.cookie = event->bpf_cookie;
+ info->perf_event.uprobe.ref_ctr_offset = ref_ctr_offset;
+ return 0;
+}
+
+static void bpf_perf_link_fdinfo_uprobe(const struct perf_event *event,
+ struct seq_file *seq)
+{
+ const char *name;
+ int err;
+ u32 prog_id, type;
+ u64 offset, ref_ctr_offset;
+ unsigned long missed;
+
+ err = bpf_get_perf_event_info(event, &prog_id, &type, &name,
+ &offset, &ref_ctr_offset, &missed);
+ if (err)
+ return;
+
+ seq_printf(seq,
+ "name:\t%s\n"
+ "offset:\t%#llx\n"
+ "ref_ctr_offset:\t%#llx\n"
+ "event_type:\t%s\n"
+ "cookie:\t%llu\n",
+ name, offset, ref_ctr_offset,
+ type == BPF_FD_TYPE_URETPROBE ? "uretprobe" : "uprobe",
+ event->bpf_cookie);
+}
+#endif
+
+static int bpf_perf_link_fill_probe(const struct perf_event *event,
+ struct bpf_link_info *info)
+{
+#ifdef CONFIG_KPROBE_EVENTS
+ if (event->tp_event->flags & TRACE_EVENT_FL_KPROBE)
+ return bpf_perf_link_fill_kprobe(event, info);
+#endif
+#ifdef CONFIG_UPROBE_EVENTS
+ if (event->tp_event->flags & TRACE_EVENT_FL_UPROBE)
+ return bpf_perf_link_fill_uprobe(event, info);
+#endif
+ return -EOPNOTSUPP;
+}
+
+static int bpf_perf_link_fill_tracepoint(const struct perf_event *event,
+ struct bpf_link_info *info)
+{
+ char __user *uname;
+ u32 ulen;
+ int err;
+
+ uname = u64_to_user_ptr(info->perf_event.tracepoint.tp_name);
+ ulen = info->perf_event.tracepoint.name_len;
+ err = bpf_perf_link_fill_common(event, uname, &ulen, NULL, NULL, NULL, NULL);
+ if (err)
+ return err;
+
+ info->perf_event.type = BPF_PERF_EVENT_TRACEPOINT;
+ info->perf_event.tracepoint.name_len = ulen;
+ info->perf_event.tracepoint.cookie = event->bpf_cookie;
+ return 0;
+}
+
+static int bpf_perf_link_fill_perf_event(const struct perf_event *event,
+ struct bpf_link_info *info)
+{
+ info->perf_event.event.type = event->attr.type;
+ info->perf_event.event.config = event->attr.config;
+ info->perf_event.event.cookie = event->bpf_cookie;
+ info->perf_event.type = BPF_PERF_EVENT_EVENT;
+ return 0;
+}
+
+static int bpf_perf_link_fill_link_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ struct bpf_perf_link *perf_link;
+ const struct perf_event *event;
+
+ perf_link = container_of(link, struct bpf_perf_link, link);
+ event = perf_get_event(perf_link->perf_file);
+ if (IS_ERR(event))
+ return PTR_ERR(event);
+
+ switch (event->prog->type) {
+ case BPF_PROG_TYPE_PERF_EVENT:
+ return bpf_perf_link_fill_perf_event(event, info);
+ case BPF_PROG_TYPE_TRACEPOINT:
+ return bpf_perf_link_fill_tracepoint(event, info);
+ case BPF_PROG_TYPE_KPROBE:
+ return bpf_perf_link_fill_probe(event, info);
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+
+static void bpf_perf_event_link_show_fdinfo(const struct perf_event *event,
+ struct seq_file *seq)
+{
+ seq_printf(seq,
+ "type:\t%u\n"
+ "config:\t%llu\n"
+ "event_type:\t%s\n"
+ "cookie:\t%llu\n",
+ event->attr.type, event->attr.config,
+ "event", event->bpf_cookie);
+}
+
+static void bpf_tracepoint_link_show_fdinfo(const struct perf_event *event,
+ struct seq_file *seq)
+{
+ int err;
+ const char *name;
+ u32 prog_id;
+
+ err = bpf_get_perf_event_info(event, &prog_id, NULL, &name, NULL,
+ NULL, NULL);
+ if (err)
+ return;
+
+ seq_printf(seq,
+ "tp_name:\t%s\n"
+ "event_type:\t%s\n"
+ "cookie:\t%llu\n",
+ name, "tracepoint", event->bpf_cookie);
+}
+
+static void bpf_probe_link_show_fdinfo(const struct perf_event *event,
+ struct seq_file *seq)
+{
+#ifdef CONFIG_KPROBE_EVENTS
+ if (event->tp_event->flags & TRACE_EVENT_FL_KPROBE)
+ return bpf_perf_link_fdinfo_kprobe(event, seq);
+#endif
+
+#ifdef CONFIG_UPROBE_EVENTS
+ if (event->tp_event->flags & TRACE_EVENT_FL_UPROBE)
+ return bpf_perf_link_fdinfo_uprobe(event, seq);
+#endif
+}
+
+static void bpf_perf_link_show_fdinfo(const struct bpf_link *link,
+ struct seq_file *seq)
+{
+ struct bpf_perf_link *perf_link;
+ const struct perf_event *event;
+
+ perf_link = container_of(link, struct bpf_perf_link, link);
+ event = perf_get_event(perf_link->perf_file);
+ if (IS_ERR(event))
+ return;
+
+ switch (event->prog->type) {
+ case BPF_PROG_TYPE_PERF_EVENT:
+ return bpf_perf_event_link_show_fdinfo(event, seq);
+ case BPF_PROG_TYPE_TRACEPOINT:
+ return bpf_tracepoint_link_show_fdinfo(event, seq);
+ case BPF_PROG_TYPE_KPROBE:
+ return bpf_probe_link_show_fdinfo(event, seq);
+ default:
+ return;
+ }
+}
+
+static const struct bpf_link_ops bpf_perf_link_lops = {
+ .release = bpf_perf_link_release,
+ .dealloc = bpf_perf_link_dealloc,
+ .fill_link_info = bpf_perf_link_fill_link_info,
+ .show_fdinfo = bpf_perf_link_show_fdinfo,
+};
+
+static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ struct bpf_link_primer link_primer;
+ struct bpf_perf_link *link;
+ struct perf_event *event;
+ struct file *perf_file;
+ int err;
+
+ if (attr->link_create.flags)
+ return -EINVAL;
+
+ perf_file = perf_event_get(attr->link_create.target_fd);
+ if (IS_ERR(perf_file))
+ return PTR_ERR(perf_file);
+
+ link = kzalloc(sizeof(*link), GFP_USER);
+ if (!link) {
+ err = -ENOMEM;
+ goto out_put_file;
+ }
+ bpf_link_init(&link->link, BPF_LINK_TYPE_PERF_EVENT, &bpf_perf_link_lops, prog,
+ attr->link_create.attach_type);
+ link->perf_file = perf_file;
+
+ err = bpf_link_prime(&link->link, &link_primer);
+ if (err) {
+ kfree(link);
+ goto out_put_file;
+ }
+
+ event = perf_file->private_data;
+ err = perf_event_set_bpf_prog(event, prog, attr->link_create.perf_event.bpf_cookie);
+ if (err) {
+ bpf_link_cleanup(&link_primer);
+ goto out_put_file;
+ }
+ /* perf_event_set_bpf_prog() doesn't take its own refcnt on prog */
+ bpf_prog_inc(prog);
+
+ return bpf_link_settle(&link_primer);
+
+out_put_file:
+ fput(perf_file);
+ return err;
+}
+#else
+static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ return -EOPNOTSUPP;
+}
+#endif /* CONFIG_PERF_EVENTS */
+
+static int bpf_raw_tp_link_attach(struct bpf_prog *prog,
+ const char __user *user_tp_name, u64 cookie,
+ enum bpf_attach_type attach_type)
+{
+ struct bpf_link_primer link_primer;
+ struct bpf_raw_tp_link *link;
+ struct bpf_raw_event_map *btp;
+ const char *tp_name;
+ char buf[128];
+ int err;
+
+ switch (prog->type) {
+ case BPF_PROG_TYPE_TRACING:
+ case BPF_PROG_TYPE_EXT:
+ case BPF_PROG_TYPE_LSM:
+ if (user_tp_name)
+ /* The attach point for this category of programs
+ * should be specified via btf_id during program load.
+ */
+ return -EINVAL;
+ if (prog->type == BPF_PROG_TYPE_TRACING &&
+ prog->expected_attach_type == BPF_TRACE_RAW_TP) {
+ tp_name = prog->aux->attach_func_name;
+ break;
+ }
+ return bpf_tracing_prog_attach(prog, 0, 0, 0, attach_type);
+ case BPF_PROG_TYPE_RAW_TRACEPOINT:
+ case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
+ if (strncpy_from_user(buf, user_tp_name, sizeof(buf) - 1) < 0)
+ return -EFAULT;
+ buf[sizeof(buf) - 1] = 0;
+ tp_name = buf;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ btp = bpf_get_raw_tracepoint(tp_name);
+ if (!btp)
+ return -ENOENT;
+
+ link = kzalloc(sizeof(*link), GFP_USER);
+ if (!link) {
+ err = -ENOMEM;
+ goto out_put_btp;
+ }
+ bpf_link_init_sleepable(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT,
+ &bpf_raw_tp_link_lops, prog, attach_type,
+ tracepoint_is_faultable(btp->tp));
+ link->btp = btp;
+ link->cookie = cookie;
+
+ err = bpf_link_prime(&link->link, &link_primer);
+ if (err) {
+ kfree(link);
+ goto out_put_btp;
+ }
+
+ err = bpf_probe_register(link->btp, link);
+ if (err) {
+ bpf_link_cleanup(&link_primer);
+ goto out_put_btp;
+ }
+
+ return bpf_link_settle(&link_primer);
+
+out_put_btp:
+ bpf_put_raw_tracepoint(btp);
+ return err;
+}
+
+#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.cookie
+
+static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
+{
+ struct bpf_prog *prog;
+ void __user *tp_name;
+ __u64 cookie;
+ int fd;
+
+ if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN))
+ return -EINVAL;
+
+ prog = bpf_prog_get(attr->raw_tracepoint.prog_fd);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ tp_name = u64_to_user_ptr(attr->raw_tracepoint.name);
+ cookie = attr->raw_tracepoint.cookie;
+ fd = bpf_raw_tp_link_attach(prog, tp_name, cookie, prog->expected_attach_type);
+ if (fd < 0)
+ bpf_prog_put(prog);
+ return fd;
+}
+
+static enum bpf_prog_type
+attach_type_to_prog_type(enum bpf_attach_type attach_type)
+{
+ switch (attach_type) {
+ case BPF_CGROUP_INET_INGRESS:
+ case BPF_CGROUP_INET_EGRESS:
+ return BPF_PROG_TYPE_CGROUP_SKB;
+ case BPF_CGROUP_INET_SOCK_CREATE:
+ case BPF_CGROUP_INET_SOCK_RELEASE:
+ case BPF_CGROUP_INET4_POST_BIND:
+ case BPF_CGROUP_INET6_POST_BIND:
+ return BPF_PROG_TYPE_CGROUP_SOCK;
+ case BPF_CGROUP_INET4_BIND:
+ case BPF_CGROUP_INET6_BIND:
+ case BPF_CGROUP_INET4_CONNECT:
+ case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_UNIX_CONNECT:
+ case BPF_CGROUP_INET4_GETPEERNAME:
+ case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_UNIX_GETPEERNAME:
+ case BPF_CGROUP_INET4_GETSOCKNAME:
+ case BPF_CGROUP_INET6_GETSOCKNAME:
+ case BPF_CGROUP_UNIX_GETSOCKNAME:
+ case BPF_CGROUP_UDP4_SENDMSG:
+ case BPF_CGROUP_UDP6_SENDMSG:
+ case BPF_CGROUP_UNIX_SENDMSG:
+ case BPF_CGROUP_UDP4_RECVMSG:
+ case BPF_CGROUP_UDP6_RECVMSG:
+ case BPF_CGROUP_UNIX_RECVMSG:
+ return BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
+ case BPF_CGROUP_SOCK_OPS:
+ return BPF_PROG_TYPE_SOCK_OPS;
+ case BPF_CGROUP_DEVICE:
+ return BPF_PROG_TYPE_CGROUP_DEVICE;
+ case BPF_SK_MSG_VERDICT:
+ return BPF_PROG_TYPE_SK_MSG;
+ case BPF_SK_SKB_STREAM_PARSER:
+ case BPF_SK_SKB_STREAM_VERDICT:
+ case BPF_SK_SKB_VERDICT:
+ return BPF_PROG_TYPE_SK_SKB;
+ case BPF_LIRC_MODE2:
+ return BPF_PROG_TYPE_LIRC_MODE2;
+ case BPF_FLOW_DISSECTOR:
+ return BPF_PROG_TYPE_FLOW_DISSECTOR;
+ case BPF_CGROUP_SYSCTL:
+ return BPF_PROG_TYPE_CGROUP_SYSCTL;
+ case BPF_CGROUP_GETSOCKOPT:
+ case BPF_CGROUP_SETSOCKOPT:
+ return BPF_PROG_TYPE_CGROUP_SOCKOPT;
+ case BPF_TRACE_ITER:
+ case BPF_TRACE_RAW_TP:
+ case BPF_TRACE_FENTRY:
+ case BPF_TRACE_FEXIT:
+ case BPF_MODIFY_RETURN:
+ return BPF_PROG_TYPE_TRACING;
+ case BPF_LSM_MAC:
+ return BPF_PROG_TYPE_LSM;
+ case BPF_SK_LOOKUP:
+ return BPF_PROG_TYPE_SK_LOOKUP;
+ case BPF_XDP:
+ return BPF_PROG_TYPE_XDP;
+ case BPF_LSM_CGROUP:
+ return BPF_PROG_TYPE_LSM;
+ case BPF_TCX_INGRESS:
+ case BPF_TCX_EGRESS:
+ case BPF_NETKIT_PRIMARY:
+ case BPF_NETKIT_PEER:
+ return BPF_PROG_TYPE_SCHED_CLS;
+ default:
+ return BPF_PROG_TYPE_UNSPEC;
+ }
+}
+
+static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
+ enum bpf_attach_type attach_type)
+{
+ enum bpf_prog_type ptype;
+
+ switch (prog->type) {
+ case BPF_PROG_TYPE_CGROUP_SOCK:
+ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
+ case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+ case BPF_PROG_TYPE_SK_LOOKUP:
+ return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
+ case BPF_PROG_TYPE_CGROUP_SKB:
+ if (!bpf_token_capable(prog->aux->token, CAP_NET_ADMIN))
+ /* cg-skb progs can be loaded by unpriv user.
+ * check permissions at attach time.
+ */
+ return -EPERM;
+
+ ptype = attach_type_to_prog_type(attach_type);
+ if (prog->type != ptype)
+ return -EINVAL;
+
+ return prog->enforce_expected_attach_type &&
+ prog->expected_attach_type != attach_type ?
+ -EINVAL : 0;
+ case BPF_PROG_TYPE_EXT:
+ return 0;
+ case BPF_PROG_TYPE_NETFILTER:
+ if (attach_type != BPF_NETFILTER)
+ return -EINVAL;
+ return 0;
+ case BPF_PROG_TYPE_PERF_EVENT:
+ case BPF_PROG_TYPE_TRACEPOINT:
+ if (attach_type != BPF_PERF_EVENT)
+ return -EINVAL;
+ return 0;
+ case BPF_PROG_TYPE_KPROBE:
+ if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI &&
+ attach_type != BPF_TRACE_KPROBE_MULTI)
+ return -EINVAL;
+ if (prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION &&
+ attach_type != BPF_TRACE_KPROBE_SESSION)
+ return -EINVAL;
+ if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI &&
+ attach_type != BPF_TRACE_UPROBE_MULTI)
+ return -EINVAL;
+ if (prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION &&
+ attach_type != BPF_TRACE_UPROBE_SESSION)
+ return -EINVAL;
+ if (attach_type != BPF_PERF_EVENT &&
+ attach_type != BPF_TRACE_KPROBE_MULTI &&
+ attach_type != BPF_TRACE_KPROBE_SESSION &&
+ attach_type != BPF_TRACE_UPROBE_MULTI &&
+ attach_type != BPF_TRACE_UPROBE_SESSION)
+ return -EINVAL;
+ return 0;
+ case BPF_PROG_TYPE_SCHED_CLS:
+ if (attach_type != BPF_TCX_INGRESS &&
+ attach_type != BPF_TCX_EGRESS &&
+ attach_type != BPF_NETKIT_PRIMARY &&
+ attach_type != BPF_NETKIT_PEER)
+ return -EINVAL;
+ return 0;
+ default:
+ ptype = attach_type_to_prog_type(attach_type);
+ if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type)
+ return -EINVAL;
+ return 0;
+ }
+}
+
+static bool is_cgroup_prog_type(enum bpf_prog_type ptype, enum bpf_attach_type atype,
+ bool check_atype)
+{
+ switch (ptype) {
+ case BPF_PROG_TYPE_CGROUP_DEVICE:
+ case BPF_PROG_TYPE_CGROUP_SKB:
+ case BPF_PROG_TYPE_CGROUP_SOCK:
+ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
+ case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+ case BPF_PROG_TYPE_CGROUP_SYSCTL:
+ case BPF_PROG_TYPE_SOCK_OPS:
+ return true;
+ case BPF_PROG_TYPE_LSM:
+ return check_atype ? atype == BPF_LSM_CGROUP : true;
+ default:
+ return false;
+ }
+}
+
+#define BPF_PROG_ATTACH_LAST_FIELD expected_revision
+
+#define BPF_F_ATTACH_MASK_BASE \
+ (BPF_F_ALLOW_OVERRIDE | \
+ BPF_F_ALLOW_MULTI | \
+ BPF_F_REPLACE | \
+ BPF_F_PREORDER)
+
+#define BPF_F_ATTACH_MASK_MPROG \
+ (BPF_F_REPLACE | \
+ BPF_F_BEFORE | \
+ BPF_F_AFTER | \
+ BPF_F_ID | \
+ BPF_F_LINK)
+
+static int bpf_prog_attach(const union bpf_attr *attr)
+{
+ enum bpf_prog_type ptype;
+ struct bpf_prog *prog;
+ int ret;
+
+ if (CHECK_ATTR(BPF_PROG_ATTACH))
+ return -EINVAL;
+
+ ptype = attach_type_to_prog_type(attr->attach_type);
+ if (ptype == BPF_PROG_TYPE_UNSPEC)
+ return -EINVAL;
+ if (bpf_mprog_supported(ptype)) {
+ if (attr->attach_flags & ~BPF_F_ATTACH_MASK_MPROG)
+ return -EINVAL;
+ } else if (is_cgroup_prog_type(ptype, 0, false)) {
+ if (attr->attach_flags & ~(BPF_F_ATTACH_MASK_BASE | BPF_F_ATTACH_MASK_MPROG))
+ return -EINVAL;
+ } else {
+ if (attr->attach_flags & ~BPF_F_ATTACH_MASK_BASE)
+ return -EINVAL;
+ if (attr->relative_fd ||
+ attr->expected_revision)
+ return -EINVAL;
+ }
+
+ prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ if (bpf_prog_attach_check_attach_type(prog, attr->attach_type)) {
+ bpf_prog_put(prog);
+ return -EINVAL;
+ }
+
+ if (is_cgroup_prog_type(ptype, prog->expected_attach_type, true)) {
+ ret = cgroup_bpf_prog_attach(attr, ptype, prog);
+ goto out;
+ }
+
+ switch (ptype) {
+ case BPF_PROG_TYPE_SK_SKB:
+ case BPF_PROG_TYPE_SK_MSG:
+ ret = sock_map_get_from_fd(attr, prog);
+ break;
+ case BPF_PROG_TYPE_LIRC_MODE2:
+ ret = lirc_prog_attach(attr, prog);
+ break;
+ case BPF_PROG_TYPE_FLOW_DISSECTOR:
+ ret = netns_bpf_prog_attach(attr, prog);
+ break;
+ case BPF_PROG_TYPE_SCHED_CLS:
+ if (attr->attach_type == BPF_TCX_INGRESS ||
+ attr->attach_type == BPF_TCX_EGRESS)
+ ret = tcx_prog_attach(attr, prog);
+ else
+ ret = netkit_prog_attach(attr, prog);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+out:
+ if (ret)
+ bpf_prog_put(prog);
+ return ret;
+}
+
+#define BPF_PROG_DETACH_LAST_FIELD expected_revision
+
+static int bpf_prog_detach(const union bpf_attr *attr)
+{
+ struct bpf_prog *prog = NULL;
+ enum bpf_prog_type ptype;
+ int ret;
+
+ if (CHECK_ATTR(BPF_PROG_DETACH))
+ return -EINVAL;
+
+ ptype = attach_type_to_prog_type(attr->attach_type);
+ if (bpf_mprog_supported(ptype)) {
+ if (ptype == BPF_PROG_TYPE_UNSPEC)
+ return -EINVAL;
+ if (attr->attach_flags & ~BPF_F_ATTACH_MASK_MPROG)
+ return -EINVAL;
+ if (attr->attach_bpf_fd) {
+ prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+ }
+ } else if (is_cgroup_prog_type(ptype, 0, false)) {
+ if (attr->attach_flags || attr->relative_fd)
+ return -EINVAL;
+ } else if (attr->attach_flags ||
+ attr->relative_fd ||
+ attr->expected_revision) {
+ return -EINVAL;
+ }
+
+ switch (ptype) {
+ case BPF_PROG_TYPE_SK_MSG:
+ case BPF_PROG_TYPE_SK_SKB:
+ ret = sock_map_prog_detach(attr, ptype);
+ break;
+ case BPF_PROG_TYPE_LIRC_MODE2:
+ ret = lirc_prog_detach(attr);
+ break;
+ case BPF_PROG_TYPE_FLOW_DISSECTOR:
+ ret = netns_bpf_prog_detach(attr, ptype);
+ break;
+ case BPF_PROG_TYPE_CGROUP_DEVICE:
+ case BPF_PROG_TYPE_CGROUP_SKB:
+ case BPF_PROG_TYPE_CGROUP_SOCK:
+ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
+ case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+ case BPF_PROG_TYPE_CGROUP_SYSCTL:
+ case BPF_PROG_TYPE_SOCK_OPS:
+ case BPF_PROG_TYPE_LSM:
+ ret = cgroup_bpf_prog_detach(attr, ptype);
+ break;
+ case BPF_PROG_TYPE_SCHED_CLS:
+ if (attr->attach_type == BPF_TCX_INGRESS ||
+ attr->attach_type == BPF_TCX_EGRESS)
+ ret = tcx_prog_detach(attr, prog);
+ else
+ ret = netkit_prog_detach(attr, prog);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+
+ if (prog)
+ bpf_prog_put(prog);
+ return ret;
+}
+
+#define BPF_PROG_QUERY_LAST_FIELD query.revision
+
+static int bpf_prog_query(const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ if (!bpf_net_capable())
+ return -EPERM;
+ if (CHECK_ATTR(BPF_PROG_QUERY))
+ return -EINVAL;
+ if (attr->query.query_flags & ~BPF_F_QUERY_EFFECTIVE)
+ return -EINVAL;
+
+ switch (attr->query.attach_type) {
+ case BPF_CGROUP_INET_INGRESS:
+ case BPF_CGROUP_INET_EGRESS:
+ case BPF_CGROUP_INET_SOCK_CREATE:
+ case BPF_CGROUP_INET_SOCK_RELEASE:
+ case BPF_CGROUP_INET4_BIND:
+ case BPF_CGROUP_INET6_BIND:
+ case BPF_CGROUP_INET4_POST_BIND:
+ case BPF_CGROUP_INET6_POST_BIND:
+ case BPF_CGROUP_INET4_CONNECT:
+ case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_UNIX_CONNECT:
+ case BPF_CGROUP_INET4_GETPEERNAME:
+ case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_UNIX_GETPEERNAME:
+ case BPF_CGROUP_INET4_GETSOCKNAME:
+ case BPF_CGROUP_INET6_GETSOCKNAME:
+ case BPF_CGROUP_UNIX_GETSOCKNAME:
+ case BPF_CGROUP_UDP4_SENDMSG:
+ case BPF_CGROUP_UDP6_SENDMSG:
+ case BPF_CGROUP_UNIX_SENDMSG:
+ case BPF_CGROUP_UDP4_RECVMSG:
+ case BPF_CGROUP_UDP6_RECVMSG:
+ case BPF_CGROUP_UNIX_RECVMSG:
+ case BPF_CGROUP_SOCK_OPS:
+ case BPF_CGROUP_DEVICE:
+ case BPF_CGROUP_SYSCTL:
+ case BPF_CGROUP_GETSOCKOPT:
+ case BPF_CGROUP_SETSOCKOPT:
+ case BPF_LSM_CGROUP:
+ return cgroup_bpf_prog_query(attr, uattr);
+ case BPF_LIRC_MODE2:
+ return lirc_prog_query(attr, uattr);
+ case BPF_FLOW_DISSECTOR:
+ case BPF_SK_LOOKUP:
+ return netns_bpf_prog_query(attr, uattr);
+ case BPF_SK_SKB_STREAM_PARSER:
+ case BPF_SK_SKB_STREAM_VERDICT:
+ case BPF_SK_MSG_VERDICT:
+ case BPF_SK_SKB_VERDICT:
+ return sock_map_bpf_prog_query(attr, uattr);
+ case BPF_TCX_INGRESS:
+ case BPF_TCX_EGRESS:
+ return tcx_prog_query(attr, uattr);
+ case BPF_NETKIT_PRIMARY:
+ case BPF_NETKIT_PEER:
+ return netkit_prog_query(attr, uattr);
+ default:
+ return -EINVAL;
+ }
+}
+
+#define BPF_PROG_TEST_RUN_LAST_FIELD test.batch_size
+
+static int bpf_prog_test_run(const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ struct bpf_prog *prog;
+ int ret = -ENOTSUPP;
+
+ if (CHECK_ATTR(BPF_PROG_TEST_RUN))
+ return -EINVAL;
+
+ if ((attr->test.ctx_size_in && !attr->test.ctx_in) ||
+ (!attr->test.ctx_size_in && attr->test.ctx_in))
+ return -EINVAL;
+
+ if ((attr->test.ctx_size_out && !attr->test.ctx_out) ||
+ (!attr->test.ctx_size_out && attr->test.ctx_out))
+ return -EINVAL;
+
+ prog = bpf_prog_get(attr->test.prog_fd);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ if (prog->aux->ops->test_run)
+ ret = prog->aux->ops->test_run(prog, attr, uattr);
+
+ bpf_prog_put(prog);
+ return ret;
+}
+
+#define BPF_OBJ_GET_NEXT_ID_LAST_FIELD next_id
+
+static int bpf_obj_get_next_id(const union bpf_attr *attr,
+ union bpf_attr __user *uattr,
+ struct idr *idr,
+ spinlock_t *lock)
+{
+ u32 next_id = attr->start_id;
+ int err = 0;
+
+ if (CHECK_ATTR(BPF_OBJ_GET_NEXT_ID) || next_id >= INT_MAX)
+ return -EINVAL;
+
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!access_ok(VERIFY_READ, uattr, 1))
+ next_id++;
+ spin_lock_bh(lock);
+ if (!idr_get_next(idr, &next_id))
+ err = -ENOENT;
+ spin_unlock_bh(lock);
+
+ if (!err)
+ err = put_user(next_id, &uattr->next_id);
+
+ return err;
+}
+
+struct bpf_map *bpf_map_get_curr_or_next(u32 *id)
+{
+ struct bpf_map *map;
+
+ spin_lock_bh(&map_idr_lock);
+again:
+ map = idr_get_next(&map_idr, id);
+ if (map) {
+ map = __bpf_map_inc_not_zero(map, false);
+ if (IS_ERR(map)) {
+ (*id)++;
+ goto again;
+ }
+ }
+ spin_unlock_bh(&map_idr_lock);
+
+ return map;
+}
+
+struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id)
+{
+ struct bpf_prog *prog;
+
+ spin_lock_bh(&prog_idr_lock);
+again:
+ prog = idr_get_next(&prog_idr, id);
+ if (prog) {
+ prog = bpf_prog_inc_not_zero(prog);
+ if (IS_ERR(prog)) {
+ (*id)++;
+ goto again;
+ }
+ }
+ spin_unlock_bh(&prog_idr_lock);
+
+ return prog;
+}
+
+#define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id
+
+struct bpf_prog *bpf_prog_by_id(u32 id)
+{
+ struct bpf_prog *prog;
+
+ if (!id)
+ return ERR_PTR(-ENOENT);
+
+ spin_lock_bh(&prog_idr_lock);
+ prog = idr_find(&prog_idr, id);
+ if (prog)
+ prog = bpf_prog_inc_not_zero(prog);
+ else
+ prog = ERR_PTR(-ENOENT);
+ spin_unlock_bh(&prog_idr_lock);
+ return prog;
+}
+
+static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
+{
+ struct bpf_prog *prog;
+ u32 id = attr->prog_id;
+ int fd;
+
+ if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID))
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ prog = bpf_prog_by_id(id);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ fd = bpf_prog_new_fd(prog);
+ if (fd < 0)
+ bpf_prog_put(prog);
+
+ return fd;
+}
+
+#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD open_flags
+
+static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
+{
+ struct bpf_map *map;
+ u32 id = attr->map_id;
+ int f_flags;
+ int fd;
+
+ if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID) ||
+ attr->open_flags & ~BPF_OBJ_FLAG_MASK)
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ f_flags = bpf_get_file_flag(attr->open_flags);
+ if (f_flags < 0)
+ return f_flags;
+
+ spin_lock_bh(&map_idr_lock);
+ map = idr_find(&map_idr, id);
+ if (map)
+ map = __bpf_map_inc_not_zero(map, true);
+ else
+ map = ERR_PTR(-ENOENT);
+ spin_unlock_bh(&map_idr_lock);
+
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ fd = bpf_map_new_fd(map, f_flags);
+ if (fd < 0)
+ bpf_map_put_with_uref(map);
+
+ return fd;
+}
+
+static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog,
+ unsigned long addr, u32 *off,
+ u32 *type)
+{
+ const struct bpf_map *map;
+ int i;
+
+ mutex_lock(&prog->aux->used_maps_mutex);
+ for (i = 0, *off = 0; i < prog->aux->used_map_cnt; i++) {
+ map = prog->aux->used_maps[i];
+ if (map == (void *)addr) {
+ *type = BPF_PSEUDO_MAP_FD;
+ goto out;
+ }
+ if (!map->ops->map_direct_value_meta)
+ continue;
+ if (!map->ops->map_direct_value_meta(map, addr, off)) {
+ *type = BPF_PSEUDO_MAP_VALUE;
+ goto out;
+ }
+ }
+ map = NULL;
+
+out:
+ mutex_unlock(&prog->aux->used_maps_mutex);
+ return map;
+}
+
+static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog,
+ const struct cred *f_cred)
+{
+ const struct bpf_map *map;
+ struct bpf_insn *insns;
+ u32 off, type;
+ u64 imm;
+ u8 code;
+ int i;
+
+ insns = kmemdup(prog->insnsi, bpf_prog_insn_size(prog),
+ GFP_USER);
+ if (!insns)
+ return insns;
+
+ for (i = 0; i < prog->len; i++) {
+ code = insns[i].code;
+
+ if (code == (BPF_JMP | BPF_TAIL_CALL)) {
+ insns[i].code = BPF_JMP | BPF_CALL;
+ insns[i].imm = BPF_FUNC_tail_call;
+ /* fall-through */
+ }
+ if (code == (BPF_JMP | BPF_CALL) ||
+ code == (BPF_JMP | BPF_CALL_ARGS)) {
+ if (code == (BPF_JMP | BPF_CALL_ARGS))
+ insns[i].code = BPF_JMP | BPF_CALL;
+ if (!bpf_dump_raw_ok(f_cred))
+ insns[i].imm = 0;
+ continue;
+ }
+ if (BPF_CLASS(code) == BPF_LDX && BPF_MODE(code) == BPF_PROBE_MEM) {
+ insns[i].code = BPF_LDX | BPF_SIZE(code) | BPF_MEM;
+ continue;
+ }
+
+ if ((BPF_CLASS(code) == BPF_LDX || BPF_CLASS(code) == BPF_STX ||
+ BPF_CLASS(code) == BPF_ST) && BPF_MODE(code) == BPF_PROBE_MEM32) {
+ insns[i].code = BPF_CLASS(code) | BPF_SIZE(code) | BPF_MEM;
+ continue;
+ }
+
+ if (code != (BPF_LD | BPF_IMM | BPF_DW))
+ continue;
+
+ imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm;
+ map = bpf_map_from_imm(prog, imm, &off, &type);
+ if (map) {
+ insns[i].src_reg = type;
+ insns[i].imm = map->id;
+ insns[i + 1].imm = off;
+ continue;
+ }
+ }
+
+ return insns;
+}
+
+static int set_info_rec_size(struct bpf_prog_info *info)
+{
+ /*
+ * Ensure info.*_rec_size is the same as kernel expected size
+ *
+ * or
+ *
+ * Only allow zero *_rec_size if both _rec_size and _cnt are
+ * zero. In this case, the kernel will set the expected
+ * _rec_size back to the info.
+ */
+
+ if ((info->nr_func_info || info->func_info_rec_size) &&
+ info->func_info_rec_size != sizeof(struct bpf_func_info))
+ return -EINVAL;
+
+ if ((info->nr_line_info || info->line_info_rec_size) &&
+ info->line_info_rec_size != sizeof(struct bpf_line_info))
+ return -EINVAL;
+
+ if ((info->nr_jited_line_info || info->jited_line_info_rec_size) &&
+ info->jited_line_info_rec_size != sizeof(__u64))
+ return -EINVAL;
+
+ info->func_info_rec_size = sizeof(struct bpf_func_info);
+ info->line_info_rec_size = sizeof(struct bpf_line_info);
+ info->jited_line_info_rec_size = sizeof(__u64);
+
+ return 0;
+}
+
+static int bpf_prog_get_info_by_fd(struct file *file,
+ struct bpf_prog *prog,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info);
+ struct btf *attach_btf = bpf_prog_get_target_btf(prog);
+ struct bpf_prog_info info;
+ u32 info_len = attr->info.info_len;
+ struct bpf_prog_kstats stats;
+ char __user *uinsns;
+ u32 ulen;
+ int err;
+
+ err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len);
+ if (err)
+ return err;
+ info_len = min_t(u32, sizeof(info), info_len);
+
+ memset(&info, 0, sizeof(info));
+ if (copy_from_user(&info, uinfo, info_len))
return -EFAULT;
- if (size > PAGE_SIZE) /* silly large */
- return -E2BIG;
+ info.type = prog->type;
+ info.id = prog->aux->id;
+ info.load_time = prog->aux->load_time;
+ info.created_by_uid = from_kuid_munged(current_user_ns(),
+ prog->aux->user->uid);
+ info.gpl_compatible = prog->gpl_compatible;
+
+ memcpy(info.tag, prog->tag, sizeof(prog->tag));
+ memcpy(info.name, prog->aux->name, sizeof(prog->aux->name));
+
+ mutex_lock(&prog->aux->used_maps_mutex);
+ ulen = info.nr_map_ids;
+ info.nr_map_ids = prog->aux->used_map_cnt;
+ ulen = min_t(u32, info.nr_map_ids, ulen);
+ if (ulen) {
+ u32 __user *user_map_ids = u64_to_user_ptr(info.map_ids);
+ u32 i;
+
+ for (i = 0; i < ulen; i++)
+ if (put_user(prog->aux->used_maps[i]->id,
+ &user_map_ids[i])) {
+ mutex_unlock(&prog->aux->used_maps_mutex);
+ return -EFAULT;
+ }
+ }
+ mutex_unlock(&prog->aux->used_maps_mutex);
+
+ err = set_info_rec_size(&info);
+ if (err)
+ return err;
+
+ bpf_prog_get_stats(prog, &stats);
+ info.run_time_ns = stats.nsecs;
+ info.run_cnt = stats.cnt;
+ info.recursion_misses = stats.misses;
+
+ info.verified_insns = prog->aux->verified_insns;
+ if (prog->aux->btf)
+ info.btf_id = btf_obj_id(prog->aux->btf);
+
+ if (!bpf_capable()) {
+ info.jited_prog_len = 0;
+ info.xlated_prog_len = 0;
+ info.nr_jited_ksyms = 0;
+ info.nr_jited_func_lens = 0;
+ info.nr_func_info = 0;
+ info.nr_line_info = 0;
+ info.nr_jited_line_info = 0;
+ goto done;
+ }
+
+ ulen = info.xlated_prog_len;
+ info.xlated_prog_len = bpf_prog_insn_size(prog);
+ if (info.xlated_prog_len && ulen) {
+ struct bpf_insn *insns_sanitized;
+ bool fault;
- /* If we're handed a bigger struct than we know of,
- * ensure all the unknown bits are 0 - i.e. new
- * user-space does not rely on any kernel feature
- * extensions we dont know about yet.
+ if (!prog->blinded || bpf_dump_raw_ok(file->f_cred)) {
+ insns_sanitized = bpf_insn_prepare_dump(prog, file->f_cred);
+ if (!insns_sanitized)
+ return -ENOMEM;
+ uinsns = u64_to_user_ptr(info.xlated_prog_insns);
+ ulen = min_t(u32, info.xlated_prog_len, ulen);
+ fault = copy_to_user(uinsns, insns_sanitized, ulen);
+ kfree(insns_sanitized);
+ if (fault)
+ return -EFAULT;
+ } else {
+ info.xlated_prog_insns = 0;
+ }
+ }
+
+ if (bpf_prog_is_offloaded(prog->aux)) {
+ err = bpf_prog_offload_info_fill(&info, prog);
+ if (err)
+ return err;
+ goto done;
+ }
+
+ /* NOTE: the following code is supposed to be skipped for offload.
+ * bpf_prog_offload_info_fill() is the place to fill similar fields
+ * for offload.
*/
- if (size > sizeof(attr)) {
- unsigned char __user *addr;
- unsigned char __user *end;
- unsigned char val;
+ ulen = info.jited_prog_len;
+ if (prog->aux->func_cnt) {
+ u32 i;
- addr = (void __user *)uattr + sizeof(attr);
- end = (void __user *)uattr + size;
+ info.jited_prog_len = 0;
+ for (i = 0; i < prog->aux->func_cnt; i++)
+ info.jited_prog_len += prog->aux->func[i]->jited_len;
+ } else {
+ info.jited_prog_len = prog->jited_len;
+ }
- for (; addr < end; addr++) {
- err = get_user(val, addr);
- if (err)
+ if (info.jited_prog_len && ulen) {
+ if (bpf_dump_raw_ok(file->f_cred)) {
+ uinsns = u64_to_user_ptr(info.jited_prog_insns);
+ ulen = min_t(u32, info.jited_prog_len, ulen);
+
+ /* for multi-function programs, copy the JITed
+ * instructions for all the functions
+ */
+ if (prog->aux->func_cnt) {
+ u32 len, free, i;
+ u8 *img;
+
+ free = ulen;
+ for (i = 0; i < prog->aux->func_cnt; i++) {
+ len = prog->aux->func[i]->jited_len;
+ len = min_t(u32, len, free);
+ img = (u8 *) prog->aux->func[i]->bpf_func;
+ if (copy_to_user(uinsns, img, len))
+ return -EFAULT;
+ uinsns += len;
+ free -= len;
+ if (!free)
+ break;
+ }
+ } else {
+ if (copy_to_user(uinsns, prog->bpf_func, ulen))
+ return -EFAULT;
+ }
+ } else {
+ info.jited_prog_insns = 0;
+ }
+ }
+
+ ulen = info.nr_jited_ksyms;
+ info.nr_jited_ksyms = prog->aux->func_cnt ? : 1;
+ if (ulen) {
+ if (bpf_dump_raw_ok(file->f_cred)) {
+ unsigned long ksym_addr;
+ u64 __user *user_ksyms;
+ u32 i;
+
+ /* copy the address of the kernel symbol
+ * corresponding to each function
+ */
+ ulen = min_t(u32, info.nr_jited_ksyms, ulen);
+ user_ksyms = u64_to_user_ptr(info.jited_ksyms);
+ if (prog->aux->func_cnt) {
+ for (i = 0; i < ulen; i++) {
+ ksym_addr = (unsigned long)
+ prog->aux->func[i]->bpf_func;
+ if (put_user((u64) ksym_addr,
+ &user_ksyms[i]))
+ return -EFAULT;
+ }
+ } else {
+ ksym_addr = (unsigned long) prog->bpf_func;
+ if (put_user((u64) ksym_addr, &user_ksyms[0]))
+ return -EFAULT;
+ }
+ } else {
+ info.jited_ksyms = 0;
+ }
+ }
+
+ ulen = info.nr_jited_func_lens;
+ info.nr_jited_func_lens = prog->aux->func_cnt ? : 1;
+ if (ulen) {
+ if (bpf_dump_raw_ok(file->f_cred)) {
+ u32 __user *user_lens;
+ u32 func_len, i;
+
+ /* copy the JITed image lengths for each function */
+ ulen = min_t(u32, info.nr_jited_func_lens, ulen);
+ user_lens = u64_to_user_ptr(info.jited_func_lens);
+ if (prog->aux->func_cnt) {
+ for (i = 0; i < ulen; i++) {
+ func_len =
+ prog->aux->func[i]->jited_len;
+ if (put_user(func_len, &user_lens[i]))
+ return -EFAULT;
+ }
+ } else {
+ func_len = prog->jited_len;
+ if (put_user(func_len, &user_lens[0]))
+ return -EFAULT;
+ }
+ } else {
+ info.jited_func_lens = 0;
+ }
+ }
+
+ info.attach_btf_id = prog->aux->attach_btf_id;
+ if (attach_btf)
+ info.attach_btf_obj_id = btf_obj_id(attach_btf);
+
+ ulen = info.nr_func_info;
+ info.nr_func_info = prog->aux->func_info_cnt;
+ if (info.nr_func_info && ulen) {
+ char __user *user_finfo;
+
+ user_finfo = u64_to_user_ptr(info.func_info);
+ ulen = min_t(u32, info.nr_func_info, ulen);
+ if (copy_to_user(user_finfo, prog->aux->func_info,
+ info.func_info_rec_size * ulen))
+ return -EFAULT;
+ }
+
+ ulen = info.nr_line_info;
+ info.nr_line_info = prog->aux->nr_linfo;
+ if (info.nr_line_info && ulen) {
+ __u8 __user *user_linfo;
+
+ user_linfo = u64_to_user_ptr(info.line_info);
+ ulen = min_t(u32, info.nr_line_info, ulen);
+ if (copy_to_user(user_linfo, prog->aux->linfo,
+ info.line_info_rec_size * ulen))
+ return -EFAULT;
+ }
+
+ ulen = info.nr_jited_line_info;
+ if (prog->aux->jited_linfo)
+ info.nr_jited_line_info = prog->aux->nr_linfo;
+ else
+ info.nr_jited_line_info = 0;
+ if (info.nr_jited_line_info && ulen) {
+ if (bpf_dump_raw_ok(file->f_cred)) {
+ unsigned long line_addr;
+ __u64 __user *user_linfo;
+ u32 i;
+
+ user_linfo = u64_to_user_ptr(info.jited_line_info);
+ ulen = min_t(u32, info.nr_jited_line_info, ulen);
+ for (i = 0; i < ulen; i++) {
+ line_addr = (unsigned long)prog->aux->jited_linfo[i];
+ if (put_user((__u64)line_addr, &user_linfo[i]))
+ return -EFAULT;
+ }
+ } else {
+ info.jited_line_info = 0;
+ }
+ }
+
+ ulen = info.nr_prog_tags;
+ info.nr_prog_tags = prog->aux->func_cnt ? : 1;
+ if (ulen) {
+ __u8 __user (*user_prog_tags)[BPF_TAG_SIZE];
+ u32 i;
+
+ user_prog_tags = u64_to_user_ptr(info.prog_tags);
+ ulen = min_t(u32, info.nr_prog_tags, ulen);
+ if (prog->aux->func_cnt) {
+ for (i = 0; i < ulen; i++) {
+ if (copy_to_user(user_prog_tags[i],
+ prog->aux->func[i]->tag,
+ BPF_TAG_SIZE))
+ return -EFAULT;
+ }
+ } else {
+ if (copy_to_user(user_prog_tags[0],
+ prog->tag, BPF_TAG_SIZE))
+ return -EFAULT;
+ }
+ }
+
+done:
+ if (copy_to_user(uinfo, &info, info_len) ||
+ put_user(info_len, &uattr->info.info_len))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int bpf_map_get_info_by_fd(struct file *file,
+ struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ struct bpf_map_info __user *uinfo = u64_to_user_ptr(attr->info.info);
+ struct bpf_map_info info;
+ u32 info_len = attr->info.info_len;
+ int err;
+
+ err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len);
+ if (err)
+ return err;
+ info_len = min_t(u32, sizeof(info), info_len);
+
+ memset(&info, 0, sizeof(info));
+ if (copy_from_user(&info, uinfo, info_len))
+ return -EFAULT;
+
+ info.type = map->map_type;
+ info.id = map->id;
+ info.key_size = map->key_size;
+ info.value_size = map->value_size;
+ info.max_entries = map->max_entries;
+ info.map_flags = map->map_flags;
+ info.map_extra = map->map_extra;
+ memcpy(info.name, map->name, sizeof(map->name));
+
+ if (map->btf) {
+ info.btf_id = btf_obj_id(map->btf);
+ info.btf_key_type_id = map->btf_key_type_id;
+ info.btf_value_type_id = map->btf_value_type_id;
+ }
+ info.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id;
+ if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS)
+ bpf_map_struct_ops_info_fill(&info, map);
+
+ if (bpf_map_is_offloaded(map)) {
+ err = bpf_map_offload_info_fill(&info, map);
+ if (err)
+ return err;
+ }
+
+ if (info.hash) {
+ char __user *uhash = u64_to_user_ptr(info.hash);
+
+ if (!map->ops->map_get_hash)
+ return -EINVAL;
+
+ if (info.hash_size != SHA256_DIGEST_SIZE)
+ return -EINVAL;
+
+ err = map->ops->map_get_hash(map, SHA256_DIGEST_SIZE, map->sha);
+ if (err != 0)
+ return err;
+
+ if (copy_to_user(uhash, map->sha, SHA256_DIGEST_SIZE) != 0)
+ return -EFAULT;
+ } else if (info.hash_size) {
+ return -EINVAL;
+ }
+
+ if (copy_to_user(uinfo, &info, info_len) ||
+ put_user(info_len, &uattr->info.info_len))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int bpf_btf_get_info_by_fd(struct file *file,
+ struct btf *btf,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ struct bpf_btf_info __user *uinfo = u64_to_user_ptr(attr->info.info);
+ u32 info_len = attr->info.info_len;
+ int err;
+
+ err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(*uinfo), info_len);
+ if (err)
+ return err;
+
+ return btf_get_info_by_fd(btf, attr, uattr);
+}
+
+static int bpf_link_get_info_by_fd(struct file *file,
+ struct bpf_link *link,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ struct bpf_link_info __user *uinfo = u64_to_user_ptr(attr->info.info);
+ struct bpf_link_info info;
+ u32 info_len = attr->info.info_len;
+ int err;
+
+ err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len);
+ if (err)
+ return err;
+ info_len = min_t(u32, sizeof(info), info_len);
+
+ memset(&info, 0, sizeof(info));
+ if (copy_from_user(&info, uinfo, info_len))
+ return -EFAULT;
+
+ info.type = link->type;
+ info.id = link->id;
+ if (link->prog)
+ info.prog_id = link->prog->aux->id;
+
+ if (link->ops->fill_link_info) {
+ err = link->ops->fill_link_info(link, &info);
+ if (err)
+ return err;
+ }
+
+ if (copy_to_user(uinfo, &info, info_len) ||
+ put_user(info_len, &uattr->info.info_len))
+ return -EFAULT;
+
+ return 0;
+}
+
+
+static int token_get_info_by_fd(struct file *file,
+ struct bpf_token *token,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ struct bpf_token_info __user *uinfo = u64_to_user_ptr(attr->info.info);
+ u32 info_len = attr->info.info_len;
+ int err;
+
+ err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(*uinfo), info_len);
+ if (err)
+ return err;
+ return bpf_token_get_info_by_fd(token, attr, uattr);
+}
+
+#define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info
+
+static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ if (CHECK_ATTR(BPF_OBJ_GET_INFO_BY_FD))
+ return -EINVAL;
+
+ CLASS(fd, f)(attr->info.bpf_fd);
+ if (fd_empty(f))
+ return -EBADFD;
+
+ if (fd_file(f)->f_op == &bpf_prog_fops)
+ return bpf_prog_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr,
+ uattr);
+ else if (fd_file(f)->f_op == &bpf_map_fops)
+ return bpf_map_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr,
+ uattr);
+ else if (fd_file(f)->f_op == &btf_fops)
+ return bpf_btf_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr, uattr);
+ else if (fd_file(f)->f_op == &bpf_link_fops || fd_file(f)->f_op == &bpf_link_fops_poll)
+ return bpf_link_get_info_by_fd(fd_file(f), fd_file(f)->private_data,
+ attr, uattr);
+ else if (fd_file(f)->f_op == &bpf_token_fops)
+ return token_get_info_by_fd(fd_file(f), fd_file(f)->private_data,
+ attr, uattr);
+ return -EINVAL;
+}
+
+#define BPF_BTF_LOAD_LAST_FIELD btf_token_fd
+
+static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size)
+{
+ struct bpf_token *token = NULL;
+
+ if (CHECK_ATTR(BPF_BTF_LOAD))
+ return -EINVAL;
+
+ if (attr->btf_flags & ~BPF_F_TOKEN_FD)
+ return -EINVAL;
+
+ if (attr->btf_flags & BPF_F_TOKEN_FD) {
+ token = bpf_token_get_from_fd(attr->btf_token_fd);
+ if (IS_ERR(token))
+ return PTR_ERR(token);
+ if (!bpf_token_allow_cmd(token, BPF_BTF_LOAD)) {
+ bpf_token_put(token);
+ token = NULL;
+ }
+ }
+
+ if (!bpf_token_capable(token, CAP_BPF)) {
+ bpf_token_put(token);
+ return -EPERM;
+ }
+
+ bpf_token_put(token);
+
+ return btf_new_fd(attr, uattr, uattr_size);
+}
+
+#define BPF_BTF_GET_FD_BY_ID_LAST_FIELD fd_by_id_token_fd
+
+static int bpf_btf_get_fd_by_id(const union bpf_attr *attr)
+{
+ struct bpf_token *token = NULL;
+
+ if (CHECK_ATTR(BPF_BTF_GET_FD_BY_ID))
+ return -EINVAL;
+
+ if (attr->open_flags & ~BPF_F_TOKEN_FD)
+ return -EINVAL;
+
+ if (attr->open_flags & BPF_F_TOKEN_FD) {
+ token = bpf_token_get_from_fd(attr->fd_by_id_token_fd);
+ if (IS_ERR(token))
+ return PTR_ERR(token);
+ if (!bpf_token_allow_cmd(token, BPF_BTF_GET_FD_BY_ID)) {
+ bpf_token_put(token);
+ token = NULL;
+ }
+ }
+
+ if (!bpf_token_capable(token, CAP_SYS_ADMIN)) {
+ bpf_token_put(token);
+ return -EPERM;
+ }
+
+ bpf_token_put(token);
+
+ return btf_get_fd_by_id(attr->btf_id);
+}
+
+static int bpf_task_fd_query_copy(const union bpf_attr *attr,
+ union bpf_attr __user *uattr,
+ u32 prog_id, u32 fd_type,
+ const char *buf, u64 probe_offset,
+ u64 probe_addr)
+{
+ char __user *ubuf = u64_to_user_ptr(attr->task_fd_query.buf);
+ u32 len = buf ? strlen(buf) : 0, input_len;
+ int err = 0;
+
+ if (put_user(len, &uattr->task_fd_query.buf_len))
+ return -EFAULT;
+ input_len = attr->task_fd_query.buf_len;
+ if (input_len && ubuf) {
+ if (!len) {
+ /* nothing to copy, just make ubuf NULL terminated */
+ char zero = '\0';
+
+ if (put_user(zero, ubuf))
+ return -EFAULT;
+ } else {
+ err = bpf_copy_to_user(ubuf, buf, input_len, len);
+ if (err == -EFAULT)
return err;
- if (val)
- return -E2BIG;
}
- size = sizeof(attr);
}
+ if (put_user(prog_id, &uattr->task_fd_query.prog_id) ||
+ put_user(fd_type, &uattr->task_fd_query.fd_type) ||
+ put_user(probe_offset, &uattr->task_fd_query.probe_offset) ||
+ put_user(probe_addr, &uattr->task_fd_query.probe_addr))
+ return -EFAULT;
+
+ return err;
+}
+
+#define BPF_TASK_FD_QUERY_LAST_FIELD task_fd_query.probe_addr
+
+static int bpf_task_fd_query(const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ pid_t pid = attr->task_fd_query.pid;
+ u32 fd = attr->task_fd_query.fd;
+ const struct perf_event *event;
+ struct task_struct *task;
+ struct file *file;
+ int err;
+
+ if (CHECK_ATTR(BPF_TASK_FD_QUERY))
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (attr->task_fd_query.flags != 0)
+ return -EINVAL;
+
+ rcu_read_lock();
+ task = get_pid_task(find_vpid(pid), PIDTYPE_PID);
+ rcu_read_unlock();
+ if (!task)
+ return -ENOENT;
+
+ err = 0;
+ file = fget_task(task, fd);
+ put_task_struct(task);
+ if (!file)
+ return -EBADF;
+
+ if (file->f_op == &bpf_link_fops || file->f_op == &bpf_link_fops_poll) {
+ struct bpf_link *link = file->private_data;
+
+ if (link->ops == &bpf_raw_tp_link_lops) {
+ struct bpf_raw_tp_link *raw_tp =
+ container_of(link, struct bpf_raw_tp_link, link);
+ struct bpf_raw_event_map *btp = raw_tp->btp;
+
+ err = bpf_task_fd_query_copy(attr, uattr,
+ raw_tp->link.prog->aux->id,
+ BPF_FD_TYPE_RAW_TRACEPOINT,
+ btp->tp->name, 0, 0);
+ goto put_file;
+ }
+ goto out_not_supp;
+ }
+
+ event = perf_get_event(file);
+ if (!IS_ERR(event)) {
+ u64 probe_offset, probe_addr;
+ u32 prog_id, fd_type;
+ const char *buf;
+
+ err = bpf_get_perf_event_info(event, &prog_id, &fd_type,
+ &buf, &probe_offset,
+ &probe_addr, NULL);
+ if (!err)
+ err = bpf_task_fd_query_copy(attr, uattr, prog_id,
+ fd_type, buf,
+ probe_offset,
+ probe_addr);
+ goto put_file;
+ }
+
+out_not_supp:
+ err = -ENOTSUPP;
+put_file:
+ fput(file);
+ return err;
+}
+
+#define BPF_MAP_BATCH_LAST_FIELD batch.flags
+
+#define BPF_DO_BATCH(fn, ...) \
+ do { \
+ if (!fn) { \
+ err = -ENOTSUPP; \
+ goto err_put; \
+ } \
+ err = fn(__VA_ARGS__); \
+ } while (0)
+
+static int bpf_map_do_batch(const union bpf_attr *attr,
+ union bpf_attr __user *uattr,
+ int cmd)
+{
+ bool has_read = cmd == BPF_MAP_LOOKUP_BATCH ||
+ cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH;
+ bool has_write = cmd != BPF_MAP_LOOKUP_BATCH;
+ struct bpf_map *map;
+ int err;
+
+ if (CHECK_ATTR(BPF_MAP_BATCH))
+ return -EINVAL;
+
+ CLASS(fd, f)(attr->batch.map_fd);
+
+ map = __bpf_map_get(f);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+ if (has_write)
+ bpf_map_write_active_inc(map);
+ if (has_read && !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
+ err = -EPERM;
+ goto err_put;
+ }
+ if (has_write && !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
+ err = -EPERM;
+ goto err_put;
+ }
+
+ if (cmd == BPF_MAP_LOOKUP_BATCH)
+ BPF_DO_BATCH(map->ops->map_lookup_batch, map, attr, uattr);
+ else if (cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH)
+ BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch, map, attr, uattr);
+ else if (cmd == BPF_MAP_UPDATE_BATCH)
+ BPF_DO_BATCH(map->ops->map_update_batch, map, fd_file(f), attr, uattr);
+ else
+ BPF_DO_BATCH(map->ops->map_delete_batch, map, attr, uattr);
+err_put:
+ if (has_write) {
+ maybe_wait_bpf_programs(map);
+ bpf_map_write_active_dec(map);
+ }
+ return err;
+}
+
+#define BPF_LINK_CREATE_LAST_FIELD link_create.uprobe_multi.pid
+static int link_create(union bpf_attr *attr, bpfptr_t uattr)
+{
+ struct bpf_prog *prog;
+ int ret;
+
+ if (CHECK_ATTR(BPF_LINK_CREATE))
+ return -EINVAL;
+
+ if (attr->link_create.attach_type == BPF_STRUCT_OPS)
+ return bpf_struct_ops_link_create(attr);
+
+ prog = bpf_prog_get(attr->link_create.prog_fd);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ ret = bpf_prog_attach_check_attach_type(prog,
+ attr->link_create.attach_type);
+ if (ret)
+ goto out;
+
+ switch (prog->type) {
+ case BPF_PROG_TYPE_CGROUP_SKB:
+ case BPF_PROG_TYPE_CGROUP_SOCK:
+ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
+ case BPF_PROG_TYPE_SOCK_OPS:
+ case BPF_PROG_TYPE_CGROUP_DEVICE:
+ case BPF_PROG_TYPE_CGROUP_SYSCTL:
+ case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+ ret = cgroup_bpf_link_attach(attr, prog);
+ break;
+ case BPF_PROG_TYPE_EXT:
+ ret = bpf_tracing_prog_attach(prog,
+ attr->link_create.target_fd,
+ attr->link_create.target_btf_id,
+ attr->link_create.tracing.cookie,
+ attr->link_create.attach_type);
+ break;
+ case BPF_PROG_TYPE_LSM:
+ case BPF_PROG_TYPE_TRACING:
+ if (attr->link_create.attach_type != prog->expected_attach_type) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (prog->expected_attach_type == BPF_TRACE_RAW_TP)
+ ret = bpf_raw_tp_link_attach(prog, NULL, attr->link_create.tracing.cookie,
+ attr->link_create.attach_type);
+ else if (prog->expected_attach_type == BPF_TRACE_ITER)
+ ret = bpf_iter_link_attach(attr, uattr, prog);
+ else if (prog->expected_attach_type == BPF_LSM_CGROUP)
+ ret = cgroup_bpf_link_attach(attr, prog);
+ else
+ ret = bpf_tracing_prog_attach(prog,
+ attr->link_create.target_fd,
+ attr->link_create.target_btf_id,
+ attr->link_create.tracing.cookie,
+ attr->link_create.attach_type);
+ break;
+ case BPF_PROG_TYPE_FLOW_DISSECTOR:
+ case BPF_PROG_TYPE_SK_LOOKUP:
+ ret = netns_bpf_link_create(attr, prog);
+ break;
+ case BPF_PROG_TYPE_SK_MSG:
+ case BPF_PROG_TYPE_SK_SKB:
+ ret = sock_map_link_create(attr, prog);
+ break;
+#ifdef CONFIG_NET
+ case BPF_PROG_TYPE_XDP:
+ ret = bpf_xdp_link_attach(attr, prog);
+ break;
+ case BPF_PROG_TYPE_SCHED_CLS:
+ if (attr->link_create.attach_type == BPF_TCX_INGRESS ||
+ attr->link_create.attach_type == BPF_TCX_EGRESS)
+ ret = tcx_link_attach(attr, prog);
+ else
+ ret = netkit_link_attach(attr, prog);
+ break;
+ case BPF_PROG_TYPE_NETFILTER:
+ ret = bpf_nf_link_attach(attr, prog);
+ break;
+#endif
+ case BPF_PROG_TYPE_PERF_EVENT:
+ case BPF_PROG_TYPE_TRACEPOINT:
+ ret = bpf_perf_link_attach(attr, prog);
+ break;
+ case BPF_PROG_TYPE_KPROBE:
+ if (attr->link_create.attach_type == BPF_PERF_EVENT)
+ ret = bpf_perf_link_attach(attr, prog);
+ else if (attr->link_create.attach_type == BPF_TRACE_KPROBE_MULTI ||
+ attr->link_create.attach_type == BPF_TRACE_KPROBE_SESSION)
+ ret = bpf_kprobe_multi_link_attach(attr, prog);
+ else if (attr->link_create.attach_type == BPF_TRACE_UPROBE_MULTI ||
+ attr->link_create.attach_type == BPF_TRACE_UPROBE_SESSION)
+ ret = bpf_uprobe_multi_link_attach(attr, prog);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+
+out:
+ if (ret < 0)
+ bpf_prog_put(prog);
+ return ret;
+}
+
+static int link_update_map(struct bpf_link *link, union bpf_attr *attr)
+{
+ struct bpf_map *new_map, *old_map = NULL;
+ int ret;
+
+ new_map = bpf_map_get(attr->link_update.new_map_fd);
+ if (IS_ERR(new_map))
+ return PTR_ERR(new_map);
+
+ if (attr->link_update.flags & BPF_F_REPLACE) {
+ old_map = bpf_map_get(attr->link_update.old_map_fd);
+ if (IS_ERR(old_map)) {
+ ret = PTR_ERR(old_map);
+ goto out_put;
+ }
+ } else if (attr->link_update.old_map_fd) {
+ ret = -EINVAL;
+ goto out_put;
+ }
+
+ ret = link->ops->update_map(link, new_map, old_map);
+
+ if (old_map)
+ bpf_map_put(old_map);
+out_put:
+ bpf_map_put(new_map);
+ return ret;
+}
+
+#define BPF_LINK_UPDATE_LAST_FIELD link_update.old_prog_fd
+
+static int link_update(union bpf_attr *attr)
+{
+ struct bpf_prog *old_prog = NULL, *new_prog;
+ struct bpf_link *link;
+ u32 flags;
+ int ret;
+
+ if (CHECK_ATTR(BPF_LINK_UPDATE))
+ return -EINVAL;
+
+ flags = attr->link_update.flags;
+ if (flags & ~BPF_F_REPLACE)
+ return -EINVAL;
+
+ link = bpf_link_get_from_fd(attr->link_update.link_fd);
+ if (IS_ERR(link))
+ return PTR_ERR(link);
+
+ if (link->ops->update_map) {
+ ret = link_update_map(link, attr);
+ goto out_put_link;
+ }
+
+ new_prog = bpf_prog_get(attr->link_update.new_prog_fd);
+ if (IS_ERR(new_prog)) {
+ ret = PTR_ERR(new_prog);
+ goto out_put_link;
+ }
+
+ if (flags & BPF_F_REPLACE) {
+ old_prog = bpf_prog_get(attr->link_update.old_prog_fd);
+ if (IS_ERR(old_prog)) {
+ ret = PTR_ERR(old_prog);
+ old_prog = NULL;
+ goto out_put_progs;
+ }
+ } else if (attr->link_update.old_prog_fd) {
+ ret = -EINVAL;
+ goto out_put_progs;
+ }
+
+ if (link->ops->update_prog)
+ ret = link->ops->update_prog(link, new_prog, old_prog);
+ else
+ ret = -EINVAL;
+
+out_put_progs:
+ if (old_prog)
+ bpf_prog_put(old_prog);
+ if (ret)
+ bpf_prog_put(new_prog);
+out_put_link:
+ bpf_link_put_direct(link);
+ return ret;
+}
+
+#define BPF_LINK_DETACH_LAST_FIELD link_detach.link_fd
+
+static int link_detach(union bpf_attr *attr)
+{
+ struct bpf_link *link;
+ int ret;
+
+ if (CHECK_ATTR(BPF_LINK_DETACH))
+ return -EINVAL;
+
+ link = bpf_link_get_from_fd(attr->link_detach.link_fd);
+ if (IS_ERR(link))
+ return PTR_ERR(link);
+
+ if (link->ops->detach)
+ ret = link->ops->detach(link);
+ else
+ ret = -EOPNOTSUPP;
+
+ bpf_link_put_direct(link);
+ return ret;
+}
+
+struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link)
+{
+ return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? link : ERR_PTR(-ENOENT);
+}
+EXPORT_SYMBOL(bpf_link_inc_not_zero);
+
+struct bpf_link *bpf_link_by_id(u32 id)
+{
+ struct bpf_link *link;
+
+ if (!id)
+ return ERR_PTR(-ENOENT);
+
+ spin_lock_bh(&link_idr_lock);
+ /* before link is "settled", ID is 0, pretend it doesn't exist yet */
+ link = idr_find(&link_idr, id);
+ if (link) {
+ if (link->id)
+ link = bpf_link_inc_not_zero(link);
+ else
+ link = ERR_PTR(-EAGAIN);
+ } else {
+ link = ERR_PTR(-ENOENT);
+ }
+ spin_unlock_bh(&link_idr_lock);
+ return link;
+}
+
+struct bpf_link *bpf_link_get_curr_or_next(u32 *id)
+{
+ struct bpf_link *link;
+
+ spin_lock_bh(&link_idr_lock);
+again:
+ link = idr_get_next(&link_idr, id);
+ if (link) {
+ link = bpf_link_inc_not_zero(link);
+ if (IS_ERR(link)) {
+ (*id)++;
+ goto again;
+ }
+ }
+ spin_unlock_bh(&link_idr_lock);
+
+ return link;
+}
+
+#define BPF_LINK_GET_FD_BY_ID_LAST_FIELD link_id
+
+static int bpf_link_get_fd_by_id(const union bpf_attr *attr)
+{
+ struct bpf_link *link;
+ u32 id = attr->link_id;
+ int fd;
+
+ if (CHECK_ATTR(BPF_LINK_GET_FD_BY_ID))
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ link = bpf_link_by_id(id);
+ if (IS_ERR(link))
+ return PTR_ERR(link);
+
+ fd = bpf_link_new_fd(link);
+ if (fd < 0)
+ bpf_link_put_direct(link);
+
+ return fd;
+}
+
+DEFINE_MUTEX(bpf_stats_enabled_mutex);
+
+static int bpf_stats_release(struct inode *inode, struct file *file)
+{
+ mutex_lock(&bpf_stats_enabled_mutex);
+ static_key_slow_dec(&bpf_stats_enabled_key.key);
+ mutex_unlock(&bpf_stats_enabled_mutex);
+ return 0;
+}
+
+static const struct file_operations bpf_stats_fops = {
+ .release = bpf_stats_release,
+};
+
+static int bpf_enable_runtime_stats(void)
+{
+ int fd;
+
+ mutex_lock(&bpf_stats_enabled_mutex);
+
+ /* Set a very high limit to avoid overflow */
+ if (static_key_count(&bpf_stats_enabled_key.key) > INT_MAX / 2) {
+ mutex_unlock(&bpf_stats_enabled_mutex);
+ return -EBUSY;
+ }
+
+ fd = anon_inode_getfd("bpf-stats", &bpf_stats_fops, NULL, O_CLOEXEC);
+ if (fd >= 0)
+ static_key_slow_inc(&bpf_stats_enabled_key.key);
+
+ mutex_unlock(&bpf_stats_enabled_mutex);
+ return fd;
+}
+
+#define BPF_ENABLE_STATS_LAST_FIELD enable_stats.type
+
+static int bpf_enable_stats(union bpf_attr *attr)
+{
+
+ if (CHECK_ATTR(BPF_ENABLE_STATS))
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ switch (attr->enable_stats.type) {
+ case BPF_STATS_RUN_TIME:
+ return bpf_enable_runtime_stats();
+ default:
+ break;
+ }
+ return -EINVAL;
+}
+
+#define BPF_ITER_CREATE_LAST_FIELD iter_create.flags
+
+static int bpf_iter_create(union bpf_attr *attr)
+{
+ struct bpf_link *link;
+ int err;
+
+ if (CHECK_ATTR(BPF_ITER_CREATE))
+ return -EINVAL;
+
+ if (attr->iter_create.flags)
+ return -EINVAL;
+
+ link = bpf_link_get_from_fd(attr->iter_create.link_fd);
+ if (IS_ERR(link))
+ return PTR_ERR(link);
+
+ err = bpf_iter_new_fd(link);
+ bpf_link_put_direct(link);
+
+ return err;
+}
+
+#define BPF_PROG_BIND_MAP_LAST_FIELD prog_bind_map.flags
+
+static int bpf_prog_bind_map(union bpf_attr *attr)
+{
+ struct bpf_prog *prog;
+ struct bpf_map *map;
+ struct bpf_map **used_maps_old, **used_maps_new;
+ int i, ret = 0;
+
+ if (CHECK_ATTR(BPF_PROG_BIND_MAP))
+ return -EINVAL;
+
+ if (attr->prog_bind_map.flags)
+ return -EINVAL;
+
+ prog = bpf_prog_get(attr->prog_bind_map.prog_fd);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ map = bpf_map_get(attr->prog_bind_map.map_fd);
+ if (IS_ERR(map)) {
+ ret = PTR_ERR(map);
+ goto out_prog_put;
+ }
+
+ mutex_lock(&prog->aux->used_maps_mutex);
+
+ used_maps_old = prog->aux->used_maps;
+
+ for (i = 0; i < prog->aux->used_map_cnt; i++)
+ if (used_maps_old[i] == map) {
+ bpf_map_put(map);
+ goto out_unlock;
+ }
+
+ used_maps_new = kmalloc_array(prog->aux->used_map_cnt + 1,
+ sizeof(used_maps_new[0]),
+ GFP_KERNEL);
+ if (!used_maps_new) {
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
+
+ /* The bpf program will not access the bpf map, but for the sake of
+ * simplicity, increase sleepable_refcnt for sleepable program as well.
+ */
+ if (prog->sleepable)
+ atomic64_inc(&map->sleepable_refcnt);
+ memcpy(used_maps_new, used_maps_old,
+ sizeof(used_maps_old[0]) * prog->aux->used_map_cnt);
+ used_maps_new[prog->aux->used_map_cnt] = map;
+
+ prog->aux->used_map_cnt++;
+ prog->aux->used_maps = used_maps_new;
+
+ kfree(used_maps_old);
+
+out_unlock:
+ mutex_unlock(&prog->aux->used_maps_mutex);
+
+ if (ret)
+ bpf_map_put(map);
+out_prog_put:
+ bpf_prog_put(prog);
+ return ret;
+}
+
+#define BPF_TOKEN_CREATE_LAST_FIELD token_create.bpffs_fd
+
+static int token_create(union bpf_attr *attr)
+{
+ if (CHECK_ATTR(BPF_TOKEN_CREATE))
+ return -EINVAL;
+
+ /* no flags are supported yet */
+ if (attr->token_create.flags)
+ return -EINVAL;
+
+ return bpf_token_create(attr);
+}
+
+#define BPF_PROG_STREAM_READ_BY_FD_LAST_FIELD prog_stream_read.prog_fd
+
+static int prog_stream_read(union bpf_attr *attr)
+{
+ char __user *buf = u64_to_user_ptr(attr->prog_stream_read.stream_buf);
+ u32 len = attr->prog_stream_read.stream_buf_len;
+ struct bpf_prog *prog;
+ int ret;
+
+ if (CHECK_ATTR(BPF_PROG_STREAM_READ_BY_FD))
+ return -EINVAL;
+
+ prog = bpf_prog_get(attr->prog_stream_read.prog_fd);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ ret = bpf_prog_stream_read(prog, attr->prog_stream_read.stream_id, buf, len);
+ bpf_prog_put(prog);
+
+ return ret;
+}
+
+static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size)
+{
+ union bpf_attr attr;
+ int err;
+
+ err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size);
+ if (err)
+ return err;
+ size = min_t(u32, size, sizeof(attr));
+
/* copy attributes from user space, may be less than sizeof(bpf_attr) */
- if (copy_from_user(&attr, uattr, size) != 0)
+ memset(&attr, 0, sizeof(attr));
+ if (copy_from_bpfptr(&attr, uattr, size) != 0)
return -EFAULT;
+ err = security_bpf(cmd, &attr, size, uattr.is_kernel);
+ if (err < 0)
+ return err;
+
switch (cmd) {
case BPF_MAP_CREATE:
- err = map_create(&attr);
+ err = map_create(&attr, uattr);
break;
case BPF_MAP_LOOKUP_ELEM:
err = map_lookup_elem(&attr);
break;
case BPF_MAP_UPDATE_ELEM:
- err = map_update_elem(&attr);
+ err = map_update_elem(&attr, uattr);
break;
case BPF_MAP_DELETE_ELEM:
- err = map_delete_elem(&attr);
+ err = map_delete_elem(&attr, uattr);
break;
case BPF_MAP_GET_NEXT_KEY:
err = map_get_next_key(&attr);
break;
+ case BPF_MAP_FREEZE:
+ err = map_freeze(&attr);
+ break;
case BPF_PROG_LOAD:
- err = bpf_prog_load(&attr);
+ err = bpf_prog_load(&attr, uattr, size);
+ break;
+ case BPF_OBJ_PIN:
+ err = bpf_obj_pin(&attr);
+ break;
+ case BPF_OBJ_GET:
+ err = bpf_obj_get(&attr);
+ break;
+ case BPF_PROG_ATTACH:
+ err = bpf_prog_attach(&attr);
+ break;
+ case BPF_PROG_DETACH:
+ err = bpf_prog_detach(&attr);
+ break;
+ case BPF_PROG_QUERY:
+ err = bpf_prog_query(&attr, uattr.user);
+ break;
+ case BPF_PROG_TEST_RUN:
+ err = bpf_prog_test_run(&attr, uattr.user);
+ break;
+ case BPF_PROG_GET_NEXT_ID:
+ err = bpf_obj_get_next_id(&attr, uattr.user,
+ &prog_idr, &prog_idr_lock);
+ break;
+ case BPF_MAP_GET_NEXT_ID:
+ err = bpf_obj_get_next_id(&attr, uattr.user,
+ &map_idr, &map_idr_lock);
+ break;
+ case BPF_BTF_GET_NEXT_ID:
+ err = bpf_obj_get_next_id(&attr, uattr.user,
+ &btf_idr, &btf_idr_lock);
+ break;
+ case BPF_PROG_GET_FD_BY_ID:
+ err = bpf_prog_get_fd_by_id(&attr);
+ break;
+ case BPF_MAP_GET_FD_BY_ID:
+ err = bpf_map_get_fd_by_id(&attr);
+ break;
+ case BPF_OBJ_GET_INFO_BY_FD:
+ err = bpf_obj_get_info_by_fd(&attr, uattr.user);
+ break;
+ case BPF_RAW_TRACEPOINT_OPEN:
+ err = bpf_raw_tracepoint_open(&attr);
+ break;
+ case BPF_BTF_LOAD:
+ err = bpf_btf_load(&attr, uattr, size);
+ break;
+ case BPF_BTF_GET_FD_BY_ID:
+ err = bpf_btf_get_fd_by_id(&attr);
+ break;
+ case BPF_TASK_FD_QUERY:
+ err = bpf_task_fd_query(&attr, uattr.user);
+ break;
+ case BPF_MAP_LOOKUP_AND_DELETE_ELEM:
+ err = map_lookup_and_delete_elem(&attr);
+ break;
+ case BPF_MAP_LOOKUP_BATCH:
+ err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_LOOKUP_BATCH);
+ break;
+ case BPF_MAP_LOOKUP_AND_DELETE_BATCH:
+ err = bpf_map_do_batch(&attr, uattr.user,
+ BPF_MAP_LOOKUP_AND_DELETE_BATCH);
+ break;
+ case BPF_MAP_UPDATE_BATCH:
+ err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_UPDATE_BATCH);
+ break;
+ case BPF_MAP_DELETE_BATCH:
+ err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_DELETE_BATCH);
+ break;
+ case BPF_LINK_CREATE:
+ err = link_create(&attr, uattr);
+ break;
+ case BPF_LINK_UPDATE:
+ err = link_update(&attr);
+ break;
+ case BPF_LINK_GET_FD_BY_ID:
+ err = bpf_link_get_fd_by_id(&attr);
+ break;
+ case BPF_LINK_GET_NEXT_ID:
+ err = bpf_obj_get_next_id(&attr, uattr.user,
+ &link_idr, &link_idr_lock);
+ break;
+ case BPF_ENABLE_STATS:
+ err = bpf_enable_stats(&attr);
+ break;
+ case BPF_ITER_CREATE:
+ err = bpf_iter_create(&attr);
+ break;
+ case BPF_LINK_DETACH:
+ err = link_detach(&attr);
+ break;
+ case BPF_PROG_BIND_MAP:
+ err = bpf_prog_bind_map(&attr);
+ break;
+ case BPF_TOKEN_CREATE:
+ err = token_create(&attr);
+ break;
+ case BPF_PROG_STREAM_READ_BY_FD:
+ err = prog_stream_read(&attr);
break;
default:
err = -EINVAL;
@@ -619,3 +6268,261 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
return err;
}
+
+SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
+{
+ return __sys_bpf(cmd, USER_BPFPTR(uattr), size);
+}
+
+static bool syscall_prog_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ if (off < 0 || off >= U16_MAX)
+ return false;
+ if (off % size != 0)
+ return false;
+ return true;
+}
+
+BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size)
+{
+ switch (cmd) {
+ case BPF_MAP_CREATE:
+ case BPF_MAP_DELETE_ELEM:
+ case BPF_MAP_UPDATE_ELEM:
+ case BPF_MAP_FREEZE:
+ case BPF_MAP_GET_FD_BY_ID:
+ case BPF_PROG_LOAD:
+ case BPF_BTF_LOAD:
+ case BPF_LINK_CREATE:
+ case BPF_RAW_TRACEPOINT_OPEN:
+ break;
+ default:
+ return -EINVAL;
+ }
+ return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size);
+}
+
+
+/* To shut up -Wmissing-prototypes.
+ * This function is used by the kernel light skeleton
+ * to load bpf programs when modules are loaded or during kernel boot.
+ * See tools/lib/bpf/skel_internal.h
+ */
+int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size);
+
+int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size)
+{
+ struct bpf_prog * __maybe_unused prog;
+ struct bpf_tramp_run_ctx __maybe_unused run_ctx;
+
+ switch (cmd) {
+#ifdef CONFIG_BPF_JIT /* __bpf_prog_enter_sleepable used by trampoline and JIT */
+ case BPF_PROG_TEST_RUN:
+ if (attr->test.data_in || attr->test.data_out ||
+ attr->test.ctx_out || attr->test.duration ||
+ attr->test.repeat || attr->test.flags)
+ return -EINVAL;
+
+ prog = bpf_prog_get_type(attr->test.prog_fd, BPF_PROG_TYPE_SYSCALL);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ if (attr->test.ctx_size_in < prog->aux->max_ctx_offset ||
+ attr->test.ctx_size_in > U16_MAX) {
+ bpf_prog_put(prog);
+ return -EINVAL;
+ }
+
+ run_ctx.bpf_cookie = 0;
+ if (!__bpf_prog_enter_sleepable_recur(prog, &run_ctx)) {
+ /* recursion detected */
+ __bpf_prog_exit_sleepable_recur(prog, 0, &run_ctx);
+ bpf_prog_put(prog);
+ return -EBUSY;
+ }
+ attr->test.retval = bpf_prog_run(prog, (void *) (long) attr->test.ctx_in);
+ __bpf_prog_exit_sleepable_recur(prog, 0 /* bpf_prog_run does runtime stats */,
+ &run_ctx);
+ bpf_prog_put(prog);
+ return 0;
+#endif
+ default:
+ return ____bpf_sys_bpf(cmd, attr, size);
+ }
+}
+EXPORT_SYMBOL_NS(kern_sys_bpf, "BPF_INTERNAL");
+
+static const struct bpf_func_proto bpf_sys_bpf_proto = {
+ .func = bpf_sys_bpf,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_ANYTHING,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE,
+};
+
+const struct bpf_func_proto * __weak
+tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ return bpf_base_func_proto(func_id, prog);
+}
+
+BPF_CALL_1(bpf_sys_close, u32, fd)
+{
+ /* When bpf program calls this helper there should not be
+ * an fdget() without matching completed fdput().
+ * This helper is allowed in the following callchain only:
+ * sys_bpf->prog_test_run->bpf_prog->bpf_sys_close
+ */
+ return close_fd(fd);
+}
+
+static const struct bpf_func_proto bpf_sys_close_proto = {
+ .func = bpf_sys_close,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_kallsyms_lookup_name, const char *, name, int, name_sz, int, flags, u64 *, res)
+{
+ *res = 0;
+ if (flags)
+ return -EINVAL;
+
+ if (name_sz <= 1 || name[name_sz - 1])
+ return -EINVAL;
+
+ if (!bpf_dump_raw_ok(current_cred()))
+ return -EPERM;
+
+ *res = kallsyms_lookup_name(name);
+ return *res ? 0 : -ENOENT;
+}
+
+static const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = {
+ .func = bpf_kallsyms_lookup_name,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED,
+ .arg4_size = sizeof(u64),
+};
+
+static const struct bpf_func_proto *
+syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_sys_bpf:
+ return !bpf_token_capable(prog->aux->token, CAP_PERFMON)
+ ? NULL : &bpf_sys_bpf_proto;
+ case BPF_FUNC_btf_find_by_name_kind:
+ return &bpf_btf_find_by_name_kind_proto;
+ case BPF_FUNC_sys_close:
+ return &bpf_sys_close_proto;
+ case BPF_FUNC_kallsyms_lookup_name:
+ return &bpf_kallsyms_lookup_name_proto;
+ default:
+ return tracing_prog_func_proto(func_id, prog);
+ }
+}
+
+const struct bpf_verifier_ops bpf_syscall_verifier_ops = {
+ .get_func_proto = syscall_prog_func_proto,
+ .is_valid_access = syscall_prog_is_valid_access,
+};
+
+const struct bpf_prog_ops bpf_syscall_prog_ops = {
+ .test_run = bpf_prog_test_run_syscall,
+};
+
+#ifdef CONFIG_SYSCTL
+static int bpf_stats_handler(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct static_key *key = (struct static_key *)table->data;
+ static int saved_val;
+ int val, ret;
+ struct ctl_table tmp = {
+ .data = &val,
+ .maxlen = sizeof(val),
+ .mode = table->mode,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ };
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ mutex_lock(&bpf_stats_enabled_mutex);
+ val = saved_val;
+ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+ if (write && !ret && val != saved_val) {
+ if (val)
+ static_key_slow_inc(key);
+ else
+ static_key_slow_dec(key);
+ saved_val = val;
+ }
+ mutex_unlock(&bpf_stats_enabled_mutex);
+ return ret;
+}
+
+void __weak unpriv_ebpf_notify(int new_state)
+{
+}
+
+static int bpf_unpriv_handler(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret, unpriv_enable = *(int *)table->data;
+ bool locked_state = unpriv_enable == 1;
+ struct ctl_table tmp = *table;
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ tmp.data = &unpriv_enable;
+ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+ if (write && !ret) {
+ if (locked_state && unpriv_enable != 1)
+ return -EPERM;
+ *(int *)table->data = unpriv_enable;
+ }
+
+ if (write)
+ unpriv_ebpf_notify(unpriv_enable);
+
+ return ret;
+}
+
+static const struct ctl_table bpf_syscall_table[] = {
+ {
+ .procname = "unprivileged_bpf_disabled",
+ .data = &sysctl_unprivileged_bpf_disabled,
+ .maxlen = sizeof(sysctl_unprivileged_bpf_disabled),
+ .mode = 0644,
+ .proc_handler = bpf_unpriv_handler,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_TWO,
+ },
+ {
+ .procname = "bpf_stats_enabled",
+ .data = &bpf_stats_enabled_key.key,
+ .mode = 0644,
+ .proc_handler = bpf_stats_handler,
+ },
+};
+
+static int __init bpf_syscall_sysctl_init(void)
+{
+ register_sysctl_init("kernel", bpf_syscall_table);
+ return 0;
+}
+late_initcall(bpf_syscall_sysctl_init);
+#endif /* CONFIG_SYSCTL */
diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c
new file mode 100644
index 000000000000..9cbe15ce3540
--- /dev/null
+++ b/kernel/bpf/sysfs_btf.c
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Provide kernel BTF information for introspection and use by eBPF tools.
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/kobject.h>
+#include <linux/init.h>
+#include <linux/sysfs.h>
+#include <linux/mm.h>
+#include <linux/io.h>
+#include <linux/btf.h>
+
+/* See scripts/link-vmlinux.sh, gen_btf() func for details */
+extern char __start_BTF[];
+extern char __stop_BTF[];
+
+static int btf_sysfs_vmlinux_mmap(struct file *filp, struct kobject *kobj,
+ const struct bin_attribute *attr,
+ struct vm_area_struct *vma)
+{
+ unsigned long pages = PAGE_ALIGN(attr->size) >> PAGE_SHIFT;
+ size_t vm_size = vma->vm_end - vma->vm_start;
+ phys_addr_t addr = __pa_symbol(__start_BTF);
+ unsigned long pfn = addr >> PAGE_SHIFT;
+
+ if (attr->private != __start_BTF || !PAGE_ALIGNED(addr))
+ return -EINVAL;
+
+ if (vma->vm_pgoff)
+ return -EINVAL;
+
+ if (vma->vm_flags & (VM_WRITE | VM_EXEC | VM_MAYSHARE))
+ return -EACCES;
+
+ if (pfn + pages < pfn)
+ return -EINVAL;
+
+ if ((vm_size >> PAGE_SHIFT) > pages)
+ return -EINVAL;
+
+ vm_flags_mod(vma, VM_DONTDUMP, VM_MAYEXEC | VM_MAYWRITE);
+ return remap_pfn_range(vma, vma->vm_start, pfn, vm_size, vma->vm_page_prot);
+}
+
+static struct bin_attribute bin_attr_btf_vmlinux __ro_after_init = {
+ .attr = { .name = "vmlinux", .mode = 0444, },
+ .read = sysfs_bin_attr_simple_read,
+ .mmap = btf_sysfs_vmlinux_mmap,
+};
+
+struct kobject *btf_kobj;
+
+static int __init btf_vmlinux_init(void)
+{
+ bin_attr_btf_vmlinux.private = __start_BTF;
+ bin_attr_btf_vmlinux.size = __stop_BTF - __start_BTF;
+
+ if (bin_attr_btf_vmlinux.size == 0)
+ return 0;
+
+ btf_kobj = kobject_create_and_add("btf", kernel_kobj);
+ if (!btf_kobj)
+ return -ENOMEM;
+
+ return sysfs_create_bin_file(btf_kobj, &bin_attr_btf_vmlinux);
+}
+
+subsys_initcall(btf_vmlinux_init);
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
new file mode 100644
index 000000000000..98d9b4c0daff
--- /dev/null
+++ b/kernel/bpf/task_iter.c
@@ -0,0 +1,1070 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2020 Facebook */
+
+#include <linux/init.h>
+#include <linux/namei.h>
+#include <linux/pid_namespace.h>
+#include <linux/fs.h>
+#include <linux/filter.h>
+#include <linux/bpf_mem_alloc.h>
+#include <linux/btf_ids.h>
+#include <linux/mm_types.h>
+#include "mmap_unlock_work.h"
+
+static const char * const iter_task_type_names[] = {
+ "ALL",
+ "TID",
+ "PID",
+};
+
+struct bpf_iter_seq_task_common {
+ struct pid_namespace *ns;
+ enum bpf_iter_task_type type;
+ u32 pid;
+ u32 pid_visiting;
+};
+
+struct bpf_iter_seq_task_info {
+ /* The first field must be struct bpf_iter_seq_task_common.
+ * this is assumed by {init, fini}_seq_pidns() callback functions.
+ */
+ struct bpf_iter_seq_task_common common;
+ u32 tid;
+};
+
+static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_common *common,
+ u32 *tid,
+ bool skip_if_dup_files)
+{
+ struct task_struct *task;
+ struct pid *pid;
+ u32 next_tid;
+
+ if (!*tid) {
+ /* The first time, the iterator calls this function. */
+ pid = find_pid_ns(common->pid, common->ns);
+ task = get_pid_task(pid, PIDTYPE_TGID);
+ if (!task)
+ return NULL;
+
+ *tid = common->pid;
+ common->pid_visiting = common->pid;
+
+ return task;
+ }
+
+ /* If the control returns to user space and comes back to the
+ * kernel again, *tid and common->pid_visiting should be the
+ * same for task_seq_start() to pick up the correct task.
+ */
+ if (*tid == common->pid_visiting) {
+ pid = find_pid_ns(common->pid_visiting, common->ns);
+ task = get_pid_task(pid, PIDTYPE_PID);
+
+ return task;
+ }
+
+ task = find_task_by_pid_ns(common->pid_visiting, common->ns);
+ if (!task)
+ return NULL;
+
+retry:
+ task = __next_thread(task);
+ if (!task)
+ return NULL;
+
+ next_tid = __task_pid_nr_ns(task, PIDTYPE_PID, common->ns);
+ if (!next_tid)
+ goto retry;
+
+ if (skip_if_dup_files && task->files == task->group_leader->files)
+ goto retry;
+
+ *tid = common->pid_visiting = next_tid;
+ get_task_struct(task);
+ return task;
+}
+
+static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *common,
+ u32 *tid,
+ bool skip_if_dup_files)
+{
+ struct task_struct *task = NULL;
+ struct pid *pid;
+
+ if (common->type == BPF_TASK_ITER_TID) {
+ if (*tid && *tid != common->pid)
+ return NULL;
+ rcu_read_lock();
+ pid = find_pid_ns(common->pid, common->ns);
+ if (pid) {
+ task = get_pid_task(pid, PIDTYPE_PID);
+ *tid = common->pid;
+ }
+ rcu_read_unlock();
+
+ return task;
+ }
+
+ if (common->type == BPF_TASK_ITER_TGID) {
+ rcu_read_lock();
+ task = task_group_seq_get_next(common, tid, skip_if_dup_files);
+ rcu_read_unlock();
+
+ return task;
+ }
+
+ rcu_read_lock();
+retry:
+ pid = find_ge_pid(*tid, common->ns);
+ if (pid) {
+ *tid = pid_nr_ns(pid, common->ns);
+ task = get_pid_task(pid, PIDTYPE_PID);
+ if (!task) {
+ ++*tid;
+ goto retry;
+ } else if (skip_if_dup_files && !thread_group_leader(task) &&
+ task->files == task->group_leader->files) {
+ put_task_struct(task);
+ task = NULL;
+ ++*tid;
+ goto retry;
+ }
+ }
+ rcu_read_unlock();
+
+ return task;
+}
+
+static void *task_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct bpf_iter_seq_task_info *info = seq->private;
+ struct task_struct *task;
+
+ task = task_seq_get_next(&info->common, &info->tid, false);
+ if (!task)
+ return NULL;
+
+ if (*pos == 0)
+ ++*pos;
+ return task;
+}
+
+static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct bpf_iter_seq_task_info *info = seq->private;
+ struct task_struct *task;
+
+ ++*pos;
+ ++info->tid;
+ put_task_struct((struct task_struct *)v);
+ task = task_seq_get_next(&info->common, &info->tid, false);
+ if (!task)
+ return NULL;
+
+ return task;
+}
+
+struct bpf_iter__task {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct task_struct *, task);
+};
+
+DEFINE_BPF_ITER_FUNC(task, struct bpf_iter_meta *meta, struct task_struct *task)
+
+static int __task_seq_show(struct seq_file *seq, struct task_struct *task,
+ bool in_stop)
+{
+ struct bpf_iter_meta meta;
+ struct bpf_iter__task ctx;
+ struct bpf_prog *prog;
+
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, in_stop);
+ if (!prog)
+ return 0;
+
+ ctx.meta = &meta;
+ ctx.task = task;
+ return bpf_iter_run_prog(prog, &ctx);
+}
+
+static int task_seq_show(struct seq_file *seq, void *v)
+{
+ return __task_seq_show(seq, v, false);
+}
+
+static void task_seq_stop(struct seq_file *seq, void *v)
+{
+ if (!v)
+ (void)__task_seq_show(seq, v, true);
+ else
+ put_task_struct((struct task_struct *)v);
+}
+
+static int bpf_iter_attach_task(struct bpf_prog *prog,
+ union bpf_iter_link_info *linfo,
+ struct bpf_iter_aux_info *aux)
+{
+ unsigned int flags;
+ struct pid *pid;
+ pid_t tgid;
+
+ if ((!!linfo->task.tid + !!linfo->task.pid + !!linfo->task.pid_fd) > 1)
+ return -EINVAL;
+
+ aux->task.type = BPF_TASK_ITER_ALL;
+ if (linfo->task.tid != 0) {
+ aux->task.type = BPF_TASK_ITER_TID;
+ aux->task.pid = linfo->task.tid;
+ }
+ if (linfo->task.pid != 0) {
+ aux->task.type = BPF_TASK_ITER_TGID;
+ aux->task.pid = linfo->task.pid;
+ }
+ if (linfo->task.pid_fd != 0) {
+ aux->task.type = BPF_TASK_ITER_TGID;
+
+ pid = pidfd_get_pid(linfo->task.pid_fd, &flags);
+ if (IS_ERR(pid))
+ return PTR_ERR(pid);
+
+ tgid = pid_nr_ns(pid, task_active_pid_ns(current));
+ aux->task.pid = tgid;
+ put_pid(pid);
+ }
+
+ return 0;
+}
+
+static const struct seq_operations task_seq_ops = {
+ .start = task_seq_start,
+ .next = task_seq_next,
+ .stop = task_seq_stop,
+ .show = task_seq_show,
+};
+
+struct bpf_iter_seq_task_file_info {
+ /* The first field must be struct bpf_iter_seq_task_common.
+ * this is assumed by {init, fini}_seq_pidns() callback functions.
+ */
+ struct bpf_iter_seq_task_common common;
+ struct task_struct *task;
+ u32 tid;
+ u32 fd;
+};
+
+static struct file *
+task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
+{
+ u32 saved_tid = info->tid;
+ struct task_struct *curr_task;
+ unsigned int curr_fd = info->fd;
+ struct file *f;
+
+ /* If this function returns a non-NULL file object,
+ * it held a reference to the task/file.
+ * Otherwise, it does not hold any reference.
+ */
+again:
+ if (info->task) {
+ curr_task = info->task;
+ curr_fd = info->fd;
+ } else {
+ curr_task = task_seq_get_next(&info->common, &info->tid, true);
+ if (!curr_task) {
+ info->task = NULL;
+ return NULL;
+ }
+
+ /* set info->task */
+ info->task = curr_task;
+ if (saved_tid == info->tid)
+ curr_fd = info->fd;
+ else
+ curr_fd = 0;
+ }
+
+ f = fget_task_next(curr_task, &curr_fd);
+ if (f) {
+ /* set info->fd */
+ info->fd = curr_fd;
+ return f;
+ }
+
+ /* the current task is done, go to the next task */
+ put_task_struct(curr_task);
+
+ if (info->common.type == BPF_TASK_ITER_TID) {
+ info->task = NULL;
+ return NULL;
+ }
+
+ info->task = NULL;
+ info->fd = 0;
+ saved_tid = ++(info->tid);
+ goto again;
+}
+
+static void *task_file_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct bpf_iter_seq_task_file_info *info = seq->private;
+ struct file *file;
+
+ info->task = NULL;
+ file = task_file_seq_get_next(info);
+ if (file && *pos == 0)
+ ++*pos;
+
+ return file;
+}
+
+static void *task_file_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct bpf_iter_seq_task_file_info *info = seq->private;
+
+ ++*pos;
+ ++info->fd;
+ fput((struct file *)v);
+ return task_file_seq_get_next(info);
+}
+
+struct bpf_iter__task_file {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct task_struct *, task);
+ u32 fd __aligned(8);
+ __bpf_md_ptr(struct file *, file);
+};
+
+DEFINE_BPF_ITER_FUNC(task_file, struct bpf_iter_meta *meta,
+ struct task_struct *task, u32 fd,
+ struct file *file)
+
+static int __task_file_seq_show(struct seq_file *seq, struct file *file,
+ bool in_stop)
+{
+ struct bpf_iter_seq_task_file_info *info = seq->private;
+ struct bpf_iter__task_file ctx;
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, in_stop);
+ if (!prog)
+ return 0;
+
+ ctx.meta = &meta;
+ ctx.task = info->task;
+ ctx.fd = info->fd;
+ ctx.file = file;
+ return bpf_iter_run_prog(prog, &ctx);
+}
+
+static int task_file_seq_show(struct seq_file *seq, void *v)
+{
+ return __task_file_seq_show(seq, v, false);
+}
+
+static void task_file_seq_stop(struct seq_file *seq, void *v)
+{
+ struct bpf_iter_seq_task_file_info *info = seq->private;
+
+ if (!v) {
+ (void)__task_file_seq_show(seq, v, true);
+ } else {
+ fput((struct file *)v);
+ put_task_struct(info->task);
+ info->task = NULL;
+ }
+}
+
+static int init_seq_pidns(void *priv_data, struct bpf_iter_aux_info *aux)
+{
+ struct bpf_iter_seq_task_common *common = priv_data;
+
+ common->ns = get_pid_ns(task_active_pid_ns(current));
+ common->type = aux->task.type;
+ common->pid = aux->task.pid;
+
+ return 0;
+}
+
+static void fini_seq_pidns(void *priv_data)
+{
+ struct bpf_iter_seq_task_common *common = priv_data;
+
+ put_pid_ns(common->ns);
+}
+
+static const struct seq_operations task_file_seq_ops = {
+ .start = task_file_seq_start,
+ .next = task_file_seq_next,
+ .stop = task_file_seq_stop,
+ .show = task_file_seq_show,
+};
+
+struct bpf_iter_seq_task_vma_info {
+ /* The first field must be struct bpf_iter_seq_task_common.
+ * this is assumed by {init, fini}_seq_pidns() callback functions.
+ */
+ struct bpf_iter_seq_task_common common;
+ struct task_struct *task;
+ struct mm_struct *mm;
+ struct vm_area_struct *vma;
+ u32 tid;
+ unsigned long prev_vm_start;
+ unsigned long prev_vm_end;
+};
+
+enum bpf_task_vma_iter_find_op {
+ task_vma_iter_first_vma, /* use find_vma() with addr 0 */
+ task_vma_iter_next_vma, /* use vma_next() with curr_vma */
+ task_vma_iter_find_vma, /* use find_vma() to find next vma */
+};
+
+static struct vm_area_struct *
+task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
+{
+ enum bpf_task_vma_iter_find_op op;
+ struct vm_area_struct *curr_vma;
+ struct task_struct *curr_task;
+ struct mm_struct *curr_mm;
+ u32 saved_tid = info->tid;
+
+ /* If this function returns a non-NULL vma, it holds a reference to
+ * the task_struct, holds a refcount on mm->mm_users, and holds
+ * read lock on vma->mm->mmap_lock.
+ * If this function returns NULL, it does not hold any reference or
+ * lock.
+ */
+ if (info->task) {
+ curr_task = info->task;
+ curr_vma = info->vma;
+ curr_mm = info->mm;
+ /* In case of lock contention, drop mmap_lock to unblock
+ * the writer.
+ *
+ * After relock, call find(mm, prev_vm_end - 1) to find
+ * new vma to process.
+ *
+ * +------+------+-----------+
+ * | VMA1 | VMA2 | VMA3 |
+ * +------+------+-----------+
+ * | | | |
+ * 4k 8k 16k 400k
+ *
+ * For example, curr_vma == VMA2. Before unlock, we set
+ *
+ * prev_vm_start = 8k
+ * prev_vm_end = 16k
+ *
+ * There are a few cases:
+ *
+ * 1) VMA2 is freed, but VMA3 exists.
+ *
+ * find_vma() will return VMA3, just process VMA3.
+ *
+ * 2) VMA2 still exists.
+ *
+ * find_vma() will return VMA2, process VMA2->next.
+ *
+ * 3) no more vma in this mm.
+ *
+ * Process the next task.
+ *
+ * 4) find_vma() returns a different vma, VMA2'.
+ *
+ * 4.1) If VMA2 covers same range as VMA2', skip VMA2',
+ * because we already covered the range;
+ * 4.2) VMA2 and VMA2' covers different ranges, process
+ * VMA2'.
+ */
+ if (mmap_lock_is_contended(curr_mm)) {
+ info->prev_vm_start = curr_vma->vm_start;
+ info->prev_vm_end = curr_vma->vm_end;
+ op = task_vma_iter_find_vma;
+ mmap_read_unlock(curr_mm);
+ if (mmap_read_lock_killable(curr_mm)) {
+ mmput(curr_mm);
+ goto finish;
+ }
+ } else {
+ op = task_vma_iter_next_vma;
+ }
+ } else {
+again:
+ curr_task = task_seq_get_next(&info->common, &info->tid, true);
+ if (!curr_task) {
+ info->tid++;
+ goto finish;
+ }
+
+ if (saved_tid != info->tid) {
+ /* new task, process the first vma */
+ op = task_vma_iter_first_vma;
+ } else {
+ /* Found the same tid, which means the user space
+ * finished data in previous buffer and read more.
+ * We dropped mmap_lock before returning to user
+ * space, so it is necessary to use find_vma() to
+ * find the next vma to process.
+ */
+ op = task_vma_iter_find_vma;
+ }
+
+ curr_mm = get_task_mm(curr_task);
+ if (!curr_mm)
+ goto next_task;
+
+ if (mmap_read_lock_killable(curr_mm)) {
+ mmput(curr_mm);
+ goto finish;
+ }
+ }
+
+ switch (op) {
+ case task_vma_iter_first_vma:
+ curr_vma = find_vma(curr_mm, 0);
+ break;
+ case task_vma_iter_next_vma:
+ curr_vma = find_vma(curr_mm, curr_vma->vm_end);
+ break;
+ case task_vma_iter_find_vma:
+ /* We dropped mmap_lock so it is necessary to use find_vma
+ * to find the next vma. This is similar to the mechanism
+ * in show_smaps_rollup().
+ */
+ curr_vma = find_vma(curr_mm, info->prev_vm_end - 1);
+ /* case 1) and 4.2) above just use curr_vma */
+
+ /* check for case 2) or case 4.1) above */
+ if (curr_vma &&
+ curr_vma->vm_start == info->prev_vm_start &&
+ curr_vma->vm_end == info->prev_vm_end)
+ curr_vma = find_vma(curr_mm, curr_vma->vm_end);
+ break;
+ }
+ if (!curr_vma) {
+ /* case 3) above, or case 2) 4.1) with vma->next == NULL */
+ mmap_read_unlock(curr_mm);
+ mmput(curr_mm);
+ goto next_task;
+ }
+ info->task = curr_task;
+ info->vma = curr_vma;
+ info->mm = curr_mm;
+ return curr_vma;
+
+next_task:
+ if (info->common.type == BPF_TASK_ITER_TID)
+ goto finish;
+
+ put_task_struct(curr_task);
+ info->task = NULL;
+ info->mm = NULL;
+ info->tid++;
+ goto again;
+
+finish:
+ if (curr_task)
+ put_task_struct(curr_task);
+ info->task = NULL;
+ info->vma = NULL;
+ info->mm = NULL;
+ return NULL;
+}
+
+static void *task_vma_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct bpf_iter_seq_task_vma_info *info = seq->private;
+ struct vm_area_struct *vma;
+
+ vma = task_vma_seq_get_next(info);
+ if (vma && *pos == 0)
+ ++*pos;
+
+ return vma;
+}
+
+static void *task_vma_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct bpf_iter_seq_task_vma_info *info = seq->private;
+
+ ++*pos;
+ return task_vma_seq_get_next(info);
+}
+
+struct bpf_iter__task_vma {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct task_struct *, task);
+ __bpf_md_ptr(struct vm_area_struct *, vma);
+};
+
+DEFINE_BPF_ITER_FUNC(task_vma, struct bpf_iter_meta *meta,
+ struct task_struct *task, struct vm_area_struct *vma)
+
+static int __task_vma_seq_show(struct seq_file *seq, bool in_stop)
+{
+ struct bpf_iter_seq_task_vma_info *info = seq->private;
+ struct bpf_iter__task_vma ctx;
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, in_stop);
+ if (!prog)
+ return 0;
+
+ ctx.meta = &meta;
+ ctx.task = info->task;
+ ctx.vma = info->vma;
+ return bpf_iter_run_prog(prog, &ctx);
+}
+
+static int task_vma_seq_show(struct seq_file *seq, void *v)
+{
+ return __task_vma_seq_show(seq, false);
+}
+
+static void task_vma_seq_stop(struct seq_file *seq, void *v)
+{
+ struct bpf_iter_seq_task_vma_info *info = seq->private;
+
+ if (!v) {
+ (void)__task_vma_seq_show(seq, true);
+ } else {
+ /* info->vma has not been seen by the BPF program. If the
+ * user space reads more, task_vma_seq_get_next should
+ * return this vma again. Set prev_vm_start to ~0UL,
+ * so that we don't skip the vma returned by the next
+ * find_vma() (case task_vma_iter_find_vma in
+ * task_vma_seq_get_next()).
+ */
+ info->prev_vm_start = ~0UL;
+ info->prev_vm_end = info->vma->vm_end;
+ mmap_read_unlock(info->mm);
+ mmput(info->mm);
+ info->mm = NULL;
+ put_task_struct(info->task);
+ info->task = NULL;
+ }
+}
+
+static const struct seq_operations task_vma_seq_ops = {
+ .start = task_vma_seq_start,
+ .next = task_vma_seq_next,
+ .stop = task_vma_seq_stop,
+ .show = task_vma_seq_show,
+};
+
+static const struct bpf_iter_seq_info task_seq_info = {
+ .seq_ops = &task_seq_ops,
+ .init_seq_private = init_seq_pidns,
+ .fini_seq_private = fini_seq_pidns,
+ .seq_priv_size = sizeof(struct bpf_iter_seq_task_info),
+};
+
+static int bpf_iter_fill_link_info(const struct bpf_iter_aux_info *aux, struct bpf_link_info *info)
+{
+ switch (aux->task.type) {
+ case BPF_TASK_ITER_TID:
+ info->iter.task.tid = aux->task.pid;
+ break;
+ case BPF_TASK_ITER_TGID:
+ info->iter.task.pid = aux->task.pid;
+ break;
+ default:
+ break;
+ }
+ return 0;
+}
+
+static void bpf_iter_task_show_fdinfo(const struct bpf_iter_aux_info *aux, struct seq_file *seq)
+{
+ seq_printf(seq, "task_type:\t%s\n", iter_task_type_names[aux->task.type]);
+ if (aux->task.type == BPF_TASK_ITER_TID)
+ seq_printf(seq, "tid:\t%u\n", aux->task.pid);
+ else if (aux->task.type == BPF_TASK_ITER_TGID)
+ seq_printf(seq, "pid:\t%u\n", aux->task.pid);
+}
+
+static struct bpf_iter_reg task_reg_info = {
+ .target = "task",
+ .attach_target = bpf_iter_attach_task,
+ .feature = BPF_ITER_RESCHED,
+ .ctx_arg_info_size = 1,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__task, task),
+ PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
+ },
+ .seq_info = &task_seq_info,
+ .fill_link_info = bpf_iter_fill_link_info,
+ .show_fdinfo = bpf_iter_task_show_fdinfo,
+};
+
+static const struct bpf_iter_seq_info task_file_seq_info = {
+ .seq_ops = &task_file_seq_ops,
+ .init_seq_private = init_seq_pidns,
+ .fini_seq_private = fini_seq_pidns,
+ .seq_priv_size = sizeof(struct bpf_iter_seq_task_file_info),
+};
+
+static struct bpf_iter_reg task_file_reg_info = {
+ .target = "task_file",
+ .attach_target = bpf_iter_attach_task,
+ .feature = BPF_ITER_RESCHED,
+ .ctx_arg_info_size = 2,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__task_file, task),
+ PTR_TO_BTF_ID_OR_NULL },
+ { offsetof(struct bpf_iter__task_file, file),
+ PTR_TO_BTF_ID_OR_NULL },
+ },
+ .seq_info = &task_file_seq_info,
+ .fill_link_info = bpf_iter_fill_link_info,
+ .show_fdinfo = bpf_iter_task_show_fdinfo,
+};
+
+static const struct bpf_iter_seq_info task_vma_seq_info = {
+ .seq_ops = &task_vma_seq_ops,
+ .init_seq_private = init_seq_pidns,
+ .fini_seq_private = fini_seq_pidns,
+ .seq_priv_size = sizeof(struct bpf_iter_seq_task_vma_info),
+};
+
+static struct bpf_iter_reg task_vma_reg_info = {
+ .target = "task_vma",
+ .attach_target = bpf_iter_attach_task,
+ .feature = BPF_ITER_RESCHED,
+ .ctx_arg_info_size = 2,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__task_vma, task),
+ PTR_TO_BTF_ID_OR_NULL },
+ { offsetof(struct bpf_iter__task_vma, vma),
+ PTR_TO_BTF_ID_OR_NULL },
+ },
+ .seq_info = &task_vma_seq_info,
+ .fill_link_info = bpf_iter_fill_link_info,
+ .show_fdinfo = bpf_iter_task_show_fdinfo,
+};
+
+BPF_CALL_5(bpf_find_vma, struct task_struct *, task, u64, start,
+ bpf_callback_t, callback_fn, void *, callback_ctx, u64, flags)
+{
+ struct mmap_unlock_irq_work *work = NULL;
+ struct vm_area_struct *vma;
+ bool irq_work_busy = false;
+ struct mm_struct *mm;
+ int ret = -ENOENT;
+
+ if (flags)
+ return -EINVAL;
+
+ if (!task)
+ return -ENOENT;
+
+ mm = task->mm;
+ if (!mm)
+ return -ENOENT;
+
+ irq_work_busy = bpf_mmap_unlock_get_irq_work(&work);
+
+ if (irq_work_busy || !mmap_read_trylock(mm))
+ return -EBUSY;
+
+ vma = find_vma(mm, start);
+
+ if (vma && vma->vm_start <= start && vma->vm_end > start) {
+ callback_fn((u64)(long)task, (u64)(long)vma,
+ (u64)(long)callback_ctx, 0, 0);
+ ret = 0;
+ }
+ bpf_mmap_unlock_mm(work, mm);
+ return ret;
+}
+
+const struct bpf_func_proto bpf_find_vma_proto = {
+ .func = bpf_find_vma,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_FUNC,
+ .arg4_type = ARG_PTR_TO_STACK_OR_NULL,
+ .arg5_type = ARG_ANYTHING,
+};
+
+struct bpf_iter_task_vma_kern_data {
+ struct task_struct *task;
+ struct mm_struct *mm;
+ struct mmap_unlock_irq_work *work;
+ struct vma_iterator vmi;
+};
+
+struct bpf_iter_task_vma {
+ /* opaque iterator state; having __u64 here allows to preserve correct
+ * alignment requirements in vmlinux.h, generated from BTF
+ */
+ __u64 __opaque[1];
+} __attribute__((aligned(8)));
+
+/* Non-opaque version of bpf_iter_task_vma */
+struct bpf_iter_task_vma_kern {
+ struct bpf_iter_task_vma_kern_data *data;
+} __attribute__((aligned(8)));
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
+ struct task_struct *task, u64 addr)
+{
+ struct bpf_iter_task_vma_kern *kit = (void *)it;
+ bool irq_work_busy = false;
+ int err;
+
+ BUILD_BUG_ON(sizeof(struct bpf_iter_task_vma_kern) != sizeof(struct bpf_iter_task_vma));
+ BUILD_BUG_ON(__alignof__(struct bpf_iter_task_vma_kern) != __alignof__(struct bpf_iter_task_vma));
+
+ /* is_iter_reg_valid_uninit guarantees that kit hasn't been initialized
+ * before, so non-NULL kit->data doesn't point to previously
+ * bpf_mem_alloc'd bpf_iter_task_vma_kern_data
+ */
+ kit->data = bpf_mem_alloc(&bpf_global_ma, sizeof(struct bpf_iter_task_vma_kern_data));
+ if (!kit->data)
+ return -ENOMEM;
+
+ kit->data->task = get_task_struct(task);
+ kit->data->mm = task->mm;
+ if (!kit->data->mm) {
+ err = -ENOENT;
+ goto err_cleanup_iter;
+ }
+
+ /* kit->data->work == NULL is valid after bpf_mmap_unlock_get_irq_work */
+ irq_work_busy = bpf_mmap_unlock_get_irq_work(&kit->data->work);
+ if (irq_work_busy || !mmap_read_trylock(kit->data->mm)) {
+ err = -EBUSY;
+ goto err_cleanup_iter;
+ }
+
+ vma_iter_init(&kit->data->vmi, kit->data->mm, addr);
+ return 0;
+
+err_cleanup_iter:
+ if (kit->data->task)
+ put_task_struct(kit->data->task);
+ bpf_mem_free(&bpf_global_ma, kit->data);
+ /* NULL kit->data signals failed bpf_iter_task_vma initialization */
+ kit->data = NULL;
+ return err;
+}
+
+__bpf_kfunc struct vm_area_struct *bpf_iter_task_vma_next(struct bpf_iter_task_vma *it)
+{
+ struct bpf_iter_task_vma_kern *kit = (void *)it;
+
+ if (!kit->data) /* bpf_iter_task_vma_new failed */
+ return NULL;
+ return vma_next(&kit->data->vmi);
+}
+
+__bpf_kfunc void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it)
+{
+ struct bpf_iter_task_vma_kern *kit = (void *)it;
+
+ if (kit->data) {
+ bpf_mmap_unlock_mm(kit->data->work, kit->data->mm);
+ put_task_struct(kit->data->task);
+ bpf_mem_free(&bpf_global_ma, kit->data);
+ }
+}
+
+__bpf_kfunc_end_defs();
+
+#ifdef CONFIG_CGROUPS
+
+struct bpf_iter_css_task {
+ __u64 __opaque[1];
+} __attribute__((aligned(8)));
+
+struct bpf_iter_css_task_kern {
+ struct css_task_iter *css_it;
+} __attribute__((aligned(8)));
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc int bpf_iter_css_task_new(struct bpf_iter_css_task *it,
+ struct cgroup_subsys_state *css, unsigned int flags)
+{
+ struct bpf_iter_css_task_kern *kit = (void *)it;
+
+ BUILD_BUG_ON(sizeof(struct bpf_iter_css_task_kern) != sizeof(struct bpf_iter_css_task));
+ BUILD_BUG_ON(__alignof__(struct bpf_iter_css_task_kern) !=
+ __alignof__(struct bpf_iter_css_task));
+ kit->css_it = NULL;
+ switch (flags) {
+ case CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED:
+ case CSS_TASK_ITER_PROCS:
+ case 0:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ kit->css_it = bpf_mem_alloc(&bpf_global_ma, sizeof(struct css_task_iter));
+ if (!kit->css_it)
+ return -ENOMEM;
+ css_task_iter_start(css, flags, kit->css_it);
+ return 0;
+}
+
+__bpf_kfunc struct task_struct *bpf_iter_css_task_next(struct bpf_iter_css_task *it)
+{
+ struct bpf_iter_css_task_kern *kit = (void *)it;
+
+ if (!kit->css_it)
+ return NULL;
+ return css_task_iter_next(kit->css_it);
+}
+
+__bpf_kfunc void bpf_iter_css_task_destroy(struct bpf_iter_css_task *it)
+{
+ struct bpf_iter_css_task_kern *kit = (void *)it;
+
+ if (!kit->css_it)
+ return;
+ css_task_iter_end(kit->css_it);
+ bpf_mem_free(&bpf_global_ma, kit->css_it);
+}
+
+__bpf_kfunc_end_defs();
+
+#endif /* CONFIG_CGROUPS */
+
+struct bpf_iter_task {
+ __u64 __opaque[3];
+} __attribute__((aligned(8)));
+
+struct bpf_iter_task_kern {
+ struct task_struct *task;
+ struct task_struct *pos;
+ unsigned int flags;
+} __attribute__((aligned(8)));
+
+enum {
+ /* all process in the system */
+ BPF_TASK_ITER_ALL_PROCS,
+ /* all threads in the system */
+ BPF_TASK_ITER_ALL_THREADS,
+ /* all threads of a specific process */
+ BPF_TASK_ITER_PROC_THREADS
+};
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc int bpf_iter_task_new(struct bpf_iter_task *it,
+ struct task_struct *task__nullable, unsigned int flags)
+{
+ struct bpf_iter_task_kern *kit = (void *)it;
+
+ BUILD_BUG_ON(sizeof(struct bpf_iter_task_kern) > sizeof(struct bpf_iter_task));
+ BUILD_BUG_ON(__alignof__(struct bpf_iter_task_kern) !=
+ __alignof__(struct bpf_iter_task));
+
+ kit->pos = NULL;
+
+ switch (flags) {
+ case BPF_TASK_ITER_ALL_THREADS:
+ case BPF_TASK_ITER_ALL_PROCS:
+ break;
+ case BPF_TASK_ITER_PROC_THREADS:
+ if (!task__nullable)
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (flags == BPF_TASK_ITER_PROC_THREADS)
+ kit->task = task__nullable;
+ else
+ kit->task = &init_task;
+ kit->pos = kit->task;
+ kit->flags = flags;
+ return 0;
+}
+
+__bpf_kfunc struct task_struct *bpf_iter_task_next(struct bpf_iter_task *it)
+{
+ struct bpf_iter_task_kern *kit = (void *)it;
+ struct task_struct *pos;
+ unsigned int flags;
+
+ flags = kit->flags;
+ pos = kit->pos;
+
+ if (!pos)
+ return pos;
+
+ if (flags == BPF_TASK_ITER_ALL_PROCS)
+ goto get_next_task;
+
+ kit->pos = __next_thread(kit->pos);
+ if (kit->pos || flags == BPF_TASK_ITER_PROC_THREADS)
+ return pos;
+
+get_next_task:
+ kit->task = next_task(kit->task);
+ if (kit->task == &init_task)
+ kit->pos = NULL;
+ else
+ kit->pos = kit->task;
+
+ return pos;
+}
+
+__bpf_kfunc void bpf_iter_task_destroy(struct bpf_iter_task *it)
+{
+}
+
+__bpf_kfunc_end_defs();
+
+DEFINE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work);
+
+static void do_mmap_read_unlock(struct irq_work *entry)
+{
+ struct mmap_unlock_irq_work *work;
+
+ if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT)))
+ return;
+
+ work = container_of(entry, struct mmap_unlock_irq_work, irq_work);
+ mmap_read_unlock_non_owner(work->mm);
+}
+
+static int __init task_iter_init(void)
+{
+ struct mmap_unlock_irq_work *work;
+ int ret, cpu;
+
+ for_each_possible_cpu(cpu) {
+ work = per_cpu_ptr(&mmap_unlock_work, cpu);
+ init_irq_work(&work->irq_work, do_mmap_read_unlock);
+ }
+
+ task_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
+ ret = bpf_iter_reg_target(&task_reg_info);
+ if (ret)
+ return ret;
+
+ task_file_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
+ task_file_reg_info.ctx_arg_info[1].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_FILE];
+ ret = bpf_iter_reg_target(&task_file_reg_info);
+ if (ret)
+ return ret;
+
+ task_vma_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
+ task_vma_reg_info.ctx_arg_info[1].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_VMA];
+ return bpf_iter_reg_target(&task_vma_reg_info);
+}
+late_initcall(task_iter_init);
diff --git a/kernel/bpf/tcx.c b/kernel/bpf/tcx.c
new file mode 100644
index 000000000000..efd987ea6872
--- /dev/null
+++ b/kernel/bpf/tcx.c
@@ -0,0 +1,346 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2023 Isovalent */
+
+#include <linux/bpf.h>
+#include <linux/bpf_mprog.h>
+#include <linux/netdevice.h>
+
+#include <net/tcx.h>
+
+int tcx_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ bool created, ingress = attr->attach_type == BPF_TCX_INGRESS;
+ struct net *net = current->nsproxy->net_ns;
+ struct bpf_mprog_entry *entry, *entry_new;
+ struct bpf_prog *replace_prog = NULL;
+ struct net_device *dev;
+ int ret;
+
+ rtnl_lock();
+ dev = __dev_get_by_index(net, attr->target_ifindex);
+ if (!dev) {
+ ret = -ENODEV;
+ goto out;
+ }
+ if (attr->attach_flags & BPF_F_REPLACE) {
+ replace_prog = bpf_prog_get_type(attr->replace_bpf_fd,
+ prog->type);
+ if (IS_ERR(replace_prog)) {
+ ret = PTR_ERR(replace_prog);
+ replace_prog = NULL;
+ goto out;
+ }
+ }
+ entry = tcx_entry_fetch_or_create(dev, ingress, &created);
+ if (!entry) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = bpf_mprog_attach(entry, &entry_new, prog, NULL, replace_prog,
+ attr->attach_flags, attr->relative_fd,
+ attr->expected_revision);
+ if (!ret) {
+ if (entry != entry_new) {
+ tcx_entry_update(dev, entry_new, ingress);
+ tcx_entry_sync();
+ tcx_skeys_inc(ingress);
+ }
+ bpf_mprog_commit(entry);
+ } else if (created) {
+ tcx_entry_free(entry);
+ }
+out:
+ if (replace_prog)
+ bpf_prog_put(replace_prog);
+ rtnl_unlock();
+ return ret;
+}
+
+int tcx_prog_detach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ bool ingress = attr->attach_type == BPF_TCX_INGRESS;
+ struct net *net = current->nsproxy->net_ns;
+ struct bpf_mprog_entry *entry, *entry_new;
+ struct net_device *dev;
+ int ret;
+
+ rtnl_lock();
+ dev = __dev_get_by_index(net, attr->target_ifindex);
+ if (!dev) {
+ ret = -ENODEV;
+ goto out;
+ }
+ entry = tcx_entry_fetch(dev, ingress);
+ if (!entry) {
+ ret = -ENOENT;
+ goto out;
+ }
+ ret = bpf_mprog_detach(entry, &entry_new, prog, NULL, attr->attach_flags,
+ attr->relative_fd, attr->expected_revision);
+ if (!ret) {
+ if (!tcx_entry_is_active(entry_new))
+ entry_new = NULL;
+ tcx_entry_update(dev, entry_new, ingress);
+ tcx_entry_sync();
+ tcx_skeys_dec(ingress);
+ bpf_mprog_commit(entry);
+ if (!entry_new)
+ tcx_entry_free(entry);
+ }
+out:
+ rtnl_unlock();
+ return ret;
+}
+
+void tcx_uninstall(struct net_device *dev, bool ingress)
+{
+ struct bpf_mprog_entry *entry, *entry_new = NULL;
+ struct bpf_tuple tuple = {};
+ struct bpf_mprog_fp *fp;
+ struct bpf_mprog_cp *cp;
+ bool active;
+
+ entry = tcx_entry_fetch(dev, ingress);
+ if (!entry)
+ return;
+ active = tcx_entry(entry)->miniq_active;
+ if (active)
+ bpf_mprog_clear_all(entry, &entry_new);
+ tcx_entry_update(dev, entry_new, ingress);
+ tcx_entry_sync();
+ bpf_mprog_foreach_tuple(entry, fp, cp, tuple) {
+ if (tuple.link)
+ tcx_link(tuple.link)->dev = NULL;
+ else
+ bpf_prog_put(tuple.prog);
+ tcx_skeys_dec(ingress);
+ }
+ if (!active)
+ tcx_entry_free(entry);
+}
+
+int tcx_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr)
+{
+ bool ingress = attr->query.attach_type == BPF_TCX_INGRESS;
+ struct net *net = current->nsproxy->net_ns;
+ struct net_device *dev;
+ int ret;
+
+ rtnl_lock();
+ dev = __dev_get_by_index(net, attr->query.target_ifindex);
+ if (!dev) {
+ ret = -ENODEV;
+ goto out;
+ }
+ ret = bpf_mprog_query(attr, uattr, tcx_entry_fetch(dev, ingress));
+out:
+ rtnl_unlock();
+ return ret;
+}
+
+static int tcx_link_prog_attach(struct bpf_link *link, u32 flags, u32 id_or_fd,
+ u64 revision)
+{
+ struct tcx_link *tcx = tcx_link(link);
+ bool created, ingress = link->attach_type == BPF_TCX_INGRESS;
+ struct bpf_mprog_entry *entry, *entry_new;
+ struct net_device *dev = tcx->dev;
+ int ret;
+
+ ASSERT_RTNL();
+ entry = tcx_entry_fetch_or_create(dev, ingress, &created);
+ if (!entry)
+ return -ENOMEM;
+ ret = bpf_mprog_attach(entry, &entry_new, link->prog, link, NULL, flags,
+ id_or_fd, revision);
+ if (!ret) {
+ if (entry != entry_new) {
+ tcx_entry_update(dev, entry_new, ingress);
+ tcx_entry_sync();
+ tcx_skeys_inc(ingress);
+ }
+ bpf_mprog_commit(entry);
+ } else if (created) {
+ tcx_entry_free(entry);
+ }
+ return ret;
+}
+
+static void tcx_link_release(struct bpf_link *link)
+{
+ struct tcx_link *tcx = tcx_link(link);
+ bool ingress = link->attach_type == BPF_TCX_INGRESS;
+ struct bpf_mprog_entry *entry, *entry_new;
+ struct net_device *dev;
+ int ret = 0;
+
+ rtnl_lock();
+ dev = tcx->dev;
+ if (!dev)
+ goto out;
+ entry = tcx_entry_fetch(dev, ingress);
+ if (!entry) {
+ ret = -ENOENT;
+ goto out;
+ }
+ ret = bpf_mprog_detach(entry, &entry_new, link->prog, link, 0, 0, 0);
+ if (!ret) {
+ if (!tcx_entry_is_active(entry_new))
+ entry_new = NULL;
+ tcx_entry_update(dev, entry_new, ingress);
+ tcx_entry_sync();
+ tcx_skeys_dec(ingress);
+ bpf_mprog_commit(entry);
+ if (!entry_new)
+ tcx_entry_free(entry);
+ tcx->dev = NULL;
+ }
+out:
+ WARN_ON_ONCE(ret);
+ rtnl_unlock();
+}
+
+static int tcx_link_update(struct bpf_link *link, struct bpf_prog *nprog,
+ struct bpf_prog *oprog)
+{
+ struct tcx_link *tcx = tcx_link(link);
+ bool ingress = link->attach_type == BPF_TCX_INGRESS;
+ struct bpf_mprog_entry *entry, *entry_new;
+ struct net_device *dev;
+ int ret = 0;
+
+ rtnl_lock();
+ dev = tcx->dev;
+ if (!dev) {
+ ret = -ENOLINK;
+ goto out;
+ }
+ if (oprog && link->prog != oprog) {
+ ret = -EPERM;
+ goto out;
+ }
+ oprog = link->prog;
+ if (oprog == nprog) {
+ bpf_prog_put(nprog);
+ goto out;
+ }
+ entry = tcx_entry_fetch(dev, ingress);
+ if (!entry) {
+ ret = -ENOENT;
+ goto out;
+ }
+ ret = bpf_mprog_attach(entry, &entry_new, nprog, link, oprog,
+ BPF_F_REPLACE | BPF_F_ID,
+ link->prog->aux->id, 0);
+ if (!ret) {
+ WARN_ON_ONCE(entry != entry_new);
+ oprog = xchg(&link->prog, nprog);
+ bpf_prog_put(oprog);
+ bpf_mprog_commit(entry);
+ }
+out:
+ rtnl_unlock();
+ return ret;
+}
+
+static void tcx_link_dealloc(struct bpf_link *link)
+{
+ kfree(tcx_link(link));
+}
+
+static void tcx_link_fdinfo(const struct bpf_link *link, struct seq_file *seq)
+{
+ const struct tcx_link *tcx = tcx_link(link);
+ u32 ifindex = 0;
+
+ rtnl_lock();
+ if (tcx->dev)
+ ifindex = tcx->dev->ifindex;
+ rtnl_unlock();
+
+ seq_printf(seq, "ifindex:\t%u\n", ifindex);
+ seq_printf(seq, "attach_type:\t%u (%s)\n",
+ link->attach_type,
+ link->attach_type == BPF_TCX_INGRESS ? "ingress" : "egress");
+}
+
+static int tcx_link_fill_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ const struct tcx_link *tcx = tcx_link(link);
+ u32 ifindex = 0;
+
+ rtnl_lock();
+ if (tcx->dev)
+ ifindex = tcx->dev->ifindex;
+ rtnl_unlock();
+
+ info->tcx.ifindex = ifindex;
+ info->tcx.attach_type = link->attach_type;
+ return 0;
+}
+
+static int tcx_link_detach(struct bpf_link *link)
+{
+ tcx_link_release(link);
+ return 0;
+}
+
+static const struct bpf_link_ops tcx_link_lops = {
+ .release = tcx_link_release,
+ .detach = tcx_link_detach,
+ .dealloc = tcx_link_dealloc,
+ .update_prog = tcx_link_update,
+ .show_fdinfo = tcx_link_fdinfo,
+ .fill_link_info = tcx_link_fill_info,
+};
+
+static int tcx_link_init(struct tcx_link *tcx,
+ struct bpf_link_primer *link_primer,
+ const union bpf_attr *attr,
+ struct net_device *dev,
+ struct bpf_prog *prog)
+{
+ bpf_link_init(&tcx->link, BPF_LINK_TYPE_TCX, &tcx_link_lops, prog,
+ attr->link_create.attach_type);
+ tcx->dev = dev;
+ return bpf_link_prime(&tcx->link, link_primer);
+}
+
+int tcx_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ struct net *net = current->nsproxy->net_ns;
+ struct bpf_link_primer link_primer;
+ struct net_device *dev;
+ struct tcx_link *tcx;
+ int ret;
+
+ rtnl_lock();
+ dev = __dev_get_by_index(net, attr->link_create.target_ifindex);
+ if (!dev) {
+ ret = -ENODEV;
+ goto out;
+ }
+ tcx = kzalloc(sizeof(*tcx), GFP_USER);
+ if (!tcx) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = tcx_link_init(tcx, &link_primer, attr, dev, prog);
+ if (ret) {
+ kfree(tcx);
+ goto out;
+ }
+ ret = tcx_link_prog_attach(&tcx->link, attr->link_create.flags,
+ attr->link_create.tcx.relative_fd,
+ attr->link_create.tcx.expected_revision);
+ if (ret) {
+ tcx->dev = NULL;
+ bpf_link_cleanup(&link_primer);
+ goto out;
+ }
+ ret = bpf_link_settle(&link_primer);
+out:
+ rtnl_unlock();
+ return ret;
+}
diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c
new file mode 100644
index 000000000000..f8e70e9c3998
--- /dev/null
+++ b/kernel/bpf/tnum.c
@@ -0,0 +1,255 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* tnum: tracked (or tristate) numbers
+ *
+ * A tnum tracks knowledge about the bits of a value. Each bit can be either
+ * known (0 or 1), or unknown (x). Arithmetic operations on tnums will
+ * propagate the unknown bits such that the tnum result represents all the
+ * possible results for possible values of the operands.
+ */
+#include <linux/kernel.h>
+#include <linux/tnum.h>
+
+#define TNUM(_v, _m) (struct tnum){.value = _v, .mask = _m}
+/* A completely unknown value */
+const struct tnum tnum_unknown = { .value = 0, .mask = -1 };
+
+struct tnum tnum_const(u64 value)
+{
+ return TNUM(value, 0);
+}
+
+struct tnum tnum_range(u64 min, u64 max)
+{
+ u64 chi = min ^ max, delta;
+ u8 bits = fls64(chi);
+
+ /* special case, needed because 1ULL << 64 is undefined */
+ if (bits > 63)
+ return tnum_unknown;
+ /* e.g. if chi = 4, bits = 3, delta = (1<<3) - 1 = 7.
+ * if chi = 0, bits = 0, delta = (1<<0) - 1 = 0, so we return
+ * constant min (since min == max).
+ */
+ delta = (1ULL << bits) - 1;
+ return TNUM(min & ~delta, delta);
+}
+
+struct tnum tnum_lshift(struct tnum a, u8 shift)
+{
+ return TNUM(a.value << shift, a.mask << shift);
+}
+
+struct tnum tnum_rshift(struct tnum a, u8 shift)
+{
+ return TNUM(a.value >> shift, a.mask >> shift);
+}
+
+struct tnum tnum_arshift(struct tnum a, u8 min_shift, u8 insn_bitness)
+{
+ /* if a.value is negative, arithmetic shifting by minimum shift
+ * will have larger negative offset compared to more shifting.
+ * If a.value is nonnegative, arithmetic shifting by minimum shift
+ * will have larger positive offset compare to more shifting.
+ */
+ if (insn_bitness == 32)
+ return TNUM((u32)(((s32)a.value) >> min_shift),
+ (u32)(((s32)a.mask) >> min_shift));
+ else
+ return TNUM((s64)a.value >> min_shift,
+ (s64)a.mask >> min_shift);
+}
+
+struct tnum tnum_add(struct tnum a, struct tnum b)
+{
+ u64 sm, sv, sigma, chi, mu;
+
+ sm = a.mask + b.mask;
+ sv = a.value + b.value;
+ sigma = sm + sv;
+ chi = sigma ^ sv;
+ mu = chi | a.mask | b.mask;
+ return TNUM(sv & ~mu, mu);
+}
+
+struct tnum tnum_sub(struct tnum a, struct tnum b)
+{
+ u64 dv, alpha, beta, chi, mu;
+
+ dv = a.value - b.value;
+ alpha = dv + a.mask;
+ beta = dv - b.mask;
+ chi = alpha ^ beta;
+ mu = chi | a.mask | b.mask;
+ return TNUM(dv & ~mu, mu);
+}
+
+struct tnum tnum_neg(struct tnum a)
+{
+ return tnum_sub(TNUM(0, 0), a);
+}
+
+struct tnum tnum_and(struct tnum a, struct tnum b)
+{
+ u64 alpha, beta, v;
+
+ alpha = a.value | a.mask;
+ beta = b.value | b.mask;
+ v = a.value & b.value;
+ return TNUM(v, alpha & beta & ~v);
+}
+
+struct tnum tnum_or(struct tnum a, struct tnum b)
+{
+ u64 v, mu;
+
+ v = a.value | b.value;
+ mu = a.mask | b.mask;
+ return TNUM(v, mu & ~v);
+}
+
+struct tnum tnum_xor(struct tnum a, struct tnum b)
+{
+ u64 v, mu;
+
+ v = a.value ^ b.value;
+ mu = a.mask | b.mask;
+ return TNUM(v & ~mu, mu);
+}
+
+/* Perform long multiplication, iterating through the bits in a using rshift:
+ * - if LSB(a) is a known 0, keep current accumulator
+ * - if LSB(a) is a known 1, add b to current accumulator
+ * - if LSB(a) is unknown, take a union of the above cases.
+ *
+ * For example:
+ *
+ * acc_0: acc_1:
+ *
+ * 11 * -> 11 * -> 11 * -> union(0011, 1001) == x0x1
+ * x1 01 11
+ * ------ ------ ------
+ * 11 11 11
+ * xx 00 11
+ * ------ ------ ------
+ * ???? 0011 1001
+ */
+struct tnum tnum_mul(struct tnum a, struct tnum b)
+{
+ struct tnum acc = TNUM(0, 0);
+
+ while (a.value || a.mask) {
+ /* LSB of tnum a is a certain 1 */
+ if (a.value & 1)
+ acc = tnum_add(acc, b);
+ /* LSB of tnum a is uncertain */
+ else if (a.mask & 1) {
+ /* acc = tnum_union(acc_0, acc_1), where acc_0 and
+ * acc_1 are partial accumulators for cases
+ * LSB(a) = certain 0 and LSB(a) = certain 1.
+ * acc_0 = acc + 0 * b = acc.
+ * acc_1 = acc + 1 * b = tnum_add(acc, b).
+ */
+
+ acc = tnum_union(acc, tnum_add(acc, b));
+ }
+ /* Note: no case for LSB is certain 0 */
+ a = tnum_rshift(a, 1);
+ b = tnum_lshift(b, 1);
+ }
+ return acc;
+}
+
+bool tnum_overlap(struct tnum a, struct tnum b)
+{
+ u64 mu;
+
+ mu = ~a.mask & ~b.mask;
+ return (a.value & mu) == (b.value & mu);
+}
+
+/* Note that if a and b disagree - i.e. one has a 'known 1' where the other has
+ * a 'known 0' - this will return a 'known 1' for that bit.
+ */
+struct tnum tnum_intersect(struct tnum a, struct tnum b)
+{
+ u64 v, mu;
+
+ v = a.value | b.value;
+ mu = a.mask & b.mask;
+ return TNUM(v & ~mu, mu);
+}
+
+/* Returns a tnum with the uncertainty from both a and b, and in addition, new
+ * uncertainty at any position that a and b disagree. This represents a
+ * superset of the union of the concrete sets of both a and b. Despite the
+ * overapproximation, it is optimal.
+ */
+struct tnum tnum_union(struct tnum a, struct tnum b)
+{
+ u64 v = a.value & b.value;
+ u64 mu = (a.value ^ b.value) | a.mask | b.mask;
+
+ return TNUM(v & ~mu, mu);
+}
+
+struct tnum tnum_cast(struct tnum a, u8 size)
+{
+ a.value &= (1ULL << (size * 8)) - 1;
+ a.mask &= (1ULL << (size * 8)) - 1;
+ return a;
+}
+
+bool tnum_is_aligned(struct tnum a, u64 size)
+{
+ if (!size)
+ return true;
+ return !((a.value | a.mask) & (size - 1));
+}
+
+bool tnum_in(struct tnum a, struct tnum b)
+{
+ if (b.mask & ~a.mask)
+ return false;
+ b.value &= ~a.mask;
+ return a.value == b.value;
+}
+
+int tnum_sbin(char *str, size_t size, struct tnum a)
+{
+ size_t n;
+
+ for (n = 64; n; n--) {
+ if (n < size) {
+ if (a.mask & 1)
+ str[n - 1] = 'x';
+ else if (a.value & 1)
+ str[n - 1] = '1';
+ else
+ str[n - 1] = '0';
+ }
+ a.mask >>= 1;
+ a.value >>= 1;
+ }
+ str[min(size - 1, (size_t)64)] = 0;
+ return 64;
+}
+
+struct tnum tnum_subreg(struct tnum a)
+{
+ return tnum_cast(a, 4);
+}
+
+struct tnum tnum_clear_subreg(struct tnum a)
+{
+ return tnum_lshift(tnum_rshift(a, 32), 32);
+}
+
+struct tnum tnum_with_subreg(struct tnum reg, struct tnum subreg)
+{
+ return tnum_or(tnum_clear_subreg(reg), tnum_subreg(subreg));
+}
+
+struct tnum tnum_const_subreg(struct tnum a, u32 value)
+{
+ return tnum_with_subreg(a, tnum_const(value));
+}
diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c
new file mode 100644
index 000000000000..feecd8f4dbf9
--- /dev/null
+++ b/kernel/bpf/token.c
@@ -0,0 +1,261 @@
+#include <linux/bpf.h>
+#include <linux/vmalloc.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/idr.h>
+#include <linux/namei.h>
+#include <linux/user_namespace.h>
+#include <linux/security.h>
+
+static bool bpf_ns_capable(struct user_namespace *ns, int cap)
+{
+ return ns_capable(ns, cap) || (cap != CAP_SYS_ADMIN && ns_capable(ns, CAP_SYS_ADMIN));
+}
+
+bool bpf_token_capable(const struct bpf_token *token, int cap)
+{
+ struct user_namespace *userns;
+
+ /* BPF token allows ns_capable() level of capabilities */
+ userns = token ? token->userns : &init_user_ns;
+ if (!bpf_ns_capable(userns, cap))
+ return false;
+ if (token && security_bpf_token_capable(token, cap) < 0)
+ return false;
+ return true;
+}
+
+void bpf_token_inc(struct bpf_token *token)
+{
+ atomic64_inc(&token->refcnt);
+}
+
+static void bpf_token_free(struct bpf_token *token)
+{
+ security_bpf_token_free(token);
+ put_user_ns(token->userns);
+ kfree(token);
+}
+
+static void bpf_token_put_deferred(struct work_struct *work)
+{
+ struct bpf_token *token = container_of(work, struct bpf_token, work);
+
+ bpf_token_free(token);
+}
+
+void bpf_token_put(struct bpf_token *token)
+{
+ if (!token)
+ return;
+
+ if (!atomic64_dec_and_test(&token->refcnt))
+ return;
+
+ INIT_WORK(&token->work, bpf_token_put_deferred);
+ schedule_work(&token->work);
+}
+
+static int bpf_token_release(struct inode *inode, struct file *filp)
+{
+ struct bpf_token *token = filp->private_data;
+
+ bpf_token_put(token);
+ return 0;
+}
+
+static void bpf_token_show_fdinfo(struct seq_file *m, struct file *filp)
+{
+ struct bpf_token *token = filp->private_data;
+ u64 mask;
+
+ BUILD_BUG_ON(__MAX_BPF_CMD >= 64);
+ mask = BIT_ULL(__MAX_BPF_CMD) - 1;
+ if ((token->allowed_cmds & mask) == mask)
+ seq_printf(m, "allowed_cmds:\tany\n");
+ else
+ seq_printf(m, "allowed_cmds:\t0x%llx\n", token->allowed_cmds);
+
+ BUILD_BUG_ON(__MAX_BPF_MAP_TYPE >= 64);
+ mask = BIT_ULL(__MAX_BPF_MAP_TYPE) - 1;
+ if ((token->allowed_maps & mask) == mask)
+ seq_printf(m, "allowed_maps:\tany\n");
+ else
+ seq_printf(m, "allowed_maps:\t0x%llx\n", token->allowed_maps);
+
+ BUILD_BUG_ON(__MAX_BPF_PROG_TYPE >= 64);
+ mask = BIT_ULL(__MAX_BPF_PROG_TYPE) - 1;
+ if ((token->allowed_progs & mask) == mask)
+ seq_printf(m, "allowed_progs:\tany\n");
+ else
+ seq_printf(m, "allowed_progs:\t0x%llx\n", token->allowed_progs);
+
+ BUILD_BUG_ON(__MAX_BPF_ATTACH_TYPE >= 64);
+ mask = BIT_ULL(__MAX_BPF_ATTACH_TYPE) - 1;
+ if ((token->allowed_attachs & mask) == mask)
+ seq_printf(m, "allowed_attachs:\tany\n");
+ else
+ seq_printf(m, "allowed_attachs:\t0x%llx\n", token->allowed_attachs);
+}
+
+#define BPF_TOKEN_INODE_NAME "bpf-token"
+
+static const struct inode_operations bpf_token_iops = { };
+
+const struct file_operations bpf_token_fops = {
+ .release = bpf_token_release,
+ .show_fdinfo = bpf_token_show_fdinfo,
+};
+
+int bpf_token_create(union bpf_attr *attr)
+{
+ struct bpf_token *token __free(kfree) = NULL;
+ struct bpf_mount_opts *mnt_opts;
+ struct user_namespace *userns;
+ struct inode *inode;
+ CLASS(fd, f)(attr->token_create.bpffs_fd);
+ struct path path;
+ struct super_block *sb;
+ umode_t mode;
+ int err;
+
+ if (fd_empty(f))
+ return -EBADF;
+
+ path = fd_file(f)->f_path;
+ sb = path.dentry->d_sb;
+
+ if (path.dentry != sb->s_root)
+ return -EINVAL;
+ if (sb->s_op != &bpf_super_ops)
+ return -EINVAL;
+ err = path_permission(&path, MAY_ACCESS);
+ if (err)
+ return err;
+
+ userns = sb->s_user_ns;
+ /*
+ * Enforce that creators of BPF tokens are in the same user
+ * namespace as the BPF FS instance. This makes reasoning about
+ * permissions a lot easier and we can always relax this later.
+ */
+ if (current_user_ns() != userns)
+ return -EPERM;
+ if (!ns_capable(userns, CAP_BPF))
+ return -EPERM;
+
+ /* Creating BPF token in init_user_ns doesn't make much sense. */
+ if (current_user_ns() == &init_user_ns)
+ return -EOPNOTSUPP;
+
+ mnt_opts = sb->s_fs_info;
+ if (mnt_opts->delegate_cmds == 0 &&
+ mnt_opts->delegate_maps == 0 &&
+ mnt_opts->delegate_progs == 0 &&
+ mnt_opts->delegate_attachs == 0)
+ return -ENOENT; /* no BPF token delegation is set up */
+
+ mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask());
+ inode = bpf_get_inode(sb, NULL, mode);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ inode->i_op = &bpf_token_iops;
+ inode->i_fop = &bpf_token_fops;
+ clear_nlink(inode); /* make sure it is unlinked */
+
+ FD_PREPARE(fdf, O_CLOEXEC,
+ alloc_file_pseudo(inode, path.mnt, BPF_TOKEN_INODE_NAME,
+ O_RDWR, &bpf_token_fops));
+ if (fdf.err)
+ return fdf.err;
+
+ token = kzalloc(sizeof(*token), GFP_USER);
+ if (!token)
+ return -ENOMEM;
+
+ atomic64_set(&token->refcnt, 1);
+
+ /* remember bpffs owning userns for future ns_capable() checks. */
+ token->userns = userns;
+ token->allowed_cmds = mnt_opts->delegate_cmds;
+ token->allowed_maps = mnt_opts->delegate_maps;
+ token->allowed_progs = mnt_opts->delegate_progs;
+ token->allowed_attachs = mnt_opts->delegate_attachs;
+
+ err = security_bpf_token_create(token, attr, &path);
+ if (err)
+ return err;
+
+ get_user_ns(token->userns);
+ fd_prepare_file(fdf)->private_data = no_free_ptr(token);
+ return fd_publish(fdf);
+}
+
+int bpf_token_get_info_by_fd(struct bpf_token *token,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ struct bpf_token_info __user *uinfo = u64_to_user_ptr(attr->info.info);
+ struct bpf_token_info info;
+ u32 info_len = attr->info.info_len;
+
+ info_len = min_t(u32, info_len, sizeof(info));
+ memset(&info, 0, sizeof(info));
+
+ info.allowed_cmds = token->allowed_cmds;
+ info.allowed_maps = token->allowed_maps;
+ info.allowed_progs = token->allowed_progs;
+ info.allowed_attachs = token->allowed_attachs;
+
+ if (copy_to_user(uinfo, &info, info_len) ||
+ put_user(info_len, &uattr->info.info_len))
+ return -EFAULT;
+
+ return 0;
+}
+
+struct bpf_token *bpf_token_get_from_fd(u32 ufd)
+{
+ CLASS(fd, f)(ufd);
+ struct bpf_token *token;
+
+ if (fd_empty(f))
+ return ERR_PTR(-EBADF);
+ if (fd_file(f)->f_op != &bpf_token_fops)
+ return ERR_PTR(-EINVAL);
+
+ token = fd_file(f)->private_data;
+ bpf_token_inc(token);
+
+ return token;
+}
+
+bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd)
+{
+ if (!token)
+ return false;
+ if (!(token->allowed_cmds & BIT_ULL(cmd)))
+ return false;
+ return security_bpf_token_cmd(token, cmd) == 0;
+}
+
+bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type type)
+{
+ if (!token || type >= __MAX_BPF_MAP_TYPE)
+ return false;
+
+ return token->allowed_maps & BIT_ULL(type);
+}
+
+bool bpf_token_allow_prog_type(const struct bpf_token *token,
+ enum bpf_prog_type prog_type,
+ enum bpf_attach_type attach_type)
+{
+ if (!token || prog_type >= __MAX_BPF_PROG_TYPE || attach_type >= __MAX_BPF_ATTACH_TYPE)
+ return false;
+
+ return (token->allowed_progs & BIT_ULL(prog_type)) &&
+ (token->allowed_attachs & BIT_ULL(attach_type));
+}
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
new file mode 100644
index 000000000000..976d89011b15
--- /dev/null
+++ b/kernel/bpf/trampoline.c
@@ -0,0 +1,1185 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2019 Facebook */
+#include <linux/hash.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/ftrace.h>
+#include <linux/rbtree_latch.h>
+#include <linux/perf_event.h>
+#include <linux/btf.h>
+#include <linux/rcupdate_trace.h>
+#include <linux/rcupdate_wait.h>
+#include <linux/static_call.h>
+#include <linux/bpf_verifier.h>
+#include <linux/bpf_lsm.h>
+#include <linux/delay.h>
+
+/* dummy _ops. The verifier will operate on target program's ops. */
+const struct bpf_verifier_ops bpf_extension_verifier_ops = {
+};
+const struct bpf_prog_ops bpf_extension_prog_ops = {
+};
+
+/* btf_vmlinux has ~22k attachable functions. 1k htab is enough. */
+#define TRAMPOLINE_HASH_BITS 10
+#define TRAMPOLINE_TABLE_SIZE (1 << TRAMPOLINE_HASH_BITS)
+
+static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE];
+
+/* serializes access to trampoline_table */
+static DEFINE_MUTEX(trampoline_mutex);
+
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex);
+
+static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, enum ftrace_ops_cmd cmd)
+{
+ struct bpf_trampoline *tr = ops->private;
+ int ret = 0;
+
+ if (cmd == FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF) {
+ /* This is called inside register_ftrace_direct_multi(), so
+ * tr->mutex is already locked.
+ */
+ lockdep_assert_held_once(&tr->mutex);
+
+ /* Instead of updating the trampoline here, we propagate
+ * -EAGAIN to register_ftrace_direct(). Then we can
+ * retry register_ftrace_direct() after updating the
+ * trampoline.
+ */
+ if ((tr->flags & BPF_TRAMP_F_CALL_ORIG) &&
+ !(tr->flags & BPF_TRAMP_F_ORIG_STACK)) {
+ if (WARN_ON_ONCE(tr->flags & BPF_TRAMP_F_SHARE_IPMODIFY))
+ return -EBUSY;
+
+ tr->flags |= BPF_TRAMP_F_SHARE_IPMODIFY;
+ return -EAGAIN;
+ }
+
+ return 0;
+ }
+
+ /* The normal locking order is
+ * tr->mutex => direct_mutex (ftrace.c) => ftrace_lock (ftrace.c)
+ *
+ * The following two commands are called from
+ *
+ * prepare_direct_functions_for_ipmodify
+ * cleanup_direct_functions_after_ipmodify
+ *
+ * In both cases, direct_mutex is already locked. Use
+ * mutex_trylock(&tr->mutex) to avoid deadlock in race condition
+ * (something else is making changes to this same trampoline).
+ */
+ if (!mutex_trylock(&tr->mutex)) {
+ /* sleep 1 ms to make sure whatever holding tr->mutex makes
+ * some progress.
+ */
+ msleep(1);
+ return -EAGAIN;
+ }
+
+ switch (cmd) {
+ case FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_PEER:
+ tr->flags |= BPF_TRAMP_F_SHARE_IPMODIFY;
+
+ if ((tr->flags & BPF_TRAMP_F_CALL_ORIG) &&
+ !(tr->flags & BPF_TRAMP_F_ORIG_STACK))
+ ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */);
+ break;
+ case FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY_PEER:
+ tr->flags &= ~BPF_TRAMP_F_SHARE_IPMODIFY;
+
+ if (tr->flags & BPF_TRAMP_F_ORIG_STACK)
+ ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ mutex_unlock(&tr->mutex);
+ return ret;
+}
+#endif
+
+bool bpf_prog_has_trampoline(const struct bpf_prog *prog)
+{
+ enum bpf_attach_type eatype = prog->expected_attach_type;
+ enum bpf_prog_type ptype = prog->type;
+
+ return (ptype == BPF_PROG_TYPE_TRACING &&
+ (eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT ||
+ eatype == BPF_MODIFY_RETURN)) ||
+ (ptype == BPF_PROG_TYPE_LSM && eatype == BPF_LSM_MAC);
+}
+
+void bpf_image_ksym_init(void *data, unsigned int size, struct bpf_ksym *ksym)
+{
+ ksym->start = (unsigned long) data;
+ ksym->end = ksym->start + size;
+}
+
+void bpf_image_ksym_add(struct bpf_ksym *ksym)
+{
+ bpf_ksym_add(ksym);
+ perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, ksym->start,
+ PAGE_SIZE, false, ksym->name);
+}
+
+void bpf_image_ksym_del(struct bpf_ksym *ksym)
+{
+ bpf_ksym_del(ksym);
+ perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, ksym->start,
+ PAGE_SIZE, true, ksym->name);
+}
+
+static struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
+{
+ struct bpf_trampoline *tr;
+ struct hlist_head *head;
+ int i;
+
+ mutex_lock(&trampoline_mutex);
+ head = &trampoline_table[hash_64(key, TRAMPOLINE_HASH_BITS)];
+ hlist_for_each_entry(tr, head, hlist) {
+ if (tr->key == key) {
+ refcount_inc(&tr->refcnt);
+ goto out;
+ }
+ }
+ tr = kzalloc(sizeof(*tr), GFP_KERNEL);
+ if (!tr)
+ goto out;
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+ tr->fops = kzalloc(sizeof(struct ftrace_ops), GFP_KERNEL);
+ if (!tr->fops) {
+ kfree(tr);
+ tr = NULL;
+ goto out;
+ }
+ tr->fops->private = tr;
+ tr->fops->ops_func = bpf_tramp_ftrace_ops_func;
+#endif
+
+ tr->key = key;
+ INIT_HLIST_NODE(&tr->hlist);
+ hlist_add_head(&tr->hlist, head);
+ refcount_set(&tr->refcnt, 1);
+ mutex_init(&tr->mutex);
+ for (i = 0; i < BPF_TRAMP_MAX; i++)
+ INIT_HLIST_HEAD(&tr->progs_hlist[i]);
+out:
+ mutex_unlock(&trampoline_mutex);
+ return tr;
+}
+
+static int bpf_trampoline_update_fentry(struct bpf_trampoline *tr, u32 orig_flags,
+ void *old_addr, void *new_addr)
+{
+ enum bpf_text_poke_type new_t = BPF_MOD_CALL, old_t = BPF_MOD_CALL;
+ void *ip = tr->func.addr;
+
+ if (!new_addr)
+ new_t = BPF_MOD_NOP;
+ else if (bpf_trampoline_use_jmp(tr->flags))
+ new_t = BPF_MOD_JUMP;
+
+ if (!old_addr)
+ old_t = BPF_MOD_NOP;
+ else if (bpf_trampoline_use_jmp(orig_flags))
+ old_t = BPF_MOD_JUMP;
+
+ return bpf_arch_text_poke(ip, old_t, new_t, old_addr, new_addr);
+}
+
+static int unregister_fentry(struct bpf_trampoline *tr, u32 orig_flags,
+ void *old_addr)
+{
+ int ret;
+
+ if (tr->func.ftrace_managed)
+ ret = unregister_ftrace_direct(tr->fops, (long)old_addr, false);
+ else
+ ret = bpf_trampoline_update_fentry(tr, orig_flags, old_addr, NULL);
+
+ return ret;
+}
+
+static int modify_fentry(struct bpf_trampoline *tr, u32 orig_flags,
+ void *old_addr, void *new_addr,
+ bool lock_direct_mutex)
+{
+ int ret;
+
+ if (tr->func.ftrace_managed) {
+ if (lock_direct_mutex)
+ ret = modify_ftrace_direct(tr->fops, (long)new_addr);
+ else
+ ret = modify_ftrace_direct_nolock(tr->fops, (long)new_addr);
+ } else {
+ ret = bpf_trampoline_update_fentry(tr, orig_flags, old_addr,
+ new_addr);
+ }
+ return ret;
+}
+
+/* first time registering */
+static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
+{
+ void *ip = tr->func.addr;
+ unsigned long faddr;
+ int ret;
+
+ faddr = ftrace_location((unsigned long)ip);
+ if (faddr) {
+ if (!tr->fops)
+ return -ENOTSUPP;
+ tr->func.ftrace_managed = true;
+ }
+
+ if (tr->func.ftrace_managed) {
+ ret = ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 1);
+ if (ret)
+ return ret;
+ ret = register_ftrace_direct(tr->fops, (long)new_addr);
+ } else {
+ ret = bpf_trampoline_update_fentry(tr, 0, NULL, new_addr);
+ }
+
+ return ret;
+}
+
+static struct bpf_tramp_links *
+bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total, bool *ip_arg)
+{
+ struct bpf_tramp_link *link;
+ struct bpf_tramp_links *tlinks;
+ struct bpf_tramp_link **links;
+ int kind;
+
+ *total = 0;
+ tlinks = kcalloc(BPF_TRAMP_MAX, sizeof(*tlinks), GFP_KERNEL);
+ if (!tlinks)
+ return ERR_PTR(-ENOMEM);
+
+ for (kind = 0; kind < BPF_TRAMP_MAX; kind++) {
+ tlinks[kind].nr_links = tr->progs_cnt[kind];
+ *total += tr->progs_cnt[kind];
+ links = tlinks[kind].links;
+
+ hlist_for_each_entry(link, &tr->progs_hlist[kind], tramp_hlist) {
+ *ip_arg |= link->link.prog->call_get_func_ip;
+ *links++ = link;
+ }
+ }
+ return tlinks;
+}
+
+static void bpf_tramp_image_free(struct bpf_tramp_image *im)
+{
+ bpf_image_ksym_del(&im->ksym);
+ arch_free_bpf_trampoline(im->image, im->size);
+ bpf_jit_uncharge_modmem(im->size);
+ percpu_ref_exit(&im->pcref);
+ kfree_rcu(im, rcu);
+}
+
+static void __bpf_tramp_image_put_deferred(struct work_struct *work)
+{
+ struct bpf_tramp_image *im;
+
+ im = container_of(work, struct bpf_tramp_image, work);
+ bpf_tramp_image_free(im);
+}
+
+/* callback, fexit step 3 or fentry step 2 */
+static void __bpf_tramp_image_put_rcu(struct rcu_head *rcu)
+{
+ struct bpf_tramp_image *im;
+
+ im = container_of(rcu, struct bpf_tramp_image, rcu);
+ INIT_WORK(&im->work, __bpf_tramp_image_put_deferred);
+ schedule_work(&im->work);
+}
+
+/* callback, fexit step 2. Called after percpu_ref_kill confirms. */
+static void __bpf_tramp_image_release(struct percpu_ref *pcref)
+{
+ struct bpf_tramp_image *im;
+
+ im = container_of(pcref, struct bpf_tramp_image, pcref);
+ call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu);
+}
+
+/* callback, fexit or fentry step 1 */
+static void __bpf_tramp_image_put_rcu_tasks(struct rcu_head *rcu)
+{
+ struct bpf_tramp_image *im;
+
+ im = container_of(rcu, struct bpf_tramp_image, rcu);
+ if (im->ip_after_call)
+ /* the case of fmod_ret/fexit trampoline and CONFIG_PREEMPTION=y */
+ percpu_ref_kill(&im->pcref);
+ else
+ /* the case of fentry trampoline */
+ call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu);
+}
+
+static void bpf_tramp_image_put(struct bpf_tramp_image *im)
+{
+ /* The trampoline image that calls original function is using:
+ * rcu_read_lock_trace to protect sleepable bpf progs
+ * rcu_read_lock to protect normal bpf progs
+ * percpu_ref to protect trampoline itself
+ * rcu tasks to protect trampoline asm not covered by percpu_ref
+ * (which are few asm insns before __bpf_tramp_enter and
+ * after __bpf_tramp_exit)
+ *
+ * The trampoline is unreachable before bpf_tramp_image_put().
+ *
+ * First, patch the trampoline to avoid calling into fexit progs.
+ * The progs will be freed even if the original function is still
+ * executing or sleeping.
+ * In case of CONFIG_PREEMPT=y use call_rcu_tasks() to wait on
+ * first few asm instructions to execute and call into
+ * __bpf_tramp_enter->percpu_ref_get.
+ * Then use percpu_ref_kill to wait for the trampoline and the original
+ * function to finish.
+ * Then use call_rcu_tasks() to make sure few asm insns in
+ * the trampoline epilogue are done as well.
+ *
+ * In !PREEMPT case the task that got interrupted in the first asm
+ * insns won't go through an RCU quiescent state which the
+ * percpu_ref_kill will be waiting for. Hence the first
+ * call_rcu_tasks() is not necessary.
+ */
+ if (im->ip_after_call) {
+ int err = bpf_arch_text_poke(im->ip_after_call, BPF_MOD_NOP,
+ BPF_MOD_JUMP, NULL,
+ im->ip_epilogue);
+ WARN_ON(err);
+ if (IS_ENABLED(CONFIG_TASKS_RCU))
+ call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu_tasks);
+ else
+ percpu_ref_kill(&im->pcref);
+ return;
+ }
+
+ /* The trampoline without fexit and fmod_ret progs doesn't call original
+ * function and doesn't use percpu_ref.
+ * Use call_rcu_tasks_trace() to wait for sleepable progs to finish.
+ * Then use call_rcu_tasks() to wait for the rest of trampoline asm
+ * and normal progs.
+ */
+ call_rcu_tasks_trace(&im->rcu, __bpf_tramp_image_put_rcu_tasks);
+}
+
+static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, int size)
+{
+ struct bpf_tramp_image *im;
+ struct bpf_ksym *ksym;
+ void *image;
+ int err = -ENOMEM;
+
+ im = kzalloc(sizeof(*im), GFP_KERNEL);
+ if (!im)
+ goto out;
+
+ err = bpf_jit_charge_modmem(size);
+ if (err)
+ goto out_free_im;
+ im->size = size;
+
+ err = -ENOMEM;
+ im->image = image = arch_alloc_bpf_trampoline(size);
+ if (!image)
+ goto out_uncharge;
+
+ err = percpu_ref_init(&im->pcref, __bpf_tramp_image_release, 0, GFP_KERNEL);
+ if (err)
+ goto out_free_image;
+
+ ksym = &im->ksym;
+ INIT_LIST_HEAD_RCU(&ksym->lnode);
+ snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu", key);
+ bpf_image_ksym_init(image, size, ksym);
+ bpf_image_ksym_add(ksym);
+ return im;
+
+out_free_image:
+ arch_free_bpf_trampoline(im->image, im->size);
+out_uncharge:
+ bpf_jit_uncharge_modmem(size);
+out_free_im:
+ kfree(im);
+out:
+ return ERR_PTR(err);
+}
+
+static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex)
+{
+ struct bpf_tramp_image *im;
+ struct bpf_tramp_links *tlinks;
+ u32 orig_flags = tr->flags;
+ bool ip_arg = false;
+ int err, total, size;
+
+ tlinks = bpf_trampoline_get_progs(tr, &total, &ip_arg);
+ if (IS_ERR(tlinks))
+ return PTR_ERR(tlinks);
+
+ if (total == 0) {
+ err = unregister_fentry(tr, orig_flags, tr->cur_image->image);
+ bpf_tramp_image_put(tr->cur_image);
+ tr->cur_image = NULL;
+ goto out;
+ }
+
+ /* clear all bits except SHARE_IPMODIFY and TAIL_CALL_CTX */
+ tr->flags &= (BPF_TRAMP_F_SHARE_IPMODIFY | BPF_TRAMP_F_TAIL_CALL_CTX);
+
+ if (tlinks[BPF_TRAMP_FEXIT].nr_links ||
+ tlinks[BPF_TRAMP_MODIFY_RETURN].nr_links) {
+ /* NOTE: BPF_TRAMP_F_RESTORE_REGS and BPF_TRAMP_F_SKIP_FRAME
+ * should not be set together.
+ */
+ tr->flags |= BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME;
+ } else {
+ tr->flags |= BPF_TRAMP_F_RESTORE_REGS;
+ }
+
+ if (ip_arg)
+ tr->flags |= BPF_TRAMP_F_IP_ARG;
+
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+again:
+ if (tr->flags & BPF_TRAMP_F_CALL_ORIG) {
+ if (tr->flags & BPF_TRAMP_F_SHARE_IPMODIFY) {
+ /* The BPF_TRAMP_F_SKIP_FRAME can be cleared in the
+ * first try, reset it in the second try.
+ */
+ tr->flags |= BPF_TRAMP_F_ORIG_STACK | BPF_TRAMP_F_SKIP_FRAME;
+ } else if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_JMP)) {
+ /* Use "jmp" instead of "call" for the trampoline
+ * in the origin call case, and we don't need to
+ * skip the frame.
+ */
+ tr->flags &= ~BPF_TRAMP_F_SKIP_FRAME;
+ }
+ }
+#endif
+
+ size = arch_bpf_trampoline_size(&tr->func.model, tr->flags,
+ tlinks, tr->func.addr);
+ if (size < 0) {
+ err = size;
+ goto out;
+ }
+
+ if (size > PAGE_SIZE) {
+ err = -E2BIG;
+ goto out;
+ }
+
+ im = bpf_tramp_image_alloc(tr->key, size);
+ if (IS_ERR(im)) {
+ err = PTR_ERR(im);
+ goto out;
+ }
+
+ err = arch_prepare_bpf_trampoline(im, im->image, im->image + size,
+ &tr->func.model, tr->flags, tlinks,
+ tr->func.addr);
+ if (err < 0)
+ goto out_free;
+
+ err = arch_protect_bpf_trampoline(im->image, im->size);
+ if (err)
+ goto out_free;
+
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_JMP
+ if (bpf_trampoline_use_jmp(tr->flags))
+ tr->fops->flags |= FTRACE_OPS_FL_JMP;
+ else
+ tr->fops->flags &= ~FTRACE_OPS_FL_JMP;
+#endif
+
+ WARN_ON(tr->cur_image && total == 0);
+ if (tr->cur_image)
+ /* progs already running at this address */
+ err = modify_fentry(tr, orig_flags, tr->cur_image->image,
+ im->image, lock_direct_mutex);
+ else
+ /* first time registering */
+ err = register_fentry(tr, im->image);
+
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+ if (err == -EAGAIN) {
+ /* -EAGAIN from bpf_tramp_ftrace_ops_func. Now
+ * BPF_TRAMP_F_SHARE_IPMODIFY is set, we can generate the
+ * trampoline again, and retry register.
+ */
+ bpf_tramp_image_free(im);
+ goto again;
+ }
+#endif
+ if (err)
+ goto out_free;
+
+ if (tr->cur_image)
+ bpf_tramp_image_put(tr->cur_image);
+ tr->cur_image = im;
+out:
+ /* If any error happens, restore previous flags */
+ if (err) {
+ tr->flags = orig_flags;
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_JMP
+ if (bpf_trampoline_use_jmp(tr->flags))
+ tr->fops->flags |= FTRACE_OPS_FL_JMP;
+ else
+ tr->fops->flags &= ~FTRACE_OPS_FL_JMP;
+#endif
+ }
+ kfree(tlinks);
+ return err;
+
+out_free:
+ bpf_tramp_image_free(im);
+ goto out;
+}
+
+static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog)
+{
+ switch (prog->expected_attach_type) {
+ case BPF_TRACE_FENTRY:
+ return BPF_TRAMP_FENTRY;
+ case BPF_MODIFY_RETURN:
+ return BPF_TRAMP_MODIFY_RETURN;
+ case BPF_TRACE_FEXIT:
+ return BPF_TRAMP_FEXIT;
+ case BPF_LSM_MAC:
+ if (!prog->aux->attach_func_proto->type)
+ /* The function returns void, we cannot modify its
+ * return value.
+ */
+ return BPF_TRAMP_FEXIT;
+ else
+ return BPF_TRAMP_MODIFY_RETURN;
+ default:
+ return BPF_TRAMP_REPLACE;
+ }
+}
+
+static int bpf_freplace_check_tgt_prog(struct bpf_prog *tgt_prog)
+{
+ struct bpf_prog_aux *aux = tgt_prog->aux;
+
+ guard(mutex)(&aux->ext_mutex);
+ if (aux->prog_array_member_cnt)
+ /* Program extensions can not extend target prog when the target
+ * prog has been updated to any prog_array map as tail callee.
+ * It's to prevent a potential infinite loop like:
+ * tgt prog entry -> tgt prog subprog -> freplace prog entry
+ * --tailcall-> tgt prog entry.
+ */
+ return -EBUSY;
+
+ aux->is_extended = true;
+ return 0;
+}
+
+static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link,
+ struct bpf_trampoline *tr,
+ struct bpf_prog *tgt_prog)
+{
+ enum bpf_tramp_prog_type kind;
+ struct bpf_tramp_link *link_exiting;
+ int err = 0;
+ int cnt = 0, i;
+
+ kind = bpf_attach_type_to_tramp(link->link.prog);
+ if (tr->extension_prog)
+ /* cannot attach fentry/fexit if extension prog is attached.
+ * cannot overwrite extension prog either.
+ */
+ return -EBUSY;
+
+ for (i = 0; i < BPF_TRAMP_MAX; i++)
+ cnt += tr->progs_cnt[i];
+
+ if (kind == BPF_TRAMP_REPLACE) {
+ /* Cannot attach extension if fentry/fexit are in use. */
+ if (cnt)
+ return -EBUSY;
+ err = bpf_freplace_check_tgt_prog(tgt_prog);
+ if (err)
+ return err;
+ tr->extension_prog = link->link.prog;
+ return bpf_arch_text_poke(tr->func.addr, BPF_MOD_NOP,
+ BPF_MOD_JUMP, NULL,
+ link->link.prog->bpf_func);
+ }
+ if (cnt >= BPF_MAX_TRAMP_LINKS)
+ return -E2BIG;
+ if (!hlist_unhashed(&link->tramp_hlist))
+ /* prog already linked */
+ return -EBUSY;
+ hlist_for_each_entry(link_exiting, &tr->progs_hlist[kind], tramp_hlist) {
+ if (link_exiting->link.prog != link->link.prog)
+ continue;
+ /* prog already linked */
+ return -EBUSY;
+ }
+
+ hlist_add_head(&link->tramp_hlist, &tr->progs_hlist[kind]);
+ tr->progs_cnt[kind]++;
+ err = bpf_trampoline_update(tr, true /* lock_direct_mutex */);
+ if (err) {
+ hlist_del_init(&link->tramp_hlist);
+ tr->progs_cnt[kind]--;
+ }
+ return err;
+}
+
+int bpf_trampoline_link_prog(struct bpf_tramp_link *link,
+ struct bpf_trampoline *tr,
+ struct bpf_prog *tgt_prog)
+{
+ int err;
+
+ mutex_lock(&tr->mutex);
+ err = __bpf_trampoline_link_prog(link, tr, tgt_prog);
+ mutex_unlock(&tr->mutex);
+ return err;
+}
+
+static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link,
+ struct bpf_trampoline *tr,
+ struct bpf_prog *tgt_prog)
+{
+ enum bpf_tramp_prog_type kind;
+ int err;
+
+ kind = bpf_attach_type_to_tramp(link->link.prog);
+ if (kind == BPF_TRAMP_REPLACE) {
+ WARN_ON_ONCE(!tr->extension_prog);
+ err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP,
+ BPF_MOD_NOP,
+ tr->extension_prog->bpf_func, NULL);
+ tr->extension_prog = NULL;
+ guard(mutex)(&tgt_prog->aux->ext_mutex);
+ tgt_prog->aux->is_extended = false;
+ return err;
+ }
+ hlist_del_init(&link->tramp_hlist);
+ tr->progs_cnt[kind]--;
+ return bpf_trampoline_update(tr, true /* lock_direct_mutex */);
+}
+
+/* bpf_trampoline_unlink_prog() should never fail. */
+int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link,
+ struct bpf_trampoline *tr,
+ struct bpf_prog *tgt_prog)
+{
+ int err;
+
+ mutex_lock(&tr->mutex);
+ err = __bpf_trampoline_unlink_prog(link, tr, tgt_prog);
+ mutex_unlock(&tr->mutex);
+ return err;
+}
+
+#if defined(CONFIG_CGROUP_BPF) && defined(CONFIG_BPF_LSM)
+static void bpf_shim_tramp_link_release(struct bpf_link *link)
+{
+ struct bpf_shim_tramp_link *shim_link =
+ container_of(link, struct bpf_shim_tramp_link, link.link);
+
+ /* paired with 'shim_link->trampoline = tr' in bpf_trampoline_link_cgroup_shim */
+ if (!shim_link->trampoline)
+ return;
+
+ WARN_ON_ONCE(bpf_trampoline_unlink_prog(&shim_link->link, shim_link->trampoline, NULL));
+ bpf_trampoline_put(shim_link->trampoline);
+}
+
+static void bpf_shim_tramp_link_dealloc(struct bpf_link *link)
+{
+ struct bpf_shim_tramp_link *shim_link =
+ container_of(link, struct bpf_shim_tramp_link, link.link);
+
+ kfree(shim_link);
+}
+
+static const struct bpf_link_ops bpf_shim_tramp_link_lops = {
+ .release = bpf_shim_tramp_link_release,
+ .dealloc = bpf_shim_tramp_link_dealloc,
+};
+
+static struct bpf_shim_tramp_link *cgroup_shim_alloc(const struct bpf_prog *prog,
+ bpf_func_t bpf_func,
+ int cgroup_atype,
+ enum bpf_attach_type attach_type)
+{
+ struct bpf_shim_tramp_link *shim_link = NULL;
+ struct bpf_prog *p;
+
+ shim_link = kzalloc(sizeof(*shim_link), GFP_USER);
+ if (!shim_link)
+ return NULL;
+
+ p = bpf_prog_alloc(1, 0);
+ if (!p) {
+ kfree(shim_link);
+ return NULL;
+ }
+
+ p->jited = false;
+ p->bpf_func = bpf_func;
+
+ p->aux->cgroup_atype = cgroup_atype;
+ p->aux->attach_func_proto = prog->aux->attach_func_proto;
+ p->aux->attach_btf_id = prog->aux->attach_btf_id;
+ p->aux->attach_btf = prog->aux->attach_btf;
+ btf_get(p->aux->attach_btf);
+ p->type = BPF_PROG_TYPE_LSM;
+ p->expected_attach_type = BPF_LSM_MAC;
+ bpf_prog_inc(p);
+ bpf_link_init(&shim_link->link.link, BPF_LINK_TYPE_UNSPEC,
+ &bpf_shim_tramp_link_lops, p, attach_type);
+ bpf_cgroup_atype_get(p->aux->attach_btf_id, cgroup_atype);
+
+ return shim_link;
+}
+
+static struct bpf_shim_tramp_link *cgroup_shim_find(struct bpf_trampoline *tr,
+ bpf_func_t bpf_func)
+{
+ struct bpf_tramp_link *link;
+ int kind;
+
+ for (kind = 0; kind < BPF_TRAMP_MAX; kind++) {
+ hlist_for_each_entry(link, &tr->progs_hlist[kind], tramp_hlist) {
+ struct bpf_prog *p = link->link.prog;
+
+ if (p->bpf_func == bpf_func)
+ return container_of(link, struct bpf_shim_tramp_link, link);
+ }
+ }
+
+ return NULL;
+}
+
+int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog,
+ int cgroup_atype,
+ enum bpf_attach_type attach_type)
+{
+ struct bpf_shim_tramp_link *shim_link = NULL;
+ struct bpf_attach_target_info tgt_info = {};
+ struct bpf_trampoline *tr;
+ bpf_func_t bpf_func;
+ u64 key;
+ int err;
+
+ err = bpf_check_attach_target(NULL, prog, NULL,
+ prog->aux->attach_btf_id,
+ &tgt_info);
+ if (err)
+ return err;
+
+ key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf,
+ prog->aux->attach_btf_id);
+
+ bpf_lsm_find_cgroup_shim(prog, &bpf_func);
+ tr = bpf_trampoline_get(key, &tgt_info);
+ if (!tr)
+ return -ENOMEM;
+
+ mutex_lock(&tr->mutex);
+
+ shim_link = cgroup_shim_find(tr, bpf_func);
+ if (shim_link) {
+ /* Reusing existing shim attached by the other program. */
+ bpf_link_inc(&shim_link->link.link);
+
+ mutex_unlock(&tr->mutex);
+ bpf_trampoline_put(tr); /* bpf_trampoline_get above */
+ return 0;
+ }
+
+ /* Allocate and install new shim. */
+
+ shim_link = cgroup_shim_alloc(prog, bpf_func, cgroup_atype, attach_type);
+ if (!shim_link) {
+ err = -ENOMEM;
+ goto err;
+ }
+
+ err = __bpf_trampoline_link_prog(&shim_link->link, tr, NULL);
+ if (err)
+ goto err;
+
+ shim_link->trampoline = tr;
+ /* note, we're still holding tr refcnt from above */
+
+ mutex_unlock(&tr->mutex);
+
+ return 0;
+err:
+ mutex_unlock(&tr->mutex);
+
+ if (shim_link)
+ bpf_link_put(&shim_link->link.link);
+
+ /* have to release tr while _not_ holding its mutex */
+ bpf_trampoline_put(tr); /* bpf_trampoline_get above */
+
+ return err;
+}
+
+void bpf_trampoline_unlink_cgroup_shim(struct bpf_prog *prog)
+{
+ struct bpf_shim_tramp_link *shim_link = NULL;
+ struct bpf_trampoline *tr;
+ bpf_func_t bpf_func;
+ u64 key;
+
+ key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf,
+ prog->aux->attach_btf_id);
+
+ bpf_lsm_find_cgroup_shim(prog, &bpf_func);
+ tr = bpf_trampoline_lookup(key);
+ if (WARN_ON_ONCE(!tr))
+ return;
+
+ mutex_lock(&tr->mutex);
+ shim_link = cgroup_shim_find(tr, bpf_func);
+ mutex_unlock(&tr->mutex);
+
+ if (shim_link)
+ bpf_link_put(&shim_link->link.link);
+
+ bpf_trampoline_put(tr); /* bpf_trampoline_lookup above */
+}
+#endif
+
+struct bpf_trampoline *bpf_trampoline_get(u64 key,
+ struct bpf_attach_target_info *tgt_info)
+{
+ struct bpf_trampoline *tr;
+
+ tr = bpf_trampoline_lookup(key);
+ if (!tr)
+ return NULL;
+
+ mutex_lock(&tr->mutex);
+ if (tr->func.addr)
+ goto out;
+
+ memcpy(&tr->func.model, &tgt_info->fmodel, sizeof(tgt_info->fmodel));
+ tr->func.addr = (void *)tgt_info->tgt_addr;
+out:
+ mutex_unlock(&tr->mutex);
+ return tr;
+}
+
+void bpf_trampoline_put(struct bpf_trampoline *tr)
+{
+ int i;
+
+ if (!tr)
+ return;
+ mutex_lock(&trampoline_mutex);
+ if (!refcount_dec_and_test(&tr->refcnt))
+ goto out;
+ WARN_ON_ONCE(mutex_is_locked(&tr->mutex));
+
+ for (i = 0; i < BPF_TRAMP_MAX; i++)
+ if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[i])))
+ goto out;
+
+ /* This code will be executed even when the last bpf_tramp_image
+ * is alive. All progs are detached from the trampoline and the
+ * trampoline image is patched with jmp into epilogue to skip
+ * fexit progs. The fentry-only trampoline will be freed via
+ * multiple rcu callbacks.
+ */
+ hlist_del(&tr->hlist);
+ if (tr->fops) {
+ ftrace_free_filter(tr->fops);
+ kfree(tr->fops);
+ }
+ kfree(tr);
+out:
+ mutex_unlock(&trampoline_mutex);
+}
+
+#define NO_START_TIME 1
+static __always_inline u64 notrace bpf_prog_start_time(void)
+{
+ u64 start = NO_START_TIME;
+
+ if (static_branch_unlikely(&bpf_stats_enabled_key)) {
+ start = sched_clock();
+ if (unlikely(!start))
+ start = NO_START_TIME;
+ }
+ return start;
+}
+
+/* The logic is similar to bpf_prog_run(), but with an explicit
+ * rcu_read_lock() and migrate_disable() which are required
+ * for the trampoline. The macro is split into
+ * call __bpf_prog_enter
+ * call prog->bpf_func
+ * call __bpf_prog_exit
+ *
+ * __bpf_prog_enter returns:
+ * 0 - skip execution of the bpf prog
+ * 1 - execute bpf prog
+ * [2..MAX_U64] - execute bpf prog and record execution time.
+ * This is start time.
+ */
+static u64 notrace __bpf_prog_enter_recur(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx)
+ __acquires(RCU)
+{
+ rcu_read_lock_dont_migrate();
+
+ run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
+
+ if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
+ bpf_prog_inc_misses_counter(prog);
+ if (prog->aux->recursion_detected)
+ prog->aux->recursion_detected(prog);
+ return 0;
+ }
+ return bpf_prog_start_time();
+}
+
+static void notrace __update_prog_stats(struct bpf_prog *prog, u64 start)
+{
+ struct bpf_prog_stats *stats;
+ unsigned long flags;
+ u64 duration;
+
+ /*
+ * static_key could be enabled in __bpf_prog_enter* and disabled in
+ * __bpf_prog_exit*. And vice versa. Check that 'start' is valid.
+ */
+ if (start <= NO_START_TIME)
+ return;
+
+ duration = sched_clock() - start;
+ stats = this_cpu_ptr(prog->stats);
+ flags = u64_stats_update_begin_irqsave(&stats->syncp);
+ u64_stats_inc(&stats->cnt);
+ u64_stats_add(&stats->nsecs, duration);
+ u64_stats_update_end_irqrestore(&stats->syncp, flags);
+}
+
+static __always_inline void notrace update_prog_stats(struct bpf_prog *prog,
+ u64 start)
+{
+ if (static_branch_unlikely(&bpf_stats_enabled_key))
+ __update_prog_stats(prog, start);
+}
+
+static void notrace __bpf_prog_exit_recur(struct bpf_prog *prog, u64 start,
+ struct bpf_tramp_run_ctx *run_ctx)
+ __releases(RCU)
+{
+ bpf_reset_run_ctx(run_ctx->saved_run_ctx);
+
+ update_prog_stats(prog, start);
+ this_cpu_dec(*(prog->active));
+ rcu_read_unlock_migrate();
+}
+
+static u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog,
+ struct bpf_tramp_run_ctx *run_ctx)
+ __acquires(RCU)
+{
+ /* Runtime stats are exported via actual BPF_LSM_CGROUP
+ * programs, not the shims.
+ */
+ rcu_read_lock_dont_migrate();
+
+ run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
+
+ return NO_START_TIME;
+}
+
+static void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start,
+ struct bpf_tramp_run_ctx *run_ctx)
+ __releases(RCU)
+{
+ bpf_reset_run_ctx(run_ctx->saved_run_ctx);
+
+ rcu_read_unlock_migrate();
+}
+
+u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog,
+ struct bpf_tramp_run_ctx *run_ctx)
+{
+ rcu_read_lock_trace();
+ migrate_disable();
+ might_fault();
+
+ run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
+
+ if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
+ bpf_prog_inc_misses_counter(prog);
+ if (prog->aux->recursion_detected)
+ prog->aux->recursion_detected(prog);
+ return 0;
+ }
+ return bpf_prog_start_time();
+}
+
+void notrace __bpf_prog_exit_sleepable_recur(struct bpf_prog *prog, u64 start,
+ struct bpf_tramp_run_ctx *run_ctx)
+{
+ bpf_reset_run_ctx(run_ctx->saved_run_ctx);
+
+ update_prog_stats(prog, start);
+ this_cpu_dec(*(prog->active));
+ migrate_enable();
+ rcu_read_unlock_trace();
+}
+
+static u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog,
+ struct bpf_tramp_run_ctx *run_ctx)
+{
+ rcu_read_lock_trace();
+ migrate_disable();
+ might_fault();
+
+ run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
+
+ return bpf_prog_start_time();
+}
+
+static void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start,
+ struct bpf_tramp_run_ctx *run_ctx)
+{
+ bpf_reset_run_ctx(run_ctx->saved_run_ctx);
+
+ update_prog_stats(prog, start);
+ migrate_enable();
+ rcu_read_unlock_trace();
+}
+
+static u64 notrace __bpf_prog_enter(struct bpf_prog *prog,
+ struct bpf_tramp_run_ctx *run_ctx)
+ __acquires(RCU)
+{
+ rcu_read_lock_dont_migrate();
+
+ run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
+
+ return bpf_prog_start_time();
+}
+
+static void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start,
+ struct bpf_tramp_run_ctx *run_ctx)
+ __releases(RCU)
+{
+ bpf_reset_run_ctx(run_ctx->saved_run_ctx);
+
+ update_prog_stats(prog, start);
+ rcu_read_unlock_migrate();
+}
+
+void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr)
+{
+ percpu_ref_get(&tr->pcref);
+}
+
+void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr)
+{
+ percpu_ref_put(&tr->pcref);
+}
+
+bpf_trampoline_enter_t bpf_trampoline_enter(const struct bpf_prog *prog)
+{
+ bool sleepable = prog->sleepable;
+
+ if (bpf_prog_check_recur(prog))
+ return sleepable ? __bpf_prog_enter_sleepable_recur :
+ __bpf_prog_enter_recur;
+
+ if (resolve_prog_type(prog) == BPF_PROG_TYPE_LSM &&
+ prog->expected_attach_type == BPF_LSM_CGROUP)
+ return __bpf_prog_enter_lsm_cgroup;
+
+ return sleepable ? __bpf_prog_enter_sleepable : __bpf_prog_enter;
+}
+
+bpf_trampoline_exit_t bpf_trampoline_exit(const struct bpf_prog *prog)
+{
+ bool sleepable = prog->sleepable;
+
+ if (bpf_prog_check_recur(prog))
+ return sleepable ? __bpf_prog_exit_sleepable_recur :
+ __bpf_prog_exit_recur;
+
+ if (resolve_prog_type(prog) == BPF_PROG_TYPE_LSM &&
+ prog->expected_attach_type == BPF_LSM_CGROUP)
+ return __bpf_prog_exit_lsm_cgroup;
+
+ return sleepable ? __bpf_prog_exit_sleepable : __bpf_prog_exit;
+}
+
+int __weak
+arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end,
+ const struct btf_func_model *m, u32 flags,
+ struct bpf_tramp_links *tlinks,
+ void *func_addr)
+{
+ return -ENOTSUPP;
+}
+
+void * __weak arch_alloc_bpf_trampoline(unsigned int size)
+{
+ void *image;
+
+ if (WARN_ON_ONCE(size > PAGE_SIZE))
+ return NULL;
+ image = bpf_jit_alloc_exec(PAGE_SIZE);
+ if (image)
+ set_vm_flush_reset_perms(image);
+ return image;
+}
+
+void __weak arch_free_bpf_trampoline(void *image, unsigned int size)
+{
+ WARN_ON_ONCE(size > PAGE_SIZE);
+ /* bpf_jit_free_exec doesn't need "size", but
+ * bpf_prog_pack_free() needs it.
+ */
+ bpf_jit_free_exec(image);
+}
+
+int __weak arch_protect_bpf_trampoline(void *image, unsigned int size)
+{
+ WARN_ON_ONCE(size > PAGE_SIZE);
+ return set_memory_rox((long)image, 1);
+}
+
+int __weak arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
+ struct bpf_tramp_links *tlinks, void *func_addr)
+{
+ return -ENOTSUPP;
+}
+
+static int __init init_trampolines(void)
+{
+ int i;
+
+ for (i = 0; i < TRAMPOLINE_TABLE_SIZE; i++)
+ INIT_HLIST_HEAD(&trampoline_table[i]);
+ return 0;
+}
+late_initcall(init_trampolines);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 47dcd3aa6e23..f0ca69f888fa 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1,22 +1,57 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
+ * Copyright (c) 2016 Facebook
+ * Copyright (c) 2018 Covalent IO, Inc. http://covalent.io
*/
+#include <uapi/linux/btf.h>
+#include <linux/bpf-cgroup.h>
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/bpf_verifier.h>
#include <linux/filter.h>
#include <net/netlink.h>
#include <linux/file.h>
#include <linux/vmalloc.h>
+#include <linux/stringify.h>
+#include <linux/bsearch.h>
+#include <linux/sort.h>
+#include <linux/perf_event.h>
+#include <linux/ctype.h>
+#include <linux/error-injection.h>
+#include <linux/bpf_lsm.h>
+#include <linux/btf_ids.h>
+#include <linux/poison.h>
+#include <linux/module.h>
+#include <linux/cpumask.h>
+#include <linux/bpf_mem_alloc.h>
+#include <net/xdp.h>
+#include <linux/trace_events.h>
+#include <linux/kallsyms.h>
+
+#include "disasm.h"
+
+static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
+#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
+ [_id] = & _name ## _verifier_ops,
+#define BPF_MAP_TYPE(_id, _ops)
+#define BPF_LINK_TYPE(_id, _name)
+#include <linux/bpf_types.h>
+#undef BPF_PROG_TYPE
+#undef BPF_MAP_TYPE
+#undef BPF_LINK_TYPE
+};
+
+enum bpf_features {
+ BPF_FEAT_RDONLY_CAST_TO_VOID = 0,
+ BPF_FEAT_STREAMS = 1,
+ __MAX_BPF_FEAT,
+};
+
+struct bpf_mem_alloc bpf_global_percpu_ma;
+static bool bpf_global_percpu_ma_set;
/* bpf_check() is a static code analyzer that walks eBPF program
* instruction by instruction and updates register/stack state.
@@ -29,8 +64,8 @@
* - unreachable insns exist (shouldn't be a forest. program = one function)
* - out of bounds or malformed jumps
* The second pass is all possible path descent from the 1st insn.
- * Since it's analyzing all pathes through the program, the length of the
- * analysis is limited to 32k insn, which may be hit even if total number of
+ * Since it's analyzing all paths through the program, the length of the
+ * analysis is limited to 64k insn, which may be hit even if total number of
* insn is less then 4K, but there are too many branches that change stack/regs.
* Number of 'branches to be analyzed' is limited to 1k
*
@@ -58,13 +93,13 @@
* (and -20 constant is saved for further stack bounds checking).
* Meaning that this reg is a pointer to stack plus known immediate constant.
*
- * Most of the time the registers have UNKNOWN_VALUE type, which
+ * Most of the time the registers have SCALAR_VALUE type, which
* means the register has some value, but it's not a valid pointer.
- * (like pointer plus pointer becomes UNKNOWN_VALUE type)
+ * (like pointer plus pointer becomes SCALAR_VALUE type)
*
* When verifier sees load or store instructions the type of base register
- * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, FRAME_PTR. These are three pointer
- * types recognized by check_mem_access() function.
+ * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, PTR_TO_STACK, PTR_TO_SOCKET. These are
+ * four pointer types recognized by check_mem_access() function.
*
* PTR_TO_MAP_VALUE means that this register is pointing to 'map element value'
* and the range of [ptr, ptr + map's value_size) is accessible.
@@ -114,7 +149,7 @@
* If it's ok, then verifier allows this BPF_CALL insn and looks at
* .ret_type which is RET_PTR_TO_MAP_VALUE_OR_NULL, so it sets
* R0->type = PTR_TO_MAP_VALUE_OR_NULL which means bpf_map_lookup_elem() function
- * returns ether pointer to map value or NULL.
+ * returns either pointer to map value or NULL.
*
* When type PTR_TO_MAP_VALUE_OR_NULL passes through 'if (reg != 0) goto +off'
* insn, the register holding that pointer in the true branch changes state to
@@ -123,335 +158,1998 @@
*
* After the call R0 is set to return type of the function and registers R1-R5
* are set to NOT_INIT to indicate that they are no longer readable.
+ *
+ * The following reference types represent a potential reference to a kernel
+ * resource which, after first being allocated, must be checked and freed by
+ * the BPF program:
+ * - PTR_TO_SOCKET_OR_NULL, PTR_TO_SOCKET
+ *
+ * When the verifier sees a helper call return a reference type, it allocates a
+ * pointer id for the reference and stores it in the current function state.
+ * Similar to the way that PTR_TO_MAP_VALUE_OR_NULL is converted into
+ * PTR_TO_MAP_VALUE, PTR_TO_SOCKET_OR_NULL becomes PTR_TO_SOCKET when the type
+ * passes through a NULL-check conditional. For the branch wherein the state is
+ * changed to CONST_IMM, the verifier releases the reference.
+ *
+ * For each helper function that allocates a reference, such as
+ * bpf_sk_lookup_tcp(), there is a corresponding release function, such as
+ * bpf_sk_release(). When a reference type passes into the release function,
+ * the verifier also releases the reference. If any unchecked or unreleased
+ * reference remains at the end of the program, the verifier rejects it.
*/
-/* types of values stored in eBPF registers */
-enum bpf_reg_type {
- NOT_INIT = 0, /* nothing was written into register */
- UNKNOWN_VALUE, /* reg doesn't contain a valid pointer */
- PTR_TO_CTX, /* reg points to bpf_context */
- CONST_PTR_TO_MAP, /* reg points to struct bpf_map */
- PTR_TO_MAP_VALUE, /* reg points to map element value */
- PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */
- FRAME_PTR, /* reg == frame_pointer */
- PTR_TO_STACK, /* reg == frame_pointer + imm */
- CONST_IMM, /* constant integer value */
+/* verifier_state + insn_idx are pushed to stack when branch is encountered */
+struct bpf_verifier_stack_elem {
+ /* verifier state is 'st'
+ * before processing instruction 'insn_idx'
+ * and after processing instruction 'prev_insn_idx'
+ */
+ struct bpf_verifier_state st;
+ int insn_idx;
+ int prev_insn_idx;
+ struct bpf_verifier_stack_elem *next;
+ /* length of verifier log at the time this state was pushed on stack */
+ u32 log_pos;
};
-struct reg_state {
- enum bpf_reg_type type;
- union {
- /* valid when type == CONST_IMM | PTR_TO_STACK */
- int imm;
+#define BPF_COMPLEXITY_LIMIT_JMP_SEQ 8192
+#define BPF_COMPLEXITY_LIMIT_STATES 64
- /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE |
- * PTR_TO_MAP_VALUE_OR_NULL
- */
- struct bpf_map *map_ptr;
- };
-};
+#define BPF_MAP_KEY_POISON (1ULL << 63)
+#define BPF_MAP_KEY_SEEN (1ULL << 62)
-enum bpf_stack_slot_type {
- STACK_INVALID, /* nothing was stored in this stack slot */
- STACK_SPILL, /* register spilled into stack */
- STACK_MISC /* BPF program wrote some data into this slot */
-};
+#define BPF_GLOBAL_PERCPU_MA_MAX_SIZE 512
-#define BPF_REG_SIZE 8 /* size of eBPF register in bytes */
+#define BPF_PRIV_STACK_MIN_SIZE 64
-/* state of the program:
- * type of all registers and stack info
- */
-struct verifier_state {
- struct reg_state regs[MAX_BPF_REG];
- u8 stack_slot_type[MAX_BPF_STACK];
- struct reg_state spilled_regs[MAX_BPF_STACK / BPF_REG_SIZE];
-};
+static int acquire_reference(struct bpf_verifier_env *env, int insn_idx);
+static int release_reference_nomark(struct bpf_verifier_state *state, int ref_obj_id);
+static int release_reference(struct bpf_verifier_env *env, int ref_obj_id);
+static void invalidate_non_owning_refs(struct bpf_verifier_env *env);
+static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env);
+static int ref_set_non_owning(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg);
+static bool is_trusted_reg(const struct bpf_reg_state *reg);
-/* linked list of verifier states used to prune search */
-struct verifier_state_list {
- struct verifier_state state;
- struct verifier_state_list *next;
+static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux)
+{
+ return aux->map_ptr_state.poison;
+}
+
+static bool bpf_map_ptr_unpriv(const struct bpf_insn_aux_data *aux)
+{
+ return aux->map_ptr_state.unpriv;
+}
+
+static void bpf_map_ptr_store(struct bpf_insn_aux_data *aux,
+ struct bpf_map *map,
+ bool unpriv, bool poison)
+{
+ unpriv |= bpf_map_ptr_unpriv(aux);
+ aux->map_ptr_state.unpriv = unpriv;
+ aux->map_ptr_state.poison = poison;
+ aux->map_ptr_state.map_ptr = map;
+}
+
+static bool bpf_map_key_poisoned(const struct bpf_insn_aux_data *aux)
+{
+ return aux->map_key_state & BPF_MAP_KEY_POISON;
+}
+
+static bool bpf_map_key_unseen(const struct bpf_insn_aux_data *aux)
+{
+ return !(aux->map_key_state & BPF_MAP_KEY_SEEN);
+}
+
+static u64 bpf_map_key_immediate(const struct bpf_insn_aux_data *aux)
+{
+ return aux->map_key_state & ~(BPF_MAP_KEY_SEEN | BPF_MAP_KEY_POISON);
+}
+
+static void bpf_map_key_store(struct bpf_insn_aux_data *aux, u64 state)
+{
+ bool poisoned = bpf_map_key_poisoned(aux);
+
+ aux->map_key_state = state | BPF_MAP_KEY_SEEN |
+ (poisoned ? BPF_MAP_KEY_POISON : 0ULL);
+}
+
+static bool bpf_helper_call(const struct bpf_insn *insn)
+{
+ return insn->code == (BPF_JMP | BPF_CALL) &&
+ insn->src_reg == 0;
+}
+
+static bool bpf_pseudo_call(const struct bpf_insn *insn)
+{
+ return insn->code == (BPF_JMP | BPF_CALL) &&
+ insn->src_reg == BPF_PSEUDO_CALL;
+}
+
+static bool bpf_pseudo_kfunc_call(const struct bpf_insn *insn)
+{
+ return insn->code == (BPF_JMP | BPF_CALL) &&
+ insn->src_reg == BPF_PSEUDO_KFUNC_CALL;
+}
+
+struct bpf_call_arg_meta {
+ struct bpf_map *map_ptr;
+ bool raw_mode;
+ bool pkt_access;
+ u8 release_regno;
+ int regno;
+ int access_size;
+ int mem_size;
+ u64 msize_max_value;
+ int ref_obj_id;
+ int dynptr_id;
+ int map_uid;
+ int func_id;
+ struct btf *btf;
+ u32 btf_id;
+ struct btf *ret_btf;
+ u32 ret_btf_id;
+ u32 subprogno;
+ struct btf_field *kptr_field;
+ s64 const_map_key;
};
-/* verifier_state + insn_idx are pushed to stack when branch is encountered */
-struct verifier_stack_elem {
- /* verifer state is 'st'
- * before processing instruction 'insn_idx'
- * and after processing instruction 'prev_insn_idx'
+struct bpf_kfunc_call_arg_meta {
+ /* In parameters */
+ struct btf *btf;
+ u32 func_id;
+ u32 kfunc_flags;
+ const struct btf_type *func_proto;
+ const char *func_name;
+ /* Out parameters */
+ u32 ref_obj_id;
+ u8 release_regno;
+ bool r0_rdonly;
+ u32 ret_btf_id;
+ u64 r0_size;
+ u32 subprogno;
+ struct {
+ u64 value;
+ bool found;
+ } arg_constant;
+
+ /* arg_{btf,btf_id,owning_ref} are used by kfunc-specific handling,
+ * generally to pass info about user-defined local kptr types to later
+ * verification logic
+ * bpf_obj_drop/bpf_percpu_obj_drop
+ * Record the local kptr type to be drop'd
+ * bpf_refcount_acquire (via KF_ARG_PTR_TO_REFCOUNTED_KPTR arg type)
+ * Record the local kptr type to be refcount_incr'd and use
+ * arg_owning_ref to determine whether refcount_acquire should be
+ * fallible
*/
- struct verifier_state st;
- int insn_idx;
- int prev_insn_idx;
- struct verifier_stack_elem *next;
+ struct btf *arg_btf;
+ u32 arg_btf_id;
+ bool arg_owning_ref;
+ bool arg_prog;
+
+ struct {
+ struct btf_field *field;
+ } arg_list_head;
+ struct {
+ struct btf_field *field;
+ } arg_rbtree_root;
+ struct {
+ enum bpf_dynptr_type type;
+ u32 id;
+ u32 ref_obj_id;
+ } initialized_dynptr;
+ struct {
+ u8 spi;
+ u8 frameno;
+ } iter;
+ struct {
+ struct bpf_map *ptr;
+ int uid;
+ } map;
+ u64 mem_size;
};
-#define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */
+struct btf *btf_vmlinux;
-/* single container for all structs
- * one verifier_env per bpf_check() call
- */
-struct verifier_env {
- struct bpf_prog *prog; /* eBPF program being verified */
- struct verifier_stack_elem *head; /* stack of verifier states to be processed */
- int stack_size; /* number of states to be processed */
- struct verifier_state cur_state; /* current verifier state */
- struct verifier_state_list **explored_states; /* search pruning optimization */
- struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */
- u32 used_map_cnt; /* number of used maps */
-};
-
-/* verbose verifier prints what it's seeing
- * bpf_check() is called under lock, so no race to access these global vars
- */
-static u32 log_level, log_size, log_len;
-static char *log_buf;
+static const char *btf_type_name(const struct btf *btf, u32 id)
+{
+ return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off);
+}
static DEFINE_MUTEX(bpf_verifier_lock);
+static DEFINE_MUTEX(bpf_percpu_ma_lock);
-/* log_level controls verbosity level of eBPF verifier.
- * verbose() is used to dump the verification trace to the log, so the user
- * can figure out what's wrong with the program
- */
-static void verbose(const char *fmt, ...)
+__printf(2, 3) static void verbose(void *private_data, const char *fmt, ...)
{
+ struct bpf_verifier_env *env = private_data;
va_list args;
- if (log_level == 0 || log_len >= log_size - 1)
+ if (!bpf_verifier_log_needed(&env->log))
return;
va_start(args, fmt);
- log_len += vscnprintf(log_buf + log_len, log_size - log_len, fmt, args);
+ bpf_verifier_vlog(&env->log, fmt, args);
va_end(args);
}
-/* string representation of 'enum bpf_reg_type' */
-static const char * const reg_type_str[] = {
- [NOT_INIT] = "?",
- [UNKNOWN_VALUE] = "inv",
- [PTR_TO_CTX] = "ctx",
- [CONST_PTR_TO_MAP] = "map_ptr",
- [PTR_TO_MAP_VALUE] = "map_value",
- [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null",
- [FRAME_PTR] = "fp",
- [PTR_TO_STACK] = "fp",
- [CONST_IMM] = "imm",
-};
+static void verbose_invalid_scalar(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg,
+ struct bpf_retval_range range, const char *ctx,
+ const char *reg_name)
+{
+ bool unknown = true;
-static void print_verifier_state(struct verifier_env *env)
+ verbose(env, "%s the register %s has", ctx, reg_name);
+ if (reg->smin_value > S64_MIN) {
+ verbose(env, " smin=%lld", reg->smin_value);
+ unknown = false;
+ }
+ if (reg->smax_value < S64_MAX) {
+ verbose(env, " smax=%lld", reg->smax_value);
+ unknown = false;
+ }
+ if (unknown)
+ verbose(env, " unknown scalar value");
+ verbose(env, " should have been in [%d, %d]\n", range.minval, range.maxval);
+}
+
+static bool reg_not_null(const struct bpf_reg_state *reg)
{
- enum bpf_reg_type t;
- int i;
+ enum bpf_reg_type type;
- for (i = 0; i < MAX_BPF_REG; i++) {
- t = env->cur_state.regs[i].type;
- if (t == NOT_INIT)
- continue;
- verbose(" R%d=%s", i, reg_type_str[t]);
- if (t == CONST_IMM || t == PTR_TO_STACK)
- verbose("%d", env->cur_state.regs[i].imm);
- else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE ||
- t == PTR_TO_MAP_VALUE_OR_NULL)
- verbose("(ks=%d,vs=%d)",
- env->cur_state.regs[i].map_ptr->key_size,
- env->cur_state.regs[i].map_ptr->value_size);
- }
- for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
- if (env->cur_state.stack_slot_type[i] == STACK_SPILL)
- verbose(" fp%d=%s", -MAX_BPF_STACK + i,
- reg_type_str[env->cur_state.spilled_regs[i / BPF_REG_SIZE].type]);
- }
- verbose("\n");
-}
-
-static const char *const bpf_class_string[] = {
- [BPF_LD] = "ld",
- [BPF_LDX] = "ldx",
- [BPF_ST] = "st",
- [BPF_STX] = "stx",
- [BPF_ALU] = "alu",
- [BPF_JMP] = "jmp",
- [BPF_RET] = "BUG",
- [BPF_ALU64] = "alu64",
-};
+ type = reg->type;
+ if (type_may_be_null(type))
+ return false;
-static const char *const bpf_alu_string[] = {
- [BPF_ADD >> 4] = "+=",
- [BPF_SUB >> 4] = "-=",
- [BPF_MUL >> 4] = "*=",
- [BPF_DIV >> 4] = "/=",
- [BPF_OR >> 4] = "|=",
- [BPF_AND >> 4] = "&=",
- [BPF_LSH >> 4] = "<<=",
- [BPF_RSH >> 4] = ">>=",
- [BPF_NEG >> 4] = "neg",
- [BPF_MOD >> 4] = "%=",
- [BPF_XOR >> 4] = "^=",
- [BPF_MOV >> 4] = "=",
- [BPF_ARSH >> 4] = "s>>=",
- [BPF_END >> 4] = "endian",
-};
+ type = base_type(type);
+ return type == PTR_TO_SOCKET ||
+ type == PTR_TO_TCP_SOCK ||
+ type == PTR_TO_MAP_VALUE ||
+ type == PTR_TO_MAP_KEY ||
+ type == PTR_TO_SOCK_COMMON ||
+ (type == PTR_TO_BTF_ID && is_trusted_reg(reg)) ||
+ (type == PTR_TO_MEM && !(reg->type & PTR_UNTRUSTED)) ||
+ type == CONST_PTR_TO_MAP;
+}
-static const char *const bpf_ldst_string[] = {
- [BPF_W >> 3] = "u32",
- [BPF_H >> 3] = "u16",
- [BPF_B >> 3] = "u8",
- [BPF_DW >> 3] = "u64",
-};
+static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg)
+{
+ struct btf_record *rec = NULL;
+ struct btf_struct_meta *meta;
+
+ if (reg->type == PTR_TO_MAP_VALUE) {
+ rec = reg->map_ptr->record;
+ } else if (type_is_ptr_alloc_obj(reg->type)) {
+ meta = btf_find_struct_meta(reg->btf, reg->btf_id);
+ if (meta)
+ rec = meta->record;
+ }
+ return rec;
+}
-static const char *const bpf_jmp_string[] = {
- [BPF_JA >> 4] = "jmp",
- [BPF_JEQ >> 4] = "==",
- [BPF_JGT >> 4] = ">",
- [BPF_JGE >> 4] = ">=",
- [BPF_JSET >> 4] = "&",
- [BPF_JNE >> 4] = "!=",
- [BPF_JSGT >> 4] = "s>",
- [BPF_JSGE >> 4] = "s>=",
- [BPF_CALL >> 4] = "call",
- [BPF_EXIT >> 4] = "exit",
-};
+static bool subprog_is_global(const struct bpf_verifier_env *env, int subprog)
+{
+ struct bpf_func_info_aux *aux = env->prog->aux->func_info_aux;
+
+ return aux && aux[subprog].linkage == BTF_FUNC_GLOBAL;
+}
-static void print_bpf_insn(struct bpf_insn *insn)
+static const char *subprog_name(const struct bpf_verifier_env *env, int subprog)
{
- u8 class = BPF_CLASS(insn->code);
+ struct bpf_func_info *info;
- if (class == BPF_ALU || class == BPF_ALU64) {
- if (BPF_SRC(insn->code) == BPF_X)
- verbose("(%02x) %sr%d %s %sr%d\n",
- insn->code, class == BPF_ALU ? "(u32) " : "",
- insn->dst_reg,
- bpf_alu_string[BPF_OP(insn->code) >> 4],
- class == BPF_ALU ? "(u32) " : "",
- insn->src_reg);
- else
- verbose("(%02x) %sr%d %s %s%d\n",
- insn->code, class == BPF_ALU ? "(u32) " : "",
- insn->dst_reg,
- bpf_alu_string[BPF_OP(insn->code) >> 4],
- class == BPF_ALU ? "(u32) " : "",
- insn->imm);
- } else if (class == BPF_STX) {
- if (BPF_MODE(insn->code) == BPF_MEM)
- verbose("(%02x) *(%s *)(r%d %+d) = r%d\n",
- insn->code,
- bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
- insn->dst_reg,
- insn->off, insn->src_reg);
- else if (BPF_MODE(insn->code) == BPF_XADD)
- verbose("(%02x) lock *(%s *)(r%d %+d) += r%d\n",
- insn->code,
- bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
- insn->dst_reg, insn->off,
- insn->src_reg);
+ if (!env->prog->aux->func_info)
+ return "";
+
+ info = &env->prog->aux->func_info[subprog];
+ return btf_type_name(env->prog->aux->btf, info->type_id);
+}
+
+static void mark_subprog_exc_cb(struct bpf_verifier_env *env, int subprog)
+{
+ struct bpf_subprog_info *info = subprog_info(env, subprog);
+
+ info->is_cb = true;
+ info->is_async_cb = true;
+ info->is_exception_cb = true;
+}
+
+static bool subprog_is_exc_cb(struct bpf_verifier_env *env, int subprog)
+{
+ return subprog_info(env, subprog)->is_exception_cb;
+}
+
+static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
+{
+ return btf_record_has_field(reg_btf_record(reg), BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK);
+}
+
+static bool type_is_rdonly_mem(u32 type)
+{
+ return type & MEM_RDONLY;
+}
+
+static bool is_acquire_function(enum bpf_func_id func_id,
+ const struct bpf_map *map)
+{
+ enum bpf_map_type map_type = map ? map->map_type : BPF_MAP_TYPE_UNSPEC;
+
+ if (func_id == BPF_FUNC_sk_lookup_tcp ||
+ func_id == BPF_FUNC_sk_lookup_udp ||
+ func_id == BPF_FUNC_skc_lookup_tcp ||
+ func_id == BPF_FUNC_ringbuf_reserve ||
+ func_id == BPF_FUNC_kptr_xchg)
+ return true;
+
+ if (func_id == BPF_FUNC_map_lookup_elem &&
+ (map_type == BPF_MAP_TYPE_SOCKMAP ||
+ map_type == BPF_MAP_TYPE_SOCKHASH))
+ return true;
+
+ return false;
+}
+
+static bool is_ptr_cast_function(enum bpf_func_id func_id)
+{
+ return func_id == BPF_FUNC_tcp_sock ||
+ func_id == BPF_FUNC_sk_fullsock ||
+ func_id == BPF_FUNC_skc_to_tcp_sock ||
+ func_id == BPF_FUNC_skc_to_tcp6_sock ||
+ func_id == BPF_FUNC_skc_to_udp6_sock ||
+ func_id == BPF_FUNC_skc_to_mptcp_sock ||
+ func_id == BPF_FUNC_skc_to_tcp_timewait_sock ||
+ func_id == BPF_FUNC_skc_to_tcp_request_sock;
+}
+
+static bool is_dynptr_ref_function(enum bpf_func_id func_id)
+{
+ return func_id == BPF_FUNC_dynptr_data;
+}
+
+static bool is_sync_callback_calling_kfunc(u32 btf_id);
+static bool is_async_callback_calling_kfunc(u32 btf_id);
+static bool is_callback_calling_kfunc(u32 btf_id);
+static bool is_bpf_throw_kfunc(struct bpf_insn *insn);
+
+static bool is_bpf_wq_set_callback_impl_kfunc(u32 btf_id);
+static bool is_task_work_add_kfunc(u32 func_id);
+
+static bool is_sync_callback_calling_function(enum bpf_func_id func_id)
+{
+ return func_id == BPF_FUNC_for_each_map_elem ||
+ func_id == BPF_FUNC_find_vma ||
+ func_id == BPF_FUNC_loop ||
+ func_id == BPF_FUNC_user_ringbuf_drain;
+}
+
+static bool is_async_callback_calling_function(enum bpf_func_id func_id)
+{
+ return func_id == BPF_FUNC_timer_set_callback;
+}
+
+static bool is_callback_calling_function(enum bpf_func_id func_id)
+{
+ return is_sync_callback_calling_function(func_id) ||
+ is_async_callback_calling_function(func_id);
+}
+
+static bool is_sync_callback_calling_insn(struct bpf_insn *insn)
+{
+ return (bpf_helper_call(insn) && is_sync_callback_calling_function(insn->imm)) ||
+ (bpf_pseudo_kfunc_call(insn) && is_sync_callback_calling_kfunc(insn->imm));
+}
+
+static bool is_async_callback_calling_insn(struct bpf_insn *insn)
+{
+ return (bpf_helper_call(insn) && is_async_callback_calling_function(insn->imm)) ||
+ (bpf_pseudo_kfunc_call(insn) && is_async_callback_calling_kfunc(insn->imm));
+}
+
+static bool is_async_cb_sleepable(struct bpf_verifier_env *env, struct bpf_insn *insn)
+{
+ /* bpf_timer callbacks are never sleepable. */
+ if (bpf_helper_call(insn) && insn->imm == BPF_FUNC_timer_set_callback)
+ return false;
+
+ /* bpf_wq and bpf_task_work callbacks are always sleepable. */
+ if (bpf_pseudo_kfunc_call(insn) && insn->off == 0 &&
+ (is_bpf_wq_set_callback_impl_kfunc(insn->imm) || is_task_work_add_kfunc(insn->imm)))
+ return true;
+
+ verifier_bug(env, "unhandled async callback in is_async_cb_sleepable");
+ return false;
+}
+
+static bool is_may_goto_insn(struct bpf_insn *insn)
+{
+ return insn->code == (BPF_JMP | BPF_JCOND) && insn->src_reg == BPF_MAY_GOTO;
+}
+
+static bool is_may_goto_insn_at(struct bpf_verifier_env *env, int insn_idx)
+{
+ return is_may_goto_insn(&env->prog->insnsi[insn_idx]);
+}
+
+static bool is_storage_get_function(enum bpf_func_id func_id)
+{
+ return func_id == BPF_FUNC_sk_storage_get ||
+ func_id == BPF_FUNC_inode_storage_get ||
+ func_id == BPF_FUNC_task_storage_get ||
+ func_id == BPF_FUNC_cgrp_storage_get;
+}
+
+static bool helper_multiple_ref_obj_use(enum bpf_func_id func_id,
+ const struct bpf_map *map)
+{
+ int ref_obj_uses = 0;
+
+ if (is_ptr_cast_function(func_id))
+ ref_obj_uses++;
+ if (is_acquire_function(func_id, map))
+ ref_obj_uses++;
+ if (is_dynptr_ref_function(func_id))
+ ref_obj_uses++;
+
+ return ref_obj_uses > 1;
+}
+
+static bool is_cmpxchg_insn(const struct bpf_insn *insn)
+{
+ return BPF_CLASS(insn->code) == BPF_STX &&
+ BPF_MODE(insn->code) == BPF_ATOMIC &&
+ insn->imm == BPF_CMPXCHG;
+}
+
+static bool is_atomic_load_insn(const struct bpf_insn *insn)
+{
+ return BPF_CLASS(insn->code) == BPF_STX &&
+ BPF_MODE(insn->code) == BPF_ATOMIC &&
+ insn->imm == BPF_LOAD_ACQ;
+}
+
+static int __get_spi(s32 off)
+{
+ return (-off - 1) / BPF_REG_SIZE;
+}
+
+static struct bpf_func_state *func(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg)
+{
+ struct bpf_verifier_state *cur = env->cur_state;
+
+ return cur->frame[reg->frameno];
+}
+
+static bool is_spi_bounds_valid(struct bpf_func_state *state, int spi, int nr_slots)
+{
+ int allocated_slots = state->allocated_stack / BPF_REG_SIZE;
+
+ /* We need to check that slots between [spi - nr_slots + 1, spi] are
+ * within [0, allocated_stack).
+ *
+ * Please note that the spi grows downwards. For example, a dynptr
+ * takes the size of two stack slots; the first slot will be at
+ * spi and the second slot will be at spi - 1.
+ */
+ return spi - nr_slots + 1 >= 0 && spi < allocated_slots;
+}
+
+static int stack_slot_obj_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ const char *obj_kind, int nr_slots)
+{
+ int off, spi;
+
+ if (!tnum_is_const(reg->var_off)) {
+ verbose(env, "%s has to be at a constant offset\n", obj_kind);
+ return -EINVAL;
+ }
+
+ off = reg->off + reg->var_off.value;
+ if (off % BPF_REG_SIZE) {
+ verbose(env, "cannot pass in %s at an offset=%d\n", obj_kind, off);
+ return -EINVAL;
+ }
+
+ spi = __get_spi(off);
+ if (spi + 1 < nr_slots) {
+ verbose(env, "cannot pass in %s at an offset=%d\n", obj_kind, off);
+ return -EINVAL;
+ }
+
+ if (!is_spi_bounds_valid(func(env, reg), spi, nr_slots))
+ return -ERANGE;
+ return spi;
+}
+
+static int dynptr_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ return stack_slot_obj_get_spi(env, reg, "dynptr", BPF_DYNPTR_NR_SLOTS);
+}
+
+static int iter_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int nr_slots)
+{
+ return stack_slot_obj_get_spi(env, reg, "iter", nr_slots);
+}
+
+static int irq_flag_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ return stack_slot_obj_get_spi(env, reg, "irq_flag", 1);
+}
+
+static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type)
+{
+ switch (arg_type & DYNPTR_TYPE_FLAG_MASK) {
+ case DYNPTR_TYPE_LOCAL:
+ return BPF_DYNPTR_TYPE_LOCAL;
+ case DYNPTR_TYPE_RINGBUF:
+ return BPF_DYNPTR_TYPE_RINGBUF;
+ case DYNPTR_TYPE_SKB:
+ return BPF_DYNPTR_TYPE_SKB;
+ case DYNPTR_TYPE_XDP:
+ return BPF_DYNPTR_TYPE_XDP;
+ case DYNPTR_TYPE_SKB_META:
+ return BPF_DYNPTR_TYPE_SKB_META;
+ case DYNPTR_TYPE_FILE:
+ return BPF_DYNPTR_TYPE_FILE;
+ default:
+ return BPF_DYNPTR_TYPE_INVALID;
+ }
+}
+
+static enum bpf_type_flag get_dynptr_type_flag(enum bpf_dynptr_type type)
+{
+ switch (type) {
+ case BPF_DYNPTR_TYPE_LOCAL:
+ return DYNPTR_TYPE_LOCAL;
+ case BPF_DYNPTR_TYPE_RINGBUF:
+ return DYNPTR_TYPE_RINGBUF;
+ case BPF_DYNPTR_TYPE_SKB:
+ return DYNPTR_TYPE_SKB;
+ case BPF_DYNPTR_TYPE_XDP:
+ return DYNPTR_TYPE_XDP;
+ case BPF_DYNPTR_TYPE_SKB_META:
+ return DYNPTR_TYPE_SKB_META;
+ case BPF_DYNPTR_TYPE_FILE:
+ return DYNPTR_TYPE_FILE;
+ default:
+ return 0;
+ }
+}
+
+static bool dynptr_type_refcounted(enum bpf_dynptr_type type)
+{
+ return type == BPF_DYNPTR_TYPE_RINGBUF || type == BPF_DYNPTR_TYPE_FILE;
+}
+
+static void __mark_dynptr_reg(struct bpf_reg_state *reg,
+ enum bpf_dynptr_type type,
+ bool first_slot, int dynptr_id);
+
+static void __mark_reg_not_init(const struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg);
+
+static void mark_dynptr_stack_regs(struct bpf_verifier_env *env,
+ struct bpf_reg_state *sreg1,
+ struct bpf_reg_state *sreg2,
+ enum bpf_dynptr_type type)
+{
+ int id = ++env->id_gen;
+
+ __mark_dynptr_reg(sreg1, type, true, id);
+ __mark_dynptr_reg(sreg2, type, false, id);
+}
+
+static void mark_dynptr_cb_reg(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg,
+ enum bpf_dynptr_type type)
+{
+ __mark_dynptr_reg(reg, type, true, ++env->id_gen);
+}
+
+static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env,
+ struct bpf_func_state *state, int spi);
+
+static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ enum bpf_arg_type arg_type, int insn_idx, int clone_ref_obj_id)
+{
+ struct bpf_func_state *state = func(env, reg);
+ enum bpf_dynptr_type type;
+ int spi, i, err;
+
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0)
+ return spi;
+
+ /* We cannot assume both spi and spi - 1 belong to the same dynptr,
+ * hence we need to call destroy_if_dynptr_stack_slot twice for both,
+ * to ensure that for the following example:
+ * [d1][d1][d2][d2]
+ * spi 3 2 1 0
+ * So marking spi = 2 should lead to destruction of both d1 and d2. In
+ * case they do belong to same dynptr, second call won't see slot_type
+ * as STACK_DYNPTR and will simply skip destruction.
+ */
+ err = destroy_if_dynptr_stack_slot(env, state, spi);
+ if (err)
+ return err;
+ err = destroy_if_dynptr_stack_slot(env, state, spi - 1);
+ if (err)
+ return err;
+
+ for (i = 0; i < BPF_REG_SIZE; i++) {
+ state->stack[spi].slot_type[i] = STACK_DYNPTR;
+ state->stack[spi - 1].slot_type[i] = STACK_DYNPTR;
+ }
+
+ type = arg_to_dynptr_type(arg_type);
+ if (type == BPF_DYNPTR_TYPE_INVALID)
+ return -EINVAL;
+
+ mark_dynptr_stack_regs(env, &state->stack[spi].spilled_ptr,
+ &state->stack[spi - 1].spilled_ptr, type);
+
+ if (dynptr_type_refcounted(type)) {
+ /* The id is used to track proper releasing */
+ int id;
+
+ if (clone_ref_obj_id)
+ id = clone_ref_obj_id;
else
- verbose("BUG_%02x\n", insn->code);
- } else if (class == BPF_ST) {
- if (BPF_MODE(insn->code) != BPF_MEM) {
- verbose("BUG_st_%02x\n", insn->code);
- return;
+ id = acquire_reference(env, insn_idx);
+
+ if (id < 0)
+ return id;
+
+ state->stack[spi].spilled_ptr.ref_obj_id = id;
+ state->stack[spi - 1].spilled_ptr.ref_obj_id = id;
+ }
+
+ bpf_mark_stack_write(env, state->frameno, BIT(spi - 1) | BIT(spi));
+
+ return 0;
+}
+
+static void invalidate_dynptr(struct bpf_verifier_env *env, struct bpf_func_state *state, int spi)
+{
+ int i;
+
+ for (i = 0; i < BPF_REG_SIZE; i++) {
+ state->stack[spi].slot_type[i] = STACK_INVALID;
+ state->stack[spi - 1].slot_type[i] = STACK_INVALID;
+ }
+
+ __mark_reg_not_init(env, &state->stack[spi].spilled_ptr);
+ __mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr);
+
+ bpf_mark_stack_write(env, state->frameno, BIT(spi - 1) | BIT(spi));
+}
+
+static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int spi, ref_obj_id, i;
+
+ /*
+ * This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot
+ * be released by any dynptr helper. Hence, unmark_stack_slots_dynptr
+ * is safe to do directly.
+ */
+ if (reg->type == CONST_PTR_TO_DYNPTR) {
+ verifier_bug(env, "CONST_PTR_TO_DYNPTR cannot be released");
+ return -EFAULT;
+ }
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0)
+ return spi;
+
+ if (!dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) {
+ invalidate_dynptr(env, state, spi);
+ return 0;
+ }
+
+ ref_obj_id = state->stack[spi].spilled_ptr.ref_obj_id;
+
+ /* If the dynptr has a ref_obj_id, then we need to invalidate
+ * two things:
+ *
+ * 1) Any dynptrs with a matching ref_obj_id (clones)
+ * 2) Any slices derived from this dynptr.
+ */
+
+ /* Invalidate any slices associated with this dynptr */
+ WARN_ON_ONCE(release_reference(env, ref_obj_id));
+
+ /* Invalidate any dynptr clones */
+ for (i = 1; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+ if (state->stack[i].spilled_ptr.ref_obj_id != ref_obj_id)
+ continue;
+
+ /* it should always be the case that if the ref obj id
+ * matches then the stack slot also belongs to a
+ * dynptr
+ */
+ if (state->stack[i].slot_type[0] != STACK_DYNPTR) {
+ verifier_bug(env, "misconfigured ref_obj_id");
+ return -EFAULT;
}
- verbose("(%02x) *(%s *)(r%d %+d) = %d\n",
- insn->code,
- bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
- insn->dst_reg,
- insn->off, insn->imm);
- } else if (class == BPF_LDX) {
- if (BPF_MODE(insn->code) != BPF_MEM) {
- verbose("BUG_ldx_%02x\n", insn->code);
- return;
+ if (state->stack[i].spilled_ptr.dynptr.first_slot)
+ invalidate_dynptr(env, state, i);
+ }
+
+ return 0;
+}
+
+static void __mark_reg_unknown(const struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg);
+
+static void mark_reg_invalid(const struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ if (!env->allow_ptr_leaks)
+ __mark_reg_not_init(env, reg);
+ else
+ __mark_reg_unknown(env, reg);
+}
+
+static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env,
+ struct bpf_func_state *state, int spi)
+{
+ struct bpf_func_state *fstate;
+ struct bpf_reg_state *dreg;
+ int i, dynptr_id;
+
+ /* We always ensure that STACK_DYNPTR is never set partially,
+ * hence just checking for slot_type[0] is enough. This is
+ * different for STACK_SPILL, where it may be only set for
+ * 1 byte, so code has to use is_spilled_reg.
+ */
+ if (state->stack[spi].slot_type[0] != STACK_DYNPTR)
+ return 0;
+
+ /* Reposition spi to first slot */
+ if (!state->stack[spi].spilled_ptr.dynptr.first_slot)
+ spi = spi + 1;
+
+ if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) {
+ verbose(env, "cannot overwrite referenced dynptr\n");
+ return -EINVAL;
+ }
+
+ mark_stack_slot_scratched(env, spi);
+ mark_stack_slot_scratched(env, spi - 1);
+
+ /* Writing partially to one dynptr stack slot destroys both. */
+ for (i = 0; i < BPF_REG_SIZE; i++) {
+ state->stack[spi].slot_type[i] = STACK_INVALID;
+ state->stack[spi - 1].slot_type[i] = STACK_INVALID;
+ }
+
+ dynptr_id = state->stack[spi].spilled_ptr.id;
+ /* Invalidate any slices associated with this dynptr */
+ bpf_for_each_reg_in_vstate(env->cur_state, fstate, dreg, ({
+ /* Dynptr slices are only PTR_TO_MEM_OR_NULL and PTR_TO_MEM */
+ if (dreg->type != (PTR_TO_MEM | PTR_MAYBE_NULL) && dreg->type != PTR_TO_MEM)
+ continue;
+ if (dreg->dynptr_id == dynptr_id)
+ mark_reg_invalid(env, dreg);
+ }));
+
+ /* Do not release reference state, we are destroying dynptr on stack,
+ * not using some helper to release it. Just reset register.
+ */
+ __mark_reg_not_init(env, &state->stack[spi].spilled_ptr);
+ __mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr);
+
+ bpf_mark_stack_write(env, state->frameno, BIT(spi - 1) | BIT(spi));
+
+ return 0;
+}
+
+static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ int spi;
+
+ if (reg->type == CONST_PTR_TO_DYNPTR)
+ return false;
+
+ spi = dynptr_get_spi(env, reg);
+
+ /* -ERANGE (i.e. spi not falling into allocated stack slots) isn't an
+ * error because this just means the stack state hasn't been updated yet.
+ * We will do check_mem_access to check and update stack bounds later.
+ */
+ if (spi < 0 && spi != -ERANGE)
+ return false;
+
+ /* We don't need to check if the stack slots are marked by previous
+ * dynptr initializations because we allow overwriting existing unreferenced
+ * STACK_DYNPTR slots, see mark_stack_slots_dynptr which calls
+ * destroy_if_dynptr_stack_slot to ensure dynptr objects at the slots we are
+ * touching are completely destructed before we reinitialize them for a new
+ * one. For referenced ones, destroy_if_dynptr_stack_slot returns an error early
+ * instead of delaying it until the end where the user will get "Unreleased
+ * reference" error.
+ */
+ return true;
+}
+
+static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int i, spi;
+
+ /* This already represents first slot of initialized bpf_dynptr.
+ *
+ * CONST_PTR_TO_DYNPTR already has fixed and var_off as 0 due to
+ * check_func_arg_reg_off's logic, so we don't need to check its
+ * offset and alignment.
+ */
+ if (reg->type == CONST_PTR_TO_DYNPTR)
+ return true;
+
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0)
+ return false;
+ if (!state->stack[spi].spilled_ptr.dynptr.first_slot)
+ return false;
+
+ for (i = 0; i < BPF_REG_SIZE; i++) {
+ if (state->stack[spi].slot_type[i] != STACK_DYNPTR ||
+ state->stack[spi - 1].slot_type[i] != STACK_DYNPTR)
+ return false;
+ }
+
+ return true;
+}
+
+static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ enum bpf_arg_type arg_type)
+{
+ struct bpf_func_state *state = func(env, reg);
+ enum bpf_dynptr_type dynptr_type;
+ int spi;
+
+ /* ARG_PTR_TO_DYNPTR takes any type of dynptr */
+ if (arg_type == ARG_PTR_TO_DYNPTR)
+ return true;
+
+ dynptr_type = arg_to_dynptr_type(arg_type);
+ if (reg->type == CONST_PTR_TO_DYNPTR) {
+ return reg->dynptr.type == dynptr_type;
+ } else {
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0)
+ return false;
+ return state->stack[spi].spilled_ptr.dynptr.type == dynptr_type;
+ }
+}
+
+static void __mark_reg_known_zero(struct bpf_reg_state *reg);
+
+static bool in_rcu_cs(struct bpf_verifier_env *env);
+
+static bool is_kfunc_rcu_protected(struct bpf_kfunc_call_arg_meta *meta);
+
+static int mark_stack_slots_iter(struct bpf_verifier_env *env,
+ struct bpf_kfunc_call_arg_meta *meta,
+ struct bpf_reg_state *reg, int insn_idx,
+ struct btf *btf, u32 btf_id, int nr_slots)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int spi, i, j, id;
+
+ spi = iter_get_spi(env, reg, nr_slots);
+ if (spi < 0)
+ return spi;
+
+ id = acquire_reference(env, insn_idx);
+ if (id < 0)
+ return id;
+
+ for (i = 0; i < nr_slots; i++) {
+ struct bpf_stack_state *slot = &state->stack[spi - i];
+ struct bpf_reg_state *st = &slot->spilled_ptr;
+
+ __mark_reg_known_zero(st);
+ st->type = PTR_TO_STACK; /* we don't have dedicated reg type */
+ if (is_kfunc_rcu_protected(meta)) {
+ if (in_rcu_cs(env))
+ st->type |= MEM_RCU;
+ else
+ st->type |= PTR_UNTRUSTED;
}
- verbose("(%02x) r%d = *(%s *)(r%d %+d)\n",
- insn->code, insn->dst_reg,
- bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
- insn->src_reg, insn->off);
- } else if (class == BPF_LD) {
- if (BPF_MODE(insn->code) == BPF_ABS) {
- verbose("(%02x) r0 = *(%s *)skb[%d]\n",
- insn->code,
- bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
- insn->imm);
- } else if (BPF_MODE(insn->code) == BPF_IND) {
- verbose("(%02x) r0 = *(%s *)skb[r%d + %d]\n",
- insn->code,
- bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
- insn->src_reg, insn->imm);
- } else if (BPF_MODE(insn->code) == BPF_IMM) {
- verbose("(%02x) r%d = 0x%x\n",
- insn->code, insn->dst_reg, insn->imm);
+ st->ref_obj_id = i == 0 ? id : 0;
+ st->iter.btf = btf;
+ st->iter.btf_id = btf_id;
+ st->iter.state = BPF_ITER_STATE_ACTIVE;
+ st->iter.depth = 0;
+
+ for (j = 0; j < BPF_REG_SIZE; j++)
+ slot->slot_type[j] = STACK_ITER;
+
+ bpf_mark_stack_write(env, state->frameno, BIT(spi - i));
+ mark_stack_slot_scratched(env, spi - i);
+ }
+
+ return 0;
+}
+
+static int unmark_stack_slots_iter(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, int nr_slots)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int spi, i, j;
+
+ spi = iter_get_spi(env, reg, nr_slots);
+ if (spi < 0)
+ return spi;
+
+ for (i = 0; i < nr_slots; i++) {
+ struct bpf_stack_state *slot = &state->stack[spi - i];
+ struct bpf_reg_state *st = &slot->spilled_ptr;
+
+ if (i == 0)
+ WARN_ON_ONCE(release_reference(env, st->ref_obj_id));
+
+ __mark_reg_not_init(env, st);
+
+ for (j = 0; j < BPF_REG_SIZE; j++)
+ slot->slot_type[j] = STACK_INVALID;
+
+ bpf_mark_stack_write(env, state->frameno, BIT(spi - i));
+ mark_stack_slot_scratched(env, spi - i);
+ }
+
+ return 0;
+}
+
+static bool is_iter_reg_valid_uninit(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, int nr_slots)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int spi, i, j;
+
+ /* For -ERANGE (i.e. spi not falling into allocated stack slots), we
+ * will do check_mem_access to check and update stack bounds later, so
+ * return true for that case.
+ */
+ spi = iter_get_spi(env, reg, nr_slots);
+ if (spi == -ERANGE)
+ return true;
+ if (spi < 0)
+ return false;
+
+ for (i = 0; i < nr_slots; i++) {
+ struct bpf_stack_state *slot = &state->stack[spi - i];
+
+ for (j = 0; j < BPF_REG_SIZE; j++)
+ if (slot->slot_type[j] == STACK_ITER)
+ return false;
+ }
+
+ return true;
+}
+
+static int is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ struct btf *btf, u32 btf_id, int nr_slots)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int spi, i, j;
+
+ spi = iter_get_spi(env, reg, nr_slots);
+ if (spi < 0)
+ return -EINVAL;
+
+ for (i = 0; i < nr_slots; i++) {
+ struct bpf_stack_state *slot = &state->stack[spi - i];
+ struct bpf_reg_state *st = &slot->spilled_ptr;
+
+ if (st->type & PTR_UNTRUSTED)
+ return -EPROTO;
+ /* only main (first) slot has ref_obj_id set */
+ if (i == 0 && !st->ref_obj_id)
+ return -EINVAL;
+ if (i != 0 && st->ref_obj_id)
+ return -EINVAL;
+ if (st->iter.btf != btf || st->iter.btf_id != btf_id)
+ return -EINVAL;
+
+ for (j = 0; j < BPF_REG_SIZE; j++)
+ if (slot->slot_type[j] != STACK_ITER)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int acquire_irq_state(struct bpf_verifier_env *env, int insn_idx);
+static int release_irq_state(struct bpf_verifier_state *state, int id);
+
+static int mark_stack_slot_irq_flag(struct bpf_verifier_env *env,
+ struct bpf_kfunc_call_arg_meta *meta,
+ struct bpf_reg_state *reg, int insn_idx,
+ int kfunc_class)
+{
+ struct bpf_func_state *state = func(env, reg);
+ struct bpf_stack_state *slot;
+ struct bpf_reg_state *st;
+ int spi, i, id;
+
+ spi = irq_flag_get_spi(env, reg);
+ if (spi < 0)
+ return spi;
+
+ id = acquire_irq_state(env, insn_idx);
+ if (id < 0)
+ return id;
+
+ slot = &state->stack[spi];
+ st = &slot->spilled_ptr;
+
+ bpf_mark_stack_write(env, reg->frameno, BIT(spi));
+ __mark_reg_known_zero(st);
+ st->type = PTR_TO_STACK; /* we don't have dedicated reg type */
+ st->ref_obj_id = id;
+ st->irq.kfunc_class = kfunc_class;
+
+ for (i = 0; i < BPF_REG_SIZE; i++)
+ slot->slot_type[i] = STACK_IRQ_FLAG;
+
+ mark_stack_slot_scratched(env, spi);
+ return 0;
+}
+
+static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ int kfunc_class)
+{
+ struct bpf_func_state *state = func(env, reg);
+ struct bpf_stack_state *slot;
+ struct bpf_reg_state *st;
+ int spi, i, err;
+
+ spi = irq_flag_get_spi(env, reg);
+ if (spi < 0)
+ return spi;
+
+ slot = &state->stack[spi];
+ st = &slot->spilled_ptr;
+
+ if (st->irq.kfunc_class != kfunc_class) {
+ const char *flag_kfunc = st->irq.kfunc_class == IRQ_NATIVE_KFUNC ? "native" : "lock";
+ const char *used_kfunc = kfunc_class == IRQ_NATIVE_KFUNC ? "native" : "lock";
+
+ verbose(env, "irq flag acquired by %s kfuncs cannot be restored with %s kfuncs\n",
+ flag_kfunc, used_kfunc);
+ return -EINVAL;
+ }
+
+ err = release_irq_state(env->cur_state, st->ref_obj_id);
+ WARN_ON_ONCE(err && err != -EACCES);
+ if (err) {
+ int insn_idx = 0;
+
+ for (int i = 0; i < env->cur_state->acquired_refs; i++) {
+ if (env->cur_state->refs[i].id == env->cur_state->active_irq_id) {
+ insn_idx = env->cur_state->refs[i].insn_idx;
+ break;
+ }
+ }
+
+ verbose(env, "cannot restore irq state out of order, expected id=%d acquired at insn_idx=%d\n",
+ env->cur_state->active_irq_id, insn_idx);
+ return err;
+ }
+
+ __mark_reg_not_init(env, st);
+
+ bpf_mark_stack_write(env, reg->frameno, BIT(spi));
+
+ for (i = 0; i < BPF_REG_SIZE; i++)
+ slot->slot_type[i] = STACK_INVALID;
+
+ mark_stack_slot_scratched(env, spi);
+ return 0;
+}
+
+static bool is_irq_flag_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ struct bpf_func_state *state = func(env, reg);
+ struct bpf_stack_state *slot;
+ int spi, i;
+
+ /* For -ERANGE (i.e. spi not falling into allocated stack slots), we
+ * will do check_mem_access to check and update stack bounds later, so
+ * return true for that case.
+ */
+ spi = irq_flag_get_spi(env, reg);
+ if (spi == -ERANGE)
+ return true;
+ if (spi < 0)
+ return false;
+
+ slot = &state->stack[spi];
+
+ for (i = 0; i < BPF_REG_SIZE; i++)
+ if (slot->slot_type[i] == STACK_IRQ_FLAG)
+ return false;
+ return true;
+}
+
+static int is_irq_flag_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ struct bpf_func_state *state = func(env, reg);
+ struct bpf_stack_state *slot;
+ struct bpf_reg_state *st;
+ int spi, i;
+
+ spi = irq_flag_get_spi(env, reg);
+ if (spi < 0)
+ return -EINVAL;
+
+ slot = &state->stack[spi];
+ st = &slot->spilled_ptr;
+
+ if (!st->ref_obj_id)
+ return -EINVAL;
+
+ for (i = 0; i < BPF_REG_SIZE; i++)
+ if (slot->slot_type[i] != STACK_IRQ_FLAG)
+ return -EINVAL;
+ return 0;
+}
+
+/* Check if given stack slot is "special":
+ * - spilled register state (STACK_SPILL);
+ * - dynptr state (STACK_DYNPTR);
+ * - iter state (STACK_ITER).
+ * - irq flag state (STACK_IRQ_FLAG)
+ */
+static bool is_stack_slot_special(const struct bpf_stack_state *stack)
+{
+ enum bpf_stack_slot_type type = stack->slot_type[BPF_REG_SIZE - 1];
+
+ switch (type) {
+ case STACK_SPILL:
+ case STACK_DYNPTR:
+ case STACK_ITER:
+ case STACK_IRQ_FLAG:
+ return true;
+ case STACK_INVALID:
+ case STACK_MISC:
+ case STACK_ZERO:
+ return false;
+ default:
+ WARN_ONCE(1, "unknown stack slot type %d\n", type);
+ return true;
+ }
+}
+
+/* The reg state of a pointer or a bounded scalar was saved when
+ * it was spilled to the stack.
+ */
+static bool is_spilled_reg(const struct bpf_stack_state *stack)
+{
+ return stack->slot_type[BPF_REG_SIZE - 1] == STACK_SPILL;
+}
+
+static bool is_spilled_scalar_reg(const struct bpf_stack_state *stack)
+{
+ return stack->slot_type[BPF_REG_SIZE - 1] == STACK_SPILL &&
+ stack->spilled_ptr.type == SCALAR_VALUE;
+}
+
+static bool is_spilled_scalar_reg64(const struct bpf_stack_state *stack)
+{
+ return stack->slot_type[0] == STACK_SPILL &&
+ stack->spilled_ptr.type == SCALAR_VALUE;
+}
+
+/* Mark stack slot as STACK_MISC, unless it is already STACK_INVALID, in which
+ * case they are equivalent, or it's STACK_ZERO, in which case we preserve
+ * more precise STACK_ZERO.
+ * Regardless of allow_ptr_leaks setting (i.e., privileged or unprivileged
+ * mode), we won't promote STACK_INVALID to STACK_MISC. In privileged case it is
+ * unnecessary as both are considered equivalent when loading data and pruning,
+ * in case of unprivileged mode it will be incorrect to allow reads of invalid
+ * slots.
+ */
+static void mark_stack_slot_misc(struct bpf_verifier_env *env, u8 *stype)
+{
+ if (*stype == STACK_ZERO)
+ return;
+ if (*stype == STACK_INVALID)
+ return;
+ *stype = STACK_MISC;
+}
+
+static void scrub_spilled_slot(u8 *stype)
+{
+ if (*stype != STACK_INVALID)
+ *stype = STACK_MISC;
+}
+
+/* copy array src of length n * size bytes to dst. dst is reallocated if it's too
+ * small to hold src. This is different from krealloc since we don't want to preserve
+ * the contents of dst.
+ *
+ * Leaves dst untouched if src is NULL or length is zero. Returns NULL if memory could
+ * not be allocated.
+ */
+static void *copy_array(void *dst, const void *src, size_t n, size_t size, gfp_t flags)
+{
+ size_t alloc_bytes;
+ void *orig = dst;
+ size_t bytes;
+
+ if (ZERO_OR_NULL_PTR(src))
+ goto out;
+
+ if (unlikely(check_mul_overflow(n, size, &bytes)))
+ return NULL;
+
+ alloc_bytes = max(ksize(orig), kmalloc_size_roundup(bytes));
+ dst = krealloc(orig, alloc_bytes, flags);
+ if (!dst) {
+ kfree(orig);
+ return NULL;
+ }
+
+ memcpy(dst, src, bytes);
+out:
+ return dst ? dst : ZERO_SIZE_PTR;
+}
+
+/* resize an array from old_n items to new_n items. the array is reallocated if it's too
+ * small to hold new_n items. new items are zeroed out if the array grows.
+ *
+ * Contrary to krealloc_array, does not free arr if new_n is zero.
+ */
+static void *realloc_array(void *arr, size_t old_n, size_t new_n, size_t size)
+{
+ size_t alloc_size;
+ void *new_arr;
+
+ if (!new_n || old_n == new_n)
+ goto out;
+
+ alloc_size = kmalloc_size_roundup(size_mul(new_n, size));
+ new_arr = krealloc(arr, alloc_size, GFP_KERNEL_ACCOUNT);
+ if (!new_arr) {
+ kfree(arr);
+ return NULL;
+ }
+ arr = new_arr;
+
+ if (new_n > old_n)
+ memset(arr + old_n * size, 0, (new_n - old_n) * size);
+
+out:
+ return arr ? arr : ZERO_SIZE_PTR;
+}
+
+static int copy_reference_state(struct bpf_verifier_state *dst, const struct bpf_verifier_state *src)
+{
+ dst->refs = copy_array(dst->refs, src->refs, src->acquired_refs,
+ sizeof(struct bpf_reference_state), GFP_KERNEL_ACCOUNT);
+ if (!dst->refs)
+ return -ENOMEM;
+
+ dst->acquired_refs = src->acquired_refs;
+ dst->active_locks = src->active_locks;
+ dst->active_preempt_locks = src->active_preempt_locks;
+ dst->active_rcu_locks = src->active_rcu_locks;
+ dst->active_irq_id = src->active_irq_id;
+ dst->active_lock_id = src->active_lock_id;
+ dst->active_lock_ptr = src->active_lock_ptr;
+ return 0;
+}
+
+static int copy_stack_state(struct bpf_func_state *dst, const struct bpf_func_state *src)
+{
+ size_t n = src->allocated_stack / BPF_REG_SIZE;
+
+ dst->stack = copy_array(dst->stack, src->stack, n, sizeof(struct bpf_stack_state),
+ GFP_KERNEL_ACCOUNT);
+ if (!dst->stack)
+ return -ENOMEM;
+
+ dst->allocated_stack = src->allocated_stack;
+ return 0;
+}
+
+static int resize_reference_state(struct bpf_verifier_state *state, size_t n)
+{
+ state->refs = realloc_array(state->refs, state->acquired_refs, n,
+ sizeof(struct bpf_reference_state));
+ if (!state->refs)
+ return -ENOMEM;
+
+ state->acquired_refs = n;
+ return 0;
+}
+
+/* Possibly update state->allocated_stack to be at least size bytes. Also
+ * possibly update the function's high-water mark in its bpf_subprog_info.
+ */
+static int grow_stack_state(struct bpf_verifier_env *env, struct bpf_func_state *state, int size)
+{
+ size_t old_n = state->allocated_stack / BPF_REG_SIZE, n;
+
+ /* The stack size is always a multiple of BPF_REG_SIZE. */
+ size = round_up(size, BPF_REG_SIZE);
+ n = size / BPF_REG_SIZE;
+
+ if (old_n >= n)
+ return 0;
+
+ state->stack = realloc_array(state->stack, old_n, n, sizeof(struct bpf_stack_state));
+ if (!state->stack)
+ return -ENOMEM;
+
+ state->allocated_stack = size;
+
+ /* update known max for given subprogram */
+ if (env->subprog_info[state->subprogno].stack_depth < size)
+ env->subprog_info[state->subprogno].stack_depth = size;
+
+ return 0;
+}
+
+/* Acquire a pointer id from the env and update the state->refs to include
+ * this new pointer reference.
+ * On success, returns a valid pointer id to associate with the register
+ * On failure, returns a negative errno.
+ */
+static struct bpf_reference_state *acquire_reference_state(struct bpf_verifier_env *env, int insn_idx)
+{
+ struct bpf_verifier_state *state = env->cur_state;
+ int new_ofs = state->acquired_refs;
+ int err;
+
+ err = resize_reference_state(state, state->acquired_refs + 1);
+ if (err)
+ return NULL;
+ state->refs[new_ofs].insn_idx = insn_idx;
+
+ return &state->refs[new_ofs];
+}
+
+static int acquire_reference(struct bpf_verifier_env *env, int insn_idx)
+{
+ struct bpf_reference_state *s;
+
+ s = acquire_reference_state(env, insn_idx);
+ if (!s)
+ return -ENOMEM;
+ s->type = REF_TYPE_PTR;
+ s->id = ++env->id_gen;
+ return s->id;
+}
+
+static int acquire_lock_state(struct bpf_verifier_env *env, int insn_idx, enum ref_state_type type,
+ int id, void *ptr)
+{
+ struct bpf_verifier_state *state = env->cur_state;
+ struct bpf_reference_state *s;
+
+ s = acquire_reference_state(env, insn_idx);
+ if (!s)
+ return -ENOMEM;
+ s->type = type;
+ s->id = id;
+ s->ptr = ptr;
+
+ state->active_locks++;
+ state->active_lock_id = id;
+ state->active_lock_ptr = ptr;
+ return 0;
+}
+
+static int acquire_irq_state(struct bpf_verifier_env *env, int insn_idx)
+{
+ struct bpf_verifier_state *state = env->cur_state;
+ struct bpf_reference_state *s;
+
+ s = acquire_reference_state(env, insn_idx);
+ if (!s)
+ return -ENOMEM;
+ s->type = REF_TYPE_IRQ;
+ s->id = ++env->id_gen;
+
+ state->active_irq_id = s->id;
+ return s->id;
+}
+
+static void release_reference_state(struct bpf_verifier_state *state, int idx)
+{
+ int last_idx;
+ size_t rem;
+
+ /* IRQ state requires the relative ordering of elements remaining the
+ * same, since it relies on the refs array to behave as a stack, so that
+ * it can detect out-of-order IRQ restore. Hence use memmove to shift
+ * the array instead of swapping the final element into the deleted idx.
+ */
+ last_idx = state->acquired_refs - 1;
+ rem = state->acquired_refs - idx - 1;
+ if (last_idx && idx != last_idx)
+ memmove(&state->refs[idx], &state->refs[idx + 1], sizeof(*state->refs) * rem);
+ memset(&state->refs[last_idx], 0, sizeof(*state->refs));
+ state->acquired_refs--;
+ return;
+}
+
+static bool find_reference_state(struct bpf_verifier_state *state, int ptr_id)
+{
+ int i;
+
+ for (i = 0; i < state->acquired_refs; i++)
+ if (state->refs[i].id == ptr_id)
+ return true;
+
+ return false;
+}
+
+static int release_lock_state(struct bpf_verifier_state *state, int type, int id, void *ptr)
+{
+ void *prev_ptr = NULL;
+ u32 prev_id = 0;
+ int i;
+
+ for (i = 0; i < state->acquired_refs; i++) {
+ if (state->refs[i].type == type && state->refs[i].id == id &&
+ state->refs[i].ptr == ptr) {
+ release_reference_state(state, i);
+ state->active_locks--;
+ /* Reassign active lock (id, ptr). */
+ state->active_lock_id = prev_id;
+ state->active_lock_ptr = prev_ptr;
+ return 0;
+ }
+ if (state->refs[i].type & REF_TYPE_LOCK_MASK) {
+ prev_id = state->refs[i].id;
+ prev_ptr = state->refs[i].ptr;
+ }
+ }
+ return -EINVAL;
+}
+
+static int release_irq_state(struct bpf_verifier_state *state, int id)
+{
+ u32 prev_id = 0;
+ int i;
+
+ if (id != state->active_irq_id)
+ return -EACCES;
+
+ for (i = 0; i < state->acquired_refs; i++) {
+ if (state->refs[i].type != REF_TYPE_IRQ)
+ continue;
+ if (state->refs[i].id == id) {
+ release_reference_state(state, i);
+ state->active_irq_id = prev_id;
+ return 0;
} else {
- verbose("BUG_ld_%02x\n", insn->code);
- return;
+ prev_id = state->refs[i].id;
}
- } else if (class == BPF_JMP) {
- u8 opcode = BPF_OP(insn->code);
+ }
+ return -EINVAL;
+}
- if (opcode == BPF_CALL) {
- verbose("(%02x) call %d\n", insn->code, insn->imm);
- } else if (insn->code == (BPF_JMP | BPF_JA)) {
- verbose("(%02x) goto pc%+d\n",
- insn->code, insn->off);
- } else if (insn->code == (BPF_JMP | BPF_EXIT)) {
- verbose("(%02x) exit\n", insn->code);
- } else if (BPF_SRC(insn->code) == BPF_X) {
- verbose("(%02x) if r%d %s r%d goto pc%+d\n",
- insn->code, insn->dst_reg,
- bpf_jmp_string[BPF_OP(insn->code) >> 4],
- insn->src_reg, insn->off);
+static struct bpf_reference_state *find_lock_state(struct bpf_verifier_state *state, enum ref_state_type type,
+ int id, void *ptr)
+{
+ int i;
+
+ for (i = 0; i < state->acquired_refs; i++) {
+ struct bpf_reference_state *s = &state->refs[i];
+
+ if (!(s->type & type))
+ continue;
+
+ if (s->id == id && s->ptr == ptr)
+ return s;
+ }
+ return NULL;
+}
+
+static void update_peak_states(struct bpf_verifier_env *env)
+{
+ u32 cur_states;
+
+ cur_states = env->explored_states_size + env->free_list_size + env->num_backedges;
+ env->peak_states = max(env->peak_states, cur_states);
+}
+
+static void free_func_state(struct bpf_func_state *state)
+{
+ if (!state)
+ return;
+ kfree(state->stack);
+ kfree(state);
+}
+
+static void clear_jmp_history(struct bpf_verifier_state *state)
+{
+ kfree(state->jmp_history);
+ state->jmp_history = NULL;
+ state->jmp_history_cnt = 0;
+}
+
+static void free_verifier_state(struct bpf_verifier_state *state,
+ bool free_self)
+{
+ int i;
+
+ for (i = 0; i <= state->curframe; i++) {
+ free_func_state(state->frame[i]);
+ state->frame[i] = NULL;
+ }
+ kfree(state->refs);
+ clear_jmp_history(state);
+ if (free_self)
+ kfree(state);
+}
+
+/* struct bpf_verifier_state->parent refers to states
+ * that are in either of env->{expored_states,free_list}.
+ * In both cases the state is contained in struct bpf_verifier_state_list.
+ */
+static struct bpf_verifier_state_list *state_parent_as_list(struct bpf_verifier_state *st)
+{
+ if (st->parent)
+ return container_of(st->parent, struct bpf_verifier_state_list, state);
+ return NULL;
+}
+
+static bool incomplete_read_marks(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *st);
+
+/* A state can be freed if it is no longer referenced:
+ * - is in the env->free_list;
+ * - has no children states;
+ */
+static void maybe_free_verifier_state(struct bpf_verifier_env *env,
+ struct bpf_verifier_state_list *sl)
+{
+ if (!sl->in_free_list
+ || sl->state.branches != 0
+ || incomplete_read_marks(env, &sl->state))
+ return;
+ list_del(&sl->node);
+ free_verifier_state(&sl->state, false);
+ kfree(sl);
+ env->free_list_size--;
+}
+
+/* copy verifier state from src to dst growing dst stack space
+ * when necessary to accommodate larger src stack
+ */
+static int copy_func_state(struct bpf_func_state *dst,
+ const struct bpf_func_state *src)
+{
+ memcpy(dst, src, offsetof(struct bpf_func_state, stack));
+ return copy_stack_state(dst, src);
+}
+
+static int copy_verifier_state(struct bpf_verifier_state *dst_state,
+ const struct bpf_verifier_state *src)
+{
+ struct bpf_func_state *dst;
+ int i, err;
+
+ dst_state->jmp_history = copy_array(dst_state->jmp_history, src->jmp_history,
+ src->jmp_history_cnt, sizeof(*dst_state->jmp_history),
+ GFP_KERNEL_ACCOUNT);
+ if (!dst_state->jmp_history)
+ return -ENOMEM;
+ dst_state->jmp_history_cnt = src->jmp_history_cnt;
+
+ /* if dst has more stack frames then src frame, free them, this is also
+ * necessary in case of exceptional exits using bpf_throw.
+ */
+ for (i = src->curframe + 1; i <= dst_state->curframe; i++) {
+ free_func_state(dst_state->frame[i]);
+ dst_state->frame[i] = NULL;
+ }
+ err = copy_reference_state(dst_state, src);
+ if (err)
+ return err;
+ dst_state->speculative = src->speculative;
+ dst_state->in_sleepable = src->in_sleepable;
+ dst_state->cleaned = src->cleaned;
+ dst_state->curframe = src->curframe;
+ dst_state->branches = src->branches;
+ dst_state->parent = src->parent;
+ dst_state->first_insn_idx = src->first_insn_idx;
+ dst_state->last_insn_idx = src->last_insn_idx;
+ dst_state->dfs_depth = src->dfs_depth;
+ dst_state->callback_unroll_depth = src->callback_unroll_depth;
+ dst_state->may_goto_depth = src->may_goto_depth;
+ dst_state->equal_state = src->equal_state;
+ for (i = 0; i <= src->curframe; i++) {
+ dst = dst_state->frame[i];
+ if (!dst) {
+ dst = kzalloc(sizeof(*dst), GFP_KERNEL_ACCOUNT);
+ if (!dst)
+ return -ENOMEM;
+ dst_state->frame[i] = dst;
+ }
+ err = copy_func_state(dst, src->frame[i]);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
+static u32 state_htab_size(struct bpf_verifier_env *env)
+{
+ return env->prog->len;
+}
+
+static struct list_head *explored_state(struct bpf_verifier_env *env, int idx)
+{
+ struct bpf_verifier_state *cur = env->cur_state;
+ struct bpf_func_state *state = cur->frame[cur->curframe];
+
+ return &env->explored_states[(idx ^ state->callsite) % state_htab_size(env)];
+}
+
+static bool same_callsites(struct bpf_verifier_state *a, struct bpf_verifier_state *b)
+{
+ int fr;
+
+ if (a->curframe != b->curframe)
+ return false;
+
+ for (fr = a->curframe; fr >= 0; fr--)
+ if (a->frame[fr]->callsite != b->frame[fr]->callsite)
+ return false;
+
+ return true;
+}
+
+/* Return IP for a given frame in a call stack */
+static u32 frame_insn_idx(struct bpf_verifier_state *st, u32 frame)
+{
+ return frame == st->curframe
+ ? st->insn_idx
+ : st->frame[frame + 1]->callsite;
+}
+
+/* For state @st look for a topmost frame with frame_insn_idx() in some SCC,
+ * if such frame exists form a corresponding @callchain as an array of
+ * call sites leading to this frame and SCC id.
+ * E.g.:
+ *
+ * void foo() { A: loop {... SCC#1 ...}; }
+ * void bar() { B: loop { C: foo(); ... SCC#2 ... }
+ * D: loop { E: foo(); ... SCC#3 ... } }
+ * void main() { F: bar(); }
+ *
+ * @callchain at (A) would be either (F,SCC#2) or (F,SCC#3) depending
+ * on @st frame call sites being (F,C,A) or (F,E,A).
+ */
+static bool compute_scc_callchain(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *st,
+ struct bpf_scc_callchain *callchain)
+{
+ u32 i, scc, insn_idx;
+
+ memset(callchain, 0, sizeof(*callchain));
+ for (i = 0; i <= st->curframe; i++) {
+ insn_idx = frame_insn_idx(st, i);
+ scc = env->insn_aux_data[insn_idx].scc;
+ if (scc) {
+ callchain->scc = scc;
+ break;
+ } else if (i < st->curframe) {
+ callchain->callsites[i] = insn_idx;
} else {
- verbose("(%02x) if r%d %s 0x%x goto pc%+d\n",
- insn->code, insn->dst_reg,
- bpf_jmp_string[BPF_OP(insn->code) >> 4],
- insn->imm, insn->off);
+ return false;
}
- } else {
- verbose("(%02x) %s\n", insn->code, bpf_class_string[class]);
}
+ return true;
}
-static int pop_stack(struct verifier_env *env, int *prev_insn_idx)
+/* Check if bpf_scc_visit instance for @callchain exists. */
+static struct bpf_scc_visit *scc_visit_lookup(struct bpf_verifier_env *env,
+ struct bpf_scc_callchain *callchain)
{
- struct verifier_stack_elem *elem;
- int insn_idx;
+ struct bpf_scc_info *info = env->scc_info[callchain->scc];
+ struct bpf_scc_visit *visits = info->visits;
+ u32 i;
+
+ if (!info)
+ return NULL;
+ for (i = 0; i < info->num_visits; i++)
+ if (memcmp(callchain, &visits[i].callchain, sizeof(*callchain)) == 0)
+ return &visits[i];
+ return NULL;
+}
+
+/* Allocate a new bpf_scc_visit instance corresponding to @callchain.
+ * Allocated instances are alive for a duration of the do_check_common()
+ * call and are freed by free_states().
+ */
+static struct bpf_scc_visit *scc_visit_alloc(struct bpf_verifier_env *env,
+ struct bpf_scc_callchain *callchain)
+{
+ struct bpf_scc_visit *visit;
+ struct bpf_scc_info *info;
+ u32 scc, num_visits;
+ u64 new_sz;
+
+ scc = callchain->scc;
+ info = env->scc_info[scc];
+ num_visits = info ? info->num_visits : 0;
+ new_sz = sizeof(*info) + sizeof(struct bpf_scc_visit) * (num_visits + 1);
+ info = kvrealloc(env->scc_info[scc], new_sz, GFP_KERNEL_ACCOUNT);
+ if (!info)
+ return NULL;
+ env->scc_info[scc] = info;
+ info->num_visits = num_visits + 1;
+ visit = &info->visits[num_visits];
+ memset(visit, 0, sizeof(*visit));
+ memcpy(&visit->callchain, callchain, sizeof(*callchain));
+ return visit;
+}
+
+/* Form a string '(callsite#1,callsite#2,...,scc)' in env->tmp_str_buf */
+static char *format_callchain(struct bpf_verifier_env *env, struct bpf_scc_callchain *callchain)
+{
+ char *buf = env->tmp_str_buf;
+ int i, delta = 0;
+
+ delta += snprintf(buf + delta, TMP_STR_BUF_LEN - delta, "(");
+ for (i = 0; i < ARRAY_SIZE(callchain->callsites); i++) {
+ if (!callchain->callsites[i])
+ break;
+ delta += snprintf(buf + delta, TMP_STR_BUF_LEN - delta, "%u,",
+ callchain->callsites[i]);
+ }
+ delta += snprintf(buf + delta, TMP_STR_BUF_LEN - delta, "%u)", callchain->scc);
+ return env->tmp_str_buf;
+}
+
+/* If callchain for @st exists (@st is in some SCC), ensure that
+ * bpf_scc_visit instance for this callchain exists.
+ * If instance does not exist or is empty, assign visit->entry_state to @st.
+ */
+static int maybe_enter_scc(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
+{
+ struct bpf_scc_callchain *callchain = &env->callchain_buf;
+ struct bpf_scc_visit *visit;
+
+ if (!compute_scc_callchain(env, st, callchain))
+ return 0;
+ visit = scc_visit_lookup(env, callchain);
+ visit = visit ?: scc_visit_alloc(env, callchain);
+ if (!visit)
+ return -ENOMEM;
+ if (!visit->entry_state) {
+ visit->entry_state = st;
+ if (env->log.level & BPF_LOG_LEVEL2)
+ verbose(env, "SCC enter %s\n", format_callchain(env, callchain));
+ }
+ return 0;
+}
+
+static int propagate_backedges(struct bpf_verifier_env *env, struct bpf_scc_visit *visit);
+
+/* If callchain for @st exists (@st is in some SCC), make it empty:
+ * - set visit->entry_state to NULL;
+ * - flush accumulated backedges.
+ */
+static int maybe_exit_scc(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
+{
+ struct bpf_scc_callchain *callchain = &env->callchain_buf;
+ struct bpf_scc_visit *visit;
+
+ if (!compute_scc_callchain(env, st, callchain))
+ return 0;
+ visit = scc_visit_lookup(env, callchain);
+ if (!visit) {
+ /*
+ * If path traversal stops inside an SCC, corresponding bpf_scc_visit
+ * must exist for non-speculative paths. For non-speculative paths
+ * traversal stops when:
+ * a. Verification error is found, maybe_exit_scc() is not called.
+ * b. Top level BPF_EXIT is reached. Top level BPF_EXIT is not a member
+ * of any SCC.
+ * c. A checkpoint is reached and matched. Checkpoints are created by
+ * is_state_visited(), which calls maybe_enter_scc(), which allocates
+ * bpf_scc_visit instances for checkpoints within SCCs.
+ * (c) is the only case that can reach this point.
+ */
+ if (!st->speculative) {
+ verifier_bug(env, "scc exit: no visit info for call chain %s",
+ format_callchain(env, callchain));
+ return -EFAULT;
+ }
+ return 0;
+ }
+ if (visit->entry_state != st)
+ return 0;
+ if (env->log.level & BPF_LOG_LEVEL2)
+ verbose(env, "SCC exit %s\n", format_callchain(env, callchain));
+ visit->entry_state = NULL;
+ env->num_backedges -= visit->num_backedges;
+ visit->num_backedges = 0;
+ update_peak_states(env);
+ return propagate_backedges(env, visit);
+}
+
+/* Lookup an bpf_scc_visit instance corresponding to @st callchain
+ * and add @backedge to visit->backedges. @st callchain must exist.
+ */
+static int add_scc_backedge(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *st,
+ struct bpf_scc_backedge *backedge)
+{
+ struct bpf_scc_callchain *callchain = &env->callchain_buf;
+ struct bpf_scc_visit *visit;
+
+ if (!compute_scc_callchain(env, st, callchain)) {
+ verifier_bug(env, "add backedge: no SCC in verification path, insn_idx %d",
+ st->insn_idx);
+ return -EFAULT;
+ }
+ visit = scc_visit_lookup(env, callchain);
+ if (!visit) {
+ verifier_bug(env, "add backedge: no visit info for call chain %s",
+ format_callchain(env, callchain));
+ return -EFAULT;
+ }
+ if (env->log.level & BPF_LOG_LEVEL2)
+ verbose(env, "SCC backedge %s\n", format_callchain(env, callchain));
+ backedge->next = visit->backedges;
+ visit->backedges = backedge;
+ visit->num_backedges++;
+ env->num_backedges++;
+ update_peak_states(env);
+ return 0;
+}
+
+/* bpf_reg_state->live marks for registers in a state @st are incomplete,
+ * if state @st is in some SCC and not all execution paths starting at this
+ * SCC are fully explored.
+ */
+static bool incomplete_read_marks(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *st)
+{
+ struct bpf_scc_callchain *callchain = &env->callchain_buf;
+ struct bpf_scc_visit *visit;
+
+ if (!compute_scc_callchain(env, st, callchain))
+ return false;
+ visit = scc_visit_lookup(env, callchain);
+ if (!visit)
+ return false;
+ return !!visit->backedges;
+}
+
+static void free_backedges(struct bpf_scc_visit *visit)
+{
+ struct bpf_scc_backedge *backedge, *next;
+
+ for (backedge = visit->backedges; backedge; backedge = next) {
+ free_verifier_state(&backedge->state, false);
+ next = backedge->next;
+ kfree(backedge);
+ }
+ visit->backedges = NULL;
+}
+
+static int update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
+{
+ struct bpf_verifier_state_list *sl = NULL, *parent_sl;
+ struct bpf_verifier_state *parent;
+ int err;
+
+ while (st) {
+ u32 br = --st->branches;
+
+ /* verifier_bug_if(br > 1, ...) technically makes sense here,
+ * but see comment in push_stack(), hence:
+ */
+ verifier_bug_if((int)br < 0, env, "%s:branches_to_explore=%d", __func__, br);
+ if (br)
+ break;
+ err = maybe_exit_scc(env, st);
+ if (err)
+ return err;
+ parent = st->parent;
+ parent_sl = state_parent_as_list(st);
+ if (sl)
+ maybe_free_verifier_state(env, sl);
+ st = parent;
+ sl = parent_sl;
+ }
+ return 0;
+}
+
+static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx,
+ int *insn_idx, bool pop_log)
+{
+ struct bpf_verifier_state *cur = env->cur_state;
+ struct bpf_verifier_stack_elem *elem, *head = env->head;
+ int err;
if (env->head == NULL)
- return -1;
+ return -ENOENT;
- memcpy(&env->cur_state, &env->head->st, sizeof(env->cur_state));
- insn_idx = env->head->insn_idx;
+ if (cur) {
+ err = copy_verifier_state(cur, &head->st);
+ if (err)
+ return err;
+ }
+ if (pop_log)
+ bpf_vlog_reset(&env->log, head->log_pos);
+ if (insn_idx)
+ *insn_idx = head->insn_idx;
if (prev_insn_idx)
- *prev_insn_idx = env->head->prev_insn_idx;
- elem = env->head->next;
- kfree(env->head);
+ *prev_insn_idx = head->prev_insn_idx;
+ elem = head->next;
+ free_verifier_state(&head->st, false);
+ kfree(head);
env->head = elem;
env->stack_size--;
- return insn_idx;
+ return 0;
}
-static struct verifier_state *push_stack(struct verifier_env *env, int insn_idx,
- int prev_insn_idx)
+static bool error_recoverable_with_nospec(int err)
{
- struct verifier_stack_elem *elem;
+ /* Should only return true for non-fatal errors that are allowed to
+ * occur during speculative verification. For these we can insert a
+ * nospec and the program might still be accepted. Do not include
+ * something like ENOMEM because it is likely to re-occur for the next
+ * architectural path once it has been recovered-from in all speculative
+ * paths.
+ */
+ return err == -EPERM || err == -EACCES || err == -EINVAL;
+}
- elem = kmalloc(sizeof(struct verifier_stack_elem), GFP_KERNEL);
+static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
+ int insn_idx, int prev_insn_idx,
+ bool speculative)
+{
+ struct bpf_verifier_state *cur = env->cur_state;
+ struct bpf_verifier_stack_elem *elem;
+ int err;
+
+ elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL_ACCOUNT);
if (!elem)
- goto err;
+ return ERR_PTR(-ENOMEM);
- memcpy(&elem->st, &env->cur_state, sizeof(env->cur_state));
elem->insn_idx = insn_idx;
elem->prev_insn_idx = prev_insn_idx;
elem->next = env->head;
+ elem->log_pos = env->log.end_pos;
env->head = elem;
env->stack_size++;
- if (env->stack_size > 1024) {
- verbose("BPF program is too complex\n");
- goto err;
+ err = copy_verifier_state(&elem->st, cur);
+ if (err)
+ return ERR_PTR(-ENOMEM);
+ elem->st.speculative |= speculative;
+ if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) {
+ verbose(env, "The sequence of %d jumps is too complex.\n",
+ env->stack_size);
+ return ERR_PTR(-E2BIG);
+ }
+ if (elem->st.parent) {
+ ++elem->st.parent->branches;
+ /* WARN_ON(branches > 2) technically makes sense here,
+ * but
+ * 1. speculative states will bump 'branches' for non-branch
+ * instructions
+ * 2. is_state_visited() heuristics may decide not to create
+ * a new state for a sequence of branches and all such current
+ * and cloned states will be pointing to a single parent state
+ * which might have large 'branches' count.
+ */
}
return &elem->st;
-err:
- /* pop all elements and return */
- while (pop_stack(env, NULL) >= 0);
- return NULL;
}
#define CALLER_SAVED_REGS 6
@@ -459,721 +2157,15041 @@ static const int caller_saved[CALLER_SAVED_REGS] = {
BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5
};
-static void init_reg_state(struct reg_state *regs)
+/* This helper doesn't clear reg->id */
+static void ___mark_reg_known(struct bpf_reg_state *reg, u64 imm)
+{
+ reg->var_off = tnum_const(imm);
+ reg->smin_value = (s64)imm;
+ reg->smax_value = (s64)imm;
+ reg->umin_value = imm;
+ reg->umax_value = imm;
+
+ reg->s32_min_value = (s32)imm;
+ reg->s32_max_value = (s32)imm;
+ reg->u32_min_value = (u32)imm;
+ reg->u32_max_value = (u32)imm;
+}
+
+/* Mark the unknown part of a register (variable offset or scalar value) as
+ * known to have the value @imm.
+ */
+static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm)
+{
+ /* Clear off and union(map_ptr, range) */
+ memset(((u8 *)reg) + sizeof(reg->type), 0,
+ offsetof(struct bpf_reg_state, var_off) - sizeof(reg->type));
+ reg->id = 0;
+ reg->ref_obj_id = 0;
+ ___mark_reg_known(reg, imm);
+}
+
+static void __mark_reg32_known(struct bpf_reg_state *reg, u64 imm)
+{
+ reg->var_off = tnum_const_subreg(reg->var_off, imm);
+ reg->s32_min_value = (s32)imm;
+ reg->s32_max_value = (s32)imm;
+ reg->u32_min_value = (u32)imm;
+ reg->u32_max_value = (u32)imm;
+}
+
+/* Mark the 'variable offset' part of a register as zero. This should be
+ * used only on registers holding a pointer type.
+ */
+static void __mark_reg_known_zero(struct bpf_reg_state *reg)
+{
+ __mark_reg_known(reg, 0);
+}
+
+static void __mark_reg_const_zero(const struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ __mark_reg_known(reg, 0);
+ reg->type = SCALAR_VALUE;
+ /* all scalars are assumed imprecise initially (unless unprivileged,
+ * in which case everything is forced to be precise)
+ */
+ reg->precise = !env->bpf_capable;
+}
+
+static void mark_reg_known_zero(struct bpf_verifier_env *env,
+ struct bpf_reg_state *regs, u32 regno)
+{
+ if (WARN_ON(regno >= MAX_BPF_REG)) {
+ verbose(env, "mark_reg_known_zero(regs, %u)\n", regno);
+ /* Something bad happened, let's kill all regs */
+ for (regno = 0; regno < MAX_BPF_REG; regno++)
+ __mark_reg_not_init(env, regs + regno);
+ return;
+ }
+ __mark_reg_known_zero(regs + regno);
+}
+
+static void __mark_dynptr_reg(struct bpf_reg_state *reg, enum bpf_dynptr_type type,
+ bool first_slot, int dynptr_id)
+{
+ /* reg->type has no meaning for STACK_DYNPTR, but when we set reg for
+ * callback arguments, it does need to be CONST_PTR_TO_DYNPTR, so simply
+ * set it unconditionally as it is ignored for STACK_DYNPTR anyway.
+ */
+ __mark_reg_known_zero(reg);
+ reg->type = CONST_PTR_TO_DYNPTR;
+ /* Give each dynptr a unique id to uniquely associate slices to it. */
+ reg->id = dynptr_id;
+ reg->dynptr.type = type;
+ reg->dynptr.first_slot = first_slot;
+}
+
+static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)
+{
+ if (base_type(reg->type) == PTR_TO_MAP_VALUE) {
+ const struct bpf_map *map = reg->map_ptr;
+
+ if (map->inner_map_meta) {
+ reg->type = CONST_PTR_TO_MAP;
+ reg->map_ptr = map->inner_map_meta;
+ /* transfer reg's id which is unique for every map_lookup_elem
+ * as UID of the inner map.
+ */
+ if (btf_record_has_field(map->inner_map_meta->record,
+ BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) {
+ reg->map_uid = reg->id;
+ }
+ } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) {
+ reg->type = PTR_TO_XDP_SOCK;
+ } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP ||
+ map->map_type == BPF_MAP_TYPE_SOCKHASH) {
+ reg->type = PTR_TO_SOCKET;
+ } else {
+ reg->type = PTR_TO_MAP_VALUE;
+ }
+ return;
+ }
+
+ reg->type &= ~PTR_MAYBE_NULL;
+}
+
+static void mark_reg_graph_node(struct bpf_reg_state *regs, u32 regno,
+ struct btf_field_graph_root *ds_head)
+{
+ __mark_reg_known_zero(&regs[regno]);
+ regs[regno].type = PTR_TO_BTF_ID | MEM_ALLOC;
+ regs[regno].btf = ds_head->btf;
+ regs[regno].btf_id = ds_head->value_btf_id;
+ regs[regno].off = ds_head->node_offset;
+}
+
+static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg)
+{
+ return type_is_pkt_pointer(reg->type);
+}
+
+static bool reg_is_pkt_pointer_any(const struct bpf_reg_state *reg)
+{
+ return reg_is_pkt_pointer(reg) ||
+ reg->type == PTR_TO_PACKET_END;
+}
+
+static bool reg_is_dynptr_slice_pkt(const struct bpf_reg_state *reg)
+{
+ return base_type(reg->type) == PTR_TO_MEM &&
+ (reg->type &
+ (DYNPTR_TYPE_SKB | DYNPTR_TYPE_XDP | DYNPTR_TYPE_SKB_META));
+}
+
+/* Unmodified PTR_TO_PACKET[_META,_END] register from ctx access. */
+static bool reg_is_init_pkt_pointer(const struct bpf_reg_state *reg,
+ enum bpf_reg_type which)
+{
+ /* The register can already have a range from prior markings.
+ * This is fine as long as it hasn't been advanced from its
+ * origin.
+ */
+ return reg->type == which &&
+ reg->id == 0 &&
+ reg->off == 0 &&
+ tnum_equals_const(reg->var_off, 0);
+}
+
+/* Reset the min/max bounds of a register */
+static void __mark_reg_unbounded(struct bpf_reg_state *reg)
+{
+ reg->smin_value = S64_MIN;
+ reg->smax_value = S64_MAX;
+ reg->umin_value = 0;
+ reg->umax_value = U64_MAX;
+
+ reg->s32_min_value = S32_MIN;
+ reg->s32_max_value = S32_MAX;
+ reg->u32_min_value = 0;
+ reg->u32_max_value = U32_MAX;
+}
+
+static void __mark_reg64_unbounded(struct bpf_reg_state *reg)
+{
+ reg->smin_value = S64_MIN;
+ reg->smax_value = S64_MAX;
+ reg->umin_value = 0;
+ reg->umax_value = U64_MAX;
+}
+
+static void __mark_reg32_unbounded(struct bpf_reg_state *reg)
+{
+ reg->s32_min_value = S32_MIN;
+ reg->s32_max_value = S32_MAX;
+ reg->u32_min_value = 0;
+ reg->u32_max_value = U32_MAX;
+}
+
+static void __update_reg32_bounds(struct bpf_reg_state *reg)
+{
+ struct tnum var32_off = tnum_subreg(reg->var_off);
+
+ /* min signed is max(sign bit) | min(other bits) */
+ reg->s32_min_value = max_t(s32, reg->s32_min_value,
+ var32_off.value | (var32_off.mask & S32_MIN));
+ /* max signed is min(sign bit) | max(other bits) */
+ reg->s32_max_value = min_t(s32, reg->s32_max_value,
+ var32_off.value | (var32_off.mask & S32_MAX));
+ reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)var32_off.value);
+ reg->u32_max_value = min(reg->u32_max_value,
+ (u32)(var32_off.value | var32_off.mask));
+}
+
+static void __update_reg64_bounds(struct bpf_reg_state *reg)
+{
+ /* min signed is max(sign bit) | min(other bits) */
+ reg->smin_value = max_t(s64, reg->smin_value,
+ reg->var_off.value | (reg->var_off.mask & S64_MIN));
+ /* max signed is min(sign bit) | max(other bits) */
+ reg->smax_value = min_t(s64, reg->smax_value,
+ reg->var_off.value | (reg->var_off.mask & S64_MAX));
+ reg->umin_value = max(reg->umin_value, reg->var_off.value);
+ reg->umax_value = min(reg->umax_value,
+ reg->var_off.value | reg->var_off.mask);
+}
+
+static void __update_reg_bounds(struct bpf_reg_state *reg)
+{
+ __update_reg32_bounds(reg);
+ __update_reg64_bounds(reg);
+}
+
+/* Uses signed min/max values to inform unsigned, and vice-versa */
+static void __reg32_deduce_bounds(struct bpf_reg_state *reg)
+{
+ /* If upper 32 bits of u64/s64 range don't change, we can use lower 32
+ * bits to improve our u32/s32 boundaries.
+ *
+ * E.g., the case where we have upper 32 bits as zero ([10, 20] in
+ * u64) is pretty trivial, it's obvious that in u32 we'll also have
+ * [10, 20] range. But this property holds for any 64-bit range as
+ * long as upper 32 bits in that entire range of values stay the same.
+ *
+ * E.g., u64 range [0x10000000A, 0x10000000F] ([4294967306, 4294967311]
+ * in decimal) has the same upper 32 bits throughout all the values in
+ * that range. As such, lower 32 bits form a valid [0xA, 0xF] ([10, 15])
+ * range.
+ *
+ * Note also, that [0xA, 0xF] is a valid range both in u32 and in s32,
+ * following the rules outlined below about u64/s64 correspondence
+ * (which equally applies to u32 vs s32 correspondence). In general it
+ * depends on actual hexadecimal values of 32-bit range. They can form
+ * only valid u32, or only valid s32 ranges in some cases.
+ *
+ * So we use all these insights to derive bounds for subregisters here.
+ */
+ if ((reg->umin_value >> 32) == (reg->umax_value >> 32)) {
+ /* u64 to u32 casting preserves validity of low 32 bits as
+ * a range, if upper 32 bits are the same
+ */
+ reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)reg->umin_value);
+ reg->u32_max_value = min_t(u32, reg->u32_max_value, (u32)reg->umax_value);
+
+ if ((s32)reg->umin_value <= (s32)reg->umax_value) {
+ reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->umin_value);
+ reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->umax_value);
+ }
+ }
+ if ((reg->smin_value >> 32) == (reg->smax_value >> 32)) {
+ /* low 32 bits should form a proper u32 range */
+ if ((u32)reg->smin_value <= (u32)reg->smax_value) {
+ reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)reg->smin_value);
+ reg->u32_max_value = min_t(u32, reg->u32_max_value, (u32)reg->smax_value);
+ }
+ /* low 32 bits should form a proper s32 range */
+ if ((s32)reg->smin_value <= (s32)reg->smax_value) {
+ reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->smin_value);
+ reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->smax_value);
+ }
+ }
+ /* Special case where upper bits form a small sequence of two
+ * sequential numbers (in 32-bit unsigned space, so 0xffffffff to
+ * 0x00000000 is also valid), while lower bits form a proper s32 range
+ * going from negative numbers to positive numbers. E.g., let's say we
+ * have s64 range [-1, 1] ([0xffffffffffffffff, 0x0000000000000001]).
+ * Possible s64 values are {-1, 0, 1} ({0xffffffffffffffff,
+ * 0x0000000000000000, 0x00000000000001}). Ignoring upper 32 bits,
+ * we still get a valid s32 range [-1, 1] ([0xffffffff, 0x00000001]).
+ * Note that it doesn't have to be 0xffffffff going to 0x00000000 in
+ * upper 32 bits. As a random example, s64 range
+ * [0xfffffff0fffffff0; 0xfffffff100000010], forms a valid s32 range
+ * [-16, 16] ([0xfffffff0; 0x00000010]) in its 32 bit subregister.
+ */
+ if ((u32)(reg->umin_value >> 32) + 1 == (u32)(reg->umax_value >> 32) &&
+ (s32)reg->umin_value < 0 && (s32)reg->umax_value >= 0) {
+ reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->umin_value);
+ reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->umax_value);
+ }
+ if ((u32)(reg->smin_value >> 32) + 1 == (u32)(reg->smax_value >> 32) &&
+ (s32)reg->smin_value < 0 && (s32)reg->smax_value >= 0) {
+ reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->smin_value);
+ reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->smax_value);
+ }
+ /* if u32 range forms a valid s32 range (due to matching sign bit),
+ * try to learn from that
+ */
+ if ((s32)reg->u32_min_value <= (s32)reg->u32_max_value) {
+ reg->s32_min_value = max_t(s32, reg->s32_min_value, reg->u32_min_value);
+ reg->s32_max_value = min_t(s32, reg->s32_max_value, reg->u32_max_value);
+ }
+ /* If we cannot cross the sign boundary, then signed and unsigned bounds
+ * are the same, so combine. This works even in the negative case, e.g.
+ * -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff.
+ */
+ if ((u32)reg->s32_min_value <= (u32)reg->s32_max_value) {
+ reg->u32_min_value = max_t(u32, reg->s32_min_value, reg->u32_min_value);
+ reg->u32_max_value = min_t(u32, reg->s32_max_value, reg->u32_max_value);
+ }
+}
+
+static void __reg64_deduce_bounds(struct bpf_reg_state *reg)
+{
+ /* If u64 range forms a valid s64 range (due to matching sign bit),
+ * try to learn from that. Let's do a bit of ASCII art to see when
+ * this is happening. Let's take u64 range first:
+ *
+ * 0 0x7fffffffffffffff 0x8000000000000000 U64_MAX
+ * |-------------------------------|--------------------------------|
+ *
+ * Valid u64 range is formed when umin and umax are anywhere in the
+ * range [0, U64_MAX], and umin <= umax. u64 case is simple and
+ * straightforward. Let's see how s64 range maps onto the same range
+ * of values, annotated below the line for comparison:
+ *
+ * 0 0x7fffffffffffffff 0x8000000000000000 U64_MAX
+ * |-------------------------------|--------------------------------|
+ * 0 S64_MAX S64_MIN -1
+ *
+ * So s64 values basically start in the middle and they are logically
+ * contiguous to the right of it, wrapping around from -1 to 0, and
+ * then finishing as S64_MAX (0x7fffffffffffffff) right before
+ * S64_MIN. We can try drawing the continuity of u64 vs s64 values
+ * more visually as mapped to sign-agnostic range of hex values.
+ *
+ * u64 start u64 end
+ * _______________________________________________________________
+ * / \
+ * 0 0x7fffffffffffffff 0x8000000000000000 U64_MAX
+ * |-------------------------------|--------------------------------|
+ * 0 S64_MAX S64_MIN -1
+ * / \
+ * >------------------------------ ------------------------------->
+ * s64 continues... s64 end s64 start s64 "midpoint"
+ *
+ * What this means is that, in general, we can't always derive
+ * something new about u64 from any random s64 range, and vice versa.
+ *
+ * But we can do that in two particular cases. One is when entire
+ * u64/s64 range is *entirely* contained within left half of the above
+ * diagram or when it is *entirely* contained in the right half. I.e.:
+ *
+ * |-------------------------------|--------------------------------|
+ * ^ ^ ^ ^
+ * A B C D
+ *
+ * [A, B] and [C, D] are contained entirely in their respective halves
+ * and form valid contiguous ranges as both u64 and s64 values. [A, B]
+ * will be non-negative both as u64 and s64 (and in fact it will be
+ * identical ranges no matter the signedness). [C, D] treated as s64
+ * will be a range of negative values, while in u64 it will be
+ * non-negative range of values larger than 0x8000000000000000.
+ *
+ * Now, any other range here can't be represented in both u64 and s64
+ * simultaneously. E.g., [A, C], [A, D], [B, C], [B, D] are valid
+ * contiguous u64 ranges, but they are discontinuous in s64. [B, C]
+ * in s64 would be properly presented as [S64_MIN, C] and [B, S64_MAX],
+ * for example. Similarly, valid s64 range [D, A] (going from negative
+ * to positive values), would be two separate [D, U64_MAX] and [0, A]
+ * ranges as u64. Currently reg_state can't represent two segments per
+ * numeric domain, so in such situations we can only derive maximal
+ * possible range ([0, U64_MAX] for u64, and [S64_MIN, S64_MAX] for s64).
+ *
+ * So we use these facts to derive umin/umax from smin/smax and vice
+ * versa only if they stay within the same "half". This is equivalent
+ * to checking sign bit: lower half will have sign bit as zero, upper
+ * half have sign bit 1. Below in code we simplify this by just
+ * casting umin/umax as smin/smax and checking if they form valid
+ * range, and vice versa. Those are equivalent checks.
+ */
+ if ((s64)reg->umin_value <= (s64)reg->umax_value) {
+ reg->smin_value = max_t(s64, reg->smin_value, reg->umin_value);
+ reg->smax_value = min_t(s64, reg->smax_value, reg->umax_value);
+ }
+ /* If we cannot cross the sign boundary, then signed and unsigned bounds
+ * are the same, so combine. This works even in the negative case, e.g.
+ * -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff.
+ */
+ if ((u64)reg->smin_value <= (u64)reg->smax_value) {
+ reg->umin_value = max_t(u64, reg->smin_value, reg->umin_value);
+ reg->umax_value = min_t(u64, reg->smax_value, reg->umax_value);
+ } else {
+ /* If the s64 range crosses the sign boundary, then it's split
+ * between the beginning and end of the U64 domain. In that
+ * case, we can derive new bounds if the u64 range overlaps
+ * with only one end of the s64 range.
+ *
+ * In the following example, the u64 range overlaps only with
+ * positive portion of the s64 range.
+ *
+ * 0 U64_MAX
+ * | [xxxxxxxxxxxxxx u64 range xxxxxxxxxxxxxx] |
+ * |----------------------------|----------------------------|
+ * |xxxxx s64 range xxxxxxxxx] [xxxxxxx|
+ * 0 S64_MAX S64_MIN -1
+ *
+ * We can thus derive the following new s64 and u64 ranges.
+ *
+ * 0 U64_MAX
+ * | [xxxxxx u64 range xxxxx] |
+ * |----------------------------|----------------------------|
+ * | [xxxxxx s64 range xxxxx] |
+ * 0 S64_MAX S64_MIN -1
+ *
+ * If they overlap in two places, we can't derive anything
+ * because reg_state can't represent two ranges per numeric
+ * domain.
+ *
+ * 0 U64_MAX
+ * | [xxxxxxxxxxxxxxxxx u64 range xxxxxxxxxxxxxxxxx] |
+ * |----------------------------|----------------------------|
+ * |xxxxx s64 range xxxxxxxxx] [xxxxxxxxxx|
+ * 0 S64_MAX S64_MIN -1
+ *
+ * The first condition below corresponds to the first diagram
+ * above.
+ */
+ if (reg->umax_value < (u64)reg->smin_value) {
+ reg->smin_value = (s64)reg->umin_value;
+ reg->umax_value = min_t(u64, reg->umax_value, reg->smax_value);
+ } else if ((u64)reg->smax_value < reg->umin_value) {
+ /* This second condition considers the case where the u64 range
+ * overlaps with the negative portion of the s64 range:
+ *
+ * 0 U64_MAX
+ * | [xxxxxxxxxxxxxx u64 range xxxxxxxxxxxxxx] |
+ * |----------------------------|----------------------------|
+ * |xxxxxxxxx] [xxxxxxxxxxxx s64 range |
+ * 0 S64_MAX S64_MIN -1
+ */
+ reg->smax_value = (s64)reg->umax_value;
+ reg->umin_value = max_t(u64, reg->umin_value, reg->smin_value);
+ }
+ }
+}
+
+static void __reg_deduce_mixed_bounds(struct bpf_reg_state *reg)
+{
+ /* Try to tighten 64-bit bounds from 32-bit knowledge, using 32-bit
+ * values on both sides of 64-bit range in hope to have tighter range.
+ * E.g., if r1 is [0x1'00000000, 0x3'80000000], and we learn from
+ * 32-bit signed > 0 operation that s32 bounds are now [1; 0x7fffffff].
+ * With this, we can substitute 1 as low 32-bits of _low_ 64-bit bound
+ * (0x100000000 -> 0x100000001) and 0x7fffffff as low 32-bits of
+ * _high_ 64-bit bound (0x380000000 -> 0x37fffffff) and arrive at a
+ * better overall bounds for r1 as [0x1'000000001; 0x3'7fffffff].
+ * We just need to make sure that derived bounds we are intersecting
+ * with are well-formed ranges in respective s64 or u64 domain, just
+ * like we do with similar kinds of 32-to-64 or 64-to-32 adjustments.
+ */
+ __u64 new_umin, new_umax;
+ __s64 new_smin, new_smax;
+
+ /* u32 -> u64 tightening, it's always well-formed */
+ new_umin = (reg->umin_value & ~0xffffffffULL) | reg->u32_min_value;
+ new_umax = (reg->umax_value & ~0xffffffffULL) | reg->u32_max_value;
+ reg->umin_value = max_t(u64, reg->umin_value, new_umin);
+ reg->umax_value = min_t(u64, reg->umax_value, new_umax);
+ /* u32 -> s64 tightening, u32 range embedded into s64 preserves range validity */
+ new_smin = (reg->smin_value & ~0xffffffffULL) | reg->u32_min_value;
+ new_smax = (reg->smax_value & ~0xffffffffULL) | reg->u32_max_value;
+ reg->smin_value = max_t(s64, reg->smin_value, new_smin);
+ reg->smax_value = min_t(s64, reg->smax_value, new_smax);
+
+ /* Here we would like to handle a special case after sign extending load,
+ * when upper bits for a 64-bit range are all 1s or all 0s.
+ *
+ * Upper bits are all 1s when register is in a range:
+ * [0xffff_ffff_0000_0000, 0xffff_ffff_ffff_ffff]
+ * Upper bits are all 0s when register is in a range:
+ * [0x0000_0000_0000_0000, 0x0000_0000_ffff_ffff]
+ * Together this forms are continuous range:
+ * [0xffff_ffff_0000_0000, 0x0000_0000_ffff_ffff]
+ *
+ * Now, suppose that register range is in fact tighter:
+ * [0xffff_ffff_8000_0000, 0x0000_0000_ffff_ffff] (R)
+ * Also suppose that it's 32-bit range is positive,
+ * meaning that lower 32-bits of the full 64-bit register
+ * are in the range:
+ * [0x0000_0000, 0x7fff_ffff] (W)
+ *
+ * If this happens, then any value in a range:
+ * [0xffff_ffff_0000_0000, 0xffff_ffff_7fff_ffff]
+ * is smaller than a lowest bound of the range (R):
+ * 0xffff_ffff_8000_0000
+ * which means that upper bits of the full 64-bit register
+ * can't be all 1s, when lower bits are in range (W).
+ *
+ * Note that:
+ * - 0xffff_ffff_8000_0000 == (s64)S32_MIN
+ * - 0x0000_0000_7fff_ffff == (s64)S32_MAX
+ * These relations are used in the conditions below.
+ */
+ if (reg->s32_min_value >= 0 && reg->smin_value >= S32_MIN && reg->smax_value <= S32_MAX) {
+ reg->smin_value = reg->s32_min_value;
+ reg->smax_value = reg->s32_max_value;
+ reg->umin_value = reg->s32_min_value;
+ reg->umax_value = reg->s32_max_value;
+ reg->var_off = tnum_intersect(reg->var_off,
+ tnum_range(reg->smin_value, reg->smax_value));
+ }
+}
+
+static void __reg_deduce_bounds(struct bpf_reg_state *reg)
+{
+ __reg32_deduce_bounds(reg);
+ __reg64_deduce_bounds(reg);
+ __reg_deduce_mixed_bounds(reg);
+}
+
+/* Attempts to improve var_off based on unsigned min/max information */
+static void __reg_bound_offset(struct bpf_reg_state *reg)
+{
+ struct tnum var64_off = tnum_intersect(reg->var_off,
+ tnum_range(reg->umin_value,
+ reg->umax_value));
+ struct tnum var32_off = tnum_intersect(tnum_subreg(var64_off),
+ tnum_range(reg->u32_min_value,
+ reg->u32_max_value));
+
+ reg->var_off = tnum_or(tnum_clear_subreg(var64_off), var32_off);
+}
+
+static void reg_bounds_sync(struct bpf_reg_state *reg)
+{
+ /* We might have learned new bounds from the var_off. */
+ __update_reg_bounds(reg);
+ /* We might have learned something about the sign bit. */
+ __reg_deduce_bounds(reg);
+ __reg_deduce_bounds(reg);
+ __reg_deduce_bounds(reg);
+ /* We might have learned some bits from the bounds. */
+ __reg_bound_offset(reg);
+ /* Intersecting with the old var_off might have improved our bounds
+ * slightly, e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc),
+ * then new var_off is (0; 0x7f...fc) which improves our umax.
+ */
+ __update_reg_bounds(reg);
+}
+
+static int reg_bounds_sanity_check(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, const char *ctx)
+{
+ const char *msg;
+
+ if (reg->umin_value > reg->umax_value ||
+ reg->smin_value > reg->smax_value ||
+ reg->u32_min_value > reg->u32_max_value ||
+ reg->s32_min_value > reg->s32_max_value) {
+ msg = "range bounds violation";
+ goto out;
+ }
+
+ if (tnum_is_const(reg->var_off)) {
+ u64 uval = reg->var_off.value;
+ s64 sval = (s64)uval;
+
+ if (reg->umin_value != uval || reg->umax_value != uval ||
+ reg->smin_value != sval || reg->smax_value != sval) {
+ msg = "const tnum out of sync with range bounds";
+ goto out;
+ }
+ }
+
+ if (tnum_subreg_is_const(reg->var_off)) {
+ u32 uval32 = tnum_subreg(reg->var_off).value;
+ s32 sval32 = (s32)uval32;
+
+ if (reg->u32_min_value != uval32 || reg->u32_max_value != uval32 ||
+ reg->s32_min_value != sval32 || reg->s32_max_value != sval32) {
+ msg = "const subreg tnum out of sync with range bounds";
+ goto out;
+ }
+ }
+
+ return 0;
+out:
+ verifier_bug(env, "REG INVARIANTS VIOLATION (%s): %s u64=[%#llx, %#llx] "
+ "s64=[%#llx, %#llx] u32=[%#x, %#x] s32=[%#x, %#x] var_off=(%#llx, %#llx)",
+ ctx, msg, reg->umin_value, reg->umax_value,
+ reg->smin_value, reg->smax_value,
+ reg->u32_min_value, reg->u32_max_value,
+ reg->s32_min_value, reg->s32_max_value,
+ reg->var_off.value, reg->var_off.mask);
+ if (env->test_reg_invariants)
+ return -EFAULT;
+ __mark_reg_unbounded(reg);
+ return 0;
+}
+
+static bool __reg32_bound_s64(s32 a)
+{
+ return a >= 0 && a <= S32_MAX;
+}
+
+static void __reg_assign_32_into_64(struct bpf_reg_state *reg)
+{
+ reg->umin_value = reg->u32_min_value;
+ reg->umax_value = reg->u32_max_value;
+
+ /* Attempt to pull 32-bit signed bounds into 64-bit bounds but must
+ * be positive otherwise set to worse case bounds and refine later
+ * from tnum.
+ */
+ if (__reg32_bound_s64(reg->s32_min_value) &&
+ __reg32_bound_s64(reg->s32_max_value)) {
+ reg->smin_value = reg->s32_min_value;
+ reg->smax_value = reg->s32_max_value;
+ } else {
+ reg->smin_value = 0;
+ reg->smax_value = U32_MAX;
+ }
+}
+
+/* Mark a register as having a completely unknown (scalar) value. */
+static void __mark_reg_unknown_imprecise(struct bpf_reg_state *reg)
+{
+ /*
+ * Clear type, off, and union(map_ptr, range) and
+ * padding between 'type' and union
+ */
+ memset(reg, 0, offsetof(struct bpf_reg_state, var_off));
+ reg->type = SCALAR_VALUE;
+ reg->id = 0;
+ reg->ref_obj_id = 0;
+ reg->var_off = tnum_unknown;
+ reg->frameno = 0;
+ reg->precise = false;
+ __mark_reg_unbounded(reg);
+}
+
+/* Mark a register as having a completely unknown (scalar) value,
+ * initialize .precise as true when not bpf capable.
+ */
+static void __mark_reg_unknown(const struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg)
+{
+ __mark_reg_unknown_imprecise(reg);
+ reg->precise = !env->bpf_capable;
+}
+
+static void mark_reg_unknown(struct bpf_verifier_env *env,
+ struct bpf_reg_state *regs, u32 regno)
+{
+ if (WARN_ON(regno >= MAX_BPF_REG)) {
+ verbose(env, "mark_reg_unknown(regs, %u)\n", regno);
+ /* Something bad happened, let's kill all regs except FP */
+ for (regno = 0; regno < BPF_REG_FP; regno++)
+ __mark_reg_not_init(env, regs + regno);
+ return;
+ }
+ __mark_reg_unknown(env, regs + regno);
+}
+
+static int __mark_reg_s32_range(struct bpf_verifier_env *env,
+ struct bpf_reg_state *regs,
+ u32 regno,
+ s32 s32_min,
+ s32 s32_max)
+{
+ struct bpf_reg_state *reg = regs + regno;
+
+ reg->s32_min_value = max_t(s32, reg->s32_min_value, s32_min);
+ reg->s32_max_value = min_t(s32, reg->s32_max_value, s32_max);
+
+ reg->smin_value = max_t(s64, reg->smin_value, s32_min);
+ reg->smax_value = min_t(s64, reg->smax_value, s32_max);
+
+ reg_bounds_sync(reg);
+
+ return reg_bounds_sanity_check(env, reg, "s32_range");
+}
+
+static void __mark_reg_not_init(const struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg)
+{
+ __mark_reg_unknown(env, reg);
+ reg->type = NOT_INIT;
+}
+
+static void mark_reg_not_init(struct bpf_verifier_env *env,
+ struct bpf_reg_state *regs, u32 regno)
+{
+ if (WARN_ON(regno >= MAX_BPF_REG)) {
+ verbose(env, "mark_reg_not_init(regs, %u)\n", regno);
+ /* Something bad happened, let's kill all regs except FP */
+ for (regno = 0; regno < BPF_REG_FP; regno++)
+ __mark_reg_not_init(env, regs + regno);
+ return;
+ }
+ __mark_reg_not_init(env, regs + regno);
+}
+
+static int mark_btf_ld_reg(struct bpf_verifier_env *env,
+ struct bpf_reg_state *regs, u32 regno,
+ enum bpf_reg_type reg_type,
+ struct btf *btf, u32 btf_id,
+ enum bpf_type_flag flag)
+{
+ switch (reg_type) {
+ case SCALAR_VALUE:
+ mark_reg_unknown(env, regs, regno);
+ return 0;
+ case PTR_TO_BTF_ID:
+ mark_reg_known_zero(env, regs, regno);
+ regs[regno].type = PTR_TO_BTF_ID | flag;
+ regs[regno].btf = btf;
+ regs[regno].btf_id = btf_id;
+ if (type_may_be_null(flag))
+ regs[regno].id = ++env->id_gen;
+ return 0;
+ case PTR_TO_MEM:
+ mark_reg_known_zero(env, regs, regno);
+ regs[regno].type = PTR_TO_MEM | flag;
+ regs[regno].mem_size = 0;
+ return 0;
+ default:
+ verifier_bug(env, "unexpected reg_type %d in %s\n", reg_type, __func__);
+ return -EFAULT;
+ }
+}
+
+#define DEF_NOT_SUBREG (0)
+static void init_reg_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *state)
{
+ struct bpf_reg_state *regs = state->regs;
int i;
for (i = 0; i < MAX_BPF_REG; i++) {
- regs[i].type = NOT_INIT;
- regs[i].imm = 0;
- regs[i].map_ptr = NULL;
+ mark_reg_not_init(env, regs, i);
+ regs[i].subreg_def = DEF_NOT_SUBREG;
}
/* frame pointer */
- regs[BPF_REG_FP].type = FRAME_PTR;
+ regs[BPF_REG_FP].type = PTR_TO_STACK;
+ mark_reg_known_zero(env, regs, BPF_REG_FP);
+ regs[BPF_REG_FP].frameno = state->frameno;
+}
+
+static struct bpf_retval_range retval_range(s32 minval, s32 maxval)
+{
+ return (struct bpf_retval_range){ minval, maxval };
+}
- /* 1st arg to a function */
- regs[BPF_REG_1].type = PTR_TO_CTX;
+#define BPF_MAIN_FUNC (-1)
+static void init_func_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *state,
+ int callsite, int frameno, int subprogno)
+{
+ state->callsite = callsite;
+ state->frameno = frameno;
+ state->subprogno = subprogno;
+ state->callback_ret_range = retval_range(0, 0);
+ init_reg_state(env, state);
+ mark_verifier_state_scratched(env);
}
-static void mark_reg_unknown_value(struct reg_state *regs, u32 regno)
+/* Similar to push_stack(), but for async callbacks */
+static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env,
+ int insn_idx, int prev_insn_idx,
+ int subprog, bool is_sleepable)
{
- BUG_ON(regno >= MAX_BPF_REG);
- regs[regno].type = UNKNOWN_VALUE;
- regs[regno].imm = 0;
- regs[regno].map_ptr = NULL;
+ struct bpf_verifier_stack_elem *elem;
+ struct bpf_func_state *frame;
+
+ elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL_ACCOUNT);
+ if (!elem)
+ return ERR_PTR(-ENOMEM);
+
+ elem->insn_idx = insn_idx;
+ elem->prev_insn_idx = prev_insn_idx;
+ elem->next = env->head;
+ elem->log_pos = env->log.end_pos;
+ env->head = elem;
+ env->stack_size++;
+ if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) {
+ verbose(env,
+ "The sequence of %d jumps is too complex for async cb.\n",
+ env->stack_size);
+ return ERR_PTR(-E2BIG);
+ }
+ /* Unlike push_stack() do not copy_verifier_state().
+ * The caller state doesn't matter.
+ * This is async callback. It starts in a fresh stack.
+ * Initialize it similar to do_check_common().
+ */
+ elem->st.branches = 1;
+ elem->st.in_sleepable = is_sleepable;
+ frame = kzalloc(sizeof(*frame), GFP_KERNEL_ACCOUNT);
+ if (!frame)
+ return ERR_PTR(-ENOMEM);
+ init_func_state(env, frame,
+ BPF_MAIN_FUNC /* callsite */,
+ 0 /* frameno within this callchain */,
+ subprog /* subprog number within this prog */);
+ elem->st.frame[0] = frame;
+ return &elem->st;
}
+
enum reg_arg_type {
SRC_OP, /* register is used as source operand */
DST_OP, /* register is used as destination operand */
DST_OP_NO_MARK /* same as above, check only, don't mark */
};
-static int check_reg_arg(struct reg_state *regs, u32 regno,
- enum reg_arg_type t)
+static int cmp_subprogs(const void *a, const void *b)
+{
+ return ((struct bpf_subprog_info *)a)->start -
+ ((struct bpf_subprog_info *)b)->start;
+}
+
+/* Find subprogram that contains instruction at 'off' */
+struct bpf_subprog_info *bpf_find_containing_subprog(struct bpf_verifier_env *env, int off)
+{
+ struct bpf_subprog_info *vals = env->subprog_info;
+ int l, r, m;
+
+ if (off >= env->prog->len || off < 0 || env->subprog_cnt == 0)
+ return NULL;
+
+ l = 0;
+ r = env->subprog_cnt - 1;
+ while (l < r) {
+ m = l + (r - l + 1) / 2;
+ if (vals[m].start <= off)
+ l = m;
+ else
+ r = m - 1;
+ }
+ return &vals[l];
+}
+
+/* Find subprogram that starts exactly at 'off' */
+static int find_subprog(struct bpf_verifier_env *env, int off)
+{
+ struct bpf_subprog_info *p;
+
+ p = bpf_find_containing_subprog(env, off);
+ if (!p || p->start != off)
+ return -ENOENT;
+ return p - env->subprog_info;
+}
+
+static int add_subprog(struct bpf_verifier_env *env, int off)
+{
+ int insn_cnt = env->prog->len;
+ int ret;
+
+ if (off >= insn_cnt || off < 0) {
+ verbose(env, "call to invalid destination\n");
+ return -EINVAL;
+ }
+ ret = find_subprog(env, off);
+ if (ret >= 0)
+ return ret;
+ if (env->subprog_cnt >= BPF_MAX_SUBPROGS) {
+ verbose(env, "too many subprograms\n");
+ return -E2BIG;
+ }
+ /* determine subprog starts. The end is one before the next starts */
+ env->subprog_info[env->subprog_cnt++].start = off;
+ sort(env->subprog_info, env->subprog_cnt,
+ sizeof(env->subprog_info[0]), cmp_subprogs, NULL);
+ return env->subprog_cnt - 1;
+}
+
+static int bpf_find_exception_callback_insn_off(struct bpf_verifier_env *env)
+{
+ struct bpf_prog_aux *aux = env->prog->aux;
+ struct btf *btf = aux->btf;
+ const struct btf_type *t;
+ u32 main_btf_id, id;
+ const char *name;
+ int ret, i;
+
+ /* Non-zero func_info_cnt implies valid btf */
+ if (!aux->func_info_cnt)
+ return 0;
+ main_btf_id = aux->func_info[0].type_id;
+
+ t = btf_type_by_id(btf, main_btf_id);
+ if (!t) {
+ verbose(env, "invalid btf id for main subprog in func_info\n");
+ return -EINVAL;
+ }
+
+ name = btf_find_decl_tag_value(btf, t, -1, "exception_callback:");
+ if (IS_ERR(name)) {
+ ret = PTR_ERR(name);
+ /* If there is no tag present, there is no exception callback */
+ if (ret == -ENOENT)
+ ret = 0;
+ else if (ret == -EEXIST)
+ verbose(env, "multiple exception callback tags for main subprog\n");
+ return ret;
+ }
+
+ ret = btf_find_by_name_kind(btf, name, BTF_KIND_FUNC);
+ if (ret < 0) {
+ verbose(env, "exception callback '%s' could not be found in BTF\n", name);
+ return ret;
+ }
+ id = ret;
+ t = btf_type_by_id(btf, id);
+ if (btf_func_linkage(t) != BTF_FUNC_GLOBAL) {
+ verbose(env, "exception callback '%s' must have global linkage\n", name);
+ return -EINVAL;
+ }
+ ret = 0;
+ for (i = 0; i < aux->func_info_cnt; i++) {
+ if (aux->func_info[i].type_id != id)
+ continue;
+ ret = aux->func_info[i].insn_off;
+ /* Further func_info and subprog checks will also happen
+ * later, so assume this is the right insn_off for now.
+ */
+ if (!ret) {
+ verbose(env, "invalid exception callback insn_off in func_info: 0\n");
+ ret = -EINVAL;
+ }
+ }
+ if (!ret) {
+ verbose(env, "exception callback type id not found in func_info\n");
+ ret = -EINVAL;
+ }
+ return ret;
+}
+
+#define MAX_KFUNC_DESCS 256
+#define MAX_KFUNC_BTFS 256
+
+struct bpf_kfunc_desc {
+ struct btf_func_model func_model;
+ u32 func_id;
+ s32 imm;
+ u16 offset;
+ unsigned long addr;
+};
+
+struct bpf_kfunc_btf {
+ struct btf *btf;
+ struct module *module;
+ u16 offset;
+};
+
+struct bpf_kfunc_desc_tab {
+ /* Sorted by func_id (BTF ID) and offset (fd_array offset) during
+ * verification. JITs do lookups by bpf_insn, where func_id may not be
+ * available, therefore at the end of verification do_misc_fixups()
+ * sorts this by imm and offset.
+ */
+ struct bpf_kfunc_desc descs[MAX_KFUNC_DESCS];
+ u32 nr_descs;
+};
+
+struct bpf_kfunc_btf_tab {
+ struct bpf_kfunc_btf descs[MAX_KFUNC_BTFS];
+ u32 nr_descs;
+};
+
+static int specialize_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_desc *desc,
+ int insn_idx);
+
+static int kfunc_desc_cmp_by_id_off(const void *a, const void *b)
+{
+ const struct bpf_kfunc_desc *d0 = a;
+ const struct bpf_kfunc_desc *d1 = b;
+
+ /* func_id is not greater than BTF_MAX_TYPE */
+ return d0->func_id - d1->func_id ?: d0->offset - d1->offset;
+}
+
+static int kfunc_btf_cmp_by_off(const void *a, const void *b)
+{
+ const struct bpf_kfunc_btf *d0 = a;
+ const struct bpf_kfunc_btf *d1 = b;
+
+ return d0->offset - d1->offset;
+}
+
+static struct bpf_kfunc_desc *
+find_kfunc_desc(const struct bpf_prog *prog, u32 func_id, u16 offset)
+{
+ struct bpf_kfunc_desc desc = {
+ .func_id = func_id,
+ .offset = offset,
+ };
+ struct bpf_kfunc_desc_tab *tab;
+
+ tab = prog->aux->kfunc_tab;
+ return bsearch(&desc, tab->descs, tab->nr_descs,
+ sizeof(tab->descs[0]), kfunc_desc_cmp_by_id_off);
+}
+
+int bpf_get_kfunc_addr(const struct bpf_prog *prog, u32 func_id,
+ u16 btf_fd_idx, u8 **func_addr)
+{
+ const struct bpf_kfunc_desc *desc;
+
+ desc = find_kfunc_desc(prog, func_id, btf_fd_idx);
+ if (!desc)
+ return -EFAULT;
+
+ *func_addr = (u8 *)desc->addr;
+ return 0;
+}
+
+static struct btf *__find_kfunc_desc_btf(struct bpf_verifier_env *env,
+ s16 offset)
+{
+ struct bpf_kfunc_btf kf_btf = { .offset = offset };
+ struct bpf_kfunc_btf_tab *tab;
+ struct bpf_kfunc_btf *b;
+ struct module *mod;
+ struct btf *btf;
+ int btf_fd;
+
+ tab = env->prog->aux->kfunc_btf_tab;
+ b = bsearch(&kf_btf, tab->descs, tab->nr_descs,
+ sizeof(tab->descs[0]), kfunc_btf_cmp_by_off);
+ if (!b) {
+ if (tab->nr_descs == MAX_KFUNC_BTFS) {
+ verbose(env, "too many different module BTFs\n");
+ return ERR_PTR(-E2BIG);
+ }
+
+ if (bpfptr_is_null(env->fd_array)) {
+ verbose(env, "kfunc offset > 0 without fd_array is invalid\n");
+ return ERR_PTR(-EPROTO);
+ }
+
+ if (copy_from_bpfptr_offset(&btf_fd, env->fd_array,
+ offset * sizeof(btf_fd),
+ sizeof(btf_fd)))
+ return ERR_PTR(-EFAULT);
+
+ btf = btf_get_by_fd(btf_fd);
+ if (IS_ERR(btf)) {
+ verbose(env, "invalid module BTF fd specified\n");
+ return btf;
+ }
+
+ if (!btf_is_module(btf)) {
+ verbose(env, "BTF fd for kfunc is not a module BTF\n");
+ btf_put(btf);
+ return ERR_PTR(-EINVAL);
+ }
+
+ mod = btf_try_get_module(btf);
+ if (!mod) {
+ btf_put(btf);
+ return ERR_PTR(-ENXIO);
+ }
+
+ b = &tab->descs[tab->nr_descs++];
+ b->btf = btf;
+ b->module = mod;
+ b->offset = offset;
+
+ /* sort() reorders entries by value, so b may no longer point
+ * to the right entry after this
+ */
+ sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]),
+ kfunc_btf_cmp_by_off, NULL);
+ } else {
+ btf = b->btf;
+ }
+
+ return btf;
+}
+
+void bpf_free_kfunc_btf_tab(struct bpf_kfunc_btf_tab *tab)
+{
+ if (!tab)
+ return;
+
+ while (tab->nr_descs--) {
+ module_put(tab->descs[tab->nr_descs].module);
+ btf_put(tab->descs[tab->nr_descs].btf);
+ }
+ kfree(tab);
+}
+
+static struct btf *find_kfunc_desc_btf(struct bpf_verifier_env *env, s16 offset)
+{
+ if (offset) {
+ if (offset < 0) {
+ /* In the future, this can be allowed to increase limit
+ * of fd index into fd_array, interpreted as u16.
+ */
+ verbose(env, "negative offset disallowed for kernel module function call\n");
+ return ERR_PTR(-EINVAL);
+ }
+
+ return __find_kfunc_desc_btf(env, offset);
+ }
+ return btf_vmlinux ?: ERR_PTR(-ENOENT);
+}
+
+static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
+{
+ const struct btf_type *func, *func_proto;
+ struct bpf_kfunc_btf_tab *btf_tab;
+ struct btf_func_model func_model;
+ struct bpf_kfunc_desc_tab *tab;
+ struct bpf_prog_aux *prog_aux;
+ struct bpf_kfunc_desc *desc;
+ const char *func_name;
+ struct btf *desc_btf;
+ unsigned long addr;
+ int err;
+
+ prog_aux = env->prog->aux;
+ tab = prog_aux->kfunc_tab;
+ btf_tab = prog_aux->kfunc_btf_tab;
+ if (!tab) {
+ if (!btf_vmlinux) {
+ verbose(env, "calling kernel function is not supported without CONFIG_DEBUG_INFO_BTF\n");
+ return -ENOTSUPP;
+ }
+
+ if (!env->prog->jit_requested) {
+ verbose(env, "JIT is required for calling kernel function\n");
+ return -ENOTSUPP;
+ }
+
+ if (!bpf_jit_supports_kfunc_call()) {
+ verbose(env, "JIT does not support calling kernel function\n");
+ return -ENOTSUPP;
+ }
+
+ if (!env->prog->gpl_compatible) {
+ verbose(env, "cannot call kernel function from non-GPL compatible program\n");
+ return -EINVAL;
+ }
+
+ tab = kzalloc(sizeof(*tab), GFP_KERNEL_ACCOUNT);
+ if (!tab)
+ return -ENOMEM;
+ prog_aux->kfunc_tab = tab;
+ }
+
+ /* func_id == 0 is always invalid, but instead of returning an error, be
+ * conservative and wait until the code elimination pass before returning
+ * error, so that invalid calls that get pruned out can be in BPF programs
+ * loaded from userspace. It is also required that offset be untouched
+ * for such calls.
+ */
+ if (!func_id && !offset)
+ return 0;
+
+ if (!btf_tab && offset) {
+ btf_tab = kzalloc(sizeof(*btf_tab), GFP_KERNEL_ACCOUNT);
+ if (!btf_tab)
+ return -ENOMEM;
+ prog_aux->kfunc_btf_tab = btf_tab;
+ }
+
+ desc_btf = find_kfunc_desc_btf(env, offset);
+ if (IS_ERR(desc_btf)) {
+ verbose(env, "failed to find BTF for kernel function\n");
+ return PTR_ERR(desc_btf);
+ }
+
+ if (find_kfunc_desc(env->prog, func_id, offset))
+ return 0;
+
+ if (tab->nr_descs == MAX_KFUNC_DESCS) {
+ verbose(env, "too many different kernel function calls\n");
+ return -E2BIG;
+ }
+
+ func = btf_type_by_id(desc_btf, func_id);
+ if (!func || !btf_type_is_func(func)) {
+ verbose(env, "kernel btf_id %u is not a function\n",
+ func_id);
+ return -EINVAL;
+ }
+ func_proto = btf_type_by_id(desc_btf, func->type);
+ if (!func_proto || !btf_type_is_func_proto(func_proto)) {
+ verbose(env, "kernel function btf_id %u does not have a valid func_proto\n",
+ func_id);
+ return -EINVAL;
+ }
+
+ func_name = btf_name_by_offset(desc_btf, func->name_off);
+ addr = kallsyms_lookup_name(func_name);
+ if (!addr) {
+ verbose(env, "cannot find address for kernel function %s\n",
+ func_name);
+ return -EINVAL;
+ }
+
+ if (bpf_dev_bound_kfunc_id(func_id)) {
+ err = bpf_dev_bound_kfunc_check(&env->log, prog_aux);
+ if (err)
+ return err;
+ }
+
+ err = btf_distill_func_proto(&env->log, desc_btf,
+ func_proto, func_name,
+ &func_model);
+ if (err)
+ return err;
+
+ desc = &tab->descs[tab->nr_descs++];
+ desc->func_id = func_id;
+ desc->offset = offset;
+ desc->addr = addr;
+ desc->func_model = func_model;
+ sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]),
+ kfunc_desc_cmp_by_id_off, NULL);
+ return 0;
+}
+
+static int kfunc_desc_cmp_by_imm_off(const void *a, const void *b)
+{
+ const struct bpf_kfunc_desc *d0 = a;
+ const struct bpf_kfunc_desc *d1 = b;
+
+ if (d0->imm != d1->imm)
+ return d0->imm < d1->imm ? -1 : 1;
+ if (d0->offset != d1->offset)
+ return d0->offset < d1->offset ? -1 : 1;
+ return 0;
+}
+
+static int set_kfunc_desc_imm(struct bpf_verifier_env *env, struct bpf_kfunc_desc *desc)
+{
+ unsigned long call_imm;
+
+ if (bpf_jit_supports_far_kfunc_call()) {
+ call_imm = desc->func_id;
+ } else {
+ call_imm = BPF_CALL_IMM(desc->addr);
+ /* Check whether the relative offset overflows desc->imm */
+ if ((unsigned long)(s32)call_imm != call_imm) {
+ verbose(env, "address of kernel func_id %u is out of range\n",
+ desc->func_id);
+ return -EINVAL;
+ }
+ }
+ desc->imm = call_imm;
+ return 0;
+}
+
+static int sort_kfunc_descs_by_imm_off(struct bpf_verifier_env *env)
+{
+ struct bpf_kfunc_desc_tab *tab;
+ int i, err;
+
+ tab = env->prog->aux->kfunc_tab;
+ if (!tab)
+ return 0;
+
+ for (i = 0; i < tab->nr_descs; i++) {
+ err = set_kfunc_desc_imm(env, &tab->descs[i]);
+ if (err)
+ return err;
+ }
+
+ sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]),
+ kfunc_desc_cmp_by_imm_off, NULL);
+ return 0;
+}
+
+bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog)
+{
+ return !!prog->aux->kfunc_tab;
+}
+
+const struct btf_func_model *
+bpf_jit_find_kfunc_model(const struct bpf_prog *prog,
+ const struct bpf_insn *insn)
+{
+ const struct bpf_kfunc_desc desc = {
+ .imm = insn->imm,
+ .offset = insn->off,
+ };
+ const struct bpf_kfunc_desc *res;
+ struct bpf_kfunc_desc_tab *tab;
+
+ tab = prog->aux->kfunc_tab;
+ res = bsearch(&desc, tab->descs, tab->nr_descs,
+ sizeof(tab->descs[0]), kfunc_desc_cmp_by_imm_off);
+
+ return res ? &res->func_model : NULL;
+}
+
+static int add_kfunc_in_insns(struct bpf_verifier_env *env,
+ struct bpf_insn *insn, int cnt)
+{
+ int i, ret;
+
+ for (i = 0; i < cnt; i++, insn++) {
+ if (bpf_pseudo_kfunc_call(insn)) {
+ ret = add_kfunc_call(env, insn->imm, insn->off);
+ if (ret < 0)
+ return ret;
+ }
+ }
+ return 0;
+}
+
+static int add_subprog_and_kfunc(struct bpf_verifier_env *env)
+{
+ struct bpf_subprog_info *subprog = env->subprog_info;
+ int i, ret, insn_cnt = env->prog->len, ex_cb_insn;
+ struct bpf_insn *insn = env->prog->insnsi;
+
+ /* Add entry function. */
+ ret = add_subprog(env, 0);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ if (!bpf_pseudo_func(insn) && !bpf_pseudo_call(insn) &&
+ !bpf_pseudo_kfunc_call(insn))
+ continue;
+
+ if (!env->bpf_capable) {
+ verbose(env, "loading/calling other bpf or kernel functions are allowed for CAP_BPF and CAP_SYS_ADMIN\n");
+ return -EPERM;
+ }
+
+ if (bpf_pseudo_func(insn) || bpf_pseudo_call(insn))
+ ret = add_subprog(env, i + insn->imm + 1);
+ else
+ ret = add_kfunc_call(env, insn->imm, insn->off);
+
+ if (ret < 0)
+ return ret;
+ }
+
+ ret = bpf_find_exception_callback_insn_off(env);
+ if (ret < 0)
+ return ret;
+ ex_cb_insn = ret;
+
+ /* If ex_cb_insn > 0, this means that the main program has a subprog
+ * marked using BTF decl tag to serve as the exception callback.
+ */
+ if (ex_cb_insn) {
+ ret = add_subprog(env, ex_cb_insn);
+ if (ret < 0)
+ return ret;
+ for (i = 1; i < env->subprog_cnt; i++) {
+ if (env->subprog_info[i].start != ex_cb_insn)
+ continue;
+ env->exception_callback_subprog = i;
+ mark_subprog_exc_cb(env, i);
+ break;
+ }
+ }
+
+ /* Add a fake 'exit' subprog which could simplify subprog iteration
+ * logic. 'subprog_cnt' should not be increased.
+ */
+ subprog[env->subprog_cnt].start = insn_cnt;
+
+ if (env->log.level & BPF_LOG_LEVEL2)
+ for (i = 0; i < env->subprog_cnt; i++)
+ verbose(env, "func#%d @%d\n", i, subprog[i].start);
+
+ return 0;
+}
+
+static int check_subprogs(struct bpf_verifier_env *env)
+{
+ int i, subprog_start, subprog_end, off, cur_subprog = 0;
+ struct bpf_subprog_info *subprog = env->subprog_info;
+ struct bpf_insn *insn = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+
+ /* now check that all jumps are within the same subprog */
+ subprog_start = subprog[cur_subprog].start;
+ subprog_end = subprog[cur_subprog + 1].start;
+ for (i = 0; i < insn_cnt; i++) {
+ u8 code = insn[i].code;
+
+ if (code == (BPF_JMP | BPF_CALL) &&
+ insn[i].src_reg == 0 &&
+ insn[i].imm == BPF_FUNC_tail_call) {
+ subprog[cur_subprog].has_tail_call = true;
+ subprog[cur_subprog].tail_call_reachable = true;
+ }
+ if (BPF_CLASS(code) == BPF_LD &&
+ (BPF_MODE(code) == BPF_ABS || BPF_MODE(code) == BPF_IND))
+ subprog[cur_subprog].has_ld_abs = true;
+ if (BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32)
+ goto next;
+ if (BPF_OP(code) == BPF_CALL)
+ goto next;
+ if (BPF_OP(code) == BPF_EXIT) {
+ subprog[cur_subprog].exit_idx = i;
+ goto next;
+ }
+ off = i + bpf_jmp_offset(&insn[i]) + 1;
+ if (off < subprog_start || off >= subprog_end) {
+ verbose(env, "jump out of range from insn %d to %d\n", i, off);
+ return -EINVAL;
+ }
+next:
+ if (i == subprog_end - 1) {
+ /* to avoid fall-through from one subprog into another
+ * the last insn of the subprog should be either exit
+ * or unconditional jump back or bpf_throw call
+ */
+ if (code != (BPF_JMP | BPF_EXIT) &&
+ code != (BPF_JMP32 | BPF_JA) &&
+ code != (BPF_JMP | BPF_JA)) {
+ verbose(env, "last insn is not an exit or jmp\n");
+ return -EINVAL;
+ }
+ subprog_start = subprog_end;
+ cur_subprog++;
+ if (cur_subprog < env->subprog_cnt)
+ subprog_end = subprog[cur_subprog + 1].start;
+ }
+ }
+ return 0;
+}
+
+static int mark_stack_slot_obj_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ int spi, int nr_slots)
+{
+ int err, i;
+
+ for (i = 0; i < nr_slots; i++) {
+ err = bpf_mark_stack_read(env, reg->frameno, env->insn_idx, BIT(spi - i));
+ if (err)
+ return err;
+ mark_stack_slot_scratched(env, spi - i);
+ }
+ return 0;
+}
+
+static int mark_dynptr_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ int spi;
+
+ /* For CONST_PTR_TO_DYNPTR, it must have already been done by
+ * check_reg_arg in check_helper_call and mark_btf_func_reg_size in
+ * check_kfunc_call.
+ */
+ if (reg->type == CONST_PTR_TO_DYNPTR)
+ return 0;
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0)
+ return spi;
+ /* Caller ensures dynptr is valid and initialized, which means spi is in
+ * bounds and spi is the first dynptr slot. Simply mark stack slot as
+ * read.
+ */
+ return mark_stack_slot_obj_read(env, reg, spi, BPF_DYNPTR_NR_SLOTS);
+}
+
+static int mark_iter_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ int spi, int nr_slots)
{
+ return mark_stack_slot_obj_read(env, reg, spi, nr_slots);
+}
+
+static int mark_irq_flag_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ int spi;
+
+ spi = irq_flag_get_spi(env, reg);
+ if (spi < 0)
+ return spi;
+ return mark_stack_slot_obj_read(env, reg, spi, 1);
+}
+
+/* This function is supposed to be used by the following 32-bit optimization
+ * code only. It returns TRUE if the source or destination register operates
+ * on 64-bit, otherwise return FALSE.
+ */
+static bool is_reg64(struct bpf_insn *insn,
+ u32 regno, struct bpf_reg_state *reg, enum reg_arg_type t)
+{
+ u8 code, class, op;
+
+ code = insn->code;
+ class = BPF_CLASS(code);
+ op = BPF_OP(code);
+ if (class == BPF_JMP) {
+ /* BPF_EXIT for "main" will reach here. Return TRUE
+ * conservatively.
+ */
+ if (op == BPF_EXIT)
+ return true;
+ if (op == BPF_CALL) {
+ /* BPF to BPF call will reach here because of marking
+ * caller saved clobber with DST_OP_NO_MARK for which we
+ * don't care the register def because they are anyway
+ * marked as NOT_INIT already.
+ */
+ if (insn->src_reg == BPF_PSEUDO_CALL)
+ return false;
+ /* Helper call will reach here because of arg type
+ * check, conservatively return TRUE.
+ */
+ if (t == SRC_OP)
+ return true;
+
+ return false;
+ }
+ }
+
+ if (class == BPF_ALU64 && op == BPF_END && (insn->imm == 16 || insn->imm == 32))
+ return false;
+
+ if (class == BPF_ALU64 || class == BPF_JMP ||
+ (class == BPF_ALU && op == BPF_END && insn->imm == 64))
+ return true;
+
+ if (class == BPF_ALU || class == BPF_JMP32)
+ return false;
+
+ if (class == BPF_LDX) {
+ if (t != SRC_OP)
+ return BPF_SIZE(code) == BPF_DW || BPF_MODE(code) == BPF_MEMSX;
+ /* LDX source must be ptr. */
+ return true;
+ }
+
+ if (class == BPF_STX) {
+ /* BPF_STX (including atomic variants) has one or more source
+ * operands, one of which is a ptr. Check whether the caller is
+ * asking about it.
+ */
+ if (t == SRC_OP && reg->type != SCALAR_VALUE)
+ return true;
+ return BPF_SIZE(code) == BPF_DW;
+ }
+
+ if (class == BPF_LD) {
+ u8 mode = BPF_MODE(code);
+
+ /* LD_IMM64 */
+ if (mode == BPF_IMM)
+ return true;
+
+ /* Both LD_IND and LD_ABS return 32-bit data. */
+ if (t != SRC_OP)
+ return false;
+
+ /* Implicit ctx ptr. */
+ if (regno == BPF_REG_6)
+ return true;
+
+ /* Explicit source could be any width. */
+ return true;
+ }
+
+ if (class == BPF_ST)
+ /* The only source register for BPF_ST is a ptr. */
+ return true;
+
+ /* Conservatively return true at default. */
+ return true;
+}
+
+/* Return the regno defined by the insn, or -1. */
+static int insn_def_regno(const struct bpf_insn *insn)
+{
+ switch (BPF_CLASS(insn->code)) {
+ case BPF_JMP:
+ case BPF_JMP32:
+ case BPF_ST:
+ return -1;
+ case BPF_STX:
+ if (BPF_MODE(insn->code) == BPF_ATOMIC ||
+ BPF_MODE(insn->code) == BPF_PROBE_ATOMIC) {
+ if (insn->imm == BPF_CMPXCHG)
+ return BPF_REG_0;
+ else if (insn->imm == BPF_LOAD_ACQ)
+ return insn->dst_reg;
+ else if (insn->imm & BPF_FETCH)
+ return insn->src_reg;
+ }
+ return -1;
+ default:
+ return insn->dst_reg;
+ }
+}
+
+/* Return TRUE if INSN has defined any 32-bit value explicitly. */
+static bool insn_has_def32(struct bpf_insn *insn)
+{
+ int dst_reg = insn_def_regno(insn);
+
+ if (dst_reg == -1)
+ return false;
+
+ return !is_reg64(insn, dst_reg, NULL, DST_OP);
+}
+
+static void mark_insn_zext(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg)
+{
+ s32 def_idx = reg->subreg_def;
+
+ if (def_idx == DEF_NOT_SUBREG)
+ return;
+
+ env->insn_aux_data[def_idx - 1].zext_dst = true;
+ /* The dst will be zero extended, so won't be sub-register anymore. */
+ reg->subreg_def = DEF_NOT_SUBREG;
+}
+
+static int __check_reg_arg(struct bpf_verifier_env *env, struct bpf_reg_state *regs, u32 regno,
+ enum reg_arg_type t)
+{
+ struct bpf_insn *insn = env->prog->insnsi + env->insn_idx;
+ struct bpf_reg_state *reg;
+ bool rw64;
+
if (regno >= MAX_BPF_REG) {
- verbose("R%d is invalid\n", regno);
+ verbose(env, "R%d is invalid\n", regno);
return -EINVAL;
}
+ mark_reg_scratched(env, regno);
+
+ reg = &regs[regno];
+ rw64 = is_reg64(insn, regno, reg, t);
if (t == SRC_OP) {
/* check whether register used as source operand can be read */
- if (regs[regno].type == NOT_INIT) {
- verbose("R%d !read_ok\n", regno);
+ if (reg->type == NOT_INIT) {
+ verbose(env, "R%d !read_ok\n", regno);
return -EACCES;
}
+ /* We don't need to worry about FP liveness because it's read-only */
+ if (regno == BPF_REG_FP)
+ return 0;
+
+ if (rw64)
+ mark_insn_zext(env, reg);
+
+ return 0;
} else {
/* check whether register used as dest operand can be written to */
if (regno == BPF_REG_FP) {
- verbose("frame pointer is read only\n");
+ verbose(env, "frame pointer is read only\n");
return -EACCES;
}
+ reg->subreg_def = rw64 ? DEF_NOT_SUBREG : env->insn_idx + 1;
if (t == DST_OP)
- mark_reg_unknown_value(regs, regno);
+ mark_reg_unknown(env, regs, regno);
}
return 0;
}
-static int bpf_size_to_bytes(int bpf_size)
+static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
+ enum reg_arg_type t)
{
- if (bpf_size == BPF_W)
- return 4;
- else if (bpf_size == BPF_H)
- return 2;
- else if (bpf_size == BPF_B)
- return 1;
- else if (bpf_size == BPF_DW)
- return 8;
- else
- return -EINVAL;
+ struct bpf_verifier_state *vstate = env->cur_state;
+ struct bpf_func_state *state = vstate->frame[vstate->curframe];
+
+ return __check_reg_arg(env, state->regs, regno, t);
}
-/* check_stack_read/write functions track spill/fill of registers,
- * stack boundary and alignment are checked in check_mem_access()
+static int insn_stack_access_flags(int frameno, int spi)
+{
+ return INSN_F_STACK_ACCESS | (spi << INSN_F_SPI_SHIFT) | frameno;
+}
+
+static int insn_stack_access_spi(int insn_flags)
+{
+ return (insn_flags >> INSN_F_SPI_SHIFT) & INSN_F_SPI_MASK;
+}
+
+static int insn_stack_access_frameno(int insn_flags)
+{
+ return insn_flags & INSN_F_FRAMENO_MASK;
+}
+
+static void mark_jmp_point(struct bpf_verifier_env *env, int idx)
+{
+ env->insn_aux_data[idx].jmp_point = true;
+}
+
+static bool is_jmp_point(struct bpf_verifier_env *env, int insn_idx)
+{
+ return env->insn_aux_data[insn_idx].jmp_point;
+}
+
+#define LR_FRAMENO_BITS 3
+#define LR_SPI_BITS 6
+#define LR_ENTRY_BITS (LR_SPI_BITS + LR_FRAMENO_BITS + 1)
+#define LR_SIZE_BITS 4
+#define LR_FRAMENO_MASK ((1ull << LR_FRAMENO_BITS) - 1)
+#define LR_SPI_MASK ((1ull << LR_SPI_BITS) - 1)
+#define LR_SIZE_MASK ((1ull << LR_SIZE_BITS) - 1)
+#define LR_SPI_OFF LR_FRAMENO_BITS
+#define LR_IS_REG_OFF (LR_SPI_BITS + LR_FRAMENO_BITS)
+#define LINKED_REGS_MAX 6
+
+struct linked_reg {
+ u8 frameno;
+ union {
+ u8 spi;
+ u8 regno;
+ };
+ bool is_reg;
+};
+
+struct linked_regs {
+ int cnt;
+ struct linked_reg entries[LINKED_REGS_MAX];
+};
+
+static struct linked_reg *linked_regs_push(struct linked_regs *s)
+{
+ if (s->cnt < LINKED_REGS_MAX)
+ return &s->entries[s->cnt++];
+
+ return NULL;
+}
+
+/* Use u64 as a vector of 6 10-bit values, use first 4-bits to track
+ * number of elements currently in stack.
+ * Pack one history entry for linked registers as 10 bits in the following format:
+ * - 3-bits frameno
+ * - 6-bits spi_or_reg
+ * - 1-bit is_reg
+ */
+static u64 linked_regs_pack(struct linked_regs *s)
+{
+ u64 val = 0;
+ int i;
+
+ for (i = 0; i < s->cnt; ++i) {
+ struct linked_reg *e = &s->entries[i];
+ u64 tmp = 0;
+
+ tmp |= e->frameno;
+ tmp |= e->spi << LR_SPI_OFF;
+ tmp |= (e->is_reg ? 1 : 0) << LR_IS_REG_OFF;
+
+ val <<= LR_ENTRY_BITS;
+ val |= tmp;
+ }
+ val <<= LR_SIZE_BITS;
+ val |= s->cnt;
+ return val;
+}
+
+static void linked_regs_unpack(u64 val, struct linked_regs *s)
+{
+ int i;
+
+ s->cnt = val & LR_SIZE_MASK;
+ val >>= LR_SIZE_BITS;
+
+ for (i = 0; i < s->cnt; ++i) {
+ struct linked_reg *e = &s->entries[i];
+
+ e->frameno = val & LR_FRAMENO_MASK;
+ e->spi = (val >> LR_SPI_OFF) & LR_SPI_MASK;
+ e->is_reg = (val >> LR_IS_REG_OFF) & 0x1;
+ val >>= LR_ENTRY_BITS;
+ }
+}
+
+/* for any branch, call, exit record the history of jmps in the given state */
+static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur,
+ int insn_flags, u64 linked_regs)
+{
+ u32 cnt = cur->jmp_history_cnt;
+ struct bpf_jmp_history_entry *p;
+ size_t alloc_size;
+
+ /* combine instruction flags if we already recorded this instruction */
+ if (env->cur_hist_ent) {
+ /* atomic instructions push insn_flags twice, for READ and
+ * WRITE sides, but they should agree on stack slot
+ */
+ verifier_bug_if((env->cur_hist_ent->flags & insn_flags) &&
+ (env->cur_hist_ent->flags & insn_flags) != insn_flags,
+ env, "insn history: insn_idx %d cur flags %x new flags %x",
+ env->insn_idx, env->cur_hist_ent->flags, insn_flags);
+ env->cur_hist_ent->flags |= insn_flags;
+ verifier_bug_if(env->cur_hist_ent->linked_regs != 0, env,
+ "insn history: insn_idx %d linked_regs: %#llx",
+ env->insn_idx, env->cur_hist_ent->linked_regs);
+ env->cur_hist_ent->linked_regs = linked_regs;
+ return 0;
+ }
+
+ cnt++;
+ alloc_size = kmalloc_size_roundup(size_mul(cnt, sizeof(*p)));
+ p = krealloc(cur->jmp_history, alloc_size, GFP_KERNEL_ACCOUNT);
+ if (!p)
+ return -ENOMEM;
+ cur->jmp_history = p;
+
+ p = &cur->jmp_history[cnt - 1];
+ p->idx = env->insn_idx;
+ p->prev_idx = env->prev_insn_idx;
+ p->flags = insn_flags;
+ p->linked_regs = linked_regs;
+ cur->jmp_history_cnt = cnt;
+ env->cur_hist_ent = p;
+
+ return 0;
+}
+
+static struct bpf_jmp_history_entry *get_jmp_hist_entry(struct bpf_verifier_state *st,
+ u32 hist_end, int insn_idx)
+{
+ if (hist_end > 0 && st->jmp_history[hist_end - 1].idx == insn_idx)
+ return &st->jmp_history[hist_end - 1];
+ return NULL;
+}
+
+/* Backtrack one insn at a time. If idx is not at the top of recorded
+ * history then previous instruction came from straight line execution.
+ * Return -ENOENT if we exhausted all instructions within given state.
+ *
+ * It's legal to have a bit of a looping with the same starting and ending
+ * insn index within the same state, e.g.: 3->4->5->3, so just because current
+ * instruction index is the same as state's first_idx doesn't mean we are
+ * done. If there is still some jump history left, we should keep going. We
+ * need to take into account that we might have a jump history between given
+ * state's parent and itself, due to checkpointing. In this case, we'll have
+ * history entry recording a jump from last instruction of parent state and
+ * first instruction of given state.
+ */
+static int get_prev_insn_idx(struct bpf_verifier_state *st, int i,
+ u32 *history)
+{
+ u32 cnt = *history;
+
+ if (i == st->first_insn_idx) {
+ if (cnt == 0)
+ return -ENOENT;
+ if (cnt == 1 && st->jmp_history[0].idx == i)
+ return -ENOENT;
+ }
+
+ if (cnt && st->jmp_history[cnt - 1].idx == i) {
+ i = st->jmp_history[cnt - 1].prev_idx;
+ (*history)--;
+ } else {
+ i--;
+ }
+ return i;
+}
+
+static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn)
+{
+ const struct btf_type *func;
+ struct btf *desc_btf;
+
+ if (insn->src_reg != BPF_PSEUDO_KFUNC_CALL)
+ return NULL;
+
+ desc_btf = find_kfunc_desc_btf(data, insn->off);
+ if (IS_ERR(desc_btf))
+ return "<error>";
+
+ func = btf_type_by_id(desc_btf, insn->imm);
+ return btf_name_by_offset(desc_btf, func->name_off);
+}
+
+static void verbose_insn(struct bpf_verifier_env *env, struct bpf_insn *insn)
+{
+ const struct bpf_insn_cbs cbs = {
+ .cb_call = disasm_kfunc_name,
+ .cb_print = verbose,
+ .private_data = env,
+ };
+
+ print_bpf_insn(&cbs, insn, env->allow_ptr_leaks);
+}
+
+static inline void bt_init(struct backtrack_state *bt, u32 frame)
+{
+ bt->frame = frame;
+}
+
+static inline void bt_reset(struct backtrack_state *bt)
+{
+ struct bpf_verifier_env *env = bt->env;
+
+ memset(bt, 0, sizeof(*bt));
+ bt->env = env;
+}
+
+static inline u32 bt_empty(struct backtrack_state *bt)
+{
+ u64 mask = 0;
+ int i;
+
+ for (i = 0; i <= bt->frame; i++)
+ mask |= bt->reg_masks[i] | bt->stack_masks[i];
+
+ return mask == 0;
+}
+
+static inline int bt_subprog_enter(struct backtrack_state *bt)
+{
+ if (bt->frame == MAX_CALL_FRAMES - 1) {
+ verifier_bug(bt->env, "subprog enter from frame %d", bt->frame);
+ return -EFAULT;
+ }
+ bt->frame++;
+ return 0;
+}
+
+static inline int bt_subprog_exit(struct backtrack_state *bt)
+{
+ if (bt->frame == 0) {
+ verifier_bug(bt->env, "subprog exit from frame 0");
+ return -EFAULT;
+ }
+ bt->frame--;
+ return 0;
+}
+
+static inline void bt_set_frame_reg(struct backtrack_state *bt, u32 frame, u32 reg)
+{
+ bt->reg_masks[frame] |= 1 << reg;
+}
+
+static inline void bt_clear_frame_reg(struct backtrack_state *bt, u32 frame, u32 reg)
+{
+ bt->reg_masks[frame] &= ~(1 << reg);
+}
+
+static inline void bt_set_reg(struct backtrack_state *bt, u32 reg)
+{
+ bt_set_frame_reg(bt, bt->frame, reg);
+}
+
+static inline void bt_clear_reg(struct backtrack_state *bt, u32 reg)
+{
+ bt_clear_frame_reg(bt, bt->frame, reg);
+}
+
+static inline void bt_set_frame_slot(struct backtrack_state *bt, u32 frame, u32 slot)
+{
+ bt->stack_masks[frame] |= 1ull << slot;
+}
+
+static inline void bt_clear_frame_slot(struct backtrack_state *bt, u32 frame, u32 slot)
+{
+ bt->stack_masks[frame] &= ~(1ull << slot);
+}
+
+static inline u32 bt_frame_reg_mask(struct backtrack_state *bt, u32 frame)
+{
+ return bt->reg_masks[frame];
+}
+
+static inline u32 bt_reg_mask(struct backtrack_state *bt)
+{
+ return bt->reg_masks[bt->frame];
+}
+
+static inline u64 bt_frame_stack_mask(struct backtrack_state *bt, u32 frame)
+{
+ return bt->stack_masks[frame];
+}
+
+static inline u64 bt_stack_mask(struct backtrack_state *bt)
+{
+ return bt->stack_masks[bt->frame];
+}
+
+static inline bool bt_is_reg_set(struct backtrack_state *bt, u32 reg)
+{
+ return bt->reg_masks[bt->frame] & (1 << reg);
+}
+
+static inline bool bt_is_frame_reg_set(struct backtrack_state *bt, u32 frame, u32 reg)
+{
+ return bt->reg_masks[frame] & (1 << reg);
+}
+
+static inline bool bt_is_frame_slot_set(struct backtrack_state *bt, u32 frame, u32 slot)
+{
+ return bt->stack_masks[frame] & (1ull << slot);
+}
+
+/* format registers bitmask, e.g., "r0,r2,r4" for 0x15 mask */
+static void fmt_reg_mask(char *buf, ssize_t buf_sz, u32 reg_mask)
+{
+ DECLARE_BITMAP(mask, 64);
+ bool first = true;
+ int i, n;
+
+ buf[0] = '\0';
+
+ bitmap_from_u64(mask, reg_mask);
+ for_each_set_bit(i, mask, 32) {
+ n = snprintf(buf, buf_sz, "%sr%d", first ? "" : ",", i);
+ first = false;
+ buf += n;
+ buf_sz -= n;
+ if (buf_sz < 0)
+ break;
+ }
+}
+/* format stack slots bitmask, e.g., "-8,-24,-40" for 0x15 mask */
+void bpf_fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask)
+{
+ DECLARE_BITMAP(mask, 64);
+ bool first = true;
+ int i, n;
+
+ buf[0] = '\0';
+
+ bitmap_from_u64(mask, stack_mask);
+ for_each_set_bit(i, mask, 64) {
+ n = snprintf(buf, buf_sz, "%s%d", first ? "" : ",", -(i + 1) * 8);
+ first = false;
+ buf += n;
+ buf_sz -= n;
+ if (buf_sz < 0)
+ break;
+ }
+}
+
+/* If any register R in hist->linked_regs is marked as precise in bt,
+ * do bt_set_frame_{reg,slot}(bt, R) for all registers in hist->linked_regs.
*/
-static int check_stack_write(struct verifier_state *state, int off, int size,
- int value_regno)
+static void bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_jmp_history_entry *hist)
{
+ struct linked_regs linked_regs;
+ bool some_precise = false;
int i;
+
+ if (!hist || hist->linked_regs == 0)
+ return;
+
+ linked_regs_unpack(hist->linked_regs, &linked_regs);
+ for (i = 0; i < linked_regs.cnt; ++i) {
+ struct linked_reg *e = &linked_regs.entries[i];
+
+ if ((e->is_reg && bt_is_frame_reg_set(bt, e->frameno, e->regno)) ||
+ (!e->is_reg && bt_is_frame_slot_set(bt, e->frameno, e->spi))) {
+ some_precise = true;
+ break;
+ }
+ }
+
+ if (!some_precise)
+ return;
+
+ for (i = 0; i < linked_regs.cnt; ++i) {
+ struct linked_reg *e = &linked_regs.entries[i];
+
+ if (e->is_reg)
+ bt_set_frame_reg(bt, e->frameno, e->regno);
+ else
+ bt_set_frame_slot(bt, e->frameno, e->spi);
+ }
+}
+
+/* For given verifier state backtrack_insn() is called from the last insn to
+ * the first insn. Its purpose is to compute a bitmask of registers and
+ * stack slots that needs precision in the parent verifier state.
+ *
+ * @idx is an index of the instruction we are currently processing;
+ * @subseq_idx is an index of the subsequent instruction that:
+ * - *would be* executed next, if jump history is viewed in forward order;
+ * - *was* processed previously during backtracking.
+ */
+static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
+ struct bpf_jmp_history_entry *hist, struct backtrack_state *bt)
+{
+ struct bpf_insn *insn = env->prog->insnsi + idx;
+ u8 class = BPF_CLASS(insn->code);
+ u8 opcode = BPF_OP(insn->code);
+ u8 mode = BPF_MODE(insn->code);
+ u32 dreg = insn->dst_reg;
+ u32 sreg = insn->src_reg;
+ u32 spi, i, fr;
+
+ if (insn->code == 0)
+ return 0;
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ fmt_reg_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_reg_mask(bt));
+ verbose(env, "mark_precise: frame%d: regs=%s ",
+ bt->frame, env->tmp_str_buf);
+ bpf_fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_stack_mask(bt));
+ verbose(env, "stack=%s before ", env->tmp_str_buf);
+ verbose(env, "%d: ", idx);
+ verbose_insn(env, insn);
+ }
+
+ /* If there is a history record that some registers gained range at this insn,
+ * propagate precision marks to those registers, so that bt_is_reg_set()
+ * accounts for these registers.
+ */
+ bt_sync_linked_regs(bt, hist);
+
+ if (class == BPF_ALU || class == BPF_ALU64) {
+ if (!bt_is_reg_set(bt, dreg))
+ return 0;
+ if (opcode == BPF_END || opcode == BPF_NEG) {
+ /* sreg is reserved and unused
+ * dreg still need precision before this insn
+ */
+ return 0;
+ } else if (opcode == BPF_MOV) {
+ if (BPF_SRC(insn->code) == BPF_X) {
+ /* dreg = sreg or dreg = (s8, s16, s32)sreg
+ * dreg needs precision after this insn
+ * sreg needs precision before this insn
+ */
+ bt_clear_reg(bt, dreg);
+ if (sreg != BPF_REG_FP)
+ bt_set_reg(bt, sreg);
+ } else {
+ /* dreg = K
+ * dreg needs precision after this insn.
+ * Corresponding register is already marked
+ * as precise=true in this verifier state.
+ * No further markings in parent are necessary
+ */
+ bt_clear_reg(bt, dreg);
+ }
+ } else {
+ if (BPF_SRC(insn->code) == BPF_X) {
+ /* dreg += sreg
+ * both dreg and sreg need precision
+ * before this insn
+ */
+ if (sreg != BPF_REG_FP)
+ bt_set_reg(bt, sreg);
+ } /* else dreg += K
+ * dreg still needs precision before this insn
+ */
+ }
+ } else if (class == BPF_LDX || is_atomic_load_insn(insn)) {
+ if (!bt_is_reg_set(bt, dreg))
+ return 0;
+ bt_clear_reg(bt, dreg);
+
+ /* scalars can only be spilled into stack w/o losing precision.
+ * Load from any other memory can be zero extended.
+ * The desire to keep that precision is already indicated
+ * by 'precise' mark in corresponding register of this state.
+ * No further tracking necessary.
+ */
+ if (!hist || !(hist->flags & INSN_F_STACK_ACCESS))
+ return 0;
+ /* dreg = *(u64 *)[fp - off] was a fill from the stack.
+ * that [fp - off] slot contains scalar that needs to be
+ * tracked with precision
+ */
+ spi = insn_stack_access_spi(hist->flags);
+ fr = insn_stack_access_frameno(hist->flags);
+ bt_set_frame_slot(bt, fr, spi);
+ } else if (class == BPF_STX || class == BPF_ST) {
+ if (bt_is_reg_set(bt, dreg))
+ /* stx & st shouldn't be using _scalar_ dst_reg
+ * to access memory. It means backtracking
+ * encountered a case of pointer subtraction.
+ */
+ return -ENOTSUPP;
+ /* scalars can only be spilled into stack */
+ if (!hist || !(hist->flags & INSN_F_STACK_ACCESS))
+ return 0;
+ spi = insn_stack_access_spi(hist->flags);
+ fr = insn_stack_access_frameno(hist->flags);
+ if (!bt_is_frame_slot_set(bt, fr, spi))
+ return 0;
+ bt_clear_frame_slot(bt, fr, spi);
+ if (class == BPF_STX)
+ bt_set_reg(bt, sreg);
+ } else if (class == BPF_JMP || class == BPF_JMP32) {
+ if (bpf_pseudo_call(insn)) {
+ int subprog_insn_idx, subprog;
+
+ subprog_insn_idx = idx + insn->imm + 1;
+ subprog = find_subprog(env, subprog_insn_idx);
+ if (subprog < 0)
+ return -EFAULT;
+
+ if (subprog_is_global(env, subprog)) {
+ /* check that jump history doesn't have any
+ * extra instructions from subprog; the next
+ * instruction after call to global subprog
+ * should be literally next instruction in
+ * caller program
+ */
+ verifier_bug_if(idx + 1 != subseq_idx, env,
+ "extra insn from subprog");
+ /* r1-r5 are invalidated after subprog call,
+ * so for global func call it shouldn't be set
+ * anymore
+ */
+ if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) {
+ verifier_bug(env, "global subprog unexpected regs %x",
+ bt_reg_mask(bt));
+ return -EFAULT;
+ }
+ /* global subprog always sets R0 */
+ bt_clear_reg(bt, BPF_REG_0);
+ return 0;
+ } else {
+ /* static subprog call instruction, which
+ * means that we are exiting current subprog,
+ * so only r1-r5 could be still requested as
+ * precise, r0 and r6-r10 or any stack slot in
+ * the current frame should be zero by now
+ */
+ if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) {
+ verifier_bug(env, "static subprog unexpected regs %x",
+ bt_reg_mask(bt));
+ return -EFAULT;
+ }
+ /* we are now tracking register spills correctly,
+ * so any instance of leftover slots is a bug
+ */
+ if (bt_stack_mask(bt) != 0) {
+ verifier_bug(env,
+ "static subprog leftover stack slots %llx",
+ bt_stack_mask(bt));
+ return -EFAULT;
+ }
+ /* propagate r1-r5 to the caller */
+ for (i = BPF_REG_1; i <= BPF_REG_5; i++) {
+ if (bt_is_reg_set(bt, i)) {
+ bt_clear_reg(bt, i);
+ bt_set_frame_reg(bt, bt->frame - 1, i);
+ }
+ }
+ if (bt_subprog_exit(bt))
+ return -EFAULT;
+ return 0;
+ }
+ } else if (is_sync_callback_calling_insn(insn) && idx != subseq_idx - 1) {
+ /* exit from callback subprog to callback-calling helper or
+ * kfunc call. Use idx/subseq_idx check to discern it from
+ * straight line code backtracking.
+ * Unlike the subprog call handling above, we shouldn't
+ * propagate precision of r1-r5 (if any requested), as they are
+ * not actually arguments passed directly to callback subprogs
+ */
+ if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) {
+ verifier_bug(env, "callback unexpected regs %x",
+ bt_reg_mask(bt));
+ return -EFAULT;
+ }
+ if (bt_stack_mask(bt) != 0) {
+ verifier_bug(env, "callback leftover stack slots %llx",
+ bt_stack_mask(bt));
+ return -EFAULT;
+ }
+ /* clear r1-r5 in callback subprog's mask */
+ for (i = BPF_REG_1; i <= BPF_REG_5; i++)
+ bt_clear_reg(bt, i);
+ if (bt_subprog_exit(bt))
+ return -EFAULT;
+ return 0;
+ } else if (opcode == BPF_CALL) {
+ /* kfunc with imm==0 is invalid and fixup_kfunc_call will
+ * catch this error later. Make backtracking conservative
+ * with ENOTSUPP.
+ */
+ if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL && insn->imm == 0)
+ return -ENOTSUPP;
+ /* regular helper call sets R0 */
+ bt_clear_reg(bt, BPF_REG_0);
+ if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) {
+ /* if backtracking was looking for registers R1-R5
+ * they should have been found already.
+ */
+ verifier_bug(env, "backtracking call unexpected regs %x",
+ bt_reg_mask(bt));
+ return -EFAULT;
+ }
+ if (insn->src_reg == BPF_REG_0 && insn->imm == BPF_FUNC_tail_call
+ && subseq_idx - idx != 1) {
+ if (bt_subprog_enter(bt))
+ return -EFAULT;
+ }
+ } else if (opcode == BPF_EXIT) {
+ bool r0_precise;
+
+ /* Backtracking to a nested function call, 'idx' is a part of
+ * the inner frame 'subseq_idx' is a part of the outer frame.
+ * In case of a regular function call, instructions giving
+ * precision to registers R1-R5 should have been found already.
+ * In case of a callback, it is ok to have R1-R5 marked for
+ * backtracking, as these registers are set by the function
+ * invoking callback.
+ */
+ if (subseq_idx >= 0 && bpf_calls_callback(env, subseq_idx))
+ for (i = BPF_REG_1; i <= BPF_REG_5; i++)
+ bt_clear_reg(bt, i);
+ if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) {
+ verifier_bug(env, "backtracking exit unexpected regs %x",
+ bt_reg_mask(bt));
+ return -EFAULT;
+ }
+
+ /* BPF_EXIT in subprog or callback always returns
+ * right after the call instruction, so by checking
+ * whether the instruction at subseq_idx-1 is subprog
+ * call or not we can distinguish actual exit from
+ * *subprog* from exit from *callback*. In the former
+ * case, we need to propagate r0 precision, if
+ * necessary. In the former we never do that.
+ */
+ r0_precise = subseq_idx - 1 >= 0 &&
+ bpf_pseudo_call(&env->prog->insnsi[subseq_idx - 1]) &&
+ bt_is_reg_set(bt, BPF_REG_0);
+
+ bt_clear_reg(bt, BPF_REG_0);
+ if (bt_subprog_enter(bt))
+ return -EFAULT;
+
+ if (r0_precise)
+ bt_set_reg(bt, BPF_REG_0);
+ /* r6-r9 and stack slots will stay set in caller frame
+ * bitmasks until we return back from callee(s)
+ */
+ return 0;
+ } else if (BPF_SRC(insn->code) == BPF_X) {
+ if (!bt_is_reg_set(bt, dreg) && !bt_is_reg_set(bt, sreg))
+ return 0;
+ /* dreg <cond> sreg
+ * Both dreg and sreg need precision before
+ * this insn. If only sreg was marked precise
+ * before it would be equally necessary to
+ * propagate it to dreg.
+ */
+ if (!hist || !(hist->flags & INSN_F_SRC_REG_STACK))
+ bt_set_reg(bt, sreg);
+ if (!hist || !(hist->flags & INSN_F_DST_REG_STACK))
+ bt_set_reg(bt, dreg);
+ } else if (BPF_SRC(insn->code) == BPF_K) {
+ /* dreg <cond> K
+ * Only dreg still needs precision before
+ * this insn, so for the K-based conditional
+ * there is nothing new to be marked.
+ */
+ }
+ } else if (class == BPF_LD) {
+ if (!bt_is_reg_set(bt, dreg))
+ return 0;
+ bt_clear_reg(bt, dreg);
+ /* It's ld_imm64 or ld_abs or ld_ind.
+ * For ld_imm64 no further tracking of precision
+ * into parent is necessary
+ */
+ if (mode == BPF_IND || mode == BPF_ABS)
+ /* to be analyzed */
+ return -ENOTSUPP;
+ }
+ /* Propagate precision marks to linked registers, to account for
+ * registers marked as precise in this function.
+ */
+ bt_sync_linked_regs(bt, hist);
+ return 0;
+}
+
+/* the scalar precision tracking algorithm:
+ * . at the start all registers have precise=false.
+ * . scalar ranges are tracked as normal through alu and jmp insns.
+ * . once precise value of the scalar register is used in:
+ * . ptr + scalar alu
+ * . if (scalar cond K|scalar)
+ * . helper_call(.., scalar, ...) where ARG_CONST is expected
+ * backtrack through the verifier states and mark all registers and
+ * stack slots with spilled constants that these scalar registers
+ * should be precise.
+ * . during state pruning two registers (or spilled stack slots)
+ * are equivalent if both are not precise.
+ *
+ * Note the verifier cannot simply walk register parentage chain,
+ * since many different registers and stack slots could have been
+ * used to compute single precise scalar.
+ *
+ * The approach of starting with precise=true for all registers and then
+ * backtrack to mark a register as not precise when the verifier detects
+ * that program doesn't care about specific value (e.g., when helper
+ * takes register as ARG_ANYTHING parameter) is not safe.
+ *
+ * It's ok to walk single parentage chain of the verifier states.
+ * It's possible that this backtracking will go all the way till 1st insn.
+ * All other branches will be explored for needing precision later.
+ *
+ * The backtracking needs to deal with cases like:
+ * R8=map_value(id=0,off=0,ks=4,vs=1952,imm=0) R9_w=map_value(id=0,off=40,ks=4,vs=1952,imm=0)
+ * r9 -= r8
+ * r5 = r9
+ * if r5 > 0x79f goto pc+7
+ * R5_w=inv(id=0,umax_value=1951,var_off=(0x0; 0x7ff))
+ * r5 += 1
+ * ...
+ * call bpf_perf_event_output#25
+ * where .arg5_type = ARG_CONST_SIZE_OR_ZERO
+ *
+ * and this case:
+ * r6 = 1
+ * call foo // uses callee's r6 inside to compute r0
+ * r0 += r6
+ * if r0 == 0 goto
+ *
+ * to track above reg_mask/stack_mask needs to be independent for each frame.
+ *
+ * Also if parent's curframe > frame where backtracking started,
+ * the verifier need to mark registers in both frames, otherwise callees
+ * may incorrectly prune callers. This is similar to
+ * commit 7640ead93924 ("bpf: verifier: make sure callees don't prune with caller differences")
+ *
+ * For now backtracking falls back into conservative marking.
+ */
+static void mark_all_scalars_precise(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *st)
+{
+ struct bpf_func_state *func;
+ struct bpf_reg_state *reg;
+ int i, j;
+
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ verbose(env, "mark_precise: frame%d: falling back to forcing all scalars precise\n",
+ st->curframe);
+ }
+
+ /* big hammer: mark all scalars precise in this path.
+ * pop_stack may still get !precise scalars.
+ * We also skip current state and go straight to first parent state,
+ * because precision markings in current non-checkpointed state are
+ * not needed. See why in the comment in __mark_chain_precision below.
+ */
+ for (st = st->parent; st; st = st->parent) {
+ for (i = 0; i <= st->curframe; i++) {
+ func = st->frame[i];
+ for (j = 0; j < BPF_REG_FP; j++) {
+ reg = &func->regs[j];
+ if (reg->type != SCALAR_VALUE || reg->precise)
+ continue;
+ reg->precise = true;
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ verbose(env, "force_precise: frame%d: forcing r%d to be precise\n",
+ i, j);
+ }
+ }
+ for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) {
+ if (!is_spilled_reg(&func->stack[j]))
+ continue;
+ reg = &func->stack[j].spilled_ptr;
+ if (reg->type != SCALAR_VALUE || reg->precise)
+ continue;
+ reg->precise = true;
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ verbose(env, "force_precise: frame%d: forcing fp%d to be precise\n",
+ i, -(j + 1) * 8);
+ }
+ }
+ }
+ }
+}
+
+static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
+{
+ struct bpf_func_state *func;
+ struct bpf_reg_state *reg;
+ int i, j;
+
+ for (i = 0; i <= st->curframe; i++) {
+ func = st->frame[i];
+ for (j = 0; j < BPF_REG_FP; j++) {
+ reg = &func->regs[j];
+ if (reg->type != SCALAR_VALUE)
+ continue;
+ reg->precise = false;
+ }
+ for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) {
+ if (!is_spilled_reg(&func->stack[j]))
+ continue;
+ reg = &func->stack[j].spilled_ptr;
+ if (reg->type != SCALAR_VALUE)
+ continue;
+ reg->precise = false;
+ }
+ }
+}
+
+/*
+ * __mark_chain_precision() backtracks BPF program instruction sequence and
+ * chain of verifier states making sure that register *regno* (if regno >= 0)
+ * and/or stack slot *spi* (if spi >= 0) are marked as precisely tracked
+ * SCALARS, as well as any other registers and slots that contribute to
+ * a tracked state of given registers/stack slots, depending on specific BPF
+ * assembly instructions (see backtrack_insns() for exact instruction handling
+ * logic). This backtracking relies on recorded jmp_history and is able to
+ * traverse entire chain of parent states. This process ends only when all the
+ * necessary registers/slots and their transitive dependencies are marked as
+ * precise.
+ *
+ * One important and subtle aspect is that precise marks *do not matter* in
+ * the currently verified state (current state). It is important to understand
+ * why this is the case.
+ *
+ * First, note that current state is the state that is not yet "checkpointed",
+ * i.e., it is not yet put into env->explored_states, and it has no children
+ * states as well. It's ephemeral, and can end up either a) being discarded if
+ * compatible explored state is found at some point or BPF_EXIT instruction is
+ * reached or b) checkpointed and put into env->explored_states, branching out
+ * into one or more children states.
+ *
+ * In the former case, precise markings in current state are completely
+ * ignored by state comparison code (see regsafe() for details). Only
+ * checkpointed ("old") state precise markings are important, and if old
+ * state's register/slot is precise, regsafe() assumes current state's
+ * register/slot as precise and checks value ranges exactly and precisely. If
+ * states turn out to be compatible, current state's necessary precise
+ * markings and any required parent states' precise markings are enforced
+ * after the fact with propagate_precision() logic, after the fact. But it's
+ * important to realize that in this case, even after marking current state
+ * registers/slots as precise, we immediately discard current state. So what
+ * actually matters is any of the precise markings propagated into current
+ * state's parent states, which are always checkpointed (due to b) case above).
+ * As such, for scenario a) it doesn't matter if current state has precise
+ * markings set or not.
+ *
+ * Now, for the scenario b), checkpointing and forking into child(ren)
+ * state(s). Note that before current state gets to checkpointing step, any
+ * processed instruction always assumes precise SCALAR register/slot
+ * knowledge: if precise value or range is useful to prune jump branch, BPF
+ * verifier takes this opportunity enthusiastically. Similarly, when
+ * register's value is used to calculate offset or memory address, exact
+ * knowledge of SCALAR range is assumed, checked, and enforced. So, similar to
+ * what we mentioned above about state comparison ignoring precise markings
+ * during state comparison, BPF verifier ignores and also assumes precise
+ * markings *at will* during instruction verification process. But as verifier
+ * assumes precision, it also propagates any precision dependencies across
+ * parent states, which are not yet finalized, so can be further restricted
+ * based on new knowledge gained from restrictions enforced by their children
+ * states. This is so that once those parent states are finalized, i.e., when
+ * they have no more active children state, state comparison logic in
+ * is_state_visited() would enforce strict and precise SCALAR ranges, if
+ * required for correctness.
+ *
+ * To build a bit more intuition, note also that once a state is checkpointed,
+ * the path we took to get to that state is not important. This is crucial
+ * property for state pruning. When state is checkpointed and finalized at
+ * some instruction index, it can be correctly and safely used to "short
+ * circuit" any *compatible* state that reaches exactly the same instruction
+ * index. I.e., if we jumped to that instruction from a completely different
+ * code path than original finalized state was derived from, it doesn't
+ * matter, current state can be discarded because from that instruction
+ * forward having a compatible state will ensure we will safely reach the
+ * exit. States describe preconditions for further exploration, but completely
+ * forget the history of how we got here.
+ *
+ * This also means that even if we needed precise SCALAR range to get to
+ * finalized state, but from that point forward *that same* SCALAR register is
+ * never used in a precise context (i.e., it's precise value is not needed for
+ * correctness), it's correct and safe to mark such register as "imprecise"
+ * (i.e., precise marking set to false). This is what we rely on when we do
+ * not set precise marking in current state. If no child state requires
+ * precision for any given SCALAR register, it's safe to dictate that it can
+ * be imprecise. If any child state does require this register to be precise,
+ * we'll mark it precise later retroactively during precise markings
+ * propagation from child state to parent states.
+ *
+ * Skipping precise marking setting in current state is a mild version of
+ * relying on the above observation. But we can utilize this property even
+ * more aggressively by proactively forgetting any precise marking in the
+ * current state (which we inherited from the parent state), right before we
+ * checkpoint it and branch off into new child state. This is done by
+ * mark_all_scalars_imprecise() to hopefully get more permissive and generic
+ * finalized states which help in short circuiting more future states.
+ */
+static int __mark_chain_precision(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *starting_state,
+ int regno,
+ bool *changed)
+{
+ struct bpf_verifier_state *st = starting_state;
+ struct backtrack_state *bt = &env->bt;
+ int first_idx = st->first_insn_idx;
+ int last_idx = starting_state->insn_idx;
+ int subseq_idx = -1;
+ struct bpf_func_state *func;
+ bool tmp, skip_first = true;
+ struct bpf_reg_state *reg;
+ int i, fr, err;
+
+ if (!env->bpf_capable)
+ return 0;
+
+ changed = changed ?: &tmp;
+ /* set frame number from which we are starting to backtrack */
+ bt_init(bt, starting_state->curframe);
+
+ /* Do sanity checks against current state of register and/or stack
+ * slot, but don't set precise flag in current state, as precision
+ * tracking in the current state is unnecessary.
+ */
+ func = st->frame[bt->frame];
+ if (regno >= 0) {
+ reg = &func->regs[regno];
+ if (reg->type != SCALAR_VALUE) {
+ verifier_bug(env, "backtracking misuse");
+ return -EFAULT;
+ }
+ bt_set_reg(bt, regno);
+ }
+
+ if (bt_empty(bt))
+ return 0;
+
+ for (;;) {
+ DECLARE_BITMAP(mask, 64);
+ u32 history = st->jmp_history_cnt;
+ struct bpf_jmp_history_entry *hist;
+
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ verbose(env, "mark_precise: frame%d: last_idx %d first_idx %d subseq_idx %d \n",
+ bt->frame, last_idx, first_idx, subseq_idx);
+ }
+
+ if (last_idx < 0) {
+ /* we are at the entry into subprog, which
+ * is expected for global funcs, but only if
+ * requested precise registers are R1-R5
+ * (which are global func's input arguments)
+ */
+ if (st->curframe == 0 &&
+ st->frame[0]->subprogno > 0 &&
+ st->frame[0]->callsite == BPF_MAIN_FUNC &&
+ bt_stack_mask(bt) == 0 &&
+ (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) == 0) {
+ bitmap_from_u64(mask, bt_reg_mask(bt));
+ for_each_set_bit(i, mask, 32) {
+ reg = &st->frame[0]->regs[i];
+ bt_clear_reg(bt, i);
+ if (reg->type == SCALAR_VALUE) {
+ reg->precise = true;
+ *changed = true;
+ }
+ }
+ return 0;
+ }
+
+ verifier_bug(env, "backtracking func entry subprog %d reg_mask %x stack_mask %llx",
+ st->frame[0]->subprogno, bt_reg_mask(bt), bt_stack_mask(bt));
+ return -EFAULT;
+ }
+
+ for (i = last_idx;;) {
+ if (skip_first) {
+ err = 0;
+ skip_first = false;
+ } else {
+ hist = get_jmp_hist_entry(st, history, i);
+ err = backtrack_insn(env, i, subseq_idx, hist, bt);
+ }
+ if (err == -ENOTSUPP) {
+ mark_all_scalars_precise(env, starting_state);
+ bt_reset(bt);
+ return 0;
+ } else if (err) {
+ return err;
+ }
+ if (bt_empty(bt))
+ /* Found assignment(s) into tracked register in this state.
+ * Since this state is already marked, just return.
+ * Nothing to be tracked further in the parent state.
+ */
+ return 0;
+ subseq_idx = i;
+ i = get_prev_insn_idx(st, i, &history);
+ if (i == -ENOENT)
+ break;
+ if (i >= env->prog->len) {
+ /* This can happen if backtracking reached insn 0
+ * and there are still reg_mask or stack_mask
+ * to backtrack.
+ * It means the backtracking missed the spot where
+ * particular register was initialized with a constant.
+ */
+ verifier_bug(env, "backtracking idx %d", i);
+ return -EFAULT;
+ }
+ }
+ st = st->parent;
+ if (!st)
+ break;
+
+ for (fr = bt->frame; fr >= 0; fr--) {
+ func = st->frame[fr];
+ bitmap_from_u64(mask, bt_frame_reg_mask(bt, fr));
+ for_each_set_bit(i, mask, 32) {
+ reg = &func->regs[i];
+ if (reg->type != SCALAR_VALUE) {
+ bt_clear_frame_reg(bt, fr, i);
+ continue;
+ }
+ if (reg->precise) {
+ bt_clear_frame_reg(bt, fr, i);
+ } else {
+ reg->precise = true;
+ *changed = true;
+ }
+ }
+
+ bitmap_from_u64(mask, bt_frame_stack_mask(bt, fr));
+ for_each_set_bit(i, mask, 64) {
+ if (verifier_bug_if(i >= func->allocated_stack / BPF_REG_SIZE,
+ env, "stack slot %d, total slots %d",
+ i, func->allocated_stack / BPF_REG_SIZE))
+ return -EFAULT;
+
+ if (!is_spilled_scalar_reg(&func->stack[i])) {
+ bt_clear_frame_slot(bt, fr, i);
+ continue;
+ }
+ reg = &func->stack[i].spilled_ptr;
+ if (reg->precise) {
+ bt_clear_frame_slot(bt, fr, i);
+ } else {
+ reg->precise = true;
+ *changed = true;
+ }
+ }
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ fmt_reg_mask(env->tmp_str_buf, TMP_STR_BUF_LEN,
+ bt_frame_reg_mask(bt, fr));
+ verbose(env, "mark_precise: frame%d: parent state regs=%s ",
+ fr, env->tmp_str_buf);
+ bpf_fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN,
+ bt_frame_stack_mask(bt, fr));
+ verbose(env, "stack=%s: ", env->tmp_str_buf);
+ print_verifier_state(env, st, fr, true);
+ }
+ }
+
+ if (bt_empty(bt))
+ return 0;
+
+ subseq_idx = first_idx;
+ last_idx = st->last_insn_idx;
+ first_idx = st->first_insn_idx;
+ }
+
+ /* if we still have requested precise regs or slots, we missed
+ * something (e.g., stack access through non-r10 register), so
+ * fallback to marking all precise
+ */
+ if (!bt_empty(bt)) {
+ mark_all_scalars_precise(env, starting_state);
+ bt_reset(bt);
+ }
+
+ return 0;
+}
+
+int mark_chain_precision(struct bpf_verifier_env *env, int regno)
+{
+ return __mark_chain_precision(env, env->cur_state, regno, NULL);
+}
+
+/* mark_chain_precision_batch() assumes that env->bt is set in the caller to
+ * desired reg and stack masks across all relevant frames
+ */
+static int mark_chain_precision_batch(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *starting_state)
+{
+ return __mark_chain_precision(env, starting_state, -1, NULL);
+}
+
+static bool is_spillable_regtype(enum bpf_reg_type type)
+{
+ switch (base_type(type)) {
+ case PTR_TO_MAP_VALUE:
+ case PTR_TO_STACK:
+ case PTR_TO_CTX:
+ case PTR_TO_PACKET:
+ case PTR_TO_PACKET_META:
+ case PTR_TO_PACKET_END:
+ case PTR_TO_FLOW_KEYS:
+ case CONST_PTR_TO_MAP:
+ case PTR_TO_SOCKET:
+ case PTR_TO_SOCK_COMMON:
+ case PTR_TO_TCP_SOCK:
+ case PTR_TO_XDP_SOCK:
+ case PTR_TO_BTF_ID:
+ case PTR_TO_BUF:
+ case PTR_TO_MEM:
+ case PTR_TO_FUNC:
+ case PTR_TO_MAP_KEY:
+ case PTR_TO_ARENA:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/* Does this register contain a constant zero? */
+static bool register_is_null(struct bpf_reg_state *reg)
+{
+ return reg->type == SCALAR_VALUE && tnum_equals_const(reg->var_off, 0);
+}
+
+/* check if register is a constant scalar value */
+static bool is_reg_const(struct bpf_reg_state *reg, bool subreg32)
+{
+ return reg->type == SCALAR_VALUE &&
+ tnum_is_const(subreg32 ? tnum_subreg(reg->var_off) : reg->var_off);
+}
+
+/* assuming is_reg_const() is true, return constant value of a register */
+static u64 reg_const_value(struct bpf_reg_state *reg, bool subreg32)
+{
+ return subreg32 ? tnum_subreg(reg->var_off).value : reg->var_off.value;
+}
+
+static bool __is_pointer_value(bool allow_ptr_leaks,
+ const struct bpf_reg_state *reg)
+{
+ if (allow_ptr_leaks)
+ return false;
+
+ return reg->type != SCALAR_VALUE;
+}
+
+static void assign_scalar_id_before_mov(struct bpf_verifier_env *env,
+ struct bpf_reg_state *src_reg)
+{
+ if (src_reg->type != SCALAR_VALUE)
+ return;
+
+ if (src_reg->id & BPF_ADD_CONST) {
+ /*
+ * The verifier is processing rX = rY insn and
+ * rY->id has special linked register already.
+ * Cleared it, since multiple rX += const are not supported.
+ */
+ src_reg->id = 0;
+ src_reg->off = 0;
+ }
+
+ if (!src_reg->id && !tnum_is_const(src_reg->var_off))
+ /* Ensure that src_reg has a valid ID that will be copied to
+ * dst_reg and then will be used by sync_linked_regs() to
+ * propagate min/max range.
+ */
+ src_reg->id = ++env->id_gen;
+}
+
+/* Copy src state preserving dst->parent and dst->live fields */
+static void copy_register_state(struct bpf_reg_state *dst, const struct bpf_reg_state *src)
+{
+ *dst = *src;
+}
+
+static void save_register_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *state,
+ int spi, struct bpf_reg_state *reg,
+ int size)
+{
+ int i;
+
+ copy_register_state(&state->stack[spi].spilled_ptr, reg);
+
+ for (i = BPF_REG_SIZE; i > BPF_REG_SIZE - size; i--)
+ state->stack[spi].slot_type[i - 1] = STACK_SPILL;
+
+ /* size < 8 bytes spill */
+ for (; i; i--)
+ mark_stack_slot_misc(env, &state->stack[spi].slot_type[i - 1]);
+}
+
+static bool is_bpf_st_mem(struct bpf_insn *insn)
+{
+ return BPF_CLASS(insn->code) == BPF_ST && BPF_MODE(insn->code) == BPF_MEM;
+}
+
+static int get_reg_width(struct bpf_reg_state *reg)
+{
+ return fls64(reg->umax_value);
+}
+
+/* See comment for mark_fastcall_pattern_for_call() */
+static void check_fastcall_stack_contract(struct bpf_verifier_env *env,
+ struct bpf_func_state *state, int insn_idx, int off)
+{
+ struct bpf_subprog_info *subprog = &env->subprog_info[state->subprogno];
+ struct bpf_insn_aux_data *aux = env->insn_aux_data;
+ int i;
+
+ if (subprog->fastcall_stack_off <= off || aux[insn_idx].fastcall_pattern)
+ return;
+ /* access to the region [max_stack_depth .. fastcall_stack_off)
+ * from something that is not a part of the fastcall pattern,
+ * disable fastcall rewrites for current subprogram by setting
+ * fastcall_stack_off to a value smaller than any possible offset.
+ */
+ subprog->fastcall_stack_off = S16_MIN;
+ /* reset fastcall aux flags within subprogram,
+ * happens at most once per subprogram
+ */
+ for (i = subprog->start; i < (subprog + 1)->start; ++i) {
+ aux[i].fastcall_spills_num = 0;
+ aux[i].fastcall_pattern = 0;
+ }
+}
+
+/* check_stack_{read,write}_fixed_off functions track spill/fill of registers,
+ * stack boundary and alignment are checked in check_mem_access()
+ */
+static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
+ /* stack frame we're writing to */
+ struct bpf_func_state *state,
+ int off, int size, int value_regno,
+ int insn_idx)
+{
+ struct bpf_func_state *cur; /* state of the current function */
+ int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err;
+ struct bpf_insn *insn = &env->prog->insnsi[insn_idx];
+ struct bpf_reg_state *reg = NULL;
+ int insn_flags = insn_stack_access_flags(state->frameno, spi);
+
/* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0,
* so it's aligned access and [off, off + size) are within stack limits
*/
+ if (!env->allow_ptr_leaks &&
+ is_spilled_reg(&state->stack[spi]) &&
+ !is_spilled_scalar_reg(&state->stack[spi]) &&
+ size != BPF_REG_SIZE) {
+ verbose(env, "attempt to corrupt spilled pointer on stack\n");
+ return -EACCES;
+ }
+
+ cur = env->cur_state->frame[env->cur_state->curframe];
+ if (value_regno >= 0)
+ reg = &cur->regs[value_regno];
+ if (!env->bypass_spec_v4) {
+ bool sanitize = reg && is_spillable_regtype(reg->type);
+
+ for (i = 0; i < size; i++) {
+ u8 type = state->stack[spi].slot_type[i];
+
+ if (type != STACK_MISC && type != STACK_ZERO) {
+ sanitize = true;
+ break;
+ }
+ }
+
+ if (sanitize)
+ env->insn_aux_data[insn_idx].nospec_result = true;
+ }
+
+ err = destroy_if_dynptr_stack_slot(env, state, spi);
+ if (err)
+ return err;
- if (value_regno >= 0 &&
- (state->regs[value_regno].type == PTR_TO_MAP_VALUE ||
- state->regs[value_regno].type == PTR_TO_STACK ||
- state->regs[value_regno].type == PTR_TO_CTX)) {
+ if (!(off % BPF_REG_SIZE) && size == BPF_REG_SIZE) {
+ /* only mark the slot as written if all 8 bytes were written
+ * otherwise read propagation may incorrectly stop too soon
+ * when stack slots are partially written.
+ * This heuristic means that read propagation will be
+ * conservative, since it will add reg_live_read marks
+ * to stack slots all the way to first state when programs
+ * writes+reads less than 8 bytes
+ */
+ bpf_mark_stack_write(env, state->frameno, BIT(spi));
+ }
+ check_fastcall_stack_contract(env, state, insn_idx, off);
+ mark_stack_slot_scratched(env, spi);
+ if (reg && !(off % BPF_REG_SIZE) && reg->type == SCALAR_VALUE && env->bpf_capable) {
+ bool reg_value_fits;
+
+ reg_value_fits = get_reg_width(reg) <= BITS_PER_BYTE * size;
+ /* Make sure that reg had an ID to build a relation on spill. */
+ if (reg_value_fits)
+ assign_scalar_id_before_mov(env, reg);
+ save_register_state(env, state, spi, reg, size);
+ /* Break the relation on a narrowing spill. */
+ if (!reg_value_fits)
+ state->stack[spi].spilled_ptr.id = 0;
+ } else if (!reg && !(off % BPF_REG_SIZE) && is_bpf_st_mem(insn) &&
+ env->bpf_capable) {
+ struct bpf_reg_state *tmp_reg = &env->fake_reg[0];
+
+ memset(tmp_reg, 0, sizeof(*tmp_reg));
+ __mark_reg_known(tmp_reg, insn->imm);
+ tmp_reg->type = SCALAR_VALUE;
+ save_register_state(env, state, spi, tmp_reg, size);
+ } else if (reg && is_spillable_regtype(reg->type)) {
/* register containing pointer is being spilled into stack */
if (size != BPF_REG_SIZE) {
- verbose("invalid size of register spill\n");
+ verbose_linfo(env, insn_idx, "; ");
+ verbose(env, "invalid size of register spill\n");
return -EACCES;
}
-
- /* save register state */
- state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] =
- state->regs[value_regno];
-
- for (i = 0; i < BPF_REG_SIZE; i++)
- state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_SPILL;
+ if (state != cur && reg->type == PTR_TO_STACK) {
+ verbose(env, "cannot spill pointers to stack into stack frame of the caller\n");
+ return -EINVAL;
+ }
+ save_register_state(env, state, spi, reg, size);
} else {
- /* regular write of data into stack */
- state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] =
- (struct reg_state) {};
+ u8 type = STACK_MISC;
+
+ /* regular write of data into stack destroys any spilled ptr */
+ state->stack[spi].spilled_ptr.type = NOT_INIT;
+ /* Mark slots as STACK_MISC if they belonged to spilled ptr/dynptr/iter. */
+ if (is_stack_slot_special(&state->stack[spi]))
+ for (i = 0; i < BPF_REG_SIZE; i++)
+ scrub_spilled_slot(&state->stack[spi].slot_type[i]);
+
+ /* when we zero initialize stack slots mark them as such */
+ if ((reg && register_is_null(reg)) ||
+ (!reg && is_bpf_st_mem(insn) && insn->imm == 0)) {
+ /* STACK_ZERO case happened because register spill
+ * wasn't properly aligned at the stack slot boundary,
+ * so it's not a register spill anymore; force
+ * originating register to be precise to make
+ * STACK_ZERO correct for subsequent states
+ */
+ err = mark_chain_precision(env, value_regno);
+ if (err)
+ return err;
+ type = STACK_ZERO;
+ }
+ /* Mark slots affected by this stack write. */
for (i = 0; i < size; i++)
- state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC;
+ state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] = type;
+ insn_flags = 0; /* not a register spill */
}
+
+ if (insn_flags)
+ return push_jmp_history(env, env->cur_state, insn_flags, 0);
return 0;
}
-static int check_stack_read(struct verifier_state *state, int off, int size,
- int value_regno)
+/* Write the stack: 'stack[ptr_regno + off] = value_regno'. 'ptr_regno' is
+ * known to contain a variable offset.
+ * This function checks whether the write is permitted and conservatively
+ * tracks the effects of the write, considering that each stack slot in the
+ * dynamic range is potentially written to.
+ *
+ * 'off' includes 'regno->off'.
+ * 'value_regno' can be -1, meaning that an unknown value is being written to
+ * the stack.
+ *
+ * Spilled pointers in range are not marked as written because we don't know
+ * what's going to be actually written. This means that read propagation for
+ * future reads cannot be terminated by this write.
+ *
+ * For privileged programs, uninitialized stack slots are considered
+ * initialized by this write (even though we don't know exactly what offsets
+ * are going to be written to). The idea is that we don't want the verifier to
+ * reject future reads that access slots written to through variable offsets.
+ */
+static int check_stack_write_var_off(struct bpf_verifier_env *env,
+ /* func where register points to */
+ struct bpf_func_state *state,
+ int ptr_regno, int off, int size,
+ int value_regno, int insn_idx)
{
- u8 *slot_type;
- int i;
+ struct bpf_func_state *cur; /* state of the current function */
+ int min_off, max_off;
+ int i, err;
+ struct bpf_reg_state *ptr_reg = NULL, *value_reg = NULL;
+ struct bpf_insn *insn = &env->prog->insnsi[insn_idx];
+ bool writing_zero = false;
+ /* set if the fact that we're writing a zero is used to let any
+ * stack slots remain STACK_ZERO
+ */
+ bool zero_used = false;
+
+ cur = env->cur_state->frame[env->cur_state->curframe];
+ ptr_reg = &cur->regs[ptr_regno];
+ min_off = ptr_reg->smin_value + off;
+ max_off = ptr_reg->smax_value + off + size;
+ if (value_regno >= 0)
+ value_reg = &cur->regs[value_regno];
+ if ((value_reg && register_is_null(value_reg)) ||
+ (!value_reg && is_bpf_st_mem(insn) && insn->imm == 0))
+ writing_zero = true;
+
+ for (i = min_off; i < max_off; i++) {
+ int spi;
+
+ spi = __get_spi(i);
+ err = destroy_if_dynptr_stack_slot(env, state, spi);
+ if (err)
+ return err;
+ }
- slot_type = &state->stack_slot_type[MAX_BPF_STACK + off];
+ check_fastcall_stack_contract(env, state, insn_idx, min_off);
+ /* Variable offset writes destroy any spilled pointers in range. */
+ for (i = min_off; i < max_off; i++) {
+ u8 new_type, *stype;
+ int slot, spi;
+
+ slot = -i - 1;
+ spi = slot / BPF_REG_SIZE;
+ stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE];
+ mark_stack_slot_scratched(env, spi);
+
+ if (!env->allow_ptr_leaks && *stype != STACK_MISC && *stype != STACK_ZERO) {
+ /* Reject the write if range we may write to has not
+ * been initialized beforehand. If we didn't reject
+ * here, the ptr status would be erased below (even
+ * though not all slots are actually overwritten),
+ * possibly opening the door to leaks.
+ *
+ * We do however catch STACK_INVALID case below, and
+ * only allow reading possibly uninitialized memory
+ * later for CAP_PERFMON, as the write may not happen to
+ * that slot.
+ */
+ verbose(env, "spilled ptr in range of var-offset stack write; insn %d, ptr off: %d",
+ insn_idx, i);
+ return -EINVAL;
+ }
- if (slot_type[0] == STACK_SPILL) {
- if (size != BPF_REG_SIZE) {
- verbose("invalid size of register spill\n");
- return -EACCES;
+ /* If writing_zero and the spi slot contains a spill of value 0,
+ * maintain the spill type.
+ */
+ if (writing_zero && *stype == STACK_SPILL &&
+ is_spilled_scalar_reg(&state->stack[spi])) {
+ struct bpf_reg_state *spill_reg = &state->stack[spi].spilled_ptr;
+
+ if (tnum_is_const(spill_reg->var_off) && spill_reg->var_off.value == 0) {
+ zero_used = true;
+ continue;
+ }
+ }
+
+ /* Erase all other spilled pointers. */
+ state->stack[spi].spilled_ptr.type = NOT_INIT;
+
+ /* Update the slot type. */
+ new_type = STACK_MISC;
+ if (writing_zero && *stype == STACK_ZERO) {
+ new_type = STACK_ZERO;
+ zero_used = true;
+ }
+ /* If the slot is STACK_INVALID, we check whether it's OK to
+ * pretend that it will be initialized by this write. The slot
+ * might not actually be written to, and so if we mark it as
+ * initialized future reads might leak uninitialized memory.
+ * For privileged programs, we will accept such reads to slots
+ * that may or may not be written because, if we're reject
+ * them, the error would be too confusing.
+ */
+ if (*stype == STACK_INVALID && !env->allow_uninit_stack) {
+ verbose(env, "uninit stack in range of var-offset write prohibited for !root; insn %d, off: %d",
+ insn_idx, i);
+ return -EINVAL;
}
- for (i = 1; i < BPF_REG_SIZE; i++) {
- if (slot_type[i] != STACK_SPILL) {
- verbose("corrupted spill memory\n");
+ *stype = new_type;
+ }
+ if (zero_used) {
+ /* backtracking doesn't work for STACK_ZERO yet. */
+ err = mark_chain_precision(env, value_regno);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
+/* When register 'dst_regno' is assigned some values from stack[min_off,
+ * max_off), we set the register's type according to the types of the
+ * respective stack slots. If all the stack values are known to be zeros, then
+ * so is the destination reg. Otherwise, the register is considered to be
+ * SCALAR. This function does not deal with register filling; the caller must
+ * ensure that all spilled registers in the stack range have been marked as
+ * read.
+ */
+static void mark_reg_stack_read(struct bpf_verifier_env *env,
+ /* func where src register points to */
+ struct bpf_func_state *ptr_state,
+ int min_off, int max_off, int dst_regno)
+{
+ struct bpf_verifier_state *vstate = env->cur_state;
+ struct bpf_func_state *state = vstate->frame[vstate->curframe];
+ int i, slot, spi;
+ u8 *stype;
+ int zeros = 0;
+
+ for (i = min_off; i < max_off; i++) {
+ slot = -i - 1;
+ spi = slot / BPF_REG_SIZE;
+ mark_stack_slot_scratched(env, spi);
+ stype = ptr_state->stack[spi].slot_type;
+ if (stype[slot % BPF_REG_SIZE] != STACK_ZERO)
+ break;
+ zeros++;
+ }
+ if (zeros == max_off - min_off) {
+ /* Any access_size read into register is zero extended,
+ * so the whole register == const_zero.
+ */
+ __mark_reg_const_zero(env, &state->regs[dst_regno]);
+ } else {
+ /* have read misc data from the stack */
+ mark_reg_unknown(env, state->regs, dst_regno);
+ }
+}
+
+/* Read the stack at 'off' and put the results into the register indicated by
+ * 'dst_regno'. It handles reg filling if the addressed stack slot is a
+ * spilled reg.
+ *
+ * 'dst_regno' can be -1, meaning that the read value is not going to a
+ * register.
+ *
+ * The access is assumed to be within the current stack bounds.
+ */
+static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
+ /* func where src register points to */
+ struct bpf_func_state *reg_state,
+ int off, int size, int dst_regno)
+{
+ struct bpf_verifier_state *vstate = env->cur_state;
+ struct bpf_func_state *state = vstate->frame[vstate->curframe];
+ int i, slot = -off - 1, spi = slot / BPF_REG_SIZE;
+ struct bpf_reg_state *reg;
+ u8 *stype, type;
+ int insn_flags = insn_stack_access_flags(reg_state->frameno, spi);
+ int err;
+
+ stype = reg_state->stack[spi].slot_type;
+ reg = &reg_state->stack[spi].spilled_ptr;
+
+ mark_stack_slot_scratched(env, spi);
+ check_fastcall_stack_contract(env, state, env->insn_idx, off);
+ err = bpf_mark_stack_read(env, reg_state->frameno, env->insn_idx, BIT(spi));
+ if (err)
+ return err;
+
+ if (is_spilled_reg(&reg_state->stack[spi])) {
+ u8 spill_size = 1;
+
+ for (i = BPF_REG_SIZE - 1; i > 0 && stype[i - 1] == STACK_SPILL; i--)
+ spill_size++;
+
+ if (size != BPF_REG_SIZE || spill_size != BPF_REG_SIZE) {
+ if (reg->type != SCALAR_VALUE) {
+ verbose_linfo(env, env->insn_idx, "; ");
+ verbose(env, "invalid size of register fill\n");
return -EACCES;
}
- }
- if (value_regno >= 0)
+ if (dst_regno < 0)
+ return 0;
+
+ if (size <= spill_size &&
+ bpf_stack_narrow_access_ok(off, size, spill_size)) {
+ /* The earlier check_reg_arg() has decided the
+ * subreg_def for this insn. Save it first.
+ */
+ s32 subreg_def = state->regs[dst_regno].subreg_def;
+
+ copy_register_state(&state->regs[dst_regno], reg);
+ state->regs[dst_regno].subreg_def = subreg_def;
+
+ /* Break the relation on a narrowing fill.
+ * coerce_reg_to_size will adjust the boundaries.
+ */
+ if (get_reg_width(reg) > size * BITS_PER_BYTE)
+ state->regs[dst_regno].id = 0;
+ } else {
+ int spill_cnt = 0, zero_cnt = 0;
+
+ for (i = 0; i < size; i++) {
+ type = stype[(slot - i) % BPF_REG_SIZE];
+ if (type == STACK_SPILL) {
+ spill_cnt++;
+ continue;
+ }
+ if (type == STACK_MISC)
+ continue;
+ if (type == STACK_ZERO) {
+ zero_cnt++;
+ continue;
+ }
+ if (type == STACK_INVALID && env->allow_uninit_stack)
+ continue;
+ verbose(env, "invalid read from stack off %d+%d size %d\n",
+ off, i, size);
+ return -EACCES;
+ }
+
+ if (spill_cnt == size &&
+ tnum_is_const(reg->var_off) && reg->var_off.value == 0) {
+ __mark_reg_const_zero(env, &state->regs[dst_regno]);
+ /* this IS register fill, so keep insn_flags */
+ } else if (zero_cnt == size) {
+ /* similarly to mark_reg_stack_read(), preserve zeroes */
+ __mark_reg_const_zero(env, &state->regs[dst_regno]);
+ insn_flags = 0; /* not restoring original register state */
+ } else {
+ mark_reg_unknown(env, state->regs, dst_regno);
+ insn_flags = 0; /* not restoring original register state */
+ }
+ }
+ } else if (dst_regno >= 0) {
/* restore register state from stack */
- state->regs[value_regno] =
- state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE];
- return 0;
+ copy_register_state(&state->regs[dst_regno], reg);
+ /* mark reg as written since spilled pointer state likely
+ * has its liveness marks cleared by is_state_visited()
+ * which resets stack/reg liveness for state transitions
+ */
+ } else if (__is_pointer_value(env->allow_ptr_leaks, reg)) {
+ /* If dst_regno==-1, the caller is asking us whether
+ * it is acceptable to use this value as a SCALAR_VALUE
+ * (e.g. for XADD).
+ * We must not allow unprivileged callers to do that
+ * with spilled pointers.
+ */
+ verbose(env, "leaking pointer from stack off %d\n",
+ off);
+ return -EACCES;
+ }
} else {
for (i = 0; i < size; i++) {
- if (slot_type[i] != STACK_MISC) {
- verbose("invalid read from stack off %d+%d size %d\n",
- off, i, size);
+ type = stype[(slot - i) % BPF_REG_SIZE];
+ if (type == STACK_MISC)
+ continue;
+ if (type == STACK_ZERO)
+ continue;
+ if (type == STACK_INVALID && env->allow_uninit_stack)
+ continue;
+ verbose(env, "invalid read from stack off %d+%d size %d\n",
+ off, i, size);
+ return -EACCES;
+ }
+ if (dst_regno >= 0)
+ mark_reg_stack_read(env, reg_state, off, off + size, dst_regno);
+ insn_flags = 0; /* we are not restoring spilled register */
+ }
+ if (insn_flags)
+ return push_jmp_history(env, env->cur_state, insn_flags, 0);
+ return 0;
+}
+
+enum bpf_access_src {
+ ACCESS_DIRECT = 1, /* the access is performed by an instruction */
+ ACCESS_HELPER = 2, /* the access is performed by a helper */
+};
+
+static int check_stack_range_initialized(struct bpf_verifier_env *env,
+ int regno, int off, int access_size,
+ bool zero_size_allowed,
+ enum bpf_access_type type,
+ struct bpf_call_arg_meta *meta);
+
+static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno)
+{
+ return cur_regs(env) + regno;
+}
+
+/* Read the stack at 'ptr_regno + off' and put the result into the register
+ * 'dst_regno'.
+ * 'off' includes the pointer register's fixed offset(i.e. 'ptr_regno.off'),
+ * but not its variable offset.
+ * 'size' is assumed to be <= reg size and the access is assumed to be aligned.
+ *
+ * As opposed to check_stack_read_fixed_off, this function doesn't deal with
+ * filling registers (i.e. reads of spilled register cannot be detected when
+ * the offset is not fixed). We conservatively mark 'dst_regno' as containing
+ * SCALAR_VALUE. That's why we assert that the 'ptr_regno' has a variable
+ * offset; for a fixed offset check_stack_read_fixed_off should be used
+ * instead.
+ */
+static int check_stack_read_var_off(struct bpf_verifier_env *env,
+ int ptr_regno, int off, int size, int dst_regno)
+{
+ /* The state of the source register. */
+ struct bpf_reg_state *reg = reg_state(env, ptr_regno);
+ struct bpf_func_state *ptr_state = func(env, reg);
+ int err;
+ int min_off, max_off;
+
+ /* Note that we pass a NULL meta, so raw access will not be permitted.
+ */
+ err = check_stack_range_initialized(env, ptr_regno, off, size,
+ false, BPF_READ, NULL);
+ if (err)
+ return err;
+
+ min_off = reg->smin_value + off;
+ max_off = reg->smax_value + off;
+ mark_reg_stack_read(env, ptr_state, min_off, max_off + size, dst_regno);
+ check_fastcall_stack_contract(env, ptr_state, env->insn_idx, min_off);
+ return 0;
+}
+
+/* check_stack_read dispatches to check_stack_read_fixed_off or
+ * check_stack_read_var_off.
+ *
+ * The caller must ensure that the offset falls within the allocated stack
+ * bounds.
+ *
+ * 'dst_regno' is a register which will receive the value from the stack. It
+ * can be -1, meaning that the read value is not going to a register.
+ */
+static int check_stack_read(struct bpf_verifier_env *env,
+ int ptr_regno, int off, int size,
+ int dst_regno)
+{
+ struct bpf_reg_state *reg = reg_state(env, ptr_regno);
+ struct bpf_func_state *state = func(env, reg);
+ int err;
+ /* Some accesses are only permitted with a static offset. */
+ bool var_off = !tnum_is_const(reg->var_off);
+
+ /* The offset is required to be static when reads don't go to a
+ * register, in order to not leak pointers (see
+ * check_stack_read_fixed_off).
+ */
+ if (dst_regno < 0 && var_off) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose(env, "variable offset stack pointer cannot be passed into helper function; var_off=%s off=%d size=%d\n",
+ tn_buf, off, size);
+ return -EACCES;
+ }
+ /* Variable offset is prohibited for unprivileged mode for simplicity
+ * since it requires corresponding support in Spectre masking for stack
+ * ALU. See also retrieve_ptr_limit(). The check in
+ * check_stack_access_for_ptr_arithmetic() called by
+ * adjust_ptr_min_max_vals() prevents users from creating stack pointers
+ * with variable offsets, therefore no check is required here. Further,
+ * just checking it here would be insufficient as speculative stack
+ * writes could still lead to unsafe speculative behaviour.
+ */
+ if (!var_off) {
+ off += reg->var_off.value;
+ err = check_stack_read_fixed_off(env, state, off, size,
+ dst_regno);
+ } else {
+ /* Variable offset stack reads need more conservative handling
+ * than fixed offset ones. Note that dst_regno >= 0 on this
+ * branch.
+ */
+ err = check_stack_read_var_off(env, ptr_regno, off, size,
+ dst_regno);
+ }
+ return err;
+}
+
+
+/* check_stack_write dispatches to check_stack_write_fixed_off or
+ * check_stack_write_var_off.
+ *
+ * 'ptr_regno' is the register used as a pointer into the stack.
+ * 'off' includes 'ptr_regno->off', but not its variable offset (if any).
+ * 'value_regno' is the register whose value we're writing to the stack. It can
+ * be -1, meaning that we're not writing from a register.
+ *
+ * The caller must ensure that the offset falls within the maximum stack size.
+ */
+static int check_stack_write(struct bpf_verifier_env *env,
+ int ptr_regno, int off, int size,
+ int value_regno, int insn_idx)
+{
+ struct bpf_reg_state *reg = reg_state(env, ptr_regno);
+ struct bpf_func_state *state = func(env, reg);
+ int err;
+
+ if (tnum_is_const(reg->var_off)) {
+ off += reg->var_off.value;
+ err = check_stack_write_fixed_off(env, state, off, size,
+ value_regno, insn_idx);
+ } else {
+ /* Variable offset stack reads need more conservative handling
+ * than fixed offset ones.
+ */
+ err = check_stack_write_var_off(env, state,
+ ptr_regno, off, size,
+ value_regno, insn_idx);
+ }
+ return err;
+}
+
+static int check_map_access_type(struct bpf_verifier_env *env, u32 regno,
+ int off, int size, enum bpf_access_type type)
+{
+ struct bpf_reg_state *regs = cur_regs(env);
+ struct bpf_map *map = regs[regno].map_ptr;
+ u32 cap = bpf_map_flags_to_cap(map);
+
+ if (type == BPF_WRITE && !(cap & BPF_MAP_CAN_WRITE)) {
+ verbose(env, "write into map forbidden, value_size=%d off=%d size=%d\n",
+ map->value_size, off, size);
+ return -EACCES;
+ }
+
+ if (type == BPF_READ && !(cap & BPF_MAP_CAN_READ)) {
+ verbose(env, "read from map forbidden, value_size=%d off=%d size=%d\n",
+ map->value_size, off, size);
+ return -EACCES;
+ }
+
+ return 0;
+}
+
+/* check read/write into memory region (e.g., map value, ringbuf sample, etc) */
+static int __check_mem_access(struct bpf_verifier_env *env, int regno,
+ int off, int size, u32 mem_size,
+ bool zero_size_allowed)
+{
+ bool size_ok = size > 0 || (size == 0 && zero_size_allowed);
+ struct bpf_reg_state *reg;
+
+ if (off >= 0 && size_ok && (u64)off + size <= mem_size)
+ return 0;
+
+ reg = &cur_regs(env)[regno];
+ switch (reg->type) {
+ case PTR_TO_MAP_KEY:
+ verbose(env, "invalid access to map key, key_size=%d off=%d size=%d\n",
+ mem_size, off, size);
+ break;
+ case PTR_TO_MAP_VALUE:
+ verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n",
+ mem_size, off, size);
+ break;
+ case PTR_TO_PACKET:
+ case PTR_TO_PACKET_META:
+ case PTR_TO_PACKET_END:
+ verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n",
+ off, size, regno, reg->id, off, mem_size);
+ break;
+ case PTR_TO_MEM:
+ default:
+ verbose(env, "invalid access to memory, mem_size=%u off=%d size=%d\n",
+ mem_size, off, size);
+ }
+
+ return -EACCES;
+}
+
+/* check read/write into a memory region with possible variable offset */
+static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno,
+ int off, int size, u32 mem_size,
+ bool zero_size_allowed)
+{
+ struct bpf_verifier_state *vstate = env->cur_state;
+ struct bpf_func_state *state = vstate->frame[vstate->curframe];
+ struct bpf_reg_state *reg = &state->regs[regno];
+ int err;
+
+ /* We may have adjusted the register pointing to memory region, so we
+ * need to try adding each of min_value and max_value to off
+ * to make sure our theoretical access will be safe.
+ *
+ * The minimum value is only important with signed
+ * comparisons where we can't assume the floor of a
+ * value is 0. If we are using signed variables for our
+ * index'es we need to make sure that whatever we use
+ * will have a set floor within our range.
+ */
+ if (reg->smin_value < 0 &&
+ (reg->smin_value == S64_MIN ||
+ (off + reg->smin_value != (s64)(s32)(off + reg->smin_value)) ||
+ reg->smin_value + off < 0)) {
+ verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
+ regno);
+ return -EACCES;
+ }
+ err = __check_mem_access(env, regno, reg->smin_value + off, size,
+ mem_size, zero_size_allowed);
+ if (err) {
+ verbose(env, "R%d min value is outside of the allowed memory range\n",
+ regno);
+ return err;
+ }
+
+ /* If we haven't set a max value then we need to bail since we can't be
+ * sure we won't do bad things.
+ * If reg->umax_value + off could overflow, treat that as unbounded too.
+ */
+ if (reg->umax_value >= BPF_MAX_VAR_OFF) {
+ verbose(env, "R%d unbounded memory access, make sure to bounds check any such access\n",
+ regno);
+ return -EACCES;
+ }
+ err = __check_mem_access(env, regno, reg->umax_value + off, size,
+ mem_size, zero_size_allowed);
+ if (err) {
+ verbose(env, "R%d max value is outside of the allowed memory range\n",
+ regno);
+ return err;
+ }
+
+ return 0;
+}
+
+static int __check_ptr_off_reg(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg, int regno,
+ bool fixed_off_ok)
+{
+ /* Access to this pointer-typed register or passing it to a helper
+ * is only allowed in its original, unmodified form.
+ */
+
+ if (reg->off < 0) {
+ verbose(env, "negative offset %s ptr R%d off=%d disallowed\n",
+ reg_type_str(env, reg->type), regno, reg->off);
+ return -EACCES;
+ }
+
+ if (!fixed_off_ok && reg->off) {
+ verbose(env, "dereference of modified %s ptr R%d off=%d disallowed\n",
+ reg_type_str(env, reg->type), regno, reg->off);
+ return -EACCES;
+ }
+
+ if (!tnum_is_const(reg->var_off) || reg->var_off.value) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose(env, "variable %s access var_off=%s disallowed\n",
+ reg_type_str(env, reg->type), tn_buf);
+ return -EACCES;
+ }
+
+ return 0;
+}
+
+static int check_ptr_off_reg(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg, int regno)
+{
+ return __check_ptr_off_reg(env, reg, regno, false);
+}
+
+static int map_kptr_match_type(struct bpf_verifier_env *env,
+ struct btf_field *kptr_field,
+ struct bpf_reg_state *reg, u32 regno)
+{
+ const char *targ_name = btf_type_name(kptr_field->kptr.btf, kptr_field->kptr.btf_id);
+ int perm_flags;
+ const char *reg_name = "";
+
+ if (btf_is_kernel(reg->btf)) {
+ perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED | MEM_RCU;
+
+ /* Only unreferenced case accepts untrusted pointers */
+ if (kptr_field->type == BPF_KPTR_UNREF)
+ perm_flags |= PTR_UNTRUSTED;
+ } else {
+ perm_flags = PTR_MAYBE_NULL | MEM_ALLOC;
+ if (kptr_field->type == BPF_KPTR_PERCPU)
+ perm_flags |= MEM_PERCPU;
+ }
+
+ if (base_type(reg->type) != PTR_TO_BTF_ID || (type_flag(reg->type) & ~perm_flags))
+ goto bad_type;
+
+ /* We need to verify reg->type and reg->btf, before accessing reg->btf */
+ reg_name = btf_type_name(reg->btf, reg->btf_id);
+
+ /* For ref_ptr case, release function check should ensure we get one
+ * referenced PTR_TO_BTF_ID, and that its fixed offset is 0. For the
+ * normal store of unreferenced kptr, we must ensure var_off is zero.
+ * Since ref_ptr cannot be accessed directly by BPF insns, checks for
+ * reg->off and reg->ref_obj_id are not needed here.
+ */
+ if (__check_ptr_off_reg(env, reg, regno, true))
+ return -EACCES;
+
+ /* A full type match is needed, as BTF can be vmlinux, module or prog BTF, and
+ * we also need to take into account the reg->off.
+ *
+ * We want to support cases like:
+ *
+ * struct foo {
+ * struct bar br;
+ * struct baz bz;
+ * };
+ *
+ * struct foo *v;
+ * v = func(); // PTR_TO_BTF_ID
+ * val->foo = v; // reg->off is zero, btf and btf_id match type
+ * val->bar = &v->br; // reg->off is still zero, but we need to retry with
+ * // first member type of struct after comparison fails
+ * val->baz = &v->bz; // reg->off is non-zero, so struct needs to be walked
+ * // to match type
+ *
+ * In the kptr_ref case, check_func_arg_reg_off already ensures reg->off
+ * is zero. We must also ensure that btf_struct_ids_match does not walk
+ * the struct to match type against first member of struct, i.e. reject
+ * second case from above. Hence, when type is BPF_KPTR_REF, we set
+ * strict mode to true for type match.
+ */
+ if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off,
+ kptr_field->kptr.btf, kptr_field->kptr.btf_id,
+ kptr_field->type != BPF_KPTR_UNREF))
+ goto bad_type;
+ return 0;
+bad_type:
+ verbose(env, "invalid kptr access, R%d type=%s%s ", regno,
+ reg_type_str(env, reg->type), reg_name);
+ verbose(env, "expected=%s%s", reg_type_str(env, PTR_TO_BTF_ID), targ_name);
+ if (kptr_field->type == BPF_KPTR_UNREF)
+ verbose(env, " or %s%s\n", reg_type_str(env, PTR_TO_BTF_ID | PTR_UNTRUSTED),
+ targ_name);
+ else
+ verbose(env, "\n");
+ return -EINVAL;
+}
+
+static bool in_sleepable(struct bpf_verifier_env *env)
+{
+ return env->cur_state->in_sleepable;
+}
+
+/* The non-sleepable programs and sleepable programs with explicit bpf_rcu_read_lock()
+ * can dereference RCU protected pointers and result is PTR_TRUSTED.
+ */
+static bool in_rcu_cs(struct bpf_verifier_env *env)
+{
+ return env->cur_state->active_rcu_locks ||
+ env->cur_state->active_locks ||
+ !in_sleepable(env);
+}
+
+/* Once GCC supports btf_type_tag the following mechanism will be replaced with tag check */
+BTF_SET_START(rcu_protected_types)
+#ifdef CONFIG_NET
+BTF_ID(struct, prog_test_ref_kfunc)
+#endif
+#ifdef CONFIG_CGROUPS
+BTF_ID(struct, cgroup)
+#endif
+#ifdef CONFIG_BPF_JIT
+BTF_ID(struct, bpf_cpumask)
+#endif
+BTF_ID(struct, task_struct)
+#ifdef CONFIG_CRYPTO
+BTF_ID(struct, bpf_crypto_ctx)
+#endif
+BTF_SET_END(rcu_protected_types)
+
+static bool rcu_protected_object(const struct btf *btf, u32 btf_id)
+{
+ if (!btf_is_kernel(btf))
+ return true;
+ return btf_id_set_contains(&rcu_protected_types, btf_id);
+}
+
+static struct btf_record *kptr_pointee_btf_record(struct btf_field *kptr_field)
+{
+ struct btf_struct_meta *meta;
+
+ if (btf_is_kernel(kptr_field->kptr.btf))
+ return NULL;
+
+ meta = btf_find_struct_meta(kptr_field->kptr.btf,
+ kptr_field->kptr.btf_id);
+
+ return meta ? meta->record : NULL;
+}
+
+static bool rcu_safe_kptr(const struct btf_field *field)
+{
+ const struct btf_field_kptr *kptr = &field->kptr;
+
+ return field->type == BPF_KPTR_PERCPU ||
+ (field->type == BPF_KPTR_REF && rcu_protected_object(kptr->btf, kptr->btf_id));
+}
+
+static u32 btf_ld_kptr_type(struct bpf_verifier_env *env, struct btf_field *kptr_field)
+{
+ struct btf_record *rec;
+ u32 ret;
+
+ ret = PTR_MAYBE_NULL;
+ if (rcu_safe_kptr(kptr_field) && in_rcu_cs(env)) {
+ ret |= MEM_RCU;
+ if (kptr_field->type == BPF_KPTR_PERCPU)
+ ret |= MEM_PERCPU;
+ else if (!btf_is_kernel(kptr_field->kptr.btf))
+ ret |= MEM_ALLOC;
+
+ rec = kptr_pointee_btf_record(kptr_field);
+ if (rec && btf_record_has_field(rec, BPF_GRAPH_NODE))
+ ret |= NON_OWN_REF;
+ } else {
+ ret |= PTR_UNTRUSTED;
+ }
+
+ return ret;
+}
+
+static int mark_uptr_ld_reg(struct bpf_verifier_env *env, u32 regno,
+ struct btf_field *field)
+{
+ struct bpf_reg_state *reg;
+ const struct btf_type *t;
+
+ t = btf_type_by_id(field->kptr.btf, field->kptr.btf_id);
+ mark_reg_known_zero(env, cur_regs(env), regno);
+ reg = reg_state(env, regno);
+ reg->type = PTR_TO_MEM | PTR_MAYBE_NULL;
+ reg->mem_size = t->size;
+ reg->id = ++env->id_gen;
+
+ return 0;
+}
+
+static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
+ int value_regno, int insn_idx,
+ struct btf_field *kptr_field)
+{
+ struct bpf_insn *insn = &env->prog->insnsi[insn_idx];
+ int class = BPF_CLASS(insn->code);
+ struct bpf_reg_state *val_reg;
+ int ret;
+
+ /* Things we already checked for in check_map_access and caller:
+ * - Reject cases where variable offset may touch kptr
+ * - size of access (must be BPF_DW)
+ * - tnum_is_const(reg->var_off)
+ * - kptr_field->offset == off + reg->var_off.value
+ */
+ /* Only BPF_[LDX,STX,ST] | BPF_MEM | BPF_DW is supported */
+ if (BPF_MODE(insn->code) != BPF_MEM) {
+ verbose(env, "kptr in map can only be accessed using BPF_MEM instruction mode\n");
+ return -EACCES;
+ }
+
+ /* We only allow loading referenced kptr, since it will be marked as
+ * untrusted, similar to unreferenced kptr.
+ */
+ if (class != BPF_LDX &&
+ (kptr_field->type == BPF_KPTR_REF || kptr_field->type == BPF_KPTR_PERCPU)) {
+ verbose(env, "store to referenced kptr disallowed\n");
+ return -EACCES;
+ }
+ if (class != BPF_LDX && kptr_field->type == BPF_UPTR) {
+ verbose(env, "store to uptr disallowed\n");
+ return -EACCES;
+ }
+
+ if (class == BPF_LDX) {
+ if (kptr_field->type == BPF_UPTR)
+ return mark_uptr_ld_reg(env, value_regno, kptr_field);
+
+ /* We can simply mark the value_regno receiving the pointer
+ * value from map as PTR_TO_BTF_ID, with the correct type.
+ */
+ ret = mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID,
+ kptr_field->kptr.btf, kptr_field->kptr.btf_id,
+ btf_ld_kptr_type(env, kptr_field));
+ if (ret < 0)
+ return ret;
+ } else if (class == BPF_STX) {
+ val_reg = reg_state(env, value_regno);
+ if (!register_is_null(val_reg) &&
+ map_kptr_match_type(env, kptr_field, val_reg, value_regno))
+ return -EACCES;
+ } else if (class == BPF_ST) {
+ if (insn->imm) {
+ verbose(env, "BPF_ST imm must be 0 when storing to kptr at off=%u\n",
+ kptr_field->offset);
+ return -EACCES;
+ }
+ } else {
+ verbose(env, "kptr in map can only be accessed using BPF_LDX/BPF_STX/BPF_ST\n");
+ return -EACCES;
+ }
+ return 0;
+}
+
+/*
+ * Return the size of the memory region accessible from a pointer to map value.
+ * For INSN_ARRAY maps whole bpf_insn_array->ips array is accessible.
+ */
+static u32 map_mem_size(const struct bpf_map *map)
+{
+ if (map->map_type == BPF_MAP_TYPE_INSN_ARRAY)
+ return map->max_entries * sizeof(long);
+
+ return map->value_size;
+}
+
+/* check read/write into a map element with possible variable offset */
+static int check_map_access(struct bpf_verifier_env *env, u32 regno,
+ int off, int size, bool zero_size_allowed,
+ enum bpf_access_src src)
+{
+ struct bpf_verifier_state *vstate = env->cur_state;
+ struct bpf_func_state *state = vstate->frame[vstate->curframe];
+ struct bpf_reg_state *reg = &state->regs[regno];
+ struct bpf_map *map = reg->map_ptr;
+ u32 mem_size = map_mem_size(map);
+ struct btf_record *rec;
+ int err, i;
+
+ err = check_mem_region_access(env, regno, off, size, mem_size, zero_size_allowed);
+ if (err)
+ return err;
+
+ if (IS_ERR_OR_NULL(map->record))
+ return 0;
+ rec = map->record;
+ for (i = 0; i < rec->cnt; i++) {
+ struct btf_field *field = &rec->fields[i];
+ u32 p = field->offset;
+
+ /* If any part of a field can be touched by load/store, reject
+ * this program. To check that [x1, x2) overlaps with [y1, y2),
+ * it is sufficient to check x1 < y2 && y1 < x2.
+ */
+ if (reg->smin_value + off < p + field->size &&
+ p < reg->umax_value + off + size) {
+ switch (field->type) {
+ case BPF_KPTR_UNREF:
+ case BPF_KPTR_REF:
+ case BPF_KPTR_PERCPU:
+ case BPF_UPTR:
+ if (src != ACCESS_DIRECT) {
+ verbose(env, "%s cannot be accessed indirectly by helper\n",
+ btf_field_type_name(field->type));
+ return -EACCES;
+ }
+ if (!tnum_is_const(reg->var_off)) {
+ verbose(env, "%s access cannot have variable offset\n",
+ btf_field_type_name(field->type));
+ return -EACCES;
+ }
+ if (p != off + reg->var_off.value) {
+ verbose(env, "%s access misaligned expected=%u off=%llu\n",
+ btf_field_type_name(field->type),
+ p, off + reg->var_off.value);
+ return -EACCES;
+ }
+ if (size != bpf_size_to_bytes(BPF_DW)) {
+ verbose(env, "%s access size must be BPF_DW\n",
+ btf_field_type_name(field->type));
+ return -EACCES;
+ }
+ break;
+ default:
+ verbose(env, "%s cannot be accessed directly by load/store\n",
+ btf_field_type_name(field->type));
return -EACCES;
}
}
- if (value_regno >= 0)
- /* have read misc data from the stack */
- mark_reg_unknown_value(state->regs, value_regno);
- return 0;
}
+ return 0;
}
-/* check read/write into map element returned by bpf_map_lookup_elem() */
-static int check_map_access(struct verifier_env *env, u32 regno, int off,
- int size)
+#define MAX_PACKET_OFF 0xffff
+
+static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
+ const struct bpf_call_arg_meta *meta,
+ enum bpf_access_type t)
{
- struct bpf_map *map = env->cur_state.regs[regno].map_ptr;
+ enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
+
+ switch (prog_type) {
+ /* Program types only with direct read access go here! */
+ case BPF_PROG_TYPE_LWT_IN:
+ case BPF_PROG_TYPE_LWT_OUT:
+ case BPF_PROG_TYPE_LWT_SEG6LOCAL:
+ case BPF_PROG_TYPE_SK_REUSEPORT:
+ case BPF_PROG_TYPE_FLOW_DISSECTOR:
+ case BPF_PROG_TYPE_CGROUP_SKB:
+ if (t == BPF_WRITE)
+ return false;
+ fallthrough;
- if (off < 0 || off + size > map->value_size) {
- verbose("invalid access to map value, value_size=%d off=%d size=%d\n",
- map->value_size, off, size);
+ /* Program types with direct read + write access go here! */
+ case BPF_PROG_TYPE_SCHED_CLS:
+ case BPF_PROG_TYPE_SCHED_ACT:
+ case BPF_PROG_TYPE_XDP:
+ case BPF_PROG_TYPE_LWT_XMIT:
+ case BPF_PROG_TYPE_SK_SKB:
+ case BPF_PROG_TYPE_SK_MSG:
+ if (meta)
+ return meta->pkt_access;
+
+ env->seen_direct_write = true;
+ return true;
+
+ case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+ if (t == BPF_WRITE)
+ env->seen_direct_write = true;
+
+ return true;
+
+ default:
+ return false;
+ }
+}
+
+static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
+ int size, bool zero_size_allowed)
+{
+ struct bpf_reg_state *regs = cur_regs(env);
+ struct bpf_reg_state *reg = &regs[regno];
+ int err;
+
+ /* We may have added a variable offset to the packet pointer; but any
+ * reg->range we have comes after that. We are only checking the fixed
+ * offset.
+ */
+
+ /* We don't allow negative numbers, because we aren't tracking enough
+ * detail to prove they're safe.
+ */
+ if (reg->smin_value < 0) {
+ verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
+ regno);
+ return -EACCES;
+ }
+
+ err = reg->range < 0 ? -EINVAL :
+ __check_mem_access(env, regno, off, size, reg->range,
+ zero_size_allowed);
+ if (err) {
+ verbose(env, "R%d offset is outside of the packet\n", regno);
+ return err;
+ }
+
+ /* __check_mem_access has made sure "off + size - 1" is within u16.
+ * reg->umax_value can't be bigger than MAX_PACKET_OFF which is 0xffff,
+ * otherwise find_good_pkt_pointers would have refused to set range info
+ * that __check_mem_access would have rejected this pkt access.
+ * Therefore, "off + reg->umax_value + size - 1" won't overflow u32.
+ */
+ env->prog->aux->max_pkt_offset =
+ max_t(u32, env->prog->aux->max_pkt_offset,
+ off + reg->umax_value + size - 1);
+
+ return err;
+}
+
+/* check access to 'struct bpf_context' fields. Supports fixed offsets only */
+static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size,
+ enum bpf_access_type t, struct bpf_insn_access_aux *info)
+{
+ if (env->ops->is_valid_access &&
+ env->ops->is_valid_access(off, size, t, env->prog, info)) {
+ /* A non zero info.ctx_field_size indicates that this field is a
+ * candidate for later verifier transformation to load the whole
+ * field and then apply a mask when accessed with a narrower
+ * access than actual ctx access size. A zero info.ctx_field_size
+ * will only allow for whole field access and rejects any other
+ * type of narrower access.
+ */
+ if (base_type(info->reg_type) == PTR_TO_BTF_ID) {
+ if (info->ref_obj_id &&
+ !find_reference_state(env->cur_state, info->ref_obj_id)) {
+ verbose(env, "invalid bpf_context access off=%d. Reference may already be released\n",
+ off);
+ return -EACCES;
+ }
+ } else {
+ env->insn_aux_data[insn_idx].ctx_field_size = info->ctx_field_size;
+ }
+ /* remember the offset of last byte accessed in ctx */
+ if (env->prog->aux->max_ctx_offset < off + size)
+ env->prog->aux->max_ctx_offset = off + size;
+ return 0;
+ }
+
+ verbose(env, "invalid bpf_context access off=%d size=%d\n", off, size);
+ return -EACCES;
+}
+
+static int check_flow_keys_access(struct bpf_verifier_env *env, int off,
+ int size)
+{
+ if (size < 0 || off < 0 ||
+ (u64)off + size > sizeof(struct bpf_flow_keys)) {
+ verbose(env, "invalid access to flow keys off=%d size=%d\n",
+ off, size);
return -EACCES;
}
return 0;
}
-/* check access to 'struct bpf_context' fields */
-static int check_ctx_access(struct verifier_env *env, int off, int size,
- enum bpf_access_type t)
+static int check_sock_access(struct bpf_verifier_env *env, int insn_idx,
+ u32 regno, int off, int size,
+ enum bpf_access_type t)
{
- if (env->prog->aux->ops->is_valid_access &&
- env->prog->aux->ops->is_valid_access(off, size, t))
+ struct bpf_reg_state *regs = cur_regs(env);
+ struct bpf_reg_state *reg = &regs[regno];
+ struct bpf_insn_access_aux info = {};
+ bool valid;
+
+ if (reg->smin_value < 0) {
+ verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
+ regno);
+ return -EACCES;
+ }
+
+ switch (reg->type) {
+ case PTR_TO_SOCK_COMMON:
+ valid = bpf_sock_common_is_valid_access(off, size, t, &info);
+ break;
+ case PTR_TO_SOCKET:
+ valid = bpf_sock_is_valid_access(off, size, t, &info);
+ break;
+ case PTR_TO_TCP_SOCK:
+ valid = bpf_tcp_sock_is_valid_access(off, size, t, &info);
+ break;
+ case PTR_TO_XDP_SOCK:
+ valid = bpf_xdp_sock_is_valid_access(off, size, t, &info);
+ break;
+ default:
+ valid = false;
+ }
+
+
+ if (valid) {
+ env->insn_aux_data[insn_idx].ctx_field_size =
+ info.ctx_field_size;
return 0;
+ }
+
+ verbose(env, "R%d invalid %s access off=%d size=%d\n",
+ regno, reg_type_str(env, reg->type), off, size);
- verbose("invalid bpf_context access off=%d size=%d\n", off, size);
return -EACCES;
}
+static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
+{
+ return __is_pointer_value(env->allow_ptr_leaks, reg_state(env, regno));
+}
+
+static bool is_ctx_reg(struct bpf_verifier_env *env, int regno)
+{
+ const struct bpf_reg_state *reg = reg_state(env, regno);
+
+ return reg->type == PTR_TO_CTX;
+}
+
+static bool is_sk_reg(struct bpf_verifier_env *env, int regno)
+{
+ const struct bpf_reg_state *reg = reg_state(env, regno);
+
+ return type_is_sk_pointer(reg->type);
+}
+
+static bool is_pkt_reg(struct bpf_verifier_env *env, int regno)
+{
+ const struct bpf_reg_state *reg = reg_state(env, regno);
+
+ return type_is_pkt_pointer(reg->type);
+}
+
+static bool is_flow_key_reg(struct bpf_verifier_env *env, int regno)
+{
+ const struct bpf_reg_state *reg = reg_state(env, regno);
+
+ /* Separate to is_ctx_reg() since we still want to allow BPF_ST here. */
+ return reg->type == PTR_TO_FLOW_KEYS;
+}
+
+static bool is_arena_reg(struct bpf_verifier_env *env, int regno)
+{
+ const struct bpf_reg_state *reg = reg_state(env, regno);
+
+ return reg->type == PTR_TO_ARENA;
+}
+
+/* Return false if @regno contains a pointer whose type isn't supported for
+ * atomic instruction @insn.
+ */
+static bool atomic_ptr_type_ok(struct bpf_verifier_env *env, int regno,
+ struct bpf_insn *insn)
+{
+ if (is_ctx_reg(env, regno))
+ return false;
+ if (is_pkt_reg(env, regno))
+ return false;
+ if (is_flow_key_reg(env, regno))
+ return false;
+ if (is_sk_reg(env, regno))
+ return false;
+ if (is_arena_reg(env, regno))
+ return bpf_jit_supports_insn(insn, true);
+
+ return true;
+}
+
+static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = {
+#ifdef CONFIG_NET
+ [PTR_TO_SOCKET] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK],
+ [PTR_TO_SOCK_COMMON] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON],
+ [PTR_TO_TCP_SOCK] = &btf_sock_ids[BTF_SOCK_TYPE_TCP],
+#endif
+ [CONST_PTR_TO_MAP] = btf_bpf_map_id,
+};
+
+static bool is_trusted_reg(const struct bpf_reg_state *reg)
+{
+ /* A referenced register is always trusted. */
+ if (reg->ref_obj_id)
+ return true;
+
+ /* Types listed in the reg2btf_ids are always trusted */
+ if (reg2btf_ids[base_type(reg->type)] &&
+ !bpf_type_has_unsafe_modifiers(reg->type))
+ return true;
+
+ /* If a register is not referenced, it is trusted if it has the
+ * MEM_ALLOC or PTR_TRUSTED type modifiers, and no others. Some of the
+ * other type modifiers may be safe, but we elect to take an opt-in
+ * approach here as some (e.g. PTR_UNTRUSTED and PTR_MAYBE_NULL) are
+ * not.
+ *
+ * Eventually, we should make PTR_TRUSTED the single source of truth
+ * for whether a register is trusted.
+ */
+ return type_flag(reg->type) & BPF_REG_TRUSTED_MODIFIERS &&
+ !bpf_type_has_unsafe_modifiers(reg->type);
+}
+
+static bool is_rcu_reg(const struct bpf_reg_state *reg)
+{
+ return reg->type & MEM_RCU;
+}
+
+static void clear_trusted_flags(enum bpf_type_flag *flag)
+{
+ *flag &= ~(BPF_REG_TRUSTED_MODIFIERS | MEM_RCU);
+}
+
+static int check_pkt_ptr_alignment(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg,
+ int off, int size, bool strict)
+{
+ struct tnum reg_off;
+ int ip_align;
+
+ /* Byte size accesses are always allowed. */
+ if (!strict || size == 1)
+ return 0;
+
+ /* For platforms that do not have a Kconfig enabling
+ * CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS the value of
+ * NET_IP_ALIGN is universally set to '2'. And on platforms
+ * that do set CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS, we get
+ * to this code only in strict mode where we want to emulate
+ * the NET_IP_ALIGN==2 checking. Therefore use an
+ * unconditional IP align value of '2'.
+ */
+ ip_align = 2;
+
+ reg_off = tnum_add(reg->var_off, tnum_const(ip_align + reg->off + off));
+ if (!tnum_is_aligned(reg_off, size)) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose(env,
+ "misaligned packet access off %d+%s+%d+%d size %d\n",
+ ip_align, tn_buf, reg->off, off, size);
+ return -EACCES;
+ }
+
+ return 0;
+}
+
+static int check_generic_ptr_alignment(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg,
+ const char *pointer_desc,
+ int off, int size, bool strict)
+{
+ struct tnum reg_off;
+
+ /* Byte size accesses are always allowed. */
+ if (!strict || size == 1)
+ return 0;
+
+ reg_off = tnum_add(reg->var_off, tnum_const(reg->off + off));
+ if (!tnum_is_aligned(reg_off, size)) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose(env, "misaligned %saccess off %s+%d+%d size %d\n",
+ pointer_desc, tn_buf, reg->off, off, size);
+ return -EACCES;
+ }
+
+ return 0;
+}
+
+static int check_ptr_alignment(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg, int off,
+ int size, bool strict_alignment_once)
+{
+ bool strict = env->strict_alignment || strict_alignment_once;
+ const char *pointer_desc = "";
+
+ switch (reg->type) {
+ case PTR_TO_PACKET:
+ case PTR_TO_PACKET_META:
+ /* Special case, because of NET_IP_ALIGN. Given metadata sits
+ * right in front, treat it the very same way.
+ */
+ return check_pkt_ptr_alignment(env, reg, off, size, strict);
+ case PTR_TO_FLOW_KEYS:
+ pointer_desc = "flow keys ";
+ break;
+ case PTR_TO_MAP_KEY:
+ pointer_desc = "key ";
+ break;
+ case PTR_TO_MAP_VALUE:
+ pointer_desc = "value ";
+ if (reg->map_ptr->map_type == BPF_MAP_TYPE_INSN_ARRAY)
+ strict = true;
+ break;
+ case PTR_TO_CTX:
+ pointer_desc = "context ";
+ break;
+ case PTR_TO_STACK:
+ pointer_desc = "stack ";
+ /* The stack spill tracking logic in check_stack_write_fixed_off()
+ * and check_stack_read_fixed_off() relies on stack accesses being
+ * aligned.
+ */
+ strict = true;
+ break;
+ case PTR_TO_SOCKET:
+ pointer_desc = "sock ";
+ break;
+ case PTR_TO_SOCK_COMMON:
+ pointer_desc = "sock_common ";
+ break;
+ case PTR_TO_TCP_SOCK:
+ pointer_desc = "tcp_sock ";
+ break;
+ case PTR_TO_XDP_SOCK:
+ pointer_desc = "xdp_sock ";
+ break;
+ case PTR_TO_ARENA:
+ return 0;
+ default:
+ break;
+ }
+ return check_generic_ptr_alignment(env, reg, pointer_desc, off, size,
+ strict);
+}
+
+static enum priv_stack_mode bpf_enable_priv_stack(struct bpf_prog *prog)
+{
+ if (!bpf_jit_supports_private_stack())
+ return NO_PRIV_STACK;
+
+ /* bpf_prog_check_recur() checks all prog types that use bpf trampoline
+ * while kprobe/tp/perf_event/raw_tp don't use trampoline hence checked
+ * explicitly.
+ */
+ switch (prog->type) {
+ case BPF_PROG_TYPE_KPROBE:
+ case BPF_PROG_TYPE_TRACEPOINT:
+ case BPF_PROG_TYPE_PERF_EVENT:
+ case BPF_PROG_TYPE_RAW_TRACEPOINT:
+ return PRIV_STACK_ADAPTIVE;
+ case BPF_PROG_TYPE_TRACING:
+ case BPF_PROG_TYPE_LSM:
+ case BPF_PROG_TYPE_STRUCT_OPS:
+ if (prog->aux->priv_stack_requested || bpf_prog_check_recur(prog))
+ return PRIV_STACK_ADAPTIVE;
+ fallthrough;
+ default:
+ break;
+ }
+
+ return NO_PRIV_STACK;
+}
+
+static int round_up_stack_depth(struct bpf_verifier_env *env, int stack_depth)
+{
+ if (env->prog->jit_requested)
+ return round_up(stack_depth, 16);
+
+ /* round up to 32-bytes, since this is granularity
+ * of interpreter stack size
+ */
+ return round_up(max_t(u32, stack_depth, 1), 32);
+}
+
+/* starting from main bpf function walk all instructions of the function
+ * and recursively walk all callees that given function can call.
+ * Ignore jump and exit insns.
+ * Since recursion is prevented by check_cfg() this algorithm
+ * only needs a local stack of MAX_CALL_FRAMES to remember callsites
+ */
+static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx,
+ bool priv_stack_supported)
+{
+ struct bpf_subprog_info *subprog = env->subprog_info;
+ struct bpf_insn *insn = env->prog->insnsi;
+ int depth = 0, frame = 0, i, subprog_end, subprog_depth;
+ bool tail_call_reachable = false;
+ int ret_insn[MAX_CALL_FRAMES];
+ int ret_prog[MAX_CALL_FRAMES];
+ int j;
+
+ i = subprog[idx].start;
+ if (!priv_stack_supported)
+ subprog[idx].priv_stack_mode = NO_PRIV_STACK;
+process_func:
+ /* protect against potential stack overflow that might happen when
+ * bpf2bpf calls get combined with tailcalls. Limit the caller's stack
+ * depth for such case down to 256 so that the worst case scenario
+ * would result in 8k stack size (32 which is tailcall limit * 256 =
+ * 8k).
+ *
+ * To get the idea what might happen, see an example:
+ * func1 -> sub rsp, 128
+ * subfunc1 -> sub rsp, 256
+ * tailcall1 -> add rsp, 256
+ * func2 -> sub rsp, 192 (total stack size = 128 + 192 = 320)
+ * subfunc2 -> sub rsp, 64
+ * subfunc22 -> sub rsp, 128
+ * tailcall2 -> add rsp, 128
+ * func3 -> sub rsp, 32 (total stack size 128 + 192 + 64 + 32 = 416)
+ *
+ * tailcall will unwind the current stack frame but it will not get rid
+ * of caller's stack as shown on the example above.
+ */
+ if (idx && subprog[idx].has_tail_call && depth >= 256) {
+ verbose(env,
+ "tail_calls are not allowed when call stack of previous frames is %d bytes. Too large\n",
+ depth);
+ return -EACCES;
+ }
+
+ subprog_depth = round_up_stack_depth(env, subprog[idx].stack_depth);
+ if (priv_stack_supported) {
+ /* Request private stack support only if the subprog stack
+ * depth is no less than BPF_PRIV_STACK_MIN_SIZE. This is to
+ * avoid jit penalty if the stack usage is small.
+ */
+ if (subprog[idx].priv_stack_mode == PRIV_STACK_UNKNOWN &&
+ subprog_depth >= BPF_PRIV_STACK_MIN_SIZE)
+ subprog[idx].priv_stack_mode = PRIV_STACK_ADAPTIVE;
+ }
+
+ if (subprog[idx].priv_stack_mode == PRIV_STACK_ADAPTIVE) {
+ if (subprog_depth > MAX_BPF_STACK) {
+ verbose(env, "stack size of subprog %d is %d. Too large\n",
+ idx, subprog_depth);
+ return -EACCES;
+ }
+ } else {
+ depth += subprog_depth;
+ if (depth > MAX_BPF_STACK) {
+ verbose(env, "combined stack size of %d calls is %d. Too large\n",
+ frame + 1, depth);
+ return -EACCES;
+ }
+ }
+continue_func:
+ subprog_end = subprog[idx + 1].start;
+ for (; i < subprog_end; i++) {
+ int next_insn, sidx;
+
+ if (bpf_pseudo_kfunc_call(insn + i) && !insn[i].off) {
+ bool err = false;
+
+ if (!is_bpf_throw_kfunc(insn + i))
+ continue;
+ if (subprog[idx].is_cb)
+ err = true;
+ for (int c = 0; c < frame && !err; c++) {
+ if (subprog[ret_prog[c]].is_cb) {
+ err = true;
+ break;
+ }
+ }
+ if (!err)
+ continue;
+ verbose(env,
+ "bpf_throw kfunc (insn %d) cannot be called from callback subprog %d\n",
+ i, idx);
+ return -EINVAL;
+ }
+
+ if (!bpf_pseudo_call(insn + i) && !bpf_pseudo_func(insn + i))
+ continue;
+ /* remember insn and function to return to */
+ ret_insn[frame] = i + 1;
+ ret_prog[frame] = idx;
+
+ /* find the callee */
+ next_insn = i + insn[i].imm + 1;
+ sidx = find_subprog(env, next_insn);
+ if (verifier_bug_if(sidx < 0, env, "callee not found at insn %d", next_insn))
+ return -EFAULT;
+ if (subprog[sidx].is_async_cb) {
+ if (subprog[sidx].has_tail_call) {
+ verifier_bug(env, "subprog has tail_call and async cb");
+ return -EFAULT;
+ }
+ /* async callbacks don't increase bpf prog stack size unless called directly */
+ if (!bpf_pseudo_call(insn + i))
+ continue;
+ if (subprog[sidx].is_exception_cb) {
+ verbose(env, "insn %d cannot call exception cb directly", i);
+ return -EINVAL;
+ }
+ }
+ i = next_insn;
+ idx = sidx;
+ if (!priv_stack_supported)
+ subprog[idx].priv_stack_mode = NO_PRIV_STACK;
+
+ if (subprog[idx].has_tail_call)
+ tail_call_reachable = true;
+
+ frame++;
+ if (frame >= MAX_CALL_FRAMES) {
+ verbose(env, "the call stack of %d frames is too deep !\n",
+ frame);
+ return -E2BIG;
+ }
+ goto process_func;
+ }
+ /* if tail call got detected across bpf2bpf calls then mark each of the
+ * currently present subprog frames as tail call reachable subprogs;
+ * this info will be utilized by JIT so that we will be preserving the
+ * tail call counter throughout bpf2bpf calls combined with tailcalls
+ */
+ if (tail_call_reachable)
+ for (j = 0; j < frame; j++) {
+ if (subprog[ret_prog[j]].is_exception_cb) {
+ verbose(env, "cannot tail call within exception cb\n");
+ return -EINVAL;
+ }
+ subprog[ret_prog[j]].tail_call_reachable = true;
+ }
+ if (subprog[0].tail_call_reachable)
+ env->prog->aux->tail_call_reachable = true;
+
+ /* end of for() loop means the last insn of the 'subprog'
+ * was reached. Doesn't matter whether it was JA or EXIT
+ */
+ if (frame == 0)
+ return 0;
+ if (subprog[idx].priv_stack_mode != PRIV_STACK_ADAPTIVE)
+ depth -= round_up_stack_depth(env, subprog[idx].stack_depth);
+ frame--;
+ i = ret_insn[frame];
+ idx = ret_prog[frame];
+ goto continue_func;
+}
+
+static int check_max_stack_depth(struct bpf_verifier_env *env)
+{
+ enum priv_stack_mode priv_stack_mode = PRIV_STACK_UNKNOWN;
+ struct bpf_subprog_info *si = env->subprog_info;
+ bool priv_stack_supported;
+ int ret;
+
+ for (int i = 0; i < env->subprog_cnt; i++) {
+ if (si[i].has_tail_call) {
+ priv_stack_mode = NO_PRIV_STACK;
+ break;
+ }
+ }
+
+ if (priv_stack_mode == PRIV_STACK_UNKNOWN)
+ priv_stack_mode = bpf_enable_priv_stack(env->prog);
+
+ /* All async_cb subprogs use normal kernel stack. If a particular
+ * subprog appears in both main prog and async_cb subtree, that
+ * subprog will use normal kernel stack to avoid potential nesting.
+ * The reverse subprog traversal ensures when main prog subtree is
+ * checked, the subprogs appearing in async_cb subtrees are already
+ * marked as using normal kernel stack, so stack size checking can
+ * be done properly.
+ */
+ for (int i = env->subprog_cnt - 1; i >= 0; i--) {
+ if (!i || si[i].is_async_cb) {
+ priv_stack_supported = !i && priv_stack_mode == PRIV_STACK_ADAPTIVE;
+ ret = check_max_stack_depth_subprog(env, i, priv_stack_supported);
+ if (ret < 0)
+ return ret;
+ }
+ }
+
+ for (int i = 0; i < env->subprog_cnt; i++) {
+ if (si[i].priv_stack_mode == PRIV_STACK_ADAPTIVE) {
+ env->prog->aux->jits_use_priv_stack = true;
+ break;
+ }
+ }
+
+ return 0;
+}
+
+#ifndef CONFIG_BPF_JIT_ALWAYS_ON
+static int get_callee_stack_depth(struct bpf_verifier_env *env,
+ const struct bpf_insn *insn, int idx)
+{
+ int start = idx + insn->imm + 1, subprog;
+
+ subprog = find_subprog(env, start);
+ if (verifier_bug_if(subprog < 0, env, "get stack depth: no program at insn %d", start))
+ return -EFAULT;
+ return env->subprog_info[subprog].stack_depth;
+}
+#endif
+
+static int __check_buffer_access(struct bpf_verifier_env *env,
+ const char *buf_info,
+ const struct bpf_reg_state *reg,
+ int regno, int off, int size)
+{
+ if (off < 0) {
+ verbose(env,
+ "R%d invalid %s buffer access: off=%d, size=%d\n",
+ regno, buf_info, off, size);
+ return -EACCES;
+ }
+ if (!tnum_is_const(reg->var_off) || reg->var_off.value) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose(env,
+ "R%d invalid variable buffer offset: off=%d, var_off=%s\n",
+ regno, off, tn_buf);
+ return -EACCES;
+ }
+
+ return 0;
+}
+
+static int check_tp_buffer_access(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg,
+ int regno, int off, int size)
+{
+ int err;
+
+ err = __check_buffer_access(env, "tracepoint", reg, regno, off, size);
+ if (err)
+ return err;
+
+ if (off + size > env->prog->aux->max_tp_access)
+ env->prog->aux->max_tp_access = off + size;
+
+ return 0;
+}
+
+static int check_buffer_access(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg,
+ int regno, int off, int size,
+ bool zero_size_allowed,
+ u32 *max_access)
+{
+ const char *buf_info = type_is_rdonly_mem(reg->type) ? "rdonly" : "rdwr";
+ int err;
+
+ err = __check_buffer_access(env, buf_info, reg, regno, off, size);
+ if (err)
+ return err;
+
+ if (off + size > *max_access)
+ *max_access = off + size;
+
+ return 0;
+}
+
+/* BPF architecture zero extends alu32 ops into 64-bit registesr */
+static void zext_32_to_64(struct bpf_reg_state *reg)
+{
+ reg->var_off = tnum_subreg(reg->var_off);
+ __reg_assign_32_into_64(reg);
+}
+
+/* truncate register to smaller size (in bytes)
+ * must be called with size < BPF_REG_SIZE
+ */
+static void coerce_reg_to_size(struct bpf_reg_state *reg, int size)
+{
+ u64 mask;
+
+ /* clear high bits in bit representation */
+ reg->var_off = tnum_cast(reg->var_off, size);
+
+ /* fix arithmetic bounds */
+ mask = ((u64)1 << (size * 8)) - 1;
+ if ((reg->umin_value & ~mask) == (reg->umax_value & ~mask)) {
+ reg->umin_value &= mask;
+ reg->umax_value &= mask;
+ } else {
+ reg->umin_value = 0;
+ reg->umax_value = mask;
+ }
+ reg->smin_value = reg->umin_value;
+ reg->smax_value = reg->umax_value;
+
+ /* If size is smaller than 32bit register the 32bit register
+ * values are also truncated so we push 64-bit bounds into
+ * 32-bit bounds. Above were truncated < 32-bits already.
+ */
+ if (size < 4)
+ __mark_reg32_unbounded(reg);
+
+ reg_bounds_sync(reg);
+}
+
+static void set_sext64_default_val(struct bpf_reg_state *reg, int size)
+{
+ if (size == 1) {
+ reg->smin_value = reg->s32_min_value = S8_MIN;
+ reg->smax_value = reg->s32_max_value = S8_MAX;
+ } else if (size == 2) {
+ reg->smin_value = reg->s32_min_value = S16_MIN;
+ reg->smax_value = reg->s32_max_value = S16_MAX;
+ } else {
+ /* size == 4 */
+ reg->smin_value = reg->s32_min_value = S32_MIN;
+ reg->smax_value = reg->s32_max_value = S32_MAX;
+ }
+ reg->umin_value = reg->u32_min_value = 0;
+ reg->umax_value = U64_MAX;
+ reg->u32_max_value = U32_MAX;
+ reg->var_off = tnum_unknown;
+}
+
+static void coerce_reg_to_size_sx(struct bpf_reg_state *reg, int size)
+{
+ s64 init_s64_max, init_s64_min, s64_max, s64_min, u64_cval;
+ u64 top_smax_value, top_smin_value;
+ u64 num_bits = size * 8;
+
+ if (tnum_is_const(reg->var_off)) {
+ u64_cval = reg->var_off.value;
+ if (size == 1)
+ reg->var_off = tnum_const((s8)u64_cval);
+ else if (size == 2)
+ reg->var_off = tnum_const((s16)u64_cval);
+ else
+ /* size == 4 */
+ reg->var_off = tnum_const((s32)u64_cval);
+
+ u64_cval = reg->var_off.value;
+ reg->smax_value = reg->smin_value = u64_cval;
+ reg->umax_value = reg->umin_value = u64_cval;
+ reg->s32_max_value = reg->s32_min_value = u64_cval;
+ reg->u32_max_value = reg->u32_min_value = u64_cval;
+ return;
+ }
+
+ top_smax_value = ((u64)reg->smax_value >> num_bits) << num_bits;
+ top_smin_value = ((u64)reg->smin_value >> num_bits) << num_bits;
+
+ if (top_smax_value != top_smin_value)
+ goto out;
+
+ /* find the s64_min and s64_min after sign extension */
+ if (size == 1) {
+ init_s64_max = (s8)reg->smax_value;
+ init_s64_min = (s8)reg->smin_value;
+ } else if (size == 2) {
+ init_s64_max = (s16)reg->smax_value;
+ init_s64_min = (s16)reg->smin_value;
+ } else {
+ init_s64_max = (s32)reg->smax_value;
+ init_s64_min = (s32)reg->smin_value;
+ }
+
+ s64_max = max(init_s64_max, init_s64_min);
+ s64_min = min(init_s64_max, init_s64_min);
+
+ /* both of s64_max/s64_min positive or negative */
+ if ((s64_max >= 0) == (s64_min >= 0)) {
+ reg->s32_min_value = reg->smin_value = s64_min;
+ reg->s32_max_value = reg->smax_value = s64_max;
+ reg->u32_min_value = reg->umin_value = s64_min;
+ reg->u32_max_value = reg->umax_value = s64_max;
+ reg->var_off = tnum_range(s64_min, s64_max);
+ return;
+ }
+
+out:
+ set_sext64_default_val(reg, size);
+}
+
+static void set_sext32_default_val(struct bpf_reg_state *reg, int size)
+{
+ if (size == 1) {
+ reg->s32_min_value = S8_MIN;
+ reg->s32_max_value = S8_MAX;
+ } else {
+ /* size == 2 */
+ reg->s32_min_value = S16_MIN;
+ reg->s32_max_value = S16_MAX;
+ }
+ reg->u32_min_value = 0;
+ reg->u32_max_value = U32_MAX;
+ reg->var_off = tnum_subreg(tnum_unknown);
+}
+
+static void coerce_subreg_to_size_sx(struct bpf_reg_state *reg, int size)
+{
+ s32 init_s32_max, init_s32_min, s32_max, s32_min, u32_val;
+ u32 top_smax_value, top_smin_value;
+ u32 num_bits = size * 8;
+
+ if (tnum_is_const(reg->var_off)) {
+ u32_val = reg->var_off.value;
+ if (size == 1)
+ reg->var_off = tnum_const((s8)u32_val);
+ else
+ reg->var_off = tnum_const((s16)u32_val);
+
+ u32_val = reg->var_off.value;
+ reg->s32_min_value = reg->s32_max_value = u32_val;
+ reg->u32_min_value = reg->u32_max_value = u32_val;
+ return;
+ }
+
+ top_smax_value = ((u32)reg->s32_max_value >> num_bits) << num_bits;
+ top_smin_value = ((u32)reg->s32_min_value >> num_bits) << num_bits;
+
+ if (top_smax_value != top_smin_value)
+ goto out;
+
+ /* find the s32_min and s32_min after sign extension */
+ if (size == 1) {
+ init_s32_max = (s8)reg->s32_max_value;
+ init_s32_min = (s8)reg->s32_min_value;
+ } else {
+ /* size == 2 */
+ init_s32_max = (s16)reg->s32_max_value;
+ init_s32_min = (s16)reg->s32_min_value;
+ }
+ s32_max = max(init_s32_max, init_s32_min);
+ s32_min = min(init_s32_max, init_s32_min);
+
+ if ((s32_min >= 0) == (s32_max >= 0)) {
+ reg->s32_min_value = s32_min;
+ reg->s32_max_value = s32_max;
+ reg->u32_min_value = (u32)s32_min;
+ reg->u32_max_value = (u32)s32_max;
+ reg->var_off = tnum_subreg(tnum_range(s32_min, s32_max));
+ return;
+ }
+
+out:
+ set_sext32_default_val(reg, size);
+}
+
+static bool bpf_map_is_rdonly(const struct bpf_map *map)
+{
+ /* A map is considered read-only if the following condition are true:
+ *
+ * 1) BPF program side cannot change any of the map content. The
+ * BPF_F_RDONLY_PROG flag is throughout the lifetime of a map
+ * and was set at map creation time.
+ * 2) The map value(s) have been initialized from user space by a
+ * loader and then "frozen", such that no new map update/delete
+ * operations from syscall side are possible for the rest of
+ * the map's lifetime from that point onwards.
+ * 3) Any parallel/pending map update/delete operations from syscall
+ * side have been completed. Only after that point, it's safe to
+ * assume that map value(s) are immutable.
+ */
+ return (map->map_flags & BPF_F_RDONLY_PROG) &&
+ READ_ONCE(map->frozen) &&
+ !bpf_map_write_active(map);
+}
+
+static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val,
+ bool is_ldsx)
+{
+ void *ptr;
+ u64 addr;
+ int err;
+
+ err = map->ops->map_direct_value_addr(map, &addr, off);
+ if (err)
+ return err;
+ ptr = (void *)(long)addr + off;
+
+ switch (size) {
+ case sizeof(u8):
+ *val = is_ldsx ? (s64)*(s8 *)ptr : (u64)*(u8 *)ptr;
+ break;
+ case sizeof(u16):
+ *val = is_ldsx ? (s64)*(s16 *)ptr : (u64)*(u16 *)ptr;
+ break;
+ case sizeof(u32):
+ *val = is_ldsx ? (s64)*(s32 *)ptr : (u64)*(u32 *)ptr;
+ break;
+ case sizeof(u64):
+ *val = *(u64 *)ptr;
+ break;
+ default:
+ return -EINVAL;
+ }
+ return 0;
+}
+
+#define BTF_TYPE_SAFE_RCU(__type) __PASTE(__type, __safe_rcu)
+#define BTF_TYPE_SAFE_RCU_OR_NULL(__type) __PASTE(__type, __safe_rcu_or_null)
+#define BTF_TYPE_SAFE_TRUSTED(__type) __PASTE(__type, __safe_trusted)
+#define BTF_TYPE_SAFE_TRUSTED_OR_NULL(__type) __PASTE(__type, __safe_trusted_or_null)
+
+/*
+ * Allow list few fields as RCU trusted or full trusted.
+ * This logic doesn't allow mix tagging and will be removed once GCC supports
+ * btf_type_tag.
+ */
+
+/* RCU trusted: these fields are trusted in RCU CS and never NULL */
+BTF_TYPE_SAFE_RCU(struct task_struct) {
+ const cpumask_t *cpus_ptr;
+ struct css_set __rcu *cgroups;
+ struct task_struct __rcu *real_parent;
+ struct task_struct *group_leader;
+};
+
+BTF_TYPE_SAFE_RCU(struct cgroup) {
+ /* cgrp->kn is always accessible as documented in kernel/cgroup/cgroup.c */
+ struct kernfs_node *kn;
+};
+
+BTF_TYPE_SAFE_RCU(struct css_set) {
+ struct cgroup *dfl_cgrp;
+};
+
+BTF_TYPE_SAFE_RCU(struct cgroup_subsys_state) {
+ struct cgroup *cgroup;
+};
+
+/* RCU trusted: these fields are trusted in RCU CS and can be NULL */
+BTF_TYPE_SAFE_RCU_OR_NULL(struct mm_struct) {
+ struct file __rcu *exe_file;
+#ifdef CONFIG_MEMCG
+ struct task_struct __rcu *owner;
+#endif
+};
+
+/* skb->sk, req->sk are not RCU protected, but we mark them as such
+ * because bpf prog accessible sockets are SOCK_RCU_FREE.
+ */
+BTF_TYPE_SAFE_RCU_OR_NULL(struct sk_buff) {
+ struct sock *sk;
+};
+
+BTF_TYPE_SAFE_RCU_OR_NULL(struct request_sock) {
+ struct sock *sk;
+};
+
+/* full trusted: these fields are trusted even outside of RCU CS and never NULL */
+BTF_TYPE_SAFE_TRUSTED(struct bpf_iter_meta) {
+ struct seq_file *seq;
+};
+
+BTF_TYPE_SAFE_TRUSTED(struct bpf_iter__task) {
+ struct bpf_iter_meta *meta;
+ struct task_struct *task;
+};
+
+BTF_TYPE_SAFE_TRUSTED(struct linux_binprm) {
+ struct file *file;
+};
+
+BTF_TYPE_SAFE_TRUSTED(struct file) {
+ struct inode *f_inode;
+};
+
+BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct dentry) {
+ struct inode *d_inode;
+};
+
+BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket) {
+ struct sock *sk;
+};
+
+BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct vm_area_struct) {
+ struct mm_struct *vm_mm;
+ struct file *vm_file;
+};
+
+static bool type_is_rcu(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg,
+ const char *field_name, u32 btf_id)
+{
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct task_struct));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct cgroup));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct css_set));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct cgroup_subsys_state));
+
+ return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_rcu");
+}
+
+static bool type_is_rcu_or_null(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg,
+ const char *field_name, u32 btf_id)
+{
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU_OR_NULL(struct mm_struct));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU_OR_NULL(struct sk_buff));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU_OR_NULL(struct request_sock));
+
+ return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_rcu_or_null");
+}
+
+static bool type_is_trusted(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg,
+ const char *field_name, u32 btf_id)
+{
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct bpf_iter_meta));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct bpf_iter__task));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct linux_binprm));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct file));
+
+ return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_trusted");
+}
+
+static bool type_is_trusted_or_null(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg,
+ const char *field_name, u32 btf_id)
+{
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct dentry));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct vm_area_struct));
+
+ return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id,
+ "__safe_trusted_or_null");
+}
+
+static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
+ struct bpf_reg_state *regs,
+ int regno, int off, int size,
+ enum bpf_access_type atype,
+ int value_regno)
+{
+ struct bpf_reg_state *reg = regs + regno;
+ const struct btf_type *t = btf_type_by_id(reg->btf, reg->btf_id);
+ const char *tname = btf_name_by_offset(reg->btf, t->name_off);
+ const char *field_name = NULL;
+ enum bpf_type_flag flag = 0;
+ u32 btf_id = 0;
+ int ret;
+
+ if (!env->allow_ptr_leaks) {
+ verbose(env,
+ "'struct %s' access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN\n",
+ tname);
+ return -EPERM;
+ }
+ if (!env->prog->gpl_compatible && btf_is_kernel(reg->btf)) {
+ verbose(env,
+ "Cannot access kernel 'struct %s' from non-GPL compatible program\n",
+ tname);
+ return -EINVAL;
+ }
+ if (off < 0) {
+ verbose(env,
+ "R%d is ptr_%s invalid negative access: off=%d\n",
+ regno, tname, off);
+ return -EACCES;
+ }
+ if (!tnum_is_const(reg->var_off) || reg->var_off.value) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose(env,
+ "R%d is ptr_%s invalid variable offset: off=%d, var_off=%s\n",
+ regno, tname, off, tn_buf);
+ return -EACCES;
+ }
+
+ if (reg->type & MEM_USER) {
+ verbose(env,
+ "R%d is ptr_%s access user memory: off=%d\n",
+ regno, tname, off);
+ return -EACCES;
+ }
+
+ if (reg->type & MEM_PERCPU) {
+ verbose(env,
+ "R%d is ptr_%s access percpu memory: off=%d\n",
+ regno, tname, off);
+ return -EACCES;
+ }
+
+ if (env->ops->btf_struct_access && !type_is_alloc(reg->type) && atype == BPF_WRITE) {
+ if (!btf_is_kernel(reg->btf)) {
+ verifier_bug(env, "reg->btf must be kernel btf");
+ return -EFAULT;
+ }
+ ret = env->ops->btf_struct_access(&env->log, reg, off, size);
+ } else {
+ /* Writes are permitted with default btf_struct_access for
+ * program allocated objects (which always have ref_obj_id > 0),
+ * but not for untrusted PTR_TO_BTF_ID | MEM_ALLOC.
+ */
+ if (atype != BPF_READ && !type_is_ptr_alloc_obj(reg->type)) {
+ verbose(env, "only read is supported\n");
+ return -EACCES;
+ }
+
+ if (type_is_alloc(reg->type) && !type_is_non_owning_ref(reg->type) &&
+ !(reg->type & MEM_RCU) && !reg->ref_obj_id) {
+ verifier_bug(env, "ref_obj_id for allocated object must be non-zero");
+ return -EFAULT;
+ }
+
+ ret = btf_struct_access(&env->log, reg, off, size, atype, &btf_id, &flag, &field_name);
+ }
+
+ if (ret < 0)
+ return ret;
+
+ if (ret != PTR_TO_BTF_ID) {
+ /* just mark; */
+
+ } else if (type_flag(reg->type) & PTR_UNTRUSTED) {
+ /* If this is an untrusted pointer, all pointers formed by walking it
+ * also inherit the untrusted flag.
+ */
+ flag = PTR_UNTRUSTED;
+
+ } else if (is_trusted_reg(reg) || is_rcu_reg(reg)) {
+ /* By default any pointer obtained from walking a trusted pointer is no
+ * longer trusted, unless the field being accessed has explicitly been
+ * marked as inheriting its parent's state of trust (either full or RCU).
+ * For example:
+ * 'cgroups' pointer is untrusted if task->cgroups dereference
+ * happened in a sleepable program outside of bpf_rcu_read_lock()
+ * section. In a non-sleepable program it's trusted while in RCU CS (aka MEM_RCU).
+ * Note bpf_rcu_read_unlock() converts MEM_RCU pointers to PTR_UNTRUSTED.
+ *
+ * A regular RCU-protected pointer with __rcu tag can also be deemed
+ * trusted if we are in an RCU CS. Such pointer can be NULL.
+ */
+ if (type_is_trusted(env, reg, field_name, btf_id)) {
+ flag |= PTR_TRUSTED;
+ } else if (type_is_trusted_or_null(env, reg, field_name, btf_id)) {
+ flag |= PTR_TRUSTED | PTR_MAYBE_NULL;
+ } else if (in_rcu_cs(env) && !type_may_be_null(reg->type)) {
+ if (type_is_rcu(env, reg, field_name, btf_id)) {
+ /* ignore __rcu tag and mark it MEM_RCU */
+ flag |= MEM_RCU;
+ } else if (flag & MEM_RCU ||
+ type_is_rcu_or_null(env, reg, field_name, btf_id)) {
+ /* __rcu tagged pointers can be NULL */
+ flag |= MEM_RCU | PTR_MAYBE_NULL;
+
+ /* We always trust them */
+ if (type_is_rcu_or_null(env, reg, field_name, btf_id) &&
+ flag & PTR_UNTRUSTED)
+ flag &= ~PTR_UNTRUSTED;
+ } else if (flag & (MEM_PERCPU | MEM_USER)) {
+ /* keep as-is */
+ } else {
+ /* walking unknown pointers yields old deprecated PTR_TO_BTF_ID */
+ clear_trusted_flags(&flag);
+ }
+ } else {
+ /*
+ * If not in RCU CS or MEM_RCU pointer can be NULL then
+ * aggressively mark as untrusted otherwise such
+ * pointers will be plain PTR_TO_BTF_ID without flags
+ * and will be allowed to be passed into helpers for
+ * compat reasons.
+ */
+ flag = PTR_UNTRUSTED;
+ }
+ } else {
+ /* Old compat. Deprecated */
+ clear_trusted_flags(&flag);
+ }
+
+ if (atype == BPF_READ && value_regno >= 0) {
+ ret = mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id, flag);
+ if (ret < 0)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int check_ptr_to_map_access(struct bpf_verifier_env *env,
+ struct bpf_reg_state *regs,
+ int regno, int off, int size,
+ enum bpf_access_type atype,
+ int value_regno)
+{
+ struct bpf_reg_state *reg = regs + regno;
+ struct bpf_map *map = reg->map_ptr;
+ struct bpf_reg_state map_reg;
+ enum bpf_type_flag flag = 0;
+ const struct btf_type *t;
+ const char *tname;
+ u32 btf_id;
+ int ret;
+
+ if (!btf_vmlinux) {
+ verbose(env, "map_ptr access not supported without CONFIG_DEBUG_INFO_BTF\n");
+ return -ENOTSUPP;
+ }
+
+ if (!map->ops->map_btf_id || !*map->ops->map_btf_id) {
+ verbose(env, "map_ptr access not supported for map type %d\n",
+ map->map_type);
+ return -ENOTSUPP;
+ }
+
+ t = btf_type_by_id(btf_vmlinux, *map->ops->map_btf_id);
+ tname = btf_name_by_offset(btf_vmlinux, t->name_off);
+
+ if (!env->allow_ptr_leaks) {
+ verbose(env,
+ "'struct %s' access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN\n",
+ tname);
+ return -EPERM;
+ }
+
+ if (off < 0) {
+ verbose(env, "R%d is %s invalid negative access: off=%d\n",
+ regno, tname, off);
+ return -EACCES;
+ }
+
+ if (atype != BPF_READ) {
+ verbose(env, "only read from %s is supported\n", tname);
+ return -EACCES;
+ }
+
+ /* Simulate access to a PTR_TO_BTF_ID */
+ memset(&map_reg, 0, sizeof(map_reg));
+ ret = mark_btf_ld_reg(env, &map_reg, 0, PTR_TO_BTF_ID,
+ btf_vmlinux, *map->ops->map_btf_id, 0);
+ if (ret < 0)
+ return ret;
+ ret = btf_struct_access(&env->log, &map_reg, off, size, atype, &btf_id, &flag, NULL);
+ if (ret < 0)
+ return ret;
+
+ if (value_regno >= 0) {
+ ret = mark_btf_ld_reg(env, regs, value_regno, ret, btf_vmlinux, btf_id, flag);
+ if (ret < 0)
+ return ret;
+ }
+
+ return 0;
+}
+
+/* Check that the stack access at the given offset is within bounds. The
+ * maximum valid offset is -1.
+ *
+ * The minimum valid offset is -MAX_BPF_STACK for writes, and
+ * -state->allocated_stack for reads.
+ */
+static int check_stack_slot_within_bounds(struct bpf_verifier_env *env,
+ s64 off,
+ struct bpf_func_state *state,
+ enum bpf_access_type t)
+{
+ int min_valid_off;
+
+ if (t == BPF_WRITE || env->allow_uninit_stack)
+ min_valid_off = -MAX_BPF_STACK;
+ else
+ min_valid_off = -state->allocated_stack;
+
+ if (off < min_valid_off || off > -1)
+ return -EACCES;
+ return 0;
+}
+
+/* Check that the stack access at 'regno + off' falls within the maximum stack
+ * bounds.
+ *
+ * 'off' includes `regno->offset`, but not its dynamic part (if any).
+ */
+static int check_stack_access_within_bounds(
+ struct bpf_verifier_env *env,
+ int regno, int off, int access_size,
+ enum bpf_access_type type)
+{
+ struct bpf_reg_state *regs = cur_regs(env);
+ struct bpf_reg_state *reg = regs + regno;
+ struct bpf_func_state *state = func(env, reg);
+ s64 min_off, max_off;
+ int err;
+ char *err_extra;
+
+ if (type == BPF_READ)
+ err_extra = " read from";
+ else
+ err_extra = " write to";
+
+ if (tnum_is_const(reg->var_off)) {
+ min_off = (s64)reg->var_off.value + off;
+ max_off = min_off + access_size;
+ } else {
+ if (reg->smax_value >= BPF_MAX_VAR_OFF ||
+ reg->smin_value <= -BPF_MAX_VAR_OFF) {
+ verbose(env, "invalid unbounded variable-offset%s stack R%d\n",
+ err_extra, regno);
+ return -EACCES;
+ }
+ min_off = reg->smin_value + off;
+ max_off = reg->smax_value + off + access_size;
+ }
+
+ err = check_stack_slot_within_bounds(env, min_off, state, type);
+ if (!err && max_off > 0)
+ err = -EINVAL; /* out of stack access into non-negative offsets */
+ if (!err && access_size < 0)
+ /* access_size should not be negative (or overflow an int); others checks
+ * along the way should have prevented such an access.
+ */
+ err = -EFAULT; /* invalid negative access size; integer overflow? */
+
+ if (err) {
+ if (tnum_is_const(reg->var_off)) {
+ verbose(env, "invalid%s stack R%d off=%d size=%d\n",
+ err_extra, regno, off, access_size);
+ } else {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose(env, "invalid variable-offset%s stack R%d var_off=%s off=%d size=%d\n",
+ err_extra, regno, tn_buf, off, access_size);
+ }
+ return err;
+ }
+
+ /* Note that there is no stack access with offset zero, so the needed stack
+ * size is -min_off, not -min_off+1.
+ */
+ return grow_stack_state(env, state, -min_off /* size */);
+}
+
+static bool get_func_retval_range(struct bpf_prog *prog,
+ struct bpf_retval_range *range)
+{
+ if (prog->type == BPF_PROG_TYPE_LSM &&
+ prog->expected_attach_type == BPF_LSM_MAC &&
+ !bpf_lsm_get_retval_range(prog, range)) {
+ return true;
+ }
+ return false;
+}
+
/* check whether memory at (regno + off) is accessible for t = (read | write)
* if t==write, value_regno is a register which value is stored into memory
* if t==read, value_regno is a register which will receive the value from memory
* if t==write && value_regno==-1, some unknown value is stored into memory
* if t==read && value_regno==-1, don't care what we read from memory
*/
-static int check_mem_access(struct verifier_env *env, u32 regno, int off,
- int bpf_size, enum bpf_access_type t,
- int value_regno)
+static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regno,
+ int off, int bpf_size, enum bpf_access_type t,
+ int value_regno, bool strict_alignment_once, bool is_ldsx)
{
- struct verifier_state *state = &env->cur_state;
+ struct bpf_reg_state *regs = cur_regs(env);
+ struct bpf_reg_state *reg = regs + regno;
int size, err = 0;
size = bpf_size_to_bytes(bpf_size);
if (size < 0)
return size;
- if (off % size != 0) {
- verbose("misaligned access off %d size %d\n", off, size);
- return -EACCES;
- }
+ /* alignment checks will add in reg->off themselves */
+ err = check_ptr_alignment(env, reg, off, size, strict_alignment_once);
+ if (err)
+ return err;
- if (state->regs[regno].type == PTR_TO_MAP_VALUE) {
- err = check_map_access(env, regno, off, size);
- if (!err && t == BPF_READ && value_regno >= 0)
- mark_reg_unknown_value(state->regs, value_regno);
+ /* for access checks, reg->off is just part of off */
+ off += reg->off;
- } else if (state->regs[regno].type == PTR_TO_CTX) {
- err = check_ctx_access(env, off, size, t);
- if (!err && t == BPF_READ && value_regno >= 0)
- mark_reg_unknown_value(state->regs, value_regno);
+ if (reg->type == PTR_TO_MAP_KEY) {
+ if (t == BPF_WRITE) {
+ verbose(env, "write to change key R%d not allowed\n", regno);
+ return -EACCES;
+ }
+
+ err = check_mem_region_access(env, regno, off, size,
+ reg->map_ptr->key_size, false);
+ if (err)
+ return err;
+ if (value_regno >= 0)
+ mark_reg_unknown(env, regs, value_regno);
+ } else if (reg->type == PTR_TO_MAP_VALUE) {
+ struct btf_field *kptr_field = NULL;
- } else if (state->regs[regno].type == FRAME_PTR) {
- if (off >= 0 || off < -MAX_BPF_STACK) {
- verbose("invalid stack off=%d size=%d\n", off, size);
+ if (t == BPF_WRITE && value_regno >= 0 &&
+ is_pointer_value(env, value_regno)) {
+ verbose(env, "R%d leaks addr into map\n", value_regno);
return -EACCES;
}
- if (t == BPF_WRITE)
- err = check_stack_write(state, off, size, value_regno);
+ err = check_map_access_type(env, regno, off, size, t);
+ if (err)
+ return err;
+ err = check_map_access(env, regno, off, size, false, ACCESS_DIRECT);
+ if (err)
+ return err;
+ if (tnum_is_const(reg->var_off))
+ kptr_field = btf_record_find(reg->map_ptr->record,
+ off + reg->var_off.value, BPF_KPTR | BPF_UPTR);
+ if (kptr_field) {
+ err = check_map_kptr_access(env, regno, value_regno, insn_idx, kptr_field);
+ } else if (t == BPF_READ && value_regno >= 0) {
+ struct bpf_map *map = reg->map_ptr;
+
+ /*
+ * If map is read-only, track its contents as scalars,
+ * unless it is an insn array (see the special case below)
+ */
+ if (tnum_is_const(reg->var_off) &&
+ bpf_map_is_rdonly(map) &&
+ map->ops->map_direct_value_addr &&
+ map->map_type != BPF_MAP_TYPE_INSN_ARRAY) {
+ int map_off = off + reg->var_off.value;
+ u64 val = 0;
+
+ err = bpf_map_direct_read(map, map_off, size,
+ &val, is_ldsx);
+ if (err)
+ return err;
+
+ regs[value_regno].type = SCALAR_VALUE;
+ __mark_reg_known(&regs[value_regno], val);
+ } else if (map->map_type == BPF_MAP_TYPE_INSN_ARRAY) {
+ if (bpf_size != BPF_DW) {
+ verbose(env, "Invalid read of %d bytes from insn_array\n",
+ size);
+ return -EACCES;
+ }
+ copy_register_state(&regs[value_regno], reg);
+ regs[value_regno].type = PTR_TO_INSN;
+ } else {
+ mark_reg_unknown(env, regs, value_regno);
+ }
+ }
+ } else if (base_type(reg->type) == PTR_TO_MEM) {
+ bool rdonly_mem = type_is_rdonly_mem(reg->type);
+ bool rdonly_untrusted = rdonly_mem && (reg->type & PTR_UNTRUSTED);
+
+ if (type_may_be_null(reg->type)) {
+ verbose(env, "R%d invalid mem access '%s'\n", regno,
+ reg_type_str(env, reg->type));
+ return -EACCES;
+ }
+
+ if (t == BPF_WRITE && rdonly_mem) {
+ verbose(env, "R%d cannot write into %s\n",
+ regno, reg_type_str(env, reg->type));
+ return -EACCES;
+ }
+
+ if (t == BPF_WRITE && value_regno >= 0 &&
+ is_pointer_value(env, value_regno)) {
+ verbose(env, "R%d leaks addr into mem\n", value_regno);
+ return -EACCES;
+ }
+
+ /*
+ * Accesses to untrusted PTR_TO_MEM are done through probe
+ * instructions, hence no need to check bounds in that case.
+ */
+ if (!rdonly_untrusted)
+ err = check_mem_region_access(env, regno, off, size,
+ reg->mem_size, false);
+ if (!err && value_regno >= 0 && (t == BPF_READ || rdonly_mem))
+ mark_reg_unknown(env, regs, value_regno);
+ } else if (reg->type == PTR_TO_CTX) {
+ struct bpf_retval_range range;
+ struct bpf_insn_access_aux info = {
+ .reg_type = SCALAR_VALUE,
+ .is_ldsx = is_ldsx,
+ .log = &env->log,
+ };
+
+ if (t == BPF_WRITE && value_regno >= 0 &&
+ is_pointer_value(env, value_regno)) {
+ verbose(env, "R%d leaks addr into ctx\n", value_regno);
+ return -EACCES;
+ }
+
+ err = check_ptr_off_reg(env, reg, regno);
+ if (err < 0)
+ return err;
+
+ err = check_ctx_access(env, insn_idx, off, size, t, &info);
+ if (err)
+ verbose_linfo(env, insn_idx, "; ");
+ if (!err && t == BPF_READ && value_regno >= 0) {
+ /* ctx access returns either a scalar, or a
+ * PTR_TO_PACKET[_META,_END]. In the latter
+ * case, we know the offset is zero.
+ */
+ if (info.reg_type == SCALAR_VALUE) {
+ if (info.is_retval && get_func_retval_range(env->prog, &range)) {
+ err = __mark_reg_s32_range(env, regs, value_regno,
+ range.minval, range.maxval);
+ if (err)
+ return err;
+ } else {
+ mark_reg_unknown(env, regs, value_regno);
+ }
+ } else {
+ mark_reg_known_zero(env, regs,
+ value_regno);
+ if (type_may_be_null(info.reg_type))
+ regs[value_regno].id = ++env->id_gen;
+ /* A load of ctx field could have different
+ * actual load size with the one encoded in the
+ * insn. When the dst is PTR, it is for sure not
+ * a sub-register.
+ */
+ regs[value_regno].subreg_def = DEF_NOT_SUBREG;
+ if (base_type(info.reg_type) == PTR_TO_BTF_ID) {
+ regs[value_regno].btf = info.btf;
+ regs[value_regno].btf_id = info.btf_id;
+ regs[value_regno].ref_obj_id = info.ref_obj_id;
+ }
+ }
+ regs[value_regno].type = info.reg_type;
+ }
+
+ } else if (reg->type == PTR_TO_STACK) {
+ /* Basic bounds checks. */
+ err = check_stack_access_within_bounds(env, regno, off, size, t);
+ if (err)
+ return err;
+
+ if (t == BPF_READ)
+ err = check_stack_read(env, regno, off, size,
+ value_regno);
else
- err = check_stack_read(state, off, size, value_regno);
+ err = check_stack_write(env, regno, off, size,
+ value_regno, insn_idx);
+ } else if (reg_is_pkt_pointer(reg)) {
+ if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) {
+ verbose(env, "cannot write into packet\n");
+ return -EACCES;
+ }
+ if (t == BPF_WRITE && value_regno >= 0 &&
+ is_pointer_value(env, value_regno)) {
+ verbose(env, "R%d leaks addr into packet\n",
+ value_regno);
+ return -EACCES;
+ }
+ err = check_packet_access(env, regno, off, size, false);
+ if (!err && t == BPF_READ && value_regno >= 0)
+ mark_reg_unknown(env, regs, value_regno);
+ } else if (reg->type == PTR_TO_FLOW_KEYS) {
+ if (t == BPF_WRITE && value_regno >= 0 &&
+ is_pointer_value(env, value_regno)) {
+ verbose(env, "R%d leaks addr into flow keys\n",
+ value_regno);
+ return -EACCES;
+ }
+
+ err = check_flow_keys_access(env, off, size);
+ if (!err && t == BPF_READ && value_regno >= 0)
+ mark_reg_unknown(env, regs, value_regno);
+ } else if (type_is_sk_pointer(reg->type)) {
+ if (t == BPF_WRITE) {
+ verbose(env, "R%d cannot write into %s\n",
+ regno, reg_type_str(env, reg->type));
+ return -EACCES;
+ }
+ err = check_sock_access(env, insn_idx, regno, off, size, t);
+ if (!err && value_regno >= 0)
+ mark_reg_unknown(env, regs, value_regno);
+ } else if (reg->type == PTR_TO_TP_BUFFER) {
+ err = check_tp_buffer_access(env, reg, regno, off, size);
+ if (!err && t == BPF_READ && value_regno >= 0)
+ mark_reg_unknown(env, regs, value_regno);
+ } else if (base_type(reg->type) == PTR_TO_BTF_ID &&
+ !type_may_be_null(reg->type)) {
+ err = check_ptr_to_btf_access(env, regs, regno, off, size, t,
+ value_regno);
+ } else if (reg->type == CONST_PTR_TO_MAP) {
+ err = check_ptr_to_map_access(env, regs, regno, off, size, t,
+ value_regno);
+ } else if (base_type(reg->type) == PTR_TO_BUF) {
+ bool rdonly_mem = type_is_rdonly_mem(reg->type);
+ u32 *max_access;
+
+ if (rdonly_mem) {
+ if (t == BPF_WRITE) {
+ verbose(env, "R%d cannot write into %s\n",
+ regno, reg_type_str(env, reg->type));
+ return -EACCES;
+ }
+ max_access = &env->prog->aux->max_rdonly_access;
+ } else {
+ max_access = &env->prog->aux->max_rdwr_access;
+ }
+
+ err = check_buffer_access(env, reg, regno, off, size, false,
+ max_access);
+
+ if (!err && value_regno >= 0 && (rdonly_mem || t == BPF_READ))
+ mark_reg_unknown(env, regs, value_regno);
+ } else if (reg->type == PTR_TO_ARENA) {
+ if (t == BPF_READ && value_regno >= 0)
+ mark_reg_unknown(env, regs, value_regno);
} else {
- verbose("R%d invalid mem access '%s'\n",
- regno, reg_type_str[state->regs[regno].type]);
+ verbose(env, "R%d invalid mem access '%s'\n", regno,
+ reg_type_str(env, reg->type));
return -EACCES;
}
+
+ if (!err && size < BPF_REG_SIZE && value_regno >= 0 && t == BPF_READ &&
+ regs[value_regno].type == SCALAR_VALUE) {
+ if (!is_ldsx)
+ /* b/h/w load zero-extends, mark upper bits as known 0 */
+ coerce_reg_to_size(&regs[value_regno], size);
+ else
+ coerce_reg_to_size_sx(&regs[value_regno], size);
+ }
+ return err;
+}
+
+static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type type,
+ bool allow_trust_mismatch);
+
+static int check_load_mem(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ bool strict_alignment_once, bool is_ldsx,
+ bool allow_trust_mismatch, const char *ctx)
+{
+ struct bpf_reg_state *regs = cur_regs(env);
+ enum bpf_reg_type src_reg_type;
+ int err;
+
+ /* check src operand */
+ err = check_reg_arg(env, insn->src_reg, SRC_OP);
+ if (err)
+ return err;
+
+ /* check dst operand */
+ err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK);
+ if (err)
+ return err;
+
+ src_reg_type = regs[insn->src_reg].type;
+
+ /* Check if (src_reg + off) is readable. The state of dst_reg will be
+ * updated by this call.
+ */
+ err = check_mem_access(env, env->insn_idx, insn->src_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_READ, insn->dst_reg,
+ strict_alignment_once, is_ldsx);
+ err = err ?: save_aux_ptr_type(env, src_reg_type,
+ allow_trust_mismatch);
+ err = err ?: reg_bounds_sanity_check(env, &regs[insn->dst_reg], ctx);
+
+ return err;
+}
+
+static int check_store_reg(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ bool strict_alignment_once)
+{
+ struct bpf_reg_state *regs = cur_regs(env);
+ enum bpf_reg_type dst_reg_type;
+ int err;
+
+ /* check src1 operand */
+ err = check_reg_arg(env, insn->src_reg, SRC_OP);
+ if (err)
+ return err;
+
+ /* check src2 operand */
+ err = check_reg_arg(env, insn->dst_reg, SRC_OP);
+ if (err)
+ return err;
+
+ dst_reg_type = regs[insn->dst_reg].type;
+
+ /* Check if (dst_reg + off) is writeable. */
+ err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_WRITE, insn->src_reg,
+ strict_alignment_once, false);
+ err = err ?: save_aux_ptr_type(env, dst_reg_type, false);
+
return err;
}
-static int check_xadd(struct verifier_env *env, struct bpf_insn *insn)
+static int check_atomic_rmw(struct bpf_verifier_env *env,
+ struct bpf_insn *insn)
{
- struct reg_state *regs = env->cur_state.regs;
+ int load_reg;
int err;
- if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) ||
- insn->imm != 0) {
- verbose("BPF_XADD uses reserved fields\n");
+ if (BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) {
+ verbose(env, "invalid atomic operand size\n");
return -EINVAL;
}
/* check src1 operand */
- err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ err = check_reg_arg(env, insn->src_reg, SRC_OP);
if (err)
return err;
/* check src2 operand */
- err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ err = check_reg_arg(env, insn->dst_reg, SRC_OP);
if (err)
return err;
- /* check whether atomic_add can read the memory */
- err = check_mem_access(env, insn->dst_reg, insn->off,
- BPF_SIZE(insn->code), BPF_READ, -1);
+ if (insn->imm == BPF_CMPXCHG) {
+ /* Check comparison of R0 with memory location */
+ const u32 aux_reg = BPF_REG_0;
+
+ err = check_reg_arg(env, aux_reg, SRC_OP);
+ if (err)
+ return err;
+
+ if (is_pointer_value(env, aux_reg)) {
+ verbose(env, "R%d leaks addr into mem\n", aux_reg);
+ return -EACCES;
+ }
+ }
+
+ if (is_pointer_value(env, insn->src_reg)) {
+ verbose(env, "R%d leaks addr into mem\n", insn->src_reg);
+ return -EACCES;
+ }
+
+ if (!atomic_ptr_type_ok(env, insn->dst_reg, insn)) {
+ verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n",
+ insn->dst_reg,
+ reg_type_str(env, reg_state(env, insn->dst_reg)->type));
+ return -EACCES;
+ }
+
+ if (insn->imm & BPF_FETCH) {
+ if (insn->imm == BPF_CMPXCHG)
+ load_reg = BPF_REG_0;
+ else
+ load_reg = insn->src_reg;
+
+ /* check and record load of old value */
+ err = check_reg_arg(env, load_reg, DST_OP);
+ if (err)
+ return err;
+ } else {
+ /* This instruction accesses a memory location but doesn't
+ * actually load it into a register.
+ */
+ load_reg = -1;
+ }
+
+ /* Check whether we can read the memory, with second call for fetch
+ * case to simulate the register fill.
+ */
+ err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_READ, -1, true, false);
+ if (!err && load_reg >= 0)
+ err = check_mem_access(env, env->insn_idx, insn->dst_reg,
+ insn->off, BPF_SIZE(insn->code),
+ BPF_READ, load_reg, true, false);
if (err)
return err;
- /* check whether atomic_add can write into the same memory */
- return check_mem_access(env, insn->dst_reg, insn->off,
- BPF_SIZE(insn->code), BPF_WRITE, -1);
+ if (is_arena_reg(env, insn->dst_reg)) {
+ err = save_aux_ptr_type(env, PTR_TO_ARENA, false);
+ if (err)
+ return err;
+ }
+ /* Check whether we can write into the same memory. */
+ err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_WRITE, -1, true, false);
+ if (err)
+ return err;
+ return 0;
}
-/* when register 'regno' is passed into function that will read 'access_size'
- * bytes from that pointer, make sure that it's within stack boundary
- * and all elements of stack are initialized
- */
-static int check_stack_boundary(struct verifier_env *env,
- int regno, int access_size)
+static int check_atomic_load(struct bpf_verifier_env *env,
+ struct bpf_insn *insn)
+{
+ int err;
+
+ err = check_load_mem(env, insn, true, false, false, "atomic_load");
+ if (err)
+ return err;
+
+ if (!atomic_ptr_type_ok(env, insn->src_reg, insn)) {
+ verbose(env, "BPF_ATOMIC loads from R%d %s is not allowed\n",
+ insn->src_reg,
+ reg_type_str(env, reg_state(env, insn->src_reg)->type));
+ return -EACCES;
+ }
+
+ return 0;
+}
+
+static int check_atomic_store(struct bpf_verifier_env *env,
+ struct bpf_insn *insn)
{
- struct verifier_state *state = &env->cur_state;
- struct reg_state *regs = state->regs;
- int off, i;
+ int err;
+
+ err = check_store_reg(env, insn, true);
+ if (err)
+ return err;
- if (regs[regno].type != PTR_TO_STACK)
+ if (!atomic_ptr_type_ok(env, insn->dst_reg, insn)) {
+ verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n",
+ insn->dst_reg,
+ reg_type_str(env, reg_state(env, insn->dst_reg)->type));
return -EACCES;
+ }
+
+ return 0;
+}
+
+static int check_atomic(struct bpf_verifier_env *env, struct bpf_insn *insn)
+{
+ switch (insn->imm) {
+ case BPF_ADD:
+ case BPF_ADD | BPF_FETCH:
+ case BPF_AND:
+ case BPF_AND | BPF_FETCH:
+ case BPF_OR:
+ case BPF_OR | BPF_FETCH:
+ case BPF_XOR:
+ case BPF_XOR | BPF_FETCH:
+ case BPF_XCHG:
+ case BPF_CMPXCHG:
+ return check_atomic_rmw(env, insn);
+ case BPF_LOAD_ACQ:
+ if (BPF_SIZE(insn->code) == BPF_DW && BITS_PER_LONG != 64) {
+ verbose(env,
+ "64-bit load-acquires are only supported on 64-bit arches\n");
+ return -EOPNOTSUPP;
+ }
+ return check_atomic_load(env, insn);
+ case BPF_STORE_REL:
+ if (BPF_SIZE(insn->code) == BPF_DW && BITS_PER_LONG != 64) {
+ verbose(env,
+ "64-bit store-releases are only supported on 64-bit arches\n");
+ return -EOPNOTSUPP;
+ }
+ return check_atomic_store(env, insn);
+ default:
+ verbose(env, "BPF_ATOMIC uses invalid atomic opcode %02x\n",
+ insn->imm);
+ return -EINVAL;
+ }
+}
+
+/* When register 'regno' is used to read the stack (either directly or through
+ * a helper function) make sure that it's within stack boundary and, depending
+ * on the access type and privileges, that all elements of the stack are
+ * initialized.
+ *
+ * 'off' includes 'regno->off', but not its dynamic part (if any).
+ *
+ * All registers that have been spilled on the stack in the slots within the
+ * read offsets are marked as read.
+ */
+static int check_stack_range_initialized(
+ struct bpf_verifier_env *env, int regno, int off,
+ int access_size, bool zero_size_allowed,
+ enum bpf_access_type type, struct bpf_call_arg_meta *meta)
+{
+ struct bpf_reg_state *reg = reg_state(env, regno);
+ struct bpf_func_state *state = func(env, reg);
+ int err, min_off, max_off, i, j, slot, spi;
+ /* Some accesses can write anything into the stack, others are
+ * read-only.
+ */
+ bool clobber = false;
- off = regs[regno].imm;
- if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
- access_size <= 0) {
- verbose("invalid stack type R%d off=%d access_size=%d\n",
- regno, off, access_size);
+ if (access_size == 0 && !zero_size_allowed) {
+ verbose(env, "invalid zero-sized read\n");
return -EACCES;
}
- for (i = 0; i < access_size; i++) {
- if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) {
- verbose("invalid indirect read from stack off %d+%d size %d\n",
- off, i, access_size);
+ if (type == BPF_WRITE)
+ clobber = true;
+
+ err = check_stack_access_within_bounds(env, regno, off, access_size, type);
+ if (err)
+ return err;
+
+
+ if (tnum_is_const(reg->var_off)) {
+ min_off = max_off = reg->var_off.value + off;
+ } else {
+ /* Variable offset is prohibited for unprivileged mode for
+ * simplicity since it requires corresponding support in
+ * Spectre masking for stack ALU.
+ * See also retrieve_ptr_limit().
+ */
+ if (!env->bypass_spec_v1) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose(env, "R%d variable offset stack access prohibited for !root, var_off=%s\n",
+ regno, tn_buf);
return -EACCES;
}
+ /* Only initialized buffer on stack is allowed to be accessed
+ * with variable offset. With uninitialized buffer it's hard to
+ * guarantee that whole memory is marked as initialized on
+ * helper return since specific bounds are unknown what may
+ * cause uninitialized stack leaking.
+ */
+ if (meta && meta->raw_mode)
+ meta = NULL;
+
+ min_off = reg->smin_value + off;
+ max_off = reg->smax_value + off;
+ }
+
+ if (meta && meta->raw_mode) {
+ /* Ensure we won't be overwriting dynptrs when simulating byte
+ * by byte access in check_helper_call using meta.access_size.
+ * This would be a problem if we have a helper in the future
+ * which takes:
+ *
+ * helper(uninit_mem, len, dynptr)
+ *
+ * Now, uninint_mem may overlap with dynptr pointer. Hence, it
+ * may end up writing to dynptr itself when touching memory from
+ * arg 1. This can be relaxed on a case by case basis for known
+ * safe cases, but reject due to the possibilitiy of aliasing by
+ * default.
+ */
+ for (i = min_off; i < max_off + access_size; i++) {
+ int stack_off = -i - 1;
+
+ spi = __get_spi(i);
+ /* raw_mode may write past allocated_stack */
+ if (state->allocated_stack <= stack_off)
+ continue;
+ if (state->stack[spi].slot_type[stack_off % BPF_REG_SIZE] == STACK_DYNPTR) {
+ verbose(env, "potential write to dynptr at off=%d disallowed\n", i);
+ return -EACCES;
+ }
+ }
+ meta->access_size = access_size;
+ meta->regno = regno;
+ return 0;
+ }
+
+ for (i = min_off; i < max_off + access_size; i++) {
+ u8 *stype;
+
+ slot = -i - 1;
+ spi = slot / BPF_REG_SIZE;
+ if (state->allocated_stack <= slot) {
+ verbose(env, "allocated_stack too small\n");
+ return -EFAULT;
+ }
+
+ stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE];
+ if (*stype == STACK_MISC)
+ goto mark;
+ if ((*stype == STACK_ZERO) ||
+ (*stype == STACK_INVALID && env->allow_uninit_stack)) {
+ if (clobber) {
+ /* helper can write anything into the stack */
+ *stype = STACK_MISC;
+ }
+ goto mark;
+ }
+
+ if (is_spilled_reg(&state->stack[spi]) &&
+ (state->stack[spi].spilled_ptr.type == SCALAR_VALUE ||
+ env->allow_ptr_leaks)) {
+ if (clobber) {
+ __mark_reg_unknown(env, &state->stack[spi].spilled_ptr);
+ for (j = 0; j < BPF_REG_SIZE; j++)
+ scrub_spilled_slot(&state->stack[spi].slot_type[j]);
+ }
+ goto mark;
+ }
+
+ if (tnum_is_const(reg->var_off)) {
+ verbose(env, "invalid read from stack R%d off %d+%d size %d\n",
+ regno, min_off, i - min_off, access_size);
+ } else {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose(env, "invalid read from stack R%d var_off %s+%d size %d\n",
+ regno, tn_buf, i - min_off, access_size);
+ }
+ return -EACCES;
+mark:
+ /* reading any byte out of 8-byte 'spill_slot' will cause
+ * the whole slot to be marked as 'read'
+ */
+ err = bpf_mark_stack_read(env, reg->frameno, env->insn_idx, BIT(spi));
+ if (err)
+ return err;
+ /* We do not call bpf_mark_stack_write(), as we can not
+ * be sure that whether stack slot is written to or not. Hence,
+ * we must still conservatively propagate reads upwards even if
+ * helper may write to the entire memory range.
+ */
}
return 0;
}
-static int check_func_arg(struct verifier_env *env, u32 regno,
- enum bpf_arg_type arg_type, struct bpf_map **mapp)
+static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
+ int access_size, enum bpf_access_type access_type,
+ bool zero_size_allowed,
+ struct bpf_call_arg_meta *meta)
{
- struct reg_state *reg = env->cur_state.regs + regno;
- enum bpf_reg_type expected_type;
- int err = 0;
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ u32 *max_access;
+
+ switch (base_type(reg->type)) {
+ case PTR_TO_PACKET:
+ case PTR_TO_PACKET_META:
+ return check_packet_access(env, regno, reg->off, access_size,
+ zero_size_allowed);
+ case PTR_TO_MAP_KEY:
+ if (access_type == BPF_WRITE) {
+ verbose(env, "R%d cannot write into %s\n", regno,
+ reg_type_str(env, reg->type));
+ return -EACCES;
+ }
+ return check_mem_region_access(env, regno, reg->off, access_size,
+ reg->map_ptr->key_size, false);
+ case PTR_TO_MAP_VALUE:
+ if (check_map_access_type(env, regno, reg->off, access_size, access_type))
+ return -EACCES;
+ return check_map_access(env, regno, reg->off, access_size,
+ zero_size_allowed, ACCESS_HELPER);
+ case PTR_TO_MEM:
+ if (type_is_rdonly_mem(reg->type)) {
+ if (access_type == BPF_WRITE) {
+ verbose(env, "R%d cannot write into %s\n", regno,
+ reg_type_str(env, reg->type));
+ return -EACCES;
+ }
+ }
+ return check_mem_region_access(env, regno, reg->off,
+ access_size, reg->mem_size,
+ zero_size_allowed);
+ case PTR_TO_BUF:
+ if (type_is_rdonly_mem(reg->type)) {
+ if (access_type == BPF_WRITE) {
+ verbose(env, "R%d cannot write into %s\n", regno,
+ reg_type_str(env, reg->type));
+ return -EACCES;
+ }
- if (arg_type == ARG_DONTCARE)
- return 0;
+ max_access = &env->prog->aux->max_rdonly_access;
+ } else {
+ max_access = &env->prog->aux->max_rdwr_access;
+ }
+ return check_buffer_access(env, reg, regno, reg->off,
+ access_size, zero_size_allowed,
+ max_access);
+ case PTR_TO_STACK:
+ return check_stack_range_initialized(
+ env,
+ regno, reg->off, access_size,
+ zero_size_allowed, access_type, meta);
+ case PTR_TO_BTF_ID:
+ return check_ptr_to_btf_access(env, regs, regno, reg->off,
+ access_size, BPF_READ, -1);
+ case PTR_TO_CTX:
+ /* in case the function doesn't know how to access the context,
+ * (because we are in a program of type SYSCALL for example), we
+ * can not statically check its size.
+ * Dynamically check it now.
+ */
+ if (!env->ops->convert_ctx_access) {
+ int offset = access_size - 1;
+
+ /* Allow zero-byte read from PTR_TO_CTX */
+ if (access_size == 0)
+ return zero_size_allowed ? 0 : -EACCES;
+
+ return check_mem_access(env, env->insn_idx, regno, offset, BPF_B,
+ access_type, -1, false, false);
+ }
- if (reg->type == NOT_INIT) {
- verbose("R%d !read_ok\n", regno);
+ fallthrough;
+ default: /* scalar_value or invalid ptr */
+ /* Allow zero-byte read from NULL, regardless of pointer type */
+ if (zero_size_allowed && access_size == 0 &&
+ register_is_null(reg))
+ return 0;
+
+ verbose(env, "R%d type=%s ", regno,
+ reg_type_str(env, reg->type));
+ verbose(env, "expected=%s\n", reg_type_str(env, PTR_TO_STACK));
+ return -EACCES;
+ }
+}
+
+/* verify arguments to helpers or kfuncs consisting of a pointer and an access
+ * size.
+ *
+ * @regno is the register containing the access size. regno-1 is the register
+ * containing the pointer.
+ */
+static int check_mem_size_reg(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, u32 regno,
+ enum bpf_access_type access_type,
+ bool zero_size_allowed,
+ struct bpf_call_arg_meta *meta)
+{
+ int err;
+
+ /* This is used to refine r0 return value bounds for helpers
+ * that enforce this value as an upper bound on return values.
+ * See do_refine_retval_range() for helpers that can refine
+ * the return value. C type of helper is u32 so we pull register
+ * bound from umax_value however, if negative verifier errors
+ * out. Only upper bounds can be learned because retval is an
+ * int type and negative retvals are allowed.
+ */
+ meta->msize_max_value = reg->umax_value;
+
+ /* The register is SCALAR_VALUE; the access check happens using
+ * its boundaries. For unprivileged variable accesses, disable
+ * raw mode so that the program is required to initialize all
+ * the memory that the helper could just partially fill up.
+ */
+ if (!tnum_is_const(reg->var_off))
+ meta = NULL;
+
+ if (reg->smin_value < 0) {
+ verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n",
+ regno);
return -EACCES;
}
- if (arg_type == ARG_ANYTHING)
+ if (reg->umin_value == 0 && !zero_size_allowed) {
+ verbose(env, "R%d invalid zero-sized read: u64=[%lld,%lld]\n",
+ regno, reg->umin_value, reg->umax_value);
+ return -EACCES;
+ }
+
+ if (reg->umax_value >= BPF_MAX_VAR_SIZ) {
+ verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
+ regno);
+ return -EACCES;
+ }
+ err = check_helper_mem_access(env, regno - 1, reg->umax_value,
+ access_type, zero_size_allowed, meta);
+ if (!err)
+ err = mark_chain_precision(env, regno);
+ return err;
+}
+
+static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ u32 regno, u32 mem_size)
+{
+ bool may_be_null = type_may_be_null(reg->type);
+ struct bpf_reg_state saved_reg;
+ int err;
+
+ if (register_is_null(reg))
return 0;
- if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY ||
- arg_type == ARG_PTR_TO_MAP_VALUE) {
- expected_type = PTR_TO_STACK;
- } else if (arg_type == ARG_CONST_STACK_SIZE) {
- expected_type = CONST_IMM;
- } else if (arg_type == ARG_CONST_MAP_PTR) {
- expected_type = CONST_PTR_TO_MAP;
- } else if (arg_type == ARG_PTR_TO_CTX) {
- expected_type = PTR_TO_CTX;
+ /* Assuming that the register contains a value check if the memory
+ * access is safe. Temporarily save and restore the register's state as
+ * the conversion shouldn't be visible to a caller.
+ */
+ if (may_be_null) {
+ saved_reg = *reg;
+ mark_ptr_not_null_reg(reg);
+ }
+
+ err = check_helper_mem_access(env, regno, mem_size, BPF_READ, true, NULL);
+ err = err ?: check_helper_mem_access(env, regno, mem_size, BPF_WRITE, true, NULL);
+
+ if (may_be_null)
+ *reg = saved_reg;
+
+ return err;
+}
+
+static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ u32 regno)
+{
+ struct bpf_reg_state *mem_reg = &cur_regs(env)[regno - 1];
+ bool may_be_null = type_may_be_null(mem_reg->type);
+ struct bpf_reg_state saved_reg;
+ struct bpf_call_arg_meta meta;
+ int err;
+
+ WARN_ON_ONCE(regno < BPF_REG_2 || regno > BPF_REG_5);
+
+ memset(&meta, 0, sizeof(meta));
+
+ if (may_be_null) {
+ saved_reg = *mem_reg;
+ mark_ptr_not_null_reg(mem_reg);
+ }
+
+ err = check_mem_size_reg(env, reg, regno, BPF_READ, true, &meta);
+ err = err ?: check_mem_size_reg(env, reg, regno, BPF_WRITE, true, &meta);
+
+ if (may_be_null)
+ *mem_reg = saved_reg;
+
+ return err;
+}
+
+enum {
+ PROCESS_SPIN_LOCK = (1 << 0),
+ PROCESS_RES_LOCK = (1 << 1),
+ PROCESS_LOCK_IRQ = (1 << 2),
+};
+
+/* Implementation details:
+ * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL.
+ * bpf_obj_new returns PTR_TO_BTF_ID | MEM_ALLOC | PTR_MAYBE_NULL.
+ * Two bpf_map_lookups (even with the same key) will have different reg->id.
+ * Two separate bpf_obj_new will also have different reg->id.
+ * For traditional PTR_TO_MAP_VALUE or PTR_TO_BTF_ID | MEM_ALLOC, the verifier
+ * clears reg->id after value_or_null->value transition, since the verifier only
+ * cares about the range of access to valid map value pointer and doesn't care
+ * about actual address of the map element.
+ * For maps with 'struct bpf_spin_lock' inside map value the verifier keeps
+ * reg->id > 0 after value_or_null->value transition. By doing so
+ * two bpf_map_lookups will be considered two different pointers that
+ * point to different bpf_spin_locks. Likewise for pointers to allocated objects
+ * returned from bpf_obj_new.
+ * The verifier allows taking only one bpf_spin_lock at a time to avoid
+ * dead-locks.
+ * Since only one bpf_spin_lock is allowed the checks are simpler than
+ * reg_is_refcounted() logic. The verifier needs to remember only
+ * one spin_lock instead of array of acquired_refs.
+ * env->cur_state->active_locks remembers which map value element or allocated
+ * object got locked and clears it after bpf_spin_unlock.
+ */
+static int process_spin_lock(struct bpf_verifier_env *env, int regno, int flags)
+{
+ bool is_lock = flags & PROCESS_SPIN_LOCK, is_res_lock = flags & PROCESS_RES_LOCK;
+ const char *lock_str = is_res_lock ? "bpf_res_spin" : "bpf_spin";
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ struct bpf_verifier_state *cur = env->cur_state;
+ bool is_const = tnum_is_const(reg->var_off);
+ bool is_irq = flags & PROCESS_LOCK_IRQ;
+ u64 val = reg->var_off.value;
+ struct bpf_map *map = NULL;
+ struct btf *btf = NULL;
+ struct btf_record *rec;
+ u32 spin_lock_off;
+ int err;
+
+ if (!is_const) {
+ verbose(env,
+ "R%d doesn't have constant offset. %s_lock has to be at the constant offset\n",
+ regno, lock_str);
+ return -EINVAL;
+ }
+ if (reg->type == PTR_TO_MAP_VALUE) {
+ map = reg->map_ptr;
+ if (!map->btf) {
+ verbose(env,
+ "map '%s' has to have BTF in order to use %s_lock\n",
+ map->name, lock_str);
+ return -EINVAL;
+ }
+ } else {
+ btf = reg->btf;
+ }
+
+ rec = reg_btf_record(reg);
+ if (!btf_record_has_field(rec, is_res_lock ? BPF_RES_SPIN_LOCK : BPF_SPIN_LOCK)) {
+ verbose(env, "%s '%s' has no valid %s_lock\n", map ? "map" : "local",
+ map ? map->name : "kptr", lock_str);
+ return -EINVAL;
+ }
+ spin_lock_off = is_res_lock ? rec->res_spin_lock_off : rec->spin_lock_off;
+ if (spin_lock_off != val + reg->off) {
+ verbose(env, "off %lld doesn't point to 'struct %s_lock' that is at %d\n",
+ val + reg->off, lock_str, spin_lock_off);
+ return -EINVAL;
+ }
+ if (is_lock) {
+ void *ptr;
+ int type;
+
+ if (map)
+ ptr = map;
+ else
+ ptr = btf;
+
+ if (!is_res_lock && cur->active_locks) {
+ if (find_lock_state(env->cur_state, REF_TYPE_LOCK, 0, NULL)) {
+ verbose(env,
+ "Locking two bpf_spin_locks are not allowed\n");
+ return -EINVAL;
+ }
+ } else if (is_res_lock && cur->active_locks) {
+ if (find_lock_state(env->cur_state, REF_TYPE_RES_LOCK | REF_TYPE_RES_LOCK_IRQ, reg->id, ptr)) {
+ verbose(env, "Acquiring the same lock again, AA deadlock detected\n");
+ return -EINVAL;
+ }
+ }
+
+ if (is_res_lock && is_irq)
+ type = REF_TYPE_RES_LOCK_IRQ;
+ else if (is_res_lock)
+ type = REF_TYPE_RES_LOCK;
+ else
+ type = REF_TYPE_LOCK;
+ err = acquire_lock_state(env, env->insn_idx, type, reg->id, ptr);
+ if (err < 0) {
+ verbose(env, "Failed to acquire lock state\n");
+ return err;
+ }
} else {
- verbose("unsupported arg_type %d\n", arg_type);
+ void *ptr;
+ int type;
+
+ if (map)
+ ptr = map;
+ else
+ ptr = btf;
+
+ if (!cur->active_locks) {
+ verbose(env, "%s_unlock without taking a lock\n", lock_str);
+ return -EINVAL;
+ }
+
+ if (is_res_lock && is_irq)
+ type = REF_TYPE_RES_LOCK_IRQ;
+ else if (is_res_lock)
+ type = REF_TYPE_RES_LOCK;
+ else
+ type = REF_TYPE_LOCK;
+ if (!find_lock_state(cur, type, reg->id, ptr)) {
+ verbose(env, "%s_unlock of different lock\n", lock_str);
+ return -EINVAL;
+ }
+ if (reg->id != cur->active_lock_id || ptr != cur->active_lock_ptr) {
+ verbose(env, "%s_unlock cannot be out of order\n", lock_str);
+ return -EINVAL;
+ }
+ if (release_lock_state(cur, type, reg->id, ptr)) {
+ verbose(env, "%s_unlock of different lock\n", lock_str);
+ return -EINVAL;
+ }
+
+ invalidate_non_owning_refs(env);
+ }
+ return 0;
+}
+
+/* Check if @regno is a pointer to a specific field in a map value */
+static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno,
+ enum btf_field_type field_type)
+{
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ bool is_const = tnum_is_const(reg->var_off);
+ struct bpf_map *map = reg->map_ptr;
+ u64 val = reg->var_off.value;
+ const char *struct_name = btf_field_type_name(field_type);
+ int field_off = -1;
+
+ if (!is_const) {
+ verbose(env,
+ "R%d doesn't have constant offset. %s has to be at the constant offset\n",
+ regno, struct_name);
+ return -EINVAL;
+ }
+ if (!map->btf) {
+ verbose(env, "map '%s' has to have BTF in order to use %s\n", map->name,
+ struct_name);
+ return -EINVAL;
+ }
+ if (!btf_record_has_field(map->record, field_type)) {
+ verbose(env, "map '%s' has no valid %s\n", map->name, struct_name);
+ return -EINVAL;
+ }
+ switch (field_type) {
+ case BPF_TIMER:
+ field_off = map->record->timer_off;
+ break;
+ case BPF_TASK_WORK:
+ field_off = map->record->task_work_off;
+ break;
+ case BPF_WORKQUEUE:
+ field_off = map->record->wq_off;
+ break;
+ default:
+ verifier_bug(env, "unsupported BTF field type: %s\n", struct_name);
+ return -EINVAL;
+ }
+ if (field_off != val + reg->off) {
+ verbose(env, "off %lld doesn't point to 'struct %s' that is at %d\n",
+ val + reg->off, struct_name, field_off);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int process_timer_func(struct bpf_verifier_env *env, int regno,
+ struct bpf_call_arg_meta *meta)
+{
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ struct bpf_map *map = reg->map_ptr;
+ int err;
+
+ err = check_map_field_pointer(env, regno, BPF_TIMER);
+ if (err)
+ return err;
+
+ if (meta->map_ptr) {
+ verifier_bug(env, "Two map pointers in a timer helper");
+ return -EFAULT;
+ }
+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ verbose(env, "bpf_timer cannot be used for PREEMPT_RT.\n");
+ return -EOPNOTSUPP;
+ }
+ meta->map_uid = reg->map_uid;
+ meta->map_ptr = map;
+ return 0;
+}
+
+static int process_wq_func(struct bpf_verifier_env *env, int regno,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ struct bpf_map *map = reg->map_ptr;
+ int err;
+
+ err = check_map_field_pointer(env, regno, BPF_WORKQUEUE);
+ if (err)
+ return err;
+
+ if (meta->map.ptr) {
+ verifier_bug(env, "Two map pointers in a bpf_wq helper");
+ return -EFAULT;
+ }
+
+ meta->map.uid = reg->map_uid;
+ meta->map.ptr = map;
+ return 0;
+}
+
+static int process_task_work_func(struct bpf_verifier_env *env, int regno,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ struct bpf_map *map = reg->map_ptr;
+ int err;
+
+ err = check_map_field_pointer(env, regno, BPF_TASK_WORK);
+ if (err)
+ return err;
+
+ if (meta->map.ptr) {
+ verifier_bug(env, "Two map pointers in a bpf_task_work helper");
return -EFAULT;
}
+ meta->map.uid = reg->map_uid;
+ meta->map.ptr = map;
+ return 0;
+}
+
+static int process_kptr_func(struct bpf_verifier_env *env, int regno,
+ struct bpf_call_arg_meta *meta)
+{
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ struct btf_field *kptr_field;
+ struct bpf_map *map_ptr;
+ struct btf_record *rec;
+ u32 kptr_off;
+
+ if (type_is_ptr_alloc_obj(reg->type)) {
+ rec = reg_btf_record(reg);
+ } else { /* PTR_TO_MAP_VALUE */
+ map_ptr = reg->map_ptr;
+ if (!map_ptr->btf) {
+ verbose(env, "map '%s' has to have BTF in order to use bpf_kptr_xchg\n",
+ map_ptr->name);
+ return -EINVAL;
+ }
+ rec = map_ptr->record;
+ meta->map_ptr = map_ptr;
+ }
+
+ if (!tnum_is_const(reg->var_off)) {
+ verbose(env,
+ "R%d doesn't have constant offset. kptr has to be at the constant offset\n",
+ regno);
+ return -EINVAL;
+ }
- if (reg->type != expected_type) {
- verbose("R%d type=%s expected=%s\n", regno,
- reg_type_str[reg->type], reg_type_str[expected_type]);
+ if (!btf_record_has_field(rec, BPF_KPTR)) {
+ verbose(env, "R%d has no valid kptr\n", regno);
+ return -EINVAL;
+ }
+
+ kptr_off = reg->off + reg->var_off.value;
+ kptr_field = btf_record_find(rec, kptr_off, BPF_KPTR);
+ if (!kptr_field) {
+ verbose(env, "off=%d doesn't point to kptr\n", kptr_off);
+ return -EACCES;
+ }
+ if (kptr_field->type != BPF_KPTR_REF && kptr_field->type != BPF_KPTR_PERCPU) {
+ verbose(env, "off=%d kptr isn't referenced kptr\n", kptr_off);
return -EACCES;
}
+ meta->kptr_field = kptr_field;
+ return 0;
+}
- if (arg_type == ARG_CONST_MAP_PTR) {
- /* bpf_map_xxx(map_ptr) call: remember that map_ptr */
- *mapp = reg->map_ptr;
+/* There are two register types representing a bpf_dynptr, one is PTR_TO_STACK
+ * which points to a stack slot, and the other is CONST_PTR_TO_DYNPTR.
+ *
+ * In both cases we deal with the first 8 bytes, but need to mark the next 8
+ * bytes as STACK_DYNPTR in case of PTR_TO_STACK. In case of
+ * CONST_PTR_TO_DYNPTR, we are guaranteed to get the beginning of the object.
+ *
+ * Mutability of bpf_dynptr is at two levels, one is at the level of struct
+ * bpf_dynptr itself, i.e. whether the helper is receiving a pointer to struct
+ * bpf_dynptr or pointer to const struct bpf_dynptr. In the former case, it can
+ * mutate the view of the dynptr and also possibly destroy it. In the latter
+ * case, it cannot mutate the bpf_dynptr itself but it can still mutate the
+ * memory that dynptr points to.
+ *
+ * The verifier will keep track both levels of mutation (bpf_dynptr's in
+ * reg->type and the memory's in reg->dynptr.type), but there is no support for
+ * readonly dynptr view yet, hence only the first case is tracked and checked.
+ *
+ * This is consistent with how C applies the const modifier to a struct object,
+ * where the pointer itself inside bpf_dynptr becomes const but not what it
+ * points to.
+ *
+ * Helpers which do not mutate the bpf_dynptr set MEM_RDONLY in their argument
+ * type, and declare it as 'const struct bpf_dynptr *' in their prototype.
+ */
+static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn_idx,
+ enum bpf_arg_type arg_type, int clone_ref_obj_id)
+{
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ int err;
+
+ if (reg->type != PTR_TO_STACK && reg->type != CONST_PTR_TO_DYNPTR) {
+ verbose(env,
+ "arg#%d expected pointer to stack or const struct bpf_dynptr\n",
+ regno - 1);
+ return -EINVAL;
+ }
+
+ /* MEM_UNINIT and MEM_RDONLY are exclusive, when applied to an
+ * ARG_PTR_TO_DYNPTR (or ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_*):
+ */
+ if ((arg_type & (MEM_UNINIT | MEM_RDONLY)) == (MEM_UNINIT | MEM_RDONLY)) {
+ verifier_bug(env, "misconfigured dynptr helper type flags");
+ return -EFAULT;
+ }
+
+ /* MEM_UNINIT - Points to memory that is an appropriate candidate for
+ * constructing a mutable bpf_dynptr object.
+ *
+ * Currently, this is only possible with PTR_TO_STACK
+ * pointing to a region of at least 16 bytes which doesn't
+ * contain an existing bpf_dynptr.
+ *
+ * MEM_RDONLY - Points to a initialized bpf_dynptr that will not be
+ * mutated or destroyed. However, the memory it points to
+ * may be mutated.
+ *
+ * None - Points to a initialized dynptr that can be mutated and
+ * destroyed, including mutation of the memory it points
+ * to.
+ */
+ if (arg_type & MEM_UNINIT) {
+ int i;
+
+ if (!is_dynptr_reg_valid_uninit(env, reg)) {
+ verbose(env, "Dynptr has to be an uninitialized dynptr\n");
+ return -EINVAL;
+ }
+
+ /* we write BPF_DW bits (8 bytes) at a time */
+ for (i = 0; i < BPF_DYNPTR_SIZE; i += 8) {
+ err = check_mem_access(env, insn_idx, regno,
+ i, BPF_DW, BPF_WRITE, -1, false, false);
+ if (err)
+ return err;
+ }
+
+ err = mark_stack_slots_dynptr(env, reg, arg_type, insn_idx, clone_ref_obj_id);
+ } else /* MEM_RDONLY and None case from above */ {
+ /* For the reg->type == PTR_TO_STACK case, bpf_dynptr is never const */
+ if (reg->type == CONST_PTR_TO_DYNPTR && !(arg_type & MEM_RDONLY)) {
+ verbose(env, "cannot pass pointer to const bpf_dynptr, the helper mutates it\n");
+ return -EINVAL;
+ }
+
+ if (!is_dynptr_reg_valid_init(env, reg)) {
+ verbose(env,
+ "Expected an initialized dynptr as arg #%d\n",
+ regno - 1);
+ return -EINVAL;
+ }
+
+ /* Fold modifiers (in this case, MEM_RDONLY) when checking expected type */
+ if (!is_dynptr_type_expected(env, reg, arg_type & ~MEM_RDONLY)) {
+ verbose(env,
+ "Expected a dynptr of type %s as arg #%d\n",
+ dynptr_type_str(arg_to_dynptr_type(arg_type)), regno - 1);
+ return -EINVAL;
+ }
+
+ err = mark_dynptr_read(env, reg);
+ }
+ return err;
+}
+
+static u32 iter_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int spi)
+{
+ struct bpf_func_state *state = func(env, reg);
+
+ return state->stack[spi].spilled_ptr.ref_obj_id;
+}
+
+static bool is_iter_kfunc(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY);
+}
+
+static bool is_iter_new_kfunc(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_ITER_NEW;
+}
+
+static bool is_iter_next_kfunc(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_ITER_NEXT;
+}
+
+static bool is_iter_destroy_kfunc(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_ITER_DESTROY;
+}
+
+static bool is_kfunc_arg_iter(struct bpf_kfunc_call_arg_meta *meta, int arg_idx,
+ const struct btf_param *arg)
+{
+ /* btf_check_iter_kfuncs() guarantees that first argument of any iter
+ * kfunc is iter state pointer
+ */
+ if (is_iter_kfunc(meta))
+ return arg_idx == 0;
+
+ /* iter passed as an argument to a generic kfunc */
+ return btf_param_match_suffix(meta->btf, arg, "__iter");
+}
+
+static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_idx,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ const struct btf_type *t;
+ int spi, err, i, nr_slots, btf_id;
+
+ if (reg->type != PTR_TO_STACK) {
+ verbose(env, "arg#%d expected pointer to an iterator on stack\n", regno - 1);
+ return -EINVAL;
+ }
+
+ /* For iter_{new,next,destroy} functions, btf_check_iter_kfuncs()
+ * ensures struct convention, so we wouldn't need to do any BTF
+ * validation here. But given iter state can be passed as a parameter
+ * to any kfunc, if arg has "__iter" suffix, we need to be a bit more
+ * conservative here.
+ */
+ btf_id = btf_check_iter_arg(meta->btf, meta->func_proto, regno - 1);
+ if (btf_id < 0) {
+ verbose(env, "expected valid iter pointer as arg #%d\n", regno - 1);
+ return -EINVAL;
+ }
+ t = btf_type_by_id(meta->btf, btf_id);
+ nr_slots = t->size / BPF_REG_SIZE;
+
+ if (is_iter_new_kfunc(meta)) {
+ /* bpf_iter_<type>_new() expects pointer to uninit iter state */
+ if (!is_iter_reg_valid_uninit(env, reg, nr_slots)) {
+ verbose(env, "expected uninitialized iter_%s as arg #%d\n",
+ iter_type_str(meta->btf, btf_id), regno - 1);
+ return -EINVAL;
+ }
+
+ for (i = 0; i < nr_slots * 8; i += BPF_REG_SIZE) {
+ err = check_mem_access(env, insn_idx, regno,
+ i, BPF_DW, BPF_WRITE, -1, false, false);
+ if (err)
+ return err;
+ }
+
+ err = mark_stack_slots_iter(env, meta, reg, insn_idx, meta->btf, btf_id, nr_slots);
+ if (err)
+ return err;
+ } else {
+ /* iter_next() or iter_destroy(), as well as any kfunc
+ * accepting iter argument, expect initialized iter state
+ */
+ err = is_iter_reg_valid_init(env, reg, meta->btf, btf_id, nr_slots);
+ switch (err) {
+ case 0:
+ break;
+ case -EINVAL:
+ verbose(env, "expected an initialized iter_%s as arg #%d\n",
+ iter_type_str(meta->btf, btf_id), regno - 1);
+ return err;
+ case -EPROTO:
+ verbose(env, "expected an RCU CS when using %s\n", meta->func_name);
+ return err;
+ default:
+ return err;
+ }
+
+ spi = iter_get_spi(env, reg, nr_slots);
+ if (spi < 0)
+ return spi;
+
+ err = mark_iter_read(env, reg, spi, nr_slots);
+ if (err)
+ return err;
+
+ /* remember meta->iter info for process_iter_next_call() */
+ meta->iter.spi = spi;
+ meta->iter.frameno = reg->frameno;
+ meta->ref_obj_id = iter_ref_obj_id(env, reg, spi);
+
+ if (is_iter_destroy_kfunc(meta)) {
+ err = unmark_stack_slots_iter(env, reg, nr_slots);
+ if (err)
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+/* Look for a previous loop entry at insn_idx: nearest parent state
+ * stopped at insn_idx with callsites matching those in cur->frame.
+ */
+static struct bpf_verifier_state *find_prev_entry(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *cur,
+ int insn_idx)
+{
+ struct bpf_verifier_state_list *sl;
+ struct bpf_verifier_state *st;
+ struct list_head *pos, *head;
+
+ /* Explored states are pushed in stack order, most recent states come first */
+ head = explored_state(env, insn_idx);
+ list_for_each(pos, head) {
+ sl = container_of(pos, struct bpf_verifier_state_list, node);
+ /* If st->branches != 0 state is a part of current DFS verification path,
+ * hence cur & st for a loop.
+ */
+ st = &sl->state;
+ if (st->insn_idx == insn_idx && st->branches && same_callsites(st, cur) &&
+ st->dfs_depth < cur->dfs_depth)
+ return st;
+ }
+
+ return NULL;
+}
+
+static void reset_idmap_scratch(struct bpf_verifier_env *env);
+static bool regs_exact(const struct bpf_reg_state *rold,
+ const struct bpf_reg_state *rcur,
+ struct bpf_idmap *idmap);
+
+static void maybe_widen_reg(struct bpf_verifier_env *env,
+ struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
+ struct bpf_idmap *idmap)
+{
+ if (rold->type != SCALAR_VALUE)
+ return;
+ if (rold->type != rcur->type)
+ return;
+ if (rold->precise || rcur->precise || regs_exact(rold, rcur, idmap))
+ return;
+ __mark_reg_unknown(env, rcur);
+}
+
+static int widen_imprecise_scalars(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *old,
+ struct bpf_verifier_state *cur)
+{
+ struct bpf_func_state *fold, *fcur;
+ int i, fr, num_slots;
+
+ reset_idmap_scratch(env);
+ for (fr = old->curframe; fr >= 0; fr--) {
+ fold = old->frame[fr];
+ fcur = cur->frame[fr];
+
+ for (i = 0; i < MAX_BPF_REG; i++)
+ maybe_widen_reg(env,
+ &fold->regs[i],
+ &fcur->regs[i],
+ &env->idmap_scratch);
+
+ num_slots = min(fold->allocated_stack / BPF_REG_SIZE,
+ fcur->allocated_stack / BPF_REG_SIZE);
+ for (i = 0; i < num_slots; i++) {
+ if (!is_spilled_reg(&fold->stack[i]) ||
+ !is_spilled_reg(&fcur->stack[i]))
+ continue;
+
+ maybe_widen_reg(env,
+ &fold->stack[i].spilled_ptr,
+ &fcur->stack[i].spilled_ptr,
+ &env->idmap_scratch);
+ }
+ }
+ return 0;
+}
+
+static struct bpf_reg_state *get_iter_from_state(struct bpf_verifier_state *cur_st,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ int iter_frameno = meta->iter.frameno;
+ int iter_spi = meta->iter.spi;
+
+ return &cur_st->frame[iter_frameno]->stack[iter_spi].spilled_ptr;
+}
+
+/* process_iter_next_call() is called when verifier gets to iterator's next
+ * "method" (e.g., bpf_iter_num_next() for numbers iterator) call. We'll refer
+ * to it as just "iter_next()" in comments below.
+ *
+ * BPF verifier relies on a crucial contract for any iter_next()
+ * implementation: it should *eventually* return NULL, and once that happens
+ * it should keep returning NULL. That is, once iterator exhausts elements to
+ * iterate, it should never reset or spuriously return new elements.
+ *
+ * With the assumption of such contract, process_iter_next_call() simulates
+ * a fork in the verifier state to validate loop logic correctness and safety
+ * without having to simulate infinite amount of iterations.
+ *
+ * In current state, we first assume that iter_next() returned NULL and
+ * iterator state is set to DRAINED (BPF_ITER_STATE_DRAINED). In such
+ * conditions we should not form an infinite loop and should eventually reach
+ * exit.
+ *
+ * Besides that, we also fork current state and enqueue it for later
+ * verification. In a forked state we keep iterator state as ACTIVE
+ * (BPF_ITER_STATE_ACTIVE) and assume non-NULL return from iter_next(). We
+ * also bump iteration depth to prevent erroneous infinite loop detection
+ * later on (see iter_active_depths_differ() comment for details). In this
+ * state we assume that we'll eventually loop back to another iter_next()
+ * calls (it could be in exactly same location or in some other instruction,
+ * it doesn't matter, we don't make any unnecessary assumptions about this,
+ * everything revolves around iterator state in a stack slot, not which
+ * instruction is calling iter_next()). When that happens, we either will come
+ * to iter_next() with equivalent state and can conclude that next iteration
+ * will proceed in exactly the same way as we just verified, so it's safe to
+ * assume that loop converges. If not, we'll go on another iteration
+ * simulation with a different input state, until all possible starting states
+ * are validated or we reach maximum number of instructions limit.
+ *
+ * This way, we will either exhaustively discover all possible input states
+ * that iterator loop can start with and eventually will converge, or we'll
+ * effectively regress into bounded loop simulation logic and either reach
+ * maximum number of instructions if loop is not provably convergent, or there
+ * is some statically known limit on number of iterations (e.g., if there is
+ * an explicit `if n > 100 then break;` statement somewhere in the loop).
+ *
+ * Iteration convergence logic in is_state_visited() relies on exact
+ * states comparison, which ignores read and precision marks.
+ * This is necessary because read and precision marks are not finalized
+ * while in the loop. Exact comparison might preclude convergence for
+ * simple programs like below:
+ *
+ * i = 0;
+ * while(iter_next(&it))
+ * i++;
+ *
+ * At each iteration step i++ would produce a new distinct state and
+ * eventually instruction processing limit would be reached.
+ *
+ * To avoid such behavior speculatively forget (widen) range for
+ * imprecise scalar registers, if those registers were not precise at the
+ * end of the previous iteration and do not match exactly.
+ *
+ * This is a conservative heuristic that allows to verify wide range of programs,
+ * however it precludes verification of programs that conjure an
+ * imprecise value on the first loop iteration and use it as precise on a second.
+ * For example, the following safe program would fail to verify:
+ *
+ * struct bpf_num_iter it;
+ * int arr[10];
+ * int i = 0, a = 0;
+ * bpf_iter_num_new(&it, 0, 10);
+ * while (bpf_iter_num_next(&it)) {
+ * if (a == 0) {
+ * a = 1;
+ * i = 7; // Because i changed verifier would forget
+ * // it's range on second loop entry.
+ * } else {
+ * arr[i] = 42; // This would fail to verify.
+ * }
+ * }
+ * bpf_iter_num_destroy(&it);
+ */
+static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ struct bpf_verifier_state *cur_st = env->cur_state, *queued_st, *prev_st;
+ struct bpf_func_state *cur_fr = cur_st->frame[cur_st->curframe], *queued_fr;
+ struct bpf_reg_state *cur_iter, *queued_iter;
+
+ BTF_TYPE_EMIT(struct bpf_iter);
+
+ cur_iter = get_iter_from_state(cur_st, meta);
+
+ if (cur_iter->iter.state != BPF_ITER_STATE_ACTIVE &&
+ cur_iter->iter.state != BPF_ITER_STATE_DRAINED) {
+ verifier_bug(env, "unexpected iterator state %d (%s)",
+ cur_iter->iter.state, iter_state_str(cur_iter->iter.state));
+ return -EFAULT;
+ }
+
+ if (cur_iter->iter.state == BPF_ITER_STATE_ACTIVE) {
+ /* Because iter_next() call is a checkpoint is_state_visitied()
+ * should guarantee parent state with same call sites and insn_idx.
+ */
+ if (!cur_st->parent || cur_st->parent->insn_idx != insn_idx ||
+ !same_callsites(cur_st->parent, cur_st)) {
+ verifier_bug(env, "bad parent state for iter next call");
+ return -EFAULT;
+ }
+ /* Note cur_st->parent in the call below, it is necessary to skip
+ * checkpoint created for cur_st by is_state_visited()
+ * right at this instruction.
+ */
+ prev_st = find_prev_entry(env, cur_st->parent, insn_idx);
+ /* branch out active iter state */
+ queued_st = push_stack(env, insn_idx + 1, insn_idx, false);
+ if (IS_ERR(queued_st))
+ return PTR_ERR(queued_st);
+
+ queued_iter = get_iter_from_state(queued_st, meta);
+ queued_iter->iter.state = BPF_ITER_STATE_ACTIVE;
+ queued_iter->iter.depth++;
+ if (prev_st)
+ widen_imprecise_scalars(env, prev_st, queued_st);
+
+ queued_fr = queued_st->frame[queued_st->curframe];
+ mark_ptr_not_null_reg(&queued_fr->regs[BPF_REG_0]);
+ }
+
+ /* switch to DRAINED state, but keep the depth unchanged */
+ /* mark current iter state as drained and assume returned NULL */
+ cur_iter->iter.state = BPF_ITER_STATE_DRAINED;
+ __mark_reg_const_zero(env, &cur_fr->regs[BPF_REG_0]);
+
+ return 0;
+}
+
+static bool arg_type_is_mem_size(enum bpf_arg_type type)
+{
+ return type == ARG_CONST_SIZE ||
+ type == ARG_CONST_SIZE_OR_ZERO;
+}
+
+static bool arg_type_is_raw_mem(enum bpf_arg_type type)
+{
+ return base_type(type) == ARG_PTR_TO_MEM &&
+ type & MEM_UNINIT;
+}
+
+static bool arg_type_is_release(enum bpf_arg_type type)
+{
+ return type & OBJ_RELEASE;
+}
+
+static bool arg_type_is_dynptr(enum bpf_arg_type type)
+{
+ return base_type(type) == ARG_PTR_TO_DYNPTR;
+}
+
+static int resolve_map_arg_type(struct bpf_verifier_env *env,
+ const struct bpf_call_arg_meta *meta,
+ enum bpf_arg_type *arg_type)
+{
+ if (!meta->map_ptr) {
+ /* kernel subsystem misconfigured verifier */
+ verifier_bug(env, "invalid map_ptr to access map->type");
+ return -EFAULT;
+ }
+
+ switch (meta->map_ptr->map_type) {
+ case BPF_MAP_TYPE_SOCKMAP:
+ case BPF_MAP_TYPE_SOCKHASH:
+ if (*arg_type == ARG_PTR_TO_MAP_VALUE) {
+ *arg_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON;
+ } else {
+ verbose(env, "invalid arg_type for sockmap/sockhash\n");
+ return -EINVAL;
+ }
+ break;
+ case BPF_MAP_TYPE_BLOOM_FILTER:
+ if (meta->func_id == BPF_FUNC_map_peek_elem)
+ *arg_type = ARG_PTR_TO_MAP_VALUE;
+ break;
+ default:
+ break;
+ }
+ return 0;
+}
+
+struct bpf_reg_types {
+ const enum bpf_reg_type types[10];
+ u32 *btf_id;
+};
+
+static const struct bpf_reg_types sock_types = {
+ .types = {
+ PTR_TO_SOCK_COMMON,
+ PTR_TO_SOCKET,
+ PTR_TO_TCP_SOCK,
+ PTR_TO_XDP_SOCK,
+ },
+};
+
+#ifdef CONFIG_NET
+static const struct bpf_reg_types btf_id_sock_common_types = {
+ .types = {
+ PTR_TO_SOCK_COMMON,
+ PTR_TO_SOCKET,
+ PTR_TO_TCP_SOCK,
+ PTR_TO_XDP_SOCK,
+ PTR_TO_BTF_ID,
+ PTR_TO_BTF_ID | PTR_TRUSTED,
+ },
+ .btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON],
+};
+#endif
+
+static const struct bpf_reg_types mem_types = {
+ .types = {
+ PTR_TO_STACK,
+ PTR_TO_PACKET,
+ PTR_TO_PACKET_META,
+ PTR_TO_MAP_KEY,
+ PTR_TO_MAP_VALUE,
+ PTR_TO_MEM,
+ PTR_TO_MEM | MEM_RINGBUF,
+ PTR_TO_BUF,
+ PTR_TO_BTF_ID | PTR_TRUSTED,
+ },
+};
+
+static const struct bpf_reg_types spin_lock_types = {
+ .types = {
+ PTR_TO_MAP_VALUE,
+ PTR_TO_BTF_ID | MEM_ALLOC,
+ }
+};
+
+static const struct bpf_reg_types fullsock_types = { .types = { PTR_TO_SOCKET } };
+static const struct bpf_reg_types scalar_types = { .types = { SCALAR_VALUE } };
+static const struct bpf_reg_types context_types = { .types = { PTR_TO_CTX } };
+static const struct bpf_reg_types ringbuf_mem_types = { .types = { PTR_TO_MEM | MEM_RINGBUF } };
+static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_TO_MAP } };
+static const struct bpf_reg_types btf_ptr_types = {
+ .types = {
+ PTR_TO_BTF_ID,
+ PTR_TO_BTF_ID | PTR_TRUSTED,
+ PTR_TO_BTF_ID | MEM_RCU,
+ },
+};
+static const struct bpf_reg_types percpu_btf_ptr_types = {
+ .types = {
+ PTR_TO_BTF_ID | MEM_PERCPU,
+ PTR_TO_BTF_ID | MEM_PERCPU | MEM_RCU,
+ PTR_TO_BTF_ID | MEM_PERCPU | PTR_TRUSTED,
+ }
+};
+static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } };
+static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } };
+static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } };
+static const struct bpf_reg_types timer_types = { .types = { PTR_TO_MAP_VALUE } };
+static const struct bpf_reg_types kptr_xchg_dest_types = {
+ .types = {
+ PTR_TO_MAP_VALUE,
+ PTR_TO_BTF_ID | MEM_ALLOC
+ }
+};
+static const struct bpf_reg_types dynptr_types = {
+ .types = {
+ PTR_TO_STACK,
+ CONST_PTR_TO_DYNPTR,
+ }
+};
+
+static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
+ [ARG_PTR_TO_MAP_KEY] = &mem_types,
+ [ARG_PTR_TO_MAP_VALUE] = &mem_types,
+ [ARG_CONST_SIZE] = &scalar_types,
+ [ARG_CONST_SIZE_OR_ZERO] = &scalar_types,
+ [ARG_CONST_ALLOC_SIZE_OR_ZERO] = &scalar_types,
+ [ARG_CONST_MAP_PTR] = &const_map_ptr_types,
+ [ARG_PTR_TO_CTX] = &context_types,
+ [ARG_PTR_TO_SOCK_COMMON] = &sock_types,
+#ifdef CONFIG_NET
+ [ARG_PTR_TO_BTF_ID_SOCK_COMMON] = &btf_id_sock_common_types,
+#endif
+ [ARG_PTR_TO_SOCKET] = &fullsock_types,
+ [ARG_PTR_TO_BTF_ID] = &btf_ptr_types,
+ [ARG_PTR_TO_SPIN_LOCK] = &spin_lock_types,
+ [ARG_PTR_TO_MEM] = &mem_types,
+ [ARG_PTR_TO_RINGBUF_MEM] = &ringbuf_mem_types,
+ [ARG_PTR_TO_PERCPU_BTF_ID] = &percpu_btf_ptr_types,
+ [ARG_PTR_TO_FUNC] = &func_ptr_types,
+ [ARG_PTR_TO_STACK] = &stack_ptr_types,
+ [ARG_PTR_TO_CONST_STR] = &const_str_ptr_types,
+ [ARG_PTR_TO_TIMER] = &timer_types,
+ [ARG_KPTR_XCHG_DEST] = &kptr_xchg_dest_types,
+ [ARG_PTR_TO_DYNPTR] = &dynptr_types,
+};
+
+static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
+ enum bpf_arg_type arg_type,
+ const u32 *arg_btf_id,
+ struct bpf_call_arg_meta *meta)
+{
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ enum bpf_reg_type expected, type = reg->type;
+ const struct bpf_reg_types *compatible;
+ int i, j;
+
+ compatible = compatible_reg_types[base_type(arg_type)];
+ if (!compatible) {
+ verifier_bug(env, "unsupported arg type %d", arg_type);
+ return -EFAULT;
+ }
+
+ /* ARG_PTR_TO_MEM + RDONLY is compatible with PTR_TO_MEM and PTR_TO_MEM + RDONLY,
+ * but ARG_PTR_TO_MEM is compatible only with PTR_TO_MEM and NOT with PTR_TO_MEM + RDONLY
+ *
+ * Same for MAYBE_NULL:
+ *
+ * ARG_PTR_TO_MEM + MAYBE_NULL is compatible with PTR_TO_MEM and PTR_TO_MEM + MAYBE_NULL,
+ * but ARG_PTR_TO_MEM is compatible only with PTR_TO_MEM but NOT with PTR_TO_MEM + MAYBE_NULL
+ *
+ * ARG_PTR_TO_MEM is compatible with PTR_TO_MEM that is tagged with a dynptr type.
+ *
+ * Therefore we fold these flags depending on the arg_type before comparison.
+ */
+ if (arg_type & MEM_RDONLY)
+ type &= ~MEM_RDONLY;
+ if (arg_type & PTR_MAYBE_NULL)
+ type &= ~PTR_MAYBE_NULL;
+ if (base_type(arg_type) == ARG_PTR_TO_MEM)
+ type &= ~DYNPTR_TYPE_FLAG_MASK;
+
+ /* Local kptr types are allowed as the source argument of bpf_kptr_xchg */
+ if (meta->func_id == BPF_FUNC_kptr_xchg && type_is_alloc(type) && regno == BPF_REG_2) {
+ type &= ~MEM_ALLOC;
+ type &= ~MEM_PERCPU;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(compatible->types); i++) {
+ expected = compatible->types[i];
+ if (expected == NOT_INIT)
+ break;
+
+ if (type == expected)
+ goto found;
+ }
+
+ verbose(env, "R%d type=%s expected=", regno, reg_type_str(env, reg->type));
+ for (j = 0; j + 1 < i; j++)
+ verbose(env, "%s, ", reg_type_str(env, compatible->types[j]));
+ verbose(env, "%s\n", reg_type_str(env, compatible->types[j]));
+ return -EACCES;
+
+found:
+ if (base_type(reg->type) != PTR_TO_BTF_ID)
+ return 0;
+
+ if (compatible == &mem_types) {
+ if (!(arg_type & MEM_RDONLY)) {
+ verbose(env,
+ "%s() may write into memory pointed by R%d type=%s\n",
+ func_id_name(meta->func_id),
+ regno, reg_type_str(env, reg->type));
+ return -EACCES;
+ }
+ return 0;
+ }
+
+ switch ((int)reg->type) {
+ case PTR_TO_BTF_ID:
+ case PTR_TO_BTF_ID | PTR_TRUSTED:
+ case PTR_TO_BTF_ID | PTR_TRUSTED | PTR_MAYBE_NULL:
+ case PTR_TO_BTF_ID | MEM_RCU:
+ case PTR_TO_BTF_ID | PTR_MAYBE_NULL:
+ case PTR_TO_BTF_ID | PTR_MAYBE_NULL | MEM_RCU:
+ {
+ /* For bpf_sk_release, it needs to match against first member
+ * 'struct sock_common', hence make an exception for it. This
+ * allows bpf_sk_release to work for multiple socket types.
+ */
+ bool strict_type_match = arg_type_is_release(arg_type) &&
+ meta->func_id != BPF_FUNC_sk_release;
+
+ if (type_may_be_null(reg->type) &&
+ (!type_may_be_null(arg_type) || arg_type_is_release(arg_type))) {
+ verbose(env, "Possibly NULL pointer passed to helper arg%d\n", regno);
+ return -EACCES;
+ }
+
+ if (!arg_btf_id) {
+ if (!compatible->btf_id) {
+ verifier_bug(env, "missing arg compatible BTF ID");
+ return -EFAULT;
+ }
+ arg_btf_id = compatible->btf_id;
+ }
+
+ if (meta->func_id == BPF_FUNC_kptr_xchg) {
+ if (map_kptr_match_type(env, meta->kptr_field, reg, regno))
+ return -EACCES;
+ } else {
+ if (arg_btf_id == BPF_PTR_POISON) {
+ verbose(env, "verifier internal error:");
+ verbose(env, "R%d has non-overwritten BPF_PTR_POISON type\n",
+ regno);
+ return -EACCES;
+ }
+
+ if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off,
+ btf_vmlinux, *arg_btf_id,
+ strict_type_match)) {
+ verbose(env, "R%d is of type %s but %s is expected\n",
+ regno, btf_type_name(reg->btf, reg->btf_id),
+ btf_type_name(btf_vmlinux, *arg_btf_id));
+ return -EACCES;
+ }
+ }
+ break;
+ }
+ case PTR_TO_BTF_ID | MEM_ALLOC:
+ case PTR_TO_BTF_ID | MEM_PERCPU | MEM_ALLOC:
+ if (meta->func_id != BPF_FUNC_spin_lock && meta->func_id != BPF_FUNC_spin_unlock &&
+ meta->func_id != BPF_FUNC_kptr_xchg) {
+ verifier_bug(env, "unimplemented handling of MEM_ALLOC");
+ return -EFAULT;
+ }
+ /* Check if local kptr in src arg matches kptr in dst arg */
+ if (meta->func_id == BPF_FUNC_kptr_xchg && regno == BPF_REG_2) {
+ if (map_kptr_match_type(env, meta->kptr_field, reg, regno))
+ return -EACCES;
+ }
+ break;
+ case PTR_TO_BTF_ID | MEM_PERCPU:
+ case PTR_TO_BTF_ID | MEM_PERCPU | MEM_RCU:
+ case PTR_TO_BTF_ID | MEM_PERCPU | PTR_TRUSTED:
+ /* Handled by helper specific checks */
+ break;
+ default:
+ verifier_bug(env, "invalid PTR_TO_BTF_ID register for type match");
+ return -EFAULT;
+ }
+ return 0;
+}
+
+static struct btf_field *
+reg_find_field_offset(const struct bpf_reg_state *reg, s32 off, u32 fields)
+{
+ struct btf_field *field;
+ struct btf_record *rec;
+
+ rec = reg_btf_record(reg);
+ if (!rec)
+ return NULL;
+
+ field = btf_record_find(rec, off, fields);
+ if (!field)
+ return NULL;
+
+ return field;
+}
+
+static int check_func_arg_reg_off(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg, int regno,
+ enum bpf_arg_type arg_type)
+{
+ u32 type = reg->type;
+
+ /* When referenced register is passed to release function, its fixed
+ * offset must be 0.
+ *
+ * We will check arg_type_is_release reg has ref_obj_id when storing
+ * meta->release_regno.
+ */
+ if (arg_type_is_release(arg_type)) {
+ /* ARG_PTR_TO_DYNPTR with OBJ_RELEASE is a bit special, as it
+ * may not directly point to the object being released, but to
+ * dynptr pointing to such object, which might be at some offset
+ * on the stack. In that case, we simply to fallback to the
+ * default handling.
+ */
+ if (arg_type_is_dynptr(arg_type) && type == PTR_TO_STACK)
+ return 0;
+
+ /* Doing check_ptr_off_reg check for the offset will catch this
+ * because fixed_off_ok is false, but checking here allows us
+ * to give the user a better error message.
+ */
+ if (reg->off) {
+ verbose(env, "R%d must have zero offset when passed to release func or trusted arg to kfunc\n",
+ regno);
+ return -EINVAL;
+ }
+ return __check_ptr_off_reg(env, reg, regno, false);
+ }
+
+ switch (type) {
+ /* Pointer types where both fixed and variable offset is explicitly allowed: */
+ case PTR_TO_STACK:
+ case PTR_TO_PACKET:
+ case PTR_TO_PACKET_META:
+ case PTR_TO_MAP_KEY:
+ case PTR_TO_MAP_VALUE:
+ case PTR_TO_MEM:
+ case PTR_TO_MEM | MEM_RDONLY:
+ case PTR_TO_MEM | MEM_RINGBUF:
+ case PTR_TO_BUF:
+ case PTR_TO_BUF | MEM_RDONLY:
+ case PTR_TO_ARENA:
+ case SCALAR_VALUE:
+ return 0;
+ /* All the rest must be rejected, except PTR_TO_BTF_ID which allows
+ * fixed offset.
+ */
+ case PTR_TO_BTF_ID:
+ case PTR_TO_BTF_ID | MEM_ALLOC:
+ case PTR_TO_BTF_ID | PTR_TRUSTED:
+ case PTR_TO_BTF_ID | MEM_RCU:
+ case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF:
+ case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF | MEM_RCU:
+ /* When referenced PTR_TO_BTF_ID is passed to release function,
+ * its fixed offset must be 0. In the other cases, fixed offset
+ * can be non-zero. This was already checked above. So pass
+ * fixed_off_ok as true to allow fixed offset for all other
+ * cases. var_off always must be 0 for PTR_TO_BTF_ID, hence we
+ * still need to do checks instead of returning.
+ */
+ return __check_ptr_off_reg(env, reg, regno, true);
+ default:
+ return __check_ptr_off_reg(env, reg, regno, false);
+ }
+}
+
+static struct bpf_reg_state *get_dynptr_arg_reg(struct bpf_verifier_env *env,
+ const struct bpf_func_proto *fn,
+ struct bpf_reg_state *regs)
+{
+ struct bpf_reg_state *state = NULL;
+ int i;
+
+ for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++)
+ if (arg_type_is_dynptr(fn->arg_type[i])) {
+ if (state) {
+ verbose(env, "verifier internal error: multiple dynptr args\n");
+ return NULL;
+ }
+ state = &regs[BPF_REG_1 + i];
+ }
+
+ if (!state)
+ verbose(env, "verifier internal error: no dynptr arg found\n");
+
+ return state;
+}
+
+static int dynptr_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int spi;
+
+ if (reg->type == CONST_PTR_TO_DYNPTR)
+ return reg->id;
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0)
+ return spi;
+ return state->stack[spi].spilled_ptr.id;
+}
+
+static int dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int spi;
+
+ if (reg->type == CONST_PTR_TO_DYNPTR)
+ return reg->ref_obj_id;
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0)
+ return spi;
+ return state->stack[spi].spilled_ptr.ref_obj_id;
+}
+
+static enum bpf_dynptr_type dynptr_get_type(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int spi;
+
+ if (reg->type == CONST_PTR_TO_DYNPTR)
+ return reg->dynptr.type;
+
+ spi = __get_spi(reg->off);
+ if (spi < 0) {
+ verbose(env, "verifier internal error: invalid spi when querying dynptr type\n");
+ return BPF_DYNPTR_TYPE_INVALID;
+ }
+
+ return state->stack[spi].spilled_ptr.dynptr.type;
+}
+
+static int check_reg_const_str(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, u32 regno)
+{
+ struct bpf_map *map = reg->map_ptr;
+ int err;
+ int map_off;
+ u64 map_addr;
+ char *str_ptr;
- } else if (arg_type == ARG_PTR_TO_MAP_KEY) {
+ if (reg->type != PTR_TO_MAP_VALUE)
+ return -EINVAL;
+
+ if (!bpf_map_is_rdonly(map)) {
+ verbose(env, "R%d does not point to a readonly map'\n", regno);
+ return -EACCES;
+ }
+
+ if (!tnum_is_const(reg->var_off)) {
+ verbose(env, "R%d is not a constant address'\n", regno);
+ return -EACCES;
+ }
+
+ if (!map->ops->map_direct_value_addr) {
+ verbose(env, "no direct value access support for this map type\n");
+ return -EACCES;
+ }
+
+ err = check_map_access(env, regno, reg->off,
+ map->value_size - reg->off, false,
+ ACCESS_HELPER);
+ if (err)
+ return err;
+
+ map_off = reg->off + reg->var_off.value;
+ err = map->ops->map_direct_value_addr(map, &map_addr, map_off);
+ if (err) {
+ verbose(env, "direct value access on string failed\n");
+ return err;
+ }
+
+ str_ptr = (char *)(long)(map_addr);
+ if (!strnchr(str_ptr + map_off, map->value_size - map_off, 0)) {
+ verbose(env, "string is not zero-terminated\n");
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/* Returns constant key value in `value` if possible, else negative error */
+static int get_constant_map_key(struct bpf_verifier_env *env,
+ struct bpf_reg_state *key,
+ u32 key_size,
+ s64 *value)
+{
+ struct bpf_func_state *state = func(env, key);
+ struct bpf_reg_state *reg;
+ int slot, spi, off;
+ int spill_size = 0;
+ int zero_size = 0;
+ int stack_off;
+ int i, err;
+ u8 *stype;
+
+ if (!env->bpf_capable)
+ return -EOPNOTSUPP;
+ if (key->type != PTR_TO_STACK)
+ return -EOPNOTSUPP;
+ if (!tnum_is_const(key->var_off))
+ return -EOPNOTSUPP;
+
+ stack_off = key->off + key->var_off.value;
+ slot = -stack_off - 1;
+ spi = slot / BPF_REG_SIZE;
+ off = slot % BPF_REG_SIZE;
+ stype = state->stack[spi].slot_type;
+
+ /* First handle precisely tracked STACK_ZERO */
+ for (i = off; i >= 0 && stype[i] == STACK_ZERO; i--)
+ zero_size++;
+ if (zero_size >= key_size) {
+ *value = 0;
+ return 0;
+ }
+
+ /* Check that stack contains a scalar spill of expected size */
+ if (!is_spilled_scalar_reg(&state->stack[spi]))
+ return -EOPNOTSUPP;
+ for (i = off; i >= 0 && stype[i] == STACK_SPILL; i--)
+ spill_size++;
+ if (spill_size != key_size)
+ return -EOPNOTSUPP;
+
+ reg = &state->stack[spi].spilled_ptr;
+ if (!tnum_is_const(reg->var_off))
+ /* Stack value not statically known */
+ return -EOPNOTSUPP;
+
+ /* We are relying on a constant value. So mark as precise
+ * to prevent pruning on it.
+ */
+ bt_set_frame_slot(&env->bt, key->frameno, spi);
+ err = mark_chain_precision_batch(env, env->cur_state);
+ if (err < 0)
+ return err;
+
+ *value = reg->var_off.value;
+ return 0;
+}
+
+static bool can_elide_value_nullness(enum bpf_map_type type);
+
+static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
+ struct bpf_call_arg_meta *meta,
+ const struct bpf_func_proto *fn,
+ int insn_idx)
+{
+ u32 regno = BPF_REG_1 + arg;
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ enum bpf_arg_type arg_type = fn->arg_type[arg];
+ enum bpf_reg_type type = reg->type;
+ u32 *arg_btf_id = NULL;
+ u32 key_size;
+ int err = 0;
+
+ if (arg_type == ARG_DONTCARE)
+ return 0;
+
+ err = check_reg_arg(env, regno, SRC_OP);
+ if (err)
+ return err;
+
+ if (arg_type == ARG_ANYTHING) {
+ if (is_pointer_value(env, regno)) {
+ verbose(env, "R%d leaks addr into helper function\n",
+ regno);
+ return -EACCES;
+ }
+ return 0;
+ }
+
+ if (type_is_pkt_pointer(type) &&
+ !may_access_direct_pkt_data(env, meta, BPF_READ)) {
+ verbose(env, "helper access to the packet is not allowed\n");
+ return -EACCES;
+ }
+
+ if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE) {
+ err = resolve_map_arg_type(env, meta, &arg_type);
+ if (err)
+ return err;
+ }
+
+ if (register_is_null(reg) && type_may_be_null(arg_type))
+ /* A NULL register has a SCALAR_VALUE type, so skip
+ * type checking.
+ */
+ goto skip_type_check;
+
+ /* arg_btf_id and arg_size are in a union. */
+ if (base_type(arg_type) == ARG_PTR_TO_BTF_ID ||
+ base_type(arg_type) == ARG_PTR_TO_SPIN_LOCK)
+ arg_btf_id = fn->arg_btf_id[arg];
+
+ err = check_reg_type(env, regno, arg_type, arg_btf_id, meta);
+ if (err)
+ return err;
+
+ err = check_func_arg_reg_off(env, reg, regno, arg_type);
+ if (err)
+ return err;
+
+skip_type_check:
+ if (arg_type_is_release(arg_type)) {
+ if (arg_type_is_dynptr(arg_type)) {
+ struct bpf_func_state *state = func(env, reg);
+ int spi;
+
+ /* Only dynptr created on stack can be released, thus
+ * the get_spi and stack state checks for spilled_ptr
+ * should only be done before process_dynptr_func for
+ * PTR_TO_STACK.
+ */
+ if (reg->type == PTR_TO_STACK) {
+ spi = dynptr_get_spi(env, reg);
+ if (spi < 0 || !state->stack[spi].spilled_ptr.ref_obj_id) {
+ verbose(env, "arg %d is an unacquired reference\n", regno);
+ return -EINVAL;
+ }
+ } else {
+ verbose(env, "cannot release unowned const bpf_dynptr\n");
+ return -EINVAL;
+ }
+ } else if (!reg->ref_obj_id && !register_is_null(reg)) {
+ verbose(env, "R%d must be referenced when passed to release function\n",
+ regno);
+ return -EINVAL;
+ }
+ if (meta->release_regno) {
+ verifier_bug(env, "more than one release argument");
+ return -EFAULT;
+ }
+ meta->release_regno = regno;
+ }
+
+ if (reg->ref_obj_id && base_type(arg_type) != ARG_KPTR_XCHG_DEST) {
+ if (meta->ref_obj_id) {
+ verbose(env, "more than one arg with ref_obj_id R%d %u %u",
+ regno, reg->ref_obj_id,
+ meta->ref_obj_id);
+ return -EACCES;
+ }
+ meta->ref_obj_id = reg->ref_obj_id;
+ }
+
+ switch (base_type(arg_type)) {
+ case ARG_CONST_MAP_PTR:
+ /* bpf_map_xxx(map_ptr) call: remember that map_ptr */
+ if (meta->map_ptr) {
+ /* Use map_uid (which is unique id of inner map) to reject:
+ * inner_map1 = bpf_map_lookup_elem(outer_map, key1)
+ * inner_map2 = bpf_map_lookup_elem(outer_map, key2)
+ * if (inner_map1 && inner_map2) {
+ * timer = bpf_map_lookup_elem(inner_map1);
+ * if (timer)
+ * // mismatch would have been allowed
+ * bpf_timer_init(timer, inner_map2);
+ * }
+ *
+ * Comparing map_ptr is enough to distinguish normal and outer maps.
+ */
+ if (meta->map_ptr != reg->map_ptr ||
+ meta->map_uid != reg->map_uid) {
+ verbose(env,
+ "timer pointer in R1 map_uid=%d doesn't match map pointer in R2 map_uid=%d\n",
+ meta->map_uid, reg->map_uid);
+ return -EINVAL;
+ }
+ }
+ meta->map_ptr = reg->map_ptr;
+ meta->map_uid = reg->map_uid;
+ break;
+ case ARG_PTR_TO_MAP_KEY:
/* bpf_map_xxx(..., map_ptr, ..., key) call:
* check that [key, key + map->key_size) are within
* stack limits and initialized
*/
- if (!*mapp) {
+ if (!meta->map_ptr) {
/* in function declaration map_ptr must come before
* map_key, so that it's verified and known before
* we have to check map_key here. Otherwise it means
* that kernel subsystem misconfigured verifier
*/
- verbose("invalid map_ptr to access map->key\n");
- return -EACCES;
+ verifier_bug(env, "invalid map_ptr to access map->key");
+ return -EFAULT;
+ }
+ key_size = meta->map_ptr->key_size;
+ err = check_helper_mem_access(env, regno, key_size, BPF_READ, false, NULL);
+ if (err)
+ return err;
+ if (can_elide_value_nullness(meta->map_ptr->map_type)) {
+ err = get_constant_map_key(env, reg, key_size, &meta->const_map_key);
+ if (err < 0) {
+ meta->const_map_key = -1;
+ if (err == -EOPNOTSUPP)
+ err = 0;
+ else
+ return err;
+ }
}
- err = check_stack_boundary(env, regno, (*mapp)->key_size);
+ break;
+ case ARG_PTR_TO_MAP_VALUE:
+ if (type_may_be_null(arg_type) && register_is_null(reg))
+ return 0;
- } else if (arg_type == ARG_PTR_TO_MAP_VALUE) {
/* bpf_map_xxx(..., map_ptr, ..., value) call:
* check [value, value + map->value_size) validity
*/
- if (!*mapp) {
+ if (!meta->map_ptr) {
/* kernel subsystem misconfigured verifier */
- verbose("invalid map_ptr to access map->value\n");
+ verifier_bug(env, "invalid map_ptr to access map->value");
+ return -EFAULT;
+ }
+ meta->raw_mode = arg_type & MEM_UNINIT;
+ err = check_helper_mem_access(env, regno, meta->map_ptr->value_size,
+ arg_type & MEM_WRITE ? BPF_WRITE : BPF_READ,
+ false, meta);
+ break;
+ case ARG_PTR_TO_PERCPU_BTF_ID:
+ if (!reg->btf_id) {
+ verbose(env, "Helper has invalid btf_id in R%d\n", regno);
+ return -EACCES;
+ }
+ meta->ret_btf = reg->btf;
+ meta->ret_btf_id = reg->btf_id;
+ break;
+ case ARG_PTR_TO_SPIN_LOCK:
+ if (in_rbtree_lock_required_cb(env)) {
+ verbose(env, "can't spin_{lock,unlock} in rbtree cb\n");
+ return -EACCES;
+ }
+ if (meta->func_id == BPF_FUNC_spin_lock) {
+ err = process_spin_lock(env, regno, PROCESS_SPIN_LOCK);
+ if (err)
+ return err;
+ } else if (meta->func_id == BPF_FUNC_spin_unlock) {
+ err = process_spin_lock(env, regno, 0);
+ if (err)
+ return err;
+ } else {
+ verifier_bug(env, "spin lock arg on unexpected helper");
+ return -EFAULT;
+ }
+ break;
+ case ARG_PTR_TO_TIMER:
+ err = process_timer_func(env, regno, meta);
+ if (err)
+ return err;
+ break;
+ case ARG_PTR_TO_FUNC:
+ meta->subprogno = reg->subprogno;
+ break;
+ case ARG_PTR_TO_MEM:
+ /* The access to this pointer is only checked when we hit the
+ * next is_mem_size argument below.
+ */
+ meta->raw_mode = arg_type & MEM_UNINIT;
+ if (arg_type & MEM_FIXED_SIZE) {
+ err = check_helper_mem_access(env, regno, fn->arg_size[arg],
+ arg_type & MEM_WRITE ? BPF_WRITE : BPF_READ,
+ false, meta);
+ if (err)
+ return err;
+ if (arg_type & MEM_ALIGNED)
+ err = check_ptr_alignment(env, reg, 0, fn->arg_size[arg], true);
+ }
+ break;
+ case ARG_CONST_SIZE:
+ err = check_mem_size_reg(env, reg, regno,
+ fn->arg_type[arg - 1] & MEM_WRITE ?
+ BPF_WRITE : BPF_READ,
+ false, meta);
+ break;
+ case ARG_CONST_SIZE_OR_ZERO:
+ err = check_mem_size_reg(env, reg, regno,
+ fn->arg_type[arg - 1] & MEM_WRITE ?
+ BPF_WRITE : BPF_READ,
+ true, meta);
+ break;
+ case ARG_PTR_TO_DYNPTR:
+ err = process_dynptr_func(env, regno, insn_idx, arg_type, 0);
+ if (err)
+ return err;
+ break;
+ case ARG_CONST_ALLOC_SIZE_OR_ZERO:
+ if (!tnum_is_const(reg->var_off)) {
+ verbose(env, "R%d is not a known constant'\n",
+ regno);
return -EACCES;
}
- err = check_stack_boundary(env, regno, (*mapp)->value_size);
+ meta->mem_size = reg->var_off.value;
+ err = mark_chain_precision(env, regno);
+ if (err)
+ return err;
+ break;
+ case ARG_PTR_TO_CONST_STR:
+ {
+ err = check_reg_const_str(env, reg, regno);
+ if (err)
+ return err;
+ break;
+ }
+ case ARG_KPTR_XCHG_DEST:
+ err = process_kptr_func(env, regno, meta);
+ if (err)
+ return err;
+ break;
+ }
+
+ return err;
+}
+
+static bool may_update_sockmap(struct bpf_verifier_env *env, int func_id)
+{
+ enum bpf_attach_type eatype = env->prog->expected_attach_type;
+ enum bpf_prog_type type = resolve_prog_type(env->prog);
+
+ if (func_id != BPF_FUNC_map_update_elem &&
+ func_id != BPF_FUNC_map_delete_elem)
+ return false;
+
+ /* It's not possible to get access to a locked struct sock in these
+ * contexts, so updating is safe.
+ */
+ switch (type) {
+ case BPF_PROG_TYPE_TRACING:
+ if (eatype == BPF_TRACE_ITER)
+ return true;
+ break;
+ case BPF_PROG_TYPE_SOCK_OPS:
+ /* map_update allowed only via dedicated helpers with event type checks */
+ if (func_id == BPF_FUNC_map_delete_elem)
+ return true;
+ break;
+ case BPF_PROG_TYPE_SOCKET_FILTER:
+ case BPF_PROG_TYPE_SCHED_CLS:
+ case BPF_PROG_TYPE_SCHED_ACT:
+ case BPF_PROG_TYPE_XDP:
+ case BPF_PROG_TYPE_SK_REUSEPORT:
+ case BPF_PROG_TYPE_FLOW_DISSECTOR:
+ case BPF_PROG_TYPE_SK_LOOKUP:
+ return true;
+ default:
+ break;
+ }
+
+ verbose(env, "cannot update sockmap in this context\n");
+ return false;
+}
+
+static bool allow_tail_call_in_subprogs(struct bpf_verifier_env *env)
+{
+ return env->prog->jit_requested &&
+ bpf_jit_supports_subprog_tailcalls();
+}
+
+static int check_map_func_compatibility(struct bpf_verifier_env *env,
+ struct bpf_map *map, int func_id)
+{
+ if (!map)
+ return 0;
+
+ /* We need a two way check, first is from map perspective ... */
+ switch (map->map_type) {
+ case BPF_MAP_TYPE_PROG_ARRAY:
+ if (func_id != BPF_FUNC_tail_call)
+ goto error;
+ break;
+ case BPF_MAP_TYPE_PERF_EVENT_ARRAY:
+ if (func_id != BPF_FUNC_perf_event_read &&
+ func_id != BPF_FUNC_perf_event_output &&
+ func_id != BPF_FUNC_skb_output &&
+ func_id != BPF_FUNC_perf_event_read_value &&
+ func_id != BPF_FUNC_xdp_output)
+ goto error;
+ break;
+ case BPF_MAP_TYPE_RINGBUF:
+ if (func_id != BPF_FUNC_ringbuf_output &&
+ func_id != BPF_FUNC_ringbuf_reserve &&
+ func_id != BPF_FUNC_ringbuf_query &&
+ func_id != BPF_FUNC_ringbuf_reserve_dynptr &&
+ func_id != BPF_FUNC_ringbuf_submit_dynptr &&
+ func_id != BPF_FUNC_ringbuf_discard_dynptr)
+ goto error;
+ break;
+ case BPF_MAP_TYPE_USER_RINGBUF:
+ if (func_id != BPF_FUNC_user_ringbuf_drain)
+ goto error;
+ break;
+ case BPF_MAP_TYPE_STACK_TRACE:
+ if (func_id != BPF_FUNC_get_stackid)
+ goto error;
+ break;
+ case BPF_MAP_TYPE_CGROUP_ARRAY:
+ if (func_id != BPF_FUNC_skb_under_cgroup &&
+ func_id != BPF_FUNC_current_task_under_cgroup)
+ goto error;
+ break;
+ case BPF_MAP_TYPE_CGROUP_STORAGE:
+ case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE:
+ if (func_id != BPF_FUNC_get_local_storage)
+ goto error;
+ break;
+ case BPF_MAP_TYPE_DEVMAP:
+ case BPF_MAP_TYPE_DEVMAP_HASH:
+ if (func_id != BPF_FUNC_redirect_map &&
+ func_id != BPF_FUNC_map_lookup_elem)
+ goto error;
+ break;
+ /* Restrict bpf side of cpumap and xskmap, open when use-cases
+ * appear.
+ */
+ case BPF_MAP_TYPE_CPUMAP:
+ if (func_id != BPF_FUNC_redirect_map)
+ goto error;
+ break;
+ case BPF_MAP_TYPE_XSKMAP:
+ if (func_id != BPF_FUNC_redirect_map &&
+ func_id != BPF_FUNC_map_lookup_elem)
+ goto error;
+ break;
+ case BPF_MAP_TYPE_ARRAY_OF_MAPS:
+ case BPF_MAP_TYPE_HASH_OF_MAPS:
+ if (func_id != BPF_FUNC_map_lookup_elem)
+ goto error;
+ break;
+ case BPF_MAP_TYPE_SOCKMAP:
+ if (func_id != BPF_FUNC_sk_redirect_map &&
+ func_id != BPF_FUNC_sock_map_update &&
+ func_id != BPF_FUNC_msg_redirect_map &&
+ func_id != BPF_FUNC_sk_select_reuseport &&
+ func_id != BPF_FUNC_map_lookup_elem &&
+ !may_update_sockmap(env, func_id))
+ goto error;
+ break;
+ case BPF_MAP_TYPE_SOCKHASH:
+ if (func_id != BPF_FUNC_sk_redirect_hash &&
+ func_id != BPF_FUNC_sock_hash_update &&
+ func_id != BPF_FUNC_msg_redirect_hash &&
+ func_id != BPF_FUNC_sk_select_reuseport &&
+ func_id != BPF_FUNC_map_lookup_elem &&
+ !may_update_sockmap(env, func_id))
+ goto error;
+ break;
+ case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY:
+ if (func_id != BPF_FUNC_sk_select_reuseport)
+ goto error;
+ break;
+ case BPF_MAP_TYPE_QUEUE:
+ case BPF_MAP_TYPE_STACK:
+ if (func_id != BPF_FUNC_map_peek_elem &&
+ func_id != BPF_FUNC_map_pop_elem &&
+ func_id != BPF_FUNC_map_push_elem)
+ goto error;
+ break;
+ case BPF_MAP_TYPE_SK_STORAGE:
+ if (func_id != BPF_FUNC_sk_storage_get &&
+ func_id != BPF_FUNC_sk_storage_delete &&
+ func_id != BPF_FUNC_kptr_xchg)
+ goto error;
+ break;
+ case BPF_MAP_TYPE_INODE_STORAGE:
+ if (func_id != BPF_FUNC_inode_storage_get &&
+ func_id != BPF_FUNC_inode_storage_delete &&
+ func_id != BPF_FUNC_kptr_xchg)
+ goto error;
+ break;
+ case BPF_MAP_TYPE_TASK_STORAGE:
+ if (func_id != BPF_FUNC_task_storage_get &&
+ func_id != BPF_FUNC_task_storage_delete &&
+ func_id != BPF_FUNC_kptr_xchg)
+ goto error;
+ break;
+ case BPF_MAP_TYPE_CGRP_STORAGE:
+ if (func_id != BPF_FUNC_cgrp_storage_get &&
+ func_id != BPF_FUNC_cgrp_storage_delete &&
+ func_id != BPF_FUNC_kptr_xchg)
+ goto error;
+ break;
+ case BPF_MAP_TYPE_BLOOM_FILTER:
+ if (func_id != BPF_FUNC_map_peek_elem &&
+ func_id != BPF_FUNC_map_push_elem)
+ goto error;
+ break;
+ case BPF_MAP_TYPE_INSN_ARRAY:
+ goto error;
+ default:
+ break;
+ }
+
+ /* ... and second from the function itself. */
+ switch (func_id) {
+ case BPF_FUNC_tail_call:
+ if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
+ goto error;
+ if (env->subprog_cnt > 1 && !allow_tail_call_in_subprogs(env)) {
+ verbose(env, "mixing of tail_calls and bpf-to-bpf calls is not supported\n");
+ return -EINVAL;
+ }
+ break;
+ case BPF_FUNC_perf_event_read:
+ case BPF_FUNC_perf_event_output:
+ case BPF_FUNC_perf_event_read_value:
+ case BPF_FUNC_skb_output:
+ case BPF_FUNC_xdp_output:
+ if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY)
+ goto error;
+ break;
+ case BPF_FUNC_ringbuf_output:
+ case BPF_FUNC_ringbuf_reserve:
+ case BPF_FUNC_ringbuf_query:
+ case BPF_FUNC_ringbuf_reserve_dynptr:
+ case BPF_FUNC_ringbuf_submit_dynptr:
+ case BPF_FUNC_ringbuf_discard_dynptr:
+ if (map->map_type != BPF_MAP_TYPE_RINGBUF)
+ goto error;
+ break;
+ case BPF_FUNC_user_ringbuf_drain:
+ if (map->map_type != BPF_MAP_TYPE_USER_RINGBUF)
+ goto error;
+ break;
+ case BPF_FUNC_get_stackid:
+ if (map->map_type != BPF_MAP_TYPE_STACK_TRACE)
+ goto error;
+ break;
+ case BPF_FUNC_current_task_under_cgroup:
+ case BPF_FUNC_skb_under_cgroup:
+ if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY)
+ goto error;
+ break;
+ case BPF_FUNC_redirect_map:
+ if (map->map_type != BPF_MAP_TYPE_DEVMAP &&
+ map->map_type != BPF_MAP_TYPE_DEVMAP_HASH &&
+ map->map_type != BPF_MAP_TYPE_CPUMAP &&
+ map->map_type != BPF_MAP_TYPE_XSKMAP)
+ goto error;
+ break;
+ case BPF_FUNC_sk_redirect_map:
+ case BPF_FUNC_msg_redirect_map:
+ case BPF_FUNC_sock_map_update:
+ if (map->map_type != BPF_MAP_TYPE_SOCKMAP)
+ goto error;
+ break;
+ case BPF_FUNC_sk_redirect_hash:
+ case BPF_FUNC_msg_redirect_hash:
+ case BPF_FUNC_sock_hash_update:
+ if (map->map_type != BPF_MAP_TYPE_SOCKHASH)
+ goto error;
+ break;
+ case BPF_FUNC_get_local_storage:
+ if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
+ map->map_type != BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
+ goto error;
+ break;
+ case BPF_FUNC_sk_select_reuseport:
+ if (map->map_type != BPF_MAP_TYPE_REUSEPORT_SOCKARRAY &&
+ map->map_type != BPF_MAP_TYPE_SOCKMAP &&
+ map->map_type != BPF_MAP_TYPE_SOCKHASH)
+ goto error;
+ break;
+ case BPF_FUNC_map_pop_elem:
+ if (map->map_type != BPF_MAP_TYPE_QUEUE &&
+ map->map_type != BPF_MAP_TYPE_STACK)
+ goto error;
+ break;
+ case BPF_FUNC_map_peek_elem:
+ case BPF_FUNC_map_push_elem:
+ if (map->map_type != BPF_MAP_TYPE_QUEUE &&
+ map->map_type != BPF_MAP_TYPE_STACK &&
+ map->map_type != BPF_MAP_TYPE_BLOOM_FILTER)
+ goto error;
+ break;
+ case BPF_FUNC_map_lookup_percpu_elem:
+ if (map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY &&
+ map->map_type != BPF_MAP_TYPE_PERCPU_HASH &&
+ map->map_type != BPF_MAP_TYPE_LRU_PERCPU_HASH)
+ goto error;
+ break;
+ case BPF_FUNC_sk_storage_get:
+ case BPF_FUNC_sk_storage_delete:
+ if (map->map_type != BPF_MAP_TYPE_SK_STORAGE)
+ goto error;
+ break;
+ case BPF_FUNC_inode_storage_get:
+ case BPF_FUNC_inode_storage_delete:
+ if (map->map_type != BPF_MAP_TYPE_INODE_STORAGE)
+ goto error;
+ break;
+ case BPF_FUNC_task_storage_get:
+ case BPF_FUNC_task_storage_delete:
+ if (map->map_type != BPF_MAP_TYPE_TASK_STORAGE)
+ goto error;
+ break;
+ case BPF_FUNC_cgrp_storage_get:
+ case BPF_FUNC_cgrp_storage_delete:
+ if (map->map_type != BPF_MAP_TYPE_CGRP_STORAGE)
+ goto error;
+ break;
+ default:
+ break;
+ }
+
+ return 0;
+error:
+ verbose(env, "cannot pass map_type %d into func %s#%d\n",
+ map->map_type, func_id_name(func_id), func_id);
+ return -EINVAL;
+}
+
+static bool check_raw_mode_ok(const struct bpf_func_proto *fn)
+{
+ int count = 0;
+
+ if (arg_type_is_raw_mem(fn->arg1_type))
+ count++;
+ if (arg_type_is_raw_mem(fn->arg2_type))
+ count++;
+ if (arg_type_is_raw_mem(fn->arg3_type))
+ count++;
+ if (arg_type_is_raw_mem(fn->arg4_type))
+ count++;
+ if (arg_type_is_raw_mem(fn->arg5_type))
+ count++;
+
+ /* We only support one arg being in raw mode at the moment,
+ * which is sufficient for the helper functions we have
+ * right now.
+ */
+ return count <= 1;
+}
+
+static bool check_args_pair_invalid(const struct bpf_func_proto *fn, int arg)
+{
+ bool is_fixed = fn->arg_type[arg] & MEM_FIXED_SIZE;
+ bool has_size = fn->arg_size[arg] != 0;
+ bool is_next_size = false;
+
+ if (arg + 1 < ARRAY_SIZE(fn->arg_type))
+ is_next_size = arg_type_is_mem_size(fn->arg_type[arg + 1]);
+
+ if (base_type(fn->arg_type[arg]) != ARG_PTR_TO_MEM)
+ return is_next_size;
+
+ return has_size == is_next_size || is_next_size == is_fixed;
+}
+
+static bool check_arg_pair_ok(const struct bpf_func_proto *fn)
+{
+ /* bpf_xxx(..., buf, len) call will access 'len'
+ * bytes from memory 'buf'. Both arg types need
+ * to be paired, so make sure there's no buggy
+ * helper function specification.
+ */
+ if (arg_type_is_mem_size(fn->arg1_type) ||
+ check_args_pair_invalid(fn, 0) ||
+ check_args_pair_invalid(fn, 1) ||
+ check_args_pair_invalid(fn, 2) ||
+ check_args_pair_invalid(fn, 3) ||
+ check_args_pair_invalid(fn, 4))
+ return false;
+
+ return true;
+}
+
+static bool check_btf_id_ok(const struct bpf_func_proto *fn)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(fn->arg_type); i++) {
+ if (base_type(fn->arg_type[i]) == ARG_PTR_TO_BTF_ID)
+ return !!fn->arg_btf_id[i];
+ if (base_type(fn->arg_type[i]) == ARG_PTR_TO_SPIN_LOCK)
+ return fn->arg_btf_id[i] == BPF_PTR_POISON;
+ if (base_type(fn->arg_type[i]) != ARG_PTR_TO_BTF_ID && fn->arg_btf_id[i] &&
+ /* arg_btf_id and arg_size are in a union. */
+ (base_type(fn->arg_type[i]) != ARG_PTR_TO_MEM ||
+ !(fn->arg_type[i] & MEM_FIXED_SIZE)))
+ return false;
+ }
+
+ return true;
+}
+
+static int check_func_proto(const struct bpf_func_proto *fn, int func_id)
+{
+ return check_raw_mode_ok(fn) &&
+ check_arg_pair_ok(fn) &&
+ check_btf_id_ok(fn) ? 0 : -EINVAL;
+}
+
+/* Packet data might have moved, any old PTR_TO_PACKET[_META,_END]
+ * are now invalid, so turn them into unknown SCALAR_VALUE.
+ *
+ * This also applies to dynptr slices belonging to skb and xdp dynptrs,
+ * since these slices point to packet data.
+ */
+static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
+{
+ struct bpf_func_state *state;
+ struct bpf_reg_state *reg;
+
+ bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
+ if (reg_is_pkt_pointer_any(reg) || reg_is_dynptr_slice_pkt(reg))
+ mark_reg_invalid(env, reg);
+ }));
+}
+
+enum {
+ AT_PKT_END = -1,
+ BEYOND_PKT_END = -2,
+};
+
+static void mark_pkt_end(struct bpf_verifier_state *vstate, int regn, bool range_open)
+{
+ struct bpf_func_state *state = vstate->frame[vstate->curframe];
+ struct bpf_reg_state *reg = &state->regs[regn];
+
+ if (reg->type != PTR_TO_PACKET)
+ /* PTR_TO_PACKET_META is not supported yet */
+ return;
+
+ /* The 'reg' is pkt > pkt_end or pkt >= pkt_end.
+ * How far beyond pkt_end it goes is unknown.
+ * if (!range_open) it's the case of pkt >= pkt_end
+ * if (range_open) it's the case of pkt > pkt_end
+ * hence this pointer is at least 1 byte bigger than pkt_end
+ */
+ if (range_open)
+ reg->range = BEYOND_PKT_END;
+ else
+ reg->range = AT_PKT_END;
+}
+
+static int release_reference_nomark(struct bpf_verifier_state *state, int ref_obj_id)
+{
+ int i;
+
+ for (i = 0; i < state->acquired_refs; i++) {
+ if (state->refs[i].type != REF_TYPE_PTR)
+ continue;
+ if (state->refs[i].id == ref_obj_id) {
+ release_reference_state(state, i);
+ return 0;
+ }
+ }
+ return -EINVAL;
+}
+
+/* The pointer with the specified id has released its reference to kernel
+ * resources. Identify all copies of the same pointer and clear the reference.
+ *
+ * This is the release function corresponding to acquire_reference(). Idempotent.
+ */
+static int release_reference(struct bpf_verifier_env *env, int ref_obj_id)
+{
+ struct bpf_verifier_state *vstate = env->cur_state;
+ struct bpf_func_state *state;
+ struct bpf_reg_state *reg;
+ int err;
+
+ err = release_reference_nomark(vstate, ref_obj_id);
+ if (err)
+ return err;
+
+ bpf_for_each_reg_in_vstate(vstate, state, reg, ({
+ if (reg->ref_obj_id == ref_obj_id)
+ mark_reg_invalid(env, reg);
+ }));
+
+ return 0;
+}
+
+static void invalidate_non_owning_refs(struct bpf_verifier_env *env)
+{
+ struct bpf_func_state *unused;
+ struct bpf_reg_state *reg;
+
+ bpf_for_each_reg_in_vstate(env->cur_state, unused, reg, ({
+ if (type_is_non_owning_ref(reg->type))
+ mark_reg_invalid(env, reg);
+ }));
+}
+
+static void clear_caller_saved_regs(struct bpf_verifier_env *env,
+ struct bpf_reg_state *regs)
+{
+ int i;
+
+ /* after the call registers r0 - r5 were scratched */
+ for (i = 0; i < CALLER_SAVED_REGS; i++) {
+ mark_reg_not_init(env, regs, caller_saved[i]);
+ __check_reg_arg(env, regs, caller_saved[i], DST_OP_NO_MARK);
+ }
+}
+
+typedef int (*set_callee_state_fn)(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee,
+ int insn_idx);
+
+static int set_callee_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee, int insn_idx);
- } else if (arg_type == ARG_CONST_STACK_SIZE) {
- /* bpf_xxx(..., buf, len) call will access 'len' bytes
- * from stack pointer 'buf'. Check it
- * note: regno == len, regno - 1 == buf
+static int setup_func_entry(struct bpf_verifier_env *env, int subprog, int callsite,
+ set_callee_state_fn set_callee_state_cb,
+ struct bpf_verifier_state *state)
+{
+ struct bpf_func_state *caller, *callee;
+ int err;
+
+ if (state->curframe + 1 >= MAX_CALL_FRAMES) {
+ verbose(env, "the call stack of %d frames is too deep\n",
+ state->curframe + 2);
+ return -E2BIG;
+ }
+
+ if (state->frame[state->curframe + 1]) {
+ verifier_bug(env, "Frame %d already allocated", state->curframe + 1);
+ return -EFAULT;
+ }
+
+ caller = state->frame[state->curframe];
+ callee = kzalloc(sizeof(*callee), GFP_KERNEL_ACCOUNT);
+ if (!callee)
+ return -ENOMEM;
+ state->frame[state->curframe + 1] = callee;
+
+ /* callee cannot access r0, r6 - r9 for reading and has to write
+ * into its own stack before reading from it.
+ * callee can read/write into caller's stack
+ */
+ init_func_state(env, callee,
+ /* remember the callsite, it will be used by bpf_exit */
+ callsite,
+ state->curframe + 1 /* frameno within this callchain */,
+ subprog /* subprog number within this prog */);
+ err = set_callee_state_cb(env, caller, callee, callsite);
+ if (err)
+ goto err_out;
+
+ /* only increment it after check_reg_arg() finished */
+ state->curframe++;
+
+ return 0;
+
+err_out:
+ free_func_state(callee);
+ state->frame[state->curframe + 1] = NULL;
+ return err;
+}
+
+static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
+ const struct btf *btf,
+ struct bpf_reg_state *regs)
+{
+ struct bpf_subprog_info *sub = subprog_info(env, subprog);
+ struct bpf_verifier_log *log = &env->log;
+ u32 i;
+ int ret;
+
+ ret = btf_prepare_func_args(env, subprog);
+ if (ret)
+ return ret;
+
+ /* check that BTF function arguments match actual types that the
+ * verifier sees.
+ */
+ for (i = 0; i < sub->arg_cnt; i++) {
+ u32 regno = i + 1;
+ struct bpf_reg_state *reg = &regs[regno];
+ struct bpf_subprog_arg_info *arg = &sub->args[i];
+
+ if (arg->arg_type == ARG_ANYTHING) {
+ if (reg->type != SCALAR_VALUE) {
+ bpf_log(log, "R%d is not a scalar\n", regno);
+ return -EINVAL;
+ }
+ } else if (arg->arg_type & PTR_UNTRUSTED) {
+ /*
+ * Anything is allowed for untrusted arguments, as these are
+ * read-only and probe read instructions would protect against
+ * invalid memory access.
+ */
+ } else if (arg->arg_type == ARG_PTR_TO_CTX) {
+ ret = check_func_arg_reg_off(env, reg, regno, ARG_DONTCARE);
+ if (ret < 0)
+ return ret;
+ /* If function expects ctx type in BTF check that caller
+ * is passing PTR_TO_CTX.
+ */
+ if (reg->type != PTR_TO_CTX) {
+ bpf_log(log, "arg#%d expects pointer to ctx\n", i);
+ return -EINVAL;
+ }
+ } else if (base_type(arg->arg_type) == ARG_PTR_TO_MEM) {
+ ret = check_func_arg_reg_off(env, reg, regno, ARG_DONTCARE);
+ if (ret < 0)
+ return ret;
+ if (check_mem_reg(env, reg, regno, arg->mem_size))
+ return -EINVAL;
+ if (!(arg->arg_type & PTR_MAYBE_NULL) && (reg->type & PTR_MAYBE_NULL)) {
+ bpf_log(log, "arg#%d is expected to be non-NULL\n", i);
+ return -EINVAL;
+ }
+ } else if (base_type(arg->arg_type) == ARG_PTR_TO_ARENA) {
+ /*
+ * Can pass any value and the kernel won't crash, but
+ * only PTR_TO_ARENA or SCALAR make sense. Everything
+ * else is a bug in the bpf program. Point it out to
+ * the user at the verification time instead of
+ * run-time debug nightmare.
+ */
+ if (reg->type != PTR_TO_ARENA && reg->type != SCALAR_VALUE) {
+ bpf_log(log, "R%d is not a pointer to arena or scalar.\n", regno);
+ return -EINVAL;
+ }
+ } else if (arg->arg_type == (ARG_PTR_TO_DYNPTR | MEM_RDONLY)) {
+ ret = check_func_arg_reg_off(env, reg, regno, ARG_PTR_TO_DYNPTR);
+ if (ret)
+ return ret;
+
+ ret = process_dynptr_func(env, regno, -1, arg->arg_type, 0);
+ if (ret)
+ return ret;
+ } else if (base_type(arg->arg_type) == ARG_PTR_TO_BTF_ID) {
+ struct bpf_call_arg_meta meta;
+ int err;
+
+ if (register_is_null(reg) && type_may_be_null(arg->arg_type))
+ continue;
+
+ memset(&meta, 0, sizeof(meta)); /* leave func_id as zero */
+ err = check_reg_type(env, regno, arg->arg_type, &arg->btf_id, &meta);
+ err = err ?: check_func_arg_reg_off(env, reg, regno, arg->arg_type);
+ if (err)
+ return err;
+ } else {
+ verifier_bug(env, "unrecognized arg#%d type %d", i, arg->arg_type);
+ return -EFAULT;
+ }
+ }
+
+ return 0;
+}
+
+/* Compare BTF of a function call with given bpf_reg_state.
+ * Returns:
+ * EFAULT - there is a verifier bug. Abort verification.
+ * EINVAL - there is a type mismatch or BTF is not available.
+ * 0 - BTF matches with what bpf_reg_state expects.
+ * Only PTR_TO_CTX and SCALAR_VALUE states are recognized.
+ */
+static int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
+ struct bpf_reg_state *regs)
+{
+ struct bpf_prog *prog = env->prog;
+ struct btf *btf = prog->aux->btf;
+ u32 btf_id;
+ int err;
+
+ if (!prog->aux->func_info)
+ return -EINVAL;
+
+ btf_id = prog->aux->func_info[subprog].type_id;
+ if (!btf_id)
+ return -EFAULT;
+
+ if (prog->aux->func_info_aux[subprog].unreliable)
+ return -EINVAL;
+
+ err = btf_check_func_arg_match(env, subprog, btf, regs);
+ /* Compiler optimizations can remove arguments from static functions
+ * or mismatched type can be passed into a global function.
+ * In such cases mark the function as unreliable from BTF point of view.
+ */
+ if (err)
+ prog->aux->func_info_aux[subprog].unreliable = true;
+ return err;
+}
+
+static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ int insn_idx, int subprog,
+ set_callee_state_fn set_callee_state_cb)
+{
+ struct bpf_verifier_state *state = env->cur_state, *callback_state;
+ struct bpf_func_state *caller, *callee;
+ int err;
+
+ caller = state->frame[state->curframe];
+ err = btf_check_subprog_call(env, subprog, caller->regs);
+ if (err == -EFAULT)
+ return err;
+
+ /* set_callee_state is used for direct subprog calls, but we are
+ * interested in validating only BPF helpers that can call subprogs as
+ * callbacks
+ */
+ env->subprog_info[subprog].is_cb = true;
+ if (bpf_pseudo_kfunc_call(insn) &&
+ !is_callback_calling_kfunc(insn->imm)) {
+ verifier_bug(env, "kfunc %s#%d not marked as callback-calling",
+ func_id_name(insn->imm), insn->imm);
+ return -EFAULT;
+ } else if (!bpf_pseudo_kfunc_call(insn) &&
+ !is_callback_calling_function(insn->imm)) { /* helper */
+ verifier_bug(env, "helper %s#%d not marked as callback-calling",
+ func_id_name(insn->imm), insn->imm);
+ return -EFAULT;
+ }
+
+ if (is_async_callback_calling_insn(insn)) {
+ struct bpf_verifier_state *async_cb;
+
+ /* there is no real recursion here. timer and workqueue callbacks are async */
+ env->subprog_info[subprog].is_async_cb = true;
+ async_cb = push_async_cb(env, env->subprog_info[subprog].start,
+ insn_idx, subprog,
+ is_async_cb_sleepable(env, insn));
+ if (IS_ERR(async_cb))
+ return PTR_ERR(async_cb);
+ callee = async_cb->frame[0];
+ callee->async_entry_cnt = caller->async_entry_cnt + 1;
+
+ /* Convert bpf_timer_set_callback() args into timer callback args */
+ err = set_callee_state_cb(env, caller, callee, insn_idx);
+ if (err)
+ return err;
+
+ return 0;
+ }
+
+ /* for callback functions enqueue entry to callback and
+ * proceed with next instruction within current frame.
+ */
+ callback_state = push_stack(env, env->subprog_info[subprog].start, insn_idx, false);
+ if (IS_ERR(callback_state))
+ return PTR_ERR(callback_state);
+
+ err = setup_func_entry(env, subprog, insn_idx, set_callee_state_cb,
+ callback_state);
+ if (err)
+ return err;
+
+ callback_state->callback_unroll_depth++;
+ callback_state->frame[callback_state->curframe - 1]->callback_depth++;
+ caller->callback_depth = 0;
+ return 0;
+}
+
+static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ int *insn_idx)
+{
+ struct bpf_verifier_state *state = env->cur_state;
+ struct bpf_func_state *caller;
+ int err, subprog, target_insn;
+
+ target_insn = *insn_idx + insn->imm + 1;
+ subprog = find_subprog(env, target_insn);
+ if (verifier_bug_if(subprog < 0, env, "target of func call at insn %d is not a program",
+ target_insn))
+ return -EFAULT;
+
+ caller = state->frame[state->curframe];
+ err = btf_check_subprog_call(env, subprog, caller->regs);
+ if (err == -EFAULT)
+ return err;
+ if (subprog_is_global(env, subprog)) {
+ const char *sub_name = subprog_name(env, subprog);
+
+ if (env->cur_state->active_locks) {
+ verbose(env, "global function calls are not allowed while holding a lock,\n"
+ "use static function instead\n");
+ return -EINVAL;
+ }
+
+ if (env->subprog_info[subprog].might_sleep &&
+ (env->cur_state->active_rcu_locks || env->cur_state->active_preempt_locks ||
+ env->cur_state->active_irq_id || !in_sleepable(env))) {
+ verbose(env, "global functions that may sleep are not allowed in non-sleepable context,\n"
+ "i.e., in a RCU/IRQ/preempt-disabled section, or in\n"
+ "a non-sleepable BPF program context\n");
+ return -EINVAL;
+ }
+
+ if (err) {
+ verbose(env, "Caller passes invalid args into func#%d ('%s')\n",
+ subprog, sub_name);
+ return err;
+ }
+
+ if (env->log.level & BPF_LOG_LEVEL)
+ verbose(env, "Func#%d ('%s') is global and assumed valid.\n",
+ subprog, sub_name);
+ if (env->subprog_info[subprog].changes_pkt_data)
+ clear_all_pkt_pointers(env);
+ /* mark global subprog for verifying after main prog */
+ subprog_aux(env, subprog)->called = true;
+ clear_caller_saved_regs(env, caller->regs);
+
+ /* All global functions return a 64-bit SCALAR_VALUE */
+ mark_reg_unknown(env, caller->regs, BPF_REG_0);
+ caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
+
+ /* continue with next insn after call */
+ return 0;
+ }
+
+ /* for regular function entry setup new frame and continue
+ * from that frame.
+ */
+ err = setup_func_entry(env, subprog, *insn_idx, set_callee_state, state);
+ if (err)
+ return err;
+
+ clear_caller_saved_regs(env, caller->regs);
+
+ /* and go analyze first insn of the callee */
+ *insn_idx = env->subprog_info[subprog].start - 1;
+
+ bpf_reset_live_stack_callchain(env);
+
+ if (env->log.level & BPF_LOG_LEVEL) {
+ verbose(env, "caller:\n");
+ print_verifier_state(env, state, caller->frameno, true);
+ verbose(env, "callee:\n");
+ print_verifier_state(env, state, state->curframe, true);
+ }
+
+ return 0;
+}
+
+int map_set_for_each_callback_args(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee)
+{
+ /* bpf_for_each_map_elem(struct bpf_map *map, void *callback_fn,
+ * void *callback_ctx, u64 flags);
+ * callback_fn(struct bpf_map *map, void *key, void *value,
+ * void *callback_ctx);
+ */
+ callee->regs[BPF_REG_1] = caller->regs[BPF_REG_1];
+
+ callee->regs[BPF_REG_2].type = PTR_TO_MAP_KEY;
+ __mark_reg_known_zero(&callee->regs[BPF_REG_2]);
+ callee->regs[BPF_REG_2].map_ptr = caller->regs[BPF_REG_1].map_ptr;
+
+ callee->regs[BPF_REG_3].type = PTR_TO_MAP_VALUE;
+ __mark_reg_known_zero(&callee->regs[BPF_REG_3]);
+ callee->regs[BPF_REG_3].map_ptr = caller->regs[BPF_REG_1].map_ptr;
+
+ /* pointer to stack or null */
+ callee->regs[BPF_REG_4] = caller->regs[BPF_REG_3];
+
+ /* unused */
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+ return 0;
+}
+
+static int set_callee_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee, int insn_idx)
+{
+ int i;
+
+ /* copy r1 - r5 args that callee can access. The copy includes parent
+ * pointers, which connects us up to the liveness chain
+ */
+ for (i = BPF_REG_1; i <= BPF_REG_5; i++)
+ callee->regs[i] = caller->regs[i];
+ return 0;
+}
+
+static int set_map_elem_callback_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee,
+ int insn_idx)
+{
+ struct bpf_insn_aux_data *insn_aux = &env->insn_aux_data[insn_idx];
+ struct bpf_map *map;
+ int err;
+
+ /* valid map_ptr and poison value does not matter */
+ map = insn_aux->map_ptr_state.map_ptr;
+ if (!map->ops->map_set_for_each_callback_args ||
+ !map->ops->map_for_each_callback) {
+ verbose(env, "callback function not allowed for map\n");
+ return -ENOTSUPP;
+ }
+
+ err = map->ops->map_set_for_each_callback_args(env, caller, callee);
+ if (err)
+ return err;
+
+ callee->in_callback_fn = true;
+ callee->callback_ret_range = retval_range(0, 1);
+ return 0;
+}
+
+static int set_loop_callback_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee,
+ int insn_idx)
+{
+ /* bpf_loop(u32 nr_loops, void *callback_fn, void *callback_ctx,
+ * u64 flags);
+ * callback_fn(u64 index, void *callback_ctx);
+ */
+ callee->regs[BPF_REG_1].type = SCALAR_VALUE;
+ callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3];
+
+ /* unused */
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_3]);
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+
+ callee->in_callback_fn = true;
+ callee->callback_ret_range = retval_range(0, 1);
+ return 0;
+}
+
+static int set_timer_callback_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee,
+ int insn_idx)
+{
+ struct bpf_map *map_ptr = caller->regs[BPF_REG_1].map_ptr;
+
+ /* bpf_timer_set_callback(struct bpf_timer *timer, void *callback_fn);
+ * callback_fn(struct bpf_map *map, void *key, void *value);
+ */
+ callee->regs[BPF_REG_1].type = CONST_PTR_TO_MAP;
+ __mark_reg_known_zero(&callee->regs[BPF_REG_1]);
+ callee->regs[BPF_REG_1].map_ptr = map_ptr;
+
+ callee->regs[BPF_REG_2].type = PTR_TO_MAP_KEY;
+ __mark_reg_known_zero(&callee->regs[BPF_REG_2]);
+ callee->regs[BPF_REG_2].map_ptr = map_ptr;
+
+ callee->regs[BPF_REG_3].type = PTR_TO_MAP_VALUE;
+ __mark_reg_known_zero(&callee->regs[BPF_REG_3]);
+ callee->regs[BPF_REG_3].map_ptr = map_ptr;
+
+ /* unused */
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+ callee->in_async_callback_fn = true;
+ callee->callback_ret_range = retval_range(0, 0);
+ return 0;
+}
+
+static int set_find_vma_callback_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee,
+ int insn_idx)
+{
+ /* bpf_find_vma(struct task_struct *task, u64 addr,
+ * void *callback_fn, void *callback_ctx, u64 flags)
+ * (callback_fn)(struct task_struct *task,
+ * struct vm_area_struct *vma, void *callback_ctx);
+ */
+ callee->regs[BPF_REG_1] = caller->regs[BPF_REG_1];
+
+ callee->regs[BPF_REG_2].type = PTR_TO_BTF_ID;
+ __mark_reg_known_zero(&callee->regs[BPF_REG_2]);
+ callee->regs[BPF_REG_2].btf = btf_vmlinux;
+ callee->regs[BPF_REG_2].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_VMA];
+
+ /* pointer to stack or null */
+ callee->regs[BPF_REG_3] = caller->regs[BPF_REG_4];
+
+ /* unused */
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+ callee->in_callback_fn = true;
+ callee->callback_ret_range = retval_range(0, 1);
+ return 0;
+}
+
+static int set_user_ringbuf_callback_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee,
+ int insn_idx)
+{
+ /* bpf_user_ringbuf_drain(struct bpf_map *map, void *callback_fn, void
+ * callback_ctx, u64 flags);
+ * callback_fn(const struct bpf_dynptr_t* dynptr, void *callback_ctx);
+ */
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_0]);
+ mark_dynptr_cb_reg(env, &callee->regs[BPF_REG_1], BPF_DYNPTR_TYPE_LOCAL);
+ callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3];
+
+ /* unused */
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_3]);
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+
+ callee->in_callback_fn = true;
+ callee->callback_ret_range = retval_range(0, 1);
+ return 0;
+}
+
+static int set_rbtree_add_callback_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee,
+ int insn_idx)
+{
+ /* void bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node,
+ * bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b));
+ *
+ * 'struct bpf_rb_node *node' arg to bpf_rbtree_add_impl is the same PTR_TO_BTF_ID w/ offset
+ * that 'less' callback args will be receiving. However, 'node' arg was release_reference'd
+ * by this point, so look at 'root'
+ */
+ struct btf_field *field;
+
+ field = reg_find_field_offset(&caller->regs[BPF_REG_1], caller->regs[BPF_REG_1].off,
+ BPF_RB_ROOT);
+ if (!field || !field->graph_root.value_btf_id)
+ return -EFAULT;
+
+ mark_reg_graph_node(callee->regs, BPF_REG_1, &field->graph_root);
+ ref_set_non_owning(env, &callee->regs[BPF_REG_1]);
+ mark_reg_graph_node(callee->regs, BPF_REG_2, &field->graph_root);
+ ref_set_non_owning(env, &callee->regs[BPF_REG_2]);
+
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_3]);
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+ callee->in_callback_fn = true;
+ callee->callback_ret_range = retval_range(0, 1);
+ return 0;
+}
+
+static int set_task_work_schedule_callback_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee,
+ int insn_idx)
+{
+ struct bpf_map *map_ptr = caller->regs[BPF_REG_3].map_ptr;
+
+ /*
+ * callback_fn(struct bpf_map *map, void *key, void *value);
+ */
+ callee->regs[BPF_REG_1].type = CONST_PTR_TO_MAP;
+ __mark_reg_known_zero(&callee->regs[BPF_REG_1]);
+ callee->regs[BPF_REG_1].map_ptr = map_ptr;
+
+ callee->regs[BPF_REG_2].type = PTR_TO_MAP_KEY;
+ __mark_reg_known_zero(&callee->regs[BPF_REG_2]);
+ callee->regs[BPF_REG_2].map_ptr = map_ptr;
+
+ callee->regs[BPF_REG_3].type = PTR_TO_MAP_VALUE;
+ __mark_reg_known_zero(&callee->regs[BPF_REG_3]);
+ callee->regs[BPF_REG_3].map_ptr = map_ptr;
+
+ /* unused */
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+ callee->in_async_callback_fn = true;
+ callee->callback_ret_range = retval_range(S32_MIN, S32_MAX);
+ return 0;
+}
+
+static bool is_rbtree_lock_required_kfunc(u32 btf_id);
+
+/* Are we currently verifying the callback for a rbtree helper that must
+ * be called with lock held? If so, no need to complain about unreleased
+ * lock
+ */
+static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env)
+{
+ struct bpf_verifier_state *state = env->cur_state;
+ struct bpf_insn *insn = env->prog->insnsi;
+ struct bpf_func_state *callee;
+ int kfunc_btf_id;
+
+ if (!state->curframe)
+ return false;
+
+ callee = state->frame[state->curframe];
+
+ if (!callee->in_callback_fn)
+ return false;
+
+ kfunc_btf_id = insn[callee->callsite].imm;
+ return is_rbtree_lock_required_kfunc(kfunc_btf_id);
+}
+
+static bool retval_range_within(struct bpf_retval_range range, const struct bpf_reg_state *reg,
+ bool return_32bit)
+{
+ if (return_32bit)
+ return range.minval <= reg->s32_min_value && reg->s32_max_value <= range.maxval;
+ else
+ return range.minval <= reg->smin_value && reg->smax_value <= range.maxval;
+}
+
+static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
+{
+ struct bpf_verifier_state *state = env->cur_state, *prev_st;
+ struct bpf_func_state *caller, *callee;
+ struct bpf_reg_state *r0;
+ bool in_callback_fn;
+ int err;
+
+ err = bpf_update_live_stack(env);
+ if (err)
+ return err;
+
+ callee = state->frame[state->curframe];
+ r0 = &callee->regs[BPF_REG_0];
+ if (r0->type == PTR_TO_STACK) {
+ /* technically it's ok to return caller's stack pointer
+ * (or caller's caller's pointer) back to the caller,
+ * since these pointers are valid. Only current stack
+ * pointer will be invalid as soon as function exits,
+ * but let's be conservative
*/
- if (regno == 0) {
- /* kernel subsystem misconfigured verifier */
- verbose("ARG_CONST_STACK_SIZE cannot be first argument\n");
+ verbose(env, "cannot return stack pointer to the caller\n");
+ return -EINVAL;
+ }
+
+ caller = state->frame[state->curframe - 1];
+ if (callee->in_callback_fn) {
+ if (r0->type != SCALAR_VALUE) {
+ verbose(env, "R0 not a scalar value\n");
return -EACCES;
}
- err = check_stack_boundary(env, regno - 1, reg->imm);
+
+ /* we are going to rely on register's precise value */
+ err = mark_chain_precision(env, BPF_REG_0);
+ if (err)
+ return err;
+
+ /* enforce R0 return value range, and bpf_callback_t returns 64bit */
+ if (!retval_range_within(callee->callback_ret_range, r0, false)) {
+ verbose_invalid_scalar(env, r0, callee->callback_ret_range,
+ "At callback return", "R0");
+ return -EINVAL;
+ }
+ if (!bpf_calls_callback(env, callee->callsite)) {
+ verifier_bug(env, "in callback at %d, callsite %d !calls_callback",
+ *insn_idx, callee->callsite);
+ return -EFAULT;
+ }
+ } else {
+ /* return to the caller whatever r0 had in the callee */
+ caller->regs[BPF_REG_0] = *r0;
+ }
+
+ /* for callbacks like bpf_loop or bpf_for_each_map_elem go back to callsite,
+ * there function call logic would reschedule callback visit. If iteration
+ * converges is_state_visited() would prune that visit eventually.
+ */
+ in_callback_fn = callee->in_callback_fn;
+ if (in_callback_fn)
+ *insn_idx = callee->callsite;
+ else
+ *insn_idx = callee->callsite + 1;
+
+ if (env->log.level & BPF_LOG_LEVEL) {
+ verbose(env, "returning from callee:\n");
+ print_verifier_state(env, state, callee->frameno, true);
+ verbose(env, "to caller at %d:\n", *insn_idx);
+ print_verifier_state(env, state, caller->frameno, true);
+ }
+ /* clear everything in the callee. In case of exceptional exits using
+ * bpf_throw, this will be done by copy_verifier_state for extra frames. */
+ free_func_state(callee);
+ state->frame[state->curframe--] = NULL;
+
+ /* for callbacks widen imprecise scalars to make programs like below verify:
+ *
+ * struct ctx { int i; }
+ * void cb(int idx, struct ctx *ctx) { ctx->i++; ... }
+ * ...
+ * struct ctx = { .i = 0; }
+ * bpf_loop(100, cb, &ctx, 0);
+ *
+ * This is similar to what is done in process_iter_next_call() for open
+ * coded iterators.
+ */
+ prev_st = in_callback_fn ? find_prev_entry(env, state, *insn_idx) : NULL;
+ if (prev_st) {
+ err = widen_imprecise_scalars(env, prev_st, state);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
+static int do_refine_retval_range(struct bpf_verifier_env *env,
+ struct bpf_reg_state *regs, int ret_type,
+ int func_id,
+ struct bpf_call_arg_meta *meta)
+{
+ struct bpf_reg_state *ret_reg = &regs[BPF_REG_0];
+
+ if (ret_type != RET_INTEGER)
+ return 0;
+
+ switch (func_id) {
+ case BPF_FUNC_get_stack:
+ case BPF_FUNC_get_task_stack:
+ case BPF_FUNC_probe_read_str:
+ case BPF_FUNC_probe_read_kernel_str:
+ case BPF_FUNC_probe_read_user_str:
+ ret_reg->smax_value = meta->msize_max_value;
+ ret_reg->s32_max_value = meta->msize_max_value;
+ ret_reg->smin_value = -MAX_ERRNO;
+ ret_reg->s32_min_value = -MAX_ERRNO;
+ reg_bounds_sync(ret_reg);
+ break;
+ case BPF_FUNC_get_smp_processor_id:
+ ret_reg->umax_value = nr_cpu_ids - 1;
+ ret_reg->u32_max_value = nr_cpu_ids - 1;
+ ret_reg->smax_value = nr_cpu_ids - 1;
+ ret_reg->s32_max_value = nr_cpu_ids - 1;
+ ret_reg->umin_value = 0;
+ ret_reg->u32_min_value = 0;
+ ret_reg->smin_value = 0;
+ ret_reg->s32_min_value = 0;
+ reg_bounds_sync(ret_reg);
+ break;
}
+ return reg_bounds_sanity_check(env, ret_reg, "retval");
+}
+
+static int
+record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
+ int func_id, int insn_idx)
+{
+ struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx];
+ struct bpf_map *map = meta->map_ptr;
+
+ if (func_id != BPF_FUNC_tail_call &&
+ func_id != BPF_FUNC_map_lookup_elem &&
+ func_id != BPF_FUNC_map_update_elem &&
+ func_id != BPF_FUNC_map_delete_elem &&
+ func_id != BPF_FUNC_map_push_elem &&
+ func_id != BPF_FUNC_map_pop_elem &&
+ func_id != BPF_FUNC_map_peek_elem &&
+ func_id != BPF_FUNC_for_each_map_elem &&
+ func_id != BPF_FUNC_redirect_map &&
+ func_id != BPF_FUNC_map_lookup_percpu_elem)
+ return 0;
+
+ if (map == NULL) {
+ verifier_bug(env, "expected map for helper call");
+ return -EFAULT;
+ }
+
+ /* In case of read-only, some additional restrictions
+ * need to be applied in order to prevent altering the
+ * state of the map from program side.
+ */
+ if ((map->map_flags & BPF_F_RDONLY_PROG) &&
+ (func_id == BPF_FUNC_map_delete_elem ||
+ func_id == BPF_FUNC_map_update_elem ||
+ func_id == BPF_FUNC_map_push_elem ||
+ func_id == BPF_FUNC_map_pop_elem)) {
+ verbose(env, "write into map forbidden\n");
+ return -EACCES;
+ }
+
+ if (!aux->map_ptr_state.map_ptr)
+ bpf_map_ptr_store(aux, meta->map_ptr,
+ !meta->map_ptr->bypass_spec_v1, false);
+ else if (aux->map_ptr_state.map_ptr != meta->map_ptr)
+ bpf_map_ptr_store(aux, meta->map_ptr,
+ !meta->map_ptr->bypass_spec_v1, true);
+ return 0;
+}
+
+static int
+record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
+ int func_id, int insn_idx)
+{
+ struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx];
+ struct bpf_reg_state *regs = cur_regs(env), *reg;
+ struct bpf_map *map = meta->map_ptr;
+ u64 val, max;
+ int err;
+
+ if (func_id != BPF_FUNC_tail_call)
+ return 0;
+ if (!map || map->map_type != BPF_MAP_TYPE_PROG_ARRAY) {
+ verbose(env, "expected prog array map for tail call");
+ return -EINVAL;
+ }
+
+ reg = &regs[BPF_REG_3];
+ val = reg->var_off.value;
+ max = map->max_entries;
+
+ if (!(is_reg_const(reg, false) && val < max)) {
+ bpf_map_key_store(aux, BPF_MAP_KEY_POISON);
+ return 0;
+ }
+
+ err = mark_chain_precision(env, BPF_REG_3);
+ if (err)
+ return err;
+ if (bpf_map_key_unseen(aux))
+ bpf_map_key_store(aux, val);
+ else if (!bpf_map_key_poisoned(aux) &&
+ bpf_map_key_immediate(aux) != val)
+ bpf_map_key_store(aux, BPF_MAP_KEY_POISON);
+ return 0;
+}
+
+static int check_reference_leak(struct bpf_verifier_env *env, bool exception_exit)
+{
+ struct bpf_verifier_state *state = env->cur_state;
+ enum bpf_prog_type type = resolve_prog_type(env->prog);
+ struct bpf_reg_state *reg = reg_state(env, BPF_REG_0);
+ bool refs_lingering = false;
+ int i;
+
+ if (!exception_exit && cur_func(env)->frameno)
+ return 0;
+
+ for (i = 0; i < state->acquired_refs; i++) {
+ if (state->refs[i].type != REF_TYPE_PTR)
+ continue;
+ /* Allow struct_ops programs to return a referenced kptr back to
+ * kernel. Type checks are performed later in check_return_code.
+ */
+ if (type == BPF_PROG_TYPE_STRUCT_OPS && !exception_exit &&
+ reg->ref_obj_id == state->refs[i].id)
+ continue;
+ verbose(env, "Unreleased reference id=%d alloc_insn=%d\n",
+ state->refs[i].id, state->refs[i].insn_idx);
+ refs_lingering = true;
+ }
+ return refs_lingering ? -EINVAL : 0;
+}
+
+static int check_resource_leak(struct bpf_verifier_env *env, bool exception_exit, bool check_lock, const char *prefix)
+{
+ int err;
+
+ if (check_lock && env->cur_state->active_locks) {
+ verbose(env, "%s cannot be used inside bpf_spin_lock-ed region\n", prefix);
+ return -EINVAL;
+ }
+
+ err = check_reference_leak(env, exception_exit);
+ if (err) {
+ verbose(env, "%s would lead to reference leak\n", prefix);
+ return err;
+ }
+
+ if (check_lock && env->cur_state->active_irq_id) {
+ verbose(env, "%s cannot be used inside bpf_local_irq_save-ed region\n", prefix);
+ return -EINVAL;
+ }
+
+ if (check_lock && env->cur_state->active_rcu_locks) {
+ verbose(env, "%s cannot be used inside bpf_rcu_read_lock-ed region\n", prefix);
+ return -EINVAL;
+ }
+
+ if (check_lock && env->cur_state->active_preempt_locks) {
+ verbose(env, "%s cannot be used inside bpf_preempt_disable-ed region\n", prefix);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int check_bpf_snprintf_call(struct bpf_verifier_env *env,
+ struct bpf_reg_state *regs)
+{
+ struct bpf_reg_state *fmt_reg = &regs[BPF_REG_3];
+ struct bpf_reg_state *data_len_reg = &regs[BPF_REG_5];
+ struct bpf_map *fmt_map = fmt_reg->map_ptr;
+ struct bpf_bprintf_data data = {};
+ int err, fmt_map_off, num_args;
+ u64 fmt_addr;
+ char *fmt;
+
+ /* data must be an array of u64 */
+ if (data_len_reg->var_off.value % 8)
+ return -EINVAL;
+ num_args = data_len_reg->var_off.value / 8;
+
+ /* fmt being ARG_PTR_TO_CONST_STR guarantees that var_off is const
+ * and map_direct_value_addr is set.
+ */
+ fmt_map_off = fmt_reg->off + fmt_reg->var_off.value;
+ err = fmt_map->ops->map_direct_value_addr(fmt_map, &fmt_addr,
+ fmt_map_off);
+ if (err) {
+ verbose(env, "failed to retrieve map value address\n");
+ return -EFAULT;
+ }
+ fmt = (char *)(long)fmt_addr + fmt_map_off;
+
+ /* We are also guaranteed that fmt+fmt_map_off is NULL terminated, we
+ * can focus on validating the format specifiers.
+ */
+ err = bpf_bprintf_prepare(fmt, UINT_MAX, NULL, num_args, &data);
+ if (err < 0)
+ verbose(env, "Invalid format string\n");
+
return err;
}
-static int check_call(struct verifier_env *env, int func_id)
+static int check_get_func_ip(struct bpf_verifier_env *env)
+{
+ enum bpf_prog_type type = resolve_prog_type(env->prog);
+ int func_id = BPF_FUNC_get_func_ip;
+
+ if (type == BPF_PROG_TYPE_TRACING) {
+ if (!bpf_prog_has_trampoline(env->prog)) {
+ verbose(env, "func %s#%d supported only for fentry/fexit/fmod_ret programs\n",
+ func_id_name(func_id), func_id);
+ return -ENOTSUPP;
+ }
+ return 0;
+ } else if (type == BPF_PROG_TYPE_KPROBE) {
+ return 0;
+ }
+
+ verbose(env, "func %s#%d not supported for program type %d\n",
+ func_id_name(func_id), func_id, type);
+ return -ENOTSUPP;
+}
+
+static struct bpf_insn_aux_data *cur_aux(const struct bpf_verifier_env *env)
+{
+ return &env->insn_aux_data[env->insn_idx];
+}
+
+static bool loop_flag_is_zero(struct bpf_verifier_env *env)
+{
+ struct bpf_reg_state *regs = cur_regs(env);
+ struct bpf_reg_state *reg = &regs[BPF_REG_4];
+ bool reg_is_null = register_is_null(reg);
+
+ if (reg_is_null)
+ mark_chain_precision(env, BPF_REG_4);
+
+ return reg_is_null;
+}
+
+static void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno)
+{
+ struct bpf_loop_inline_state *state = &cur_aux(env)->loop_inline_state;
+
+ if (!state->initialized) {
+ state->initialized = 1;
+ state->fit_for_inline = loop_flag_is_zero(env);
+ state->callback_subprogno = subprogno;
+ return;
+ }
+
+ if (!state->fit_for_inline)
+ return;
+
+ state->fit_for_inline = (loop_flag_is_zero(env) &&
+ state->callback_subprogno == subprogno);
+}
+
+/* Returns whether or not the given map type can potentially elide
+ * lookup return value nullness check. This is possible if the key
+ * is statically known.
+ */
+static bool can_elide_value_nullness(enum bpf_map_type type)
+{
+ switch (type) {
+ case BPF_MAP_TYPE_ARRAY:
+ case BPF_MAP_TYPE_PERCPU_ARRAY:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static int get_helper_proto(struct bpf_verifier_env *env, int func_id,
+ const struct bpf_func_proto **ptr)
+{
+ if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID)
+ return -ERANGE;
+
+ if (!env->ops->get_func_proto)
+ return -EINVAL;
+
+ *ptr = env->ops->get_func_proto(func_id, env->prog);
+ return *ptr && (*ptr)->func ? 0 : -EINVAL;
+}
+
+/* Check if we're in a sleepable context. */
+static inline bool in_sleepable_context(struct bpf_verifier_env *env)
{
- struct verifier_state *state = &env->cur_state;
+ return !env->cur_state->active_rcu_locks &&
+ !env->cur_state->active_preempt_locks &&
+ !env->cur_state->active_irq_id &&
+ in_sleepable(env);
+}
+
+static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ int *insn_idx_p)
+{
+ enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
+ bool returns_cpu_specific_alloc_ptr = false;
const struct bpf_func_proto *fn = NULL;
- struct reg_state *regs = state->regs;
- struct bpf_map *map = NULL;
- struct reg_state *reg;
- int i, err;
+ enum bpf_return_type ret_type;
+ enum bpf_type_flag ret_flag;
+ struct bpf_reg_state *regs;
+ struct bpf_call_arg_meta meta;
+ int insn_idx = *insn_idx_p;
+ bool changes_data;
+ int i, err, func_id;
/* find function prototype */
- if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) {
- verbose("invalid func %d\n", func_id);
+ func_id = insn->imm;
+ err = get_helper_proto(env, insn->imm, &fn);
+ if (err == -ERANGE) {
+ verbose(env, "invalid func %s#%d\n", func_id_name(func_id), func_id);
return -EINVAL;
}
- if (env->prog->aux->ops->get_func_proto)
- fn = env->prog->aux->ops->get_func_proto(func_id);
-
- if (!fn) {
- verbose("unknown func %d\n", func_id);
- return -EINVAL;
+ if (err) {
+ verbose(env, "program of this type cannot use helper %s#%d\n",
+ func_id_name(func_id), func_id);
+ return err;
}
/* eBPF programs must be GPL compatible to use GPL-ed functions */
if (!env->prog->gpl_compatible && fn->gpl_only) {
- verbose("cannot call GPL only function from proprietary program\n");
+ verbose(env, "cannot call GPL-restricted function from non-GPL compatible program\n");
return -EINVAL;
}
- /* check args */
- err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &map);
- if (err)
- return err;
- err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &map);
- if (err)
+ if (fn->allowed && !fn->allowed(env->prog)) {
+ verbose(env, "helper call is not allowed in probe\n");
+ return -EINVAL;
+ }
+
+ if (!in_sleepable(env) && fn->might_sleep) {
+ verbose(env, "helper call might sleep in a non-sleepable prog\n");
+ return -EINVAL;
+ }
+
+ /* With LD_ABS/IND some JITs save/restore skb from r1. */
+ changes_data = bpf_helper_changes_pkt_data(func_id);
+ if (changes_data && fn->arg1_type != ARG_PTR_TO_CTX) {
+ verifier_bug(env, "func %s#%d: r1 != ctx", func_id_name(func_id), func_id);
+ return -EFAULT;
+ }
+
+ memset(&meta, 0, sizeof(meta));
+ meta.pkt_access = fn->pkt_access;
+
+ err = check_func_proto(fn, func_id);
+ if (err) {
+ verifier_bug(env, "incorrect func proto %s#%d", func_id_name(func_id), func_id);
return err;
- err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &map);
+ }
+
+ if (env->cur_state->active_rcu_locks) {
+ if (fn->might_sleep) {
+ verbose(env, "sleepable helper %s#%d in rcu_read_lock region\n",
+ func_id_name(func_id), func_id);
+ return -EINVAL;
+ }
+ }
+
+ if (env->cur_state->active_preempt_locks) {
+ if (fn->might_sleep) {
+ verbose(env, "sleepable helper %s#%d in non-preemptible region\n",
+ func_id_name(func_id), func_id);
+ return -EINVAL;
+ }
+ }
+
+ if (env->cur_state->active_irq_id) {
+ if (fn->might_sleep) {
+ verbose(env, "sleepable helper %s#%d in IRQ-disabled region\n",
+ func_id_name(func_id), func_id);
+ return -EINVAL;
+ }
+ }
+
+ /* Track non-sleepable context for helpers. */
+ if (!in_sleepable_context(env))
+ env->insn_aux_data[insn_idx].non_sleepable = true;
+
+ meta.func_id = func_id;
+ /* check args */
+ for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) {
+ err = check_func_arg(env, i, &meta, fn, insn_idx);
+ if (err)
+ return err;
+ }
+
+ err = record_func_map(env, &meta, func_id, insn_idx);
if (err)
return err;
- err = check_func_arg(env, BPF_REG_4, fn->arg4_type, &map);
+
+ err = record_func_key(env, &meta, func_id, insn_idx);
if (err)
return err;
- err = check_func_arg(env, BPF_REG_5, fn->arg5_type, &map);
+
+ /* Mark slots with STACK_MISC in case of raw mode, stack offset
+ * is inferred from register state.
+ */
+ for (i = 0; i < meta.access_size; i++) {
+ err = check_mem_access(env, insn_idx, meta.regno, i, BPF_B,
+ BPF_WRITE, -1, false, false);
+ if (err)
+ return err;
+ }
+
+ regs = cur_regs(env);
+
+ if (meta.release_regno) {
+ err = -EINVAL;
+ if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1])) {
+ err = unmark_stack_slots_dynptr(env, &regs[meta.release_regno]);
+ } else if (func_id == BPF_FUNC_kptr_xchg && meta.ref_obj_id) {
+ u32 ref_obj_id = meta.ref_obj_id;
+ bool in_rcu = in_rcu_cs(env);
+ struct bpf_func_state *state;
+ struct bpf_reg_state *reg;
+
+ err = release_reference_nomark(env->cur_state, ref_obj_id);
+ if (!err) {
+ bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
+ if (reg->ref_obj_id == ref_obj_id) {
+ if (in_rcu && (reg->type & MEM_ALLOC) && (reg->type & MEM_PERCPU)) {
+ reg->ref_obj_id = 0;
+ reg->type &= ~MEM_ALLOC;
+ reg->type |= MEM_RCU;
+ } else {
+ mark_reg_invalid(env, reg);
+ }
+ }
+ }));
+ }
+ } else if (meta.ref_obj_id) {
+ err = release_reference(env, meta.ref_obj_id);
+ } else if (register_is_null(&regs[meta.release_regno])) {
+ /* meta.ref_obj_id can only be 0 if register that is meant to be
+ * released is NULL, which must be > R0.
+ */
+ err = 0;
+ }
+ if (err) {
+ verbose(env, "func %s#%d reference has not been acquired before\n",
+ func_id_name(func_id), func_id);
+ return err;
+ }
+ }
+
+ switch (func_id) {
+ case BPF_FUNC_tail_call:
+ err = check_resource_leak(env, false, true, "tail_call");
+ if (err)
+ return err;
+ break;
+ case BPF_FUNC_get_local_storage:
+ /* check that flags argument in get_local_storage(map, flags) is 0,
+ * this is required because get_local_storage() can't return an error.
+ */
+ if (!register_is_null(&regs[BPF_REG_2])) {
+ verbose(env, "get_local_storage() doesn't support non-zero flags\n");
+ return -EINVAL;
+ }
+ break;
+ case BPF_FUNC_for_each_map_elem:
+ err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+ set_map_elem_callback_state);
+ break;
+ case BPF_FUNC_timer_set_callback:
+ err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+ set_timer_callback_state);
+ break;
+ case BPF_FUNC_find_vma:
+ err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+ set_find_vma_callback_state);
+ break;
+ case BPF_FUNC_snprintf:
+ err = check_bpf_snprintf_call(env, regs);
+ break;
+ case BPF_FUNC_loop:
+ update_loop_inline_state(env, meta.subprogno);
+ /* Verifier relies on R1 value to determine if bpf_loop() iteration
+ * is finished, thus mark it precise.
+ */
+ err = mark_chain_precision(env, BPF_REG_1);
+ if (err)
+ return err;
+ if (cur_func(env)->callback_depth < regs[BPF_REG_1].umax_value) {
+ err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+ set_loop_callback_state);
+ } else {
+ cur_func(env)->callback_depth = 0;
+ if (env->log.level & BPF_LOG_LEVEL2)
+ verbose(env, "frame%d bpf_loop iteration limit reached\n",
+ env->cur_state->curframe);
+ }
+ break;
+ case BPF_FUNC_dynptr_from_mem:
+ if (regs[BPF_REG_1].type != PTR_TO_MAP_VALUE) {
+ verbose(env, "Unsupported reg type %s for bpf_dynptr_from_mem data\n",
+ reg_type_str(env, regs[BPF_REG_1].type));
+ return -EACCES;
+ }
+ break;
+ case BPF_FUNC_set_retval:
+ if (prog_type == BPF_PROG_TYPE_LSM &&
+ env->prog->expected_attach_type == BPF_LSM_CGROUP) {
+ if (!env->prog->aux->attach_func_proto->type) {
+ /* Make sure programs that attach to void
+ * hooks don't try to modify return value.
+ */
+ verbose(env, "BPF_LSM_CGROUP that attach to void LSM hooks can't modify return value!\n");
+ return -EINVAL;
+ }
+ }
+ break;
+ case BPF_FUNC_dynptr_data:
+ {
+ struct bpf_reg_state *reg;
+ int id, ref_obj_id;
+
+ reg = get_dynptr_arg_reg(env, fn, regs);
+ if (!reg)
+ return -EFAULT;
+
+
+ if (meta.dynptr_id) {
+ verifier_bug(env, "meta.dynptr_id already set");
+ return -EFAULT;
+ }
+ if (meta.ref_obj_id) {
+ verifier_bug(env, "meta.ref_obj_id already set");
+ return -EFAULT;
+ }
+
+ id = dynptr_id(env, reg);
+ if (id < 0) {
+ verifier_bug(env, "failed to obtain dynptr id");
+ return id;
+ }
+
+ ref_obj_id = dynptr_ref_obj_id(env, reg);
+ if (ref_obj_id < 0) {
+ verifier_bug(env, "failed to obtain dynptr ref_obj_id");
+ return ref_obj_id;
+ }
+
+ meta.dynptr_id = id;
+ meta.ref_obj_id = ref_obj_id;
+
+ break;
+ }
+ case BPF_FUNC_dynptr_write:
+ {
+ enum bpf_dynptr_type dynptr_type;
+ struct bpf_reg_state *reg;
+
+ reg = get_dynptr_arg_reg(env, fn, regs);
+ if (!reg)
+ return -EFAULT;
+
+ dynptr_type = dynptr_get_type(env, reg);
+ if (dynptr_type == BPF_DYNPTR_TYPE_INVALID)
+ return -EFAULT;
+
+ if (dynptr_type == BPF_DYNPTR_TYPE_SKB ||
+ dynptr_type == BPF_DYNPTR_TYPE_SKB_META)
+ /* this will trigger clear_all_pkt_pointers(), which will
+ * invalidate all dynptr slices associated with the skb
+ */
+ changes_data = true;
+
+ break;
+ }
+ case BPF_FUNC_per_cpu_ptr:
+ case BPF_FUNC_this_cpu_ptr:
+ {
+ struct bpf_reg_state *reg = &regs[BPF_REG_1];
+ const struct btf_type *type;
+
+ if (reg->type & MEM_RCU) {
+ type = btf_type_by_id(reg->btf, reg->btf_id);
+ if (!type || !btf_type_is_struct(type)) {
+ verbose(env, "Helper has invalid btf/btf_id in R1\n");
+ return -EFAULT;
+ }
+ returns_cpu_specific_alloc_ptr = true;
+ env->insn_aux_data[insn_idx].call_with_percpu_alloc_ptr = true;
+ }
+ break;
+ }
+ case BPF_FUNC_user_ringbuf_drain:
+ err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+ set_user_ringbuf_callback_state);
+ break;
+ }
+
if (err)
return err;
/* reset caller saved regs */
for (i = 0; i < CALLER_SAVED_REGS; i++) {
- reg = regs + caller_saved[i];
- reg->type = NOT_INIT;
- reg->imm = 0;
+ mark_reg_not_init(env, regs, caller_saved[i]);
+ check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
}
- /* update return register */
- if (fn->ret_type == RET_INTEGER) {
- regs[BPF_REG_0].type = UNKNOWN_VALUE;
- } else if (fn->ret_type == RET_VOID) {
+ /* helper call returns 64-bit value. */
+ regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
+
+ /* update return register (already marked as written above) */
+ ret_type = fn->ret_type;
+ ret_flag = type_flag(ret_type);
+
+ switch (base_type(ret_type)) {
+ case RET_INTEGER:
+ /* sets type to SCALAR_VALUE */
+ mark_reg_unknown(env, regs, BPF_REG_0);
+ break;
+ case RET_VOID:
regs[BPF_REG_0].type = NOT_INIT;
- } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) {
- regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
+ break;
+ case RET_PTR_TO_MAP_VALUE:
+ /* There is no offset yet applied, variable or fixed */
+ mark_reg_known_zero(env, regs, BPF_REG_0);
/* remember map_ptr, so that check_map_access()
* can check 'value_size' boundary of memory access
* to map element returned from bpf_map_lookup_elem()
*/
- if (map == NULL) {
- verbose("kernel subsystem misconfigured verifier\n");
+ if (meta.map_ptr == NULL) {
+ verifier_bug(env, "unexpected null map_ptr");
+ return -EFAULT;
+ }
+
+ if (func_id == BPF_FUNC_map_lookup_elem &&
+ can_elide_value_nullness(meta.map_ptr->map_type) &&
+ meta.const_map_key >= 0 &&
+ meta.const_map_key < meta.map_ptr->max_entries)
+ ret_flag &= ~PTR_MAYBE_NULL;
+
+ regs[BPF_REG_0].map_ptr = meta.map_ptr;
+ regs[BPF_REG_0].map_uid = meta.map_uid;
+ regs[BPF_REG_0].type = PTR_TO_MAP_VALUE | ret_flag;
+ if (!type_may_be_null(ret_flag) &&
+ btf_record_has_field(meta.map_ptr->record, BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK)) {
+ regs[BPF_REG_0].id = ++env->id_gen;
+ }
+ break;
+ case RET_PTR_TO_SOCKET:
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_SOCKET | ret_flag;
+ break;
+ case RET_PTR_TO_SOCK_COMMON:
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON | ret_flag;
+ break;
+ case RET_PTR_TO_TCP_SOCK:
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_TCP_SOCK | ret_flag;
+ break;
+ case RET_PTR_TO_MEM:
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag;
+ regs[BPF_REG_0].mem_size = meta.mem_size;
+ break;
+ case RET_PTR_TO_MEM_OR_BTF_ID:
+ {
+ const struct btf_type *t;
+
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ t = btf_type_skip_modifiers(meta.ret_btf, meta.ret_btf_id, NULL);
+ if (!btf_type_is_struct(t)) {
+ u32 tsize;
+ const struct btf_type *ret;
+ const char *tname;
+
+ /* resolve the type size of ksym. */
+ ret = btf_resolve_size(meta.ret_btf, t, &tsize);
+ if (IS_ERR(ret)) {
+ tname = btf_name_by_offset(meta.ret_btf, t->name_off);
+ verbose(env, "unable to resolve the size of type '%s': %ld\n",
+ tname, PTR_ERR(ret));
+ return -EINVAL;
+ }
+ regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag;
+ regs[BPF_REG_0].mem_size = tsize;
+ } else {
+ if (returns_cpu_specific_alloc_ptr) {
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC | MEM_RCU;
+ } else {
+ /* MEM_RDONLY may be carried from ret_flag, but it
+ * doesn't apply on PTR_TO_BTF_ID. Fold it, otherwise
+ * it will confuse the check of PTR_TO_BTF_ID in
+ * check_mem_access().
+ */
+ ret_flag &= ~MEM_RDONLY;
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag;
+ }
+
+ regs[BPF_REG_0].btf = meta.ret_btf;
+ regs[BPF_REG_0].btf_id = meta.ret_btf_id;
+ }
+ break;
+ }
+ case RET_PTR_TO_BTF_ID:
+ {
+ struct btf *ret_btf;
+ int ret_btf_id;
+
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag;
+ if (func_id == BPF_FUNC_kptr_xchg) {
+ ret_btf = meta.kptr_field->kptr.btf;
+ ret_btf_id = meta.kptr_field->kptr.btf_id;
+ if (!btf_is_kernel(ret_btf)) {
+ regs[BPF_REG_0].type |= MEM_ALLOC;
+ if (meta.kptr_field->type == BPF_KPTR_PERCPU)
+ regs[BPF_REG_0].type |= MEM_PERCPU;
+ }
+ } else {
+ if (fn->ret_btf_id == BPF_PTR_POISON) {
+ verifier_bug(env, "func %s has non-overwritten BPF_PTR_POISON return type",
+ func_id_name(func_id));
+ return -EFAULT;
+ }
+ ret_btf = btf_vmlinux;
+ ret_btf_id = *fn->ret_btf_id;
+ }
+ if (ret_btf_id == 0) {
+ verbose(env, "invalid return type %u of func %s#%d\n",
+ base_type(ret_type), func_id_name(func_id),
+ func_id);
return -EINVAL;
}
- regs[BPF_REG_0].map_ptr = map;
+ regs[BPF_REG_0].btf = ret_btf;
+ regs[BPF_REG_0].btf_id = ret_btf_id;
+ break;
+ }
+ default:
+ verbose(env, "unknown return type %u of func %s#%d\n",
+ base_type(ret_type), func_id_name(func_id), func_id);
+ return -EINVAL;
+ }
+
+ if (type_may_be_null(regs[BPF_REG_0].type))
+ regs[BPF_REG_0].id = ++env->id_gen;
+
+ if (helper_multiple_ref_obj_use(func_id, meta.map_ptr)) {
+ verifier_bug(env, "func %s#%d sets ref_obj_id more than once",
+ func_id_name(func_id), func_id);
+ return -EFAULT;
+ }
+
+ if (is_dynptr_ref_function(func_id))
+ regs[BPF_REG_0].dynptr_id = meta.dynptr_id;
+
+ if (is_ptr_cast_function(func_id) || is_dynptr_ref_function(func_id)) {
+ /* For release_reference() */
+ regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id;
+ } else if (is_acquire_function(func_id, meta.map_ptr)) {
+ int id = acquire_reference(env, insn_idx);
+
+ if (id < 0)
+ return id;
+ /* For mark_ptr_or_null_reg() */
+ regs[BPF_REG_0].id = id;
+ /* For release_reference() */
+ regs[BPF_REG_0].ref_obj_id = id;
+ }
+
+ err = do_refine_retval_range(env, regs, fn->ret_type, func_id, &meta);
+ if (err)
+ return err;
+
+ err = check_map_func_compatibility(env, meta.map_ptr, func_id);
+ if (err)
+ return err;
+
+ if ((func_id == BPF_FUNC_get_stack ||
+ func_id == BPF_FUNC_get_task_stack) &&
+ !env->prog->has_callchain_buf) {
+ const char *err_str;
+
+#ifdef CONFIG_PERF_EVENTS
+ err = get_callchain_buffers(sysctl_perf_event_max_stack);
+ err_str = "cannot get callchain buffer for func %s#%d\n";
+#else
+ err = -ENOTSUPP;
+ err_str = "func %s#%d not supported without CONFIG_PERF_EVENTS\n";
+#endif
+ if (err) {
+ verbose(env, err_str, func_id_name(func_id), func_id);
+ return err;
+ }
+
+ env->prog->has_callchain_buf = true;
+ }
+
+ if (func_id == BPF_FUNC_get_stackid || func_id == BPF_FUNC_get_stack)
+ env->prog->call_get_stack = true;
+
+ if (func_id == BPF_FUNC_get_func_ip) {
+ if (check_get_func_ip(env))
+ return -ENOTSUPP;
+ env->prog->call_get_func_ip = true;
+ }
+
+ if (func_id == BPF_FUNC_tail_call) {
+ if (env->cur_state->curframe) {
+ struct bpf_verifier_state *branch;
+
+ mark_reg_scratched(env, BPF_REG_0);
+ branch = push_stack(env, env->insn_idx + 1, env->insn_idx, false);
+ if (IS_ERR(branch))
+ return PTR_ERR(branch);
+ clear_all_pkt_pointers(env);
+ mark_reg_unknown(env, regs, BPF_REG_0);
+ err = prepare_func_exit(env, &env->insn_idx);
+ if (err)
+ return err;
+ env->insn_idx--;
+ } else {
+ changes_data = false;
+ }
+ }
+
+ if (changes_data)
+ clear_all_pkt_pointers(env);
+ return 0;
+}
+
+/* mark_btf_func_reg_size() is used when the reg size is determined by
+ * the BTF func_proto's return value size and argument.
+ */
+static void __mark_btf_func_reg_size(struct bpf_verifier_env *env, struct bpf_reg_state *regs,
+ u32 regno, size_t reg_size)
+{
+ struct bpf_reg_state *reg = &regs[regno];
+
+ if (regno == BPF_REG_0) {
+ /* Function return value */
+ reg->subreg_def = reg_size == sizeof(u64) ?
+ DEF_NOT_SUBREG : env->insn_idx + 1;
+ } else if (reg_size == sizeof(u64)) {
+ /* Function argument */
+ mark_insn_zext(env, reg);
+ }
+}
+
+static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno,
+ size_t reg_size)
+{
+ return __mark_btf_func_reg_size(env, cur_regs(env), regno, reg_size);
+}
+
+static bool is_kfunc_acquire(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_ACQUIRE;
+}
+
+static bool is_kfunc_release(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_RELEASE;
+}
+
+static bool is_kfunc_trusted_args(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return (meta->kfunc_flags & KF_TRUSTED_ARGS) || is_kfunc_release(meta);
+}
+
+static bool is_kfunc_sleepable(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_SLEEPABLE;
+}
+
+static bool is_kfunc_destructive(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_DESTRUCTIVE;
+}
+
+static bool is_kfunc_rcu(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_RCU;
+}
+
+static bool is_kfunc_rcu_protected(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_RCU_PROTECTED;
+}
+
+static bool is_kfunc_arg_mem_size(const struct btf *btf,
+ const struct btf_param *arg,
+ const struct bpf_reg_state *reg)
+{
+ const struct btf_type *t;
+
+ t = btf_type_skip_modifiers(btf, arg->type, NULL);
+ if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE)
+ return false;
+
+ return btf_param_match_suffix(btf, arg, "__sz");
+}
+
+static bool is_kfunc_arg_const_mem_size(const struct btf *btf,
+ const struct btf_param *arg,
+ const struct bpf_reg_state *reg)
+{
+ const struct btf_type *t;
+
+ t = btf_type_skip_modifiers(btf, arg->type, NULL);
+ if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE)
+ return false;
+
+ return btf_param_match_suffix(btf, arg, "__szk");
+}
+
+static bool is_kfunc_arg_optional(const struct btf *btf, const struct btf_param *arg)
+{
+ return btf_param_match_suffix(btf, arg, "__opt");
+}
+
+static bool is_kfunc_arg_constant(const struct btf *btf, const struct btf_param *arg)
+{
+ return btf_param_match_suffix(btf, arg, "__k");
+}
+
+static bool is_kfunc_arg_ignore(const struct btf *btf, const struct btf_param *arg)
+{
+ return btf_param_match_suffix(btf, arg, "__ign");
+}
+
+static bool is_kfunc_arg_map(const struct btf *btf, const struct btf_param *arg)
+{
+ return btf_param_match_suffix(btf, arg, "__map");
+}
+
+static bool is_kfunc_arg_alloc_obj(const struct btf *btf, const struct btf_param *arg)
+{
+ return btf_param_match_suffix(btf, arg, "__alloc");
+}
+
+static bool is_kfunc_arg_uninit(const struct btf *btf, const struct btf_param *arg)
+{
+ return btf_param_match_suffix(btf, arg, "__uninit");
+}
+
+static bool is_kfunc_arg_refcounted_kptr(const struct btf *btf, const struct btf_param *arg)
+{
+ return btf_param_match_suffix(btf, arg, "__refcounted_kptr");
+}
+
+static bool is_kfunc_arg_nullable(const struct btf *btf, const struct btf_param *arg)
+{
+ return btf_param_match_suffix(btf, arg, "__nullable");
+}
+
+static bool is_kfunc_arg_const_str(const struct btf *btf, const struct btf_param *arg)
+{
+ return btf_param_match_suffix(btf, arg, "__str");
+}
+
+static bool is_kfunc_arg_irq_flag(const struct btf *btf, const struct btf_param *arg)
+{
+ return btf_param_match_suffix(btf, arg, "__irq_flag");
+}
+
+static bool is_kfunc_arg_prog(const struct btf *btf, const struct btf_param *arg)
+{
+ return btf_param_match_suffix(btf, arg, "__prog");
+}
+
+static bool is_kfunc_arg_scalar_with_name(const struct btf *btf,
+ const struct btf_param *arg,
+ const char *name)
+{
+ int len, target_len = strlen(name);
+ const char *param_name;
+
+ param_name = btf_name_by_offset(btf, arg->name_off);
+ if (str_is_empty(param_name))
+ return false;
+ len = strlen(param_name);
+ if (len != target_len)
+ return false;
+ if (strcmp(param_name, name))
+ return false;
+
+ return true;
+}
+
+enum {
+ KF_ARG_DYNPTR_ID,
+ KF_ARG_LIST_HEAD_ID,
+ KF_ARG_LIST_NODE_ID,
+ KF_ARG_RB_ROOT_ID,
+ KF_ARG_RB_NODE_ID,
+ KF_ARG_WORKQUEUE_ID,
+ KF_ARG_RES_SPIN_LOCK_ID,
+ KF_ARG_TASK_WORK_ID,
+};
+
+BTF_ID_LIST(kf_arg_btf_ids)
+BTF_ID(struct, bpf_dynptr)
+BTF_ID(struct, bpf_list_head)
+BTF_ID(struct, bpf_list_node)
+BTF_ID(struct, bpf_rb_root)
+BTF_ID(struct, bpf_rb_node)
+BTF_ID(struct, bpf_wq)
+BTF_ID(struct, bpf_res_spin_lock)
+BTF_ID(struct, bpf_task_work)
+
+static bool __is_kfunc_ptr_arg_type(const struct btf *btf,
+ const struct btf_param *arg, int type)
+{
+ const struct btf_type *t;
+ u32 res_id;
+
+ t = btf_type_skip_modifiers(btf, arg->type, NULL);
+ if (!t)
+ return false;
+ if (!btf_type_is_ptr(t))
+ return false;
+ t = btf_type_skip_modifiers(btf, t->type, &res_id);
+ if (!t)
+ return false;
+ return btf_types_are_same(btf, res_id, btf_vmlinux, kf_arg_btf_ids[type]);
+}
+
+static bool is_kfunc_arg_dynptr(const struct btf *btf, const struct btf_param *arg)
+{
+ return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_DYNPTR_ID);
+}
+
+static bool is_kfunc_arg_list_head(const struct btf *btf, const struct btf_param *arg)
+{
+ return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_LIST_HEAD_ID);
+}
+
+static bool is_kfunc_arg_list_node(const struct btf *btf, const struct btf_param *arg)
+{
+ return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_LIST_NODE_ID);
+}
+
+static bool is_kfunc_arg_rbtree_root(const struct btf *btf, const struct btf_param *arg)
+{
+ return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RB_ROOT_ID);
+}
+
+static bool is_kfunc_arg_rbtree_node(const struct btf *btf, const struct btf_param *arg)
+{
+ return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RB_NODE_ID);
+}
+
+static bool is_kfunc_arg_wq(const struct btf *btf, const struct btf_param *arg)
+{
+ return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_WORKQUEUE_ID);
+}
+
+static bool is_kfunc_arg_task_work(const struct btf *btf, const struct btf_param *arg)
+{
+ return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_TASK_WORK_ID);
+}
+
+static bool is_kfunc_arg_res_spin_lock(const struct btf *btf, const struct btf_param *arg)
+{
+ return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RES_SPIN_LOCK_ID);
+}
+
+static bool is_rbtree_node_type(const struct btf_type *t)
+{
+ return t == btf_type_by_id(btf_vmlinux, kf_arg_btf_ids[KF_ARG_RB_NODE_ID]);
+}
+
+static bool is_list_node_type(const struct btf_type *t)
+{
+ return t == btf_type_by_id(btf_vmlinux, kf_arg_btf_ids[KF_ARG_LIST_NODE_ID]);
+}
+
+static bool is_kfunc_arg_callback(struct bpf_verifier_env *env, const struct btf *btf,
+ const struct btf_param *arg)
+{
+ const struct btf_type *t;
+
+ t = btf_type_resolve_func_ptr(btf, arg->type, NULL);
+ if (!t)
+ return false;
+
+ return true;
+}
+
+/* Returns true if struct is composed of scalars, 4 levels of nesting allowed */
+static bool __btf_type_is_scalar_struct(struct bpf_verifier_env *env,
+ const struct btf *btf,
+ const struct btf_type *t, int rec)
+{
+ const struct btf_type *member_type;
+ const struct btf_member *member;
+ u32 i;
+
+ if (!btf_type_is_struct(t))
+ return false;
+
+ for_each_member(i, t, member) {
+ const struct btf_array *array;
+
+ member_type = btf_type_skip_modifiers(btf, member->type, NULL);
+ if (btf_type_is_struct(member_type)) {
+ if (rec >= 3) {
+ verbose(env, "max struct nesting depth exceeded\n");
+ return false;
+ }
+ if (!__btf_type_is_scalar_struct(env, btf, member_type, rec + 1))
+ return false;
+ continue;
+ }
+ if (btf_type_is_array(member_type)) {
+ array = btf_array(member_type);
+ if (!array->nelems)
+ return false;
+ member_type = btf_type_skip_modifiers(btf, array->type, NULL);
+ if (!btf_type_is_scalar(member_type))
+ return false;
+ continue;
+ }
+ if (!btf_type_is_scalar(member_type))
+ return false;
+ }
+ return true;
+}
+
+enum kfunc_ptr_arg_type {
+ KF_ARG_PTR_TO_CTX,
+ KF_ARG_PTR_TO_ALLOC_BTF_ID, /* Allocated object */
+ KF_ARG_PTR_TO_REFCOUNTED_KPTR, /* Refcounted local kptr */
+ KF_ARG_PTR_TO_DYNPTR,
+ KF_ARG_PTR_TO_ITER,
+ KF_ARG_PTR_TO_LIST_HEAD,
+ KF_ARG_PTR_TO_LIST_NODE,
+ KF_ARG_PTR_TO_BTF_ID, /* Also covers reg2btf_ids conversions */
+ KF_ARG_PTR_TO_MEM,
+ KF_ARG_PTR_TO_MEM_SIZE, /* Size derived from next argument, skip it */
+ KF_ARG_PTR_TO_CALLBACK,
+ KF_ARG_PTR_TO_RB_ROOT,
+ KF_ARG_PTR_TO_RB_NODE,
+ KF_ARG_PTR_TO_NULL,
+ KF_ARG_PTR_TO_CONST_STR,
+ KF_ARG_PTR_TO_MAP,
+ KF_ARG_PTR_TO_WORKQUEUE,
+ KF_ARG_PTR_TO_IRQ_FLAG,
+ KF_ARG_PTR_TO_RES_SPIN_LOCK,
+ KF_ARG_PTR_TO_TASK_WORK,
+};
+
+enum special_kfunc_type {
+ KF_bpf_obj_new_impl,
+ KF_bpf_obj_drop_impl,
+ KF_bpf_refcount_acquire_impl,
+ KF_bpf_list_push_front_impl,
+ KF_bpf_list_push_back_impl,
+ KF_bpf_list_pop_front,
+ KF_bpf_list_pop_back,
+ KF_bpf_list_front,
+ KF_bpf_list_back,
+ KF_bpf_cast_to_kern_ctx,
+ KF_bpf_rdonly_cast,
+ KF_bpf_rcu_read_lock,
+ KF_bpf_rcu_read_unlock,
+ KF_bpf_rbtree_remove,
+ KF_bpf_rbtree_add_impl,
+ KF_bpf_rbtree_first,
+ KF_bpf_rbtree_root,
+ KF_bpf_rbtree_left,
+ KF_bpf_rbtree_right,
+ KF_bpf_dynptr_from_skb,
+ KF_bpf_dynptr_from_xdp,
+ KF_bpf_dynptr_from_skb_meta,
+ KF_bpf_xdp_pull_data,
+ KF_bpf_dynptr_slice,
+ KF_bpf_dynptr_slice_rdwr,
+ KF_bpf_dynptr_clone,
+ KF_bpf_percpu_obj_new_impl,
+ KF_bpf_percpu_obj_drop_impl,
+ KF_bpf_throw,
+ KF_bpf_wq_set_callback_impl,
+ KF_bpf_preempt_disable,
+ KF_bpf_preempt_enable,
+ KF_bpf_iter_css_task_new,
+ KF_bpf_session_cookie,
+ KF_bpf_get_kmem_cache,
+ KF_bpf_local_irq_save,
+ KF_bpf_local_irq_restore,
+ KF_bpf_iter_num_new,
+ KF_bpf_iter_num_next,
+ KF_bpf_iter_num_destroy,
+ KF_bpf_set_dentry_xattr,
+ KF_bpf_remove_dentry_xattr,
+ KF_bpf_res_spin_lock,
+ KF_bpf_res_spin_unlock,
+ KF_bpf_res_spin_lock_irqsave,
+ KF_bpf_res_spin_unlock_irqrestore,
+ KF_bpf_dynptr_from_file,
+ KF_bpf_dynptr_file_discard,
+ KF___bpf_trap,
+ KF_bpf_task_work_schedule_signal_impl,
+ KF_bpf_task_work_schedule_resume_impl,
+};
+
+BTF_ID_LIST(special_kfunc_list)
+BTF_ID(func, bpf_obj_new_impl)
+BTF_ID(func, bpf_obj_drop_impl)
+BTF_ID(func, bpf_refcount_acquire_impl)
+BTF_ID(func, bpf_list_push_front_impl)
+BTF_ID(func, bpf_list_push_back_impl)
+BTF_ID(func, bpf_list_pop_front)
+BTF_ID(func, bpf_list_pop_back)
+BTF_ID(func, bpf_list_front)
+BTF_ID(func, bpf_list_back)
+BTF_ID(func, bpf_cast_to_kern_ctx)
+BTF_ID(func, bpf_rdonly_cast)
+BTF_ID(func, bpf_rcu_read_lock)
+BTF_ID(func, bpf_rcu_read_unlock)
+BTF_ID(func, bpf_rbtree_remove)
+BTF_ID(func, bpf_rbtree_add_impl)
+BTF_ID(func, bpf_rbtree_first)
+BTF_ID(func, bpf_rbtree_root)
+BTF_ID(func, bpf_rbtree_left)
+BTF_ID(func, bpf_rbtree_right)
+#ifdef CONFIG_NET
+BTF_ID(func, bpf_dynptr_from_skb)
+BTF_ID(func, bpf_dynptr_from_xdp)
+BTF_ID(func, bpf_dynptr_from_skb_meta)
+BTF_ID(func, bpf_xdp_pull_data)
+#else
+BTF_ID_UNUSED
+BTF_ID_UNUSED
+BTF_ID_UNUSED
+BTF_ID_UNUSED
+#endif
+BTF_ID(func, bpf_dynptr_slice)
+BTF_ID(func, bpf_dynptr_slice_rdwr)
+BTF_ID(func, bpf_dynptr_clone)
+BTF_ID(func, bpf_percpu_obj_new_impl)
+BTF_ID(func, bpf_percpu_obj_drop_impl)
+BTF_ID(func, bpf_throw)
+BTF_ID(func, bpf_wq_set_callback_impl)
+BTF_ID(func, bpf_preempt_disable)
+BTF_ID(func, bpf_preempt_enable)
+#ifdef CONFIG_CGROUPS
+BTF_ID(func, bpf_iter_css_task_new)
+#else
+BTF_ID_UNUSED
+#endif
+#ifdef CONFIG_BPF_EVENTS
+BTF_ID(func, bpf_session_cookie)
+#else
+BTF_ID_UNUSED
+#endif
+BTF_ID(func, bpf_get_kmem_cache)
+BTF_ID(func, bpf_local_irq_save)
+BTF_ID(func, bpf_local_irq_restore)
+BTF_ID(func, bpf_iter_num_new)
+BTF_ID(func, bpf_iter_num_next)
+BTF_ID(func, bpf_iter_num_destroy)
+#ifdef CONFIG_BPF_LSM
+BTF_ID(func, bpf_set_dentry_xattr)
+BTF_ID(func, bpf_remove_dentry_xattr)
+#else
+BTF_ID_UNUSED
+BTF_ID_UNUSED
+#endif
+BTF_ID(func, bpf_res_spin_lock)
+BTF_ID(func, bpf_res_spin_unlock)
+BTF_ID(func, bpf_res_spin_lock_irqsave)
+BTF_ID(func, bpf_res_spin_unlock_irqrestore)
+BTF_ID(func, bpf_dynptr_from_file)
+BTF_ID(func, bpf_dynptr_file_discard)
+BTF_ID(func, __bpf_trap)
+BTF_ID(func, bpf_task_work_schedule_signal_impl)
+BTF_ID(func, bpf_task_work_schedule_resume_impl)
+
+static bool is_task_work_add_kfunc(u32 func_id)
+{
+ return func_id == special_kfunc_list[KF_bpf_task_work_schedule_signal_impl] ||
+ func_id == special_kfunc_list[KF_bpf_task_work_schedule_resume_impl];
+}
+
+static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
+{
+ if (meta->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl] &&
+ meta->arg_owning_ref) {
+ return false;
+ }
+
+ return meta->kfunc_flags & KF_RET_NULL;
+}
+
+static bool is_kfunc_bpf_rcu_read_lock(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->func_id == special_kfunc_list[KF_bpf_rcu_read_lock];
+}
+
+static bool is_kfunc_bpf_rcu_read_unlock(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->func_id == special_kfunc_list[KF_bpf_rcu_read_unlock];
+}
+
+static bool is_kfunc_bpf_preempt_disable(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->func_id == special_kfunc_list[KF_bpf_preempt_disable];
+}
+
+static bool is_kfunc_bpf_preempt_enable(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->func_id == special_kfunc_list[KF_bpf_preempt_enable];
+}
+
+static bool is_kfunc_pkt_changing(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->func_id == special_kfunc_list[KF_bpf_xdp_pull_data];
+}
+
+static enum kfunc_ptr_arg_type
+get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
+ struct bpf_kfunc_call_arg_meta *meta,
+ const struct btf_type *t, const struct btf_type *ref_t,
+ const char *ref_tname, const struct btf_param *args,
+ int argno, int nargs)
+{
+ u32 regno = argno + 1;
+ struct bpf_reg_state *regs = cur_regs(env);
+ struct bpf_reg_state *reg = &regs[regno];
+ bool arg_mem_size = false;
+
+ if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx])
+ return KF_ARG_PTR_TO_CTX;
+
+ /* In this function, we verify the kfunc's BTF as per the argument type,
+ * leaving the rest of the verification with respect to the register
+ * type to our caller. When a set of conditions hold in the BTF type of
+ * arguments, we resolve it to a known kfunc_ptr_arg_type.
+ */
+ if (btf_is_prog_ctx_type(&env->log, meta->btf, t, resolve_prog_type(env->prog), argno))
+ return KF_ARG_PTR_TO_CTX;
+
+ if (is_kfunc_arg_nullable(meta->btf, &args[argno]) && register_is_null(reg))
+ return KF_ARG_PTR_TO_NULL;
+
+ if (is_kfunc_arg_alloc_obj(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_ALLOC_BTF_ID;
+
+ if (is_kfunc_arg_refcounted_kptr(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_REFCOUNTED_KPTR;
+
+ if (is_kfunc_arg_dynptr(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_DYNPTR;
+
+ if (is_kfunc_arg_iter(meta, argno, &args[argno]))
+ return KF_ARG_PTR_TO_ITER;
+
+ if (is_kfunc_arg_list_head(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_LIST_HEAD;
+
+ if (is_kfunc_arg_list_node(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_LIST_NODE;
+
+ if (is_kfunc_arg_rbtree_root(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_RB_ROOT;
+
+ if (is_kfunc_arg_rbtree_node(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_RB_NODE;
+
+ if (is_kfunc_arg_const_str(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_CONST_STR;
+
+ if (is_kfunc_arg_map(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_MAP;
+
+ if (is_kfunc_arg_wq(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_WORKQUEUE;
+
+ if (is_kfunc_arg_task_work(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_TASK_WORK;
+
+ if (is_kfunc_arg_irq_flag(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_IRQ_FLAG;
+
+ if (is_kfunc_arg_res_spin_lock(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_RES_SPIN_LOCK;
+
+ if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) {
+ if (!btf_type_is_struct(ref_t)) {
+ verbose(env, "kernel function %s args#%d pointer type %s %s is not supported\n",
+ meta->func_name, argno, btf_type_str(ref_t), ref_tname);
+ return -EINVAL;
+ }
+ return KF_ARG_PTR_TO_BTF_ID;
+ }
+
+ if (is_kfunc_arg_callback(env, meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_CALLBACK;
+
+ if (argno + 1 < nargs &&
+ (is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1]) ||
+ is_kfunc_arg_const_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1])))
+ arg_mem_size = true;
+
+ /* This is the catch all argument type of register types supported by
+ * check_helper_mem_access. However, we only allow when argument type is
+ * pointer to scalar, or struct composed (recursively) of scalars. When
+ * arg_mem_size is true, the pointer can be void *.
+ */
+ if (!btf_type_is_scalar(ref_t) && !__btf_type_is_scalar_struct(env, meta->btf, ref_t, 0) &&
+ (arg_mem_size ? !btf_type_is_void(ref_t) : 1)) {
+ verbose(env, "arg#%d pointer type %s %s must point to %sscalar, or struct with scalar\n",
+ argno, btf_type_str(ref_t), ref_tname, arg_mem_size ? "void, " : "");
+ return -EINVAL;
+ }
+ return arg_mem_size ? KF_ARG_PTR_TO_MEM_SIZE : KF_ARG_PTR_TO_MEM;
+}
+
+static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg,
+ const struct btf_type *ref_t,
+ const char *ref_tname, u32 ref_id,
+ struct bpf_kfunc_call_arg_meta *meta,
+ int argno)
+{
+ const struct btf_type *reg_ref_t;
+ bool strict_type_match = false;
+ const struct btf *reg_btf;
+ const char *reg_ref_tname;
+ bool taking_projection;
+ bool struct_same;
+ u32 reg_ref_id;
+
+ if (base_type(reg->type) == PTR_TO_BTF_ID) {
+ reg_btf = reg->btf;
+ reg_ref_id = reg->btf_id;
} else {
- verbose("unknown return type %d of func %d\n",
- fn->ret_type, func_id);
+ reg_btf = btf_vmlinux;
+ reg_ref_id = *reg2btf_ids[base_type(reg->type)];
+ }
+
+ /* Enforce strict type matching for calls to kfuncs that are acquiring
+ * or releasing a reference, or are no-cast aliases. We do _not_
+ * enforce strict matching for plain KF_TRUSTED_ARGS kfuncs by default,
+ * as we want to enable BPF programs to pass types that are bitwise
+ * equivalent without forcing them to explicitly cast with something
+ * like bpf_cast_to_kern_ctx().
+ *
+ * For example, say we had a type like the following:
+ *
+ * struct bpf_cpumask {
+ * cpumask_t cpumask;
+ * refcount_t usage;
+ * };
+ *
+ * Note that as specified in <linux/cpumask.h>, cpumask_t is typedef'ed
+ * to a struct cpumask, so it would be safe to pass a struct
+ * bpf_cpumask * to a kfunc expecting a struct cpumask *.
+ *
+ * The philosophy here is similar to how we allow scalars of different
+ * types to be passed to kfuncs as long as the size is the same. The
+ * only difference here is that we're simply allowing
+ * btf_struct_ids_match() to walk the struct at the 0th offset, and
+ * resolve types.
+ */
+ if ((is_kfunc_release(meta) && reg->ref_obj_id) ||
+ btf_type_ids_nocast_alias(&env->log, reg_btf, reg_ref_id, meta->btf, ref_id))
+ strict_type_match = true;
+
+ WARN_ON_ONCE(is_kfunc_release(meta) &&
+ (reg->off || !tnum_is_const(reg->var_off) ||
+ reg->var_off.value));
+
+ reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id, &reg_ref_id);
+ reg_ref_tname = btf_name_by_offset(reg_btf, reg_ref_t->name_off);
+ struct_same = btf_struct_ids_match(&env->log, reg_btf, reg_ref_id, reg->off, meta->btf, ref_id, strict_type_match);
+ /* If kfunc is accepting a projection type (ie. __sk_buff), it cannot
+ * actually use it -- it must cast to the underlying type. So we allow
+ * caller to pass in the underlying type.
+ */
+ taking_projection = btf_is_projection_of(ref_tname, reg_ref_tname);
+ if (!taking_projection && !struct_same) {
+ verbose(env, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n",
+ meta->func_name, argno, btf_type_str(ref_t), ref_tname, argno + 1,
+ btf_type_str(reg_ref_t), reg_ref_tname);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int process_irq_flag(struct bpf_verifier_env *env, int regno,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ int err, kfunc_class = IRQ_NATIVE_KFUNC;
+ bool irq_save;
+
+ if (meta->func_id == special_kfunc_list[KF_bpf_local_irq_save] ||
+ meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave]) {
+ irq_save = true;
+ if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave])
+ kfunc_class = IRQ_LOCK_KFUNC;
+ } else if (meta->func_id == special_kfunc_list[KF_bpf_local_irq_restore] ||
+ meta->func_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore]) {
+ irq_save = false;
+ if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore])
+ kfunc_class = IRQ_LOCK_KFUNC;
+ } else {
+ verifier_bug(env, "unknown irq flags kfunc");
+ return -EFAULT;
+ }
+
+ if (irq_save) {
+ if (!is_irq_flag_reg_valid_uninit(env, reg)) {
+ verbose(env, "expected uninitialized irq flag as arg#%d\n", regno - 1);
+ return -EINVAL;
+ }
+
+ err = check_mem_access(env, env->insn_idx, regno, 0, BPF_DW, BPF_WRITE, -1, false, false);
+ if (err)
+ return err;
+
+ err = mark_stack_slot_irq_flag(env, meta, reg, env->insn_idx, kfunc_class);
+ if (err)
+ return err;
+ } else {
+ err = is_irq_flag_reg_valid_init(env, reg);
+ if (err) {
+ verbose(env, "expected an initialized irq flag as arg#%d\n", regno - 1);
+ return err;
+ }
+
+ err = mark_irq_flag_read(env, reg);
+ if (err)
+ return err;
+
+ err = unmark_stack_slot_irq_flag(env, reg, kfunc_class);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
+
+static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ struct btf_record *rec = reg_btf_record(reg);
+
+ if (!env->cur_state->active_locks) {
+ verifier_bug(env, "%s w/o active lock", __func__);
+ return -EFAULT;
+ }
+
+ if (type_flag(reg->type) & NON_OWN_REF) {
+ verifier_bug(env, "NON_OWN_REF already set");
+ return -EFAULT;
+ }
+
+ reg->type |= NON_OWN_REF;
+ if (rec->refcount_off >= 0)
+ reg->type |= MEM_RCU;
+
+ return 0;
+}
+
+static int ref_convert_owning_non_owning(struct bpf_verifier_env *env, u32 ref_obj_id)
+{
+ struct bpf_verifier_state *state = env->cur_state;
+ struct bpf_func_state *unused;
+ struct bpf_reg_state *reg;
+ int i;
+
+ if (!ref_obj_id) {
+ verifier_bug(env, "ref_obj_id is zero for owning -> non-owning conversion");
+ return -EFAULT;
+ }
+
+ for (i = 0; i < state->acquired_refs; i++) {
+ if (state->refs[i].id != ref_obj_id)
+ continue;
+
+ /* Clear ref_obj_id here so release_reference doesn't clobber
+ * the whole reg
+ */
+ bpf_for_each_reg_in_vstate(env->cur_state, unused, reg, ({
+ if (reg->ref_obj_id == ref_obj_id) {
+ reg->ref_obj_id = 0;
+ ref_set_non_owning(env, reg);
+ }
+ }));
+ return 0;
+ }
+
+ verifier_bug(env, "ref state missing for ref_obj_id");
+ return -EFAULT;
+}
+
+/* Implementation details:
+ *
+ * Each register points to some region of memory, which we define as an
+ * allocation. Each allocation may embed a bpf_spin_lock which protects any
+ * special BPF objects (bpf_list_head, bpf_rb_root, etc.) part of the same
+ * allocation. The lock and the data it protects are colocated in the same
+ * memory region.
+ *
+ * Hence, everytime a register holds a pointer value pointing to such
+ * allocation, the verifier preserves a unique reg->id for it.
+ *
+ * The verifier remembers the lock 'ptr' and the lock 'id' whenever
+ * bpf_spin_lock is called.
+ *
+ * To enable this, lock state in the verifier captures two values:
+ * active_lock.ptr = Register's type specific pointer
+ * active_lock.id = A unique ID for each register pointer value
+ *
+ * Currently, PTR_TO_MAP_VALUE and PTR_TO_BTF_ID | MEM_ALLOC are the two
+ * supported register types.
+ *
+ * The active_lock.ptr in case of map values is the reg->map_ptr, and in case of
+ * allocated objects is the reg->btf pointer.
+ *
+ * The active_lock.id is non-unique for maps supporting direct_value_addr, as we
+ * can establish the provenance of the map value statically for each distinct
+ * lookup into such maps. They always contain a single map value hence unique
+ * IDs for each pseudo load pessimizes the algorithm and rejects valid programs.
+ *
+ * So, in case of global variables, they use array maps with max_entries = 1,
+ * hence their active_lock.ptr becomes map_ptr and id = 0 (since they all point
+ * into the same map value as max_entries is 1, as described above).
+ *
+ * In case of inner map lookups, the inner map pointer has same map_ptr as the
+ * outer map pointer (in verifier context), but each lookup into an inner map
+ * assigns a fresh reg->id to the lookup, so while lookups into distinct inner
+ * maps from the same outer map share the same map_ptr as active_lock.ptr, they
+ * will get different reg->id assigned to each lookup, hence different
+ * active_lock.id.
+ *
+ * In case of allocated objects, active_lock.ptr is the reg->btf, and the
+ * reg->id is a unique ID preserved after the NULL pointer check on the pointer
+ * returned from bpf_obj_new. Each allocation receives a new reg->id.
+ */
+static int check_reg_allocation_locked(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ struct bpf_reference_state *s;
+ void *ptr;
+ u32 id;
+
+ switch ((int)reg->type) {
+ case PTR_TO_MAP_VALUE:
+ ptr = reg->map_ptr;
+ break;
+ case PTR_TO_BTF_ID | MEM_ALLOC:
+ ptr = reg->btf;
+ break;
+ default:
+ verifier_bug(env, "unknown reg type for lock check");
+ return -EFAULT;
+ }
+ id = reg->id;
+
+ if (!env->cur_state->active_locks)
+ return -EINVAL;
+ s = find_lock_state(env->cur_state, REF_TYPE_LOCK_MASK, id, ptr);
+ if (!s) {
+ verbose(env, "held lock and object are not in the same allocation\n");
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static bool is_bpf_list_api_kfunc(u32 btf_id)
+{
+ return btf_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
+ btf_id == special_kfunc_list[KF_bpf_list_push_back_impl] ||
+ btf_id == special_kfunc_list[KF_bpf_list_pop_front] ||
+ btf_id == special_kfunc_list[KF_bpf_list_pop_back] ||
+ btf_id == special_kfunc_list[KF_bpf_list_front] ||
+ btf_id == special_kfunc_list[KF_bpf_list_back];
+}
+
+static bool is_bpf_rbtree_api_kfunc(u32 btf_id)
+{
+ return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl] ||
+ btf_id == special_kfunc_list[KF_bpf_rbtree_remove] ||
+ btf_id == special_kfunc_list[KF_bpf_rbtree_first] ||
+ btf_id == special_kfunc_list[KF_bpf_rbtree_root] ||
+ btf_id == special_kfunc_list[KF_bpf_rbtree_left] ||
+ btf_id == special_kfunc_list[KF_bpf_rbtree_right];
+}
+
+static bool is_bpf_iter_num_api_kfunc(u32 btf_id)
+{
+ return btf_id == special_kfunc_list[KF_bpf_iter_num_new] ||
+ btf_id == special_kfunc_list[KF_bpf_iter_num_next] ||
+ btf_id == special_kfunc_list[KF_bpf_iter_num_destroy];
+}
+
+static bool is_bpf_graph_api_kfunc(u32 btf_id)
+{
+ return is_bpf_list_api_kfunc(btf_id) || is_bpf_rbtree_api_kfunc(btf_id) ||
+ btf_id == special_kfunc_list[KF_bpf_refcount_acquire_impl];
+}
+
+static bool is_bpf_res_spin_lock_kfunc(u32 btf_id)
+{
+ return btf_id == special_kfunc_list[KF_bpf_res_spin_lock] ||
+ btf_id == special_kfunc_list[KF_bpf_res_spin_unlock] ||
+ btf_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave] ||
+ btf_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore];
+}
+
+static bool kfunc_spin_allowed(u32 btf_id)
+{
+ return is_bpf_graph_api_kfunc(btf_id) || is_bpf_iter_num_api_kfunc(btf_id) ||
+ is_bpf_res_spin_lock_kfunc(btf_id);
+}
+
+static bool is_sync_callback_calling_kfunc(u32 btf_id)
+{
+ return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl];
+}
+
+static bool is_async_callback_calling_kfunc(u32 btf_id)
+{
+ return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl] ||
+ is_task_work_add_kfunc(btf_id);
+}
+
+static bool is_bpf_throw_kfunc(struct bpf_insn *insn)
+{
+ return bpf_pseudo_kfunc_call(insn) && insn->off == 0 &&
+ insn->imm == special_kfunc_list[KF_bpf_throw];
+}
+
+static bool is_bpf_wq_set_callback_impl_kfunc(u32 btf_id)
+{
+ return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl];
+}
+
+static bool is_callback_calling_kfunc(u32 btf_id)
+{
+ return is_sync_callback_calling_kfunc(btf_id) ||
+ is_async_callback_calling_kfunc(btf_id);
+}
+
+static bool is_rbtree_lock_required_kfunc(u32 btf_id)
+{
+ return is_bpf_rbtree_api_kfunc(btf_id);
+}
+
+static bool check_kfunc_is_graph_root_api(struct bpf_verifier_env *env,
+ enum btf_field_type head_field_type,
+ u32 kfunc_btf_id)
+{
+ bool ret;
+
+ switch (head_field_type) {
+ case BPF_LIST_HEAD:
+ ret = is_bpf_list_api_kfunc(kfunc_btf_id);
+ break;
+ case BPF_RB_ROOT:
+ ret = is_bpf_rbtree_api_kfunc(kfunc_btf_id);
+ break;
+ default:
+ verbose(env, "verifier internal error: unexpected graph root argument type %s\n",
+ btf_field_type_name(head_field_type));
+ return false;
+ }
+
+ if (!ret)
+ verbose(env, "verifier internal error: %s head arg for unknown kfunc\n",
+ btf_field_type_name(head_field_type));
+ return ret;
+}
+
+static bool check_kfunc_is_graph_node_api(struct bpf_verifier_env *env,
+ enum btf_field_type node_field_type,
+ u32 kfunc_btf_id)
+{
+ bool ret;
+
+ switch (node_field_type) {
+ case BPF_LIST_NODE:
+ ret = (kfunc_btf_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
+ kfunc_btf_id == special_kfunc_list[KF_bpf_list_push_back_impl]);
+ break;
+ case BPF_RB_NODE:
+ ret = (kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_remove] ||
+ kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl] ||
+ kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_left] ||
+ kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_right]);
+ break;
+ default:
+ verbose(env, "verifier internal error: unexpected graph node argument type %s\n",
+ btf_field_type_name(node_field_type));
+ return false;
+ }
+
+ if (!ret)
+ verbose(env, "verifier internal error: %s node arg for unknown kfunc\n",
+ btf_field_type_name(node_field_type));
+ return ret;
+}
+
+static int
+__process_kf_arg_ptr_to_graph_root(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, u32 regno,
+ struct bpf_kfunc_call_arg_meta *meta,
+ enum btf_field_type head_field_type,
+ struct btf_field **head_field)
+{
+ const char *head_type_name;
+ struct btf_field *field;
+ struct btf_record *rec;
+ u32 head_off;
+
+ if (meta->btf != btf_vmlinux) {
+ verifier_bug(env, "unexpected btf mismatch in kfunc call");
+ return -EFAULT;
+ }
+
+ if (!check_kfunc_is_graph_root_api(env, head_field_type, meta->func_id))
+ return -EFAULT;
+
+ head_type_name = btf_field_type_name(head_field_type);
+ if (!tnum_is_const(reg->var_off)) {
+ verbose(env,
+ "R%d doesn't have constant offset. %s has to be at the constant offset\n",
+ regno, head_type_name);
+ return -EINVAL;
+ }
+
+ rec = reg_btf_record(reg);
+ head_off = reg->off + reg->var_off.value;
+ field = btf_record_find(rec, head_off, head_field_type);
+ if (!field) {
+ verbose(env, "%s not found at offset=%u\n", head_type_name, head_off);
+ return -EINVAL;
+ }
+
+ /* All functions require bpf_list_head to be protected using a bpf_spin_lock */
+ if (check_reg_allocation_locked(env, reg)) {
+ verbose(env, "bpf_spin_lock at off=%d must be held for %s\n",
+ rec->spin_lock_off, head_type_name);
+ return -EINVAL;
+ }
+
+ if (*head_field) {
+ verifier_bug(env, "repeating %s arg", head_type_name);
+ return -EFAULT;
+ }
+ *head_field = field;
+ return 0;
+}
+
+static int process_kf_arg_ptr_to_list_head(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, u32 regno,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ return __process_kf_arg_ptr_to_graph_root(env, reg, regno, meta, BPF_LIST_HEAD,
+ &meta->arg_list_head.field);
+}
+
+static int process_kf_arg_ptr_to_rbtree_root(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, u32 regno,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ return __process_kf_arg_ptr_to_graph_root(env, reg, regno, meta, BPF_RB_ROOT,
+ &meta->arg_rbtree_root.field);
+}
+
+static int
+__process_kf_arg_ptr_to_graph_node(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, u32 regno,
+ struct bpf_kfunc_call_arg_meta *meta,
+ enum btf_field_type head_field_type,
+ enum btf_field_type node_field_type,
+ struct btf_field **node_field)
+{
+ const char *node_type_name;
+ const struct btf_type *et, *t;
+ struct btf_field *field;
+ u32 node_off;
+
+ if (meta->btf != btf_vmlinux) {
+ verifier_bug(env, "unexpected btf mismatch in kfunc call");
+ return -EFAULT;
+ }
+
+ if (!check_kfunc_is_graph_node_api(env, node_field_type, meta->func_id))
+ return -EFAULT;
+
+ node_type_name = btf_field_type_name(node_field_type);
+ if (!tnum_is_const(reg->var_off)) {
+ verbose(env,
+ "R%d doesn't have constant offset. %s has to be at the constant offset\n",
+ regno, node_type_name);
+ return -EINVAL;
+ }
+
+ node_off = reg->off + reg->var_off.value;
+ field = reg_find_field_offset(reg, node_off, node_field_type);
+ if (!field) {
+ verbose(env, "%s not found at offset=%u\n", node_type_name, node_off);
+ return -EINVAL;
+ }
+
+ field = *node_field;
+
+ et = btf_type_by_id(field->graph_root.btf, field->graph_root.value_btf_id);
+ t = btf_type_by_id(reg->btf, reg->btf_id);
+ if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, 0, field->graph_root.btf,
+ field->graph_root.value_btf_id, true)) {
+ verbose(env, "operation on %s expects arg#1 %s at offset=%d "
+ "in struct %s, but arg is at offset=%d in struct %s\n",
+ btf_field_type_name(head_field_type),
+ btf_field_type_name(node_field_type),
+ field->graph_root.node_offset,
+ btf_name_by_offset(field->graph_root.btf, et->name_off),
+ node_off, btf_name_by_offset(reg->btf, t->name_off));
+ return -EINVAL;
+ }
+ meta->arg_btf = reg->btf;
+ meta->arg_btf_id = reg->btf_id;
+
+ if (node_off != field->graph_root.node_offset) {
+ verbose(env, "arg#1 offset=%d, but expected %s at offset=%d in struct %s\n",
+ node_off, btf_field_type_name(node_field_type),
+ field->graph_root.node_offset,
+ btf_name_by_offset(field->graph_root.btf, et->name_off));
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int process_kf_arg_ptr_to_list_node(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, u32 regno,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ return __process_kf_arg_ptr_to_graph_node(env, reg, regno, meta,
+ BPF_LIST_HEAD, BPF_LIST_NODE,
+ &meta->arg_list_head.field);
+}
+
+static int process_kf_arg_ptr_to_rbtree_node(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, u32 regno,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ return __process_kf_arg_ptr_to_graph_node(env, reg, regno, meta,
+ BPF_RB_ROOT, BPF_RB_NODE,
+ &meta->arg_rbtree_root.field);
+}
+
+/*
+ * css_task iter allowlist is needed to avoid dead locking on css_set_lock.
+ * LSM hooks and iters (both sleepable and non-sleepable) are safe.
+ * Any sleepable progs are also safe since bpf_check_attach_target() enforce
+ * them can only be attached to some specific hook points.
+ */
+static bool check_css_task_iter_allowlist(struct bpf_verifier_env *env)
+{
+ enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
+
+ switch (prog_type) {
+ case BPF_PROG_TYPE_LSM:
+ return true;
+ case BPF_PROG_TYPE_TRACING:
+ if (env->prog->expected_attach_type == BPF_TRACE_ITER)
+ return true;
+ fallthrough;
+ default:
+ return in_sleepable(env);
+ }
+}
+
+static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta,
+ int insn_idx)
+{
+ const char *func_name = meta->func_name, *ref_tname;
+ const struct btf *btf = meta->btf;
+ const struct btf_param *args;
+ struct btf_record *rec;
+ u32 i, nargs;
+ int ret;
+
+ args = (const struct btf_param *)(meta->func_proto + 1);
+ nargs = btf_type_vlen(meta->func_proto);
+ if (nargs > MAX_BPF_FUNC_REG_ARGS) {
+ verbose(env, "Function %s has %d > %d args\n", func_name, nargs,
+ MAX_BPF_FUNC_REG_ARGS);
+ return -EINVAL;
+ }
+
+ /* Check that BTF function arguments match actual types that the
+ * verifier sees.
+ */
+ for (i = 0; i < nargs; i++) {
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[i + 1];
+ const struct btf_type *t, *ref_t, *resolve_ret;
+ enum bpf_arg_type arg_type = ARG_DONTCARE;
+ u32 regno = i + 1, ref_id, type_size;
+ bool is_ret_buf_sz = false;
+ int kf_arg_type;
+
+ t = btf_type_skip_modifiers(btf, args[i].type, NULL);
+
+ if (is_kfunc_arg_ignore(btf, &args[i]))
+ continue;
+
+ if (is_kfunc_arg_prog(btf, &args[i])) {
+ /* Used to reject repeated use of __prog. */
+ if (meta->arg_prog) {
+ verifier_bug(env, "Only 1 prog->aux argument supported per-kfunc");
+ return -EFAULT;
+ }
+ meta->arg_prog = true;
+ cur_aux(env)->arg_prog = regno;
+ continue;
+ }
+
+ if (btf_type_is_scalar(t)) {
+ if (reg->type != SCALAR_VALUE) {
+ verbose(env, "R%d is not a scalar\n", regno);
+ return -EINVAL;
+ }
+
+ if (is_kfunc_arg_constant(meta->btf, &args[i])) {
+ if (meta->arg_constant.found) {
+ verifier_bug(env, "only one constant argument permitted");
+ return -EFAULT;
+ }
+ if (!tnum_is_const(reg->var_off)) {
+ verbose(env, "R%d must be a known constant\n", regno);
+ return -EINVAL;
+ }
+ ret = mark_chain_precision(env, regno);
+ if (ret < 0)
+ return ret;
+ meta->arg_constant.found = true;
+ meta->arg_constant.value = reg->var_off.value;
+ } else if (is_kfunc_arg_scalar_with_name(btf, &args[i], "rdonly_buf_size")) {
+ meta->r0_rdonly = true;
+ is_ret_buf_sz = true;
+ } else if (is_kfunc_arg_scalar_with_name(btf, &args[i], "rdwr_buf_size")) {
+ is_ret_buf_sz = true;
+ }
+
+ if (is_ret_buf_sz) {
+ if (meta->r0_size) {
+ verbose(env, "2 or more rdonly/rdwr_buf_size parameters for kfunc");
+ return -EINVAL;
+ }
+
+ if (!tnum_is_const(reg->var_off)) {
+ verbose(env, "R%d is not a const\n", regno);
+ return -EINVAL;
+ }
+
+ meta->r0_size = reg->var_off.value;
+ ret = mark_chain_precision(env, regno);
+ if (ret)
+ return ret;
+ }
+ continue;
+ }
+
+ if (!btf_type_is_ptr(t)) {
+ verbose(env, "Unrecognized arg#%d type %s\n", i, btf_type_str(t));
+ return -EINVAL;
+ }
+
+ if ((is_kfunc_trusted_args(meta) || is_kfunc_rcu(meta)) &&
+ (register_is_null(reg) || type_may_be_null(reg->type)) &&
+ !is_kfunc_arg_nullable(meta->btf, &args[i])) {
+ verbose(env, "Possibly NULL pointer passed to trusted arg%d\n", i);
+ return -EACCES;
+ }
+
+ if (reg->ref_obj_id) {
+ if (is_kfunc_release(meta) && meta->ref_obj_id) {
+ verifier_bug(env, "more than one arg with ref_obj_id R%d %u %u",
+ regno, reg->ref_obj_id,
+ meta->ref_obj_id);
+ return -EFAULT;
+ }
+ meta->ref_obj_id = reg->ref_obj_id;
+ if (is_kfunc_release(meta))
+ meta->release_regno = regno;
+ }
+
+ ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id);
+ ref_tname = btf_name_by_offset(btf, ref_t->name_off);
+
+ kf_arg_type = get_kfunc_ptr_arg_type(env, meta, t, ref_t, ref_tname, args, i, nargs);
+ if (kf_arg_type < 0)
+ return kf_arg_type;
+
+ switch (kf_arg_type) {
+ case KF_ARG_PTR_TO_NULL:
+ continue;
+ case KF_ARG_PTR_TO_MAP:
+ if (!reg->map_ptr) {
+ verbose(env, "pointer in R%d isn't map pointer\n", regno);
+ return -EINVAL;
+ }
+ if (meta->map.ptr && (reg->map_ptr->record->wq_off >= 0 ||
+ reg->map_ptr->record->task_work_off >= 0)) {
+ /* Use map_uid (which is unique id of inner map) to reject:
+ * inner_map1 = bpf_map_lookup_elem(outer_map, key1)
+ * inner_map2 = bpf_map_lookup_elem(outer_map, key2)
+ * if (inner_map1 && inner_map2) {
+ * wq = bpf_map_lookup_elem(inner_map1);
+ * if (wq)
+ * // mismatch would have been allowed
+ * bpf_wq_init(wq, inner_map2);
+ * }
+ *
+ * Comparing map_ptr is enough to distinguish normal and outer maps.
+ */
+ if (meta->map.ptr != reg->map_ptr ||
+ meta->map.uid != reg->map_uid) {
+ if (reg->map_ptr->record->task_work_off >= 0) {
+ verbose(env,
+ "bpf_task_work pointer in R2 map_uid=%d doesn't match map pointer in R3 map_uid=%d\n",
+ meta->map.uid, reg->map_uid);
+ return -EINVAL;
+ }
+ verbose(env,
+ "workqueue pointer in R1 map_uid=%d doesn't match map pointer in R2 map_uid=%d\n",
+ meta->map.uid, reg->map_uid);
+ return -EINVAL;
+ }
+ }
+ meta->map.ptr = reg->map_ptr;
+ meta->map.uid = reg->map_uid;
+ fallthrough;
+ case KF_ARG_PTR_TO_ALLOC_BTF_ID:
+ case KF_ARG_PTR_TO_BTF_ID:
+ if (!is_kfunc_trusted_args(meta) && !is_kfunc_rcu(meta))
+ break;
+
+ if (!is_trusted_reg(reg)) {
+ if (!is_kfunc_rcu(meta)) {
+ verbose(env, "R%d must be referenced or trusted\n", regno);
+ return -EINVAL;
+ }
+ if (!is_rcu_reg(reg)) {
+ verbose(env, "R%d must be a rcu pointer\n", regno);
+ return -EINVAL;
+ }
+ }
+ fallthrough;
+ case KF_ARG_PTR_TO_CTX:
+ case KF_ARG_PTR_TO_DYNPTR:
+ case KF_ARG_PTR_TO_ITER:
+ case KF_ARG_PTR_TO_LIST_HEAD:
+ case KF_ARG_PTR_TO_LIST_NODE:
+ case KF_ARG_PTR_TO_RB_ROOT:
+ case KF_ARG_PTR_TO_RB_NODE:
+ case KF_ARG_PTR_TO_MEM:
+ case KF_ARG_PTR_TO_MEM_SIZE:
+ case KF_ARG_PTR_TO_CALLBACK:
+ case KF_ARG_PTR_TO_REFCOUNTED_KPTR:
+ case KF_ARG_PTR_TO_CONST_STR:
+ case KF_ARG_PTR_TO_WORKQUEUE:
+ case KF_ARG_PTR_TO_TASK_WORK:
+ case KF_ARG_PTR_TO_IRQ_FLAG:
+ case KF_ARG_PTR_TO_RES_SPIN_LOCK:
+ break;
+ default:
+ verifier_bug(env, "unknown kfunc arg type %d", kf_arg_type);
+ return -EFAULT;
+ }
+
+ if (is_kfunc_release(meta) && reg->ref_obj_id)
+ arg_type |= OBJ_RELEASE;
+ ret = check_func_arg_reg_off(env, reg, regno, arg_type);
+ if (ret < 0)
+ return ret;
+
+ switch (kf_arg_type) {
+ case KF_ARG_PTR_TO_CTX:
+ if (reg->type != PTR_TO_CTX) {
+ verbose(env, "arg#%d expected pointer to ctx, but got %s\n",
+ i, reg_type_str(env, reg->type));
+ return -EINVAL;
+ }
+
+ if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) {
+ ret = get_kern_ctx_btf_id(&env->log, resolve_prog_type(env->prog));
+ if (ret < 0)
+ return -EINVAL;
+ meta->ret_btf_id = ret;
+ }
+ break;
+ case KF_ARG_PTR_TO_ALLOC_BTF_ID:
+ if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC)) {
+ if (meta->func_id != special_kfunc_list[KF_bpf_obj_drop_impl]) {
+ verbose(env, "arg#%d expected for bpf_obj_drop_impl()\n", i);
+ return -EINVAL;
+ }
+ } else if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC | MEM_PERCPU)) {
+ if (meta->func_id != special_kfunc_list[KF_bpf_percpu_obj_drop_impl]) {
+ verbose(env, "arg#%d expected for bpf_percpu_obj_drop_impl()\n", i);
+ return -EINVAL;
+ }
+ } else {
+ verbose(env, "arg#%d expected pointer to allocated object\n", i);
+ return -EINVAL;
+ }
+ if (!reg->ref_obj_id) {
+ verbose(env, "allocated object must be referenced\n");
+ return -EINVAL;
+ }
+ if (meta->btf == btf_vmlinux) {
+ meta->arg_btf = reg->btf;
+ meta->arg_btf_id = reg->btf_id;
+ }
+ break;
+ case KF_ARG_PTR_TO_DYNPTR:
+ {
+ enum bpf_arg_type dynptr_arg_type = ARG_PTR_TO_DYNPTR;
+ int clone_ref_obj_id = 0;
+
+ if (reg->type == CONST_PTR_TO_DYNPTR)
+ dynptr_arg_type |= MEM_RDONLY;
+
+ if (is_kfunc_arg_uninit(btf, &args[i]))
+ dynptr_arg_type |= MEM_UNINIT;
+
+ if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) {
+ dynptr_arg_type |= DYNPTR_TYPE_SKB;
+ } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_xdp]) {
+ dynptr_arg_type |= DYNPTR_TYPE_XDP;
+ } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb_meta]) {
+ dynptr_arg_type |= DYNPTR_TYPE_SKB_META;
+ } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_file]) {
+ dynptr_arg_type |= DYNPTR_TYPE_FILE;
+ } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_file_discard]) {
+ dynptr_arg_type |= DYNPTR_TYPE_FILE;
+ meta->release_regno = regno;
+ } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_clone] &&
+ (dynptr_arg_type & MEM_UNINIT)) {
+ enum bpf_dynptr_type parent_type = meta->initialized_dynptr.type;
+
+ if (parent_type == BPF_DYNPTR_TYPE_INVALID) {
+ verifier_bug(env, "no dynptr type for parent of clone");
+ return -EFAULT;
+ }
+
+ dynptr_arg_type |= (unsigned int)get_dynptr_type_flag(parent_type);
+ clone_ref_obj_id = meta->initialized_dynptr.ref_obj_id;
+ if (dynptr_type_refcounted(parent_type) && !clone_ref_obj_id) {
+ verifier_bug(env, "missing ref obj id for parent of clone");
+ return -EFAULT;
+ }
+ }
+
+ ret = process_dynptr_func(env, regno, insn_idx, dynptr_arg_type, clone_ref_obj_id);
+ if (ret < 0)
+ return ret;
+
+ if (!(dynptr_arg_type & MEM_UNINIT)) {
+ int id = dynptr_id(env, reg);
+
+ if (id < 0) {
+ verifier_bug(env, "failed to obtain dynptr id");
+ return id;
+ }
+ meta->initialized_dynptr.id = id;
+ meta->initialized_dynptr.type = dynptr_get_type(env, reg);
+ meta->initialized_dynptr.ref_obj_id = dynptr_ref_obj_id(env, reg);
+ }
+
+ break;
+ }
+ case KF_ARG_PTR_TO_ITER:
+ if (meta->func_id == special_kfunc_list[KF_bpf_iter_css_task_new]) {
+ if (!check_css_task_iter_allowlist(env)) {
+ verbose(env, "css_task_iter is only allowed in bpf_lsm, bpf_iter and sleepable progs\n");
+ return -EINVAL;
+ }
+ }
+ ret = process_iter_arg(env, regno, insn_idx, meta);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_LIST_HEAD:
+ if (reg->type != PTR_TO_MAP_VALUE &&
+ reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
+ verbose(env, "arg#%d expected pointer to map value or allocated object\n", i);
+ return -EINVAL;
+ }
+ if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC) && !reg->ref_obj_id) {
+ verbose(env, "allocated object must be referenced\n");
+ return -EINVAL;
+ }
+ ret = process_kf_arg_ptr_to_list_head(env, reg, regno, meta);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_RB_ROOT:
+ if (reg->type != PTR_TO_MAP_VALUE &&
+ reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
+ verbose(env, "arg#%d expected pointer to map value or allocated object\n", i);
+ return -EINVAL;
+ }
+ if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC) && !reg->ref_obj_id) {
+ verbose(env, "allocated object must be referenced\n");
+ return -EINVAL;
+ }
+ ret = process_kf_arg_ptr_to_rbtree_root(env, reg, regno, meta);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_LIST_NODE:
+ if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
+ verbose(env, "arg#%d expected pointer to allocated object\n", i);
+ return -EINVAL;
+ }
+ if (!reg->ref_obj_id) {
+ verbose(env, "allocated object must be referenced\n");
+ return -EINVAL;
+ }
+ ret = process_kf_arg_ptr_to_list_node(env, reg, regno, meta);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_RB_NODE:
+ if (meta->func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
+ if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
+ verbose(env, "arg#%d expected pointer to allocated object\n", i);
+ return -EINVAL;
+ }
+ if (!reg->ref_obj_id) {
+ verbose(env, "allocated object must be referenced\n");
+ return -EINVAL;
+ }
+ } else {
+ if (!type_is_non_owning_ref(reg->type) && !reg->ref_obj_id) {
+ verbose(env, "%s can only take non-owning or refcounted bpf_rb_node pointer\n", func_name);
+ return -EINVAL;
+ }
+ if (in_rbtree_lock_required_cb(env)) {
+ verbose(env, "%s not allowed in rbtree cb\n", func_name);
+ return -EINVAL;
+ }
+ }
+
+ ret = process_kf_arg_ptr_to_rbtree_node(env, reg, regno, meta);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_MAP:
+ /* If argument has '__map' suffix expect 'struct bpf_map *' */
+ ref_id = *reg2btf_ids[CONST_PTR_TO_MAP];
+ ref_t = btf_type_by_id(btf_vmlinux, ref_id);
+ ref_tname = btf_name_by_offset(btf, ref_t->name_off);
+ fallthrough;
+ case KF_ARG_PTR_TO_BTF_ID:
+ /* Only base_type is checked, further checks are done here */
+ if ((base_type(reg->type) != PTR_TO_BTF_ID ||
+ (bpf_type_has_unsafe_modifiers(reg->type) && !is_rcu_reg(reg))) &&
+ !reg2btf_ids[base_type(reg->type)]) {
+ verbose(env, "arg#%d is %s ", i, reg_type_str(env, reg->type));
+ verbose(env, "expected %s or socket\n",
+ reg_type_str(env, base_type(reg->type) |
+ (type_flag(reg->type) & BPF_REG_TRUSTED_MODIFIERS)));
+ return -EINVAL;
+ }
+ ret = process_kf_arg_ptr_to_btf_id(env, reg, ref_t, ref_tname, ref_id, meta, i);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_MEM:
+ resolve_ret = btf_resolve_size(btf, ref_t, &type_size);
+ if (IS_ERR(resolve_ret)) {
+ verbose(env, "arg#%d reference type('%s %s') size cannot be determined: %ld\n",
+ i, btf_type_str(ref_t), ref_tname, PTR_ERR(resolve_ret));
+ return -EINVAL;
+ }
+ ret = check_mem_reg(env, reg, regno, type_size);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_MEM_SIZE:
+ {
+ struct bpf_reg_state *buff_reg = &regs[regno];
+ const struct btf_param *buff_arg = &args[i];
+ struct bpf_reg_state *size_reg = &regs[regno + 1];
+ const struct btf_param *size_arg = &args[i + 1];
+
+ if (!register_is_null(buff_reg) || !is_kfunc_arg_optional(meta->btf, buff_arg)) {
+ ret = check_kfunc_mem_size_reg(env, size_reg, regno + 1);
+ if (ret < 0) {
+ verbose(env, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", i, i + 1);
+ return ret;
+ }
+ }
+
+ if (is_kfunc_arg_const_mem_size(meta->btf, size_arg, size_reg)) {
+ if (meta->arg_constant.found) {
+ verifier_bug(env, "only one constant argument permitted");
+ return -EFAULT;
+ }
+ if (!tnum_is_const(size_reg->var_off)) {
+ verbose(env, "R%d must be a known constant\n", regno + 1);
+ return -EINVAL;
+ }
+ meta->arg_constant.found = true;
+ meta->arg_constant.value = size_reg->var_off.value;
+ }
+
+ /* Skip next '__sz' or '__szk' argument */
+ i++;
+ break;
+ }
+ case KF_ARG_PTR_TO_CALLBACK:
+ if (reg->type != PTR_TO_FUNC) {
+ verbose(env, "arg%d expected pointer to func\n", i);
+ return -EINVAL;
+ }
+ meta->subprogno = reg->subprogno;
+ break;
+ case KF_ARG_PTR_TO_REFCOUNTED_KPTR:
+ if (!type_is_ptr_alloc_obj(reg->type)) {
+ verbose(env, "arg#%d is neither owning or non-owning ref\n", i);
+ return -EINVAL;
+ }
+ if (!type_is_non_owning_ref(reg->type))
+ meta->arg_owning_ref = true;
+
+ rec = reg_btf_record(reg);
+ if (!rec) {
+ verifier_bug(env, "Couldn't find btf_record");
+ return -EFAULT;
+ }
+
+ if (rec->refcount_off < 0) {
+ verbose(env, "arg#%d doesn't point to a type with bpf_refcount field\n", i);
+ return -EINVAL;
+ }
+
+ meta->arg_btf = reg->btf;
+ meta->arg_btf_id = reg->btf_id;
+ break;
+ case KF_ARG_PTR_TO_CONST_STR:
+ if (reg->type != PTR_TO_MAP_VALUE) {
+ verbose(env, "arg#%d doesn't point to a const string\n", i);
+ return -EINVAL;
+ }
+ ret = check_reg_const_str(env, reg, regno);
+ if (ret)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_WORKQUEUE:
+ if (reg->type != PTR_TO_MAP_VALUE) {
+ verbose(env, "arg#%d doesn't point to a map value\n", i);
+ return -EINVAL;
+ }
+ ret = process_wq_func(env, regno, meta);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_TASK_WORK:
+ if (reg->type != PTR_TO_MAP_VALUE) {
+ verbose(env, "arg#%d doesn't point to a map value\n", i);
+ return -EINVAL;
+ }
+ ret = process_task_work_func(env, regno, meta);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_IRQ_FLAG:
+ if (reg->type != PTR_TO_STACK) {
+ verbose(env, "arg#%d doesn't point to an irq flag on stack\n", i);
+ return -EINVAL;
+ }
+ ret = process_irq_flag(env, regno, meta);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_RES_SPIN_LOCK:
+ {
+ int flags = PROCESS_RES_LOCK;
+
+ if (reg->type != PTR_TO_MAP_VALUE && reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
+ verbose(env, "arg#%d doesn't point to map value or allocated object\n", i);
+ return -EINVAL;
+ }
+
+ if (!is_bpf_res_spin_lock_kfunc(meta->func_id))
+ return -EFAULT;
+ if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock] ||
+ meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave])
+ flags |= PROCESS_SPIN_LOCK;
+ if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave] ||
+ meta->func_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore])
+ flags |= PROCESS_LOCK_IRQ;
+ ret = process_spin_lock(env, regno, flags);
+ if (ret < 0)
+ return ret;
+ break;
+ }
+ }
+ }
+
+ if (is_kfunc_release(meta) && !meta->release_regno) {
+ verbose(env, "release kernel function %s expects refcounted PTR_TO_BTF_ID\n",
+ func_name);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int fetch_kfunc_meta(struct bpf_verifier_env *env,
+ struct bpf_insn *insn,
+ struct bpf_kfunc_call_arg_meta *meta,
+ const char **kfunc_name)
+{
+ const struct btf_type *func, *func_proto;
+ u32 func_id, *kfunc_flags;
+ const char *func_name;
+ struct btf *desc_btf;
+
+ if (kfunc_name)
+ *kfunc_name = NULL;
+
+ if (!insn->imm)
+ return -EINVAL;
+
+ desc_btf = find_kfunc_desc_btf(env, insn->off);
+ if (IS_ERR(desc_btf))
+ return PTR_ERR(desc_btf);
+
+ func_id = insn->imm;
+ func = btf_type_by_id(desc_btf, func_id);
+ func_name = btf_name_by_offset(desc_btf, func->name_off);
+ if (kfunc_name)
+ *kfunc_name = func_name;
+ func_proto = btf_type_by_id(desc_btf, func->type);
+
+ kfunc_flags = btf_kfunc_id_set_contains(desc_btf, func_id, env->prog);
+ if (!kfunc_flags) {
+ return -EACCES;
+ }
+
+ memset(meta, 0, sizeof(*meta));
+ meta->btf = desc_btf;
+ meta->func_id = func_id;
+ meta->kfunc_flags = *kfunc_flags;
+ meta->func_proto = func_proto;
+ meta->func_name = func_name;
+
+ return 0;
+}
+
+/* check special kfuncs and return:
+ * 1 - not fall-through to 'else' branch, continue verification
+ * 0 - fall-through to 'else' branch
+ * < 0 - not fall-through to 'else' branch, return error
+ */
+static int check_special_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta,
+ struct bpf_reg_state *regs, struct bpf_insn_aux_data *insn_aux,
+ const struct btf_type *ptr_type, struct btf *desc_btf)
+{
+ const struct btf_type *ret_t;
+ int err = 0;
+
+ if (meta->btf != btf_vmlinux)
+ return 0;
+
+ if (meta->func_id == special_kfunc_list[KF_bpf_obj_new_impl] ||
+ meta->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
+ struct btf_struct_meta *struct_meta;
+ struct btf *ret_btf;
+ u32 ret_btf_id;
+
+ if (meta->func_id == special_kfunc_list[KF_bpf_obj_new_impl] && !bpf_global_ma_set)
+ return -ENOMEM;
+
+ if (((u64)(u32)meta->arg_constant.value) != meta->arg_constant.value) {
+ verbose(env, "local type ID argument must be in range [0, U32_MAX]\n");
+ return -EINVAL;
+ }
+
+ ret_btf = env->prog->aux->btf;
+ ret_btf_id = meta->arg_constant.value;
+
+ /* This may be NULL due to user not supplying a BTF */
+ if (!ret_btf) {
+ verbose(env, "bpf_obj_new/bpf_percpu_obj_new requires prog BTF\n");
+ return -EINVAL;
+ }
+
+ ret_t = btf_type_by_id(ret_btf, ret_btf_id);
+ if (!ret_t || !__btf_type_is_struct(ret_t)) {
+ verbose(env, "bpf_obj_new/bpf_percpu_obj_new type ID argument must be of a struct\n");
+ return -EINVAL;
+ }
+
+ if (meta->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
+ if (ret_t->size > BPF_GLOBAL_PERCPU_MA_MAX_SIZE) {
+ verbose(env, "bpf_percpu_obj_new type size (%d) is greater than %d\n",
+ ret_t->size, BPF_GLOBAL_PERCPU_MA_MAX_SIZE);
+ return -EINVAL;
+ }
+
+ if (!bpf_global_percpu_ma_set) {
+ mutex_lock(&bpf_percpu_ma_lock);
+ if (!bpf_global_percpu_ma_set) {
+ /* Charge memory allocated with bpf_global_percpu_ma to
+ * root memcg. The obj_cgroup for root memcg is NULL.
+ */
+ err = bpf_mem_alloc_percpu_init(&bpf_global_percpu_ma, NULL);
+ if (!err)
+ bpf_global_percpu_ma_set = true;
+ }
+ mutex_unlock(&bpf_percpu_ma_lock);
+ if (err)
+ return err;
+ }
+
+ mutex_lock(&bpf_percpu_ma_lock);
+ err = bpf_mem_alloc_percpu_unit_init(&bpf_global_percpu_ma, ret_t->size);
+ mutex_unlock(&bpf_percpu_ma_lock);
+ if (err)
+ return err;
+ }
+
+ struct_meta = btf_find_struct_meta(ret_btf, ret_btf_id);
+ if (meta->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
+ if (!__btf_type_is_scalar_struct(env, ret_btf, ret_t, 0)) {
+ verbose(env, "bpf_percpu_obj_new type ID argument must be of a struct of scalars\n");
+ return -EINVAL;
+ }
+
+ if (struct_meta) {
+ verbose(env, "bpf_percpu_obj_new type ID argument must not contain special fields\n");
+ return -EINVAL;
+ }
+ }
+
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC;
+ regs[BPF_REG_0].btf = ret_btf;
+ regs[BPF_REG_0].btf_id = ret_btf_id;
+ if (meta->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl])
+ regs[BPF_REG_0].type |= MEM_PERCPU;
+
+ insn_aux->obj_new_size = ret_t->size;
+ insn_aux->kptr_struct_meta = struct_meta;
+ } else if (meta->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) {
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC;
+ regs[BPF_REG_0].btf = meta->arg_btf;
+ regs[BPF_REG_0].btf_id = meta->arg_btf_id;
+
+ insn_aux->kptr_struct_meta =
+ btf_find_struct_meta(meta->arg_btf,
+ meta->arg_btf_id);
+ } else if (is_list_node_type(ptr_type)) {
+ struct btf_field *field = meta->arg_list_head.field;
+
+ mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root);
+ } else if (is_rbtree_node_type(ptr_type)) {
+ struct btf_field *field = meta->arg_rbtree_root.field;
+
+ mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root);
+ } else if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) {
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_TRUSTED;
+ regs[BPF_REG_0].btf = desc_btf;
+ regs[BPF_REG_0].btf_id = meta->ret_btf_id;
+ } else if (meta->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
+ ret_t = btf_type_by_id(desc_btf, meta->arg_constant.value);
+ if (!ret_t) {
+ verbose(env, "Unknown type ID %lld passed to kfunc bpf_rdonly_cast\n",
+ meta->arg_constant.value);
+ return -EINVAL;
+ } else if (btf_type_is_struct(ret_t)) {
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_UNTRUSTED;
+ regs[BPF_REG_0].btf = desc_btf;
+ regs[BPF_REG_0].btf_id = meta->arg_constant.value;
+ } else if (btf_type_is_void(ret_t)) {
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_MEM | MEM_RDONLY | PTR_UNTRUSTED;
+ regs[BPF_REG_0].mem_size = 0;
+ } else {
+ verbose(env,
+ "kfunc bpf_rdonly_cast type ID argument must be of a struct or void\n");
+ return -EINVAL;
+ }
+ } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_slice] ||
+ meta->func_id == special_kfunc_list[KF_bpf_dynptr_slice_rdwr]) {
+ enum bpf_type_flag type_flag = get_dynptr_type_flag(meta->initialized_dynptr.type);
+
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+
+ if (!meta->arg_constant.found) {
+ verifier_bug(env, "bpf_dynptr_slice(_rdwr) no constant size");
+ return -EFAULT;
+ }
+
+ regs[BPF_REG_0].mem_size = meta->arg_constant.value;
+
+ /* PTR_MAYBE_NULL will be added when is_kfunc_ret_null is checked */
+ regs[BPF_REG_0].type = PTR_TO_MEM | type_flag;
+
+ if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_slice]) {
+ regs[BPF_REG_0].type |= MEM_RDONLY;
+ } else {
+ /* this will set env->seen_direct_write to true */
+ if (!may_access_direct_pkt_data(env, NULL, BPF_WRITE)) {
+ verbose(env, "the prog does not allow writes to packet data\n");
+ return -EINVAL;
+ }
+ }
+
+ if (!meta->initialized_dynptr.id) {
+ verifier_bug(env, "no dynptr id");
+ return -EFAULT;
+ }
+ regs[BPF_REG_0].dynptr_id = meta->initialized_dynptr.id;
+
+ /* we don't need to set BPF_REG_0's ref obj id
+ * because packet slices are not refcounted (see
+ * dynptr_type_refcounted)
+ */
+ } else {
+ return 0;
+ }
+
+ return 1;
+}
+
+static int check_return_code(struct bpf_verifier_env *env, int regno, const char *reg_name);
+
+static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ int *insn_idx_p)
+{
+ bool sleepable, rcu_lock, rcu_unlock, preempt_disable, preempt_enable;
+ u32 i, nargs, ptr_type_id, release_ref_obj_id;
+ struct bpf_reg_state *regs = cur_regs(env);
+ const char *func_name, *ptr_type_name;
+ const struct btf_type *t, *ptr_type;
+ struct bpf_kfunc_call_arg_meta meta;
+ struct bpf_insn_aux_data *insn_aux;
+ int err, insn_idx = *insn_idx_p;
+ const struct btf_param *args;
+ struct btf *desc_btf;
+
+ /* skip for now, but return error when we find this in fixup_kfunc_call */
+ if (!insn->imm)
+ return 0;
+
+ err = fetch_kfunc_meta(env, insn, &meta, &func_name);
+ if (err == -EACCES && func_name)
+ verbose(env, "calling kernel function %s is not allowed\n", func_name);
+ if (err)
+ return err;
+ desc_btf = meta.btf;
+ insn_aux = &env->insn_aux_data[insn_idx];
+
+ insn_aux->is_iter_next = is_iter_next_kfunc(&meta);
+
+ if (!insn->off &&
+ (insn->imm == special_kfunc_list[KF_bpf_res_spin_lock] ||
+ insn->imm == special_kfunc_list[KF_bpf_res_spin_lock_irqsave])) {
+ struct bpf_verifier_state *branch;
+ struct bpf_reg_state *regs;
+
+ branch = push_stack(env, env->insn_idx + 1, env->insn_idx, false);
+ if (IS_ERR(branch)) {
+ verbose(env, "failed to push state for failed lock acquisition\n");
+ return PTR_ERR(branch);
+ }
+
+ regs = branch->frame[branch->curframe]->regs;
+
+ /* Clear r0-r5 registers in forked state */
+ for (i = 0; i < CALLER_SAVED_REGS; i++)
+ mark_reg_not_init(env, regs, caller_saved[i]);
+
+ mark_reg_unknown(env, regs, BPF_REG_0);
+ err = __mark_reg_s32_range(env, regs, BPF_REG_0, -MAX_ERRNO, -1);
+ if (err) {
+ verbose(env, "failed to mark s32 range for retval in forked state for lock\n");
+ return err;
+ }
+ __mark_btf_func_reg_size(env, regs, BPF_REG_0, sizeof(u32));
+ } else if (!insn->off && insn->imm == special_kfunc_list[KF___bpf_trap]) {
+ verbose(env, "unexpected __bpf_trap() due to uninitialized variable?\n");
+ return -EFAULT;
+ }
+
+ if (is_kfunc_destructive(&meta) && !capable(CAP_SYS_BOOT)) {
+ verbose(env, "destructive kfunc calls require CAP_SYS_BOOT capability\n");
+ return -EACCES;
+ }
+
+ sleepable = is_kfunc_sleepable(&meta);
+ if (sleepable && !in_sleepable(env)) {
+ verbose(env, "program must be sleepable to call sleepable kfunc %s\n", func_name);
+ return -EACCES;
+ }
+
+ /* Track non-sleepable context for kfuncs, same as for helpers. */
+ if (!in_sleepable_context(env))
+ insn_aux->non_sleepable = true;
+
+ /* Check the arguments */
+ err = check_kfunc_args(env, &meta, insn_idx);
+ if (err < 0)
+ return err;
+
+ if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
+ err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+ set_rbtree_add_callback_state);
+ if (err) {
+ verbose(env, "kfunc %s#%d failed callback verification\n",
+ func_name, meta.func_id);
+ return err;
+ }
+ }
+
+ if (meta.func_id == special_kfunc_list[KF_bpf_session_cookie]) {
+ meta.r0_size = sizeof(u64);
+ meta.r0_rdonly = false;
+ }
+
+ if (is_bpf_wq_set_callback_impl_kfunc(meta.func_id)) {
+ err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+ set_timer_callback_state);
+ if (err) {
+ verbose(env, "kfunc %s#%d failed callback verification\n",
+ func_name, meta.func_id);
+ return err;
+ }
+ }
+
+ if (is_task_work_add_kfunc(meta.func_id)) {
+ err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+ set_task_work_schedule_callback_state);
+ if (err) {
+ verbose(env, "kfunc %s#%d failed callback verification\n",
+ func_name, meta.func_id);
+ return err;
+ }
+ }
+
+ rcu_lock = is_kfunc_bpf_rcu_read_lock(&meta);
+ rcu_unlock = is_kfunc_bpf_rcu_read_unlock(&meta);
+
+ preempt_disable = is_kfunc_bpf_preempt_disable(&meta);
+ preempt_enable = is_kfunc_bpf_preempt_enable(&meta);
+
+ if (rcu_lock) {
+ env->cur_state->active_rcu_locks++;
+ } else if (rcu_unlock) {
+ struct bpf_func_state *state;
+ struct bpf_reg_state *reg;
+ u32 clear_mask = (1 << STACK_SPILL) | (1 << STACK_ITER);
+
+ if (env->cur_state->active_rcu_locks == 0) {
+ verbose(env, "unmatched rcu read unlock (kernel function %s)\n", func_name);
+ return -EINVAL;
+ }
+ if (--env->cur_state->active_rcu_locks == 0) {
+ bpf_for_each_reg_in_vstate_mask(env->cur_state, state, reg, clear_mask, ({
+ if (reg->type & MEM_RCU) {
+ reg->type &= ~(MEM_RCU | PTR_MAYBE_NULL);
+ reg->type |= PTR_UNTRUSTED;
+ }
+ }));
+ }
+ } else if (sleepable && env->cur_state->active_rcu_locks) {
+ verbose(env, "kernel func %s is sleepable within rcu_read_lock region\n", func_name);
+ return -EACCES;
+ }
+
+ if (in_rbtree_lock_required_cb(env) && (rcu_lock || rcu_unlock)) {
+ verbose(env, "Calling bpf_rcu_read_{lock,unlock} in unnecessary rbtree callback\n");
+ return -EACCES;
+ }
+
+ if (env->cur_state->active_preempt_locks) {
+ if (preempt_disable) {
+ env->cur_state->active_preempt_locks++;
+ } else if (preempt_enable) {
+ env->cur_state->active_preempt_locks--;
+ } else if (sleepable) {
+ verbose(env, "kernel func %s is sleepable within non-preemptible region\n", func_name);
+ return -EACCES;
+ }
+ } else if (preempt_disable) {
+ env->cur_state->active_preempt_locks++;
+ } else if (preempt_enable) {
+ verbose(env, "unmatched attempt to enable preemption (kernel function %s)\n", func_name);
+ return -EINVAL;
+ }
+
+ if (env->cur_state->active_irq_id && sleepable) {
+ verbose(env, "kernel func %s is sleepable within IRQ-disabled region\n", func_name);
+ return -EACCES;
+ }
+
+ if (is_kfunc_rcu_protected(&meta) && !in_rcu_cs(env)) {
+ verbose(env, "kernel func %s requires RCU critical section protection\n", func_name);
+ return -EACCES;
+ }
+
+ /* In case of release function, we get register number of refcounted
+ * PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now.
+ */
+ if (meta.release_regno) {
+ struct bpf_reg_state *reg = &regs[meta.release_regno];
+
+ if (meta.initialized_dynptr.ref_obj_id) {
+ err = unmark_stack_slots_dynptr(env, reg);
+ } else {
+ err = release_reference(env, reg->ref_obj_id);
+ if (err)
+ verbose(env, "kfunc %s#%d reference has not been acquired before\n",
+ func_name, meta.func_id);
+ }
+ if (err)
+ return err;
+ }
+
+ if (meta.func_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
+ meta.func_id == special_kfunc_list[KF_bpf_list_push_back_impl] ||
+ meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
+ release_ref_obj_id = regs[BPF_REG_2].ref_obj_id;
+ insn_aux->insert_off = regs[BPF_REG_2].off;
+ insn_aux->kptr_struct_meta = btf_find_struct_meta(meta.arg_btf, meta.arg_btf_id);
+ err = ref_convert_owning_non_owning(env, release_ref_obj_id);
+ if (err) {
+ verbose(env, "kfunc %s#%d conversion of owning ref to non-owning failed\n",
+ func_name, meta.func_id);
+ return err;
+ }
+
+ err = release_reference(env, release_ref_obj_id);
+ if (err) {
+ verbose(env, "kfunc %s#%d reference has not been acquired before\n",
+ func_name, meta.func_id);
+ return err;
+ }
+ }
+
+ if (meta.func_id == special_kfunc_list[KF_bpf_throw]) {
+ if (!bpf_jit_supports_exceptions()) {
+ verbose(env, "JIT does not support calling kfunc %s#%d\n",
+ func_name, meta.func_id);
+ return -ENOTSUPP;
+ }
+ env->seen_exception = true;
+
+ /* In the case of the default callback, the cookie value passed
+ * to bpf_throw becomes the return value of the program.
+ */
+ if (!env->exception_callback_subprog) {
+ err = check_return_code(env, BPF_REG_1, "R1");
+ if (err < 0)
+ return err;
+ }
+ }
+
+ for (i = 0; i < CALLER_SAVED_REGS; i++)
+ mark_reg_not_init(env, regs, caller_saved[i]);
+
+ /* Check return type */
+ t = btf_type_skip_modifiers(desc_btf, meta.func_proto->type, NULL);
+
+ if (is_kfunc_acquire(&meta) && !btf_type_is_struct_ptr(meta.btf, t)) {
+ /* Only exception is bpf_obj_new_impl */
+ if (meta.btf != btf_vmlinux ||
+ (meta.func_id != special_kfunc_list[KF_bpf_obj_new_impl] &&
+ meta.func_id != special_kfunc_list[KF_bpf_percpu_obj_new_impl] &&
+ meta.func_id != special_kfunc_list[KF_bpf_refcount_acquire_impl])) {
+ verbose(env, "acquire kernel function does not return PTR_TO_BTF_ID\n");
+ return -EINVAL;
+ }
+ }
+
+ if (btf_type_is_scalar(t)) {
+ mark_reg_unknown(env, regs, BPF_REG_0);
+ if (meta.btf == btf_vmlinux && (meta.func_id == special_kfunc_list[KF_bpf_res_spin_lock] ||
+ meta.func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave]))
+ __mark_reg_const_zero(env, &regs[BPF_REG_0]);
+ mark_btf_func_reg_size(env, BPF_REG_0, t->size);
+ } else if (btf_type_is_ptr(t)) {
+ ptr_type = btf_type_skip_modifiers(desc_btf, t->type, &ptr_type_id);
+ err = check_special_kfunc(env, &meta, regs, insn_aux, ptr_type, desc_btf);
+ if (err) {
+ if (err < 0)
+ return err;
+ } else if (btf_type_is_void(ptr_type)) {
+ /* kfunc returning 'void *' is equivalent to returning scalar */
+ mark_reg_unknown(env, regs, BPF_REG_0);
+ } else if (!__btf_type_is_struct(ptr_type)) {
+ if (!meta.r0_size) {
+ __u32 sz;
+
+ if (!IS_ERR(btf_resolve_size(desc_btf, ptr_type, &sz))) {
+ meta.r0_size = sz;
+ meta.r0_rdonly = true;
+ }
+ }
+ if (!meta.r0_size) {
+ ptr_type_name = btf_name_by_offset(desc_btf,
+ ptr_type->name_off);
+ verbose(env,
+ "kernel function %s returns pointer type %s %s is not supported\n",
+ func_name,
+ btf_type_str(ptr_type),
+ ptr_type_name);
+ return -EINVAL;
+ }
+
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_MEM;
+ regs[BPF_REG_0].mem_size = meta.r0_size;
+
+ if (meta.r0_rdonly)
+ regs[BPF_REG_0].type |= MEM_RDONLY;
+
+ /* Ensures we don't access the memory after a release_reference() */
+ if (meta.ref_obj_id)
+ regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id;
+
+ if (is_kfunc_rcu_protected(&meta))
+ regs[BPF_REG_0].type |= MEM_RCU;
+ } else {
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].btf = desc_btf;
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID;
+ regs[BPF_REG_0].btf_id = ptr_type_id;
+
+ if (meta.func_id == special_kfunc_list[KF_bpf_get_kmem_cache])
+ regs[BPF_REG_0].type |= PTR_UNTRUSTED;
+ else if (is_kfunc_rcu_protected(&meta))
+ regs[BPF_REG_0].type |= MEM_RCU;
+
+ if (is_iter_next_kfunc(&meta)) {
+ struct bpf_reg_state *cur_iter;
+
+ cur_iter = get_iter_from_state(env->cur_state, &meta);
+
+ if (cur_iter->type & MEM_RCU) /* KF_RCU_PROTECTED */
+ regs[BPF_REG_0].type |= MEM_RCU;
+ else
+ regs[BPF_REG_0].type |= PTR_TRUSTED;
+ }
+ }
+
+ if (is_kfunc_ret_null(&meta)) {
+ regs[BPF_REG_0].type |= PTR_MAYBE_NULL;
+ /* For mark_ptr_or_null_reg, see 93c230e3f5bd6 */
+ regs[BPF_REG_0].id = ++env->id_gen;
+ }
+ mark_btf_func_reg_size(env, BPF_REG_0, sizeof(void *));
+ if (is_kfunc_acquire(&meta)) {
+ int id = acquire_reference(env, insn_idx);
+
+ if (id < 0)
+ return id;
+ if (is_kfunc_ret_null(&meta))
+ regs[BPF_REG_0].id = id;
+ regs[BPF_REG_0].ref_obj_id = id;
+ } else if (is_rbtree_node_type(ptr_type) || is_list_node_type(ptr_type)) {
+ ref_set_non_owning(env, &regs[BPF_REG_0]);
+ }
+
+ if (reg_may_point_to_spin_lock(&regs[BPF_REG_0]) && !regs[BPF_REG_0].id)
+ regs[BPF_REG_0].id = ++env->id_gen;
+ } else if (btf_type_is_void(t)) {
+ if (meta.btf == btf_vmlinux) {
+ if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl] ||
+ meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl]) {
+ insn_aux->kptr_struct_meta =
+ btf_find_struct_meta(meta.arg_btf,
+ meta.arg_btf_id);
+ }
+ }
+ }
+
+ if (is_kfunc_pkt_changing(&meta))
+ clear_all_pkt_pointers(env);
+
+ nargs = btf_type_vlen(meta.func_proto);
+ args = (const struct btf_param *)(meta.func_proto + 1);
+ for (i = 0; i < nargs; i++) {
+ u32 regno = i + 1;
+
+ t = btf_type_skip_modifiers(desc_btf, args[i].type, NULL);
+ if (btf_type_is_ptr(t))
+ mark_btf_func_reg_size(env, regno, sizeof(void *));
+ else
+ /* scalar. ensured by btf_check_kfunc_arg_match() */
+ mark_btf_func_reg_size(env, regno, t->size);
+ }
+
+ if (is_iter_next_kfunc(&meta)) {
+ err = process_iter_next_call(env, insn_idx, &meta);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static bool check_reg_sane_offset(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg,
+ enum bpf_reg_type type)
+{
+ bool known = tnum_is_const(reg->var_off);
+ s64 val = reg->var_off.value;
+ s64 smin = reg->smin_value;
+
+ if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) {
+ verbose(env, "math between %s pointer and %lld is not allowed\n",
+ reg_type_str(env, type), val);
+ return false;
+ }
+
+ if (reg->off >= BPF_MAX_VAR_OFF || reg->off <= -BPF_MAX_VAR_OFF) {
+ verbose(env, "%s pointer offset %d is not allowed\n",
+ reg_type_str(env, type), reg->off);
+ return false;
+ }
+
+ if (smin == S64_MIN) {
+ verbose(env, "math between %s pointer and register with unbounded min value is not allowed\n",
+ reg_type_str(env, type));
+ return false;
+ }
+
+ if (smin >= BPF_MAX_VAR_OFF || smin <= -BPF_MAX_VAR_OFF) {
+ verbose(env, "value %lld makes %s pointer be out of bounds\n",
+ smin, reg_type_str(env, type));
+ return false;
+ }
+
+ return true;
+}
+
+enum {
+ REASON_BOUNDS = -1,
+ REASON_TYPE = -2,
+ REASON_PATHS = -3,
+ REASON_LIMIT = -4,
+ REASON_STACK = -5,
+};
+
+static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg,
+ u32 *alu_limit, bool mask_to_left)
+{
+ u32 max = 0, ptr_limit = 0;
+
+ switch (ptr_reg->type) {
+ case PTR_TO_STACK:
+ /* Offset 0 is out-of-bounds, but acceptable start for the
+ * left direction, see BPF_REG_FP. Also, unknown scalar
+ * offset where we would need to deal with min/max bounds is
+ * currently prohibited for unprivileged.
+ */
+ max = MAX_BPF_STACK + mask_to_left;
+ ptr_limit = -(ptr_reg->var_off.value + ptr_reg->off);
+ break;
+ case PTR_TO_MAP_VALUE:
+ max = ptr_reg->map_ptr->value_size;
+ ptr_limit = (mask_to_left ?
+ ptr_reg->smin_value :
+ ptr_reg->umax_value) + ptr_reg->off;
+ break;
+ default:
+ return REASON_TYPE;
+ }
+
+ if (ptr_limit >= max)
+ return REASON_LIMIT;
+ *alu_limit = ptr_limit;
+ return 0;
+}
+
+static bool can_skip_alu_sanitation(const struct bpf_verifier_env *env,
+ const struct bpf_insn *insn)
+{
+ return env->bypass_spec_v1 ||
+ BPF_SRC(insn->code) == BPF_K ||
+ cur_aux(env)->nospec;
+}
+
+static int update_alu_sanitation_state(struct bpf_insn_aux_data *aux,
+ u32 alu_state, u32 alu_limit)
+{
+ /* If we arrived here from different branches with different
+ * state or limits to sanitize, then this won't work.
+ */
+ if (aux->alu_state &&
+ (aux->alu_state != alu_state ||
+ aux->alu_limit != alu_limit))
+ return REASON_PATHS;
+
+ /* Corresponding fixup done in do_misc_fixups(). */
+ aux->alu_state = alu_state;
+ aux->alu_limit = alu_limit;
+ return 0;
+}
+
+static int sanitize_val_alu(struct bpf_verifier_env *env,
+ struct bpf_insn *insn)
+{
+ struct bpf_insn_aux_data *aux = cur_aux(env);
+
+ if (can_skip_alu_sanitation(env, insn))
+ return 0;
+
+ return update_alu_sanitation_state(aux, BPF_ALU_NON_POINTER, 0);
+}
+
+static bool sanitize_needed(u8 opcode)
+{
+ return opcode == BPF_ADD || opcode == BPF_SUB;
+}
+
+struct bpf_sanitize_info {
+ struct bpf_insn_aux_data aux;
+ bool mask_to_left;
+};
+
+static int sanitize_speculative_path(struct bpf_verifier_env *env,
+ const struct bpf_insn *insn,
+ u32 next_idx, u32 curr_idx)
+{
+ struct bpf_verifier_state *branch;
+ struct bpf_reg_state *regs;
+
+ branch = push_stack(env, next_idx, curr_idx, true);
+ if (!IS_ERR(branch) && insn) {
+ regs = branch->frame[branch->curframe]->regs;
+ if (BPF_SRC(insn->code) == BPF_K) {
+ mark_reg_unknown(env, regs, insn->dst_reg);
+ } else if (BPF_SRC(insn->code) == BPF_X) {
+ mark_reg_unknown(env, regs, insn->dst_reg);
+ mark_reg_unknown(env, regs, insn->src_reg);
+ }
+ }
+ return PTR_ERR_OR_ZERO(branch);
+}
+
+static int sanitize_ptr_alu(struct bpf_verifier_env *env,
+ struct bpf_insn *insn,
+ const struct bpf_reg_state *ptr_reg,
+ const struct bpf_reg_state *off_reg,
+ struct bpf_reg_state *dst_reg,
+ struct bpf_sanitize_info *info,
+ const bool commit_window)
+{
+ struct bpf_insn_aux_data *aux = commit_window ? cur_aux(env) : &info->aux;
+ struct bpf_verifier_state *vstate = env->cur_state;
+ bool off_is_imm = tnum_is_const(off_reg->var_off);
+ bool off_is_neg = off_reg->smin_value < 0;
+ bool ptr_is_dst_reg = ptr_reg == dst_reg;
+ u8 opcode = BPF_OP(insn->code);
+ u32 alu_state, alu_limit;
+ struct bpf_reg_state tmp;
+ int err;
+
+ if (can_skip_alu_sanitation(env, insn))
+ return 0;
+
+ /* We already marked aux for masking from non-speculative
+ * paths, thus we got here in the first place. We only care
+ * to explore bad access from here.
+ */
+ if (vstate->speculative)
+ goto do_sim;
+
+ if (!commit_window) {
+ if (!tnum_is_const(off_reg->var_off) &&
+ (off_reg->smin_value < 0) != (off_reg->smax_value < 0))
+ return REASON_BOUNDS;
+
+ info->mask_to_left = (opcode == BPF_ADD && off_is_neg) ||
+ (opcode == BPF_SUB && !off_is_neg);
+ }
+
+ err = retrieve_ptr_limit(ptr_reg, &alu_limit, info->mask_to_left);
+ if (err < 0)
+ return err;
+
+ if (commit_window) {
+ /* In commit phase we narrow the masking window based on
+ * the observed pointer move after the simulated operation.
+ */
+ alu_state = info->aux.alu_state;
+ alu_limit = abs(info->aux.alu_limit - alu_limit);
+ } else {
+ alu_state = off_is_neg ? BPF_ALU_NEG_VALUE : 0;
+ alu_state |= off_is_imm ? BPF_ALU_IMMEDIATE : 0;
+ alu_state |= ptr_is_dst_reg ?
+ BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST;
+
+ /* Limit pruning on unknown scalars to enable deep search for
+ * potential masking differences from other program paths.
+ */
+ if (!off_is_imm)
+ env->explore_alu_limits = true;
+ }
+
+ err = update_alu_sanitation_state(aux, alu_state, alu_limit);
+ if (err < 0)
+ return err;
+do_sim:
+ /* If we're in commit phase, we're done here given we already
+ * pushed the truncated dst_reg into the speculative verification
+ * stack.
+ *
+ * Also, when register is a known constant, we rewrite register-based
+ * operation to immediate-based, and thus do not need masking (and as
+ * a consequence, do not need to simulate the zero-truncation either).
+ */
+ if (commit_window || off_is_imm)
+ return 0;
+
+ /* Simulate and find potential out-of-bounds access under
+ * speculative execution from truncation as a result of
+ * masking when off was not within expected range. If off
+ * sits in dst, then we temporarily need to move ptr there
+ * to simulate dst (== 0) +/-= ptr. Needed, for example,
+ * for cases where we use K-based arithmetic in one direction
+ * and truncated reg-based in the other in order to explore
+ * bad access.
+ */
+ if (!ptr_is_dst_reg) {
+ tmp = *dst_reg;
+ copy_register_state(dst_reg, ptr_reg);
+ }
+ err = sanitize_speculative_path(env, NULL, env->insn_idx + 1, env->insn_idx);
+ if (err < 0)
+ return REASON_STACK;
+ if (!ptr_is_dst_reg)
+ *dst_reg = tmp;
+ return 0;
+}
+
+static void sanitize_mark_insn_seen(struct bpf_verifier_env *env)
+{
+ struct bpf_verifier_state *vstate = env->cur_state;
+
+ /* If we simulate paths under speculation, we don't update the
+ * insn as 'seen' such that when we verify unreachable paths in
+ * the non-speculative domain, sanitize_dead_code() can still
+ * rewrite/sanitize them.
+ */
+ if (!vstate->speculative)
+ env->insn_aux_data[env->insn_idx].seen = env->pass_cnt;
+}
+
+static int sanitize_err(struct bpf_verifier_env *env,
+ const struct bpf_insn *insn, int reason,
+ const struct bpf_reg_state *off_reg,
+ const struct bpf_reg_state *dst_reg)
+{
+ static const char *err = "pointer arithmetic with it prohibited for !root";
+ const char *op = BPF_OP(insn->code) == BPF_ADD ? "add" : "sub";
+ u32 dst = insn->dst_reg, src = insn->src_reg;
+
+ switch (reason) {
+ case REASON_BOUNDS:
+ verbose(env, "R%d has unknown scalar with mixed signed bounds, %s\n",
+ off_reg == dst_reg ? dst : src, err);
+ break;
+ case REASON_TYPE:
+ verbose(env, "R%d has pointer with unsupported alu operation, %s\n",
+ off_reg == dst_reg ? src : dst, err);
+ break;
+ case REASON_PATHS:
+ verbose(env, "R%d tried to %s from different maps, paths or scalars, %s\n",
+ dst, op, err);
+ break;
+ case REASON_LIMIT:
+ verbose(env, "R%d tried to %s beyond pointer bounds, %s\n",
+ dst, op, err);
+ break;
+ case REASON_STACK:
+ verbose(env, "R%d could not be pushed for speculative verification, %s\n",
+ dst, err);
+ return -ENOMEM;
+ default:
+ verifier_bug(env, "unknown reason (%d)", reason);
+ break;
+ }
+
+ return -EACCES;
+}
+
+/* check that stack access falls within stack limits and that 'reg' doesn't
+ * have a variable offset.
+ *
+ * Variable offset is prohibited for unprivileged mode for simplicity since it
+ * requires corresponding support in Spectre masking for stack ALU. See also
+ * retrieve_ptr_limit().
+ *
+ *
+ * 'off' includes 'reg->off'.
+ */
+static int check_stack_access_for_ptr_arithmetic(
+ struct bpf_verifier_env *env,
+ int regno,
+ const struct bpf_reg_state *reg,
+ int off)
+{
+ if (!tnum_is_const(reg->var_off)) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose(env, "R%d variable stack access prohibited for !root, var_off=%s off=%d\n",
+ regno, tn_buf, off);
+ return -EACCES;
+ }
+
+ if (off >= 0 || off < -MAX_BPF_STACK) {
+ verbose(env, "R%d stack pointer arithmetic goes out of range, "
+ "prohibited for !root; off=%d\n", regno, off);
+ return -EACCES;
+ }
+
+ return 0;
+}
+
+static int sanitize_check_bounds(struct bpf_verifier_env *env,
+ const struct bpf_insn *insn,
+ const struct bpf_reg_state *dst_reg)
+{
+ u32 dst = insn->dst_reg;
+
+ /* For unprivileged we require that resulting offset must be in bounds
+ * in order to be able to sanitize access later on.
+ */
+ if (env->bypass_spec_v1)
+ return 0;
+
+ switch (dst_reg->type) {
+ case PTR_TO_STACK:
+ if (check_stack_access_for_ptr_arithmetic(env, dst, dst_reg,
+ dst_reg->off + dst_reg->var_off.value))
+ return -EACCES;
+ break;
+ case PTR_TO_MAP_VALUE:
+ if (check_map_access(env, dst, dst_reg->off, 1, false, ACCESS_HELPER)) {
+ verbose(env, "R%d pointer arithmetic of map value goes out of range, "
+ "prohibited for !root\n", dst);
+ return -EACCES;
+ }
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+/* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off.
+ * Caller should also handle BPF_MOV case separately.
+ * If we return -EACCES, caller may want to try again treating pointer as a
+ * scalar. So we only emit a diagnostic if !env->allow_ptr_leaks.
+ */
+static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
+ struct bpf_insn *insn,
+ const struct bpf_reg_state *ptr_reg,
+ const struct bpf_reg_state *off_reg)
+{
+ struct bpf_verifier_state *vstate = env->cur_state;
+ struct bpf_func_state *state = vstate->frame[vstate->curframe];
+ struct bpf_reg_state *regs = state->regs, *dst_reg;
+ bool known = tnum_is_const(off_reg->var_off);
+ s64 smin_val = off_reg->smin_value, smax_val = off_reg->smax_value,
+ smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value;
+ u64 umin_val = off_reg->umin_value, umax_val = off_reg->umax_value,
+ umin_ptr = ptr_reg->umin_value, umax_ptr = ptr_reg->umax_value;
+ struct bpf_sanitize_info info = {};
+ u8 opcode = BPF_OP(insn->code);
+ u32 dst = insn->dst_reg;
+ int ret, bounds_ret;
+
+ dst_reg = &regs[dst];
+
+ if ((known && (smin_val != smax_val || umin_val != umax_val)) ||
+ smin_val > smax_val || umin_val > umax_val) {
+ /* Taint dst register if offset had invalid bounds derived from
+ * e.g. dead branches.
+ */
+ __mark_reg_unknown(env, dst_reg);
+ return 0;
+ }
+
+ if (BPF_CLASS(insn->code) != BPF_ALU64) {
+ /* 32-bit ALU ops on pointers produce (meaningless) scalars */
+ if (opcode == BPF_SUB && env->allow_ptr_leaks) {
+ __mark_reg_unknown(env, dst_reg);
+ return 0;
+ }
+
+ verbose(env,
+ "R%d 32-bit pointer arithmetic prohibited\n",
+ dst);
+ return -EACCES;
+ }
+
+ if (ptr_reg->type & PTR_MAYBE_NULL) {
+ verbose(env, "R%d pointer arithmetic on %s prohibited, null-check it first\n",
+ dst, reg_type_str(env, ptr_reg->type));
+ return -EACCES;
+ }
+
+ /*
+ * Accesses to untrusted PTR_TO_MEM are done through probe
+ * instructions, hence no need to track offsets.
+ */
+ if (base_type(ptr_reg->type) == PTR_TO_MEM && (ptr_reg->type & PTR_UNTRUSTED))
+ return 0;
+
+ switch (base_type(ptr_reg->type)) {
+ case PTR_TO_CTX:
+ case PTR_TO_MAP_VALUE:
+ case PTR_TO_MAP_KEY:
+ case PTR_TO_STACK:
+ case PTR_TO_PACKET_META:
+ case PTR_TO_PACKET:
+ case PTR_TO_TP_BUFFER:
+ case PTR_TO_BTF_ID:
+ case PTR_TO_MEM:
+ case PTR_TO_BUF:
+ case PTR_TO_FUNC:
+ case CONST_PTR_TO_DYNPTR:
+ break;
+ case PTR_TO_FLOW_KEYS:
+ if (known)
+ break;
+ fallthrough;
+ case CONST_PTR_TO_MAP:
+ /* smin_val represents the known value */
+ if (known && smin_val == 0 && opcode == BPF_ADD)
+ break;
+ fallthrough;
+ default:
+ verbose(env, "R%d pointer arithmetic on %s prohibited\n",
+ dst, reg_type_str(env, ptr_reg->type));
+ return -EACCES;
+ }
+
+ /* In case of 'scalar += pointer', dst_reg inherits pointer type and id.
+ * The id may be overwritten later if we create a new variable offset.
+ */
+ dst_reg->type = ptr_reg->type;
+ dst_reg->id = ptr_reg->id;
+
+ if (!check_reg_sane_offset(env, off_reg, ptr_reg->type) ||
+ !check_reg_sane_offset(env, ptr_reg, ptr_reg->type))
+ return -EINVAL;
+
+ /* pointer types do not carry 32-bit bounds at the moment. */
+ __mark_reg32_unbounded(dst_reg);
+
+ if (sanitize_needed(opcode)) {
+ ret = sanitize_ptr_alu(env, insn, ptr_reg, off_reg, dst_reg,
+ &info, false);
+ if (ret < 0)
+ return sanitize_err(env, insn, ret, off_reg, dst_reg);
+ }
+
+ switch (opcode) {
+ case BPF_ADD:
+ /* We can take a fixed offset as long as it doesn't overflow
+ * the s32 'off' field
+ */
+ if (known && (ptr_reg->off + smin_val ==
+ (s64)(s32)(ptr_reg->off + smin_val))) {
+ /* pointer += K. Accumulate it into fixed offset */
+ dst_reg->smin_value = smin_ptr;
+ dst_reg->smax_value = smax_ptr;
+ dst_reg->umin_value = umin_ptr;
+ dst_reg->umax_value = umax_ptr;
+ dst_reg->var_off = ptr_reg->var_off;
+ dst_reg->off = ptr_reg->off + smin_val;
+ dst_reg->raw = ptr_reg->raw;
+ break;
+ }
+ /* A new variable offset is created. Note that off_reg->off
+ * == 0, since it's a scalar.
+ * dst_reg gets the pointer type and since some positive
+ * integer value was added to the pointer, give it a new 'id'
+ * if it's a PTR_TO_PACKET.
+ * this creates a new 'base' pointer, off_reg (variable) gets
+ * added into the variable offset, and we copy the fixed offset
+ * from ptr_reg.
+ */
+ if (check_add_overflow(smin_ptr, smin_val, &dst_reg->smin_value) ||
+ check_add_overflow(smax_ptr, smax_val, &dst_reg->smax_value)) {
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
+ }
+ if (check_add_overflow(umin_ptr, umin_val, &dst_reg->umin_value) ||
+ check_add_overflow(umax_ptr, umax_val, &dst_reg->umax_value)) {
+ dst_reg->umin_value = 0;
+ dst_reg->umax_value = U64_MAX;
+ }
+ dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off);
+ dst_reg->off = ptr_reg->off;
+ dst_reg->raw = ptr_reg->raw;
+ if (reg_is_pkt_pointer(ptr_reg)) {
+ dst_reg->id = ++env->id_gen;
+ /* something was added to pkt_ptr, set range to zero */
+ memset(&dst_reg->raw, 0, sizeof(dst_reg->raw));
+ }
+ break;
+ case BPF_SUB:
+ if (dst_reg == off_reg) {
+ /* scalar -= pointer. Creates an unknown scalar */
+ verbose(env, "R%d tried to subtract pointer from scalar\n",
+ dst);
+ return -EACCES;
+ }
+ /* We don't allow subtraction from FP, because (according to
+ * test_verifier.c test "invalid fp arithmetic", JITs might not
+ * be able to deal with it.
+ */
+ if (ptr_reg->type == PTR_TO_STACK) {
+ verbose(env, "R%d subtraction from stack pointer prohibited\n",
+ dst);
+ return -EACCES;
+ }
+ if (known && (ptr_reg->off - smin_val ==
+ (s64)(s32)(ptr_reg->off - smin_val))) {
+ /* pointer -= K. Subtract it from fixed offset */
+ dst_reg->smin_value = smin_ptr;
+ dst_reg->smax_value = smax_ptr;
+ dst_reg->umin_value = umin_ptr;
+ dst_reg->umax_value = umax_ptr;
+ dst_reg->var_off = ptr_reg->var_off;
+ dst_reg->id = ptr_reg->id;
+ dst_reg->off = ptr_reg->off - smin_val;
+ dst_reg->raw = ptr_reg->raw;
+ break;
+ }
+ /* A new variable offset is created. If the subtrahend is known
+ * nonnegative, then any reg->range we had before is still good.
+ */
+ if (check_sub_overflow(smin_ptr, smax_val, &dst_reg->smin_value) ||
+ check_sub_overflow(smax_ptr, smin_val, &dst_reg->smax_value)) {
+ /* Overflow possible, we know nothing */
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
+ }
+ if (umin_ptr < umax_val) {
+ /* Overflow possible, we know nothing */
+ dst_reg->umin_value = 0;
+ dst_reg->umax_value = U64_MAX;
+ } else {
+ /* Cannot overflow (as long as bounds are consistent) */
+ dst_reg->umin_value = umin_ptr - umax_val;
+ dst_reg->umax_value = umax_ptr - umin_val;
+ }
+ dst_reg->var_off = tnum_sub(ptr_reg->var_off, off_reg->var_off);
+ dst_reg->off = ptr_reg->off;
+ dst_reg->raw = ptr_reg->raw;
+ if (reg_is_pkt_pointer(ptr_reg)) {
+ dst_reg->id = ++env->id_gen;
+ /* something was added to pkt_ptr, set range to zero */
+ if (smin_val < 0)
+ memset(&dst_reg->raw, 0, sizeof(dst_reg->raw));
+ }
+ break;
+ case BPF_AND:
+ case BPF_OR:
+ case BPF_XOR:
+ /* bitwise ops on pointers are troublesome, prohibit. */
+ verbose(env, "R%d bitwise operator %s on pointer prohibited\n",
+ dst, bpf_alu_string[opcode >> 4]);
+ return -EACCES;
+ default:
+ /* other operators (e.g. MUL,LSH) produce non-pointer results */
+ verbose(env, "R%d pointer arithmetic with %s operator prohibited\n",
+ dst, bpf_alu_string[opcode >> 4]);
+ return -EACCES;
+ }
+
+ if (!check_reg_sane_offset(env, dst_reg, ptr_reg->type))
return -EINVAL;
+ reg_bounds_sync(dst_reg);
+ bounds_ret = sanitize_check_bounds(env, insn, dst_reg);
+ if (bounds_ret == -EACCES)
+ return bounds_ret;
+ if (sanitize_needed(opcode)) {
+ ret = sanitize_ptr_alu(env, insn, dst_reg, off_reg, dst_reg,
+ &info, true);
+ if (verifier_bug_if(!can_skip_alu_sanitation(env, insn)
+ && !env->cur_state->speculative
+ && bounds_ret
+ && !ret,
+ env, "Pointer type unsupported by sanitize_check_bounds() not rejected by retrieve_ptr_limit() as required")) {
+ return -EFAULT;
+ }
+ if (ret < 0)
+ return sanitize_err(env, insn, ret, off_reg, dst_reg);
+ }
+
+ return 0;
+}
+
+static void scalar32_min_max_add(struct bpf_reg_state *dst_reg,
+ struct bpf_reg_state *src_reg)
+{
+ s32 *dst_smin = &dst_reg->s32_min_value;
+ s32 *dst_smax = &dst_reg->s32_max_value;
+ u32 *dst_umin = &dst_reg->u32_min_value;
+ u32 *dst_umax = &dst_reg->u32_max_value;
+ u32 umin_val = src_reg->u32_min_value;
+ u32 umax_val = src_reg->u32_max_value;
+ bool min_overflow, max_overflow;
+
+ if (check_add_overflow(*dst_smin, src_reg->s32_min_value, dst_smin) ||
+ check_add_overflow(*dst_smax, src_reg->s32_max_value, dst_smax)) {
+ *dst_smin = S32_MIN;
+ *dst_smax = S32_MAX;
+ }
+
+ /* If either all additions overflow or no additions overflow, then
+ * it is okay to set: dst_umin = dst_umin + src_umin, dst_umax =
+ * dst_umax + src_umax. Otherwise (some additions overflow), set
+ * the output bounds to unbounded.
+ */
+ min_overflow = check_add_overflow(*dst_umin, umin_val, dst_umin);
+ max_overflow = check_add_overflow(*dst_umax, umax_val, dst_umax);
+
+ if (!min_overflow && max_overflow) {
+ *dst_umin = 0;
+ *dst_umax = U32_MAX;
+ }
+}
+
+static void scalar_min_max_add(struct bpf_reg_state *dst_reg,
+ struct bpf_reg_state *src_reg)
+{
+ s64 *dst_smin = &dst_reg->smin_value;
+ s64 *dst_smax = &dst_reg->smax_value;
+ u64 *dst_umin = &dst_reg->umin_value;
+ u64 *dst_umax = &dst_reg->umax_value;
+ u64 umin_val = src_reg->umin_value;
+ u64 umax_val = src_reg->umax_value;
+ bool min_overflow, max_overflow;
+
+ if (check_add_overflow(*dst_smin, src_reg->smin_value, dst_smin) ||
+ check_add_overflow(*dst_smax, src_reg->smax_value, dst_smax)) {
+ *dst_smin = S64_MIN;
+ *dst_smax = S64_MAX;
+ }
+
+ /* If either all additions overflow or no additions overflow, then
+ * it is okay to set: dst_umin = dst_umin + src_umin, dst_umax =
+ * dst_umax + src_umax. Otherwise (some additions overflow), set
+ * the output bounds to unbounded.
+ */
+ min_overflow = check_add_overflow(*dst_umin, umin_val, dst_umin);
+ max_overflow = check_add_overflow(*dst_umax, umax_val, dst_umax);
+
+ if (!min_overflow && max_overflow) {
+ *dst_umin = 0;
+ *dst_umax = U64_MAX;
+ }
+}
+
+static void scalar32_min_max_sub(struct bpf_reg_state *dst_reg,
+ struct bpf_reg_state *src_reg)
+{
+ s32 *dst_smin = &dst_reg->s32_min_value;
+ s32 *dst_smax = &dst_reg->s32_max_value;
+ u32 *dst_umin = &dst_reg->u32_min_value;
+ u32 *dst_umax = &dst_reg->u32_max_value;
+ u32 umin_val = src_reg->u32_min_value;
+ u32 umax_val = src_reg->u32_max_value;
+ bool min_underflow, max_underflow;
+
+ if (check_sub_overflow(*dst_smin, src_reg->s32_max_value, dst_smin) ||
+ check_sub_overflow(*dst_smax, src_reg->s32_min_value, dst_smax)) {
+ /* Overflow possible, we know nothing */
+ *dst_smin = S32_MIN;
+ *dst_smax = S32_MAX;
+ }
+
+ /* If either all subtractions underflow or no subtractions
+ * underflow, it is okay to set: dst_umin = dst_umin - src_umax,
+ * dst_umax = dst_umax - src_umin. Otherwise (some subtractions
+ * underflow), set the output bounds to unbounded.
+ */
+ min_underflow = check_sub_overflow(*dst_umin, umax_val, dst_umin);
+ max_underflow = check_sub_overflow(*dst_umax, umin_val, dst_umax);
+
+ if (min_underflow && !max_underflow) {
+ *dst_umin = 0;
+ *dst_umax = U32_MAX;
+ }
+}
+
+static void scalar_min_max_sub(struct bpf_reg_state *dst_reg,
+ struct bpf_reg_state *src_reg)
+{
+ s64 *dst_smin = &dst_reg->smin_value;
+ s64 *dst_smax = &dst_reg->smax_value;
+ u64 *dst_umin = &dst_reg->umin_value;
+ u64 *dst_umax = &dst_reg->umax_value;
+ u64 umin_val = src_reg->umin_value;
+ u64 umax_val = src_reg->umax_value;
+ bool min_underflow, max_underflow;
+
+ if (check_sub_overflow(*dst_smin, src_reg->smax_value, dst_smin) ||
+ check_sub_overflow(*dst_smax, src_reg->smin_value, dst_smax)) {
+ /* Overflow possible, we know nothing */
+ *dst_smin = S64_MIN;
+ *dst_smax = S64_MAX;
+ }
+
+ /* If either all subtractions underflow or no subtractions
+ * underflow, it is okay to set: dst_umin = dst_umin - src_umax,
+ * dst_umax = dst_umax - src_umin. Otherwise (some subtractions
+ * underflow), set the output bounds to unbounded.
+ */
+ min_underflow = check_sub_overflow(*dst_umin, umax_val, dst_umin);
+ max_underflow = check_sub_overflow(*dst_umax, umin_val, dst_umax);
+
+ if (min_underflow && !max_underflow) {
+ *dst_umin = 0;
+ *dst_umax = U64_MAX;
+ }
+}
+
+static void scalar32_min_max_mul(struct bpf_reg_state *dst_reg,
+ struct bpf_reg_state *src_reg)
+{
+ s32 *dst_smin = &dst_reg->s32_min_value;
+ s32 *dst_smax = &dst_reg->s32_max_value;
+ u32 *dst_umin = &dst_reg->u32_min_value;
+ u32 *dst_umax = &dst_reg->u32_max_value;
+ s32 tmp_prod[4];
+
+ if (check_mul_overflow(*dst_umax, src_reg->u32_max_value, dst_umax) ||
+ check_mul_overflow(*dst_umin, src_reg->u32_min_value, dst_umin)) {
+ /* Overflow possible, we know nothing */
+ *dst_umin = 0;
+ *dst_umax = U32_MAX;
+ }
+ if (check_mul_overflow(*dst_smin, src_reg->s32_min_value, &tmp_prod[0]) ||
+ check_mul_overflow(*dst_smin, src_reg->s32_max_value, &tmp_prod[1]) ||
+ check_mul_overflow(*dst_smax, src_reg->s32_min_value, &tmp_prod[2]) ||
+ check_mul_overflow(*dst_smax, src_reg->s32_max_value, &tmp_prod[3])) {
+ /* Overflow possible, we know nothing */
+ *dst_smin = S32_MIN;
+ *dst_smax = S32_MAX;
+ } else {
+ *dst_smin = min_array(tmp_prod, 4);
+ *dst_smax = max_array(tmp_prod, 4);
+ }
+}
+
+static void scalar_min_max_mul(struct bpf_reg_state *dst_reg,
+ struct bpf_reg_state *src_reg)
+{
+ s64 *dst_smin = &dst_reg->smin_value;
+ s64 *dst_smax = &dst_reg->smax_value;
+ u64 *dst_umin = &dst_reg->umin_value;
+ u64 *dst_umax = &dst_reg->umax_value;
+ s64 tmp_prod[4];
+
+ if (check_mul_overflow(*dst_umax, src_reg->umax_value, dst_umax) ||
+ check_mul_overflow(*dst_umin, src_reg->umin_value, dst_umin)) {
+ /* Overflow possible, we know nothing */
+ *dst_umin = 0;
+ *dst_umax = U64_MAX;
+ }
+ if (check_mul_overflow(*dst_smin, src_reg->smin_value, &tmp_prod[0]) ||
+ check_mul_overflow(*dst_smin, src_reg->smax_value, &tmp_prod[1]) ||
+ check_mul_overflow(*dst_smax, src_reg->smin_value, &tmp_prod[2]) ||
+ check_mul_overflow(*dst_smax, src_reg->smax_value, &tmp_prod[3])) {
+ /* Overflow possible, we know nothing */
+ *dst_smin = S64_MIN;
+ *dst_smax = S64_MAX;
+ } else {
+ *dst_smin = min_array(tmp_prod, 4);
+ *dst_smax = max_array(tmp_prod, 4);
+ }
+}
+
+static void scalar32_min_max_and(struct bpf_reg_state *dst_reg,
+ struct bpf_reg_state *src_reg)
+{
+ bool src_known = tnum_subreg_is_const(src_reg->var_off);
+ bool dst_known = tnum_subreg_is_const(dst_reg->var_off);
+ struct tnum var32_off = tnum_subreg(dst_reg->var_off);
+ u32 umax_val = src_reg->u32_max_value;
+
+ if (src_known && dst_known) {
+ __mark_reg32_known(dst_reg, var32_off.value);
+ return;
+ }
+
+ /* We get our minimum from the var_off, since that's inherently
+ * bitwise. Our maximum is the minimum of the operands' maxima.
+ */
+ dst_reg->u32_min_value = var32_off.value;
+ dst_reg->u32_max_value = min(dst_reg->u32_max_value, umax_val);
+
+ /* Safe to set s32 bounds by casting u32 result into s32 when u32
+ * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded.
+ */
+ if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) {
+ dst_reg->s32_min_value = dst_reg->u32_min_value;
+ dst_reg->s32_max_value = dst_reg->u32_max_value;
+ } else {
+ dst_reg->s32_min_value = S32_MIN;
+ dst_reg->s32_max_value = S32_MAX;
+ }
+}
+
+static void scalar_min_max_and(struct bpf_reg_state *dst_reg,
+ struct bpf_reg_state *src_reg)
+{
+ bool src_known = tnum_is_const(src_reg->var_off);
+ bool dst_known = tnum_is_const(dst_reg->var_off);
+ u64 umax_val = src_reg->umax_value;
+
+ if (src_known && dst_known) {
+ __mark_reg_known(dst_reg, dst_reg->var_off.value);
+ return;
+ }
+
+ /* We get our minimum from the var_off, since that's inherently
+ * bitwise. Our maximum is the minimum of the operands' maxima.
+ */
+ dst_reg->umin_value = dst_reg->var_off.value;
+ dst_reg->umax_value = min(dst_reg->umax_value, umax_val);
+
+ /* Safe to set s64 bounds by casting u64 result into s64 when u64
+ * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded.
+ */
+ if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) {
+ dst_reg->smin_value = dst_reg->umin_value;
+ dst_reg->smax_value = dst_reg->umax_value;
+ } else {
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
+ }
+ /* We may learn something more from the var_off */
+ __update_reg_bounds(dst_reg);
+}
+
+static void scalar32_min_max_or(struct bpf_reg_state *dst_reg,
+ struct bpf_reg_state *src_reg)
+{
+ bool src_known = tnum_subreg_is_const(src_reg->var_off);
+ bool dst_known = tnum_subreg_is_const(dst_reg->var_off);
+ struct tnum var32_off = tnum_subreg(dst_reg->var_off);
+ u32 umin_val = src_reg->u32_min_value;
+
+ if (src_known && dst_known) {
+ __mark_reg32_known(dst_reg, var32_off.value);
+ return;
+ }
+
+ /* We get our maximum from the var_off, and our minimum is the
+ * maximum of the operands' minima
+ */
+ dst_reg->u32_min_value = max(dst_reg->u32_min_value, umin_val);
+ dst_reg->u32_max_value = var32_off.value | var32_off.mask;
+
+ /* Safe to set s32 bounds by casting u32 result into s32 when u32
+ * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded.
+ */
+ if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) {
+ dst_reg->s32_min_value = dst_reg->u32_min_value;
+ dst_reg->s32_max_value = dst_reg->u32_max_value;
+ } else {
+ dst_reg->s32_min_value = S32_MIN;
+ dst_reg->s32_max_value = S32_MAX;
+ }
+}
+
+static void scalar_min_max_or(struct bpf_reg_state *dst_reg,
+ struct bpf_reg_state *src_reg)
+{
+ bool src_known = tnum_is_const(src_reg->var_off);
+ bool dst_known = tnum_is_const(dst_reg->var_off);
+ u64 umin_val = src_reg->umin_value;
+
+ if (src_known && dst_known) {
+ __mark_reg_known(dst_reg, dst_reg->var_off.value);
+ return;
+ }
+
+ /* We get our maximum from the var_off, and our minimum is the
+ * maximum of the operands' minima
+ */
+ dst_reg->umin_value = max(dst_reg->umin_value, umin_val);
+ dst_reg->umax_value = dst_reg->var_off.value | dst_reg->var_off.mask;
+
+ /* Safe to set s64 bounds by casting u64 result into s64 when u64
+ * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded.
+ */
+ if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) {
+ dst_reg->smin_value = dst_reg->umin_value;
+ dst_reg->smax_value = dst_reg->umax_value;
+ } else {
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
+ }
+ /* We may learn something more from the var_off */
+ __update_reg_bounds(dst_reg);
+}
+
+static void scalar32_min_max_xor(struct bpf_reg_state *dst_reg,
+ struct bpf_reg_state *src_reg)
+{
+ bool src_known = tnum_subreg_is_const(src_reg->var_off);
+ bool dst_known = tnum_subreg_is_const(dst_reg->var_off);
+ struct tnum var32_off = tnum_subreg(dst_reg->var_off);
+
+ if (src_known && dst_known) {
+ __mark_reg32_known(dst_reg, var32_off.value);
+ return;
+ }
+
+ /* We get both minimum and maximum from the var32_off. */
+ dst_reg->u32_min_value = var32_off.value;
+ dst_reg->u32_max_value = var32_off.value | var32_off.mask;
+
+ /* Safe to set s32 bounds by casting u32 result into s32 when u32
+ * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded.
+ */
+ if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) {
+ dst_reg->s32_min_value = dst_reg->u32_min_value;
+ dst_reg->s32_max_value = dst_reg->u32_max_value;
+ } else {
+ dst_reg->s32_min_value = S32_MIN;
+ dst_reg->s32_max_value = S32_MAX;
+ }
+}
+
+static void scalar_min_max_xor(struct bpf_reg_state *dst_reg,
+ struct bpf_reg_state *src_reg)
+{
+ bool src_known = tnum_is_const(src_reg->var_off);
+ bool dst_known = tnum_is_const(dst_reg->var_off);
+
+ if (src_known && dst_known) {
+ /* dst_reg->var_off.value has been updated earlier */
+ __mark_reg_known(dst_reg, dst_reg->var_off.value);
+ return;
+ }
+
+ /* We get both minimum and maximum from the var_off. */
+ dst_reg->umin_value = dst_reg->var_off.value;
+ dst_reg->umax_value = dst_reg->var_off.value | dst_reg->var_off.mask;
+
+ /* Safe to set s64 bounds by casting u64 result into s64 when u64
+ * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded.
+ */
+ if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) {
+ dst_reg->smin_value = dst_reg->umin_value;
+ dst_reg->smax_value = dst_reg->umax_value;
+ } else {
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
+ }
+
+ __update_reg_bounds(dst_reg);
+}
+
+static void __scalar32_min_max_lsh(struct bpf_reg_state *dst_reg,
+ u64 umin_val, u64 umax_val)
+{
+ /* We lose all sign bit information (except what we can pick
+ * up from var_off)
+ */
+ dst_reg->s32_min_value = S32_MIN;
+ dst_reg->s32_max_value = S32_MAX;
+ /* If we might shift our top bit out, then we know nothing */
+ if (umax_val > 31 || dst_reg->u32_max_value > 1ULL << (31 - umax_val)) {
+ dst_reg->u32_min_value = 0;
+ dst_reg->u32_max_value = U32_MAX;
+ } else {
+ dst_reg->u32_min_value <<= umin_val;
+ dst_reg->u32_max_value <<= umax_val;
+ }
+}
+
+static void scalar32_min_max_lsh(struct bpf_reg_state *dst_reg,
+ struct bpf_reg_state *src_reg)
+{
+ u32 umax_val = src_reg->u32_max_value;
+ u32 umin_val = src_reg->u32_min_value;
+ /* u32 alu operation will zext upper bits */
+ struct tnum subreg = tnum_subreg(dst_reg->var_off);
+
+ __scalar32_min_max_lsh(dst_reg, umin_val, umax_val);
+ dst_reg->var_off = tnum_subreg(tnum_lshift(subreg, umin_val));
+ /* Not required but being careful mark reg64 bounds as unknown so
+ * that we are forced to pick them up from tnum and zext later and
+ * if some path skips this step we are still safe.
+ */
+ __mark_reg64_unbounded(dst_reg);
+ __update_reg32_bounds(dst_reg);
+}
+
+static void __scalar64_min_max_lsh(struct bpf_reg_state *dst_reg,
+ u64 umin_val, u64 umax_val)
+{
+ /* Special case <<32 because it is a common compiler pattern to sign
+ * extend subreg by doing <<32 s>>32. In this case if 32bit bounds are
+ * positive we know this shift will also be positive so we can track
+ * bounds correctly. Otherwise we lose all sign bit information except
+ * what we can pick up from var_off. Perhaps we can generalize this
+ * later to shifts of any length.
+ */
+ if (umin_val == 32 && umax_val == 32 && dst_reg->s32_max_value >= 0)
+ dst_reg->smax_value = (s64)dst_reg->s32_max_value << 32;
+ else
+ dst_reg->smax_value = S64_MAX;
+
+ if (umin_val == 32 && umax_val == 32 && dst_reg->s32_min_value >= 0)
+ dst_reg->smin_value = (s64)dst_reg->s32_min_value << 32;
+ else
+ dst_reg->smin_value = S64_MIN;
+
+ /* If we might shift our top bit out, then we know nothing */
+ if (dst_reg->umax_value > 1ULL << (63 - umax_val)) {
+ dst_reg->umin_value = 0;
+ dst_reg->umax_value = U64_MAX;
+ } else {
+ dst_reg->umin_value <<= umin_val;
+ dst_reg->umax_value <<= umax_val;
+ }
+}
+
+static void scalar_min_max_lsh(struct bpf_reg_state *dst_reg,
+ struct bpf_reg_state *src_reg)
+{
+ u64 umax_val = src_reg->umax_value;
+ u64 umin_val = src_reg->umin_value;
+
+ /* scalar64 calc uses 32bit unshifted bounds so must be called first */
+ __scalar64_min_max_lsh(dst_reg, umin_val, umax_val);
+ __scalar32_min_max_lsh(dst_reg, umin_val, umax_val);
+
+ dst_reg->var_off = tnum_lshift(dst_reg->var_off, umin_val);
+ /* We may learn something more from the var_off */
+ __update_reg_bounds(dst_reg);
+}
+
+static void scalar32_min_max_rsh(struct bpf_reg_state *dst_reg,
+ struct bpf_reg_state *src_reg)
+{
+ struct tnum subreg = tnum_subreg(dst_reg->var_off);
+ u32 umax_val = src_reg->u32_max_value;
+ u32 umin_val = src_reg->u32_min_value;
+
+ /* BPF_RSH is an unsigned shift. If the value in dst_reg might
+ * be negative, then either:
+ * 1) src_reg might be zero, so the sign bit of the result is
+ * unknown, so we lose our signed bounds
+ * 2) it's known negative, thus the unsigned bounds capture the
+ * signed bounds
+ * 3) the signed bounds cross zero, so they tell us nothing
+ * about the result
+ * If the value in dst_reg is known nonnegative, then again the
+ * unsigned bounds capture the signed bounds.
+ * Thus, in all cases it suffices to blow away our signed bounds
+ * and rely on inferring new ones from the unsigned bounds and
+ * var_off of the result.
+ */
+ dst_reg->s32_min_value = S32_MIN;
+ dst_reg->s32_max_value = S32_MAX;
+
+ dst_reg->var_off = tnum_rshift(subreg, umin_val);
+ dst_reg->u32_min_value >>= umax_val;
+ dst_reg->u32_max_value >>= umin_val;
+
+ __mark_reg64_unbounded(dst_reg);
+ __update_reg32_bounds(dst_reg);
+}
+
+static void scalar_min_max_rsh(struct bpf_reg_state *dst_reg,
+ struct bpf_reg_state *src_reg)
+{
+ u64 umax_val = src_reg->umax_value;
+ u64 umin_val = src_reg->umin_value;
+
+ /* BPF_RSH is an unsigned shift. If the value in dst_reg might
+ * be negative, then either:
+ * 1) src_reg might be zero, so the sign bit of the result is
+ * unknown, so we lose our signed bounds
+ * 2) it's known negative, thus the unsigned bounds capture the
+ * signed bounds
+ * 3) the signed bounds cross zero, so they tell us nothing
+ * about the result
+ * If the value in dst_reg is known nonnegative, then again the
+ * unsigned bounds capture the signed bounds.
+ * Thus, in all cases it suffices to blow away our signed bounds
+ * and rely on inferring new ones from the unsigned bounds and
+ * var_off of the result.
+ */
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
+ dst_reg->var_off = tnum_rshift(dst_reg->var_off, umin_val);
+ dst_reg->umin_value >>= umax_val;
+ dst_reg->umax_value >>= umin_val;
+
+ /* Its not easy to operate on alu32 bounds here because it depends
+ * on bits being shifted in. Take easy way out and mark unbounded
+ * so we can recalculate later from tnum.
+ */
+ __mark_reg32_unbounded(dst_reg);
+ __update_reg_bounds(dst_reg);
+}
+
+static void scalar32_min_max_arsh(struct bpf_reg_state *dst_reg,
+ struct bpf_reg_state *src_reg)
+{
+ u64 umin_val = src_reg->u32_min_value;
+
+ /* Upon reaching here, src_known is true and
+ * umax_val is equal to umin_val.
+ */
+ dst_reg->s32_min_value = (u32)(((s32)dst_reg->s32_min_value) >> umin_val);
+ dst_reg->s32_max_value = (u32)(((s32)dst_reg->s32_max_value) >> umin_val);
+
+ dst_reg->var_off = tnum_arshift(tnum_subreg(dst_reg->var_off), umin_val, 32);
+
+ /* blow away the dst_reg umin_value/umax_value and rely on
+ * dst_reg var_off to refine the result.
+ */
+ dst_reg->u32_min_value = 0;
+ dst_reg->u32_max_value = U32_MAX;
+
+ __mark_reg64_unbounded(dst_reg);
+ __update_reg32_bounds(dst_reg);
+}
+
+static void scalar_min_max_arsh(struct bpf_reg_state *dst_reg,
+ struct bpf_reg_state *src_reg)
+{
+ u64 umin_val = src_reg->umin_value;
+
+ /* Upon reaching here, src_known is true and umax_val is equal
+ * to umin_val.
+ */
+ dst_reg->smin_value >>= umin_val;
+ dst_reg->smax_value >>= umin_val;
+
+ dst_reg->var_off = tnum_arshift(dst_reg->var_off, umin_val, 64);
+
+ /* blow away the dst_reg umin_value/umax_value and rely on
+ * dst_reg var_off to refine the result.
+ */
+ dst_reg->umin_value = 0;
+ dst_reg->umax_value = U64_MAX;
+
+ /* Its not easy to operate on alu32 bounds here because it depends
+ * on bits being shifted in from upper 32-bits. Take easy way out
+ * and mark unbounded so we can recalculate later from tnum.
+ */
+ __mark_reg32_unbounded(dst_reg);
+ __update_reg_bounds(dst_reg);
+}
+
+static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn,
+ const struct bpf_reg_state *src_reg)
+{
+ bool src_is_const = false;
+ u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32;
+
+ if (insn_bitness == 32) {
+ if (tnum_subreg_is_const(src_reg->var_off)
+ && src_reg->s32_min_value == src_reg->s32_max_value
+ && src_reg->u32_min_value == src_reg->u32_max_value)
+ src_is_const = true;
+ } else {
+ if (tnum_is_const(src_reg->var_off)
+ && src_reg->smin_value == src_reg->smax_value
+ && src_reg->umin_value == src_reg->umax_value)
+ src_is_const = true;
+ }
+
+ switch (BPF_OP(insn->code)) {
+ case BPF_ADD:
+ case BPF_SUB:
+ case BPF_NEG:
+ case BPF_AND:
+ case BPF_XOR:
+ case BPF_OR:
+ case BPF_MUL:
+ return true;
+
+ /* Shift operators range is only computable if shift dimension operand
+ * is a constant. Shifts greater than 31 or 63 are undefined. This
+ * includes shifts by a negative number.
+ */
+ case BPF_LSH:
+ case BPF_RSH:
+ case BPF_ARSH:
+ return (src_is_const && src_reg->umax_value < insn_bitness);
+ default:
+ return false;
+ }
+}
+
+/* WARNING: This function does calculations on 64-bit values, but the actual
+ * execution may occur on 32-bit values. Therefore, things like bitshifts
+ * need extra checks in the 32-bit case.
+ */
+static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
+ struct bpf_insn *insn,
+ struct bpf_reg_state *dst_reg,
+ struct bpf_reg_state src_reg)
+{
+ u8 opcode = BPF_OP(insn->code);
+ bool alu32 = (BPF_CLASS(insn->code) != BPF_ALU64);
+ int ret;
+
+ if (!is_safe_to_compute_dst_reg_range(insn, &src_reg)) {
+ __mark_reg_unknown(env, dst_reg);
+ return 0;
+ }
+
+ if (sanitize_needed(opcode)) {
+ ret = sanitize_val_alu(env, insn);
+ if (ret < 0)
+ return sanitize_err(env, insn, ret, NULL, NULL);
+ }
+
+ /* Calculate sign/unsigned bounds and tnum for alu32 and alu64 bit ops.
+ * There are two classes of instructions: The first class we track both
+ * alu32 and alu64 sign/unsigned bounds independently this provides the
+ * greatest amount of precision when alu operations are mixed with jmp32
+ * operations. These operations are BPF_ADD, BPF_SUB, BPF_MUL, BPF_ADD,
+ * and BPF_OR. This is possible because these ops have fairly easy to
+ * understand and calculate behavior in both 32-bit and 64-bit alu ops.
+ * See alu32 verifier tests for examples. The second class of
+ * operations, BPF_LSH, BPF_RSH, and BPF_ARSH, however are not so easy
+ * with regards to tracking sign/unsigned bounds because the bits may
+ * cross subreg boundaries in the alu64 case. When this happens we mark
+ * the reg unbounded in the subreg bound space and use the resulting
+ * tnum to calculate an approximation of the sign/unsigned bounds.
+ */
+ switch (opcode) {
+ case BPF_ADD:
+ scalar32_min_max_add(dst_reg, &src_reg);
+ scalar_min_max_add(dst_reg, &src_reg);
+ dst_reg->var_off = tnum_add(dst_reg->var_off, src_reg.var_off);
+ break;
+ case BPF_SUB:
+ scalar32_min_max_sub(dst_reg, &src_reg);
+ scalar_min_max_sub(dst_reg, &src_reg);
+ dst_reg->var_off = tnum_sub(dst_reg->var_off, src_reg.var_off);
+ break;
+ case BPF_NEG:
+ env->fake_reg[0] = *dst_reg;
+ __mark_reg_known(dst_reg, 0);
+ scalar32_min_max_sub(dst_reg, &env->fake_reg[0]);
+ scalar_min_max_sub(dst_reg, &env->fake_reg[0]);
+ dst_reg->var_off = tnum_neg(env->fake_reg[0].var_off);
+ break;
+ case BPF_MUL:
+ dst_reg->var_off = tnum_mul(dst_reg->var_off, src_reg.var_off);
+ scalar32_min_max_mul(dst_reg, &src_reg);
+ scalar_min_max_mul(dst_reg, &src_reg);
+ break;
+ case BPF_AND:
+ dst_reg->var_off = tnum_and(dst_reg->var_off, src_reg.var_off);
+ scalar32_min_max_and(dst_reg, &src_reg);
+ scalar_min_max_and(dst_reg, &src_reg);
+ break;
+ case BPF_OR:
+ dst_reg->var_off = tnum_or(dst_reg->var_off, src_reg.var_off);
+ scalar32_min_max_or(dst_reg, &src_reg);
+ scalar_min_max_or(dst_reg, &src_reg);
+ break;
+ case BPF_XOR:
+ dst_reg->var_off = tnum_xor(dst_reg->var_off, src_reg.var_off);
+ scalar32_min_max_xor(dst_reg, &src_reg);
+ scalar_min_max_xor(dst_reg, &src_reg);
+ break;
+ case BPF_LSH:
+ if (alu32)
+ scalar32_min_max_lsh(dst_reg, &src_reg);
+ else
+ scalar_min_max_lsh(dst_reg, &src_reg);
+ break;
+ case BPF_RSH:
+ if (alu32)
+ scalar32_min_max_rsh(dst_reg, &src_reg);
+ else
+ scalar_min_max_rsh(dst_reg, &src_reg);
+ break;
+ case BPF_ARSH:
+ if (alu32)
+ scalar32_min_max_arsh(dst_reg, &src_reg);
+ else
+ scalar_min_max_arsh(dst_reg, &src_reg);
+ break;
+ default:
+ break;
+ }
+
+ /* ALU32 ops are zero extended into 64bit register */
+ if (alu32)
+ zext_32_to_64(dst_reg);
+ reg_bounds_sync(dst_reg);
+ return 0;
+}
+
+/* Handles ALU ops other than BPF_END, BPF_NEG and BPF_MOV: computes new min/max
+ * and var_off.
+ */
+static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
+ struct bpf_insn *insn)
+{
+ struct bpf_verifier_state *vstate = env->cur_state;
+ struct bpf_func_state *state = vstate->frame[vstate->curframe];
+ struct bpf_reg_state *regs = state->regs, *dst_reg, *src_reg;
+ struct bpf_reg_state *ptr_reg = NULL, off_reg = {0};
+ bool alu32 = (BPF_CLASS(insn->code) != BPF_ALU64);
+ u8 opcode = BPF_OP(insn->code);
+ int err;
+
+ dst_reg = &regs[insn->dst_reg];
+ src_reg = NULL;
+
+ if (dst_reg->type == PTR_TO_ARENA) {
+ struct bpf_insn_aux_data *aux = cur_aux(env);
+
+ if (BPF_CLASS(insn->code) == BPF_ALU64)
+ /*
+ * 32-bit operations zero upper bits automatically.
+ * 64-bit operations need to be converted to 32.
+ */
+ aux->needs_zext = true;
+
+ /* Any arithmetic operations are allowed on arena pointers */
+ return 0;
+ }
+
+ if (dst_reg->type != SCALAR_VALUE)
+ ptr_reg = dst_reg;
+
+ if (BPF_SRC(insn->code) == BPF_X) {
+ src_reg = &regs[insn->src_reg];
+ if (src_reg->type != SCALAR_VALUE) {
+ if (dst_reg->type != SCALAR_VALUE) {
+ /* Combining two pointers by any ALU op yields
+ * an arbitrary scalar. Disallow all math except
+ * pointer subtraction
+ */
+ if (opcode == BPF_SUB && env->allow_ptr_leaks) {
+ mark_reg_unknown(env, regs, insn->dst_reg);
+ return 0;
+ }
+ verbose(env, "R%d pointer %s pointer prohibited\n",
+ insn->dst_reg,
+ bpf_alu_string[opcode >> 4]);
+ return -EACCES;
+ } else {
+ /* scalar += pointer
+ * This is legal, but we have to reverse our
+ * src/dest handling in computing the range
+ */
+ err = mark_chain_precision(env, insn->dst_reg);
+ if (err)
+ return err;
+ return adjust_ptr_min_max_vals(env, insn,
+ src_reg, dst_reg);
+ }
+ } else if (ptr_reg) {
+ /* pointer += scalar */
+ err = mark_chain_precision(env, insn->src_reg);
+ if (err)
+ return err;
+ return adjust_ptr_min_max_vals(env, insn,
+ dst_reg, src_reg);
+ } else if (dst_reg->precise) {
+ /* if dst_reg is precise, src_reg should be precise as well */
+ err = mark_chain_precision(env, insn->src_reg);
+ if (err)
+ return err;
+ }
+ } else {
+ /* Pretend the src is a reg with a known value, since we only
+ * need to be able to read from this state.
+ */
+ off_reg.type = SCALAR_VALUE;
+ __mark_reg_known(&off_reg, insn->imm);
+ src_reg = &off_reg;
+ if (ptr_reg) /* pointer += K */
+ return adjust_ptr_min_max_vals(env, insn,
+ ptr_reg, src_reg);
+ }
+
+ /* Got here implies adding two SCALAR_VALUEs */
+ if (WARN_ON_ONCE(ptr_reg)) {
+ print_verifier_state(env, vstate, vstate->curframe, true);
+ verbose(env, "verifier internal error: unexpected ptr_reg\n");
+ return -EFAULT;
+ }
+ if (WARN_ON(!src_reg)) {
+ print_verifier_state(env, vstate, vstate->curframe, true);
+ verbose(env, "verifier internal error: no src_reg\n");
+ return -EFAULT;
+ }
+ err = adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg);
+ if (err)
+ return err;
+ /*
+ * Compilers can generate the code
+ * r1 = r2
+ * r1 += 0x1
+ * if r2 < 1000 goto ...
+ * use r1 in memory access
+ * So for 64-bit alu remember constant delta between r2 and r1 and
+ * update r1 after 'if' condition.
+ */
+ if (env->bpf_capable &&
+ BPF_OP(insn->code) == BPF_ADD && !alu32 &&
+ dst_reg->id && is_reg_const(src_reg, false)) {
+ u64 val = reg_const_value(src_reg, false);
+
+ if ((dst_reg->id & BPF_ADD_CONST) ||
+ /* prevent overflow in sync_linked_regs() later */
+ val > (u32)S32_MAX) {
+ /*
+ * If the register already went through rX += val
+ * we cannot accumulate another val into rx->off.
+ */
+ dst_reg->off = 0;
+ dst_reg->id = 0;
+ } else {
+ dst_reg->id |= BPF_ADD_CONST;
+ dst_reg->off = val;
+ }
+ } else {
+ /*
+ * Make sure ID is cleared otherwise dst_reg min/max could be
+ * incorrectly propagated into other registers by sync_linked_regs()
+ */
+ dst_reg->id = 0;
}
return 0;
}
/* check validity of 32-bit and 64-bit arithmetic operations */
-static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn)
+static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
+ struct bpf_reg_state *regs = cur_regs(env);
u8 opcode = BPF_OP(insn->code);
int err;
if (opcode == BPF_END || opcode == BPF_NEG) {
if (opcode == BPF_NEG) {
- if (BPF_SRC(insn->code) != 0 ||
+ if (BPF_SRC(insn->code) != BPF_K ||
insn->src_reg != BPF_REG_0 ||
insn->off != 0 || insn->imm != 0) {
- verbose("BPF_NEG uses reserved fields\n");
+ verbose(env, "BPF_NEG uses reserved fields\n");
return -EINVAL;
}
} else {
if (insn->src_reg != BPF_REG_0 || insn->off != 0 ||
- (insn->imm != 16 && insn->imm != 32 && insn->imm != 64)) {
- verbose("BPF_END uses reserved fields\n");
+ (insn->imm != 16 && insn->imm != 32 && insn->imm != 64) ||
+ (BPF_CLASS(insn->code) == BPF_ALU64 &&
+ BPF_SRC(insn->code) != BPF_TO_LE)) {
+ verbose(env, "BPF_END uses reserved fields\n");
return -EINVAL;
}
}
/* check src operand */
- err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ err = check_reg_arg(env, insn->dst_reg, SRC_OP);
if (err)
return err;
+ if (is_pointer_value(env, insn->dst_reg)) {
+ verbose(env, "R%d pointer arithmetic prohibited\n",
+ insn->dst_reg);
+ return -EACCES;
+ }
+
/* check dest operand */
- err = check_reg_arg(regs, insn->dst_reg, DST_OP);
+ if (opcode == BPF_NEG &&
+ regs[insn->dst_reg].type == SCALAR_VALUE) {
+ err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK);
+ err = err ?: adjust_scalar_min_max_vals(env, insn,
+ &regs[insn->dst_reg],
+ regs[insn->dst_reg]);
+ } else {
+ err = check_reg_arg(env, insn->dst_reg, DST_OP);
+ }
if (err)
return err;
} else if (opcode == BPF_MOV) {
if (BPF_SRC(insn->code) == BPF_X) {
- if (insn->imm != 0 || insn->off != 0) {
- verbose("BPF_MOV uses reserved fields\n");
- return -EINVAL;
+ if (BPF_CLASS(insn->code) == BPF_ALU) {
+ if ((insn->off != 0 && insn->off != 8 && insn->off != 16) ||
+ insn->imm) {
+ verbose(env, "BPF_MOV uses reserved fields\n");
+ return -EINVAL;
+ }
+ } else if (insn->off == BPF_ADDR_SPACE_CAST) {
+ if (insn->imm != 1 && insn->imm != 1u << 16) {
+ verbose(env, "addr_space_cast insn can only convert between address space 1 and 0\n");
+ return -EINVAL;
+ }
+ if (!env->prog->aux->arena) {
+ verbose(env, "addr_space_cast insn can only be used in a program that has an associated arena\n");
+ return -EINVAL;
+ }
+ } else {
+ if ((insn->off != 0 && insn->off != 8 && insn->off != 16 &&
+ insn->off != 32) || insn->imm) {
+ verbose(env, "BPF_MOV uses reserved fields\n");
+ return -EINVAL;
+ }
}
/* check src operand */
- err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ err = check_reg_arg(env, insn->src_reg, SRC_OP);
if (err)
return err;
} else {
if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
- verbose("BPF_MOV uses reserved fields\n");
+ verbose(env, "BPF_MOV uses reserved fields\n");
return -EINVAL;
}
}
- /* check dest operand */
- err = check_reg_arg(regs, insn->dst_reg, DST_OP);
+ /* check dest operand, mark as required later */
+ err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK);
if (err)
return err;
if (BPF_SRC(insn->code) == BPF_X) {
+ struct bpf_reg_state *src_reg = regs + insn->src_reg;
+ struct bpf_reg_state *dst_reg = regs + insn->dst_reg;
+
if (BPF_CLASS(insn->code) == BPF_ALU64) {
- /* case: R1 = R2
- * copy register state to dest reg
- */
- regs[insn->dst_reg] = regs[insn->src_reg];
+ if (insn->imm) {
+ /* off == BPF_ADDR_SPACE_CAST */
+ mark_reg_unknown(env, regs, insn->dst_reg);
+ if (insn->imm == 1) { /* cast from as(1) to as(0) */
+ dst_reg->type = PTR_TO_ARENA;
+ /* PTR_TO_ARENA is 32-bit */
+ dst_reg->subreg_def = env->insn_idx + 1;
+ }
+ } else if (insn->off == 0) {
+ /* case: R1 = R2
+ * copy register state to dest reg
+ */
+ assign_scalar_id_before_mov(env, src_reg);
+ copy_register_state(dst_reg, src_reg);
+ dst_reg->subreg_def = DEF_NOT_SUBREG;
+ } else {
+ /* case: R1 = (s8, s16 s32)R2 */
+ if (is_pointer_value(env, insn->src_reg)) {
+ verbose(env,
+ "R%d sign-extension part of pointer\n",
+ insn->src_reg);
+ return -EACCES;
+ } else if (src_reg->type == SCALAR_VALUE) {
+ bool no_sext;
+
+ no_sext = src_reg->umax_value < (1ULL << (insn->off - 1));
+ if (no_sext)
+ assign_scalar_id_before_mov(env, src_reg);
+ copy_register_state(dst_reg, src_reg);
+ if (!no_sext)
+ dst_reg->id = 0;
+ coerce_reg_to_size_sx(dst_reg, insn->off >> 3);
+ dst_reg->subreg_def = DEF_NOT_SUBREG;
+ } else {
+ mark_reg_unknown(env, regs, insn->dst_reg);
+ }
+ }
} else {
- regs[insn->dst_reg].type = UNKNOWN_VALUE;
- regs[insn->dst_reg].map_ptr = NULL;
+ /* R1 = (u32) R2 */
+ if (is_pointer_value(env, insn->src_reg)) {
+ verbose(env,
+ "R%d partial copy of pointer\n",
+ insn->src_reg);
+ return -EACCES;
+ } else if (src_reg->type == SCALAR_VALUE) {
+ if (insn->off == 0) {
+ bool is_src_reg_u32 = get_reg_width(src_reg) <= 32;
+
+ if (is_src_reg_u32)
+ assign_scalar_id_before_mov(env, src_reg);
+ copy_register_state(dst_reg, src_reg);
+ /* Make sure ID is cleared if src_reg is not in u32
+ * range otherwise dst_reg min/max could be incorrectly
+ * propagated into src_reg by sync_linked_regs()
+ */
+ if (!is_src_reg_u32)
+ dst_reg->id = 0;
+ dst_reg->subreg_def = env->insn_idx + 1;
+ } else {
+ /* case: W1 = (s8, s16)W2 */
+ bool no_sext = src_reg->umax_value < (1ULL << (insn->off - 1));
+
+ if (no_sext)
+ assign_scalar_id_before_mov(env, src_reg);
+ copy_register_state(dst_reg, src_reg);
+ if (!no_sext)
+ dst_reg->id = 0;
+ dst_reg->subreg_def = env->insn_idx + 1;
+ coerce_subreg_to_size_sx(dst_reg, insn->off >> 3);
+ }
+ } else {
+ mark_reg_unknown(env, regs,
+ insn->dst_reg);
+ }
+ zext_32_to_64(dst_reg);
+ reg_bounds_sync(dst_reg);
}
} else {
/* case: R = imm
* remember the value we stored into this reg
*/
- regs[insn->dst_reg].type = CONST_IMM;
- regs[insn->dst_reg].imm = insn->imm;
+ /* clear any state __mark_reg_known doesn't set */
+ mark_reg_unknown(env, regs, insn->dst_reg);
+ regs[insn->dst_reg].type = SCALAR_VALUE;
+ if (BPF_CLASS(insn->code) == BPF_ALU64) {
+ __mark_reg_known(regs + insn->dst_reg,
+ insn->imm);
+ } else {
+ __mark_reg_known(regs + insn->dst_reg,
+ (u32)insn->imm);
+ }
}
} else if (opcode > BPF_END) {
- verbose("invalid BPF_ALU opcode %x\n", opcode);
+ verbose(env, "invalid BPF_ALU opcode %x\n", opcode);
return -EINVAL;
} else { /* all other ALU ops: and, sub, xor, add, ... */
- bool stack_relative = false;
-
if (BPF_SRC(insn->code) == BPF_X) {
- if (insn->imm != 0 || insn->off != 0) {
- verbose("BPF_ALU uses reserved fields\n");
+ if (insn->imm != 0 || (insn->off != 0 && insn->off != 1) ||
+ (insn->off == 1 && opcode != BPF_MOD && opcode != BPF_DIV)) {
+ verbose(env, "BPF_ALU uses reserved fields\n");
return -EINVAL;
}
/* check src1 operand */
- err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ err = check_reg_arg(env, insn->src_reg, SRC_OP);
if (err)
return err;
} else {
- if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
- verbose("BPF_ALU uses reserved fields\n");
+ if (insn->src_reg != BPF_REG_0 || (insn->off != 0 && insn->off != 1) ||
+ (insn->off == 1 && opcode != BPF_MOD && opcode != BPF_DIV)) {
+ verbose(env, "BPF_ALU uses reserved fields\n");
return -EINVAL;
}
}
/* check src2 operand */
- err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ err = check_reg_arg(env, insn->dst_reg, SRC_OP);
if (err)
return err;
if ((opcode == BPF_MOD || opcode == BPF_DIV) &&
BPF_SRC(insn->code) == BPF_K && insn->imm == 0) {
- verbose("div by zero\n");
+ verbose(env, "div by zero\n");
return -EINVAL;
}
- /* pattern match 'bpf_add Rx, imm' instruction */
- if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 &&
- regs[insn->dst_reg].type == FRAME_PTR &&
- BPF_SRC(insn->code) == BPF_K)
- stack_relative = true;
+ if ((opcode == BPF_LSH || opcode == BPF_RSH ||
+ opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) {
+ int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32;
+
+ if (insn->imm < 0 || insn->imm >= size) {
+ verbose(env, "invalid shift %d\n", insn->imm);
+ return -EINVAL;
+ }
+ }
/* check dest operand */
- err = check_reg_arg(regs, insn->dst_reg, DST_OP);
+ err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK);
+ err = err ?: adjust_reg_min_max_vals(env, insn);
if (err)
return err;
+ }
+
+ return reg_bounds_sanity_check(env, &regs[insn->dst_reg], "alu");
+}
+
+static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
+ struct bpf_reg_state *dst_reg,
+ enum bpf_reg_type type,
+ bool range_right_open)
+{
+ struct bpf_func_state *state;
+ struct bpf_reg_state *reg;
+ int new_range;
+
+ if (dst_reg->off < 0 ||
+ (dst_reg->off == 0 && range_right_open))
+ /* This doesn't give us any range */
+ return;
+
+ if (dst_reg->umax_value > MAX_PACKET_OFF ||
+ dst_reg->umax_value + dst_reg->off > MAX_PACKET_OFF)
+ /* Risk of overflow. For instance, ptr + (1<<63) may be less
+ * than pkt_end, but that's because it's also less than pkt.
+ */
+ return;
+
+ new_range = dst_reg->off;
+ if (range_right_open)
+ new_range++;
+
+ /* Examples for register markings:
+ *
+ * pkt_data in dst register:
+ *
+ * r2 = r3;
+ * r2 += 8;
+ * if (r2 > pkt_end) goto <handle exception>
+ * <access okay>
+ *
+ * r2 = r3;
+ * r2 += 8;
+ * if (r2 < pkt_end) goto <access okay>
+ * <handle exception>
+ *
+ * Where:
+ * r2 == dst_reg, pkt_end == src_reg
+ * r2=pkt(id=n,off=8,r=0)
+ * r3=pkt(id=n,off=0,r=0)
+ *
+ * pkt_data in src register:
+ *
+ * r2 = r3;
+ * r2 += 8;
+ * if (pkt_end >= r2) goto <access okay>
+ * <handle exception>
+ *
+ * r2 = r3;
+ * r2 += 8;
+ * if (pkt_end <= r2) goto <handle exception>
+ * <access okay>
+ *
+ * Where:
+ * pkt_end == dst_reg, r2 == src_reg
+ * r2=pkt(id=n,off=8,r=0)
+ * r3=pkt(id=n,off=0,r=0)
+ *
+ * Find register r3 and mark its range as r3=pkt(id=n,off=0,r=8)
+ * or r3=pkt(id=n,off=0,r=8-1), so that range of bytes [r3, r3 + 8)
+ * and [r3, r3 + 8-1) respectively is safe to access depending on
+ * the check.
+ */
+
+ /* If our ids match, then we must have the same max_value. And we
+ * don't care about the other reg's fixed offset, since if it's too big
+ * the range won't allow anything.
+ * dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16.
+ */
+ bpf_for_each_reg_in_vstate(vstate, state, reg, ({
+ if (reg->type == type && reg->id == dst_reg->id)
+ /* keep the maximum range already checked */
+ reg->range = max(reg->range, new_range);
+ }));
+}
- if (stack_relative) {
- regs[insn->dst_reg].type = PTR_TO_STACK;
- regs[insn->dst_reg].imm = insn->imm;
+/*
+ * <reg1> <op> <reg2>, currently assuming reg2 is a constant
+ */
+static int is_scalar_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2,
+ u8 opcode, bool is_jmp32)
+{
+ struct tnum t1 = is_jmp32 ? tnum_subreg(reg1->var_off) : reg1->var_off;
+ struct tnum t2 = is_jmp32 ? tnum_subreg(reg2->var_off) : reg2->var_off;
+ u64 umin1 = is_jmp32 ? (u64)reg1->u32_min_value : reg1->umin_value;
+ u64 umax1 = is_jmp32 ? (u64)reg1->u32_max_value : reg1->umax_value;
+ s64 smin1 = is_jmp32 ? (s64)reg1->s32_min_value : reg1->smin_value;
+ s64 smax1 = is_jmp32 ? (s64)reg1->s32_max_value : reg1->smax_value;
+ u64 umin2 = is_jmp32 ? (u64)reg2->u32_min_value : reg2->umin_value;
+ u64 umax2 = is_jmp32 ? (u64)reg2->u32_max_value : reg2->umax_value;
+ s64 smin2 = is_jmp32 ? (s64)reg2->s32_min_value : reg2->smin_value;
+ s64 smax2 = is_jmp32 ? (s64)reg2->s32_max_value : reg2->smax_value;
+
+ if (reg1 == reg2) {
+ switch (opcode) {
+ case BPF_JGE:
+ case BPF_JLE:
+ case BPF_JSGE:
+ case BPF_JSLE:
+ case BPF_JEQ:
+ return 1;
+ case BPF_JGT:
+ case BPF_JLT:
+ case BPF_JSGT:
+ case BPF_JSLT:
+ case BPF_JNE:
+ return 0;
+ case BPF_JSET:
+ if (tnum_is_const(t1))
+ return t1.value != 0;
+ else
+ return (smin1 <= 0 && smax1 >= 0) ? -1 : 1;
+ default:
+ return -1;
}
}
- return 0;
+ switch (opcode) {
+ case BPF_JEQ:
+ /* constants, umin/umax and smin/smax checks would be
+ * redundant in this case because they all should match
+ */
+ if (tnum_is_const(t1) && tnum_is_const(t2))
+ return t1.value == t2.value;
+ if (!tnum_overlap(t1, t2))
+ return 0;
+ /* non-overlapping ranges */
+ if (umin1 > umax2 || umax1 < umin2)
+ return 0;
+ if (smin1 > smax2 || smax1 < smin2)
+ return 0;
+ if (!is_jmp32) {
+ /* if 64-bit ranges are inconclusive, see if we can
+ * utilize 32-bit subrange knowledge to eliminate
+ * branches that can't be taken a priori
+ */
+ if (reg1->u32_min_value > reg2->u32_max_value ||
+ reg1->u32_max_value < reg2->u32_min_value)
+ return 0;
+ if (reg1->s32_min_value > reg2->s32_max_value ||
+ reg1->s32_max_value < reg2->s32_min_value)
+ return 0;
+ }
+ break;
+ case BPF_JNE:
+ /* constants, umin/umax and smin/smax checks would be
+ * redundant in this case because they all should match
+ */
+ if (tnum_is_const(t1) && tnum_is_const(t2))
+ return t1.value != t2.value;
+ if (!tnum_overlap(t1, t2))
+ return 1;
+ /* non-overlapping ranges */
+ if (umin1 > umax2 || umax1 < umin2)
+ return 1;
+ if (smin1 > smax2 || smax1 < smin2)
+ return 1;
+ if (!is_jmp32) {
+ /* if 64-bit ranges are inconclusive, see if we can
+ * utilize 32-bit subrange knowledge to eliminate
+ * branches that can't be taken a priori
+ */
+ if (reg1->u32_min_value > reg2->u32_max_value ||
+ reg1->u32_max_value < reg2->u32_min_value)
+ return 1;
+ if (reg1->s32_min_value > reg2->s32_max_value ||
+ reg1->s32_max_value < reg2->s32_min_value)
+ return 1;
+ }
+ break;
+ case BPF_JSET:
+ if (!is_reg_const(reg2, is_jmp32)) {
+ swap(reg1, reg2);
+ swap(t1, t2);
+ }
+ if (!is_reg_const(reg2, is_jmp32))
+ return -1;
+ if ((~t1.mask & t1.value) & t2.value)
+ return 1;
+ if (!((t1.mask | t1.value) & t2.value))
+ return 0;
+ break;
+ case BPF_JGT:
+ if (umin1 > umax2)
+ return 1;
+ else if (umax1 <= umin2)
+ return 0;
+ break;
+ case BPF_JSGT:
+ if (smin1 > smax2)
+ return 1;
+ else if (smax1 <= smin2)
+ return 0;
+ break;
+ case BPF_JLT:
+ if (umax1 < umin2)
+ return 1;
+ else if (umin1 >= umax2)
+ return 0;
+ break;
+ case BPF_JSLT:
+ if (smax1 < smin2)
+ return 1;
+ else if (smin1 >= smax2)
+ return 0;
+ break;
+ case BPF_JGE:
+ if (umin1 >= umax2)
+ return 1;
+ else if (umax1 < umin2)
+ return 0;
+ break;
+ case BPF_JSGE:
+ if (smin1 >= smax2)
+ return 1;
+ else if (smax1 < smin2)
+ return 0;
+ break;
+ case BPF_JLE:
+ if (umax1 <= umin2)
+ return 1;
+ else if (umin1 > umax2)
+ return 0;
+ break;
+ case BPF_JSLE:
+ if (smax1 <= smin2)
+ return 1;
+ else if (smin1 > smax2)
+ return 0;
+ break;
+ }
+
+ return -1;
+}
+
+static int flip_opcode(u32 opcode)
+{
+ /* How can we transform "a <op> b" into "b <op> a"? */
+ static const u8 opcode_flip[16] = {
+ /* these stay the same */
+ [BPF_JEQ >> 4] = BPF_JEQ,
+ [BPF_JNE >> 4] = BPF_JNE,
+ [BPF_JSET >> 4] = BPF_JSET,
+ /* these swap "lesser" and "greater" (L and G in the opcodes) */
+ [BPF_JGE >> 4] = BPF_JLE,
+ [BPF_JGT >> 4] = BPF_JLT,
+ [BPF_JLE >> 4] = BPF_JGE,
+ [BPF_JLT >> 4] = BPF_JGT,
+ [BPF_JSGE >> 4] = BPF_JSLE,
+ [BPF_JSGT >> 4] = BPF_JSLT,
+ [BPF_JSLE >> 4] = BPF_JSGE,
+ [BPF_JSLT >> 4] = BPF_JSGT
+ };
+ return opcode_flip[opcode >> 4];
+}
+
+static int is_pkt_ptr_branch_taken(struct bpf_reg_state *dst_reg,
+ struct bpf_reg_state *src_reg,
+ u8 opcode)
+{
+ struct bpf_reg_state *pkt;
+
+ if (src_reg->type == PTR_TO_PACKET_END) {
+ pkt = dst_reg;
+ } else if (dst_reg->type == PTR_TO_PACKET_END) {
+ pkt = src_reg;
+ opcode = flip_opcode(opcode);
+ } else {
+ return -1;
+ }
+
+ if (pkt->range >= 0)
+ return -1;
+
+ switch (opcode) {
+ case BPF_JLE:
+ /* pkt <= pkt_end */
+ fallthrough;
+ case BPF_JGT:
+ /* pkt > pkt_end */
+ if (pkt->range == BEYOND_PKT_END)
+ /* pkt has at last one extra byte beyond pkt_end */
+ return opcode == BPF_JGT;
+ break;
+ case BPF_JLT:
+ /* pkt < pkt_end */
+ fallthrough;
+ case BPF_JGE:
+ /* pkt >= pkt_end */
+ if (pkt->range == BEYOND_PKT_END || pkt->range == AT_PKT_END)
+ return opcode == BPF_JGE;
+ break;
+ }
+ return -1;
+}
+
+/* compute branch direction of the expression "if (<reg1> opcode <reg2>) goto target;"
+ * and return:
+ * 1 - branch will be taken and "goto target" will be executed
+ * 0 - branch will not be taken and fall-through to next insn
+ * -1 - unknown. Example: "if (reg1 < 5)" is unknown when register value
+ * range [0,10]
+ */
+static int is_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2,
+ u8 opcode, bool is_jmp32)
+{
+ if (reg_is_pkt_pointer_any(reg1) && reg_is_pkt_pointer_any(reg2) && !is_jmp32)
+ return is_pkt_ptr_branch_taken(reg1, reg2, opcode);
+
+ if (__is_pointer_value(false, reg1) || __is_pointer_value(false, reg2)) {
+ u64 val;
+
+ /* arrange that reg2 is a scalar, and reg1 is a pointer */
+ if (!is_reg_const(reg2, is_jmp32)) {
+ opcode = flip_opcode(opcode);
+ swap(reg1, reg2);
+ }
+ /* and ensure that reg2 is a constant */
+ if (!is_reg_const(reg2, is_jmp32))
+ return -1;
+
+ if (!reg_not_null(reg1))
+ return -1;
+
+ /* If pointer is valid tests against zero will fail so we can
+ * use this to direct branch taken.
+ */
+ val = reg_const_value(reg2, is_jmp32);
+ if (val != 0)
+ return -1;
+
+ switch (opcode) {
+ case BPF_JEQ:
+ return 0;
+ case BPF_JNE:
+ return 1;
+ default:
+ return -1;
+ }
+ }
+
+ /* now deal with two scalars, but not necessarily constants */
+ return is_scalar_branch_taken(reg1, reg2, opcode, is_jmp32);
+}
+
+/* Opcode that corresponds to a *false* branch condition.
+ * E.g., if r1 < r2, then reverse (false) condition is r1 >= r2
+ */
+static u8 rev_opcode(u8 opcode)
+{
+ switch (opcode) {
+ case BPF_JEQ: return BPF_JNE;
+ case BPF_JNE: return BPF_JEQ;
+ /* JSET doesn't have it's reverse opcode in BPF, so add
+ * BPF_X flag to denote the reverse of that operation
+ */
+ case BPF_JSET: return BPF_JSET | BPF_X;
+ case BPF_JSET | BPF_X: return BPF_JSET;
+ case BPF_JGE: return BPF_JLT;
+ case BPF_JGT: return BPF_JLE;
+ case BPF_JLE: return BPF_JGT;
+ case BPF_JLT: return BPF_JGE;
+ case BPF_JSGE: return BPF_JSLT;
+ case BPF_JSGT: return BPF_JSLE;
+ case BPF_JSLE: return BPF_JSGT;
+ case BPF_JSLT: return BPF_JSGE;
+ default: return 0;
+ }
+}
+
+/* Refine range knowledge for <reg1> <op> <reg>2 conditional operation. */
+static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2,
+ u8 opcode, bool is_jmp32)
+{
+ struct tnum t;
+ u64 val;
+
+ /* In case of GE/GT/SGE/JST, reuse LE/LT/SLE/SLT logic from below */
+ switch (opcode) {
+ case BPF_JGE:
+ case BPF_JGT:
+ case BPF_JSGE:
+ case BPF_JSGT:
+ opcode = flip_opcode(opcode);
+ swap(reg1, reg2);
+ break;
+ default:
+ break;
+ }
+
+ switch (opcode) {
+ case BPF_JEQ:
+ if (is_jmp32) {
+ reg1->u32_min_value = max(reg1->u32_min_value, reg2->u32_min_value);
+ reg1->u32_max_value = min(reg1->u32_max_value, reg2->u32_max_value);
+ reg1->s32_min_value = max(reg1->s32_min_value, reg2->s32_min_value);
+ reg1->s32_max_value = min(reg1->s32_max_value, reg2->s32_max_value);
+ reg2->u32_min_value = reg1->u32_min_value;
+ reg2->u32_max_value = reg1->u32_max_value;
+ reg2->s32_min_value = reg1->s32_min_value;
+ reg2->s32_max_value = reg1->s32_max_value;
+
+ t = tnum_intersect(tnum_subreg(reg1->var_off), tnum_subreg(reg2->var_off));
+ reg1->var_off = tnum_with_subreg(reg1->var_off, t);
+ reg2->var_off = tnum_with_subreg(reg2->var_off, t);
+ } else {
+ reg1->umin_value = max(reg1->umin_value, reg2->umin_value);
+ reg1->umax_value = min(reg1->umax_value, reg2->umax_value);
+ reg1->smin_value = max(reg1->smin_value, reg2->smin_value);
+ reg1->smax_value = min(reg1->smax_value, reg2->smax_value);
+ reg2->umin_value = reg1->umin_value;
+ reg2->umax_value = reg1->umax_value;
+ reg2->smin_value = reg1->smin_value;
+ reg2->smax_value = reg1->smax_value;
+
+ reg1->var_off = tnum_intersect(reg1->var_off, reg2->var_off);
+ reg2->var_off = reg1->var_off;
+ }
+ break;
+ case BPF_JNE:
+ if (!is_reg_const(reg2, is_jmp32))
+ swap(reg1, reg2);
+ if (!is_reg_const(reg2, is_jmp32))
+ break;
+
+ /* try to recompute the bound of reg1 if reg2 is a const and
+ * is exactly the edge of reg1.
+ */
+ val = reg_const_value(reg2, is_jmp32);
+ if (is_jmp32) {
+ /* u32_min_value is not equal to 0xffffffff at this point,
+ * because otherwise u32_max_value is 0xffffffff as well,
+ * in such a case both reg1 and reg2 would be constants,
+ * jump would be predicted and reg_set_min_max() won't
+ * be called.
+ *
+ * Same reasoning works for all {u,s}{min,max}{32,64} cases
+ * below.
+ */
+ if (reg1->u32_min_value == (u32)val)
+ reg1->u32_min_value++;
+ if (reg1->u32_max_value == (u32)val)
+ reg1->u32_max_value--;
+ if (reg1->s32_min_value == (s32)val)
+ reg1->s32_min_value++;
+ if (reg1->s32_max_value == (s32)val)
+ reg1->s32_max_value--;
+ } else {
+ if (reg1->umin_value == (u64)val)
+ reg1->umin_value++;
+ if (reg1->umax_value == (u64)val)
+ reg1->umax_value--;
+ if (reg1->smin_value == (s64)val)
+ reg1->smin_value++;
+ if (reg1->smax_value == (s64)val)
+ reg1->smax_value--;
+ }
+ break;
+ case BPF_JSET:
+ if (!is_reg_const(reg2, is_jmp32))
+ swap(reg1, reg2);
+ if (!is_reg_const(reg2, is_jmp32))
+ break;
+ val = reg_const_value(reg2, is_jmp32);
+ /* BPF_JSET (i.e., TRUE branch, *not* BPF_JSET | BPF_X)
+ * requires single bit to learn something useful. E.g., if we
+ * know that `r1 & 0x3` is true, then which bits (0, 1, or both)
+ * are actually set? We can learn something definite only if
+ * it's a single-bit value to begin with.
+ *
+ * BPF_JSET | BPF_X (i.e., negation of BPF_JSET) doesn't have
+ * this restriction. I.e., !(r1 & 0x3) means neither bit 0 nor
+ * bit 1 is set, which we can readily use in adjustments.
+ */
+ if (!is_power_of_2(val))
+ break;
+ if (is_jmp32) {
+ t = tnum_or(tnum_subreg(reg1->var_off), tnum_const(val));
+ reg1->var_off = tnum_with_subreg(reg1->var_off, t);
+ } else {
+ reg1->var_off = tnum_or(reg1->var_off, tnum_const(val));
+ }
+ break;
+ case BPF_JSET | BPF_X: /* reverse of BPF_JSET, see rev_opcode() */
+ if (!is_reg_const(reg2, is_jmp32))
+ swap(reg1, reg2);
+ if (!is_reg_const(reg2, is_jmp32))
+ break;
+ val = reg_const_value(reg2, is_jmp32);
+ /* Forget the ranges before narrowing tnums, to avoid invariant
+ * violations if we're on a dead branch.
+ */
+ __mark_reg_unbounded(reg1);
+ if (is_jmp32) {
+ t = tnum_and(tnum_subreg(reg1->var_off), tnum_const(~val));
+ reg1->var_off = tnum_with_subreg(reg1->var_off, t);
+ } else {
+ reg1->var_off = tnum_and(reg1->var_off, tnum_const(~val));
+ }
+ break;
+ case BPF_JLE:
+ if (is_jmp32) {
+ reg1->u32_max_value = min(reg1->u32_max_value, reg2->u32_max_value);
+ reg2->u32_min_value = max(reg1->u32_min_value, reg2->u32_min_value);
+ } else {
+ reg1->umax_value = min(reg1->umax_value, reg2->umax_value);
+ reg2->umin_value = max(reg1->umin_value, reg2->umin_value);
+ }
+ break;
+ case BPF_JLT:
+ if (is_jmp32) {
+ reg1->u32_max_value = min(reg1->u32_max_value, reg2->u32_max_value - 1);
+ reg2->u32_min_value = max(reg1->u32_min_value + 1, reg2->u32_min_value);
+ } else {
+ reg1->umax_value = min(reg1->umax_value, reg2->umax_value - 1);
+ reg2->umin_value = max(reg1->umin_value + 1, reg2->umin_value);
+ }
+ break;
+ case BPF_JSLE:
+ if (is_jmp32) {
+ reg1->s32_max_value = min(reg1->s32_max_value, reg2->s32_max_value);
+ reg2->s32_min_value = max(reg1->s32_min_value, reg2->s32_min_value);
+ } else {
+ reg1->smax_value = min(reg1->smax_value, reg2->smax_value);
+ reg2->smin_value = max(reg1->smin_value, reg2->smin_value);
+ }
+ break;
+ case BPF_JSLT:
+ if (is_jmp32) {
+ reg1->s32_max_value = min(reg1->s32_max_value, reg2->s32_max_value - 1);
+ reg2->s32_min_value = max(reg1->s32_min_value + 1, reg2->s32_min_value);
+ } else {
+ reg1->smax_value = min(reg1->smax_value, reg2->smax_value - 1);
+ reg2->smin_value = max(reg1->smin_value + 1, reg2->smin_value);
+ }
+ break;
+ default:
+ return;
+ }
+}
+
+/* Adjusts the register min/max values in the case that the dst_reg and
+ * src_reg are both SCALAR_VALUE registers (or we are simply doing a BPF_K
+ * check, in which case we have a fake SCALAR_VALUE representing insn->imm).
+ * Technically we can do similar adjustments for pointers to the same object,
+ * but we don't support that right now.
+ */
+static int reg_set_min_max(struct bpf_verifier_env *env,
+ struct bpf_reg_state *true_reg1,
+ struct bpf_reg_state *true_reg2,
+ struct bpf_reg_state *false_reg1,
+ struct bpf_reg_state *false_reg2,
+ u8 opcode, bool is_jmp32)
+{
+ int err;
+
+ /* If either register is a pointer, we can't learn anything about its
+ * variable offset from the compare (unless they were a pointer into
+ * the same object, but we don't bother with that).
+ */
+ if (false_reg1->type != SCALAR_VALUE || false_reg2->type != SCALAR_VALUE)
+ return 0;
+
+ /* We compute branch direction for same SCALAR_VALUE registers in
+ * is_scalar_branch_taken(). For unknown branch directions (e.g., BPF_JSET)
+ * on the same registers, we don't need to adjust the min/max values.
+ */
+ if (false_reg1 == false_reg2)
+ return 0;
+
+ /* fallthrough (FALSE) branch */
+ regs_refine_cond_op(false_reg1, false_reg2, rev_opcode(opcode), is_jmp32);
+ reg_bounds_sync(false_reg1);
+ reg_bounds_sync(false_reg2);
+
+ /* jump (TRUE) branch */
+ regs_refine_cond_op(true_reg1, true_reg2, opcode, is_jmp32);
+ reg_bounds_sync(true_reg1);
+ reg_bounds_sync(true_reg2);
+
+ err = reg_bounds_sanity_check(env, true_reg1, "true_reg1");
+ err = err ?: reg_bounds_sanity_check(env, true_reg2, "true_reg2");
+ err = err ?: reg_bounds_sanity_check(env, false_reg1, "false_reg1");
+ err = err ?: reg_bounds_sanity_check(env, false_reg2, "false_reg2");
+ return err;
}
-static int check_cond_jmp_op(struct verifier_env *env,
+static void mark_ptr_or_null_reg(struct bpf_func_state *state,
+ struct bpf_reg_state *reg, u32 id,
+ bool is_null)
+{
+ if (type_may_be_null(reg->type) && reg->id == id &&
+ (is_rcu_reg(reg) || !WARN_ON_ONCE(!reg->id))) {
+ /* Old offset (both fixed and variable parts) should have been
+ * known-zero, because we don't allow pointer arithmetic on
+ * pointers that might be NULL. If we see this happening, don't
+ * convert the register.
+ *
+ * But in some cases, some helpers that return local kptrs
+ * advance offset for the returned pointer. In those cases, it
+ * is fine to expect to see reg->off.
+ */
+ if (WARN_ON_ONCE(reg->smin_value || reg->smax_value || !tnum_equals_const(reg->var_off, 0)))
+ return;
+ if (!(type_is_ptr_alloc_obj(reg->type) || type_is_non_owning_ref(reg->type)) &&
+ WARN_ON_ONCE(reg->off))
+ return;
+
+ if (is_null) {
+ reg->type = SCALAR_VALUE;
+ /* We don't need id and ref_obj_id from this point
+ * onwards anymore, thus we should better reset it,
+ * so that state pruning has chances to take effect.
+ */
+ reg->id = 0;
+ reg->ref_obj_id = 0;
+
+ return;
+ }
+
+ mark_ptr_not_null_reg(reg);
+
+ if (!reg_may_point_to_spin_lock(reg)) {
+ /* For not-NULL ptr, reg->ref_obj_id will be reset
+ * in release_reference().
+ *
+ * reg->id is still used by spin_lock ptr. Other
+ * than spin_lock ptr type, reg->id can be reset.
+ */
+ reg->id = 0;
+ }
+ }
+}
+
+/* The logic is similar to find_good_pkt_pointers(), both could eventually
+ * be folded together at some point.
+ */
+static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno,
+ bool is_null)
+{
+ struct bpf_func_state *state = vstate->frame[vstate->curframe];
+ struct bpf_reg_state *regs = state->regs, *reg;
+ u32 ref_obj_id = regs[regno].ref_obj_id;
+ u32 id = regs[regno].id;
+
+ if (ref_obj_id && ref_obj_id == id && is_null)
+ /* regs[regno] is in the " == NULL" branch.
+ * No one could have freed the reference state before
+ * doing the NULL check.
+ */
+ WARN_ON_ONCE(release_reference_nomark(vstate, id));
+
+ bpf_for_each_reg_in_vstate(vstate, state, reg, ({
+ mark_ptr_or_null_reg(state, reg, id, is_null);
+ }));
+}
+
+static bool try_match_pkt_pointers(const struct bpf_insn *insn,
+ struct bpf_reg_state *dst_reg,
+ struct bpf_reg_state *src_reg,
+ struct bpf_verifier_state *this_branch,
+ struct bpf_verifier_state *other_branch)
+{
+ if (BPF_SRC(insn->code) != BPF_X)
+ return false;
+
+ /* Pointers are always 64-bit. */
+ if (BPF_CLASS(insn->code) == BPF_JMP32)
+ return false;
+
+ switch (BPF_OP(insn->code)) {
+ case BPF_JGT:
+ if ((dst_reg->type == PTR_TO_PACKET &&
+ src_reg->type == PTR_TO_PACKET_END) ||
+ (dst_reg->type == PTR_TO_PACKET_META &&
+ reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) {
+ /* pkt_data' > pkt_end, pkt_meta' > pkt_data */
+ find_good_pkt_pointers(this_branch, dst_reg,
+ dst_reg->type, false);
+ mark_pkt_end(other_branch, insn->dst_reg, true);
+ } else if ((dst_reg->type == PTR_TO_PACKET_END &&
+ src_reg->type == PTR_TO_PACKET) ||
+ (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
+ src_reg->type == PTR_TO_PACKET_META)) {
+ /* pkt_end > pkt_data', pkt_data > pkt_meta' */
+ find_good_pkt_pointers(other_branch, src_reg,
+ src_reg->type, true);
+ mark_pkt_end(this_branch, insn->src_reg, false);
+ } else {
+ return false;
+ }
+ break;
+ case BPF_JLT:
+ if ((dst_reg->type == PTR_TO_PACKET &&
+ src_reg->type == PTR_TO_PACKET_END) ||
+ (dst_reg->type == PTR_TO_PACKET_META &&
+ reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) {
+ /* pkt_data' < pkt_end, pkt_meta' < pkt_data */
+ find_good_pkt_pointers(other_branch, dst_reg,
+ dst_reg->type, true);
+ mark_pkt_end(this_branch, insn->dst_reg, false);
+ } else if ((dst_reg->type == PTR_TO_PACKET_END &&
+ src_reg->type == PTR_TO_PACKET) ||
+ (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
+ src_reg->type == PTR_TO_PACKET_META)) {
+ /* pkt_end < pkt_data', pkt_data > pkt_meta' */
+ find_good_pkt_pointers(this_branch, src_reg,
+ src_reg->type, false);
+ mark_pkt_end(other_branch, insn->src_reg, true);
+ } else {
+ return false;
+ }
+ break;
+ case BPF_JGE:
+ if ((dst_reg->type == PTR_TO_PACKET &&
+ src_reg->type == PTR_TO_PACKET_END) ||
+ (dst_reg->type == PTR_TO_PACKET_META &&
+ reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) {
+ /* pkt_data' >= pkt_end, pkt_meta' >= pkt_data */
+ find_good_pkt_pointers(this_branch, dst_reg,
+ dst_reg->type, true);
+ mark_pkt_end(other_branch, insn->dst_reg, false);
+ } else if ((dst_reg->type == PTR_TO_PACKET_END &&
+ src_reg->type == PTR_TO_PACKET) ||
+ (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
+ src_reg->type == PTR_TO_PACKET_META)) {
+ /* pkt_end >= pkt_data', pkt_data >= pkt_meta' */
+ find_good_pkt_pointers(other_branch, src_reg,
+ src_reg->type, false);
+ mark_pkt_end(this_branch, insn->src_reg, true);
+ } else {
+ return false;
+ }
+ break;
+ case BPF_JLE:
+ if ((dst_reg->type == PTR_TO_PACKET &&
+ src_reg->type == PTR_TO_PACKET_END) ||
+ (dst_reg->type == PTR_TO_PACKET_META &&
+ reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) {
+ /* pkt_data' <= pkt_end, pkt_meta' <= pkt_data */
+ find_good_pkt_pointers(other_branch, dst_reg,
+ dst_reg->type, false);
+ mark_pkt_end(this_branch, insn->dst_reg, true);
+ } else if ((dst_reg->type == PTR_TO_PACKET_END &&
+ src_reg->type == PTR_TO_PACKET) ||
+ (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
+ src_reg->type == PTR_TO_PACKET_META)) {
+ /* pkt_end <= pkt_data', pkt_data <= pkt_meta' */
+ find_good_pkt_pointers(this_branch, src_reg,
+ src_reg->type, true);
+ mark_pkt_end(other_branch, insn->src_reg, false);
+ } else {
+ return false;
+ }
+ break;
+ default:
+ return false;
+ }
+
+ return true;
+}
+
+static void __collect_linked_regs(struct linked_regs *reg_set, struct bpf_reg_state *reg,
+ u32 id, u32 frameno, u32 spi_or_reg, bool is_reg)
+{
+ struct linked_reg *e;
+
+ if (reg->type != SCALAR_VALUE || (reg->id & ~BPF_ADD_CONST) != id)
+ return;
+
+ e = linked_regs_push(reg_set);
+ if (e) {
+ e->frameno = frameno;
+ e->is_reg = is_reg;
+ e->regno = spi_or_reg;
+ } else {
+ reg->id = 0;
+ }
+}
+
+/* For all R being scalar registers or spilled scalar registers
+ * in verifier state, save R in linked_regs if R->id == id.
+ * If there are too many Rs sharing same id, reset id for leftover Rs.
+ */
+static void collect_linked_regs(struct bpf_verifier_state *vstate, u32 id,
+ struct linked_regs *linked_regs)
+{
+ struct bpf_func_state *func;
+ struct bpf_reg_state *reg;
+ int i, j;
+
+ id = id & ~BPF_ADD_CONST;
+ for (i = vstate->curframe; i >= 0; i--) {
+ func = vstate->frame[i];
+ for (j = 0; j < BPF_REG_FP; j++) {
+ reg = &func->regs[j];
+ __collect_linked_regs(linked_regs, reg, id, i, j, true);
+ }
+ for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) {
+ if (!is_spilled_reg(&func->stack[j]))
+ continue;
+ reg = &func->stack[j].spilled_ptr;
+ __collect_linked_regs(linked_regs, reg, id, i, j, false);
+ }
+ }
+}
+
+/* For all R in linked_regs, copy known_reg range into R
+ * if R->id == known_reg->id.
+ */
+static void sync_linked_regs(struct bpf_verifier_state *vstate, struct bpf_reg_state *known_reg,
+ struct linked_regs *linked_regs)
+{
+ struct bpf_reg_state fake_reg;
+ struct bpf_reg_state *reg;
+ struct linked_reg *e;
+ int i;
+
+ for (i = 0; i < linked_regs->cnt; ++i) {
+ e = &linked_regs->entries[i];
+ reg = e->is_reg ? &vstate->frame[e->frameno]->regs[e->regno]
+ : &vstate->frame[e->frameno]->stack[e->spi].spilled_ptr;
+ if (reg->type != SCALAR_VALUE || reg == known_reg)
+ continue;
+ if ((reg->id & ~BPF_ADD_CONST) != (known_reg->id & ~BPF_ADD_CONST))
+ continue;
+ if ((!(reg->id & BPF_ADD_CONST) && !(known_reg->id & BPF_ADD_CONST)) ||
+ reg->off == known_reg->off) {
+ s32 saved_subreg_def = reg->subreg_def;
+
+ copy_register_state(reg, known_reg);
+ reg->subreg_def = saved_subreg_def;
+ } else {
+ s32 saved_subreg_def = reg->subreg_def;
+ s32 saved_off = reg->off;
+
+ fake_reg.type = SCALAR_VALUE;
+ __mark_reg_known(&fake_reg, (s32)reg->off - (s32)known_reg->off);
+
+ /* reg = known_reg; reg += delta */
+ copy_register_state(reg, known_reg);
+ /*
+ * Must preserve off, id and add_const flag,
+ * otherwise another sync_linked_regs() will be incorrect.
+ */
+ reg->off = saved_off;
+ reg->subreg_def = saved_subreg_def;
+
+ scalar32_min_max_add(reg, &fake_reg);
+ scalar_min_max_add(reg, &fake_reg);
+ reg->var_off = tnum_add(reg->var_off, fake_reg.var_off);
+ }
+ }
+}
+
+static int check_cond_jmp_op(struct bpf_verifier_env *env,
struct bpf_insn *insn, int *insn_idx)
{
- struct reg_state *regs = env->cur_state.regs;
- struct verifier_state *other_branch;
+ struct bpf_verifier_state *this_branch = env->cur_state;
+ struct bpf_verifier_state *other_branch;
+ struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs;
+ struct bpf_reg_state *dst_reg, *other_branch_regs, *src_reg = NULL;
+ struct bpf_reg_state *eq_branch_regs;
+ struct linked_regs linked_regs = {};
u8 opcode = BPF_OP(insn->code);
+ int insn_flags = 0;
+ bool is_jmp32;
+ int pred = -1;
int err;
- if (opcode > BPF_EXIT) {
- verbose("invalid BPF_JMP opcode %x\n", opcode);
+ /* Only conditional jumps are expected to reach here. */
+ if (opcode == BPF_JA || opcode > BPF_JCOND) {
+ verbose(env, "invalid BPF_JMP/JMP32 opcode %x\n", opcode);
return -EINVAL;
}
+ if (opcode == BPF_JCOND) {
+ struct bpf_verifier_state *cur_st = env->cur_state, *queued_st, *prev_st;
+ int idx = *insn_idx;
+
+ if (insn->code != (BPF_JMP | BPF_JCOND) ||
+ insn->src_reg != BPF_MAY_GOTO ||
+ insn->dst_reg || insn->imm) {
+ verbose(env, "invalid may_goto imm %d\n", insn->imm);
+ return -EINVAL;
+ }
+ prev_st = find_prev_entry(env, cur_st->parent, idx);
+
+ /* branch out 'fallthrough' insn as a new state to explore */
+ queued_st = push_stack(env, idx + 1, idx, false);
+ if (IS_ERR(queued_st))
+ return PTR_ERR(queued_st);
+
+ queued_st->may_goto_depth++;
+ if (prev_st)
+ widen_imprecise_scalars(env, prev_st, queued_st);
+ *insn_idx += insn->off;
+ return 0;
+ }
+
+ /* check src2 operand */
+ err = check_reg_arg(env, insn->dst_reg, SRC_OP);
+ if (err)
+ return err;
+
+ dst_reg = &regs[insn->dst_reg];
if (BPF_SRC(insn->code) == BPF_X) {
if (insn->imm != 0) {
- verbose("BPF_JMP uses reserved fields\n");
+ verbose(env, "BPF_JMP/JMP32 uses reserved fields\n");
return -EINVAL;
}
/* check src1 operand */
- err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ err = check_reg_arg(env, insn->src_reg, SRC_OP);
if (err)
return err;
+
+ src_reg = &regs[insn->src_reg];
+ if (!(reg_is_pkt_pointer_any(dst_reg) && reg_is_pkt_pointer_any(src_reg)) &&
+ is_pointer_value(env, insn->src_reg)) {
+ verbose(env, "R%d pointer comparison prohibited\n",
+ insn->src_reg);
+ return -EACCES;
+ }
+
+ if (src_reg->type == PTR_TO_STACK)
+ insn_flags |= INSN_F_SRC_REG_STACK;
+ if (dst_reg->type == PTR_TO_STACK)
+ insn_flags |= INSN_F_DST_REG_STACK;
} else {
if (insn->src_reg != BPF_REG_0) {
- verbose("BPF_JMP uses reserved fields\n");
+ verbose(env, "BPF_JMP/JMP32 uses reserved fields\n");
return -EINVAL;
}
+ src_reg = &env->fake_reg[0];
+ memset(src_reg, 0, sizeof(*src_reg));
+ src_reg->type = SCALAR_VALUE;
+ __mark_reg_known(src_reg, insn->imm);
+
+ if (dst_reg->type == PTR_TO_STACK)
+ insn_flags |= INSN_F_DST_REG_STACK;
}
- /* check src2 operand */
- err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
- if (err)
- return err;
+ if (insn_flags) {
+ err = push_jmp_history(env, this_branch, insn_flags, 0);
+ if (err)
+ return err;
+ }
- /* detect if R == 0 where R was initialized to zero earlier */
- if (BPF_SRC(insn->code) == BPF_K &&
- (opcode == BPF_JEQ || opcode == BPF_JNE) &&
- regs[insn->dst_reg].type == CONST_IMM &&
- regs[insn->dst_reg].imm == insn->imm) {
- if (opcode == BPF_JEQ) {
- /* if (imm == imm) goto pc+off;
- * only follow the goto, ignore fall-through
- */
- *insn_idx += insn->off;
- return 0;
- } else {
- /* if (imm != imm) goto pc+off;
- * only follow fall-through branch, since
- * that's where the program will go
- */
- return 0;
+ is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32;
+ pred = is_branch_taken(dst_reg, src_reg, opcode, is_jmp32);
+ if (pred >= 0) {
+ /* If we get here with a dst_reg pointer type it is because
+ * above is_branch_taken() special cased the 0 comparison.
+ */
+ if (!__is_pointer_value(false, dst_reg))
+ err = mark_chain_precision(env, insn->dst_reg);
+ if (BPF_SRC(insn->code) == BPF_X && !err &&
+ !__is_pointer_value(false, src_reg))
+ err = mark_chain_precision(env, insn->src_reg);
+ if (err)
+ return err;
+ }
+
+ if (pred == 1) {
+ /* Only follow the goto, ignore fall-through. If needed, push
+ * the fall-through branch for simulation under speculative
+ * execution.
+ */
+ if (!env->bypass_spec_v1) {
+ err = sanitize_speculative_path(env, insn, *insn_idx + 1, *insn_idx);
+ if (err < 0)
+ return err;
+ }
+ if (env->log.level & BPF_LOG_LEVEL)
+ print_insn_state(env, this_branch, this_branch->curframe);
+ *insn_idx += insn->off;
+ return 0;
+ } else if (pred == 0) {
+ /* Only follow the fall-through branch, since that's where the
+ * program will go. If needed, push the goto branch for
+ * simulation under speculative execution.
+ */
+ if (!env->bypass_spec_v1) {
+ err = sanitize_speculative_path(env, insn, *insn_idx + insn->off + 1,
+ *insn_idx);
+ if (err < 0)
+ return err;
}
+ if (env->log.level & BPF_LOG_LEVEL)
+ print_insn_state(env, this_branch, this_branch->curframe);
+ return 0;
}
- other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx);
- if (!other_branch)
- return -EFAULT;
+ /* Push scalar registers sharing same ID to jump history,
+ * do this before creating 'other_branch', so that both
+ * 'this_branch' and 'other_branch' share this history
+ * if parent state is created.
+ */
+ if (BPF_SRC(insn->code) == BPF_X && src_reg->type == SCALAR_VALUE && src_reg->id)
+ collect_linked_regs(this_branch, src_reg->id, &linked_regs);
+ if (dst_reg->type == SCALAR_VALUE && dst_reg->id)
+ collect_linked_regs(this_branch, dst_reg->id, &linked_regs);
+ if (linked_regs.cnt > 1) {
+ err = push_jmp_history(env, this_branch, 0, linked_regs_pack(&linked_regs));
+ if (err)
+ return err;
+ }
- /* detect if R == 0 where R is returned value from bpf_map_lookup_elem() */
- if (BPF_SRC(insn->code) == BPF_K &&
- insn->imm == 0 && (opcode == BPF_JEQ ||
- opcode == BPF_JNE) &&
- regs[insn->dst_reg].type == PTR_TO_MAP_VALUE_OR_NULL) {
- if (opcode == BPF_JEQ) {
- /* next fallthrough insn can access memory via
- * this register
- */
- regs[insn->dst_reg].type = PTR_TO_MAP_VALUE;
- /* branch targer cannot access it, since reg == 0 */
- other_branch->regs[insn->dst_reg].type = CONST_IMM;
- other_branch->regs[insn->dst_reg].imm = 0;
- } else {
- other_branch->regs[insn->dst_reg].type = PTR_TO_MAP_VALUE;
- regs[insn->dst_reg].type = CONST_IMM;
- regs[insn->dst_reg].imm = 0;
- }
- } else if (BPF_SRC(insn->code) == BPF_K &&
- (opcode == BPF_JEQ || opcode == BPF_JNE)) {
+ other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx, false);
+ if (IS_ERR(other_branch))
+ return PTR_ERR(other_branch);
+ other_branch_regs = other_branch->frame[other_branch->curframe]->regs;
- if (opcode == BPF_JEQ) {
- /* detect if (R == imm) goto
- * and in the target state recognize that R = imm
- */
- other_branch->regs[insn->dst_reg].type = CONST_IMM;
- other_branch->regs[insn->dst_reg].imm = insn->imm;
- } else {
- /* detect if (R != imm) goto
- * and in the fall-through state recognize that R = imm
- */
- regs[insn->dst_reg].type = CONST_IMM;
- regs[insn->dst_reg].imm = insn->imm;
- }
+ if (BPF_SRC(insn->code) == BPF_X) {
+ err = reg_set_min_max(env,
+ &other_branch_regs[insn->dst_reg],
+ &other_branch_regs[insn->src_reg],
+ dst_reg, src_reg, opcode, is_jmp32);
+ } else /* BPF_SRC(insn->code) == BPF_K */ {
+ /* reg_set_min_max() can mangle the fake_reg. Make a copy
+ * so that these are two different memory locations. The
+ * src_reg is not used beyond here in context of K.
+ */
+ memcpy(&env->fake_reg[1], &env->fake_reg[0],
+ sizeof(env->fake_reg[0]));
+ err = reg_set_min_max(env,
+ &other_branch_regs[insn->dst_reg],
+ &env->fake_reg[0],
+ dst_reg, &env->fake_reg[1],
+ opcode, is_jmp32);
}
- if (log_level)
- print_verifier_state(env);
- return 0;
-}
+ if (err)
+ return err;
-/* return the map pointer stored inside BPF_LD_IMM64 instruction */
-static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn)
-{
- u64 imm64 = ((u64) (u32) insn[0].imm) | ((u64) (u32) insn[1].imm) << 32;
+ if (BPF_SRC(insn->code) == BPF_X &&
+ src_reg->type == SCALAR_VALUE && src_reg->id &&
+ !WARN_ON_ONCE(src_reg->id != other_branch_regs[insn->src_reg].id)) {
+ sync_linked_regs(this_branch, src_reg, &linked_regs);
+ sync_linked_regs(other_branch, &other_branch_regs[insn->src_reg], &linked_regs);
+ }
+ if (dst_reg->type == SCALAR_VALUE && dst_reg->id &&
+ !WARN_ON_ONCE(dst_reg->id != other_branch_regs[insn->dst_reg].id)) {
+ sync_linked_regs(this_branch, dst_reg, &linked_regs);
+ sync_linked_regs(other_branch, &other_branch_regs[insn->dst_reg], &linked_regs);
+ }
- return (struct bpf_map *) (unsigned long) imm64;
+ /* if one pointer register is compared to another pointer
+ * register check if PTR_MAYBE_NULL could be lifted.
+ * E.g. register A - maybe null
+ * register B - not null
+ * for JNE A, B, ... - A is not null in the false branch;
+ * for JEQ A, B, ... - A is not null in the true branch.
+ *
+ * Since PTR_TO_BTF_ID points to a kernel struct that does
+ * not need to be null checked by the BPF program, i.e.,
+ * could be null even without PTR_MAYBE_NULL marking, so
+ * only propagate nullness when neither reg is that type.
+ */
+ if (!is_jmp32 && BPF_SRC(insn->code) == BPF_X &&
+ __is_pointer_value(false, src_reg) && __is_pointer_value(false, dst_reg) &&
+ type_may_be_null(src_reg->type) != type_may_be_null(dst_reg->type) &&
+ base_type(src_reg->type) != PTR_TO_BTF_ID &&
+ base_type(dst_reg->type) != PTR_TO_BTF_ID) {
+ eq_branch_regs = NULL;
+ switch (opcode) {
+ case BPF_JEQ:
+ eq_branch_regs = other_branch_regs;
+ break;
+ case BPF_JNE:
+ eq_branch_regs = regs;
+ break;
+ default:
+ /* do nothing */
+ break;
+ }
+ if (eq_branch_regs) {
+ if (type_may_be_null(src_reg->type))
+ mark_ptr_not_null_reg(&eq_branch_regs[insn->src_reg]);
+ else
+ mark_ptr_not_null_reg(&eq_branch_regs[insn->dst_reg]);
+ }
+ }
+
+ /* detect if R == 0 where R is returned from bpf_map_lookup_elem().
+ * NOTE: these optimizations below are related with pointer comparison
+ * which will never be JMP32.
+ */
+ if (!is_jmp32 && BPF_SRC(insn->code) == BPF_K &&
+ insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) &&
+ type_may_be_null(dst_reg->type)) {
+ /* Mark all identical registers in each branch as either
+ * safe or unknown depending R == 0 or R != 0 conditional.
+ */
+ mark_ptr_or_null_regs(this_branch, insn->dst_reg,
+ opcode == BPF_JNE);
+ mark_ptr_or_null_regs(other_branch, insn->dst_reg,
+ opcode == BPF_JEQ);
+ } else if (!try_match_pkt_pointers(insn, dst_reg, &regs[insn->src_reg],
+ this_branch, other_branch) &&
+ is_pointer_value(env, insn->dst_reg)) {
+ verbose(env, "R%d pointer comparison prohibited\n",
+ insn->dst_reg);
+ return -EACCES;
+ }
+ if (env->log.level & BPF_LOG_LEVEL)
+ print_insn_state(env, this_branch, this_branch->curframe);
+ return 0;
}
/* verify BPF_LD_IMM64 instruction */
-static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn)
+static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
- struct reg_state *regs = env->cur_state.regs;
+ struct bpf_insn_aux_data *aux = cur_aux(env);
+ struct bpf_reg_state *regs = cur_regs(env);
+ struct bpf_reg_state *dst_reg;
+ struct bpf_map *map;
int err;
if (BPF_SIZE(insn->code) != BPF_DW) {
- verbose("invalid BPF_LD_IMM insn\n");
+ verbose(env, "invalid BPF_LD_IMM insn\n");
return -EINVAL;
}
if (insn->off != 0) {
- verbose("BPF_LD_IMM64 uses reserved fields\n");
+ verbose(env, "BPF_LD_IMM64 uses reserved fields\n");
return -EINVAL;
}
- err = check_reg_arg(regs, insn->dst_reg, DST_OP);
+ err = check_reg_arg(env, insn->dst_reg, DST_OP);
if (err)
return err;
- if (insn->src_reg == 0)
- /* generic move 64-bit immediate into a register */
+ dst_reg = &regs[insn->dst_reg];
+ if (insn->src_reg == 0) {
+ u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
+
+ dst_reg->type = SCALAR_VALUE;
+ __mark_reg_known(&regs[insn->dst_reg], imm);
+ return 0;
+ }
+
+ /* All special src_reg cases are listed below. From this point onwards
+ * we either succeed and assign a corresponding dst_reg->type after
+ * zeroing the offset, or fail and reject the program.
+ */
+ mark_reg_known_zero(env, regs, insn->dst_reg);
+
+ if (insn->src_reg == BPF_PSEUDO_BTF_ID) {
+ dst_reg->type = aux->btf_var.reg_type;
+ switch (base_type(dst_reg->type)) {
+ case PTR_TO_MEM:
+ dst_reg->mem_size = aux->btf_var.mem_size;
+ break;
+ case PTR_TO_BTF_ID:
+ dst_reg->btf = aux->btf_var.btf;
+ dst_reg->btf_id = aux->btf_var.btf_id;
+ break;
+ default:
+ verifier_bug(env, "pseudo btf id: unexpected dst reg type");
+ return -EFAULT;
+ }
+ return 0;
+ }
+
+ if (insn->src_reg == BPF_PSEUDO_FUNC) {
+ struct bpf_prog_aux *aux = env->prog->aux;
+ u32 subprogno = find_subprog(env,
+ env->insn_idx + insn->imm + 1);
+
+ if (!aux->func_info) {
+ verbose(env, "missing btf func_info\n");
+ return -EINVAL;
+ }
+ if (aux->func_info_aux[subprogno].linkage != BTF_FUNC_STATIC) {
+ verbose(env, "callback function not static\n");
+ return -EINVAL;
+ }
+
+ dst_reg->type = PTR_TO_FUNC;
+ dst_reg->subprogno = subprogno;
return 0;
+ }
- /* replace_map_fd_with_map_ptr() should have caught bad ld_imm64 */
- BUG_ON(insn->src_reg != BPF_PSEUDO_MAP_FD);
+ map = env->used_maps[aux->map_index];
+ dst_reg->map_ptr = map;
+
+ if (insn->src_reg == BPF_PSEUDO_MAP_VALUE ||
+ insn->src_reg == BPF_PSEUDO_MAP_IDX_VALUE) {
+ if (map->map_type == BPF_MAP_TYPE_ARENA) {
+ __mark_reg_unknown(env, dst_reg);
+ return 0;
+ }
+ dst_reg->type = PTR_TO_MAP_VALUE;
+ dst_reg->off = aux->map_off;
+ WARN_ON_ONCE(map->map_type != BPF_MAP_TYPE_INSN_ARRAY &&
+ map->max_entries != 1);
+ /* We want reg->id to be same (0) as map_value is not distinct */
+ } else if (insn->src_reg == BPF_PSEUDO_MAP_FD ||
+ insn->src_reg == BPF_PSEUDO_MAP_IDX) {
+ dst_reg->type = CONST_PTR_TO_MAP;
+ } else {
+ verifier_bug(env, "unexpected src reg value for ldimm64");
+ return -EFAULT;
+ }
- regs[insn->dst_reg].type = CONST_PTR_TO_MAP;
- regs[insn->dst_reg].map_ptr = ld_imm64_to_map_ptr(insn);
return 0;
}
@@ -1204,62 +17222,314 @@ static bool may_access_skb(enum bpf_prog_type type)
* Output:
* R0 - 8/16/32-bit skb data converted to cpu endianness
*/
-static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn)
+static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
- struct reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *regs = cur_regs(env);
+ static const int ctx_reg = BPF_REG_6;
u8 mode = BPF_MODE(insn->code);
- struct reg_state *reg;
int i, err;
- if (!may_access_skb(env->prog->type)) {
- verbose("BPF_LD_ABS|IND instructions not allowed for this program type\n");
+ if (!may_access_skb(resolve_prog_type(env->prog))) {
+ verbose(env, "BPF_LD_[ABS|IND] instructions not allowed for this program type\n");
return -EINVAL;
}
+ if (!env->ops->gen_ld_abs) {
+ verifier_bug(env, "gen_ld_abs is null");
+ return -EFAULT;
+ }
+
if (insn->dst_reg != BPF_REG_0 || insn->off != 0 ||
+ BPF_SIZE(insn->code) == BPF_DW ||
(mode == BPF_ABS && insn->src_reg != BPF_REG_0)) {
- verbose("BPF_LD_ABS uses reserved fields\n");
+ verbose(env, "BPF_LD_[ABS|IND] uses reserved fields\n");
return -EINVAL;
}
/* check whether implicit source operand (register R6) is readable */
- err = check_reg_arg(regs, BPF_REG_6, SRC_OP);
+ err = check_reg_arg(env, ctx_reg, SRC_OP);
+ if (err)
+ return err;
+
+ /* Disallow usage of BPF_LD_[ABS|IND] with reference tracking, as
+ * gen_ld_abs() may terminate the program at runtime, leading to
+ * reference leak.
+ */
+ err = check_resource_leak(env, false, true, "BPF_LD_[ABS|IND]");
if (err)
return err;
- if (regs[BPF_REG_6].type != PTR_TO_CTX) {
- verbose("at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
+ if (regs[ctx_reg].type != PTR_TO_CTX) {
+ verbose(env,
+ "at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
return -EINVAL;
}
if (mode == BPF_IND) {
/* check explicit source operand */
- err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ err = check_reg_arg(env, insn->src_reg, SRC_OP);
if (err)
return err;
}
+ err = check_ptr_off_reg(env, &regs[ctx_reg], ctx_reg);
+ if (err < 0)
+ return err;
+
/* reset caller saved regs to unreadable */
for (i = 0; i < CALLER_SAVED_REGS; i++) {
- reg = regs + caller_saved[i];
- reg->type = NOT_INIT;
- reg->imm = 0;
+ mark_reg_not_init(env, regs, caller_saved[i]);
+ check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
}
/* mark destination R0 register as readable, since it contains
- * the value fetched from the packet
+ * the value fetched from the packet.
+ * Already marked as written above.
*/
- regs[BPF_REG_0].type = UNKNOWN_VALUE;
+ mark_reg_unknown(env, regs, BPF_REG_0);
+ /* ld_abs load up to 32-bit skb data. */
+ regs[BPF_REG_0].subreg_def = env->insn_idx + 1;
+ return 0;
+}
+
+static int check_return_code(struct bpf_verifier_env *env, int regno, const char *reg_name)
+{
+ const char *exit_ctx = "At program exit";
+ struct tnum enforce_attach_type_range = tnum_unknown;
+ const struct bpf_prog *prog = env->prog;
+ struct bpf_reg_state *reg = reg_state(env, regno);
+ struct bpf_retval_range range = retval_range(0, 1);
+ enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
+ int err;
+ struct bpf_func_state *frame = env->cur_state->frame[0];
+ const bool is_subprog = frame->subprogno;
+ bool return_32bit = false;
+ const struct btf_type *reg_type, *ret_type = NULL;
+
+ /* LSM and struct_ops func-ptr's return type could be "void" */
+ if (!is_subprog || frame->in_exception_callback_fn) {
+ switch (prog_type) {
+ case BPF_PROG_TYPE_LSM:
+ if (prog->expected_attach_type == BPF_LSM_CGROUP)
+ /* See below, can be 0 or 0-1 depending on hook. */
+ break;
+ if (!prog->aux->attach_func_proto->type)
+ return 0;
+ break;
+ case BPF_PROG_TYPE_STRUCT_OPS:
+ if (!prog->aux->attach_func_proto->type)
+ return 0;
+
+ if (frame->in_exception_callback_fn)
+ break;
+
+ /* Allow a struct_ops program to return a referenced kptr if it
+ * matches the operator's return type and is in its unmodified
+ * form. A scalar zero (i.e., a null pointer) is also allowed.
+ */
+ reg_type = reg->btf ? btf_type_by_id(reg->btf, reg->btf_id) : NULL;
+ ret_type = btf_type_resolve_ptr(prog->aux->attach_btf,
+ prog->aux->attach_func_proto->type,
+ NULL);
+ if (ret_type && ret_type == reg_type && reg->ref_obj_id)
+ return __check_ptr_off_reg(env, reg, regno, false);
+ break;
+ default:
+ break;
+ }
+ }
+
+ /* eBPF calling convention is such that R0 is used
+ * to return the value from eBPF program.
+ * Make sure that it's readable at this time
+ * of bpf_exit, which means that program wrote
+ * something into it earlier
+ */
+ err = check_reg_arg(env, regno, SRC_OP);
+ if (err)
+ return err;
+
+ if (is_pointer_value(env, regno)) {
+ verbose(env, "R%d leaks addr as return value\n", regno);
+ return -EACCES;
+ }
+
+ if (frame->in_async_callback_fn) {
+ exit_ctx = "At async callback return";
+ range = frame->callback_ret_range;
+ goto enforce_retval;
+ }
+
+ if (is_subprog && !frame->in_exception_callback_fn) {
+ if (reg->type != SCALAR_VALUE) {
+ verbose(env, "At subprogram exit the register R%d is not a scalar value (%s)\n",
+ regno, reg_type_str(env, reg->type));
+ return -EINVAL;
+ }
+ return 0;
+ }
+
+ switch (prog_type) {
+ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
+ if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG ||
+ env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG ||
+ env->prog->expected_attach_type == BPF_CGROUP_UNIX_RECVMSG ||
+ env->prog->expected_attach_type == BPF_CGROUP_INET4_GETPEERNAME ||
+ env->prog->expected_attach_type == BPF_CGROUP_INET6_GETPEERNAME ||
+ env->prog->expected_attach_type == BPF_CGROUP_UNIX_GETPEERNAME ||
+ env->prog->expected_attach_type == BPF_CGROUP_INET4_GETSOCKNAME ||
+ env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME ||
+ env->prog->expected_attach_type == BPF_CGROUP_UNIX_GETSOCKNAME)
+ range = retval_range(1, 1);
+ if (env->prog->expected_attach_type == BPF_CGROUP_INET4_BIND ||
+ env->prog->expected_attach_type == BPF_CGROUP_INET6_BIND)
+ range = retval_range(0, 3);
+ break;
+ case BPF_PROG_TYPE_CGROUP_SKB:
+ if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) {
+ range = retval_range(0, 3);
+ enforce_attach_type_range = tnum_range(2, 3);
+ }
+ break;
+ case BPF_PROG_TYPE_CGROUP_SOCK:
+ case BPF_PROG_TYPE_SOCK_OPS:
+ case BPF_PROG_TYPE_CGROUP_DEVICE:
+ case BPF_PROG_TYPE_CGROUP_SYSCTL:
+ case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+ break;
+ case BPF_PROG_TYPE_RAW_TRACEPOINT:
+ if (!env->prog->aux->attach_btf_id)
+ return 0;
+ range = retval_range(0, 0);
+ break;
+ case BPF_PROG_TYPE_TRACING:
+ switch (env->prog->expected_attach_type) {
+ case BPF_TRACE_FENTRY:
+ case BPF_TRACE_FEXIT:
+ range = retval_range(0, 0);
+ break;
+ case BPF_TRACE_RAW_TP:
+ case BPF_MODIFY_RETURN:
+ return 0;
+ case BPF_TRACE_ITER:
+ break;
+ default:
+ return -ENOTSUPP;
+ }
+ break;
+ case BPF_PROG_TYPE_KPROBE:
+ switch (env->prog->expected_attach_type) {
+ case BPF_TRACE_KPROBE_SESSION:
+ case BPF_TRACE_UPROBE_SESSION:
+ range = retval_range(0, 1);
+ break;
+ default:
+ return 0;
+ }
+ break;
+ case BPF_PROG_TYPE_SK_LOOKUP:
+ range = retval_range(SK_DROP, SK_PASS);
+ break;
+
+ case BPF_PROG_TYPE_LSM:
+ if (env->prog->expected_attach_type != BPF_LSM_CGROUP) {
+ /* no range found, any return value is allowed */
+ if (!get_func_retval_range(env->prog, &range))
+ return 0;
+ /* no restricted range, any return value is allowed */
+ if (range.minval == S32_MIN && range.maxval == S32_MAX)
+ return 0;
+ return_32bit = true;
+ } else if (!env->prog->aux->attach_func_proto->type) {
+ /* Make sure programs that attach to void
+ * hooks don't try to modify return value.
+ */
+ range = retval_range(1, 1);
+ }
+ break;
+
+ case BPF_PROG_TYPE_NETFILTER:
+ range = retval_range(NF_DROP, NF_ACCEPT);
+ break;
+ case BPF_PROG_TYPE_STRUCT_OPS:
+ if (!ret_type)
+ return 0;
+ range = retval_range(0, 0);
+ break;
+ case BPF_PROG_TYPE_EXT:
+ /* freplace program can return anything as its return value
+ * depends on the to-be-replaced kernel func or bpf program.
+ */
+ default:
+ return 0;
+ }
+
+enforce_retval:
+ if (reg->type != SCALAR_VALUE) {
+ verbose(env, "%s the register R%d is not a known value (%s)\n",
+ exit_ctx, regno, reg_type_str(env, reg->type));
+ return -EINVAL;
+ }
+
+ err = mark_chain_precision(env, regno);
+ if (err)
+ return err;
+
+ if (!retval_range_within(range, reg, return_32bit)) {
+ verbose_invalid_scalar(env, reg, range, exit_ctx, reg_name);
+ if (!is_subprog &&
+ prog->expected_attach_type == BPF_LSM_CGROUP &&
+ prog_type == BPF_PROG_TYPE_LSM &&
+ !prog->aux->attach_func_proto->type)
+ verbose(env, "Note, BPF_LSM_CGROUP that attach to void LSM hooks can't modify return value!\n");
+ return -EINVAL;
+ }
+
+ if (!tnum_is_unknown(enforce_attach_type_range) &&
+ tnum_in(enforce_attach_type_range, reg->var_off))
+ env->prog->enforce_expected_attach_type = 1;
return 0;
}
+static void mark_subprog_changes_pkt_data(struct bpf_verifier_env *env, int off)
+{
+ struct bpf_subprog_info *subprog;
+
+ subprog = bpf_find_containing_subprog(env, off);
+ subprog->changes_pkt_data = true;
+}
+
+static void mark_subprog_might_sleep(struct bpf_verifier_env *env, int off)
+{
+ struct bpf_subprog_info *subprog;
+
+ subprog = bpf_find_containing_subprog(env, off);
+ subprog->might_sleep = true;
+}
+
+/* 't' is an index of a call-site.
+ * 'w' is a callee entry point.
+ * Eventually this function would be called when env->cfg.insn_state[w] == EXPLORED.
+ * Rely on DFS traversal order and absence of recursive calls to guarantee that
+ * callee's change_pkt_data marks would be correct at that moment.
+ */
+static void merge_callee_effects(struct bpf_verifier_env *env, int t, int w)
+{
+ struct bpf_subprog_info *caller, *callee;
+
+ caller = bpf_find_containing_subprog(env, t);
+ callee = bpf_find_containing_subprog(env, w);
+ caller->changes_pkt_data |= callee->changes_pkt_data;
+ caller->might_sleep |= callee->might_sleep;
+}
+
/* non-recursive DFS pseudo code
* 1 procedure DFS-iterative(G,v):
* 2 label v as discovered
* 3 let S be a stack
* 4 S.push(v)
* 5 while S is not empty
- * 6 t <- S.pop()
+ * 6 t <- S.peek()
* 7 if t is what we're looking for:
* 8 return t
* 9 for all edges e in G.adjacentEdges(t) do
@@ -1293,162 +17563,1859 @@ enum {
BRANCH = 2,
};
-#define STATE_LIST_MARK ((struct verifier_state_list *) -1L)
+static void mark_prune_point(struct bpf_verifier_env *env, int idx)
+{
+ env->insn_aux_data[idx].prune_point = true;
+}
+
+static bool is_prune_point(struct bpf_verifier_env *env, int insn_idx)
+{
+ return env->insn_aux_data[insn_idx].prune_point;
+}
+
+static void mark_force_checkpoint(struct bpf_verifier_env *env, int idx)
+{
+ env->insn_aux_data[idx].force_checkpoint = true;
+}
+
+static bool is_force_checkpoint(struct bpf_verifier_env *env, int insn_idx)
+{
+ return env->insn_aux_data[insn_idx].force_checkpoint;
+}
+
+static void mark_calls_callback(struct bpf_verifier_env *env, int idx)
+{
+ env->insn_aux_data[idx].calls_callback = true;
+}
+
+bool bpf_calls_callback(struct bpf_verifier_env *env, int insn_idx)
+{
+ return env->insn_aux_data[insn_idx].calls_callback;
+}
-static int *insn_stack; /* stack of insns to process */
-static int cur_stack; /* current stack index */
-static int *insn_state;
+enum {
+ DONE_EXPLORING = 0,
+ KEEP_EXPLORING = 1,
+};
/* t, w, e - match pseudo-code above:
* t - index of current instruction
* w - next instruction
* e - edge
*/
-static int push_insn(int t, int w, int e, struct verifier_env *env)
+static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
{
+ int *insn_stack = env->cfg.insn_stack;
+ int *insn_state = env->cfg.insn_state;
+
if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH))
- return 0;
+ return DONE_EXPLORING;
if (e == BRANCH && insn_state[t] >= (DISCOVERED | BRANCH))
- return 0;
+ return DONE_EXPLORING;
if (w < 0 || w >= env->prog->len) {
- verbose("jump out of range from insn %d to %d\n", t, w);
+ verbose_linfo(env, t, "%d: ", t);
+ verbose(env, "jump out of range from insn %d to %d\n", t, w);
return -EINVAL;
}
- if (e == BRANCH)
+ if (e == BRANCH) {
/* mark branch target for state pruning */
- env->explored_states[w] = STATE_LIST_MARK;
+ mark_prune_point(env, w);
+ mark_jmp_point(env, w);
+ }
if (insn_state[w] == 0) {
/* tree-edge */
insn_state[t] = DISCOVERED | e;
insn_state[w] = DISCOVERED;
- if (cur_stack >= env->prog->len)
+ if (env->cfg.cur_stack >= env->prog->len)
return -E2BIG;
- insn_stack[cur_stack++] = w;
- return 1;
+ insn_stack[env->cfg.cur_stack++] = w;
+ return KEEP_EXPLORING;
} else if ((insn_state[w] & 0xF0) == DISCOVERED) {
- verbose("back-edge from insn %d to %d\n", t, w);
+ if (env->bpf_capable)
+ return DONE_EXPLORING;
+ verbose_linfo(env, t, "%d: ", t);
+ verbose_linfo(env, w, "%d: ", w);
+ verbose(env, "back-edge from insn %d to %d\n", t, w);
return -EINVAL;
} else if (insn_state[w] == EXPLORED) {
/* forward- or cross-edge */
insn_state[t] = DISCOVERED | e;
} else {
- verbose("insn state internal bug\n");
+ verifier_bug(env, "insn state internal bug");
return -EFAULT;
}
+ return DONE_EXPLORING;
+}
+
+static int visit_func_call_insn(int t, struct bpf_insn *insns,
+ struct bpf_verifier_env *env,
+ bool visit_callee)
+{
+ int ret, insn_sz;
+ int w;
+
+ insn_sz = bpf_is_ldimm64(&insns[t]) ? 2 : 1;
+ ret = push_insn(t, t + insn_sz, FALLTHROUGH, env);
+ if (ret)
+ return ret;
+
+ mark_prune_point(env, t + insn_sz);
+ /* when we exit from subprog, we need to record non-linear history */
+ mark_jmp_point(env, t + insn_sz);
+
+ if (visit_callee) {
+ w = t + insns[t].imm + 1;
+ mark_prune_point(env, t);
+ merge_callee_effects(env, t, w);
+ ret = push_insn(t, w, BRANCH, env);
+ }
+ return ret;
+}
+
+/* Bitmask with 1s for all caller saved registers */
+#define ALL_CALLER_SAVED_REGS ((1u << CALLER_SAVED_REGS) - 1)
+
+/* True if do_misc_fixups() replaces calls to helper number 'imm',
+ * replacement patch is presumed to follow bpf_fastcall contract
+ * (see mark_fastcall_pattern_for_call() below).
+ */
+static bool verifier_inlines_helper_call(struct bpf_verifier_env *env, s32 imm)
+{
+ switch (imm) {
+#ifdef CONFIG_X86_64
+ case BPF_FUNC_get_smp_processor_id:
+ return env->prog->jit_requested && bpf_jit_supports_percpu_insn();
+#endif
+ default:
+ return false;
+ }
+}
+
+struct call_summary {
+ u8 num_params;
+ bool is_void;
+ bool fastcall;
+};
+
+/* If @call is a kfunc or helper call, fills @cs and returns true,
+ * otherwise returns false.
+ */
+static bool get_call_summary(struct bpf_verifier_env *env, struct bpf_insn *call,
+ struct call_summary *cs)
+{
+ struct bpf_kfunc_call_arg_meta meta;
+ const struct bpf_func_proto *fn;
+ int i;
+
+ if (bpf_helper_call(call)) {
+
+ if (get_helper_proto(env, call->imm, &fn) < 0)
+ /* error would be reported later */
+ return false;
+ cs->fastcall = fn->allow_fastcall &&
+ (verifier_inlines_helper_call(env, call->imm) ||
+ bpf_jit_inlines_helper_call(call->imm));
+ cs->is_void = fn->ret_type == RET_VOID;
+ cs->num_params = 0;
+ for (i = 0; i < ARRAY_SIZE(fn->arg_type); ++i) {
+ if (fn->arg_type[i] == ARG_DONTCARE)
+ break;
+ cs->num_params++;
+ }
+ return true;
+ }
+
+ if (bpf_pseudo_kfunc_call(call)) {
+ int err;
+
+ err = fetch_kfunc_meta(env, call, &meta, NULL);
+ if (err < 0)
+ /* error would be reported later */
+ return false;
+ cs->num_params = btf_type_vlen(meta.func_proto);
+ cs->fastcall = meta.kfunc_flags & KF_FASTCALL;
+ cs->is_void = btf_type_is_void(btf_type_by_id(meta.btf, meta.func_proto->type));
+ return true;
+ }
+
+ return false;
+}
+
+/* LLVM define a bpf_fastcall function attribute.
+ * This attribute means that function scratches only some of
+ * the caller saved registers defined by ABI.
+ * For BPF the set of such registers could be defined as follows:
+ * - R0 is scratched only if function is non-void;
+ * - R1-R5 are scratched only if corresponding parameter type is defined
+ * in the function prototype.
+ *
+ * The contract between kernel and clang allows to simultaneously use
+ * such functions and maintain backwards compatibility with old
+ * kernels that don't understand bpf_fastcall calls:
+ *
+ * - for bpf_fastcall calls clang allocates registers as-if relevant r0-r5
+ * registers are not scratched by the call;
+ *
+ * - as a post-processing step, clang visits each bpf_fastcall call and adds
+ * spill/fill for every live r0-r5;
+ *
+ * - stack offsets used for the spill/fill are allocated as lowest
+ * stack offsets in whole function and are not used for any other
+ * purposes;
+ *
+ * - when kernel loads a program, it looks for such patterns
+ * (bpf_fastcall function surrounded by spills/fills) and checks if
+ * spill/fill stack offsets are used exclusively in fastcall patterns;
+ *
+ * - if so, and if verifier or current JIT inlines the call to the
+ * bpf_fastcall function (e.g. a helper call), kernel removes unnecessary
+ * spill/fill pairs;
+ *
+ * - when old kernel loads a program, presence of spill/fill pairs
+ * keeps BPF program valid, albeit slightly less efficient.
+ *
+ * For example:
+ *
+ * r1 = 1;
+ * r2 = 2;
+ * *(u64 *)(r10 - 8) = r1; r1 = 1;
+ * *(u64 *)(r10 - 16) = r2; r2 = 2;
+ * call %[to_be_inlined] --> call %[to_be_inlined]
+ * r2 = *(u64 *)(r10 - 16); r0 = r1;
+ * r1 = *(u64 *)(r10 - 8); r0 += r2;
+ * r0 = r1; exit;
+ * r0 += r2;
+ * exit;
+ *
+ * The purpose of mark_fastcall_pattern_for_call is to:
+ * - look for such patterns;
+ * - mark spill and fill instructions in env->insn_aux_data[*].fastcall_pattern;
+ * - mark set env->insn_aux_data[*].fastcall_spills_num for call instruction;
+ * - update env->subprog_info[*]->fastcall_stack_off to find an offset
+ * at which bpf_fastcall spill/fill stack slots start;
+ * - update env->subprog_info[*]->keep_fastcall_stack.
+ *
+ * The .fastcall_pattern and .fastcall_stack_off are used by
+ * check_fastcall_stack_contract() to check if every stack access to
+ * fastcall spill/fill stack slot originates from spill/fill
+ * instructions, members of fastcall patterns.
+ *
+ * If such condition holds true for a subprogram, fastcall patterns could
+ * be rewritten by remove_fastcall_spills_fills().
+ * Otherwise bpf_fastcall patterns are not changed in the subprogram
+ * (code, presumably, generated by an older clang version).
+ *
+ * For example, it is *not* safe to remove spill/fill below:
+ *
+ * r1 = 1;
+ * *(u64 *)(r10 - 8) = r1; r1 = 1;
+ * call %[to_be_inlined] --> call %[to_be_inlined]
+ * r1 = *(u64 *)(r10 - 8); r0 = *(u64 *)(r10 - 8); <---- wrong !!!
+ * r0 = *(u64 *)(r10 - 8); r0 += r1;
+ * r0 += r1; exit;
+ * exit;
+ */
+static void mark_fastcall_pattern_for_call(struct bpf_verifier_env *env,
+ struct bpf_subprog_info *subprog,
+ int insn_idx, s16 lowest_off)
+{
+ struct bpf_insn *insns = env->prog->insnsi, *stx, *ldx;
+ struct bpf_insn *call = &env->prog->insnsi[insn_idx];
+ u32 clobbered_regs_mask;
+ struct call_summary cs;
+ u32 expected_regs_mask;
+ s16 off;
+ int i;
+
+ if (!get_call_summary(env, call, &cs))
+ return;
+
+ /* A bitmask specifying which caller saved registers are clobbered
+ * by a call to a helper/kfunc *as if* this helper/kfunc follows
+ * bpf_fastcall contract:
+ * - includes R0 if function is non-void;
+ * - includes R1-R5 if corresponding parameter has is described
+ * in the function prototype.
+ */
+ clobbered_regs_mask = GENMASK(cs.num_params, cs.is_void ? 1 : 0);
+ /* e.g. if helper call clobbers r{0,1}, expect r{2,3,4,5} in the pattern */
+ expected_regs_mask = ~clobbered_regs_mask & ALL_CALLER_SAVED_REGS;
+
+ /* match pairs of form:
+ *
+ * *(u64 *)(r10 - Y) = rX (where Y % 8 == 0)
+ * ...
+ * call %[to_be_inlined]
+ * ...
+ * rX = *(u64 *)(r10 - Y)
+ */
+ for (i = 1, off = lowest_off; i <= ARRAY_SIZE(caller_saved); ++i, off += BPF_REG_SIZE) {
+ if (insn_idx - i < 0 || insn_idx + i >= env->prog->len)
+ break;
+ stx = &insns[insn_idx - i];
+ ldx = &insns[insn_idx + i];
+ /* must be a stack spill/fill pair */
+ if (stx->code != (BPF_STX | BPF_MEM | BPF_DW) ||
+ ldx->code != (BPF_LDX | BPF_MEM | BPF_DW) ||
+ stx->dst_reg != BPF_REG_10 ||
+ ldx->src_reg != BPF_REG_10)
+ break;
+ /* must be a spill/fill for the same reg */
+ if (stx->src_reg != ldx->dst_reg)
+ break;
+ /* must be one of the previously unseen registers */
+ if ((BIT(stx->src_reg) & expected_regs_mask) == 0)
+ break;
+ /* must be a spill/fill for the same expected offset,
+ * no need to check offset alignment, BPF_DW stack access
+ * is always 8-byte aligned.
+ */
+ if (stx->off != off || ldx->off != off)
+ break;
+ expected_regs_mask &= ~BIT(stx->src_reg);
+ env->insn_aux_data[insn_idx - i].fastcall_pattern = 1;
+ env->insn_aux_data[insn_idx + i].fastcall_pattern = 1;
+ }
+ if (i == 1)
+ return;
+
+ /* Conditionally set 'fastcall_spills_num' to allow forward
+ * compatibility when more helper functions are marked as
+ * bpf_fastcall at compile time than current kernel supports, e.g:
+ *
+ * 1: *(u64 *)(r10 - 8) = r1
+ * 2: call A ;; assume A is bpf_fastcall for current kernel
+ * 3: r1 = *(u64 *)(r10 - 8)
+ * 4: *(u64 *)(r10 - 8) = r1
+ * 5: call B ;; assume B is not bpf_fastcall for current kernel
+ * 6: r1 = *(u64 *)(r10 - 8)
+ *
+ * There is no need to block bpf_fastcall rewrite for such program.
+ * Set 'fastcall_pattern' for both calls to keep check_fastcall_stack_contract() happy,
+ * don't set 'fastcall_spills_num' for call B so that remove_fastcall_spills_fills()
+ * does not remove spill/fill pair {4,6}.
+ */
+ if (cs.fastcall)
+ env->insn_aux_data[insn_idx].fastcall_spills_num = i - 1;
+ else
+ subprog->keep_fastcall_stack = 1;
+ subprog->fastcall_stack_off = min(subprog->fastcall_stack_off, off);
+}
+
+static int mark_fastcall_patterns(struct bpf_verifier_env *env)
+{
+ struct bpf_subprog_info *subprog = env->subprog_info;
+ struct bpf_insn *insn;
+ s16 lowest_off;
+ int s, i;
+
+ for (s = 0; s < env->subprog_cnt; ++s, ++subprog) {
+ /* find lowest stack spill offset used in this subprog */
+ lowest_off = 0;
+ for (i = subprog->start; i < (subprog + 1)->start; ++i) {
+ insn = env->prog->insnsi + i;
+ if (insn->code != (BPF_STX | BPF_MEM | BPF_DW) ||
+ insn->dst_reg != BPF_REG_10)
+ continue;
+ lowest_off = min(lowest_off, insn->off);
+ }
+ /* use this offset to find fastcall patterns */
+ for (i = subprog->start; i < (subprog + 1)->start; ++i) {
+ insn = env->prog->insnsi + i;
+ if (insn->code != (BPF_JMP | BPF_CALL))
+ continue;
+ mark_fastcall_pattern_for_call(env, subprog, i, lowest_off);
+ }
+ }
+ return 0;
+}
+
+static struct bpf_iarray *iarray_realloc(struct bpf_iarray *old, size_t n_elem)
+{
+ size_t new_size = sizeof(struct bpf_iarray) + n_elem * sizeof(old->items[0]);
+ struct bpf_iarray *new;
+
+ new = kvrealloc(old, new_size, GFP_KERNEL_ACCOUNT);
+ if (!new) {
+ /* this is what callers always want, so simplify the call site */
+ kvfree(old);
+ return NULL;
+ }
+
+ new->cnt = n_elem;
+ return new;
+}
+
+static int copy_insn_array(struct bpf_map *map, u32 start, u32 end, u32 *items)
+{
+ struct bpf_insn_array_value *value;
+ u32 i;
+
+ for (i = start; i <= end; i++) {
+ value = map->ops->map_lookup_elem(map, &i);
+ /*
+ * map_lookup_elem of an array map will never return an error,
+ * but not checking it makes some static analysers to worry
+ */
+ if (IS_ERR(value))
+ return PTR_ERR(value);
+ else if (!value)
+ return -EINVAL;
+ items[i - start] = value->xlated_off;
+ }
+ return 0;
+}
+
+static int cmp_ptr_to_u32(const void *a, const void *b)
+{
+ return *(u32 *)a - *(u32 *)b;
+}
+
+static int sort_insn_array_uniq(u32 *items, int cnt)
+{
+ int unique = 1;
+ int i;
+
+ sort(items, cnt, sizeof(items[0]), cmp_ptr_to_u32, NULL);
+
+ for (i = 1; i < cnt; i++)
+ if (items[i] != items[unique - 1])
+ items[unique++] = items[i];
+
+ return unique;
+}
+
+/*
+ * sort_unique({map[start], ..., map[end]}) into off
+ */
+static int copy_insn_array_uniq(struct bpf_map *map, u32 start, u32 end, u32 *off)
+{
+ u32 n = end - start + 1;
+ int err;
+
+ err = copy_insn_array(map, start, end, off);
+ if (err)
+ return err;
+
+ return sort_insn_array_uniq(off, n);
+}
+
+/*
+ * Copy all unique offsets from the map
+ */
+static struct bpf_iarray *jt_from_map(struct bpf_map *map)
+{
+ struct bpf_iarray *jt;
+ int err;
+ int n;
+
+ jt = iarray_realloc(NULL, map->max_entries);
+ if (!jt)
+ return ERR_PTR(-ENOMEM);
+
+ n = copy_insn_array_uniq(map, 0, map->max_entries - 1, jt->items);
+ if (n < 0) {
+ err = n;
+ goto err_free;
+ }
+ if (n == 0) {
+ err = -EINVAL;
+ goto err_free;
+ }
+ jt->cnt = n;
+ return jt;
+
+err_free:
+ kvfree(jt);
+ return ERR_PTR(err);
+}
+
+/*
+ * Find and collect all maps which fit in the subprog. Return the result as one
+ * combined jump table in jt->items (allocated with kvcalloc)
+ */
+static struct bpf_iarray *jt_from_subprog(struct bpf_verifier_env *env,
+ int subprog_start, int subprog_end)
+{
+ struct bpf_iarray *jt = NULL;
+ struct bpf_map *map;
+ struct bpf_iarray *jt_cur;
+ int i;
+
+ for (i = 0; i < env->insn_array_map_cnt; i++) {
+ /*
+ * TODO (when needed): collect only jump tables, not static keys
+ * or maps for indirect calls
+ */
+ map = env->insn_array_maps[i];
+
+ jt_cur = jt_from_map(map);
+ if (IS_ERR(jt_cur)) {
+ kvfree(jt);
+ return jt_cur;
+ }
+
+ /*
+ * This is enough to check one element. The full table is
+ * checked to fit inside the subprog later in create_jt()
+ */
+ if (jt_cur->items[0] >= subprog_start && jt_cur->items[0] < subprog_end) {
+ u32 old_cnt = jt ? jt->cnt : 0;
+ jt = iarray_realloc(jt, old_cnt + jt_cur->cnt);
+ if (!jt) {
+ kvfree(jt_cur);
+ return ERR_PTR(-ENOMEM);
+ }
+ memcpy(jt->items + old_cnt, jt_cur->items, jt_cur->cnt << 2);
+ }
+
+ kvfree(jt_cur);
+ }
+
+ if (!jt) {
+ verbose(env, "no jump tables found for subprog starting at %u\n", subprog_start);
+ return ERR_PTR(-EINVAL);
+ }
+
+ jt->cnt = sort_insn_array_uniq(jt->items, jt->cnt);
+ return jt;
+}
+
+static struct bpf_iarray *
+create_jt(int t, struct bpf_verifier_env *env)
+{
+ static struct bpf_subprog_info *subprog;
+ int subprog_start, subprog_end;
+ struct bpf_iarray *jt;
+ int i;
+
+ subprog = bpf_find_containing_subprog(env, t);
+ subprog_start = subprog->start;
+ subprog_end = (subprog + 1)->start;
+ jt = jt_from_subprog(env, subprog_start, subprog_end);
+ if (IS_ERR(jt))
+ return jt;
+
+ /* Check that the every element of the jump table fits within the given subprogram */
+ for (i = 0; i < jt->cnt; i++) {
+ if (jt->items[i] < subprog_start || jt->items[i] >= subprog_end) {
+ verbose(env, "jump table for insn %d points outside of the subprog [%u,%u]\n",
+ t, subprog_start, subprog_end);
+ kvfree(jt);
+ return ERR_PTR(-EINVAL);
+ }
+ }
+
+ return jt;
+}
+
+/* "conditional jump with N edges" */
+static int visit_gotox_insn(int t, struct bpf_verifier_env *env)
+{
+ int *insn_stack = env->cfg.insn_stack;
+ int *insn_state = env->cfg.insn_state;
+ bool keep_exploring = false;
+ struct bpf_iarray *jt;
+ int i, w;
+
+ jt = env->insn_aux_data[t].jt;
+ if (!jt) {
+ jt = create_jt(t, env);
+ if (IS_ERR(jt))
+ return PTR_ERR(jt);
+
+ env->insn_aux_data[t].jt = jt;
+ }
+
+ mark_prune_point(env, t);
+ for (i = 0; i < jt->cnt; i++) {
+ w = jt->items[i];
+ if (w < 0 || w >= env->prog->len) {
+ verbose(env, "indirect jump out of range from insn %d to %d\n", t, w);
+ return -EINVAL;
+ }
+
+ mark_jmp_point(env, w);
+
+ /* EXPLORED || DISCOVERED */
+ if (insn_state[w])
+ continue;
+
+ if (env->cfg.cur_stack >= env->prog->len)
+ return -E2BIG;
+
+ insn_stack[env->cfg.cur_stack++] = w;
+ insn_state[w] |= DISCOVERED;
+ keep_exploring = true;
+ }
+
+ return keep_exploring ? KEEP_EXPLORING : DONE_EXPLORING;
+}
+
+static int visit_tailcall_insn(struct bpf_verifier_env *env, int t)
+{
+ static struct bpf_subprog_info *subprog;
+ struct bpf_iarray *jt;
+
+ if (env->insn_aux_data[t].jt)
+ return 0;
+
+ jt = iarray_realloc(NULL, 2);
+ if (!jt)
+ return -ENOMEM;
+
+ subprog = bpf_find_containing_subprog(env, t);
+ jt->items[0] = t + 1;
+ jt->items[1] = subprog->exit_idx;
+ env->insn_aux_data[t].jt = jt;
return 0;
}
+/* Visits the instruction at index t and returns one of the following:
+ * < 0 - an error occurred
+ * DONE_EXPLORING - the instruction was fully explored
+ * KEEP_EXPLORING - there is still work to be done before it is fully explored
+ */
+static int visit_insn(int t, struct bpf_verifier_env *env)
+{
+ struct bpf_insn *insns = env->prog->insnsi, *insn = &insns[t];
+ int ret, off, insn_sz;
+
+ if (bpf_pseudo_func(insn))
+ return visit_func_call_insn(t, insns, env, true);
+
+ /* All non-branch instructions have a single fall-through edge. */
+ if (BPF_CLASS(insn->code) != BPF_JMP &&
+ BPF_CLASS(insn->code) != BPF_JMP32) {
+ insn_sz = bpf_is_ldimm64(insn) ? 2 : 1;
+ return push_insn(t, t + insn_sz, FALLTHROUGH, env);
+ }
+
+ switch (BPF_OP(insn->code)) {
+ case BPF_EXIT:
+ return DONE_EXPLORING;
+
+ case BPF_CALL:
+ if (is_async_callback_calling_insn(insn))
+ /* Mark this call insn as a prune point to trigger
+ * is_state_visited() check before call itself is
+ * processed by __check_func_call(). Otherwise new
+ * async state will be pushed for further exploration.
+ */
+ mark_prune_point(env, t);
+ /* For functions that invoke callbacks it is not known how many times
+ * callback would be called. Verifier models callback calling functions
+ * by repeatedly visiting callback bodies and returning to origin call
+ * instruction.
+ * In order to stop such iteration verifier needs to identify when a
+ * state identical some state from a previous iteration is reached.
+ * Check below forces creation of checkpoint before callback calling
+ * instruction to allow search for such identical states.
+ */
+ if (is_sync_callback_calling_insn(insn)) {
+ mark_calls_callback(env, t);
+ mark_force_checkpoint(env, t);
+ mark_prune_point(env, t);
+ mark_jmp_point(env, t);
+ }
+ if (bpf_helper_call(insn)) {
+ const struct bpf_func_proto *fp;
+
+ ret = get_helper_proto(env, insn->imm, &fp);
+ /* If called in a non-sleepable context program will be
+ * rejected anyway, so we should end up with precise
+ * sleepable marks on subprogs, except for dead code
+ * elimination.
+ */
+ if (ret == 0 && fp->might_sleep)
+ mark_subprog_might_sleep(env, t);
+ if (bpf_helper_changes_pkt_data(insn->imm))
+ mark_subprog_changes_pkt_data(env, t);
+ if (insn->imm == BPF_FUNC_tail_call)
+ visit_tailcall_insn(env, t);
+ } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
+ struct bpf_kfunc_call_arg_meta meta;
+
+ ret = fetch_kfunc_meta(env, insn, &meta, NULL);
+ if (ret == 0 && is_iter_next_kfunc(&meta)) {
+ mark_prune_point(env, t);
+ /* Checking and saving state checkpoints at iter_next() call
+ * is crucial for fast convergence of open-coded iterator loop
+ * logic, so we need to force it. If we don't do that,
+ * is_state_visited() might skip saving a checkpoint, causing
+ * unnecessarily long sequence of not checkpointed
+ * instructions and jumps, leading to exhaustion of jump
+ * history buffer, and potentially other undesired outcomes.
+ * It is expected that with correct open-coded iterators
+ * convergence will happen quickly, so we don't run a risk of
+ * exhausting memory.
+ */
+ mark_force_checkpoint(env, t);
+ }
+ /* Same as helpers, if called in a non-sleepable context
+ * program will be rejected anyway, so we should end up
+ * with precise sleepable marks on subprogs, except for
+ * dead code elimination.
+ */
+ if (ret == 0 && is_kfunc_sleepable(&meta))
+ mark_subprog_might_sleep(env, t);
+ if (ret == 0 && is_kfunc_pkt_changing(&meta))
+ mark_subprog_changes_pkt_data(env, t);
+ }
+ return visit_func_call_insn(t, insns, env, insn->src_reg == BPF_PSEUDO_CALL);
+
+ case BPF_JA:
+ if (BPF_SRC(insn->code) == BPF_X)
+ return visit_gotox_insn(t, env);
+
+ if (BPF_CLASS(insn->code) == BPF_JMP)
+ off = insn->off;
+ else
+ off = insn->imm;
+
+ /* unconditional jump with single edge */
+ ret = push_insn(t, t + off + 1, FALLTHROUGH, env);
+ if (ret)
+ return ret;
+
+ mark_prune_point(env, t + off + 1);
+ mark_jmp_point(env, t + off + 1);
+
+ return ret;
+
+ default:
+ /* conditional jump with two edges */
+ mark_prune_point(env, t);
+ if (is_may_goto_insn(insn))
+ mark_force_checkpoint(env, t);
+
+ ret = push_insn(t, t + 1, FALLTHROUGH, env);
+ if (ret)
+ return ret;
+
+ return push_insn(t, t + insn->off + 1, BRANCH, env);
+ }
+}
+
/* non-recursive depth-first-search to detect loops in BPF program
* loop == back-edge in directed graph
*/
-static int check_cfg(struct verifier_env *env)
+static int check_cfg(struct bpf_verifier_env *env)
{
- struct bpf_insn *insns = env->prog->insnsi;
int insn_cnt = env->prog->len;
- int ret = 0;
- int i, t;
+ int *insn_stack, *insn_state;
+ int ex_insn_beg, i, ret = 0;
- insn_state = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
+ insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT);
if (!insn_state)
return -ENOMEM;
- insn_stack = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
+ insn_stack = env->cfg.insn_stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT);
if (!insn_stack) {
- kfree(insn_state);
+ kvfree(insn_state);
return -ENOMEM;
}
+ ex_insn_beg = env->exception_callback_subprog
+ ? env->subprog_info[env->exception_callback_subprog].start
+ : 0;
+
insn_state[0] = DISCOVERED; /* mark 1st insn as discovered */
insn_stack[0] = 0; /* 0 is the first instruction */
- cur_stack = 1;
+ env->cfg.cur_stack = 1;
+
+walk_cfg:
+ while (env->cfg.cur_stack > 0) {
+ int t = insn_stack[env->cfg.cur_stack - 1];
+
+ ret = visit_insn(t, env);
+ switch (ret) {
+ case DONE_EXPLORING:
+ insn_state[t] = EXPLORED;
+ env->cfg.cur_stack--;
+ break;
+ case KEEP_EXPLORING:
+ break;
+ default:
+ if (ret > 0) {
+ verifier_bug(env, "visit_insn internal bug");
+ ret = -EFAULT;
+ }
+ goto err_free;
+ }
+ }
-peek_stack:
- if (cur_stack == 0)
- goto check_state;
- t = insn_stack[cur_stack - 1];
+ if (env->cfg.cur_stack < 0) {
+ verifier_bug(env, "pop stack internal bug");
+ ret = -EFAULT;
+ goto err_free;
+ }
- if (BPF_CLASS(insns[t].code) == BPF_JMP) {
- u8 opcode = BPF_OP(insns[t].code);
+ if (ex_insn_beg && insn_state[ex_insn_beg] != EXPLORED) {
+ insn_state[ex_insn_beg] = DISCOVERED;
+ insn_stack[0] = ex_insn_beg;
+ env->cfg.cur_stack = 1;
+ goto walk_cfg;
+ }
- if (opcode == BPF_EXIT) {
- goto mark_explored;
- } else if (opcode == BPF_CALL) {
- ret = push_insn(t, t + 1, FALLTHROUGH, env);
- if (ret == 1)
- goto peek_stack;
- else if (ret < 0)
- goto err_free;
- } else if (opcode == BPF_JA) {
- if (BPF_SRC(insns[t].code) != BPF_K) {
+ for (i = 0; i < insn_cnt; i++) {
+ struct bpf_insn *insn = &env->prog->insnsi[i];
+
+ if (insn_state[i] != EXPLORED) {
+ verbose(env, "unreachable insn %d\n", i);
+ ret = -EINVAL;
+ goto err_free;
+ }
+ if (bpf_is_ldimm64(insn)) {
+ if (insn_state[i + 1] != 0) {
+ verbose(env, "jump into the middle of ldimm64 insn %d\n", i);
ret = -EINVAL;
goto err_free;
}
- /* unconditional jump with single edge */
- ret = push_insn(t, t + insns[t].off + 1,
- FALLTHROUGH, env);
- if (ret == 1)
- goto peek_stack;
- else if (ret < 0)
- goto err_free;
- /* tell verifier to check for equivalent states
- * after every call and jump
- */
- if (t + 1 < insn_cnt)
- env->explored_states[t + 1] = STATE_LIST_MARK;
- } else {
- /* conditional jump with two edges */
- ret = push_insn(t, t + 1, FALLTHROUGH, env);
- if (ret == 1)
- goto peek_stack;
- else if (ret < 0)
- goto err_free;
+ i++; /* skip second half of ldimm64 */
+ }
+ }
+ ret = 0; /* cfg looks good */
+ env->prog->aux->changes_pkt_data = env->subprog_info[0].changes_pkt_data;
+ env->prog->aux->might_sleep = env->subprog_info[0].might_sleep;
+
+err_free:
+ kvfree(insn_state);
+ kvfree(insn_stack);
+ env->cfg.insn_state = env->cfg.insn_stack = NULL;
+ return ret;
+}
+
+/*
+ * For each subprogram 'i' fill array env->cfg.insn_subprogram sub-range
+ * [env->subprog_info[i].postorder_start, env->subprog_info[i+1].postorder_start)
+ * with indices of 'i' instructions in postorder.
+ */
+static int compute_postorder(struct bpf_verifier_env *env)
+{
+ u32 cur_postorder, i, top, stack_sz, s;
+ int *stack = NULL, *postorder = NULL, *state = NULL;
+ struct bpf_iarray *succ;
+
+ postorder = kvcalloc(env->prog->len, sizeof(int), GFP_KERNEL_ACCOUNT);
+ state = kvcalloc(env->prog->len, sizeof(int), GFP_KERNEL_ACCOUNT);
+ stack = kvcalloc(env->prog->len, sizeof(int), GFP_KERNEL_ACCOUNT);
+ if (!postorder || !state || !stack) {
+ kvfree(postorder);
+ kvfree(state);
+ kvfree(stack);
+ return -ENOMEM;
+ }
+ cur_postorder = 0;
+ for (i = 0; i < env->subprog_cnt; i++) {
+ env->subprog_info[i].postorder_start = cur_postorder;
+ stack[0] = env->subprog_info[i].start;
+ stack_sz = 1;
+ do {
+ top = stack[stack_sz - 1];
+ state[top] |= DISCOVERED;
+ if (state[top] & EXPLORED) {
+ postorder[cur_postorder++] = top;
+ stack_sz--;
+ continue;
+ }
+ succ = bpf_insn_successors(env, top);
+ for (s = 0; s < succ->cnt; ++s) {
+ if (!state[succ->items[s]]) {
+ stack[stack_sz++] = succ->items[s];
+ state[succ->items[s]] |= DISCOVERED;
+ }
+ }
+ state[top] |= EXPLORED;
+ } while (stack_sz);
+ }
+ env->subprog_info[i].postorder_start = cur_postorder;
+ env->cfg.insn_postorder = postorder;
+ env->cfg.cur_postorder = cur_postorder;
+ kvfree(stack);
+ kvfree(state);
+ return 0;
+}
+
+static int check_abnormal_return(struct bpf_verifier_env *env)
+{
+ int i;
+
+ for (i = 1; i < env->subprog_cnt; i++) {
+ if (env->subprog_info[i].has_ld_abs) {
+ verbose(env, "LD_ABS is not allowed in subprogs without BTF\n");
+ return -EINVAL;
+ }
+ if (env->subprog_info[i].has_tail_call) {
+ verbose(env, "tail_call is not allowed in subprogs without BTF\n");
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+/* The minimum supported BTF func info size */
+#define MIN_BPF_FUNCINFO_SIZE 8
+#define MAX_FUNCINFO_REC_SIZE 252
+
+static int check_btf_func_early(struct bpf_verifier_env *env,
+ const union bpf_attr *attr,
+ bpfptr_t uattr)
+{
+ u32 krec_size = sizeof(struct bpf_func_info);
+ const struct btf_type *type, *func_proto;
+ u32 i, nfuncs, urec_size, min_size;
+ struct bpf_func_info *krecord;
+ struct bpf_prog *prog;
+ const struct btf *btf;
+ u32 prev_offset = 0;
+ bpfptr_t urecord;
+ int ret = -ENOMEM;
+
+ nfuncs = attr->func_info_cnt;
+ if (!nfuncs) {
+ if (check_abnormal_return(env))
+ return -EINVAL;
+ return 0;
+ }
+
+ urec_size = attr->func_info_rec_size;
+ if (urec_size < MIN_BPF_FUNCINFO_SIZE ||
+ urec_size > MAX_FUNCINFO_REC_SIZE ||
+ urec_size % sizeof(u32)) {
+ verbose(env, "invalid func info rec size %u\n", urec_size);
+ return -EINVAL;
+ }
+
+ prog = env->prog;
+ btf = prog->aux->btf;
- ret = push_insn(t, t + insns[t].off + 1, BRANCH, env);
- if (ret == 1)
- goto peek_stack;
- else if (ret < 0)
+ urecord = make_bpfptr(attr->func_info, uattr.is_kernel);
+ min_size = min_t(u32, krec_size, urec_size);
+
+ krecord = kvcalloc(nfuncs, krec_size, GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
+ if (!krecord)
+ return -ENOMEM;
+
+ for (i = 0; i < nfuncs; i++) {
+ ret = bpf_check_uarg_tail_zero(urecord, krec_size, urec_size);
+ if (ret) {
+ if (ret == -E2BIG) {
+ verbose(env, "nonzero tailing record in func info");
+ /* set the size kernel expects so loader can zero
+ * out the rest of the record.
+ */
+ if (copy_to_bpfptr_offset(uattr,
+ offsetof(union bpf_attr, func_info_rec_size),
+ &min_size, sizeof(min_size)))
+ ret = -EFAULT;
+ }
+ goto err_free;
+ }
+
+ if (copy_from_bpfptr(&krecord[i], urecord, min_size)) {
+ ret = -EFAULT;
+ goto err_free;
+ }
+
+ /* check insn_off */
+ ret = -EINVAL;
+ if (i == 0) {
+ if (krecord[i].insn_off) {
+ verbose(env,
+ "nonzero insn_off %u for the first func info record",
+ krecord[i].insn_off);
goto err_free;
+ }
+ } else if (krecord[i].insn_off <= prev_offset) {
+ verbose(env,
+ "same or smaller insn offset (%u) than previous func info record (%u)",
+ krecord[i].insn_off, prev_offset);
+ goto err_free;
}
- } else {
- /* all other non-branch instructions with single
- * fall-through edge
- */
- ret = push_insn(t, t + 1, FALLTHROUGH, env);
- if (ret == 1)
- goto peek_stack;
- else if (ret < 0)
+
+ /* check type_id */
+ type = btf_type_by_id(btf, krecord[i].type_id);
+ if (!type || !btf_type_is_func(type)) {
+ verbose(env, "invalid type id %d in func info",
+ krecord[i].type_id);
+ goto err_free;
+ }
+
+ func_proto = btf_type_by_id(btf, type->type);
+ if (unlikely(!func_proto || !btf_type_is_func_proto(func_proto)))
+ /* btf_func_check() already verified it during BTF load */
goto err_free;
+
+ prev_offset = krecord[i].insn_off;
+ bpfptr_add(&urecord, urec_size);
}
-mark_explored:
- insn_state[t] = EXPLORED;
- if (cur_stack-- <= 0) {
- verbose("pop stack internal bug\n");
- ret = -EFAULT;
- goto err_free;
+ prog->aux->func_info = krecord;
+ prog->aux->func_info_cnt = nfuncs;
+ return 0;
+
+err_free:
+ kvfree(krecord);
+ return ret;
+}
+
+static int check_btf_func(struct bpf_verifier_env *env,
+ const union bpf_attr *attr,
+ bpfptr_t uattr)
+{
+ const struct btf_type *type, *func_proto, *ret_type;
+ u32 i, nfuncs, urec_size;
+ struct bpf_func_info *krecord;
+ struct bpf_func_info_aux *info_aux = NULL;
+ struct bpf_prog *prog;
+ const struct btf *btf;
+ bpfptr_t urecord;
+ bool scalar_return;
+ int ret = -ENOMEM;
+
+ nfuncs = attr->func_info_cnt;
+ if (!nfuncs) {
+ if (check_abnormal_return(env))
+ return -EINVAL;
+ return 0;
+ }
+ if (nfuncs != env->subprog_cnt) {
+ verbose(env, "number of funcs in func_info doesn't match number of subprogs\n");
+ return -EINVAL;
}
- goto peek_stack;
-check_state:
- for (i = 0; i < insn_cnt; i++) {
- if (insn_state[i] != EXPLORED) {
- verbose("unreachable insn %d\n", i);
- ret = -EINVAL;
+ urec_size = attr->func_info_rec_size;
+
+ prog = env->prog;
+ btf = prog->aux->btf;
+
+ urecord = make_bpfptr(attr->func_info, uattr.is_kernel);
+
+ krecord = prog->aux->func_info;
+ info_aux = kcalloc(nfuncs, sizeof(*info_aux), GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
+ if (!info_aux)
+ return -ENOMEM;
+
+ for (i = 0; i < nfuncs; i++) {
+ /* check insn_off */
+ ret = -EINVAL;
+
+ if (env->subprog_info[i].start != krecord[i].insn_off) {
+ verbose(env, "func_info BTF section doesn't match subprog layout in BPF program\n");
goto err_free;
}
+
+ /* Already checked type_id */
+ type = btf_type_by_id(btf, krecord[i].type_id);
+ info_aux[i].linkage = BTF_INFO_VLEN(type->info);
+ /* Already checked func_proto */
+ func_proto = btf_type_by_id(btf, type->type);
+
+ ret_type = btf_type_skip_modifiers(btf, func_proto->type, NULL);
+ scalar_return =
+ btf_type_is_small_int(ret_type) || btf_is_any_enum(ret_type);
+ if (i && !scalar_return && env->subprog_info[i].has_ld_abs) {
+ verbose(env, "LD_ABS is only allowed in functions that return 'int'.\n");
+ goto err_free;
+ }
+ if (i && !scalar_return && env->subprog_info[i].has_tail_call) {
+ verbose(env, "tail_call is only allowed in functions that return 'int'.\n");
+ goto err_free;
+ }
+
+ bpfptr_add(&urecord, urec_size);
}
- ret = 0; /* cfg looks good */
+
+ prog->aux->func_info_aux = info_aux;
+ return 0;
err_free:
- kfree(insn_state);
- kfree(insn_stack);
+ kfree(info_aux);
return ret;
}
+static void adjust_btf_func(struct bpf_verifier_env *env)
+{
+ struct bpf_prog_aux *aux = env->prog->aux;
+ int i;
+
+ if (!aux->func_info)
+ return;
+
+ /* func_info is not available for hidden subprogs */
+ for (i = 0; i < env->subprog_cnt - env->hidden_subprog_cnt; i++)
+ aux->func_info[i].insn_off = env->subprog_info[i].start;
+}
+
+#define MIN_BPF_LINEINFO_SIZE offsetofend(struct bpf_line_info, line_col)
+#define MAX_LINEINFO_REC_SIZE MAX_FUNCINFO_REC_SIZE
+
+static int check_btf_line(struct bpf_verifier_env *env,
+ const union bpf_attr *attr,
+ bpfptr_t uattr)
+{
+ u32 i, s, nr_linfo, ncopy, expected_size, rec_size, prev_offset = 0;
+ struct bpf_subprog_info *sub;
+ struct bpf_line_info *linfo;
+ struct bpf_prog *prog;
+ const struct btf *btf;
+ bpfptr_t ulinfo;
+ int err;
+
+ nr_linfo = attr->line_info_cnt;
+ if (!nr_linfo)
+ return 0;
+ if (nr_linfo > INT_MAX / sizeof(struct bpf_line_info))
+ return -EINVAL;
+
+ rec_size = attr->line_info_rec_size;
+ if (rec_size < MIN_BPF_LINEINFO_SIZE ||
+ rec_size > MAX_LINEINFO_REC_SIZE ||
+ rec_size & (sizeof(u32) - 1))
+ return -EINVAL;
+
+ /* Need to zero it in case the userspace may
+ * pass in a smaller bpf_line_info object.
+ */
+ linfo = kvcalloc(nr_linfo, sizeof(struct bpf_line_info),
+ GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
+ if (!linfo)
+ return -ENOMEM;
+
+ prog = env->prog;
+ btf = prog->aux->btf;
+
+ s = 0;
+ sub = env->subprog_info;
+ ulinfo = make_bpfptr(attr->line_info, uattr.is_kernel);
+ expected_size = sizeof(struct bpf_line_info);
+ ncopy = min_t(u32, expected_size, rec_size);
+ for (i = 0; i < nr_linfo; i++) {
+ err = bpf_check_uarg_tail_zero(ulinfo, expected_size, rec_size);
+ if (err) {
+ if (err == -E2BIG) {
+ verbose(env, "nonzero tailing record in line_info");
+ if (copy_to_bpfptr_offset(uattr,
+ offsetof(union bpf_attr, line_info_rec_size),
+ &expected_size, sizeof(expected_size)))
+ err = -EFAULT;
+ }
+ goto err_free;
+ }
+
+ if (copy_from_bpfptr(&linfo[i], ulinfo, ncopy)) {
+ err = -EFAULT;
+ goto err_free;
+ }
+
+ /*
+ * Check insn_off to ensure
+ * 1) strictly increasing AND
+ * 2) bounded by prog->len
+ *
+ * The linfo[0].insn_off == 0 check logically falls into
+ * the later "missing bpf_line_info for func..." case
+ * because the first linfo[0].insn_off must be the
+ * first sub also and the first sub must have
+ * subprog_info[0].start == 0.
+ */
+ if ((i && linfo[i].insn_off <= prev_offset) ||
+ linfo[i].insn_off >= prog->len) {
+ verbose(env, "Invalid line_info[%u].insn_off:%u (prev_offset:%u prog->len:%u)\n",
+ i, linfo[i].insn_off, prev_offset,
+ prog->len);
+ err = -EINVAL;
+ goto err_free;
+ }
+
+ if (!prog->insnsi[linfo[i].insn_off].code) {
+ verbose(env,
+ "Invalid insn code at line_info[%u].insn_off\n",
+ i);
+ err = -EINVAL;
+ goto err_free;
+ }
+
+ if (!btf_name_by_offset(btf, linfo[i].line_off) ||
+ !btf_name_by_offset(btf, linfo[i].file_name_off)) {
+ verbose(env, "Invalid line_info[%u].line_off or .file_name_off\n", i);
+ err = -EINVAL;
+ goto err_free;
+ }
+
+ if (s != env->subprog_cnt) {
+ if (linfo[i].insn_off == sub[s].start) {
+ sub[s].linfo_idx = i;
+ s++;
+ } else if (sub[s].start < linfo[i].insn_off) {
+ verbose(env, "missing bpf_line_info for func#%u\n", s);
+ err = -EINVAL;
+ goto err_free;
+ }
+ }
+
+ prev_offset = linfo[i].insn_off;
+ bpfptr_add(&ulinfo, rec_size);
+ }
+
+ if (s != env->subprog_cnt) {
+ verbose(env, "missing bpf_line_info for %u funcs starting from func#%u\n",
+ env->subprog_cnt - s, s);
+ err = -EINVAL;
+ goto err_free;
+ }
+
+ prog->aux->linfo = linfo;
+ prog->aux->nr_linfo = nr_linfo;
+
+ return 0;
+
+err_free:
+ kvfree(linfo);
+ return err;
+}
+
+#define MIN_CORE_RELO_SIZE sizeof(struct bpf_core_relo)
+#define MAX_CORE_RELO_SIZE MAX_FUNCINFO_REC_SIZE
+
+static int check_core_relo(struct bpf_verifier_env *env,
+ const union bpf_attr *attr,
+ bpfptr_t uattr)
+{
+ u32 i, nr_core_relo, ncopy, expected_size, rec_size;
+ struct bpf_core_relo core_relo = {};
+ struct bpf_prog *prog = env->prog;
+ const struct btf *btf = prog->aux->btf;
+ struct bpf_core_ctx ctx = {
+ .log = &env->log,
+ .btf = btf,
+ };
+ bpfptr_t u_core_relo;
+ int err;
+
+ nr_core_relo = attr->core_relo_cnt;
+ if (!nr_core_relo)
+ return 0;
+ if (nr_core_relo > INT_MAX / sizeof(struct bpf_core_relo))
+ return -EINVAL;
+
+ rec_size = attr->core_relo_rec_size;
+ if (rec_size < MIN_CORE_RELO_SIZE ||
+ rec_size > MAX_CORE_RELO_SIZE ||
+ rec_size % sizeof(u32))
+ return -EINVAL;
+
+ u_core_relo = make_bpfptr(attr->core_relos, uattr.is_kernel);
+ expected_size = sizeof(struct bpf_core_relo);
+ ncopy = min_t(u32, expected_size, rec_size);
+
+ /* Unlike func_info and line_info, copy and apply each CO-RE
+ * relocation record one at a time.
+ */
+ for (i = 0; i < nr_core_relo; i++) {
+ /* future proofing when sizeof(bpf_core_relo) changes */
+ err = bpf_check_uarg_tail_zero(u_core_relo, expected_size, rec_size);
+ if (err) {
+ if (err == -E2BIG) {
+ verbose(env, "nonzero tailing record in core_relo");
+ if (copy_to_bpfptr_offset(uattr,
+ offsetof(union bpf_attr, core_relo_rec_size),
+ &expected_size, sizeof(expected_size)))
+ err = -EFAULT;
+ }
+ break;
+ }
+
+ if (copy_from_bpfptr(&core_relo, u_core_relo, ncopy)) {
+ err = -EFAULT;
+ break;
+ }
+
+ if (core_relo.insn_off % 8 || core_relo.insn_off / 8 >= prog->len) {
+ verbose(env, "Invalid core_relo[%u].insn_off:%u prog->len:%u\n",
+ i, core_relo.insn_off, prog->len);
+ err = -EINVAL;
+ break;
+ }
+
+ err = bpf_core_apply(&ctx, &core_relo, i,
+ &prog->insnsi[core_relo.insn_off / 8]);
+ if (err)
+ break;
+ bpfptr_add(&u_core_relo, rec_size);
+ }
+ return err;
+}
+
+static int check_btf_info_early(struct bpf_verifier_env *env,
+ const union bpf_attr *attr,
+ bpfptr_t uattr)
+{
+ struct btf *btf;
+ int err;
+
+ if (!attr->func_info_cnt && !attr->line_info_cnt) {
+ if (check_abnormal_return(env))
+ return -EINVAL;
+ return 0;
+ }
+
+ btf = btf_get_by_fd(attr->prog_btf_fd);
+ if (IS_ERR(btf))
+ return PTR_ERR(btf);
+ if (btf_is_kernel(btf)) {
+ btf_put(btf);
+ return -EACCES;
+ }
+ env->prog->aux->btf = btf;
+
+ err = check_btf_func_early(env, attr, uattr);
+ if (err)
+ return err;
+ return 0;
+}
+
+static int check_btf_info(struct bpf_verifier_env *env,
+ const union bpf_attr *attr,
+ bpfptr_t uattr)
+{
+ int err;
+
+ if (!attr->func_info_cnt && !attr->line_info_cnt) {
+ if (check_abnormal_return(env))
+ return -EINVAL;
+ return 0;
+ }
+
+ err = check_btf_func(env, attr, uattr);
+ if (err)
+ return err;
+
+ err = check_btf_line(env, attr, uattr);
+ if (err)
+ return err;
+
+ err = check_core_relo(env, attr, uattr);
+ if (err)
+ return err;
+
+ return 0;
+}
+
+/* check %cur's range satisfies %old's */
+static bool range_within(const struct bpf_reg_state *old,
+ const struct bpf_reg_state *cur)
+{
+ return old->umin_value <= cur->umin_value &&
+ old->umax_value >= cur->umax_value &&
+ old->smin_value <= cur->smin_value &&
+ old->smax_value >= cur->smax_value &&
+ old->u32_min_value <= cur->u32_min_value &&
+ old->u32_max_value >= cur->u32_max_value &&
+ old->s32_min_value <= cur->s32_min_value &&
+ old->s32_max_value >= cur->s32_max_value;
+}
+
+/* If in the old state two registers had the same id, then they need to have
+ * the same id in the new state as well. But that id could be different from
+ * the old state, so we need to track the mapping from old to new ids.
+ * Once we have seen that, say, a reg with old id 5 had new id 9, any subsequent
+ * regs with old id 5 must also have new id 9 for the new state to be safe. But
+ * regs with a different old id could still have new id 9, we don't care about
+ * that.
+ * So we look through our idmap to see if this old id has been seen before. If
+ * so, we require the new id to match; otherwise, we add the id pair to the map.
+ */
+static bool check_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap)
+{
+ struct bpf_id_pair *map = idmap->map;
+ unsigned int i;
+
+ /* either both IDs should be set or both should be zero */
+ if (!!old_id != !!cur_id)
+ return false;
+
+ if (old_id == 0) /* cur_id == 0 as well */
+ return true;
+
+ for (i = 0; i < BPF_ID_MAP_SIZE; i++) {
+ if (!map[i].old) {
+ /* Reached an empty slot; haven't seen this id before */
+ map[i].old = old_id;
+ map[i].cur = cur_id;
+ return true;
+ }
+ if (map[i].old == old_id)
+ return map[i].cur == cur_id;
+ if (map[i].cur == cur_id)
+ return false;
+ }
+ /* We ran out of idmap slots, which should be impossible */
+ WARN_ON_ONCE(1);
+ return false;
+}
+
+/* Similar to check_ids(), but allocate a unique temporary ID
+ * for 'old_id' or 'cur_id' of zero.
+ * This makes pairs like '0 vs unique ID', 'unique ID vs 0' valid.
+ */
+static bool check_scalar_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap)
+{
+ old_id = old_id ? old_id : ++idmap->tmp_id_gen;
+ cur_id = cur_id ? cur_id : ++idmap->tmp_id_gen;
+
+ return check_ids(old_id, cur_id, idmap);
+}
+
+static void clean_func_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *st,
+ u32 ip)
+{
+ u16 live_regs = env->insn_aux_data[ip].live_regs_before;
+ int i, j;
+
+ for (i = 0; i < BPF_REG_FP; i++) {
+ /* liveness must not touch this register anymore */
+ if (!(live_regs & BIT(i)))
+ /* since the register is unused, clear its state
+ * to make further comparison simpler
+ */
+ __mark_reg_not_init(env, &st->regs[i]);
+ }
+
+ for (i = 0; i < st->allocated_stack / BPF_REG_SIZE; i++) {
+ if (!bpf_stack_slot_alive(env, st->frameno, i)) {
+ __mark_reg_not_init(env, &st->stack[i].spilled_ptr);
+ for (j = 0; j < BPF_REG_SIZE; j++)
+ st->stack[i].slot_type[j] = STACK_INVALID;
+ }
+ }
+}
+
+static void clean_verifier_state(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *st)
+{
+ int i, ip;
+
+ bpf_live_stack_query_init(env, st);
+ st->cleaned = true;
+ for (i = 0; i <= st->curframe; i++) {
+ ip = frame_insn_idx(st, i);
+ clean_func_state(env, st->frame[i], ip);
+ }
+}
+
+/* the parentage chains form a tree.
+ * the verifier states are added to state lists at given insn and
+ * pushed into state stack for future exploration.
+ * when the verifier reaches bpf_exit insn some of the verifier states
+ * stored in the state lists have their final liveness state already,
+ * but a lot of states will get revised from liveness point of view when
+ * the verifier explores other branches.
+ * Example:
+ * 1: *(u64)(r10 - 8) = 1
+ * 2: if r1 == 100 goto pc+1
+ * 3: *(u64)(r10 - 8) = 2
+ * 4: r0 = *(u64)(r10 - 8)
+ * 5: exit
+ * when the verifier reaches exit insn the stack slot -8 in the state list of
+ * insn 2 is not yet marked alive. Then the verifier pops the other_branch
+ * of insn 2 and goes exploring further. After the insn 4 read, liveness
+ * analysis would propagate read mark for -8 at insn 2.
+ *
+ * Since the verifier pushes the branch states as it sees them while exploring
+ * the program the condition of walking the branch instruction for the second
+ * time means that all states below this branch were already explored and
+ * their final liveness marks are already propagated.
+ * Hence when the verifier completes the search of state list in is_state_visited()
+ * we can call this clean_live_states() function to clear dead the registers and stack
+ * slots to simplify state merging.
+ *
+ * Important note here that walking the same branch instruction in the callee
+ * doesn't meant that the states are DONE. The verifier has to compare
+ * the callsites
+ */
+static void clean_live_states(struct bpf_verifier_env *env, int insn,
+ struct bpf_verifier_state *cur)
+{
+ struct bpf_verifier_state_list *sl;
+ struct list_head *pos, *head;
+
+ head = explored_state(env, insn);
+ list_for_each(pos, head) {
+ sl = container_of(pos, struct bpf_verifier_state_list, node);
+ if (sl->state.branches)
+ continue;
+ if (sl->state.insn_idx != insn ||
+ !same_callsites(&sl->state, cur))
+ continue;
+ if (sl->state.cleaned)
+ /* all regs in this state in all frames were already marked */
+ continue;
+ if (incomplete_read_marks(env, &sl->state))
+ continue;
+ clean_verifier_state(env, &sl->state);
+ }
+}
+
+static bool regs_exact(const struct bpf_reg_state *rold,
+ const struct bpf_reg_state *rcur,
+ struct bpf_idmap *idmap)
+{
+ return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
+ check_ids(rold->id, rcur->id, idmap) &&
+ check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap);
+}
+
+enum exact_level {
+ NOT_EXACT,
+ EXACT,
+ RANGE_WITHIN
+};
+
+/* Returns true if (rold safe implies rcur safe) */
+static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
+ struct bpf_reg_state *rcur, struct bpf_idmap *idmap,
+ enum exact_level exact)
+{
+ if (exact == EXACT)
+ return regs_exact(rold, rcur, idmap);
+
+ if (rold->type == NOT_INIT) {
+ if (exact == NOT_EXACT || rcur->type == NOT_INIT)
+ /* explored state can't have used this */
+ return true;
+ }
+
+ /* Enforce that register types have to match exactly, including their
+ * modifiers (like PTR_MAYBE_NULL, MEM_RDONLY, etc), as a general
+ * rule.
+ *
+ * One can make a point that using a pointer register as unbounded
+ * SCALAR would be technically acceptable, but this could lead to
+ * pointer leaks because scalars are allowed to leak while pointers
+ * are not. We could make this safe in special cases if root is
+ * calling us, but it's probably not worth the hassle.
+ *
+ * Also, register types that are *not* MAYBE_NULL could technically be
+ * safe to use as their MAYBE_NULL variants (e.g., PTR_TO_MAP_VALUE
+ * is safe to be used as PTR_TO_MAP_VALUE_OR_NULL, provided both point
+ * to the same map).
+ * However, if the old MAYBE_NULL register then got NULL checked,
+ * doing so could have affected others with the same id, and we can't
+ * check for that because we lost the id when we converted to
+ * a non-MAYBE_NULL variant.
+ * So, as a general rule we don't allow mixing MAYBE_NULL and
+ * non-MAYBE_NULL registers as well.
+ */
+ if (rold->type != rcur->type)
+ return false;
+
+ switch (base_type(rold->type)) {
+ case SCALAR_VALUE:
+ if (env->explore_alu_limits) {
+ /* explore_alu_limits disables tnum_in() and range_within()
+ * logic and requires everything to be strict
+ */
+ return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
+ check_scalar_ids(rold->id, rcur->id, idmap);
+ }
+ if (!rold->precise && exact == NOT_EXACT)
+ return true;
+ if ((rold->id & BPF_ADD_CONST) != (rcur->id & BPF_ADD_CONST))
+ return false;
+ if ((rold->id & BPF_ADD_CONST) && (rold->off != rcur->off))
+ return false;
+ /* Why check_ids() for scalar registers?
+ *
+ * Consider the following BPF code:
+ * 1: r6 = ... unbound scalar, ID=a ...
+ * 2: r7 = ... unbound scalar, ID=b ...
+ * 3: if (r6 > r7) goto +1
+ * 4: r6 = r7
+ * 5: if (r6 > X) goto ...
+ * 6: ... memory operation using r7 ...
+ *
+ * First verification path is [1-6]:
+ * - at (4) same bpf_reg_state::id (b) would be assigned to r6 and r7;
+ * - at (5) r6 would be marked <= X, sync_linked_regs() would also mark
+ * r7 <= X, because r6 and r7 share same id.
+ * Next verification path is [1-4, 6].
+ *
+ * Instruction (6) would be reached in two states:
+ * I. r6{.id=b}, r7{.id=b} via path 1-6;
+ * II. r6{.id=a}, r7{.id=b} via path 1-4, 6.
+ *
+ * Use check_ids() to distinguish these states.
+ * ---
+ * Also verify that new value satisfies old value range knowledge.
+ */
+ return range_within(rold, rcur) &&
+ tnum_in(rold->var_off, rcur->var_off) &&
+ check_scalar_ids(rold->id, rcur->id, idmap);
+ case PTR_TO_MAP_KEY:
+ case PTR_TO_MAP_VALUE:
+ case PTR_TO_MEM:
+ case PTR_TO_BUF:
+ case PTR_TO_TP_BUFFER:
+ /* If the new min/max/var_off satisfy the old ones and
+ * everything else matches, we are OK.
+ */
+ return memcmp(rold, rcur, offsetof(struct bpf_reg_state, var_off)) == 0 &&
+ range_within(rold, rcur) &&
+ tnum_in(rold->var_off, rcur->var_off) &&
+ check_ids(rold->id, rcur->id, idmap) &&
+ check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap);
+ case PTR_TO_PACKET_META:
+ case PTR_TO_PACKET:
+ /* We must have at least as much range as the old ptr
+ * did, so that any accesses which were safe before are
+ * still safe. This is true even if old range < old off,
+ * since someone could have accessed through (ptr - k), or
+ * even done ptr -= k in a register, to get a safe access.
+ */
+ if (rold->range > rcur->range)
+ return false;
+ /* If the offsets don't match, we can't trust our alignment;
+ * nor can we be sure that we won't fall out of range.
+ */
+ if (rold->off != rcur->off)
+ return false;
+ /* id relations must be preserved */
+ if (!check_ids(rold->id, rcur->id, idmap))
+ return false;
+ /* new val must satisfy old val knowledge */
+ return range_within(rold, rcur) &&
+ tnum_in(rold->var_off, rcur->var_off);
+ case PTR_TO_STACK:
+ /* two stack pointers are equal only if they're pointing to
+ * the same stack frame, since fp-8 in foo != fp-8 in bar
+ */
+ return regs_exact(rold, rcur, idmap) && rold->frameno == rcur->frameno;
+ case PTR_TO_ARENA:
+ return true;
+ case PTR_TO_INSN:
+ return memcmp(rold, rcur, offsetof(struct bpf_reg_state, var_off)) == 0 &&
+ rold->off == rcur->off && range_within(rold, rcur) &&
+ tnum_in(rold->var_off, rcur->var_off);
+ default:
+ return regs_exact(rold, rcur, idmap);
+ }
+}
+
+static struct bpf_reg_state unbound_reg;
+
+static __init int unbound_reg_init(void)
+{
+ __mark_reg_unknown_imprecise(&unbound_reg);
+ return 0;
+}
+late_initcall(unbound_reg_init);
+
+static bool is_stack_all_misc(struct bpf_verifier_env *env,
+ struct bpf_stack_state *stack)
+{
+ u32 i;
+
+ for (i = 0; i < ARRAY_SIZE(stack->slot_type); ++i) {
+ if ((stack->slot_type[i] == STACK_MISC) ||
+ (stack->slot_type[i] == STACK_INVALID && env->allow_uninit_stack))
+ continue;
+ return false;
+ }
+
+ return true;
+}
+
+static struct bpf_reg_state *scalar_reg_for_stack(struct bpf_verifier_env *env,
+ struct bpf_stack_state *stack)
+{
+ if (is_spilled_scalar_reg64(stack))
+ return &stack->spilled_ptr;
+
+ if (is_stack_all_misc(env, stack))
+ return &unbound_reg;
+
+ return NULL;
+}
+
+static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
+ struct bpf_func_state *cur, struct bpf_idmap *idmap,
+ enum exact_level exact)
+{
+ int i, spi;
+
+ /* walk slots of the explored stack and ignore any additional
+ * slots in the current stack, since explored(safe) state
+ * didn't use them
+ */
+ for (i = 0; i < old->allocated_stack; i++) {
+ struct bpf_reg_state *old_reg, *cur_reg;
+
+ spi = i / BPF_REG_SIZE;
+
+ if (exact != NOT_EXACT &&
+ (i >= cur->allocated_stack ||
+ old->stack[spi].slot_type[i % BPF_REG_SIZE] !=
+ cur->stack[spi].slot_type[i % BPF_REG_SIZE]))
+ return false;
+
+ if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID)
+ continue;
+
+ if (env->allow_uninit_stack &&
+ old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_MISC)
+ continue;
+
+ /* explored stack has more populated slots than current stack
+ * and these slots were used
+ */
+ if (i >= cur->allocated_stack)
+ return false;
+
+ /* 64-bit scalar spill vs all slots MISC and vice versa.
+ * Load from all slots MISC produces unbound scalar.
+ * Construct a fake register for such stack and call
+ * regsafe() to ensure scalar ids are compared.
+ */
+ old_reg = scalar_reg_for_stack(env, &old->stack[spi]);
+ cur_reg = scalar_reg_for_stack(env, &cur->stack[spi]);
+ if (old_reg && cur_reg) {
+ if (!regsafe(env, old_reg, cur_reg, idmap, exact))
+ return false;
+ i += BPF_REG_SIZE - 1;
+ continue;
+ }
+
+ /* if old state was safe with misc data in the stack
+ * it will be safe with zero-initialized stack.
+ * The opposite is not true
+ */
+ if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_MISC &&
+ cur->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_ZERO)
+ continue;
+ if (old->stack[spi].slot_type[i % BPF_REG_SIZE] !=
+ cur->stack[spi].slot_type[i % BPF_REG_SIZE])
+ /* Ex: old explored (safe) state has STACK_SPILL in
+ * this stack slot, but current has STACK_MISC ->
+ * this verifier states are not equivalent,
+ * return false to continue verification of this path
+ */
+ return false;
+ if (i % BPF_REG_SIZE != BPF_REG_SIZE - 1)
+ continue;
+ /* Both old and cur are having same slot_type */
+ switch (old->stack[spi].slot_type[BPF_REG_SIZE - 1]) {
+ case STACK_SPILL:
+ /* when explored and current stack slot are both storing
+ * spilled registers, check that stored pointers types
+ * are the same as well.
+ * Ex: explored safe path could have stored
+ * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -8}
+ * but current path has stored:
+ * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -16}
+ * such verifier states are not equivalent.
+ * return false to continue verification of this path
+ */
+ if (!regsafe(env, &old->stack[spi].spilled_ptr,
+ &cur->stack[spi].spilled_ptr, idmap, exact))
+ return false;
+ break;
+ case STACK_DYNPTR:
+ old_reg = &old->stack[spi].spilled_ptr;
+ cur_reg = &cur->stack[spi].spilled_ptr;
+ if (old_reg->dynptr.type != cur_reg->dynptr.type ||
+ old_reg->dynptr.first_slot != cur_reg->dynptr.first_slot ||
+ !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap))
+ return false;
+ break;
+ case STACK_ITER:
+ old_reg = &old->stack[spi].spilled_ptr;
+ cur_reg = &cur->stack[spi].spilled_ptr;
+ /* iter.depth is not compared between states as it
+ * doesn't matter for correctness and would otherwise
+ * prevent convergence; we maintain it only to prevent
+ * infinite loop check triggering, see
+ * iter_active_depths_differ()
+ */
+ if (old_reg->iter.btf != cur_reg->iter.btf ||
+ old_reg->iter.btf_id != cur_reg->iter.btf_id ||
+ old_reg->iter.state != cur_reg->iter.state ||
+ /* ignore {old_reg,cur_reg}->iter.depth, see above */
+ !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap))
+ return false;
+ break;
+ case STACK_IRQ_FLAG:
+ old_reg = &old->stack[spi].spilled_ptr;
+ cur_reg = &cur->stack[spi].spilled_ptr;
+ if (!check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap) ||
+ old_reg->irq.kfunc_class != cur_reg->irq.kfunc_class)
+ return false;
+ break;
+ case STACK_MISC:
+ case STACK_ZERO:
+ case STACK_INVALID:
+ continue;
+ /* Ensure that new unhandled slot types return false by default */
+ default:
+ return false;
+ }
+ }
+ return true;
+}
+
+static bool refsafe(struct bpf_verifier_state *old, struct bpf_verifier_state *cur,
+ struct bpf_idmap *idmap)
+{
+ int i;
+
+ if (old->acquired_refs != cur->acquired_refs)
+ return false;
+
+ if (old->active_locks != cur->active_locks)
+ return false;
+
+ if (old->active_preempt_locks != cur->active_preempt_locks)
+ return false;
+
+ if (old->active_rcu_locks != cur->active_rcu_locks)
+ return false;
+
+ if (!check_ids(old->active_irq_id, cur->active_irq_id, idmap))
+ return false;
+
+ if (!check_ids(old->active_lock_id, cur->active_lock_id, idmap) ||
+ old->active_lock_ptr != cur->active_lock_ptr)
+ return false;
+
+ for (i = 0; i < old->acquired_refs; i++) {
+ if (!check_ids(old->refs[i].id, cur->refs[i].id, idmap) ||
+ old->refs[i].type != cur->refs[i].type)
+ return false;
+ switch (old->refs[i].type) {
+ case REF_TYPE_PTR:
+ case REF_TYPE_IRQ:
+ break;
+ case REF_TYPE_LOCK:
+ case REF_TYPE_RES_LOCK:
+ case REF_TYPE_RES_LOCK_IRQ:
+ if (old->refs[i].ptr != cur->refs[i].ptr)
+ return false;
+ break;
+ default:
+ WARN_ONCE(1, "Unhandled enum type for reference state: %d\n", old->refs[i].type);
+ return false;
+ }
+ }
+
+ return true;
+}
+
/* compare two verifier states
*
* all states stored in state_list are known to be valid, since
@@ -1475,419 +19442,1734 @@ err_free:
* whereas register type in current state is meaningful, it means that
* the current state will reach 'bpf_exit' instruction safely
*/
-static bool states_equal(struct verifier_state *old, struct verifier_state *cur)
+static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_state *old,
+ struct bpf_func_state *cur, u32 insn_idx, enum exact_level exact)
+{
+ u16 live_regs = env->insn_aux_data[insn_idx].live_regs_before;
+ u16 i;
+
+ if (old->callback_depth > cur->callback_depth)
+ return false;
+
+ for (i = 0; i < MAX_BPF_REG; i++)
+ if (((1 << i) & live_regs) &&
+ !regsafe(env, &old->regs[i], &cur->regs[i],
+ &env->idmap_scratch, exact))
+ return false;
+
+ if (!stacksafe(env, old, cur, &env->idmap_scratch, exact))
+ return false;
+
+ return true;
+}
+
+static void reset_idmap_scratch(struct bpf_verifier_env *env)
{
+ env->idmap_scratch.tmp_id_gen = env->id_gen;
+ memset(&env->idmap_scratch.map, 0, sizeof(env->idmap_scratch.map));
+}
+
+static bool states_equal(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *old,
+ struct bpf_verifier_state *cur,
+ enum exact_level exact)
+{
+ u32 insn_idx;
int i;
- for (i = 0; i < MAX_BPF_REG; i++) {
- if (memcmp(&old->regs[i], &cur->regs[i],
- sizeof(old->regs[0])) != 0) {
- if (old->regs[i].type == NOT_INIT ||
- (old->regs[i].type == UNKNOWN_VALUE &&
- cur->regs[i].type != NOT_INIT))
- continue;
+ if (old->curframe != cur->curframe)
+ return false;
+
+ reset_idmap_scratch(env);
+
+ /* Verification state from speculative execution simulation
+ * must never prune a non-speculative execution one.
+ */
+ if (old->speculative && !cur->speculative)
+ return false;
+
+ if (old->in_sleepable != cur->in_sleepable)
+ return false;
+
+ if (!refsafe(old, cur, &env->idmap_scratch))
+ return false;
+
+ /* for states to be equal callsites have to be the same
+ * and all frame states need to be equivalent
+ */
+ for (i = 0; i <= old->curframe; i++) {
+ insn_idx = frame_insn_idx(old, i);
+ if (old->frame[i]->callsite != cur->frame[i]->callsite)
+ return false;
+ if (!func_states_equal(env, old->frame[i], cur->frame[i], insn_idx, exact))
return false;
+ }
+ return true;
+}
+
+/* find precise scalars in the previous equivalent state and
+ * propagate them into the current state
+ */
+static int propagate_precision(struct bpf_verifier_env *env,
+ const struct bpf_verifier_state *old,
+ struct bpf_verifier_state *cur,
+ bool *changed)
+{
+ struct bpf_reg_state *state_reg;
+ struct bpf_func_state *state;
+ int i, err = 0, fr;
+ bool first;
+
+ for (fr = old->curframe; fr >= 0; fr--) {
+ state = old->frame[fr];
+ state_reg = state->regs;
+ first = true;
+ for (i = 0; i < BPF_REG_FP; i++, state_reg++) {
+ if (state_reg->type != SCALAR_VALUE ||
+ !state_reg->precise)
+ continue;
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ if (first)
+ verbose(env, "frame %d: propagating r%d", fr, i);
+ else
+ verbose(env, ",r%d", i);
+ }
+ bt_set_frame_reg(&env->bt, fr, i);
+ first = false;
}
+
+ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+ if (!is_spilled_reg(&state->stack[i]))
+ continue;
+ state_reg = &state->stack[i].spilled_ptr;
+ if (state_reg->type != SCALAR_VALUE ||
+ !state_reg->precise)
+ continue;
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ if (first)
+ verbose(env, "frame %d: propagating fp%d",
+ fr, (-i - 1) * BPF_REG_SIZE);
+ else
+ verbose(env, ",fp%d", (-i - 1) * BPF_REG_SIZE);
+ }
+ bt_set_frame_slot(&env->bt, fr, i);
+ first = false;
+ }
+ if (!first && (env->log.level & BPF_LOG_LEVEL2))
+ verbose(env, "\n");
}
- for (i = 0; i < MAX_BPF_STACK; i++) {
- if (old->stack_slot_type[i] == STACK_INVALID)
- continue;
- if (old->stack_slot_type[i] != cur->stack_slot_type[i])
- /* Ex: old explored (safe) state has STACK_SPILL in
- * this stack slot, but current has has STACK_MISC ->
- * this verifier states are not equivalent,
- * return false to continue verification of this path
- */
- return false;
- if (i % BPF_REG_SIZE)
- continue;
- if (memcmp(&old->spilled_regs[i / BPF_REG_SIZE],
- &cur->spilled_regs[i / BPF_REG_SIZE],
- sizeof(old->spilled_regs[0])))
- /* when explored and current stack slot types are
- * the same, check that stored pointers types
- * are the same as well.
- * Ex: explored safe path could have stored
- * (struct reg_state) {.type = PTR_TO_STACK, .imm = -8}
- * but current path has stored:
- * (struct reg_state) {.type = PTR_TO_STACK, .imm = -16}
- * such verifier states are not equivalent.
- * return false to continue verification of this path
- */
+ err = __mark_chain_precision(env, cur, -1, changed);
+ if (err < 0)
+ return err;
+
+ return 0;
+}
+
+#define MAX_BACKEDGE_ITERS 64
+
+/* Propagate read and precision marks from visit->backedges[*].state->equal_state
+ * to corresponding parent states of visit->backedges[*].state until fixed point is reached,
+ * then free visit->backedges.
+ * After execution of this function incomplete_read_marks() will return false
+ * for all states corresponding to @visit->callchain.
+ */
+static int propagate_backedges(struct bpf_verifier_env *env, struct bpf_scc_visit *visit)
+{
+ struct bpf_scc_backedge *backedge;
+ struct bpf_verifier_state *st;
+ bool changed;
+ int i, err;
+
+ i = 0;
+ do {
+ if (i++ > MAX_BACKEDGE_ITERS) {
+ if (env->log.level & BPF_LOG_LEVEL2)
+ verbose(env, "%s: too many iterations\n", __func__);
+ for (backedge = visit->backedges; backedge; backedge = backedge->next)
+ mark_all_scalars_precise(env, &backedge->state);
+ break;
+ }
+ changed = false;
+ for (backedge = visit->backedges; backedge; backedge = backedge->next) {
+ st = &backedge->state;
+ err = propagate_precision(env, st->equal_state, st, &changed);
+ if (err)
+ return err;
+ }
+ } while (changed);
+
+ free_backedges(visit);
+ return 0;
+}
+
+static bool states_maybe_looping(struct bpf_verifier_state *old,
+ struct bpf_verifier_state *cur)
+{
+ struct bpf_func_state *fold, *fcur;
+ int i, fr = cur->curframe;
+
+ if (old->curframe != fr)
+ return false;
+
+ fold = old->frame[fr];
+ fcur = cur->frame[fr];
+ for (i = 0; i < MAX_BPF_REG; i++)
+ if (memcmp(&fold->regs[i], &fcur->regs[i],
+ offsetof(struct bpf_reg_state, frameno)))
return false;
- else
- continue;
- }
return true;
}
-static int is_state_visited(struct verifier_env *env, int insn_idx)
+static bool is_iter_next_insn(struct bpf_verifier_env *env, int insn_idx)
+{
+ return env->insn_aux_data[insn_idx].is_iter_next;
+}
+
+/* is_state_visited() handles iter_next() (see process_iter_next_call() for
+ * terminology) calls specially: as opposed to bounded BPF loops, it *expects*
+ * states to match, which otherwise would look like an infinite loop. So while
+ * iter_next() calls are taken care of, we still need to be careful and
+ * prevent erroneous and too eager declaration of "infinite loop", when
+ * iterators are involved.
+ *
+ * Here's a situation in pseudo-BPF assembly form:
+ *
+ * 0: again: ; set up iter_next() call args
+ * 1: r1 = &it ; <CHECKPOINT HERE>
+ * 2: call bpf_iter_num_next ; this is iter_next() call
+ * 3: if r0 == 0 goto done
+ * 4: ... something useful here ...
+ * 5: goto again ; another iteration
+ * 6: done:
+ * 7: r1 = &it
+ * 8: call bpf_iter_num_destroy ; clean up iter state
+ * 9: exit
+ *
+ * This is a typical loop. Let's assume that we have a prune point at 1:,
+ * before we get to `call bpf_iter_num_next` (e.g., because of that `goto
+ * again`, assuming other heuristics don't get in a way).
+ *
+ * When we first time come to 1:, let's say we have some state X. We proceed
+ * to 2:, fork states, enqueue ACTIVE, validate NULL case successfully, exit.
+ * Now we come back to validate that forked ACTIVE state. We proceed through
+ * 3-5, come to goto, jump to 1:. Let's assume our state didn't change, so we
+ * are converging. But the problem is that we don't know that yet, as this
+ * convergence has to happen at iter_next() call site only. So if nothing is
+ * done, at 1: verifier will use bounded loop logic and declare infinite
+ * looping (and would be *technically* correct, if not for iterator's
+ * "eventual sticky NULL" contract, see process_iter_next_call()). But we
+ * don't want that. So what we do in process_iter_next_call() when we go on
+ * another ACTIVE iteration, we bump slot->iter.depth, to mark that it's
+ * a different iteration. So when we suspect an infinite loop, we additionally
+ * check if any of the *ACTIVE* iterator states depths differ. If yes, we
+ * pretend we are not looping and wait for next iter_next() call.
+ *
+ * This only applies to ACTIVE state. In DRAINED state we don't expect to
+ * loop, because that would actually mean infinite loop, as DRAINED state is
+ * "sticky", and so we'll keep returning into the same instruction with the
+ * same state (at least in one of possible code paths).
+ *
+ * This approach allows to keep infinite loop heuristic even in the face of
+ * active iterator. E.g., C snippet below is and will be detected as
+ * infinitely looping:
+ *
+ * struct bpf_iter_num it;
+ * int *p, x;
+ *
+ * bpf_iter_num_new(&it, 0, 10);
+ * while ((p = bpf_iter_num_next(&t))) {
+ * x = p;
+ * while (x--) {} // <<-- infinite loop here
+ * }
+ *
+ */
+static bool iter_active_depths_differ(struct bpf_verifier_state *old, struct bpf_verifier_state *cur)
{
- struct verifier_state_list *new_sl;
- struct verifier_state_list *sl;
+ struct bpf_reg_state *slot, *cur_slot;
+ struct bpf_func_state *state;
+ int i, fr;
+
+ for (fr = old->curframe; fr >= 0; fr--) {
+ state = old->frame[fr];
+ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+ if (state->stack[i].slot_type[0] != STACK_ITER)
+ continue;
- sl = env->explored_states[insn_idx];
- if (!sl)
- /* this 'insn_idx' instruction wasn't marked, so we will not
- * be doing state search here
- */
- return 0;
+ slot = &state->stack[i].spilled_ptr;
+ if (slot->iter.state != BPF_ITER_STATE_ACTIVE)
+ continue;
+
+ cur_slot = &cur->frame[fr]->stack[i].spilled_ptr;
+ if (cur_slot->iter.depth != slot->iter.depth)
+ return true;
+ }
+ }
+ return false;
+}
- while (sl != STATE_LIST_MARK) {
- if (states_equal(&sl->state, &env->cur_state))
- /* reached equivalent register/stack state,
- * prune the search
+static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
+{
+ struct bpf_verifier_state_list *new_sl;
+ struct bpf_verifier_state_list *sl;
+ struct bpf_verifier_state *cur = env->cur_state, *new;
+ bool force_new_state, add_new_state, loop;
+ int n, err, states_cnt = 0;
+ struct list_head *pos, *tmp, *head;
+
+ force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx) ||
+ /* Avoid accumulating infinitely long jmp history */
+ cur->jmp_history_cnt > 40;
+
+ /* bpf progs typically have pruning point every 4 instructions
+ * http://vger.kernel.org/bpfconf2019.html#session-1
+ * Do not add new state for future pruning if the verifier hasn't seen
+ * at least 2 jumps and at least 8 instructions.
+ * This heuristics helps decrease 'total_states' and 'peak_states' metric.
+ * In tests that amounts to up to 50% reduction into total verifier
+ * memory consumption and 20% verifier time speedup.
+ */
+ add_new_state = force_new_state;
+ if (env->jmps_processed - env->prev_jmps_processed >= 2 &&
+ env->insn_processed - env->prev_insn_processed >= 8)
+ add_new_state = true;
+
+ clean_live_states(env, insn_idx, cur);
+
+ loop = false;
+ head = explored_state(env, insn_idx);
+ list_for_each_safe(pos, tmp, head) {
+ sl = container_of(pos, struct bpf_verifier_state_list, node);
+ states_cnt++;
+ if (sl->state.insn_idx != insn_idx)
+ continue;
+
+ if (sl->state.branches) {
+ struct bpf_func_state *frame = sl->state.frame[sl->state.curframe];
+
+ if (frame->in_async_callback_fn &&
+ frame->async_entry_cnt != cur->frame[cur->curframe]->async_entry_cnt) {
+ /* Different async_entry_cnt means that the verifier is
+ * processing another entry into async callback.
+ * Seeing the same state is not an indication of infinite
+ * loop or infinite recursion.
+ * But finding the same state doesn't mean that it's safe
+ * to stop processing the current state. The previous state
+ * hasn't yet reached bpf_exit, since state.branches > 0.
+ * Checking in_async_callback_fn alone is not enough either.
+ * Since the verifier still needs to catch infinite loops
+ * inside async callbacks.
+ */
+ goto skip_inf_loop_check;
+ }
+ /* BPF open-coded iterators loop detection is special.
+ * states_maybe_looping() logic is too simplistic in detecting
+ * states that *might* be equivalent, because it doesn't know
+ * about ID remapping, so don't even perform it.
+ * See process_iter_next_call() and iter_active_depths_differ()
+ * for overview of the logic. When current and one of parent
+ * states are detected as equivalent, it's a good thing: we prove
+ * convergence and can stop simulating further iterations.
+ * It's safe to assume that iterator loop will finish, taking into
+ * account iter_next() contract of eventually returning
+ * sticky NULL result.
+ *
+ * Note, that states have to be compared exactly in this case because
+ * read and precision marks might not be finalized inside the loop.
+ * E.g. as in the program below:
+ *
+ * 1. r7 = -16
+ * 2. r6 = bpf_get_prandom_u32()
+ * 3. while (bpf_iter_num_next(&fp[-8])) {
+ * 4. if (r6 != 42) {
+ * 5. r7 = -32
+ * 6. r6 = bpf_get_prandom_u32()
+ * 7. continue
+ * 8. }
+ * 9. r0 = r10
+ * 10. r0 += r7
+ * 11. r8 = *(u64 *)(r0 + 0)
+ * 12. r6 = bpf_get_prandom_u32()
+ * 13. }
+ *
+ * Here verifier would first visit path 1-3, create a checkpoint at 3
+ * with r7=-16, continue to 4-7,3. Existing checkpoint at 3 does
+ * not have read or precision mark for r7 yet, thus inexact states
+ * comparison would discard current state with r7=-32
+ * => unsafe memory access at 11 would not be caught.
+ */
+ if (is_iter_next_insn(env, insn_idx)) {
+ if (states_equal(env, &sl->state, cur, RANGE_WITHIN)) {
+ struct bpf_func_state *cur_frame;
+ struct bpf_reg_state *iter_state, *iter_reg;
+ int spi;
+
+ cur_frame = cur->frame[cur->curframe];
+ /* btf_check_iter_kfuncs() enforces that
+ * iter state pointer is always the first arg
+ */
+ iter_reg = &cur_frame->regs[BPF_REG_1];
+ /* current state is valid due to states_equal(),
+ * so we can assume valid iter and reg state,
+ * no need for extra (re-)validations
+ */
+ spi = __get_spi(iter_reg->off + iter_reg->var_off.value);
+ iter_state = &func(env, iter_reg)->stack[spi].spilled_ptr;
+ if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE) {
+ loop = true;
+ goto hit;
+ }
+ }
+ goto skip_inf_loop_check;
+ }
+ if (is_may_goto_insn_at(env, insn_idx)) {
+ if (sl->state.may_goto_depth != cur->may_goto_depth &&
+ states_equal(env, &sl->state, cur, RANGE_WITHIN)) {
+ loop = true;
+ goto hit;
+ }
+ }
+ if (bpf_calls_callback(env, insn_idx)) {
+ if (states_equal(env, &sl->state, cur, RANGE_WITHIN))
+ goto hit;
+ goto skip_inf_loop_check;
+ }
+ /* attempt to detect infinite loop to avoid unnecessary doomed work */
+ if (states_maybe_looping(&sl->state, cur) &&
+ states_equal(env, &sl->state, cur, EXACT) &&
+ !iter_active_depths_differ(&sl->state, cur) &&
+ sl->state.may_goto_depth == cur->may_goto_depth &&
+ sl->state.callback_unroll_depth == cur->callback_unroll_depth) {
+ verbose_linfo(env, insn_idx, "; ");
+ verbose(env, "infinite loop detected at insn %d\n", insn_idx);
+ verbose(env, "cur state:");
+ print_verifier_state(env, cur, cur->curframe, true);
+ verbose(env, "old state:");
+ print_verifier_state(env, &sl->state, cur->curframe, true);
+ return -EINVAL;
+ }
+ /* if the verifier is processing a loop, avoid adding new state
+ * too often, since different loop iterations have distinct
+ * states and may not help future pruning.
+ * This threshold shouldn't be too low to make sure that
+ * a loop with large bound will be rejected quickly.
+ * The most abusive loop will be:
+ * r1 += 1
+ * if r1 < 1000000 goto pc-2
+ * 1M insn_procssed limit / 100 == 10k peak states.
+ * This threshold shouldn't be too high either, since states
+ * at the end of the loop are likely to be useful in pruning.
*/
+skip_inf_loop_check:
+ if (!force_new_state &&
+ env->jmps_processed - env->prev_jmps_processed < 20 &&
+ env->insn_processed - env->prev_insn_processed < 100)
+ add_new_state = false;
+ goto miss;
+ }
+ /* See comments for mark_all_regs_read_and_precise() */
+ loop = incomplete_read_marks(env, &sl->state);
+ if (states_equal(env, &sl->state, cur, loop ? RANGE_WITHIN : NOT_EXACT)) {
+hit:
+ sl->hit_cnt++;
+
+ /* if previous state reached the exit with precision and
+ * current state is equivalent to it (except precision marks)
+ * the precision needs to be propagated back in
+ * the current state.
+ */
+ err = 0;
+ if (is_jmp_point(env, env->insn_idx))
+ err = push_jmp_history(env, cur, 0, 0);
+ err = err ? : propagate_precision(env, &sl->state, cur, NULL);
+ if (err)
+ return err;
+ /* When processing iterator based loops above propagate_liveness and
+ * propagate_precision calls are not sufficient to transfer all relevant
+ * read and precision marks. E.g. consider the following case:
+ *
+ * .-> A --. Assume the states are visited in the order A, B, C.
+ * | | | Assume that state B reaches a state equivalent to state A.
+ * | v v At this point, state C is not processed yet, so state A
+ * '-- B C has not received any read or precision marks from C.
+ * Thus, marks propagated from A to B are incomplete.
+ *
+ * The verifier mitigates this by performing the following steps:
+ *
+ * - Prior to the main verification pass, strongly connected components
+ * (SCCs) are computed over the program's control flow graph,
+ * intraprocedurally.
+ *
+ * - During the main verification pass, `maybe_enter_scc()` checks
+ * whether the current verifier state is entering an SCC. If so, an
+ * instance of a `bpf_scc_visit` object is created, and the state
+ * entering the SCC is recorded as the entry state.
+ *
+ * - This instance is associated not with the SCC itself, but with a
+ * `bpf_scc_callchain`: a tuple consisting of the call sites leading to
+ * the SCC and the SCC id. See `compute_scc_callchain()`.
+ *
+ * - When a verification path encounters a `states_equal(...,
+ * RANGE_WITHIN)` condition, there exists a call chain describing the
+ * current state and a corresponding `bpf_scc_visit` instance. A copy
+ * of the current state is created and added to
+ * `bpf_scc_visit->backedges`.
+ *
+ * - When a verification path terminates, `maybe_exit_scc()` is called
+ * from `update_branch_counts()`. For states with `branches == 0`, it
+ * checks whether the state is the entry state of any `bpf_scc_visit`
+ * instance. If it is, this indicates that all paths originating from
+ * this SCC visit have been explored. `propagate_backedges()` is then
+ * called, which propagates read and precision marks through the
+ * backedges until a fixed point is reached.
+ * (In the earlier example, this would propagate marks from A to B,
+ * from C to A, and then again from A to B.)
+ *
+ * A note on callchains
+ * --------------------
+ *
+ * Consider the following example:
+ *
+ * void foo() { loop { ... SCC#1 ... } }
+ * void main() {
+ * A: foo();
+ * B: ...
+ * C: foo();
+ * }
+ *
+ * Here, there are two distinct callchains leading to SCC#1:
+ * - (A, SCC#1)
+ * - (C, SCC#1)
+ *
+ * Each callchain identifies a separate `bpf_scc_visit` instance that
+ * accumulates backedge states. The `propagate_{liveness,precision}()`
+ * functions traverse the parent state of each backedge state, which
+ * means these parent states must remain valid (i.e., not freed) while
+ * the corresponding `bpf_scc_visit` instance exists.
+ *
+ * Associating `bpf_scc_visit` instances directly with SCCs instead of
+ * callchains would break this invariant:
+ * - States explored during `C: foo()` would contribute backedges to
+ * SCC#1, but SCC#1 would only be exited once the exploration of
+ * `A: foo()` completes.
+ * - By that time, the states explored between `A: foo()` and `C: foo()`
+ * (i.e., `B: ...`) may have already been freed, causing the parent
+ * links for states from `C: foo()` to become invalid.
+ */
+ if (loop) {
+ struct bpf_scc_backedge *backedge;
+
+ backedge = kzalloc(sizeof(*backedge), GFP_KERNEL_ACCOUNT);
+ if (!backedge)
+ return -ENOMEM;
+ err = copy_verifier_state(&backedge->state, cur);
+ backedge->state.equal_state = &sl->state;
+ backedge->state.insn_idx = insn_idx;
+ err = err ?: add_scc_backedge(env, &sl->state, backedge);
+ if (err) {
+ free_verifier_state(&backedge->state, false);
+ kfree(backedge);
+ return err;
+ }
+ }
return 1;
- sl = sl->next;
+ }
+miss:
+ /* when new state is not going to be added do not increase miss count.
+ * Otherwise several loop iterations will remove the state
+ * recorded earlier. The goal of these heuristics is to have
+ * states from some iterations of the loop (some in the beginning
+ * and some at the end) to help pruning.
+ */
+ if (add_new_state)
+ sl->miss_cnt++;
+ /* heuristic to determine whether this state is beneficial
+ * to keep checking from state equivalence point of view.
+ * Higher numbers increase max_states_per_insn and verification time,
+ * but do not meaningfully decrease insn_processed.
+ * 'n' controls how many times state could miss before eviction.
+ * Use bigger 'n' for checkpoints because evicting checkpoint states
+ * too early would hinder iterator convergence.
+ */
+ n = is_force_checkpoint(env, insn_idx) && sl->state.branches > 0 ? 64 : 3;
+ if (sl->miss_cnt > sl->hit_cnt * n + n) {
+ /* the state is unlikely to be useful. Remove it to
+ * speed up verification
+ */
+ sl->in_free_list = true;
+ list_del(&sl->node);
+ list_add(&sl->node, &env->free_list);
+ env->free_list_size++;
+ env->explored_states_size--;
+ maybe_free_verifier_state(env, sl);
+ }
}
- /* there were no equivalent states, remember current one.
- * technically the current state is not proven to be safe yet,
- * but it will either reach bpf_exit (which means it's safe) or
- * it will be rejected. Since there are no loops, we won't be
- * seeing this 'insn_idx' instruction again on the way to bpf_exit
+ if (env->max_states_per_insn < states_cnt)
+ env->max_states_per_insn = states_cnt;
+
+ if (!env->bpf_capable && states_cnt > BPF_COMPLEXITY_LIMIT_STATES)
+ return 0;
+
+ if (!add_new_state)
+ return 0;
+
+ /* There were no equivalent states, remember the current one.
+ * Technically the current state is not proven to be safe yet,
+ * but it will either reach outer most bpf_exit (which means it's safe)
+ * or it will be rejected. When there are no loops the verifier won't be
+ * seeing this tuple (frame[0].callsite, frame[1].callsite, .. insn_idx)
+ * again on the way to bpf_exit.
+ * When looping the sl->state.branches will be > 0 and this state
+ * will not be considered for equivalence until branches == 0.
*/
- new_sl = kmalloc(sizeof(struct verifier_state_list), GFP_USER);
+ new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL_ACCOUNT);
if (!new_sl)
return -ENOMEM;
+ env->total_states++;
+ env->explored_states_size++;
+ update_peak_states(env);
+ env->prev_jmps_processed = env->jmps_processed;
+ env->prev_insn_processed = env->insn_processed;
+
+ /* forget precise markings we inherited, see __mark_chain_precision */
+ if (env->bpf_capable)
+ mark_all_scalars_imprecise(env, cur);
/* add new state to the head of linked list */
- memcpy(&new_sl->state, &env->cur_state, sizeof(env->cur_state));
- new_sl->next = env->explored_states[insn_idx];
- env->explored_states[insn_idx] = new_sl;
+ new = &new_sl->state;
+ err = copy_verifier_state(new, cur);
+ if (err) {
+ free_verifier_state(new, false);
+ kfree(new_sl);
+ return err;
+ }
+ new->insn_idx = insn_idx;
+ verifier_bug_if(new->branches != 1, env,
+ "%s:branches_to_explore=%d insn %d",
+ __func__, new->branches, insn_idx);
+ err = maybe_enter_scc(env, new);
+ if (err) {
+ free_verifier_state(new, false);
+ kfree(new_sl);
+ return err;
+ }
+
+ cur->parent = new;
+ cur->first_insn_idx = insn_idx;
+ cur->dfs_depth = new->dfs_depth + 1;
+ clear_jmp_history(cur);
+ list_add(&new_sl->node, head);
return 0;
}
-static int do_check(struct verifier_env *env)
+/* Return true if it's OK to have the same insn return a different type. */
+static bool reg_type_mismatch_ok(enum bpf_reg_type type)
{
- struct verifier_state *state = &env->cur_state;
- struct bpf_insn *insns = env->prog->insnsi;
- struct reg_state *regs = state->regs;
- int insn_cnt = env->prog->len;
- int insn_idx, prev_insn_idx = 0;
- int insn_processed = 0;
- bool do_print_state = false;
+ switch (base_type(type)) {
+ case PTR_TO_CTX:
+ case PTR_TO_SOCKET:
+ case PTR_TO_SOCK_COMMON:
+ case PTR_TO_TCP_SOCK:
+ case PTR_TO_XDP_SOCK:
+ case PTR_TO_BTF_ID:
+ case PTR_TO_ARENA:
+ return false;
+ default:
+ return true;
+ }
+}
- init_reg_state(regs);
- insn_idx = 0;
- for (;;) {
- struct bpf_insn *insn;
- u8 class;
- int err;
+/* If an instruction was previously used with particular pointer types, then we
+ * need to be careful to avoid cases such as the below, where it may be ok
+ * for one branch accessing the pointer, but not ok for the other branch:
+ *
+ * R1 = sock_ptr
+ * goto X;
+ * ...
+ * R1 = some_other_valid_ptr;
+ * goto X;
+ * ...
+ * R2 = *(u32 *)(R1 + 0);
+ */
+static bool reg_type_mismatch(enum bpf_reg_type src, enum bpf_reg_type prev)
+{
+ return src != prev && (!reg_type_mismatch_ok(src) ||
+ !reg_type_mismatch_ok(prev));
+}
- if (insn_idx >= insn_cnt) {
- verbose("invalid insn idx %d insn_cnt %d\n",
- insn_idx, insn_cnt);
- return -EFAULT;
- }
+static bool is_ptr_to_mem_or_btf_id(enum bpf_reg_type type)
+{
+ switch (base_type(type)) {
+ case PTR_TO_MEM:
+ case PTR_TO_BTF_ID:
+ return true;
+ default:
+ return false;
+ }
+}
- insn = &insns[insn_idx];
- class = BPF_CLASS(insn->code);
+static bool is_ptr_to_mem(enum bpf_reg_type type)
+{
+ return base_type(type) == PTR_TO_MEM;
+}
- if (++insn_processed > 32768) {
- verbose("BPF program is too large. Proccessed %d insn\n",
- insn_processed);
- return -E2BIG;
+static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type type,
+ bool allow_trust_mismatch)
+{
+ enum bpf_reg_type *prev_type = &env->insn_aux_data[env->insn_idx].ptr_type;
+ enum bpf_reg_type merged_type;
+
+ if (*prev_type == NOT_INIT) {
+ /* Saw a valid insn
+ * dst_reg = *(u32 *)(src_reg + off)
+ * save type to validate intersecting paths
+ */
+ *prev_type = type;
+ } else if (reg_type_mismatch(type, *prev_type)) {
+ /* Abuser program is trying to use the same insn
+ * dst_reg = *(u32*) (src_reg + off)
+ * with different pointer types:
+ * src_reg == ctx in one branch and
+ * src_reg == stack|map in some other branch.
+ * Reject it.
+ */
+ if (allow_trust_mismatch &&
+ is_ptr_to_mem_or_btf_id(type) &&
+ is_ptr_to_mem_or_btf_id(*prev_type)) {
+ /*
+ * Have to support a use case when one path through
+ * the program yields TRUSTED pointer while another
+ * is UNTRUSTED. Fallback to UNTRUSTED to generate
+ * BPF_PROBE_MEM/BPF_PROBE_MEMSX.
+ * Same behavior of MEM_RDONLY flag.
+ */
+ if (is_ptr_to_mem(type) || is_ptr_to_mem(*prev_type))
+ merged_type = PTR_TO_MEM;
+ else
+ merged_type = PTR_TO_BTF_ID;
+ if ((type & PTR_UNTRUSTED) || (*prev_type & PTR_UNTRUSTED))
+ merged_type |= PTR_UNTRUSTED;
+ if ((type & MEM_RDONLY) || (*prev_type & MEM_RDONLY))
+ merged_type |= MEM_RDONLY;
+ *prev_type = merged_type;
+ } else {
+ verbose(env, "same insn cannot be used with different pointers\n");
+ return -EINVAL;
}
+ }
- err = is_state_visited(env, insn_idx);
- if (err < 0)
+ return 0;
+}
+
+enum {
+ PROCESS_BPF_EXIT = 1
+};
+
+static int process_bpf_exit_full(struct bpf_verifier_env *env,
+ bool *do_print_state,
+ bool exception_exit)
+{
+ /* We must do check_reference_leak here before
+ * prepare_func_exit to handle the case when
+ * state->curframe > 0, it may be a callback function,
+ * for which reference_state must match caller reference
+ * state when it exits.
+ */
+ int err = check_resource_leak(env, exception_exit,
+ !env->cur_state->curframe,
+ "BPF_EXIT instruction in main prog");
+ if (err)
+ return err;
+
+ /* The side effect of the prepare_func_exit which is
+ * being skipped is that it frees bpf_func_state.
+ * Typically, process_bpf_exit will only be hit with
+ * outermost exit. copy_verifier_state in pop_stack will
+ * handle freeing of any extra bpf_func_state left over
+ * from not processing all nested function exits. We
+ * also skip return code checks as they are not needed
+ * for exceptional exits.
+ */
+ if (exception_exit)
+ return PROCESS_BPF_EXIT;
+
+ if (env->cur_state->curframe) {
+ /* exit from nested function */
+ err = prepare_func_exit(env, &env->insn_idx);
+ if (err)
return err;
- if (err == 1) {
- /* found equivalent state, can prune the search */
- if (log_level) {
- if (do_print_state)
- verbose("\nfrom %d to %d: safe\n",
- prev_insn_idx, insn_idx);
- else
- verbose("%d: safe\n", insn_idx);
- }
- goto process_bpf_exit;
- }
+ *do_print_state = true;
+ return 0;
+ }
- if (log_level && do_print_state) {
- verbose("\nfrom %d to %d:", prev_insn_idx, insn_idx);
- print_verifier_state(env);
- do_print_state = false;
- }
+ err = check_return_code(env, BPF_REG_0, "R0");
+ if (err)
+ return err;
+ return PROCESS_BPF_EXIT;
+}
- if (log_level) {
- verbose("%d: ", insn_idx);
- print_bpf_insn(insn);
- }
+static int indirect_jump_min_max_index(struct bpf_verifier_env *env,
+ int regno,
+ struct bpf_map *map,
+ u32 *pmin_index, u32 *pmax_index)
+{
+ struct bpf_reg_state *reg = reg_state(env, regno);
+ u64 min_index, max_index;
+ const u32 size = 8;
+
+ if (check_add_overflow(reg->umin_value, reg->off, &min_index) ||
+ (min_index > (u64) U32_MAX * size)) {
+ verbose(env, "the sum of R%u umin_value %llu and off %u is too big\n",
+ regno, reg->umin_value, reg->off);
+ return -ERANGE;
+ }
+ if (check_add_overflow(reg->umax_value, reg->off, &max_index) ||
+ (max_index > (u64) U32_MAX * size)) {
+ verbose(env, "the sum of R%u umax_value %llu and off %u is too big\n",
+ regno, reg->umax_value, reg->off);
+ return -ERANGE;
+ }
- if (class == BPF_ALU || class == BPF_ALU64) {
- err = check_alu_op(regs, insn);
- if (err)
- return err;
+ min_index /= size;
+ max_index /= size;
- } else if (class == BPF_LDX) {
- enum bpf_reg_type src_reg_type;
+ if (max_index >= map->max_entries) {
+ verbose(env, "R%u points to outside of jump table: [%llu,%llu] max_entries %u\n",
+ regno, min_index, max_index, map->max_entries);
+ return -EINVAL;
+ }
- /* check for reserved fields is already done */
+ *pmin_index = min_index;
+ *pmax_index = max_index;
+ return 0;
+}
- /* check src operand */
- err = check_reg_arg(regs, insn->src_reg, SRC_OP);
- if (err)
- return err;
+/* gotox *dst_reg */
+static int check_indirect_jump(struct bpf_verifier_env *env, struct bpf_insn *insn)
+{
+ struct bpf_verifier_state *other_branch;
+ struct bpf_reg_state *dst_reg;
+ struct bpf_map *map;
+ u32 min_index, max_index;
+ int err = 0;
+ int n;
+ int i;
+
+ dst_reg = reg_state(env, insn->dst_reg);
+ if (dst_reg->type != PTR_TO_INSN) {
+ verbose(env, "R%d has type %s, expected PTR_TO_INSN\n",
+ insn->dst_reg, reg_type_str(env, dst_reg->type));
+ return -EINVAL;
+ }
+
+ map = dst_reg->map_ptr;
+ if (verifier_bug_if(!map, env, "R%d has an empty map pointer", insn->dst_reg))
+ return -EFAULT;
+
+ if (verifier_bug_if(map->map_type != BPF_MAP_TYPE_INSN_ARRAY, env,
+ "R%d has incorrect map type %d", insn->dst_reg, map->map_type))
+ return -EFAULT;
+
+ err = indirect_jump_min_max_index(env, insn->dst_reg, map, &min_index, &max_index);
+ if (err)
+ return err;
+
+ /* Ensure that the buffer is large enough */
+ if (!env->gotox_tmp_buf || env->gotox_tmp_buf->cnt < max_index - min_index + 1) {
+ env->gotox_tmp_buf = iarray_realloc(env->gotox_tmp_buf,
+ max_index - min_index + 1);
+ if (!env->gotox_tmp_buf)
+ return -ENOMEM;
+ }
+
+ n = copy_insn_array_uniq(map, min_index, max_index, env->gotox_tmp_buf->items);
+ if (n < 0)
+ return n;
+ if (n == 0) {
+ verbose(env, "register R%d doesn't point to any offset in map id=%d\n",
+ insn->dst_reg, map->id);
+ return -EINVAL;
+ }
- err = check_reg_arg(regs, insn->dst_reg, DST_OP_NO_MARK);
+ for (i = 0; i < n - 1; i++) {
+ other_branch = push_stack(env, env->gotox_tmp_buf->items[i],
+ env->insn_idx, env->cur_state->speculative);
+ if (IS_ERR(other_branch))
+ return PTR_ERR(other_branch);
+ }
+ env->insn_idx = env->gotox_tmp_buf->items[n-1];
+ return 0;
+}
+
+static int do_check_insn(struct bpf_verifier_env *env, bool *do_print_state)
+{
+ int err;
+ struct bpf_insn *insn = &env->prog->insnsi[env->insn_idx];
+ u8 class = BPF_CLASS(insn->code);
+
+ if (class == BPF_ALU || class == BPF_ALU64) {
+ err = check_alu_op(env, insn);
+ if (err)
+ return err;
+
+ } else if (class == BPF_LDX) {
+ bool is_ldsx = BPF_MODE(insn->code) == BPF_MEMSX;
+
+ /* Check for reserved fields is already done in
+ * resolve_pseudo_ldimm64().
+ */
+ err = check_load_mem(env, insn, false, is_ldsx, true, "ldx");
+ if (err)
+ return err;
+ } else if (class == BPF_STX) {
+ if (BPF_MODE(insn->code) == BPF_ATOMIC) {
+ err = check_atomic(env, insn);
if (err)
return err;
+ env->insn_idx++;
+ return 0;
+ }
- src_reg_type = regs[insn->src_reg].type;
+ if (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0) {
+ verbose(env, "BPF_STX uses reserved fields\n");
+ return -EINVAL;
+ }
- /* check that memory (src_reg + off) is readable,
- * the state of dst_reg will be updated by this func
- */
- err = check_mem_access(env, insn->src_reg, insn->off,
- BPF_SIZE(insn->code), BPF_READ,
- insn->dst_reg);
+ err = check_store_reg(env, insn, false);
+ if (err)
+ return err;
+ } else if (class == BPF_ST) {
+ enum bpf_reg_type dst_reg_type;
+
+ if (BPF_MODE(insn->code) != BPF_MEM ||
+ insn->src_reg != BPF_REG_0) {
+ verbose(env, "BPF_ST uses reserved fields\n");
+ return -EINVAL;
+ }
+ /* check src operand */
+ err = check_reg_arg(env, insn->dst_reg, SRC_OP);
+ if (err)
+ return err;
+
+ dst_reg_type = cur_regs(env)[insn->dst_reg].type;
+
+ /* check that memory (dst_reg + off) is writeable */
+ err = check_mem_access(env, env->insn_idx, insn->dst_reg,
+ insn->off, BPF_SIZE(insn->code),
+ BPF_WRITE, -1, false, false);
+ if (err)
+ return err;
+
+ err = save_aux_ptr_type(env, dst_reg_type, false);
+ if (err)
+ return err;
+ } else if (class == BPF_JMP || class == BPF_JMP32) {
+ u8 opcode = BPF_OP(insn->code);
+
+ env->jmps_processed++;
+ if (opcode == BPF_CALL) {
+ if (BPF_SRC(insn->code) != BPF_K ||
+ (insn->src_reg != BPF_PSEUDO_KFUNC_CALL &&
+ insn->off != 0) ||
+ (insn->src_reg != BPF_REG_0 &&
+ insn->src_reg != BPF_PSEUDO_CALL &&
+ insn->src_reg != BPF_PSEUDO_KFUNC_CALL) ||
+ insn->dst_reg != BPF_REG_0 || class == BPF_JMP32) {
+ verbose(env, "BPF_CALL uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ if (env->cur_state->active_locks) {
+ if ((insn->src_reg == BPF_REG_0 &&
+ insn->imm != BPF_FUNC_spin_unlock) ||
+ (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
+ (insn->off != 0 || !kfunc_spin_allowed(insn->imm)))) {
+ verbose(env,
+ "function calls are not allowed while holding a lock\n");
+ return -EINVAL;
+ }
+ }
+ if (insn->src_reg == BPF_PSEUDO_CALL) {
+ err = check_func_call(env, insn, &env->insn_idx);
+ } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
+ err = check_kfunc_call(env, insn, &env->insn_idx);
+ if (!err && is_bpf_throw_kfunc(insn))
+ return process_bpf_exit_full(env, do_print_state, true);
+ } else {
+ err = check_helper_call(env, insn, &env->insn_idx);
+ }
if (err)
return err;
- if (BPF_SIZE(insn->code) != BPF_W) {
- insn_idx++;
- continue;
+ mark_reg_scratched(env, BPF_REG_0);
+ } else if (opcode == BPF_JA) {
+ if (BPF_SRC(insn->code) == BPF_X) {
+ if (insn->src_reg != BPF_REG_0 ||
+ insn->imm != 0 || insn->off != 0) {
+ verbose(env, "BPF_JA|BPF_X uses reserved fields\n");
+ return -EINVAL;
+ }
+ return check_indirect_jump(env, insn);
}
- if (insn->imm == 0) {
- /* saw a valid insn
- * dst_reg = *(u32 *)(src_reg + off)
- * use reserved 'imm' field to mark this insn
- */
- insn->imm = src_reg_type;
-
- } else if (src_reg_type != insn->imm &&
- (src_reg_type == PTR_TO_CTX ||
- insn->imm == PTR_TO_CTX)) {
- /* ABuser program is trying to use the same insn
- * dst_reg = *(u32*) (src_reg + off)
- * with different pointer types:
- * src_reg == ctx in one branch and
- * src_reg == stack|map in some other branch.
- * Reject it.
- */
- verbose("same insn cannot be used with different pointers\n");
+ if (BPF_SRC(insn->code) != BPF_K ||
+ insn->src_reg != BPF_REG_0 ||
+ insn->dst_reg != BPF_REG_0 ||
+ (class == BPF_JMP && insn->imm != 0) ||
+ (class == BPF_JMP32 && insn->off != 0)) {
+ verbose(env, "BPF_JA uses reserved fields\n");
return -EINVAL;
}
- } else if (class == BPF_STX) {
- if (BPF_MODE(insn->code) == BPF_XADD) {
- err = check_xadd(env, insn);
- if (err)
- return err;
- insn_idx++;
- continue;
- }
-
- if (BPF_MODE(insn->code) != BPF_MEM ||
- insn->imm != 0) {
- verbose("BPF_STX uses reserved fields\n");
+ if (class == BPF_JMP)
+ env->insn_idx += insn->off + 1;
+ else
+ env->insn_idx += insn->imm + 1;
+ return 0;
+ } else if (opcode == BPF_EXIT) {
+ if (BPF_SRC(insn->code) != BPF_K ||
+ insn->imm != 0 ||
+ insn->src_reg != BPF_REG_0 ||
+ insn->dst_reg != BPF_REG_0 ||
+ class == BPF_JMP32) {
+ verbose(env, "BPF_EXIT uses reserved fields\n");
return -EINVAL;
}
- /* check src1 operand */
- err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ return process_bpf_exit_full(env, do_print_state, false);
+ } else {
+ err = check_cond_jmp_op(env, insn, &env->insn_idx);
if (err)
return err;
- /* check src2 operand */
- err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ }
+ } else if (class == BPF_LD) {
+ u8 mode = BPF_MODE(insn->code);
+
+ if (mode == BPF_ABS || mode == BPF_IND) {
+ err = check_ld_abs(env, insn);
if (err)
return err;
- /* check that memory (dst_reg + off) is writeable */
- err = check_mem_access(env, insn->dst_reg, insn->off,
- BPF_SIZE(insn->code), BPF_WRITE,
- insn->src_reg);
+ } else if (mode == BPF_IMM) {
+ err = check_ld_imm(env, insn);
if (err)
return err;
- } else if (class == BPF_ST) {
- if (BPF_MODE(insn->code) != BPF_MEM ||
- insn->src_reg != BPF_REG_0) {
- verbose("BPF_ST uses reserved fields\n");
- return -EINVAL;
- }
- /* check src operand */
- err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
- if (err)
+ env->insn_idx++;
+ sanitize_mark_insn_seen(env);
+ } else {
+ verbose(env, "invalid BPF_LD mode\n");
+ return -EINVAL;
+ }
+ } else {
+ verbose(env, "unknown insn class %d\n", class);
+ return -EINVAL;
+ }
+
+ env->insn_idx++;
+ return 0;
+}
+
+static int do_check(struct bpf_verifier_env *env)
+{
+ bool pop_log = !(env->log.level & BPF_LOG_LEVEL2);
+ struct bpf_verifier_state *state = env->cur_state;
+ struct bpf_insn *insns = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+ bool do_print_state = false;
+ int prev_insn_idx = -1;
+
+ for (;;) {
+ struct bpf_insn *insn;
+ struct bpf_insn_aux_data *insn_aux;
+ int err, marks_err;
+
+ /* reset current history entry on each new instruction */
+ env->cur_hist_ent = NULL;
+
+ env->prev_insn_idx = prev_insn_idx;
+ if (env->insn_idx >= insn_cnt) {
+ verbose(env, "invalid insn idx %d insn_cnt %d\n",
+ env->insn_idx, insn_cnt);
+ return -EFAULT;
+ }
+
+ insn = &insns[env->insn_idx];
+ insn_aux = &env->insn_aux_data[env->insn_idx];
+
+ if (++env->insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) {
+ verbose(env,
+ "BPF program is too large. Processed %d insn\n",
+ env->insn_processed);
+ return -E2BIG;
+ }
+
+ state->last_insn_idx = env->prev_insn_idx;
+ state->insn_idx = env->insn_idx;
+
+ if (is_prune_point(env, env->insn_idx)) {
+ err = is_state_visited(env, env->insn_idx);
+ if (err < 0)
return err;
+ if (err == 1) {
+ /* found equivalent state, can prune the search */
+ if (env->log.level & BPF_LOG_LEVEL) {
+ if (do_print_state)
+ verbose(env, "\nfrom %d to %d%s: safe\n",
+ env->prev_insn_idx, env->insn_idx,
+ env->cur_state->speculative ?
+ " (speculative execution)" : "");
+ else
+ verbose(env, "%d: safe\n", env->insn_idx);
+ }
+ goto process_bpf_exit;
+ }
+ }
- /* check that memory (dst_reg + off) is writeable */
- err = check_mem_access(env, insn->dst_reg, insn->off,
- BPF_SIZE(insn->code), BPF_WRITE,
- -1);
+ if (is_jmp_point(env, env->insn_idx)) {
+ err = push_jmp_history(env, state, 0, 0);
if (err)
return err;
+ }
- } else if (class == BPF_JMP) {
- u8 opcode = BPF_OP(insn->code);
+ if (signal_pending(current))
+ return -EAGAIN;
- if (opcode == BPF_CALL) {
- if (BPF_SRC(insn->code) != BPF_K ||
- insn->off != 0 ||
- insn->src_reg != BPF_REG_0 ||
- insn->dst_reg != BPF_REG_0) {
- verbose("BPF_CALL uses reserved fields\n");
- return -EINVAL;
- }
+ if (need_resched())
+ cond_resched();
- err = check_call(env, insn->imm);
- if (err)
- return err;
+ if (env->log.level & BPF_LOG_LEVEL2 && do_print_state) {
+ verbose(env, "\nfrom %d to %d%s:",
+ env->prev_insn_idx, env->insn_idx,
+ env->cur_state->speculative ?
+ " (speculative execution)" : "");
+ print_verifier_state(env, state, state->curframe, true);
+ do_print_state = false;
+ }
- } else if (opcode == BPF_JA) {
- if (BPF_SRC(insn->code) != BPF_K ||
- insn->imm != 0 ||
- insn->src_reg != BPF_REG_0 ||
- insn->dst_reg != BPF_REG_0) {
- verbose("BPF_JA uses reserved fields\n");
- return -EINVAL;
- }
+ if (env->log.level & BPF_LOG_LEVEL) {
+ if (verifier_state_scratched(env))
+ print_insn_state(env, state, state->curframe);
- insn_idx += insn->off + 1;
- continue;
+ verbose_linfo(env, env->insn_idx, "; ");
+ env->prev_log_pos = env->log.end_pos;
+ verbose(env, "%d: ", env->insn_idx);
+ verbose_insn(env, insn);
+ env->prev_insn_print_pos = env->log.end_pos - env->prev_log_pos;
+ env->prev_log_pos = env->log.end_pos;
+ }
- } else if (opcode == BPF_EXIT) {
- if (BPF_SRC(insn->code) != BPF_K ||
- insn->imm != 0 ||
- insn->src_reg != BPF_REG_0 ||
- insn->dst_reg != BPF_REG_0) {
- verbose("BPF_EXIT uses reserved fields\n");
- return -EINVAL;
- }
+ if (bpf_prog_is_offloaded(env->prog->aux)) {
+ err = bpf_prog_offload_verify_insn(env, env->insn_idx,
+ env->prev_insn_idx);
+ if (err)
+ return err;
+ }
- /* eBPF calling convetion is such that R0 is used
- * to return the value from eBPF program.
- * Make sure that it's readable at this time
- * of bpf_exit, which means that program wrote
- * something into it earlier
- */
- err = check_reg_arg(regs, BPF_REG_0, SRC_OP);
- if (err)
- return err;
+ sanitize_mark_insn_seen(env);
+ prev_insn_idx = env->insn_idx;
+
+ /* Reduce verification complexity by stopping speculative path
+ * verification when a nospec is encountered.
+ */
+ if (state->speculative && insn_aux->nospec)
+ goto process_bpf_exit;
+ err = bpf_reset_stack_write_marks(env, env->insn_idx);
+ if (err)
+ return err;
+ err = do_check_insn(env, &do_print_state);
+ if (err >= 0 || error_recoverable_with_nospec(err)) {
+ marks_err = bpf_commit_stack_write_marks(env);
+ if (marks_err)
+ return marks_err;
+ }
+ if (error_recoverable_with_nospec(err) && state->speculative) {
+ /* Prevent this speculative path from ever reaching the
+ * insn that would have been unsafe to execute.
+ */
+ insn_aux->nospec = true;
+ /* If it was an ADD/SUB insn, potentially remove any
+ * markings for alu sanitization.
+ */
+ insn_aux->alu_state = 0;
+ goto process_bpf_exit;
+ } else if (err < 0) {
+ return err;
+ } else if (err == PROCESS_BPF_EXIT) {
+ goto process_bpf_exit;
+ }
+ WARN_ON_ONCE(err);
+
+ if (state->speculative && insn_aux->nospec_result) {
+ /* If we are on a path that performed a jump-op, this
+ * may skip a nospec patched-in after the jump. This can
+ * currently never happen because nospec_result is only
+ * used for the write-ops
+ * `*(size*)(dst_reg+off)=src_reg|imm32` which must
+ * never skip the following insn. Still, add a warning
+ * to document this in case nospec_result is used
+ * elsewhere in the future.
+ *
+ * All non-branch instructions have a single
+ * fall-through edge. For these, nospec_result should
+ * already work.
+ */
+ if (verifier_bug_if(BPF_CLASS(insn->code) == BPF_JMP ||
+ BPF_CLASS(insn->code) == BPF_JMP32, env,
+ "speculation barrier after jump instruction may not have the desired effect"))
+ return -EFAULT;
process_bpf_exit:
- insn_idx = pop_stack(env, &prev_insn_idx);
- if (insn_idx < 0) {
- break;
- } else {
- do_print_state = true;
- continue;
- }
- } else {
- err = check_cond_jmp_op(env, insn, &insn_idx);
- if (err)
+ mark_verifier_state_scratched(env);
+ err = update_branch_counts(env, env->cur_state);
+ if (err)
+ return err;
+ err = bpf_update_live_stack(env);
+ if (err)
+ return err;
+ err = pop_stack(env, &prev_insn_idx, &env->insn_idx,
+ pop_log);
+ if (err < 0) {
+ if (err != -ENOENT)
return err;
+ break;
+ } else {
+ do_print_state = true;
+ continue;
}
- } else if (class == BPF_LD) {
- u8 mode = BPF_MODE(insn->code);
+ }
+ }
- if (mode == BPF_ABS || mode == BPF_IND) {
- err = check_ld_abs(env, insn);
- if (err)
- return err;
+ return 0;
+}
- } else if (mode == BPF_IMM) {
- err = check_ld_imm(env, insn);
- if (err)
- return err;
+static int find_btf_percpu_datasec(struct btf *btf)
+{
+ const struct btf_type *t;
+ const char *tname;
+ int i, n;
+
+ /*
+ * Both vmlinux and module each have their own ".data..percpu"
+ * DATASECs in BTF. So for module's case, we need to skip vmlinux BTF
+ * types to look at only module's own BTF types.
+ */
+ n = btf_nr_types(btf);
+ if (btf_is_module(btf))
+ i = btf_nr_types(btf_vmlinux);
+ else
+ i = 1;
- insn_idx++;
- } else {
- verbose("invalid BPF_LD mode\n");
- return -EINVAL;
+ for(; i < n; i++) {
+ t = btf_type_by_id(btf, i);
+ if (BTF_INFO_KIND(t->info) != BTF_KIND_DATASEC)
+ continue;
+
+ tname = btf_name_by_offset(btf, t->name_off);
+ if (!strcmp(tname, ".data..percpu"))
+ return i;
+ }
+
+ return -ENOENT;
+}
+
+/*
+ * Add btf to the used_btfs array and return the index. (If the btf was
+ * already added, then just return the index.) Upon successful insertion
+ * increase btf refcnt, and, if present, also refcount the corresponding
+ * kernel module.
+ */
+static int __add_used_btf(struct bpf_verifier_env *env, struct btf *btf)
+{
+ struct btf_mod_pair *btf_mod;
+ int i;
+
+ /* check whether we recorded this BTF (and maybe module) already */
+ for (i = 0; i < env->used_btf_cnt; i++)
+ if (env->used_btfs[i].btf == btf)
+ return i;
+
+ if (env->used_btf_cnt >= MAX_USED_BTFS) {
+ verbose(env, "The total number of btfs per program has reached the limit of %u\n",
+ MAX_USED_BTFS);
+ return -E2BIG;
+ }
+
+ btf_get(btf);
+
+ btf_mod = &env->used_btfs[env->used_btf_cnt];
+ btf_mod->btf = btf;
+ btf_mod->module = NULL;
+
+ /* if we reference variables from kernel module, bump its refcount */
+ if (btf_is_module(btf)) {
+ btf_mod->module = btf_try_get_module(btf);
+ if (!btf_mod->module) {
+ btf_put(btf);
+ return -ENXIO;
+ }
+ }
+
+ return env->used_btf_cnt++;
+}
+
+/* replace pseudo btf_id with kernel symbol address */
+static int __check_pseudo_btf_id(struct bpf_verifier_env *env,
+ struct bpf_insn *insn,
+ struct bpf_insn_aux_data *aux,
+ struct btf *btf)
+{
+ const struct btf_var_secinfo *vsi;
+ const struct btf_type *datasec;
+ const struct btf_type *t;
+ const char *sym_name;
+ bool percpu = false;
+ u32 type, id = insn->imm;
+ s32 datasec_id;
+ u64 addr;
+ int i;
+
+ t = btf_type_by_id(btf, id);
+ if (!t) {
+ verbose(env, "ldimm64 insn specifies invalid btf_id %d.\n", id);
+ return -ENOENT;
+ }
+
+ if (!btf_type_is_var(t) && !btf_type_is_func(t)) {
+ verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR or KIND_FUNC\n", id);
+ return -EINVAL;
+ }
+
+ sym_name = btf_name_by_offset(btf, t->name_off);
+ addr = kallsyms_lookup_name(sym_name);
+ if (!addr) {
+ verbose(env, "ldimm64 failed to find the address for kernel symbol '%s'.\n",
+ sym_name);
+ return -ENOENT;
+ }
+ insn[0].imm = (u32)addr;
+ insn[1].imm = addr >> 32;
+
+ if (btf_type_is_func(t)) {
+ aux->btf_var.reg_type = PTR_TO_MEM | MEM_RDONLY;
+ aux->btf_var.mem_size = 0;
+ return 0;
+ }
+
+ datasec_id = find_btf_percpu_datasec(btf);
+ if (datasec_id > 0) {
+ datasec = btf_type_by_id(btf, datasec_id);
+ for_each_vsi(i, datasec, vsi) {
+ if (vsi->type == id) {
+ percpu = true;
+ break;
}
- } else {
- verbose("unknown insn class %d\n", class);
+ }
+ }
+
+ type = t->type;
+ t = btf_type_skip_modifiers(btf, type, NULL);
+ if (percpu) {
+ aux->btf_var.reg_type = PTR_TO_BTF_ID | MEM_PERCPU;
+ aux->btf_var.btf = btf;
+ aux->btf_var.btf_id = type;
+ } else if (!btf_type_is_struct(t)) {
+ const struct btf_type *ret;
+ const char *tname;
+ u32 tsize;
+
+ /* resolve the type size of ksym. */
+ ret = btf_resolve_size(btf, t, &tsize);
+ if (IS_ERR(ret)) {
+ tname = btf_name_by_offset(btf, t->name_off);
+ verbose(env, "ldimm64 unable to resolve the size of type '%s': %ld\n",
+ tname, PTR_ERR(ret));
+ return -EINVAL;
+ }
+ aux->btf_var.reg_type = PTR_TO_MEM | MEM_RDONLY;
+ aux->btf_var.mem_size = tsize;
+ } else {
+ aux->btf_var.reg_type = PTR_TO_BTF_ID;
+ aux->btf_var.btf = btf;
+ aux->btf_var.btf_id = type;
+ }
+
+ return 0;
+}
+
+static int check_pseudo_btf_id(struct bpf_verifier_env *env,
+ struct bpf_insn *insn,
+ struct bpf_insn_aux_data *aux)
+{
+ struct btf *btf;
+ int btf_fd;
+ int err;
+
+ btf_fd = insn[1].imm;
+ if (btf_fd) {
+ CLASS(fd, f)(btf_fd);
+
+ btf = __btf_get_by_fd(f);
+ if (IS_ERR(btf)) {
+ verbose(env, "invalid module BTF object FD specified.\n");
+ return -EINVAL;
+ }
+ } else {
+ if (!btf_vmlinux) {
+ verbose(env, "kernel is missing BTF, make sure CONFIG_DEBUG_INFO_BTF=y is specified in Kconfig.\n");
+ return -EINVAL;
+ }
+ btf = btf_vmlinux;
+ }
+
+ err = __check_pseudo_btf_id(env, insn, aux, btf);
+ if (err)
+ return err;
+
+ err = __add_used_btf(env, btf);
+ if (err < 0)
+ return err;
+ return 0;
+}
+
+static bool is_tracing_prog_type(enum bpf_prog_type type)
+{
+ switch (type) {
+ case BPF_PROG_TYPE_KPROBE:
+ case BPF_PROG_TYPE_TRACEPOINT:
+ case BPF_PROG_TYPE_PERF_EVENT:
+ case BPF_PROG_TYPE_RAW_TRACEPOINT:
+ case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static bool bpf_map_is_cgroup_storage(struct bpf_map *map)
+{
+ return (map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE ||
+ map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE);
+}
+
+static int check_map_prog_compatibility(struct bpf_verifier_env *env,
+ struct bpf_map *map,
+ struct bpf_prog *prog)
+
+{
+ enum bpf_prog_type prog_type = resolve_prog_type(prog);
+
+ if (map->excl_prog_sha &&
+ memcmp(map->excl_prog_sha, prog->digest, SHA256_DIGEST_SIZE)) {
+ verbose(env, "program's hash doesn't match map's excl_prog_hash\n");
+ return -EACCES;
+ }
+
+ if (btf_record_has_field(map->record, BPF_LIST_HEAD) ||
+ btf_record_has_field(map->record, BPF_RB_ROOT)) {
+ if (is_tracing_prog_type(prog_type)) {
+ verbose(env, "tracing progs cannot use bpf_{list_head,rb_root} yet\n");
+ return -EINVAL;
+ }
+ }
+
+ if (btf_record_has_field(map->record, BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK)) {
+ if (prog_type == BPF_PROG_TYPE_SOCKET_FILTER) {
+ verbose(env, "socket filter progs cannot use bpf_spin_lock yet\n");
+ return -EINVAL;
+ }
+
+ if (is_tracing_prog_type(prog_type)) {
+ verbose(env, "tracing progs cannot use bpf_spin_lock yet\n");
+ return -EINVAL;
+ }
+ }
+
+ if (btf_record_has_field(map->record, BPF_TIMER)) {
+ if (is_tracing_prog_type(prog_type)) {
+ verbose(env, "tracing progs cannot use bpf_timer yet\n");
+ return -EINVAL;
+ }
+ }
+
+ if (btf_record_has_field(map->record, BPF_WORKQUEUE)) {
+ if (is_tracing_prog_type(prog_type)) {
+ verbose(env, "tracing progs cannot use bpf_wq yet\n");
return -EINVAL;
}
+ }
+
+ if ((bpf_prog_is_offloaded(prog->aux) || bpf_map_is_offloaded(map)) &&
+ !bpf_offload_prog_map_match(prog, map)) {
+ verbose(env, "offload device mismatch between prog and map\n");
+ return -EINVAL;
+ }
+
+ if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
+ verbose(env, "bpf_struct_ops map cannot be used in prog\n");
+ return -EINVAL;
+ }
- insn_idx++;
+ if (prog->sleepable)
+ switch (map->map_type) {
+ case BPF_MAP_TYPE_HASH:
+ case BPF_MAP_TYPE_LRU_HASH:
+ case BPF_MAP_TYPE_ARRAY:
+ case BPF_MAP_TYPE_PERCPU_HASH:
+ case BPF_MAP_TYPE_PERCPU_ARRAY:
+ case BPF_MAP_TYPE_LRU_PERCPU_HASH:
+ case BPF_MAP_TYPE_ARRAY_OF_MAPS:
+ case BPF_MAP_TYPE_HASH_OF_MAPS:
+ case BPF_MAP_TYPE_RINGBUF:
+ case BPF_MAP_TYPE_USER_RINGBUF:
+ case BPF_MAP_TYPE_INODE_STORAGE:
+ case BPF_MAP_TYPE_SK_STORAGE:
+ case BPF_MAP_TYPE_TASK_STORAGE:
+ case BPF_MAP_TYPE_CGRP_STORAGE:
+ case BPF_MAP_TYPE_QUEUE:
+ case BPF_MAP_TYPE_STACK:
+ case BPF_MAP_TYPE_ARENA:
+ case BPF_MAP_TYPE_INSN_ARRAY:
+ break;
+ default:
+ verbose(env,
+ "Sleepable programs can only use array, hash, ringbuf and local storage maps\n");
+ return -EINVAL;
+ }
+
+ if (bpf_map_is_cgroup_storage(map) &&
+ bpf_cgroup_storage_assign(env->prog->aux, map)) {
+ verbose(env, "only one cgroup storage of each type is allowed\n");
+ return -EBUSY;
+ }
+
+ if (map->map_type == BPF_MAP_TYPE_ARENA) {
+ if (env->prog->aux->arena) {
+ verbose(env, "Only one arena per program\n");
+ return -EBUSY;
+ }
+ if (!env->allow_ptr_leaks || !env->bpf_capable) {
+ verbose(env, "CAP_BPF and CAP_PERFMON are required to use arena\n");
+ return -EPERM;
+ }
+ if (!env->prog->jit_requested) {
+ verbose(env, "JIT is required to use arena\n");
+ return -EOPNOTSUPP;
+ }
+ if (!bpf_jit_supports_arena()) {
+ verbose(env, "JIT doesn't support arena\n");
+ return -EOPNOTSUPP;
+ }
+ env->prog->aux->arena = (void *)map;
+ if (!bpf_arena_get_user_vm_start(env->prog->aux->arena)) {
+ verbose(env, "arena's user address must be set via map_extra or mmap()\n");
+ return -EINVAL;
+ }
}
return 0;
}
-/* look for pseudo eBPF instructions that access map FDs and
- * replace them with actual map pointers
+static int __add_used_map(struct bpf_verifier_env *env, struct bpf_map *map)
+{
+ int i, err;
+
+ /* check whether we recorded this map already */
+ for (i = 0; i < env->used_map_cnt; i++)
+ if (env->used_maps[i] == map)
+ return i;
+
+ if (env->used_map_cnt >= MAX_USED_MAPS) {
+ verbose(env, "The total number of maps per program has reached the limit of %u\n",
+ MAX_USED_MAPS);
+ return -E2BIG;
+ }
+
+ err = check_map_prog_compatibility(env, map, env->prog);
+ if (err)
+ return err;
+
+ if (env->prog->sleepable)
+ atomic64_inc(&map->sleepable_refcnt);
+
+ /* hold the map. If the program is rejected by verifier,
+ * the map will be released by release_maps() or it
+ * will be used by the valid program until it's unloaded
+ * and all maps are released in bpf_free_used_maps()
+ */
+ bpf_map_inc(map);
+
+ env->used_maps[env->used_map_cnt++] = map;
+
+ if (map->map_type == BPF_MAP_TYPE_INSN_ARRAY) {
+ err = bpf_insn_array_init(map, env->prog);
+ if (err) {
+ verbose(env, "Failed to properly initialize insn array\n");
+ return err;
+ }
+ env->insn_array_maps[env->insn_array_map_cnt++] = map;
+ }
+
+ return env->used_map_cnt - 1;
+}
+
+/* Add map behind fd to used maps list, if it's not already there, and return
+ * its index.
+ * Returns <0 on error, or >= 0 index, on success.
+ */
+static int add_used_map(struct bpf_verifier_env *env, int fd)
+{
+ struct bpf_map *map;
+ CLASS(fd, f)(fd);
+
+ map = __bpf_map_get(f);
+ if (IS_ERR(map)) {
+ verbose(env, "fd %d is not pointing to valid bpf_map\n", fd);
+ return PTR_ERR(map);
+ }
+
+ return __add_used_map(env, map);
+}
+
+/* find and rewrite pseudo imm in ld_imm64 instructions:
+ *
+ * 1. if it accesses map FD, replace it with actual map pointer.
+ * 2. if it accesses btf_id of a VAR, replace it with pointer to the var.
+ *
+ * NOTE: btf_vmlinux is required for converting pseudo btf_id.
*/
-static int replace_map_fd_with_map_ptr(struct verifier_env *env)
+static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
{
struct bpf_insn *insn = env->prog->insnsi;
int insn_cnt = env->prog->len;
- int i, j;
+ int i, err;
+
+ err = bpf_prog_calc_tag(env->prog);
+ if (err)
+ return err;
for (i = 0; i < insn_cnt; i++, insn++) {
if (BPF_CLASS(insn->code) == BPF_LDX &&
- (BPF_MODE(insn->code) != BPF_MEM ||
- insn->imm != 0)) {
- verbose("BPF_LDX uses reserved fields\n");
+ ((BPF_MODE(insn->code) != BPF_MEM && BPF_MODE(insn->code) != BPF_MEMSX) ||
+ insn->imm != 0)) {
+ verbose(env, "BPF_LDX uses reserved fields\n");
return -EINVAL;
}
if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) {
+ struct bpf_insn_aux_data *aux;
struct bpf_map *map;
- struct fd f;
+ int map_idx;
+ u64 addr;
+ u32 fd;
if (i == insn_cnt - 1 || insn[1].code != 0 ||
insn[1].dst_reg != 0 || insn[1].src_reg != 0 ||
insn[1].off != 0) {
- verbose("invalid bpf_ld_imm64 insn\n");
+ verbose(env, "invalid bpf_ld_imm64 insn\n");
return -EINVAL;
}
- if (insn->src_reg == 0)
+ if (insn[0].src_reg == 0)
/* valid generic load 64-bit imm */
goto next_insn;
- if (insn->src_reg != BPF_PSEUDO_MAP_FD) {
- verbose("unrecognized bpf_ld_imm64 insn\n");
- return -EINVAL;
+ if (insn[0].src_reg == BPF_PSEUDO_BTF_ID) {
+ aux = &env->insn_aux_data[i];
+ err = check_pseudo_btf_id(env, insn, aux);
+ if (err)
+ return err;
+ goto next_insn;
+ }
+
+ if (insn[0].src_reg == BPF_PSEUDO_FUNC) {
+ aux = &env->insn_aux_data[i];
+ aux->ptr_type = PTR_TO_FUNC;
+ goto next_insn;
}
- f = fdget(insn->imm);
+ /* In final convert_pseudo_ld_imm64() step, this is
+ * converted into regular 64-bit imm load insn.
+ */
+ switch (insn[0].src_reg) {
+ case BPF_PSEUDO_MAP_VALUE:
+ case BPF_PSEUDO_MAP_IDX_VALUE:
+ break;
+ case BPF_PSEUDO_MAP_FD:
+ case BPF_PSEUDO_MAP_IDX:
+ if (insn[1].imm == 0)
+ break;
+ fallthrough;
+ default:
+ verbose(env, "unrecognized bpf_ld_imm64 insn\n");
+ return -EINVAL;
+ }
- map = bpf_map_get(f);
- if (IS_ERR(map)) {
- verbose("fd %d is not pointing to valid bpf_map\n",
- insn->imm);
- fdput(f);
- return PTR_ERR(map);
+ switch (insn[0].src_reg) {
+ case BPF_PSEUDO_MAP_IDX_VALUE:
+ case BPF_PSEUDO_MAP_IDX:
+ if (bpfptr_is_null(env->fd_array)) {
+ verbose(env, "fd_idx without fd_array is invalid\n");
+ return -EPROTO;
+ }
+ if (copy_from_bpfptr_offset(&fd, env->fd_array,
+ insn[0].imm * sizeof(fd),
+ sizeof(fd)))
+ return -EFAULT;
+ break;
+ default:
+ fd = insn[0].imm;
+ break;
}
- /* store map pointer inside BPF_LD_IMM64 instruction */
- insn[0].imm = (u32) (unsigned long) map;
- insn[1].imm = ((u64) (unsigned long) map) >> 32;
+ map_idx = add_used_map(env, fd);
+ if (map_idx < 0)
+ return map_idx;
+ map = env->used_maps[map_idx];
- /* check whether we recorded this map already */
- for (j = 0; j < env->used_map_cnt; j++)
- if (env->used_maps[j] == map) {
- fdput(f);
- goto next_insn;
+ aux = &env->insn_aux_data[i];
+ aux->map_index = map_idx;
+
+ if (insn[0].src_reg == BPF_PSEUDO_MAP_FD ||
+ insn[0].src_reg == BPF_PSEUDO_MAP_IDX) {
+ addr = (unsigned long)map;
+ } else {
+ u32 off = insn[1].imm;
+
+ if (off >= BPF_MAX_VAR_OFF) {
+ verbose(env, "direct value offset of %u is not allowed\n", off);
+ return -EINVAL;
}
- if (env->used_map_cnt >= MAX_USED_MAPS) {
- fdput(f);
- return -E2BIG;
- }
+ if (!map->ops->map_direct_value_addr) {
+ verbose(env, "no direct value access support for this map type\n");
+ return -EINVAL;
+ }
+
+ err = map->ops->map_direct_value_addr(map, &addr, off);
+ if (err) {
+ verbose(env, "invalid access to map value pointer, value_size=%u off=%u\n",
+ map->value_size, off);
+ return err;
+ }
- /* remember this map */
- env->used_maps[env->used_map_cnt++] = map;
+ aux->map_off = off;
+ addr += off;
+ }
- /* hold the map. If the program is rejected by verifier,
- * the map will be released by release_maps() or it
- * will be used by the valid program until it's unloaded
- * and all maps are released in free_bpf_prog_info()
- */
- atomic_inc(&map->refcnt);
+ insn[0].imm = (u32)addr;
+ insn[1].imm = addr >> 32;
- fdput(f);
next_insn:
insn++;
i++;
+ continue;
+ }
+
+ /* Basic sanity check before we invest more work here. */
+ if (!bpf_opcode_in_insntable(insn->code)) {
+ verbose(env, "unknown opcode %02x\n", insn->code);
+ return -EINVAL;
}
}
@@ -1899,248 +21181,4218 @@ next_insn:
}
/* drop refcnt of maps used by the rejected program */
-static void release_maps(struct verifier_env *env)
+static void release_maps(struct bpf_verifier_env *env)
{
- int i;
+ __bpf_free_used_maps(env->prog->aux, env->used_maps,
+ env->used_map_cnt);
+}
- for (i = 0; i < env->used_map_cnt; i++)
- bpf_map_put(env->used_maps[i]);
+/* drop refcnt of maps used by the rejected program */
+static void release_btfs(struct bpf_verifier_env *env)
+{
+ __bpf_free_used_btfs(env->used_btfs, env->used_btf_cnt);
}
/* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */
-static void convert_pseudo_ld_imm64(struct verifier_env *env)
+static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env)
{
struct bpf_insn *insn = env->prog->insnsi;
int insn_cnt = env->prog->len;
int i;
- for (i = 0; i < insn_cnt; i++, insn++)
- if (insn->code == (BPF_LD | BPF_IMM | BPF_DW))
- insn->src_reg = 0;
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ if (insn->code != (BPF_LD | BPF_IMM | BPF_DW))
+ continue;
+ if (insn->src_reg == BPF_PSEUDO_FUNC)
+ continue;
+ insn->src_reg = 0;
+ }
+}
+
+/* single env->prog->insni[off] instruction was replaced with the range
+ * insni[off, off + cnt). Adjust corresponding insn_aux_data by copying
+ * [0, off) and [off, end) to new locations, so the patched range stays zero
+ */
+static void adjust_insn_aux_data(struct bpf_verifier_env *env,
+ struct bpf_prog *new_prog, u32 off, u32 cnt)
+{
+ struct bpf_insn_aux_data *data = env->insn_aux_data;
+ struct bpf_insn *insn = new_prog->insnsi;
+ u32 old_seen = data[off].seen;
+ u32 prog_len;
+ int i;
+
+ /* aux info at OFF always needs adjustment, no matter fast path
+ * (cnt == 1) is taken or not. There is no guarantee INSN at OFF is the
+ * original insn at old prog.
+ */
+ data[off].zext_dst = insn_has_def32(insn + off + cnt - 1);
+
+ if (cnt == 1)
+ return;
+ prog_len = new_prog->len;
+
+ memmove(data + off + cnt - 1, data + off,
+ sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1));
+ memset(data + off, 0, sizeof(struct bpf_insn_aux_data) * (cnt - 1));
+ for (i = off; i < off + cnt - 1; i++) {
+ /* Expand insni[off]'s seen count to the patched range. */
+ data[i].seen = old_seen;
+ data[i].zext_dst = insn_has_def32(insn + i);
+ }
}
-static void adjust_branches(struct bpf_prog *prog, int pos, int delta)
+static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len)
+{
+ int i;
+
+ if (len == 1)
+ return;
+ /* NOTE: fake 'exit' subprog should be updated as well. */
+ for (i = 0; i <= env->subprog_cnt; i++) {
+ if (env->subprog_info[i].start <= off)
+ continue;
+ env->subprog_info[i].start += len - 1;
+ }
+}
+
+static void release_insn_arrays(struct bpf_verifier_env *env)
+{
+ int i;
+
+ for (i = 0; i < env->insn_array_map_cnt; i++)
+ bpf_insn_array_release(env->insn_array_maps[i]);
+}
+
+static void adjust_insn_arrays(struct bpf_verifier_env *env, u32 off, u32 len)
+{
+ int i;
+
+ if (len == 1)
+ return;
+
+ for (i = 0; i < env->insn_array_map_cnt; i++)
+ bpf_insn_array_adjust(env->insn_array_maps[i], off, len);
+}
+
+static void adjust_insn_arrays_after_remove(struct bpf_verifier_env *env, u32 off, u32 len)
{
- struct bpf_insn *insn = prog->insnsi;
- int insn_cnt = prog->len;
int i;
+ for (i = 0; i < env->insn_array_map_cnt; i++)
+ bpf_insn_array_adjust_after_remove(env->insn_array_maps[i], off, len);
+}
+
+static void adjust_poke_descs(struct bpf_prog *prog, u32 off, u32 len)
+{
+ struct bpf_jit_poke_descriptor *tab = prog->aux->poke_tab;
+ int i, sz = prog->aux->size_poke_tab;
+ struct bpf_jit_poke_descriptor *desc;
+
+ for (i = 0; i < sz; i++) {
+ desc = &tab[i];
+ if (desc->insn_idx <= off)
+ continue;
+ desc->insn_idx += len - 1;
+ }
+}
+
+static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off,
+ const struct bpf_insn *patch, u32 len)
+{
+ struct bpf_prog *new_prog;
+ struct bpf_insn_aux_data *new_data = NULL;
+
+ if (len > 1) {
+ new_data = vrealloc(env->insn_aux_data,
+ array_size(env->prog->len + len - 1,
+ sizeof(struct bpf_insn_aux_data)),
+ GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!new_data)
+ return NULL;
+
+ env->insn_aux_data = new_data;
+ }
+
+ new_prog = bpf_patch_insn_single(env->prog, off, patch, len);
+ if (IS_ERR(new_prog)) {
+ if (PTR_ERR(new_prog) == -ERANGE)
+ verbose(env,
+ "insn %d cannot be patched due to 16-bit range\n",
+ env->insn_aux_data[off].orig_idx);
+ return NULL;
+ }
+ adjust_insn_aux_data(env, new_prog, off, len);
+ adjust_subprog_starts(env, off, len);
+ adjust_insn_arrays(env, off, len);
+ adjust_poke_descs(new_prog, off, len);
+ return new_prog;
+}
+
+/*
+ * For all jmp insns in a given 'prog' that point to 'tgt_idx' insn adjust the
+ * jump offset by 'delta'.
+ */
+static int adjust_jmp_off(struct bpf_prog *prog, u32 tgt_idx, u32 delta)
+{
+ struct bpf_insn *insn = prog->insnsi;
+ u32 insn_cnt = prog->len, i;
+ s32 imm;
+ s16 off;
+
for (i = 0; i < insn_cnt; i++, insn++) {
- if (BPF_CLASS(insn->code) != BPF_JMP ||
- BPF_OP(insn->code) == BPF_CALL ||
- BPF_OP(insn->code) == BPF_EXIT)
+ u8 code = insn->code;
+
+ if (tgt_idx <= i && i < tgt_idx + delta)
+ continue;
+
+ if ((BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32) ||
+ BPF_OP(code) == BPF_CALL || BPF_OP(code) == BPF_EXIT)
continue;
- /* adjust offset of jmps if necessary */
- if (i < pos && i + insn->off + 1 > pos)
- insn->off += delta;
- else if (i > pos && i + insn->off + 1 < pos)
- insn->off -= delta;
+ if (insn->code == (BPF_JMP32 | BPF_JA)) {
+ if (i + 1 + insn->imm != tgt_idx)
+ continue;
+ if (check_add_overflow(insn->imm, delta, &imm))
+ return -ERANGE;
+ insn->imm = imm;
+ } else {
+ if (i + 1 + insn->off != tgt_idx)
+ continue;
+ if (check_add_overflow(insn->off, delta, &off))
+ return -ERANGE;
+ insn->off = off;
+ }
+ }
+ return 0;
+}
+
+static int adjust_subprog_starts_after_remove(struct bpf_verifier_env *env,
+ u32 off, u32 cnt)
+{
+ int i, j;
+
+ /* find first prog starting at or after off (first to remove) */
+ for (i = 0; i < env->subprog_cnt; i++)
+ if (env->subprog_info[i].start >= off)
+ break;
+ /* find first prog starting at or after off + cnt (first to stay) */
+ for (j = i; j < env->subprog_cnt; j++)
+ if (env->subprog_info[j].start >= off + cnt)
+ break;
+ /* if j doesn't start exactly at off + cnt, we are just removing
+ * the front of previous prog
+ */
+ if (env->subprog_info[j].start != off + cnt)
+ j--;
+
+ if (j > i) {
+ struct bpf_prog_aux *aux = env->prog->aux;
+ int move;
+
+ /* move fake 'exit' subprog as well */
+ move = env->subprog_cnt + 1 - j;
+
+ memmove(env->subprog_info + i,
+ env->subprog_info + j,
+ sizeof(*env->subprog_info) * move);
+ env->subprog_cnt -= j - i;
+
+ /* remove func_info */
+ if (aux->func_info) {
+ move = aux->func_info_cnt - j;
+
+ memmove(aux->func_info + i,
+ aux->func_info + j,
+ sizeof(*aux->func_info) * move);
+ aux->func_info_cnt -= j - i;
+ /* func_info->insn_off is set after all code rewrites,
+ * in adjust_btf_func() - no need to adjust
+ */
+ }
+ } else {
+ /* convert i from "first prog to remove" to "first to adjust" */
+ if (env->subprog_info[i].start == off)
+ i++;
}
+
+ /* update fake 'exit' subprog as well */
+ for (; i <= env->subprog_cnt; i++)
+ env->subprog_info[i].start -= cnt;
+
+ return 0;
}
-/* convert load instructions that access fields of 'struct __sk_buff'
- * into sequence of instructions that access fields of 'struct sk_buff'
+static int bpf_adj_linfo_after_remove(struct bpf_verifier_env *env, u32 off,
+ u32 cnt)
+{
+ struct bpf_prog *prog = env->prog;
+ u32 i, l_off, l_cnt, nr_linfo;
+ struct bpf_line_info *linfo;
+
+ nr_linfo = prog->aux->nr_linfo;
+ if (!nr_linfo)
+ return 0;
+
+ linfo = prog->aux->linfo;
+
+ /* find first line info to remove, count lines to be removed */
+ for (i = 0; i < nr_linfo; i++)
+ if (linfo[i].insn_off >= off)
+ break;
+
+ l_off = i;
+ l_cnt = 0;
+ for (; i < nr_linfo; i++)
+ if (linfo[i].insn_off < off + cnt)
+ l_cnt++;
+ else
+ break;
+
+ /* First live insn doesn't match first live linfo, it needs to "inherit"
+ * last removed linfo. prog is already modified, so prog->len == off
+ * means no live instructions after (tail of the program was removed).
+ */
+ if (prog->len != off && l_cnt &&
+ (i == nr_linfo || linfo[i].insn_off != off + cnt)) {
+ l_cnt--;
+ linfo[--i].insn_off = off + cnt;
+ }
+
+ /* remove the line info which refer to the removed instructions */
+ if (l_cnt) {
+ memmove(linfo + l_off, linfo + i,
+ sizeof(*linfo) * (nr_linfo - i));
+
+ prog->aux->nr_linfo -= l_cnt;
+ nr_linfo = prog->aux->nr_linfo;
+ }
+
+ /* pull all linfo[i].insn_off >= off + cnt in by cnt */
+ for (i = l_off; i < nr_linfo; i++)
+ linfo[i].insn_off -= cnt;
+
+ /* fix up all subprogs (incl. 'exit') which start >= off */
+ for (i = 0; i <= env->subprog_cnt; i++)
+ if (env->subprog_info[i].linfo_idx > l_off) {
+ /* program may have started in the removed region but
+ * may not be fully removed
+ */
+ if (env->subprog_info[i].linfo_idx >= l_off + l_cnt)
+ env->subprog_info[i].linfo_idx -= l_cnt;
+ else
+ env->subprog_info[i].linfo_idx = l_off;
+ }
+
+ return 0;
+}
+
+/*
+ * Clean up dynamically allocated fields of aux data for instructions [start, ...]
+ */
+static void clear_insn_aux_data(struct bpf_verifier_env *env, int start, int len)
+{
+ struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
+ struct bpf_insn *insns = env->prog->insnsi;
+ int end = start + len;
+ int i;
+
+ for (i = start; i < end; i++) {
+ if (aux_data[i].jt) {
+ kvfree(aux_data[i].jt);
+ aux_data[i].jt = NULL;
+ }
+
+ if (bpf_is_ldimm64(&insns[i]))
+ i++;
+ }
+}
+
+static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt)
+{
+ struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
+ unsigned int orig_prog_len = env->prog->len;
+ int err;
+
+ if (bpf_prog_is_offloaded(env->prog->aux))
+ bpf_prog_offload_remove_insns(env, off, cnt);
+
+ /* Should be called before bpf_remove_insns, as it uses prog->insnsi */
+ clear_insn_aux_data(env, off, cnt);
+
+ err = bpf_remove_insns(env->prog, off, cnt);
+ if (err)
+ return err;
+
+ err = adjust_subprog_starts_after_remove(env, off, cnt);
+ if (err)
+ return err;
+
+ err = bpf_adj_linfo_after_remove(env, off, cnt);
+ if (err)
+ return err;
+
+ adjust_insn_arrays_after_remove(env, off, cnt);
+
+ memmove(aux_data + off, aux_data + off + cnt,
+ sizeof(*aux_data) * (orig_prog_len - off - cnt));
+
+ return 0;
+}
+
+/* The verifier does more data flow analysis than llvm and will not
+ * explore branches that are dead at run time. Malicious programs can
+ * have dead code too. Therefore replace all dead at-run-time code
+ * with 'ja -1'.
+ *
+ * Just nops are not optimal, e.g. if they would sit at the end of the
+ * program and through another bug we would manage to jump there, then
+ * we'd execute beyond program memory otherwise. Returning exception
+ * code also wouldn't work since we can have subprogs where the dead
+ * code could be located.
*/
-static int convert_ctx_accesses(struct verifier_env *env)
+static void sanitize_dead_code(struct bpf_verifier_env *env)
+{
+ struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
+ struct bpf_insn trap = BPF_JMP_IMM(BPF_JA, 0, 0, -1);
+ struct bpf_insn *insn = env->prog->insnsi;
+ const int insn_cnt = env->prog->len;
+ int i;
+
+ for (i = 0; i < insn_cnt; i++) {
+ if (aux_data[i].seen)
+ continue;
+ memcpy(insn + i, &trap, sizeof(trap));
+ aux_data[i].zext_dst = false;
+ }
+}
+
+static bool insn_is_cond_jump(u8 code)
+{
+ u8 op;
+
+ op = BPF_OP(code);
+ if (BPF_CLASS(code) == BPF_JMP32)
+ return op != BPF_JA;
+
+ if (BPF_CLASS(code) != BPF_JMP)
+ return false;
+
+ return op != BPF_JA && op != BPF_EXIT && op != BPF_CALL;
+}
+
+static void opt_hard_wire_dead_code_branches(struct bpf_verifier_env *env)
+{
+ struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
+ struct bpf_insn ja = BPF_JMP_IMM(BPF_JA, 0, 0, 0);
+ struct bpf_insn *insn = env->prog->insnsi;
+ const int insn_cnt = env->prog->len;
+ int i;
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ if (!insn_is_cond_jump(insn->code))
+ continue;
+
+ if (!aux_data[i + 1].seen)
+ ja.off = insn->off;
+ else if (!aux_data[i + 1 + insn->off].seen)
+ ja.off = 0;
+ else
+ continue;
+
+ if (bpf_prog_is_offloaded(env->prog->aux))
+ bpf_prog_offload_replace_insn(env, i, &ja);
+
+ memcpy(insn, &ja, sizeof(ja));
+ }
+}
+
+static int opt_remove_dead_code(struct bpf_verifier_env *env)
+{
+ struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
+ int insn_cnt = env->prog->len;
+ int i, err;
+
+ for (i = 0; i < insn_cnt; i++) {
+ int j;
+
+ j = 0;
+ while (i + j < insn_cnt && !aux_data[i + j].seen)
+ j++;
+ if (!j)
+ continue;
+
+ err = verifier_remove_insns(env, i, j);
+ if (err)
+ return err;
+ insn_cnt = env->prog->len;
+ }
+
+ return 0;
+}
+
+static const struct bpf_insn NOP = BPF_JMP_IMM(BPF_JA, 0, 0, 0);
+static const struct bpf_insn MAY_GOTO_0 = BPF_RAW_INSN(BPF_JMP | BPF_JCOND, 0, 0, 0, 0);
+
+static int opt_remove_nops(struct bpf_verifier_env *env)
{
struct bpf_insn *insn = env->prog->insnsi;
int insn_cnt = env->prog->len;
- struct bpf_insn insn_buf[16];
+ bool is_may_goto_0, is_ja;
+ int i, err;
+
+ for (i = 0; i < insn_cnt; i++) {
+ is_may_goto_0 = !memcmp(&insn[i], &MAY_GOTO_0, sizeof(MAY_GOTO_0));
+ is_ja = !memcmp(&insn[i], &NOP, sizeof(NOP));
+
+ if (!is_may_goto_0 && !is_ja)
+ continue;
+
+ err = verifier_remove_insns(env, i, 1);
+ if (err)
+ return err;
+ insn_cnt--;
+ /* Go back one insn to catch may_goto +1; may_goto +0 sequence */
+ i -= (is_may_goto_0 && i > 0) ? 2 : 1;
+ }
+
+ return 0;
+}
+
+static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env,
+ const union bpf_attr *attr)
+{
+ struct bpf_insn *patch;
+ /* use env->insn_buf as two independent buffers */
+ struct bpf_insn *zext_patch = env->insn_buf;
+ struct bpf_insn *rnd_hi32_patch = &env->insn_buf[2];
+ struct bpf_insn_aux_data *aux = env->insn_aux_data;
+ int i, patch_len, delta = 0, len = env->prog->len;
+ struct bpf_insn *insns = env->prog->insnsi;
struct bpf_prog *new_prog;
- u32 cnt;
- int i;
+ bool rnd_hi32;
+
+ rnd_hi32 = attr->prog_flags & BPF_F_TEST_RND_HI32;
+ zext_patch[1] = BPF_ZEXT_REG(0);
+ rnd_hi32_patch[1] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, 0);
+ rnd_hi32_patch[2] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_AX, 32);
+ rnd_hi32_patch[3] = BPF_ALU64_REG(BPF_OR, 0, BPF_REG_AX);
+ for (i = 0; i < len; i++) {
+ int adj_idx = i + delta;
+ struct bpf_insn insn;
+ int load_reg;
+
+ insn = insns[adj_idx];
+ load_reg = insn_def_regno(&insn);
+ if (!aux[adj_idx].zext_dst) {
+ u8 code, class;
+ u32 imm_rnd;
+
+ if (!rnd_hi32)
+ continue;
+
+ code = insn.code;
+ class = BPF_CLASS(code);
+ if (load_reg == -1)
+ continue;
+
+ /* NOTE: arg "reg" (the fourth one) is only used for
+ * BPF_STX + SRC_OP, so it is safe to pass NULL
+ * here.
+ */
+ if (is_reg64(&insn, load_reg, NULL, DST_OP)) {
+ if (class == BPF_LD &&
+ BPF_MODE(code) == BPF_IMM)
+ i++;
+ continue;
+ }
+
+ /* ctx load could be transformed into wider load. */
+ if (class == BPF_LDX &&
+ aux[adj_idx].ptr_type == PTR_TO_CTX)
+ continue;
+
+ imm_rnd = get_random_u32();
+ rnd_hi32_patch[0] = insn;
+ rnd_hi32_patch[1].imm = imm_rnd;
+ rnd_hi32_patch[3].dst_reg = load_reg;
+ patch = rnd_hi32_patch;
+ patch_len = 4;
+ goto apply_patch_buffer;
+ }
+
+ /* Add in an zero-extend instruction if a) the JIT has requested
+ * it or b) it's a CMPXCHG.
+ *
+ * The latter is because: BPF_CMPXCHG always loads a value into
+ * R0, therefore always zero-extends. However some archs'
+ * equivalent instruction only does this load when the
+ * comparison is successful. This detail of CMPXCHG is
+ * orthogonal to the general zero-extension behaviour of the
+ * CPU, so it's treated independently of bpf_jit_needs_zext.
+ */
+ if (!bpf_jit_needs_zext() && !is_cmpxchg_insn(&insn))
+ continue;
+
+ /* Zero-extension is done by the caller. */
+ if (bpf_pseudo_kfunc_call(&insn))
+ continue;
+
+ if (verifier_bug_if(load_reg == -1, env,
+ "zext_dst is set, but no reg is defined"))
+ return -EFAULT;
+
+ zext_patch[0] = insn;
+ zext_patch[1].dst_reg = load_reg;
+ zext_patch[1].src_reg = load_reg;
+ patch = zext_patch;
+ patch_len = 2;
+apply_patch_buffer:
+ new_prog = bpf_patch_insn_data(env, adj_idx, patch, patch_len);
+ if (!new_prog)
+ return -ENOMEM;
+ env->prog = new_prog;
+ insns = new_prog->insnsi;
+ aux = env->insn_aux_data;
+ delta += patch_len - 1;
+ }
+
+ return 0;
+}
+
+/* convert load instructions that access fields of a context type into a
+ * sequence of instructions that access fields of the underlying structure:
+ * struct __sk_buff -> struct sk_buff
+ * struct bpf_sock_ops -> struct sock
+ */
+static int convert_ctx_accesses(struct bpf_verifier_env *env)
+{
+ struct bpf_subprog_info *subprogs = env->subprog_info;
+ const struct bpf_verifier_ops *ops = env->ops;
+ int i, cnt, size, ctx_field_size, ret, delta = 0, epilogue_cnt = 0;
+ const int insn_cnt = env->prog->len;
+ struct bpf_insn *epilogue_buf = env->epilogue_buf;
+ struct bpf_insn *insn_buf = env->insn_buf;
+ struct bpf_insn *insn;
+ u32 target_size, size_default, off;
+ struct bpf_prog *new_prog;
+ enum bpf_access_type type;
+ bool is_narrower_load;
+ int epilogue_idx = 0;
+
+ if (ops->gen_epilogue) {
+ epilogue_cnt = ops->gen_epilogue(epilogue_buf, env->prog,
+ -(subprogs[0].stack_depth + 8));
+ if (epilogue_cnt >= INSN_BUF_SIZE) {
+ verifier_bug(env, "epilogue is too long");
+ return -EFAULT;
+ } else if (epilogue_cnt) {
+ /* Save the ARG_PTR_TO_CTX for the epilogue to use */
+ cnt = 0;
+ subprogs[0].stack_depth += 8;
+ insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_FP, BPF_REG_1,
+ -subprogs[0].stack_depth);
+ insn_buf[cnt++] = env->prog->insnsi[0];
+ new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+ env->prog = new_prog;
+ delta += cnt - 1;
+
+ ret = add_kfunc_in_insns(env, epilogue_buf, epilogue_cnt - 1);
+ if (ret < 0)
+ return ret;
+ }
+ }
+
+ if (ops->gen_prologue || env->seen_direct_write) {
+ if (!ops->gen_prologue) {
+ verifier_bug(env, "gen_prologue is null");
+ return -EFAULT;
+ }
+ cnt = ops->gen_prologue(insn_buf, env->seen_direct_write,
+ env->prog);
+ if (cnt >= INSN_BUF_SIZE) {
+ verifier_bug(env, "prologue is too long");
+ return -EFAULT;
+ } else if (cnt) {
+ new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ env->prog = new_prog;
+ delta += cnt - 1;
+
+ ret = add_kfunc_in_insns(env, insn_buf, cnt - 1);
+ if (ret < 0)
+ return ret;
+ }
+ }
+
+ if (delta)
+ WARN_ON(adjust_jmp_off(env->prog, 0, delta));
- if (!env->prog->aux->ops->convert_ctx_access)
+ if (bpf_prog_is_offloaded(env->prog->aux))
return 0;
+ insn = env->prog->insnsi + delta;
+
for (i = 0; i < insn_cnt; i++, insn++) {
- if (insn->code != (BPF_LDX | BPF_MEM | BPF_W))
+ bpf_convert_ctx_access_t convert_ctx_access;
+ u8 mode;
+
+ if (env->insn_aux_data[i + delta].nospec) {
+ WARN_ON_ONCE(env->insn_aux_data[i + delta].alu_state);
+ struct bpf_insn *patch = insn_buf;
+
+ *patch++ = BPF_ST_NOSPEC();
+ *patch++ = *insn;
+ cnt = patch - insn_buf;
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ /* This can not be easily merged with the
+ * nospec_result-case, because an insn may require a
+ * nospec before and after itself. Therefore also do not
+ * 'continue' here but potentially apply further
+ * patching to insn. *insn should equal patch[1] now.
+ */
+ }
+
+ if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) ||
+ insn->code == (BPF_LDX | BPF_MEM | BPF_H) ||
+ insn->code == (BPF_LDX | BPF_MEM | BPF_W) ||
+ insn->code == (BPF_LDX | BPF_MEM | BPF_DW) ||
+ insn->code == (BPF_LDX | BPF_MEMSX | BPF_B) ||
+ insn->code == (BPF_LDX | BPF_MEMSX | BPF_H) ||
+ insn->code == (BPF_LDX | BPF_MEMSX | BPF_W)) {
+ type = BPF_READ;
+ } else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) ||
+ insn->code == (BPF_STX | BPF_MEM | BPF_H) ||
+ insn->code == (BPF_STX | BPF_MEM | BPF_W) ||
+ insn->code == (BPF_STX | BPF_MEM | BPF_DW) ||
+ insn->code == (BPF_ST | BPF_MEM | BPF_B) ||
+ insn->code == (BPF_ST | BPF_MEM | BPF_H) ||
+ insn->code == (BPF_ST | BPF_MEM | BPF_W) ||
+ insn->code == (BPF_ST | BPF_MEM | BPF_DW)) {
+ type = BPF_WRITE;
+ } else if ((insn->code == (BPF_STX | BPF_ATOMIC | BPF_B) ||
+ insn->code == (BPF_STX | BPF_ATOMIC | BPF_H) ||
+ insn->code == (BPF_STX | BPF_ATOMIC | BPF_W) ||
+ insn->code == (BPF_STX | BPF_ATOMIC | BPF_DW)) &&
+ env->insn_aux_data[i + delta].ptr_type == PTR_TO_ARENA) {
+ insn->code = BPF_STX | BPF_PROBE_ATOMIC | BPF_SIZE(insn->code);
+ env->prog->aux->num_exentries++;
+ continue;
+ } else if (insn->code == (BPF_JMP | BPF_EXIT) &&
+ epilogue_cnt &&
+ i + delta < subprogs[1].start) {
+ /* Generate epilogue for the main prog */
+ if (epilogue_idx) {
+ /* jump back to the earlier generated epilogue */
+ insn_buf[0] = BPF_JMP32_A(epilogue_idx - i - delta - 1);
+ cnt = 1;
+ } else {
+ memcpy(insn_buf, epilogue_buf,
+ epilogue_cnt * sizeof(*epilogue_buf));
+ cnt = epilogue_cnt;
+ /* epilogue_idx cannot be 0. It must have at
+ * least one ctx ptr saving insn before the
+ * epilogue.
+ */
+ epilogue_idx = i + delta;
+ }
+ goto patch_insn_buf;
+ } else {
continue;
+ }
- if (insn->imm != PTR_TO_CTX) {
- /* clear internal mark */
- insn->imm = 0;
+ if (type == BPF_WRITE &&
+ env->insn_aux_data[i + delta].nospec_result) {
+ /* nospec_result is only used to mitigate Spectre v4 and
+ * to limit verification-time for Spectre v1.
+ */
+ struct bpf_insn *patch = insn_buf;
+
+ *patch++ = *insn;
+ *patch++ = BPF_ST_NOSPEC();
+ cnt = patch - insn_buf;
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ continue;
+ }
+
+ switch ((int)env->insn_aux_data[i + delta].ptr_type) {
+ case PTR_TO_CTX:
+ if (!ops->convert_ctx_access)
+ continue;
+ convert_ctx_access = ops->convert_ctx_access;
+ break;
+ case PTR_TO_SOCKET:
+ case PTR_TO_SOCK_COMMON:
+ convert_ctx_access = bpf_sock_convert_ctx_access;
+ break;
+ case PTR_TO_TCP_SOCK:
+ convert_ctx_access = bpf_tcp_sock_convert_ctx_access;
+ break;
+ case PTR_TO_XDP_SOCK:
+ convert_ctx_access = bpf_xdp_sock_convert_ctx_access;
+ break;
+ case PTR_TO_BTF_ID:
+ case PTR_TO_BTF_ID | PTR_UNTRUSTED:
+ /* PTR_TO_BTF_ID | MEM_ALLOC always has a valid lifetime, unlike
+ * PTR_TO_BTF_ID, and an active ref_obj_id, but the same cannot
+ * be said once it is marked PTR_UNTRUSTED, hence we must handle
+ * any faults for loads into such types. BPF_WRITE is disallowed
+ * for this case.
+ */
+ case PTR_TO_BTF_ID | MEM_ALLOC | PTR_UNTRUSTED:
+ case PTR_TO_MEM | MEM_RDONLY | PTR_UNTRUSTED:
+ if (type == BPF_READ) {
+ if (BPF_MODE(insn->code) == BPF_MEM)
+ insn->code = BPF_LDX | BPF_PROBE_MEM |
+ BPF_SIZE((insn)->code);
+ else
+ insn->code = BPF_LDX | BPF_PROBE_MEMSX |
+ BPF_SIZE((insn)->code);
+ env->prog->aux->num_exentries++;
+ }
+ continue;
+ case PTR_TO_ARENA:
+ if (BPF_MODE(insn->code) == BPF_MEMSX) {
+ if (!bpf_jit_supports_insn(insn, true)) {
+ verbose(env, "sign extending loads from arena are not supported yet\n");
+ return -EOPNOTSUPP;
+ }
+ insn->code = BPF_CLASS(insn->code) | BPF_PROBE_MEM32SX | BPF_SIZE(insn->code);
+ } else {
+ insn->code = BPF_CLASS(insn->code) | BPF_PROBE_MEM32 | BPF_SIZE(insn->code);
+ }
+ env->prog->aux->num_exentries++;
+ continue;
+ default:
+ continue;
+ }
+
+ ctx_field_size = env->insn_aux_data[i + delta].ctx_field_size;
+ size = BPF_LDST_BYTES(insn);
+ mode = BPF_MODE(insn->code);
+
+ /* If the read access is a narrower load of the field,
+ * convert to a 4/8-byte load, to minimum program type specific
+ * convert_ctx_access changes. If conversion is successful,
+ * we will apply proper mask to the result.
+ */
+ is_narrower_load = size < ctx_field_size;
+ size_default = bpf_ctx_off_adjust_machine(ctx_field_size);
+ off = insn->off;
+ if (is_narrower_load) {
+ u8 size_code;
+
+ if (type == BPF_WRITE) {
+ verifier_bug(env, "narrow ctx access misconfigured");
+ return -EFAULT;
+ }
+
+ size_code = BPF_H;
+ if (ctx_field_size == 4)
+ size_code = BPF_W;
+ else if (ctx_field_size == 8)
+ size_code = BPF_DW;
+
+ insn->off = off & ~(size_default - 1);
+ insn->code = BPF_LDX | BPF_MEM | size_code;
+ }
+
+ target_size = 0;
+ cnt = convert_ctx_access(type, insn, insn_buf, env->prog,
+ &target_size);
+ if (cnt == 0 || cnt >= INSN_BUF_SIZE ||
+ (ctx_field_size && !target_size)) {
+ verifier_bug(env, "error during ctx access conversion (%d)", cnt);
+ return -EFAULT;
+ }
+
+ if (is_narrower_load && size < target_size) {
+ u8 shift = bpf_ctx_narrow_access_offset(
+ off, size, size_default) * 8;
+ if (shift && cnt + 1 >= INSN_BUF_SIZE) {
+ verifier_bug(env, "narrow ctx load misconfigured");
+ return -EFAULT;
+ }
+ if (ctx_field_size <= 4) {
+ if (shift)
+ insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH,
+ insn->dst_reg,
+ shift);
+ insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg,
+ (1 << size * 8) - 1);
+ } else {
+ if (shift)
+ insn_buf[cnt++] = BPF_ALU64_IMM(BPF_RSH,
+ insn->dst_reg,
+ shift);
+ insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg,
+ (1ULL << size * 8) - 1);
+ }
+ }
+ if (mode == BPF_MEMSX)
+ insn_buf[cnt++] = BPF_RAW_INSN(BPF_ALU64 | BPF_MOV | BPF_X,
+ insn->dst_reg, insn->dst_reg,
+ size * 8, 0);
+
+patch_insn_buf:
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+
+ /* keep walking new program and skip insns we just inserted */
+ env->prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ }
+
+ return 0;
+}
+
+static int jit_subprogs(struct bpf_verifier_env *env)
+{
+ struct bpf_prog *prog = env->prog, **func, *tmp;
+ int i, j, subprog_start, subprog_end = 0, len, subprog;
+ struct bpf_map *map_ptr;
+ struct bpf_insn *insn;
+ void *old_bpf_func;
+ int err, num_exentries;
+ int old_len, subprog_start_adjustment = 0;
+
+ if (env->subprog_cnt <= 1)
+ return 0;
+
+ for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
+ if (!bpf_pseudo_func(insn) && !bpf_pseudo_call(insn))
+ continue;
+
+ /* Upon error here we cannot fall back to interpreter but
+ * need a hard reject of the program. Thus -EFAULT is
+ * propagated in any case.
+ */
+ subprog = find_subprog(env, i + insn->imm + 1);
+ if (verifier_bug_if(subprog < 0, env, "No program to jit at insn %d",
+ i + insn->imm + 1))
+ return -EFAULT;
+ /* temporarily remember subprog id inside insn instead of
+ * aux_data, since next loop will split up all insns into funcs
+ */
+ insn->off = subprog;
+ /* remember original imm in case JIT fails and fallback
+ * to interpreter will be needed
+ */
+ env->insn_aux_data[i].call_imm = insn->imm;
+ /* point imm to __bpf_call_base+1 from JITs point of view */
+ insn->imm = 1;
+ if (bpf_pseudo_func(insn)) {
+#if defined(MODULES_VADDR)
+ u64 addr = MODULES_VADDR;
+#else
+ u64 addr = VMALLOC_START;
+#endif
+ /* jit (e.g. x86_64) may emit fewer instructions
+ * if it learns a u32 imm is the same as a u64 imm.
+ * Set close enough to possible prog address.
+ */
+ insn[0].imm = (u32)addr;
+ insn[1].imm = addr >> 32;
+ }
+ }
+
+ err = bpf_prog_alloc_jited_linfo(prog);
+ if (err)
+ goto out_undo_insn;
+
+ err = -ENOMEM;
+ func = kcalloc(env->subprog_cnt, sizeof(prog), GFP_KERNEL);
+ if (!func)
+ goto out_undo_insn;
+
+ for (i = 0; i < env->subprog_cnt; i++) {
+ subprog_start = subprog_end;
+ subprog_end = env->subprog_info[i + 1].start;
+
+ len = subprog_end - subprog_start;
+ /* bpf_prog_run() doesn't call subprogs directly,
+ * hence main prog stats include the runtime of subprogs.
+ * subprogs don't have IDs and not reachable via prog_get_next_id
+ * func[i]->stats will never be accessed and stays NULL
+ */
+ func[i] = bpf_prog_alloc_no_stats(bpf_prog_size(len), GFP_USER);
+ if (!func[i])
+ goto out_free;
+ memcpy(func[i]->insnsi, &prog->insnsi[subprog_start],
+ len * sizeof(struct bpf_insn));
+ func[i]->type = prog->type;
+ func[i]->len = len;
+ if (bpf_prog_calc_tag(func[i]))
+ goto out_free;
+ func[i]->is_func = 1;
+ func[i]->sleepable = prog->sleepable;
+ func[i]->aux->func_idx = i;
+ /* Below members will be freed only at prog->aux */
+ func[i]->aux->btf = prog->aux->btf;
+ func[i]->aux->subprog_start = subprog_start + subprog_start_adjustment;
+ func[i]->aux->func_info = prog->aux->func_info;
+ func[i]->aux->func_info_cnt = prog->aux->func_info_cnt;
+ func[i]->aux->poke_tab = prog->aux->poke_tab;
+ func[i]->aux->size_poke_tab = prog->aux->size_poke_tab;
+ func[i]->aux->main_prog_aux = prog->aux;
+
+ for (j = 0; j < prog->aux->size_poke_tab; j++) {
+ struct bpf_jit_poke_descriptor *poke;
+
+ poke = &prog->aux->poke_tab[j];
+ if (poke->insn_idx < subprog_end &&
+ poke->insn_idx >= subprog_start)
+ poke->aux = func[i]->aux;
+ }
+
+ func[i]->aux->name[0] = 'F';
+ func[i]->aux->stack_depth = env->subprog_info[i].stack_depth;
+ if (env->subprog_info[i].priv_stack_mode == PRIV_STACK_ADAPTIVE)
+ func[i]->aux->jits_use_priv_stack = true;
+
+ func[i]->jit_requested = 1;
+ func[i]->blinding_requested = prog->blinding_requested;
+ func[i]->aux->kfunc_tab = prog->aux->kfunc_tab;
+ func[i]->aux->kfunc_btf_tab = prog->aux->kfunc_btf_tab;
+ func[i]->aux->linfo = prog->aux->linfo;
+ func[i]->aux->nr_linfo = prog->aux->nr_linfo;
+ func[i]->aux->jited_linfo = prog->aux->jited_linfo;
+ func[i]->aux->linfo_idx = env->subprog_info[i].linfo_idx;
+ func[i]->aux->arena = prog->aux->arena;
+ func[i]->aux->used_maps = env->used_maps;
+ func[i]->aux->used_map_cnt = env->used_map_cnt;
+ num_exentries = 0;
+ insn = func[i]->insnsi;
+ for (j = 0; j < func[i]->len; j++, insn++) {
+ if (BPF_CLASS(insn->code) == BPF_LDX &&
+ (BPF_MODE(insn->code) == BPF_PROBE_MEM ||
+ BPF_MODE(insn->code) == BPF_PROBE_MEM32 ||
+ BPF_MODE(insn->code) == BPF_PROBE_MEM32SX ||
+ BPF_MODE(insn->code) == BPF_PROBE_MEMSX))
+ num_exentries++;
+ if ((BPF_CLASS(insn->code) == BPF_STX ||
+ BPF_CLASS(insn->code) == BPF_ST) &&
+ BPF_MODE(insn->code) == BPF_PROBE_MEM32)
+ num_exentries++;
+ if (BPF_CLASS(insn->code) == BPF_STX &&
+ BPF_MODE(insn->code) == BPF_PROBE_ATOMIC)
+ num_exentries++;
+ }
+ func[i]->aux->num_exentries = num_exentries;
+ func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable;
+ func[i]->aux->exception_cb = env->subprog_info[i].is_exception_cb;
+ func[i]->aux->changes_pkt_data = env->subprog_info[i].changes_pkt_data;
+ func[i]->aux->might_sleep = env->subprog_info[i].might_sleep;
+ if (!i)
+ func[i]->aux->exception_boundary = env->seen_exception;
+
+ /*
+ * To properly pass the absolute subprog start to jit
+ * all instruction adjustments should be accumulated
+ */
+ old_len = func[i]->len;
+ func[i] = bpf_int_jit_compile(func[i]);
+ subprog_start_adjustment += func[i]->len - old_len;
+
+ if (!func[i]->jited) {
+ err = -ENOTSUPP;
+ goto out_free;
+ }
+ cond_resched();
+ }
+
+ /* at this point all bpf functions were successfully JITed
+ * now populate all bpf_calls with correct addresses and
+ * run last pass of JIT
+ */
+ for (i = 0; i < env->subprog_cnt; i++) {
+ insn = func[i]->insnsi;
+ for (j = 0; j < func[i]->len; j++, insn++) {
+ if (bpf_pseudo_func(insn)) {
+ subprog = insn->off;
+ insn[0].imm = (u32)(long)func[subprog]->bpf_func;
+ insn[1].imm = ((u64)(long)func[subprog]->bpf_func) >> 32;
+ continue;
+ }
+ if (!bpf_pseudo_call(insn))
+ continue;
+ subprog = insn->off;
+ insn->imm = BPF_CALL_IMM(func[subprog]->bpf_func);
+ }
+
+ /* we use the aux data to keep a list of the start addresses
+ * of the JITed images for each function in the program
+ *
+ * for some architectures, such as powerpc64, the imm field
+ * might not be large enough to hold the offset of the start
+ * address of the callee's JITed image from __bpf_call_base
+ *
+ * in such cases, we can lookup the start address of a callee
+ * by using its subprog id, available from the off field of
+ * the call instruction, as an index for this list
+ */
+ func[i]->aux->func = func;
+ func[i]->aux->func_cnt = env->subprog_cnt - env->hidden_subprog_cnt;
+ func[i]->aux->real_func_cnt = env->subprog_cnt;
+ }
+ for (i = 0; i < env->subprog_cnt; i++) {
+ old_bpf_func = func[i]->bpf_func;
+ tmp = bpf_int_jit_compile(func[i]);
+ if (tmp != func[i] || func[i]->bpf_func != old_bpf_func) {
+ verbose(env, "JIT doesn't support bpf-to-bpf calls\n");
+ err = -ENOTSUPP;
+ goto out_free;
+ }
+ cond_resched();
+ }
+
+ /*
+ * Cleanup func[i]->aux fields which aren't required
+ * or can become invalid in future
+ */
+ for (i = 0; i < env->subprog_cnt; i++) {
+ func[i]->aux->used_maps = NULL;
+ func[i]->aux->used_map_cnt = 0;
+ }
+
+ /* finally lock prog and jit images for all functions and
+ * populate kallsysm. Begin at the first subprogram, since
+ * bpf_prog_load will add the kallsyms for the main program.
+ */
+ for (i = 1; i < env->subprog_cnt; i++) {
+ err = bpf_prog_lock_ro(func[i]);
+ if (err)
+ goto out_free;
+ }
+
+ for (i = 1; i < env->subprog_cnt; i++)
+ bpf_prog_kallsyms_add(func[i]);
+
+ /* Last step: make now unused interpreter insns from main
+ * prog consistent for later dump requests, so they can
+ * later look the same as if they were interpreted only.
+ */
+ for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
+ if (bpf_pseudo_func(insn)) {
+ insn[0].imm = env->insn_aux_data[i].call_imm;
+ insn[1].imm = insn->off;
+ insn->off = 0;
continue;
}
+ if (!bpf_pseudo_call(insn))
+ continue;
+ insn->off = env->insn_aux_data[i].call_imm;
+ subprog = find_subprog(env, i + insn->off + 1);
+ insn->imm = subprog;
+ }
+
+ prog->jited = 1;
+ prog->bpf_func = func[0]->bpf_func;
+ prog->jited_len = func[0]->jited_len;
+ prog->aux->extable = func[0]->aux->extable;
+ prog->aux->num_exentries = func[0]->aux->num_exentries;
+ prog->aux->func = func;
+ prog->aux->func_cnt = env->subprog_cnt - env->hidden_subprog_cnt;
+ prog->aux->real_func_cnt = env->subprog_cnt;
+ prog->aux->bpf_exception_cb = (void *)func[env->exception_callback_subprog]->bpf_func;
+ prog->aux->exception_boundary = func[0]->aux->exception_boundary;
+ bpf_prog_jit_attempt_done(prog);
+ return 0;
+out_free:
+ /* We failed JIT'ing, so at this point we need to unregister poke
+ * descriptors from subprogs, so that kernel is not attempting to
+ * patch it anymore as we're freeing the subprog JIT memory.
+ */
+ for (i = 0; i < prog->aux->size_poke_tab; i++) {
+ map_ptr = prog->aux->poke_tab[i].tail_call.map;
+ map_ptr->ops->map_poke_untrack(map_ptr, prog->aux);
+ }
+ /* At this point we're guaranteed that poke descriptors are not
+ * live anymore. We can just unlink its descriptor table as it's
+ * released with the main prog.
+ */
+ for (i = 0; i < env->subprog_cnt; i++) {
+ if (!func[i])
+ continue;
+ func[i]->aux->poke_tab = NULL;
+ bpf_jit_free(func[i]);
+ }
+ kfree(func);
+out_undo_insn:
+ /* cleanup main prog to be interpreted */
+ prog->jit_requested = 0;
+ prog->blinding_requested = 0;
+ for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
+ if (!bpf_pseudo_call(insn))
+ continue;
+ insn->off = 0;
+ insn->imm = env->insn_aux_data[i].call_imm;
+ }
+ bpf_prog_jit_attempt_done(prog);
+ return err;
+}
- cnt = env->prog->aux->ops->
- convert_ctx_access(insn->dst_reg, insn->src_reg,
- insn->off, insn_buf);
- if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
- verbose("bpf verifier is misconfigured\n");
+static int fixup_call_args(struct bpf_verifier_env *env)
+{
+#ifndef CONFIG_BPF_JIT_ALWAYS_ON
+ struct bpf_prog *prog = env->prog;
+ struct bpf_insn *insn = prog->insnsi;
+ bool has_kfunc_call = bpf_prog_has_kfunc_call(prog);
+ int i, depth;
+#endif
+ int err = 0;
+
+ if (env->prog->jit_requested &&
+ !bpf_prog_is_offloaded(env->prog->aux)) {
+ err = jit_subprogs(env);
+ if (err == 0)
+ return 0;
+ if (err == -EFAULT)
+ return err;
+ }
+#ifndef CONFIG_BPF_JIT_ALWAYS_ON
+ if (has_kfunc_call) {
+ verbose(env, "calling kernel functions are not allowed in non-JITed programs\n");
+ return -EINVAL;
+ }
+ if (env->subprog_cnt > 1 && env->prog->aux->tail_call_reachable) {
+ /* When JIT fails the progs with bpf2bpf calls and tail_calls
+ * have to be rejected, since interpreter doesn't support them yet.
+ */
+ verbose(env, "tail_calls are not allowed in non-JITed programs with bpf-to-bpf calls\n");
+ return -EINVAL;
+ }
+ for (i = 0; i < prog->len; i++, insn++) {
+ if (bpf_pseudo_func(insn)) {
+ /* When JIT fails the progs with callback calls
+ * have to be rejected, since interpreter doesn't support them yet.
+ */
+ verbose(env, "callbacks are not allowed in non-JITed programs\n");
return -EINVAL;
}
- if (cnt == 1) {
- memcpy(insn, insn_buf, sizeof(*insn));
+ if (!bpf_pseudo_call(insn))
+ continue;
+ depth = get_callee_stack_depth(env, insn, i);
+ if (depth < 0)
+ return depth;
+ bpf_patch_call_args(insn, depth);
+ }
+ err = 0;
+#endif
+ return err;
+}
+
+/* replace a generic kfunc with a specialized version if necessary */
+static int specialize_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_desc *desc, int insn_idx)
+{
+ struct bpf_prog *prog = env->prog;
+ bool seen_direct_write;
+ void *xdp_kfunc;
+ bool is_rdonly;
+ u32 func_id = desc->func_id;
+ u16 offset = desc->offset;
+ unsigned long addr = desc->addr;
+
+ if (offset) /* return if module BTF is used */
+ return 0;
+
+ if (bpf_dev_bound_kfunc_id(func_id)) {
+ xdp_kfunc = bpf_dev_bound_resolve_kfunc(prog, func_id);
+ if (xdp_kfunc)
+ addr = (unsigned long)xdp_kfunc;
+ /* fallback to default kfunc when not supported by netdev */
+ } else if (func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) {
+ seen_direct_write = env->seen_direct_write;
+ is_rdonly = !may_access_direct_pkt_data(env, NULL, BPF_WRITE);
+
+ if (is_rdonly)
+ addr = (unsigned long)bpf_dynptr_from_skb_rdonly;
+
+ /* restore env->seen_direct_write to its original value, since
+ * may_access_direct_pkt_data mutates it
+ */
+ env->seen_direct_write = seen_direct_write;
+ } else if (func_id == special_kfunc_list[KF_bpf_set_dentry_xattr]) {
+ if (bpf_lsm_has_d_inode_locked(prog))
+ addr = (unsigned long)bpf_set_dentry_xattr_locked;
+ } else if (func_id == special_kfunc_list[KF_bpf_remove_dentry_xattr]) {
+ if (bpf_lsm_has_d_inode_locked(prog))
+ addr = (unsigned long)bpf_remove_dentry_xattr_locked;
+ } else if (func_id == special_kfunc_list[KF_bpf_dynptr_from_file]) {
+ if (!env->insn_aux_data[insn_idx].non_sleepable)
+ addr = (unsigned long)bpf_dynptr_from_file_sleepable;
+ }
+ desc->addr = addr;
+ return 0;
+}
+
+static void __fixup_collection_insert_kfunc(struct bpf_insn_aux_data *insn_aux,
+ u16 struct_meta_reg,
+ u16 node_offset_reg,
+ struct bpf_insn *insn,
+ struct bpf_insn *insn_buf,
+ int *cnt)
+{
+ struct btf_struct_meta *kptr_struct_meta = insn_aux->kptr_struct_meta;
+ struct bpf_insn addr[2] = { BPF_LD_IMM64(struct_meta_reg, (long)kptr_struct_meta) };
+
+ insn_buf[0] = addr[0];
+ insn_buf[1] = addr[1];
+ insn_buf[2] = BPF_MOV64_IMM(node_offset_reg, insn_aux->insert_off);
+ insn_buf[3] = *insn;
+ *cnt = 4;
+}
+
+static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ struct bpf_insn *insn_buf, int insn_idx, int *cnt)
+{
+ struct bpf_kfunc_desc *desc;
+ int err;
+
+ if (!insn->imm) {
+ verbose(env, "invalid kernel function call not eliminated in verifier pass\n");
+ return -EINVAL;
+ }
+
+ *cnt = 0;
+
+ /* insn->imm has the btf func_id. Replace it with an offset relative to
+ * __bpf_call_base, unless the JIT needs to call functions that are
+ * further than 32 bits away (bpf_jit_supports_far_kfunc_call()).
+ */
+ desc = find_kfunc_desc(env->prog, insn->imm, insn->off);
+ if (!desc) {
+ verifier_bug(env, "kernel function descriptor not found for func_id %u",
+ insn->imm);
+ return -EFAULT;
+ }
+
+ err = specialize_kfunc(env, desc, insn_idx);
+ if (err)
+ return err;
+
+ if (!bpf_jit_supports_far_kfunc_call())
+ insn->imm = BPF_CALL_IMM(desc->addr);
+ if (insn->off)
+ return 0;
+ if (desc->func_id == special_kfunc_list[KF_bpf_obj_new_impl] ||
+ desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
+ struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta;
+ struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) };
+ u64 obj_new_size = env->insn_aux_data[insn_idx].obj_new_size;
+
+ if (desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl] && kptr_struct_meta) {
+ verifier_bug(env, "NULL kptr_struct_meta expected at insn_idx %d",
+ insn_idx);
+ return -EFAULT;
+ }
+
+ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_1, obj_new_size);
+ insn_buf[1] = addr[0];
+ insn_buf[2] = addr[1];
+ insn_buf[3] = *insn;
+ *cnt = 4;
+ } else if (desc->func_id == special_kfunc_list[KF_bpf_obj_drop_impl] ||
+ desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl] ||
+ desc->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) {
+ struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta;
+ struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) };
+
+ if (desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl] && kptr_struct_meta) {
+ verifier_bug(env, "NULL kptr_struct_meta expected at insn_idx %d",
+ insn_idx);
+ return -EFAULT;
+ }
+
+ if (desc->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl] &&
+ !kptr_struct_meta) {
+ verifier_bug(env, "kptr_struct_meta expected at insn_idx %d",
+ insn_idx);
+ return -EFAULT;
+ }
+
+ insn_buf[0] = addr[0];
+ insn_buf[1] = addr[1];
+ insn_buf[2] = *insn;
+ *cnt = 3;
+ } else if (desc->func_id == special_kfunc_list[KF_bpf_list_push_back_impl] ||
+ desc->func_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
+ desc->func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
+ struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta;
+ int struct_meta_reg = BPF_REG_3;
+ int node_offset_reg = BPF_REG_4;
+
+ /* rbtree_add has extra 'less' arg, so args-to-fixup are in diff regs */
+ if (desc->func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
+ struct_meta_reg = BPF_REG_4;
+ node_offset_reg = BPF_REG_5;
+ }
+
+ if (!kptr_struct_meta) {
+ verifier_bug(env, "kptr_struct_meta expected at insn_idx %d",
+ insn_idx);
+ return -EFAULT;
+ }
+
+ __fixup_collection_insert_kfunc(&env->insn_aux_data[insn_idx], struct_meta_reg,
+ node_offset_reg, insn, insn_buf, cnt);
+ } else if (desc->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] ||
+ desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
+ insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1);
+ *cnt = 1;
+ }
+
+ if (env->insn_aux_data[insn_idx].arg_prog) {
+ u32 regno = env->insn_aux_data[insn_idx].arg_prog;
+ struct bpf_insn ld_addrs[2] = { BPF_LD_IMM64(regno, (long)env->prog->aux) };
+ int idx = *cnt;
+
+ insn_buf[idx++] = ld_addrs[0];
+ insn_buf[idx++] = ld_addrs[1];
+ insn_buf[idx++] = *insn;
+ *cnt = idx;
+ }
+ return 0;
+}
+
+/* The function requires that first instruction in 'patch' is insnsi[prog->len - 1] */
+static int add_hidden_subprog(struct bpf_verifier_env *env, struct bpf_insn *patch, int len)
+{
+ struct bpf_subprog_info *info = env->subprog_info;
+ int cnt = env->subprog_cnt;
+ struct bpf_prog *prog;
+
+ /* We only reserve one slot for hidden subprogs in subprog_info. */
+ if (env->hidden_subprog_cnt) {
+ verifier_bug(env, "only one hidden subprog supported");
+ return -EFAULT;
+ }
+ /* We're not patching any existing instruction, just appending the new
+ * ones for the hidden subprog. Hence all of the adjustment operations
+ * in bpf_patch_insn_data are no-ops.
+ */
+ prog = bpf_patch_insn_data(env, env->prog->len - 1, patch, len);
+ if (!prog)
+ return -ENOMEM;
+ env->prog = prog;
+ info[cnt + 1].start = info[cnt].start;
+ info[cnt].start = prog->len - len + 1;
+ env->subprog_cnt++;
+ env->hidden_subprog_cnt++;
+ return 0;
+}
+
+/* Do various post-verification rewrites in a single program pass.
+ * These rewrites simplify JIT and interpreter implementations.
+ */
+static int do_misc_fixups(struct bpf_verifier_env *env)
+{
+ struct bpf_prog *prog = env->prog;
+ enum bpf_attach_type eatype = prog->expected_attach_type;
+ enum bpf_prog_type prog_type = resolve_prog_type(prog);
+ struct bpf_insn *insn = prog->insnsi;
+ const struct bpf_func_proto *fn;
+ const int insn_cnt = prog->len;
+ const struct bpf_map_ops *ops;
+ struct bpf_insn_aux_data *aux;
+ struct bpf_insn *insn_buf = env->insn_buf;
+ struct bpf_prog *new_prog;
+ struct bpf_map *map_ptr;
+ int i, ret, cnt, delta = 0, cur_subprog = 0;
+ struct bpf_subprog_info *subprogs = env->subprog_info;
+ u16 stack_depth = subprogs[cur_subprog].stack_depth;
+ u16 stack_depth_extra = 0;
+
+ if (env->seen_exception && !env->exception_callback_subprog) {
+ struct bpf_insn *patch = insn_buf;
+
+ *patch++ = env->prog->insnsi[insn_cnt - 1];
+ *patch++ = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1);
+ *patch++ = BPF_EXIT_INSN();
+ ret = add_hidden_subprog(env, insn_buf, patch - insn_buf);
+ if (ret < 0)
+ return ret;
+ prog = env->prog;
+ insn = prog->insnsi;
+
+ env->exception_callback_subprog = env->subprog_cnt - 1;
+ /* Don't update insn_cnt, as add_hidden_subprog always appends insns */
+ mark_subprog_exc_cb(env, env->exception_callback_subprog);
+ }
+
+ for (i = 0; i < insn_cnt;) {
+ if (insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) && insn->imm) {
+ if ((insn->off == BPF_ADDR_SPACE_CAST && insn->imm == 1) ||
+ (((struct bpf_map *)env->prog->aux->arena)->map_flags & BPF_F_NO_USER_CONV)) {
+ /* convert to 32-bit mov that clears upper 32-bit */
+ insn->code = BPF_ALU | BPF_MOV | BPF_X;
+ /* clear off and imm, so it's a normal 'wX = wY' from JIT pov */
+ insn->off = 0;
+ insn->imm = 0;
+ } /* cast from as(0) to as(1) should be handled by JIT */
+ goto next_insn;
+ }
+
+ if (env->insn_aux_data[i + delta].needs_zext)
+ /* Convert BPF_CLASS(insn->code) == BPF_ALU64 to 32-bit ALU */
+ insn->code = BPF_ALU | BPF_OP(insn->code) | BPF_SRC(insn->code);
+
+ /* Make sdiv/smod divide-by-minus-one exceptions impossible. */
+ if ((insn->code == (BPF_ALU64 | BPF_MOD | BPF_K) ||
+ insn->code == (BPF_ALU64 | BPF_DIV | BPF_K) ||
+ insn->code == (BPF_ALU | BPF_MOD | BPF_K) ||
+ insn->code == (BPF_ALU | BPF_DIV | BPF_K)) &&
+ insn->off == 1 && insn->imm == -1) {
+ bool is64 = BPF_CLASS(insn->code) == BPF_ALU64;
+ bool isdiv = BPF_OP(insn->code) == BPF_DIV;
+ struct bpf_insn *patch = insn_buf;
+
+ if (isdiv)
+ *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) |
+ BPF_NEG | BPF_K, insn->dst_reg,
+ 0, 0, 0);
+ else
+ *patch++ = BPF_MOV32_IMM(insn->dst_reg, 0);
+
+ cnt = patch - insn_buf;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+
+ /* Make divide-by-zero and divide-by-minus-one exceptions impossible. */
+ if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) ||
+ insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) ||
+ insn->code == (BPF_ALU | BPF_MOD | BPF_X) ||
+ insn->code == (BPF_ALU | BPF_DIV | BPF_X)) {
+ bool is64 = BPF_CLASS(insn->code) == BPF_ALU64;
+ bool isdiv = BPF_OP(insn->code) == BPF_DIV;
+ bool is_sdiv = isdiv && insn->off == 1;
+ bool is_smod = !isdiv && insn->off == 1;
+ struct bpf_insn *patch = insn_buf;
+
+ if (is_sdiv) {
+ /* [R,W]x sdiv 0 -> 0
+ * LLONG_MIN sdiv -1 -> LLONG_MIN
+ * INT_MIN sdiv -1 -> INT_MIN
+ */
+ *patch++ = BPF_MOV64_REG(BPF_REG_AX, insn->src_reg);
+ *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) |
+ BPF_ADD | BPF_K, BPF_REG_AX,
+ 0, 0, 1);
+ *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
+ BPF_JGT | BPF_K, BPF_REG_AX,
+ 0, 4, 1);
+ *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
+ BPF_JEQ | BPF_K, BPF_REG_AX,
+ 0, 1, 0);
+ *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) |
+ BPF_MOV | BPF_K, insn->dst_reg,
+ 0, 0, 0);
+ /* BPF_NEG(LLONG_MIN) == -LLONG_MIN == LLONG_MIN */
+ *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) |
+ BPF_NEG | BPF_K, insn->dst_reg,
+ 0, 0, 0);
+ *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
+ *patch++ = *insn;
+ cnt = patch - insn_buf;
+ } else if (is_smod) {
+ /* [R,W]x mod 0 -> [R,W]x */
+ /* [R,W]x mod -1 -> 0 */
+ *patch++ = BPF_MOV64_REG(BPF_REG_AX, insn->src_reg);
+ *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) |
+ BPF_ADD | BPF_K, BPF_REG_AX,
+ 0, 0, 1);
+ *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
+ BPF_JGT | BPF_K, BPF_REG_AX,
+ 0, 3, 1);
+ *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
+ BPF_JEQ | BPF_K, BPF_REG_AX,
+ 0, 3 + (is64 ? 0 : 1), 1);
+ *patch++ = BPF_MOV32_IMM(insn->dst_reg, 0);
+ *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
+ *patch++ = *insn;
+
+ if (!is64) {
+ *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
+ *patch++ = BPF_MOV32_REG(insn->dst_reg, insn->dst_reg);
+ }
+ cnt = patch - insn_buf;
+ } else if (isdiv) {
+ /* [R,W]x div 0 -> 0 */
+ *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
+ BPF_JNE | BPF_K, insn->src_reg,
+ 0, 2, 0);
+ *patch++ = BPF_ALU32_REG(BPF_XOR, insn->dst_reg, insn->dst_reg);
+ *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
+ *patch++ = *insn;
+ cnt = patch - insn_buf;
+ } else {
+ /* [R,W]x mod 0 -> [R,W]x */
+ *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
+ BPF_JEQ | BPF_K, insn->src_reg,
+ 0, 1 + (is64 ? 0 : 1), 0);
+ *patch++ = *insn;
+
+ if (!is64) {
+ *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
+ *patch++ = BPF_MOV32_REG(insn->dst_reg, insn->dst_reg);
+ }
+ cnt = patch - insn_buf;
+ }
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+
+ /* Make it impossible to de-reference a userspace address */
+ if (BPF_CLASS(insn->code) == BPF_LDX &&
+ (BPF_MODE(insn->code) == BPF_PROBE_MEM ||
+ BPF_MODE(insn->code) == BPF_PROBE_MEMSX)) {
+ struct bpf_insn *patch = insn_buf;
+ u64 uaddress_limit = bpf_arch_uaddress_limit();
+
+ if (!uaddress_limit)
+ goto next_insn;
+
+ *patch++ = BPF_MOV64_REG(BPF_REG_AX, insn->src_reg);
+ if (insn->off)
+ *patch++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_AX, insn->off);
+ *patch++ = BPF_ALU64_IMM(BPF_RSH, BPF_REG_AX, 32);
+ *patch++ = BPF_JMP_IMM(BPF_JLE, BPF_REG_AX, uaddress_limit >> 32, 2);
+ *patch++ = *insn;
+ *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
+ *patch++ = BPF_MOV64_IMM(insn->dst_reg, 0);
+
+ cnt = patch - insn_buf;
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+
+ /* Implement LD_ABS and LD_IND with a rewrite, if supported by the program type. */
+ if (BPF_CLASS(insn->code) == BPF_LD &&
+ (BPF_MODE(insn->code) == BPF_ABS ||
+ BPF_MODE(insn->code) == BPF_IND)) {
+ cnt = env->ops->gen_ld_abs(insn, insn_buf);
+ if (cnt == 0 || cnt >= INSN_BUF_SIZE) {
+ verifier_bug(env, "%d insns generated for ld_abs", cnt);
+ return -EFAULT;
+ }
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+
+ /* Rewrite pointer arithmetic to mitigate speculation attacks. */
+ if (insn->code == (BPF_ALU64 | BPF_ADD | BPF_X) ||
+ insn->code == (BPF_ALU64 | BPF_SUB | BPF_X)) {
+ const u8 code_add = BPF_ALU64 | BPF_ADD | BPF_X;
+ const u8 code_sub = BPF_ALU64 | BPF_SUB | BPF_X;
+ struct bpf_insn *patch = insn_buf;
+ bool issrc, isneg, isimm;
+ u32 off_reg;
+
+ aux = &env->insn_aux_data[i + delta];
+ if (!aux->alu_state ||
+ aux->alu_state == BPF_ALU_NON_POINTER)
+ goto next_insn;
+
+ isneg = aux->alu_state & BPF_ALU_NEG_VALUE;
+ issrc = (aux->alu_state & BPF_ALU_SANITIZE) ==
+ BPF_ALU_SANITIZE_SRC;
+ isimm = aux->alu_state & BPF_ALU_IMMEDIATE;
+
+ off_reg = issrc ? insn->src_reg : insn->dst_reg;
+ if (isimm) {
+ *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit);
+ } else {
+ if (isneg)
+ *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1);
+ *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit);
+ *patch++ = BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg);
+ *patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg);
+ *patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0);
+ *patch++ = BPF_ALU64_IMM(BPF_ARSH, BPF_REG_AX, 63);
+ *patch++ = BPF_ALU64_REG(BPF_AND, BPF_REG_AX, off_reg);
+ }
+ if (!issrc)
+ *patch++ = BPF_MOV64_REG(insn->dst_reg, insn->src_reg);
+ insn->src_reg = BPF_REG_AX;
+ if (isneg)
+ insn->code = insn->code == code_add ?
+ code_sub : code_add;
+ *patch++ = *insn;
+ if (issrc && isneg && !isimm)
+ *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1);
+ cnt = patch - insn_buf;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+
+ if (is_may_goto_insn(insn) && bpf_jit_supports_timed_may_goto()) {
+ int stack_off_cnt = -stack_depth - 16;
+
+ /*
+ * Two 8 byte slots, depth-16 stores the count, and
+ * depth-8 stores the start timestamp of the loop.
+ *
+ * The starting value of count is BPF_MAX_TIMED_LOOPS
+ * (0xffff). Every iteration loads it and subs it by 1,
+ * until the value becomes 0 in AX (thus, 1 in stack),
+ * after which we call arch_bpf_timed_may_goto, which
+ * either sets AX to 0xffff to keep looping, or to 0
+ * upon timeout. AX is then stored into the stack. In
+ * the next iteration, we either see 0 and break out, or
+ * continue iterating until the next time value is 0
+ * after subtraction, rinse and repeat.
+ */
+ stack_depth_extra = 16;
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_AX, BPF_REG_10, stack_off_cnt);
+ if (insn->off >= 0)
+ insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off + 5);
+ else
+ insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off - 1);
+ insn_buf[2] = BPF_ALU64_IMM(BPF_SUB, BPF_REG_AX, 1);
+ insn_buf[3] = BPF_JMP_IMM(BPF_JNE, BPF_REG_AX, 0, 2);
+ /*
+ * AX is used as an argument to pass in stack_off_cnt
+ * (to add to r10/fp), and also as the return value of
+ * the call to arch_bpf_timed_may_goto.
+ */
+ insn_buf[4] = BPF_MOV64_IMM(BPF_REG_AX, stack_off_cnt);
+ insn_buf[5] = BPF_EMIT_CALL(arch_bpf_timed_may_goto);
+ insn_buf[6] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_AX, stack_off_cnt);
+ cnt = 7;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ } else if (is_may_goto_insn(insn)) {
+ int stack_off = -stack_depth - 8;
+
+ stack_depth_extra = 8;
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_AX, BPF_REG_10, stack_off);
+ if (insn->off >= 0)
+ insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off + 2);
+ else
+ insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off - 1);
+ insn_buf[2] = BPF_ALU64_IMM(BPF_SUB, BPF_REG_AX, 1);
+ insn_buf[3] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_AX, stack_off);
+ cnt = 4;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+
+ if (insn->code != (BPF_JMP | BPF_CALL))
+ goto next_insn;
+ if (insn->src_reg == BPF_PSEUDO_CALL)
+ goto next_insn;
+ if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
+ ret = fixup_kfunc_call(env, insn, insn_buf, i + delta, &cnt);
+ if (ret)
+ return ret;
+ if (cnt == 0)
+ goto next_insn;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+
+ /* Skip inlining the helper call if the JIT does it. */
+ if (bpf_jit_inlines_helper_call(insn->imm))
+ goto next_insn;
+
+ if (insn->imm == BPF_FUNC_get_route_realm)
+ prog->dst_needed = 1;
+ if (insn->imm == BPF_FUNC_get_prandom_u32)
+ bpf_user_rnd_init_once();
+ if (insn->imm == BPF_FUNC_override_return)
+ prog->kprobe_override = 1;
+ if (insn->imm == BPF_FUNC_tail_call) {
+ /* If we tail call into other programs, we
+ * cannot make any assumptions since they can
+ * be replaced dynamically during runtime in
+ * the program array.
+ */
+ prog->cb_access = 1;
+ if (!allow_tail_call_in_subprogs(env))
+ prog->aux->stack_depth = MAX_BPF_STACK;
+ prog->aux->max_pkt_offset = MAX_PACKET_OFF;
+
+ /* mark bpf_tail_call as different opcode to avoid
+ * conditional branch in the interpreter for every normal
+ * call and to prevent accidental JITing by JIT compiler
+ * that doesn't support bpf_tail_call yet
+ */
+ insn->imm = 0;
+ insn->code = BPF_JMP | BPF_TAIL_CALL;
+
+ aux = &env->insn_aux_data[i + delta];
+ if (env->bpf_capable && !prog->blinding_requested &&
+ prog->jit_requested &&
+ !bpf_map_key_poisoned(aux) &&
+ !bpf_map_ptr_poisoned(aux) &&
+ !bpf_map_ptr_unpriv(aux)) {
+ struct bpf_jit_poke_descriptor desc = {
+ .reason = BPF_POKE_REASON_TAIL_CALL,
+ .tail_call.map = aux->map_ptr_state.map_ptr,
+ .tail_call.key = bpf_map_key_immediate(aux),
+ .insn_idx = i + delta,
+ };
+
+ ret = bpf_jit_add_poke_descriptor(prog, &desc);
+ if (ret < 0) {
+ verbose(env, "adding tail call poke descriptor failed\n");
+ return ret;
+ }
+
+ insn->imm = ret + 1;
+ goto next_insn;
+ }
+
+ if (!bpf_map_ptr_unpriv(aux))
+ goto next_insn;
+
+ /* instead of changing every JIT dealing with tail_call
+ * emit two extra insns:
+ * if (index >= max_entries) goto out;
+ * index &= array->index_mask;
+ * to avoid out-of-bounds cpu speculation
+ */
+ if (bpf_map_ptr_poisoned(aux)) {
+ verbose(env, "tail_call abusing map_ptr\n");
+ return -EINVAL;
+ }
+
+ map_ptr = aux->map_ptr_state.map_ptr;
+ insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3,
+ map_ptr->max_entries, 2);
+ insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3,
+ container_of(map_ptr,
+ struct bpf_array,
+ map)->index_mask);
+ insn_buf[2] = *insn;
+ cnt = 3;
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+
+ if (insn->imm == BPF_FUNC_timer_set_callback) {
+ /* The verifier will process callback_fn as many times as necessary
+ * with different maps and the register states prepared by
+ * set_timer_callback_state will be accurate.
+ *
+ * The following use case is valid:
+ * map1 is shared by prog1, prog2, prog3.
+ * prog1 calls bpf_timer_init for some map1 elements
+ * prog2 calls bpf_timer_set_callback for some map1 elements.
+ * Those that were not bpf_timer_init-ed will return -EINVAL.
+ * prog3 calls bpf_timer_start for some map1 elements.
+ * Those that were not both bpf_timer_init-ed and
+ * bpf_timer_set_callback-ed will return -EINVAL.
+ */
+ struct bpf_insn ld_addrs[2] = {
+ BPF_LD_IMM64(BPF_REG_3, (long)prog->aux),
+ };
+
+ insn_buf[0] = ld_addrs[0];
+ insn_buf[1] = ld_addrs[1];
+ insn_buf[2] = *insn;
+ cnt = 3;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto patch_call_imm;
+ }
+
+ if (is_storage_get_function(insn->imm)) {
+ if (env->insn_aux_data[i + delta].non_sleepable)
+ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_ATOMIC);
+ else
+ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_KERNEL);
+ insn_buf[1] = *insn;
+ cnt = 2;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto patch_call_imm;
+ }
+
+ /* bpf_per_cpu_ptr() and bpf_this_cpu_ptr() */
+ if (env->insn_aux_data[i + delta].call_with_percpu_alloc_ptr) {
+ /* patch with 'r1 = *(u64 *)(r1 + 0)' since for percpu data,
+ * bpf_mem_alloc() returns a ptr to the percpu data ptr.
+ */
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, 0);
+ insn_buf[1] = *insn;
+ cnt = 2;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto patch_call_imm;
+ }
+
+ /* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup
+ * and other inlining handlers are currently limited to 64 bit
+ * only.
+ */
+ if (prog->jit_requested && BITS_PER_LONG == 64 &&
+ (insn->imm == BPF_FUNC_map_lookup_elem ||
+ insn->imm == BPF_FUNC_map_update_elem ||
+ insn->imm == BPF_FUNC_map_delete_elem ||
+ insn->imm == BPF_FUNC_map_push_elem ||
+ insn->imm == BPF_FUNC_map_pop_elem ||
+ insn->imm == BPF_FUNC_map_peek_elem ||
+ insn->imm == BPF_FUNC_redirect_map ||
+ insn->imm == BPF_FUNC_for_each_map_elem ||
+ insn->imm == BPF_FUNC_map_lookup_percpu_elem)) {
+ aux = &env->insn_aux_data[i + delta];
+ if (bpf_map_ptr_poisoned(aux))
+ goto patch_call_imm;
+
+ map_ptr = aux->map_ptr_state.map_ptr;
+ ops = map_ptr->ops;
+ if (insn->imm == BPF_FUNC_map_lookup_elem &&
+ ops->map_gen_lookup) {
+ cnt = ops->map_gen_lookup(map_ptr, insn_buf);
+ if (cnt == -EOPNOTSUPP)
+ goto patch_map_ops_generic;
+ if (cnt <= 0 || cnt >= INSN_BUF_SIZE) {
+ verifier_bug(env, "%d insns generated for map lookup", cnt);
+ return -EFAULT;
+ }
+
+ new_prog = bpf_patch_insn_data(env, i + delta,
+ insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+
+ BUILD_BUG_ON(!__same_type(ops->map_lookup_elem,
+ (void *(*)(struct bpf_map *map, void *key))NULL));
+ BUILD_BUG_ON(!__same_type(ops->map_delete_elem,
+ (long (*)(struct bpf_map *map, void *key))NULL));
+ BUILD_BUG_ON(!__same_type(ops->map_update_elem,
+ (long (*)(struct bpf_map *map, void *key, void *value,
+ u64 flags))NULL));
+ BUILD_BUG_ON(!__same_type(ops->map_push_elem,
+ (long (*)(struct bpf_map *map, void *value,
+ u64 flags))NULL));
+ BUILD_BUG_ON(!__same_type(ops->map_pop_elem,
+ (long (*)(struct bpf_map *map, void *value))NULL));
+ BUILD_BUG_ON(!__same_type(ops->map_peek_elem,
+ (long (*)(struct bpf_map *map, void *value))NULL));
+ BUILD_BUG_ON(!__same_type(ops->map_redirect,
+ (long (*)(struct bpf_map *map, u64 index, u64 flags))NULL));
+ BUILD_BUG_ON(!__same_type(ops->map_for_each_callback,
+ (long (*)(struct bpf_map *map,
+ bpf_callback_t callback_fn,
+ void *callback_ctx,
+ u64 flags))NULL));
+ BUILD_BUG_ON(!__same_type(ops->map_lookup_percpu_elem,
+ (void *(*)(struct bpf_map *map, void *key, u32 cpu))NULL));
+
+patch_map_ops_generic:
+ switch (insn->imm) {
+ case BPF_FUNC_map_lookup_elem:
+ insn->imm = BPF_CALL_IMM(ops->map_lookup_elem);
+ goto next_insn;
+ case BPF_FUNC_map_update_elem:
+ insn->imm = BPF_CALL_IMM(ops->map_update_elem);
+ goto next_insn;
+ case BPF_FUNC_map_delete_elem:
+ insn->imm = BPF_CALL_IMM(ops->map_delete_elem);
+ goto next_insn;
+ case BPF_FUNC_map_push_elem:
+ insn->imm = BPF_CALL_IMM(ops->map_push_elem);
+ goto next_insn;
+ case BPF_FUNC_map_pop_elem:
+ insn->imm = BPF_CALL_IMM(ops->map_pop_elem);
+ goto next_insn;
+ case BPF_FUNC_map_peek_elem:
+ insn->imm = BPF_CALL_IMM(ops->map_peek_elem);
+ goto next_insn;
+ case BPF_FUNC_redirect_map:
+ insn->imm = BPF_CALL_IMM(ops->map_redirect);
+ goto next_insn;
+ case BPF_FUNC_for_each_map_elem:
+ insn->imm = BPF_CALL_IMM(ops->map_for_each_callback);
+ goto next_insn;
+ case BPF_FUNC_map_lookup_percpu_elem:
+ insn->imm = BPF_CALL_IMM(ops->map_lookup_percpu_elem);
+ goto next_insn;
+ }
+
+ goto patch_call_imm;
+ }
+
+ /* Implement bpf_jiffies64 inline. */
+ if (prog->jit_requested && BITS_PER_LONG == 64 &&
+ insn->imm == BPF_FUNC_jiffies64) {
+ struct bpf_insn ld_jiffies_addr[2] = {
+ BPF_LD_IMM64(BPF_REG_0,
+ (unsigned long)&jiffies),
+ };
+
+ insn_buf[0] = ld_jiffies_addr[0];
+ insn_buf[1] = ld_jiffies_addr[1];
+ insn_buf[2] = BPF_LDX_MEM(BPF_DW, BPF_REG_0,
+ BPF_REG_0, 0);
+ cnt = 3;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf,
+ cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+
+#if defined(CONFIG_X86_64) && !defined(CONFIG_UML)
+ /* Implement bpf_get_smp_processor_id() inline. */
+ if (insn->imm == BPF_FUNC_get_smp_processor_id &&
+ verifier_inlines_helper_call(env, insn->imm)) {
+ /* BPF_FUNC_get_smp_processor_id inlining is an
+ * optimization, so if cpu_number is ever
+ * changed in some incompatible and hard to support
+ * way, it's fine to back out this inlining logic
+ */
+#ifdef CONFIG_SMP
+ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, (u32)(unsigned long)&cpu_number);
+ insn_buf[1] = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0);
+ insn_buf[2] = BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 0);
+ cnt = 3;
+#else
+ insn_buf[0] = BPF_ALU32_REG(BPF_XOR, BPF_REG_0, BPF_REG_0);
+ cnt = 1;
+#endif
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+#endif
+ /* Implement bpf_get_func_arg inline. */
+ if (prog_type == BPF_PROG_TYPE_TRACING &&
+ insn->imm == BPF_FUNC_get_func_arg) {
+ /* Load nr_args from ctx - 8 */
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
+ insn_buf[1] = BPF_JMP32_REG(BPF_JGE, BPF_REG_2, BPF_REG_0, 6);
+ insn_buf[2] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 3);
+ insn_buf[3] = BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_1);
+ insn_buf[4] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, 0);
+ insn_buf[5] = BPF_STX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0);
+ insn_buf[6] = BPF_MOV64_IMM(BPF_REG_0, 0);
+ insn_buf[7] = BPF_JMP_A(1);
+ insn_buf[8] = BPF_MOV64_IMM(BPF_REG_0, -EINVAL);
+ cnt = 9;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+
+ /* Implement bpf_get_func_ret inline. */
+ if (prog_type == BPF_PROG_TYPE_TRACING &&
+ insn->imm == BPF_FUNC_get_func_ret) {
+ if (eatype == BPF_TRACE_FEXIT ||
+ eatype == BPF_MODIFY_RETURN) {
+ /* Load nr_args from ctx - 8 */
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
+ insn_buf[1] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3);
+ insn_buf[2] = BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1);
+ insn_buf[3] = BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0);
+ insn_buf[4] = BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_3, 0);
+ insn_buf[5] = BPF_MOV64_IMM(BPF_REG_0, 0);
+ cnt = 6;
+ } else {
+ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, -EOPNOTSUPP);
+ cnt = 1;
+ }
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+
+ /* Implement get_func_arg_cnt inline. */
+ if (prog_type == BPF_PROG_TYPE_TRACING &&
+ insn->imm == BPF_FUNC_get_func_arg_cnt) {
+ /* Load nr_args from ctx - 8 */
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, 1);
+ if (!new_prog)
+ return -ENOMEM;
+
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+
+ /* Implement bpf_get_func_ip inline. */
+ if (prog_type == BPF_PROG_TYPE_TRACING &&
+ insn->imm == BPF_FUNC_get_func_ip) {
+ /* Load IP address from ctx - 16 */
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -16);
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, 1);
+ if (!new_prog)
+ return -ENOMEM;
+
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+
+ /* Implement bpf_get_branch_snapshot inline. */
+ if (IS_ENABLED(CONFIG_PERF_EVENTS) &&
+ prog->jit_requested && BITS_PER_LONG == 64 &&
+ insn->imm == BPF_FUNC_get_branch_snapshot) {
+ /* We are dealing with the following func protos:
+ * u64 bpf_get_branch_snapshot(void *buf, u32 size, u64 flags);
+ * int perf_snapshot_branch_stack(struct perf_branch_entry *entries, u32 cnt);
+ */
+ const u32 br_entry_size = sizeof(struct perf_branch_entry);
+
+ /* struct perf_branch_entry is part of UAPI and is
+ * used as an array element, so extremely unlikely to
+ * ever grow or shrink
+ */
+ BUILD_BUG_ON(br_entry_size != 24);
+
+ /* if (unlikely(flags)) return -EINVAL */
+ insn_buf[0] = BPF_JMP_IMM(BPF_JNE, BPF_REG_3, 0, 7);
+
+ /* Transform size (bytes) into number of entries (cnt = size / 24).
+ * But to avoid expensive division instruction, we implement
+ * divide-by-3 through multiplication, followed by further
+ * division by 8 through 3-bit right shift.
+ * Refer to book "Hacker's Delight, 2nd ed." by Henry S. Warren, Jr.,
+ * p. 227, chapter "Unsigned Division by 3" for details and proofs.
+ *
+ * N / 3 <=> M * N / 2^33, where M = (2^33 + 1) / 3 = 0xaaaaaaab.
+ */
+ insn_buf[1] = BPF_MOV32_IMM(BPF_REG_0, 0xaaaaaaab);
+ insn_buf[2] = BPF_ALU64_REG(BPF_MUL, BPF_REG_2, BPF_REG_0);
+ insn_buf[3] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_2, 36);
+
+ /* call perf_snapshot_branch_stack implementation */
+ insn_buf[4] = BPF_EMIT_CALL(static_call_query(perf_snapshot_branch_stack));
+ /* if (entry_cnt == 0) return -ENOENT */
+ insn_buf[5] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4);
+ /* return entry_cnt * sizeof(struct perf_branch_entry) */
+ insn_buf[6] = BPF_ALU32_IMM(BPF_MUL, BPF_REG_0, br_entry_size);
+ insn_buf[7] = BPF_JMP_A(3);
+ /* return -EINVAL; */
+ insn_buf[8] = BPF_MOV64_IMM(BPF_REG_0, -EINVAL);
+ insn_buf[9] = BPF_JMP_A(1);
+ /* return -ENOENT; */
+ insn_buf[10] = BPF_MOV64_IMM(BPF_REG_0, -ENOENT);
+ cnt = 11;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+
+ /* Implement bpf_kptr_xchg inline */
+ if (prog->jit_requested && BITS_PER_LONG == 64 &&
+ insn->imm == BPF_FUNC_kptr_xchg &&
+ bpf_jit_supports_ptr_xchg()) {
+ insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_2);
+ insn_buf[1] = BPF_ATOMIC_OP(BPF_DW, BPF_XCHG, BPF_REG_1, BPF_REG_0, 0);
+ cnt = 2;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ }
+patch_call_imm:
+ fn = env->ops->get_func_proto(insn->imm, env->prog);
+ /* all functions that have prototype and verifier allowed
+ * programs to call them, must be real in-kernel functions
+ */
+ if (!fn->func) {
+ verifier_bug(env,
+ "not inlined functions %s#%d is missing func",
+ func_id_name(insn->imm), insn->imm);
+ return -EFAULT;
+ }
+ insn->imm = fn->func - __bpf_call_base;
+next_insn:
+ if (subprogs[cur_subprog + 1].start == i + delta + 1) {
+ subprogs[cur_subprog].stack_depth += stack_depth_extra;
+ subprogs[cur_subprog].stack_extra = stack_depth_extra;
+
+ stack_depth = subprogs[cur_subprog].stack_depth;
+ if (stack_depth > MAX_BPF_STACK && !prog->jit_requested) {
+ verbose(env, "stack size %d(extra %d) is too large\n",
+ stack_depth, stack_depth_extra);
+ return -EINVAL;
+ }
+ cur_subprog++;
+ stack_depth = subprogs[cur_subprog].stack_depth;
+ stack_depth_extra = 0;
+ }
+ i++;
+ insn++;
+ }
+
+ env->prog->aux->stack_depth = subprogs[0].stack_depth;
+ for (i = 0; i < env->subprog_cnt; i++) {
+ int delta = bpf_jit_supports_timed_may_goto() ? 2 : 1;
+ int subprog_start = subprogs[i].start;
+ int stack_slots = subprogs[i].stack_extra / 8;
+ int slots = delta, cnt = 0;
+
+ if (!stack_slots)
continue;
+ /* We need two slots in case timed may_goto is supported. */
+ if (stack_slots > slots) {
+ verifier_bug(env, "stack_slots supports may_goto only");
+ return -EFAULT;
}
- /* several new insns need to be inserted. Make room for them */
- insn_cnt += cnt - 1;
- new_prog = bpf_prog_realloc(env->prog,
- bpf_prog_size(insn_cnt),
- GFP_USER);
+ stack_depth = subprogs[i].stack_depth;
+ if (bpf_jit_supports_timed_may_goto()) {
+ insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth,
+ BPF_MAX_TIMED_LOOPS);
+ insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth + 8, 0);
+ } else {
+ /* Add ST insn to subprog prologue to init extra stack */
+ insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth,
+ BPF_MAX_LOOPS);
+ }
+ /* Copy first actual insn to preserve it */
+ insn_buf[cnt++] = env->prog->insnsi[subprog_start];
+
+ new_prog = bpf_patch_insn_data(env, subprog_start, insn_buf, cnt);
if (!new_prog)
return -ENOMEM;
+ env->prog = prog = new_prog;
+ /*
+ * If may_goto is a first insn of a prog there could be a jmp
+ * insn that points to it, hence adjust all such jmps to point
+ * to insn after BPF_ST that inits may_goto count.
+ * Adjustment will succeed because bpf_patch_insn_data() didn't fail.
+ */
+ WARN_ON(adjust_jmp_off(env->prog, subprog_start, delta));
+ }
+
+ /* Since poke tab is now finalized, publish aux to tracker. */
+ for (i = 0; i < prog->aux->size_poke_tab; i++) {
+ map_ptr = prog->aux->poke_tab[i].tail_call.map;
+ if (!map_ptr->ops->map_poke_track ||
+ !map_ptr->ops->map_poke_untrack ||
+ !map_ptr->ops->map_poke_run) {
+ verifier_bug(env, "poke tab is misconfigured");
+ return -EFAULT;
+ }
+
+ ret = map_ptr->ops->map_poke_track(map_ptr, prog->aux);
+ if (ret < 0) {
+ verbose(env, "tracking tail call prog failed\n");
+ return ret;
+ }
+ }
- new_prog->len = insn_cnt;
+ ret = sort_kfunc_descs_by_imm_off(env);
+ if (ret)
+ return ret;
- memmove(new_prog->insnsi + i + cnt, new_prog->insns + i + 1,
- sizeof(*insn) * (insn_cnt - i - cnt));
+ return 0;
+}
- /* copy substitute insns in place of load instruction */
- memcpy(new_prog->insnsi + i, insn_buf, sizeof(*insn) * cnt);
+static struct bpf_prog *inline_bpf_loop(struct bpf_verifier_env *env,
+ int position,
+ s32 stack_base,
+ u32 callback_subprogno,
+ u32 *total_cnt)
+{
+ s32 r6_offset = stack_base + 0 * BPF_REG_SIZE;
+ s32 r7_offset = stack_base + 1 * BPF_REG_SIZE;
+ s32 r8_offset = stack_base + 2 * BPF_REG_SIZE;
+ int reg_loop_max = BPF_REG_6;
+ int reg_loop_cnt = BPF_REG_7;
+ int reg_loop_ctx = BPF_REG_8;
+
+ struct bpf_insn *insn_buf = env->insn_buf;
+ struct bpf_prog *new_prog;
+ u32 callback_start;
+ u32 call_insn_offset;
+ s32 callback_offset;
+ u32 cnt = 0;
- /* adjust branches in the whole program */
- adjust_branches(new_prog, i, cnt - 1);
+ /* This represents an inlined version of bpf_iter.c:bpf_loop,
+ * be careful to modify this code in sync.
+ */
- /* keep walking new program and skip insns we just inserted */
- env->prog = new_prog;
- insn = new_prog->insnsi + i + cnt - 1;
- i += cnt - 1;
+ /* Return error and jump to the end of the patch if
+ * expected number of iterations is too big.
+ */
+ insn_buf[cnt++] = BPF_JMP_IMM(BPF_JLE, BPF_REG_1, BPF_MAX_LOOPS, 2);
+ insn_buf[cnt++] = BPF_MOV32_IMM(BPF_REG_0, -E2BIG);
+ insn_buf[cnt++] = BPF_JMP_IMM(BPF_JA, 0, 0, 16);
+ /* spill R6, R7, R8 to use these as loop vars */
+ insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, r6_offset);
+ insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_7, r7_offset);
+ insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_8, r8_offset);
+ /* initialize loop vars */
+ insn_buf[cnt++] = BPF_MOV64_REG(reg_loop_max, BPF_REG_1);
+ insn_buf[cnt++] = BPF_MOV32_IMM(reg_loop_cnt, 0);
+ insn_buf[cnt++] = BPF_MOV64_REG(reg_loop_ctx, BPF_REG_3);
+ /* loop header,
+ * if reg_loop_cnt >= reg_loop_max skip the loop body
+ */
+ insn_buf[cnt++] = BPF_JMP_REG(BPF_JGE, reg_loop_cnt, reg_loop_max, 5);
+ /* callback call,
+ * correct callback offset would be set after patching
+ */
+ insn_buf[cnt++] = BPF_MOV64_REG(BPF_REG_1, reg_loop_cnt);
+ insn_buf[cnt++] = BPF_MOV64_REG(BPF_REG_2, reg_loop_ctx);
+ insn_buf[cnt++] = BPF_CALL_REL(0);
+ /* increment loop counter */
+ insn_buf[cnt++] = BPF_ALU64_IMM(BPF_ADD, reg_loop_cnt, 1);
+ /* jump to loop header if callback returned 0 */
+ insn_buf[cnt++] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, -6);
+ /* return value of bpf_loop,
+ * set R0 to the number of iterations
+ */
+ insn_buf[cnt++] = BPF_MOV64_REG(BPF_REG_0, reg_loop_cnt);
+ /* restore original values of R6, R7, R8 */
+ insn_buf[cnt++] = BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_10, r6_offset);
+ insn_buf[cnt++] = BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_10, r7_offset);
+ insn_buf[cnt++] = BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_10, r8_offset);
+
+ *total_cnt = cnt;
+ new_prog = bpf_patch_insn_data(env, position, insn_buf, cnt);
+ if (!new_prog)
+ return new_prog;
+
+ /* callback start is known only after patching */
+ callback_start = env->subprog_info[callback_subprogno].start;
+ /* Note: insn_buf[12] is an offset of BPF_CALL_REL instruction */
+ call_insn_offset = position + 12;
+ callback_offset = callback_start - call_insn_offset - 1;
+ new_prog->insnsi[call_insn_offset].imm = callback_offset;
+
+ return new_prog;
+}
+
+static bool is_bpf_loop_call(struct bpf_insn *insn)
+{
+ return insn->code == (BPF_JMP | BPF_CALL) &&
+ insn->src_reg == 0 &&
+ insn->imm == BPF_FUNC_loop;
+}
+
+/* For all sub-programs in the program (including main) check
+ * insn_aux_data to see if there are bpf_loop calls that require
+ * inlining. If such calls are found the calls are replaced with a
+ * sequence of instructions produced by `inline_bpf_loop` function and
+ * subprog stack_depth is increased by the size of 3 registers.
+ * This stack space is used to spill values of the R6, R7, R8. These
+ * registers are used to store the loop bound, counter and context
+ * variables.
+ */
+static int optimize_bpf_loop(struct bpf_verifier_env *env)
+{
+ struct bpf_subprog_info *subprogs = env->subprog_info;
+ int i, cur_subprog = 0, cnt, delta = 0;
+ struct bpf_insn *insn = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+ u16 stack_depth = subprogs[cur_subprog].stack_depth;
+ u16 stack_depth_roundup = round_up(stack_depth, 8) - stack_depth;
+ u16 stack_depth_extra = 0;
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ struct bpf_loop_inline_state *inline_state =
+ &env->insn_aux_data[i + delta].loop_inline_state;
+
+ if (is_bpf_loop_call(insn) && inline_state->fit_for_inline) {
+ struct bpf_prog *new_prog;
+
+ stack_depth_extra = BPF_REG_SIZE * 3 + stack_depth_roundup;
+ new_prog = inline_bpf_loop(env,
+ i + delta,
+ -(stack_depth + stack_depth_extra),
+ inline_state->callback_subprogno,
+ &cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ }
+
+ if (subprogs[cur_subprog + 1].start == i + delta + 1) {
+ subprogs[cur_subprog].stack_depth += stack_depth_extra;
+ cur_subprog++;
+ stack_depth = subprogs[cur_subprog].stack_depth;
+ stack_depth_roundup = round_up(stack_depth, 8) - stack_depth;
+ stack_depth_extra = 0;
+ }
}
+ env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;
+
return 0;
}
-static void free_states(struct verifier_env *env)
+/* Remove unnecessary spill/fill pairs, members of fastcall pattern,
+ * adjust subprograms stack depth when possible.
+ */
+static int remove_fastcall_spills_fills(struct bpf_verifier_env *env)
{
- struct verifier_state_list *sl, *sln;
- int i;
+ struct bpf_subprog_info *subprog = env->subprog_info;
+ struct bpf_insn_aux_data *aux = env->insn_aux_data;
+ struct bpf_insn *insn = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+ u32 spills_num;
+ bool modified = false;
+ int i, j;
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ if (aux[i].fastcall_spills_num > 0) {
+ spills_num = aux[i].fastcall_spills_num;
+ /* NOPs would be removed by opt_remove_nops() */
+ for (j = 1; j <= spills_num; ++j) {
+ *(insn - j) = NOP;
+ *(insn + j) = NOP;
+ }
+ modified = true;
+ }
+ if ((subprog + 1)->start == i + 1) {
+ if (modified && !subprog->keep_fastcall_stack)
+ subprog->stack_depth = -subprog->fastcall_stack_off;
+ subprog++;
+ modified = false;
+ }
+ }
+
+ return 0;
+}
+
+static void free_states(struct bpf_verifier_env *env)
+{
+ struct bpf_verifier_state_list *sl;
+ struct list_head *head, *pos, *tmp;
+ struct bpf_scc_info *info;
+ int i, j;
+
+ free_verifier_state(env->cur_state, true);
+ env->cur_state = NULL;
+ while (!pop_stack(env, NULL, NULL, false));
+
+ list_for_each_safe(pos, tmp, &env->free_list) {
+ sl = container_of(pos, struct bpf_verifier_state_list, node);
+ free_verifier_state(&sl->state, false);
+ kfree(sl);
+ }
+ INIT_LIST_HEAD(&env->free_list);
+
+ for (i = 0; i < env->scc_cnt; ++i) {
+ info = env->scc_info[i];
+ if (!info)
+ continue;
+ for (j = 0; j < info->num_visits; j++)
+ free_backedges(&info->visits[j]);
+ kvfree(info);
+ env->scc_info[i] = NULL;
+ }
if (!env->explored_states)
return;
- for (i = 0; i < env->prog->len; i++) {
- sl = env->explored_states[i];
+ for (i = 0; i < state_htab_size(env); i++) {
+ head = &env->explored_states[i];
- if (sl)
- while (sl != STATE_LIST_MARK) {
- sln = sl->next;
- kfree(sl);
- sl = sln;
+ list_for_each_safe(pos, tmp, head) {
+ sl = container_of(pos, struct bpf_verifier_state_list, node);
+ free_verifier_state(&sl->state, false);
+ kfree(sl);
+ }
+ INIT_LIST_HEAD(&env->explored_states[i]);
+ }
+}
+
+static int do_check_common(struct bpf_verifier_env *env, int subprog)
+{
+ bool pop_log = !(env->log.level & BPF_LOG_LEVEL2);
+ struct bpf_subprog_info *sub = subprog_info(env, subprog);
+ struct bpf_prog_aux *aux = env->prog->aux;
+ struct bpf_verifier_state *state;
+ struct bpf_reg_state *regs;
+ int ret, i;
+
+ env->prev_linfo = NULL;
+ env->pass_cnt++;
+
+ state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL_ACCOUNT);
+ if (!state)
+ return -ENOMEM;
+ state->curframe = 0;
+ state->speculative = false;
+ state->branches = 1;
+ state->in_sleepable = env->prog->sleepable;
+ state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL_ACCOUNT);
+ if (!state->frame[0]) {
+ kfree(state);
+ return -ENOMEM;
+ }
+ env->cur_state = state;
+ init_func_state(env, state->frame[0],
+ BPF_MAIN_FUNC /* callsite */,
+ 0 /* frameno */,
+ subprog);
+ state->first_insn_idx = env->subprog_info[subprog].start;
+ state->last_insn_idx = -1;
+
+ regs = state->frame[state->curframe]->regs;
+ if (subprog || env->prog->type == BPF_PROG_TYPE_EXT) {
+ const char *sub_name = subprog_name(env, subprog);
+ struct bpf_subprog_arg_info *arg;
+ struct bpf_reg_state *reg;
+
+ if (env->log.level & BPF_LOG_LEVEL)
+ verbose(env, "Validating %s() func#%d...\n", sub_name, subprog);
+ ret = btf_prepare_func_args(env, subprog);
+ if (ret)
+ goto out;
+
+ if (subprog_is_exc_cb(env, subprog)) {
+ state->frame[0]->in_exception_callback_fn = true;
+ /* We have already ensured that the callback returns an integer, just
+ * like all global subprogs. We need to determine it only has a single
+ * scalar argument.
+ */
+ if (sub->arg_cnt != 1 || sub->args[0].arg_type != ARG_ANYTHING) {
+ verbose(env, "exception cb only supports single integer argument\n");
+ ret = -EINVAL;
+ goto out;
}
+ }
+ for (i = BPF_REG_1; i <= sub->arg_cnt; i++) {
+ arg = &sub->args[i - BPF_REG_1];
+ reg = &regs[i];
+
+ if (arg->arg_type == ARG_PTR_TO_CTX) {
+ reg->type = PTR_TO_CTX;
+ mark_reg_known_zero(env, regs, i);
+ } else if (arg->arg_type == ARG_ANYTHING) {
+ reg->type = SCALAR_VALUE;
+ mark_reg_unknown(env, regs, i);
+ } else if (arg->arg_type == (ARG_PTR_TO_DYNPTR | MEM_RDONLY)) {
+ /* assume unspecial LOCAL dynptr type */
+ __mark_dynptr_reg(reg, BPF_DYNPTR_TYPE_LOCAL, true, ++env->id_gen);
+ } else if (base_type(arg->arg_type) == ARG_PTR_TO_MEM) {
+ reg->type = PTR_TO_MEM;
+ reg->type |= arg->arg_type &
+ (PTR_MAYBE_NULL | PTR_UNTRUSTED | MEM_RDONLY);
+ mark_reg_known_zero(env, regs, i);
+ reg->mem_size = arg->mem_size;
+ if (arg->arg_type & PTR_MAYBE_NULL)
+ reg->id = ++env->id_gen;
+ } else if (base_type(arg->arg_type) == ARG_PTR_TO_BTF_ID) {
+ reg->type = PTR_TO_BTF_ID;
+ if (arg->arg_type & PTR_MAYBE_NULL)
+ reg->type |= PTR_MAYBE_NULL;
+ if (arg->arg_type & PTR_UNTRUSTED)
+ reg->type |= PTR_UNTRUSTED;
+ if (arg->arg_type & PTR_TRUSTED)
+ reg->type |= PTR_TRUSTED;
+ mark_reg_known_zero(env, regs, i);
+ reg->btf = bpf_get_btf_vmlinux(); /* can't fail at this point */
+ reg->btf_id = arg->btf_id;
+ reg->id = ++env->id_gen;
+ } else if (base_type(arg->arg_type) == ARG_PTR_TO_ARENA) {
+ /* caller can pass either PTR_TO_ARENA or SCALAR */
+ mark_reg_unknown(env, regs, i);
+ } else {
+ verifier_bug(env, "unhandled arg#%d type %d",
+ i - BPF_REG_1, arg->arg_type);
+ ret = -EFAULT;
+ goto out;
+ }
+ }
+ } else {
+ /* if main BPF program has associated BTF info, validate that
+ * it's matching expected signature, and otherwise mark BTF
+ * info for main program as unreliable
+ */
+ if (env->prog->aux->func_info_aux) {
+ ret = btf_prepare_func_args(env, 0);
+ if (ret || sub->arg_cnt != 1 || sub->args[0].arg_type != ARG_PTR_TO_CTX)
+ env->prog->aux->func_info_aux[0].unreliable = true;
+ }
+
+ /* 1st arg to a function */
+ regs[BPF_REG_1].type = PTR_TO_CTX;
+ mark_reg_known_zero(env, regs, BPF_REG_1);
}
- kfree(env->explored_states);
+ /* Acquire references for struct_ops program arguments tagged with "__ref" */
+ if (!subprog && env->prog->type == BPF_PROG_TYPE_STRUCT_OPS) {
+ for (i = 0; i < aux->ctx_arg_info_size; i++)
+ aux->ctx_arg_info[i].ref_obj_id = aux->ctx_arg_info[i].refcounted ?
+ acquire_reference(env, 0) : 0;
+ }
+
+ ret = do_check(env);
+out:
+ if (!ret && pop_log)
+ bpf_vlog_reset(&env->log, 0);
+ free_states(env);
+ return ret;
}
-int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
+/* Lazily verify all global functions based on their BTF, if they are called
+ * from main BPF program or any of subprograms transitively.
+ * BPF global subprogs called from dead code are not validated.
+ * All callable global functions must pass verification.
+ * Otherwise the whole program is rejected.
+ * Consider:
+ * int bar(int);
+ * int foo(int f)
+ * {
+ * return bar(f);
+ * }
+ * int bar(int b)
+ * {
+ * ...
+ * }
+ * foo() will be verified first for R1=any_scalar_value. During verification it
+ * will be assumed that bar() already verified successfully and call to bar()
+ * from foo() will be checked for type match only. Later bar() will be verified
+ * independently to check that it's safe for R1=any_scalar_value.
+ */
+static int do_check_subprogs(struct bpf_verifier_env *env)
{
- char __user *log_ubuf = NULL;
- struct verifier_env *env;
- int ret = -EINVAL;
+ struct bpf_prog_aux *aux = env->prog->aux;
+ struct bpf_func_info_aux *sub_aux;
+ int i, ret, new_cnt;
- if ((*prog)->len <= 0 || (*prog)->len > BPF_MAXINSNS)
- return -E2BIG;
+ if (!aux->func_info)
+ return 0;
+
+ /* exception callback is presumed to be always called */
+ if (env->exception_callback_subprog)
+ subprog_aux(env, env->exception_callback_subprog)->called = true;
+
+again:
+ new_cnt = 0;
+ for (i = 1; i < env->subprog_cnt; i++) {
+ if (!subprog_is_global(env, i))
+ continue;
+
+ sub_aux = subprog_aux(env, i);
+ if (!sub_aux->called || sub_aux->verified)
+ continue;
+
+ env->insn_idx = env->subprog_info[i].start;
+ WARN_ON_ONCE(env->insn_idx == 0);
+ ret = do_check_common(env, i);
+ if (ret) {
+ return ret;
+ } else if (env->log.level & BPF_LOG_LEVEL) {
+ verbose(env, "Func#%d ('%s') is safe for any args that match its prototype\n",
+ i, subprog_name(env, i));
+ }
+
+ /* We verified new global subprog, it might have called some
+ * more global subprogs that we haven't verified yet, so we
+ * need to do another pass over subprogs to verify those.
+ */
+ sub_aux->verified = true;
+ new_cnt++;
+ }
+
+ /* We can't loop forever as we verify at least one global subprog on
+ * each pass.
+ */
+ if (new_cnt)
+ goto again;
+
+ return 0;
+}
+
+static int do_check_main(struct bpf_verifier_env *env)
+{
+ int ret;
+
+ env->insn_idx = 0;
+ ret = do_check_common(env, 0);
+ if (!ret)
+ env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;
+ return ret;
+}
+
+
+static void print_verification_stats(struct bpf_verifier_env *env)
+{
+ int i;
+
+ if (env->log.level & BPF_LOG_STATS) {
+ verbose(env, "verification time %lld usec\n",
+ div_u64(env->verification_time, 1000));
+ verbose(env, "stack depth ");
+ for (i = 0; i < env->subprog_cnt; i++) {
+ u32 depth = env->subprog_info[i].stack_depth;
+
+ verbose(env, "%d", depth);
+ if (i + 1 < env->subprog_cnt)
+ verbose(env, "+");
+ }
+ verbose(env, "\n");
+ }
+ verbose(env, "processed %d insns (limit %d) max_states_per_insn %d "
+ "total_states %d peak_states %d mark_read %d\n",
+ env->insn_processed, BPF_COMPLEXITY_LIMIT_INSNS,
+ env->max_states_per_insn, env->total_states,
+ env->peak_states, env->longest_mark_read_walk);
+}
+
+int bpf_prog_ctx_arg_info_init(struct bpf_prog *prog,
+ const struct bpf_ctx_arg_aux *info, u32 cnt)
+{
+ prog->aux->ctx_arg_info = kmemdup_array(info, cnt, sizeof(*info), GFP_KERNEL_ACCOUNT);
+ prog->aux->ctx_arg_info_size = cnt;
+
+ return prog->aux->ctx_arg_info ? 0 : -ENOMEM;
+}
+
+static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
+{
+ const struct btf_type *t, *func_proto;
+ const struct bpf_struct_ops_desc *st_ops_desc;
+ const struct bpf_struct_ops *st_ops;
+ const struct btf_member *member;
+ struct bpf_prog *prog = env->prog;
+ bool has_refcounted_arg = false;
+ u32 btf_id, member_idx, member_off;
+ struct btf *btf;
+ const char *mname;
+ int i, err;
+
+ if (!prog->gpl_compatible) {
+ verbose(env, "struct ops programs must have a GPL compatible license\n");
+ return -EINVAL;
+ }
+
+ if (!prog->aux->attach_btf_id)
+ return -ENOTSUPP;
+
+ btf = prog->aux->attach_btf;
+ if (btf_is_module(btf)) {
+ /* Make sure st_ops is valid through the lifetime of env */
+ env->attach_btf_mod = btf_try_get_module(btf);
+ if (!env->attach_btf_mod) {
+ verbose(env, "struct_ops module %s is not found\n",
+ btf_get_name(btf));
+ return -ENOTSUPP;
+ }
+ }
+
+ btf_id = prog->aux->attach_btf_id;
+ st_ops_desc = bpf_struct_ops_find(btf, btf_id);
+ if (!st_ops_desc) {
+ verbose(env, "attach_btf_id %u is not a supported struct\n",
+ btf_id);
+ return -ENOTSUPP;
+ }
+ st_ops = st_ops_desc->st_ops;
+
+ t = st_ops_desc->type;
+ member_idx = prog->expected_attach_type;
+ if (member_idx >= btf_type_vlen(t)) {
+ verbose(env, "attach to invalid member idx %u of struct %s\n",
+ member_idx, st_ops->name);
+ return -EINVAL;
+ }
+
+ member = &btf_type_member(t)[member_idx];
+ mname = btf_name_by_offset(btf, member->name_off);
+ func_proto = btf_type_resolve_func_ptr(btf, member->type,
+ NULL);
+ if (!func_proto) {
+ verbose(env, "attach to invalid member %s(@idx %u) of struct %s\n",
+ mname, member_idx, st_ops->name);
+ return -EINVAL;
+ }
+
+ member_off = __btf_member_bit_offset(t, member) / 8;
+ err = bpf_struct_ops_supported(st_ops, member_off);
+ if (err) {
+ verbose(env, "attach to unsupported member %s of struct %s\n",
+ mname, st_ops->name);
+ return err;
+ }
+
+ if (st_ops->check_member) {
+ err = st_ops->check_member(t, member, prog);
+
+ if (err) {
+ verbose(env, "attach to unsupported member %s of struct %s\n",
+ mname, st_ops->name);
+ return err;
+ }
+ }
+
+ if (prog->aux->priv_stack_requested && !bpf_jit_supports_private_stack()) {
+ verbose(env, "Private stack not supported by jit\n");
+ return -EACCES;
+ }
+
+ for (i = 0; i < st_ops_desc->arg_info[member_idx].cnt; i++) {
+ if (st_ops_desc->arg_info[member_idx].info->refcounted) {
+ has_refcounted_arg = true;
+ break;
+ }
+ }
+
+ /* Tail call is not allowed for programs with refcounted arguments since we
+ * cannot guarantee that valid refcounted kptrs will be passed to the callee.
+ */
+ for (i = 0; i < env->subprog_cnt; i++) {
+ if (has_refcounted_arg && env->subprog_info[i].has_tail_call) {
+ verbose(env, "program with __ref argument cannot tail call\n");
+ return -EINVAL;
+ }
+ }
+
+ prog->aux->st_ops = st_ops;
+ prog->aux->attach_st_ops_member_off = member_off;
+
+ prog->aux->attach_func_proto = func_proto;
+ prog->aux->attach_func_name = mname;
+ env->ops = st_ops->verifier_ops;
+
+ return bpf_prog_ctx_arg_info_init(prog, st_ops_desc->arg_info[member_idx].info,
+ st_ops_desc->arg_info[member_idx].cnt);
+}
+#define SECURITY_PREFIX "security_"
+
+static int check_attach_modify_return(unsigned long addr, const char *func_name)
+{
+ if (within_error_injection_list(addr) ||
+ !strncmp(SECURITY_PREFIX, func_name, sizeof(SECURITY_PREFIX) - 1))
+ return 0;
+
+ return -EINVAL;
+}
+
+/* list of non-sleepable functions that are otherwise on
+ * ALLOW_ERROR_INJECTION list
+ */
+BTF_SET_START(btf_non_sleepable_error_inject)
+/* Three functions below can be called from sleepable and non-sleepable context.
+ * Assume non-sleepable from bpf safety point of view.
+ */
+BTF_ID(func, __filemap_add_folio)
+#ifdef CONFIG_FAIL_PAGE_ALLOC
+BTF_ID(func, should_fail_alloc_page)
+#endif
+#ifdef CONFIG_FAILSLAB
+BTF_ID(func, should_failslab)
+#endif
+BTF_SET_END(btf_non_sleepable_error_inject)
+
+static int check_non_sleepable_error_inject(u32 btf_id)
+{
+ return btf_id_set_contains(&btf_non_sleepable_error_inject, btf_id);
+}
+
+int bpf_check_attach_target(struct bpf_verifier_log *log,
+ const struct bpf_prog *prog,
+ const struct bpf_prog *tgt_prog,
+ u32 btf_id,
+ struct bpf_attach_target_info *tgt_info)
+{
+ bool prog_extension = prog->type == BPF_PROG_TYPE_EXT;
+ bool prog_tracing = prog->type == BPF_PROG_TYPE_TRACING;
+ char trace_symbol[KSYM_SYMBOL_LEN];
+ const char prefix[] = "btf_trace_";
+ struct bpf_raw_event_map *btp;
+ int ret = 0, subprog = -1, i;
+ const struct btf_type *t;
+ bool conservative = true;
+ const char *tname, *fname;
+ struct btf *btf;
+ long addr = 0;
+ struct module *mod = NULL;
+
+ if (!btf_id) {
+ bpf_log(log, "Tracing programs must provide btf_id\n");
+ return -EINVAL;
+ }
+ btf = tgt_prog ? tgt_prog->aux->btf : prog->aux->attach_btf;
+ if (!btf) {
+ bpf_log(log,
+ "FENTRY/FEXIT program can only be attached to another program annotated with BTF\n");
+ return -EINVAL;
+ }
+ t = btf_type_by_id(btf, btf_id);
+ if (!t) {
+ bpf_log(log, "attach_btf_id %u is invalid\n", btf_id);
+ return -EINVAL;
+ }
+ tname = btf_name_by_offset(btf, t->name_off);
+ if (!tname) {
+ bpf_log(log, "attach_btf_id %u doesn't have a name\n", btf_id);
+ return -EINVAL;
+ }
+ if (tgt_prog) {
+ struct bpf_prog_aux *aux = tgt_prog->aux;
+ bool tgt_changes_pkt_data;
+ bool tgt_might_sleep;
+
+ if (bpf_prog_is_dev_bound(prog->aux) &&
+ !bpf_prog_dev_bound_match(prog, tgt_prog)) {
+ bpf_log(log, "Target program bound device mismatch");
+ return -EINVAL;
+ }
+
+ for (i = 0; i < aux->func_info_cnt; i++)
+ if (aux->func_info[i].type_id == btf_id) {
+ subprog = i;
+ break;
+ }
+ if (subprog == -1) {
+ bpf_log(log, "Subprog %s doesn't exist\n", tname);
+ return -EINVAL;
+ }
+ if (aux->func && aux->func[subprog]->aux->exception_cb) {
+ bpf_log(log,
+ "%s programs cannot attach to exception callback\n",
+ prog_extension ? "Extension" : "FENTRY/FEXIT");
+ return -EINVAL;
+ }
+ conservative = aux->func_info_aux[subprog].unreliable;
+ if (prog_extension) {
+ if (conservative) {
+ bpf_log(log,
+ "Cannot replace static functions\n");
+ return -EINVAL;
+ }
+ if (!prog->jit_requested) {
+ bpf_log(log,
+ "Extension programs should be JITed\n");
+ return -EINVAL;
+ }
+ tgt_changes_pkt_data = aux->func
+ ? aux->func[subprog]->aux->changes_pkt_data
+ : aux->changes_pkt_data;
+ if (prog->aux->changes_pkt_data && !tgt_changes_pkt_data) {
+ bpf_log(log,
+ "Extension program changes packet data, while original does not\n");
+ return -EINVAL;
+ }
+
+ tgt_might_sleep = aux->func
+ ? aux->func[subprog]->aux->might_sleep
+ : aux->might_sleep;
+ if (prog->aux->might_sleep && !tgt_might_sleep) {
+ bpf_log(log,
+ "Extension program may sleep, while original does not\n");
+ return -EINVAL;
+ }
+ }
+ if (!tgt_prog->jited) {
+ bpf_log(log, "Can attach to only JITed progs\n");
+ return -EINVAL;
+ }
+ if (prog_tracing) {
+ if (aux->attach_tracing_prog) {
+ /*
+ * Target program is an fentry/fexit which is already attached
+ * to another tracing program. More levels of nesting
+ * attachment are not allowed.
+ */
+ bpf_log(log, "Cannot nest tracing program attach more than once\n");
+ return -EINVAL;
+ }
+ } else if (tgt_prog->type == prog->type) {
+ /*
+ * To avoid potential call chain cycles, prevent attaching of a
+ * program extension to another extension. It's ok to attach
+ * fentry/fexit to extension program.
+ */
+ bpf_log(log, "Cannot recursively attach\n");
+ return -EINVAL;
+ }
+ if (tgt_prog->type == BPF_PROG_TYPE_TRACING &&
+ prog_extension &&
+ (tgt_prog->expected_attach_type == BPF_TRACE_FENTRY ||
+ tgt_prog->expected_attach_type == BPF_TRACE_FEXIT)) {
+ /* Program extensions can extend all program types
+ * except fentry/fexit. The reason is the following.
+ * The fentry/fexit programs are used for performance
+ * analysis, stats and can be attached to any program
+ * type. When extension program is replacing XDP function
+ * it is necessary to allow performance analysis of all
+ * functions. Both original XDP program and its program
+ * extension. Hence attaching fentry/fexit to
+ * BPF_PROG_TYPE_EXT is allowed. If extending of
+ * fentry/fexit was allowed it would be possible to create
+ * long call chain fentry->extension->fentry->extension
+ * beyond reasonable stack size. Hence extending fentry
+ * is not allowed.
+ */
+ bpf_log(log, "Cannot extend fentry/fexit\n");
+ return -EINVAL;
+ }
+ } else {
+ if (prog_extension) {
+ bpf_log(log, "Cannot replace kernel functions\n");
+ return -EINVAL;
+ }
+ }
+
+ switch (prog->expected_attach_type) {
+ case BPF_TRACE_RAW_TP:
+ if (tgt_prog) {
+ bpf_log(log,
+ "Only FENTRY/FEXIT progs are attachable to another BPF prog\n");
+ return -EINVAL;
+ }
+ if (!btf_type_is_typedef(t)) {
+ bpf_log(log, "attach_btf_id %u is not a typedef\n",
+ btf_id);
+ return -EINVAL;
+ }
+ if (strncmp(prefix, tname, sizeof(prefix) - 1)) {
+ bpf_log(log, "attach_btf_id %u points to wrong type name %s\n",
+ btf_id, tname);
+ return -EINVAL;
+ }
+ tname += sizeof(prefix) - 1;
+
+ /* The func_proto of "btf_trace_##tname" is generated from typedef without argument
+ * names. Thus using bpf_raw_event_map to get argument names.
+ */
+ btp = bpf_get_raw_tracepoint(tname);
+ if (!btp)
+ return -EINVAL;
+ fname = kallsyms_lookup((unsigned long)btp->bpf_func, NULL, NULL, NULL,
+ trace_symbol);
+ bpf_put_raw_tracepoint(btp);
+
+ if (fname)
+ ret = btf_find_by_name_kind(btf, fname, BTF_KIND_FUNC);
+
+ if (!fname || ret < 0) {
+ bpf_log(log, "Cannot find btf of tracepoint template, fall back to %s%s.\n",
+ prefix, tname);
+ t = btf_type_by_id(btf, t->type);
+ if (!btf_type_is_ptr(t))
+ /* should never happen in valid vmlinux build */
+ return -EINVAL;
+ } else {
+ t = btf_type_by_id(btf, ret);
+ if (!btf_type_is_func(t))
+ /* should never happen in valid vmlinux build */
+ return -EINVAL;
+ }
+
+ t = btf_type_by_id(btf, t->type);
+ if (!btf_type_is_func_proto(t))
+ /* should never happen in valid vmlinux build */
+ return -EINVAL;
+
+ break;
+ case BPF_TRACE_ITER:
+ if (!btf_type_is_func(t)) {
+ bpf_log(log, "attach_btf_id %u is not a function\n",
+ btf_id);
+ return -EINVAL;
+ }
+ t = btf_type_by_id(btf, t->type);
+ if (!btf_type_is_func_proto(t))
+ return -EINVAL;
+ ret = btf_distill_func_proto(log, btf, t, tname, &tgt_info->fmodel);
+ if (ret)
+ return ret;
+ break;
+ default:
+ if (!prog_extension)
+ return -EINVAL;
+ fallthrough;
+ case BPF_MODIFY_RETURN:
+ case BPF_LSM_MAC:
+ case BPF_LSM_CGROUP:
+ case BPF_TRACE_FENTRY:
+ case BPF_TRACE_FEXIT:
+ if (!btf_type_is_func(t)) {
+ bpf_log(log, "attach_btf_id %u is not a function\n",
+ btf_id);
+ return -EINVAL;
+ }
+ if (prog_extension &&
+ btf_check_type_match(log, prog, btf, t))
+ return -EINVAL;
+ t = btf_type_by_id(btf, t->type);
+ if (!btf_type_is_func_proto(t))
+ return -EINVAL;
+
+ if ((prog->aux->saved_dst_prog_type || prog->aux->saved_dst_attach_type) &&
+ (!tgt_prog || prog->aux->saved_dst_prog_type != tgt_prog->type ||
+ prog->aux->saved_dst_attach_type != tgt_prog->expected_attach_type))
+ return -EINVAL;
+
+ if (tgt_prog && conservative)
+ t = NULL;
+
+ ret = btf_distill_func_proto(log, btf, t, tname, &tgt_info->fmodel);
+ if (ret < 0)
+ return ret;
+
+ if (tgt_prog) {
+ if (subprog == 0)
+ addr = (long) tgt_prog->bpf_func;
+ else
+ addr = (long) tgt_prog->aux->func[subprog]->bpf_func;
+ } else {
+ if (btf_is_module(btf)) {
+ mod = btf_try_get_module(btf);
+ if (mod)
+ addr = find_kallsyms_symbol_value(mod, tname);
+ else
+ addr = 0;
+ } else {
+ addr = kallsyms_lookup_name(tname);
+ }
+ if (!addr) {
+ module_put(mod);
+ bpf_log(log,
+ "The address of function %s cannot be found\n",
+ tname);
+ return -ENOENT;
+ }
+ }
+
+ if (prog->sleepable) {
+ ret = -EINVAL;
+ switch (prog->type) {
+ case BPF_PROG_TYPE_TRACING:
+
+ /* fentry/fexit/fmod_ret progs can be sleepable if they are
+ * attached to ALLOW_ERROR_INJECTION and are not in denylist.
+ */
+ if (!check_non_sleepable_error_inject(btf_id) &&
+ within_error_injection_list(addr))
+ ret = 0;
+ /* fentry/fexit/fmod_ret progs can also be sleepable if they are
+ * in the fmodret id set with the KF_SLEEPABLE flag.
+ */
+ else {
+ u32 *flags = btf_kfunc_is_modify_return(btf, btf_id,
+ prog);
+
+ if (flags && (*flags & KF_SLEEPABLE))
+ ret = 0;
+ }
+ break;
+ case BPF_PROG_TYPE_LSM:
+ /* LSM progs check that they are attached to bpf_lsm_*() funcs.
+ * Only some of them are sleepable.
+ */
+ if (bpf_lsm_is_sleepable_hook(btf_id))
+ ret = 0;
+ break;
+ default:
+ break;
+ }
+ if (ret) {
+ module_put(mod);
+ bpf_log(log, "%s is not sleepable\n", tname);
+ return ret;
+ }
+ } else if (prog->expected_attach_type == BPF_MODIFY_RETURN) {
+ if (tgt_prog) {
+ module_put(mod);
+ bpf_log(log, "can't modify return codes of BPF programs\n");
+ return -EINVAL;
+ }
+ ret = -EINVAL;
+ if (btf_kfunc_is_modify_return(btf, btf_id, prog) ||
+ !check_attach_modify_return(addr, tname))
+ ret = 0;
+ if (ret) {
+ module_put(mod);
+ bpf_log(log, "%s() is not modifiable\n", tname);
+ return ret;
+ }
+ }
+
+ break;
+ }
+ tgt_info->tgt_addr = addr;
+ tgt_info->tgt_name = tname;
+ tgt_info->tgt_type = t;
+ tgt_info->tgt_mod = mod;
+ return 0;
+}
+
+BTF_SET_START(btf_id_deny)
+BTF_ID_UNUSED
+#ifdef CONFIG_SMP
+BTF_ID(func, ___migrate_enable)
+BTF_ID(func, migrate_disable)
+BTF_ID(func, migrate_enable)
+#endif
+#if !defined CONFIG_PREEMPT_RCU && !defined CONFIG_TINY_RCU
+BTF_ID(func, rcu_read_unlock_strict)
+#endif
+#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_TRACE_PREEMPT_TOGGLE)
+BTF_ID(func, preempt_count_add)
+BTF_ID(func, preempt_count_sub)
+#endif
+#ifdef CONFIG_PREEMPT_RCU
+BTF_ID(func, __rcu_read_lock)
+BTF_ID(func, __rcu_read_unlock)
+#endif
+BTF_SET_END(btf_id_deny)
+
+/* fexit and fmod_ret can't be used to attach to __noreturn functions.
+ * Currently, we must manually list all __noreturn functions here. Once a more
+ * robust solution is implemented, this workaround can be removed.
+ */
+BTF_SET_START(noreturn_deny)
+#ifdef CONFIG_IA32_EMULATION
+BTF_ID(func, __ia32_sys_exit)
+BTF_ID(func, __ia32_sys_exit_group)
+#endif
+#ifdef CONFIG_KUNIT
+BTF_ID(func, __kunit_abort)
+BTF_ID(func, kunit_try_catch_throw)
+#endif
+#ifdef CONFIG_MODULES
+BTF_ID(func, __module_put_and_kthread_exit)
+#endif
+#ifdef CONFIG_X86_64
+BTF_ID(func, __x64_sys_exit)
+BTF_ID(func, __x64_sys_exit_group)
+#endif
+BTF_ID(func, do_exit)
+BTF_ID(func, do_group_exit)
+BTF_ID(func, kthread_complete_and_exit)
+BTF_ID(func, kthread_exit)
+BTF_ID(func, make_task_dead)
+BTF_SET_END(noreturn_deny)
+
+static bool can_be_sleepable(struct bpf_prog *prog)
+{
+ if (prog->type == BPF_PROG_TYPE_TRACING) {
+ switch (prog->expected_attach_type) {
+ case BPF_TRACE_FENTRY:
+ case BPF_TRACE_FEXIT:
+ case BPF_MODIFY_RETURN:
+ case BPF_TRACE_ITER:
+ return true;
+ default:
+ return false;
+ }
+ }
+ return prog->type == BPF_PROG_TYPE_LSM ||
+ prog->type == BPF_PROG_TYPE_KPROBE /* only for uprobes */ ||
+ prog->type == BPF_PROG_TYPE_STRUCT_OPS;
+}
+
+static int check_attach_btf_id(struct bpf_verifier_env *env)
+{
+ struct bpf_prog *prog = env->prog;
+ struct bpf_prog *tgt_prog = prog->aux->dst_prog;
+ struct bpf_attach_target_info tgt_info = {};
+ u32 btf_id = prog->aux->attach_btf_id;
+ struct bpf_trampoline *tr;
+ int ret;
+ u64 key;
+
+ if (prog->type == BPF_PROG_TYPE_SYSCALL) {
+ if (prog->sleepable)
+ /* attach_btf_id checked to be zero already */
+ return 0;
+ verbose(env, "Syscall programs can only be sleepable\n");
+ return -EINVAL;
+ }
+
+ if (prog->sleepable && !can_be_sleepable(prog)) {
+ verbose(env, "Only fentry/fexit/fmod_ret, lsm, iter, uprobe, and struct_ops programs can be sleepable\n");
+ return -EINVAL;
+ }
+
+ if (prog->type == BPF_PROG_TYPE_STRUCT_OPS)
+ return check_struct_ops_btf_id(env);
+
+ if (prog->type != BPF_PROG_TYPE_TRACING &&
+ prog->type != BPF_PROG_TYPE_LSM &&
+ prog->type != BPF_PROG_TYPE_EXT)
+ return 0;
+
+ ret = bpf_check_attach_target(&env->log, prog, tgt_prog, btf_id, &tgt_info);
+ if (ret)
+ return ret;
+
+ if (tgt_prog && prog->type == BPF_PROG_TYPE_EXT) {
+ /* to make freplace equivalent to their targets, they need to
+ * inherit env->ops and expected_attach_type for the rest of the
+ * verification
+ */
+ env->ops = bpf_verifier_ops[tgt_prog->type];
+ prog->expected_attach_type = tgt_prog->expected_attach_type;
+ }
+
+ /* store info about the attachment target that will be used later */
+ prog->aux->attach_func_proto = tgt_info.tgt_type;
+ prog->aux->attach_func_name = tgt_info.tgt_name;
+ prog->aux->mod = tgt_info.tgt_mod;
+
+ if (tgt_prog) {
+ prog->aux->saved_dst_prog_type = tgt_prog->type;
+ prog->aux->saved_dst_attach_type = tgt_prog->expected_attach_type;
+ }
+
+ if (prog->expected_attach_type == BPF_TRACE_RAW_TP) {
+ prog->aux->attach_btf_trace = true;
+ return 0;
+ } else if (prog->expected_attach_type == BPF_TRACE_ITER) {
+ return bpf_iter_prog_supported(prog);
+ }
+
+ if (prog->type == BPF_PROG_TYPE_LSM) {
+ ret = bpf_lsm_verify_prog(&env->log, prog);
+ if (ret < 0)
+ return ret;
+ } else if (prog->type == BPF_PROG_TYPE_TRACING &&
+ btf_id_set_contains(&btf_id_deny, btf_id)) {
+ verbose(env, "Attaching tracing programs to function '%s' is rejected.\n",
+ tgt_info.tgt_name);
+ return -EINVAL;
+ } else if ((prog->expected_attach_type == BPF_TRACE_FEXIT ||
+ prog->expected_attach_type == BPF_MODIFY_RETURN) &&
+ btf_id_set_contains(&noreturn_deny, btf_id)) {
+ verbose(env, "Attaching fexit/fmod_ret to __noreturn function '%s' is rejected.\n",
+ tgt_info.tgt_name);
+ return -EINVAL;
+ }
+
+ key = bpf_trampoline_compute_key(tgt_prog, prog->aux->attach_btf, btf_id);
+ tr = bpf_trampoline_get(key, &tgt_info);
+ if (!tr)
+ return -ENOMEM;
+
+ if (tgt_prog && tgt_prog->aux->tail_call_reachable)
+ tr->flags = BPF_TRAMP_F_TAIL_CALL_CTX;
+
+ prog->aux->dst_trampoline = tr;
+ return 0;
+}
+
+struct btf *bpf_get_btf_vmlinux(void)
+{
+ if (!btf_vmlinux && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) {
+ mutex_lock(&bpf_verifier_lock);
+ if (!btf_vmlinux)
+ btf_vmlinux = btf_parse_vmlinux();
+ mutex_unlock(&bpf_verifier_lock);
+ }
+ return btf_vmlinux;
+}
+
+/*
+ * The add_fd_from_fd_array() is executed only if fd_array_cnt is non-zero. In
+ * this case expect that every file descriptor in the array is either a map or
+ * a BTF. Everything else is considered to be trash.
+ */
+static int add_fd_from_fd_array(struct bpf_verifier_env *env, int fd)
+{
+ struct bpf_map *map;
+ struct btf *btf;
+ CLASS(fd, f)(fd);
+ int err;
+
+ map = __bpf_map_get(f);
+ if (!IS_ERR(map)) {
+ err = __add_used_map(env, map);
+ if (err < 0)
+ return err;
+ return 0;
+ }
+
+ btf = __btf_get_by_fd(f);
+ if (!IS_ERR(btf)) {
+ err = __add_used_btf(env, btf);
+ if (err < 0)
+ return err;
+ return 0;
+ }
+
+ verbose(env, "fd %d is not pointing to valid bpf_map or btf\n", fd);
+ return PTR_ERR(map);
+}
+
+static int process_fd_array(struct bpf_verifier_env *env, union bpf_attr *attr, bpfptr_t uattr)
+{
+ size_t size = sizeof(int);
+ int ret;
+ int fd;
+ u32 i;
+
+ env->fd_array = make_bpfptr(attr->fd_array, uattr.is_kernel);
+
+ /*
+ * The only difference between old (no fd_array_cnt is given) and new
+ * APIs is that in the latter case the fd_array is expected to be
+ * continuous and is scanned for map fds right away
+ */
+ if (!attr->fd_array_cnt)
+ return 0;
+
+ /* Check for integer overflow */
+ if (attr->fd_array_cnt >= (U32_MAX / size)) {
+ verbose(env, "fd_array_cnt is too big (%u)\n", attr->fd_array_cnt);
+ return -EINVAL;
+ }
+
+ for (i = 0; i < attr->fd_array_cnt; i++) {
+ if (copy_from_bpfptr_offset(&fd, env->fd_array, i * size, size))
+ return -EFAULT;
+
+ ret = add_fd_from_fd_array(env, fd);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+/* Each field is a register bitmask */
+struct insn_live_regs {
+ u16 use; /* registers read by instruction */
+ u16 def; /* registers written by instruction */
+ u16 in; /* registers that may be alive before instruction */
+ u16 out; /* registers that may be alive after instruction */
+};
+
+/* Bitmask with 1s for all caller saved registers */
+#define ALL_CALLER_SAVED_REGS ((1u << CALLER_SAVED_REGS) - 1)
+
+/* Compute info->{use,def} fields for the instruction */
+static void compute_insn_live_regs(struct bpf_verifier_env *env,
+ struct bpf_insn *insn,
+ struct insn_live_regs *info)
+{
+ struct call_summary cs;
+ u8 class = BPF_CLASS(insn->code);
+ u8 code = BPF_OP(insn->code);
+ u8 mode = BPF_MODE(insn->code);
+ u16 src = BIT(insn->src_reg);
+ u16 dst = BIT(insn->dst_reg);
+ u16 r0 = BIT(0);
+ u16 def = 0;
+ u16 use = 0xffff;
+
+ switch (class) {
+ case BPF_LD:
+ switch (mode) {
+ case BPF_IMM:
+ if (BPF_SIZE(insn->code) == BPF_DW) {
+ def = dst;
+ use = 0;
+ }
+ break;
+ case BPF_LD | BPF_ABS:
+ case BPF_LD | BPF_IND:
+ /* stick with defaults */
+ break;
+ }
+ break;
+ case BPF_LDX:
+ switch (mode) {
+ case BPF_MEM:
+ case BPF_MEMSX:
+ def = dst;
+ use = src;
+ break;
+ }
+ break;
+ case BPF_ST:
+ switch (mode) {
+ case BPF_MEM:
+ def = 0;
+ use = dst;
+ break;
+ }
+ break;
+ case BPF_STX:
+ switch (mode) {
+ case BPF_MEM:
+ def = 0;
+ use = dst | src;
+ break;
+ case BPF_ATOMIC:
+ switch (insn->imm) {
+ case BPF_CMPXCHG:
+ use = r0 | dst | src;
+ def = r0;
+ break;
+ case BPF_LOAD_ACQ:
+ def = dst;
+ use = src;
+ break;
+ case BPF_STORE_REL:
+ def = 0;
+ use = dst | src;
+ break;
+ default:
+ use = dst | src;
+ if (insn->imm & BPF_FETCH)
+ def = src;
+ else
+ def = 0;
+ }
+ break;
+ }
+ break;
+ case BPF_ALU:
+ case BPF_ALU64:
+ switch (code) {
+ case BPF_END:
+ use = dst;
+ def = dst;
+ break;
+ case BPF_MOV:
+ def = dst;
+ if (BPF_SRC(insn->code) == BPF_K)
+ use = 0;
+ else
+ use = src;
+ break;
+ default:
+ def = dst;
+ if (BPF_SRC(insn->code) == BPF_K)
+ use = dst;
+ else
+ use = dst | src;
+ }
+ break;
+ case BPF_JMP:
+ case BPF_JMP32:
+ switch (code) {
+ case BPF_JA:
+ case BPF_JCOND:
+ def = 0;
+ use = 0;
+ break;
+ case BPF_EXIT:
+ def = 0;
+ use = r0;
+ break;
+ case BPF_CALL:
+ def = ALL_CALLER_SAVED_REGS;
+ use = def & ~BIT(BPF_REG_0);
+ if (get_call_summary(env, insn, &cs))
+ use = GENMASK(cs.num_params, 1);
+ break;
+ default:
+ def = 0;
+ if (BPF_SRC(insn->code) == BPF_K)
+ use = dst;
+ else
+ use = dst | src;
+ }
+ break;
+ }
+
+ info->def = def;
+ info->use = use;
+}
+
+/* Compute may-live registers after each instruction in the program.
+ * The register is live after the instruction I if it is read by some
+ * instruction S following I during program execution and is not
+ * overwritten between I and S.
+ *
+ * Store result in env->insn_aux_data[i].live_regs.
+ */
+static int compute_live_registers(struct bpf_verifier_env *env)
+{
+ struct bpf_insn_aux_data *insn_aux = env->insn_aux_data;
+ struct bpf_insn *insns = env->prog->insnsi;
+ struct insn_live_regs *state;
+ int insn_cnt = env->prog->len;
+ int err = 0, i, j;
+ bool changed;
+
+ /* Use the following algorithm:
+ * - define the following:
+ * - I.use : a set of all registers read by instruction I;
+ * - I.def : a set of all registers written by instruction I;
+ * - I.in : a set of all registers that may be alive before I execution;
+ * - I.out : a set of all registers that may be alive after I execution;
+ * - insn_successors(I): a set of instructions S that might immediately
+ * follow I for some program execution;
+ * - associate separate empty sets 'I.in' and 'I.out' with each instruction;
+ * - visit each instruction in a postorder and update
+ * state[i].in, state[i].out as follows:
+ *
+ * state[i].out = U [state[s].in for S in insn_successors(i)]
+ * state[i].in = (state[i].out / state[i].def) U state[i].use
+ *
+ * (where U stands for set union, / stands for set difference)
+ * - repeat the computation while {in,out} fields changes for
+ * any instruction.
+ */
+ state = kvcalloc(insn_cnt, sizeof(*state), GFP_KERNEL_ACCOUNT);
+ if (!state) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ for (i = 0; i < insn_cnt; ++i)
+ compute_insn_live_regs(env, &insns[i], &state[i]);
+
+ changed = true;
+ while (changed) {
+ changed = false;
+ for (i = 0; i < env->cfg.cur_postorder; ++i) {
+ int insn_idx = env->cfg.insn_postorder[i];
+ struct insn_live_regs *live = &state[insn_idx];
+ struct bpf_iarray *succ;
+ u16 new_out = 0;
+ u16 new_in = 0;
+
+ succ = bpf_insn_successors(env, insn_idx);
+ for (int s = 0; s < succ->cnt; ++s)
+ new_out |= state[succ->items[s]].in;
+ new_in = (new_out & ~live->def) | live->use;
+ if (new_out != live->out || new_in != live->in) {
+ live->in = new_in;
+ live->out = new_out;
+ changed = true;
+ }
+ }
+ }
+
+ for (i = 0; i < insn_cnt; ++i)
+ insn_aux[i].live_regs_before = state[i].in;
+
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ verbose(env, "Live regs before insn:\n");
+ for (i = 0; i < insn_cnt; ++i) {
+ if (env->insn_aux_data[i].scc)
+ verbose(env, "%3d ", env->insn_aux_data[i].scc);
+ else
+ verbose(env, " ");
+ verbose(env, "%3d: ", i);
+ for (j = BPF_REG_0; j < BPF_REG_10; ++j)
+ if (insn_aux[i].live_regs_before & BIT(j))
+ verbose(env, "%d", j);
+ else
+ verbose(env, ".");
+ verbose(env, " ");
+ verbose_insn(env, &insns[i]);
+ if (bpf_is_ldimm64(&insns[i]))
+ i++;
+ }
+ }
+
+out:
+ kvfree(state);
+ return err;
+}
+
+/*
+ * Compute strongly connected components (SCCs) on the CFG.
+ * Assign an SCC number to each instruction, recorded in env->insn_aux[*].scc.
+ * If instruction is a sole member of its SCC and there are no self edges,
+ * assign it SCC number of zero.
+ * Uses a non-recursive adaptation of Tarjan's algorithm for SCC computation.
+ */
+static int compute_scc(struct bpf_verifier_env *env)
+{
+ const u32 NOT_ON_STACK = U32_MAX;
+
+ struct bpf_insn_aux_data *aux = env->insn_aux_data;
+ const u32 insn_cnt = env->prog->len;
+ int stack_sz, dfs_sz, err = 0;
+ u32 *stack, *pre, *low, *dfs;
+ u32 i, j, t, w;
+ u32 next_preorder_num;
+ u32 next_scc_id;
+ bool assign_scc;
+ struct bpf_iarray *succ;
+
+ next_preorder_num = 1;
+ next_scc_id = 1;
+ /*
+ * - 'stack' accumulates vertices in DFS order, see invariant comment below;
+ * - 'pre[t] == p' => preorder number of vertex 't' is 'p';
+ * - 'low[t] == n' => smallest preorder number of the vertex reachable from 't' is 'n';
+ * - 'dfs' DFS traversal stack, used to emulate explicit recursion.
+ */
+ stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT);
+ pre = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT);
+ low = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT);
+ dfs = kvcalloc(insn_cnt, sizeof(*dfs), GFP_KERNEL_ACCOUNT);
+ if (!stack || !pre || !low || !dfs) {
+ err = -ENOMEM;
+ goto exit;
+ }
+ /*
+ * References:
+ * [1] R. Tarjan "Depth-First Search and Linear Graph Algorithms"
+ * [2] D. J. Pearce "A Space-Efficient Algorithm for Finding Strongly Connected Components"
+ *
+ * The algorithm maintains the following invariant:
+ * - suppose there is a path 'u' ~> 'v', such that 'pre[v] < pre[u]';
+ * - then, vertex 'u' remains on stack while vertex 'v' is on stack.
+ *
+ * Consequently:
+ * - If 'low[v] < pre[v]', there is a path from 'v' to some vertex 'u',
+ * such that 'pre[u] == low[v]'; vertex 'u' is currently on the stack,
+ * and thus there is an SCC (loop) containing both 'u' and 'v'.
+ * - If 'low[v] == pre[v]', loops containing 'v' have been explored,
+ * and 'v' can be considered the root of some SCC.
+ *
+ * Here is a pseudo-code for an explicitly recursive version of the algorithm:
+ *
+ * NOT_ON_STACK = insn_cnt + 1
+ * pre = [0] * insn_cnt
+ * low = [0] * insn_cnt
+ * scc = [0] * insn_cnt
+ * stack = []
+ *
+ * next_preorder_num = 1
+ * next_scc_id = 1
+ *
+ * def recur(w):
+ * nonlocal next_preorder_num
+ * nonlocal next_scc_id
+ *
+ * pre[w] = next_preorder_num
+ * low[w] = next_preorder_num
+ * next_preorder_num += 1
+ * stack.append(w)
+ * for s in successors(w):
+ * # Note: for classic algorithm the block below should look as:
+ * #
+ * # if pre[s] == 0:
+ * # recur(s)
+ * # low[w] = min(low[w], low[s])
+ * # elif low[s] != NOT_ON_STACK:
+ * # low[w] = min(low[w], pre[s])
+ * #
+ * # But replacing both 'min' instructions with 'low[w] = min(low[w], low[s])'
+ * # does not break the invariant and makes itartive version of the algorithm
+ * # simpler. See 'Algorithm #3' from [2].
+ *
+ * # 's' not yet visited
+ * if pre[s] == 0:
+ * recur(s)
+ * # if 's' is on stack, pick lowest reachable preorder number from it;
+ * # if 's' is not on stack 'low[s] == NOT_ON_STACK > low[w]',
+ * # so 'min' would be a noop.
+ * low[w] = min(low[w], low[s])
+ *
+ * if low[w] == pre[w]:
+ * # 'w' is the root of an SCC, pop all vertices
+ * # below 'w' on stack and assign same SCC to them.
+ * while True:
+ * t = stack.pop()
+ * low[t] = NOT_ON_STACK
+ * scc[t] = next_scc_id
+ * if t == w:
+ * break
+ * next_scc_id += 1
+ *
+ * for i in range(0, insn_cnt):
+ * if pre[i] == 0:
+ * recur(i)
+ *
+ * Below implementation replaces explicit recursion with array 'dfs'.
+ */
+ for (i = 0; i < insn_cnt; i++) {
+ if (pre[i])
+ continue;
+ stack_sz = 0;
+ dfs_sz = 1;
+ dfs[0] = i;
+dfs_continue:
+ while (dfs_sz) {
+ w = dfs[dfs_sz - 1];
+ if (pre[w] == 0) {
+ low[w] = next_preorder_num;
+ pre[w] = next_preorder_num;
+ next_preorder_num++;
+ stack[stack_sz++] = w;
+ }
+ /* Visit 'w' successors */
+ succ = bpf_insn_successors(env, w);
+ for (j = 0; j < succ->cnt; ++j) {
+ if (pre[succ->items[j]]) {
+ low[w] = min(low[w], low[succ->items[j]]);
+ } else {
+ dfs[dfs_sz++] = succ->items[j];
+ goto dfs_continue;
+ }
+ }
+ /*
+ * Preserve the invariant: if some vertex above in the stack
+ * is reachable from 'w', keep 'w' on the stack.
+ */
+ if (low[w] < pre[w]) {
+ dfs_sz--;
+ goto dfs_continue;
+ }
+ /*
+ * Assign SCC number only if component has two or more elements,
+ * or if component has a self reference.
+ */
+ assign_scc = stack[stack_sz - 1] != w;
+ for (j = 0; j < succ->cnt; ++j) {
+ if (succ->items[j] == w) {
+ assign_scc = true;
+ break;
+ }
+ }
+ /* Pop component elements from stack */
+ do {
+ t = stack[--stack_sz];
+ low[t] = NOT_ON_STACK;
+ if (assign_scc)
+ aux[t].scc = next_scc_id;
+ } while (t != w);
+ if (assign_scc)
+ next_scc_id++;
+ dfs_sz--;
+ }
+ }
+ env->scc_info = kvcalloc(next_scc_id, sizeof(*env->scc_info), GFP_KERNEL_ACCOUNT);
+ if (!env->scc_info) {
+ err = -ENOMEM;
+ goto exit;
+ }
+ env->scc_cnt = next_scc_id;
+exit:
+ kvfree(stack);
+ kvfree(pre);
+ kvfree(low);
+ kvfree(dfs);
+ return err;
+}
- /* 'struct verifier_env' can be global, but since it's not small,
+int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size)
+{
+ u64 start_time = ktime_get_ns();
+ struct bpf_verifier_env *env;
+ int i, len, ret = -EINVAL, err;
+ u32 log_true_size;
+ bool is_priv;
+
+ BTF_TYPE_EMIT(enum bpf_features);
+
+ /* no program is valid */
+ if (ARRAY_SIZE(bpf_verifier_ops) == 0)
+ return -EINVAL;
+
+ /* 'struct bpf_verifier_env' can be global, but since it's not small,
* allocate/free it every time bpf_check() is called
*/
- env = kzalloc(sizeof(struct verifier_env), GFP_KERNEL);
+ env = kvzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL_ACCOUNT);
if (!env)
return -ENOMEM;
+ env->bt.env = env;
+
+ len = (*prog)->len;
+ env->insn_aux_data =
+ vzalloc(array_size(sizeof(struct bpf_insn_aux_data), len));
+ ret = -ENOMEM;
+ if (!env->insn_aux_data)
+ goto err_free_env;
+ for (i = 0; i < len; i++)
+ env->insn_aux_data[i].orig_idx = i;
+ env->succ = iarray_realloc(NULL, 2);
+ if (!env->succ)
+ goto err_free_env;
env->prog = *prog;
+ env->ops = bpf_verifier_ops[env->prog->type];
+
+ env->allow_ptr_leaks = bpf_allow_ptr_leaks(env->prog->aux->token);
+ env->allow_uninit_stack = bpf_allow_uninit_stack(env->prog->aux->token);
+ env->bypass_spec_v1 = bpf_bypass_spec_v1(env->prog->aux->token);
+ env->bypass_spec_v4 = bpf_bypass_spec_v4(env->prog->aux->token);
+ env->bpf_capable = is_priv = bpf_token_capable(env->prog->aux->token, CAP_BPF);
+
+ bpf_get_btf_vmlinux();
/* grab the mutex to protect few globals used by verifier */
- mutex_lock(&bpf_verifier_lock);
+ if (!is_priv)
+ mutex_lock(&bpf_verifier_lock);
- if (attr->log_level || attr->log_buf || attr->log_size) {
- /* user requested verbose verifier output
- * and supplied buffer to store the verification trace
- */
- log_level = attr->log_level;
- log_ubuf = (char __user *) (unsigned long) attr->log_buf;
- log_size = attr->log_size;
- log_len = 0;
+ /* user could have requested verbose verifier output
+ * and supplied buffer to store the verification trace
+ */
+ ret = bpf_vlog_init(&env->log, attr->log_level,
+ (char __user *) (unsigned long) attr->log_buf,
+ attr->log_size);
+ if (ret)
+ goto err_unlock;
+
+ ret = process_fd_array(env, attr, uattr);
+ if (ret)
+ goto skip_full_check;
- ret = -EINVAL;
- /* log_* values have to be sane */
- if (log_size < 128 || log_size > UINT_MAX >> 8 ||
- log_level == 0 || log_ubuf == NULL)
- goto free_env;
-
- ret = -ENOMEM;
- log_buf = vmalloc(log_size);
- if (!log_buf)
- goto free_env;
- } else {
- log_level = 0;
- }
+ mark_verifier_state_clean(env);
- ret = replace_map_fd_with_map_ptr(env);
- if (ret < 0)
+ if (IS_ERR(btf_vmlinux)) {
+ /* Either gcc or pahole or kernel are broken. */
+ verbose(env, "in-kernel BTF is malformed\n");
+ ret = PTR_ERR(btf_vmlinux);
goto skip_full_check;
+ }
+
+ env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT);
+ if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
+ env->strict_alignment = true;
+ if (attr->prog_flags & BPF_F_ANY_ALIGNMENT)
+ env->strict_alignment = false;
- env->explored_states = kcalloc(env->prog->len,
- sizeof(struct verifier_state_list *),
- GFP_USER);
+ if (is_priv)
+ env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ;
+ env->test_reg_invariants = attr->prog_flags & BPF_F_TEST_REG_INVARIANTS;
+
+ env->explored_states = kvcalloc(state_htab_size(env),
+ sizeof(struct list_head),
+ GFP_KERNEL_ACCOUNT);
ret = -ENOMEM;
if (!env->explored_states)
goto skip_full_check;
+ for (i = 0; i < state_htab_size(env); i++)
+ INIT_LIST_HEAD(&env->explored_states[i]);
+ INIT_LIST_HEAD(&env->free_list);
+
+ ret = check_btf_info_early(env, attr, uattr);
+ if (ret < 0)
+ goto skip_full_check;
+
+ ret = add_subprog_and_kfunc(env);
+ if (ret < 0)
+ goto skip_full_check;
+
+ ret = check_subprogs(env);
+ if (ret < 0)
+ goto skip_full_check;
+
+ ret = check_btf_info(env, attr, uattr);
+ if (ret < 0)
+ goto skip_full_check;
+
+ ret = resolve_pseudo_ldimm64(env);
+ if (ret < 0)
+ goto skip_full_check;
+
+ if (bpf_prog_is_offloaded(env->prog->aux)) {
+ ret = bpf_prog_offload_verifier_prep(env->prog);
+ if (ret)
+ goto skip_full_check;
+ }
+
ret = check_cfg(env);
if (ret < 0)
goto skip_full_check;
- ret = do_check(env);
+ ret = compute_postorder(env);
+ if (ret < 0)
+ goto skip_full_check;
+
+ ret = bpf_stack_liveness_init(env);
+ if (ret)
+ goto skip_full_check;
+
+ ret = check_attach_btf_id(env);
+ if (ret)
+ goto skip_full_check;
+
+ ret = compute_scc(env);
+ if (ret < 0)
+ goto skip_full_check;
+
+ ret = compute_live_registers(env);
+ if (ret < 0)
+ goto skip_full_check;
+
+ ret = mark_fastcall_patterns(env);
+ if (ret < 0)
+ goto skip_full_check;
+
+ ret = do_check_main(env);
+ ret = ret ?: do_check_subprogs(env);
+
+ if (ret == 0 && bpf_prog_is_offloaded(env->prog->aux))
+ ret = bpf_prog_offload_finalize(env);
skip_full_check:
- while (pop_stack(env, NULL) >= 0);
- free_states(env);
+ kvfree(env->explored_states);
+
+ /* might decrease stack depth, keep it before passes that
+ * allocate additional slots.
+ */
+ if (ret == 0)
+ ret = remove_fastcall_spills_fills(env);
+
+ if (ret == 0)
+ ret = check_max_stack_depth(env);
+
+ /* instruction rewrites happen after this point */
+ if (ret == 0)
+ ret = optimize_bpf_loop(env);
+
+ if (is_priv) {
+ if (ret == 0)
+ opt_hard_wire_dead_code_branches(env);
+ if (ret == 0)
+ ret = opt_remove_dead_code(env);
+ if (ret == 0)
+ ret = opt_remove_nops(env);
+ } else {
+ if (ret == 0)
+ sanitize_dead_code(env);
+ }
if (ret == 0)
/* program is valid, convert *(u32*)(ctx + off) accesses */
ret = convert_ctx_accesses(env);
- if (log_level && log_len >= log_size - 1) {
- BUG_ON(log_len >= log_size);
- /* verifier log exceeded user supplied buffer */
- ret = -ENOSPC;
- /* fall through to return what was recorded */
+ if (ret == 0)
+ ret = do_misc_fixups(env);
+
+ /* do 32-bit optimization after insn patching has done so those patched
+ * insns could be handled correctly.
+ */
+ if (ret == 0 && !bpf_prog_is_offloaded(env->prog->aux)) {
+ ret = opt_subreg_zext_lo32_rnd_hi32(env, attr);
+ env->prog->aux->verifier_zext = bpf_jit_needs_zext() ? !ret
+ : false;
}
- /* copy verifier log back to user space including trailing zero */
- if (log_level && copy_to_user(log_ubuf, log_buf, log_len + 1) != 0) {
+ if (ret == 0)
+ ret = fixup_call_args(env);
+
+ env->verification_time = ktime_get_ns() - start_time;
+ print_verification_stats(env);
+ env->prog->aux->verified_insns = env->insn_processed;
+
+ /* preserve original error even if log finalization is successful */
+ err = bpf_vlog_finalize(&env->log, &log_true_size);
+ if (err)
+ ret = err;
+
+ if (uattr_size >= offsetofend(union bpf_attr, log_true_size) &&
+ copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, log_true_size),
+ &log_true_size, sizeof(log_true_size))) {
ret = -EFAULT;
- goto free_log_buf;
+ goto err_release_maps;
}
- if (ret == 0 && env->used_map_cnt) {
+ if (ret)
+ goto err_release_maps;
+
+ if (env->used_map_cnt) {
/* if program passed verifier, update used_maps in bpf_prog_info */
env->prog->aux->used_maps = kmalloc_array(env->used_map_cnt,
sizeof(env->used_maps[0]),
- GFP_KERNEL);
+ GFP_KERNEL_ACCOUNT);
if (!env->prog->aux->used_maps) {
ret = -ENOMEM;
- goto free_log_buf;
+ goto err_release_maps;
}
memcpy(env->prog->aux->used_maps, env->used_maps,
sizeof(env->used_maps[0]) * env->used_map_cnt);
env->prog->aux->used_map_cnt = env->used_map_cnt;
+ }
+ if (env->used_btf_cnt) {
+ /* if program passed verifier, update used_btfs in bpf_prog_aux */
+ env->prog->aux->used_btfs = kmalloc_array(env->used_btf_cnt,
+ sizeof(env->used_btfs[0]),
+ GFP_KERNEL_ACCOUNT);
+ if (!env->prog->aux->used_btfs) {
+ ret = -ENOMEM;
+ goto err_release_maps;
+ }
+ memcpy(env->prog->aux->used_btfs, env->used_btfs,
+ sizeof(env->used_btfs[0]) * env->used_btf_cnt);
+ env->prog->aux->used_btf_cnt = env->used_btf_cnt;
+ }
+ if (env->used_map_cnt || env->used_btf_cnt) {
/* program is valid. Convert pseudo bpf_ld_imm64 into generic
* bpf_ld_imm64 instructions
*/
convert_pseudo_ld_imm64(env);
}
-free_log_buf:
- if (log_level)
- vfree(log_buf);
-free_env:
+ adjust_btf_func(env);
+
+err_release_maps:
+ if (ret)
+ release_insn_arrays(env);
if (!env->prog->aux->used_maps)
/* if we didn't copy map pointers into bpf_prog_info, release
- * them now. Otherwise free_bpf_prog_info() will release them.
+ * them now. Otherwise free_used_maps() will release them.
*/
release_maps(env);
+ if (!env->prog->aux->used_btfs)
+ release_btfs(env);
+
+ /* extension progs temporarily inherit the attach_type of their targets
+ for verification purposes, so set it back to zero before returning
+ */
+ if (env->prog->type == BPF_PROG_TYPE_EXT)
+ env->prog->expected_attach_type = 0;
+
*prog = env->prog;
- kfree(env);
- mutex_unlock(&bpf_verifier_lock);
+
+ module_put(env->attach_btf_mod);
+err_unlock:
+ if (!is_priv)
+ mutex_unlock(&bpf_verifier_lock);
+ clear_insn_aux_data(env, 0, env->prog->len);
+ vfree(env->insn_aux_data);
+err_free_env:
+ bpf_stack_liveness_free(env);
+ kvfree(env->cfg.insn_postorder);
+ kvfree(env->scc_info);
+ kvfree(env->succ);
+ kvfree(env->gotox_tmp_buf);
+ kvfree(env);
return ret;
}
diff --git a/kernel/capability.c b/kernel/capability.c
index 45432b54d5c6..829f49ae07b9 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/kernel/capability.c
*
@@ -17,14 +18,7 @@
#include <linux/syscalls.h>
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
-#include <asm/uaccess.h>
-
-/*
- * Leveraged for setting/resetting capabilities
- */
-
-const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET;
-EXPORT_SYMBOL(__cap_empty_set);
+#include <linux/uaccess.h>
int file_caps_enabled = 1;
@@ -44,10 +38,8 @@ __setup("no_file_caps", file_caps_disable);
static void warn_legacy_capability_use(void)
{
- char name[sizeof(current->comm)];
-
pr_info_once("warning: `%s' uses 32-bit capabilities (legacy support in use)\n",
- get_task_comm(name, current));
+ current->comm);
}
/*
@@ -68,10 +60,8 @@ static void warn_legacy_capability_use(void)
static void warn_deprecated_v2(void)
{
- char name[sizeof(current->comm)];
-
pr_info_once("warning: `%s' uses deprecated v2 capabilities in a way that may be insecure\n",
- get_task_comm(name, current));
+ current->comm);
}
/*
@@ -92,9 +82,7 @@ static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy)
break;
case _LINUX_CAPABILITY_VERSION_2:
warn_deprecated_v2();
- /*
- * fall through - v3 is otherwise equivalent to v2.
- */
+ fallthrough; /* v3 is otherwise equivalent to v2 */
case _LINUX_CAPABILITY_VERSION_3:
*tocopy = _LINUX_CAPABILITY_U32S_3;
break;
@@ -120,7 +108,7 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
int ret;
if (pid && (pid != task_pid_vnr(current))) {
- struct task_struct *target;
+ const struct task_struct *target;
rcu_read_lock();
@@ -152,6 +140,7 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
pid_t pid;
unsigned tocopy;
kernel_cap_t pE, pI, pP;
+ struct __user_cap_data_struct kdata[2];
ret = cap_validate_magic(header, &tocopy);
if ((dataptr == NULL) || (ret != 0))
@@ -164,42 +153,46 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
return -EINVAL;
ret = cap_get_target_pid(pid, &pE, &pI, &pP);
- if (!ret) {
- struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
- unsigned i;
-
- for (i = 0; i < tocopy; i++) {
- kdata[i].effective = pE.cap[i];
- kdata[i].permitted = pP.cap[i];
- kdata[i].inheritable = pI.cap[i];
- }
-
- /*
- * Note, in the case, tocopy < _KERNEL_CAPABILITY_U32S,
- * we silently drop the upper capabilities here. This
- * has the effect of making older libcap
- * implementations implicitly drop upper capability
- * bits when they perform a: capget/modify/capset
- * sequence.
- *
- * This behavior is considered fail-safe
- * behavior. Upgrading the application to a newer
- * version of libcap will enable access to the newer
- * capabilities.
- *
- * An alternative would be to return an error here
- * (-ERANGE), but that causes legacy applications to
- * unexpectedly fail; the capget/modify/capset aborts
- * before modification is attempted and the application
- * fails.
- */
- if (copy_to_user(dataptr, kdata, tocopy
- * sizeof(struct __user_cap_data_struct))) {
- return -EFAULT;
- }
- }
+ if (ret)
+ return ret;
- return ret;
+ /*
+ * Annoying legacy format with 64-bit capabilities exposed
+ * as two sets of 32-bit fields, so we need to split the
+ * capability values up.
+ */
+ kdata[0].effective = pE.val; kdata[1].effective = pE.val >> 32;
+ kdata[0].permitted = pP.val; kdata[1].permitted = pP.val >> 32;
+ kdata[0].inheritable = pI.val; kdata[1].inheritable = pI.val >> 32;
+
+ /*
+ * Note, in the case, tocopy < _KERNEL_CAPABILITY_U32S,
+ * we silently drop the upper capabilities here. This
+ * has the effect of making older libcap
+ * implementations implicitly drop upper capability
+ * bits when they perform a: capget/modify/capset
+ * sequence.
+ *
+ * This behavior is considered fail-safe
+ * behavior. Upgrading the application to a newer
+ * version of libcap will enable access to the newer
+ * capabilities.
+ *
+ * An alternative would be to return an error here
+ * (-ERANGE), but that causes legacy applications to
+ * unexpectedly fail; the capget/modify/capset aborts
+ * before modification is attempted and the application
+ * fails.
+ */
+ if (copy_to_user(dataptr, kdata, tocopy * sizeof(kdata[0])))
+ return -EFAULT;
+
+ return 0;
+}
+
+static kernel_cap_t mk_kernel_cap(u32 low, u32 high)
+{
+ return (kernel_cap_t) { (low | ((u64)high << 32)) & CAP_VALID_MASK };
}
/**
@@ -222,8 +215,8 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
*/
SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
{
- struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
- unsigned i, tocopy, copybytes;
+ struct __user_cap_data_struct kdata[2] = { { 0, }, };
+ unsigned tocopy, copybytes;
kernel_cap_t inheritable, permitted, effective;
struct cred *new;
int ret;
@@ -247,21 +240,9 @@ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
if (copy_from_user(&kdata, data, copybytes))
return -EFAULT;
- for (i = 0; i < tocopy; i++) {
- effective.cap[i] = kdata[i].effective;
- permitted.cap[i] = kdata[i].permitted;
- inheritable.cap[i] = kdata[i].inheritable;
- }
- while (i < _KERNEL_CAPABILITY_U32S) {
- effective.cap[i] = 0;
- permitted.cap[i] = 0;
- inheritable.cap[i] = 0;
- i++;
- }
-
- effective.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
- permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
- inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
+ effective = mk_kernel_cap(kdata[0].effective, kdata[1].effective);
+ permitted = mk_kernel_cap(kdata[0].permitted, kdata[1].permitted);
+ inheritable = mk_kernel_cap(kdata[0].inheritable, kdata[1].inheritable);
new = prepare_creds();
if (!new)
@@ -298,28 +279,13 @@ bool has_ns_capability(struct task_struct *t,
int ret;
rcu_read_lock();
- ret = security_capable(__task_cred(t), ns, cap);
+ ret = security_capable(__task_cred(t), ns, cap, CAP_OPT_NONE);
rcu_read_unlock();
return (ret == 0);
}
/**
- * has_capability - Does a task have a capability in init_user_ns
- * @t: The task in question
- * @cap: The capability to be tested for
- *
- * Return true if the specified task has the given superior capability
- * currently in effect to the initial user namespace, false if not.
- *
- * Note that this does not set PF_SUPERPRIV on the task.
- */
-bool has_capability(struct task_struct *t, int cap)
-{
- return has_ns_capability(t, &init_user_ns, cap);
-}
-
-/**
* has_ns_capability_noaudit - Does a task have a capability (unaudited)
* in a specific user ns.
* @t: The task in question
@@ -338,7 +304,7 @@ bool has_ns_capability_noaudit(struct task_struct *t,
int ret;
rcu_read_lock();
- ret = security_capable_noaudit(__task_cred(t), ns, cap);
+ ret = security_capable(__task_cred(t), ns, cap, CAP_OPT_NOAUDIT);
rcu_read_unlock();
return (ret == 0);
@@ -360,6 +326,26 @@ bool has_capability_noaudit(struct task_struct *t, int cap)
{
return has_ns_capability_noaudit(t, &init_user_ns, cap);
}
+EXPORT_SYMBOL(has_capability_noaudit);
+
+static bool ns_capable_common(struct user_namespace *ns,
+ int cap,
+ unsigned int opts)
+{
+ int capable;
+
+ if (unlikely(!cap_valid(cap))) {
+ pr_crit("capable() called with invalid cap=%u\n", cap);
+ BUG();
+ }
+
+ capable = security_capable(current_cred(), ns, cap, opts);
+ if (capable == 0) {
+ current->flags |= PF_SUPERPRIV;
+ return true;
+ }
+ return false;
+}
/**
* ns_capable - Determine if the current task has a superior capability in effect
@@ -374,19 +360,46 @@ bool has_capability_noaudit(struct task_struct *t, int cap)
*/
bool ns_capable(struct user_namespace *ns, int cap)
{
- if (unlikely(!cap_valid(cap))) {
- pr_crit("capable() called with invalid cap=%u\n", cap);
- BUG();
- }
-
- if (security_capable(current_cred(), ns, cap) == 0) {
- current->flags |= PF_SUPERPRIV;
- return true;
- }
- return false;
+ return ns_capable_common(ns, cap, CAP_OPT_NONE);
}
EXPORT_SYMBOL(ns_capable);
+/**
+ * ns_capable_noaudit - Determine if the current task has a superior capability
+ * (unaudited) in effect
+ * @ns: The usernamespace we want the capability in
+ * @cap: The capability to be tested for
+ *
+ * Return true if the current task has the given superior capability currently
+ * available for use, false if not.
+ *
+ * This sets PF_SUPERPRIV on the task if the capability is available on the
+ * assumption that it's about to be used.
+ */
+bool ns_capable_noaudit(struct user_namespace *ns, int cap)
+{
+ return ns_capable_common(ns, cap, CAP_OPT_NOAUDIT);
+}
+EXPORT_SYMBOL(ns_capable_noaudit);
+
+/**
+ * ns_capable_setid - Determine if the current task has a superior capability
+ * in effect, while signalling that this check is being done from within a
+ * setid or setgroups syscall.
+ * @ns: The usernamespace we want the capability in
+ * @cap: The capability to be tested for
+ *
+ * Return true if the current task has the given superior capability currently
+ * available for use, false if not.
+ *
+ * This sets PF_SUPERPRIV on the task if the capability is available on the
+ * assumption that it's about to be used.
+ */
+bool ns_capable_setid(struct user_namespace *ns, int cap)
+{
+ return ns_capable_common(ns, cap, CAP_OPT_INSETID);
+}
+EXPORT_SYMBOL(ns_capable_setid);
/**
* capable - Determine if the current task has a superior capability in effect
@@ -420,10 +433,11 @@ EXPORT_SYMBOL(capable);
bool file_ns_capable(const struct file *file, struct user_namespace *ns,
int cap)
{
+
if (WARN_ON_ONCE(!cap_valid(cap)))
return false;
- if (security_capable(file->f_cred, ns, cap) == 0)
+ if (security_capable(file->f_cred, ns, cap, CAP_OPT_NONE) == 0)
return true;
return false;
@@ -431,7 +445,24 @@ bool file_ns_capable(const struct file *file, struct user_namespace *ns,
EXPORT_SYMBOL(file_ns_capable);
/**
+ * privileged_wrt_inode_uidgid - Do capabilities in the namespace work over the inode?
+ * @ns: The user namespace in question
+ * @idmap: idmap of the mount @inode was found from
+ * @inode: The inode in question
+ *
+ * Return true if the inode uid and gid are within the namespace.
+ */
+bool privileged_wrt_inode_uidgid(struct user_namespace *ns,
+ struct mnt_idmap *idmap,
+ const struct inode *inode)
+{
+ return vfsuid_has_mapping(ns, i_uid_into_vfsuid(idmap, inode)) &&
+ vfsgid_has_mapping(ns, i_gid_into_vfsgid(idmap, inode));
+}
+
+/**
* capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped
+ * @idmap: idmap of the mount @inode was found from
* @inode: The inode in question
* @cap: The capability in question
*
@@ -439,11 +470,34 @@ EXPORT_SYMBOL(file_ns_capable);
* its own user namespace and that the given inode's uid and gid are
* mapped into the current user namespace.
*/
-bool capable_wrt_inode_uidgid(const struct inode *inode, int cap)
+bool capable_wrt_inode_uidgid(struct mnt_idmap *idmap,
+ const struct inode *inode, int cap)
{
struct user_namespace *ns = current_user_ns();
- return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid) &&
- kgid_has_mapping(ns, inode->i_gid);
+ return ns_capable(ns, cap) &&
+ privileged_wrt_inode_uidgid(ns, idmap, inode);
}
EXPORT_SYMBOL(capable_wrt_inode_uidgid);
+
+/**
+ * ptracer_capable - Determine if the ptracer holds CAP_SYS_PTRACE in the namespace
+ * @tsk: The task that may be ptraced
+ * @ns: The user namespace to search for CAP_SYS_PTRACE in
+ *
+ * Return true if the task that is ptracing the current task had CAP_SYS_PTRACE
+ * in the specified user namespace.
+ */
+bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns)
+{
+ int ret = 0; /* An absent tracer adds no restrictions */
+ const struct cred *cred;
+
+ rcu_read_lock();
+ cred = rcu_dereference(tsk->ptracer_cred);
+ if (cred)
+ ret = security_capable(cred, ns, CAP_SYS_PTRACE,
+ CAP_OPT_NOAUDIT);
+ rcu_read_unlock();
+ return (ret == 0);
+}
diff --git a/kernel/cfi.c b/kernel/cfi.c
new file mode 100644
index 000000000000..4dad04ead06c
--- /dev/null
+++ b/kernel/cfi.c
@@ -0,0 +1,115 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Clang Control Flow Integrity (CFI) error handling.
+ *
+ * Copyright (C) 2022 Google LLC
+ */
+
+#include <linux/bpf.h>
+#include <linux/cfi_types.h>
+#include <linux/cfi.h>
+
+bool cfi_warn __ro_after_init = IS_ENABLED(CONFIG_CFI_PERMISSIVE);
+
+enum bug_trap_type report_cfi_failure(struct pt_regs *regs, unsigned long addr,
+ unsigned long *target, u32 type)
+{
+ if (target)
+ pr_err("CFI failure at %pS (target: %pS; expected type: 0x%08x)\n",
+ (void *)addr, (void *)*target, type);
+ else
+ pr_err("CFI failure at %pS (no target information)\n",
+ (void *)addr);
+
+ if (cfi_warn) {
+ __warn(NULL, 0, (void *)addr, 0, regs, NULL);
+ return BUG_TRAP_TYPE_WARN;
+ }
+
+ return BUG_TRAP_TYPE_BUG;
+}
+
+/*
+ * Declare two non-existent functions with types that match bpf_func_t and
+ * bpf_callback_t pointers, and use DEFINE_CFI_TYPE to define type hash
+ * variables for each function type. The cfi_bpf_* variables are used by
+ * arch-specific BPF JIT implementations to ensure indirectly callable JIT
+ * code has matching CFI type hashes.
+ */
+extern typeof(*(bpf_func_t)0) __bpf_prog_runX;
+DEFINE_CFI_TYPE(cfi_bpf_hash, __bpf_prog_runX);
+
+extern typeof(*(bpf_callback_t)0) __bpf_callback_fn;
+DEFINE_CFI_TYPE(cfi_bpf_subprog_hash, __bpf_callback_fn);
+
+#ifdef CONFIG_ARCH_USES_CFI_TRAPS
+static inline unsigned long trap_address(s32 *p)
+{
+ return (unsigned long)((long)p + (long)*p);
+}
+
+static bool is_trap(unsigned long addr, s32 *start, s32 *end)
+{
+ s32 *p;
+
+ for (p = start; p < end; ++p) {
+ if (trap_address(p) == addr)
+ return true;
+ }
+
+ return false;
+}
+
+#ifdef CONFIG_MODULES
+/* Populates `kcfi_trap(_end)?` fields in `struct module`. */
+void module_cfi_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs,
+ struct module *mod)
+{
+ char *secstrings;
+ unsigned int i;
+
+ mod->kcfi_traps = NULL;
+ mod->kcfi_traps_end = NULL;
+
+ secstrings = (char *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
+
+ for (i = 1; i < hdr->e_shnum; i++) {
+ if (strcmp(secstrings + sechdrs[i].sh_name, "__kcfi_traps"))
+ continue;
+
+ mod->kcfi_traps = (s32 *)sechdrs[i].sh_addr;
+ mod->kcfi_traps_end = (s32 *)(sechdrs[i].sh_addr + sechdrs[i].sh_size);
+ break;
+ }
+}
+
+static bool is_module_cfi_trap(unsigned long addr)
+{
+ struct module *mod;
+ bool found = false;
+
+ guard(rcu)();
+ mod = __module_address(addr);
+ if (mod)
+ found = is_trap(addr, mod->kcfi_traps, mod->kcfi_traps_end);
+
+ return found;
+}
+#else /* CONFIG_MODULES */
+static inline bool is_module_cfi_trap(unsigned long addr)
+{
+ return false;
+}
+#endif /* CONFIG_MODULES */
+
+extern s32 __start___kcfi_traps[];
+extern s32 __stop___kcfi_traps[];
+
+bool is_cfi_trap(unsigned long addr)
+{
+ if (is_trap(addr, __start___kcfi_traps, __stop___kcfi_traps))
+ return true;
+
+ return is_module_cfi_trap(addr);
+}
+#endif /* CONFIG_ARCH_USES_CFI_TRAPS */
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
deleted file mode 100644
index 469dd547770c..000000000000
--- a/kernel/cgroup.c
+++ /dev/null
@@ -1,5603 +0,0 @@
-/*
- * Generic process-grouping system.
- *
- * Based originally on the cpuset system, extracted by Paul Menage
- * Copyright (C) 2006 Google, Inc
- *
- * Notifications support
- * Copyright (C) 2009 Nokia Corporation
- * Author: Kirill A. Shutemov
- *
- * Copyright notices from the original cpuset code:
- * --------------------------------------------------
- * Copyright (C) 2003 BULL SA.
- * Copyright (C) 2004-2006 Silicon Graphics, Inc.
- *
- * Portions derived from Patrick Mochel's sysfs code.
- * sysfs is Copyright (c) 2001-3 Patrick Mochel
- *
- * 2003-10-10 Written by Simon Derr.
- * 2003-10-22 Updates by Stephen Hemminger.
- * 2004 May-July Rework by Paul Jackson.
- * ---------------------------------------------------
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License. See the file COPYING in the main directory of the Linux
- * distribution for more details.
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/cgroup.h>
-#include <linux/cred.h>
-#include <linux/ctype.h>
-#include <linux/errno.h>
-#include <linux/init_task.h>
-#include <linux/kernel.h>
-#include <linux/list.h>
-#include <linux/magic.h>
-#include <linux/mm.h>
-#include <linux/mutex.h>
-#include <linux/mount.h>
-#include <linux/pagemap.h>
-#include <linux/proc_fs.h>
-#include <linux/rcupdate.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/rwsem.h>
-#include <linux/string.h>
-#include <linux/sort.h>
-#include <linux/kmod.h>
-#include <linux/delayacct.h>
-#include <linux/cgroupstats.h>
-#include <linux/hashtable.h>
-#include <linux/pid_namespace.h>
-#include <linux/idr.h>
-#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
-#include <linux/kthread.h>
-#include <linux/delay.h>
-
-#include <linux/atomic.h>
-
-/*
- * pidlists linger the following amount before being destroyed. The goal
- * is avoiding frequent destruction in the middle of consecutive read calls
- * Expiring in the middle is a performance problem not a correctness one.
- * 1 sec should be enough.
- */
-#define CGROUP_PIDLIST_DESTROY_DELAY HZ
-
-#define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \
- MAX_CFTYPE_NAME + 2)
-
-/*
- * cgroup_mutex is the master lock. Any modification to cgroup or its
- * hierarchy must be performed while holding it.
- *
- * css_set_rwsem protects task->cgroups pointer, the list of css_set
- * objects, and the chain of tasks off each css_set.
- *
- * These locks are exported if CONFIG_PROVE_RCU so that accessors in
- * cgroup.h can use them for lockdep annotations.
- */
-#ifdef CONFIG_PROVE_RCU
-DEFINE_MUTEX(cgroup_mutex);
-DECLARE_RWSEM(css_set_rwsem);
-EXPORT_SYMBOL_GPL(cgroup_mutex);
-EXPORT_SYMBOL_GPL(css_set_rwsem);
-#else
-static DEFINE_MUTEX(cgroup_mutex);
-static DECLARE_RWSEM(css_set_rwsem);
-#endif
-
-/*
- * Protects cgroup_idr and css_idr so that IDs can be released without
- * grabbing cgroup_mutex.
- */
-static DEFINE_SPINLOCK(cgroup_idr_lock);
-
-/*
- * Protects cgroup_subsys->release_agent_path. Modifying it also requires
- * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock.
- */
-static DEFINE_SPINLOCK(release_agent_path_lock);
-
-#define cgroup_assert_mutex_or_rcu_locked() \
- rcu_lockdep_assert(rcu_read_lock_held() || \
- lockdep_is_held(&cgroup_mutex), \
- "cgroup_mutex or RCU read lock required");
-
-/*
- * cgroup destruction makes heavy use of work items and there can be a lot
- * of concurrent destructions. Use a separate workqueue so that cgroup
- * destruction work items don't end up filling up max_active of system_wq
- * which may lead to deadlock.
- */
-static struct workqueue_struct *cgroup_destroy_wq;
-
-/*
- * pidlist destructions need to be flushed on cgroup destruction. Use a
- * separate workqueue as flush domain.
- */
-static struct workqueue_struct *cgroup_pidlist_destroy_wq;
-
-/* generate an array of cgroup subsystem pointers */
-#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
-static struct cgroup_subsys *cgroup_subsys[] = {
-#include <linux/cgroup_subsys.h>
-};
-#undef SUBSYS
-
-/* array of cgroup subsystem names */
-#define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
-static const char *cgroup_subsys_name[] = {
-#include <linux/cgroup_subsys.h>
-};
-#undef SUBSYS
-
-/*
- * The default hierarchy, reserved for the subsystems that are otherwise
- * unattached - it never has more than a single cgroup, and all tasks are
- * part of that cgroup.
- */
-struct cgroup_root cgrp_dfl_root;
-
-/*
- * The default hierarchy always exists but is hidden until mounted for the
- * first time. This is for backward compatibility.
- */
-static bool cgrp_dfl_root_visible;
-
-/*
- * Set by the boot param of the same name and makes subsystems with NULL
- * ->dfl_files to use ->legacy_files on the default hierarchy.
- */
-static bool cgroup_legacy_files_on_dfl;
-
-/* some controllers are not supported in the default hierarchy */
-static unsigned int cgrp_dfl_root_inhibit_ss_mask;
-
-/* The list of hierarchy roots */
-
-static LIST_HEAD(cgroup_roots);
-static int cgroup_root_count;
-
-/* hierarchy ID allocation and mapping, protected by cgroup_mutex */
-static DEFINE_IDR(cgroup_hierarchy_idr);
-
-/*
- * Assign a monotonically increasing serial number to csses. It guarantees
- * cgroups with bigger numbers are newer than those with smaller numbers.
- * Also, as csses are always appended to the parent's ->children list, it
- * guarantees that sibling csses are always sorted in the ascending serial
- * number order on the list. Protected by cgroup_mutex.
- */
-static u64 css_serial_nr_next = 1;
-
-/* This flag indicates whether tasks in the fork and exit paths should
- * check for fork/exit handlers to call. This avoids us having to do
- * extra work in the fork/exit path if none of the subsystems need to
- * be called.
- */
-static int need_forkexit_callback __read_mostly;
-
-static struct cftype cgroup_dfl_base_files[];
-static struct cftype cgroup_legacy_base_files[];
-
-static int rebind_subsystems(struct cgroup_root *dst_root,
- unsigned int ss_mask);
-static int cgroup_destroy_locked(struct cgroup *cgrp);
-static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
- bool visible);
-static void css_release(struct percpu_ref *ref);
-static void kill_css(struct cgroup_subsys_state *css);
-static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
- bool is_add);
-
-/* IDR wrappers which synchronize using cgroup_idr_lock */
-static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
- gfp_t gfp_mask)
-{
- int ret;
-
- idr_preload(gfp_mask);
- spin_lock_bh(&cgroup_idr_lock);
- ret = idr_alloc(idr, ptr, start, end, gfp_mask);
- spin_unlock_bh(&cgroup_idr_lock);
- idr_preload_end();
- return ret;
-}
-
-static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id)
-{
- void *ret;
-
- spin_lock_bh(&cgroup_idr_lock);
- ret = idr_replace(idr, ptr, id);
- spin_unlock_bh(&cgroup_idr_lock);
- return ret;
-}
-
-static void cgroup_idr_remove(struct idr *idr, int id)
-{
- spin_lock_bh(&cgroup_idr_lock);
- idr_remove(idr, id);
- spin_unlock_bh(&cgroup_idr_lock);
-}
-
-static struct cgroup *cgroup_parent(struct cgroup *cgrp)
-{
- struct cgroup_subsys_state *parent_css = cgrp->self.parent;
-
- if (parent_css)
- return container_of(parent_css, struct cgroup, self);
- return NULL;
-}
-
-/**
- * cgroup_css - obtain a cgroup's css for the specified subsystem
- * @cgrp: the cgroup of interest
- * @ss: the subsystem of interest (%NULL returns @cgrp->self)
- *
- * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This
- * function must be called either under cgroup_mutex or rcu_read_lock() and
- * the caller is responsible for pinning the returned css if it wants to
- * keep accessing it outside the said locks. This function may return
- * %NULL if @cgrp doesn't have @subsys_id enabled.
- */
-static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
- struct cgroup_subsys *ss)
-{
- if (ss)
- return rcu_dereference_check(cgrp->subsys[ss->id],
- lockdep_is_held(&cgroup_mutex));
- else
- return &cgrp->self;
-}
-
-/**
- * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
- * @cgrp: the cgroup of interest
- * @ss: the subsystem of interest (%NULL returns @cgrp->self)
- *
- * Similar to cgroup_css() but returns the effctive css, which is defined
- * as the matching css of the nearest ancestor including self which has @ss
- * enabled. If @ss is associated with the hierarchy @cgrp is on, this
- * function is guaranteed to return non-NULL css.
- */
-static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
- struct cgroup_subsys *ss)
-{
- lockdep_assert_held(&cgroup_mutex);
-
- if (!ss)
- return &cgrp->self;
-
- if (!(cgrp->root->subsys_mask & (1 << ss->id)))
- return NULL;
-
- /*
- * This function is used while updating css associations and thus
- * can't test the csses directly. Use ->child_subsys_mask.
- */
- while (cgroup_parent(cgrp) &&
- !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id)))
- cgrp = cgroup_parent(cgrp);
-
- return cgroup_css(cgrp, ss);
-}
-
-/**
- * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem
- * @cgrp: the cgroup of interest
- * @ss: the subsystem of interest
- *
- * Find and get the effective css of @cgrp for @ss. The effective css is
- * defined as the matching css of the nearest ancestor including self which
- * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on,
- * the root css is returned, so this function always returns a valid css.
- * The returned css must be put using css_put().
- */
-struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
- struct cgroup_subsys *ss)
-{
- struct cgroup_subsys_state *css;
-
- rcu_read_lock();
-
- do {
- css = cgroup_css(cgrp, ss);
-
- if (css && css_tryget_online(css))
- goto out_unlock;
- cgrp = cgroup_parent(cgrp);
- } while (cgrp);
-
- css = init_css_set.subsys[ss->id];
- css_get(css);
-out_unlock:
- rcu_read_unlock();
- return css;
-}
-
-/* convenient tests for these bits */
-static inline bool cgroup_is_dead(const struct cgroup *cgrp)
-{
- return !(cgrp->self.flags & CSS_ONLINE);
-}
-
-struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
-{
- struct cgroup *cgrp = of->kn->parent->priv;
- struct cftype *cft = of_cft(of);
-
- /*
- * This is open and unprotected implementation of cgroup_css().
- * seq_css() is only called from a kernfs file operation which has
- * an active reference on the file. Because all the subsystem
- * files are drained before a css is disassociated with a cgroup,
- * the matching css from the cgroup's subsys table is guaranteed to
- * be and stay valid until the enclosing operation is complete.
- */
- if (cft->ss)
- return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
- else
- return &cgrp->self;
-}
-EXPORT_SYMBOL_GPL(of_css);
-
-/**
- * cgroup_is_descendant - test ancestry
- * @cgrp: the cgroup to be tested
- * @ancestor: possible ancestor of @cgrp
- *
- * Test whether @cgrp is a descendant of @ancestor. It also returns %true
- * if @cgrp == @ancestor. This function is safe to call as long as @cgrp
- * and @ancestor are accessible.
- */
-bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
-{
- while (cgrp) {
- if (cgrp == ancestor)
- return true;
- cgrp = cgroup_parent(cgrp);
- }
- return false;
-}
-
-static int notify_on_release(const struct cgroup *cgrp)
-{
- return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
-}
-
-/**
- * for_each_css - iterate all css's of a cgroup
- * @css: the iteration cursor
- * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
- * @cgrp: the target cgroup to iterate css's of
- *
- * Should be called under cgroup_[tree_]mutex.
- */
-#define for_each_css(css, ssid, cgrp) \
- for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
- if (!((css) = rcu_dereference_check( \
- (cgrp)->subsys[(ssid)], \
- lockdep_is_held(&cgroup_mutex)))) { } \
- else
-
-/**
- * for_each_e_css - iterate all effective css's of a cgroup
- * @css: the iteration cursor
- * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
- * @cgrp: the target cgroup to iterate css's of
- *
- * Should be called under cgroup_[tree_]mutex.
- */
-#define for_each_e_css(css, ssid, cgrp) \
- for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
- if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \
- ; \
- else
-
-/**
- * for_each_subsys - iterate all enabled cgroup subsystems
- * @ss: the iteration cursor
- * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
- */
-#define for_each_subsys(ss, ssid) \
- for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \
- (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
-
-/* iterate across the hierarchies */
-#define for_each_root(root) \
- list_for_each_entry((root), &cgroup_roots, root_list)
-
-/* iterate over child cgrps, lock should be held throughout iteration */
-#define cgroup_for_each_live_child(child, cgrp) \
- list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
- if (({ lockdep_assert_held(&cgroup_mutex); \
- cgroup_is_dead(child); })) \
- ; \
- else
-
-static void cgroup_release_agent(struct work_struct *work);
-static void check_for_release(struct cgroup *cgrp);
-
-/*
- * A cgroup can be associated with multiple css_sets as different tasks may
- * belong to different cgroups on different hierarchies. In the other
- * direction, a css_set is naturally associated with multiple cgroups.
- * This M:N relationship is represented by the following link structure
- * which exists for each association and allows traversing the associations
- * from both sides.
- */
-struct cgrp_cset_link {
- /* the cgroup and css_set this link associates */
- struct cgroup *cgrp;
- struct css_set *cset;
-
- /* list of cgrp_cset_links anchored at cgrp->cset_links */
- struct list_head cset_link;
-
- /* list of cgrp_cset_links anchored at css_set->cgrp_links */
- struct list_head cgrp_link;
-};
-
-/*
- * The default css_set - used by init and its children prior to any
- * hierarchies being mounted. It contains a pointer to the root state
- * for each subsystem. Also used to anchor the list of css_sets. Not
- * reference-counted, to improve performance when child cgroups
- * haven't been created.
- */
-struct css_set init_css_set = {
- .refcount = ATOMIC_INIT(1),
- .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
- .tasks = LIST_HEAD_INIT(init_css_set.tasks),
- .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
- .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
- .mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
-};
-
-static int css_set_count = 1; /* 1 for init_css_set */
-
-/**
- * cgroup_update_populated - updated populated count of a cgroup
- * @cgrp: the target cgroup
- * @populated: inc or dec populated count
- *
- * @cgrp is either getting the first task (css_set) or losing the last.
- * Update @cgrp->populated_cnt accordingly. The count is propagated
- * towards root so that a given cgroup's populated_cnt is zero iff the
- * cgroup and all its descendants are empty.
- *
- * @cgrp's interface file "cgroup.populated" is zero if
- * @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt
- * changes from or to zero, userland is notified that the content of the
- * interface file has changed. This can be used to detect when @cgrp and
- * its descendants become populated or empty.
- */
-static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
-{
- lockdep_assert_held(&css_set_rwsem);
-
- do {
- bool trigger;
-
- if (populated)
- trigger = !cgrp->populated_cnt++;
- else
- trigger = !--cgrp->populated_cnt;
-
- if (!trigger)
- break;
-
- if (cgrp->populated_kn)
- kernfs_notify(cgrp->populated_kn);
- cgrp = cgroup_parent(cgrp);
- } while (cgrp);
-}
-
-/*
- * hash table for cgroup groups. This improves the performance to find
- * an existing css_set. This hash doesn't (currently) take into
- * account cgroups in empty hierarchies.
- */
-#define CSS_SET_HASH_BITS 7
-static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
-
-static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
-{
- unsigned long key = 0UL;
- struct cgroup_subsys *ss;
- int i;
-
- for_each_subsys(ss, i)
- key += (unsigned long)css[i];
- key = (key >> 16) ^ key;
-
- return key;
-}
-
-static void put_css_set_locked(struct css_set *cset)
-{
- struct cgrp_cset_link *link, *tmp_link;
- struct cgroup_subsys *ss;
- int ssid;
-
- lockdep_assert_held(&css_set_rwsem);
-
- if (!atomic_dec_and_test(&cset->refcount))
- return;
-
- /* This css_set is dead. unlink it and release cgroup refcounts */
- for_each_subsys(ss, ssid)
- list_del(&cset->e_cset_node[ssid]);
- hash_del(&cset->hlist);
- css_set_count--;
-
- list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
- struct cgroup *cgrp = link->cgrp;
-
- list_del(&link->cset_link);
- list_del(&link->cgrp_link);
-
- /* @cgrp can't go away while we're holding css_set_rwsem */
- if (list_empty(&cgrp->cset_links)) {
- cgroup_update_populated(cgrp, false);
- check_for_release(cgrp);
- }
-
- kfree(link);
- }
-
- kfree_rcu(cset, rcu_head);
-}
-
-static void put_css_set(struct css_set *cset)
-{
- /*
- * Ensure that the refcount doesn't hit zero while any readers
- * can see it. Similar to atomic_dec_and_lock(), but for an
- * rwlock
- */
- if (atomic_add_unless(&cset->refcount, -1, 1))
- return;
-
- down_write(&css_set_rwsem);
- put_css_set_locked(cset);
- up_write(&css_set_rwsem);
-}
-
-/*
- * refcounted get/put for css_set objects
- */
-static inline void get_css_set(struct css_set *cset)
-{
- atomic_inc(&cset->refcount);
-}
-
-/**
- * compare_css_sets - helper function for find_existing_css_set().
- * @cset: candidate css_set being tested
- * @old_cset: existing css_set for a task
- * @new_cgrp: cgroup that's being entered by the task
- * @template: desired set of css pointers in css_set (pre-calculated)
- *
- * Returns true if "cset" matches "old_cset" except for the hierarchy
- * which "new_cgrp" belongs to, for which it should match "new_cgrp".
- */
-static bool compare_css_sets(struct css_set *cset,
- struct css_set *old_cset,
- struct cgroup *new_cgrp,
- struct cgroup_subsys_state *template[])
-{
- struct list_head *l1, *l2;
-
- /*
- * On the default hierarchy, there can be csets which are
- * associated with the same set of cgroups but different csses.
- * Let's first ensure that csses match.
- */
- if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
- return false;
-
- /*
- * Compare cgroup pointers in order to distinguish between
- * different cgroups in hierarchies. As different cgroups may
- * share the same effective css, this comparison is always
- * necessary.
- */
- l1 = &cset->cgrp_links;
- l2 = &old_cset->cgrp_links;
- while (1) {
- struct cgrp_cset_link *link1, *link2;
- struct cgroup *cgrp1, *cgrp2;
-
- l1 = l1->next;
- l2 = l2->next;
- /* See if we reached the end - both lists are equal length. */
- if (l1 == &cset->cgrp_links) {
- BUG_ON(l2 != &old_cset->cgrp_links);
- break;
- } else {
- BUG_ON(l2 == &old_cset->cgrp_links);
- }
- /* Locate the cgroups associated with these links. */
- link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
- link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
- cgrp1 = link1->cgrp;
- cgrp2 = link2->cgrp;
- /* Hierarchies should be linked in the same order. */
- BUG_ON(cgrp1->root != cgrp2->root);
-
- /*
- * If this hierarchy is the hierarchy of the cgroup
- * that's changing, then we need to check that this
- * css_set points to the new cgroup; if it's any other
- * hierarchy, then this css_set should point to the
- * same cgroup as the old css_set.
- */
- if (cgrp1->root == new_cgrp->root) {
- if (cgrp1 != new_cgrp)
- return false;
- } else {
- if (cgrp1 != cgrp2)
- return false;
- }
- }
- return true;
-}
-
-/**
- * find_existing_css_set - init css array and find the matching css_set
- * @old_cset: the css_set that we're using before the cgroup transition
- * @cgrp: the cgroup that we're moving into
- * @template: out param for the new set of csses, should be clear on entry
- */
-static struct css_set *find_existing_css_set(struct css_set *old_cset,
- struct cgroup *cgrp,
- struct cgroup_subsys_state *template[])
-{
- struct cgroup_root *root = cgrp->root;
- struct cgroup_subsys *ss;
- struct css_set *cset;
- unsigned long key;
- int i;
-
- /*
- * Build the set of subsystem state objects that we want to see in the
- * new css_set. while subsystems can change globally, the entries here
- * won't change, so no need for locking.
- */
- for_each_subsys(ss, i) {
- if (root->subsys_mask & (1UL << i)) {
- /*
- * @ss is in this hierarchy, so we want the
- * effective css from @cgrp.
- */
- template[i] = cgroup_e_css(cgrp, ss);
- } else {
- /*
- * @ss is not in this hierarchy, so we don't want
- * to change the css.
- */
- template[i] = old_cset->subsys[i];
- }
- }
-
- key = css_set_hash(template);
- hash_for_each_possible(css_set_table, cset, hlist, key) {
- if (!compare_css_sets(cset, old_cset, cgrp, template))
- continue;
-
- /* This css_set matches what we need */
- return cset;
- }
-
- /* No existing cgroup group matched */
- return NULL;
-}
-
-static void free_cgrp_cset_links(struct list_head *links_to_free)
-{
- struct cgrp_cset_link *link, *tmp_link;
-
- list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
- list_del(&link->cset_link);
- kfree(link);
- }
-}
-
-/**
- * allocate_cgrp_cset_links - allocate cgrp_cset_links
- * @count: the number of links to allocate
- * @tmp_links: list_head the allocated links are put on
- *
- * Allocate @count cgrp_cset_link structures and chain them on @tmp_links
- * through ->cset_link. Returns 0 on success or -errno.
- */
-static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
-{
- struct cgrp_cset_link *link;
- int i;
-
- INIT_LIST_HEAD(tmp_links);
-
- for (i = 0; i < count; i++) {
- link = kzalloc(sizeof(*link), GFP_KERNEL);
- if (!link) {
- free_cgrp_cset_links(tmp_links);
- return -ENOMEM;
- }
- list_add(&link->cset_link, tmp_links);
- }
- return 0;
-}
-
-/**
- * link_css_set - a helper function to link a css_set to a cgroup
- * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
- * @cset: the css_set to be linked
- * @cgrp: the destination cgroup
- */
-static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
- struct cgroup *cgrp)
-{
- struct cgrp_cset_link *link;
-
- BUG_ON(list_empty(tmp_links));
-
- if (cgroup_on_dfl(cgrp))
- cset->dfl_cgrp = cgrp;
-
- link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
- link->cset = cset;
- link->cgrp = cgrp;
-
- if (list_empty(&cgrp->cset_links))
- cgroup_update_populated(cgrp, true);
- list_move(&link->cset_link, &cgrp->cset_links);
-
- /*
- * Always add links to the tail of the list so that the list
- * is sorted by order of hierarchy creation
- */
- list_add_tail(&link->cgrp_link, &cset->cgrp_links);
-}
-
-/**
- * find_css_set - return a new css_set with one cgroup updated
- * @old_cset: the baseline css_set
- * @cgrp: the cgroup to be updated
- *
- * Return a new css_set that's equivalent to @old_cset, but with @cgrp
- * substituted into the appropriate hierarchy.
- */
-static struct css_set *find_css_set(struct css_set *old_cset,
- struct cgroup *cgrp)
-{
- struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
- struct css_set *cset;
- struct list_head tmp_links;
- struct cgrp_cset_link *link;
- struct cgroup_subsys *ss;
- unsigned long key;
- int ssid;
-
- lockdep_assert_held(&cgroup_mutex);
-
- /* First see if we already have a cgroup group that matches
- * the desired set */
- down_read(&css_set_rwsem);
- cset = find_existing_css_set(old_cset, cgrp, template);
- if (cset)
- get_css_set(cset);
- up_read(&css_set_rwsem);
-
- if (cset)
- return cset;
-
- cset = kzalloc(sizeof(*cset), GFP_KERNEL);
- if (!cset)
- return NULL;
-
- /* Allocate all the cgrp_cset_link objects that we'll need */
- if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
- kfree(cset);
- return NULL;
- }
-
- atomic_set(&cset->refcount, 1);
- INIT_LIST_HEAD(&cset->cgrp_links);
- INIT_LIST_HEAD(&cset->tasks);
- INIT_LIST_HEAD(&cset->mg_tasks);
- INIT_LIST_HEAD(&cset->mg_preload_node);
- INIT_LIST_HEAD(&cset->mg_node);
- INIT_HLIST_NODE(&cset->hlist);
-
- /* Copy the set of subsystem state objects generated in
- * find_existing_css_set() */
- memcpy(cset->subsys, template, sizeof(cset->subsys));
-
- down_write(&css_set_rwsem);
- /* Add reference counts and links from the new css_set. */
- list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
- struct cgroup *c = link->cgrp;
-
- if (c->root == cgrp->root)
- c = cgrp;
- link_css_set(&tmp_links, cset, c);
- }
-
- BUG_ON(!list_empty(&tmp_links));
-
- css_set_count++;
-
- /* Add @cset to the hash table */
- key = css_set_hash(cset->subsys);
- hash_add(css_set_table, &cset->hlist, key);
-
- for_each_subsys(ss, ssid)
- list_add_tail(&cset->e_cset_node[ssid],
- &cset->subsys[ssid]->cgroup->e_csets[ssid]);
-
- up_write(&css_set_rwsem);
-
- return cset;
-}
-
-static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
-{
- struct cgroup *root_cgrp = kf_root->kn->priv;
-
- return root_cgrp->root;
-}
-
-static int cgroup_init_root_id(struct cgroup_root *root)
-{
- int id;
-
- lockdep_assert_held(&cgroup_mutex);
-
- id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL);
- if (id < 0)
- return id;
-
- root->hierarchy_id = id;
- return 0;
-}
-
-static void cgroup_exit_root_id(struct cgroup_root *root)
-{
- lockdep_assert_held(&cgroup_mutex);
-
- if (root->hierarchy_id) {
- idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
- root->hierarchy_id = 0;
- }
-}
-
-static void cgroup_free_root(struct cgroup_root *root)
-{
- if (root) {
- /* hierarhcy ID shoulid already have been released */
- WARN_ON_ONCE(root->hierarchy_id);
-
- idr_destroy(&root->cgroup_idr);
- kfree(root);
- }
-}
-
-static void cgroup_destroy_root(struct cgroup_root *root)
-{
- struct cgroup *cgrp = &root->cgrp;
- struct cgrp_cset_link *link, *tmp_link;
-
- mutex_lock(&cgroup_mutex);
-
- BUG_ON(atomic_read(&root->nr_cgrps));
- BUG_ON(!list_empty(&cgrp->self.children));
-
- /* Rebind all subsystems back to the default hierarchy */
- rebind_subsystems(&cgrp_dfl_root, root->subsys_mask);
-
- /*
- * Release all the links from cset_links to this hierarchy's
- * root cgroup
- */
- down_write(&css_set_rwsem);
-
- list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
- list_del(&link->cset_link);
- list_del(&link->cgrp_link);
- kfree(link);
- }
- up_write(&css_set_rwsem);
-
- if (!list_empty(&root->root_list)) {
- list_del(&root->root_list);
- cgroup_root_count--;
- }
-
- cgroup_exit_root_id(root);
-
- mutex_unlock(&cgroup_mutex);
-
- kernfs_destroy_root(root->kf_root);
- cgroup_free_root(root);
-}
-
-/* look up cgroup associated with given css_set on the specified hierarchy */
-static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
- struct cgroup_root *root)
-{
- struct cgroup *res = NULL;
-
- lockdep_assert_held(&cgroup_mutex);
- lockdep_assert_held(&css_set_rwsem);
-
- if (cset == &init_css_set) {
- res = &root->cgrp;
- } else {
- struct cgrp_cset_link *link;
-
- list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
- struct cgroup *c = link->cgrp;
-
- if (c->root == root) {
- res = c;
- break;
- }
- }
- }
-
- BUG_ON(!res);
- return res;
-}
-
-/*
- * Return the cgroup for "task" from the given hierarchy. Must be
- * called with cgroup_mutex and css_set_rwsem held.
- */
-static struct cgroup *task_cgroup_from_root(struct task_struct *task,
- struct cgroup_root *root)
-{
- /*
- * No need to lock the task - since we hold cgroup_mutex the
- * task can't change groups, so the only thing that can happen
- * is that it exits and its css is set back to init_css_set.
- */
- return cset_cgroup_from_root(task_css_set(task), root);
-}
-
-/*
- * A task must hold cgroup_mutex to modify cgroups.
- *
- * Any task can increment and decrement the count field without lock.
- * So in general, code holding cgroup_mutex can't rely on the count
- * field not changing. However, if the count goes to zero, then only
- * cgroup_attach_task() can increment it again. Because a count of zero
- * means that no tasks are currently attached, therefore there is no
- * way a task attached to that cgroup can fork (the other way to
- * increment the count). So code holding cgroup_mutex can safely
- * assume that if the count is zero, it will stay zero. Similarly, if
- * a task holds cgroup_mutex on a cgroup with zero count, it
- * knows that the cgroup won't be removed, as cgroup_rmdir()
- * needs that mutex.
- *
- * A cgroup can only be deleted if both its 'count' of using tasks
- * is zero, and its list of 'children' cgroups is empty. Since all
- * tasks in the system use _some_ cgroup, and since there is always at
- * least one task in the system (init, pid == 1), therefore, root cgroup
- * always has either children cgroups and/or using tasks. So we don't
- * need a special hack to ensure that root cgroup cannot be deleted.
- *
- * P.S. One more locking exception. RCU is used to guard the
- * update of a tasks cgroup pointer by cgroup_attach_task()
- */
-
-static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask);
-static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
-static const struct file_operations proc_cgroupstats_operations;
-
-static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
- char *buf)
-{
- if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
- !(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
- snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
- cft->ss->name, cft->name);
- else
- strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
- return buf;
-}
-
-/**
- * cgroup_file_mode - deduce file mode of a control file
- * @cft: the control file in question
- *
- * returns cft->mode if ->mode is not 0
- * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
- * returns S_IRUGO if it has only a read handler
- * returns S_IWUSR if it has only a write hander
- */
-static umode_t cgroup_file_mode(const struct cftype *cft)
-{
- umode_t mode = 0;
-
- if (cft->mode)
- return cft->mode;
-
- if (cft->read_u64 || cft->read_s64 || cft->seq_show)
- mode |= S_IRUGO;
-
- if (cft->write_u64 || cft->write_s64 || cft->write)
- mode |= S_IWUSR;
-
- return mode;
-}
-
-static void cgroup_get(struct cgroup *cgrp)
-{
- WARN_ON_ONCE(cgroup_is_dead(cgrp));
- css_get(&cgrp->self);
-}
-
-static bool cgroup_tryget(struct cgroup *cgrp)
-{
- return css_tryget(&cgrp->self);
-}
-
-static void cgroup_put(struct cgroup *cgrp)
-{
- css_put(&cgrp->self);
-}
-
-/**
- * cgroup_calc_child_subsys_mask - calculate child_subsys_mask
- * @cgrp: the target cgroup
- * @subtree_control: the new subtree_control mask to consider
- *
- * On the default hierarchy, a subsystem may request other subsystems to be
- * enabled together through its ->depends_on mask. In such cases, more
- * subsystems than specified in "cgroup.subtree_control" may be enabled.
- *
- * This function calculates which subsystems need to be enabled if
- * @subtree_control is to be applied to @cgrp. The returned mask is always
- * a superset of @subtree_control and follows the usual hierarchy rules.
- */
-static unsigned int cgroup_calc_child_subsys_mask(struct cgroup *cgrp,
- unsigned int subtree_control)
-{
- struct cgroup *parent = cgroup_parent(cgrp);
- unsigned int cur_ss_mask = subtree_control;
- struct cgroup_subsys *ss;
- int ssid;
-
- lockdep_assert_held(&cgroup_mutex);
-
- if (!cgroup_on_dfl(cgrp))
- return cur_ss_mask;
-
- while (true) {
- unsigned int new_ss_mask = cur_ss_mask;
-
- for_each_subsys(ss, ssid)
- if (cur_ss_mask & (1 << ssid))
- new_ss_mask |= ss->depends_on;
-
- /*
- * Mask out subsystems which aren't available. This can
- * happen only if some depended-upon subsystems were bound
- * to non-default hierarchies.
- */
- if (parent)
- new_ss_mask &= parent->child_subsys_mask;
- else
- new_ss_mask &= cgrp->root->subsys_mask;
-
- if (new_ss_mask == cur_ss_mask)
- break;
- cur_ss_mask = new_ss_mask;
- }
-
- return cur_ss_mask;
-}
-
-/**
- * cgroup_refresh_child_subsys_mask - update child_subsys_mask
- * @cgrp: the target cgroup
- *
- * Update @cgrp->child_subsys_mask according to the current
- * @cgrp->subtree_control using cgroup_calc_child_subsys_mask().
- */
-static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp)
-{
- cgrp->child_subsys_mask =
- cgroup_calc_child_subsys_mask(cgrp, cgrp->subtree_control);
-}
-
-/**
- * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
- * @kn: the kernfs_node being serviced
- *
- * This helper undoes cgroup_kn_lock_live() and should be invoked before
- * the method finishes if locking succeeded. Note that once this function
- * returns the cgroup returned by cgroup_kn_lock_live() may become
- * inaccessible any time. If the caller intends to continue to access the
- * cgroup, it should pin it before invoking this function.
- */
-static void cgroup_kn_unlock(struct kernfs_node *kn)
-{
- struct cgroup *cgrp;
-
- if (kernfs_type(kn) == KERNFS_DIR)
- cgrp = kn->priv;
- else
- cgrp = kn->parent->priv;
-
- mutex_unlock(&cgroup_mutex);
-
- kernfs_unbreak_active_protection(kn);
- cgroup_put(cgrp);
-}
-
-/**
- * cgroup_kn_lock_live - locking helper for cgroup kernfs methods
- * @kn: the kernfs_node being serviced
- *
- * This helper is to be used by a cgroup kernfs method currently servicing
- * @kn. It breaks the active protection, performs cgroup locking and
- * verifies that the associated cgroup is alive. Returns the cgroup if
- * alive; otherwise, %NULL. A successful return should be undone by a
- * matching cgroup_kn_unlock() invocation.
- *
- * Any cgroup kernfs method implementation which requires locking the
- * associated cgroup should use this helper. It avoids nesting cgroup
- * locking under kernfs active protection and allows all kernfs operations
- * including self-removal.
- */
-static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn)
-{
- struct cgroup *cgrp;
-
- if (kernfs_type(kn) == KERNFS_DIR)
- cgrp = kn->priv;
- else
- cgrp = kn->parent->priv;
-
- /*
- * We're gonna grab cgroup_mutex which nests outside kernfs
- * active_ref. cgroup liveliness check alone provides enough
- * protection against removal. Ensure @cgrp stays accessible and
- * break the active_ref protection.
- */
- if (!cgroup_tryget(cgrp))
- return NULL;
- kernfs_break_active_protection(kn);
-
- mutex_lock(&cgroup_mutex);
-
- if (!cgroup_is_dead(cgrp))
- return cgrp;
-
- cgroup_kn_unlock(kn);
- return NULL;
-}
-
-static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
-{
- char name[CGROUP_FILE_NAME_MAX];
-
- lockdep_assert_held(&cgroup_mutex);
- kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
-}
-
-/**
- * cgroup_clear_dir - remove subsys files in a cgroup directory
- * @cgrp: target cgroup
- * @subsys_mask: mask of the subsystem ids whose files should be removed
- */
-static void cgroup_clear_dir(struct cgroup *cgrp, unsigned int subsys_mask)
-{
- struct cgroup_subsys *ss;
- int i;
-
- for_each_subsys(ss, i) {
- struct cftype *cfts;
-
- if (!(subsys_mask & (1 << i)))
- continue;
- list_for_each_entry(cfts, &ss->cfts, node)
- cgroup_addrm_files(cgrp, cfts, false);
- }
-}
-
-static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask)
-{
- struct cgroup_subsys *ss;
- unsigned int tmp_ss_mask;
- int ssid, i, ret;
-
- lockdep_assert_held(&cgroup_mutex);
-
- for_each_subsys(ss, ssid) {
- if (!(ss_mask & (1 << ssid)))
- continue;
-
- /* if @ss has non-root csses attached to it, can't move */
- if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)))
- return -EBUSY;
-
- /* can't move between two non-dummy roots either */
- if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
- return -EBUSY;
- }
-
- /* skip creating root files on dfl_root for inhibited subsystems */
- tmp_ss_mask = ss_mask;
- if (dst_root == &cgrp_dfl_root)
- tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask;
-
- ret = cgroup_populate_dir(&dst_root->cgrp, tmp_ss_mask);
- if (ret) {
- if (dst_root != &cgrp_dfl_root)
- return ret;
-
- /*
- * Rebinding back to the default root is not allowed to
- * fail. Using both default and non-default roots should
- * be rare. Moving subsystems back and forth even more so.
- * Just warn about it and continue.
- */
- if (cgrp_dfl_root_visible) {
- pr_warn("failed to create files (%d) while rebinding 0x%x to default root\n",
- ret, ss_mask);
- pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");
- }
- }
-
- /*
- * Nothing can fail from this point on. Remove files for the
- * removed subsystems and rebind each subsystem.
- */
- for_each_subsys(ss, ssid)
- if (ss_mask & (1 << ssid))
- cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
-
- for_each_subsys(ss, ssid) {
- struct cgroup_root *src_root;
- struct cgroup_subsys_state *css;
- struct css_set *cset;
-
- if (!(ss_mask & (1 << ssid)))
- continue;
-
- src_root = ss->root;
- css = cgroup_css(&src_root->cgrp, ss);
-
- WARN_ON(!css || cgroup_css(&dst_root->cgrp, ss));
-
- RCU_INIT_POINTER(src_root->cgrp.subsys[ssid], NULL);
- rcu_assign_pointer(dst_root->cgrp.subsys[ssid], css);
- ss->root = dst_root;
- css->cgroup = &dst_root->cgrp;
-
- down_write(&css_set_rwsem);
- hash_for_each(css_set_table, i, cset, hlist)
- list_move_tail(&cset->e_cset_node[ss->id],
- &dst_root->cgrp.e_csets[ss->id]);
- up_write(&css_set_rwsem);
-
- src_root->subsys_mask &= ~(1 << ssid);
- src_root->cgrp.subtree_control &= ~(1 << ssid);
- cgroup_refresh_child_subsys_mask(&src_root->cgrp);
-
- /* default hierarchy doesn't enable controllers by default */
- dst_root->subsys_mask |= 1 << ssid;
- if (dst_root != &cgrp_dfl_root) {
- dst_root->cgrp.subtree_control |= 1 << ssid;
- cgroup_refresh_child_subsys_mask(&dst_root->cgrp);
- }
-
- if (ss->bind)
- ss->bind(css);
- }
-
- kernfs_activate(dst_root->cgrp.kn);
- return 0;
-}
-
-static int cgroup_show_options(struct seq_file *seq,
- struct kernfs_root *kf_root)
-{
- struct cgroup_root *root = cgroup_root_from_kf(kf_root);
- struct cgroup_subsys *ss;
- int ssid;
-
- for_each_subsys(ss, ssid)
- if (root->subsys_mask & (1 << ssid))
- seq_printf(seq, ",%s", ss->name);
- if (root->flags & CGRP_ROOT_NOPREFIX)
- seq_puts(seq, ",noprefix");
- if (root->flags & CGRP_ROOT_XATTR)
- seq_puts(seq, ",xattr");
-
- spin_lock(&release_agent_path_lock);
- if (strlen(root->release_agent_path))
- seq_printf(seq, ",release_agent=%s", root->release_agent_path);
- spin_unlock(&release_agent_path_lock);
-
- if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
- seq_puts(seq, ",clone_children");
- if (strlen(root->name))
- seq_printf(seq, ",name=%s", root->name);
- return 0;
-}
-
-struct cgroup_sb_opts {
- unsigned int subsys_mask;
- unsigned int flags;
- char *release_agent;
- bool cpuset_clone_children;
- char *name;
- /* User explicitly requested empty subsystem */
- bool none;
-};
-
-static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
-{
- char *token, *o = data;
- bool all_ss = false, one_ss = false;
- unsigned int mask = -1U;
- struct cgroup_subsys *ss;
- int nr_opts = 0;
- int i;
-
-#ifdef CONFIG_CPUSETS
- mask = ~(1U << cpuset_cgrp_id);
-#endif
-
- memset(opts, 0, sizeof(*opts));
-
- while ((token = strsep(&o, ",")) != NULL) {
- nr_opts++;
-
- if (!*token)
- return -EINVAL;
- if (!strcmp(token, "none")) {
- /* Explicitly have no subsystems */
- opts->none = true;
- continue;
- }
- if (!strcmp(token, "all")) {
- /* Mutually exclusive option 'all' + subsystem name */
- if (one_ss)
- return -EINVAL;
- all_ss = true;
- continue;
- }
- if (!strcmp(token, "__DEVEL__sane_behavior")) {
- opts->flags |= CGRP_ROOT_SANE_BEHAVIOR;
- continue;
- }
- if (!strcmp(token, "noprefix")) {
- opts->flags |= CGRP_ROOT_NOPREFIX;
- continue;
- }
- if (!strcmp(token, "clone_children")) {
- opts->cpuset_clone_children = true;
- continue;
- }
- if (!strcmp(token, "xattr")) {
- opts->flags |= CGRP_ROOT_XATTR;
- continue;
- }
- if (!strncmp(token, "release_agent=", 14)) {
- /* Specifying two release agents is forbidden */
- if (opts->release_agent)
- return -EINVAL;
- opts->release_agent =
- kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
- if (!opts->release_agent)
- return -ENOMEM;
- continue;
- }
- if (!strncmp(token, "name=", 5)) {
- const char *name = token + 5;
- /* Can't specify an empty name */
- if (!strlen(name))
- return -EINVAL;
- /* Must match [\w.-]+ */
- for (i = 0; i < strlen(name); i++) {
- char c = name[i];
- if (isalnum(c))
- continue;
- if ((c == '.') || (c == '-') || (c == '_'))
- continue;
- return -EINVAL;
- }
- /* Specifying two names is forbidden */
- if (opts->name)
- return -EINVAL;
- opts->name = kstrndup(name,
- MAX_CGROUP_ROOT_NAMELEN - 1,
- GFP_KERNEL);
- if (!opts->name)
- return -ENOMEM;
-
- continue;
- }
-
- for_each_subsys(ss, i) {
- if (strcmp(token, ss->name))
- continue;
- if (ss->disabled)
- continue;
-
- /* Mutually exclusive option 'all' + subsystem name */
- if (all_ss)
- return -EINVAL;
- opts->subsys_mask |= (1 << i);
- one_ss = true;
-
- break;
- }
- if (i == CGROUP_SUBSYS_COUNT)
- return -ENOENT;
- }
-
- if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
- pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
- if (nr_opts != 1) {
- pr_err("sane_behavior: no other mount options allowed\n");
- return -EINVAL;
- }
- return 0;
- }
-
- /*
- * If the 'all' option was specified select all the subsystems,
- * otherwise if 'none', 'name=' and a subsystem name options were
- * not specified, let's default to 'all'
- */
- if (all_ss || (!one_ss && !opts->none && !opts->name))
- for_each_subsys(ss, i)
- if (!ss->disabled)
- opts->subsys_mask |= (1 << i);
-
- /*
- * We either have to specify by name or by subsystems. (So all
- * empty hierarchies must have a name).
- */
- if (!opts->subsys_mask && !opts->name)
- return -EINVAL;
-
- /*
- * Option noprefix was introduced just for backward compatibility
- * with the old cpuset, so we allow noprefix only if mounting just
- * the cpuset subsystem.
- */
- if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
- return -EINVAL;
-
- /* Can't specify "none" and some subsystems */
- if (opts->subsys_mask && opts->none)
- return -EINVAL;
-
- return 0;
-}
-
-static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
-{
- int ret = 0;
- struct cgroup_root *root = cgroup_root_from_kf(kf_root);
- struct cgroup_sb_opts opts;
- unsigned int added_mask, removed_mask;
-
- if (root == &cgrp_dfl_root) {
- pr_err("remount is not allowed\n");
- return -EINVAL;
- }
-
- mutex_lock(&cgroup_mutex);
-
- /* See what subsystems are wanted */
- ret = parse_cgroupfs_options(data, &opts);
- if (ret)
- goto out_unlock;
-
- if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
- pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
- task_tgid_nr(current), current->comm);
-
- added_mask = opts.subsys_mask & ~root->subsys_mask;
- removed_mask = root->subsys_mask & ~opts.subsys_mask;
-
- /* Don't allow flags or name to change at remount */
- if ((opts.flags ^ root->flags) ||
- (opts.name && strcmp(opts.name, root->name))) {
- pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
- opts.flags, opts.name ?: "", root->flags, root->name);
- ret = -EINVAL;
- goto out_unlock;
- }
-
- /* remounting is not allowed for populated hierarchies */
- if (!list_empty(&root->cgrp.self.children)) {
- ret = -EBUSY;
- goto out_unlock;
- }
-
- ret = rebind_subsystems(root, added_mask);
- if (ret)
- goto out_unlock;
-
- rebind_subsystems(&cgrp_dfl_root, removed_mask);
-
- if (opts.release_agent) {
- spin_lock(&release_agent_path_lock);
- strcpy(root->release_agent_path, opts.release_agent);
- spin_unlock(&release_agent_path_lock);
- }
- out_unlock:
- kfree(opts.release_agent);
- kfree(opts.name);
- mutex_unlock(&cgroup_mutex);
- return ret;
-}
-
-/*
- * To reduce the fork() overhead for systems that are not actually using
- * their cgroups capability, we don't maintain the lists running through
- * each css_set to its tasks until we see the list actually used - in other
- * words after the first mount.
- */
-static bool use_task_css_set_links __read_mostly;
-
-static void cgroup_enable_task_cg_lists(void)
-{
- struct task_struct *p, *g;
-
- down_write(&css_set_rwsem);
-
- if (use_task_css_set_links)
- goto out_unlock;
-
- use_task_css_set_links = true;
-
- /*
- * We need tasklist_lock because RCU is not safe against
- * while_each_thread(). Besides, a forking task that has passed
- * cgroup_post_fork() without seeing use_task_css_set_links = 1
- * is not guaranteed to have its child immediately visible in the
- * tasklist if we walk through it with RCU.
- */
- read_lock(&tasklist_lock);
- do_each_thread(g, p) {
- WARN_ON_ONCE(!list_empty(&p->cg_list) ||
- task_css_set(p) != &init_css_set);
-
- /*
- * We should check if the process is exiting, otherwise
- * it will race with cgroup_exit() in that the list
- * entry won't be deleted though the process has exited.
- * Do it while holding siglock so that we don't end up
- * racing against cgroup_exit().
- */
- spin_lock_irq(&p->sighand->siglock);
- if (!(p->flags & PF_EXITING)) {
- struct css_set *cset = task_css_set(p);
-
- list_add(&p->cg_list, &cset->tasks);
- get_css_set(cset);
- }
- spin_unlock_irq(&p->sighand->siglock);
- } while_each_thread(g, p);
- read_unlock(&tasklist_lock);
-out_unlock:
- up_write(&css_set_rwsem);
-}
-
-static void init_cgroup_housekeeping(struct cgroup *cgrp)
-{
- struct cgroup_subsys *ss;
- int ssid;
-
- INIT_LIST_HEAD(&cgrp->self.sibling);
- INIT_LIST_HEAD(&cgrp->self.children);
- INIT_LIST_HEAD(&cgrp->cset_links);
- INIT_LIST_HEAD(&cgrp->pidlists);
- mutex_init(&cgrp->pidlist_mutex);
- cgrp->self.cgroup = cgrp;
- cgrp->self.flags |= CSS_ONLINE;
-
- for_each_subsys(ss, ssid)
- INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
-
- init_waitqueue_head(&cgrp->offline_waitq);
- INIT_WORK(&cgrp->release_agent_work, cgroup_release_agent);
-}
-
-static void init_cgroup_root(struct cgroup_root *root,
- struct cgroup_sb_opts *opts)
-{
- struct cgroup *cgrp = &root->cgrp;
-
- INIT_LIST_HEAD(&root->root_list);
- atomic_set(&root->nr_cgrps, 1);
- cgrp->root = root;
- init_cgroup_housekeeping(cgrp);
- idr_init(&root->cgroup_idr);
-
- root->flags = opts->flags;
- if (opts->release_agent)
- strcpy(root->release_agent_path, opts->release_agent);
- if (opts->name)
- strcpy(root->name, opts->name);
- if (opts->cpuset_clone_children)
- set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
-}
-
-static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
-{
- LIST_HEAD(tmp_links);
- struct cgroup *root_cgrp = &root->cgrp;
- struct cftype *base_files;
- struct css_set *cset;
- int i, ret;
-
- lockdep_assert_held(&cgroup_mutex);
-
- ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_NOWAIT);
- if (ret < 0)
- goto out;
- root_cgrp->id = ret;
-
- ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0,
- GFP_KERNEL);
- if (ret)
- goto out;
-
- /*
- * We're accessing css_set_count without locking css_set_rwsem here,
- * but that's OK - it can only be increased by someone holding
- * cgroup_lock, and that's us. The worst that can happen is that we
- * have some link structures left over
- */
- ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
- if (ret)
- goto cancel_ref;
-
- ret = cgroup_init_root_id(root);
- if (ret)
- goto cancel_ref;
-
- root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops,
- KERNFS_ROOT_CREATE_DEACTIVATED,
- root_cgrp);
- if (IS_ERR(root->kf_root)) {
- ret = PTR_ERR(root->kf_root);
- goto exit_root_id;
- }
- root_cgrp->kn = root->kf_root->kn;
-
- if (root == &cgrp_dfl_root)
- base_files = cgroup_dfl_base_files;
- else
- base_files = cgroup_legacy_base_files;
-
- ret = cgroup_addrm_files(root_cgrp, base_files, true);
- if (ret)
- goto destroy_root;
-
- ret = rebind_subsystems(root, ss_mask);
- if (ret)
- goto destroy_root;
-
- /*
- * There must be no failure case after here, since rebinding takes
- * care of subsystems' refcounts, which are explicitly dropped in
- * the failure exit path.
- */
- list_add(&root->root_list, &cgroup_roots);
- cgroup_root_count++;
-
- /*
- * Link the root cgroup in this hierarchy into all the css_set
- * objects.
- */
- down_write(&css_set_rwsem);
- hash_for_each(css_set_table, i, cset, hlist)
- link_css_set(&tmp_links, cset, root_cgrp);
- up_write(&css_set_rwsem);
-
- BUG_ON(!list_empty(&root_cgrp->self.children));
- BUG_ON(atomic_read(&root->nr_cgrps) != 1);
-
- kernfs_activate(root_cgrp->kn);
- ret = 0;
- goto out;
-
-destroy_root:
- kernfs_destroy_root(root->kf_root);
- root->kf_root = NULL;
-exit_root_id:
- cgroup_exit_root_id(root);
-cancel_ref:
- percpu_ref_exit(&root_cgrp->self.refcnt);
-out:
- free_cgrp_cset_links(&tmp_links);
- return ret;
-}
-
-static struct dentry *cgroup_mount(struct file_system_type *fs_type,
- int flags, const char *unused_dev_name,
- void *data)
-{
- struct super_block *pinned_sb = NULL;
- struct cgroup_subsys *ss;
- struct cgroup_root *root;
- struct cgroup_sb_opts opts;
- struct dentry *dentry;
- int ret;
- int i;
- bool new_sb;
-
- /*
- * The first time anyone tries to mount a cgroup, enable the list
- * linking each css_set to its tasks and fix up all existing tasks.
- */
- if (!use_task_css_set_links)
- cgroup_enable_task_cg_lists();
-
- mutex_lock(&cgroup_mutex);
-
- /* First find the desired set of subsystems */
- ret = parse_cgroupfs_options(data, &opts);
- if (ret)
- goto out_unlock;
-
- /* look for a matching existing root */
- if (opts.flags & CGRP_ROOT_SANE_BEHAVIOR) {
- cgrp_dfl_root_visible = true;
- root = &cgrp_dfl_root;
- cgroup_get(&root->cgrp);
- ret = 0;
- goto out_unlock;
- }
-
- /*
- * Destruction of cgroup root is asynchronous, so subsystems may
- * still be dying after the previous unmount. Let's drain the
- * dying subsystems. We just need to ensure that the ones
- * unmounted previously finish dying and don't care about new ones
- * starting. Testing ref liveliness is good enough.
- */
- for_each_subsys(ss, i) {
- if (!(opts.subsys_mask & (1 << i)) ||
- ss->root == &cgrp_dfl_root)
- continue;
-
- if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
- mutex_unlock(&cgroup_mutex);
- msleep(10);
- ret = restart_syscall();
- goto out_free;
- }
- cgroup_put(&ss->root->cgrp);
- }
-
- for_each_root(root) {
- bool name_match = false;
-
- if (root == &cgrp_dfl_root)
- continue;
-
- /*
- * If we asked for a name then it must match. Also, if
- * name matches but sybsys_mask doesn't, we should fail.
- * Remember whether name matched.
- */
- if (opts.name) {
- if (strcmp(opts.name, root->name))
- continue;
- name_match = true;
- }
-
- /*
- * If we asked for subsystems (or explicitly for no
- * subsystems) then they must match.
- */
- if ((opts.subsys_mask || opts.none) &&
- (opts.subsys_mask != root->subsys_mask)) {
- if (!name_match)
- continue;
- ret = -EBUSY;
- goto out_unlock;
- }
-
- if (root->flags ^ opts.flags)
- pr_warn("new mount options do not match the existing superblock, will be ignored\n");
-
- /*
- * We want to reuse @root whose lifetime is governed by its
- * ->cgrp. Let's check whether @root is alive and keep it
- * that way. As cgroup_kill_sb() can happen anytime, we
- * want to block it by pinning the sb so that @root doesn't
- * get killed before mount is complete.
- *
- * With the sb pinned, tryget_live can reliably indicate
- * whether @root can be reused. If it's being killed,
- * drain it. We can use wait_queue for the wait but this
- * path is super cold. Let's just sleep a bit and retry.
- */
- pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
- if (IS_ERR(pinned_sb) ||
- !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
- mutex_unlock(&cgroup_mutex);
- if (!IS_ERR_OR_NULL(pinned_sb))
- deactivate_super(pinned_sb);
- msleep(10);
- ret = restart_syscall();
- goto out_free;
- }
-
- ret = 0;
- goto out_unlock;
- }
-
- /*
- * No such thing, create a new one. name= matching without subsys
- * specification is allowed for already existing hierarchies but we
- * can't create new one without subsys specification.
- */
- if (!opts.subsys_mask && !opts.none) {
- ret = -EINVAL;
- goto out_unlock;
- }
-
- root = kzalloc(sizeof(*root), GFP_KERNEL);
- if (!root) {
- ret = -ENOMEM;
- goto out_unlock;
- }
-
- init_cgroup_root(root, &opts);
-
- ret = cgroup_setup_root(root, opts.subsys_mask);
- if (ret)
- cgroup_free_root(root);
-
-out_unlock:
- mutex_unlock(&cgroup_mutex);
-out_free:
- kfree(opts.release_agent);
- kfree(opts.name);
-
- if (ret)
- return ERR_PTR(ret);
-
- dentry = kernfs_mount(fs_type, flags, root->kf_root,
- CGROUP_SUPER_MAGIC, &new_sb);
- if (IS_ERR(dentry) || !new_sb)
- cgroup_put(&root->cgrp);
-
- /*
- * If @pinned_sb, we're reusing an existing root and holding an
- * extra ref on its sb. Mount is complete. Put the extra ref.
- */
- if (pinned_sb) {
- WARN_ON(new_sb);
- deactivate_super(pinned_sb);
- }
-
- return dentry;
-}
-
-static void cgroup_kill_sb(struct super_block *sb)
-{
- struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
- struct cgroup_root *root = cgroup_root_from_kf(kf_root);
-
- /*
- * If @root doesn't have any mounts or children, start killing it.
- * This prevents new mounts by disabling percpu_ref_tryget_live().
- * cgroup_mount() may wait for @root's release.
- *
- * And don't kill the default root.
- */
- if (!list_empty(&root->cgrp.self.children) ||
- root == &cgrp_dfl_root)
- cgroup_put(&root->cgrp);
- else
- percpu_ref_kill(&root->cgrp.self.refcnt);
-
- kernfs_kill_sb(sb);
-}
-
-static struct file_system_type cgroup_fs_type = {
- .name = "cgroup",
- .mount = cgroup_mount,
- .kill_sb = cgroup_kill_sb,
-};
-
-static struct kobject *cgroup_kobj;
-
-/**
- * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
- * @task: target task
- * @buf: the buffer to write the path into
- * @buflen: the length of the buffer
- *
- * Determine @task's cgroup on the first (the one with the lowest non-zero
- * hierarchy_id) cgroup hierarchy and copy its path into @buf. This
- * function grabs cgroup_mutex and shouldn't be used inside locks used by
- * cgroup controller callbacks.
- *
- * Return value is the same as kernfs_path().
- */
-char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
-{
- struct cgroup_root *root;
- struct cgroup *cgrp;
- int hierarchy_id = 1;
- char *path = NULL;
-
- mutex_lock(&cgroup_mutex);
- down_read(&css_set_rwsem);
-
- root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
-
- if (root) {
- cgrp = task_cgroup_from_root(task, root);
- path = cgroup_path(cgrp, buf, buflen);
- } else {
- /* if no hierarchy exists, everyone is in "/" */
- if (strlcpy(buf, "/", buflen) < buflen)
- path = buf;
- }
-
- up_read(&css_set_rwsem);
- mutex_unlock(&cgroup_mutex);
- return path;
-}
-EXPORT_SYMBOL_GPL(task_cgroup_path);
-
-/* used to track tasks and other necessary states during migration */
-struct cgroup_taskset {
- /* the src and dst cset list running through cset->mg_node */
- struct list_head src_csets;
- struct list_head dst_csets;
-
- /*
- * Fields for cgroup_taskset_*() iteration.
- *
- * Before migration is committed, the target migration tasks are on
- * ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of
- * the csets on ->dst_csets. ->csets point to either ->src_csets
- * or ->dst_csets depending on whether migration is committed.
- *
- * ->cur_csets and ->cur_task point to the current task position
- * during iteration.
- */
- struct list_head *csets;
- struct css_set *cur_cset;
- struct task_struct *cur_task;
-};
-
-/**
- * cgroup_taskset_first - reset taskset and return the first task
- * @tset: taskset of interest
- *
- * @tset iteration is initialized and the first task is returned.
- */
-struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)
-{
- tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);
- tset->cur_task = NULL;
-
- return cgroup_taskset_next(tset);
-}
-
-/**
- * cgroup_taskset_next - iterate to the next task in taskset
- * @tset: taskset of interest
- *
- * Return the next task in @tset. Iteration must have been initialized
- * with cgroup_taskset_first().
- */
-struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
-{
- struct css_set *cset = tset->cur_cset;
- struct task_struct *task = tset->cur_task;
-
- while (&cset->mg_node != tset->csets) {
- if (!task)
- task = list_first_entry(&cset->mg_tasks,
- struct task_struct, cg_list);
- else
- task = list_next_entry(task, cg_list);
-
- if (&task->cg_list != &cset->mg_tasks) {
- tset->cur_cset = cset;
- tset->cur_task = task;
- return task;
- }
-
- cset = list_next_entry(cset, mg_node);
- task = NULL;
- }
-
- return NULL;
-}
-
-/**
- * cgroup_task_migrate - move a task from one cgroup to another.
- * @old_cgrp: the cgroup @tsk is being migrated from
- * @tsk: the task being migrated
- * @new_cset: the new css_set @tsk is being attached to
- *
- * Must be called with cgroup_mutex, threadgroup and css_set_rwsem locked.
- */
-static void cgroup_task_migrate(struct cgroup *old_cgrp,
- struct task_struct *tsk,
- struct css_set *new_cset)
-{
- struct css_set *old_cset;
-
- lockdep_assert_held(&cgroup_mutex);
- lockdep_assert_held(&css_set_rwsem);
-
- /*
- * We are synchronized through threadgroup_lock() against PF_EXITING
- * setting such that we can't race against cgroup_exit() changing the
- * css_set to init_css_set and dropping the old one.
- */
- WARN_ON_ONCE(tsk->flags & PF_EXITING);
- old_cset = task_css_set(tsk);
-
- get_css_set(new_cset);
- rcu_assign_pointer(tsk->cgroups, new_cset);
-
- /*
- * Use move_tail so that cgroup_taskset_first() still returns the
- * leader after migration. This works because cgroup_migrate()
- * ensures that the dst_cset of the leader is the first on the
- * tset's dst_csets list.
- */
- list_move_tail(&tsk->cg_list, &new_cset->mg_tasks);
-
- /*
- * We just gained a reference on old_cset by taking it from the
- * task. As trading it for new_cset is protected by cgroup_mutex,
- * we're safe to drop it here; it will be freed under RCU.
- */
- put_css_set_locked(old_cset);
-}
-
-/**
- * cgroup_migrate_finish - cleanup after attach
- * @preloaded_csets: list of preloaded css_sets
- *
- * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See
- * those functions for details.
- */
-static void cgroup_migrate_finish(struct list_head *preloaded_csets)
-{
- struct css_set *cset, *tmp_cset;
-
- lockdep_assert_held(&cgroup_mutex);
-
- down_write(&css_set_rwsem);
- list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
- cset->mg_src_cgrp = NULL;
- cset->mg_dst_cset = NULL;
- list_del_init(&cset->mg_preload_node);
- put_css_set_locked(cset);
- }
- up_write(&css_set_rwsem);
-}
-
-/**
- * cgroup_migrate_add_src - add a migration source css_set
- * @src_cset: the source css_set to add
- * @dst_cgrp: the destination cgroup
- * @preloaded_csets: list of preloaded css_sets
- *
- * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp. Pin
- * @src_cset and add it to @preloaded_csets, which should later be cleaned
- * up by cgroup_migrate_finish().
- *
- * This function may be called without holding threadgroup_lock even if the
- * target is a process. Threads may be created and destroyed but as long
- * as cgroup_mutex is not dropped, no new css_set can be put into play and
- * the preloaded css_sets are guaranteed to cover all migrations.
- */
-static void cgroup_migrate_add_src(struct css_set *src_cset,
- struct cgroup *dst_cgrp,
- struct list_head *preloaded_csets)
-{
- struct cgroup *src_cgrp;
-
- lockdep_assert_held(&cgroup_mutex);
- lockdep_assert_held(&css_set_rwsem);
-
- src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
-
- if (!list_empty(&src_cset->mg_preload_node))
- return;
-
- WARN_ON(src_cset->mg_src_cgrp);
- WARN_ON(!list_empty(&src_cset->mg_tasks));
- WARN_ON(!list_empty(&src_cset->mg_node));
-
- src_cset->mg_src_cgrp = src_cgrp;
- get_css_set(src_cset);
- list_add(&src_cset->mg_preload_node, preloaded_csets);
-}
-
-/**
- * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
- * @dst_cgrp: the destination cgroup (may be %NULL)
- * @preloaded_csets: list of preloaded source css_sets
- *
- * Tasks are about to be moved to @dst_cgrp and all the source css_sets
- * have been preloaded to @preloaded_csets. This function looks up and
- * pins all destination css_sets, links each to its source, and append them
- * to @preloaded_csets. If @dst_cgrp is %NULL, the destination of each
- * source css_set is assumed to be its cgroup on the default hierarchy.
- *
- * This function must be called after cgroup_migrate_add_src() has been
- * called on each migration source css_set. After migration is performed
- * using cgroup_migrate(), cgroup_migrate_finish() must be called on
- * @preloaded_csets.
- */
-static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
- struct list_head *preloaded_csets)
-{
- LIST_HEAD(csets);
- struct css_set *src_cset, *tmp_cset;
-
- lockdep_assert_held(&cgroup_mutex);
-
- /*
- * Except for the root, child_subsys_mask must be zero for a cgroup
- * with tasks so that child cgroups don't compete against tasks.
- */
- if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && cgroup_parent(dst_cgrp) &&
- dst_cgrp->child_subsys_mask)
- return -EBUSY;
-
- /* look up the dst cset for each src cset and link it to src */
- list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) {
- struct css_set *dst_cset;
-
- dst_cset = find_css_set(src_cset,
- dst_cgrp ?: src_cset->dfl_cgrp);
- if (!dst_cset)
- goto err;
-
- WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
-
- /*
- * If src cset equals dst, it's noop. Drop the src.
- * cgroup_migrate() will skip the cset too. Note that we
- * can't handle src == dst as some nodes are used by both.
- */
- if (src_cset == dst_cset) {
- src_cset->mg_src_cgrp = NULL;
- list_del_init(&src_cset->mg_preload_node);
- put_css_set(src_cset);
- put_css_set(dst_cset);
- continue;
- }
-
- src_cset->mg_dst_cset = dst_cset;
-
- if (list_empty(&dst_cset->mg_preload_node))
- list_add(&dst_cset->mg_preload_node, &csets);
- else
- put_css_set(dst_cset);
- }
-
- list_splice_tail(&csets, preloaded_csets);
- return 0;
-err:
- cgroup_migrate_finish(&csets);
- return -ENOMEM;
-}
-
-/**
- * cgroup_migrate - migrate a process or task to a cgroup
- * @cgrp: the destination cgroup
- * @leader: the leader of the process or the task to migrate
- * @threadgroup: whether @leader points to the whole process or a single task
- *
- * Migrate a process or task denoted by @leader to @cgrp. If migrating a
- * process, the caller must be holding threadgroup_lock of @leader. The
- * caller is also responsible for invoking cgroup_migrate_add_src() and
- * cgroup_migrate_prepare_dst() on the targets before invoking this
- * function and following up with cgroup_migrate_finish().
- *
- * As long as a controller's ->can_attach() doesn't fail, this function is
- * guaranteed to succeed. This means that, excluding ->can_attach()
- * failure, when migrating multiple targets, the success or failure can be
- * decided for all targets by invoking group_migrate_prepare_dst() before
- * actually starting migrating.
- */
-static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
- bool threadgroup)
-{
- struct cgroup_taskset tset = {
- .src_csets = LIST_HEAD_INIT(tset.src_csets),
- .dst_csets = LIST_HEAD_INIT(tset.dst_csets),
- .csets = &tset.src_csets,
- };
- struct cgroup_subsys_state *css, *failed_css = NULL;
- struct css_set *cset, *tmp_cset;
- struct task_struct *task, *tmp_task;
- int i, ret;
-
- /*
- * Prevent freeing of tasks while we take a snapshot. Tasks that are
- * already PF_EXITING could be freed from underneath us unless we
- * take an rcu_read_lock.
- */
- down_write(&css_set_rwsem);
- rcu_read_lock();
- task = leader;
- do {
- /* @task either already exited or can't exit until the end */
- if (task->flags & PF_EXITING)
- goto next;
-
- /* leave @task alone if post_fork() hasn't linked it yet */
- if (list_empty(&task->cg_list))
- goto next;
-
- cset = task_css_set(task);
- if (!cset->mg_src_cgrp)
- goto next;
-
- /*
- * cgroup_taskset_first() must always return the leader.
- * Take care to avoid disturbing the ordering.
- */
- list_move_tail(&task->cg_list, &cset->mg_tasks);
- if (list_empty(&cset->mg_node))
- list_add_tail(&cset->mg_node, &tset.src_csets);
- if (list_empty(&cset->mg_dst_cset->mg_node))
- list_move_tail(&cset->mg_dst_cset->mg_node,
- &tset.dst_csets);
- next:
- if (!threadgroup)
- break;
- } while_each_thread(leader, task);
- rcu_read_unlock();
- up_write(&css_set_rwsem);
-
- /* methods shouldn't be called if no task is actually migrating */
- if (list_empty(&tset.src_csets))
- return 0;
-
- /* check that we can legitimately attach to the cgroup */
- for_each_e_css(css, i, cgrp) {
- if (css->ss->can_attach) {
- ret = css->ss->can_attach(css, &tset);
- if (ret) {
- failed_css = css;
- goto out_cancel_attach;
- }
- }
- }
-
- /*
- * Now that we're guaranteed success, proceed to move all tasks to
- * the new cgroup. There are no failure cases after here, so this
- * is the commit point.
- */
- down_write(&css_set_rwsem);
- list_for_each_entry(cset, &tset.src_csets, mg_node) {
- list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list)
- cgroup_task_migrate(cset->mg_src_cgrp, task,
- cset->mg_dst_cset);
- }
- up_write(&css_set_rwsem);
-
- /*
- * Migration is committed, all target tasks are now on dst_csets.
- * Nothing is sensitive to fork() after this point. Notify
- * controllers that migration is complete.
- */
- tset.csets = &tset.dst_csets;
-
- for_each_e_css(css, i, cgrp)
- if (css->ss->attach)
- css->ss->attach(css, &tset);
-
- ret = 0;
- goto out_release_tset;
-
-out_cancel_attach:
- for_each_e_css(css, i, cgrp) {
- if (css == failed_css)
- break;
- if (css->ss->cancel_attach)
- css->ss->cancel_attach(css, &tset);
- }
-out_release_tset:
- down_write(&css_set_rwsem);
- list_splice_init(&tset.dst_csets, &tset.src_csets);
- list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) {
- list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
- list_del_init(&cset->mg_node);
- }
- up_write(&css_set_rwsem);
- return ret;
-}
-
-/**
- * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
- * @dst_cgrp: the cgroup to attach to
- * @leader: the task or the leader of the threadgroup to be attached
- * @threadgroup: attach the whole threadgroup?
- *
- * Call holding cgroup_mutex and threadgroup_lock of @leader.
- */
-static int cgroup_attach_task(struct cgroup *dst_cgrp,
- struct task_struct *leader, bool threadgroup)
-{
- LIST_HEAD(preloaded_csets);
- struct task_struct *task;
- int ret;
-
- /* look up all src csets */
- down_read(&css_set_rwsem);
- rcu_read_lock();
- task = leader;
- do {
- cgroup_migrate_add_src(task_css_set(task), dst_cgrp,
- &preloaded_csets);
- if (!threadgroup)
- break;
- } while_each_thread(leader, task);
- rcu_read_unlock();
- up_read(&css_set_rwsem);
-
- /* prepare dst csets and commit */
- ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);
- if (!ret)
- ret = cgroup_migrate(dst_cgrp, leader, threadgroup);
-
- cgroup_migrate_finish(&preloaded_csets);
- return ret;
-}
-
-/*
- * Find the task_struct of the task to attach by vpid and pass it along to the
- * function to attach either it or all tasks in its threadgroup. Will lock
- * cgroup_mutex and threadgroup.
- */
-static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
- size_t nbytes, loff_t off, bool threadgroup)
-{
- struct task_struct *tsk;
- const struct cred *cred = current_cred(), *tcred;
- struct cgroup *cgrp;
- pid_t pid;
- int ret;
-
- if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
- return -EINVAL;
-
- cgrp = cgroup_kn_lock_live(of->kn);
- if (!cgrp)
- return -ENODEV;
-
-retry_find_task:
- rcu_read_lock();
- if (pid) {
- tsk = find_task_by_vpid(pid);
- if (!tsk) {
- rcu_read_unlock();
- ret = -ESRCH;
- goto out_unlock_cgroup;
- }
- /*
- * even if we're attaching all tasks in the thread group, we
- * only need to check permissions on one of them.
- */
- tcred = __task_cred(tsk);
- if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
- !uid_eq(cred->euid, tcred->uid) &&
- !uid_eq(cred->euid, tcred->suid)) {
- rcu_read_unlock();
- ret = -EACCES;
- goto out_unlock_cgroup;
- }
- } else
- tsk = current;
-
- if (threadgroup)
- tsk = tsk->group_leader;
-
- /*
- * Workqueue threads may acquire PF_NO_SETAFFINITY and become
- * trapped in a cpuset, or RT worker may be born in a cgroup
- * with no rt_runtime allocated. Just say no.
- */
- if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
- ret = -EINVAL;
- rcu_read_unlock();
- goto out_unlock_cgroup;
- }
-
- get_task_struct(tsk);
- rcu_read_unlock();
-
- threadgroup_lock(tsk);
- if (threadgroup) {
- if (!thread_group_leader(tsk)) {
- /*
- * a race with de_thread from another thread's exec()
- * may strip us of our leadership, if this happens,
- * there is no choice but to throw this task away and
- * try again; this is
- * "double-double-toil-and-trouble-check locking".
- */
- threadgroup_unlock(tsk);
- put_task_struct(tsk);
- goto retry_find_task;
- }
- }
-
- ret = cgroup_attach_task(cgrp, tsk, threadgroup);
-
- threadgroup_unlock(tsk);
-
- put_task_struct(tsk);
-out_unlock_cgroup:
- cgroup_kn_unlock(of->kn);
- return ret ?: nbytes;
-}
-
-/**
- * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
- * @from: attach to all cgroups of a given task
- * @tsk: the task to be attached
- */
-int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
-{
- struct cgroup_root *root;
- int retval = 0;
-
- mutex_lock(&cgroup_mutex);
- for_each_root(root) {
- struct cgroup *from_cgrp;
-
- if (root == &cgrp_dfl_root)
- continue;
-
- down_read(&css_set_rwsem);
- from_cgrp = task_cgroup_from_root(from, root);
- up_read(&css_set_rwsem);
-
- retval = cgroup_attach_task(from_cgrp, tsk, false);
- if (retval)
- break;
- }
- mutex_unlock(&cgroup_mutex);
-
- return retval;
-}
-EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
-
-static ssize_t cgroup_tasks_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off)
-{
- return __cgroup_procs_write(of, buf, nbytes, off, false);
-}
-
-static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off)
-{
- return __cgroup_procs_write(of, buf, nbytes, off, true);
-}
-
-static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off)
-{
- struct cgroup *cgrp;
-
- BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
-
- cgrp = cgroup_kn_lock_live(of->kn);
- if (!cgrp)
- return -ENODEV;
- spin_lock(&release_agent_path_lock);
- strlcpy(cgrp->root->release_agent_path, strstrip(buf),
- sizeof(cgrp->root->release_agent_path));
- spin_unlock(&release_agent_path_lock);
- cgroup_kn_unlock(of->kn);
- return nbytes;
-}
-
-static int cgroup_release_agent_show(struct seq_file *seq, void *v)
-{
- struct cgroup *cgrp = seq_css(seq)->cgroup;
-
- spin_lock(&release_agent_path_lock);
- seq_puts(seq, cgrp->root->release_agent_path);
- spin_unlock(&release_agent_path_lock);
- seq_putc(seq, '\n');
- return 0;
-}
-
-static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
-{
- seq_puts(seq, "0\n");
- return 0;
-}
-
-static void cgroup_print_ss_mask(struct seq_file *seq, unsigned int ss_mask)
-{
- struct cgroup_subsys *ss;
- bool printed = false;
- int ssid;
-
- for_each_subsys(ss, ssid) {
- if (ss_mask & (1 << ssid)) {
- if (printed)
- seq_putc(seq, ' ');
- seq_printf(seq, "%s", ss->name);
- printed = true;
- }
- }
- if (printed)
- seq_putc(seq, '\n');
-}
-
-/* show controllers which are currently attached to the default hierarchy */
-static int cgroup_root_controllers_show(struct seq_file *seq, void *v)
-{
- struct cgroup *cgrp = seq_css(seq)->cgroup;
-
- cgroup_print_ss_mask(seq, cgrp->root->subsys_mask &
- ~cgrp_dfl_root_inhibit_ss_mask);
- return 0;
-}
-
-/* show controllers which are enabled from the parent */
-static int cgroup_controllers_show(struct seq_file *seq, void *v)
-{
- struct cgroup *cgrp = seq_css(seq)->cgroup;
-
- cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->subtree_control);
- return 0;
-}
-
-/* show controllers which are enabled for a given cgroup's children */
-static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
-{
- struct cgroup *cgrp = seq_css(seq)->cgroup;
-
- cgroup_print_ss_mask(seq, cgrp->subtree_control);
- return 0;
-}
-
-/**
- * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy
- * @cgrp: root of the subtree to update csses for
- *
- * @cgrp's child_subsys_mask has changed and its subtree's (self excluded)
- * css associations need to be updated accordingly. This function looks up
- * all css_sets which are attached to the subtree, creates the matching
- * updated css_sets and migrates the tasks to the new ones.
- */
-static int cgroup_update_dfl_csses(struct cgroup *cgrp)
-{
- LIST_HEAD(preloaded_csets);
- struct cgroup_subsys_state *css;
- struct css_set *src_cset;
- int ret;
-
- lockdep_assert_held(&cgroup_mutex);
-
- /* look up all csses currently attached to @cgrp's subtree */
- down_read(&css_set_rwsem);
- css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) {
- struct cgrp_cset_link *link;
-
- /* self is not affected by child_subsys_mask change */
- if (css->cgroup == cgrp)
- continue;
-
- list_for_each_entry(link, &css->cgroup->cset_links, cset_link)
- cgroup_migrate_add_src(link->cset, cgrp,
- &preloaded_csets);
- }
- up_read(&css_set_rwsem);
-
- /* NULL dst indicates self on default hierarchy */
- ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets);
- if (ret)
- goto out_finish;
-
- list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) {
- struct task_struct *last_task = NULL, *task;
-
- /* src_csets precede dst_csets, break on the first dst_cset */
- if (!src_cset->mg_src_cgrp)
- break;
-
- /*
- * All tasks in src_cset need to be migrated to the
- * matching dst_cset. Empty it process by process. We
- * walk tasks but migrate processes. The leader might even
- * belong to a different cset but such src_cset would also
- * be among the target src_csets because the default
- * hierarchy enforces per-process membership.
- */
- while (true) {
- down_read(&css_set_rwsem);
- task = list_first_entry_or_null(&src_cset->tasks,
- struct task_struct, cg_list);
- if (task) {
- task = task->group_leader;
- WARN_ON_ONCE(!task_css_set(task)->mg_src_cgrp);
- get_task_struct(task);
- }
- up_read(&css_set_rwsem);
-
- if (!task)
- break;
-
- /* guard against possible infinite loop */
- if (WARN(last_task == task,
- "cgroup: update_dfl_csses failed to make progress, aborting in inconsistent state\n"))
- goto out_finish;
- last_task = task;
-
- threadgroup_lock(task);
- /* raced against de_thread() from another thread? */
- if (!thread_group_leader(task)) {
- threadgroup_unlock(task);
- put_task_struct(task);
- continue;
- }
-
- ret = cgroup_migrate(src_cset->dfl_cgrp, task, true);
-
- threadgroup_unlock(task);
- put_task_struct(task);
-
- if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret))
- goto out_finish;
- }
- }
-
-out_finish:
- cgroup_migrate_finish(&preloaded_csets);
- return ret;
-}
-
-/* change the enabled child controllers for a cgroup in the default hierarchy */
-static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes,
- loff_t off)
-{
- unsigned int enable = 0, disable = 0;
- unsigned int css_enable, css_disable, old_sc, new_sc, old_ss, new_ss;
- struct cgroup *cgrp, *child;
- struct cgroup_subsys *ss;
- char *tok;
- int ssid, ret;
-
- /*
- * Parse input - space separated list of subsystem names prefixed
- * with either + or -.
- */
- buf = strstrip(buf);
- while ((tok = strsep(&buf, " "))) {
- if (tok[0] == '\0')
- continue;
- for_each_subsys(ss, ssid) {
- if (ss->disabled || strcmp(tok + 1, ss->name) ||
- ((1 << ss->id) & cgrp_dfl_root_inhibit_ss_mask))
- continue;
-
- if (*tok == '+') {
- enable |= 1 << ssid;
- disable &= ~(1 << ssid);
- } else if (*tok == '-') {
- disable |= 1 << ssid;
- enable &= ~(1 << ssid);
- } else {
- return -EINVAL;
- }
- break;
- }
- if (ssid == CGROUP_SUBSYS_COUNT)
- return -EINVAL;
- }
-
- cgrp = cgroup_kn_lock_live(of->kn);
- if (!cgrp)
- return -ENODEV;
-
- for_each_subsys(ss, ssid) {
- if (enable & (1 << ssid)) {
- if (cgrp->subtree_control & (1 << ssid)) {
- enable &= ~(1 << ssid);
- continue;
- }
-
- /* unavailable or not enabled on the parent? */
- if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) ||
- (cgroup_parent(cgrp) &&
- !(cgroup_parent(cgrp)->subtree_control & (1 << ssid)))) {
- ret = -ENOENT;
- goto out_unlock;
- }
- } else if (disable & (1 << ssid)) {
- if (!(cgrp->subtree_control & (1 << ssid))) {
- disable &= ~(1 << ssid);
- continue;
- }
-
- /* a child has it enabled? */
- cgroup_for_each_live_child(child, cgrp) {
- if (child->subtree_control & (1 << ssid)) {
- ret = -EBUSY;
- goto out_unlock;
- }
- }
- }
- }
-
- if (!enable && !disable) {
- ret = 0;
- goto out_unlock;
- }
-
- /*
- * Except for the root, subtree_control must be zero for a cgroup
- * with tasks so that child cgroups don't compete against tasks.
- */
- if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) {
- ret = -EBUSY;
- goto out_unlock;
- }
-
- /*
- * Update subsys masks and calculate what needs to be done. More
- * subsystems than specified may need to be enabled or disabled
- * depending on subsystem dependencies.
- */
- old_sc = cgrp->subtree_control;
- old_ss = cgrp->child_subsys_mask;
- new_sc = (old_sc | enable) & ~disable;
- new_ss = cgroup_calc_child_subsys_mask(cgrp, new_sc);
-
- css_enable = ~old_ss & new_ss;
- css_disable = old_ss & ~new_ss;
- enable |= css_enable;
- disable |= css_disable;
-
- /*
- * Because css offlining is asynchronous, userland might try to
- * re-enable the same controller while the previous instance is
- * still around. In such cases, wait till it's gone using
- * offline_waitq.
- */
- for_each_subsys(ss, ssid) {
- if (!(css_enable & (1 << ssid)))
- continue;
-
- cgroup_for_each_live_child(child, cgrp) {
- DEFINE_WAIT(wait);
-
- if (!cgroup_css(child, ss))
- continue;
-
- cgroup_get(child);
- prepare_to_wait(&child->offline_waitq, &wait,
- TASK_UNINTERRUPTIBLE);
- cgroup_kn_unlock(of->kn);
- schedule();
- finish_wait(&child->offline_waitq, &wait);
- cgroup_put(child);
-
- return restart_syscall();
- }
- }
-
- cgrp->subtree_control = new_sc;
- cgrp->child_subsys_mask = new_ss;
-
- /*
- * Create new csses or make the existing ones visible. A css is
- * created invisible if it's being implicitly enabled through
- * dependency. An invisible css is made visible when the userland
- * explicitly enables it.
- */
- for_each_subsys(ss, ssid) {
- if (!(enable & (1 << ssid)))
- continue;
-
- cgroup_for_each_live_child(child, cgrp) {
- if (css_enable & (1 << ssid))
- ret = create_css(child, ss,
- cgrp->subtree_control & (1 << ssid));
- else
- ret = cgroup_populate_dir(child, 1 << ssid);
- if (ret)
- goto err_undo_css;
- }
- }
-
- /*
- * At this point, cgroup_e_css() results reflect the new csses
- * making the following cgroup_update_dfl_csses() properly update
- * css associations of all tasks in the subtree.
- */
- ret = cgroup_update_dfl_csses(cgrp);
- if (ret)
- goto err_undo_css;
-
- /*
- * All tasks are migrated out of disabled csses. Kill or hide
- * them. A css is hidden when the userland requests it to be
- * disabled while other subsystems are still depending on it. The
- * css must not actively control resources and be in the vanilla
- * state if it's made visible again later. Controllers which may
- * be depended upon should provide ->css_reset() for this purpose.
- */
- for_each_subsys(ss, ssid) {
- if (!(disable & (1 << ssid)))
- continue;
-
- cgroup_for_each_live_child(child, cgrp) {
- struct cgroup_subsys_state *css = cgroup_css(child, ss);
-
- if (css_disable & (1 << ssid)) {
- kill_css(css);
- } else {
- cgroup_clear_dir(child, 1 << ssid);
- if (ss->css_reset)
- ss->css_reset(css);
- }
- }
- }
-
- /*
- * The effective csses of all the descendants (excluding @cgrp) may
- * have changed. Subsystems can optionally subscribe to this event
- * by implementing ->css_e_css_changed() which is invoked if any of
- * the effective csses seen from the css's cgroup may have changed.
- */
- for_each_subsys(ss, ssid) {
- struct cgroup_subsys_state *this_css = cgroup_css(cgrp, ss);
- struct cgroup_subsys_state *css;
-
- if (!ss->css_e_css_changed || !this_css)
- continue;
-
- css_for_each_descendant_pre(css, this_css)
- if (css != this_css)
- ss->css_e_css_changed(css);
- }
-
- kernfs_activate(cgrp->kn);
- ret = 0;
-out_unlock:
- cgroup_kn_unlock(of->kn);
- return ret ?: nbytes;
-
-err_undo_css:
- cgrp->subtree_control = old_sc;
- cgrp->child_subsys_mask = old_ss;
-
- for_each_subsys(ss, ssid) {
- if (!(enable & (1 << ssid)))
- continue;
-
- cgroup_for_each_live_child(child, cgrp) {
- struct cgroup_subsys_state *css = cgroup_css(child, ss);
-
- if (!css)
- continue;
-
- if (css_enable & (1 << ssid))
- kill_css(css);
- else
- cgroup_clear_dir(child, 1 << ssid);
- }
- }
- goto out_unlock;
-}
-
-static int cgroup_populated_show(struct seq_file *seq, void *v)
-{
- seq_printf(seq, "%d\n", (bool)seq_css(seq)->cgroup->populated_cnt);
- return 0;
-}
-
-static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
- size_t nbytes, loff_t off)
-{
- struct cgroup *cgrp = of->kn->parent->priv;
- struct cftype *cft = of->kn->priv;
- struct cgroup_subsys_state *css;
- int ret;
-
- if (cft->write)
- return cft->write(of, buf, nbytes, off);
-
- /*
- * kernfs guarantees that a file isn't deleted with operations in
- * flight, which means that the matching css is and stays alive and
- * doesn't need to be pinned. The RCU locking is not necessary
- * either. It's just for the convenience of using cgroup_css().
- */
- rcu_read_lock();
- css = cgroup_css(cgrp, cft->ss);
- rcu_read_unlock();
-
- if (cft->write_u64) {
- unsigned long long v;
- ret = kstrtoull(buf, 0, &v);
- if (!ret)
- ret = cft->write_u64(css, cft, v);
- } else if (cft->write_s64) {
- long long v;
- ret = kstrtoll(buf, 0, &v);
- if (!ret)
- ret = cft->write_s64(css, cft, v);
- } else {
- ret = -EINVAL;
- }
-
- return ret ?: nbytes;
-}
-
-static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
-{
- return seq_cft(seq)->seq_start(seq, ppos);
-}
-
-static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
-{
- return seq_cft(seq)->seq_next(seq, v, ppos);
-}
-
-static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
-{
- seq_cft(seq)->seq_stop(seq, v);
-}
-
-static int cgroup_seqfile_show(struct seq_file *m, void *arg)
-{
- struct cftype *cft = seq_cft(m);
- struct cgroup_subsys_state *css = seq_css(m);
-
- if (cft->seq_show)
- return cft->seq_show(m, arg);
-
- if (cft->read_u64)
- seq_printf(m, "%llu\n", cft->read_u64(css, cft));
- else if (cft->read_s64)
- seq_printf(m, "%lld\n", cft->read_s64(css, cft));
- else
- return -EINVAL;
- return 0;
-}
-
-static struct kernfs_ops cgroup_kf_single_ops = {
- .atomic_write_len = PAGE_SIZE,
- .write = cgroup_file_write,
- .seq_show = cgroup_seqfile_show,
-};
-
-static struct kernfs_ops cgroup_kf_ops = {
- .atomic_write_len = PAGE_SIZE,
- .write = cgroup_file_write,
- .seq_start = cgroup_seqfile_start,
- .seq_next = cgroup_seqfile_next,
- .seq_stop = cgroup_seqfile_stop,
- .seq_show = cgroup_seqfile_show,
-};
-
-/*
- * cgroup_rename - Only allow simple rename of directories in place.
- */
-static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
- const char *new_name_str)
-{
- struct cgroup *cgrp = kn->priv;
- int ret;
-
- if (kernfs_type(kn) != KERNFS_DIR)
- return -ENOTDIR;
- if (kn->parent != new_parent)
- return -EIO;
-
- /*
- * This isn't a proper migration and its usefulness is very
- * limited. Disallow on the default hierarchy.
- */
- if (cgroup_on_dfl(cgrp))
- return -EPERM;
-
- /*
- * We're gonna grab cgroup_mutex which nests outside kernfs
- * active_ref. kernfs_rename() doesn't require active_ref
- * protection. Break them before grabbing cgroup_mutex.
- */
- kernfs_break_active_protection(new_parent);
- kernfs_break_active_protection(kn);
-
- mutex_lock(&cgroup_mutex);
-
- ret = kernfs_rename(kn, new_parent, new_name_str);
-
- mutex_unlock(&cgroup_mutex);
-
- kernfs_unbreak_active_protection(kn);
- kernfs_unbreak_active_protection(new_parent);
- return ret;
-}
-
-/* set uid and gid of cgroup dirs and files to that of the creator */
-static int cgroup_kn_set_ugid(struct kernfs_node *kn)
-{
- struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
- .ia_uid = current_fsuid(),
- .ia_gid = current_fsgid(), };
-
- if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
- gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
- return 0;
-
- return kernfs_setattr(kn, &iattr);
-}
-
-static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
-{
- char name[CGROUP_FILE_NAME_MAX];
- struct kernfs_node *kn;
- struct lock_class_key *key = NULL;
- int ret;
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- key = &cft->lockdep_key;
-#endif
- kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
- cgroup_file_mode(cft), 0, cft->kf_ops, cft,
- NULL, key);
- if (IS_ERR(kn))
- return PTR_ERR(kn);
-
- ret = cgroup_kn_set_ugid(kn);
- if (ret) {
- kernfs_remove(kn);
- return ret;
- }
-
- if (cft->seq_show == cgroup_populated_show)
- cgrp->populated_kn = kn;
- return 0;
-}
-
-/**
- * cgroup_addrm_files - add or remove files to a cgroup directory
- * @cgrp: the target cgroup
- * @cfts: array of cftypes to be added
- * @is_add: whether to add or remove
- *
- * Depending on @is_add, add or remove files defined by @cfts on @cgrp.
- * For removals, this function never fails. If addition fails, this
- * function doesn't remove files already added. The caller is responsible
- * for cleaning up.
- */
-static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
- bool is_add)
-{
- struct cftype *cft;
- int ret;
-
- lockdep_assert_held(&cgroup_mutex);
-
- for (cft = cfts; cft->name[0] != '\0'; cft++) {
- /* does cft->flags tell us to skip this file on @cgrp? */
- if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
- continue;
- if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
- continue;
- if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp))
- continue;
- if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp))
- continue;
-
- if (is_add) {
- ret = cgroup_add_file(cgrp, cft);
- if (ret) {
- pr_warn("%s: failed to add %s, err=%d\n",
- __func__, cft->name, ret);
- return ret;
- }
- } else {
- cgroup_rm_file(cgrp, cft);
- }
- }
- return 0;
-}
-
-static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
-{
- LIST_HEAD(pending);
- struct cgroup_subsys *ss = cfts[0].ss;
- struct cgroup *root = &ss->root->cgrp;
- struct cgroup_subsys_state *css;
- int ret = 0;
-
- lockdep_assert_held(&cgroup_mutex);
-
- /* add/rm files for all cgroups created before */
- css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
- struct cgroup *cgrp = css->cgroup;
-
- if (cgroup_is_dead(cgrp))
- continue;
-
- ret = cgroup_addrm_files(cgrp, cfts, is_add);
- if (ret)
- break;
- }
-
- if (is_add && !ret)
- kernfs_activate(root->kn);
- return ret;
-}
-
-static void cgroup_exit_cftypes(struct cftype *cfts)
-{
- struct cftype *cft;
-
- for (cft = cfts; cft->name[0] != '\0'; cft++) {
- /* free copy for custom atomic_write_len, see init_cftypes() */
- if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)
- kfree(cft->kf_ops);
- cft->kf_ops = NULL;
- cft->ss = NULL;
-
- /* revert flags set by cgroup core while adding @cfts */
- cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL);
- }
-}
-
-static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
-{
- struct cftype *cft;
-
- for (cft = cfts; cft->name[0] != '\0'; cft++) {
- struct kernfs_ops *kf_ops;
-
- WARN_ON(cft->ss || cft->kf_ops);
-
- if (cft->seq_start)
- kf_ops = &cgroup_kf_ops;
- else
- kf_ops = &cgroup_kf_single_ops;
-
- /*
- * Ugh... if @cft wants a custom max_write_len, we need to
- * make a copy of kf_ops to set its atomic_write_len.
- */
- if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
- kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
- if (!kf_ops) {
- cgroup_exit_cftypes(cfts);
- return -ENOMEM;
- }
- kf_ops->atomic_write_len = cft->max_write_len;
- }
-
- cft->kf_ops = kf_ops;
- cft->ss = ss;
- }
-
- return 0;
-}
-
-static int cgroup_rm_cftypes_locked(struct cftype *cfts)
-{
- lockdep_assert_held(&cgroup_mutex);
-
- if (!cfts || !cfts[0].ss)
- return -ENOENT;
-
- list_del(&cfts->node);
- cgroup_apply_cftypes(cfts, false);
- cgroup_exit_cftypes(cfts);
- return 0;
-}
-
-/**
- * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
- * @cfts: zero-length name terminated array of cftypes
- *
- * Unregister @cfts. Files described by @cfts are removed from all
- * existing cgroups and all future cgroups won't have them either. This
- * function can be called anytime whether @cfts' subsys is attached or not.
- *
- * Returns 0 on successful unregistration, -ENOENT if @cfts is not
- * registered.
- */
-int cgroup_rm_cftypes(struct cftype *cfts)
-{
- int ret;
-
- mutex_lock(&cgroup_mutex);
- ret = cgroup_rm_cftypes_locked(cfts);
- mutex_unlock(&cgroup_mutex);
- return ret;
-}
-
-/**
- * cgroup_add_cftypes - add an array of cftypes to a subsystem
- * @ss: target cgroup subsystem
- * @cfts: zero-length name terminated array of cftypes
- *
- * Register @cfts to @ss. Files described by @cfts are created for all
- * existing cgroups to which @ss is attached and all future cgroups will
- * have them too. This function can be called anytime whether @ss is
- * attached or not.
- *
- * Returns 0 on successful registration, -errno on failure. Note that this
- * function currently returns 0 as long as @cfts registration is successful
- * even if some file creation attempts on existing cgroups fail.
- */
-static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
-{
- int ret;
-
- if (ss->disabled)
- return 0;
-
- if (!cfts || cfts[0].name[0] == '\0')
- return 0;
-
- ret = cgroup_init_cftypes(ss, cfts);
- if (ret)
- return ret;
-
- mutex_lock(&cgroup_mutex);
-
- list_add_tail(&cfts->node, &ss->cfts);
- ret = cgroup_apply_cftypes(cfts, true);
- if (ret)
- cgroup_rm_cftypes_locked(cfts);
-
- mutex_unlock(&cgroup_mutex);
- return ret;
-}
-
-/**
- * cgroup_add_dfl_cftypes - add an array of cftypes for default hierarchy
- * @ss: target cgroup subsystem
- * @cfts: zero-length name terminated array of cftypes
- *
- * Similar to cgroup_add_cftypes() but the added files are only used for
- * the default hierarchy.
- */
-int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
-{
- struct cftype *cft;
-
- for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
- cft->flags |= __CFTYPE_ONLY_ON_DFL;
- return cgroup_add_cftypes(ss, cfts);
-}
-
-/**
- * cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies
- * @ss: target cgroup subsystem
- * @cfts: zero-length name terminated array of cftypes
- *
- * Similar to cgroup_add_cftypes() but the added files are only used for
- * the legacy hierarchies.
- */
-int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
-{
- struct cftype *cft;
-
- /*
- * If legacy_flies_on_dfl, we want to show the legacy files on the
- * dfl hierarchy but iff the target subsystem hasn't been updated
- * for the dfl hierarchy yet.
- */
- if (!cgroup_legacy_files_on_dfl ||
- ss->dfl_cftypes != ss->legacy_cftypes) {
- for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
- cft->flags |= __CFTYPE_NOT_ON_DFL;
- }
-
- return cgroup_add_cftypes(ss, cfts);
-}
-
-/**
- * cgroup_task_count - count the number of tasks in a cgroup.
- * @cgrp: the cgroup in question
- *
- * Return the number of tasks in the cgroup.
- */
-static int cgroup_task_count(const struct cgroup *cgrp)
-{
- int count = 0;
- struct cgrp_cset_link *link;
-
- down_read(&css_set_rwsem);
- list_for_each_entry(link, &cgrp->cset_links, cset_link)
- count += atomic_read(&link->cset->refcount);
- up_read(&css_set_rwsem);
- return count;
-}
-
-/**
- * css_next_child - find the next child of a given css
- * @pos: the current position (%NULL to initiate traversal)
- * @parent: css whose children to walk
- *
- * This function returns the next child of @parent and should be called
- * under either cgroup_mutex or RCU read lock. The only requirement is
- * that @parent and @pos are accessible. The next sibling is guaranteed to
- * be returned regardless of their states.
- *
- * If a subsystem synchronizes ->css_online() and the start of iteration, a
- * css which finished ->css_online() is guaranteed to be visible in the
- * future iterations and will stay visible until the last reference is put.
- * A css which hasn't finished ->css_online() or already finished
- * ->css_offline() may show up during traversal. It's each subsystem's
- * responsibility to synchronize against on/offlining.
- */
-struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
- struct cgroup_subsys_state *parent)
-{
- struct cgroup_subsys_state *next;
-
- cgroup_assert_mutex_or_rcu_locked();
-
- /*
- * @pos could already have been unlinked from the sibling list.
- * Once a cgroup is removed, its ->sibling.next is no longer
- * updated when its next sibling changes. CSS_RELEASED is set when
- * @pos is taken off list, at which time its next pointer is valid,
- * and, as releases are serialized, the one pointed to by the next
- * pointer is guaranteed to not have started release yet. This
- * implies that if we observe !CSS_RELEASED on @pos in this RCU
- * critical section, the one pointed to by its next pointer is
- * guaranteed to not have finished its RCU grace period even if we
- * have dropped rcu_read_lock() inbetween iterations.
- *
- * If @pos has CSS_RELEASED set, its next pointer can't be
- * dereferenced; however, as each css is given a monotonically
- * increasing unique serial number and always appended to the
- * sibling list, the next one can be found by walking the parent's
- * children until the first css with higher serial number than
- * @pos's. While this path can be slower, it happens iff iteration
- * races against release and the race window is very small.
- */
- if (!pos) {
- next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling);
- } else if (likely(!(pos->flags & CSS_RELEASED))) {
- next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);
- } else {
- list_for_each_entry_rcu(next, &parent->children, sibling)
- if (next->serial_nr > pos->serial_nr)
- break;
- }
-
- /*
- * @next, if not pointing to the head, can be dereferenced and is
- * the next sibling.
- */
- if (&next->sibling != &parent->children)
- return next;
- return NULL;
-}
-
-/**
- * css_next_descendant_pre - find the next descendant for pre-order walk
- * @pos: the current position (%NULL to initiate traversal)
- * @root: css whose descendants to walk
- *
- * To be used by css_for_each_descendant_pre(). Find the next descendant
- * to visit for pre-order traversal of @root's descendants. @root is
- * included in the iteration and the first node to be visited.
- *
- * While this function requires cgroup_mutex or RCU read locking, it
- * doesn't require the whole traversal to be contained in a single critical
- * section. This function will return the correct next descendant as long
- * as both @pos and @root are accessible and @pos is a descendant of @root.
- *
- * If a subsystem synchronizes ->css_online() and the start of iteration, a
- * css which finished ->css_online() is guaranteed to be visible in the
- * future iterations and will stay visible until the last reference is put.
- * A css which hasn't finished ->css_online() or already finished
- * ->css_offline() may show up during traversal. It's each subsystem's
- * responsibility to synchronize against on/offlining.
- */
-struct cgroup_subsys_state *
-css_next_descendant_pre(struct cgroup_subsys_state *pos,
- struct cgroup_subsys_state *root)
-{
- struct cgroup_subsys_state *next;
-
- cgroup_assert_mutex_or_rcu_locked();
-
- /* if first iteration, visit @root */
- if (!pos)
- return root;
-
- /* visit the first child if exists */
- next = css_next_child(NULL, pos);
- if (next)
- return next;
-
- /* no child, visit my or the closest ancestor's next sibling */
- while (pos != root) {
- next = css_next_child(pos, pos->parent);
- if (next)
- return next;
- pos = pos->parent;
- }
-
- return NULL;
-}
-
-/**
- * css_rightmost_descendant - return the rightmost descendant of a css
- * @pos: css of interest
- *
- * Return the rightmost descendant of @pos. If there's no descendant, @pos
- * is returned. This can be used during pre-order traversal to skip
- * subtree of @pos.
- *
- * While this function requires cgroup_mutex or RCU read locking, it
- * doesn't require the whole traversal to be contained in a single critical
- * section. This function will return the correct rightmost descendant as
- * long as @pos is accessible.
- */
-struct cgroup_subsys_state *
-css_rightmost_descendant(struct cgroup_subsys_state *pos)
-{
- struct cgroup_subsys_state *last, *tmp;
-
- cgroup_assert_mutex_or_rcu_locked();
-
- do {
- last = pos;
- /* ->prev isn't RCU safe, walk ->next till the end */
- pos = NULL;
- css_for_each_child(tmp, last)
- pos = tmp;
- } while (pos);
-
- return last;
-}
-
-static struct cgroup_subsys_state *
-css_leftmost_descendant(struct cgroup_subsys_state *pos)
-{
- struct cgroup_subsys_state *last;
-
- do {
- last = pos;
- pos = css_next_child(NULL, pos);
- } while (pos);
-
- return last;
-}
-
-/**
- * css_next_descendant_post - find the next descendant for post-order walk
- * @pos: the current position (%NULL to initiate traversal)
- * @root: css whose descendants to walk
- *
- * To be used by css_for_each_descendant_post(). Find the next descendant
- * to visit for post-order traversal of @root's descendants. @root is
- * included in the iteration and the last node to be visited.
- *
- * While this function requires cgroup_mutex or RCU read locking, it
- * doesn't require the whole traversal to be contained in a single critical
- * section. This function will return the correct next descendant as long
- * as both @pos and @cgroup are accessible and @pos is a descendant of
- * @cgroup.
- *
- * If a subsystem synchronizes ->css_online() and the start of iteration, a
- * css which finished ->css_online() is guaranteed to be visible in the
- * future iterations and will stay visible until the last reference is put.
- * A css which hasn't finished ->css_online() or already finished
- * ->css_offline() may show up during traversal. It's each subsystem's
- * responsibility to synchronize against on/offlining.
- */
-struct cgroup_subsys_state *
-css_next_descendant_post(struct cgroup_subsys_state *pos,
- struct cgroup_subsys_state *root)
-{
- struct cgroup_subsys_state *next;
-
- cgroup_assert_mutex_or_rcu_locked();
-
- /* if first iteration, visit leftmost descendant which may be @root */
- if (!pos)
- return css_leftmost_descendant(root);
-
- /* if we visited @root, we're done */
- if (pos == root)
- return NULL;
-
- /* if there's an unvisited sibling, visit its leftmost descendant */
- next = css_next_child(pos, pos->parent);
- if (next)
- return css_leftmost_descendant(next);
-
- /* no sibling left, visit parent */
- return pos->parent;
-}
-
-/**
- * css_has_online_children - does a css have online children
- * @css: the target css
- *
- * Returns %true if @css has any online children; otherwise, %false. This
- * function can be called from any context but the caller is responsible
- * for synchronizing against on/offlining as necessary.
- */
-bool css_has_online_children(struct cgroup_subsys_state *css)
-{
- struct cgroup_subsys_state *child;
- bool ret = false;
-
- rcu_read_lock();
- css_for_each_child(child, css) {
- if (child->flags & CSS_ONLINE) {
- ret = true;
- break;
- }
- }
- rcu_read_unlock();
- return ret;
-}
-
-/**
- * css_advance_task_iter - advance a task itererator to the next css_set
- * @it: the iterator to advance
- *
- * Advance @it to the next css_set to walk.
- */
-static void css_advance_task_iter(struct css_task_iter *it)
-{
- struct list_head *l = it->cset_pos;
- struct cgrp_cset_link *link;
- struct css_set *cset;
-
- /* Advance to the next non-empty css_set */
- do {
- l = l->next;
- if (l == it->cset_head) {
- it->cset_pos = NULL;
- return;
- }
-
- if (it->ss) {
- cset = container_of(l, struct css_set,
- e_cset_node[it->ss->id]);
- } else {
- link = list_entry(l, struct cgrp_cset_link, cset_link);
- cset = link->cset;
- }
- } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks));
-
- it->cset_pos = l;
-
- if (!list_empty(&cset->tasks))
- it->task_pos = cset->tasks.next;
- else
- it->task_pos = cset->mg_tasks.next;
-
- it->tasks_head = &cset->tasks;
- it->mg_tasks_head = &cset->mg_tasks;
-}
-
-/**
- * css_task_iter_start - initiate task iteration
- * @css: the css to walk tasks of
- * @it: the task iterator to use
- *
- * Initiate iteration through the tasks of @css. The caller can call
- * css_task_iter_next() to walk through the tasks until the function
- * returns NULL. On completion of iteration, css_task_iter_end() must be
- * called.
- *
- * Note that this function acquires a lock which is released when the
- * iteration finishes. The caller can't sleep while iteration is in
- * progress.
- */
-void css_task_iter_start(struct cgroup_subsys_state *css,
- struct css_task_iter *it)
- __acquires(css_set_rwsem)
-{
- /* no one should try to iterate before mounting cgroups */
- WARN_ON_ONCE(!use_task_css_set_links);
-
- down_read(&css_set_rwsem);
-
- it->ss = css->ss;
-
- if (it->ss)
- it->cset_pos = &css->cgroup->e_csets[css->ss->id];
- else
- it->cset_pos = &css->cgroup->cset_links;
-
- it->cset_head = it->cset_pos;
-
- css_advance_task_iter(it);
-}
-
-/**
- * css_task_iter_next - return the next task for the iterator
- * @it: the task iterator being iterated
- *
- * The "next" function for task iteration. @it should have been
- * initialized via css_task_iter_start(). Returns NULL when the iteration
- * reaches the end.
- */
-struct task_struct *css_task_iter_next(struct css_task_iter *it)
-{
- struct task_struct *res;
- struct list_head *l = it->task_pos;
-
- /* If the iterator cg is NULL, we have no tasks */
- if (!it->cset_pos)
- return NULL;
- res = list_entry(l, struct task_struct, cg_list);
-
- /*
- * Advance iterator to find next entry. cset->tasks is consumed
- * first and then ->mg_tasks. After ->mg_tasks, we move onto the
- * next cset.
- */
- l = l->next;
-
- if (l == it->tasks_head)
- l = it->mg_tasks_head->next;
-
- if (l == it->mg_tasks_head)
- css_advance_task_iter(it);
- else
- it->task_pos = l;
-
- return res;
-}
-
-/**
- * css_task_iter_end - finish task iteration
- * @it: the task iterator to finish
- *
- * Finish task iteration started by css_task_iter_start().
- */
-void css_task_iter_end(struct css_task_iter *it)
- __releases(css_set_rwsem)
-{
- up_read(&css_set_rwsem);
-}
-
-/**
- * cgroup_trasnsfer_tasks - move tasks from one cgroup to another
- * @to: cgroup to which the tasks will be moved
- * @from: cgroup in which the tasks currently reside
- *
- * Locking rules between cgroup_post_fork() and the migration path
- * guarantee that, if a task is forking while being migrated, the new child
- * is guaranteed to be either visible in the source cgroup after the
- * parent's migration is complete or put into the target cgroup. No task
- * can slip out of migration through forking.
- */
-int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
-{
- LIST_HEAD(preloaded_csets);
- struct cgrp_cset_link *link;
- struct css_task_iter it;
- struct task_struct *task;
- int ret;
-
- mutex_lock(&cgroup_mutex);
-
- /* all tasks in @from are being moved, all csets are source */
- down_read(&css_set_rwsem);
- list_for_each_entry(link, &from->cset_links, cset_link)
- cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
- up_read(&css_set_rwsem);
-
- ret = cgroup_migrate_prepare_dst(to, &preloaded_csets);
- if (ret)
- goto out_err;
-
- /*
- * Migrate tasks one-by-one until @form is empty. This fails iff
- * ->can_attach() fails.
- */
- do {
- css_task_iter_start(&from->self, &it);
- task = css_task_iter_next(&it);
- if (task)
- get_task_struct(task);
- css_task_iter_end(&it);
-
- if (task) {
- ret = cgroup_migrate(to, task, false);
- put_task_struct(task);
- }
- } while (task && !ret);
-out_err:
- cgroup_migrate_finish(&preloaded_csets);
- mutex_unlock(&cgroup_mutex);
- return ret;
-}
-
-/*
- * Stuff for reading the 'tasks'/'procs' files.
- *
- * Reading this file can return large amounts of data if a cgroup has
- * *lots* of attached tasks. So it may need several calls to read(),
- * but we cannot guarantee that the information we produce is correct
- * unless we produce it entirely atomically.
- *
- */
-
-/* which pidlist file are we talking about? */
-enum cgroup_filetype {
- CGROUP_FILE_PROCS,
- CGROUP_FILE_TASKS,
-};
-
-/*
- * A pidlist is a list of pids that virtually represents the contents of one
- * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
- * a pair (one each for procs, tasks) for each pid namespace that's relevant
- * to the cgroup.
- */
-struct cgroup_pidlist {
- /*
- * used to find which pidlist is wanted. doesn't change as long as
- * this particular list stays in the list.
- */
- struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
- /* array of xids */
- pid_t *list;
- /* how many elements the above list has */
- int length;
- /* each of these stored in a list by its cgroup */
- struct list_head links;
- /* pointer to the cgroup we belong to, for list removal purposes */
- struct cgroup *owner;
- /* for delayed destruction */
- struct delayed_work destroy_dwork;
-};
-
-/*
- * The following two functions "fix" the issue where there are more pids
- * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
- * TODO: replace with a kernel-wide solution to this problem
- */
-#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
-static void *pidlist_allocate(int count)
-{
- if (PIDLIST_TOO_LARGE(count))
- return vmalloc(count * sizeof(pid_t));
- else
- return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
-}
-
-static void pidlist_free(void *p)
-{
- kvfree(p);
-}
-
-/*
- * Used to destroy all pidlists lingering waiting for destroy timer. None
- * should be left afterwards.
- */
-static void cgroup_pidlist_destroy_all(struct cgroup *cgrp)
-{
- struct cgroup_pidlist *l, *tmp_l;
-
- mutex_lock(&cgrp->pidlist_mutex);
- list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
- mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
- mutex_unlock(&cgrp->pidlist_mutex);
-
- flush_workqueue(cgroup_pidlist_destroy_wq);
- BUG_ON(!list_empty(&cgrp->pidlists));
-}
-
-static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
-{
- struct delayed_work *dwork = to_delayed_work(work);
- struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
- destroy_dwork);
- struct cgroup_pidlist *tofree = NULL;
-
- mutex_lock(&l->owner->pidlist_mutex);
-
- /*
- * Destroy iff we didn't get queued again. The state won't change
- * as destroy_dwork can only be queued while locked.
- */
- if (!delayed_work_pending(dwork)) {
- list_del(&l->links);
- pidlist_free(l->list);
- put_pid_ns(l->key.ns);
- tofree = l;
- }
-
- mutex_unlock(&l->owner->pidlist_mutex);
- kfree(tofree);
-}
-
-/*
- * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
- * Returns the number of unique elements.
- */
-static int pidlist_uniq(pid_t *list, int length)
-{
- int src, dest = 1;
-
- /*
- * we presume the 0th element is unique, so i starts at 1. trivial
- * edge cases first; no work needs to be done for either
- */
- if (length == 0 || length == 1)
- return length;
- /* src and dest walk down the list; dest counts unique elements */
- for (src = 1; src < length; src++) {
- /* find next unique element */
- while (list[src] == list[src-1]) {
- src++;
- if (src == length)
- goto after;
- }
- /* dest always points to where the next unique element goes */
- list[dest] = list[src];
- dest++;
- }
-after:
- return dest;
-}
-
-/*
- * The two pid files - task and cgroup.procs - guaranteed that the result
- * is sorted, which forced this whole pidlist fiasco. As pid order is
- * different per namespace, each namespace needs differently sorted list,
- * making it impossible to use, for example, single rbtree of member tasks
- * sorted by task pointer. As pidlists can be fairly large, allocating one
- * per open file is dangerous, so cgroup had to implement shared pool of
- * pidlists keyed by cgroup and namespace.
- *
- * All this extra complexity was caused by the original implementation
- * committing to an entirely unnecessary property. In the long term, we
- * want to do away with it. Explicitly scramble sort order if on the
- * default hierarchy so that no such expectation exists in the new
- * interface.
- *
- * Scrambling is done by swapping every two consecutive bits, which is
- * non-identity one-to-one mapping which disturbs sort order sufficiently.
- */
-static pid_t pid_fry(pid_t pid)
-{
- unsigned a = pid & 0x55555555;
- unsigned b = pid & 0xAAAAAAAA;
-
- return (a << 1) | (b >> 1);
-}
-
-static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
-{
- if (cgroup_on_dfl(cgrp))
- return pid_fry(pid);
- else
- return pid;
-}
-
-static int cmppid(const void *a, const void *b)
-{
- return *(pid_t *)a - *(pid_t *)b;
-}
-
-static int fried_cmppid(const void *a, const void *b)
-{
- return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b);
-}
-
-static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
- enum cgroup_filetype type)
-{
- struct cgroup_pidlist *l;
- /* don't need task_nsproxy() if we're looking at ourself */
- struct pid_namespace *ns = task_active_pid_ns(current);
-
- lockdep_assert_held(&cgrp->pidlist_mutex);
-
- list_for_each_entry(l, &cgrp->pidlists, links)
- if (l->key.type == type && l->key.ns == ns)
- return l;
- return NULL;
-}
-
-/*
- * find the appropriate pidlist for our purpose (given procs vs tasks)
- * returns with the lock on that pidlist already held, and takes care
- * of the use count, or returns NULL with no locks held if we're out of
- * memory.
- */
-static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
- enum cgroup_filetype type)
-{
- struct cgroup_pidlist *l;
-
- lockdep_assert_held(&cgrp->pidlist_mutex);
-
- l = cgroup_pidlist_find(cgrp, type);
- if (l)
- return l;
-
- /* entry not found; create a new one */
- l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
- if (!l)
- return l;
-
- INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
- l->key.type = type;
- /* don't need task_nsproxy() if we're looking at ourself */
- l->key.ns = get_pid_ns(task_active_pid_ns(current));
- l->owner = cgrp;
- list_add(&l->links, &cgrp->pidlists);
- return l;
-}
-
-/*
- * Load a cgroup's pidarray with either procs' tgids or tasks' pids
- */
-static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
- struct cgroup_pidlist **lp)
-{
- pid_t *array;
- int length;
- int pid, n = 0; /* used for populating the array */
- struct css_task_iter it;
- struct task_struct *tsk;
- struct cgroup_pidlist *l;
-
- lockdep_assert_held(&cgrp->pidlist_mutex);
-
- /*
- * If cgroup gets more users after we read count, we won't have
- * enough space - tough. This race is indistinguishable to the
- * caller from the case that the additional cgroup users didn't
- * show up until sometime later on.
- */
- length = cgroup_task_count(cgrp);
- array = pidlist_allocate(length);
- if (!array)
- return -ENOMEM;
- /* now, populate the array */
- css_task_iter_start(&cgrp->self, &it);
- while ((tsk = css_task_iter_next(&it))) {
- if (unlikely(n == length))
- break;
- /* get tgid or pid for procs or tasks file respectively */
- if (type == CGROUP_FILE_PROCS)
- pid = task_tgid_vnr(tsk);
- else
- pid = task_pid_vnr(tsk);
- if (pid > 0) /* make sure to only use valid results */
- array[n++] = pid;
- }
- css_task_iter_end(&it);
- length = n;
- /* now sort & (if procs) strip out duplicates */
- if (cgroup_on_dfl(cgrp))
- sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
- else
- sort(array, length, sizeof(pid_t), cmppid, NULL);
- if (type == CGROUP_FILE_PROCS)
- length = pidlist_uniq(array, length);
-
- l = cgroup_pidlist_find_create(cgrp, type);
- if (!l) {
- pidlist_free(array);
- return -ENOMEM;
- }
-
- /* store array, freeing old if necessary */
- pidlist_free(l->list);
- l->list = array;
- l->length = length;
- *lp = l;
- return 0;
-}
-
-/**
- * cgroupstats_build - build and fill cgroupstats
- * @stats: cgroupstats to fill information into
- * @dentry: A dentry entry belonging to the cgroup for which stats have
- * been requested.
- *
- * Build and fill cgroupstats so that taskstats can export it to user
- * space.
- */
-int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
-{
- struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
- struct cgroup *cgrp;
- struct css_task_iter it;
- struct task_struct *tsk;
-
- /* it should be kernfs_node belonging to cgroupfs and is a directory */
- if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
- kernfs_type(kn) != KERNFS_DIR)
- return -EINVAL;
-
- mutex_lock(&cgroup_mutex);
-
- /*
- * We aren't being called from kernfs and there's no guarantee on
- * @kn->priv's validity. For this and css_tryget_online_from_dir(),
- * @kn->priv is RCU safe. Let's do the RCU dancing.
- */
- rcu_read_lock();
- cgrp = rcu_dereference(kn->priv);
- if (!cgrp || cgroup_is_dead(cgrp)) {
- rcu_read_unlock();
- mutex_unlock(&cgroup_mutex);
- return -ENOENT;
- }
- rcu_read_unlock();
-
- css_task_iter_start(&cgrp->self, &it);
- while ((tsk = css_task_iter_next(&it))) {
- switch (tsk->state) {
- case TASK_RUNNING:
- stats->nr_running++;
- break;
- case TASK_INTERRUPTIBLE:
- stats->nr_sleeping++;
- break;
- case TASK_UNINTERRUPTIBLE:
- stats->nr_uninterruptible++;
- break;
- case TASK_STOPPED:
- stats->nr_stopped++;
- break;
- default:
- if (delayacct_is_task_waiting_on_io(tsk))
- stats->nr_io_wait++;
- break;
- }
- }
- css_task_iter_end(&it);
-
- mutex_unlock(&cgroup_mutex);
- return 0;
-}
-
-
-/*
- * seq_file methods for the tasks/procs files. The seq_file position is the
- * next pid to display; the seq_file iterator is a pointer to the pid
- * in the cgroup->l->list array.
- */
-
-static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
-{
- /*
- * Initially we receive a position value that corresponds to
- * one more than the last pid shown (or 0 on the first call or
- * after a seek to the start). Use a binary-search to find the
- * next pid to display, if any
- */
- struct kernfs_open_file *of = s->private;
- struct cgroup *cgrp = seq_css(s)->cgroup;
- struct cgroup_pidlist *l;
- enum cgroup_filetype type = seq_cft(s)->private;
- int index = 0, pid = *pos;
- int *iter, ret;
-
- mutex_lock(&cgrp->pidlist_mutex);
-
- /*
- * !NULL @of->priv indicates that this isn't the first start()
- * after open. If the matching pidlist is around, we can use that.
- * Look for it. Note that @of->priv can't be used directly. It
- * could already have been destroyed.
- */
- if (of->priv)
- of->priv = cgroup_pidlist_find(cgrp, type);
-
- /*
- * Either this is the first start() after open or the matching
- * pidlist has been destroyed inbetween. Create a new one.
- */
- if (!of->priv) {
- ret = pidlist_array_load(cgrp, type,
- (struct cgroup_pidlist **)&of->priv);
- if (ret)
- return ERR_PTR(ret);
- }
- l = of->priv;
-
- if (pid) {
- int end = l->length;
-
- while (index < end) {
- int mid = (index + end) / 2;
- if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) {
- index = mid;
- break;
- } else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid)
- index = mid + 1;
- else
- end = mid;
- }
- }
- /* If we're off the end of the array, we're done */
- if (index >= l->length)
- return NULL;
- /* Update the abstract position to be the actual pid that we found */
- iter = l->list + index;
- *pos = cgroup_pid_fry(cgrp, *iter);
- return iter;
-}
-
-static void cgroup_pidlist_stop(struct seq_file *s, void *v)
-{
- struct kernfs_open_file *of = s->private;
- struct cgroup_pidlist *l = of->priv;
-
- if (l)
- mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
- CGROUP_PIDLIST_DESTROY_DELAY);
- mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
-}
-
-static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
-{
- struct kernfs_open_file *of = s->private;
- struct cgroup_pidlist *l = of->priv;
- pid_t *p = v;
- pid_t *end = l->list + l->length;
- /*
- * Advance to the next pid in the array. If this goes off the
- * end, we're done
- */
- p++;
- if (p >= end) {
- return NULL;
- } else {
- *pos = cgroup_pid_fry(seq_css(s)->cgroup, *p);
- return p;
- }
-}
-
-static int cgroup_pidlist_show(struct seq_file *s, void *v)
-{
- seq_printf(s, "%d\n", *(int *)v);
-
- return 0;
-}
-
-static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
- struct cftype *cft)
-{
- return notify_on_release(css->cgroup);
-}
-
-static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
- struct cftype *cft, u64 val)
-{
- if (val)
- set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
- else
- clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
- return 0;
-}
-
-static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
- struct cftype *cft)
-{
- return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
-}
-
-static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
- struct cftype *cft, u64 val)
-{
- if (val)
- set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
- else
- clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
- return 0;
-}
-
-/* cgroup core interface files for the default hierarchy */
-static struct cftype cgroup_dfl_base_files[] = {
- {
- .name = "cgroup.procs",
- .seq_start = cgroup_pidlist_start,
- .seq_next = cgroup_pidlist_next,
- .seq_stop = cgroup_pidlist_stop,
- .seq_show = cgroup_pidlist_show,
- .private = CGROUP_FILE_PROCS,
- .write = cgroup_procs_write,
- .mode = S_IRUGO | S_IWUSR,
- },
- {
- .name = "cgroup.controllers",
- .flags = CFTYPE_ONLY_ON_ROOT,
- .seq_show = cgroup_root_controllers_show,
- },
- {
- .name = "cgroup.controllers",
- .flags = CFTYPE_NOT_ON_ROOT,
- .seq_show = cgroup_controllers_show,
- },
- {
- .name = "cgroup.subtree_control",
- .seq_show = cgroup_subtree_control_show,
- .write = cgroup_subtree_control_write,
- },
- {
- .name = "cgroup.populated",
- .flags = CFTYPE_NOT_ON_ROOT,
- .seq_show = cgroup_populated_show,
- },
- { } /* terminate */
-};
-
-/* cgroup core interface files for the legacy hierarchies */
-static struct cftype cgroup_legacy_base_files[] = {
- {
- .name = "cgroup.procs",
- .seq_start = cgroup_pidlist_start,
- .seq_next = cgroup_pidlist_next,
- .seq_stop = cgroup_pidlist_stop,
- .seq_show = cgroup_pidlist_show,
- .private = CGROUP_FILE_PROCS,
- .write = cgroup_procs_write,
- .mode = S_IRUGO | S_IWUSR,
- },
- {
- .name = "cgroup.clone_children",
- .read_u64 = cgroup_clone_children_read,
- .write_u64 = cgroup_clone_children_write,
- },
- {
- .name = "cgroup.sane_behavior",
- .flags = CFTYPE_ONLY_ON_ROOT,
- .seq_show = cgroup_sane_behavior_show,
- },
- {
- .name = "tasks",
- .seq_start = cgroup_pidlist_start,
- .seq_next = cgroup_pidlist_next,
- .seq_stop = cgroup_pidlist_stop,
- .seq_show = cgroup_pidlist_show,
- .private = CGROUP_FILE_TASKS,
- .write = cgroup_tasks_write,
- .mode = S_IRUGO | S_IWUSR,
- },
- {
- .name = "notify_on_release",
- .read_u64 = cgroup_read_notify_on_release,
- .write_u64 = cgroup_write_notify_on_release,
- },
- {
- .name = "release_agent",
- .flags = CFTYPE_ONLY_ON_ROOT,
- .seq_show = cgroup_release_agent_show,
- .write = cgroup_release_agent_write,
- .max_write_len = PATH_MAX - 1,
- },
- { } /* terminate */
-};
-
-/**
- * cgroup_populate_dir - create subsys files in a cgroup directory
- * @cgrp: target cgroup
- * @subsys_mask: mask of the subsystem ids whose files should be added
- *
- * On failure, no file is added.
- */
-static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask)
-{
- struct cgroup_subsys *ss;
- int i, ret = 0;
-
- /* process cftsets of each subsystem */
- for_each_subsys(ss, i) {
- struct cftype *cfts;
-
- if (!(subsys_mask & (1 << i)))
- continue;
-
- list_for_each_entry(cfts, &ss->cfts, node) {
- ret = cgroup_addrm_files(cgrp, cfts, true);
- if (ret < 0)
- goto err;
- }
- }
- return 0;
-err:
- cgroup_clear_dir(cgrp, subsys_mask);
- return ret;
-}
-
-/*
- * css destruction is four-stage process.
- *
- * 1. Destruction starts. Killing of the percpu_ref is initiated.
- * Implemented in kill_css().
- *
- * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
- * and thus css_tryget_online() is guaranteed to fail, the css can be
- * offlined by invoking offline_css(). After offlining, the base ref is
- * put. Implemented in css_killed_work_fn().
- *
- * 3. When the percpu_ref reaches zero, the only possible remaining
- * accessors are inside RCU read sections. css_release() schedules the
- * RCU callback.
- *
- * 4. After the grace period, the css can be freed. Implemented in
- * css_free_work_fn().
- *
- * It is actually hairier because both step 2 and 4 require process context
- * and thus involve punting to css->destroy_work adding two additional
- * steps to the already complex sequence.
- */
-static void css_free_work_fn(struct work_struct *work)
-{
- struct cgroup_subsys_state *css =
- container_of(work, struct cgroup_subsys_state, destroy_work);
- struct cgroup_subsys *ss = css->ss;
- struct cgroup *cgrp = css->cgroup;
-
- percpu_ref_exit(&css->refcnt);
-
- if (ss) {
- /* css free path */
- int id = css->id;
-
- if (css->parent)
- css_put(css->parent);
-
- ss->css_free(css);
- cgroup_idr_remove(&ss->css_idr, id);
- cgroup_put(cgrp);
- } else {
- /* cgroup free path */
- atomic_dec(&cgrp->root->nr_cgrps);
- cgroup_pidlist_destroy_all(cgrp);
- cancel_work_sync(&cgrp->release_agent_work);
-
- if (cgroup_parent(cgrp)) {
- /*
- * We get a ref to the parent, and put the ref when
- * this cgroup is being freed, so it's guaranteed
- * that the parent won't be destroyed before its
- * children.
- */
- cgroup_put(cgroup_parent(cgrp));
- kernfs_put(cgrp->kn);
- kfree(cgrp);
- } else {
- /*
- * This is root cgroup's refcnt reaching zero,
- * which indicates that the root should be
- * released.
- */
- cgroup_destroy_root(cgrp->root);
- }
- }
-}
-
-static void css_free_rcu_fn(struct rcu_head *rcu_head)
-{
- struct cgroup_subsys_state *css =
- container_of(rcu_head, struct cgroup_subsys_state, rcu_head);
-
- INIT_WORK(&css->destroy_work, css_free_work_fn);
- queue_work(cgroup_destroy_wq, &css->destroy_work);
-}
-
-static void css_release_work_fn(struct work_struct *work)
-{
- struct cgroup_subsys_state *css =
- container_of(work, struct cgroup_subsys_state, destroy_work);
- struct cgroup_subsys *ss = css->ss;
- struct cgroup *cgrp = css->cgroup;
-
- mutex_lock(&cgroup_mutex);
-
- css->flags |= CSS_RELEASED;
- list_del_rcu(&css->sibling);
-
- if (ss) {
- /* css release path */
- cgroup_idr_replace(&ss->css_idr, NULL, css->id);
- if (ss->css_released)
- ss->css_released(css);
- } else {
- /* cgroup release path */
- cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
- cgrp->id = -1;
-
- /*
- * There are two control paths which try to determine
- * cgroup from dentry without going through kernfs -
- * cgroupstats_build() and css_tryget_online_from_dir().
- * Those are supported by RCU protecting clearing of
- * cgrp->kn->priv backpointer.
- */
- RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL);
- }
-
- mutex_unlock(&cgroup_mutex);
-
- call_rcu(&css->rcu_head, css_free_rcu_fn);
-}
-
-static void css_release(struct percpu_ref *ref)
-{
- struct cgroup_subsys_state *css =
- container_of(ref, struct cgroup_subsys_state, refcnt);
-
- INIT_WORK(&css->destroy_work, css_release_work_fn);
- queue_work(cgroup_destroy_wq, &css->destroy_work);
-}
-
-static void init_and_link_css(struct cgroup_subsys_state *css,
- struct cgroup_subsys *ss, struct cgroup *cgrp)
-{
- lockdep_assert_held(&cgroup_mutex);
-
- cgroup_get(cgrp);
-
- memset(css, 0, sizeof(*css));
- css->cgroup = cgrp;
- css->ss = ss;
- INIT_LIST_HEAD(&css->sibling);
- INIT_LIST_HEAD(&css->children);
- css->serial_nr = css_serial_nr_next++;
-
- if (cgroup_parent(cgrp)) {
- css->parent = cgroup_css(cgroup_parent(cgrp), ss);
- css_get(css->parent);
- }
-
- BUG_ON(cgroup_css(cgrp, ss));
-}
-
-/* invoke ->css_online() on a new CSS and mark it online if successful */
-static int online_css(struct cgroup_subsys_state *css)
-{
- struct cgroup_subsys *ss = css->ss;
- int ret = 0;
-
- lockdep_assert_held(&cgroup_mutex);
-
- if (ss->css_online)
- ret = ss->css_online(css);
- if (!ret) {
- css->flags |= CSS_ONLINE;
- rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
- }
- return ret;
-}
-
-/* if the CSS is online, invoke ->css_offline() on it and mark it offline */
-static void offline_css(struct cgroup_subsys_state *css)
-{
- struct cgroup_subsys *ss = css->ss;
-
- lockdep_assert_held(&cgroup_mutex);
-
- if (!(css->flags & CSS_ONLINE))
- return;
-
- if (ss->css_offline)
- ss->css_offline(css);
-
- css->flags &= ~CSS_ONLINE;
- RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);
-
- wake_up_all(&css->cgroup->offline_waitq);
-}
-
-/**
- * create_css - create a cgroup_subsys_state
- * @cgrp: the cgroup new css will be associated with
- * @ss: the subsys of new css
- * @visible: whether to create control knobs for the new css or not
- *
- * Create a new css associated with @cgrp - @ss pair. On success, the new
- * css is online and installed in @cgrp with all interface files created if
- * @visible. Returns 0 on success, -errno on failure.
- */
-static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
- bool visible)
-{
- struct cgroup *parent = cgroup_parent(cgrp);
- struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);
- struct cgroup_subsys_state *css;
- int err;
-
- lockdep_assert_held(&cgroup_mutex);
-
- css = ss->css_alloc(parent_css);
- if (IS_ERR(css))
- return PTR_ERR(css);
-
- init_and_link_css(css, ss, cgrp);
-
- err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL);
- if (err)
- goto err_free_css;
-
- err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_NOWAIT);
- if (err < 0)
- goto err_free_percpu_ref;
- css->id = err;
-
- if (visible) {
- err = cgroup_populate_dir(cgrp, 1 << ss->id);
- if (err)
- goto err_free_id;
- }
-
- /* @css is ready to be brought online now, make it visible */
- list_add_tail_rcu(&css->sibling, &parent_css->children);
- cgroup_idr_replace(&ss->css_idr, css, css->id);
-
- err = online_css(css);
- if (err)
- goto err_list_del;
-
- if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
- cgroup_parent(parent)) {
- pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
- current->comm, current->pid, ss->name);
- if (!strcmp(ss->name, "memory"))
- pr_warn("\"memory\" requires setting use_hierarchy to 1 on the root\n");
- ss->warned_broken_hierarchy = true;
- }
-
- return 0;
-
-err_list_del:
- list_del_rcu(&css->sibling);
- cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
-err_free_id:
- cgroup_idr_remove(&ss->css_idr, css->id);
-err_free_percpu_ref:
- percpu_ref_exit(&css->refcnt);
-err_free_css:
- call_rcu(&css->rcu_head, css_free_rcu_fn);
- return err;
-}
-
-static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
- umode_t mode)
-{
- struct cgroup *parent, *cgrp;
- struct cgroup_root *root;
- struct cgroup_subsys *ss;
- struct kernfs_node *kn;
- struct cftype *base_files;
- int ssid, ret;
-
- /* Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable.
- */
- if (strchr(name, '\n'))
- return -EINVAL;
-
- parent = cgroup_kn_lock_live(parent_kn);
- if (!parent)
- return -ENODEV;
- root = parent->root;
-
- /* allocate the cgroup and its ID, 0 is reserved for the root */
- cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
- if (!cgrp) {
- ret = -ENOMEM;
- goto out_unlock;
- }
-
- ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL);
- if (ret)
- goto out_free_cgrp;
-
- /*
- * Temporarily set the pointer to NULL, so idr_find() won't return
- * a half-baked cgroup.
- */
- cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_NOWAIT);
- if (cgrp->id < 0) {
- ret = -ENOMEM;
- goto out_cancel_ref;
- }
-
- init_cgroup_housekeeping(cgrp);
-
- cgrp->self.parent = &parent->self;
- cgrp->root = root;
-
- if (notify_on_release(parent))
- set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
-
- if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
- set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
-
- /* create the directory */
- kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
- if (IS_ERR(kn)) {
- ret = PTR_ERR(kn);
- goto out_free_id;
- }
- cgrp->kn = kn;
-
- /*
- * This extra ref will be put in cgroup_free_fn() and guarantees
- * that @cgrp->kn is always accessible.
- */
- kernfs_get(kn);
-
- cgrp->self.serial_nr = css_serial_nr_next++;
-
- /* allocation complete, commit to creation */
- list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children);
- atomic_inc(&root->nr_cgrps);
- cgroup_get(parent);
-
- /*
- * @cgrp is now fully operational. If something fails after this
- * point, it'll be released via the normal destruction path.
- */
- cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
-
- ret = cgroup_kn_set_ugid(kn);
- if (ret)
- goto out_destroy;
-
- if (cgroup_on_dfl(cgrp))
- base_files = cgroup_dfl_base_files;
- else
- base_files = cgroup_legacy_base_files;
-
- ret = cgroup_addrm_files(cgrp, base_files, true);
- if (ret)
- goto out_destroy;
-
- /* let's create and online css's */
- for_each_subsys(ss, ssid) {
- if (parent->child_subsys_mask & (1 << ssid)) {
- ret = create_css(cgrp, ss,
- parent->subtree_control & (1 << ssid));
- if (ret)
- goto out_destroy;
- }
- }
-
- /*
- * On the default hierarchy, a child doesn't automatically inherit
- * subtree_control from the parent. Each is configured manually.
- */
- if (!cgroup_on_dfl(cgrp)) {
- cgrp->subtree_control = parent->subtree_control;
- cgroup_refresh_child_subsys_mask(cgrp);
- }
-
- kernfs_activate(kn);
-
- ret = 0;
- goto out_unlock;
-
-out_free_id:
- cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
-out_cancel_ref:
- percpu_ref_exit(&cgrp->self.refcnt);
-out_free_cgrp:
- kfree(cgrp);
-out_unlock:
- cgroup_kn_unlock(parent_kn);
- return ret;
-
-out_destroy:
- cgroup_destroy_locked(cgrp);
- goto out_unlock;
-}
-
-/*
- * This is called when the refcnt of a css is confirmed to be killed.
- * css_tryget_online() is now guaranteed to fail. Tell the subsystem to
- * initate destruction and put the css ref from kill_css().
- */
-static void css_killed_work_fn(struct work_struct *work)
-{
- struct cgroup_subsys_state *css =
- container_of(work, struct cgroup_subsys_state, destroy_work);
-
- mutex_lock(&cgroup_mutex);
- offline_css(css);
- mutex_unlock(&cgroup_mutex);
-
- css_put(css);
-}
-
-/* css kill confirmation processing requires process context, bounce */
-static void css_killed_ref_fn(struct percpu_ref *ref)
-{
- struct cgroup_subsys_state *css =
- container_of(ref, struct cgroup_subsys_state, refcnt);
-
- INIT_WORK(&css->destroy_work, css_killed_work_fn);
- queue_work(cgroup_destroy_wq, &css->destroy_work);
-}
-
-/**
- * kill_css - destroy a css
- * @css: css to destroy
- *
- * This function initiates destruction of @css by removing cgroup interface
- * files and putting its base reference. ->css_offline() will be invoked
- * asynchronously once css_tryget_online() is guaranteed to fail and when
- * the reference count reaches zero, @css will be released.
- */
-static void kill_css(struct cgroup_subsys_state *css)
-{
- lockdep_assert_held(&cgroup_mutex);
-
- /*
- * This must happen before css is disassociated with its cgroup.
- * See seq_css() for details.
- */
- cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
-
- /*
- * Killing would put the base ref, but we need to keep it alive
- * until after ->css_offline().
- */
- css_get(css);
-
- /*
- * cgroup core guarantees that, by the time ->css_offline() is
- * invoked, no new css reference will be given out via
- * css_tryget_online(). We can't simply call percpu_ref_kill() and
- * proceed to offlining css's because percpu_ref_kill() doesn't
- * guarantee that the ref is seen as killed on all CPUs on return.
- *
- * Use percpu_ref_kill_and_confirm() to get notifications as each
- * css is confirmed to be seen as killed on all CPUs.
- */
- percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
-}
-
-/**
- * cgroup_destroy_locked - the first stage of cgroup destruction
- * @cgrp: cgroup to be destroyed
- *
- * css's make use of percpu refcnts whose killing latency shouldn't be
- * exposed to userland and are RCU protected. Also, cgroup core needs to
- * guarantee that css_tryget_online() won't succeed by the time
- * ->css_offline() is invoked. To satisfy all the requirements,
- * destruction is implemented in the following two steps.
- *
- * s1. Verify @cgrp can be destroyed and mark it dying. Remove all
- * userland visible parts and start killing the percpu refcnts of
- * css's. Set up so that the next stage will be kicked off once all
- * the percpu refcnts are confirmed to be killed.
- *
- * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
- * rest of destruction. Once all cgroup references are gone, the
- * cgroup is RCU-freed.
- *
- * This function implements s1. After this step, @cgrp is gone as far as
- * the userland is concerned and a new cgroup with the same name may be
- * created. As cgroup doesn't care about the names internally, this
- * doesn't cause any problem.
- */
-static int cgroup_destroy_locked(struct cgroup *cgrp)
- __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
-{
- struct cgroup_subsys_state *css;
- bool empty;
- int ssid;
-
- lockdep_assert_held(&cgroup_mutex);
-
- /*
- * css_set_rwsem synchronizes access to ->cset_links and prevents
- * @cgrp from being removed while put_css_set() is in progress.
- */
- down_read(&css_set_rwsem);
- empty = list_empty(&cgrp->cset_links);
- up_read(&css_set_rwsem);
- if (!empty)
- return -EBUSY;
-
- /*
- * Make sure there's no live children. We can't test emptiness of
- * ->self.children as dead children linger on it while being
- * drained; otherwise, "rmdir parent/child parent" may fail.
- */
- if (css_has_online_children(&cgrp->self))
- return -EBUSY;
-
- /*
- * Mark @cgrp dead. This prevents further task migration and child
- * creation by disabling cgroup_lock_live_group().
- */
- cgrp->self.flags &= ~CSS_ONLINE;
-
- /* initiate massacre of all css's */
- for_each_css(css, ssid, cgrp)
- kill_css(css);
-
- /*
- * Remove @cgrp directory along with the base files. @cgrp has an
- * extra ref on its kn.
- */
- kernfs_remove(cgrp->kn);
-
- check_for_release(cgroup_parent(cgrp));
-
- /* put the base reference */
- percpu_ref_kill(&cgrp->self.refcnt);
-
- return 0;
-};
-
-static int cgroup_rmdir(struct kernfs_node *kn)
-{
- struct cgroup *cgrp;
- int ret = 0;
-
- cgrp = cgroup_kn_lock_live(kn);
- if (!cgrp)
- return 0;
-
- ret = cgroup_destroy_locked(cgrp);
-
- cgroup_kn_unlock(kn);
- return ret;
-}
-
-static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
- .remount_fs = cgroup_remount,
- .show_options = cgroup_show_options,
- .mkdir = cgroup_mkdir,
- .rmdir = cgroup_rmdir,
- .rename = cgroup_rename,
-};
-
-static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
-{
- struct cgroup_subsys_state *css;
-
- printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
-
- mutex_lock(&cgroup_mutex);
-
- idr_init(&ss->css_idr);
- INIT_LIST_HEAD(&ss->cfts);
-
- /* Create the root cgroup state for this subsystem */
- ss->root = &cgrp_dfl_root;
- css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
- /* We don't handle early failures gracefully */
- BUG_ON(IS_ERR(css));
- init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);
-
- /*
- * Root csses are never destroyed and we can't initialize
- * percpu_ref during early init. Disable refcnting.
- */
- css->flags |= CSS_NO_REF;
-
- if (early) {
- /* allocation can't be done safely during early init */
- css->id = 1;
- } else {
- css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL);
- BUG_ON(css->id < 0);
- }
-
- /* Update the init_css_set to contain a subsys
- * pointer to this state - since the subsystem is
- * newly registered, all tasks and hence the
- * init_css_set is in the subsystem's root cgroup. */
- init_css_set.subsys[ss->id] = css;
-
- need_forkexit_callback |= ss->fork || ss->exit;
-
- /* At system boot, before all subsystems have been
- * registered, no tasks have been forked, so we don't
- * need to invoke fork callbacks here. */
- BUG_ON(!list_empty(&init_task.tasks));
-
- BUG_ON(online_css(css));
-
- mutex_unlock(&cgroup_mutex);
-}
-
-/**
- * cgroup_init_early - cgroup initialization at system boot
- *
- * Initialize cgroups at system boot, and initialize any
- * subsystems that request early init.
- */
-int __init cgroup_init_early(void)
-{
- static struct cgroup_sb_opts __initdata opts;
- struct cgroup_subsys *ss;
- int i;
-
- init_cgroup_root(&cgrp_dfl_root, &opts);
- cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF;
-
- RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
-
- for_each_subsys(ss, i) {
- WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
- "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p name:id=%d:%s\n",
- i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
- ss->id, ss->name);
- WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
- "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);
-
- ss->id = i;
- ss->name = cgroup_subsys_name[i];
-
- if (ss->early_init)
- cgroup_init_subsys(ss, true);
- }
- return 0;
-}
-
-/**
- * cgroup_init - cgroup initialization
- *
- * Register cgroup filesystem and /proc file, and initialize
- * any subsystems that didn't request early init.
- */
-int __init cgroup_init(void)
-{
- struct cgroup_subsys *ss;
- unsigned long key;
- int ssid, err;
-
- BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
- BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
-
- mutex_lock(&cgroup_mutex);
-
- /* Add init_css_set to the hash table */
- key = css_set_hash(init_css_set.subsys);
- hash_add(css_set_table, &init_css_set.hlist, key);
-
- BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
-
- mutex_unlock(&cgroup_mutex);
-
- for_each_subsys(ss, ssid) {
- if (ss->early_init) {
- struct cgroup_subsys_state *css =
- init_css_set.subsys[ss->id];
-
- css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2,
- GFP_KERNEL);
- BUG_ON(css->id < 0);
- } else {
- cgroup_init_subsys(ss, false);
- }
-
- list_add_tail(&init_css_set.e_cset_node[ssid],
- &cgrp_dfl_root.cgrp.e_csets[ssid]);
-
- /*
- * Setting dfl_root subsys_mask needs to consider the
- * disabled flag and cftype registration needs kmalloc,
- * both of which aren't available during early_init.
- */
- if (ss->disabled)
- continue;
-
- cgrp_dfl_root.subsys_mask |= 1 << ss->id;
-
- if (cgroup_legacy_files_on_dfl && !ss->dfl_cftypes)
- ss->dfl_cftypes = ss->legacy_cftypes;
-
- if (!ss->dfl_cftypes)
- cgrp_dfl_root_inhibit_ss_mask |= 1 << ss->id;
-
- if (ss->dfl_cftypes == ss->legacy_cftypes) {
- WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
- } else {
- WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes));
- WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes));
- }
-
- if (ss->bind)
- ss->bind(init_css_set.subsys[ssid]);
- }
-
- cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
- if (!cgroup_kobj)
- return -ENOMEM;
-
- err = register_filesystem(&cgroup_fs_type);
- if (err < 0) {
- kobject_put(cgroup_kobj);
- return err;
- }
-
- proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
- return 0;
-}
-
-static int __init cgroup_wq_init(void)
-{
- /*
- * There isn't much point in executing destruction path in
- * parallel. Good chunk is serialized with cgroup_mutex anyway.
- * Use 1 for @max_active.
- *
- * We would prefer to do this in cgroup_init() above, but that
- * is called before init_workqueues(): so leave this until after.
- */
- cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
- BUG_ON(!cgroup_destroy_wq);
-
- /*
- * Used to destroy pidlists and separate to serve as flush domain.
- * Cap @max_active to 1 too.
- */
- cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
- 0, 1);
- BUG_ON(!cgroup_pidlist_destroy_wq);
-
- return 0;
-}
-core_initcall(cgroup_wq_init);
-
-/*
- * proc_cgroup_show()
- * - Print task's cgroup paths into seq_file, one line for each hierarchy
- * - Used for /proc/<pid>/cgroup.
- */
-int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
- struct pid *pid, struct task_struct *tsk)
-{
- char *buf, *path;
- int retval;
- struct cgroup_root *root;
-
- retval = -ENOMEM;
- buf = kmalloc(PATH_MAX, GFP_KERNEL);
- if (!buf)
- goto out;
-
- mutex_lock(&cgroup_mutex);
- down_read(&css_set_rwsem);
-
- for_each_root(root) {
- struct cgroup_subsys *ss;
- struct cgroup *cgrp;
- int ssid, count = 0;
-
- if (root == &cgrp_dfl_root && !cgrp_dfl_root_visible)
- continue;
-
- seq_printf(m, "%d:", root->hierarchy_id);
- for_each_subsys(ss, ssid)
- if (root->subsys_mask & (1 << ssid))
- seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
- if (strlen(root->name))
- seq_printf(m, "%sname=%s", count ? "," : "",
- root->name);
- seq_putc(m, ':');
- cgrp = task_cgroup_from_root(tsk, root);
- path = cgroup_path(cgrp, buf, PATH_MAX);
- if (!path) {
- retval = -ENAMETOOLONG;
- goto out_unlock;
- }
- seq_puts(m, path);
- seq_putc(m, '\n');
- }
-
- retval = 0;
-out_unlock:
- up_read(&css_set_rwsem);
- mutex_unlock(&cgroup_mutex);
- kfree(buf);
-out:
- return retval;
-}
-
-/* Display information about each subsystem and each hierarchy */
-static int proc_cgroupstats_show(struct seq_file *m, void *v)
-{
- struct cgroup_subsys *ss;
- int i;
-
- seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
- /*
- * ideally we don't want subsystems moving around while we do this.
- * cgroup_mutex is also necessary to guarantee an atomic snapshot of
- * subsys/hierarchy state.
- */
- mutex_lock(&cgroup_mutex);
-
- for_each_subsys(ss, i)
- seq_printf(m, "%s\t%d\t%d\t%d\n",
- ss->name, ss->root->hierarchy_id,
- atomic_read(&ss->root->nr_cgrps), !ss->disabled);
-
- mutex_unlock(&cgroup_mutex);
- return 0;
-}
-
-static int cgroupstats_open(struct inode *inode, struct file *file)
-{
- return single_open(file, proc_cgroupstats_show, NULL);
-}
-
-static const struct file_operations proc_cgroupstats_operations = {
- .open = cgroupstats_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
-/**
- * cgroup_fork - initialize cgroup related fields during copy_process()
- * @child: pointer to task_struct of forking parent process.
- *
- * A task is associated with the init_css_set until cgroup_post_fork()
- * attaches it to the parent's css_set. Empty cg_list indicates that
- * @child isn't holding reference to its css_set.
- */
-void cgroup_fork(struct task_struct *child)
-{
- RCU_INIT_POINTER(child->cgroups, &init_css_set);
- INIT_LIST_HEAD(&child->cg_list);
-}
-
-/**
- * cgroup_post_fork - called on a new task after adding it to the task list
- * @child: the task in question
- *
- * Adds the task to the list running through its css_set if necessary and
- * call the subsystem fork() callbacks. Has to be after the task is
- * visible on the task list in case we race with the first call to
- * cgroup_task_iter_start() - to guarantee that the new task ends up on its
- * list.
- */
-void cgroup_post_fork(struct task_struct *child)
-{
- struct cgroup_subsys *ss;
- int i;
-
- /*
- * This may race against cgroup_enable_task_cg_lists(). As that
- * function sets use_task_css_set_links before grabbing
- * tasklist_lock and we just went through tasklist_lock to add
- * @child, it's guaranteed that either we see the set
- * use_task_css_set_links or cgroup_enable_task_cg_lists() sees
- * @child during its iteration.
- *
- * If we won the race, @child is associated with %current's
- * css_set. Grabbing css_set_rwsem guarantees both that the
- * association is stable, and, on completion of the parent's
- * migration, @child is visible in the source of migration or
- * already in the destination cgroup. This guarantee is necessary
- * when implementing operations which need to migrate all tasks of
- * a cgroup to another.
- *
- * Note that if we lose to cgroup_enable_task_cg_lists(), @child
- * will remain in init_css_set. This is safe because all tasks are
- * in the init_css_set before cg_links is enabled and there's no
- * operation which transfers all tasks out of init_css_set.
- */
- if (use_task_css_set_links) {
- struct css_set *cset;
-
- down_write(&css_set_rwsem);
- cset = task_css_set(current);
- if (list_empty(&child->cg_list)) {
- rcu_assign_pointer(child->cgroups, cset);
- list_add(&child->cg_list, &cset->tasks);
- get_css_set(cset);
- }
- up_write(&css_set_rwsem);
- }
-
- /*
- * Call ss->fork(). This must happen after @child is linked on
- * css_set; otherwise, @child might change state between ->fork()
- * and addition to css_set.
- */
- if (need_forkexit_callback) {
- for_each_subsys(ss, i)
- if (ss->fork)
- ss->fork(child);
- }
-}
-
-/**
- * cgroup_exit - detach cgroup from exiting task
- * @tsk: pointer to task_struct of exiting process
- *
- * Description: Detach cgroup from @tsk and release it.
- *
- * Note that cgroups marked notify_on_release force every task in
- * them to take the global cgroup_mutex mutex when exiting.
- * This could impact scaling on very large systems. Be reluctant to
- * use notify_on_release cgroups where very high task exit scaling
- * is required on large systems.
- *
- * We set the exiting tasks cgroup to the root cgroup (top_cgroup). We
- * call cgroup_exit() while the task is still competent to handle
- * notify_on_release(), then leave the task attached to the root cgroup in
- * each hierarchy for the remainder of its exit. No need to bother with
- * init_css_set refcnting. init_css_set never goes away and we can't race
- * with migration path - PF_EXITING is visible to migration path.
- */
-void cgroup_exit(struct task_struct *tsk)
-{
- struct cgroup_subsys *ss;
- struct css_set *cset;
- bool put_cset = false;
- int i;
-
- /*
- * Unlink from @tsk from its css_set. As migration path can't race
- * with us, we can check cg_list without grabbing css_set_rwsem.
- */
- if (!list_empty(&tsk->cg_list)) {
- down_write(&css_set_rwsem);
- list_del_init(&tsk->cg_list);
- up_write(&css_set_rwsem);
- put_cset = true;
- }
-
- /* Reassign the task to the init_css_set. */
- cset = task_css_set(tsk);
- RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
-
- if (need_forkexit_callback) {
- /* see cgroup_post_fork() for details */
- for_each_subsys(ss, i) {
- if (ss->exit) {
- struct cgroup_subsys_state *old_css = cset->subsys[i];
- struct cgroup_subsys_state *css = task_css(tsk, i);
-
- ss->exit(css, old_css, tsk);
- }
- }
- }
-
- if (put_cset)
- put_css_set(cset);
-}
-
-static void check_for_release(struct cgroup *cgrp)
-{
- if (notify_on_release(cgrp) && !cgroup_has_tasks(cgrp) &&
- !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
- schedule_work(&cgrp->release_agent_work);
-}
-
-/*
- * Notify userspace when a cgroup is released, by running the
- * configured release agent with the name of the cgroup (path
- * relative to the root of cgroup file system) as the argument.
- *
- * Most likely, this user command will try to rmdir this cgroup.
- *
- * This races with the possibility that some other task will be
- * attached to this cgroup before it is removed, or that some other
- * user task will 'mkdir' a child cgroup of this cgroup. That's ok.
- * The presumed 'rmdir' will fail quietly if this cgroup is no longer
- * unused, and this cgroup will be reprieved from its death sentence,
- * to continue to serve a useful existence. Next time it's released,
- * we will get notified again, if it still has 'notify_on_release' set.
- *
- * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
- * means only wait until the task is successfully execve()'d. The
- * separate release agent task is forked by call_usermodehelper(),
- * then control in this thread returns here, without waiting for the
- * release agent task. We don't bother to wait because the caller of
- * this routine has no use for the exit status of the release agent
- * task, so no sense holding our caller up for that.
- */
-static void cgroup_release_agent(struct work_struct *work)
-{
- struct cgroup *cgrp =
- container_of(work, struct cgroup, release_agent_work);
- char *pathbuf = NULL, *agentbuf = NULL, *path;
- char *argv[3], *envp[3];
-
- mutex_lock(&cgroup_mutex);
-
- pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
- agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
- if (!pathbuf || !agentbuf)
- goto out;
-
- path = cgroup_path(cgrp, pathbuf, PATH_MAX);
- if (!path)
- goto out;
-
- argv[0] = agentbuf;
- argv[1] = path;
- argv[2] = NULL;
-
- /* minimal command environment */
- envp[0] = "HOME=/";
- envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
- envp[2] = NULL;
-
- mutex_unlock(&cgroup_mutex);
- call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
- goto out_free;
-out:
- mutex_unlock(&cgroup_mutex);
-out_free:
- kfree(agentbuf);
- kfree(pathbuf);
-}
-
-static int __init cgroup_disable(char *str)
-{
- struct cgroup_subsys *ss;
- char *token;
- int i;
-
- while ((token = strsep(&str, ",")) != NULL) {
- if (!*token)
- continue;
-
- for_each_subsys(ss, i) {
- if (!strcmp(token, ss->name)) {
- ss->disabled = 1;
- printk(KERN_INFO "Disabling %s control group"
- " subsystem\n", ss->name);
- break;
- }
- }
- }
- return 1;
-}
-__setup("cgroup_disable=", cgroup_disable);
-
-static int __init cgroup_set_legacy_files_on_dfl(char *str)
-{
- printk("cgroup: using legacy files on the default hierarchy\n");
- cgroup_legacy_files_on_dfl = true;
- return 0;
-}
-__setup("cgroup__DEVEL__legacy_files_on_dfl", cgroup_set_legacy_files_on_dfl);
-
-/**
- * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
- * @dentry: directory dentry of interest
- * @ss: subsystem of interest
- *
- * If @dentry is a directory for a cgroup which has @ss enabled on it, try
- * to get the corresponding css and return it. If such css doesn't exist
- * or can't be pinned, an ERR_PTR value is returned.
- */
-struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
- struct cgroup_subsys *ss)
-{
- struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
- struct cgroup_subsys_state *css = NULL;
- struct cgroup *cgrp;
-
- /* is @dentry a cgroup dir? */
- if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
- kernfs_type(kn) != KERNFS_DIR)
- return ERR_PTR(-EBADF);
-
- rcu_read_lock();
-
- /*
- * This path doesn't originate from kernfs and @kn could already
- * have been or be removed at any point. @kn->priv is RCU
- * protected for this access. See css_release_work_fn() for details.
- */
- cgrp = rcu_dereference(kn->priv);
- if (cgrp)
- css = cgroup_css(cgrp, ss);
-
- if (!css || !css_tryget_online(css))
- css = ERR_PTR(-ENOENT);
-
- rcu_read_unlock();
- return css;
-}
-
-/**
- * css_from_id - lookup css by id
- * @id: the cgroup id
- * @ss: cgroup subsys to be looked into
- *
- * Returns the css if there's valid one with @id, otherwise returns NULL.
- * Should be called under rcu_read_lock().
- */
-struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
-{
- WARN_ON_ONCE(!rcu_read_lock_held());
- return id > 0 ? idr_find(&ss->css_idr, id) : NULL;
-}
-
-#ifdef CONFIG_CGROUP_DEBUG
-static struct cgroup_subsys_state *
-debug_css_alloc(struct cgroup_subsys_state *parent_css)
-{
- struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
-
- if (!css)
- return ERR_PTR(-ENOMEM);
-
- return css;
-}
-
-static void debug_css_free(struct cgroup_subsys_state *css)
-{
- kfree(css);
-}
-
-static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
- struct cftype *cft)
-{
- return cgroup_task_count(css->cgroup);
-}
-
-static u64 current_css_set_read(struct cgroup_subsys_state *css,
- struct cftype *cft)
-{
- return (u64)(unsigned long)current->cgroups;
-}
-
-static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
- struct cftype *cft)
-{
- u64 count;
-
- rcu_read_lock();
- count = atomic_read(&task_css_set(current)->refcount);
- rcu_read_unlock();
- return count;
-}
-
-static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
-{
- struct cgrp_cset_link *link;
- struct css_set *cset;
- char *name_buf;
-
- name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
- if (!name_buf)
- return -ENOMEM;
-
- down_read(&css_set_rwsem);
- rcu_read_lock();
- cset = rcu_dereference(current->cgroups);
- list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
- struct cgroup *c = link->cgrp;
-
- cgroup_name(c, name_buf, NAME_MAX + 1);
- seq_printf(seq, "Root %d group %s\n",
- c->root->hierarchy_id, name_buf);
- }
- rcu_read_unlock();
- up_read(&css_set_rwsem);
- kfree(name_buf);
- return 0;
-}
-
-#define MAX_TASKS_SHOWN_PER_CSS 25
-static int cgroup_css_links_read(struct seq_file *seq, void *v)
-{
- struct cgroup_subsys_state *css = seq_css(seq);
- struct cgrp_cset_link *link;
-
- down_read(&css_set_rwsem);
- list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
- struct css_set *cset = link->cset;
- struct task_struct *task;
- int count = 0;
-
- seq_printf(seq, "css_set %p\n", cset);
-
- list_for_each_entry(task, &cset->tasks, cg_list) {
- if (count++ > MAX_TASKS_SHOWN_PER_CSS)
- goto overflow;
- seq_printf(seq, " task %d\n", task_pid_vnr(task));
- }
-
- list_for_each_entry(task, &cset->mg_tasks, cg_list) {
- if (count++ > MAX_TASKS_SHOWN_PER_CSS)
- goto overflow;
- seq_printf(seq, " task %d\n", task_pid_vnr(task));
- }
- continue;
- overflow:
- seq_puts(seq, " ...\n");
- }
- up_read(&css_set_rwsem);
- return 0;
-}
-
-static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
-{
- return (!cgroup_has_tasks(css->cgroup) &&
- !css_has_online_children(&css->cgroup->self));
-}
-
-static struct cftype debug_files[] = {
- {
- .name = "taskcount",
- .read_u64 = debug_taskcount_read,
- },
-
- {
- .name = "current_css_set",
- .read_u64 = current_css_set_read,
- },
-
- {
- .name = "current_css_set_refcount",
- .read_u64 = current_css_set_refcount_read,
- },
-
- {
- .name = "current_css_set_cg_links",
- .seq_show = current_css_set_cg_links_read,
- },
-
- {
- .name = "cgroup_css_links",
- .seq_show = cgroup_css_links_read,
- },
-
- {
- .name = "releasable",
- .read_u64 = releasable_read,
- },
-
- { } /* terminate */
-};
-
-struct cgroup_subsys debug_cgrp_subsys = {
- .css_alloc = debug_css_alloc,
- .css_free = debug_css_free,
- .legacy_cftypes = debug_files,
-};
-#endif /* CONFIG_CGROUP_DEBUG */
diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile
new file mode 100644
index 000000000000..ede31601a363
--- /dev/null
+++ b/kernel/cgroup/Makefile
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-y := cgroup.o rstat.o namespace.o cgroup-v1.o freezer.o
+
+obj-$(CONFIG_CGROUP_FREEZER) += legacy_freezer.o
+obj-$(CONFIG_CGROUP_PIDS) += pids.o
+obj-$(CONFIG_CGROUP_RDMA) += rdma.o
+obj-$(CONFIG_CPUSETS) += cpuset.o
+obj-$(CONFIG_CPUSETS_V1) += cpuset-v1.o
+obj-$(CONFIG_CGROUP_MISC) += misc.o
+obj-$(CONFIG_CGROUP_DMEM) += dmem.o
+obj-$(CONFIG_CGROUP_DEBUG) += debug.o
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
new file mode 100644
index 000000000000..22051b4f1ccb
--- /dev/null
+++ b/kernel/cgroup/cgroup-internal.h
@@ -0,0 +1,302 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __CGROUP_INTERNAL_H
+#define __CGROUP_INTERNAL_H
+
+#include <linux/cgroup.h>
+#include <linux/kernfs.h>
+#include <linux/workqueue.h>
+#include <linux/list.h>
+#include <linux/refcount.h>
+#include <linux/fs_parser.h>
+
+#define TRACE_CGROUP_PATH_LEN 1024
+extern spinlock_t trace_cgroup_path_lock;
+extern char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
+extern void __init enable_debug_cgroup(void);
+
+/*
+ * cgroup_path() takes a spin lock. It is good practice not to take
+ * spin locks within trace point handlers, as they are mostly hidden
+ * from normal view. As cgroup_path() can take the kernfs_rename_lock
+ * spin lock, it is best to not call that function from the trace event
+ * handler.
+ *
+ * Note: trace_cgroup_##type##_enabled() is a static branch that will only
+ * be set when the trace event is enabled.
+ */
+#define TRACE_CGROUP_PATH(type, cgrp, ...) \
+ do { \
+ if (trace_cgroup_##type##_enabled()) { \
+ unsigned long flags; \
+ spin_lock_irqsave(&trace_cgroup_path_lock, \
+ flags); \
+ cgroup_path(cgrp, trace_cgroup_path, \
+ TRACE_CGROUP_PATH_LEN); \
+ trace_cgroup_##type(cgrp, trace_cgroup_path, \
+ ##__VA_ARGS__); \
+ spin_unlock_irqrestore(&trace_cgroup_path_lock, \
+ flags); \
+ } \
+ } while (0)
+
+/*
+ * The cgroup filesystem superblock creation/mount context.
+ */
+struct cgroup_fs_context {
+ struct kernfs_fs_context kfc;
+ struct cgroup_root *root;
+ struct cgroup_namespace *ns;
+ unsigned int flags; /* CGRP_ROOT_* flags */
+
+ /* cgroup1 bits */
+ bool cpuset_clone_children;
+ bool none; /* User explicitly requested empty subsystem */
+ bool all_ss; /* Seen 'all' option */
+ u16 subsys_mask; /* Selected subsystems */
+ char *name; /* Hierarchy name */
+ char *release_agent; /* Path for release notifications */
+};
+
+static inline struct cgroup_fs_context *cgroup_fc2context(struct fs_context *fc)
+{
+ struct kernfs_fs_context *kfc = fc->fs_private;
+
+ return container_of(kfc, struct cgroup_fs_context, kfc);
+}
+
+struct cgroup_pidlist;
+
+struct cgroup_file_ctx {
+ struct cgroup_namespace *ns;
+
+ struct {
+ void *trigger;
+ } psi;
+
+ struct {
+ bool started;
+ struct css_task_iter iter;
+ } procs;
+
+ struct {
+ struct cgroup_pidlist *pidlist;
+ } procs1;
+
+ struct cgroup_of_peak peak;
+};
+
+/*
+ * A cgroup can be associated with multiple css_sets as different tasks may
+ * belong to different cgroups on different hierarchies. In the other
+ * direction, a css_set is naturally associated with multiple cgroups.
+ * This M:N relationship is represented by the following link structure
+ * which exists for each association and allows traversing the associations
+ * from both sides.
+ */
+struct cgrp_cset_link {
+ /* the cgroup and css_set this link associates */
+ struct cgroup *cgrp;
+ struct css_set *cset;
+
+ /* list of cgrp_cset_links anchored at cgrp->cset_links */
+ struct list_head cset_link;
+
+ /* list of cgrp_cset_links anchored at css_set->cgrp_links */
+ struct list_head cgrp_link;
+};
+
+/* used to track tasks and csets during migration */
+struct cgroup_taskset {
+ /* the src and dst cset list running through cset->mg_node */
+ struct list_head src_csets;
+ struct list_head dst_csets;
+
+ /* the number of tasks in the set */
+ int nr_tasks;
+
+ /* the subsys currently being processed */
+ int ssid;
+
+ /*
+ * Fields for cgroup_taskset_*() iteration.
+ *
+ * Before migration is committed, the target migration tasks are on
+ * ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of
+ * the csets on ->dst_csets. ->csets point to either ->src_csets
+ * or ->dst_csets depending on whether migration is committed.
+ *
+ * ->cur_csets and ->cur_task point to the current task position
+ * during iteration.
+ */
+ struct list_head *csets;
+ struct css_set *cur_cset;
+ struct task_struct *cur_task;
+};
+
+/* migration context also tracks preloading */
+struct cgroup_mgctx {
+ /*
+ * Preloaded source and destination csets. Used to guarantee
+ * atomic success or failure on actual migration.
+ */
+ struct list_head preloaded_src_csets;
+ struct list_head preloaded_dst_csets;
+
+ /* tasks and csets to migrate */
+ struct cgroup_taskset tset;
+
+ /* subsystems affected by migration */
+ u16 ss_mask;
+};
+
+#define CGROUP_TASKSET_INIT(tset) \
+{ \
+ .src_csets = LIST_HEAD_INIT(tset.src_csets), \
+ .dst_csets = LIST_HEAD_INIT(tset.dst_csets), \
+ .csets = &tset.src_csets, \
+}
+
+#define CGROUP_MGCTX_INIT(name) \
+{ \
+ LIST_HEAD_INIT(name.preloaded_src_csets), \
+ LIST_HEAD_INIT(name.preloaded_dst_csets), \
+ CGROUP_TASKSET_INIT(name.tset), \
+}
+
+#define DEFINE_CGROUP_MGCTX(name) \
+ struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
+
+extern struct cgroup_subsys *cgroup_subsys[];
+extern struct list_head cgroup_roots;
+extern bool cgrp_dfl_visible;
+
+/* iterate across the hierarchies */
+#define for_each_root(root) \
+ list_for_each_entry_rcu((root), &cgroup_roots, root_list, \
+ lockdep_is_held(&cgroup_mutex))
+
+/**
+ * for_each_subsys - iterate all enabled cgroup subsystems
+ * @ss: the iteration cursor
+ * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
+ */
+#define for_each_subsys(ss, ssid) \
+ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \
+ (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
+
+static inline bool cgroup_is_dead(const struct cgroup *cgrp)
+{
+ return !(cgrp->self.flags & CSS_ONLINE);
+}
+
+static inline bool notify_on_release(const struct cgroup *cgrp)
+{
+ return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
+}
+
+void put_css_set_locked(struct css_set *cset);
+
+static inline void put_css_set(struct css_set *cset)
+{
+ unsigned long flags;
+
+ /*
+ * Ensure that the refcount doesn't hit zero while any readers
+ * can see it. Similar to atomic_dec_and_lock(), but for an
+ * rwlock
+ */
+ if (refcount_dec_not_one(&cset->refcount))
+ return;
+
+ spin_lock_irqsave(&css_set_lock, flags);
+ put_css_set_locked(cset);
+ spin_unlock_irqrestore(&css_set_lock, flags);
+}
+
+/*
+ * refcounted get/put for css_set objects
+ */
+static inline void get_css_set(struct css_set *cset)
+{
+ refcount_inc(&cset->refcount);
+}
+
+bool cgroup_ssid_enabled(int ssid);
+bool cgroup_on_dfl(const struct cgroup *cgrp);
+
+struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root);
+struct cgroup *task_cgroup_from_root(struct task_struct *task,
+ struct cgroup_root *root);
+struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline);
+void cgroup_kn_unlock(struct kernfs_node *kn);
+int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
+ struct cgroup_namespace *ns);
+
+void cgroup_favor_dynmods(struct cgroup_root *root, bool favor);
+void cgroup_free_root(struct cgroup_root *root);
+void init_cgroup_root(struct cgroup_fs_context *ctx);
+int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask);
+int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
+int cgroup_do_get_tree(struct fs_context *fc);
+
+int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp);
+void cgroup_migrate_finish(struct cgroup_mgctx *mgctx);
+void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp,
+ struct cgroup_mgctx *mgctx);
+int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx);
+int cgroup_migrate(struct task_struct *leader, bool threadgroup,
+ struct cgroup_mgctx *mgctx);
+
+int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
+ bool threadgroup);
+void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode,
+ struct task_struct *tsk);
+void cgroup_attach_unlock(enum cgroup_attach_lock_mode lock_mode,
+ struct task_struct *tsk);
+struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
+ enum cgroup_attach_lock_mode *lock_mode)
+ __acquires(&cgroup_threadgroup_rwsem);
+void cgroup_procs_write_finish(struct task_struct *task,
+ enum cgroup_attach_lock_mode lock_mode)
+ __releases(&cgroup_threadgroup_rwsem);
+
+void cgroup_lock_and_drain_offline(struct cgroup *cgrp);
+
+int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode);
+int cgroup_rmdir(struct kernfs_node *kn);
+int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
+ struct kernfs_root *kf_root);
+
+int __cgroup_task_count(const struct cgroup *cgrp);
+int cgroup_task_count(const struct cgroup *cgrp);
+
+/*
+ * rstat.c
+ */
+int css_rstat_init(struct cgroup_subsys_state *css);
+void css_rstat_exit(struct cgroup_subsys_state *css);
+int ss_rstat_init(struct cgroup_subsys *ss);
+void cgroup_base_stat_cputime_show(struct seq_file *seq);
+
+/*
+ * namespace.c
+ */
+extern const struct proc_ns_operations cgroupns_operations;
+
+/*
+ * cgroup-v1.c
+ */
+extern struct cftype cgroup1_base_files[];
+extern struct kernfs_syscall_ops cgroup1_kf_syscall_ops;
+extern const struct fs_parameter_spec cgroup1_fs_parameters[];
+
+int proc_cgroupstats_show(struct seq_file *m, void *v);
+bool cgroup1_ssid_disabled(int ssid);
+void cgroup1_pidlist_destroy_all(struct cgroup *cgrp);
+void cgroup1_release_agent(struct work_struct *work);
+void cgroup1_check_for_release(struct cgroup *cgrp);
+int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param);
+int cgroup1_get_tree(struct fs_context *fc);
+int cgroup1_reconfigure(struct fs_context *ctx);
+
+#endif /* __CGROUP_INTERNAL_H */
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
new file mode 100644
index 000000000000..a9e029b570c8
--- /dev/null
+++ b/kernel/cgroup/cgroup-v1.c
@@ -0,0 +1,1372 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include "cgroup-internal.h"
+
+#include <linux/ctype.h>
+#include <linux/kmod.h>
+#include <linux/sort.h>
+#include <linux/delay.h>
+#include <linux/mm.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/task.h>
+#include <linux/magic.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/vmalloc.h>
+#include <linux/delayacct.h>
+#include <linux/pid_namespace.h>
+#include <linux/cgroupstats.h>
+#include <linux/fs_parser.h>
+
+#include <trace/events/cgroup.h>
+
+/*
+ * pidlists linger the following amount before being destroyed. The goal
+ * is avoiding frequent destruction in the middle of consecutive read calls
+ * Expiring in the middle is a performance problem not a correctness one.
+ * 1 sec should be enough.
+ */
+#define CGROUP_PIDLIST_DESTROY_DELAY HZ
+
+/* Controllers blocked by the commandline in v1 */
+static u16 cgroup_no_v1_mask;
+
+/* disable named v1 mounts */
+static bool cgroup_no_v1_named;
+
+/* Show unavailable controllers in /proc/cgroups */
+static bool proc_show_all;
+
+/*
+ * pidlist destructions need to be flushed on cgroup destruction. Use a
+ * separate workqueue as flush domain.
+ */
+static struct workqueue_struct *cgroup_pidlist_destroy_wq;
+
+/* protects cgroup_subsys->release_agent_path */
+static DEFINE_SPINLOCK(release_agent_path_lock);
+
+bool cgroup1_ssid_disabled(int ssid)
+{
+ return cgroup_no_v1_mask & (1 << ssid);
+}
+
+static bool cgroup1_subsys_absent(struct cgroup_subsys *ss)
+{
+ /* Check also dfl_cftypes for file-less controllers, i.e. perf_event */
+ return ss->legacy_cftypes == NULL && ss->dfl_cftypes;
+}
+
+/**
+ * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
+ * @from: attach to all cgroups of a given task
+ * @tsk: the task to be attached
+ *
+ * Return: %0 on success or a negative errno code on failure
+ */
+int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
+{
+ struct cgroup_root *root;
+ int retval = 0;
+
+ cgroup_lock();
+ cgroup_attach_lock(CGRP_ATTACH_LOCK_GLOBAL, NULL);
+ for_each_root(root) {
+ struct cgroup *from_cgrp;
+
+ spin_lock_irq(&css_set_lock);
+ from_cgrp = task_cgroup_from_root(from, root);
+ spin_unlock_irq(&css_set_lock);
+
+ retval = cgroup_attach_task(from_cgrp, tsk, false);
+ if (retval)
+ break;
+ }
+ cgroup_attach_unlock(CGRP_ATTACH_LOCK_GLOBAL, NULL);
+ cgroup_unlock();
+
+ return retval;
+}
+EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
+
+/**
+ * cgroup_transfer_tasks - move tasks from one cgroup to another
+ * @to: cgroup to which the tasks will be moved
+ * @from: cgroup in which the tasks currently reside
+ *
+ * Locking rules between cgroup_post_fork() and the migration path
+ * guarantee that, if a task is forking while being migrated, the new child
+ * is guaranteed to be either visible in the source cgroup after the
+ * parent's migration is complete or put into the target cgroup. No task
+ * can slip out of migration through forking.
+ *
+ * Return: %0 on success or a negative errno code on failure
+ */
+int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
+{
+ DEFINE_CGROUP_MGCTX(mgctx);
+ struct cgrp_cset_link *link;
+ struct css_task_iter it;
+ struct task_struct *task;
+ int ret;
+
+ if (cgroup_on_dfl(to))
+ return -EINVAL;
+
+ ret = cgroup_migrate_vet_dst(to);
+ if (ret)
+ return ret;
+
+ cgroup_lock();
+
+ cgroup_attach_lock(CGRP_ATTACH_LOCK_GLOBAL, NULL);
+
+ /* all tasks in @from are being moved, all csets are source */
+ spin_lock_irq(&css_set_lock);
+ list_for_each_entry(link, &from->cset_links, cset_link)
+ cgroup_migrate_add_src(link->cset, to, &mgctx);
+ spin_unlock_irq(&css_set_lock);
+
+ ret = cgroup_migrate_prepare_dst(&mgctx);
+ if (ret)
+ goto out_err;
+
+ /*
+ * Migrate tasks one-by-one until @from is empty. This fails iff
+ * ->can_attach() fails.
+ */
+ do {
+ css_task_iter_start(&from->self, 0, &it);
+
+ do {
+ task = css_task_iter_next(&it);
+ } while (task && (task->flags & PF_EXITING));
+
+ if (task)
+ get_task_struct(task);
+ css_task_iter_end(&it);
+
+ if (task) {
+ ret = cgroup_migrate(task, false, &mgctx);
+ if (!ret)
+ TRACE_CGROUP_PATH(transfer_tasks, to, task, false);
+ put_task_struct(task);
+ }
+ } while (task && !ret);
+out_err:
+ cgroup_migrate_finish(&mgctx);
+ cgroup_attach_unlock(CGRP_ATTACH_LOCK_GLOBAL, NULL);
+ cgroup_unlock();
+ return ret;
+}
+
+/*
+ * Stuff for reading the 'tasks'/'procs' files.
+ *
+ * Reading this file can return large amounts of data if a cgroup has
+ * *lots* of attached tasks. So it may need several calls to read(),
+ * but we cannot guarantee that the information we produce is correct
+ * unless we produce it entirely atomically.
+ *
+ */
+
+/* which pidlist file are we talking about? */
+enum cgroup_filetype {
+ CGROUP_FILE_PROCS,
+ CGROUP_FILE_TASKS,
+};
+
+/*
+ * A pidlist is a list of pids that virtually represents the contents of one
+ * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
+ * a pair (one each for procs, tasks) for each pid namespace that's relevant
+ * to the cgroup.
+ */
+struct cgroup_pidlist {
+ /*
+ * used to find which pidlist is wanted. doesn't change as long as
+ * this particular list stays in the list.
+ */
+ struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
+ /* array of xids */
+ pid_t *list;
+ /* how many elements the above list has */
+ int length;
+ /* each of these stored in a list by its cgroup */
+ struct list_head links;
+ /* pointer to the cgroup we belong to, for list removal purposes */
+ struct cgroup *owner;
+ /* for delayed destruction */
+ struct delayed_work destroy_dwork;
+};
+
+/*
+ * Used to destroy all pidlists lingering waiting for destroy timer. None
+ * should be left afterwards.
+ */
+void cgroup1_pidlist_destroy_all(struct cgroup *cgrp)
+{
+ struct cgroup_pidlist *l, *tmp_l;
+
+ mutex_lock(&cgrp->pidlist_mutex);
+ list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
+ mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
+ mutex_unlock(&cgrp->pidlist_mutex);
+
+ flush_workqueue(cgroup_pidlist_destroy_wq);
+ BUG_ON(!list_empty(&cgrp->pidlists));
+}
+
+static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
+{
+ struct delayed_work *dwork = to_delayed_work(work);
+ struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
+ destroy_dwork);
+ struct cgroup_pidlist *tofree = NULL;
+
+ mutex_lock(&l->owner->pidlist_mutex);
+
+ /*
+ * Destroy iff we didn't get queued again. The state won't change
+ * as destroy_dwork can only be queued while locked.
+ */
+ if (!delayed_work_pending(dwork)) {
+ list_del(&l->links);
+ kvfree(l->list);
+ put_pid_ns(l->key.ns);
+ tofree = l;
+ }
+
+ mutex_unlock(&l->owner->pidlist_mutex);
+ kfree(tofree);
+}
+
+/*
+ * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
+ * Returns the number of unique elements.
+ */
+static int pidlist_uniq(pid_t *list, int length)
+{
+ int src, dest = 1;
+
+ /*
+ * we presume the 0th element is unique, so i starts at 1. trivial
+ * edge cases first; no work needs to be done for either
+ */
+ if (length == 0 || length == 1)
+ return length;
+ /* src and dest walk down the list; dest counts unique elements */
+ for (src = 1; src < length; src++) {
+ /* find next unique element */
+ while (list[src] == list[src-1]) {
+ src++;
+ if (src == length)
+ goto after;
+ }
+ /* dest always points to where the next unique element goes */
+ list[dest] = list[src];
+ dest++;
+ }
+after:
+ return dest;
+}
+
+/*
+ * The two pid files - task and cgroup.procs - guaranteed that the result
+ * is sorted, which forced this whole pidlist fiasco. As pid order is
+ * different per namespace, each namespace needs differently sorted list,
+ * making it impossible to use, for example, single rbtree of member tasks
+ * sorted by task pointer. As pidlists can be fairly large, allocating one
+ * per open file is dangerous, so cgroup had to implement shared pool of
+ * pidlists keyed by cgroup and namespace.
+ */
+static int cmppid(const void *a, const void *b)
+{
+ return *(pid_t *)a - *(pid_t *)b;
+}
+
+static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
+ enum cgroup_filetype type)
+{
+ struct cgroup_pidlist *l;
+ /* don't need task_nsproxy() if we're looking at ourself */
+ struct pid_namespace *ns = task_active_pid_ns(current);
+
+ lockdep_assert_held(&cgrp->pidlist_mutex);
+
+ list_for_each_entry(l, &cgrp->pidlists, links)
+ if (l->key.type == type && l->key.ns == ns)
+ return l;
+ return NULL;
+}
+
+/*
+ * find the appropriate pidlist for our purpose (given procs vs tasks)
+ * returns with the lock on that pidlist already held, and takes care
+ * of the use count, or returns NULL with no locks held if we're out of
+ * memory.
+ */
+static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
+ enum cgroup_filetype type)
+{
+ struct cgroup_pidlist *l;
+
+ lockdep_assert_held(&cgrp->pidlist_mutex);
+
+ l = cgroup_pidlist_find(cgrp, type);
+ if (l)
+ return l;
+
+ /* entry not found; create a new one */
+ l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
+ if (!l)
+ return l;
+
+ INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
+ l->key.type = type;
+ /* don't need task_nsproxy() if we're looking at ourself */
+ l->key.ns = get_pid_ns(task_active_pid_ns(current));
+ l->owner = cgrp;
+ list_add(&l->links, &cgrp->pidlists);
+ return l;
+}
+
+/*
+ * Load a cgroup's pidarray with either procs' tgids or tasks' pids
+ */
+static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
+ struct cgroup_pidlist **lp)
+{
+ pid_t *array;
+ int length;
+ int pid, n = 0; /* used for populating the array */
+ struct css_task_iter it;
+ struct task_struct *tsk;
+ struct cgroup_pidlist *l;
+
+ lockdep_assert_held(&cgrp->pidlist_mutex);
+
+ /*
+ * If cgroup gets more users after we read count, we won't have
+ * enough space - tough. This race is indistinguishable to the
+ * caller from the case that the additional cgroup users didn't
+ * show up until sometime later on.
+ */
+ length = cgroup_task_count(cgrp);
+ array = kvmalloc_array(length, sizeof(pid_t), GFP_KERNEL);
+ if (!array)
+ return -ENOMEM;
+ /* now, populate the array */
+ css_task_iter_start(&cgrp->self, 0, &it);
+ while ((tsk = css_task_iter_next(&it))) {
+ if (unlikely(n == length))
+ break;
+ /* get tgid or pid for procs or tasks file respectively */
+ if (type == CGROUP_FILE_PROCS)
+ pid = task_tgid_vnr(tsk);
+ else
+ pid = task_pid_vnr(tsk);
+ if (pid > 0) /* make sure to only use valid results */
+ array[n++] = pid;
+ }
+ css_task_iter_end(&it);
+ length = n;
+ /* now sort & strip out duplicates (tgids or recycled thread PIDs) */
+ sort(array, length, sizeof(pid_t), cmppid, NULL);
+ length = pidlist_uniq(array, length);
+
+ l = cgroup_pidlist_find_create(cgrp, type);
+ if (!l) {
+ kvfree(array);
+ return -ENOMEM;
+ }
+
+ /* store array, freeing old if necessary */
+ kvfree(l->list);
+ l->list = array;
+ l->length = length;
+ *lp = l;
+ return 0;
+}
+
+/*
+ * seq_file methods for the tasks/procs files. The seq_file position is the
+ * next pid to display; the seq_file iterator is a pointer to the pid
+ * in the cgroup->l->list array.
+ */
+
+static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
+{
+ /*
+ * Initially we receive a position value that corresponds to
+ * one more than the last pid shown (or 0 on the first call or
+ * after a seek to the start). Use a binary-search to find the
+ * next pid to display, if any
+ */
+ struct kernfs_open_file *of = s->private;
+ struct cgroup_file_ctx *ctx = of->priv;
+ struct cgroup *cgrp = seq_css(s)->cgroup;
+ struct cgroup_pidlist *l;
+ enum cgroup_filetype type = seq_cft(s)->private;
+ int index = 0, pid = *pos;
+ int *iter, ret;
+
+ mutex_lock(&cgrp->pidlist_mutex);
+
+ /*
+ * !NULL @ctx->procs1.pidlist indicates that this isn't the first
+ * start() after open. If the matching pidlist is around, we can use
+ * that. Look for it. Note that @ctx->procs1.pidlist can't be used
+ * directly. It could already have been destroyed.
+ */
+ if (ctx->procs1.pidlist)
+ ctx->procs1.pidlist = cgroup_pidlist_find(cgrp, type);
+
+ /*
+ * Either this is the first start() after open or the matching
+ * pidlist has been destroyed inbetween. Create a new one.
+ */
+ if (!ctx->procs1.pidlist) {
+ ret = pidlist_array_load(cgrp, type, &ctx->procs1.pidlist);
+ if (ret)
+ return ERR_PTR(ret);
+ }
+ l = ctx->procs1.pidlist;
+
+ if (pid) {
+ int end = l->length;
+
+ while (index < end) {
+ int mid = (index + end) / 2;
+ if (l->list[mid] == pid) {
+ index = mid;
+ break;
+ } else if (l->list[mid] < pid)
+ index = mid + 1;
+ else
+ end = mid;
+ }
+ }
+ /* If we're off the end of the array, we're done */
+ if (index >= l->length)
+ return NULL;
+ /* Update the abstract position to be the actual pid that we found */
+ iter = l->list + index;
+ *pos = *iter;
+ return iter;
+}
+
+static void cgroup_pidlist_stop(struct seq_file *s, void *v)
+{
+ struct kernfs_open_file *of = s->private;
+ struct cgroup_file_ctx *ctx = of->priv;
+ struct cgroup_pidlist *l = ctx->procs1.pidlist;
+
+ if (l)
+ mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
+ CGROUP_PIDLIST_DESTROY_DELAY);
+ mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
+}
+
+static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
+{
+ struct kernfs_open_file *of = s->private;
+ struct cgroup_file_ctx *ctx = of->priv;
+ struct cgroup_pidlist *l = ctx->procs1.pidlist;
+ pid_t *p = v;
+ pid_t *end = l->list + l->length;
+ /*
+ * Advance to the next pid in the array. If this goes off the
+ * end, we're done
+ */
+ p++;
+ if (p >= end) {
+ (*pos)++;
+ return NULL;
+ } else {
+ *pos = *p;
+ return p;
+ }
+}
+
+static int cgroup_pidlist_show(struct seq_file *s, void *v)
+{
+ seq_printf(s, "%d\n", *(int *)v);
+
+ return 0;
+}
+
+static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off,
+ bool threadgroup)
+{
+ struct cgroup *cgrp;
+ struct task_struct *task;
+ const struct cred *cred, *tcred;
+ ssize_t ret;
+ enum cgroup_attach_lock_mode lock_mode;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENODEV;
+
+ task = cgroup_procs_write_start(buf, threadgroup, &lock_mode);
+ ret = PTR_ERR_OR_ZERO(task);
+ if (ret)
+ goto out_unlock;
+
+ /*
+ * Even if we're attaching all tasks in the thread group, we only need
+ * to check permissions on one of them. Check permissions using the
+ * credentials from file open to protect against inherited fd attacks.
+ */
+ cred = of->file->f_cred;
+ tcred = get_task_cred(task);
+ if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
+ !uid_eq(cred->euid, tcred->uid) &&
+ !uid_eq(cred->euid, tcred->suid))
+ ret = -EACCES;
+ put_cred(tcred);
+ if (ret)
+ goto out_finish;
+
+ ret = cgroup_attach_task(cgrp, task, threadgroup);
+
+out_finish:
+ cgroup_procs_write_finish(task, lock_mode);
+out_unlock:
+ cgroup_kn_unlock(of->kn);
+
+ return ret ?: nbytes;
+}
+
+static ssize_t cgroup1_procs_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ return __cgroup1_procs_write(of, buf, nbytes, off, true);
+}
+
+static ssize_t cgroup1_tasks_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ return __cgroup1_procs_write(of, buf, nbytes, off, false);
+}
+
+static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct cgroup *cgrp;
+ struct cgroup_file_ctx *ctx;
+
+ BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
+
+ /*
+ * Release agent gets called with all capabilities,
+ * require capabilities to set release agent.
+ */
+ ctx = of->priv;
+ if ((ctx->ns->user_ns != &init_user_ns) ||
+ !file_ns_capable(of->file, &init_user_ns, CAP_SYS_ADMIN))
+ return -EPERM;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENODEV;
+ spin_lock(&release_agent_path_lock);
+ strscpy(cgrp->root->release_agent_path, strstrip(buf),
+ sizeof(cgrp->root->release_agent_path));
+ spin_unlock(&release_agent_path_lock);
+ cgroup_kn_unlock(of->kn);
+ return nbytes;
+}
+
+static int cgroup_release_agent_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+
+ spin_lock(&release_agent_path_lock);
+ seq_puts(seq, cgrp->root->release_agent_path);
+ spin_unlock(&release_agent_path_lock);
+ seq_putc(seq, '\n');
+ return 0;
+}
+
+static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
+{
+ seq_puts(seq, "0\n");
+ return 0;
+}
+
+static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return notify_on_release(css->cgroup);
+}
+
+static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
+ struct cftype *cft, u64 val)
+{
+ if (val)
+ set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
+ else
+ clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
+ return 0;
+}
+
+static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
+}
+
+static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
+ struct cftype *cft, u64 val)
+{
+ if (val)
+ set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
+ else
+ clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
+ return 0;
+}
+
+/* cgroup core interface files for the legacy hierarchies */
+struct cftype cgroup1_base_files[] = {
+ {
+ .name = "cgroup.procs",
+ .seq_start = cgroup_pidlist_start,
+ .seq_next = cgroup_pidlist_next,
+ .seq_stop = cgroup_pidlist_stop,
+ .seq_show = cgroup_pidlist_show,
+ .private = CGROUP_FILE_PROCS,
+ .write = cgroup1_procs_write,
+ },
+ {
+ .name = "cgroup.clone_children",
+ .read_u64 = cgroup_clone_children_read,
+ .write_u64 = cgroup_clone_children_write,
+ },
+ {
+ .name = "cgroup.sane_behavior",
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ .seq_show = cgroup_sane_behavior_show,
+ },
+ {
+ .name = "tasks",
+ .seq_start = cgroup_pidlist_start,
+ .seq_next = cgroup_pidlist_next,
+ .seq_stop = cgroup_pidlist_stop,
+ .seq_show = cgroup_pidlist_show,
+ .private = CGROUP_FILE_TASKS,
+ .write = cgroup1_tasks_write,
+ },
+ {
+ .name = "notify_on_release",
+ .read_u64 = cgroup_read_notify_on_release,
+ .write_u64 = cgroup_write_notify_on_release,
+ },
+ {
+ .name = "release_agent",
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ .seq_show = cgroup_release_agent_show,
+ .write = cgroup_release_agent_write,
+ .max_write_len = PATH_MAX - 1,
+ },
+ { } /* terminate */
+};
+
+/* Display information about each subsystem and each hierarchy */
+int proc_cgroupstats_show(struct seq_file *m, void *v)
+{
+ struct cgroup_subsys *ss;
+ bool cgrp_v1_visible = false;
+ int i;
+
+ seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
+ /*
+ * Grab the subsystems state racily. No need to add avenue to
+ * cgroup_mutex contention.
+ */
+
+ for_each_subsys(ss, i) {
+ cgrp_v1_visible |= ss->root != &cgrp_dfl_root;
+
+ if (!proc_show_all && cgroup1_subsys_absent(ss))
+ continue;
+
+ seq_printf(m, "%s\t%d\t%d\t%d\n",
+ ss->legacy_name, ss->root->hierarchy_id,
+ atomic_read(&ss->root->nr_cgrps),
+ cgroup_ssid_enabled(i));
+ }
+
+ if (cgrp_dfl_visible && !cgrp_v1_visible)
+ pr_info_once("/proc/cgroups lists only v1 controllers, use cgroup.controllers of root cgroup for v2 info\n");
+
+
+ return 0;
+}
+
+/**
+ * cgroupstats_build - build and fill cgroupstats
+ * @stats: cgroupstats to fill information into
+ * @dentry: A dentry entry belonging to the cgroup for which stats have
+ * been requested.
+ *
+ * Build and fill cgroupstats so that taskstats can export it to user
+ * space.
+ *
+ * Return: %0 on success or a negative errno code on failure
+ */
+int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
+{
+ struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
+ struct cgroup *cgrp;
+ struct css_task_iter it;
+ struct task_struct *tsk;
+
+ /* it should be kernfs_node belonging to cgroupfs and is a directory */
+ if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
+ kernfs_type(kn) != KERNFS_DIR)
+ return -EINVAL;
+
+ /*
+ * We aren't being called from kernfs and there's no guarantee on
+ * @kn->priv's validity. For this and css_tryget_online_from_dir(),
+ * @kn->priv is RCU safe. Let's do the RCU dancing.
+ */
+ rcu_read_lock();
+ cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
+ if (!cgrp || !cgroup_tryget(cgrp)) {
+ rcu_read_unlock();
+ return -ENOENT;
+ }
+ rcu_read_unlock();
+
+ css_task_iter_start(&cgrp->self, 0, &it);
+ while ((tsk = css_task_iter_next(&it))) {
+ switch (READ_ONCE(tsk->__state)) {
+ case TASK_RUNNING:
+ stats->nr_running++;
+ break;
+ case TASK_INTERRUPTIBLE:
+ stats->nr_sleeping++;
+ break;
+ case TASK_UNINTERRUPTIBLE:
+ stats->nr_uninterruptible++;
+ break;
+ case TASK_STOPPED:
+ stats->nr_stopped++;
+ break;
+ default:
+ if (tsk->in_iowait)
+ stats->nr_io_wait++;
+ break;
+ }
+ }
+ css_task_iter_end(&it);
+
+ cgroup_put(cgrp);
+ return 0;
+}
+
+void cgroup1_check_for_release(struct cgroup *cgrp)
+{
+ if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) &&
+ !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
+ schedule_work(&cgrp->release_agent_work);
+}
+
+/*
+ * Notify userspace when a cgroup is released, by running the
+ * configured release agent with the name of the cgroup (path
+ * relative to the root of cgroup file system) as the argument.
+ *
+ * Most likely, this user command will try to rmdir this cgroup.
+ *
+ * This races with the possibility that some other task will be
+ * attached to this cgroup before it is removed, or that some other
+ * user task will 'mkdir' a child cgroup of this cgroup. That's ok.
+ * The presumed 'rmdir' will fail quietly if this cgroup is no longer
+ * unused, and this cgroup will be reprieved from its death sentence,
+ * to continue to serve a useful existence. Next time it's released,
+ * we will get notified again, if it still has 'notify_on_release' set.
+ *
+ * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
+ * means only wait until the task is successfully execve()'d. The
+ * separate release agent task is forked by call_usermodehelper(),
+ * then control in this thread returns here, without waiting for the
+ * release agent task. We don't bother to wait because the caller of
+ * this routine has no use for the exit status of the release agent
+ * task, so no sense holding our caller up for that.
+ */
+void cgroup1_release_agent(struct work_struct *work)
+{
+ struct cgroup *cgrp =
+ container_of(work, struct cgroup, release_agent_work);
+ char *pathbuf, *agentbuf;
+ char *argv[3], *envp[3];
+ int ret;
+
+ /* snoop agent path and exit early if empty */
+ if (!cgrp->root->release_agent_path[0])
+ return;
+
+ /* prepare argument buffers */
+ pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
+ agentbuf = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (!pathbuf || !agentbuf)
+ goto out_free;
+
+ spin_lock(&release_agent_path_lock);
+ strscpy(agentbuf, cgrp->root->release_agent_path, PATH_MAX);
+ spin_unlock(&release_agent_path_lock);
+ if (!agentbuf[0])
+ goto out_free;
+
+ ret = cgroup_path_ns(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
+ if (ret < 0)
+ goto out_free;
+
+ argv[0] = agentbuf;
+ argv[1] = pathbuf;
+ argv[2] = NULL;
+
+ /* minimal command environment */
+ envp[0] = "HOME=/";
+ envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
+ envp[2] = NULL;
+
+ call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
+out_free:
+ kfree(agentbuf);
+ kfree(pathbuf);
+}
+
+/*
+ * cgroup_rename - Only allow simple rename of directories in place.
+ */
+static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
+ const char *new_name_str)
+{
+ struct cgroup *cgrp = kn->priv;
+ int ret;
+
+ /* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */
+ if (strchr(new_name_str, '\n'))
+ return -EINVAL;
+
+ if (kernfs_type(kn) != KERNFS_DIR)
+ return -ENOTDIR;
+ if (rcu_access_pointer(kn->__parent) != new_parent)
+ return -EIO;
+
+ /*
+ * We're gonna grab cgroup_mutex which nests outside kernfs
+ * active_ref. kernfs_rename() doesn't require active_ref
+ * protection. Break them before grabbing cgroup_mutex.
+ */
+ kernfs_break_active_protection(new_parent);
+ kernfs_break_active_protection(kn);
+
+ cgroup_lock();
+
+ ret = kernfs_rename(kn, new_parent, new_name_str);
+ if (!ret)
+ TRACE_CGROUP_PATH(rename, cgrp);
+
+ cgroup_unlock();
+
+ kernfs_unbreak_active_protection(kn);
+ kernfs_unbreak_active_protection(new_parent);
+ return ret;
+}
+
+static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_root)
+{
+ struct cgroup_root *root = cgroup_root_from_kf(kf_root);
+ struct cgroup_subsys *ss;
+ int ssid;
+
+ for_each_subsys(ss, ssid)
+ if (root->subsys_mask & (1 << ssid))
+ seq_show_option(seq, ss->legacy_name, NULL);
+ if (root->flags & CGRP_ROOT_NOPREFIX)
+ seq_puts(seq, ",noprefix");
+ if (root->flags & CGRP_ROOT_XATTR)
+ seq_puts(seq, ",xattr");
+ if (root->flags & CGRP_ROOT_CPUSET_V2_MODE)
+ seq_puts(seq, ",cpuset_v2_mode");
+ if (root->flags & CGRP_ROOT_FAVOR_DYNMODS)
+ seq_puts(seq, ",favordynmods");
+
+ spin_lock(&release_agent_path_lock);
+ if (strlen(root->release_agent_path))
+ seq_show_option(seq, "release_agent",
+ root->release_agent_path);
+ spin_unlock(&release_agent_path_lock);
+
+ if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
+ seq_puts(seq, ",clone_children");
+ if (strlen(root->name))
+ seq_show_option(seq, "name", root->name);
+ return 0;
+}
+
+enum cgroup1_param {
+ Opt_all,
+ Opt_clone_children,
+ Opt_cpuset_v2_mode,
+ Opt_name,
+ Opt_none,
+ Opt_noprefix,
+ Opt_release_agent,
+ Opt_xattr,
+ Opt_favordynmods,
+ Opt_nofavordynmods,
+};
+
+const struct fs_parameter_spec cgroup1_fs_parameters[] = {
+ fsparam_flag ("all", Opt_all),
+ fsparam_flag ("clone_children", Opt_clone_children),
+ fsparam_flag ("cpuset_v2_mode", Opt_cpuset_v2_mode),
+ fsparam_string("name", Opt_name),
+ fsparam_flag ("none", Opt_none),
+ fsparam_flag ("noprefix", Opt_noprefix),
+ fsparam_string("release_agent", Opt_release_agent),
+ fsparam_flag ("xattr", Opt_xattr),
+ fsparam_flag ("favordynmods", Opt_favordynmods),
+ fsparam_flag ("nofavordynmods", Opt_nofavordynmods),
+ {}
+};
+
+int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
+ struct cgroup_subsys *ss;
+ struct fs_parse_result result;
+ int opt, i;
+
+ opt = fs_parse(fc, cgroup1_fs_parameters, param, &result);
+ if (opt == -ENOPARAM) {
+ int ret;
+
+ ret = vfs_parse_fs_param_source(fc, param);
+ if (ret != -ENOPARAM)
+ return ret;
+ for_each_subsys(ss, i) {
+ if (strcmp(param->key, ss->legacy_name) ||
+ cgroup1_subsys_absent(ss))
+ continue;
+ if (!cgroup_ssid_enabled(i) || cgroup1_ssid_disabled(i))
+ return invalfc(fc, "Disabled controller '%s'",
+ param->key);
+ ctx->subsys_mask |= (1 << i);
+ return 0;
+ }
+ return invalfc(fc, "Unknown subsys name '%s'", param->key);
+ }
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_none:
+ /* Explicitly have no subsystems */
+ ctx->none = true;
+ break;
+ case Opt_all:
+ ctx->all_ss = true;
+ break;
+ case Opt_noprefix:
+ ctx->flags |= CGRP_ROOT_NOPREFIX;
+ break;
+ case Opt_clone_children:
+ ctx->cpuset_clone_children = true;
+ break;
+ case Opt_cpuset_v2_mode:
+ ctx->flags |= CGRP_ROOT_CPUSET_V2_MODE;
+ break;
+ case Opt_xattr:
+ ctx->flags |= CGRP_ROOT_XATTR;
+ break;
+ case Opt_favordynmods:
+ ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS;
+ break;
+ case Opt_nofavordynmods:
+ ctx->flags &= ~CGRP_ROOT_FAVOR_DYNMODS;
+ break;
+ case Opt_release_agent:
+ /* Specifying two release agents is forbidden */
+ if (ctx->release_agent)
+ return invalfc(fc, "release_agent respecified");
+ /*
+ * Release agent gets called with all capabilities,
+ * require capabilities to set release agent.
+ */
+ if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN))
+ return invalfc(fc, "Setting release_agent not allowed");
+ ctx->release_agent = param->string;
+ param->string = NULL;
+ break;
+ case Opt_name:
+ /* blocked by boot param? */
+ if (cgroup_no_v1_named)
+ return -ENOENT;
+ /* Can't specify an empty name */
+ if (!param->size)
+ return invalfc(fc, "Empty name");
+ if (param->size > MAX_CGROUP_ROOT_NAMELEN - 1)
+ return invalfc(fc, "Name too long");
+ /* Must match [\w.-]+ */
+ for (i = 0; i < param->size; i++) {
+ char c = param->string[i];
+ if (isalnum(c))
+ continue;
+ if ((c == '.') || (c == '-') || (c == '_'))
+ continue;
+ return invalfc(fc, "Invalid name");
+ }
+ /* Specifying two names is forbidden */
+ if (ctx->name)
+ return invalfc(fc, "name respecified");
+ ctx->name = param->string;
+ param->string = NULL;
+ break;
+ }
+ return 0;
+}
+
+static int check_cgroupfs_options(struct fs_context *fc)
+{
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
+ u16 mask = U16_MAX;
+ u16 enabled = 0;
+ struct cgroup_subsys *ss;
+ int i;
+
+#ifdef CONFIG_CPUSETS
+ mask = ~((u16)1 << cpuset_cgrp_id);
+#endif
+ for_each_subsys(ss, i)
+ if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i) &&
+ !cgroup1_subsys_absent(ss))
+ enabled |= 1 << i;
+
+ ctx->subsys_mask &= enabled;
+
+ /*
+ * In absence of 'none', 'name=' and subsystem name options,
+ * let's default to 'all'.
+ */
+ if (!ctx->subsys_mask && !ctx->none && !ctx->name)
+ ctx->all_ss = true;
+
+ if (ctx->all_ss) {
+ /* Mutually exclusive option 'all' + subsystem name */
+ if (ctx->subsys_mask)
+ return invalfc(fc, "subsys name conflicts with all");
+ /* 'all' => select all the subsystems */
+ ctx->subsys_mask = enabled;
+ }
+
+ /*
+ * We either have to specify by name or by subsystems. (So all
+ * empty hierarchies must have a name).
+ */
+ if (!ctx->subsys_mask && !ctx->name)
+ return invalfc(fc, "Need name or subsystem set");
+
+ /*
+ * Option noprefix was introduced just for backward compatibility
+ * with the old cpuset, so we allow noprefix only if mounting just
+ * the cpuset subsystem.
+ */
+ if ((ctx->flags & CGRP_ROOT_NOPREFIX) && (ctx->subsys_mask & mask))
+ return invalfc(fc, "noprefix used incorrectly");
+
+ /* Can't specify "none" and some subsystems */
+ if (ctx->subsys_mask && ctx->none)
+ return invalfc(fc, "none used incorrectly");
+
+ return 0;
+}
+
+int cgroup1_reconfigure(struct fs_context *fc)
+{
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
+ struct kernfs_root *kf_root = kernfs_root_from_sb(fc->root->d_sb);
+ struct cgroup_root *root = cgroup_root_from_kf(kf_root);
+ int ret = 0;
+ u16 added_mask, removed_mask;
+
+ cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
+
+ /* See what subsystems are wanted */
+ ret = check_cgroupfs_options(fc);
+ if (ret)
+ goto out_unlock;
+
+ if (ctx->subsys_mask != root->subsys_mask || ctx->release_agent)
+ pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
+ task_tgid_nr(current), current->comm);
+
+ added_mask = ctx->subsys_mask & ~root->subsys_mask;
+ removed_mask = root->subsys_mask & ~ctx->subsys_mask;
+
+ /* Don't allow flags or name to change at remount */
+ if ((ctx->flags ^ root->flags) ||
+ (ctx->name && strcmp(ctx->name, root->name))) {
+ errorfc(fc, "option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"",
+ ctx->flags, ctx->name ?: "", root->flags, root->name);
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ /* remounting is not allowed for populated hierarchies */
+ if (!list_empty(&root->cgrp.self.children)) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+
+ ret = rebind_subsystems(root, added_mask);
+ if (ret)
+ goto out_unlock;
+
+ WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask));
+
+ if (ctx->release_agent) {
+ spin_lock(&release_agent_path_lock);
+ strscpy(root->release_agent_path, ctx->release_agent);
+ spin_unlock(&release_agent_path_lock);
+ }
+
+ trace_cgroup_remount(root);
+
+ out_unlock:
+ cgroup_unlock();
+ return ret;
+}
+
+struct kernfs_syscall_ops cgroup1_kf_syscall_ops = {
+ .rename = cgroup1_rename,
+ .show_options = cgroup1_show_options,
+ .mkdir = cgroup_mkdir,
+ .rmdir = cgroup_rmdir,
+ .show_path = cgroup_show_path,
+};
+
+/*
+ * The guts of cgroup1 mount - find or create cgroup_root to use.
+ * Called with cgroup_mutex held; returns 0 on success, -E... on
+ * error and positive - in case when the candidate is busy dying.
+ * On success it stashes a reference to cgroup_root into given
+ * cgroup_fs_context; that reference is *NOT* counting towards the
+ * cgroup_root refcount.
+ */
+static int cgroup1_root_to_use(struct fs_context *fc)
+{
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
+ struct cgroup_root *root;
+ struct cgroup_subsys *ss;
+ int i, ret;
+
+ /* First find the desired set of subsystems */
+ ret = check_cgroupfs_options(fc);
+ if (ret)
+ return ret;
+
+ /*
+ * Destruction of cgroup root is asynchronous, so subsystems may
+ * still be dying after the previous unmount. Let's drain the
+ * dying subsystems. We just need to ensure that the ones
+ * unmounted previously finish dying and don't care about new ones
+ * starting. Testing ref liveliness is good enough.
+ */
+ for_each_subsys(ss, i) {
+ if (!(ctx->subsys_mask & (1 << i)) ||
+ ss->root == &cgrp_dfl_root)
+ continue;
+
+ if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt))
+ return 1; /* restart */
+ cgroup_put(&ss->root->cgrp);
+ }
+
+ for_each_root(root) {
+ bool name_match = false;
+
+ if (root == &cgrp_dfl_root)
+ continue;
+
+ /*
+ * If we asked for a name then it must match. Also, if
+ * name matches but sybsys_mask doesn't, we should fail.
+ * Remember whether name matched.
+ */
+ if (ctx->name) {
+ if (strcmp(ctx->name, root->name))
+ continue;
+ name_match = true;
+ }
+
+ /*
+ * If we asked for subsystems (or explicitly for no
+ * subsystems) then they must match.
+ */
+ if ((ctx->subsys_mask || ctx->none) &&
+ (ctx->subsys_mask != root->subsys_mask)) {
+ if (!name_match)
+ continue;
+ return -EBUSY;
+ }
+
+ if (root->flags ^ ctx->flags)
+ pr_warn("new mount options do not match the existing superblock, will be ignored\n");
+
+ ctx->root = root;
+ return 0;
+ }
+
+ /*
+ * No such thing, create a new one. name= matching without subsys
+ * specification is allowed for already existing hierarchies but we
+ * can't create new one without subsys specification.
+ */
+ if (!ctx->subsys_mask && !ctx->none)
+ return invalfc(fc, "No subsys list or none specified");
+
+ /* Hierarchies may only be created in the initial cgroup namespace. */
+ if (ctx->ns != &init_cgroup_ns)
+ return -EPERM;
+
+ root = kzalloc(sizeof(*root), GFP_KERNEL);
+ if (!root)
+ return -ENOMEM;
+
+ ctx->root = root;
+ init_cgroup_root(ctx);
+
+ ret = cgroup_setup_root(root, ctx->subsys_mask);
+ if (!ret)
+ cgroup_favor_dynmods(root, ctx->flags & CGRP_ROOT_FAVOR_DYNMODS);
+ else
+ cgroup_free_root(root);
+
+ return ret;
+}
+
+int cgroup1_get_tree(struct fs_context *fc)
+{
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
+ int ret;
+
+ /* Check if the caller has permission to mount. */
+ if (!ns_capable(ctx->ns->user_ns, CAP_SYS_ADMIN))
+ return -EPERM;
+
+ cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
+
+ ret = cgroup1_root_to_use(fc);
+ if (!ret && !percpu_ref_tryget_live(&ctx->root->cgrp.self.refcnt))
+ ret = 1; /* restart */
+
+ cgroup_unlock();
+
+ if (!ret)
+ ret = cgroup_do_get_tree(fc);
+
+ if (!ret && percpu_ref_is_dying(&ctx->root->cgrp.self.refcnt)) {
+ fc_drop_locked(fc);
+ ret = 1;
+ }
+
+ if (unlikely(ret > 0)) {
+ msleep(10);
+ return restart_syscall();
+ }
+ return ret;
+}
+
+/**
+ * task_get_cgroup1 - Acquires the associated cgroup of a task within a
+ * specific cgroup1 hierarchy. The cgroup1 hierarchy is identified by its
+ * hierarchy ID.
+ * @tsk: The target task
+ * @hierarchy_id: The ID of a cgroup1 hierarchy
+ *
+ * On success, the cgroup is returned. On failure, ERR_PTR is returned.
+ * We limit it to cgroup1 only.
+ */
+struct cgroup *task_get_cgroup1(struct task_struct *tsk, int hierarchy_id)
+{
+ struct cgroup *cgrp = ERR_PTR(-ENOENT);
+ struct cgroup_root *root;
+ unsigned long flags;
+
+ rcu_read_lock();
+ for_each_root(root) {
+ /* cgroup1 only*/
+ if (root == &cgrp_dfl_root)
+ continue;
+ if (root->hierarchy_id != hierarchy_id)
+ continue;
+ spin_lock_irqsave(&css_set_lock, flags);
+ cgrp = task_cgroup_from_root(tsk, root);
+ if (!cgrp || !cgroup_tryget(cgrp))
+ cgrp = ERR_PTR(-ENOENT);
+ spin_unlock_irqrestore(&css_set_lock, flags);
+ break;
+ }
+ rcu_read_unlock();
+ return cgrp;
+}
+
+static int __init cgroup1_wq_init(void)
+{
+ /*
+ * Used to destroy pidlists and separate to serve as flush domain.
+ * Cap @max_active to 1 too.
+ */
+ cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
+ WQ_PERCPU, 1);
+ BUG_ON(!cgroup_pidlist_destroy_wq);
+ return 0;
+}
+core_initcall(cgroup1_wq_init);
+
+static int __init cgroup_no_v1(char *str)
+{
+ struct cgroup_subsys *ss;
+ char *token;
+ int i;
+
+ while ((token = strsep(&str, ",")) != NULL) {
+ if (!*token)
+ continue;
+
+ if (!strcmp(token, "all")) {
+ cgroup_no_v1_mask = U16_MAX;
+ continue;
+ }
+
+ if (!strcmp(token, "named")) {
+ cgroup_no_v1_named = true;
+ continue;
+ }
+
+ for_each_subsys(ss, i) {
+ if (strcmp(token, ss->name) &&
+ strcmp(token, ss->legacy_name))
+ continue;
+
+ cgroup_no_v1_mask |= 1 << i;
+ break;
+ }
+ }
+ return 1;
+}
+__setup("cgroup_no_v1=", cgroup_no_v1);
+
+static int __init cgroup_v1_proc(char *str)
+{
+ return (kstrtobool(str, &proc_show_all) == 0);
+}
+__setup("cgroup_v1_proc=", cgroup_v1_proc);
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
new file mode 100644
index 000000000000..e717208cfb18
--- /dev/null
+++ b/kernel/cgroup/cgroup.c
@@ -0,0 +1,7448 @@
+/*
+ * Generic process-grouping system.
+ *
+ * Based originally on the cpuset system, extracted by Paul Menage
+ * Copyright (C) 2006 Google, Inc
+ *
+ * Notifications support
+ * Copyright (C) 2009 Nokia Corporation
+ * Author: Kirill A. Shutemov
+ *
+ * Copyright notices from the original cpuset code:
+ * --------------------------------------------------
+ * Copyright (C) 2003 BULL SA.
+ * Copyright (C) 2004-2006 Silicon Graphics, Inc.
+ *
+ * Portions derived from Patrick Mochel's sysfs code.
+ * sysfs is Copyright (c) 2001-3 Patrick Mochel
+ *
+ * 2003-10-10 Written by Simon Derr.
+ * 2003-10-22 Updates by Stephen Hemminger.
+ * 2004 May-July Rework by Paul Jackson.
+ * ---------------------------------------------------
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include "cgroup-internal.h"
+
+#include <linux/bpf-cgroup.h>
+#include <linux/cred.h>
+#include <linux/errno.h>
+#include <linux/init_task.h>
+#include <linux/kernel.h>
+#include <linux/magic.h>
+#include <linux/mutex.h>
+#include <linux/mount.h>
+#include <linux/pagemap.h>
+#include <linux/proc_fs.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+#include <linux/sched/task.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/percpu-rwsem.h>
+#include <linux/string.h>
+#include <linux/hashtable.h>
+#include <linux/idr.h>
+#include <linux/kthread.h>
+#include <linux/atomic.h>
+#include <linux/cpuset.h>
+#include <linux/proc_ns.h>
+#include <linux/nsproxy.h>
+#include <linux/file.h>
+#include <linux/fs_parser.h>
+#include <linux/sched/cputime.h>
+#include <linux/sched/deadline.h>
+#include <linux/psi.h>
+#include <linux/nstree.h>
+#include <linux/irq_work.h>
+#include <net/sock.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/cgroup.h>
+
+#define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \
+ MAX_CFTYPE_NAME + 2)
+/* let's not notify more than 100 times per second */
+#define CGROUP_FILE_NOTIFY_MIN_INTV DIV_ROUND_UP(HZ, 100)
+
+/*
+ * To avoid confusing the compiler (and generating warnings) with code
+ * that attempts to access what would be a 0-element array (i.e. sized
+ * to a potentially empty array when CGROUP_SUBSYS_COUNT == 0), this
+ * constant expression can be added.
+ */
+#define CGROUP_HAS_SUBSYS_CONFIG (CGROUP_SUBSYS_COUNT > 0)
+
+/*
+ * cgroup_mutex is the master lock. Any modification to cgroup or its
+ * hierarchy must be performed while holding it.
+ *
+ * css_set_lock protects task->cgroups pointer, the list of css_set
+ * objects, and the chain of tasks off each css_set.
+ *
+ * These locks are exported if CONFIG_PROVE_RCU so that accessors in
+ * cgroup.h can use them for lockdep annotations.
+ */
+DEFINE_MUTEX(cgroup_mutex);
+DEFINE_SPINLOCK(css_set_lock);
+
+#if (defined CONFIG_PROVE_RCU || defined CONFIG_LOCKDEP)
+EXPORT_SYMBOL_GPL(cgroup_mutex);
+EXPORT_SYMBOL_GPL(css_set_lock);
+#endif
+
+struct blocking_notifier_head cgroup_lifetime_notifier =
+ BLOCKING_NOTIFIER_INIT(cgroup_lifetime_notifier);
+
+DEFINE_SPINLOCK(trace_cgroup_path_lock);
+char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
+static bool cgroup_debug __read_mostly;
+
+/*
+ * Protects cgroup_idr and css_idr so that IDs can be released without
+ * grabbing cgroup_mutex.
+ */
+static DEFINE_SPINLOCK(cgroup_idr_lock);
+
+/*
+ * Protects cgroup_file->kn for !self csses. It synchronizes notifications
+ * against file removal/re-creation across css hiding.
+ */
+static DEFINE_SPINLOCK(cgroup_file_kn_lock);
+
+DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem);
+
+#define cgroup_assert_mutex_or_rcu_locked() \
+ RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
+ !lockdep_is_held(&cgroup_mutex), \
+ "cgroup_mutex or RCU read lock required");
+
+/*
+ * cgroup destruction makes heavy use of work items and there can be a lot
+ * of concurrent destructions. Use a separate workqueue so that cgroup
+ * destruction work items don't end up filling up max_active of system_percpu_wq
+ * which may lead to deadlock.
+ *
+ * A cgroup destruction should enqueue work sequentially to:
+ * cgroup_offline_wq: use for css offline work
+ * cgroup_release_wq: use for css release work
+ * cgroup_free_wq: use for free work
+ *
+ * Rationale for using separate workqueues:
+ * The cgroup root free work may depend on completion of other css offline
+ * operations. If all tasks were enqueued to a single workqueue, this could
+ * create a deadlock scenario where:
+ * - Free work waits for other css offline work to complete.
+ * - But other css offline work is queued after free work in the same queue.
+ *
+ * Example deadlock scenario with single workqueue (cgroup_destroy_wq):
+ * 1. umount net_prio
+ * 2. net_prio root destruction enqueues work to cgroup_destroy_wq (CPUx)
+ * 3. perf_event CSS A offline enqueues work to same cgroup_destroy_wq (CPUx)
+ * 4. net_prio cgroup_destroy_root->cgroup_lock_and_drain_offline.
+ * 5. net_prio root destruction blocks waiting for perf_event CSS A offline,
+ * which can never complete as it's behind in the same queue and
+ * workqueue's max_active is 1.
+ */
+static struct workqueue_struct *cgroup_offline_wq;
+static struct workqueue_struct *cgroup_release_wq;
+static struct workqueue_struct *cgroup_free_wq;
+
+/* generate an array of cgroup subsystem pointers */
+#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
+struct cgroup_subsys *cgroup_subsys[] = {
+#include <linux/cgroup_subsys.h>
+};
+#undef SUBSYS
+
+/* array of cgroup subsystem names */
+#define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
+static const char *cgroup_subsys_name[] = {
+#include <linux/cgroup_subsys.h>
+};
+#undef SUBSYS
+
+/* array of static_keys for cgroup_subsys_enabled() and cgroup_subsys_on_dfl() */
+#define SUBSYS(_x) \
+ DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key); \
+ DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key); \
+ EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key); \
+ EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key);
+#include <linux/cgroup_subsys.h>
+#undef SUBSYS
+
+#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key,
+static struct static_key_true *cgroup_subsys_enabled_key[] = {
+#include <linux/cgroup_subsys.h>
+};
+#undef SUBSYS
+
+#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key,
+static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
+#include <linux/cgroup_subsys.h>
+};
+#undef SUBSYS
+
+static DEFINE_PER_CPU(struct css_rstat_cpu, root_rstat_cpu);
+static DEFINE_PER_CPU(struct cgroup_rstat_base_cpu, root_rstat_base_cpu);
+
+/* the default hierarchy */
+struct cgroup_root cgrp_dfl_root = {
+ .cgrp.self.rstat_cpu = &root_rstat_cpu,
+ .cgrp.rstat_base_cpu = &root_rstat_base_cpu,
+};
+EXPORT_SYMBOL_GPL(cgrp_dfl_root);
+
+/*
+ * The default hierarchy always exists but is hidden until mounted for the
+ * first time. This is for backward compatibility.
+ */
+bool cgrp_dfl_visible;
+
+/* some controllers are not supported in the default hierarchy */
+static u16 cgrp_dfl_inhibit_ss_mask;
+
+/* some controllers are implicitly enabled on the default hierarchy */
+static u16 cgrp_dfl_implicit_ss_mask;
+
+/* some controllers can be threaded on the default hierarchy */
+static u16 cgrp_dfl_threaded_ss_mask;
+
+/* The list of hierarchy roots */
+LIST_HEAD(cgroup_roots);
+static int cgroup_root_count;
+
+/* hierarchy ID allocation and mapping, protected by cgroup_mutex */
+static DEFINE_IDR(cgroup_hierarchy_idr);
+
+/*
+ * Assign a monotonically increasing serial number to csses. It guarantees
+ * cgroups with bigger numbers are newer than those with smaller numbers.
+ * Also, as csses are always appended to the parent's ->children list, it
+ * guarantees that sibling csses are always sorted in the ascending serial
+ * number order on the list. Protected by cgroup_mutex.
+ */
+static u64 css_serial_nr_next = 1;
+
+/*
+ * These bitmasks identify subsystems with specific features to avoid
+ * having to do iterative checks repeatedly.
+ */
+static u16 have_fork_callback __read_mostly;
+static u16 have_exit_callback __read_mostly;
+static u16 have_release_callback __read_mostly;
+static u16 have_canfork_callback __read_mostly;
+
+static bool have_favordynmods __ro_after_init = IS_ENABLED(CONFIG_CGROUP_FAVOR_DYNMODS);
+
+/*
+ * Write protected by cgroup_mutex and write-lock of cgroup_threadgroup_rwsem,
+ * read protected by either.
+ *
+ * Can only be turned on, but not turned off.
+ */
+bool cgroup_enable_per_threadgroup_rwsem __read_mostly;
+
+/* cgroup namespace for init task */
+struct cgroup_namespace init_cgroup_ns = {
+ .ns = NS_COMMON_INIT(init_cgroup_ns),
+ .user_ns = &init_user_ns,
+ .root_cset = &init_css_set,
+};
+
+static struct file_system_type cgroup2_fs_type;
+static struct cftype cgroup_base_files[];
+static struct cftype cgroup_psi_files[];
+
+/* cgroup optional features */
+enum cgroup_opt_features {
+#ifdef CONFIG_PSI
+ OPT_FEATURE_PRESSURE,
+#endif
+ OPT_FEATURE_COUNT
+};
+
+static const char *cgroup_opt_feature_names[OPT_FEATURE_COUNT] = {
+#ifdef CONFIG_PSI
+ "pressure",
+#endif
+};
+
+static u16 cgroup_feature_disable_mask __read_mostly;
+
+static int cgroup_apply_control(struct cgroup *cgrp);
+static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
+static void css_task_iter_skip(struct css_task_iter *it,
+ struct task_struct *task);
+static int cgroup_destroy_locked(struct cgroup *cgrp);
+static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
+ struct cgroup_subsys *ss);
+static void css_release(struct percpu_ref *ref);
+static void kill_css(struct cgroup_subsys_state *css);
+static int cgroup_addrm_files(struct cgroup_subsys_state *css,
+ struct cgroup *cgrp, struct cftype cfts[],
+ bool is_add);
+static void cgroup_rt_init(void);
+
+#ifdef CONFIG_DEBUG_CGROUP_REF
+#define CGROUP_REF_FN_ATTRS noinline
+#define CGROUP_REF_EXPORT(fn) EXPORT_SYMBOL_GPL(fn);
+#include <linux/cgroup_refcnt.h>
+#endif
+
+/**
+ * cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID
+ * @ssid: subsys ID of interest
+ *
+ * cgroup_subsys_enabled() can only be used with literal subsys names which
+ * is fine for individual subsystems but unsuitable for cgroup core. This
+ * is slower static_key_enabled() based test indexed by @ssid.
+ */
+bool cgroup_ssid_enabled(int ssid)
+{
+ if (!CGROUP_HAS_SUBSYS_CONFIG)
+ return false;
+
+ return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
+}
+
+/**
+ * cgroup_on_dfl - test whether a cgroup is on the default hierarchy
+ * @cgrp: the cgroup of interest
+ *
+ * The default hierarchy is the v2 interface of cgroup and this function
+ * can be used to test whether a cgroup is on the default hierarchy for
+ * cases where a subsystem should behave differently depending on the
+ * interface version.
+ *
+ * List of changed behaviors:
+ *
+ * - Mount options "noprefix", "xattr", "clone_children", "release_agent"
+ * and "name" are disallowed.
+ *
+ * - When mounting an existing superblock, mount options should match.
+ *
+ * - rename(2) is disallowed.
+ *
+ * - "tasks" is removed. Everything should be at process granularity. Use
+ * "cgroup.procs" instead.
+ *
+ * - "cgroup.procs" is not sorted. pids will be unique unless they got
+ * recycled in-between reads.
+ *
+ * - "release_agent" and "notify_on_release" are removed. Replacement
+ * notification mechanism will be implemented.
+ *
+ * - "cgroup.clone_children" is removed.
+ *
+ * - "cgroup.subtree_populated" is available. Its value is 0 if the cgroup
+ * and its descendants contain no task; otherwise, 1. The file also
+ * generates kernfs notification which can be monitored through poll and
+ * [di]notify when the value of the file changes.
+ *
+ * - cpuset: tasks will be kept in empty cpusets when hotplug happens and
+ * take masks of ancestors with non-empty cpus/mems, instead of being
+ * moved to an ancestor.
+ *
+ * - cpuset: a task can be moved into an empty cpuset, and again it takes
+ * masks of ancestors.
+ *
+ * - blkcg: blk-throttle becomes properly hierarchical.
+ */
+bool cgroup_on_dfl(const struct cgroup *cgrp)
+{
+ return cgrp->root == &cgrp_dfl_root;
+}
+
+/* IDR wrappers which synchronize using cgroup_idr_lock */
+static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
+ gfp_t gfp_mask)
+{
+ int ret;
+
+ idr_preload(gfp_mask);
+ spin_lock_bh(&cgroup_idr_lock);
+ ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM);
+ spin_unlock_bh(&cgroup_idr_lock);
+ idr_preload_end();
+ return ret;
+}
+
+static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id)
+{
+ void *ret;
+
+ spin_lock_bh(&cgroup_idr_lock);
+ ret = idr_replace(idr, ptr, id);
+ spin_unlock_bh(&cgroup_idr_lock);
+ return ret;
+}
+
+static void cgroup_idr_remove(struct idr *idr, int id)
+{
+ spin_lock_bh(&cgroup_idr_lock);
+ idr_remove(idr, id);
+ spin_unlock_bh(&cgroup_idr_lock);
+}
+
+static bool cgroup_has_tasks(struct cgroup *cgrp)
+{
+ return cgrp->nr_populated_csets;
+}
+
+static bool cgroup_is_threaded(struct cgroup *cgrp)
+{
+ return cgrp->dom_cgrp != cgrp;
+}
+
+/* can @cgrp host both domain and threaded children? */
+static bool cgroup_is_mixable(struct cgroup *cgrp)
+{
+ /*
+ * Root isn't under domain level resource control exempting it from
+ * the no-internal-process constraint, so it can serve as a thread
+ * root and a parent of resource domains at the same time.
+ */
+ return !cgroup_parent(cgrp);
+}
+
+/* can @cgrp become a thread root? Should always be true for a thread root */
+static bool cgroup_can_be_thread_root(struct cgroup *cgrp)
+{
+ /* mixables don't care */
+ if (cgroup_is_mixable(cgrp))
+ return true;
+
+ /* domain roots can't be nested under threaded */
+ if (cgroup_is_threaded(cgrp))
+ return false;
+
+ /* can only have either domain or threaded children */
+ if (cgrp->nr_populated_domain_children)
+ return false;
+
+ /* and no domain controllers can be enabled */
+ if (cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask)
+ return false;
+
+ return true;
+}
+
+/* is @cgrp root of a threaded subtree? */
+static bool cgroup_is_thread_root(struct cgroup *cgrp)
+{
+ /* thread root should be a domain */
+ if (cgroup_is_threaded(cgrp))
+ return false;
+
+ /* a domain w/ threaded children is a thread root */
+ if (cgrp->nr_threaded_children)
+ return true;
+
+ /*
+ * A domain which has tasks and explicit threaded controllers
+ * enabled is a thread root.
+ */
+ if (cgroup_has_tasks(cgrp) &&
+ (cgrp->subtree_control & cgrp_dfl_threaded_ss_mask))
+ return true;
+
+ return false;
+}
+
+/* a domain which isn't connected to the root w/o brekage can't be used */
+static bool cgroup_is_valid_domain(struct cgroup *cgrp)
+{
+ /* the cgroup itself can be a thread root */
+ if (cgroup_is_threaded(cgrp))
+ return false;
+
+ /* but the ancestors can't be unless mixable */
+ while ((cgrp = cgroup_parent(cgrp))) {
+ if (!cgroup_is_mixable(cgrp) && cgroup_is_thread_root(cgrp))
+ return false;
+ if (cgroup_is_threaded(cgrp))
+ return false;
+ }
+
+ return true;
+}
+
+/* subsystems visibly enabled on a cgroup */
+static u16 cgroup_control(struct cgroup *cgrp)
+{
+ struct cgroup *parent = cgroup_parent(cgrp);
+ u16 root_ss_mask = cgrp->root->subsys_mask;
+
+ if (parent) {
+ u16 ss_mask = parent->subtree_control;
+
+ /* threaded cgroups can only have threaded controllers */
+ if (cgroup_is_threaded(cgrp))
+ ss_mask &= cgrp_dfl_threaded_ss_mask;
+ return ss_mask;
+ }
+
+ if (cgroup_on_dfl(cgrp))
+ root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask |
+ cgrp_dfl_implicit_ss_mask);
+ return root_ss_mask;
+}
+
+/* subsystems enabled on a cgroup */
+static u16 cgroup_ss_mask(struct cgroup *cgrp)
+{
+ struct cgroup *parent = cgroup_parent(cgrp);
+
+ if (parent) {
+ u16 ss_mask = parent->subtree_ss_mask;
+
+ /* threaded cgroups can only have threaded controllers */
+ if (cgroup_is_threaded(cgrp))
+ ss_mask &= cgrp_dfl_threaded_ss_mask;
+ return ss_mask;
+ }
+
+ return cgrp->root->subsys_mask;
+}
+
+/**
+ * cgroup_css - obtain a cgroup's css for the specified subsystem
+ * @cgrp: the cgroup of interest
+ * @ss: the subsystem of interest (%NULL returns @cgrp->self)
+ *
+ * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This
+ * function must be called either under cgroup_mutex or rcu_read_lock() and
+ * the caller is responsible for pinning the returned css if it wants to
+ * keep accessing it outside the said locks. This function may return
+ * %NULL if @cgrp doesn't have @subsys_id enabled.
+ */
+static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
+ struct cgroup_subsys *ss)
+{
+ if (CGROUP_HAS_SUBSYS_CONFIG && ss)
+ return rcu_dereference_check(cgrp->subsys[ss->id],
+ lockdep_is_held(&cgroup_mutex));
+ else
+ return &cgrp->self;
+}
+
+/**
+ * cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss
+ * @cgrp: the cgroup of interest
+ * @ss: the subsystem of interest (%NULL returns @cgrp->self)
+ *
+ * Similar to cgroup_css() but returns the effective css, which is defined
+ * as the matching css of the nearest ancestor including self which has @ss
+ * enabled. If @ss is associated with the hierarchy @cgrp is on, this
+ * function is guaranteed to return non-NULL css.
+ */
+static struct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp,
+ struct cgroup_subsys *ss)
+{
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (!ss)
+ return &cgrp->self;
+
+ /*
+ * This function is used while updating css associations and thus
+ * can't test the csses directly. Test ss_mask.
+ */
+ while (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) {
+ cgrp = cgroup_parent(cgrp);
+ if (!cgrp)
+ return NULL;
+ }
+
+ return cgroup_css(cgrp, ss);
+}
+
+/**
+ * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
+ * @cgrp: the cgroup of interest
+ * @ss: the subsystem of interest
+ *
+ * Find and get the effective css of @cgrp for @ss. The effective css is
+ * defined as the matching css of the nearest ancestor including self which
+ * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on,
+ * the root css is returned, so this function always returns a valid css.
+ *
+ * The returned css is not guaranteed to be online, and therefore it is the
+ * callers responsibility to try get a reference for it.
+ */
+struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
+ struct cgroup_subsys *ss)
+{
+ struct cgroup_subsys_state *css;
+
+ if (!CGROUP_HAS_SUBSYS_CONFIG)
+ return NULL;
+
+ do {
+ css = cgroup_css(cgrp, ss);
+
+ if (css)
+ return css;
+ cgrp = cgroup_parent(cgrp);
+ } while (cgrp);
+
+ return init_css_set.subsys[ss->id];
+}
+
+/**
+ * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem
+ * @cgrp: the cgroup of interest
+ * @ss: the subsystem of interest
+ *
+ * Find and get the effective css of @cgrp for @ss. The effective css is
+ * defined as the matching css of the nearest ancestor including self which
+ * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on,
+ * the root css is returned, so this function always returns a valid css.
+ * The returned css must be put using css_put().
+ */
+struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
+ struct cgroup_subsys *ss)
+{
+ struct cgroup_subsys_state *css;
+
+ if (!CGROUP_HAS_SUBSYS_CONFIG)
+ return NULL;
+
+ rcu_read_lock();
+
+ do {
+ css = cgroup_css(cgrp, ss);
+
+ if (css && css_tryget_online(css))
+ goto out_unlock;
+ cgrp = cgroup_parent(cgrp);
+ } while (cgrp);
+
+ css = init_css_set.subsys[ss->id];
+ css_get(css);
+out_unlock:
+ rcu_read_unlock();
+ return css;
+}
+EXPORT_SYMBOL_GPL(cgroup_get_e_css);
+
+static void cgroup_get_live(struct cgroup *cgrp)
+{
+ WARN_ON_ONCE(cgroup_is_dead(cgrp));
+ cgroup_get(cgrp);
+}
+
+/**
+ * __cgroup_task_count - count the number of tasks in a cgroup. The caller
+ * is responsible for taking the css_set_lock.
+ * @cgrp: the cgroup in question
+ */
+int __cgroup_task_count(const struct cgroup *cgrp)
+{
+ int count = 0;
+ struct cgrp_cset_link *link;
+
+ lockdep_assert_held(&css_set_lock);
+
+ list_for_each_entry(link, &cgrp->cset_links, cset_link)
+ count += link->cset->nr_tasks;
+
+ return count;
+}
+
+/**
+ * cgroup_task_count - count the number of tasks in a cgroup.
+ * @cgrp: the cgroup in question
+ */
+int cgroup_task_count(const struct cgroup *cgrp)
+{
+ int count;
+
+ spin_lock_irq(&css_set_lock);
+ count = __cgroup_task_count(cgrp);
+ spin_unlock_irq(&css_set_lock);
+
+ return count;
+}
+
+static struct cgroup *kn_priv(struct kernfs_node *kn)
+{
+ struct kernfs_node *parent;
+ /*
+ * The parent can not be replaced due to KERNFS_ROOT_INVARIANT_PARENT.
+ * Therefore it is always safe to dereference this pointer outside of a
+ * RCU section.
+ */
+ parent = rcu_dereference_check(kn->__parent,
+ kernfs_root_flags(kn) & KERNFS_ROOT_INVARIANT_PARENT);
+ return parent->priv;
+}
+
+struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
+{
+ struct cgroup *cgrp = kn_priv(of->kn);
+ struct cftype *cft = of_cft(of);
+
+ /*
+ * This is open and unprotected implementation of cgroup_css().
+ * seq_css() is only called from a kernfs file operation which has
+ * an active reference on the file. Because all the subsystem
+ * files are drained before a css is disassociated with a cgroup,
+ * the matching css from the cgroup's subsys table is guaranteed to
+ * be and stay valid until the enclosing operation is complete.
+ */
+ if (CGROUP_HAS_SUBSYS_CONFIG && cft->ss)
+ return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
+ else
+ return &cgrp->self;
+}
+EXPORT_SYMBOL_GPL(of_css);
+
+/**
+ * for_each_css - iterate all css's of a cgroup
+ * @css: the iteration cursor
+ * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
+ * @cgrp: the target cgroup to iterate css's of
+ *
+ * Should be called under cgroup_mutex.
+ */
+#define for_each_css(css, ssid, cgrp) \
+ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
+ if (!((css) = rcu_dereference_check( \
+ (cgrp)->subsys[(ssid)], \
+ lockdep_is_held(&cgroup_mutex)))) { } \
+ else
+
+/**
+ * do_each_subsys_mask - filter for_each_subsys with a bitmask
+ * @ss: the iteration cursor
+ * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
+ * @ss_mask: the bitmask
+ *
+ * The block will only run for cases where the ssid-th bit (1 << ssid) of
+ * @ss_mask is set.
+ */
+#define do_each_subsys_mask(ss, ssid, ss_mask) do { \
+ unsigned long __ss_mask = (ss_mask); \
+ if (!CGROUP_HAS_SUBSYS_CONFIG) { \
+ (ssid) = 0; \
+ break; \
+ } \
+ for_each_set_bit(ssid, &__ss_mask, CGROUP_SUBSYS_COUNT) { \
+ (ss) = cgroup_subsys[ssid]; \
+ {
+
+#define while_each_subsys_mask() \
+ } \
+ } \
+} while (false)
+
+/* iterate over child cgrps, lock should be held throughout iteration */
+#define cgroup_for_each_live_child(child, cgrp) \
+ list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
+ if (({ lockdep_assert_held(&cgroup_mutex); \
+ cgroup_is_dead(child); })) \
+ ; \
+ else
+
+/* walk live descendants in pre order */
+#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) \
+ css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL)) \
+ if (({ lockdep_assert_held(&cgroup_mutex); \
+ (dsct) = (d_css)->cgroup; \
+ cgroup_is_dead(dsct); })) \
+ ; \
+ else
+
+/* walk live descendants in postorder */
+#define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) \
+ css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL)) \
+ if (({ lockdep_assert_held(&cgroup_mutex); \
+ (dsct) = (d_css)->cgroup; \
+ cgroup_is_dead(dsct); })) \
+ ; \
+ else
+
+/*
+ * The default css_set - used by init and its children prior to any
+ * hierarchies being mounted. It contains a pointer to the root state
+ * for each subsystem. Also used to anchor the list of css_sets. Not
+ * reference-counted, to improve performance when child cgroups
+ * haven't been created.
+ */
+struct css_set init_css_set = {
+ .refcount = REFCOUNT_INIT(1),
+ .dom_cset = &init_css_set,
+ .tasks = LIST_HEAD_INIT(init_css_set.tasks),
+ .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
+ .dying_tasks = LIST_HEAD_INIT(init_css_set.dying_tasks),
+ .task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
+ .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets),
+ .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
+ .mg_src_preload_node = LIST_HEAD_INIT(init_css_set.mg_src_preload_node),
+ .mg_dst_preload_node = LIST_HEAD_INIT(init_css_set.mg_dst_preload_node),
+ .mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
+
+ /*
+ * The following field is re-initialized when this cset gets linked
+ * in cgroup_init(). However, let's initialize the field
+ * statically too so that the default cgroup can be accessed safely
+ * early during boot.
+ */
+ .dfl_cgrp = &cgrp_dfl_root.cgrp,
+};
+
+static int css_set_count = 1; /* 1 for init_css_set */
+
+static bool css_set_threaded(struct css_set *cset)
+{
+ return cset->dom_cset != cset;
+}
+
+/**
+ * css_set_populated - does a css_set contain any tasks?
+ * @cset: target css_set
+ *
+ * css_set_populated() should be the same as !!cset->nr_tasks at steady
+ * state. However, css_set_populated() can be called while a task is being
+ * added to or removed from the linked list before the nr_tasks is
+ * properly updated. Hence, we can't just look at ->nr_tasks here.
+ */
+static bool css_set_populated(struct css_set *cset)
+{
+ lockdep_assert_held(&css_set_lock);
+
+ return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks);
+}
+
+/**
+ * cgroup_update_populated - update the populated count of a cgroup
+ * @cgrp: the target cgroup
+ * @populated: inc or dec populated count
+ *
+ * One of the css_sets associated with @cgrp is either getting its first
+ * task or losing the last. Update @cgrp->nr_populated_* accordingly. The
+ * count is propagated towards root so that a given cgroup's
+ * nr_populated_children is zero iff none of its descendants contain any
+ * tasks.
+ *
+ * @cgrp's interface file "cgroup.populated" is zero if both
+ * @cgrp->nr_populated_csets and @cgrp->nr_populated_children are zero and
+ * 1 otherwise. When the sum changes from or to zero, userland is notified
+ * that the content of the interface file has changed. This can be used to
+ * detect when @cgrp and its descendants become populated or empty.
+ */
+static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
+{
+ struct cgroup *child = NULL;
+ int adj = populated ? 1 : -1;
+
+ lockdep_assert_held(&css_set_lock);
+
+ do {
+ bool was_populated = cgroup_is_populated(cgrp);
+
+ if (!child) {
+ cgrp->nr_populated_csets += adj;
+ } else {
+ if (cgroup_is_threaded(child))
+ cgrp->nr_populated_threaded_children += adj;
+ else
+ cgrp->nr_populated_domain_children += adj;
+ }
+
+ if (was_populated == cgroup_is_populated(cgrp))
+ break;
+
+ cgroup1_check_for_release(cgrp);
+ TRACE_CGROUP_PATH(notify_populated, cgrp,
+ cgroup_is_populated(cgrp));
+ cgroup_file_notify(&cgrp->events_file);
+
+ child = cgrp;
+ cgrp = cgroup_parent(cgrp);
+ } while (cgrp);
+}
+
+/**
+ * css_set_update_populated - update populated state of a css_set
+ * @cset: target css_set
+ * @populated: whether @cset is populated or depopulated
+ *
+ * @cset is either getting the first task or losing the last. Update the
+ * populated counters of all associated cgroups accordingly.
+ */
+static void css_set_update_populated(struct css_set *cset, bool populated)
+{
+ struct cgrp_cset_link *link;
+
+ lockdep_assert_held(&css_set_lock);
+
+ list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
+ cgroup_update_populated(link->cgrp, populated);
+}
+
+/*
+ * @task is leaving, advance task iterators which are pointing to it so
+ * that they can resume at the next position. Advancing an iterator might
+ * remove it from the list, use safe walk. See css_task_iter_skip() for
+ * details.
+ */
+static void css_set_skip_task_iters(struct css_set *cset,
+ struct task_struct *task)
+{
+ struct css_task_iter *it, *pos;
+
+ list_for_each_entry_safe(it, pos, &cset->task_iters, iters_node)
+ css_task_iter_skip(it, task);
+}
+
+/**
+ * css_set_move_task - move a task from one css_set to another
+ * @task: task being moved
+ * @from_cset: css_set @task currently belongs to (may be NULL)
+ * @to_cset: new css_set @task is being moved to (may be NULL)
+ * @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks
+ *
+ * Move @task from @from_cset to @to_cset. If @task didn't belong to any
+ * css_set, @from_cset can be NULL. If @task is being disassociated
+ * instead of moved, @to_cset can be NULL.
+ *
+ * This function automatically handles populated counter updates and
+ * css_task_iter adjustments but the caller is responsible for managing
+ * @from_cset and @to_cset's reference counts.
+ */
+static void css_set_move_task(struct task_struct *task,
+ struct css_set *from_cset, struct css_set *to_cset,
+ bool use_mg_tasks)
+{
+ lockdep_assert_held(&css_set_lock);
+
+ if (to_cset && !css_set_populated(to_cset))
+ css_set_update_populated(to_cset, true);
+
+ if (from_cset) {
+ WARN_ON_ONCE(list_empty(&task->cg_list));
+
+ css_set_skip_task_iters(from_cset, task);
+ list_del_init(&task->cg_list);
+ if (!css_set_populated(from_cset))
+ css_set_update_populated(from_cset, false);
+ } else {
+ WARN_ON_ONCE(!list_empty(&task->cg_list));
+ }
+
+ if (to_cset) {
+ /*
+ * We are synchronized through cgroup_threadgroup_rwsem
+ * against PF_EXITING setting such that we can't race
+ * against cgroup_task_dead()/cgroup_task_free() dropping
+ * the css_set.
+ */
+ WARN_ON_ONCE(task->flags & PF_EXITING);
+
+ cgroup_move_task(task, to_cset);
+ list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
+ &to_cset->tasks);
+ }
+}
+
+/*
+ * hash table for cgroup groups. This improves the performance to find
+ * an existing css_set. This hash doesn't (currently) take into
+ * account cgroups in empty hierarchies.
+ */
+#define CSS_SET_HASH_BITS 7
+static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
+
+static unsigned long css_set_hash(struct cgroup_subsys_state **css)
+{
+ unsigned long key = 0UL;
+ struct cgroup_subsys *ss;
+ int i;
+
+ for_each_subsys(ss, i)
+ key += (unsigned long)css[i];
+ key = (key >> 16) ^ key;
+
+ return key;
+}
+
+void put_css_set_locked(struct css_set *cset)
+{
+ struct cgrp_cset_link *link, *tmp_link;
+ struct cgroup_subsys *ss;
+ int ssid;
+
+ lockdep_assert_held(&css_set_lock);
+
+ if (!refcount_dec_and_test(&cset->refcount))
+ return;
+
+ WARN_ON_ONCE(!list_empty(&cset->threaded_csets));
+
+ /* This css_set is dead. Unlink it and release cgroup and css refs */
+ for_each_subsys(ss, ssid) {
+ list_del(&cset->e_cset_node[ssid]);
+ css_put(cset->subsys[ssid]);
+ }
+ hash_del(&cset->hlist);
+ css_set_count--;
+
+ list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
+ list_del(&link->cset_link);
+ list_del(&link->cgrp_link);
+ if (cgroup_parent(link->cgrp))
+ cgroup_put(link->cgrp);
+ kfree(link);
+ }
+
+ if (css_set_threaded(cset)) {
+ list_del(&cset->threaded_csets_node);
+ put_css_set_locked(cset->dom_cset);
+ }
+
+ kfree_rcu(cset, rcu_head);
+}
+
+/**
+ * compare_css_sets - helper function for find_existing_css_set().
+ * @cset: candidate css_set being tested
+ * @old_cset: existing css_set for a task
+ * @new_cgrp: cgroup that's being entered by the task
+ * @template: desired set of css pointers in css_set (pre-calculated)
+ *
+ * Returns true if "cset" matches "old_cset" except for the hierarchy
+ * which "new_cgrp" belongs to, for which it should match "new_cgrp".
+ */
+static bool compare_css_sets(struct css_set *cset,
+ struct css_set *old_cset,
+ struct cgroup *new_cgrp,
+ struct cgroup_subsys_state *template[])
+{
+ struct cgroup *new_dfl_cgrp;
+ struct list_head *l1, *l2;
+
+ /*
+ * On the default hierarchy, there can be csets which are
+ * associated with the same set of cgroups but different csses.
+ * Let's first ensure that csses match.
+ */
+ if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
+ return false;
+
+
+ /* @cset's domain should match the default cgroup's */
+ if (cgroup_on_dfl(new_cgrp))
+ new_dfl_cgrp = new_cgrp;
+ else
+ new_dfl_cgrp = old_cset->dfl_cgrp;
+
+ if (new_dfl_cgrp->dom_cgrp != cset->dom_cset->dfl_cgrp)
+ return false;
+
+ /*
+ * Compare cgroup pointers in order to distinguish between
+ * different cgroups in hierarchies. As different cgroups may
+ * share the same effective css, this comparison is always
+ * necessary.
+ */
+ l1 = &cset->cgrp_links;
+ l2 = &old_cset->cgrp_links;
+ while (1) {
+ struct cgrp_cset_link *link1, *link2;
+ struct cgroup *cgrp1, *cgrp2;
+
+ l1 = l1->next;
+ l2 = l2->next;
+ /* See if we reached the end - both lists are equal length. */
+ if (l1 == &cset->cgrp_links) {
+ BUG_ON(l2 != &old_cset->cgrp_links);
+ break;
+ } else {
+ BUG_ON(l2 == &old_cset->cgrp_links);
+ }
+ /* Locate the cgroups associated with these links. */
+ link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
+ link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
+ cgrp1 = link1->cgrp;
+ cgrp2 = link2->cgrp;
+ /* Hierarchies should be linked in the same order. */
+ BUG_ON(cgrp1->root != cgrp2->root);
+
+ /*
+ * If this hierarchy is the hierarchy of the cgroup
+ * that's changing, then we need to check that this
+ * css_set points to the new cgroup; if it's any other
+ * hierarchy, then this css_set should point to the
+ * same cgroup as the old css_set.
+ */
+ if (cgrp1->root == new_cgrp->root) {
+ if (cgrp1 != new_cgrp)
+ return false;
+ } else {
+ if (cgrp1 != cgrp2)
+ return false;
+ }
+ }
+ return true;
+}
+
+/**
+ * find_existing_css_set - init css array and find the matching css_set
+ * @old_cset: the css_set that we're using before the cgroup transition
+ * @cgrp: the cgroup that we're moving into
+ * @template: out param for the new set of csses, should be clear on entry
+ */
+static struct css_set *find_existing_css_set(struct css_set *old_cset,
+ struct cgroup *cgrp,
+ struct cgroup_subsys_state **template)
+{
+ struct cgroup_root *root = cgrp->root;
+ struct cgroup_subsys *ss;
+ struct css_set *cset;
+ unsigned long key;
+ int i;
+
+ /*
+ * Build the set of subsystem state objects that we want to see in the
+ * new css_set. While subsystems can change globally, the entries here
+ * won't change, so no need for locking.
+ */
+ for_each_subsys(ss, i) {
+ if (root->subsys_mask & (1UL << i)) {
+ /*
+ * @ss is in this hierarchy, so we want the
+ * effective css from @cgrp.
+ */
+ template[i] = cgroup_e_css_by_mask(cgrp, ss);
+ } else {
+ /*
+ * @ss is not in this hierarchy, so we don't want
+ * to change the css.
+ */
+ template[i] = old_cset->subsys[i];
+ }
+ }
+
+ key = css_set_hash(template);
+ hash_for_each_possible(css_set_table, cset, hlist, key) {
+ if (!compare_css_sets(cset, old_cset, cgrp, template))
+ continue;
+
+ /* This css_set matches what we need */
+ return cset;
+ }
+
+ /* No existing cgroup group matched */
+ return NULL;
+}
+
+static void free_cgrp_cset_links(struct list_head *links_to_free)
+{
+ struct cgrp_cset_link *link, *tmp_link;
+
+ list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
+ list_del(&link->cset_link);
+ kfree(link);
+ }
+}
+
+/**
+ * allocate_cgrp_cset_links - allocate cgrp_cset_links
+ * @count: the number of links to allocate
+ * @tmp_links: list_head the allocated links are put on
+ *
+ * Allocate @count cgrp_cset_link structures and chain them on @tmp_links
+ * through ->cset_link. Returns 0 on success or -errno.
+ */
+static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
+{
+ struct cgrp_cset_link *link;
+ int i;
+
+ INIT_LIST_HEAD(tmp_links);
+
+ for (i = 0; i < count; i++) {
+ link = kzalloc(sizeof(*link), GFP_KERNEL);
+ if (!link) {
+ free_cgrp_cset_links(tmp_links);
+ return -ENOMEM;
+ }
+ list_add(&link->cset_link, tmp_links);
+ }
+ return 0;
+}
+
+/**
+ * link_css_set - a helper function to link a css_set to a cgroup
+ * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
+ * @cset: the css_set to be linked
+ * @cgrp: the destination cgroup
+ */
+static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
+ struct cgroup *cgrp)
+{
+ struct cgrp_cset_link *link;
+
+ BUG_ON(list_empty(tmp_links));
+
+ if (cgroup_on_dfl(cgrp))
+ cset->dfl_cgrp = cgrp;
+
+ link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
+ link->cset = cset;
+ link->cgrp = cgrp;
+
+ /*
+ * Always add links to the tail of the lists so that the lists are
+ * in chronological order.
+ */
+ list_move_tail(&link->cset_link, &cgrp->cset_links);
+ list_add_tail(&link->cgrp_link, &cset->cgrp_links);
+
+ if (cgroup_parent(cgrp))
+ cgroup_get_live(cgrp);
+}
+
+/**
+ * find_css_set - return a new css_set with one cgroup updated
+ * @old_cset: the baseline css_set
+ * @cgrp: the cgroup to be updated
+ *
+ * Return a new css_set that's equivalent to @old_cset, but with @cgrp
+ * substituted into the appropriate hierarchy.
+ */
+static struct css_set *find_css_set(struct css_set *old_cset,
+ struct cgroup *cgrp)
+{
+ struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
+ struct css_set *cset;
+ struct list_head tmp_links;
+ struct cgrp_cset_link *link;
+ struct cgroup_subsys *ss;
+ unsigned long key;
+ int ssid;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ /* First see if we already have a cgroup group that matches
+ * the desired set */
+ spin_lock_irq(&css_set_lock);
+ cset = find_existing_css_set(old_cset, cgrp, template);
+ if (cset)
+ get_css_set(cset);
+ spin_unlock_irq(&css_set_lock);
+
+ if (cset)
+ return cset;
+
+ cset = kzalloc(sizeof(*cset), GFP_KERNEL);
+ if (!cset)
+ return NULL;
+
+ /* Allocate all the cgrp_cset_link objects that we'll need */
+ if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
+ kfree(cset);
+ return NULL;
+ }
+
+ refcount_set(&cset->refcount, 1);
+ cset->dom_cset = cset;
+ INIT_LIST_HEAD(&cset->tasks);
+ INIT_LIST_HEAD(&cset->mg_tasks);
+ INIT_LIST_HEAD(&cset->dying_tasks);
+ INIT_LIST_HEAD(&cset->task_iters);
+ INIT_LIST_HEAD(&cset->threaded_csets);
+ INIT_HLIST_NODE(&cset->hlist);
+ INIT_LIST_HEAD(&cset->cgrp_links);
+ INIT_LIST_HEAD(&cset->mg_src_preload_node);
+ INIT_LIST_HEAD(&cset->mg_dst_preload_node);
+ INIT_LIST_HEAD(&cset->mg_node);
+
+ /* Copy the set of subsystem state objects generated in
+ * find_existing_css_set() */
+ memcpy(cset->subsys, template, sizeof(cset->subsys));
+
+ spin_lock_irq(&css_set_lock);
+ /* Add reference counts and links from the new css_set. */
+ list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
+ struct cgroup *c = link->cgrp;
+
+ if (c->root == cgrp->root)
+ c = cgrp;
+ link_css_set(&tmp_links, cset, c);
+ }
+
+ BUG_ON(!list_empty(&tmp_links));
+
+ css_set_count++;
+
+ /* Add @cset to the hash table */
+ key = css_set_hash(cset->subsys);
+ hash_add(css_set_table, &cset->hlist, key);
+
+ for_each_subsys(ss, ssid) {
+ struct cgroup_subsys_state *css = cset->subsys[ssid];
+
+ list_add_tail(&cset->e_cset_node[ssid],
+ &css->cgroup->e_csets[ssid]);
+ css_get(css);
+ }
+
+ spin_unlock_irq(&css_set_lock);
+
+ /*
+ * If @cset should be threaded, look up the matching dom_cset and
+ * link them up. We first fully initialize @cset then look for the
+ * dom_cset. It's simpler this way and safe as @cset is guaranteed
+ * to stay empty until we return.
+ */
+ if (cgroup_is_threaded(cset->dfl_cgrp)) {
+ struct css_set *dcset;
+
+ dcset = find_css_set(cset, cset->dfl_cgrp->dom_cgrp);
+ if (!dcset) {
+ put_css_set(cset);
+ return NULL;
+ }
+
+ spin_lock_irq(&css_set_lock);
+ cset->dom_cset = dcset;
+ list_add_tail(&cset->threaded_csets_node,
+ &dcset->threaded_csets);
+ spin_unlock_irq(&css_set_lock);
+ }
+
+ return cset;
+}
+
+struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
+{
+ struct cgroup *root_cgrp = kernfs_root_to_node(kf_root)->priv;
+
+ return root_cgrp->root;
+}
+
+void cgroup_favor_dynmods(struct cgroup_root *root, bool favor)
+{
+ bool favoring = root->flags & CGRP_ROOT_FAVOR_DYNMODS;
+
+ /*
+ * see the comment above CGRP_ROOT_FAVOR_DYNMODS definition.
+ * favordynmods can flip while task is between
+ * cgroup_threadgroup_change_begin() and end(), so down_write global
+ * cgroup_threadgroup_rwsem to synchronize them.
+ *
+ * Once cgroup_enable_per_threadgroup_rwsem is enabled, holding
+ * cgroup_threadgroup_rwsem doesn't exlude tasks between
+ * cgroup_thread_group_change_begin() and end() and thus it's unsafe to
+ * turn off. As the scenario is unlikely, simply disallow disabling once
+ * enabled and print out a warning.
+ */
+ percpu_down_write(&cgroup_threadgroup_rwsem);
+ if (favor && !favoring) {
+ cgroup_enable_per_threadgroup_rwsem = true;
+ rcu_sync_enter(&cgroup_threadgroup_rwsem.rss);
+ root->flags |= CGRP_ROOT_FAVOR_DYNMODS;
+ } else if (!favor && favoring) {
+ if (cgroup_enable_per_threadgroup_rwsem)
+ pr_warn_once("cgroup favordynmods: per threadgroup rwsem mechanism can't be disabled\n");
+ rcu_sync_exit(&cgroup_threadgroup_rwsem.rss);
+ root->flags &= ~CGRP_ROOT_FAVOR_DYNMODS;
+ }
+ percpu_up_write(&cgroup_threadgroup_rwsem);
+}
+
+static int cgroup_init_root_id(struct cgroup_root *root)
+{
+ int id;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL);
+ if (id < 0)
+ return id;
+
+ root->hierarchy_id = id;
+ return 0;
+}
+
+static void cgroup_exit_root_id(struct cgroup_root *root)
+{
+ lockdep_assert_held(&cgroup_mutex);
+
+ idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
+}
+
+void cgroup_free_root(struct cgroup_root *root)
+{
+ kfree_rcu(root, rcu);
+}
+
+static void cgroup_destroy_root(struct cgroup_root *root)
+{
+ struct cgroup *cgrp = &root->cgrp;
+ struct cgrp_cset_link *link, *tmp_link;
+ int ret;
+
+ trace_cgroup_destroy_root(root);
+
+ cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
+
+ BUG_ON(atomic_read(&root->nr_cgrps));
+ BUG_ON(!list_empty(&cgrp->self.children));
+
+ ret = blocking_notifier_call_chain(&cgroup_lifetime_notifier,
+ CGROUP_LIFETIME_OFFLINE, cgrp);
+ WARN_ON_ONCE(notifier_to_errno(ret));
+
+ /* Rebind all subsystems back to the default hierarchy */
+ WARN_ON(rebind_subsystems(&cgrp_dfl_root, root->subsys_mask));
+
+ /*
+ * Release all the links from cset_links to this hierarchy's
+ * root cgroup
+ */
+ spin_lock_irq(&css_set_lock);
+
+ list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
+ list_del(&link->cset_link);
+ list_del(&link->cgrp_link);
+ kfree(link);
+ }
+
+ spin_unlock_irq(&css_set_lock);
+
+ WARN_ON_ONCE(list_empty(&root->root_list));
+ list_del_rcu(&root->root_list);
+ cgroup_root_count--;
+
+ if (!have_favordynmods)
+ cgroup_favor_dynmods(root, false);
+
+ cgroup_exit_root_id(root);
+
+ cgroup_unlock();
+
+ kernfs_destroy_root(root->kf_root);
+ cgroup_free_root(root);
+}
+
+/*
+ * Returned cgroup is without refcount but it's valid as long as cset pins it.
+ */
+static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset,
+ struct cgroup_root *root)
+{
+ struct cgroup *res_cgroup = NULL;
+
+ if (cset == &init_css_set) {
+ res_cgroup = &root->cgrp;
+ } else if (root == &cgrp_dfl_root) {
+ res_cgroup = cset->dfl_cgrp;
+ } else {
+ struct cgrp_cset_link *link;
+ lockdep_assert_held(&css_set_lock);
+
+ list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
+ struct cgroup *c = link->cgrp;
+
+ if (c->root == root) {
+ res_cgroup = c;
+ break;
+ }
+ }
+ }
+
+ /*
+ * If cgroup_mutex is not held, the cgrp_cset_link will be freed
+ * before we remove the cgroup root from the root_list. Consequently,
+ * when accessing a cgroup root, the cset_link may have already been
+ * freed, resulting in a NULL res_cgroup. However, by holding the
+ * cgroup_mutex, we ensure that res_cgroup can't be NULL.
+ * If we don't hold cgroup_mutex in the caller, we must do the NULL
+ * check.
+ */
+ return res_cgroup;
+}
+
+/*
+ * look up cgroup associated with current task's cgroup namespace on the
+ * specified hierarchy
+ */
+static struct cgroup *
+current_cgns_cgroup_from_root(struct cgroup_root *root)
+{
+ struct cgroup *res = NULL;
+ struct css_set *cset;
+
+ lockdep_assert_held(&css_set_lock);
+
+ rcu_read_lock();
+
+ cset = current->nsproxy->cgroup_ns->root_cset;
+ res = __cset_cgroup_from_root(cset, root);
+
+ rcu_read_unlock();
+
+ /*
+ * The namespace_sem is held by current, so the root cgroup can't
+ * be umounted. Therefore, we can ensure that the res is non-NULL.
+ */
+ WARN_ON_ONCE(!res);
+ return res;
+}
+
+/*
+ * Look up cgroup associated with current task's cgroup namespace on the default
+ * hierarchy.
+ *
+ * Unlike current_cgns_cgroup_from_root(), this doesn't need locks:
+ * - Internal rcu_read_lock is unnecessary because we don't dereference any rcu
+ * pointers.
+ * - css_set_lock is not needed because we just read cset->dfl_cgrp.
+ * - As a bonus returned cgrp is pinned with the current because it cannot
+ * switch cgroup_ns asynchronously.
+ */
+static struct cgroup *current_cgns_cgroup_dfl(void)
+{
+ struct css_set *cset;
+
+ if (current->nsproxy) {
+ cset = current->nsproxy->cgroup_ns->root_cset;
+ return __cset_cgroup_from_root(cset, &cgrp_dfl_root);
+ } else {
+ /*
+ * NOTE: This function may be called from bpf_cgroup_from_id()
+ * on a task which has already passed exit_nsproxy_namespaces()
+ * and nsproxy == NULL. Fall back to cgrp_dfl_root which will
+ * make all cgroups visible for lookups.
+ */
+ return &cgrp_dfl_root.cgrp;
+ }
+}
+
+/* look up cgroup associated with given css_set on the specified hierarchy */
+static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
+ struct cgroup_root *root)
+{
+ lockdep_assert_held(&css_set_lock);
+
+ return __cset_cgroup_from_root(cset, root);
+}
+
+/*
+ * Return the cgroup for "task" from the given hierarchy. Must be
+ * called with css_set_lock held to prevent task's groups from being modified.
+ * Must be called with either cgroup_mutex or rcu read lock to prevent the
+ * cgroup root from being destroyed.
+ */
+struct cgroup *task_cgroup_from_root(struct task_struct *task,
+ struct cgroup_root *root)
+{
+ /*
+ * No need to lock the task - since we hold css_set_lock the
+ * task can't change groups.
+ */
+ return cset_cgroup_from_root(task_css_set(task), root);
+}
+
+/*
+ * A task must hold cgroup_mutex to modify cgroups.
+ *
+ * Any task can increment and decrement the count field without lock.
+ * So in general, code holding cgroup_mutex can't rely on the count
+ * field not changing. However, if the count goes to zero, then only
+ * cgroup_attach_task() can increment it again. Because a count of zero
+ * means that no tasks are currently attached, therefore there is no
+ * way a task attached to that cgroup can fork (the other way to
+ * increment the count). So code holding cgroup_mutex can safely
+ * assume that if the count is zero, it will stay zero. Similarly, if
+ * a task holds cgroup_mutex on a cgroup with zero count, it
+ * knows that the cgroup won't be removed, as cgroup_rmdir()
+ * needs that mutex.
+ *
+ * A cgroup can only be deleted if both its 'count' of using tasks
+ * is zero, and its list of 'children' cgroups is empty. Since all
+ * tasks in the system use _some_ cgroup, and since there is always at
+ * least one task in the system (init, pid == 1), therefore, root cgroup
+ * always has either children cgroups and/or using tasks. So we don't
+ * need a special hack to ensure that root cgroup cannot be deleted.
+ *
+ * P.S. One more locking exception. RCU is used to guard the
+ * update of a tasks cgroup pointer by cgroup_attach_task()
+ */
+
+static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
+
+static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
+ char *buf)
+{
+ struct cgroup_subsys *ss = cft->ss;
+
+ if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
+ !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
+ const char *dbg = (cft->flags & CFTYPE_DEBUG) ? ".__DEBUG__." : "";
+
+ snprintf(buf, CGROUP_FILE_NAME_MAX, "%s%s.%s",
+ dbg, cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
+ cft->name);
+ } else {
+ strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
+ }
+ return buf;
+}
+
+/**
+ * cgroup_file_mode - deduce file mode of a control file
+ * @cft: the control file in question
+ *
+ * S_IRUGO for read, S_IWUSR for write.
+ */
+static umode_t cgroup_file_mode(const struct cftype *cft)
+{
+ umode_t mode = 0;
+
+ if (cft->read_u64 || cft->read_s64 || cft->seq_show)
+ mode |= S_IRUGO;
+
+ if (cft->write_u64 || cft->write_s64 || cft->write) {
+ if (cft->flags & CFTYPE_WORLD_WRITABLE)
+ mode |= S_IWUGO;
+ else
+ mode |= S_IWUSR;
+ }
+
+ return mode;
+}
+
+/**
+ * cgroup_calc_subtree_ss_mask - calculate subtree_ss_mask
+ * @subtree_control: the new subtree_control mask to consider
+ * @this_ss_mask: available subsystems
+ *
+ * On the default hierarchy, a subsystem may request other subsystems to be
+ * enabled together through its ->depends_on mask. In such cases, more
+ * subsystems than specified in "cgroup.subtree_control" may be enabled.
+ *
+ * This function calculates which subsystems need to be enabled if
+ * @subtree_control is to be applied while restricted to @this_ss_mask.
+ */
+static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask)
+{
+ u16 cur_ss_mask = subtree_control;
+ struct cgroup_subsys *ss;
+ int ssid;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ cur_ss_mask |= cgrp_dfl_implicit_ss_mask;
+
+ while (true) {
+ u16 new_ss_mask = cur_ss_mask;
+
+ do_each_subsys_mask(ss, ssid, cur_ss_mask) {
+ new_ss_mask |= ss->depends_on;
+ } while_each_subsys_mask();
+
+ /*
+ * Mask out subsystems which aren't available. This can
+ * happen only if some depended-upon subsystems were bound
+ * to non-default hierarchies.
+ */
+ new_ss_mask &= this_ss_mask;
+
+ if (new_ss_mask == cur_ss_mask)
+ break;
+ cur_ss_mask = new_ss_mask;
+ }
+
+ return cur_ss_mask;
+}
+
+/**
+ * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
+ * @kn: the kernfs_node being serviced
+ *
+ * This helper undoes cgroup_kn_lock_live() and should be invoked before
+ * the method finishes if locking succeeded. Note that once this function
+ * returns the cgroup returned by cgroup_kn_lock_live() may become
+ * inaccessible any time. If the caller intends to continue to access the
+ * cgroup, it should pin it before invoking this function.
+ */
+void cgroup_kn_unlock(struct kernfs_node *kn)
+{
+ struct cgroup *cgrp;
+
+ if (kernfs_type(kn) == KERNFS_DIR)
+ cgrp = kn->priv;
+ else
+ cgrp = kn_priv(kn);
+
+ cgroup_unlock();
+
+ kernfs_unbreak_active_protection(kn);
+ cgroup_put(cgrp);
+}
+
+/**
+ * cgroup_kn_lock_live - locking helper for cgroup kernfs methods
+ * @kn: the kernfs_node being serviced
+ * @drain_offline: perform offline draining on the cgroup
+ *
+ * This helper is to be used by a cgroup kernfs method currently servicing
+ * @kn. It breaks the active protection, performs cgroup locking and
+ * verifies that the associated cgroup is alive. Returns the cgroup if
+ * alive; otherwise, %NULL. A successful return should be undone by a
+ * matching cgroup_kn_unlock() invocation. If @drain_offline is %true, the
+ * cgroup is drained of offlining csses before return.
+ *
+ * Any cgroup kernfs method implementation which requires locking the
+ * associated cgroup should use this helper. It avoids nesting cgroup
+ * locking under kernfs active protection and allows all kernfs operations
+ * including self-removal.
+ */
+struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline)
+{
+ struct cgroup *cgrp;
+
+ if (kernfs_type(kn) == KERNFS_DIR)
+ cgrp = kn->priv;
+ else
+ cgrp = kn_priv(kn);
+
+ /*
+ * We're gonna grab cgroup_mutex which nests outside kernfs
+ * active_ref. cgroup liveliness check alone provides enough
+ * protection against removal. Ensure @cgrp stays accessible and
+ * break the active_ref protection.
+ */
+ if (!cgroup_tryget(cgrp))
+ return NULL;
+ kernfs_break_active_protection(kn);
+
+ if (drain_offline)
+ cgroup_lock_and_drain_offline(cgrp);
+ else
+ cgroup_lock();
+
+ if (!cgroup_is_dead(cgrp))
+ return cgrp;
+
+ cgroup_kn_unlock(kn);
+ return NULL;
+}
+
+static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
+{
+ char name[CGROUP_FILE_NAME_MAX];
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (cft->file_offset) {
+ struct cgroup_subsys_state *css = cgroup_css(cgrp, cft->ss);
+ struct cgroup_file *cfile = (void *)css + cft->file_offset;
+
+ spin_lock_irq(&cgroup_file_kn_lock);
+ cfile->kn = NULL;
+ spin_unlock_irq(&cgroup_file_kn_lock);
+
+ timer_delete_sync(&cfile->notify_timer);
+ }
+
+ kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
+}
+
+/**
+ * css_clear_dir - remove subsys files in a cgroup directory
+ * @css: target css
+ */
+static void css_clear_dir(struct cgroup_subsys_state *css)
+{
+ struct cgroup *cgrp = css->cgroup;
+ struct cftype *cfts;
+
+ if (!(css->flags & CSS_VISIBLE))
+ return;
+
+ css->flags &= ~CSS_VISIBLE;
+
+ if (css_is_self(css)) {
+ if (cgroup_on_dfl(cgrp)) {
+ cgroup_addrm_files(css, cgrp,
+ cgroup_base_files, false);
+ if (cgroup_psi_enabled())
+ cgroup_addrm_files(css, cgrp,
+ cgroup_psi_files, false);
+ } else {
+ cgroup_addrm_files(css, cgrp,
+ cgroup1_base_files, false);
+ }
+ } else {
+ list_for_each_entry(cfts, &css->ss->cfts, node)
+ cgroup_addrm_files(css, cgrp, cfts, false);
+ }
+}
+
+/**
+ * css_populate_dir - create subsys files in a cgroup directory
+ * @css: target css
+ *
+ * On failure, no file is added.
+ */
+static int css_populate_dir(struct cgroup_subsys_state *css)
+{
+ struct cgroup *cgrp = css->cgroup;
+ struct cftype *cfts, *failed_cfts;
+ int ret;
+
+ if (css->flags & CSS_VISIBLE)
+ return 0;
+
+ if (css_is_self(css)) {
+ if (cgroup_on_dfl(cgrp)) {
+ ret = cgroup_addrm_files(css, cgrp,
+ cgroup_base_files, true);
+ if (ret < 0)
+ return ret;
+
+ if (cgroup_psi_enabled()) {
+ ret = cgroup_addrm_files(css, cgrp,
+ cgroup_psi_files, true);
+ if (ret < 0) {
+ cgroup_addrm_files(css, cgrp,
+ cgroup_base_files, false);
+ return ret;
+ }
+ }
+ } else {
+ ret = cgroup_addrm_files(css, cgrp,
+ cgroup1_base_files, true);
+ if (ret < 0)
+ return ret;
+ }
+ } else {
+ list_for_each_entry(cfts, &css->ss->cfts, node) {
+ ret = cgroup_addrm_files(css, cgrp, cfts, true);
+ if (ret < 0) {
+ failed_cfts = cfts;
+ goto err;
+ }
+ }
+ }
+
+ css->flags |= CSS_VISIBLE;
+
+ return 0;
+err:
+ list_for_each_entry(cfts, &css->ss->cfts, node) {
+ if (cfts == failed_cfts)
+ break;
+ cgroup_addrm_files(css, cgrp, cfts, false);
+ }
+ return ret;
+}
+
+int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
+{
+ struct cgroup *dcgrp = &dst_root->cgrp;
+ struct cgroup_subsys *ss;
+ int ssid, ret;
+ u16 dfl_disable_ss_mask = 0;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ do_each_subsys_mask(ss, ssid, ss_mask) {
+ /*
+ * If @ss has non-root csses attached to it, can't move.
+ * If @ss is an implicit controller, it is exempt from this
+ * rule and can be stolen.
+ */
+ if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)) &&
+ !ss->implicit_on_dfl)
+ return -EBUSY;
+
+ /* can't move between two non-dummy roots either */
+ if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
+ return -EBUSY;
+
+ /*
+ * Collect ssid's that need to be disabled from default
+ * hierarchy.
+ */
+ if (ss->root == &cgrp_dfl_root)
+ dfl_disable_ss_mask |= 1 << ssid;
+
+ } while_each_subsys_mask();
+
+ if (dfl_disable_ss_mask) {
+ struct cgroup *scgrp = &cgrp_dfl_root.cgrp;
+
+ /*
+ * Controllers from default hierarchy that need to be rebound
+ * are all disabled together in one go.
+ */
+ cgrp_dfl_root.subsys_mask &= ~dfl_disable_ss_mask;
+ WARN_ON(cgroup_apply_control(scgrp));
+ cgroup_finalize_control(scgrp, 0);
+ }
+
+ do_each_subsys_mask(ss, ssid, ss_mask) {
+ struct cgroup_root *src_root = ss->root;
+ struct cgroup *scgrp = &src_root->cgrp;
+ struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
+ struct css_set *cset, *cset_pos;
+ struct css_task_iter *it;
+
+ WARN_ON(!css || cgroup_css(dcgrp, ss));
+
+ if (src_root != &cgrp_dfl_root) {
+ /* disable from the source */
+ src_root->subsys_mask &= ~(1 << ssid);
+ WARN_ON(cgroup_apply_control(scgrp));
+ cgroup_finalize_control(scgrp, 0);
+ }
+
+ /* rebind */
+ RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
+ rcu_assign_pointer(dcgrp->subsys[ssid], css);
+ ss->root = dst_root;
+
+ spin_lock_irq(&css_set_lock);
+ css->cgroup = dcgrp;
+ WARN_ON(!list_empty(&dcgrp->e_csets[ss->id]));
+ list_for_each_entry_safe(cset, cset_pos, &scgrp->e_csets[ss->id],
+ e_cset_node[ss->id]) {
+ list_move_tail(&cset->e_cset_node[ss->id],
+ &dcgrp->e_csets[ss->id]);
+ /*
+ * all css_sets of scgrp together in same order to dcgrp,
+ * patch in-flight iterators to preserve correct iteration.
+ * since the iterator is always advanced right away and
+ * finished when it->cset_pos meets it->cset_head, so only
+ * update it->cset_head is enough here.
+ */
+ list_for_each_entry(it, &cset->task_iters, iters_node)
+ if (it->cset_head == &scgrp->e_csets[ss->id])
+ it->cset_head = &dcgrp->e_csets[ss->id];
+ }
+ spin_unlock_irq(&css_set_lock);
+
+ /* default hierarchy doesn't enable controllers by default */
+ dst_root->subsys_mask |= 1 << ssid;
+ if (dst_root == &cgrp_dfl_root) {
+ static_branch_enable(cgroup_subsys_on_dfl_key[ssid]);
+ } else {
+ dcgrp->subtree_control |= 1 << ssid;
+ static_branch_disable(cgroup_subsys_on_dfl_key[ssid]);
+ }
+
+ ret = cgroup_apply_control(dcgrp);
+ if (ret)
+ pr_warn("partial failure to rebind %s controller (err=%d)\n",
+ ss->name, ret);
+
+ if (ss->bind)
+ ss->bind(css);
+ } while_each_subsys_mask();
+
+ kernfs_activate(dcgrp->kn);
+ return 0;
+}
+
+int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
+ struct kernfs_root *kf_root)
+{
+ int len = 0;
+ char *buf = NULL;
+ struct cgroup_root *kf_cgroot = cgroup_root_from_kf(kf_root);
+ struct cgroup *ns_cgroup;
+
+ buf = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ spin_lock_irq(&css_set_lock);
+ ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot);
+ len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX);
+ spin_unlock_irq(&css_set_lock);
+
+ if (len == -E2BIG)
+ len = -ERANGE;
+ else if (len > 0) {
+ seq_escape(sf, buf, " \t\n\\");
+ len = 0;
+ }
+ kfree(buf);
+ return len;
+}
+
+enum cgroup2_param {
+ Opt_nsdelegate,
+ Opt_favordynmods,
+ Opt_memory_localevents,
+ Opt_memory_recursiveprot,
+ Opt_memory_hugetlb_accounting,
+ Opt_pids_localevents,
+ nr__cgroup2_params
+};
+
+static const struct fs_parameter_spec cgroup2_fs_parameters[] = {
+ fsparam_flag("nsdelegate", Opt_nsdelegate),
+ fsparam_flag("favordynmods", Opt_favordynmods),
+ fsparam_flag("memory_localevents", Opt_memory_localevents),
+ fsparam_flag("memory_recursiveprot", Opt_memory_recursiveprot),
+ fsparam_flag("memory_hugetlb_accounting", Opt_memory_hugetlb_accounting),
+ fsparam_flag("pids_localevents", Opt_pids_localevents),
+ {}
+};
+
+static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
+ struct fs_parse_result result;
+ int opt;
+
+ opt = fs_parse(fc, cgroup2_fs_parameters, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_nsdelegate:
+ ctx->flags |= CGRP_ROOT_NS_DELEGATE;
+ return 0;
+ case Opt_favordynmods:
+ ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS;
+ return 0;
+ case Opt_memory_localevents:
+ ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
+ return 0;
+ case Opt_memory_recursiveprot:
+ ctx->flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
+ return 0;
+ case Opt_memory_hugetlb_accounting:
+ ctx->flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
+ return 0;
+ case Opt_pids_localevents:
+ ctx->flags |= CGRP_ROOT_PIDS_LOCAL_EVENTS;
+ return 0;
+ }
+ return -EINVAL;
+}
+
+struct cgroup_of_peak *of_peak(struct kernfs_open_file *of)
+{
+ struct cgroup_file_ctx *ctx = of->priv;
+
+ return &ctx->peak;
+}
+
+static void apply_cgroup_root_flags(unsigned int root_flags)
+{
+ if (current->nsproxy->cgroup_ns == &init_cgroup_ns) {
+ if (root_flags & CGRP_ROOT_NS_DELEGATE)
+ cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE;
+ else
+ cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;
+
+ cgroup_favor_dynmods(&cgrp_dfl_root,
+ root_flags & CGRP_ROOT_FAVOR_DYNMODS);
+
+ if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
+ cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
+ else
+ cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_LOCAL_EVENTS;
+
+ if (root_flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
+ cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
+ else
+ cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_RECURSIVE_PROT;
+
+ if (root_flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING)
+ cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
+ else
+ cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
+
+ if (root_flags & CGRP_ROOT_PIDS_LOCAL_EVENTS)
+ cgrp_dfl_root.flags |= CGRP_ROOT_PIDS_LOCAL_EVENTS;
+ else
+ cgrp_dfl_root.flags &= ~CGRP_ROOT_PIDS_LOCAL_EVENTS;
+ }
+}
+
+static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root)
+{
+ if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
+ seq_puts(seq, ",nsdelegate");
+ if (cgrp_dfl_root.flags & CGRP_ROOT_FAVOR_DYNMODS)
+ seq_puts(seq, ",favordynmods");
+ if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
+ seq_puts(seq, ",memory_localevents");
+ if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
+ seq_puts(seq, ",memory_recursiveprot");
+ if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING)
+ seq_puts(seq, ",memory_hugetlb_accounting");
+ if (cgrp_dfl_root.flags & CGRP_ROOT_PIDS_LOCAL_EVENTS)
+ seq_puts(seq, ",pids_localevents");
+ return 0;
+}
+
+static int cgroup_reconfigure(struct fs_context *fc)
+{
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
+
+ apply_cgroup_root_flags(ctx->flags);
+ return 0;
+}
+
+static void init_cgroup_housekeeping(struct cgroup *cgrp)
+{
+ struct cgroup_subsys *ss;
+ int ssid;
+
+ INIT_LIST_HEAD(&cgrp->self.sibling);
+ INIT_LIST_HEAD(&cgrp->self.children);
+ INIT_LIST_HEAD(&cgrp->cset_links);
+ INIT_LIST_HEAD(&cgrp->pidlists);
+ mutex_init(&cgrp->pidlist_mutex);
+ cgrp->self.cgroup = cgrp;
+ cgrp->self.flags |= CSS_ONLINE;
+ cgrp->dom_cgrp = cgrp;
+ cgrp->max_descendants = INT_MAX;
+ cgrp->max_depth = INT_MAX;
+ prev_cputime_init(&cgrp->prev_cputime);
+
+ for_each_subsys(ss, ssid)
+ INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
+
+#ifdef CONFIG_CGROUP_BPF
+ for (int i = 0; i < ARRAY_SIZE(cgrp->bpf.revisions); i++)
+ cgrp->bpf.revisions[i] = 1;
+#endif
+
+ init_waitqueue_head(&cgrp->offline_waitq);
+ INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
+}
+
+void init_cgroup_root(struct cgroup_fs_context *ctx)
+{
+ struct cgroup_root *root = ctx->root;
+ struct cgroup *cgrp = &root->cgrp;
+
+ INIT_LIST_HEAD_RCU(&root->root_list);
+ atomic_set(&root->nr_cgrps, 1);
+ cgrp->root = root;
+ init_cgroup_housekeeping(cgrp);
+
+ /* DYNMODS must be modified through cgroup_favor_dynmods() */
+ root->flags = ctx->flags & ~CGRP_ROOT_FAVOR_DYNMODS;
+ if (ctx->release_agent)
+ strscpy(root->release_agent_path, ctx->release_agent, PATH_MAX);
+ if (ctx->name)
+ strscpy(root->name, ctx->name, MAX_CGROUP_ROOT_NAMELEN);
+ if (ctx->cpuset_clone_children)
+ set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
+}
+
+int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
+{
+ LIST_HEAD(tmp_links);
+ struct cgroup *root_cgrp = &root->cgrp;
+ struct kernfs_syscall_ops *kf_sops;
+ struct css_set *cset;
+ int i, ret;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release,
+ 0, GFP_KERNEL);
+ if (ret)
+ goto out;
+
+ /*
+ * We're accessing css_set_count without locking css_set_lock here,
+ * but that's OK - it can only be increased by someone holding
+ * cgroup_lock, and that's us. Later rebinding may disable
+ * controllers on the default hierarchy and thus create new csets,
+ * which can't be more than the existing ones. Allocate 2x.
+ */
+ ret = allocate_cgrp_cset_links(2 * css_set_count, &tmp_links);
+ if (ret)
+ goto cancel_ref;
+
+ ret = cgroup_init_root_id(root);
+ if (ret)
+ goto cancel_ref;
+
+ kf_sops = root == &cgrp_dfl_root ?
+ &cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops;
+
+ root->kf_root = kernfs_create_root(kf_sops,
+ KERNFS_ROOT_CREATE_DEACTIVATED |
+ KERNFS_ROOT_SUPPORT_EXPORTOP |
+ KERNFS_ROOT_SUPPORT_USER_XATTR |
+ KERNFS_ROOT_INVARIANT_PARENT,
+ root_cgrp);
+ if (IS_ERR(root->kf_root)) {
+ ret = PTR_ERR(root->kf_root);
+ goto exit_root_id;
+ }
+ root_cgrp->kn = kernfs_root_to_node(root->kf_root);
+ WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1);
+ root_cgrp->ancestors[0] = root_cgrp;
+
+ ret = css_populate_dir(&root_cgrp->self);
+ if (ret)
+ goto destroy_root;
+
+ ret = css_rstat_init(&root_cgrp->self);
+ if (ret)
+ goto destroy_root;
+
+ ret = rebind_subsystems(root, ss_mask);
+ if (ret)
+ goto exit_stats;
+
+ ret = blocking_notifier_call_chain(&cgroup_lifetime_notifier,
+ CGROUP_LIFETIME_ONLINE, root_cgrp);
+ WARN_ON_ONCE(notifier_to_errno(ret));
+
+ trace_cgroup_setup_root(root);
+
+ /*
+ * There must be no failure case after here, since rebinding takes
+ * care of subsystems' refcounts, which are explicitly dropped in
+ * the failure exit path.
+ */
+ list_add_rcu(&root->root_list, &cgroup_roots);
+ cgroup_root_count++;
+
+ /*
+ * Link the root cgroup in this hierarchy into all the css_set
+ * objects.
+ */
+ spin_lock_irq(&css_set_lock);
+ hash_for_each(css_set_table, i, cset, hlist) {
+ link_css_set(&tmp_links, cset, root_cgrp);
+ if (css_set_populated(cset))
+ cgroup_update_populated(root_cgrp, true);
+ }
+ spin_unlock_irq(&css_set_lock);
+
+ BUG_ON(!list_empty(&root_cgrp->self.children));
+ BUG_ON(atomic_read(&root->nr_cgrps) != 1);
+
+ ret = 0;
+ goto out;
+
+exit_stats:
+ css_rstat_exit(&root_cgrp->self);
+destroy_root:
+ kernfs_destroy_root(root->kf_root);
+ root->kf_root = NULL;
+exit_root_id:
+ cgroup_exit_root_id(root);
+cancel_ref:
+ percpu_ref_exit(&root_cgrp->self.refcnt);
+out:
+ free_cgrp_cset_links(&tmp_links);
+ return ret;
+}
+
+int cgroup_do_get_tree(struct fs_context *fc)
+{
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
+ int ret;
+
+ ctx->kfc.root = ctx->root->kf_root;
+ if (fc->fs_type == &cgroup2_fs_type)
+ ctx->kfc.magic = CGROUP2_SUPER_MAGIC;
+ else
+ ctx->kfc.magic = CGROUP_SUPER_MAGIC;
+ ret = kernfs_get_tree(fc);
+
+ /*
+ * In non-init cgroup namespace, instead of root cgroup's dentry,
+ * we return the dentry corresponding to the cgroupns->root_cgrp.
+ */
+ if (!ret && ctx->ns != &init_cgroup_ns) {
+ struct dentry *nsdentry;
+ struct super_block *sb = fc->root->d_sb;
+ struct cgroup *cgrp;
+
+ cgroup_lock();
+ spin_lock_irq(&css_set_lock);
+
+ cgrp = cset_cgroup_from_root(ctx->ns->root_cset, ctx->root);
+
+ spin_unlock_irq(&css_set_lock);
+ cgroup_unlock();
+
+ nsdentry = kernfs_node_dentry(cgrp->kn, sb);
+ dput(fc->root);
+ if (IS_ERR(nsdentry)) {
+ deactivate_locked_super(sb);
+ ret = PTR_ERR(nsdentry);
+ nsdentry = NULL;
+ }
+ fc->root = nsdentry;
+ }
+
+ if (!ctx->kfc.new_sb_created)
+ cgroup_put(&ctx->root->cgrp);
+
+ return ret;
+}
+
+/*
+ * Destroy a cgroup filesystem context.
+ */
+static void cgroup_fs_context_free(struct fs_context *fc)
+{
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
+
+ kfree(ctx->name);
+ kfree(ctx->release_agent);
+ put_cgroup_ns(ctx->ns);
+ kernfs_free_fs_context(fc);
+ kfree(ctx);
+}
+
+static int cgroup_get_tree(struct fs_context *fc)
+{
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
+ int ret;
+
+ WRITE_ONCE(cgrp_dfl_visible, true);
+ cgroup_get_live(&cgrp_dfl_root.cgrp);
+ ctx->root = &cgrp_dfl_root;
+
+ ret = cgroup_do_get_tree(fc);
+ if (!ret)
+ apply_cgroup_root_flags(ctx->flags);
+ return ret;
+}
+
+static const struct fs_context_operations cgroup_fs_context_ops = {
+ .free = cgroup_fs_context_free,
+ .parse_param = cgroup2_parse_param,
+ .get_tree = cgroup_get_tree,
+ .reconfigure = cgroup_reconfigure,
+};
+
+static const struct fs_context_operations cgroup1_fs_context_ops = {
+ .free = cgroup_fs_context_free,
+ .parse_param = cgroup1_parse_param,
+ .get_tree = cgroup1_get_tree,
+ .reconfigure = cgroup1_reconfigure,
+};
+
+/*
+ * Initialise the cgroup filesystem creation/reconfiguration context. Notably,
+ * we select the namespace we're going to use.
+ */
+static int cgroup_init_fs_context(struct fs_context *fc)
+{
+ struct cgroup_fs_context *ctx;
+
+ ctx = kzalloc(sizeof(struct cgroup_fs_context), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->ns = current->nsproxy->cgroup_ns;
+ get_cgroup_ns(ctx->ns);
+ fc->fs_private = &ctx->kfc;
+ if (fc->fs_type == &cgroup2_fs_type)
+ fc->ops = &cgroup_fs_context_ops;
+ else
+ fc->ops = &cgroup1_fs_context_ops;
+ put_user_ns(fc->user_ns);
+ fc->user_ns = get_user_ns(ctx->ns->user_ns);
+ fc->global = true;
+
+ if (have_favordynmods)
+ ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS;
+
+ return 0;
+}
+
+static void cgroup_kill_sb(struct super_block *sb)
+{
+ struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
+ struct cgroup_root *root = cgroup_root_from_kf(kf_root);
+
+ /*
+ * If @root doesn't have any children, start killing it.
+ * This prevents new mounts by disabling percpu_ref_tryget_live().
+ *
+ * And don't kill the default root.
+ */
+ if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root &&
+ !percpu_ref_is_dying(&root->cgrp.self.refcnt))
+ percpu_ref_kill(&root->cgrp.self.refcnt);
+ cgroup_put(&root->cgrp);
+ kernfs_kill_sb(sb);
+}
+
+struct file_system_type cgroup_fs_type = {
+ .name = "cgroup",
+ .init_fs_context = cgroup_init_fs_context,
+ .parameters = cgroup1_fs_parameters,
+ .kill_sb = cgroup_kill_sb,
+ .fs_flags = FS_USERNS_MOUNT,
+};
+
+static struct file_system_type cgroup2_fs_type = {
+ .name = "cgroup2",
+ .init_fs_context = cgroup_init_fs_context,
+ .parameters = cgroup2_fs_parameters,
+ .kill_sb = cgroup_kill_sb,
+ .fs_flags = FS_USERNS_MOUNT,
+};
+
+#ifdef CONFIG_CPUSETS_V1
+enum cpuset_param {
+ Opt_cpuset_v2_mode,
+};
+
+static const struct fs_parameter_spec cpuset_fs_parameters[] = {
+ fsparam_flag ("cpuset_v2_mode", Opt_cpuset_v2_mode),
+ {}
+};
+
+static int cpuset_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
+ struct fs_parse_result result;
+ int opt;
+
+ opt = fs_parse(fc, cpuset_fs_parameters, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_cpuset_v2_mode:
+ ctx->flags |= CGRP_ROOT_CPUSET_V2_MODE;
+ return 0;
+ }
+ return -EINVAL;
+}
+
+static const struct fs_context_operations cpuset_fs_context_ops = {
+ .get_tree = cgroup1_get_tree,
+ .free = cgroup_fs_context_free,
+ .parse_param = cpuset_parse_param,
+};
+
+/*
+ * This is ugly, but preserves the userspace API for existing cpuset
+ * users. If someone tries to mount the "cpuset" filesystem, we
+ * silently switch it to mount "cgroup" instead
+ */
+static int cpuset_init_fs_context(struct fs_context *fc)
+{
+ char *agent = kstrdup("/sbin/cpuset_release_agent", GFP_USER);
+ struct cgroup_fs_context *ctx;
+ int err;
+
+ err = cgroup_init_fs_context(fc);
+ if (err) {
+ kfree(agent);
+ return err;
+ }
+
+ fc->ops = &cpuset_fs_context_ops;
+
+ ctx = cgroup_fc2context(fc);
+ ctx->subsys_mask = 1 << cpuset_cgrp_id;
+ ctx->flags |= CGRP_ROOT_NOPREFIX;
+ ctx->release_agent = agent;
+
+ get_filesystem(&cgroup_fs_type);
+ put_filesystem(fc->fs_type);
+ fc->fs_type = &cgroup_fs_type;
+
+ return 0;
+}
+
+static struct file_system_type cpuset_fs_type = {
+ .name = "cpuset",
+ .init_fs_context = cpuset_init_fs_context,
+ .parameters = cpuset_fs_parameters,
+ .fs_flags = FS_USERNS_MOUNT,
+};
+#endif
+
+int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
+ struct cgroup_namespace *ns)
+{
+ struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root);
+
+ return kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen);
+}
+
+int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
+ struct cgroup_namespace *ns)
+{
+ int ret;
+
+ cgroup_lock();
+ spin_lock_irq(&css_set_lock);
+
+ ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns);
+
+ spin_unlock_irq(&css_set_lock);
+ cgroup_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(cgroup_path_ns);
+
+/**
+ * cgroup_attach_lock - Lock for ->attach()
+ * @lock_mode: whether acquire and acquire which rwsem
+ * @tsk: thread group to lock
+ *
+ * cgroup migration sometimes needs to stabilize threadgroups against forks and
+ * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach()
+ * implementations (e.g. cpuset), also need to disable CPU hotplug.
+ * Unfortunately, letting ->attach() operations acquire cpus_read_lock() can
+ * lead to deadlocks.
+ *
+ * Bringing up a CPU may involve creating and destroying tasks which requires
+ * read-locking threadgroup_rwsem, so threadgroup_rwsem nests inside
+ * cpus_read_lock(). If we call an ->attach() which acquires the cpus lock while
+ * write-locking threadgroup_rwsem, the locking order is reversed and we end up
+ * waiting for an on-going CPU hotplug operation which in turn is waiting for
+ * the threadgroup_rwsem to be released to create new tasks. For more details:
+ *
+ * http://lkml.kernel.org/r/20220711174629.uehfmqegcwn2lqzu@wubuntu
+ *
+ * Resolve the situation by always acquiring cpus_read_lock() before optionally
+ * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that
+ * CPU hotplug is disabled on entry.
+ *
+ * When favordynmods is enabled, take per threadgroup rwsem to reduce overhead
+ * on dynamic cgroup modifications. see the comment above
+ * CGRP_ROOT_FAVOR_DYNMODS definition.
+ *
+ * tsk is not NULL only when writing to cgroup.procs.
+ */
+void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode,
+ struct task_struct *tsk)
+{
+ cpus_read_lock();
+
+ switch (lock_mode) {
+ case CGRP_ATTACH_LOCK_NONE:
+ break;
+ case CGRP_ATTACH_LOCK_GLOBAL:
+ percpu_down_write(&cgroup_threadgroup_rwsem);
+ break;
+ case CGRP_ATTACH_LOCK_PER_THREADGROUP:
+ down_write(&tsk->signal->cgroup_threadgroup_rwsem);
+ break;
+ default:
+ pr_warn("cgroup: Unexpected attach lock mode.");
+ break;
+ }
+}
+
+/**
+ * cgroup_attach_unlock - Undo cgroup_attach_lock()
+ * @lock_mode: whether release and release which rwsem
+ * @tsk: thread group to lock
+ */
+void cgroup_attach_unlock(enum cgroup_attach_lock_mode lock_mode,
+ struct task_struct *tsk)
+{
+ switch (lock_mode) {
+ case CGRP_ATTACH_LOCK_NONE:
+ break;
+ case CGRP_ATTACH_LOCK_GLOBAL:
+ percpu_up_write(&cgroup_threadgroup_rwsem);
+ break;
+ case CGRP_ATTACH_LOCK_PER_THREADGROUP:
+ up_write(&tsk->signal->cgroup_threadgroup_rwsem);
+ break;
+ default:
+ pr_warn("cgroup: Unexpected attach lock mode.");
+ break;
+ }
+
+ cpus_read_unlock();
+}
+
+/**
+ * cgroup_migrate_add_task - add a migration target task to a migration context
+ * @task: target task
+ * @mgctx: target migration context
+ *
+ * Add @task, which is a migration target, to @mgctx->tset. This function
+ * becomes noop if @task doesn't need to be migrated. @task's css_set
+ * should have been added as a migration source and @task->cg_list will be
+ * moved from the css_set's tasks list to mg_tasks one.
+ */
+static void cgroup_migrate_add_task(struct task_struct *task,
+ struct cgroup_mgctx *mgctx)
+{
+ struct css_set *cset;
+
+ lockdep_assert_held(&css_set_lock);
+
+ /* @task either already exited or can't exit until the end */
+ if (task->flags & PF_EXITING)
+ return;
+
+ /* cgroup_threadgroup_rwsem protects racing against forks */
+ WARN_ON_ONCE(list_empty(&task->cg_list));
+
+ cset = task_css_set(task);
+ if (!cset->mg_src_cgrp)
+ return;
+
+ mgctx->tset.nr_tasks++;
+
+ list_move_tail(&task->cg_list, &cset->mg_tasks);
+ if (list_empty(&cset->mg_node))
+ list_add_tail(&cset->mg_node,
+ &mgctx->tset.src_csets);
+ if (list_empty(&cset->mg_dst_cset->mg_node))
+ list_add_tail(&cset->mg_dst_cset->mg_node,
+ &mgctx->tset.dst_csets);
+}
+
+/**
+ * cgroup_taskset_first - reset taskset and return the first task
+ * @tset: taskset of interest
+ * @dst_cssp: output variable for the destination css
+ *
+ * @tset iteration is initialized and the first task is returned.
+ */
+struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset,
+ struct cgroup_subsys_state **dst_cssp)
+{
+ tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);
+ tset->cur_task = NULL;
+
+ return cgroup_taskset_next(tset, dst_cssp);
+}
+
+/**
+ * cgroup_taskset_next - iterate to the next task in taskset
+ * @tset: taskset of interest
+ * @dst_cssp: output variable for the destination css
+ *
+ * Return the next task in @tset. Iteration must have been initialized
+ * with cgroup_taskset_first().
+ */
+struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
+ struct cgroup_subsys_state **dst_cssp)
+{
+ struct css_set *cset = tset->cur_cset;
+ struct task_struct *task = tset->cur_task;
+
+ while (CGROUP_HAS_SUBSYS_CONFIG && &cset->mg_node != tset->csets) {
+ if (!task)
+ task = list_first_entry(&cset->mg_tasks,
+ struct task_struct, cg_list);
+ else
+ task = list_next_entry(task, cg_list);
+
+ if (&task->cg_list != &cset->mg_tasks) {
+ tset->cur_cset = cset;
+ tset->cur_task = task;
+
+ /*
+ * This function may be called both before and
+ * after cgroup_migrate_execute(). The two cases
+ * can be distinguished by looking at whether @cset
+ * has its ->mg_dst_cset set.
+ */
+ if (cset->mg_dst_cset)
+ *dst_cssp = cset->mg_dst_cset->subsys[tset->ssid];
+ else
+ *dst_cssp = cset->subsys[tset->ssid];
+
+ return task;
+ }
+
+ cset = list_next_entry(cset, mg_node);
+ task = NULL;
+ }
+
+ return NULL;
+}
+
+/**
+ * cgroup_migrate_execute - migrate a taskset
+ * @mgctx: migration context
+ *
+ * Migrate tasks in @mgctx as setup by migration preparation functions.
+ * This function fails iff one of the ->can_attach callbacks fails and
+ * guarantees that either all or none of the tasks in @mgctx are migrated.
+ * @mgctx is consumed regardless of success.
+ */
+static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
+{
+ struct cgroup_taskset *tset = &mgctx->tset;
+ struct cgroup_subsys *ss;
+ struct task_struct *task, *tmp_task;
+ struct css_set *cset, *tmp_cset;
+ int ssid, failed_ssid, ret;
+
+ /* check that we can legitimately attach to the cgroup */
+ if (tset->nr_tasks) {
+ do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
+ if (ss->can_attach) {
+ tset->ssid = ssid;
+ ret = ss->can_attach(tset);
+ if (ret) {
+ failed_ssid = ssid;
+ goto out_cancel_attach;
+ }
+ }
+ } while_each_subsys_mask();
+ }
+
+ /*
+ * Now that we're guaranteed success, proceed to move all tasks to
+ * the new cgroup. There are no failure cases after here, so this
+ * is the commit point.
+ */
+ spin_lock_irq(&css_set_lock);
+ list_for_each_entry(cset, &tset->src_csets, mg_node) {
+ list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) {
+ struct css_set *from_cset = task_css_set(task);
+ struct css_set *to_cset = cset->mg_dst_cset;
+
+ get_css_set(to_cset);
+ to_cset->nr_tasks++;
+ css_set_move_task(task, from_cset, to_cset, true);
+ from_cset->nr_tasks--;
+ /*
+ * If the source or destination cgroup is frozen,
+ * the task might require to change its state.
+ */
+ cgroup_freezer_migrate_task(task, from_cset->dfl_cgrp,
+ to_cset->dfl_cgrp);
+ put_css_set_locked(from_cset);
+
+ }
+ }
+ spin_unlock_irq(&css_set_lock);
+
+ /*
+ * Migration is committed, all target tasks are now on dst_csets.
+ * Nothing is sensitive to fork() after this point. Notify
+ * controllers that migration is complete.
+ */
+ tset->csets = &tset->dst_csets;
+
+ if (tset->nr_tasks) {
+ do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
+ if (ss->attach) {
+ tset->ssid = ssid;
+ ss->attach(tset);
+ }
+ } while_each_subsys_mask();
+ }
+
+ ret = 0;
+ goto out_release_tset;
+
+out_cancel_attach:
+ if (tset->nr_tasks) {
+ do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
+ if (ssid == failed_ssid)
+ break;
+ if (ss->cancel_attach) {
+ tset->ssid = ssid;
+ ss->cancel_attach(tset);
+ }
+ } while_each_subsys_mask();
+ }
+out_release_tset:
+ spin_lock_irq(&css_set_lock);
+ list_splice_init(&tset->dst_csets, &tset->src_csets);
+ list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) {
+ list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
+ list_del_init(&cset->mg_node);
+ }
+ spin_unlock_irq(&css_set_lock);
+
+ /*
+ * Re-initialize the cgroup_taskset structure in case it is reused
+ * again in another cgroup_migrate_add_task()/cgroup_migrate_execute()
+ * iteration.
+ */
+ tset->nr_tasks = 0;
+ tset->csets = &tset->src_csets;
+ return ret;
+}
+
+/**
+ * cgroup_migrate_vet_dst - verify whether a cgroup can be migration destination
+ * @dst_cgrp: destination cgroup to test
+ *
+ * On the default hierarchy, except for the mixable, (possible) thread root
+ * and threaded cgroups, subtree_control must be zero for migration
+ * destination cgroups with tasks so that child cgroups don't compete
+ * against tasks.
+ */
+int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp)
+{
+ /* v1 doesn't have any restriction */
+ if (!cgroup_on_dfl(dst_cgrp))
+ return 0;
+
+ /* verify @dst_cgrp can host resources */
+ if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp))
+ return -EOPNOTSUPP;
+
+ /*
+ * If @dst_cgrp is already or can become a thread root or is
+ * threaded, it doesn't matter.
+ */
+ if (cgroup_can_be_thread_root(dst_cgrp) || cgroup_is_threaded(dst_cgrp))
+ return 0;
+
+ /* apply no-internal-process constraint */
+ if (dst_cgrp->subtree_control)
+ return -EBUSY;
+
+ return 0;
+}
+
+/**
+ * cgroup_migrate_finish - cleanup after attach
+ * @mgctx: migration context
+ *
+ * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See
+ * those functions for details.
+ */
+void cgroup_migrate_finish(struct cgroup_mgctx *mgctx)
+{
+ struct css_set *cset, *tmp_cset;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ spin_lock_irq(&css_set_lock);
+
+ list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_src_csets,
+ mg_src_preload_node) {
+ cset->mg_src_cgrp = NULL;
+ cset->mg_dst_cgrp = NULL;
+ cset->mg_dst_cset = NULL;
+ list_del_init(&cset->mg_src_preload_node);
+ put_css_set_locked(cset);
+ }
+
+ list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_dst_csets,
+ mg_dst_preload_node) {
+ cset->mg_src_cgrp = NULL;
+ cset->mg_dst_cgrp = NULL;
+ cset->mg_dst_cset = NULL;
+ list_del_init(&cset->mg_dst_preload_node);
+ put_css_set_locked(cset);
+ }
+
+ spin_unlock_irq(&css_set_lock);
+}
+
+/**
+ * cgroup_migrate_add_src - add a migration source css_set
+ * @src_cset: the source css_set to add
+ * @dst_cgrp: the destination cgroup
+ * @mgctx: migration context
+ *
+ * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp. Pin
+ * @src_cset and add it to @mgctx->src_csets, which should later be cleaned
+ * up by cgroup_migrate_finish().
+ *
+ * This function may be called without holding cgroup_threadgroup_rwsem
+ * even if the target is a process. Threads may be created and destroyed
+ * but as long as cgroup_mutex is not dropped, no new css_set can be put
+ * into play and the preloaded css_sets are guaranteed to cover all
+ * migrations.
+ */
+void cgroup_migrate_add_src(struct css_set *src_cset,
+ struct cgroup *dst_cgrp,
+ struct cgroup_mgctx *mgctx)
+{
+ struct cgroup *src_cgrp;
+
+ lockdep_assert_held(&cgroup_mutex);
+ lockdep_assert_held(&css_set_lock);
+
+ /*
+ * If ->dead, @src_set is associated with one or more dead cgroups
+ * and doesn't contain any migratable tasks. Ignore it early so
+ * that the rest of migration path doesn't get confused by it.
+ */
+ if (src_cset->dead)
+ return;
+
+ if (!list_empty(&src_cset->mg_src_preload_node))
+ return;
+
+ src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
+
+ WARN_ON(src_cset->mg_src_cgrp);
+ WARN_ON(src_cset->mg_dst_cgrp);
+ WARN_ON(!list_empty(&src_cset->mg_tasks));
+ WARN_ON(!list_empty(&src_cset->mg_node));
+
+ src_cset->mg_src_cgrp = src_cgrp;
+ src_cset->mg_dst_cgrp = dst_cgrp;
+ get_css_set(src_cset);
+ list_add_tail(&src_cset->mg_src_preload_node, &mgctx->preloaded_src_csets);
+}
+
+/**
+ * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
+ * @mgctx: migration context
+ *
+ * Tasks are about to be moved and all the source css_sets have been
+ * preloaded to @mgctx->preloaded_src_csets. This function looks up and
+ * pins all destination css_sets, links each to its source, and append them
+ * to @mgctx->preloaded_dst_csets.
+ *
+ * This function must be called after cgroup_migrate_add_src() has been
+ * called on each migration source css_set. After migration is performed
+ * using cgroup_migrate(), cgroup_migrate_finish() must be called on
+ * @mgctx.
+ */
+int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
+{
+ struct css_set *src_cset, *tmp_cset;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ /* look up the dst cset for each src cset and link it to src */
+ list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets,
+ mg_src_preload_node) {
+ struct css_set *dst_cset;
+ struct cgroup_subsys *ss;
+ int ssid;
+
+ dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp);
+ if (!dst_cset)
+ return -ENOMEM;
+
+ WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
+
+ /*
+ * If src cset equals dst, it's noop. Drop the src.
+ * cgroup_migrate() will skip the cset too. Note that we
+ * can't handle src == dst as some nodes are used by both.
+ */
+ if (src_cset == dst_cset) {
+ src_cset->mg_src_cgrp = NULL;
+ src_cset->mg_dst_cgrp = NULL;
+ list_del_init(&src_cset->mg_src_preload_node);
+ put_css_set(src_cset);
+ put_css_set(dst_cset);
+ continue;
+ }
+
+ src_cset->mg_dst_cset = dst_cset;
+
+ if (list_empty(&dst_cset->mg_dst_preload_node))
+ list_add_tail(&dst_cset->mg_dst_preload_node,
+ &mgctx->preloaded_dst_csets);
+ else
+ put_css_set(dst_cset);
+
+ for_each_subsys(ss, ssid)
+ if (src_cset->subsys[ssid] != dst_cset->subsys[ssid])
+ mgctx->ss_mask |= 1 << ssid;
+ }
+
+ return 0;
+}
+
+/**
+ * cgroup_migrate - migrate a process or task to a cgroup
+ * @leader: the leader of the process or the task to migrate
+ * @threadgroup: whether @leader points to the whole process or a single task
+ * @mgctx: migration context
+ *
+ * Migrate a process or task denoted by @leader. If migrating a process,
+ * the caller must be holding cgroup_threadgroup_rwsem. The caller is also
+ * responsible for invoking cgroup_migrate_add_src() and
+ * cgroup_migrate_prepare_dst() on the targets before invoking this
+ * function and following up with cgroup_migrate_finish().
+ *
+ * As long as a controller's ->can_attach() doesn't fail, this function is
+ * guaranteed to succeed. This means that, excluding ->can_attach()
+ * failure, when migrating multiple targets, the success or failure can be
+ * decided for all targets by invoking group_migrate_prepare_dst() before
+ * actually starting migrating.
+ */
+int cgroup_migrate(struct task_struct *leader, bool threadgroup,
+ struct cgroup_mgctx *mgctx)
+{
+ struct task_struct *task;
+
+ /*
+ * The following thread iteration should be inside an RCU critical
+ * section to prevent tasks from being freed while taking the snapshot.
+ * spin_lock_irq() implies RCU critical section here.
+ */
+ spin_lock_irq(&css_set_lock);
+ task = leader;
+ do {
+ cgroup_migrate_add_task(task, mgctx);
+ if (!threadgroup)
+ break;
+ } while_each_thread(leader, task);
+ spin_unlock_irq(&css_set_lock);
+
+ return cgroup_migrate_execute(mgctx);
+}
+
+/**
+ * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
+ * @dst_cgrp: the cgroup to attach to
+ * @leader: the task or the leader of the threadgroup to be attached
+ * @threadgroup: attach the whole threadgroup?
+ *
+ * Call holding cgroup_mutex and cgroup_threadgroup_rwsem.
+ */
+int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
+ bool threadgroup)
+{
+ DEFINE_CGROUP_MGCTX(mgctx);
+ struct task_struct *task;
+ int ret = 0;
+
+ /* look up all src csets */
+ spin_lock_irq(&css_set_lock);
+ task = leader;
+ do {
+ cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx);
+ if (!threadgroup)
+ break;
+ } while_each_thread(leader, task);
+ spin_unlock_irq(&css_set_lock);
+
+ /* prepare dst csets and commit */
+ ret = cgroup_migrate_prepare_dst(&mgctx);
+ if (!ret)
+ ret = cgroup_migrate(leader, threadgroup, &mgctx);
+
+ cgroup_migrate_finish(&mgctx);
+
+ if (!ret)
+ TRACE_CGROUP_PATH(attach_task, dst_cgrp, leader, threadgroup);
+
+ return ret;
+}
+
+struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
+ enum cgroup_attach_lock_mode *lock_mode)
+{
+ struct task_struct *tsk;
+ pid_t pid;
+
+ if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
+ return ERR_PTR(-EINVAL);
+
+retry_find_task:
+ rcu_read_lock();
+ if (pid) {
+ tsk = find_task_by_vpid(pid);
+ if (!tsk) {
+ tsk = ERR_PTR(-ESRCH);
+ goto out_unlock_rcu;
+ }
+ } else {
+ tsk = current;
+ }
+
+ if (threadgroup)
+ tsk = tsk->group_leader;
+
+ /*
+ * kthreads may acquire PF_NO_SETAFFINITY during initialization.
+ * If userland migrates such a kthread to a non-root cgroup, it can
+ * become trapped in a cpuset, or RT kthread may be born in a
+ * cgroup with no rt_runtime allocated. Just say no.
+ */
+ if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
+ tsk = ERR_PTR(-EINVAL);
+ goto out_unlock_rcu;
+ }
+ get_task_struct(tsk);
+ rcu_read_unlock();
+
+ /*
+ * If we migrate a single thread, we don't care about threadgroup
+ * stability. If the thread is `current`, it won't exit(2) under our
+ * hands or change PID through exec(2). We exclude
+ * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write callers
+ * by cgroup_mutex. Therefore, we can skip the global lock.
+ */
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (pid || threadgroup) {
+ if (cgroup_enable_per_threadgroup_rwsem)
+ *lock_mode = CGRP_ATTACH_LOCK_PER_THREADGROUP;
+ else
+ *lock_mode = CGRP_ATTACH_LOCK_GLOBAL;
+ } else {
+ *lock_mode = CGRP_ATTACH_LOCK_NONE;
+ }
+
+ cgroup_attach_lock(*lock_mode, tsk);
+
+ if (threadgroup) {
+ if (!thread_group_leader(tsk)) {
+ /*
+ * A race with de_thread from another thread's exec()
+ * may strip us of our leadership. If this happens,
+ * throw this task away and try again.
+ */
+ cgroup_attach_unlock(*lock_mode, tsk);
+ put_task_struct(tsk);
+ goto retry_find_task;
+ }
+ }
+
+ return tsk;
+
+out_unlock_rcu:
+ rcu_read_unlock();
+ return tsk;
+}
+
+void cgroup_procs_write_finish(struct task_struct *task,
+ enum cgroup_attach_lock_mode lock_mode)
+{
+ cgroup_attach_unlock(lock_mode, task);
+
+ /* release reference from cgroup_procs_write_start() */
+ put_task_struct(task);
+}
+
+static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
+{
+ struct cgroup_subsys *ss;
+ bool printed = false;
+ int ssid;
+
+ do_each_subsys_mask(ss, ssid, ss_mask) {
+ if (printed)
+ seq_putc(seq, ' ');
+ seq_puts(seq, ss->name);
+ printed = true;
+ } while_each_subsys_mask();
+ if (printed)
+ seq_putc(seq, '\n');
+}
+
+/* show controllers which are enabled from the parent */
+static int cgroup_controllers_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+
+ cgroup_print_ss_mask(seq, cgroup_control(cgrp));
+ return 0;
+}
+
+/* show controllers which are enabled for a given cgroup's children */
+static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+
+ cgroup_print_ss_mask(seq, cgrp->subtree_control);
+ return 0;
+}
+
+/**
+ * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy
+ * @cgrp: root of the subtree to update csses for
+ *
+ * @cgrp's control masks have changed and its subtree's css associations
+ * need to be updated accordingly. This function looks up all css_sets
+ * which are attached to the subtree, creates the matching updated css_sets
+ * and migrates the tasks to the new ones.
+ */
+static int cgroup_update_dfl_csses(struct cgroup *cgrp)
+{
+ DEFINE_CGROUP_MGCTX(mgctx);
+ struct cgroup_subsys_state *d_css;
+ struct cgroup *dsct;
+ struct css_set *src_cset;
+ enum cgroup_attach_lock_mode lock_mode;
+ bool has_tasks;
+ int ret;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ /* look up all csses currently attached to @cgrp's subtree */
+ spin_lock_irq(&css_set_lock);
+ cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
+ struct cgrp_cset_link *link;
+
+ /*
+ * As cgroup_update_dfl_csses() is only called by
+ * cgroup_apply_control(). The csses associated with the
+ * given cgrp will not be affected by changes made to
+ * its subtree_control file. We can skip them.
+ */
+ if (dsct == cgrp)
+ continue;
+
+ list_for_each_entry(link, &dsct->cset_links, cset_link)
+ cgroup_migrate_add_src(link->cset, dsct, &mgctx);
+ }
+ spin_unlock_irq(&css_set_lock);
+
+ /*
+ * We need to write-lock threadgroup_rwsem while migrating tasks.
+ * However, if there are no source csets for @cgrp, changing its
+ * controllers isn't gonna produce any task migrations and the
+ * write-locking can be skipped safely.
+ */
+ has_tasks = !list_empty(&mgctx.preloaded_src_csets);
+
+ if (has_tasks)
+ lock_mode = CGRP_ATTACH_LOCK_GLOBAL;
+ else
+ lock_mode = CGRP_ATTACH_LOCK_NONE;
+
+ cgroup_attach_lock(lock_mode, NULL);
+
+ /* NULL dst indicates self on default hierarchy */
+ ret = cgroup_migrate_prepare_dst(&mgctx);
+ if (ret)
+ goto out_finish;
+
+ spin_lock_irq(&css_set_lock);
+ list_for_each_entry(src_cset, &mgctx.preloaded_src_csets,
+ mg_src_preload_node) {
+ struct task_struct *task, *ntask;
+
+ /* all tasks in src_csets need to be migrated */
+ list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
+ cgroup_migrate_add_task(task, &mgctx);
+ }
+ spin_unlock_irq(&css_set_lock);
+
+ ret = cgroup_migrate_execute(&mgctx);
+out_finish:
+ cgroup_migrate_finish(&mgctx);
+ cgroup_attach_unlock(lock_mode, NULL);
+ return ret;
+}
+
+/**
+ * cgroup_lock_and_drain_offline - lock cgroup_mutex and drain offlined csses
+ * @cgrp: root of the target subtree
+ *
+ * Because css offlining is asynchronous, userland may try to re-enable a
+ * controller while the previous css is still around. This function grabs
+ * cgroup_mutex and drains the previous css instances of @cgrp's subtree.
+ */
+void cgroup_lock_and_drain_offline(struct cgroup *cgrp)
+ __acquires(&cgroup_mutex)
+{
+ struct cgroup *dsct;
+ struct cgroup_subsys_state *d_css;
+ struct cgroup_subsys *ss;
+ int ssid;
+
+restart:
+ cgroup_lock();
+
+ cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
+ for_each_subsys(ss, ssid) {
+ struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
+ DEFINE_WAIT(wait);
+
+ if (!css || !percpu_ref_is_dying(&css->refcnt))
+ continue;
+
+ cgroup_get_live(dsct);
+ prepare_to_wait(&dsct->offline_waitq, &wait,
+ TASK_UNINTERRUPTIBLE);
+
+ cgroup_unlock();
+ schedule();
+ finish_wait(&dsct->offline_waitq, &wait);
+
+ cgroup_put(dsct);
+ goto restart;
+ }
+ }
+}
+
+/**
+ * cgroup_save_control - save control masks and dom_cgrp of a subtree
+ * @cgrp: root of the target subtree
+ *
+ * Save ->subtree_control, ->subtree_ss_mask and ->dom_cgrp to the
+ * respective old_ prefixed fields for @cgrp's subtree including @cgrp
+ * itself.
+ */
+static void cgroup_save_control(struct cgroup *cgrp)
+{
+ struct cgroup *dsct;
+ struct cgroup_subsys_state *d_css;
+
+ cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
+ dsct->old_subtree_control = dsct->subtree_control;
+ dsct->old_subtree_ss_mask = dsct->subtree_ss_mask;
+ dsct->old_dom_cgrp = dsct->dom_cgrp;
+ }
+}
+
+/**
+ * cgroup_propagate_control - refresh control masks of a subtree
+ * @cgrp: root of the target subtree
+ *
+ * For @cgrp and its subtree, ensure ->subtree_ss_mask matches
+ * ->subtree_control and propagate controller availability through the
+ * subtree so that descendants don't have unavailable controllers enabled.
+ */
+static void cgroup_propagate_control(struct cgroup *cgrp)
+{
+ struct cgroup *dsct;
+ struct cgroup_subsys_state *d_css;
+
+ cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
+ dsct->subtree_control &= cgroup_control(dsct);
+ dsct->subtree_ss_mask =
+ cgroup_calc_subtree_ss_mask(dsct->subtree_control,
+ cgroup_ss_mask(dsct));
+ }
+}
+
+/**
+ * cgroup_restore_control - restore control masks and dom_cgrp of a subtree
+ * @cgrp: root of the target subtree
+ *
+ * Restore ->subtree_control, ->subtree_ss_mask and ->dom_cgrp from the
+ * respective old_ prefixed fields for @cgrp's subtree including @cgrp
+ * itself.
+ */
+static void cgroup_restore_control(struct cgroup *cgrp)
+{
+ struct cgroup *dsct;
+ struct cgroup_subsys_state *d_css;
+
+ cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
+ dsct->subtree_control = dsct->old_subtree_control;
+ dsct->subtree_ss_mask = dsct->old_subtree_ss_mask;
+ dsct->dom_cgrp = dsct->old_dom_cgrp;
+ }
+}
+
+static bool css_visible(struct cgroup_subsys_state *css)
+{
+ struct cgroup_subsys *ss = css->ss;
+ struct cgroup *cgrp = css->cgroup;
+
+ if (cgroup_control(cgrp) & (1 << ss->id))
+ return true;
+ if (!(cgroup_ss_mask(cgrp) & (1 << ss->id)))
+ return false;
+ return cgroup_on_dfl(cgrp) && ss->implicit_on_dfl;
+}
+
+/**
+ * cgroup_apply_control_enable - enable or show csses according to control
+ * @cgrp: root of the target subtree
+ *
+ * Walk @cgrp's subtree and create new csses or make the existing ones
+ * visible. A css is created invisible if it's being implicitly enabled
+ * through dependency. An invisible css is made visible when the userland
+ * explicitly enables it.
+ *
+ * Returns 0 on success, -errno on failure. On failure, csses which have
+ * been processed already aren't cleaned up. The caller is responsible for
+ * cleaning up with cgroup_apply_control_disable().
+ */
+static int cgroup_apply_control_enable(struct cgroup *cgrp)
+{
+ struct cgroup *dsct;
+ struct cgroup_subsys_state *d_css;
+ struct cgroup_subsys *ss;
+ int ssid, ret;
+
+ cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
+ for_each_subsys(ss, ssid) {
+ struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
+
+ if (!(cgroup_ss_mask(dsct) & (1 << ss->id)))
+ continue;
+
+ if (!css) {
+ css = css_create(dsct, ss);
+ if (IS_ERR(css))
+ return PTR_ERR(css);
+ }
+
+ WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt));
+
+ if (css_visible(css)) {
+ ret = css_populate_dir(css);
+ if (ret)
+ return ret;
+ }
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * cgroup_apply_control_disable - kill or hide csses according to control
+ * @cgrp: root of the target subtree
+ *
+ * Walk @cgrp's subtree and kill and hide csses so that they match
+ * cgroup_ss_mask() and cgroup_visible_mask().
+ *
+ * A css is hidden when the userland requests it to be disabled while other
+ * subsystems are still depending on it. The css must not actively control
+ * resources and be in the vanilla state if it's made visible again later.
+ * Controllers which may be depended upon should provide ->css_reset() for
+ * this purpose.
+ */
+static void cgroup_apply_control_disable(struct cgroup *cgrp)
+{
+ struct cgroup *dsct;
+ struct cgroup_subsys_state *d_css;
+ struct cgroup_subsys *ss;
+ int ssid;
+
+ cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
+ for_each_subsys(ss, ssid) {
+ struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
+
+ if (!css)
+ continue;
+
+ WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt));
+
+ if (css->parent &&
+ !(cgroup_ss_mask(dsct) & (1 << ss->id))) {
+ kill_css(css);
+ } else if (!css_visible(css)) {
+ css_clear_dir(css);
+ if (ss->css_reset)
+ ss->css_reset(css);
+ }
+ }
+ }
+}
+
+/**
+ * cgroup_apply_control - apply control mask updates to the subtree
+ * @cgrp: root of the target subtree
+ *
+ * subsystems can be enabled and disabled in a subtree using the following
+ * steps.
+ *
+ * 1. Call cgroup_save_control() to stash the current state.
+ * 2. Update ->subtree_control masks in the subtree as desired.
+ * 3. Call cgroup_apply_control() to apply the changes.
+ * 4. Optionally perform other related operations.
+ * 5. Call cgroup_finalize_control() to finish up.
+ *
+ * This function implements step 3 and propagates the mask changes
+ * throughout @cgrp's subtree, updates csses accordingly and perform
+ * process migrations.
+ */
+static int cgroup_apply_control(struct cgroup *cgrp)
+{
+ int ret;
+
+ cgroup_propagate_control(cgrp);
+
+ ret = cgroup_apply_control_enable(cgrp);
+ if (ret)
+ return ret;
+
+ /*
+ * At this point, cgroup_e_css_by_mask() results reflect the new csses
+ * making the following cgroup_update_dfl_csses() properly update
+ * css associations of all tasks in the subtree.
+ */
+ return cgroup_update_dfl_csses(cgrp);
+}
+
+/**
+ * cgroup_finalize_control - finalize control mask update
+ * @cgrp: root of the target subtree
+ * @ret: the result of the update
+ *
+ * Finalize control mask update. See cgroup_apply_control() for more info.
+ */
+static void cgroup_finalize_control(struct cgroup *cgrp, int ret)
+{
+ if (ret) {
+ cgroup_restore_control(cgrp);
+ cgroup_propagate_control(cgrp);
+ }
+
+ cgroup_apply_control_disable(cgrp);
+}
+
+static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u16 enable)
+{
+ u16 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask;
+
+ /* if nothing is getting enabled, nothing to worry about */
+ if (!enable)
+ return 0;
+
+ /* can @cgrp host any resources? */
+ if (!cgroup_is_valid_domain(cgrp->dom_cgrp))
+ return -EOPNOTSUPP;
+
+ /* mixables don't care */
+ if (cgroup_is_mixable(cgrp))
+ return 0;
+
+ if (domain_enable) {
+ /* can't enable domain controllers inside a thread subtree */
+ if (cgroup_is_thread_root(cgrp) || cgroup_is_threaded(cgrp))
+ return -EOPNOTSUPP;
+ } else {
+ /*
+ * Threaded controllers can handle internal competitions
+ * and are always allowed inside a (prospective) thread
+ * subtree.
+ */
+ if (cgroup_can_be_thread_root(cgrp) || cgroup_is_threaded(cgrp))
+ return 0;
+ }
+
+ /*
+ * Controllers can't be enabled for a cgroup with tasks to avoid
+ * child cgroups competing against tasks.
+ */
+ if (cgroup_has_tasks(cgrp))
+ return -EBUSY;
+
+ return 0;
+}
+
+/* change the enabled child controllers for a cgroup in the default hierarchy */
+static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ u16 enable = 0, disable = 0;
+ struct cgroup *cgrp, *child;
+ struct cgroup_subsys *ss;
+ char *tok;
+ int ssid, ret;
+
+ /*
+ * Parse input - space separated list of subsystem names prefixed
+ * with either + or -.
+ */
+ buf = strstrip(buf);
+ while ((tok = strsep(&buf, " "))) {
+ if (tok[0] == '\0')
+ continue;
+ do_each_subsys_mask(ss, ssid, ~cgrp_dfl_inhibit_ss_mask) {
+ if (!cgroup_ssid_enabled(ssid) ||
+ strcmp(tok + 1, ss->name))
+ continue;
+
+ if (*tok == '+') {
+ enable |= 1 << ssid;
+ disable &= ~(1 << ssid);
+ } else if (*tok == '-') {
+ disable |= 1 << ssid;
+ enable &= ~(1 << ssid);
+ } else {
+ return -EINVAL;
+ }
+ break;
+ } while_each_subsys_mask();
+ if (ssid == CGROUP_SUBSYS_COUNT)
+ return -EINVAL;
+ }
+
+ cgrp = cgroup_kn_lock_live(of->kn, true);
+ if (!cgrp)
+ return -ENODEV;
+
+ for_each_subsys(ss, ssid) {
+ if (enable & (1 << ssid)) {
+ if (cgrp->subtree_control & (1 << ssid)) {
+ enable &= ~(1 << ssid);
+ continue;
+ }
+
+ if (!(cgroup_control(cgrp) & (1 << ssid))) {
+ ret = -ENOENT;
+ goto out_unlock;
+ }
+ } else if (disable & (1 << ssid)) {
+ if (!(cgrp->subtree_control & (1 << ssid))) {
+ disable &= ~(1 << ssid);
+ continue;
+ }
+
+ /* a child has it enabled? */
+ cgroup_for_each_live_child(child, cgrp) {
+ if (child->subtree_control & (1 << ssid)) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+ }
+ }
+ }
+
+ if (!enable && !disable) {
+ ret = 0;
+ goto out_unlock;
+ }
+
+ ret = cgroup_vet_subtree_control_enable(cgrp, enable);
+ if (ret)
+ goto out_unlock;
+
+ /* save and update control masks and prepare csses */
+ cgroup_save_control(cgrp);
+
+ cgrp->subtree_control |= enable;
+ cgrp->subtree_control &= ~disable;
+
+ ret = cgroup_apply_control(cgrp);
+ cgroup_finalize_control(cgrp, ret);
+ if (ret)
+ goto out_unlock;
+
+ kernfs_activate(cgrp->kn);
+out_unlock:
+ cgroup_kn_unlock(of->kn);
+ return ret ?: nbytes;
+}
+
+/**
+ * cgroup_enable_threaded - make @cgrp threaded
+ * @cgrp: the target cgroup
+ *
+ * Called when "threaded" is written to the cgroup.type interface file and
+ * tries to make @cgrp threaded and join the parent's resource domain.
+ * This function is never called on the root cgroup as cgroup.type doesn't
+ * exist on it.
+ */
+static int cgroup_enable_threaded(struct cgroup *cgrp)
+{
+ struct cgroup *parent = cgroup_parent(cgrp);
+ struct cgroup *dom_cgrp = parent->dom_cgrp;
+ struct cgroup *dsct;
+ struct cgroup_subsys_state *d_css;
+ int ret;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ /* noop if already threaded */
+ if (cgroup_is_threaded(cgrp))
+ return 0;
+
+ /*
+ * If @cgroup is populated or has domain controllers enabled, it
+ * can't be switched. While the below cgroup_can_be_thread_root()
+ * test can catch the same conditions, that's only when @parent is
+ * not mixable, so let's check it explicitly.
+ */
+ if (cgroup_is_populated(cgrp) ||
+ cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask)
+ return -EOPNOTSUPP;
+
+ /* we're joining the parent's domain, ensure its validity */
+ if (!cgroup_is_valid_domain(dom_cgrp) ||
+ !cgroup_can_be_thread_root(dom_cgrp))
+ return -EOPNOTSUPP;
+
+ /*
+ * The following shouldn't cause actual migrations and should
+ * always succeed.
+ */
+ cgroup_save_control(cgrp);
+
+ cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp)
+ if (dsct == cgrp || cgroup_is_threaded(dsct))
+ dsct->dom_cgrp = dom_cgrp;
+
+ ret = cgroup_apply_control(cgrp);
+ if (!ret)
+ parent->nr_threaded_children++;
+
+ cgroup_finalize_control(cgrp, ret);
+ return ret;
+}
+
+static int cgroup_type_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+
+ if (cgroup_is_threaded(cgrp))
+ seq_puts(seq, "threaded\n");
+ else if (!cgroup_is_valid_domain(cgrp))
+ seq_puts(seq, "domain invalid\n");
+ else if (cgroup_is_thread_root(cgrp))
+ seq_puts(seq, "domain threaded\n");
+ else
+ seq_puts(seq, "domain\n");
+
+ return 0;
+}
+
+static ssize_t cgroup_type_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct cgroup *cgrp;
+ int ret;
+
+ /* only switching to threaded mode is supported */
+ if (strcmp(strstrip(buf), "threaded"))
+ return -EINVAL;
+
+ /* drain dying csses before we re-apply (threaded) subtree control */
+ cgrp = cgroup_kn_lock_live(of->kn, true);
+ if (!cgrp)
+ return -ENOENT;
+
+ /* threaded can only be enabled */
+ ret = cgroup_enable_threaded(cgrp);
+
+ cgroup_kn_unlock(of->kn);
+ return ret ?: nbytes;
+}
+
+static int cgroup_max_descendants_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ int descendants = READ_ONCE(cgrp->max_descendants);
+
+ if (descendants == INT_MAX)
+ seq_puts(seq, "max\n");
+ else
+ seq_printf(seq, "%d\n", descendants);
+
+ return 0;
+}
+
+static ssize_t cgroup_max_descendants_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct cgroup *cgrp;
+ int descendants;
+ ssize_t ret;
+
+ buf = strstrip(buf);
+ if (!strcmp(buf, "max")) {
+ descendants = INT_MAX;
+ } else {
+ ret = kstrtoint(buf, 0, &descendants);
+ if (ret)
+ return ret;
+ }
+
+ if (descendants < 0)
+ return -ERANGE;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENOENT;
+
+ cgrp->max_descendants = descendants;
+
+ cgroup_kn_unlock(of->kn);
+
+ return nbytes;
+}
+
+static int cgroup_max_depth_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ int depth = READ_ONCE(cgrp->max_depth);
+
+ if (depth == INT_MAX)
+ seq_puts(seq, "max\n");
+ else
+ seq_printf(seq, "%d\n", depth);
+
+ return 0;
+}
+
+static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct cgroup *cgrp;
+ ssize_t ret;
+ int depth;
+
+ buf = strstrip(buf);
+ if (!strcmp(buf, "max")) {
+ depth = INT_MAX;
+ } else {
+ ret = kstrtoint(buf, 0, &depth);
+ if (ret)
+ return ret;
+ }
+
+ if (depth < 0)
+ return -ERANGE;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENOENT;
+
+ cgrp->max_depth = depth;
+
+ cgroup_kn_unlock(of->kn);
+
+ return nbytes;
+}
+
+static int cgroup_events_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+
+ seq_printf(seq, "populated %d\n", cgroup_is_populated(cgrp));
+ seq_printf(seq, "frozen %d\n", test_bit(CGRP_FROZEN, &cgrp->flags));
+
+ return 0;
+}
+
+static int cgroup_stat_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgroup = seq_css(seq)->cgroup;
+ struct cgroup_subsys_state *css;
+ int dying_cnt[CGROUP_SUBSYS_COUNT];
+ int ssid;
+
+ seq_printf(seq, "nr_descendants %d\n",
+ cgroup->nr_descendants);
+
+ /*
+ * Show the number of live and dying csses associated with each of
+ * non-inhibited cgroup subsystems that is bound to cgroup v2.
+ *
+ * Without proper lock protection, racing is possible. So the
+ * numbers may not be consistent when that happens.
+ */
+ rcu_read_lock();
+ for (ssid = 0; ssid < CGROUP_SUBSYS_COUNT; ssid++) {
+ dying_cnt[ssid] = -1;
+ if ((BIT(ssid) & cgrp_dfl_inhibit_ss_mask) ||
+ (cgroup_subsys[ssid]->root != &cgrp_dfl_root))
+ continue;
+ css = rcu_dereference_raw(cgroup->subsys[ssid]);
+ dying_cnt[ssid] = cgroup->nr_dying_subsys[ssid];
+ seq_printf(seq, "nr_subsys_%s %d\n", cgroup_subsys[ssid]->name,
+ css ? (css->nr_descendants + 1) : 0);
+ }
+
+ seq_printf(seq, "nr_dying_descendants %d\n",
+ cgroup->nr_dying_descendants);
+ for (ssid = 0; ssid < CGROUP_SUBSYS_COUNT; ssid++) {
+ if (dying_cnt[ssid] >= 0)
+ seq_printf(seq, "nr_dying_subsys_%s %d\n",
+ cgroup_subsys[ssid]->name, dying_cnt[ssid]);
+ }
+ rcu_read_unlock();
+ return 0;
+}
+
+static int cgroup_core_local_stat_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ unsigned int sequence;
+ u64 freeze_time;
+
+ do {
+ sequence = read_seqcount_begin(&cgrp->freezer.freeze_seq);
+ freeze_time = cgrp->freezer.frozen_nsec;
+ /* Add in current freezer interval if the cgroup is freezing. */
+ if (test_bit(CGRP_FREEZE, &cgrp->flags))
+ freeze_time += (ktime_get_ns() -
+ cgrp->freezer.freeze_start_nsec);
+ } while (read_seqcount_retry(&cgrp->freezer.freeze_seq, sequence));
+
+ do_div(freeze_time, NSEC_PER_USEC);
+ seq_printf(seq, "frozen_usec %llu\n", freeze_time);
+
+ return 0;
+}
+
+#ifdef CONFIG_CGROUP_SCHED
+/**
+ * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem
+ * @cgrp: the cgroup of interest
+ * @ss: the subsystem of interest
+ *
+ * Find and get @cgrp's css associated with @ss. If the css doesn't exist
+ * or is offline, %NULL is returned.
+ */
+static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
+ struct cgroup_subsys *ss)
+{
+ struct cgroup_subsys_state *css;
+
+ rcu_read_lock();
+ css = cgroup_css(cgrp, ss);
+ if (css && !css_tryget_online(css))
+ css = NULL;
+ rcu_read_unlock();
+
+ return css;
+}
+
+static int cgroup_extra_stat_show(struct seq_file *seq, int ssid)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ struct cgroup_subsys *ss = cgroup_subsys[ssid];
+ struct cgroup_subsys_state *css;
+ int ret;
+
+ if (!ss->css_extra_stat_show)
+ return 0;
+
+ css = cgroup_tryget_css(cgrp, ss);
+ if (!css)
+ return 0;
+
+ ret = ss->css_extra_stat_show(seq, css);
+ css_put(css);
+ return ret;
+}
+
+static int cgroup_local_stat_show(struct seq_file *seq,
+ struct cgroup *cgrp, int ssid)
+{
+ struct cgroup_subsys *ss = cgroup_subsys[ssid];
+ struct cgroup_subsys_state *css;
+ int ret;
+
+ if (!ss->css_local_stat_show)
+ return 0;
+
+ css = cgroup_tryget_css(cgrp, ss);
+ if (!css)
+ return 0;
+
+ ret = ss->css_local_stat_show(seq, css);
+ css_put(css);
+ return ret;
+}
+#endif
+
+static int cpu_stat_show(struct seq_file *seq, void *v)
+{
+ int ret = 0;
+
+ cgroup_base_stat_cputime_show(seq);
+#ifdef CONFIG_CGROUP_SCHED
+ ret = cgroup_extra_stat_show(seq, cpu_cgrp_id);
+#endif
+ return ret;
+}
+
+static int cpu_local_stat_show(struct seq_file *seq, void *v)
+{
+ struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup;
+ int ret = 0;
+
+#ifdef CONFIG_CGROUP_SCHED
+ ret = cgroup_local_stat_show(seq, cgrp, cpu_cgrp_id);
+#endif
+ return ret;
+}
+
+#ifdef CONFIG_PSI
+static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ struct psi_group *psi = cgroup_psi(cgrp);
+
+ return psi_show(seq, psi, PSI_IO);
+}
+static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ struct psi_group *psi = cgroup_psi(cgrp);
+
+ return psi_show(seq, psi, PSI_MEM);
+}
+static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ struct psi_group *psi = cgroup_psi(cgrp);
+
+ return psi_show(seq, psi, PSI_CPU);
+}
+
+static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, enum psi_res res)
+{
+ struct cgroup_file_ctx *ctx = of->priv;
+ struct psi_trigger *new;
+ struct cgroup *cgrp;
+ struct psi_group *psi;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENODEV;
+
+ cgroup_get(cgrp);
+ cgroup_kn_unlock(of->kn);
+
+ /* Allow only one trigger per file descriptor */
+ if (ctx->psi.trigger) {
+ cgroup_put(cgrp);
+ return -EBUSY;
+ }
+
+ psi = cgroup_psi(cgrp);
+ new = psi_trigger_create(psi, buf, res, of->file, of);
+ if (IS_ERR(new)) {
+ cgroup_put(cgrp);
+ return PTR_ERR(new);
+ }
+
+ smp_store_release(&ctx->psi.trigger, new);
+ cgroup_put(cgrp);
+
+ return nbytes;
+}
+
+static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ return pressure_write(of, buf, nbytes, PSI_IO);
+}
+
+static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ return pressure_write(of, buf, nbytes, PSI_MEM);
+}
+
+static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ return pressure_write(of, buf, nbytes, PSI_CPU);
+}
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+static int cgroup_irq_pressure_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ struct psi_group *psi = cgroup_psi(cgrp);
+
+ return psi_show(seq, psi, PSI_IRQ);
+}
+
+static ssize_t cgroup_irq_pressure_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ return pressure_write(of, buf, nbytes, PSI_IRQ);
+}
+#endif
+
+static int cgroup_pressure_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ struct psi_group *psi = cgroup_psi(cgrp);
+
+ seq_printf(seq, "%d\n", psi->enabled);
+
+ return 0;
+}
+
+static ssize_t cgroup_pressure_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ ssize_t ret;
+ int enable;
+ struct cgroup *cgrp;
+ struct psi_group *psi;
+
+ ret = kstrtoint(strstrip(buf), 0, &enable);
+ if (ret)
+ return ret;
+
+ if (enable < 0 || enable > 1)
+ return -ERANGE;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENOENT;
+
+ psi = cgroup_psi(cgrp);
+ if (psi->enabled != enable) {
+ int i;
+
+ /* show or hide {cpu,memory,io,irq}.pressure files */
+ for (i = 0; i < NR_PSI_RESOURCES; i++)
+ cgroup_file_show(&cgrp->psi_files[i], enable);
+
+ psi->enabled = enable;
+ if (enable)
+ psi_cgroup_restart(psi);
+ }
+
+ cgroup_kn_unlock(of->kn);
+
+ return nbytes;
+}
+
+static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
+ poll_table *pt)
+{
+ struct cgroup_file_ctx *ctx = of->priv;
+
+ return psi_trigger_poll(&ctx->psi.trigger, of->file, pt);
+}
+
+static void cgroup_pressure_release(struct kernfs_open_file *of)
+{
+ struct cgroup_file_ctx *ctx = of->priv;
+
+ psi_trigger_destroy(ctx->psi.trigger);
+}
+
+bool cgroup_psi_enabled(void)
+{
+ if (static_branch_likely(&psi_disabled))
+ return false;
+
+ return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0;
+}
+
+#else /* CONFIG_PSI */
+bool cgroup_psi_enabled(void)
+{
+ return false;
+}
+
+#endif /* CONFIG_PSI */
+
+static int cgroup_freeze_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+
+ seq_printf(seq, "%d\n", cgrp->freezer.freeze);
+
+ return 0;
+}
+
+static ssize_t cgroup_freeze_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct cgroup *cgrp;
+ ssize_t ret;
+ int freeze;
+
+ ret = kstrtoint(strstrip(buf), 0, &freeze);
+ if (ret)
+ return ret;
+
+ if (freeze < 0 || freeze > 1)
+ return -ERANGE;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENOENT;
+
+ cgroup_freeze(cgrp, freeze);
+
+ cgroup_kn_unlock(of->kn);
+
+ return nbytes;
+}
+
+static void __cgroup_kill(struct cgroup *cgrp)
+{
+ struct css_task_iter it;
+ struct task_struct *task;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ spin_lock_irq(&css_set_lock);
+ cgrp->kill_seq++;
+ spin_unlock_irq(&css_set_lock);
+
+ css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it);
+ while ((task = css_task_iter_next(&it))) {
+ /* Ignore kernel threads here. */
+ if (task->flags & PF_KTHREAD)
+ continue;
+
+ /* Skip tasks that are already dying. */
+ if (__fatal_signal_pending(task))
+ continue;
+
+ send_sig(SIGKILL, task, 0);
+ }
+ css_task_iter_end(&it);
+}
+
+static void cgroup_kill(struct cgroup *cgrp)
+{
+ struct cgroup_subsys_state *css;
+ struct cgroup *dsct;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ cgroup_for_each_live_descendant_pre(dsct, css, cgrp)
+ __cgroup_kill(dsct);
+}
+
+static ssize_t cgroup_kill_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ ssize_t ret = 0;
+ int kill;
+ struct cgroup *cgrp;
+
+ ret = kstrtoint(strstrip(buf), 0, &kill);
+ if (ret)
+ return ret;
+
+ if (kill != 1)
+ return -ERANGE;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENOENT;
+
+ /*
+ * Killing is a process directed operation, i.e. the whole thread-group
+ * is taken down so act like we do for cgroup.procs and only make this
+ * writable in non-threaded cgroups.
+ */
+ if (cgroup_is_threaded(cgrp))
+ ret = -EOPNOTSUPP;
+ else
+ cgroup_kill(cgrp);
+
+ cgroup_kn_unlock(of->kn);
+
+ return ret ?: nbytes;
+}
+
+static int cgroup_file_open(struct kernfs_open_file *of)
+{
+ struct cftype *cft = of_cft(of);
+ struct cgroup_file_ctx *ctx;
+ int ret;
+
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->ns = current->nsproxy->cgroup_ns;
+ get_cgroup_ns(ctx->ns);
+ of->priv = ctx;
+
+ if (!cft->open)
+ return 0;
+
+ ret = cft->open(of);
+ if (ret) {
+ put_cgroup_ns(ctx->ns);
+ kfree(ctx);
+ }
+ return ret;
+}
+
+static void cgroup_file_release(struct kernfs_open_file *of)
+{
+ struct cftype *cft = of_cft(of);
+ struct cgroup_file_ctx *ctx = of->priv;
+
+ if (cft->release)
+ cft->release(of);
+ put_cgroup_ns(ctx->ns);
+ kfree(ctx);
+ of->priv = NULL;
+}
+
+static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct cgroup_file_ctx *ctx = of->priv;
+ struct cgroup *cgrp = kn_priv(of->kn);
+ struct cftype *cft = of_cft(of);
+ struct cgroup_subsys_state *css;
+ int ret;
+
+ if (!nbytes)
+ return 0;
+
+ /*
+ * If namespaces are delegation boundaries, disallow writes to
+ * files in an non-init namespace root from inside the namespace
+ * except for the files explicitly marked delegatable -
+ * eg. cgroup.procs, cgroup.threads and cgroup.subtree_control.
+ */
+ if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
+ !(cft->flags & CFTYPE_NS_DELEGATABLE) &&
+ ctx->ns != &init_cgroup_ns && ctx->ns->root_cset->dfl_cgrp == cgrp)
+ return -EPERM;
+
+ if (cft->write)
+ return cft->write(of, buf, nbytes, off);
+
+ /*
+ * kernfs guarantees that a file isn't deleted with operations in
+ * flight, which means that the matching css is and stays alive and
+ * doesn't need to be pinned. The RCU locking is not necessary
+ * either. It's just for the convenience of using cgroup_css().
+ */
+ rcu_read_lock();
+ css = cgroup_css(cgrp, cft->ss);
+ rcu_read_unlock();
+
+ if (cft->write_u64) {
+ unsigned long long v;
+ ret = kstrtoull(buf, 0, &v);
+ if (!ret)
+ ret = cft->write_u64(css, cft, v);
+ } else if (cft->write_s64) {
+ long long v;
+ ret = kstrtoll(buf, 0, &v);
+ if (!ret)
+ ret = cft->write_s64(css, cft, v);
+ } else {
+ ret = -EINVAL;
+ }
+
+ return ret ?: nbytes;
+}
+
+static __poll_t cgroup_file_poll(struct kernfs_open_file *of, poll_table *pt)
+{
+ struct cftype *cft = of_cft(of);
+
+ if (cft->poll)
+ return cft->poll(of, pt);
+
+ return kernfs_generic_poll(of, pt);
+}
+
+static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
+{
+ return seq_cft(seq)->seq_start(seq, ppos);
+}
+
+static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
+{
+ return seq_cft(seq)->seq_next(seq, v, ppos);
+}
+
+static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
+{
+ if (seq_cft(seq)->seq_stop)
+ seq_cft(seq)->seq_stop(seq, v);
+}
+
+static int cgroup_seqfile_show(struct seq_file *m, void *arg)
+{
+ struct cftype *cft = seq_cft(m);
+ struct cgroup_subsys_state *css = seq_css(m);
+
+ if (cft->seq_show)
+ return cft->seq_show(m, arg);
+
+ if (cft->read_u64)
+ seq_printf(m, "%llu\n", cft->read_u64(css, cft));
+ else if (cft->read_s64)
+ seq_printf(m, "%lld\n", cft->read_s64(css, cft));
+ else
+ return -EINVAL;
+ return 0;
+}
+
+static struct kernfs_ops cgroup_kf_single_ops = {
+ .atomic_write_len = PAGE_SIZE,
+ .open = cgroup_file_open,
+ .release = cgroup_file_release,
+ .write = cgroup_file_write,
+ .poll = cgroup_file_poll,
+ .seq_show = cgroup_seqfile_show,
+};
+
+static struct kernfs_ops cgroup_kf_ops = {
+ .atomic_write_len = PAGE_SIZE,
+ .open = cgroup_file_open,
+ .release = cgroup_file_release,
+ .write = cgroup_file_write,
+ .poll = cgroup_file_poll,
+ .seq_start = cgroup_seqfile_start,
+ .seq_next = cgroup_seqfile_next,
+ .seq_stop = cgroup_seqfile_stop,
+ .seq_show = cgroup_seqfile_show,
+};
+
+static void cgroup_file_notify_timer(struct timer_list *timer)
+{
+ cgroup_file_notify(container_of(timer, struct cgroup_file,
+ notify_timer));
+}
+
+static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
+ struct cftype *cft)
+{
+ char name[CGROUP_FILE_NAME_MAX];
+ struct kernfs_node *kn;
+ struct lock_class_key *key = NULL;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ key = &cft->lockdep_key;
+#endif
+ kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
+ cgroup_file_mode(cft),
+ current_fsuid(), current_fsgid(),
+ 0, cft->kf_ops, cft,
+ NULL, key);
+ if (IS_ERR(kn))
+ return PTR_ERR(kn);
+
+ if (cft->file_offset) {
+ struct cgroup_file *cfile = (void *)css + cft->file_offset;
+
+ timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0);
+
+ spin_lock_irq(&cgroup_file_kn_lock);
+ cfile->kn = kn;
+ spin_unlock_irq(&cgroup_file_kn_lock);
+ }
+
+ return 0;
+}
+
+/**
+ * cgroup_addrm_files - add or remove files to a cgroup directory
+ * @css: the target css
+ * @cgrp: the target cgroup (usually css->cgroup)
+ * @cfts: array of cftypes to be added
+ * @is_add: whether to add or remove
+ *
+ * Depending on @is_add, add or remove files defined by @cfts on @cgrp.
+ * For removals, this function never fails.
+ */
+static int cgroup_addrm_files(struct cgroup_subsys_state *css,
+ struct cgroup *cgrp, struct cftype cfts[],
+ bool is_add)
+{
+ struct cftype *cft, *cft_end = NULL;
+ int ret = 0;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+restart:
+ for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) {
+ /* does cft->flags tell us to skip this file on @cgrp? */
+ if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
+ continue;
+ if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
+ continue;
+ if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp))
+ continue;
+ if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp))
+ continue;
+ if ((cft->flags & CFTYPE_DEBUG) && !cgroup_debug)
+ continue;
+ if (is_add) {
+ ret = cgroup_add_file(css, cgrp, cft);
+ if (ret) {
+ pr_warn("%s: failed to add %s, err=%d\n",
+ __func__, cft->name, ret);
+ cft_end = cft;
+ is_add = false;
+ goto restart;
+ }
+ } else {
+ cgroup_rm_file(cgrp, cft);
+ }
+ }
+ return ret;
+}
+
+static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
+{
+ struct cgroup_subsys *ss = cfts[0].ss;
+ struct cgroup *root = &ss->root->cgrp;
+ struct cgroup_subsys_state *css;
+ int ret = 0;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ /* add/rm files for all cgroups created before */
+ css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
+ struct cgroup *cgrp = css->cgroup;
+
+ if (!(css->flags & CSS_VISIBLE))
+ continue;
+
+ ret = cgroup_addrm_files(css, cgrp, cfts, is_add);
+ if (ret)
+ break;
+ }
+
+ if (is_add && !ret)
+ kernfs_activate(root->kn);
+ return ret;
+}
+
+static void cgroup_exit_cftypes(struct cftype *cfts)
+{
+ struct cftype *cft;
+
+ for (cft = cfts; cft->name[0] != '\0'; cft++) {
+ /* free copy for custom atomic_write_len, see init_cftypes() */
+ if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)
+ kfree(cft->kf_ops);
+ cft->kf_ops = NULL;
+ cft->ss = NULL;
+
+ /* revert flags set by cgroup core while adding @cfts */
+ cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL |
+ __CFTYPE_ADDED);
+ }
+}
+
+static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
+{
+ struct cftype *cft;
+ int ret = 0;
+
+ for (cft = cfts; cft->name[0] != '\0'; cft++) {
+ struct kernfs_ops *kf_ops;
+
+ WARN_ON(cft->ss || cft->kf_ops);
+
+ if (cft->flags & __CFTYPE_ADDED) {
+ ret = -EBUSY;
+ break;
+ }
+
+ if (cft->seq_start)
+ kf_ops = &cgroup_kf_ops;
+ else
+ kf_ops = &cgroup_kf_single_ops;
+
+ /*
+ * Ugh... if @cft wants a custom max_write_len, we need to
+ * make a copy of kf_ops to set its atomic_write_len.
+ */
+ if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
+ kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
+ if (!kf_ops) {
+ ret = -ENOMEM;
+ break;
+ }
+ kf_ops->atomic_write_len = cft->max_write_len;
+ }
+
+ cft->kf_ops = kf_ops;
+ cft->ss = ss;
+ cft->flags |= __CFTYPE_ADDED;
+ }
+
+ if (ret)
+ cgroup_exit_cftypes(cfts);
+ return ret;
+}
+
+static void cgroup_rm_cftypes_locked(struct cftype *cfts)
+{
+ lockdep_assert_held(&cgroup_mutex);
+
+ list_del(&cfts->node);
+ cgroup_apply_cftypes(cfts, false);
+ cgroup_exit_cftypes(cfts);
+}
+
+/**
+ * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
+ * @cfts: zero-length name terminated array of cftypes
+ *
+ * Unregister @cfts. Files described by @cfts are removed from all
+ * existing cgroups and all future cgroups won't have them either. This
+ * function can be called anytime whether @cfts' subsys is attached or not.
+ *
+ * Returns 0 on successful unregistration, -ENOENT if @cfts is not
+ * registered.
+ */
+int cgroup_rm_cftypes(struct cftype *cfts)
+{
+ if (!cfts || cfts[0].name[0] == '\0')
+ return 0;
+
+ if (!(cfts[0].flags & __CFTYPE_ADDED))
+ return -ENOENT;
+
+ cgroup_lock();
+ cgroup_rm_cftypes_locked(cfts);
+ cgroup_unlock();
+ return 0;
+}
+
+/**
+ * cgroup_add_cftypes - add an array of cftypes to a subsystem
+ * @ss: target cgroup subsystem
+ * @cfts: zero-length name terminated array of cftypes
+ *
+ * Register @cfts to @ss. Files described by @cfts are created for all
+ * existing cgroups to which @ss is attached and all future cgroups will
+ * have them too. This function can be called anytime whether @ss is
+ * attached or not.
+ *
+ * Returns 0 on successful registration, -errno on failure. Note that this
+ * function currently returns 0 as long as @cfts registration is successful
+ * even if some file creation attempts on existing cgroups fail.
+ */
+int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
+{
+ int ret;
+
+ if (!cgroup_ssid_enabled(ss->id))
+ return 0;
+
+ if (!cfts || cfts[0].name[0] == '\0')
+ return 0;
+
+ ret = cgroup_init_cftypes(ss, cfts);
+ if (ret)
+ return ret;
+
+ cgroup_lock();
+
+ list_add_tail(&cfts->node, &ss->cfts);
+ ret = cgroup_apply_cftypes(cfts, true);
+ if (ret)
+ cgroup_rm_cftypes_locked(cfts);
+
+ cgroup_unlock();
+ return ret;
+}
+
+/**
+ * cgroup_add_dfl_cftypes - add an array of cftypes for default hierarchy
+ * @ss: target cgroup subsystem
+ * @cfts: zero-length name terminated array of cftypes
+ *
+ * Similar to cgroup_add_cftypes() but the added files are only used for
+ * the default hierarchy.
+ */
+int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
+{
+ struct cftype *cft;
+
+ for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
+ cft->flags |= __CFTYPE_ONLY_ON_DFL;
+ return cgroup_add_cftypes(ss, cfts);
+}
+
+/**
+ * cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies
+ * @ss: target cgroup subsystem
+ * @cfts: zero-length name terminated array of cftypes
+ *
+ * Similar to cgroup_add_cftypes() but the added files are only used for
+ * the legacy hierarchies.
+ */
+int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
+{
+ struct cftype *cft;
+
+ for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
+ cft->flags |= __CFTYPE_NOT_ON_DFL;
+ return cgroup_add_cftypes(ss, cfts);
+}
+
+/**
+ * cgroup_file_notify - generate a file modified event for a cgroup_file
+ * @cfile: target cgroup_file
+ *
+ * @cfile must have been obtained by setting cftype->file_offset.
+ */
+void cgroup_file_notify(struct cgroup_file *cfile)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&cgroup_file_kn_lock, flags);
+ if (cfile->kn) {
+ unsigned long last = cfile->notified_at;
+ unsigned long next = last + CGROUP_FILE_NOTIFY_MIN_INTV;
+
+ if (time_in_range(jiffies, last, next)) {
+ timer_reduce(&cfile->notify_timer, next);
+ } else {
+ kernfs_notify(cfile->kn);
+ cfile->notified_at = jiffies;
+ }
+ }
+ spin_unlock_irqrestore(&cgroup_file_kn_lock, flags);
+}
+EXPORT_SYMBOL_GPL(cgroup_file_notify);
+
+/**
+ * cgroup_file_show - show or hide a hidden cgroup file
+ * @cfile: target cgroup_file obtained by setting cftype->file_offset
+ * @show: whether to show or hide
+ */
+void cgroup_file_show(struct cgroup_file *cfile, bool show)
+{
+ struct kernfs_node *kn;
+
+ spin_lock_irq(&cgroup_file_kn_lock);
+ kn = cfile->kn;
+ kernfs_get(kn);
+ spin_unlock_irq(&cgroup_file_kn_lock);
+
+ if (kn)
+ kernfs_show(kn, show);
+
+ kernfs_put(kn);
+}
+
+/**
+ * css_next_child - find the next child of a given css
+ * @pos: the current position (%NULL to initiate traversal)
+ * @parent: css whose children to walk
+ *
+ * This function returns the next child of @parent and should be called
+ * under either cgroup_mutex or RCU read lock. The only requirement is
+ * that @parent and @pos are accessible. The next sibling is guaranteed to
+ * be returned regardless of their states.
+ *
+ * If a subsystem synchronizes ->css_online() and the start of iteration, a
+ * css which finished ->css_online() is guaranteed to be visible in the
+ * future iterations and will stay visible until the last reference is put.
+ * A css which hasn't finished ->css_online() or already finished
+ * ->css_offline() may show up during traversal. It's each subsystem's
+ * responsibility to synchronize against on/offlining.
+ */
+struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
+ struct cgroup_subsys_state *parent)
+{
+ struct cgroup_subsys_state *next;
+
+ cgroup_assert_mutex_or_rcu_locked();
+
+ /*
+ * @pos could already have been unlinked from the sibling list.
+ * Once a cgroup is removed, its ->sibling.next is no longer
+ * updated when its next sibling changes. CSS_RELEASED is set when
+ * @pos is taken off list, at which time its next pointer is valid,
+ * and, as releases are serialized, the one pointed to by the next
+ * pointer is guaranteed to not have started release yet. This
+ * implies that if we observe !CSS_RELEASED on @pos in this RCU
+ * critical section, the one pointed to by its next pointer is
+ * guaranteed to not have finished its RCU grace period even if we
+ * have dropped rcu_read_lock() in-between iterations.
+ *
+ * If @pos has CSS_RELEASED set, its next pointer can't be
+ * dereferenced; however, as each css is given a monotonically
+ * increasing unique serial number and always appended to the
+ * sibling list, the next one can be found by walking the parent's
+ * children until the first css with higher serial number than
+ * @pos's. While this path can be slower, it happens iff iteration
+ * races against release and the race window is very small.
+ */
+ if (!pos) {
+ next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling);
+ } else if (likely(!(pos->flags & CSS_RELEASED))) {
+ next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);
+ } else {
+ list_for_each_entry_rcu(next, &parent->children, sibling,
+ lockdep_is_held(&cgroup_mutex))
+ if (next->serial_nr > pos->serial_nr)
+ break;
+ }
+
+ /*
+ * @next, if not pointing to the head, can be dereferenced and is
+ * the next sibling.
+ */
+ if (&next->sibling != &parent->children)
+ return next;
+ return NULL;
+}
+
+/**
+ * css_next_descendant_pre - find the next descendant for pre-order walk
+ * @pos: the current position (%NULL to initiate traversal)
+ * @root: css whose descendants to walk
+ *
+ * To be used by css_for_each_descendant_pre(). Find the next descendant
+ * to visit for pre-order traversal of @root's descendants. @root is
+ * included in the iteration and the first node to be visited.
+ *
+ * While this function requires cgroup_mutex or RCU read locking, it
+ * doesn't require the whole traversal to be contained in a single critical
+ * section. Additionally, it isn't necessary to hold onto a reference to @pos.
+ * This function will return the correct next descendant as long as both @pos
+ * and @root are accessible and @pos is a descendant of @root.
+ *
+ * If a subsystem synchronizes ->css_online() and the start of iteration, a
+ * css which finished ->css_online() is guaranteed to be visible in the
+ * future iterations and will stay visible until the last reference is put.
+ * A css which hasn't finished ->css_online() or already finished
+ * ->css_offline() may show up during traversal. It's each subsystem's
+ * responsibility to synchronize against on/offlining.
+ */
+struct cgroup_subsys_state *
+css_next_descendant_pre(struct cgroup_subsys_state *pos,
+ struct cgroup_subsys_state *root)
+{
+ struct cgroup_subsys_state *next;
+
+ cgroup_assert_mutex_or_rcu_locked();
+
+ /* if first iteration, visit @root */
+ if (!pos)
+ return root;
+
+ /* visit the first child if exists */
+ next = css_next_child(NULL, pos);
+ if (next)
+ return next;
+
+ /* no child, visit my or the closest ancestor's next sibling */
+ while (pos != root) {
+ next = css_next_child(pos, pos->parent);
+ if (next)
+ return next;
+ pos = pos->parent;
+ }
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(css_next_descendant_pre);
+
+/**
+ * css_rightmost_descendant - return the rightmost descendant of a css
+ * @pos: css of interest
+ *
+ * Return the rightmost descendant of @pos. If there's no descendant, @pos
+ * is returned. This can be used during pre-order traversal to skip
+ * subtree of @pos.
+ *
+ * While this function requires cgroup_mutex or RCU read locking, it
+ * doesn't require the whole traversal to be contained in a single critical
+ * section. Additionally, it isn't necessary to hold onto a reference to @pos.
+ * This function will return the correct rightmost descendant as long as @pos
+ * is accessible.
+ */
+struct cgroup_subsys_state *
+css_rightmost_descendant(struct cgroup_subsys_state *pos)
+{
+ struct cgroup_subsys_state *last, *tmp;
+
+ cgroup_assert_mutex_or_rcu_locked();
+
+ do {
+ last = pos;
+ /* ->prev isn't RCU safe, walk ->next till the end */
+ pos = NULL;
+ css_for_each_child(tmp, last)
+ pos = tmp;
+ } while (pos);
+
+ return last;
+}
+
+static struct cgroup_subsys_state *
+css_leftmost_descendant(struct cgroup_subsys_state *pos)
+{
+ struct cgroup_subsys_state *last;
+
+ do {
+ last = pos;
+ pos = css_next_child(NULL, pos);
+ } while (pos);
+
+ return last;
+}
+
+/**
+ * css_next_descendant_post - find the next descendant for post-order walk
+ * @pos: the current position (%NULL to initiate traversal)
+ * @root: css whose descendants to walk
+ *
+ * To be used by css_for_each_descendant_post(). Find the next descendant
+ * to visit for post-order traversal of @root's descendants. @root is
+ * included in the iteration and the last node to be visited.
+ *
+ * While this function requires cgroup_mutex or RCU read locking, it
+ * doesn't require the whole traversal to be contained in a single critical
+ * section. Additionally, it isn't necessary to hold onto a reference to @pos.
+ * This function will return the correct next descendant as long as both @pos
+ * and @cgroup are accessible and @pos is a descendant of @cgroup.
+ *
+ * If a subsystem synchronizes ->css_online() and the start of iteration, a
+ * css which finished ->css_online() is guaranteed to be visible in the
+ * future iterations and will stay visible until the last reference is put.
+ * A css which hasn't finished ->css_online() or already finished
+ * ->css_offline() may show up during traversal. It's each subsystem's
+ * responsibility to synchronize against on/offlining.
+ */
+struct cgroup_subsys_state *
+css_next_descendant_post(struct cgroup_subsys_state *pos,
+ struct cgroup_subsys_state *root)
+{
+ struct cgroup_subsys_state *next;
+
+ cgroup_assert_mutex_or_rcu_locked();
+
+ /* if first iteration, visit leftmost descendant which may be @root */
+ if (!pos)
+ return css_leftmost_descendant(root);
+
+ /* if we visited @root, we're done */
+ if (pos == root)
+ return NULL;
+
+ /* if there's an unvisited sibling, visit its leftmost descendant */
+ next = css_next_child(pos, pos->parent);
+ if (next)
+ return css_leftmost_descendant(next);
+
+ /* no sibling left, visit parent */
+ return pos->parent;
+}
+
+/**
+ * css_has_online_children - does a css have online children
+ * @css: the target css
+ *
+ * Returns %true if @css has any online children; otherwise, %false. This
+ * function can be called from any context but the caller is responsible
+ * for synchronizing against on/offlining as necessary.
+ */
+bool css_has_online_children(struct cgroup_subsys_state *css)
+{
+ struct cgroup_subsys_state *child;
+ bool ret = false;
+
+ rcu_read_lock();
+ css_for_each_child(child, css) {
+ if (child->flags & CSS_ONLINE) {
+ ret = true;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ return ret;
+}
+
+static struct css_set *css_task_iter_next_css_set(struct css_task_iter *it)
+{
+ struct list_head *l;
+ struct cgrp_cset_link *link;
+ struct css_set *cset;
+
+ lockdep_assert_held(&css_set_lock);
+
+ /* find the next threaded cset */
+ if (it->tcset_pos) {
+ l = it->tcset_pos->next;
+
+ if (l != it->tcset_head) {
+ it->tcset_pos = l;
+ return container_of(l, struct css_set,
+ threaded_csets_node);
+ }
+
+ it->tcset_pos = NULL;
+ }
+
+ /* find the next cset */
+ l = it->cset_pos;
+ l = l->next;
+ if (l == it->cset_head) {
+ it->cset_pos = NULL;
+ return NULL;
+ }
+
+ if (it->ss) {
+ cset = container_of(l, struct css_set, e_cset_node[it->ss->id]);
+ } else {
+ link = list_entry(l, struct cgrp_cset_link, cset_link);
+ cset = link->cset;
+ }
+
+ it->cset_pos = l;
+
+ /* initialize threaded css_set walking */
+ if (it->flags & CSS_TASK_ITER_THREADED) {
+ if (it->cur_dcset)
+ put_css_set_locked(it->cur_dcset);
+ it->cur_dcset = cset;
+ get_css_set(cset);
+
+ it->tcset_head = &cset->threaded_csets;
+ it->tcset_pos = &cset->threaded_csets;
+ }
+
+ return cset;
+}
+
+/**
+ * css_task_iter_advance_css_set - advance a task iterator to the next css_set
+ * @it: the iterator to advance
+ *
+ * Advance @it to the next css_set to walk.
+ */
+static void css_task_iter_advance_css_set(struct css_task_iter *it)
+{
+ struct css_set *cset;
+
+ lockdep_assert_held(&css_set_lock);
+
+ /* Advance to the next non-empty css_set and find first non-empty tasks list*/
+ while ((cset = css_task_iter_next_css_set(it))) {
+ if (!list_empty(&cset->tasks)) {
+ it->cur_tasks_head = &cset->tasks;
+ break;
+ } else if (!list_empty(&cset->mg_tasks)) {
+ it->cur_tasks_head = &cset->mg_tasks;
+ break;
+ } else if (!list_empty(&cset->dying_tasks)) {
+ it->cur_tasks_head = &cset->dying_tasks;
+ break;
+ }
+ }
+ if (!cset) {
+ it->task_pos = NULL;
+ return;
+ }
+ it->task_pos = it->cur_tasks_head->next;
+
+ /*
+ * We don't keep css_sets locked across iteration steps and thus
+ * need to take steps to ensure that iteration can be resumed after
+ * the lock is re-acquired. Iteration is performed at two levels -
+ * css_sets and tasks in them.
+ *
+ * Once created, a css_set never leaves its cgroup lists, so a
+ * pinned css_set is guaranteed to stay put and we can resume
+ * iteration afterwards.
+ *
+ * Tasks may leave @cset across iteration steps. This is resolved
+ * by registering each iterator with the css_set currently being
+ * walked and making css_set_move_task() advance iterators whose
+ * next task is leaving.
+ */
+ if (it->cur_cset) {
+ list_del(&it->iters_node);
+ put_css_set_locked(it->cur_cset);
+ }
+ get_css_set(cset);
+ it->cur_cset = cset;
+ list_add(&it->iters_node, &cset->task_iters);
+}
+
+static void css_task_iter_skip(struct css_task_iter *it,
+ struct task_struct *task)
+{
+ lockdep_assert_held(&css_set_lock);
+
+ if (it->task_pos == &task->cg_list) {
+ it->task_pos = it->task_pos->next;
+ it->flags |= CSS_TASK_ITER_SKIPPED;
+ }
+}
+
+static void css_task_iter_advance(struct css_task_iter *it)
+{
+ struct task_struct *task;
+
+ lockdep_assert_held(&css_set_lock);
+repeat:
+ if (it->task_pos) {
+ /*
+ * Advance iterator to find next entry. We go through cset
+ * tasks, mg_tasks and dying_tasks, when consumed we move onto
+ * the next cset.
+ */
+ if (it->flags & CSS_TASK_ITER_SKIPPED)
+ it->flags &= ~CSS_TASK_ITER_SKIPPED;
+ else
+ it->task_pos = it->task_pos->next;
+
+ if (it->task_pos == &it->cur_cset->tasks) {
+ it->cur_tasks_head = &it->cur_cset->mg_tasks;
+ it->task_pos = it->cur_tasks_head->next;
+ }
+ if (it->task_pos == &it->cur_cset->mg_tasks) {
+ it->cur_tasks_head = &it->cur_cset->dying_tasks;
+ it->task_pos = it->cur_tasks_head->next;
+ }
+ if (it->task_pos == &it->cur_cset->dying_tasks)
+ css_task_iter_advance_css_set(it);
+ } else {
+ /* called from start, proceed to the first cset */
+ css_task_iter_advance_css_set(it);
+ }
+
+ if (!it->task_pos)
+ return;
+
+ task = list_entry(it->task_pos, struct task_struct, cg_list);
+
+ if (it->flags & CSS_TASK_ITER_PROCS) {
+ /* if PROCS, skip over tasks which aren't group leaders */
+ if (!thread_group_leader(task))
+ goto repeat;
+
+ /* and dying leaders w/o live member threads */
+ if (it->cur_tasks_head == &it->cur_cset->dying_tasks &&
+ !atomic_read(&task->signal->live))
+ goto repeat;
+ } else {
+ /* skip all dying ones */
+ if (it->cur_tasks_head == &it->cur_cset->dying_tasks)
+ goto repeat;
+ }
+}
+
+/**
+ * css_task_iter_start - initiate task iteration
+ * @css: the css to walk tasks of
+ * @flags: CSS_TASK_ITER_* flags
+ * @it: the task iterator to use
+ *
+ * Initiate iteration through the tasks of @css. The caller can call
+ * css_task_iter_next() to walk through the tasks until the function
+ * returns NULL. On completion of iteration, css_task_iter_end() must be
+ * called.
+ */
+void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
+ struct css_task_iter *it)
+{
+ unsigned long irqflags;
+
+ memset(it, 0, sizeof(*it));
+
+ spin_lock_irqsave(&css_set_lock, irqflags);
+
+ it->ss = css->ss;
+ it->flags = flags;
+
+ if (CGROUP_HAS_SUBSYS_CONFIG && it->ss)
+ it->cset_pos = &css->cgroup->e_csets[css->ss->id];
+ else
+ it->cset_pos = &css->cgroup->cset_links;
+
+ it->cset_head = it->cset_pos;
+
+ css_task_iter_advance(it);
+
+ spin_unlock_irqrestore(&css_set_lock, irqflags);
+}
+
+/**
+ * css_task_iter_next - return the next task for the iterator
+ * @it: the task iterator being iterated
+ *
+ * The "next" function for task iteration. @it should have been
+ * initialized via css_task_iter_start(). Returns NULL when the iteration
+ * reaches the end.
+ */
+struct task_struct *css_task_iter_next(struct css_task_iter *it)
+{
+ unsigned long irqflags;
+
+ if (it->cur_task) {
+ put_task_struct(it->cur_task);
+ it->cur_task = NULL;
+ }
+
+ spin_lock_irqsave(&css_set_lock, irqflags);
+
+ /* @it may be half-advanced by skips, finish advancing */
+ if (it->flags & CSS_TASK_ITER_SKIPPED)
+ css_task_iter_advance(it);
+
+ if (it->task_pos) {
+ it->cur_task = list_entry(it->task_pos, struct task_struct,
+ cg_list);
+ get_task_struct(it->cur_task);
+ css_task_iter_advance(it);
+ }
+
+ spin_unlock_irqrestore(&css_set_lock, irqflags);
+
+ return it->cur_task;
+}
+
+/**
+ * css_task_iter_end - finish task iteration
+ * @it: the task iterator to finish
+ *
+ * Finish task iteration started by css_task_iter_start().
+ */
+void css_task_iter_end(struct css_task_iter *it)
+{
+ unsigned long irqflags;
+
+ if (it->cur_cset) {
+ spin_lock_irqsave(&css_set_lock, irqflags);
+ list_del(&it->iters_node);
+ put_css_set_locked(it->cur_cset);
+ spin_unlock_irqrestore(&css_set_lock, irqflags);
+ }
+
+ if (it->cur_dcset)
+ put_css_set(it->cur_dcset);
+
+ if (it->cur_task)
+ put_task_struct(it->cur_task);
+}
+
+static void cgroup_procs_release(struct kernfs_open_file *of)
+{
+ struct cgroup_file_ctx *ctx = of->priv;
+
+ if (ctx->procs.started)
+ css_task_iter_end(&ctx->procs.iter);
+}
+
+static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos)
+{
+ struct kernfs_open_file *of = s->private;
+ struct cgroup_file_ctx *ctx = of->priv;
+
+ if (pos)
+ (*pos)++;
+
+ return css_task_iter_next(&ctx->procs.iter);
+}
+
+static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,
+ unsigned int iter_flags)
+{
+ struct kernfs_open_file *of = s->private;
+ struct cgroup *cgrp = seq_css(s)->cgroup;
+ struct cgroup_file_ctx *ctx = of->priv;
+ struct css_task_iter *it = &ctx->procs.iter;
+
+ /*
+ * When a seq_file is seeked, it's always traversed sequentially
+ * from position 0, so we can simply keep iterating on !0 *pos.
+ */
+ if (!ctx->procs.started) {
+ if (WARN_ON_ONCE((*pos)))
+ return ERR_PTR(-EINVAL);
+ css_task_iter_start(&cgrp->self, iter_flags, it);
+ ctx->procs.started = true;
+ } else if (!(*pos)) {
+ css_task_iter_end(it);
+ css_task_iter_start(&cgrp->self, iter_flags, it);
+ } else
+ return it->cur_task;
+
+ return cgroup_procs_next(s, NULL, NULL);
+}
+
+static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
+{
+ struct cgroup *cgrp = seq_css(s)->cgroup;
+
+ /*
+ * All processes of a threaded subtree belong to the domain cgroup
+ * of the subtree. Only threads can be distributed across the
+ * subtree. Reject reads on cgroup.procs in the subtree proper.
+ * They're always empty anyway.
+ */
+ if (cgroup_is_threaded(cgrp))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ return __cgroup_procs_start(s, pos, CSS_TASK_ITER_PROCS |
+ CSS_TASK_ITER_THREADED);
+}
+
+static int cgroup_procs_show(struct seq_file *s, void *v)
+{
+ seq_printf(s, "%d\n", task_pid_vnr(v));
+ return 0;
+}
+
+static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb)
+{
+ int ret;
+ struct inode *inode;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ inode = kernfs_get_inode(sb, cgrp->procs_file.kn);
+ if (!inode)
+ return -ENOMEM;
+
+ ret = inode_permission(&nop_mnt_idmap, inode, MAY_WRITE);
+ iput(inode);
+ return ret;
+}
+
+static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
+ struct cgroup *dst_cgrp,
+ struct super_block *sb,
+ struct cgroup_namespace *ns)
+{
+ struct cgroup *com_cgrp = src_cgrp;
+ int ret;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ /* find the common ancestor */
+ while (!cgroup_is_descendant(dst_cgrp, com_cgrp))
+ com_cgrp = cgroup_parent(com_cgrp);
+
+ /* %current should be authorized to migrate to the common ancestor */
+ ret = cgroup_may_write(com_cgrp, sb);
+ if (ret)
+ return ret;
+
+ /*
+ * If namespaces are delegation boundaries, %current must be able
+ * to see both source and destination cgroups from its namespace.
+ */
+ if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) &&
+ (!cgroup_is_descendant(src_cgrp, ns->root_cset->dfl_cgrp) ||
+ !cgroup_is_descendant(dst_cgrp, ns->root_cset->dfl_cgrp)))
+ return -ENOENT;
+
+ return 0;
+}
+
+static int cgroup_attach_permissions(struct cgroup *src_cgrp,
+ struct cgroup *dst_cgrp,
+ struct super_block *sb, bool threadgroup,
+ struct cgroup_namespace *ns)
+{
+ int ret = 0;
+
+ ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb, ns);
+ if (ret)
+ return ret;
+
+ ret = cgroup_migrate_vet_dst(dst_cgrp);
+ if (ret)
+ return ret;
+
+ if (!threadgroup && (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp))
+ ret = -EOPNOTSUPP;
+
+ return ret;
+}
+
+static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
+ bool threadgroup)
+{
+ struct cgroup_file_ctx *ctx = of->priv;
+ struct cgroup *src_cgrp, *dst_cgrp;
+ struct task_struct *task;
+ ssize_t ret;
+ enum cgroup_attach_lock_mode lock_mode;
+
+ dst_cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!dst_cgrp)
+ return -ENODEV;
+
+ task = cgroup_procs_write_start(buf, threadgroup, &lock_mode);
+ ret = PTR_ERR_OR_ZERO(task);
+ if (ret)
+ goto out_unlock;
+
+ /* find the source cgroup */
+ spin_lock_irq(&css_set_lock);
+ src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
+ spin_unlock_irq(&css_set_lock);
+
+ /*
+ * Process and thread migrations follow same delegation rule. Check
+ * permissions using the credentials from file open to protect against
+ * inherited fd attacks.
+ */
+ scoped_with_creds(of->file->f_cred)
+ ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
+ of->file->f_path.dentry->d_sb,
+ threadgroup, ctx->ns);
+ if (ret)
+ goto out_finish;
+
+ ret = cgroup_attach_task(dst_cgrp, task, threadgroup);
+
+out_finish:
+ cgroup_procs_write_finish(task, lock_mode);
+out_unlock:
+ cgroup_kn_unlock(of->kn);
+
+ return ret;
+}
+
+static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ return __cgroup_procs_write(of, buf, true) ?: nbytes;
+}
+
+static void *cgroup_threads_start(struct seq_file *s, loff_t *pos)
+{
+ return __cgroup_procs_start(s, pos, 0);
+}
+
+static ssize_t cgroup_threads_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ return __cgroup_procs_write(of, buf, false) ?: nbytes;
+}
+
+/* cgroup core interface files for the default hierarchy */
+static struct cftype cgroup_base_files[] = {
+ {
+ .name = "cgroup.type",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cgroup_type_show,
+ .write = cgroup_type_write,
+ },
+ {
+ .name = "cgroup.procs",
+ .flags = CFTYPE_NS_DELEGATABLE,
+ .file_offset = offsetof(struct cgroup, procs_file),
+ .release = cgroup_procs_release,
+ .seq_start = cgroup_procs_start,
+ .seq_next = cgroup_procs_next,
+ .seq_show = cgroup_procs_show,
+ .write = cgroup_procs_write,
+ },
+ {
+ .name = "cgroup.threads",
+ .flags = CFTYPE_NS_DELEGATABLE,
+ .release = cgroup_procs_release,
+ .seq_start = cgroup_threads_start,
+ .seq_next = cgroup_procs_next,
+ .seq_show = cgroup_procs_show,
+ .write = cgroup_threads_write,
+ },
+ {
+ .name = "cgroup.controllers",
+ .seq_show = cgroup_controllers_show,
+ },
+ {
+ .name = "cgroup.subtree_control",
+ .flags = CFTYPE_NS_DELEGATABLE,
+ .seq_show = cgroup_subtree_control_show,
+ .write = cgroup_subtree_control_write,
+ },
+ {
+ .name = "cgroup.events",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .file_offset = offsetof(struct cgroup, events_file),
+ .seq_show = cgroup_events_show,
+ },
+ {
+ .name = "cgroup.max.descendants",
+ .seq_show = cgroup_max_descendants_show,
+ .write = cgroup_max_descendants_write,
+ },
+ {
+ .name = "cgroup.max.depth",
+ .seq_show = cgroup_max_depth_show,
+ .write = cgroup_max_depth_write,
+ },
+ {
+ .name = "cgroup.stat",
+ .seq_show = cgroup_stat_show,
+ },
+ {
+ .name = "cgroup.stat.local",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cgroup_core_local_stat_show,
+ },
+ {
+ .name = "cgroup.freeze",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cgroup_freeze_show,
+ .write = cgroup_freeze_write,
+ },
+ {
+ .name = "cgroup.kill",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .write = cgroup_kill_write,
+ },
+ {
+ .name = "cpu.stat",
+ .seq_show = cpu_stat_show,
+ },
+ {
+ .name = "cpu.stat.local",
+ .seq_show = cpu_local_stat_show,
+ },
+ { } /* terminate */
+};
+
+static struct cftype cgroup_psi_files[] = {
+#ifdef CONFIG_PSI
+ {
+ .name = "io.pressure",
+ .file_offset = offsetof(struct cgroup, psi_files[PSI_IO]),
+ .seq_show = cgroup_io_pressure_show,
+ .write = cgroup_io_pressure_write,
+ .poll = cgroup_pressure_poll,
+ .release = cgroup_pressure_release,
+ },
+ {
+ .name = "memory.pressure",
+ .file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]),
+ .seq_show = cgroup_memory_pressure_show,
+ .write = cgroup_memory_pressure_write,
+ .poll = cgroup_pressure_poll,
+ .release = cgroup_pressure_release,
+ },
+ {
+ .name = "cpu.pressure",
+ .file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]),
+ .seq_show = cgroup_cpu_pressure_show,
+ .write = cgroup_cpu_pressure_write,
+ .poll = cgroup_pressure_poll,
+ .release = cgroup_pressure_release,
+ },
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ {
+ .name = "irq.pressure",
+ .file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]),
+ .seq_show = cgroup_irq_pressure_show,
+ .write = cgroup_irq_pressure_write,
+ .poll = cgroup_pressure_poll,
+ .release = cgroup_pressure_release,
+ },
+#endif
+ {
+ .name = "cgroup.pressure",
+ .seq_show = cgroup_pressure_show,
+ .write = cgroup_pressure_write,
+ },
+#endif /* CONFIG_PSI */
+ { } /* terminate */
+};
+
+/*
+ * css destruction is four-stage process.
+ *
+ * 1. Destruction starts. Killing of the percpu_ref is initiated.
+ * Implemented in kill_css().
+ *
+ * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
+ * and thus css_tryget_online() is guaranteed to fail, the css can be
+ * offlined by invoking offline_css(). After offlining, the base ref is
+ * put. Implemented in css_killed_work_fn().
+ *
+ * 3. When the percpu_ref reaches zero, the only possible remaining
+ * accessors are inside RCU read sections. css_release() schedules the
+ * RCU callback.
+ *
+ * 4. After the grace period, the css can be freed. Implemented in
+ * css_free_rwork_fn().
+ *
+ * It is actually hairier because both step 2 and 4 require process context
+ * and thus involve punting to css->destroy_work adding two additional
+ * steps to the already complex sequence.
+ */
+static void css_free_rwork_fn(struct work_struct *work)
+{
+ struct cgroup_subsys_state *css = container_of(to_rcu_work(work),
+ struct cgroup_subsys_state, destroy_rwork);
+ struct cgroup_subsys *ss = css->ss;
+ struct cgroup *cgrp = css->cgroup;
+
+ percpu_ref_exit(&css->refcnt);
+ css_rstat_exit(css);
+
+ if (!css_is_self(css)) {
+ /* css free path */
+ struct cgroup_subsys_state *parent = css->parent;
+ int id = css->id;
+
+ ss->css_free(css);
+ cgroup_idr_remove(&ss->css_idr, id);
+ cgroup_put(cgrp);
+
+ if (parent)
+ css_put(parent);
+ } else {
+ /* cgroup free path */
+ atomic_dec(&cgrp->root->nr_cgrps);
+ if (!cgroup_on_dfl(cgrp))
+ cgroup1_pidlist_destroy_all(cgrp);
+ cancel_work_sync(&cgrp->release_agent_work);
+ bpf_cgrp_storage_free(cgrp);
+
+ if (cgroup_parent(cgrp)) {
+ /*
+ * We get a ref to the parent, and put the ref when
+ * this cgroup is being freed, so it's guaranteed
+ * that the parent won't be destroyed before its
+ * children.
+ */
+ cgroup_put(cgroup_parent(cgrp));
+ kernfs_put(cgrp->kn);
+ psi_cgroup_free(cgrp);
+ kfree(cgrp);
+ } else {
+ /*
+ * This is root cgroup's refcnt reaching zero,
+ * which indicates that the root should be
+ * released.
+ */
+ cgroup_destroy_root(cgrp->root);
+ }
+ }
+}
+
+static void css_release_work_fn(struct work_struct *work)
+{
+ struct cgroup_subsys_state *css =
+ container_of(work, struct cgroup_subsys_state, destroy_work);
+ struct cgroup_subsys *ss = css->ss;
+ struct cgroup *cgrp = css->cgroup;
+
+ cgroup_lock();
+
+ css->flags |= CSS_RELEASED;
+ list_del_rcu(&css->sibling);
+
+ if (!css_is_self(css)) {
+ struct cgroup *parent_cgrp;
+
+ css_rstat_flush(css);
+
+ cgroup_idr_replace(&ss->css_idr, NULL, css->id);
+ if (ss->css_released)
+ ss->css_released(css);
+
+ cgrp->nr_dying_subsys[ss->id]--;
+ /*
+ * When a css is released and ready to be freed, its
+ * nr_descendants must be zero. However, the corresponding
+ * cgrp->nr_dying_subsys[ss->id] may not be 0 if a subsystem
+ * is activated and deactivated multiple times with one or
+ * more of its previous activation leaving behind dying csses.
+ */
+ WARN_ON_ONCE(css->nr_descendants);
+ parent_cgrp = cgroup_parent(cgrp);
+ while (parent_cgrp) {
+ parent_cgrp->nr_dying_subsys[ss->id]--;
+ parent_cgrp = cgroup_parent(parent_cgrp);
+ }
+ } else {
+ struct cgroup *tcgrp;
+
+ /* cgroup release path */
+ TRACE_CGROUP_PATH(release, cgrp);
+
+ css_rstat_flush(&cgrp->self);
+
+ spin_lock_irq(&css_set_lock);
+ for (tcgrp = cgroup_parent(cgrp); tcgrp;
+ tcgrp = cgroup_parent(tcgrp))
+ tcgrp->nr_dying_descendants--;
+ spin_unlock_irq(&css_set_lock);
+
+ /*
+ * There are two control paths which try to determine
+ * cgroup from dentry without going through kernfs -
+ * cgroupstats_build() and css_tryget_online_from_dir().
+ * Those are supported by RCU protecting clearing of
+ * cgrp->kn->priv backpointer.
+ */
+ if (cgrp->kn)
+ RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv,
+ NULL);
+ }
+
+ cgroup_unlock();
+
+ INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
+ queue_rcu_work(cgroup_free_wq, &css->destroy_rwork);
+}
+
+static void css_release(struct percpu_ref *ref)
+{
+ struct cgroup_subsys_state *css =
+ container_of(ref, struct cgroup_subsys_state, refcnt);
+
+ INIT_WORK(&css->destroy_work, css_release_work_fn);
+ queue_work(cgroup_release_wq, &css->destroy_work);
+}
+
+static void init_and_link_css(struct cgroup_subsys_state *css,
+ struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+ lockdep_assert_held(&cgroup_mutex);
+
+ cgroup_get_live(cgrp);
+
+ memset(css, 0, sizeof(*css));
+ css->cgroup = cgrp;
+ css->ss = ss;
+ css->id = -1;
+ INIT_LIST_HEAD(&css->sibling);
+ INIT_LIST_HEAD(&css->children);
+ css->serial_nr = css_serial_nr_next++;
+ atomic_set(&css->online_cnt, 0);
+
+ if (cgroup_parent(cgrp)) {
+ css->parent = cgroup_css(cgroup_parent(cgrp), ss);
+ css_get(css->parent);
+ }
+
+ BUG_ON(cgroup_css(cgrp, ss));
+}
+
+/* invoke ->css_online() on a new CSS and mark it online if successful */
+static int online_css(struct cgroup_subsys_state *css)
+{
+ struct cgroup_subsys *ss = css->ss;
+ int ret = 0;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (ss->css_online)
+ ret = ss->css_online(css);
+ if (!ret) {
+ css->flags |= CSS_ONLINE;
+ rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
+
+ atomic_inc(&css->online_cnt);
+ if (css->parent) {
+ atomic_inc(&css->parent->online_cnt);
+ while ((css = css->parent))
+ css->nr_descendants++;
+ }
+ }
+ return ret;
+}
+
+/* if the CSS is online, invoke ->css_offline() on it and mark it offline */
+static void offline_css(struct cgroup_subsys_state *css)
+{
+ struct cgroup_subsys *ss = css->ss;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (!(css->flags & CSS_ONLINE))
+ return;
+
+ if (ss->css_offline)
+ ss->css_offline(css);
+
+ css->flags &= ~CSS_ONLINE;
+ RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);
+
+ wake_up_all(&css->cgroup->offline_waitq);
+
+ css->cgroup->nr_dying_subsys[ss->id]++;
+ /*
+ * Parent css and cgroup cannot be freed until after the freeing
+ * of child css, see css_free_rwork_fn().
+ */
+ while ((css = css->parent)) {
+ css->nr_descendants--;
+ css->cgroup->nr_dying_subsys[ss->id]++;
+ }
+}
+
+/**
+ * css_create - create a cgroup_subsys_state
+ * @cgrp: the cgroup new css will be associated with
+ * @ss: the subsys of new css
+ *
+ * Create a new css associated with @cgrp - @ss pair. On success, the new
+ * css is online and installed in @cgrp. This function doesn't create the
+ * interface files. Returns 0 on success, -errno on failure.
+ */
+static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
+ struct cgroup_subsys *ss)
+{
+ struct cgroup *parent = cgroup_parent(cgrp);
+ struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);
+ struct cgroup_subsys_state *css;
+ int err;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ css = ss->css_alloc(parent_css);
+ if (!css)
+ css = ERR_PTR(-ENOMEM);
+ if (IS_ERR(css))
+ return css;
+
+ init_and_link_css(css, ss, cgrp);
+
+ err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL);
+ if (err)
+ goto err_free_css;
+
+ err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL);
+ if (err < 0)
+ goto err_free_css;
+ css->id = err;
+
+ err = css_rstat_init(css);
+ if (err)
+ goto err_free_css;
+
+ /* @css is ready to be brought online now, make it visible */
+ list_add_tail_rcu(&css->sibling, &parent_css->children);
+ cgroup_idr_replace(&ss->css_idr, css, css->id);
+
+ err = online_css(css);
+ if (err)
+ goto err_list_del;
+
+ return css;
+
+err_list_del:
+ list_del_rcu(&css->sibling);
+err_free_css:
+ INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
+ queue_rcu_work(cgroup_free_wq, &css->destroy_rwork);
+ return ERR_PTR(err);
+}
+
+/*
+ * The returned cgroup is fully initialized including its control mask, but
+ * it doesn't have the control mask applied.
+ */
+static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
+ umode_t mode)
+{
+ struct cgroup_root *root = parent->root;
+ struct cgroup *cgrp, *tcgrp;
+ struct kernfs_node *kn;
+ int i, level = parent->level + 1;
+ int ret;
+
+ /* allocate the cgroup and its ID, 0 is reserved for the root */
+ cgrp = kzalloc(struct_size(cgrp, ancestors, (level + 1)), GFP_KERNEL);
+ if (!cgrp)
+ return ERR_PTR(-ENOMEM);
+
+ ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL);
+ if (ret)
+ goto out_free_cgrp;
+
+ /* create the directory */
+ kn = kernfs_create_dir_ns(parent->kn, name, mode,
+ current_fsuid(), current_fsgid(),
+ cgrp, NULL);
+ if (IS_ERR(kn)) {
+ ret = PTR_ERR(kn);
+ goto out_cancel_ref;
+ }
+ cgrp->kn = kn;
+
+ init_cgroup_housekeeping(cgrp);
+
+ cgrp->self.parent = &parent->self;
+ cgrp->root = root;
+ cgrp->level = level;
+
+ /*
+ * Now that init_cgroup_housekeeping() has been called and cgrp->self
+ * is setup, it is safe to perform rstat initialization on it.
+ */
+ ret = css_rstat_init(&cgrp->self);
+ if (ret)
+ goto out_kernfs_remove;
+
+ ret = psi_cgroup_alloc(cgrp);
+ if (ret)
+ goto out_stat_exit;
+
+ for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp))
+ cgrp->ancestors[tcgrp->level] = tcgrp;
+
+ /*
+ * New cgroup inherits effective freeze counter, and
+ * if the parent has to be frozen, the child has too.
+ */
+ cgrp->freezer.e_freeze = parent->freezer.e_freeze;
+ seqcount_spinlock_init(&cgrp->freezer.freeze_seq, &css_set_lock);
+ if (cgrp->freezer.e_freeze) {
+ /*
+ * Set the CGRP_FREEZE flag, so when a process will be
+ * attached to the child cgroup, it will become frozen.
+ * At this point the new cgroup is unpopulated, so we can
+ * consider it frozen immediately.
+ */
+ set_bit(CGRP_FREEZE, &cgrp->flags);
+ cgrp->freezer.freeze_start_nsec = ktime_get_ns();
+ set_bit(CGRP_FROZEN, &cgrp->flags);
+ }
+
+ if (notify_on_release(parent))
+ set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
+
+ if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
+ set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
+
+ cgrp->self.serial_nr = css_serial_nr_next++;
+
+ ret = blocking_notifier_call_chain_robust(&cgroup_lifetime_notifier,
+ CGROUP_LIFETIME_ONLINE,
+ CGROUP_LIFETIME_OFFLINE, cgrp);
+ ret = notifier_to_errno(ret);
+ if (ret)
+ goto out_psi_free;
+
+ /* allocation complete, commit to creation */
+ spin_lock_irq(&css_set_lock);
+ for (i = 0; i < level; i++) {
+ tcgrp = cgrp->ancestors[i];
+ tcgrp->nr_descendants++;
+
+ /*
+ * If the new cgroup is frozen, all ancestor cgroups get a new
+ * frozen descendant, but their state can't change because of
+ * this.
+ */
+ if (cgrp->freezer.e_freeze)
+ tcgrp->freezer.nr_frozen_descendants++;
+ }
+ spin_unlock_irq(&css_set_lock);
+
+ list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children);
+ atomic_inc(&root->nr_cgrps);
+ cgroup_get_live(parent);
+
+ /*
+ * On the default hierarchy, a child doesn't automatically inherit
+ * subtree_control from the parent. Each is configured manually.
+ */
+ if (!cgroup_on_dfl(cgrp))
+ cgrp->subtree_control = cgroup_control(cgrp);
+
+ cgroup_propagate_control(cgrp);
+
+ return cgrp;
+
+out_psi_free:
+ psi_cgroup_free(cgrp);
+out_stat_exit:
+ css_rstat_exit(&cgrp->self);
+out_kernfs_remove:
+ kernfs_remove(cgrp->kn);
+out_cancel_ref:
+ percpu_ref_exit(&cgrp->self.refcnt);
+out_free_cgrp:
+ kfree(cgrp);
+ return ERR_PTR(ret);
+}
+
+static bool cgroup_check_hierarchy_limits(struct cgroup *parent)
+{
+ struct cgroup *cgroup;
+ int ret = false;
+ int level = 0;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ for (cgroup = parent; cgroup; cgroup = cgroup_parent(cgroup)) {
+ if (cgroup->nr_descendants >= cgroup->max_descendants)
+ goto fail;
+
+ if (level >= cgroup->max_depth)
+ goto fail;
+
+ level++;
+ }
+
+ ret = true;
+fail:
+ return ret;
+}
+
+int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
+{
+ struct cgroup *parent, *cgrp;
+ int ret;
+
+ /* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */
+ if (strchr(name, '\n'))
+ return -EINVAL;
+
+ parent = cgroup_kn_lock_live(parent_kn, false);
+ if (!parent)
+ return -ENODEV;
+
+ if (!cgroup_check_hierarchy_limits(parent)) {
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
+
+ cgrp = cgroup_create(parent, name, mode);
+ if (IS_ERR(cgrp)) {
+ ret = PTR_ERR(cgrp);
+ goto out_unlock;
+ }
+
+ /*
+ * This extra ref will be put in css_free_rwork_fn() and guarantees
+ * that @cgrp->kn is always accessible.
+ */
+ kernfs_get(cgrp->kn);
+
+ ret = css_populate_dir(&cgrp->self);
+ if (ret)
+ goto out_destroy;
+
+ ret = cgroup_apply_control_enable(cgrp);
+ if (ret)
+ goto out_destroy;
+
+ TRACE_CGROUP_PATH(mkdir, cgrp);
+
+ /* let's create and online css's */
+ kernfs_activate(cgrp->kn);
+
+ ret = 0;
+ goto out_unlock;
+
+out_destroy:
+ cgroup_destroy_locked(cgrp);
+out_unlock:
+ cgroup_kn_unlock(parent_kn);
+ return ret;
+}
+
+/*
+ * This is called when the refcnt of a css is confirmed to be killed.
+ * css_tryget_online() is now guaranteed to fail. Tell the subsystem to
+ * initiate destruction and put the css ref from kill_css().
+ */
+static void css_killed_work_fn(struct work_struct *work)
+{
+ struct cgroup_subsys_state *css =
+ container_of(work, struct cgroup_subsys_state, destroy_work);
+
+ cgroup_lock();
+
+ do {
+ offline_css(css);
+ css_put(css);
+ /* @css can't go away while we're holding cgroup_mutex */
+ css = css->parent;
+ } while (css && atomic_dec_and_test(&css->online_cnt));
+
+ cgroup_unlock();
+}
+
+/* css kill confirmation processing requires process context, bounce */
+static void css_killed_ref_fn(struct percpu_ref *ref)
+{
+ struct cgroup_subsys_state *css =
+ container_of(ref, struct cgroup_subsys_state, refcnt);
+
+ if (atomic_dec_and_test(&css->online_cnt)) {
+ INIT_WORK(&css->destroy_work, css_killed_work_fn);
+ queue_work(cgroup_offline_wq, &css->destroy_work);
+ }
+}
+
+/**
+ * kill_css - destroy a css
+ * @css: css to destroy
+ *
+ * This function initiates destruction of @css by removing cgroup interface
+ * files and putting its base reference. ->css_offline() will be invoked
+ * asynchronously once css_tryget_online() is guaranteed to fail and when
+ * the reference count reaches zero, @css will be released.
+ */
+static void kill_css(struct cgroup_subsys_state *css)
+{
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (css->flags & CSS_DYING)
+ return;
+
+ /*
+ * Call css_killed(), if defined, before setting the CSS_DYING flag
+ */
+ if (css->ss->css_killed)
+ css->ss->css_killed(css);
+
+ css->flags |= CSS_DYING;
+
+ /*
+ * This must happen before css is disassociated with its cgroup.
+ * See seq_css() for details.
+ */
+ css_clear_dir(css);
+
+ /*
+ * Killing would put the base ref, but we need to keep it alive
+ * until after ->css_offline().
+ */
+ css_get(css);
+
+ /*
+ * cgroup core guarantees that, by the time ->css_offline() is
+ * invoked, no new css reference will be given out via
+ * css_tryget_online(). We can't simply call percpu_ref_kill() and
+ * proceed to offlining css's because percpu_ref_kill() doesn't
+ * guarantee that the ref is seen as killed on all CPUs on return.
+ *
+ * Use percpu_ref_kill_and_confirm() to get notifications as each
+ * css is confirmed to be seen as killed on all CPUs.
+ */
+ percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
+}
+
+/**
+ * cgroup_destroy_locked - the first stage of cgroup destruction
+ * @cgrp: cgroup to be destroyed
+ *
+ * css's make use of percpu refcnts whose killing latency shouldn't be
+ * exposed to userland and are RCU protected. Also, cgroup core needs to
+ * guarantee that css_tryget_online() won't succeed by the time
+ * ->css_offline() is invoked. To satisfy all the requirements,
+ * destruction is implemented in the following two steps.
+ *
+ * s1. Verify @cgrp can be destroyed and mark it dying. Remove all
+ * userland visible parts and start killing the percpu refcnts of
+ * css's. Set up so that the next stage will be kicked off once all
+ * the percpu refcnts are confirmed to be killed.
+ *
+ * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
+ * rest of destruction. Once all cgroup references are gone, the
+ * cgroup is RCU-freed.
+ *
+ * This function implements s1. After this step, @cgrp is gone as far as
+ * the userland is concerned and a new cgroup with the same name may be
+ * created. As cgroup doesn't care about the names internally, this
+ * doesn't cause any problem.
+ */
+static int cgroup_destroy_locked(struct cgroup *cgrp)
+ __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
+{
+ struct cgroup *tcgrp, *parent = cgroup_parent(cgrp);
+ struct cgroup_subsys_state *css;
+ struct cgrp_cset_link *link;
+ int ssid, ret;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ /*
+ * Only migration can raise populated from zero and we're already
+ * holding cgroup_mutex.
+ */
+ if (cgroup_is_populated(cgrp))
+ return -EBUSY;
+
+ /*
+ * Make sure there's no live children. We can't test emptiness of
+ * ->self.children as dead children linger on it while being
+ * drained; otherwise, "rmdir parent/child parent" may fail.
+ */
+ if (css_has_online_children(&cgrp->self))
+ return -EBUSY;
+
+ /*
+ * Mark @cgrp and the associated csets dead. The former prevents
+ * further task migration and child creation by disabling
+ * cgroup_kn_lock_live(). The latter makes the csets ignored by
+ * the migration path.
+ */
+ cgrp->self.flags &= ~CSS_ONLINE;
+
+ spin_lock_irq(&css_set_lock);
+ list_for_each_entry(link, &cgrp->cset_links, cset_link)
+ link->cset->dead = true;
+ spin_unlock_irq(&css_set_lock);
+
+ /* initiate massacre of all css's */
+ for_each_css(css, ssid, cgrp)
+ kill_css(css);
+
+ /* clear and remove @cgrp dir, @cgrp has an extra ref on its kn */
+ css_clear_dir(&cgrp->self);
+ kernfs_remove(cgrp->kn);
+
+ if (cgroup_is_threaded(cgrp))
+ parent->nr_threaded_children--;
+
+ spin_lock_irq(&css_set_lock);
+ for (tcgrp = parent; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
+ tcgrp->nr_descendants--;
+ tcgrp->nr_dying_descendants++;
+ /*
+ * If the dying cgroup is frozen, decrease frozen descendants
+ * counters of ancestor cgroups.
+ */
+ if (test_bit(CGRP_FROZEN, &cgrp->flags))
+ tcgrp->freezer.nr_frozen_descendants--;
+ }
+ spin_unlock_irq(&css_set_lock);
+
+ cgroup1_check_for_release(parent);
+
+ ret = blocking_notifier_call_chain(&cgroup_lifetime_notifier,
+ CGROUP_LIFETIME_OFFLINE, cgrp);
+ WARN_ON_ONCE(notifier_to_errno(ret));
+
+ /* put the base reference */
+ percpu_ref_kill(&cgrp->self.refcnt);
+
+ return 0;
+};
+
+int cgroup_rmdir(struct kernfs_node *kn)
+{
+ struct cgroup *cgrp;
+ int ret = 0;
+
+ cgrp = cgroup_kn_lock_live(kn, false);
+ if (!cgrp)
+ return 0;
+
+ ret = cgroup_destroy_locked(cgrp);
+ if (!ret)
+ TRACE_CGROUP_PATH(rmdir, cgrp);
+
+ cgroup_kn_unlock(kn);
+ return ret;
+}
+
+static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
+ .show_options = cgroup_show_options,
+ .mkdir = cgroup_mkdir,
+ .rmdir = cgroup_rmdir,
+ .show_path = cgroup_show_path,
+};
+
+static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
+{
+ struct cgroup_subsys_state *css;
+
+ pr_debug("Initializing cgroup subsys %s\n", ss->name);
+
+ cgroup_lock();
+
+ idr_init(&ss->css_idr);
+ INIT_LIST_HEAD(&ss->cfts);
+
+ /* Create the root cgroup state for this subsystem */
+ ss->root = &cgrp_dfl_root;
+ css = ss->css_alloc(NULL);
+ /* We don't handle early failures gracefully */
+ BUG_ON(IS_ERR(css));
+ init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);
+
+ /*
+ * Root csses are never destroyed and we can't initialize
+ * percpu_ref during early init. Disable refcnting.
+ */
+ css->flags |= CSS_NO_REF;
+
+ if (early) {
+ /* allocation can't be done safely during early init */
+ css->id = 1;
+ } else {
+ css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL);
+ BUG_ON(css->id < 0);
+
+ BUG_ON(ss_rstat_init(ss));
+ BUG_ON(css_rstat_init(css));
+ }
+
+ /* Update the init_css_set to contain a subsys
+ * pointer to this state - since the subsystem is
+ * newly registered, all tasks and hence the
+ * init_css_set is in the subsystem's root cgroup. */
+ init_css_set.subsys[ss->id] = css;
+
+ have_fork_callback |= (bool)ss->fork << ss->id;
+ have_exit_callback |= (bool)ss->exit << ss->id;
+ have_release_callback |= (bool)ss->release << ss->id;
+ have_canfork_callback |= (bool)ss->can_fork << ss->id;
+
+ /* At system boot, before all subsystems have been
+ * registered, no tasks have been forked, so we don't
+ * need to invoke fork callbacks here. */
+ BUG_ON(!list_empty(&init_task.tasks));
+
+ BUG_ON(online_css(css));
+
+ cgroup_unlock();
+}
+
+/**
+ * cgroup_init_early - cgroup initialization at system boot
+ *
+ * Initialize cgroups at system boot, and initialize any
+ * subsystems that request early init.
+ */
+int __init cgroup_init_early(void)
+{
+ static struct cgroup_fs_context __initdata ctx;
+ struct cgroup_subsys *ss;
+ int i;
+
+ ctx.root = &cgrp_dfl_root;
+ init_cgroup_root(&ctx);
+ cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF;
+
+ RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
+
+ for_each_subsys(ss, i) {
+ WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
+ "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p id:name=%d:%s\n",
+ i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
+ ss->id, ss->name);
+ WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
+ "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);
+ WARN(ss->early_init && ss->css_rstat_flush,
+ "cgroup rstat cannot be used with early init subsystem\n");
+
+ ss->id = i;
+ ss->name = cgroup_subsys_name[i];
+ if (!ss->legacy_name)
+ ss->legacy_name = cgroup_subsys_name[i];
+
+ if (ss->early_init)
+ cgroup_init_subsys(ss, true);
+ }
+ return 0;
+}
+
+/**
+ * cgroup_init - cgroup initialization
+ *
+ * Register cgroup filesystem and /proc file, and initialize
+ * any subsystems that didn't request early init.
+ */
+int __init cgroup_init(void)
+{
+ struct cgroup_subsys *ss;
+ int ssid;
+
+ BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
+ BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
+ BUG_ON(cgroup_init_cftypes(NULL, cgroup_psi_files));
+ BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
+
+ BUG_ON(ss_rstat_init(NULL));
+
+ get_user_ns(init_cgroup_ns.user_ns);
+ cgroup_rt_init();
+
+ cgroup_lock();
+
+ /*
+ * Add init_css_set to the hash table so that dfl_root can link to
+ * it during init.
+ */
+ hash_add(css_set_table, &init_css_set.hlist,
+ css_set_hash(init_css_set.subsys));
+
+ cgroup_bpf_lifetime_notifier_init();
+
+ BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
+
+ cgroup_unlock();
+
+ for_each_subsys(ss, ssid) {
+ if (ss->early_init) {
+ struct cgroup_subsys_state *css =
+ init_css_set.subsys[ss->id];
+
+ css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2,
+ GFP_KERNEL);
+ BUG_ON(css->id < 0);
+ } else {
+ cgroup_init_subsys(ss, false);
+ }
+
+ list_add_tail(&init_css_set.e_cset_node[ssid],
+ &cgrp_dfl_root.cgrp.e_csets[ssid]);
+
+ /*
+ * Setting dfl_root subsys_mask needs to consider the
+ * disabled flag and cftype registration needs kmalloc,
+ * both of which aren't available during early_init.
+ */
+ if (!cgroup_ssid_enabled(ssid))
+ continue;
+
+ if (cgroup1_ssid_disabled(ssid))
+ pr_info("Disabling %s control group subsystem in v1 mounts\n",
+ ss->legacy_name);
+
+ cgrp_dfl_root.subsys_mask |= 1 << ss->id;
+
+ /* implicit controllers must be threaded too */
+ WARN_ON(ss->implicit_on_dfl && !ss->threaded);
+
+ if (ss->implicit_on_dfl)
+ cgrp_dfl_implicit_ss_mask |= 1 << ss->id;
+ else if (!ss->dfl_cftypes)
+ cgrp_dfl_inhibit_ss_mask |= 1 << ss->id;
+
+ if (ss->threaded)
+ cgrp_dfl_threaded_ss_mask |= 1 << ss->id;
+
+ if (ss->dfl_cftypes == ss->legacy_cftypes) {
+ WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
+ } else {
+ WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes));
+ WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes));
+ }
+
+ if (ss->bind)
+ ss->bind(init_css_set.subsys[ssid]);
+
+ cgroup_lock();
+ css_populate_dir(init_css_set.subsys[ssid]);
+ cgroup_unlock();
+ }
+
+ /* init_css_set.subsys[] has been updated, re-hash */
+ hash_del(&init_css_set.hlist);
+ hash_add(css_set_table, &init_css_set.hlist,
+ css_set_hash(init_css_set.subsys));
+
+ WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
+ WARN_ON(register_filesystem(&cgroup_fs_type));
+ WARN_ON(register_filesystem(&cgroup2_fs_type));
+ WARN_ON(!proc_create_single("cgroups", 0, NULL, proc_cgroupstats_show));
+#ifdef CONFIG_CPUSETS_V1
+ WARN_ON(register_filesystem(&cpuset_fs_type));
+#endif
+
+ ns_tree_add(&init_cgroup_ns);
+ return 0;
+}
+
+static int __init cgroup_wq_init(void)
+{
+ /*
+ * There isn't much point in executing destruction path in
+ * parallel. Good chunk is serialized with cgroup_mutex anyway.
+ * Use 1 for @max_active.
+ *
+ * We would prefer to do this in cgroup_init() above, but that
+ * is called before init_workqueues(): so leave this until after.
+ */
+ cgroup_offline_wq = alloc_workqueue("cgroup_offline", WQ_PERCPU, 1);
+ BUG_ON(!cgroup_offline_wq);
+
+ cgroup_release_wq = alloc_workqueue("cgroup_release", WQ_PERCPU, 1);
+ BUG_ON(!cgroup_release_wq);
+
+ cgroup_free_wq = alloc_workqueue("cgroup_free", WQ_PERCPU, 1);
+ BUG_ON(!cgroup_free_wq);
+ return 0;
+}
+core_initcall(cgroup_wq_init);
+
+void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
+{
+ struct kernfs_node *kn;
+
+ kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id);
+ if (!kn)
+ return;
+ kernfs_path(kn, buf, buflen);
+ kernfs_put(kn);
+}
+
+/*
+ * __cgroup_get_from_id : get the cgroup associated with cgroup id
+ * @id: cgroup id
+ * On success return the cgrp or ERR_PTR on failure
+ * There are no cgroup NS restrictions.
+ */
+struct cgroup *__cgroup_get_from_id(u64 id)
+{
+ struct kernfs_node *kn;
+ struct cgroup *cgrp;
+
+ kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id);
+ if (!kn)
+ return ERR_PTR(-ENOENT);
+
+ if (kernfs_type(kn) != KERNFS_DIR) {
+ kernfs_put(kn);
+ return ERR_PTR(-ENOENT);
+ }
+
+ rcu_read_lock();
+
+ cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
+ if (cgrp && !cgroup_tryget(cgrp))
+ cgrp = NULL;
+
+ rcu_read_unlock();
+ kernfs_put(kn);
+
+ if (!cgrp)
+ return ERR_PTR(-ENOENT);
+ return cgrp;
+}
+
+/*
+ * cgroup_get_from_id : get the cgroup associated with cgroup id
+ * @id: cgroup id
+ * On success return the cgrp or ERR_PTR on failure
+ * Only cgroups within current task's cgroup NS are valid.
+ */
+struct cgroup *cgroup_get_from_id(u64 id)
+{
+ struct cgroup *cgrp, *root_cgrp;
+
+ cgrp = __cgroup_get_from_id(id);
+ if (IS_ERR(cgrp))
+ return cgrp;
+
+ root_cgrp = current_cgns_cgroup_dfl();
+ if (!cgroup_is_descendant(cgrp, root_cgrp)) {
+ cgroup_put(cgrp);
+ return ERR_PTR(-ENOENT);
+ }
+
+ return cgrp;
+}
+EXPORT_SYMBOL_GPL(cgroup_get_from_id);
+
+/*
+ * proc_cgroup_show()
+ * - Print task's cgroup paths into seq_file, one line for each hierarchy
+ * - Used for /proc/<pid>/cgroup.
+ */
+int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *tsk)
+{
+ char *buf;
+ int retval;
+ struct cgroup_root *root;
+
+ retval = -ENOMEM;
+ buf = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (!buf)
+ goto out;
+
+ rcu_read_lock();
+ spin_lock_irq(&css_set_lock);
+
+ for_each_root(root) {
+ struct cgroup_subsys *ss;
+ struct cgroup *cgrp;
+ int ssid, count = 0;
+
+ if (root == &cgrp_dfl_root && !READ_ONCE(cgrp_dfl_visible))
+ continue;
+
+ cgrp = task_cgroup_from_root(tsk, root);
+ /* The root has already been unmounted. */
+ if (!cgrp)
+ continue;
+
+ seq_printf(m, "%d:", root->hierarchy_id);
+ if (root != &cgrp_dfl_root)
+ for_each_subsys(ss, ssid)
+ if (root->subsys_mask & (1 << ssid))
+ seq_printf(m, "%s%s", count++ ? "," : "",
+ ss->legacy_name);
+ if (strlen(root->name))
+ seq_printf(m, "%sname=%s", count ? "," : "",
+ root->name);
+ seq_putc(m, ':');
+ /*
+ * On traditional hierarchies, all zombie tasks show up as
+ * belonging to the root cgroup. On the default hierarchy,
+ * while a zombie doesn't show up in "cgroup.procs" and
+ * thus can't be migrated, its /proc/PID/cgroup keeps
+ * reporting the cgroup it belonged to before exiting. If
+ * the cgroup is removed before the zombie is reaped,
+ * " (deleted)" is appended to the cgroup path.
+ */
+ if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
+ retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX,
+ current->nsproxy->cgroup_ns);
+ if (retval == -E2BIG)
+ retval = -ENAMETOOLONG;
+ if (retval < 0)
+ goto out_unlock;
+
+ seq_puts(m, buf);
+ } else {
+ seq_puts(m, "/");
+ }
+
+ if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp))
+ seq_puts(m, " (deleted)\n");
+ else
+ seq_putc(m, '\n');
+ }
+
+ retval = 0;
+out_unlock:
+ spin_unlock_irq(&css_set_lock);
+ rcu_read_unlock();
+ kfree(buf);
+out:
+ return retval;
+}
+
+/**
+ * cgroup_fork - initialize cgroup related fields during copy_process()
+ * @child: pointer to task_struct of forking parent process.
+ *
+ * A task is associated with the init_css_set until cgroup_post_fork()
+ * attaches it to the target css_set.
+ */
+void cgroup_fork(struct task_struct *child)
+{
+ RCU_INIT_POINTER(child->cgroups, &init_css_set);
+ INIT_LIST_HEAD(&child->cg_list);
+}
+
+/**
+ * cgroup_v1v2_get_from_file - get a cgroup pointer from a file pointer
+ * @f: file corresponding to cgroup_dir
+ *
+ * Find the cgroup from a file pointer associated with a cgroup directory.
+ * Returns a pointer to the cgroup on success. ERR_PTR is returned if the
+ * cgroup cannot be found.
+ */
+static struct cgroup *cgroup_v1v2_get_from_file(struct file *f)
+{
+ struct cgroup_subsys_state *css;
+
+ css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
+ if (IS_ERR(css))
+ return ERR_CAST(css);
+
+ return css->cgroup;
+}
+
+/**
+ * cgroup_get_from_file - same as cgroup_v1v2_get_from_file, but only supports
+ * cgroup2.
+ * @f: file corresponding to cgroup2_dir
+ */
+static struct cgroup *cgroup_get_from_file(struct file *f)
+{
+ struct cgroup *cgrp = cgroup_v1v2_get_from_file(f);
+
+ if (IS_ERR(cgrp))
+ return ERR_CAST(cgrp);
+
+ if (!cgroup_on_dfl(cgrp)) {
+ cgroup_put(cgrp);
+ return ERR_PTR(-EBADF);
+ }
+
+ return cgrp;
+}
+
+/**
+ * cgroup_css_set_fork - find or create a css_set for a child process
+ * @kargs: the arguments passed to create the child process
+ *
+ * This functions finds or creates a new css_set which the child
+ * process will be attached to in cgroup_post_fork(). By default,
+ * the child process will be given the same css_set as its parent.
+ *
+ * If CLONE_INTO_CGROUP is specified this function will try to find an
+ * existing css_set which includes the requested cgroup and if not create
+ * a new css_set that the child will be attached to later. If this function
+ * succeeds it will hold cgroup_threadgroup_rwsem on return. If
+ * CLONE_INTO_CGROUP is requested this function will grab cgroup mutex
+ * before grabbing cgroup_threadgroup_rwsem and will hold a reference
+ * to the target cgroup.
+ */
+static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
+ __acquires(&cgroup_mutex) __acquires(&cgroup_threadgroup_rwsem)
+{
+ int ret;
+ struct cgroup *dst_cgrp = NULL;
+ struct css_set *cset;
+ struct super_block *sb;
+
+ if (kargs->flags & CLONE_INTO_CGROUP)
+ cgroup_lock();
+
+ cgroup_threadgroup_change_begin(current);
+
+ spin_lock_irq(&css_set_lock);
+ cset = task_css_set(current);
+ get_css_set(cset);
+ if (kargs->cgrp)
+ kargs->kill_seq = kargs->cgrp->kill_seq;
+ else
+ kargs->kill_seq = cset->dfl_cgrp->kill_seq;
+ spin_unlock_irq(&css_set_lock);
+
+ if (!(kargs->flags & CLONE_INTO_CGROUP)) {
+ kargs->cset = cset;
+ return 0;
+ }
+
+ CLASS(fd_raw, f)(kargs->cgroup);
+ if (fd_empty(f)) {
+ ret = -EBADF;
+ goto err;
+ }
+ sb = fd_file(f)->f_path.dentry->d_sb;
+
+ dst_cgrp = cgroup_get_from_file(fd_file(f));
+ if (IS_ERR(dst_cgrp)) {
+ ret = PTR_ERR(dst_cgrp);
+ dst_cgrp = NULL;
+ goto err;
+ }
+
+ if (cgroup_is_dead(dst_cgrp)) {
+ ret = -ENODEV;
+ goto err;
+ }
+
+ /*
+ * Verify that we the target cgroup is writable for us. This is
+ * usually done by the vfs layer but since we're not going through
+ * the vfs layer here we need to do it "manually".
+ */
+ ret = cgroup_may_write(dst_cgrp, sb);
+ if (ret)
+ goto err;
+
+ /*
+ * Spawning a task directly into a cgroup works by passing a file
+ * descriptor to the target cgroup directory. This can even be an O_PATH
+ * file descriptor. But it can never be a cgroup.procs file descriptor.
+ * This was done on purpose so spawning into a cgroup could be
+ * conceptualized as an atomic
+ *
+ * fd = openat(dfd_cgroup, "cgroup.procs", ...);
+ * write(fd, <child-pid>, ...);
+ *
+ * sequence, i.e. it's a shorthand for the caller opening and writing
+ * cgroup.procs of the cgroup indicated by @dfd_cgroup. This allows us
+ * to always use the caller's credentials.
+ */
+ ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb,
+ !(kargs->flags & CLONE_THREAD),
+ current->nsproxy->cgroup_ns);
+ if (ret)
+ goto err;
+
+ kargs->cset = find_css_set(cset, dst_cgrp);
+ if (!kargs->cset) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ put_css_set(cset);
+ kargs->cgrp = dst_cgrp;
+ return ret;
+
+err:
+ cgroup_threadgroup_change_end(current);
+ cgroup_unlock();
+ if (dst_cgrp)
+ cgroup_put(dst_cgrp);
+ put_css_set(cset);
+ if (kargs->cset)
+ put_css_set(kargs->cset);
+ return ret;
+}
+
+/**
+ * cgroup_css_set_put_fork - drop references we took during fork
+ * @kargs: the arguments passed to create the child process
+ *
+ * Drop references to the prepared css_set and target cgroup if
+ * CLONE_INTO_CGROUP was requested.
+ */
+static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs)
+ __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
+{
+ struct cgroup *cgrp = kargs->cgrp;
+ struct css_set *cset = kargs->cset;
+
+ cgroup_threadgroup_change_end(current);
+
+ if (cset) {
+ put_css_set(cset);
+ kargs->cset = NULL;
+ }
+
+ if (kargs->flags & CLONE_INTO_CGROUP) {
+ cgroup_unlock();
+ if (cgrp) {
+ cgroup_put(cgrp);
+ kargs->cgrp = NULL;
+ }
+ }
+}
+
+/**
+ * cgroup_can_fork - called on a new task before the process is exposed
+ * @child: the child process
+ * @kargs: the arguments passed to create the child process
+ *
+ * This prepares a new css_set for the child process which the child will
+ * be attached to in cgroup_post_fork().
+ * This calls the subsystem can_fork() callbacks. If the cgroup_can_fork()
+ * callback returns an error, the fork aborts with that error code. This
+ * allows for a cgroup subsystem to conditionally allow or deny new forks.
+ */
+int cgroup_can_fork(struct task_struct *child, struct kernel_clone_args *kargs)
+{
+ struct cgroup_subsys *ss;
+ int i, j, ret;
+
+ ret = cgroup_css_set_fork(kargs);
+ if (ret)
+ return ret;
+
+ do_each_subsys_mask(ss, i, have_canfork_callback) {
+ ret = ss->can_fork(child, kargs->cset);
+ if (ret)
+ goto out_revert;
+ } while_each_subsys_mask();
+
+ return 0;
+
+out_revert:
+ for_each_subsys(ss, j) {
+ if (j >= i)
+ break;
+ if (ss->cancel_fork)
+ ss->cancel_fork(child, kargs->cset);
+ }
+
+ cgroup_css_set_put_fork(kargs);
+
+ return ret;
+}
+
+/**
+ * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
+ * @child: the child process
+ * @kargs: the arguments passed to create the child process
+ *
+ * This calls the cancel_fork() callbacks if a fork failed *after*
+ * cgroup_can_fork() succeeded and cleans up references we took to
+ * prepare a new css_set for the child process in cgroup_can_fork().
+ */
+void cgroup_cancel_fork(struct task_struct *child,
+ struct kernel_clone_args *kargs)
+{
+ struct cgroup_subsys *ss;
+ int i;
+
+ for_each_subsys(ss, i)
+ if (ss->cancel_fork)
+ ss->cancel_fork(child, kargs->cset);
+
+ cgroup_css_set_put_fork(kargs);
+}
+
+/**
+ * cgroup_post_fork - finalize cgroup setup for the child process
+ * @child: the child process
+ * @kargs: the arguments passed to create the child process
+ *
+ * Attach the child process to its css_set calling the subsystem fork()
+ * callbacks.
+ */
+void cgroup_post_fork(struct task_struct *child,
+ struct kernel_clone_args *kargs)
+ __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
+{
+ unsigned int cgrp_kill_seq = 0;
+ unsigned long cgrp_flags = 0;
+ bool kill = false;
+ struct cgroup_subsys *ss;
+ struct css_set *cset;
+ int i;
+
+ cset = kargs->cset;
+ kargs->cset = NULL;
+
+ spin_lock_irq(&css_set_lock);
+
+ /* init tasks are special, only link regular threads */
+ if (likely(child->pid)) {
+ if (kargs->cgrp) {
+ cgrp_flags = kargs->cgrp->flags;
+ cgrp_kill_seq = kargs->cgrp->kill_seq;
+ } else {
+ cgrp_flags = cset->dfl_cgrp->flags;
+ cgrp_kill_seq = cset->dfl_cgrp->kill_seq;
+ }
+
+ WARN_ON_ONCE(!list_empty(&child->cg_list));
+ cset->nr_tasks++;
+ css_set_move_task(child, NULL, cset, false);
+ } else {
+ put_css_set(cset);
+ cset = NULL;
+ }
+
+ if (!(child->flags & PF_KTHREAD)) {
+ if (unlikely(test_bit(CGRP_FREEZE, &cgrp_flags))) {
+ /*
+ * If the cgroup has to be frozen, the new task has
+ * too. Let's set the JOBCTL_TRAP_FREEZE jobctl bit to
+ * get the task into the frozen state.
+ */
+ spin_lock(&child->sighand->siglock);
+ WARN_ON_ONCE(child->frozen);
+ child->jobctl |= JOBCTL_TRAP_FREEZE;
+ spin_unlock(&child->sighand->siglock);
+
+ /*
+ * Calling cgroup_update_frozen() isn't required here,
+ * because it will be called anyway a bit later from
+ * do_freezer_trap(). So we avoid cgroup's transient
+ * switch from the frozen state and back.
+ */
+ }
+
+ /*
+ * If the cgroup is to be killed notice it now and take the
+ * child down right after we finished preparing it for
+ * userspace.
+ */
+ kill = kargs->kill_seq != cgrp_kill_seq;
+ }
+
+ spin_unlock_irq(&css_set_lock);
+
+ /*
+ * Call ss->fork(). This must happen after @child is linked on
+ * css_set; otherwise, @child might change state between ->fork()
+ * and addition to css_set.
+ */
+ do_each_subsys_mask(ss, i, have_fork_callback) {
+ ss->fork(child);
+ } while_each_subsys_mask();
+
+ /* Make the new cset the root_cset of the new cgroup namespace. */
+ if (kargs->flags & CLONE_NEWCGROUP) {
+ struct css_set *rcset = child->nsproxy->cgroup_ns->root_cset;
+
+ get_css_set(cset);
+ child->nsproxy->cgroup_ns->root_cset = cset;
+ put_css_set(rcset);
+ }
+
+ /* Cgroup has to be killed so take down child immediately. */
+ if (unlikely(kill))
+ do_send_sig_info(SIGKILL, SEND_SIG_NOINFO, child, PIDTYPE_TGID);
+
+ cgroup_css_set_put_fork(kargs);
+}
+
+/**
+ * cgroup_task_exit - detach cgroup from exiting task
+ * @tsk: pointer to task_struct of exiting process
+ *
+ * Description: Detach cgroup from @tsk.
+ *
+ */
+void cgroup_task_exit(struct task_struct *tsk)
+{
+ struct cgroup_subsys *ss;
+ int i;
+
+ /* see cgroup_post_fork() for details */
+ do_each_subsys_mask(ss, i, have_exit_callback) {
+ ss->exit(tsk);
+ } while_each_subsys_mask();
+}
+
+static void do_cgroup_task_dead(struct task_struct *tsk)
+{
+ struct css_set *cset;
+ unsigned long flags;
+
+ spin_lock_irqsave(&css_set_lock, flags);
+
+ WARN_ON_ONCE(list_empty(&tsk->cg_list));
+ cset = task_css_set(tsk);
+ css_set_move_task(tsk, cset, NULL, false);
+ cset->nr_tasks--;
+ /* matches the signal->live check in css_task_iter_advance() */
+ if (thread_group_leader(tsk) && atomic_read(&tsk->signal->live))
+ list_add_tail(&tsk->cg_list, &cset->dying_tasks);
+
+ if (dl_task(tsk))
+ dec_dl_tasks_cs(tsk);
+
+ WARN_ON_ONCE(cgroup_task_frozen(tsk));
+ if (unlikely(!(tsk->flags & PF_KTHREAD) &&
+ test_bit(CGRP_FREEZE, &task_dfl_cgroup(tsk)->flags)))
+ cgroup_update_frozen(task_dfl_cgroup(tsk));
+
+ spin_unlock_irqrestore(&css_set_lock, flags);
+}
+
+#ifdef CONFIG_PREEMPT_RT
+/*
+ * cgroup_task_dead() is called from finish_task_switch() which doesn't allow
+ * scheduling even in RT. As the task_dead path requires grabbing css_set_lock,
+ * this lead to sleeping in the invalid context warning bug. css_set_lock is too
+ * big to become a raw_spinlock. The task_dead path doesn't need to run
+ * synchronously but can't be delayed indefinitely either as the dead task pins
+ * the cgroup and task_struct can be pinned indefinitely. Bounce through lazy
+ * irq_work to allow batching while ensuring timely completion.
+ */
+static DEFINE_PER_CPU(struct llist_head, cgrp_dead_tasks);
+static DEFINE_PER_CPU(struct irq_work, cgrp_dead_tasks_iwork);
+
+static void cgrp_dead_tasks_iwork_fn(struct irq_work *iwork)
+{
+ struct llist_node *lnode;
+ struct task_struct *task, *next;
+
+ lnode = llist_del_all(this_cpu_ptr(&cgrp_dead_tasks));
+ llist_for_each_entry_safe(task, next, lnode, cg_dead_lnode) {
+ do_cgroup_task_dead(task);
+ put_task_struct(task);
+ }
+}
+
+static void __init cgroup_rt_init(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ init_llist_head(per_cpu_ptr(&cgrp_dead_tasks, cpu));
+ per_cpu(cgrp_dead_tasks_iwork, cpu) =
+ IRQ_WORK_INIT_LAZY(cgrp_dead_tasks_iwork_fn);
+ }
+}
+
+void cgroup_task_dead(struct task_struct *task)
+{
+ get_task_struct(task);
+ llist_add(&task->cg_dead_lnode, this_cpu_ptr(&cgrp_dead_tasks));
+ irq_work_queue(this_cpu_ptr(&cgrp_dead_tasks_iwork));
+}
+#else /* CONFIG_PREEMPT_RT */
+static void __init cgroup_rt_init(void) {}
+
+void cgroup_task_dead(struct task_struct *task)
+{
+ do_cgroup_task_dead(task);
+}
+#endif /* CONFIG_PREEMPT_RT */
+
+void cgroup_task_release(struct task_struct *task)
+{
+ struct cgroup_subsys *ss;
+ int ssid;
+
+ do_each_subsys_mask(ss, ssid, have_release_callback) {
+ ss->release(task);
+ } while_each_subsys_mask();
+}
+
+void cgroup_task_free(struct task_struct *task)
+{
+ struct css_set *cset = task_css_set(task);
+
+ if (!list_empty(&task->cg_list)) {
+ spin_lock_irq(&css_set_lock);
+ css_set_skip_task_iters(task_css_set(task), task);
+ list_del_init(&task->cg_list);
+ spin_unlock_irq(&css_set_lock);
+ }
+
+ put_css_set(cset);
+}
+
+static int __init cgroup_disable(char *str)
+{
+ struct cgroup_subsys *ss;
+ char *token;
+ int i;
+
+ while ((token = strsep(&str, ",")) != NULL) {
+ if (!*token)
+ continue;
+
+ for_each_subsys(ss, i) {
+ if (strcmp(token, ss->name) &&
+ strcmp(token, ss->legacy_name))
+ continue;
+
+ static_branch_disable(cgroup_subsys_enabled_key[i]);
+ pr_info("Disabling %s control group subsystem\n",
+ ss->name);
+ }
+
+ for (i = 0; i < OPT_FEATURE_COUNT; i++) {
+ if (strcmp(token, cgroup_opt_feature_names[i]))
+ continue;
+ cgroup_feature_disable_mask |= 1 << i;
+ pr_info("Disabling %s control group feature\n",
+ cgroup_opt_feature_names[i]);
+ break;
+ }
+ }
+ return 1;
+}
+__setup("cgroup_disable=", cgroup_disable);
+
+void __init __weak enable_debug_cgroup(void) { }
+
+static int __init enable_cgroup_debug(char *str)
+{
+ cgroup_debug = true;
+ enable_debug_cgroup();
+ return 1;
+}
+__setup("cgroup_debug", enable_cgroup_debug);
+
+static int __init cgroup_favordynmods_setup(char *str)
+{
+ return (kstrtobool(str, &have_favordynmods) == 0);
+}
+__setup("cgroup_favordynmods=", cgroup_favordynmods_setup);
+
+/**
+ * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
+ * @dentry: directory dentry of interest
+ * @ss: subsystem of interest
+ *
+ * If @dentry is a directory for a cgroup which has @ss enabled on it, try
+ * to get the corresponding css and return it. If such css doesn't exist
+ * or can't be pinned, an ERR_PTR value is returned.
+ */
+struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
+ struct cgroup_subsys *ss)
+{
+ struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
+ struct file_system_type *s_type = dentry->d_sb->s_type;
+ struct cgroup_subsys_state *css = NULL;
+ struct cgroup *cgrp;
+
+ /* is @dentry a cgroup dir? */
+ if ((s_type != &cgroup_fs_type && s_type != &cgroup2_fs_type) ||
+ !kn || kernfs_type(kn) != KERNFS_DIR)
+ return ERR_PTR(-EBADF);
+
+ rcu_read_lock();
+
+ /*
+ * This path doesn't originate from kernfs and @kn could already
+ * have been or be removed at any point. @kn->priv is RCU
+ * protected for this access. See css_release_work_fn() for details.
+ */
+ cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
+ if (cgrp)
+ css = cgroup_css(cgrp, ss);
+
+ if (!css || !css_tryget_online(css))
+ css = ERR_PTR(-ENOENT);
+
+ rcu_read_unlock();
+ return css;
+}
+
+/**
+ * css_from_id - lookup css by id
+ * @id: the cgroup id
+ * @ss: cgroup subsys to be looked into
+ *
+ * Returns the css if there's valid one with @id, otherwise returns NULL.
+ * Should be called under rcu_read_lock().
+ */
+struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
+{
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ return idr_find(&ss->css_idr, id);
+}
+
+/**
+ * cgroup_get_from_path - lookup and get a cgroup from its default hierarchy path
+ * @path: path on the default hierarchy
+ *
+ * Find the cgroup at @path on the default hierarchy, increment its
+ * reference count and return it. Returns pointer to the found cgroup on
+ * success, ERR_PTR(-ENOENT) if @path doesn't exist or if the cgroup has already
+ * been released and ERR_PTR(-ENOTDIR) if @path points to a non-directory.
+ */
+struct cgroup *cgroup_get_from_path(const char *path)
+{
+ struct kernfs_node *kn;
+ struct cgroup *cgrp = ERR_PTR(-ENOENT);
+ struct cgroup *root_cgrp;
+
+ root_cgrp = current_cgns_cgroup_dfl();
+ kn = kernfs_walk_and_get(root_cgrp->kn, path);
+ if (!kn)
+ goto out;
+
+ if (kernfs_type(kn) != KERNFS_DIR) {
+ cgrp = ERR_PTR(-ENOTDIR);
+ goto out_kernfs;
+ }
+
+ rcu_read_lock();
+
+ cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
+ if (!cgrp || !cgroup_tryget(cgrp))
+ cgrp = ERR_PTR(-ENOENT);
+
+ rcu_read_unlock();
+
+out_kernfs:
+ kernfs_put(kn);
+out:
+ return cgrp;
+}
+EXPORT_SYMBOL_GPL(cgroup_get_from_path);
+
+/**
+ * cgroup_v1v2_get_from_fd - get a cgroup pointer from a fd
+ * @fd: fd obtained by open(cgroup_dir)
+ *
+ * Find the cgroup from a fd which should be obtained
+ * by opening a cgroup directory. Returns a pointer to the
+ * cgroup on success. ERR_PTR is returned if the cgroup
+ * cannot be found.
+ */
+struct cgroup *cgroup_v1v2_get_from_fd(int fd)
+{
+ CLASS(fd_raw, f)(fd);
+ if (fd_empty(f))
+ return ERR_PTR(-EBADF);
+
+ return cgroup_v1v2_get_from_file(fd_file(f));
+}
+
+/**
+ * cgroup_get_from_fd - same as cgroup_v1v2_get_from_fd, but only supports
+ * cgroup2.
+ * @fd: fd obtained by open(cgroup2_dir)
+ */
+struct cgroup *cgroup_get_from_fd(int fd)
+{
+ struct cgroup *cgrp = cgroup_v1v2_get_from_fd(fd);
+
+ if (IS_ERR(cgrp))
+ return ERR_CAST(cgrp);
+
+ if (!cgroup_on_dfl(cgrp)) {
+ cgroup_put(cgrp);
+ return ERR_PTR(-EBADF);
+ }
+ return cgrp;
+}
+EXPORT_SYMBOL_GPL(cgroup_get_from_fd);
+
+static u64 power_of_ten(int power)
+{
+ u64 v = 1;
+ while (power--)
+ v *= 10;
+ return v;
+}
+
+/**
+ * cgroup_parse_float - parse a floating number
+ * @input: input string
+ * @dec_shift: number of decimal digits to shift
+ * @v: output
+ *
+ * Parse a decimal floating point number in @input and store the result in
+ * @v with decimal point right shifted @dec_shift times. For example, if
+ * @input is "12.3456" and @dec_shift is 3, *@v will be set to 12345.
+ * Returns 0 on success, -errno otherwise.
+ *
+ * There's nothing cgroup specific about this function except that it's
+ * currently the only user.
+ */
+int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v)
+{
+ s64 whole, frac = 0;
+ int fstart = 0, fend = 0, flen;
+
+ if (!sscanf(input, "%lld.%n%lld%n", &whole, &fstart, &frac, &fend))
+ return -EINVAL;
+ if (frac < 0)
+ return -EINVAL;
+
+ flen = fend > fstart ? fend - fstart : 0;
+ if (flen < dec_shift)
+ frac *= power_of_ten(dec_shift - flen);
+ else
+ frac = DIV_ROUND_CLOSEST_ULL(frac, power_of_ten(flen - dec_shift));
+
+ *v = whole * power_of_ten(dec_shift) + frac;
+ return 0;
+}
+
+/*
+ * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data
+ * definition in cgroup-defs.h.
+ */
+#ifdef CONFIG_SOCK_CGROUP_DATA
+
+void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
+{
+ struct cgroup *cgroup;
+
+ rcu_read_lock();
+ /* Don't associate the sock with unrelated interrupted task's cgroup. */
+ if (in_interrupt()) {
+ cgroup = &cgrp_dfl_root.cgrp;
+ cgroup_get(cgroup);
+ goto out;
+ }
+
+ while (true) {
+ struct css_set *cset;
+
+ cset = task_css_set(current);
+ if (likely(cgroup_tryget(cset->dfl_cgrp))) {
+ cgroup = cset->dfl_cgrp;
+ break;
+ }
+ cpu_relax();
+ }
+out:
+ skcd->cgroup = cgroup;
+ cgroup_bpf_get(cgroup);
+ rcu_read_unlock();
+}
+
+void cgroup_sk_clone(struct sock_cgroup_data *skcd)
+{
+ struct cgroup *cgrp = sock_cgroup_ptr(skcd);
+
+ /*
+ * We might be cloning a socket which is left in an empty
+ * cgroup and the cgroup might have already been rmdir'd.
+ * Don't use cgroup_get_live().
+ */
+ cgroup_get(cgrp);
+ cgroup_bpf_get(cgrp);
+}
+
+void cgroup_sk_free(struct sock_cgroup_data *skcd)
+{
+ struct cgroup *cgrp = sock_cgroup_ptr(skcd);
+
+ cgroup_bpf_put(cgrp);
+ cgroup_put(cgrp);
+}
+
+#endif /* CONFIG_SOCK_CGROUP_DATA */
+
+#ifdef CONFIG_SYSFS
+static ssize_t show_delegatable_files(struct cftype *files, char *buf,
+ ssize_t size, const char *prefix)
+{
+ struct cftype *cft;
+ ssize_t ret = 0;
+
+ for (cft = files; cft && cft->name[0] != '\0'; cft++) {
+ if (!(cft->flags & CFTYPE_NS_DELEGATABLE))
+ continue;
+
+ if (prefix)
+ ret += snprintf(buf + ret, size - ret, "%s.", prefix);
+
+ ret += snprintf(buf + ret, size - ret, "%s\n", cft->name);
+
+ if (WARN_ON(ret >= size))
+ break;
+ }
+
+ return ret;
+}
+
+static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct cgroup_subsys *ss;
+ int ssid;
+ ssize_t ret = 0;
+
+ ret = show_delegatable_files(cgroup_base_files, buf + ret,
+ PAGE_SIZE - ret, NULL);
+ if (cgroup_psi_enabled())
+ ret += show_delegatable_files(cgroup_psi_files, buf + ret,
+ PAGE_SIZE - ret, NULL);
+
+ for_each_subsys(ss, ssid)
+ ret += show_delegatable_files(ss->dfl_cftypes, buf + ret,
+ PAGE_SIZE - ret,
+ cgroup_subsys_name[ssid]);
+
+ return ret;
+}
+static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate);
+
+static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE,
+ "nsdelegate\n"
+ "favordynmods\n"
+ "memory_localevents\n"
+ "memory_recursiveprot\n"
+ "memory_hugetlb_accounting\n"
+ "pids_localevents\n");
+}
+static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);
+
+static struct attribute *cgroup_sysfs_attrs[] = {
+ &cgroup_delegate_attr.attr,
+ &cgroup_features_attr.attr,
+ NULL,
+};
+
+static const struct attribute_group cgroup_sysfs_attr_group = {
+ .attrs = cgroup_sysfs_attrs,
+ .name = "cgroup",
+};
+
+static int __init cgroup_sysfs_init(void)
+{
+ return sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group);
+}
+subsys_initcall(cgroup_sysfs_init);
+
+#endif /* CONFIG_SYSFS */
diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h
new file mode 100644
index 000000000000..01976c8e7d49
--- /dev/null
+++ b/kernel/cgroup/cpuset-internal.h
@@ -0,0 +1,308 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef __CPUSET_INTERNAL_H
+#define __CPUSET_INTERNAL_H
+
+#include <linux/cgroup.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/cpuset.h>
+#include <linux/spinlock.h>
+#include <linux/union_find.h>
+
+/* See "Frequency meter" comments, below. */
+
+struct fmeter {
+ int cnt; /* unprocessed events count */
+ int val; /* most recent output value */
+ time64_t time; /* clock (secs) when val computed */
+ spinlock_t lock; /* guards read or write of above */
+};
+
+/*
+ * Invalid partition error code
+ */
+enum prs_errcode {
+ PERR_NONE = 0,
+ PERR_INVCPUS,
+ PERR_INVPARENT,
+ PERR_NOTPART,
+ PERR_NOTEXCL,
+ PERR_NOCPUS,
+ PERR_HOTPLUG,
+ PERR_CPUSEMPTY,
+ PERR_HKEEPING,
+ PERR_ACCESS,
+ PERR_REMOTE,
+};
+
+/* bits in struct cpuset flags field */
+typedef enum {
+ CS_CPU_EXCLUSIVE,
+ CS_MEM_EXCLUSIVE,
+ CS_MEM_HARDWALL,
+ CS_MEMORY_MIGRATE,
+ CS_SCHED_LOAD_BALANCE,
+ CS_SPREAD_PAGE,
+ CS_SPREAD_SLAB,
+} cpuset_flagbits_t;
+
+/* The various types of files and directories in a cpuset file system */
+
+typedef enum {
+ FILE_MEMORY_MIGRATE,
+ FILE_CPULIST,
+ FILE_MEMLIST,
+ FILE_EFFECTIVE_CPULIST,
+ FILE_EFFECTIVE_MEMLIST,
+ FILE_SUBPARTS_CPULIST,
+ FILE_EXCLUSIVE_CPULIST,
+ FILE_EFFECTIVE_XCPULIST,
+ FILE_ISOLATED_CPULIST,
+ FILE_CPU_EXCLUSIVE,
+ FILE_MEM_EXCLUSIVE,
+ FILE_MEM_HARDWALL,
+ FILE_SCHED_LOAD_BALANCE,
+ FILE_PARTITION_ROOT,
+ FILE_SCHED_RELAX_DOMAIN_LEVEL,
+ FILE_MEMORY_PRESSURE_ENABLED,
+ FILE_MEMORY_PRESSURE,
+ FILE_SPREAD_PAGE,
+ FILE_SPREAD_SLAB,
+} cpuset_filetype_t;
+
+struct cpuset {
+ struct cgroup_subsys_state css;
+
+ unsigned long flags; /* "unsigned long" so bitops work */
+
+ /*
+ * On default hierarchy:
+ *
+ * The user-configured masks can only be changed by writing to
+ * cpuset.cpus and cpuset.mems, and won't be limited by the
+ * parent masks.
+ *
+ * The effective masks is the real masks that apply to the tasks
+ * in the cpuset. They may be changed if the configured masks are
+ * changed or hotplug happens.
+ *
+ * effective_mask == configured_mask & parent's effective_mask,
+ * and if it ends up empty, it will inherit the parent's mask.
+ *
+ *
+ * On legacy hierarchy:
+ *
+ * The user-configured masks are always the same with effective masks.
+ */
+
+ /* user-configured CPUs and Memory Nodes allow to tasks */
+ cpumask_var_t cpus_allowed;
+ nodemask_t mems_allowed;
+
+ /* effective CPUs and Memory Nodes allow to tasks */
+ cpumask_var_t effective_cpus;
+ nodemask_t effective_mems;
+
+ /*
+ * Exclusive CPUs dedicated to current cgroup (default hierarchy only)
+ *
+ * The effective_cpus of a valid partition root comes solely from its
+ * effective_xcpus and some of the effective_xcpus may be distributed
+ * to sub-partitions below & hence excluded from its effective_cpus.
+ * For a valid partition root, its effective_cpus have no relationship
+ * with cpus_allowed unless its exclusive_cpus isn't set.
+ *
+ * This value will only be set if either exclusive_cpus is set or
+ * when this cpuset becomes a local partition root.
+ */
+ cpumask_var_t effective_xcpus;
+
+ /*
+ * Exclusive CPUs as requested by the user (default hierarchy only)
+ *
+ * Its value is independent of cpus_allowed and designates the set of
+ * CPUs that can be granted to the current cpuset or its children when
+ * it becomes a valid partition root. The effective set of exclusive
+ * CPUs granted (effective_xcpus) depends on whether those exclusive
+ * CPUs are passed down by its ancestors and not yet taken up by
+ * another sibling partition root along the way.
+ *
+ * If its value isn't set, it defaults to cpus_allowed.
+ */
+ cpumask_var_t exclusive_cpus;
+
+ /*
+ * This is old Memory Nodes tasks took on.
+ *
+ * - top_cpuset.old_mems_allowed is initialized to mems_allowed.
+ * - A new cpuset's old_mems_allowed is initialized when some
+ * task is moved into it.
+ * - old_mems_allowed is used in cpuset_migrate_mm() when we change
+ * cpuset.mems_allowed and have tasks' nodemask updated, and
+ * then old_mems_allowed is updated to mems_allowed.
+ */
+ nodemask_t old_mems_allowed;
+
+ struct fmeter fmeter; /* memory_pressure filter */
+
+ /*
+ * Tasks are being attached to this cpuset. Used to prevent
+ * zeroing cpus/mems_allowed between ->can_attach() and ->attach().
+ */
+ int attach_in_progress;
+
+ /* for custom sched domain */
+ int relax_domain_level;
+
+ /* partition root state */
+ int partition_root_state;
+
+ /*
+ * Whether cpuset is a remote partition.
+ * It used to be a list anchoring all remote partitions — we can switch back
+ * to a list if we need to iterate over the remote partitions.
+ */
+ bool remote_partition;
+
+ /*
+ * number of SCHED_DEADLINE tasks attached to this cpuset, so that we
+ * know when to rebuild associated root domain bandwidth information.
+ */
+ int nr_deadline_tasks;
+ int nr_migrate_dl_tasks;
+ u64 sum_migrate_dl_bw;
+
+ /* Invalid partition error code, not lock protected */
+ enum prs_errcode prs_err;
+
+ /* Handle for cpuset.cpus.partition */
+ struct cgroup_file partition_file;
+
+ /* Used to merge intersecting subsets for generate_sched_domains */
+ struct uf_node node;
+};
+
+static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
+{
+ return css ? container_of(css, struct cpuset, css) : NULL;
+}
+
+/* Retrieve the cpuset for a task */
+static inline struct cpuset *task_cs(struct task_struct *task)
+{
+ return css_cs(task_css(task, cpuset_cgrp_id));
+}
+
+static inline struct cpuset *parent_cs(struct cpuset *cs)
+{
+ return css_cs(cs->css.parent);
+}
+
+/* convenient tests for these bits */
+static inline bool is_cpuset_online(struct cpuset *cs)
+{
+ return css_is_online(&cs->css) && !css_is_dying(&cs->css);
+}
+
+static inline int is_cpu_exclusive(const struct cpuset *cs)
+{
+ return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
+}
+
+static inline int is_mem_exclusive(const struct cpuset *cs)
+{
+ return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
+}
+
+static inline int is_mem_hardwall(const struct cpuset *cs)
+{
+ return test_bit(CS_MEM_HARDWALL, &cs->flags);
+}
+
+static inline int is_sched_load_balance(const struct cpuset *cs)
+{
+ return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
+}
+
+static inline int is_memory_migrate(const struct cpuset *cs)
+{
+ return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
+}
+
+static inline int is_spread_page(const struct cpuset *cs)
+{
+ return test_bit(CS_SPREAD_PAGE, &cs->flags);
+}
+
+static inline int is_spread_slab(const struct cpuset *cs)
+{
+ return test_bit(CS_SPREAD_SLAB, &cs->flags);
+}
+
+/**
+ * cpuset_for_each_child - traverse online children of a cpuset
+ * @child_cs: loop cursor pointing to the current child
+ * @pos_css: used for iteration
+ * @parent_cs: target cpuset to walk children of
+ *
+ * Walk @child_cs through the online children of @parent_cs. Must be used
+ * with RCU read locked.
+ */
+#define cpuset_for_each_child(child_cs, pos_css, parent_cs) \
+ css_for_each_child((pos_css), &(parent_cs)->css) \
+ if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))
+
+/**
+ * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
+ * @des_cs: loop cursor pointing to the current descendant
+ * @pos_css: used for iteration
+ * @root_cs: target cpuset to walk ancestor of
+ *
+ * Walk @des_cs through the online descendants of @root_cs. Must be used
+ * with RCU read locked. The caller may modify @pos_css by calling
+ * css_rightmost_descendant() to skip subtree. @root_cs is included in the
+ * iteration and the first node to be visited.
+ */
+#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs) \
+ css_for_each_descendant_pre((pos_css), &(root_cs)->css) \
+ if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
+
+void rebuild_sched_domains_locked(void);
+void cpuset_callback_lock_irq(void);
+void cpuset_callback_unlock_irq(void);
+void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus);
+void cpuset_update_tasks_nodemask(struct cpuset *cs);
+int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on);
+ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off);
+int cpuset_common_seq_show(struct seq_file *sf, void *v);
+void cpuset_full_lock(void);
+void cpuset_full_unlock(void);
+
+/*
+ * cpuset-v1.c
+ */
+#ifdef CONFIG_CPUSETS_V1
+extern struct cftype cpuset1_files[];
+void fmeter_init(struct fmeter *fmp);
+void cpuset1_update_task_spread_flags(struct cpuset *cs,
+ struct task_struct *tsk);
+void cpuset1_update_tasks_flags(struct cpuset *cs);
+void cpuset1_hotplug_update_tasks(struct cpuset *cs,
+ struct cpumask *new_cpus, nodemask_t *new_mems,
+ bool cpus_updated, bool mems_updated);
+int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial);
+#else
+static inline void fmeter_init(struct fmeter *fmp) {}
+static inline void cpuset1_update_task_spread_flags(struct cpuset *cs,
+ struct task_struct *tsk) {}
+static inline void cpuset1_update_tasks_flags(struct cpuset *cs) {}
+static inline void cpuset1_hotplug_update_tasks(struct cpuset *cs,
+ struct cpumask *new_cpus, nodemask_t *new_mems,
+ bool cpus_updated, bool mems_updated) {}
+static inline int cpuset1_validate_change(struct cpuset *cur,
+ struct cpuset *trial) { return 0; }
+#endif /* CONFIG_CPUSETS_V1 */
+
+#endif /* __CPUSET_INTERNAL_H */
diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c
new file mode 100644
index 000000000000..12e76774c75b
--- /dev/null
+++ b/kernel/cgroup/cpuset-v1.c
@@ -0,0 +1,607 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include "cgroup-internal.h"
+#include "cpuset-internal.h"
+
+/*
+ * Legacy hierarchy call to cgroup_transfer_tasks() is handled asynchrously
+ */
+struct cpuset_remove_tasks_struct {
+ struct work_struct work;
+ struct cpuset *cs;
+};
+
+/*
+ * Frequency meter - How fast is some event occurring?
+ *
+ * These routines manage a digitally filtered, constant time based,
+ * event frequency meter. There are four routines:
+ * fmeter_init() - initialize a frequency meter.
+ * fmeter_markevent() - called each time the event happens.
+ * fmeter_getrate() - returns the recent rate of such events.
+ * fmeter_update() - internal routine used to update fmeter.
+ *
+ * A common data structure is passed to each of these routines,
+ * which is used to keep track of the state required to manage the
+ * frequency meter and its digital filter.
+ *
+ * The filter works on the number of events marked per unit time.
+ * The filter is single-pole low-pass recursive (IIR). The time unit
+ * is 1 second. Arithmetic is done using 32-bit integers scaled to
+ * simulate 3 decimal digits of precision (multiplied by 1000).
+ *
+ * With an FM_COEF of 933, and a time base of 1 second, the filter
+ * has a half-life of 10 seconds, meaning that if the events quit
+ * happening, then the rate returned from the fmeter_getrate()
+ * will be cut in half each 10 seconds, until it converges to zero.
+ *
+ * It is not worth doing a real infinitely recursive filter. If more
+ * than FM_MAXTICKS ticks have elapsed since the last filter event,
+ * just compute FM_MAXTICKS ticks worth, by which point the level
+ * will be stable.
+ *
+ * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
+ * arithmetic overflow in the fmeter_update() routine.
+ *
+ * Given the simple 32 bit integer arithmetic used, this meter works
+ * best for reporting rates between one per millisecond (msec) and
+ * one per 32 (approx) seconds. At constant rates faster than one
+ * per msec it maxes out at values just under 1,000,000. At constant
+ * rates between one per msec, and one per second it will stabilize
+ * to a value N*1000, where N is the rate of events per second.
+ * At constant rates between one per second and one per 32 seconds,
+ * it will be choppy, moving up on the seconds that have an event,
+ * and then decaying until the next event. At rates slower than
+ * about one in 32 seconds, it decays all the way back to zero between
+ * each event.
+ */
+
+#define FM_COEF 933 /* coefficient for half-life of 10 secs */
+#define FM_MAXTICKS ((u32)99) /* useless computing more ticks than this */
+#define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */
+#define FM_SCALE 1000 /* faux fixed point scale */
+
+/* Initialize a frequency meter */
+void fmeter_init(struct fmeter *fmp)
+{
+ fmp->cnt = 0;
+ fmp->val = 0;
+ fmp->time = 0;
+ spin_lock_init(&fmp->lock);
+}
+
+/* Internal meter update - process cnt events and update value */
+static void fmeter_update(struct fmeter *fmp)
+{
+ time64_t now;
+ u32 ticks;
+
+ now = ktime_get_seconds();
+ ticks = now - fmp->time;
+
+ if (ticks == 0)
+ return;
+
+ ticks = min(FM_MAXTICKS, ticks);
+ while (ticks-- > 0)
+ fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
+ fmp->time = now;
+
+ fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
+ fmp->cnt = 0;
+}
+
+/* Process any previous ticks, then bump cnt by one (times scale). */
+static void fmeter_markevent(struct fmeter *fmp)
+{
+ spin_lock(&fmp->lock);
+ fmeter_update(fmp);
+ fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
+ spin_unlock(&fmp->lock);
+}
+
+/* Process any previous ticks, then return current value. */
+static int fmeter_getrate(struct fmeter *fmp)
+{
+ int val;
+
+ spin_lock(&fmp->lock);
+ fmeter_update(fmp);
+ val = fmp->val;
+ spin_unlock(&fmp->lock);
+ return val;
+}
+
+/*
+ * Collection of memory_pressure is suppressed unless
+ * this flag is enabled by writing "1" to the special
+ * cpuset file 'memory_pressure_enabled' in the root cpuset.
+ */
+
+int cpuset_memory_pressure_enabled __read_mostly;
+
+/*
+ * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
+ *
+ * Keep a running average of the rate of synchronous (direct)
+ * page reclaim efforts initiated by tasks in each cpuset.
+ *
+ * This represents the rate at which some task in the cpuset
+ * ran low on memory on all nodes it was allowed to use, and
+ * had to enter the kernels page reclaim code in an effort to
+ * create more free memory by tossing clean pages or swapping
+ * or writing dirty pages.
+ *
+ * Display to user space in the per-cpuset read-only file
+ * "memory_pressure". Value displayed is an integer
+ * representing the recent rate of entry into the synchronous
+ * (direct) page reclaim by any task attached to the cpuset.
+ */
+
+void __cpuset_memory_pressure_bump(void)
+{
+ rcu_read_lock();
+ fmeter_markevent(&task_cs(current)->fmeter);
+ rcu_read_unlock();
+}
+
+static int update_relax_domain_level(struct cpuset *cs, s64 val)
+{
+#ifdef CONFIG_SMP
+ if (val < -1 || val > sched_domain_level_max + 1)
+ return -EINVAL;
+#endif
+
+ if (val != cs->relax_domain_level) {
+ cs->relax_domain_level = val;
+ if (!cpumask_empty(cs->cpus_allowed) &&
+ is_sched_load_balance(cs))
+ rebuild_sched_domains_locked();
+ }
+
+ return 0;
+}
+
+static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
+ s64 val)
+{
+ struct cpuset *cs = css_cs(css);
+ cpuset_filetype_t type = cft->private;
+ int retval = -ENODEV;
+
+ cpuset_full_lock();
+ if (!is_cpuset_online(cs))
+ goto out_unlock;
+
+ switch (type) {
+ case FILE_SCHED_RELAX_DOMAIN_LEVEL:
+ pr_info_once("cpuset.%s is deprecated\n", cft->name);
+ retval = update_relax_domain_level(cs, val);
+ break;
+ default:
+ retval = -EINVAL;
+ break;
+ }
+out_unlock:
+ cpuset_full_unlock();
+ return retval;
+}
+
+static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+ struct cpuset *cs = css_cs(css);
+ cpuset_filetype_t type = cft->private;
+
+ switch (type) {
+ case FILE_SCHED_RELAX_DOMAIN_LEVEL:
+ return cs->relax_domain_level;
+ default:
+ BUG();
+ }
+
+ /* Unreachable but makes gcc happy */
+ return 0;
+}
+
+/*
+ * update task's spread flag if cpuset's page/slab spread flag is set
+ *
+ * Call with callback_lock or cpuset_mutex held. The check can be skipped
+ * if on default hierarchy.
+ */
+void cpuset1_update_task_spread_flags(struct cpuset *cs,
+ struct task_struct *tsk)
+{
+ if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
+ return;
+
+ if (is_spread_page(cs))
+ task_set_spread_page(tsk);
+ else
+ task_clear_spread_page(tsk);
+
+ if (is_spread_slab(cs))
+ task_set_spread_slab(tsk);
+ else
+ task_clear_spread_slab(tsk);
+}
+
+/**
+ * cpuset1_update_tasks_flags - update the spread flags of tasks in the cpuset.
+ * @cs: the cpuset in which each task's spread flags needs to be changed
+ *
+ * Iterate through each task of @cs updating its spread flags. As this
+ * function is called with cpuset_mutex held, cpuset membership stays
+ * stable.
+ */
+void cpuset1_update_tasks_flags(struct cpuset *cs)
+{
+ struct css_task_iter it;
+ struct task_struct *task;
+
+ css_task_iter_start(&cs->css, 0, &it);
+ while ((task = css_task_iter_next(&it)))
+ cpuset1_update_task_spread_flags(cs, task);
+ css_task_iter_end(&it);
+}
+
+/*
+ * If CPU and/or memory hotplug handlers, below, unplug any CPUs
+ * or memory nodes, we need to walk over the cpuset hierarchy,
+ * removing that CPU or node from all cpusets. If this removes the
+ * last CPU or node from a cpuset, then move the tasks in the empty
+ * cpuset to its next-highest non-empty parent.
+ */
+static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
+{
+ struct cpuset *parent;
+
+ /*
+ * Find its next-highest non-empty parent, (top cpuset
+ * has online cpus, so can't be empty).
+ */
+ parent = parent_cs(cs);
+ while (cpumask_empty(parent->cpus_allowed) ||
+ nodes_empty(parent->mems_allowed))
+ parent = parent_cs(parent);
+
+ if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
+ pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
+ pr_cont_cgroup_name(cs->css.cgroup);
+ pr_cont("\n");
+ }
+}
+
+static void cpuset_migrate_tasks_workfn(struct work_struct *work)
+{
+ struct cpuset_remove_tasks_struct *s;
+
+ s = container_of(work, struct cpuset_remove_tasks_struct, work);
+ remove_tasks_in_empty_cpuset(s->cs);
+ css_put(&s->cs->css);
+ kfree(s);
+}
+
+void cpuset1_hotplug_update_tasks(struct cpuset *cs,
+ struct cpumask *new_cpus, nodemask_t *new_mems,
+ bool cpus_updated, bool mems_updated)
+{
+ bool is_empty;
+
+ cpuset_callback_lock_irq();
+ cpumask_copy(cs->cpus_allowed, new_cpus);
+ cpumask_copy(cs->effective_cpus, new_cpus);
+ cs->mems_allowed = *new_mems;
+ cs->effective_mems = *new_mems;
+ cpuset_callback_unlock_irq();
+
+ /*
+ * Don't call cpuset_update_tasks_cpumask() if the cpuset becomes empty,
+ * as the tasks will be migrated to an ancestor.
+ */
+ if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
+ cpuset_update_tasks_cpumask(cs, new_cpus);
+ if (mems_updated && !nodes_empty(cs->mems_allowed))
+ cpuset_update_tasks_nodemask(cs);
+
+ is_empty = cpumask_empty(cs->cpus_allowed) ||
+ nodes_empty(cs->mems_allowed);
+
+ /*
+ * Move tasks to the nearest ancestor with execution resources,
+ * This is full cgroup operation which will also call back into
+ * cpuset. Execute it asynchronously using workqueue.
+ */
+ if (is_empty && cs->css.cgroup->nr_populated_csets &&
+ css_tryget_online(&cs->css)) {
+ struct cpuset_remove_tasks_struct *s;
+
+ s = kzalloc(sizeof(*s), GFP_KERNEL);
+ if (WARN_ON_ONCE(!s)) {
+ css_put(&cs->css);
+ return;
+ }
+
+ s->cs = cs;
+ INIT_WORK(&s->work, cpuset_migrate_tasks_workfn);
+ schedule_work(&s->work);
+ }
+}
+
+/*
+ * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
+ *
+ * One cpuset is a subset of another if all its allowed CPUs and
+ * Memory Nodes are a subset of the other, and its exclusive flags
+ * are only set if the other's are set. Call holding cpuset_mutex.
+ */
+
+static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
+{
+ return cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
+ nodes_subset(p->mems_allowed, q->mems_allowed) &&
+ is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
+ is_mem_exclusive(p) <= is_mem_exclusive(q);
+}
+
+/*
+ * cpuset1_validate_change() - Validate conditions specific to legacy (v1)
+ * behavior.
+ */
+int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial)
+{
+ struct cgroup_subsys_state *css;
+ struct cpuset *c, *par;
+ int ret;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ /* Each of our child cpusets must be a subset of us */
+ ret = -EBUSY;
+ cpuset_for_each_child(c, css, cur)
+ if (!is_cpuset_subset(c, trial))
+ goto out;
+
+ /* On legacy hierarchy, we must be a subset of our parent cpuset. */
+ ret = -EACCES;
+ par = parent_cs(cur);
+ if (par && !is_cpuset_subset(trial, par))
+ goto out;
+
+ ret = 0;
+out:
+ return ret;
+}
+
+#ifdef CONFIG_PROC_PID_CPUSET
+/*
+ * proc_cpuset_show()
+ * - Print tasks cpuset path into seq_file.
+ * - Used for /proc/<pid>/cpuset.
+ */
+int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *tsk)
+{
+ char *buf;
+ struct cgroup_subsys_state *css;
+ int retval;
+
+ retval = -ENOMEM;
+ buf = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (!buf)
+ goto out;
+
+ rcu_read_lock();
+ spin_lock_irq(&css_set_lock);
+ css = task_css(tsk, cpuset_cgrp_id);
+ retval = cgroup_path_ns_locked(css->cgroup, buf, PATH_MAX,
+ current->nsproxy->cgroup_ns);
+ spin_unlock_irq(&css_set_lock);
+ rcu_read_unlock();
+
+ if (retval == -E2BIG)
+ retval = -ENAMETOOLONG;
+ if (retval < 0)
+ goto out_free;
+ seq_puts(m, buf);
+ seq_putc(m, '\n');
+ retval = 0;
+out_free:
+ kfree(buf);
+out:
+ return retval;
+}
+#endif /* CONFIG_PROC_PID_CPUSET */
+
+static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+ struct cpuset *cs = css_cs(css);
+ cpuset_filetype_t type = cft->private;
+
+ switch (type) {
+ case FILE_CPU_EXCLUSIVE:
+ return is_cpu_exclusive(cs);
+ case FILE_MEM_EXCLUSIVE:
+ return is_mem_exclusive(cs);
+ case FILE_MEM_HARDWALL:
+ return is_mem_hardwall(cs);
+ case FILE_SCHED_LOAD_BALANCE:
+ return is_sched_load_balance(cs);
+ case FILE_MEMORY_MIGRATE:
+ return is_memory_migrate(cs);
+ case FILE_MEMORY_PRESSURE_ENABLED:
+ return cpuset_memory_pressure_enabled;
+ case FILE_MEMORY_PRESSURE:
+ return fmeter_getrate(&cs->fmeter);
+ case FILE_SPREAD_PAGE:
+ return is_spread_page(cs);
+ case FILE_SPREAD_SLAB:
+ return is_spread_slab(cs);
+ default:
+ BUG();
+ }
+
+ /* Unreachable but makes gcc happy */
+ return 0;
+}
+
+static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
+ u64 val)
+{
+ struct cpuset *cs = css_cs(css);
+ cpuset_filetype_t type = cft->private;
+ int retval = 0;
+
+ cpuset_full_lock();
+ if (!is_cpuset_online(cs)) {
+ retval = -ENODEV;
+ goto out_unlock;
+ }
+
+ switch (type) {
+ case FILE_CPU_EXCLUSIVE:
+ retval = cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, val);
+ break;
+ case FILE_MEM_EXCLUSIVE:
+ pr_info_once("cpuset.%s is deprecated\n", cft->name);
+ retval = cpuset_update_flag(CS_MEM_EXCLUSIVE, cs, val);
+ break;
+ case FILE_MEM_HARDWALL:
+ pr_info_once("cpuset.%s is deprecated\n", cft->name);
+ retval = cpuset_update_flag(CS_MEM_HARDWALL, cs, val);
+ break;
+ case FILE_SCHED_LOAD_BALANCE:
+ pr_info_once("cpuset.%s is deprecated, use cpuset.cpus.partition instead\n", cft->name);
+ retval = cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
+ break;
+ case FILE_MEMORY_MIGRATE:
+ pr_info_once("cpuset.%s is deprecated\n", cft->name);
+ retval = cpuset_update_flag(CS_MEMORY_MIGRATE, cs, val);
+ break;
+ case FILE_MEMORY_PRESSURE_ENABLED:
+ pr_info_once("cpuset.%s is deprecated, use memory.pressure with CONFIG_PSI instead\n", cft->name);
+ cpuset_memory_pressure_enabled = !!val;
+ break;
+ case FILE_SPREAD_PAGE:
+ pr_info_once("cpuset.%s is deprecated\n", cft->name);
+ retval = cpuset_update_flag(CS_SPREAD_PAGE, cs, val);
+ break;
+ case FILE_SPREAD_SLAB:
+ pr_warn_once("cpuset.%s is deprecated\n", cft->name);
+ retval = cpuset_update_flag(CS_SPREAD_SLAB, cs, val);
+ break;
+ default:
+ retval = -EINVAL;
+ break;
+ }
+out_unlock:
+ cpuset_full_unlock();
+ return retval;
+}
+
+/*
+ * for the common functions, 'private' gives the type of file
+ */
+
+struct cftype cpuset1_files[] = {
+ {
+ .name = "cpus",
+ .seq_show = cpuset_common_seq_show,
+ .write = cpuset_write_resmask,
+ .max_write_len = (100U + 6 * NR_CPUS),
+ .private = FILE_CPULIST,
+ },
+
+ {
+ .name = "mems",
+ .seq_show = cpuset_common_seq_show,
+ .write = cpuset_write_resmask,
+ .max_write_len = (100U + 6 * MAX_NUMNODES),
+ .private = FILE_MEMLIST,
+ },
+
+ {
+ .name = "effective_cpus",
+ .seq_show = cpuset_common_seq_show,
+ .private = FILE_EFFECTIVE_CPULIST,
+ },
+
+ {
+ .name = "effective_mems",
+ .seq_show = cpuset_common_seq_show,
+ .private = FILE_EFFECTIVE_MEMLIST,
+ },
+
+ {
+ .name = "cpu_exclusive",
+ .read_u64 = cpuset_read_u64,
+ .write_u64 = cpuset_write_u64,
+ .private = FILE_CPU_EXCLUSIVE,
+ },
+
+ {
+ .name = "mem_exclusive",
+ .read_u64 = cpuset_read_u64,
+ .write_u64 = cpuset_write_u64,
+ .private = FILE_MEM_EXCLUSIVE,
+ },
+
+ {
+ .name = "mem_hardwall",
+ .read_u64 = cpuset_read_u64,
+ .write_u64 = cpuset_write_u64,
+ .private = FILE_MEM_HARDWALL,
+ },
+
+ {
+ .name = "sched_load_balance",
+ .read_u64 = cpuset_read_u64,
+ .write_u64 = cpuset_write_u64,
+ .private = FILE_SCHED_LOAD_BALANCE,
+ },
+
+ {
+ .name = "sched_relax_domain_level",
+ .read_s64 = cpuset_read_s64,
+ .write_s64 = cpuset_write_s64,
+ .private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
+ },
+
+ {
+ .name = "memory_migrate",
+ .read_u64 = cpuset_read_u64,
+ .write_u64 = cpuset_write_u64,
+ .private = FILE_MEMORY_MIGRATE,
+ },
+
+ {
+ .name = "memory_pressure",
+ .read_u64 = cpuset_read_u64,
+ .private = FILE_MEMORY_PRESSURE,
+ },
+
+ {
+ .name = "memory_spread_page",
+ .read_u64 = cpuset_read_u64,
+ .write_u64 = cpuset_write_u64,
+ .private = FILE_SPREAD_PAGE,
+ },
+
+ {
+ /* obsolete, may be removed in the future */
+ .name = "memory_spread_slab",
+ .read_u64 = cpuset_read_u64,
+ .write_u64 = cpuset_write_u64,
+ .private = FILE_SPREAD_SLAB,
+ },
+
+ {
+ .name = "memory_pressure_enabled",
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ .read_u64 = cpuset_read_u64,
+ .write_u64 = cpuset_write_u64,
+ .private = FILE_MEMORY_PRESSURE_ENABLED,
+ },
+
+ { } /* terminate */
+};
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
new file mode 100644
index 000000000000..6e6eb09b8db6
--- /dev/null
+++ b/kernel/cgroup/cpuset.c
@@ -0,0 +1,4543 @@
+/*
+ * kernel/cpuset.c
+ *
+ * Processor and Memory placement constraints for sets of tasks.
+ *
+ * Copyright (C) 2003 BULL SA.
+ * Copyright (C) 2004-2007 Silicon Graphics, Inc.
+ * Copyright (C) 2006 Google, Inc
+ *
+ * Portions derived from Patrick Mochel's sysfs code.
+ * sysfs is Copyright (c) 2001-3 Patrick Mochel
+ *
+ * 2003-10-10 Written by Simon Derr.
+ * 2003-10-22 Updates by Stephen Hemminger.
+ * 2004 May-July Rework by Paul Jackson.
+ * 2006 Rework by Paul Menage to use generic cgroups
+ * 2008 Rework of the scheduler domains and CPU hotplug handling
+ * by Max Krasnyansky
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+#include "cpuset-internal.h"
+
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/mempolicy.h>
+#include <linux/mm.h>
+#include <linux/memory.h>
+#include <linux/export.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+#include <linux/sched/deadline.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/task.h>
+#include <linux/security.h>
+#include <linux/oom.h>
+#include <linux/sched/isolation.h>
+#include <linux/wait.h>
+#include <linux/workqueue.h>
+#include <linux/task_work.h>
+
+DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
+DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
+
+/*
+ * There could be abnormal cpuset configurations for cpu or memory
+ * node binding, add this key to provide a quick low-cost judgment
+ * of the situation.
+ */
+DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key);
+
+static const char * const perr_strings[] = {
+ [PERR_INVCPUS] = "Invalid cpu list in cpuset.cpus.exclusive",
+ [PERR_INVPARENT] = "Parent is an invalid partition root",
+ [PERR_NOTPART] = "Parent is not a partition root",
+ [PERR_NOTEXCL] = "Cpu list in cpuset.cpus not exclusive",
+ [PERR_NOCPUS] = "Parent unable to distribute cpu downstream",
+ [PERR_HOTPLUG] = "No cpu available due to hotplug",
+ [PERR_CPUSEMPTY] = "cpuset.cpus and cpuset.cpus.exclusive are empty",
+ [PERR_HKEEPING] = "partition config conflicts with housekeeping setup",
+ [PERR_ACCESS] = "Enable partition not permitted",
+ [PERR_REMOTE] = "Have remote partition underneath",
+};
+
+/*
+ * For local partitions, update to subpartitions_cpus & isolated_cpus is done
+ * in update_parent_effective_cpumask(). For remote partitions, it is done in
+ * the remote_partition_*() and remote_cpus_update() helpers.
+ */
+/*
+ * Exclusive CPUs distributed out to local or remote sub-partitions of
+ * top_cpuset
+ */
+static cpumask_var_t subpartitions_cpus;
+
+/*
+ * Exclusive CPUs in isolated partitions
+ */
+static cpumask_var_t isolated_cpus;
+
+/*
+ * isolated_cpus updating flag (protected by cpuset_mutex)
+ * Set if isolated_cpus is going to be updated in the current
+ * cpuset_mutex crtical section.
+ */
+static bool isolated_cpus_updating;
+
+/*
+ * Housekeeping (HK_TYPE_DOMAIN) CPUs at boot
+ */
+static cpumask_var_t boot_hk_cpus;
+static bool have_boot_isolcpus;
+
+/*
+ * A flag to force sched domain rebuild at the end of an operation.
+ * It can be set in
+ * - update_partition_sd_lb()
+ * - update_cpumasks_hier()
+ * - cpuset_update_flag()
+ * - cpuset_hotplug_update_tasks()
+ * - cpuset_handle_hotplug()
+ *
+ * Protected by cpuset_mutex (with cpus_read_lock held) or cpus_write_lock.
+ *
+ * Note that update_relax_domain_level() in cpuset-v1.c can still call
+ * rebuild_sched_domains_locked() directly without using this flag.
+ */
+static bool force_sd_rebuild;
+
+/*
+ * Partition root states:
+ *
+ * 0 - member (not a partition root)
+ * 1 - partition root
+ * 2 - partition root without load balancing (isolated)
+ * -1 - invalid partition root
+ * -2 - invalid isolated partition root
+ *
+ * There are 2 types of partitions - local or remote. Local partitions are
+ * those whose parents are partition root themselves. Setting of
+ * cpuset.cpus.exclusive are optional in setting up local partitions.
+ * Remote partitions are those whose parents are not partition roots. Passing
+ * down exclusive CPUs by setting cpuset.cpus.exclusive along its ancestor
+ * nodes are mandatory in creating a remote partition.
+ *
+ * For simplicity, a local partition can be created under a local or remote
+ * partition but a remote partition cannot have any partition root in its
+ * ancestor chain except the cgroup root.
+ */
+#define PRS_MEMBER 0
+#define PRS_ROOT 1
+#define PRS_ISOLATED 2
+#define PRS_INVALID_ROOT -1
+#define PRS_INVALID_ISOLATED -2
+
+/*
+ * Temporary cpumasks for working with partitions that are passed among
+ * functions to avoid memory allocation in inner functions.
+ */
+struct tmpmasks {
+ cpumask_var_t addmask, delmask; /* For partition root */
+ cpumask_var_t new_cpus; /* For update_cpumasks_hier() */
+};
+
+void inc_dl_tasks_cs(struct task_struct *p)
+{
+ struct cpuset *cs = task_cs(p);
+
+ cs->nr_deadline_tasks++;
+}
+
+void dec_dl_tasks_cs(struct task_struct *p)
+{
+ struct cpuset *cs = task_cs(p);
+
+ cs->nr_deadline_tasks--;
+}
+
+static inline bool is_partition_valid(const struct cpuset *cs)
+{
+ return cs->partition_root_state > 0;
+}
+
+static inline bool is_partition_invalid(const struct cpuset *cs)
+{
+ return cs->partition_root_state < 0;
+}
+
+static inline bool cs_is_member(const struct cpuset *cs)
+{
+ return cs->partition_root_state == PRS_MEMBER;
+}
+
+/*
+ * Callers should hold callback_lock to modify partition_root_state.
+ */
+static inline void make_partition_invalid(struct cpuset *cs)
+{
+ if (cs->partition_root_state > 0)
+ cs->partition_root_state = -cs->partition_root_state;
+}
+
+/*
+ * Send notification event of whenever partition_root_state changes.
+ */
+static inline void notify_partition_change(struct cpuset *cs, int old_prs)
+{
+ if (old_prs == cs->partition_root_state)
+ return;
+ cgroup_file_notify(&cs->partition_file);
+
+ /* Reset prs_err if not invalid */
+ if (is_partition_valid(cs))
+ WRITE_ONCE(cs->prs_err, PERR_NONE);
+}
+
+/*
+ * The top_cpuset is always synchronized to cpu_active_mask and we should avoid
+ * using cpu_online_mask as much as possible. An active CPU is always an online
+ * CPU, but not vice versa. cpu_active_mask and cpu_online_mask can differ
+ * during hotplug operations. A CPU is marked active at the last stage of CPU
+ * bringup (CPUHP_AP_ACTIVE). It is also the stage where cpuset hotplug code
+ * will be called to update the sched domains so that the scheduler can move
+ * a normal task to a newly active CPU or remove tasks away from a newly
+ * inactivated CPU. The online bit is set much earlier in the CPU bringup
+ * process and cleared much later in CPU teardown.
+ *
+ * If cpu_online_mask is used while a hotunplug operation is happening in
+ * parallel, we may leave an offline CPU in cpu_allowed or some other masks.
+ */
+static struct cpuset top_cpuset = {
+ .flags = BIT(CS_CPU_EXCLUSIVE) |
+ BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE),
+ .partition_root_state = PRS_ROOT,
+ .relax_domain_level = -1,
+ .remote_partition = false,
+};
+
+/*
+ * There are two global locks guarding cpuset structures - cpuset_mutex and
+ * callback_lock. The cpuset code uses only cpuset_mutex. Other kernel
+ * subsystems can use cpuset_lock()/cpuset_unlock() to prevent change to cpuset
+ * structures. Note that cpuset_mutex needs to be a mutex as it is used in
+ * paths that rely on priority inheritance (e.g. scheduler - on RT) for
+ * correctness.
+ *
+ * A task must hold both locks to modify cpusets. If a task holds
+ * cpuset_mutex, it blocks others, ensuring that it is the only task able to
+ * also acquire callback_lock and be able to modify cpusets. It can perform
+ * various checks on the cpuset structure first, knowing nothing will change.
+ * It can also allocate memory while just holding cpuset_mutex. While it is
+ * performing these checks, various callback routines can briefly acquire
+ * callback_lock to query cpusets. Once it is ready to make the changes, it
+ * takes callback_lock, blocking everyone else.
+ *
+ * Calls to the kernel memory allocator can not be made while holding
+ * callback_lock, as that would risk double tripping on callback_lock
+ * from one of the callbacks into the cpuset code from within
+ * __alloc_pages().
+ *
+ * If a task is only holding callback_lock, then it has read-only
+ * access to cpusets.
+ *
+ * Now, the task_struct fields mems_allowed and mempolicy may be changed
+ * by other task, we use alloc_lock in the task_struct fields to protect
+ * them.
+ *
+ * The cpuset_common_seq_show() handlers only hold callback_lock across
+ * small pieces of code, such as when reading out possibly multi-word
+ * cpumasks and nodemasks.
+ */
+
+static DEFINE_MUTEX(cpuset_mutex);
+
+/**
+ * cpuset_lock - Acquire the global cpuset mutex
+ *
+ * This locks the global cpuset mutex to prevent modifications to cpuset
+ * hierarchy and configurations. This helper is not enough to make modification.
+ */
+void cpuset_lock(void)
+{
+ mutex_lock(&cpuset_mutex);
+}
+
+void cpuset_unlock(void)
+{
+ mutex_unlock(&cpuset_mutex);
+}
+
+/**
+ * cpuset_full_lock - Acquire full protection for cpuset modification
+ *
+ * Takes both CPU hotplug read lock (cpus_read_lock()) and cpuset mutex
+ * to safely modify cpuset data.
+ */
+void cpuset_full_lock(void)
+{
+ cpus_read_lock();
+ mutex_lock(&cpuset_mutex);
+}
+
+void cpuset_full_unlock(void)
+{
+ mutex_unlock(&cpuset_mutex);
+ cpus_read_unlock();
+}
+
+static DEFINE_SPINLOCK(callback_lock);
+
+void cpuset_callback_lock_irq(void)
+{
+ spin_lock_irq(&callback_lock);
+}
+
+void cpuset_callback_unlock_irq(void)
+{
+ spin_unlock_irq(&callback_lock);
+}
+
+static struct workqueue_struct *cpuset_migrate_mm_wq;
+
+static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
+
+static inline void check_insane_mems_config(nodemask_t *nodes)
+{
+ if (!cpusets_insane_config() &&
+ movable_only_nodes(nodes)) {
+ static_branch_enable_cpuslocked(&cpusets_insane_config_key);
+ pr_info("Unsupported (movable nodes only) cpuset configuration detected (nmask=%*pbl)!\n"
+ "Cpuset allocations might fail even with a lot of memory available.\n",
+ nodemask_pr_args(nodes));
+ }
+}
+
+/*
+ * decrease cs->attach_in_progress.
+ * wake_up cpuset_attach_wq if cs->attach_in_progress==0.
+ */
+static inline void dec_attach_in_progress_locked(struct cpuset *cs)
+{
+ lockdep_assert_held(&cpuset_mutex);
+
+ cs->attach_in_progress--;
+ if (!cs->attach_in_progress)
+ wake_up(&cpuset_attach_wq);
+}
+
+static inline void dec_attach_in_progress(struct cpuset *cs)
+{
+ mutex_lock(&cpuset_mutex);
+ dec_attach_in_progress_locked(cs);
+ mutex_unlock(&cpuset_mutex);
+}
+
+static inline bool cpuset_v2(void)
+{
+ return !IS_ENABLED(CONFIG_CPUSETS_V1) ||
+ cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
+}
+
+/*
+ * Cgroup v2 behavior is used on the "cpus" and "mems" control files when
+ * on default hierarchy or when the cpuset_v2_mode flag is set by mounting
+ * the v1 cpuset cgroup filesystem with the "cpuset_v2_mode" mount option.
+ * With v2 behavior, "cpus" and "mems" are always what the users have
+ * requested and won't be changed by hotplug events. Only the effective
+ * cpus or mems will be affected.
+ */
+static inline bool is_in_v2_mode(void)
+{
+ return cpuset_v2() ||
+ (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
+}
+
+static inline bool cpuset_is_populated(struct cpuset *cs)
+{
+ lockdep_assert_held(&cpuset_mutex);
+
+ /* Cpusets in the process of attaching should be considered as populated */
+ return cgroup_is_populated(cs->css.cgroup) ||
+ cs->attach_in_progress;
+}
+
+/**
+ * partition_is_populated - check if partition has tasks
+ * @cs: partition root to be checked
+ * @excluded_child: a child cpuset to be excluded in task checking
+ * Return: true if there are tasks, false otherwise
+ *
+ * @cs should be a valid partition root or going to become a partition root.
+ * @excluded_child should be non-NULL when this cpuset is going to become a
+ * partition itself.
+ *
+ * Note that a remote partition is not allowed underneath a valid local
+ * or remote partition. So if a non-partition root child is populated,
+ * the whole partition is considered populated.
+ */
+static inline bool partition_is_populated(struct cpuset *cs,
+ struct cpuset *excluded_child)
+{
+ struct cpuset *cp;
+ struct cgroup_subsys_state *pos_css;
+
+ /*
+ * We cannot call cs_is_populated(cs) directly, as
+ * nr_populated_domain_children may include populated
+ * csets from descendants that are partitions.
+ */
+ if (cs->css.cgroup->nr_populated_csets ||
+ cs->attach_in_progress)
+ return true;
+
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cp, pos_css, cs) {
+ if (cp == cs || cp == excluded_child)
+ continue;
+
+ if (is_partition_valid(cp)) {
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
+ }
+
+ if (cpuset_is_populated(cp)) {
+ rcu_read_unlock();
+ return true;
+ }
+ }
+ rcu_read_unlock();
+ return false;
+}
+
+/*
+ * Return in pmask the portion of a task's cpusets's cpus_allowed that
+ * are online and are capable of running the task. If none are found,
+ * walk up the cpuset hierarchy until we find one that does have some
+ * appropriate cpus.
+ *
+ * One way or another, we guarantee to return some non-empty subset
+ * of cpu_active_mask.
+ *
+ * Call with callback_lock or cpuset_mutex held.
+ */
+static void guarantee_active_cpus(struct task_struct *tsk,
+ struct cpumask *pmask)
+{
+ const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
+ struct cpuset *cs;
+
+ if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_active_mask)))
+ cpumask_copy(pmask, cpu_active_mask);
+
+ rcu_read_lock();
+ cs = task_cs(tsk);
+
+ while (!cpumask_intersects(cs->effective_cpus, pmask))
+ cs = parent_cs(cs);
+
+ cpumask_and(pmask, pmask, cs->effective_cpus);
+ rcu_read_unlock();
+}
+
+/*
+ * Return in *pmask the portion of a cpusets's mems_allowed that
+ * are online, with memory. If none are online with memory, walk
+ * up the cpuset hierarchy until we find one that does have some
+ * online mems. The top cpuset always has some mems online.
+ *
+ * One way or another, we guarantee to return some non-empty subset
+ * of node_states[N_MEMORY].
+ *
+ * Call with callback_lock or cpuset_mutex held.
+ */
+static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
+{
+ while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
+ cs = parent_cs(cs);
+ nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
+}
+
+/**
+ * alloc_cpumasks - Allocate an array of cpumask variables
+ * @pmasks: Pointer to array of cpumask_var_t pointers
+ * @size: Number of cpumasks to allocate
+ * Return: 0 if successful, -ENOMEM otherwise.
+ *
+ * Allocates @size cpumasks and initializes them to empty. Returns 0 on
+ * success, -ENOMEM on allocation failure. On failure, any previously
+ * allocated cpumasks are freed.
+ */
+static inline int alloc_cpumasks(cpumask_var_t *pmasks[], u32 size)
+{
+ int i;
+
+ for (i = 0; i < size; i++) {
+ if (!zalloc_cpumask_var(pmasks[i], GFP_KERNEL)) {
+ while (--i >= 0)
+ free_cpumask_var(*pmasks[i]);
+ return -ENOMEM;
+ }
+ }
+ return 0;
+}
+
+/**
+ * alloc_tmpmasks - Allocate temporary cpumasks for cpuset operations.
+ * @tmp: Pointer to tmpmasks structure to populate
+ * Return: 0 on success, -ENOMEM on allocation failure
+ */
+static inline int alloc_tmpmasks(struct tmpmasks *tmp)
+{
+ /*
+ * Array of pointers to the three cpumask_var_t fields in tmpmasks.
+ * Note: Array size must match actual number of masks (3)
+ */
+ cpumask_var_t *pmask[3] = {
+ &tmp->new_cpus,
+ &tmp->addmask,
+ &tmp->delmask
+ };
+
+ return alloc_cpumasks(pmask, ARRAY_SIZE(pmask));
+}
+
+/**
+ * free_tmpmasks - free cpumasks in a tmpmasks structure
+ * @tmp: the tmpmasks structure pointer
+ */
+static inline void free_tmpmasks(struct tmpmasks *tmp)
+{
+ if (!tmp)
+ return;
+
+ free_cpumask_var(tmp->new_cpus);
+ free_cpumask_var(tmp->addmask);
+ free_cpumask_var(tmp->delmask);
+}
+
+/**
+ * dup_or_alloc_cpuset - Duplicate or allocate a new cpuset
+ * @cs: Source cpuset to duplicate (NULL for a fresh allocation)
+ *
+ * Creates a new cpuset by either:
+ * 1. Duplicating an existing cpuset (if @cs is non-NULL), or
+ * 2. Allocating a fresh cpuset with zero-initialized masks (if @cs is NULL)
+ *
+ * Return: Pointer to newly allocated cpuset on success, NULL on failure
+ */
+static struct cpuset *dup_or_alloc_cpuset(struct cpuset *cs)
+{
+ struct cpuset *trial;
+
+ /* Allocate base structure */
+ trial = cs ? kmemdup(cs, sizeof(*cs), GFP_KERNEL) :
+ kzalloc(sizeof(*cs), GFP_KERNEL);
+ if (!trial)
+ return NULL;
+
+ /* Setup cpumask pointer array */
+ cpumask_var_t *pmask[4] = {
+ &trial->cpus_allowed,
+ &trial->effective_cpus,
+ &trial->effective_xcpus,
+ &trial->exclusive_cpus
+ };
+
+ if (alloc_cpumasks(pmask, ARRAY_SIZE(pmask))) {
+ kfree(trial);
+ return NULL;
+ }
+
+ /* Copy masks if duplicating */
+ if (cs) {
+ cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
+ cpumask_copy(trial->effective_cpus, cs->effective_cpus);
+ cpumask_copy(trial->effective_xcpus, cs->effective_xcpus);
+ cpumask_copy(trial->exclusive_cpus, cs->exclusive_cpus);
+ }
+
+ return trial;
+}
+
+/**
+ * free_cpuset - free the cpuset
+ * @cs: the cpuset to be freed
+ */
+static inline void free_cpuset(struct cpuset *cs)
+{
+ free_cpumask_var(cs->cpus_allowed);
+ free_cpumask_var(cs->effective_cpus);
+ free_cpumask_var(cs->effective_xcpus);
+ free_cpumask_var(cs->exclusive_cpus);
+ kfree(cs);
+}
+
+/* Return user specified exclusive CPUs */
+static inline struct cpumask *user_xcpus(struct cpuset *cs)
+{
+ return cpumask_empty(cs->exclusive_cpus) ? cs->cpus_allowed
+ : cs->exclusive_cpus;
+}
+
+static inline bool xcpus_empty(struct cpuset *cs)
+{
+ return cpumask_empty(cs->cpus_allowed) &&
+ cpumask_empty(cs->exclusive_cpus);
+}
+
+/*
+ * cpusets_are_exclusive() - check if two cpusets are exclusive
+ *
+ * Return true if exclusive, false if not
+ */
+static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2)
+{
+ struct cpumask *xcpus1 = user_xcpus(cs1);
+ struct cpumask *xcpus2 = user_xcpus(cs2);
+
+ if (cpumask_intersects(xcpus1, xcpus2))
+ return false;
+ return true;
+}
+
+/**
+ * cpus_excl_conflict - Check if two cpusets have exclusive CPU conflicts
+ * @cs1: first cpuset to check
+ * @cs2: second cpuset to check
+ *
+ * Returns: true if CPU exclusivity conflict exists, false otherwise
+ *
+ * Conflict detection rules:
+ * 1. If either cpuset is CPU exclusive, they must be mutually exclusive
+ * 2. exclusive_cpus masks cannot intersect between cpusets
+ * 3. The allowed CPUs of one cpuset cannot be a subset of another's exclusive CPUs
+ */
+static inline bool cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2)
+{
+ /* If either cpuset is exclusive, check if they are mutually exclusive */
+ if (is_cpu_exclusive(cs1) || is_cpu_exclusive(cs2))
+ return !cpusets_are_exclusive(cs1, cs2);
+
+ /* Exclusive_cpus cannot intersect */
+ if (cpumask_intersects(cs1->exclusive_cpus, cs2->exclusive_cpus))
+ return true;
+
+ /* The cpus_allowed of one cpuset cannot be a subset of another cpuset's exclusive_cpus */
+ if (!cpumask_empty(cs1->cpus_allowed) &&
+ cpumask_subset(cs1->cpus_allowed, cs2->exclusive_cpus))
+ return true;
+
+ if (!cpumask_empty(cs2->cpus_allowed) &&
+ cpumask_subset(cs2->cpus_allowed, cs1->exclusive_cpus))
+ return true;
+
+ return false;
+}
+
+static inline bool mems_excl_conflict(struct cpuset *cs1, struct cpuset *cs2)
+{
+ if ((is_mem_exclusive(cs1) || is_mem_exclusive(cs2)))
+ return nodes_intersects(cs1->mems_allowed, cs2->mems_allowed);
+ return false;
+}
+
+/*
+ * validate_change() - Used to validate that any proposed cpuset change
+ * follows the structural rules for cpusets.
+ *
+ * If we replaced the flag and mask values of the current cpuset
+ * (cur) with those values in the trial cpuset (trial), would
+ * our various subset and exclusive rules still be valid? Presumes
+ * cpuset_mutex held.
+ *
+ * 'cur' is the address of an actual, in-use cpuset. Operations
+ * such as list traversal that depend on the actual address of the
+ * cpuset in the list must use cur below, not trial.
+ *
+ * 'trial' is the address of bulk structure copy of cur, with
+ * perhaps one or more of the fields cpus_allowed, mems_allowed,
+ * or flags changed to new, trial values.
+ *
+ * Return 0 if valid, -errno if not.
+ */
+
+static int validate_change(struct cpuset *cur, struct cpuset *trial)
+{
+ struct cgroup_subsys_state *css;
+ struct cpuset *c, *par;
+ int ret = 0;
+
+ rcu_read_lock();
+
+ if (!is_in_v2_mode())
+ ret = cpuset1_validate_change(cur, trial);
+ if (ret)
+ goto out;
+
+ /* Remaining checks don't apply to root cpuset */
+ if (cur == &top_cpuset)
+ goto out;
+
+ par = parent_cs(cur);
+
+ /*
+ * Cpusets with tasks - existing or newly being attached - can't
+ * be changed to have empty cpus_allowed or mems_allowed.
+ */
+ ret = -ENOSPC;
+ if (cpuset_is_populated(cur)) {
+ if (!cpumask_empty(cur->cpus_allowed) &&
+ cpumask_empty(trial->cpus_allowed))
+ goto out;
+ if (!nodes_empty(cur->mems_allowed) &&
+ nodes_empty(trial->mems_allowed))
+ goto out;
+ }
+
+ /*
+ * We can't shrink if we won't have enough room for SCHED_DEADLINE
+ * tasks. This check is not done when scheduling is disabled as the
+ * users should know what they are doing.
+ *
+ * For v1, effective_cpus == cpus_allowed & user_xcpus() returns
+ * cpus_allowed.
+ *
+ * For v2, is_cpu_exclusive() & is_sched_load_balance() are true only
+ * for non-isolated partition root. At this point, the target
+ * effective_cpus isn't computed yet. user_xcpus() is the best
+ * approximation.
+ *
+ * TBD: May need to precompute the real effective_cpus here in case
+ * incorrect scheduling of SCHED_DEADLINE tasks in a partition
+ * becomes an issue.
+ */
+ ret = -EBUSY;
+ if (is_cpu_exclusive(cur) && is_sched_load_balance(cur) &&
+ !cpuset_cpumask_can_shrink(cur->effective_cpus, user_xcpus(trial)))
+ goto out;
+
+ /*
+ * If either I or some sibling (!= me) is exclusive, we can't
+ * overlap. exclusive_cpus cannot overlap with each other if set.
+ */
+ ret = -EINVAL;
+ cpuset_for_each_child(c, css, par) {
+ if (c == cur)
+ continue;
+ if (cpus_excl_conflict(trial, c))
+ goto out;
+ if (mems_excl_conflict(trial, c))
+ goto out;
+ }
+
+ ret = 0;
+out:
+ rcu_read_unlock();
+ return ret;
+}
+
+#ifdef CONFIG_SMP
+/*
+ * Helper routine for generate_sched_domains().
+ * Do cpusets a, b have overlapping effective cpus_allowed masks?
+ */
+static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
+{
+ return cpumask_intersects(a->effective_cpus, b->effective_cpus);
+}
+
+static void
+update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
+{
+ if (dattr->relax_domain_level < c->relax_domain_level)
+ dattr->relax_domain_level = c->relax_domain_level;
+ return;
+}
+
+static void update_domain_attr_tree(struct sched_domain_attr *dattr,
+ struct cpuset *root_cs)
+{
+ struct cpuset *cp;
+ struct cgroup_subsys_state *pos_css;
+
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
+ /* skip the whole subtree if @cp doesn't have any CPU */
+ if (cpumask_empty(cp->cpus_allowed)) {
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
+ }
+
+ if (is_sched_load_balance(cp))
+ update_domain_attr(dattr, cp);
+ }
+ rcu_read_unlock();
+}
+
+/* Must be called with cpuset_mutex held. */
+static inline int nr_cpusets(void)
+{
+ /* jump label reference count + the top-level cpuset */
+ return static_key_count(&cpusets_enabled_key.key) + 1;
+}
+
+/*
+ * generate_sched_domains()
+ *
+ * This function builds a partial partition of the systems CPUs
+ * A 'partial partition' is a set of non-overlapping subsets whose
+ * union is a subset of that set.
+ * The output of this function needs to be passed to kernel/sched/core.c
+ * partition_sched_domains() routine, which will rebuild the scheduler's
+ * load balancing domains (sched domains) as specified by that partial
+ * partition.
+ *
+ * See "What is sched_load_balance" in Documentation/admin-guide/cgroup-v1/cpusets.rst
+ * for a background explanation of this.
+ *
+ * Does not return errors, on the theory that the callers of this
+ * routine would rather not worry about failures to rebuild sched
+ * domains when operating in the severe memory shortage situations
+ * that could cause allocation failures below.
+ *
+ * Must be called with cpuset_mutex held.
+ *
+ * The three key local variables below are:
+ * cp - cpuset pointer, used (together with pos_css) to perform a
+ * top-down scan of all cpusets. For our purposes, rebuilding
+ * the schedulers sched domains, we can ignore !is_sched_load_
+ * balance cpusets.
+ * csa - (for CpuSet Array) Array of pointers to all the cpusets
+ * that need to be load balanced, for convenient iterative
+ * access by the subsequent code that finds the best partition,
+ * i.e the set of domains (subsets) of CPUs such that the
+ * cpus_allowed of every cpuset marked is_sched_load_balance
+ * is a subset of one of these domains, while there are as
+ * many such domains as possible, each as small as possible.
+ * doms - Conversion of 'csa' to an array of cpumasks, for passing to
+ * the kernel/sched/core.c routine partition_sched_domains() in a
+ * convenient format, that can be easily compared to the prior
+ * value to determine what partition elements (sched domains)
+ * were changed (added or removed.)
+ *
+ * Finding the best partition (set of domains):
+ * The double nested loops below over i, j scan over the load
+ * balanced cpusets (using the array of cpuset pointers in csa[])
+ * looking for pairs of cpusets that have overlapping cpus_allowed
+ * and merging them using a union-find algorithm.
+ *
+ * The union of the cpus_allowed masks from the set of all cpusets
+ * having the same root then form the one element of the partition
+ * (one sched domain) to be passed to partition_sched_domains().
+ *
+ */
+static int generate_sched_domains(cpumask_var_t **domains,
+ struct sched_domain_attr **attributes)
+{
+ struct cpuset *cp; /* top-down scan of cpusets */
+ struct cpuset **csa; /* array of all cpuset ptrs */
+ int csn; /* how many cpuset ptrs in csa so far */
+ int i, j; /* indices for partition finding loops */
+ cpumask_var_t *doms; /* resulting partition; i.e. sched domains */
+ struct sched_domain_attr *dattr; /* attributes for custom domains */
+ int ndoms = 0; /* number of sched domains in result */
+ int nslot; /* next empty doms[] struct cpumask slot */
+ struct cgroup_subsys_state *pos_css;
+ bool root_load_balance = is_sched_load_balance(&top_cpuset);
+ bool cgrpv2 = cpuset_v2();
+ int nslot_update;
+
+ doms = NULL;
+ dattr = NULL;
+ csa = NULL;
+
+ /* Special case for the 99% of systems with one, full, sched domain */
+ if (root_load_balance && cpumask_empty(subpartitions_cpus)) {
+single_root_domain:
+ ndoms = 1;
+ doms = alloc_sched_domains(ndoms);
+ if (!doms)
+ goto done;
+
+ dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
+ if (dattr) {
+ *dattr = SD_ATTR_INIT;
+ update_domain_attr_tree(dattr, &top_cpuset);
+ }
+ cpumask_and(doms[0], top_cpuset.effective_cpus,
+ housekeeping_cpumask(HK_TYPE_DOMAIN));
+
+ goto done;
+ }
+
+ csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL);
+ if (!csa)
+ goto done;
+ csn = 0;
+
+ rcu_read_lock();
+ if (root_load_balance)
+ csa[csn++] = &top_cpuset;
+ cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
+ if (cp == &top_cpuset)
+ continue;
+
+ if (cgrpv2)
+ goto v2;
+
+ /*
+ * v1:
+ * Continue traversing beyond @cp iff @cp has some CPUs and
+ * isn't load balancing. The former is obvious. The
+ * latter: All child cpusets contain a subset of the
+ * parent's cpus, so just skip them, and then we call
+ * update_domain_attr_tree() to calc relax_domain_level of
+ * the corresponding sched domain.
+ */
+ if (!cpumask_empty(cp->cpus_allowed) &&
+ !(is_sched_load_balance(cp) &&
+ cpumask_intersects(cp->cpus_allowed,
+ housekeeping_cpumask(HK_TYPE_DOMAIN))))
+ continue;
+
+ if (is_sched_load_balance(cp) &&
+ !cpumask_empty(cp->effective_cpus))
+ csa[csn++] = cp;
+
+ /* skip @cp's subtree */
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
+
+v2:
+ /*
+ * Only valid partition roots that are not isolated and with
+ * non-empty effective_cpus will be saved into csn[].
+ */
+ if ((cp->partition_root_state == PRS_ROOT) &&
+ !cpumask_empty(cp->effective_cpus))
+ csa[csn++] = cp;
+
+ /*
+ * Skip @cp's subtree if not a partition root and has no
+ * exclusive CPUs to be granted to child cpusets.
+ */
+ if (!is_partition_valid(cp) && cpumask_empty(cp->exclusive_cpus))
+ pos_css = css_rightmost_descendant(pos_css);
+ }
+ rcu_read_unlock();
+
+ /*
+ * If there are only isolated partitions underneath the cgroup root,
+ * we can optimize out unneeded sched domains scanning.
+ */
+ if (root_load_balance && (csn == 1))
+ goto single_root_domain;
+
+ for (i = 0; i < csn; i++)
+ uf_node_init(&csa[i]->node);
+
+ /* Merge overlapping cpusets */
+ for (i = 0; i < csn; i++) {
+ for (j = i + 1; j < csn; j++) {
+ if (cpusets_overlap(csa[i], csa[j])) {
+ /*
+ * Cgroup v2 shouldn't pass down overlapping
+ * partition root cpusets.
+ */
+ WARN_ON_ONCE(cgrpv2);
+ uf_union(&csa[i]->node, &csa[j]->node);
+ }
+ }
+ }
+
+ /* Count the total number of domains */
+ for (i = 0; i < csn; i++) {
+ if (uf_find(&csa[i]->node) == &csa[i]->node)
+ ndoms++;
+ }
+
+ /*
+ * Now we know how many domains to create.
+ * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
+ */
+ doms = alloc_sched_domains(ndoms);
+ if (!doms)
+ goto done;
+
+ /*
+ * The rest of the code, including the scheduler, can deal with
+ * dattr==NULL case. No need to abort if alloc fails.
+ */
+ dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr),
+ GFP_KERNEL);
+
+ /*
+ * Cgroup v2 doesn't support domain attributes, just set all of them
+ * to SD_ATTR_INIT. Also non-isolating partition root CPUs are a
+ * subset of HK_TYPE_DOMAIN housekeeping CPUs.
+ */
+ if (cgrpv2) {
+ for (i = 0; i < ndoms; i++) {
+ /*
+ * The top cpuset may contain some boot time isolated
+ * CPUs that need to be excluded from the sched domain.
+ */
+ if (csa[i] == &top_cpuset)
+ cpumask_and(doms[i], csa[i]->effective_cpus,
+ housekeeping_cpumask(HK_TYPE_DOMAIN));
+ else
+ cpumask_copy(doms[i], csa[i]->effective_cpus);
+ if (dattr)
+ dattr[i] = SD_ATTR_INIT;
+ }
+ goto done;
+ }
+
+ for (nslot = 0, i = 0; i < csn; i++) {
+ nslot_update = 0;
+ for (j = i; j < csn; j++) {
+ if (uf_find(&csa[j]->node) == &csa[i]->node) {
+ struct cpumask *dp = doms[nslot];
+
+ if (i == j) {
+ nslot_update = 1;
+ cpumask_clear(dp);
+ if (dattr)
+ *(dattr + nslot) = SD_ATTR_INIT;
+ }
+ cpumask_or(dp, dp, csa[j]->effective_cpus);
+ cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN));
+ if (dattr)
+ update_domain_attr_tree(dattr + nslot, csa[j]);
+ }
+ }
+ if (nslot_update)
+ nslot++;
+ }
+ BUG_ON(nslot != ndoms);
+
+done:
+ kfree(csa);
+
+ /*
+ * Fallback to the default domain if kmalloc() failed.
+ * See comments in partition_sched_domains().
+ */
+ if (doms == NULL)
+ ndoms = 1;
+
+ *domains = doms;
+ *attributes = dattr;
+ return ndoms;
+}
+
+static void dl_update_tasks_root_domain(struct cpuset *cs)
+{
+ struct css_task_iter it;
+ struct task_struct *task;
+
+ if (cs->nr_deadline_tasks == 0)
+ return;
+
+ css_task_iter_start(&cs->css, 0, &it);
+
+ while ((task = css_task_iter_next(&it)))
+ dl_add_task_root_domain(task);
+
+ css_task_iter_end(&it);
+}
+
+void dl_rebuild_rd_accounting(void)
+{
+ struct cpuset *cs = NULL;
+ struct cgroup_subsys_state *pos_css;
+ int cpu;
+ u64 cookie = ++dl_cookie;
+
+ lockdep_assert_held(&cpuset_mutex);
+ lockdep_assert_cpus_held();
+ lockdep_assert_held(&sched_domains_mutex);
+
+ rcu_read_lock();
+
+ for_each_possible_cpu(cpu) {
+ if (dl_bw_visited(cpu, cookie))
+ continue;
+
+ dl_clear_root_domain_cpu(cpu);
+ }
+
+ cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
+
+ if (cpumask_empty(cs->effective_cpus)) {
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
+ }
+
+ css_get(&cs->css);
+
+ rcu_read_unlock();
+
+ dl_update_tasks_root_domain(cs);
+
+ rcu_read_lock();
+ css_put(&cs->css);
+ }
+ rcu_read_unlock();
+}
+
+/*
+ * Rebuild scheduler domains.
+ *
+ * If the flag 'sched_load_balance' of any cpuset with non-empty
+ * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
+ * which has that flag enabled, or if any cpuset with a non-empty
+ * 'cpus' is removed, then call this routine to rebuild the
+ * scheduler's dynamic sched domains.
+ *
+ * Call with cpuset_mutex held. Takes cpus_read_lock().
+ */
+void rebuild_sched_domains_locked(void)
+{
+ struct cgroup_subsys_state *pos_css;
+ struct sched_domain_attr *attr;
+ cpumask_var_t *doms;
+ struct cpuset *cs;
+ int ndoms;
+
+ lockdep_assert_cpus_held();
+ lockdep_assert_held(&cpuset_mutex);
+ force_sd_rebuild = false;
+
+ /*
+ * If we have raced with CPU hotplug, return early to avoid
+ * passing doms with offlined cpu to partition_sched_domains().
+ * Anyways, cpuset_handle_hotplug() will rebuild sched domains.
+ *
+ * With no CPUs in any subpartitions, top_cpuset's effective CPUs
+ * should be the same as the active CPUs, so checking only top_cpuset
+ * is enough to detect racing CPU offlines.
+ */
+ if (cpumask_empty(subpartitions_cpus) &&
+ !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
+ return;
+
+ /*
+ * With subpartition CPUs, however, the effective CPUs of a partition
+ * root should be only a subset of the active CPUs. Since a CPU in any
+ * partition root could be offlined, all must be checked.
+ */
+ if (!cpumask_empty(subpartitions_cpus)) {
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
+ if (!is_partition_valid(cs)) {
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
+ }
+ if (!cpumask_subset(cs->effective_cpus,
+ cpu_active_mask)) {
+ rcu_read_unlock();
+ return;
+ }
+ }
+ rcu_read_unlock();
+ }
+
+ /* Generate domain masks and attrs */
+ ndoms = generate_sched_domains(&doms, &attr);
+
+ /* Have scheduler rebuild the domains */
+ partition_sched_domains(ndoms, doms, attr);
+}
+#else /* !CONFIG_SMP */
+void rebuild_sched_domains_locked(void)
+{
+}
+#endif /* CONFIG_SMP */
+
+static void rebuild_sched_domains_cpuslocked(void)
+{
+ mutex_lock(&cpuset_mutex);
+ rebuild_sched_domains_locked();
+ mutex_unlock(&cpuset_mutex);
+}
+
+void rebuild_sched_domains(void)
+{
+ cpus_read_lock();
+ rebuild_sched_domains_cpuslocked();
+ cpus_read_unlock();
+}
+
+void cpuset_reset_sched_domains(void)
+{
+ mutex_lock(&cpuset_mutex);
+ partition_sched_domains(1, NULL, NULL);
+ mutex_unlock(&cpuset_mutex);
+}
+
+/**
+ * cpuset_update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
+ * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
+ * @new_cpus: the temp variable for the new effective_cpus mask
+ *
+ * Iterate through each task of @cs updating its cpus_allowed to the
+ * effective cpuset's. As this function is called with cpuset_mutex held,
+ * cpuset membership stays stable.
+ *
+ * For top_cpuset, task_cpu_possible_mask() is used instead of effective_cpus
+ * to make sure all offline CPUs are also included as hotplug code won't
+ * update cpumasks for tasks in top_cpuset.
+ *
+ * As task_cpu_possible_mask() can be task dependent in arm64, we have to
+ * do cpu masking per task instead of doing it once for all.
+ */
+void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)
+{
+ struct css_task_iter it;
+ struct task_struct *task;
+ bool top_cs = cs == &top_cpuset;
+
+ css_task_iter_start(&cs->css, 0, &it);
+ while ((task = css_task_iter_next(&it))) {
+ const struct cpumask *possible_mask = task_cpu_possible_mask(task);
+
+ if (top_cs) {
+ /*
+ * PF_NO_SETAFFINITY tasks are ignored.
+ * All per cpu kthreads should have PF_NO_SETAFFINITY
+ * flag set, see kthread_set_per_cpu().
+ */
+ if (task->flags & PF_NO_SETAFFINITY)
+ continue;
+ cpumask_andnot(new_cpus, possible_mask, subpartitions_cpus);
+ } else {
+ cpumask_and(new_cpus, possible_mask, cs->effective_cpus);
+ }
+ set_cpus_allowed_ptr(task, new_cpus);
+ }
+ css_task_iter_end(&it);
+}
+
+/**
+ * compute_effective_cpumask - Compute the effective cpumask of the cpuset
+ * @new_cpus: the temp variable for the new effective_cpus mask
+ * @cs: the cpuset the need to recompute the new effective_cpus mask
+ * @parent: the parent cpuset
+ *
+ * The result is valid only if the given cpuset isn't a partition root.
+ */
+static void compute_effective_cpumask(struct cpumask *new_cpus,
+ struct cpuset *cs, struct cpuset *parent)
+{
+ cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus);
+}
+
+/*
+ * Commands for update_parent_effective_cpumask
+ */
+enum partition_cmd {
+ partcmd_enable, /* Enable partition root */
+ partcmd_enablei, /* Enable isolated partition root */
+ partcmd_disable, /* Disable partition root */
+ partcmd_update, /* Update parent's effective_cpus */
+ partcmd_invalidate, /* Make partition invalid */
+};
+
+static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
+ struct tmpmasks *tmp);
+
+/*
+ * Update partition exclusive flag
+ *
+ * Return: 0 if successful, an error code otherwise
+ */
+static int update_partition_exclusive_flag(struct cpuset *cs, int new_prs)
+{
+ bool exclusive = (new_prs > PRS_MEMBER);
+
+ if (exclusive && !is_cpu_exclusive(cs)) {
+ if (cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, 1))
+ return PERR_NOTEXCL;
+ } else if (!exclusive && is_cpu_exclusive(cs)) {
+ /* Turning off CS_CPU_EXCLUSIVE will not return error */
+ cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, 0);
+ }
+ return 0;
+}
+
+/*
+ * Update partition load balance flag and/or rebuild sched domain
+ *
+ * Changing load balance flag will automatically call
+ * rebuild_sched_domains_locked().
+ * This function is for cgroup v2 only.
+ */
+static void update_partition_sd_lb(struct cpuset *cs, int old_prs)
+{
+ int new_prs = cs->partition_root_state;
+ bool rebuild_domains = (new_prs > 0) || (old_prs > 0);
+ bool new_lb;
+
+ /*
+ * If cs is not a valid partition root, the load balance state
+ * will follow its parent.
+ */
+ if (new_prs > 0) {
+ new_lb = (new_prs != PRS_ISOLATED);
+ } else {
+ new_lb = is_sched_load_balance(parent_cs(cs));
+ }
+ if (new_lb != !!is_sched_load_balance(cs)) {
+ rebuild_domains = true;
+ if (new_lb)
+ set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
+ else
+ clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
+ }
+
+ if (rebuild_domains)
+ cpuset_force_rebuild();
+}
+
+/*
+ * tasks_nocpu_error - Return true if tasks will have no effective_cpus
+ */
+static bool tasks_nocpu_error(struct cpuset *parent, struct cpuset *cs,
+ struct cpumask *xcpus)
+{
+ /*
+ * A populated partition (cs or parent) can't have empty effective_cpus
+ */
+ return (cpumask_subset(parent->effective_cpus, xcpus) &&
+ partition_is_populated(parent, cs)) ||
+ (!cpumask_intersects(xcpus, cpu_active_mask) &&
+ partition_is_populated(cs, NULL));
+}
+
+static void reset_partition_data(struct cpuset *cs)
+{
+ struct cpuset *parent = parent_cs(cs);
+
+ if (!cpuset_v2())
+ return;
+
+ lockdep_assert_held(&callback_lock);
+
+ if (cpumask_empty(cs->exclusive_cpus)) {
+ cpumask_clear(cs->effective_xcpus);
+ if (is_cpu_exclusive(cs))
+ clear_bit(CS_CPU_EXCLUSIVE, &cs->flags);
+ }
+ if (!cpumask_and(cs->effective_cpus, parent->effective_cpus, cs->cpus_allowed))
+ cpumask_copy(cs->effective_cpus, parent->effective_cpus);
+}
+
+/*
+ * isolated_cpus_update - Update the isolated_cpus mask
+ * @old_prs: old partition_root_state
+ * @new_prs: new partition_root_state
+ * @xcpus: exclusive CPUs with state change
+ */
+static void isolated_cpus_update(int old_prs, int new_prs, struct cpumask *xcpus)
+{
+ WARN_ON_ONCE(old_prs == new_prs);
+ if (new_prs == PRS_ISOLATED)
+ cpumask_or(isolated_cpus, isolated_cpus, xcpus);
+ else
+ cpumask_andnot(isolated_cpus, isolated_cpus, xcpus);
+
+ isolated_cpus_updating = true;
+}
+
+/*
+ * partition_xcpus_add - Add new exclusive CPUs to partition
+ * @new_prs: new partition_root_state
+ * @parent: parent cpuset
+ * @xcpus: exclusive CPUs to be added
+ *
+ * Remote partition if parent == NULL
+ */
+static void partition_xcpus_add(int new_prs, struct cpuset *parent,
+ struct cpumask *xcpus)
+{
+ WARN_ON_ONCE(new_prs < 0);
+ lockdep_assert_held(&callback_lock);
+ if (!parent)
+ parent = &top_cpuset;
+
+
+ if (parent == &top_cpuset)
+ cpumask_or(subpartitions_cpus, subpartitions_cpus, xcpus);
+
+ if (new_prs != parent->partition_root_state)
+ isolated_cpus_update(parent->partition_root_state, new_prs,
+ xcpus);
+
+ cpumask_andnot(parent->effective_cpus, parent->effective_cpus, xcpus);
+}
+
+/*
+ * partition_xcpus_del - Remove exclusive CPUs from partition
+ * @old_prs: old partition_root_state
+ * @parent: parent cpuset
+ * @xcpus: exclusive CPUs to be removed
+ *
+ * Remote partition if parent == NULL
+ */
+static void partition_xcpus_del(int old_prs, struct cpuset *parent,
+ struct cpumask *xcpus)
+{
+ WARN_ON_ONCE(old_prs < 0);
+ lockdep_assert_held(&callback_lock);
+ if (!parent)
+ parent = &top_cpuset;
+
+ if (parent == &top_cpuset)
+ cpumask_andnot(subpartitions_cpus, subpartitions_cpus, xcpus);
+
+ if (old_prs != parent->partition_root_state)
+ isolated_cpus_update(old_prs, parent->partition_root_state,
+ xcpus);
+
+ cpumask_and(xcpus, xcpus, cpu_active_mask);
+ cpumask_or(parent->effective_cpus, parent->effective_cpus, xcpus);
+}
+
+/*
+ * isolated_cpus_can_update - check for isolated & nohz_full conflicts
+ * @add_cpus: cpu mask for cpus that are going to be isolated
+ * @del_cpus: cpu mask for cpus that are no longer isolated, can be NULL
+ * Return: false if there is conflict, true otherwise
+ *
+ * If nohz_full is enabled and we have isolated CPUs, their combination must
+ * still leave housekeeping CPUs.
+ *
+ * TBD: Should consider merging this function into
+ * prstate_housekeeping_conflict().
+ */
+static bool isolated_cpus_can_update(struct cpumask *add_cpus,
+ struct cpumask *del_cpus)
+{
+ cpumask_var_t full_hk_cpus;
+ int res = true;
+
+ if (!housekeeping_enabled(HK_TYPE_KERNEL_NOISE))
+ return true;
+
+ if (del_cpus && cpumask_weight_and(del_cpus,
+ housekeeping_cpumask(HK_TYPE_KERNEL_NOISE)))
+ return true;
+
+ if (!alloc_cpumask_var(&full_hk_cpus, GFP_KERNEL))
+ return false;
+
+ cpumask_and(full_hk_cpus, housekeeping_cpumask(HK_TYPE_KERNEL_NOISE),
+ housekeeping_cpumask(HK_TYPE_DOMAIN));
+ cpumask_andnot(full_hk_cpus, full_hk_cpus, isolated_cpus);
+ cpumask_and(full_hk_cpus, full_hk_cpus, cpu_active_mask);
+ if (!cpumask_weight_andnot(full_hk_cpus, add_cpus))
+ res = false;
+
+ free_cpumask_var(full_hk_cpus);
+ return res;
+}
+
+/*
+ * prstate_housekeeping_conflict - check for partition & housekeeping conflicts
+ * @prstate: partition root state to be checked
+ * @new_cpus: cpu mask
+ * Return: true if there is conflict, false otherwise
+ *
+ * CPUs outside of boot_hk_cpus, if defined, can only be used in an
+ * isolated partition.
+ */
+static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus)
+{
+ if (!have_boot_isolcpus)
+ return false;
+
+ if ((prstate != PRS_ISOLATED) && !cpumask_subset(new_cpus, boot_hk_cpus))
+ return true;
+
+ return false;
+}
+
+/*
+ * update_isolation_cpumasks - Update external isolation related CPU masks
+ *
+ * The following external CPU masks will be updated if necessary:
+ * - workqueue unbound cpumask
+ */
+static void update_isolation_cpumasks(void)
+{
+ int ret;
+
+ if (!isolated_cpus_updating)
+ return;
+
+ lockdep_assert_cpus_held();
+
+ ret = workqueue_unbound_exclude_cpumask(isolated_cpus);
+ WARN_ON_ONCE(ret < 0);
+
+ ret = tmigr_isolated_exclude_cpumask(isolated_cpus);
+ WARN_ON_ONCE(ret < 0);
+
+ isolated_cpus_updating = false;
+}
+
+/**
+ * cpuset_cpu_is_isolated - Check if the given CPU is isolated
+ * @cpu: the CPU number to be checked
+ * Return: true if CPU is used in an isolated partition, false otherwise
+ */
+bool cpuset_cpu_is_isolated(int cpu)
+{
+ return cpumask_test_cpu(cpu, isolated_cpus);
+}
+EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated);
+
+/**
+ * rm_siblings_excl_cpus - Remove exclusive CPUs that are used by sibling cpusets
+ * @parent: Parent cpuset containing all siblings
+ * @cs: Current cpuset (will be skipped)
+ * @excpus: exclusive effective CPU mask to modify
+ *
+ * This function ensures the given @excpus mask doesn't include any CPUs that
+ * are exclusively allocated to sibling cpusets. It walks through all siblings
+ * of @cs under @parent and removes their exclusive CPUs from @excpus.
+ */
+static int rm_siblings_excl_cpus(struct cpuset *parent, struct cpuset *cs,
+ struct cpumask *excpus)
+{
+ struct cgroup_subsys_state *css;
+ struct cpuset *sibling;
+ int retval = 0;
+
+ if (cpumask_empty(excpus))
+ return retval;
+
+ /*
+ * Exclude exclusive CPUs from siblings
+ */
+ rcu_read_lock();
+ cpuset_for_each_child(sibling, css, parent) {
+ if (sibling == cs)
+ continue;
+
+ if (cpumask_intersects(excpus, sibling->exclusive_cpus)) {
+ cpumask_andnot(excpus, excpus, sibling->exclusive_cpus);
+ retval++;
+ continue;
+ }
+ if (cpumask_intersects(excpus, sibling->effective_xcpus)) {
+ cpumask_andnot(excpus, excpus, sibling->effective_xcpus);
+ retval++;
+ }
+ }
+ rcu_read_unlock();
+
+ return retval;
+}
+
+/*
+ * compute_excpus - compute effective exclusive CPUs
+ * @cs: cpuset
+ * @xcpus: effective exclusive CPUs value to be set
+ * Return: 0 if there is no sibling conflict, > 0 otherwise
+ *
+ * If exclusive_cpus isn't explicitly set , we have to scan the sibling cpusets
+ * and exclude their exclusive_cpus or effective_xcpus as well.
+ */
+static int compute_excpus(struct cpuset *cs, struct cpumask *excpus)
+{
+ struct cpuset *parent = parent_cs(cs);
+
+ cpumask_and(excpus, user_xcpus(cs), parent->effective_xcpus);
+
+ if (!cpumask_empty(cs->exclusive_cpus))
+ return 0;
+
+ return rm_siblings_excl_cpus(parent, cs, excpus);
+}
+
+/*
+ * compute_trialcs_excpus - Compute effective exclusive CPUs for a trial cpuset
+ * @trialcs: The trial cpuset containing the proposed new configuration
+ * @cs: The original cpuset that the trial configuration is based on
+ * Return: 0 if successful with no sibling conflict, >0 if a conflict is found
+ *
+ * Computes the effective_xcpus for a trial configuration. @cs is provided to represent
+ * the real cs.
+ */
+static int compute_trialcs_excpus(struct cpuset *trialcs, struct cpuset *cs)
+{
+ struct cpuset *parent = parent_cs(trialcs);
+ struct cpumask *excpus = trialcs->effective_xcpus;
+
+ /* trialcs is member, cpuset.cpus has no impact to excpus */
+ if (cs_is_member(cs))
+ cpumask_and(excpus, trialcs->exclusive_cpus,
+ parent->effective_xcpus);
+ else
+ cpumask_and(excpus, user_xcpus(trialcs), parent->effective_xcpus);
+
+ return rm_siblings_excl_cpus(parent, cs, excpus);
+}
+
+static inline bool is_remote_partition(struct cpuset *cs)
+{
+ return cs->remote_partition;
+}
+
+static inline bool is_local_partition(struct cpuset *cs)
+{
+ return is_partition_valid(cs) && !is_remote_partition(cs);
+}
+
+/*
+ * remote_partition_enable - Enable current cpuset as a remote partition root
+ * @cs: the cpuset to update
+ * @new_prs: new partition_root_state
+ * @tmp: temporary masks
+ * Return: 0 if successful, errcode if error
+ *
+ * Enable the current cpuset to become a remote partition root taking CPUs
+ * directly from the top cpuset. cpuset_mutex must be held by the caller.
+ */
+static int remote_partition_enable(struct cpuset *cs, int new_prs,
+ struct tmpmasks *tmp)
+{
+ /*
+ * The user must have sysadmin privilege.
+ */
+ if (!capable(CAP_SYS_ADMIN))
+ return PERR_ACCESS;
+
+ /*
+ * The requested exclusive_cpus must not be allocated to other
+ * partitions and it can't use up all the root's effective_cpus.
+ *
+ * The effective_xcpus mask can contain offline CPUs, but there must
+ * be at least one or more online CPUs present before it can be enabled.
+ *
+ * Note that creating a remote partition with any local partition root
+ * above it or remote partition root underneath it is not allowed.
+ */
+ compute_excpus(cs, tmp->new_cpus);
+ WARN_ON_ONCE(cpumask_intersects(tmp->new_cpus, subpartitions_cpus));
+ if (!cpumask_intersects(tmp->new_cpus, cpu_active_mask) ||
+ cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus))
+ return PERR_INVCPUS;
+ if (((new_prs == PRS_ISOLATED) &&
+ !isolated_cpus_can_update(tmp->new_cpus, NULL)) ||
+ prstate_housekeeping_conflict(new_prs, tmp->new_cpus))
+ return PERR_HKEEPING;
+
+ spin_lock_irq(&callback_lock);
+ partition_xcpus_add(new_prs, NULL, tmp->new_cpus);
+ cs->remote_partition = true;
+ cpumask_copy(cs->effective_xcpus, tmp->new_cpus);
+ spin_unlock_irq(&callback_lock);
+ update_isolation_cpumasks();
+ cpuset_force_rebuild();
+ cs->prs_err = 0;
+
+ /*
+ * Propagate changes in top_cpuset's effective_cpus down the hierarchy.
+ */
+ cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
+ update_sibling_cpumasks(&top_cpuset, NULL, tmp);
+ return 0;
+}
+
+/*
+ * remote_partition_disable - Remove current cpuset from remote partition list
+ * @cs: the cpuset to update
+ * @tmp: temporary masks
+ *
+ * The effective_cpus is also updated.
+ *
+ * cpuset_mutex must be held by the caller.
+ */
+static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
+{
+ WARN_ON_ONCE(!is_remote_partition(cs));
+ WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus));
+
+ spin_lock_irq(&callback_lock);
+ cs->remote_partition = false;
+ partition_xcpus_del(cs->partition_root_state, NULL, cs->effective_xcpus);
+ if (cs->prs_err)
+ cs->partition_root_state = -cs->partition_root_state;
+ else
+ cs->partition_root_state = PRS_MEMBER;
+
+ /* effective_xcpus may need to be changed */
+ compute_excpus(cs, cs->effective_xcpus);
+ reset_partition_data(cs);
+ spin_unlock_irq(&callback_lock);
+ update_isolation_cpumasks();
+ cpuset_force_rebuild();
+
+ /*
+ * Propagate changes in top_cpuset's effective_cpus down the hierarchy.
+ */
+ cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
+ update_sibling_cpumasks(&top_cpuset, NULL, tmp);
+}
+
+/*
+ * remote_cpus_update - cpus_exclusive change of remote partition
+ * @cs: the cpuset to be updated
+ * @xcpus: the new exclusive_cpus mask, if non-NULL
+ * @excpus: the new effective_xcpus mask
+ * @tmp: temporary masks
+ *
+ * top_cpuset and subpartitions_cpus will be updated or partition can be
+ * invalidated.
+ */
+static void remote_cpus_update(struct cpuset *cs, struct cpumask *xcpus,
+ struct cpumask *excpus, struct tmpmasks *tmp)
+{
+ bool adding, deleting;
+ int prs = cs->partition_root_state;
+
+ if (WARN_ON_ONCE(!is_remote_partition(cs)))
+ return;
+
+ WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus));
+
+ if (cpumask_empty(excpus)) {
+ cs->prs_err = PERR_CPUSEMPTY;
+ goto invalidate;
+ }
+
+ adding = cpumask_andnot(tmp->addmask, excpus, cs->effective_xcpus);
+ deleting = cpumask_andnot(tmp->delmask, cs->effective_xcpus, excpus);
+
+ /*
+ * Additions of remote CPUs is only allowed if those CPUs are
+ * not allocated to other partitions and there are effective_cpus
+ * left in the top cpuset.
+ */
+ if (adding) {
+ WARN_ON_ONCE(cpumask_intersects(tmp->addmask, subpartitions_cpus));
+ if (!capable(CAP_SYS_ADMIN))
+ cs->prs_err = PERR_ACCESS;
+ else if (cpumask_intersects(tmp->addmask, subpartitions_cpus) ||
+ cpumask_subset(top_cpuset.effective_cpus, tmp->addmask))
+ cs->prs_err = PERR_NOCPUS;
+ else if ((prs == PRS_ISOLATED) &&
+ !isolated_cpus_can_update(tmp->addmask, tmp->delmask))
+ cs->prs_err = PERR_HKEEPING;
+ if (cs->prs_err)
+ goto invalidate;
+ }
+
+ spin_lock_irq(&callback_lock);
+ if (adding)
+ partition_xcpus_add(prs, NULL, tmp->addmask);
+ if (deleting)
+ partition_xcpus_del(prs, NULL, tmp->delmask);
+ /*
+ * Need to update effective_xcpus and exclusive_cpus now as
+ * update_sibling_cpumasks() below may iterate back to the same cs.
+ */
+ cpumask_copy(cs->effective_xcpus, excpus);
+ if (xcpus)
+ cpumask_copy(cs->exclusive_cpus, xcpus);
+ spin_unlock_irq(&callback_lock);
+ update_isolation_cpumasks();
+ if (adding || deleting)
+ cpuset_force_rebuild();
+
+ /*
+ * Propagate changes in top_cpuset's effective_cpus down the hierarchy.
+ */
+ cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
+ update_sibling_cpumasks(&top_cpuset, NULL, tmp);
+ return;
+
+invalidate:
+ remote_partition_disable(cs, tmp);
+}
+
+/**
+ * update_parent_effective_cpumask - update effective_cpus mask of parent cpuset
+ * @cs: The cpuset that requests change in partition root state
+ * @cmd: Partition root state change command
+ * @newmask: Optional new cpumask for partcmd_update
+ * @tmp: Temporary addmask and delmask
+ * Return: 0 or a partition root state error code
+ *
+ * For partcmd_enable*, the cpuset is being transformed from a non-partition
+ * root to a partition root. The effective_xcpus (cpus_allowed if
+ * effective_xcpus not set) mask of the given cpuset will be taken away from
+ * parent's effective_cpus. The function will return 0 if all the CPUs listed
+ * in effective_xcpus can be granted or an error code will be returned.
+ *
+ * For partcmd_disable, the cpuset is being transformed from a partition
+ * root back to a non-partition root. Any CPUs in effective_xcpus will be
+ * given back to parent's effective_cpus. 0 will always be returned.
+ *
+ * For partcmd_update, if the optional newmask is specified, the cpu list is
+ * to be changed from effective_xcpus to newmask. Otherwise, effective_xcpus is
+ * assumed to remain the same. The cpuset should either be a valid or invalid
+ * partition root. The partition root state may change from valid to invalid
+ * or vice versa. An error code will be returned if transitioning from
+ * invalid to valid violates the exclusivity rule.
+ *
+ * For partcmd_invalidate, the current partition will be made invalid.
+ *
+ * The partcmd_enable* and partcmd_disable commands are used by
+ * update_prstate(). An error code may be returned and the caller will check
+ * for error.
+ *
+ * The partcmd_update command is used by update_cpumasks_hier() with newmask
+ * NULL and update_cpumask() with newmask set. The partcmd_invalidate is used
+ * by update_cpumask() with NULL newmask. In both cases, the callers won't
+ * check for error and so partition_root_state and prs_err will be updated
+ * directly.
+ */
+static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
+ struct cpumask *newmask,
+ struct tmpmasks *tmp)
+{
+ struct cpuset *parent = parent_cs(cs);
+ int adding; /* Adding cpus to parent's effective_cpus */
+ int deleting; /* Deleting cpus from parent's effective_cpus */
+ int old_prs, new_prs;
+ int part_error = PERR_NONE; /* Partition error? */
+ struct cpumask *xcpus = user_xcpus(cs);
+ int parent_prs = parent->partition_root_state;
+ bool nocpu;
+
+ lockdep_assert_held(&cpuset_mutex);
+ WARN_ON_ONCE(is_remote_partition(cs)); /* For local partition only */
+
+ /*
+ * new_prs will only be changed for the partcmd_update and
+ * partcmd_invalidate commands.
+ */
+ adding = deleting = false;
+ old_prs = new_prs = cs->partition_root_state;
+
+ if (cmd == partcmd_invalidate) {
+ if (is_partition_invalid(cs))
+ return 0;
+
+ /*
+ * Make the current partition invalid.
+ */
+ if (is_partition_valid(parent))
+ adding = cpumask_and(tmp->addmask,
+ xcpus, parent->effective_xcpus);
+ if (old_prs > 0)
+ new_prs = -old_prs;
+
+ goto write_error;
+ }
+
+ /*
+ * The parent must be a partition root.
+ * The new cpumask, if present, or the current cpus_allowed must
+ * not be empty.
+ */
+ if (!is_partition_valid(parent)) {
+ return is_partition_invalid(parent)
+ ? PERR_INVPARENT : PERR_NOTPART;
+ }
+ if (!newmask && xcpus_empty(cs))
+ return PERR_CPUSEMPTY;
+
+ nocpu = tasks_nocpu_error(parent, cs, xcpus);
+
+ if ((cmd == partcmd_enable) || (cmd == partcmd_enablei)) {
+ /*
+ * Need to call compute_excpus() in case
+ * exclusive_cpus not set. Sibling conflict should only happen
+ * if exclusive_cpus isn't set.
+ */
+ xcpus = tmp->delmask;
+ if (compute_excpus(cs, xcpus))
+ WARN_ON_ONCE(!cpumask_empty(cs->exclusive_cpus));
+ new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED;
+
+ /*
+ * Enabling partition root is not allowed if its
+ * effective_xcpus is empty.
+ */
+ if (cpumask_empty(xcpus))
+ return PERR_INVCPUS;
+
+ if (prstate_housekeeping_conflict(new_prs, xcpus))
+ return PERR_HKEEPING;
+
+ if ((new_prs == PRS_ISOLATED) && (new_prs != parent_prs) &&
+ !isolated_cpus_can_update(xcpus, NULL))
+ return PERR_HKEEPING;
+
+ if (tasks_nocpu_error(parent, cs, xcpus))
+ return PERR_NOCPUS;
+
+ /*
+ * This function will only be called when all the preliminary
+ * checks have passed. At this point, the following condition
+ * should hold.
+ *
+ * (cs->effective_xcpus & cpu_active_mask) ⊆ parent->effective_cpus
+ *
+ * Warn if it is not the case.
+ */
+ cpumask_and(tmp->new_cpus, xcpus, cpu_active_mask);
+ WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, parent->effective_cpus));
+
+ deleting = true;
+ } else if (cmd == partcmd_disable) {
+ /*
+ * May need to add cpus back to parent's effective_cpus
+ * (and maybe removed from subpartitions_cpus/isolated_cpus)
+ * for valid partition root. xcpus may contain CPUs that
+ * shouldn't be removed from the two global cpumasks.
+ */
+ if (is_partition_valid(cs)) {
+ cpumask_copy(tmp->addmask, cs->effective_xcpus);
+ adding = true;
+ }
+ new_prs = PRS_MEMBER;
+ } else if (newmask) {
+ /*
+ * Empty cpumask is not allowed
+ */
+ if (cpumask_empty(newmask)) {
+ part_error = PERR_CPUSEMPTY;
+ goto write_error;
+ }
+
+ /* Check newmask again, whether cpus are available for parent/cs */
+ nocpu |= tasks_nocpu_error(parent, cs, newmask);
+
+ /*
+ * partcmd_update with newmask:
+ *
+ * Compute add/delete mask to/from effective_cpus
+ *
+ * For valid partition:
+ * addmask = exclusive_cpus & ~newmask
+ * & parent->effective_xcpus
+ * delmask = newmask & ~exclusive_cpus
+ * & parent->effective_xcpus
+ *
+ * For invalid partition:
+ * delmask = newmask & parent->effective_xcpus
+ * The partition may become valid soon.
+ */
+ if (is_partition_invalid(cs)) {
+ adding = false;
+ deleting = cpumask_and(tmp->delmask,
+ newmask, parent->effective_xcpus);
+ } else {
+ cpumask_andnot(tmp->addmask, xcpus, newmask);
+ adding = cpumask_and(tmp->addmask, tmp->addmask,
+ parent->effective_xcpus);
+
+ cpumask_andnot(tmp->delmask, newmask, xcpus);
+ deleting = cpumask_and(tmp->delmask, tmp->delmask,
+ parent->effective_xcpus);
+ }
+
+ /*
+ * TBD: Invalidate a currently valid child root partition may
+ * still break isolated_cpus_can_update() rule if parent is an
+ * isolated partition.
+ */
+ if (is_partition_valid(cs) && (old_prs != parent_prs)) {
+ if ((parent_prs == PRS_ROOT) &&
+ /* Adding to parent means removing isolated CPUs */
+ !isolated_cpus_can_update(tmp->delmask, tmp->addmask))
+ part_error = PERR_HKEEPING;
+ if ((parent_prs == PRS_ISOLATED) &&
+ /* Adding to parent means adding isolated CPUs */
+ !isolated_cpus_can_update(tmp->addmask, tmp->delmask))
+ part_error = PERR_HKEEPING;
+ }
+
+ /*
+ * The new CPUs to be removed from parent's effective CPUs
+ * must be present.
+ */
+ if (deleting) {
+ cpumask_and(tmp->new_cpus, tmp->delmask, cpu_active_mask);
+ WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, parent->effective_cpus));
+ }
+
+ /*
+ * Make partition invalid if parent's effective_cpus could
+ * become empty and there are tasks in the parent.
+ */
+ if (nocpu && (!adding ||
+ !cpumask_intersects(tmp->addmask, cpu_active_mask))) {
+ part_error = PERR_NOCPUS;
+ deleting = false;
+ adding = cpumask_and(tmp->addmask,
+ xcpus, parent->effective_xcpus);
+ }
+ } else {
+ /*
+ * partcmd_update w/o newmask
+ *
+ * delmask = effective_xcpus & parent->effective_cpus
+ *
+ * This can be called from:
+ * 1) update_cpumasks_hier()
+ * 2) cpuset_hotplug_update_tasks()
+ *
+ * Check to see if it can be transitioned from valid to
+ * invalid partition or vice versa.
+ *
+ * A partition error happens when parent has tasks and all
+ * its effective CPUs will have to be distributed out.
+ */
+ if (nocpu) {
+ part_error = PERR_NOCPUS;
+ if (is_partition_valid(cs))
+ adding = cpumask_and(tmp->addmask,
+ xcpus, parent->effective_xcpus);
+ } else if (is_partition_invalid(cs) && !cpumask_empty(xcpus) &&
+ cpumask_subset(xcpus, parent->effective_xcpus)) {
+ struct cgroup_subsys_state *css;
+ struct cpuset *child;
+ bool exclusive = true;
+
+ /*
+ * Convert invalid partition to valid has to
+ * pass the cpu exclusivity test.
+ */
+ rcu_read_lock();
+ cpuset_for_each_child(child, css, parent) {
+ if (child == cs)
+ continue;
+ if (!cpusets_are_exclusive(cs, child)) {
+ exclusive = false;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ if (exclusive)
+ deleting = cpumask_and(tmp->delmask,
+ xcpus, parent->effective_cpus);
+ else
+ part_error = PERR_NOTEXCL;
+ }
+ }
+
+write_error:
+ if (part_error)
+ WRITE_ONCE(cs->prs_err, part_error);
+
+ if (cmd == partcmd_update) {
+ /*
+ * Check for possible transition between valid and invalid
+ * partition root.
+ */
+ switch (cs->partition_root_state) {
+ case PRS_ROOT:
+ case PRS_ISOLATED:
+ if (part_error)
+ new_prs = -old_prs;
+ break;
+ case PRS_INVALID_ROOT:
+ case PRS_INVALID_ISOLATED:
+ if (!part_error)
+ new_prs = -old_prs;
+ break;
+ }
+ }
+
+ if (!adding && !deleting && (new_prs == old_prs))
+ return 0;
+
+ /*
+ * Transitioning between invalid to valid or vice versa may require
+ * changing CS_CPU_EXCLUSIVE. In the case of partcmd_update,
+ * validate_change() has already been successfully called and
+ * CPU lists in cs haven't been updated yet. So defer it to later.
+ */
+ if ((old_prs != new_prs) && (cmd != partcmd_update)) {
+ int err = update_partition_exclusive_flag(cs, new_prs);
+
+ if (err)
+ return err;
+ }
+
+ /*
+ * Change the parent's effective_cpus & effective_xcpus (top cpuset
+ * only).
+ *
+ * Newly added CPUs will be removed from effective_cpus and
+ * newly deleted ones will be added back to effective_cpus.
+ */
+ spin_lock_irq(&callback_lock);
+ if (old_prs != new_prs)
+ cs->partition_root_state = new_prs;
+
+ /*
+ * Adding to parent's effective_cpus means deletion CPUs from cs
+ * and vice versa.
+ */
+ if (adding)
+ partition_xcpus_del(old_prs, parent, tmp->addmask);
+ if (deleting)
+ partition_xcpus_add(new_prs, parent, tmp->delmask);
+
+ spin_unlock_irq(&callback_lock);
+ update_isolation_cpumasks();
+
+ if ((old_prs != new_prs) && (cmd == partcmd_update))
+ update_partition_exclusive_flag(cs, new_prs);
+
+ if (adding || deleting) {
+ cpuset_update_tasks_cpumask(parent, tmp->addmask);
+ update_sibling_cpumasks(parent, cs, tmp);
+ }
+
+ /*
+ * For partcmd_update without newmask, it is being called from
+ * cpuset_handle_hotplug(). Update the load balance flag and
+ * scheduling domain accordingly.
+ */
+ if ((cmd == partcmd_update) && !newmask)
+ update_partition_sd_lb(cs, old_prs);
+
+ notify_partition_change(cs, old_prs);
+ return 0;
+}
+
+/**
+ * compute_partition_effective_cpumask - compute effective_cpus for partition
+ * @cs: partition root cpuset
+ * @new_ecpus: previously computed effective_cpus to be updated
+ *
+ * Compute the effective_cpus of a partition root by scanning effective_xcpus
+ * of child partition roots and excluding their effective_xcpus.
+ *
+ * This has the side effect of invalidating valid child partition roots,
+ * if necessary. Since it is called from either cpuset_hotplug_update_tasks()
+ * or update_cpumasks_hier() where parent and children are modified
+ * successively, we don't need to call update_parent_effective_cpumask()
+ * and the child's effective_cpus will be updated in later iterations.
+ *
+ * Note that rcu_read_lock() is assumed to be held.
+ */
+static void compute_partition_effective_cpumask(struct cpuset *cs,
+ struct cpumask *new_ecpus)
+{
+ struct cgroup_subsys_state *css;
+ struct cpuset *child;
+ bool populated = partition_is_populated(cs, NULL);
+
+ /*
+ * Check child partition roots to see if they should be
+ * invalidated when
+ * 1) child effective_xcpus not a subset of new
+ * excluisve_cpus
+ * 2) All the effective_cpus will be used up and cp
+ * has tasks
+ */
+ compute_excpus(cs, new_ecpus);
+ cpumask_and(new_ecpus, new_ecpus, cpu_active_mask);
+
+ rcu_read_lock();
+ cpuset_for_each_child(child, css, cs) {
+ if (!is_partition_valid(child))
+ continue;
+
+ /*
+ * There shouldn't be a remote partition underneath another
+ * partition root.
+ */
+ WARN_ON_ONCE(is_remote_partition(child));
+ child->prs_err = 0;
+ if (!cpumask_subset(child->effective_xcpus,
+ cs->effective_xcpus))
+ child->prs_err = PERR_INVCPUS;
+ else if (populated &&
+ cpumask_subset(new_ecpus, child->effective_xcpus))
+ child->prs_err = PERR_NOCPUS;
+
+ if (child->prs_err) {
+ int old_prs = child->partition_root_state;
+
+ /*
+ * Invalidate child partition
+ */
+ spin_lock_irq(&callback_lock);
+ make_partition_invalid(child);
+ spin_unlock_irq(&callback_lock);
+ notify_partition_change(child, old_prs);
+ continue;
+ }
+ cpumask_andnot(new_ecpus, new_ecpus,
+ child->effective_xcpus);
+ }
+ rcu_read_unlock();
+}
+
+/*
+ * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
+ * @cs: the cpuset to consider
+ * @tmp: temp variables for calculating effective_cpus & partition setup
+ * @force: don't skip any descendant cpusets if set
+ *
+ * When configured cpumask is changed, the effective cpumasks of this cpuset
+ * and all its descendants need to be updated.
+ *
+ * On legacy hierarchy, effective_cpus will be the same with cpu_allowed.
+ *
+ * Called with cpuset_mutex held
+ */
+static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
+ bool force)
+{
+ struct cpuset *cp;
+ struct cgroup_subsys_state *pos_css;
+ int old_prs, new_prs;
+
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cp, pos_css, cs) {
+ struct cpuset *parent = parent_cs(cp);
+ bool remote = is_remote_partition(cp);
+ bool update_parent = false;
+
+ old_prs = new_prs = cp->partition_root_state;
+
+ /*
+ * For child remote partition root (!= cs), we need to call
+ * remote_cpus_update() if effective_xcpus will be changed.
+ * Otherwise, we can skip the whole subtree.
+ *
+ * remote_cpus_update() will reuse tmp->new_cpus only after
+ * its value is being processed.
+ */
+ if (remote && (cp != cs)) {
+ compute_excpus(cp, tmp->new_cpus);
+ if (cpumask_equal(cp->effective_xcpus, tmp->new_cpus)) {
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
+ }
+ rcu_read_unlock();
+ remote_cpus_update(cp, NULL, tmp->new_cpus, tmp);
+ rcu_read_lock();
+
+ /* Remote partition may be invalidated */
+ new_prs = cp->partition_root_state;
+ remote = (new_prs == old_prs);
+ }
+
+ if (remote || (is_partition_valid(parent) && is_partition_valid(cp)))
+ compute_partition_effective_cpumask(cp, tmp->new_cpus);
+ else
+ compute_effective_cpumask(tmp->new_cpus, cp, parent);
+
+ if (remote)
+ goto get_css; /* Ready to update cpuset data */
+
+ /*
+ * A partition with no effective_cpus is allowed as long as
+ * there is no task associated with it. Call
+ * update_parent_effective_cpumask() to check it.
+ */
+ if (is_partition_valid(cp) && cpumask_empty(tmp->new_cpus)) {
+ update_parent = true;
+ goto update_parent_effective;
+ }
+
+ /*
+ * If it becomes empty, inherit the effective mask of the
+ * parent, which is guaranteed to have some CPUs unless
+ * it is a partition root that has explicitly distributed
+ * out all its CPUs.
+ */
+ if (is_in_v2_mode() && !remote && cpumask_empty(tmp->new_cpus))
+ cpumask_copy(tmp->new_cpus, parent->effective_cpus);
+
+ /*
+ * Skip the whole subtree if
+ * 1) the cpumask remains the same,
+ * 2) has no partition root state,
+ * 3) force flag not set, and
+ * 4) for v2 load balance state same as its parent.
+ */
+ if (!cp->partition_root_state && !force &&
+ cpumask_equal(tmp->new_cpus, cp->effective_cpus) &&
+ (!cpuset_v2() ||
+ (is_sched_load_balance(parent) == is_sched_load_balance(cp)))) {
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
+ }
+
+update_parent_effective:
+ /*
+ * update_parent_effective_cpumask() should have been called
+ * for cs already in update_cpumask(). We should also call
+ * cpuset_update_tasks_cpumask() again for tasks in the parent
+ * cpuset if the parent's effective_cpus changes.
+ */
+ if ((cp != cs) && old_prs) {
+ switch (parent->partition_root_state) {
+ case PRS_ROOT:
+ case PRS_ISOLATED:
+ update_parent = true;
+ break;
+
+ default:
+ /*
+ * When parent is not a partition root or is
+ * invalid, child partition roots become
+ * invalid too.
+ */
+ if (is_partition_valid(cp))
+ new_prs = -cp->partition_root_state;
+ WRITE_ONCE(cp->prs_err,
+ is_partition_invalid(parent)
+ ? PERR_INVPARENT : PERR_NOTPART);
+ break;
+ }
+ }
+get_css:
+ if (!css_tryget_online(&cp->css))
+ continue;
+ rcu_read_unlock();
+
+ if (update_parent) {
+ update_parent_effective_cpumask(cp, partcmd_update, NULL, tmp);
+ /*
+ * The cpuset partition_root_state may become
+ * invalid. Capture it.
+ */
+ new_prs = cp->partition_root_state;
+ }
+
+ spin_lock_irq(&callback_lock);
+ cpumask_copy(cp->effective_cpus, tmp->new_cpus);
+ cp->partition_root_state = new_prs;
+ if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs))
+ compute_excpus(cp, cp->effective_xcpus);
+
+ /*
+ * Make sure effective_xcpus is properly set for a valid
+ * partition root.
+ */
+ if ((new_prs > 0) && cpumask_empty(cp->exclusive_cpus))
+ cpumask_and(cp->effective_xcpus,
+ cp->cpus_allowed, parent->effective_xcpus);
+ else if (new_prs < 0)
+ reset_partition_data(cp);
+ spin_unlock_irq(&callback_lock);
+
+ notify_partition_change(cp, old_prs);
+
+ WARN_ON(!is_in_v2_mode() &&
+ !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
+
+ cpuset_update_tasks_cpumask(cp, cp->effective_cpus);
+
+ /*
+ * On default hierarchy, inherit the CS_SCHED_LOAD_BALANCE
+ * from parent if current cpuset isn't a valid partition root
+ * and their load balance states differ.
+ */
+ if (cpuset_v2() && !is_partition_valid(cp) &&
+ (is_sched_load_balance(parent) != is_sched_load_balance(cp))) {
+ if (is_sched_load_balance(parent))
+ set_bit(CS_SCHED_LOAD_BALANCE, &cp->flags);
+ else
+ clear_bit(CS_SCHED_LOAD_BALANCE, &cp->flags);
+ }
+
+ /*
+ * On legacy hierarchy, if the effective cpumask of any non-
+ * empty cpuset is changed, we need to rebuild sched domains.
+ * On default hierarchy, the cpuset needs to be a partition
+ * root as well.
+ */
+ if (!cpumask_empty(cp->cpus_allowed) &&
+ is_sched_load_balance(cp) &&
+ (!cpuset_v2() || is_partition_valid(cp)))
+ cpuset_force_rebuild();
+
+ rcu_read_lock();
+ css_put(&cp->css);
+ }
+ rcu_read_unlock();
+}
+
+/**
+ * update_sibling_cpumasks - Update siblings cpumasks
+ * @parent: Parent cpuset
+ * @cs: Current cpuset
+ * @tmp: Temp variables
+ */
+static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
+ struct tmpmasks *tmp)
+{
+ struct cpuset *sibling;
+ struct cgroup_subsys_state *pos_css;
+
+ lockdep_assert_held(&cpuset_mutex);
+
+ /*
+ * Check all its siblings and call update_cpumasks_hier()
+ * if their effective_cpus will need to be changed.
+ *
+ * It is possible a change in parent's effective_cpus
+ * due to a change in a child partition's effective_xcpus will impact
+ * its siblings even if they do not inherit parent's effective_cpus
+ * directly.
+ *
+ * The update_cpumasks_hier() function may sleep. So we have to
+ * release the RCU read lock before calling it.
+ */
+ rcu_read_lock();
+ cpuset_for_each_child(sibling, pos_css, parent) {
+ if (sibling == cs)
+ continue;
+ if (!is_partition_valid(sibling)) {
+ compute_effective_cpumask(tmp->new_cpus, sibling,
+ parent);
+ if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus))
+ continue;
+ } else if (is_remote_partition(sibling)) {
+ /*
+ * Change in a sibling cpuset won't affect a remote
+ * partition root.
+ */
+ continue;
+ }
+
+ if (!css_tryget_online(&sibling->css))
+ continue;
+
+ rcu_read_unlock();
+ update_cpumasks_hier(sibling, tmp, false);
+ rcu_read_lock();
+ css_put(&sibling->css);
+ }
+ rcu_read_unlock();
+}
+
+static int parse_cpuset_cpulist(const char *buf, struct cpumask *out_mask)
+{
+ int retval;
+
+ retval = cpulist_parse(buf, out_mask);
+ if (retval < 0)
+ return retval;
+ if (!cpumask_subset(out_mask, top_cpuset.cpus_allowed))
+ return -EINVAL;
+
+ return 0;
+}
+
+/**
+ * validate_partition - Validate a cpuset partition configuration
+ * @cs: The cpuset to validate
+ * @trialcs: The trial cpuset containing proposed configuration changes
+ *
+ * If any validation check fails, the appropriate error code is set in the
+ * cpuset's prs_err field.
+ *
+ * Return: PRS error code (0 if valid, non-zero error code if invalid)
+ */
+static enum prs_errcode validate_partition(struct cpuset *cs, struct cpuset *trialcs)
+{
+ struct cpuset *parent = parent_cs(cs);
+
+ if (cs_is_member(trialcs))
+ return PERR_NONE;
+
+ if (cpumask_empty(trialcs->effective_xcpus))
+ return PERR_INVCPUS;
+
+ if (prstate_housekeeping_conflict(trialcs->partition_root_state,
+ trialcs->effective_xcpus))
+ return PERR_HKEEPING;
+
+ if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus))
+ return PERR_NOCPUS;
+
+ return PERR_NONE;
+}
+
+static int cpus_allowed_validate_change(struct cpuset *cs, struct cpuset *trialcs,
+ struct tmpmasks *tmp)
+{
+ int retval;
+ struct cpuset *parent = parent_cs(cs);
+
+ retval = validate_change(cs, trialcs);
+
+ if ((retval == -EINVAL) && cpuset_v2()) {
+ struct cgroup_subsys_state *css;
+ struct cpuset *cp;
+
+ /*
+ * The -EINVAL error code indicates that partition sibling
+ * CPU exclusivity rule has been violated. We still allow
+ * the cpumask change to proceed while invalidating the
+ * partition. However, any conflicting sibling partitions
+ * have to be marked as invalid too.
+ */
+ trialcs->prs_err = PERR_NOTEXCL;
+ rcu_read_lock();
+ cpuset_for_each_child(cp, css, parent) {
+ struct cpumask *xcpus = user_xcpus(trialcs);
+
+ if (is_partition_valid(cp) &&
+ cpumask_intersects(xcpus, cp->effective_xcpus)) {
+ rcu_read_unlock();
+ update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, tmp);
+ rcu_read_lock();
+ }
+ }
+ rcu_read_unlock();
+ retval = 0;
+ }
+ return retval;
+}
+
+/**
+ * partition_cpus_change - Handle partition state changes due to CPU mask updates
+ * @cs: The target cpuset being modified
+ * @trialcs: The trial cpuset containing proposed configuration changes
+ * @tmp: Temporary masks for intermediate calculations
+ *
+ * This function handles partition state transitions triggered by CPU mask changes.
+ * CPU modifications may cause a partition to be disabled or require state updates.
+ */
+static void partition_cpus_change(struct cpuset *cs, struct cpuset *trialcs,
+ struct tmpmasks *tmp)
+{
+ enum prs_errcode prs_err;
+
+ if (cs_is_member(cs))
+ return;
+
+ prs_err = validate_partition(cs, trialcs);
+ if (prs_err)
+ trialcs->prs_err = cs->prs_err = prs_err;
+
+ if (is_remote_partition(cs)) {
+ if (trialcs->prs_err)
+ remote_partition_disable(cs, tmp);
+ else
+ remote_cpus_update(cs, trialcs->exclusive_cpus,
+ trialcs->effective_xcpus, tmp);
+ } else {
+ if (trialcs->prs_err)
+ update_parent_effective_cpumask(cs, partcmd_invalidate,
+ NULL, tmp);
+ else
+ update_parent_effective_cpumask(cs, partcmd_update,
+ trialcs->effective_xcpus, tmp);
+ }
+}
+
+/**
+ * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
+ * @cs: the cpuset to consider
+ * @trialcs: trial cpuset
+ * @buf: buffer of cpu numbers written to this cpuset
+ */
+static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
+ const char *buf)
+{
+ int retval;
+ struct tmpmasks tmp;
+ bool force = false;
+ int old_prs = cs->partition_root_state;
+
+ retval = parse_cpuset_cpulist(buf, trialcs->cpus_allowed);
+ if (retval < 0)
+ return retval;
+
+ /* Nothing to do if the cpus didn't change */
+ if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
+ return 0;
+
+ if (alloc_tmpmasks(&tmp))
+ return -ENOMEM;
+
+ compute_trialcs_excpus(trialcs, cs);
+ trialcs->prs_err = PERR_NONE;
+
+ retval = cpus_allowed_validate_change(cs, trialcs, &tmp);
+ if (retval < 0)
+ goto out_free;
+
+ /*
+ * Check all the descendants in update_cpumasks_hier() if
+ * effective_xcpus is to be changed.
+ */
+ force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus);
+
+ partition_cpus_change(cs, trialcs, &tmp);
+
+ spin_lock_irq(&callback_lock);
+ cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
+ cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus);
+ if ((old_prs > 0) && !is_partition_valid(cs))
+ reset_partition_data(cs);
+ spin_unlock_irq(&callback_lock);
+
+ /* effective_cpus/effective_xcpus will be updated here */
+ update_cpumasks_hier(cs, &tmp, force);
+
+ /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */
+ if (cs->partition_root_state)
+ update_partition_sd_lb(cs, old_prs);
+out_free:
+ free_tmpmasks(&tmp);
+ return retval;
+}
+
+/**
+ * update_exclusive_cpumask - update the exclusive_cpus mask of a cpuset
+ * @cs: the cpuset to consider
+ * @trialcs: trial cpuset
+ * @buf: buffer of cpu numbers written to this cpuset
+ *
+ * The tasks' cpumask will be updated if cs is a valid partition root.
+ */
+static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
+ const char *buf)
+{
+ int retval;
+ struct tmpmasks tmp;
+ bool force = false;
+ int old_prs = cs->partition_root_state;
+
+ retval = parse_cpuset_cpulist(buf, trialcs->exclusive_cpus);
+ if (retval < 0)
+ return retval;
+
+ /* Nothing to do if the CPUs didn't change */
+ if (cpumask_equal(cs->exclusive_cpus, trialcs->exclusive_cpus))
+ return 0;
+
+ /*
+ * Reject the change if there is exclusive CPUs conflict with
+ * the siblings.
+ */
+ if (compute_trialcs_excpus(trialcs, cs))
+ return -EINVAL;
+
+ /*
+ * Check all the descendants in update_cpumasks_hier() if
+ * effective_xcpus is to be changed.
+ */
+ force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus);
+
+ retval = validate_change(cs, trialcs);
+ if (retval)
+ return retval;
+
+ if (alloc_tmpmasks(&tmp))
+ return -ENOMEM;
+
+ trialcs->prs_err = PERR_NONE;
+ partition_cpus_change(cs, trialcs, &tmp);
+
+ spin_lock_irq(&callback_lock);
+ cpumask_copy(cs->exclusive_cpus, trialcs->exclusive_cpus);
+ cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus);
+ if ((old_prs > 0) && !is_partition_valid(cs))
+ reset_partition_data(cs);
+ spin_unlock_irq(&callback_lock);
+
+ /*
+ * Call update_cpumasks_hier() to update effective_cpus/effective_xcpus
+ * of the subtree when it is a valid partition root or effective_xcpus
+ * is updated.
+ */
+ if (is_partition_valid(cs) || force)
+ update_cpumasks_hier(cs, &tmp, force);
+
+ /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */
+ if (cs->partition_root_state)
+ update_partition_sd_lb(cs, old_prs);
+
+ free_tmpmasks(&tmp);
+ return 0;
+}
+
+/*
+ * Migrate memory region from one set of nodes to another. This is
+ * performed asynchronously as it can be called from process migration path
+ * holding locks involved in process management. All mm migrations are
+ * performed in the queued order and can be waited for by flushing
+ * cpuset_migrate_mm_wq.
+ */
+
+struct cpuset_migrate_mm_work {
+ struct work_struct work;
+ struct mm_struct *mm;
+ nodemask_t from;
+ nodemask_t to;
+};
+
+static void cpuset_migrate_mm_workfn(struct work_struct *work)
+{
+ struct cpuset_migrate_mm_work *mwork =
+ container_of(work, struct cpuset_migrate_mm_work, work);
+
+ /* on a wq worker, no need to worry about %current's mems_allowed */
+ do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL);
+ mmput(mwork->mm);
+ kfree(mwork);
+}
+
+static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
+ const nodemask_t *to)
+{
+ struct cpuset_migrate_mm_work *mwork;
+
+ if (nodes_equal(*from, *to)) {
+ mmput(mm);
+ return;
+ }
+
+ mwork = kzalloc(sizeof(*mwork), GFP_KERNEL);
+ if (mwork) {
+ mwork->mm = mm;
+ mwork->from = *from;
+ mwork->to = *to;
+ INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn);
+ queue_work(cpuset_migrate_mm_wq, &mwork->work);
+ } else {
+ mmput(mm);
+ }
+}
+
+static void flush_migrate_mm_task_workfn(struct callback_head *head)
+{
+ flush_workqueue(cpuset_migrate_mm_wq);
+ kfree(head);
+}
+
+static void schedule_flush_migrate_mm(void)
+{
+ struct callback_head *flush_cb;
+
+ flush_cb = kzalloc(sizeof(struct callback_head), GFP_KERNEL);
+ if (!flush_cb)
+ return;
+
+ init_task_work(flush_cb, flush_migrate_mm_task_workfn);
+
+ if (task_work_add(current, flush_cb, TWA_RESUME))
+ kfree(flush_cb);
+}
+
+/*
+ * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
+ * @tsk: the task to change
+ * @newmems: new nodes that the task will be set
+ *
+ * We use the mems_allowed_seq seqlock to safely update both tsk->mems_allowed
+ * and rebind an eventual tasks' mempolicy. If the task is allocating in
+ * parallel, it might temporarily see an empty intersection, which results in
+ * a seqlock check and retry before OOM or allocation failure.
+ */
+static void cpuset_change_task_nodemask(struct task_struct *tsk,
+ nodemask_t *newmems)
+{
+ task_lock(tsk);
+
+ local_irq_disable();
+ write_seqcount_begin(&tsk->mems_allowed_seq);
+
+ nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
+ mpol_rebind_task(tsk, newmems);
+ tsk->mems_allowed = *newmems;
+
+ write_seqcount_end(&tsk->mems_allowed_seq);
+ local_irq_enable();
+
+ task_unlock(tsk);
+}
+
+static void *cpuset_being_rebound;
+
+/**
+ * cpuset_update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
+ * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
+ *
+ * Iterate through each task of @cs updating its mems_allowed to the
+ * effective cpuset's. As this function is called with cpuset_mutex held,
+ * cpuset membership stays stable.
+ */
+void cpuset_update_tasks_nodemask(struct cpuset *cs)
+{
+ static nodemask_t newmems; /* protected by cpuset_mutex */
+ struct css_task_iter it;
+ struct task_struct *task;
+
+ cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
+
+ guarantee_online_mems(cs, &newmems);
+
+ /*
+ * The mpol_rebind_mm() call takes mmap_lock, which we couldn't
+ * take while holding tasklist_lock. Forks can happen - the
+ * mpol_dup() cpuset_being_rebound check will catch such forks,
+ * and rebind their vma mempolicies too. Because we still hold
+ * the global cpuset_mutex, we know that no other rebind effort
+ * will be contending for the global variable cpuset_being_rebound.
+ * It's ok if we rebind the same mm twice; mpol_rebind_mm()
+ * is idempotent. Also migrate pages in each mm to new nodes.
+ */
+ css_task_iter_start(&cs->css, 0, &it);
+ while ((task = css_task_iter_next(&it))) {
+ struct mm_struct *mm;
+ bool migrate;
+
+ cpuset_change_task_nodemask(task, &newmems);
+
+ mm = get_task_mm(task);
+ if (!mm)
+ continue;
+
+ migrate = is_memory_migrate(cs);
+
+ mpol_rebind_mm(mm, &cs->mems_allowed);
+ if (migrate)
+ cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
+ else
+ mmput(mm);
+ }
+ css_task_iter_end(&it);
+
+ /*
+ * All the tasks' nodemasks have been updated, update
+ * cs->old_mems_allowed.
+ */
+ cs->old_mems_allowed = newmems;
+
+ /* We're done rebinding vmas to this cpuset's new mems_allowed. */
+ cpuset_being_rebound = NULL;
+}
+
+/*
+ * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree
+ * @cs: the cpuset to consider
+ * @new_mems: a temp variable for calculating new effective_mems
+ *
+ * When configured nodemask is changed, the effective nodemasks of this cpuset
+ * and all its descendants need to be updated.
+ *
+ * On legacy hierarchy, effective_mems will be the same with mems_allowed.
+ *
+ * Called with cpuset_mutex held
+ */
+static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
+{
+ struct cpuset *cp;
+ struct cgroup_subsys_state *pos_css;
+
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cp, pos_css, cs) {
+ struct cpuset *parent = parent_cs(cp);
+
+ nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);
+
+ /*
+ * If it becomes empty, inherit the effective mask of the
+ * parent, which is guaranteed to have some MEMs.
+ */
+ if (is_in_v2_mode() && nodes_empty(*new_mems))
+ *new_mems = parent->effective_mems;
+
+ /* Skip the whole subtree if the nodemask remains the same. */
+ if (nodes_equal(*new_mems, cp->effective_mems)) {
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
+ }
+
+ if (!css_tryget_online(&cp->css))
+ continue;
+ rcu_read_unlock();
+
+ spin_lock_irq(&callback_lock);
+ cp->effective_mems = *new_mems;
+ spin_unlock_irq(&callback_lock);
+
+ WARN_ON(!is_in_v2_mode() &&
+ !nodes_equal(cp->mems_allowed, cp->effective_mems));
+
+ cpuset_update_tasks_nodemask(cp);
+
+ rcu_read_lock();
+ css_put(&cp->css);
+ }
+ rcu_read_unlock();
+}
+
+/*
+ * Handle user request to change the 'mems' memory placement
+ * of a cpuset. Needs to validate the request, update the
+ * cpusets mems_allowed, and for each task in the cpuset,
+ * update mems_allowed and rebind task's mempolicy and any vma
+ * mempolicies and if the cpuset is marked 'memory_migrate',
+ * migrate the tasks pages to the new memory.
+ *
+ * Call with cpuset_mutex held. May take callback_lock during call.
+ * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
+ * lock each such tasks mm->mmap_lock, scan its vma's and rebind
+ * their mempolicies to the cpusets new mems_allowed.
+ */
+static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
+ const char *buf)
+{
+ int retval;
+
+ /*
+ * An empty mems_allowed is ok iff there are no tasks in the cpuset.
+ * The validate_change() call ensures that cpusets with tasks have memory.
+ */
+ retval = nodelist_parse(buf, trialcs->mems_allowed);
+ if (retval < 0)
+ return retval;
+
+ if (!nodes_subset(trialcs->mems_allowed,
+ top_cpuset.mems_allowed))
+ return -EINVAL;
+
+ /* No change? nothing to do */
+ if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed))
+ return 0;
+
+ retval = validate_change(cs, trialcs);
+ if (retval < 0)
+ return retval;
+
+ check_insane_mems_config(&trialcs->mems_allowed);
+
+ spin_lock_irq(&callback_lock);
+ cs->mems_allowed = trialcs->mems_allowed;
+ spin_unlock_irq(&callback_lock);
+
+ /* use trialcs->mems_allowed as a temp variable */
+ update_nodemasks_hier(cs, &trialcs->mems_allowed);
+ return 0;
+}
+
+bool current_cpuset_is_being_rebound(void)
+{
+ bool ret;
+
+ rcu_read_lock();
+ ret = task_cs(current) == cpuset_being_rebound;
+ rcu_read_unlock();
+
+ return ret;
+}
+
+/*
+ * cpuset_update_flag - read a 0 or a 1 in a file and update associated flag
+ * bit: the bit to update (see cpuset_flagbits_t)
+ * cs: the cpuset to update
+ * turning_on: whether the flag is being set or cleared
+ *
+ * Call with cpuset_mutex held.
+ */
+
+int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
+ int turning_on)
+{
+ struct cpuset *trialcs;
+ int balance_flag_changed;
+ int spread_flag_changed;
+ int err;
+
+ trialcs = dup_or_alloc_cpuset(cs);
+ if (!trialcs)
+ return -ENOMEM;
+
+ if (turning_on)
+ set_bit(bit, &trialcs->flags);
+ else
+ clear_bit(bit, &trialcs->flags);
+
+ err = validate_change(cs, trialcs);
+ if (err < 0)
+ goto out;
+
+ balance_flag_changed = (is_sched_load_balance(cs) !=
+ is_sched_load_balance(trialcs));
+
+ spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
+ || (is_spread_page(cs) != is_spread_page(trialcs)));
+
+ spin_lock_irq(&callback_lock);
+ cs->flags = trialcs->flags;
+ spin_unlock_irq(&callback_lock);
+
+ if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) {
+ if (cpuset_v2())
+ cpuset_force_rebuild();
+ else
+ rebuild_sched_domains_locked();
+ }
+
+ if (spread_flag_changed)
+ cpuset1_update_tasks_flags(cs);
+out:
+ free_cpuset(trialcs);
+ return err;
+}
+
+/**
+ * update_prstate - update partition_root_state
+ * @cs: the cpuset to update
+ * @new_prs: new partition root state
+ * Return: 0 if successful, != 0 if error
+ *
+ * Call with cpuset_mutex held.
+ */
+static int update_prstate(struct cpuset *cs, int new_prs)
+{
+ int err = PERR_NONE, old_prs = cs->partition_root_state;
+ struct cpuset *parent = parent_cs(cs);
+ struct tmpmasks tmpmask;
+ bool isolcpus_updated = false;
+
+ if (old_prs == new_prs)
+ return 0;
+
+ /*
+ * Treat a previously invalid partition root as if it is a "member".
+ */
+ if (new_prs && is_partition_invalid(cs))
+ old_prs = PRS_MEMBER;
+
+ if (alloc_tmpmasks(&tmpmask))
+ return -ENOMEM;
+
+ err = update_partition_exclusive_flag(cs, new_prs);
+ if (err)
+ goto out;
+
+ if (!old_prs) {
+ /*
+ * cpus_allowed and exclusive_cpus cannot be both empty.
+ */
+ if (xcpus_empty(cs)) {
+ err = PERR_CPUSEMPTY;
+ goto out;
+ }
+
+ /*
+ * We don't support the creation of a new local partition with
+ * a remote partition underneath it. This unsupported
+ * setting can happen only if parent is the top_cpuset because
+ * a remote partition cannot be created underneath an existing
+ * local or remote partition.
+ */
+ if ((parent == &top_cpuset) &&
+ cpumask_intersects(cs->exclusive_cpus, subpartitions_cpus)) {
+ err = PERR_REMOTE;
+ goto out;
+ }
+
+ /*
+ * If parent is valid partition, enable local partiion.
+ * Otherwise, enable a remote partition.
+ */
+ if (is_partition_valid(parent)) {
+ enum partition_cmd cmd = (new_prs == PRS_ROOT)
+ ? partcmd_enable : partcmd_enablei;
+
+ err = update_parent_effective_cpumask(cs, cmd, NULL, &tmpmask);
+ } else {
+ err = remote_partition_enable(cs, new_prs, &tmpmask);
+ }
+ } else if (old_prs && new_prs) {
+ /*
+ * A change in load balance state only, no change in cpumasks.
+ * Need to update isolated_cpus.
+ */
+ if (((new_prs == PRS_ISOLATED) &&
+ !isolated_cpus_can_update(cs->effective_xcpus, NULL)) ||
+ prstate_housekeeping_conflict(new_prs, cs->effective_xcpus))
+ err = PERR_HKEEPING;
+ else
+ isolcpus_updated = true;
+ } else {
+ /*
+ * Switching back to member is always allowed even if it
+ * disables child partitions.
+ */
+ if (is_remote_partition(cs))
+ remote_partition_disable(cs, &tmpmask);
+ else
+ update_parent_effective_cpumask(cs, partcmd_disable,
+ NULL, &tmpmask);
+
+ /*
+ * Invalidation of child partitions will be done in
+ * update_cpumasks_hier().
+ */
+ }
+out:
+ /*
+ * Make partition invalid & disable CS_CPU_EXCLUSIVE if an error
+ * happens.
+ */
+ if (err) {
+ new_prs = -new_prs;
+ update_partition_exclusive_flag(cs, new_prs);
+ }
+
+ spin_lock_irq(&callback_lock);
+ cs->partition_root_state = new_prs;
+ WRITE_ONCE(cs->prs_err, err);
+ if (!is_partition_valid(cs))
+ reset_partition_data(cs);
+ else if (isolcpus_updated)
+ isolated_cpus_update(old_prs, new_prs, cs->effective_xcpus);
+ spin_unlock_irq(&callback_lock);
+ update_isolation_cpumasks();
+
+ /* Force update if switching back to member & update effective_xcpus */
+ update_cpumasks_hier(cs, &tmpmask, !new_prs);
+
+ /* A newly created partition must have effective_xcpus set */
+ WARN_ON_ONCE(!old_prs && (new_prs > 0)
+ && cpumask_empty(cs->effective_xcpus));
+
+ /* Update sched domains and load balance flag */
+ update_partition_sd_lb(cs, old_prs);
+
+ notify_partition_change(cs, old_prs);
+ if (force_sd_rebuild)
+ rebuild_sched_domains_locked();
+ free_tmpmasks(&tmpmask);
+ return 0;
+}
+
+static struct cpuset *cpuset_attach_old_cs;
+
+/*
+ * Check to see if a cpuset can accept a new task
+ * For v1, cpus_allowed and mems_allowed can't be empty.
+ * For v2, effective_cpus can't be empty.
+ * Note that in v1, effective_cpus = cpus_allowed.
+ */
+static int cpuset_can_attach_check(struct cpuset *cs)
+{
+ if (cpumask_empty(cs->effective_cpus) ||
+ (!is_in_v2_mode() && nodes_empty(cs->mems_allowed)))
+ return -ENOSPC;
+ return 0;
+}
+
+static void reset_migrate_dl_data(struct cpuset *cs)
+{
+ cs->nr_migrate_dl_tasks = 0;
+ cs->sum_migrate_dl_bw = 0;
+}
+
+/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
+static int cpuset_can_attach(struct cgroup_taskset *tset)
+{
+ struct cgroup_subsys_state *css;
+ struct cpuset *cs, *oldcs;
+ struct task_struct *task;
+ bool cpus_updated, mems_updated;
+ int ret;
+
+ /* used later by cpuset_attach() */
+ cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
+ oldcs = cpuset_attach_old_cs;
+ cs = css_cs(css);
+
+ mutex_lock(&cpuset_mutex);
+
+ /* Check to see if task is allowed in the cpuset */
+ ret = cpuset_can_attach_check(cs);
+ if (ret)
+ goto out_unlock;
+
+ cpus_updated = !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus);
+ mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);
+
+ cgroup_taskset_for_each(task, css, tset) {
+ ret = task_can_attach(task);
+ if (ret)
+ goto out_unlock;
+
+ /*
+ * Skip rights over task check in v2 when nothing changes,
+ * migration permission derives from hierarchy ownership in
+ * cgroup_procs_write_permission()).
+ */
+ if (!cpuset_v2() || (cpus_updated || mems_updated)) {
+ ret = security_task_setscheduler(task);
+ if (ret)
+ goto out_unlock;
+ }
+
+ if (dl_task(task)) {
+ cs->nr_migrate_dl_tasks++;
+ cs->sum_migrate_dl_bw += task->dl.dl_bw;
+ }
+ }
+
+ if (!cs->nr_migrate_dl_tasks)
+ goto out_success;
+
+ if (!cpumask_intersects(oldcs->effective_cpus, cs->effective_cpus)) {
+ int cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus);
+
+ if (unlikely(cpu >= nr_cpu_ids)) {
+ reset_migrate_dl_data(cs);
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw);
+ if (ret) {
+ reset_migrate_dl_data(cs);
+ goto out_unlock;
+ }
+ }
+
+out_success:
+ /*
+ * Mark attach is in progress. This makes validate_change() fail
+ * changes which zero cpus/mems_allowed.
+ */
+ cs->attach_in_progress++;
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
+ return ret;
+}
+
+static void cpuset_cancel_attach(struct cgroup_taskset *tset)
+{
+ struct cgroup_subsys_state *css;
+ struct cpuset *cs;
+
+ cgroup_taskset_first(tset, &css);
+ cs = css_cs(css);
+
+ mutex_lock(&cpuset_mutex);
+ dec_attach_in_progress_locked(cs);
+
+ if (cs->nr_migrate_dl_tasks) {
+ int cpu = cpumask_any(cs->effective_cpus);
+
+ dl_bw_free(cpu, cs->sum_migrate_dl_bw);
+ reset_migrate_dl_data(cs);
+ }
+
+ mutex_unlock(&cpuset_mutex);
+}
+
+/*
+ * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach_task()
+ * but we can't allocate it dynamically there. Define it global and
+ * allocate from cpuset_init().
+ */
+static cpumask_var_t cpus_attach;
+static nodemask_t cpuset_attach_nodemask_to;
+
+static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task)
+{
+ lockdep_assert_held(&cpuset_mutex);
+
+ if (cs != &top_cpuset)
+ guarantee_active_cpus(task, cpus_attach);
+ else
+ cpumask_andnot(cpus_attach, task_cpu_possible_mask(task),
+ subpartitions_cpus);
+ /*
+ * can_attach beforehand should guarantee that this doesn't
+ * fail. TODO: have a better way to handle failure here
+ */
+ WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
+
+ cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
+ cpuset1_update_task_spread_flags(cs, task);
+}
+
+static void cpuset_attach(struct cgroup_taskset *tset)
+{
+ struct task_struct *task;
+ struct task_struct *leader;
+ struct cgroup_subsys_state *css;
+ struct cpuset *cs;
+ struct cpuset *oldcs = cpuset_attach_old_cs;
+ bool cpus_updated, mems_updated;
+ bool queue_task_work = false;
+
+ cgroup_taskset_first(tset, &css);
+ cs = css_cs(css);
+
+ lockdep_assert_cpus_held(); /* see cgroup_attach_lock() */
+ mutex_lock(&cpuset_mutex);
+ cpus_updated = !cpumask_equal(cs->effective_cpus,
+ oldcs->effective_cpus);
+ mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);
+
+ /*
+ * In the default hierarchy, enabling cpuset in the child cgroups
+ * will trigger a number of cpuset_attach() calls with no change
+ * in effective cpus and mems. In that case, we can optimize out
+ * by skipping the task iteration and update.
+ */
+ if (cpuset_v2() && !cpus_updated && !mems_updated) {
+ cpuset_attach_nodemask_to = cs->effective_mems;
+ goto out;
+ }
+
+ guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
+
+ cgroup_taskset_for_each(task, css, tset)
+ cpuset_attach_task(cs, task);
+
+ /*
+ * Change mm for all threadgroup leaders. This is expensive and may
+ * sleep and should be moved outside migration path proper. Skip it
+ * if there is no change in effective_mems and CS_MEMORY_MIGRATE is
+ * not set.
+ */
+ cpuset_attach_nodemask_to = cs->effective_mems;
+ if (!is_memory_migrate(cs) && !mems_updated)
+ goto out;
+
+ cgroup_taskset_for_each_leader(leader, css, tset) {
+ struct mm_struct *mm = get_task_mm(leader);
+
+ if (mm) {
+ mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
+
+ /*
+ * old_mems_allowed is the same with mems_allowed
+ * here, except if this task is being moved
+ * automatically due to hotplug. In that case
+ * @mems_allowed has been updated and is empty, so
+ * @old_mems_allowed is the right nodesets that we
+ * migrate mm from.
+ */
+ if (is_memory_migrate(cs)) {
+ cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
+ &cpuset_attach_nodemask_to);
+ queue_task_work = true;
+ } else
+ mmput(mm);
+ }
+ }
+
+out:
+ if (queue_task_work)
+ schedule_flush_migrate_mm();
+ cs->old_mems_allowed = cpuset_attach_nodemask_to;
+
+ if (cs->nr_migrate_dl_tasks) {
+ cs->nr_deadline_tasks += cs->nr_migrate_dl_tasks;
+ oldcs->nr_deadline_tasks -= cs->nr_migrate_dl_tasks;
+ reset_migrate_dl_data(cs);
+ }
+
+ dec_attach_in_progress_locked(cs);
+
+ mutex_unlock(&cpuset_mutex);
+}
+
+/*
+ * Common handling for a write to a "cpus" or "mems" file.
+ */
+ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct cpuset *cs = css_cs(of_css(of));
+ struct cpuset *trialcs;
+ int retval = -ENODEV;
+
+ /* root is read-only */
+ if (cs == &top_cpuset)
+ return -EACCES;
+
+ buf = strstrip(buf);
+ cpuset_full_lock();
+ if (!is_cpuset_online(cs))
+ goto out_unlock;
+
+ trialcs = dup_or_alloc_cpuset(cs);
+ if (!trialcs) {
+ retval = -ENOMEM;
+ goto out_unlock;
+ }
+
+ switch (of_cft(of)->private) {
+ case FILE_CPULIST:
+ retval = update_cpumask(cs, trialcs, buf);
+ break;
+ case FILE_EXCLUSIVE_CPULIST:
+ retval = update_exclusive_cpumask(cs, trialcs, buf);
+ break;
+ case FILE_MEMLIST:
+ retval = update_nodemask(cs, trialcs, buf);
+ break;
+ default:
+ retval = -EINVAL;
+ break;
+ }
+
+ free_cpuset(trialcs);
+ if (force_sd_rebuild)
+ rebuild_sched_domains_locked();
+out_unlock:
+ cpuset_full_unlock();
+ if (of_cft(of)->private == FILE_MEMLIST)
+ schedule_flush_migrate_mm();
+ return retval ?: nbytes;
+}
+
+/*
+ * These ascii lists should be read in a single call, by using a user
+ * buffer large enough to hold the entire map. If read in smaller
+ * chunks, there is no guarantee of atomicity. Since the display format
+ * used, list of ranges of sequential numbers, is variable length,
+ * and since these maps can change value dynamically, one could read
+ * gibberish by doing partial reads while a list was changing.
+ */
+int cpuset_common_seq_show(struct seq_file *sf, void *v)
+{
+ struct cpuset *cs = css_cs(seq_css(sf));
+ cpuset_filetype_t type = seq_cft(sf)->private;
+ int ret = 0;
+
+ spin_lock_irq(&callback_lock);
+
+ switch (type) {
+ case FILE_CPULIST:
+ seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed));
+ break;
+ case FILE_MEMLIST:
+ seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed));
+ break;
+ case FILE_EFFECTIVE_CPULIST:
+ seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus));
+ break;
+ case FILE_EFFECTIVE_MEMLIST:
+ seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
+ break;
+ case FILE_EXCLUSIVE_CPULIST:
+ seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->exclusive_cpus));
+ break;
+ case FILE_EFFECTIVE_XCPULIST:
+ seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_xcpus));
+ break;
+ case FILE_SUBPARTS_CPULIST:
+ seq_printf(sf, "%*pbl\n", cpumask_pr_args(subpartitions_cpus));
+ break;
+ case FILE_ISOLATED_CPULIST:
+ seq_printf(sf, "%*pbl\n", cpumask_pr_args(isolated_cpus));
+ break;
+ default:
+ ret = -EINVAL;
+ }
+
+ spin_unlock_irq(&callback_lock);
+ return ret;
+}
+
+static int cpuset_partition_show(struct seq_file *seq, void *v)
+{
+ struct cpuset *cs = css_cs(seq_css(seq));
+ const char *err, *type = NULL;
+
+ switch (cs->partition_root_state) {
+ case PRS_ROOT:
+ seq_puts(seq, "root\n");
+ break;
+ case PRS_ISOLATED:
+ seq_puts(seq, "isolated\n");
+ break;
+ case PRS_MEMBER:
+ seq_puts(seq, "member\n");
+ break;
+ case PRS_INVALID_ROOT:
+ type = "root";
+ fallthrough;
+ case PRS_INVALID_ISOLATED:
+ if (!type)
+ type = "isolated";
+ err = perr_strings[READ_ONCE(cs->prs_err)];
+ if (err)
+ seq_printf(seq, "%s invalid (%s)\n", type, err);
+ else
+ seq_printf(seq, "%s invalid\n", type);
+ break;
+ }
+ return 0;
+}
+
+static ssize_t cpuset_partition_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct cpuset *cs = css_cs(of_css(of));
+ int val;
+ int retval = -ENODEV;
+
+ buf = strstrip(buf);
+
+ if (!strcmp(buf, "root"))
+ val = PRS_ROOT;
+ else if (!strcmp(buf, "member"))
+ val = PRS_MEMBER;
+ else if (!strcmp(buf, "isolated"))
+ val = PRS_ISOLATED;
+ else
+ return -EINVAL;
+
+ cpuset_full_lock();
+ if (is_cpuset_online(cs))
+ retval = update_prstate(cs, val);
+ cpuset_full_unlock();
+ return retval ?: nbytes;
+}
+
+/*
+ * This is currently a minimal set for the default hierarchy. It can be
+ * expanded later on by migrating more features and control files from v1.
+ */
+static struct cftype dfl_files[] = {
+ {
+ .name = "cpus",
+ .seq_show = cpuset_common_seq_show,
+ .write = cpuset_write_resmask,
+ .max_write_len = (100U + 6 * NR_CPUS),
+ .private = FILE_CPULIST,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+
+ {
+ .name = "mems",
+ .seq_show = cpuset_common_seq_show,
+ .write = cpuset_write_resmask,
+ .max_write_len = (100U + 6 * MAX_NUMNODES),
+ .private = FILE_MEMLIST,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+
+ {
+ .name = "cpus.effective",
+ .seq_show = cpuset_common_seq_show,
+ .private = FILE_EFFECTIVE_CPULIST,
+ },
+
+ {
+ .name = "mems.effective",
+ .seq_show = cpuset_common_seq_show,
+ .private = FILE_EFFECTIVE_MEMLIST,
+ },
+
+ {
+ .name = "cpus.partition",
+ .seq_show = cpuset_partition_show,
+ .write = cpuset_partition_write,
+ .private = FILE_PARTITION_ROOT,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .file_offset = offsetof(struct cpuset, partition_file),
+ },
+
+ {
+ .name = "cpus.exclusive",
+ .seq_show = cpuset_common_seq_show,
+ .write = cpuset_write_resmask,
+ .max_write_len = (100U + 6 * NR_CPUS),
+ .private = FILE_EXCLUSIVE_CPULIST,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+
+ {
+ .name = "cpus.exclusive.effective",
+ .seq_show = cpuset_common_seq_show,
+ .private = FILE_EFFECTIVE_XCPULIST,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+
+ {
+ .name = "cpus.subpartitions",
+ .seq_show = cpuset_common_seq_show,
+ .private = FILE_SUBPARTS_CPULIST,
+ .flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_DEBUG,
+ },
+
+ {
+ .name = "cpus.isolated",
+ .seq_show = cpuset_common_seq_show,
+ .private = FILE_ISOLATED_CPULIST,
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ },
+
+ { } /* terminate */
+};
+
+
+/**
+ * cpuset_css_alloc - Allocate a cpuset css
+ * @parent_css: Parent css of the control group that the new cpuset will be
+ * part of
+ * Return: cpuset css on success, -ENOMEM on failure.
+ *
+ * Allocate and initialize a new cpuset css, for non-NULL @parent_css, return
+ * top cpuset css otherwise.
+ */
+static struct cgroup_subsys_state *
+cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+ struct cpuset *cs;
+
+ if (!parent_css)
+ return &top_cpuset.css;
+
+ cs = dup_or_alloc_cpuset(NULL);
+ if (!cs)
+ return ERR_PTR(-ENOMEM);
+
+ __set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
+ fmeter_init(&cs->fmeter);
+ cs->relax_domain_level = -1;
+
+ /* Set CS_MEMORY_MIGRATE for default hierarchy */
+ if (cpuset_v2())
+ __set_bit(CS_MEMORY_MIGRATE, &cs->flags);
+
+ return &cs->css;
+}
+
+static int cpuset_css_online(struct cgroup_subsys_state *css)
+{
+ struct cpuset *cs = css_cs(css);
+ struct cpuset *parent = parent_cs(cs);
+ struct cpuset *tmp_cs;
+ struct cgroup_subsys_state *pos_css;
+
+ if (!parent)
+ return 0;
+
+ cpuset_full_lock();
+ if (is_spread_page(parent))
+ set_bit(CS_SPREAD_PAGE, &cs->flags);
+ if (is_spread_slab(parent))
+ set_bit(CS_SPREAD_SLAB, &cs->flags);
+ /*
+ * For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated
+ */
+ if (cpuset_v2() && !is_sched_load_balance(parent))
+ clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
+
+ cpuset_inc();
+
+ spin_lock_irq(&callback_lock);
+ if (is_in_v2_mode()) {
+ cpumask_copy(cs->effective_cpus, parent->effective_cpus);
+ cs->effective_mems = parent->effective_mems;
+ }
+ spin_unlock_irq(&callback_lock);
+
+ if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
+ goto out_unlock;
+
+ /*
+ * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
+ * set. This flag handling is implemented in cgroup core for
+ * historical reasons - the flag may be specified during mount.
+ *
+ * Currently, if any sibling cpusets have exclusive cpus or mem, we
+ * refuse to clone the configuration - thereby refusing the task to
+ * be entered, and as a result refusing the sys_unshare() or
+ * clone() which initiated it. If this becomes a problem for some
+ * users who wish to allow that scenario, then this could be
+ * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
+ * (and likewise for mems) to the new cgroup.
+ */
+ rcu_read_lock();
+ cpuset_for_each_child(tmp_cs, pos_css, parent) {
+ if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
+ rcu_read_unlock();
+ goto out_unlock;
+ }
+ }
+ rcu_read_unlock();
+
+ spin_lock_irq(&callback_lock);
+ cs->mems_allowed = parent->mems_allowed;
+ cs->effective_mems = parent->mems_allowed;
+ cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
+ cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
+ spin_unlock_irq(&callback_lock);
+out_unlock:
+ cpuset_full_unlock();
+ return 0;
+}
+
+/*
+ * If the cpuset being removed has its flag 'sched_load_balance'
+ * enabled, then simulate turning sched_load_balance off, which
+ * will call rebuild_sched_domains_locked(). That is not needed
+ * in the default hierarchy where only changes in partition
+ * will cause repartitioning.
+ */
+static void cpuset_css_offline(struct cgroup_subsys_state *css)
+{
+ struct cpuset *cs = css_cs(css);
+
+ cpuset_full_lock();
+ if (!cpuset_v2() && is_sched_load_balance(cs))
+ cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
+
+ cpuset_dec();
+ cpuset_full_unlock();
+}
+
+/*
+ * If a dying cpuset has the 'cpus.partition' enabled, turn it off by
+ * changing it back to member to free its exclusive CPUs back to the pool to
+ * be used by other online cpusets.
+ */
+static void cpuset_css_killed(struct cgroup_subsys_state *css)
+{
+ struct cpuset *cs = css_cs(css);
+
+ cpuset_full_lock();
+ /* Reset valid partition back to member */
+ if (is_partition_valid(cs))
+ update_prstate(cs, PRS_MEMBER);
+ cpuset_full_unlock();
+}
+
+static void cpuset_css_free(struct cgroup_subsys_state *css)
+{
+ struct cpuset *cs = css_cs(css);
+
+ free_cpuset(cs);
+}
+
+static void cpuset_bind(struct cgroup_subsys_state *root_css)
+{
+ mutex_lock(&cpuset_mutex);
+ spin_lock_irq(&callback_lock);
+
+ if (is_in_v2_mode()) {
+ cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
+ cpumask_copy(top_cpuset.effective_xcpus, cpu_possible_mask);
+ top_cpuset.mems_allowed = node_possible_map;
+ } else {
+ cpumask_copy(top_cpuset.cpus_allowed,
+ top_cpuset.effective_cpus);
+ top_cpuset.mems_allowed = top_cpuset.effective_mems;
+ }
+
+ spin_unlock_irq(&callback_lock);
+ mutex_unlock(&cpuset_mutex);
+}
+
+/*
+ * In case the child is cloned into a cpuset different from its parent,
+ * additional checks are done to see if the move is allowed.
+ */
+static int cpuset_can_fork(struct task_struct *task, struct css_set *cset)
+{
+ struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]);
+ bool same_cs;
+ int ret;
+
+ rcu_read_lock();
+ same_cs = (cs == task_cs(current));
+ rcu_read_unlock();
+
+ if (same_cs)
+ return 0;
+
+ lockdep_assert_held(&cgroup_mutex);
+ mutex_lock(&cpuset_mutex);
+
+ /* Check to see if task is allowed in the cpuset */
+ ret = cpuset_can_attach_check(cs);
+ if (ret)
+ goto out_unlock;
+
+ ret = task_can_attach(task);
+ if (ret)
+ goto out_unlock;
+
+ ret = security_task_setscheduler(task);
+ if (ret)
+ goto out_unlock;
+
+ /*
+ * Mark attach is in progress. This makes validate_change() fail
+ * changes which zero cpus/mems_allowed.
+ */
+ cs->attach_in_progress++;
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
+ return ret;
+}
+
+static void cpuset_cancel_fork(struct task_struct *task, struct css_set *cset)
+{
+ struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]);
+ bool same_cs;
+
+ rcu_read_lock();
+ same_cs = (cs == task_cs(current));
+ rcu_read_unlock();
+
+ if (same_cs)
+ return;
+
+ dec_attach_in_progress(cs);
+}
+
+/*
+ * Make sure the new task conform to the current state of its parent,
+ * which could have been changed by cpuset just after it inherits the
+ * state from the parent and before it sits on the cgroup's task list.
+ */
+static void cpuset_fork(struct task_struct *task)
+{
+ struct cpuset *cs;
+ bool same_cs;
+
+ rcu_read_lock();
+ cs = task_cs(task);
+ same_cs = (cs == task_cs(current));
+ rcu_read_unlock();
+
+ if (same_cs) {
+ if (cs == &top_cpuset)
+ return;
+
+ set_cpus_allowed_ptr(task, current->cpus_ptr);
+ task->mems_allowed = current->mems_allowed;
+ return;
+ }
+
+ /* CLONE_INTO_CGROUP */
+ mutex_lock(&cpuset_mutex);
+ guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
+ cpuset_attach_task(cs, task);
+
+ dec_attach_in_progress_locked(cs);
+ mutex_unlock(&cpuset_mutex);
+}
+
+struct cgroup_subsys cpuset_cgrp_subsys = {
+ .css_alloc = cpuset_css_alloc,
+ .css_online = cpuset_css_online,
+ .css_offline = cpuset_css_offline,
+ .css_killed = cpuset_css_killed,
+ .css_free = cpuset_css_free,
+ .can_attach = cpuset_can_attach,
+ .cancel_attach = cpuset_cancel_attach,
+ .attach = cpuset_attach,
+ .bind = cpuset_bind,
+ .can_fork = cpuset_can_fork,
+ .cancel_fork = cpuset_cancel_fork,
+ .fork = cpuset_fork,
+#ifdef CONFIG_CPUSETS_V1
+ .legacy_cftypes = cpuset1_files,
+#endif
+ .dfl_cftypes = dfl_files,
+ .early_init = true,
+ .threaded = true,
+};
+
+/**
+ * cpuset_init - initialize cpusets at system boot
+ *
+ * Description: Initialize top_cpuset
+ **/
+
+int __init cpuset_init(void)
+{
+ BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
+ BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
+ BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_xcpus, GFP_KERNEL));
+ BUG_ON(!alloc_cpumask_var(&top_cpuset.exclusive_cpus, GFP_KERNEL));
+ BUG_ON(!zalloc_cpumask_var(&subpartitions_cpus, GFP_KERNEL));
+ BUG_ON(!zalloc_cpumask_var(&isolated_cpus, GFP_KERNEL));
+
+ cpumask_setall(top_cpuset.cpus_allowed);
+ nodes_setall(top_cpuset.mems_allowed);
+ cpumask_setall(top_cpuset.effective_cpus);
+ cpumask_setall(top_cpuset.effective_xcpus);
+ cpumask_setall(top_cpuset.exclusive_cpus);
+ nodes_setall(top_cpuset.effective_mems);
+
+ fmeter_init(&top_cpuset.fmeter);
+
+ BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
+
+ have_boot_isolcpus = housekeeping_enabled(HK_TYPE_DOMAIN);
+ if (have_boot_isolcpus) {
+ BUG_ON(!alloc_cpumask_var(&boot_hk_cpus, GFP_KERNEL));
+ cpumask_copy(boot_hk_cpus, housekeeping_cpumask(HK_TYPE_DOMAIN));
+ cpumask_andnot(isolated_cpus, cpu_possible_mask, boot_hk_cpus);
+ }
+
+ return 0;
+}
+
+static void
+hotplug_update_tasks(struct cpuset *cs,
+ struct cpumask *new_cpus, nodemask_t *new_mems,
+ bool cpus_updated, bool mems_updated)
+{
+ /* A partition root is allowed to have empty effective cpus */
+ if (cpumask_empty(new_cpus) && !is_partition_valid(cs))
+ cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
+ if (nodes_empty(*new_mems))
+ *new_mems = parent_cs(cs)->effective_mems;
+
+ spin_lock_irq(&callback_lock);
+ cpumask_copy(cs->effective_cpus, new_cpus);
+ cs->effective_mems = *new_mems;
+ spin_unlock_irq(&callback_lock);
+
+ if (cpus_updated)
+ cpuset_update_tasks_cpumask(cs, new_cpus);
+ if (mems_updated)
+ cpuset_update_tasks_nodemask(cs);
+}
+
+void cpuset_force_rebuild(void)
+{
+ force_sd_rebuild = true;
+}
+
+/**
+ * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
+ * @cs: cpuset in interest
+ * @tmp: the tmpmasks structure pointer
+ *
+ * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
+ * offline, update @cs accordingly. If @cs ends up with no CPU or memory,
+ * all its tasks are moved to the nearest ancestor with both resources.
+ */
+static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
+{
+ static cpumask_t new_cpus;
+ static nodemask_t new_mems;
+ bool cpus_updated;
+ bool mems_updated;
+ bool remote;
+ int partcmd = -1;
+ struct cpuset *parent;
+retry:
+ wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
+
+ mutex_lock(&cpuset_mutex);
+
+ /*
+ * We have raced with task attaching. We wait until attaching
+ * is finished, so we won't attach a task to an empty cpuset.
+ */
+ if (cs->attach_in_progress) {
+ mutex_unlock(&cpuset_mutex);
+ goto retry;
+ }
+
+ parent = parent_cs(cs);
+ compute_effective_cpumask(&new_cpus, cs, parent);
+ nodes_and(new_mems, cs->mems_allowed, parent->effective_mems);
+
+ if (!tmp || !cs->partition_root_state)
+ goto update_tasks;
+
+ /*
+ * Compute effective_cpus for valid partition root, may invalidate
+ * child partition roots if necessary.
+ */
+ remote = is_remote_partition(cs);
+ if (remote || (is_partition_valid(cs) && is_partition_valid(parent)))
+ compute_partition_effective_cpumask(cs, &new_cpus);
+
+ if (remote && cpumask_empty(&new_cpus) &&
+ partition_is_populated(cs, NULL)) {
+ cs->prs_err = PERR_HOTPLUG;
+ remote_partition_disable(cs, tmp);
+ compute_effective_cpumask(&new_cpus, cs, parent);
+ remote = false;
+ }
+
+ /*
+ * Force the partition to become invalid if either one of
+ * the following conditions hold:
+ * 1) empty effective cpus but not valid empty partition.
+ * 2) parent is invalid or doesn't grant any cpus to child
+ * partitions.
+ */
+ if (is_local_partition(cs) && (!is_partition_valid(parent) ||
+ tasks_nocpu_error(parent, cs, &new_cpus)))
+ partcmd = partcmd_invalidate;
+ /*
+ * On the other hand, an invalid partition root may be transitioned
+ * back to a regular one with a non-empty effective xcpus.
+ */
+ else if (is_partition_valid(parent) && is_partition_invalid(cs) &&
+ !cpumask_empty(cs->effective_xcpus))
+ partcmd = partcmd_update;
+
+ if (partcmd >= 0) {
+ update_parent_effective_cpumask(cs, partcmd, NULL, tmp);
+ if ((partcmd == partcmd_invalidate) || is_partition_valid(cs)) {
+ compute_partition_effective_cpumask(cs, &new_cpus);
+ cpuset_force_rebuild();
+ }
+ }
+
+update_tasks:
+ cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
+ mems_updated = !nodes_equal(new_mems, cs->effective_mems);
+ if (!cpus_updated && !mems_updated)
+ goto unlock; /* Hotplug doesn't affect this cpuset */
+
+ if (mems_updated)
+ check_insane_mems_config(&new_mems);
+
+ if (is_in_v2_mode())
+ hotplug_update_tasks(cs, &new_cpus, &new_mems,
+ cpus_updated, mems_updated);
+ else
+ cpuset1_hotplug_update_tasks(cs, &new_cpus, &new_mems,
+ cpus_updated, mems_updated);
+
+unlock:
+ mutex_unlock(&cpuset_mutex);
+}
+
+/**
+ * cpuset_handle_hotplug - handle CPU/memory hot{,un}plug for a cpuset
+ *
+ * This function is called after either CPU or memory configuration has
+ * changed and updates cpuset accordingly. The top_cpuset is always
+ * synchronized to cpu_active_mask and N_MEMORY, which is necessary in
+ * order to make cpusets transparent (of no affect) on systems that are
+ * actively using CPU hotplug but making no active use of cpusets.
+ *
+ * Non-root cpusets are only affected by offlining. If any CPUs or memory
+ * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on
+ * all descendants.
+ *
+ * Note that CPU offlining during suspend is ignored. We don't modify
+ * cpusets across suspend/resume cycles at all.
+ *
+ * CPU / memory hotplug is handled synchronously.
+ */
+static void cpuset_handle_hotplug(void)
+{
+ static cpumask_t new_cpus;
+ static nodemask_t new_mems;
+ bool cpus_updated, mems_updated;
+ bool on_dfl = is_in_v2_mode();
+ struct tmpmasks tmp, *ptmp = NULL;
+
+ if (on_dfl && !alloc_tmpmasks(&tmp))
+ ptmp = &tmp;
+
+ lockdep_assert_cpus_held();
+ mutex_lock(&cpuset_mutex);
+
+ /* fetch the available cpus/mems and find out which changed how */
+ cpumask_copy(&new_cpus, cpu_active_mask);
+ new_mems = node_states[N_MEMORY];
+
+ /*
+ * If subpartitions_cpus is populated, it is likely that the check
+ * below will produce a false positive on cpus_updated when the cpu
+ * list isn't changed. It is extra work, but it is better to be safe.
+ */
+ cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus) ||
+ !cpumask_empty(subpartitions_cpus);
+ mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
+
+ /* For v1, synchronize cpus_allowed to cpu_active_mask */
+ if (cpus_updated) {
+ cpuset_force_rebuild();
+ spin_lock_irq(&callback_lock);
+ if (!on_dfl)
+ cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
+ /*
+ * Make sure that CPUs allocated to child partitions
+ * do not show up in effective_cpus. If no CPU is left,
+ * we clear the subpartitions_cpus & let the child partitions
+ * fight for the CPUs again.
+ */
+ if (!cpumask_empty(subpartitions_cpus)) {
+ if (cpumask_subset(&new_cpus, subpartitions_cpus)) {
+ cpumask_clear(subpartitions_cpus);
+ } else {
+ cpumask_andnot(&new_cpus, &new_cpus,
+ subpartitions_cpus);
+ }
+ }
+ cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
+ spin_unlock_irq(&callback_lock);
+ /* we don't mess with cpumasks of tasks in top_cpuset */
+ }
+
+ /* synchronize mems_allowed to N_MEMORY */
+ if (mems_updated) {
+ spin_lock_irq(&callback_lock);
+ if (!on_dfl)
+ top_cpuset.mems_allowed = new_mems;
+ top_cpuset.effective_mems = new_mems;
+ spin_unlock_irq(&callback_lock);
+ cpuset_update_tasks_nodemask(&top_cpuset);
+ }
+
+ mutex_unlock(&cpuset_mutex);
+
+ /* if cpus or mems changed, we need to propagate to descendants */
+ if (cpus_updated || mems_updated) {
+ struct cpuset *cs;
+ struct cgroup_subsys_state *pos_css;
+
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
+ if (cs == &top_cpuset || !css_tryget_online(&cs->css))
+ continue;
+ rcu_read_unlock();
+
+ cpuset_hotplug_update_tasks(cs, ptmp);
+
+ rcu_read_lock();
+ css_put(&cs->css);
+ }
+ rcu_read_unlock();
+ }
+
+ /* rebuild sched domains if necessary */
+ if (force_sd_rebuild)
+ rebuild_sched_domains_cpuslocked();
+
+ free_tmpmasks(ptmp);
+}
+
+void cpuset_update_active_cpus(void)
+{
+ /*
+ * We're inside cpu hotplug critical region which usually nests
+ * inside cgroup synchronization. Bounce actual hotplug processing
+ * to a work item to avoid reverse locking order.
+ */
+ cpuset_handle_hotplug();
+}
+
+/*
+ * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
+ * Call this routine anytime after node_states[N_MEMORY] changes.
+ * See cpuset_update_active_cpus() for CPU hotplug handling.
+ */
+static int cpuset_track_online_nodes(struct notifier_block *self,
+ unsigned long action, void *arg)
+{
+ cpuset_handle_hotplug();
+ return NOTIFY_OK;
+}
+
+/**
+ * cpuset_init_smp - initialize cpus_allowed
+ *
+ * Description: Finish top cpuset after cpu, node maps are initialized
+ */
+void __init cpuset_init_smp(void)
+{
+ /*
+ * cpus_allowd/mems_allowed set to v2 values in the initial
+ * cpuset_bind() call will be reset to v1 values in another
+ * cpuset_bind() call when v1 cpuset is mounted.
+ */
+ top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
+
+ cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
+ top_cpuset.effective_mems = node_states[N_MEMORY];
+
+ hotplug_node_notifier(cpuset_track_online_nodes, CPUSET_CALLBACK_PRI);
+
+ cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
+ BUG_ON(!cpuset_migrate_mm_wq);
+}
+
+/*
+ * Return cpus_allowed mask from a task's cpuset.
+ */
+static void __cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask)
+{
+ struct cpuset *cs;
+
+ cs = task_cs(tsk);
+ if (cs != &top_cpuset)
+ guarantee_active_cpus(tsk, pmask);
+ /*
+ * Tasks in the top cpuset won't get update to their cpumasks
+ * when a hotplug online/offline event happens. So we include all
+ * offline cpus in the allowed cpu list.
+ */
+ if ((cs == &top_cpuset) || cpumask_empty(pmask)) {
+ const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
+
+ /*
+ * We first exclude cpus allocated to partitions. If there is no
+ * allowable online cpu left, we fall back to all possible cpus.
+ */
+ cpumask_andnot(pmask, possible_mask, subpartitions_cpus);
+ if (!cpumask_intersects(pmask, cpu_active_mask))
+ cpumask_copy(pmask, possible_mask);
+ }
+}
+
+/**
+ * cpuset_cpus_allowed_locked - return cpus_allowed mask from a task's cpuset.
+ * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
+ * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
+ *
+ * Similir to cpuset_cpus_allowed() except that the caller must have acquired
+ * cpuset_mutex.
+ */
+void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask)
+{
+ lockdep_assert_held(&cpuset_mutex);
+ __cpuset_cpus_allowed_locked(tsk, pmask);
+}
+
+/**
+ * cpuset_cpus_allowed - return cpus_allowed mask from a task's cpuset.
+ * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
+ * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
+ *
+ * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
+ * attached to the specified @tsk. Guaranteed to return some non-empty
+ * subset of cpu_active_mask, even if this means going outside the
+ * tasks cpuset, except when the task is in the top cpuset.
+ **/
+
+void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&callback_lock, flags);
+ __cpuset_cpus_allowed_locked(tsk, pmask);
+ spin_unlock_irqrestore(&callback_lock, flags);
+}
+
+/**
+ * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe.
+ * @tsk: pointer to task_struct with which the scheduler is struggling
+ *
+ * Description: In the case that the scheduler cannot find an allowed cpu in
+ * tsk->cpus_allowed, we fall back to task_cs(tsk)->cpus_allowed. In legacy
+ * mode however, this value is the same as task_cs(tsk)->effective_cpus,
+ * which will not contain a sane cpumask during cases such as cpu hotplugging.
+ * This is the absolute last resort for the scheduler and it is only used if
+ * _every_ other avenue has been traveled.
+ *
+ * Returns true if the affinity of @tsk was changed, false otherwise.
+ **/
+
+bool cpuset_cpus_allowed_fallback(struct task_struct *tsk)
+{
+ const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
+ const struct cpumask *cs_mask;
+ bool changed = false;
+
+ rcu_read_lock();
+ cs_mask = task_cs(tsk)->cpus_allowed;
+ if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) {
+ set_cpus_allowed_force(tsk, cs_mask);
+ changed = true;
+ }
+ rcu_read_unlock();
+
+ /*
+ * We own tsk->cpus_allowed, nobody can change it under us.
+ *
+ * But we used cs && cs->cpus_allowed lockless and thus can
+ * race with cgroup_attach_task() or update_cpumask() and get
+ * the wrong tsk->cpus_allowed. However, both cases imply the
+ * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
+ * which takes task_rq_lock().
+ *
+ * If we are called after it dropped the lock we must see all
+ * changes in tsk_cs()->cpus_allowed. Otherwise we can temporary
+ * set any mask even if it is not right from task_cs() pov,
+ * the pending set_cpus_allowed_ptr() will fix things.
+ *
+ * select_fallback_rq() will fix things ups and set cpu_possible_mask
+ * if required.
+ */
+ return changed;
+}
+
+void __init cpuset_init_current_mems_allowed(void)
+{
+ nodes_setall(current->mems_allowed);
+}
+
+/**
+ * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset.
+ * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
+ *
+ * Description: Returns the nodemask_t mems_allowed of the cpuset
+ * attached to the specified @tsk. Guaranteed to return some non-empty
+ * subset of node_states[N_MEMORY], even if this means going outside the
+ * tasks cpuset.
+ **/
+
+nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
+{
+ nodemask_t mask;
+ unsigned long flags;
+
+ spin_lock_irqsave(&callback_lock, flags);
+ guarantee_online_mems(task_cs(tsk), &mask);
+ spin_unlock_irqrestore(&callback_lock, flags);
+
+ return mask;
+}
+
+/**
+ * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed
+ * @nodemask: the nodemask to be checked
+ *
+ * Are any of the nodes in the nodemask allowed in current->mems_allowed?
+ */
+int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
+{
+ return nodes_intersects(*nodemask, current->mems_allowed);
+}
+
+/*
+ * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
+ * mem_hardwall ancestor to the specified cpuset. Call holding
+ * callback_lock. If no ancestor is mem_exclusive or mem_hardwall
+ * (an unusual configuration), then returns the root cpuset.
+ */
+static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
+{
+ while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
+ cs = parent_cs(cs);
+ return cs;
+}
+
+/*
+ * cpuset_current_node_allowed - Can current task allocate on a memory node?
+ * @node: is this an allowed node?
+ * @gfp_mask: memory allocation flags
+ *
+ * If we're in interrupt, yes, we can always allocate. If @node is set in
+ * current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this
+ * node is set in the nearest hardwalled cpuset ancestor to current's cpuset,
+ * yes. If current has access to memory reserves as an oom victim, yes.
+ * Otherwise, no.
+ *
+ * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
+ * and do not allow allocations outside the current tasks cpuset
+ * unless the task has been OOM killed.
+ * GFP_KERNEL allocations are not so marked, so can escape to the
+ * nearest enclosing hardwalled ancestor cpuset.
+ *
+ * Scanning up parent cpusets requires callback_lock. The
+ * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
+ * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
+ * current tasks mems_allowed came up empty on the first pass over
+ * the zonelist. So only GFP_KERNEL allocations, if all nodes in the
+ * cpuset are short of memory, might require taking the callback_lock.
+ *
+ * The first call here from mm/page_alloc:get_page_from_freelist()
+ * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
+ * so no allocation on a node outside the cpuset is allowed (unless
+ * in interrupt, of course).
+ *
+ * The second pass through get_page_from_freelist() doesn't even call
+ * here for GFP_ATOMIC calls. For those calls, the __alloc_pages()
+ * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set
+ * in alloc_flags. That logic and the checks below have the combined
+ * affect that:
+ * in_interrupt - any node ok (current task context irrelevant)
+ * GFP_ATOMIC - any node ok
+ * tsk_is_oom_victim - any node ok
+ * GFP_KERNEL - any node in enclosing hardwalled cpuset ok
+ * GFP_USER - only nodes in current tasks mems allowed ok.
+ */
+bool cpuset_current_node_allowed(int node, gfp_t gfp_mask)
+{
+ struct cpuset *cs; /* current cpuset ancestors */
+ bool allowed; /* is allocation in zone z allowed? */
+ unsigned long flags;
+
+ if (in_interrupt())
+ return true;
+ if (node_isset(node, current->mems_allowed))
+ return true;
+ /*
+ * Allow tasks that have access to memory reserves because they have
+ * been OOM killed to get memory anywhere.
+ */
+ if (unlikely(tsk_is_oom_victim(current)))
+ return true;
+ if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */
+ return false;
+
+ if (current->flags & PF_EXITING) /* Let dying task have memory */
+ return true;
+
+ /* Not hardwall and node outside mems_allowed: scan up cpusets */
+ spin_lock_irqsave(&callback_lock, flags);
+
+ cs = nearest_hardwall_ancestor(task_cs(current));
+ allowed = node_isset(node, cs->mems_allowed);
+
+ spin_unlock_irqrestore(&callback_lock, flags);
+ return allowed;
+}
+
+bool cpuset_node_allowed(struct cgroup *cgroup, int nid)
+{
+ struct cgroup_subsys_state *css;
+ struct cpuset *cs;
+ bool allowed;
+
+ /*
+ * In v1, mem_cgroup and cpuset are unlikely in the same hierarchy
+ * and mems_allowed is likely to be empty even if we could get to it,
+ * so return true to avoid taking a global lock on the empty check.
+ */
+ if (!cpuset_v2())
+ return true;
+
+ css = cgroup_get_e_css(cgroup, &cpuset_cgrp_subsys);
+ if (!css)
+ return true;
+
+ /*
+ * Normally, accessing effective_mems would require the cpuset_mutex
+ * or callback_lock - but node_isset is atomic and the reference
+ * taken via cgroup_get_e_css is sufficient to protect css.
+ *
+ * Since this interface is intended for use by migration paths, we
+ * relax locking here to avoid taking global locks - while accepting
+ * there may be rare scenarios where the result may be innaccurate.
+ *
+ * Reclaim and migration are subject to these same race conditions, and
+ * cannot make strong isolation guarantees, so this is acceptable.
+ */
+ cs = container_of(css, struct cpuset, css);
+ allowed = node_isset(nid, cs->effective_mems);
+ css_put(css);
+ return allowed;
+}
+
+/**
+ * cpuset_spread_node() - On which node to begin search for a page
+ * @rotor: round robin rotor
+ *
+ * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
+ * tasks in a cpuset with is_spread_page or is_spread_slab set),
+ * and if the memory allocation used cpuset_mem_spread_node()
+ * to determine on which node to start looking, as it will for
+ * certain page cache or slab cache pages such as used for file
+ * system buffers and inode caches, then instead of starting on the
+ * local node to look for a free page, rather spread the starting
+ * node around the tasks mems_allowed nodes.
+ *
+ * We don't have to worry about the returned node being offline
+ * because "it can't happen", and even if it did, it would be ok.
+ *
+ * The routines calling guarantee_online_mems() are careful to
+ * only set nodes in task->mems_allowed that are online. So it
+ * should not be possible for the following code to return an
+ * offline node. But if it did, that would be ok, as this routine
+ * is not returning the node where the allocation must be, only
+ * the node where the search should start. The zonelist passed to
+ * __alloc_pages() will include all nodes. If the slab allocator
+ * is passed an offline node, it will fall back to the local node.
+ * See kmem_cache_alloc_node().
+ */
+static int cpuset_spread_node(int *rotor)
+{
+ return *rotor = next_node_in(*rotor, current->mems_allowed);
+}
+
+/**
+ * cpuset_mem_spread_node() - On which node to begin search for a file page
+ */
+int cpuset_mem_spread_node(void)
+{
+ if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
+ current->cpuset_mem_spread_rotor =
+ node_random(&current->mems_allowed);
+
+ return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
+}
+
+/**
+ * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's?
+ * @tsk1: pointer to task_struct of some task.
+ * @tsk2: pointer to task_struct of some other task.
+ *
+ * Description: Return true if @tsk1's mems_allowed intersects the
+ * mems_allowed of @tsk2. Used by the OOM killer to determine if
+ * one of the task's memory usage might impact the memory available
+ * to the other.
+ **/
+
+int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
+ const struct task_struct *tsk2)
+{
+ return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
+}
+
+/**
+ * cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed
+ *
+ * Description: Prints current's name, cpuset name, and cached copy of its
+ * mems_allowed to the kernel log.
+ */
+void cpuset_print_current_mems_allowed(void)
+{
+ struct cgroup *cgrp;
+
+ rcu_read_lock();
+
+ cgrp = task_cs(current)->css.cgroup;
+ pr_cont(",cpuset=");
+ pr_cont_cgroup_name(cgrp);
+ pr_cont(",mems_allowed=%*pbl",
+ nodemask_pr_args(&current->mems_allowed));
+
+ rcu_read_unlock();
+}
+
+/* Display task mems_allowed in /proc/<pid>/status file. */
+void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
+{
+ seq_printf(m, "Mems_allowed:\t%*pb\n",
+ nodemask_pr_args(&task->mems_allowed));
+ seq_printf(m, "Mems_allowed_list:\t%*pbl\n",
+ nodemask_pr_args(&task->mems_allowed));
+}
diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c
new file mode 100644
index 000000000000..81ea38dd6f9d
--- /dev/null
+++ b/kernel/cgroup/debug.c
@@ -0,0 +1,377 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Debug controller
+ *
+ * WARNING: This controller is for cgroup core debugging only.
+ * Its interfaces are unstable and subject to changes at any time.
+ */
+#include <linux/ctype.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+
+#include "cgroup-internal.h"
+
+static struct cgroup_subsys_state *
+debug_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+ struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
+
+ if (!css)
+ return ERR_PTR(-ENOMEM);
+
+ return css;
+}
+
+static void debug_css_free(struct cgroup_subsys_state *css)
+{
+ kfree(css);
+}
+
+/*
+ * debug_taskcount_read - return the number of tasks in a cgroup.
+ * @cgrp: the cgroup in question
+ */
+static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return cgroup_task_count(css->cgroup);
+}
+
+static int current_css_set_read(struct seq_file *seq, void *v)
+{
+ struct kernfs_open_file *of = seq->private;
+ struct css_set *cset;
+ struct cgroup_subsys *ss;
+ struct cgroup_subsys_state *css;
+ int i, refcnt;
+
+ if (!cgroup_kn_lock_live(of->kn, false))
+ return -ENODEV;
+
+ spin_lock_irq(&css_set_lock);
+ cset = task_css_set(current);
+ refcnt = refcount_read(&cset->refcount);
+ seq_printf(seq, "css_set %pK %d", cset, refcnt);
+ if (refcnt > cset->nr_tasks)
+ seq_printf(seq, " +%d", refcnt - cset->nr_tasks);
+ seq_puts(seq, "\n");
+
+ /*
+ * Print the css'es stored in the current css_set.
+ */
+ for_each_subsys(ss, i) {
+ css = cset->subsys[ss->id];
+ if (!css)
+ continue;
+ seq_printf(seq, "%2d: %-4s\t- %p[%d]\n", ss->id, ss->name,
+ css, css->id);
+ }
+ spin_unlock_irq(&css_set_lock);
+ cgroup_kn_unlock(of->kn);
+ return 0;
+}
+
+static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ u64 count;
+
+ rcu_read_lock();
+ count = refcount_read(&task_css_set(current)->refcount);
+ rcu_read_unlock();
+ return count;
+}
+
+static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
+{
+ struct cgrp_cset_link *link;
+ struct css_set *cset;
+ char *name_buf;
+
+ name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
+ if (!name_buf)
+ return -ENOMEM;
+
+ spin_lock_irq(&css_set_lock);
+ cset = task_css_set(current);
+ list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
+ struct cgroup *c = link->cgrp;
+
+ cgroup_name(c, name_buf, NAME_MAX + 1);
+ seq_printf(seq, "Root %d group %s\n",
+ c->root->hierarchy_id, name_buf);
+ }
+ spin_unlock_irq(&css_set_lock);
+ kfree(name_buf);
+ return 0;
+}
+
+#define MAX_TASKS_SHOWN_PER_CSS 25
+static int cgroup_css_links_read(struct seq_file *seq, void *v)
+{
+ struct cgroup_subsys_state *css = seq_css(seq);
+ struct cgrp_cset_link *link;
+ int dead_cnt = 0, extra_refs = 0, threaded_csets = 0;
+
+ spin_lock_irq(&css_set_lock);
+
+ list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
+ struct css_set *cset = link->cset;
+ struct task_struct *task;
+ int count = 0;
+ int refcnt = refcount_read(&cset->refcount);
+
+ /*
+ * Print out the proc_cset and threaded_cset relationship
+ * and highlight difference between refcount and task_count.
+ */
+ seq_printf(seq, "css_set %pK", cset);
+ if (rcu_dereference_protected(cset->dom_cset, 1) != cset) {
+ threaded_csets++;
+ seq_printf(seq, "=>%pK", cset->dom_cset);
+ }
+ if (!list_empty(&cset->threaded_csets)) {
+ struct css_set *tcset;
+ int idx = 0;
+
+ list_for_each_entry(tcset, &cset->threaded_csets,
+ threaded_csets_node) {
+ seq_puts(seq, idx ? "," : "<=");
+ seq_printf(seq, "%pK", tcset);
+ idx++;
+ }
+ } else {
+ seq_printf(seq, " %d", refcnt);
+ if (refcnt - cset->nr_tasks > 0) {
+ int extra = refcnt - cset->nr_tasks;
+
+ seq_printf(seq, " +%d", extra);
+ /*
+ * Take out the one additional reference in
+ * init_css_set.
+ */
+ if (cset == &init_css_set)
+ extra--;
+ extra_refs += extra;
+ }
+ }
+ seq_puts(seq, "\n");
+
+ list_for_each_entry(task, &cset->tasks, cg_list) {
+ if (count++ <= MAX_TASKS_SHOWN_PER_CSS)
+ seq_printf(seq, " task %d\n",
+ task_pid_vnr(task));
+ }
+
+ list_for_each_entry(task, &cset->mg_tasks, cg_list) {
+ if (count++ <= MAX_TASKS_SHOWN_PER_CSS)
+ seq_printf(seq, " task %d\n",
+ task_pid_vnr(task));
+ }
+ /* show # of overflowed tasks */
+ if (count > MAX_TASKS_SHOWN_PER_CSS)
+ seq_printf(seq, " ... (%d)\n",
+ count - MAX_TASKS_SHOWN_PER_CSS);
+
+ if (cset->dead) {
+ seq_puts(seq, " [dead]\n");
+ dead_cnt++;
+ }
+
+ WARN_ON(count != cset->nr_tasks);
+ }
+ spin_unlock_irq(&css_set_lock);
+
+ if (!dead_cnt && !extra_refs && !threaded_csets)
+ return 0;
+
+ seq_puts(seq, "\n");
+ if (threaded_csets)
+ seq_printf(seq, "threaded css_sets = %d\n", threaded_csets);
+ if (extra_refs)
+ seq_printf(seq, "extra references = %d\n", extra_refs);
+ if (dead_cnt)
+ seq_printf(seq, "dead css_sets = %d\n", dead_cnt);
+
+ return 0;
+}
+
+static int cgroup_subsys_states_read(struct seq_file *seq, void *v)
+{
+ struct kernfs_open_file *of = seq->private;
+ struct cgroup *cgrp;
+ struct cgroup_subsys *ss;
+ struct cgroup_subsys_state *css;
+ char pbuf[16];
+ int i;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENODEV;
+
+ for_each_subsys(ss, i) {
+ css = rcu_dereference_check(cgrp->subsys[ss->id], true);
+ if (!css)
+ continue;
+
+ pbuf[0] = '\0';
+
+ /* Show the parent CSS if applicable*/
+ if (css->parent)
+ snprintf(pbuf, sizeof(pbuf) - 1, " P=%d",
+ css->parent->id);
+ seq_printf(seq, "%2d: %-4s\t- %p[%d] %d%s\n", ss->id, ss->name,
+ css, css->id,
+ atomic_read(&css->online_cnt), pbuf);
+ }
+
+ cgroup_kn_unlock(of->kn);
+ return 0;
+}
+
+static void cgroup_masks_read_one(struct seq_file *seq, const char *name,
+ u16 mask)
+{
+ struct cgroup_subsys *ss;
+ int ssid;
+ bool first = true;
+
+ seq_printf(seq, "%-17s: ", name);
+ for_each_subsys(ss, ssid) {
+ if (!(mask & (1 << ssid)))
+ continue;
+ if (!first)
+ seq_puts(seq, ", ");
+ seq_puts(seq, ss->name);
+ first = false;
+ }
+ seq_putc(seq, '\n');
+}
+
+static int cgroup_masks_read(struct seq_file *seq, void *v)
+{
+ struct kernfs_open_file *of = seq->private;
+ struct cgroup *cgrp;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENODEV;
+
+ cgroup_masks_read_one(seq, "subtree_control", cgrp->subtree_control);
+ cgroup_masks_read_one(seq, "subtree_ss_mask", cgrp->subtree_ss_mask);
+
+ cgroup_kn_unlock(of->kn);
+ return 0;
+}
+
+static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+ return (!cgroup_is_populated(css->cgroup) &&
+ !css_has_online_children(&css->cgroup->self));
+}
+
+static struct cftype debug_legacy_files[] = {
+ {
+ .name = "taskcount",
+ .read_u64 = debug_taskcount_read,
+ },
+
+ {
+ .name = "current_css_set",
+ .seq_show = current_css_set_read,
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ },
+
+ {
+ .name = "current_css_set_refcount",
+ .read_u64 = current_css_set_refcount_read,
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ },
+
+ {
+ .name = "current_css_set_cg_links",
+ .seq_show = current_css_set_cg_links_read,
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ },
+
+ {
+ .name = "cgroup_css_links",
+ .seq_show = cgroup_css_links_read,
+ },
+
+ {
+ .name = "cgroup_subsys_states",
+ .seq_show = cgroup_subsys_states_read,
+ },
+
+ {
+ .name = "cgroup_masks",
+ .seq_show = cgroup_masks_read,
+ },
+
+ {
+ .name = "releasable",
+ .read_u64 = releasable_read,
+ },
+
+ { } /* terminate */
+};
+
+static struct cftype debug_files[] = {
+ {
+ .name = "taskcount",
+ .read_u64 = debug_taskcount_read,
+ },
+
+ {
+ .name = "current_css_set",
+ .seq_show = current_css_set_read,
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ },
+
+ {
+ .name = "current_css_set_refcount",
+ .read_u64 = current_css_set_refcount_read,
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ },
+
+ {
+ .name = "current_css_set_cg_links",
+ .seq_show = current_css_set_cg_links_read,
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ },
+
+ {
+ .name = "css_links",
+ .seq_show = cgroup_css_links_read,
+ },
+
+ {
+ .name = "csses",
+ .seq_show = cgroup_subsys_states_read,
+ },
+
+ {
+ .name = "masks",
+ .seq_show = cgroup_masks_read,
+ },
+
+ { } /* terminate */
+};
+
+struct cgroup_subsys debug_cgrp_subsys = {
+ .css_alloc = debug_css_alloc,
+ .css_free = debug_css_free,
+ .legacy_cftypes = debug_legacy_files,
+};
+
+/*
+ * On v2, debug is an implicit controller enabled by "cgroup_debug" boot
+ * parameter.
+ */
+void __init enable_debug_cgroup(void)
+{
+ debug_cgrp_subsys.dfl_cftypes = debug_files;
+ debug_cgrp_subsys.implicit_on_dfl = true;
+ debug_cgrp_subsys.threaded = true;
+}
diff --git a/kernel/cgroup/dmem.c b/kernel/cgroup/dmem.c
new file mode 100644
index 000000000000..e12b946278b6
--- /dev/null
+++ b/kernel/cgroup/dmem.c
@@ -0,0 +1,830 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2023-2024 Intel Corporation (Maarten Lankhorst <dev@lankhorst.se>)
+ * Copyright 2024 Red Hat (Maxime Ripard <mripard@kernel.org>)
+ * Partially based on the rdma and misc controllers, which bear the following copyrights:
+ *
+ * Copyright 2020 Google LLC
+ * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
+ */
+
+#include <linux/cgroup.h>
+#include <linux/cgroup_dmem.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/page_counter.h>
+#include <linux/parser.h>
+#include <linux/rculist.h>
+#include <linux/slab.h>
+
+struct dmem_cgroup_region {
+ /**
+ * @ref: References keeping the region alive.
+ * Keeps the region reference alive after a succesful RCU lookup.
+ */
+ struct kref ref;
+
+ /** @rcu: RCU head for freeing */
+ struct rcu_head rcu;
+
+ /**
+ * @region_node: Linked into &dmem_cgroup_regions list.
+ * Protected by RCU and global spinlock.
+ */
+ struct list_head region_node;
+
+ /**
+ * @pools: List of pools linked to this region.
+ * Protected by global spinlock only
+ */
+ struct list_head pools;
+
+ /** @size: Size of region, in bytes */
+ u64 size;
+
+ /** @name: Name describing the node, set by dmem_cgroup_register_region */
+ char *name;
+
+ /**
+ * @unregistered: Whether the region is unregistered by its caller.
+ * No new pools should be added to the region afterwards.
+ */
+ bool unregistered;
+};
+
+struct dmemcg_state {
+ struct cgroup_subsys_state css;
+
+ struct list_head pools;
+};
+
+struct dmem_cgroup_pool_state {
+ struct dmem_cgroup_region *region;
+ struct dmemcg_state *cs;
+
+ /* css node, RCU protected against region teardown */
+ struct list_head css_node;
+
+ /* dev node, no RCU protection required */
+ struct list_head region_node;
+
+ struct rcu_head rcu;
+
+ struct page_counter cnt;
+
+ bool inited;
+};
+
+/*
+ * 3 operations require locking protection:
+ * - Registering and unregistering region to/from list, requires global lock.
+ * - Adding a dmem_cgroup_pool_state to a CSS, removing when CSS is freed.
+ * - Adding a dmem_cgroup_pool_state to a region list.
+ *
+ * Since for the most common operations RCU provides enough protection, I
+ * do not think more granular locking makes sense. Most protection is offered
+ * by RCU and the lockless operating page_counter.
+ */
+static DEFINE_SPINLOCK(dmemcg_lock);
+static LIST_HEAD(dmem_cgroup_regions);
+
+static inline struct dmemcg_state *
+css_to_dmemcs(struct cgroup_subsys_state *css)
+{
+ return container_of(css, struct dmemcg_state, css);
+}
+
+static inline struct dmemcg_state *get_current_dmemcs(void)
+{
+ return css_to_dmemcs(task_get_css(current, dmem_cgrp_id));
+}
+
+static struct dmemcg_state *parent_dmemcs(struct dmemcg_state *cg)
+{
+ return cg->css.parent ? css_to_dmemcs(cg->css.parent) : NULL;
+}
+
+static void free_cg_pool(struct dmem_cgroup_pool_state *pool)
+{
+ list_del(&pool->region_node);
+ kfree(pool);
+}
+
+static void
+set_resource_min(struct dmem_cgroup_pool_state *pool, u64 val)
+{
+ page_counter_set_min(&pool->cnt, val);
+}
+
+static void
+set_resource_low(struct dmem_cgroup_pool_state *pool, u64 val)
+{
+ page_counter_set_low(&pool->cnt, val);
+}
+
+static void
+set_resource_max(struct dmem_cgroup_pool_state *pool, u64 val)
+{
+ page_counter_set_max(&pool->cnt, val);
+}
+
+static u64 get_resource_low(struct dmem_cgroup_pool_state *pool)
+{
+ return pool ? READ_ONCE(pool->cnt.low) : 0;
+}
+
+static u64 get_resource_min(struct dmem_cgroup_pool_state *pool)
+{
+ return pool ? READ_ONCE(pool->cnt.min) : 0;
+}
+
+static u64 get_resource_max(struct dmem_cgroup_pool_state *pool)
+{
+ return pool ? READ_ONCE(pool->cnt.max) : PAGE_COUNTER_MAX;
+}
+
+static u64 get_resource_current(struct dmem_cgroup_pool_state *pool)
+{
+ return pool ? page_counter_read(&pool->cnt) : 0;
+}
+
+static void reset_all_resource_limits(struct dmem_cgroup_pool_state *rpool)
+{
+ set_resource_min(rpool, 0);
+ set_resource_low(rpool, 0);
+ set_resource_max(rpool, PAGE_COUNTER_MAX);
+}
+
+static void dmemcs_offline(struct cgroup_subsys_state *css)
+{
+ struct dmemcg_state *dmemcs = css_to_dmemcs(css);
+ struct dmem_cgroup_pool_state *pool;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(pool, &dmemcs->pools, css_node)
+ reset_all_resource_limits(pool);
+ rcu_read_unlock();
+}
+
+static void dmemcs_free(struct cgroup_subsys_state *css)
+{
+ struct dmemcg_state *dmemcs = css_to_dmemcs(css);
+ struct dmem_cgroup_pool_state *pool, *next;
+
+ spin_lock(&dmemcg_lock);
+ list_for_each_entry_safe(pool, next, &dmemcs->pools, css_node) {
+ /*
+ *The pool is dead and all references are 0,
+ * no need for RCU protection with list_del_rcu or freeing.
+ */
+ list_del(&pool->css_node);
+ free_cg_pool(pool);
+ }
+ spin_unlock(&dmemcg_lock);
+
+ kfree(dmemcs);
+}
+
+static struct cgroup_subsys_state *
+dmemcs_alloc(struct cgroup_subsys_state *parent_css)
+{
+ struct dmemcg_state *dmemcs = kzalloc(sizeof(*dmemcs), GFP_KERNEL);
+ if (!dmemcs)
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&dmemcs->pools);
+ return &dmemcs->css;
+}
+
+static struct dmem_cgroup_pool_state *
+find_cg_pool_locked(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *region)
+{
+ struct dmem_cgroup_pool_state *pool;
+
+ list_for_each_entry_rcu(pool, &dmemcs->pools, css_node, spin_is_locked(&dmemcg_lock))
+ if (pool->region == region)
+ return pool;
+
+ return NULL;
+}
+
+static struct dmem_cgroup_pool_state *pool_parent(struct dmem_cgroup_pool_state *pool)
+{
+ if (!pool->cnt.parent)
+ return NULL;
+
+ return container_of(pool->cnt.parent, typeof(*pool), cnt);
+}
+
+static void
+dmem_cgroup_calculate_protection(struct dmem_cgroup_pool_state *limit_pool,
+ struct dmem_cgroup_pool_state *test_pool)
+{
+ struct page_counter *climit;
+ struct cgroup_subsys_state *css;
+ struct dmemcg_state *dmemcg_iter;
+ struct dmem_cgroup_pool_state *pool, *found_pool;
+
+ climit = &limit_pool->cnt;
+
+ rcu_read_lock();
+
+ css_for_each_descendant_pre(css, &limit_pool->cs->css) {
+ dmemcg_iter = container_of(css, struct dmemcg_state, css);
+ found_pool = NULL;
+
+ list_for_each_entry_rcu(pool, &dmemcg_iter->pools, css_node) {
+ if (pool->region == limit_pool->region) {
+ found_pool = pool;
+ break;
+ }
+ }
+ if (!found_pool)
+ continue;
+
+ page_counter_calculate_protection(
+ climit, &found_pool->cnt, true);
+
+ if (found_pool == test_pool)
+ break;
+ }
+ rcu_read_unlock();
+}
+
+/**
+ * dmem_cgroup_state_evict_valuable() - Check if we should evict from test_pool
+ * @limit_pool: The pool for which we hit limits
+ * @test_pool: The pool for which to test
+ * @ignore_low: Whether we have to respect low watermarks.
+ * @ret_hit_low: Pointer to whether it makes sense to consider low watermark.
+ *
+ * This function returns true if we can evict from @test_pool, false if not.
+ * When returning false and @ignore_low is false, @ret_hit_low may
+ * be set to true to indicate this function can be retried with @ignore_low
+ * set to true.
+ *
+ * Return: bool
+ */
+bool dmem_cgroup_state_evict_valuable(struct dmem_cgroup_pool_state *limit_pool,
+ struct dmem_cgroup_pool_state *test_pool,
+ bool ignore_low, bool *ret_hit_low)
+{
+ struct dmem_cgroup_pool_state *pool = test_pool;
+ struct page_counter *ctest;
+ u64 used, min, low;
+
+ /* Can always evict from current pool, despite limits */
+ if (limit_pool == test_pool)
+ return true;
+
+ if (limit_pool) {
+ if (!parent_dmemcs(limit_pool->cs))
+ return true;
+
+ for (pool = test_pool; pool && limit_pool != pool; pool = pool_parent(pool))
+ {}
+
+ if (!pool)
+ return false;
+ } else {
+ /*
+ * If there is no cgroup limiting memory usage, use the root
+ * cgroup instead for limit calculations.
+ */
+ for (limit_pool = test_pool; pool_parent(limit_pool); limit_pool = pool_parent(limit_pool))
+ {}
+ }
+
+ ctest = &test_pool->cnt;
+
+ dmem_cgroup_calculate_protection(limit_pool, test_pool);
+
+ used = page_counter_read(ctest);
+ min = READ_ONCE(ctest->emin);
+
+ if (used <= min)
+ return false;
+
+ if (!ignore_low) {
+ low = READ_ONCE(ctest->elow);
+ if (used > low)
+ return true;
+
+ *ret_hit_low = true;
+ return false;
+ }
+ return true;
+}
+EXPORT_SYMBOL_GPL(dmem_cgroup_state_evict_valuable);
+
+static struct dmem_cgroup_pool_state *
+alloc_pool_single(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *region,
+ struct dmem_cgroup_pool_state **allocpool)
+{
+ struct dmemcg_state *parent = parent_dmemcs(dmemcs);
+ struct dmem_cgroup_pool_state *pool, *ppool = NULL;
+
+ if (!*allocpool) {
+ pool = kzalloc(sizeof(*pool), GFP_NOWAIT);
+ if (!pool)
+ return ERR_PTR(-ENOMEM);
+ } else {
+ pool = *allocpool;
+ *allocpool = NULL;
+ }
+
+ pool->region = region;
+ pool->cs = dmemcs;
+
+ if (parent)
+ ppool = find_cg_pool_locked(parent, region);
+
+ page_counter_init(&pool->cnt,
+ ppool ? &ppool->cnt : NULL, true);
+ reset_all_resource_limits(pool);
+
+ list_add_tail_rcu(&pool->css_node, &dmemcs->pools);
+ list_add_tail(&pool->region_node, &region->pools);
+
+ if (!parent)
+ pool->inited = true;
+ else
+ pool->inited = ppool ? ppool->inited : false;
+ return pool;
+}
+
+static struct dmem_cgroup_pool_state *
+get_cg_pool_locked(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *region,
+ struct dmem_cgroup_pool_state **allocpool)
+{
+ struct dmem_cgroup_pool_state *pool, *ppool, *retpool;
+ struct dmemcg_state *p, *pp;
+
+ /*
+ * Recursively create pool, we may not initialize yet on
+ * recursion, this is done as a separate step.
+ */
+ for (p = dmemcs; p; p = parent_dmemcs(p)) {
+ pool = find_cg_pool_locked(p, region);
+ if (!pool)
+ pool = alloc_pool_single(p, region, allocpool);
+
+ if (IS_ERR(pool))
+ return pool;
+
+ if (p == dmemcs && pool->inited)
+ return pool;
+
+ if (pool->inited)
+ break;
+ }
+
+ retpool = pool = find_cg_pool_locked(dmemcs, region);
+ for (p = dmemcs, pp = parent_dmemcs(dmemcs); pp; p = pp, pp = parent_dmemcs(p)) {
+ if (pool->inited)
+ break;
+
+ /* ppool was created if it didn't exist by above loop. */
+ ppool = find_cg_pool_locked(pp, region);
+
+ /* Fix up parent links, mark as inited. */
+ pool->cnt.parent = &ppool->cnt;
+ pool->inited = true;
+
+ pool = ppool;
+ }
+
+ return retpool;
+}
+
+static void dmemcg_free_rcu(struct rcu_head *rcu)
+{
+ struct dmem_cgroup_region *region = container_of(rcu, typeof(*region), rcu);
+ struct dmem_cgroup_pool_state *pool, *next;
+
+ list_for_each_entry_safe(pool, next, &region->pools, region_node)
+ free_cg_pool(pool);
+ kfree(region->name);
+ kfree(region);
+}
+
+static void dmemcg_free_region(struct kref *ref)
+{
+ struct dmem_cgroup_region *cgregion = container_of(ref, typeof(*cgregion), ref);
+
+ call_rcu(&cgregion->rcu, dmemcg_free_rcu);
+}
+
+/**
+ * dmem_cgroup_unregister_region() - Unregister a previously registered region.
+ * @region: The region to unregister.
+ *
+ * This function undoes dmem_cgroup_register_region.
+ */
+void dmem_cgroup_unregister_region(struct dmem_cgroup_region *region)
+{
+ struct list_head *entry;
+
+ if (!region)
+ return;
+
+ spin_lock(&dmemcg_lock);
+
+ /* Remove from global region list */
+ list_del_rcu(&region->region_node);
+
+ list_for_each_rcu(entry, &region->pools) {
+ struct dmem_cgroup_pool_state *pool =
+ container_of(entry, typeof(*pool), region_node);
+
+ list_del_rcu(&pool->css_node);
+ }
+
+ /*
+ * Ensure any RCU based lookups fail. Additionally,
+ * no new pools should be added to the dead region
+ * by get_cg_pool_unlocked.
+ */
+ region->unregistered = true;
+ spin_unlock(&dmemcg_lock);
+
+ kref_put(&region->ref, dmemcg_free_region);
+}
+EXPORT_SYMBOL_GPL(dmem_cgroup_unregister_region);
+
+/**
+ * dmem_cgroup_register_region() - Register a regions for dev cgroup.
+ * @size: Size of region to register, in bytes.
+ * @fmt: Region parameters to register
+ *
+ * This function registers a node in the dmem cgroup with the
+ * name given. After calling this function, the region can be
+ * used for allocations.
+ *
+ * Return: NULL or a struct on success, PTR_ERR on failure.
+ */
+struct dmem_cgroup_region *dmem_cgroup_register_region(u64 size, const char *fmt, ...)
+{
+ struct dmem_cgroup_region *ret;
+ char *region_name;
+ va_list ap;
+
+ if (!size)
+ return NULL;
+
+ va_start(ap, fmt);
+ region_name = kvasprintf(GFP_KERNEL, fmt, ap);
+ va_end(ap);
+ if (!region_name)
+ return ERR_PTR(-ENOMEM);
+
+ ret = kzalloc(sizeof(*ret), GFP_KERNEL);
+ if (!ret) {
+ kfree(region_name);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ INIT_LIST_HEAD(&ret->pools);
+ ret->name = region_name;
+ ret->size = size;
+ kref_init(&ret->ref);
+
+ spin_lock(&dmemcg_lock);
+ list_add_tail_rcu(&ret->region_node, &dmem_cgroup_regions);
+ spin_unlock(&dmemcg_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(dmem_cgroup_register_region);
+
+static struct dmem_cgroup_region *dmemcg_get_region_by_name(const char *name)
+{
+ struct dmem_cgroup_region *region;
+
+ list_for_each_entry_rcu(region, &dmem_cgroup_regions, region_node, spin_is_locked(&dmemcg_lock))
+ if (!strcmp(name, region->name) &&
+ kref_get_unless_zero(&region->ref))
+ return region;
+
+ return NULL;
+}
+
+/**
+ * dmem_cgroup_pool_state_put() - Drop a reference to a dmem_cgroup_pool_state
+ * @pool: &dmem_cgroup_pool_state
+ *
+ * Called to drop a reference to the limiting pool returned by
+ * dmem_cgroup_try_charge().
+ */
+void dmem_cgroup_pool_state_put(struct dmem_cgroup_pool_state *pool)
+{
+ if (pool)
+ css_put(&pool->cs->css);
+}
+EXPORT_SYMBOL_GPL(dmem_cgroup_pool_state_put);
+
+static struct dmem_cgroup_pool_state *
+get_cg_pool_unlocked(struct dmemcg_state *cg, struct dmem_cgroup_region *region)
+{
+ struct dmem_cgroup_pool_state *pool, *allocpool = NULL;
+
+ /* fastpath lookup? */
+ rcu_read_lock();
+ pool = find_cg_pool_locked(cg, region);
+ if (pool && !READ_ONCE(pool->inited))
+ pool = NULL;
+ rcu_read_unlock();
+
+ while (!pool) {
+ spin_lock(&dmemcg_lock);
+ if (!region->unregistered)
+ pool = get_cg_pool_locked(cg, region, &allocpool);
+ else
+ pool = ERR_PTR(-ENODEV);
+ spin_unlock(&dmemcg_lock);
+
+ if (pool == ERR_PTR(-ENOMEM)) {
+ pool = NULL;
+ if (WARN_ON(allocpool))
+ continue;
+
+ allocpool = kzalloc(sizeof(*allocpool), GFP_KERNEL);
+ if (allocpool) {
+ pool = NULL;
+ continue;
+ }
+ }
+ }
+
+ kfree(allocpool);
+ return pool;
+}
+
+/**
+ * dmem_cgroup_uncharge() - Uncharge a pool.
+ * @pool: Pool to uncharge.
+ * @size: Size to uncharge.
+ *
+ * Undoes the effects of dmem_cgroup_try_charge.
+ * Must be called with the returned pool as argument,
+ * and same @index and @size.
+ */
+void dmem_cgroup_uncharge(struct dmem_cgroup_pool_state *pool, u64 size)
+{
+ if (!pool)
+ return;
+
+ page_counter_uncharge(&pool->cnt, size);
+ css_put(&pool->cs->css);
+}
+EXPORT_SYMBOL_GPL(dmem_cgroup_uncharge);
+
+/**
+ * dmem_cgroup_try_charge() - Try charging a new allocation to a region.
+ * @region: dmem region to charge
+ * @size: Size (in bytes) to charge.
+ * @ret_pool: On succesfull allocation, the pool that is charged.
+ * @ret_limit_pool: On a failed allocation, the limiting pool.
+ *
+ * This function charges the @region region for a size of @size bytes.
+ *
+ * If the function succeeds, @ret_pool is set, which must be passed to
+ * dmem_cgroup_uncharge() when undoing the allocation.
+ *
+ * When this function fails with -EAGAIN and @ret_limit_pool is non-null, it
+ * will be set to the pool for which the limit is hit. This can be used for
+ * eviction as argument to dmem_cgroup_evict_valuable(). This reference must be freed
+ * with @dmem_cgroup_pool_state_put().
+ *
+ * Return: 0 on success, -EAGAIN on hitting a limit, or a negative errno on failure.
+ */
+int dmem_cgroup_try_charge(struct dmem_cgroup_region *region, u64 size,
+ struct dmem_cgroup_pool_state **ret_pool,
+ struct dmem_cgroup_pool_state **ret_limit_pool)
+{
+ struct dmemcg_state *cg;
+ struct dmem_cgroup_pool_state *pool;
+ struct page_counter *fail;
+ int ret;
+
+ *ret_pool = NULL;
+ if (ret_limit_pool)
+ *ret_limit_pool = NULL;
+
+ /*
+ * hold on to css, as cgroup can be removed but resource
+ * accounting happens on css.
+ */
+ cg = get_current_dmemcs();
+
+ pool = get_cg_pool_unlocked(cg, region);
+ if (IS_ERR(pool)) {
+ ret = PTR_ERR(pool);
+ goto err;
+ }
+
+ if (!page_counter_try_charge(&pool->cnt, size, &fail)) {
+ if (ret_limit_pool) {
+ *ret_limit_pool = container_of(fail, struct dmem_cgroup_pool_state, cnt);
+ css_get(&(*ret_limit_pool)->cs->css);
+ }
+ ret = -EAGAIN;
+ goto err;
+ }
+
+ /* On success, reference from get_current_dmemcs is transferred to *ret_pool */
+ *ret_pool = pool;
+ return 0;
+
+err:
+ css_put(&cg->css);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(dmem_cgroup_try_charge);
+
+static int dmem_cgroup_region_capacity_show(struct seq_file *sf, void *v)
+{
+ struct dmem_cgroup_region *region;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(region, &dmem_cgroup_regions, region_node) {
+ seq_puts(sf, region->name);
+ seq_printf(sf, " %llu\n", region->size);
+ }
+ rcu_read_unlock();
+ return 0;
+}
+
+static int dmemcg_parse_limit(char *options, struct dmem_cgroup_region *region,
+ u64 *new_limit)
+{
+ char *end;
+
+ if (!strcmp(options, "max")) {
+ *new_limit = PAGE_COUNTER_MAX;
+ return 0;
+ }
+
+ *new_limit = memparse(options, &end);
+ if (*end != '\0')
+ return -EINVAL;
+
+ return 0;
+}
+
+static ssize_t dmemcg_limit_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off,
+ void (*apply)(struct dmem_cgroup_pool_state *, u64))
+{
+ struct dmemcg_state *dmemcs = css_to_dmemcs(of_css(of));
+ int err = 0;
+
+ while (buf && !err) {
+ struct dmem_cgroup_pool_state *pool = NULL;
+ char *options, *region_name;
+ struct dmem_cgroup_region *region;
+ u64 new_limit;
+
+ options = buf;
+ buf = strchr(buf, '\n');
+ if (buf)
+ *buf++ = '\0';
+
+ options = strstrip(options);
+
+ /* eat empty lines */
+ if (!options[0])
+ continue;
+
+ region_name = strsep(&options, " \t");
+ if (!region_name[0])
+ continue;
+
+ rcu_read_lock();
+ region = dmemcg_get_region_by_name(region_name);
+ rcu_read_unlock();
+
+ if (!region)
+ return -EINVAL;
+
+ err = dmemcg_parse_limit(options, region, &new_limit);
+ if (err < 0)
+ goto out_put;
+
+ pool = get_cg_pool_unlocked(dmemcs, region);
+ if (IS_ERR(pool)) {
+ err = PTR_ERR(pool);
+ goto out_put;
+ }
+
+ /* And commit */
+ apply(pool, new_limit);
+
+out_put:
+ kref_put(&region->ref, dmemcg_free_region);
+ }
+
+
+ return err ?: nbytes;
+}
+
+static int dmemcg_limit_show(struct seq_file *sf, void *v,
+ u64 (*fn)(struct dmem_cgroup_pool_state *))
+{
+ struct dmemcg_state *dmemcs = css_to_dmemcs(seq_css(sf));
+ struct dmem_cgroup_region *region;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(region, &dmem_cgroup_regions, region_node) {
+ struct dmem_cgroup_pool_state *pool = find_cg_pool_locked(dmemcs, region);
+ u64 val;
+
+ seq_puts(sf, region->name);
+
+ val = fn(pool);
+ if (val < PAGE_COUNTER_MAX)
+ seq_printf(sf, " %lld\n", val);
+ else
+ seq_puts(sf, " max\n");
+ }
+ rcu_read_unlock();
+
+ return 0;
+}
+
+static int dmem_cgroup_region_current_show(struct seq_file *sf, void *v)
+{
+ return dmemcg_limit_show(sf, v, get_resource_current);
+}
+
+static int dmem_cgroup_region_min_show(struct seq_file *sf, void *v)
+{
+ return dmemcg_limit_show(sf, v, get_resource_min);
+}
+
+static ssize_t dmem_cgroup_region_min_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ return dmemcg_limit_write(of, buf, nbytes, off, set_resource_min);
+}
+
+static int dmem_cgroup_region_low_show(struct seq_file *sf, void *v)
+{
+ return dmemcg_limit_show(sf, v, get_resource_low);
+}
+
+static ssize_t dmem_cgroup_region_low_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ return dmemcg_limit_write(of, buf, nbytes, off, set_resource_low);
+}
+
+static int dmem_cgroup_region_max_show(struct seq_file *sf, void *v)
+{
+ return dmemcg_limit_show(sf, v, get_resource_max);
+}
+
+static ssize_t dmem_cgroup_region_max_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ return dmemcg_limit_write(of, buf, nbytes, off, set_resource_max);
+}
+
+static struct cftype files[] = {
+ {
+ .name = "capacity",
+ .seq_show = dmem_cgroup_region_capacity_show,
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ },
+ {
+ .name = "current",
+ .seq_show = dmem_cgroup_region_current_show,
+ },
+ {
+ .name = "min",
+ .write = dmem_cgroup_region_min_write,
+ .seq_show = dmem_cgroup_region_min_show,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ {
+ .name = "low",
+ .write = dmem_cgroup_region_low_write,
+ .seq_show = dmem_cgroup_region_low_show,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ {
+ .name = "max",
+ .write = dmem_cgroup_region_max_write,
+ .seq_show = dmem_cgroup_region_max_show,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ { } /* Zero entry terminates. */
+};
+
+struct cgroup_subsys dmem_cgrp_subsys = {
+ .css_alloc = dmemcs_alloc,
+ .css_free = dmemcs_free,
+ .css_offline = dmemcs_offline,
+ .legacy_cftypes = files,
+ .dfl_cftypes = files,
+};
diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c
new file mode 100644
index 000000000000..6c18854bff34
--- /dev/null
+++ b/kernel/cgroup/freezer.c
@@ -0,0 +1,326 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/cgroup.h>
+#include <linux/sched.h>
+#include <linux/sched/task.h>
+#include <linux/sched/signal.h>
+
+#include "cgroup-internal.h"
+
+#include <trace/events/cgroup.h>
+
+/*
+ * Update CGRP_FROZEN of cgroup.flag
+ * Return true if flags is updated; false if flags has no change
+ */
+static bool cgroup_update_frozen_flag(struct cgroup *cgrp, bool frozen)
+{
+ lockdep_assert_held(&css_set_lock);
+
+ /* Already there? */
+ if (test_bit(CGRP_FROZEN, &cgrp->flags) == frozen)
+ return false;
+
+ if (frozen)
+ set_bit(CGRP_FROZEN, &cgrp->flags);
+ else
+ clear_bit(CGRP_FROZEN, &cgrp->flags);
+
+ cgroup_file_notify(&cgrp->events_file);
+ TRACE_CGROUP_PATH(notify_frozen, cgrp, frozen);
+ return true;
+}
+
+/*
+ * Propagate the cgroup frozen state upwards by the cgroup tree.
+ */
+static void cgroup_propagate_frozen(struct cgroup *cgrp, bool frozen)
+{
+ int desc = 1;
+
+ /*
+ * If the new state is frozen, some freezing ancestor cgroups may change
+ * their state too, depending on if all their descendants are frozen.
+ *
+ * Otherwise, all ancestor cgroups are forced into the non-frozen state.
+ */
+ while ((cgrp = cgroup_parent(cgrp))) {
+ if (frozen) {
+ cgrp->freezer.nr_frozen_descendants += desc;
+ if (!test_bit(CGRP_FREEZE, &cgrp->flags) ||
+ (cgrp->freezer.nr_frozen_descendants !=
+ cgrp->nr_descendants))
+ continue;
+ } else {
+ cgrp->freezer.nr_frozen_descendants -= desc;
+ }
+
+ if (cgroup_update_frozen_flag(cgrp, frozen))
+ desc++;
+ }
+}
+
+/*
+ * Revisit the cgroup frozen state.
+ * Checks if the cgroup is really frozen and perform all state transitions.
+ */
+void cgroup_update_frozen(struct cgroup *cgrp)
+{
+ bool frozen;
+
+ /*
+ * If the cgroup has to be frozen (CGRP_FREEZE bit set),
+ * and all tasks are frozen and/or stopped, let's consider
+ * the cgroup frozen. Otherwise it's not frozen.
+ */
+ frozen = test_bit(CGRP_FREEZE, &cgrp->flags) &&
+ cgrp->freezer.nr_frozen_tasks == __cgroup_task_count(cgrp);
+
+ /* If flags is updated, update the state of ancestor cgroups. */
+ if (cgroup_update_frozen_flag(cgrp, frozen))
+ cgroup_propagate_frozen(cgrp, frozen);
+}
+
+/*
+ * Increment cgroup's nr_frozen_tasks.
+ */
+static void cgroup_inc_frozen_cnt(struct cgroup *cgrp)
+{
+ cgrp->freezer.nr_frozen_tasks++;
+}
+
+/*
+ * Decrement cgroup's nr_frozen_tasks.
+ */
+static void cgroup_dec_frozen_cnt(struct cgroup *cgrp)
+{
+ cgrp->freezer.nr_frozen_tasks--;
+ WARN_ON_ONCE(cgrp->freezer.nr_frozen_tasks < 0);
+}
+
+/*
+ * Enter frozen/stopped state, if not yet there. Update cgroup's counters,
+ * and revisit the state of the cgroup, if necessary.
+ */
+void cgroup_enter_frozen(void)
+{
+ struct cgroup *cgrp;
+
+ if (current->frozen)
+ return;
+
+ spin_lock_irq(&css_set_lock);
+ current->frozen = true;
+ cgrp = task_dfl_cgroup(current);
+ cgroup_inc_frozen_cnt(cgrp);
+ cgroup_update_frozen(cgrp);
+ spin_unlock_irq(&css_set_lock);
+}
+
+/*
+ * Conditionally leave frozen/stopped state. Update cgroup's counters,
+ * and revisit the state of the cgroup, if necessary.
+ *
+ * If always_leave is not set, and the cgroup is freezing,
+ * we're racing with the cgroup freezing. In this case, we don't
+ * drop the frozen counter to avoid a transient switch to
+ * the unfrozen state.
+ */
+void cgroup_leave_frozen(bool always_leave)
+{
+ struct cgroup *cgrp;
+
+ spin_lock_irq(&css_set_lock);
+ cgrp = task_dfl_cgroup(current);
+ if (always_leave || !test_bit(CGRP_FREEZE, &cgrp->flags)) {
+ cgroup_dec_frozen_cnt(cgrp);
+ cgroup_update_frozen(cgrp);
+ WARN_ON_ONCE(!current->frozen);
+ current->frozen = false;
+ } else if (!(current->jobctl & JOBCTL_TRAP_FREEZE)) {
+ spin_lock(&current->sighand->siglock);
+ current->jobctl |= JOBCTL_TRAP_FREEZE;
+ set_thread_flag(TIF_SIGPENDING);
+ spin_unlock(&current->sighand->siglock);
+ }
+ spin_unlock_irq(&css_set_lock);
+}
+
+/*
+ * Freeze or unfreeze the task by setting or clearing the JOBCTL_TRAP_FREEZE
+ * jobctl bit.
+ */
+static void cgroup_freeze_task(struct task_struct *task, bool freeze)
+{
+ unsigned long flags;
+
+ /* If the task is about to die, don't bother with freezing it. */
+ if (!lock_task_sighand(task, &flags))
+ return;
+
+ if (freeze) {
+ task->jobctl |= JOBCTL_TRAP_FREEZE;
+ signal_wake_up(task, false);
+ } else {
+ task->jobctl &= ~JOBCTL_TRAP_FREEZE;
+ wake_up_process(task);
+ }
+
+ unlock_task_sighand(task, &flags);
+}
+
+/*
+ * Freeze or unfreeze all tasks in the given cgroup.
+ */
+static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze, u64 ts_nsec)
+{
+ struct css_task_iter it;
+ struct task_struct *task;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ spin_lock_irq(&css_set_lock);
+ write_seqcount_begin(&cgrp->freezer.freeze_seq);
+ if (freeze) {
+ set_bit(CGRP_FREEZE, &cgrp->flags);
+ cgrp->freezer.freeze_start_nsec = ts_nsec;
+ } else {
+ clear_bit(CGRP_FREEZE, &cgrp->flags);
+ cgrp->freezer.frozen_nsec += (ts_nsec -
+ cgrp->freezer.freeze_start_nsec);
+ }
+ write_seqcount_end(&cgrp->freezer.freeze_seq);
+ spin_unlock_irq(&css_set_lock);
+
+ if (freeze)
+ TRACE_CGROUP_PATH(freeze, cgrp);
+ else
+ TRACE_CGROUP_PATH(unfreeze, cgrp);
+
+ css_task_iter_start(&cgrp->self, 0, &it);
+ while ((task = css_task_iter_next(&it))) {
+ /*
+ * Ignore kernel threads here. Freezing cgroups containing
+ * kthreads isn't supported.
+ */
+ if (task->flags & PF_KTHREAD)
+ continue;
+ cgroup_freeze_task(task, freeze);
+ }
+ css_task_iter_end(&it);
+
+ /*
+ * Cgroup state should be revisited here to cover empty leaf cgroups
+ * and cgroups which descendants are already in the desired state.
+ */
+ spin_lock_irq(&css_set_lock);
+ if (cgrp->nr_descendants == cgrp->freezer.nr_frozen_descendants)
+ cgroup_update_frozen(cgrp);
+ spin_unlock_irq(&css_set_lock);
+}
+
+/*
+ * Adjust the task state (freeze or unfreeze) and revisit the state of
+ * source and destination cgroups.
+ */
+void cgroup_freezer_migrate_task(struct task_struct *task,
+ struct cgroup *src, struct cgroup *dst)
+{
+ lockdep_assert_held(&css_set_lock);
+
+ /*
+ * Kernel threads are not supposed to be frozen at all.
+ */
+ if (task->flags & PF_KTHREAD)
+ return;
+
+ /*
+ * It's not necessary to do changes if both of the src and dst cgroups
+ * are not freezing and task is not frozen.
+ */
+ if (!test_bit(CGRP_FREEZE, &src->flags) &&
+ !test_bit(CGRP_FREEZE, &dst->flags) &&
+ !task->frozen)
+ return;
+
+ /*
+ * Adjust counters of freezing and frozen tasks.
+ * Note, that if the task is frozen, but the destination cgroup is not
+ * frozen, we bump both counters to keep them balanced.
+ */
+ if (task->frozen) {
+ cgroup_inc_frozen_cnt(dst);
+ cgroup_dec_frozen_cnt(src);
+ }
+ cgroup_update_frozen(dst);
+ cgroup_update_frozen(src);
+
+ /*
+ * Force the task to the desired state.
+ */
+ cgroup_freeze_task(task, test_bit(CGRP_FREEZE, &dst->flags));
+}
+
+void cgroup_freeze(struct cgroup *cgrp, bool freeze)
+{
+ struct cgroup_subsys_state *css;
+ struct cgroup *parent;
+ struct cgroup *dsct;
+ bool applied = false;
+ u64 ts_nsec;
+ bool old_e;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ /*
+ * Nothing changed? Just exit.
+ */
+ if (cgrp->freezer.freeze == freeze)
+ return;
+
+ cgrp->freezer.freeze = freeze;
+ ts_nsec = ktime_get_ns();
+
+ /*
+ * Propagate changes downwards the cgroup tree.
+ */
+ css_for_each_descendant_pre(css, &cgrp->self) {
+ dsct = css->cgroup;
+
+ if (cgroup_is_dead(dsct))
+ continue;
+
+ /*
+ * e_freeze is affected by parent's e_freeze and dst's freeze.
+ * If old e_freeze eq new e_freeze, no change, its children
+ * will not be affected. So do nothing and skip the subtree
+ */
+ old_e = dsct->freezer.e_freeze;
+ parent = cgroup_parent(dsct);
+ dsct->freezer.e_freeze = (dsct->freezer.freeze ||
+ parent->freezer.e_freeze);
+ if (dsct->freezer.e_freeze == old_e) {
+ css = css_rightmost_descendant(css);
+ continue;
+ }
+
+ /*
+ * Do change actual state: freeze or unfreeze.
+ */
+ cgroup_do_freeze(dsct, freeze, ts_nsec);
+ applied = true;
+ }
+
+ /*
+ * Even if the actual state hasn't changed, let's notify a user.
+ * The state can be enforced by an ancestor cgroup: the cgroup
+ * can already be in the desired state or it can be locked in the
+ * opposite state, so that the transition will never happen.
+ * In both cases it's better to notify a user, that there is
+ * nothing to wait for.
+ */
+ if (!applied) {
+ TRACE_CGROUP_PATH(notify_frozen, cgrp,
+ test_bit(CGRP_FROZEN, &cgrp->flags));
+ cgroup_file_notify(&cgrp->events_file);
+ }
+}
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup/legacy_freezer.c
index 92b98cc0ee76..915b02f65980 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup/legacy_freezer.c
@@ -22,6 +22,7 @@
#include <linux/freezer.h>
#include <linux/seq_file.h>
#include <linux/mutex.h>
+#include <linux/cpu.h>
/*
* A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is
@@ -62,7 +63,7 @@ static struct freezer *parent_freezer(struct freezer *freezer)
return css_freezer(freezer->css.parent);
}
-bool cgroup_freezing(struct task_struct *task)
+bool cgroup1_freezing(struct task_struct *task)
{
bool ret;
@@ -99,24 +100,25 @@ freezer_css_alloc(struct cgroup_subsys_state *parent_css)
* @css: css being created
*
* We're committing to creation of @css. Mark it online and inherit
- * parent's freezing state while holding both parent's and our
- * freezer->lock.
+ * parent's freezing state while holding cpus read lock and freezer_mutex.
*/
static int freezer_css_online(struct cgroup_subsys_state *css)
{
struct freezer *freezer = css_freezer(css);
struct freezer *parent = parent_freezer(freezer);
+ cpus_read_lock();
mutex_lock(&freezer_mutex);
freezer->state |= CGROUP_FREEZER_ONLINE;
if (parent && (parent->state & CGROUP_FREEZING)) {
freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN;
- atomic_inc(&system_freezing_cnt);
+ static_branch_inc_cpuslocked(&freezer_active);
}
mutex_unlock(&freezer_mutex);
+ cpus_read_unlock();
return 0;
}
@@ -124,21 +126,23 @@ static int freezer_css_online(struct cgroup_subsys_state *css)
* freezer_css_offline - initiate destruction of a freezer css
* @css: css being destroyed
*
- * @css is going away. Mark it dead and decrement system_freezing_count if
+ * @css is going away. Mark it dead and decrement freezer_active if
* it was holding one.
*/
static void freezer_css_offline(struct cgroup_subsys_state *css)
{
struct freezer *freezer = css_freezer(css);
+ cpus_read_lock();
mutex_lock(&freezer_mutex);
if (freezer->state & CGROUP_FREEZING)
- atomic_dec(&system_freezing_cnt);
+ static_branch_dec_cpuslocked(&freezer_active);
freezer->state = 0;
mutex_unlock(&freezer_mutex);
+ cpus_read_unlock();
}
static void freezer_css_free(struct cgroup_subsys_state *css)
@@ -155,12 +159,10 @@ static void freezer_css_free(struct cgroup_subsys_state *css)
* @freezer->lock. freezer_attach() makes the new tasks conform to the
* current state and all following state changes can see the new tasks.
*/
-static void freezer_attach(struct cgroup_subsys_state *new_css,
- struct cgroup_taskset *tset)
+static void freezer_attach(struct cgroup_taskset *tset)
{
- struct freezer *freezer = css_freezer(new_css);
struct task_struct *task;
- bool clear_frozen = false;
+ struct cgroup_subsys_state *new_css;
mutex_lock(&freezer_mutex);
@@ -174,22 +176,21 @@ static void freezer_attach(struct cgroup_subsys_state *new_css,
* current state before executing the following - !frozen tasks may
* be visible in a FROZEN cgroup and frozen tasks in a THAWED one.
*/
- cgroup_taskset_for_each(task, tset) {
+ cgroup_taskset_for_each(task, new_css, tset) {
+ struct freezer *freezer = css_freezer(new_css);
+
if (!(freezer->state & CGROUP_FREEZING)) {
__thaw_task(task);
} else {
+ /* clear FROZEN and propagate upwards */
+ while (freezer && (freezer->state & CGROUP_FROZEN)) {
+ freezer->state &= ~CGROUP_FROZEN;
+ freezer = parent_freezer(freezer);
+ }
freeze_task(task);
- freezer->state &= ~CGROUP_FROZEN;
- clear_frozen = true;
}
}
- /* propagate FROZEN clearing upwards */
- while (clear_frozen && (freezer = parent_freezer(freezer))) {
- freezer->state &= ~CGROUP_FROZEN;
- clear_frozen = freezer->state & CGROUP_FREEZING;
- }
-
mutex_unlock(&freezer_mutex);
}
@@ -271,19 +272,11 @@ static void update_if_frozen(struct cgroup_subsys_state *css)
rcu_read_unlock();
/* are all tasks frozen? */
- css_task_iter_start(css, &it);
+ css_task_iter_start(css, 0, &it);
while ((task = css_task_iter_next(&it))) {
- if (freezing(task)) {
- /*
- * freezer_should_skip() indicates that the task
- * should be skipped when determining freezing
- * completion. Consider it frozen in addition to
- * the usual frozen condition.
- */
- if (!frozen(task) && !freezer_should_skip(task))
- goto out_iter_end;
- }
+ if (freezing(task) && !frozen(task))
+ goto out_iter_end;
}
freezer->state |= CGROUP_FROZEN;
@@ -323,7 +316,7 @@ static void freeze_cgroup(struct freezer *freezer)
struct css_task_iter it;
struct task_struct *task;
- css_task_iter_start(&freezer->css, &it);
+ css_task_iter_start(&freezer->css, 0, &it);
while ((task = css_task_iter_next(&it)))
freeze_task(task);
css_task_iter_end(&it);
@@ -334,7 +327,7 @@ static void unfreeze_cgroup(struct freezer *freezer)
struct css_task_iter it;
struct task_struct *task;
- css_task_iter_start(&freezer->css, &it);
+ css_task_iter_start(&freezer->css, 0, &it);
while ((task = css_task_iter_next(&it)))
__thaw_task(task);
css_task_iter_end(&it);
@@ -360,7 +353,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze,
if (freeze) {
if (!(freezer->state & CGROUP_FREEZING))
- atomic_inc(&system_freezing_cnt);
+ static_branch_inc_cpuslocked(&freezer_active);
freezer->state |= state;
freeze_cgroup(freezer);
} else {
@@ -369,9 +362,9 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze,
freezer->state &= ~state;
if (!(freezer->state & CGROUP_FREEZING)) {
- if (was_freezing)
- atomic_dec(&system_freezing_cnt);
freezer->state &= ~CGROUP_FROZEN;
+ if (was_freezing)
+ static_branch_dec_cpuslocked(&freezer_active);
unfreeze_cgroup(freezer);
}
}
@@ -389,6 +382,7 @@ static void freezer_change_state(struct freezer *freezer, bool freeze)
{
struct cgroup_subsys_state *pos;
+ cpus_read_lock();
/*
* Update all its descendants in pre-order traversal. Each
* descendant will try to inherit its parent's FREEZING state as
@@ -417,6 +411,7 @@ static void freezer_change_state(struct freezer *freezer, bool freeze)
}
rcu_read_unlock();
mutex_unlock(&freezer_mutex);
+ cpus_read_unlock();
}
static ssize_t freezer_write(struct kernfs_open_file *of,
@@ -428,9 +423,11 @@ static ssize_t freezer_write(struct kernfs_open_file *of,
if (strcmp(buf, freezer_state_strs(0)) == 0)
freeze = false;
- else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0)
+ else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0) {
+ pr_info_once("Freezing with imperfect legacy cgroup freezer. "
+ "See cgroup.freeze of cgroup v2\n");
freeze = true;
- else
+ } else
return -EINVAL;
freezer_change_state(css_freezer(of_css(of)), freeze);
diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c
new file mode 100644
index 000000000000..6a01d91ea4cb
--- /dev/null
+++ b/kernel/cgroup/misc.c
@@ -0,0 +1,478 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Miscellaneous cgroup controller
+ *
+ * Copyright 2020 Google LLC
+ * Author: Vipin Sharma <vipinsh@google.com>
+ */
+
+#include <linux/limits.h>
+#include <linux/cgroup.h>
+#include <linux/errno.h>
+#include <linux/atomic.h>
+#include <linux/slab.h>
+#include <linux/misc_cgroup.h>
+
+#define MAX_STR "max"
+#define MAX_NUM U64_MAX
+
+/* Miscellaneous res name, keep it in sync with enum misc_res_type */
+static const char *const misc_res_name[] = {
+#ifdef CONFIG_KVM_AMD_SEV
+ /* AMD SEV ASIDs resource */
+ "sev",
+ /* AMD SEV-ES ASIDs resource */
+ "sev_es",
+#endif
+#ifdef CONFIG_INTEL_TDX_HOST
+ /* Intel TDX HKIDs resource */
+ "tdx",
+#endif
+};
+
+/* Root misc cgroup */
+static struct misc_cg root_cg;
+
+/*
+ * Miscellaneous resources capacity for the entire machine. 0 capacity means
+ * resource is not initialized or not present in the host.
+ *
+ * root_cg.max and capacity are independent of each other. root_cg.max can be
+ * more than the actual capacity. We are using Limits resource distribution
+ * model of cgroup for miscellaneous controller.
+ */
+static u64 misc_res_capacity[MISC_CG_RES_TYPES];
+
+/**
+ * parent_misc() - Get the parent of the passed misc cgroup.
+ * @cgroup: cgroup whose parent needs to be fetched.
+ *
+ * Context: Any context.
+ * Return:
+ * * struct misc_cg* - Parent of the @cgroup.
+ * * %NULL - If @cgroup is null or the passed cgroup does not have a parent.
+ */
+static struct misc_cg *parent_misc(struct misc_cg *cgroup)
+{
+ return cgroup ? css_misc(cgroup->css.parent) : NULL;
+}
+
+/**
+ * valid_type() - Check if @type is valid or not.
+ * @type: misc res type.
+ *
+ * Context: Any context.
+ * Return:
+ * * true - If valid type.
+ * * false - If not valid type.
+ */
+static inline bool valid_type(enum misc_res_type type)
+{
+ return type >= 0 && type < MISC_CG_RES_TYPES;
+}
+
+/**
+ * misc_cg_set_capacity() - Set the capacity of the misc cgroup res.
+ * @type: Type of the misc res.
+ * @capacity: Supported capacity of the misc res on the host.
+ *
+ * If capacity is 0 then the charging a misc cgroup fails for that type.
+ *
+ * Context: Any context.
+ * Return:
+ * * %0 - Successfully registered the capacity.
+ * * %-EINVAL - If @type is invalid.
+ */
+int misc_cg_set_capacity(enum misc_res_type type, u64 capacity)
+{
+ if (!valid_type(type))
+ return -EINVAL;
+
+ WRITE_ONCE(misc_res_capacity[type], capacity);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(misc_cg_set_capacity);
+
+/**
+ * misc_cg_cancel_charge() - Cancel the charge from the misc cgroup.
+ * @type: Misc res type in misc cg to cancel the charge from.
+ * @cg: Misc cgroup to cancel charge from.
+ * @amount: Amount to cancel.
+ *
+ * Context: Any context.
+ */
+static void misc_cg_cancel_charge(enum misc_res_type type, struct misc_cg *cg,
+ u64 amount)
+{
+ WARN_ONCE(atomic64_add_negative(-amount, &cg->res[type].usage),
+ "misc cgroup resource %s became less than 0",
+ misc_res_name[type]);
+}
+
+static void misc_cg_update_watermark(struct misc_res *res, u64 new_usage)
+{
+ u64 old;
+
+ while (true) {
+ old = atomic64_read(&res->watermark);
+ if (new_usage <= old)
+ break;
+ if (atomic64_cmpxchg(&res->watermark, old, new_usage) == old)
+ break;
+ }
+}
+
+static void misc_cg_event(enum misc_res_type type, struct misc_cg *cg)
+{
+ atomic64_inc(&cg->res[type].events_local);
+ cgroup_file_notify(&cg->events_local_file);
+
+ for (; parent_misc(cg); cg = parent_misc(cg)) {
+ atomic64_inc(&cg->res[type].events);
+ cgroup_file_notify(&cg->events_file);
+ }
+}
+
+/**
+ * misc_cg_try_charge() - Try charging the misc cgroup.
+ * @type: Misc res type to charge.
+ * @cg: Misc cgroup which will be charged.
+ * @amount: Amount to charge.
+ *
+ * Charge @amount to the misc cgroup. Caller must use the same cgroup during
+ * the uncharge call.
+ *
+ * Context: Any context.
+ * Return:
+ * * %0 - If successfully charged.
+ * * -EINVAL - If @type is invalid or misc res has 0 capacity.
+ * * -EBUSY - If max limit will be crossed or total usage will be more than the
+ * capacity.
+ */
+int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, u64 amount)
+{
+ struct misc_cg *i, *j;
+ int ret;
+ struct misc_res *res;
+ u64 new_usage;
+
+ if (!(valid_type(type) && cg && READ_ONCE(misc_res_capacity[type])))
+ return -EINVAL;
+
+ if (!amount)
+ return 0;
+
+ for (i = cg; i; i = parent_misc(i)) {
+ res = &i->res[type];
+
+ new_usage = atomic64_add_return(amount, &res->usage);
+ if (new_usage > READ_ONCE(res->max) ||
+ new_usage > READ_ONCE(misc_res_capacity[type])) {
+ ret = -EBUSY;
+ goto err_charge;
+ }
+ misc_cg_update_watermark(res, new_usage);
+ }
+ return 0;
+
+err_charge:
+ misc_cg_event(type, i);
+
+ for (j = cg; j != i; j = parent_misc(j))
+ misc_cg_cancel_charge(type, j, amount);
+ misc_cg_cancel_charge(type, i, amount);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(misc_cg_try_charge);
+
+/**
+ * misc_cg_uncharge() - Uncharge the misc cgroup.
+ * @type: Misc res type which was charged.
+ * @cg: Misc cgroup which will be uncharged.
+ * @amount: Charged amount.
+ *
+ * Context: Any context.
+ */
+void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg, u64 amount)
+{
+ struct misc_cg *i;
+
+ if (!(amount && valid_type(type) && cg))
+ return;
+
+ for (i = cg; i; i = parent_misc(i))
+ misc_cg_cancel_charge(type, i, amount);
+}
+EXPORT_SYMBOL_GPL(misc_cg_uncharge);
+
+/**
+ * misc_cg_max_show() - Show the misc cgroup max limit.
+ * @sf: Interface file
+ * @v: Arguments passed
+ *
+ * Context: Any context.
+ * Return: 0 to denote successful print.
+ */
+static int misc_cg_max_show(struct seq_file *sf, void *v)
+{
+ int i;
+ struct misc_cg *cg = css_misc(seq_css(sf));
+ u64 max;
+
+ for (i = 0; i < MISC_CG_RES_TYPES; i++) {
+ if (READ_ONCE(misc_res_capacity[i])) {
+ max = READ_ONCE(cg->res[i].max);
+ if (max == MAX_NUM)
+ seq_printf(sf, "%s max\n", misc_res_name[i]);
+ else
+ seq_printf(sf, "%s %llu\n", misc_res_name[i],
+ max);
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * misc_cg_max_write() - Update the maximum limit of the cgroup.
+ * @of: Handler for the file.
+ * @buf: Data from the user. It should be either "max", 0, or a positive
+ * integer.
+ * @nbytes: Number of bytes of the data.
+ * @off: Offset in the file.
+ *
+ * User can pass data like:
+ * echo sev 23 > misc.max, OR
+ * echo sev max > misc.max
+ *
+ * Context: Any context.
+ * Return:
+ * * >= 0 - Number of bytes processed in the input.
+ * * -EINVAL - If buf is not valid.
+ * * -ERANGE - If number is bigger than the u64 capacity.
+ */
+static ssize_t misc_cg_max_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct misc_cg *cg;
+ u64 max;
+ int ret = 0, i;
+ enum misc_res_type type = MISC_CG_RES_TYPES;
+ char *token;
+
+ buf = strstrip(buf);
+ token = strsep(&buf, " ");
+
+ if (!token || !buf)
+ return -EINVAL;
+
+ for (i = 0; i < MISC_CG_RES_TYPES; i++) {
+ if (!strcmp(misc_res_name[i], token)) {
+ type = i;
+ break;
+ }
+ }
+
+ if (type == MISC_CG_RES_TYPES)
+ return -EINVAL;
+
+ if (!strcmp(MAX_STR, buf)) {
+ max = MAX_NUM;
+ } else {
+ ret = kstrtou64(buf, 0, &max);
+ if (ret)
+ return ret;
+ }
+
+ cg = css_misc(of_css(of));
+
+ if (READ_ONCE(misc_res_capacity[type]))
+ WRITE_ONCE(cg->res[type].max, max);
+ else
+ ret = -EINVAL;
+
+ return ret ? ret : nbytes;
+}
+
+/**
+ * misc_cg_current_show() - Show the current usage of the misc cgroup.
+ * @sf: Interface file
+ * @v: Arguments passed
+ *
+ * Context: Any context.
+ * Return: 0 to denote successful print.
+ */
+static int misc_cg_current_show(struct seq_file *sf, void *v)
+{
+ int i;
+ u64 usage;
+ struct misc_cg *cg = css_misc(seq_css(sf));
+
+ for (i = 0; i < MISC_CG_RES_TYPES; i++) {
+ usage = atomic64_read(&cg->res[i].usage);
+ if (READ_ONCE(misc_res_capacity[i]) || usage)
+ seq_printf(sf, "%s %llu\n", misc_res_name[i], usage);
+ }
+
+ return 0;
+}
+
+/**
+ * misc_cg_peak_show() - Show the peak usage of the misc cgroup.
+ * @sf: Interface file
+ * @v: Arguments passed
+ *
+ * Context: Any context.
+ * Return: 0 to denote successful print.
+ */
+static int misc_cg_peak_show(struct seq_file *sf, void *v)
+{
+ int i;
+ u64 watermark;
+ struct misc_cg *cg = css_misc(seq_css(sf));
+
+ for (i = 0; i < MISC_CG_RES_TYPES; i++) {
+ watermark = atomic64_read(&cg->res[i].watermark);
+ if (READ_ONCE(misc_res_capacity[i]) || watermark)
+ seq_printf(sf, "%s %llu\n", misc_res_name[i], watermark);
+ }
+
+ return 0;
+}
+
+/**
+ * misc_cg_capacity_show() - Show the total capacity of misc res on the host.
+ * @sf: Interface file
+ * @v: Arguments passed
+ *
+ * Only present in the root cgroup directory.
+ *
+ * Context: Any context.
+ * Return: 0 to denote successful print.
+ */
+static int misc_cg_capacity_show(struct seq_file *sf, void *v)
+{
+ int i;
+ u64 cap;
+
+ for (i = 0; i < MISC_CG_RES_TYPES; i++) {
+ cap = READ_ONCE(misc_res_capacity[i]);
+ if (cap)
+ seq_printf(sf, "%s %llu\n", misc_res_name[i], cap);
+ }
+
+ return 0;
+}
+
+static int __misc_events_show(struct seq_file *sf, bool local)
+{
+ struct misc_cg *cg = css_misc(seq_css(sf));
+ u64 events;
+ int i;
+
+ for (i = 0; i < MISC_CG_RES_TYPES; i++) {
+ if (local)
+ events = atomic64_read(&cg->res[i].events_local);
+ else
+ events = atomic64_read(&cg->res[i].events);
+ if (READ_ONCE(misc_res_capacity[i]) || events)
+ seq_printf(sf, "%s.max %llu\n", misc_res_name[i], events);
+ }
+ return 0;
+}
+
+static int misc_events_show(struct seq_file *sf, void *v)
+{
+ return __misc_events_show(sf, false);
+}
+
+static int misc_events_local_show(struct seq_file *sf, void *v)
+{
+ return __misc_events_show(sf, true);
+}
+
+/* Misc cgroup interface files */
+static struct cftype misc_cg_files[] = {
+ {
+ .name = "max",
+ .write = misc_cg_max_write,
+ .seq_show = misc_cg_max_show,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ {
+ .name = "current",
+ .seq_show = misc_cg_current_show,
+ },
+ {
+ .name = "peak",
+ .seq_show = misc_cg_peak_show,
+ },
+ {
+ .name = "capacity",
+ .seq_show = misc_cg_capacity_show,
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ },
+ {
+ .name = "events",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .file_offset = offsetof(struct misc_cg, events_file),
+ .seq_show = misc_events_show,
+ },
+ {
+ .name = "events.local",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .file_offset = offsetof(struct misc_cg, events_local_file),
+ .seq_show = misc_events_local_show,
+ },
+ {}
+};
+
+/**
+ * misc_cg_alloc() - Allocate misc cgroup.
+ * @parent_css: Parent cgroup.
+ *
+ * Context: Process context.
+ * Return:
+ * * struct cgroup_subsys_state* - css of the allocated cgroup.
+ * * ERR_PTR(-ENOMEM) - No memory available to allocate.
+ */
+static struct cgroup_subsys_state *
+misc_cg_alloc(struct cgroup_subsys_state *parent_css)
+{
+ enum misc_res_type i;
+ struct misc_cg *cg;
+
+ if (!parent_css) {
+ cg = &root_cg;
+ } else {
+ cg = kzalloc(sizeof(*cg), GFP_KERNEL);
+ if (!cg)
+ return ERR_PTR(-ENOMEM);
+ }
+
+ for (i = 0; i < MISC_CG_RES_TYPES; i++) {
+ WRITE_ONCE(cg->res[i].max, MAX_NUM);
+ atomic64_set(&cg->res[i].usage, 0);
+ }
+
+ return &cg->css;
+}
+
+/**
+ * misc_cg_free() - Free the misc cgroup.
+ * @css: cgroup subsys object.
+ *
+ * Context: Any context.
+ */
+static void misc_cg_free(struct cgroup_subsys_state *css)
+{
+ kfree(css_misc(css));
+}
+
+/* Cgroup controller callbacks */
+struct cgroup_subsys misc_cgrp_subsys = {
+ .css_alloc = misc_cg_alloc,
+ .css_free = misc_cg_free,
+ .legacy_cftypes = misc_cg_files,
+ .dfl_cftypes = misc_cg_files,
+};
diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c
new file mode 100644
index 000000000000..db9617556dd7
--- /dev/null
+++ b/kernel/cgroup/namespace.c
@@ -0,0 +1,144 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "cgroup-internal.h"
+
+#include <linux/sched/task.h>
+#include <linux/slab.h>
+#include <linux/nsproxy.h>
+#include <linux/proc_ns.h>
+#include <linux/nstree.h>
+
+/* cgroup namespaces */
+
+static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns)
+{
+ return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES);
+}
+
+static void dec_cgroup_namespaces(struct ucounts *ucounts)
+{
+ dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES);
+}
+
+static struct cgroup_namespace *alloc_cgroup_ns(void)
+{
+ struct cgroup_namespace *new_ns __free(kfree) = NULL;
+ int ret;
+
+ new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL_ACCOUNT);
+ if (!new_ns)
+ return ERR_PTR(-ENOMEM);
+ ret = ns_common_init(new_ns);
+ if (ret)
+ return ERR_PTR(ret);
+ return no_free_ptr(new_ns);
+}
+
+void free_cgroup_ns(struct cgroup_namespace *ns)
+{
+ ns_tree_remove(ns);
+ put_css_set(ns->root_cset);
+ dec_cgroup_namespaces(ns->ucounts);
+ put_user_ns(ns->user_ns);
+ ns_common_free(ns);
+ /* Concurrent nstree traversal depends on a grace period. */
+ kfree_rcu(ns, ns.ns_rcu);
+}
+EXPORT_SYMBOL(free_cgroup_ns);
+
+struct cgroup_namespace *copy_cgroup_ns(u64 flags,
+ struct user_namespace *user_ns,
+ struct cgroup_namespace *old_ns)
+{
+ struct cgroup_namespace *new_ns;
+ struct ucounts *ucounts;
+ struct css_set *cset;
+
+ BUG_ON(!old_ns);
+
+ if (!(flags & CLONE_NEWCGROUP)) {
+ get_cgroup_ns(old_ns);
+ return old_ns;
+ }
+
+ /* Allow only sysadmin to create cgroup namespace. */
+ if (!ns_capable(user_ns, CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+
+ ucounts = inc_cgroup_namespaces(user_ns);
+ if (!ucounts)
+ return ERR_PTR(-ENOSPC);
+
+ /* It is not safe to take cgroup_mutex here */
+ spin_lock_irq(&css_set_lock);
+ cset = task_css_set(current);
+ get_css_set(cset);
+ spin_unlock_irq(&css_set_lock);
+
+ new_ns = alloc_cgroup_ns();
+ if (IS_ERR(new_ns)) {
+ put_css_set(cset);
+ dec_cgroup_namespaces(ucounts);
+ return new_ns;
+ }
+
+ new_ns->user_ns = get_user_ns(user_ns);
+ new_ns->ucounts = ucounts;
+ new_ns->root_cset = cset;
+
+ ns_tree_add(new_ns);
+ return new_ns;
+}
+
+static int cgroupns_install(struct nsset *nsset, struct ns_common *ns)
+{
+ struct nsproxy *nsproxy = nsset->nsproxy;
+ struct cgroup_namespace *cgroup_ns = to_cg_ns(ns);
+
+ if (!ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN) ||
+ !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN))
+ return -EPERM;
+
+ /* Don't need to do anything if we are attaching to our own cgroupns. */
+ if (cgroup_ns == nsproxy->cgroup_ns)
+ return 0;
+
+ get_cgroup_ns(cgroup_ns);
+ put_cgroup_ns(nsproxy->cgroup_ns);
+ nsproxy->cgroup_ns = cgroup_ns;
+
+ return 0;
+}
+
+static struct ns_common *cgroupns_get(struct task_struct *task)
+{
+ struct cgroup_namespace *ns = NULL;
+ struct nsproxy *nsproxy;
+
+ task_lock(task);
+ nsproxy = task->nsproxy;
+ if (nsproxy) {
+ ns = nsproxy->cgroup_ns;
+ get_cgroup_ns(ns);
+ }
+ task_unlock(task);
+
+ return ns ? &ns->ns : NULL;
+}
+
+static void cgroupns_put(struct ns_common *ns)
+{
+ put_cgroup_ns(to_cg_ns(ns));
+}
+
+static struct user_namespace *cgroupns_owner(struct ns_common *ns)
+{
+ return to_cg_ns(ns)->user_ns;
+}
+
+const struct proc_ns_operations cgroupns_operations = {
+ .name = "cgroup",
+ .get = cgroupns_get,
+ .put = cgroupns_put,
+ .install = cgroupns_install,
+ .owner = cgroupns_owner,
+};
diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c
new file mode 100644
index 000000000000..8f61114c36dd
--- /dev/null
+++ b/kernel/cgroup/pids.c
@@ -0,0 +1,460 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Process number limiting controller for cgroups.
+ *
+ * Used to allow a cgroup hierarchy to stop any new processes from fork()ing
+ * after a certain limit is reached.
+ *
+ * Since it is trivial to hit the task limit without hitting any kmemcg limits
+ * in place, PIDs are a fundamental resource. As such, PID exhaustion must be
+ * preventable in the scope of a cgroup hierarchy by allowing resource limiting
+ * of the number of tasks in a cgroup.
+ *
+ * In order to use the `pids` controller, set the maximum number of tasks in
+ * pids.max (this is not available in the root cgroup for obvious reasons). The
+ * number of processes currently in the cgroup is given by pids.current.
+ * Organisational operations are not blocked by cgroup policies, so it is
+ * possible to have pids.current > pids.max. However, it is not possible to
+ * violate a cgroup policy through fork(). fork() will return -EAGAIN if forking
+ * would cause a cgroup policy to be violated.
+ *
+ * To set a cgroup to have no limit, set pids.max to "max". This is the default
+ * for all new cgroups (N.B. that PID limits are hierarchical, so the most
+ * stringent limit in the hierarchy is followed).
+ *
+ * pids.current tracks all child cgroup hierarchies, so parent/pids.current is
+ * a superset of parent/child/pids.current.
+ *
+ * Copyright (C) 2015 Aleksa Sarai <cyphar@cyphar.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/threads.h>
+#include <linux/atomic.h>
+#include <linux/cgroup.h>
+#include <linux/slab.h>
+#include <linux/sched/task.h>
+
+#define PIDS_MAX (PID_MAX_LIMIT + 1ULL)
+#define PIDS_MAX_STR "max"
+
+enum pidcg_event {
+ /* Fork failed in subtree because this pids_cgroup limit was hit. */
+ PIDCG_MAX,
+ /* Fork failed in this pids_cgroup because ancestor limit was hit. */
+ PIDCG_FORKFAIL,
+ NR_PIDCG_EVENTS,
+};
+
+struct pids_cgroup {
+ struct cgroup_subsys_state css;
+
+ /*
+ * Use 64-bit types so that we can safely represent "max" as
+ * %PIDS_MAX = (%PID_MAX_LIMIT + 1).
+ */
+ atomic64_t counter;
+ atomic64_t limit;
+ int64_t watermark;
+
+ /* Handles for pids.events[.local] */
+ struct cgroup_file events_file;
+ struct cgroup_file events_local_file;
+
+ atomic64_t events[NR_PIDCG_EVENTS];
+ atomic64_t events_local[NR_PIDCG_EVENTS];
+};
+
+static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css)
+{
+ return container_of(css, struct pids_cgroup, css);
+}
+
+static struct pids_cgroup *parent_pids(struct pids_cgroup *pids)
+{
+ return css_pids(pids->css.parent);
+}
+
+static struct cgroup_subsys_state *
+pids_css_alloc(struct cgroup_subsys_state *parent)
+{
+ struct pids_cgroup *pids;
+
+ pids = kzalloc(sizeof(struct pids_cgroup), GFP_KERNEL);
+ if (!pids)
+ return ERR_PTR(-ENOMEM);
+
+ atomic64_set(&pids->limit, PIDS_MAX);
+ return &pids->css;
+}
+
+static void pids_css_free(struct cgroup_subsys_state *css)
+{
+ kfree(css_pids(css));
+}
+
+static void pids_update_watermark(struct pids_cgroup *p, int64_t nr_pids)
+{
+ /*
+ * This is racy, but we don't need perfectly accurate tallying of
+ * the watermark, and this lets us avoid extra atomic overhead.
+ */
+ if (nr_pids > READ_ONCE(p->watermark))
+ WRITE_ONCE(p->watermark, nr_pids);
+}
+
+/**
+ * pids_cancel - uncharge the local pid count
+ * @pids: the pid cgroup state
+ * @num: the number of pids to cancel
+ *
+ * This function will WARN if the pid count goes under 0, because such a case is
+ * a bug in the pids controller proper.
+ */
+static void pids_cancel(struct pids_cgroup *pids, int num)
+{
+ /*
+ * A negative count (or overflow for that matter) is invalid,
+ * and indicates a bug in the `pids` controller proper.
+ */
+ WARN_ON_ONCE(atomic64_add_negative(-num, &pids->counter));
+}
+
+/**
+ * pids_uncharge - hierarchically uncharge the pid count
+ * @pids: the pid cgroup state
+ * @num: the number of pids to uncharge
+ */
+static void pids_uncharge(struct pids_cgroup *pids, int num)
+{
+ struct pids_cgroup *p;
+
+ for (p = pids; parent_pids(p); p = parent_pids(p))
+ pids_cancel(p, num);
+}
+
+/**
+ * pids_charge - hierarchically charge the pid count
+ * @pids: the pid cgroup state
+ * @num: the number of pids to charge
+ *
+ * This function does *not* follow the pid limit set. It cannot fail and the new
+ * pid count may exceed the limit. This is only used for reverting failed
+ * attaches, where there is no other way out than violating the limit.
+ */
+static void pids_charge(struct pids_cgroup *pids, int num)
+{
+ struct pids_cgroup *p;
+
+ for (p = pids; parent_pids(p); p = parent_pids(p)) {
+ int64_t new = atomic64_add_return(num, &p->counter);
+
+ pids_update_watermark(p, new);
+ }
+}
+
+/**
+ * pids_try_charge - hierarchically try to charge the pid count
+ * @pids: the pid cgroup state
+ * @num: the number of pids to charge
+ * @fail: storage of pid cgroup causing the fail
+ *
+ * This function follows the set limit. It will fail if the charge would cause
+ * the new value to exceed the hierarchical limit. Returns 0 if the charge
+ * succeeded, otherwise -EAGAIN.
+ */
+static int pids_try_charge(struct pids_cgroup *pids, int num, struct pids_cgroup **fail)
+{
+ struct pids_cgroup *p, *q;
+
+ for (p = pids; parent_pids(p); p = parent_pids(p)) {
+ int64_t new = atomic64_add_return(num, &p->counter);
+ int64_t limit = atomic64_read(&p->limit);
+
+ /*
+ * Since new is capped to the maximum number of pid_t, if
+ * p->limit is %PIDS_MAX then we know that this test will never
+ * fail.
+ */
+ if (new > limit) {
+ *fail = p;
+ goto revert;
+ }
+ /*
+ * Not technically accurate if we go over limit somewhere up
+ * the hierarchy, but that's tolerable for the watermark.
+ */
+ pids_update_watermark(p, new);
+ }
+
+ return 0;
+
+revert:
+ for (q = pids; q != p; q = parent_pids(q))
+ pids_cancel(q, num);
+ pids_cancel(p, num);
+
+ return -EAGAIN;
+}
+
+static int pids_can_attach(struct cgroup_taskset *tset)
+{
+ struct task_struct *task;
+ struct cgroup_subsys_state *dst_css;
+
+ cgroup_taskset_for_each(task, dst_css, tset) {
+ struct pids_cgroup *pids = css_pids(dst_css);
+ struct cgroup_subsys_state *old_css;
+ struct pids_cgroup *old_pids;
+
+ /*
+ * No need to pin @old_css between here and cancel_attach()
+ * because cgroup core protects it from being freed before
+ * the migration completes or fails.
+ */
+ old_css = task_css(task, pids_cgrp_id);
+ old_pids = css_pids(old_css);
+
+ pids_charge(pids, 1);
+ pids_uncharge(old_pids, 1);
+ }
+
+ return 0;
+}
+
+static void pids_cancel_attach(struct cgroup_taskset *tset)
+{
+ struct task_struct *task;
+ struct cgroup_subsys_state *dst_css;
+
+ cgroup_taskset_for_each(task, dst_css, tset) {
+ struct pids_cgroup *pids = css_pids(dst_css);
+ struct cgroup_subsys_state *old_css;
+ struct pids_cgroup *old_pids;
+
+ old_css = task_css(task, pids_cgrp_id);
+ old_pids = css_pids(old_css);
+
+ pids_charge(old_pids, 1);
+ pids_uncharge(pids, 1);
+ }
+}
+
+static void pids_event(struct pids_cgroup *pids_forking,
+ struct pids_cgroup *pids_over_limit)
+{
+ struct pids_cgroup *p = pids_forking;
+
+ /* Only log the first time limit is hit. */
+ if (atomic64_inc_return(&p->events_local[PIDCG_FORKFAIL]) == 1) {
+ pr_info("cgroup: fork rejected by pids controller in ");
+ pr_cont_cgroup_path(p->css.cgroup);
+ pr_cont("\n");
+ }
+ if (!cgroup_subsys_on_dfl(pids_cgrp_subsys) ||
+ cgrp_dfl_root.flags & CGRP_ROOT_PIDS_LOCAL_EVENTS) {
+ cgroup_file_notify(&p->events_local_file);
+ return;
+ }
+
+ atomic64_inc(&pids_over_limit->events_local[PIDCG_MAX]);
+ cgroup_file_notify(&pids_over_limit->events_local_file);
+
+ for (p = pids_over_limit; parent_pids(p); p = parent_pids(p)) {
+ atomic64_inc(&p->events[PIDCG_MAX]);
+ cgroup_file_notify(&p->events_file);
+ }
+}
+
+/*
+ * task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies
+ * on cgroup_threadgroup_change_begin() held by the copy_process().
+ */
+static int pids_can_fork(struct task_struct *task, struct css_set *cset)
+{
+ struct pids_cgroup *pids, *pids_over_limit;
+ int err;
+
+ pids = css_pids(cset->subsys[pids_cgrp_id]);
+ err = pids_try_charge(pids, 1, &pids_over_limit);
+ if (err)
+ pids_event(pids, pids_over_limit);
+
+ return err;
+}
+
+static void pids_cancel_fork(struct task_struct *task, struct css_set *cset)
+{
+ struct pids_cgroup *pids;
+
+ pids = css_pids(cset->subsys[pids_cgrp_id]);
+ pids_uncharge(pids, 1);
+}
+
+static void pids_release(struct task_struct *task)
+{
+ struct pids_cgroup *pids = css_pids(task_css(task, pids_cgrp_id));
+
+ pids_uncharge(pids, 1);
+}
+
+static ssize_t pids_max_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct cgroup_subsys_state *css = of_css(of);
+ struct pids_cgroup *pids = css_pids(css);
+ int64_t limit;
+ int err;
+
+ buf = strstrip(buf);
+ if (!strcmp(buf, PIDS_MAX_STR)) {
+ limit = PIDS_MAX;
+ goto set_limit;
+ }
+
+ err = kstrtoll(buf, 0, &limit);
+ if (err)
+ return err;
+
+ if (limit < 0 || limit >= PIDS_MAX)
+ return -EINVAL;
+
+set_limit:
+ /*
+ * Limit updates don't need to be mutex'd, since it isn't
+ * critical that any racing fork()s follow the new limit.
+ */
+ atomic64_set(&pids->limit, limit);
+ return nbytes;
+}
+
+static int pids_max_show(struct seq_file *sf, void *v)
+{
+ struct cgroup_subsys_state *css = seq_css(sf);
+ struct pids_cgroup *pids = css_pids(css);
+ int64_t limit = atomic64_read(&pids->limit);
+
+ if (limit >= PIDS_MAX)
+ seq_printf(sf, "%s\n", PIDS_MAX_STR);
+ else
+ seq_printf(sf, "%lld\n", limit);
+
+ return 0;
+}
+
+static s64 pids_current_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct pids_cgroup *pids = css_pids(css);
+
+ return atomic64_read(&pids->counter);
+}
+
+static s64 pids_peak_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct pids_cgroup *pids = css_pids(css);
+
+ return READ_ONCE(pids->watermark);
+}
+
+static int __pids_events_show(struct seq_file *sf, bool local)
+{
+ struct pids_cgroup *pids = css_pids(seq_css(sf));
+ enum pidcg_event pe = PIDCG_MAX;
+ atomic64_t *events;
+
+ if (!cgroup_subsys_on_dfl(pids_cgrp_subsys) ||
+ cgrp_dfl_root.flags & CGRP_ROOT_PIDS_LOCAL_EVENTS) {
+ pe = PIDCG_FORKFAIL;
+ local = true;
+ }
+ events = local ? pids->events_local : pids->events;
+
+ seq_printf(sf, "max %lld\n", (s64)atomic64_read(&events[pe]));
+ return 0;
+}
+
+static int pids_events_show(struct seq_file *sf, void *v)
+{
+ __pids_events_show(sf, false);
+ return 0;
+}
+
+static int pids_events_local_show(struct seq_file *sf, void *v)
+{
+ __pids_events_show(sf, true);
+ return 0;
+}
+
+static struct cftype pids_files[] = {
+ {
+ .name = "max",
+ .write = pids_max_write,
+ .seq_show = pids_max_show,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ {
+ .name = "current",
+ .read_s64 = pids_current_read,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ {
+ .name = "peak",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_s64 = pids_peak_read,
+ },
+ {
+ .name = "events",
+ .seq_show = pids_events_show,
+ .file_offset = offsetof(struct pids_cgroup, events_file),
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ {
+ .name = "events.local",
+ .seq_show = pids_events_local_show,
+ .file_offset = offsetof(struct pids_cgroup, events_local_file),
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ { } /* terminate */
+};
+
+static struct cftype pids_files_legacy[] = {
+ {
+ .name = "max",
+ .write = pids_max_write,
+ .seq_show = pids_max_show,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ {
+ .name = "current",
+ .read_s64 = pids_current_read,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ {
+ .name = "peak",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_s64 = pids_peak_read,
+ },
+ {
+ .name = "events",
+ .seq_show = pids_events_show,
+ .file_offset = offsetof(struct pids_cgroup, events_file),
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ { } /* terminate */
+};
+
+
+struct cgroup_subsys pids_cgrp_subsys = {
+ .css_alloc = pids_css_alloc,
+ .css_free = pids_css_free,
+ .can_attach = pids_can_attach,
+ .cancel_attach = pids_cancel_attach,
+ .can_fork = pids_can_fork,
+ .cancel_fork = pids_cancel_fork,
+ .release = pids_release,
+ .legacy_cftypes = pids_files_legacy,
+ .dfl_cftypes = pids_files,
+ .threaded = true,
+};
diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c
new file mode 100644
index 000000000000..ef5878fb2005
--- /dev/null
+++ b/kernel/cgroup/rdma.c
@@ -0,0 +1,612 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * RDMA resource limiting controller for cgroups.
+ *
+ * Used to allow a cgroup hierarchy to stop processes from consuming
+ * additional RDMA resources after a certain limit is reached.
+ *
+ * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
+ */
+
+#include <linux/bitops.h>
+#include <linux/slab.h>
+#include <linux/seq_file.h>
+#include <linux/cgroup.h>
+#include <linux/parser.h>
+#include <linux/cgroup_rdma.h>
+
+#define RDMACG_MAX_STR "max"
+
+/*
+ * Protects list of resource pools maintained on per cgroup basis
+ * and rdma device list.
+ */
+static DEFINE_MUTEX(rdmacg_mutex);
+static LIST_HEAD(rdmacg_devices);
+
+enum rdmacg_file_type {
+ RDMACG_RESOURCE_TYPE_MAX,
+ RDMACG_RESOURCE_TYPE_STAT,
+};
+
+/*
+ * resource table definition as to be seen by the user.
+ * Need to add entries to it when more resources are
+ * added/defined at IB verb/core layer.
+ */
+static char const *rdmacg_resource_names[] = {
+ [RDMACG_RESOURCE_HCA_HANDLE] = "hca_handle",
+ [RDMACG_RESOURCE_HCA_OBJECT] = "hca_object",
+};
+
+/* resource tracker for each resource of rdma cgroup */
+struct rdmacg_resource {
+ int max;
+ int usage;
+};
+
+/*
+ * resource pool object which represents per cgroup, per device
+ * resources. There are multiple instances of this object per cgroup,
+ * therefore it cannot be embedded within rdma_cgroup structure. It
+ * is maintained as list.
+ */
+struct rdmacg_resource_pool {
+ struct rdmacg_device *device;
+ struct rdmacg_resource resources[RDMACG_RESOURCE_MAX];
+
+ struct list_head cg_node;
+ struct list_head dev_node;
+
+ /* count active user tasks of this pool */
+ u64 usage_sum;
+ /* total number counts which are set to max */
+ int num_max_cnt;
+};
+
+static struct rdma_cgroup *css_rdmacg(struct cgroup_subsys_state *css)
+{
+ return container_of(css, struct rdma_cgroup, css);
+}
+
+static struct rdma_cgroup *parent_rdmacg(struct rdma_cgroup *cg)
+{
+ return css_rdmacg(cg->css.parent);
+}
+
+static inline struct rdma_cgroup *get_current_rdmacg(void)
+{
+ return css_rdmacg(task_get_css(current, rdma_cgrp_id));
+}
+
+static void set_resource_limit(struct rdmacg_resource_pool *rpool,
+ int index, int new_max)
+{
+ if (new_max == S32_MAX) {
+ if (rpool->resources[index].max != S32_MAX)
+ rpool->num_max_cnt++;
+ } else {
+ if (rpool->resources[index].max == S32_MAX)
+ rpool->num_max_cnt--;
+ }
+ rpool->resources[index].max = new_max;
+}
+
+static void set_all_resource_max_limit(struct rdmacg_resource_pool *rpool)
+{
+ int i;
+
+ for (i = 0; i < RDMACG_RESOURCE_MAX; i++)
+ set_resource_limit(rpool, i, S32_MAX);
+}
+
+static void free_cg_rpool_locked(struct rdmacg_resource_pool *rpool)
+{
+ lockdep_assert_held(&rdmacg_mutex);
+
+ list_del(&rpool->cg_node);
+ list_del(&rpool->dev_node);
+ kfree(rpool);
+}
+
+static struct rdmacg_resource_pool *
+find_cg_rpool_locked(struct rdma_cgroup *cg,
+ struct rdmacg_device *device)
+
+{
+ struct rdmacg_resource_pool *pool;
+
+ lockdep_assert_held(&rdmacg_mutex);
+
+ list_for_each_entry(pool, &cg->rpools, cg_node)
+ if (pool->device == device)
+ return pool;
+
+ return NULL;
+}
+
+static struct rdmacg_resource_pool *
+get_cg_rpool_locked(struct rdma_cgroup *cg, struct rdmacg_device *device)
+{
+ struct rdmacg_resource_pool *rpool;
+
+ rpool = find_cg_rpool_locked(cg, device);
+ if (rpool)
+ return rpool;
+
+ rpool = kzalloc(sizeof(*rpool), GFP_KERNEL);
+ if (!rpool)
+ return ERR_PTR(-ENOMEM);
+
+ rpool->device = device;
+ set_all_resource_max_limit(rpool);
+
+ INIT_LIST_HEAD(&rpool->cg_node);
+ INIT_LIST_HEAD(&rpool->dev_node);
+ list_add_tail(&rpool->cg_node, &cg->rpools);
+ list_add_tail(&rpool->dev_node, &device->rpools);
+ return rpool;
+}
+
+/**
+ * uncharge_cg_locked - uncharge resource for rdma cgroup
+ * @cg: pointer to cg to uncharge and all parents in hierarchy
+ * @device: pointer to rdmacg device
+ * @index: index of the resource to uncharge in cg (resource pool)
+ *
+ * It also frees the resource pool which was created as part of
+ * charging operation when there are no resources attached to
+ * resource pool.
+ */
+static void
+uncharge_cg_locked(struct rdma_cgroup *cg,
+ struct rdmacg_device *device,
+ enum rdmacg_resource_type index)
+{
+ struct rdmacg_resource_pool *rpool;
+
+ rpool = find_cg_rpool_locked(cg, device);
+
+ /*
+ * rpool cannot be null at this stage. Let kernel operate in case
+ * if there a bug in IB stack or rdma controller, instead of crashing
+ * the system.
+ */
+ if (unlikely(!rpool)) {
+ pr_warn("Invalid device %p or rdma cgroup %p\n", cg, device);
+ return;
+ }
+
+ rpool->resources[index].usage--;
+
+ /*
+ * A negative count (or overflow) is invalid,
+ * it indicates a bug in the rdma controller.
+ */
+ WARN_ON_ONCE(rpool->resources[index].usage < 0);
+ rpool->usage_sum--;
+ if (rpool->usage_sum == 0 &&
+ rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
+ /*
+ * No user of the rpool and all entries are set to max, so
+ * safe to delete this rpool.
+ */
+ free_cg_rpool_locked(rpool);
+ }
+}
+
+/**
+ * rdmacg_uncharge_hierarchy - hierarchically uncharge rdma resource count
+ * @cg: pointer to cg to uncharge and all parents in hierarchy
+ * @device: pointer to rdmacg device
+ * @stop_cg: while traversing hirerchy, when meet with stop_cg cgroup
+ * stop uncharging
+ * @index: index of the resource to uncharge in cg in given resource pool
+ */
+static void rdmacg_uncharge_hierarchy(struct rdma_cgroup *cg,
+ struct rdmacg_device *device,
+ struct rdma_cgroup *stop_cg,
+ enum rdmacg_resource_type index)
+{
+ struct rdma_cgroup *p;
+
+ mutex_lock(&rdmacg_mutex);
+
+ for (p = cg; p != stop_cg; p = parent_rdmacg(p))
+ uncharge_cg_locked(p, device, index);
+
+ mutex_unlock(&rdmacg_mutex);
+
+ css_put(&cg->css);
+}
+
+/**
+ * rdmacg_uncharge - hierarchically uncharge rdma resource count
+ * @cg: pointer to cg to uncharge and all parents in hierarchy
+ * @device: pointer to rdmacg device
+ * @index: index of the resource to uncharge in cgroup in given resource pool
+ */
+void rdmacg_uncharge(struct rdma_cgroup *cg,
+ struct rdmacg_device *device,
+ enum rdmacg_resource_type index)
+{
+ if (index >= RDMACG_RESOURCE_MAX)
+ return;
+
+ rdmacg_uncharge_hierarchy(cg, device, NULL, index);
+}
+EXPORT_SYMBOL(rdmacg_uncharge);
+
+/**
+ * rdmacg_try_charge - hierarchically try to charge the rdma resource
+ * @rdmacg: pointer to rdma cgroup which will own this resource
+ * @device: pointer to rdmacg device
+ * @index: index of the resource to charge in cgroup (resource pool)
+ *
+ * This function follows charging resource in hierarchical way.
+ * It will fail if the charge would cause the new value to exceed the
+ * hierarchical limit.
+ * Returns 0 if the charge succeeded, otherwise -EAGAIN, -ENOMEM or -EINVAL.
+ * Returns pointer to rdmacg for this resource when charging is successful.
+ *
+ * Charger needs to account resources on two criteria.
+ * (a) per cgroup & (b) per device resource usage.
+ * Per cgroup resource usage ensures that tasks of cgroup doesn't cross
+ * the configured limits. Per device provides granular configuration
+ * in multi device usage. It allocates resource pool in the hierarchy
+ * for each parent it come across for first resource. Later on resource
+ * pool will be available. Therefore it will be much faster thereon
+ * to charge/uncharge.
+ */
+int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
+ struct rdmacg_device *device,
+ enum rdmacg_resource_type index)
+{
+ struct rdma_cgroup *cg, *p;
+ struct rdmacg_resource_pool *rpool;
+ s64 new;
+ int ret = 0;
+
+ if (index >= RDMACG_RESOURCE_MAX)
+ return -EINVAL;
+
+ /*
+ * hold on to css, as cgroup can be removed but resource
+ * accounting happens on css.
+ */
+ cg = get_current_rdmacg();
+
+ mutex_lock(&rdmacg_mutex);
+ for (p = cg; p; p = parent_rdmacg(p)) {
+ rpool = get_cg_rpool_locked(p, device);
+ if (IS_ERR(rpool)) {
+ ret = PTR_ERR(rpool);
+ goto err;
+ } else {
+ new = rpool->resources[index].usage + 1;
+ if (new > rpool->resources[index].max) {
+ ret = -EAGAIN;
+ goto err;
+ } else {
+ rpool->resources[index].usage = new;
+ rpool->usage_sum++;
+ }
+ }
+ }
+ mutex_unlock(&rdmacg_mutex);
+
+ *rdmacg = cg;
+ return 0;
+
+err:
+ mutex_unlock(&rdmacg_mutex);
+ rdmacg_uncharge_hierarchy(cg, device, p, index);
+ return ret;
+}
+EXPORT_SYMBOL(rdmacg_try_charge);
+
+/**
+ * rdmacg_register_device - register rdmacg device to rdma controller.
+ * @device: pointer to rdmacg device whose resources need to be accounted.
+ *
+ * If IB stack wish a device to participate in rdma cgroup resource
+ * tracking, it must invoke this API to register with rdma cgroup before
+ * any user space application can start using the RDMA resources.
+ */
+void rdmacg_register_device(struct rdmacg_device *device)
+{
+ INIT_LIST_HEAD(&device->dev_node);
+ INIT_LIST_HEAD(&device->rpools);
+
+ mutex_lock(&rdmacg_mutex);
+ list_add_tail(&device->dev_node, &rdmacg_devices);
+ mutex_unlock(&rdmacg_mutex);
+}
+EXPORT_SYMBOL(rdmacg_register_device);
+
+/**
+ * rdmacg_unregister_device - unregister rdmacg device from rdma controller.
+ * @device: pointer to rdmacg device which was previously registered with rdma
+ * controller using rdmacg_register_device().
+ *
+ * IB stack must invoke this after all the resources of the IB device
+ * are destroyed and after ensuring that no more resources will be created
+ * when this API is invoked.
+ */
+void rdmacg_unregister_device(struct rdmacg_device *device)
+{
+ struct rdmacg_resource_pool *rpool, *tmp;
+
+ /*
+ * Synchronize with any active resource settings,
+ * usage query happening via configfs.
+ */
+ mutex_lock(&rdmacg_mutex);
+ list_del_init(&device->dev_node);
+
+ /*
+ * Now that this device is off the cgroup list, its safe to free
+ * all the rpool resources.
+ */
+ list_for_each_entry_safe(rpool, tmp, &device->rpools, dev_node)
+ free_cg_rpool_locked(rpool);
+
+ mutex_unlock(&rdmacg_mutex);
+}
+EXPORT_SYMBOL(rdmacg_unregister_device);
+
+static int parse_resource(char *c, int *intval)
+{
+ substring_t argstr;
+ char *name, *value = c;
+ size_t len;
+ int ret, i;
+
+ name = strsep(&value, "=");
+ if (!name || !value)
+ return -EINVAL;
+
+ i = match_string(rdmacg_resource_names, RDMACG_RESOURCE_MAX, name);
+ if (i < 0)
+ return i;
+
+ len = strlen(value);
+
+ argstr.from = value;
+ argstr.to = value + len;
+
+ ret = match_int(&argstr, intval);
+ if (ret >= 0) {
+ if (*intval < 0)
+ return -EINVAL;
+ return i;
+ }
+ if (strncmp(value, RDMACG_MAX_STR, len) == 0) {
+ *intval = S32_MAX;
+ return i;
+ }
+ return -EINVAL;
+}
+
+static int rdmacg_parse_limits(char *options,
+ int *new_limits, unsigned long *enables)
+{
+ char *c;
+ int err = -EINVAL;
+
+ /* parse resource options */
+ while ((c = strsep(&options, " ")) != NULL) {
+ int index, intval;
+
+ index = parse_resource(c, &intval);
+ if (index < 0)
+ goto err;
+
+ new_limits[index] = intval;
+ *enables |= BIT(index);
+ }
+ return 0;
+
+err:
+ return err;
+}
+
+static struct rdmacg_device *rdmacg_get_device_locked(const char *name)
+{
+ struct rdmacg_device *device;
+
+ lockdep_assert_held(&rdmacg_mutex);
+
+ list_for_each_entry(device, &rdmacg_devices, dev_node)
+ if (!strcmp(name, device->name))
+ return device;
+
+ return NULL;
+}
+
+static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct rdma_cgroup *cg = css_rdmacg(of_css(of));
+ const char *dev_name;
+ struct rdmacg_resource_pool *rpool;
+ struct rdmacg_device *device;
+ char *options = strstrip(buf);
+ int *new_limits;
+ unsigned long enables = 0;
+ int i = 0, ret = 0;
+
+ /* extract the device name first */
+ dev_name = strsep(&options, " ");
+ if (!dev_name) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ new_limits = kcalloc(RDMACG_RESOURCE_MAX, sizeof(int), GFP_KERNEL);
+ if (!new_limits) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ ret = rdmacg_parse_limits(options, new_limits, &enables);
+ if (ret)
+ goto parse_err;
+
+ /* acquire lock to synchronize with hot plug devices */
+ mutex_lock(&rdmacg_mutex);
+
+ device = rdmacg_get_device_locked(dev_name);
+ if (!device) {
+ ret = -ENODEV;
+ goto dev_err;
+ }
+
+ rpool = get_cg_rpool_locked(cg, device);
+ if (IS_ERR(rpool)) {
+ ret = PTR_ERR(rpool);
+ goto dev_err;
+ }
+
+ /* now set the new limits of the rpool */
+ for_each_set_bit(i, &enables, RDMACG_RESOURCE_MAX)
+ set_resource_limit(rpool, i, new_limits[i]);
+
+ if (rpool->usage_sum == 0 &&
+ rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
+ /*
+ * No user of the rpool and all entries are set to max, so
+ * safe to delete this rpool.
+ */
+ free_cg_rpool_locked(rpool);
+ }
+
+dev_err:
+ mutex_unlock(&rdmacg_mutex);
+
+parse_err:
+ kfree(new_limits);
+
+err:
+ return ret ?: nbytes;
+}
+
+static void print_rpool_values(struct seq_file *sf,
+ struct rdmacg_resource_pool *rpool)
+{
+ enum rdmacg_file_type sf_type;
+ int i;
+ u32 value;
+
+ sf_type = seq_cft(sf)->private;
+
+ for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
+ seq_puts(sf, rdmacg_resource_names[i]);
+ seq_putc(sf, '=');
+ if (sf_type == RDMACG_RESOURCE_TYPE_MAX) {
+ if (rpool)
+ value = rpool->resources[i].max;
+ else
+ value = S32_MAX;
+ } else {
+ if (rpool)
+ value = rpool->resources[i].usage;
+ else
+ value = 0;
+ }
+
+ if (value == S32_MAX)
+ seq_puts(sf, RDMACG_MAX_STR);
+ else
+ seq_printf(sf, "%d", value);
+ seq_putc(sf, ' ');
+ }
+}
+
+static int rdmacg_resource_read(struct seq_file *sf, void *v)
+{
+ struct rdmacg_device *device;
+ struct rdmacg_resource_pool *rpool;
+ struct rdma_cgroup *cg = css_rdmacg(seq_css(sf));
+
+ mutex_lock(&rdmacg_mutex);
+
+ list_for_each_entry(device, &rdmacg_devices, dev_node) {
+ seq_printf(sf, "%s ", device->name);
+
+ rpool = find_cg_rpool_locked(cg, device);
+ print_rpool_values(sf, rpool);
+
+ seq_putc(sf, '\n');
+ }
+
+ mutex_unlock(&rdmacg_mutex);
+ return 0;
+}
+
+static struct cftype rdmacg_files[] = {
+ {
+ .name = "max",
+ .write = rdmacg_resource_set_max,
+ .seq_show = rdmacg_resource_read,
+ .private = RDMACG_RESOURCE_TYPE_MAX,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ {
+ .name = "current",
+ .seq_show = rdmacg_resource_read,
+ .private = RDMACG_RESOURCE_TYPE_STAT,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ { } /* terminate */
+};
+
+static struct cgroup_subsys_state *
+rdmacg_css_alloc(struct cgroup_subsys_state *parent)
+{
+ struct rdma_cgroup *cg;
+
+ cg = kzalloc(sizeof(*cg), GFP_KERNEL);
+ if (!cg)
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&cg->rpools);
+ return &cg->css;
+}
+
+static void rdmacg_css_free(struct cgroup_subsys_state *css)
+{
+ struct rdma_cgroup *cg = css_rdmacg(css);
+
+ kfree(cg);
+}
+
+/**
+ * rdmacg_css_offline - cgroup css_offline callback
+ * @css: css of interest
+ *
+ * This function is called when @css is about to go away and responsible
+ * for shooting down all rdmacg associated with @css. As part of that it
+ * marks all the resource pool entries to max value, so that when resources are
+ * uncharged, associated resource pool can be freed as well.
+ */
+static void rdmacg_css_offline(struct cgroup_subsys_state *css)
+{
+ struct rdma_cgroup *cg = css_rdmacg(css);
+ struct rdmacg_resource_pool *rpool;
+
+ mutex_lock(&rdmacg_mutex);
+
+ list_for_each_entry(rpool, &cg->rpools, cg_node)
+ set_all_resource_max_limit(rpool);
+
+ mutex_unlock(&rdmacg_mutex);
+}
+
+struct cgroup_subsys rdma_cgrp_subsys = {
+ .css_alloc = rdmacg_css_alloc,
+ .css_free = rdmacg_css_free,
+ .css_offline = rdmacg_css_offline,
+ .legacy_cftypes = rdmacg_files,
+ .dfl_cftypes = rdmacg_files,
+};
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
new file mode 100644
index 000000000000..a198e40c799b
--- /dev/null
+++ b/kernel/cgroup/rstat.c
@@ -0,0 +1,759 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include "cgroup-internal.h"
+
+#include <linux/sched/cputime.h>
+
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
+
+#include <trace/events/cgroup.h>
+
+static DEFINE_SPINLOCK(rstat_base_lock);
+static DEFINE_PER_CPU(struct llist_head, rstat_backlog_list);
+
+static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);
+
+/*
+ * Determines whether a given css can participate in rstat.
+ * css's that are cgroup::self use rstat for base stats.
+ * Other css's associated with a subsystem use rstat only when
+ * they define the ss->css_rstat_flush callback.
+ */
+static inline bool css_uses_rstat(struct cgroup_subsys_state *css)
+{
+ return css_is_self(css) || css->ss->css_rstat_flush != NULL;
+}
+
+static struct css_rstat_cpu *css_rstat_cpu(
+ struct cgroup_subsys_state *css, int cpu)
+{
+ return per_cpu_ptr(css->rstat_cpu, cpu);
+}
+
+static struct cgroup_rstat_base_cpu *cgroup_rstat_base_cpu(
+ struct cgroup *cgrp, int cpu)
+{
+ return per_cpu_ptr(cgrp->rstat_base_cpu, cpu);
+}
+
+static spinlock_t *ss_rstat_lock(struct cgroup_subsys *ss)
+{
+ if (ss)
+ return &ss->rstat_ss_lock;
+
+ return &rstat_base_lock;
+}
+
+static inline struct llist_head *ss_lhead_cpu(struct cgroup_subsys *ss, int cpu)
+{
+ if (ss)
+ return per_cpu_ptr(ss->lhead, cpu);
+ return per_cpu_ptr(&rstat_backlog_list, cpu);
+}
+
+/**
+ * css_rstat_updated - keep track of updated rstat_cpu
+ * @css: target cgroup subsystem state
+ * @cpu: cpu on which rstat_cpu was updated
+ *
+ * Atomically inserts the css in the ss's llist for the given cpu. This is
+ * reentrant safe i.e. safe against softirq, hardirq and nmi. The ss's llist
+ * will be processed at the flush time to create the update tree.
+ *
+ * NOTE: if the user needs the guarantee that the updater either add itself in
+ * the lockless list or the concurrent flusher flushes its updated stats, a
+ * memory barrier is needed before the call to css_rstat_updated() i.e. a
+ * barrier after updating the per-cpu stats and before calling
+ * css_rstat_updated().
+ */
+__bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
+{
+ struct llist_head *lhead;
+ struct css_rstat_cpu *rstatc;
+ struct css_rstat_cpu __percpu *rstatc_pcpu;
+ struct llist_node *self;
+
+ /*
+ * Since bpf programs can call this function, prevent access to
+ * uninitialized rstat pointers.
+ */
+ if (!css_uses_rstat(css))
+ return;
+
+ lockdep_assert_preemption_disabled();
+
+ /*
+ * For archs withnot nmi safe cmpxchg or percpu ops support, ignore
+ * the requests from nmi context.
+ */
+ if ((!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) ||
+ !IS_ENABLED(CONFIG_ARCH_HAS_NMI_SAFE_THIS_CPU_OPS)) && in_nmi())
+ return;
+
+ rstatc = css_rstat_cpu(css, cpu);
+ /*
+ * If already on list return. This check is racy and smp_mb() is needed
+ * to pair it with the smp_mb() in css_process_update_tree() if the
+ * guarantee that the updated stats are visible to concurrent flusher is
+ * needed.
+ */
+ if (llist_on_list(&rstatc->lnode))
+ return;
+
+ /*
+ * This function can be renentered by irqs and nmis for the same cgroup
+ * and may try to insert the same per-cpu lnode into the llist. Note
+ * that llist_add() does not protect against such scenarios.
+ *
+ * To protect against such stacked contexts of irqs/nmis, we use the
+ * fact that lnode points to itself when not on a list and then use
+ * this_cpu_cmpxchg() to atomically set to NULL to select the winner
+ * which will call llist_add(). The losers can assume the insertion is
+ * successful and the winner will eventually add the per-cpu lnode to
+ * the llist.
+ */
+ self = &rstatc->lnode;
+ rstatc_pcpu = css->rstat_cpu;
+ if (this_cpu_cmpxchg(rstatc_pcpu->lnode.next, self, NULL) != self)
+ return;
+
+ lhead = ss_lhead_cpu(css->ss, cpu);
+ llist_add(&rstatc->lnode, lhead);
+}
+
+static void __css_process_update_tree(struct cgroup_subsys_state *css, int cpu)
+{
+ /* put @css and all ancestors on the corresponding updated lists */
+ while (true) {
+ struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu);
+ struct cgroup_subsys_state *parent = css->parent;
+ struct css_rstat_cpu *prstatc;
+
+ /*
+ * Both additions and removals are bottom-up. If a cgroup
+ * is already in the tree, all ancestors are.
+ */
+ if (rstatc->updated_next)
+ break;
+
+ /* Root has no parent to link it to, but mark it busy */
+ if (!parent) {
+ rstatc->updated_next = css;
+ break;
+ }
+
+ prstatc = css_rstat_cpu(parent, cpu);
+ rstatc->updated_next = prstatc->updated_children;
+ prstatc->updated_children = css;
+
+ css = parent;
+ }
+}
+
+static void css_process_update_tree(struct cgroup_subsys *ss, int cpu)
+{
+ struct llist_head *lhead = ss_lhead_cpu(ss, cpu);
+ struct llist_node *lnode;
+
+ while ((lnode = llist_del_first_init(lhead))) {
+ struct css_rstat_cpu *rstatc;
+
+ /*
+ * smp_mb() is needed here (more specifically in between
+ * init_llist_node() and per-cpu stats flushing) if the
+ * guarantee is required by a rstat user where etiher the
+ * updater should add itself on the lockless list or the
+ * flusher flush the stats updated by the updater who have
+ * observed that they are already on the list. The
+ * corresponding barrier pair for this one should be before
+ * css_rstat_updated() by the user.
+ *
+ * For now, there aren't any such user, so not adding the
+ * barrier here but if such a use-case arise, please add
+ * smp_mb() here.
+ */
+
+ rstatc = container_of(lnode, struct css_rstat_cpu, lnode);
+ __css_process_update_tree(rstatc->owner, cpu);
+ }
+}
+
+/**
+ * css_rstat_push_children - push children css's into the given list
+ * @head: current head of the list (= subtree root)
+ * @child: first child of the root
+ * @cpu: target cpu
+ * Return: A new singly linked list of css's to be flushed
+ *
+ * Iteratively traverse down the css_rstat_cpu updated tree level by
+ * level and push all the parents first before their next level children
+ * into a singly linked list via the rstat_flush_next pointer built from the
+ * tail backward like "pushing" css's into a stack. The root is pushed by
+ * the caller.
+ */
+static struct cgroup_subsys_state *css_rstat_push_children(
+ struct cgroup_subsys_state *head,
+ struct cgroup_subsys_state *child, int cpu)
+{
+ struct cgroup_subsys_state *cnext = child; /* Next head of child css level */
+ struct cgroup_subsys_state *ghead = NULL; /* Head of grandchild css level */
+ struct cgroup_subsys_state *parent, *grandchild;
+ struct css_rstat_cpu *crstatc;
+
+ child->rstat_flush_next = NULL;
+
+ /*
+ * The subsystem rstat lock must be held for the whole duration from
+ * here as the rstat_flush_next list is being constructed to when
+ * it is consumed later in css_rstat_flush().
+ */
+ lockdep_assert_held(ss_rstat_lock(head->ss));
+
+ /*
+ * Notation: -> updated_next pointer
+ * => rstat_flush_next pointer
+ *
+ * Assuming the following sample updated_children lists:
+ * P: C1 -> C2 -> P
+ * C1: G11 -> G12 -> C1
+ * C2: G21 -> G22 -> C2
+ *
+ * After 1st iteration:
+ * head => C2 => C1 => NULL
+ * ghead => G21 => G11 => NULL
+ *
+ * After 2nd iteration:
+ * head => G12 => G11 => G22 => G21 => C2 => C1 => NULL
+ */
+next_level:
+ while (cnext) {
+ child = cnext;
+ cnext = child->rstat_flush_next;
+ parent = child->parent;
+
+ /* updated_next is parent cgroup terminated if !NULL */
+ while (child != parent) {
+ child->rstat_flush_next = head;
+ head = child;
+ crstatc = css_rstat_cpu(child, cpu);
+ grandchild = crstatc->updated_children;
+ if (grandchild != child) {
+ /* Push the grand child to the next level */
+ crstatc->updated_children = child;
+ grandchild->rstat_flush_next = ghead;
+ ghead = grandchild;
+ }
+ child = crstatc->updated_next;
+ crstatc->updated_next = NULL;
+ }
+ }
+
+ if (ghead) {
+ cnext = ghead;
+ ghead = NULL;
+ goto next_level;
+ }
+ return head;
+}
+
+/**
+ * css_rstat_updated_list - build a list of updated css's to be flushed
+ * @root: root of the css subtree to traverse
+ * @cpu: target cpu
+ * Return: A singly linked list of css's to be flushed
+ *
+ * Walks the updated rstat_cpu tree on @cpu from @root. During traversal,
+ * each returned css is unlinked from the updated tree.
+ *
+ * The only ordering guarantee is that, for a parent and a child pair
+ * covered by a given traversal, the child is before its parent in
+ * the list.
+ *
+ * Note that updated_children is self terminated and points to a list of
+ * child css's if not empty. Whereas updated_next is like a sibling link
+ * within the children list and terminated by the parent css. An exception
+ * here is the css root whose updated_next can be self terminated.
+ */
+static struct cgroup_subsys_state *css_rstat_updated_list(
+ struct cgroup_subsys_state *root, int cpu)
+{
+ struct css_rstat_cpu *rstatc = css_rstat_cpu(root, cpu);
+ struct cgroup_subsys_state *head = NULL, *parent, *child;
+
+ css_process_update_tree(root->ss, cpu);
+
+ /* Return NULL if this subtree is not on-list */
+ if (!rstatc->updated_next)
+ return NULL;
+
+ /*
+ * Unlink @root from its parent. As the updated_children list is
+ * singly linked, we have to walk it to find the removal point.
+ */
+ parent = root->parent;
+ if (parent) {
+ struct css_rstat_cpu *prstatc;
+ struct cgroup_subsys_state **nextp;
+
+ prstatc = css_rstat_cpu(parent, cpu);
+ nextp = &prstatc->updated_children;
+ while (*nextp != root) {
+ struct css_rstat_cpu *nrstatc;
+
+ nrstatc = css_rstat_cpu(*nextp, cpu);
+ WARN_ON_ONCE(*nextp == parent);
+ nextp = &nrstatc->updated_next;
+ }
+ *nextp = rstatc->updated_next;
+ }
+
+ rstatc->updated_next = NULL;
+
+ /* Push @root to the list first before pushing the children */
+ head = root;
+ root->rstat_flush_next = NULL;
+ child = rstatc->updated_children;
+ rstatc->updated_children = root;
+ if (child != root)
+ head = css_rstat_push_children(head, child, cpu);
+
+ return head;
+}
+
+/*
+ * A hook for bpf stat collectors to attach to and flush their stats.
+ * Together with providing bpf kfuncs for css_rstat_updated() and
+ * css_rstat_flush(), this enables a complete workflow where bpf progs that
+ * collect cgroup stats can integrate with rstat for efficient flushing.
+ *
+ * A static noinline declaration here could cause the compiler to optimize away
+ * the function. A global noinline declaration will keep the definition, but may
+ * optimize away the callsite. Therefore, __weak is needed to ensure that the
+ * call is still emitted, by telling the compiler that we don't know what the
+ * function might eventually be.
+ */
+
+__bpf_hook_start();
+
+__weak noinline void bpf_rstat_flush(struct cgroup *cgrp,
+ struct cgroup *parent, int cpu)
+{
+}
+
+__bpf_hook_end();
+
+/*
+ * Helper functions for locking.
+ *
+ * This makes it easier to diagnose locking issues and contention in
+ * production environments. The parameter @cpu_in_loop indicate lock
+ * was released and re-taken when collection data from the CPUs. The
+ * value -1 is used when obtaining the main lock else this is the CPU
+ * number processed last.
+ */
+static inline void __css_rstat_lock(struct cgroup_subsys_state *css,
+ int cpu_in_loop)
+ __acquires(ss_rstat_lock(css->ss))
+{
+ struct cgroup *cgrp = css->cgroup;
+ spinlock_t *lock;
+ bool contended;
+
+ lock = ss_rstat_lock(css->ss);
+ contended = !spin_trylock_irq(lock);
+ if (contended) {
+ trace_cgroup_rstat_lock_contended(cgrp, cpu_in_loop, contended);
+ spin_lock_irq(lock);
+ }
+ trace_cgroup_rstat_locked(cgrp, cpu_in_loop, contended);
+}
+
+static inline void __css_rstat_unlock(struct cgroup_subsys_state *css,
+ int cpu_in_loop)
+ __releases(ss_rstat_lock(css->ss))
+{
+ struct cgroup *cgrp = css->cgroup;
+ spinlock_t *lock;
+
+ lock = ss_rstat_lock(css->ss);
+ trace_cgroup_rstat_unlock(cgrp, cpu_in_loop, false);
+ spin_unlock_irq(lock);
+}
+
+/**
+ * css_rstat_flush - flush stats in @css's rstat subtree
+ * @css: target cgroup subsystem state
+ *
+ * Collect all per-cpu stats in @css's subtree into the global counters
+ * and propagate them upwards. After this function returns, all rstat
+ * nodes in the subtree have up-to-date ->stat.
+ *
+ * This also gets all rstat nodes in the subtree including @css off the
+ * ->updated_children lists.
+ *
+ * This function may block.
+ */
+__bpf_kfunc void css_rstat_flush(struct cgroup_subsys_state *css)
+{
+ int cpu;
+ bool is_self = css_is_self(css);
+
+ /*
+ * Since bpf programs can call this function, prevent access to
+ * uninitialized rstat pointers.
+ */
+ if (!css_uses_rstat(css))
+ return;
+
+ might_sleep();
+ for_each_possible_cpu(cpu) {
+ struct cgroup_subsys_state *pos;
+
+ /* Reacquire for each CPU to avoid disabling IRQs too long */
+ __css_rstat_lock(css, cpu);
+ pos = css_rstat_updated_list(css, cpu);
+ for (; pos; pos = pos->rstat_flush_next) {
+ if (is_self) {
+ cgroup_base_stat_flush(pos->cgroup, cpu);
+ bpf_rstat_flush(pos->cgroup,
+ cgroup_parent(pos->cgroup), cpu);
+ } else
+ pos->ss->css_rstat_flush(pos, cpu);
+ }
+ __css_rstat_unlock(css, cpu);
+ if (!cond_resched())
+ cpu_relax();
+ }
+}
+
+int css_rstat_init(struct cgroup_subsys_state *css)
+{
+ struct cgroup *cgrp = css->cgroup;
+ int cpu;
+ bool is_self = css_is_self(css);
+
+ if (is_self) {
+ /* the root cgrp has rstat_base_cpu preallocated */
+ if (!cgrp->rstat_base_cpu) {
+ cgrp->rstat_base_cpu = alloc_percpu(struct cgroup_rstat_base_cpu);
+ if (!cgrp->rstat_base_cpu)
+ return -ENOMEM;
+ }
+ } else if (css->ss->css_rstat_flush == NULL)
+ return 0;
+
+ /* the root cgrp's self css has rstat_cpu preallocated */
+ if (!css->rstat_cpu) {
+ css->rstat_cpu = alloc_percpu(struct css_rstat_cpu);
+ if (!css->rstat_cpu) {
+ if (is_self)
+ free_percpu(cgrp->rstat_base_cpu);
+
+ return -ENOMEM;
+ }
+ }
+
+ /* ->updated_children list is self terminated */
+ for_each_possible_cpu(cpu) {
+ struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu);
+
+ rstatc->owner = rstatc->updated_children = css;
+ init_llist_node(&rstatc->lnode);
+
+ if (is_self) {
+ struct cgroup_rstat_base_cpu *rstatbc;
+
+ rstatbc = cgroup_rstat_base_cpu(cgrp, cpu);
+ u64_stats_init(&rstatbc->bsync);
+ }
+ }
+
+ return 0;
+}
+
+void css_rstat_exit(struct cgroup_subsys_state *css)
+{
+ int cpu;
+
+ if (!css_uses_rstat(css))
+ return;
+
+ if (!css->rstat_cpu)
+ return;
+
+ css_rstat_flush(css);
+
+ /* sanity check */
+ for_each_possible_cpu(cpu) {
+ struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu);
+
+ if (WARN_ON_ONCE(rstatc->updated_children != css) ||
+ WARN_ON_ONCE(rstatc->updated_next))
+ return;
+ }
+
+ if (css_is_self(css)) {
+ struct cgroup *cgrp = css->cgroup;
+
+ free_percpu(cgrp->rstat_base_cpu);
+ cgrp->rstat_base_cpu = NULL;
+ }
+
+ free_percpu(css->rstat_cpu);
+ css->rstat_cpu = NULL;
+}
+
+/**
+ * ss_rstat_init - subsystem-specific rstat initialization
+ * @ss: target subsystem
+ *
+ * If @ss is NULL, the static locks associated with the base stats
+ * are initialized. If @ss is non-NULL, the subsystem-specific locks
+ * are initialized.
+ */
+int __init ss_rstat_init(struct cgroup_subsys *ss)
+{
+ int cpu;
+
+ if (ss) {
+ ss->lhead = alloc_percpu(struct llist_head);
+ if (!ss->lhead)
+ return -ENOMEM;
+ }
+
+ spin_lock_init(ss_rstat_lock(ss));
+ for_each_possible_cpu(cpu)
+ init_llist_head(ss_lhead_cpu(ss, cpu));
+
+ return 0;
+}
+
+/*
+ * Functions for cgroup basic resource statistics implemented on top of
+ * rstat.
+ */
+static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
+ struct cgroup_base_stat *src_bstat)
+{
+ dst_bstat->cputime.utime += src_bstat->cputime.utime;
+ dst_bstat->cputime.stime += src_bstat->cputime.stime;
+ dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime;
+#ifdef CONFIG_SCHED_CORE
+ dst_bstat->forceidle_sum += src_bstat->forceidle_sum;
+#endif
+ dst_bstat->ntime += src_bstat->ntime;
+}
+
+static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
+ struct cgroup_base_stat *src_bstat)
+{
+ dst_bstat->cputime.utime -= src_bstat->cputime.utime;
+ dst_bstat->cputime.stime -= src_bstat->cputime.stime;
+ dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime;
+#ifdef CONFIG_SCHED_CORE
+ dst_bstat->forceidle_sum -= src_bstat->forceidle_sum;
+#endif
+ dst_bstat->ntime -= src_bstat->ntime;
+}
+
+static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
+{
+ struct cgroup_rstat_base_cpu *rstatbc = cgroup_rstat_base_cpu(cgrp, cpu);
+ struct cgroup *parent = cgroup_parent(cgrp);
+ struct cgroup_rstat_base_cpu *prstatbc;
+ struct cgroup_base_stat delta;
+ unsigned seq;
+
+ /* Root-level stats are sourced from system-wide CPU stats */
+ if (!parent)
+ return;
+
+ /* fetch the current per-cpu values */
+ do {
+ seq = __u64_stats_fetch_begin(&rstatbc->bsync);
+ delta = rstatbc->bstat;
+ } while (__u64_stats_fetch_retry(&rstatbc->bsync, seq));
+
+ /* propagate per-cpu delta to cgroup and per-cpu global statistics */
+ cgroup_base_stat_sub(&delta, &rstatbc->last_bstat);
+ cgroup_base_stat_add(&cgrp->bstat, &delta);
+ cgroup_base_stat_add(&rstatbc->last_bstat, &delta);
+ cgroup_base_stat_add(&rstatbc->subtree_bstat, &delta);
+
+ /* propagate cgroup and per-cpu global delta to parent (unless that's root) */
+ if (cgroup_parent(parent)) {
+ delta = cgrp->bstat;
+ cgroup_base_stat_sub(&delta, &cgrp->last_bstat);
+ cgroup_base_stat_add(&parent->bstat, &delta);
+ cgroup_base_stat_add(&cgrp->last_bstat, &delta);
+
+ delta = rstatbc->subtree_bstat;
+ prstatbc = cgroup_rstat_base_cpu(parent, cpu);
+ cgroup_base_stat_sub(&delta, &rstatbc->last_subtree_bstat);
+ cgroup_base_stat_add(&prstatbc->subtree_bstat, &delta);
+ cgroup_base_stat_add(&rstatbc->last_subtree_bstat, &delta);
+ }
+}
+
+static struct cgroup_rstat_base_cpu *
+cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp, unsigned long *flags)
+{
+ struct cgroup_rstat_base_cpu *rstatbc;
+
+ rstatbc = get_cpu_ptr(cgrp->rstat_base_cpu);
+ *flags = u64_stats_update_begin_irqsave(&rstatbc->bsync);
+ return rstatbc;
+}
+
+static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
+ struct cgroup_rstat_base_cpu *rstatbc,
+ unsigned long flags)
+{
+ u64_stats_update_end_irqrestore(&rstatbc->bsync, flags);
+ css_rstat_updated(&cgrp->self, smp_processor_id());
+ put_cpu_ptr(rstatbc);
+}
+
+void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
+{
+ struct cgroup_rstat_base_cpu *rstatbc;
+ unsigned long flags;
+
+ rstatbc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
+ rstatbc->bstat.cputime.sum_exec_runtime += delta_exec;
+ cgroup_base_stat_cputime_account_end(cgrp, rstatbc, flags);
+}
+
+void __cgroup_account_cputime_field(struct cgroup *cgrp,
+ enum cpu_usage_stat index, u64 delta_exec)
+{
+ struct cgroup_rstat_base_cpu *rstatbc;
+ unsigned long flags;
+
+ rstatbc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
+
+ switch (index) {
+ case CPUTIME_NICE:
+ rstatbc->bstat.ntime += delta_exec;
+ fallthrough;
+ case CPUTIME_USER:
+ rstatbc->bstat.cputime.utime += delta_exec;
+ break;
+ case CPUTIME_SYSTEM:
+ case CPUTIME_IRQ:
+ case CPUTIME_SOFTIRQ:
+ rstatbc->bstat.cputime.stime += delta_exec;
+ break;
+#ifdef CONFIG_SCHED_CORE
+ case CPUTIME_FORCEIDLE:
+ rstatbc->bstat.forceidle_sum += delta_exec;
+ break;
+#endif
+ default:
+ break;
+ }
+
+ cgroup_base_stat_cputime_account_end(cgrp, rstatbc, flags);
+}
+
+/*
+ * compute the cputime for the root cgroup by getting the per cpu data
+ * at a global level, then categorizing the fields in a manner consistent
+ * with how it is done by __cgroup_account_cputime_field for each bit of
+ * cpu time attributed to a cgroup.
+ */
+static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
+{
+ struct task_cputime *cputime = &bstat->cputime;
+ int i;
+
+ memset(bstat, 0, sizeof(*bstat));
+ for_each_possible_cpu(i) {
+ struct kernel_cpustat kcpustat;
+ u64 *cpustat = kcpustat.cpustat;
+ u64 user = 0;
+ u64 sys = 0;
+
+ kcpustat_cpu_fetch(&kcpustat, i);
+
+ user += cpustat[CPUTIME_USER];
+ user += cpustat[CPUTIME_NICE];
+ cputime->utime += user;
+
+ sys += cpustat[CPUTIME_SYSTEM];
+ sys += cpustat[CPUTIME_IRQ];
+ sys += cpustat[CPUTIME_SOFTIRQ];
+ cputime->stime += sys;
+
+ cputime->sum_exec_runtime += user;
+ cputime->sum_exec_runtime += sys;
+
+#ifdef CONFIG_SCHED_CORE
+ bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE];
+#endif
+ bstat->ntime += cpustat[CPUTIME_NICE];
+ }
+}
+
+
+static void cgroup_force_idle_show(struct seq_file *seq, struct cgroup_base_stat *bstat)
+{
+#ifdef CONFIG_SCHED_CORE
+ u64 forceidle_time = bstat->forceidle_sum;
+
+ do_div(forceidle_time, NSEC_PER_USEC);
+ seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time);
+#endif
+}
+
+void cgroup_base_stat_cputime_show(struct seq_file *seq)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ struct cgroup_base_stat bstat;
+
+ if (cgroup_parent(cgrp)) {
+ css_rstat_flush(&cgrp->self);
+ __css_rstat_lock(&cgrp->self, -1);
+ bstat = cgrp->bstat;
+ cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
+ &bstat.cputime.utime, &bstat.cputime.stime);
+ __css_rstat_unlock(&cgrp->self, -1);
+ } else {
+ root_cgroup_cputime(&bstat);
+ }
+
+ do_div(bstat.cputime.sum_exec_runtime, NSEC_PER_USEC);
+ do_div(bstat.cputime.utime, NSEC_PER_USEC);
+ do_div(bstat.cputime.stime, NSEC_PER_USEC);
+ do_div(bstat.ntime, NSEC_PER_USEC);
+
+ seq_printf(seq, "usage_usec %llu\n"
+ "user_usec %llu\n"
+ "system_usec %llu\n"
+ "nice_usec %llu\n",
+ bstat.cputime.sum_exec_runtime,
+ bstat.cputime.utime,
+ bstat.cputime.stime,
+ bstat.ntime);
+
+ cgroup_force_idle_show(seq, &bstat);
+}
+
+/* Add bpf kfuncs for css_rstat_updated() and css_rstat_flush() */
+BTF_KFUNCS_START(bpf_rstat_kfunc_ids)
+BTF_ID_FLAGS(func, css_rstat_updated)
+BTF_ID_FLAGS(func, css_rstat_flush, KF_SLEEPABLE)
+BTF_KFUNCS_END(bpf_rstat_kfunc_ids)
+
+static const struct btf_kfunc_id_set bpf_rstat_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &bpf_rstat_kfunc_ids,
+};
+
+static int __init bpf_rstat_kfunc_init(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
+ &bpf_rstat_kfunc_set);
+}
+late_initcall(bpf_rstat_kfunc_init);
diff --git a/kernel/compat.c b/kernel/compat.c
index 333d364be29d..fb50f29d9b36 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/kernel/compat.c
*
@@ -5,10 +6,6 @@
* on 64 bit kernels.
*
* Copyright (C) 2002-2003 Stephen Rothwell, IBM Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/linkage.h>
@@ -20,7 +17,6 @@
#include <linux/syscalls.h>
#include <linux/unistd.h>
#include <linux/security.h>
-#include <linux/timex.h>
#include <linux/export.h>
#include <linux/migrate.h>
#include <linux/posix-timers.h>
@@ -28,364 +24,7 @@
#include <linux/ptrace.h>
#include <linux/gfp.h>
-#include <asm/uaccess.h>
-
-static int compat_get_timex(struct timex *txc, struct compat_timex __user *utp)
-{
- memset(txc, 0, sizeof(struct timex));
-
- if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) ||
- __get_user(txc->modes, &utp->modes) ||
- __get_user(txc->offset, &utp->offset) ||
- __get_user(txc->freq, &utp->freq) ||
- __get_user(txc->maxerror, &utp->maxerror) ||
- __get_user(txc->esterror, &utp->esterror) ||
- __get_user(txc->status, &utp->status) ||
- __get_user(txc->constant, &utp->constant) ||
- __get_user(txc->precision, &utp->precision) ||
- __get_user(txc->tolerance, &utp->tolerance) ||
- __get_user(txc->time.tv_sec, &utp->time.tv_sec) ||
- __get_user(txc->time.tv_usec, &utp->time.tv_usec) ||
- __get_user(txc->tick, &utp->tick) ||
- __get_user(txc->ppsfreq, &utp->ppsfreq) ||
- __get_user(txc->jitter, &utp->jitter) ||
- __get_user(txc->shift, &utp->shift) ||
- __get_user(txc->stabil, &utp->stabil) ||
- __get_user(txc->jitcnt, &utp->jitcnt) ||
- __get_user(txc->calcnt, &utp->calcnt) ||
- __get_user(txc->errcnt, &utp->errcnt) ||
- __get_user(txc->stbcnt, &utp->stbcnt))
- return -EFAULT;
-
- return 0;
-}
-
-static int compat_put_timex(struct compat_timex __user *utp, struct timex *txc)
-{
- if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) ||
- __put_user(txc->modes, &utp->modes) ||
- __put_user(txc->offset, &utp->offset) ||
- __put_user(txc->freq, &utp->freq) ||
- __put_user(txc->maxerror, &utp->maxerror) ||
- __put_user(txc->esterror, &utp->esterror) ||
- __put_user(txc->status, &utp->status) ||
- __put_user(txc->constant, &utp->constant) ||
- __put_user(txc->precision, &utp->precision) ||
- __put_user(txc->tolerance, &utp->tolerance) ||
- __put_user(txc->time.tv_sec, &utp->time.tv_sec) ||
- __put_user(txc->time.tv_usec, &utp->time.tv_usec) ||
- __put_user(txc->tick, &utp->tick) ||
- __put_user(txc->ppsfreq, &utp->ppsfreq) ||
- __put_user(txc->jitter, &utp->jitter) ||
- __put_user(txc->shift, &utp->shift) ||
- __put_user(txc->stabil, &utp->stabil) ||
- __put_user(txc->jitcnt, &utp->jitcnt) ||
- __put_user(txc->calcnt, &utp->calcnt) ||
- __put_user(txc->errcnt, &utp->errcnt) ||
- __put_user(txc->stbcnt, &utp->stbcnt) ||
- __put_user(txc->tai, &utp->tai))
- return -EFAULT;
- return 0;
-}
-
-COMPAT_SYSCALL_DEFINE2(gettimeofday, struct compat_timeval __user *, tv,
- struct timezone __user *, tz)
-{
- if (tv) {
- struct timeval ktv;
- do_gettimeofday(&ktv);
- if (compat_put_timeval(&ktv, tv))
- return -EFAULT;
- }
- if (tz) {
- if (copy_to_user(tz, &sys_tz, sizeof(sys_tz)))
- return -EFAULT;
- }
-
- return 0;
-}
-
-COMPAT_SYSCALL_DEFINE2(settimeofday, struct compat_timeval __user *, tv,
- struct timezone __user *, tz)
-{
- struct timeval user_tv;
- struct timespec new_ts;
- struct timezone new_tz;
-
- if (tv) {
- if (compat_get_timeval(&user_tv, tv))
- return -EFAULT;
- new_ts.tv_sec = user_tv.tv_sec;
- new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC;
- }
- if (tz) {
- if (copy_from_user(&new_tz, tz, sizeof(*tz)))
- return -EFAULT;
- }
-
- return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
-}
-
-static int __compat_get_timeval(struct timeval *tv, const struct compat_timeval __user *ctv)
-{
- return (!access_ok(VERIFY_READ, ctv, sizeof(*ctv)) ||
- __get_user(tv->tv_sec, &ctv->tv_sec) ||
- __get_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0;
-}
-
-static int __compat_put_timeval(const struct timeval *tv, struct compat_timeval __user *ctv)
-{
- return (!access_ok(VERIFY_WRITE, ctv, sizeof(*ctv)) ||
- __put_user(tv->tv_sec, &ctv->tv_sec) ||
- __put_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0;
-}
-
-static int __compat_get_timespec(struct timespec *ts, const struct compat_timespec __user *cts)
-{
- return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) ||
- __get_user(ts->tv_sec, &cts->tv_sec) ||
- __get_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
-}
-
-static int __compat_put_timespec(const struct timespec *ts, struct compat_timespec __user *cts)
-{
- return (!access_ok(VERIFY_WRITE, cts, sizeof(*cts)) ||
- __put_user(ts->tv_sec, &cts->tv_sec) ||
- __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
-}
-
-int compat_get_timeval(struct timeval *tv, const void __user *utv)
-{
- if (COMPAT_USE_64BIT_TIME)
- return copy_from_user(tv, utv, sizeof(*tv)) ? -EFAULT : 0;
- else
- return __compat_get_timeval(tv, utv);
-}
-EXPORT_SYMBOL_GPL(compat_get_timeval);
-
-int compat_put_timeval(const struct timeval *tv, void __user *utv)
-{
- if (COMPAT_USE_64BIT_TIME)
- return copy_to_user(utv, tv, sizeof(*tv)) ? -EFAULT : 0;
- else
- return __compat_put_timeval(tv, utv);
-}
-EXPORT_SYMBOL_GPL(compat_put_timeval);
-
-int compat_get_timespec(struct timespec *ts, const void __user *uts)
-{
- if (COMPAT_USE_64BIT_TIME)
- return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0;
- else
- return __compat_get_timespec(ts, uts);
-}
-EXPORT_SYMBOL_GPL(compat_get_timespec);
-
-int compat_put_timespec(const struct timespec *ts, void __user *uts)
-{
- if (COMPAT_USE_64BIT_TIME)
- return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0;
- else
- return __compat_put_timespec(ts, uts);
-}
-EXPORT_SYMBOL_GPL(compat_put_timespec);
-
-int compat_convert_timespec(struct timespec __user **kts,
- const void __user *cts)
-{
- struct timespec ts;
- struct timespec __user *uts;
-
- if (!cts || COMPAT_USE_64BIT_TIME) {
- *kts = (struct timespec __user *)cts;
- return 0;
- }
-
- uts = compat_alloc_user_space(sizeof(ts));
- if (!uts)
- return -EFAULT;
- if (compat_get_timespec(&ts, cts))
- return -EFAULT;
- if (copy_to_user(uts, &ts, sizeof(ts)))
- return -EFAULT;
-
- *kts = uts;
- return 0;
-}
-
-static long compat_nanosleep_restart(struct restart_block *restart)
-{
- struct compat_timespec __user *rmtp;
- struct timespec rmt;
- mm_segment_t oldfs;
- long ret;
-
- restart->nanosleep.rmtp = (struct timespec __user *) &rmt;
- oldfs = get_fs();
- set_fs(KERNEL_DS);
- ret = hrtimer_nanosleep_restart(restart);
- set_fs(oldfs);
-
- if (ret == -ERESTART_RESTARTBLOCK) {
- rmtp = restart->nanosleep.compat_rmtp;
-
- if (rmtp && compat_put_timespec(&rmt, rmtp))
- return -EFAULT;
- }
-
- return ret;
-}
-
-COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp,
- struct compat_timespec __user *, rmtp)
-{
- struct timespec tu, rmt;
- mm_segment_t oldfs;
- long ret;
-
- if (compat_get_timespec(&tu, rqtp))
- return -EFAULT;
-
- if (!timespec_valid(&tu))
- return -EINVAL;
-
- oldfs = get_fs();
- set_fs(KERNEL_DS);
- ret = hrtimer_nanosleep(&tu,
- rmtp ? (struct timespec __user *)&rmt : NULL,
- HRTIMER_MODE_REL, CLOCK_MONOTONIC);
- set_fs(oldfs);
-
- /*
- * hrtimer_nanosleep() can only return 0 or
- * -ERESTART_RESTARTBLOCK here because:
- *
- * - we call it with HRTIMER_MODE_REL and therefor exclude the
- * -ERESTARTNOHAND return path.
- *
- * - we supply the rmtp argument from the task stack (due to
- * the necessary compat conversion. So the update cannot
- * fail, which excludes the -EFAULT return path as well. If
- * it fails nevertheless we have a bigger problem and wont
- * reach this place anymore.
- *
- * - if the return value is 0, we do not have to update rmtp
- * because there is no remaining time.
- *
- * We check for -ERESTART_RESTARTBLOCK nevertheless if the
- * core implementation decides to return random nonsense.
- */
- if (ret == -ERESTART_RESTARTBLOCK) {
- struct restart_block *restart = &current->restart_block;
-
- restart->fn = compat_nanosleep_restart;
- restart->nanosleep.compat_rmtp = rmtp;
-
- if (rmtp && compat_put_timespec(&rmt, rmtp))
- return -EFAULT;
- }
- return ret;
-}
-
-static inline long get_compat_itimerval(struct itimerval *o,
- struct compat_itimerval __user *i)
-{
- return (!access_ok(VERIFY_READ, i, sizeof(*i)) ||
- (__get_user(o->it_interval.tv_sec, &i->it_interval.tv_sec) |
- __get_user(o->it_interval.tv_usec, &i->it_interval.tv_usec) |
- __get_user(o->it_value.tv_sec, &i->it_value.tv_sec) |
- __get_user(o->it_value.tv_usec, &i->it_value.tv_usec)));
-}
-
-static inline long put_compat_itimerval(struct compat_itimerval __user *o,
- struct itimerval *i)
-{
- return (!access_ok(VERIFY_WRITE, o, sizeof(*o)) ||
- (__put_user(i->it_interval.tv_sec, &o->it_interval.tv_sec) |
- __put_user(i->it_interval.tv_usec, &o->it_interval.tv_usec) |
- __put_user(i->it_value.tv_sec, &o->it_value.tv_sec) |
- __put_user(i->it_value.tv_usec, &o->it_value.tv_usec)));
-}
-
-COMPAT_SYSCALL_DEFINE2(getitimer, int, which,
- struct compat_itimerval __user *, it)
-{
- struct itimerval kit;
- int error;
-
- error = do_getitimer(which, &kit);
- if (!error && put_compat_itimerval(it, &kit))
- error = -EFAULT;
- return error;
-}
-
-COMPAT_SYSCALL_DEFINE3(setitimer, int, which,
- struct compat_itimerval __user *, in,
- struct compat_itimerval __user *, out)
-{
- struct itimerval kin, kout;
- int error;
-
- if (in) {
- if (get_compat_itimerval(&kin, in))
- return -EFAULT;
- } else
- memset(&kin, 0, sizeof(kin));
-
- error = do_setitimer(which, &kin, out ? &kout : NULL);
- if (error || !out)
- return error;
- if (put_compat_itimerval(out, &kout))
- return -EFAULT;
- return 0;
-}
-
-static compat_clock_t clock_t_to_compat_clock_t(clock_t x)
-{
- return compat_jiffies_to_clock_t(clock_t_to_jiffies(x));
-}
-
-COMPAT_SYSCALL_DEFINE1(times, struct compat_tms __user *, tbuf)
-{
- if (tbuf) {
- struct tms tms;
- struct compat_tms tmp;
-
- do_sys_times(&tms);
- /* Convert our struct tms to the compat version. */
- tmp.tms_utime = clock_t_to_compat_clock_t(tms.tms_utime);
- tmp.tms_stime = clock_t_to_compat_clock_t(tms.tms_stime);
- tmp.tms_cutime = clock_t_to_compat_clock_t(tms.tms_cutime);
- tmp.tms_cstime = clock_t_to_compat_clock_t(tms.tms_cstime);
- if (copy_to_user(tbuf, &tmp, sizeof(tmp)))
- return -EFAULT;
- }
- force_successful_syscall_return();
- return compat_jiffies_to_clock_t(jiffies);
-}
-
-#ifdef __ARCH_WANT_SYS_SIGPENDING
-
-/*
- * Assumption: old_sigset_t and compat_old_sigset_t are both
- * types that can be passed to put_user()/get_user().
- */
-
-COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set)
-{
- old_sigset_t s;
- long ret;
- mm_segment_t old_fs = get_fs();
-
- set_fs(KERNEL_DS);
- ret = sys_sigpending((old_sigset_t __user *) &s);
- set_fs(old_fs);
- if (ret == 0)
- ret = put_user(s, set);
- return ret;
-}
-
-#endif
+#include <linux/uaccess.h>
#ifdef __ARCH_WANT_SYS_SIGPROCMASK
@@ -441,164 +80,33 @@ COMPAT_SYSCALL_DEFINE3(sigprocmask, int, how,
#endif
-COMPAT_SYSCALL_DEFINE2(setrlimit, unsigned int, resource,
- struct compat_rlimit __user *, rlim)
-{
- struct rlimit r;
-
- if (!access_ok(VERIFY_READ, rlim, sizeof(*rlim)) ||
- __get_user(r.rlim_cur, &rlim->rlim_cur) ||
- __get_user(r.rlim_max, &rlim->rlim_max))
- return -EFAULT;
-
- if (r.rlim_cur == COMPAT_RLIM_INFINITY)
- r.rlim_cur = RLIM_INFINITY;
- if (r.rlim_max == COMPAT_RLIM_INFINITY)
- r.rlim_max = RLIM_INFINITY;
- return do_prlimit(current, resource, &r, NULL);
-}
-
-#ifdef COMPAT_RLIM_OLD_INFINITY
-
-COMPAT_SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
- struct compat_rlimit __user *, rlim)
-{
- struct rlimit r;
- int ret;
- mm_segment_t old_fs = get_fs();
-
- set_fs(KERNEL_DS);
- ret = sys_old_getrlimit(resource, (struct rlimit __user *)&r);
- set_fs(old_fs);
-
- if (!ret) {
- if (r.rlim_cur > COMPAT_RLIM_OLD_INFINITY)
- r.rlim_cur = COMPAT_RLIM_INFINITY;
- if (r.rlim_max > COMPAT_RLIM_OLD_INFINITY)
- r.rlim_max = COMPAT_RLIM_INFINITY;
-
- if (!access_ok(VERIFY_WRITE, rlim, sizeof(*rlim)) ||
- __put_user(r.rlim_cur, &rlim->rlim_cur) ||
- __put_user(r.rlim_max, &rlim->rlim_max))
- return -EFAULT;
- }
- return ret;
-}
-
-#endif
-
-COMPAT_SYSCALL_DEFINE2(getrlimit, unsigned int, resource,
- struct compat_rlimit __user *, rlim)
-{
- struct rlimit r;
- int ret;
-
- ret = do_prlimit(current, resource, NULL, &r);
- if (!ret) {
- if (r.rlim_cur > COMPAT_RLIM_INFINITY)
- r.rlim_cur = COMPAT_RLIM_INFINITY;
- if (r.rlim_max > COMPAT_RLIM_INFINITY)
- r.rlim_max = COMPAT_RLIM_INFINITY;
-
- if (!access_ok(VERIFY_WRITE, rlim, sizeof(*rlim)) ||
- __put_user(r.rlim_cur, &rlim->rlim_cur) ||
- __put_user(r.rlim_max, &rlim->rlim_max))
- return -EFAULT;
- }
- return ret;
-}
-
int put_compat_rusage(const struct rusage *r, struct compat_rusage __user *ru)
{
- if (!access_ok(VERIFY_WRITE, ru, sizeof(*ru)) ||
- __put_user(r->ru_utime.tv_sec, &ru->ru_utime.tv_sec) ||
- __put_user(r->ru_utime.tv_usec, &ru->ru_utime.tv_usec) ||
- __put_user(r->ru_stime.tv_sec, &ru->ru_stime.tv_sec) ||
- __put_user(r->ru_stime.tv_usec, &ru->ru_stime.tv_usec) ||
- __put_user(r->ru_maxrss, &ru->ru_maxrss) ||
- __put_user(r->ru_ixrss, &ru->ru_ixrss) ||
- __put_user(r->ru_idrss, &ru->ru_idrss) ||
- __put_user(r->ru_isrss, &ru->ru_isrss) ||
- __put_user(r->ru_minflt, &ru->ru_minflt) ||
- __put_user(r->ru_majflt, &ru->ru_majflt) ||
- __put_user(r->ru_nswap, &ru->ru_nswap) ||
- __put_user(r->ru_inblock, &ru->ru_inblock) ||
- __put_user(r->ru_oublock, &ru->ru_oublock) ||
- __put_user(r->ru_msgsnd, &ru->ru_msgsnd) ||
- __put_user(r->ru_msgrcv, &ru->ru_msgrcv) ||
- __put_user(r->ru_nsignals, &ru->ru_nsignals) ||
- __put_user(r->ru_nvcsw, &ru->ru_nvcsw) ||
- __put_user(r->ru_nivcsw, &ru->ru_nivcsw))
+ struct compat_rusage r32;
+ memset(&r32, 0, sizeof(r32));
+ r32.ru_utime.tv_sec = r->ru_utime.tv_sec;
+ r32.ru_utime.tv_usec = r->ru_utime.tv_usec;
+ r32.ru_stime.tv_sec = r->ru_stime.tv_sec;
+ r32.ru_stime.tv_usec = r->ru_stime.tv_usec;
+ r32.ru_maxrss = r->ru_maxrss;
+ r32.ru_ixrss = r->ru_ixrss;
+ r32.ru_idrss = r->ru_idrss;
+ r32.ru_isrss = r->ru_isrss;
+ r32.ru_minflt = r->ru_minflt;
+ r32.ru_majflt = r->ru_majflt;
+ r32.ru_nswap = r->ru_nswap;
+ r32.ru_inblock = r->ru_inblock;
+ r32.ru_oublock = r->ru_oublock;
+ r32.ru_msgsnd = r->ru_msgsnd;
+ r32.ru_msgrcv = r->ru_msgrcv;
+ r32.ru_nsignals = r->ru_nsignals;
+ r32.ru_nvcsw = r->ru_nvcsw;
+ r32.ru_nivcsw = r->ru_nivcsw;
+ if (copy_to_user(ru, &r32, sizeof(r32)))
return -EFAULT;
return 0;
}
-COMPAT_SYSCALL_DEFINE4(wait4,
- compat_pid_t, pid,
- compat_uint_t __user *, stat_addr,
- int, options,
- struct compat_rusage __user *, ru)
-{
- if (!ru) {
- return sys_wait4(pid, stat_addr, options, NULL);
- } else {
- struct rusage r;
- int ret;
- unsigned int status;
- mm_segment_t old_fs = get_fs();
-
- set_fs (KERNEL_DS);
- ret = sys_wait4(pid,
- (stat_addr ?
- (unsigned int __user *) &status : NULL),
- options, (struct rusage __user *) &r);
- set_fs (old_fs);
-
- if (ret > 0) {
- if (put_compat_rusage(&r, ru))
- return -EFAULT;
- if (stat_addr && put_user(status, stat_addr))
- return -EFAULT;
- }
- return ret;
- }
-}
-
-COMPAT_SYSCALL_DEFINE5(waitid,
- int, which, compat_pid_t, pid,
- struct compat_siginfo __user *, uinfo, int, options,
- struct compat_rusage __user *, uru)
-{
- siginfo_t info;
- struct rusage ru;
- long ret;
- mm_segment_t old_fs = get_fs();
-
- memset(&info, 0, sizeof(info));
-
- set_fs(KERNEL_DS);
- ret = sys_waitid(which, pid, (siginfo_t __user *)&info, options,
- uru ? (struct rusage __user *)&ru : NULL);
- set_fs(old_fs);
-
- if ((ret < 0) || (info.si_signo == 0))
- return ret;
-
- if (uru) {
- /* sys_waitid() overwrites everything in ru */
- if (COMPAT_USE_64BIT_TIME)
- ret = copy_to_user(uru, &ru, sizeof(ru));
- else
- ret = put_compat_rusage(&ru, uru);
- if (ret)
- return -EFAULT;
- }
-
- BUG_ON(info.si_code & __SI_MASK);
- info.si_code |= __SI_CHLD;
- return copy_siginfo_to_user32(uinfo, &info);
-}
-
static int compat_get_user_cpu_mask(compat_ulong_t __user *user_mask_ptr,
unsigned len, struct cpumask *new_mask)
{
@@ -644,12 +152,12 @@ COMPAT_SYSCALL_DEFINE3(sched_getaffinity, compat_pid_t, pid, unsigned int, len,
if (len & (sizeof(compat_ulong_t)-1))
return -EINVAL;
- if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+ if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
return -ENOMEM;
ret = sched_getaffinity(pid, mask);
if (ret == 0) {
- size_t retlen = min_t(size_t, len, cpumask_size());
+ unsigned int retlen = min(len, cpumask_size());
if (compat_put_bitmap(user_mask_ptr, cpumask_bits(mask), retlen * 8))
ret = -EFAULT;
@@ -661,211 +169,6 @@ COMPAT_SYSCALL_DEFINE3(sched_getaffinity, compat_pid_t, pid, unsigned int, len,
return ret;
}
-int get_compat_itimerspec(struct itimerspec *dst,
- const struct compat_itimerspec __user *src)
-{
- if (__compat_get_timespec(&dst->it_interval, &src->it_interval) ||
- __compat_get_timespec(&dst->it_value, &src->it_value))
- return -EFAULT;
- return 0;
-}
-
-int put_compat_itimerspec(struct compat_itimerspec __user *dst,
- const struct itimerspec *src)
-{
- if (__compat_put_timespec(&src->it_interval, &dst->it_interval) ||
- __compat_put_timespec(&src->it_value, &dst->it_value))
- return -EFAULT;
- return 0;
-}
-
-COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock,
- struct compat_sigevent __user *, timer_event_spec,
- timer_t __user *, created_timer_id)
-{
- struct sigevent __user *event = NULL;
-
- if (timer_event_spec) {
- struct sigevent kevent;
-
- event = compat_alloc_user_space(sizeof(*event));
- if (get_compat_sigevent(&kevent, timer_event_spec) ||
- copy_to_user(event, &kevent, sizeof(*event)))
- return -EFAULT;
- }
-
- return sys_timer_create(which_clock, event, created_timer_id);
-}
-
-COMPAT_SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
- struct compat_itimerspec __user *, new,
- struct compat_itimerspec __user *, old)
-{
- long err;
- mm_segment_t oldfs;
- struct itimerspec newts, oldts;
-
- if (!new)
- return -EINVAL;
- if (get_compat_itimerspec(&newts, new))
- return -EFAULT;
- oldfs = get_fs();
- set_fs(KERNEL_DS);
- err = sys_timer_settime(timer_id, flags,
- (struct itimerspec __user *) &newts,
- (struct itimerspec __user *) &oldts);
- set_fs(oldfs);
- if (!err && old && put_compat_itimerspec(old, &oldts))
- return -EFAULT;
- return err;
-}
-
-COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
- struct compat_itimerspec __user *, setting)
-{
- long err;
- mm_segment_t oldfs;
- struct itimerspec ts;
-
- oldfs = get_fs();
- set_fs(KERNEL_DS);
- err = sys_timer_gettime(timer_id,
- (struct itimerspec __user *) &ts);
- set_fs(oldfs);
- if (!err && put_compat_itimerspec(setting, &ts))
- return -EFAULT;
- return err;
-}
-
-COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock,
- struct compat_timespec __user *, tp)
-{
- long err;
- mm_segment_t oldfs;
- struct timespec ts;
-
- if (compat_get_timespec(&ts, tp))
- return -EFAULT;
- oldfs = get_fs();
- set_fs(KERNEL_DS);
- err = sys_clock_settime(which_clock,
- (struct timespec __user *) &ts);
- set_fs(oldfs);
- return err;
-}
-
-COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock,
- struct compat_timespec __user *, tp)
-{
- long err;
- mm_segment_t oldfs;
- struct timespec ts;
-
- oldfs = get_fs();
- set_fs(KERNEL_DS);
- err = sys_clock_gettime(which_clock,
- (struct timespec __user *) &ts);
- set_fs(oldfs);
- if (!err && compat_put_timespec(&ts, tp))
- return -EFAULT;
- return err;
-}
-
-COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock,
- struct compat_timex __user *, utp)
-{
- struct timex txc;
- mm_segment_t oldfs;
- int err, ret;
-
- err = compat_get_timex(&txc, utp);
- if (err)
- return err;
-
- oldfs = get_fs();
- set_fs(KERNEL_DS);
- ret = sys_clock_adjtime(which_clock, (struct timex __user *) &txc);
- set_fs(oldfs);
-
- err = compat_put_timex(utp, &txc);
- if (err)
- return err;
-
- return ret;
-}
-
-COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock,
- struct compat_timespec __user *, tp)
-{
- long err;
- mm_segment_t oldfs;
- struct timespec ts;
-
- oldfs = get_fs();
- set_fs(KERNEL_DS);
- err = sys_clock_getres(which_clock,
- (struct timespec __user *) &ts);
- set_fs(oldfs);
- if (!err && tp && compat_put_timespec(&ts, tp))
- return -EFAULT;
- return err;
-}
-
-static long compat_clock_nanosleep_restart(struct restart_block *restart)
-{
- long err;
- mm_segment_t oldfs;
- struct timespec tu;
- struct compat_timespec __user *rmtp = restart->nanosleep.compat_rmtp;
-
- restart->nanosleep.rmtp = (struct timespec __user *) &tu;
- oldfs = get_fs();
- set_fs(KERNEL_DS);
- err = clock_nanosleep_restart(restart);
- set_fs(oldfs);
-
- if ((err == -ERESTART_RESTARTBLOCK) && rmtp &&
- compat_put_timespec(&tu, rmtp))
- return -EFAULT;
-
- if (err == -ERESTART_RESTARTBLOCK) {
- restart->fn = compat_clock_nanosleep_restart;
- restart->nanosleep.compat_rmtp = rmtp;
- }
- return err;
-}
-
-COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags,
- struct compat_timespec __user *, rqtp,
- struct compat_timespec __user *, rmtp)
-{
- long err;
- mm_segment_t oldfs;
- struct timespec in, out;
- struct restart_block *restart;
-
- if (compat_get_timespec(&in, rqtp))
- return -EFAULT;
-
- oldfs = get_fs();
- set_fs(KERNEL_DS);
- err = sys_clock_nanosleep(which_clock, flags,
- (struct timespec __user *) &in,
- (struct timespec __user *) &out);
- set_fs(oldfs);
-
- if ((err == -ERESTART_RESTARTBLOCK) && rmtp &&
- compat_put_timespec(&out, rmtp))
- return -EFAULT;
-
- if (err == -ERESTART_RESTARTBLOCK) {
- restart = &current->restart_block;
- restart->fn = compat_clock_nanosleep_restart;
- restart->nanosleep.compat_rmtp = rmtp;
- }
- return err;
-}
-
/*
* We currently only need the following fields from the sigevent
* structure: sigev_value, sigev_signo, sig_notify and (sometimes
@@ -877,7 +180,7 @@ int get_compat_sigevent(struct sigevent *event,
const struct compat_sigevent __user *u_event)
{
memset(event, 0, sizeof(*event));
- return (!access_ok(VERIFY_READ, u_event, sizeof(*u_event)) ||
+ return (!access_ok(u_event, sizeof(*u_event)) ||
__get_user(event->sigev_value.sival_int,
&u_event->sigev_value.sival_int) ||
__get_user(event->sigev_signo, &u_event->sigev_signo) ||
@@ -890,287 +193,79 @@ int get_compat_sigevent(struct sigevent *event,
long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask,
unsigned long bitmap_size)
{
- int i, j;
- unsigned long m;
- compat_ulong_t um;
unsigned long nr_compat_longs;
/* align bitmap up to nearest compat_long_t boundary */
bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG);
-
- if (!access_ok(VERIFY_READ, umask, bitmap_size / 8))
- return -EFAULT;
-
nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size);
- for (i = 0; i < BITS_TO_LONGS(bitmap_size); i++) {
- m = 0;
-
- for (j = 0; j < sizeof(m)/sizeof(um); j++) {
- /*
- * We dont want to read past the end of the userspace
- * bitmap. We must however ensure the end of the
- * kernel bitmap is zeroed.
- */
- if (nr_compat_longs) {
- nr_compat_longs--;
- if (__get_user(um, umask))
- return -EFAULT;
- } else {
- um = 0;
- }
+ if (!user_read_access_begin(umask, bitmap_size / 8))
+ return -EFAULT;
- umask++;
- m |= (long)um << (j * BITS_PER_COMPAT_LONG);
- }
- *mask++ = m;
+ while (nr_compat_longs > 1) {
+ compat_ulong_t l1, l2;
+ unsafe_get_user(l1, umask++, Efault);
+ unsafe_get_user(l2, umask++, Efault);
+ *mask++ = ((unsigned long)l2 << BITS_PER_COMPAT_LONG) | l1;
+ nr_compat_longs -= 2;
}
-
+ if (nr_compat_longs)
+ unsafe_get_user(*mask, umask++, Efault);
+ user_read_access_end();
return 0;
+
+Efault:
+ user_read_access_end();
+ return -EFAULT;
}
long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask,
unsigned long bitmap_size)
{
- int i, j;
- unsigned long m;
- compat_ulong_t um;
unsigned long nr_compat_longs;
/* align bitmap up to nearest compat_long_t boundary */
bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG);
-
- if (!access_ok(VERIFY_WRITE, umask, bitmap_size / 8))
- return -EFAULT;
-
nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size);
- for (i = 0; i < BITS_TO_LONGS(bitmap_size); i++) {
- m = *mask++;
-
- for (j = 0; j < sizeof(m)/sizeof(um); j++) {
- um = m;
-
- /*
- * We dont want to write past the end of the userspace
- * bitmap.
- */
- if (nr_compat_longs) {
- nr_compat_longs--;
- if (__put_user(um, umask))
- return -EFAULT;
- }
+ if (!user_write_access_begin(umask, bitmap_size / 8))
+ return -EFAULT;
- umask++;
- m >>= 4*sizeof(um);
- m >>= 4*sizeof(um);
- }
+ while (nr_compat_longs > 1) {
+ unsigned long m = *mask++;
+ unsafe_put_user((compat_ulong_t)m, umask++, Efault);
+ unsafe_put_user(m >> BITS_PER_COMPAT_LONG, umask++, Efault);
+ nr_compat_longs -= 2;
}
-
+ if (nr_compat_longs)
+ unsafe_put_user((compat_ulong_t)*mask, umask++, Efault);
+ user_write_access_end();
return 0;
+Efault:
+ user_write_access_end();
+ return -EFAULT;
}
-void
-sigset_from_compat(sigset_t *set, const compat_sigset_t *compat)
-{
- switch (_NSIG_WORDS) {
- case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 );
- case 3: set->sig[2] = compat->sig[4] | (((long)compat->sig[5]) << 32 );
- case 2: set->sig[1] = compat->sig[2] | (((long)compat->sig[3]) << 32 );
- case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 );
- }
-}
-EXPORT_SYMBOL_GPL(sigset_from_compat);
-
-void
-sigset_to_compat(compat_sigset_t *compat, const sigset_t *set)
-{
- switch (_NSIG_WORDS) {
- case 4: compat->sig[7] = (set->sig[3] >> 32); compat->sig[6] = set->sig[3];
- case 3: compat->sig[5] = (set->sig[2] >> 32); compat->sig[4] = set->sig[2];
- case 2: compat->sig[3] = (set->sig[1] >> 32); compat->sig[2] = set->sig[1];
- case 1: compat->sig[1] = (set->sig[0] >> 32); compat->sig[0] = set->sig[0];
- }
-}
-
-COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
- struct compat_siginfo __user *, uinfo,
- struct compat_timespec __user *, uts, compat_size_t, sigsetsize)
+int
+get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat)
{
- compat_sigset_t s32;
- sigset_t s;
- struct timespec t;
- siginfo_t info;
- long ret;
-
- if (sigsetsize != sizeof(sigset_t))
- return -EINVAL;
-
- if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t)))
+#ifdef __BIG_ENDIAN
+ compat_sigset_t v;
+ if (copy_from_user(&v, compat, sizeof(compat_sigset_t)))
return -EFAULT;
- sigset_from_compat(&s, &s32);
-
- if (uts) {
- if (compat_get_timespec(&t, uts))
- return -EFAULT;
- }
-
- ret = do_sigtimedwait(&s, &info, uts ? &t : NULL);
-
- if (ret > 0 && uinfo) {
- if (copy_siginfo_to_user32(uinfo, &info))
- ret = -EFAULT;
- }
-
- return ret;
-}
-
-#ifdef __ARCH_WANT_COMPAT_SYS_TIME
-
-/* compat_time_t is a 32 bit "long" and needs to get converted. */
-
-COMPAT_SYSCALL_DEFINE1(time, compat_time_t __user *, tloc)
-{
- compat_time_t i;
- struct timeval tv;
-
- do_gettimeofday(&tv);
- i = tv.tv_sec;
-
- if (tloc) {
- if (put_user(i,tloc))
- return -EFAULT;
+ switch (_NSIG_WORDS) {
+ case 4: set->sig[3] = v.sig[6] | (((long)v.sig[7]) << 32 );
+ fallthrough;
+ case 3: set->sig[2] = v.sig[4] | (((long)v.sig[5]) << 32 );
+ fallthrough;
+ case 2: set->sig[1] = v.sig[2] | (((long)v.sig[3]) << 32 );
+ fallthrough;
+ case 1: set->sig[0] = v.sig[0] | (((long)v.sig[1]) << 32 );
}
- force_successful_syscall_return();
- return i;
-}
-
-COMPAT_SYSCALL_DEFINE1(stime, compat_time_t __user *, tptr)
-{
- struct timespec tv;
- int err;
-
- if (get_user(tv.tv_sec, tptr))
+#else
+ if (copy_from_user(set, compat, sizeof(compat_sigset_t)))
return -EFAULT;
-
- tv.tv_nsec = 0;
-
- err = security_settime(&tv, NULL);
- if (err)
- return err;
-
- do_settimeofday(&tv);
- return 0;
-}
-
-#endif /* __ARCH_WANT_COMPAT_SYS_TIME */
-
-COMPAT_SYSCALL_DEFINE1(adjtimex, struct compat_timex __user *, utp)
-{
- struct timex txc;
- int err, ret;
-
- err = compat_get_timex(&txc, utp);
- if (err)
- return err;
-
- ret = do_adjtimex(&txc);
-
- err = compat_put_timex(utp, &txc);
- if (err)
- return err;
-
- return ret;
-}
-
-#ifdef CONFIG_NUMA
-COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages,
- compat_uptr_t __user *, pages32,
- const int __user *, nodes,
- int __user *, status,
- int, flags)
-{
- const void __user * __user *pages;
- int i;
-
- pages = compat_alloc_user_space(nr_pages * sizeof(void *));
- for (i = 0; i < nr_pages; i++) {
- compat_uptr_t p;
-
- if (get_user(p, pages32 + i) ||
- put_user(compat_ptr(p), pages + i))
- return -EFAULT;
- }
- return sys_move_pages(pid, nr_pages, pages, nodes, status, flags);
-}
-
-COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
- compat_ulong_t, maxnode,
- const compat_ulong_t __user *, old_nodes,
- const compat_ulong_t __user *, new_nodes)
-{
- unsigned long __user *old = NULL;
- unsigned long __user *new = NULL;
- nodemask_t tmp_mask;
- unsigned long nr_bits;
- unsigned long size;
-
- nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES);
- size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
- if (old_nodes) {
- if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits))
- return -EFAULT;
- old = compat_alloc_user_space(new_nodes ? size * 2 : size);
- if (new_nodes)
- new = old + size / sizeof(unsigned long);
- if (copy_to_user(old, nodes_addr(tmp_mask), size))
- return -EFAULT;
- }
- if (new_nodes) {
- if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits))
- return -EFAULT;
- if (new == NULL)
- new = compat_alloc_user_space(size);
- if (copy_to_user(new, nodes_addr(tmp_mask), size))
- return -EFAULT;
- }
- return sys_migrate_pages(pid, nr_bits + 1, old, new);
-}
#endif
-
-COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval,
- compat_pid_t, pid,
- struct compat_timespec __user *, interval)
-{
- struct timespec t;
- int ret;
- mm_segment_t old_fs = get_fs();
-
- set_fs(KERNEL_DS);
- ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t);
- set_fs(old_fs);
- if (compat_put_timespec(&t, interval))
- return -EFAULT;
- return ret;
-}
-
-/*
- * Allocate user-space memory for the duration of a single system call,
- * in order to marshall parameters inside a compat thunk.
- */
-void __user *compat_alloc_user_space(unsigned long len)
-{
- void __user *ptr;
-
- /* If len would occupy more than half of the entire compat space... */
- if (unlikely(len > (((compat_uptr_t)~0) >> 1)))
- return NULL;
-
- ptr = arch_compat_alloc_user_space(len);
-
- if (unlikely(!access_ok(VERIFY_WRITE, ptr, len)))
- return NULL;
-
- return ptr;
+ return 0;
}
-EXPORT_SYMBOL_GPL(compat_alloc_user_space);
+EXPORT_SYMBOL_GPL(get_compat_sigset);
diff --git a/kernel/configs.c b/kernel/configs.c
index c18b1f1ae515..a28c79c5f713 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* kernel/configs.c
* Echo the kernel .config file used to build the kernel
@@ -6,21 +7,6 @@
* Copyright (C) 2002 Randy Dunlap <rdunlap@xenotime.net>
* Copyright (C) 2002 Al Stone <ahs3@fc.hp.com>
* Copyright (C) 2002 Hewlett-Packard Company
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or (at
- * your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/kernel.h>
@@ -28,45 +14,42 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/init.h>
-#include <asm/uaccess.h>
-
-/**************************************************/
-/* the actual current config file */
+#include <linux/uaccess.h>
/*
- * Define kernel_config_data and kernel_config_data_size, which contains the
- * wrapped and compressed configuration file. The file is first compressed
- * with gzip and then bounded by two eight byte magic numbers to allow
- * extraction from a binary kernel image:
- *
- * IKCFG_ST
- * <image>
- * IKCFG_ED
+ * "IKCFG_ST" and "IKCFG_ED" are used to extract the config data from
+ * a binary kernel image or a module. See scripts/extract-ikconfig.
*/
-#define MAGIC_START "IKCFG_ST"
-#define MAGIC_END "IKCFG_ED"
-#include "config_data.h"
-
-
-#define MAGIC_SIZE (sizeof(MAGIC_START) - 1)
-#define kernel_config_data_size \
- (sizeof(kernel_config_data) - 1 - MAGIC_SIZE * 2)
+asm (
+" .pushsection .rodata, \"a\" \n"
+" .ascii \"IKCFG_ST\" \n"
+" .global kernel_config_data \n"
+"kernel_config_data: \n"
+" .incbin \"kernel/config_data.gz\" \n"
+" .global kernel_config_data_end \n"
+"kernel_config_data_end: \n"
+" .ascii \"IKCFG_ED\" \n"
+" .popsection \n"
+);
#ifdef CONFIG_IKCONFIG_PROC
+extern char kernel_config_data;
+extern char kernel_config_data_end;
+
static ssize_t
ikconfig_read_current(struct file *file, char __user *buf,
size_t len, loff_t * offset)
{
return simple_read_from_buffer(buf, len, offset,
- kernel_config_data + MAGIC_SIZE,
- kernel_config_data_size);
+ &kernel_config_data,
+ &kernel_config_data_end -
+ &kernel_config_data);
}
-static const struct file_operations ikconfig_file_ops = {
- .owner = THIS_MODULE,
- .read = ikconfig_read_current,
- .llseek = default_llseek,
+static const struct proc_ops config_gz_proc_ops = {
+ .proc_read = ikconfig_read_current,
+ .proc_lseek = default_llseek,
};
static int __init ikconfig_init(void)
@@ -75,11 +58,11 @@ static int __init ikconfig_init(void)
/* create the current config file */
entry = proc_create("config.gz", S_IFREG | S_IRUGO, NULL,
- &ikconfig_file_ops);
+ &config_gz_proc_ops);
if (!entry)
return -ENOMEM;
- proc_set_size(entry, kernel_config_data_size);
+ proc_set_size(entry, &kernel_config_data_end - &kernel_config_data);
return 0;
}
diff --git a/kernel/configs/debug.config b/kernel/configs/debug.config
new file mode 100644
index 000000000000..9f6ab7dabf67
--- /dev/null
+++ b/kernel/configs/debug.config
@@ -0,0 +1,119 @@
+# Help: Debugging for CI systems and finding regressions
+#
+# The config is based on running daily CI for enterprise Linux distros to
+# seek regressions on linux-next builds on different bare-metal and virtual
+# platforms. It can be used for example,
+#
+# $ make ARCH=arm64 defconfig debug.config
+#
+# Keep alphabetically sorted inside each section.
+#
+# printk and dmesg options
+#
+CONFIG_DEBUG_BUGVERBOSE=y
+CONFIG_DYNAMIC_DEBUG=y
+CONFIG_PRINTK_CALLER=y
+CONFIG_PRINTK_TIME=y
+CONFIG_SYMBOLIC_ERRNAME=y
+#
+# Compile-time checks and compiler options
+#
+CONFIG_DEBUG_INFO=y
+CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y
+CONFIG_DEBUG_SECTION_MISMATCH=y
+CONFIG_FRAME_WARN=2048
+CONFIG_SECTION_MISMATCH_WARN_ONLY=y
+#
+# Generic Kernel Debugging Instruments
+#
+# CONFIG_UBSAN_ALIGNMENT is not set
+# CONFIG_UBSAN_DIV_ZERO is not set
+# CONFIG_UBSAN_TRAP is not set
+# CONFIG_WARN_ALL_UNSEEDED_RANDOM is not set
+CONFIG_DEBUG_FS=y
+CONFIG_DEBUG_FS_ALLOW_ALL=y
+CONFIG_DEBUG_IRQFLAGS=y
+CONFIG_UBSAN=y
+CONFIG_UBSAN_BOOL=y
+CONFIG_UBSAN_BOUNDS=y
+CONFIG_UBSAN_ENUM=y
+CONFIG_UBSAN_SHIFT=y
+CONFIG_UBSAN_UNREACHABLE=y
+#
+# Networking Debugging
+#
+CONFIG_NET_DEV_REFCNT_TRACKER=y
+CONFIG_NET_NS_REFCNT_TRACKER=y
+CONFIG_DEBUG_NET=y
+#
+# Memory Debugging
+#
+# CONFIG_DEBUG_PAGEALLOC is not set
+# CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF is not set
+# CONFIG_DEBUG_RODATA_TEST is not set
+# CONFIG_DEBUG_WX is not set
+# CONFIG_KFENCE is not set
+# CONFIG_PAGE_POISONING is not set
+# CONFIG_SLUB_STATS is not set
+CONFIG_PAGE_EXTENSION=y
+CONFIG_PAGE_OWNER=y
+CONFIG_DEBUG_KMEMLEAK=y
+CONFIG_DEBUG_KMEMLEAK_AUTO_SCAN=y
+CONFIG_DEBUG_OBJECTS=y
+CONFIG_DEBUG_OBJECTS_ENABLE_DEFAULT=1
+CONFIG_DEBUG_OBJECTS_FREE=y
+CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER=y
+CONFIG_DEBUG_OBJECTS_RCU_HEAD=y
+CONFIG_DEBUG_OBJECTS_TIMERS=y
+CONFIG_DEBUG_OBJECTS_WORK=y
+CONFIG_DEBUG_PER_CPU_MAPS=y
+CONFIG_DEBUG_STACK_USAGE=y
+CONFIG_DEBUG_VIRTUAL=y
+CONFIG_DEBUG_VM=y
+CONFIG_DEBUG_VM_PGFLAGS=y
+CONFIG_DEBUG_VM_RB=y
+CONFIG_DEBUG_VM_VMACACHE=y
+CONFIG_KASAN=y
+CONFIG_KASAN_GENERIC=y
+CONFIG_KASAN_INLINE=y
+CONFIG_KASAN_VMALLOC=y
+CONFIG_PTDUMP_DEBUGFS=y
+CONFIG_SCHED_STACK_END_CHECK=y
+CONFIG_SLUB_DEBUG_ON=y
+#
+# Debug Oops, Lockups and Hangs
+#
+CONFIG_BOOTPARAM_HUNG_TASK_PANIC=0
+# CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set
+CONFIG_DEBUG_ATOMIC_SLEEP=y
+CONFIG_DETECT_HUNG_TASK=y
+CONFIG_PANIC_ON_OOPS=y
+CONFIG_PANIC_TIMEOUT=0
+CONFIG_SOFTLOCKUP_DETECTOR=y
+#
+# Lock Debugging (spinlocks, mutexes, etc...)
+#
+# CONFIG_PROVE_RAW_LOCK_NESTING is not set
+CONFIG_PROVE_LOCKING=y
+#
+# Debug kernel data structures
+#
+CONFIG_BUG_ON_DATA_CORRUPTION=y
+#
+# RCU Debugging
+#
+CONFIG_RCU_EXPERT=y
+CONFIG_PROVE_RCU=y
+CONFIG_PROVE_RCU_LIST=y
+#
+# Tracers
+#
+CONFIG_BRANCH_PROFILE_NONE=y
+CONFIG_DYNAMIC_FTRACE=y
+CONFIG_FTRACE=y
+CONFIG_FUNCTION_TRACER=y
+#
+# Preemption
+#
+CONFIG_DEBUG_PREEMPT=y
+CONFIG_PREEMPT=y
diff --git a/kernel/configs/hardening.config b/kernel/configs/hardening.config
new file mode 100644
index 000000000000..7c3924614e01
--- /dev/null
+++ b/kernel/configs/hardening.config
@@ -0,0 +1,113 @@
+# Help: Basic kernel hardening options
+#
+# These are considered the basic kernel hardening, self-protection, and
+# attack surface reduction options. They are expected to have low (or
+# no) performance impact on most workloads, and have a reasonable level
+# of legacy API removals.
+
+# Make sure reporting of various hardening actions is possible.
+CONFIG_BUG=y
+
+# Basic kernel memory permission enforcement.
+CONFIG_STRICT_KERNEL_RWX=y
+CONFIG_STRICT_MODULE_RWX=y
+CONFIG_VMAP_STACK=y
+
+# Kernel image and memory ASLR.
+CONFIG_RANDOMIZE_BASE=y
+CONFIG_RANDOMIZE_MEMORY=y
+
+# Randomize allocator freelists, harden metadata.
+CONFIG_SLAB_FREELIST_RANDOM=y
+CONFIG_SLAB_FREELIST_HARDENED=y
+CONFIG_SLAB_BUCKETS=y
+CONFIG_SHUFFLE_PAGE_ALLOCATOR=y
+CONFIG_RANDOM_KMALLOC_CACHES=y
+
+# Sanity check userspace page table mappings.
+CONFIG_PAGE_TABLE_CHECK=y
+CONFIG_PAGE_TABLE_CHECK_ENFORCED=y
+
+# Randomize kernel stack offset on syscall entry.
+CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT=y
+
+# Basic stack frame overflow protection.
+CONFIG_STACKPROTECTOR=y
+CONFIG_STACKPROTECTOR_STRONG=y
+
+# Basic buffer length bounds checking.
+CONFIG_HARDENED_USERCOPY=y
+CONFIG_FORTIFY_SOURCE=y
+
+# Basic array index bounds checking.
+CONFIG_UBSAN=y
+CONFIG_UBSAN_TRAP=y
+CONFIG_UBSAN_BOUNDS=y
+# CONFIG_UBSAN_SHIFT is not set
+# CONFIG_UBSAN_DIV_ZERO is not set
+# CONFIG_UBSAN_UNREACHABLE is not set
+# CONFIG_UBSAN_INTEGER_WRAP is not set
+# CONFIG_UBSAN_BOOL is not set
+# CONFIG_UBSAN_ENUM is not set
+# CONFIG_UBSAN_ALIGNMENT is not set
+
+# Sampling-based heap out-of-bounds and use-after-free detection.
+CONFIG_KFENCE=y
+
+# Linked list integrity checking.
+CONFIG_LIST_HARDENED=y
+
+# Initialize all heap variables to zero on allocation.
+CONFIG_INIT_ON_ALLOC_DEFAULT_ON=y
+
+# Initialize all heap variables to zero on free to reduce stale data lifetime.
+CONFIG_INIT_ON_FREE_DEFAULT_ON=y
+
+# Initialize all stack variables to zero on function entry.
+CONFIG_INIT_STACK_ALL_ZERO=y
+
+# Wipe kernel stack after syscall completion to reduce stale data lifetime.
+CONFIG_KSTACK_ERASE=y
+
+# Wipe RAM at reboot via EFI. For more details, see:
+# https://trustedcomputinggroup.org/resource/pc-client-work-group-platform-reset-attack-mitigation-specification/
+# https://bugzilla.redhat.com/show_bug.cgi?id=1532058
+CONFIG_RESET_ATTACK_MITIGATION=y
+
+# Disable DMA between EFI hand-off and the kernel's IOMMU setup.
+CONFIG_EFI_DISABLE_PCI_DMA=y
+
+# Force IOMMU TLB invalidation so devices will never be able to access stale
+# data content.
+CONFIG_IOMMU_SUPPORT=y
+CONFIG_IOMMU_DEFAULT_DMA_STRICT=y
+
+# Do not allow direct physical memory access to non-device memory.
+CONFIG_STRICT_DEVMEM=y
+CONFIG_IO_STRICT_DEVMEM=y
+
+# Provide userspace with seccomp BPF API for syscall attack surface reduction.
+CONFIG_SECCOMP=y
+CONFIG_SECCOMP_FILTER=y
+
+# Provides some protections against SYN flooding.
+CONFIG_SYN_COOKIES=y
+
+# Enable Kernel Control Flow Integrity.
+CONFIG_CFI=y
+# CONFIG_CFI_PERMISSIVE is not set
+
+# Attack surface reduction: do not autoload TTY line disciplines.
+# CONFIG_LDISC_AUTOLOAD is not set
+
+# Dangerous; enabling this disables userspace brk ASLR.
+# CONFIG_COMPAT_BRK is not set
+
+# Dangerous; exposes kernel text image layout.
+# CONFIG_PROC_KCORE is not set
+
+# Dangerous; enabling this disables userspace VDSO ASLR.
+# CONFIG_COMPAT_VDSO is not set
+
+# Attack surface reduction: Use the modern PTY interface (devpts) only.
+# CONFIG_LEGACY_PTYS is not set
diff --git a/kernel/configs/kvm_guest.config b/kernel/configs/kvm_guest.config
new file mode 100644
index 000000000000..d0877063d925
--- /dev/null
+++ b/kernel/configs/kvm_guest.config
@@ -0,0 +1,35 @@
+# Help: Bootable as a KVM guest
+CONFIG_NET=y
+CONFIG_NET_CORE=y
+CONFIG_NETDEVICES=y
+CONFIG_BLOCK=y
+CONFIG_BLK_DEV=y
+CONFIG_NETWORK_FILESYSTEMS=y
+CONFIG_INET=y
+CONFIG_TTY=y
+CONFIG_SERIAL_8250=y
+CONFIG_SERIAL_8250_CONSOLE=y
+CONFIG_IP_PNP=y
+CONFIG_IP_PNP_DHCP=y
+CONFIG_BINFMT_ELF=y
+CONFIG_PCI=y
+CONFIG_PCI_MSI=y
+CONFIG_DEBUG_KERNEL=y
+CONFIG_VIRTUALIZATION=y
+CONFIG_HYPERVISOR_GUEST=y
+CONFIG_PARAVIRT=y
+CONFIG_KVM_GUEST=y
+CONFIG_S390_GUEST=y
+CONFIG_VIRTIO=y
+CONFIG_VIRTIO_MENU=y
+CONFIG_VIRTIO_PCI=y
+CONFIG_VIRTIO_BLK=y
+CONFIG_VIRTIO_CONSOLE=y
+CONFIG_VIRTIO_NET=y
+CONFIG_9P_FS=y
+CONFIG_NET_9P=y
+CONFIG_NET_9P_VIRTIO=y
+CONFIG_SCSI_LOWLEVEL=y
+CONFIG_SCSI_VIRTIO=y
+CONFIG_VIRTIO_INPUT=y
+CONFIG_DRM_VIRTIO_GPU=y
diff --git a/kernel/configs/nopm.config b/kernel/configs/nopm.config
new file mode 100644
index 000000000000..ebfdc3d8aa9a
--- /dev/null
+++ b/kernel/configs/nopm.config
@@ -0,0 +1,17 @@
+# Help: Disable Power Management
+
+CONFIG_PM=n
+CONFIG_SUSPEND=n
+CONFIG_HIBERNATION=n
+
+# Triggers PM on OMAP
+CONFIG_CPU_IDLE=n
+
+# Triggers enablement via hibernate callbacks
+CONFIG_XEN=n
+
+# ARM/ARM64 architectures that select PM unconditionally
+CONFIG_ARCH_OMAP2PLUS_TYPICAL=n
+CONFIG_ARCH_RENESAS=n
+CONFIG_ARCH_TEGRA=n
+CONFIG_ARCH_VEXPRESS=n
diff --git a/kernel/configs/rust.config b/kernel/configs/rust.config
new file mode 100644
index 000000000000..2c6e001a7284
--- /dev/null
+++ b/kernel/configs/rust.config
@@ -0,0 +1,2 @@
+# Help: Enable Rust
+CONFIG_RUST=y
diff --git a/kernel/configs/tiny-base.config b/kernel/configs/tiny-base.config
new file mode 100644
index 000000000000..ffb9dcafca26
--- /dev/null
+++ b/kernel/configs/tiny-base.config
@@ -0,0 +1 @@
+CONFIG_EXPERT=y
diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config
index c2de56ab0fce..5dd0f0a34a73 100644
--- a/kernel/configs/tiny.config
+++ b/kernel/configs/tiny.config
@@ -1,4 +1,5 @@
CONFIG_CC_OPTIMIZE_FOR_SIZE=y
CONFIG_KERNEL_XZ=y
-CONFIG_OPTIMIZE_INLINING=y
-CONFIG_SLOB=y
+CONFIG_SLUB=y
+CONFIG_SLUB_TINY=y
+CONFIG_LD_DEAD_CODE_DATA_ELIMINATION=y
diff --git a/kernel/configs/x86_debug.config b/kernel/configs/x86_debug.config
new file mode 100644
index 000000000000..35f48671b8d5
--- /dev/null
+++ b/kernel/configs/x86_debug.config
@@ -0,0 +1,18 @@
+# Help: Debugging options for tip tree testing
+CONFIG_X86_DEBUG_FPU=y
+CONFIG_LOCK_STAT=y
+CONFIG_DEBUG_VM=y
+CONFIG_DEBUG_VM_VMACACHE=y
+CONFIG_DEBUG_VM_RB=y
+CONFIG_DEBUG_SLAB=y
+CONFIG_DEBUG_KMEMLEAK=y
+CONFIG_DEBUG_PAGEALLOC=y
+CONFIG_SLUB_DEBUG_ON=y
+CONFIG_DEBUG_OBJECTS=y
+CONFIG_DEBUG_OBJECTS_ENABLE_DEFAULT=1
+CONFIG_GCOV_KERNEL=y
+CONFIG_LOCKDEP=y
+CONFIG_PROVE_LOCKING=y
+CONFIG_SCHEDSTATS=y
+CONFIG_NOINSTR_VALIDATION=y
+CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y
diff --git a/kernel/configs/xen.config b/kernel/configs/xen.config
new file mode 100644
index 000000000000..1875a0a5047a
--- /dev/null
+++ b/kernel/configs/xen.config
@@ -0,0 +1,52 @@
+# Help: Bootable as a Xen guest
+#
+# global stuff - these enable us to allow some
+# of the not so generic stuff below for xen
+CONFIG_PARAVIRT=y
+CONFIG_NET=y
+CONFIG_NET_CORE=y
+CONFIG_NETDEVICES=y
+CONFIG_BLOCK=y
+CONFIG_WATCHDOG=y
+CONFIG_TARGET_CORE=y
+CONFIG_SCSI=y
+CONFIG_FB=y
+CONFIG_INPUT_MISC=y
+CONFIG_MEMORY_HOTPLUG=y
+CONFIG_MEMORY_HOTREMOVE=y
+CONFIG_ZONE_DEVICE=y
+CONFIG_TTY=y
+# Technically not required but otherwise produces
+# pretty useless systems starting from allnoconfig
+# You want TCP/IP and ELF binaries right?
+CONFIG_INET=y
+CONFIG_BINFMT_ELF=y
+# generic config
+CONFIG_XEN=y
+CONFIG_XEN_DOM0=y
+# backend drivers
+CONFIG_XEN_BACKEND=y
+CONFIG_XEN_BLKDEV_BACKEND=m
+CONFIG_XEN_NETDEV_BACKEND=m
+CONFIG_HVC_XEN=y
+CONFIG_XEN_WDT=m
+CONFIG_XEN_SCSI_BACKEND=m
+# frontend drivers
+CONFIG_XEN_FBDEV_FRONTEND=m
+CONFIG_HVC_XEN_FRONTEND=y
+CONFIG_INPUT_XEN_KBDDEV_FRONTEND=m
+CONFIG_XEN_SCSI_FRONTEND=m
+# others
+CONFIG_XEN_BALLOON=y
+CONFIG_XEN_DEV_EVTCHN=m
+CONFIG_XEN_BLKDEV_FRONTEND=m
+CONFIG_XEN_NETDEV_FRONTEND=m
+CONFIG_XENFS=m
+CONFIG_XEN_COMPAT_XENFS=y
+CONFIG_XEN_SYS_HYPERVISOR=y
+CONFIG_XEN_XENBUS_FRONTEND=y
+CONFIG_XEN_GNTDEV=m
+CONFIG_XEN_GRANT_DEV_ALLOC=m
+CONFIG_SWIOTLB_XEN=y
+CONFIG_XEN_PRIVCMD=m
+CONFIG_XEN_UNPOPULATED_ALLOC=y
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 72d59a1a6eb6..fb5be6e9b423 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -1,17 +1,20 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
- * Context tracking: Probe on high level context boundaries such as kernel
- * and userspace. This includes syscalls and exceptions entry/exit.
+ * Context tracking: Probe on high level context boundaries such as kernel,
+ * userspace, guest or idle.
*
* This is used by RCU to remove its dependency on the timer tick while a CPU
- * runs in userspace.
+ * runs in idle, userspace or guest mode.
*
- * Started by Frederic Weisbecker:
+ * User/guest tracking started by Frederic Weisbecker:
*
- * Copyright (C) 2012 Red Hat, Inc., Frederic Weisbecker <fweisbec@redhat.com>
+ * Copyright (C) 2012 Red Hat, Inc., Frederic Weisbecker
*
* Many thanks to Gilad Ben-Yossef, Paul McKenney, Ingo Molnar, Andrew Morton,
* Steven Rostedt, Peter Zijlstra for suggestions and improvements.
*
+ * RCU extended quiescent state bits imported from kernel/rcu/tree.c
+ * where the relevant authorship may be found.
*/
#include <linux/context_tracking.h>
@@ -20,105 +23,580 @@
#include <linux/hardirq.h>
#include <linux/export.h>
#include <linux/kprobes.h>
+#include <trace/events/rcu.h>
-#define CREATE_TRACE_POINTS
-#include <trace/events/context_tracking.h>
-struct static_key context_tracking_enabled = STATIC_KEY_INIT_FALSE;
-EXPORT_SYMBOL_GPL(context_tracking_enabled);
-
-DEFINE_PER_CPU(struct context_tracking, context_tracking);
+DEFINE_PER_CPU(struct context_tracking, context_tracking) = {
+#ifdef CONFIG_CONTEXT_TRACKING_IDLE
+ .nesting = 1,
+ .nmi_nesting = CT_NESTING_IRQ_NONIDLE,
+#endif
+ .state = ATOMIC_INIT(CT_RCU_WATCHING),
+};
EXPORT_SYMBOL_GPL(context_tracking);
-void context_tracking_cpu_set(int cpu)
+#ifdef CONFIG_CONTEXT_TRACKING_IDLE
+#define TPS(x) tracepoint_string(x)
+
+/* Record the current task on exiting RCU-tasks (dyntick-idle entry). */
+static __always_inline void rcu_task_exit(void)
{
- if (!per_cpu(context_tracking.active, cpu)) {
- per_cpu(context_tracking.active, cpu) = true;
- static_key_slow_inc(&context_tracking_enabled);
+#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
+ WRITE_ONCE(current->rcu_tasks_idle_cpu, smp_processor_id());
+#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
+}
+
+/* Record no current task on entering RCU-tasks (dyntick-idle exit). */
+static __always_inline void rcu_task_enter(void)
+{
+#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
+ WRITE_ONCE(current->rcu_tasks_idle_cpu, -1);
+#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
+}
+
+/* Turn on heavyweight RCU tasks trace readers on kernel exit. */
+static __always_inline void rcu_task_trace_heavyweight_enter(void)
+{
+#ifdef CONFIG_TASKS_TRACE_RCU
+ if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
+ current->trc_reader_special.b.need_mb = true;
+#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
+}
+
+/* Turn off heavyweight RCU tasks trace readers on kernel entry. */
+static __always_inline void rcu_task_trace_heavyweight_exit(void)
+{
+#ifdef CONFIG_TASKS_TRACE_RCU
+ if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
+ current->trc_reader_special.b.need_mb = false;
+#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
+}
+
+/*
+ * Record entry into an extended quiescent state. This is only to be
+ * called when not already in an extended quiescent state, that is,
+ * RCU is watching prior to the call to this function and is no longer
+ * watching upon return.
+ */
+static noinstr void ct_kernel_exit_state(int offset)
+{
+ /*
+ * CPUs seeing atomic_add_return() must see prior RCU read-side
+ * critical sections, and we also must force ordering with the
+ * next idle sojourn.
+ */
+ rcu_task_trace_heavyweight_enter(); // Before CT state update!
+ // RCU is still watching. Better not be in extended quiescent state!
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !rcu_is_watching_curr_cpu());
+ (void)ct_state_inc(offset);
+ // RCU is no longer watching.
+}
+
+/*
+ * Record exit from an extended quiescent state. This is only to be
+ * called from an extended quiescent state, that is, RCU is not watching
+ * prior to the call to this function and is watching upon return.
+ */
+static noinstr void ct_kernel_enter_state(int offset)
+{
+ int seq;
+
+ /*
+ * CPUs seeing atomic_add_return() must see prior idle sojourns,
+ * and we also must force ordering with the next RCU read-side
+ * critical section.
+ */
+ seq = ct_state_inc(offset);
+ // RCU is now watching. Better not be in an extended quiescent state!
+ rcu_task_trace_heavyweight_exit(); // After CT state update!
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & CT_RCU_WATCHING));
+}
+
+/*
+ * Enter an RCU extended quiescent state, which can be either the
+ * idle loop or adaptive-tickless usermode execution.
+ *
+ * We crowbar the ->nmi_nesting field to zero to allow for
+ * the possibility of usermode upcalls having messed up our count
+ * of interrupt nesting level during the prior busy period.
+ */
+static void noinstr ct_kernel_exit(bool user, int offset)
+{
+ struct context_tracking *ct = this_cpu_ptr(&context_tracking);
+
+ WARN_ON_ONCE(ct_nmi_nesting() != CT_NESTING_IRQ_NONIDLE);
+ WRITE_ONCE(ct->nmi_nesting, 0);
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+ ct_nesting() == 0);
+ if (ct_nesting() != 1) {
+ // RCU will still be watching, so just do accounting and leave.
+ ct->nesting--;
+ return;
+ }
+
+ instrumentation_begin();
+ lockdep_assert_irqs_disabled();
+ trace_rcu_watching(TPS("End"), ct_nesting(), 0, ct_rcu_watching());
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
+ rcu_preempt_deferred_qs(current);
+
+ // instrumentation for the noinstr ct_kernel_exit_state()
+ instrument_atomic_write(&ct->state, sizeof(ct->state));
+
+ instrumentation_end();
+ WRITE_ONCE(ct->nesting, 0); /* Avoid irq-access tearing. */
+ // RCU is watching here ...
+ ct_kernel_exit_state(offset);
+ // ... but is no longer watching here.
+ rcu_task_exit();
+}
+
+/*
+ * Exit an RCU extended quiescent state, which can be either the
+ * idle loop or adaptive-tickless usermode execution.
+ *
+ * We crowbar the ->nmi_nesting field to CT_NESTING_IRQ_NONIDLE to
+ * allow for the possibility of usermode upcalls messing up our count of
+ * interrupt nesting level during the busy period that is just now starting.
+ */
+static void noinstr ct_kernel_enter(bool user, int offset)
+{
+ struct context_tracking *ct = this_cpu_ptr(&context_tracking);
+ long oldval;
+
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !raw_irqs_disabled());
+ oldval = ct_nesting();
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0);
+ if (oldval) {
+ // RCU was already watching, so just do accounting and leave.
+ ct->nesting++;
+ return;
}
+ rcu_task_enter();
+ // RCU is not watching here ...
+ ct_kernel_enter_state(offset);
+ // ... but is watching here.
+ instrumentation_begin();
+
+ // instrumentation for the noinstr ct_kernel_enter_state()
+ instrument_atomic_write(&ct->state, sizeof(ct->state));
+
+ trace_rcu_watching(TPS("Start"), ct_nesting(), 1, ct_rcu_watching());
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
+ WRITE_ONCE(ct->nesting, 1);
+ WARN_ON_ONCE(ct_nmi_nesting());
+ WRITE_ONCE(ct->nmi_nesting, CT_NESTING_IRQ_NONIDLE);
+ instrumentation_end();
}
/**
- * context_tracking_enter - Inform the context tracking that the CPU is going
- * enter user or guest space mode.
+ * ct_nmi_exit - inform RCU of exit from NMI context
*
- * This function must be called right before we switch from the kernel
- * to user or guest space, when it's guaranteed the remaining kernel
- * instructions to execute won't use any RCU read side critical section
- * because this function sets RCU in extended quiescent state.
+ * If we are returning from the outermost NMI handler that interrupted an
+ * RCU-idle period, update ct->state and ct->nmi_nesting
+ * to let the RCU grace-period handling know that the CPU is back to
+ * being RCU-idle.
+ *
+ * If you add or remove a call to ct_nmi_exit(), be sure to test
+ * with CONFIG_RCU_EQS_DEBUG=y.
*/
-void context_tracking_enter(enum ctx_state state)
+void noinstr ct_nmi_exit(void)
{
- unsigned long flags;
+ struct context_tracking *ct = this_cpu_ptr(&context_tracking);
+ instrumentation_begin();
/*
- * Repeat the user_enter() check here because some archs may be calling
- * this from asm and if no CPU needs context tracking, they shouldn't
- * go further. Repeat the check here until they support the inline static
- * key check.
+ * Check for ->nmi_nesting underflow and bad CT state.
+ * (We are exiting an NMI handler, so RCU better be paying attention
+ * to us!)
*/
- if (!context_tracking_is_enabled())
- return;
+ WARN_ON_ONCE(ct_nmi_nesting() <= 0);
+ WARN_ON_ONCE(!rcu_is_watching_curr_cpu());
/*
- * Some contexts may involve an exception occuring in an irq,
- * leading to that nesting:
- * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
- * This would mess up the dyntick_nesting count though. And rcu_irq_*()
- * helpers are enough to protect RCU uses inside the exception. So
- * just return immediately if we detect we are in an IRQ.
+ * If the nesting level is not 1, the CPU wasn't RCU-idle, so
+ * leave it in non-RCU-idle state.
*/
- if (in_interrupt())
+ if (ct_nmi_nesting() != 1) {
+ trace_rcu_watching(TPS("--="), ct_nmi_nesting(), ct_nmi_nesting() - 2,
+ ct_rcu_watching());
+ WRITE_ONCE(ct->nmi_nesting, /* No store tearing. */
+ ct_nmi_nesting() - 2);
+ instrumentation_end();
return;
+ }
+
+ /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
+ trace_rcu_watching(TPS("Endirq"), ct_nmi_nesting(), 0, ct_rcu_watching());
+ WRITE_ONCE(ct->nmi_nesting, 0); /* Avoid store tearing. */
+
+ // instrumentation for the noinstr ct_kernel_exit_state()
+ instrument_atomic_write(&ct->state, sizeof(ct->state));
+ instrumentation_end();
+
+ // RCU is watching here ...
+ ct_kernel_exit_state(CT_RCU_WATCHING);
+ // ... but is no longer watching here.
+
+ if (!in_nmi())
+ rcu_task_exit();
+}
+
+/**
+ * ct_nmi_enter - inform RCU of entry to NMI context
+ *
+ * If the CPU was idle from RCU's viewpoint, update ct->state and
+ * ct->nmi_nesting to let the RCU grace-period handling know
+ * that the CPU is active. This implementation permits nested NMIs, as
+ * long as the nesting level does not overflow an int. (You will probably
+ * run out of stack space first.)
+ *
+ * If you add or remove a call to ct_nmi_enter(), be sure to test
+ * with CONFIG_RCU_EQS_DEBUG=y.
+ */
+void noinstr ct_nmi_enter(void)
+{
+ long incby = 2;
+ struct context_tracking *ct = this_cpu_ptr(&context_tracking);
+
+ /* Complain about underflow. */
+ WARN_ON_ONCE(ct_nmi_nesting() < 0);
+
+ /*
+ * If idle from RCU viewpoint, atomically increment CT state
+ * to mark non-idle and increment ->nmi_nesting by one.
+ * Otherwise, increment ->nmi_nesting by two. This means
+ * if ->nmi_nesting is equal to one, we are guaranteed
+ * to be in the outermost NMI handler that interrupted an RCU-idle
+ * period (observation due to Andy Lutomirski).
+ */
+ if (!rcu_is_watching_curr_cpu()) {
+
+ if (!in_nmi())
+ rcu_task_enter();
+
+ // RCU is not watching here ...
+ ct_kernel_enter_state(CT_RCU_WATCHING);
+ // ... but is watching here.
+
+ instrumentation_begin();
+ // instrumentation for the noinstr rcu_is_watching_curr_cpu()
+ instrument_atomic_read(&ct->state, sizeof(ct->state));
+ // instrumentation for the noinstr ct_kernel_enter_state()
+ instrument_atomic_write(&ct->state, sizeof(ct->state));
+
+ incby = 1;
+ } else if (!in_nmi()) {
+ instrumentation_begin();
+ rcu_irq_enter_check_tick();
+ } else {
+ instrumentation_begin();
+ }
+
+ trace_rcu_watching(incby == 1 ? TPS("Startirq") : TPS("++="),
+ ct_nmi_nesting(),
+ ct_nmi_nesting() + incby, ct_rcu_watching());
+ instrumentation_end();
+ WRITE_ONCE(ct->nmi_nesting, /* Prevent store tearing. */
+ ct_nmi_nesting() + incby);
+ barrier();
+}
+
+/**
+ * ct_idle_enter - inform RCU that current CPU is entering idle
+ *
+ * Enter idle mode, in other words, -leave- the mode in which RCU
+ * read-side critical sections can occur. (Though RCU read-side
+ * critical sections can occur in irq handlers in idle, a possibility
+ * handled by irq_enter() and irq_exit().)
+ *
+ * If you add or remove a call to ct_idle_enter(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
+ */
+void noinstr ct_idle_enter(void)
+{
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !raw_irqs_disabled());
+ ct_kernel_exit(false, CT_RCU_WATCHING + CT_STATE_IDLE);
+}
+EXPORT_SYMBOL_GPL(ct_idle_enter);
+
+/**
+ * ct_idle_exit - inform RCU that current CPU is leaving idle
+ *
+ * Exit idle mode, in other words, -enter- the mode in which RCU
+ * read-side critical sections can occur.
+ *
+ * If you add or remove a call to ct_idle_exit(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
+ */
+void noinstr ct_idle_exit(void)
+{
+ unsigned long flags;
+
+ raw_local_irq_save(flags);
+ ct_kernel_enter(false, CT_RCU_WATCHING - CT_STATE_IDLE);
+ raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(ct_idle_exit);
+
+/**
+ * ct_irq_enter - inform RCU that current CPU is entering irq away from idle
+ *
+ * Enter an interrupt handler, which might possibly result in exiting
+ * idle mode, in other words, entering the mode in which read-side critical
+ * sections can occur. The caller must have disabled interrupts.
+ *
+ * Note that the Linux kernel is fully capable of entering an interrupt
+ * handler that it never exits, for example when doing upcalls to user mode!
+ * This code assumes that the idle loop never does upcalls to user mode.
+ * If your architecture's idle loop does do upcalls to user mode (or does
+ * anything else that results in unbalanced calls to the irq_enter() and
+ * irq_exit() functions), RCU will give you what you deserve, good and hard.
+ * But very infrequently and irreproducibly.
+ *
+ * Use things like work queues to work around this limitation.
+ *
+ * You have been warned.
+ *
+ * If you add or remove a call to ct_irq_enter(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
+ */
+noinstr void ct_irq_enter(void)
+{
+ lockdep_assert_irqs_disabled();
+ ct_nmi_enter();
+}
+
+/**
+ * ct_irq_exit - inform RCU that current CPU is exiting irq towards idle
+ *
+ * Exit from an interrupt handler, which might possibly result in entering
+ * idle mode, in other words, leaving the mode in which read-side critical
+ * sections can occur. The caller must have disabled interrupts.
+ *
+ * This code assumes that the idle loop never does anything that might
+ * result in unbalanced calls to irq_enter() and irq_exit(). If your
+ * architecture's idle loop violates this assumption, RCU will give you what
+ * you deserve, good and hard. But very infrequently and irreproducibly.
+ *
+ * Use things like work queues to work around this limitation.
+ *
+ * You have been warned.
+ *
+ * If you add or remove a call to ct_irq_exit(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
+ */
+noinstr void ct_irq_exit(void)
+{
+ lockdep_assert_irqs_disabled();
+ ct_nmi_exit();
+}
+
+/*
+ * Wrapper for ct_irq_enter() where interrupts are enabled.
+ *
+ * If you add or remove a call to ct_irq_enter_irqson(), be sure to test
+ * with CONFIG_RCU_EQS_DEBUG=y.
+ */
+void ct_irq_enter_irqson(void)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ ct_irq_enter();
+ local_irq_restore(flags);
+}
+
+/*
+ * Wrapper for ct_irq_exit() where interrupts are enabled.
+ *
+ * If you add or remove a call to ct_irq_exit_irqson(), be sure to test
+ * with CONFIG_RCU_EQS_DEBUG=y.
+ */
+void ct_irq_exit_irqson(void)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ ct_irq_exit();
+ local_irq_restore(flags);
+}
+#else
+static __always_inline void ct_kernel_exit(bool user, int offset) { }
+static __always_inline void ct_kernel_enter(bool user, int offset) { }
+#endif /* #ifdef CONFIG_CONTEXT_TRACKING_IDLE */
+
+#ifdef CONFIG_CONTEXT_TRACKING_USER
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/context_tracking.h>
+
+DEFINE_STATIC_KEY_FALSE_RO(context_tracking_key);
+EXPORT_SYMBOL_GPL(context_tracking_key);
+
+static noinstr bool context_tracking_recursion_enter(void)
+{
+ int recursion;
+
+ recursion = __this_cpu_inc_return(context_tracking.recursion);
+ if (recursion == 1)
+ return true;
+
+ WARN_ONCE((recursion < 1), "Invalid context tracking recursion value %d\n", recursion);
+ __this_cpu_dec(context_tracking.recursion);
+
+ return false;
+}
+
+static __always_inline void context_tracking_recursion_exit(void)
+{
+ __this_cpu_dec(context_tracking.recursion);
+}
+
+/**
+ * __ct_user_enter - Inform the context tracking that the CPU is going
+ * to enter user or guest space mode.
+ *
+ * @state: userspace context-tracking state to enter.
+ *
+ * This function must be called right before we switch from the kernel
+ * to user or guest space, when it's guaranteed the remaining kernel
+ * instructions to execute won't use any RCU read side critical section
+ * because this function sets RCU in extended quiescent state.
+ */
+void noinstr __ct_user_enter(enum ctx_state state)
+{
+ struct context_tracking *ct = this_cpu_ptr(&context_tracking);
+ lockdep_assert_irqs_disabled();
/* Kernel threads aren't supposed to go to userspace */
WARN_ON_ONCE(!current->mm);
- local_irq_save(flags);
- if ( __this_cpu_read(context_tracking.state) != state) {
- if (__this_cpu_read(context_tracking.active)) {
+ if (!context_tracking_recursion_enter())
+ return;
+
+ if (__ct_state() != state) {
+ if (ct->active) {
/*
* At this stage, only low level arch entry code remains and
* then we'll run in userspace. We can assume there won't be
* any RCU read-side critical section until the next call to
- * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency
+ * user_exit() or ct_irq_enter(). Let's remove RCU's dependency
* on the tick.
*/
- if (state == CONTEXT_USER) {
+ if (state == CT_STATE_USER) {
+ instrumentation_begin();
trace_user_enter(0);
vtime_user_enter(current);
+ instrumentation_end();
+ }
+ /*
+ * Other than generic entry implementation, we may be past the last
+ * rescheduling opportunity in the entry code. Trigger a self IPI
+ * that will fire and reschedule once we resume in user/guest mode.
+ */
+ rcu_irq_work_resched();
+
+ /*
+ * Enter RCU idle mode right before resuming userspace. No use of RCU
+ * is permitted between this call and rcu_eqs_exit(). This way the
+ * CPU doesn't need to maintain the tick for RCU maintenance purposes
+ * when the CPU runs in userspace.
+ */
+ ct_kernel_exit(true, CT_RCU_WATCHING + state);
+
+ /*
+ * Special case if we only track user <-> kernel transitions for tickless
+ * cputime accounting but we don't support RCU extended quiescent state.
+ * In this we case we don't care about any concurrency/ordering.
+ */
+ if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE))
+ raw_atomic_set(&ct->state, state);
+ } else {
+ /*
+ * Even if context tracking is disabled on this CPU, because it's outside
+ * the full dynticks mask for example, we still have to keep track of the
+ * context transitions and states to prevent inconsistency on those of
+ * other CPUs.
+ * If a task triggers an exception in userspace, sleep on the exception
+ * handler and then migrate to another CPU, that new CPU must know where
+ * the exception returns by the time we call exception_exit().
+ * This information can only be provided by the previous CPU when it called
+ * exception_enter().
+ * OTOH we can spare the calls to vtime and RCU when context_tracking.active
+ * is false because we know that CPU is not tickless.
+ */
+ if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) {
+ /* Tracking for vtime only, no concurrent RCU EQS accounting */
+ raw_atomic_set(&ct->state, state);
+ } else {
+ /*
+ * Tracking for vtime and RCU EQS. Make sure we don't race
+ * with NMIs. OTOH we don't care about ordering here since
+ * RCU only requires CT_RCU_WATCHING increments to be fully
+ * ordered.
+ */
+ raw_atomic_add(state, &ct->state);
}
- rcu_user_enter();
}
- /*
- * Even if context tracking is disabled on this CPU, because it's outside
- * the full dynticks mask for example, we still have to keep track of the
- * context transitions and states to prevent inconsistency on those of
- * other CPUs.
- * If a task triggers an exception in userspace, sleep on the exception
- * handler and then migrate to another CPU, that new CPU must know where
- * the exception returns by the time we call exception_exit().
- * This information can only be provided by the previous CPU when it called
- * exception_enter().
- * OTOH we can spare the calls to vtime and RCU when context_tracking.active
- * is false because we know that CPU is not tickless.
- */
- __this_cpu_write(context_tracking.state, state);
}
+ context_tracking_recursion_exit();
+}
+EXPORT_SYMBOL_GPL(__ct_user_enter);
+
+/*
+ * OBSOLETE:
+ * This function should be noinstr but the below local_irq_restore() is
+ * unsafe because it involves illegal RCU uses through tracing and lockdep.
+ * This is unlikely to be fixed as this function is obsolete. The preferred
+ * way is to call __context_tracking_enter() through user_enter_irqoff()
+ * or context_tracking_guest_enter(). It should be the arch entry code
+ * responsibility to call into context tracking with IRQs disabled.
+ */
+void ct_user_enter(enum ctx_state state)
+{
+ unsigned long flags;
+
+ /*
+ * Some contexts may involve an exception occuring in an irq,
+ * leading to that nesting:
+ * ct_irq_enter() rcu_eqs_exit(true) rcu_eqs_enter(true) ct_irq_exit()
+ * This would mess up the dyntick_nesting count though. And rcu_irq_*()
+ * helpers are enough to protect RCU uses inside the exception. So
+ * just return immediately if we detect we are in an IRQ.
+ */
+ if (in_interrupt())
+ return;
+
+ local_irq_save(flags);
+ __ct_user_enter(state);
local_irq_restore(flags);
}
-NOKPROBE_SYMBOL(context_tracking_enter);
-EXPORT_SYMBOL_GPL(context_tracking_enter);
+NOKPROBE_SYMBOL(ct_user_enter);
+EXPORT_SYMBOL_GPL(ct_user_enter);
-void context_tracking_user_enter(void)
+/**
+ * user_enter_callable() - Unfortunate ASM callable version of user_enter() for
+ * archs that didn't manage to check the context tracking
+ * static key from low level code.
+ *
+ * This OBSOLETE function should be noinstr but it unsafely calls
+ * local_irq_restore(), involving illegal RCU uses through tracing and lockdep.
+ * This is unlikely to be fixed as this function is obsolete. The preferred
+ * way is to call user_enter_irqoff(). It should be the arch entry code
+ * responsibility to call into context tracking with IRQs disabled.
+ */
+void user_enter_callable(void)
{
- context_tracking_enter(CONTEXT_USER);
+ user_enter();
}
-NOKPROBE_SYMBOL(context_tracking_user_enter);
+NOKPROBE_SYMBOL(user_enter_callable);
/**
- * context_tracking_exit - Inform the context tracking that the CPU is
- * exiting user or guest mode and entering the kernel.
+ * __ct_user_exit - Inform the context tracking that the CPU is
+ * exiting user or guest mode and entering the kernel.
+ *
+ * @state: userspace context-tracking state being exited from.
*
* This function must be called after we entered the kernel from user or
* guest space before any use of RCU read side critical section. This
@@ -128,68 +606,126 @@ NOKPROBE_SYMBOL(context_tracking_user_enter);
* This call supports re-entrancy. This way it can be called from any exception
* handler without needing to know if we came from userspace or not.
*/
-void context_tracking_exit(enum ctx_state state)
+void noinstr __ct_user_exit(enum ctx_state state)
{
- unsigned long flags;
+ struct context_tracking *ct = this_cpu_ptr(&context_tracking);
- if (!context_tracking_is_enabled())
+ if (!context_tracking_recursion_enter())
return;
- if (in_interrupt())
- return;
-
- local_irq_save(flags);
- if (__this_cpu_read(context_tracking.state) == state) {
- if (__this_cpu_read(context_tracking.active)) {
+ if (__ct_state() == state) {
+ if (ct->active) {
/*
- * We are going to run code that may use RCU. Inform
- * RCU core about that (ie: we may need the tick again).
+ * Exit RCU idle mode while entering the kernel because it can
+ * run a RCU read side critical section anytime.
*/
- rcu_user_exit();
- if (state == CONTEXT_USER) {
+ ct_kernel_enter(true, CT_RCU_WATCHING - state);
+ if (state == CT_STATE_USER) {
+ instrumentation_begin();
vtime_user_exit(current);
trace_user_exit(0);
+ instrumentation_end();
+ }
+
+ /*
+ * Special case if we only track user <-> kernel transitions for tickless
+ * cputime accounting but we don't support RCU extended quiescent state.
+ * In this we case we don't care about any concurrency/ordering.
+ */
+ if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE))
+ raw_atomic_set(&ct->state, CT_STATE_KERNEL);
+
+ } else {
+ if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) {
+ /* Tracking for vtime only, no concurrent RCU EQS accounting */
+ raw_atomic_set(&ct->state, CT_STATE_KERNEL);
+ } else {
+ /*
+ * Tracking for vtime and RCU EQS. Make sure we don't race
+ * with NMIs. OTOH we don't care about ordering here since
+ * RCU only requires CT_RCU_WATCHING increments to be fully
+ * ordered.
+ */
+ raw_atomic_sub(state, &ct->state);
}
}
- __this_cpu_write(context_tracking.state, CONTEXT_KERNEL);
}
- local_irq_restore(flags);
+ context_tracking_recursion_exit();
}
-NOKPROBE_SYMBOL(context_tracking_exit);
-EXPORT_SYMBOL_GPL(context_tracking_exit);
+EXPORT_SYMBOL_GPL(__ct_user_exit);
-void context_tracking_user_exit(void)
+/*
+ * OBSOLETE:
+ * This function should be noinstr but the below local_irq_save() is
+ * unsafe because it involves illegal RCU uses through tracing and lockdep.
+ * This is unlikely to be fixed as this function is obsolete. The preferred
+ * way is to call __context_tracking_exit() through user_exit_irqoff()
+ * or context_tracking_guest_exit(). It should be the arch entry code
+ * responsibility to call into context tracking with IRQs disabled.
+ */
+void ct_user_exit(enum ctx_state state)
{
- context_tracking_exit(CONTEXT_USER);
+ unsigned long flags;
+
+ if (in_interrupt())
+ return;
+
+ local_irq_save(flags);
+ __ct_user_exit(state);
+ local_irq_restore(flags);
}
-NOKPROBE_SYMBOL(context_tracking_user_exit);
+NOKPROBE_SYMBOL(ct_user_exit);
+EXPORT_SYMBOL_GPL(ct_user_exit);
/**
- * __context_tracking_task_switch - context switch the syscall callbacks
- * @prev: the task that is being switched out
- * @next: the task that is being switched in
- *
- * The context tracking uses the syscall slow path to implement its user-kernel
- * boundaries probes on syscalls. This way it doesn't impact the syscall fast
- * path on CPUs that don't do context tracking.
+ * user_exit_callable() - Unfortunate ASM callable version of user_exit() for
+ * archs that didn't manage to check the context tracking
+ * static key from low level code.
*
- * But we need to clear the flag on the previous task because it may later
- * migrate to some CPU that doesn't do the context tracking. As such the TIF
- * flag may not be desired there.
+ * This OBSOLETE function should be noinstr but it unsafely calls local_irq_save(),
+ * involving illegal RCU uses through tracing and lockdep. This is unlikely
+ * to be fixed as this function is obsolete. The preferred way is to call
+ * user_exit_irqoff(). It should be the arch entry code responsibility to
+ * call into context tracking with IRQs disabled.
*/
-void __context_tracking_task_switch(struct task_struct *prev,
- struct task_struct *next)
+void user_exit_callable(void)
{
- clear_tsk_thread_flag(prev, TIF_NOHZ);
- set_tsk_thread_flag(next, TIF_NOHZ);
+ user_exit();
}
+NOKPROBE_SYMBOL(user_exit_callable);
-#ifdef CONFIG_CONTEXT_TRACKING_FORCE
+void __init ct_cpu_track_user(int cpu)
+{
+ static __initdata bool initialized = false;
+
+ if (!per_cpu(context_tracking.active, cpu)) {
+ per_cpu(context_tracking.active, cpu) = true;
+ static_branch_inc(&context_tracking_key);
+ }
+
+ if (initialized)
+ return;
+
+#ifdef CONFIG_HAVE_TIF_NOHZ
+ /*
+ * Set TIF_NOHZ to init/0 and let it propagate to all tasks through fork
+ * This assumes that init is the only task at this early boot stage.
+ */
+ set_tsk_thread_flag(&init_task, TIF_NOHZ);
+#endif
+ WARN_ON_ONCE(!tasklist_empty());
+
+ initialized = true;
+}
+
+#ifdef CONFIG_CONTEXT_TRACKING_USER_FORCE
void __init context_tracking_init(void)
{
int cpu;
for_each_possible_cpu(cpu)
- context_tracking_cpu_set(cpu);
+ ct_cpu_track_user(cpu);
}
#endif
+
+#endif /* #ifdef CONFIG_CONTEXT_TRACKING_USER */
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 94bbe4695232..b674fdf96208 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -3,15 +3,21 @@
*
* This code is licenced under the GPL.
*/
+#include <linux/sched/mm.h>
#include <linux/proc_fs.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/notifier.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/hotplug.h>
+#include <linux/sched/isolation.h>
+#include <linux/sched/task.h>
+#include <linux/sched/smt.h>
#include <linux/unistd.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/rcupdate.h>
+#include <linux/delay.h>
#include <linux/export.h>
#include <linux/bug.h>
#include <linux/kthread.h>
@@ -21,163 +27,531 @@
#include <linux/suspend.h>
#include <linux/lockdep.h>
#include <linux/tick.h>
+#include <linux/irq.h>
+#include <linux/nmi.h>
+#include <linux/smpboot.h>
+#include <linux/relay.h>
+#include <linux/slab.h>
+#include <linux/scs.h>
+#include <linux/percpu-rwsem.h>
+#include <linux/cpuset.h>
+#include <linux/random.h>
+#include <linux/cc_platform.h>
+#include <linux/parser.h>
+
#include <trace/events/power.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/cpuhp.h>
#include "smpboot.h"
+/**
+ * struct cpuhp_cpu_state - Per cpu hotplug state storage
+ * @state: The current cpu state
+ * @target: The target state
+ * @fail: Current CPU hotplug callback state
+ * @thread: Pointer to the hotplug thread
+ * @should_run: Thread should execute
+ * @rollback: Perform a rollback
+ * @single: Single callback invocation
+ * @bringup: Single callback bringup or teardown selector
+ * @node: Remote CPU node; for multi-instance, do a
+ * single entry callback for install/remove
+ * @last: For multi-instance rollback, remember how far we got
+ * @cb_state: The state for a single callback (install/uninstall)
+ * @result: Result of the operation
+ * @ap_sync_state: State for AP synchronization
+ * @done_up: Signal completion to the issuer of the task for cpu-up
+ * @done_down: Signal completion to the issuer of the task for cpu-down
+ */
+struct cpuhp_cpu_state {
+ enum cpuhp_state state;
+ enum cpuhp_state target;
+ enum cpuhp_state fail;
+#ifdef CONFIG_SMP
+ struct task_struct *thread;
+ bool should_run;
+ bool rollback;
+ bool single;
+ bool bringup;
+ struct hlist_node *node;
+ struct hlist_node *last;
+ enum cpuhp_state cb_state;
+ int result;
+ atomic_t ap_sync_state;
+ struct completion done_up;
+ struct completion done_down;
+#endif
+};
+
+static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state) = {
+ .fail = CPUHP_INVALID,
+};
+
+#ifdef CONFIG_SMP
+cpumask_t cpus_booted_once_mask;
+#endif
+
+#if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP)
+static struct lockdep_map cpuhp_state_up_map =
+ STATIC_LOCKDEP_MAP_INIT("cpuhp_state-up", &cpuhp_state_up_map);
+static struct lockdep_map cpuhp_state_down_map =
+ STATIC_LOCKDEP_MAP_INIT("cpuhp_state-down", &cpuhp_state_down_map);
+
+
+static inline void cpuhp_lock_acquire(bool bringup)
+{
+ lock_map_acquire(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map);
+}
+
+static inline void cpuhp_lock_release(bool bringup)
+{
+ lock_map_release(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map);
+}
+#else
+
+static inline void cpuhp_lock_acquire(bool bringup) { }
+static inline void cpuhp_lock_release(bool bringup) { }
+
+#endif
+
+/**
+ * struct cpuhp_step - Hotplug state machine step
+ * @name: Name of the step
+ * @startup: Startup function of the step
+ * @teardown: Teardown function of the step
+ * @cant_stop: Bringup/teardown can't be stopped at this step
+ * @multi_instance: State has multiple instances which get added afterwards
+ */
+struct cpuhp_step {
+ const char *name;
+ union {
+ int (*single)(unsigned int cpu);
+ int (*multi)(unsigned int cpu,
+ struct hlist_node *node);
+ } startup;
+ union {
+ int (*single)(unsigned int cpu);
+ int (*multi)(unsigned int cpu,
+ struct hlist_node *node);
+ } teardown;
+ /* private: */
+ struct hlist_head list;
+ /* public: */
+ bool cant_stop;
+ bool multi_instance;
+};
+
+static DEFINE_MUTEX(cpuhp_state_mutex);
+static struct cpuhp_step cpuhp_hp_states[];
+
+static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state)
+{
+ return cpuhp_hp_states + state;
+}
+
+static bool cpuhp_step_empty(bool bringup, struct cpuhp_step *step)
+{
+ return bringup ? !step->startup.single : !step->teardown.single;
+}
+
+/**
+ * cpuhp_invoke_callback - Invoke the callbacks for a given state
+ * @cpu: The cpu for which the callback should be invoked
+ * @state: The state to do callbacks for
+ * @bringup: True if the bringup callback should be invoked
+ * @node: For multi-instance, do a single entry callback for install/remove
+ * @lastp: For multi-instance rollback, remember how far we got
+ *
+ * Called from cpu hotplug and from the state register machinery.
+ *
+ * Return: %0 on success or a negative errno code
+ */
+static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
+ bool bringup, struct hlist_node *node,
+ struct hlist_node **lastp)
+{
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+ struct cpuhp_step *step = cpuhp_get_step(state);
+ int (*cbm)(unsigned int cpu, struct hlist_node *node);
+ int (*cb)(unsigned int cpu);
+ int ret, cnt;
+
+ if (st->fail == state) {
+ st->fail = CPUHP_INVALID;
+ return -EAGAIN;
+ }
+
+ if (cpuhp_step_empty(bringup, step)) {
+ WARN_ON_ONCE(1);
+ return 0;
+ }
+
+ if (!step->multi_instance) {
+ WARN_ON_ONCE(lastp && *lastp);
+ cb = bringup ? step->startup.single : step->teardown.single;
+
+ trace_cpuhp_enter(cpu, st->target, state, cb);
+ ret = cb(cpu);
+ trace_cpuhp_exit(cpu, st->state, state, ret);
+ return ret;
+ }
+ cbm = bringup ? step->startup.multi : step->teardown.multi;
+
+ /* Single invocation for instance add/remove */
+ if (node) {
+ WARN_ON_ONCE(lastp && *lastp);
+ trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
+ ret = cbm(cpu, node);
+ trace_cpuhp_exit(cpu, st->state, state, ret);
+ return ret;
+ }
+
+ /* State transition. Invoke on all instances */
+ cnt = 0;
+ hlist_for_each(node, &step->list) {
+ if (lastp && node == *lastp)
+ break;
+
+ trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
+ ret = cbm(cpu, node);
+ trace_cpuhp_exit(cpu, st->state, state, ret);
+ if (ret) {
+ if (!lastp)
+ goto err;
+
+ *lastp = node;
+ return ret;
+ }
+ cnt++;
+ }
+ if (lastp)
+ *lastp = NULL;
+ return 0;
+err:
+ /* Rollback the instances if one failed */
+ cbm = !bringup ? step->startup.multi : step->teardown.multi;
+ if (!cbm)
+ return ret;
+
+ hlist_for_each(node, &step->list) {
+ if (!cnt--)
+ break;
+
+ trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
+ ret = cbm(cpu, node);
+ trace_cpuhp_exit(cpu, st->state, state, ret);
+ /*
+ * Rollback must not fail,
+ */
+ WARN_ON_ONCE(ret);
+ }
+ return ret;
+}
+
#ifdef CONFIG_SMP
+static bool cpuhp_is_ap_state(enum cpuhp_state state)
+{
+ /*
+ * The extra check for CPUHP_TEARDOWN_CPU is only for documentation
+ * purposes as that state is handled explicitly in cpu_down.
+ */
+ return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU;
+}
+
+static inline void wait_for_ap_thread(struct cpuhp_cpu_state *st, bool bringup)
+{
+ struct completion *done = bringup ? &st->done_up : &st->done_down;
+ wait_for_completion(done);
+}
+
+static inline void complete_ap_thread(struct cpuhp_cpu_state *st, bool bringup)
+{
+ struct completion *done = bringup ? &st->done_up : &st->done_down;
+ complete(done);
+}
+
+/*
+ * The former STARTING/DYING states, ran with IRQs disabled and must not fail.
+ */
+static bool cpuhp_is_atomic_state(enum cpuhp_state state)
+{
+ return CPUHP_AP_IDLE_DEAD <= state && state < CPUHP_AP_ONLINE;
+}
+
+/* Synchronization state management */
+enum cpuhp_sync_state {
+ SYNC_STATE_DEAD,
+ SYNC_STATE_KICKED,
+ SYNC_STATE_SHOULD_DIE,
+ SYNC_STATE_ALIVE,
+ SYNC_STATE_SHOULD_ONLINE,
+ SYNC_STATE_ONLINE,
+};
+
+#ifdef CONFIG_HOTPLUG_CORE_SYNC
+/**
+ * cpuhp_ap_update_sync_state - Update synchronization state during bringup/teardown
+ * @state: The synchronization state to set
+ *
+ * No synchronization point. Just update of the synchronization state, but implies
+ * a full barrier so that the AP changes are visible before the control CPU proceeds.
+ */
+static inline void cpuhp_ap_update_sync_state(enum cpuhp_sync_state state)
+{
+ atomic_t *st = this_cpu_ptr(&cpuhp_state.ap_sync_state);
+
+ (void)atomic_xchg(st, state);
+}
+
+void __weak arch_cpuhp_sync_state_poll(void) { cpu_relax(); }
+
+static bool cpuhp_wait_for_sync_state(unsigned int cpu, enum cpuhp_sync_state state,
+ enum cpuhp_sync_state next_state)
+{
+ atomic_t *st = per_cpu_ptr(&cpuhp_state.ap_sync_state, cpu);
+ ktime_t now, end, start = ktime_get();
+ int sync;
+
+ end = start + 10ULL * NSEC_PER_SEC;
+
+ sync = atomic_read(st);
+ while (1) {
+ if (sync == state) {
+ if (!atomic_try_cmpxchg(st, &sync, next_state))
+ continue;
+ return true;
+ }
+
+ now = ktime_get();
+ if (now > end) {
+ /* Timeout. Leave the state unchanged */
+ return false;
+ } else if (now - start < NSEC_PER_MSEC) {
+ /* Poll for one millisecond */
+ arch_cpuhp_sync_state_poll();
+ } else {
+ usleep_range(USEC_PER_MSEC, 2 * USEC_PER_MSEC);
+ }
+ sync = atomic_read(st);
+ }
+ return true;
+}
+#else /* CONFIG_HOTPLUG_CORE_SYNC */
+static inline void cpuhp_ap_update_sync_state(enum cpuhp_sync_state state) { }
+#endif /* !CONFIG_HOTPLUG_CORE_SYNC */
+
+#ifdef CONFIG_HOTPLUG_CORE_SYNC_DEAD
+/**
+ * cpuhp_ap_report_dead - Update synchronization state to DEAD
+ *
+ * No synchronization point. Just update of the synchronization state.
+ */
+void cpuhp_ap_report_dead(void)
+{
+ cpuhp_ap_update_sync_state(SYNC_STATE_DEAD);
+}
+
+void __weak arch_cpuhp_cleanup_dead_cpu(unsigned int cpu) { }
+
+/*
+ * Late CPU shutdown synchronization point. Cannot use cpuhp_state::done_down
+ * because the AP cannot issue complete() at this stage.
+ */
+static void cpuhp_bp_sync_dead(unsigned int cpu)
+{
+ atomic_t *st = per_cpu_ptr(&cpuhp_state.ap_sync_state, cpu);
+ int sync = atomic_read(st);
+
+ do {
+ /* CPU can have reported dead already. Don't overwrite that! */
+ if (sync == SYNC_STATE_DEAD)
+ break;
+ } while (!atomic_try_cmpxchg(st, &sync, SYNC_STATE_SHOULD_DIE));
+
+ if (cpuhp_wait_for_sync_state(cpu, SYNC_STATE_DEAD, SYNC_STATE_DEAD)) {
+ /* CPU reached dead state. Invoke the cleanup function */
+ arch_cpuhp_cleanup_dead_cpu(cpu);
+ return;
+ }
+
+ /* No further action possible. Emit message and give up. */
+ pr_err("CPU%u failed to report dead state\n", cpu);
+}
+#else /* CONFIG_HOTPLUG_CORE_SYNC_DEAD */
+static inline void cpuhp_bp_sync_dead(unsigned int cpu) { }
+#endif /* !CONFIG_HOTPLUG_CORE_SYNC_DEAD */
+
+#ifdef CONFIG_HOTPLUG_CORE_SYNC_FULL
+/**
+ * cpuhp_ap_sync_alive - Synchronize AP with the control CPU once it is alive
+ *
+ * Updates the AP synchronization state to SYNC_STATE_ALIVE and waits
+ * for the BP to release it.
+ */
+void cpuhp_ap_sync_alive(void)
+{
+ atomic_t *st = this_cpu_ptr(&cpuhp_state.ap_sync_state);
+
+ cpuhp_ap_update_sync_state(SYNC_STATE_ALIVE);
+
+ /* Wait for the control CPU to release it. */
+ while (atomic_read(st) != SYNC_STATE_SHOULD_ONLINE)
+ cpu_relax();
+}
+
+static bool cpuhp_can_boot_ap(unsigned int cpu)
+{
+ atomic_t *st = per_cpu_ptr(&cpuhp_state.ap_sync_state, cpu);
+ int sync = atomic_read(st);
+
+again:
+ switch (sync) {
+ case SYNC_STATE_DEAD:
+ /* CPU is properly dead */
+ break;
+ case SYNC_STATE_KICKED:
+ /* CPU did not come up in previous attempt */
+ break;
+ case SYNC_STATE_ALIVE:
+ /* CPU is stuck cpuhp_ap_sync_alive(). */
+ break;
+ default:
+ /* CPU failed to report online or dead and is in limbo state. */
+ return false;
+ }
+
+ /* Prepare for booting */
+ if (!atomic_try_cmpxchg(st, &sync, SYNC_STATE_KICKED))
+ goto again;
+
+ return true;
+}
+
+void __weak arch_cpuhp_cleanup_kick_cpu(unsigned int cpu) { }
+
+/*
+ * Early CPU bringup synchronization point. Cannot use cpuhp_state::done_up
+ * because the AP cannot issue complete() so early in the bringup.
+ */
+static int cpuhp_bp_sync_alive(unsigned int cpu)
+{
+ int ret = 0;
+
+ if (!IS_ENABLED(CONFIG_HOTPLUG_CORE_SYNC_FULL))
+ return 0;
+
+ if (!cpuhp_wait_for_sync_state(cpu, SYNC_STATE_ALIVE, SYNC_STATE_SHOULD_ONLINE)) {
+ pr_err("CPU%u failed to report alive state\n", cpu);
+ ret = -EIO;
+ }
+
+ /* Let the architecture cleanup the kick alive mechanics. */
+ arch_cpuhp_cleanup_kick_cpu(cpu);
+ return ret;
+}
+#else /* CONFIG_HOTPLUG_CORE_SYNC_FULL */
+static inline int cpuhp_bp_sync_alive(unsigned int cpu) { return 0; }
+static inline bool cpuhp_can_boot_ap(unsigned int cpu) { return true; }
+#endif /* !CONFIG_HOTPLUG_CORE_SYNC_FULL */
+
/* Serializes the updates to cpu_online_mask, cpu_present_mask */
static DEFINE_MUTEX(cpu_add_remove_lock);
+bool cpuhp_tasks_frozen;
+EXPORT_SYMBOL_GPL(cpuhp_tasks_frozen);
/*
* The following two APIs (cpu_maps_update_begin/done) must be used when
* attempting to serialize the updates to cpu_online_mask & cpu_present_mask.
- * The APIs cpu_notifier_register_begin/done() must be used to protect CPU
- * hotplug callback (un)registration performed using __register_cpu_notifier()
- * or __unregister_cpu_notifier().
*/
void cpu_maps_update_begin(void)
{
mutex_lock(&cpu_add_remove_lock);
}
-EXPORT_SYMBOL(cpu_notifier_register_begin);
void cpu_maps_update_done(void)
{
mutex_unlock(&cpu_add_remove_lock);
}
-EXPORT_SYMBOL(cpu_notifier_register_done);
-static RAW_NOTIFIER_HEAD(cpu_chain);
-
-/* If set, cpu_up and cpu_down will return -EBUSY and do nothing.
+/*
+ * If set, cpu_up and cpu_down will return -EBUSY and do nothing.
* Should always be manipulated under cpu_add_remove_lock
*/
static int cpu_hotplug_disabled;
#ifdef CONFIG_HOTPLUG_CPU
-static struct {
- struct task_struct *active_writer;
- /* wait queue to wake up the active_writer */
- wait_queue_head_t wq;
- /* verifies that no writer will get active while readers are active */
- struct mutex lock;
- /*
- * Also blocks the new readers during
- * an ongoing cpu hotplug operation.
- */
- atomic_t refcount;
+DEFINE_STATIC_PERCPU_RWSEM(cpu_hotplug_lock);
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- struct lockdep_map dep_map;
-#endif
-} cpu_hotplug = {
- .active_writer = NULL,
- .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq),
- .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- .dep_map = {.name = "cpu_hotplug.lock" },
-#endif
-};
+static bool cpu_hotplug_offline_disabled __ro_after_init;
-/* Lockdep annotations for get/put_online_cpus() and cpu_hotplug_begin/end() */
-#define cpuhp_lock_acquire_read() lock_map_acquire_read(&cpu_hotplug.dep_map)
-#define cpuhp_lock_acquire_tryread() \
- lock_map_acquire_tryread(&cpu_hotplug.dep_map)
-#define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map)
-#define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map)
+void cpus_read_lock(void)
+{
+ percpu_down_read(&cpu_hotplug_lock);
+}
+EXPORT_SYMBOL_GPL(cpus_read_lock);
+int cpus_read_trylock(void)
+{
+ return percpu_down_read_trylock(&cpu_hotplug_lock);
+}
+EXPORT_SYMBOL_GPL(cpus_read_trylock);
-void get_online_cpus(void)
+void cpus_read_unlock(void)
{
- might_sleep();
- if (cpu_hotplug.active_writer == current)
- return;
- cpuhp_lock_acquire_read();
- mutex_lock(&cpu_hotplug.lock);
- atomic_inc(&cpu_hotplug.refcount);
- mutex_unlock(&cpu_hotplug.lock);
+ percpu_up_read(&cpu_hotplug_lock);
}
-EXPORT_SYMBOL_GPL(get_online_cpus);
+EXPORT_SYMBOL_GPL(cpus_read_unlock);
-bool try_get_online_cpus(void)
+void cpus_write_lock(void)
{
- if (cpu_hotplug.active_writer == current)
- return true;
- if (!mutex_trylock(&cpu_hotplug.lock))
- return false;
- cpuhp_lock_acquire_tryread();
- atomic_inc(&cpu_hotplug.refcount);
- mutex_unlock(&cpu_hotplug.lock);
- return true;
+ percpu_down_write(&cpu_hotplug_lock);
}
-EXPORT_SYMBOL_GPL(try_get_online_cpus);
-void put_online_cpus(void)
+void cpus_write_unlock(void)
{
- int refcount;
+ percpu_up_write(&cpu_hotplug_lock);
+}
- if (cpu_hotplug.active_writer == current)
+void lockdep_assert_cpus_held(void)
+{
+ /*
+ * We can't have hotplug operations before userspace starts running,
+ * and some init codepaths will knowingly not take the hotplug lock.
+ * This is all valid, so mute lockdep until it makes sense to report
+ * unheld locks.
+ */
+ if (system_state < SYSTEM_RUNNING)
return;
- refcount = atomic_dec_return(&cpu_hotplug.refcount);
- if (WARN_ON(refcount < 0)) /* try to fix things up */
- atomic_inc(&cpu_hotplug.refcount);
-
- if (refcount <= 0 && waitqueue_active(&cpu_hotplug.wq))
- wake_up(&cpu_hotplug.wq);
-
- cpuhp_lock_release();
-
+ percpu_rwsem_assert_held(&cpu_hotplug_lock);
}
-EXPORT_SYMBOL_GPL(put_online_cpus);
+EXPORT_SYMBOL_GPL(lockdep_assert_cpus_held);
-/*
- * This ensures that the hotplug operation can begin only when the
- * refcount goes to zero.
- *
- * Note that during a cpu-hotplug operation, the new readers, if any,
- * will be blocked by the cpu_hotplug.lock
- *
- * Since cpu_hotplug_begin() is always called after invoking
- * cpu_maps_update_begin(), we can be sure that only one writer is active.
- *
- * Note that theoretically, there is a possibility of a livelock:
- * - Refcount goes to zero, last reader wakes up the sleeping
- * writer.
- * - Last reader unlocks the cpu_hotplug.lock.
- * - A new reader arrives at this moment, bumps up the refcount.
- * - The writer acquires the cpu_hotplug.lock finds the refcount
- * non zero and goes to sleep again.
- *
- * However, this is very difficult to achieve in practice since
- * get_online_cpus() not an api which is called all that often.
- *
- */
-void cpu_hotplug_begin(void)
+#ifdef CONFIG_LOCKDEP
+int lockdep_is_cpus_held(void)
{
- DEFINE_WAIT(wait);
+ return percpu_rwsem_is_held(&cpu_hotplug_lock);
+}
+#endif
- cpu_hotplug.active_writer = current;
- cpuhp_lock_acquire();
+static void lockdep_acquire_cpus_lock(void)
+{
+ rwsem_acquire(&cpu_hotplug_lock.dep_map, 0, 0, _THIS_IP_);
+}
- for (;;) {
- mutex_lock(&cpu_hotplug.lock);
- prepare_to_wait(&cpu_hotplug.wq, &wait, TASK_UNINTERRUPTIBLE);
- if (likely(!atomic_read(&cpu_hotplug.refcount)))
- break;
- mutex_unlock(&cpu_hotplug.lock);
- schedule();
- }
- finish_wait(&cpu_hotplug.wq, &wait);
+static void lockdep_release_cpus_lock(void)
+{
+ rwsem_release(&cpu_hotplug_lock.dep_map, _THIS_IP_);
}
-void cpu_hotplug_done(void)
+/* Declare CPU offlining not supported */
+void cpu_hotplug_disable_offlining(void)
{
- cpu_hotplug.active_writer = NULL;
- mutex_unlock(&cpu_hotplug.lock);
- cpuhp_lock_release();
+ cpu_maps_update_begin();
+ cpu_hotplug_offline_disabled = true;
+ cpu_maps_update_done();
}
/*
@@ -190,72 +564,667 @@ void cpu_hotplug_done(void)
void cpu_hotplug_disable(void)
{
cpu_maps_update_begin();
- cpu_hotplug_disabled = 1;
+ cpu_hotplug_disabled++;
cpu_maps_update_done();
}
+EXPORT_SYMBOL_GPL(cpu_hotplug_disable);
+
+static void __cpu_hotplug_enable(void)
+{
+ if (WARN_ONCE(!cpu_hotplug_disabled, "Unbalanced cpu hotplug enable\n"))
+ return;
+ cpu_hotplug_disabled--;
+}
void cpu_hotplug_enable(void)
{
cpu_maps_update_begin();
- cpu_hotplug_disabled = 0;
+ __cpu_hotplug_enable();
cpu_maps_update_done();
}
+EXPORT_SYMBOL_GPL(cpu_hotplug_enable);
+
+#else
+
+static void lockdep_acquire_cpus_lock(void)
+{
+}
+
+static void lockdep_release_cpus_lock(void)
+{
+}
#endif /* CONFIG_HOTPLUG_CPU */
-/* Need to know about CPUs going up/down? */
-int __ref register_cpu_notifier(struct notifier_block *nb)
+/*
+ * Architectures that need SMT-specific errata handling during SMT hotplug
+ * should override this.
+ */
+void __weak arch_smt_update(void) { }
+
+#ifdef CONFIG_HOTPLUG_SMT
+
+enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED;
+static unsigned int cpu_smt_max_threads __ro_after_init;
+unsigned int cpu_smt_num_threads __read_mostly = UINT_MAX;
+
+void __init cpu_smt_disable(bool force)
+{
+ if (!cpu_smt_possible())
+ return;
+
+ if (force) {
+ pr_info("SMT: Force disabled\n");
+ cpu_smt_control = CPU_SMT_FORCE_DISABLED;
+ } else {
+ pr_info("SMT: disabled\n");
+ cpu_smt_control = CPU_SMT_DISABLED;
+ }
+ cpu_smt_num_threads = 1;
+}
+
+/*
+ * The decision whether SMT is supported can only be done after the full
+ * CPU identification. Called from architecture code.
+ */
+void __init cpu_smt_set_num_threads(unsigned int num_threads,
+ unsigned int max_threads)
+{
+ WARN_ON(!num_threads || (num_threads > max_threads));
+
+ if (max_threads == 1)
+ cpu_smt_control = CPU_SMT_NOT_SUPPORTED;
+
+ cpu_smt_max_threads = max_threads;
+
+ /*
+ * If SMT has been disabled via the kernel command line or SMT is
+ * not supported, set cpu_smt_num_threads to 1 for consistency.
+ * If enabled, take the architecture requested number of threads
+ * to bring up into account.
+ */
+ if (cpu_smt_control != CPU_SMT_ENABLED)
+ cpu_smt_num_threads = 1;
+ else if (num_threads < cpu_smt_num_threads)
+ cpu_smt_num_threads = num_threads;
+}
+
+static int __init smt_cmdline_disable(char *str)
+{
+ cpu_smt_disable(str && !strcmp(str, "force"));
+ return 0;
+}
+early_param("nosmt", smt_cmdline_disable);
+
+/*
+ * For Archicture supporting partial SMT states check if the thread is allowed.
+ * Otherwise this has already been checked through cpu_smt_max_threads when
+ * setting the SMT level.
+ */
+static inline bool cpu_smt_thread_allowed(unsigned int cpu)
+{
+#ifdef CONFIG_SMT_NUM_THREADS_DYNAMIC
+ return topology_smt_thread_allowed(cpu);
+#else
+ return true;
+#endif
+}
+
+static inline bool cpu_bootable(unsigned int cpu)
{
+ if (cpu_smt_control == CPU_SMT_ENABLED && cpu_smt_thread_allowed(cpu))
+ return true;
+
+ /* All CPUs are bootable if controls are not configured */
+ if (cpu_smt_control == CPU_SMT_NOT_IMPLEMENTED)
+ return true;
+
+ /* All CPUs are bootable if CPU is not SMT capable */
+ if (cpu_smt_control == CPU_SMT_NOT_SUPPORTED)
+ return true;
+
+ if (topology_is_primary_thread(cpu))
+ return true;
+
+ /*
+ * On x86 it's required to boot all logical CPUs at least once so
+ * that the init code can get a chance to set CR4.MCE on each
+ * CPU. Otherwise, a broadcasted MCE observing CR4.MCE=0b on any
+ * core will shutdown the machine.
+ */
+ return !cpumask_test_cpu(cpu, &cpus_booted_once_mask);
+}
+
+/* Returns true if SMT is supported and not forcefully (irreversibly) disabled */
+bool cpu_smt_possible(void)
+{
+ return cpu_smt_control != CPU_SMT_FORCE_DISABLED &&
+ cpu_smt_control != CPU_SMT_NOT_SUPPORTED;
+}
+EXPORT_SYMBOL_GPL(cpu_smt_possible);
+
+#else
+static inline bool cpu_bootable(unsigned int cpu) { return true; }
+#endif
+
+static inline enum cpuhp_state
+cpuhp_set_state(int cpu, struct cpuhp_cpu_state *st, enum cpuhp_state target)
+{
+ enum cpuhp_state prev_state = st->state;
+ bool bringup = st->state < target;
+
+ st->rollback = false;
+ st->last = NULL;
+
+ st->target = target;
+ st->single = false;
+ st->bringup = bringup;
+ if (cpu_dying(cpu) != !bringup)
+ set_cpu_dying(cpu, !bringup);
+
+ return prev_state;
+}
+
+static inline void
+cpuhp_reset_state(int cpu, struct cpuhp_cpu_state *st,
+ enum cpuhp_state prev_state)
+{
+ bool bringup = !st->bringup;
+
+ st->target = prev_state;
+
+ /*
+ * Already rolling back. No need invert the bringup value or to change
+ * the current state.
+ */
+ if (st->rollback)
+ return;
+
+ st->rollback = true;
+
+ /*
+ * If we have st->last we need to undo partial multi_instance of this
+ * state first. Otherwise start undo at the previous state.
+ */
+ if (!st->last) {
+ if (st->bringup)
+ st->state--;
+ else
+ st->state++;
+ }
+
+ st->bringup = bringup;
+ if (cpu_dying(cpu) != !bringup)
+ set_cpu_dying(cpu, !bringup);
+}
+
+/* Regular hotplug invocation of the AP hotplug thread */
+static void __cpuhp_kick_ap(struct cpuhp_cpu_state *st)
+{
+ if (!st->single && st->state == st->target)
+ return;
+
+ st->result = 0;
+ /*
+ * Make sure the above stores are visible before should_run becomes
+ * true. Paired with the mb() above in cpuhp_thread_fun()
+ */
+ smp_mb();
+ st->should_run = true;
+ wake_up_process(st->thread);
+ wait_for_ap_thread(st, st->bringup);
+}
+
+static int cpuhp_kick_ap(int cpu, struct cpuhp_cpu_state *st,
+ enum cpuhp_state target)
+{
+ enum cpuhp_state prev_state;
int ret;
- cpu_maps_update_begin();
- ret = raw_notifier_chain_register(&cpu_chain, nb);
- cpu_maps_update_done();
+
+ prev_state = cpuhp_set_state(cpu, st, target);
+ __cpuhp_kick_ap(st);
+ if ((ret = st->result)) {
+ cpuhp_reset_state(cpu, st, prev_state);
+ __cpuhp_kick_ap(st);
+ }
+
return ret;
}
-int __ref __register_cpu_notifier(struct notifier_block *nb)
+static int bringup_wait_for_ap_online(unsigned int cpu)
+{
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+
+ /* Wait for the CPU to reach CPUHP_AP_ONLINE_IDLE */
+ wait_for_ap_thread(st, true);
+ if (WARN_ON_ONCE((!cpu_online(cpu))))
+ return -ECANCELED;
+
+ /* Unpark the hotplug thread of the target cpu */
+ kthread_unpark(st->thread);
+
+ /*
+ * SMT soft disabling on X86 requires to bring the CPU out of the
+ * BIOS 'wait for SIPI' state in order to set the CR4.MCE bit. The
+ * CPU marked itself as booted_once in notify_cpu_starting() so the
+ * cpu_bootable() check will now return false if this is not the
+ * primary sibling.
+ */
+ if (!cpu_bootable(cpu))
+ return -ECANCELED;
+ return 0;
+}
+
+#ifdef CONFIG_HOTPLUG_SPLIT_STARTUP
+static int cpuhp_kick_ap_alive(unsigned int cpu)
{
- return raw_notifier_chain_register(&cpu_chain, nb);
+ if (!cpuhp_can_boot_ap(cpu))
+ return -EAGAIN;
+
+ return arch_cpuhp_kick_ap_alive(cpu, idle_thread_get(cpu));
}
-static int __cpu_notify(unsigned long val, void *v, int nr_to_call,
- int *nr_calls)
+static int cpuhp_bringup_ap(unsigned int cpu)
{
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
int ret;
- ret = __raw_notifier_call_chain(&cpu_chain, val, v, nr_to_call,
- nr_calls);
+ /*
+ * Some architectures have to walk the irq descriptors to
+ * setup the vector space for the cpu which comes online.
+ * Prevent irq alloc/free across the bringup.
+ */
+ irq_lock_sparse();
- return notifier_to_errno(ret);
+ ret = cpuhp_bp_sync_alive(cpu);
+ if (ret)
+ goto out_unlock;
+
+ ret = bringup_wait_for_ap_online(cpu);
+ if (ret)
+ goto out_unlock;
+
+ irq_unlock_sparse();
+
+ if (st->target <= CPUHP_AP_ONLINE_IDLE)
+ return 0;
+
+ return cpuhp_kick_ap(cpu, st, st->target);
+
+out_unlock:
+ irq_unlock_sparse();
+ return ret;
}
+#else
+static int bringup_cpu(unsigned int cpu)
+{
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+ struct task_struct *idle = idle_thread_get(cpu);
+ int ret;
+
+ if (!cpuhp_can_boot_ap(cpu))
+ return -EAGAIN;
+
+ /*
+ * Some architectures have to walk the irq descriptors to
+ * setup the vector space for the cpu which comes online.
+ *
+ * Prevent irq alloc/free across the bringup by acquiring the
+ * sparse irq lock. Hold it until the upcoming CPU completes the
+ * startup in cpuhp_online_idle() which allows to avoid
+ * intermediate synchronization points in the architecture code.
+ */
+ irq_lock_sparse();
+
+ ret = __cpu_up(cpu, idle);
+ if (ret)
+ goto out_unlock;
-static int cpu_notify(unsigned long val, void *v)
+ ret = cpuhp_bp_sync_alive(cpu);
+ if (ret)
+ goto out_unlock;
+
+ ret = bringup_wait_for_ap_online(cpu);
+ if (ret)
+ goto out_unlock;
+
+ irq_unlock_sparse();
+
+ if (st->target <= CPUHP_AP_ONLINE_IDLE)
+ return 0;
+
+ return cpuhp_kick_ap(cpu, st, st->target);
+
+out_unlock:
+ irq_unlock_sparse();
+ return ret;
+}
+#endif
+
+static int finish_cpu(unsigned int cpu)
{
- return __cpu_notify(val, v, -1, NULL);
+ struct task_struct *idle = idle_thread_get(cpu);
+ struct mm_struct *mm = idle->active_mm;
+
+ /*
+ * sched_force_init_mm() ensured the use of &init_mm,
+ * drop that refcount now that the CPU has stopped.
+ */
+ WARN_ON(mm != &init_mm);
+ idle->active_mm = NULL;
+ mmdrop_lazy_tlb(mm);
+
+ return 0;
}
-#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * Hotplug state machine related functions
+ */
-static void cpu_notify_nofail(unsigned long val, void *v)
+/*
+ * Get the next state to run. Empty ones will be skipped. Returns true if a
+ * state must be run.
+ *
+ * st->state will be modified ahead of time, to match state_to_run, as if it
+ * has already ran.
+ */
+static bool cpuhp_next_state(bool bringup,
+ enum cpuhp_state *state_to_run,
+ struct cpuhp_cpu_state *st,
+ enum cpuhp_state target)
{
- BUG_ON(cpu_notify(val, v));
+ do {
+ if (bringup) {
+ if (st->state >= target)
+ return false;
+
+ *state_to_run = ++st->state;
+ } else {
+ if (st->state <= target)
+ return false;
+
+ *state_to_run = st->state--;
+ }
+
+ if (!cpuhp_step_empty(bringup, cpuhp_get_step(*state_to_run)))
+ break;
+ } while (true);
+
+ return true;
}
-EXPORT_SYMBOL(register_cpu_notifier);
-EXPORT_SYMBOL(__register_cpu_notifier);
-void __ref unregister_cpu_notifier(struct notifier_block *nb)
+static int __cpuhp_invoke_callback_range(bool bringup,
+ unsigned int cpu,
+ struct cpuhp_cpu_state *st,
+ enum cpuhp_state target,
+ bool nofail)
{
- cpu_maps_update_begin();
- raw_notifier_chain_unregister(&cpu_chain, nb);
- cpu_maps_update_done();
+ enum cpuhp_state state;
+ int ret = 0;
+
+ while (cpuhp_next_state(bringup, &state, st, target)) {
+ int err;
+
+ err = cpuhp_invoke_callback(cpu, state, bringup, NULL, NULL);
+ if (!err)
+ continue;
+
+ if (nofail) {
+ pr_warn("CPU %u %s state %s (%d) failed (%d)\n",
+ cpu, bringup ? "UP" : "DOWN",
+ cpuhp_get_step(st->state)->name,
+ st->state, err);
+ ret = -1;
+ } else {
+ ret = err;
+ break;
+ }
+ }
+
+ return ret;
+}
+
+static inline int cpuhp_invoke_callback_range(bool bringup,
+ unsigned int cpu,
+ struct cpuhp_cpu_state *st,
+ enum cpuhp_state target)
+{
+ return __cpuhp_invoke_callback_range(bringup, cpu, st, target, false);
+}
+
+static inline void cpuhp_invoke_callback_range_nofail(bool bringup,
+ unsigned int cpu,
+ struct cpuhp_cpu_state *st,
+ enum cpuhp_state target)
+{
+ __cpuhp_invoke_callback_range(bringup, cpu, st, target, true);
+}
+
+static inline bool can_rollback_cpu(struct cpuhp_cpu_state *st)
+{
+ if (IS_ENABLED(CONFIG_HOTPLUG_CPU))
+ return true;
+ /*
+ * When CPU hotplug is disabled, then taking the CPU down is not
+ * possible because takedown_cpu() and the architecture and
+ * subsystem specific mechanisms are not available. So the CPU
+ * which would be completely unplugged again needs to stay around
+ * in the current state.
+ */
+ return st->state <= CPUHP_BRINGUP_CPU;
+}
+
+static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
+ enum cpuhp_state target)
+{
+ enum cpuhp_state prev_state = st->state;
+ int ret = 0;
+
+ ret = cpuhp_invoke_callback_range(true, cpu, st, target);
+ if (ret) {
+ pr_debug("CPU UP failed (%d) CPU %u state %s (%d)\n",
+ ret, cpu, cpuhp_get_step(st->state)->name,
+ st->state);
+
+ cpuhp_reset_state(cpu, st, prev_state);
+ if (can_rollback_cpu(st))
+ WARN_ON(cpuhp_invoke_callback_range(false, cpu, st,
+ prev_state));
+ }
+ return ret;
+}
+
+/*
+ * The cpu hotplug threads manage the bringup and teardown of the cpus
+ */
+static int cpuhp_should_run(unsigned int cpu)
+{
+ struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
+
+ return st->should_run;
+}
+
+/*
+ * Execute teardown/startup callbacks on the plugged cpu. Also used to invoke
+ * callbacks when a state gets [un]installed at runtime.
+ *
+ * Each invocation of this function by the smpboot thread does a single AP
+ * state callback.
+ *
+ * It has 3 modes of operation:
+ * - single: runs st->cb_state
+ * - up: runs ++st->state, while st->state < st->target
+ * - down: runs st->state--, while st->state > st->target
+ *
+ * When complete or on error, should_run is cleared and the completion is fired.
+ */
+static void cpuhp_thread_fun(unsigned int cpu)
+{
+ struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
+ bool bringup = st->bringup;
+ enum cpuhp_state state;
+
+ if (WARN_ON_ONCE(!st->should_run))
+ return;
+
+ /*
+ * ACQUIRE for the cpuhp_should_run() load of ->should_run. Ensures
+ * that if we see ->should_run we also see the rest of the state.
+ */
+ smp_mb();
+
+ /*
+ * The BP holds the hotplug lock, but we're now running on the AP,
+ * ensure that anybody asserting the lock is held, will actually find
+ * it so.
+ */
+ lockdep_acquire_cpus_lock();
+ cpuhp_lock_acquire(bringup);
+
+ if (st->single) {
+ state = st->cb_state;
+ st->should_run = false;
+ } else {
+ st->should_run = cpuhp_next_state(bringup, &state, st, st->target);
+ if (!st->should_run)
+ goto end;
+ }
+
+ WARN_ON_ONCE(!cpuhp_is_ap_state(state));
+
+ if (cpuhp_is_atomic_state(state)) {
+ local_irq_disable();
+ st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last);
+ local_irq_enable();
+
+ /*
+ * STARTING/DYING must not fail!
+ */
+ WARN_ON_ONCE(st->result);
+ } else {
+ st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last);
+ }
+
+ if (st->result) {
+ /*
+ * If we fail on a rollback, we're up a creek without no
+ * paddle, no way forward, no way back. We loose, thanks for
+ * playing.
+ */
+ WARN_ON_ONCE(st->rollback);
+ st->should_run = false;
+ }
+
+end:
+ cpuhp_lock_release(bringup);
+ lockdep_release_cpus_lock();
+
+ if (!st->should_run)
+ complete_ap_thread(st, bringup);
+}
+
+/* Invoke a single callback on a remote cpu */
+static int
+cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup,
+ struct hlist_node *node)
+{
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+ int ret;
+
+ if (!cpu_online(cpu))
+ return 0;
+
+ cpuhp_lock_acquire(false);
+ cpuhp_lock_release(false);
+
+ cpuhp_lock_acquire(true);
+ cpuhp_lock_release(true);
+
+ /*
+ * If we are up and running, use the hotplug thread. For early calls
+ * we invoke the thread function directly.
+ */
+ if (!st->thread)
+ return cpuhp_invoke_callback(cpu, state, bringup, node, NULL);
+
+ st->rollback = false;
+ st->last = NULL;
+
+ st->node = node;
+ st->bringup = bringup;
+ st->cb_state = state;
+ st->single = true;
+
+ __cpuhp_kick_ap(st);
+
+ /*
+ * If we failed and did a partial, do a rollback.
+ */
+ if ((ret = st->result) && st->last) {
+ st->rollback = true;
+ st->bringup = !bringup;
+
+ __cpuhp_kick_ap(st);
+ }
+
+ /*
+ * Clean up the leftovers so the next hotplug operation wont use stale
+ * data.
+ */
+ st->node = st->last = NULL;
+ return ret;
+}
+
+static int cpuhp_kick_ap_work(unsigned int cpu)
+{
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+ enum cpuhp_state prev_state = st->state;
+ int ret;
+
+ cpuhp_lock_acquire(false);
+ cpuhp_lock_release(false);
+
+ cpuhp_lock_acquire(true);
+ cpuhp_lock_release(true);
+
+ trace_cpuhp_enter(cpu, st->target, prev_state, cpuhp_kick_ap_work);
+ ret = cpuhp_kick_ap(cpu, st, st->target);
+ trace_cpuhp_exit(cpu, st->state, prev_state, ret);
+
+ return ret;
+}
+
+static struct smp_hotplug_thread cpuhp_threads = {
+ .store = &cpuhp_state.thread,
+ .thread_should_run = cpuhp_should_run,
+ .thread_fn = cpuhp_thread_fun,
+ .thread_comm = "cpuhp/%u",
+ .selfparking = true,
+};
+
+static __init void cpuhp_init_state(void)
+{
+ struct cpuhp_cpu_state *st;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ st = per_cpu_ptr(&cpuhp_state, cpu);
+ init_completion(&st->done_up);
+ init_completion(&st->done_down);
+ }
}
-EXPORT_SYMBOL(unregister_cpu_notifier);
-void __ref __unregister_cpu_notifier(struct notifier_block *nb)
+void __init cpuhp_threads_init(void)
{
- raw_notifier_chain_unregister(&cpu_chain, nb);
+ cpuhp_init_state();
+ BUG_ON(smpboot_register_percpu_thread(&cpuhp_threads));
+ kthread_unpark(this_cpu_read(cpuhp_state.thread));
}
-EXPORT_SYMBOL(__unregister_cpu_notifier);
+
+#ifdef CONFIG_HOTPLUG_CPU
+#ifndef arch_clear_mm_cpumask_cpu
+#define arch_clear_mm_cpumask_cpu(cpu, mm) cpumask_clear_cpu(cpu, mm_cpumask(mm))
+#endif
/**
* clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU
@@ -292,259 +1261,441 @@ void clear_tasks_mm_cpumask(int cpu)
t = find_lock_task_mm(p);
if (!t)
continue;
- cpumask_clear_cpu(cpu, mm_cpumask(t->mm));
+ arch_clear_mm_cpumask_cpu(cpu, t->mm);
task_unlock(t);
}
rcu_read_unlock();
}
-static inline void check_for_tasks(int dead_cpu)
-{
- struct task_struct *g, *p;
-
- read_lock_irq(&tasklist_lock);
- do_each_thread(g, p) {
- if (!p->on_rq)
- continue;
- /*
- * We do the check with unlocked task_rq(p)->lock.
- * Order the reading to do not warn about a task,
- * which was running on this cpu in the past, and
- * it's just been woken on another cpu.
- */
- rmb();
- if (task_cpu(p) != dead_cpu)
- continue;
-
- pr_warn("Task %s (pid=%d) is on cpu %d (state=%ld, flags=%x)\n",
- p->comm, task_pid_nr(p), dead_cpu, p->state, p->flags);
- } while_each_thread(g, p);
- read_unlock_irq(&tasklist_lock);
-}
-
-struct take_cpu_down_param {
- unsigned long mod;
- void *hcpu;
-};
-
/* Take this CPU down. */
-static int __ref take_cpu_down(void *_param)
+static int take_cpu_down(void *_param)
{
- struct take_cpu_down_param *param = _param;
- int err;
+ struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
+ enum cpuhp_state target = max((int)st->target, CPUHP_AP_OFFLINE);
+ int err, cpu = smp_processor_id();
/* Ensure this CPU doesn't handle any more interrupts. */
err = __cpu_disable();
if (err < 0)
return err;
- cpu_notify(CPU_DYING | param->mod, param->hcpu);
- /* Give up timekeeping duties */
- tick_handover_do_timer();
+ /*
+ * Must be called from CPUHP_TEARDOWN_CPU, which means, as we are going
+ * down, that the current state is CPUHP_TEARDOWN_CPU - 1.
+ */
+ WARN_ON(st->state != (CPUHP_TEARDOWN_CPU - 1));
+
+ /*
+ * Invoke the former CPU_DYING callbacks. DYING must not fail!
+ */
+ cpuhp_invoke_callback_range_nofail(false, cpu, st, target);
+
/* Park the stopper thread */
- kthread_park(current);
+ stop_machine_park(cpu);
return 0;
}
-/* Requires cpu_add_remove_lock to be held */
-static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
+static int takedown_cpu(unsigned int cpu)
{
- int err, nr_calls = 0;
- void *hcpu = (void *)(long)cpu;
- unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
- struct take_cpu_down_param tcd_param = {
- .mod = mod,
- .hcpu = hcpu,
- };
-
- if (num_online_cpus() == 1)
- return -EBUSY;
-
- if (!cpu_online(cpu))
- return -EINVAL;
-
- cpu_hotplug_begin();
-
- err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
- if (err) {
- nr_calls--;
- __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL);
- pr_warn("%s: attempt to take down CPU %u failed\n",
- __func__, cpu);
- goto out_release;
- }
-
- /*
- * By now we've cleared cpu_active_mask, wait for all preempt-disabled
- * and RCU users of this state to go away such that all new such users
- * will observe it.
- *
- * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might
- * not imply sync_sched(), so explicitly call both.
- *
- * Do sync before park smpboot threads to take care the rcu boost case.
- */
-#ifdef CONFIG_PREEMPT
- synchronize_sched();
-#endif
- synchronize_rcu();
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+ int err;
- smpboot_park_threads(cpu);
+ /* Park the smpboot threads */
+ kthread_park(st->thread);
/*
- * So now all preempt/rcu users must observe !cpu_active().
+ * Prevent irq alloc/free while the dying cpu reorganizes the
+ * interrupt affinities.
*/
+ irq_lock_sparse();
- err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
+ err = stop_machine_cpuslocked(take_cpu_down, NULL, cpumask_of(cpu));
if (err) {
- /* CPU didn't die: tell everyone. Can't complain. */
- smpboot_unpark_threads(cpu);
- cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
- goto out_release;
+ /* CPU refused to die */
+ irq_unlock_sparse();
+ /* Unpark the hotplug thread so we can rollback there */
+ kthread_unpark(st->thread);
+ return err;
}
BUG_ON(cpu_online(cpu));
/*
- * The migration_call() CPU_DYING callback will have removed all
- * runnable tasks from the cpu, there's only the idle task left now
+ * The teardown callback for CPUHP_AP_SCHED_STARTING will have removed
+ * all runnable tasks from the CPU, there's only the idle task left now
* that the migration thread is done doing the stop_machine thing.
*
* Wait for the stop thread to go away.
*/
- while (!per_cpu(cpu_dead_idle, cpu))
- cpu_relax();
- smp_mb(); /* Read from cpu_dead_idle before __cpu_die(). */
- per_cpu(cpu_dead_idle, cpu) = false;
+ wait_for_ap_thread(st, false);
+ BUG_ON(st->state != CPUHP_AP_IDLE_DEAD);
+
+ /* Interrupts are moved away from the dying cpu, reenable alloc/free */
+ irq_unlock_sparse();
hotplug_cpu__broadcast_tick_pull(cpu);
/* This actually kills the CPU. */
__cpu_die(cpu);
- /* CPU is completely dead: tell everyone. Too late to complain. */
- tick_cleanup_dead_cpu(cpu);
- cpu_notify_nofail(CPU_DEAD | mod, hcpu);
+ cpuhp_bp_sync_dead(cpu);
- check_for_tasks(cpu);
+ lockdep_cleanup_dead_cpu(cpu, idle_thread_get(cpu));
-out_release:
- cpu_hotplug_done();
- if (!err)
- cpu_notify_nofail(CPU_POST_DEAD | mod, hcpu);
- return err;
+ /*
+ * Callbacks must be re-integrated right away to the RCU state machine.
+ * Otherwise an RCU callback could block a further teardown function
+ * waiting for its completion.
+ */
+ rcutree_migrate_callbacks(cpu);
+
+ return 0;
}
-int __ref cpu_down(unsigned int cpu)
+static void cpuhp_complete_idle_dead(void *arg)
{
- int err;
+ struct cpuhp_cpu_state *st = arg;
- cpu_maps_update_begin();
+ complete_ap_thread(st, false);
+}
- if (cpu_hotplug_disabled) {
- err = -EBUSY;
- goto out;
+void cpuhp_report_idle_dead(void)
+{
+ struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
+
+ BUG_ON(st->state != CPUHP_AP_OFFLINE);
+ tick_assert_timekeeping_handover();
+ rcutree_report_cpu_dead();
+ st->state = CPUHP_AP_IDLE_DEAD;
+ /*
+ * We cannot call complete after rcutree_report_cpu_dead() so we delegate it
+ * to an online cpu.
+ */
+ smp_call_function_single(cpumask_first(cpu_online_mask),
+ cpuhp_complete_idle_dead, st, 0);
+}
+
+static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
+ enum cpuhp_state target)
+{
+ enum cpuhp_state prev_state = st->state;
+ int ret = 0;
+
+ ret = cpuhp_invoke_callback_range(false, cpu, st, target);
+ if (ret) {
+ pr_debug("CPU DOWN failed (%d) CPU %u state %s (%d)\n",
+ ret, cpu, cpuhp_get_step(st->state)->name,
+ st->state);
+
+ cpuhp_reset_state(cpu, st, prev_state);
+
+ if (st->state < prev_state)
+ WARN_ON(cpuhp_invoke_callback_range(true, cpu, st,
+ prev_state));
}
- err = _cpu_down(cpu, 0);
+ return ret;
+}
+
+/* Requires cpu_add_remove_lock to be held */
+static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
+ enum cpuhp_state target)
+{
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+ int prev_state, ret = 0;
+
+ if (num_online_cpus() == 1)
+ return -EBUSY;
+
+ if (!cpu_present(cpu))
+ return -EINVAL;
+
+ cpus_write_lock();
+
+ cpuhp_tasks_frozen = tasks_frozen;
+
+ prev_state = cpuhp_set_state(cpu, st, target);
+ /*
+ * If the current CPU state is in the range of the AP hotplug thread,
+ * then we need to kick the thread.
+ */
+ if (st->state > CPUHP_TEARDOWN_CPU) {
+ st->target = max((int)target, CPUHP_TEARDOWN_CPU);
+ ret = cpuhp_kick_ap_work(cpu);
+ /*
+ * The AP side has done the error rollback already. Just
+ * return the error code..
+ */
+ if (ret)
+ goto out;
+
+ /*
+ * We might have stopped still in the range of the AP hotplug
+ * thread. Nothing to do anymore.
+ */
+ if (st->state > CPUHP_TEARDOWN_CPU)
+ goto out;
+
+ st->target = target;
+ }
+ /*
+ * The AP brought itself down to CPUHP_TEARDOWN_CPU. So we need
+ * to do the further cleanups.
+ */
+ ret = cpuhp_down_callbacks(cpu, st, target);
+ if (ret && st->state < prev_state) {
+ if (st->state == CPUHP_TEARDOWN_CPU) {
+ cpuhp_reset_state(cpu, st, prev_state);
+ __cpuhp_kick_ap(st);
+ } else {
+ WARN(1, "DEAD callback error for CPU%d", cpu);
+ }
+ }
out:
+ cpus_write_unlock();
+ arch_smt_update();
+ return ret;
+}
+
+struct cpu_down_work {
+ unsigned int cpu;
+ enum cpuhp_state target;
+};
+
+static long __cpu_down_maps_locked(void *arg)
+{
+ struct cpu_down_work *work = arg;
+
+ return _cpu_down(work->cpu, 0, work->target);
+}
+
+static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target)
+{
+ struct cpu_down_work work = { .cpu = cpu, .target = target, };
+
+ /*
+ * If the platform does not support hotplug, report it explicitly to
+ * differentiate it from a transient offlining failure.
+ */
+ if (cpu_hotplug_offline_disabled)
+ return -EOPNOTSUPP;
+ if (cpu_hotplug_disabled)
+ return -EBUSY;
+
+ /*
+ * Ensure that the control task does not run on the to be offlined
+ * CPU to prevent a deadlock against cfs_b->period_timer.
+ * Also keep at least one housekeeping cpu onlined to avoid generating
+ * an empty sched_domain span.
+ */
+ for_each_cpu_and(cpu, cpu_online_mask, housekeeping_cpumask(HK_TYPE_DOMAIN)) {
+ if (cpu != work.cpu)
+ return work_on_cpu(cpu, __cpu_down_maps_locked, &work);
+ }
+ return -EBUSY;
+}
+
+static int cpu_down(unsigned int cpu, enum cpuhp_state target)
+{
+ int err;
+
+ cpu_maps_update_begin();
+ err = cpu_down_maps_locked(cpu, target);
cpu_maps_update_done();
return err;
}
-EXPORT_SYMBOL(cpu_down);
-#endif /*CONFIG_HOTPLUG_CPU*/
-/*
- * Unpark per-CPU smpboot kthreads at CPU-online time.
+/**
+ * cpu_device_down - Bring down a cpu device
+ * @dev: Pointer to the cpu device to offline
+ *
+ * This function is meant to be used by device core cpu subsystem only.
+ *
+ * Other subsystems should use remove_cpu() instead.
+ *
+ * Return: %0 on success or a negative errno code
*/
-static int smpboot_thread_call(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
+int cpu_device_down(struct device *dev)
{
- int cpu = (long)hcpu;
+ return cpu_down(dev->id, CPUHP_OFFLINE);
+}
- switch (action & ~CPU_TASKS_FROZEN) {
+int remove_cpu(unsigned int cpu)
+{
+ int ret;
- case CPU_ONLINE:
- smpboot_unpark_threads(cpu);
- break;
+ lock_device_hotplug();
+ ret = device_offline(get_cpu_device(cpu));
+ unlock_device_hotplug();
- default:
- break;
+ return ret;
+}
+EXPORT_SYMBOL_GPL(remove_cpu);
+
+void smp_shutdown_nonboot_cpus(unsigned int primary_cpu)
+{
+ unsigned int cpu;
+ int error;
+
+ cpu_maps_update_begin();
+
+ /*
+ * Make certain the cpu I'm about to reboot on is online.
+ *
+ * This is inline to what migrate_to_reboot_cpu() already do.
+ */
+ if (!cpu_online(primary_cpu))
+ primary_cpu = cpumask_first(cpu_online_mask);
+
+ for_each_online_cpu(cpu) {
+ if (cpu == primary_cpu)
+ continue;
+
+ error = cpu_down_maps_locked(cpu, CPUHP_OFFLINE);
+ if (error) {
+ pr_err("Failed to offline CPU%d - error=%d",
+ cpu, error);
+ break;
+ }
}
- return NOTIFY_OK;
+ /*
+ * Ensure all but the reboot CPU are offline.
+ */
+ BUG_ON(num_online_cpus() > 1);
+
+ /*
+ * Make sure the CPUs won't be enabled by someone else after this
+ * point. Kexec will reboot to a new kernel shortly resetting
+ * everything along the way.
+ */
+ cpu_hotplug_disabled++;
+
+ cpu_maps_update_done();
}
-static struct notifier_block smpboot_thread_notifier = {
- .notifier_call = smpboot_thread_call,
- .priority = CPU_PRI_SMPBOOT,
-};
+#else
+#define takedown_cpu NULL
+#endif /*CONFIG_HOTPLUG_CPU*/
-void __cpuinit smpboot_thread_init(void)
+/**
+ * notify_cpu_starting(cpu) - Invoke the callbacks on the starting CPU
+ * @cpu: cpu that just started
+ *
+ * It must be called by the arch code on the new cpu, before the new cpu
+ * enables interrupts and before the "boot" cpu returns from __cpu_up().
+ */
+void notify_cpu_starting(unsigned int cpu)
+{
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+ enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE);
+
+ rcutree_report_cpu_starting(cpu); /* Enables RCU usage on this CPU. */
+ cpumask_set_cpu(cpu, &cpus_booted_once_mask);
+
+ /*
+ * STARTING must not fail!
+ */
+ cpuhp_invoke_callback_range_nofail(true, cpu, st, target);
+}
+
+/*
+ * Called from the idle task. Wake up the controlling task which brings the
+ * hotplug thread of the upcoming CPU up and then delegates the rest of the
+ * online bringup to the hotplug thread.
+ */
+void cpuhp_online_idle(enum cpuhp_state state)
{
- register_cpu_notifier(&smpboot_thread_notifier);
+ struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
+
+ /* Happens for the boot cpu */
+ if (state != CPUHP_AP_ONLINE_IDLE)
+ return;
+
+ cpuhp_ap_update_sync_state(SYNC_STATE_ONLINE);
+
+ /*
+ * Unpark the stopper thread before we start the idle loop (and start
+ * scheduling); this ensures the stopper task is always available.
+ */
+ stop_machine_unpark(smp_processor_id());
+
+ st->state = CPUHP_AP_ONLINE_IDLE;
+ complete_ap_thread(st, true);
}
/* Requires cpu_add_remove_lock to be held */
-static int _cpu_up(unsigned int cpu, int tasks_frozen)
+static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
{
- int ret, nr_calls = 0;
- void *hcpu = (void *)(long)cpu;
- unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
struct task_struct *idle;
+ int ret = 0;
- cpu_hotplug_begin();
+ cpus_write_lock();
- if (cpu_online(cpu) || !cpu_present(cpu)) {
+ if (!cpu_present(cpu)) {
ret = -EINVAL;
goto out;
}
- idle = idle_thread_get(cpu);
- if (IS_ERR(idle)) {
- ret = PTR_ERR(idle);
+ /*
+ * The caller of cpu_up() might have raced with another
+ * caller. Nothing to do.
+ */
+ if (st->state >= target)
goto out;
- }
- ret = smpboot_create_threads(cpu);
- if (ret)
- goto out;
+ if (st->state == CPUHP_OFFLINE) {
+ /* Let it fail before we try to bring the cpu up */
+ idle = idle_thread_get(cpu);
+ if (IS_ERR(idle)) {
+ ret = PTR_ERR(idle);
+ goto out;
+ }
- ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
- if (ret) {
- nr_calls--;
- pr_warn("%s: attempt to bring up CPU %u failed\n",
- __func__, cpu);
- goto out_notify;
+ /*
+ * Reset stale stack state from the last time this CPU was online.
+ */
+ scs_task_reset(idle);
+ kasan_unpoison_task_stack(idle);
}
- /* Arch-specific enabling code. */
- ret = __cpu_up(cpu, idle);
- if (ret != 0)
- goto out_notify;
- BUG_ON(!cpu_online(cpu));
+ cpuhp_tasks_frozen = tasks_frozen;
- /* Now call notifier in preparation. */
- cpu_notify(CPU_ONLINE | mod, hcpu);
+ cpuhp_set_state(cpu, st, target);
+ /*
+ * If the current CPU state is in the range of the AP hotplug thread,
+ * then we need to kick the thread once more.
+ */
+ if (st->state > CPUHP_BRINGUP_CPU) {
+ ret = cpuhp_kick_ap_work(cpu);
+ /*
+ * The AP side has done the error rollback already. Just
+ * return the error code..
+ */
+ if (ret)
+ goto out;
+ }
-out_notify:
- if (ret != 0)
- __cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
+ /*
+ * Try to reach the target state. We max out on the BP at
+ * CPUHP_BRINGUP_CPU. After that the AP hotplug thread is
+ * responsible for bringing it up to the target state.
+ */
+ target = min((int)target, CPUHP_BRINGUP_CPU);
+ ret = cpuhp_up_callbacks(cpu, st, target);
out:
- cpu_hotplug_done();
-
+ cpus_write_unlock();
+ arch_smt_update();
return ret;
}
-int cpu_up(unsigned int cpu)
+static int cpu_up(unsigned int cpu, enum cpuhp_state target)
{
int err = 0;
if (!cpu_possible(cpu)) {
pr_err("can't online cpu %d because it is not configured as may-hotadd at boot time\n",
cpu);
-#if defined(CONFIG_IA64)
- pr_err("please check additional_cpus= boot parameter\n");
-#endif
return -EINVAL;
}
@@ -558,24 +1709,205 @@ int cpu_up(unsigned int cpu)
err = -EBUSY;
goto out;
}
+ if (!cpu_bootable(cpu)) {
+ err = -EPERM;
+ goto out;
+ }
- err = _cpu_up(cpu, 0);
-
+ err = _cpu_up(cpu, 0, target);
out:
cpu_maps_update_done();
return err;
}
-EXPORT_SYMBOL_GPL(cpu_up);
+
+/**
+ * cpu_device_up - Bring up a cpu device
+ * @dev: Pointer to the cpu device to online
+ *
+ * This function is meant to be used by device core cpu subsystem only.
+ *
+ * Other subsystems should use add_cpu() instead.
+ *
+ * Return: %0 on success or a negative errno code
+ */
+int cpu_device_up(struct device *dev)
+{
+ return cpu_up(dev->id, CPUHP_ONLINE);
+}
+
+int add_cpu(unsigned int cpu)
+{
+ int ret;
+
+ lock_device_hotplug();
+ ret = device_online(get_cpu_device(cpu));
+ unlock_device_hotplug();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(add_cpu);
+
+/**
+ * bringup_hibernate_cpu - Bring up the CPU that we hibernated on
+ * @sleep_cpu: The cpu we hibernated on and should be brought up.
+ *
+ * On some architectures like arm64, we can hibernate on any CPU, but on
+ * wake up the CPU we hibernated on might be offline as a side effect of
+ * using maxcpus= for example.
+ *
+ * Return: %0 on success or a negative errno code
+ */
+int bringup_hibernate_cpu(unsigned int sleep_cpu)
+{
+ int ret;
+
+ if (!cpu_online(sleep_cpu)) {
+ pr_info("Hibernated on a CPU that is offline! Bringing CPU up.\n");
+ ret = cpu_up(sleep_cpu, CPUHP_ONLINE);
+ if (ret) {
+ pr_err("Failed to bring hibernate-CPU up!\n");
+ return ret;
+ }
+ }
+ return 0;
+}
+
+static void __init cpuhp_bringup_mask(const struct cpumask *mask, unsigned int ncpus,
+ enum cpuhp_state target)
+{
+ unsigned int cpu;
+
+ for_each_cpu(cpu, mask) {
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+
+ if (cpu_up(cpu, target) && can_rollback_cpu(st)) {
+ /*
+ * If this failed then cpu_up() might have only
+ * rolled back to CPUHP_BP_KICK_AP for the final
+ * online. Clean it up. NOOP if already rolled back.
+ */
+ WARN_ON(cpuhp_invoke_callback_range(false, cpu, st, CPUHP_OFFLINE));
+ }
+
+ if (!--ncpus)
+ break;
+ }
+}
+
+#ifdef CONFIG_HOTPLUG_PARALLEL
+static bool __cpuhp_parallel_bringup __ro_after_init = true;
+
+static int __init parallel_bringup_parse_param(char *arg)
+{
+ return kstrtobool(arg, &__cpuhp_parallel_bringup);
+}
+early_param("cpuhp.parallel", parallel_bringup_parse_param);
+
+#ifdef CONFIG_HOTPLUG_SMT
+static inline bool cpuhp_smt_aware(void)
+{
+ return cpu_smt_max_threads > 1;
+}
+
+static inline const struct cpumask *cpuhp_get_primary_thread_mask(void)
+{
+ return cpu_primary_thread_mask;
+}
+#else
+static inline bool cpuhp_smt_aware(void)
+{
+ return false;
+}
+static inline const struct cpumask *cpuhp_get_primary_thread_mask(void)
+{
+ return cpu_none_mask;
+}
+#endif
+
+bool __weak arch_cpuhp_init_parallel_bringup(void)
+{
+ return true;
+}
+
+/*
+ * On architectures which have enabled parallel bringup this invokes all BP
+ * prepare states for each of the to be onlined APs first. The last state
+ * sends the startup IPI to the APs. The APs proceed through the low level
+ * bringup code in parallel and then wait for the control CPU to release
+ * them one by one for the final onlining procedure.
+ *
+ * This avoids waiting for each AP to respond to the startup IPI in
+ * CPUHP_BRINGUP_CPU.
+ */
+static bool __init cpuhp_bringup_cpus_parallel(unsigned int ncpus)
+{
+ const struct cpumask *mask = cpu_present_mask;
+
+ if (__cpuhp_parallel_bringup)
+ __cpuhp_parallel_bringup = arch_cpuhp_init_parallel_bringup();
+ if (!__cpuhp_parallel_bringup)
+ return false;
+
+ if (cpuhp_smt_aware()) {
+ const struct cpumask *pmask = cpuhp_get_primary_thread_mask();
+ static struct cpumask tmp_mask __initdata;
+
+ /*
+ * X86 requires to prevent that SMT siblings stopped while
+ * the primary thread does a microcode update for various
+ * reasons. Bring the primary threads up first.
+ */
+ cpumask_and(&tmp_mask, mask, pmask);
+ cpuhp_bringup_mask(&tmp_mask, ncpus, CPUHP_BP_KICK_AP);
+ cpuhp_bringup_mask(&tmp_mask, ncpus, CPUHP_ONLINE);
+ /* Account for the online CPUs */
+ ncpus -= num_online_cpus();
+ if (!ncpus)
+ return true;
+ /* Create the mask for secondary CPUs */
+ cpumask_andnot(&tmp_mask, mask, pmask);
+ mask = &tmp_mask;
+ }
+
+ /* Bring the not-yet started CPUs up */
+ cpuhp_bringup_mask(mask, ncpus, CPUHP_BP_KICK_AP);
+ cpuhp_bringup_mask(mask, ncpus, CPUHP_ONLINE);
+ return true;
+}
+#else
+static inline bool cpuhp_bringup_cpus_parallel(unsigned int ncpus) { return false; }
+#endif /* CONFIG_HOTPLUG_PARALLEL */
+
+void __init bringup_nonboot_cpus(unsigned int max_cpus)
+{
+ if (!max_cpus)
+ return;
+
+ /* Try parallel bringup optimization if enabled */
+ if (cpuhp_bringup_cpus_parallel(max_cpus))
+ return;
+
+ /* Full per CPU serialized bringup */
+ cpuhp_bringup_mask(cpu_present_mask, max_cpus, CPUHP_ONLINE);
+}
#ifdef CONFIG_PM_SLEEP_SMP
static cpumask_var_t frozen_cpus;
-int disable_nonboot_cpus(void)
+int freeze_secondary_cpus(int primary)
{
- int cpu, first_cpu, error = 0;
+ int cpu, error = 0;
cpu_maps_update_begin();
- first_cpu = cpumask_first(cpu_online_mask);
+ if (primary == -1) {
+ primary = cpumask_first(cpu_online_mask);
+ if (!housekeeping_cpu(primary, HK_TYPE_TIMER))
+ primary = housekeeping_any_cpu(HK_TYPE_TIMER);
+ } else {
+ if (!cpu_online(primary))
+ primary = cpumask_first(cpu_online_mask);
+ }
+
/*
* We take down all of the non-boot CPUs in one shot to avoid races
* with the userspace trying to use the CPU hotplug at the same time
@@ -583,11 +1915,18 @@ int disable_nonboot_cpus(void)
cpumask_clear(frozen_cpus);
pr_info("Disabling non-boot CPUs ...\n");
- for_each_online_cpu(cpu) {
- if (cpu == first_cpu)
+ for (cpu = nr_cpu_ids - 1; cpu >= 0; cpu--) {
+ if (!cpu_online(cpu) || cpu == primary)
continue;
+
+ if (pm_wakeup_pending()) {
+ pr_info("Wakeup pending. Abort CPU freeze\n");
+ error = -EBUSY;
+ break;
+ }
+
trace_suspend_resume(TPS("CPU_OFF"), cpu, true);
- error = _cpu_down(cpu, 1);
+ error = _cpu_down(cpu, 1, CPUHP_OFFLINE);
trace_suspend_resume(TPS("CPU_OFF"), cpu, false);
if (!error)
cpumask_set_cpu(cpu, frozen_cpus);
@@ -597,42 +1936,47 @@ int disable_nonboot_cpus(void)
}
}
- if (!error) {
+ if (!error)
BUG_ON(num_online_cpus() > 1);
- /* Make sure the CPUs won't be enabled by someone else */
- cpu_hotplug_disabled = 1;
- } else {
+ else
pr_err("Non-boot CPUs are not disabled\n");
- }
+
+ /*
+ * Make sure the CPUs won't be enabled by someone else. We need to do
+ * this even in case of failure as all freeze_secondary_cpus() users are
+ * supposed to do thaw_secondary_cpus() on the failure path.
+ */
+ cpu_hotplug_disabled++;
+
cpu_maps_update_done();
return error;
}
-void __weak arch_enable_nonboot_cpus_begin(void)
+void __weak arch_thaw_secondary_cpus_begin(void)
{
}
-void __weak arch_enable_nonboot_cpus_end(void)
+void __weak arch_thaw_secondary_cpus_end(void)
{
}
-void __ref enable_nonboot_cpus(void)
+void thaw_secondary_cpus(void)
{
int cpu, error;
/* Allow everyone to use the CPU hotplug again */
cpu_maps_update_begin();
- cpu_hotplug_disabled = 0;
+ __cpu_hotplug_enable();
if (cpumask_empty(frozen_cpus))
goto out;
pr_info("Enabling non-boot CPUs ...\n");
- arch_enable_nonboot_cpus_begin();
+ arch_thaw_secondary_cpus_begin();
for_each_cpu(cpu, frozen_cpus) {
trace_suspend_resume(TPS("CPU_ON"), cpu, true);
- error = _cpu_up(cpu, 1);
+ error = _cpu_up(cpu, 1, CPUHP_ONLINE);
trace_suspend_resume(TPS("CPU_ON"), cpu, false);
if (!error) {
pr_info("CPU%d is up\n", cpu);
@@ -641,7 +1985,7 @@ void __ref enable_nonboot_cpus(void)
pr_warn("Error taking CPU%d up: %d\n", cpu, error);
}
- arch_enable_nonboot_cpus_end();
+ arch_thaw_secondary_cpus_end();
cpumask_clear(frozen_cpus);
out:
@@ -705,26 +2049,1010 @@ core_initcall(cpu_hotplug_pm_sync_init);
#endif /* CONFIG_PM_SLEEP_SMP */
+int __boot_cpu_id;
+
+#endif /* CONFIG_SMP */
+
+/* Boot processor state steps */
+static struct cpuhp_step cpuhp_hp_states[] = {
+ [CPUHP_OFFLINE] = {
+ .name = "offline",
+ .startup.single = NULL,
+ .teardown.single = NULL,
+ },
+#ifdef CONFIG_SMP
+ [CPUHP_CREATE_THREADS]= {
+ .name = "threads:prepare",
+ .startup.single = smpboot_create_threads,
+ .teardown.single = NULL,
+ .cant_stop = true,
+ },
+ [CPUHP_RANDOM_PREPARE] = {
+ .name = "random:prepare",
+ .startup.single = random_prepare_cpu,
+ .teardown.single = NULL,
+ },
+ [CPUHP_WORKQUEUE_PREP] = {
+ .name = "workqueue:prepare",
+ .startup.single = workqueue_prepare_cpu,
+ .teardown.single = NULL,
+ },
+ [CPUHP_HRTIMERS_PREPARE] = {
+ .name = "hrtimers:prepare",
+ .startup.single = hrtimers_prepare_cpu,
+ .teardown.single = NULL,
+ },
+ [CPUHP_SMPCFD_PREPARE] = {
+ .name = "smpcfd:prepare",
+ .startup.single = smpcfd_prepare_cpu,
+ .teardown.single = smpcfd_dead_cpu,
+ },
+ [CPUHP_RELAY_PREPARE] = {
+ .name = "relay:prepare",
+ .startup.single = relay_prepare_cpu,
+ .teardown.single = NULL,
+ },
+ [CPUHP_RCUTREE_PREP] = {
+ .name = "RCU/tree:prepare",
+ .startup.single = rcutree_prepare_cpu,
+ .teardown.single = rcutree_dead_cpu,
+ },
+ /*
+ * On the tear-down path, timers_dead_cpu() must be invoked
+ * before blk_mq_queue_reinit_notify() from notify_dead(),
+ * otherwise a RCU stall occurs.
+ */
+ [CPUHP_TIMERS_PREPARE] = {
+ .name = "timers:prepare",
+ .startup.single = timers_prepare_cpu,
+ .teardown.single = timers_dead_cpu,
+ },
+
+#ifdef CONFIG_HOTPLUG_SPLIT_STARTUP
+ /*
+ * Kicks the AP alive. AP will wait in cpuhp_ap_sync_alive() until
+ * the next step will release it.
+ */
+ [CPUHP_BP_KICK_AP] = {
+ .name = "cpu:kick_ap",
+ .startup.single = cpuhp_kick_ap_alive,
+ },
+
+ /*
+ * Waits for the AP to reach cpuhp_ap_sync_alive() and then
+ * releases it for the complete bringup.
+ */
+ [CPUHP_BRINGUP_CPU] = {
+ .name = "cpu:bringup",
+ .startup.single = cpuhp_bringup_ap,
+ .teardown.single = finish_cpu,
+ .cant_stop = true,
+ },
+#else
+ /*
+ * All-in-one CPU bringup state which includes the kick alive.
+ */
+ [CPUHP_BRINGUP_CPU] = {
+ .name = "cpu:bringup",
+ .startup.single = bringup_cpu,
+ .teardown.single = finish_cpu,
+ .cant_stop = true,
+ },
+#endif
+ /* Final state before CPU kills itself */
+ [CPUHP_AP_IDLE_DEAD] = {
+ .name = "idle:dead",
+ },
+ /*
+ * Last state before CPU enters the idle loop to die. Transient state
+ * for synchronization.
+ */
+ [CPUHP_AP_OFFLINE] = {
+ .name = "ap:offline",
+ .cant_stop = true,
+ },
+ /* First state is scheduler control. Interrupts are disabled */
+ [CPUHP_AP_SCHED_STARTING] = {
+ .name = "sched:starting",
+ .startup.single = sched_cpu_starting,
+ .teardown.single = sched_cpu_dying,
+ },
+ [CPUHP_AP_RCUTREE_DYING] = {
+ .name = "RCU/tree:dying",
+ .startup.single = NULL,
+ .teardown.single = rcutree_dying_cpu,
+ },
+ [CPUHP_AP_SMPCFD_DYING] = {
+ .name = "smpcfd:dying",
+ .startup.single = NULL,
+ .teardown.single = smpcfd_dying_cpu,
+ },
+ [CPUHP_AP_HRTIMERS_DYING] = {
+ .name = "hrtimers:dying",
+ .startup.single = hrtimers_cpu_starting,
+ .teardown.single = hrtimers_cpu_dying,
+ },
+ [CPUHP_AP_TICK_DYING] = {
+ .name = "tick:dying",
+ .startup.single = NULL,
+ .teardown.single = tick_cpu_dying,
+ },
+ /* Entry state on starting. Interrupts enabled from here on. Transient
+ * state for synchronsization */
+ [CPUHP_AP_ONLINE] = {
+ .name = "ap:online",
+ },
+ /*
+ * Handled on control processor until the plugged processor manages
+ * this itself.
+ */
+ [CPUHP_TEARDOWN_CPU] = {
+ .name = "cpu:teardown",
+ .startup.single = NULL,
+ .teardown.single = takedown_cpu,
+ .cant_stop = true,
+ },
+
+ [CPUHP_AP_SCHED_WAIT_EMPTY] = {
+ .name = "sched:waitempty",
+ .startup.single = NULL,
+ .teardown.single = sched_cpu_wait_empty,
+ },
+
+ /* Handle smpboot threads park/unpark */
+ [CPUHP_AP_SMPBOOT_THREADS] = {
+ .name = "smpboot/threads:online",
+ .startup.single = smpboot_unpark_threads,
+ .teardown.single = smpboot_park_threads,
+ },
+ [CPUHP_AP_IRQ_AFFINITY_ONLINE] = {
+ .name = "irq/affinity:online",
+ .startup.single = irq_affinity_online_cpu,
+ .teardown.single = NULL,
+ },
+ [CPUHP_AP_PERF_ONLINE] = {
+ .name = "perf:online",
+ .startup.single = perf_event_init_cpu,
+ .teardown.single = perf_event_exit_cpu,
+ },
+ [CPUHP_AP_WATCHDOG_ONLINE] = {
+ .name = "lockup_detector:online",
+ .startup.single = lockup_detector_online_cpu,
+ .teardown.single = lockup_detector_offline_cpu,
+ },
+ [CPUHP_AP_WORKQUEUE_ONLINE] = {
+ .name = "workqueue:online",
+ .startup.single = workqueue_online_cpu,
+ .teardown.single = workqueue_offline_cpu,
+ },
+ [CPUHP_AP_RANDOM_ONLINE] = {
+ .name = "random:online",
+ .startup.single = random_online_cpu,
+ .teardown.single = NULL,
+ },
+ [CPUHP_AP_RCUTREE_ONLINE] = {
+ .name = "RCU/tree:online",
+ .startup.single = rcutree_online_cpu,
+ .teardown.single = rcutree_offline_cpu,
+ },
+#endif
+ /*
+ * The dynamically registered state space is here
+ */
+
+#ifdef CONFIG_SMP
+ /* Last state is scheduler control setting the cpu active */
+ [CPUHP_AP_ACTIVE] = {
+ .name = "sched:active",
+ .startup.single = sched_cpu_activate,
+ .teardown.single = sched_cpu_deactivate,
+ },
+#endif
+
+ /* CPU is fully up and running. */
+ [CPUHP_ONLINE] = {
+ .name = "online",
+ .startup.single = NULL,
+ .teardown.single = NULL,
+ },
+};
+
+/* Sanity check for callbacks */
+static int cpuhp_cb_check(enum cpuhp_state state)
+{
+ if (state <= CPUHP_OFFLINE || state >= CPUHP_ONLINE)
+ return -EINVAL;
+ return 0;
+}
+
+/*
+ * Returns a free for dynamic slot assignment of the Online state. The states
+ * are protected by the cpuhp_slot_states mutex and an empty slot is identified
+ * by having no name assigned.
+ */
+static int cpuhp_reserve_state(enum cpuhp_state state)
+{
+ enum cpuhp_state i, end;
+ struct cpuhp_step *step;
+
+ switch (state) {
+ case CPUHP_AP_ONLINE_DYN:
+ step = cpuhp_hp_states + CPUHP_AP_ONLINE_DYN;
+ end = CPUHP_AP_ONLINE_DYN_END;
+ break;
+ case CPUHP_BP_PREPARE_DYN:
+ step = cpuhp_hp_states + CPUHP_BP_PREPARE_DYN;
+ end = CPUHP_BP_PREPARE_DYN_END;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ for (i = state; i <= end; i++, step++) {
+ if (!step->name)
+ return i;
+ }
+ WARN(1, "No more dynamic states available for CPU hotplug\n");
+ return -ENOSPC;
+}
+
+static int cpuhp_store_callbacks(enum cpuhp_state state, const char *name,
+ int (*startup)(unsigned int cpu),
+ int (*teardown)(unsigned int cpu),
+ bool multi_instance)
+{
+ /* (Un)Install the callbacks for further cpu hotplug operations */
+ struct cpuhp_step *sp;
+ int ret = 0;
+
+ /*
+ * If name is NULL, then the state gets removed.
+ *
+ * CPUHP_AP_ONLINE_DYN and CPUHP_BP_PREPARE_DYN are handed out on
+ * the first allocation from these dynamic ranges, so the removal
+ * would trigger a new allocation and clear the wrong (already
+ * empty) state, leaving the callbacks of the to be cleared state
+ * dangling, which causes wreckage on the next hotplug operation.
+ */
+ if (name && (state == CPUHP_AP_ONLINE_DYN ||
+ state == CPUHP_BP_PREPARE_DYN)) {
+ ret = cpuhp_reserve_state(state);
+ if (ret < 0)
+ return ret;
+ state = ret;
+ }
+ sp = cpuhp_get_step(state);
+ if (name && sp->name)
+ return -EBUSY;
+
+ sp->startup.single = startup;
+ sp->teardown.single = teardown;
+ sp->name = name;
+ sp->multi_instance = multi_instance;
+ INIT_HLIST_HEAD(&sp->list);
+ return ret;
+}
+
+static void *cpuhp_get_teardown_cb(enum cpuhp_state state)
+{
+ return cpuhp_get_step(state)->teardown.single;
+}
+
+/*
+ * Call the startup/teardown function for a step either on the AP or
+ * on the current CPU.
+ */
+static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup,
+ struct hlist_node *node)
+{
+ struct cpuhp_step *sp = cpuhp_get_step(state);
+ int ret;
+
+ /*
+ * If there's nothing to do, we done.
+ * Relies on the union for multi_instance.
+ */
+ if (cpuhp_step_empty(bringup, sp))
+ return 0;
+ /*
+ * The non AP bound callbacks can fail on bringup. On teardown
+ * e.g. module removal we crash for now.
+ */
+#ifdef CONFIG_SMP
+ if (cpuhp_is_ap_state(state))
+ ret = cpuhp_invoke_ap_callback(cpu, state, bringup, node);
+ else
+ ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL);
+#else
+ ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL);
+#endif
+ BUG_ON(ret && !bringup);
+ return ret;
+}
+
+/*
+ * Called from __cpuhp_setup_state on a recoverable failure.
+ *
+ * Note: The teardown callbacks for rollback are not allowed to fail!
+ */
+static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state,
+ struct hlist_node *node)
+{
+ int cpu;
+
+ /* Roll back the already executed steps on the other cpus */
+ for_each_present_cpu(cpu) {
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+ int cpustate = st->state;
+
+ if (cpu >= failedcpu)
+ break;
+
+ /* Did we invoke the startup call on that cpu ? */
+ if (cpustate >= state)
+ cpuhp_issue_call(cpu, state, false, node);
+ }
+}
+
+int __cpuhp_state_add_instance_cpuslocked(enum cpuhp_state state,
+ struct hlist_node *node,
+ bool invoke)
+{
+ struct cpuhp_step *sp;
+ int cpu;
+ int ret;
+
+ lockdep_assert_cpus_held();
+
+ sp = cpuhp_get_step(state);
+ if (sp->multi_instance == false)
+ return -EINVAL;
+
+ mutex_lock(&cpuhp_state_mutex);
+
+ if (!invoke || !sp->startup.multi)
+ goto add_node;
+
+ /*
+ * Try to call the startup callback for each present cpu
+ * depending on the hotplug state of the cpu.
+ */
+ for_each_present_cpu(cpu) {
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+ int cpustate = st->state;
+
+ if (cpustate < state)
+ continue;
+
+ ret = cpuhp_issue_call(cpu, state, true, node);
+ if (ret) {
+ if (sp->teardown.multi)
+ cpuhp_rollback_install(cpu, state, node);
+ goto unlock;
+ }
+ }
+add_node:
+ ret = 0;
+ hlist_add_head(node, &sp->list);
+unlock:
+ mutex_unlock(&cpuhp_state_mutex);
+ return ret;
+}
+
+int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node,
+ bool invoke)
+{
+ int ret;
+
+ cpus_read_lock();
+ ret = __cpuhp_state_add_instance_cpuslocked(state, node, invoke);
+ cpus_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance);
+
/**
- * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers
- * @cpu: cpu that just started
+ * __cpuhp_setup_state_cpuslocked - Setup the callbacks for an hotplug machine state
+ * @state: The state to setup
+ * @name: Name of the step
+ * @invoke: If true, the startup function is invoked for cpus where
+ * cpu state >= @state
+ * @startup: startup callback function
+ * @teardown: teardown callback function
+ * @multi_instance: State is set up for multiple instances which get
+ * added afterwards.
*
- * This function calls the cpu_chain notifiers with CPU_STARTING.
- * It must be called by the arch code on the new cpu, before the new cpu
- * enables interrupts and before the "boot" cpu returns from __cpu_up().
+ * The caller needs to hold cpus read locked while calling this function.
+ * Return:
+ * On success:
+ * Positive state number if @state is CPUHP_AP_ONLINE_DYN or CPUHP_BP_PREPARE_DYN;
+ * 0 for all other states
+ * On failure: proper (negative) error code
*/
-void notify_cpu_starting(unsigned int cpu)
+int __cpuhp_setup_state_cpuslocked(enum cpuhp_state state,
+ const char *name, bool invoke,
+ int (*startup)(unsigned int cpu),
+ int (*teardown)(unsigned int cpu),
+ bool multi_instance)
{
- unsigned long val = CPU_STARTING;
+ int cpu, ret = 0;
+ bool dynstate;
-#ifdef CONFIG_PM_SLEEP_SMP
- if (frozen_cpus != NULL && cpumask_test_cpu(cpu, frozen_cpus))
- val = CPU_STARTING_FROZEN;
-#endif /* CONFIG_PM_SLEEP_SMP */
- cpu_notify(val, (void *)(long)cpu);
+ lockdep_assert_cpus_held();
+
+ if (cpuhp_cb_check(state) || !name)
+ return -EINVAL;
+
+ mutex_lock(&cpuhp_state_mutex);
+
+ ret = cpuhp_store_callbacks(state, name, startup, teardown,
+ multi_instance);
+
+ dynstate = state == CPUHP_AP_ONLINE_DYN || state == CPUHP_BP_PREPARE_DYN;
+ if (ret > 0 && dynstate) {
+ state = ret;
+ ret = 0;
+ }
+
+ if (ret || !invoke || !startup)
+ goto out;
+
+ /*
+ * Try to call the startup callback for each present cpu
+ * depending on the hotplug state of the cpu.
+ */
+ for_each_present_cpu(cpu) {
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+ int cpustate = st->state;
+
+ if (cpustate < state)
+ continue;
+
+ ret = cpuhp_issue_call(cpu, state, true, NULL);
+ if (ret) {
+ if (teardown)
+ cpuhp_rollback_install(cpu, state, NULL);
+ cpuhp_store_callbacks(state, NULL, NULL, NULL, false);
+ goto out;
+ }
+ }
+out:
+ mutex_unlock(&cpuhp_state_mutex);
+ /*
+ * If the requested state is CPUHP_AP_ONLINE_DYN or CPUHP_BP_PREPARE_DYN,
+ * return the dynamically allocated state in case of success.
+ */
+ if (!ret && dynstate)
+ return state;
+ return ret;
}
+EXPORT_SYMBOL(__cpuhp_setup_state_cpuslocked);
-#endif /* CONFIG_SMP */
+int __cpuhp_setup_state(enum cpuhp_state state,
+ const char *name, bool invoke,
+ int (*startup)(unsigned int cpu),
+ int (*teardown)(unsigned int cpu),
+ bool multi_instance)
+{
+ int ret;
+
+ cpus_read_lock();
+ ret = __cpuhp_setup_state_cpuslocked(state, name, invoke, startup,
+ teardown, multi_instance);
+ cpus_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL(__cpuhp_setup_state);
+
+int __cpuhp_state_remove_instance(enum cpuhp_state state,
+ struct hlist_node *node, bool invoke)
+{
+ struct cpuhp_step *sp = cpuhp_get_step(state);
+ int cpu;
+
+ BUG_ON(cpuhp_cb_check(state));
+
+ if (!sp->multi_instance)
+ return -EINVAL;
+
+ cpus_read_lock();
+ mutex_lock(&cpuhp_state_mutex);
+
+ if (!invoke || !cpuhp_get_teardown_cb(state))
+ goto remove;
+ /*
+ * Call the teardown callback for each present cpu depending
+ * on the hotplug state of the cpu. This function is not
+ * allowed to fail currently!
+ */
+ for_each_present_cpu(cpu) {
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+ int cpustate = st->state;
+
+ if (cpustate >= state)
+ cpuhp_issue_call(cpu, state, false, node);
+ }
+
+remove:
+ hlist_del(node);
+ mutex_unlock(&cpuhp_state_mutex);
+ cpus_read_unlock();
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__cpuhp_state_remove_instance);
+
+/**
+ * __cpuhp_remove_state_cpuslocked - Remove the callbacks for an hotplug machine state
+ * @state: The state to remove
+ * @invoke: If true, the teardown function is invoked for cpus where
+ * cpu state >= @state
+ *
+ * The caller needs to hold cpus read locked while calling this function.
+ * The teardown callback is currently not allowed to fail. Think
+ * about module removal!
+ */
+void __cpuhp_remove_state_cpuslocked(enum cpuhp_state state, bool invoke)
+{
+ struct cpuhp_step *sp = cpuhp_get_step(state);
+ int cpu;
+
+ BUG_ON(cpuhp_cb_check(state));
+
+ lockdep_assert_cpus_held();
+
+ mutex_lock(&cpuhp_state_mutex);
+ if (sp->multi_instance) {
+ WARN(!hlist_empty(&sp->list),
+ "Error: Removing state %d which has instances left.\n",
+ state);
+ goto remove;
+ }
+
+ if (!invoke || !cpuhp_get_teardown_cb(state))
+ goto remove;
+
+ /*
+ * Call the teardown callback for each present cpu depending
+ * on the hotplug state of the cpu. This function is not
+ * allowed to fail currently!
+ */
+ for_each_present_cpu(cpu) {
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+ int cpustate = st->state;
+
+ if (cpustate >= state)
+ cpuhp_issue_call(cpu, state, false, NULL);
+ }
+remove:
+ cpuhp_store_callbacks(state, NULL, NULL, NULL, false);
+ mutex_unlock(&cpuhp_state_mutex);
+}
+EXPORT_SYMBOL(__cpuhp_remove_state_cpuslocked);
+
+void __cpuhp_remove_state(enum cpuhp_state state, bool invoke)
+{
+ cpus_read_lock();
+ __cpuhp_remove_state_cpuslocked(state, invoke);
+ cpus_read_unlock();
+}
+EXPORT_SYMBOL(__cpuhp_remove_state);
+
+#ifdef CONFIG_HOTPLUG_SMT
+static void cpuhp_offline_cpu_device(unsigned int cpu)
+{
+ struct device *dev = get_cpu_device(cpu);
+
+ dev->offline = true;
+ /* Tell user space about the state change */
+ kobject_uevent(&dev->kobj, KOBJ_OFFLINE);
+}
+
+static void cpuhp_online_cpu_device(unsigned int cpu)
+{
+ struct device *dev = get_cpu_device(cpu);
+
+ dev->offline = false;
+ /* Tell user space about the state change */
+ kobject_uevent(&dev->kobj, KOBJ_ONLINE);
+}
+
+int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
+{
+ int cpu, ret = 0;
+
+ cpu_maps_update_begin();
+ for_each_online_cpu(cpu) {
+ if (topology_is_primary_thread(cpu))
+ continue;
+ /*
+ * Disable can be called with CPU_SMT_ENABLED when changing
+ * from a higher to lower number of SMT threads per core.
+ */
+ if (ctrlval == CPU_SMT_ENABLED && cpu_smt_thread_allowed(cpu))
+ continue;
+ ret = cpu_down_maps_locked(cpu, CPUHP_OFFLINE);
+ if (ret)
+ break;
+ /*
+ * As this needs to hold the cpu maps lock it's impossible
+ * to call device_offline() because that ends up calling
+ * cpu_down() which takes cpu maps lock. cpu maps lock
+ * needs to be held as this might race against in kernel
+ * abusers of the hotplug machinery (thermal management).
+ *
+ * So nothing would update device:offline state. That would
+ * leave the sysfs entry stale and prevent onlining after
+ * smt control has been changed to 'off' again. This is
+ * called under the sysfs hotplug lock, so it is properly
+ * serialized against the regular offline usage.
+ */
+ cpuhp_offline_cpu_device(cpu);
+ }
+ if (!ret)
+ cpu_smt_control = ctrlval;
+ cpu_maps_update_done();
+ return ret;
+}
+
+/* Check if the core a CPU belongs to is online */
+#if !defined(topology_is_core_online)
+static inline bool topology_is_core_online(unsigned int cpu)
+{
+ return true;
+}
+#endif
+
+int cpuhp_smt_enable(void)
+{
+ int cpu, ret = 0;
+
+ cpu_maps_update_begin();
+ cpu_smt_control = CPU_SMT_ENABLED;
+ for_each_present_cpu(cpu) {
+ /* Skip online CPUs and CPUs on offline nodes */
+ if (cpu_online(cpu) || !node_online(cpu_to_node(cpu)))
+ continue;
+ if (!cpu_smt_thread_allowed(cpu) || !topology_is_core_online(cpu))
+ continue;
+ ret = _cpu_up(cpu, 0, CPUHP_ONLINE);
+ if (ret)
+ break;
+ /* See comment in cpuhp_smt_disable() */
+ cpuhp_online_cpu_device(cpu);
+ }
+ cpu_maps_update_done();
+ return ret;
+}
+#endif
+
+#if defined(CONFIG_SYSFS) && defined(CONFIG_HOTPLUG_CPU)
+static ssize_t state_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
+
+ return sprintf(buf, "%d\n", st->state);
+}
+static DEVICE_ATTR_RO(state);
+
+static ssize_t target_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
+ struct cpuhp_step *sp;
+ int target, ret;
+
+ ret = kstrtoint(buf, 10, &target);
+ if (ret)
+ return ret;
+
+#ifdef CONFIG_CPU_HOTPLUG_STATE_CONTROL
+ if (target < CPUHP_OFFLINE || target > CPUHP_ONLINE)
+ return -EINVAL;
+#else
+ if (target != CPUHP_OFFLINE && target != CPUHP_ONLINE)
+ return -EINVAL;
+#endif
+
+ ret = lock_device_hotplug_sysfs();
+ if (ret)
+ return ret;
+
+ mutex_lock(&cpuhp_state_mutex);
+ sp = cpuhp_get_step(target);
+ ret = !sp->name || sp->cant_stop ? -EINVAL : 0;
+ mutex_unlock(&cpuhp_state_mutex);
+ if (ret)
+ goto out;
+
+ if (st->state < target)
+ ret = cpu_up(dev->id, target);
+ else if (st->state > target)
+ ret = cpu_down(dev->id, target);
+ else if (WARN_ON(st->target != target))
+ st->target = target;
+out:
+ unlock_device_hotplug();
+ return ret ? ret : count;
+}
+
+static ssize_t target_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
+
+ return sprintf(buf, "%d\n", st->target);
+}
+static DEVICE_ATTR_RW(target);
+
+static ssize_t fail_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
+ struct cpuhp_step *sp;
+ int fail, ret;
+
+ ret = kstrtoint(buf, 10, &fail);
+ if (ret)
+ return ret;
+
+ if (fail == CPUHP_INVALID) {
+ st->fail = fail;
+ return count;
+ }
+
+ if (fail < CPUHP_OFFLINE || fail > CPUHP_ONLINE)
+ return -EINVAL;
+
+ /*
+ * Cannot fail STARTING/DYING callbacks.
+ */
+ if (cpuhp_is_atomic_state(fail))
+ return -EINVAL;
+
+ /*
+ * DEAD callbacks cannot fail...
+ * ... neither can CPUHP_BRINGUP_CPU during hotunplug. The latter
+ * triggering STARTING callbacks, a failure in this state would
+ * hinder rollback.
+ */
+ if (fail <= CPUHP_BRINGUP_CPU && st->state > CPUHP_BRINGUP_CPU)
+ return -EINVAL;
+
+ /*
+ * Cannot fail anything that doesn't have callbacks.
+ */
+ mutex_lock(&cpuhp_state_mutex);
+ sp = cpuhp_get_step(fail);
+ if (!sp->startup.single && !sp->teardown.single)
+ ret = -EINVAL;
+ mutex_unlock(&cpuhp_state_mutex);
+ if (ret)
+ return ret;
+
+ st->fail = fail;
+
+ return count;
+}
+
+static ssize_t fail_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
+
+ return sprintf(buf, "%d\n", st->fail);
+}
+
+static DEVICE_ATTR_RW(fail);
+
+static struct attribute *cpuhp_cpu_attrs[] = {
+ &dev_attr_state.attr,
+ &dev_attr_target.attr,
+ &dev_attr_fail.attr,
+ NULL
+};
+
+static const struct attribute_group cpuhp_cpu_attr_group = {
+ .attrs = cpuhp_cpu_attrs,
+ .name = "hotplug",
+};
+
+static ssize_t states_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ ssize_t cur, res = 0;
+ int i;
+
+ mutex_lock(&cpuhp_state_mutex);
+ for (i = CPUHP_OFFLINE; i <= CPUHP_ONLINE; i++) {
+ struct cpuhp_step *sp = cpuhp_get_step(i);
+
+ if (sp->name) {
+ cur = sprintf(buf, "%3d: %s\n", i, sp->name);
+ buf += cur;
+ res += cur;
+ }
+ }
+ mutex_unlock(&cpuhp_state_mutex);
+ return res;
+}
+static DEVICE_ATTR_RO(states);
+
+static struct attribute *cpuhp_cpu_root_attrs[] = {
+ &dev_attr_states.attr,
+ NULL
+};
+
+static const struct attribute_group cpuhp_cpu_root_attr_group = {
+ .attrs = cpuhp_cpu_root_attrs,
+ .name = "hotplug",
+};
+
+#ifdef CONFIG_HOTPLUG_SMT
+
+static bool cpu_smt_num_threads_valid(unsigned int threads)
+{
+ if (IS_ENABLED(CONFIG_SMT_NUM_THREADS_DYNAMIC))
+ return threads >= 1 && threads <= cpu_smt_max_threads;
+ return threads == 1 || threads == cpu_smt_max_threads;
+}
+
+static ssize_t
+__store_smt_control(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ int ctrlval, ret, num_threads, orig_threads;
+ bool force_off;
+
+ if (cpu_smt_control == CPU_SMT_FORCE_DISABLED)
+ return -EPERM;
+
+ if (cpu_smt_control == CPU_SMT_NOT_SUPPORTED)
+ return -ENODEV;
+
+ if (sysfs_streq(buf, "on")) {
+ ctrlval = CPU_SMT_ENABLED;
+ num_threads = cpu_smt_max_threads;
+ } else if (sysfs_streq(buf, "off")) {
+ ctrlval = CPU_SMT_DISABLED;
+ num_threads = 1;
+ } else if (sysfs_streq(buf, "forceoff")) {
+ ctrlval = CPU_SMT_FORCE_DISABLED;
+ num_threads = 1;
+ } else if (kstrtoint(buf, 10, &num_threads) == 0) {
+ if (num_threads == 1)
+ ctrlval = CPU_SMT_DISABLED;
+ else if (cpu_smt_num_threads_valid(num_threads))
+ ctrlval = CPU_SMT_ENABLED;
+ else
+ return -EINVAL;
+ } else {
+ return -EINVAL;
+ }
+
+ ret = lock_device_hotplug_sysfs();
+ if (ret)
+ return ret;
+
+ orig_threads = cpu_smt_num_threads;
+ cpu_smt_num_threads = num_threads;
+
+ force_off = ctrlval != cpu_smt_control && ctrlval == CPU_SMT_FORCE_DISABLED;
+
+ if (num_threads > orig_threads)
+ ret = cpuhp_smt_enable();
+ else if (num_threads < orig_threads || force_off)
+ ret = cpuhp_smt_disable(ctrlval);
+
+ unlock_device_hotplug();
+ return ret ? ret : count;
+}
+
+#else /* !CONFIG_HOTPLUG_SMT */
+static ssize_t
+__store_smt_control(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ return -ENODEV;
+}
+#endif /* CONFIG_HOTPLUG_SMT */
+
+static const char *smt_states[] = {
+ [CPU_SMT_ENABLED] = "on",
+ [CPU_SMT_DISABLED] = "off",
+ [CPU_SMT_FORCE_DISABLED] = "forceoff",
+ [CPU_SMT_NOT_SUPPORTED] = "notsupported",
+ [CPU_SMT_NOT_IMPLEMENTED] = "notimplemented",
+};
+
+static ssize_t control_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ const char *state = smt_states[cpu_smt_control];
+
+#ifdef CONFIG_HOTPLUG_SMT
+ /*
+ * If SMT is enabled but not all threads are enabled then show the
+ * number of threads. If all threads are enabled show "on". Otherwise
+ * show the state name.
+ */
+ if (cpu_smt_control == CPU_SMT_ENABLED &&
+ cpu_smt_num_threads != cpu_smt_max_threads)
+ return sysfs_emit(buf, "%d\n", cpu_smt_num_threads);
+#endif
+
+ return sysfs_emit(buf, "%s\n", state);
+}
+
+static ssize_t control_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ return __store_smt_control(dev, attr, buf, count);
+}
+static DEVICE_ATTR_RW(control);
+
+static ssize_t active_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", sched_smt_active());
+}
+static DEVICE_ATTR_RO(active);
+
+static struct attribute *cpuhp_smt_attrs[] = {
+ &dev_attr_control.attr,
+ &dev_attr_active.attr,
+ NULL
+};
+
+static const struct attribute_group cpuhp_smt_attr_group = {
+ .attrs = cpuhp_smt_attrs,
+ .name = "smt",
+};
+
+static int __init cpu_smt_sysfs_init(void)
+{
+ struct device *dev_root;
+ int ret = -ENODEV;
+
+ dev_root = bus_get_dev_root(&cpu_subsys);
+ if (dev_root) {
+ ret = sysfs_create_group(&dev_root->kobj, &cpuhp_smt_attr_group);
+ put_device(dev_root);
+ }
+ return ret;
+}
+
+static int __init cpuhp_sysfs_init(void)
+{
+ struct device *dev_root;
+ int cpu, ret;
+
+ ret = cpu_smt_sysfs_init();
+ if (ret)
+ return ret;
+
+ dev_root = bus_get_dev_root(&cpu_subsys);
+ if (dev_root) {
+ ret = sysfs_create_group(&dev_root->kobj, &cpuhp_cpu_root_attr_group);
+ put_device(dev_root);
+ if (ret)
+ return ret;
+ }
+
+ for_each_possible_cpu(cpu) {
+ struct device *dev = get_cpu_device(cpu);
+
+ if (!dev)
+ continue;
+ ret = sysfs_create_group(&dev->kobj, &cpuhp_cpu_attr_group);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+device_initcall(cpuhp_sysfs_init);
+#endif /* CONFIG_SYSFS && CONFIG_HOTPLUG_CPU */
/*
* cpu_bit_bitmap[] is a special, "compressed" data structure that
@@ -755,71 +3083,265 @@ const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL;
EXPORT_SYMBOL(cpu_all_bits);
#ifdef CONFIG_INIT_ALL_POSSIBLE
-static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly
- = CPU_BITS_ALL;
+struct cpumask __cpu_possible_mask __ro_after_init
+ = {CPU_BITS_ALL};
+unsigned int __num_possible_cpus __ro_after_init = NR_CPUS;
#else
-static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly;
+struct cpumask __cpu_possible_mask __ro_after_init;
+unsigned int __num_possible_cpus __ro_after_init;
#endif
-const struct cpumask *const cpu_possible_mask = to_cpumask(cpu_possible_bits);
-EXPORT_SYMBOL(cpu_possible_mask);
+EXPORT_SYMBOL(__cpu_possible_mask);
+EXPORT_SYMBOL(__num_possible_cpus);
-static DECLARE_BITMAP(cpu_online_bits, CONFIG_NR_CPUS) __read_mostly;
-const struct cpumask *const cpu_online_mask = to_cpumask(cpu_online_bits);
-EXPORT_SYMBOL(cpu_online_mask);
+struct cpumask __cpu_online_mask __read_mostly;
+EXPORT_SYMBOL(__cpu_online_mask);
-static DECLARE_BITMAP(cpu_present_bits, CONFIG_NR_CPUS) __read_mostly;
-const struct cpumask *const cpu_present_mask = to_cpumask(cpu_present_bits);
-EXPORT_SYMBOL(cpu_present_mask);
+struct cpumask __cpu_enabled_mask __read_mostly;
+EXPORT_SYMBOL(__cpu_enabled_mask);
-static DECLARE_BITMAP(cpu_active_bits, CONFIG_NR_CPUS) __read_mostly;
-const struct cpumask *const cpu_active_mask = to_cpumask(cpu_active_bits);
-EXPORT_SYMBOL(cpu_active_mask);
+struct cpumask __cpu_present_mask __read_mostly;
+EXPORT_SYMBOL(__cpu_present_mask);
-void set_cpu_possible(unsigned int cpu, bool possible)
+struct cpumask __cpu_active_mask __read_mostly;
+EXPORT_SYMBOL(__cpu_active_mask);
+
+struct cpumask __cpu_dying_mask __read_mostly;
+EXPORT_SYMBOL(__cpu_dying_mask);
+
+atomic_t __num_online_cpus __read_mostly;
+EXPORT_SYMBOL(__num_online_cpus);
+
+void init_cpu_present(const struct cpumask *src)
{
- if (possible)
- cpumask_set_cpu(cpu, to_cpumask(cpu_possible_bits));
- else
- cpumask_clear_cpu(cpu, to_cpumask(cpu_possible_bits));
+ cpumask_copy(&__cpu_present_mask, src);
}
-void set_cpu_present(unsigned int cpu, bool present)
+void init_cpu_possible(const struct cpumask *src)
{
- if (present)
- cpumask_set_cpu(cpu, to_cpumask(cpu_present_bits));
- else
- cpumask_clear_cpu(cpu, to_cpumask(cpu_present_bits));
+ cpumask_copy(&__cpu_possible_mask, src);
+ __num_possible_cpus = cpumask_weight(&__cpu_possible_mask);
}
void set_cpu_online(unsigned int cpu, bool online)
{
+ /*
+ * atomic_inc/dec() is required to handle the horrid abuse of this
+ * function by the reboot and kexec code which invoke it from
+ * IPI/NMI broadcasts when shutting down CPUs. Invocation from
+ * regular CPU hotplug is properly serialized.
+ *
+ * Note, that the fact that __num_online_cpus is of type atomic_t
+ * does not protect readers which are not serialized against
+ * concurrent hotplug operations.
+ */
if (online) {
- cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits));
- cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits));
+ if (!cpumask_test_and_set_cpu(cpu, &__cpu_online_mask))
+ atomic_inc(&__num_online_cpus);
} else {
- cpumask_clear_cpu(cpu, to_cpumask(cpu_online_bits));
+ if (cpumask_test_and_clear_cpu(cpu, &__cpu_online_mask))
+ atomic_dec(&__num_online_cpus);
}
}
-void set_cpu_active(unsigned int cpu, bool active)
+/*
+ * This should be marked __init, but there is a boatload of call sites
+ * which need to be fixed up to do so. Sigh...
+ */
+void set_cpu_possible(unsigned int cpu, bool possible)
{
- if (active)
- cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits));
- else
- cpumask_clear_cpu(cpu, to_cpumask(cpu_active_bits));
+ if (possible) {
+ if (!cpumask_test_and_set_cpu(cpu, &__cpu_possible_mask))
+ __num_possible_cpus++;
+ } else {
+ if (cpumask_test_and_clear_cpu(cpu, &__cpu_possible_mask))
+ __num_possible_cpus--;
+ }
}
-void init_cpu_present(const struct cpumask *src)
+/*
+ * Activate the first processor.
+ */
+void __init boot_cpu_init(void)
{
- cpumask_copy(to_cpumask(cpu_present_bits), src);
+ int cpu = smp_processor_id();
+
+ /* Mark the boot cpu "present", "online" etc for SMP and UP case */
+ set_cpu_online(cpu, true);
+ set_cpu_active(cpu, true);
+ set_cpu_present(cpu, true);
+ set_cpu_possible(cpu, true);
+
+#ifdef CONFIG_SMP
+ __boot_cpu_id = cpu;
+#endif
}
-void init_cpu_possible(const struct cpumask *src)
+/*
+ * Must be called _AFTER_ setting up the per_cpu areas
+ */
+void __init boot_cpu_hotplug_init(void)
{
- cpumask_copy(to_cpumask(cpu_possible_bits), src);
+#ifdef CONFIG_SMP
+ cpumask_set_cpu(smp_processor_id(), &cpus_booted_once_mask);
+ atomic_set(this_cpu_ptr(&cpuhp_state.ap_sync_state), SYNC_STATE_ONLINE);
+#endif
+ this_cpu_write(cpuhp_state.state, CPUHP_ONLINE);
+ this_cpu_write(cpuhp_state.target, CPUHP_ONLINE);
}
-void init_cpu_online(const struct cpumask *src)
+#ifdef CONFIG_CPU_MITIGATIONS
+/*
+ * All except the cross-thread attack vector are mitigated by default.
+ * Cross-thread mitigation often requires disabling SMT which is expensive
+ * so cross-thread mitigations are only partially enabled by default.
+ *
+ * Guest-to-Host and Guest-to-Guest vectors are only needed if KVM support is
+ * present.
+ */
+static bool attack_vectors[NR_CPU_ATTACK_VECTORS] __ro_after_init = {
+ [CPU_MITIGATE_USER_KERNEL] = true,
+ [CPU_MITIGATE_USER_USER] = true,
+ [CPU_MITIGATE_GUEST_HOST] = IS_ENABLED(CONFIG_KVM),
+ [CPU_MITIGATE_GUEST_GUEST] = IS_ENABLED(CONFIG_KVM),
+};
+
+bool cpu_attack_vector_mitigated(enum cpu_attack_vectors v)
{
- cpumask_copy(to_cpumask(cpu_online_bits), src);
+ if (v < NR_CPU_ATTACK_VECTORS)
+ return attack_vectors[v];
+
+ WARN_ONCE(1, "Invalid attack vector %d\n", v);
+ return false;
}
+
+/*
+ * There are 3 global options, 'off', 'auto', 'auto,nosmt'. These may optionally
+ * be combined with attack-vector disables which follow them.
+ *
+ * Examples:
+ * mitigations=auto,no_user_kernel,no_user_user,no_cross_thread
+ * mitigations=auto,nosmt,no_guest_host,no_guest_guest
+ *
+ * mitigations=off is equivalent to disabling all attack vectors.
+ */
+enum cpu_mitigations {
+ CPU_MITIGATIONS_OFF,
+ CPU_MITIGATIONS_AUTO,
+ CPU_MITIGATIONS_AUTO_NOSMT,
+};
+
+enum {
+ NO_USER_KERNEL,
+ NO_USER_USER,
+ NO_GUEST_HOST,
+ NO_GUEST_GUEST,
+ NO_CROSS_THREAD,
+ NR_VECTOR_PARAMS,
+};
+
+enum smt_mitigations smt_mitigations __ro_after_init = SMT_MITIGATIONS_AUTO;
+static enum cpu_mitigations cpu_mitigations __ro_after_init = CPU_MITIGATIONS_AUTO;
+
+static const match_table_t global_mitigations = {
+ { CPU_MITIGATIONS_AUTO_NOSMT, "auto,nosmt"},
+ { CPU_MITIGATIONS_AUTO, "auto"},
+ { CPU_MITIGATIONS_OFF, "off"},
+};
+
+static const match_table_t vector_mitigations = {
+ { NO_USER_KERNEL, "no_user_kernel"},
+ { NO_USER_USER, "no_user_user"},
+ { NO_GUEST_HOST, "no_guest_host"},
+ { NO_GUEST_GUEST, "no_guest_guest"},
+ { NO_CROSS_THREAD, "no_cross_thread"},
+ { NR_VECTOR_PARAMS, NULL},
+};
+
+static int __init mitigations_parse_global_opt(char *arg)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(global_mitigations); i++) {
+ const char *pattern = global_mitigations[i].pattern;
+
+ if (!strncmp(arg, pattern, strlen(pattern))) {
+ cpu_mitigations = global_mitigations[i].token;
+ return strlen(pattern);
+ }
+ }
+
+ return 0;
+}
+
+static int __init mitigations_parse_cmdline(char *arg)
+{
+ char *s, *p;
+ int len;
+
+ len = mitigations_parse_global_opt(arg);
+
+ if (cpu_mitigations_off()) {
+ memset(attack_vectors, 0, sizeof(attack_vectors));
+ smt_mitigations = SMT_MITIGATIONS_OFF;
+ } else if (cpu_mitigations_auto_nosmt()) {
+ smt_mitigations = SMT_MITIGATIONS_ON;
+ }
+
+ p = arg + len;
+
+ if (!*p)
+ return 0;
+
+ /* Attack vector controls may come after the ',' */
+ if (*p++ != ',' || !IS_ENABLED(CONFIG_ARCH_HAS_CPU_ATTACK_VECTORS)) {
+ pr_crit("Unsupported mitigations=%s, system may still be vulnerable\n", arg);
+ return 0;
+ }
+
+ while ((s = strsep(&p, ",")) != NULL) {
+ switch (match_token(s, vector_mitigations, NULL)) {
+ case NO_USER_KERNEL:
+ attack_vectors[CPU_MITIGATE_USER_KERNEL] = false;
+ break;
+ case NO_USER_USER:
+ attack_vectors[CPU_MITIGATE_USER_USER] = false;
+ break;
+ case NO_GUEST_HOST:
+ attack_vectors[CPU_MITIGATE_GUEST_HOST] = false;
+ break;
+ case NO_GUEST_GUEST:
+ attack_vectors[CPU_MITIGATE_GUEST_GUEST] = false;
+ break;
+ case NO_CROSS_THREAD:
+ smt_mitigations = SMT_MITIGATIONS_OFF;
+ break;
+ default:
+ pr_crit("Unsupported mitigations options %s\n", s);
+ return 0;
+ }
+ }
+
+ return 0;
+}
+
+/* mitigations=off */
+bool cpu_mitigations_off(void)
+{
+ return cpu_mitigations == CPU_MITIGATIONS_OFF;
+}
+EXPORT_SYMBOL_GPL(cpu_mitigations_off);
+
+/* mitigations=auto,nosmt */
+bool cpu_mitigations_auto_nosmt(void)
+{
+ return cpu_mitigations == CPU_MITIGATIONS_AUTO_NOSMT;
+}
+EXPORT_SYMBOL_GPL(cpu_mitigations_auto_nosmt);
+#else
+static int __init mitigations_parse_cmdline(char *arg)
+{
+ pr_crit("Kernel compiled without mitigations, ignoring 'mitigations'; system may still be vulnerable\n");
+ return 0;
+}
+#endif
+early_param("mitigations", mitigations_parse_cmdline);
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
index 9656a3c36503..7481fbb947d3 100644
--- a/kernel/cpu_pm.c
+++ b/kernel/cpu_pm.c
@@ -1,18 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2011 Google, Inc.
*
* Author:
* Colin Cross <ccross@android.com>
- *
- * This software is licensed under the terms of the GNU General Public
- * License version 2, as published by the Free Software Foundation, and
- * may be copied, distributed, and modified under those terms.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
*/
#include <linux/kernel.h>
@@ -22,15 +13,38 @@
#include <linux/spinlock.h>
#include <linux/syscore_ops.h>
-static DEFINE_RWLOCK(cpu_pm_notifier_lock);
-static RAW_NOTIFIER_HEAD(cpu_pm_notifier_chain);
+/*
+ * atomic_notifiers use a spinlock_t, which can block under PREEMPT_RT.
+ * Notifications for cpu_pm will be issued by the idle task itself, which can
+ * never block, IOW it requires using a raw_spinlock_t.
+ */
+static struct {
+ struct raw_notifier_head chain;
+ raw_spinlock_t lock;
+} cpu_pm_notifier = {
+ .chain = RAW_NOTIFIER_INIT(cpu_pm_notifier.chain),
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(cpu_pm_notifier.lock),
+};
-static int cpu_pm_notify(enum cpu_pm_event event, int nr_to_call, int *nr_calls)
+static int cpu_pm_notify(enum cpu_pm_event event)
{
int ret;
- ret = __raw_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL,
- nr_to_call, nr_calls);
+ rcu_read_lock();
+ ret = raw_notifier_call_chain(&cpu_pm_notifier.chain, event, NULL);
+ rcu_read_unlock();
+
+ return notifier_to_errno(ret);
+}
+
+static int cpu_pm_notify_robust(enum cpu_pm_event event_up, enum cpu_pm_event event_down)
+{
+ unsigned long flags;
+ int ret;
+
+ raw_spin_lock_irqsave(&cpu_pm_notifier.lock, flags);
+ ret = raw_notifier_call_chain_robust(&cpu_pm_notifier.chain, event_up, event_down, NULL);
+ raw_spin_unlock_irqrestore(&cpu_pm_notifier.lock, flags);
return notifier_to_errno(ret);
}
@@ -42,18 +56,16 @@ static int cpu_pm_notify(enum cpu_pm_event event, int nr_to_call, int *nr_calls)
* Add a driver to a list of drivers that are notified about
* CPU and CPU cluster low power entry and exit.
*
- * This function may sleep, and has the same return conditions as
- * raw_notifier_chain_register.
+ * This function has the same return conditions as raw_notifier_chain_register.
*/
int cpu_pm_register_notifier(struct notifier_block *nb)
{
unsigned long flags;
int ret;
- write_lock_irqsave(&cpu_pm_notifier_lock, flags);
- ret = raw_notifier_chain_register(&cpu_pm_notifier_chain, nb);
- write_unlock_irqrestore(&cpu_pm_notifier_lock, flags);
-
+ raw_spin_lock_irqsave(&cpu_pm_notifier.lock, flags);
+ ret = raw_notifier_chain_register(&cpu_pm_notifier.chain, nb);
+ raw_spin_unlock_irqrestore(&cpu_pm_notifier.lock, flags);
return ret;
}
EXPORT_SYMBOL_GPL(cpu_pm_register_notifier);
@@ -64,18 +76,16 @@ EXPORT_SYMBOL_GPL(cpu_pm_register_notifier);
*
* Remove a driver from the CPU PM notifier list.
*
- * This function may sleep, and has the same return conditions as
- * raw_notifier_chain_unregister.
+ * This function has the same return conditions as raw_notifier_chain_unregister.
*/
int cpu_pm_unregister_notifier(struct notifier_block *nb)
{
unsigned long flags;
int ret;
- write_lock_irqsave(&cpu_pm_notifier_lock, flags);
- ret = raw_notifier_chain_unregister(&cpu_pm_notifier_chain, nb);
- write_unlock_irqrestore(&cpu_pm_notifier_lock, flags);
-
+ raw_spin_lock_irqsave(&cpu_pm_notifier.lock, flags);
+ ret = raw_notifier_chain_unregister(&cpu_pm_notifier.chain, nb);
+ raw_spin_unlock_irqrestore(&cpu_pm_notifier.lock, flags);
return ret;
}
EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier);
@@ -97,20 +107,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier);
*/
int cpu_pm_enter(void)
{
- int nr_calls;
- int ret = 0;
-
- read_lock(&cpu_pm_notifier_lock);
- ret = cpu_pm_notify(CPU_PM_ENTER, -1, &nr_calls);
- if (ret)
- /*
- * Inform listeners (nr_calls - 1) about failure of CPU PM
- * PM entry who are notified earlier to prepare for it.
- */
- cpu_pm_notify(CPU_PM_ENTER_FAILED, nr_calls - 1, NULL);
- read_unlock(&cpu_pm_notifier_lock);
-
- return ret;
+ return cpu_pm_notify_robust(CPU_PM_ENTER, CPU_PM_ENTER_FAILED);
}
EXPORT_SYMBOL_GPL(cpu_pm_enter);
@@ -128,13 +125,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_enter);
*/
int cpu_pm_exit(void)
{
- int ret;
-
- read_lock(&cpu_pm_notifier_lock);
- ret = cpu_pm_notify(CPU_PM_EXIT, -1, NULL);
- read_unlock(&cpu_pm_notifier_lock);
-
- return ret;
+ return cpu_pm_notify(CPU_PM_EXIT);
}
EXPORT_SYMBOL_GPL(cpu_pm_exit);
@@ -156,20 +147,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_exit);
*/
int cpu_cluster_pm_enter(void)
{
- int nr_calls;
- int ret = 0;
-
- read_lock(&cpu_pm_notifier_lock);
- ret = cpu_pm_notify(CPU_CLUSTER_PM_ENTER, -1, &nr_calls);
- if (ret)
- /*
- * Inform listeners (nr_calls - 1) about failure of CPU cluster
- * PM entry who are notified earlier to prepare for it.
- */
- cpu_pm_notify(CPU_CLUSTER_PM_ENTER_FAILED, nr_calls - 1, NULL);
- read_unlock(&cpu_pm_notifier_lock);
-
- return ret;
+ return cpu_pm_notify_robust(CPU_CLUSTER_PM_ENTER, CPU_CLUSTER_PM_ENTER_FAILED);
}
EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter);
@@ -180,7 +158,7 @@ EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter);
* low power state that may have caused some blocks in the same power domain
* to reset.
*
- * Must be called after cpu_pm_exit has been called on all cpus in the power
+ * Must be called after cpu_cluster_pm_enter has been called for the power
* domain, and before cpu_pm_exit has been called on any cpu in the power
* domain. Notified drivers can include VFP co-processor, interrupt controller
* and its PM extensions, local CPU timers context save/restore which
@@ -190,18 +168,12 @@ EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter);
*/
int cpu_cluster_pm_exit(void)
{
- int ret;
-
- read_lock(&cpu_pm_notifier_lock);
- ret = cpu_pm_notify(CPU_CLUSTER_PM_EXIT, -1, NULL);
- read_unlock(&cpu_pm_notifier_lock);
-
- return ret;
+ return cpu_pm_notify(CPU_CLUSTER_PM_EXIT);
}
EXPORT_SYMBOL_GPL(cpu_cluster_pm_exit);
#ifdef CONFIG_PM
-static int cpu_pm_suspend(void)
+static int cpu_pm_suspend(void *data)
{
int ret;
@@ -213,20 +185,24 @@ static int cpu_pm_suspend(void)
return ret;
}
-static void cpu_pm_resume(void)
+static void cpu_pm_resume(void *data)
{
cpu_cluster_pm_exit();
cpu_pm_exit();
}
-static struct syscore_ops cpu_pm_syscore_ops = {
+static const struct syscore_ops cpu_pm_syscore_ops = {
.suspend = cpu_pm_suspend,
.resume = cpu_pm_resume,
};
+static struct syscore cpu_pm_syscore = {
+ .ops = &cpu_pm_syscore_ops,
+};
+
static int cpu_pm_init(void)
{
- register_syscore_ops(&cpu_pm_syscore_ops);
+ register_syscore(&cpu_pm_syscore);
return 0;
}
core_initcall(cpu_pm_init);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
deleted file mode 100644
index ee14e3a35a29..000000000000
--- a/kernel/cpuset.c
+++ /dev/null
@@ -1,2700 +0,0 @@
-/*
- * kernel/cpuset.c
- *
- * Processor and Memory placement constraints for sets of tasks.
- *
- * Copyright (C) 2003 BULL SA.
- * Copyright (C) 2004-2007 Silicon Graphics, Inc.
- * Copyright (C) 2006 Google, Inc
- *
- * Portions derived from Patrick Mochel's sysfs code.
- * sysfs is Copyright (c) 2001-3 Patrick Mochel
- *
- * 2003-10-10 Written by Simon Derr.
- * 2003-10-22 Updates by Stephen Hemminger.
- * 2004 May-July Rework by Paul Jackson.
- * 2006 Rework by Paul Menage to use generic cgroups
- * 2008 Rework of the scheduler domains and CPU hotplug handling
- * by Max Krasnyansky
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License. See the file COPYING in the main directory of the Linux
- * distribution for more details.
- */
-
-#include <linux/cpu.h>
-#include <linux/cpumask.h>
-#include <linux/cpuset.h>
-#include <linux/err.h>
-#include <linux/errno.h>
-#include <linux/file.h>
-#include <linux/fs.h>
-#include <linux/init.h>
-#include <linux/interrupt.h>
-#include <linux/kernel.h>
-#include <linux/kmod.h>
-#include <linux/list.h>
-#include <linux/mempolicy.h>
-#include <linux/mm.h>
-#include <linux/memory.h>
-#include <linux/export.h>
-#include <linux/mount.h>
-#include <linux/namei.h>
-#include <linux/pagemap.h>
-#include <linux/proc_fs.h>
-#include <linux/rcupdate.h>
-#include <linux/sched.h>
-#include <linux/seq_file.h>
-#include <linux/security.h>
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/stat.h>
-#include <linux/string.h>
-#include <linux/time.h>
-#include <linux/backing-dev.h>
-#include <linux/sort.h>
-
-#include <asm/uaccess.h>
-#include <linux/atomic.h>
-#include <linux/mutex.h>
-#include <linux/workqueue.h>
-#include <linux/cgroup.h>
-#include <linux/wait.h>
-
-struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE;
-
-/* See "Frequency meter" comments, below. */
-
-struct fmeter {
- int cnt; /* unprocessed events count */
- int val; /* most recent output value */
- time_t time; /* clock (secs) when val computed */
- spinlock_t lock; /* guards read or write of above */
-};
-
-struct cpuset {
- struct cgroup_subsys_state css;
-
- unsigned long flags; /* "unsigned long" so bitops work */
-
- /*
- * On default hierarchy:
- *
- * The user-configured masks can only be changed by writing to
- * cpuset.cpus and cpuset.mems, and won't be limited by the
- * parent masks.
- *
- * The effective masks is the real masks that apply to the tasks
- * in the cpuset. They may be changed if the configured masks are
- * changed or hotplug happens.
- *
- * effective_mask == configured_mask & parent's effective_mask,
- * and if it ends up empty, it will inherit the parent's mask.
- *
- *
- * On legacy hierachy:
- *
- * The user-configured masks are always the same with effective masks.
- */
-
- /* user-configured CPUs and Memory Nodes allow to tasks */
- cpumask_var_t cpus_allowed;
- nodemask_t mems_allowed;
-
- /* effective CPUs and Memory Nodes allow to tasks */
- cpumask_var_t effective_cpus;
- nodemask_t effective_mems;
-
- /*
- * This is old Memory Nodes tasks took on.
- *
- * - top_cpuset.old_mems_allowed is initialized to mems_allowed.
- * - A new cpuset's old_mems_allowed is initialized when some
- * task is moved into it.
- * - old_mems_allowed is used in cpuset_migrate_mm() when we change
- * cpuset.mems_allowed and have tasks' nodemask updated, and
- * then old_mems_allowed is updated to mems_allowed.
- */
- nodemask_t old_mems_allowed;
-
- struct fmeter fmeter; /* memory_pressure filter */
-
- /*
- * Tasks are being attached to this cpuset. Used to prevent
- * zeroing cpus/mems_allowed between ->can_attach() and ->attach().
- */
- int attach_in_progress;
-
- /* partition number for rebuild_sched_domains() */
- int pn;
-
- /* for custom sched domain */
- int relax_domain_level;
-};
-
-static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
-{
- return css ? container_of(css, struct cpuset, css) : NULL;
-}
-
-/* Retrieve the cpuset for a task */
-static inline struct cpuset *task_cs(struct task_struct *task)
-{
- return css_cs(task_css(task, cpuset_cgrp_id));
-}
-
-static inline struct cpuset *parent_cs(struct cpuset *cs)
-{
- return css_cs(cs->css.parent);
-}
-
-#ifdef CONFIG_NUMA
-static inline bool task_has_mempolicy(struct task_struct *task)
-{
- return task->mempolicy;
-}
-#else
-static inline bool task_has_mempolicy(struct task_struct *task)
-{
- return false;
-}
-#endif
-
-
-/* bits in struct cpuset flags field */
-typedef enum {
- CS_ONLINE,
- CS_CPU_EXCLUSIVE,
- CS_MEM_EXCLUSIVE,
- CS_MEM_HARDWALL,
- CS_MEMORY_MIGRATE,
- CS_SCHED_LOAD_BALANCE,
- CS_SPREAD_PAGE,
- CS_SPREAD_SLAB,
-} cpuset_flagbits_t;
-
-/* convenient tests for these bits */
-static inline bool is_cpuset_online(const struct cpuset *cs)
-{
- return test_bit(CS_ONLINE, &cs->flags);
-}
-
-static inline int is_cpu_exclusive(const struct cpuset *cs)
-{
- return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
-}
-
-static inline int is_mem_exclusive(const struct cpuset *cs)
-{
- return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
-}
-
-static inline int is_mem_hardwall(const struct cpuset *cs)
-{
- return test_bit(CS_MEM_HARDWALL, &cs->flags);
-}
-
-static inline int is_sched_load_balance(const struct cpuset *cs)
-{
- return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
-}
-
-static inline int is_memory_migrate(const struct cpuset *cs)
-{
- return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
-}
-
-static inline int is_spread_page(const struct cpuset *cs)
-{
- return test_bit(CS_SPREAD_PAGE, &cs->flags);
-}
-
-static inline int is_spread_slab(const struct cpuset *cs)
-{
- return test_bit(CS_SPREAD_SLAB, &cs->flags);
-}
-
-static struct cpuset top_cpuset = {
- .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
- (1 << CS_MEM_EXCLUSIVE)),
-};
-
-/**
- * cpuset_for_each_child - traverse online children of a cpuset
- * @child_cs: loop cursor pointing to the current child
- * @pos_css: used for iteration
- * @parent_cs: target cpuset to walk children of
- *
- * Walk @child_cs through the online children of @parent_cs. Must be used
- * with RCU read locked.
- */
-#define cpuset_for_each_child(child_cs, pos_css, parent_cs) \
- css_for_each_child((pos_css), &(parent_cs)->css) \
- if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))
-
-/**
- * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
- * @des_cs: loop cursor pointing to the current descendant
- * @pos_css: used for iteration
- * @root_cs: target cpuset to walk ancestor of
- *
- * Walk @des_cs through the online descendants of @root_cs. Must be used
- * with RCU read locked. The caller may modify @pos_css by calling
- * css_rightmost_descendant() to skip subtree. @root_cs is included in the
- * iteration and the first node to be visited.
- */
-#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs) \
- css_for_each_descendant_pre((pos_css), &(root_cs)->css) \
- if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
-
-/*
- * There are two global locks guarding cpuset structures - cpuset_mutex and
- * callback_lock. We also require taking task_lock() when dereferencing a
- * task's cpuset pointer. See "The task_lock() exception", at the end of this
- * comment.
- *
- * A task must hold both locks to modify cpusets. If a task holds
- * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
- * is the only task able to also acquire callback_lock and be able to
- * modify cpusets. It can perform various checks on the cpuset structure
- * first, knowing nothing will change. It can also allocate memory while
- * just holding cpuset_mutex. While it is performing these checks, various
- * callback routines can briefly acquire callback_lock to query cpusets.
- * Once it is ready to make the changes, it takes callback_lock, blocking
- * everyone else.
- *
- * Calls to the kernel memory allocator can not be made while holding
- * callback_lock, as that would risk double tripping on callback_lock
- * from one of the callbacks into the cpuset code from within
- * __alloc_pages().
- *
- * If a task is only holding callback_lock, then it has read-only
- * access to cpusets.
- *
- * Now, the task_struct fields mems_allowed and mempolicy may be changed
- * by other task, we use alloc_lock in the task_struct fields to protect
- * them.
- *
- * The cpuset_common_file_read() handlers only hold callback_lock across
- * small pieces of code, such as when reading out possibly multi-word
- * cpumasks and nodemasks.
- *
- * Accessing a task's cpuset should be done in accordance with the
- * guidelines for accessing subsystem state in kernel/cgroup.c
- */
-
-static DEFINE_MUTEX(cpuset_mutex);
-static DEFINE_SPINLOCK(callback_lock);
-
-/*
- * CPU / memory hotplug is handled asynchronously.
- */
-static void cpuset_hotplug_workfn(struct work_struct *work);
-static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
-
-static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
-
-/*
- * This is ugly, but preserves the userspace API for existing cpuset
- * users. If someone tries to mount the "cpuset" filesystem, we
- * silently switch it to mount "cgroup" instead
- */
-static struct dentry *cpuset_mount(struct file_system_type *fs_type,
- int flags, const char *unused_dev_name, void *data)
-{
- struct file_system_type *cgroup_fs = get_fs_type("cgroup");
- struct dentry *ret = ERR_PTR(-ENODEV);
- if (cgroup_fs) {
- char mountopts[] =
- "cpuset,noprefix,"
- "release_agent=/sbin/cpuset_release_agent";
- ret = cgroup_fs->mount(cgroup_fs, flags,
- unused_dev_name, mountopts);
- put_filesystem(cgroup_fs);
- }
- return ret;
-}
-
-static struct file_system_type cpuset_fs_type = {
- .name = "cpuset",
- .mount = cpuset_mount,
-};
-
-/*
- * Return in pmask the portion of a cpusets's cpus_allowed that
- * are online. If none are online, walk up the cpuset hierarchy
- * until we find one that does have some online cpus. The top
- * cpuset always has some cpus online.
- *
- * One way or another, we guarantee to return some non-empty subset
- * of cpu_online_mask.
- *
- * Call with callback_lock or cpuset_mutex held.
- */
-static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
-{
- while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask))
- cs = parent_cs(cs);
- cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
-}
-
-/*
- * Return in *pmask the portion of a cpusets's mems_allowed that
- * are online, with memory. If none are online with memory, walk
- * up the cpuset hierarchy until we find one that does have some
- * online mems. The top cpuset always has some mems online.
- *
- * One way or another, we guarantee to return some non-empty subset
- * of node_states[N_MEMORY].
- *
- * Call with callback_lock or cpuset_mutex held.
- */
-static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
-{
- while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
- cs = parent_cs(cs);
- nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
-}
-
-/*
- * update task's spread flag if cpuset's page/slab spread flag is set
- *
- * Call with callback_lock or cpuset_mutex held.
- */
-static void cpuset_update_task_spread_flag(struct cpuset *cs,
- struct task_struct *tsk)
-{
- if (is_spread_page(cs))
- task_set_spread_page(tsk);
- else
- task_clear_spread_page(tsk);
-
- if (is_spread_slab(cs))
- task_set_spread_slab(tsk);
- else
- task_clear_spread_slab(tsk);
-}
-
-/*
- * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
- *
- * One cpuset is a subset of another if all its allowed CPUs and
- * Memory Nodes are a subset of the other, and its exclusive flags
- * are only set if the other's are set. Call holding cpuset_mutex.
- */
-
-static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
-{
- return cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
- nodes_subset(p->mems_allowed, q->mems_allowed) &&
- is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
- is_mem_exclusive(p) <= is_mem_exclusive(q);
-}
-
-/**
- * alloc_trial_cpuset - allocate a trial cpuset
- * @cs: the cpuset that the trial cpuset duplicates
- */
-static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
-{
- struct cpuset *trial;
-
- trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
- if (!trial)
- return NULL;
-
- if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL))
- goto free_cs;
- if (!alloc_cpumask_var(&trial->effective_cpus, GFP_KERNEL))
- goto free_cpus;
-
- cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
- cpumask_copy(trial->effective_cpus, cs->effective_cpus);
- return trial;
-
-free_cpus:
- free_cpumask_var(trial->cpus_allowed);
-free_cs:
- kfree(trial);
- return NULL;
-}
-
-/**
- * free_trial_cpuset - free the trial cpuset
- * @trial: the trial cpuset to be freed
- */
-static void free_trial_cpuset(struct cpuset *trial)
-{
- free_cpumask_var(trial->effective_cpus);
- free_cpumask_var(trial->cpus_allowed);
- kfree(trial);
-}
-
-/*
- * validate_change() - Used to validate that any proposed cpuset change
- * follows the structural rules for cpusets.
- *
- * If we replaced the flag and mask values of the current cpuset
- * (cur) with those values in the trial cpuset (trial), would
- * our various subset and exclusive rules still be valid? Presumes
- * cpuset_mutex held.
- *
- * 'cur' is the address of an actual, in-use cpuset. Operations
- * such as list traversal that depend on the actual address of the
- * cpuset in the list must use cur below, not trial.
- *
- * 'trial' is the address of bulk structure copy of cur, with
- * perhaps one or more of the fields cpus_allowed, mems_allowed,
- * or flags changed to new, trial values.
- *
- * Return 0 if valid, -errno if not.
- */
-
-static int validate_change(struct cpuset *cur, struct cpuset *trial)
-{
- struct cgroup_subsys_state *css;
- struct cpuset *c, *par;
- int ret;
-
- rcu_read_lock();
-
- /* Each of our child cpusets must be a subset of us */
- ret = -EBUSY;
- cpuset_for_each_child(c, css, cur)
- if (!is_cpuset_subset(c, trial))
- goto out;
-
- /* Remaining checks don't apply to root cpuset */
- ret = 0;
- if (cur == &top_cpuset)
- goto out;
-
- par = parent_cs(cur);
-
- /* On legacy hiearchy, we must be a subset of our parent cpuset. */
- ret = -EACCES;
- if (!cgroup_on_dfl(cur->css.cgroup) && !is_cpuset_subset(trial, par))
- goto out;
-
- /*
- * If either I or some sibling (!= me) is exclusive, we can't
- * overlap
- */
- ret = -EINVAL;
- cpuset_for_each_child(c, css, par) {
- if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
- c != cur &&
- cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
- goto out;
- if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
- c != cur &&
- nodes_intersects(trial->mems_allowed, c->mems_allowed))
- goto out;
- }
-
- /*
- * Cpusets with tasks - existing or newly being attached - can't
- * be changed to have empty cpus_allowed or mems_allowed.
- */
- ret = -ENOSPC;
- if ((cgroup_has_tasks(cur->css.cgroup) || cur->attach_in_progress)) {
- if (!cpumask_empty(cur->cpus_allowed) &&
- cpumask_empty(trial->cpus_allowed))
- goto out;
- if (!nodes_empty(cur->mems_allowed) &&
- nodes_empty(trial->mems_allowed))
- goto out;
- }
-
- /*
- * We can't shrink if we won't have enough room for SCHED_DEADLINE
- * tasks.
- */
- ret = -EBUSY;
- if (is_cpu_exclusive(cur) &&
- !cpuset_cpumask_can_shrink(cur->cpus_allowed,
- trial->cpus_allowed))
- goto out;
-
- ret = 0;
-out:
- rcu_read_unlock();
- return ret;
-}
-
-#ifdef CONFIG_SMP
-/*
- * Helper routine for generate_sched_domains().
- * Do cpusets a, b have overlapping effective cpus_allowed masks?
- */
-static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
-{
- return cpumask_intersects(a->effective_cpus, b->effective_cpus);
-}
-
-static void
-update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
-{
- if (dattr->relax_domain_level < c->relax_domain_level)
- dattr->relax_domain_level = c->relax_domain_level;
- return;
-}
-
-static void update_domain_attr_tree(struct sched_domain_attr *dattr,
- struct cpuset *root_cs)
-{
- struct cpuset *cp;
- struct cgroup_subsys_state *pos_css;
-
- rcu_read_lock();
- cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
- /* skip the whole subtree if @cp doesn't have any CPU */
- if (cpumask_empty(cp->cpus_allowed)) {
- pos_css = css_rightmost_descendant(pos_css);
- continue;
- }
-
- if (is_sched_load_balance(cp))
- update_domain_attr(dattr, cp);
- }
- rcu_read_unlock();
-}
-
-/*
- * generate_sched_domains()
- *
- * This function builds a partial partition of the systems CPUs
- * A 'partial partition' is a set of non-overlapping subsets whose
- * union is a subset of that set.
- * The output of this function needs to be passed to kernel/sched/core.c
- * partition_sched_domains() routine, which will rebuild the scheduler's
- * load balancing domains (sched domains) as specified by that partial
- * partition.
- *
- * See "What is sched_load_balance" in Documentation/cgroups/cpusets.txt
- * for a background explanation of this.
- *
- * Does not return errors, on the theory that the callers of this
- * routine would rather not worry about failures to rebuild sched
- * domains when operating in the severe memory shortage situations
- * that could cause allocation failures below.
- *
- * Must be called with cpuset_mutex held.
- *
- * The three key local variables below are:
- * q - a linked-list queue of cpuset pointers, used to implement a
- * top-down scan of all cpusets. This scan loads a pointer
- * to each cpuset marked is_sched_load_balance into the
- * array 'csa'. For our purposes, rebuilding the schedulers
- * sched domains, we can ignore !is_sched_load_balance cpusets.
- * csa - (for CpuSet Array) Array of pointers to all the cpusets
- * that need to be load balanced, for convenient iterative
- * access by the subsequent code that finds the best partition,
- * i.e the set of domains (subsets) of CPUs such that the
- * cpus_allowed of every cpuset marked is_sched_load_balance
- * is a subset of one of these domains, while there are as
- * many such domains as possible, each as small as possible.
- * doms - Conversion of 'csa' to an array of cpumasks, for passing to
- * the kernel/sched/core.c routine partition_sched_domains() in a
- * convenient format, that can be easily compared to the prior
- * value to determine what partition elements (sched domains)
- * were changed (added or removed.)
- *
- * Finding the best partition (set of domains):
- * The triple nested loops below over i, j, k scan over the
- * load balanced cpusets (using the array of cpuset pointers in
- * csa[]) looking for pairs of cpusets that have overlapping
- * cpus_allowed, but which don't have the same 'pn' partition
- * number and gives them in the same partition number. It keeps
- * looping on the 'restart' label until it can no longer find
- * any such pairs.
- *
- * The union of the cpus_allowed masks from the set of
- * all cpusets having the same 'pn' value then form the one
- * element of the partition (one sched domain) to be passed to
- * partition_sched_domains().
- */
-static int generate_sched_domains(cpumask_var_t **domains,
- struct sched_domain_attr **attributes)
-{
- struct cpuset *cp; /* scans q */
- struct cpuset **csa; /* array of all cpuset ptrs */
- int csn; /* how many cpuset ptrs in csa so far */
- int i, j, k; /* indices for partition finding loops */
- cpumask_var_t *doms; /* resulting partition; i.e. sched domains */
- cpumask_var_t non_isolated_cpus; /* load balanced CPUs */
- struct sched_domain_attr *dattr; /* attributes for custom domains */
- int ndoms = 0; /* number of sched domains in result */
- int nslot; /* next empty doms[] struct cpumask slot */
- struct cgroup_subsys_state *pos_css;
-
- doms = NULL;
- dattr = NULL;
- csa = NULL;
-
- if (!alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL))
- goto done;
- cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
-
- /* Special case for the 99% of systems with one, full, sched domain */
- if (is_sched_load_balance(&top_cpuset)) {
- ndoms = 1;
- doms = alloc_sched_domains(ndoms);
- if (!doms)
- goto done;
-
- dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
- if (dattr) {
- *dattr = SD_ATTR_INIT;
- update_domain_attr_tree(dattr, &top_cpuset);
- }
- cpumask_and(doms[0], top_cpuset.effective_cpus,
- non_isolated_cpus);
-
- goto done;
- }
-
- csa = kmalloc(nr_cpusets() * sizeof(cp), GFP_KERNEL);
- if (!csa)
- goto done;
- csn = 0;
-
- rcu_read_lock();
- cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
- if (cp == &top_cpuset)
- continue;
- /*
- * Continue traversing beyond @cp iff @cp has some CPUs and
- * isn't load balancing. The former is obvious. The
- * latter: All child cpusets contain a subset of the
- * parent's cpus, so just skip them, and then we call
- * update_domain_attr_tree() to calc relax_domain_level of
- * the corresponding sched domain.
- */
- if (!cpumask_empty(cp->cpus_allowed) &&
- !(is_sched_load_balance(cp) &&
- cpumask_intersects(cp->cpus_allowed, non_isolated_cpus)))
- continue;
-
- if (is_sched_load_balance(cp))
- csa[csn++] = cp;
-
- /* skip @cp's subtree */
- pos_css = css_rightmost_descendant(pos_css);
- }
- rcu_read_unlock();
-
- for (i = 0; i < csn; i++)
- csa[i]->pn = i;
- ndoms = csn;
-
-restart:
- /* Find the best partition (set of sched domains) */
- for (i = 0; i < csn; i++) {
- struct cpuset *a = csa[i];
- int apn = a->pn;
-
- for (j = 0; j < csn; j++) {
- struct cpuset *b = csa[j];
- int bpn = b->pn;
-
- if (apn != bpn && cpusets_overlap(a, b)) {
- for (k = 0; k < csn; k++) {
- struct cpuset *c = csa[k];
-
- if (c->pn == bpn)
- c->pn = apn;
- }
- ndoms--; /* one less element */
- goto restart;
- }
- }
- }
-
- /*
- * Now we know how many domains to create.
- * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
- */
- doms = alloc_sched_domains(ndoms);
- if (!doms)
- goto done;
-
- /*
- * The rest of the code, including the scheduler, can deal with
- * dattr==NULL case. No need to abort if alloc fails.
- */
- dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL);
-
- for (nslot = 0, i = 0; i < csn; i++) {
- struct cpuset *a = csa[i];
- struct cpumask *dp;
- int apn = a->pn;
-
- if (apn < 0) {
- /* Skip completed partitions */
- continue;
- }
-
- dp = doms[nslot];
-
- if (nslot == ndoms) {
- static int warnings = 10;
- if (warnings) {
- pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n",
- nslot, ndoms, csn, i, apn);
- warnings--;
- }
- continue;
- }
-
- cpumask_clear(dp);
- if (dattr)
- *(dattr + nslot) = SD_ATTR_INIT;
- for (j = i; j < csn; j++) {
- struct cpuset *b = csa[j];
-
- if (apn == b->pn) {
- cpumask_or(dp, dp, b->effective_cpus);
- cpumask_and(dp, dp, non_isolated_cpus);
- if (dattr)
- update_domain_attr_tree(dattr + nslot, b);
-
- /* Done with this partition */
- b->pn = -1;
- }
- }
- nslot++;
- }
- BUG_ON(nslot != ndoms);
-
-done:
- free_cpumask_var(non_isolated_cpus);
- kfree(csa);
-
- /*
- * Fallback to the default domain if kmalloc() failed.
- * See comments in partition_sched_domains().
- */
- if (doms == NULL)
- ndoms = 1;
-
- *domains = doms;
- *attributes = dattr;
- return ndoms;
-}
-
-/*
- * Rebuild scheduler domains.
- *
- * If the flag 'sched_load_balance' of any cpuset with non-empty
- * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
- * which has that flag enabled, or if any cpuset with a non-empty
- * 'cpus' is removed, then call this routine to rebuild the
- * scheduler's dynamic sched domains.
- *
- * Call with cpuset_mutex held. Takes get_online_cpus().
- */
-static void rebuild_sched_domains_locked(void)
-{
- struct sched_domain_attr *attr;
- cpumask_var_t *doms;
- int ndoms;
-
- lockdep_assert_held(&cpuset_mutex);
- get_online_cpus();
-
- /*
- * We have raced with CPU hotplug. Don't do anything to avoid
- * passing doms with offlined cpu to partition_sched_domains().
- * Anyways, hotplug work item will rebuild sched domains.
- */
- if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
- goto out;
-
- /* Generate domain masks and attrs */
- ndoms = generate_sched_domains(&doms, &attr);
-
- /* Have scheduler rebuild the domains */
- partition_sched_domains(ndoms, doms, attr);
-out:
- put_online_cpus();
-}
-#else /* !CONFIG_SMP */
-static void rebuild_sched_domains_locked(void)
-{
-}
-#endif /* CONFIG_SMP */
-
-void rebuild_sched_domains(void)
-{
- mutex_lock(&cpuset_mutex);
- rebuild_sched_domains_locked();
- mutex_unlock(&cpuset_mutex);
-}
-
-/**
- * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
- * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
- *
- * Iterate through each task of @cs updating its cpus_allowed to the
- * effective cpuset's. As this function is called with cpuset_mutex held,
- * cpuset membership stays stable.
- */
-static void update_tasks_cpumask(struct cpuset *cs)
-{
- struct css_task_iter it;
- struct task_struct *task;
-
- css_task_iter_start(&cs->css, &it);
- while ((task = css_task_iter_next(&it)))
- set_cpus_allowed_ptr(task, cs->effective_cpus);
- css_task_iter_end(&it);
-}
-
-/*
- * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
- * @cs: the cpuset to consider
- * @new_cpus: temp variable for calculating new effective_cpus
- *
- * When congifured cpumask is changed, the effective cpumasks of this cpuset
- * and all its descendants need to be updated.
- *
- * On legacy hierachy, effective_cpus will be the same with cpu_allowed.
- *
- * Called with cpuset_mutex held
- */
-static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
-{
- struct cpuset *cp;
- struct cgroup_subsys_state *pos_css;
- bool need_rebuild_sched_domains = false;
-
- rcu_read_lock();
- cpuset_for_each_descendant_pre(cp, pos_css, cs) {
- struct cpuset *parent = parent_cs(cp);
-
- cpumask_and(new_cpus, cp->cpus_allowed, parent->effective_cpus);
-
- /*
- * If it becomes empty, inherit the effective mask of the
- * parent, which is guaranteed to have some CPUs.
- */
- if (cgroup_on_dfl(cp->css.cgroup) && cpumask_empty(new_cpus))
- cpumask_copy(new_cpus, parent->effective_cpus);
-
- /* Skip the whole subtree if the cpumask remains the same. */
- if (cpumask_equal(new_cpus, cp->effective_cpus)) {
- pos_css = css_rightmost_descendant(pos_css);
- continue;
- }
-
- if (!css_tryget_online(&cp->css))
- continue;
- rcu_read_unlock();
-
- spin_lock_irq(&callback_lock);
- cpumask_copy(cp->effective_cpus, new_cpus);
- spin_unlock_irq(&callback_lock);
-
- WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
- !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
-
- update_tasks_cpumask(cp);
-
- /*
- * If the effective cpumask of any non-empty cpuset is changed,
- * we need to rebuild sched domains.
- */
- if (!cpumask_empty(cp->cpus_allowed) &&
- is_sched_load_balance(cp))
- need_rebuild_sched_domains = true;
-
- rcu_read_lock();
- css_put(&cp->css);
- }
- rcu_read_unlock();
-
- if (need_rebuild_sched_domains)
- rebuild_sched_domains_locked();
-}
-
-/**
- * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
- * @cs: the cpuset to consider
- * @trialcs: trial cpuset
- * @buf: buffer of cpu numbers written to this cpuset
- */
-static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
- const char *buf)
-{
- int retval;
-
- /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
- if (cs == &top_cpuset)
- return -EACCES;
-
- /*
- * An empty cpus_allowed is ok only if the cpuset has no tasks.
- * Since cpulist_parse() fails on an empty mask, we special case
- * that parsing. The validate_change() call ensures that cpusets
- * with tasks have cpus.
- */
- if (!*buf) {
- cpumask_clear(trialcs->cpus_allowed);
- } else {
- retval = cpulist_parse(buf, trialcs->cpus_allowed);
- if (retval < 0)
- return retval;
-
- if (!cpumask_subset(trialcs->cpus_allowed,
- top_cpuset.cpus_allowed))
- return -EINVAL;
- }
-
- /* Nothing to do if the cpus didn't change */
- if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
- return 0;
-
- retval = validate_change(cs, trialcs);
- if (retval < 0)
- return retval;
-
- spin_lock_irq(&callback_lock);
- cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
- spin_unlock_irq(&callback_lock);
-
- /* use trialcs->cpus_allowed as a temp variable */
- update_cpumasks_hier(cs, trialcs->cpus_allowed);
- return 0;
-}
-
-/*
- * cpuset_migrate_mm
- *
- * Migrate memory region from one set of nodes to another.
- *
- * Temporarilly set tasks mems_allowed to target nodes of migration,
- * so that the migration code can allocate pages on these nodes.
- *
- * While the mm_struct we are migrating is typically from some
- * other task, the task_struct mems_allowed that we are hacking
- * is for our current task, which must allocate new pages for that
- * migrating memory region.
- */
-
-static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
- const nodemask_t *to)
-{
- struct task_struct *tsk = current;
-
- tsk->mems_allowed = *to;
-
- do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
-
- rcu_read_lock();
- guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed);
- rcu_read_unlock();
-}
-
-/*
- * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
- * @tsk: the task to change
- * @newmems: new nodes that the task will be set
- *
- * In order to avoid seeing no nodes if the old and new nodes are disjoint,
- * we structure updates as setting all new allowed nodes, then clearing newly
- * disallowed ones.
- */
-static void cpuset_change_task_nodemask(struct task_struct *tsk,
- nodemask_t *newmems)
-{
- bool need_loop;
-
- /*
- * Allow tasks that have access to memory reserves because they have
- * been OOM killed to get memory anywhere.
- */
- if (unlikely(test_thread_flag(TIF_MEMDIE)))
- return;
- if (current->flags & PF_EXITING) /* Let dying task have memory */
- return;
-
- task_lock(tsk);
- /*
- * Determine if a loop is necessary if another thread is doing
- * read_mems_allowed_begin(). If at least one node remains unchanged and
- * tsk does not have a mempolicy, then an empty nodemask will not be
- * possible when mems_allowed is larger than a word.
- */
- need_loop = task_has_mempolicy(tsk) ||
- !nodes_intersects(*newmems, tsk->mems_allowed);
-
- if (need_loop) {
- local_irq_disable();
- write_seqcount_begin(&tsk->mems_allowed_seq);
- }
-
- nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
- mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
-
- mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
- tsk->mems_allowed = *newmems;
-
- if (need_loop) {
- write_seqcount_end(&tsk->mems_allowed_seq);
- local_irq_enable();
- }
-
- task_unlock(tsk);
-}
-
-static void *cpuset_being_rebound;
-
-/**
- * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
- * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
- *
- * Iterate through each task of @cs updating its mems_allowed to the
- * effective cpuset's. As this function is called with cpuset_mutex held,
- * cpuset membership stays stable.
- */
-static void update_tasks_nodemask(struct cpuset *cs)
-{
- static nodemask_t newmems; /* protected by cpuset_mutex */
- struct css_task_iter it;
- struct task_struct *task;
-
- cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
-
- guarantee_online_mems(cs, &newmems);
-
- /*
- * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
- * take while holding tasklist_lock. Forks can happen - the
- * mpol_dup() cpuset_being_rebound check will catch such forks,
- * and rebind their vma mempolicies too. Because we still hold
- * the global cpuset_mutex, we know that no other rebind effort
- * will be contending for the global variable cpuset_being_rebound.
- * It's ok if we rebind the same mm twice; mpol_rebind_mm()
- * is idempotent. Also migrate pages in each mm to new nodes.
- */
- css_task_iter_start(&cs->css, &it);
- while ((task = css_task_iter_next(&it))) {
- struct mm_struct *mm;
- bool migrate;
-
- cpuset_change_task_nodemask(task, &newmems);
-
- mm = get_task_mm(task);
- if (!mm)
- continue;
-
- migrate = is_memory_migrate(cs);
-
- mpol_rebind_mm(mm, &cs->mems_allowed);
- if (migrate)
- cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
- mmput(mm);
- }
- css_task_iter_end(&it);
-
- /*
- * All the tasks' nodemasks have been updated, update
- * cs->old_mems_allowed.
- */
- cs->old_mems_allowed = newmems;
-
- /* We're done rebinding vmas to this cpuset's new mems_allowed. */
- cpuset_being_rebound = NULL;
-}
-
-/*
- * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree
- * @cs: the cpuset to consider
- * @new_mems: a temp variable for calculating new effective_mems
- *
- * When configured nodemask is changed, the effective nodemasks of this cpuset
- * and all its descendants need to be updated.
- *
- * On legacy hiearchy, effective_mems will be the same with mems_allowed.
- *
- * Called with cpuset_mutex held
- */
-static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
-{
- struct cpuset *cp;
- struct cgroup_subsys_state *pos_css;
-
- rcu_read_lock();
- cpuset_for_each_descendant_pre(cp, pos_css, cs) {
- struct cpuset *parent = parent_cs(cp);
-
- nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);
-
- /*
- * If it becomes empty, inherit the effective mask of the
- * parent, which is guaranteed to have some MEMs.
- */
- if (cgroup_on_dfl(cp->css.cgroup) && nodes_empty(*new_mems))
- *new_mems = parent->effective_mems;
-
- /* Skip the whole subtree if the nodemask remains the same. */
- if (nodes_equal(*new_mems, cp->effective_mems)) {
- pos_css = css_rightmost_descendant(pos_css);
- continue;
- }
-
- if (!css_tryget_online(&cp->css))
- continue;
- rcu_read_unlock();
-
- spin_lock_irq(&callback_lock);
- cp->effective_mems = *new_mems;
- spin_unlock_irq(&callback_lock);
-
- WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
- !nodes_equal(cp->mems_allowed, cp->effective_mems));
-
- update_tasks_nodemask(cp);
-
- rcu_read_lock();
- css_put(&cp->css);
- }
- rcu_read_unlock();
-}
-
-/*
- * Handle user request to change the 'mems' memory placement
- * of a cpuset. Needs to validate the request, update the
- * cpusets mems_allowed, and for each task in the cpuset,
- * update mems_allowed and rebind task's mempolicy and any vma
- * mempolicies and if the cpuset is marked 'memory_migrate',
- * migrate the tasks pages to the new memory.
- *
- * Call with cpuset_mutex held. May take callback_lock during call.
- * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
- * lock each such tasks mm->mmap_sem, scan its vma's and rebind
- * their mempolicies to the cpusets new mems_allowed.
- */
-static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
- const char *buf)
-{
- int retval;
-
- /*
- * top_cpuset.mems_allowed tracks node_stats[N_MEMORY];
- * it's read-only
- */
- if (cs == &top_cpuset) {
- retval = -EACCES;
- goto done;
- }
-
- /*
- * An empty mems_allowed is ok iff there are no tasks in the cpuset.
- * Since nodelist_parse() fails on an empty mask, we special case
- * that parsing. The validate_change() call ensures that cpusets
- * with tasks have memory.
- */
- if (!*buf) {
- nodes_clear(trialcs->mems_allowed);
- } else {
- retval = nodelist_parse(buf, trialcs->mems_allowed);
- if (retval < 0)
- goto done;
-
- if (!nodes_subset(trialcs->mems_allowed,
- top_cpuset.mems_allowed)) {
- retval = -EINVAL;
- goto done;
- }
- }
-
- if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) {
- retval = 0; /* Too easy - nothing to do */
- goto done;
- }
- retval = validate_change(cs, trialcs);
- if (retval < 0)
- goto done;
-
- spin_lock_irq(&callback_lock);
- cs->mems_allowed = trialcs->mems_allowed;
- spin_unlock_irq(&callback_lock);
-
- /* use trialcs->mems_allowed as a temp variable */
- update_nodemasks_hier(cs, &cs->mems_allowed);
-done:
- return retval;
-}
-
-int current_cpuset_is_being_rebound(void)
-{
- int ret;
-
- rcu_read_lock();
- ret = task_cs(current) == cpuset_being_rebound;
- rcu_read_unlock();
-
- return ret;
-}
-
-static int update_relax_domain_level(struct cpuset *cs, s64 val)
-{
-#ifdef CONFIG_SMP
- if (val < -1 || val >= sched_domain_level_max)
- return -EINVAL;
-#endif
-
- if (val != cs->relax_domain_level) {
- cs->relax_domain_level = val;
- if (!cpumask_empty(cs->cpus_allowed) &&
- is_sched_load_balance(cs))
- rebuild_sched_domains_locked();
- }
-
- return 0;
-}
-
-/**
- * update_tasks_flags - update the spread flags of tasks in the cpuset.
- * @cs: the cpuset in which each task's spread flags needs to be changed
- *
- * Iterate through each task of @cs updating its spread flags. As this
- * function is called with cpuset_mutex held, cpuset membership stays
- * stable.
- */
-static void update_tasks_flags(struct cpuset *cs)
-{
- struct css_task_iter it;
- struct task_struct *task;
-
- css_task_iter_start(&cs->css, &it);
- while ((task = css_task_iter_next(&it)))
- cpuset_update_task_spread_flag(cs, task);
- css_task_iter_end(&it);
-}
-
-/*
- * update_flag - read a 0 or a 1 in a file and update associated flag
- * bit: the bit to update (see cpuset_flagbits_t)
- * cs: the cpuset to update
- * turning_on: whether the flag is being set or cleared
- *
- * Call with cpuset_mutex held.
- */
-
-static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
- int turning_on)
-{
- struct cpuset *trialcs;
- int balance_flag_changed;
- int spread_flag_changed;
- int err;
-
- trialcs = alloc_trial_cpuset(cs);
- if (!trialcs)
- return -ENOMEM;
-
- if (turning_on)
- set_bit(bit, &trialcs->flags);
- else
- clear_bit(bit, &trialcs->flags);
-
- err = validate_change(cs, trialcs);
- if (err < 0)
- goto out;
-
- balance_flag_changed = (is_sched_load_balance(cs) !=
- is_sched_load_balance(trialcs));
-
- spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
- || (is_spread_page(cs) != is_spread_page(trialcs)));
-
- spin_lock_irq(&callback_lock);
- cs->flags = trialcs->flags;
- spin_unlock_irq(&callback_lock);
-
- if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
- rebuild_sched_domains_locked();
-
- if (spread_flag_changed)
- update_tasks_flags(cs);
-out:
- free_trial_cpuset(trialcs);
- return err;
-}
-
-/*
- * Frequency meter - How fast is some event occurring?
- *
- * These routines manage a digitally filtered, constant time based,
- * event frequency meter. There are four routines:
- * fmeter_init() - initialize a frequency meter.
- * fmeter_markevent() - called each time the event happens.
- * fmeter_getrate() - returns the recent rate of such events.
- * fmeter_update() - internal routine used to update fmeter.
- *
- * A common data structure is passed to each of these routines,
- * which is used to keep track of the state required to manage the
- * frequency meter and its digital filter.
- *
- * The filter works on the number of events marked per unit time.
- * The filter is single-pole low-pass recursive (IIR). The time unit
- * is 1 second. Arithmetic is done using 32-bit integers scaled to
- * simulate 3 decimal digits of precision (multiplied by 1000).
- *
- * With an FM_COEF of 933, and a time base of 1 second, the filter
- * has a half-life of 10 seconds, meaning that if the events quit
- * happening, then the rate returned from the fmeter_getrate()
- * will be cut in half each 10 seconds, until it converges to zero.
- *
- * It is not worth doing a real infinitely recursive filter. If more
- * than FM_MAXTICKS ticks have elapsed since the last filter event,
- * just compute FM_MAXTICKS ticks worth, by which point the level
- * will be stable.
- *
- * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
- * arithmetic overflow in the fmeter_update() routine.
- *
- * Given the simple 32 bit integer arithmetic used, this meter works
- * best for reporting rates between one per millisecond (msec) and
- * one per 32 (approx) seconds. At constant rates faster than one
- * per msec it maxes out at values just under 1,000,000. At constant
- * rates between one per msec, and one per second it will stabilize
- * to a value N*1000, where N is the rate of events per second.
- * At constant rates between one per second and one per 32 seconds,
- * it will be choppy, moving up on the seconds that have an event,
- * and then decaying until the next event. At rates slower than
- * about one in 32 seconds, it decays all the way back to zero between
- * each event.
- */
-
-#define FM_COEF 933 /* coefficient for half-life of 10 secs */
-#define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */
-#define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */
-#define FM_SCALE 1000 /* faux fixed point scale */
-
-/* Initialize a frequency meter */
-static void fmeter_init(struct fmeter *fmp)
-{
- fmp->cnt = 0;
- fmp->val = 0;
- fmp->time = 0;
- spin_lock_init(&fmp->lock);
-}
-
-/* Internal meter update - process cnt events and update value */
-static void fmeter_update(struct fmeter *fmp)
-{
- time_t now = get_seconds();
- time_t ticks = now - fmp->time;
-
- if (ticks == 0)
- return;
-
- ticks = min(FM_MAXTICKS, ticks);
- while (ticks-- > 0)
- fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
- fmp->time = now;
-
- fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
- fmp->cnt = 0;
-}
-
-/* Process any previous ticks, then bump cnt by one (times scale). */
-static void fmeter_markevent(struct fmeter *fmp)
-{
- spin_lock(&fmp->lock);
- fmeter_update(fmp);
- fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
- spin_unlock(&fmp->lock);
-}
-
-/* Process any previous ticks, then return current value. */
-static int fmeter_getrate(struct fmeter *fmp)
-{
- int val;
-
- spin_lock(&fmp->lock);
- fmeter_update(fmp);
- val = fmp->val;
- spin_unlock(&fmp->lock);
- return val;
-}
-
-static struct cpuset *cpuset_attach_old_cs;
-
-/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
-static int cpuset_can_attach(struct cgroup_subsys_state *css,
- struct cgroup_taskset *tset)
-{
- struct cpuset *cs = css_cs(css);
- struct task_struct *task;
- int ret;
-
- /* used later by cpuset_attach() */
- cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset));
-
- mutex_lock(&cpuset_mutex);
-
- /* allow moving tasks into an empty cpuset if on default hierarchy */
- ret = -ENOSPC;
- if (!cgroup_on_dfl(css->cgroup) &&
- (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
- goto out_unlock;
-
- cgroup_taskset_for_each(task, tset) {
- ret = task_can_attach(task, cs->cpus_allowed);
- if (ret)
- goto out_unlock;
- ret = security_task_setscheduler(task);
- if (ret)
- goto out_unlock;
- }
-
- /*
- * Mark attach is in progress. This makes validate_change() fail
- * changes which zero cpus/mems_allowed.
- */
- cs->attach_in_progress++;
- ret = 0;
-out_unlock:
- mutex_unlock(&cpuset_mutex);
- return ret;
-}
-
-static void cpuset_cancel_attach(struct cgroup_subsys_state *css,
- struct cgroup_taskset *tset)
-{
- mutex_lock(&cpuset_mutex);
- css_cs(css)->attach_in_progress--;
- mutex_unlock(&cpuset_mutex);
-}
-
-/*
- * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach()
- * but we can't allocate it dynamically there. Define it global and
- * allocate from cpuset_init().
- */
-static cpumask_var_t cpus_attach;
-
-static void cpuset_attach(struct cgroup_subsys_state *css,
- struct cgroup_taskset *tset)
-{
- /* static buf protected by cpuset_mutex */
- static nodemask_t cpuset_attach_nodemask_to;
- struct mm_struct *mm;
- struct task_struct *task;
- struct task_struct *leader = cgroup_taskset_first(tset);
- struct cpuset *cs = css_cs(css);
- struct cpuset *oldcs = cpuset_attach_old_cs;
-
- mutex_lock(&cpuset_mutex);
-
- /* prepare for attach */
- if (cs == &top_cpuset)
- cpumask_copy(cpus_attach, cpu_possible_mask);
- else
- guarantee_online_cpus(cs, cpus_attach);
-
- guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
-
- cgroup_taskset_for_each(task, tset) {
- /*
- * can_attach beforehand should guarantee that this doesn't
- * fail. TODO: have a better way to handle failure here
- */
- WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
-
- cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
- cpuset_update_task_spread_flag(cs, task);
- }
-
- /*
- * Change mm, possibly for multiple threads in a threadgroup. This is
- * expensive and may sleep.
- */
- cpuset_attach_nodemask_to = cs->effective_mems;
- mm = get_task_mm(leader);
- if (mm) {
- mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
-
- /*
- * old_mems_allowed is the same with mems_allowed here, except
- * if this task is being moved automatically due to hotplug.
- * In that case @mems_allowed has been updated and is empty,
- * so @old_mems_allowed is the right nodesets that we migrate
- * mm from.
- */
- if (is_memory_migrate(cs)) {
- cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
- &cpuset_attach_nodemask_to);
- }
- mmput(mm);
- }
-
- cs->old_mems_allowed = cpuset_attach_nodemask_to;
-
- cs->attach_in_progress--;
- if (!cs->attach_in_progress)
- wake_up(&cpuset_attach_wq);
-
- mutex_unlock(&cpuset_mutex);
-}
-
-/* The various types of files and directories in a cpuset file system */
-
-typedef enum {
- FILE_MEMORY_MIGRATE,
- FILE_CPULIST,
- FILE_MEMLIST,
- FILE_EFFECTIVE_CPULIST,
- FILE_EFFECTIVE_MEMLIST,
- FILE_CPU_EXCLUSIVE,
- FILE_MEM_EXCLUSIVE,
- FILE_MEM_HARDWALL,
- FILE_SCHED_LOAD_BALANCE,
- FILE_SCHED_RELAX_DOMAIN_LEVEL,
- FILE_MEMORY_PRESSURE_ENABLED,
- FILE_MEMORY_PRESSURE,
- FILE_SPREAD_PAGE,
- FILE_SPREAD_SLAB,
-} cpuset_filetype_t;
-
-static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
- u64 val)
-{
- struct cpuset *cs = css_cs(css);
- cpuset_filetype_t type = cft->private;
- int retval = 0;
-
- mutex_lock(&cpuset_mutex);
- if (!is_cpuset_online(cs)) {
- retval = -ENODEV;
- goto out_unlock;
- }
-
- switch (type) {
- case FILE_CPU_EXCLUSIVE:
- retval = update_flag(CS_CPU_EXCLUSIVE, cs, val);
- break;
- case FILE_MEM_EXCLUSIVE:
- retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
- break;
- case FILE_MEM_HARDWALL:
- retval = update_flag(CS_MEM_HARDWALL, cs, val);
- break;
- case FILE_SCHED_LOAD_BALANCE:
- retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
- break;
- case FILE_MEMORY_MIGRATE:
- retval = update_flag(CS_MEMORY_MIGRATE, cs, val);
- break;
- case FILE_MEMORY_PRESSURE_ENABLED:
- cpuset_memory_pressure_enabled = !!val;
- break;
- case FILE_MEMORY_PRESSURE:
- retval = -EACCES;
- break;
- case FILE_SPREAD_PAGE:
- retval = update_flag(CS_SPREAD_PAGE, cs, val);
- break;
- case FILE_SPREAD_SLAB:
- retval = update_flag(CS_SPREAD_SLAB, cs, val);
- break;
- default:
- retval = -EINVAL;
- break;
- }
-out_unlock:
- mutex_unlock(&cpuset_mutex);
- return retval;
-}
-
-static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
- s64 val)
-{
- struct cpuset *cs = css_cs(css);
- cpuset_filetype_t type = cft->private;
- int retval = -ENODEV;
-
- mutex_lock(&cpuset_mutex);
- if (!is_cpuset_online(cs))
- goto out_unlock;
-
- switch (type) {
- case FILE_SCHED_RELAX_DOMAIN_LEVEL:
- retval = update_relax_domain_level(cs, val);
- break;
- default:
- retval = -EINVAL;
- break;
- }
-out_unlock:
- mutex_unlock(&cpuset_mutex);
- return retval;
-}
-
-/*
- * Common handling for a write to a "cpus" or "mems" file.
- */
-static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off)
-{
- struct cpuset *cs = css_cs(of_css(of));
- struct cpuset *trialcs;
- int retval = -ENODEV;
-
- buf = strstrip(buf);
-
- /*
- * CPU or memory hotunplug may leave @cs w/o any execution
- * resources, in which case the hotplug code asynchronously updates
- * configuration and transfers all tasks to the nearest ancestor
- * which can execute.
- *
- * As writes to "cpus" or "mems" may restore @cs's execution
- * resources, wait for the previously scheduled operations before
- * proceeding, so that we don't end up keep removing tasks added
- * after execution capability is restored.
- *
- * cpuset_hotplug_work calls back into cgroup core via
- * cgroup_transfer_tasks() and waiting for it from a cgroupfs
- * operation like this one can lead to a deadlock through kernfs
- * active_ref protection. Let's break the protection. Losing the
- * protection is okay as we check whether @cs is online after
- * grabbing cpuset_mutex anyway. This only happens on the legacy
- * hierarchies.
- */
- css_get(&cs->css);
- kernfs_break_active_protection(of->kn);
- flush_work(&cpuset_hotplug_work);
-
- mutex_lock(&cpuset_mutex);
- if (!is_cpuset_online(cs))
- goto out_unlock;
-
- trialcs = alloc_trial_cpuset(cs);
- if (!trialcs) {
- retval = -ENOMEM;
- goto out_unlock;
- }
-
- switch (of_cft(of)->private) {
- case FILE_CPULIST:
- retval = update_cpumask(cs, trialcs, buf);
- break;
- case FILE_MEMLIST:
- retval = update_nodemask(cs, trialcs, buf);
- break;
- default:
- retval = -EINVAL;
- break;
- }
-
- free_trial_cpuset(trialcs);
-out_unlock:
- mutex_unlock(&cpuset_mutex);
- kernfs_unbreak_active_protection(of->kn);
- css_put(&cs->css);
- return retval ?: nbytes;
-}
-
-/*
- * These ascii lists should be read in a single call, by using a user
- * buffer large enough to hold the entire map. If read in smaller
- * chunks, there is no guarantee of atomicity. Since the display format
- * used, list of ranges of sequential numbers, is variable length,
- * and since these maps can change value dynamically, one could read
- * gibberish by doing partial reads while a list was changing.
- */
-static int cpuset_common_seq_show(struct seq_file *sf, void *v)
-{
- struct cpuset *cs = css_cs(seq_css(sf));
- cpuset_filetype_t type = seq_cft(sf)->private;
- int ret = 0;
-
- spin_lock_irq(&callback_lock);
-
- switch (type) {
- case FILE_CPULIST:
- seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed));
- break;
- case FILE_MEMLIST:
- seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed));
- break;
- case FILE_EFFECTIVE_CPULIST:
- seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus));
- break;
- case FILE_EFFECTIVE_MEMLIST:
- seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
- break;
- default:
- ret = -EINVAL;
- }
-
- spin_unlock_irq(&callback_lock);
- return ret;
-}
-
-static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
-{
- struct cpuset *cs = css_cs(css);
- cpuset_filetype_t type = cft->private;
- switch (type) {
- case FILE_CPU_EXCLUSIVE:
- return is_cpu_exclusive(cs);
- case FILE_MEM_EXCLUSIVE:
- return is_mem_exclusive(cs);
- case FILE_MEM_HARDWALL:
- return is_mem_hardwall(cs);
- case FILE_SCHED_LOAD_BALANCE:
- return is_sched_load_balance(cs);
- case FILE_MEMORY_MIGRATE:
- return is_memory_migrate(cs);
- case FILE_MEMORY_PRESSURE_ENABLED:
- return cpuset_memory_pressure_enabled;
- case FILE_MEMORY_PRESSURE:
- return fmeter_getrate(&cs->fmeter);
- case FILE_SPREAD_PAGE:
- return is_spread_page(cs);
- case FILE_SPREAD_SLAB:
- return is_spread_slab(cs);
- default:
- BUG();
- }
-
- /* Unreachable but makes gcc happy */
- return 0;
-}
-
-static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
-{
- struct cpuset *cs = css_cs(css);
- cpuset_filetype_t type = cft->private;
- switch (type) {
- case FILE_SCHED_RELAX_DOMAIN_LEVEL:
- return cs->relax_domain_level;
- default:
- BUG();
- }
-
- /* Unrechable but makes gcc happy */
- return 0;
-}
-
-
-/*
- * for the common functions, 'private' gives the type of file
- */
-
-static struct cftype files[] = {
- {
- .name = "cpus",
- .seq_show = cpuset_common_seq_show,
- .write = cpuset_write_resmask,
- .max_write_len = (100U + 6 * NR_CPUS),
- .private = FILE_CPULIST,
- },
-
- {
- .name = "mems",
- .seq_show = cpuset_common_seq_show,
- .write = cpuset_write_resmask,
- .max_write_len = (100U + 6 * MAX_NUMNODES),
- .private = FILE_MEMLIST,
- },
-
- {
- .name = "effective_cpus",
- .seq_show = cpuset_common_seq_show,
- .private = FILE_EFFECTIVE_CPULIST,
- },
-
- {
- .name = "effective_mems",
- .seq_show = cpuset_common_seq_show,
- .private = FILE_EFFECTIVE_MEMLIST,
- },
-
- {
- .name = "cpu_exclusive",
- .read_u64 = cpuset_read_u64,
- .write_u64 = cpuset_write_u64,
- .private = FILE_CPU_EXCLUSIVE,
- },
-
- {
- .name = "mem_exclusive",
- .read_u64 = cpuset_read_u64,
- .write_u64 = cpuset_write_u64,
- .private = FILE_MEM_EXCLUSIVE,
- },
-
- {
- .name = "mem_hardwall",
- .read_u64 = cpuset_read_u64,
- .write_u64 = cpuset_write_u64,
- .private = FILE_MEM_HARDWALL,
- },
-
- {
- .name = "sched_load_balance",
- .read_u64 = cpuset_read_u64,
- .write_u64 = cpuset_write_u64,
- .private = FILE_SCHED_LOAD_BALANCE,
- },
-
- {
- .name = "sched_relax_domain_level",
- .read_s64 = cpuset_read_s64,
- .write_s64 = cpuset_write_s64,
- .private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
- },
-
- {
- .name = "memory_migrate",
- .read_u64 = cpuset_read_u64,
- .write_u64 = cpuset_write_u64,
- .private = FILE_MEMORY_MIGRATE,
- },
-
- {
- .name = "memory_pressure",
- .read_u64 = cpuset_read_u64,
- .write_u64 = cpuset_write_u64,
- .private = FILE_MEMORY_PRESSURE,
- .mode = S_IRUGO,
- },
-
- {
- .name = "memory_spread_page",
- .read_u64 = cpuset_read_u64,
- .write_u64 = cpuset_write_u64,
- .private = FILE_SPREAD_PAGE,
- },
-
- {
- .name = "memory_spread_slab",
- .read_u64 = cpuset_read_u64,
- .write_u64 = cpuset_write_u64,
- .private = FILE_SPREAD_SLAB,
- },
-
- {
- .name = "memory_pressure_enabled",
- .flags = CFTYPE_ONLY_ON_ROOT,
- .read_u64 = cpuset_read_u64,
- .write_u64 = cpuset_write_u64,
- .private = FILE_MEMORY_PRESSURE_ENABLED,
- },
-
- { } /* terminate */
-};
-
-/*
- * cpuset_css_alloc - allocate a cpuset css
- * cgrp: control group that the new cpuset will be part of
- */
-
-static struct cgroup_subsys_state *
-cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
-{
- struct cpuset *cs;
-
- if (!parent_css)
- return &top_cpuset.css;
-
- cs = kzalloc(sizeof(*cs), GFP_KERNEL);
- if (!cs)
- return ERR_PTR(-ENOMEM);
- if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL))
- goto free_cs;
- if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL))
- goto free_cpus;
-
- set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
- cpumask_clear(cs->cpus_allowed);
- nodes_clear(cs->mems_allowed);
- cpumask_clear(cs->effective_cpus);
- nodes_clear(cs->effective_mems);
- fmeter_init(&cs->fmeter);
- cs->relax_domain_level = -1;
-
- return &cs->css;
-
-free_cpus:
- free_cpumask_var(cs->cpus_allowed);
-free_cs:
- kfree(cs);
- return ERR_PTR(-ENOMEM);
-}
-
-static int cpuset_css_online(struct cgroup_subsys_state *css)
-{
- struct cpuset *cs = css_cs(css);
- struct cpuset *parent = parent_cs(cs);
- struct cpuset *tmp_cs;
- struct cgroup_subsys_state *pos_css;
-
- if (!parent)
- return 0;
-
- mutex_lock(&cpuset_mutex);
-
- set_bit(CS_ONLINE, &cs->flags);
- if (is_spread_page(parent))
- set_bit(CS_SPREAD_PAGE, &cs->flags);
- if (is_spread_slab(parent))
- set_bit(CS_SPREAD_SLAB, &cs->flags);
-
- cpuset_inc();
-
- spin_lock_irq(&callback_lock);
- if (cgroup_on_dfl(cs->css.cgroup)) {
- cpumask_copy(cs->effective_cpus, parent->effective_cpus);
- cs->effective_mems = parent->effective_mems;
- }
- spin_unlock_irq(&callback_lock);
-
- if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
- goto out_unlock;
-
- /*
- * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
- * set. This flag handling is implemented in cgroup core for
- * histrical reasons - the flag may be specified during mount.
- *
- * Currently, if any sibling cpusets have exclusive cpus or mem, we
- * refuse to clone the configuration - thereby refusing the task to
- * be entered, and as a result refusing the sys_unshare() or
- * clone() which initiated it. If this becomes a problem for some
- * users who wish to allow that scenario, then this could be
- * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
- * (and likewise for mems) to the new cgroup.
- */
- rcu_read_lock();
- cpuset_for_each_child(tmp_cs, pos_css, parent) {
- if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
- rcu_read_unlock();
- goto out_unlock;
- }
- }
- rcu_read_unlock();
-
- spin_lock_irq(&callback_lock);
- cs->mems_allowed = parent->mems_allowed;
- cs->effective_mems = parent->mems_allowed;
- cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
- cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
- spin_unlock_irq(&callback_lock);
-out_unlock:
- mutex_unlock(&cpuset_mutex);
- return 0;
-}
-
-/*
- * If the cpuset being removed has its flag 'sched_load_balance'
- * enabled, then simulate turning sched_load_balance off, which
- * will call rebuild_sched_domains_locked().
- */
-
-static void cpuset_css_offline(struct cgroup_subsys_state *css)
-{
- struct cpuset *cs = css_cs(css);
-
- mutex_lock(&cpuset_mutex);
-
- if (is_sched_load_balance(cs))
- update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
-
- cpuset_dec();
- clear_bit(CS_ONLINE, &cs->flags);
-
- mutex_unlock(&cpuset_mutex);
-}
-
-static void cpuset_css_free(struct cgroup_subsys_state *css)
-{
- struct cpuset *cs = css_cs(css);
-
- free_cpumask_var(cs->effective_cpus);
- free_cpumask_var(cs->cpus_allowed);
- kfree(cs);
-}
-
-static void cpuset_bind(struct cgroup_subsys_state *root_css)
-{
- mutex_lock(&cpuset_mutex);
- spin_lock_irq(&callback_lock);
-
- if (cgroup_on_dfl(root_css->cgroup)) {
- cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
- top_cpuset.mems_allowed = node_possible_map;
- } else {
- cpumask_copy(top_cpuset.cpus_allowed,
- top_cpuset.effective_cpus);
- top_cpuset.mems_allowed = top_cpuset.effective_mems;
- }
-
- spin_unlock_irq(&callback_lock);
- mutex_unlock(&cpuset_mutex);
-}
-
-struct cgroup_subsys cpuset_cgrp_subsys = {
- .css_alloc = cpuset_css_alloc,
- .css_online = cpuset_css_online,
- .css_offline = cpuset_css_offline,
- .css_free = cpuset_css_free,
- .can_attach = cpuset_can_attach,
- .cancel_attach = cpuset_cancel_attach,
- .attach = cpuset_attach,
- .bind = cpuset_bind,
- .legacy_cftypes = files,
- .early_init = 1,
-};
-
-/**
- * cpuset_init - initialize cpusets at system boot
- *
- * Description: Initialize top_cpuset and the cpuset internal file system,
- **/
-
-int __init cpuset_init(void)
-{
- int err = 0;
-
- if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
- BUG();
- if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL))
- BUG();
-
- cpumask_setall(top_cpuset.cpus_allowed);
- nodes_setall(top_cpuset.mems_allowed);
- cpumask_setall(top_cpuset.effective_cpus);
- nodes_setall(top_cpuset.effective_mems);
-
- fmeter_init(&top_cpuset.fmeter);
- set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
- top_cpuset.relax_domain_level = -1;
-
- err = register_filesystem(&cpuset_fs_type);
- if (err < 0)
- return err;
-
- if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL))
- BUG();
-
- return 0;
-}
-
-/*
- * If CPU and/or memory hotplug handlers, below, unplug any CPUs
- * or memory nodes, we need to walk over the cpuset hierarchy,
- * removing that CPU or node from all cpusets. If this removes the
- * last CPU or node from a cpuset, then move the tasks in the empty
- * cpuset to its next-highest non-empty parent.
- */
-static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
-{
- struct cpuset *parent;
-
- /*
- * Find its next-highest non-empty parent, (top cpuset
- * has online cpus, so can't be empty).
- */
- parent = parent_cs(cs);
- while (cpumask_empty(parent->cpus_allowed) ||
- nodes_empty(parent->mems_allowed))
- parent = parent_cs(parent);
-
- if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
- pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
- pr_cont_cgroup_name(cs->css.cgroup);
- pr_cont("\n");
- }
-}
-
-static void
-hotplug_update_tasks_legacy(struct cpuset *cs,
- struct cpumask *new_cpus, nodemask_t *new_mems,
- bool cpus_updated, bool mems_updated)
-{
- bool is_empty;
-
- spin_lock_irq(&callback_lock);
- cpumask_copy(cs->cpus_allowed, new_cpus);
- cpumask_copy(cs->effective_cpus, new_cpus);
- cs->mems_allowed = *new_mems;
- cs->effective_mems = *new_mems;
- spin_unlock_irq(&callback_lock);
-
- /*
- * Don't call update_tasks_cpumask() if the cpuset becomes empty,
- * as the tasks will be migratecd to an ancestor.
- */
- if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
- update_tasks_cpumask(cs);
- if (mems_updated && !nodes_empty(cs->mems_allowed))
- update_tasks_nodemask(cs);
-
- is_empty = cpumask_empty(cs->cpus_allowed) ||
- nodes_empty(cs->mems_allowed);
-
- mutex_unlock(&cpuset_mutex);
-
- /*
- * Move tasks to the nearest ancestor with execution resources,
- * This is full cgroup operation which will also call back into
- * cpuset. Should be done outside any lock.
- */
- if (is_empty)
- remove_tasks_in_empty_cpuset(cs);
-
- mutex_lock(&cpuset_mutex);
-}
-
-static void
-hotplug_update_tasks(struct cpuset *cs,
- struct cpumask *new_cpus, nodemask_t *new_mems,
- bool cpus_updated, bool mems_updated)
-{
- if (cpumask_empty(new_cpus))
- cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
- if (nodes_empty(*new_mems))
- *new_mems = parent_cs(cs)->effective_mems;
-
- spin_lock_irq(&callback_lock);
- cpumask_copy(cs->effective_cpus, new_cpus);
- cs->effective_mems = *new_mems;
- spin_unlock_irq(&callback_lock);
-
- if (cpus_updated)
- update_tasks_cpumask(cs);
- if (mems_updated)
- update_tasks_nodemask(cs);
-}
-
-/**
- * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
- * @cs: cpuset in interest
- *
- * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
- * offline, update @cs accordingly. If @cs ends up with no CPU or memory,
- * all its tasks are moved to the nearest ancestor with both resources.
- */
-static void cpuset_hotplug_update_tasks(struct cpuset *cs)
-{
- static cpumask_t new_cpus;
- static nodemask_t new_mems;
- bool cpus_updated;
- bool mems_updated;
-retry:
- wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
-
- mutex_lock(&cpuset_mutex);
-
- /*
- * We have raced with task attaching. We wait until attaching
- * is finished, so we won't attach a task to an empty cpuset.
- */
- if (cs->attach_in_progress) {
- mutex_unlock(&cpuset_mutex);
- goto retry;
- }
-
- cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus);
- nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems);
-
- cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
- mems_updated = !nodes_equal(new_mems, cs->effective_mems);
-
- if (cgroup_on_dfl(cs->css.cgroup))
- hotplug_update_tasks(cs, &new_cpus, &new_mems,
- cpus_updated, mems_updated);
- else
- hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
- cpus_updated, mems_updated);
-
- mutex_unlock(&cpuset_mutex);
-}
-
-/**
- * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
- *
- * This function is called after either CPU or memory configuration has
- * changed and updates cpuset accordingly. The top_cpuset is always
- * synchronized to cpu_active_mask and N_MEMORY, which is necessary in
- * order to make cpusets transparent (of no affect) on systems that are
- * actively using CPU hotplug but making no active use of cpusets.
- *
- * Non-root cpusets are only affected by offlining. If any CPUs or memory
- * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on
- * all descendants.
- *
- * Note that CPU offlining during suspend is ignored. We don't modify
- * cpusets across suspend/resume cycles at all.
- */
-static void cpuset_hotplug_workfn(struct work_struct *work)
-{
- static cpumask_t new_cpus;
- static nodemask_t new_mems;
- bool cpus_updated, mems_updated;
- bool on_dfl = cgroup_on_dfl(top_cpuset.css.cgroup);
-
- mutex_lock(&cpuset_mutex);
-
- /* fetch the available cpus/mems and find out which changed how */
- cpumask_copy(&new_cpus, cpu_active_mask);
- new_mems = node_states[N_MEMORY];
-
- cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
- mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
-
- /* synchronize cpus_allowed to cpu_active_mask */
- if (cpus_updated) {
- spin_lock_irq(&callback_lock);
- if (!on_dfl)
- cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
- cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
- spin_unlock_irq(&callback_lock);
- /* we don't mess with cpumasks of tasks in top_cpuset */
- }
-
- /* synchronize mems_allowed to N_MEMORY */
- if (mems_updated) {
- spin_lock_irq(&callback_lock);
- if (!on_dfl)
- top_cpuset.mems_allowed = new_mems;
- top_cpuset.effective_mems = new_mems;
- spin_unlock_irq(&callback_lock);
- update_tasks_nodemask(&top_cpuset);
- }
-
- mutex_unlock(&cpuset_mutex);
-
- /* if cpus or mems changed, we need to propagate to descendants */
- if (cpus_updated || mems_updated) {
- struct cpuset *cs;
- struct cgroup_subsys_state *pos_css;
-
- rcu_read_lock();
- cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
- if (cs == &top_cpuset || !css_tryget_online(&cs->css))
- continue;
- rcu_read_unlock();
-
- cpuset_hotplug_update_tasks(cs);
-
- rcu_read_lock();
- css_put(&cs->css);
- }
- rcu_read_unlock();
- }
-
- /* rebuild sched domains if cpus_allowed has changed */
- if (cpus_updated)
- rebuild_sched_domains();
-}
-
-void cpuset_update_active_cpus(bool cpu_online)
-{
- /*
- * We're inside cpu hotplug critical region which usually nests
- * inside cgroup synchronization. Bounce actual hotplug processing
- * to a work item to avoid reverse locking order.
- *
- * We still need to do partition_sched_domains() synchronously;
- * otherwise, the scheduler will get confused and put tasks to the
- * dead CPU. Fall back to the default single domain.
- * cpuset_hotplug_workfn() will rebuild it as necessary.
- */
- partition_sched_domains(1, NULL, NULL);
- schedule_work(&cpuset_hotplug_work);
-}
-
-/*
- * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
- * Call this routine anytime after node_states[N_MEMORY] changes.
- * See cpuset_update_active_cpus() for CPU hotplug handling.
- */
-static int cpuset_track_online_nodes(struct notifier_block *self,
- unsigned long action, void *arg)
-{
- schedule_work(&cpuset_hotplug_work);
- return NOTIFY_OK;
-}
-
-static struct notifier_block cpuset_track_online_nodes_nb = {
- .notifier_call = cpuset_track_online_nodes,
- .priority = 10, /* ??! */
-};
-
-/**
- * cpuset_init_smp - initialize cpus_allowed
- *
- * Description: Finish top cpuset after cpu, node maps are initialized
- */
-void __init cpuset_init_smp(void)
-{
- cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
- top_cpuset.mems_allowed = node_states[N_MEMORY];
- top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
-
- cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
- top_cpuset.effective_mems = node_states[N_MEMORY];
-
- register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
-}
-
-/**
- * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
- * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
- * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
- *
- * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
- * attached to the specified @tsk. Guaranteed to return some non-empty
- * subset of cpu_online_mask, even if this means going outside the
- * tasks cpuset.
- **/
-
-void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&callback_lock, flags);
- rcu_read_lock();
- guarantee_online_cpus(task_cs(tsk), pmask);
- rcu_read_unlock();
- spin_unlock_irqrestore(&callback_lock, flags);
-}
-
-void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
-{
- rcu_read_lock();
- do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus);
- rcu_read_unlock();
-
- /*
- * We own tsk->cpus_allowed, nobody can change it under us.
- *
- * But we used cs && cs->cpus_allowed lockless and thus can
- * race with cgroup_attach_task() or update_cpumask() and get
- * the wrong tsk->cpus_allowed. However, both cases imply the
- * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
- * which takes task_rq_lock().
- *
- * If we are called after it dropped the lock we must see all
- * changes in tsk_cs()->cpus_allowed. Otherwise we can temporary
- * set any mask even if it is not right from task_cs() pov,
- * the pending set_cpus_allowed_ptr() will fix things.
- *
- * select_fallback_rq() will fix things ups and set cpu_possible_mask
- * if required.
- */
-}
-
-void __init cpuset_init_current_mems_allowed(void)
-{
- nodes_setall(current->mems_allowed);
-}
-
-/**
- * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset.
- * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
- *
- * Description: Returns the nodemask_t mems_allowed of the cpuset
- * attached to the specified @tsk. Guaranteed to return some non-empty
- * subset of node_states[N_MEMORY], even if this means going outside the
- * tasks cpuset.
- **/
-
-nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
-{
- nodemask_t mask;
- unsigned long flags;
-
- spin_lock_irqsave(&callback_lock, flags);
- rcu_read_lock();
- guarantee_online_mems(task_cs(tsk), &mask);
- rcu_read_unlock();
- spin_unlock_irqrestore(&callback_lock, flags);
-
- return mask;
-}
-
-/**
- * cpuset_nodemask_valid_mems_allowed - check nodemask vs. curremt mems_allowed
- * @nodemask: the nodemask to be checked
- *
- * Are any of the nodes in the nodemask allowed in current->mems_allowed?
- */
-int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
-{
- return nodes_intersects(*nodemask, current->mems_allowed);
-}
-
-/*
- * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
- * mem_hardwall ancestor to the specified cpuset. Call holding
- * callback_lock. If no ancestor is mem_exclusive or mem_hardwall
- * (an unusual configuration), then returns the root cpuset.
- */
-static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
-{
- while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
- cs = parent_cs(cs);
- return cs;
-}
-
-/**
- * cpuset_node_allowed - Can we allocate on a memory node?
- * @node: is this an allowed node?
- * @gfp_mask: memory allocation flags
- *
- * If we're in interrupt, yes, we can always allocate. If @node is set in
- * current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this
- * node is set in the nearest hardwalled cpuset ancestor to current's cpuset,
- * yes. If current has access to memory reserves due to TIF_MEMDIE, yes.
- * Otherwise, no.
- *
- * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
- * and do not allow allocations outside the current tasks cpuset
- * unless the task has been OOM killed as is marked TIF_MEMDIE.
- * GFP_KERNEL allocations are not so marked, so can escape to the
- * nearest enclosing hardwalled ancestor cpuset.
- *
- * Scanning up parent cpusets requires callback_lock. The
- * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
- * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
- * current tasks mems_allowed came up empty on the first pass over
- * the zonelist. So only GFP_KERNEL allocations, if all nodes in the
- * cpuset are short of memory, might require taking the callback_lock.
- *
- * The first call here from mm/page_alloc:get_page_from_freelist()
- * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
- * so no allocation on a node outside the cpuset is allowed (unless
- * in interrupt, of course).
- *
- * The second pass through get_page_from_freelist() doesn't even call
- * here for GFP_ATOMIC calls. For those calls, the __alloc_pages()
- * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set
- * in alloc_flags. That logic and the checks below have the combined
- * affect that:
- * in_interrupt - any node ok (current task context irrelevant)
- * GFP_ATOMIC - any node ok
- * TIF_MEMDIE - any node ok
- * GFP_KERNEL - any node in enclosing hardwalled cpuset ok
- * GFP_USER - only nodes in current tasks mems allowed ok.
- */
-int __cpuset_node_allowed(int node, gfp_t gfp_mask)
-{
- struct cpuset *cs; /* current cpuset ancestors */
- int allowed; /* is allocation in zone z allowed? */
- unsigned long flags;
-
- if (in_interrupt())
- return 1;
- if (node_isset(node, current->mems_allowed))
- return 1;
- /*
- * Allow tasks that have access to memory reserves because they have
- * been OOM killed to get memory anywhere.
- */
- if (unlikely(test_thread_flag(TIF_MEMDIE)))
- return 1;
- if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */
- return 0;
-
- if (current->flags & PF_EXITING) /* Let dying task have memory */
- return 1;
-
- /* Not hardwall and node outside mems_allowed: scan up cpusets */
- spin_lock_irqsave(&callback_lock, flags);
-
- rcu_read_lock();
- cs = nearest_hardwall_ancestor(task_cs(current));
- allowed = node_isset(node, cs->mems_allowed);
- rcu_read_unlock();
-
- spin_unlock_irqrestore(&callback_lock, flags);
- return allowed;
-}
-
-/**
- * cpuset_mem_spread_node() - On which node to begin search for a file page
- * cpuset_slab_spread_node() - On which node to begin search for a slab page
- *
- * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
- * tasks in a cpuset with is_spread_page or is_spread_slab set),
- * and if the memory allocation used cpuset_mem_spread_node()
- * to determine on which node to start looking, as it will for
- * certain page cache or slab cache pages such as used for file
- * system buffers and inode caches, then instead of starting on the
- * local node to look for a free page, rather spread the starting
- * node around the tasks mems_allowed nodes.
- *
- * We don't have to worry about the returned node being offline
- * because "it can't happen", and even if it did, it would be ok.
- *
- * The routines calling guarantee_online_mems() are careful to
- * only set nodes in task->mems_allowed that are online. So it
- * should not be possible for the following code to return an
- * offline node. But if it did, that would be ok, as this routine
- * is not returning the node where the allocation must be, only
- * the node where the search should start. The zonelist passed to
- * __alloc_pages() will include all nodes. If the slab allocator
- * is passed an offline node, it will fall back to the local node.
- * See kmem_cache_alloc_node().
- */
-
-static int cpuset_spread_node(int *rotor)
-{
- int node;
-
- node = next_node(*rotor, current->mems_allowed);
- if (node == MAX_NUMNODES)
- node = first_node(current->mems_allowed);
- *rotor = node;
- return node;
-}
-
-int cpuset_mem_spread_node(void)
-{
- if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
- current->cpuset_mem_spread_rotor =
- node_random(&current->mems_allowed);
-
- return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
-}
-
-int cpuset_slab_spread_node(void)
-{
- if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE)
- current->cpuset_slab_spread_rotor =
- node_random(&current->mems_allowed);
-
- return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
-}
-
-EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
-
-/**
- * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's?
- * @tsk1: pointer to task_struct of some task.
- * @tsk2: pointer to task_struct of some other task.
- *
- * Description: Return true if @tsk1's mems_allowed intersects the
- * mems_allowed of @tsk2. Used by the OOM killer to determine if
- * one of the task's memory usage might impact the memory available
- * to the other.
- **/
-
-int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
- const struct task_struct *tsk2)
-{
- return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
-}
-
-/**
- * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed
- * @tsk: pointer to task_struct of some task.
- *
- * Description: Prints @task's name, cpuset name, and cached copy of its
- * mems_allowed to the kernel log.
- */
-void cpuset_print_task_mems_allowed(struct task_struct *tsk)
-{
- struct cgroup *cgrp;
-
- rcu_read_lock();
-
- cgrp = task_cs(tsk)->css.cgroup;
- pr_info("%s cpuset=", tsk->comm);
- pr_cont_cgroup_name(cgrp);
- pr_cont(" mems_allowed=%*pbl\n", nodemask_pr_args(&tsk->mems_allowed));
-
- rcu_read_unlock();
-}
-
-/*
- * Collection of memory_pressure is suppressed unless
- * this flag is enabled by writing "1" to the special
- * cpuset file 'memory_pressure_enabled' in the root cpuset.
- */
-
-int cpuset_memory_pressure_enabled __read_mostly;
-
-/**
- * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
- *
- * Keep a running average of the rate of synchronous (direct)
- * page reclaim efforts initiated by tasks in each cpuset.
- *
- * This represents the rate at which some task in the cpuset
- * ran low on memory on all nodes it was allowed to use, and
- * had to enter the kernels page reclaim code in an effort to
- * create more free memory by tossing clean pages or swapping
- * or writing dirty pages.
- *
- * Display to user space in the per-cpuset read-only file
- * "memory_pressure". Value displayed is an integer
- * representing the recent rate of entry into the synchronous
- * (direct) page reclaim by any task attached to the cpuset.
- **/
-
-void __cpuset_memory_pressure_bump(void)
-{
- rcu_read_lock();
- fmeter_markevent(&task_cs(current)->fmeter);
- rcu_read_unlock();
-}
-
-#ifdef CONFIG_PROC_PID_CPUSET
-/*
- * proc_cpuset_show()
- * - Print tasks cpuset path into seq_file.
- * - Used for /proc/<pid>/cpuset.
- * - No need to task_lock(tsk) on this tsk->cpuset reference, as it
- * doesn't really matter if tsk->cpuset changes after we read it,
- * and we take cpuset_mutex, keeping cpuset_attach() from changing it
- * anyway.
- */
-int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
- struct pid *pid, struct task_struct *tsk)
-{
- char *buf, *p;
- struct cgroup_subsys_state *css;
- int retval;
-
- retval = -ENOMEM;
- buf = kmalloc(PATH_MAX, GFP_KERNEL);
- if (!buf)
- goto out;
-
- retval = -ENAMETOOLONG;
- rcu_read_lock();
- css = task_css(tsk, cpuset_cgrp_id);
- p = cgroup_path(css->cgroup, buf, PATH_MAX);
- rcu_read_unlock();
- if (!p)
- goto out_free;
- seq_puts(m, p);
- seq_putc(m, '\n');
- retval = 0;
-out_free:
- kfree(buf);
-out:
- return retval;
-}
-#endif /* CONFIG_PROC_PID_CPUSET */
-
-/* Display task mems_allowed in /proc/<pid>/status file. */
-void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
-{
- seq_printf(m, "Mems_allowed:\t%*pb\n",
- nodemask_pr_args(&task->mems_allowed));
- seq_printf(m, "Mems_allowed_list:\t%*pbl\n",
- nodemask_pr_args(&task->mems_allowed));
-}
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
new file mode 100644
index 000000000000..99dac1aa972a
--- /dev/null
+++ b/kernel/crash_core.c
@@ -0,0 +1,693 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * crash.c - kernel crash support code.
+ * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/buildid.h>
+#include <linux/init.h>
+#include <linux/utsname.h>
+#include <linux/vmalloc.h>
+#include <linux/sizes.h>
+#include <linux/kexec.h>
+#include <linux/memory.h>
+#include <linux/mm.h>
+#include <linux/cpuhotplug.h>
+#include <linux/memblock.h>
+#include <linux/kmemleak.h>
+#include <linux/crash_core.h>
+#include <linux/reboot.h>
+#include <linux/btf.h>
+#include <linux/objtool.h>
+#include <linux/delay.h>
+#include <linux/panic.h>
+
+#include <asm/page.h>
+#include <asm/sections.h>
+
+#include <crypto/sha1.h>
+
+#include "kallsyms_internal.h"
+#include "kexec_internal.h"
+
+/* Per cpu memory for storing cpu states in case of system crash. */
+note_buf_t __percpu *crash_notes;
+
+/* time to wait for possible DMA to finish before starting the kdump kernel
+ * when a CMA reservation is used
+ */
+#define CMA_DMA_TIMEOUT_SEC 10
+
+#ifdef CONFIG_CRASH_DUMP
+
+int kimage_crash_copy_vmcoreinfo(struct kimage *image)
+{
+ struct page *vmcoreinfo_page;
+ void *safecopy;
+
+ if (!IS_ENABLED(CONFIG_CRASH_DUMP))
+ return 0;
+ if (image->type != KEXEC_TYPE_CRASH)
+ return 0;
+
+ /*
+ * For kdump, allocate one vmcoreinfo safe copy from the
+ * crash memory. as we have arch_kexec_protect_crashkres()
+ * after kexec syscall, we naturally protect it from write
+ * (even read) access under kernel direct mapping. But on
+ * the other hand, we still need to operate it when crash
+ * happens to generate vmcoreinfo note, hereby we rely on
+ * vmap for this purpose.
+ */
+ vmcoreinfo_page = kimage_alloc_control_pages(image, 0);
+ if (!vmcoreinfo_page) {
+ pr_warn("Could not allocate vmcoreinfo buffer\n");
+ return -ENOMEM;
+ }
+ safecopy = vmap(&vmcoreinfo_page, 1, VM_MAP, PAGE_KERNEL);
+ if (!safecopy) {
+ pr_warn("Could not vmap vmcoreinfo buffer\n");
+ return -ENOMEM;
+ }
+
+ image->vmcoreinfo_data_copy = safecopy;
+ crash_update_vmcoreinfo_safecopy(safecopy);
+
+ return 0;
+}
+
+
+
+int kexec_should_crash(struct task_struct *p)
+{
+ /*
+ * If crash_kexec_post_notifiers is enabled, don't run
+ * crash_kexec() here yet, which must be run after panic
+ * notifiers in panic().
+ */
+ if (crash_kexec_post_notifiers)
+ return 0;
+ /*
+ * There are 4 panic() calls in make_task_dead() path, each of which
+ * corresponds to each of these 4 conditions.
+ */
+ if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
+ return 1;
+ return 0;
+}
+
+int kexec_crash_loaded(void)
+{
+ return !!kexec_crash_image;
+}
+EXPORT_SYMBOL_GPL(kexec_crash_loaded);
+
+static void crash_cma_clear_pending_dma(void)
+{
+ if (!crashk_cma_cnt)
+ return;
+
+ mdelay(CMA_DMA_TIMEOUT_SEC * 1000);
+}
+
+/*
+ * No panic_cpu check version of crash_kexec(). This function is called
+ * only when panic_cpu holds the current CPU number; this is the only CPU
+ * which processes crash_kexec routines.
+ */
+void __noclone __crash_kexec(struct pt_regs *regs)
+{
+ /* Take the kexec_lock here to prevent sys_kexec_load
+ * running on one cpu from replacing the crash kernel
+ * we are using after a panic on a different cpu.
+ *
+ * If the crash kernel was not located in a fixed area
+ * of memory the xchg(&kexec_crash_image) would be
+ * sufficient. But since I reuse the memory...
+ */
+ if (kexec_trylock()) {
+ if (kexec_crash_image) {
+ struct pt_regs fixed_regs;
+
+ crash_setup_regs(&fixed_regs, regs);
+ crash_save_vmcoreinfo();
+ machine_crash_shutdown(&fixed_regs);
+ crash_cma_clear_pending_dma();
+ machine_kexec(kexec_crash_image);
+ }
+ kexec_unlock();
+ }
+}
+STACK_FRAME_NON_STANDARD(__crash_kexec);
+
+__bpf_kfunc void crash_kexec(struct pt_regs *regs)
+{
+ if (panic_try_start()) {
+ /* This is the 1st CPU which comes here, so go ahead. */
+ __crash_kexec(regs);
+
+ /*
+ * Reset panic_cpu to allow another panic()/crash_kexec()
+ * call.
+ */
+ panic_reset();
+ }
+}
+
+static inline resource_size_t crash_resource_size(const struct resource *res)
+{
+ return !res->end ? 0 : resource_size(res);
+}
+
+
+
+
+int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map,
+ void **addr, unsigned long *sz)
+{
+ Elf64_Ehdr *ehdr;
+ Elf64_Phdr *phdr;
+ unsigned long nr_cpus = num_possible_cpus(), nr_phdr, elf_sz;
+ unsigned char *buf;
+ unsigned int cpu, i;
+ unsigned long long notes_addr;
+ unsigned long mstart, mend;
+
+ /* extra phdr for vmcoreinfo ELF note */
+ nr_phdr = nr_cpus + 1;
+ nr_phdr += mem->nr_ranges;
+
+ /*
+ * kexec-tools creates an extra PT_LOAD phdr for kernel text mapping
+ * area (for example, ffffffff80000000 - ffffffffa0000000 on x86_64).
+ * I think this is required by tools like gdb. So same physical
+ * memory will be mapped in two ELF headers. One will contain kernel
+ * text virtual addresses and other will have __va(physical) addresses.
+ */
+
+ nr_phdr++;
+ elf_sz = sizeof(Elf64_Ehdr) + nr_phdr * sizeof(Elf64_Phdr);
+ elf_sz = ALIGN(elf_sz, ELF_CORE_HEADER_ALIGN);
+
+ buf = vzalloc(elf_sz);
+ if (!buf)
+ return -ENOMEM;
+
+ ehdr = (Elf64_Ehdr *)buf;
+ phdr = (Elf64_Phdr *)(ehdr + 1);
+ memcpy(ehdr->e_ident, ELFMAG, SELFMAG);
+ ehdr->e_ident[EI_CLASS] = ELFCLASS64;
+ ehdr->e_ident[EI_DATA] = ELFDATA2LSB;
+ ehdr->e_ident[EI_VERSION] = EV_CURRENT;
+ ehdr->e_ident[EI_OSABI] = ELF_OSABI;
+ memset(ehdr->e_ident + EI_PAD, 0, EI_NIDENT - EI_PAD);
+ ehdr->e_type = ET_CORE;
+ ehdr->e_machine = ELF_ARCH;
+ ehdr->e_version = EV_CURRENT;
+ ehdr->e_phoff = sizeof(Elf64_Ehdr);
+ ehdr->e_ehsize = sizeof(Elf64_Ehdr);
+ ehdr->e_phentsize = sizeof(Elf64_Phdr);
+
+ /* Prepare one phdr of type PT_NOTE for each possible CPU */
+ for_each_possible_cpu(cpu) {
+ phdr->p_type = PT_NOTE;
+ notes_addr = per_cpu_ptr_to_phys(per_cpu_ptr(crash_notes, cpu));
+ phdr->p_offset = phdr->p_paddr = notes_addr;
+ phdr->p_filesz = phdr->p_memsz = sizeof(note_buf_t);
+ (ehdr->e_phnum)++;
+ phdr++;
+ }
+
+ /* Prepare one PT_NOTE header for vmcoreinfo */
+ phdr->p_type = PT_NOTE;
+ phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note();
+ phdr->p_filesz = phdr->p_memsz = VMCOREINFO_NOTE_SIZE;
+ (ehdr->e_phnum)++;
+ phdr++;
+
+ /* Prepare PT_LOAD type program header for kernel text region */
+ if (need_kernel_map) {
+ phdr->p_type = PT_LOAD;
+ phdr->p_flags = PF_R|PF_W|PF_X;
+ phdr->p_vaddr = (unsigned long) _text;
+ phdr->p_filesz = phdr->p_memsz = _end - _text;
+ phdr->p_offset = phdr->p_paddr = __pa_symbol(_text);
+ ehdr->e_phnum++;
+ phdr++;
+ }
+
+ /* Go through all the ranges in mem->ranges[] and prepare phdr */
+ for (i = 0; i < mem->nr_ranges; i++) {
+ mstart = mem->ranges[i].start;
+ mend = mem->ranges[i].end;
+
+ phdr->p_type = PT_LOAD;
+ phdr->p_flags = PF_R|PF_W|PF_X;
+ phdr->p_offset = mstart;
+
+ phdr->p_paddr = mstart;
+ phdr->p_vaddr = (unsigned long) __va(mstart);
+ phdr->p_filesz = phdr->p_memsz = mend - mstart + 1;
+ phdr->p_align = 0;
+ ehdr->e_phnum++;
+#ifdef CONFIG_KEXEC_FILE
+ kexec_dprintk("Crash PT_LOAD ELF header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n",
+ phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz,
+ ehdr->e_phnum, phdr->p_offset);
+#endif
+ phdr++;
+ }
+
+ *addr = buf;
+ *sz = elf_sz;
+ return 0;
+}
+
+/**
+ * crash_exclude_mem_range - exclude a mem range for existing ranges
+ * @mem: mem->range contains an array of ranges sorted in ascending order
+ * @mstart: the start of to-be-excluded range
+ * @mend: the start of to-be-excluded range
+ *
+ * If you are unsure if a range split will happen, to avoid function call
+ * failure because of -ENOMEM, always make sure
+ * mem->max_nr_ranges == mem->nr_ranges + 1
+ * before calling the function each time.
+ *
+ * returns 0 if a memory range is excluded successfully
+ * return -ENOMEM if mem->ranges doesn't have space to hold split ranges
+ */
+int crash_exclude_mem_range(struct crash_mem *mem,
+ unsigned long long mstart, unsigned long long mend)
+{
+ int i;
+ unsigned long long start, end, p_start, p_end;
+
+ for (i = 0; i < mem->nr_ranges; i++) {
+ start = mem->ranges[i].start;
+ end = mem->ranges[i].end;
+ p_start = mstart;
+ p_end = mend;
+
+ if (p_start > end)
+ continue;
+
+ /*
+ * Because the memory ranges in mem->ranges are stored in
+ * ascending order, when we detect `p_end < start`, we can
+ * immediately exit the for loop, as the subsequent memory
+ * ranges will definitely be outside the range we are looking
+ * for.
+ */
+ if (p_end < start)
+ break;
+
+ /* Truncate any area outside of range */
+ if (p_start < start)
+ p_start = start;
+ if (p_end > end)
+ p_end = end;
+
+ /* Found completely overlapping range */
+ if (p_start == start && p_end == end) {
+ memmove(&mem->ranges[i], &mem->ranges[i + 1],
+ (mem->nr_ranges - (i + 1)) * sizeof(mem->ranges[i]));
+ i--;
+ mem->nr_ranges--;
+ } else if (p_start > start && p_end < end) {
+ /* Split original range */
+ if (mem->nr_ranges >= mem->max_nr_ranges)
+ return -ENOMEM;
+
+ memmove(&mem->ranges[i + 2], &mem->ranges[i + 1],
+ (mem->nr_ranges - (i + 1)) * sizeof(mem->ranges[i]));
+
+ mem->ranges[i].end = p_start - 1;
+ mem->ranges[i + 1].start = p_end + 1;
+ mem->ranges[i + 1].end = end;
+
+ i++;
+ mem->nr_ranges++;
+ } else if (p_start != start)
+ mem->ranges[i].end = p_start - 1;
+ else
+ mem->ranges[i].start = p_end + 1;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(crash_exclude_mem_range);
+
+ssize_t crash_get_memory_size(void)
+{
+ ssize_t size = 0;
+
+ if (!kexec_trylock())
+ return -EBUSY;
+
+ size += crash_resource_size(&crashk_res);
+ size += crash_resource_size(&crashk_low_res);
+
+ kexec_unlock();
+ return size;
+}
+
+static int __crash_shrink_memory(struct resource *old_res,
+ unsigned long new_size)
+{
+ struct resource *ram_res;
+
+ ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
+ if (!ram_res)
+ return -ENOMEM;
+
+ ram_res->start = old_res->start + new_size;
+ ram_res->end = old_res->end;
+ ram_res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM;
+ ram_res->name = "System RAM";
+
+ if (!new_size) {
+ release_resource(old_res);
+ old_res->start = 0;
+ old_res->end = 0;
+ } else {
+ old_res->end = ram_res->start - 1;
+ }
+
+ crash_free_reserved_phys_range(ram_res->start, ram_res->end);
+ insert_resource(&iomem_resource, ram_res);
+
+ return 0;
+}
+
+int crash_shrink_memory(unsigned long new_size)
+{
+ int ret = 0;
+ unsigned long old_size, low_size;
+
+ if (!kexec_trylock())
+ return -EBUSY;
+
+ if (kexec_crash_image) {
+ ret = -ENOENT;
+ goto unlock;
+ }
+
+ low_size = crash_resource_size(&crashk_low_res);
+ old_size = crash_resource_size(&crashk_res) + low_size;
+ new_size = roundup(new_size, KEXEC_CRASH_MEM_ALIGN);
+ if (new_size >= old_size) {
+ ret = (new_size == old_size) ? 0 : -EINVAL;
+ goto unlock;
+ }
+
+ /*
+ * (low_size > new_size) implies that low_size is greater than zero.
+ * This also means that if low_size is zero, the else branch is taken.
+ *
+ * If low_size is greater than 0, (low_size > new_size) indicates that
+ * crashk_low_res also needs to be shrunken. Otherwise, only crashk_res
+ * needs to be shrunken.
+ */
+ if (low_size > new_size) {
+ ret = __crash_shrink_memory(&crashk_res, 0);
+ if (ret)
+ goto unlock;
+
+ ret = __crash_shrink_memory(&crashk_low_res, new_size);
+ } else {
+ ret = __crash_shrink_memory(&crashk_res, new_size - low_size);
+ }
+
+ /* Swap crashk_res and crashk_low_res if needed */
+ if (!crashk_res.end && crashk_low_res.end) {
+ crashk_res.start = crashk_low_res.start;
+ crashk_res.end = crashk_low_res.end;
+ release_resource(&crashk_low_res);
+ crashk_low_res.start = 0;
+ crashk_low_res.end = 0;
+ insert_resource(&iomem_resource, &crashk_res);
+ }
+
+unlock:
+ kexec_unlock();
+ return ret;
+}
+
+void crash_save_cpu(struct pt_regs *regs, int cpu)
+{
+ struct elf_prstatus prstatus;
+ u32 *buf;
+
+ if ((cpu < 0) || (cpu >= nr_cpu_ids))
+ return;
+
+ /* Using ELF notes here is opportunistic.
+ * I need a well defined structure format
+ * for the data I pass, and I need tags
+ * on the data to indicate what information I have
+ * squirrelled away. ELF notes happen to provide
+ * all of that, so there is no need to invent something new.
+ */
+ buf = (u32 *)per_cpu_ptr(crash_notes, cpu);
+ if (!buf)
+ return;
+ memset(&prstatus, 0, sizeof(prstatus));
+ prstatus.common.pr_pid = current->pid;
+ elf_core_copy_regs(&prstatus.pr_reg, regs);
+ buf = append_elf_note(buf, NN_PRSTATUS, NT_PRSTATUS,
+ &prstatus, sizeof(prstatus));
+ final_note(buf);
+}
+
+
+
+static int __init crash_notes_memory_init(void)
+{
+ /* Allocate memory for saving cpu registers. */
+ size_t size, align;
+
+ /*
+ * crash_notes could be allocated across 2 vmalloc pages when percpu
+ * is vmalloc based . vmalloc doesn't guarantee 2 continuous vmalloc
+ * pages are also on 2 continuous physical pages. In this case the
+ * 2nd part of crash_notes in 2nd page could be lost since only the
+ * starting address and size of crash_notes are exported through sysfs.
+ * Here round up the size of crash_notes to the nearest power of two
+ * and pass it to __alloc_percpu as align value. This can make sure
+ * crash_notes is allocated inside one physical page.
+ */
+ size = sizeof(note_buf_t);
+ align = min(roundup_pow_of_two(sizeof(note_buf_t)), PAGE_SIZE);
+
+ /*
+ * Break compile if size is bigger than PAGE_SIZE since crash_notes
+ * definitely will be in 2 pages with that.
+ */
+ BUILD_BUG_ON(size > PAGE_SIZE);
+
+ crash_notes = __alloc_percpu(size, align);
+ if (!crash_notes) {
+ pr_warn("Memory allocation for saving cpu register states failed\n");
+ return -ENOMEM;
+ }
+ return 0;
+}
+subsys_initcall(crash_notes_memory_init);
+
+#endif /*CONFIG_CRASH_DUMP*/
+
+#ifdef CONFIG_CRASH_HOTPLUG
+#undef pr_fmt
+#define pr_fmt(fmt) "crash hp: " fmt
+
+/*
+ * Different than kexec/kdump loading/unloading/jumping/shrinking which
+ * usually rarely happen, there will be many crash hotplug events notified
+ * during one short period, e.g one memory board is hot added and memory
+ * regions are online. So mutex lock __crash_hotplug_lock is used to
+ * serialize the crash hotplug handling specifically.
+ */
+static DEFINE_MUTEX(__crash_hotplug_lock);
+#define crash_hotplug_lock() mutex_lock(&__crash_hotplug_lock)
+#define crash_hotplug_unlock() mutex_unlock(&__crash_hotplug_lock)
+
+/*
+ * This routine utilized when the crash_hotplug sysfs node is read.
+ * It reflects the kernel's ability/permission to update the kdump
+ * image directly.
+ */
+int crash_check_hotplug_support(void)
+{
+ int rc = 0;
+
+ crash_hotplug_lock();
+ /* Obtain lock while reading crash information */
+ if (!kexec_trylock()) {
+ if (!kexec_in_progress)
+ pr_info("kexec_trylock() failed, kdump image may be inaccurate\n");
+ crash_hotplug_unlock();
+ return 0;
+ }
+ if (kexec_crash_image) {
+ rc = kexec_crash_image->hotplug_support;
+ }
+ /* Release lock now that update complete */
+ kexec_unlock();
+ crash_hotplug_unlock();
+
+ return rc;
+}
+
+/*
+ * To accurately reflect hot un/plug changes of CPU and Memory resources
+ * (including onling and offlining of those resources), the relevant
+ * kexec segments must be updated with latest CPU and Memory resources.
+ *
+ * Architectures must ensure two things for all segments that need
+ * updating during hotplug events:
+ *
+ * 1. Segments must be large enough to accommodate a growing number of
+ * resources.
+ * 2. Exclude the segments from SHA verification.
+ *
+ * For example, on most architectures, the elfcorehdr (which is passed
+ * to the crash kernel via the elfcorehdr= parameter) must include the
+ * new list of CPUs and memory. To make changes to the elfcorehdr, it
+ * should be large enough to permit a growing number of CPU and Memory
+ * resources. One can estimate the elfcorehdr memory size based on
+ * NR_CPUS_DEFAULT and CRASH_MAX_MEMORY_RANGES. The elfcorehdr is
+ * excluded from SHA verification by default if the architecture
+ * supports crash hotplug.
+ */
+static void crash_handle_hotplug_event(unsigned int hp_action, unsigned int cpu, void *arg)
+{
+ struct kimage *image;
+
+ crash_hotplug_lock();
+ /* Obtain lock while changing crash information */
+ if (!kexec_trylock()) {
+ if (!kexec_in_progress)
+ pr_info("kexec_trylock() failed, kdump image may be inaccurate\n");
+ crash_hotplug_unlock();
+ return;
+ }
+
+ /* Check kdump is not loaded */
+ if (!kexec_crash_image)
+ goto out;
+
+ image = kexec_crash_image;
+
+ /* Check that kexec segments update is permitted */
+ if (!image->hotplug_support)
+ goto out;
+
+ if (hp_action == KEXEC_CRASH_HP_ADD_CPU ||
+ hp_action == KEXEC_CRASH_HP_REMOVE_CPU)
+ pr_debug("hp_action %u, cpu %u\n", hp_action, cpu);
+ else
+ pr_debug("hp_action %u\n", hp_action);
+
+ /*
+ * The elfcorehdr_index is set to -1 when the struct kimage
+ * is allocated. Find the segment containing the elfcorehdr,
+ * if not already found.
+ */
+ if (image->elfcorehdr_index < 0) {
+ unsigned long mem;
+ unsigned char *ptr;
+ unsigned int n;
+
+ for (n = 0; n < image->nr_segments; n++) {
+ mem = image->segment[n].mem;
+ ptr = kmap_local_page(pfn_to_page(mem >> PAGE_SHIFT));
+ if (ptr) {
+ /* The segment containing elfcorehdr */
+ if (memcmp(ptr, ELFMAG, SELFMAG) == 0)
+ image->elfcorehdr_index = (int)n;
+ kunmap_local(ptr);
+ }
+ }
+ }
+
+ if (image->elfcorehdr_index < 0) {
+ pr_err("unable to locate elfcorehdr segment");
+ goto out;
+ }
+
+ /* Needed in order for the segments to be updated */
+ arch_kexec_unprotect_crashkres();
+
+ /* Differentiate between normal load and hotplug update */
+ image->hp_action = hp_action;
+
+ /* Now invoke arch-specific update handler */
+ arch_crash_handle_hotplug_event(image, arg);
+
+ /* No longer handling a hotplug event */
+ image->hp_action = KEXEC_CRASH_HP_NONE;
+ image->elfcorehdr_updated = true;
+
+ /* Change back to read-only */
+ arch_kexec_protect_crashkres();
+
+ /* Errors in the callback is not a reason to rollback state */
+out:
+ /* Release lock now that update complete */
+ kexec_unlock();
+ crash_hotplug_unlock();
+}
+
+static int crash_memhp_notifier(struct notifier_block *nb, unsigned long val, void *arg)
+{
+ switch (val) {
+ case MEM_ONLINE:
+ crash_handle_hotplug_event(KEXEC_CRASH_HP_ADD_MEMORY,
+ KEXEC_CRASH_HP_INVALID_CPU, arg);
+ break;
+
+ case MEM_OFFLINE:
+ crash_handle_hotplug_event(KEXEC_CRASH_HP_REMOVE_MEMORY,
+ KEXEC_CRASH_HP_INVALID_CPU, arg);
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block crash_memhp_nb = {
+ .notifier_call = crash_memhp_notifier,
+ .priority = 0
+};
+
+static int crash_cpuhp_online(unsigned int cpu)
+{
+ crash_handle_hotplug_event(KEXEC_CRASH_HP_ADD_CPU, cpu, NULL);
+ return 0;
+}
+
+static int crash_cpuhp_offline(unsigned int cpu)
+{
+ crash_handle_hotplug_event(KEXEC_CRASH_HP_REMOVE_CPU, cpu, NULL);
+ return 0;
+}
+
+static int __init crash_hotplug_init(void)
+{
+ int result = 0;
+
+ if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG))
+ register_memory_notifier(&crash_memhp_nb);
+
+ if (IS_ENABLED(CONFIG_HOTPLUG_CPU)) {
+ result = cpuhp_setup_state_nocalls(CPUHP_BP_PREPARE_DYN,
+ "crash/cpuhp", crash_cpuhp_online, crash_cpuhp_offline);
+ }
+
+ return result;
+}
+
+subsys_initcall(crash_hotplug_init);
+#endif
diff --git a/kernel/crash_core_test.c b/kernel/crash_core_test.c
new file mode 100644
index 000000000000..8aadf6801530
--- /dev/null
+++ b/kernel/crash_core_test.c
@@ -0,0 +1,343 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <kunit/test.h>
+#include <linux/crash_core.h> // For struct crash_mem and struct range if defined there
+
+// Helper to create and initialize crash_mem
+static struct crash_mem *create_crash_mem(struct kunit *test, unsigned int max_ranges,
+ unsigned int nr_initial_ranges,
+ const struct range *initial_ranges)
+{
+ struct crash_mem *mem;
+ size_t alloc_size;
+
+ // Check if max_ranges can even hold initial_ranges
+ if (max_ranges < nr_initial_ranges) {
+ kunit_err(test, "max_ranges (%u) < nr_initial_ranges (%u)\n",
+ max_ranges, nr_initial_ranges);
+ return NULL;
+ }
+
+ alloc_size = sizeof(struct crash_mem) + (size_t)max_ranges * sizeof(struct range);
+ mem = kunit_kzalloc(test, alloc_size, GFP_KERNEL);
+ if (!mem) {
+ kunit_err(test, "Failed to allocate crash_mem\n");
+ return NULL;
+ }
+
+ mem->max_nr_ranges = max_ranges;
+ mem->nr_ranges = nr_initial_ranges;
+ if (initial_ranges && nr_initial_ranges > 0) {
+ memcpy(mem->ranges, initial_ranges,
+ nr_initial_ranges * sizeof(struct range));
+ }
+
+ return mem;
+}
+
+// Helper to compare ranges for assertions
+static void assert_ranges_equal(struct kunit *test,
+ const struct range *actual_ranges,
+ unsigned int actual_nr_ranges,
+ const struct range *expected_ranges,
+ unsigned int expected_nr_ranges,
+ const char *case_name)
+{
+ unsigned int i;
+
+ KUNIT_ASSERT_EQ_MSG(test, expected_nr_ranges, actual_nr_ranges,
+ "%s: Number of ranges mismatch.", case_name);
+
+ for (i = 0; i < expected_nr_ranges; i++) {
+ KUNIT_ASSERT_EQ_MSG(test, expected_ranges[i].start, actual_ranges[i].start,
+ "%s: Range %u start mismatch.", case_name, i);
+ KUNIT_ASSERT_EQ_MSG(test, expected_ranges[i].end, actual_ranges[i].end,
+ "%s: Range %u end mismatch.", case_name, i);
+ }
+}
+
+// Structure for test parameters
+struct exclude_test_param {
+ const char *description;
+ unsigned long long exclude_start;
+ unsigned long long exclude_end;
+ unsigned int initial_max_ranges;
+ const struct range *initial_ranges;
+ unsigned int initial_nr_ranges;
+ const struct range *expected_ranges;
+ unsigned int expected_nr_ranges;
+ int expected_ret;
+};
+
+static void run_exclude_test_case(struct kunit *test, const struct exclude_test_param *params)
+{
+ struct crash_mem *mem;
+ int ret;
+
+ kunit_info(test, "%s", params->description);
+
+ mem = create_crash_mem(test, params->initial_max_ranges,
+ params->initial_nr_ranges, params->initial_ranges);
+ if (!mem)
+ return; // Error already logged by create_crash_mem or kunit_kzalloc
+
+ ret = crash_exclude_mem_range(mem, params->exclude_start, params->exclude_end);
+
+ KUNIT_ASSERT_EQ_MSG(test, params->expected_ret, ret,
+ "%s: Return value mismatch.", params->description);
+
+ if (params->expected_ret == 0) {
+ assert_ranges_equal(test, mem->ranges, mem->nr_ranges,
+ params->expected_ranges, params->expected_nr_ranges,
+ params->description);
+ } else {
+ // If an error is expected, nr_ranges might still be relevant to check
+ // depending on the exact point of failure. For ENOMEM on split,
+ // nr_ranges shouldn't have changed.
+ KUNIT_ASSERT_EQ_MSG(test, params->initial_nr_ranges,
+ mem->nr_ranges,
+ "%s: Number of ranges mismatch on error.",
+ params->description);
+ }
+}
+
+/*
+ * Test Strategy 1: One to-be-excluded range A and one existing range B.
+ *
+ * Exhaust all possibilities of the position of A regarding B.
+ */
+
+static const struct range single_range_b = { .start = 100, .end = 199 };
+
+static const struct exclude_test_param exclude_single_range_test_data[] = {
+ {
+ .description = "1.1: A is left of B, no overlap",
+ .exclude_start = 10, .exclude_end = 50,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = &single_range_b, .expected_nr_ranges = 1,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.2: A's right boundary touches B's left boundary",
+ .exclude_start = 10, .exclude_end = 99,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = &single_range_b, .expected_nr_ranges = 1,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.3: A overlaps B's left part",
+ .exclude_start = 50, .exclude_end = 149,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = (const struct range[]){{ .start = 150, .end = 199 }},
+ .expected_nr_ranges = 1,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.4: A is completely inside B",
+ .exclude_start = 120, .exclude_end = 179,
+ .initial_max_ranges = 2, // Needs space for split
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = (const struct range[]){
+ { .start = 100, .end = 119 },
+ { .start = 180, .end = 199 }
+ },
+ .expected_nr_ranges = 2,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.5: A overlaps B's right part",
+ .exclude_start = 150, .exclude_end = 249,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = (const struct range[]){{ .start = 100, .end = 149 }},
+ .expected_nr_ranges = 1,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.6: A's left boundary touches B's right boundary",
+ .exclude_start = 200, .exclude_end = 250,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = &single_range_b, .expected_nr_ranges = 1,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.7: A is right of B, no overlap",
+ .exclude_start = 250, .exclude_end = 300,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = &single_range_b, .expected_nr_ranges = 1,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.8: A completely covers B and extends beyond",
+ .exclude_start = 50, .exclude_end = 250,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = NULL, .expected_nr_ranges = 0,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.9: A covers B and extends to the left",
+ .exclude_start = 50, .exclude_end = 199, // A ends exactly where B ends
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = NULL, .expected_nr_ranges = 0,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.10: A covers B and extends to the right",
+ .exclude_start = 100, .exclude_end = 250, // A starts exactly where B starts
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = NULL, .expected_nr_ranges = 0,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.11: A is identical to B",
+ .exclude_start = 100, .exclude_end = 199,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = NULL, .expected_nr_ranges = 0,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.12: A is a point, left of B, no overlap",
+ .exclude_start = 10, .exclude_end = 10,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = &single_range_b, .expected_nr_ranges = 1,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.13: A is a point, at start of B",
+ .exclude_start = 100, .exclude_end = 100,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = (const struct range[]){{ .start = 101, .end = 199 }},
+ .expected_nr_ranges = 1,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.14: A is a point, in middle of B (causes split)",
+ .exclude_start = 150, .exclude_end = 150,
+ .initial_max_ranges = 2, // Needs space for split
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = (const struct range[]){
+ { .start = 100, .end = 149 },
+ { .start = 151, .end = 199 }
+ },
+ .expected_nr_ranges = 2,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.15: A is a point, at end of B",
+ .exclude_start = 199, .exclude_end = 199,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = (const struct range[]){{ .start = 100, .end = 198 }},
+ .expected_nr_ranges = 1,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.16: A is a point, right of B, no overlap",
+ .exclude_start = 250, .exclude_end = 250,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = &single_range_b, .expected_nr_ranges = 1,
+ .expected_ret = 0,
+ },
+ // ENOMEM case for single range split
+ {
+ .description = "1.17: A completely inside B (split), no space (ENOMEM)",
+ .exclude_start = 120, .exclude_end = 179,
+ .initial_max_ranges = 1, // Not enough for split
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = NULL, // Not checked on error by assert_ranges_equal for content
+ .expected_nr_ranges = 1, // Should remain unchanged
+ .expected_ret = -ENOMEM,
+ },
+};
+
+
+static void exclude_single_range_test(struct kunit *test)
+{
+ size_t i;
+
+ for (i = 0; i < ARRAY_SIZE(exclude_single_range_test_data); i++) {
+ kunit_log(KERN_INFO, test, "Running: %s", exclude_single_range_test_data[i].description);
+ run_exclude_test_case(test, &exclude_single_range_test_data[i]);
+ // KUnit will stop on first KUNIT_ASSERT failure within run_exclude_test_case
+ }
+}
+
+/*
+ * Test Strategy 2: Regression test.
+ */
+
+static const struct exclude_test_param exclude_range_regression_test_data[] = {
+ // Test data from commit a2e9a95d2190
+ {
+ .description = "2.1: exclude low 1M",
+ .exclude_start = 0, .exclude_end = (1 << 20) - 1,
+ .initial_max_ranges = 3,
+ .initial_ranges = (const struct range[]){
+ { .start = 0, .end = 0x3efff },
+ { .start = 0x3f000, .end = 0x3ffff },
+ { .start = 0x40000, .end = 0x9ffff }
+ },
+ .initial_nr_ranges = 3,
+ .expected_nr_ranges = 0,
+ .expected_ret = 0,
+ },
+ // Test data from https://lore.kernel.org/all/ZXrY7QbXAlxydsSC@MiWiFi-R3L-srv/T/#u
+ {
+ .description = "2.2: when range out of bound",
+ .exclude_start = 100, .exclude_end = 200,
+ .initial_max_ranges = 3,
+ .initial_ranges = (const struct range[]){
+ { .start = 1, .end = 299 },
+ { .start = 401, .end = 1000 },
+ { .start = 1001, .end = 2000 }
+ },
+ .initial_nr_ranges = 3,
+ .expected_ranges = NULL, // Not checked on error by assert_ranges_equal for content
+ .expected_nr_ranges = 3, // Should remain unchanged
+ .expected_ret = -ENOMEM
+ },
+
+};
+
+
+static void exclude_range_regression_test(struct kunit *test)
+{
+ size_t i;
+
+ for (i = 0; i < ARRAY_SIZE(exclude_range_regression_test_data); i++) {
+ kunit_log(KERN_INFO, test, "Running: %s", exclude_range_regression_test_data[i].description);
+ run_exclude_test_case(test, &exclude_range_regression_test_data[i]);
+ // KUnit will stop on first KUNIT_ASSERT failure within run_exclude_test_case
+ }
+}
+
+/*
+ * KUnit Test Suite
+ */
+static struct kunit_case crash_exclude_mem_range_test_cases[] = {
+ KUNIT_CASE(exclude_single_range_test),
+ KUNIT_CASE(exclude_range_regression_test),
+ {}
+};
+
+static struct kunit_suite crash_exclude_mem_range_suite = {
+ .name = "crash_exclude_mem_range_tests",
+ .test_cases = crash_exclude_mem_range_test_cases,
+ // .init and .exit can be NULL if not needed globally for the suite
+};
+
+kunit_test_suite(crash_exclude_mem_range_suite);
+
+MODULE_DESCRIPTION("crash dump KUnit test suite");
+MODULE_LICENSE("GPL");
diff --git a/kernel/crash_dump_dm_crypt.c b/kernel/crash_dump_dm_crypt.c
new file mode 100644
index 000000000000..401423ba477d
--- /dev/null
+++ b/kernel/crash_dump_dm_crypt.c
@@ -0,0 +1,464 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/key.h>
+#include <linux/keyctl.h>
+#include <keys/user-type.h>
+#include <linux/crash_dump.h>
+#include <linux/cc_platform.h>
+#include <linux/configfs.h>
+#include <linux/module.h>
+
+#define KEY_NUM_MAX 128 /* maximum dm crypt keys */
+#define KEY_SIZE_MAX 256 /* maximum dm crypt key size */
+#define KEY_DESC_MAX_LEN 128 /* maximum dm crypt key description size */
+
+static unsigned int key_count;
+
+struct dm_crypt_key {
+ unsigned int key_size;
+ char key_desc[KEY_DESC_MAX_LEN];
+ u8 data[KEY_SIZE_MAX];
+};
+
+static struct keys_header {
+ unsigned int total_keys;
+ struct dm_crypt_key keys[] __counted_by(total_keys);
+} *keys_header;
+
+static size_t get_keys_header_size(size_t total_keys)
+{
+ return struct_size(keys_header, keys, total_keys);
+}
+
+unsigned long long dm_crypt_keys_addr;
+EXPORT_SYMBOL_GPL(dm_crypt_keys_addr);
+
+static int __init setup_dmcryptkeys(char *arg)
+{
+ char *end;
+
+ if (!arg)
+ return -EINVAL;
+ dm_crypt_keys_addr = memparse(arg, &end);
+ if (end > arg)
+ return 0;
+
+ dm_crypt_keys_addr = 0;
+ return -EINVAL;
+}
+
+early_param("dmcryptkeys", setup_dmcryptkeys);
+
+/*
+ * Architectures may override this function to read dm crypt keys
+ */
+ssize_t __weak dm_crypt_keys_read(char *buf, size_t count, u64 *ppos)
+{
+ struct kvec kvec = { .iov_base = buf, .iov_len = count };
+ struct iov_iter iter;
+
+ iov_iter_kvec(&iter, READ, &kvec, 1, count);
+ return read_from_oldmem(&iter, count, ppos, cc_platform_has(CC_ATTR_MEM_ENCRYPT));
+}
+
+static int add_key_to_keyring(struct dm_crypt_key *dm_key,
+ key_ref_t keyring_ref)
+{
+ key_ref_t key_ref;
+ int r;
+
+ /* create or update the requested key and add it to the target keyring */
+ key_ref = key_create_or_update(keyring_ref, "user", dm_key->key_desc,
+ dm_key->data, dm_key->key_size,
+ KEY_USR_ALL, KEY_ALLOC_IN_QUOTA);
+
+ if (!IS_ERR(key_ref)) {
+ r = key_ref_to_ptr(key_ref)->serial;
+ key_ref_put(key_ref);
+ kexec_dprintk("Success adding key %s", dm_key->key_desc);
+ } else {
+ r = PTR_ERR(key_ref);
+ kexec_dprintk("Error when adding key");
+ }
+
+ key_ref_put(keyring_ref);
+ return r;
+}
+
+static void get_keys_from_kdump_reserved_memory(void)
+{
+ struct keys_header *keys_header_loaded;
+
+ arch_kexec_unprotect_crashkres();
+
+ keys_header_loaded = kmap_local_page(pfn_to_page(
+ kexec_crash_image->dm_crypt_keys_addr >> PAGE_SHIFT));
+
+ memcpy(keys_header, keys_header_loaded, get_keys_header_size(key_count));
+ kunmap_local(keys_header_loaded);
+ arch_kexec_protect_crashkres();
+}
+
+static int restore_dm_crypt_keys_to_thread_keyring(void)
+{
+ struct dm_crypt_key *key;
+ size_t keys_header_size;
+ key_ref_t keyring_ref;
+ u64 addr;
+
+ /* find the target keyring (which must be writable) */
+ keyring_ref =
+ lookup_user_key(KEY_SPEC_USER_KEYRING, 0x01, KEY_NEED_WRITE);
+ if (IS_ERR(keyring_ref)) {
+ kexec_dprintk("Failed to get the user keyring\n");
+ return PTR_ERR(keyring_ref);
+ }
+
+ addr = dm_crypt_keys_addr;
+ dm_crypt_keys_read((char *)&key_count, sizeof(key_count), &addr);
+ if (key_count < 0 || key_count > KEY_NUM_MAX) {
+ kexec_dprintk("Failed to read the number of dm-crypt keys\n");
+ return -1;
+ }
+
+ kexec_dprintk("There are %u keys\n", key_count);
+ addr = dm_crypt_keys_addr;
+
+ keys_header_size = get_keys_header_size(key_count);
+ keys_header = kzalloc(keys_header_size, GFP_KERNEL);
+ if (!keys_header)
+ return -ENOMEM;
+
+ dm_crypt_keys_read((char *)keys_header, keys_header_size, &addr);
+
+ for (int i = 0; i < keys_header->total_keys; i++) {
+ key = &keys_header->keys[i];
+ kexec_dprintk("Get key (size=%u)\n", key->key_size);
+ add_key_to_keyring(key, keyring_ref);
+ }
+
+ return 0;
+}
+
+static int read_key_from_user_keying(struct dm_crypt_key *dm_key)
+{
+ const struct user_key_payload *ukp;
+ struct key *key;
+
+ kexec_dprintk("Requesting logon key %s", dm_key->key_desc);
+ key = request_key(&key_type_logon, dm_key->key_desc, NULL);
+
+ if (IS_ERR(key)) {
+ pr_warn("No such logon key %s\n", dm_key->key_desc);
+ return PTR_ERR(key);
+ }
+
+ ukp = user_key_payload_locked(key);
+ if (!ukp)
+ return -EKEYREVOKED;
+
+ if (ukp->datalen > KEY_SIZE_MAX) {
+ pr_err("Key size %u exceeds maximum (%u)\n", ukp->datalen, KEY_SIZE_MAX);
+ return -EINVAL;
+ }
+
+ memcpy(dm_key->data, ukp->data, ukp->datalen);
+ dm_key->key_size = ukp->datalen;
+ kexec_dprintk("Get dm crypt key (size=%u) %s: %8ph\n", dm_key->key_size,
+ dm_key->key_desc, dm_key->data);
+ return 0;
+}
+
+struct config_key {
+ struct config_item item;
+ const char *description;
+};
+
+static inline struct config_key *to_config_key(struct config_item *item)
+{
+ return container_of(item, struct config_key, item);
+}
+
+static ssize_t config_key_description_show(struct config_item *item, char *page)
+{
+ return sprintf(page, "%s\n", to_config_key(item)->description);
+}
+
+static ssize_t config_key_description_store(struct config_item *item,
+ const char *page, size_t count)
+{
+ struct config_key *config_key = to_config_key(item);
+ size_t len;
+ int ret;
+
+ ret = -EINVAL;
+ len = strcspn(page, "\n");
+
+ if (len > KEY_DESC_MAX_LEN) {
+ pr_err("The key description shouldn't exceed %u characters", KEY_DESC_MAX_LEN);
+ return ret;
+ }
+
+ if (!len)
+ return ret;
+
+ kfree(config_key->description);
+ ret = -ENOMEM;
+ config_key->description = kmemdup_nul(page, len, GFP_KERNEL);
+ if (!config_key->description)
+ return ret;
+
+ return count;
+}
+
+CONFIGFS_ATTR(config_key_, description);
+
+static struct configfs_attribute *config_key_attrs[] = {
+ &config_key_attr_description,
+ NULL,
+};
+
+static void config_key_release(struct config_item *item)
+{
+ kfree(to_config_key(item));
+ key_count--;
+}
+
+static struct configfs_item_operations config_key_item_ops = {
+ .release = config_key_release,
+};
+
+static const struct config_item_type config_key_type = {
+ .ct_item_ops = &config_key_item_ops,
+ .ct_attrs = config_key_attrs,
+ .ct_owner = THIS_MODULE,
+};
+
+static struct config_item *config_keys_make_item(struct config_group *group,
+ const char *name)
+{
+ struct config_key *config_key;
+
+ if (key_count > KEY_NUM_MAX) {
+ pr_err("Only %u keys at maximum to be created\n", KEY_NUM_MAX);
+ return ERR_PTR(-EINVAL);
+ }
+
+ config_key = kzalloc(sizeof(struct config_key), GFP_KERNEL);
+ if (!config_key)
+ return ERR_PTR(-ENOMEM);
+
+ config_item_init_type_name(&config_key->item, name, &config_key_type);
+
+ key_count++;
+
+ return &config_key->item;
+}
+
+static ssize_t config_keys_count_show(struct config_item *item, char *page)
+{
+ return sprintf(page, "%d\n", key_count);
+}
+
+CONFIGFS_ATTR_RO(config_keys_, count);
+
+static bool is_dm_key_reused;
+
+static ssize_t config_keys_reuse_show(struct config_item *item, char *page)
+{
+ return sprintf(page, "%d\n", is_dm_key_reused);
+}
+
+static ssize_t config_keys_reuse_store(struct config_item *item,
+ const char *page, size_t count)
+{
+ if (!kexec_crash_image || !kexec_crash_image->dm_crypt_keys_addr) {
+ kexec_dprintk(
+ "dm-crypt keys haven't be saved to crash-reserved memory\n");
+ return -EINVAL;
+ }
+
+ if (kstrtobool(page, &is_dm_key_reused))
+ return -EINVAL;
+
+ if (is_dm_key_reused)
+ get_keys_from_kdump_reserved_memory();
+
+ return count;
+}
+
+CONFIGFS_ATTR(config_keys_, reuse);
+
+static struct configfs_attribute *config_keys_attrs[] = {
+ &config_keys_attr_count,
+ &config_keys_attr_reuse,
+ NULL,
+};
+
+/*
+ * Note that, since no extra work is required on ->drop_item(),
+ * no ->drop_item() is provided.
+ */
+static struct configfs_group_operations config_keys_group_ops = {
+ .make_item = config_keys_make_item,
+};
+
+static const struct config_item_type config_keys_type = {
+ .ct_group_ops = &config_keys_group_ops,
+ .ct_attrs = config_keys_attrs,
+ .ct_owner = THIS_MODULE,
+};
+
+static bool restore;
+
+static ssize_t config_keys_restore_show(struct config_item *item, char *page)
+{
+ return sprintf(page, "%d\n", restore);
+}
+
+static ssize_t config_keys_restore_store(struct config_item *item,
+ const char *page, size_t count)
+{
+ if (!restore)
+ restore_dm_crypt_keys_to_thread_keyring();
+
+ if (kstrtobool(page, &restore))
+ return -EINVAL;
+
+ return count;
+}
+
+CONFIGFS_ATTR(config_keys_, restore);
+
+static struct configfs_attribute *kdump_config_keys_attrs[] = {
+ &config_keys_attr_restore,
+ NULL,
+};
+
+static const struct config_item_type kdump_config_keys_type = {
+ .ct_attrs = kdump_config_keys_attrs,
+ .ct_owner = THIS_MODULE,
+};
+
+static struct configfs_subsystem config_keys_subsys = {
+ .su_group = {
+ .cg_item = {
+ .ci_namebuf = "crash_dm_crypt_keys",
+ .ci_type = &config_keys_type,
+ },
+ },
+};
+
+static int build_keys_header(void)
+{
+ struct config_item *item = NULL;
+ struct config_key *key;
+ int i, r;
+
+ if (keys_header != NULL)
+ kvfree(keys_header);
+
+ keys_header = kzalloc(get_keys_header_size(key_count), GFP_KERNEL);
+ if (!keys_header)
+ return -ENOMEM;
+
+ keys_header->total_keys = key_count;
+
+ i = 0;
+ list_for_each_entry(item, &config_keys_subsys.su_group.cg_children,
+ ci_entry) {
+ if (item->ci_type != &config_key_type)
+ continue;
+
+ key = to_config_key(item);
+
+ if (!key->description) {
+ pr_warn("No key description for key %s\n", item->ci_name);
+ return -EINVAL;
+ }
+
+ strscpy(keys_header->keys[i].key_desc, key->description,
+ KEY_DESC_MAX_LEN);
+ r = read_key_from_user_keying(&keys_header->keys[i]);
+ if (r != 0) {
+ kexec_dprintk("Failed to read key %s\n",
+ keys_header->keys[i].key_desc);
+ return r;
+ }
+ i++;
+ kexec_dprintk("Found key: %s\n", item->ci_name);
+ }
+
+ return 0;
+}
+
+int crash_load_dm_crypt_keys(struct kimage *image)
+{
+ struct kexec_buf kbuf = {
+ .image = image,
+ .buf_min = 0,
+ .buf_max = ULONG_MAX,
+ .top_down = false,
+ .random = true,
+ };
+ int r;
+
+
+ if (key_count <= 0) {
+ kexec_dprintk("No dm-crypt keys\n");
+ return -ENOENT;
+ }
+
+ if (!is_dm_key_reused) {
+ image->dm_crypt_keys_addr = 0;
+ r = build_keys_header();
+ if (r)
+ return r;
+ }
+
+ kbuf.buffer = keys_header;
+ kbuf.bufsz = get_keys_header_size(key_count);
+
+ kbuf.memsz = kbuf.bufsz;
+ kbuf.buf_align = ELF_CORE_HEADER_ALIGN;
+ kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
+ r = kexec_add_buffer(&kbuf);
+ if (r) {
+ kvfree((void *)kbuf.buffer);
+ return r;
+ }
+ image->dm_crypt_keys_addr = kbuf.mem;
+ image->dm_crypt_keys_sz = kbuf.bufsz;
+ kexec_dprintk(
+ "Loaded dm crypt keys to kexec_buffer bufsz=0x%lx memsz=0x%lx\n",
+ kbuf.bufsz, kbuf.memsz);
+
+ return r;
+}
+
+static int __init configfs_dmcrypt_keys_init(void)
+{
+ int ret;
+
+ if (is_kdump_kernel()) {
+ config_keys_subsys.su_group.cg_item.ci_type =
+ &kdump_config_keys_type;
+ }
+
+ config_group_init(&config_keys_subsys.su_group);
+ mutex_init(&config_keys_subsys.su_mutex);
+ ret = configfs_register_subsystem(&config_keys_subsys);
+ if (ret) {
+ pr_err("Error %d while registering subsystem %s\n", ret,
+ config_keys_subsys.su_group.cg_item.ci_namebuf);
+ goto out_unregister;
+ }
+
+ return 0;
+
+out_unregister:
+ configfs_unregister_subsystem(&config_keys_subsys);
+
+ return ret;
+}
+
+module_init(configfs_dmcrypt_keys_init);
diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c
new file mode 100644
index 000000000000..62e60e0223cf
--- /dev/null
+++ b/kernel/crash_reserve.c
@@ -0,0 +1,540 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * crash.c - kernel crash support code.
+ * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
+ */
+
+#include <linux/buildid.h>
+#include <linux/init.h>
+#include <linux/utsname.h>
+#include <linux/vmalloc.h>
+#include <linux/sizes.h>
+#include <linux/kexec.h>
+#include <linux/memory.h>
+#include <linux/cpuhotplug.h>
+#include <linux/memblock.h>
+#include <linux/kmemleak.h>
+#include <linux/cma.h>
+#include <linux/crash_reserve.h>
+
+#include <asm/page.h>
+#include <asm/sections.h>
+
+#include <crypto/sha1.h>
+
+#include "kallsyms_internal.h"
+#include "kexec_internal.h"
+
+/* Location of the reserved area for the crash kernel */
+struct resource crashk_res = {
+ .name = "Crash kernel",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
+ .desc = IORES_DESC_CRASH_KERNEL
+};
+struct resource crashk_low_res = {
+ .name = "Crash kernel",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
+ .desc = IORES_DESC_CRASH_KERNEL
+};
+
+/*
+ * parsing the "crashkernel" commandline
+ *
+ * this code is intended to be called from architecture specific code
+ */
+
+
+/*
+ * This function parses command lines in the format
+ *
+ * crashkernel=ramsize-range:size[,...][@offset]
+ *
+ * The function returns 0 on success and -EINVAL on failure.
+ */
+static int __init parse_crashkernel_mem(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
+{
+ char *cur = cmdline, *tmp;
+ unsigned long long total_mem = system_ram;
+
+ /*
+ * Firmware sometimes reserves some memory regions for its own use,
+ * so the system memory size is less than the actual physical memory
+ * size. Work around this by rounding up the total size to 128M,
+ * which is enough for most test cases.
+ */
+ total_mem = roundup(total_mem, SZ_128M);
+
+ /* for each entry of the comma-separated list */
+ do {
+ unsigned long long start, end = ULLONG_MAX, size;
+
+ /* get the start of the range */
+ start = memparse(cur, &tmp);
+ if (cur == tmp) {
+ pr_warn("crashkernel: Memory value expected\n");
+ return -EINVAL;
+ }
+ cur = tmp;
+ if (*cur != '-') {
+ pr_warn("crashkernel: '-' expected\n");
+ return -EINVAL;
+ }
+ cur++;
+
+ /* if no ':' is here, than we read the end */
+ if (*cur != ':') {
+ end = memparse(cur, &tmp);
+ if (cur == tmp) {
+ pr_warn("crashkernel: Memory value expected\n");
+ return -EINVAL;
+ }
+ cur = tmp;
+ if (end <= start) {
+ pr_warn("crashkernel: end <= start\n");
+ return -EINVAL;
+ }
+ }
+
+ if (*cur != ':') {
+ pr_warn("crashkernel: ':' expected\n");
+ return -EINVAL;
+ }
+ cur++;
+
+ size = memparse(cur, &tmp);
+ if (cur == tmp) {
+ pr_warn("crashkernel: Memory value expected\n");
+ return -EINVAL;
+ }
+ cur = tmp;
+ if (size >= total_mem) {
+ pr_warn("crashkernel: invalid size\n");
+ return -EINVAL;
+ }
+
+ /* match ? */
+ if (total_mem >= start && total_mem < end) {
+ *crash_size = size;
+ break;
+ }
+ } while (*cur++ == ',');
+
+ if (*crash_size > 0) {
+ while (*cur && *cur != ' ' && *cur != '@')
+ cur++;
+ if (*cur == '@') {
+ cur++;
+ *crash_base = memparse(cur, &tmp);
+ if (cur == tmp) {
+ pr_warn("crashkernel: Memory value expected after '@'\n");
+ return -EINVAL;
+ }
+ }
+ } else
+ pr_info("crashkernel size resulted in zero bytes\n");
+
+ return 0;
+}
+
+/*
+ * That function parses "simple" (old) crashkernel command lines like
+ *
+ * crashkernel=size[@offset]
+ *
+ * It returns 0 on success and -EINVAL on failure.
+ */
+static int __init parse_crashkernel_simple(char *cmdline,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
+{
+ char *cur = cmdline;
+
+ *crash_size = memparse(cmdline, &cur);
+ if (cmdline == cur) {
+ pr_warn("crashkernel: memory value expected\n");
+ return -EINVAL;
+ }
+
+ if (*cur == '@')
+ *crash_base = memparse(cur+1, &cur);
+ else if (*cur != ' ' && *cur != '\0') {
+ pr_warn("crashkernel: unrecognized char: %c\n", *cur);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+#define SUFFIX_HIGH 0
+#define SUFFIX_LOW 1
+#define SUFFIX_CMA 2
+#define SUFFIX_NULL 3
+static __initdata char *suffix_tbl[] = {
+ [SUFFIX_HIGH] = ",high",
+ [SUFFIX_LOW] = ",low",
+ [SUFFIX_CMA] = ",cma",
+ [SUFFIX_NULL] = NULL,
+};
+
+/*
+ * That function parses "suffix" crashkernel command lines like
+ *
+ * crashkernel=size,[high|low|cma]
+ *
+ * It returns 0 on success and -EINVAL on failure.
+ */
+static int __init parse_crashkernel_suffix(char *cmdline,
+ unsigned long long *crash_size,
+ const char *suffix)
+{
+ char *cur = cmdline;
+
+ *crash_size = memparse(cmdline, &cur);
+ if (cmdline == cur) {
+ pr_warn("crashkernel: memory value expected\n");
+ return -EINVAL;
+ }
+
+ /* check with suffix */
+ if (strncmp(cur, suffix, strlen(suffix))) {
+ pr_warn("crashkernel: unrecognized char: %c\n", *cur);
+ return -EINVAL;
+ }
+ cur += strlen(suffix);
+ if (*cur != ' ' && *cur != '\0') {
+ pr_warn("crashkernel: unrecognized char: %c\n", *cur);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static __init char *get_last_crashkernel(char *cmdline,
+ const char *name,
+ const char *suffix)
+{
+ char *p = cmdline, *ck_cmdline = NULL;
+
+ /* find crashkernel and use the last one if there are more */
+ p = strstr(p, name);
+ while (p) {
+ char *end_p = strchr(p, ' ');
+ char *q;
+
+ if (!end_p)
+ end_p = p + strlen(p);
+
+ if (!suffix) {
+ int i;
+
+ /* skip the one with any known suffix */
+ for (i = 0; suffix_tbl[i]; i++) {
+ q = end_p - strlen(suffix_tbl[i]);
+ if (!strncmp(q, suffix_tbl[i],
+ strlen(suffix_tbl[i])))
+ goto next;
+ }
+ ck_cmdline = p;
+ } else {
+ q = end_p - strlen(suffix);
+ if (!strncmp(q, suffix, strlen(suffix)))
+ ck_cmdline = p;
+ }
+next:
+ p = strstr(p+1, name);
+ }
+
+ return ck_cmdline;
+}
+
+static int __init __parse_crashkernel(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base,
+ const char *suffix)
+{
+ char *first_colon, *first_space;
+ char *ck_cmdline;
+ char *name = "crashkernel=";
+
+ BUG_ON(!crash_size || !crash_base);
+ *crash_size = 0;
+ *crash_base = 0;
+
+ ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
+ if (!ck_cmdline)
+ return -ENOENT;
+
+ ck_cmdline += strlen(name);
+
+ if (suffix)
+ return parse_crashkernel_suffix(ck_cmdline, crash_size,
+ suffix);
+ /*
+ * if the commandline contains a ':', then that's the extended
+ * syntax -- if not, it must be the classic syntax
+ */
+ first_colon = strchr(ck_cmdline, ':');
+ first_space = strchr(ck_cmdline, ' ');
+ if (first_colon && (!first_space || first_colon < first_space))
+ return parse_crashkernel_mem(ck_cmdline, system_ram,
+ crash_size, crash_base);
+
+ return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base);
+}
+
+/*
+ * That function is the entry point for command line parsing and should be
+ * called from the arch-specific code.
+ *
+ * If crashkernel=,high|low is supported on architecture, non-NULL values
+ * should be passed to parameters 'low_size' and 'high'.
+ */
+int __init parse_crashkernel(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base,
+ unsigned long long *low_size,
+ unsigned long long *cma_size,
+ bool *high)
+{
+ int ret;
+ unsigned long long __always_unused cma_base;
+
+ /* crashkernel=X[@offset] */
+ ret = __parse_crashkernel(cmdline, system_ram, crash_size,
+ crash_base, NULL);
+#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+ /*
+ * If non-NULL 'high' passed in and no normal crashkernel
+ * setting detected, try parsing crashkernel=,high|low.
+ */
+ if (high && ret == -ENOENT) {
+ ret = __parse_crashkernel(cmdline, 0, crash_size,
+ crash_base, suffix_tbl[SUFFIX_HIGH]);
+ if (ret || !*crash_size)
+ return -EINVAL;
+
+ /*
+ * crashkernel=Y,low can be specified or not, but invalid value
+ * is not allowed.
+ */
+ ret = __parse_crashkernel(cmdline, 0, low_size,
+ crash_base, suffix_tbl[SUFFIX_LOW]);
+ if (ret == -ENOENT) {
+ *low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE;
+ ret = 0;
+ } else if (ret) {
+ return ret;
+ }
+
+ *high = true;
+ }
+
+ /*
+ * optional CMA reservation
+ * cma_base is ignored
+ */
+ if (cma_size)
+ __parse_crashkernel(cmdline, 0, cma_size,
+ &cma_base, suffix_tbl[SUFFIX_CMA]);
+#endif
+ if (!*crash_size)
+ ret = -EINVAL;
+
+ if (*crash_size >= system_ram)
+ ret = -EINVAL;
+
+ return ret;
+}
+
+/*
+ * Add a dummy early_param handler to mark crashkernel= as a known command line
+ * parameter and suppress incorrect warnings in init/main.c.
+ */
+static int __init parse_crashkernel_dummy(char *arg)
+{
+ return 0;
+}
+early_param("crashkernel", parse_crashkernel_dummy);
+
+#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+static int __init reserve_crashkernel_low(unsigned long long low_size)
+{
+#ifdef CONFIG_64BIT
+ unsigned long long low_base;
+
+ low_base = memblock_phys_alloc_range(low_size, CRASH_ALIGN, 0, CRASH_ADDR_LOW_MAX);
+ if (!low_base) {
+ pr_err("cannot allocate crashkernel low memory (size:0x%llx).\n", low_size);
+ return -ENOMEM;
+ }
+
+ pr_info("crashkernel low memory reserved: 0x%08llx - 0x%08llx (%lld MB)\n",
+ low_base, low_base + low_size, low_size >> 20);
+
+ crashk_low_res.start = low_base;
+ crashk_low_res.end = low_base + low_size - 1;
+#ifdef HAVE_ARCH_ADD_CRASH_RES_TO_IOMEM_EARLY
+ insert_resource(&iomem_resource, &crashk_low_res);
+#endif
+#endif
+ return 0;
+}
+
+void __init reserve_crashkernel_generic(unsigned long long crash_size,
+ unsigned long long crash_base,
+ unsigned long long crash_low_size,
+ bool high)
+{
+ unsigned long long search_end = CRASH_ADDR_LOW_MAX, search_base = 0;
+ bool fixed_base = false;
+
+ /* User specifies base address explicitly. */
+ if (crash_base) {
+ fixed_base = true;
+ search_base = crash_base;
+ search_end = crash_base + crash_size;
+ } else if (high) {
+ search_base = CRASH_ADDR_LOW_MAX;
+ search_end = CRASH_ADDR_HIGH_MAX;
+ }
+
+retry:
+ crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN,
+ search_base, search_end);
+ if (!crash_base) {
+ /*
+ * For crashkernel=size[KMG]@offset[KMG], print out failure
+ * message if can't reserve the specified region.
+ */
+ if (fixed_base) {
+ pr_warn("crashkernel reservation failed - memory is in use.\n");
+ return;
+ }
+
+ /*
+ * For crashkernel=size[KMG], if the first attempt was for
+ * low memory, fall back to high memory, the minimum required
+ * low memory will be reserved later.
+ */
+ if (!high && search_end == CRASH_ADDR_LOW_MAX) {
+ search_end = CRASH_ADDR_HIGH_MAX;
+ search_base = CRASH_ADDR_LOW_MAX;
+ crash_low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE;
+ goto retry;
+ }
+
+ /*
+ * For crashkernel=size[KMG],high, if the first attempt was
+ * for high memory, fall back to low memory.
+ */
+ if (high && search_end == CRASH_ADDR_HIGH_MAX) {
+ search_end = CRASH_ADDR_LOW_MAX;
+ search_base = 0;
+ if (search_end != CRASH_ADDR_HIGH_MAX)
+ goto retry;
+ }
+ pr_warn("cannot allocate crashkernel (size:0x%llx)\n",
+ crash_size);
+ return;
+ }
+
+ if ((crash_base >= CRASH_ADDR_LOW_MAX) &&
+ crash_low_size && reserve_crashkernel_low(crash_low_size)) {
+ memblock_phys_free(crash_base, crash_size);
+ return;
+ }
+
+ pr_info("crashkernel reserved: 0x%016llx - 0x%016llx (%lld MB)\n",
+ crash_base, crash_base + crash_size, crash_size >> 20);
+
+ /*
+ * The crashkernel memory will be removed from the kernel linear
+ * map. Inform kmemleak so that it won't try to access it.
+ */
+ kmemleak_ignore_phys(crash_base);
+ if (crashk_low_res.end)
+ kmemleak_ignore_phys(crashk_low_res.start);
+
+ crashk_res.start = crash_base;
+ crashk_res.end = crash_base + crash_size - 1;
+#ifdef HAVE_ARCH_ADD_CRASH_RES_TO_IOMEM_EARLY
+ insert_resource(&iomem_resource, &crashk_res);
+#endif
+}
+
+struct range crashk_cma_ranges[CRASHKERNEL_CMA_RANGES_MAX];
+#ifdef CRASHKERNEL_CMA
+int crashk_cma_cnt;
+void __init reserve_crashkernel_cma(unsigned long long cma_size)
+{
+ unsigned long long request_size = roundup(cma_size, PAGE_SIZE);
+ unsigned long long reserved_size = 0;
+
+ if (!cma_size)
+ return;
+
+ while (cma_size > reserved_size &&
+ crashk_cma_cnt < CRASHKERNEL_CMA_RANGES_MAX) {
+
+ struct cma *res;
+
+ if (cma_declare_contiguous(0, request_size, 0, 0, 0, false,
+ "crashkernel", &res)) {
+ /* reservation failed, try half-sized blocks */
+ if (request_size <= PAGE_SIZE)
+ break;
+
+ request_size = roundup(request_size / 2, PAGE_SIZE);
+ continue;
+ }
+
+ crashk_cma_ranges[crashk_cma_cnt].start = cma_get_base(res);
+ crashk_cma_ranges[crashk_cma_cnt].end =
+ crashk_cma_ranges[crashk_cma_cnt].start +
+ cma_get_size(res) - 1;
+ ++crashk_cma_cnt;
+ reserved_size += request_size;
+ }
+
+ if (cma_size > reserved_size)
+ pr_warn("crashkernel CMA reservation failed: %lld MB requested, %lld MB reserved in %d ranges\n",
+ cma_size >> 20, reserved_size >> 20, crashk_cma_cnt);
+ else
+ pr_info("crashkernel CMA reserved: %lld MB in %d ranges\n",
+ reserved_size >> 20, crashk_cma_cnt);
+}
+
+#else /* CRASHKERNEL_CMA */
+void __init reserve_crashkernel_cma(unsigned long long cma_size)
+{
+ if (cma_size)
+ pr_warn("crashkernel CMA reservation not supported\n");
+}
+#endif
+
+#ifndef HAVE_ARCH_ADD_CRASH_RES_TO_IOMEM_EARLY
+static __init int insert_crashkernel_resources(void)
+{
+ if (!arch_add_crash_res_to_iomem())
+ return 0;
+
+ if (crashk_res.start < crashk_res.end)
+ insert_resource(&iomem_resource, &crashk_res);
+
+ if (crashk_low_res.start < crashk_low_res.end)
+ insert_resource(&iomem_resource, &crashk_low_res);
+
+ return 0;
+}
+early_initcall(insert_crashkernel_resources);
+#endif
+#endif
diff --git a/kernel/cred.c b/kernel/cred.c
index ec1c07667ec1..a6f686b30da1 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -1,89 +1,40 @@
-/* Task credentials management - see Documentation/security/credentials.txt
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Task credentials management - see Documentation/security/credentials.rst
*
* Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
*/
+
+#define pr_fmt(fmt) "CRED: " fmt
+
#include <linux/export.h>
#include <linux/cred.h>
#include <linux/slab.h>
#include <linux/sched.h>
+#include <linux/sched/coredump.h>
#include <linux/key.h>
#include <linux/keyctl.h>
#include <linux/init_task.h>
#include <linux/security.h>
#include <linux/binfmts.h>
#include <linux/cn_proc.h>
+#include <linux/uidgid.h>
#if 0
-#define kdebug(FMT, ...) \
- printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__)
+#define kdebug(FMT, ...) \
+ printk("[%-5.5s%5u] " FMT "\n", \
+ current->comm, current->pid, ##__VA_ARGS__)
#else
-#define kdebug(FMT, ...) \
- no_printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__)
+#define kdebug(FMT, ...) \
+do { \
+ if (0) \
+ no_printk("[%-5.5s%5u] " FMT "\n", \
+ current->comm, current->pid, ##__VA_ARGS__); \
+} while (0)
#endif
static struct kmem_cache *cred_jar;
-/* init to 2 - one for init_task, one to ensure it is never freed */
-struct group_info init_groups = { .usage = ATOMIC_INIT(2) };
-
-/*
- * The initial credentials for the initial task
- */
-struct cred init_cred = {
- .usage = ATOMIC_INIT(4),
-#ifdef CONFIG_DEBUG_CREDENTIALS
- .subscribers = ATOMIC_INIT(2),
- .magic = CRED_MAGIC,
-#endif
- .uid = GLOBAL_ROOT_UID,
- .gid = GLOBAL_ROOT_GID,
- .suid = GLOBAL_ROOT_UID,
- .sgid = GLOBAL_ROOT_GID,
- .euid = GLOBAL_ROOT_UID,
- .egid = GLOBAL_ROOT_GID,
- .fsuid = GLOBAL_ROOT_UID,
- .fsgid = GLOBAL_ROOT_GID,
- .securebits = SECUREBITS_DEFAULT,
- .cap_inheritable = CAP_EMPTY_SET,
- .cap_permitted = CAP_FULL_SET,
- .cap_effective = CAP_FULL_SET,
- .cap_bset = CAP_FULL_SET,
- .user = INIT_USER,
- .user_ns = &init_user_ns,
- .group_info = &init_groups,
-};
-
-static inline void set_cred_subscribers(struct cred *cred, int n)
-{
-#ifdef CONFIG_DEBUG_CREDENTIALS
- atomic_set(&cred->subscribers, n);
-#endif
-}
-
-static inline int read_cred_subscribers(const struct cred *cred)
-{
-#ifdef CONFIG_DEBUG_CREDENTIALS
- return atomic_read(&cred->subscribers);
-#else
- return 0;
-#endif
-}
-
-static inline void alter_cred_subscribers(const struct cred *_cred, int n)
-{
-#ifdef CONFIG_DEBUG_CREDENTIALS
- struct cred *cred = (struct cred *) _cred;
-
- atomic_add(n, &cred->subscribers);
-#endif
-}
-
/*
* The RCU callback to actually dispose of a set of credentials
*/
@@ -93,20 +44,9 @@ static void put_cred_rcu(struct rcu_head *rcu)
kdebug("put_cred_rcu(%p)", cred);
-#ifdef CONFIG_DEBUG_CREDENTIALS
- if (cred->magic != CRED_MAGIC_DEAD ||
- atomic_read(&cred->usage) != 0 ||
- read_cred_subscribers(cred) != 0)
- panic("CRED: put_cred_rcu() sees %p with"
- " mag %x, put %p, usage %d, subscr %d\n",
- cred, cred->magic, cred->put_addr,
- atomic_read(&cred->usage),
- read_cred_subscribers(cred));
-#else
- if (atomic_read(&cred->usage) != 0)
- panic("CRED: put_cred_rcu() sees %p with usage %d\n",
- cred, atomic_read(&cred->usage));
-#endif
+ if (atomic_long_read(&cred->usage) != 0)
+ panic("CRED: put_cred_rcu() sees %p with usage %ld\n",
+ cred, atomic_long_read(&cred->usage));
security_cred_free(cred);
key_put(cred->session_keyring);
@@ -116,6 +56,8 @@ static void put_cred_rcu(struct rcu_head *rcu)
if (cred->group_info)
put_group_info(cred->group_info);
free_uid(cred->user);
+ if (cred->ucounts)
+ put_ucounts(cred->ucounts);
put_user_ns(cred->user_ns);
kmem_cache_free(cred_jar, cred);
}
@@ -128,20 +70,17 @@ static void put_cred_rcu(struct rcu_head *rcu)
*/
void __put_cred(struct cred *cred)
{
- kdebug("__put_cred(%p{%d,%d})", cred,
- atomic_read(&cred->usage),
- read_cred_subscribers(cred));
-
- BUG_ON(atomic_read(&cred->usage) != 0);
-#ifdef CONFIG_DEBUG_CREDENTIALS
- BUG_ON(read_cred_subscribers(cred) != 0);
- cred->magic = CRED_MAGIC_DEAD;
- cred->put_addr = __builtin_return_address(0);
-#endif
+ kdebug("__put_cred(%p{%ld})", cred,
+ atomic_long_read(&cred->usage));
+
+ BUG_ON(atomic_long_read(&cred->usage) != 0);
BUG_ON(cred == current->cred);
BUG_ON(cred == current->real_cred);
- call_rcu(&cred->rcu, put_cred_rcu);
+ if (cred->non_rcu)
+ put_cred_rcu(&cred->rcu);
+ else
+ call_rcu(&cred->rcu, put_cred_rcu);
}
EXPORT_SYMBOL(__put_cred);
@@ -150,23 +89,28 @@ EXPORT_SYMBOL(__put_cred);
*/
void exit_creds(struct task_struct *tsk)
{
- struct cred *cred;
+ struct cred *real_cred, *cred;
- kdebug("exit_creds(%u,%p,%p,{%d,%d})", tsk->pid, tsk->real_cred, tsk->cred,
- atomic_read(&tsk->cred->usage),
- read_cred_subscribers(tsk->cred));
+ kdebug("exit_creds(%u,%p,%p,{%ld})", tsk->pid, tsk->real_cred, tsk->cred,
+ atomic_long_read(&tsk->cred->usage));
- cred = (struct cred *) tsk->real_cred;
+ real_cred = (struct cred *) tsk->real_cred;
tsk->real_cred = NULL;
- validate_creds(cred);
- alter_cred_subscribers(cred, -1);
- put_cred(cred);
cred = (struct cred *) tsk->cred;
tsk->cred = NULL;
- validate_creds(cred);
- alter_cred_subscribers(cred, -1);
- put_cred(cred);
+
+ if (real_cred == cred) {
+ put_cred_many(cred, 2);
+ } else {
+ put_cred(real_cred);
+ put_cred(cred);
+ }
+
+#ifdef CONFIG_KEYS_REQUEST_CACHE
+ key_put(tsk->cached_requested_key);
+ tsk->cached_requested_key = NULL;
+#endif
}
/**
@@ -188,11 +132,12 @@ const struct cred *get_task_cred(struct task_struct *task)
do {
cred = __task_cred((task));
BUG_ON(!cred);
- } while (!atomic_inc_not_zero(&((struct cred *)cred)->usage));
+ } while (!get_cred_rcu(cred));
rcu_read_unlock();
return cred;
}
+EXPORT_SYMBOL(get_task_cred);
/*
* Allocate blank credentials, such that the credentials can be filled in at a
@@ -206,12 +151,8 @@ struct cred *cred_alloc_blank(void)
if (!new)
return NULL;
- atomic_set(&new->usage, 1);
-#ifdef CONFIG_DEBUG_CREDENTIALS
- new->magic = CRED_MAGIC;
-#endif
-
- if (security_cred_alloc_blank(new, GFP_KERNEL) < 0)
+ atomic_long_set(&new->usage, 1);
+ if (security_cred_alloc_blank(new, GFP_KERNEL_ACCOUNT) < 0)
goto error;
return new;
@@ -241,8 +182,6 @@ struct cred *prepare_creds(void)
const struct cred *old;
struct cred *new;
- validate_process_creds();
-
new = kmem_cache_alloc(cred_jar, GFP_KERNEL);
if (!new)
return NULL;
@@ -252,8 +191,8 @@ struct cred *prepare_creds(void)
old = task->cred;
memcpy(new, old, sizeof(struct cred));
- atomic_set(&new->usage, 1);
- set_cred_subscribers(new, 0);
+ new->non_rcu = 0;
+ atomic_long_set(&new->usage, 1);
get_group_info(new->group_info);
get_uid(new->user);
get_user_ns(new->user_ns);
@@ -269,9 +208,13 @@ struct cred *prepare_creds(void)
new->security = NULL;
#endif
- if (security_prepare_creds(new, old, GFP_KERNEL) < 0)
+ new->ucounts = get_ucounts(new->ucounts);
+ if (!new->ucounts)
+ goto error;
+
+ if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0)
goto error;
- validate_creds(new);
+
return new;
error:
@@ -302,6 +245,9 @@ struct cred *prepare_exec_creds(void)
new->process_keyring = NULL;
#endif
+ new->suid = new->fsuid = new->euid;
+ new->sgid = new->fsgid = new->egid;
+
return new;
}
@@ -314,24 +260,26 @@ struct cred *prepare_exec_creds(void)
* The new process gets the current process's subjective credentials as its
* objective and subjective credentials
*/
-int copy_creds(struct task_struct *p, unsigned long clone_flags)
+int copy_creds(struct task_struct *p, u64 clone_flags)
{
struct cred *new;
int ret;
+#ifdef CONFIG_KEYS_REQUEST_CACHE
+ p->cached_requested_key = NULL;
+#endif
+
if (
#ifdef CONFIG_KEYS
!p->cred->thread_keyring &&
#endif
clone_flags & CLONE_THREAD
) {
- p->real_cred = get_cred(p->cred);
- get_cred(p->cred);
- alter_cred_subscribers(p->cred, 2);
- kdebug("share_creds(%p{%d,%d})",
- p->cred, atomic_read(&p->cred->usage),
- read_cred_subscribers(p->cred));
- atomic_inc(&p->cred->user->processes);
+ p->real_cred = get_cred_many(p->cred, 2);
+ kdebug("share_creds(%p{%ld})",
+ p->cred, atomic_long_read(&p->cred->usage));
+ inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
+ get_cred_namespaces(p);
return 0;
}
@@ -343,6 +291,9 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
ret = create_user_ns(new);
if (ret < 0)
goto error_put;
+ ret = set_cred_ucounts(new);
+ if (ret < 0)
+ goto error_put;
}
#ifdef CONFIG_KEYS
@@ -364,10 +315,10 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
}
#endif
- atomic_inc(&new->user->processes);
p->cred = p->real_cred = get_cred(new);
- alter_cred_subscribers(new, 2);
- validate_creds(new);
+ inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
+ get_cred_namespaces(p);
+
return 0;
error_put:
@@ -419,17 +370,11 @@ int commit_creds(struct cred *new)
struct task_struct *task = current;
const struct cred *old = task->real_cred;
- kdebug("commit_creds(%p{%d,%d})", new,
- atomic_read(&new->usage),
- read_cred_subscribers(new));
+ kdebug("commit_creds(%p{%ld})", new,
+ atomic_long_read(&new->usage));
BUG_ON(task->cred != old);
-#ifdef CONFIG_DEBUG_CREDENTIALS
- BUG_ON(read_cred_subscribers(old) < 2);
- validate_creds(old);
- validate_creds(new);
-#endif
- BUG_ON(atomic_read(&new->usage) < 1);
+ BUG_ON(atomic_long_read(&new->usage) < 1);
get_cred(new); /* we will require a ref for the subj creds too */
@@ -442,27 +387,37 @@ int commit_creds(struct cred *new)
if (task->mm)
set_dumpable(task->mm, suid_dumpable);
task->pdeath_signal = 0;
+ /*
+ * If a task drops privileges and becomes nondumpable,
+ * the dumpability change must become visible before
+ * the credential change; otherwise, a __ptrace_may_access()
+ * racing with this change may be able to attach to a task it
+ * shouldn't be able to attach to (as if the task had dropped
+ * privileges without becoming nondumpable).
+ * Pairs with a read barrier in __ptrace_may_access().
+ */
smp_wmb();
}
/* alter the thread keyring */
if (!uid_eq(new->fsuid, old->fsuid))
- key_fsuid_changed(task);
+ key_fsuid_changed(new);
if (!gid_eq(new->fsgid, old->fsgid))
- key_fsgid_changed(task);
+ key_fsgid_changed(new);
/* do it
* RLIMIT_NPROC limits on user->processes have already been checked
* in set_user().
*/
- alter_cred_subscribers(new, 2);
- if (new->user != old->user)
- atomic_inc(&new->user->processes);
+ if (new->user != old->user || new->user_ns != old->user_ns)
+ inc_rlimit_ucounts(new->ucounts, UCOUNT_RLIMIT_NPROC, 1);
+
rcu_assign_pointer(task->real_cred, new);
rcu_assign_pointer(task->cred, new);
- if (new->user != old->user)
- atomic_dec(&old->user->processes);
- alter_cred_subscribers(old, -2);
+ if (new->user != old->user || new->user_ns != old->user_ns)
+ dec_rlimit_ucounts(old->ucounts, UCOUNT_RLIMIT_NPROC, 1);
+ if (new->user_ns != old->user_ns)
+ switch_cred_namespaces(old, new);
/* send notifications */
if (!uid_eq(new->uid, old->uid) ||
@@ -478,8 +433,7 @@ int commit_creds(struct cred *new)
proc_id_connector(task, PROC_EVENT_GID);
/* release the old obj and subj refs both */
- put_cred(old);
- put_cred(old);
+ put_cred_many(old, 2);
return 0;
}
EXPORT_SYMBOL(commit_creds);
@@ -493,70 +447,87 @@ EXPORT_SYMBOL(commit_creds);
*/
void abort_creds(struct cred *new)
{
- kdebug("abort_creds(%p{%d,%d})", new,
- atomic_read(&new->usage),
- read_cred_subscribers(new));
+ kdebug("abort_creds(%p{%ld})", new,
+ atomic_long_read(&new->usage));
-#ifdef CONFIG_DEBUG_CREDENTIALS
- BUG_ON(read_cred_subscribers(new) != 0);
-#endif
- BUG_ON(atomic_read(&new->usage) < 1);
+ BUG_ON(atomic_long_read(&new->usage) < 1);
put_cred(new);
}
EXPORT_SYMBOL(abort_creds);
/**
- * override_creds - Override the current process's subjective credentials
- * @new: The credentials to be assigned
+ * cred_fscmp - Compare two credentials with respect to filesystem access.
+ * @a: The first credential
+ * @b: The second credential
*
- * Install a set of temporary override subjective credentials on the current
- * process, returning the old set for later reversion.
+ * cred_cmp() will return zero if both credentials have the same
+ * fsuid, fsgid, and supplementary groups. That is, if they will both
+ * provide the same access to files based on mode/uid/gid.
+ * If the credentials are different, then either -1 or 1 will
+ * be returned depending on whether @a comes before or after @b
+ * respectively in an arbitrary, but stable, ordering of credentials.
+ *
+ * Return: -1, 0, or 1 depending on comparison
*/
-const struct cred *override_creds(const struct cred *new)
+int cred_fscmp(const struct cred *a, const struct cred *b)
{
- const struct cred *old = current->cred;
-
- kdebug("override_creds(%p{%d,%d})", new,
- atomic_read(&new->usage),
- read_cred_subscribers(new));
-
- validate_creds(old);
- validate_creds(new);
- get_cred(new);
- alter_cred_subscribers(new, 1);
- rcu_assign_pointer(current->cred, new);
- alter_cred_subscribers(old, -1);
-
- kdebug("override_creds() = %p{%d,%d}", old,
- atomic_read(&old->usage),
- read_cred_subscribers(old));
- return old;
+ struct group_info *ga, *gb;
+ int g;
+
+ if (a == b)
+ return 0;
+ if (uid_lt(a->fsuid, b->fsuid))
+ return -1;
+ if (uid_gt(a->fsuid, b->fsuid))
+ return 1;
+
+ if (gid_lt(a->fsgid, b->fsgid))
+ return -1;
+ if (gid_gt(a->fsgid, b->fsgid))
+ return 1;
+
+ ga = a->group_info;
+ gb = b->group_info;
+ if (ga == gb)
+ return 0;
+ if (ga == NULL)
+ return -1;
+ if (gb == NULL)
+ return 1;
+ if (ga->ngroups < gb->ngroups)
+ return -1;
+ if (ga->ngroups > gb->ngroups)
+ return 1;
+
+ for (g = 0; g < ga->ngroups; g++) {
+ if (gid_lt(ga->gid[g], gb->gid[g]))
+ return -1;
+ if (gid_gt(ga->gid[g], gb->gid[g]))
+ return 1;
+ }
+ return 0;
}
-EXPORT_SYMBOL(override_creds);
+EXPORT_SYMBOL(cred_fscmp);
-/**
- * revert_creds - Revert a temporary subjective credentials override
- * @old: The credentials to be restored
- *
- * Revert a temporary set of override subjective credentials to an old set,
- * discarding the override set.
- */
-void revert_creds(const struct cred *old)
+int set_cred_ucounts(struct cred *new)
{
- const struct cred *override = current->cred;
-
- kdebug("revert_creds(%p{%d,%d})", old,
- atomic_read(&old->usage),
- read_cred_subscribers(old));
-
- validate_creds(old);
- validate_creds(override);
- alter_cred_subscribers(old, 1);
- rcu_assign_pointer(current->cred, old);
- alter_cred_subscribers(override, -1);
- put_cred(override);
+ struct ucounts *new_ucounts, *old_ucounts = new->ucounts;
+
+ /*
+ * This optimization is needed because alloc_ucounts() uses locks
+ * for table lookups.
+ */
+ if (old_ucounts->ns == new->user_ns && uid_eq(old_ucounts->uid, new->uid))
+ return 0;
+
+ if (!(new_ucounts = alloc_ucounts(new->user_ns, new->uid)))
+ return -EAGAIN;
+
+ new->ucounts = new_ucounts;
+ put_ucounts(old_ucounts);
+
+ return 0;
}
-EXPORT_SYMBOL(revert_creds);
/*
* initialise the credentials stuff
@@ -564,8 +535,8 @@ EXPORT_SYMBOL(revert_creds);
void __init cred_init(void)
{
/* allocate a slab in which we can store credentials */
- cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred),
- 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+ cred_jar = KMEM_CACHE(cred,
+ SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
}
/**
@@ -576,37 +547,33 @@ void __init cred_init(void)
* override a task's own credentials so that work can be done on behalf of that
* task that requires a different subjective context.
*
- * @daemon is used to provide a base for the security record, but can be NULL.
- * If @daemon is supplied, then the security data will be derived from that;
- * otherwise they'll be set to 0 and no groups, full capabilities and no keys.
+ * @daemon is used to provide a base cred, with the security data derived from
+ * that; if this is "&init_task", they'll be set to 0, no groups, full
+ * capabilities, and no keys.
*
* The caller may change these controls afterwards if desired.
*
* Returns the new credentials or NULL if out of memory.
- *
- * Does not take, and does not return holding current->cred_replace_mutex.
*/
struct cred *prepare_kernel_cred(struct task_struct *daemon)
{
const struct cred *old;
struct cred *new;
+ if (WARN_ON_ONCE(!daemon))
+ return NULL;
+
new = kmem_cache_alloc(cred_jar, GFP_KERNEL);
if (!new)
return NULL;
kdebug("prepare_kernel_cred() alloc %p", new);
- if (daemon)
- old = get_task_cred(daemon);
- else
- old = get_cred(&init_cred);
-
- validate_creds(old);
+ old = get_task_cred(daemon);
*new = *old;
- atomic_set(&new->usage, 1);
- set_cred_subscribers(new, 0);
+ new->non_rcu = 0;
+ atomic_long_set(&new->usage, 1);
get_uid(new->user);
get_user_ns(new->user_ns);
get_group_info(new->group_info);
@@ -622,11 +589,14 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
#ifdef CONFIG_SECURITY
new->security = NULL;
#endif
- if (security_prepare_creds(new, old, GFP_KERNEL) < 0)
+ new->ucounts = get_ucounts(new->ucounts);
+ if (!new->ucounts)
+ goto error;
+
+ if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0)
goto error;
put_cred(old);
- validate_creds(new);
return new;
error:
@@ -684,127 +654,10 @@ EXPORT_SYMBOL(set_security_override_from_ctx);
*/
int set_create_files_as(struct cred *new, struct inode *inode)
{
+ if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid))
+ return -EINVAL;
new->fsuid = inode->i_uid;
new->fsgid = inode->i_gid;
return security_kernel_create_files_as(new, inode);
}
EXPORT_SYMBOL(set_create_files_as);
-
-#ifdef CONFIG_DEBUG_CREDENTIALS
-
-bool creds_are_invalid(const struct cred *cred)
-{
- if (cred->magic != CRED_MAGIC)
- return true;
-#ifdef CONFIG_SECURITY_SELINUX
- /*
- * cred->security == NULL if security_cred_alloc_blank() or
- * security_prepare_creds() returned an error.
- */
- if (selinux_is_enabled() && cred->security) {
- if ((unsigned long) cred->security < PAGE_SIZE)
- return true;
- if ((*(u32 *)cred->security & 0xffffff00) ==
- (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8))
- return true;
- }
-#endif
- return false;
-}
-EXPORT_SYMBOL(creds_are_invalid);
-
-/*
- * dump invalid credentials
- */
-static void dump_invalid_creds(const struct cred *cred, const char *label,
- const struct task_struct *tsk)
-{
- printk(KERN_ERR "CRED: %s credentials: %p %s%s%s\n",
- label, cred,
- cred == &init_cred ? "[init]" : "",
- cred == tsk->real_cred ? "[real]" : "",
- cred == tsk->cred ? "[eff]" : "");
- printk(KERN_ERR "CRED: ->magic=%x, put_addr=%p\n",
- cred->magic, cred->put_addr);
- printk(KERN_ERR "CRED: ->usage=%d, subscr=%d\n",
- atomic_read(&cred->usage),
- read_cred_subscribers(cred));
- printk(KERN_ERR "CRED: ->*uid = { %d,%d,%d,%d }\n",
- from_kuid_munged(&init_user_ns, cred->uid),
- from_kuid_munged(&init_user_ns, cred->euid),
- from_kuid_munged(&init_user_ns, cred->suid),
- from_kuid_munged(&init_user_ns, cred->fsuid));
- printk(KERN_ERR "CRED: ->*gid = { %d,%d,%d,%d }\n",
- from_kgid_munged(&init_user_ns, cred->gid),
- from_kgid_munged(&init_user_ns, cred->egid),
- from_kgid_munged(&init_user_ns, cred->sgid),
- from_kgid_munged(&init_user_ns, cred->fsgid));
-#ifdef CONFIG_SECURITY
- printk(KERN_ERR "CRED: ->security is %p\n", cred->security);
- if ((unsigned long) cred->security >= PAGE_SIZE &&
- (((unsigned long) cred->security & 0xffffff00) !=
- (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8)))
- printk(KERN_ERR "CRED: ->security {%x, %x}\n",
- ((u32*)cred->security)[0],
- ((u32*)cred->security)[1]);
-#endif
-}
-
-/*
- * report use of invalid credentials
- */
-void __invalid_creds(const struct cred *cred, const char *file, unsigned line)
-{
- printk(KERN_ERR "CRED: Invalid credentials\n");
- printk(KERN_ERR "CRED: At %s:%u\n", file, line);
- dump_invalid_creds(cred, "Specified", current);
- BUG();
-}
-EXPORT_SYMBOL(__invalid_creds);
-
-/*
- * check the credentials on a process
- */
-void __validate_process_creds(struct task_struct *tsk,
- const char *file, unsigned line)
-{
- if (tsk->cred == tsk->real_cred) {
- if (unlikely(read_cred_subscribers(tsk->cred) < 2 ||
- creds_are_invalid(tsk->cred)))
- goto invalid_creds;
- } else {
- if (unlikely(read_cred_subscribers(tsk->real_cred) < 1 ||
- read_cred_subscribers(tsk->cred) < 1 ||
- creds_are_invalid(tsk->real_cred) ||
- creds_are_invalid(tsk->cred)))
- goto invalid_creds;
- }
- return;
-
-invalid_creds:
- printk(KERN_ERR "CRED: Invalid process credentials\n");
- printk(KERN_ERR "CRED: At %s:%u\n", file, line);
-
- dump_invalid_creds(tsk->real_cred, "Real", tsk);
- if (tsk->cred != tsk->real_cred)
- dump_invalid_creds(tsk->cred, "Effective", tsk);
- else
- printk(KERN_ERR "CRED: Effective creds == Real creds\n");
- BUG();
-}
-EXPORT_SYMBOL(__validate_process_creds);
-
-/*
- * check creds for do_exit()
- */
-void validate_creds_for_do_exit(struct task_struct *tsk)
-{
- kdebug("validate_creds_for_do_exit(%p,%p{%d,%d})",
- tsk->real_cred, tsk->cred,
- atomic_read(&tsk->cred->usage),
- read_cred_subscribers(tsk->cred));
-
- __validate_process_creds(tsk, __FILE__, __LINE__);
-}
-
-#endif /* CONFIG_DEBUG_CREDENTIALS */
diff --git a/kernel/debug/Makefile b/kernel/debug/Makefile
index a85edc339985..332ee6c6ec2c 100644
--- a/kernel/debug/Makefile
+++ b/kernel/debug/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# Makefile for the linux kernel debugger
#
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 0874e2edd275..0b9495187fba 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Kernel Debug Core
*
@@ -22,10 +23,6 @@
*
* Original KGDB stub: David Grothe <dave@gcom.com>,
* Tigran Aivazian <tigran@sco.com>
- *
- * This file is licensed under the terms of the GNU General Public License
- * version 2. This program is licensed "as is" without any warranty of any
- * kind, whether express or implied.
*/
#define pr_fmt(fmt) "KGDB: " fmt
@@ -49,11 +46,13 @@
#include <linux/init.h>
#include <linux/kgdb.h>
#include <linux/kdb.h>
+#include <linux/nmi.h>
#include <linux/pid.h>
#include <linux/smp.h>
#include <linux/mm.h>
-#include <linux/vmacache.h>
#include <linux/rcupdate.h>
+#include <linux/irq.h>
+#include <linux/security.h>
#include <asm/cacheflush.h>
#include <asm/byteorder.h>
@@ -65,9 +64,7 @@ static int kgdb_break_asap;
struct debuggerinfo_struct kgdb_info[NR_CPUS];
-/**
- * kgdb_connected - Is a host GDB connected to us?
- */
+/* kgdb_connected - Is a host GDB connected to us? */
int kgdb_connected;
EXPORT_SYMBOL_GPL(kgdb_connected);
@@ -80,7 +77,7 @@ static int exception_level;
struct kgdb_io *dbg_io_ops;
static DEFINE_SPINLOCK(kgdb_registration_lock);
-/* Action for the reboot notifiter, a global allow kdb to change it */
+/* Action for the reboot notifier, a global allow kdb to change it */
static int kgdbreboot;
/* kgdb console driver is loaded */
static int kgdb_con_registered;
@@ -94,14 +91,6 @@ int dbg_switch_cpu;
/* Use kdb or gdbserver mode */
int dbg_kdb_mode = 1;
-static int __init opt_kgdb_con(char *str)
-{
- kgdb_use_con = 1;
- return 0;
-}
-
-early_param("kgdbcon", opt_kgdb_con);
-
module_param(kgdb_use_con, int, 0644);
module_param(kgdbreboot, int, 0644);
@@ -127,7 +116,6 @@ static DEFINE_RAW_SPINLOCK(dbg_slave_lock);
*/
static atomic_t masters_in_kgdb;
static atomic_t slaves_in_kgdb;
-static atomic_t kgdb_break_tasklet_var;
atomic_t kgdb_setting_breakpoint;
struct task_struct *kgdb_usethread;
@@ -163,31 +151,37 @@ early_param("nokgdbroundup", opt_nokgdbroundup);
/*
* Weak aliases for breakpoint management,
- * can be overriden by architectures when needed:
+ * can be overridden by architectures when needed:
*/
int __weak kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
{
int err;
- err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr,
+ err = copy_from_kernel_nofault(bpt->saved_instr, (char *)bpt->bpt_addr,
BREAK_INSTR_SIZE);
if (err)
return err;
- err = probe_kernel_write((char *)bpt->bpt_addr,
+ err = copy_to_kernel_nofault((char *)bpt->bpt_addr,
arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE);
return err;
}
+NOKPROBE_SYMBOL(kgdb_arch_set_breakpoint);
int __weak kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt)
{
- return probe_kernel_write((char *)bpt->bpt_addr,
+ return copy_to_kernel_nofault((char *)bpt->bpt_addr,
(char *)bpt->saved_instr, BREAK_INSTR_SIZE);
}
+NOKPROBE_SYMBOL(kgdb_arch_remove_breakpoint);
int __weak kgdb_validate_break_address(unsigned long addr)
{
struct kgdb_bkpt tmp;
int err;
+
+ if (kgdb_within_blocklist(addr))
+ return -EINVAL;
+
/* Validate setting the breakpoint and then removing it. If the
* remove fails, the kernel needs to emit a bad message because we
* are deep trouble not being able to put things back the way we
@@ -208,6 +202,7 @@ unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs)
{
return instruction_pointer(regs);
}
+NOKPROBE_SYMBOL(kgdb_arch_pc);
int __weak kgdb_arch_init(void)
{
@@ -218,6 +213,65 @@ int __weak kgdb_skipexception(int exception, struct pt_regs *regs)
{
return 0;
}
+NOKPROBE_SYMBOL(kgdb_skipexception);
+
+#ifdef CONFIG_SMP
+
+/*
+ * Default (weak) implementation for kgdb_roundup_cpus
+ */
+
+void __weak kgdb_call_nmi_hook(void *ignored)
+{
+ /*
+ * NOTE: get_irq_regs() is supposed to get the registers from
+ * before the IPI interrupt happened and so is supposed to
+ * show where the processor was. In some situations it's
+ * possible we might be called without an IPI, so it might be
+ * safer to figure out how to make kgdb_breakpoint() work
+ * properly here.
+ */
+ kgdb_nmicallback(raw_smp_processor_id(), get_irq_regs());
+}
+NOKPROBE_SYMBOL(kgdb_call_nmi_hook);
+
+static DEFINE_PER_CPU(call_single_data_t, kgdb_roundup_csd) =
+ CSD_INIT(kgdb_call_nmi_hook, NULL);
+
+void __weak kgdb_roundup_cpus(void)
+{
+ call_single_data_t *csd;
+ int this_cpu = raw_smp_processor_id();
+ int cpu;
+ int ret;
+
+ for_each_online_cpu(cpu) {
+ /* No need to roundup ourselves */
+ if (cpu == this_cpu)
+ continue;
+
+ csd = &per_cpu(kgdb_roundup_csd, cpu);
+
+ /*
+ * If it didn't round up last time, don't try again
+ * since smp_call_function_single_async() will block.
+ *
+ * If rounding_up is false then we know that the
+ * previous call must have at least started and that
+ * means smp_call_function_single_async() won't block.
+ */
+ if (kgdb_info[cpu].rounding_up)
+ continue;
+ kgdb_info[cpu].rounding_up = true;
+
+ ret = smp_call_function_single_async(cpu, csd);
+ if (ret)
+ kgdb_info[cpu].rounding_up = false;
+ }
+}
+NOKPROBE_SYMBOL(kgdb_roundup_cpus);
+
+#endif
/*
* Some architectures need cache flushes when we set/clear a
@@ -228,20 +282,10 @@ static void kgdb_flush_swbreak_addr(unsigned long addr)
if (!CACHE_FLUSH_IS_SAFE)
return;
- if (current->mm) {
- int i;
-
- for (i = 0; i < VMACACHE_SIZE; i++) {
- if (!current->vmacache[i])
- continue;
- flush_cache_range(current->vmacache[i],
- addr, addr + BREAK_INSTR_SIZE);
- }
- }
-
/* Force flush instruction cache if it was outside the mm */
flush_icache_range(addr, addr + BREAK_INSTR_SIZE);
}
+NOKPROBE_SYMBOL(kgdb_flush_swbreak_addr);
/*
* SW breakpoint management:
@@ -269,6 +313,7 @@ int dbg_activate_sw_breakpoints(void)
}
return ret;
}
+NOKPROBE_SYMBOL(dbg_activate_sw_breakpoints);
int dbg_set_sw_break(unsigned long addr)
{
@@ -332,6 +377,7 @@ int dbg_deactivate_sw_breakpoints(void)
}
return ret;
}
+NOKPROBE_SYMBOL(dbg_deactivate_sw_breakpoints);
int dbg_remove_sw_break(unsigned long addr)
{
@@ -359,6 +405,18 @@ int kgdb_isremovedbreak(unsigned long addr)
return 0;
}
+int kgdb_has_hit_break(unsigned long addr)
+{
+ int i;
+
+ for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
+ if (kgdb_break[i].state == BP_ACTIVE &&
+ kgdb_break[i].bpt_addr == addr)
+ return 1;
+ }
+ return 0;
+}
+
int dbg_remove_all_break(void)
{
int error;
@@ -383,6 +441,48 @@ setundefined:
return 0;
}
+void kgdb_free_init_mem(void)
+{
+ int i;
+
+ /* Clear init memory breakpoints. */
+ for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
+ if (init_section_contains((void *)kgdb_break[i].bpt_addr, 0))
+ kgdb_break[i].state = BP_UNDEFINED;
+ }
+}
+
+#ifdef CONFIG_KGDB_KDB
+void kdb_dump_stack_on_cpu(int cpu)
+{
+ if (cpu == raw_smp_processor_id() || !IS_ENABLED(CONFIG_SMP)) {
+ dump_stack();
+ return;
+ }
+
+ if (!(kgdb_info[cpu].exception_state & DCPU_IS_SLAVE)) {
+ kdb_printf("ERROR: Task on cpu %d didn't stop in the debugger\n",
+ cpu);
+ return;
+ }
+
+ /*
+ * In general, architectures don't support dumping the stack of a
+ * "running" process that's not the current one. From the point of
+ * view of the Linux, kernel processes that are looping in the kgdb
+ * slave loop are still "running". There's also no API (that actually
+ * works across all architectures) that can do a stack crawl based
+ * on registers passed as a parameter.
+ *
+ * Solve this conundrum by asking slave CPUs to do the backtrace
+ * themselves.
+ */
+ kgdb_info[cpu].exception_state |= DCPU_WANT_BT;
+ while (kgdb_info[cpu].exception_state & DCPU_WANT_BT)
+ cpu_relax();
+}
+#endif
+
/*
* Return true if there is a valid kgdb I/O module. Also if no
* debugger is attached a message can be printed to the console about
@@ -410,6 +510,7 @@ static int kgdb_io_ready(int print_wait)
}
return 1;
}
+NOKPROBE_SYMBOL(kgdb_io_ready);
static int kgdb_reenter_check(struct kgdb_state *ks)
{
@@ -443,6 +544,7 @@ static int kgdb_reenter_check(struct kgdb_state *ks)
if (exception_level > 1) {
dump_stack();
+ kgdb_io_module_registered = false;
panic("Recursive entry to debugger");
}
@@ -456,6 +558,7 @@ static int kgdb_reenter_check(struct kgdb_state *ks)
return 1;
}
+NOKPROBE_SYMBOL(kgdb_reenter_check);
static void dbg_touch_watchdogs(void)
{
@@ -463,6 +566,7 @@ static void dbg_touch_watchdogs(void)
clocksource_touch_watchdog();
rcu_cpu_stall_reset();
}
+NOKPROBE_SYMBOL(dbg_touch_watchdogs);
static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs,
int exception_state)
@@ -487,6 +591,7 @@ static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs,
arch_kgdb_ops.disable_hw_break(regs);
acquirelock:
+ rcu_read_lock();
/*
* Interrupts will be restored by the 'trap return' code, except when
* single stepping.
@@ -522,6 +627,9 @@ cpu_loop:
atomic_xchg(&kgdb_active, cpu);
break;
}
+ } else if (kgdb_info[cpu].exception_state & DCPU_WANT_BT) {
+ dump_stack();
+ kgdb_info[cpu].exception_state &= ~DCPU_WANT_BT;
} else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) {
if (!raw_spin_is_locked(&dbg_slave_lock))
goto return_normal;
@@ -534,6 +642,8 @@ return_normal:
arch_kgdb_ops.correct_hw_break();
if (trace_on)
tracing_on();
+ kgdb_info[cpu].debuggerinfo = NULL;
+ kgdb_info[cpu].task = NULL;
kgdb_info[cpu].exception_state &=
~(DCPU_WANT_MASTER | DCPU_IS_SLAVE);
kgdb_info[cpu].enter_kgdb--;
@@ -541,6 +651,7 @@ return_normal:
atomic_dec(&slaves_in_kgdb);
dbg_touch_watchdogs();
local_irq_restore(flags);
+ rcu_read_unlock();
return 0;
}
cpu_relax();
@@ -559,6 +670,7 @@ return_normal:
raw_spin_unlock(&dbg_master_lock);
dbg_touch_watchdogs();
local_irq_restore(flags);
+ rcu_read_unlock();
goto acquirelock;
}
@@ -574,6 +686,8 @@ return_normal:
if (kgdb_skipexception(ks->ex_vector, ks->linux_regs))
goto kgdb_restore;
+ atomic_inc(&ignore_console_lock_warning);
+
/* Call the I/O driver's pre_exception routine */
if (dbg_io_ops->pre_exception)
dbg_io_ops->pre_exception();
@@ -592,17 +706,17 @@ return_normal:
/* Signal the other CPUs to enter kgdb_wait() */
else if ((!kgdb_single_step) && kgdb_do_roundup)
- kgdb_roundup_cpus(flags);
+ kgdb_roundup_cpus();
#endif
/*
* Wait for the other CPUs to be notified and be waiting for us:
*/
- time_left = loops_per_jiffy * HZ;
+ time_left = MSEC_PER_SEC;
while (kgdb_do_roundup && --time_left &&
(atomic_read(&masters_in_kgdb) + atomic_read(&slaves_in_kgdb)) !=
online_cpus)
- cpu_relax();
+ udelay(1000);
if (!time_left)
pr_crit("Timed out waiting for secondary CPUs.\n");
@@ -627,6 +741,29 @@ cpu_master_loop:
continue;
kgdb_connected = 0;
} else {
+ /*
+ * This is a brutal way to interfere with the debugger
+ * and prevent gdb being used to poke at kernel memory.
+ * This could cause trouble if lockdown is applied when
+ * there is already an active gdb session. For now the
+ * answer is simply "don't do that". Typically lockdown
+ * *will* be applied before the debug core gets started
+ * so only developers using kgdb for fairly advanced
+ * early kernel debug can be biten by this. Hopefully
+ * they are sophisticated enough to take care of
+ * themselves, especially with help from the lockdown
+ * message printed on the console!
+ */
+ if (security_locked_down(LOCKDOWN_DBG_WRITE_KERNEL)) {
+ if (IS_ENABLED(CONFIG_KGDB_KDB)) {
+ /* Switch back to kdb if possible... */
+ dbg_kdb_mode = 1;
+ continue;
+ } else {
+ /* ... otherwise just bail */
+ break;
+ }
+ }
error = gdb_serial_stub(ks);
}
@@ -642,10 +779,14 @@ cpu_master_loop:
}
}
+ dbg_activate_sw_breakpoints();
+
/* Call the I/O driver's post_exception routine */
if (dbg_io_ops->post_exception)
dbg_io_ops->post_exception();
+ atomic_dec(&ignore_console_lock_warning);
+
if (!kgdb_single_step) {
raw_spin_unlock(&dbg_slave_lock);
/* Wait till all the CPUs have quit from the debugger. */
@@ -666,6 +807,8 @@ kgdb_restore:
if (trace_on)
tracing_on();
+ kgdb_info[cpu].debuggerinfo = NULL;
+ kgdb_info[cpu].task = NULL;
kgdb_info[cpu].exception_state &=
~(DCPU_WANT_MASTER | DCPU_IS_SLAVE);
kgdb_info[cpu].enter_kgdb--;
@@ -676,9 +819,11 @@ kgdb_restore:
raw_spin_unlock(&dbg_master_lock);
dbg_touch_watchdogs();
local_irq_restore(flags);
+ rcu_read_unlock();
return kgdb_info[cpu].ret_state;
}
+NOKPROBE_SYMBOL(kgdb_cpu_enter);
/*
* kgdb_handle_exception() - main entry point from a kernel exception
@@ -692,10 +837,6 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
{
struct kgdb_state kgdb_var;
struct kgdb_state *ks = &kgdb_var;
- int ret = 0;
-
- if (arch_kgdb_ops.enable_nmi)
- arch_kgdb_ops.enable_nmi(0);
/*
* Avoid entering the debugger if we were triggered due to an oops
* but panic_timeout indicates the system should automatically
@@ -713,23 +854,17 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
ks->linux_regs = regs;
if (kgdb_reenter_check(ks))
- goto out; /* Ouch, double exception ! */
+ return 0; /* Ouch, double exception ! */
if (kgdb_info[ks->cpu].enter_kgdb != 0)
- goto out;
+ return 0;
- ret = kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER);
-out:
- if (arch_kgdb_ops.enable_nmi)
- arch_kgdb_ops.enable_nmi(1);
- return ret;
+ return kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER);
}
+NOKPROBE_SYMBOL(kgdb_handle_exception);
/*
- * GDB places a breakpoint at this function to know dynamically
- * loaded objects. It's not defined static so that only one instance with this
- * name exists in the kernel.
+ * GDB places a breakpoint at this function to know dynamically loaded objects.
*/
-
static int module_event(struct notifier_block *self, unsigned long val,
void *data)
{
@@ -746,6 +881,8 @@ int kgdb_nmicallback(int cpu, void *regs)
struct kgdb_state kgdb_var;
struct kgdb_state *ks = &kgdb_var;
+ kgdb_info[cpu].rounding_up = false;
+
memset(ks, 0, sizeof(struct kgdb_state));
ks->cpu = cpu;
ks->linux_regs = regs;
@@ -758,6 +895,7 @@ int kgdb_nmicallback(int cpu, void *regs)
#endif
return 1;
}
+NOKPROBE_SYMBOL(kgdb_nmicallback);
int kgdb_nmicallin(int cpu, int trapnr, void *regs, int err_code,
atomic_t *send_ready)
@@ -783,6 +921,7 @@ int kgdb_nmicallin(int cpu, int trapnr, void *regs, int err_code,
#endif
return 1;
}
+NOKPROBE_SYMBOL(kgdb_nmicallin);
static void kgdb_console_write(struct console *co, const char *s,
unsigned count)
@@ -806,8 +945,22 @@ static struct console kgdbcons = {
.index = -1,
};
+static int __init opt_kgdb_con(char *str)
+{
+ kgdb_use_con = 1;
+
+ if (kgdb_io_module_registered && !kgdb_con_registered) {
+ register_console(&kgdbcons);
+ kgdb_con_registered = 1;
+ }
+
+ return 0;
+}
+
+early_param("kgdbcon", opt_kgdb_con);
+
#ifdef CONFIG_MAGIC_SYSRQ
-static void sysrq_handle_dbg(int key)
+static void sysrq_handle_dbg(u8 key)
{
if (!dbg_io_ops) {
pr_crit("ERROR: No KGDB I/O module available\n");
@@ -825,36 +978,42 @@ static void sysrq_handle_dbg(int key)
kgdb_breakpoint();
}
-static struct sysrq_key_op sysrq_dbg_op = {
+static const struct sysrq_key_op sysrq_dbg_op = {
.handler = sysrq_handle_dbg,
.help_msg = "debug(g)",
.action_msg = "DEBUG",
};
#endif
-static int kgdb_panic_event(struct notifier_block *self,
- unsigned long val,
- void *data)
+void kgdb_panic(const char *msg)
{
+ if (!kgdb_io_module_registered)
+ return;
+
/*
- * Avoid entering the debugger if we were triggered due to a panic
- * We don't want to get stuck waiting for input from user in such case.
- * panic_timeout indicates the system should automatically
+ * We don't want to get stuck waiting for input from user if
+ * "panic_timeout" indicates the system should automatically
* reboot on panic.
*/
if (panic_timeout)
- return NOTIFY_DONE;
+ return;
+
+ debug_locks_off();
+ console_flush_on_panic(CONSOLE_FLUSH_PENDING);
if (dbg_kdb_mode)
- kdb_printf("PANIC: %s\n", (char *)data);
+ kdb_printf("PANIC: %s\n", msg);
+
kgdb_breakpoint();
- return NOTIFY_DONE;
}
-static struct notifier_block kgdb_panic_event_nb = {
- .notifier_call = kgdb_panic_event,
- .priority = INT_MAX,
-};
+static void kgdb_initial_breakpoint(void)
+{
+ kgdb_break_asap = 0;
+
+ pr_crit("Waiting for connection from remote gdb...\n");
+ kgdb_breakpoint();
+}
void __weak kgdb_arch_late(void)
{
@@ -866,6 +1025,9 @@ void __init dbg_late_init(void)
if (kgdb_io_module_registered)
kgdb_arch_late();
kdb_init(KDB_INIT_FULL);
+
+ if (kgdb_io_module_registered && kgdb_break_asap)
+ kgdb_initial_breakpoint();
}
static int
@@ -874,12 +1036,13 @@ dbg_notify_reboot(struct notifier_block *this, unsigned long code, void *x)
/*
* Take the following action on reboot notify depending on value:
* 1 == Enter debugger
- * 0 == [the default] detatch debug client
+ * 0 == [the default] detach debug client
* -1 == Do nothing... and use this until the board resets
*/
switch (kgdbreboot) {
case 1:
kgdb_breakpoint();
+ goto done;
case -1:
goto done;
}
@@ -904,8 +1067,6 @@ static void kgdb_register_callbacks(void)
kgdb_arch_late();
register_module_notifier(&dbg_module_load_nb);
register_reboot_notifier(&dbg_reboot_notifier);
- atomic_notifier_chain_register(&panic_notifier_list,
- &kgdb_panic_event_nb);
#ifdef CONFIG_MAGIC_SYSRQ
register_sysrq_key('g', &sysrq_dbg_op);
#endif
@@ -919,16 +1080,14 @@ static void kgdb_register_callbacks(void)
static void kgdb_unregister_callbacks(void)
{
/*
- * When this routine is called KGDB should unregister from the
- * panic handler and clean up, making sure it is not handling any
+ * When this routine is called KGDB should unregister from
+ * handlers and clean up, making sure it is not handling any
* break exceptions at the time.
*/
if (kgdb_io_module_registered) {
kgdb_io_module_registered = 0;
unregister_reboot_notifier(&dbg_reboot_notifier);
unregister_module_notifier(&dbg_module_load_nb);
- atomic_notifier_chain_unregister(&panic_notifier_list,
- &kgdb_panic_event_nb);
kgdb_arch_exit();
#ifdef CONFIG_MAGIC_SYSRQ
unregister_sysrq_key('g', &sysrq_dbg_op);
@@ -940,39 +1099,6 @@ static void kgdb_unregister_callbacks(void)
}
}
-/*
- * There are times a tasklet needs to be used vs a compiled in
- * break point so as to cause an exception outside a kgdb I/O module,
- * such as is the case with kgdboe, where calling a breakpoint in the
- * I/O driver itself would be fatal.
- */
-static void kgdb_tasklet_bpt(unsigned long ing)
-{
- kgdb_breakpoint();
- atomic_set(&kgdb_break_tasklet_var, 0);
-}
-
-static DECLARE_TASKLET(kgdb_tasklet_breakpoint, kgdb_tasklet_bpt, 0);
-
-void kgdb_schedule_breakpoint(void)
-{
- if (atomic_read(&kgdb_break_tasklet_var) ||
- atomic_read(&kgdb_active) != -1 ||
- atomic_read(&kgdb_setting_breakpoint))
- return;
- atomic_inc(&kgdb_break_tasklet_var);
- tasklet_schedule(&kgdb_tasklet_breakpoint);
-}
-EXPORT_SYMBOL_GPL(kgdb_schedule_breakpoint);
-
-static void kgdb_initial_breakpoint(void)
-{
- kgdb_break_asap = 0;
-
- pr_crit("Waiting for connection from remote gdb...\n");
- kgdb_breakpoint();
-}
-
/**
* kgdb_register_io_module - register KGDB IO module
* @new_dbg_io_ops: the io ops vector
@@ -981,15 +1107,22 @@ static void kgdb_initial_breakpoint(void)
*/
int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops)
{
+ struct kgdb_io *old_dbg_io_ops;
int err;
spin_lock(&kgdb_registration_lock);
- if (dbg_io_ops) {
- spin_unlock(&kgdb_registration_lock);
+ old_dbg_io_ops = dbg_io_ops;
+ if (old_dbg_io_ops) {
+ if (!old_dbg_io_ops->deinit) {
+ spin_unlock(&kgdb_registration_lock);
- pr_err("Another I/O driver is already registered with KGDB\n");
- return -EBUSY;
+ pr_err("KGDB I/O driver %s can't replace %s.\n",
+ new_dbg_io_ops->name, old_dbg_io_ops->name);
+ return -EBUSY;
+ }
+ pr_info("Replacing I/O driver %s with %s\n",
+ old_dbg_io_ops->name, new_dbg_io_ops->name);
}
if (new_dbg_io_ops->init) {
@@ -1004,12 +1137,18 @@ int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops)
spin_unlock(&kgdb_registration_lock);
+ if (old_dbg_io_ops) {
+ old_dbg_io_ops->deinit();
+ return 0;
+ }
+
pr_info("Registered I/O driver %s\n", new_dbg_io_ops->name);
/* Arm KGDB now. */
kgdb_register_callbacks();
- if (kgdb_break_asap)
+ if (kgdb_break_asap &&
+ (!dbg_is_early || IS_ENABLED(CONFIG_ARCH_HAS_EARLY_DEBUG)))
kgdb_initial_breakpoint();
return 0;
@@ -1017,7 +1156,7 @@ int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops)
EXPORT_SYMBOL_GPL(kgdb_register_io_module);
/**
- * kkgdb_unregister_io_module - unregister KGDB IO module
+ * kgdb_unregister_io_module - unregister KGDB IO module
* @old_dbg_io_ops: the io ops vector
*
* Unregister it with the KGDB core.
@@ -1039,6 +1178,9 @@ void kgdb_unregister_io_module(struct kgdb_io *old_dbg_io_ops)
spin_unlock(&kgdb_registration_lock);
+ if (old_dbg_io_ops->deinit)
+ old_dbg_io_ops->deinit();
+
pr_info("Unregistered I/O driver %s, debugger disabled\n",
old_dbg_io_ops->name);
}
@@ -1079,7 +1221,8 @@ static int __init opt_kgdb_wait(char *str)
kgdb_break_asap = 1;
kdb_init(KDB_INIT_EARLY);
- if (kgdb_io_module_registered)
+ if (kgdb_io_module_registered &&
+ IS_ENABLED(CONFIG_ARCH_HAS_EARLY_DEBUG))
kgdb_initial_breakpoint();
return 0;
diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h
index 127d9bc49fb4..cd22b5f68831 100644
--- a/kernel/debug/debug_core.h
+++ b/kernel/debug/debug_core.h
@@ -33,7 +33,7 @@ struct kgdb_state {
#define DCPU_WANT_MASTER 0x1 /* Waiting to become a master kgdb cpu */
#define DCPU_NEXT_MASTER 0x2 /* Transition from one master cpu to another */
#define DCPU_IS_SLAVE 0x4 /* Slave cpu enter exception */
-#define DCPU_SSTEP 0x8 /* CPU is single stepping */
+#define DCPU_WANT_BT 0x8 /* Slave cpu should backtrace then clear flag */
struct debuggerinfo_struct {
void *debuggerinfo;
@@ -42,6 +42,7 @@ struct debuggerinfo_struct {
int ret_state;
int irq_depth;
int enter_kgdb;
+ bool rounding_up;
};
extern struct debuggerinfo_struct kgdb_info[];
@@ -75,6 +76,7 @@ extern int kdb_stub(struct kgdb_state *ks);
extern int kdb_parse(const char *cmdstr);
extern int kdb_common_init_state(struct kgdb_state *ks);
extern int kdb_common_deinit_state(void);
+extern void kdb_dump_stack_on_cpu(int cpu);
#else /* ! CONFIG_KGDB_KDB */
static inline int kdb_stub(struct kgdb_state *ks)
{
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index 19d9a578c753..22fe969c5d2e 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Kernel Debug Core
*
@@ -22,20 +23,18 @@
*
* Original KGDB stub: David Grothe <dave@gcom.com>,
* Tigran Aivazian <tigran@sco.com>
- *
- * This file is licensed under the terms of the GNU General Public License
- * version 2. This program is licensed "as is" without any warranty of any
- * kind, whether express or implied.
*/
#include <linux/kernel.h>
+#include <linux/sched/signal.h>
#include <linux/kgdb.h>
#include <linux/kdb.h>
#include <linux/serial_core.h>
+#include <linux/string.h>
#include <linux/reboot.h>
#include <linux/uaccess.h>
#include <asm/cacheflush.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include "debug_core.h"
#define KGDB_MAX_THREAD_QUERY 17
@@ -246,7 +245,7 @@ char *kgdb_mem2hex(char *mem, char *buf, int count)
*/
tmp = buf + count;
- err = probe_kernel_read(tmp, mem, count);
+ err = copy_from_kernel_nofault(tmp, mem, count);
if (err)
return NULL;
while (count > 0) {
@@ -282,7 +281,7 @@ int kgdb_hex2mem(char *buf, char *mem, int count)
*tmp_raw |= hex_to_bin(*tmp_hex--) << 4;
}
- return probe_kernel_write(mem, tmp_raw, count);
+ return copy_to_kernel_nofault(mem, tmp_raw, count);
}
/*
@@ -320,7 +319,7 @@ int kgdb_hex2long(char **ptr, unsigned long *long_val)
/*
* Copy the binary array pointed to by buf into mem. Fix $, #, and
* 0x7d escaped with 0x7d. Return -EFAULT on failure or 0 on success.
- * The input buf is overwitten with the result to write to mem.
+ * The input buf is overwritten with the result to write to mem.
*/
static int kgdb_ebin2mem(char *buf, char *mem, int count)
{
@@ -334,7 +333,7 @@ static int kgdb_ebin2mem(char *buf, char *mem, int count)
size++;
}
- return probe_kernel_write(mem, c, size);
+ return copy_to_kernel_nofault(mem, c, size);
}
#if DBG_MAX_REG_NUM > 0
@@ -549,7 +548,7 @@ static void gdb_cmd_setregs(struct kgdb_state *ks)
error_packet(remcom_out_buffer, -EINVAL);
} else {
gdb_regs_to_pt_regs(gdb_regs, ks->linux_regs);
- strcpy(remcom_out_buffer, "OK");
+ strscpy(remcom_out_buffer, "OK");
}
}
@@ -579,7 +578,7 @@ static void gdb_cmd_memwrite(struct kgdb_state *ks)
if (err)
error_packet(remcom_out_buffer, err);
else
- strcpy(remcom_out_buffer, "OK");
+ strscpy(remcom_out_buffer, "OK");
}
#if DBG_MAX_REG_NUM > 0
@@ -594,7 +593,7 @@ static char *gdb_hex_reg_helper(int regnum, char *out)
dbg_reg_def[i].size);
}
-/* Handle the 'p' individual regster get */
+/* Handle the 'p' individual register get */
static void gdb_cmd_reg_get(struct kgdb_state *ks)
{
unsigned long regnum;
@@ -609,7 +608,7 @@ static void gdb_cmd_reg_get(struct kgdb_state *ks)
gdb_hex_reg_helper(regnum, remcom_out_buffer);
}
-/* Handle the 'P' individual regster set */
+/* Handle the 'P' individual register set */
static void gdb_cmd_reg_set(struct kgdb_state *ks)
{
unsigned long regnum;
@@ -632,7 +631,7 @@ static void gdb_cmd_reg_set(struct kgdb_state *ks)
i = i / 2;
kgdb_hex2mem(ptr, (char *)gdb_regs, i);
dbg_set_reg(regnum, gdb_regs, ks->linux_regs);
- strcpy(remcom_out_buffer, "OK");
+ strscpy(remcom_out_buffer, "OK");
}
#endif /* DBG_MAX_REG_NUM > 0 */
@@ -644,7 +643,7 @@ static void gdb_cmd_binwrite(struct kgdb_state *ks)
if (err)
error_packet(remcom_out_buffer, err);
else
- strcpy(remcom_out_buffer, "OK");
+ strscpy(remcom_out_buffer, "OK");
}
/* Handle the 'D' or 'k', detach or kill packets */
@@ -658,7 +657,7 @@ static void gdb_cmd_detachkill(struct kgdb_state *ks)
if (error < 0) {
error_packet(remcom_out_buffer, error);
} else {
- strcpy(remcom_out_buffer, "OK");
+ strscpy(remcom_out_buffer, "OK");
kgdb_connected = 0;
}
put_packet(remcom_out_buffer);
@@ -678,7 +677,7 @@ static int gdb_cmd_reboot(struct kgdb_state *ks)
/* For now, only honor R0 */
if (strcmp(remcom_in_buffer, "R0") == 0) {
printk(KERN_CRIT "Executing emergency reboot\n");
- strcpy(remcom_out_buffer, "OK");
+ strscpy(remcom_out_buffer, "OK");
put_packet(remcom_out_buffer);
/*
@@ -724,7 +723,7 @@ static void gdb_cmd_query(struct kgdb_state *ks)
}
}
- do_each_thread(g, p) {
+ for_each_process_thread(g, p) {
if (i >= ks->thr_query && !finished) {
int_to_threadref(thref, p->pid);
ptr = pack_threadid(ptr, thref);
@@ -734,14 +733,14 @@ static void gdb_cmd_query(struct kgdb_state *ks)
finished = 1;
}
i++;
- } while_each_thread(g, p);
+ }
*(--ptr) = '\0';
break;
case 'C':
/* Current thread id */
- strcpy(remcom_out_buffer, "QC");
+ strscpy(remcom_out_buffer, "QC");
ks->threadid = shadow_pid(current->pid);
int_to_threadref(thref, ks->threadid);
pack_threadid(remcom_out_buffer + 2, thref);
@@ -775,7 +774,7 @@ static void gdb_cmd_query(struct kgdb_state *ks)
int len = strlen(remcom_in_buffer + 6);
if ((len % 2) != 0) {
- strcpy(remcom_out_buffer, "E01");
+ strscpy(remcom_out_buffer, "E01");
break;
}
kgdb_hex2mem(remcom_in_buffer + 6,
@@ -787,10 +786,23 @@ static void gdb_cmd_query(struct kgdb_state *ks)
kdb_parse(remcom_out_buffer);
kdb_common_deinit_state();
- strcpy(remcom_out_buffer, "OK");
+ strscpy(remcom_out_buffer, "OK");
}
break;
#endif
+#ifdef CONFIG_HAVE_ARCH_KGDB_QXFER_PKT
+ case 'S':
+ if (!strncmp(remcom_in_buffer, "qSupported:", 11))
+ strscpy(remcom_out_buffer, kgdb_arch_gdb_stub_feature);
+ break;
+ case 'X':
+ if (!strncmp(remcom_in_buffer, "qXfer:", 6))
+ kgdb_arch_handle_qxfer_pkt(remcom_in_buffer,
+ remcom_out_buffer);
+ break;
+#endif
+ default:
+ break;
}
}
@@ -811,7 +823,7 @@ static void gdb_cmd_task(struct kgdb_state *ks)
}
kgdb_usethread = thread;
ks->kgdb_usethreadid = ks->threadid;
- strcpy(remcom_out_buffer, "OK");
+ strscpy(remcom_out_buffer, "OK");
break;
case 'c':
ptr = &remcom_in_buffer[2];
@@ -826,7 +838,7 @@ static void gdb_cmd_task(struct kgdb_state *ks)
}
kgdb_contthread = thread;
}
- strcpy(remcom_out_buffer, "OK");
+ strscpy(remcom_out_buffer, "OK");
break;
}
}
@@ -840,7 +852,7 @@ static void gdb_cmd_thread(struct kgdb_state *ks)
kgdb_hex2long(&ptr, &ks->threadid);
thread = getthread(ks->linux_regs, ks->threadid);
if (thread)
- strcpy(remcom_out_buffer, "OK");
+ strscpy(remcom_out_buffer, "OK");
else
error_packet(remcom_out_buffer, -EINVAL);
}
@@ -902,7 +914,7 @@ static void gdb_cmd_break(struct kgdb_state *ks)
(int) length, *bpt_type - '0');
if (error == 0)
- strcpy(remcom_out_buffer, "OK");
+ strscpy(remcom_out_buffer, "OK");
else
error_packet(remcom_out_buffer, error);
}
@@ -938,7 +950,7 @@ static int gdb_cmd_exception_pass(struct kgdb_state *ks)
}
/*
- * This function performs all gdbserial command procesing
+ * This function performs all gdbserial command processing
*/
int gdb_serial_stub(struct kgdb_state *ks)
{
@@ -1031,6 +1043,7 @@ int gdb_serial_stub(struct kgdb_state *ks)
gdb_cmd_detachkill(ks);
return DBG_PASS_EVENT;
}
+ fallthrough;
#endif
case 'C': /* Exception passing */
tmp = gdb_cmd_exception_pass(ks);
@@ -1038,7 +1051,7 @@ int gdb_serial_stub(struct kgdb_state *ks)
goto default_handle;
if (tmp == 0)
break;
- /* Fall through on tmp < 0 */
+ fallthrough; /* on tmp < 0 */
case 'c': /* Continue packet */
case 's': /* Single step packet */
if (kgdb_contthread && kgdb_contthread != current) {
@@ -1046,8 +1059,7 @@ int gdb_serial_stub(struct kgdb_state *ks)
error_packet(remcom_out_buffer, -EINVAL);
break;
}
- dbg_activate_sw_breakpoints();
- /* Fall through to default processing */
+ fallthrough; /* to default processing */
default:
default_handle:
error = kgdb_arch_handle_exception(ks->ex_vector,
@@ -1093,10 +1105,10 @@ int gdbstub_state(struct kgdb_state *ks, char *cmd)
return error;
case 's':
case 'c':
- strcpy(remcom_in_buffer, cmd);
+ strscpy(remcom_in_buffer, cmd, sizeof(remcom_in_buffer));
return 0;
case '$':
- strcpy(remcom_in_buffer, cmd);
+ strscpy(remcom_in_buffer, cmd, sizeof(remcom_in_buffer));
gdbstub_use_prev_in_buf = strlen(remcom_in_buffer);
gdbstub_prev_in_buf_pos = 0;
return 0;
diff --git a/kernel/debug/kdb/.gitignore b/kernel/debug/kdb/.gitignore
index 396d12eda9e8..df259542a236 100644
--- a/kernel/debug/kdb/.gitignore
+++ b/kernel/debug/kdb/.gitignore
@@ -1 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
gen-kdb_cmds.c
diff --git a/kernel/debug/kdb/Makefile b/kernel/debug/kdb/Makefile
index d4fc58f4b88d..efac857c5511 100644
--- a/kernel/debug/kdb/Makefile
+++ b/kernel/debug/kdb/Makefile
@@ -6,7 +6,6 @@
# Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
#
-CCVERSION := $(shell $(CC) -v 2>&1 | sed -ne '$$p')
obj-y := kdb_io.o kdb_main.o kdb_support.o kdb_bt.o gen-kdb_cmds.o kdb_bp.o kdb_debugger.o
obj-$(CONFIG_KDB_KEYBOARD) += kdb_keyboard.o
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
index e1dbf4a2c69e..c0c2072f5452 100644
--- a/kernel/debug/kdb/kdb_bp.c
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -153,13 +153,11 @@ static int _kdb_bp_install(struct pt_regs *regs, kdb_bp_t *bp)
} else {
kdb_printf("%s: failed to set breakpoint at 0x%lx\n",
__func__, bp->bp_addr);
-#ifdef CONFIG_DEBUG_RODATA
if (!bp->bp_type) {
kdb_printf("Software breakpoints are unavailable.\n"
- " Change the kernel CONFIG_DEBUG_RODATA=n\n"
+ " Boot the kernel with rodata=off\n"
" OR use hw breaks: help bph\n");
}
-#endif
return 1;
}
return 0;
@@ -244,11 +242,11 @@ static void kdb_printbp(kdb_bp_t *bp, int i)
kdb_symbol_print(bp->bp_addr, NULL, KDB_SP_DEFAULT);
if (bp->bp_enabled)
- kdb_printf("\n is enabled");
+ kdb_printf("\n is enabled ");
else
kdb_printf("\n is disabled");
- kdb_printf("\taddr at %016lx, hardtype=%d installed=%d\n",
+ kdb_printf(" addr at %016lx, hardtype=%d installed=%d\n",
bp->bp_addr, bp->bp_type, bp->bp_installed);
kdb_printf("\n");
@@ -309,6 +307,15 @@ static int kdb_bp(int argc, const char **argv)
return KDB_BADINT;
/*
+ * This check is redundant (since the breakpoint machinery should
+ * be doing the same check during kdb_bp_install) but gives the
+ * user immediate feedback.
+ */
+ diag = kgdb_validate_break_address(template.bp_addr);
+ if (diag)
+ return diag;
+
+ /*
* Find an empty bp structure to allocate
*/
for (bpno = 0, bp = kdb_breakpoints; bpno < KDB_MAXBPT; bpno++, bp++) {
@@ -414,7 +421,6 @@ static int kdb_bc(int argc, const char **argv)
* assume that the breakpoint number is desired.
*/
if (addr < KDB_MAXBPT) {
- bp = &kdb_breakpoints[addr];
lowbp = highbp = addr;
highbp++;
} else {
@@ -454,13 +460,15 @@ static int kdb_bc(int argc, const char **argv)
break;
case KDBCMD_BE:
+ if (bp->bp_enabled)
+ break;
+
bp->bp_enabled = 1;
kdb_printf("Breakpoint %d at "
- kdb_bfd_vma_fmt " enabled",
+ kdb_bfd_vma_fmt " enabled\n",
i, bp->bp_addr);
- kdb_printf("\n");
break;
case KDBCMD_BD:
if (!bp->bp_enabled)
@@ -516,6 +524,54 @@ static int kdb_ss(int argc, const char **argv)
return KDB_CMD_SS;
}
+static kdbtab_t bptab[] = {
+ { .name = "bp",
+ .func = kdb_bp,
+ .usage = "[<vaddr>]",
+ .help = "Set/Display breakpoints",
+ .flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS,
+ },
+ { .name = "bl",
+ .func = kdb_bp,
+ .usage = "[<vaddr>]",
+ .help = "Display breakpoints",
+ .flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS,
+ },
+ { .name = "bc",
+ .func = kdb_bc,
+ .usage = "<bpnum>",
+ .help = "Clear Breakpoint",
+ .flags = KDB_ENABLE_FLOW_CTRL,
+ },
+ { .name = "be",
+ .func = kdb_bc,
+ .usage = "<bpnum>",
+ .help = "Enable Breakpoint",
+ .flags = KDB_ENABLE_FLOW_CTRL,
+ },
+ { .name = "bd",
+ .func = kdb_bc,
+ .usage = "<bpnum>",
+ .help = "Disable Breakpoint",
+ .flags = KDB_ENABLE_FLOW_CTRL,
+ },
+ { .name = "ss",
+ .func = kdb_ss,
+ .usage = "",
+ .help = "Single Step",
+ .minlen = 1,
+ .flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS,
+ },
+};
+
+static kdbtab_t bphcmd = {
+ .name = "bph",
+ .func = kdb_bp,
+ .usage = "[<vaddr>]",
+ .help = "[datar [length]|dataw [length]] Set hw brk",
+ .flags = KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS,
+};
+
/* Initialize the breakpoint table and register breakpoint commands. */
void __init kdb_initbptab(void)
@@ -531,30 +587,7 @@ void __init kdb_initbptab(void)
for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++)
bp->bp_free = 1;
- kdb_register_flags("bp", kdb_bp, "[<vaddr>]",
- "Set/Display breakpoints", 0,
- KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS);
- kdb_register_flags("bl", kdb_bp, "[<vaddr>]",
- "Display breakpoints", 0,
- KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS);
+ kdb_register_table(bptab, ARRAY_SIZE(bptab));
if (arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT)
- kdb_register_flags("bph", kdb_bp, "[<vaddr>]",
- "[datar [length]|dataw [length]] Set hw brk", 0,
- KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS);
- kdb_register_flags("bc", kdb_bc, "<bpnum>",
- "Clear Breakpoint", 0,
- KDB_ENABLE_FLOW_CTRL);
- kdb_register_flags("be", kdb_bc, "<bpnum>",
- "Enable Breakpoint", 0,
- KDB_ENABLE_FLOW_CTRL);
- kdb_register_flags("bd", kdb_bc, "<bpnum>",
- "Disable Breakpoint", 0,
- KDB_ENABLE_FLOW_CTRL);
-
- kdb_register_flags("ss", kdb_ss, "",
- "Single Step", 1,
- KDB_ENABLE_FLOW_CTRL | KDB_REPEAT_NO_ARGS);
- /*
- * Architecture dependent initialization.
- */
+ kdb_register_table(&bphcmd, 1);
}
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
index fe15fff5df53..137ba73f56fc 100644
--- a/kernel/debug/kdb/kdb_bt.c
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -12,7 +12,8 @@
#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/kernel.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/debug.h>
#include <linux/kdb.h>
#include <linux/nmi.h>
#include "kdb_private.h"
@@ -20,22 +21,18 @@
static void kdb_show_stack(struct task_struct *p, void *addr)
{
- int old_lvl = console_loglevel;
- console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH;
kdb_trap_printk++;
- kdb_set_current_task(p);
- if (addr) {
- show_stack((struct task_struct *)p, addr);
- } else if (kdb_current_regs) {
-#ifdef CONFIG_X86
- show_stack(p, &kdb_current_regs->sp);
-#else
- show_stack(p, NULL);
-#endif
+
+ if (!addr && kdb_task_has_cpu(p)) {
+ int old_lvl = console_loglevel;
+
+ console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH;
+ kdb_dump_stack_on_cpu(kdb_process_cpu(p));
+ console_loglevel = old_lvl;
} else {
- show_stack(p, NULL);
+ show_stack(p, addr, KERN_EMERG);
}
- console_loglevel = old_lvl;
+
kdb_trap_printk--;
}
@@ -49,7 +46,7 @@ static void kdb_show_stack(struct task_struct *p, void *addr)
* btp <pid> Kernel stack for <pid>
* btt <address-expression> Kernel stack for task structure at
* <address-expression>
- * bta [DRSTCZEUIMA] All useful processes, optionally
+ * bta [state_chars>|A] All useful processes, optionally
* filtered by state
* btc [<cpu>] The current process on one cpu,
* default is all cpus
@@ -77,12 +74,12 @@ static void kdb_show_stack(struct task_struct *p, void *addr)
*/
static int
-kdb_bt1(struct task_struct *p, unsigned long mask,
- int argcount, int btaprompt)
+kdb_bt1(struct task_struct *p, const char *mask, bool btaprompt)
{
- char buffer[2];
- if (kdb_getarea(buffer[0], (unsigned long)p) ||
- kdb_getarea(buffer[0], (unsigned long)(p+1)-1))
+ char ch;
+
+ if (kdb_getarea(ch, (unsigned long)p) ||
+ kdb_getarea(ch, (unsigned long)(p+1)-1))
return KDB_BADADDR;
if (!kdb_task_state(p, mask))
return 0;
@@ -90,22 +87,46 @@ kdb_bt1(struct task_struct *p, unsigned long mask,
kdb_ps1(p);
kdb_show_stack(p, NULL);
if (btaprompt) {
- kdb_getstr(buffer, sizeof(buffer),
- "Enter <q> to end, <cr> to continue:");
- if (buffer[0] == 'q') {
- kdb_printf("\n");
+ kdb_printf("Enter <q> to end, <cr> or <space> to continue:");
+ do {
+ ch = kdb_getchar();
+ } while (!strchr("\r\n q", ch));
+ kdb_printf("\n");
+
+ /* reset the pager */
+ kdb_nextline = 1;
+
+ if (ch == 'q')
return 1;
- }
}
touch_nmi_watchdog();
return 0;
}
+static void
+kdb_bt_cpu(unsigned long cpu)
+{
+ struct task_struct *kdb_tsk;
+
+ if (cpu >= num_possible_cpus() || !cpu_online(cpu)) {
+ kdb_printf("WARNING: no process for cpu %ld\n", cpu);
+ return;
+ }
+
+ /* If a CPU failed to round up we could be here */
+ kdb_tsk = KDB_TSK(cpu);
+ if (!kdb_tsk) {
+ kdb_printf("WARNING: no task for cpu %ld\n", cpu);
+ return;
+ }
+
+ kdb_bt1(kdb_tsk, "A", false);
+}
+
int
kdb_bt(int argc, const char **argv)
{
int diag;
- int argcount = 5;
int btaprompt = 1;
int nextarg;
unsigned long addr;
@@ -117,25 +138,25 @@ kdb_bt(int argc, const char **argv)
if (strcmp(argv[0], "bta") == 0) {
struct task_struct *g, *p;
unsigned long cpu;
- unsigned long mask = kdb_task_state_string(argc ? argv[1] :
- NULL);
+ const char *mask = argc ? argv[1] : kdbgetenv("PS");
+
if (argc == 0)
kdb_ps_suppressed();
/* Run the active tasks first */
for_each_online_cpu(cpu) {
- p = kdb_curr_task(cpu);
- if (kdb_bt1(p, mask, argcount, btaprompt))
+ p = curr_task(cpu);
+ if (kdb_bt1(p, mask, btaprompt))
return 0;
}
/* Now the inactive tasks */
- kdb_do_each_thread(g, p) {
+ for_each_process_thread(g, p) {
if (KDB_FLAG(CMD_INTERRUPT))
return 0;
if (task_curr(p))
continue;
- if (kdb_bt1(p, mask, argcount, btaprompt))
+ if (kdb_bt1(p, mask, btaprompt))
return 0;
- } kdb_while_each_thread(g, p);
+ }
} else if (strcmp(argv[0], "btp") == 0) {
struct task_struct *p;
unsigned long pid;
@@ -145,10 +166,8 @@ kdb_bt(int argc, const char **argv)
if (diag)
return diag;
p = find_task_by_pid_ns(pid, &init_pid_ns);
- if (p) {
- kdb_set_current_task(p);
- return kdb_bt1(p, ~0UL, argcount, 0);
- }
+ if (p)
+ return kdb_bt1(p, "A", false);
kdb_printf("No process with pid == %ld found\n", pid);
return 0;
} else if (strcmp(argv[0], "btt") == 0) {
@@ -157,12 +176,9 @@ kdb_bt(int argc, const char **argv)
diag = kdbgetularg((char *)argv[1], &addr);
if (diag)
return diag;
- kdb_set_current_task((struct task_struct *)addr);
- return kdb_bt1((struct task_struct *)addr, ~0UL, argcount, 0);
+ return kdb_bt1((struct task_struct *)addr, "A", false);
} else if (strcmp(argv[0], "btc") == 0) {
unsigned long cpu = ~0;
- struct task_struct *save_current_task = kdb_current_task;
- char buf[80];
if (argc > 1)
return KDB_ARGCOUNT;
if (argc == 1) {
@@ -170,26 +186,21 @@ kdb_bt(int argc, const char **argv)
if (diag)
return diag;
}
- /* Recursive use of kdb_parse, do not use argv after
- * this point */
- argv = NULL;
if (cpu != ~0) {
- if (cpu >= num_possible_cpus() || !cpu_online(cpu)) {
- kdb_printf("no process for cpu %ld\n", cpu);
- return 0;
+ kdb_bt_cpu(cpu);
+ } else {
+ /*
+ * Recursive use of kdb_parse, do not use argv after
+ * this point.
+ */
+ argv = NULL;
+ kdb_printf("btc: cpu status: ");
+ kdb_parse("cpu\n");
+ for_each_online_cpu(cpu) {
+ kdb_bt_cpu(cpu);
+ touch_nmi_watchdog();
}
- sprintf(buf, "btt 0x%p\n", KDB_TSK(cpu));
- kdb_parse(buf);
- return 0;
- }
- kdb_printf("btc: cpu status: ");
- kdb_parse("cpu\n");
- for_each_online_cpu(cpu) {
- sprintf(buf, "btt 0x%p\n", KDB_TSK(cpu));
- kdb_parse(buf);
- touch_nmi_watchdog();
}
- kdb_set_current_task(save_current_task);
return 0;
} else {
if (argc) {
@@ -201,7 +212,7 @@ kdb_bt(int argc, const char **argv)
kdb_show_stack(kdb_current_task, (void *)addr);
return 0;
} else {
- return kdb_bt1(kdb_current_task, ~0UL, argcount, 0);
+ return kdb_bt1(kdb_current_task, "A", false);
}
}
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index 15e1a7af5dd0..e91fc3e4edd5 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -118,13 +118,6 @@ int kdb_stub(struct kgdb_state *ks)
kdb_bp_remove();
KDB_STATE_CLEAR(DOING_SS);
KDB_STATE_SET(PAGER);
- /* zero out any offline cpu data */
- for_each_present_cpu(i) {
- if (!cpu_online(i)) {
- kgdb_info[i].debuggerinfo = NULL;
- kgdb_info[i].task = NULL;
- }
- }
if (ks->err_code == DIE_OOPS || reason == KDB_REASON_OOPS) {
ks->pass_exception = 1;
KDB_FLAG_SET(CATASTROPHIC);
@@ -147,14 +140,12 @@ int kdb_stub(struct kgdb_state *ks)
*/
kdb_common_deinit_state();
KDB_STATE_CLEAR(PAGER);
- kdbnearsym_cleanup();
if (error == KDB_CMD_KGDB) {
if (KDB_STATE(DOING_KGDB))
KDB_STATE_CLEAR(DOING_KGDB);
return DBG_PASS_EVENT;
}
kdb_bp_install(ks->linux_regs);
- dbg_activate_sw_breakpoints();
/* Set the exit state to a single step or a continue */
if (KDB_STATE(DOING_SS))
gdbstub_state(ks, "s");
@@ -174,7 +165,6 @@ int kdb_stub(struct kgdb_state *ks)
* differently vs the gdbstub
*/
kgdb_single_step = 0;
- dbg_deactivate_sw_breakpoints();
return DBG_SWITCH_CPU_EVENT;
}
return kgdb_info[ks->cpu].ret_state;
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index fc1ef736253c..61c1690058ed 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -9,7 +9,6 @@
* Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
*/
-#include <linux/module.h>
#include <linux/types.h>
#include <linux/ctype.h>
#include <linux/kernel.h>
@@ -30,6 +29,7 @@
char kdb_prompt_str[CMD_BUFLEN];
int kdb_trap_printk;
+int kdb_printf_cpu = -1;
static int kgdb_transition_check(char *buffer)
{
@@ -48,15 +48,90 @@ static int kgdb_transition_check(char *buffer)
return 0;
}
-static int kdb_read_get_key(char *buffer, size_t bufsize)
+/**
+ * kdb_handle_escape() - validity check on an accumulated escape sequence.
+ * @buf: Accumulated escape characters to be examined. Note that buf
+ * is not a string, it is an array of characters and need not be
+ * nil terminated.
+ * @sz: Number of accumulated escape characters.
+ *
+ * Return: -1 if the escape sequence is unwanted, 0 if it is incomplete,
+ * otherwise it returns a mapped key value to pass to the upper layers.
+ */
+static int kdb_handle_escape(char *buf, size_t sz)
+{
+ char *lastkey = buf + sz - 1;
+
+ switch (sz) {
+ case 1:
+ if (*lastkey == '\e')
+ return 0;
+ break;
+
+ case 2: /* \e<something> */
+ if (*lastkey == '[')
+ return 0;
+ break;
+
+ case 3:
+ switch (*lastkey) {
+ case 'A': /* \e[A, up arrow */
+ return 16;
+ case 'B': /* \e[B, down arrow */
+ return 14;
+ case 'C': /* \e[C, right arrow */
+ return 6;
+ case 'D': /* \e[D, left arrow */
+ return 2;
+ case '1': /* \e[<1,3,4>], may be home, del, end */
+ case '3':
+ case '4':
+ return 0;
+ }
+ break;
+
+ case 4:
+ if (*lastkey == '~') {
+ switch (buf[2]) {
+ case '1': /* \e[1~, home */
+ return 1;
+ case '3': /* \e[3~, del */
+ return 4;
+ case '4': /* \e[4~, end */
+ return 5;
+ }
+ }
+ break;
+ }
+
+ return -1;
+}
+
+/**
+ * kdb_getchar() - Read a single character from a kdb console (or consoles).
+ *
+ * Other than polling the various consoles that are currently enabled,
+ * most of the work done in this function is dealing with escape sequences.
+ *
+ * An escape key could be the start of a vt100 control sequence such as \e[D
+ * (left arrow) or it could be a character in its own right. The standard
+ * method for detecting the difference is to wait for 2 seconds to see if there
+ * are any other characters. kdb is complicated by the lack of a timer service
+ * (interrupts are off), by multiple input sources. Escape sequence processing
+ * has to be done as states in the polling loop.
+ *
+ * Return: The key pressed or a control code derived from an escape sequence.
+ */
+char kdb_getchar(void)
{
#define ESCAPE_UDELAY 1000
#define ESCAPE_DELAY (2*1000000/ESCAPE_UDELAY) /* 2 seconds worth of udelays */
- char escape_data[5]; /* longest vt100 escape sequence is 4 bytes */
- char *ped = escape_data;
+ char buf[4]; /* longest vt100 escape sequence is 4 bytes */
+ char *pbuf = buf;
int escape_delay = 0;
- get_char_func *f, *f_escape = NULL;
+ get_char_func *f, *f_prev = NULL;
int key;
+ static bool last_char_was_cr;
for (f = &kdb_poll_funcs[0]; ; ++f) {
if (*f == NULL) {
@@ -64,109 +139,76 @@ static int kdb_read_get_key(char *buffer, size_t bufsize)
touch_nmi_watchdog();
f = &kdb_poll_funcs[0];
}
- if (escape_delay == 2) {
- *ped = '\0';
- ped = escape_data;
- --escape_delay;
- }
- if (escape_delay == 1) {
- key = *ped++;
- if (!*ped)
- --escape_delay;
- break;
- }
+
key = (*f)();
if (key == -1) {
if (escape_delay) {
udelay(ESCAPE_UDELAY);
- --escape_delay;
+ if (--escape_delay == 0)
+ return '\e';
}
continue;
}
- if (bufsize <= 2) {
- if (key == '\r')
- key = '\n';
- *buffer++ = key;
- *buffer = '\0';
- return -1;
+
+ /*
+ * The caller expects that newlines are either CR or LF. However
+ * some terminals send _both_ CR and LF. Avoid having to handle
+ * this in the caller by stripping the LF if we saw a CR right
+ * before.
+ */
+ if (last_char_was_cr && key == '\n') {
+ last_char_was_cr = false;
+ continue;
}
- if (escape_delay == 0 && key == '\e') {
+ last_char_was_cr = (key == '\r');
+
+ /*
+ * When the first character is received (or we get a change
+ * input source) we set ourselves up to handle an escape
+ * sequences (just in case).
+ */
+ if (f_prev != f) {
+ f_prev = f;
+ pbuf = buf;
escape_delay = ESCAPE_DELAY;
- ped = escape_data;
- f_escape = f;
}
- if (escape_delay) {
- *ped++ = key;
- if (f_escape != f) {
- escape_delay = 2;
- continue;
- }
- if (ped - escape_data == 1) {
- /* \e */
- continue;
- } else if (ped - escape_data == 2) {
- /* \e<something> */
- if (key != '[')
- escape_delay = 2;
- continue;
- } else if (ped - escape_data == 3) {
- /* \e[<something> */
- int mapkey = 0;
- switch (key) {
- case 'A': /* \e[A, up arrow */
- mapkey = 16;
- break;
- case 'B': /* \e[B, down arrow */
- mapkey = 14;
- break;
- case 'C': /* \e[C, right arrow */
- mapkey = 6;
- break;
- case 'D': /* \e[D, left arrow */
- mapkey = 2;
- break;
- case '1': /* dropthrough */
- case '3': /* dropthrough */
- /* \e[<1,3,4>], may be home, del, end */
- case '4':
- mapkey = -1;
- break;
- }
- if (mapkey != -1) {
- if (mapkey > 0) {
- escape_data[0] = mapkey;
- escape_data[1] = '\0';
- }
- escape_delay = 2;
- }
- continue;
- } else if (ped - escape_data == 4) {
- /* \e[<1,3,4><something> */
- int mapkey = 0;
- if (key == '~') {
- switch (escape_data[2]) {
- case '1': /* \e[1~, home */
- mapkey = 1;
- break;
- case '3': /* \e[3~, del */
- mapkey = 4;
- break;
- case '4': /* \e[4~, end */
- mapkey = 5;
- break;
- }
- }
- if (mapkey > 0) {
- escape_data[0] = mapkey;
- escape_data[1] = '\0';
- }
- escape_delay = 2;
- continue;
- }
- }
- break; /* A key to process */
+
+ *pbuf++ = key;
+ key = kdb_handle_escape(buf, pbuf - buf);
+ if (key < 0) /* no escape sequence; return best character */
+ return buf[pbuf - buf == 2 ? 1 : 0];
+ if (key > 0)
+ return key;
}
- return key;
+
+ unreachable();
+}
+
+/**
+ * kdb_position_cursor() - Place cursor in the correct horizontal position
+ * @prompt: Nil-terminated string containing the prompt string
+ * @buffer: Nil-terminated string containing the entire command line
+ * @cp: Cursor position, pointer the character in buffer where the cursor
+ * should be positioned.
+ *
+ * The cursor is positioned by sending a carriage-return and then printing
+ * the content of the line until we reach the correct cursor position.
+ *
+ * There is some additional fine detail here.
+ *
+ * Firstly, even though kdb_printf() will correctly format zero-width fields
+ * we want the second call to kdb_printf() to be conditional. That keeps things
+ * a little cleaner when LOGGING=1.
+ *
+ * Secondly, we can't combine everything into one call to kdb_printf() since
+ * that renders into a fixed length buffer and the combined print could result
+ * in unwanted truncation.
+ */
+static void kdb_position_cursor(char *prompt, char *buffer, char *cp)
+{
+ kdb_printf("\r%s", prompt);
+ if (cp > buffer)
+ kdb_printf("%.*s", (int)(cp - buffer), buffer);
}
/*
@@ -187,17 +229,7 @@ static int kdb_read_get_key(char *buffer, size_t bufsize)
* function. It is not reentrant - it relies on the fact
* that while kdb is running on only one "master debug" cpu.
* Remarks:
- *
- * The buffer size must be >= 2. A buffer size of 2 means that the caller only
- * wants a single key.
- *
- * An escape key could be the start of a vt100 control sequence such as \e[D
- * (left arrow) or it could be a character in its own right. The standard
- * method for detecting the difference is to wait for 2 seconds to see if there
- * are any other characters. kdb is complicated by the lack of a timer service
- * (interrupts are off), by multiple input sources and by the need to sometimes
- * return after just one key. Escape sequence processing has to be done as
- * states in the polling loop.
+ * The buffer size must be >= 2.
*/
static char *kdb_read(char *buffer, size_t bufsize)
@@ -215,8 +247,7 @@ static char *kdb_read(char *buffer, size_t bufsize)
int count;
int i;
int diag, dtab_count;
- int key;
-
+ int key, ret;
diag = kdbgetintenv("DTABCOUNT", &dtab_count);
if (diag)
@@ -232,29 +263,21 @@ static char *kdb_read(char *buffer, size_t bufsize)
*cp = '\0';
kdb_printf("%s", buffer);
poll_again:
- key = kdb_read_get_key(buffer, bufsize);
- if (key == -1)
- return buffer;
+ key = kdb_getchar();
if (key != 9)
tab = 0;
switch (key) {
case 8: /* backspace */
if (cp > buffer) {
- if (cp < lastchar) {
- memcpy(tmpbuffer, cp, lastchar - cp);
- memcpy(cp-1, tmpbuffer, lastchar - cp);
- }
- *(--lastchar) = '\0';
- --cp;
- kdb_printf("\b%s \r", cp);
- tmp = *cp;
- *cp = '\0';
- kdb_printf(kdb_prompt_str);
- kdb_printf("%s", buffer);
- *cp = tmp;
+ memmove(cp-1, cp, lastchar - cp + 1);
+ lastchar--;
+ cp--;
+ kdb_printf("\b%s ", cp);
+ kdb_position_cursor(kdb_prompt_str, buffer, cp);
}
break;
- case 13: /* enter */
+ case 10: /* linefeed */
+ case 13: /* carriage return */
*lastchar++ = '\n';
*lastchar++ = '\0';
if (!KDB_STATE(KGDB_TRANS)) {
@@ -265,22 +288,16 @@ poll_again:
return buffer;
case 4: /* Del */
if (cp < lastchar) {
- memcpy(tmpbuffer, cp+1, lastchar - cp - 1);
- memcpy(cp, tmpbuffer, lastchar - cp - 1);
- *(--lastchar) = '\0';
- kdb_printf("%s \r", cp);
- tmp = *cp;
- *cp = '\0';
- kdb_printf(kdb_prompt_str);
- kdb_printf("%s", buffer);
- *cp = tmp;
+ memmove(cp, cp+1, lastchar - cp);
+ lastchar--;
+ kdb_printf("%s ", cp);
+ kdb_position_cursor(kdb_prompt_str, buffer, cp);
}
break;
case 1: /* Home */
if (cp > buffer) {
- kdb_printf("\r");
- kdb_printf(kdb_prompt_str);
cp = buffer;
+ kdb_position_cursor(kdb_prompt_str, buffer, cp);
}
break;
case 5: /* End */
@@ -296,11 +313,10 @@ poll_again:
}
break;
case 14: /* Down */
- memset(tmpbuffer, ' ',
- strlen(kdb_prompt_str) + (lastchar-buffer));
- *(tmpbuffer+strlen(kdb_prompt_str) +
- (lastchar-buffer)) = '\0';
- kdb_printf("\r%s\r", tmpbuffer);
+ case 16: /* Up */
+ kdb_printf("\r%*c\r",
+ (int)(strlen(kdb_prompt_str) + (lastchar - buffer)),
+ ' ');
*lastchar = (char)key;
*(lastchar+1) = '\0';
return lastchar;
@@ -310,34 +326,19 @@ poll_again:
++cp;
}
break;
- case 16: /* Up */
- memset(tmpbuffer, ' ',
- strlen(kdb_prompt_str) + (lastchar-buffer));
- *(tmpbuffer+strlen(kdb_prompt_str) +
- (lastchar-buffer)) = '\0';
- kdb_printf("\r%s\r", tmpbuffer);
- *lastchar = (char)key;
- *(lastchar+1) = '\0';
- return lastchar;
case 9: /* Tab */
if (tab < 2)
++tab;
- p_tmp = buffer;
- while (*p_tmp == ' ')
- p_tmp++;
- if (p_tmp > cp)
- break;
- memcpy(tmpbuffer, p_tmp, cp-p_tmp);
- *(tmpbuffer + (cp-p_tmp)) = '\0';
- p_tmp = strrchr(tmpbuffer, ' ');
- if (p_tmp)
- ++p_tmp;
- else
- p_tmp = tmpbuffer;
- len = strlen(p_tmp);
- count = kallsyms_symbol_complete(p_tmp,
- sizeof(tmpbuffer) -
- (p_tmp - tmpbuffer));
+
+ tmp = *cp;
+ *cp = '\0';
+ p_tmp = strrchr(buffer, ' ');
+ p_tmp = (p_tmp ? p_tmp + 1 : buffer);
+ strscpy(tmpbuffer, p_tmp);
+ *cp = tmp;
+
+ len = strlen(tmpbuffer);
+ count = kallsyms_symbol_complete(tmpbuffer, sizeof(tmpbuffer));
if (tab == 2 && count > 0) {
kdb_printf("\n%d symbols are found.", count);
if (count > dtab_count) {
@@ -349,42 +350,51 @@ poll_again:
}
kdb_printf("\n");
for (i = 0; i < count; i++) {
- if (kallsyms_symbol_next(p_tmp, i) < 0)
+ ret = kallsyms_symbol_next(tmpbuffer, i, sizeof(tmpbuffer));
+ if (WARN_ON(!ret))
break;
- kdb_printf("%s ", p_tmp);
- *(p_tmp + len) = '\0';
+ if (ret != -E2BIG)
+ kdb_printf("%s ", tmpbuffer);
+ else
+ kdb_printf("%s... ", tmpbuffer);
+ tmpbuffer[len] = '\0';
}
if (i >= dtab_count)
kdb_printf("...");
kdb_printf("\n");
- kdb_printf(kdb_prompt_str);
+ kdb_printf("%s", kdb_prompt_str);
kdb_printf("%s", buffer);
+ if (cp != lastchar)
+ kdb_position_cursor(kdb_prompt_str, buffer, cp);
} else if (tab != 2 && count > 0) {
- len_tmp = strlen(p_tmp);
- strncpy(p_tmp+len_tmp, cp, lastchar-cp+1);
- len_tmp = strlen(p_tmp);
- strncpy(cp, p_tmp+len, len_tmp-len + 1);
- len = len_tmp - len;
- kdb_printf("%s", cp);
- cp += len;
- lastchar += len;
+ /* How many new characters do we want from tmpbuffer? */
+ len_tmp = strlen(tmpbuffer) - len;
+ if (lastchar + len_tmp >= bufend)
+ len_tmp = bufend - lastchar;
+
+ if (len_tmp) {
+ /* + 1 ensures the '\0' is memmove'd */
+ memmove(cp+len_tmp, cp, (lastchar-cp) + 1);
+ memcpy(cp, tmpbuffer+len, len_tmp);
+ kdb_printf("%s", cp);
+ cp += len_tmp;
+ lastchar += len_tmp;
+ if (cp != lastchar)
+ kdb_position_cursor(kdb_prompt_str,
+ buffer, cp);
+ }
}
kdb_nextline = 1; /* reset output line number */
break;
default:
if (key >= 32 && lastchar < bufend) {
if (cp < lastchar) {
- memcpy(tmpbuffer, cp, lastchar - cp);
- memcpy(cp+1, tmpbuffer, lastchar - cp);
- *++lastchar = '\0';
+ memmove(cp+1, cp, lastchar - cp + 1);
+ lastchar++;
*cp = key;
- kdb_printf("%s\r", cp);
+ kdb_printf("%s", cp);
++cp;
- tmp = *cp;
- *cp = '\0';
- kdb_printf(kdb_prompt_str);
- kdb_printf("%s", buffer);
- *cp = tmp;
+ kdb_position_cursor(kdb_prompt_str, buffer, cp);
} else {
*++lastchar = '\0';
*cp++ = key;
@@ -442,8 +452,8 @@ poll_again:
char *kdb_getstr(char *buffer, size_t bufsize, const char *prompt)
{
if (prompt && kdb_prompt_str != prompt)
- strncpy(kdb_prompt_str, prompt, CMD_BUFLEN);
- kdb_printf(kdb_prompt_str);
+ strscpy(kdb_prompt_str, prompt);
+ kdb_printf("%s", kdb_prompt_str);
kdb_nextline = 1; /* Prompt and input resets line number */
return kdb_read(buffer, bufsize);
}
@@ -548,37 +558,102 @@ static int kdb_search_string(char *searched, char *searchfor)
return 0;
}
+static void kdb_msg_write(const char *msg, int msg_len)
+{
+ struct console *c;
+ const char *cp;
+ int cookie;
+ int len;
+
+ if (msg_len == 0)
+ return;
+
+ cp = msg;
+ len = msg_len;
+
+ while (len--) {
+ dbg_io_ops->write_char(*cp);
+ cp++;
+ }
+
+ /*
+ * The console_srcu_read_lock() only provides safe console list
+ * traversal. The use of the ->write() callback relies on all other
+ * CPUs being stopped at the moment and console drivers being able to
+ * handle reentrance when @oops_in_progress is set.
+ *
+ * There is no guarantee that every console driver can handle
+ * reentrance in this way; the developer deploying the debugger
+ * is responsible for ensuring that the console drivers they
+ * have selected handle reentrance appropriately.
+ */
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(c) {
+ short flags = console_srcu_read_flags(c);
+
+ if (!console_is_usable(c, flags, true))
+ continue;
+ if (c == dbg_io_ops->cons)
+ continue;
+
+ if (flags & CON_NBCON) {
+ struct nbcon_write_context wctxt = { };
+
+ /*
+ * Do not continue if the console is NBCON and the context
+ * can't be acquired.
+ */
+ if (!nbcon_kdb_try_acquire(c, &wctxt))
+ continue;
+
+ nbcon_write_context_set_buf(&wctxt, (char *)msg, msg_len);
+
+ c->write_atomic(c, &wctxt);
+ nbcon_kdb_release(&wctxt);
+ } else {
+ /*
+ * Set oops_in_progress to encourage the console drivers to
+ * disregard their internal spin locks: in the current calling
+ * context the risk of deadlock is a bigger problem than risks
+ * due to re-entering the console driver. We operate directly on
+ * oops_in_progress rather than using bust_spinlocks() because
+ * the calls bust_spinlocks() makes on exit are not appropriate
+ * for this calling context.
+ */
+ ++oops_in_progress;
+ c->write(c, msg, msg_len);
+ --oops_in_progress;
+ }
+ touch_nmi_watchdog();
+ }
+ console_srcu_read_unlock(cookie);
+}
+
int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
{
int diag;
int linecount;
int colcount;
int logging, saved_loglevel = 0;
- int saved_trap_printk;
- int got_printf_lock = 0;
int retlen = 0;
int fnd, len;
+ int this_cpu, old_cpu;
char *cp, *cp2, *cphold = NULL, replaced_byte = ' ';
char *moreprompt = "more> ";
- struct console *c = console_drivers;
- static DEFINE_SPINLOCK(kdb_printf_lock);
- unsigned long uninitialized_var(flags);
-
- preempt_disable();
- saved_trap_printk = kdb_trap_printk;
- kdb_trap_printk = 0;
+ unsigned long flags;
/* Serialize kdb_printf if multiple cpus try to write at once.
* But if any cpu goes recursive in kdb, just print the output,
* even if it is interleaved with any other text.
*/
- if (!KDB_STATE(PRINTF_LOCK)) {
- KDB_STATE_SET(PRINTF_LOCK);
- spin_lock_irqsave(&kdb_printf_lock, flags);
- got_printf_lock = 1;
- atomic_inc(&kdb_event);
- } else {
- __acquire(kdb_printf_lock);
+ local_irq_save(flags);
+ this_cpu = smp_processor_id();
+ for (;;) {
+ old_cpu = cmpxchg(&kdb_printf_cpu, -1, this_cpu);
+ if (old_cpu == -1 || old_cpu == this_cpu)
+ break;
+
+ cpu_relax();
}
diag = kdbgetintenv("LINES", &linecount);
@@ -656,8 +731,8 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
* it, depending on the results of the search.
*/
cp++; /* to byte after the newline */
- replaced_byte = *cp; /* remember what/where it was */
- cphold = cp;
+ replaced_byte = *cp; /* remember what it was */
+ cphold = cp; /* remember where it was */
*cp = '\0'; /* end the string for our search */
/*
@@ -674,18 +749,23 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
* Shift the buffer left.
*/
*cphold = replaced_byte;
- strcpy(kdb_buffer, cphold);
- len = strlen(kdb_buffer);
+ len = strlen(cphold);
+ /* Use memmove() because the buffers overlap */
+ memmove(kdb_buffer, cphold, len + 1);
next_avail = kdb_buffer + len;
size_avail = sizeof(kdb_buffer) - len;
goto kdb_print_out;
}
- if (kdb_grepping_flag >= KDB_GREPPING_FLAG_SEARCH)
+ if (kdb_grepping_flag >= KDB_GREPPING_FLAG_SEARCH) {
/*
* This was a interactive search (using '/' at more
- * prompt) and it has completed. Clear the flag.
+ * prompt) and it has completed. Replace the \0 with
+ * its original value to ensure multi-line strings
+ * are handled properly, and return to normal mode.
*/
+ *cphold = replaced_byte;
kdb_grepping_flag = 0;
+ }
/*
* at this point the string is a full line and
* should be printed, up to the null.
@@ -697,24 +777,12 @@ kdb_printit:
* Write to all consoles.
*/
retlen = strlen(kdb_buffer);
- cp = (char *) printk_skip_level(kdb_buffer);
- if (!dbg_kdb_mode && kgdb_connected) {
+ cp = (char *) printk_skip_headers(kdb_buffer);
+ if (!dbg_kdb_mode && kgdb_connected)
gdbstub_msg_write(cp, retlen - (cp - kdb_buffer));
- } else {
- if (dbg_io_ops && !dbg_io_ops->is_console) {
- len = retlen - (cp - kdb_buffer);
- cp2 = cp;
- while (len--) {
- dbg_io_ops->write_char(*cp2);
- cp2++;
- }
- }
- while (c) {
- c->write(c, cp, retlen - (cp - kdb_buffer));
- touch_nmi_watchdog();
- c = c->next;
- }
- }
+ else
+ kdb_msg_write(cp, retlen - (cp - kdb_buffer));
+
if (logging) {
saved_loglevel = console_loglevel;
console_loglevel = CONSOLE_LOGLEVEL_SILENT;
@@ -747,7 +815,7 @@ kdb_printit:
/* check for having reached the LINES number of printed lines */
if (kdb_nextline >= linecount) {
- char buf1[16] = "";
+ char ch;
/* Watch out for recursion here. Any routine that calls
* kdb_printf will come back through here. And kdb_read
@@ -763,58 +831,43 @@ kdb_printit:
moreprompt = "more> ";
kdb_input_flush();
- c = console_drivers;
-
- if (dbg_io_ops && !dbg_io_ops->is_console) {
- len = strlen(moreprompt);
- cp = moreprompt;
- while (len--) {
- dbg_io_ops->write_char(*cp);
- cp++;
- }
- }
- while (c) {
- c->write(c, moreprompt, strlen(moreprompt));
- touch_nmi_watchdog();
- c = c->next;
- }
+ kdb_msg_write(moreprompt, strlen(moreprompt));
if (logging)
printk("%s", moreprompt);
- kdb_read(buf1, 2); /* '2' indicates to return
- * immediately after getting one key. */
+ ch = kdb_getchar();
kdb_nextline = 1; /* Really set output line 1 */
/* empty and reset the buffer: */
kdb_buffer[0] = '\0';
next_avail = kdb_buffer;
size_avail = sizeof(kdb_buffer);
- if ((buf1[0] == 'q') || (buf1[0] == 'Q')) {
+ if ((ch == 'q') || (ch == 'Q')) {
/* user hit q or Q */
KDB_FLAG_SET(CMD_INTERRUPT); /* command interrupted */
KDB_STATE_CLEAR(PAGER);
/* end of command output; back to normal mode */
kdb_grepping_flag = 0;
kdb_printf("\n");
- } else if (buf1[0] == ' ') {
+ } else if (ch == ' ') {
kdb_printf("\r");
suspend_grep = 1; /* for this recursion */
- } else if (buf1[0] == '\n') {
+ } else if (ch == '\n' || ch == '\r') {
kdb_nextline = linecount - 1;
kdb_printf("\r");
suspend_grep = 1; /* for this recursion */
- } else if (buf1[0] == '/' && !kdb_grepping_flag) {
+ } else if (ch == '/' && !kdb_grepping_flag) {
kdb_printf("\r");
kdb_getstr(kdb_grep_string, KDB_GREP_STRLEN,
kdbgetenv("SEARCHPROMPT") ?: "search> ");
*strchrnul(kdb_grep_string, '\n') = '\0';
kdb_grepping_flag += KDB_GREPPING_FLAG_SEARCH;
suspend_grep = 1; /* for this recursion */
- } else if (buf1[0] && buf1[0] != '\n') {
- /* user hit something other than enter */
+ } else if (ch) {
+ /* user hit something unexpected */
suspend_grep = 1; /* for this recursion */
- if (buf1[0] != '/')
+ if (ch != '/')
kdb_printf(
"\nOnly 'q', 'Q' or '/' are processed at "
"more prompt, input ignored\n");
@@ -837,8 +890,9 @@ kdb_printit:
*/
if (kdb_grepping_flag && !suspend_grep) {
*cphold = replaced_byte;
- strcpy(kdb_buffer, cphold);
- len = strlen(kdb_buffer);
+ len = strlen(cphold);
+ /* Use memmove() because the buffers overlap */
+ memmove(kdb_buffer, cphold, len + 1);
next_avail = kdb_buffer + len;
size_avail = sizeof(kdb_buffer) - len;
}
@@ -847,16 +901,9 @@ kdb_print_out:
suspend_grep = 0; /* end of what may have been a recursive call */
if (logging)
console_loglevel = saved_loglevel;
- if (KDB_STATE(PRINTF_LOCK) && got_printf_lock) {
- got_printf_lock = 0;
- spin_unlock_irqrestore(&kdb_printf_lock, flags);
- KDB_STATE_CLEAR(PRINTF_LOCK);
- atomic_dec(&kdb_event);
- } else {
- __release(kdb_printf_lock);
- }
- kdb_trap_printk = saved_trap_printk;
- preempt_enable();
+ /* kdb_printf_cpu locked the code above. */
+ smp_store_release(&kdb_printf_cpu, old_cpu);
+ local_irq_restore(flags);
return retlen;
}
diff --git a/kernel/debug/kdb/kdb_keyboard.c b/kernel/debug/kdb/kdb_keyboard.c
index 118527aa60ea..386d30e530b7 100644
--- a/kernel/debug/kdb/kdb_keyboard.c
+++ b/kernel/debug/kdb/kdb_keyboard.c
@@ -11,9 +11,10 @@
#include <linux/kdb.h>
#include <linux/keyboard.h>
#include <linux/ctype.h>
-#include <linux/module.h>
#include <linux/io.h>
+#include "kdb_private.h"
+
/* Keyboard Controller Registers on normal PCs. */
#define KBD_STATUS_REG 0x64 /* Status register (R) */
@@ -24,6 +25,8 @@
#define KBD_STAT_OBF 0x01 /* Keyboard output buffer full */
#define KBD_STAT_MOUSE_OBF 0x20 /* Mouse output buffer full */
+#define CTRL(c) ((c) - 64)
+
static int kbd_exists;
static int kbd_last_ret;
@@ -122,29 +125,26 @@ int kdb_get_kbd_char(void)
return 8;
}
- /* Special Key */
+ /* Translate special keys to equivalent CTRL control characters */
switch (scancode) {
case 0xF: /* Tab */
- return 9;
+ return CTRL('I');
case 0x53: /* Del */
- return 4;
+ return CTRL('D');
case 0x47: /* Home */
- return 1;
+ return CTRL('A');
case 0x4F: /* End */
- return 5;
+ return CTRL('E');
case 0x4B: /* Left */
- return 2;
+ return CTRL('B');
case 0x48: /* Up */
- return 16;
+ return CTRL('P');
case 0x50: /* Down */
- return 14;
+ return CTRL('N');
case 0x4D: /* Right */
- return 6;
+ return CTRL('F');
}
- if (scancode == 0xe0)
- return -1;
-
/*
* For Japanese 86/106 keyboards
* See comment in drivers/char/pc_keyb.c.
@@ -171,13 +171,26 @@ int kdb_get_kbd_char(void)
switch (KTYP(keychar)) {
case KT_LETTER:
case KT_LATIN:
+ switch (keychar) {
+ /* non-printable supported control characters */
+ case CTRL('A'): /* Home */
+ case CTRL('B'): /* Left */
+ case CTRL('D'): /* Del */
+ case CTRL('E'): /* End */
+ case CTRL('F'): /* Right */
+ case CTRL('I'): /* Tab */
+ case CTRL('N'): /* Down */
+ case CTRL('P'): /* Up */
+ return keychar;
+ }
+
if (isprint(keychar))
break; /* printable characters */
- /* drop through */
+ fallthrough;
case KT_SPEC:
if (keychar == K_ENTER)
break;
- /* drop through */
+ fallthrough;
default:
return -1; /* ignore unprintables */
}
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 4121345498e0..dddf2b5aad57 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -18,12 +18,13 @@
#include <linux/kmsg_dump.h>
#include <linux/reboot.h>
#include <linux/sched.h>
+#include <linux/sched/loadavg.h>
+#include <linux/sched/stat.h>
+#include <linux/sched/debug.h>
#include <linux/sysrq.h>
#include <linux/smp.h>
#include <linux/utsname.h>
#include <linux/vmalloc.h>
-#include <linux/atomic.h>
-#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/mm.h>
#include <linux/init.h>
@@ -42,6 +43,7 @@
#include <linux/proc_fs.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
+#include <linux/security.h>
#include "kdb_private.h"
#undef MODULE_PARAM_PREFIX
@@ -59,8 +61,7 @@ int kdb_grep_trailing;
/*
* Kernel debugger state flags
*/
-int kdb_flags;
-atomic_t kdb_event;
+unsigned int kdb_flags;
/*
* kdb_lock protects updates to kdb_initial_cpu. Used to
@@ -71,7 +72,6 @@ int kdb_nextline = 1;
int kdb_state; /* General KDB state */
struct task_struct *kdb_current_task;
-EXPORT_SYMBOL(kdb_current_task);
struct pt_regs *kdb_current_regs;
const char *kdb_diemsg;
@@ -83,15 +83,8 @@ static unsigned int kdb_continue_catastrophic =
static unsigned int kdb_continue_catastrophic;
#endif
-/* kdb_commands describes the available commands. */
-static kdbtab_t *kdb_commands;
-#define KDB_BASE_CMD_MAX 50
-static int kdb_max_commands = KDB_BASE_CMD_MAX;
-static kdbtab_t kdb_base_commands[KDB_BASE_CMD_MAX];
-#define for_each_kdbcmd(cmd, num) \
- for ((cmd) = kdb_base_commands, (num) = 0; \
- num < kdb_max_commands; \
- num++, num == KDB_BASE_CMD_MAX ? cmd = kdb_commands : cmd++)
+/* kdb_cmds_head describes the available commands. */
+static LIST_HEAD(kdb_cmds_head);
typedef struct _kdbmsg {
int km_diag; /* kdb diagnostic */
@@ -111,7 +104,7 @@ static kdbmsg_t kdbmsgs[] = {
KDBMSG(NOENVVALUE, "Environment variable should have value"),
KDBMSG(NOTIMP, "Command not implemented"),
KDBMSG(ENVFULL, "Environment full"),
- KDBMSG(ENVBUFFULL, "Environment buffer full"),
+ KDBMSG(KMALLOCFAILED, "Failed to allocate memory"),
KDBMSG(TOOMANYBPT, "Too many breakpoints defined"),
#ifdef CONFIG_CPU_XSCALE
KDBMSG(TOOMANYDBREGS, "More breakpoints than ibcr registers defined"),
@@ -136,70 +129,84 @@ static const int __nkdb_err = ARRAY_SIZE(kdbmsgs);
/*
- * Initial environment. This is all kept static and local to
- * this file. We don't want to rely on the memory allocation
- * mechanisms in the kernel, so we use a very limited allocate-only
- * heap for new and altered environment variables. The entire
- * environment is limited to a fixed number of entries (add more
- * to __env[] if required) and a fixed amount of heap (add more to
- * KDB_ENVBUFSIZE if required).
+ * Initial environment. This is all kept static and local to this file.
+ * The entire environment is limited to a fixed number of entries
+ * (add more to __env[] if required)
*/
-static char *__env[] = {
+static char *__env[31] = {
#if defined(CONFIG_SMP)
- "PROMPT=[%d]kdb> ",
+ "PROMPT=[%d]kdb> ",
#else
- "PROMPT=kdb> ",
+ "PROMPT=kdb> ",
#endif
- "MOREPROMPT=more> ",
- "RADIX=16",
- "MDCOUNT=8", /* lines of md output */
- KDB_PLATFORM_ENV,
- "DTABCOUNT=30",
- "NOSECT=1",
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
- (char *)0,
+ "MOREPROMPT=more> ",
+ "RADIX=16",
+ "MDCOUNT=8", /* lines of md output */
+ KDB_PLATFORM_ENV,
+ "DTABCOUNT=30",
+ "NOSECT=1",
};
static const int __nenv = ARRAY_SIZE(__env);
-struct task_struct *kdb_curr_task(int cpu)
+/*
+ * Update the permissions flags (kdb_cmd_enabled) to match the
+ * current lockdown state.
+ *
+ * Within this function the calls to security_locked_down() are "lazy". We
+ * avoid calling them if the current value of kdb_cmd_enabled already excludes
+ * flags that might be subject to lockdown. Additionally we deliberately check
+ * the lockdown flags independently (even though read lockdown implies write
+ * lockdown) since that results in both simpler code and clearer messages to
+ * the user on first-time debugger entry.
+ *
+ * The permission masks during a read+write lockdown permits the following
+ * flags: INSPECT, SIGNAL, REBOOT (and ALWAYS_SAFE).
+ *
+ * The INSPECT commands are not blocked during lockdown because they are
+ * not arbitrary memory reads. INSPECT covers the backtrace family (sometimes
+ * forcing them to have no arguments) and lsmod. These commands do expose
+ * some kernel state but do not allow the developer seated at the console to
+ * choose what state is reported. SIGNAL and REBOOT should not be controversial,
+ * given these are allowed for root during lockdown already.
+ */
+static void kdb_check_for_lockdown(void)
{
- struct task_struct *p = curr_task(cpu);
-#ifdef _TIF_MCA_INIT
- if ((task_thread_info(p)->flags & _TIF_MCA_INIT) && KDB_TSK(cpu))
- p = krp->p;
-#endif
- return p;
+ const int write_flags = KDB_ENABLE_MEM_WRITE |
+ KDB_ENABLE_REG_WRITE |
+ KDB_ENABLE_FLOW_CTRL;
+ const int read_flags = KDB_ENABLE_MEM_READ |
+ KDB_ENABLE_REG_READ;
+
+ bool need_to_lockdown_write = false;
+ bool need_to_lockdown_read = false;
+
+ if (kdb_cmd_enabled & (KDB_ENABLE_ALL | write_flags))
+ need_to_lockdown_write =
+ security_locked_down(LOCKDOWN_DBG_WRITE_KERNEL);
+
+ if (kdb_cmd_enabled & (KDB_ENABLE_ALL | read_flags))
+ need_to_lockdown_read =
+ security_locked_down(LOCKDOWN_DBG_READ_KERNEL);
+
+ /* De-compose KDB_ENABLE_ALL if required */
+ if (need_to_lockdown_write || need_to_lockdown_read)
+ if (kdb_cmd_enabled & KDB_ENABLE_ALL)
+ kdb_cmd_enabled = KDB_ENABLE_MASK & ~KDB_ENABLE_ALL;
+
+ if (need_to_lockdown_write)
+ kdb_cmd_enabled &= ~write_flags;
+
+ if (need_to_lockdown_read)
+ kdb_cmd_enabled &= ~read_flags;
}
/*
- * Check whether the flags of the current command and the permissions
- * of the kdb console has allow a command to be run.
+ * Check whether the flags of the current command, the permissions of the kdb
+ * console and the lockdown state allow a command to be run.
*/
-static inline bool kdb_check_flags(kdb_cmdflags_t flags, int permissions,
+static bool kdb_check_flags(kdb_cmdflags_t flags, int permissions,
bool no_args)
{
/* permissions comes from userspace so needs massaging slightly */
@@ -247,42 +254,12 @@ char *kdbgetenv(const char *match)
}
/*
- * kdballocenv - This function is used to allocate bytes for
- * environment entries.
- * Parameters:
- * match A character string representing a numeric value
- * Outputs:
- * *value the unsigned long representation of the env variable 'match'
- * Returns:
- * Zero on success, a kdb diagnostic on failure.
- * Remarks:
- * We use a static environment buffer (envbuffer) to hold the values
- * of dynamically generated environment variables (see kdb_set). Buffer
- * space once allocated is never free'd, so over time, the amount of space
- * (currently 512 bytes) will be exhausted if env variables are changed
- * frequently.
- */
-static char *kdballocenv(size_t bytes)
-{
-#define KDB_ENVBUFSIZE 512
- static char envbuffer[KDB_ENVBUFSIZE];
- static int envbufsize;
- char *ep = NULL;
-
- if ((KDB_ENVBUFSIZE - envbufsize) >= bytes) {
- ep = &envbuffer[envbufsize];
- envbufsize += bytes;
- }
- return ep;
-}
-
-/*
* kdbgetulenv - This function will return the value of an unsigned
* long-valued environment variable.
* Parameters:
* match A character string representing a numeric value
* Outputs:
- * *value the unsigned long represntation of the env variable 'match'
+ * *value the unsigned long representation of the env variable 'match'
* Returns:
* Zero on success, a kdb diagnostic on failure.
*/
@@ -295,8 +272,8 @@ static int kdbgetulenv(const char *match, unsigned long *value)
return KDB_NOTENV;
if (strlen(ep) == 0)
return KDB_NOENVVALUE;
-
- *value = simple_strtoul(ep, NULL, 0);
+ if (kstrtoul(ep, 0, value))
+ return KDB_BADINT;
return 0;
}
@@ -323,53 +300,84 @@ int kdbgetintenv(const char *match, int *value)
}
/*
+ * kdb_setenv() - Alter an existing environment variable or create a new one.
+ * @var: Name of the variable
+ * @val: Value of the variable
+ *
+ * Return: Zero on success, a kdb diagnostic on failure.
+ */
+static int kdb_setenv(const char *var, const char *val)
+{
+ int i;
+ char *ep;
+ size_t varlen, vallen;
+
+ varlen = strlen(var);
+ vallen = strlen(val);
+ ep = kmalloc(varlen + vallen + 2, GFP_KDB);
+ if (!ep)
+ return KDB_KMALLOCFAILED;
+
+ sprintf(ep, "%s=%s", var, val);
+
+ for (i = 0; i < __nenv; i++) {
+ if (__env[i]
+ && ((strncmp(__env[i], var, varlen) == 0)
+ && ((__env[i][varlen] == '\0')
+ || (__env[i][varlen] == '=')))) {
+ kfree_const(__env[i]);
+ __env[i] = ep;
+ return 0;
+ }
+ }
+
+ /*
+ * Wasn't existing variable. Fit into slot.
+ */
+ for (i = 0; i < __nenv-1; i++) {
+ if (__env[i] == (char *)0) {
+ __env[i] = ep;
+ return 0;
+ }
+ }
+
+ return KDB_ENVFULL;
+}
+
+/*
+ * kdb_printenv() - Display the current environment variables.
+ */
+static void kdb_printenv(void)
+{
+ int i;
+
+ for (i = 0; i < __nenv; i++) {
+ if (__env[i])
+ kdb_printf("%s\n", __env[i]);
+ }
+}
+
+/*
* kdbgetularg - This function will convert a numeric string into an
* unsigned long value.
* Parameters:
* arg A character string representing a numeric value
* Outputs:
- * *value the unsigned long represntation of arg.
+ * *value the unsigned long representation of arg.
* Returns:
* Zero on success, a kdb diagnostic on failure.
*/
int kdbgetularg(const char *arg, unsigned long *value)
{
- char *endp;
- unsigned long val;
-
- val = simple_strtoul(arg, &endp, 0);
-
- if (endp == arg) {
- /*
- * Also try base 16, for us folks too lazy to type the
- * leading 0x...
- */
- val = simple_strtoul(arg, &endp, 16);
- if (endp == arg)
- return KDB_BADINT;
- }
-
- *value = val;
-
+ if (kstrtoul(arg, 0, value))
+ return KDB_BADINT;
return 0;
}
int kdbgetu64arg(const char *arg, u64 *value)
{
- char *endp;
- u64 val;
-
- val = simple_strtoull(arg, &endp, 0);
-
- if (endp == arg) {
-
- val = simple_strtoull(arg, &endp, 16);
- if (endp == arg)
- return KDB_BADINT;
- }
-
- *value = val;
-
+ if (kstrtou64(arg, 0, value))
+ return KDB_BADINT;
return 0;
}
@@ -379,10 +387,6 @@ int kdbgetu64arg(const char *arg, u64 *value)
*/
int kdb_set(int argc, const char **argv)
{
- int i;
- char *ep;
- size_t varlen, vallen;
-
/*
* we can be invoked two ways:
* set var=value argv[1]="var", argv[2]="value"
@@ -398,20 +402,26 @@ int kdb_set(int argc, const char **argv)
return KDB_ARGCOUNT;
/*
+ * Censor sensitive variables
+ */
+ if (strcmp(argv[1], "PROMPT") == 0 &&
+ !kdb_check_flags(KDB_ENABLE_MEM_READ, kdb_cmd_enabled, false))
+ return KDB_NOPERM;
+
+ /*
* Check for internal variables
*/
if (strcmp(argv[1], "KDBDEBUG") == 0) {
unsigned int debugflags;
- char *cp;
+ int ret;
- debugflags = simple_strtoul(argv[2], &cp, 0);
- if (cp == argv[2] || debugflags & ~KDB_DEBUG_FLAG_MASK) {
+ ret = kstrtouint(argv[2], 0, &debugflags);
+ if (ret || debugflags & ~KDB_DEBUG_FLAG_MASK) {
kdb_printf("kdb: illegal debug flags '%s'\n",
argv[2]);
return 0;
}
- kdb_flags = (kdb_flags &
- ~(KDB_DEBUG_FLAG_MASK << KDB_DEBUG_FLAG_SHIFT))
+ kdb_flags = (kdb_flags & ~KDB_DEBUG(MASK))
| (debugflags << KDB_DEBUG_FLAG_SHIFT);
return 0;
@@ -421,37 +431,7 @@ int kdb_set(int argc, const char **argv)
* Tokenizer squashed the '=' sign. argv[1] is variable
* name, argv[2] = value.
*/
- varlen = strlen(argv[1]);
- vallen = strlen(argv[2]);
- ep = kdballocenv(varlen + vallen + 2);
- if (ep == (char *)0)
- return KDB_ENVBUFFULL;
-
- sprintf(ep, "%s=%s", argv[1], argv[2]);
-
- ep[varlen+vallen+1] = '\0';
-
- for (i = 0; i < __nenv; i++) {
- if (__env[i]
- && ((strncmp(__env[i], argv[1], varlen) == 0)
- && ((__env[i][varlen] == '\0')
- || (__env[i][varlen] == '=')))) {
- __env[i] = ep;
- return 0;
- }
- }
-
- /*
- * Wasn't existing variable. Fit into slot.
- */
- for (i = 0; i < __nenv-1; i++) {
- if (__env[i] == (char *)0) {
- __env[i] = ep;
- return 0;
- }
- }
-
- return KDB_ENVFULL;
+ return kdb_setenv(argv[1], argv[2]);
}
static int kdb_check_regs(void)
@@ -470,7 +450,7 @@ static int kdb_check_regs(void)
* symbol name, and offset to the caller.
*
* The argument may consist of a numeric value (decimal or
- * hexidecimal), a symbol name, a register name (preceded by the
+ * hexadecimal), a symbol name, a register name (preceded by the
* percent sign), an environment variable with a numeric value
* (preceded by a dollar sign) or a simple arithmetic expression
* consisting of a symbol name, +/-, and a numeric constant value
@@ -654,70 +634,73 @@ static void kdb_cmderror(int diag)
* Returns:
* zero for success, a kdb diagnostic if error
*/
-struct defcmd_set {
- int count;
- int usable;
- char *name;
- char *usage;
- char *help;
- char **command;
+struct kdb_macro {
+ kdbtab_t cmd; /* Macro command */
+ struct list_head statements; /* Associated statement list */
};
-static struct defcmd_set *defcmd_set;
-static int defcmd_set_count;
-static int defcmd_in_progress;
+
+struct kdb_macro_statement {
+ char *statement; /* Statement text */
+ struct list_head list_node; /* Statement list node */
+};
+
+static struct kdb_macro *kdb_macro;
+static bool defcmd_in_progress;
/* Forward references */
static int kdb_exec_defcmd(int argc, const char **argv);
static int kdb_defcmd2(const char *cmdstr, const char *argv0)
{
- struct defcmd_set *s = defcmd_set + defcmd_set_count - 1;
- char **save_command = s->command;
+ struct kdb_macro_statement *kms;
+
+ if (!kdb_macro)
+ return KDB_NOTIMP;
+
if (strcmp(argv0, "endefcmd") == 0) {
- defcmd_in_progress = 0;
- if (!s->count)
- s->usable = 0;
- if (s->usable)
- /* macros are always safe because when executed each
- * internal command re-enters kdb_parse() and is
- * safety checked individually.
- */
- kdb_register_flags(s->name, kdb_exec_defcmd, s->usage,
- s->help, 0,
- KDB_ENABLE_ALWAYS_SAFE);
+ defcmd_in_progress = false;
+ if (!list_empty(&kdb_macro->statements))
+ kdb_register(&kdb_macro->cmd);
return 0;
}
- if (!s->usable)
- return KDB_NOTIMP;
- s->command = kzalloc((s->count + 1) * sizeof(*(s->command)), GFP_KDB);
- if (!s->command) {
- kdb_printf("Could not allocate new kdb_defcmd table for %s\n",
+
+ kms = kmalloc(sizeof(*kms), GFP_KDB);
+ if (!kms) {
+ kdb_printf("Could not allocate new kdb macro command: %s\n",
cmdstr);
- s->usable = 0;
return KDB_NOTIMP;
}
- memcpy(s->command, save_command, s->count * sizeof(*(s->command)));
- s->command[s->count++] = kdb_strdup(cmdstr, GFP_KDB);
- kfree(save_command);
+
+ kms->statement = kdb_strdup(cmdstr, GFP_KDB);
+ list_add_tail(&kms->list_node, &kdb_macro->statements);
+
return 0;
}
static int kdb_defcmd(int argc, const char **argv)
{
- struct defcmd_set *save_defcmd_set = defcmd_set, *s;
+ kdbtab_t *mp;
+
if (defcmd_in_progress) {
kdb_printf("kdb: nested defcmd detected, assuming missing "
"endefcmd\n");
kdb_defcmd2("endefcmd", "endefcmd");
}
if (argc == 0) {
- int i;
- for (s = defcmd_set; s < defcmd_set + defcmd_set_count; ++s) {
- kdb_printf("defcmd %s \"%s\" \"%s\"\n", s->name,
- s->usage, s->help);
- for (i = 0; i < s->count; ++i)
- kdb_printf("%s", s->command[i]);
- kdb_printf("endefcmd\n");
+ kdbtab_t *kp;
+ struct kdb_macro *kmp;
+ struct kdb_macro_statement *kms;
+
+ list_for_each_entry(kp, &kdb_cmds_head, list_node) {
+ if (kp->func == kdb_exec_defcmd) {
+ kdb_printf("defcmd %s \"%s\" \"%s\"\n",
+ kp->name, kp->usage, kp->help);
+ kmp = container_of(kp, struct kdb_macro, cmd);
+ list_for_each_entry(kms, &kmp->statements,
+ list_node)
+ kdb_printf("%s", kms->statement);
+ kdb_printf("endefcmd\n");
+ }
}
return 0;
}
@@ -727,45 +710,35 @@ static int kdb_defcmd(int argc, const char **argv)
kdb_printf("Command only available during kdb_init()\n");
return KDB_NOTIMP;
}
- defcmd_set = kmalloc((defcmd_set_count + 1) * sizeof(*defcmd_set),
- GFP_KDB);
- if (!defcmd_set)
+ kdb_macro = kzalloc(sizeof(*kdb_macro), GFP_KDB);
+ if (!kdb_macro)
goto fail_defcmd;
- memcpy(defcmd_set, save_defcmd_set,
- defcmd_set_count * sizeof(*defcmd_set));
- s = defcmd_set + defcmd_set_count;
- memset(s, 0, sizeof(*s));
- s->usable = 1;
- s->name = kdb_strdup(argv[1], GFP_KDB);
- if (!s->name)
+
+ mp = &kdb_macro->cmd;
+ mp->func = kdb_exec_defcmd;
+ mp->minlen = 0;
+ mp->flags = KDB_ENABLE_ALWAYS_SAFE;
+ mp->name = kdb_strdup(argv[1], GFP_KDB);
+ if (!mp->name)
goto fail_name;
- s->usage = kdb_strdup(argv[2], GFP_KDB);
- if (!s->usage)
+ mp->usage = kdb_strdup_dequote(argv[2], GFP_KDB);
+ if (!mp->usage)
goto fail_usage;
- s->help = kdb_strdup(argv[3], GFP_KDB);
- if (!s->help)
+ mp->help = kdb_strdup_dequote(argv[3], GFP_KDB);
+ if (!mp->help)
goto fail_help;
- if (s->usage[0] == '"') {
- strcpy(s->usage, argv[2]+1);
- s->usage[strlen(s->usage)-1] = '\0';
- }
- if (s->help[0] == '"') {
- strcpy(s->help, argv[3]+1);
- s->help[strlen(s->help)-1] = '\0';
- }
- ++defcmd_set_count;
- defcmd_in_progress = 1;
- kfree(save_defcmd_set);
+
+ INIT_LIST_HEAD(&kdb_macro->statements);
+ defcmd_in_progress = true;
return 0;
fail_help:
- kfree(s->usage);
+ kfree(mp->usage);
fail_usage:
- kfree(s->name);
+ kfree(mp->name);
fail_name:
- kfree(defcmd_set);
+ kfree(kdb_macro);
fail_defcmd:
- kdb_printf("Could not allocate new defcmd_set entry for %s\n", argv[1]);
- defcmd_set = save_defcmd_set;
+ kdb_printf("Could not allocate new kdb_macro entry for %s\n", argv[1]);
return KDB_NOTIMP;
}
@@ -780,25 +753,31 @@ fail_defcmd:
*/
static int kdb_exec_defcmd(int argc, const char **argv)
{
- int i, ret;
- struct defcmd_set *s;
+ int ret;
+ kdbtab_t *kp;
+ struct kdb_macro *kmp;
+ struct kdb_macro_statement *kms;
+
if (argc != 0)
return KDB_ARGCOUNT;
- for (s = defcmd_set, i = 0; i < defcmd_set_count; ++i, ++s) {
- if (strcmp(s->name, argv[0]) == 0)
+
+ list_for_each_entry(kp, &kdb_cmds_head, list_node) {
+ if (strcmp(kp->name, argv[0]) == 0)
break;
}
- if (i == defcmd_set_count) {
+ if (list_entry_is_head(kp, &kdb_cmds_head, list_node)) {
kdb_printf("kdb_exec_defcmd: could not find commands for %s\n",
argv[0]);
return KDB_NOTIMP;
}
- for (i = 0; i < s->count; ++i) {
- /* Recursive use of kdb_parse, do not use argv after
- * this point */
+ kmp = container_of(kp, struct kdb_macro, cmd);
+ list_for_each_entry(kms, &kmp->statements, list_node) {
+ /*
+ * Recursive use of kdb_parse, do not use argv after this point.
+ */
argv = NULL;
- kdb_printf("[%s]kdb> %s\n", s->name, s->command[i]);
- ret = kdb_parse(s->command[i]);
+ kdb_printf("[%s]kdb> %s\n", kmp->cmd.name, kms->statement);
+ ret = kdb_parse(kms->statement);
if (ret)
return ret;
}
@@ -828,7 +807,7 @@ static void parse_grep(const char *str)
cp++;
while (isspace(*cp))
cp++;
- if (strncmp(cp, "grep ", 5)) {
+ if (!str_has_prefix(cp, "grep ")) {
kdb_printf("invalid 'pipe', see grephelp\n");
return;
}
@@ -873,7 +852,7 @@ static void parse_grep(const char *str)
kdb_printf("search string too long\n");
return;
}
- strcpy(kdb_grep_string, cp);
+ memcpy(kdb_grep_string, cp, len + 1);
kdb_grepping_flag++;
return;
}
@@ -894,7 +873,7 @@ static void parse_grep(const char *str)
* Limited to 20 tokens.
*
* Real rudimentary tokenization. Basically only whitespace
- * is considered a token delimeter (but special consideration
+ * is considered a token delimiter (but special consideration
* is taken of the '=' sign as used by the 'set' command).
*
* The algorithm used to tokenize the input string relies on
@@ -914,7 +893,7 @@ int kdb_parse(const char *cmdstr)
char *cp;
char *cpp, quoted;
kdbtab_t *tp;
- int i, escaped, ignore_errors = 0, check_grep = 0;
+ int escaped, ignore_errors = 0, check_grep = 0;
/*
* First tokenize the command string.
@@ -1004,25 +983,17 @@ int kdb_parse(const char *cmdstr)
++argv[0];
}
- for_each_kdbcmd(tp, i) {
- if (tp->cmd_name) {
- /*
- * If this command is allowed to be abbreviated,
- * check to see if this is it.
- */
-
- if (tp->cmd_minlen
- && (strlen(argv[0]) <= tp->cmd_minlen)) {
- if (strncmp(argv[0],
- tp->cmd_name,
- tp->cmd_minlen) == 0) {
- break;
- }
- }
+ list_for_each_entry(tp, &kdb_cmds_head, list_node) {
+ /*
+ * If this command is allowed to be abbreviated,
+ * check to see if this is it.
+ */
+ if (tp->minlen && (strlen(argv[0]) <= tp->minlen) &&
+ (strncmp(argv[0], tp->name, tp->minlen) == 0))
+ break;
- if (strcmp(argv[0], tp->cmd_name) == 0)
- break;
- }
+ if (strcmp(argv[0], tp->name) == 0)
+ break;
}
/*
@@ -1030,34 +1001,29 @@ int kdb_parse(const char *cmdstr)
* few characters of this match any of the known commands.
* e.g., md1c20 should match md.
*/
- if (i == kdb_max_commands) {
- for_each_kdbcmd(tp, i) {
- if (tp->cmd_name) {
- if (strncmp(argv[0],
- tp->cmd_name,
- strlen(tp->cmd_name)) == 0) {
- break;
- }
- }
+ if (list_entry_is_head(tp, &kdb_cmds_head, list_node)) {
+ list_for_each_entry(tp, &kdb_cmds_head, list_node) {
+ if (strncmp(argv[0], tp->name, strlen(tp->name)) == 0)
+ break;
}
}
- if (i < kdb_max_commands) {
+ if (!list_entry_is_head(tp, &kdb_cmds_head, list_node)) {
int result;
- if (!kdb_check_flags(tp->cmd_flags, kdb_cmd_enabled, argc <= 1))
+ if (!kdb_check_flags(tp->flags, kdb_cmd_enabled, argc <= 1))
return KDB_NOPERM;
KDB_STATE_SET(CMD);
- result = (*tp->cmd_func)(argc-1, (const char **)argv);
+ result = (*tp->func)(argc-1, (const char **)argv);
if (result && ignore_errors && result > KDB_CMD_GO)
result = 0;
KDB_STATE_CLEAR(CMD);
- if (tp->cmd_flags & KDB_REPEAT_WITH_ARGS)
+ if (tp->flags & KDB_REPEAT_WITH_ARGS)
return result;
- argc = tp->cmd_flags & KDB_REPEAT_NO_ARGS ? 1 : 0;
+ argc = tp->flags & KDB_REPEAT_NO_ARGS ? 1 : 0;
if (argv[argc])
*(argv[argc]) = '\0';
return result;
@@ -1100,13 +1066,14 @@ static int handle_ctrl_cmd(char *cmd)
switch (*cmd) {
case CTRL_P:
if (cmdptr != cmd_tail)
- cmdptr = (cmdptr-1) % KDB_CMD_HISTORY_COUNT;
- strncpy(cmd_cur, cmd_hist[cmdptr], CMD_BUFLEN);
+ cmdptr = (cmdptr + KDB_CMD_HISTORY_COUNT - 1) %
+ KDB_CMD_HISTORY_COUNT;
+ strscpy(cmd_cur, cmd_hist[cmdptr], CMD_BUFLEN);
return 1;
case CTRL_N:
if (cmdptr != cmd_head)
cmdptr = (cmdptr+1) % KDB_CMD_HISTORY_COUNT;
- strncpy(cmd_cur, cmd_hist[cmdptr], CMD_BUFLEN);
+ strscpy(cmd_cur, cmd_hist[cmdptr], CMD_BUFLEN);
return 1;
}
return 0;
@@ -1137,7 +1104,7 @@ static void kdb_dumpregs(struct pt_regs *regs)
console_loglevel = old_lvl;
}
-void kdb_set_current_task(struct task_struct *p)
+static void kdb_set_current_task(struct task_struct *p)
{
kdb_current_task = p;
@@ -1148,6 +1115,16 @@ void kdb_set_current_task(struct task_struct *p)
kdb_current_regs = NULL;
}
+static void drop_newline(char *buf)
+{
+ size_t len = strlen(buf);
+
+ if (len == 0)
+ return;
+ if (*(buf + len - 1) == '\n')
+ *(buf + len - 1) = '\0';
+}
+
/*
* kdb_local - The main code for kdb. This routine is invoked on a
* specific processor, it is not global. The main kdb() routine
@@ -1173,14 +1150,17 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
char *cmdbuf;
int diag;
struct task_struct *kdb_current =
- kdb_curr_task(raw_smp_processor_id());
+ curr_task(raw_smp_processor_id());
KDB_DEBUG_STATE("kdb_local 1", reason);
+
+ kdb_check_for_lockdown();
+
kdb_go_count = 0;
if (reason == KDB_REASON_DEBUG) {
/* special case below */
} else {
- kdb_printf("\nEntering kdb (current=0x%p, pid %d) ",
+ kdb_printf("\nEntering kdb (current=0x%px, pid %d) ",
kdb_current, kdb_current ? kdb_current->pid : 0);
#if defined(CONFIG_SMP)
kdb_printf("on processor %d ", raw_smp_processor_id());
@@ -1196,7 +1176,7 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
*/
switch (db_result) {
case KDB_DB_BPT:
- kdb_printf("\nEntering kdb (0x%p, pid %d) ",
+ kdb_printf("\nEntering kdb (0x%px, pid %d) ",
kdb_current, kdb_current->pid);
#if defined(CONFIG_SMP)
kdb_printf("on processor %d ", raw_smp_processor_id());
@@ -1287,14 +1267,9 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
*(cmd_hist[cmd_head]) = '\0';
do_full_getstr:
-#if defined(CONFIG_SMP)
+ /* PROMPT can only be set if we have MEM_READ permission. */
snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"),
raw_smp_processor_id());
-#else
- snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"));
-#endif
- if (defcmd_in_progress)
- strncat(kdb_prompt_str, "[defcmd]", CMD_BUFLEN);
/*
* Fetch command from keyboard
@@ -1303,7 +1278,7 @@ do_full_getstr:
if (*cmdbuf != '\n') {
if (*cmdbuf < 32) {
if (cmdptr == cmd_head) {
- strncpy(cmd_hist[cmd_head], cmd_cur,
+ strscpy(cmd_hist[cmd_head], cmd_cur,
CMD_BUFLEN);
*(cmd_hist[cmd_head] +
strlen(cmd_hist[cmd_head])-1) = '\0';
@@ -1313,7 +1288,7 @@ do_full_getstr:
cmdbuf = cmd_cur;
goto do_full_getstr;
} else {
- strncpy(cmd_hist[cmd_head], cmd_cur,
+ strscpy(cmd_hist[cmd_head], cmd_cur,
CMD_BUFLEN);
}
@@ -1325,6 +1300,7 @@ do_full_getstr:
cmdptr = cmd_head;
diag = kdb_parse(cmdbuf);
if (diag == KDB_NOTFOUND) {
+ drop_newline(cmdbuf);
kdb_printf("Unknown kdb command: '%s'\n", cmdbuf);
diag = 0;
}
@@ -1480,6 +1456,7 @@ static void kdb_md_line(const char *fmtstr, unsigned long addr,
char cbuf[32];
char *c = cbuf;
int i;
+ int j;
unsigned long word;
memset(cbuf, '\0', sizeof(cbuf));
@@ -1525,25 +1502,9 @@ static void kdb_md_line(const char *fmtstr, unsigned long addr,
wc.word = word;
#define printable_char(c) \
({unsigned char __c = c; isascii(__c) && isprint(__c) ? __c : '.'; })
- switch (bytesperword) {
- case 8:
- *c++ = printable_char(*cp++);
+ for (j = 0; j < bytesperword; j++)
*c++ = printable_char(*cp++);
- *c++ = printable_char(*cp++);
- *c++ = printable_char(*cp++);
- addr += 4;
- case 4:
- *c++ = printable_char(*cp++);
- *c++ = printable_char(*cp++);
- addr += 2;
- case 2:
- *c++ = printable_char(*cp++);
- addr++;
- case 1:
- *c++ = printable_char(*cp++);
- addr++;
- break;
- }
+ addr += bytesperword;
#undef printable_char
}
}
@@ -1564,6 +1525,7 @@ static int kdb_md(int argc, const char **argv)
int symbolic = 0;
int valid = 0;
int phys = 0;
+ int raw = 0;
kdbgetintenv("MDCOUNT", &mdcount);
kdbgetintenv("RADIX", &radix);
@@ -1573,9 +1535,10 @@ static int kdb_md(int argc, const char **argv)
repeat = mdcount * 16 / bytesperword;
if (strcmp(argv[0], "mdr") == 0) {
- if (argc != 2)
+ if (argc == 2 || (argc == 0 && last_addr != 0))
+ valid = raw = 1;
+ else
return KDB_ARGCOUNT;
- valid = 1;
} else if (isdigit(argv[0][2])) {
bytesperword = (int)(argv[0][2] - '0');
if (bytesperword == 0) {
@@ -1588,10 +1551,10 @@ static int kdb_md(int argc, const char **argv)
if (!argv[0][3])
valid = 1;
else if (argv[0][3] == 'c' && argv[0][4]) {
- char *p;
- repeat = simple_strtoul(argv[0] + 4, &p, 10);
+ if (kstrtouint(argv[0] + 4, 10, &repeat))
+ return KDB_BADINT;
mdcount = ((repeat * bytesperword) + 15) / 16;
- valid = !*p;
+ valid = 1;
}
last_repeat = repeat;
} else if (strcmp(argv[0], "md") == 0)
@@ -1611,7 +1574,10 @@ static int kdb_md(int argc, const char **argv)
radix = last_radix;
bytesperword = last_bytesperword;
repeat = last_repeat;
- mdcount = ((repeat * bytesperword) + 15) / 16;
+ if (raw)
+ mdcount = repeat;
+ else
+ mdcount = ((repeat * bytesperword) + 15) / 16;
}
if (argc) {
@@ -1628,7 +1594,10 @@ static int kdb_md(int argc, const char **argv)
diag = kdbgetularg(argv[nextarg], &val);
if (!diag) {
mdcount = (int) val;
- repeat = mdcount * 16 / bytesperword;
+ if (raw)
+ repeat = mdcount;
+ else
+ repeat = mdcount * 16 / bytesperword;
}
}
if (argc >= nextarg+1) {
@@ -1638,8 +1607,15 @@ static int kdb_md(int argc, const char **argv)
}
}
- if (strcmp(argv[0], "mdr") == 0)
- return kdb_mdr(addr, mdcount);
+ if (strcmp(argv[0], "mdr") == 0) {
+ int ret;
+ last_addr = addr;
+ ret = kdb_mdr(addr, mdcount);
+ last_addr += mdcount;
+ last_repeat = mdcount;
+ last_bytesperword = bytesperword; // to make REPEAT happy
+ return ret;
+ }
switch (radix) {
case 10:
@@ -2002,54 +1978,6 @@ static int kdb_ef(int argc, const char **argv)
return 0;
}
-#if defined(CONFIG_MODULES)
-/*
- * kdb_lsmod - This function implements the 'lsmod' command. Lists
- * currently loaded kernel modules.
- * Mostly taken from userland lsmod.
- */
-static int kdb_lsmod(int argc, const char **argv)
-{
- struct module *mod;
-
- if (argc != 0)
- return KDB_ARGCOUNT;
-
- kdb_printf("Module Size modstruct Used by\n");
- list_for_each_entry(mod, kdb_modules, list) {
- if (mod->state == MODULE_STATE_UNFORMED)
- continue;
-
- kdb_printf("%-20s%8u 0x%p ", mod->name,
- mod->core_size, (void *)mod);
-#ifdef CONFIG_MODULE_UNLOAD
- kdb_printf("%4d ", module_refcount(mod));
-#endif
- if (mod->state == MODULE_STATE_GOING)
- kdb_printf(" (Unloading)");
- else if (mod->state == MODULE_STATE_COMING)
- kdb_printf(" (Loading)");
- else
- kdb_printf(" (Live)");
- kdb_printf(" 0x%p", mod->module_core);
-
-#ifdef CONFIG_MODULE_UNLOAD
- {
- struct module_use *use;
- kdb_printf(" [ ");
- list_for_each_entry(use, &mod->source_list,
- source_list)
- kdb_printf("%s ", use->target->name);
- kdb_printf("]\n");
- }
-#endif
- }
-
- return 0;
-}
-
-#endif /* CONFIG_MODULES */
-
/*
* kdb_env - This function implements the 'env' command. Display the
* current environment variables.
@@ -2057,15 +1985,11 @@ static int kdb_lsmod(int argc, const char **argv)
static int kdb_env(int argc, const char **argv)
{
- int i;
-
- for (i = 0; i < __nenv; i++) {
- if (__env[i])
- kdb_printf("%s\n", __env[i]);
- }
+ kdb_printenv();
if (KDB_DEBUG(MASK))
- kdb_printf("KDBFLAGS=0x%x\n", kdb_flags);
+ kdb_printf("KDBDEBUG=0x%x\n",
+ (kdb_flags & KDB_DEBUG(MASK)) >> KDB_DEBUG_FLAG_SHIFT);
return 0;
}
@@ -2084,22 +2008,17 @@ static int kdb_dmesg(int argc, const char **argv)
int adjust = 0;
int n = 0;
int skip = 0;
- struct kmsg_dumper dumper = { .active = 1 };
+ struct kmsg_dump_iter iter;
size_t len;
char buf[201];
if (argc > 2)
return KDB_ARGCOUNT;
if (argc) {
- char *cp;
- lines = simple_strtol(argv[1], &cp, 0);
- if (*cp)
+ if (kstrtoint(argv[1], 0, &lines))
lines = 0;
- if (argc > 1) {
- adjust = simple_strtoul(argv[2], &cp, 0);
- if (*cp || adjust < 0)
- adjust = 0;
- }
+ if (argc > 1 && (kstrtoint(argv[2], 0, &adjust) || adjust < 0))
+ adjust = 0;
}
/* disable LOGGING if set */
@@ -2109,8 +2028,8 @@ static int kdb_dmesg(int argc, const char **argv)
kdb_set(2, setargs);
}
- kmsg_dump_rewind_nolock(&dumper);
- while (kmsg_dump_get_line_nolock(&dumper, 1, NULL, 0, NULL))
+ kmsg_dump_rewind(&iter);
+ while (kmsg_dump_get_line(&iter, 1, NULL, 0, NULL))
n++;
if (lines < 0) {
@@ -2142,8 +2061,8 @@ static int kdb_dmesg(int argc, const char **argv)
if (skip >= n || skip < 0)
return 0;
- kmsg_dump_rewind_nolock(&dumper);
- while (kmsg_dump_get_line_nolock(&dumper, 1, buf, sizeof(buf), &len)) {
+ kmsg_dump_rewind(&iter);
+ while (kmsg_dump_get_line(&iter, 1, buf, sizeof(buf), &len)) {
if (skip) {
skip--;
continue;
@@ -2159,32 +2078,6 @@ static int kdb_dmesg(int argc, const char **argv)
return 0;
}
#endif /* CONFIG_PRINTK */
-
-/* Make sure we balance enable/disable calls, must disable first. */
-static atomic_t kdb_nmi_disabled;
-
-static int kdb_disable_nmi(int argc, const char *argv[])
-{
- if (atomic_read(&kdb_nmi_disabled))
- return 0;
- atomic_set(&kdb_nmi_disabled, 1);
- arch_kgdb_ops.enable_nmi(0);
- return 0;
-}
-
-static int kdb_param_enable_nmi(const char *val, const struct kernel_param *kp)
-{
- if (!atomic_add_unless(&kdb_nmi_disabled, -1, 0))
- return -EINVAL;
- arch_kgdb_ops.enable_nmi(1);
- return 0;
-}
-
-static const struct kernel_param_ops kdb_param_ops_enable_nmi = {
- .set = kdb_param_enable_nmi,
-};
-module_param_cb(enable_nmi, &kdb_param_ops_enable_nmi, NULL, 0600);
-
/*
* kdb_cpu - This function implements the 'cpu' command.
* cpu [<cpunum>]
@@ -2205,8 +2098,8 @@ static void kdb_cpu_status(void)
state = 'D'; /* cpu is online but unresponsive */
} else {
state = ' '; /* cpu is responding to kdb */
- if (kdb_task_state_char(KDB_TSK(i)) == 'I')
- state = 'I'; /* idle task */
+ if (kdb_task_state_char(KDB_TSK(i)) == '-')
+ state = '-'; /* idle task */
}
if (state != prev_state) {
if (prev_state != '?') {
@@ -2273,52 +2166,46 @@ static int kdb_cpu(int argc, const char **argv)
void kdb_ps_suppressed(void)
{
int idle = 0, daemon = 0;
- unsigned long mask_I = kdb_task_state_string("I"),
- mask_M = kdb_task_state_string("M");
unsigned long cpu;
const struct task_struct *p, *g;
for_each_online_cpu(cpu) {
- p = kdb_curr_task(cpu);
- if (kdb_task_state(p, mask_I))
+ p = curr_task(cpu);
+ if (kdb_task_state(p, "-"))
++idle;
}
- kdb_do_each_thread(g, p) {
- if (kdb_task_state(p, mask_M))
+ for_each_process_thread(g, p) {
+ if (kdb_task_state(p, "ims"))
++daemon;
- } kdb_while_each_thread(g, p);
+ }
if (idle || daemon) {
if (idle)
- kdb_printf("%d idle process%s (state I)%s\n",
+ kdb_printf("%d idle process%s (state -)%s\n",
idle, idle == 1 ? "" : "es",
daemon ? " and " : "");
if (daemon)
- kdb_printf("%d sleeping system daemon (state M) "
+ kdb_printf("%d sleeping system daemon (state [ims]) "
"process%s", daemon,
daemon == 1 ? "" : "es");
kdb_printf(" suppressed,\nuse 'ps A' to see all.\n");
}
}
-/*
- * kdb_ps - This function implements the 'ps' command which shows a
- * list of the active processes.
- * ps [DRSTCZEUIMA] All processes, optionally filtered by state
- */
void kdb_ps1(const struct task_struct *p)
{
int cpu;
unsigned long tmp;
- if (!p || probe_kernel_read(&tmp, (char *)p, sizeof(unsigned long)))
+ if (!p ||
+ copy_from_kernel_nofault(&tmp, (char *)p, sizeof(unsigned long)))
return;
cpu = kdb_process_cpu(p);
- kdb_printf("0x%p %8d %8d %d %4d %c 0x%p %c%s\n",
+ kdb_printf("0x%px %8d %8d %d %4d %c 0x%px %c%s\n",
(void *)p, p->pid, p->parent->pid,
kdb_task_has_cpu(p), kdb_process_cpu(p),
kdb_task_state_char(p),
(void *)(&p->thread),
- p == kdb_curr_task(raw_smp_processor_id()) ? '*' : ' ',
+ p == curr_task(raw_smp_processor_id()) ? '*' : ' ',
p->comm);
if (kdb_task_has_cpu(p)) {
if (!KDB_TSK(cpu)) {
@@ -2326,38 +2213,46 @@ void kdb_ps1(const struct task_struct *p)
} else {
if (KDB_TSK(cpu) != p)
kdb_printf(" Error: does not match running "
- "process table (0x%p)\n", KDB_TSK(cpu));
+ "process table (0x%px)\n", KDB_TSK(cpu));
}
}
}
+/*
+ * kdb_ps - This function implements the 'ps' command which shows a
+ * list of the active processes.
+ *
+ * ps [<state_chars>] Show processes, optionally selecting only those whose
+ * state character is found in <state_chars>.
+ */
static int kdb_ps(int argc, const char **argv)
{
struct task_struct *g, *p;
- unsigned long mask, cpu;
+ const char *mask;
+ unsigned long cpu;
if (argc == 0)
kdb_ps_suppressed();
kdb_printf("%-*s Pid Parent [*] cpu State %-*s Command\n",
(int)(2*sizeof(void *))+2, "Task Addr",
(int)(2*sizeof(void *))+2, "Thread");
- mask = kdb_task_state_string(argc ? argv[1] : NULL);
+ mask = argc ? argv[1] : kdbgetenv("PS");
/* Run the active tasks first */
for_each_online_cpu(cpu) {
if (KDB_FLAG(CMD_INTERRUPT))
return 0;
- p = kdb_curr_task(cpu);
+ p = curr_task(cpu);
if (kdb_task_state(p, mask))
kdb_ps1(p);
}
kdb_printf("\n");
/* Now the real tasks */
- kdb_do_each_thread(g, p) {
+ for_each_process_thread(g, p) {
if (KDB_FLAG(CMD_INTERRUPT))
return 0;
if (kdb_task_state(p, mask))
kdb_ps1(p);
- } kdb_while_each_thread(g, p);
+ }
return 0;
}
@@ -2410,23 +2305,20 @@ static int kdb_kgdb(int argc, const char **argv)
static int kdb_help(int argc, const char **argv)
{
kdbtab_t *kt;
- int i;
kdb_printf("%-15.15s %-20.20s %s\n", "Command", "Usage", "Description");
kdb_printf("-----------------------------"
"-----------------------------\n");
- for_each_kdbcmd(kt, i) {
+ list_for_each_entry(kt, &kdb_cmds_head, list_node) {
char *space = "";
if (KDB_FLAG(CMD_INTERRUPT))
return 0;
- if (!kt->cmd_name)
- continue;
- if (!kdb_check_flags(kt->cmd_flags, kdb_cmd_enabled, true))
+ if (!kdb_check_flags(kt->flags, kdb_cmd_enabled, true))
continue;
- if (strlen(kt->cmd_usage) > 20)
+ if (strlen(kt->usage) > 20)
space = "\n ";
- kdb_printf("%-15.15s %-20s%s%s\n", kt->cmd_name,
- kt->cmd_usage, space, kt->cmd_help);
+ kdb_printf("%-15.15s %-20s%s%s\n", kt->name,
+ kt->usage, space, kt->help);
}
return 0;
}
@@ -2437,24 +2329,20 @@ static int kdb_help(int argc, const char **argv)
static int kdb_kill(int argc, const char **argv)
{
long sig, pid;
- char *endp;
struct task_struct *p;
- struct siginfo info;
if (argc != 2)
return KDB_ARGCOUNT;
- sig = simple_strtol(argv[1], &endp, 0);
- if (*endp)
+ if (kstrtol(argv[1], 0, &sig))
return KDB_BADINT;
- if (sig >= 0) {
+ if ((sig >= 0) || !valid_signal(-sig)) {
kdb_printf("Invalid signal parameter.<-signal>\n");
return 0;
}
sig = -sig;
- pid = simple_strtol(argv[2], &endp, 0);
- if (*endp)
+ if (kstrtol(argv[2], 0, &pid))
return KDB_BADINT;
if (pid <= 0) {
kdb_printf("Process ID must be large than 0.\n");
@@ -2468,50 +2356,10 @@ static int kdb_kill(int argc, const char **argv)
return 0;
}
p = p->group_leader;
- info.si_signo = sig;
- info.si_errno = 0;
- info.si_code = SI_USER;
- info.si_pid = pid; /* same capabilities as process being signalled */
- info.si_uid = 0; /* kdb has root authority */
- kdb_send_sig_info(p, &info);
+ kdb_send_sig(p, sig);
return 0;
}
-struct kdb_tm {
- int tm_sec; /* seconds */
- int tm_min; /* minutes */
- int tm_hour; /* hours */
- int tm_mday; /* day of the month */
- int tm_mon; /* month */
- int tm_year; /* year */
-};
-
-static void kdb_gmtime(struct timespec *tv, struct kdb_tm *tm)
-{
- /* This will work from 1970-2099, 2100 is not a leap year */
- static int mon_day[] = { 31, 29, 31, 30, 31, 30, 31,
- 31, 30, 31, 30, 31 };
- memset(tm, 0, sizeof(*tm));
- tm->tm_sec = tv->tv_sec % (24 * 60 * 60);
- tm->tm_mday = tv->tv_sec / (24 * 60 * 60) +
- (2 * 365 + 1); /* shift base from 1970 to 1968 */
- tm->tm_min = tm->tm_sec / 60 % 60;
- tm->tm_hour = tm->tm_sec / 60 / 60;
- tm->tm_sec = tm->tm_sec % 60;
- tm->tm_year = 68 + 4*(tm->tm_mday / (4*365+1));
- tm->tm_mday %= (4*365+1);
- mon_day[1] = 29;
- while (tm->tm_mday >= mon_day[tm->tm_mon]) {
- tm->tm_mday -= mon_day[tm->tm_mon];
- if (++tm->tm_mon == 12) {
- tm->tm_mon = 0;
- ++tm->tm_year;
- mon_day[1] = 28;
- }
- }
- ++tm->tm_mday;
-}
-
/*
* Most of this code has been lifted from kernel/timer.c::sys_sysinfo().
* I cannot call that code directly from kdb, it has an unconditional
@@ -2519,10 +2367,10 @@ static void kdb_gmtime(struct timespec *tv, struct kdb_tm *tm)
*/
static void kdb_sysinfo(struct sysinfo *val)
{
- struct timespec uptime;
- ktime_get_ts(&uptime);
+ u64 uptime = ktime_get_mono_fast_ns();
+
memset(val, 0, sizeof(*val));
- val->uptime = uptime.tv_sec;
+ val->uptime = div_u64(uptime, NSEC_PER_SEC);
val->loads[0] = avenrun[0];
val->loads[1] = avenrun[1];
val->loads[2] = avenrun[2];
@@ -2537,8 +2385,7 @@ static void kdb_sysinfo(struct sysinfo *val)
*/
static int kdb_summary(int argc, const char **argv)
{
- struct timespec now;
- struct kdb_tm tm;
+ time64_t now;
struct sysinfo val;
if (argc)
@@ -2550,35 +2397,23 @@ static int kdb_summary(int argc, const char **argv)
kdb_printf("machine %s\n", init_uts_ns.name.machine);
kdb_printf("nodename %s\n", init_uts_ns.name.nodename);
kdb_printf("domainname %s\n", init_uts_ns.name.domainname);
- kdb_printf("ccversion %s\n", __stringify(CCVERSION));
-
- now = __current_kernel_time();
- kdb_gmtime(&now, &tm);
- kdb_printf("date %04d-%02d-%02d %02d:%02d:%02d "
- "tz_minuteswest %d\n",
- 1900+tm.tm_year, tm.tm_mon+1, tm.tm_mday,
- tm.tm_hour, tm.tm_min, tm.tm_sec,
- sys_tz.tz_minuteswest);
+ now = __ktime_get_real_seconds();
+ kdb_printf("date %ptTs tz_minuteswest %d\n", &now, sys_tz.tz_minuteswest);
kdb_sysinfo(&val);
kdb_printf("uptime ");
if (val.uptime > (24*60*60)) {
int days = val.uptime / (24*60*60);
val.uptime %= (24*60*60);
- kdb_printf("%d day%s ", days, days == 1 ? "" : "s");
+ kdb_printf("%d day%s ", days, str_plural(days));
}
kdb_printf("%02ld:%02ld\n", val.uptime/(60*60), (val.uptime/60)%60);
- /* lifted from fs/proc/proc_misc.c::loadavg_read_proc() */
-
-#define LOAD_INT(x) ((x) >> FSHIFT)
-#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
kdb_printf("load avg %ld.%02ld %ld.%02ld %ld.%02ld\n",
LOAD_INT(val.loads[0]), LOAD_FRAC(val.loads[0]),
LOAD_INT(val.loads[1]), LOAD_FRAC(val.loads[1]),
LOAD_INT(val.loads[2]), LOAD_FRAC(val.loads[2]));
-#undef LOAD_INT
-#undef LOAD_FRAC
+
/* Display in kilobytes */
#define K(x) ((x) << (PAGE_SHIFT - 10))
kdb_printf("\nMemTotal: %8lu kB\nMemFree: %8lu kB\n"
@@ -2617,7 +2452,7 @@ static int kdb_per_cpu(int argc, const char **argv)
diag = kdbgetularg(argv[3], &whichcpu);
if (diag)
return diag;
- if (!cpu_online(whichcpu)) {
+ if (whichcpu >= nr_cpu_ids || !cpu_online(whichcpu)) {
kdb_printf("cpu %ld is not online\n", whichcpu);
return KDB_BADCPUNUM;
}
@@ -2676,243 +2511,268 @@ static int kdb_grep_help(int argc, const char **argv)
return 0;
}
-/*
- * kdb_register_flags - This function is used to register a kernel
- * debugger command.
- * Inputs:
- * cmd Command name
- * func Function to execute the command
- * usage A simple usage string showing arguments
- * help A simple help string describing command
- * repeat Does the command auto repeat on enter?
- * Returns:
- * zero for success, one if a duplicate command.
+/**
+ * kdb_register() - This function is used to register a kernel debugger
+ * command.
+ * @cmd: pointer to kdb command
+ *
+ * Note that it's the job of the caller to keep the memory for the cmd
+ * allocated until unregister is called.
*/
-#define kdb_command_extend 50 /* arbitrary */
-int kdb_register_flags(char *cmd,
- kdb_func_t func,
- char *usage,
- char *help,
- short minlen,
- kdb_cmdflags_t flags)
+int kdb_register(kdbtab_t *cmd)
{
- int i;
kdbtab_t *kp;
- /*
- * Brute force method to determine duplicates
- */
- for_each_kdbcmd(kp, i) {
- if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) {
- kdb_printf("Duplicate kdb command registered: "
- "%s, func %p help %s\n", cmd, func, help);
- return 1;
- }
- }
-
- /*
- * Insert command into first available location in table
- */
- for_each_kdbcmd(kp, i) {
- if (kp->cmd_name == NULL)
- break;
- }
-
- if (i >= kdb_max_commands) {
- kdbtab_t *new = kmalloc((kdb_max_commands - KDB_BASE_CMD_MAX +
- kdb_command_extend) * sizeof(*new), GFP_KDB);
- if (!new) {
- kdb_printf("Could not allocate new kdb_command "
- "table\n");
+ list_for_each_entry(kp, &kdb_cmds_head, list_node) {
+ if (strcmp(kp->name, cmd->name) == 0) {
+ kdb_printf("Duplicate kdb cmd: %s, func %p help %s\n",
+ cmd->name, cmd->func, cmd->help);
return 1;
}
- if (kdb_commands) {
- memcpy(new, kdb_commands,
- (kdb_max_commands - KDB_BASE_CMD_MAX) * sizeof(*new));
- kfree(kdb_commands);
- }
- memset(new + kdb_max_commands - KDB_BASE_CMD_MAX, 0,
- kdb_command_extend * sizeof(*new));
- kdb_commands = new;
- kp = kdb_commands + kdb_max_commands - KDB_BASE_CMD_MAX;
- kdb_max_commands += kdb_command_extend;
}
- kp->cmd_name = cmd;
- kp->cmd_func = func;
- kp->cmd_usage = usage;
- kp->cmd_help = help;
- kp->cmd_minlen = minlen;
- kp->cmd_flags = flags;
-
+ list_add_tail(&cmd->list_node, &kdb_cmds_head);
return 0;
}
-EXPORT_SYMBOL_GPL(kdb_register_flags);
-
+EXPORT_SYMBOL_GPL(kdb_register);
-/*
- * kdb_register - Compatibility register function for commands that do
- * not need to specify a repeat state. Equivalent to
- * kdb_register_flags with flags set to 0.
- * Inputs:
- * cmd Command name
- * func Function to execute the command
- * usage A simple usage string showing arguments
- * help A simple help string describing command
- * Returns:
- * zero for success, one if a duplicate command.
+/**
+ * kdb_register_table() - This function is used to register a kdb command
+ * table.
+ * @kp: pointer to kdb command table
+ * @len: length of kdb command table
*/
-int kdb_register(char *cmd,
- kdb_func_t func,
- char *usage,
- char *help,
- short minlen)
+void kdb_register_table(kdbtab_t *kp, size_t len)
{
- return kdb_register_flags(cmd, func, usage, help, minlen, 0);
+ while (len--) {
+ list_add_tail(&kp->list_node, &kdb_cmds_head);
+ kp++;
+ }
}
-EXPORT_SYMBOL_GPL(kdb_register);
-/*
- * kdb_unregister - This function is used to unregister a kernel
- * debugger command. It is generally called when a module which
- * implements kdb commands is unloaded.
- * Inputs:
- * cmd Command name
- * Returns:
- * zero for success, one command not registered.
+/**
+ * kdb_unregister() - This function is used to unregister a kernel debugger
+ * command. It is generally called when a module which
+ * implements kdb command is unloaded.
+ * @cmd: pointer to kdb command
*/
-int kdb_unregister(char *cmd)
+void kdb_unregister(kdbtab_t *cmd)
{
- int i;
- kdbtab_t *kp;
-
- /*
- * find the command.
- */
- for_each_kdbcmd(kp, i) {
- if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) {
- kp->cmd_name = NULL;
- return 0;
- }
- }
-
- /* Couldn't find it. */
- return 1;
+ list_del(&cmd->list_node);
}
EXPORT_SYMBOL_GPL(kdb_unregister);
-/* Initialize the kdb command table. */
-static void __init kdb_inittab(void)
-{
- int i;
- kdbtab_t *kp;
-
- for_each_kdbcmd(kp, i)
- kp->cmd_name = NULL;
-
- kdb_register_flags("md", kdb_md, "<vaddr>",
- "Display Memory Contents, also mdWcN, e.g. md8c1", 1,
- KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS);
- kdb_register_flags("mdr", kdb_md, "<vaddr> <bytes>",
- "Display Raw Memory", 0,
- KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS);
- kdb_register_flags("mdp", kdb_md, "<paddr> <bytes>",
- "Display Physical Memory", 0,
- KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS);
- kdb_register_flags("mds", kdb_md, "<vaddr>",
- "Display Memory Symbolically", 0,
- KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS);
- kdb_register_flags("mm", kdb_mm, "<vaddr> <contents>",
- "Modify Memory Contents", 0,
- KDB_ENABLE_MEM_WRITE | KDB_REPEAT_NO_ARGS);
- kdb_register_flags("go", kdb_go, "[<vaddr>]",
- "Continue Execution", 1,
- KDB_ENABLE_REG_WRITE | KDB_ENABLE_ALWAYS_SAFE_NO_ARGS);
- kdb_register_flags("rd", kdb_rd, "",
- "Display Registers", 0,
- KDB_ENABLE_REG_READ);
- kdb_register_flags("rm", kdb_rm, "<reg> <contents>",
- "Modify Registers", 0,
- KDB_ENABLE_REG_WRITE);
- kdb_register_flags("ef", kdb_ef, "<vaddr>",
- "Display exception frame", 0,
- KDB_ENABLE_MEM_READ);
- kdb_register_flags("bt", kdb_bt, "[<vaddr>]",
- "Stack traceback", 1,
- KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS);
- kdb_register_flags("btp", kdb_bt, "<pid>",
- "Display stack for process <pid>", 0,
- KDB_ENABLE_INSPECT);
- kdb_register_flags("bta", kdb_bt, "[D|R|S|T|C|Z|E|U|I|M|A]",
- "Backtrace all processes matching state flag", 0,
- KDB_ENABLE_INSPECT);
- kdb_register_flags("btc", kdb_bt, "",
- "Backtrace current process on each cpu", 0,
- KDB_ENABLE_INSPECT);
- kdb_register_flags("btt", kdb_bt, "<vaddr>",
- "Backtrace process given its struct task address", 0,
- KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS);
- kdb_register_flags("env", kdb_env, "",
- "Show environment variables", 0,
- KDB_ENABLE_ALWAYS_SAFE);
- kdb_register_flags("set", kdb_set, "",
- "Set environment variables", 0,
- KDB_ENABLE_ALWAYS_SAFE);
- kdb_register_flags("help", kdb_help, "",
- "Display Help Message", 1,
- KDB_ENABLE_ALWAYS_SAFE);
- kdb_register_flags("?", kdb_help, "",
- "Display Help Message", 0,
- KDB_ENABLE_ALWAYS_SAFE);
- kdb_register_flags("cpu", kdb_cpu, "<cpunum>",
- "Switch to new cpu", 0,
- KDB_ENABLE_ALWAYS_SAFE_NO_ARGS);
- kdb_register_flags("kgdb", kdb_kgdb, "",
- "Enter kgdb mode", 0, 0);
- kdb_register_flags("ps", kdb_ps, "[<flags>|A]",
- "Display active task list", 0,
- KDB_ENABLE_INSPECT);
- kdb_register_flags("pid", kdb_pid, "<pidnum>",
- "Switch to another task", 0,
- KDB_ENABLE_INSPECT);
- kdb_register_flags("reboot", kdb_reboot, "",
- "Reboot the machine immediately", 0,
- KDB_ENABLE_REBOOT);
+static kdbtab_t maintab[] = {
+ { .name = "md",
+ .func = kdb_md,
+ .usage = "<vaddr>",
+ .help = "Display Memory Contents, also mdWcN, e.g. md8c1",
+ .minlen = 1,
+ .flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS,
+ },
+ { .name = "mdr",
+ .func = kdb_md,
+ .usage = "<vaddr> <bytes>",
+ .help = "Display Raw Memory",
+ .flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS,
+ },
+ { .name = "mdp",
+ .func = kdb_md,
+ .usage = "<paddr> <bytes>",
+ .help = "Display Physical Memory",
+ .flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS,
+ },
+ { .name = "mds",
+ .func = kdb_md,
+ .usage = "<vaddr>",
+ .help = "Display Memory Symbolically",
+ .flags = KDB_ENABLE_MEM_READ | KDB_REPEAT_NO_ARGS,
+ },
+ { .name = "mm",
+ .func = kdb_mm,
+ .usage = "<vaddr> <contents>",
+ .help = "Modify Memory Contents",
+ .flags = KDB_ENABLE_MEM_WRITE | KDB_REPEAT_NO_ARGS,
+ },
+ { .name = "go",
+ .func = kdb_go,
+ .usage = "[<vaddr>]",
+ .help = "Continue Execution",
+ .minlen = 1,
+ .flags = KDB_ENABLE_REG_WRITE |
+ KDB_ENABLE_ALWAYS_SAFE_NO_ARGS,
+ },
+ { .name = "rd",
+ .func = kdb_rd,
+ .usage = "",
+ .help = "Display Registers",
+ .flags = KDB_ENABLE_REG_READ,
+ },
+ { .name = "rm",
+ .func = kdb_rm,
+ .usage = "<reg> <contents>",
+ .help = "Modify Registers",
+ .flags = KDB_ENABLE_REG_WRITE,
+ },
+ { .name = "ef",
+ .func = kdb_ef,
+ .usage = "<vaddr>",
+ .help = "Display exception frame",
+ .flags = KDB_ENABLE_MEM_READ,
+ },
+ { .name = "bt",
+ .func = kdb_bt,
+ .usage = "[<vaddr>]",
+ .help = "Stack traceback",
+ .minlen = 1,
+ .flags = KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS,
+ },
+ { .name = "btp",
+ .func = kdb_bt,
+ .usage = "<pid>",
+ .help = "Display stack for process <pid>",
+ .flags = KDB_ENABLE_INSPECT,
+ },
+ { .name = "bta",
+ .func = kdb_bt,
+ .usage = "[<state_chars>|A]",
+ .help = "Backtrace all processes whose state matches",
+ .flags = KDB_ENABLE_INSPECT,
+ },
+ { .name = "btc",
+ .func = kdb_bt,
+ .usage = "",
+ .help = "Backtrace current process on each cpu",
+ .flags = KDB_ENABLE_INSPECT,
+ },
+ { .name = "btt",
+ .func = kdb_bt,
+ .usage = "<vaddr>",
+ .help = "Backtrace process given its struct task address",
+ .flags = KDB_ENABLE_MEM_READ | KDB_ENABLE_INSPECT_NO_ARGS,
+ },
+ { .name = "env",
+ .func = kdb_env,
+ .usage = "",
+ .help = "Show environment variables",
+ .flags = KDB_ENABLE_ALWAYS_SAFE,
+ },
+ { .name = "set",
+ .func = kdb_set,
+ .usage = "",
+ .help = "Set environment variables",
+ .flags = KDB_ENABLE_ALWAYS_SAFE,
+ },
+ { .name = "help",
+ .func = kdb_help,
+ .usage = "",
+ .help = "Display Help Message",
+ .minlen = 1,
+ .flags = KDB_ENABLE_ALWAYS_SAFE,
+ },
+ { .name = "?",
+ .func = kdb_help,
+ .usage = "",
+ .help = "Display Help Message",
+ .flags = KDB_ENABLE_ALWAYS_SAFE,
+ },
+ { .name = "cpu",
+ .func = kdb_cpu,
+ .usage = "<cpunum>",
+ .help = "Switch to new cpu",
+ .flags = KDB_ENABLE_ALWAYS_SAFE_NO_ARGS,
+ },
+ { .name = "kgdb",
+ .func = kdb_kgdb,
+ .usage = "",
+ .help = "Enter kgdb mode",
+ .flags = 0,
+ },
+ { .name = "ps",
+ .func = kdb_ps,
+ .usage = "[<state_chars>|A]",
+ .help = "Display active task list",
+ .flags = KDB_ENABLE_INSPECT,
+ },
+ { .name = "pid",
+ .func = kdb_pid,
+ .usage = "<pidnum>",
+ .help = "Switch to another task",
+ .flags = KDB_ENABLE_INSPECT,
+ },
+ { .name = "reboot",
+ .func = kdb_reboot,
+ .usage = "",
+ .help = "Reboot the machine immediately",
+ .flags = KDB_ENABLE_REBOOT,
+ },
#if defined(CONFIG_MODULES)
- kdb_register_flags("lsmod", kdb_lsmod, "",
- "List loaded kernel modules", 0,
- KDB_ENABLE_INSPECT);
+ { .name = "lsmod",
+ .func = kdb_lsmod,
+ .usage = "",
+ .help = "List loaded kernel modules",
+ .flags = KDB_ENABLE_INSPECT,
+ },
#endif
#if defined(CONFIG_MAGIC_SYSRQ)
- kdb_register_flags("sr", kdb_sr, "<key>",
- "Magic SysRq key", 0,
- KDB_ENABLE_ALWAYS_SAFE);
+ { .name = "sr",
+ .func = kdb_sr,
+ .usage = "<key>",
+ .help = "Magic SysRq key",
+ .flags = KDB_ENABLE_ALWAYS_SAFE,
+ },
#endif
#if defined(CONFIG_PRINTK)
- kdb_register_flags("dmesg", kdb_dmesg, "[lines]",
- "Display syslog buffer", 0,
- KDB_ENABLE_ALWAYS_SAFE);
+ { .name = "dmesg",
+ .func = kdb_dmesg,
+ .usage = "[lines]",
+ .help = "Display syslog buffer",
+ .flags = KDB_ENABLE_ALWAYS_SAFE,
+ },
#endif
- if (arch_kgdb_ops.enable_nmi) {
- kdb_register_flags("disable_nmi", kdb_disable_nmi, "",
- "Disable NMI entry to KDB", 0,
- KDB_ENABLE_ALWAYS_SAFE);
- }
- kdb_register_flags("defcmd", kdb_defcmd, "name \"usage\" \"help\"",
- "Define a set of commands, down to endefcmd", 0,
- KDB_ENABLE_ALWAYS_SAFE);
- kdb_register_flags("kill", kdb_kill, "<-signal> <pid>",
- "Send a signal to a process", 0,
- KDB_ENABLE_SIGNAL);
- kdb_register_flags("summary", kdb_summary, "",
- "Summarize the system", 4,
- KDB_ENABLE_ALWAYS_SAFE);
- kdb_register_flags("per_cpu", kdb_per_cpu, "<sym> [<bytes>] [<cpu>]",
- "Display per_cpu variables", 3,
- KDB_ENABLE_MEM_READ);
- kdb_register_flags("grephelp", kdb_grep_help, "",
- "Display help on | grep", 0,
- KDB_ENABLE_ALWAYS_SAFE);
+ { .name = "defcmd",
+ .func = kdb_defcmd,
+ .usage = "name \"usage\" \"help\"",
+ .help = "Define a set of commands, down to endefcmd",
+ /*
+ * Macros are always safe because when executed each
+ * internal command re-enters kdb_parse() and is safety
+ * checked individually.
+ */
+ .flags = KDB_ENABLE_ALWAYS_SAFE,
+ },
+ { .name = "kill",
+ .func = kdb_kill,
+ .usage = "<-signal> <pid>",
+ .help = "Send a signal to a process",
+ .flags = KDB_ENABLE_SIGNAL,
+ },
+ { .name = "summary",
+ .func = kdb_summary,
+ .usage = "",
+ .help = "Summarize the system",
+ .minlen = 4,
+ .flags = KDB_ENABLE_ALWAYS_SAFE,
+ },
+ { .name = "per_cpu",
+ .func = kdb_per_cpu,
+ .usage = "<sym> [<bytes>] [<cpu>]",
+ .help = "Display per_cpu variables",
+ .minlen = 3,
+ .flags = KDB_ENABLE_MEM_READ,
+ },
+ { .name = "grephelp",
+ .func = kdb_grep_help,
+ .usage = "",
+ .help = "Display help on | grep",
+ .flags = KDB_ENABLE_ALWAYS_SAFE,
+ },
+};
+
+/* Initialize the kdb command table. */
+static void __init kdb_inittab(void)
+{
+ kdb_register_table(maintab, ARRAY_SIZE(maintab));
}
/* Execute any commands defined in kdb_cmds. */
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 75014d7f4568..a2fc7d2bc9fc 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -64,7 +64,7 @@
/*
* KDB_MAXBPT describes the total number of breakpoints
- * supported by this architecure.
+ * supported by this architecture.
*/
#define KDB_MAXBPT 16
@@ -83,7 +83,7 @@ typedef struct __ksymtab {
unsigned long sym_start;
unsigned long sym_end;
} kdb_symtab_t;
-extern int kallsyms_symbol_next(char *prefix_name, int flag);
+extern int kallsyms_symbol_next(char *prefix_name, int flag, int buf_size);
extern int kallsyms_symbol_complete(char *prefix_name, int max_len);
/* Exported Symbols for kernel loadable modules to use. */
@@ -109,8 +109,8 @@ extern int kdbgetaddrarg(int, const char **, int*, unsigned long *,
long *, char **);
extern int kdbgetsymval(const char *, kdb_symtab_t *);
extern int kdbnearsym(unsigned long, kdb_symtab_t *);
-extern void kdbnearsym_cleanup(void);
extern char *kdb_strdup(const char *str, gfp_t type);
+extern char *kdb_strdup_dequote(const char *str, gfp_t type);
extern void kdb_symbol_print(unsigned long, const kdb_symtab_t *, unsigned int);
/* Routine for debugging the debugger state. */
@@ -132,7 +132,6 @@ extern int kdb_state;
#define KDB_STATE_PAGER 0x00000400 /* pager is available */
#define KDB_STATE_GO_SWITCH 0x00000800 /* go is switching
* back to initial cpu */
-#define KDB_STATE_PRINTF_LOCK 0x00001000 /* Holds kdb_printf lock */
#define KDB_STATE_WAIT_IPI 0x00002000 /* Waiting for kdb_ipi() NMI */
#define KDB_STATE_RECURSE 0x00004000 /* Recursive entry to kdb */
#define KDB_STATE_IP_ADJUSTED 0x00008000 /* Restart IP has been
@@ -166,17 +165,7 @@ typedef struct _kdb_bp {
#ifdef CONFIG_KGDB_KDB
extern kdb_bp_t kdb_breakpoints[/* KDB_MAXBPT */];
-/* The KDB shell command table */
-typedef struct _kdbtab {
- char *cmd_name; /* Command name */
- kdb_func_t cmd_func; /* Function to execute command */
- char *cmd_usage; /* Usage String for this command */
- char *cmd_help; /* Help message for this command */
- short cmd_minlen; /* Minimum legal # command
- * chars required */
- kdb_cmdflags_t cmd_flags; /* Command behaviour flags */
-} kdbtab_t;
-
+extern void kdb_register_table(kdbtab_t *kp, size_t len);
extern int kdb_bt(int, const char **); /* KDB display back trace */
/* KDB breakpoint management functions */
@@ -202,15 +191,11 @@ extern char kdb_grep_string[];
extern int kdb_grep_leading;
extern int kdb_grep_trailing;
extern char *kdb_cmds[];
-extern unsigned long kdb_task_state_string(const char *);
extern char kdb_task_state_char (const struct task_struct *);
-extern unsigned long kdb_task_state(const struct task_struct *p,
- unsigned long mask);
+extern bool kdb_task_state(const struct task_struct *p, const char *mask);
extern void kdb_ps_suppressed(void);
extern void kdb_ps1(const struct task_struct *p);
-extern void kdb_print_nameval(const char *name, unsigned long val);
-extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info);
-extern void kdb_meminfo_proc_show(void);
+extern char kdb_getchar(void);
extern char *kdb_getstr(char *, size_t, const char *);
extern void kdb_gdb_state_pass(char *buf);
@@ -226,22 +211,12 @@ extern void kdb_gdb_state_pass(char *buf);
#define KDB_TSK(cpu) kgdb_info[cpu].task
#define KDB_TSKREGS(cpu) kgdb_info[cpu].debuggerinfo
-extern struct task_struct *kdb_curr_task(int);
-
#define kdb_task_has_cpu(p) (task_curr(p))
-/* Simplify coexistence with NPTL */
-#define kdb_do_each_thread(g, p) do_each_thread(g, p)
-#define kdb_while_each_thread(g, p) while_each_thread(g, p)
-
-#define GFP_KDB (in_interrupt() ? GFP_ATOMIC : GFP_KERNEL)
-
-extern void *debug_kmalloc(size_t size, gfp_t flags);
-extern void debug_kfree(void *);
-extern void debug_kusage(void);
+#define GFP_KDB (in_dbg_master() ? GFP_ATOMIC : GFP_KERNEL)
-extern void kdb_set_current_task(struct task_struct *);
extern struct task_struct *kdb_current_task;
+extern struct pt_regs *kdb_current_regs;
#ifdef CONFIG_KDB_KEYBOARD
extern void kdb_kbd_cleanup_state(void);
@@ -249,13 +224,19 @@ extern void kdb_kbd_cleanup_state(void);
#define kdb_kbd_cleanup_state()
#endif /* ! CONFIG_KDB_KEYBOARD */
-#ifdef CONFIG_MODULES
-extern struct list_head *kdb_modules;
-#endif /* CONFIG_MODULES */
-
extern char kdb_prompt_str[];
#define KDB_WORD_SIZE ((int)sizeof(unsigned long))
#endif /* CONFIG_KGDB_KDB */
+
+#define kdb_func_printf(format, args...) \
+ kdb_printf("%s: " format, __func__, ## args)
+
+#define kdb_dbg_printf(mask, format, args...) \
+ do { \
+ if (KDB_DEBUG(mask)) \
+ kdb_func_printf(format, ## args); \
+ } while (0)
+
#endif /* !_KDBPRIVATE_H */
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index d35cc2d3a4cc..56f7b906e7cc 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -10,7 +10,6 @@
* 03/02/13 added new 2.5 kallsyms <xavier.bru@bull.net>
*/
-#include <stdarg.h>
#include <linux/types.h>
#include <linux/sched.h>
#include <linux/mm.h>
@@ -18,13 +17,14 @@
#include <linux/stddef.h>
#include <linux/vmalloc.h>
#include <linux/ptrace.h>
-#include <linux/module.h>
#include <linux/highmem.h>
#include <linux/hardirq.h>
#include <linux/delay.h>
#include <linux/uaccess.h>
#include <linux/kdb.h>
#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
#include "kdb_private.h"
/*
@@ -39,68 +39,61 @@
*/
int kdbgetsymval(const char *symname, kdb_symtab_t *symtab)
{
- if (KDB_DEBUG(AR))
- kdb_printf("kdbgetsymval: symname=%s, symtab=%p\n", symname,
- symtab);
+ kdb_dbg_printf(AR, "symname=%s, symtab=%px\n", symname, symtab);
memset(symtab, 0, sizeof(*symtab));
symtab->sym_start = kallsyms_lookup_name(symname);
if (symtab->sym_start) {
- if (KDB_DEBUG(AR))
- kdb_printf("kdbgetsymval: returns 1, "
- "symtab->sym_start=0x%lx\n",
- symtab->sym_start);
+ kdb_dbg_printf(AR, "returns 1, symtab->sym_start=0x%lx\n",
+ symtab->sym_start);
return 1;
}
- if (KDB_DEBUG(AR))
- kdb_printf("kdbgetsymval: returns 0\n");
+ kdb_dbg_printf(AR, "returns 0\n");
return 0;
}
EXPORT_SYMBOL(kdbgetsymval);
-static char *kdb_name_table[100]; /* arbitrary size */
-
-/*
- * kdbnearsym - Return the name of the symbol with the nearest address
- * less than 'addr'.
+/**
+ * kdbnearsym() - Return the name of the symbol with the nearest address
+ * less than @addr.
+ * @addr: Address to check for near symbol
+ * @symtab: Structure to receive results
*
- * Parameters:
- * addr Address to check for symbol near
- * symtab Structure to receive results
- * Returns:
- * 0 No sections contain this address, symtab zero filled
- * 1 Address mapped to module/symbol/section, data in symtab
- * Remarks:
- * 2.6 kallsyms has a "feature" where it unpacks the name into a
- * string. If that string is reused before the caller expects it
- * then the caller sees its string change without warning. To
- * avoid cluttering up the main kdb code with lots of kdb_strdup,
- * tests and kfree calls, kdbnearsym maintains an LRU list of the
- * last few unique strings. The list is sized large enough to
- * hold active strings, no kdb caller of kdbnearsym makes more
- * than ~20 later calls before using a saved value.
+ * WARNING: This function may return a pointer to a single statically
+ * allocated buffer (namebuf). kdb's unusual calling context (single
+ * threaded, all other CPUs halted) provides us sufficient locking for
+ * this to be safe. The only constraint imposed by the static buffer is
+ * that the caller must consume any previous reply prior to another call
+ * to lookup a new symbol.
+ *
+ * Note that, strictly speaking, some architectures may re-enter the kdb
+ * trap if the system turns out to be very badly damaged and this breaks
+ * the single-threaded assumption above. In these circumstances successful
+ * continuation and exit from the inner trap is unlikely to work and any
+ * user attempting this receives a prominent warning before being allowed
+ * to progress. In these circumstances we remain memory safe because
+ * namebuf[KSYM_NAME_LEN-1] will never change from '\0' although we do
+ * tolerate the possibility of garbled symbol display from the outer kdb
+ * trap.
+ *
+ * Return:
+ * * 0 - No sections contain this address, symtab zero filled
+ * * 1 - Address mapped to module/symbol/section, data in symtab
*/
int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab)
{
int ret = 0;
unsigned long symbolsize = 0;
unsigned long offset = 0;
-#define knt1_size 128 /* must be >= kallsyms table size */
- char *knt1 = NULL;
+ static char namebuf[KSYM_NAME_LEN];
- if (KDB_DEBUG(AR))
- kdb_printf("kdbnearsym: addr=0x%lx, symtab=%p\n", addr, symtab);
+ kdb_dbg_printf(AR, "addr=0x%lx, symtab=%px\n", addr, symtab);
memset(symtab, 0, sizeof(*symtab));
if (addr < 4096)
goto out;
- knt1 = debug_kmalloc(knt1_size, GFP_ATOMIC);
- if (!knt1) {
- kdb_printf("kdbnearsym: addr=0x%lx cannot kmalloc knt1\n",
- addr);
- goto out;
- }
+
symtab->sym_name = kallsyms_lookup(addr, &symbolsize , &offset,
- (char **)(&symtab->mod_name), knt1);
+ (char **)(&symtab->mod_name), namebuf);
if (offset > 8*1024*1024) {
symtab->sym_name = NULL;
addr = offset = symbolsize = 0;
@@ -109,66 +102,14 @@ int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab)
symtab->sym_end = symtab->sym_start + symbolsize;
ret = symtab->sym_name != NULL && *(symtab->sym_name) != '\0';
- if (ret) {
- int i;
- /* Another 2.6 kallsyms "feature". Sometimes the sym_name is
- * set but the buffer passed into kallsyms_lookup is not used,
- * so it contains garbage. The caller has to work out which
- * buffer needs to be saved.
- *
- * What was Rusty smoking when he wrote that code?
- */
- if (symtab->sym_name != knt1) {
- strncpy(knt1, symtab->sym_name, knt1_size);
- knt1[knt1_size-1] = '\0';
- }
- for (i = 0; i < ARRAY_SIZE(kdb_name_table); ++i) {
- if (kdb_name_table[i] &&
- strcmp(kdb_name_table[i], knt1) == 0)
- break;
- }
- if (i >= ARRAY_SIZE(kdb_name_table)) {
- debug_kfree(kdb_name_table[0]);
- memcpy(kdb_name_table, kdb_name_table+1,
- sizeof(kdb_name_table[0]) *
- (ARRAY_SIZE(kdb_name_table)-1));
- } else {
- debug_kfree(knt1);
- knt1 = kdb_name_table[i];
- memcpy(kdb_name_table+i, kdb_name_table+i+1,
- sizeof(kdb_name_table[0]) *
- (ARRAY_SIZE(kdb_name_table)-i-1));
- }
- i = ARRAY_SIZE(kdb_name_table) - 1;
- kdb_name_table[i] = knt1;
- symtab->sym_name = kdb_name_table[i];
- knt1 = NULL;
- }
-
if (symtab->mod_name == NULL)
symtab->mod_name = "kernel";
- if (KDB_DEBUG(AR))
- kdb_printf("kdbnearsym: returns %d symtab->sym_start=0x%lx, "
- "symtab->mod_name=%p, symtab->sym_name=%p (%s)\n", ret,
- symtab->sym_start, symtab->mod_name, symtab->sym_name,
- symtab->sym_name);
-
+ kdb_dbg_printf(AR, "returns %d symtab->sym_start=0x%lx, symtab->mod_name=%px, symtab->sym_name=%px (%s)\n",
+ ret, symtab->sym_start, symtab->mod_name, symtab->sym_name, symtab->sym_name);
out:
- debug_kfree(knt1);
return ret;
}
-void kdbnearsym_cleanup(void)
-{
- int i;
- for (i = 0; i < ARRAY_SIZE(kdb_name_table); ++i) {
- if (kdb_name_table[i]) {
- debug_kfree(kdb_name_table[i]);
- kdb_name_table[i] = NULL;
- }
- }
-}
-
static char ks_namebuf[KSYM_NAME_LEN+1], ks_namebuf_prev[KSYM_NAME_LEN+1];
/*
@@ -192,7 +133,7 @@ int kallsyms_symbol_complete(char *prefix_name, int max_len)
while ((name = kdb_walk_kallsyms(&pos))) {
if (strncmp(name, prefix_name, prefix_len) == 0) {
- strcpy(ks_namebuf, name);
+ strscpy(ks_namebuf, name, sizeof(ks_namebuf));
/* Work out the longest name that matches the prefix */
if (++number == 1) {
prev_len = min_t(int, max_len-1,
@@ -221,11 +162,13 @@ int kallsyms_symbol_complete(char *prefix_name, int max_len)
* Parameters:
* prefix_name prefix of a symbol name to lookup
* flag 0 means search from the head, 1 means continue search.
+ * buf_size maximum length that can be written to prefix_name
+ * buffer
* Returns:
* 1 if a symbol matches the given prefix.
* 0 if no string found
*/
-int kallsyms_symbol_next(char *prefix_name, int flag)
+int kallsyms_symbol_next(char *prefix_name, int flag, int buf_size)
{
int prefix_len = strlen(prefix_name);
static loff_t pos;
@@ -235,10 +178,8 @@ int kallsyms_symbol_next(char *prefix_name, int flag)
pos = 0;
while ((name = kdb_walk_kallsyms(&pos))) {
- if (strncmp(name, prefix_name, prefix_len) == 0) {
- strncpy(prefix_name, name, strlen(name)+1);
- return 1;
- }
+ if (!strncmp(name, prefix_name, prefix_len))
+ return strscpy(prefix_name, name, buf_size);
}
return 0;
}
@@ -306,11 +247,41 @@ void kdb_symbol_print(unsigned long addr, const kdb_symtab_t *symtab_p,
*/
char *kdb_strdup(const char *str, gfp_t type)
{
- int n = strlen(str)+1;
+ size_t n = strlen(str) + 1;
char *s = kmalloc(n, type);
if (!s)
return NULL;
- return strcpy(s, str);
+ memcpy(s, str, n);
+ return s;
+}
+
+/*
+ * kdb_strdup_dequote - same as kdb_strdup(), but trims surrounding quotes from
+ * the input string if present.
+ * Remarks:
+ * Quotes are only removed if there is both a leading and a trailing quote.
+ */
+char *kdb_strdup_dequote(const char *str, gfp_t type)
+{
+ size_t len = strlen(str);
+ char *s;
+
+ if (str[0] == '"' && len > 1 && str[len - 1] == '"') {
+ /* trim both leading and trailing quotes */
+ str++;
+ len -= 2;
+ }
+
+ len++; /* add space for NUL terminator */
+
+ s = kmalloc(len, type);
+ if (!s)
+ return NULL;
+
+ memcpy(s, str, len - 1);
+ s[len - 1] = '\0';
+
+ return s;
}
/*
@@ -325,10 +296,10 @@ char *kdb_strdup(const char *str, gfp_t type)
*/
int kdb_getarea_size(void *res, unsigned long addr, size_t size)
{
- int ret = probe_kernel_read((char *)res, (char *)addr, size);
+ int ret = copy_from_kernel_nofault((char *)res, (char *)addr, size);
if (ret) {
if (!KDB_STATE(SUPPRESS)) {
- kdb_printf("kdb_getarea: Bad address 0x%lx\n", addr);
+ kdb_func_printf("Bad address 0x%lx\n", addr);
KDB_STATE_SET(SUPPRESS);
}
ret = KDB_BADADDR;
@@ -350,10 +321,10 @@ int kdb_getarea_size(void *res, unsigned long addr, size_t size)
*/
int kdb_putarea_size(unsigned long addr, void *res, size_t size)
{
- int ret = probe_kernel_read((char *)addr, (char *)res, size);
+ int ret = copy_to_kernel_nofault((char *)addr, (char *)res, size);
if (ret) {
if (!KDB_STATE(SUPPRESS)) {
- kdb_printf("kdb_putarea: Bad address 0x%lx\n", addr);
+ kdb_func_printf("Bad address 0x%lx\n", addr);
KDB_STATE_SET(SUPPRESS);
}
ret = KDB_BADADDR;
@@ -365,7 +336,7 @@ int kdb_putarea_size(unsigned long addr, void *res, size_t size)
/*
* kdb_getphys - Read data from a physical address. Validate the
- * address is in range, use kmap_atomic() to get data
+ * address is in range, use kmap_local_page() to get data
* similar to kdb_getarea() - but for phys addresses
* Inputs:
* res Pointer to the word to receive the result
@@ -384,9 +355,9 @@ static int kdb_getphys(void *res, unsigned long addr, size_t size)
if (!pfn_valid(pfn))
return 1;
page = pfn_to_page(pfn);
- vaddr = kmap_atomic(page);
+ vaddr = kmap_local_page(page);
memcpy(res, vaddr + (addr & (PAGE_SIZE - 1)), size);
- kunmap_atomic(vaddr);
+ kunmap_local(vaddr);
return 0;
}
@@ -432,10 +403,10 @@ int kdb_getphysword(unsigned long *word, unsigned long addr, size_t size)
*word = w8;
break;
}
- /* drop through */
+ fallthrough;
default:
diag = KDB_BADWIDTH;
- kdb_printf("kdb_getphysword: bad width %ld\n", (long) size);
+ kdb_func_printf("bad width %zu\n", size);
}
return diag;
}
@@ -481,10 +452,10 @@ int kdb_getword(unsigned long *word, unsigned long addr, size_t size)
*word = w8;
break;
}
- /* drop through */
+ fallthrough;
default:
diag = KDB_BADWIDTH;
- kdb_printf("kdb_getword: bad width %ld\n", (long) size);
+ kdb_func_printf("bad width %zu\n", size);
}
return diag;
}
@@ -525,91 +496,15 @@ int kdb_putword(unsigned long addr, unsigned long word, size_t size)
diag = kdb_putarea(addr, w8);
break;
}
- /* drop through */
+ fallthrough;
default:
diag = KDB_BADWIDTH;
- kdb_printf("kdb_putword: bad width %ld\n", (long) size);
+ kdb_func_printf("bad width %zu\n", size);
}
return diag;
}
-/*
- * kdb_task_state_string - Convert a string containing any of the
- * letters DRSTCZEUIMA to a mask for the process state field and
- * return the value. If no argument is supplied, return the mask
- * that corresponds to environment variable PS, DRSTCZEU by
- * default.
- * Inputs:
- * s String to convert
- * Returns:
- * Mask for process state.
- * Notes:
- * The mask folds data from several sources into a single long value, so
- * be careful not to overlap the bits. TASK_* bits are in the LSB,
- * special cases like UNRUNNABLE are in the MSB. As of 2.6.10-rc1 there
- * is no overlap between TASK_* and EXIT_* but that may not always be
- * true, so EXIT_* bits are shifted left 16 bits before being stored in
- * the mask.
- */
-/* unrunnable is < 0 */
-#define UNRUNNABLE (1UL << (8*sizeof(unsigned long) - 1))
-#define RUNNING (1UL << (8*sizeof(unsigned long) - 2))
-#define IDLE (1UL << (8*sizeof(unsigned long) - 3))
-#define DAEMON (1UL << (8*sizeof(unsigned long) - 4))
-
-unsigned long kdb_task_state_string(const char *s)
-{
- long res = 0;
- if (!s) {
- s = kdbgetenv("PS");
- if (!s)
- s = "DRSTCZEU"; /* default value for ps */
- }
- while (*s) {
- switch (*s) {
- case 'D':
- res |= TASK_UNINTERRUPTIBLE;
- break;
- case 'R':
- res |= RUNNING;
- break;
- case 'S':
- res |= TASK_INTERRUPTIBLE;
- break;
- case 'T':
- res |= TASK_STOPPED;
- break;
- case 'C':
- res |= TASK_TRACED;
- break;
- case 'Z':
- res |= EXIT_ZOMBIE << 16;
- break;
- case 'E':
- res |= EXIT_DEAD << 16;
- break;
- case 'U':
- res |= UNRUNNABLE;
- break;
- case 'I':
- res |= IDLE;
- break;
- case 'M':
- res |= DAEMON;
- break;
- case 'A':
- res = ~0UL;
- break;
- default:
- kdb_printf("%s: unknown flag '%c' ignored\n",
- __func__, *s);
- break;
- }
- ++s;
- }
- return res;
-}
/*
* kdb_task_state_char - Return the character that represents the task state.
@@ -620,31 +515,26 @@ unsigned long kdb_task_state_string(const char *s)
*/
char kdb_task_state_char (const struct task_struct *p)
{
- int cpu;
- char state;
unsigned long tmp;
+ char state;
+ int cpu;
- if (!p || probe_kernel_read(&tmp, (char *)p, sizeof(unsigned long)))
+ if (!p ||
+ copy_from_kernel_nofault(&tmp, (char *)p, sizeof(unsigned long)))
return 'E';
- cpu = kdb_process_cpu(p);
- state = (p->state == 0) ? 'R' :
- (p->state < 0) ? 'U' :
- (p->state & TASK_UNINTERRUPTIBLE) ? 'D' :
- (p->state & TASK_STOPPED) ? 'T' :
- (p->state & TASK_TRACED) ? 'C' :
- (p->exit_state & EXIT_ZOMBIE) ? 'Z' :
- (p->exit_state & EXIT_DEAD) ? 'E' :
- (p->state & TASK_INTERRUPTIBLE) ? 'S' : '?';
+ state = task_state_to_char((struct task_struct *) p);
+
if (is_idle_task(p)) {
/* Idle task. Is it really idle, apart from the kdb
* interrupt? */
+ cpu = kdb_process_cpu(p);
if (!kdb_task_has_cpu(p) || kgdb_info[cpu].irq_depth == 1) {
if (cpu != kdb_initial_cpu)
- state = 'I'; /* idle task */
+ state = '-'; /* idle task */
}
- } else if (!p->mm && state == 'S') {
- state = 'M'; /* sleeping system daemon */
+ } else if (!p->mm && strchr("IMS", state)) {
+ state = tolower(state); /* sleeping system daemon */
}
return state;
}
@@ -654,274 +544,26 @@ char kdb_task_state_char (const struct task_struct *p)
* given by the mask.
* Inputs:
* p struct task for the process
- * mask mask from kdb_task_state_string to select processes
+ * mask set of characters used to select processes; both NULL
+ * and the empty string mean adopt a default filter, which
+ * is to suppress sleeping system daemons and the idle tasks
* Returns:
* True if the process matches at least one criteria defined by the mask.
*/
-unsigned long kdb_task_state(const struct task_struct *p, unsigned long mask)
-{
- char state[] = { kdb_task_state_char(p), '\0' };
- return (mask & kdb_task_state_string(state)) != 0;
-}
-
-/*
- * kdb_print_nameval - Print a name and its value, converting the
- * value to a symbol lookup if possible.
- * Inputs:
- * name field name to print
- * val value of field
- */
-void kdb_print_nameval(const char *name, unsigned long val)
+bool kdb_task_state(const struct task_struct *p, const char *mask)
{
- kdb_symtab_t symtab;
- kdb_printf(" %-11.11s ", name);
- if (kdbnearsym(val, &symtab))
- kdb_symbol_print(val, &symtab,
- KDB_SP_VALUE|KDB_SP_SYMSIZE|KDB_SP_NEWLINE);
- else
- kdb_printf("0x%lx\n", val);
-}
-
-/* Last ditch allocator for debugging, so we can still debug even when
- * the GFP_ATOMIC pool has been exhausted. The algorithms are tuned
- * for space usage, not for speed. One smallish memory pool, the free
- * chain is always in ascending address order to allow coalescing,
- * allocations are done in brute force best fit.
- */
+ char state = kdb_task_state_char(p);
-struct debug_alloc_header {
- u32 next; /* offset of next header from start of pool */
- u32 size;
- void *caller;
-};
-
-/* The memory returned by this allocator must be aligned, which means
- * so must the header size. Do not assume that sizeof(struct
- * debug_alloc_header) is a multiple of the alignment, explicitly
- * calculate the overhead of this header, including the alignment.
- * The rest of this code must not use sizeof() on any header or
- * pointer to a header.
- */
-#define dah_align 8
-#define dah_overhead ALIGN(sizeof(struct debug_alloc_header), dah_align)
-
-static u64 debug_alloc_pool_aligned[256*1024/dah_align]; /* 256K pool */
-static char *debug_alloc_pool = (char *)debug_alloc_pool_aligned;
-static u32 dah_first, dah_first_call = 1, dah_used, dah_used_max;
-
-/* Locking is awkward. The debug code is called from all contexts,
- * including non maskable interrupts. A normal spinlock is not safe
- * in NMI context. Try to get the debug allocator lock, if it cannot
- * be obtained after a second then give up. If the lock could not be
- * previously obtained on this cpu then only try once.
- *
- * sparse has no annotation for "this function _sometimes_ acquires a
- * lock", so fudge the acquire/release notation.
- */
-static DEFINE_SPINLOCK(dap_lock);
-static int get_dap_lock(void)
- __acquires(dap_lock)
-{
- static int dap_locked = -1;
- int count;
- if (dap_locked == smp_processor_id())
- count = 1;
- else
- count = 1000;
- while (1) {
- if (spin_trylock(&dap_lock)) {
- dap_locked = -1;
- return 1;
- }
- if (!count--)
- break;
- udelay(1000);
- }
- dap_locked = smp_processor_id();
- __acquire(dap_lock);
- return 0;
-}
-
-void *debug_kmalloc(size_t size, gfp_t flags)
-{
- unsigned int rem, h_offset;
- struct debug_alloc_header *best, *bestprev, *prev, *h;
- void *p = NULL;
- if (!get_dap_lock()) {
- __release(dap_lock); /* we never actually got it */
- return NULL;
- }
- h = (struct debug_alloc_header *)(debug_alloc_pool + dah_first);
- if (dah_first_call) {
- h->size = sizeof(debug_alloc_pool_aligned) - dah_overhead;
- dah_first_call = 0;
- }
- size = ALIGN(size, dah_align);
- prev = best = bestprev = NULL;
- while (1) {
- if (h->size >= size && (!best || h->size < best->size)) {
- best = h;
- bestprev = prev;
- if (h->size == size)
- break;
- }
- if (!h->next)
- break;
- prev = h;
- h = (struct debug_alloc_header *)(debug_alloc_pool + h->next);
- }
- if (!best)
- goto out;
- rem = best->size - size;
- /* The pool must always contain at least one header */
- if (best->next == 0 && bestprev == NULL && rem < dah_overhead)
- goto out;
- if (rem >= dah_overhead) {
- best->size = size;
- h_offset = ((char *)best - debug_alloc_pool) +
- dah_overhead + best->size;
- h = (struct debug_alloc_header *)(debug_alloc_pool + h_offset);
- h->size = rem - dah_overhead;
- h->next = best->next;
- } else
- h_offset = best->next;
- best->caller = __builtin_return_address(0);
- dah_used += best->size;
- dah_used_max = max(dah_used, dah_used_max);
- if (bestprev)
- bestprev->next = h_offset;
- else
- dah_first = h_offset;
- p = (char *)best + dah_overhead;
- memset(p, POISON_INUSE, best->size - 1);
- *((char *)p + best->size - 1) = POISON_END;
-out:
- spin_unlock(&dap_lock);
- return p;
-}
-
-void debug_kfree(void *p)
-{
- struct debug_alloc_header *h;
- unsigned int h_offset;
- if (!p)
- return;
- if ((char *)p < debug_alloc_pool ||
- (char *)p >= debug_alloc_pool + sizeof(debug_alloc_pool_aligned)) {
- kfree(p);
- return;
- }
- if (!get_dap_lock()) {
- __release(dap_lock); /* we never actually got it */
- return; /* memory leak, cannot be helped */
- }
- h = (struct debug_alloc_header *)((char *)p - dah_overhead);
- memset(p, POISON_FREE, h->size - 1);
- *((char *)p + h->size - 1) = POISON_END;
- h->caller = NULL;
- dah_used -= h->size;
- h_offset = (char *)h - debug_alloc_pool;
- if (h_offset < dah_first) {
- h->next = dah_first;
- dah_first = h_offset;
- } else {
- struct debug_alloc_header *prev;
- unsigned int prev_offset;
- prev = (struct debug_alloc_header *)(debug_alloc_pool +
- dah_first);
- while (1) {
- if (!prev->next || prev->next > h_offset)
- break;
- prev = (struct debug_alloc_header *)
- (debug_alloc_pool + prev->next);
- }
- prev_offset = (char *)prev - debug_alloc_pool;
- if (prev_offset + dah_overhead + prev->size == h_offset) {
- prev->size += dah_overhead + h->size;
- memset(h, POISON_FREE, dah_overhead - 1);
- *((char *)h + dah_overhead - 1) = POISON_END;
- h = prev;
- h_offset = prev_offset;
- } else {
- h->next = prev->next;
- prev->next = h_offset;
- }
- }
- if (h_offset + dah_overhead + h->size == h->next) {
- struct debug_alloc_header *next;
- next = (struct debug_alloc_header *)
- (debug_alloc_pool + h->next);
- h->size += dah_overhead + next->size;
- h->next = next->next;
- memset(next, POISON_FREE, dah_overhead - 1);
- *((char *)next + dah_overhead - 1) = POISON_END;
- }
- spin_unlock(&dap_lock);
-}
-
-void debug_kusage(void)
-{
- struct debug_alloc_header *h_free, *h_used;
-#ifdef CONFIG_IA64
- /* FIXME: using dah for ia64 unwind always results in a memory leak.
- * Fix that memory leak first, then set debug_kusage_one_time = 1 for
- * all architectures.
+ /* If there is no mask, then we will filter code that runs when the
+ * scheduler is idling and any system daemons that are currently
+ * sleeping.
*/
- static int debug_kusage_one_time;
-#else
- static int debug_kusage_one_time = 1;
-#endif
- if (!get_dap_lock()) {
- __release(dap_lock); /* we never actually got it */
- return;
- }
- h_free = (struct debug_alloc_header *)(debug_alloc_pool + dah_first);
- if (dah_first == 0 &&
- (h_free->size == sizeof(debug_alloc_pool_aligned) - dah_overhead ||
- dah_first_call))
- goto out;
- if (!debug_kusage_one_time)
- goto out;
- debug_kusage_one_time = 0;
- kdb_printf("%s: debug_kmalloc memory leak dah_first %d\n",
- __func__, dah_first);
- if (dah_first) {
- h_used = (struct debug_alloc_header *)debug_alloc_pool;
- kdb_printf("%s: h_used %p size %d\n", __func__, h_used,
- h_used->size);
- }
- do {
- h_used = (struct debug_alloc_header *)
- ((char *)h_free + dah_overhead + h_free->size);
- kdb_printf("%s: h_used %p size %d caller %p\n",
- __func__, h_used, h_used->size, h_used->caller);
- h_free = (struct debug_alloc_header *)
- (debug_alloc_pool + h_free->next);
- } while (h_free->next);
- h_used = (struct debug_alloc_header *)
- ((char *)h_free + dah_overhead + h_free->size);
- if ((char *)h_used - debug_alloc_pool !=
- sizeof(debug_alloc_pool_aligned))
- kdb_printf("%s: h_used %p size %d caller %p\n",
- __func__, h_used, h_used->size, h_used->caller);
-out:
- spin_unlock(&dap_lock);
-}
-
-/* Maintain a small stack of kdb_flags to allow recursion without disturbing
- * the global kdb state.
- */
+ if (!mask || mask[0] == '\0')
+ return !strchr("-ims", state);
-static int kdb_flags_stack[4], kdb_flags_index;
+ /* A is a special case that matches all states */
+ if (strchr(mask, 'A'))
+ return true;
-void kdb_save_flags(void)
-{
- BUG_ON(kdb_flags_index >= ARRAY_SIZE(kdb_flags_stack));
- kdb_flags_stack[kdb_flags_index++] = kdb_flags;
-}
-
-void kdb_restore_flags(void)
-{
- BUG_ON(kdb_flags_index <= 0);
- kdb_flags = kdb_flags_stack[--kdb_flags_index];
+ return strchr(mask, state);
}
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index ef90b04d783f..30e7912ebb0d 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -1,100 +1,160 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* delayacct.c - per-task delay accounting
*
* Copyright (C) Shailabh Nagar, IBM Corp. 2006
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU General Public License for more details.
*/
#include <linux/sched.h>
+#include <linux/sched/task.h>
+#include <linux/sched/cputime.h>
+#include <linux/sched/clock.h>
#include <linux/slab.h>
#include <linux/taskstats.h>
-#include <linux/time.h>
#include <linux/sysctl.h>
#include <linux/delayacct.h>
#include <linux/module.h>
-int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */
-EXPORT_SYMBOL_GPL(delayacct_on);
+#define UPDATE_DELAY(type) \
+do { \
+ d->type##_delay_max = tsk->delays->type##_delay_max; \
+ d->type##_delay_min = tsk->delays->type##_delay_min; \
+ tmp = d->type##_delay_total + tsk->delays->type##_delay; \
+ d->type##_delay_total = (tmp < d->type##_delay_total) ? 0 : tmp; \
+ d->type##_count += tsk->delays->type##_count; \
+} while (0)
+
+DEFINE_STATIC_KEY_FALSE(delayacct_key);
+int delayacct_on __read_mostly; /* Delay accounting turned on/off */
struct kmem_cache *delayacct_cache;
-static int __init delayacct_setup_disable(char *str)
+static void set_delayacct(bool enabled)
+{
+ if (enabled) {
+ static_branch_enable(&delayacct_key);
+ delayacct_on = 1;
+ } else {
+ delayacct_on = 0;
+ static_branch_disable(&delayacct_key);
+ }
+}
+
+static int __init delayacct_setup_enable(char *str)
{
- delayacct_on = 0;
+ delayacct_on = 1;
return 1;
}
-__setup("nodelayacct", delayacct_setup_disable);
+__setup("delayacct", delayacct_setup_enable);
void delayacct_init(void)
{
- delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC);
+ delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC|SLAB_ACCOUNT);
delayacct_tsk_init(&init_task);
+ set_delayacct(delayacct_on);
}
+#ifdef CONFIG_PROC_SYSCTL
+static int sysctl_delayacct(const struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int state = delayacct_on;
+ struct ctl_table t;
+ int err;
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ t = *table;
+ t.data = &state;
+ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
+ if (err < 0)
+ return err;
+ if (write)
+ set_delayacct(state);
+ return err;
+}
+
+static const struct ctl_table kern_delayacct_table[] = {
+ {
+ .procname = "task_delayacct",
+ .data = NULL,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sysctl_delayacct,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+};
+
+static __init int kernel_delayacct_sysctls_init(void)
+{
+ register_sysctl_init("kernel", kern_delayacct_table);
+ return 0;
+}
+late_initcall(kernel_delayacct_sysctls_init);
+#endif
+
void __delayacct_tsk_init(struct task_struct *tsk)
{
tsk->delays = kmem_cache_zalloc(delayacct_cache, GFP_KERNEL);
if (tsk->delays)
- spin_lock_init(&tsk->delays->lock);
+ raw_spin_lock_init(&tsk->delays->lock);
}
/*
* Finish delay accounting for a statistic using its timestamps (@start),
- * accumalator (@total) and @count
+ * accumulator (@total) and @count
*/
-static void delayacct_end(u64 *start, u64 *total, u32 *count)
+static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total, u32 *count, u64 *max, u64 *min)
{
- s64 ns = ktime_get_ns() - *start;
+ s64 ns = local_clock() - *start;
unsigned long flags;
if (ns > 0) {
- spin_lock_irqsave(&current->delays->lock, flags);
+ raw_spin_lock_irqsave(lock, flags);
*total += ns;
(*count)++;
- spin_unlock_irqrestore(&current->delays->lock, flags);
+ if (ns > *max)
+ *max = ns;
+ if (*min == 0 || ns < *min)
+ *min = ns;
+ raw_spin_unlock_irqrestore(lock, flags);
}
}
void __delayacct_blkio_start(void)
{
- current->delays->blkio_start = ktime_get_ns();
+ current->delays->blkio_start = local_clock();
}
-void __delayacct_blkio_end(void)
+/*
+ * We cannot rely on the `current` macro, as we haven't yet switched back to
+ * the process being woken.
+ */
+void __delayacct_blkio_end(struct task_struct *p)
{
- if (current->delays->flags & DELAYACCT_PF_SWAPIN)
- /* Swapin block I/O */
- delayacct_end(&current->delays->blkio_start,
- &current->delays->swapin_delay,
- &current->delays->swapin_count);
- else /* Other block I/O */
- delayacct_end(&current->delays->blkio_start,
- &current->delays->blkio_delay,
- &current->delays->blkio_count);
+ delayacct_end(&p->delays->lock,
+ &p->delays->blkio_start,
+ &p->delays->blkio_delay,
+ &p->delays->blkio_count,
+ &p->delays->blkio_delay_max,
+ &p->delays->blkio_delay_min);
}
-int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
+int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
{
- cputime_t utime, stime, stimescaled, utimescaled;
+ u64 utime, stime, stimescaled, utimescaled;
unsigned long long t2, t3;
unsigned long flags, t1;
s64 tmp;
task_cputime(tsk, &utime, &stime);
tmp = (s64)d->cpu_run_real_total;
- tmp += cputime_to_nsecs(utime + stime);
+ tmp += utime + stime;
d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp;
task_cputime_scaled(tsk, &utimescaled, &stimescaled);
tmp = (s64)d->cpu_scaled_run_real_total;
- tmp += cputime_to_nsecs(utimescaled + stimescaled);
+ tmp += utimescaled + stimescaled;
d->cpu_scaled_run_real_total =
(tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp;
@@ -108,26 +168,28 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
d->cpu_count += t1;
+ d->cpu_delay_max = tsk->sched_info.max_run_delay;
+ d->cpu_delay_min = tsk->sched_info.min_run_delay;
tmp = (s64)d->cpu_delay_total + t2;
d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp;
-
tmp = (s64)d->cpu_run_virtual_total + t3;
+
d->cpu_run_virtual_total =
(tmp < (s64)d->cpu_run_virtual_total) ? 0 : tmp;
- /* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */
+ if (!tsk->delays)
+ return 0;
- spin_lock_irqsave(&tsk->delays->lock, flags);
- tmp = d->blkio_delay_total + tsk->delays->blkio_delay;
- d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp;
- tmp = d->swapin_delay_total + tsk->delays->swapin_delay;
- d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp;
- tmp = d->freepages_delay_total + tsk->delays->freepages_delay;
- d->freepages_delay_total = (tmp < d->freepages_delay_total) ? 0 : tmp;
- d->blkio_count += tsk->delays->blkio_count;
- d->swapin_count += tsk->delays->swapin_count;
- d->freepages_count += tsk->delays->freepages_count;
- spin_unlock_irqrestore(&tsk->delays->lock, flags);
+ /* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */
+ raw_spin_lock_irqsave(&tsk->delays->lock, flags);
+ UPDATE_DELAY(blkio);
+ UPDATE_DELAY(swapin);
+ UPDATE_DELAY(freepages);
+ UPDATE_DELAY(thrashing);
+ UPDATE_DELAY(compact);
+ UPDATE_DELAY(wpcopy);
+ UPDATE_DELAY(irq);
+ raw_spin_unlock_irqrestore(&tsk->delays->lock, flags);
return 0;
}
@@ -137,22 +199,107 @@ __u64 __delayacct_blkio_ticks(struct task_struct *tsk)
__u64 ret;
unsigned long flags;
- spin_lock_irqsave(&tsk->delays->lock, flags);
- ret = nsec_to_clock_t(tsk->delays->blkio_delay +
- tsk->delays->swapin_delay);
- spin_unlock_irqrestore(&tsk->delays->lock, flags);
+ raw_spin_lock_irqsave(&tsk->delays->lock, flags);
+ ret = nsec_to_clock_t(tsk->delays->blkio_delay);
+ raw_spin_unlock_irqrestore(&tsk->delays->lock, flags);
return ret;
}
void __delayacct_freepages_start(void)
{
- current->delays->freepages_start = ktime_get_ns();
+ current->delays->freepages_start = local_clock();
}
void __delayacct_freepages_end(void)
{
- delayacct_end(&current->delays->freepages_start,
- &current->delays->freepages_delay,
- &current->delays->freepages_count);
+ delayacct_end(&current->delays->lock,
+ &current->delays->freepages_start,
+ &current->delays->freepages_delay,
+ &current->delays->freepages_count,
+ &current->delays->freepages_delay_max,
+ &current->delays->freepages_delay_min);
+}
+
+void __delayacct_thrashing_start(bool *in_thrashing)
+{
+ *in_thrashing = !!current->in_thrashing;
+ if (*in_thrashing)
+ return;
+
+ current->in_thrashing = 1;
+ current->delays->thrashing_start = local_clock();
+}
+
+void __delayacct_thrashing_end(bool *in_thrashing)
+{
+ if (*in_thrashing)
+ return;
+
+ current->in_thrashing = 0;
+ delayacct_end(&current->delays->lock,
+ &current->delays->thrashing_start,
+ &current->delays->thrashing_delay,
+ &current->delays->thrashing_count,
+ &current->delays->thrashing_delay_max,
+ &current->delays->thrashing_delay_min);
+}
+
+void __delayacct_swapin_start(void)
+{
+ current->delays->swapin_start = local_clock();
+}
+
+void __delayacct_swapin_end(void)
+{
+ delayacct_end(&current->delays->lock,
+ &current->delays->swapin_start,
+ &current->delays->swapin_delay,
+ &current->delays->swapin_count,
+ &current->delays->swapin_delay_max,
+ &current->delays->swapin_delay_min);
+}
+
+void __delayacct_compact_start(void)
+{
+ current->delays->compact_start = local_clock();
+}
+
+void __delayacct_compact_end(void)
+{
+ delayacct_end(&current->delays->lock,
+ &current->delays->compact_start,
+ &current->delays->compact_delay,
+ &current->delays->compact_count,
+ &current->delays->compact_delay_max,
+ &current->delays->compact_delay_min);
+}
+
+void __delayacct_wpcopy_start(void)
+{
+ current->delays->wpcopy_start = local_clock();
+}
+
+void __delayacct_wpcopy_end(void)
+{
+ delayacct_end(&current->delays->lock,
+ &current->delays->wpcopy_start,
+ &current->delays->wpcopy_delay,
+ &current->delays->wpcopy_count,
+ &current->delays->wpcopy_delay_max,
+ &current->delays->wpcopy_delay_min);
+}
+
+void __delayacct_irq(struct task_struct *task, u32 delta)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&task->delays->lock, flags);
+ task->delays->irq_delay += delta;
+ task->delays->irq_count++;
+ if (delta > task->delays->irq_delay_max)
+ task->delays->irq_delay_max = delta;
+ if (delta && (!task->delays->irq_delay_min || delta < task->delays->irq_delay_min))
+ task->delays->irq_delay_min = delta;
+ raw_spin_unlock_irqrestore(&task->delays->lock, flags);
}
diff --git a/kernel/dma.c b/kernel/dma.c
index 6c6262f86c17..40f152936316 100644
--- a/kernel/dma.c
+++ b/kernel/dma.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/kernel/dma.c: A DMA channel allocator. Inspired by linux/kernel/irq.c.
*
@@ -134,21 +135,9 @@ static int proc_dma_show(struct seq_file *m, void *v)
}
#endif /* MAX_DMA_CHANNELS */
-static int proc_dma_open(struct inode *inode, struct file *file)
-{
- return single_open(file, proc_dma_show, NULL);
-}
-
-static const struct file_operations proc_dma_operations = {
- .open = proc_dma_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
static int __init proc_dma_init(void)
{
- proc_create("dma", 0, NULL, &proc_dma_operations);
+ proc_create_single("dma", 0, NULL, proc_dma_show);
return 0;
}
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
new file mode 100644
index 000000000000..31cfdb6b4bc3
--- /dev/null
+++ b/kernel/dma/Kconfig
@@ -0,0 +1,270 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+config NO_DMA
+ bool
+
+config HAS_DMA
+ bool
+ depends on !NO_DMA
+ default y
+
+config DMA_OPS_HELPERS
+ bool
+
+#
+# IOMMU drivers that can bypass the IOMMU code and optionally use the direct
+# mapping fast path should select this option and set the dma_ops_bypass
+# flag in struct device where applicable
+#
+config DMA_OPS_BYPASS
+ bool
+
+# Lets platform IOMMU driver choose between bypass and IOMMU
+config ARCH_HAS_DMA_MAP_DIRECT
+ bool
+
+config NEED_SG_DMA_FLAGS
+ bool
+
+config NEED_SG_DMA_LENGTH
+ bool
+
+config NEED_DMA_MAP_STATE
+ bool
+
+config ARCH_DMA_ADDR_T_64BIT
+ def_bool 64BIT || PHYS_ADDR_T_64BIT
+
+config ARCH_HAS_DMA_SET_MASK
+ bool
+
+#
+# Select this option if the architecture needs special handling for
+# DMA_ATTR_WRITE_COMBINE. Normally the "uncached" mapping should be what
+# people think of when saying write combine, so very few platforms should
+# need to enable this.
+#
+config ARCH_HAS_DMA_WRITE_COMBINE
+ bool
+
+#
+# Select if the architectures provides the arch_dma_mark_clean hook
+#
+config ARCH_HAS_DMA_MARK_CLEAN
+ bool
+
+config DMA_DECLARE_COHERENT
+ bool
+
+config ARCH_HAS_SETUP_DMA_OPS
+ bool
+
+config ARCH_HAS_TEARDOWN_DMA_OPS
+ bool
+
+config ARCH_HAS_SYNC_DMA_FOR_DEVICE
+ bool
+
+config ARCH_HAS_SYNC_DMA_FOR_CPU
+ bool
+ select NEED_DMA_MAP_STATE
+
+config ARCH_HAS_SYNC_DMA_FOR_CPU_ALL
+ bool
+
+config ARCH_HAS_DMA_PREP_COHERENT
+ bool
+
+config ARCH_HAS_FORCE_DMA_UNENCRYPTED
+ bool
+
+#
+# Select this option if the architecture assumes DMA devices are coherent
+# by default.
+#
+config ARCH_DMA_DEFAULT_COHERENT
+ bool
+
+config SWIOTLB
+ bool
+ select NEED_DMA_MAP_STATE
+
+config SWIOTLB_DYNAMIC
+ bool "Dynamic allocation of DMA bounce buffers"
+ default n
+ depends on SWIOTLB
+ help
+ This enables dynamic resizing of the software IO TLB. The kernel
+ starts with one memory pool at boot and it will allocate additional
+ pools as needed. To reduce run-time kernel memory requirements, you
+ may have to specify a smaller size of the initial pool using
+ "swiotlb=" on the kernel command line.
+
+ If unsure, say N.
+
+config DMA_BOUNCE_UNALIGNED_KMALLOC
+ bool
+ depends on SWIOTLB
+
+config DMA_NEED_SYNC
+ def_bool ARCH_HAS_SYNC_DMA_FOR_DEVICE || ARCH_HAS_SYNC_DMA_FOR_CPU || \
+ ARCH_HAS_SYNC_DMA_FOR_CPU_ALL || DMA_API_DEBUG || \
+ ARCH_HAS_DMA_OPS || SWIOTLB
+
+config DMA_RESTRICTED_POOL
+ bool "DMA Restricted Pool"
+ depends on OF && OF_RESERVED_MEM && SWIOTLB
+ help
+ This enables support for restricted DMA pools which provide a level of
+ DMA memory protection on systems with limited hardware protection
+ capabilities, such as those lacking an IOMMU.
+
+ For more information see
+ <Documentation/devicetree/bindings/reserved-memory/reserved-memory.txt>
+ and <kernel/dma/swiotlb.c>.
+ If unsure, say "n".
+
+#
+# Should be selected if we can mmap non-coherent mappings to userspace.
+# The only thing that is really required is a way to set an uncached bit
+# in the pagetables
+#
+config DMA_NONCOHERENT_MMAP
+ default y if !MMU
+ bool
+
+config DMA_COHERENT_POOL
+ select GENERIC_ALLOCATOR
+ bool
+
+config DMA_GLOBAL_POOL
+ select DMA_DECLARE_COHERENT
+ depends on !ARCH_HAS_DMA_SET_UNCACHED
+ depends on !DMA_DIRECT_REMAP
+ bool
+
+config DMA_DIRECT_REMAP
+ bool
+ select DMA_COHERENT_POOL
+ select DMA_NONCOHERENT_MMAP
+
+#
+# Fallback to arch code for DMA allocations. This should eventually go away.
+#
+config ARCH_HAS_DMA_ALLOC
+ depends on !ARCH_HAS_DMA_SET_UNCACHED
+ depends on !DMA_DIRECT_REMAP
+ depends on !DMA_GLOBAL_POOL
+ bool
+
+config DMA_CMA
+ bool "DMA Contiguous Memory Allocator"
+ depends on HAVE_DMA_CONTIGUOUS && CMA
+ help
+ This enables the Contiguous Memory Allocator which allows drivers
+ to allocate big physically-contiguous blocks of memory for use with
+ hardware components that do not support I/O map nor scatter-gather.
+
+ You can disable CMA by specifying "cma=0" on the kernel's command
+ line.
+
+ For more information see <kernel/dma/contiguous.c>.
+ If unsure, say "n".
+
+if DMA_CMA
+
+config DMA_NUMA_CMA
+ bool "Enable separate DMA Contiguous Memory Area for NUMA Node"
+ depends on NUMA
+ help
+ Enable this option to get numa CMA areas so that NUMA devices
+ can get local memory by DMA coherent APIs.
+
+ You can set the size of pernuma CMA by specifying "cma_pernuma=size"
+ or set the node id and its size of CMA by specifying "numa_cma=
+ <node>:size[,<node>:size]" on the kernel's command line.
+
+comment "Default contiguous memory area size:"
+
+config CMA_SIZE_MBYTES
+ int "Size in Mega Bytes"
+ depends on !CMA_SIZE_SEL_PERCENTAGE
+ default 0 if X86
+ default 16
+ help
+ Defines the size (in MiB) of the default memory area for Contiguous
+ Memory Allocator. If the size of 0 is selected, CMA is disabled by
+ default, but it can be enabled by passing cma=size[MG] to the kernel.
+
+
+config CMA_SIZE_PERCENTAGE
+ int "Percentage of total memory"
+ depends on !CMA_SIZE_SEL_MBYTES
+ default 0 if X86
+ default 10
+ help
+ Defines the size of the default memory area for Contiguous Memory
+ Allocator as a percentage of the total memory in the system.
+ If 0 percent is selected, CMA is disabled by default, but it can be
+ enabled by passing cma=size[MG] to the kernel.
+
+choice
+ prompt "Selected region size"
+ default CMA_SIZE_SEL_MBYTES
+
+config CMA_SIZE_SEL_MBYTES
+ bool "Use mega bytes value only"
+
+config CMA_SIZE_SEL_PERCENTAGE
+ bool "Use percentage value only"
+
+config CMA_SIZE_SEL_MIN
+ bool "Use lower value (minimum)"
+
+config CMA_SIZE_SEL_MAX
+ bool "Use higher value (maximum)"
+
+endchoice
+
+config CMA_ALIGNMENT
+ int "Maximum PAGE_SIZE order of alignment for contiguous buffers"
+ range 2 12
+ default 8
+ help
+ DMA mapping framework by default aligns all buffers to the smallest
+ PAGE_SIZE order which is greater than or equal to the requested buffer
+ size. This works well for buffers up to a few hundreds kilobytes, but
+ for larger buffers it just a memory waste. With this parameter you can
+ specify the maximum PAGE_SIZE order for contiguous buffers. Larger
+ buffers will be aligned only to this specified order. The order is
+ expressed as a power of two multiplied by the PAGE_SIZE.
+
+ For example, if your system defaults to 4KiB pages, the order value
+ of 8 means that the buffers will be aligned up to 1MiB only.
+
+ If unsure, leave the default value "8".
+
+endif
+
+config DMA_API_DEBUG
+ bool "Enable debugging of DMA-API usage"
+ select NEED_DMA_MAP_STATE
+ help
+ Enable this option to debug the use of the DMA API by device drivers.
+ With this option you will be able to detect common bugs in device
+ drivers like double-freeing of DMA mappings or freeing mappings that
+ were never allocated.
+
+ This option causes a performance degradation. Use only if you want to
+ debug device drivers and dma interactions.
+
+ If unsure, say N.
+
+config DMA_MAP_BENCHMARK
+ bool "Enable benchmarking of streaming DMA mapping"
+ depends on DEBUG_FS
+ help
+ Provides /sys/kernel/debug/dma_map_benchmark that helps with testing
+ performance of dma_(un)map_page.
+
+ See tools/testing/selftests/dma/dma_map_benchmark.c
diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile
new file mode 100644
index 000000000000..6977033444a3
--- /dev/null
+++ b/kernel/dma/Makefile
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-$(CONFIG_HAS_DMA) += mapping.o direct.o
+obj-$(CONFIG_DMA_OPS_HELPERS) += ops_helpers.o
+obj-$(CONFIG_ARCH_HAS_DMA_OPS) += dummy.o
+obj-$(CONFIG_DMA_CMA) += contiguous.o
+obj-$(CONFIG_DMA_DECLARE_COHERENT) += coherent.o
+obj-$(CONFIG_DMA_API_DEBUG) += debug.o
+obj-$(CONFIG_SWIOTLB) += swiotlb.o
+obj-$(CONFIG_DMA_COHERENT_POOL) += pool.o
+obj-$(CONFIG_MMU) += remap.o
+obj-$(CONFIG_DMA_MAP_BENCHMARK) += map_benchmark.o
diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c
new file mode 100644
index 000000000000..77c8d9487a9a
--- /dev/null
+++ b/kernel/dma/coherent.c
@@ -0,0 +1,411 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Coherent per-device memory handling.
+ * Borrowed from i386
+ */
+#include <linux/io.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/dma-direct.h>
+#include <linux/dma-map-ops.h>
+
+struct dma_coherent_mem {
+ void *virt_base;
+ dma_addr_t device_base;
+ unsigned long pfn_base;
+ int size;
+ unsigned long *bitmap;
+ spinlock_t spinlock;
+ bool use_dev_dma_pfn_offset;
+};
+
+static inline struct dma_coherent_mem *dev_get_coherent_memory(struct device *dev)
+{
+ if (dev && dev->dma_mem)
+ return dev->dma_mem;
+ return NULL;
+}
+
+static inline dma_addr_t dma_get_device_base(struct device *dev,
+ struct dma_coherent_mem * mem)
+{
+ if (mem->use_dev_dma_pfn_offset)
+ return phys_to_dma(dev, PFN_PHYS(mem->pfn_base));
+ return mem->device_base;
+}
+
+static struct dma_coherent_mem *dma_init_coherent_memory(phys_addr_t phys_addr,
+ dma_addr_t device_addr, size_t size, bool use_dma_pfn_offset)
+{
+ struct dma_coherent_mem *dma_mem;
+ int pages = size >> PAGE_SHIFT;
+ void *mem_base;
+
+ if (!size)
+ return ERR_PTR(-EINVAL);
+
+ mem_base = memremap(phys_addr, size, MEMREMAP_WC);
+ if (!mem_base)
+ return ERR_PTR(-EINVAL);
+
+ dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
+ if (!dma_mem)
+ goto out_unmap_membase;
+ dma_mem->bitmap = bitmap_zalloc(pages, GFP_KERNEL);
+ if (!dma_mem->bitmap)
+ goto out_free_dma_mem;
+
+ dma_mem->virt_base = mem_base;
+ dma_mem->device_base = device_addr;
+ dma_mem->pfn_base = PFN_DOWN(phys_addr);
+ dma_mem->size = pages;
+ dma_mem->use_dev_dma_pfn_offset = use_dma_pfn_offset;
+ spin_lock_init(&dma_mem->spinlock);
+
+ return dma_mem;
+
+out_free_dma_mem:
+ kfree(dma_mem);
+out_unmap_membase:
+ memunmap(mem_base);
+ pr_err("Reserved memory: failed to init DMA memory pool at %pa, size %zd MiB\n",
+ &phys_addr, size / SZ_1M);
+ return ERR_PTR(-ENOMEM);
+}
+
+static void _dma_release_coherent_memory(struct dma_coherent_mem *mem)
+{
+ if (!mem)
+ return;
+
+ memunmap(mem->virt_base);
+ bitmap_free(mem->bitmap);
+ kfree(mem);
+}
+
+static int dma_assign_coherent_memory(struct device *dev,
+ struct dma_coherent_mem *mem)
+{
+ if (!dev)
+ return -ENODEV;
+
+ if (dev->dma_mem)
+ return -EBUSY;
+
+ dev->dma_mem = mem;
+ return 0;
+}
+
+/*
+ * Declare a region of memory to be handed out by dma_alloc_coherent() when it
+ * is asked for coherent memory for this device. This shall only be used
+ * from platform code, usually based on the device tree description.
+ *
+ * phys_addr is the CPU physical address to which the memory is currently
+ * assigned (this will be ioremapped so the CPU can access the region).
+ *
+ * device_addr is the DMA address the device needs to be programmed with to
+ * actually address this memory (this will be handed out as the dma_addr_t in
+ * dma_alloc_coherent()).
+ *
+ * size is the size of the area (must be a multiple of PAGE_SIZE).
+ *
+ * As a simplification for the platforms, only *one* such region of memory may
+ * be declared per device.
+ */
+int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr,
+ dma_addr_t device_addr, size_t size)
+{
+ struct dma_coherent_mem *mem;
+ int ret;
+
+ mem = dma_init_coherent_memory(phys_addr, device_addr, size, false);
+ if (IS_ERR(mem))
+ return PTR_ERR(mem);
+
+ ret = dma_assign_coherent_memory(dev, mem);
+ if (ret)
+ _dma_release_coherent_memory(mem);
+ return ret;
+}
+
+void dma_release_coherent_memory(struct device *dev)
+{
+ if (dev) {
+ _dma_release_coherent_memory(dev->dma_mem);
+ dev->dma_mem = NULL;
+ }
+}
+
+static void *__dma_alloc_from_coherent(struct device *dev,
+ struct dma_coherent_mem *mem,
+ ssize_t size, dma_addr_t *dma_handle)
+{
+ int order = get_order(size);
+ unsigned long flags;
+ int pageno;
+ void *ret;
+
+ spin_lock_irqsave(&mem->spinlock, flags);
+
+ if (unlikely(size > ((dma_addr_t)mem->size << PAGE_SHIFT)))
+ goto err;
+
+ pageno = bitmap_find_free_region(mem->bitmap, mem->size, order);
+ if (unlikely(pageno < 0))
+ goto err;
+
+ /*
+ * Memory was found in the coherent area.
+ */
+ *dma_handle = dma_get_device_base(dev, mem) +
+ ((dma_addr_t)pageno << PAGE_SHIFT);
+ ret = mem->virt_base + ((dma_addr_t)pageno << PAGE_SHIFT);
+ spin_unlock_irqrestore(&mem->spinlock, flags);
+ memset(ret, 0, size);
+ return ret;
+err:
+ spin_unlock_irqrestore(&mem->spinlock, flags);
+ return NULL;
+}
+
+/**
+ * dma_alloc_from_dev_coherent() - allocate memory from device coherent pool
+ * @dev: device from which we allocate memory
+ * @size: size of requested memory area
+ * @dma_handle: This will be filled with the correct dma handle
+ * @ret: This pointer will be filled with the virtual address
+ * to allocated area.
+ *
+ * This function should be only called from per-arch dma_alloc_coherent()
+ * to support allocation from per-device coherent memory pools.
+ *
+ * Returns 0 if dma_alloc_coherent should continue with allocating from
+ * generic memory areas, or !0 if dma_alloc_coherent should return @ret.
+ */
+int dma_alloc_from_dev_coherent(struct device *dev, ssize_t size,
+ dma_addr_t *dma_handle, void **ret)
+{
+ struct dma_coherent_mem *mem = dev_get_coherent_memory(dev);
+
+ if (!mem)
+ return 0;
+
+ *ret = __dma_alloc_from_coherent(dev, mem, size, dma_handle);
+ return 1;
+}
+
+static int __dma_release_from_coherent(struct dma_coherent_mem *mem,
+ int order, void *vaddr)
+{
+ if (mem && vaddr >= mem->virt_base && vaddr <
+ (mem->virt_base + ((dma_addr_t)mem->size << PAGE_SHIFT))) {
+ int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
+ unsigned long flags;
+
+ spin_lock_irqsave(&mem->spinlock, flags);
+ bitmap_release_region(mem->bitmap, page, order);
+ spin_unlock_irqrestore(&mem->spinlock, flags);
+ return 1;
+ }
+ return 0;
+}
+
+/**
+ * dma_release_from_dev_coherent() - free memory to device coherent memory pool
+ * @dev: device from which the memory was allocated
+ * @order: the order of pages allocated
+ * @vaddr: virtual address of allocated pages
+ *
+ * This checks whether the memory was allocated from the per-device
+ * coherent memory pool and if so, releases that memory.
+ *
+ * Returns 1 if we correctly released the memory, or 0 if the caller should
+ * proceed with releasing memory from generic pools.
+ */
+int dma_release_from_dev_coherent(struct device *dev, int order, void *vaddr)
+{
+ struct dma_coherent_mem *mem = dev_get_coherent_memory(dev);
+
+ return __dma_release_from_coherent(mem, order, vaddr);
+}
+
+static int __dma_mmap_from_coherent(struct dma_coherent_mem *mem,
+ struct vm_area_struct *vma, void *vaddr, size_t size, int *ret)
+{
+ if (mem && vaddr >= mem->virt_base && vaddr + size <=
+ (mem->virt_base + ((dma_addr_t)mem->size << PAGE_SHIFT))) {
+ unsigned long off = vma->vm_pgoff;
+ int start = (vaddr - mem->virt_base) >> PAGE_SHIFT;
+ unsigned long user_count = vma_pages(vma);
+ int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+
+ *ret = -ENXIO;
+ if (off < count && user_count <= count - off) {
+ unsigned long pfn = mem->pfn_base + start + off;
+ *ret = remap_pfn_range(vma, vma->vm_start, pfn,
+ user_count << PAGE_SHIFT,
+ vma->vm_page_prot);
+ }
+ return 1;
+ }
+ return 0;
+}
+
+/**
+ * dma_mmap_from_dev_coherent() - mmap memory from the device coherent pool
+ * @dev: device from which the memory was allocated
+ * @vma: vm_area for the userspace memory
+ * @vaddr: cpu address returned by dma_alloc_from_dev_coherent
+ * @size: size of the memory buffer allocated
+ * @ret: result from remap_pfn_range()
+ *
+ * This checks whether the memory was allocated from the per-device
+ * coherent memory pool and if so, maps that memory to the provided vma.
+ *
+ * Returns 1 if @vaddr belongs to the device coherent pool and the caller
+ * should return @ret, or 0 if they should proceed with mapping memory from
+ * generic areas.
+ */
+int dma_mmap_from_dev_coherent(struct device *dev, struct vm_area_struct *vma,
+ void *vaddr, size_t size, int *ret)
+{
+ struct dma_coherent_mem *mem = dev_get_coherent_memory(dev);
+
+ return __dma_mmap_from_coherent(mem, vma, vaddr, size, ret);
+}
+
+#ifdef CONFIG_DMA_GLOBAL_POOL
+static struct dma_coherent_mem *dma_coherent_default_memory __ro_after_init;
+
+void *dma_alloc_from_global_coherent(struct device *dev, ssize_t size,
+ dma_addr_t *dma_handle)
+{
+ if (!dma_coherent_default_memory)
+ return NULL;
+
+ return __dma_alloc_from_coherent(dev, dma_coherent_default_memory, size,
+ dma_handle);
+}
+
+int dma_release_from_global_coherent(int order, void *vaddr)
+{
+ if (!dma_coherent_default_memory)
+ return 0;
+
+ return __dma_release_from_coherent(dma_coherent_default_memory, order,
+ vaddr);
+}
+
+int dma_mmap_from_global_coherent(struct vm_area_struct *vma, void *vaddr,
+ size_t size, int *ret)
+{
+ if (!dma_coherent_default_memory)
+ return 0;
+
+ return __dma_mmap_from_coherent(dma_coherent_default_memory, vma,
+ vaddr, size, ret);
+}
+
+int dma_init_global_coherent(phys_addr_t phys_addr, size_t size)
+{
+ struct dma_coherent_mem *mem;
+
+ mem = dma_init_coherent_memory(phys_addr, phys_addr, size, true);
+ if (IS_ERR(mem))
+ return PTR_ERR(mem);
+ dma_coherent_default_memory = mem;
+ pr_info("DMA: default coherent area is set\n");
+ return 0;
+}
+#endif /* CONFIG_DMA_GLOBAL_POOL */
+
+/*
+ * Support for reserved memory regions defined in device tree
+ */
+#ifdef CONFIG_OF_RESERVED_MEM
+#include <linux/of.h>
+#include <linux/of_fdt.h>
+#include <linux/of_reserved_mem.h>
+
+#ifdef CONFIG_DMA_GLOBAL_POOL
+static phys_addr_t dma_reserved_default_memory_base __initdata;
+static phys_addr_t dma_reserved_default_memory_size __initdata;
+#endif
+
+static int rmem_dma_device_init(struct reserved_mem *rmem, struct device *dev)
+{
+ struct dma_coherent_mem *mem = rmem->priv;
+
+ if (!mem) {
+ mem = dma_init_coherent_memory(rmem->base, rmem->base,
+ rmem->size, true);
+ if (IS_ERR(mem))
+ return PTR_ERR(mem);
+ rmem->priv = mem;
+ }
+
+ /* Warn if the device potentially can't use the reserved memory */
+ if (mem->device_base + rmem->size - 1 >
+ min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit))
+ dev_warn(dev, "reserved memory is beyond device's set DMA address range\n");
+
+ dma_assign_coherent_memory(dev, mem);
+ return 0;
+}
+
+static void rmem_dma_device_release(struct reserved_mem *rmem,
+ struct device *dev)
+{
+ if (dev)
+ dev->dma_mem = NULL;
+}
+
+static const struct reserved_mem_ops rmem_dma_ops = {
+ .device_init = rmem_dma_device_init,
+ .device_release = rmem_dma_device_release,
+};
+
+static int __init rmem_dma_setup(struct reserved_mem *rmem)
+{
+ unsigned long node = rmem->fdt_node;
+
+ if (of_get_flat_dt_prop(node, "reusable", NULL))
+ return -EINVAL;
+
+#ifdef CONFIG_ARM
+ if (!of_get_flat_dt_prop(node, "no-map", NULL)) {
+ pr_err("Reserved memory: regions without no-map are not yet supported\n");
+ return -EINVAL;
+ }
+#endif
+
+#ifdef CONFIG_DMA_GLOBAL_POOL
+ if (of_get_flat_dt_prop(node, "linux,dma-default", NULL)) {
+ WARN(dma_reserved_default_memory_size,
+ "Reserved memory: region for default DMA coherent area is redefined\n");
+ dma_reserved_default_memory_base = rmem->base;
+ dma_reserved_default_memory_size = rmem->size;
+ }
+#endif
+
+ rmem->ops = &rmem_dma_ops;
+ pr_info("Reserved memory: created DMA memory pool at %pa, size %ld MiB\n",
+ &rmem->base, (unsigned long)rmem->size / SZ_1M);
+ return 0;
+}
+
+#ifdef CONFIG_DMA_GLOBAL_POOL
+static int __init dma_init_reserved_memory(void)
+{
+ if (!dma_reserved_default_memory_size)
+ return -ENOMEM;
+ return dma_init_global_coherent(dma_reserved_default_memory_base,
+ dma_reserved_default_memory_size);
+}
+core_initcall(dma_init_reserved_memory);
+#endif /* CONFIG_DMA_GLOBAL_POOL */
+
+RESERVEDMEM_OF_DECLARE(dma, "shared-dma-pool", rmem_dma_setup);
+#endif
diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
new file mode 100644
index 000000000000..d8fd6f779f79
--- /dev/null
+++ b/kernel/dma/contiguous.c
@@ -0,0 +1,510 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Contiguous Memory Allocator for DMA mapping framework
+ * Copyright (c) 2010-2011 by Samsung Electronics.
+ * Written by:
+ * Marek Szyprowski <m.szyprowski@samsung.com>
+ * Michal Nazarewicz <mina86@mina86.com>
+ *
+ * Contiguous Memory Allocator
+ *
+ * The Contiguous Memory Allocator (CMA) makes it possible to
+ * allocate big contiguous chunks of memory after the system has
+ * booted.
+ *
+ * Why is it needed?
+ *
+ * Various devices on embedded systems have no scatter-getter and/or
+ * IO map support and require contiguous blocks of memory to
+ * operate. They include devices such as cameras, hardware video
+ * coders, etc.
+ *
+ * Such devices often require big memory buffers (a full HD frame
+ * is, for instance, more than 2 mega pixels large, i.e. more than 6
+ * MB of memory), which makes mechanisms such as kmalloc() or
+ * alloc_page() ineffective.
+ *
+ * At the same time, a solution where a big memory region is
+ * reserved for a device is suboptimal since often more memory is
+ * reserved then strictly required and, moreover, the memory is
+ * inaccessible to page system even if device drivers don't use it.
+ *
+ * CMA tries to solve this issue by operating on memory regions
+ * where only movable pages can be allocated from. This way, kernel
+ * can use the memory for pagecache and when device driver requests
+ * it, allocated pages can be migrated.
+ */
+
+#define pr_fmt(fmt) "cma: " fmt
+
+#include <asm/page.h>
+
+#include <linux/memblock.h>
+#include <linux/err.h>
+#include <linux/sizes.h>
+#include <linux/dma-buf/heaps/cma.h>
+#include <linux/dma-map-ops.h>
+#include <linux/cma.h>
+#include <linux/nospec.h>
+
+#ifdef CONFIG_CMA_SIZE_MBYTES
+#define CMA_SIZE_MBYTES CONFIG_CMA_SIZE_MBYTES
+#else
+#define CMA_SIZE_MBYTES 0
+#endif
+
+struct cma *dma_contiguous_default_area;
+
+/*
+ * Default global CMA area size can be defined in kernel's .config.
+ * This is useful mainly for distro maintainers to create a kernel
+ * that works correctly for most supported systems.
+ * The size can be set in bytes or as a percentage of the total memory
+ * in the system.
+ *
+ * Users, who want to set the size of global CMA area for their system
+ * should use cma= kernel parameter.
+ */
+#define size_bytes ((phys_addr_t)CMA_SIZE_MBYTES * SZ_1M)
+static phys_addr_t size_cmdline __initdata = -1;
+static phys_addr_t base_cmdline __initdata;
+static phys_addr_t limit_cmdline __initdata;
+
+static int __init early_cma(char *p)
+{
+ if (!p) {
+ pr_err("Config string not provided\n");
+ return -EINVAL;
+ }
+
+ size_cmdline = memparse(p, &p);
+ if (*p != '@')
+ return 0;
+ base_cmdline = memparse(p + 1, &p);
+ if (*p != '-') {
+ limit_cmdline = base_cmdline + size_cmdline;
+ return 0;
+ }
+ limit_cmdline = memparse(p + 1, &p);
+
+ return 0;
+}
+early_param("cma", early_cma);
+
+#ifdef CONFIG_DMA_NUMA_CMA
+
+static struct cma *dma_contiguous_numa_area[MAX_NUMNODES];
+static phys_addr_t numa_cma_size[MAX_NUMNODES] __initdata;
+static struct cma *dma_contiguous_pernuma_area[MAX_NUMNODES];
+static phys_addr_t pernuma_size_bytes __initdata;
+
+static int __init early_numa_cma(char *p)
+{
+ int nid, count = 0;
+ unsigned long tmp;
+ char *s = p;
+
+ while (*s) {
+ if (sscanf(s, "%lu%n", &tmp, &count) != 1)
+ break;
+
+ if (s[count] == ':') {
+ if (tmp >= MAX_NUMNODES)
+ break;
+ nid = array_index_nospec(tmp, MAX_NUMNODES);
+
+ s += count + 1;
+ tmp = memparse(s, &s);
+ numa_cma_size[nid] = tmp;
+
+ if (*s == ',')
+ s++;
+ else
+ break;
+ } else
+ break;
+ }
+
+ return 0;
+}
+early_param("numa_cma", early_numa_cma);
+
+static int __init early_cma_pernuma(char *p)
+{
+ pernuma_size_bytes = memparse(p, &p);
+ return 0;
+}
+early_param("cma_pernuma", early_cma_pernuma);
+#endif
+
+#ifdef CONFIG_CMA_SIZE_PERCENTAGE
+
+static phys_addr_t __init __maybe_unused cma_early_percent_memory(void)
+{
+ unsigned long total_pages = PHYS_PFN(memblock_phys_mem_size());
+
+ return (total_pages * CONFIG_CMA_SIZE_PERCENTAGE / 100) << PAGE_SHIFT;
+}
+
+#else
+
+static inline __maybe_unused phys_addr_t cma_early_percent_memory(void)
+{
+ return 0;
+}
+
+#endif
+
+#ifdef CONFIG_DMA_NUMA_CMA
+static void __init dma_numa_cma_reserve(void)
+{
+ int nid;
+
+ for_each_node(nid) {
+ int ret;
+ char name[CMA_MAX_NAME];
+ struct cma **cma;
+
+ if (!node_online(nid)) {
+ if (pernuma_size_bytes || numa_cma_size[nid])
+ pr_warn("invalid node %d specified\n", nid);
+ continue;
+ }
+
+ if (pernuma_size_bytes) {
+
+ cma = &dma_contiguous_pernuma_area[nid];
+ snprintf(name, sizeof(name), "pernuma%d", nid);
+ ret = cma_declare_contiguous_nid(0, pernuma_size_bytes, 0, 0,
+ 0, false, name, cma, nid);
+ if (ret)
+ pr_warn("%s: reservation failed: err %d, node %d", __func__,
+ ret, nid);
+ }
+
+ if (numa_cma_size[nid]) {
+
+ cma = &dma_contiguous_numa_area[nid];
+ snprintf(name, sizeof(name), "numa%d", nid);
+ ret = cma_declare_contiguous_nid(0, numa_cma_size[nid], 0, 0, 0, false,
+ name, cma, nid);
+ if (ret)
+ pr_warn("%s: reservation failed: err %d, node %d", __func__,
+ ret, nid);
+ }
+ }
+}
+#else
+static inline void __init dma_numa_cma_reserve(void)
+{
+}
+#endif
+
+/**
+ * dma_contiguous_reserve() - reserve area(s) for contiguous memory handling
+ * @limit: End address of the reserved memory (optional, 0 for any).
+ *
+ * This function reserves memory from early allocator. It should be
+ * called by arch specific code once the early allocator (memblock or bootmem)
+ * has been activated and all other subsystems have already allocated/reserved
+ * memory.
+ */
+void __init dma_contiguous_reserve(phys_addr_t limit)
+{
+ phys_addr_t selected_size = 0;
+ phys_addr_t selected_base = 0;
+ phys_addr_t selected_limit = limit;
+ bool fixed = false;
+
+ dma_numa_cma_reserve();
+
+ pr_debug("%s(limit %08lx)\n", __func__, (unsigned long)limit);
+
+ if (size_cmdline != -1) {
+ selected_size = size_cmdline;
+ selected_base = base_cmdline;
+
+ /* Hornor the user setup dma address limit */
+ selected_limit = limit_cmdline ?: limit;
+
+ if (base_cmdline + size_cmdline == limit_cmdline)
+ fixed = true;
+ } else {
+#ifdef CONFIG_CMA_SIZE_SEL_MBYTES
+ selected_size = size_bytes;
+#elif defined(CONFIG_CMA_SIZE_SEL_PERCENTAGE)
+ selected_size = cma_early_percent_memory();
+#elif defined(CONFIG_CMA_SIZE_SEL_MIN)
+ selected_size = min(size_bytes, cma_early_percent_memory());
+#elif defined(CONFIG_CMA_SIZE_SEL_MAX)
+ selected_size = max(size_bytes, cma_early_percent_memory());
+#endif
+ }
+
+ if (selected_size && !dma_contiguous_default_area) {
+ int ret;
+
+ pr_debug("%s: reserving %ld MiB for global area\n", __func__,
+ (unsigned long)selected_size / SZ_1M);
+
+ dma_contiguous_reserve_area(selected_size, selected_base,
+ selected_limit,
+ &dma_contiguous_default_area,
+ fixed);
+
+ ret = dma_heap_cma_register_heap(dma_contiguous_default_area);
+ if (ret)
+ pr_warn("Couldn't register default CMA heap.");
+ }
+}
+
+void __weak
+dma_contiguous_early_fixup(phys_addr_t base, unsigned long size)
+{
+}
+
+/**
+ * dma_contiguous_reserve_area() - reserve custom contiguous area
+ * @size: Size of the reserved area (in bytes),
+ * @base: Base address of the reserved area optional, use 0 for any
+ * @limit: End address of the reserved memory (optional, 0 for any).
+ * @res_cma: Pointer to store the created cma region.
+ * @fixed: hint about where to place the reserved area
+ *
+ * This function reserves memory from early allocator. It should be
+ * called by arch specific code once the early allocator (memblock or bootmem)
+ * has been activated and all other subsystems have already allocated/reserved
+ * memory. This function allows to create custom reserved areas for specific
+ * devices.
+ *
+ * If @fixed is true, reserve contiguous area at exactly @base. If false,
+ * reserve in range from @base to @limit.
+ */
+int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base,
+ phys_addr_t limit, struct cma **res_cma,
+ bool fixed)
+{
+ int ret;
+
+ ret = cma_declare_contiguous(base, size, limit, 0, 0, fixed,
+ "reserved", res_cma);
+ if (ret)
+ return ret;
+
+ /* Architecture specific contiguous memory fixup. */
+ dma_contiguous_early_fixup(cma_get_base(*res_cma),
+ cma_get_size(*res_cma));
+
+ return 0;
+}
+
+/**
+ * dma_alloc_from_contiguous() - allocate pages from contiguous area
+ * @dev: Pointer to device for which the allocation is performed.
+ * @count: Requested number of pages.
+ * @align: Requested alignment of pages (in PAGE_SIZE order).
+ * @no_warn: Avoid printing message about failed allocation.
+ *
+ * This function allocates memory buffer for specified device. It uses
+ * device specific contiguous memory area if available or the default
+ * global one. Requires architecture specific dev_get_cma_area() helper
+ * function.
+ */
+struct page *dma_alloc_from_contiguous(struct device *dev, size_t count,
+ unsigned int align, bool no_warn)
+{
+ if (align > CONFIG_CMA_ALIGNMENT)
+ align = CONFIG_CMA_ALIGNMENT;
+
+ return cma_alloc(dev_get_cma_area(dev), count, align, no_warn);
+}
+
+/**
+ * dma_release_from_contiguous() - release allocated pages
+ * @dev: Pointer to device for which the pages were allocated.
+ * @pages: Allocated pages.
+ * @count: Number of allocated pages.
+ *
+ * This function releases memory allocated by dma_alloc_from_contiguous().
+ * It returns false when provided pages do not belong to contiguous area and
+ * true otherwise.
+ */
+bool dma_release_from_contiguous(struct device *dev, struct page *pages,
+ int count)
+{
+ return cma_release(dev_get_cma_area(dev), pages, count);
+}
+
+static struct page *cma_alloc_aligned(struct cma *cma, size_t size, gfp_t gfp)
+{
+ unsigned int align = min(get_order(size), CONFIG_CMA_ALIGNMENT);
+
+ return cma_alloc(cma, size >> PAGE_SHIFT, align, gfp & __GFP_NOWARN);
+}
+
+/**
+ * dma_alloc_contiguous() - allocate contiguous pages
+ * @dev: Pointer to device for which the allocation is performed.
+ * @size: Requested allocation size.
+ * @gfp: Allocation flags.
+ *
+ * tries to use device specific contiguous memory area if available, or it
+ * tries to use per-numa cma, if the allocation fails, it will fallback to
+ * try default global one.
+ *
+ * Note that it bypass one-page size of allocations from the per-numa and
+ * global area as the addresses within one page are always contiguous, so
+ * there is no need to waste CMA pages for that kind; it also helps reduce
+ * fragmentations.
+ */
+struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp)
+{
+#ifdef CONFIG_DMA_NUMA_CMA
+ int nid = dev_to_node(dev);
+#endif
+
+ /* CMA can be used only in the context which permits sleeping */
+ if (!gfpflags_allow_blocking(gfp))
+ return NULL;
+ if (dev->cma_area)
+ return cma_alloc_aligned(dev->cma_area, size, gfp);
+ if (size <= PAGE_SIZE)
+ return NULL;
+
+#ifdef CONFIG_DMA_NUMA_CMA
+ if (nid != NUMA_NO_NODE && !(gfp & (GFP_DMA | GFP_DMA32))) {
+ struct cma *cma = dma_contiguous_pernuma_area[nid];
+ struct page *page;
+
+ if (cma) {
+ page = cma_alloc_aligned(cma, size, gfp);
+ if (page)
+ return page;
+ }
+
+ cma = dma_contiguous_numa_area[nid];
+ if (cma) {
+ page = cma_alloc_aligned(cma, size, gfp);
+ if (page)
+ return page;
+ }
+ }
+#endif
+ if (!dma_contiguous_default_area)
+ return NULL;
+
+ return cma_alloc_aligned(dma_contiguous_default_area, size, gfp);
+}
+
+/**
+ * dma_free_contiguous() - release allocated pages
+ * @dev: Pointer to device for which the pages were allocated.
+ * @page: Pointer to the allocated pages.
+ * @size: Size of allocated pages.
+ *
+ * This function releases memory allocated by dma_alloc_contiguous(). As the
+ * cma_release returns false when provided pages do not belong to contiguous
+ * area and true otherwise, this function then does a fallback __free_pages()
+ * upon a false-return.
+ */
+void dma_free_contiguous(struct device *dev, struct page *page, size_t size)
+{
+ unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+
+ /* if dev has its own cma, free page from there */
+ if (dev->cma_area) {
+ if (cma_release(dev->cma_area, page, count))
+ return;
+ } else {
+ /*
+ * otherwise, page is from either per-numa cma or default cma
+ */
+#ifdef CONFIG_DMA_NUMA_CMA
+ if (cma_release(dma_contiguous_pernuma_area[page_to_nid(page)],
+ page, count))
+ return;
+ if (cma_release(dma_contiguous_numa_area[page_to_nid(page)],
+ page, count))
+ return;
+#endif
+ if (cma_release(dma_contiguous_default_area, page, count))
+ return;
+ }
+
+ /* not in any cma, free from buddy */
+ __free_pages(page, get_order(size));
+}
+
+/*
+ * Support for reserved memory regions defined in device tree
+ */
+#ifdef CONFIG_OF_RESERVED_MEM
+#include <linux/of.h>
+#include <linux/of_fdt.h>
+#include <linux/of_reserved_mem.h>
+
+#undef pr_fmt
+#define pr_fmt(fmt) fmt
+
+static int rmem_cma_device_init(struct reserved_mem *rmem, struct device *dev)
+{
+ dev->cma_area = rmem->priv;
+ return 0;
+}
+
+static void rmem_cma_device_release(struct reserved_mem *rmem,
+ struct device *dev)
+{
+ dev->cma_area = NULL;
+}
+
+static const struct reserved_mem_ops rmem_cma_ops = {
+ .device_init = rmem_cma_device_init,
+ .device_release = rmem_cma_device_release,
+};
+
+static int __init rmem_cma_setup(struct reserved_mem *rmem)
+{
+ unsigned long node = rmem->fdt_node;
+ bool default_cma = of_get_flat_dt_prop(node, "linux,cma-default", NULL);
+ struct cma *cma;
+ int err;
+
+ if (size_cmdline != -1 && default_cma) {
+ pr_info("Reserved memory: bypass %s node, using cmdline CMA params instead\n",
+ rmem->name);
+ return -EBUSY;
+ }
+
+ if (!of_get_flat_dt_prop(node, "reusable", NULL) ||
+ of_get_flat_dt_prop(node, "no-map", NULL))
+ return -EINVAL;
+
+ if (!IS_ALIGNED(rmem->base | rmem->size, CMA_MIN_ALIGNMENT_BYTES)) {
+ pr_err("Reserved memory: incorrect alignment of CMA region\n");
+ return -EINVAL;
+ }
+
+ err = cma_init_reserved_mem(rmem->base, rmem->size, 0, rmem->name, &cma);
+ if (err) {
+ pr_err("Reserved memory: unable to setup CMA region\n");
+ return err;
+ }
+
+ if (default_cma)
+ dma_contiguous_default_area = cma;
+
+ rmem->ops = &rmem_cma_ops;
+ rmem->priv = cma;
+
+ pr_info("Reserved memory: created CMA memory pool at %pa, size %ld MiB\n",
+ &rmem->base, (unsigned long)rmem->size / SZ_1M);
+
+ err = dma_heap_cma_register_heap(cma);
+ if (err)
+ pr_warn("Couldn't register CMA heap.");
+
+ return 0;
+}
+RESERVEDMEM_OF_DECLARE(cma, "shared-dma-pool", rmem_cma_setup);
+#endif
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
new file mode 100644
index 000000000000..138ede653de4
--- /dev/null
+++ b/kernel/dma/debug.c
@@ -0,0 +1,1608 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2008 Advanced Micro Devices, Inc.
+ *
+ * Author: Joerg Roedel <joerg.roedel@amd.com>
+ */
+
+#define pr_fmt(fmt) "DMA-API: " fmt
+
+#include <linux/sched/task_stack.h>
+#include <linux/scatterlist.h>
+#include <linux/dma-map-ops.h>
+#include <linux/sched/task.h>
+#include <linux/stacktrace.h>
+#include <linux/spinlock.h>
+#include <linux/vmalloc.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/export.h>
+#include <linux/device.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/ctype.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/swiotlb.h>
+#include <asm/sections.h>
+#include "debug.h"
+
+#define HASH_SIZE 16384ULL
+#define HASH_FN_SHIFT 13
+#define HASH_FN_MASK (HASH_SIZE - 1)
+
+#define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16)
+/* If the pool runs out, add this many new entries at once */
+#define DMA_DEBUG_DYNAMIC_ENTRIES (PAGE_SIZE / sizeof(struct dma_debug_entry))
+
+enum {
+ dma_debug_single,
+ dma_debug_sg,
+ dma_debug_coherent,
+ dma_debug_noncoherent,
+ dma_debug_phy,
+};
+
+enum map_err_types {
+ MAP_ERR_CHECK_NOT_APPLICABLE,
+ MAP_ERR_NOT_CHECKED,
+ MAP_ERR_CHECKED,
+};
+
+#define DMA_DEBUG_STACKTRACE_ENTRIES 5
+
+/**
+ * struct dma_debug_entry - track a dma_map* or dma_alloc_coherent mapping
+ * @list: node on pre-allocated free_entries list
+ * @dev: 'dev' argument to dma_map_{page|single|sg} or dma_alloc_coherent
+ * @dev_addr: dma address
+ * @size: length of the mapping
+ * @type: single, page, sg, coherent
+ * @direction: enum dma_data_direction
+ * @sg_call_ents: 'nents' from dma_map_sg
+ * @sg_mapped_ents: 'mapped_ents' from dma_map_sg
+ * @paddr: physical start address of the mapping
+ * @map_err_type: track whether dma_mapping_error() was checked
+ * @stack_len: number of backtrace entries in @stack_entries
+ * @stack_entries: stack of backtrace history
+ */
+struct dma_debug_entry {
+ struct list_head list;
+ struct device *dev;
+ u64 dev_addr;
+ u64 size;
+ int type;
+ int direction;
+ int sg_call_ents;
+ int sg_mapped_ents;
+ phys_addr_t paddr;
+ enum map_err_types map_err_type;
+#ifdef CONFIG_STACKTRACE
+ unsigned int stack_len;
+ unsigned long stack_entries[DMA_DEBUG_STACKTRACE_ENTRIES];
+#endif
+} ____cacheline_aligned_in_smp;
+
+typedef bool (*match_fn)(struct dma_debug_entry *, struct dma_debug_entry *);
+
+struct hash_bucket {
+ struct list_head list;
+ spinlock_t lock;
+};
+
+/* Hash list to save the allocated dma addresses */
+static struct hash_bucket dma_entry_hash[HASH_SIZE];
+/* List of pre-allocated dma_debug_entry's */
+static LIST_HEAD(free_entries);
+/* Lock for the list above */
+static DEFINE_SPINLOCK(free_entries_lock);
+
+/* Global disable flag - will be set in case of an error */
+static bool global_disable __read_mostly;
+
+/* Early initialization disable flag, set at the end of dma_debug_init */
+static bool dma_debug_initialized __read_mostly;
+
+static inline bool dma_debug_disabled(void)
+{
+ return global_disable || !dma_debug_initialized;
+}
+
+/* Global error count */
+static u32 error_count;
+
+/* Global error show enable*/
+static u32 show_all_errors __read_mostly;
+/* Number of errors to show */
+static u32 show_num_errors = 1;
+
+static u32 num_free_entries;
+static u32 min_free_entries;
+static u32 nr_total_entries;
+
+/* number of preallocated entries requested by kernel cmdline */
+static u32 nr_prealloc_entries = PREALLOC_DMA_DEBUG_ENTRIES;
+
+/* per-driver filter related state */
+
+#define NAME_MAX_LEN 64
+
+static char current_driver_name[NAME_MAX_LEN] __read_mostly;
+static struct device_driver *current_driver __read_mostly;
+
+static DEFINE_RWLOCK(driver_name_lock);
+
+static const char *const maperr2str[] = {
+ [MAP_ERR_CHECK_NOT_APPLICABLE] = "dma map error check not applicable",
+ [MAP_ERR_NOT_CHECKED] = "dma map error not checked",
+ [MAP_ERR_CHECKED] = "dma map error checked",
+};
+
+static const char *type2name[] = {
+ [dma_debug_single] = "single",
+ [dma_debug_sg] = "scatter-gather",
+ [dma_debug_coherent] = "coherent",
+ [dma_debug_noncoherent] = "noncoherent",
+ [dma_debug_phy] = "phy",
+};
+
+static const char *dir2name[] = {
+ [DMA_BIDIRECTIONAL] = "DMA_BIDIRECTIONAL",
+ [DMA_TO_DEVICE] = "DMA_TO_DEVICE",
+ [DMA_FROM_DEVICE] = "DMA_FROM_DEVICE",
+ [DMA_NONE] = "DMA_NONE",
+};
+
+/*
+ * The access to some variables in this macro is racy. We can't use atomic_t
+ * here because all these variables are exported to debugfs. Some of them even
+ * writeable. This is also the reason why a lock won't help much. But anyway,
+ * the races are no big deal. Here is why:
+ *
+ * error_count: the addition is racy, but the worst thing that can happen is
+ * that we don't count some errors
+ * show_num_errors: the subtraction is racy. Also no big deal because in
+ * worst case this will result in one warning more in the
+ * system log than the user configured. This variable is
+ * writeable via debugfs.
+ */
+static inline void dump_entry_trace(struct dma_debug_entry *entry)
+{
+#ifdef CONFIG_STACKTRACE
+ if (entry) {
+ pr_warn("Mapped at:\n");
+ stack_trace_print(entry->stack_entries, entry->stack_len, 0);
+ }
+#endif
+}
+
+static bool driver_filter(struct device *dev)
+{
+ struct device_driver *drv;
+ unsigned long flags;
+ bool ret;
+
+ /* driver filter off */
+ if (likely(!current_driver_name[0]))
+ return true;
+
+ /* driver filter on and initialized */
+ if (current_driver && dev && dev->driver == current_driver)
+ return true;
+
+ /* driver filter on, but we can't filter on a NULL device... */
+ if (!dev)
+ return false;
+
+ if (current_driver || !current_driver_name[0])
+ return false;
+
+ /* driver filter on but not yet initialized */
+ drv = dev->driver;
+ if (!drv)
+ return false;
+
+ /* lock to protect against change of current_driver_name */
+ read_lock_irqsave(&driver_name_lock, flags);
+
+ ret = false;
+ if (drv->name &&
+ strncmp(current_driver_name, drv->name, NAME_MAX_LEN - 1) == 0) {
+ current_driver = drv;
+ ret = true;
+ }
+
+ read_unlock_irqrestore(&driver_name_lock, flags);
+
+ return ret;
+}
+
+#define err_printk(dev, entry, format, arg...) do { \
+ error_count += 1; \
+ if (driver_filter(dev) && \
+ (show_all_errors || show_num_errors > 0)) { \
+ WARN(1, pr_fmt("%s %s: ") format, \
+ dev ? dev_driver_string(dev) : "NULL", \
+ dev ? dev_name(dev) : "NULL", ## arg); \
+ dump_entry_trace(entry); \
+ } \
+ if (!show_all_errors && show_num_errors > 0) \
+ show_num_errors -= 1; \
+ } while (0);
+
+/*
+ * Hash related functions
+ *
+ * Every DMA-API request is saved into a struct dma_debug_entry. To
+ * have quick access to these structs they are stored into a hash.
+ */
+static int hash_fn(struct dma_debug_entry *entry)
+{
+ /*
+ * Hash function is based on the dma address.
+ * We use bits 20-27 here as the index into the hash
+ */
+ return (entry->dev_addr >> HASH_FN_SHIFT) & HASH_FN_MASK;
+}
+
+/*
+ * Request exclusive access to a hash bucket for a given dma_debug_entry.
+ */
+static struct hash_bucket *get_hash_bucket(struct dma_debug_entry *entry,
+ unsigned long *flags)
+ __acquires(&dma_entry_hash[idx].lock)
+{
+ int idx = hash_fn(entry);
+ unsigned long __flags;
+
+ spin_lock_irqsave(&dma_entry_hash[idx].lock, __flags);
+ *flags = __flags;
+ return &dma_entry_hash[idx];
+}
+
+/*
+ * Give up exclusive access to the hash bucket
+ */
+static void put_hash_bucket(struct hash_bucket *bucket,
+ unsigned long flags)
+ __releases(&bucket->lock)
+{
+ spin_unlock_irqrestore(&bucket->lock, flags);
+}
+
+static bool exact_match(struct dma_debug_entry *a, struct dma_debug_entry *b)
+{
+ return ((a->dev_addr == b->dev_addr) &&
+ (a->dev == b->dev)) ? true : false;
+}
+
+static bool containing_match(struct dma_debug_entry *a,
+ struct dma_debug_entry *b)
+{
+ if (a->dev != b->dev)
+ return false;
+
+ if ((b->dev_addr <= a->dev_addr) &&
+ ((b->dev_addr + b->size) >= (a->dev_addr + a->size)))
+ return true;
+
+ return false;
+}
+
+/*
+ * Search a given entry in the hash bucket list
+ */
+static struct dma_debug_entry *__hash_bucket_find(struct hash_bucket *bucket,
+ struct dma_debug_entry *ref,
+ match_fn match)
+{
+ struct dma_debug_entry *entry, *ret = NULL;
+ int matches = 0, match_lvl, last_lvl = -1;
+
+ list_for_each_entry(entry, &bucket->list, list) {
+ if (!match(ref, entry))
+ continue;
+
+ /*
+ * Some drivers map the same physical address multiple
+ * times. Without a hardware IOMMU this results in the
+ * same device addresses being put into the dma-debug
+ * hash multiple times too. This can result in false
+ * positives being reported. Therefore we implement a
+ * best-fit algorithm here which returns the entry from
+ * the hash which fits best to the reference value
+ * instead of the first-fit.
+ */
+ matches += 1;
+ match_lvl = 0;
+ entry->size == ref->size ? ++match_lvl : 0;
+ entry->type == ref->type ? ++match_lvl : 0;
+ entry->direction == ref->direction ? ++match_lvl : 0;
+ entry->sg_call_ents == ref->sg_call_ents ? ++match_lvl : 0;
+
+ if (match_lvl == 4) {
+ /* perfect-fit - return the result */
+ return entry;
+ } else if (match_lvl > last_lvl) {
+ /*
+ * We found an entry that fits better then the
+ * previous one or it is the 1st match.
+ */
+ last_lvl = match_lvl;
+ ret = entry;
+ }
+ }
+
+ /*
+ * If we have multiple matches but no perfect-fit, just return
+ * NULL.
+ */
+ ret = (matches == 1) ? ret : NULL;
+
+ return ret;
+}
+
+static struct dma_debug_entry *bucket_find_exact(struct hash_bucket *bucket,
+ struct dma_debug_entry *ref)
+{
+ return __hash_bucket_find(bucket, ref, exact_match);
+}
+
+static struct dma_debug_entry *bucket_find_contain(struct hash_bucket **bucket,
+ struct dma_debug_entry *ref,
+ unsigned long *flags)
+{
+
+ struct dma_debug_entry *entry, index = *ref;
+ int limit = min(HASH_SIZE, (index.dev_addr >> HASH_FN_SHIFT) + 1);
+
+ for (int i = 0; i < limit; i++) {
+ entry = __hash_bucket_find(*bucket, ref, containing_match);
+
+ if (entry)
+ return entry;
+
+ /*
+ * Nothing found, go back a hash bucket
+ */
+ put_hash_bucket(*bucket, *flags);
+ index.dev_addr -= (1 << HASH_FN_SHIFT);
+ *bucket = get_hash_bucket(&index, flags);
+ }
+
+ return NULL;
+}
+
+/*
+ * Add an entry to a hash bucket
+ */
+static void hash_bucket_add(struct hash_bucket *bucket,
+ struct dma_debug_entry *entry)
+{
+ list_add_tail(&entry->list, &bucket->list);
+}
+
+/*
+ * Remove entry from a hash bucket list
+ */
+static void hash_bucket_del(struct dma_debug_entry *entry)
+{
+ list_del(&entry->list);
+}
+
+/*
+ * For each mapping (initial cacheline in the case of
+ * dma_alloc_coherent/dma_map_page, initial cacheline in each page of a
+ * scatterlist, or the cacheline specified in dma_map_single) insert
+ * into this tree using the cacheline as the key. At
+ * dma_unmap_{single|sg|page} or dma_free_coherent delete the entry. If
+ * the entry already exists at insertion time add a tag as a reference
+ * count for the overlapping mappings. For now, the overlap tracking
+ * just ensures that 'unmaps' balance 'maps' before marking the
+ * cacheline idle, but we should also be flagging overlaps as an API
+ * violation.
+ *
+ * Memory usage is mostly constrained by the maximum number of available
+ * dma-debug entries in that we need a free dma_debug_entry before
+ * inserting into the tree. In the case of dma_map_page and
+ * dma_alloc_coherent there is only one dma_debug_entry and one
+ * dma_active_cacheline entry to track per event. dma_map_sg(), on the
+ * other hand, consumes a single dma_debug_entry, but inserts 'nents'
+ * entries into the tree.
+ *
+ * Use __GFP_NOWARN because the printk from an OOM, to netconsole, could end
+ * up right back in the DMA debugging code, leading to a deadlock.
+ */
+static RADIX_TREE(dma_active_cacheline, GFP_ATOMIC | __GFP_NOWARN);
+static DEFINE_SPINLOCK(radix_lock);
+#define ACTIVE_CACHELINE_MAX_OVERLAP ((1 << RADIX_TREE_MAX_TAGS) - 1)
+#define CACHELINE_PER_PAGE_SHIFT (PAGE_SHIFT - L1_CACHE_SHIFT)
+#define CACHELINES_PER_PAGE (1 << CACHELINE_PER_PAGE_SHIFT)
+
+static phys_addr_t to_cacheline_number(struct dma_debug_entry *entry)
+{
+ return ((entry->paddr >> PAGE_SHIFT) << CACHELINE_PER_PAGE_SHIFT) +
+ (offset_in_page(entry->paddr) >> L1_CACHE_SHIFT);
+}
+
+static int active_cacheline_read_overlap(phys_addr_t cln)
+{
+ int overlap = 0, i;
+
+ for (i = RADIX_TREE_MAX_TAGS - 1; i >= 0; i--)
+ if (radix_tree_tag_get(&dma_active_cacheline, cln, i))
+ overlap |= 1 << i;
+ return overlap;
+}
+
+static int active_cacheline_set_overlap(phys_addr_t cln, int overlap)
+{
+ int i;
+
+ if (overlap > ACTIVE_CACHELINE_MAX_OVERLAP || overlap < 0)
+ return overlap;
+
+ for (i = RADIX_TREE_MAX_TAGS - 1; i >= 0; i--)
+ if (overlap & 1 << i)
+ radix_tree_tag_set(&dma_active_cacheline, cln, i);
+ else
+ radix_tree_tag_clear(&dma_active_cacheline, cln, i);
+
+ return overlap;
+}
+
+static void active_cacheline_inc_overlap(phys_addr_t cln)
+{
+ int overlap = active_cacheline_read_overlap(cln);
+
+ overlap = active_cacheline_set_overlap(cln, ++overlap);
+
+ /* If we overflowed the overlap counter then we're potentially
+ * leaking dma-mappings.
+ */
+ WARN_ONCE(overlap > ACTIVE_CACHELINE_MAX_OVERLAP,
+ pr_fmt("exceeded %d overlapping mappings of cacheline %pa\n"),
+ ACTIVE_CACHELINE_MAX_OVERLAP, &cln);
+}
+
+static int active_cacheline_dec_overlap(phys_addr_t cln)
+{
+ int overlap = active_cacheline_read_overlap(cln);
+
+ return active_cacheline_set_overlap(cln, --overlap);
+}
+
+static int active_cacheline_insert(struct dma_debug_entry *entry)
+{
+ phys_addr_t cln = to_cacheline_number(entry);
+ unsigned long flags;
+ int rc;
+
+ /* If the device is not writing memory then we don't have any
+ * concerns about the cpu consuming stale data. This mitigates
+ * legitimate usages of overlapping mappings.
+ */
+ if (entry->direction == DMA_TO_DEVICE)
+ return 0;
+
+ spin_lock_irqsave(&radix_lock, flags);
+ rc = radix_tree_insert(&dma_active_cacheline, cln, entry);
+ if (rc == -EEXIST)
+ active_cacheline_inc_overlap(cln);
+ spin_unlock_irqrestore(&radix_lock, flags);
+
+ return rc;
+}
+
+static void active_cacheline_remove(struct dma_debug_entry *entry)
+{
+ phys_addr_t cln = to_cacheline_number(entry);
+ unsigned long flags;
+
+ /* ...mirror the insert case */
+ if (entry->direction == DMA_TO_DEVICE)
+ return;
+
+ spin_lock_irqsave(&radix_lock, flags);
+ /* since we are counting overlaps the final put of the
+ * cacheline will occur when the overlap count is 0.
+ * active_cacheline_dec_overlap() returns -1 in that case
+ */
+ if (active_cacheline_dec_overlap(cln) < 0)
+ radix_tree_delete(&dma_active_cacheline, cln);
+ spin_unlock_irqrestore(&radix_lock, flags);
+}
+
+/*
+ * Dump mappings entries on kernel space for debugging purposes
+ */
+void debug_dma_dump_mappings(struct device *dev)
+{
+ int idx;
+ phys_addr_t cln;
+
+ for (idx = 0; idx < HASH_SIZE; idx++) {
+ struct hash_bucket *bucket = &dma_entry_hash[idx];
+ struct dma_debug_entry *entry;
+ unsigned long flags;
+
+ spin_lock_irqsave(&bucket->lock, flags);
+ list_for_each_entry(entry, &bucket->list, list) {
+ if (!dev || dev == entry->dev) {
+ cln = to_cacheline_number(entry);
+ dev_info(entry->dev,
+ "%s idx %d P=%pa D=%llx L=%llx cln=%pa %s %s\n",
+ type2name[entry->type], idx,
+ &entry->paddr, entry->dev_addr,
+ entry->size, &cln,
+ dir2name[entry->direction],
+ maperr2str[entry->map_err_type]);
+ }
+ }
+ spin_unlock_irqrestore(&bucket->lock, flags);
+
+ cond_resched();
+ }
+}
+
+/*
+ * Dump mappings entries on user space via debugfs
+ */
+static int dump_show(struct seq_file *seq, void *v)
+{
+ int idx;
+ phys_addr_t cln;
+
+ for (idx = 0; idx < HASH_SIZE; idx++) {
+ struct hash_bucket *bucket = &dma_entry_hash[idx];
+ struct dma_debug_entry *entry;
+ unsigned long flags;
+
+ spin_lock_irqsave(&bucket->lock, flags);
+ list_for_each_entry(entry, &bucket->list, list) {
+ cln = to_cacheline_number(entry);
+ seq_printf(seq,
+ "%s %s %s idx %d P=%pa D=%llx L=%llx cln=%pa %s %s\n",
+ dev_driver_string(entry->dev),
+ dev_name(entry->dev),
+ type2name[entry->type], idx,
+ &entry->paddr, entry->dev_addr,
+ entry->size, &cln,
+ dir2name[entry->direction],
+ maperr2str[entry->map_err_type]);
+ }
+ spin_unlock_irqrestore(&bucket->lock, flags);
+ }
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(dump);
+
+/*
+ * Wrapper function for adding an entry to the hash.
+ * This function takes care of locking itself.
+ */
+static void add_dma_entry(struct dma_debug_entry *entry, unsigned long attrs)
+{
+ struct hash_bucket *bucket;
+ unsigned long flags;
+ int rc;
+
+ bucket = get_hash_bucket(entry, &flags);
+ hash_bucket_add(bucket, entry);
+ put_hash_bucket(bucket, flags);
+
+ rc = active_cacheline_insert(entry);
+ if (rc == -ENOMEM) {
+ pr_err_once("cacheline tracking ENOMEM, dma-debug disabled\n");
+ global_disable = true;
+ } else if (rc == -EEXIST && !(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
+ !(IS_ENABLED(CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC) &&
+ is_swiotlb_active(entry->dev))) {
+ err_printk(entry->dev, entry,
+ "cacheline tracking EEXIST, overlapping mappings aren't supported\n");
+ }
+}
+
+static int dma_debug_create_entries(gfp_t gfp)
+{
+ struct dma_debug_entry *entry;
+ int i;
+
+ entry = (void *)get_zeroed_page(gfp);
+ if (!entry)
+ return -ENOMEM;
+
+ for (i = 0; i < DMA_DEBUG_DYNAMIC_ENTRIES; i++)
+ list_add_tail(&entry[i].list, &free_entries);
+
+ num_free_entries += DMA_DEBUG_DYNAMIC_ENTRIES;
+ nr_total_entries += DMA_DEBUG_DYNAMIC_ENTRIES;
+
+ return 0;
+}
+
+static struct dma_debug_entry *__dma_entry_alloc(void)
+{
+ struct dma_debug_entry *entry;
+
+ entry = list_entry(free_entries.next, struct dma_debug_entry, list);
+ list_del(&entry->list);
+ memset(entry, 0, sizeof(*entry));
+
+ num_free_entries -= 1;
+ if (num_free_entries < min_free_entries)
+ min_free_entries = num_free_entries;
+
+ return entry;
+}
+
+/*
+ * This should be called outside of free_entries_lock scope to avoid potential
+ * deadlocks with serial consoles that use DMA.
+ */
+static void __dma_entry_alloc_check_leak(u32 nr_entries)
+{
+ u32 tmp = nr_entries % nr_prealloc_entries;
+
+ /* Shout each time we tick over some multiple of the initial pool */
+ if (tmp < DMA_DEBUG_DYNAMIC_ENTRIES) {
+ pr_info("dma_debug_entry pool grown to %u (%u00%%)\n",
+ nr_entries,
+ (nr_entries / nr_prealloc_entries));
+ }
+}
+
+/* struct dma_entry allocator
+ *
+ * The next two functions implement the allocator for
+ * struct dma_debug_entries.
+ */
+static struct dma_debug_entry *dma_entry_alloc(void)
+{
+ bool alloc_check_leak = false;
+ struct dma_debug_entry *entry;
+ unsigned long flags;
+ u32 nr_entries;
+
+ spin_lock_irqsave(&free_entries_lock, flags);
+ if (num_free_entries == 0) {
+ if (dma_debug_create_entries(GFP_ATOMIC)) {
+ global_disable = true;
+ spin_unlock_irqrestore(&free_entries_lock, flags);
+ pr_err("debugging out of memory - disabling\n");
+ return NULL;
+ }
+ alloc_check_leak = true;
+ nr_entries = nr_total_entries;
+ }
+
+ entry = __dma_entry_alloc();
+
+ spin_unlock_irqrestore(&free_entries_lock, flags);
+
+ if (alloc_check_leak)
+ __dma_entry_alloc_check_leak(nr_entries);
+
+#ifdef CONFIG_STACKTRACE
+ entry->stack_len = stack_trace_save(entry->stack_entries,
+ ARRAY_SIZE(entry->stack_entries),
+ 1);
+#endif
+ return entry;
+}
+
+static void dma_entry_free(struct dma_debug_entry *entry)
+{
+ unsigned long flags;
+
+ active_cacheline_remove(entry);
+
+ /*
+ * add to beginning of the list - this way the entries are
+ * more likely cache hot when they are reallocated.
+ */
+ spin_lock_irqsave(&free_entries_lock, flags);
+ list_add(&entry->list, &free_entries);
+ num_free_entries += 1;
+ spin_unlock_irqrestore(&free_entries_lock, flags);
+}
+
+/*
+ * DMA-API debugging init code
+ *
+ * The init code does two things:
+ * 1. Initialize core data structures
+ * 2. Preallocate a given number of dma_debug_entry structs
+ */
+
+static ssize_t filter_read(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ char buf[NAME_MAX_LEN + 1];
+ unsigned long flags;
+ int len;
+
+ if (!current_driver_name[0])
+ return 0;
+
+ /*
+ * We can't copy to userspace directly because current_driver_name can
+ * only be read under the driver_name_lock with irqs disabled. So
+ * create a temporary copy first.
+ */
+ read_lock_irqsave(&driver_name_lock, flags);
+ len = scnprintf(buf, NAME_MAX_LEN + 1, "%s\n", current_driver_name);
+ read_unlock_irqrestore(&driver_name_lock, flags);
+
+ return simple_read_from_buffer(user_buf, count, ppos, buf, len);
+}
+
+static ssize_t filter_write(struct file *file, const char __user *userbuf,
+ size_t count, loff_t *ppos)
+{
+ char buf[NAME_MAX_LEN];
+ unsigned long flags;
+ size_t len;
+ int i;
+
+ /*
+ * We can't copy from userspace directly. Access to
+ * current_driver_name is protected with a write_lock with irqs
+ * disabled. Since copy_from_user can fault and may sleep we
+ * need to copy to temporary buffer first
+ */
+ len = min(count, (size_t)(NAME_MAX_LEN - 1));
+ if (copy_from_user(buf, userbuf, len))
+ return -EFAULT;
+
+ buf[len] = 0;
+
+ write_lock_irqsave(&driver_name_lock, flags);
+
+ /*
+ * Now handle the string we got from userspace very carefully.
+ * The rules are:
+ * - only use the first token we got
+ * - token delimiter is everything looking like a space
+ * character (' ', '\n', '\t' ...)
+ *
+ */
+ if (!isalnum(buf[0])) {
+ /*
+ * If the first character userspace gave us is not
+ * alphanumerical then assume the filter should be
+ * switched off.
+ */
+ if (current_driver_name[0])
+ pr_info("switching off dma-debug driver filter\n");
+ current_driver_name[0] = 0;
+ current_driver = NULL;
+ goto out_unlock;
+ }
+
+ /*
+ * Now parse out the first token and use it as the name for the
+ * driver to filter for.
+ */
+ for (i = 0; i < NAME_MAX_LEN - 1; ++i) {
+ current_driver_name[i] = buf[i];
+ if (isspace(buf[i]) || buf[i] == ' ' || buf[i] == 0)
+ break;
+ }
+ current_driver_name[i] = 0;
+ current_driver = NULL;
+
+ pr_info("enable driver filter for driver [%s]\n",
+ current_driver_name);
+
+out_unlock:
+ write_unlock_irqrestore(&driver_name_lock, flags);
+
+ return count;
+}
+
+static const struct file_operations filter_fops = {
+ .read = filter_read,
+ .write = filter_write,
+ .llseek = default_llseek,
+};
+
+static int __init dma_debug_fs_init(void)
+{
+ struct dentry *dentry = debugfs_create_dir("dma-api", NULL);
+
+ debugfs_create_bool("disabled", 0444, dentry, &global_disable);
+ debugfs_create_u32("error_count", 0444, dentry, &error_count);
+ debugfs_create_u32("all_errors", 0644, dentry, &show_all_errors);
+ debugfs_create_u32("num_errors", 0644, dentry, &show_num_errors);
+ debugfs_create_u32("num_free_entries", 0444, dentry, &num_free_entries);
+ debugfs_create_u32("min_free_entries", 0444, dentry, &min_free_entries);
+ debugfs_create_u32("nr_total_entries", 0444, dentry, &nr_total_entries);
+ debugfs_create_file("driver_filter", 0644, dentry, NULL, &filter_fops);
+ debugfs_create_file("dump", 0444, dentry, NULL, &dump_fops);
+
+ return 0;
+}
+core_initcall_sync(dma_debug_fs_init);
+
+static int device_dma_allocations(struct device *dev, struct dma_debug_entry **out_entry)
+{
+ struct dma_debug_entry *entry;
+ unsigned long flags;
+ int count = 0, i;
+
+ for (i = 0; i < HASH_SIZE; ++i) {
+ spin_lock_irqsave(&dma_entry_hash[i].lock, flags);
+ list_for_each_entry(entry, &dma_entry_hash[i].list, list) {
+ if (entry->dev == dev) {
+ count += 1;
+ *out_entry = entry;
+ }
+ }
+ spin_unlock_irqrestore(&dma_entry_hash[i].lock, flags);
+ }
+
+ return count;
+}
+
+static int dma_debug_device_change(struct notifier_block *nb, unsigned long action, void *data)
+{
+ struct device *dev = data;
+ struct dma_debug_entry *entry;
+ int count;
+
+ if (dma_debug_disabled())
+ return 0;
+
+ switch (action) {
+ case BUS_NOTIFY_UNBOUND_DRIVER:
+ count = device_dma_allocations(dev, &entry);
+ if (count == 0)
+ break;
+ err_printk(dev, entry, "device driver has pending "
+ "DMA allocations while released from device "
+ "[count=%d]\n"
+ "One of leaked entries details: "
+ "[device address=0x%016llx] [size=%llu bytes] "
+ "[mapped with %s] [mapped as %s]\n",
+ count, entry->dev_addr, entry->size,
+ dir2name[entry->direction], type2name[entry->type]);
+ break;
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+void dma_debug_add_bus(const struct bus_type *bus)
+{
+ struct notifier_block *nb;
+
+ if (dma_debug_disabled())
+ return;
+
+ nb = kzalloc(sizeof(struct notifier_block), GFP_KERNEL);
+ if (nb == NULL) {
+ pr_err("dma_debug_add_bus: out of memory\n");
+ return;
+ }
+
+ nb->notifier_call = dma_debug_device_change;
+
+ bus_register_notifier(bus, nb);
+}
+
+static int dma_debug_init(void)
+{
+ int i, nr_pages;
+
+ /* Do not use dma_debug_initialized here, since we really want to be
+ * called to set dma_debug_initialized
+ */
+ if (global_disable)
+ return 0;
+
+ for (i = 0; i < HASH_SIZE; ++i) {
+ INIT_LIST_HEAD(&dma_entry_hash[i].list);
+ spin_lock_init(&dma_entry_hash[i].lock);
+ }
+
+ nr_pages = DIV_ROUND_UP(nr_prealloc_entries, DMA_DEBUG_DYNAMIC_ENTRIES);
+ for (i = 0; i < nr_pages; ++i)
+ dma_debug_create_entries(GFP_KERNEL);
+ if (num_free_entries >= nr_prealloc_entries) {
+ pr_info("preallocated %d debug entries\n", nr_total_entries);
+ } else if (num_free_entries > 0) {
+ pr_warn("%d debug entries requested but only %d allocated\n",
+ nr_prealloc_entries, nr_total_entries);
+ } else {
+ pr_err("debugging out of memory error - disabled\n");
+ global_disable = true;
+
+ return 0;
+ }
+ min_free_entries = num_free_entries;
+
+ dma_debug_initialized = true;
+
+ pr_info("debugging enabled by kernel config\n");
+ return 0;
+}
+core_initcall(dma_debug_init);
+
+static __init int dma_debug_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (strncmp(str, "off", 3) == 0) {
+ pr_info("debugging disabled on kernel command line\n");
+ global_disable = true;
+ }
+
+ return 1;
+}
+
+static __init int dma_debug_entries_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+ if (!get_option(&str, &nr_prealloc_entries))
+ nr_prealloc_entries = PREALLOC_DMA_DEBUG_ENTRIES;
+ return 1;
+}
+
+__setup("dma_debug=", dma_debug_cmdline);
+__setup("dma_debug_entries=", dma_debug_entries_cmdline);
+
+static void check_unmap(struct dma_debug_entry *ref)
+{
+ struct dma_debug_entry *entry;
+ struct hash_bucket *bucket;
+ unsigned long flags;
+
+ bucket = get_hash_bucket(ref, &flags);
+ entry = bucket_find_exact(bucket, ref);
+
+ if (!entry) {
+ /* must drop lock before calling dma_mapping_error */
+ put_hash_bucket(bucket, flags);
+
+ if (dma_mapping_error(ref->dev, ref->dev_addr)) {
+ err_printk(ref->dev, NULL,
+ "device driver tries to free an "
+ "invalid DMA memory address\n");
+ } else {
+ err_printk(ref->dev, NULL,
+ "device driver tries to free DMA "
+ "memory it has not allocated [device "
+ "address=0x%016llx] [size=%llu bytes]\n",
+ ref->dev_addr, ref->size);
+ }
+ return;
+ }
+
+ if (ref->size != entry->size) {
+ err_printk(ref->dev, entry, "device driver frees "
+ "DMA memory with different size "
+ "[device address=0x%016llx] [map size=%llu bytes] "
+ "[unmap size=%llu bytes]\n",
+ ref->dev_addr, entry->size, ref->size);
+ }
+
+ if (ref->type != entry->type) {
+ err_printk(ref->dev, entry, "device driver frees "
+ "DMA memory with wrong function "
+ "[device address=0x%016llx] [size=%llu bytes] "
+ "[mapped as %s] [unmapped as %s]\n",
+ ref->dev_addr, ref->size,
+ type2name[entry->type], type2name[ref->type]);
+ } else if ((entry->type == dma_debug_coherent ||
+ entry->type == dma_debug_noncoherent) &&
+ ref->paddr != entry->paddr) {
+ err_printk(ref->dev, entry, "device driver frees "
+ "DMA memory with different CPU address "
+ "[device address=0x%016llx] [size=%llu bytes] "
+ "[cpu alloc address=0x%pa] "
+ "[cpu free address=0x%pa]",
+ ref->dev_addr, ref->size,
+ &entry->paddr,
+ &ref->paddr);
+ }
+
+ if (ref->sg_call_ents && ref->type == dma_debug_sg &&
+ ref->sg_call_ents != entry->sg_call_ents) {
+ err_printk(ref->dev, entry, "device driver frees "
+ "DMA sg list with different entry count "
+ "[map count=%d] [unmap count=%d]\n",
+ entry->sg_call_ents, ref->sg_call_ents);
+ }
+
+ /*
+ * This may be no bug in reality - but most implementations of the
+ * DMA API don't handle this properly, so check for it here
+ */
+ if (ref->direction != entry->direction) {
+ err_printk(ref->dev, entry, "device driver frees "
+ "DMA memory with different direction "
+ "[device address=0x%016llx] [size=%llu bytes] "
+ "[mapped with %s] [unmapped with %s]\n",
+ ref->dev_addr, ref->size,
+ dir2name[entry->direction],
+ dir2name[ref->direction]);
+ }
+
+ /*
+ * Drivers should use dma_mapping_error() to check the returned
+ * addresses of dma_map_single() and dma_map_page().
+ * If not, print this warning message. See Documentation/core-api/dma-api.rst.
+ */
+ if (entry->map_err_type == MAP_ERR_NOT_CHECKED) {
+ err_printk(ref->dev, entry,
+ "device driver failed to check map error"
+ "[device address=0x%016llx] [size=%llu bytes] "
+ "[mapped as %s]",
+ ref->dev_addr, ref->size,
+ type2name[entry->type]);
+ }
+
+ hash_bucket_del(entry);
+ put_hash_bucket(bucket, flags);
+
+ /*
+ * Free the entry outside of bucket_lock to avoid ABBA deadlocks
+ * between that and radix_lock.
+ */
+ dma_entry_free(entry);
+}
+
+static void check_for_stack(struct device *dev, phys_addr_t phys)
+{
+ void *addr;
+ struct vm_struct *stack_vm_area = task_stack_vm_area(current);
+
+ if (!stack_vm_area) {
+ /* Stack is direct-mapped. */
+ if (PhysHighMem(phys))
+ return;
+ addr = phys_to_virt(phys);
+ if (object_is_on_stack(addr))
+ err_printk(dev, NULL, "device driver maps memory from stack [addr=%p]\n", addr);
+ } else {
+ /* Stack is vmalloced. */
+ int i;
+
+ for (i = 0; i < stack_vm_area->nr_pages; i++) {
+ if (__phys_to_pfn(phys) !=
+ page_to_pfn(stack_vm_area->pages[i]))
+ continue;
+
+ addr = (u8 *)current->stack + i * PAGE_SIZE +
+ (phys % PAGE_SIZE);
+ err_printk(dev, NULL, "device driver maps memory from stack [probable addr=%p]\n", addr);
+ break;
+ }
+ }
+}
+
+static void check_for_illegal_area(struct device *dev, void *addr, unsigned long len)
+{
+ if (memory_intersects(_stext, _etext, addr, len) ||
+ memory_intersects(__start_rodata, __end_rodata, addr, len))
+ err_printk(dev, NULL, "device driver maps memory from kernel text or rodata [addr=%p] [len=%lu]\n", addr, len);
+}
+
+static void check_sync(struct device *dev,
+ struct dma_debug_entry *ref,
+ bool to_cpu)
+{
+ struct dma_debug_entry *entry;
+ struct hash_bucket *bucket;
+ unsigned long flags;
+
+ bucket = get_hash_bucket(ref, &flags);
+
+ entry = bucket_find_contain(&bucket, ref, &flags);
+
+ if (!entry) {
+ err_printk(dev, NULL, "device driver tries "
+ "to sync DMA memory it has not allocated "
+ "[device address=0x%016llx] [size=%llu bytes]\n",
+ (unsigned long long)ref->dev_addr, ref->size);
+ goto out;
+ }
+
+ if (ref->size > entry->size) {
+ err_printk(dev, entry, "device driver syncs"
+ " DMA memory outside allocated range "
+ "[device address=0x%016llx] "
+ "[allocation size=%llu bytes] "
+ "[sync offset+size=%llu]\n",
+ entry->dev_addr, entry->size,
+ ref->size);
+ }
+
+ if (entry->direction == DMA_BIDIRECTIONAL)
+ goto out;
+
+ if (ref->direction != entry->direction) {
+ err_printk(dev, entry, "device driver syncs "
+ "DMA memory with different direction "
+ "[device address=0x%016llx] [size=%llu bytes] "
+ "[mapped with %s] [synced with %s]\n",
+ (unsigned long long)ref->dev_addr, entry->size,
+ dir2name[entry->direction],
+ dir2name[ref->direction]);
+ }
+
+ if (to_cpu && !(entry->direction == DMA_FROM_DEVICE) &&
+ !(ref->direction == DMA_TO_DEVICE))
+ err_printk(dev, entry, "device driver syncs "
+ "device read-only DMA memory for cpu "
+ "[device address=0x%016llx] [size=%llu bytes] "
+ "[mapped with %s] [synced with %s]\n",
+ (unsigned long long)ref->dev_addr, entry->size,
+ dir2name[entry->direction],
+ dir2name[ref->direction]);
+
+ if (!to_cpu && !(entry->direction == DMA_TO_DEVICE) &&
+ !(ref->direction == DMA_FROM_DEVICE))
+ err_printk(dev, entry, "device driver syncs "
+ "device write-only DMA memory to device "
+ "[device address=0x%016llx] [size=%llu bytes] "
+ "[mapped with %s] [synced with %s]\n",
+ (unsigned long long)ref->dev_addr, entry->size,
+ dir2name[entry->direction],
+ dir2name[ref->direction]);
+
+ if (ref->sg_call_ents && ref->type == dma_debug_sg &&
+ ref->sg_call_ents != entry->sg_call_ents) {
+ err_printk(ref->dev, entry, "device driver syncs "
+ "DMA sg list with different entry count "
+ "[map count=%d] [sync count=%d]\n",
+ entry->sg_call_ents, ref->sg_call_ents);
+ }
+
+out:
+ put_hash_bucket(bucket, flags);
+}
+
+static void check_sg_segment(struct device *dev, struct scatterlist *sg)
+{
+ unsigned int max_seg = dma_get_max_seg_size(dev);
+ u64 start, end, boundary = dma_get_seg_boundary(dev);
+
+ /*
+ * Either the driver forgot to set dma_parms appropriately, or
+ * whoever generated the list forgot to check them.
+ */
+ if (sg->length > max_seg)
+ err_printk(dev, NULL, "mapping sg segment longer than device claims to support [len=%u] [max=%u]\n",
+ sg->length, max_seg);
+ /*
+ * In some cases this could potentially be the DMA API
+ * implementation's fault, but it would usually imply that
+ * the scatterlist was built inappropriately to begin with.
+ */
+ start = sg_dma_address(sg);
+ end = start + sg_dma_len(sg) - 1;
+ if ((start ^ end) & ~boundary)
+ err_printk(dev, NULL, "mapping sg segment across boundary [start=0x%016llx] [end=0x%016llx] [boundary=0x%016llx]\n",
+ start, end, boundary);
+}
+
+void debug_dma_map_single(struct device *dev, const void *addr,
+ unsigned long len)
+{
+ if (unlikely(dma_debug_disabled()))
+ return;
+
+ if (!virt_addr_valid(addr))
+ err_printk(dev, NULL, "device driver maps memory from invalid area [addr=%p] [len=%lu]\n",
+ addr, len);
+
+ if (is_vmalloc_addr(addr))
+ err_printk(dev, NULL, "device driver maps memory from vmalloc area [addr=%p] [len=%lu]\n",
+ addr, len);
+}
+EXPORT_SYMBOL(debug_dma_map_single);
+
+void debug_dma_map_phys(struct device *dev, phys_addr_t phys, size_t size,
+ int direction, dma_addr_t dma_addr, unsigned long attrs)
+{
+ struct dma_debug_entry *entry;
+
+ if (unlikely(dma_debug_disabled()))
+ return;
+
+ if (dma_mapping_error(dev, dma_addr))
+ return;
+
+ entry = dma_entry_alloc();
+ if (!entry)
+ return;
+
+ entry->dev = dev;
+ entry->type = dma_debug_phy;
+ entry->paddr = phys;
+ entry->dev_addr = dma_addr;
+ entry->size = size;
+ entry->direction = direction;
+ entry->map_err_type = MAP_ERR_NOT_CHECKED;
+
+ if (!(attrs & DMA_ATTR_MMIO)) {
+ check_for_stack(dev, phys);
+
+ if (!PhysHighMem(phys))
+ check_for_illegal_area(dev, phys_to_virt(phys), size);
+ }
+
+ add_dma_entry(entry, attrs);
+}
+
+void debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
+{
+ struct dma_debug_entry ref;
+ struct dma_debug_entry *entry;
+ struct hash_bucket *bucket;
+ unsigned long flags;
+
+ if (unlikely(dma_debug_disabled()))
+ return;
+
+ ref.dev = dev;
+ ref.dev_addr = dma_addr;
+ bucket = get_hash_bucket(&ref, &flags);
+
+ list_for_each_entry(entry, &bucket->list, list) {
+ if (!exact_match(&ref, entry))
+ continue;
+
+ /*
+ * The same physical address can be mapped multiple
+ * times. Without a hardware IOMMU this results in the
+ * same device addresses being put into the dma-debug
+ * hash multiple times too. This can result in false
+ * positives being reported. Therefore we implement a
+ * best-fit algorithm here which updates the first entry
+ * from the hash which fits the reference value and is
+ * not currently listed as being checked.
+ */
+ if (entry->map_err_type == MAP_ERR_NOT_CHECKED) {
+ entry->map_err_type = MAP_ERR_CHECKED;
+ break;
+ }
+ }
+
+ put_hash_bucket(bucket, flags);
+}
+EXPORT_SYMBOL(debug_dma_mapping_error);
+
+void debug_dma_unmap_phys(struct device *dev, dma_addr_t dma_addr,
+ size_t size, int direction)
+{
+ struct dma_debug_entry ref = {
+ .type = dma_debug_phy,
+ .dev = dev,
+ .dev_addr = dma_addr,
+ .size = size,
+ .direction = direction,
+ };
+
+ if (unlikely(dma_debug_disabled()))
+ return;
+ check_unmap(&ref);
+}
+
+void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
+ int nents, int mapped_ents, int direction,
+ unsigned long attrs)
+{
+ struct dma_debug_entry *entry;
+ struct scatterlist *s;
+ int i;
+
+ if (unlikely(dma_debug_disabled()))
+ return;
+
+ for_each_sg(sg, s, nents, i) {
+ check_for_stack(dev, sg_phys(s));
+ if (!PageHighMem(sg_page(s)))
+ check_for_illegal_area(dev, sg_virt(s), s->length);
+ }
+
+ for_each_sg(sg, s, mapped_ents, i) {
+ entry = dma_entry_alloc();
+ if (!entry)
+ return;
+
+ entry->type = dma_debug_sg;
+ entry->dev = dev;
+ entry->paddr = sg_phys(s);
+ entry->size = sg_dma_len(s);
+ entry->dev_addr = sg_dma_address(s);
+ entry->direction = direction;
+ entry->sg_call_ents = nents;
+ entry->sg_mapped_ents = mapped_ents;
+
+ check_sg_segment(dev, s);
+
+ add_dma_entry(entry, attrs);
+ }
+}
+
+static int get_nr_mapped_entries(struct device *dev,
+ struct dma_debug_entry *ref)
+{
+ struct dma_debug_entry *entry;
+ struct hash_bucket *bucket;
+ unsigned long flags;
+ int mapped_ents;
+
+ bucket = get_hash_bucket(ref, &flags);
+ entry = bucket_find_exact(bucket, ref);
+ mapped_ents = 0;
+
+ if (entry)
+ mapped_ents = entry->sg_mapped_ents;
+ put_hash_bucket(bucket, flags);
+
+ return mapped_ents;
+}
+
+void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
+ int nelems, int dir)
+{
+ struct scatterlist *s;
+ int mapped_ents = 0, i;
+
+ if (unlikely(dma_debug_disabled()))
+ return;
+
+ for_each_sg(sglist, s, nelems, i) {
+
+ struct dma_debug_entry ref = {
+ .type = dma_debug_sg,
+ .dev = dev,
+ .paddr = sg_phys(s),
+ .dev_addr = sg_dma_address(s),
+ .size = sg_dma_len(s),
+ .direction = dir,
+ .sg_call_ents = nelems,
+ };
+
+ if (mapped_ents && i >= mapped_ents)
+ break;
+
+ if (!i)
+ mapped_ents = get_nr_mapped_entries(dev, &ref);
+
+ check_unmap(&ref);
+ }
+}
+
+static phys_addr_t virt_to_paddr(void *virt)
+{
+ struct page *page;
+
+ if (is_vmalloc_addr(virt))
+ page = vmalloc_to_page(virt);
+ else
+ page = virt_to_page(virt);
+
+ return page_to_phys(page) + offset_in_page(virt);
+}
+
+void debug_dma_alloc_coherent(struct device *dev, size_t size,
+ dma_addr_t dma_addr, void *virt,
+ unsigned long attrs)
+{
+ struct dma_debug_entry *entry;
+
+ if (unlikely(dma_debug_disabled()))
+ return;
+
+ if (unlikely(virt == NULL))
+ return;
+
+ /* handle vmalloc and linear addresses */
+ if (!is_vmalloc_addr(virt) && !virt_addr_valid(virt))
+ return;
+
+ entry = dma_entry_alloc();
+ if (!entry)
+ return;
+
+ entry->type = dma_debug_coherent;
+ entry->dev = dev;
+ entry->paddr = virt_to_paddr(virt);
+ entry->size = size;
+ entry->dev_addr = dma_addr;
+ entry->direction = DMA_BIDIRECTIONAL;
+
+ add_dma_entry(entry, attrs);
+}
+
+void debug_dma_free_coherent(struct device *dev, size_t size,
+ void *virt, dma_addr_t dma_addr)
+{
+ struct dma_debug_entry ref = {
+ .type = dma_debug_coherent,
+ .dev = dev,
+ .dev_addr = dma_addr,
+ .size = size,
+ .direction = DMA_BIDIRECTIONAL,
+ };
+
+ /* handle vmalloc and linear addresses */
+ if (!is_vmalloc_addr(virt) && !virt_addr_valid(virt))
+ return;
+
+ ref.paddr = virt_to_paddr(virt);
+
+ if (unlikely(dma_debug_disabled()))
+ return;
+
+ check_unmap(&ref);
+}
+
+void debug_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle,
+ size_t size, int direction)
+{
+ struct dma_debug_entry ref;
+
+ if (unlikely(dma_debug_disabled()))
+ return;
+
+ ref.type = dma_debug_single;
+ ref.dev = dev;
+ ref.dev_addr = dma_handle;
+ ref.size = size;
+ ref.direction = direction;
+ ref.sg_call_ents = 0;
+
+ check_sync(dev, &ref, true);
+}
+
+void debug_dma_sync_single_for_device(struct device *dev,
+ dma_addr_t dma_handle, size_t size,
+ int direction)
+{
+ struct dma_debug_entry ref;
+
+ if (unlikely(dma_debug_disabled()))
+ return;
+
+ ref.type = dma_debug_single;
+ ref.dev = dev;
+ ref.dev_addr = dma_handle;
+ ref.size = size;
+ ref.direction = direction;
+ ref.sg_call_ents = 0;
+
+ check_sync(dev, &ref, false);
+}
+
+void debug_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
+ int nelems, int direction)
+{
+ struct scatterlist *s;
+ int mapped_ents = 0, i;
+
+ if (unlikely(dma_debug_disabled()))
+ return;
+
+ for_each_sg(sg, s, nelems, i) {
+
+ struct dma_debug_entry ref = {
+ .type = dma_debug_sg,
+ .dev = dev,
+ .paddr = sg_phys(s),
+ .dev_addr = sg_dma_address(s),
+ .size = sg_dma_len(s),
+ .direction = direction,
+ .sg_call_ents = nelems,
+ };
+
+ if (!i)
+ mapped_ents = get_nr_mapped_entries(dev, &ref);
+
+ if (i >= mapped_ents)
+ break;
+
+ check_sync(dev, &ref, true);
+ }
+}
+
+void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
+ int nelems, int direction)
+{
+ struct scatterlist *s;
+ int mapped_ents = 0, i;
+
+ if (unlikely(dma_debug_disabled()))
+ return;
+
+ for_each_sg(sg, s, nelems, i) {
+
+ struct dma_debug_entry ref = {
+ .type = dma_debug_sg,
+ .dev = dev,
+ .paddr = sg_phys(sg),
+ .dev_addr = sg_dma_address(s),
+ .size = sg_dma_len(s),
+ .direction = direction,
+ .sg_call_ents = nelems,
+ };
+ if (!i)
+ mapped_ents = get_nr_mapped_entries(dev, &ref);
+
+ if (i >= mapped_ents)
+ break;
+
+ check_sync(dev, &ref, false);
+ }
+}
+
+void debug_dma_alloc_pages(struct device *dev, struct page *page,
+ size_t size, int direction,
+ dma_addr_t dma_addr,
+ unsigned long attrs)
+{
+ struct dma_debug_entry *entry;
+
+ if (unlikely(dma_debug_disabled()))
+ return;
+
+ entry = dma_entry_alloc();
+ if (!entry)
+ return;
+
+ entry->type = dma_debug_noncoherent;
+ entry->dev = dev;
+ entry->paddr = page_to_phys(page);
+ entry->size = size;
+ entry->dev_addr = dma_addr;
+ entry->direction = direction;
+
+ add_dma_entry(entry, attrs);
+}
+
+void debug_dma_free_pages(struct device *dev, struct page *page,
+ size_t size, int direction,
+ dma_addr_t dma_addr)
+{
+ struct dma_debug_entry ref = {
+ .type = dma_debug_noncoherent,
+ .dev = dev,
+ .paddr = page_to_phys(page),
+ .dev_addr = dma_addr,
+ .size = size,
+ .direction = direction,
+ };
+
+ if (unlikely(dma_debug_disabled()))
+ return;
+
+ check_unmap(&ref);
+}
+
+static int __init dma_debug_driver_setup(char *str)
+{
+ int i;
+
+ for (i = 0; i < NAME_MAX_LEN - 1; ++i, ++str) {
+ current_driver_name[i] = *str;
+ if (*str == 0)
+ break;
+ }
+
+ if (current_driver_name[0])
+ pr_info("enable driver filter for driver [%s]\n",
+ current_driver_name);
+
+
+ return 1;
+}
+__setup("dma_debug_driver=", dma_debug_driver_setup);
diff --git a/kernel/dma/debug.h b/kernel/dma/debug.h
new file mode 100644
index 000000000000..da7be0bddcf6
--- /dev/null
+++ b/kernel/dma/debug.h
@@ -0,0 +1,127 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2008 Advanced Micro Devices, Inc.
+ *
+ * Author: Joerg Roedel <joerg.roedel@amd.com>
+ */
+
+#ifndef _KERNEL_DMA_DEBUG_H
+#define _KERNEL_DMA_DEBUG_H
+
+#ifdef CONFIG_DMA_API_DEBUG
+extern void debug_dma_map_phys(struct device *dev, phys_addr_t phys,
+ size_t size, int direction, dma_addr_t dma_addr,
+ unsigned long attrs);
+
+extern void debug_dma_unmap_phys(struct device *dev, dma_addr_t addr,
+ size_t size, int direction);
+
+extern void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
+ int nents, int mapped_ents, int direction,
+ unsigned long attrs);
+
+extern void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
+ int nelems, int dir);
+
+extern void debug_dma_alloc_coherent(struct device *dev, size_t size,
+ dma_addr_t dma_addr, void *virt,
+ unsigned long attrs);
+
+extern void debug_dma_free_coherent(struct device *dev, size_t size,
+ void *virt, dma_addr_t addr);
+
+extern void debug_dma_sync_single_for_cpu(struct device *dev,
+ dma_addr_t dma_handle, size_t size,
+ int direction);
+
+extern void debug_dma_sync_single_for_device(struct device *dev,
+ dma_addr_t dma_handle,
+ size_t size, int direction);
+
+extern void debug_dma_sync_sg_for_cpu(struct device *dev,
+ struct scatterlist *sg,
+ int nelems, int direction);
+
+extern void debug_dma_sync_sg_for_device(struct device *dev,
+ struct scatterlist *sg,
+ int nelems, int direction);
+extern void debug_dma_alloc_pages(struct device *dev, struct page *page,
+ size_t size, int direction,
+ dma_addr_t dma_addr,
+ unsigned long attrs);
+extern void debug_dma_free_pages(struct device *dev, struct page *page,
+ size_t size, int direction,
+ dma_addr_t dma_addr);
+#else /* CONFIG_DMA_API_DEBUG */
+static inline void debug_dma_map_phys(struct device *dev, phys_addr_t phys,
+ size_t size, int direction,
+ dma_addr_t dma_addr, unsigned long attrs)
+{
+}
+
+static inline void debug_dma_unmap_phys(struct device *dev, dma_addr_t addr,
+ size_t size, int direction)
+{
+}
+
+static inline void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
+ int nents, int mapped_ents, int direction,
+ unsigned long attrs)
+{
+}
+
+static inline void debug_dma_unmap_sg(struct device *dev,
+ struct scatterlist *sglist,
+ int nelems, int dir)
+{
+}
+
+static inline void debug_dma_alloc_coherent(struct device *dev, size_t size,
+ dma_addr_t dma_addr, void *virt,
+ unsigned long attrs)
+{
+}
+
+static inline void debug_dma_free_coherent(struct device *dev, size_t size,
+ void *virt, dma_addr_t addr)
+{
+}
+
+static inline void debug_dma_sync_single_for_cpu(struct device *dev,
+ dma_addr_t dma_handle,
+ size_t size, int direction)
+{
+}
+
+static inline void debug_dma_sync_single_for_device(struct device *dev,
+ dma_addr_t dma_handle,
+ size_t size, int direction)
+{
+}
+
+static inline void debug_dma_sync_sg_for_cpu(struct device *dev,
+ struct scatterlist *sg,
+ int nelems, int direction)
+{
+}
+
+static inline void debug_dma_sync_sg_for_device(struct device *dev,
+ struct scatterlist *sg,
+ int nelems, int direction)
+{
+}
+
+static inline void debug_dma_alloc_pages(struct device *dev, struct page *page,
+ size_t size, int direction,
+ dma_addr_t dma_addr,
+ unsigned long attrs)
+{
+}
+
+static inline void debug_dma_free_pages(struct device *dev, struct page *page,
+ size_t size, int direction,
+ dma_addr_t dma_addr)
+{
+}
+#endif /* CONFIG_DMA_API_DEBUG */
+#endif /* _KERNEL_DMA_DEBUG_H */
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
new file mode 100644
index 000000000000..50c3fe2a1d55
--- /dev/null
+++ b/kernel/dma/direct.c
@@ -0,0 +1,668 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2018-2020 Christoph Hellwig.
+ *
+ * DMA operations that map physical memory directly without using an IOMMU.
+ */
+#include <linux/memblock.h> /* for max_pfn */
+#include <linux/export.h>
+#include <linux/mm.h>
+#include <linux/dma-map-ops.h>
+#include <linux/scatterlist.h>
+#include <linux/pfn.h>
+#include <linux/vmalloc.h>
+#include <linux/set_memory.h>
+#include <linux/slab.h>
+#include <linux/pci-p2pdma.h>
+#include "direct.h"
+
+/*
+ * Most architectures use ZONE_DMA for the first 16 Megabytes, but some use
+ * it for entirely different regions. In that case the arch code needs to
+ * override the variable below for dma-direct to work properly.
+ */
+u64 zone_dma_limit __ro_after_init = DMA_BIT_MASK(24);
+
+static inline dma_addr_t phys_to_dma_direct(struct device *dev,
+ phys_addr_t phys)
+{
+ if (force_dma_unencrypted(dev))
+ return phys_to_dma_unencrypted(dev, phys);
+ return phys_to_dma(dev, phys);
+}
+
+static inline struct page *dma_direct_to_page(struct device *dev,
+ dma_addr_t dma_addr)
+{
+ return pfn_to_page(PHYS_PFN(dma_to_phys(dev, dma_addr)));
+}
+
+u64 dma_direct_get_required_mask(struct device *dev)
+{
+ phys_addr_t phys = (phys_addr_t)(max_pfn - 1) << PAGE_SHIFT;
+ u64 max_dma = phys_to_dma_direct(dev, phys);
+
+ return (1ULL << (fls64(max_dma) - 1)) * 2 - 1;
+}
+
+static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 *phys_limit)
+{
+ u64 dma_limit = min_not_zero(
+ dev->coherent_dma_mask,
+ dev->bus_dma_limit);
+
+ /*
+ * Optimistically try the zone that the physical address mask falls
+ * into first. If that returns memory that isn't actually addressable
+ * we will fallback to the next lower zone and try again.
+ *
+ * Note that GFP_DMA32 and GFP_DMA are no ops without the corresponding
+ * zones.
+ */
+ *phys_limit = dma_to_phys(dev, dma_limit);
+ if (*phys_limit <= zone_dma_limit)
+ return GFP_DMA;
+ if (*phys_limit <= DMA_BIT_MASK(32))
+ return GFP_DMA32;
+ return 0;
+}
+
+bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size)
+{
+ dma_addr_t dma_addr = phys_to_dma_direct(dev, phys);
+
+ if (dma_addr == DMA_MAPPING_ERROR)
+ return false;
+ return dma_addr + size - 1 <=
+ min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit);
+}
+
+static int dma_set_decrypted(struct device *dev, void *vaddr, size_t size)
+{
+ if (!force_dma_unencrypted(dev))
+ return 0;
+ return set_memory_decrypted((unsigned long)vaddr, PFN_UP(size));
+}
+
+static int dma_set_encrypted(struct device *dev, void *vaddr, size_t size)
+{
+ int ret;
+
+ if (!force_dma_unencrypted(dev))
+ return 0;
+ ret = set_memory_encrypted((unsigned long)vaddr, PFN_UP(size));
+ if (ret)
+ pr_warn_ratelimited("leaking DMA memory that can't be re-encrypted\n");
+ return ret;
+}
+
+static void __dma_direct_free_pages(struct device *dev, struct page *page,
+ size_t size)
+{
+ if (swiotlb_free(dev, page, size))
+ return;
+ dma_free_contiguous(dev, page, size);
+}
+
+static struct page *dma_direct_alloc_swiotlb(struct device *dev, size_t size)
+{
+ struct page *page = swiotlb_alloc(dev, size);
+
+ if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
+ swiotlb_free(dev, page, size);
+ return NULL;
+ }
+
+ return page;
+}
+
+static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
+ gfp_t gfp, bool allow_highmem)
+{
+ int node = dev_to_node(dev);
+ struct page *page;
+ u64 phys_limit;
+
+ WARN_ON_ONCE(!PAGE_ALIGNED(size));
+
+ if (is_swiotlb_for_alloc(dev))
+ return dma_direct_alloc_swiotlb(dev, size);
+
+ gfp |= dma_direct_optimal_gfp_mask(dev, &phys_limit);
+ page = dma_alloc_contiguous(dev, size, gfp);
+ if (page) {
+ if (dma_coherent_ok(dev, page_to_phys(page), size) &&
+ (allow_highmem || !PageHighMem(page)))
+ return page;
+
+ dma_free_contiguous(dev, page, size);
+ }
+
+ while ((page = alloc_pages_node(node, gfp, get_order(size)))
+ && !dma_coherent_ok(dev, page_to_phys(page), size)) {
+ __free_pages(page, get_order(size));
+
+ if (IS_ENABLED(CONFIG_ZONE_DMA32) &&
+ phys_limit < DMA_BIT_MASK(64) &&
+ !(gfp & (GFP_DMA32 | GFP_DMA)))
+ gfp |= GFP_DMA32;
+ else if (IS_ENABLED(CONFIG_ZONE_DMA) && !(gfp & GFP_DMA))
+ gfp = (gfp & ~GFP_DMA32) | GFP_DMA;
+ else
+ return NULL;
+ }
+
+ return page;
+}
+
+/*
+ * Check if a potentially blocking operations needs to dip into the atomic
+ * pools for the given device/gfp.
+ */
+static bool dma_direct_use_pool(struct device *dev, gfp_t gfp)
+{
+ return !gfpflags_allow_blocking(gfp) && !is_swiotlb_for_alloc(dev);
+}
+
+static void *dma_direct_alloc_from_pool(struct device *dev, size_t size,
+ dma_addr_t *dma_handle, gfp_t gfp)
+{
+ struct page *page;
+ u64 phys_limit;
+ void *ret;
+
+ if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_DMA_COHERENT_POOL)))
+ return NULL;
+
+ gfp |= dma_direct_optimal_gfp_mask(dev, &phys_limit);
+ page = dma_alloc_from_pool(dev, size, &ret, gfp, dma_coherent_ok);
+ if (!page)
+ return NULL;
+ *dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
+ return ret;
+}
+
+static void *dma_direct_alloc_no_mapping(struct device *dev, size_t size,
+ dma_addr_t *dma_handle, gfp_t gfp)
+{
+ struct page *page;
+
+ page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO, true);
+ if (!page)
+ return NULL;
+
+ /* remove any dirty cache lines on the kernel alias */
+ if (!PageHighMem(page))
+ arch_dma_prep_coherent(page, size);
+
+ /* return the page pointer as the opaque cookie */
+ *dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
+ return page;
+}
+
+void *dma_direct_alloc(struct device *dev, size_t size,
+ dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
+{
+ bool remap = false, set_uncached = false;
+ struct page *page;
+ void *ret;
+
+ size = PAGE_ALIGN(size);
+ if (attrs & DMA_ATTR_NO_WARN)
+ gfp |= __GFP_NOWARN;
+
+ if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
+ !force_dma_unencrypted(dev) && !is_swiotlb_for_alloc(dev))
+ return dma_direct_alloc_no_mapping(dev, size, dma_handle, gfp);
+
+ if (!dev_is_dma_coherent(dev)) {
+ if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_ALLOC) &&
+ !is_swiotlb_for_alloc(dev))
+ return arch_dma_alloc(dev, size, dma_handle, gfp,
+ attrs);
+
+ /*
+ * If there is a global pool, always allocate from it for
+ * non-coherent devices.
+ */
+ if (IS_ENABLED(CONFIG_DMA_GLOBAL_POOL))
+ return dma_alloc_from_global_coherent(dev, size,
+ dma_handle);
+
+ /*
+ * Otherwise we require the architecture to either be able to
+ * mark arbitrary parts of the kernel direct mapping uncached,
+ * or remapped it uncached.
+ */
+ set_uncached = IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED);
+ remap = IS_ENABLED(CONFIG_DMA_DIRECT_REMAP);
+ if (!set_uncached && !remap) {
+ pr_warn_once("coherent DMA allocations not supported on this platform.\n");
+ return NULL;
+ }
+ }
+
+ /*
+ * Remapping or decrypting memory may block, allocate the memory from
+ * the atomic pools instead if we aren't allowed block.
+ */
+ if ((remap || force_dma_unencrypted(dev)) &&
+ dma_direct_use_pool(dev, gfp))
+ return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
+
+ /* we always manually zero the memory once we are done */
+ page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO, true);
+ if (!page)
+ return NULL;
+
+ /*
+ * dma_alloc_contiguous can return highmem pages depending on a
+ * combination the cma= arguments and per-arch setup. These need to be
+ * remapped to return a kernel virtual address.
+ */
+ if (PageHighMem(page)) {
+ remap = true;
+ set_uncached = false;
+ }
+
+ if (remap) {
+ pgprot_t prot = dma_pgprot(dev, PAGE_KERNEL, attrs);
+
+ if (force_dma_unencrypted(dev))
+ prot = pgprot_decrypted(prot);
+
+ /* remove any dirty cache lines on the kernel alias */
+ arch_dma_prep_coherent(page, size);
+
+ /* create a coherent mapping */
+ ret = dma_common_contiguous_remap(page, size, prot,
+ __builtin_return_address(0));
+ if (!ret)
+ goto out_free_pages;
+ } else {
+ ret = page_address(page);
+ if (dma_set_decrypted(dev, ret, size))
+ goto out_leak_pages;
+ }
+
+ memset(ret, 0, size);
+
+ if (set_uncached) {
+ arch_dma_prep_coherent(page, size);
+ ret = arch_dma_set_uncached(ret, size);
+ if (IS_ERR(ret))
+ goto out_encrypt_pages;
+ }
+
+ *dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
+ return ret;
+
+out_encrypt_pages:
+ if (dma_set_encrypted(dev, page_address(page), size))
+ return NULL;
+out_free_pages:
+ __dma_direct_free_pages(dev, page, size);
+ return NULL;
+out_leak_pages:
+ return NULL;
+}
+
+void dma_direct_free(struct device *dev, size_t size,
+ void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs)
+{
+ unsigned int page_order = get_order(size);
+
+ if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
+ !force_dma_unencrypted(dev) && !is_swiotlb_for_alloc(dev)) {
+ /* cpu_addr is a struct page cookie, not a kernel address */
+ dma_free_contiguous(dev, cpu_addr, size);
+ return;
+ }
+
+ if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_ALLOC) &&
+ !dev_is_dma_coherent(dev) &&
+ !is_swiotlb_for_alloc(dev)) {
+ arch_dma_free(dev, size, cpu_addr, dma_addr, attrs);
+ return;
+ }
+
+ if (IS_ENABLED(CONFIG_DMA_GLOBAL_POOL) &&
+ !dev_is_dma_coherent(dev)) {
+ if (!dma_release_from_global_coherent(page_order, cpu_addr))
+ WARN_ON_ONCE(1);
+ return;
+ }
+
+ /* If cpu_addr is not from an atomic pool, dma_free_from_pool() fails */
+ if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) &&
+ dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size)))
+ return;
+
+ if (is_vmalloc_addr(cpu_addr)) {
+ vunmap(cpu_addr);
+ } else {
+ if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_CLEAR_UNCACHED))
+ arch_dma_clear_uncached(cpu_addr, size);
+ if (dma_set_encrypted(dev, cpu_addr, size))
+ return;
+ }
+
+ __dma_direct_free_pages(dev, dma_direct_to_page(dev, dma_addr), size);
+}
+
+struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
+ dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp)
+{
+ struct page *page;
+ void *ret;
+
+ if (force_dma_unencrypted(dev) && dma_direct_use_pool(dev, gfp))
+ return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
+
+ page = __dma_direct_alloc_pages(dev, size, gfp, false);
+ if (!page)
+ return NULL;
+
+ ret = page_address(page);
+ if (dma_set_decrypted(dev, ret, size))
+ goto out_leak_pages;
+ memset(ret, 0, size);
+ *dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
+ return page;
+out_leak_pages:
+ return NULL;
+}
+
+void dma_direct_free_pages(struct device *dev, size_t size,
+ struct page *page, dma_addr_t dma_addr,
+ enum dma_data_direction dir)
+{
+ void *vaddr = page_address(page);
+
+ /* If cpu_addr is not from an atomic pool, dma_free_from_pool() fails */
+ if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) &&
+ dma_free_from_pool(dev, vaddr, size))
+ return;
+
+ if (dma_set_encrypted(dev, vaddr, size))
+ return;
+ __dma_direct_free_pages(dev, page, size);
+}
+
+#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
+ defined(CONFIG_SWIOTLB)
+void dma_direct_sync_sg_for_device(struct device *dev,
+ struct scatterlist *sgl, int nents, enum dma_data_direction dir)
+{
+ struct scatterlist *sg;
+ int i;
+
+ for_each_sg(sgl, sg, nents, i) {
+ phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg));
+
+ swiotlb_sync_single_for_device(dev, paddr, sg->length, dir);
+
+ if (!dev_is_dma_coherent(dev))
+ arch_sync_dma_for_device(paddr, sg->length,
+ dir);
+ }
+}
+#endif
+
+#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
+ defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \
+ defined(CONFIG_SWIOTLB)
+void dma_direct_sync_sg_for_cpu(struct device *dev,
+ struct scatterlist *sgl, int nents, enum dma_data_direction dir)
+{
+ struct scatterlist *sg;
+ int i;
+
+ for_each_sg(sgl, sg, nents, i) {
+ phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg));
+
+ if (!dev_is_dma_coherent(dev))
+ arch_sync_dma_for_cpu(paddr, sg->length, dir);
+
+ swiotlb_sync_single_for_cpu(dev, paddr, sg->length, dir);
+
+ if (dir == DMA_FROM_DEVICE)
+ arch_dma_mark_clean(paddr, sg->length);
+ }
+
+ if (!dev_is_dma_coherent(dev))
+ arch_sync_dma_for_cpu_all();
+}
+
+/*
+ * Unmaps segments, except for ones marked as pci_p2pdma which do not
+ * require any further action as they contain a bus address.
+ */
+void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
+ int nents, enum dma_data_direction dir, unsigned long attrs)
+{
+ struct scatterlist *sg;
+ int i;
+
+ for_each_sg(sgl, sg, nents, i) {
+ if (sg_dma_is_bus_address(sg))
+ sg_dma_unmark_bus_address(sg);
+ else
+ dma_direct_unmap_phys(dev, sg->dma_address,
+ sg_dma_len(sg), dir, attrs);
+ }
+}
+#endif
+
+int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
+ enum dma_data_direction dir, unsigned long attrs)
+{
+ struct pci_p2pdma_map_state p2pdma_state = {};
+ struct scatterlist *sg;
+ int i, ret;
+
+ for_each_sg(sgl, sg, nents, i) {
+ switch (pci_p2pdma_state(&p2pdma_state, dev, sg_page(sg))) {
+ case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
+ /*
+ * Any P2P mapping that traverses the PCI host bridge
+ * must be mapped with CPU physical address and not PCI
+ * bus addresses.
+ */
+ break;
+ case PCI_P2PDMA_MAP_NONE:
+ sg->dma_address = dma_direct_map_phys(dev, sg_phys(sg),
+ sg->length, dir, attrs);
+ if (sg->dma_address == DMA_MAPPING_ERROR) {
+ ret = -EIO;
+ goto out_unmap;
+ }
+ break;
+ case PCI_P2PDMA_MAP_BUS_ADDR:
+ sg->dma_address = pci_p2pdma_bus_addr_map(
+ p2pdma_state.mem, sg_phys(sg));
+ sg_dma_len(sg) = sg->length;
+ sg_dma_mark_bus_address(sg);
+ continue;
+ default:
+ ret = -EREMOTEIO;
+ goto out_unmap;
+ }
+ sg_dma_len(sg) = sg->length;
+ }
+
+ return nents;
+
+out_unmap:
+ dma_direct_unmap_sg(dev, sgl, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
+ return ret;
+}
+
+int dma_direct_get_sgtable(struct device *dev, struct sg_table *sgt,
+ void *cpu_addr, dma_addr_t dma_addr, size_t size,
+ unsigned long attrs)
+{
+ struct page *page = dma_direct_to_page(dev, dma_addr);
+ int ret;
+
+ ret = sg_alloc_table(sgt, 1, GFP_KERNEL);
+ if (!ret)
+ sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0);
+ return ret;
+}
+
+bool dma_direct_can_mmap(struct device *dev)
+{
+ return dev_is_dma_coherent(dev) ||
+ IS_ENABLED(CONFIG_DMA_NONCOHERENT_MMAP);
+}
+
+int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma,
+ void *cpu_addr, dma_addr_t dma_addr, size_t size,
+ unsigned long attrs)
+{
+ unsigned long user_count = vma_pages(vma);
+ unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+ unsigned long pfn = PHYS_PFN(dma_to_phys(dev, dma_addr));
+ int ret = -ENXIO;
+
+ vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs);
+ if (force_dma_unencrypted(dev))
+ vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);
+
+ if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret))
+ return ret;
+ if (dma_mmap_from_global_coherent(vma, cpu_addr, size, &ret))
+ return ret;
+
+ if (vma->vm_pgoff >= count || user_count > count - vma->vm_pgoff)
+ return -ENXIO;
+ return remap_pfn_range(vma, vma->vm_start, pfn + vma->vm_pgoff,
+ user_count << PAGE_SHIFT, vma->vm_page_prot);
+}
+
+int dma_direct_supported(struct device *dev, u64 mask)
+{
+ u64 min_mask = (max_pfn - 1) << PAGE_SHIFT;
+
+ /*
+ * Because 32-bit DMA masks are so common we expect every architecture
+ * to be able to satisfy them - either by not supporting more physical
+ * memory, or by providing a ZONE_DMA32. If neither is the case, the
+ * architecture needs to use an IOMMU instead of the direct mapping.
+ */
+ if (mask >= DMA_BIT_MASK(32))
+ return 1;
+
+ /*
+ * This check needs to be against the actual bit mask value, so use
+ * phys_to_dma_unencrypted() here so that the SME encryption mask isn't
+ * part of the check.
+ */
+ if (IS_ENABLED(CONFIG_ZONE_DMA))
+ min_mask = min_t(u64, min_mask, zone_dma_limit);
+ return mask >= phys_to_dma_unencrypted(dev, min_mask);
+}
+
+static const struct bus_dma_region *dma_find_range(struct device *dev,
+ unsigned long start_pfn)
+{
+ const struct bus_dma_region *m;
+
+ for (m = dev->dma_range_map; PFN_DOWN(m->size); m++) {
+ unsigned long cpu_start_pfn = PFN_DOWN(m->cpu_start);
+
+ if (start_pfn >= cpu_start_pfn &&
+ start_pfn - cpu_start_pfn < PFN_DOWN(m->size))
+ return m;
+ }
+
+ return NULL;
+}
+
+/*
+ * To check whether all ram resource ranges are covered by dma range map
+ * Returns 0 when further check is needed
+ * Returns 1 if there is some RAM range can't be covered by dma_range_map
+ */
+static int check_ram_in_range_map(unsigned long start_pfn,
+ unsigned long nr_pages, void *data)
+{
+ unsigned long end_pfn = start_pfn + nr_pages;
+ struct device *dev = data;
+
+ while (start_pfn < end_pfn) {
+ const struct bus_dma_region *bdr;
+
+ bdr = dma_find_range(dev, start_pfn);
+ if (!bdr)
+ return 1;
+
+ start_pfn = PFN_DOWN(bdr->cpu_start) + PFN_DOWN(bdr->size);
+ }
+
+ return 0;
+}
+
+bool dma_direct_all_ram_mapped(struct device *dev)
+{
+ if (!dev->dma_range_map)
+ return true;
+ return !walk_system_ram_range(0, PFN_DOWN(ULONG_MAX) + 1, dev,
+ check_ram_in_range_map);
+}
+
+size_t dma_direct_max_mapping_size(struct device *dev)
+{
+ /* If SWIOTLB is active, use its maximum mapping size */
+ if (is_swiotlb_active(dev) &&
+ (dma_addressing_limited(dev) || is_swiotlb_force_bounce(dev)))
+ return swiotlb_max_mapping_size(dev);
+ return SIZE_MAX;
+}
+
+bool dma_direct_need_sync(struct device *dev, dma_addr_t dma_addr)
+{
+ return !dev_is_dma_coherent(dev) ||
+ swiotlb_find_pool(dev, dma_to_phys(dev, dma_addr));
+}
+
+/**
+ * dma_direct_set_offset - Assign scalar offset for a single DMA range.
+ * @dev: device pointer; needed to "own" the alloced memory.
+ * @cpu_start: beginning of memory region covered by this offset.
+ * @dma_start: beginning of DMA/PCI region covered by this offset.
+ * @size: size of the region.
+ *
+ * This is for the simple case of a uniform offset which cannot
+ * be discovered by "dma-ranges".
+ *
+ * It returns -ENOMEM if out of memory, -EINVAL if a map
+ * already exists, 0 otherwise.
+ *
+ * Note: any call to this from a driver is a bug. The mapping needs
+ * to be described by the device tree or other firmware interfaces.
+ */
+int dma_direct_set_offset(struct device *dev, phys_addr_t cpu_start,
+ dma_addr_t dma_start, u64 size)
+{
+ struct bus_dma_region *map;
+ u64 offset = (u64)cpu_start - (u64)dma_start;
+
+ if (dev->dma_range_map) {
+ dev_err(dev, "attempt to add DMA range to existing map\n");
+ return -EINVAL;
+ }
+
+ if (!offset)
+ return 0;
+
+ map = kcalloc(2, sizeof(*map), GFP_KERNEL);
+ if (!map)
+ return -ENOMEM;
+ map[0].cpu_start = cpu_start;
+ map[0].dma_start = dma_start;
+ map[0].size = size;
+ dev->dma_range_map = map;
+ return 0;
+}
diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h
new file mode 100644
index 000000000000..da2fadf45bcd
--- /dev/null
+++ b/kernel/dma/direct.h
@@ -0,0 +1,140 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2018 Christoph Hellwig.
+ *
+ * DMA operations that map physical memory directly without using an IOMMU.
+ */
+#ifndef _KERNEL_DMA_DIRECT_H
+#define _KERNEL_DMA_DIRECT_H
+
+#include <linux/dma-direct.h>
+#include <linux/memremap.h>
+
+int dma_direct_get_sgtable(struct device *dev, struct sg_table *sgt,
+ void *cpu_addr, dma_addr_t dma_addr, size_t size,
+ unsigned long attrs);
+bool dma_direct_can_mmap(struct device *dev);
+int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma,
+ void *cpu_addr, dma_addr_t dma_addr, size_t size,
+ unsigned long attrs);
+bool dma_direct_need_sync(struct device *dev, dma_addr_t dma_addr);
+int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
+ enum dma_data_direction dir, unsigned long attrs);
+bool dma_direct_all_ram_mapped(struct device *dev);
+size_t dma_direct_max_mapping_size(struct device *dev);
+
+#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
+ defined(CONFIG_SWIOTLB)
+void dma_direct_sync_sg_for_device(struct device *dev, struct scatterlist *sgl,
+ int nents, enum dma_data_direction dir);
+#else
+static inline void dma_direct_sync_sg_for_device(struct device *dev,
+ struct scatterlist *sgl, int nents, enum dma_data_direction dir)
+{
+}
+#endif
+
+#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
+ defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \
+ defined(CONFIG_SWIOTLB)
+void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
+ int nents, enum dma_data_direction dir, unsigned long attrs);
+void dma_direct_sync_sg_for_cpu(struct device *dev,
+ struct scatterlist *sgl, int nents, enum dma_data_direction dir);
+#else
+static inline void dma_direct_unmap_sg(struct device *dev,
+ struct scatterlist *sgl, int nents, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+}
+static inline void dma_direct_sync_sg_for_cpu(struct device *dev,
+ struct scatterlist *sgl, int nents, enum dma_data_direction dir)
+{
+}
+#endif
+
+static inline void dma_direct_sync_single_for_device(struct device *dev,
+ dma_addr_t addr, size_t size, enum dma_data_direction dir)
+{
+ phys_addr_t paddr = dma_to_phys(dev, addr);
+
+ swiotlb_sync_single_for_device(dev, paddr, size, dir);
+
+ if (!dev_is_dma_coherent(dev))
+ arch_sync_dma_for_device(paddr, size, dir);
+}
+
+static inline void dma_direct_sync_single_for_cpu(struct device *dev,
+ dma_addr_t addr, size_t size, enum dma_data_direction dir)
+{
+ phys_addr_t paddr = dma_to_phys(dev, addr);
+
+ if (!dev_is_dma_coherent(dev)) {
+ arch_sync_dma_for_cpu(paddr, size, dir);
+ arch_sync_dma_for_cpu_all();
+ }
+
+ swiotlb_sync_single_for_cpu(dev, paddr, size, dir);
+
+ if (dir == DMA_FROM_DEVICE)
+ arch_dma_mark_clean(paddr, size);
+}
+
+static inline dma_addr_t dma_direct_map_phys(struct device *dev,
+ phys_addr_t phys, size_t size, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ dma_addr_t dma_addr;
+
+ if (is_swiotlb_force_bounce(dev)) {
+ if (attrs & DMA_ATTR_MMIO)
+ goto err_overflow;
+
+ return swiotlb_map(dev, phys, size, dir, attrs);
+ }
+
+ if (attrs & DMA_ATTR_MMIO) {
+ dma_addr = phys;
+ if (unlikely(!dma_capable(dev, dma_addr, size, false)))
+ goto err_overflow;
+ } else {
+ dma_addr = phys_to_dma(dev, phys);
+ if (unlikely(!dma_capable(dev, dma_addr, size, true)) ||
+ dma_kmalloc_needs_bounce(dev, size, dir)) {
+ if (is_swiotlb_active(dev))
+ return swiotlb_map(dev, phys, size, dir, attrs);
+
+ goto err_overflow;
+ }
+ }
+
+ if (!dev_is_dma_coherent(dev) &&
+ !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO)))
+ arch_sync_dma_for_device(phys, size, dir);
+ return dma_addr;
+
+err_overflow:
+ dev_WARN_ONCE(
+ dev, 1,
+ "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n",
+ &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit);
+ return DMA_MAPPING_ERROR;
+}
+
+static inline void dma_direct_unmap_phys(struct device *dev, dma_addr_t addr,
+ size_t size, enum dma_data_direction dir, unsigned long attrs)
+{
+ phys_addr_t phys;
+
+ if (attrs & DMA_ATTR_MMIO)
+ /* nothing to do: uncached and no swiotlb */
+ return;
+
+ phys = dma_to_phys(dev, addr);
+ if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+ dma_direct_sync_single_for_cpu(dev, addr, size, dir);
+
+ swiotlb_tbl_unmap_single(dev, phys, size, dir,
+ attrs | DMA_ATTR_SKIP_CPU_SYNC);
+}
+#endif /* _KERNEL_DMA_DIRECT_H */
diff --git a/kernel/dma/dummy.c b/kernel/dma/dummy.c
new file mode 100644
index 000000000000..16a51736a2a3
--- /dev/null
+++ b/kernel/dma/dummy.c
@@ -0,0 +1,58 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Dummy DMA ops that always fail.
+ */
+#include <linux/dma-map-ops.h>
+
+static int dma_dummy_mmap(struct device *dev, struct vm_area_struct *vma,
+ void *cpu_addr, dma_addr_t dma_addr, size_t size,
+ unsigned long attrs)
+{
+ return -ENXIO;
+}
+
+static dma_addr_t dma_dummy_map_phys(struct device *dev, phys_addr_t phys,
+ size_t size, enum dma_data_direction dir, unsigned long attrs)
+{
+ return DMA_MAPPING_ERROR;
+}
+static void dma_dummy_unmap_phys(struct device *dev, dma_addr_t dma_handle,
+ size_t size, enum dma_data_direction dir, unsigned long attrs)
+{
+ /*
+ * Dummy ops doesn't support map_phys, so unmap_page should never be
+ * called.
+ */
+ WARN_ON_ONCE(true);
+}
+
+static int dma_dummy_map_sg(struct device *dev, struct scatterlist *sgl,
+ int nelems, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ return -EINVAL;
+}
+
+static void dma_dummy_unmap_sg(struct device *dev, struct scatterlist *sgl,
+ int nelems, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ /*
+ * Dummy ops doesn't support map_sg, so unmap_sg should never be called.
+ */
+ WARN_ON_ONCE(true);
+}
+
+static int dma_dummy_supported(struct device *hwdev, u64 mask)
+{
+ return 0;
+}
+
+const struct dma_map_ops dma_dummy_ops = {
+ .mmap = dma_dummy_mmap,
+ .map_phys = dma_dummy_map_phys,
+ .unmap_phys = dma_dummy_unmap_phys,
+ .map_sg = dma_dummy_map_sg,
+ .unmap_sg = dma_dummy_unmap_sg,
+ .dma_supported = dma_dummy_supported,
+};
diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c
new file mode 100644
index 000000000000..794041a39e65
--- /dev/null
+++ b/kernel/dma/map_benchmark.c
@@ -0,0 +1,381 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 HiSilicon Limited.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/debugfs.h>
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/dma-mapping.h>
+#include <linux/kernel.h>
+#include <linux/kthread.h>
+#include <linux/math64.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <linux/timekeeping.h>
+#include <uapi/linux/map_benchmark.h>
+
+struct map_benchmark_data {
+ struct map_benchmark bparam;
+ struct device *dev;
+ struct dentry *debugfs;
+ enum dma_data_direction dir;
+ atomic64_t sum_map_100ns;
+ atomic64_t sum_unmap_100ns;
+ atomic64_t sum_sq_map;
+ atomic64_t sum_sq_unmap;
+ atomic64_t loops;
+};
+
+static int map_benchmark_thread(void *data)
+{
+ void *buf;
+ dma_addr_t dma_addr;
+ struct map_benchmark_data *map = data;
+ int npages = map->bparam.granule;
+ u64 size = npages * PAGE_SIZE;
+ int ret = 0;
+
+ buf = alloc_pages_exact(size, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ while (!kthread_should_stop()) {
+ u64 map_100ns, unmap_100ns, map_sq, unmap_sq;
+ ktime_t map_stime, map_etime, unmap_stime, unmap_etime;
+ ktime_t map_delta, unmap_delta;
+
+ /*
+ * for a non-coherent device, if we don't stain them in the
+ * cache, this will give an underestimate of the real-world
+ * overhead of BIDIRECTIONAL or TO_DEVICE mappings;
+ * 66 means evertything goes well! 66 is lucky.
+ */
+ if (map->dir != DMA_FROM_DEVICE)
+ memset(buf, 0x66, size);
+
+ map_stime = ktime_get();
+ dma_addr = dma_map_single(map->dev, buf, size, map->dir);
+ if (unlikely(dma_mapping_error(map->dev, dma_addr))) {
+ pr_err("dma_map_single failed on %s\n",
+ dev_name(map->dev));
+ ret = -ENOMEM;
+ goto out;
+ }
+ map_etime = ktime_get();
+ map_delta = ktime_sub(map_etime, map_stime);
+
+ /* Pretend DMA is transmitting */
+ ndelay(map->bparam.dma_trans_ns);
+
+ unmap_stime = ktime_get();
+ dma_unmap_single(map->dev, dma_addr, size, map->dir);
+ unmap_etime = ktime_get();
+ unmap_delta = ktime_sub(unmap_etime, unmap_stime);
+
+ /* calculate sum and sum of squares */
+
+ map_100ns = div64_ul(map_delta, 100);
+ unmap_100ns = div64_ul(unmap_delta, 100);
+ map_sq = map_100ns * map_100ns;
+ unmap_sq = unmap_100ns * unmap_100ns;
+
+ atomic64_add(map_100ns, &map->sum_map_100ns);
+ atomic64_add(unmap_100ns, &map->sum_unmap_100ns);
+ atomic64_add(map_sq, &map->sum_sq_map);
+ atomic64_add(unmap_sq, &map->sum_sq_unmap);
+ atomic64_inc(&map->loops);
+
+ /*
+ * We may test for a long time so periodically check whether
+ * we need to schedule to avoid starving the others. Otherwise
+ * we may hangup the kernel in a non-preemptible kernel when
+ * the test kthreads number >= CPU number, the test kthreads
+ * will run endless on every CPU since the thread resposible
+ * for notifying the kthread stop (in do_map_benchmark())
+ * could not be scheduled.
+ *
+ * Note this may degrade the test concurrency since the test
+ * threads may need to share the CPU time with other load
+ * in the system. So it's recommended to run this benchmark
+ * on an idle system.
+ */
+ cond_resched();
+ }
+
+out:
+ free_pages_exact(buf, size);
+ return ret;
+}
+
+static int do_map_benchmark(struct map_benchmark_data *map)
+{
+ struct task_struct **tsk;
+ int threads = map->bparam.threads;
+ int node = map->bparam.node;
+ u64 loops;
+ int ret = 0;
+ int i;
+
+ tsk = kmalloc_array(threads, sizeof(*tsk), GFP_KERNEL);
+ if (!tsk)
+ return -ENOMEM;
+
+ get_device(map->dev);
+
+ for (i = 0; i < threads; i++) {
+ tsk[i] = kthread_create_on_node(map_benchmark_thread, map,
+ map->bparam.node, "dma-map-benchmark/%d", i);
+ if (IS_ERR(tsk[i])) {
+ pr_err("create dma_map thread failed\n");
+ ret = PTR_ERR(tsk[i]);
+ while (--i >= 0)
+ kthread_stop(tsk[i]);
+ goto out;
+ }
+
+ if (node != NUMA_NO_NODE)
+ kthread_bind_mask(tsk[i], cpumask_of_node(node));
+ }
+
+ /* clear the old value in the previous benchmark */
+ atomic64_set(&map->sum_map_100ns, 0);
+ atomic64_set(&map->sum_unmap_100ns, 0);
+ atomic64_set(&map->sum_sq_map, 0);
+ atomic64_set(&map->sum_sq_unmap, 0);
+ atomic64_set(&map->loops, 0);
+
+ for (i = 0; i < threads; i++) {
+ get_task_struct(tsk[i]);
+ wake_up_process(tsk[i]);
+ }
+
+ msleep_interruptible(map->bparam.seconds * 1000);
+
+ /* wait for the completion of all started benchmark threads */
+ for (i = 0; i < threads; i++) {
+ int kthread_ret = kthread_stop_put(tsk[i]);
+
+ if (kthread_ret)
+ ret = kthread_ret;
+ }
+
+ if (ret)
+ goto out;
+
+ loops = atomic64_read(&map->loops);
+ if (likely(loops > 0)) {
+ u64 map_variance, unmap_variance;
+ u64 sum_map = atomic64_read(&map->sum_map_100ns);
+ u64 sum_unmap = atomic64_read(&map->sum_unmap_100ns);
+ u64 sum_sq_map = atomic64_read(&map->sum_sq_map);
+ u64 sum_sq_unmap = atomic64_read(&map->sum_sq_unmap);
+
+ /* average latency */
+ map->bparam.avg_map_100ns = div64_u64(sum_map, loops);
+ map->bparam.avg_unmap_100ns = div64_u64(sum_unmap, loops);
+
+ /* standard deviation of latency */
+ map_variance = div64_u64(sum_sq_map, loops) -
+ map->bparam.avg_map_100ns *
+ map->bparam.avg_map_100ns;
+ unmap_variance = div64_u64(sum_sq_unmap, loops) -
+ map->bparam.avg_unmap_100ns *
+ map->bparam.avg_unmap_100ns;
+ map->bparam.map_stddev = int_sqrt64(map_variance);
+ map->bparam.unmap_stddev = int_sqrt64(unmap_variance);
+ }
+
+out:
+ put_device(map->dev);
+ kfree(tsk);
+ return ret;
+}
+
+static long map_benchmark_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ struct map_benchmark_data *map = file->private_data;
+ void __user *argp = (void __user *)arg;
+ u64 old_dma_mask;
+ int ret;
+
+ if (copy_from_user(&map->bparam, argp, sizeof(map->bparam)))
+ return -EFAULT;
+
+ switch (cmd) {
+ case DMA_MAP_BENCHMARK:
+ if (map->bparam.threads == 0 ||
+ map->bparam.threads > DMA_MAP_MAX_THREADS) {
+ pr_err("invalid thread number\n");
+ return -EINVAL;
+ }
+
+ if (map->bparam.seconds == 0 ||
+ map->bparam.seconds > DMA_MAP_MAX_SECONDS) {
+ pr_err("invalid duration seconds\n");
+ return -EINVAL;
+ }
+
+ if (map->bparam.dma_trans_ns > DMA_MAP_MAX_TRANS_DELAY) {
+ pr_err("invalid transmission delay\n");
+ return -EINVAL;
+ }
+
+ if (map->bparam.node != NUMA_NO_NODE &&
+ (map->bparam.node < 0 || map->bparam.node >= MAX_NUMNODES ||
+ !node_possible(map->bparam.node))) {
+ pr_err("invalid numa node\n");
+ return -EINVAL;
+ }
+
+ if (map->bparam.granule < 1 || map->bparam.granule > 1024) {
+ pr_err("invalid granule size\n");
+ return -EINVAL;
+ }
+
+ switch (map->bparam.dma_dir) {
+ case DMA_MAP_BIDIRECTIONAL:
+ map->dir = DMA_BIDIRECTIONAL;
+ break;
+ case DMA_MAP_FROM_DEVICE:
+ map->dir = DMA_FROM_DEVICE;
+ break;
+ case DMA_MAP_TO_DEVICE:
+ map->dir = DMA_TO_DEVICE;
+ break;
+ default:
+ pr_err("invalid DMA direction\n");
+ return -EINVAL;
+ }
+
+ old_dma_mask = dma_get_mask(map->dev);
+
+ ret = dma_set_mask(map->dev,
+ DMA_BIT_MASK(map->bparam.dma_bits));
+ if (ret) {
+ pr_err("failed to set dma_mask on device %s\n",
+ dev_name(map->dev));
+ return -EINVAL;
+ }
+
+ ret = do_map_benchmark(map);
+
+ /*
+ * restore the original dma_mask as many devices' dma_mask are
+ * set by architectures, acpi, busses. When we bind them back
+ * to their original drivers, those drivers shouldn't see
+ * dma_mask changed by benchmark
+ */
+ dma_set_mask(map->dev, old_dma_mask);
+
+ if (ret)
+ return ret;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (copy_to_user(argp, &map->bparam, sizeof(map->bparam)))
+ return -EFAULT;
+
+ return ret;
+}
+
+static const struct file_operations map_benchmark_fops = {
+ .open = simple_open,
+ .unlocked_ioctl = map_benchmark_ioctl,
+};
+
+static void map_benchmark_remove_debugfs(void *data)
+{
+ struct map_benchmark_data *map = (struct map_benchmark_data *)data;
+
+ debugfs_remove(map->debugfs);
+}
+
+static int __map_benchmark_probe(struct device *dev)
+{
+ struct dentry *entry;
+ struct map_benchmark_data *map;
+ int ret;
+
+ map = devm_kzalloc(dev, sizeof(*map), GFP_KERNEL);
+ if (!map)
+ return -ENOMEM;
+ map->dev = dev;
+
+ ret = devm_add_action(dev, map_benchmark_remove_debugfs, map);
+ if (ret) {
+ pr_err("Can't add debugfs remove action\n");
+ return ret;
+ }
+
+ /*
+ * we only permit a device bound with this driver, 2nd probe
+ * will fail
+ */
+ entry = debugfs_create_file("dma_map_benchmark", 0600, NULL, map,
+ &map_benchmark_fops);
+ if (IS_ERR(entry))
+ return PTR_ERR(entry);
+ map->debugfs = entry;
+
+ return 0;
+}
+
+static int map_benchmark_platform_probe(struct platform_device *pdev)
+{
+ return __map_benchmark_probe(&pdev->dev);
+}
+
+static struct platform_driver map_benchmark_platform_driver = {
+ .driver = {
+ .name = "dma_map_benchmark",
+ },
+ .probe = map_benchmark_platform_probe,
+};
+
+static int
+map_benchmark_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+ return __map_benchmark_probe(&pdev->dev);
+}
+
+static struct pci_driver map_benchmark_pci_driver = {
+ .name = "dma_map_benchmark",
+ .probe = map_benchmark_pci_probe,
+};
+
+static int __init map_benchmark_init(void)
+{
+ int ret;
+
+ ret = pci_register_driver(&map_benchmark_pci_driver);
+ if (ret)
+ return ret;
+
+ ret = platform_driver_register(&map_benchmark_platform_driver);
+ if (ret) {
+ pci_unregister_driver(&map_benchmark_pci_driver);
+ return ret;
+ }
+
+ return 0;
+}
+
+static void __exit map_benchmark_cleanup(void)
+{
+ platform_driver_unregister(&map_benchmark_platform_driver);
+ pci_unregister_driver(&map_benchmark_pci_driver);
+}
+
+module_init(map_benchmark_init);
+module_exit(map_benchmark_cleanup);
+
+MODULE_AUTHOR("Barry Song <song.bao.hua@hisilicon.com>");
+MODULE_DESCRIPTION("dma_map benchmark driver");
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
new file mode 100644
index 000000000000..37163eb49f9f
--- /dev/null
+++ b/kernel/dma/mapping.c
@@ -0,0 +1,1010 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * arch-independent dma-mapping routines
+ *
+ * Copyright (c) 2006 SUSE Linux Products GmbH
+ * Copyright (c) 2006 Tejun Heo <teheo@suse.de>
+ */
+#include <linux/memblock.h> /* for max_pfn */
+#include <linux/acpi.h>
+#include <linux/dma-map-ops.h>
+#include <linux/export.h>
+#include <linux/gfp.h>
+#include <linux/iommu-dma.h>
+#include <linux/kmsan.h>
+#include <linux/of_device.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include "debug.h"
+#include "direct.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/dma.h>
+
+#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
+ defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
+ defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL)
+bool dma_default_coherent = IS_ENABLED(CONFIG_ARCH_DMA_DEFAULT_COHERENT);
+#endif
+
+/*
+ * Managed DMA API
+ */
+struct dma_devres {
+ size_t size;
+ void *vaddr;
+ dma_addr_t dma_handle;
+ unsigned long attrs;
+};
+
+static void dmam_release(struct device *dev, void *res)
+{
+ struct dma_devres *this = res;
+
+ dma_free_attrs(dev, this->size, this->vaddr, this->dma_handle,
+ this->attrs);
+}
+
+static int dmam_match(struct device *dev, void *res, void *match_data)
+{
+ struct dma_devres *this = res, *match = match_data;
+
+ if (this->vaddr == match->vaddr) {
+ WARN_ON(this->size != match->size ||
+ this->dma_handle != match->dma_handle);
+ return 1;
+ }
+ return 0;
+}
+
+/**
+ * dmam_free_coherent - Managed dma_free_coherent()
+ * @dev: Device to free coherent memory for
+ * @size: Size of allocation
+ * @vaddr: Virtual address of the memory to free
+ * @dma_handle: DMA handle of the memory to free
+ *
+ * Managed dma_free_coherent().
+ */
+void dmam_free_coherent(struct device *dev, size_t size, void *vaddr,
+ dma_addr_t dma_handle)
+{
+ struct dma_devres match_data = { size, vaddr, dma_handle };
+
+ WARN_ON(devres_destroy(dev, dmam_release, dmam_match, &match_data));
+ dma_free_coherent(dev, size, vaddr, dma_handle);
+}
+EXPORT_SYMBOL(dmam_free_coherent);
+
+/**
+ * dmam_alloc_attrs - Managed dma_alloc_attrs()
+ * @dev: Device to allocate non_coherent memory for
+ * @size: Size of allocation
+ * @dma_handle: Out argument for allocated DMA handle
+ * @gfp: Allocation flags
+ * @attrs: Flags in the DMA_ATTR_* namespace.
+ *
+ * Managed dma_alloc_attrs(). Memory allocated using this function will be
+ * automatically released on driver detach.
+ *
+ * RETURNS:
+ * Pointer to allocated memory on success, NULL on failure.
+ */
+void *dmam_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
+ gfp_t gfp, unsigned long attrs)
+{
+ struct dma_devres *dr;
+ void *vaddr;
+
+ dr = devres_alloc(dmam_release, sizeof(*dr), gfp);
+ if (!dr)
+ return NULL;
+
+ vaddr = dma_alloc_attrs(dev, size, dma_handle, gfp, attrs);
+ if (!vaddr) {
+ devres_free(dr);
+ return NULL;
+ }
+
+ dr->vaddr = vaddr;
+ dr->dma_handle = *dma_handle;
+ dr->size = size;
+ dr->attrs = attrs;
+
+ devres_add(dev, dr);
+
+ return vaddr;
+}
+EXPORT_SYMBOL(dmam_alloc_attrs);
+
+static bool dma_go_direct(struct device *dev, dma_addr_t mask,
+ const struct dma_map_ops *ops)
+{
+ if (use_dma_iommu(dev))
+ return false;
+
+ if (likely(!ops))
+ return true;
+
+#ifdef CONFIG_DMA_OPS_BYPASS
+ if (dev->dma_ops_bypass)
+ return min_not_zero(mask, dev->bus_dma_limit) >=
+ dma_direct_get_required_mask(dev);
+#endif
+ return false;
+}
+
+
+/*
+ * Check if the devices uses a direct mapping for streaming DMA operations.
+ * This allows IOMMU drivers to set a bypass mode if the DMA mask is large
+ * enough.
+ */
+static inline bool dma_alloc_direct(struct device *dev,
+ const struct dma_map_ops *ops)
+{
+ return dma_go_direct(dev, dev->coherent_dma_mask, ops);
+}
+
+static inline bool dma_map_direct(struct device *dev,
+ const struct dma_map_ops *ops)
+{
+ return dma_go_direct(dev, *dev->dma_mask, ops);
+}
+
+dma_addr_t dma_map_phys(struct device *dev, phys_addr_t phys, size_t size,
+ enum dma_data_direction dir, unsigned long attrs)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+ bool is_mmio = attrs & DMA_ATTR_MMIO;
+ dma_addr_t addr = DMA_MAPPING_ERROR;
+
+ BUG_ON(!valid_dma_direction(dir));
+
+ if (WARN_ON_ONCE(!dev->dma_mask))
+ return DMA_MAPPING_ERROR;
+
+ if (dma_map_direct(dev, ops) ||
+ (!is_mmio && arch_dma_map_phys_direct(dev, phys + size)))
+ addr = dma_direct_map_phys(dev, phys, size, dir, attrs);
+ else if (use_dma_iommu(dev))
+ addr = iommu_dma_map_phys(dev, phys, size, dir, attrs);
+ else if (ops->map_phys)
+ addr = ops->map_phys(dev, phys, size, dir, attrs);
+
+ if (!is_mmio)
+ kmsan_handle_dma(phys, size, dir);
+ trace_dma_map_phys(dev, phys, addr, size, dir, attrs);
+ debug_dma_map_phys(dev, phys, size, dir, addr, attrs);
+
+ return addr;
+}
+EXPORT_SYMBOL_GPL(dma_map_phys);
+
+dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page,
+ size_t offset, size_t size, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ phys_addr_t phys = page_to_phys(page) + offset;
+
+ if (unlikely(attrs & DMA_ATTR_MMIO))
+ return DMA_MAPPING_ERROR;
+
+ if (IS_ENABLED(CONFIG_DMA_API_DEBUG) &&
+ WARN_ON_ONCE(is_zone_device_page(page)))
+ return DMA_MAPPING_ERROR;
+
+ return dma_map_phys(dev, phys, size, dir, attrs);
+}
+EXPORT_SYMBOL(dma_map_page_attrs);
+
+void dma_unmap_phys(struct device *dev, dma_addr_t addr, size_t size,
+ enum dma_data_direction dir, unsigned long attrs)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+ bool is_mmio = attrs & DMA_ATTR_MMIO;
+
+ BUG_ON(!valid_dma_direction(dir));
+ if (dma_map_direct(dev, ops) ||
+ (!is_mmio && arch_dma_unmap_phys_direct(dev, addr + size)))
+ dma_direct_unmap_phys(dev, addr, size, dir, attrs);
+ else if (use_dma_iommu(dev))
+ iommu_dma_unmap_phys(dev, addr, size, dir, attrs);
+ else if (ops->unmap_phys)
+ ops->unmap_phys(dev, addr, size, dir, attrs);
+ trace_dma_unmap_phys(dev, addr, size, dir, attrs);
+ debug_dma_unmap_phys(dev, addr, size, dir);
+}
+EXPORT_SYMBOL_GPL(dma_unmap_phys);
+
+void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size,
+ enum dma_data_direction dir, unsigned long attrs)
+{
+ if (unlikely(attrs & DMA_ATTR_MMIO))
+ return;
+
+ dma_unmap_phys(dev, addr, size, dir, attrs);
+}
+EXPORT_SYMBOL(dma_unmap_page_attrs);
+
+static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
+ int nents, enum dma_data_direction dir, unsigned long attrs)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+ int ents;
+
+ BUG_ON(!valid_dma_direction(dir));
+
+ if (WARN_ON_ONCE(!dev->dma_mask))
+ return 0;
+
+ if (dma_map_direct(dev, ops) ||
+ arch_dma_map_sg_direct(dev, sg, nents))
+ ents = dma_direct_map_sg(dev, sg, nents, dir, attrs);
+ else if (use_dma_iommu(dev))
+ ents = iommu_dma_map_sg(dev, sg, nents, dir, attrs);
+ else
+ ents = ops->map_sg(dev, sg, nents, dir, attrs);
+
+ if (ents > 0) {
+ kmsan_handle_dma_sg(sg, nents, dir);
+ trace_dma_map_sg(dev, sg, nents, ents, dir, attrs);
+ debug_dma_map_sg(dev, sg, nents, ents, dir, attrs);
+ } else if (WARN_ON_ONCE(ents != -EINVAL && ents != -ENOMEM &&
+ ents != -EIO && ents != -EREMOTEIO)) {
+ trace_dma_map_sg_err(dev, sg, nents, ents, dir, attrs);
+ return -EIO;
+ }
+
+ return ents;
+}
+
+/**
+ * dma_map_sg_attrs - Map the given buffer for DMA
+ * @dev: The device for which to perform the DMA operation
+ * @sg: The sg_table object describing the buffer
+ * @nents: Number of entries to map
+ * @dir: DMA direction
+ * @attrs: Optional DMA attributes for the map operation
+ *
+ * Maps a buffer described by a scatterlist passed in the sg argument with
+ * nents segments for the @dir DMA operation by the @dev device.
+ *
+ * Returns the number of mapped entries (which can be less than nents)
+ * on success. Zero is returned for any error.
+ *
+ * dma_unmap_sg_attrs() should be used to unmap the buffer with the
+ * original sg and original nents (not the value returned by this funciton).
+ */
+unsigned int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
+ int nents, enum dma_data_direction dir, unsigned long attrs)
+{
+ int ret;
+
+ ret = __dma_map_sg_attrs(dev, sg, nents, dir, attrs);
+ if (ret < 0)
+ return 0;
+ return ret;
+}
+EXPORT_SYMBOL(dma_map_sg_attrs);
+
+/**
+ * dma_map_sgtable - Map the given buffer for DMA
+ * @dev: The device for which to perform the DMA operation
+ * @sgt: The sg_table object describing the buffer
+ * @dir: DMA direction
+ * @attrs: Optional DMA attributes for the map operation
+ *
+ * Maps a buffer described by a scatterlist stored in the given sg_table
+ * object for the @dir DMA operation by the @dev device. After success, the
+ * ownership for the buffer is transferred to the DMA domain. One has to
+ * call dma_sync_sgtable_for_cpu() or dma_unmap_sgtable() to move the
+ * ownership of the buffer back to the CPU domain before touching the
+ * buffer by the CPU.
+ *
+ * Returns 0 on success or a negative error code on error. The following
+ * error codes are supported with the given meaning:
+ *
+ * -EINVAL An invalid argument, unaligned access or other error
+ * in usage. Will not succeed if retried.
+ * -ENOMEM Insufficient resources (like memory or IOVA space) to
+ * complete the mapping. Should succeed if retried later.
+ * -EIO Legacy error code with an unknown meaning. eg. this is
+ * returned if a lower level call returned
+ * DMA_MAPPING_ERROR.
+ * -EREMOTEIO The DMA device cannot access P2PDMA memory specified
+ * in the sg_table. This will not succeed if retried.
+ */
+int dma_map_sgtable(struct device *dev, struct sg_table *sgt,
+ enum dma_data_direction dir, unsigned long attrs)
+{
+ int nents;
+
+ nents = __dma_map_sg_attrs(dev, sgt->sgl, sgt->orig_nents, dir, attrs);
+ if (nents < 0)
+ return nents;
+ sgt->nents = nents;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(dma_map_sgtable);
+
+void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg,
+ int nents, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ BUG_ON(!valid_dma_direction(dir));
+ trace_dma_unmap_sg(dev, sg, nents, dir, attrs);
+ debug_dma_unmap_sg(dev, sg, nents, dir);
+ if (dma_map_direct(dev, ops) ||
+ arch_dma_unmap_sg_direct(dev, sg, nents))
+ dma_direct_unmap_sg(dev, sg, nents, dir, attrs);
+ else if (use_dma_iommu(dev))
+ iommu_dma_unmap_sg(dev, sg, nents, dir, attrs);
+ else if (ops->unmap_sg)
+ ops->unmap_sg(dev, sg, nents, dir, attrs);
+}
+EXPORT_SYMBOL(dma_unmap_sg_attrs);
+
+dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr,
+ size_t size, enum dma_data_direction dir, unsigned long attrs)
+{
+ if (IS_ENABLED(CONFIG_DMA_API_DEBUG) &&
+ WARN_ON_ONCE(pfn_valid(PHYS_PFN(phys_addr))))
+ return DMA_MAPPING_ERROR;
+
+ return dma_map_phys(dev, phys_addr, size, dir, attrs | DMA_ATTR_MMIO);
+}
+EXPORT_SYMBOL(dma_map_resource);
+
+void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size,
+ enum dma_data_direction dir, unsigned long attrs)
+{
+ dma_unmap_phys(dev, addr, size, dir, attrs | DMA_ATTR_MMIO);
+}
+EXPORT_SYMBOL(dma_unmap_resource);
+
+#ifdef CONFIG_DMA_NEED_SYNC
+void __dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size,
+ enum dma_data_direction dir)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ BUG_ON(!valid_dma_direction(dir));
+ if (dma_map_direct(dev, ops))
+ dma_direct_sync_single_for_cpu(dev, addr, size, dir);
+ else if (use_dma_iommu(dev))
+ iommu_dma_sync_single_for_cpu(dev, addr, size, dir);
+ else if (ops->sync_single_for_cpu)
+ ops->sync_single_for_cpu(dev, addr, size, dir);
+ trace_dma_sync_single_for_cpu(dev, addr, size, dir);
+ debug_dma_sync_single_for_cpu(dev, addr, size, dir);
+}
+EXPORT_SYMBOL(__dma_sync_single_for_cpu);
+
+void __dma_sync_single_for_device(struct device *dev, dma_addr_t addr,
+ size_t size, enum dma_data_direction dir)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ BUG_ON(!valid_dma_direction(dir));
+ if (dma_map_direct(dev, ops))
+ dma_direct_sync_single_for_device(dev, addr, size, dir);
+ else if (use_dma_iommu(dev))
+ iommu_dma_sync_single_for_device(dev, addr, size, dir);
+ else if (ops->sync_single_for_device)
+ ops->sync_single_for_device(dev, addr, size, dir);
+ trace_dma_sync_single_for_device(dev, addr, size, dir);
+ debug_dma_sync_single_for_device(dev, addr, size, dir);
+}
+EXPORT_SYMBOL(__dma_sync_single_for_device);
+
+void __dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
+ int nelems, enum dma_data_direction dir)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ BUG_ON(!valid_dma_direction(dir));
+ if (dma_map_direct(dev, ops))
+ dma_direct_sync_sg_for_cpu(dev, sg, nelems, dir);
+ else if (use_dma_iommu(dev))
+ iommu_dma_sync_sg_for_cpu(dev, sg, nelems, dir);
+ else if (ops->sync_sg_for_cpu)
+ ops->sync_sg_for_cpu(dev, sg, nelems, dir);
+ trace_dma_sync_sg_for_cpu(dev, sg, nelems, dir);
+ debug_dma_sync_sg_for_cpu(dev, sg, nelems, dir);
+}
+EXPORT_SYMBOL(__dma_sync_sg_for_cpu);
+
+void __dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
+ int nelems, enum dma_data_direction dir)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ BUG_ON(!valid_dma_direction(dir));
+ if (dma_map_direct(dev, ops))
+ dma_direct_sync_sg_for_device(dev, sg, nelems, dir);
+ else if (use_dma_iommu(dev))
+ iommu_dma_sync_sg_for_device(dev, sg, nelems, dir);
+ else if (ops->sync_sg_for_device)
+ ops->sync_sg_for_device(dev, sg, nelems, dir);
+ trace_dma_sync_sg_for_device(dev, sg, nelems, dir);
+ debug_dma_sync_sg_for_device(dev, sg, nelems, dir);
+}
+EXPORT_SYMBOL(__dma_sync_sg_for_device);
+
+bool __dma_need_sync(struct device *dev, dma_addr_t dma_addr)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ if (dma_map_direct(dev, ops))
+ /*
+ * dma_skip_sync could've been reset on first SWIOTLB buffer
+ * mapping, but @dma_addr is not necessary an SWIOTLB buffer.
+ * In this case, fall back to more granular check.
+ */
+ return dma_direct_need_sync(dev, dma_addr);
+ return true;
+}
+EXPORT_SYMBOL_GPL(__dma_need_sync);
+
+/**
+ * dma_need_unmap - does this device need dma_unmap_* operations
+ * @dev: device to check
+ *
+ * If this function returns %false, drivers can skip calling dma_unmap_* after
+ * finishing an I/O. This function must be called after all mappings that might
+ * need to be unmapped have been performed.
+ */
+bool dma_need_unmap(struct device *dev)
+{
+ if (!dma_map_direct(dev, get_dma_ops(dev)))
+ return true;
+ if (!dev->dma_skip_sync)
+ return true;
+ return IS_ENABLED(CONFIG_DMA_API_DEBUG);
+}
+EXPORT_SYMBOL_GPL(dma_need_unmap);
+
+static void dma_setup_need_sync(struct device *dev)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ if (dma_map_direct(dev, ops) || use_dma_iommu(dev))
+ /*
+ * dma_skip_sync will be reset to %false on first SWIOTLB buffer
+ * mapping, if any. During the device initialization, it's
+ * enough to check only for the DMA coherence.
+ */
+ dev->dma_skip_sync = dev_is_dma_coherent(dev);
+ else if (!ops->sync_single_for_device && !ops->sync_single_for_cpu &&
+ !ops->sync_sg_for_device && !ops->sync_sg_for_cpu)
+ /*
+ * Synchronization is not possible when none of DMA sync ops
+ * is set.
+ */
+ dev->dma_skip_sync = true;
+ else
+ dev->dma_skip_sync = false;
+}
+#else /* !CONFIG_DMA_NEED_SYNC */
+static inline void dma_setup_need_sync(struct device *dev) { }
+#endif /* !CONFIG_DMA_NEED_SYNC */
+
+/*
+ * The whole dma_get_sgtable() idea is fundamentally unsafe - it seems
+ * that the intention is to allow exporting memory allocated via the
+ * coherent DMA APIs through the dma_buf API, which only accepts a
+ * scattertable. This presents a couple of problems:
+ * 1. Not all memory allocated via the coherent DMA APIs is backed by
+ * a struct page
+ * 2. Passing coherent DMA memory into the streaming APIs is not allowed
+ * as we will try to flush the memory through a different alias to that
+ * actually being used (and the flushes are redundant.)
+ */
+int dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt,
+ void *cpu_addr, dma_addr_t dma_addr, size_t size,
+ unsigned long attrs)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ if (dma_alloc_direct(dev, ops))
+ return dma_direct_get_sgtable(dev, sgt, cpu_addr, dma_addr,
+ size, attrs);
+ if (use_dma_iommu(dev))
+ return iommu_dma_get_sgtable(dev, sgt, cpu_addr, dma_addr,
+ size, attrs);
+ if (!ops->get_sgtable)
+ return -ENXIO;
+ return ops->get_sgtable(dev, sgt, cpu_addr, dma_addr, size, attrs);
+}
+EXPORT_SYMBOL(dma_get_sgtable_attrs);
+
+#ifdef CONFIG_MMU
+/*
+ * Return the page attributes used for mapping dma_alloc_* memory, either in
+ * kernel space if remapping is needed, or to userspace through dma_mmap_*.
+ */
+pgprot_t dma_pgprot(struct device *dev, pgprot_t prot, unsigned long attrs)
+{
+ if (dev_is_dma_coherent(dev))
+ return prot;
+#ifdef CONFIG_ARCH_HAS_DMA_WRITE_COMBINE
+ if (attrs & DMA_ATTR_WRITE_COMBINE)
+ return pgprot_writecombine(prot);
+#endif
+ return pgprot_dmacoherent(prot);
+}
+#endif /* CONFIG_MMU */
+
+/**
+ * dma_can_mmap - check if a given device supports dma_mmap_*
+ * @dev: device to check
+ *
+ * Returns %true if @dev supports dma_mmap_coherent() and dma_mmap_attrs() to
+ * map DMA allocations to userspace.
+ */
+bool dma_can_mmap(struct device *dev)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ if (dma_alloc_direct(dev, ops))
+ return dma_direct_can_mmap(dev);
+ if (use_dma_iommu(dev))
+ return true;
+ return ops->mmap != NULL;
+}
+EXPORT_SYMBOL_GPL(dma_can_mmap);
+
+/**
+ * dma_mmap_attrs - map a coherent DMA allocation into user space
+ * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
+ * @vma: vm_area_struct describing requested user mapping
+ * @cpu_addr: kernel CPU-view address returned from dma_alloc_attrs
+ * @dma_addr: device-view address returned from dma_alloc_attrs
+ * @size: size of memory originally requested in dma_alloc_attrs
+ * @attrs: attributes of mapping properties requested in dma_alloc_attrs
+ *
+ * Map a coherent DMA buffer previously allocated by dma_alloc_attrs into user
+ * space. The coherent DMA buffer must not be freed by the driver until the
+ * user space mapping has been released.
+ */
+int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma,
+ void *cpu_addr, dma_addr_t dma_addr, size_t size,
+ unsigned long attrs)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ if (dma_alloc_direct(dev, ops))
+ return dma_direct_mmap(dev, vma, cpu_addr, dma_addr, size,
+ attrs);
+ if (use_dma_iommu(dev))
+ return iommu_dma_mmap(dev, vma, cpu_addr, dma_addr, size,
+ attrs);
+ if (!ops->mmap)
+ return -ENXIO;
+ return ops->mmap(dev, vma, cpu_addr, dma_addr, size, attrs);
+}
+EXPORT_SYMBOL(dma_mmap_attrs);
+
+u64 dma_get_required_mask(struct device *dev)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ if (dma_alloc_direct(dev, ops))
+ return dma_direct_get_required_mask(dev);
+
+ if (use_dma_iommu(dev))
+ return DMA_BIT_MASK(32);
+
+ if (ops->get_required_mask)
+ return ops->get_required_mask(dev);
+
+ /*
+ * We require every DMA ops implementation to at least support a 32-bit
+ * DMA mask (and use bounce buffering if that isn't supported in
+ * hardware). As the direct mapping code has its own routine to
+ * actually report an optimal mask we default to 32-bit here as that
+ * is the right thing for most IOMMUs, and at least not actively
+ * harmful in general.
+ */
+ return DMA_BIT_MASK(32);
+}
+EXPORT_SYMBOL_GPL(dma_get_required_mask);
+
+void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
+ gfp_t flag, unsigned long attrs)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+ void *cpu_addr;
+
+ WARN_ON_ONCE(!dev->coherent_dma_mask);
+
+ /*
+ * DMA allocations can never be turned back into a page pointer, so
+ * requesting compound pages doesn't make sense (and can't even be
+ * supported at all by various backends).
+ */
+ if (WARN_ON_ONCE(flag & __GFP_COMP))
+ return NULL;
+
+ if (dma_alloc_from_dev_coherent(dev, size, dma_handle, &cpu_addr)) {
+ trace_dma_alloc(dev, cpu_addr, *dma_handle, size,
+ DMA_BIDIRECTIONAL, flag, attrs);
+ return cpu_addr;
+ }
+
+ /* let the implementation decide on the zone to allocate from: */
+ flag &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM);
+
+ if (dma_alloc_direct(dev, ops)) {
+ cpu_addr = dma_direct_alloc(dev, size, dma_handle, flag, attrs);
+ } else if (use_dma_iommu(dev)) {
+ cpu_addr = iommu_dma_alloc(dev, size, dma_handle, flag, attrs);
+ } else if (ops->alloc) {
+ cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs);
+ } else {
+ trace_dma_alloc(dev, NULL, 0, size, DMA_BIDIRECTIONAL, flag,
+ attrs);
+ return NULL;
+ }
+
+ trace_dma_alloc(dev, cpu_addr, *dma_handle, size, DMA_BIDIRECTIONAL,
+ flag, attrs);
+ debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr, attrs);
+ return cpu_addr;
+}
+EXPORT_SYMBOL(dma_alloc_attrs);
+
+void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr,
+ dma_addr_t dma_handle, unsigned long attrs)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ if (dma_release_from_dev_coherent(dev, get_order(size), cpu_addr))
+ return;
+ /*
+ * On non-coherent platforms which implement DMA-coherent buffers via
+ * non-cacheable remaps, ops->free() may call vunmap(). Thus getting
+ * this far in IRQ context is a) at risk of a BUG_ON() or trying to
+ * sleep on some machines, and b) an indication that the driver is
+ * probably misusing the coherent API anyway.
+ */
+ WARN_ON(irqs_disabled());
+
+ trace_dma_free(dev, cpu_addr, dma_handle, size, DMA_BIDIRECTIONAL,
+ attrs);
+ if (!cpu_addr)
+ return;
+
+ debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
+ if (dma_alloc_direct(dev, ops))
+ dma_direct_free(dev, size, cpu_addr, dma_handle, attrs);
+ else if (use_dma_iommu(dev))
+ iommu_dma_free(dev, size, cpu_addr, dma_handle, attrs);
+ else if (ops->free)
+ ops->free(dev, size, cpu_addr, dma_handle, attrs);
+}
+EXPORT_SYMBOL(dma_free_attrs);
+
+static struct page *__dma_alloc_pages(struct device *dev, size_t size,
+ dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ if (WARN_ON_ONCE(!dev->coherent_dma_mask))
+ return NULL;
+ if (WARN_ON_ONCE(gfp & (__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM)))
+ return NULL;
+ if (WARN_ON_ONCE(gfp & __GFP_COMP))
+ return NULL;
+
+ size = PAGE_ALIGN(size);
+ if (dma_alloc_direct(dev, ops))
+ return dma_direct_alloc_pages(dev, size, dma_handle, dir, gfp);
+ if (use_dma_iommu(dev))
+ return dma_common_alloc_pages(dev, size, dma_handle, dir, gfp);
+ if (!ops->alloc_pages_op)
+ return NULL;
+ return ops->alloc_pages_op(dev, size, dma_handle, dir, gfp);
+}
+
+struct page *dma_alloc_pages(struct device *dev, size_t size,
+ dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp)
+{
+ struct page *page = __dma_alloc_pages(dev, size, dma_handle, dir, gfp);
+
+ if (page) {
+ trace_dma_alloc_pages(dev, page_to_virt(page), *dma_handle,
+ size, dir, gfp, 0);
+ debug_dma_alloc_pages(dev, page, size, dir, *dma_handle, 0);
+ } else {
+ trace_dma_alloc_pages(dev, NULL, 0, size, dir, gfp, 0);
+ }
+ return page;
+}
+EXPORT_SYMBOL_GPL(dma_alloc_pages);
+
+static void __dma_free_pages(struct device *dev, size_t size, struct page *page,
+ dma_addr_t dma_handle, enum dma_data_direction dir)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ size = PAGE_ALIGN(size);
+ if (dma_alloc_direct(dev, ops))
+ dma_direct_free_pages(dev, size, page, dma_handle, dir);
+ else if (use_dma_iommu(dev))
+ dma_common_free_pages(dev, size, page, dma_handle, dir);
+ else if (ops->free_pages)
+ ops->free_pages(dev, size, page, dma_handle, dir);
+}
+
+void dma_free_pages(struct device *dev, size_t size, struct page *page,
+ dma_addr_t dma_handle, enum dma_data_direction dir)
+{
+ trace_dma_free_pages(dev, page_to_virt(page), dma_handle, size, dir, 0);
+ debug_dma_free_pages(dev, page, size, dir, dma_handle);
+ __dma_free_pages(dev, size, page, dma_handle, dir);
+}
+EXPORT_SYMBOL_GPL(dma_free_pages);
+
+int dma_mmap_pages(struct device *dev, struct vm_area_struct *vma,
+ size_t size, struct page *page)
+{
+ unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+
+ if (vma->vm_pgoff >= count || vma_pages(vma) > count - vma->vm_pgoff)
+ return -ENXIO;
+ return remap_pfn_range(vma, vma->vm_start,
+ page_to_pfn(page) + vma->vm_pgoff,
+ vma_pages(vma) << PAGE_SHIFT, vma->vm_page_prot);
+}
+EXPORT_SYMBOL_GPL(dma_mmap_pages);
+
+static struct sg_table *alloc_single_sgt(struct device *dev, size_t size,
+ enum dma_data_direction dir, gfp_t gfp)
+{
+ struct sg_table *sgt;
+ struct page *page;
+
+ sgt = kmalloc(sizeof(*sgt), gfp);
+ if (!sgt)
+ return NULL;
+ if (sg_alloc_table(sgt, 1, gfp))
+ goto out_free_sgt;
+ page = __dma_alloc_pages(dev, size, &sgt->sgl->dma_address, dir, gfp);
+ if (!page)
+ goto out_free_table;
+ sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0);
+ sg_dma_len(sgt->sgl) = sgt->sgl->length;
+ return sgt;
+out_free_table:
+ sg_free_table(sgt);
+out_free_sgt:
+ kfree(sgt);
+ return NULL;
+}
+
+struct sg_table *dma_alloc_noncontiguous(struct device *dev, size_t size,
+ enum dma_data_direction dir, gfp_t gfp, unsigned long attrs)
+{
+ struct sg_table *sgt;
+
+ if (WARN_ON_ONCE(attrs & ~DMA_ATTR_ALLOC_SINGLE_PAGES))
+ return NULL;
+ if (WARN_ON_ONCE(gfp & __GFP_COMP))
+ return NULL;
+
+ if (use_dma_iommu(dev))
+ sgt = iommu_dma_alloc_noncontiguous(dev, size, dir, gfp, attrs);
+ else
+ sgt = alloc_single_sgt(dev, size, dir, gfp);
+
+ if (sgt) {
+ sgt->nents = 1;
+ trace_dma_alloc_sgt(dev, sgt, size, dir, gfp, attrs);
+ debug_dma_map_sg(dev, sgt->sgl, sgt->orig_nents, 1, dir, attrs);
+ } else {
+ trace_dma_alloc_sgt_err(dev, NULL, 0, size, dir, gfp, attrs);
+ }
+ return sgt;
+}
+EXPORT_SYMBOL_GPL(dma_alloc_noncontiguous);
+
+static void free_single_sgt(struct device *dev, size_t size,
+ struct sg_table *sgt, enum dma_data_direction dir)
+{
+ __dma_free_pages(dev, size, sg_page(sgt->sgl), sgt->sgl->dma_address,
+ dir);
+ sg_free_table(sgt);
+ kfree(sgt);
+}
+
+void dma_free_noncontiguous(struct device *dev, size_t size,
+ struct sg_table *sgt, enum dma_data_direction dir)
+{
+ trace_dma_free_sgt(dev, sgt, size, dir);
+ debug_dma_unmap_sg(dev, sgt->sgl, sgt->orig_nents, dir);
+
+ if (use_dma_iommu(dev))
+ iommu_dma_free_noncontiguous(dev, size, sgt, dir);
+ else
+ free_single_sgt(dev, size, sgt, dir);
+}
+EXPORT_SYMBOL_GPL(dma_free_noncontiguous);
+
+void *dma_vmap_noncontiguous(struct device *dev, size_t size,
+ struct sg_table *sgt)
+{
+
+ if (use_dma_iommu(dev))
+ return iommu_dma_vmap_noncontiguous(dev, size, sgt);
+
+ return page_address(sg_page(sgt->sgl));
+}
+EXPORT_SYMBOL_GPL(dma_vmap_noncontiguous);
+
+void dma_vunmap_noncontiguous(struct device *dev, void *vaddr)
+{
+ if (use_dma_iommu(dev))
+ iommu_dma_vunmap_noncontiguous(dev, vaddr);
+}
+EXPORT_SYMBOL_GPL(dma_vunmap_noncontiguous);
+
+int dma_mmap_noncontiguous(struct device *dev, struct vm_area_struct *vma,
+ size_t size, struct sg_table *sgt)
+{
+ if (use_dma_iommu(dev))
+ return iommu_dma_mmap_noncontiguous(dev, vma, size, sgt);
+ return dma_mmap_pages(dev, vma, size, sg_page(sgt->sgl));
+}
+EXPORT_SYMBOL_GPL(dma_mmap_noncontiguous);
+
+static int dma_supported(struct device *dev, u64 mask)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ if (use_dma_iommu(dev)) {
+ if (WARN_ON(ops))
+ return false;
+ return true;
+ }
+
+ /*
+ * ->dma_supported sets and clears the bypass flag, so ignore it here
+ * and always call into the method if there is one.
+ */
+ if (ops) {
+ if (!ops->dma_supported)
+ return true;
+ return ops->dma_supported(dev, mask);
+ }
+
+ return dma_direct_supported(dev, mask);
+}
+
+bool dma_pci_p2pdma_supported(struct device *dev)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ /*
+ * Note: dma_ops_bypass is not checked here because P2PDMA should
+ * not be used with dma mapping ops that do not have support even
+ * if the specific device is bypassing them.
+ */
+
+ /* if ops is not set, dma direct and default IOMMU support P2PDMA */
+ return !ops;
+}
+EXPORT_SYMBOL_GPL(dma_pci_p2pdma_supported);
+
+int dma_set_mask(struct device *dev, u64 mask)
+{
+ /*
+ * Truncate the mask to the actually supported dma_addr_t width to
+ * avoid generating unsupportable addresses.
+ */
+ mask = (dma_addr_t)mask;
+
+ if (!dev->dma_mask || !dma_supported(dev, mask))
+ return -EIO;
+
+ arch_dma_set_mask(dev, mask);
+ *dev->dma_mask = mask;
+ dma_setup_need_sync(dev);
+
+ return 0;
+}
+EXPORT_SYMBOL(dma_set_mask);
+
+int dma_set_coherent_mask(struct device *dev, u64 mask)
+{
+ /*
+ * Truncate the mask to the actually supported dma_addr_t width to
+ * avoid generating unsupportable addresses.
+ */
+ mask = (dma_addr_t)mask;
+
+ if (!dma_supported(dev, mask))
+ return -EIO;
+
+ dev->coherent_dma_mask = mask;
+ return 0;
+}
+EXPORT_SYMBOL(dma_set_coherent_mask);
+
+static bool __dma_addressing_limited(struct device *dev)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ if (min_not_zero(dma_get_mask(dev), dev->bus_dma_limit) <
+ dma_get_required_mask(dev))
+ return true;
+
+ if (unlikely(ops) || use_dma_iommu(dev))
+ return false;
+ return !dma_direct_all_ram_mapped(dev);
+}
+
+/**
+ * dma_addressing_limited - return if the device is addressing limited
+ * @dev: device to check
+ *
+ * Return %true if the devices DMA mask is too small to address all memory in
+ * the system, else %false. Lack of addressing bits is the prime reason for
+ * bounce buffering, but might not be the only one.
+ */
+bool dma_addressing_limited(struct device *dev)
+{
+ if (!__dma_addressing_limited(dev))
+ return false;
+
+ dev_dbg(dev, "device is DMA addressing limited\n");
+ return true;
+}
+EXPORT_SYMBOL_GPL(dma_addressing_limited);
+
+size_t dma_max_mapping_size(struct device *dev)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+ size_t size = SIZE_MAX;
+
+ if (dma_map_direct(dev, ops))
+ size = dma_direct_max_mapping_size(dev);
+ else if (use_dma_iommu(dev))
+ size = iommu_dma_max_mapping_size(dev);
+ else if (ops && ops->max_mapping_size)
+ size = ops->max_mapping_size(dev);
+
+ return size;
+}
+EXPORT_SYMBOL_GPL(dma_max_mapping_size);
+
+size_t dma_opt_mapping_size(struct device *dev)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+ size_t size = SIZE_MAX;
+
+ if (use_dma_iommu(dev))
+ size = iommu_dma_opt_mapping_size();
+ else if (ops && ops->opt_mapping_size)
+ size = ops->opt_mapping_size();
+
+ return min(dma_max_mapping_size(dev), size);
+}
+EXPORT_SYMBOL_GPL(dma_opt_mapping_size);
+
+unsigned long dma_get_merge_boundary(struct device *dev)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ if (use_dma_iommu(dev))
+ return iommu_dma_get_merge_boundary(dev);
+
+ if (!ops || !ops->get_merge_boundary)
+ return 0; /* can't merge */
+
+ return ops->get_merge_boundary(dev);
+}
+EXPORT_SYMBOL_GPL(dma_get_merge_boundary);
diff --git a/kernel/dma/ops_helpers.c b/kernel/dma/ops_helpers.c
new file mode 100644
index 000000000000..20caf9cabf69
--- /dev/null
+++ b/kernel/dma/ops_helpers.c
@@ -0,0 +1,103 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Helpers for DMA ops implementations. These generally rely on the fact that
+ * the allocated memory contains normal pages in the direct kernel mapping.
+ */
+#include <linux/dma-map-ops.h>
+#include <linux/iommu-dma.h>
+
+static struct page *dma_common_vaddr_to_page(void *cpu_addr)
+{
+ if (is_vmalloc_addr(cpu_addr))
+ return vmalloc_to_page(cpu_addr);
+ return virt_to_page(cpu_addr);
+}
+
+/*
+ * Create scatter-list for the already allocated DMA buffer.
+ */
+int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt,
+ void *cpu_addr, dma_addr_t dma_addr, size_t size,
+ unsigned long attrs)
+{
+ struct page *page = dma_common_vaddr_to_page(cpu_addr);
+ int ret;
+
+ ret = sg_alloc_table(sgt, 1, GFP_KERNEL);
+ if (!ret)
+ sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0);
+ return ret;
+}
+
+/*
+ * Create userspace mapping for the DMA-coherent memory.
+ */
+int dma_common_mmap(struct device *dev, struct vm_area_struct *vma,
+ void *cpu_addr, dma_addr_t dma_addr, size_t size,
+ unsigned long attrs)
+{
+#ifdef CONFIG_MMU
+ unsigned long user_count = vma_pages(vma);
+ unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+ unsigned long off = vma->vm_pgoff;
+ struct page *page = dma_common_vaddr_to_page(cpu_addr);
+ int ret = -ENXIO;
+
+ vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs);
+
+ if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret))
+ return ret;
+
+ if (off >= count || user_count > count - off)
+ return -ENXIO;
+
+ return remap_pfn_range(vma, vma->vm_start,
+ page_to_pfn(page) + vma->vm_pgoff,
+ user_count << PAGE_SHIFT, vma->vm_page_prot);
+#else
+ return -ENXIO;
+#endif /* CONFIG_MMU */
+}
+
+struct page *dma_common_alloc_pages(struct device *dev, size_t size,
+ dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+ struct page *page;
+ phys_addr_t phys;
+
+ page = dma_alloc_contiguous(dev, size, gfp);
+ if (!page)
+ page = alloc_pages_node(dev_to_node(dev), gfp, get_order(size));
+ if (!page)
+ return NULL;
+
+ phys = page_to_phys(page);
+ if (use_dma_iommu(dev))
+ *dma_handle = iommu_dma_map_phys(dev, phys, size, dir,
+ DMA_ATTR_SKIP_CPU_SYNC);
+ else
+ *dma_handle = ops->map_phys(dev, phys, size, dir,
+ DMA_ATTR_SKIP_CPU_SYNC);
+ if (*dma_handle == DMA_MAPPING_ERROR) {
+ dma_free_contiguous(dev, page, size);
+ return NULL;
+ }
+
+ memset(page_address(page), 0, size);
+ return page;
+}
+
+void dma_common_free_pages(struct device *dev, size_t size, struct page *page,
+ dma_addr_t dma_handle, enum dma_data_direction dir)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ if (use_dma_iommu(dev))
+ iommu_dma_unmap_phys(dev, dma_handle, size, dir,
+ DMA_ATTR_SKIP_CPU_SYNC);
+ else if (ops->unmap_phys)
+ ops->unmap_phys(dev, dma_handle, size, dir,
+ DMA_ATTR_SKIP_CPU_SYNC);
+ dma_free_contiguous(dev, page, size);
+}
diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
new file mode 100644
index 000000000000..ee45dee33d49
--- /dev/null
+++ b/kernel/dma/pool.c
@@ -0,0 +1,295 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2012 ARM Ltd.
+ * Copyright (C) 2020 Google LLC
+ */
+#include <linux/cma.h>
+#include <linux/debugfs.h>
+#include <linux/dma-map-ops.h>
+#include <linux/dma-direct.h>
+#include <linux/init.h>
+#include <linux/genalloc.h>
+#include <linux/set_memory.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+
+static struct gen_pool *atomic_pool_dma __ro_after_init;
+static unsigned long pool_size_dma;
+static struct gen_pool *atomic_pool_dma32 __ro_after_init;
+static unsigned long pool_size_dma32;
+static struct gen_pool *atomic_pool_kernel __ro_after_init;
+static unsigned long pool_size_kernel;
+
+/* Size can be defined by the coherent_pool command line */
+static size_t atomic_pool_size;
+
+/* Dynamic background expansion when the atomic pool is near capacity */
+static struct work_struct atomic_pool_work;
+
+static int __init early_coherent_pool(char *p)
+{
+ atomic_pool_size = memparse(p, &p);
+ return 0;
+}
+early_param("coherent_pool", early_coherent_pool);
+
+static void __init dma_atomic_pool_debugfs_init(void)
+{
+ struct dentry *root;
+
+ root = debugfs_create_dir("dma_pools", NULL);
+ debugfs_create_ulong("pool_size_dma", 0400, root, &pool_size_dma);
+ debugfs_create_ulong("pool_size_dma32", 0400, root, &pool_size_dma32);
+ debugfs_create_ulong("pool_size_kernel", 0400, root, &pool_size_kernel);
+}
+
+static void dma_atomic_pool_size_add(gfp_t gfp, size_t size)
+{
+ if (gfp & __GFP_DMA)
+ pool_size_dma += size;
+ else if (gfp & __GFP_DMA32)
+ pool_size_dma32 += size;
+ else
+ pool_size_kernel += size;
+}
+
+static bool cma_in_zone(gfp_t gfp)
+{
+ unsigned long size;
+ phys_addr_t end;
+ struct cma *cma;
+
+ cma = dev_get_cma_area(NULL);
+ if (!cma)
+ return false;
+
+ size = cma_get_size(cma);
+ if (!size)
+ return false;
+
+ /* CMA can't cross zone boundaries, see cma_activate_area() */
+ end = cma_get_base(cma) + size - 1;
+ if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA))
+ return end <= zone_dma_limit;
+ if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32))
+ return end <= max(DMA_BIT_MASK(32), zone_dma_limit);
+ return true;
+}
+
+static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size,
+ gfp_t gfp)
+{
+ unsigned int order;
+ struct page *page = NULL;
+ void *addr;
+ int ret = -ENOMEM;
+
+ /* Cannot allocate larger than MAX_PAGE_ORDER */
+ order = min(get_order(pool_size), MAX_PAGE_ORDER);
+
+ do {
+ pool_size = 1 << (PAGE_SHIFT + order);
+ if (cma_in_zone(gfp))
+ page = dma_alloc_from_contiguous(NULL, 1 << order,
+ order, false);
+ if (!page)
+ page = alloc_pages(gfp, order);
+ } while (!page && order-- > 0);
+ if (!page)
+ goto out;
+
+ arch_dma_prep_coherent(page, pool_size);
+
+#ifdef CONFIG_DMA_DIRECT_REMAP
+ addr = dma_common_contiguous_remap(page, pool_size,
+ pgprot_decrypted(pgprot_dmacoherent(PAGE_KERNEL)),
+ __builtin_return_address(0));
+ if (!addr)
+ goto free_page;
+#else
+ addr = page_to_virt(page);
+#endif
+ /*
+ * Memory in the atomic DMA pools must be unencrypted, the pools do not
+ * shrink so no re-encryption occurs in dma_direct_free().
+ */
+ ret = set_memory_decrypted((unsigned long)page_to_virt(page),
+ 1 << order);
+ if (ret)
+ goto remove_mapping;
+ ret = gen_pool_add_virt(pool, (unsigned long)addr, page_to_phys(page),
+ pool_size, NUMA_NO_NODE);
+ if (ret)
+ goto encrypt_mapping;
+
+ dma_atomic_pool_size_add(gfp, pool_size);
+ return 0;
+
+encrypt_mapping:
+ ret = set_memory_encrypted((unsigned long)page_to_virt(page),
+ 1 << order);
+ if (WARN_ON_ONCE(ret)) {
+ /* Decrypt succeeded but encrypt failed, purposely leak */
+ goto out;
+ }
+remove_mapping:
+#ifdef CONFIG_DMA_DIRECT_REMAP
+ dma_common_free_remap(addr, pool_size);
+free_page:
+ __free_pages(page, order);
+#endif
+out:
+ return ret;
+}
+
+static void atomic_pool_resize(struct gen_pool *pool, gfp_t gfp)
+{
+ if (pool && gen_pool_avail(pool) < atomic_pool_size)
+ atomic_pool_expand(pool, gen_pool_size(pool), gfp);
+}
+
+static void atomic_pool_work_fn(struct work_struct *work)
+{
+ if (IS_ENABLED(CONFIG_ZONE_DMA))
+ atomic_pool_resize(atomic_pool_dma,
+ GFP_KERNEL | GFP_DMA);
+ if (IS_ENABLED(CONFIG_ZONE_DMA32))
+ atomic_pool_resize(atomic_pool_dma32,
+ GFP_KERNEL | GFP_DMA32);
+ atomic_pool_resize(atomic_pool_kernel, GFP_KERNEL);
+}
+
+static __init struct gen_pool *__dma_atomic_pool_init(size_t pool_size,
+ gfp_t gfp)
+{
+ struct gen_pool *pool;
+ int ret;
+
+ pool = gen_pool_create(PAGE_SHIFT, NUMA_NO_NODE);
+ if (!pool)
+ return NULL;
+
+ gen_pool_set_algo(pool, gen_pool_first_fit_order_align, NULL);
+
+ ret = atomic_pool_expand(pool, pool_size, gfp);
+ if (ret) {
+ gen_pool_destroy(pool);
+ pr_err("DMA: failed to allocate %zu KiB %pGg pool for atomic allocation\n",
+ pool_size >> 10, &gfp);
+ return NULL;
+ }
+
+ pr_info("DMA: preallocated %zu KiB %pGg pool for atomic allocations\n",
+ gen_pool_size(pool) >> 10, &gfp);
+ return pool;
+}
+
+static int __init dma_atomic_pool_init(void)
+{
+ int ret = 0;
+
+ /*
+ * If coherent_pool was not used on the command line, default the pool
+ * sizes to 128KB per 1GB of memory, min 128KB, max MAX_PAGE_ORDER.
+ */
+ if (!atomic_pool_size) {
+ unsigned long pages = totalram_pages() / (SZ_1G / SZ_128K);
+ pages = min_t(unsigned long, pages, MAX_ORDER_NR_PAGES);
+ atomic_pool_size = max_t(size_t, pages << PAGE_SHIFT, SZ_128K);
+ }
+ INIT_WORK(&atomic_pool_work, atomic_pool_work_fn);
+
+ atomic_pool_kernel = __dma_atomic_pool_init(atomic_pool_size,
+ GFP_KERNEL);
+ if (!atomic_pool_kernel)
+ ret = -ENOMEM;
+ if (has_managed_dma()) {
+ atomic_pool_dma = __dma_atomic_pool_init(atomic_pool_size,
+ GFP_KERNEL | GFP_DMA);
+ if (!atomic_pool_dma)
+ ret = -ENOMEM;
+ }
+ if (IS_ENABLED(CONFIG_ZONE_DMA32)) {
+ atomic_pool_dma32 = __dma_atomic_pool_init(atomic_pool_size,
+ GFP_KERNEL | GFP_DMA32);
+ if (!atomic_pool_dma32)
+ ret = -ENOMEM;
+ }
+
+ dma_atomic_pool_debugfs_init();
+ return ret;
+}
+postcore_initcall(dma_atomic_pool_init);
+
+static inline struct gen_pool *dma_guess_pool(struct gen_pool *prev, gfp_t gfp)
+{
+ if (prev == NULL) {
+ if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32))
+ return atomic_pool_dma32;
+ if (atomic_pool_dma && (gfp & GFP_DMA))
+ return atomic_pool_dma;
+ return atomic_pool_kernel;
+ }
+ if (prev == atomic_pool_kernel)
+ return atomic_pool_dma32 ? atomic_pool_dma32 : atomic_pool_dma;
+ if (prev == atomic_pool_dma32)
+ return atomic_pool_dma;
+ return NULL;
+}
+
+static struct page *__dma_alloc_from_pool(struct device *dev, size_t size,
+ struct gen_pool *pool, void **cpu_addr,
+ bool (*phys_addr_ok)(struct device *, phys_addr_t, size_t))
+{
+ unsigned long addr;
+ phys_addr_t phys;
+
+ addr = gen_pool_alloc(pool, size);
+ if (!addr)
+ return NULL;
+
+ phys = gen_pool_virt_to_phys(pool, addr);
+ if (phys_addr_ok && !phys_addr_ok(dev, phys, size)) {
+ gen_pool_free(pool, addr, size);
+ return NULL;
+ }
+
+ if (gen_pool_avail(pool) < atomic_pool_size)
+ schedule_work(&atomic_pool_work);
+
+ *cpu_addr = (void *)addr;
+ memset(*cpu_addr, 0, size);
+ return pfn_to_page(__phys_to_pfn(phys));
+}
+
+struct page *dma_alloc_from_pool(struct device *dev, size_t size,
+ void **cpu_addr, gfp_t gfp,
+ bool (*phys_addr_ok)(struct device *, phys_addr_t, size_t))
+{
+ struct gen_pool *pool = NULL;
+ struct page *page;
+
+ while ((pool = dma_guess_pool(pool, gfp))) {
+ page = __dma_alloc_from_pool(dev, size, pool, cpu_addr,
+ phys_addr_ok);
+ if (page)
+ return page;
+ }
+
+ WARN(1, "Failed to get suitable pool for %s\n", dev_name(dev));
+ return NULL;
+}
+
+bool dma_free_from_pool(struct device *dev, void *start, size_t size)
+{
+ struct gen_pool *pool = NULL;
+
+ while ((pool = dma_guess_pool(pool, 0))) {
+ if (!gen_pool_has_addr(pool, (unsigned long)start, size))
+ continue;
+ gen_pool_free(pool, (unsigned long)start, size);
+ return true;
+ }
+
+ return false;
+}
diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c
new file mode 100644
index 000000000000..b7c1c0c92d0c
--- /dev/null
+++ b/kernel/dma/remap.c
@@ -0,0 +1,72 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2014 The Linux Foundation
+ */
+#include <linux/dma-map-ops.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+struct page **dma_common_find_pages(void *cpu_addr)
+{
+ struct vm_struct *area = find_vm_area(cpu_addr);
+
+ if (!area || !(area->flags & VM_DMA_COHERENT))
+ return NULL;
+ WARN(area->flags != VM_DMA_COHERENT,
+ "unexpected flags in area: %p\n", cpu_addr);
+ return area->pages;
+}
+
+/*
+ * Remaps an array of PAGE_SIZE pages into another vm_area.
+ * Cannot be used in non-sleeping contexts
+ */
+void *dma_common_pages_remap(struct page **pages, size_t size,
+ pgprot_t prot, const void *caller)
+{
+ void *vaddr;
+
+ vaddr = vmap(pages, PAGE_ALIGN(size) >> PAGE_SHIFT,
+ VM_DMA_COHERENT, prot);
+ if (vaddr)
+ find_vm_area(vaddr)->pages = pages;
+ return vaddr;
+}
+
+/*
+ * Remaps an allocated contiguous region into another vm_area.
+ * Cannot be used in non-sleeping contexts
+ */
+void *dma_common_contiguous_remap(struct page *page, size_t size,
+ pgprot_t prot, const void *caller)
+{
+ int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+ struct page **pages;
+ void *vaddr;
+ int i;
+
+ pages = kvmalloc_array(count, sizeof(struct page *), GFP_KERNEL);
+ if (!pages)
+ return NULL;
+ for (i = 0; i < count; i++)
+ pages[i] = page++;
+ vaddr = vmap(pages, count, VM_DMA_COHERENT, prot);
+ kvfree(pages);
+
+ return vaddr;
+}
+
+/*
+ * Unmaps a range previously mapped by dma_common_*_remap
+ */
+void dma_common_free_remap(void *cpu_addr, size_t size)
+{
+ struct vm_struct *area = find_vm_area(cpu_addr);
+
+ if (!area || !(area->flags & VM_DMA_COHERENT)) {
+ WARN(1, "trying to free invalid coherent area: %p\n", cpu_addr);
+ return;
+ }
+
+ vunmap(cpu_addr);
+}
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
new file mode 100644
index 000000000000..a547c7693135
--- /dev/null
+++ b/kernel/dma/swiotlb.c
@@ -0,0 +1,1881 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Dynamic DMA mapping support.
+ *
+ * This implementation is a fallback for platforms that do not support
+ * I/O TLBs (aka DMA address translation hardware).
+ * Copyright (C) 2000 Asit Mallick <Asit.K.Mallick@intel.com>
+ * Copyright (C) 2000 Goutham Rao <goutham.rao@intel.com>
+ * Copyright (C) 2000, 2003 Hewlett-Packard Co
+ * David Mosberger-Tang <davidm@hpl.hp.com>
+ *
+ * 03/05/07 davidm Switch from PCI-DMA to generic device DMA API.
+ * 00/12/13 davidm Rename to swiotlb.c and add mark_clean() to avoid
+ * unnecessary i-cache flushing.
+ * 04/07/.. ak Better overflow handling. Assorted fixes.
+ * 05/09/10 linville Add support for syncing ranges, support syncing for
+ * DMA_BIDIRECTIONAL mappings, miscellaneous cleanup.
+ * 08/12/11 beckyb Add highmem support
+ */
+
+#define pr_fmt(fmt) "software IO TLB: " fmt
+
+#include <linux/cache.h>
+#include <linux/cc_platform.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
+#include <linux/dma-direct.h>
+#include <linux/dma-map-ops.h>
+#include <linux/export.h>
+#include <linux/gfp.h>
+#include <linux/highmem.h>
+#include <linux/io.h>
+#include <linux/iommu-helper.h>
+#include <linux/init.h>
+#include <linux/memblock.h>
+#include <linux/mm.h>
+#include <linux/pfn.h>
+#include <linux/rculist.h>
+#include <linux/scatterlist.h>
+#include <linux/set_memory.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/swiotlb.h>
+#include <linux/types.h>
+#ifdef CONFIG_DMA_RESTRICTED_POOL
+#include <linux/of.h>
+#include <linux/of_fdt.h>
+#include <linux/of_reserved_mem.h>
+#include <linux/slab.h>
+#endif
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/swiotlb.h>
+
+#define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT))
+
+/*
+ * Minimum IO TLB size to bother booting with. Systems with mainly
+ * 64bit capable cards will only lightly use the swiotlb. If we can't
+ * allocate a contiguous 1MB, we're probably in trouble anyway.
+ */
+#define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT)
+
+/**
+ * struct io_tlb_slot - IO TLB slot descriptor
+ * @orig_addr: The original address corresponding to a mapped entry.
+ * @alloc_size: Size of the allocated buffer.
+ * @list: The free list describing the number of free entries available
+ * from each index.
+ * @pad_slots: Number of preceding padding slots. Valid only in the first
+ * allocated non-padding slot.
+ */
+struct io_tlb_slot {
+ phys_addr_t orig_addr;
+ size_t alloc_size;
+ unsigned short list;
+ unsigned short pad_slots;
+};
+
+static bool swiotlb_force_bounce;
+static bool swiotlb_force_disable;
+
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+
+static void swiotlb_dyn_alloc(struct work_struct *work);
+
+static struct io_tlb_mem io_tlb_default_mem = {
+ .lock = __SPIN_LOCK_UNLOCKED(io_tlb_default_mem.lock),
+ .pools = LIST_HEAD_INIT(io_tlb_default_mem.pools),
+ .dyn_alloc = __WORK_INITIALIZER(io_tlb_default_mem.dyn_alloc,
+ swiotlb_dyn_alloc),
+};
+
+#else /* !CONFIG_SWIOTLB_DYNAMIC */
+
+static struct io_tlb_mem io_tlb_default_mem;
+
+#endif /* CONFIG_SWIOTLB_DYNAMIC */
+
+static unsigned long default_nslabs = IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT;
+static unsigned long default_nareas;
+
+/**
+ * struct io_tlb_area - IO TLB memory area descriptor
+ *
+ * This is a single area with a single lock.
+ *
+ * @used: The number of used IO TLB block.
+ * @index: The slot index to start searching in this area for next round.
+ * @lock: The lock to protect the above data structures in the map and
+ * unmap calls.
+ */
+struct io_tlb_area {
+ unsigned long used;
+ unsigned int index;
+ spinlock_t lock;
+};
+
+/*
+ * Round up number of slabs to the next power of 2. The last area is going
+ * be smaller than the rest if default_nslabs is not power of two.
+ * The number of slot in an area should be a multiple of IO_TLB_SEGSIZE,
+ * otherwise a segment may span two or more areas. It conflicts with free
+ * contiguous slots tracking: free slots are treated contiguous no matter
+ * whether they cross an area boundary.
+ *
+ * Return true if default_nslabs is rounded up.
+ */
+static bool round_up_default_nslabs(void)
+{
+ if (!default_nareas)
+ return false;
+
+ if (default_nslabs < IO_TLB_SEGSIZE * default_nareas)
+ default_nslabs = IO_TLB_SEGSIZE * default_nareas;
+ else if (is_power_of_2(default_nslabs))
+ return false;
+ default_nslabs = roundup_pow_of_two(default_nslabs);
+ return true;
+}
+
+/**
+ * swiotlb_adjust_nareas() - adjust the number of areas and slots
+ * @nareas: Desired number of areas. Zero is treated as 1.
+ *
+ * Adjust the default number of areas in a memory pool.
+ * The default size of the memory pool may also change to meet minimum area
+ * size requirements.
+ */
+static void swiotlb_adjust_nareas(unsigned int nareas)
+{
+ if (!nareas)
+ nareas = 1;
+ else if (!is_power_of_2(nareas))
+ nareas = roundup_pow_of_two(nareas);
+
+ default_nareas = nareas;
+
+ pr_info("area num %d.\n", nareas);
+ if (round_up_default_nslabs())
+ pr_info("SWIOTLB bounce buffer size roundup to %luMB",
+ (default_nslabs << IO_TLB_SHIFT) >> 20);
+}
+
+/**
+ * limit_nareas() - get the maximum number of areas for a given memory pool size
+ * @nareas: Desired number of areas.
+ * @nslots: Total number of slots in the memory pool.
+ *
+ * Limit the number of areas to the maximum possible number of areas in
+ * a memory pool of the given size.
+ *
+ * Return: Maximum possible number of areas.
+ */
+static unsigned int limit_nareas(unsigned int nareas, unsigned long nslots)
+{
+ if (nslots < nareas * IO_TLB_SEGSIZE)
+ return nslots / IO_TLB_SEGSIZE;
+ return nareas;
+}
+
+static int __init
+setup_io_tlb_npages(char *str)
+{
+ if (isdigit(*str)) {
+ /* avoid tail segment of size < IO_TLB_SEGSIZE */
+ default_nslabs =
+ ALIGN(simple_strtoul(str, &str, 0), IO_TLB_SEGSIZE);
+ }
+ if (*str == ',')
+ ++str;
+ if (isdigit(*str))
+ swiotlb_adjust_nareas(simple_strtoul(str, &str, 0));
+ if (*str == ',')
+ ++str;
+ if (!strcmp(str, "force"))
+ swiotlb_force_bounce = true;
+ else if (!strcmp(str, "noforce"))
+ swiotlb_force_disable = true;
+
+ return 0;
+}
+early_param("swiotlb", setup_io_tlb_npages);
+
+unsigned long swiotlb_size_or_default(void)
+{
+ return default_nslabs << IO_TLB_SHIFT;
+}
+
+void __init swiotlb_adjust_size(unsigned long size)
+{
+ /*
+ * If swiotlb parameter has not been specified, give a chance to
+ * architectures such as those supporting memory encryption to
+ * adjust/expand SWIOTLB size for their use.
+ */
+ if (default_nslabs != IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT)
+ return;
+
+ size = ALIGN(size, IO_TLB_SIZE);
+ default_nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE);
+ if (round_up_default_nslabs())
+ size = default_nslabs << IO_TLB_SHIFT;
+ pr_info("SWIOTLB bounce buffer size adjusted to %luMB", size >> 20);
+}
+
+void swiotlb_print_info(void)
+{
+ struct io_tlb_pool *mem = &io_tlb_default_mem.defpool;
+
+ if (!mem->nslabs) {
+ pr_warn("No low mem\n");
+ return;
+ }
+
+ pr_info("mapped [mem %pa-%pa] (%luMB)\n", &mem->start, &mem->end,
+ (mem->nslabs << IO_TLB_SHIFT) >> 20);
+}
+
+static inline unsigned long io_tlb_offset(unsigned long val)
+{
+ return val & (IO_TLB_SEGSIZE - 1);
+}
+
+static inline unsigned long nr_slots(u64 val)
+{
+ return DIV_ROUND_UP(val, IO_TLB_SIZE);
+}
+
+/*
+ * Early SWIOTLB allocation may be too early to allow an architecture to
+ * perform the desired operations. This function allows the architecture to
+ * call SWIOTLB when the operations are possible. It needs to be called
+ * before the SWIOTLB memory is used.
+ */
+void __init swiotlb_update_mem_attributes(void)
+{
+ struct io_tlb_pool *mem = &io_tlb_default_mem.defpool;
+ unsigned long bytes;
+
+ if (!mem->nslabs || mem->late_alloc)
+ return;
+ bytes = PAGE_ALIGN(mem->nslabs << IO_TLB_SHIFT);
+ set_memory_decrypted((unsigned long)mem->vaddr, bytes >> PAGE_SHIFT);
+}
+
+static void swiotlb_init_io_tlb_pool(struct io_tlb_pool *mem, phys_addr_t start,
+ unsigned long nslabs, bool late_alloc, unsigned int nareas)
+{
+ void *vaddr = phys_to_virt(start);
+ unsigned long bytes = nslabs << IO_TLB_SHIFT, i;
+
+ mem->nslabs = nslabs;
+ mem->start = start;
+ mem->end = mem->start + bytes;
+ mem->late_alloc = late_alloc;
+ mem->nareas = nareas;
+ mem->area_nslabs = nslabs / mem->nareas;
+
+ for (i = 0; i < mem->nareas; i++) {
+ spin_lock_init(&mem->areas[i].lock);
+ mem->areas[i].index = 0;
+ mem->areas[i].used = 0;
+ }
+
+ for (i = 0; i < mem->nslabs; i++) {
+ mem->slots[i].list = min(IO_TLB_SEGSIZE - io_tlb_offset(i),
+ mem->nslabs - i);
+ mem->slots[i].orig_addr = INVALID_PHYS_ADDR;
+ mem->slots[i].alloc_size = 0;
+ mem->slots[i].pad_slots = 0;
+ }
+
+ memset(vaddr, 0, bytes);
+ mem->vaddr = vaddr;
+ return;
+}
+
+/**
+ * add_mem_pool() - add a memory pool to the allocator
+ * @mem: Software IO TLB allocator.
+ * @pool: Memory pool to be added.
+ */
+static void add_mem_pool(struct io_tlb_mem *mem, struct io_tlb_pool *pool)
+{
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+ spin_lock(&mem->lock);
+ list_add_rcu(&pool->node, &mem->pools);
+ mem->nslabs += pool->nslabs;
+ spin_unlock(&mem->lock);
+#else
+ mem->nslabs = pool->nslabs;
+#endif
+}
+
+static void __init *swiotlb_memblock_alloc(unsigned long nslabs,
+ unsigned int flags,
+ int (*remap)(void *tlb, unsigned long nslabs))
+{
+ size_t bytes = PAGE_ALIGN(nslabs << IO_TLB_SHIFT);
+ void *tlb;
+
+ /*
+ * By default allocate the bounce buffer memory from low memory, but
+ * allow to pick a location everywhere for hypervisors with guest
+ * memory encryption.
+ */
+ if (flags & SWIOTLB_ANY)
+ tlb = memblock_alloc(bytes, PAGE_SIZE);
+ else
+ tlb = memblock_alloc_low(bytes, PAGE_SIZE);
+
+ if (!tlb) {
+ pr_warn("%s: Failed to allocate %zu bytes tlb structure\n",
+ __func__, bytes);
+ return NULL;
+ }
+
+ if (remap && remap(tlb, nslabs) < 0) {
+ memblock_free(tlb, PAGE_ALIGN(bytes));
+ pr_warn("%s: Failed to remap %zu bytes\n", __func__, bytes);
+ return NULL;
+ }
+
+ return tlb;
+}
+
+/*
+ * Statically reserve bounce buffer space and initialize bounce buffer data
+ * structures for the software IO TLB used to implement the DMA API.
+ */
+void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags,
+ int (*remap)(void *tlb, unsigned long nslabs))
+{
+ struct io_tlb_pool *mem = &io_tlb_default_mem.defpool;
+ unsigned long nslabs;
+ unsigned int nareas;
+ size_t alloc_size;
+ void *tlb;
+
+ if (!addressing_limit && !swiotlb_force_bounce)
+ return;
+ if (swiotlb_force_disable)
+ return;
+
+ io_tlb_default_mem.force_bounce =
+ swiotlb_force_bounce || (flags & SWIOTLB_FORCE);
+
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+ if (!remap)
+ io_tlb_default_mem.can_grow = true;
+ if (flags & SWIOTLB_ANY)
+ io_tlb_default_mem.phys_limit = virt_to_phys(high_memory - 1);
+ else
+ io_tlb_default_mem.phys_limit = ARCH_LOW_ADDRESS_LIMIT;
+#endif
+
+ if (!default_nareas)
+ swiotlb_adjust_nareas(num_possible_cpus());
+
+ nslabs = default_nslabs;
+ nareas = limit_nareas(default_nareas, nslabs);
+ while ((tlb = swiotlb_memblock_alloc(nslabs, flags, remap)) == NULL) {
+ if (nslabs <= IO_TLB_MIN_SLABS)
+ return;
+ nslabs = ALIGN(nslabs >> 1, IO_TLB_SEGSIZE);
+ nareas = limit_nareas(nareas, nslabs);
+ }
+
+ if (default_nslabs != nslabs) {
+ pr_info("SWIOTLB bounce buffer size adjusted %lu -> %lu slabs",
+ default_nslabs, nslabs);
+ default_nslabs = nslabs;
+ }
+
+ alloc_size = PAGE_ALIGN(array_size(sizeof(*mem->slots), nslabs));
+ mem->slots = memblock_alloc(alloc_size, PAGE_SIZE);
+ if (!mem->slots) {
+ pr_warn("%s: Failed to allocate %zu bytes align=0x%lx\n",
+ __func__, alloc_size, PAGE_SIZE);
+ return;
+ }
+
+ mem->areas = memblock_alloc(array_size(sizeof(struct io_tlb_area),
+ nareas), SMP_CACHE_BYTES);
+ if (!mem->areas) {
+ pr_warn("%s: Failed to allocate mem->areas.\n", __func__);
+ return;
+ }
+
+ swiotlb_init_io_tlb_pool(mem, __pa(tlb), nslabs, false, nareas);
+ add_mem_pool(&io_tlb_default_mem, mem);
+
+ if (flags & SWIOTLB_VERBOSE)
+ swiotlb_print_info();
+}
+
+void __init swiotlb_init(bool addressing_limit, unsigned int flags)
+{
+ swiotlb_init_remap(addressing_limit, flags, NULL);
+}
+
+/*
+ * Systems with larger DMA zones (those that don't support ISA) can
+ * initialize the swiotlb later using the slab allocator if needed.
+ * This should be just like above, but with some error catching.
+ */
+int swiotlb_init_late(size_t size, gfp_t gfp_mask,
+ int (*remap)(void *tlb, unsigned long nslabs))
+{
+ struct io_tlb_pool *mem = &io_tlb_default_mem.defpool;
+ unsigned long nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE);
+ unsigned int nareas;
+ unsigned char *vstart = NULL;
+ unsigned int order, area_order;
+ bool retried = false;
+ int rc = 0;
+
+ if (io_tlb_default_mem.nslabs)
+ return 0;
+
+ if (swiotlb_force_disable)
+ return 0;
+
+ io_tlb_default_mem.force_bounce = swiotlb_force_bounce;
+
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+ if (!remap)
+ io_tlb_default_mem.can_grow = true;
+ if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp_mask & __GFP_DMA))
+ io_tlb_default_mem.phys_limit = zone_dma_limit;
+ else if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp_mask & __GFP_DMA32))
+ io_tlb_default_mem.phys_limit = max(DMA_BIT_MASK(32), zone_dma_limit);
+ else
+ io_tlb_default_mem.phys_limit = virt_to_phys(high_memory - 1);
+#endif
+
+ if (!default_nareas)
+ swiotlb_adjust_nareas(num_possible_cpus());
+
+retry:
+ order = get_order(nslabs << IO_TLB_SHIFT);
+ nslabs = SLABS_PER_PAGE << order;
+
+ while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) {
+ vstart = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN,
+ order);
+ if (vstart)
+ break;
+ order--;
+ nslabs = SLABS_PER_PAGE << order;
+ retried = true;
+ }
+
+ if (!vstart)
+ return -ENOMEM;
+
+ if (remap)
+ rc = remap(vstart, nslabs);
+ if (rc) {
+ free_pages((unsigned long)vstart, order);
+
+ nslabs = ALIGN(nslabs >> 1, IO_TLB_SEGSIZE);
+ if (nslabs < IO_TLB_MIN_SLABS)
+ return rc;
+ retried = true;
+ goto retry;
+ }
+
+ if (retried) {
+ pr_warn("only able to allocate %ld MB\n",
+ (PAGE_SIZE << order) >> 20);
+ }
+
+ nareas = limit_nareas(default_nareas, nslabs);
+ area_order = get_order(array_size(sizeof(*mem->areas), nareas));
+ mem->areas = (struct io_tlb_area *)
+ __get_free_pages(GFP_KERNEL | __GFP_ZERO, area_order);
+ if (!mem->areas)
+ goto error_area;
+
+ mem->slots = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+ get_order(array_size(sizeof(*mem->slots), nslabs)));
+ if (!mem->slots)
+ goto error_slots;
+
+ set_memory_decrypted((unsigned long)vstart,
+ (nslabs << IO_TLB_SHIFT) >> PAGE_SHIFT);
+ swiotlb_init_io_tlb_pool(mem, virt_to_phys(vstart), nslabs, true,
+ nareas);
+ add_mem_pool(&io_tlb_default_mem, mem);
+
+ swiotlb_print_info();
+ return 0;
+
+error_slots:
+ free_pages((unsigned long)mem->areas, area_order);
+error_area:
+ free_pages((unsigned long)vstart, order);
+ return -ENOMEM;
+}
+
+void __init swiotlb_exit(void)
+{
+ struct io_tlb_pool *mem = &io_tlb_default_mem.defpool;
+ unsigned long tbl_vaddr;
+ size_t tbl_size, slots_size;
+ unsigned int area_order;
+
+ if (swiotlb_force_bounce)
+ return;
+
+ if (!mem->nslabs)
+ return;
+
+ pr_info("tearing down default memory pool\n");
+ tbl_vaddr = (unsigned long)phys_to_virt(mem->start);
+ tbl_size = PAGE_ALIGN(mem->end - mem->start);
+ slots_size = PAGE_ALIGN(array_size(sizeof(*mem->slots), mem->nslabs));
+
+ set_memory_encrypted(tbl_vaddr, tbl_size >> PAGE_SHIFT);
+ if (mem->late_alloc) {
+ area_order = get_order(array_size(sizeof(*mem->areas),
+ mem->nareas));
+ free_pages((unsigned long)mem->areas, area_order);
+ free_pages(tbl_vaddr, get_order(tbl_size));
+ free_pages((unsigned long)mem->slots, get_order(slots_size));
+ } else {
+ memblock_free_late(__pa(mem->areas),
+ array_size(sizeof(*mem->areas), mem->nareas));
+ memblock_free_late(mem->start, tbl_size);
+ memblock_free_late(__pa(mem->slots), slots_size);
+ }
+
+ memset(mem, 0, sizeof(*mem));
+}
+
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+
+/**
+ * alloc_dma_pages() - allocate pages to be used for DMA
+ * @gfp: GFP flags for the allocation.
+ * @bytes: Size of the buffer.
+ * @phys_limit: Maximum allowed physical address of the buffer.
+ *
+ * Allocate pages from the buddy allocator. If successful, make the allocated
+ * pages decrypted that they can be used for DMA.
+ *
+ * Return: Decrypted pages, %NULL on allocation failure, or ERR_PTR(-EAGAIN)
+ * if the allocated physical address was above @phys_limit.
+ */
+static struct page *alloc_dma_pages(gfp_t gfp, size_t bytes, u64 phys_limit)
+{
+ unsigned int order = get_order(bytes);
+ struct page *page;
+ phys_addr_t paddr;
+ void *vaddr;
+
+ page = alloc_pages(gfp, order);
+ if (!page)
+ return NULL;
+
+ paddr = page_to_phys(page);
+ if (paddr + bytes - 1 > phys_limit) {
+ __free_pages(page, order);
+ return ERR_PTR(-EAGAIN);
+ }
+
+ vaddr = phys_to_virt(paddr);
+ if (set_memory_decrypted((unsigned long)vaddr, PFN_UP(bytes)))
+ goto error;
+ return page;
+
+error:
+ /* Intentional leak if pages cannot be encrypted again. */
+ if (!set_memory_encrypted((unsigned long)vaddr, PFN_UP(bytes)))
+ __free_pages(page, order);
+ return NULL;
+}
+
+/**
+ * swiotlb_alloc_tlb() - allocate a dynamic IO TLB buffer
+ * @dev: Device for which a memory pool is allocated.
+ * @bytes: Size of the buffer.
+ * @phys_limit: Maximum allowed physical address of the buffer.
+ * @gfp: GFP flags for the allocation.
+ *
+ * Return: Allocated pages, or %NULL on allocation failure.
+ */
+static struct page *swiotlb_alloc_tlb(struct device *dev, size_t bytes,
+ u64 phys_limit, gfp_t gfp)
+{
+ struct page *page;
+
+ /*
+ * Allocate from the atomic pools if memory is encrypted and
+ * the allocation is atomic, because decrypting may block.
+ */
+ if (!gfpflags_allow_blocking(gfp) && dev && force_dma_unencrypted(dev)) {
+ void *vaddr;
+
+ if (!IS_ENABLED(CONFIG_DMA_COHERENT_POOL))
+ return NULL;
+
+ return dma_alloc_from_pool(dev, bytes, &vaddr, gfp,
+ dma_coherent_ok);
+ }
+
+ gfp &= ~GFP_ZONEMASK;
+ if (phys_limit <= zone_dma_limit)
+ gfp |= __GFP_DMA;
+ else if (phys_limit <= DMA_BIT_MASK(32))
+ gfp |= __GFP_DMA32;
+
+ while (IS_ERR(page = alloc_dma_pages(gfp, bytes, phys_limit))) {
+ if (IS_ENABLED(CONFIG_ZONE_DMA32) &&
+ phys_limit < DMA_BIT_MASK(64) &&
+ !(gfp & (__GFP_DMA32 | __GFP_DMA)))
+ gfp |= __GFP_DMA32;
+ else if (IS_ENABLED(CONFIG_ZONE_DMA) &&
+ !(gfp & __GFP_DMA))
+ gfp = (gfp & ~__GFP_DMA32) | __GFP_DMA;
+ else
+ return NULL;
+ }
+
+ return page;
+}
+
+/**
+ * swiotlb_free_tlb() - free a dynamically allocated IO TLB buffer
+ * @vaddr: Virtual address of the buffer.
+ * @bytes: Size of the buffer.
+ */
+static void swiotlb_free_tlb(void *vaddr, size_t bytes)
+{
+ if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) &&
+ dma_free_from_pool(NULL, vaddr, bytes))
+ return;
+
+ /* Intentional leak if pages cannot be encrypted again. */
+ if (!set_memory_encrypted((unsigned long)vaddr, PFN_UP(bytes)))
+ __free_pages(virt_to_page(vaddr), get_order(bytes));
+}
+
+/**
+ * swiotlb_alloc_pool() - allocate a new IO TLB memory pool
+ * @dev: Device for which a memory pool is allocated.
+ * @minslabs: Minimum number of slabs.
+ * @nslabs: Desired (maximum) number of slabs.
+ * @nareas: Number of areas.
+ * @phys_limit: Maximum DMA buffer physical address.
+ * @gfp: GFP flags for the allocations.
+ *
+ * Allocate and initialize a new IO TLB memory pool. The actual number of
+ * slabs may be reduced if allocation of @nslabs fails. If even
+ * @minslabs cannot be allocated, this function fails.
+ *
+ * Return: New memory pool, or %NULL on allocation failure.
+ */
+static struct io_tlb_pool *swiotlb_alloc_pool(struct device *dev,
+ unsigned long minslabs, unsigned long nslabs,
+ unsigned int nareas, u64 phys_limit, gfp_t gfp)
+{
+ struct io_tlb_pool *pool;
+ unsigned int slot_order;
+ struct page *tlb;
+ size_t pool_size;
+ size_t tlb_size;
+
+ if (nslabs > SLABS_PER_PAGE << MAX_PAGE_ORDER) {
+ nslabs = SLABS_PER_PAGE << MAX_PAGE_ORDER;
+ nareas = limit_nareas(nareas, nslabs);
+ }
+
+ pool_size = sizeof(*pool) + array_size(sizeof(*pool->areas), nareas);
+ pool = kzalloc(pool_size, gfp);
+ if (!pool)
+ goto error;
+ pool->areas = (void *)pool + sizeof(*pool);
+
+ tlb_size = nslabs << IO_TLB_SHIFT;
+ while (!(tlb = swiotlb_alloc_tlb(dev, tlb_size, phys_limit, gfp))) {
+ if (nslabs <= minslabs)
+ goto error_tlb;
+ nslabs = ALIGN(nslabs >> 1, IO_TLB_SEGSIZE);
+ nareas = limit_nareas(nareas, nslabs);
+ tlb_size = nslabs << IO_TLB_SHIFT;
+ }
+
+ slot_order = get_order(array_size(sizeof(*pool->slots), nslabs));
+ pool->slots = (struct io_tlb_slot *)
+ __get_free_pages(gfp, slot_order);
+ if (!pool->slots)
+ goto error_slots;
+
+ swiotlb_init_io_tlb_pool(pool, page_to_phys(tlb), nslabs, true, nareas);
+ return pool;
+
+error_slots:
+ swiotlb_free_tlb(page_address(tlb), tlb_size);
+error_tlb:
+ kfree(pool);
+error:
+ return NULL;
+}
+
+/**
+ * swiotlb_dyn_alloc() - dynamic memory pool allocation worker
+ * @work: Pointer to dyn_alloc in struct io_tlb_mem.
+ */
+static void swiotlb_dyn_alloc(struct work_struct *work)
+{
+ struct io_tlb_mem *mem =
+ container_of(work, struct io_tlb_mem, dyn_alloc);
+ struct io_tlb_pool *pool;
+
+ pool = swiotlb_alloc_pool(NULL, IO_TLB_MIN_SLABS, default_nslabs,
+ default_nareas, mem->phys_limit, GFP_KERNEL);
+ if (!pool) {
+ pr_warn_ratelimited("Failed to allocate new pool");
+ return;
+ }
+
+ add_mem_pool(mem, pool);
+}
+
+/**
+ * swiotlb_dyn_free() - RCU callback to free a memory pool
+ * @rcu: RCU head in the corresponding struct io_tlb_pool.
+ */
+static void swiotlb_dyn_free(struct rcu_head *rcu)
+{
+ struct io_tlb_pool *pool = container_of(rcu, struct io_tlb_pool, rcu);
+ size_t slots_size = array_size(sizeof(*pool->slots), pool->nslabs);
+ size_t tlb_size = pool->end - pool->start;
+
+ free_pages((unsigned long)pool->slots, get_order(slots_size));
+ swiotlb_free_tlb(pool->vaddr, tlb_size);
+ kfree(pool);
+}
+
+/**
+ * __swiotlb_find_pool() - find the IO TLB pool for a physical address
+ * @dev: Device which has mapped the DMA buffer.
+ * @paddr: Physical address within the DMA buffer.
+ *
+ * Find the IO TLB memory pool descriptor which contains the given physical
+ * address, if any. This function is for use only when the dev is known to
+ * be using swiotlb. Use swiotlb_find_pool() for the more general case
+ * when this condition is not met.
+ *
+ * Return: Memory pool which contains @paddr, or %NULL if none.
+ */
+struct io_tlb_pool *__swiotlb_find_pool(struct device *dev, phys_addr_t paddr)
+{
+ struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
+ struct io_tlb_pool *pool;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(pool, &mem->pools, node) {
+ if (paddr >= pool->start && paddr < pool->end)
+ goto out;
+ }
+
+ list_for_each_entry_rcu(pool, &dev->dma_io_tlb_pools, node) {
+ if (paddr >= pool->start && paddr < pool->end)
+ goto out;
+ }
+ pool = NULL;
+out:
+ rcu_read_unlock();
+ return pool;
+}
+
+/**
+ * swiotlb_del_pool() - remove an IO TLB pool from a device
+ * @dev: Owning device.
+ * @pool: Memory pool to be removed.
+ */
+static void swiotlb_del_pool(struct device *dev, struct io_tlb_pool *pool)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev->dma_io_tlb_lock, flags);
+ list_del_rcu(&pool->node);
+ spin_unlock_irqrestore(&dev->dma_io_tlb_lock, flags);
+
+ call_rcu(&pool->rcu, swiotlb_dyn_free);
+}
+
+#endif /* CONFIG_SWIOTLB_DYNAMIC */
+
+/**
+ * swiotlb_dev_init() - initialize swiotlb fields in &struct device
+ * @dev: Device to be initialized.
+ */
+void swiotlb_dev_init(struct device *dev)
+{
+ dev->dma_io_tlb_mem = &io_tlb_default_mem;
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+ INIT_LIST_HEAD(&dev->dma_io_tlb_pools);
+ spin_lock_init(&dev->dma_io_tlb_lock);
+ dev->dma_uses_io_tlb = false;
+#endif
+}
+
+/**
+ * swiotlb_align_offset() - Get required offset into an IO TLB allocation.
+ * @dev: Owning device.
+ * @align_mask: Allocation alignment mask.
+ * @addr: DMA address.
+ *
+ * Return the minimum offset from the start of an IO TLB allocation which is
+ * required for a given buffer address and allocation alignment to keep the
+ * device happy.
+ *
+ * First, the address bits covered by min_align_mask must be identical in the
+ * original address and the bounce buffer address. High bits are preserved by
+ * choosing a suitable IO TLB slot, but bits below IO_TLB_SHIFT require extra
+ * padding bytes before the bounce buffer.
+ *
+ * Second, @align_mask specifies which bits of the first allocated slot must
+ * be zero. This may require allocating additional padding slots, and then the
+ * offset (in bytes) from the first such padding slot is returned.
+ */
+static unsigned int swiotlb_align_offset(struct device *dev,
+ unsigned int align_mask, u64 addr)
+{
+ return addr & dma_get_min_align_mask(dev) &
+ (align_mask | (IO_TLB_SIZE - 1));
+}
+
+/*
+ * Bounce: copy the swiotlb buffer from or back to the original dma location
+ */
+static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size,
+ enum dma_data_direction dir, struct io_tlb_pool *mem)
+{
+ int index = (tlb_addr - mem->start) >> IO_TLB_SHIFT;
+ phys_addr_t orig_addr = mem->slots[index].orig_addr;
+ size_t alloc_size = mem->slots[index].alloc_size;
+ unsigned long pfn = PFN_DOWN(orig_addr);
+ unsigned char *vaddr = mem->vaddr + tlb_addr - mem->start;
+ int tlb_offset;
+
+ if (orig_addr == INVALID_PHYS_ADDR)
+ return;
+
+ /*
+ * It's valid for tlb_offset to be negative. This can happen when the
+ * "offset" returned by swiotlb_align_offset() is non-zero, and the
+ * tlb_addr is pointing within the first "offset" bytes of the second
+ * or subsequent slots of the allocated swiotlb area. While it's not
+ * valid for tlb_addr to be pointing within the first "offset" bytes
+ * of the first slot, there's no way to check for such an error since
+ * this function can't distinguish the first slot from the second and
+ * subsequent slots.
+ */
+ tlb_offset = (tlb_addr & (IO_TLB_SIZE - 1)) -
+ swiotlb_align_offset(dev, 0, orig_addr);
+
+ orig_addr += tlb_offset;
+ alloc_size -= tlb_offset;
+
+ if (size > alloc_size) {
+ dev_WARN_ONCE(dev, 1,
+ "Buffer overflow detected. Allocation size: %zu. Mapping size: %zu.\n",
+ alloc_size, size);
+ size = alloc_size;
+ }
+
+ if (PageHighMem(pfn_to_page(pfn))) {
+ unsigned int offset = orig_addr & ~PAGE_MASK;
+ struct page *page;
+ unsigned int sz = 0;
+ unsigned long flags;
+
+ while (size) {
+ sz = min_t(size_t, PAGE_SIZE - offset, size);
+
+ local_irq_save(flags);
+ page = pfn_to_page(pfn);
+ if (dir == DMA_TO_DEVICE)
+ memcpy_from_page(vaddr, page, offset, sz);
+ else
+ memcpy_to_page(page, offset, vaddr, sz);
+ local_irq_restore(flags);
+
+ size -= sz;
+ pfn++;
+ vaddr += sz;
+ offset = 0;
+ }
+ } else if (dir == DMA_TO_DEVICE) {
+ memcpy(vaddr, phys_to_virt(orig_addr), size);
+ } else {
+ memcpy(phys_to_virt(orig_addr), vaddr, size);
+ }
+}
+
+static inline phys_addr_t slot_addr(phys_addr_t start, phys_addr_t idx)
+{
+ return start + (idx << IO_TLB_SHIFT);
+}
+
+/*
+ * Carefully handle integer overflow which can occur when boundary_mask == ~0UL.
+ */
+static inline unsigned long get_max_slots(unsigned long boundary_mask)
+{
+ return (boundary_mask >> IO_TLB_SHIFT) + 1;
+}
+
+static unsigned int wrap_area_index(struct io_tlb_pool *mem, unsigned int index)
+{
+ if (index >= mem->area_nslabs)
+ return 0;
+ return index;
+}
+
+/*
+ * Track the total used slots with a global atomic value in order to have
+ * correct information to determine the high water mark. The mem_used()
+ * function gives imprecise results because there's no locking across
+ * multiple areas.
+ */
+#ifdef CONFIG_DEBUG_FS
+static void inc_used_and_hiwater(struct io_tlb_mem *mem, unsigned int nslots)
+{
+ unsigned long old_hiwater, new_used;
+
+ new_used = atomic_long_add_return(nslots, &mem->total_used);
+ old_hiwater = atomic_long_read(&mem->used_hiwater);
+ do {
+ if (new_used <= old_hiwater)
+ break;
+ } while (!atomic_long_try_cmpxchg(&mem->used_hiwater,
+ &old_hiwater, new_used));
+}
+
+static void dec_used(struct io_tlb_mem *mem, unsigned int nslots)
+{
+ atomic_long_sub(nslots, &mem->total_used);
+}
+
+#else /* !CONFIG_DEBUG_FS */
+static void inc_used_and_hiwater(struct io_tlb_mem *mem, unsigned int nslots)
+{
+}
+static void dec_used(struct io_tlb_mem *mem, unsigned int nslots)
+{
+}
+#endif /* CONFIG_DEBUG_FS */
+
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+#ifdef CONFIG_DEBUG_FS
+static void inc_transient_used(struct io_tlb_mem *mem, unsigned int nslots)
+{
+ atomic_long_add(nslots, &mem->transient_nslabs);
+}
+
+static void dec_transient_used(struct io_tlb_mem *mem, unsigned int nslots)
+{
+ atomic_long_sub(nslots, &mem->transient_nslabs);
+}
+
+#else /* !CONFIG_DEBUG_FS */
+static void inc_transient_used(struct io_tlb_mem *mem, unsigned int nslots)
+{
+}
+static void dec_transient_used(struct io_tlb_mem *mem, unsigned int nslots)
+{
+}
+#endif /* CONFIG_DEBUG_FS */
+#endif /* CONFIG_SWIOTLB_DYNAMIC */
+
+/**
+ * swiotlb_search_pool_area() - search one memory area in one pool
+ * @dev: Device which maps the buffer.
+ * @pool: Memory pool to be searched.
+ * @area_index: Index of the IO TLB memory area to be searched.
+ * @orig_addr: Original (non-bounced) IO buffer address.
+ * @alloc_size: Total requested size of the bounce buffer,
+ * including initial alignment padding.
+ * @alloc_align_mask: Required alignment of the allocated buffer.
+ *
+ * Find a suitable sequence of IO TLB entries for the request and allocate
+ * a buffer from the given IO TLB memory area.
+ * This function takes care of locking.
+ *
+ * Return: Index of the first allocated slot, or -1 on error.
+ */
+static int swiotlb_search_pool_area(struct device *dev, struct io_tlb_pool *pool,
+ int area_index, phys_addr_t orig_addr, size_t alloc_size,
+ unsigned int alloc_align_mask)
+{
+ struct io_tlb_area *area = pool->areas + area_index;
+ unsigned long boundary_mask = dma_get_seg_boundary(dev);
+ dma_addr_t tbl_dma_addr =
+ phys_to_dma_unencrypted(dev, pool->start) & boundary_mask;
+ unsigned long max_slots = get_max_slots(boundary_mask);
+ unsigned int iotlb_align_mask = dma_get_min_align_mask(dev);
+ unsigned int nslots = nr_slots(alloc_size), stride;
+ unsigned int offset = swiotlb_align_offset(dev, 0, orig_addr);
+ unsigned int index, slots_checked, count = 0, i;
+ unsigned long flags;
+ unsigned int slot_base;
+ unsigned int slot_index;
+
+ BUG_ON(!nslots);
+ BUG_ON(area_index >= pool->nareas);
+
+ /*
+ * Historically, swiotlb allocations >= PAGE_SIZE were guaranteed to be
+ * page-aligned in the absence of any other alignment requirements.
+ * 'alloc_align_mask' was later introduced to specify the alignment
+ * explicitly, however this is passed as zero for streaming mappings
+ * and so we preserve the old behaviour there in case any drivers are
+ * relying on it.
+ */
+ if (!alloc_align_mask && !iotlb_align_mask && alloc_size >= PAGE_SIZE)
+ alloc_align_mask = PAGE_SIZE - 1;
+
+ /*
+ * Ensure that the allocation is at least slot-aligned and update
+ * 'iotlb_align_mask' to ignore bits that will be preserved when
+ * offsetting into the allocation.
+ */
+ alloc_align_mask |= (IO_TLB_SIZE - 1);
+ iotlb_align_mask &= ~alloc_align_mask;
+
+ /*
+ * For mappings with an alignment requirement don't bother looping to
+ * unaligned slots once we found an aligned one.
+ */
+ stride = get_max_slots(max(alloc_align_mask, iotlb_align_mask));
+
+ spin_lock_irqsave(&area->lock, flags);
+ if (unlikely(nslots > pool->area_nslabs - area->used))
+ goto not_found;
+
+ slot_base = area_index * pool->area_nslabs;
+ index = area->index;
+
+ for (slots_checked = 0; slots_checked < pool->area_nslabs; ) {
+ phys_addr_t tlb_addr;
+
+ slot_index = slot_base + index;
+ tlb_addr = slot_addr(tbl_dma_addr, slot_index);
+
+ if ((tlb_addr & alloc_align_mask) ||
+ (orig_addr && (tlb_addr & iotlb_align_mask) !=
+ (orig_addr & iotlb_align_mask))) {
+ index = wrap_area_index(pool, index + 1);
+ slots_checked++;
+ continue;
+ }
+
+ if (!iommu_is_span_boundary(slot_index, nslots,
+ nr_slots(tbl_dma_addr),
+ max_slots)) {
+ if (pool->slots[slot_index].list >= nslots)
+ goto found;
+ }
+ index = wrap_area_index(pool, index + stride);
+ slots_checked += stride;
+ }
+
+not_found:
+ spin_unlock_irqrestore(&area->lock, flags);
+ return -1;
+
+found:
+ /*
+ * If we find a slot that indicates we have 'nslots' number of
+ * contiguous buffers, we allocate the buffers from that slot onwards
+ * and set the list of free entries to '0' indicating unavailable.
+ */
+ for (i = slot_index; i < slot_index + nslots; i++) {
+ pool->slots[i].list = 0;
+ pool->slots[i].alloc_size = alloc_size - (offset +
+ ((i - slot_index) << IO_TLB_SHIFT));
+ }
+ for (i = slot_index - 1;
+ io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 &&
+ pool->slots[i].list; i--)
+ pool->slots[i].list = ++count;
+
+ /*
+ * Update the indices to avoid searching in the next round.
+ */
+ area->index = wrap_area_index(pool, index + nslots);
+ area->used += nslots;
+ spin_unlock_irqrestore(&area->lock, flags);
+
+ inc_used_and_hiwater(dev->dma_io_tlb_mem, nslots);
+ return slot_index;
+}
+
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+
+/**
+ * swiotlb_search_area() - search one memory area in all pools
+ * @dev: Device which maps the buffer.
+ * @start_cpu: Start CPU number.
+ * @cpu_offset: Offset from @start_cpu.
+ * @orig_addr: Original (non-bounced) IO buffer address.
+ * @alloc_size: Total requested size of the bounce buffer,
+ * including initial alignment padding.
+ * @alloc_align_mask: Required alignment of the allocated buffer.
+ * @retpool: Used memory pool, updated on return.
+ *
+ * Search one memory area in all pools for a sequence of slots that match the
+ * allocation constraints.
+ *
+ * Return: Index of the first allocated slot, or -1 on error.
+ */
+static int swiotlb_search_area(struct device *dev, int start_cpu,
+ int cpu_offset, phys_addr_t orig_addr, size_t alloc_size,
+ unsigned int alloc_align_mask, struct io_tlb_pool **retpool)
+{
+ struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
+ struct io_tlb_pool *pool;
+ int area_index;
+ int index = -1;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(pool, &mem->pools, node) {
+ if (cpu_offset >= pool->nareas)
+ continue;
+ area_index = (start_cpu + cpu_offset) & (pool->nareas - 1);
+ index = swiotlb_search_pool_area(dev, pool, area_index,
+ orig_addr, alloc_size,
+ alloc_align_mask);
+ if (index >= 0) {
+ *retpool = pool;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ return index;
+}
+
+/**
+ * swiotlb_find_slots() - search for slots in the whole swiotlb
+ * @dev: Device which maps the buffer.
+ * @orig_addr: Original (non-bounced) IO buffer address.
+ * @alloc_size: Total requested size of the bounce buffer,
+ * including initial alignment padding.
+ * @alloc_align_mask: Required alignment of the allocated buffer.
+ * @retpool: Used memory pool, updated on return.
+ *
+ * Search through the whole software IO TLB to find a sequence of slots that
+ * match the allocation constraints.
+ *
+ * Return: Index of the first allocated slot, or -1 on error.
+ */
+static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
+ size_t alloc_size, unsigned int alloc_align_mask,
+ struct io_tlb_pool **retpool)
+{
+ struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
+ struct io_tlb_pool *pool;
+ unsigned long nslabs;
+ unsigned long flags;
+ u64 phys_limit;
+ int cpu, i;
+ int index;
+
+ if (alloc_size > IO_TLB_SEGSIZE * IO_TLB_SIZE)
+ return -1;
+
+ cpu = raw_smp_processor_id();
+ for (i = 0; i < default_nareas; ++i) {
+ index = swiotlb_search_area(dev, cpu, i, orig_addr, alloc_size,
+ alloc_align_mask, &pool);
+ if (index >= 0)
+ goto found;
+ }
+
+ if (!mem->can_grow)
+ return -1;
+
+ schedule_work(&mem->dyn_alloc);
+
+ nslabs = nr_slots(alloc_size);
+ phys_limit = min_not_zero(*dev->dma_mask, dev->bus_dma_limit);
+ pool = swiotlb_alloc_pool(dev, nslabs, nslabs, 1, phys_limit,
+ GFP_NOWAIT);
+ if (!pool)
+ return -1;
+
+ index = swiotlb_search_pool_area(dev, pool, 0, orig_addr,
+ alloc_size, alloc_align_mask);
+ if (index < 0) {
+ swiotlb_dyn_free(&pool->rcu);
+ return -1;
+ }
+
+ pool->transient = true;
+ spin_lock_irqsave(&dev->dma_io_tlb_lock, flags);
+ list_add_rcu(&pool->node, &dev->dma_io_tlb_pools);
+ spin_unlock_irqrestore(&dev->dma_io_tlb_lock, flags);
+ inc_transient_used(mem, pool->nslabs);
+
+found:
+ WRITE_ONCE(dev->dma_uses_io_tlb, true);
+
+ /*
+ * The general barrier orders reads and writes against a presumed store
+ * of the SWIOTLB buffer address by a device driver (to a driver private
+ * data structure). It serves two purposes.
+ *
+ * First, the store to dev->dma_uses_io_tlb must be ordered before the
+ * presumed store. This guarantees that the returned buffer address
+ * cannot be passed to another CPU before updating dev->dma_uses_io_tlb.
+ *
+ * Second, the load from mem->pools must be ordered before the same
+ * presumed store. This guarantees that the returned buffer address
+ * cannot be observed by another CPU before an update of the RCU list
+ * that was made by swiotlb_dyn_alloc() on a third CPU (cf. multicopy
+ * atomicity).
+ *
+ * See also the comment in swiotlb_find_pool().
+ */
+ smp_mb();
+
+ *retpool = pool;
+ return index;
+}
+
+#else /* !CONFIG_SWIOTLB_DYNAMIC */
+
+static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
+ size_t alloc_size, unsigned int alloc_align_mask,
+ struct io_tlb_pool **retpool)
+{
+ struct io_tlb_pool *pool;
+ int start, i;
+ int index;
+
+ *retpool = pool = &dev->dma_io_tlb_mem->defpool;
+ i = start = raw_smp_processor_id() & (pool->nareas - 1);
+ do {
+ index = swiotlb_search_pool_area(dev, pool, i, orig_addr,
+ alloc_size, alloc_align_mask);
+ if (index >= 0)
+ return index;
+ if (++i >= pool->nareas)
+ i = 0;
+ } while (i != start);
+ return -1;
+}
+
+#endif /* CONFIG_SWIOTLB_DYNAMIC */
+
+#ifdef CONFIG_DEBUG_FS
+
+/**
+ * mem_used() - get number of used slots in an allocator
+ * @mem: Software IO TLB allocator.
+ *
+ * The result is accurate in this version of the function, because an atomic
+ * counter is available if CONFIG_DEBUG_FS is set.
+ *
+ * Return: Number of used slots.
+ */
+static unsigned long mem_used(struct io_tlb_mem *mem)
+{
+ return atomic_long_read(&mem->total_used);
+}
+
+#else /* !CONFIG_DEBUG_FS */
+
+/**
+ * mem_pool_used() - get number of used slots in a memory pool
+ * @pool: Software IO TLB memory pool.
+ *
+ * The result is not accurate, see mem_used().
+ *
+ * Return: Approximate number of used slots.
+ */
+static unsigned long mem_pool_used(struct io_tlb_pool *pool)
+{
+ int i;
+ unsigned long used = 0;
+
+ for (i = 0; i < pool->nareas; i++)
+ used += pool->areas[i].used;
+ return used;
+}
+
+/**
+ * mem_used() - get number of used slots in an allocator
+ * @mem: Software IO TLB allocator.
+ *
+ * The result is not accurate, because there is no locking of individual
+ * areas.
+ *
+ * Return: Approximate number of used slots.
+ */
+static unsigned long mem_used(struct io_tlb_mem *mem)
+{
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+ struct io_tlb_pool *pool;
+ unsigned long used = 0;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(pool, &mem->pools, node)
+ used += mem_pool_used(pool);
+ rcu_read_unlock();
+
+ return used;
+#else
+ return mem_pool_used(&mem->defpool);
+#endif
+}
+
+#endif /* CONFIG_DEBUG_FS */
+
+/**
+ * swiotlb_tbl_map_single() - bounce buffer map a single contiguous physical area
+ * @dev: Device which maps the buffer.
+ * @orig_addr: Original (non-bounced) physical IO buffer address
+ * @mapping_size: Requested size of the actual bounce buffer, excluding
+ * any pre- or post-padding for alignment
+ * @alloc_align_mask: Required start and end alignment of the allocated buffer
+ * @dir: DMA direction
+ * @attrs: Optional DMA attributes for the map operation
+ *
+ * Find and allocate a suitable sequence of IO TLB slots for the request.
+ * The allocated space starts at an alignment specified by alloc_align_mask,
+ * and the size of the allocated space is rounded up so that the total amount
+ * of allocated space is a multiple of (alloc_align_mask + 1). If
+ * alloc_align_mask is zero, the allocated space may be at any alignment and
+ * the size is not rounded up.
+ *
+ * The returned address is within the allocated space and matches the bits
+ * of orig_addr that are specified in the DMA min_align_mask for the device. As
+ * such, this returned address may be offset from the beginning of the allocated
+ * space. The bounce buffer space starting at the returned address for
+ * mapping_size bytes is initialized to the contents of the original IO buffer
+ * area. Any pre-padding (due to an offset) and any post-padding (due to
+ * rounding-up the size) is not initialized.
+ */
+phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
+ size_t mapping_size, unsigned int alloc_align_mask,
+ enum dma_data_direction dir, unsigned long attrs)
+{
+ struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
+ unsigned int offset;
+ struct io_tlb_pool *pool;
+ unsigned int i;
+ size_t size;
+ int index;
+ phys_addr_t tlb_addr;
+ unsigned short pad_slots;
+
+ if (!mem || !mem->nslabs) {
+ dev_warn_ratelimited(dev,
+ "Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer");
+ return (phys_addr_t)DMA_MAPPING_ERROR;
+ }
+
+ if (cc_platform_has(CC_ATTR_MEM_ENCRYPT))
+ pr_warn_once("Memory encryption is active and system is using DMA bounce buffers\n");
+
+ /*
+ * The default swiotlb memory pool is allocated with PAGE_SIZE
+ * alignment. If a mapping is requested with larger alignment,
+ * the mapping may be unable to use the initial slot(s) in all
+ * sets of IO_TLB_SEGSIZE slots. In such case, a mapping request
+ * of or near the maximum mapping size would always fail.
+ */
+ dev_WARN_ONCE(dev, alloc_align_mask > ~PAGE_MASK,
+ "Alloc alignment may prevent fulfilling requests with max mapping_size\n");
+
+ offset = swiotlb_align_offset(dev, alloc_align_mask, orig_addr);
+ size = ALIGN(mapping_size + offset, alloc_align_mask + 1);
+ index = swiotlb_find_slots(dev, orig_addr, size, alloc_align_mask, &pool);
+ if (index == -1) {
+ if (!(attrs & DMA_ATTR_NO_WARN))
+ dev_warn_ratelimited(dev,
+ "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n",
+ size, mem->nslabs, mem_used(mem));
+ return (phys_addr_t)DMA_MAPPING_ERROR;
+ }
+
+ /*
+ * If dma_skip_sync was set, reset it on first SWIOTLB buffer
+ * mapping to always sync SWIOTLB buffers.
+ */
+ dma_reset_need_sync(dev);
+
+ /*
+ * Save away the mapping from the original address to the DMA address.
+ * This is needed when we sync the memory. Then we sync the buffer if
+ * needed.
+ */
+ pad_slots = offset >> IO_TLB_SHIFT;
+ offset &= (IO_TLB_SIZE - 1);
+ index += pad_slots;
+ pool->slots[index].pad_slots = pad_slots;
+ for (i = 0; i < (nr_slots(size) - pad_slots); i++)
+ pool->slots[index + i].orig_addr = slot_addr(orig_addr, i);
+ tlb_addr = slot_addr(pool->start, index) + offset;
+ /*
+ * When the device is writing memory, i.e. dir == DMA_FROM_DEVICE, copy
+ * the original buffer to the TLB buffer before initiating DMA in order
+ * to preserve the original's data if the device does a partial write,
+ * i.e. if the device doesn't overwrite the entire buffer. Preserving
+ * the original data, even if it's garbage, is necessary to match
+ * hardware behavior. Use of swiotlb is supposed to be transparent,
+ * i.e. swiotlb must not corrupt memory by clobbering unwritten bytes.
+ */
+ swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE, pool);
+ return tlb_addr;
+}
+
+static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr,
+ struct io_tlb_pool *mem)
+{
+ unsigned long flags;
+ unsigned int offset = swiotlb_align_offset(dev, 0, tlb_addr);
+ int index, nslots, aindex;
+ struct io_tlb_area *area;
+ int count, i;
+
+ index = (tlb_addr - offset - mem->start) >> IO_TLB_SHIFT;
+ index -= mem->slots[index].pad_slots;
+ nslots = nr_slots(mem->slots[index].alloc_size + offset);
+ aindex = index / mem->area_nslabs;
+ area = &mem->areas[aindex];
+
+ /*
+ * Return the buffer to the free list by setting the corresponding
+ * entries to indicate the number of contiguous entries available.
+ * While returning the entries to the free list, we merge the entries
+ * with slots below and above the pool being returned.
+ */
+ BUG_ON(aindex >= mem->nareas);
+
+ spin_lock_irqsave(&area->lock, flags);
+ if (index + nslots < ALIGN(index + 1, IO_TLB_SEGSIZE))
+ count = mem->slots[index + nslots].list;
+ else
+ count = 0;
+
+ /*
+ * Step 1: return the slots to the free list, merging the slots with
+ * superceeding slots
+ */
+ for (i = index + nslots - 1; i >= index; i--) {
+ mem->slots[i].list = ++count;
+ mem->slots[i].orig_addr = INVALID_PHYS_ADDR;
+ mem->slots[i].alloc_size = 0;
+ mem->slots[i].pad_slots = 0;
+ }
+
+ /*
+ * Step 2: merge the returned slots with the preceding slots, if
+ * available (non zero)
+ */
+ for (i = index - 1;
+ io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && mem->slots[i].list;
+ i--)
+ mem->slots[i].list = ++count;
+ area->used -= nslots;
+ spin_unlock_irqrestore(&area->lock, flags);
+
+ dec_used(dev->dma_io_tlb_mem, nslots);
+}
+
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+
+/**
+ * swiotlb_del_transient() - delete a transient memory pool
+ * @dev: Device which mapped the buffer.
+ * @tlb_addr: Physical address within a bounce buffer.
+ * @pool: Pointer to the transient memory pool to be checked and deleted.
+ *
+ * Check whether the address belongs to a transient SWIOTLB memory pool.
+ * If yes, then delete the pool.
+ *
+ * Return: %true if @tlb_addr belonged to a transient pool that was released.
+ */
+static bool swiotlb_del_transient(struct device *dev, phys_addr_t tlb_addr,
+ struct io_tlb_pool *pool)
+{
+ if (!pool->transient)
+ return false;
+
+ dec_used(dev->dma_io_tlb_mem, pool->nslabs);
+ swiotlb_del_pool(dev, pool);
+ dec_transient_used(dev->dma_io_tlb_mem, pool->nslabs);
+ return true;
+}
+
+#else /* !CONFIG_SWIOTLB_DYNAMIC */
+
+static inline bool swiotlb_del_transient(struct device *dev,
+ phys_addr_t tlb_addr, struct io_tlb_pool *pool)
+{
+ return false;
+}
+
+#endif /* CONFIG_SWIOTLB_DYNAMIC */
+
+/*
+ * tlb_addr is the physical address of the bounce buffer to unmap.
+ */
+void __swiotlb_tbl_unmap_single(struct device *dev, phys_addr_t tlb_addr,
+ size_t mapping_size, enum dma_data_direction dir,
+ unsigned long attrs, struct io_tlb_pool *pool)
+{
+ /*
+ * First, sync the memory before unmapping the entry
+ */
+ if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
+ (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL))
+ swiotlb_bounce(dev, tlb_addr, mapping_size,
+ DMA_FROM_DEVICE, pool);
+
+ if (swiotlb_del_transient(dev, tlb_addr, pool))
+ return;
+ swiotlb_release_slots(dev, tlb_addr, pool);
+}
+
+void __swiotlb_sync_single_for_device(struct device *dev, phys_addr_t tlb_addr,
+ size_t size, enum dma_data_direction dir,
+ struct io_tlb_pool *pool)
+{
+ if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)
+ swiotlb_bounce(dev, tlb_addr, size, DMA_TO_DEVICE, pool);
+ else
+ BUG_ON(dir != DMA_FROM_DEVICE);
+}
+
+void __swiotlb_sync_single_for_cpu(struct device *dev, phys_addr_t tlb_addr,
+ size_t size, enum dma_data_direction dir,
+ struct io_tlb_pool *pool)
+{
+ if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
+ swiotlb_bounce(dev, tlb_addr, size, DMA_FROM_DEVICE, pool);
+ else
+ BUG_ON(dir != DMA_TO_DEVICE);
+}
+
+/*
+ * Create a swiotlb mapping for the buffer at @paddr, and in case of DMAing
+ * to the device copy the data into it as well.
+ */
+dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size,
+ enum dma_data_direction dir, unsigned long attrs)
+{
+ phys_addr_t swiotlb_addr;
+ dma_addr_t dma_addr;
+
+ trace_swiotlb_bounced(dev, phys_to_dma(dev, paddr), size);
+
+ swiotlb_addr = swiotlb_tbl_map_single(dev, paddr, size, 0, dir, attrs);
+ if (swiotlb_addr == (phys_addr_t)DMA_MAPPING_ERROR)
+ return DMA_MAPPING_ERROR;
+
+ /* Ensure that the address returned is DMA'ble */
+ dma_addr = phys_to_dma_unencrypted(dev, swiotlb_addr);
+ if (unlikely(!dma_capable(dev, dma_addr, size, true))) {
+ __swiotlb_tbl_unmap_single(dev, swiotlb_addr, size, dir,
+ attrs | DMA_ATTR_SKIP_CPU_SYNC,
+ swiotlb_find_pool(dev, swiotlb_addr));
+ dev_WARN_ONCE(dev, 1,
+ "swiotlb addr %pad+%zu overflow (mask %llx, bus limit %llx).\n",
+ &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit);
+ return DMA_MAPPING_ERROR;
+ }
+
+ if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+ arch_sync_dma_for_device(swiotlb_addr, size, dir);
+ return dma_addr;
+}
+
+size_t swiotlb_max_mapping_size(struct device *dev)
+{
+ int min_align_mask = dma_get_min_align_mask(dev);
+ int min_align = 0;
+
+ /*
+ * swiotlb_find_slots() skips slots according to
+ * min align mask. This affects max mapping size.
+ * Take it into acount here.
+ */
+ if (min_align_mask)
+ min_align = roundup(min_align_mask, IO_TLB_SIZE);
+
+ return ((size_t)IO_TLB_SIZE) * IO_TLB_SEGSIZE - min_align;
+}
+
+/**
+ * is_swiotlb_allocated() - check if the default software IO TLB is initialized
+ */
+bool is_swiotlb_allocated(void)
+{
+ return io_tlb_default_mem.nslabs;
+}
+
+bool is_swiotlb_active(struct device *dev)
+{
+ struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
+
+ return mem && mem->nslabs;
+}
+
+/**
+ * default_swiotlb_base() - get the base address of the default SWIOTLB
+ *
+ * Get the lowest physical address used by the default software IO TLB pool.
+ */
+phys_addr_t default_swiotlb_base(void)
+{
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+ io_tlb_default_mem.can_grow = false;
+#endif
+ return io_tlb_default_mem.defpool.start;
+}
+
+/**
+ * default_swiotlb_limit() - get the address limit of the default SWIOTLB
+ *
+ * Get the highest physical address used by the default software IO TLB pool.
+ */
+phys_addr_t default_swiotlb_limit(void)
+{
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+ return io_tlb_default_mem.phys_limit;
+#else
+ return io_tlb_default_mem.defpool.end - 1;
+#endif
+}
+
+#ifdef CONFIG_DEBUG_FS
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+static unsigned long mem_transient_used(struct io_tlb_mem *mem)
+{
+ return atomic_long_read(&mem->transient_nslabs);
+}
+
+static int io_tlb_transient_used_get(void *data, u64 *val)
+{
+ struct io_tlb_mem *mem = data;
+
+ *val = mem_transient_used(mem);
+ return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(fops_io_tlb_transient_used, io_tlb_transient_used_get,
+ NULL, "%llu\n");
+#endif /* CONFIG_SWIOTLB_DYNAMIC */
+
+static int io_tlb_used_get(void *data, u64 *val)
+{
+ struct io_tlb_mem *mem = data;
+
+ *val = mem_used(mem);
+ return 0;
+}
+
+static int io_tlb_hiwater_get(void *data, u64 *val)
+{
+ struct io_tlb_mem *mem = data;
+
+ *val = atomic_long_read(&mem->used_hiwater);
+ return 0;
+}
+
+static int io_tlb_hiwater_set(void *data, u64 val)
+{
+ struct io_tlb_mem *mem = data;
+
+ /* Only allow setting to zero */
+ if (val != 0)
+ return -EINVAL;
+
+ atomic_long_set(&mem->used_hiwater, val);
+ return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(fops_io_tlb_used, io_tlb_used_get, NULL, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_io_tlb_hiwater, io_tlb_hiwater_get,
+ io_tlb_hiwater_set, "%llu\n");
+
+static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem,
+ const char *dirname)
+{
+ mem->debugfs = debugfs_create_dir(dirname, io_tlb_default_mem.debugfs);
+ if (!mem->nslabs)
+ return;
+
+ debugfs_create_ulong("io_tlb_nslabs", 0400, mem->debugfs, &mem->nslabs);
+ debugfs_create_file("io_tlb_used", 0400, mem->debugfs, mem,
+ &fops_io_tlb_used);
+ debugfs_create_file("io_tlb_used_hiwater", 0600, mem->debugfs, mem,
+ &fops_io_tlb_hiwater);
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+ debugfs_create_file("io_tlb_transient_nslabs", 0400, mem->debugfs,
+ mem, &fops_io_tlb_transient_used);
+#endif
+}
+
+static int __init swiotlb_create_default_debugfs(void)
+{
+ swiotlb_create_debugfs_files(&io_tlb_default_mem, "swiotlb");
+ return 0;
+}
+
+late_initcall(swiotlb_create_default_debugfs);
+
+#else /* !CONFIG_DEBUG_FS */
+
+static inline void swiotlb_create_debugfs_files(struct io_tlb_mem *mem,
+ const char *dirname)
+{
+}
+
+#endif /* CONFIG_DEBUG_FS */
+
+#ifdef CONFIG_DMA_RESTRICTED_POOL
+
+struct page *swiotlb_alloc(struct device *dev, size_t size)
+{
+ struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
+ struct io_tlb_pool *pool;
+ phys_addr_t tlb_addr;
+ unsigned int align;
+ int index;
+
+ if (!mem)
+ return NULL;
+
+ align = (1 << (get_order(size) + PAGE_SHIFT)) - 1;
+ index = swiotlb_find_slots(dev, 0, size, align, &pool);
+ if (index == -1)
+ return NULL;
+
+ tlb_addr = slot_addr(pool->start, index);
+ if (unlikely(!PAGE_ALIGNED(tlb_addr))) {
+ dev_WARN_ONCE(dev, 1, "Cannot allocate pages from non page-aligned swiotlb addr 0x%pa.\n",
+ &tlb_addr);
+ swiotlb_release_slots(dev, tlb_addr, pool);
+ return NULL;
+ }
+
+ return pfn_to_page(PFN_DOWN(tlb_addr));
+}
+
+bool swiotlb_free(struct device *dev, struct page *page, size_t size)
+{
+ phys_addr_t tlb_addr = page_to_phys(page);
+ struct io_tlb_pool *pool;
+
+ pool = swiotlb_find_pool(dev, tlb_addr);
+ if (!pool)
+ return false;
+
+ swiotlb_release_slots(dev, tlb_addr, pool);
+
+ return true;
+}
+
+static int rmem_swiotlb_device_init(struct reserved_mem *rmem,
+ struct device *dev)
+{
+ struct io_tlb_mem *mem = rmem->priv;
+ unsigned long nslabs = rmem->size >> IO_TLB_SHIFT;
+
+ /* Set Per-device io tlb area to one */
+ unsigned int nareas = 1;
+
+ if (PageHighMem(pfn_to_page(PHYS_PFN(rmem->base)))) {
+ dev_err(dev, "Restricted DMA pool must be accessible within the linear mapping.");
+ return -EINVAL;
+ }
+
+ /*
+ * Since multiple devices can share the same pool, the private data,
+ * io_tlb_mem struct, will be initialized by the first device attached
+ * to it.
+ */
+ if (!mem) {
+ struct io_tlb_pool *pool;
+
+ mem = kzalloc(sizeof(*mem), GFP_KERNEL);
+ if (!mem)
+ return -ENOMEM;
+ pool = &mem->defpool;
+
+ pool->slots = kcalloc(nslabs, sizeof(*pool->slots), GFP_KERNEL);
+ if (!pool->slots) {
+ kfree(mem);
+ return -ENOMEM;
+ }
+
+ pool->areas = kcalloc(nareas, sizeof(*pool->areas),
+ GFP_KERNEL);
+ if (!pool->areas) {
+ kfree(pool->slots);
+ kfree(mem);
+ return -ENOMEM;
+ }
+
+ set_memory_decrypted((unsigned long)phys_to_virt(rmem->base),
+ rmem->size >> PAGE_SHIFT);
+ swiotlb_init_io_tlb_pool(pool, rmem->base, nslabs,
+ false, nareas);
+ mem->force_bounce = true;
+ mem->for_alloc = true;
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+ spin_lock_init(&mem->lock);
+ INIT_LIST_HEAD_RCU(&mem->pools);
+#endif
+ add_mem_pool(mem, pool);
+
+ rmem->priv = mem;
+
+ swiotlb_create_debugfs_files(mem, rmem->name);
+ }
+
+ dev->dma_io_tlb_mem = mem;
+
+ return 0;
+}
+
+static void rmem_swiotlb_device_release(struct reserved_mem *rmem,
+ struct device *dev)
+{
+ dev->dma_io_tlb_mem = &io_tlb_default_mem;
+}
+
+static const struct reserved_mem_ops rmem_swiotlb_ops = {
+ .device_init = rmem_swiotlb_device_init,
+ .device_release = rmem_swiotlb_device_release,
+};
+
+static int __init rmem_swiotlb_setup(struct reserved_mem *rmem)
+{
+ unsigned long node = rmem->fdt_node;
+
+ if (of_get_flat_dt_prop(node, "reusable", NULL) ||
+ of_get_flat_dt_prop(node, "linux,cma-default", NULL) ||
+ of_get_flat_dt_prop(node, "linux,dma-default", NULL) ||
+ of_get_flat_dt_prop(node, "no-map", NULL))
+ return -EINVAL;
+
+ rmem->ops = &rmem_swiotlb_ops;
+ pr_info("Reserved memory: created restricted DMA pool at %pa, size %ld MiB\n",
+ &rmem->base, (unsigned long)rmem->size / SZ_1M);
+ return 0;
+}
+
+RESERVEDMEM_OF_DECLARE(dma, "restricted-dma-pool", rmem_swiotlb_setup);
+#endif /* CONFIG_DMA_RESTRICTED_POOL */
diff --git a/kernel/elfcore.c b/kernel/elfcore.c
deleted file mode 100644
index e556751d15d9..000000000000
--- a/kernel/elfcore.c
+++ /dev/null
@@ -1,24 +0,0 @@
-#include <linux/elf.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
-#include <linux/binfmts.h>
-
-Elf_Half __weak elf_core_extra_phdrs(void)
-{
- return 0;
-}
-
-int __weak elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset)
-{
- return 1;
-}
-
-int __weak elf_core_write_extra_data(struct coredump_params *cprm)
-{
- return 1;
-}
-
-size_t __weak elf_core_extra_data_size(void)
-{
- return 0;
-}
diff --git a/kernel/crash_dump.c b/kernel/elfcorehdr.c
index b64e238b553b..92da32275af5 100644
--- a/kernel/crash_dump.c
+++ b/kernel/elfcorehdr.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kernel.h>
#include <linux/crash_dump.h>
#include <linux/init.h>
@@ -5,12 +6,6 @@
#include <linux/export.h>
/*
- * If we have booted due to a crash, max_pfn will be a very low value. We need
- * to know the amount of memory that the previous kernel used.
- */
-unsigned long saved_max_pfn;
-
-/*
* stores the physical address of elf header of crash image
*
* Note: elfcorehdr_addr is not just limited to vmcore. It is also used by
diff --git a/kernel/entry/Makefile b/kernel/entry/Makefile
new file mode 100644
index 000000000000..2333d70802e4
--- /dev/null
+++ b/kernel/entry/Makefile
@@ -0,0 +1,17 @@
+# SPDX-License-Identifier: GPL-2.0
+
+# Prevent the noinstr section from being pestered by sanitizer and other goodies
+# as long as these things cannot be disabled per function.
+KASAN_SANITIZE := n
+UBSAN_SANITIZE := n
+KCOV_INSTRUMENT := n
+
+# Branch profiling isn't noinstr-safe
+ccflags-$(CONFIG_TRACE_BRANCH_PROFILING) += -DDISABLE_BRANCH_PROFILING
+
+CFLAGS_REMOVE_common.o = -fstack-protector -fstack-protector-strong
+CFLAGS_common.o += -fno-stack-protector
+
+obj-$(CONFIG_GENERIC_IRQ_ENTRY) += common.o
+obj-$(CONFIG_GENERIC_SYSCALL) += syscall-common.o syscall_user_dispatch.o
+obj-$(CONFIG_VIRT_XFER_TO_GUEST_WORK) += virt.o
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
new file mode 100644
index 000000000000..5c792b30c58a
--- /dev/null
+++ b/kernel/entry/common.c
@@ -0,0 +1,266 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/irq-entry-common.h>
+#include <linux/resume_user_mode.h>
+#include <linux/highmem.h>
+#include <linux/jump_label.h>
+#include <linux/kmsan.h>
+#include <linux/livepatch.h>
+#include <linux/tick.h>
+
+/* Workaround to allow gradual conversion of architecture code */
+void __weak arch_do_signal_or_restart(struct pt_regs *regs) { }
+
+#ifdef CONFIG_HAVE_GENERIC_TIF_BITS
+#define EXIT_TO_USER_MODE_WORK_LOOP (EXIT_TO_USER_MODE_WORK & ~_TIF_RSEQ)
+#else
+#define EXIT_TO_USER_MODE_WORK_LOOP (EXIT_TO_USER_MODE_WORK)
+#endif
+
+static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *regs,
+ unsigned long ti_work)
+{
+ /*
+ * Before returning to user space ensure that all pending work
+ * items have been completed.
+ */
+ while (ti_work & EXIT_TO_USER_MODE_WORK_LOOP) {
+
+ local_irq_enable_exit_to_user(ti_work);
+
+ if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY))
+ schedule();
+
+ if (ti_work & _TIF_UPROBE)
+ uprobe_notify_resume(regs);
+
+ if (ti_work & _TIF_PATCH_PENDING)
+ klp_update_patch_state(current);
+
+ if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
+ arch_do_signal_or_restart(regs);
+
+ if (ti_work & _TIF_NOTIFY_RESUME)
+ resume_user_mode_work(regs);
+
+ /* Architecture specific TIF work */
+ arch_exit_to_user_mode_work(regs, ti_work);
+
+ /*
+ * Disable interrupts and reevaluate the work flags as they
+ * might have changed while interrupts and preemption was
+ * enabled above.
+ */
+ local_irq_disable_exit_to_user();
+
+ /* Check if any of the above work has queued a deferred wakeup */
+ tick_nohz_user_enter_prepare();
+
+ ti_work = read_thread_flags();
+ }
+
+ /* Return the latest work state for arch_exit_to_user_mode() */
+ return ti_work;
+}
+
+/**
+ * exit_to_user_mode_loop - do any pending work before leaving to user space
+ * @regs: Pointer to pt_regs on entry stack
+ * @ti_work: TIF work flags as read by the caller
+ */
+__always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
+ unsigned long ti_work)
+{
+ for (;;) {
+ ti_work = __exit_to_user_mode_loop(regs, ti_work);
+
+ if (likely(!rseq_exit_to_user_mode_restart(regs, ti_work)))
+ return ti_work;
+ ti_work = read_thread_flags();
+ }
+}
+
+noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
+{
+ irqentry_state_t ret = {
+ .exit_rcu = false,
+ };
+
+ if (user_mode(regs)) {
+ irqentry_enter_from_user_mode(regs);
+ return ret;
+ }
+
+ /*
+ * If this entry hit the idle task invoke ct_irq_enter() whether
+ * RCU is watching or not.
+ *
+ * Interrupts can nest when the first interrupt invokes softirq
+ * processing on return which enables interrupts.
+ *
+ * Scheduler ticks in the idle task can mark quiescent state and
+ * terminate a grace period, if and only if the timer interrupt is
+ * not nested into another interrupt.
+ *
+ * Checking for rcu_is_watching() here would prevent the nesting
+ * interrupt to invoke ct_irq_enter(). If that nested interrupt is
+ * the tick then rcu_flavor_sched_clock_irq() would wrongfully
+ * assume that it is the first interrupt and eventually claim
+ * quiescent state and end grace periods prematurely.
+ *
+ * Unconditionally invoke ct_irq_enter() so RCU state stays
+ * consistent.
+ *
+ * TINY_RCU does not support EQS, so let the compiler eliminate
+ * this part when enabled.
+ */
+ if (!IS_ENABLED(CONFIG_TINY_RCU) &&
+ (is_idle_task(current) || arch_in_rcu_eqs())) {
+ /*
+ * If RCU is not watching then the same careful
+ * sequence vs. lockdep and tracing is required
+ * as in irqentry_enter_from_user_mode().
+ */
+ lockdep_hardirqs_off(CALLER_ADDR0);
+ ct_irq_enter();
+ instrumentation_begin();
+ kmsan_unpoison_entry_regs(regs);
+ trace_hardirqs_off_finish();
+ instrumentation_end();
+
+ ret.exit_rcu = true;
+ return ret;
+ }
+
+ /*
+ * If RCU is watching then RCU only wants to check whether it needs
+ * to restart the tick in NOHZ mode. rcu_irq_enter_check_tick()
+ * already contains a warning when RCU is not watching, so no point
+ * in having another one here.
+ */
+ lockdep_hardirqs_off(CALLER_ADDR0);
+ instrumentation_begin();
+ kmsan_unpoison_entry_regs(regs);
+ rcu_irq_enter_check_tick();
+ trace_hardirqs_off_finish();
+ instrumentation_end();
+
+ return ret;
+}
+
+/**
+ * arch_irqentry_exit_need_resched - Architecture specific need resched function
+ *
+ * Invoked from raw_irqentry_exit_cond_resched() to check if resched is needed.
+ * Defaults return true.
+ *
+ * The main purpose is to permit arch to avoid preemption of a task from an IRQ.
+ */
+static inline bool arch_irqentry_exit_need_resched(void);
+
+#ifndef arch_irqentry_exit_need_resched
+static inline bool arch_irqentry_exit_need_resched(void) { return true; }
+#endif
+
+void raw_irqentry_exit_cond_resched(void)
+{
+ if (!preempt_count()) {
+ /* Sanity check RCU and thread stack */
+ rcu_irq_exit_check_preempt();
+ if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
+ WARN_ON_ONCE(!on_thread_stack());
+ if (need_resched() && arch_irqentry_exit_need_resched())
+ preempt_schedule_irq();
+ }
+}
+#ifdef CONFIG_PREEMPT_DYNAMIC
+#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+DEFINE_STATIC_CALL(irqentry_exit_cond_resched, raw_irqentry_exit_cond_resched);
+#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+DEFINE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched);
+void dynamic_irqentry_exit_cond_resched(void)
+{
+ if (!static_branch_unlikely(&sk_dynamic_irqentry_exit_cond_resched))
+ return;
+ raw_irqentry_exit_cond_resched();
+}
+#endif
+#endif
+
+noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
+{
+ lockdep_assert_irqs_disabled();
+
+ /* Check whether this returns to user mode */
+ if (user_mode(regs)) {
+ irqentry_exit_to_user_mode(regs);
+ } else if (!regs_irqs_disabled(regs)) {
+ /*
+ * If RCU was not watching on entry this needs to be done
+ * carefully and needs the same ordering of lockdep/tracing
+ * and RCU as the return to user mode path.
+ */
+ if (state.exit_rcu) {
+ instrumentation_begin();
+ /* Tell the tracer that IRET will enable interrupts */
+ trace_hardirqs_on_prepare();
+ lockdep_hardirqs_on_prepare();
+ instrumentation_end();
+ ct_irq_exit();
+ lockdep_hardirqs_on(CALLER_ADDR0);
+ return;
+ }
+
+ instrumentation_begin();
+ if (IS_ENABLED(CONFIG_PREEMPTION))
+ irqentry_exit_cond_resched();
+
+ /* Covers both tracing and lockdep */
+ trace_hardirqs_on();
+ instrumentation_end();
+ } else {
+ /*
+ * IRQ flags state is correct already. Just tell RCU if it
+ * was not watching on entry.
+ */
+ if (state.exit_rcu)
+ ct_irq_exit();
+ }
+}
+
+irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs)
+{
+ irqentry_state_t irq_state;
+
+ irq_state.lockdep = lockdep_hardirqs_enabled();
+
+ __nmi_enter();
+ lockdep_hardirqs_off(CALLER_ADDR0);
+ lockdep_hardirq_enter();
+ ct_nmi_enter();
+
+ instrumentation_begin();
+ kmsan_unpoison_entry_regs(regs);
+ trace_hardirqs_off_finish();
+ ftrace_nmi_enter();
+ instrumentation_end();
+
+ return irq_state;
+}
+
+void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state)
+{
+ instrumentation_begin();
+ ftrace_nmi_exit();
+ if (irq_state.lockdep) {
+ trace_hardirqs_on_prepare();
+ lockdep_hardirqs_on_prepare();
+ }
+ instrumentation_end();
+
+ ct_nmi_exit();
+ lockdep_hardirq_exit();
+ if (irq_state.lockdep)
+ lockdep_hardirqs_on(CALLER_ADDR0);
+ __nmi_exit();
+}
diff --git a/kernel/entry/common.h b/kernel/entry/common.h
new file mode 100644
index 000000000000..f6e6d02f07fe
--- /dev/null
+++ b/kernel/entry/common.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _COMMON_H
+#define _COMMON_H
+
+bool syscall_user_dispatch(struct pt_regs *regs);
+
+#endif
diff --git a/kernel/entry/syscall-common.c b/kernel/entry/syscall-common.c
new file mode 100644
index 000000000000..940a597ded40
--- /dev/null
+++ b/kernel/entry/syscall-common.c
@@ -0,0 +1,104 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/audit.h>
+#include <linux/entry-common.h>
+#include "common.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/syscalls.h>
+
+static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
+{
+ if (unlikely(audit_context())) {
+ unsigned long args[6];
+
+ syscall_get_arguments(current, regs, args);
+ audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]);
+ }
+}
+
+long syscall_trace_enter(struct pt_regs *regs, long syscall,
+ unsigned long work)
+{
+ long ret = 0;
+
+ /*
+ * Handle Syscall User Dispatch. This must comes first, since
+ * the ABI here can be something that doesn't make sense for
+ * other syscall_work features.
+ */
+ if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
+ if (syscall_user_dispatch(regs))
+ return -1L;
+ }
+
+ /* Handle ptrace */
+ if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
+ ret = ptrace_report_syscall_entry(regs);
+ if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))
+ return -1L;
+ }
+
+ /* Do seccomp after ptrace, to catch any tracer changes. */
+ if (work & SYSCALL_WORK_SECCOMP) {
+ ret = __secure_computing();
+ if (ret == -1L)
+ return ret;
+ }
+
+ /* Either of the above might have changed the syscall number */
+ syscall = syscall_get_nr(current, regs);
+
+ if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT)) {
+ trace_sys_enter(regs, syscall);
+ /*
+ * Probes or BPF hooks in the tracepoint may have changed the
+ * system call number as well.
+ */
+ syscall = syscall_get_nr(current, regs);
+ }
+
+ syscall_enter_audit(regs, syscall);
+
+ return ret ? : syscall;
+}
+
+/*
+ * If SYSCALL_EMU is set, then the only reason to report is when
+ * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall
+ * instruction has been already reported in syscall_enter_from_user_mode().
+ */
+static inline bool report_single_step(unsigned long work)
+{
+ if (work & SYSCALL_WORK_SYSCALL_EMU)
+ return false;
+
+ return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP;
+}
+
+void syscall_exit_work(struct pt_regs *regs, unsigned long work)
+{
+ bool step;
+
+ /*
+ * If the syscall was rolled back due to syscall user dispatching,
+ * then the tracers below are not invoked for the same reason as
+ * the entry side was not invoked in syscall_trace_enter(): The ABI
+ * of these syscalls is unknown.
+ */
+ if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
+ if (unlikely(current->syscall_dispatch.on_dispatch)) {
+ current->syscall_dispatch.on_dispatch = false;
+ return;
+ }
+ }
+
+ audit_syscall_exit(regs);
+
+ if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)
+ trace_sys_exit(regs, syscall_get_return_value(current, regs));
+
+ step = report_single_step(work);
+ if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
+ ptrace_report_syscall_exit(regs, step);
+}
diff --git a/kernel/entry/syscall_user_dispatch.c b/kernel/entry/syscall_user_dispatch.c
new file mode 100644
index 000000000000..a9055eccb27e
--- /dev/null
+++ b/kernel/entry/syscall_user_dispatch.c
@@ -0,0 +1,174 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2020 Collabora Ltd.
+ */
+#include <linux/sched.h>
+#include <linux/prctl.h>
+#include <linux/ptrace.h>
+#include <linux/syscall_user_dispatch.h>
+#include <linux/uaccess.h>
+#include <linux/signal.h>
+#include <linux/elf.h>
+
+#include <linux/sched/signal.h>
+#include <linux/sched/task_stack.h>
+
+#include <asm/syscall.h>
+
+#include "common.h"
+
+static void trigger_sigsys(struct pt_regs *regs)
+{
+ struct kernel_siginfo info;
+
+ clear_siginfo(&info);
+ info.si_signo = SIGSYS;
+ info.si_code = SYS_USER_DISPATCH;
+ info.si_call_addr = (void __user *)KSTK_EIP(current);
+ info.si_errno = 0;
+ info.si_arch = syscall_get_arch(current);
+ info.si_syscall = syscall_get_nr(current, regs);
+
+ force_sig_info(&info);
+}
+
+bool syscall_user_dispatch(struct pt_regs *regs)
+{
+ struct syscall_user_dispatch *sd = &current->syscall_dispatch;
+ char state;
+
+ if (likely(instruction_pointer(regs) - sd->offset < sd->len))
+ return false;
+
+ if (unlikely(arch_syscall_is_vdso_sigreturn(regs)))
+ return false;
+
+ if (likely(sd->selector)) {
+ /*
+ * access_ok() is performed once, at prctl time, when
+ * the selector is loaded by userspace.
+ */
+ if (unlikely(__get_user(state, sd->selector))) {
+ force_exit_sig(SIGSEGV);
+ return true;
+ }
+
+ if (likely(state == SYSCALL_DISPATCH_FILTER_ALLOW))
+ return false;
+
+ if (state != SYSCALL_DISPATCH_FILTER_BLOCK) {
+ force_exit_sig(SIGSYS);
+ return true;
+ }
+ }
+
+ sd->on_dispatch = true;
+ syscall_rollback(current, regs);
+ trigger_sigsys(regs);
+
+ return true;
+}
+
+static int task_set_syscall_user_dispatch(struct task_struct *task, unsigned long mode,
+ unsigned long offset, unsigned long len,
+ char __user *selector)
+{
+ switch (mode) {
+ case PR_SYS_DISPATCH_OFF:
+ if (offset || len || selector)
+ return -EINVAL;
+ break;
+ case PR_SYS_DISPATCH_EXCLUSIVE_ON:
+ /*
+ * Validate the direct dispatcher region just for basic
+ * sanity against overflow and a 0-sized dispatcher
+ * region. If the user is able to submit a syscall from
+ * an address, that address is obviously valid.
+ */
+ if (offset && offset + len <= offset)
+ return -EINVAL;
+ break;
+ case PR_SYS_DISPATCH_INCLUSIVE_ON:
+ if (len == 0 || offset + len <= offset)
+ return -EINVAL;
+ /*
+ * Invert the range, the check in syscall_user_dispatch()
+ * supports wrap-around.
+ */
+ offset = offset + len;
+ len = -len;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ /*
+ * access_ok() will clear memory tags for tagged addresses
+ * if current has memory tagging enabled.
+ *
+ * To enable a tracer to set a tracees selector the
+ * selector address must be untagged for access_ok(),
+ * otherwise an untagged tracer will always fail to set a
+ * tagged tracees selector.
+ */
+ if (mode != PR_SYS_DISPATCH_OFF && selector &&
+ !access_ok(untagged_addr(selector), sizeof(*selector)))
+ return -EFAULT;
+
+ task->syscall_dispatch.selector = selector;
+ task->syscall_dispatch.offset = offset;
+ task->syscall_dispatch.len = len;
+ task->syscall_dispatch.on_dispatch = false;
+
+ if (mode != PR_SYS_DISPATCH_OFF)
+ set_task_syscall_work(task, SYSCALL_USER_DISPATCH);
+ else
+ clear_task_syscall_work(task, SYSCALL_USER_DISPATCH);
+
+ return 0;
+}
+
+int set_syscall_user_dispatch(unsigned long mode, unsigned long offset,
+ unsigned long len, char __user *selector)
+{
+ return task_set_syscall_user_dispatch(current, mode, offset, len, selector);
+}
+
+int syscall_user_dispatch_get_config(struct task_struct *task, unsigned long size,
+ void __user *data)
+{
+ struct syscall_user_dispatch *sd = &task->syscall_dispatch;
+ struct ptrace_sud_config cfg;
+
+ if (size != sizeof(cfg))
+ return -EINVAL;
+
+ if (test_task_syscall_work(task, SYSCALL_USER_DISPATCH))
+ cfg.mode = PR_SYS_DISPATCH_ON;
+ else
+ cfg.mode = PR_SYS_DISPATCH_OFF;
+
+ cfg.offset = sd->offset;
+ cfg.len = sd->len;
+ cfg.selector = (__u64)(uintptr_t)sd->selector;
+
+ if (copy_to_user(data, &cfg, sizeof(cfg)))
+ return -EFAULT;
+
+ return 0;
+}
+
+int syscall_user_dispatch_set_config(struct task_struct *task, unsigned long size,
+ void __user *data)
+{
+ struct ptrace_sud_config cfg;
+
+ if (size != sizeof(cfg))
+ return -EINVAL;
+
+ if (copy_from_user(&cfg, data, sizeof(cfg)))
+ return -EFAULT;
+
+ return task_set_syscall_user_dispatch(task, cfg.mode, cfg.offset, cfg.len,
+ (char __user *)(uintptr_t)cfg.selector);
+}
diff --git a/kernel/entry/virt.c b/kernel/entry/virt.c
new file mode 100644
index 000000000000..c52f99249763
--- /dev/null
+++ b/kernel/entry/virt.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/entry-virt.h>
+
+static int xfer_to_guest_mode_work(unsigned long ti_work)
+{
+ do {
+ int ret;
+
+ if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
+ return -EINTR;
+
+ if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY))
+ schedule();
+
+ if (ti_work & _TIF_NOTIFY_RESUME)
+ resume_user_mode_work(NULL);
+
+ ret = arch_xfer_to_guest_mode_handle_work(ti_work);
+ if (ret)
+ return ret;
+
+ ti_work = read_thread_flags();
+ } while (ti_work & XFER_TO_GUEST_MODE_WORK);
+ return 0;
+}
+
+int xfer_to_guest_mode_handle_work(void)
+{
+ unsigned long ti_work;
+
+ /*
+ * This is invoked from the outer guest loop with interrupts and
+ * preemption enabled.
+ *
+ * KVM invokes xfer_to_guest_mode_work_pending() with interrupts
+ * disabled in the inner loop before going into guest mode. No need
+ * to disable interrupts here.
+ */
+ ti_work = read_thread_flags();
+ if (!(ti_work & XFER_TO_GUEST_MODE_WORK))
+ return 0;
+
+ return xfer_to_guest_mode_work(ti_work);
+}
+EXPORT_SYMBOL_GPL(xfer_to_guest_mode_handle_work);
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
index 2925188f50ea..91a62f566743 100644
--- a/kernel/events/Makefile
+++ b/kernel/events/Makefile
@@ -1,9 +1,6 @@
-ifdef CONFIG_FUNCTION_TRACER
-CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE)
-endif
-
+# SPDX-License-Identifier: GPL-2.0
obj-y := core.o ring_buffer.o callchain.o
obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
+obj-$(CONFIG_HW_BREAKPOINT_KUNIT_TEST) += hw_breakpoint_test.o
obj-$(CONFIG_UPROBES) += uprobes.o
-
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index d659487254d5..b9c7e00725d6 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -1,35 +1,48 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Performance events callchain code, extracted from core.c:
*
* Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
* Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
- * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
- * Copyright 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
- *
- * For licensing details see kernel-base/COPYING
+ * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
+ * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
*/
#include <linux/perf_event.h>
#include <linux/slab.h>
+#include <linux/sched/task_stack.h>
+#include <linux/uprobes.h>
+
#include "internal.h"
struct callchain_cpus_entries {
struct rcu_head rcu_head;
- struct perf_callchain_entry *cpu_entries[0];
+ struct perf_callchain_entry *cpu_entries[];
};
-static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
+int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH;
+int sysctl_perf_event_max_contexts_per_stack __read_mostly = PERF_MAX_CONTEXTS_PER_STACK;
+static const int six_hundred_forty_kb = 640 * 1024;
+
+static inline size_t perf_callchain_entry__sizeof(void)
+{
+ return (sizeof(struct perf_callchain_entry) +
+ sizeof(__u64) * (sysctl_perf_event_max_stack +
+ sysctl_perf_event_max_contexts_per_stack));
+}
+
+static DEFINE_PER_CPU(u8, callchain_recursion[PERF_NR_CONTEXTS]);
static atomic_t nr_callchain_events;
static DEFINE_MUTEX(callchain_mutex);
static struct callchain_cpus_entries *callchain_cpus_entries;
-__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
+__weak void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
struct pt_regs *regs)
{
}
-__weak void perf_callchain_user(struct perf_callchain_entry *entry,
+__weak void perf_callchain_user(struct perf_callchain_entry_ctx *entry,
struct pt_regs *regs)
{
}
@@ -73,7 +86,7 @@ static int alloc_callchain_buffers(void)
if (!entries)
return -ENOMEM;
- size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
+ size = perf_callchain_entry__sizeof() * PERF_NR_CONTEXTS;
for_each_possible_cpu(cpu) {
entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
@@ -94,7 +107,7 @@ fail:
return -ENOMEM;
}
-int get_callchain_buffers(void)
+int get_callchain_buffers(int event_max_stack)
{
int err = 0;
int count;
@@ -107,14 +120,20 @@ int get_callchain_buffers(void)
goto exit;
}
- if (count > 1) {
- /* If the allocation failed, give up */
- if (!callchain_cpus_entries)
- err = -ENOMEM;
+ /*
+ * If requesting per event more than the global cap,
+ * return a different error to help userspace figure
+ * this out.
+ *
+ * And also do it here so that we have &callchain_mutex held.
+ */
+ if (event_max_stack > sysctl_perf_event_max_stack) {
+ err = -EOVERFLOW;
goto exit;
}
- err = alloc_callchain_buffers();
+ if (count == 1)
+ err = alloc_callchain_buffers();
exit:
if (err)
atomic_dec(&nr_callchain_events);
@@ -132,7 +151,7 @@ void put_callchain_buffers(void)
}
}
-static struct perf_callchain_entry *get_callchain_entry(int *rctx)
+struct perf_callchain_entry *get_callchain_entry(int *rctx)
{
int cpu;
struct callchain_cpus_entries *entries;
@@ -142,64 +161,114 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx)
return NULL;
entries = rcu_dereference(callchain_cpus_entries);
- if (!entries)
+ if (!entries) {
+ put_recursion_context(this_cpu_ptr(callchain_recursion), *rctx);
return NULL;
+ }
cpu = smp_processor_id();
- return &entries->cpu_entries[cpu][*rctx];
+ return (((void *)entries->cpu_entries[cpu]) +
+ (*rctx * perf_callchain_entry__sizeof()));
}
-static void
+void
put_callchain_entry(int rctx)
{
put_recursion_context(this_cpu_ptr(callchain_recursion), rctx);
}
+static void fixup_uretprobe_trampoline_entries(struct perf_callchain_entry *entry,
+ int start_entry_idx)
+{
+#ifdef CONFIG_UPROBES
+ struct uprobe_task *utask = current->utask;
+ struct return_instance *ri;
+ __u64 *cur_ip, *last_ip, tramp_addr;
+
+ if (likely(!utask || !utask->return_instances))
+ return;
+
+ cur_ip = &entry->ip[start_entry_idx];
+ last_ip = &entry->ip[entry->nr - 1];
+ ri = utask->return_instances;
+ tramp_addr = uprobe_get_trampoline_vaddr();
+
+ /*
+ * If there are pending uretprobes for the current thread, they are
+ * recorded in a list inside utask->return_instances; each such
+ * pending uretprobe replaces traced user function's return address on
+ * the stack, so when stack trace is captured, instead of seeing
+ * actual function's return address, we'll have one or many uretprobe
+ * trampoline addresses in the stack trace, which are not helpful and
+ * misleading to users.
+ * So here we go over the pending list of uretprobes, and each
+ * encountered trampoline address is replaced with actual return
+ * address.
+ */
+ while (ri && cur_ip <= last_ip) {
+ if (*cur_ip == tramp_addr) {
+ *cur_ip = ri->orig_ret_vaddr;
+ ri = ri->next;
+ }
+ cur_ip++;
+ }
+#endif
+}
+
struct perf_callchain_entry *
-perf_callchain(struct perf_event *event, struct pt_regs *regs)
+get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
+ u32 max_stack, bool crosstask, bool add_mark, u64 defer_cookie)
{
- int rctx;
struct perf_callchain_entry *entry;
+ struct perf_callchain_entry_ctx ctx;
+ int rctx, start_entry_idx;
- int kernel = !event->attr.exclude_callchain_kernel;
- int user = !event->attr.exclude_callchain_user;
-
- if (!kernel && !user)
+ /* crosstask is not supported for user stacks */
+ if (crosstask && user && !kernel)
return NULL;
entry = get_callchain_entry(&rctx);
- if (rctx == -1)
- return NULL;
-
if (!entry)
- goto exit_put;
+ return NULL;
- entry->nr = 0;
+ ctx.entry = entry;
+ ctx.max_stack = max_stack;
+ ctx.nr = entry->nr = 0;
+ ctx.contexts = 0;
+ ctx.contexts_maxed = false;
if (kernel && !user_mode(regs)) {
- perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
- perf_callchain_kernel(entry, regs);
+ if (add_mark)
+ perf_callchain_store_context(&ctx, PERF_CONTEXT_KERNEL);
+ perf_callchain_kernel(&ctx, regs);
}
- if (user) {
+ if (user && !crosstask) {
if (!user_mode(regs)) {
- if (current->mm)
- regs = task_pt_regs(current);
- else
- regs = NULL;
+ if (current->flags & (PF_KTHREAD | PF_USER_WORKER))
+ goto exit_put;
+ regs = task_pt_regs(current);
}
- if (regs) {
+ if (defer_cookie) {
/*
- * Disallow cross-task user callchains.
+ * Foretell the coming of PERF_RECORD_CALLCHAIN_DEFERRED
+ * which can be stitched to this one, and add
+ * the cookie after it (it will be cut off when the
+ * user stack is copied to the callchain).
*/
- if (event->ctx->task && event->ctx->task != current)
- goto exit_put;
-
- perf_callchain_store(entry, PERF_CONTEXT_USER);
- perf_callchain_user(entry, regs);
+ perf_callchain_store_context(&ctx, PERF_CONTEXT_USER_DEFERRED);
+ perf_callchain_store_context(&ctx, defer_cookie);
+ goto exit_put;
}
+
+ if (add_mark)
+ perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
+
+ start_entry_idx = entry->nr;
+ perf_callchain_user(&ctx, regs);
+ fixup_uretprobe_trampoline_entries(entry, start_entry_idx);
}
exit_put:
@@ -207,3 +276,55 @@ exit_put:
return entry;
}
+
+static int perf_event_max_stack_handler(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int *value = table->data;
+ int new_value = *value, ret;
+ struct ctl_table new_table = *table;
+
+ new_table.data = &new_value;
+ ret = proc_dointvec_minmax(&new_table, write, buffer, lenp, ppos);
+ if (ret || !write)
+ return ret;
+
+ mutex_lock(&callchain_mutex);
+ if (atomic_read(&nr_callchain_events))
+ ret = -EBUSY;
+ else
+ *value = new_value;
+
+ mutex_unlock(&callchain_mutex);
+
+ return ret;
+}
+
+static const struct ctl_table callchain_sysctl_table[] = {
+ {
+ .procname = "perf_event_max_stack",
+ .data = &sysctl_perf_event_max_stack,
+ .maxlen = sizeof(sysctl_perf_event_max_stack),
+ .mode = 0644,
+ .proc_handler = perf_event_max_stack_handler,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = (void *)&six_hundred_forty_kb,
+ },
+ {
+ .procname = "perf_event_max_contexts_per_stack",
+ .data = &sysctl_perf_event_max_contexts_per_stack,
+ .maxlen = sizeof(sysctl_perf_event_max_contexts_per_stack),
+ .mode = 0644,
+ .proc_handler = perf_event_max_stack_handler,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE_THOUSAND,
+ },
+};
+
+static int __init init_callchain_sysctls(void)
+{
+ register_sysctl_init("kernel", callchain_sysctl_table);
+ return 0;
+}
+core_initcall(init_callchain_sysctls);
+
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 0ceb386777ae..ece716879cbc 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1,12 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Performance events core code:
*
* Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
* Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
- * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
* Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
- *
- * For licensing details see kernel-base/COPYING
*/
#include <linux/fs.h>
@@ -29,6 +28,7 @@
#include <linux/export.h>
#include <linux/vmalloc.h>
#include <linux/hardirq.h>
+#include <linux/hugetlb.h>
#include <linux/rculist.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>
@@ -36,7 +36,7 @@
#include <linux/kernel_stat.h>
#include <linux/cgroup.h>
#include <linux/perf_event.h>
-#include <linux/ftrace_event.h>
+#include <linux/trace_events.h>
#include <linux/hw_breakpoint.h>
#include <linux/mm_types.h>
#include <linux/module.h>
@@ -44,16 +44,29 @@
#include <linux/compat.h>
#include <linux/bpf.h>
#include <linux/filter.h>
+#include <linux/namei.h>
+#include <linux/parser.h>
+#include <linux/sched/clock.h>
+#include <linux/sched/mm.h>
+#include <linux/proc_ns.h>
+#include <linux/mount.h>
+#include <linux/min_heap.h>
+#include <linux/highmem.h>
+#include <linux/pgtable.h>
+#include <linux/buildid.h>
+#include <linux/task_work.h>
+#include <linux/percpu-rwsem.h>
+#include <linux/unwind_deferred.h>
#include "internal.h"
#include <asm/irq_regs.h>
-static struct workqueue_struct *perf_wq;
+typedef int (*remote_function_f)(void *);
struct remote_function_call {
struct task_struct *p;
- int (*func)(void *info);
+ remote_function_f func;
void *info;
int ret;
};
@@ -64,8 +77,17 @@ static void remote_function(void *data)
struct task_struct *p = tfc->p;
if (p) {
- tfc->ret = -EAGAIN;
- if (task_cpu(p) != smp_processor_id() || !task_curr(p))
+ /* -EAGAIN */
+ if (task_cpu(p) != smp_processor_id())
+ return;
+
+ /*
+ * Now that we're on right CPU with IRQs disabled, we can test
+ * if we hit the right task without races.
+ */
+
+ tfc->ret = -ESRCH; /* No such (running) process */
+ if (p != current)
return;
}
@@ -79,30 +101,41 @@ static void remote_function(void *data)
* @info: the function call argument
*
* Calls the function @func when the task is currently running. This might
- * be on the current CPU, which just calls the function directly
+ * be on the current CPU, which just calls the function directly. This will
+ * retry due to any failures in smp_call_function_single(), such as if the
+ * task_cpu() goes offline concurrently.
*
- * returns: @func return value, or
- * -ESRCH - when the process isn't running
- * -EAGAIN - when the process moved away
+ * returns @func return value or -ESRCH or -ENXIO when the process isn't running
*/
static int
-task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
+task_function_call(struct task_struct *p, remote_function_f func, void *info)
{
struct remote_function_call data = {
.p = p,
.func = func,
.info = info,
- .ret = -ESRCH, /* No such (running) process */
+ .ret = -EAGAIN,
};
+ int ret;
- if (task_curr(p))
- smp_call_function_single(task_cpu(p), remote_function, &data, 1);
+ for (;;) {
+ ret = smp_call_function_single(task_cpu(p), remote_function,
+ &data, 1);
+ if (!ret)
+ ret = data.ret;
- return data.ret;
+ if (ret != -EAGAIN)
+ break;
+
+ cond_resched();
+ }
+
+ return ret;
}
/**
* cpu_function_call - call a function on the cpu
+ * @cpu: target cpu to queue this function
* @func: the function to be called
* @info: the function call argument
*
@@ -110,7 +143,7 @@ task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
*
* returns: @func return value or -ENXIO when the cpu is offline
*/
-static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
+static int cpu_function_call(int cpu, remote_function_f func, void *info)
{
struct remote_function_call data = {
.p = NULL,
@@ -124,11 +157,258 @@ static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
return data.ret;
}
-#define EVENT_OWNER_KERNEL ((void *) -1)
+enum event_type_t {
+ EVENT_FLEXIBLE = 0x01,
+ EVENT_PINNED = 0x02,
+ EVENT_TIME = 0x04,
+ EVENT_FROZEN = 0x08,
+ /* see ctx_resched() for details */
+ EVENT_CPU = 0x10,
+ EVENT_CGROUP = 0x20,
+
+ /* compound helpers */
+ EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
+ EVENT_TIME_FROZEN = EVENT_TIME | EVENT_FROZEN,
+};
+
+static inline void __perf_ctx_lock(struct perf_event_context *ctx)
+{
+ raw_spin_lock(&ctx->lock);
+ WARN_ON_ONCE(ctx->is_active & EVENT_FROZEN);
+}
+
+static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx)
+{
+ __perf_ctx_lock(&cpuctx->ctx);
+ if (ctx)
+ __perf_ctx_lock(ctx);
+}
+
+static inline void __perf_ctx_unlock(struct perf_event_context *ctx)
+{
+ /*
+ * If ctx_sched_in() didn't again set any ALL flags, clean up
+ * after ctx_sched_out() by clearing is_active.
+ */
+ if (ctx->is_active & EVENT_FROZEN) {
+ if (!(ctx->is_active & EVENT_ALL))
+ ctx->is_active = 0;
+ else
+ ctx->is_active &= ~EVENT_FROZEN;
+ }
+ raw_spin_unlock(&ctx->lock);
+}
+
+static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx)
+{
+ if (ctx)
+ __perf_ctx_unlock(ctx);
+ __perf_ctx_unlock(&cpuctx->ctx);
+}
+
+typedef struct {
+ struct perf_cpu_context *cpuctx;
+ struct perf_event_context *ctx;
+} class_perf_ctx_lock_t;
+
+static inline void class_perf_ctx_lock_destructor(class_perf_ctx_lock_t *_T)
+{ perf_ctx_unlock(_T->cpuctx, _T->ctx); }
+
+static inline class_perf_ctx_lock_t
+class_perf_ctx_lock_constructor(struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx)
+{ perf_ctx_lock(cpuctx, ctx); return (class_perf_ctx_lock_t){ cpuctx, ctx }; }
+
+#define TASK_TOMBSTONE ((void *)-1L)
static bool is_kernel_event(struct perf_event *event)
{
- return event->owner == EVENT_OWNER_KERNEL;
+ return READ_ONCE(event->owner) == TASK_TOMBSTONE;
+}
+
+static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
+
+struct perf_event_context *perf_cpu_task_ctx(void)
+{
+ lockdep_assert_irqs_disabled();
+ return this_cpu_ptr(&perf_cpu_context)->task_ctx;
+}
+
+/*
+ * On task ctx scheduling...
+ *
+ * When !ctx->nr_events a task context will not be scheduled. This means
+ * we can disable the scheduler hooks (for performance) without leaving
+ * pending task ctx state.
+ *
+ * This however results in two special cases:
+ *
+ * - removing the last event from a task ctx; this is relatively straight
+ * forward and is done in __perf_remove_from_context.
+ *
+ * - adding the first event to a task ctx; this is tricky because we cannot
+ * rely on ctx->is_active and therefore cannot use event_function_call().
+ * See perf_install_in_context().
+ *
+ * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set.
+ */
+
+typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *,
+ struct perf_event_context *, void *);
+
+struct event_function_struct {
+ struct perf_event *event;
+ event_f func;
+ void *data;
+};
+
+static int event_function(void *info)
+{
+ struct event_function_struct *efs = info;
+ struct perf_event *event = efs->event;
+ struct perf_event_context *ctx = event->ctx;
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct perf_event_context *task_ctx = cpuctx->task_ctx;
+ int ret = 0;
+
+ lockdep_assert_irqs_disabled();
+
+ perf_ctx_lock(cpuctx, task_ctx);
+ /*
+ * Since we do the IPI call without holding ctx->lock things can have
+ * changed, double check we hit the task we set out to hit.
+ */
+ if (ctx->task) {
+ if (ctx->task != current) {
+ ret = -ESRCH;
+ goto unlock;
+ }
+
+ /*
+ * We only use event_function_call() on established contexts,
+ * and event_function() is only ever called when active (or
+ * rather, we'll have bailed in task_function_call() or the
+ * above ctx->task != current test), therefore we must have
+ * ctx->is_active here.
+ */
+ WARN_ON_ONCE(!ctx->is_active);
+ /*
+ * And since we have ctx->is_active, cpuctx->task_ctx must
+ * match.
+ */
+ WARN_ON_ONCE(task_ctx != ctx);
+ } else {
+ WARN_ON_ONCE(&cpuctx->ctx != ctx);
+ }
+
+ efs->func(event, cpuctx, ctx, efs->data);
+unlock:
+ perf_ctx_unlock(cpuctx, task_ctx);
+
+ return ret;
+}
+
+static void event_function_call(struct perf_event *event, event_f func, void *data)
+{
+ struct perf_event_context *ctx = event->ctx;
+ struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
+ struct perf_cpu_context *cpuctx;
+ struct event_function_struct efs = {
+ .event = event,
+ .func = func,
+ .data = data,
+ };
+
+ if (!event->parent) {
+ /*
+ * If this is a !child event, we must hold ctx::mutex to
+ * stabilize the event->ctx relation. See
+ * perf_event_ctx_lock().
+ */
+ lockdep_assert_held(&ctx->mutex);
+ }
+
+ if (!task) {
+ cpu_function_call(event->cpu, event_function, &efs);
+ return;
+ }
+
+ if (task == TASK_TOMBSTONE)
+ return;
+
+again:
+ if (!task_function_call(task, event_function, &efs))
+ return;
+
+ local_irq_disable();
+ cpuctx = this_cpu_ptr(&perf_cpu_context);
+ perf_ctx_lock(cpuctx, ctx);
+ /*
+ * Reload the task pointer, it might have been changed by
+ * a concurrent perf_event_context_sched_out().
+ */
+ task = ctx->task;
+ if (task == TASK_TOMBSTONE)
+ goto unlock;
+ if (ctx->is_active) {
+ perf_ctx_unlock(cpuctx, ctx);
+ local_irq_enable();
+ goto again;
+ }
+ func(event, NULL, ctx, data);
+unlock:
+ perf_ctx_unlock(cpuctx, ctx);
+ local_irq_enable();
+}
+
+/*
+ * Similar to event_function_call() + event_function(), but hard assumes IRQs
+ * are already disabled and we're on the right CPU.
+ */
+static void event_function_local(struct perf_event *event, event_f func, void *data)
+{
+ struct perf_event_context *ctx = event->ctx;
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct task_struct *task = READ_ONCE(ctx->task);
+ struct perf_event_context *task_ctx = NULL;
+
+ lockdep_assert_irqs_disabled();
+
+ if (task) {
+ if (task == TASK_TOMBSTONE)
+ return;
+
+ task_ctx = ctx;
+ }
+
+ perf_ctx_lock(cpuctx, task_ctx);
+
+ task = ctx->task;
+ if (task == TASK_TOMBSTONE)
+ goto unlock;
+
+ if (task) {
+ /*
+ * We must be either inactive or active and the right task,
+ * otherwise we're screwed, since we cannot IPI to somewhere
+ * else.
+ */
+ if (ctx->is_active) {
+ if (WARN_ON_ONCE(task != current))
+ goto unlock;
+
+ if (WARN_ON_ONCE(cpuctx->task_ctx != ctx))
+ goto unlock;
+ }
+ } else {
+ WARN_ON_ONCE(&cpuctx->ctx != ctx);
+ }
+
+ func(event, cpuctx, ctx, data);
+unlock:
+ perf_ctx_unlock(cpuctx, task_ctx);
}
#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
@@ -143,28 +423,40 @@ static bool is_kernel_event(struct perf_event *event)
(PERF_SAMPLE_BRANCH_KERNEL |\
PERF_SAMPLE_BRANCH_HV)
-enum event_type_t {
- EVENT_FLEXIBLE = 0x1,
- EVENT_PINNED = 0x2,
- EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
-};
-
/*
* perf_sched_events : >0 events exist
- * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
*/
-struct static_key_deferred perf_sched_events __read_mostly;
-static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
-static DEFINE_PER_CPU(int, perf_sched_cb_usages);
+
+static void perf_sched_delayed(struct work_struct *work);
+DEFINE_STATIC_KEY_FALSE(perf_sched_events);
+static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
+static DEFINE_MUTEX(perf_sched_mutex);
+static atomic_t perf_sched_count;
+
+static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
+static atomic_t nr_namespaces_events __read_mostly;
static atomic_t nr_task_events __read_mostly;
static atomic_t nr_freq_events __read_mostly;
+static atomic_t nr_switch_events __read_mostly;
+static atomic_t nr_ksymbol_events __read_mostly;
+static atomic_t nr_bpf_events __read_mostly;
+static atomic_t nr_cgroup_events __read_mostly;
+static atomic_t nr_text_poke_events __read_mostly;
+static atomic_t nr_build_id_events __read_mostly;
static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
static struct srcu_struct pmus_srcu;
+static cpumask_var_t perf_online_mask;
+static cpumask_var_t perf_online_core_mask;
+static cpumask_var_t perf_online_die_mask;
+static cpumask_var_t perf_online_cluster_mask;
+static cpumask_var_t perf_online_pkg_mask;
+static cpumask_var_t perf_online_sys_mask;
+static struct kmem_cache *perf_event_cache;
/*
* perf event paranoia level:
@@ -173,10 +465,10 @@ static struct srcu_struct pmus_srcu;
* 1 - disallow cpu events for unpriv
* 2 - disallow kernel profiling for unpriv
*/
-int sysctl_perf_event_paranoid __read_mostly = 1;
+int sysctl_perf_event_paranoid __read_mostly = 2;
-/* Minimum for 512 kiB + 1 user control page */
-int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
+/* Minimum for 512 kiB + 1 user control page. 'free' kiB per user. */
+static int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024);
/*
* max perf event sample rate
@@ -186,6 +478,7 @@ int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free'
#define DEFAULT_CPU_TIME_MAX_PERCENT 25
int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
+static int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
@@ -193,23 +486,32 @@ static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
static int perf_sample_allowed_ns __read_mostly =
DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
-void update_perf_cpu_limits(void)
+static void update_perf_cpu_limits(void)
{
u64 tmp = perf_sample_period_ns;
tmp *= sysctl_perf_cpu_time_max_percent;
- do_div(tmp, 100);
- ACCESS_ONCE(perf_sample_allowed_ns) = tmp;
+ tmp = div_u64(tmp, 100);
+ if (!tmp)
+ tmp = 1;
+
+ WRITE_ONCE(perf_sample_allowed_ns, tmp);
}
-static int perf_rotate_context(struct perf_cpu_context *cpuctx);
+static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc);
-int perf_proc_update_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+static int perf_event_max_sample_rate_handler(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
- int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ int ret;
+ int perf_cpu = sysctl_perf_cpu_time_max_percent;
+ /*
+ * If throttling is disabled don't allow the write:
+ */
+ if (write && (perf_cpu == 100 || perf_cpu == 0))
+ return -EINVAL;
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret || !write)
return ret;
@@ -220,21 +522,71 @@ int perf_proc_update_handler(struct ctl_table *table, int write,
return 0;
}
-int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
-
-int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+static int perf_cpu_time_max_percent_handler(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
- int ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret || !write)
return ret;
- update_perf_cpu_limits();
+ if (sysctl_perf_cpu_time_max_percent == 100 ||
+ sysctl_perf_cpu_time_max_percent == 0) {
+ printk(KERN_WARNING
+ "perf: Dynamic interrupt throttling disabled, can hang your system!\n");
+ WRITE_ONCE(perf_sample_allowed_ns, 0);
+ } else {
+ update_perf_cpu_limits();
+ }
+
+ return 0;
+}
+
+static const struct ctl_table events_core_sysctl_table[] = {
+ /*
+ * User-space relies on this file as a feature check for
+ * perf_events being enabled. It's an ABI, do not remove!
+ */
+ {
+ .procname = "perf_event_paranoid",
+ .data = &sysctl_perf_event_paranoid,
+ .maxlen = sizeof(sysctl_perf_event_paranoid),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "perf_event_mlock_kb",
+ .data = &sysctl_perf_event_mlock,
+ .maxlen = sizeof(sysctl_perf_event_mlock),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "perf_event_max_sample_rate",
+ .data = &sysctl_perf_event_sample_rate,
+ .maxlen = sizeof(sysctl_perf_event_sample_rate),
+ .mode = 0644,
+ .proc_handler = perf_event_max_sample_rate_handler,
+ .extra1 = SYSCTL_ONE,
+ },
+ {
+ .procname = "perf_cpu_time_max_percent",
+ .data = &sysctl_perf_cpu_time_max_percent,
+ .maxlen = sizeof(sysctl_perf_cpu_time_max_percent),
+ .mode = 0644,
+ .proc_handler = perf_cpu_time_max_percent_handler,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE_HUNDRED,
+ },
+};
+static int __init init_events_core_sysctls(void)
+{
+ register_sysctl_init("kernel", events_core_sysctl_table);
return 0;
}
+core_initcall(init_events_core_sysctls);
+
/*
* perf samples are done in some very critical code paths (NMIs).
@@ -245,85 +597,79 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
#define NR_ACCUMULATED_SAMPLES 128
static DEFINE_PER_CPU(u64, running_sample_length);
+static u64 __report_avg;
+static u64 __report_allowed;
+
static void perf_duration_warn(struct irq_work *w)
{
- u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
- u64 avg_local_sample_len;
- u64 local_samples_len;
-
- local_samples_len = __this_cpu_read(running_sample_length);
- avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
-
- printk_ratelimited(KERN_WARNING
- "perf interrupt took too long (%lld > %lld), lowering "
- "kernel.perf_event_max_sample_rate to %d\n",
- avg_local_sample_len, allowed_ns >> 1,
- sysctl_perf_event_sample_rate);
+ printk_ratelimited(KERN_INFO
+ "perf: interrupt took too long (%lld > %lld), lowering "
+ "kernel.perf_event_max_sample_rate to %d\n",
+ __report_avg, __report_allowed,
+ sysctl_perf_event_sample_rate);
}
static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
void perf_sample_event_took(u64 sample_len_ns)
{
- u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
- u64 avg_local_sample_len;
- u64 local_samples_len;
+ u64 max_len = READ_ONCE(perf_sample_allowed_ns);
+ u64 running_len;
+ u64 avg_len;
+ u32 max;
- if (allowed_ns == 0)
+ if (max_len == 0)
return;
- /* decay the counter by 1 average sample */
- local_samples_len = __this_cpu_read(running_sample_length);
- local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES;
- local_samples_len += sample_len_ns;
- __this_cpu_write(running_sample_length, local_samples_len);
+ /* Decay the counter by 1 average sample. */
+ running_len = __this_cpu_read(running_sample_length);
+ running_len -= running_len/NR_ACCUMULATED_SAMPLES;
+ running_len += sample_len_ns;
+ __this_cpu_write(running_sample_length, running_len);
/*
- * note: this will be biased artifically low until we have
- * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
+ * Note: this will be biased artificially low until we have
+ * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
* from having to maintain a count.
*/
- avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
-
- if (avg_local_sample_len <= allowed_ns)
+ avg_len = running_len/NR_ACCUMULATED_SAMPLES;
+ if (avg_len <= max_len)
return;
- if (max_samples_per_tick <= 1)
- return;
+ __report_avg = avg_len;
+ __report_allowed = max_len;
- max_samples_per_tick = DIV_ROUND_UP(max_samples_per_tick, 2);
- sysctl_perf_event_sample_rate = max_samples_per_tick * HZ;
- perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
+ /*
+ * Compute a throttle threshold 25% below the current duration.
+ */
+ avg_len += avg_len / 4;
+ max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent;
+ if (avg_len < max)
+ max /= (u32)avg_len;
+ else
+ max = 1;
- update_perf_cpu_limits();
+ WRITE_ONCE(perf_sample_allowed_ns, avg_len);
+ WRITE_ONCE(max_samples_per_tick, max);
+
+ sysctl_perf_event_sample_rate = max * HZ;
+ perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
if (!irq_work_queue(&perf_duration_work)) {
- early_printk("perf interrupt took too long (%lld > %lld), lowering "
+ early_printk("perf: interrupt took too long (%lld > %lld), lowering "
"kernel.perf_event_max_sample_rate to %d\n",
- avg_local_sample_len, allowed_ns >> 1,
+ __report_avg, __report_allowed,
sysctl_perf_event_sample_rate);
}
}
static atomic64_t perf_event_id;
-static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
- enum event_type_t event_type);
-
-static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
- enum event_type_t event_type,
- struct task_struct *task);
-
static void update_context_time(struct perf_event_context *ctx);
static u64 perf_event_time(struct perf_event *event);
void __weak perf_event_print_debug(void) { }
-extern __weak const char *perf_pmu_name(void)
-{
- return "pmu";
-}
-
static inline u64 perf_clock(void)
{
return local_clock();
@@ -334,35 +680,138 @@ static inline u64 perf_event_clock(struct perf_event *event)
return event->clock();
}
-static inline struct perf_cpu_context *
-__get_cpu_context(struct perf_event_context *ctx)
+/*
+ * State based event timekeeping...
+ *
+ * The basic idea is to use event->state to determine which (if any) time
+ * fields to increment with the current delta. This means we only need to
+ * update timestamps when we change state or when they are explicitly requested
+ * (read).
+ *
+ * Event groups make things a little more complicated, but not terribly so. The
+ * rules for a group are that if the group leader is OFF the entire group is
+ * OFF, irrespective of what the group member states are. This results in
+ * __perf_effective_state().
+ *
+ * A further ramification is that when a group leader flips between OFF and
+ * !OFF, we need to update all group member times.
+ *
+ *
+ * NOTE: perf_event_time() is based on the (cgroup) context time, and thus we
+ * need to make sure the relevant context time is updated before we try and
+ * update our timestamps.
+ */
+
+static __always_inline enum perf_event_state
+__perf_effective_state(struct perf_event *event)
{
- return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
+ struct perf_event *leader = event->group_leader;
+
+ if (leader->state <= PERF_EVENT_STATE_OFF)
+ return leader->state;
+
+ return event->state;
}
-static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
+static __always_inline void
+__perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *running)
{
- raw_spin_lock(&cpuctx->ctx.lock);
- if (ctx)
- raw_spin_lock(&ctx->lock);
+ enum perf_event_state state = __perf_effective_state(event);
+ u64 delta = now - event->tstamp;
+
+ *enabled = event->total_time_enabled;
+ if (state >= PERF_EVENT_STATE_INACTIVE)
+ *enabled += delta;
+
+ *running = event->total_time_running;
+ if (state >= PERF_EVENT_STATE_ACTIVE)
+ *running += delta;
}
-static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
+static void perf_event_update_time(struct perf_event *event)
{
- if (ctx)
- raw_spin_unlock(&ctx->lock);
- raw_spin_unlock(&cpuctx->ctx.lock);
+ u64 now = perf_event_time(event);
+
+ __perf_update_times(event, now, &event->total_time_enabled,
+ &event->total_time_running);
+ event->tstamp = now;
+}
+
+static void perf_event_update_sibling_time(struct perf_event *leader)
+{
+ struct perf_event *sibling;
+
+ for_each_sibling_event(sibling, leader)
+ perf_event_update_time(sibling);
+}
+
+static void
+perf_event_set_state(struct perf_event *event, enum perf_event_state state)
+{
+ if (event->state == state)
+ return;
+
+ perf_event_update_time(event);
+ /*
+ * If a group leader gets enabled/disabled all its siblings
+ * are affected too.
+ */
+ if ((event->state < 0) ^ (state < 0))
+ perf_event_update_sibling_time(event);
+
+ WRITE_ONCE(event->state, state);
}
+/*
+ * UP store-release, load-acquire
+ */
+
+#define __store_release(ptr, val) \
+do { \
+ barrier(); \
+ WRITE_ONCE(*(ptr), (val)); \
+} while (0)
+
+#define __load_acquire(ptr) \
+({ \
+ __unqual_scalar_typeof(*(ptr)) ___p = READ_ONCE(*(ptr)); \
+ barrier(); \
+ ___p; \
+})
+
+#define for_each_epc(_epc, _ctx, _pmu, _cgroup) \
+ list_for_each_entry(_epc, &((_ctx)->pmu_ctx_list), pmu_ctx_entry) \
+ if (_cgroup && !_epc->nr_cgroups) \
+ continue; \
+ else if (_pmu && _epc->pmu != _pmu) \
+ continue; \
+ else
+
+static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup)
+{
+ struct perf_event_pmu_context *pmu_ctx;
+
+ for_each_epc(pmu_ctx, ctx, NULL, cgroup)
+ perf_pmu_disable(pmu_ctx->pmu);
+}
+
+static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup)
+{
+ struct perf_event_pmu_context *pmu_ctx;
+
+ for_each_epc(pmu_ctx, ctx, NULL, cgroup)
+ perf_pmu_enable(pmu_ctx->pmu);
+}
+
+static void ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type);
+static void ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type);
+
#ifdef CONFIG_CGROUP_PERF
static inline bool
perf_cgroup_match(struct perf_event *event)
{
- struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
/* @event doesn't care about cgroup */
if (!event->cgrp)
@@ -401,29 +850,51 @@ static inline u64 perf_cgroup_event_time(struct perf_event *event)
return t->time;
}
-static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
+static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
{
- struct perf_cgroup_info *info;
- u64 now;
-
- now = perf_clock();
+ struct perf_cgroup_info *t;
- info = this_cpu_ptr(cgrp->info);
+ t = per_cpu_ptr(event->cgrp->info, event->cpu);
+ if (!__load_acquire(&t->active))
+ return t->time;
+ now += READ_ONCE(t->timeoffset);
+ return now;
+}
- info->time += now - info->timestamp;
+static inline void __update_cgrp_time(struct perf_cgroup_info *info, u64 now, bool adv)
+{
+ if (adv)
+ info->time += now - info->timestamp;
info->timestamp = now;
+ /*
+ * see update_context_time()
+ */
+ WRITE_ONCE(info->timeoffset, info->time - info->timestamp);
}
-static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
+static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final)
{
- struct perf_cgroup *cgrp_out = cpuctx->cgrp;
- if (cgrp_out)
- __update_cgrp_time(cgrp_out);
+ struct perf_cgroup *cgrp = cpuctx->cgrp;
+ struct cgroup_subsys_state *css;
+ struct perf_cgroup_info *info;
+
+ if (cgrp) {
+ u64 now = perf_clock();
+
+ for (css = &cgrp->css; css; css = css->parent) {
+ cgrp = container_of(css, struct perf_cgroup, css);
+ info = this_cpu_ptr(cgrp->info);
+
+ __update_cgrp_time(info, now, true);
+ if (final)
+ __store_release(&info->active, 0);
+ }
+ }
}
static inline void update_cgrp_time_from_event(struct perf_event *event)
{
- struct perf_cgroup *cgrp;
+ struct perf_cgroup_info *info;
/*
* ensure we access cgroup data only when needed and
@@ -432,155 +903,126 @@ static inline void update_cgrp_time_from_event(struct perf_event *event)
if (!is_cgroup_event(event))
return;
- cgrp = perf_cgroup_from_task(current);
+ info = this_cpu_ptr(event->cgrp->info);
/*
* Do not update time when cgroup is not active
*/
- if (cgrp == event->cgrp)
- __update_cgrp_time(event->cgrp);
+ if (info->active)
+ __update_cgrp_time(info, perf_clock(), true);
}
static inline void
-perf_cgroup_set_timestamp(struct task_struct *task,
- struct perf_event_context *ctx)
+perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
{
- struct perf_cgroup *cgrp;
+ struct perf_event_context *ctx = &cpuctx->ctx;
+ struct perf_cgroup *cgrp = cpuctx->cgrp;
struct perf_cgroup_info *info;
+ struct cgroup_subsys_state *css;
/*
* ctx->lock held by caller
* ensure we do not access cgroup data
* unless we have the cgroup pinned (css_get)
*/
- if (!task || !ctx->nr_cgroups)
+ if (!cgrp)
return;
- cgrp = perf_cgroup_from_task(task);
- info = this_cpu_ptr(cgrp->info);
- info->timestamp = ctx->timestamp;
-}
+ WARN_ON_ONCE(!ctx->nr_cgroups);
-#define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */
-#define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */
+ for (css = &cgrp->css; css; css = css->parent) {
+ cgrp = container_of(css, struct perf_cgroup, css);
+ info = this_cpu_ptr(cgrp->info);
+ __update_cgrp_time(info, ctx->timestamp, false);
+ __store_release(&info->active, 1);
+ }
+}
/*
* reschedule events based on the cgroup constraint of task.
- *
- * mode SWOUT : schedule out everything
- * mode SWIN : schedule in based on cgroup for next
*/
-void perf_cgroup_switch(struct task_struct *task, int mode)
+static void perf_cgroup_switch(struct task_struct *task)
{
- struct perf_cpu_context *cpuctx;
- struct pmu *pmu;
- unsigned long flags;
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct perf_cgroup *cgrp;
/*
- * disable interrupts to avoid geting nr_cgroup
- * changes via __perf_event_disable(). Also
- * avoids preemption.
+ * cpuctx->cgrp is set when the first cgroup event enabled,
+ * and is cleared when the last cgroup event disabled.
*/
- local_irq_save(flags);
+ if (READ_ONCE(cpuctx->cgrp) == NULL)
+ return;
+ cgrp = perf_cgroup_from_task(task, NULL);
+ if (READ_ONCE(cpuctx->cgrp) == cgrp)
+ return;
+
+ guard(perf_ctx_lock)(cpuctx, cpuctx->task_ctx);
/*
- * we reschedule only in the presence of cgroup
- * constrained events.
+ * Re-check, could've raced vs perf_remove_from_context().
*/
- rcu_read_lock();
-
- list_for_each_entry_rcu(pmu, &pmus, entry) {
- cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
- if (cpuctx->unique_pmu != pmu)
- continue; /* ensure we process each cpuctx once */
-
- /*
- * perf_cgroup_events says at least one
- * context on this CPU has cgroup events.
- *
- * ctx->nr_cgroups reports the number of cgroup
- * events for a context.
- */
- if (cpuctx->ctx.nr_cgroups > 0) {
- perf_ctx_lock(cpuctx, cpuctx->task_ctx);
- perf_pmu_disable(cpuctx->ctx.pmu);
-
- if (mode & PERF_CGROUP_SWOUT) {
- cpu_ctx_sched_out(cpuctx, EVENT_ALL);
- /*
- * must not be done before ctxswout due
- * to event_filter_match() in event_sched_out()
- */
- cpuctx->cgrp = NULL;
- }
-
- if (mode & PERF_CGROUP_SWIN) {
- WARN_ON_ONCE(cpuctx->cgrp);
- /*
- * set cgrp before ctxsw in to allow
- * event_filter_match() to not have to pass
- * task around
- */
- cpuctx->cgrp = perf_cgroup_from_task(task);
- cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
- }
- perf_pmu_enable(cpuctx->ctx.pmu);
- perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
- }
- }
-
- rcu_read_unlock();
+ if (READ_ONCE(cpuctx->cgrp) == NULL)
+ return;
- local_irq_restore(flags);
-}
+ WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
-static inline void perf_cgroup_sched_out(struct task_struct *task,
- struct task_struct *next)
-{
- struct perf_cgroup *cgrp1;
- struct perf_cgroup *cgrp2 = NULL;
+ perf_ctx_disable(&cpuctx->ctx, true);
+ ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
/*
- * we come here when we know perf_cgroup_events > 0
+ * must not be done before ctxswout due
+ * to update_cgrp_time_from_cpuctx() in
+ * ctx_sched_out()
*/
- cgrp1 = perf_cgroup_from_task(task);
-
+ cpuctx->cgrp = cgrp;
/*
- * next is NULL when called from perf_event_enable_on_exec()
- * that will systematically cause a cgroup_switch()
+ * set cgrp before ctxsw in to allow
+ * perf_cgroup_set_timestamp() in ctx_sched_in()
+ * to not have to pass task around
*/
- if (next)
- cgrp2 = perf_cgroup_from_task(next);
+ ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
- /*
- * only schedule out current cgroup events if we know
- * that we are switching to a different cgroup. Otherwise,
- * do no touch the cgroup events.
- */
- if (cgrp1 != cgrp2)
- perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
+ perf_ctx_enable(&cpuctx->ctx, true);
}
-static inline void perf_cgroup_sched_in(struct task_struct *prev,
- struct task_struct *task)
+static int perf_cgroup_ensure_storage(struct perf_event *event,
+ struct cgroup_subsys_state *css)
{
- struct perf_cgroup *cgrp1;
- struct perf_cgroup *cgrp2 = NULL;
+ struct perf_cpu_context *cpuctx;
+ struct perf_event **storage;
+ int cpu, heap_size, ret = 0;
/*
- * we come here when we know perf_cgroup_events > 0
+ * Allow storage to have sufficient space for an iterator for each
+ * possibly nested cgroup plus an iterator for events with no cgroup.
*/
- cgrp1 = perf_cgroup_from_task(task);
+ for (heap_size = 1; css; css = css->parent)
+ heap_size++;
- /* prev can never be NULL */
- cgrp2 = perf_cgroup_from_task(prev);
+ for_each_possible_cpu(cpu) {
+ cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
+ if (heap_size <= cpuctx->heap_size)
+ continue;
- /*
- * only need to schedule in cgroup events if we are changing
- * cgroup during ctxsw. Cgroup events were not scheduled
- * out of ctxsw out if that was not the case.
- */
- if (cgrp1 != cgrp2)
- perf_cgroup_switch(task, PERF_CGROUP_SWIN);
+ storage = kmalloc_node(heap_size * sizeof(struct perf_event *),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (!storage) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ raw_spin_lock_irq(&cpuctx->ctx.lock);
+ if (cpuctx->heap_size < heap_size) {
+ swap(cpuctx->heap, storage);
+ if (storage == cpuctx->heap_default)
+ storage = NULL;
+ cpuctx->heap_size = heap_size;
+ }
+ raw_spin_unlock_irq(&cpuctx->ctx.lock);
+
+ kfree(storage);
+ }
+
+ return ret;
}
static inline int perf_cgroup_connect(int fd, struct perf_event *event,
@@ -589,18 +1031,20 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
{
struct perf_cgroup *cgrp;
struct cgroup_subsys_state *css;
- struct fd f = fdget(fd);
+ CLASS(fd, f)(fd);
int ret = 0;
- if (!f.file)
+ if (fd_empty(f))
return -EBADF;
- css = css_tryget_online_from_dir(f.file->f_path.dentry,
+ css = css_tryget_online_from_dir(fd_file(f)->f_path.dentry,
&perf_event_cgrp_subsys);
- if (IS_ERR(css)) {
- ret = PTR_ERR(css);
- goto out;
- }
+ if (IS_ERR(css))
+ return PTR_ERR(css);
+
+ ret = perf_cgroup_ensure_storage(event, css);
+ if (ret)
+ return ret;
cgrp = container_of(css, struct perf_cgroup, css);
event->cgrp = cgrp;
@@ -614,52 +1058,53 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
perf_detach_cgroup(event);
ret = -EINVAL;
}
-out:
- fdput(f);
return ret;
}
static inline void
-perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
+perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
{
- struct perf_cgroup_info *t;
- t = per_cpu_ptr(event->cgrp->info, event->cpu);
- event->shadow_ctx_time = now - t->timestamp;
-}
+ struct perf_cpu_context *cpuctx;
+
+ if (!is_cgroup_event(event))
+ return;
+
+ event->pmu_ctx->nr_cgroups++;
-static inline void
-perf_cgroup_defer_enabled(struct perf_event *event)
-{
/*
- * when the current task's perf cgroup does not match
- * the event's, we need to remember to call the
- * perf_mark_enable() function the first time a task with
- * a matching perf cgroup is scheduled in.
+ * Because cgroup events are always per-cpu events,
+ * @ctx == &cpuctx->ctx.
*/
- if (is_cgroup_event(event) && !perf_cgroup_match(event))
- event->cgrp_defer_enabled = 1;
+ cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
+
+ if (ctx->nr_cgroups++)
+ return;
+
+ cpuctx->cgrp = perf_cgroup_from_task(current, ctx);
}
static inline void
-perf_cgroup_mark_enabled(struct perf_event *event,
- struct perf_event_context *ctx)
+perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
{
- struct perf_event *sub;
- u64 tstamp = perf_event_time(event);
+ struct perf_cpu_context *cpuctx;
- if (!event->cgrp_defer_enabled)
+ if (!is_cgroup_event(event))
return;
- event->cgrp_defer_enabled = 0;
+ event->pmu_ctx->nr_cgroups--;
- event->tstamp_enabled = tstamp - event->total_time_enabled;
- list_for_each_entry(sub, &event->sibling_list, group_entry) {
- if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
- sub->tstamp_enabled = tstamp - sub->total_time_enabled;
- sub->cgrp_defer_enabled = 0;
- }
- }
+ /*
+ * Because cgroup events are always per-cpu events,
+ * @ctx == &cpuctx->ctx.
+ */
+ cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
+
+ if (--ctx->nr_cgroups)
+ return;
+
+ cpuctx->cgrp = NULL;
}
+
#else /* !CONFIG_CGROUP_PERF */
static inline bool
@@ -676,26 +1121,12 @@ static inline int is_cgroup_event(struct perf_event *event)
return 0;
}
-static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
-{
- return 0;
-}
-
static inline void update_cgrp_time_from_event(struct perf_event *event)
{
}
-static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
-{
-}
-
-static inline void perf_cgroup_sched_out(struct task_struct *task,
- struct task_struct *next)
-{
-}
-
-static inline void perf_cgroup_sched_in(struct task_struct *prev,
- struct task_struct *task)
+static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx,
+ bool final)
{
}
@@ -707,34 +1138,31 @@ static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
}
static inline void
-perf_cgroup_set_timestamp(struct task_struct *task,
- struct perf_event_context *ctx)
+perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
{
}
-void
-perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
+static inline u64 perf_cgroup_event_time(struct perf_event *event)
{
+ return 0;
}
-static inline void
-perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
+static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
{
+ return 0;
}
-static inline u64 perf_cgroup_event_time(struct perf_event *event)
+static inline void
+perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
{
- return 0;
}
static inline void
-perf_cgroup_defer_enabled(struct perf_event *event)
+perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
{
}
-static inline void
-perf_cgroup_mark_enabled(struct perf_event *event,
- struct perf_event_context *ctx)
+static void perf_cgroup_switch(struct task_struct *task)
{
}
#endif
@@ -745,145 +1173,104 @@ perf_cgroup_mark_enabled(struct perf_event *event,
*/
#define PERF_CPU_HRTIMER (1000 / HZ)
/*
- * function must be called with interrupts disbled
+ * function must be called with interrupts disabled
*/
-static enum hrtimer_restart perf_cpu_hrtimer_handler(struct hrtimer *hr)
+static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
{
- struct perf_cpu_context *cpuctx;
- enum hrtimer_restart ret = HRTIMER_NORESTART;
- int rotations = 0;
-
- WARN_ON(!irqs_disabled());
+ struct perf_cpu_pmu_context *cpc;
+ bool rotations;
- cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
+ lockdep_assert_irqs_disabled();
- rotations = perf_rotate_context(cpuctx);
+ cpc = container_of(hr, struct perf_cpu_pmu_context, hrtimer);
+ rotations = perf_rotate_context(cpc);
- /*
- * arm timer if needed
- */
- if (rotations) {
- hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
- ret = HRTIMER_RESTART;
- }
-
- return ret;
-}
-
-/* CPU is going down */
-void perf_cpu_hrtimer_cancel(int cpu)
-{
- struct perf_cpu_context *cpuctx;
- struct pmu *pmu;
- unsigned long flags;
-
- if (WARN_ON(cpu != smp_processor_id()))
- return;
-
- local_irq_save(flags);
-
- rcu_read_lock();
-
- list_for_each_entry_rcu(pmu, &pmus, entry) {
- cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
-
- if (pmu->task_ctx_nr == perf_sw_context)
- continue;
-
- hrtimer_cancel(&cpuctx->hrtimer);
- }
-
- rcu_read_unlock();
+ raw_spin_lock(&cpc->hrtimer_lock);
+ if (rotations)
+ hrtimer_forward_now(hr, cpc->hrtimer_interval);
+ else
+ cpc->hrtimer_active = 0;
+ raw_spin_unlock(&cpc->hrtimer_lock);
- local_irq_restore(flags);
+ return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
}
-static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
+static void __perf_mux_hrtimer_init(struct perf_cpu_pmu_context *cpc, int cpu)
{
- struct hrtimer *hr = &cpuctx->hrtimer;
- struct pmu *pmu = cpuctx->ctx.pmu;
- int timer;
-
- /* no multiplexing needed for SW PMU */
- if (pmu->task_ctx_nr == perf_sw_context)
- return;
+ struct hrtimer *timer = &cpc->hrtimer;
+ struct pmu *pmu = cpc->epc.pmu;
+ u64 interval;
/*
* check default is sane, if not set then force to
* default interval (1/tick)
*/
- timer = pmu->hrtimer_interval_ms;
- if (timer < 1)
- timer = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
+ interval = pmu->hrtimer_interval_ms;
+ if (interval < 1)
+ interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
- cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
+ cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
- hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
- hr->function = perf_cpu_hrtimer_handler;
+ raw_spin_lock_init(&cpc->hrtimer_lock);
+ hrtimer_setup(timer, perf_mux_hrtimer_handler, CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS_PINNED_HARD);
}
-static void perf_cpu_hrtimer_restart(struct perf_cpu_context *cpuctx)
+static int perf_mux_hrtimer_restart(struct perf_cpu_pmu_context *cpc)
{
- struct hrtimer *hr = &cpuctx->hrtimer;
- struct pmu *pmu = cpuctx->ctx.pmu;
+ struct hrtimer *timer = &cpc->hrtimer;
+ unsigned long flags;
- /* not for SW PMU */
- if (pmu->task_ctx_nr == perf_sw_context)
- return;
+ raw_spin_lock_irqsave(&cpc->hrtimer_lock, flags);
+ if (!cpc->hrtimer_active) {
+ cpc->hrtimer_active = 1;
+ hrtimer_forward_now(timer, cpc->hrtimer_interval);
+ hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
+ }
+ raw_spin_unlock_irqrestore(&cpc->hrtimer_lock, flags);
- if (hrtimer_active(hr))
- return;
+ return 0;
+}
- if (!hrtimer_callback_running(hr))
- __hrtimer_start_range_ns(hr, cpuctx->hrtimer_interval,
- 0, HRTIMER_MODE_REL_PINNED, 0);
+static int perf_mux_hrtimer_restart_ipi(void *arg)
+{
+ return perf_mux_hrtimer_restart(arg);
+}
+
+static __always_inline struct perf_cpu_pmu_context *this_cpc(struct pmu *pmu)
+{
+ return *this_cpu_ptr(pmu->cpu_pmu_context);
}
void perf_pmu_disable(struct pmu *pmu)
{
- int *count = this_cpu_ptr(pmu->pmu_disable_count);
+ int *count = &this_cpc(pmu)->pmu_disable_count;
if (!(*count)++)
pmu->pmu_disable(pmu);
}
void perf_pmu_enable(struct pmu *pmu)
{
- int *count = this_cpu_ptr(pmu->pmu_disable_count);
+ int *count = &this_cpc(pmu)->pmu_disable_count;
if (!--(*count))
pmu->pmu_enable(pmu);
}
-static DEFINE_PER_CPU(struct list_head, active_ctx_list);
-
-/*
- * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
- * perf_event_task_tick() are fully serialized because they're strictly cpu
- * affine and perf_event_ctx{activate,deactivate} are called with IRQs
- * disabled, while perf_event_task_tick is called from IRQ context.
- */
-static void perf_event_ctx_activate(struct perf_event_context *ctx)
+static void perf_assert_pmu_disabled(struct pmu *pmu)
{
- struct list_head *head = this_cpu_ptr(&active_ctx_list);
-
- WARN_ON(!irqs_disabled());
-
- WARN_ON(!list_empty(&ctx->active_ctx_list));
-
- list_add(&ctx->active_ctx_list, head);
+ int *count = &this_cpc(pmu)->pmu_disable_count;
+ WARN_ON_ONCE(*count == 0);
}
-static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
+static inline void perf_pmu_read(struct perf_event *event)
{
- WARN_ON(!irqs_disabled());
-
- WARN_ON(list_empty(&ctx->active_ctx_list));
-
- list_del_init(&ctx->active_ctx_list);
+ if (event->state == PERF_EVENT_STATE_ACTIVE)
+ event->pmu->read(event);
}
static void get_ctx(struct perf_event_context *ctx)
{
- WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
+ refcount_inc(&ctx->refcount);
}
static void free_ctx(struct rcu_head *head)
@@ -891,18 +1278,21 @@ static void free_ctx(struct rcu_head *head)
struct perf_event_context *ctx;
ctx = container_of(head, struct perf_event_context, rcu_head);
- kfree(ctx->task_ctx_data);
kfree(ctx);
}
static void put_ctx(struct perf_event_context *ctx)
{
- if (atomic_dec_and_test(&ctx->refcount)) {
+ if (refcount_dec_and_test(&ctx->refcount)) {
if (ctx->parent_ctx)
put_ctx(ctx->parent_ctx);
- if (ctx->task)
+ if (ctx->task && ctx->task != TASK_TOMBSTONE)
put_task_struct(ctx->task);
call_rcu(&ctx->rcu_head, free_ctx);
+ } else {
+ smp_mb__after_atomic(); /* pairs with wait_var_event() */
+ if (ctx->task == TASK_TOMBSTONE)
+ wake_up_var(&ctx->refcount);
}
}
@@ -917,9 +1307,8 @@ static void put_ctx(struct perf_event_context *ctx)
* perf_event_context::mutex nests and those are:
*
* - perf_event_exit_task_context() [ child , 0 ]
- * __perf_event_exit_task()
- * sync_child_event()
- * put_event() [ parent, 1 ]
+ * perf_event_exit_event()
+ * put_event() [ parent, 1 ]
*
* - perf_event_init_context() [ parent, 0 ]
* inherit_task_group()
@@ -934,7 +1323,7 @@ static void put_ctx(struct perf_event_context *ctx)
* life-time rules separate them. That is an exiting task cannot fork, and a
* spawning task cannot (yet) exit.
*
- * But remember that that these are parent<->child context relations, and
+ * But remember that these are parent<->child context relations, and
* migration does not affect children, therefore these two orderings should not
* interact.
*
@@ -960,12 +1349,19 @@ static void put_ctx(struct perf_event_context *ctx)
* function.
*
* Lock order:
+ * exec_update_lock
* task_struct::perf_event_mutex
* perf_event_context::mutex
- * perf_event_context::lock
* perf_event::child_mutex;
- * perf_event::mmap_mutex
- * mmap_sem
+ * perf_event_context::lock
+ * mmap_lock
+ * perf_event::mmap_mutex
+ * perf_buffer::aux_mutex
+ * perf_addr_filters_head::lock
+ *
+ * cpu_hotplug_lock
+ * pmus_lock
+ * cpuctx->mutex / perf_event_context::mutex
*/
static struct perf_event_context *
perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
@@ -974,8 +1370,8 @@ perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
again:
rcu_read_lock();
- ctx = ACCESS_ONCE(event->ctx);
- if (!atomic_inc_not_zero(&ctx->refcount)) {
+ ctx = READ_ONCE(event->ctx);
+ if (!refcount_inc_not_zero(&ctx->refcount)) {
rcu_read_unlock();
goto again;
}
@@ -1023,26 +1419,31 @@ unclone_ctx(struct perf_event_context *ctx)
return parent_ctx;
}
-static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
+static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p,
+ enum pid_type type)
{
+ u32 nr;
/*
* only top level events have the pid namespace they were created in
*/
if (event->parent)
event = event->parent;
- return task_tgid_nr_ns(p, event->ns);
+ nr = __task_pid_nr_ns(p, type, event->ns);
+ /* avoid -1 if it is idle thread or runs in another ns */
+ if (!nr && !pid_alive(p))
+ nr = -1;
+ return nr;
}
-static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
+static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
{
- /*
- * only top level events have the pid namespace they were created in
- */
- if (event->parent)
- event = event->parent;
+ return perf_event_pid_type(event, p, PIDTYPE_TGID);
+}
- return task_pid_nr_ns(p, event->ns);
+static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
+{
+ return perf_event_pid_type(event, p, PIDTYPE_PID);
}
/*
@@ -1061,11 +1462,12 @@ static u64 primary_event_id(struct perf_event *event)
/*
* Get the perf_event_context for a task and lock it.
- * This has to cope with with the fact that until it is locked,
+ *
+ * This has to cope with the fact that until it is locked,
* the context could get moved to another task.
*/
static struct perf_event_context *
-perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
+perf_lock_task_context(struct task_struct *task, unsigned long *flags)
{
struct perf_event_context *ctx;
@@ -1073,15 +1475,15 @@ retry:
/*
* One of the few rules of preemptible RCU is that one cannot do
* rcu_read_unlock() while holding a scheduler (or nested) lock when
- * part of the read side critical section was preemptible -- see
+ * part of the read side critical section was irqs-enabled -- see
* rcu_read_unlock_special().
*
* Since ctx->lock nests under rq->lock we must ensure the entire read
- * side critical section is non-preemptible.
+ * side critical section has interrupts disabled.
*/
- preempt_disable();
+ local_irq_save(*flags);
rcu_read_lock();
- ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
+ ctx = rcu_dereference(task->perf_event_ctxp);
if (ctx) {
/*
* If this context is a clone of another, it might
@@ -1093,21 +1495,25 @@ retry:
* if so. If we locked the right context, then it
* can't get swapped on us any more.
*/
- raw_spin_lock_irqsave(&ctx->lock, *flags);
- if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
- raw_spin_unlock_irqrestore(&ctx->lock, *flags);
+ raw_spin_lock(&ctx->lock);
+ if (ctx != rcu_dereference(task->perf_event_ctxp)) {
+ raw_spin_unlock(&ctx->lock);
rcu_read_unlock();
- preempt_enable();
+ local_irq_restore(*flags);
goto retry;
}
- if (!atomic_inc_not_zero(&ctx->refcount)) {
- raw_spin_unlock_irqrestore(&ctx->lock, *flags);
+ if (ctx->task == TASK_TOMBSTONE ||
+ !refcount_inc_not_zero(&ctx->refcount)) {
+ raw_spin_unlock(&ctx->lock);
ctx = NULL;
+ } else {
+ WARN_ON_ONCE(ctx->task != task);
}
}
rcu_read_unlock();
- preempt_enable();
+ if (!ctx)
+ local_irq_restore(*flags);
return ctx;
}
@@ -1117,12 +1523,12 @@ retry:
* reference count so that the context can't get freed.
*/
static struct perf_event_context *
-perf_pin_task_context(struct task_struct *task, int ctxn)
+perf_pin_task_context(struct task_struct *task)
{
struct perf_event_context *ctx;
unsigned long flags;
- ctx = perf_lock_task_context(task, ctxn, &flags);
+ ctx = perf_lock_task_context(task, &flags);
if (ctx) {
++ctx->pin_count;
raw_spin_unlock_irqrestore(&ctx->lock, flags);
@@ -1142,78 +1548,99 @@ static void perf_unpin_context(struct perf_event_context *ctx)
/*
* Update the record of the current time in a context.
*/
-static void update_context_time(struct perf_event_context *ctx)
+static void __update_context_time(struct perf_event_context *ctx, bool adv)
{
u64 now = perf_clock();
- ctx->time += now - ctx->timestamp;
+ lockdep_assert_held(&ctx->lock);
+
+ if (adv)
+ ctx->time += now - ctx->timestamp;
ctx->timestamp = now;
+
+ /*
+ * The above: time' = time + (now - timestamp), can be re-arranged
+ * into: time` = now + (time - timestamp), which gives a single value
+ * offset to compute future time without locks on.
+ *
+ * See perf_event_time_now(), which can be used from NMI context where
+ * it's (obviously) not possible to acquire ctx->lock in order to read
+ * both the above values in a consistent manner.
+ */
+ WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp);
+}
+
+static void update_context_time(struct perf_event_context *ctx)
+{
+ __update_context_time(ctx, true);
}
static u64 perf_event_time(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
+ if (unlikely(!ctx))
+ return 0;
+
if (is_cgroup_event(event))
return perf_cgroup_event_time(event);
- return ctx ? ctx->time : 0;
+ return ctx->time;
}
-/*
- * Update the total_time_enabled and total_time_running fields for a event.
- * The caller of this function needs to hold the ctx->lock.
- */
-static void update_event_times(struct perf_event *event)
+static u64 perf_event_time_now(struct perf_event *event, u64 now)
{
struct perf_event_context *ctx = event->ctx;
- u64 run_end;
- if (event->state < PERF_EVENT_STATE_INACTIVE ||
- event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
- return;
- /*
- * in cgroup mode, time_enabled represents
- * the time the event was enabled AND active
- * tasks were in the monitored cgroup. This is
- * independent of the activity of the context as
- * there may be a mix of cgroup and non-cgroup events.
- *
- * That is why we treat cgroup events differently
- * here.
- */
+ if (unlikely(!ctx))
+ return 0;
+
if (is_cgroup_event(event))
- run_end = perf_cgroup_event_time(event);
- else if (ctx->is_active)
- run_end = ctx->time;
- else
- run_end = event->tstamp_stopped;
+ return perf_cgroup_event_time_now(event, now);
- event->total_time_enabled = run_end - event->tstamp_enabled;
+ if (!(__load_acquire(&ctx->is_active) & EVENT_TIME))
+ return ctx->time;
- if (event->state == PERF_EVENT_STATE_INACTIVE)
- run_end = event->tstamp_stopped;
- else
- run_end = perf_event_time(event);
+ now += READ_ONCE(ctx->timeoffset);
+ return now;
+}
+
+static enum event_type_t get_event_type(struct perf_event *event)
+{
+ struct perf_event_context *ctx = event->ctx;
+ enum event_type_t event_type;
+
+ lockdep_assert_held(&ctx->lock);
- event->total_time_running = run_end - event->tstamp_running;
+ /*
+ * It's 'group type', really, because if our group leader is
+ * pinned, so are we.
+ */
+ if (event->group_leader != event)
+ event = event->group_leader;
+
+ event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE;
+ if (!ctx->task)
+ event_type |= EVENT_CPU;
+ return event_type;
}
/*
- * Update total_time_enabled and total_time_running for all events in a group.
+ * Helper function to initialize event group nodes.
*/
-static void update_group_times(struct perf_event *leader)
+static void init_event_group(struct perf_event *event)
{
- struct perf_event *event;
-
- update_event_times(leader);
- list_for_each_entry(event, &leader->sibling_list, group_entry)
- update_event_times(event);
+ RB_CLEAR_NODE(&event->group_node);
+ event->group_index = 0;
}
-static struct list_head *
-ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
+/*
+ * Extract pinned or flexible groups from the context
+ * based on event attrs bits.
+ */
+static struct perf_event_groups *
+get_event_groups(struct perf_event *event, struct perf_event_context *ctx)
{
if (event->attr.pinned)
return &ctx->pinned_groups;
@@ -1222,39 +1649,270 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
}
/*
- * Add a event from the lists for its context.
+ * Helper function to initializes perf_event_group trees.
+ */
+static void perf_event_groups_init(struct perf_event_groups *groups)
+{
+ groups->tree = RB_ROOT;
+ groups->index = 0;
+}
+
+static inline struct cgroup *event_cgroup(const struct perf_event *event)
+{
+ struct cgroup *cgroup = NULL;
+
+#ifdef CONFIG_CGROUP_PERF
+ if (event->cgrp)
+ cgroup = event->cgrp->css.cgroup;
+#endif
+
+ return cgroup;
+}
+
+/*
+ * Compare function for event groups;
+ *
+ * Implements complex key that first sorts by CPU and then by virtual index
+ * which provides ordering when rotating groups for the same CPU.
+ */
+static __always_inline int
+perf_event_groups_cmp(const int left_cpu, const struct pmu *left_pmu,
+ const struct cgroup *left_cgroup, const u64 left_group_index,
+ const struct perf_event *right)
+{
+ if (left_cpu < right->cpu)
+ return -1;
+ if (left_cpu > right->cpu)
+ return 1;
+
+ if (left_pmu) {
+ if (left_pmu < right->pmu_ctx->pmu)
+ return -1;
+ if (left_pmu > right->pmu_ctx->pmu)
+ return 1;
+ }
+
+#ifdef CONFIG_CGROUP_PERF
+ {
+ const struct cgroup *right_cgroup = event_cgroup(right);
+
+ if (left_cgroup != right_cgroup) {
+ if (!left_cgroup) {
+ /*
+ * Left has no cgroup but right does, no
+ * cgroups come first.
+ */
+ return -1;
+ }
+ if (!right_cgroup) {
+ /*
+ * Right has no cgroup but left does, no
+ * cgroups come first.
+ */
+ return 1;
+ }
+ /* Two dissimilar cgroups, order by id. */
+ if (cgroup_id(left_cgroup) < cgroup_id(right_cgroup))
+ return -1;
+
+ return 1;
+ }
+ }
+#endif
+
+ if (left_group_index < right->group_index)
+ return -1;
+ if (left_group_index > right->group_index)
+ return 1;
+
+ return 0;
+}
+
+#define __node_2_pe(node) \
+ rb_entry((node), struct perf_event, group_node)
+
+static inline bool __group_less(struct rb_node *a, const struct rb_node *b)
+{
+ struct perf_event *e = __node_2_pe(a);
+ return perf_event_groups_cmp(e->cpu, e->pmu_ctx->pmu, event_cgroup(e),
+ e->group_index, __node_2_pe(b)) < 0;
+}
+
+struct __group_key {
+ int cpu;
+ struct pmu *pmu;
+ struct cgroup *cgroup;
+};
+
+static inline int __group_cmp(const void *key, const struct rb_node *node)
+{
+ const struct __group_key *a = key;
+ const struct perf_event *b = __node_2_pe(node);
+
+ /* partial/subtree match: @cpu, @pmu, @cgroup; ignore: @group_index */
+ return perf_event_groups_cmp(a->cpu, a->pmu, a->cgroup, b->group_index, b);
+}
+
+static inline int
+__group_cmp_ignore_cgroup(const void *key, const struct rb_node *node)
+{
+ const struct __group_key *a = key;
+ const struct perf_event *b = __node_2_pe(node);
+
+ /* partial/subtree match: @cpu, @pmu, ignore: @cgroup, @group_index */
+ return perf_event_groups_cmp(a->cpu, a->pmu, event_cgroup(b),
+ b->group_index, b);
+}
+
+/*
+ * Insert @event into @groups' tree; using
+ * {@event->cpu, @event->pmu_ctx->pmu, event_cgroup(@event), ++@groups->index}
+ * as key. This places it last inside the {cpu,pmu,cgroup} subtree.
+ */
+static void
+perf_event_groups_insert(struct perf_event_groups *groups,
+ struct perf_event *event)
+{
+ event->group_index = ++groups->index;
+
+ rb_add(&event->group_node, &groups->tree, __group_less);
+}
+
+/*
+ * Helper function to insert event into the pinned or flexible groups.
+ */
+static void
+add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
+{
+ struct perf_event_groups *groups;
+
+ groups = get_event_groups(event, ctx);
+ perf_event_groups_insert(groups, event);
+}
+
+/*
+ * Delete a group from a tree.
+ */
+static void
+perf_event_groups_delete(struct perf_event_groups *groups,
+ struct perf_event *event)
+{
+ WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) ||
+ RB_EMPTY_ROOT(&groups->tree));
+
+ rb_erase(&event->group_node, &groups->tree);
+ init_event_group(event);
+}
+
+/*
+ * Helper function to delete event from its groups.
+ */
+static void
+del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
+{
+ struct perf_event_groups *groups;
+
+ groups = get_event_groups(event, ctx);
+ perf_event_groups_delete(groups, event);
+}
+
+/*
+ * Get the leftmost event in the {cpu,pmu,cgroup} subtree.
+ */
+static struct perf_event *
+perf_event_groups_first(struct perf_event_groups *groups, int cpu,
+ struct pmu *pmu, struct cgroup *cgrp)
+{
+ struct __group_key key = {
+ .cpu = cpu,
+ .pmu = pmu,
+ .cgroup = cgrp,
+ };
+ struct rb_node *node;
+
+ node = rb_find_first(&key, &groups->tree, __group_cmp);
+ if (node)
+ return __node_2_pe(node);
+
+ return NULL;
+}
+
+static struct perf_event *
+perf_event_groups_next(struct perf_event *event, struct pmu *pmu)
+{
+ struct __group_key key = {
+ .cpu = event->cpu,
+ .pmu = pmu,
+ .cgroup = event_cgroup(event),
+ };
+ struct rb_node *next;
+
+ next = rb_next_match(&key, &event->group_node, __group_cmp);
+ if (next)
+ return __node_2_pe(next);
+
+ return NULL;
+}
+
+#define perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) \
+ for (event = perf_event_groups_first(groups, cpu, pmu, NULL); \
+ event; event = perf_event_groups_next(event, pmu))
+
+/*
+ * Iterate through the whole groups tree.
+ */
+#define perf_event_groups_for_each(event, groups) \
+ for (event = rb_entry_safe(rb_first(&((groups)->tree)), \
+ typeof(*event), group_node); event; \
+ event = rb_entry_safe(rb_next(&event->group_node), \
+ typeof(*event), group_node))
+
+/*
+ * Does the event attribute request inherit with PERF_SAMPLE_READ
+ */
+static inline bool has_inherit_and_sample_read(struct perf_event_attr *attr)
+{
+ return attr->inherit && (attr->sample_type & PERF_SAMPLE_READ);
+}
+
+/*
+ * Add an event from the lists for its context.
* Must be called with ctx->mutex and ctx->lock held.
*/
static void
list_add_event(struct perf_event *event, struct perf_event_context *ctx)
{
+ lockdep_assert_held(&ctx->lock);
+
WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
event->attach_state |= PERF_ATTACH_CONTEXT;
+ event->tstamp = perf_event_time(event);
+
/*
* If we're a stand alone event or group leader, we go to the context
* list, group events are kept attached to the group so that
* perf_group_detach can, at all times, locate all siblings.
*/
if (event->group_leader == event) {
- struct list_head *list;
-
- if (is_software_event(event))
- event->group_flags |= PERF_GROUP_SOFTWARE;
-
- list = ctx_group_list(event, ctx);
- list_add_tail(&event->group_entry, list);
+ event->group_caps = event->event_caps;
+ add_event_to_groups(event, ctx);
}
- if (is_cgroup_event(event))
- ctx->nr_cgroups++;
-
list_add_rcu(&event->event_entry, &ctx->event_list);
ctx->nr_events++;
+ if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)
+ ctx->nr_user++;
if (event->attr.inherit_stat)
ctx->nr_stat++;
+ if (has_inherit_and_sample_read(&event->attr))
+ local_inc(&ctx->nr_no_switch_fast);
+
+ if (event->state > PERF_EVENT_STATE_OFF)
+ perf_cgroup_event_enable(event, ctx);
ctx->generation++;
+ event->pmu_ctx->nr_events++;
}
/*
@@ -1266,42 +1924,41 @@ static inline void perf_event__state_init(struct perf_event *event)
PERF_EVENT_STATE_INACTIVE;
}
-/*
- * Called at perf_event creation and when events are attached/detached from a
- * group.
- */
-static void perf_event__read_size(struct perf_event *event)
+static int __perf_event_read_size(u64 read_format, int nr_siblings)
{
int entry = sizeof(u64); /* value */
int size = 0;
int nr = 1;
- if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+ if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
size += sizeof(u64);
- if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+ if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
size += sizeof(u64);
- if (event->attr.read_format & PERF_FORMAT_ID)
+ if (read_format & PERF_FORMAT_ID)
+ entry += sizeof(u64);
+
+ if (read_format & PERF_FORMAT_LOST)
entry += sizeof(u64);
- if (event->attr.read_format & PERF_FORMAT_GROUP) {
- nr += event->group_leader->nr_siblings;
+ if (read_format & PERF_FORMAT_GROUP) {
+ nr += nr_siblings;
size += sizeof(u64);
}
- size += entry * nr;
- event->read_size = size;
+ /*
+ * Since perf_event_validate_size() limits this to 16k and inhibits
+ * adding more siblings, this will never overflow.
+ */
+ return size + nr * entry;
}
-static void perf_event__header_size(struct perf_event *event)
+static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
{
struct perf_sample_data *data;
- u64 sample_type = event->attr.sample_type;
u16 size = 0;
- perf_event__read_size(event);
-
if (sample_type & PERF_SAMPLE_IP)
size += sizeof(data->ip);
@@ -1311,8 +1968,8 @@ static void perf_event__header_size(struct perf_event *event)
if (sample_type & PERF_SAMPLE_PERIOD)
size += sizeof(data->period);
- if (sample_type & PERF_SAMPLE_WEIGHT)
- size += sizeof(data->weight);
+ if (sample_type & PERF_SAMPLE_WEIGHT_TYPE)
+ size += sizeof(data->weight.full);
if (sample_type & PERF_SAMPLE_READ)
size += event->read_size;
@@ -1323,9 +1980,33 @@ static void perf_event__header_size(struct perf_event *event)
if (sample_type & PERF_SAMPLE_TRANSACTION)
size += sizeof(data->txn);
+ if (sample_type & PERF_SAMPLE_PHYS_ADDR)
+ size += sizeof(data->phys_addr);
+
+ if (sample_type & PERF_SAMPLE_CGROUP)
+ size += sizeof(data->cgroup);
+
+ if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
+ size += sizeof(data->data_page_size);
+
+ if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
+ size += sizeof(data->code_page_size);
+
event->header_size = size;
}
+/*
+ * Called at perf_event creation and when events are attached/detached from a
+ * group.
+ */
+static void perf_event__header_size(struct perf_event *event)
+{
+ event->read_size =
+ __perf_event_read_size(event->attr.read_format,
+ event->group_leader->nr_siblings);
+ __perf_event_header_size(event, event->attr.sample_type);
+}
+
static void perf_event__id_header_size(struct perf_event *event)
{
struct perf_sample_data *data;
@@ -1353,12 +2034,57 @@ static void perf_event__id_header_size(struct perf_event *event)
event->id_header_size = size;
}
+/*
+ * Check that adding an event to the group does not result in anybody
+ * overflowing the 64k event limit imposed by the output buffer.
+ *
+ * Specifically, check that the read_size for the event does not exceed 16k,
+ * read_size being the one term that grows with groups size. Since read_size
+ * depends on per-event read_format, also (re)check the existing events.
+ *
+ * This leaves 48k for the constant size fields and things like callchains,
+ * branch stacks and register sets.
+ */
+static bool perf_event_validate_size(struct perf_event *event)
+{
+ struct perf_event *sibling, *group_leader = event->group_leader;
+
+ if (__perf_event_read_size(event->attr.read_format,
+ group_leader->nr_siblings + 1) > 16*1024)
+ return false;
+
+ if (__perf_event_read_size(group_leader->attr.read_format,
+ group_leader->nr_siblings + 1) > 16*1024)
+ return false;
+
+ /*
+ * When creating a new group leader, group_leader->ctx is initialized
+ * after the size has been validated, but we cannot safely use
+ * for_each_sibling_event() until group_leader->ctx is set. A new group
+ * leader cannot have any siblings yet, so we can safely skip checking
+ * the non-existent siblings.
+ */
+ if (event == group_leader)
+ return true;
+
+ for_each_sibling_event(sibling, group_leader) {
+ if (__perf_event_read_size(sibling->attr.read_format,
+ group_leader->nr_siblings + 1) > 16*1024)
+ return false;
+ }
+
+ return true;
+}
+
static void perf_group_attach(struct perf_event *event)
{
struct perf_event *group_leader = event->group_leader, *pos;
+ lockdep_assert_held(&event->ctx->lock);
+
/*
- * We can have double attach due to group movement in perf_event_open.
+ * We can have double attach due to group movement (move_group) in
+ * perf_event_open().
*/
if (event->attach_state & PERF_ATTACH_GROUP)
return;
@@ -1370,28 +2096,25 @@ static void perf_group_attach(struct perf_event *event)
WARN_ON_ONCE(group_leader->ctx != event->ctx);
- if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
- !is_software_event(event))
- group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
+ group_leader->group_caps &= event->event_caps;
- list_add_tail(&event->group_entry, &group_leader->sibling_list);
+ list_add_tail(&event->sibling_list, &group_leader->sibling_list);
group_leader->nr_siblings++;
+ group_leader->group_generation++;
perf_event__header_size(group_leader);
- list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
+ for_each_sibling_event(pos, group_leader)
perf_event__header_size(pos);
}
/*
- * Remove a event from the lists for its context.
+ * Remove an event from the lists for its context.
* Must be called with ctx->mutex and ctx->lock held.
*/
static void
list_del_event(struct perf_event *event, struct perf_event_context *ctx)
{
- struct perf_cpu_context *cpuctx;
-
WARN_ON_ONCE(event->ctx != ctx);
lockdep_assert_held(&ctx->lock);
@@ -1403,46 +2126,136 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
event->attach_state &= ~PERF_ATTACH_CONTEXT;
- if (is_cgroup_event(event)) {
- ctx->nr_cgroups--;
- cpuctx = __get_cpu_context(ctx);
- /*
- * if there are no more cgroup events
- * then cler cgrp to avoid stale pointer
- * in update_cgrp_time_from_cpuctx()
- */
- if (!ctx->nr_cgroups)
- cpuctx->cgrp = NULL;
- }
-
ctx->nr_events--;
+ if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)
+ ctx->nr_user--;
if (event->attr.inherit_stat)
ctx->nr_stat--;
+ if (has_inherit_and_sample_read(&event->attr))
+ local_dec(&ctx->nr_no_switch_fast);
list_del_rcu(&event->event_entry);
if (event->group_leader == event)
- list_del_init(&event->group_entry);
+ del_event_from_groups(event, ctx);
+
+ ctx->generation++;
+ event->pmu_ctx->nr_events--;
+}
+
+static int
+perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event)
+{
+ if (!has_aux(aux_event))
+ return 0;
+
+ if (!event->pmu->aux_output_match)
+ return 0;
- update_group_times(event);
+ return event->pmu->aux_output_match(aux_event);
+}
+
+static void put_event(struct perf_event *event);
+static void __event_disable(struct perf_event *event,
+ struct perf_event_context *ctx,
+ enum perf_event_state state);
+
+static void perf_put_aux_event(struct perf_event *event)
+{
+ struct perf_event_context *ctx = event->ctx;
+ struct perf_event *iter;
/*
- * If event was in error state, then keep it
- * that way, otherwise bogus counts will be
- * returned on read(). The only way to get out
- * of error state is by explicit re-enabling
- * of the event
+ * If event uses aux_event tear down the link
*/
- if (event->state > PERF_EVENT_STATE_OFF)
- event->state = PERF_EVENT_STATE_OFF;
+ if (event->aux_event) {
+ iter = event->aux_event;
+ event->aux_event = NULL;
+ put_event(iter);
+ return;
+ }
- ctx->generation++;
+ /*
+ * If the event is an aux_event, tear down all links to
+ * it from other events.
+ */
+ for_each_sibling_event(iter, event) {
+ if (iter->aux_event != event)
+ continue;
+
+ iter->aux_event = NULL;
+ put_event(event);
+
+ /*
+ * If it's ACTIVE, schedule it out and put it into ERROR
+ * state so that we don't try to schedule it again. Note
+ * that perf_event_enable() will clear the ERROR status.
+ */
+ __event_disable(iter, ctx, PERF_EVENT_STATE_ERROR);
+ }
+}
+
+static bool perf_need_aux_event(struct perf_event *event)
+{
+ return event->attr.aux_output || has_aux_action(event);
+}
+
+static int perf_get_aux_event(struct perf_event *event,
+ struct perf_event *group_leader)
+{
+ /*
+ * Our group leader must be an aux event if we want to be
+ * an aux_output. This way, the aux event will precede its
+ * aux_output events in the group, and therefore will always
+ * schedule first.
+ */
+ if (!group_leader)
+ return 0;
+
+ /*
+ * aux_output and aux_sample_size are mutually exclusive.
+ */
+ if (event->attr.aux_output && event->attr.aux_sample_size)
+ return 0;
+
+ if (event->attr.aux_output &&
+ !perf_aux_output_match(event, group_leader))
+ return 0;
+
+ if ((event->attr.aux_pause || event->attr.aux_resume) &&
+ !(group_leader->pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE))
+ return 0;
+
+ if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux)
+ return 0;
+
+ if (!atomic_long_inc_not_zero(&group_leader->refcount))
+ return 0;
+
+ /*
+ * Link aux_outputs to their aux event; this is undone in
+ * perf_group_detach() by perf_put_aux_event(). When the
+ * group in torn down, the aux_output events loose their
+ * link to the aux_event and can't schedule any more.
+ */
+ event->aux_event = group_leader;
+
+ return 1;
+}
+
+static inline struct list_head *get_event_list(struct perf_event *event)
+{
+ return event->attr.pinned ? &event->pmu_ctx->pinned_active :
+ &event->pmu_ctx->flexible_active;
}
static void perf_group_detach(struct perf_event *event)
{
+ struct perf_event *leader = event->group_leader;
struct perf_event *sibling, *tmp;
- struct list_head *list = NULL;
+ struct perf_event_context *ctx = event->ctx;
+
+ lockdep_assert_held(&ctx->lock);
/*
* We can have double detach due to exit/hot-unplug + close.
@@ -1452,164 +2265,209 @@ static void perf_group_detach(struct perf_event *event)
event->attach_state &= ~PERF_ATTACH_GROUP;
+ perf_put_aux_event(event);
+
/*
* If this is a sibling, remove it from its group.
*/
- if (event->group_leader != event) {
- list_del_init(&event->group_entry);
+ if (leader != event) {
+ list_del_init(&event->sibling_list);
event->group_leader->nr_siblings--;
+ event->group_leader->group_generation++;
goto out;
}
- if (!list_empty(&event->group_entry))
- list = &event->group_entry;
-
/*
* If this was a group event with sibling events then
* upgrade the siblings to singleton events by adding them
* to whatever list we are on.
*/
- list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
- if (list)
- list_move_tail(&sibling->group_entry, list);
+ list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {
+
+ /*
+ * Events that have PERF_EV_CAP_SIBLING require being part of
+ * a group and cannot exist on their own, schedule them out
+ * and move them into the ERROR state. Also see
+ * _perf_event_enable(), it will not be able to recover this
+ * ERROR state.
+ */
+ if (sibling->event_caps & PERF_EV_CAP_SIBLING)
+ __event_disable(sibling, ctx, PERF_EVENT_STATE_ERROR);
+
sibling->group_leader = sibling;
+ list_del_init(&sibling->sibling_list);
/* Inherit group flags from the previous leader */
- sibling->group_flags = event->group_flags;
+ sibling->group_caps = event->group_caps;
+
+ if (sibling->attach_state & PERF_ATTACH_CONTEXT) {
+ add_event_to_groups(sibling, event->ctx);
+
+ if (sibling->state == PERF_EVENT_STATE_ACTIVE)
+ list_add_tail(&sibling->active_list, get_event_list(sibling));
+ }
WARN_ON_ONCE(sibling->ctx != event->ctx);
}
out:
- perf_event__header_size(event->group_leader);
-
- list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
+ for_each_sibling_event(tmp, leader)
perf_event__header_size(tmp);
-}
-/*
- * User event without the task.
- */
-static bool is_orphaned_event(struct perf_event *event)
-{
- return event && !is_kernel_event(event) && !event->owner;
+ perf_event__header_size(leader);
}
-/*
- * Event has a parent but parent's task finished and it's
- * alive only because of children holding refference.
- */
-static bool is_orphaned_child(struct perf_event *event)
+static void sync_child_event(struct perf_event *child_event);
+
+static void perf_child_detach(struct perf_event *event)
{
- return is_orphaned_event(event->parent);
-}
+ struct perf_event *parent_event = event->parent;
-static void orphans_remove_work(struct work_struct *work);
+ if (!(event->attach_state & PERF_ATTACH_CHILD))
+ return;
-static void schedule_orphans_remove(struct perf_event_context *ctx)
-{
- if (!ctx->task || ctx->orphans_remove_sched || !perf_wq)
+ event->attach_state &= ~PERF_ATTACH_CHILD;
+
+ if (WARN_ON_ONCE(!parent_event))
return;
- if (queue_delayed_work(perf_wq, &ctx->orphans_remove, 1)) {
- get_ctx(ctx);
- ctx->orphans_remove_sched = true;
- }
+ /*
+ * Can't check this from an IPI, the holder is likey another CPU.
+ *
+ lockdep_assert_held(&parent_event->child_mutex);
+ */
+
+ sync_child_event(event);
+ list_del_init(&event->child_list);
}
-static int __init perf_workqueue_init(void)
+static bool is_orphaned_event(struct perf_event *event)
{
- perf_wq = create_singlethread_workqueue("perf");
- WARN(!perf_wq, "failed to create perf workqueue\n");
- return perf_wq ? 0 : -1;
+ return event->state == PERF_EVENT_STATE_DEAD;
}
-core_initcall(perf_workqueue_init);
-
static inline int
event_filter_match(struct perf_event *event)
{
- return (event->cpu == -1 || event->cpu == smp_processor_id())
- && perf_cgroup_match(event);
+ return (event->cpu == -1 || event->cpu == smp_processor_id()) &&
+ perf_cgroup_match(event);
+}
+
+static inline bool is_event_in_freq_mode(struct perf_event *event)
+{
+ return event->attr.freq && event->attr.sample_freq;
}
static void
-event_sched_out(struct perf_event *event,
- struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
+event_sched_out(struct perf_event *event, struct perf_event_context *ctx)
{
- u64 tstamp = perf_event_time(event);
- u64 delta;
+ struct perf_event_pmu_context *epc = event->pmu_ctx;
+ struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu);
+ enum perf_event_state state = PERF_EVENT_STATE_INACTIVE;
+
+ // XXX cpc serialization, probably per-cpu IRQ disabled
WARN_ON_ONCE(event->ctx != ctx);
lockdep_assert_held(&ctx->lock);
- /*
- * An event which could not be activated because of
- * filter mismatch still needs to have its timings
- * maintained, otherwise bogus information is return
- * via read() for time_enabled, time_running:
- */
- if (event->state == PERF_EVENT_STATE_INACTIVE
- && !event_filter_match(event)) {
- delta = tstamp - event->tstamp_stopped;
- event->tstamp_running += delta;
- event->tstamp_stopped = tstamp;
- }
-
if (event->state != PERF_EVENT_STATE_ACTIVE)
return;
+ /*
+ * Asymmetry; we only schedule events _IN_ through ctx_sched_in(), but
+ * we can schedule events _OUT_ individually through things like
+ * __perf_remove_from_context().
+ */
+ list_del_init(&event->active_list);
+
perf_pmu_disable(event->pmu);
- event->state = PERF_EVENT_STATE_INACTIVE;
+ event->pmu->del(event, 0);
+ event->oncpu = -1;
+
if (event->pending_disable) {
event->pending_disable = 0;
- event->state = PERF_EVENT_STATE_OFF;
+ perf_cgroup_event_disable(event, ctx);
+ state = PERF_EVENT_STATE_OFF;
}
- event->tstamp_stopped = tstamp;
- event->pmu->del(event, 0);
- event->oncpu = -1;
+
+ perf_event_set_state(event, state);
if (!is_software_event(event))
- cpuctx->active_oncpu--;
- if (!--ctx->nr_active)
- perf_event_ctx_deactivate(ctx);
- if (event->attr.freq && event->attr.sample_freq)
+ cpc->active_oncpu--;
+ if (is_event_in_freq_mode(event)) {
ctx->nr_freq--;
- if (event->attr.exclusive || !cpuctx->active_oncpu)
- cpuctx->exclusive = 0;
-
- if (is_orphaned_child(event))
- schedule_orphans_remove(ctx);
+ epc->nr_freq--;
+ }
+ if (event->attr.exclusive || !cpc->active_oncpu)
+ cpc->exclusive = 0;
perf_pmu_enable(event->pmu);
}
static void
-group_sched_out(struct perf_event *group_event,
- struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
+group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx)
{
struct perf_event *event;
- int state = group_event->state;
- event_sched_out(group_event, cpuctx, ctx);
+ if (group_event->state != PERF_EVENT_STATE_ACTIVE)
+ return;
+
+ perf_assert_pmu_disabled(group_event->pmu_ctx->pmu);
+
+ event_sched_out(group_event, ctx);
/*
* Schedule out siblings (if any):
*/
- list_for_each_entry(event, &group_event->sibling_list, group_entry)
- event_sched_out(event, cpuctx, ctx);
+ for_each_sibling_event(event, group_event)
+ event_sched_out(event, ctx);
+}
- if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
- cpuctx->exclusive = 0;
+static inline void
+__ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, bool final)
+{
+ if (ctx->is_active & EVENT_TIME) {
+ if (ctx->is_active & EVENT_FROZEN)
+ return;
+ update_context_time(ctx);
+ update_cgrp_time_from_cpuctx(cpuctx, final);
+ }
}
-struct remove_event {
- struct perf_event *event;
- bool detach_group;
-};
+static inline void
+ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx)
+{
+ __ctx_time_update(cpuctx, ctx, false);
+}
+
+/*
+ * To be used inside perf_ctx_lock() / perf_ctx_unlock(). Lasts until perf_ctx_unlock().
+ */
+static inline void
+ctx_time_freeze(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx)
+{
+ ctx_time_update(cpuctx, ctx);
+ if (ctx->is_active & EVENT_TIME)
+ ctx->is_active |= EVENT_FROZEN;
+}
+
+static inline void
+ctx_time_update_event(struct perf_event_context *ctx, struct perf_event *event)
+{
+ if (ctx->is_active & EVENT_TIME) {
+ if (ctx->is_active & EVENT_FROZEN)
+ return;
+ update_context_time(ctx);
+ update_cgrp_time_from_event(event);
+ }
+}
+
+#define DETACH_GROUP 0x01UL
+#define DETACH_CHILD 0x02UL
+#define DETACH_EXIT 0x04UL
+#define DETACH_REVOKE 0x08UL
+#define DETACH_DEAD 0x10UL
/*
* Cross CPU call to remove a performance event
@@ -1617,34 +2475,68 @@ struct remove_event {
* We disable the event on the hardware level first. After that we
* remove it from the context list.
*/
-static int __perf_remove_from_context(void *info)
+static void
+__perf_remove_from_context(struct perf_event *event,
+ struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx,
+ void *info)
{
- struct remove_event *re = info;
- struct perf_event *event = re->event;
- struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx;
+ enum perf_event_state state = PERF_EVENT_STATE_OFF;
+ unsigned long flags = (unsigned long)info;
- raw_spin_lock(&ctx->lock);
- event_sched_out(event, cpuctx, ctx);
- if (re->detach_group)
+ ctx_time_update(cpuctx, ctx);
+
+ /*
+ * Ensure event_sched_out() switches to OFF, at the very least
+ * this avoids raising perf_pending_task() at this time.
+ */
+ if (flags & DETACH_EXIT)
+ state = PERF_EVENT_STATE_EXIT;
+ if (flags & DETACH_REVOKE)
+ state = PERF_EVENT_STATE_REVOKED;
+ if (flags & DETACH_DEAD)
+ state = PERF_EVENT_STATE_DEAD;
+
+ event_sched_out(event, ctx);
+
+ if (event->state > PERF_EVENT_STATE_OFF)
+ perf_cgroup_event_disable(event, ctx);
+
+ perf_event_set_state(event, min(event->state, state));
+
+ if (flags & DETACH_GROUP)
perf_group_detach(event);
+ if (flags & DETACH_CHILD)
+ perf_child_detach(event);
list_del_event(event, ctx);
- if (!ctx->nr_events && cpuctx->task_ctx == ctx) {
- ctx->is_active = 0;
- cpuctx->task_ctx = NULL;
+
+ if (!pmu_ctx->nr_events) {
+ pmu_ctx->rotate_necessary = 0;
+
+ if (ctx->task && ctx->is_active) {
+ struct perf_cpu_pmu_context *cpc = this_cpc(pmu_ctx->pmu);
+
+ WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
+ cpc->task_epc = NULL;
+ }
}
- raw_spin_unlock(&ctx->lock);
- return 0;
-}
+ if (!ctx->nr_events && ctx->is_active) {
+ if (ctx == &cpuctx->ctx)
+ update_cgrp_time_from_cpuctx(cpuctx, true);
+ ctx->is_active = 0;
+ if (ctx->task) {
+ WARN_ON_ONCE(cpuctx->task_ctx != ctx);
+ cpuctx->task_ctx = NULL;
+ }
+ }
+}
/*
* Remove the event from a task's (or a CPU's) list of events.
*
- * CPU events are removed with a smp call. For task events we only
- * call when the task is on a CPU.
- *
* If event->ctx is a cloned context, callers must make sure that
* every task struct that event->ctx->task could possibly point to
* remains valid. This is OK when called from perf_release since
@@ -1652,151 +2544,99 @@ static int __perf_remove_from_context(void *info)
* When called from perf_event_exit_task, it's OK because the
* context has been detached from its task.
*/
-static void perf_remove_from_context(struct perf_event *event, bool detach_group)
+static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
{
struct perf_event_context *ctx = event->ctx;
- struct task_struct *task = ctx->task;
- struct remove_event re = {
- .event = event,
- .detach_group = detach_group,
- };
lockdep_assert_held(&ctx->mutex);
- if (!task) {
- /*
- * Per cpu events are removed via an smp call. The removal can
- * fail if the CPU is currently offline, but in that case we
- * already called __perf_remove_from_context from
- * perf_event_exit_cpu.
- */
- cpu_function_call(event->cpu, __perf_remove_from_context, &re);
- return;
- }
-
-retry:
- if (!task_function_call(task, __perf_remove_from_context, &re))
- return;
-
- raw_spin_lock_irq(&ctx->lock);
/*
- * If we failed to find a running task, but find the context active now
- * that we've acquired the ctx->lock, retry.
+ * Because of perf_event_exit_task(), perf_remove_from_context() ought
+ * to work in the face of TASK_TOMBSTONE, unlike every other
+ * event_function_call() user.
*/
- if (ctx->is_active) {
+ raw_spin_lock_irq(&ctx->lock);
+ if (!ctx->is_active) {
+ __perf_remove_from_context(event, this_cpu_ptr(&perf_cpu_context),
+ ctx, (void *)flags);
raw_spin_unlock_irq(&ctx->lock);
- /*
- * Reload the task pointer, it might have been changed by
- * a concurrent perf_event_context_sched_out().
- */
- task = ctx->task;
- goto retry;
+ return;
}
-
- /*
- * Since the task isn't running, its safe to remove the event, us
- * holding the ctx->lock ensures the task won't get scheduled in.
- */
- if (detach_group)
- perf_group_detach(event);
- list_del_event(event, ctx);
raw_spin_unlock_irq(&ctx->lock);
+
+ event_function_call(event, __perf_remove_from_context, (void *)flags);
+}
+
+static void __event_disable(struct perf_event *event,
+ struct perf_event_context *ctx,
+ enum perf_event_state state)
+{
+ event_sched_out(event, ctx);
+ perf_cgroup_event_disable(event, ctx);
+ perf_event_set_state(event, state);
}
/*
* Cross CPU call to disable a performance event
*/
-int __perf_event_disable(void *info)
+static void __perf_event_disable(struct perf_event *event,
+ struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx,
+ void *info)
{
- struct perf_event *event = info;
- struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ if (event->state < PERF_EVENT_STATE_INACTIVE)
+ return;
+
+ perf_pmu_disable(event->pmu_ctx->pmu);
+ ctx_time_update_event(ctx, event);
/*
- * If this is a per-task event, need to check whether this
- * event's task is the current task on this cpu.
- *
- * Can trigger due to concurrent perf_event_context_sched_out()
- * flipping contexts around.
+ * When disabling a group leader, the whole group becomes ineligible
+ * to run, so schedule out the full group.
*/
- if (ctx->task && cpuctx->task_ctx != ctx)
- return -EINVAL;
-
- raw_spin_lock(&ctx->lock);
+ if (event == event->group_leader)
+ group_sched_out(event, ctx);
/*
- * If the event is on, turn it off.
- * If it is in error state, leave it in error state.
+ * But only mark the leader OFF; the siblings will remain
+ * INACTIVE.
*/
- if (event->state >= PERF_EVENT_STATE_INACTIVE) {
- update_context_time(ctx);
- update_cgrp_time_from_event(event);
- update_group_times(event);
- if (event == event->group_leader)
- group_sched_out(event, cpuctx, ctx);
- else
- event_sched_out(event, cpuctx, ctx);
- event->state = PERF_EVENT_STATE_OFF;
- }
+ __event_disable(event, ctx, PERF_EVENT_STATE_OFF);
- raw_spin_unlock(&ctx->lock);
-
- return 0;
+ perf_pmu_enable(event->pmu_ctx->pmu);
}
/*
- * Disable a event.
+ * Disable an event.
*
* If event->ctx is a cloned context, callers must make sure that
* every task struct that event->ctx->task could possibly point to
- * remains valid. This condition is satisifed when called through
+ * remains valid. This condition is satisfied when called through
* perf_event_for_each_child or perf_event_for_each because they
* hold the top-level event's child_mutex, so any descendant that
- * goes to exit will block in sync_child_event.
- * When called from perf_pending_event it's OK because event->ctx
+ * goes to exit will block in perf_event_exit_event().
+ *
+ * When called from perf_pending_disable it's OK because event->ctx
* is the current context on this CPU and preemption is disabled,
* hence we can't get into perf_event_task_sched_out for this context.
*/
static void _perf_event_disable(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
- struct task_struct *task = ctx->task;
-
- if (!task) {
- /*
- * Disable the event on the cpu that it's on
- */
- cpu_function_call(event->cpu, __perf_event_disable, event);
- return;
- }
-
-retry:
- if (!task_function_call(task, __perf_event_disable, event))
- return;
raw_spin_lock_irq(&ctx->lock);
- /*
- * If the event is still active, we need to retry the cross-call.
- */
- if (event->state == PERF_EVENT_STATE_ACTIVE) {
+ if (event->state <= PERF_EVENT_STATE_OFF) {
raw_spin_unlock_irq(&ctx->lock);
- /*
- * Reload the task pointer, it might have been changed by
- * a concurrent perf_event_context_sched_out().
- */
- task = ctx->task;
- goto retry;
- }
-
- /*
- * Since we have the lock this context can't be scheduled
- * in, so we can change the state safely.
- */
- if (event->state == PERF_EVENT_STATE_INACTIVE) {
- update_group_times(event);
- event->state = PERF_EVENT_STATE_OFF;
+ return;
}
raw_spin_unlock_irq(&ctx->lock);
+
+ event_function_call(event, __perf_event_disable, NULL);
+}
+
+void perf_event_disable_local(struct perf_event *event)
+{
+ event_function_local(event, __perf_event_disable, NULL);
}
/*
@@ -1813,39 +2653,10 @@ void perf_event_disable(struct perf_event *event)
}
EXPORT_SYMBOL_GPL(perf_event_disable);
-static void perf_set_shadow_time(struct perf_event *event,
- struct perf_event_context *ctx,
- u64 tstamp)
+void perf_event_disable_inatomic(struct perf_event *event)
{
- /*
- * use the correct time source for the time snapshot
- *
- * We could get by without this by leveraging the
- * fact that to get to this function, the caller
- * has most likely already called update_context_time()
- * and update_cgrp_time_xx() and thus both timestamp
- * are identical (or very close). Given that tstamp is,
- * already adjusted for cgroup, we could say that:
- * tstamp - ctx->timestamp
- * is equivalent to
- * tstamp - cgrp->timestamp.
- *
- * Then, in perf_output_read(), the calculation would
- * work with no changes because:
- * - event is guaranteed scheduled in
- * - no scheduled out in between
- * - thus the timestamp would be the same
- *
- * But this is a bit hairy.
- *
- * So instead, we have an explicit cgroup call to remain
- * within the time time source all along. We believe it
- * is cleaner and simpler to understand.
- */
- if (is_cgroup_event(event))
- perf_cgroup_set_shadow_time(event, tstamp);
- else
- event->shadow_ctx_time = tstamp - ctx->timestamp;
+ event->pending_disable = 1;
+ irq_work_queue(&event->pending_disable_irq);
}
#define MAX_INTERRUPTS (~0ULL)
@@ -1853,64 +2664,97 @@ static void perf_set_shadow_time(struct perf_event *event,
static void perf_log_throttle(struct perf_event *event, int enable);
static void perf_log_itrace_start(struct perf_event *event);
+static void perf_event_unthrottle(struct perf_event *event, bool start)
+{
+ if (event->state != PERF_EVENT_STATE_ACTIVE)
+ return;
+
+ event->hw.interrupts = 0;
+ if (start)
+ event->pmu->start(event, 0);
+ if (event == event->group_leader)
+ perf_log_throttle(event, 1);
+}
+
+static void perf_event_throttle(struct perf_event *event)
+{
+ if (event->state != PERF_EVENT_STATE_ACTIVE)
+ return;
+
+ event->hw.interrupts = MAX_INTERRUPTS;
+ event->pmu->stop(event, 0);
+ if (event == event->group_leader)
+ perf_log_throttle(event, 0);
+}
+
+static void perf_event_unthrottle_group(struct perf_event *event, bool skip_start_event)
+{
+ struct perf_event *sibling, *leader = event->group_leader;
+
+ perf_event_unthrottle(leader, skip_start_event ? leader != event : true);
+ for_each_sibling_event(sibling, leader)
+ perf_event_unthrottle(sibling, skip_start_event ? sibling != event : true);
+}
+
+static void perf_event_throttle_group(struct perf_event *event)
+{
+ struct perf_event *sibling, *leader = event->group_leader;
+
+ perf_event_throttle(leader);
+ for_each_sibling_event(sibling, leader)
+ perf_event_throttle(sibling);
+}
+
static int
-event_sched_in(struct perf_event *event,
- struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
+event_sched_in(struct perf_event *event, struct perf_event_context *ctx)
{
- u64 tstamp = perf_event_time(event);
+ struct perf_event_pmu_context *epc = event->pmu_ctx;
+ struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu);
int ret = 0;
+ WARN_ON_ONCE(event->ctx != ctx);
+
lockdep_assert_held(&ctx->lock);
if (event->state <= PERF_EVENT_STATE_OFF)
return 0;
- event->state = PERF_EVENT_STATE_ACTIVE;
- event->oncpu = smp_processor_id();
+ WRITE_ONCE(event->oncpu, smp_processor_id());
+ /*
+ * Order event::oncpu write to happen before the ACTIVE state is
+ * visible. This allows perf_event_{stop,read}() to observe the correct
+ * ->oncpu if it sees ACTIVE.
+ */
+ smp_wmb();
+ perf_event_set_state(event, PERF_EVENT_STATE_ACTIVE);
/*
* Unthrottle events, since we scheduled we might have missed several
* ticks already, also for a heavily scheduling task there is little
* guarantee it'll get a tick in a timely manner.
*/
- if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
- perf_log_throttle(event, 1);
- event->hw.interrupts = 0;
- }
-
- /*
- * The new state must be visible before we turn it on in the hardware:
- */
- smp_wmb();
+ if (unlikely(event->hw.interrupts == MAX_INTERRUPTS))
+ perf_event_unthrottle(event, false);
perf_pmu_disable(event->pmu);
- event->tstamp_running += tstamp - event->tstamp_stopped;
-
- perf_set_shadow_time(event, ctx, tstamp);
-
perf_log_itrace_start(event);
if (event->pmu->add(event, PERF_EF_START)) {
- event->state = PERF_EVENT_STATE_INACTIVE;
+ perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
event->oncpu = -1;
ret = -EAGAIN;
goto out;
}
if (!is_software_event(event))
- cpuctx->active_oncpu++;
- if (!ctx->nr_active++)
- perf_event_ctx_activate(ctx);
- if (event->attr.freq && event->attr.sample_freq)
+ cpc->active_oncpu++;
+ if (is_event_in_freq_mode(event)) {
ctx->nr_freq++;
-
+ epc->nr_freq++;
+ }
if (event->attr.exclusive)
- cpuctx->exclusive = 1;
-
- if (is_orphaned_child(event))
- schedule_orphans_remove(ctx);
+ cpc->exclusive = 1;
out:
perf_pmu_enable(event->pmu);
@@ -1919,31 +2763,24 @@ out:
}
static int
-group_sched_in(struct perf_event *group_event,
- struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
+group_sched_in(struct perf_event *group_event, struct perf_event_context *ctx)
{
struct perf_event *event, *partial_group = NULL;
- struct pmu *pmu = ctx->pmu;
- u64 now = ctx->time;
- bool simulate = false;
+ struct pmu *pmu = group_event->pmu_ctx->pmu;
if (group_event->state == PERF_EVENT_STATE_OFF)
return 0;
- pmu->start_txn(pmu);
+ pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
- if (event_sched_in(group_event, cpuctx, ctx)) {
- pmu->cancel_txn(pmu);
- perf_cpu_hrtimer_restart(cpuctx);
- return -EAGAIN;
- }
+ if (event_sched_in(group_event, ctx))
+ goto error;
/*
* Schedule in siblings as one group (if any):
*/
- list_for_each_entry(event, &group_event->sibling_list, group_entry) {
- if (event_sched_in(event, cpuctx, ctx)) {
+ for_each_sibling_event(event, group_event) {
+ if (event_sched_in(event, ctx)) {
partial_group = event;
goto group_error;
}
@@ -1956,60 +2793,45 @@ group_error:
/*
* Groups can be scheduled in as one unit only, so undo any
* partial group before returning:
- * The events up to the failed event are scheduled out normally,
- * tstamp_stopped will be updated.
- *
- * The failed events and the remaining siblings need to have
- * their timings updated as if they had gone thru event_sched_in()
- * and event_sched_out(). This is required to get consistent timings
- * across the group. This also takes care of the case where the group
- * could never be scheduled by ensuring tstamp_stopped is set to mark
- * the time the event was actually stopped, such that time delta
- * calculation in update_event_times() is correct.
- */
- list_for_each_entry(event, &group_event->sibling_list, group_entry) {
+ * The events up to the failed event are scheduled out normally.
+ */
+ for_each_sibling_event(event, group_event) {
if (event == partial_group)
- simulate = true;
+ break;
- if (simulate) {
- event->tstamp_running += now - event->tstamp_stopped;
- event->tstamp_stopped = now;
- } else {
- event_sched_out(event, cpuctx, ctx);
- }
+ event_sched_out(event, ctx);
}
- event_sched_out(group_event, cpuctx, ctx);
+ event_sched_out(group_event, ctx);
+error:
pmu->cancel_txn(pmu);
-
- perf_cpu_hrtimer_restart(cpuctx);
-
return -EAGAIN;
}
/*
* Work out whether we can put this event group on the CPU now.
*/
-static int group_can_go_on(struct perf_event *event,
- struct perf_cpu_context *cpuctx,
- int can_add_hw)
+static int group_can_go_on(struct perf_event *event, int can_add_hw)
{
+ struct perf_event_pmu_context *epc = event->pmu_ctx;
+ struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu);
+
/*
* Groups consisting entirely of software events can always go on.
*/
- if (event->group_flags & PERF_GROUP_SOFTWARE)
+ if (event->group_caps & PERF_EV_CAP_SOFTWARE)
return 1;
/*
* If an exclusive group is already on, no other hardware
* events can go on.
*/
- if (cpuctx->exclusive)
+ if (cpc->exclusive)
return 0;
/*
* If this group is exclusive and there are already
* events on the CPU, it can't go on.
*/
- if (event->attr.exclusive && cpuctx->active_oncpu)
+ if (event->attr.exclusive && !list_empty(get_event_list(event)))
return 0;
/*
* Otherwise, try to add it if all previous groups were able
@@ -2021,256 +2843,336 @@ static int group_can_go_on(struct perf_event *event,
static void add_event_to_ctx(struct perf_event *event,
struct perf_event_context *ctx)
{
- u64 tstamp = perf_event_time(event);
-
list_add_event(event, ctx);
perf_group_attach(event);
- event->tstamp_enabled = tstamp;
- event->tstamp_running = tstamp;
- event->tstamp_stopped = tstamp;
}
-static void task_ctx_sched_out(struct perf_event_context *ctx);
-static void
-ctx_sched_in(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx,
- enum event_type_t event_type,
- struct task_struct *task);
+static void task_ctx_sched_out(struct perf_event_context *ctx,
+ struct pmu *pmu,
+ enum event_type_t event_type)
+{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+
+ if (!cpuctx->task_ctx)
+ return;
+
+ if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
+ return;
+
+ ctx_sched_out(ctx, pmu, event_type);
+}
static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx,
- struct task_struct *task)
+ struct pmu *pmu)
{
- cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
+ ctx_sched_in(&cpuctx->ctx, pmu, EVENT_PINNED);
if (ctx)
- ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
- cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
+ ctx_sched_in(ctx, pmu, EVENT_PINNED);
+ ctx_sched_in(&cpuctx->ctx, pmu, EVENT_FLEXIBLE);
if (ctx)
- ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
+ ctx_sched_in(ctx, pmu, EVENT_FLEXIBLE);
}
/*
- * Cross CPU call to install and enable a performance event
+ * We want to maintain the following priority of scheduling:
+ * - CPU pinned (EVENT_CPU | EVENT_PINNED)
+ * - task pinned (EVENT_PINNED)
+ * - CPU flexible (EVENT_CPU | EVENT_FLEXIBLE)
+ * - task flexible (EVENT_FLEXIBLE).
*
- * Must be called with ctx->mutex held
+ * In order to avoid unscheduling and scheduling back in everything every
+ * time an event is added, only do it for the groups of equal priority and
+ * below.
+ *
+ * This can be called after a batch operation on task events, in which case
+ * event_type is a bit mask of the types of events involved. For CPU events,
+ * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
*/
-static int __perf_install_in_context(void *info)
+static void ctx_resched(struct perf_cpu_context *cpuctx,
+ struct perf_event_context *task_ctx,
+ struct pmu *pmu, enum event_type_t event_type)
{
- struct perf_event *event = info;
- struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
- struct perf_event_context *task_ctx = cpuctx->task_ctx;
- struct task_struct *task = current;
-
- perf_ctx_lock(cpuctx, task_ctx);
- perf_pmu_disable(cpuctx->ctx.pmu);
+ bool cpu_event = !!(event_type & EVENT_CPU);
+ struct perf_event_pmu_context *epc;
/*
- * If there was an active task_ctx schedule it out.
+ * If pinned groups are involved, flexible groups also need to be
+ * scheduled out.
*/
- if (task_ctx)
- task_ctx_sched_out(task_ctx);
+ if (event_type & EVENT_PINNED)
+ event_type |= EVENT_FLEXIBLE;
+
+ event_type &= EVENT_ALL;
+
+ for_each_epc(epc, &cpuctx->ctx, pmu, false)
+ perf_pmu_disable(epc->pmu);
+
+ if (task_ctx) {
+ for_each_epc(epc, task_ctx, pmu, false)
+ perf_pmu_disable(epc->pmu);
+
+ task_ctx_sched_out(task_ctx, pmu, event_type);
+ }
/*
- * If the context we're installing events in is not the
- * active task_ctx, flip them.
+ * Decide which cpu ctx groups to schedule out based on the types
+ * of events that caused rescheduling:
+ * - EVENT_CPU: schedule out corresponding groups;
+ * - EVENT_PINNED task events: schedule out EVENT_FLEXIBLE groups;
+ * - otherwise, do nothing more.
*/
- if (ctx->task && task_ctx != ctx) {
- if (task_ctx)
- raw_spin_unlock(&task_ctx->lock);
- raw_spin_lock(&ctx->lock);
- task_ctx = ctx;
- }
+ if (cpu_event)
+ ctx_sched_out(&cpuctx->ctx, pmu, event_type);
+ else if (event_type & EVENT_PINNED)
+ ctx_sched_out(&cpuctx->ctx, pmu, EVENT_FLEXIBLE);
+
+ perf_event_sched_in(cpuctx, task_ctx, pmu);
+
+ for_each_epc(epc, &cpuctx->ctx, pmu, false)
+ perf_pmu_enable(epc->pmu);
if (task_ctx) {
- cpuctx->task_ctx = task_ctx;
- task = task_ctx->task;
+ for_each_epc(epc, task_ctx, pmu, false)
+ perf_pmu_enable(epc->pmu);
}
+}
- cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+void perf_pmu_resched(struct pmu *pmu)
+{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct perf_event_context *task_ctx = cpuctx->task_ctx;
- update_context_time(ctx);
- /*
- * update cgrp time only if current cgrp
- * matches event->cgrp. Must be done before
- * calling add_event_to_ctx()
- */
- update_cgrp_time_from_event(event);
+ perf_ctx_lock(cpuctx, task_ctx);
+ ctx_resched(cpuctx, task_ctx, pmu, EVENT_ALL|EVENT_CPU);
+ perf_ctx_unlock(cpuctx, task_ctx);
+}
- add_event_to_ctx(event, ctx);
+/*
+ * Cross CPU call to install and enable a performance event
+ *
+ * Very similar to remote_function() + event_function() but cannot assume that
+ * things like ctx->is_active and cpuctx->task_ctx are set.
+ */
+static int __perf_install_in_context(void *info)
+{
+ struct perf_event *event = info;
+ struct perf_event_context *ctx = event->ctx;
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct perf_event_context *task_ctx = cpuctx->task_ctx;
+ bool reprogram = true;
+ int ret = 0;
- /*
- * Schedule everything back in
- */
- perf_event_sched_in(cpuctx, task_ctx, task);
+ raw_spin_lock(&cpuctx->ctx.lock);
+ if (ctx->task) {
+ raw_spin_lock(&ctx->lock);
+ task_ctx = ctx;
+
+ reprogram = (ctx->task == current);
+
+ /*
+ * If the task is running, it must be running on this CPU,
+ * otherwise we cannot reprogram things.
+ *
+ * If its not running, we don't care, ctx->lock will
+ * serialize against it becoming runnable.
+ */
+ if (task_curr(ctx->task) && !reprogram) {
+ ret = -ESRCH;
+ goto unlock;
+ }
+
+ WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx);
+ } else if (task_ctx) {
+ raw_spin_lock(&task_ctx->lock);
+ }
+
+#ifdef CONFIG_CGROUP_PERF
+ if (event->state > PERF_EVENT_STATE_OFF && is_cgroup_event(event)) {
+ /*
+ * If the current cgroup doesn't match the event's
+ * cgroup, we should not try to schedule it.
+ */
+ struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
+ reprogram = cgroup_is_descendant(cgrp->css.cgroup,
+ event->cgrp->css.cgroup);
+ }
+#endif
+
+ if (reprogram) {
+ ctx_time_freeze(cpuctx, ctx);
+ add_event_to_ctx(event, ctx);
+ ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu,
+ get_event_type(event));
+ } else {
+ add_event_to_ctx(event, ctx);
+ }
- perf_pmu_enable(cpuctx->ctx.pmu);
+unlock:
perf_ctx_unlock(cpuctx, task_ctx);
- return 0;
+ return ret;
}
+static bool exclusive_event_installable(struct perf_event *event,
+ struct perf_event_context *ctx);
+
/*
- * Attach a performance event to a context
+ * Attach a performance event to a context.
*
- * First we add the event to the list with the hardware enable bit
- * in event->hw_config cleared.
- *
- * If the event is attached to a task which is on a CPU we use a smp
- * call to enable it in the task context. The task might have been
- * scheduled away, but we check this in the smp call again.
+ * Very similar to event_function_call, see comment there.
*/
static void
perf_install_in_context(struct perf_event_context *ctx,
struct perf_event *event,
int cpu)
{
- struct task_struct *task = ctx->task;
+ struct task_struct *task = READ_ONCE(ctx->task);
lockdep_assert_held(&ctx->mutex);
- event->ctx = ctx;
+ WARN_ON_ONCE(!exclusive_event_installable(event, ctx));
+
if (event->cpu != -1)
- event->cpu = cpu;
+ WARN_ON_ONCE(event->cpu != cpu);
+
+ /*
+ * Ensures that if we can observe event->ctx, both the event and ctx
+ * will be 'complete'. See perf_iterate_sb_cpu().
+ */
+ smp_store_release(&event->ctx, ctx);
+
+ /*
+ * perf_event_attr::disabled events will not run and can be initialized
+ * without IPI. Except when this is the first event for the context, in
+ * that case we need the magic of the IPI to set ctx->is_active.
+ *
+ * The IOC_ENABLE that is sure to follow the creation of a disabled
+ * event will issue the IPI and reprogram the hardware.
+ */
+ if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF &&
+ ctx->nr_events && !is_cgroup_event(event)) {
+ raw_spin_lock_irq(&ctx->lock);
+ if (ctx->task == TASK_TOMBSTONE) {
+ raw_spin_unlock_irq(&ctx->lock);
+ return;
+ }
+ add_event_to_ctx(event, ctx);
+ raw_spin_unlock_irq(&ctx->lock);
+ return;
+ }
if (!task) {
- /*
- * Per cpu events are installed via an smp call and
- * the install is always successful.
- */
cpu_function_call(cpu, __perf_install_in_context, event);
return;
}
-retry:
- if (!task_function_call(task, __perf_install_in_context, event))
+ /*
+ * Should not happen, we validate the ctx is still alive before calling.
+ */
+ if (WARN_ON_ONCE(task == TASK_TOMBSTONE))
return;
- raw_spin_lock_irq(&ctx->lock);
/*
- * If we failed to find a running task, but find the context active now
- * that we've acquired the ctx->lock, retry.
+ * Installing events is tricky because we cannot rely on ctx->is_active
+ * to be set in case this is the nr_events 0 -> 1 transition.
+ *
+ * Instead we use task_curr(), which tells us if the task is running.
+ * However, since we use task_curr() outside of rq::lock, we can race
+ * against the actual state. This means the result can be wrong.
+ *
+ * If we get a false positive, we retry, this is harmless.
+ *
+ * If we get a false negative, things are complicated. If we are after
+ * perf_event_context_sched_in() ctx::lock will serialize us, and the
+ * value must be correct. If we're before, it doesn't matter since
+ * perf_event_context_sched_in() will program the counter.
+ *
+ * However, this hinges on the remote context switch having observed
+ * our task->perf_event_ctxp[] store, such that it will in fact take
+ * ctx::lock in perf_event_context_sched_in().
+ *
+ * We do this by task_function_call(), if the IPI fails to hit the task
+ * we know any future context switch of task must see the
+ * perf_event_ctpx[] store.
*/
- if (ctx->is_active) {
- raw_spin_unlock_irq(&ctx->lock);
+
+ /*
+ * This smp_mb() orders the task->perf_event_ctxp[] store with the
+ * task_cpu() load, such that if the IPI then does not find the task
+ * running, a future context switch of that task must observe the
+ * store.
+ */
+ smp_mb();
+again:
+ if (!task_function_call(task, __perf_install_in_context, event))
+ return;
+
+ raw_spin_lock_irq(&ctx->lock);
+ task = ctx->task;
+ if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
/*
- * Reload the task pointer, it might have been changed by
- * a concurrent perf_event_context_sched_out().
+ * Cannot happen because we already checked above (which also
+ * cannot happen), and we hold ctx->mutex, which serializes us
+ * against perf_event_exit_task_context().
*/
- task = ctx->task;
- goto retry;
+ raw_spin_unlock_irq(&ctx->lock);
+ return;
}
-
/*
- * Since the task isn't running, its safe to add the event, us holding
- * the ctx->lock ensures the task won't get scheduled in.
+ * If the task is not running, ctx->lock will avoid it becoming so,
+ * thus we can safely install the event.
*/
+ if (task_curr(task)) {
+ raw_spin_unlock_irq(&ctx->lock);
+ goto again;
+ }
add_event_to_ctx(event, ctx);
raw_spin_unlock_irq(&ctx->lock);
}
/*
- * Put a event into inactive state and update time fields.
- * Enabling the leader of a group effectively enables all
- * the group members that aren't explicitly disabled, so we
- * have to update their ->tstamp_enabled also.
- * Note: this works for group members as well as group leaders
- * since the non-leader members' sibling_lists will be empty.
- */
-static void __perf_event_mark_enabled(struct perf_event *event)
-{
- struct perf_event *sub;
- u64 tstamp = perf_event_time(event);
-
- event->state = PERF_EVENT_STATE_INACTIVE;
- event->tstamp_enabled = tstamp - event->total_time_enabled;
- list_for_each_entry(sub, &event->sibling_list, group_entry) {
- if (sub->state >= PERF_EVENT_STATE_INACTIVE)
- sub->tstamp_enabled = tstamp - sub->total_time_enabled;
- }
-}
-
-/*
* Cross CPU call to enable a performance event
*/
-static int __perf_event_enable(void *info)
+static void __perf_event_enable(struct perf_event *event,
+ struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx,
+ void *info)
{
- struct perf_event *event = info;
- struct perf_event_context *ctx = event->ctx;
struct perf_event *leader = event->group_leader;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
- int err;
+ struct perf_event_context *task_ctx;
- /*
- * There's a time window between 'ctx->is_active' check
- * in perf_event_enable function and this place having:
- * - IRQs on
- * - ctx->lock unlocked
- *
- * where the task could be killed and 'ctx' deactivated
- * by perf_event_exit_task.
- */
- if (!ctx->is_active)
- return -EINVAL;
+ if (event->state >= PERF_EVENT_STATE_INACTIVE ||
+ event->state <= PERF_EVENT_STATE_ERROR)
+ return;
- raw_spin_lock(&ctx->lock);
- update_context_time(ctx);
+ ctx_time_freeze(cpuctx, ctx);
- if (event->state >= PERF_EVENT_STATE_INACTIVE)
- goto unlock;
+ perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
+ perf_cgroup_event_enable(event, ctx);
- /*
- * set current task's cgroup time reference point
- */
- perf_cgroup_set_timestamp(current, ctx);
-
- __perf_event_mark_enabled(event);
+ if (!ctx->is_active)
+ return;
- if (!event_filter_match(event)) {
- if (is_cgroup_event(event))
- perf_cgroup_defer_enabled(event);
- goto unlock;
- }
+ if (!event_filter_match(event))
+ return;
/*
* If the event is in a group and isn't the group leader,
* then don't put it on unless the group is on.
*/
if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
- goto unlock;
-
- if (!group_can_go_on(event, cpuctx, 1)) {
- err = -EEXIST;
- } else {
- if (event == leader)
- err = group_sched_in(event, cpuctx, ctx);
- else
- err = event_sched_in(event, cpuctx, ctx);
- }
-
- if (err) {
- /*
- * If this event can't go on and it's part of a
- * group, then the whole group has to come off.
- */
- if (leader != event) {
- group_sched_out(leader, cpuctx, ctx);
- perf_cpu_hrtimer_restart(cpuctx);
- }
- if (leader->attr.pinned) {
- update_group_times(leader);
- leader->state = PERF_EVENT_STATE_ERROR;
- }
- }
+ return;
-unlock:
- raw_spin_unlock(&ctx->lock);
+ task_ctx = cpuctx->task_ctx;
+ if (ctx->task)
+ WARN_ON_ONCE(task_ctx != ctx);
- return 0;
+ ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu, get_event_type(event));
}
/*
- * Enable a event.
+ * Enable an event.
*
* If event->ctx is a cloned context, callers must make sure that
* every task struct that event->ctx->task could possibly point to
@@ -2281,58 +3183,35 @@ unlock:
static void _perf_event_enable(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
- struct task_struct *task = ctx->task;
- if (!task) {
- /*
- * Enable the event on the cpu that it's on
- */
- cpu_function_call(event->cpu, __perf_event_enable, event);
+ raw_spin_lock_irq(&ctx->lock);
+ if (event->state >= PERF_EVENT_STATE_INACTIVE ||
+ event->state < PERF_EVENT_STATE_ERROR) {
+out:
+ raw_spin_unlock_irq(&ctx->lock);
return;
}
- raw_spin_lock_irq(&ctx->lock);
- if (event->state >= PERF_EVENT_STATE_INACTIVE)
- goto out;
-
/*
* If the event is in error state, clear that first.
- * That way, if we see the event in error state below, we
- * know that it has gone back into error state, as distinct
- * from the task having been scheduled away before the
- * cross-call arrived.
- */
- if (event->state == PERF_EVENT_STATE_ERROR)
- event->state = PERF_EVENT_STATE_OFF;
-
-retry:
- if (!ctx->is_active) {
- __perf_event_mark_enabled(event);
- goto out;
- }
-
- raw_spin_unlock_irq(&ctx->lock);
-
- if (!task_function_call(task, __perf_event_enable, event))
- return;
-
- raw_spin_lock_irq(&ctx->lock);
-
- /*
- * If the context is active and the event is still off,
- * we need to retry the cross-call.
+ *
+ * That way, if we see the event in error state below, we know that it
+ * has gone back into error state, as distinct from the task having
+ * been scheduled away before the cross-call arrived.
*/
- if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) {
+ if (event->state == PERF_EVENT_STATE_ERROR) {
/*
- * task could have been flipped by a concurrent
- * perf_event_context_sched_out()
+ * Detached SIBLING events cannot leave ERROR state.
*/
- task = ctx->task;
- goto retry;
- }
+ if (event->event_caps & PERF_EV_CAP_SIBLING &&
+ event->group_leader == event)
+ goto out;
-out:
+ event->state = PERF_EVENT_STATE_OFF;
+ }
raw_spin_unlock_irq(&ctx->lock);
+
+ event_function_call(event, __perf_event_enable, NULL);
}
/*
@@ -2348,6 +3227,112 @@ void perf_event_enable(struct perf_event *event)
}
EXPORT_SYMBOL_GPL(perf_event_enable);
+struct stop_event_data {
+ struct perf_event *event;
+ unsigned int restart;
+};
+
+static int __perf_event_stop(void *info)
+{
+ struct stop_event_data *sd = info;
+ struct perf_event *event = sd->event;
+
+ /* if it's already INACTIVE, do nothing */
+ if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
+ return 0;
+
+ /* matches smp_wmb() in event_sched_in() */
+ smp_rmb();
+
+ /*
+ * There is a window with interrupts enabled before we get here,
+ * so we need to check again lest we try to stop another CPU's event.
+ */
+ if (READ_ONCE(event->oncpu) != smp_processor_id())
+ return -EAGAIN;
+
+ event->pmu->stop(event, PERF_EF_UPDATE);
+
+ /*
+ * May race with the actual stop (through perf_pmu_output_stop()),
+ * but it is only used for events with AUX ring buffer, and such
+ * events will refuse to restart because of rb::aux_mmap_count==0,
+ * see comments in perf_aux_output_begin().
+ *
+ * Since this is happening on an event-local CPU, no trace is lost
+ * while restarting.
+ */
+ if (sd->restart)
+ event->pmu->start(event, 0);
+
+ return 0;
+}
+
+static int perf_event_stop(struct perf_event *event, int restart)
+{
+ struct stop_event_data sd = {
+ .event = event,
+ .restart = restart,
+ };
+ int ret = 0;
+
+ do {
+ if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
+ return 0;
+
+ /* matches smp_wmb() in event_sched_in() */
+ smp_rmb();
+
+ /*
+ * We only want to restart ACTIVE events, so if the event goes
+ * inactive here (event->oncpu==-1), there's nothing more to do;
+ * fall through with ret==-ENXIO.
+ */
+ ret = cpu_function_call(READ_ONCE(event->oncpu),
+ __perf_event_stop, &sd);
+ } while (ret == -EAGAIN);
+
+ return ret;
+}
+
+/*
+ * In order to contain the amount of racy and tricky in the address filter
+ * configuration management, it is a two part process:
+ *
+ * (p1) when userspace mappings change as a result of (1) or (2) or (3) below,
+ * we update the addresses of corresponding vmas in
+ * event::addr_filter_ranges array and bump the event::addr_filters_gen;
+ * (p2) when an event is scheduled in (pmu::add), it calls
+ * perf_event_addr_filters_sync() which calls pmu::addr_filters_sync()
+ * if the generation has changed since the previous call.
+ *
+ * If (p1) happens while the event is active, we restart it to force (p2).
+ *
+ * (1) perf_addr_filters_apply(): adjusting filters' offsets based on
+ * pre-existing mappings, called once when new filters arrive via SET_FILTER
+ * ioctl;
+ * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly
+ * registered mapping, called for every new mmap(), with mm::mmap_lock down
+ * for reading;
+ * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process
+ * of exec.
+ */
+void perf_event_addr_filters_sync(struct perf_event *event)
+{
+ struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
+
+ if (!has_addr_filter(event))
+ return;
+
+ raw_spin_lock(&ifh->lock);
+ if (event->addr_filters_gen != event->hw.addr_filters_gen) {
+ event->pmu->addr_filters_sync(event);
+ event->hw.addr_filters_gen = event->addr_filters_gen;
+ }
+ raw_spin_unlock(&ifh->lock);
+}
+EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync);
+
static int _perf_event_refresh(struct perf_event *event, int refresh)
{
/*
@@ -2378,33 +3363,183 @@ int perf_event_refresh(struct perf_event *event, int refresh)
}
EXPORT_SYMBOL_GPL(perf_event_refresh);
-static void ctx_sched_out(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx,
- enum event_type_t event_type)
+static int perf_event_modify_breakpoint(struct perf_event *bp,
+ struct perf_event_attr *attr)
{
- struct perf_event *event;
- int is_active = ctx->is_active;
+ int err;
- ctx->is_active &= ~event_type;
- if (likely(!ctx->nr_events))
+ _perf_event_disable(bp);
+
+ err = modify_user_hw_breakpoint_check(bp, attr, true);
+
+ if (!bp->attr.disabled)
+ _perf_event_enable(bp);
+
+ return err;
+}
+
+/*
+ * Copy event-type-independent attributes that may be modified.
+ */
+static void perf_event_modify_copy_attr(struct perf_event_attr *to,
+ const struct perf_event_attr *from)
+{
+ to->sig_data = from->sig_data;
+}
+
+static int perf_event_modify_attr(struct perf_event *event,
+ struct perf_event_attr *attr)
+{
+ int (*func)(struct perf_event *, struct perf_event_attr *);
+ struct perf_event *child;
+ int err;
+
+ if (event->attr.type != attr->type)
+ return -EINVAL;
+
+ switch (event->attr.type) {
+ case PERF_TYPE_BREAKPOINT:
+ func = perf_event_modify_breakpoint;
+ break;
+ default:
+ /* Place holder for future additions. */
+ return -EOPNOTSUPP;
+ }
+
+ WARN_ON_ONCE(event->ctx->parent_ctx);
+
+ mutex_lock(&event->child_mutex);
+ /*
+ * Event-type-independent attributes must be copied before event-type
+ * modification, which will validate that final attributes match the
+ * source attributes after all relevant attributes have been copied.
+ */
+ perf_event_modify_copy_attr(&event->attr, attr);
+ err = func(event, attr);
+ if (err)
+ goto out;
+ list_for_each_entry(child, &event->child_list, child_list) {
+ perf_event_modify_copy_attr(&child->attr, attr);
+ err = func(child, attr);
+ if (err)
+ goto out;
+ }
+out:
+ mutex_unlock(&event->child_mutex);
+ return err;
+}
+
+static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
+ enum event_type_t event_type)
+{
+ struct perf_event_context *ctx = pmu_ctx->ctx;
+ struct perf_event *event, *tmp;
+ struct pmu *pmu = pmu_ctx->pmu;
+
+ if (ctx->task && !(ctx->is_active & EVENT_ALL)) {
+ struct perf_cpu_pmu_context *cpc = this_cpc(pmu);
+
+ WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
+ cpc->task_epc = NULL;
+ }
+
+ if (!(event_type & EVENT_ALL))
return;
- update_context_time(ctx);
- update_cgrp_time_from_cpuctx(cpuctx);
- if (!ctx->nr_active)
+ perf_pmu_disable(pmu);
+ if (event_type & EVENT_PINNED) {
+ list_for_each_entry_safe(event, tmp,
+ &pmu_ctx->pinned_active,
+ active_list)
+ group_sched_out(event, ctx);
+ }
+
+ if (event_type & EVENT_FLEXIBLE) {
+ list_for_each_entry_safe(event, tmp,
+ &pmu_ctx->flexible_active,
+ active_list)
+ group_sched_out(event, ctx);
+ /*
+ * Since we cleared EVENT_FLEXIBLE, also clear
+ * rotate_necessary, is will be reset by
+ * ctx_flexible_sched_in() when needed.
+ */
+ pmu_ctx->rotate_necessary = 0;
+ }
+ perf_pmu_enable(pmu);
+}
+
+/*
+ * Be very careful with the @pmu argument since this will change ctx state.
+ * The @pmu argument works for ctx_resched(), because that is symmetric in
+ * ctx_sched_out() / ctx_sched_in() usage and the ctx state ends up invariant.
+ *
+ * However, if you were to be asymmetrical, you could end up with messed up
+ * state, eg. ctx->is_active cleared even though most EPCs would still actually
+ * be active.
+ */
+static void
+ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type)
+{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct perf_event_pmu_context *pmu_ctx;
+ int is_active = ctx->is_active;
+ bool cgroup = event_type & EVENT_CGROUP;
+
+ event_type &= ~EVENT_CGROUP;
+
+ lockdep_assert_held(&ctx->lock);
+
+ if (likely(!ctx->nr_events)) {
+ /*
+ * See __perf_remove_from_context().
+ */
+ WARN_ON_ONCE(ctx->is_active);
+ if (ctx->task)
+ WARN_ON_ONCE(cpuctx->task_ctx);
return;
+ }
+
+ /*
+ * Always update time if it was set; not only when it changes.
+ * Otherwise we can 'forget' to update time for any but the last
+ * context we sched out. For example:
+ *
+ * ctx_sched_out(.event_type = EVENT_FLEXIBLE)
+ * ctx_sched_out(.event_type = EVENT_PINNED)
+ *
+ * would only update time for the pinned events.
+ */
+ __ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx);
+
+ /*
+ * CPU-release for the below ->is_active store,
+ * see __load_acquire() in perf_event_time_now()
+ */
+ barrier();
+ ctx->is_active &= ~event_type;
- perf_pmu_disable(ctx->pmu);
- if ((is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) {
- list_for_each_entry(event, &ctx->pinned_groups, group_entry)
- group_sched_out(event, cpuctx, ctx);
+ if (!(ctx->is_active & EVENT_ALL)) {
+ /*
+ * For FROZEN, preserve TIME|FROZEN such that perf_event_time_now()
+ * does not observe a hole. perf_ctx_unlock() will clean up.
+ */
+ if (ctx->is_active & EVENT_FROZEN)
+ ctx->is_active &= EVENT_TIME_FROZEN;
+ else
+ ctx->is_active = 0;
}
- if ((is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) {
- list_for_each_entry(event, &ctx->flexible_groups, group_entry)
- group_sched_out(event, cpuctx, ctx);
+ if (ctx->task) {
+ WARN_ON_ONCE(cpuctx->task_ctx != ctx);
+ if (!(ctx->is_active & EVENT_ALL))
+ cpuctx->task_ctx = NULL;
}
- perf_pmu_enable(ctx->pmu);
+
+ is_active ^= ctx->is_active; /* changed bits */
+
+ for_each_epc(pmu_ctx, ctx, pmu, cgroup)
+ __pmu_ctx_sched_out(pmu_ctx, is_active);
}
/*
@@ -2460,18 +3595,9 @@ static void __perf_event_sync_stat(struct perf_event *event,
* we know the event must be on the current CPU, therefore we
* don't need to use it.
*/
- switch (event->state) {
- case PERF_EVENT_STATE_ACTIVE:
- event->pmu->read(event);
- /* fall-through */
+ perf_pmu_read(event);
- case PERF_EVENT_STATE_INACTIVE:
- update_event_times(event);
- break;
-
- default:
- break;
- }
+ perf_event_update_time(event);
/*
* In order to keep per-task stats reliable we need to flip the event
@@ -2517,24 +3643,33 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
}
}
-static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
- struct task_struct *next)
+static void perf_ctx_sched_task_cb(struct perf_event_context *ctx,
+ struct task_struct *task, bool sched_in)
{
- struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
+ struct perf_event_pmu_context *pmu_ctx;
+ struct perf_cpu_pmu_context *cpc;
+
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+ cpc = this_cpc(pmu_ctx->pmu);
+
+ if (cpc->sched_cb_usage && pmu_ctx->pmu->sched_task)
+ pmu_ctx->pmu->sched_task(pmu_ctx, task, sched_in);
+ }
+}
+
+static void
+perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
+{
+ struct perf_event_context *ctx = task->perf_event_ctxp;
struct perf_event_context *next_ctx;
struct perf_event_context *parent, *next_parent;
- struct perf_cpu_context *cpuctx;
int do_switch = 1;
if (likely(!ctx))
return;
- cpuctx = __get_cpu_context(ctx);
- if (!cpuctx->task_ctx)
- return;
-
rcu_read_lock();
- next_ctx = next->perf_event_ctxp[ctxn];
+ next_ctx = rcu_dereference(next->perf_event_ctxp);
if (!next_ctx)
goto unlock;
@@ -2558,16 +3693,42 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
raw_spin_lock(&ctx->lock);
raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
if (context_equiv(ctx, next_ctx)) {
+
+ perf_ctx_disable(ctx, false);
+
+ /* PMIs are disabled; ctx->nr_no_switch_fast is stable. */
+ if (local_read(&ctx->nr_no_switch_fast) ||
+ local_read(&next_ctx->nr_no_switch_fast)) {
+ /*
+ * Must not swap out ctx when there's pending
+ * events that rely on the ctx->task relation.
+ *
+ * Likewise, when a context contains inherit +
+ * SAMPLE_READ events they should be switched
+ * out using the slow path so that they are
+ * treated as if they were distinct contexts.
+ */
+ raw_spin_unlock(&next_ctx->lock);
+ rcu_read_unlock();
+ goto inside_switch;
+ }
+
+ WRITE_ONCE(ctx->task, next);
+ WRITE_ONCE(next_ctx->task, task);
+
+ perf_ctx_sched_task_cb(ctx, task, false);
+
+ perf_ctx_enable(ctx, false);
+
/*
- * XXX do we need a memory barrier of sorts
- * wrt to rcu_dereference() of perf_event_ctxp
+ * RCU_INIT_POINTER here is safe because we've not
+ * modified the ctx and the above modification of
+ * ctx->task is immaterial since this value is
+ * always verified under ctx->lock which we're now
+ * holding.
*/
- task->perf_event_ctxp[ctxn] = next_ctx;
- next->perf_event_ctxp[ctxn] = ctx;
- ctx->task = next;
- next_ctx->task = task;
-
- swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
+ RCU_INIT_POINTER(task->perf_event_ctxp, next_ctx);
+ RCU_INIT_POINTER(next->perf_event_ctxp, ctx);
do_switch = 0;
@@ -2581,64 +3742,89 @@ unlock:
if (do_switch) {
raw_spin_lock(&ctx->lock);
- ctx_sched_out(ctx, cpuctx, EVENT_ALL);
- cpuctx->task_ctx = NULL;
+ perf_ctx_disable(ctx, false);
+
+inside_switch:
+ perf_ctx_sched_task_cb(ctx, task, false);
+ task_ctx_sched_out(ctx, NULL, EVENT_ALL);
+
+ perf_ctx_enable(ctx, false);
raw_spin_unlock(&ctx->lock);
}
}
+static DEFINE_PER_CPU(struct list_head, sched_cb_list);
+static DEFINE_PER_CPU(int, perf_sched_cb_usages);
+
void perf_sched_cb_dec(struct pmu *pmu)
{
+ struct perf_cpu_pmu_context *cpc = this_cpc(pmu);
+
this_cpu_dec(perf_sched_cb_usages);
+ barrier();
+
+ if (!--cpc->sched_cb_usage)
+ list_del(&cpc->sched_cb_entry);
}
+
void perf_sched_cb_inc(struct pmu *pmu)
{
+ struct perf_cpu_pmu_context *cpc = this_cpc(pmu);
+
+ if (!cpc->sched_cb_usage++)
+ list_add(&cpc->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
+
+ barrier();
this_cpu_inc(perf_sched_cb_usages);
}
/*
* This function provides the context switch callback to the lower code
* layer. It is invoked ONLY when the context switch callback is enabled.
+ *
+ * This callback is relevant even to per-cpu events; for example multi event
+ * PEBS requires this to provide PID/TID information. This requires we flush
+ * all queued PEBS records before we context switch to a new task.
*/
-static void perf_pmu_sched_task(struct task_struct *prev,
- struct task_struct *next,
- bool sched_in)
+static void __perf_pmu_sched_task(struct perf_cpu_pmu_context *cpc,
+ struct task_struct *task, bool sched_in)
{
- struct perf_cpu_context *cpuctx;
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct pmu *pmu;
- unsigned long flags;
- if (prev == next)
- return;
+ pmu = cpc->epc.pmu;
- local_irq_save(flags);
-
- rcu_read_lock();
-
- list_for_each_entry_rcu(pmu, &pmus, entry) {
- if (pmu->sched_task) {
- cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
-
- perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+ /* software PMUs will not have sched_task */
+ if (WARN_ON_ONCE(!pmu->sched_task))
+ return;
- perf_pmu_disable(pmu);
+ perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+ perf_pmu_disable(pmu);
- pmu->sched_task(cpuctx->task_ctx, sched_in);
+ pmu->sched_task(cpc->task_epc, task, sched_in);
- perf_pmu_enable(pmu);
+ perf_pmu_enable(pmu);
+ perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+}
- perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
- }
- }
+static void perf_pmu_sched_task(struct task_struct *prev,
+ struct task_struct *next,
+ bool sched_in)
+{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct perf_cpu_pmu_context *cpc;
- rcu_read_unlock();
+ /* cpuctx->task_ctx will be handled in perf_event_context_sched_in/out */
+ if (prev == next || cpuctx->task_ctx)
+ return;
- local_irq_restore(flags);
+ list_for_each_entry(cpc, this_cpu_ptr(&sched_cb_list), sched_cb_entry)
+ __perf_pmu_sched_task(cpc, sched_in ? next : prev, sched_in);
}
-#define for_each_task_context_nr(ctxn) \
- for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
+static void perf_event_switch(struct task_struct *task,
+ struct task_struct *next_prev, bool sched_in);
/*
* Called from scheduler to remove the events of the current task,
@@ -2654,167 +3840,327 @@ static void perf_pmu_sched_task(struct task_struct *prev,
void __perf_event_task_sched_out(struct task_struct *task,
struct task_struct *next)
{
- int ctxn;
-
if (__this_cpu_read(perf_sched_cb_usages))
perf_pmu_sched_task(task, next, false);
- for_each_task_context_nr(ctxn)
- perf_event_context_sched_out(task, ctxn, next);
+ if (atomic_read(&nr_switch_events))
+ perf_event_switch(task, next, false);
+
+ perf_event_context_sched_out(task, next);
/*
* if cgroup events exist on this CPU, then we need
* to check if we have to switch out PMU state.
* cgroup event are system-wide mode only
*/
- if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
- perf_cgroup_sched_out(task, next);
+ perf_cgroup_switch(next);
}
-static void task_ctx_sched_out(struct perf_event_context *ctx)
+static bool perf_less_group_idx(const void *l, const void *r, void __always_unused *args)
{
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ const struct perf_event *le = *(const struct perf_event **)l;
+ const struct perf_event *re = *(const struct perf_event **)r;
- if (!cpuctx->task_ctx)
- return;
+ return le->group_index < re->group_index;
+}
- if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
+DEFINE_MIN_HEAP(struct perf_event *, perf_event_min_heap);
+
+static const struct min_heap_callbacks perf_min_heap = {
+ .less = perf_less_group_idx,
+ .swp = NULL,
+};
+
+static void __heap_add(struct perf_event_min_heap *heap, struct perf_event *event)
+{
+ struct perf_event **itrs = heap->data;
+
+ if (event) {
+ itrs[heap->nr] = event;
+ heap->nr++;
+ }
+}
+
+static void __link_epc(struct perf_event_pmu_context *pmu_ctx)
+{
+ struct perf_cpu_pmu_context *cpc;
+
+ if (!pmu_ctx->ctx->task)
return;
- ctx_sched_out(ctx, cpuctx, EVENT_ALL);
- cpuctx->task_ctx = NULL;
+ cpc = this_cpc(pmu_ctx->pmu);
+ WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
+ cpc->task_epc = pmu_ctx;
+}
+
+static noinline int visit_groups_merge(struct perf_event_context *ctx,
+ struct perf_event_groups *groups, int cpu,
+ struct pmu *pmu,
+ int (*func)(struct perf_event *, void *),
+ void *data)
+{
+#ifdef CONFIG_CGROUP_PERF
+ struct cgroup_subsys_state *css = NULL;
+#endif
+ struct perf_cpu_context *cpuctx = NULL;
+ /* Space for per CPU and/or any CPU event iterators. */
+ struct perf_event *itrs[2];
+ struct perf_event_min_heap event_heap;
+ struct perf_event **evt;
+ int ret;
+
+ if (pmu->filter && pmu->filter(pmu, cpu))
+ return 0;
+
+ if (!ctx->task) {
+ cpuctx = this_cpu_ptr(&perf_cpu_context);
+ event_heap = (struct perf_event_min_heap){
+ .data = cpuctx->heap,
+ .nr = 0,
+ .size = cpuctx->heap_size,
+ };
+
+ lockdep_assert_held(&cpuctx->ctx.lock);
+
+#ifdef CONFIG_CGROUP_PERF
+ if (cpuctx->cgrp)
+ css = &cpuctx->cgrp->css;
+#endif
+ } else {
+ event_heap = (struct perf_event_min_heap){
+ .data = itrs,
+ .nr = 0,
+ .size = ARRAY_SIZE(itrs),
+ };
+ /* Events not within a CPU context may be on any CPU. */
+ __heap_add(&event_heap, perf_event_groups_first(groups, -1, pmu, NULL));
+ }
+ evt = event_heap.data;
+
+ __heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, NULL));
+
+#ifdef CONFIG_CGROUP_PERF
+ for (; css; css = css->parent)
+ __heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, css->cgroup));
+#endif
+
+ if (event_heap.nr) {
+ __link_epc((*evt)->pmu_ctx);
+ perf_assert_pmu_disabled((*evt)->pmu_ctx->pmu);
+ }
+
+ min_heapify_all_inline(&event_heap, &perf_min_heap, NULL);
+
+ while (event_heap.nr) {
+ ret = func(*evt, data);
+ if (ret)
+ return ret;
+
+ *evt = perf_event_groups_next(*evt, pmu);
+ if (*evt)
+ min_heap_sift_down_inline(&event_heap, 0, &perf_min_heap, NULL);
+ else
+ min_heap_pop_inline(&event_heap, &perf_min_heap, NULL);
+ }
+
+ return 0;
}
/*
- * Called with IRQs disabled
+ * Because the userpage is strictly per-event (there is no concept of context,
+ * so there cannot be a context indirection), every userpage must be updated
+ * when context time starts :-(
+ *
+ * IOW, we must not miss EVENT_TIME edges.
*/
-static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
- enum event_type_t event_type)
+static inline bool event_update_userpage(struct perf_event *event)
{
- ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
+ if (likely(!refcount_read(&event->mmap_count)))
+ return false;
+
+ perf_event_update_time(event);
+ perf_event_update_userpage(event);
+
+ return true;
}
-static void
-ctx_pinned_sched_in(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx)
+static inline void group_update_userpage(struct perf_event *group_event)
{
struct perf_event *event;
- list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
- if (event->state <= PERF_EVENT_STATE_OFF)
- continue;
- if (!event_filter_match(event))
- continue;
+ if (!event_update_userpage(group_event))
+ return;
- /* may need to reset tstamp_enabled */
- if (is_cgroup_event(event))
- perf_cgroup_mark_enabled(event, ctx);
+ for_each_sibling_event(event, group_event)
+ event_update_userpage(event);
+}
- if (group_can_go_on(event, cpuctx, 1))
- group_sched_in(event, cpuctx, ctx);
+static int merge_sched_in(struct perf_event *event, void *data)
+{
+ struct perf_event_context *ctx = event->ctx;
+ int *can_add_hw = data;
- /*
- * If this pinned group hasn't been scheduled,
- * put it in error state.
- */
- if (event->state == PERF_EVENT_STATE_INACTIVE) {
- update_group_times(event);
- event->state = PERF_EVENT_STATE_ERROR;
+ if (event->state <= PERF_EVENT_STATE_OFF)
+ return 0;
+
+ if (!event_filter_match(event))
+ return 0;
+
+ if (group_can_go_on(event, *can_add_hw)) {
+ if (!group_sched_in(event, ctx))
+ list_add_tail(&event->active_list, get_event_list(event));
+ }
+
+ if (event->state == PERF_EVENT_STATE_INACTIVE) {
+ *can_add_hw = 0;
+ if (event->attr.pinned) {
+ perf_cgroup_event_disable(event, ctx);
+ perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
+
+ if (*perf_event_fasync(event))
+ event->pending_kill = POLL_ERR;
+
+ perf_event_wakeup(event);
+ } else {
+ struct perf_cpu_pmu_context *cpc = this_cpc(event->pmu_ctx->pmu);
+
+ event->pmu_ctx->rotate_necessary = 1;
+ perf_mux_hrtimer_restart(cpc);
+ group_update_userpage(event);
}
}
+
+ return 0;
}
-static void
-ctx_flexible_sched_in(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx)
+static void pmu_groups_sched_in(struct perf_event_context *ctx,
+ struct perf_event_groups *groups,
+ struct pmu *pmu)
{
- struct perf_event *event;
int can_add_hw = 1;
+ visit_groups_merge(ctx, groups, smp_processor_id(), pmu,
+ merge_sched_in, &can_add_hw);
+}
- list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
- /* Ignore events in OFF or ERROR state */
- if (event->state <= PERF_EVENT_STATE_OFF)
- continue;
- /*
- * Listen to the 'cpu' scheduling filter constraint
- * of events:
- */
- if (!event_filter_match(event))
- continue;
-
- /* may need to reset tstamp_enabled */
- if (is_cgroup_event(event))
- perf_cgroup_mark_enabled(event, ctx);
+static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx,
+ enum event_type_t event_type)
+{
+ struct perf_event_context *ctx = pmu_ctx->ctx;
- if (group_can_go_on(event, cpuctx, can_add_hw)) {
- if (group_sched_in(event, cpuctx, ctx))
- can_add_hw = 0;
- }
- }
+ if (event_type & EVENT_PINNED)
+ pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu);
+ if (event_type & EVENT_FLEXIBLE)
+ pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu);
}
static void
-ctx_sched_in(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx,
- enum event_type_t event_type,
- struct task_struct *task)
+ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type)
{
- u64 now;
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct perf_event_pmu_context *pmu_ctx;
int is_active = ctx->is_active;
+ bool cgroup = event_type & EVENT_CGROUP;
+
+ event_type &= ~EVENT_CGROUP;
+
+ lockdep_assert_held(&ctx->lock);
- ctx->is_active |= event_type;
if (likely(!ctx->nr_events))
return;
- now = perf_clock();
- ctx->timestamp = now;
- perf_cgroup_set_timestamp(task, ctx);
+ if (!(is_active & EVENT_TIME)) {
+ /* start ctx time */
+ __update_context_time(ctx, false);
+ perf_cgroup_set_timestamp(cpuctx);
+ /*
+ * CPU-release for the below ->is_active store,
+ * see __load_acquire() in perf_event_time_now()
+ */
+ barrier();
+ }
+
+ ctx->is_active |= (event_type | EVENT_TIME);
+ if (ctx->task) {
+ if (!(is_active & EVENT_ALL))
+ cpuctx->task_ctx = ctx;
+ else
+ WARN_ON_ONCE(cpuctx->task_ctx != ctx);
+ }
+
+ is_active ^= ctx->is_active; /* changed bits */
+
/*
* First go through the list and put on any pinned groups
* in order to give them the best chance of going on.
*/
- if (!(is_active & EVENT_PINNED) && (event_type & EVENT_PINNED))
- ctx_pinned_sched_in(ctx, cpuctx);
+ if (is_active & EVENT_PINNED) {
+ for_each_epc(pmu_ctx, ctx, pmu, cgroup)
+ __pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED);
+ }
/* Then walk through the lower prio flexible groups */
- if (!(is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE))
- ctx_flexible_sched_in(ctx, cpuctx);
+ if (is_active & EVENT_FLEXIBLE) {
+ for_each_epc(pmu_ctx, ctx, pmu, cgroup)
+ __pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE);
+ }
}
-static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
- enum event_type_t event_type,
- struct task_struct *task)
+static void perf_event_context_sched_in(struct task_struct *task)
{
- struct perf_event_context *ctx = &cpuctx->ctx;
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct perf_event_context *ctx;
- ctx_sched_in(ctx, cpuctx, event_type, task);
-}
+ rcu_read_lock();
+ ctx = rcu_dereference(task->perf_event_ctxp);
+ if (!ctx)
+ goto rcu_unlock;
-static void perf_event_context_sched_in(struct perf_event_context *ctx,
- struct task_struct *task)
-{
- struct perf_cpu_context *cpuctx;
+ if (cpuctx->task_ctx == ctx) {
+ perf_ctx_lock(cpuctx, ctx);
+ perf_ctx_disable(ctx, false);
- cpuctx = __get_cpu_context(ctx);
- if (cpuctx->task_ctx == ctx)
- return;
+ perf_ctx_sched_task_cb(ctx, task, true);
+
+ perf_ctx_enable(ctx, false);
+ perf_ctx_unlock(cpuctx, ctx);
+ goto rcu_unlock;
+ }
perf_ctx_lock(cpuctx, ctx);
- perf_pmu_disable(ctx->pmu);
+ /*
+ * We must check ctx->nr_events while holding ctx->lock, such
+ * that we serialize against perf_install_in_context().
+ */
+ if (!ctx->nr_events)
+ goto unlock;
+
+ perf_ctx_disable(ctx, false);
/*
* We want to keep the following priority order:
* cpu pinned (that don't need to move), task pinned,
* cpu flexible, task flexible.
+ *
+ * However, if task's ctx is not carrying any pinned
+ * events, no need to flip the cpuctx's events around.
*/
- cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+ if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) {
+ perf_ctx_disable(&cpuctx->ctx, false);
+ ctx_sched_out(&cpuctx->ctx, NULL, EVENT_FLEXIBLE);
+ }
+
+ perf_event_sched_in(cpuctx, ctx, NULL);
- if (ctx->nr_events)
- cpuctx->task_ctx = ctx;
+ perf_ctx_sched_task_cb(cpuctx->task_ctx, task, true);
- perf_event_sched_in(cpuctx, cpuctx->task_ctx, task);
+ if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
+ perf_ctx_enable(&cpuctx->ctx, false);
- perf_pmu_enable(ctx->pmu);
+ perf_ctx_enable(ctx, false);
+
+unlock:
perf_ctx_unlock(cpuctx, ctx);
+rcu_unlock:
+ rcu_read_unlock();
}
/*
@@ -2831,23 +4177,10 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
void __perf_event_task_sched_in(struct task_struct *prev,
struct task_struct *task)
{
- struct perf_event_context *ctx;
- int ctxn;
+ perf_event_context_sched_in(task);
- for_each_task_context_nr(ctxn) {
- ctx = task->perf_event_ctxp[ctxn];
- if (likely(!ctx))
- continue;
-
- perf_event_context_sched_in(ctx, task);
- }
- /*
- * if cgroup events exist on this CPU, then we need
- * to check if we have to switch in PMU state.
- * cgroup event are system-wide mode only
- */
- if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
- perf_cgroup_sched_in(prev, task);
+ if (atomic_read(&nr_switch_events))
+ perf_event_switch(task, prev, true);
if (__this_cpu_read(perf_sched_cb_usages))
perf_pmu_sched_task(prev, task, true);
@@ -2938,7 +4271,11 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bo
period = perf_calculate_period(event, nsec, count);
delta = (s64)(period - hwc->sample_period);
- delta = (delta + 7) / 8; /* low pass filter */
+ if (delta >= 0)
+ delta += 7;
+ else
+ delta -= 7;
+ delta /= 8; /* low pass filter */
sample_period = hwc->sample_period + delta;
@@ -2958,49 +4295,28 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bo
}
}
-/*
- * combine freq adjustment with unthrottling to avoid two passes over the
- * events. At the same time, make sure, having freq events does not change
- * the rate of unthrottling as that would introduce bias.
- */
-static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
- int needs_unthr)
+static void perf_adjust_freq_unthr_events(struct list_head *event_list)
{
struct perf_event *event;
struct hw_perf_event *hwc;
u64 now, period = TICK_NSEC;
s64 delta;
- /*
- * only need to iterate over all events iff:
- * - context have events in frequency mode (needs freq adjust)
- * - there are events to unthrottle on this cpu
- */
- if (!(ctx->nr_freq || needs_unthr))
- return;
-
- raw_spin_lock(&ctx->lock);
- perf_pmu_disable(ctx->pmu);
-
- list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
+ list_for_each_entry(event, event_list, active_list) {
if (event->state != PERF_EVENT_STATE_ACTIVE)
continue;
+ // XXX use visit thingy to avoid the -1,cpu match
if (!event_filter_match(event))
continue;
- perf_pmu_disable(event->pmu);
-
hwc = &event->hw;
- if (hwc->interrupts == MAX_INTERRUPTS) {
- hwc->interrupts = 0;
- perf_log_throttle(event, 1);
- event->pmu->start(event, 0);
- }
+ if (hwc->interrupts == MAX_INTERRUPTS)
+ perf_event_unthrottle_group(event, is_event_in_freq_mode(event));
- if (!event->attr.freq || !event->attr.sample_freq)
- goto next;
+ if (!is_event_in_freq_mode(event))
+ continue;
/*
* stop the event and update event->count
@@ -3022,90 +4338,191 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
perf_adjust_period(event, period, delta, false);
event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
- next:
- perf_pmu_enable(event->pmu);
+ }
+}
+
+/*
+ * combine freq adjustment with unthrottling to avoid two passes over the
+ * events. At the same time, make sure, having freq events does not change
+ * the rate of unthrottling as that would introduce bias.
+ */
+static void
+perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
+{
+ struct perf_event_pmu_context *pmu_ctx;
+
+ /*
+ * only need to iterate over all events iff:
+ * - context have events in frequency mode (needs freq adjust)
+ * - there are events to unthrottle on this cpu
+ */
+ if (!(ctx->nr_freq || unthrottle))
+ return;
+
+ raw_spin_lock(&ctx->lock);
+
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+ if (!(pmu_ctx->nr_freq || unthrottle))
+ continue;
+ if (!perf_pmu_ctx_is_active(pmu_ctx))
+ continue;
+ if (pmu_ctx->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT)
+ continue;
+
+ perf_pmu_disable(pmu_ctx->pmu);
+ perf_adjust_freq_unthr_events(&pmu_ctx->pinned_active);
+ perf_adjust_freq_unthr_events(&pmu_ctx->flexible_active);
+ perf_pmu_enable(pmu_ctx->pmu);
}
- perf_pmu_enable(ctx->pmu);
raw_spin_unlock(&ctx->lock);
}
/*
- * Round-robin a context's events:
+ * Move @event to the tail of the @ctx's elegible events.
*/
-static void rotate_ctx(struct perf_event_context *ctx)
+static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event)
{
/*
* Rotate the first entry last of non-pinned groups. Rotation might be
* disabled by the inheritance code.
*/
- if (!ctx->rotate_disable)
- list_rotate_left(&ctx->flexible_groups);
+ if (ctx->rotate_disable)
+ return;
+
+ perf_event_groups_delete(&ctx->flexible_groups, event);
+ perf_event_groups_insert(&ctx->flexible_groups, event);
}
-static int perf_rotate_context(struct perf_cpu_context *cpuctx)
+/* pick an event from the flexible_groups to rotate */
+static inline struct perf_event *
+ctx_event_to_rotate(struct perf_event_pmu_context *pmu_ctx)
{
- struct perf_event_context *ctx = NULL;
- int rotate = 0;
+ struct perf_event *event;
+ struct rb_node *node;
+ struct rb_root *tree;
+ struct __group_key key = {
+ .pmu = pmu_ctx->pmu,
+ };
+
+ /* pick the first active flexible event */
+ event = list_first_entry_or_null(&pmu_ctx->flexible_active,
+ struct perf_event, active_list);
+ if (event)
+ goto out;
+
+ /* if no active flexible event, pick the first event */
+ tree = &pmu_ctx->ctx->flexible_groups.tree;
+
+ if (!pmu_ctx->ctx->task) {
+ key.cpu = smp_processor_id();
- if (cpuctx->ctx.nr_events) {
- if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
- rotate = 1;
+ node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup);
+ if (node)
+ event = __node_2_pe(node);
+ goto out;
}
- ctx = cpuctx->task_ctx;
- if (ctx && ctx->nr_events) {
- if (ctx->nr_events != ctx->nr_active)
- rotate = 1;
+ key.cpu = -1;
+ node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup);
+ if (node) {
+ event = __node_2_pe(node);
+ goto out;
}
- if (!rotate)
- goto done;
+ key.cpu = smp_processor_id();
+ node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup);
+ if (node)
+ event = __node_2_pe(node);
- perf_ctx_lock(cpuctx, cpuctx->task_ctx);
- perf_pmu_disable(cpuctx->ctx.pmu);
+out:
+ /*
+ * Unconditionally clear rotate_necessary; if ctx_flexible_sched_in()
+ * finds there are unschedulable events, it will set it again.
+ */
+ pmu_ctx->rotate_necessary = 0;
- cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
- if (ctx)
- ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
+ return event;
+}
- rotate_ctx(&cpuctx->ctx);
- if (ctx)
- rotate_ctx(ctx);
+static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc)
+{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct perf_event_pmu_context *cpu_epc, *task_epc = NULL;
+ struct perf_event *cpu_event = NULL, *task_event = NULL;
+ int cpu_rotate, task_rotate;
+ struct pmu *pmu;
- perf_event_sched_in(cpuctx, ctx, current);
+ /*
+ * Since we run this from IRQ context, nobody can install new
+ * events, thus the event count values are stable.
+ */
- perf_pmu_enable(cpuctx->ctx.pmu);
- perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
-done:
+ cpu_epc = &cpc->epc;
+ pmu = cpu_epc->pmu;
+ task_epc = cpc->task_epc;
- return rotate;
-}
+ cpu_rotate = cpu_epc->rotate_necessary;
+ task_rotate = task_epc ? task_epc->rotate_necessary : 0;
-#ifdef CONFIG_NO_HZ_FULL
-bool perf_event_can_stop_tick(void)
-{
- if (atomic_read(&nr_freq_events) ||
- __this_cpu_read(perf_throttled_count))
+ if (!(cpu_rotate || task_rotate))
return false;
- else
- return true;
+
+ perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+ perf_pmu_disable(pmu);
+
+ if (task_rotate)
+ task_event = ctx_event_to_rotate(task_epc);
+ if (cpu_rotate)
+ cpu_event = ctx_event_to_rotate(cpu_epc);
+
+ /*
+ * As per the order given at ctx_resched() first 'pop' task flexible
+ * and then, if needed CPU flexible.
+ */
+ if (task_event || (task_epc && cpu_event)) {
+ update_context_time(task_epc->ctx);
+ __pmu_ctx_sched_out(task_epc, EVENT_FLEXIBLE);
+ }
+
+ if (cpu_event) {
+ update_context_time(&cpuctx->ctx);
+ __pmu_ctx_sched_out(cpu_epc, EVENT_FLEXIBLE);
+ rotate_ctx(&cpuctx->ctx, cpu_event);
+ __pmu_ctx_sched_in(cpu_epc, EVENT_FLEXIBLE);
+ }
+
+ if (task_event)
+ rotate_ctx(task_epc->ctx, task_event);
+
+ if (task_event || (task_epc && cpu_event))
+ __pmu_ctx_sched_in(task_epc, EVENT_FLEXIBLE);
+
+ perf_pmu_enable(pmu);
+ perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+
+ return true;
}
-#endif
void perf_event_task_tick(void)
{
- struct list_head *head = this_cpu_ptr(&active_ctx_list);
- struct perf_event_context *ctx, *tmp;
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct perf_event_context *ctx;
int throttled;
- WARN_ON(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
__this_cpu_inc(perf_throttled_seq);
throttled = __this_cpu_xchg(perf_throttled_count, 0);
+ tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
- list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
- perf_adjust_freq_unthr_context(ctx, throttled);
+ perf_adjust_freq_unthr_context(&cpuctx->ctx, !!throttled);
+
+ rcu_read_lock();
+ ctx = rcu_dereference(current->perf_event_ctxp);
+ if (ctx)
+ perf_adjust_freq_unthr_context(ctx, !!throttled);
+ rcu_read_unlock();
}
static int event_enable_on_exec(struct perf_event *event,
@@ -3118,7 +4535,7 @@ static int event_enable_on_exec(struct perf_event *event,
if (event->state >= PERF_EVENT_STATE_INACTIVE)
return 0;
- __perf_event_mark_enabled(event);
+ perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
return 1;
}
@@ -3130,45 +4547,37 @@ static int event_enable_on_exec(struct perf_event *event,
static void perf_event_enable_on_exec(struct perf_event_context *ctx)
{
struct perf_event_context *clone_ctx = NULL;
+ enum event_type_t event_type = 0;
+ struct perf_cpu_context *cpuctx;
struct perf_event *event;
unsigned long flags;
int enabled = 0;
- int ret;
local_irq_save(flags);
- if (!ctx || !ctx->nr_events)
+ if (WARN_ON_ONCE(current->perf_event_ctxp != ctx))
goto out;
- /*
- * We must ctxsw out cgroup events to avoid conflict
- * when invoking perf_task_event_sched_in() later on
- * in this function. Otherwise we end up trying to
- * ctxswin cgroup events which are already scheduled
- * in.
- */
- perf_cgroup_sched_out(current, NULL);
+ if (!ctx->nr_events)
+ goto out;
- raw_spin_lock(&ctx->lock);
- task_ctx_sched_out(ctx);
+ cpuctx = this_cpu_ptr(&perf_cpu_context);
+ perf_ctx_lock(cpuctx, ctx);
+ ctx_time_freeze(cpuctx, ctx);
list_for_each_entry(event, &ctx->event_list, event_entry) {
- ret = event_enable_on_exec(event, ctx);
- if (ret)
- enabled = 1;
+ enabled |= event_enable_on_exec(event, ctx);
+ event_type |= get_event_type(event);
}
/*
- * Unclone this context if we enabled any event.
+ * Unclone and reschedule this context if we enabled any event.
*/
- if (enabled)
+ if (enabled) {
clone_ctx = unclone_ctx(ctx);
+ ctx_resched(cpuctx, ctx, NULL, event_type);
+ }
+ perf_ctx_unlock(cpuctx, ctx);
- raw_spin_unlock(&ctx->lock);
-
- /*
- * Also calls ctxswin for cgroup events, if any:
- */
- perf_event_context_sched_in(ctx, ctx->task);
out:
local_irq_restore(flags);
@@ -3176,20 +4585,83 @@ out:
put_ctx(clone_ctx);
}
-void perf_event_exec(void)
+static void perf_remove_from_owner(struct perf_event *event);
+static void perf_event_exit_event(struct perf_event *event,
+ struct perf_event_context *ctx,
+ bool revoke);
+
+/*
+ * Removes all events from the current task that have been marked
+ * remove-on-exec, and feeds their values back to parent events.
+ */
+static void perf_event_remove_on_exec(struct perf_event_context *ctx)
{
- struct perf_event_context *ctx;
- int ctxn;
+ struct perf_event_context *clone_ctx = NULL;
+ struct perf_event *event, *next;
+ unsigned long flags;
+ bool modified = false;
- rcu_read_lock();
- for_each_task_context_nr(ctxn) {
- ctx = current->perf_event_ctxp[ctxn];
- if (!ctx)
+ mutex_lock(&ctx->mutex);
+
+ if (WARN_ON_ONCE(ctx->task != current))
+ goto unlock;
+
+ list_for_each_entry_safe(event, next, &ctx->event_list, event_entry) {
+ if (!event->attr.remove_on_exec)
continue;
- perf_event_enable_on_exec(ctx);
+ if (!is_kernel_event(event))
+ perf_remove_from_owner(event);
+
+ modified = true;
+
+ perf_event_exit_event(event, ctx, false);
}
- rcu_read_unlock();
+
+ raw_spin_lock_irqsave(&ctx->lock, flags);
+ if (modified)
+ clone_ctx = unclone_ctx(ctx);
+ raw_spin_unlock_irqrestore(&ctx->lock, flags);
+
+unlock:
+ mutex_unlock(&ctx->mutex);
+
+ if (clone_ctx)
+ put_ctx(clone_ctx);
+}
+
+struct perf_read_data {
+ struct perf_event *event;
+ bool group;
+ int ret;
+};
+
+static inline const struct cpumask *perf_scope_cpu_topology_cpumask(unsigned int scope, int cpu);
+
+static int __perf_event_read_cpu(struct perf_event *event, int event_cpu)
+{
+ int local_cpu = smp_processor_id();
+ u16 local_pkg, event_pkg;
+
+ if ((unsigned)event_cpu >= nr_cpu_ids)
+ return event_cpu;
+
+ if (event->group_caps & PERF_EV_CAP_READ_SCOPE) {
+ const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(event->pmu->scope, event_cpu);
+
+ if (cpumask && cpumask_test_cpu(local_cpu, cpumask))
+ return local_cpu;
+ }
+
+ if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
+ event_pkg = topology_physical_package_id(event_cpu);
+ local_pkg = topology_physical_package_id(local_cpu);
+
+ if (event_pkg == local_pkg)
+ return local_cpu;
+ }
+
+ return event_cpu;
}
/*
@@ -3197,9 +4669,11 @@ void perf_event_exec(void)
*/
static void __perf_event_read(void *info)
{
- struct perf_event *event = info;
+ struct perf_read_data *data = info;
+ struct perf_event *sub, *event = data->event;
struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct pmu *pmu = event->pmu;
/*
* If this is a task context, we need to check whether it is
@@ -3212,52 +4686,208 @@ static void __perf_event_read(void *info)
return;
raw_spin_lock(&ctx->lock);
- if (ctx->is_active) {
- update_context_time(ctx);
- update_cgrp_time_from_event(event);
+ ctx_time_update_event(ctx, event);
+
+ perf_event_update_time(event);
+ if (data->group)
+ perf_event_update_sibling_time(event);
+
+ if (event->state != PERF_EVENT_STATE_ACTIVE)
+ goto unlock;
+
+ if (!data->group) {
+ pmu->read(event);
+ data->ret = 0;
+ goto unlock;
}
- update_event_times(event);
- if (event->state == PERF_EVENT_STATE_ACTIVE)
- event->pmu->read(event);
+
+ pmu->start_txn(pmu, PERF_PMU_TXN_READ);
+
+ pmu->read(event);
+
+ for_each_sibling_event(sub, event)
+ perf_pmu_read(sub);
+
+ data->ret = pmu->commit_txn(pmu);
+
+unlock:
raw_spin_unlock(&ctx->lock);
}
-static inline u64 perf_event_count(struct perf_event *event)
+static inline u64 perf_event_count(struct perf_event *event, bool self)
+{
+ if (self)
+ return local64_read(&event->count);
+
+ return local64_read(&event->count) + atomic64_read(&event->child_count);
+}
+
+static void calc_timer_values(struct perf_event *event,
+ u64 *now,
+ u64 *enabled,
+ u64 *running)
{
- if (event->pmu->count)
- return event->pmu->count(event);
+ u64 ctx_time;
- return __perf_event_count(event);
+ *now = perf_clock();
+ ctx_time = perf_event_time_now(event, *now);
+ __perf_update_times(event, ctx_time, enabled, running);
}
-static u64 perf_event_read(struct perf_event *event)
+/*
+ * NMI-safe method to read a local event, that is an event that
+ * is:
+ * - either for the current task, or for this CPU
+ * - does not have inherit set, for inherited task events
+ * will not be local and we cannot read them atomically
+ * - must not have a pmu::count method
+ */
+int perf_event_read_local(struct perf_event *event, u64 *value,
+ u64 *enabled, u64 *running)
{
+ unsigned long flags;
+ int event_oncpu;
+ int event_cpu;
+ int ret = 0;
+
+ /*
+ * Disabling interrupts avoids all counter scheduling (context
+ * switches, timer based rotation and IPIs).
+ */
+ local_irq_save(flags);
+
+ /*
+ * It must not be an event with inherit set, we cannot read
+ * all child counters from atomic context.
+ */
+ if (event->attr.inherit) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ /* If this is a per-task event, it must be for current */
+ if ((event->attach_state & PERF_ATTACH_TASK) &&
+ event->hw.target != current) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /*
+ * Get the event CPU numbers, and adjust them to local if the event is
+ * a per-package event that can be read locally
+ */
+ event_oncpu = __perf_event_read_cpu(event, event->oncpu);
+ event_cpu = __perf_event_read_cpu(event, event->cpu);
+
+ /* If this is a per-CPU event, it must be for this CPU */
+ if (!(event->attach_state & PERF_ATTACH_TASK) &&
+ event_cpu != smp_processor_id()) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* If this is a pinned event it must be running on this CPU */
+ if (event->attr.pinned && event_oncpu != smp_processor_id()) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ /*
+ * If the event is currently on this CPU, its either a per-task event,
+ * or local to this CPU. Furthermore it means its ACTIVE (otherwise
+ * oncpu == -1).
+ */
+ if (event_oncpu == smp_processor_id())
+ event->pmu->read(event);
+
+ *value = local64_read(&event->count);
+ if (enabled || running) {
+ u64 __enabled, __running, __now;
+
+ calc_timer_values(event, &__now, &__enabled, &__running);
+ if (enabled)
+ *enabled = __enabled;
+ if (running)
+ *running = __running;
+ }
+out:
+ local_irq_restore(flags);
+
+ return ret;
+}
+
+static int perf_event_read(struct perf_event *event, bool group)
+{
+ enum perf_event_state state = READ_ONCE(event->state);
+ int event_cpu, ret = 0;
+
/*
* If event is enabled and currently active on a CPU, update the
* value in the event structure:
*/
- if (event->state == PERF_EVENT_STATE_ACTIVE) {
- smp_call_function_single(event->oncpu,
- __perf_event_read, event, 1);
- } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
+again:
+ if (state == PERF_EVENT_STATE_ACTIVE) {
+ struct perf_read_data data;
+
+ /*
+ * Orders the ->state and ->oncpu loads such that if we see
+ * ACTIVE we must also see the right ->oncpu.
+ *
+ * Matches the smp_wmb() from event_sched_in().
+ */
+ smp_rmb();
+
+ event_cpu = READ_ONCE(event->oncpu);
+ if ((unsigned)event_cpu >= nr_cpu_ids)
+ return 0;
+
+ data = (struct perf_read_data){
+ .event = event,
+ .group = group,
+ .ret = 0,
+ };
+
+ preempt_disable();
+ event_cpu = __perf_event_read_cpu(event, event_cpu);
+
+ /*
+ * Purposely ignore the smp_call_function_single() return
+ * value.
+ *
+ * If event_cpu isn't a valid CPU it means the event got
+ * scheduled out and that will have updated the event count.
+ *
+ * Therefore, either way, we'll have an up-to-date event count
+ * after this.
+ */
+ (void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1);
+ preempt_enable();
+ ret = data.ret;
+
+ } else if (state == PERF_EVENT_STATE_INACTIVE) {
struct perf_event_context *ctx = event->ctx;
unsigned long flags;
raw_spin_lock_irqsave(&ctx->lock, flags);
+ state = event->state;
+ if (state != PERF_EVENT_STATE_INACTIVE) {
+ raw_spin_unlock_irqrestore(&ctx->lock, flags);
+ goto again;
+ }
+
/*
- * may read while context is not active
- * (e.g., thread is blocked), in that case
- * we cannot update context time
+ * May read while context is not active (e.g., thread is
+ * blocked), in that case we cannot update context time
*/
- if (ctx->is_active) {
- update_context_time(ctx);
- update_cgrp_time_from_event(event);
- }
- update_event_times(event);
+ ctx_time_update_event(ctx, event);
+
+ perf_event_update_time(event);
+ if (group)
+ perf_event_update_sibling_time(event);
raw_spin_unlock_irqrestore(&ctx->lock, flags);
}
- return perf_event_count(event);
+ return ret;
}
/*
@@ -3267,16 +4897,25 @@ static void __perf_event_init_context(struct perf_event_context *ctx)
{
raw_spin_lock_init(&ctx->lock);
mutex_init(&ctx->mutex);
- INIT_LIST_HEAD(&ctx->active_ctx_list);
- INIT_LIST_HEAD(&ctx->pinned_groups);
- INIT_LIST_HEAD(&ctx->flexible_groups);
+ INIT_LIST_HEAD(&ctx->pmu_ctx_list);
+ perf_event_groups_init(&ctx->pinned_groups);
+ perf_event_groups_init(&ctx->flexible_groups);
INIT_LIST_HEAD(&ctx->event_list);
- atomic_set(&ctx->refcount, 1);
- INIT_DELAYED_WORK(&ctx->orphans_remove, orphans_remove_work);
+ refcount_set(&ctx->refcount, 1);
+}
+
+static void
+__perf_init_event_pmu_context(struct perf_event_pmu_context *epc, struct pmu *pmu)
+{
+ epc->pmu = pmu;
+ INIT_LIST_HEAD(&epc->pmu_ctx_entry);
+ INIT_LIST_HEAD(&epc->pinned_active);
+ INIT_LIST_HEAD(&epc->flexible_active);
+ atomic_set(&epc->refcount, 1);
}
static struct perf_event_context *
-alloc_perf_context(struct pmu *pmu, struct task_struct *task)
+alloc_perf_context(struct task_struct *task)
{
struct perf_event_context *ctx;
@@ -3285,11 +4924,8 @@ alloc_perf_context(struct pmu *pmu, struct task_struct *task)
return NULL;
__perf_event_init_context(ctx);
- if (task) {
- ctx->task = task;
- get_task_struct(task);
- }
- ctx->pmu = pmu;
+ if (task)
+ ctx->task = get_task_struct(task);
return ctx;
}
@@ -3298,7 +4934,6 @@ static struct task_struct *
find_lively_task_by_vpid(pid_t vpid)
{
struct task_struct *task;
- int err;
rcu_read_lock();
if (!vpid)
@@ -3312,91 +4947,53 @@ find_lively_task_by_vpid(pid_t vpid)
if (!task)
return ERR_PTR(-ESRCH);
- /* Reuse ptrace permission checks for now. */
- err = -EACCES;
- if (!ptrace_may_access(task, PTRACE_MODE_READ))
- goto errout;
-
return task;
-errout:
- put_task_struct(task);
- return ERR_PTR(err);
-
}
/*
* Returns a matching context with refcount and pincount.
*/
static struct perf_event_context *
-find_get_context(struct pmu *pmu, struct task_struct *task,
- struct perf_event *event)
+find_get_context(struct task_struct *task, struct perf_event *event)
{
struct perf_event_context *ctx, *clone_ctx = NULL;
struct perf_cpu_context *cpuctx;
- void *task_ctx_data = NULL;
unsigned long flags;
- int ctxn, err;
- int cpu = event->cpu;
+ int err;
if (!task) {
/* Must be root to operate on a CPU event: */
- if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
- return ERR_PTR(-EACCES);
-
- /*
- * We could be clever and allow to attach a event to an
- * offline CPU and activate it when the CPU comes up, but
- * that's for later.
- */
- if (!cpu_online(cpu))
- return ERR_PTR(-ENODEV);
+ err = perf_allow_cpu();
+ if (err)
+ return ERR_PTR(err);
- cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+ cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu);
ctx = &cpuctx->ctx;
get_ctx(ctx);
+ raw_spin_lock_irqsave(&ctx->lock, flags);
++ctx->pin_count;
+ raw_spin_unlock_irqrestore(&ctx->lock, flags);
return ctx;
}
err = -EINVAL;
- ctxn = pmu->task_ctx_nr;
- if (ctxn < 0)
- goto errout;
-
- if (event->attach_state & PERF_ATTACH_TASK_DATA) {
- task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL);
- if (!task_ctx_data) {
- err = -ENOMEM;
- goto errout;
- }
- }
-
retry:
- ctx = perf_lock_task_context(task, ctxn, &flags);
+ ctx = perf_lock_task_context(task, &flags);
if (ctx) {
clone_ctx = unclone_ctx(ctx);
++ctx->pin_count;
- if (task_ctx_data && !ctx->task_ctx_data) {
- ctx->task_ctx_data = task_ctx_data;
- task_ctx_data = NULL;
- }
raw_spin_unlock_irqrestore(&ctx->lock, flags);
if (clone_ctx)
put_ctx(clone_ctx);
} else {
- ctx = alloc_perf_context(pmu, task);
+ ctx = alloc_perf_context(task);
err = -ENOMEM;
if (!ctx)
goto errout;
- if (task_ctx_data) {
- ctx->task_ctx_data = task_ctx_data;
- task_ctx_data = NULL;
- }
-
err = 0;
mutex_lock(&task->perf_event_mutex);
/*
@@ -3405,12 +5002,12 @@ retry:
*/
if (task->flags & PF_EXITING)
err = -ESRCH;
- else if (task->perf_event_ctxp[ctxn])
+ else if (task->perf_event_ctxp)
err = -EAGAIN;
else {
get_ctx(ctx);
++ctx->pin_count;
- rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
+ rcu_assign_pointer(task->perf_event_ctxp, ctx);
}
mutex_unlock(&task->perf_event_mutex);
@@ -3423,61 +5020,486 @@ retry:
}
}
- kfree(task_ctx_data);
return ctx;
errout:
- kfree(task_ctx_data);
return ERR_PTR(err);
}
+static struct perf_event_pmu_context *
+find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx,
+ struct perf_event *event)
+{
+ struct perf_event_pmu_context *new = NULL, *pos = NULL, *epc;
+
+ if (!ctx->task) {
+ /*
+ * perf_pmu_migrate_context() / __perf_pmu_install_event()
+ * relies on the fact that find_get_pmu_context() cannot fail
+ * for CPU contexts.
+ */
+ struct perf_cpu_pmu_context *cpc;
+
+ cpc = *per_cpu_ptr(pmu->cpu_pmu_context, event->cpu);
+ epc = &cpc->epc;
+ raw_spin_lock_irq(&ctx->lock);
+ if (!epc->ctx) {
+ /*
+ * One extra reference for the pmu; see perf_pmu_free().
+ */
+ atomic_set(&epc->refcount, 2);
+ epc->embedded = 1;
+ list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list);
+ epc->ctx = ctx;
+ } else {
+ WARN_ON_ONCE(epc->ctx != ctx);
+ atomic_inc(&epc->refcount);
+ }
+ raw_spin_unlock_irq(&ctx->lock);
+ return epc;
+ }
+
+ new = kzalloc(sizeof(*epc), GFP_KERNEL);
+ if (!new)
+ return ERR_PTR(-ENOMEM);
+
+ __perf_init_event_pmu_context(new, pmu);
+
+ /*
+ * XXX
+ *
+ * lockdep_assert_held(&ctx->mutex);
+ *
+ * can't because perf_event_init_task() doesn't actually hold the
+ * child_ctx->mutex.
+ */
+
+ raw_spin_lock_irq(&ctx->lock);
+ list_for_each_entry(epc, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+ if (epc->pmu == pmu) {
+ WARN_ON_ONCE(epc->ctx != ctx);
+ atomic_inc(&epc->refcount);
+ goto found_epc;
+ }
+ /* Make sure the pmu_ctx_list is sorted by PMU type: */
+ if (!pos && epc->pmu->type > pmu->type)
+ pos = epc;
+ }
+
+ epc = new;
+ new = NULL;
+
+ if (!pos)
+ list_add_tail(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list);
+ else
+ list_add(&epc->pmu_ctx_entry, pos->pmu_ctx_entry.prev);
+
+ epc->ctx = ctx;
+
+found_epc:
+ raw_spin_unlock_irq(&ctx->lock);
+ kfree(new);
+
+ return epc;
+}
+
+static void get_pmu_ctx(struct perf_event_pmu_context *epc)
+{
+ WARN_ON_ONCE(!atomic_inc_not_zero(&epc->refcount));
+}
+
+static void free_cpc_rcu(struct rcu_head *head)
+{
+ struct perf_cpu_pmu_context *cpc =
+ container_of(head, typeof(*cpc), epc.rcu_head);
+
+ kfree(cpc);
+}
+
+static void free_epc_rcu(struct rcu_head *head)
+{
+ struct perf_event_pmu_context *epc = container_of(head, typeof(*epc), rcu_head);
+
+ kfree(epc);
+}
+
+static void put_pmu_ctx(struct perf_event_pmu_context *epc)
+{
+ struct perf_event_context *ctx = epc->ctx;
+ unsigned long flags;
+
+ /*
+ * XXX
+ *
+ * lockdep_assert_held(&ctx->mutex);
+ *
+ * can't because of the call-site in _free_event()/put_event()
+ * which isn't always called under ctx->mutex.
+ */
+ if (!atomic_dec_and_raw_lock_irqsave(&epc->refcount, &ctx->lock, flags))
+ return;
+
+ WARN_ON_ONCE(list_empty(&epc->pmu_ctx_entry));
+
+ list_del_init(&epc->pmu_ctx_entry);
+ epc->ctx = NULL;
+
+ WARN_ON_ONCE(!list_empty(&epc->pinned_active));
+ WARN_ON_ONCE(!list_empty(&epc->flexible_active));
+
+ raw_spin_unlock_irqrestore(&ctx->lock, flags);
+
+ if (epc->embedded) {
+ call_rcu(&epc->rcu_head, free_cpc_rcu);
+ return;
+ }
+
+ call_rcu(&epc->rcu_head, free_epc_rcu);
+}
+
static void perf_event_free_filter(struct perf_event *event);
-static void perf_event_free_bpf_prog(struct perf_event *event);
static void free_event_rcu(struct rcu_head *head)
{
- struct perf_event *event;
+ struct perf_event *event = container_of(head, typeof(*event), rcu_head);
- event = container_of(head, struct perf_event, rcu_head);
if (event->ns)
put_pid_ns(event->ns);
perf_event_free_filter(event);
- kfree(event);
+ kmem_cache_free(perf_event_cache, event);
}
static void ring_buffer_attach(struct perf_event *event,
- struct ring_buffer *rb);
+ struct perf_buffer *rb);
+
+static void detach_sb_event(struct perf_event *event)
+{
+ struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
+
+ raw_spin_lock(&pel->lock);
+ list_del_rcu(&event->sb_list);
+ raw_spin_unlock(&pel->lock);
+}
-static void unaccount_event_cpu(struct perf_event *event, int cpu)
+static bool is_sb_event(struct perf_event *event)
{
+ struct perf_event_attr *attr = &event->attr;
+
if (event->parent)
+ return false;
+
+ if (event->attach_state & PERF_ATTACH_TASK)
+ return false;
+
+ if (attr->mmap || attr->mmap_data || attr->mmap2 ||
+ attr->comm || attr->comm_exec ||
+ attr->task || attr->ksymbol ||
+ attr->context_switch || attr->text_poke ||
+ attr->bpf_event)
+ return true;
+
+ return false;
+}
+
+static void unaccount_pmu_sb_event(struct perf_event *event)
+{
+ if (is_sb_event(event))
+ detach_sb_event(event);
+}
+
+#ifdef CONFIG_NO_HZ_FULL
+static DEFINE_SPINLOCK(nr_freq_lock);
+#endif
+
+static void unaccount_freq_event_nohz(void)
+{
+#ifdef CONFIG_NO_HZ_FULL
+ spin_lock(&nr_freq_lock);
+ if (atomic_dec_and_test(&nr_freq_events))
+ tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS);
+ spin_unlock(&nr_freq_lock);
+#endif
+}
+
+static void unaccount_freq_event(void)
+{
+ if (tick_nohz_full_enabled())
+ unaccount_freq_event_nohz();
+ else
+ atomic_dec(&nr_freq_events);
+}
+
+
+static struct perf_ctx_data *
+alloc_perf_ctx_data(struct kmem_cache *ctx_cache, bool global)
+{
+ struct perf_ctx_data *cd;
+
+ cd = kzalloc(sizeof(*cd), GFP_KERNEL);
+ if (!cd)
+ return NULL;
+
+ cd->data = kmem_cache_zalloc(ctx_cache, GFP_KERNEL);
+ if (!cd->data) {
+ kfree(cd);
+ return NULL;
+ }
+
+ cd->global = global;
+ cd->ctx_cache = ctx_cache;
+ refcount_set(&cd->refcount, 1);
+
+ return cd;
+}
+
+static void free_perf_ctx_data(struct perf_ctx_data *cd)
+{
+ kmem_cache_free(cd->ctx_cache, cd->data);
+ kfree(cd);
+}
+
+static void __free_perf_ctx_data_rcu(struct rcu_head *rcu_head)
+{
+ struct perf_ctx_data *cd;
+
+ cd = container_of(rcu_head, struct perf_ctx_data, rcu_head);
+ free_perf_ctx_data(cd);
+}
+
+static inline void perf_free_ctx_data_rcu(struct perf_ctx_data *cd)
+{
+ call_rcu(&cd->rcu_head, __free_perf_ctx_data_rcu);
+}
+
+static int
+attach_task_ctx_data(struct task_struct *task, struct kmem_cache *ctx_cache,
+ bool global)
+{
+ struct perf_ctx_data *cd, *old = NULL;
+
+ cd = alloc_perf_ctx_data(ctx_cache, global);
+ if (!cd)
+ return -ENOMEM;
+
+ for (;;) {
+ if (try_cmpxchg((struct perf_ctx_data **)&task->perf_ctx_data, &old, cd)) {
+ if (old)
+ perf_free_ctx_data_rcu(old);
+ return 0;
+ }
+
+ if (!old) {
+ /*
+ * After seeing a dead @old, we raced with
+ * removal and lost, try again to install @cd.
+ */
+ continue;
+ }
+
+ if (refcount_inc_not_zero(&old->refcount)) {
+ free_perf_ctx_data(cd); /* unused */
+ return 0;
+ }
+
+ /*
+ * @old is a dead object, refcount==0 is stable, try and
+ * replace it with @cd.
+ */
+ }
+ return 0;
+}
+
+static void __detach_global_ctx_data(void);
+DEFINE_STATIC_PERCPU_RWSEM(global_ctx_data_rwsem);
+static refcount_t global_ctx_data_ref;
+
+static int
+attach_global_ctx_data(struct kmem_cache *ctx_cache)
+{
+ struct task_struct *g, *p;
+ struct perf_ctx_data *cd;
+ int ret;
+
+ if (refcount_inc_not_zero(&global_ctx_data_ref))
+ return 0;
+
+ guard(percpu_write)(&global_ctx_data_rwsem);
+ if (refcount_inc_not_zero(&global_ctx_data_ref))
+ return 0;
+again:
+ /* Allocate everything */
+ scoped_guard (rcu) {
+ for_each_process_thread(g, p) {
+ cd = rcu_dereference(p->perf_ctx_data);
+ if (cd && !cd->global) {
+ cd->global = 1;
+ if (!refcount_inc_not_zero(&cd->refcount))
+ cd = NULL;
+ }
+ if (!cd) {
+ get_task_struct(p);
+ goto alloc;
+ }
+ }
+ }
+
+ refcount_set(&global_ctx_data_ref, 1);
+
+ return 0;
+alloc:
+ ret = attach_task_ctx_data(p, ctx_cache, true);
+ put_task_struct(p);
+ if (ret) {
+ __detach_global_ctx_data();
+ return ret;
+ }
+ goto again;
+}
+
+static int
+attach_perf_ctx_data(struct perf_event *event)
+{
+ struct task_struct *task = event->hw.target;
+ struct kmem_cache *ctx_cache = event->pmu->task_ctx_cache;
+ int ret;
+
+ if (!ctx_cache)
+ return -ENOMEM;
+
+ if (task)
+ return attach_task_ctx_data(task, ctx_cache, false);
+
+ ret = attach_global_ctx_data(ctx_cache);
+ if (ret)
+ return ret;
+
+ event->attach_state |= PERF_ATTACH_GLOBAL_DATA;
+ return 0;
+}
+
+static void
+detach_task_ctx_data(struct task_struct *p)
+{
+ struct perf_ctx_data *cd;
+
+ scoped_guard (rcu) {
+ cd = rcu_dereference(p->perf_ctx_data);
+ if (!cd || !refcount_dec_and_test(&cd->refcount))
+ return;
+ }
+
+ /*
+ * The old ctx_data may be lost because of the race.
+ * Nothing is required to do for the case.
+ * See attach_task_ctx_data().
+ */
+ if (try_cmpxchg((struct perf_ctx_data **)&p->perf_ctx_data, &cd, NULL))
+ perf_free_ctx_data_rcu(cd);
+}
+
+static void __detach_global_ctx_data(void)
+{
+ struct task_struct *g, *p;
+ struct perf_ctx_data *cd;
+
+again:
+ scoped_guard (rcu) {
+ for_each_process_thread(g, p) {
+ cd = rcu_dereference(p->perf_ctx_data);
+ if (!cd || !cd->global)
+ continue;
+ cd->global = 0;
+ get_task_struct(p);
+ goto detach;
+ }
+ }
+ return;
+detach:
+ detach_task_ctx_data(p);
+ put_task_struct(p);
+ goto again;
+}
+
+static void detach_global_ctx_data(void)
+{
+ if (refcount_dec_not_one(&global_ctx_data_ref))
return;
- if (is_cgroup_event(event))
- atomic_dec(&per_cpu(perf_cgroup_events, cpu));
+ guard(percpu_write)(&global_ctx_data_rwsem);
+ if (!refcount_dec_and_test(&global_ctx_data_ref))
+ return;
+
+ /* remove everything */
+ __detach_global_ctx_data();
+}
+
+static void detach_perf_ctx_data(struct perf_event *event)
+{
+ struct task_struct *task = event->hw.target;
+
+ event->attach_state &= ~PERF_ATTACH_TASK_DATA;
+
+ if (task)
+ return detach_task_ctx_data(task);
+
+ if (event->attach_state & PERF_ATTACH_GLOBAL_DATA) {
+ detach_global_ctx_data();
+ event->attach_state &= ~PERF_ATTACH_GLOBAL_DATA;
+ }
}
static void unaccount_event(struct perf_event *event)
{
+ bool dec = false;
+
if (event->parent)
return;
- if (event->attach_state & PERF_ATTACH_TASK)
- static_key_slow_dec_deferred(&perf_sched_events);
+ if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
+ dec = true;
if (event->attr.mmap || event->attr.mmap_data)
atomic_dec(&nr_mmap_events);
+ if (event->attr.build_id)
+ atomic_dec(&nr_build_id_events);
if (event->attr.comm)
atomic_dec(&nr_comm_events);
+ if (event->attr.namespaces)
+ atomic_dec(&nr_namespaces_events);
+ if (event->attr.cgroup)
+ atomic_dec(&nr_cgroup_events);
if (event->attr.task)
atomic_dec(&nr_task_events);
if (event->attr.freq)
- atomic_dec(&nr_freq_events);
+ unaccount_freq_event();
+ if (event->attr.context_switch) {
+ dec = true;
+ atomic_dec(&nr_switch_events);
+ }
if (is_cgroup_event(event))
- static_key_slow_dec_deferred(&perf_sched_events);
+ dec = true;
if (has_branch_stack(event))
- static_key_slow_dec_deferred(&perf_sched_events);
+ dec = true;
+ if (event->attr.ksymbol)
+ atomic_dec(&nr_ksymbol_events);
+ if (event->attr.bpf_event)
+ atomic_dec(&nr_bpf_events);
+ if (event->attr.text_poke)
+ atomic_dec(&nr_text_poke_events);
+
+ if (dec) {
+ if (!atomic_add_unless(&perf_sched_count, -1, 1))
+ schedule_delayed_work(&perf_sched_work, HZ);
+ }
+
+ unaccount_pmu_sb_event(event);
+}
- unaccount_event_cpu(event, event->cpu);
+static void perf_sched_delayed(struct work_struct *work)
+{
+ mutex_lock(&perf_sched_mutex);
+ if (atomic_dec_and_test(&perf_sched_count))
+ static_branch_disable(&perf_sched_events);
+ mutex_unlock(&perf_sched_mutex);
}
/*
@@ -3487,16 +5509,16 @@ static void unaccount_event(struct perf_event *event)
*
* 1) cpu-wide events in the presence of per-task events,
* 2) per-task events in the presence of cpu-wide events,
- * 3) two matching events on the same context.
+ * 3) two matching events on the same perf_event_context.
*
* The former two cases are handled in the allocation path (perf_event_alloc(),
- * __free_event()), the latter -- before the first perf_install_in_context().
+ * _free_event()), the latter -- before the first perf_install_in_context().
*/
static int exclusive_event_init(struct perf_event *event)
{
struct pmu *pmu = event->pmu;
- if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
+ if (!is_exclusive_pmu(pmu))
return 0;
/*
@@ -3520,6 +5542,8 @@ static int exclusive_event_init(struct perf_event *event)
return -EBUSY;
}
+ event->attach_state |= PERF_ATTACH_EXCLUSIVE;
+
return 0;
}
@@ -3527,19 +5551,18 @@ static void exclusive_event_destroy(struct perf_event *event)
{
struct pmu *pmu = event->pmu;
- if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
- return;
-
/* see comment in exclusive_event_init() */
if (event->attach_state & PERF_ATTACH_TASK)
atomic_dec(&pmu->exclusive_cnt);
else
atomic_inc(&pmu->exclusive_cnt);
+
+ event->attach_state &= ~PERF_ATTACH_EXCLUSIVE;
}
static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
{
- if ((e1->pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) &&
+ if ((e1->pmu == e2->pmu) &&
(e1->cpu == e2->cpu ||
e1->cpu == -1 ||
e2->cpu == -1))
@@ -3547,14 +5570,15 @@ static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
return false;
}
-/* Called under the same ctx::mutex as perf_install_in_context() */
static bool exclusive_event_installable(struct perf_event *event,
struct perf_event_context *ctx)
{
struct perf_event *iter_event;
struct pmu *pmu = event->pmu;
- if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
+ lockdep_assert_held(&ctx->mutex);
+
+ if (!is_exclusive_pmu(pmu))
return true;
list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
@@ -3565,35 +5589,78 @@ static bool exclusive_event_installable(struct perf_event *event,
return true;
}
+static void perf_free_addr_filters(struct perf_event *event);
+
+/* vs perf_event_alloc() error */
static void __free_event(struct perf_event *event)
{
- if (!event->parent) {
- if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
- put_callchain_buffers();
- }
+ struct pmu *pmu = event->pmu;
- perf_event_free_bpf_prog(event);
+ if (event->attach_state & PERF_ATTACH_CALLCHAIN)
+ put_callchain_buffers();
+
+ kfree(event->addr_filter_ranges);
+
+ if (event->attach_state & PERF_ATTACH_EXCLUSIVE)
+ exclusive_event_destroy(event);
+
+ if (is_cgroup_event(event))
+ perf_detach_cgroup(event);
+
+ if (event->attach_state & PERF_ATTACH_TASK_DATA)
+ detach_perf_ctx_data(event);
if (event->destroy)
event->destroy(event);
+ /*
+ * Must be after ->destroy(), due to uprobe_perf_close() using
+ * hw.target.
+ */
+ if (event->hw.target)
+ put_task_struct(event->hw.target);
+
+ if (event->pmu_ctx) {
+ /*
+ * put_pmu_ctx() needs an event->ctx reference, because of
+ * epc->ctx.
+ */
+ WARN_ON_ONCE(!pmu);
+ WARN_ON_ONCE(!event->ctx);
+ WARN_ON_ONCE(event->pmu_ctx->ctx != event->ctx);
+ put_pmu_ctx(event->pmu_ctx);
+ }
+
+ /*
+ * perf_event_free_task() relies on put_ctx() being 'last', in
+ * particular all task references must be cleaned up.
+ */
if (event->ctx)
put_ctx(event->ctx);
- if (event->pmu) {
- exclusive_event_destroy(event);
- module_put(event->pmu->module);
+ if (pmu) {
+ module_put(pmu->module);
+ scoped_guard (spinlock, &pmu->events_lock) {
+ list_del(&event->pmu_list);
+ wake_up_var(pmu);
+ }
}
call_rcu(&event->rcu_head, free_event_rcu);
}
+DEFINE_FREE(__free_event, struct perf_event *, if (_T) __free_event(_T))
+
+/* vs perf_event_alloc() success */
static void _free_event(struct perf_event *event)
{
- irq_work_sync(&event->pending);
+ irq_work_sync(&event->pending_irq);
+ irq_work_sync(&event->pending_disable_irq);
unaccount_event(event);
+ security_perf_event_free(event);
+
if (event->rb) {
/*
* Can happen when we close an event with re-directed output.
@@ -3606,21 +5673,21 @@ static void _free_event(struct perf_event *event)
mutex_unlock(&event->mmap_mutex);
}
- if (is_cgroup_event(event))
- perf_detach_cgroup(event);
+ perf_event_free_bpf_prog(event);
+ perf_free_addr_filters(event);
__free_event(event);
}
/*
* Used to free events which have a known refcount of 1, such as in error paths
- * where the event isn't exposed yet and inherited events.
+ * of inherited events.
*/
static void free_event(struct perf_event *event)
{
if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
- "unexpected event refcount: %ld; ptr=%p\n",
- atomic_long_read(&event->refcount), event)) {
+ "unexpected event refcount: %ld; ptr=%p\n",
+ atomic_long_read(&event->refcount), event)) {
/* leak to avoid use-after-free */
return;
}
@@ -3636,14 +5703,13 @@ static void perf_remove_from_owner(struct perf_event *event)
struct task_struct *owner;
rcu_read_lock();
- owner = ACCESS_ONCE(event->owner);
/*
- * Matches the smp_wmb() in perf_event_exit_task(). If we observe
- * !owner it means the list deletion is complete and we can indeed
- * free this event, otherwise we need to serialize on
+ * Matches the smp_store_release() in perf_event_exit_task(). If we
+ * observe !owner it means the list deletion is complete and we can
+ * indeed free this event, otherwise we need to serialize on
* owner->perf_event_mutex.
*/
- smp_read_barrier_depends();
+ owner = READ_ONCE(event->owner);
if (owner) {
/*
* Since delayed_put_task_struct() also drops the last
@@ -3671,8 +5737,10 @@ static void perf_remove_from_owner(struct perf_event *event)
* ensured they're done, and we can proceed with freeing the
* event.
*/
- if (event->owner)
+ if (event->owner) {
list_del_init(&event->owner_entry);
+ smp_store_release(&event->owner, NULL);
+ }
mutex_unlock(&owner->perf_event_mutex);
put_task_struct(owner);
}
@@ -3680,36 +5748,122 @@ static void perf_remove_from_owner(struct perf_event *event)
static void put_event(struct perf_event *event)
{
- struct perf_event_context *ctx;
+ struct perf_event *parent;
if (!atomic_long_dec_and_test(&event->refcount))
return;
+ parent = event->parent;
+ _free_event(event);
+
+ /* Matches the refcount bump in inherit_event() */
+ if (parent)
+ put_event(parent);
+}
+
+/*
+ * Kill an event dead; while event:refcount will preserve the event
+ * object, it will not preserve its functionality. Once the last 'user'
+ * gives up the object, we'll destroy the thing.
+ */
+int perf_event_release_kernel(struct perf_event *event)
+{
+ struct perf_event_context *ctx = event->ctx;
+ struct perf_event *child, *tmp;
+
+ /*
+ * If we got here through err_alloc: free_event(event); we will not
+ * have attached to a context yet.
+ */
+ if (!ctx) {
+ WARN_ON_ONCE(event->attach_state &
+ (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP));
+ goto no_ctx;
+ }
+
if (!is_kernel_event(event))
perf_remove_from_owner(event);
+ ctx = perf_event_ctx_lock(event);
+ WARN_ON_ONCE(ctx->parent_ctx);
+
/*
- * There are two ways this annotation is useful:
+ * Mark this event as STATE_DEAD, there is no external reference to it
+ * anymore.
*
- * 1) there is a lock recursion from perf_event_exit_task
- * see the comment there.
+ * Anybody acquiring event->child_mutex after the below loop _must_
+ * also see this, most importantly inherit_event() which will avoid
+ * placing more children on the list.
*
- * 2) there is a lock-inversion with mmap_sem through
- * perf_event_read_group(), which takes faults while
- * holding ctx->mutex, however this is called after
- * the last filedesc died, so there is no possibility
- * to trigger the AB-BA case.
+ * Thus this guarantees that we will in fact observe and kill _ALL_
+ * child events.
*/
- ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING);
- WARN_ON_ONCE(ctx->parent_ctx);
- perf_remove_from_context(event, true);
+ if (event->state > PERF_EVENT_STATE_REVOKED) {
+ perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD);
+ } else {
+ event->state = PERF_EVENT_STATE_DEAD;
+ }
+
perf_event_ctx_unlock(event, ctx);
- _free_event(event);
-}
+again:
+ mutex_lock(&event->child_mutex);
+ list_for_each_entry(child, &event->child_list, child_list) {
+ /*
+ * Cannot change, child events are not migrated, see the
+ * comment with perf_event_ctx_lock_nested().
+ */
+ ctx = READ_ONCE(child->ctx);
+ /*
+ * Since child_mutex nests inside ctx::mutex, we must jump
+ * through hoops. We start by grabbing a reference on the ctx.
+ *
+ * Since the event cannot get freed while we hold the
+ * child_mutex, the context must also exist and have a !0
+ * reference count.
+ */
+ get_ctx(ctx);
-int perf_event_release_kernel(struct perf_event *event)
-{
+ /*
+ * Now that we have a ctx ref, we can drop child_mutex, and
+ * acquire ctx::mutex without fear of it going away. Then we
+ * can re-acquire child_mutex.
+ */
+ mutex_unlock(&event->child_mutex);
+ mutex_lock(&ctx->mutex);
+ mutex_lock(&event->child_mutex);
+
+ /*
+ * Now that we hold ctx::mutex and child_mutex, revalidate our
+ * state, if child is still the first entry, it didn't get freed
+ * and we can continue doing so.
+ */
+ tmp = list_first_entry_or_null(&event->child_list,
+ struct perf_event, child_list);
+ if (tmp == child) {
+ perf_remove_from_context(child, DETACH_GROUP | DETACH_CHILD);
+ } else {
+ child = NULL;
+ }
+
+ mutex_unlock(&event->child_mutex);
+ mutex_unlock(&ctx->mutex);
+
+ if (child) {
+ /* Last reference unless ->pending_task work is pending */
+ put_event(child);
+ }
+ put_ctx(ctx);
+
+ goto again;
+ }
+ mutex_unlock(&event->child_mutex);
+
+no_ctx:
+ /*
+ * Last reference unless ->pending_task work is pending on this event
+ * or any of its children.
+ */
put_event(event);
return 0;
}
@@ -3720,47 +5874,11 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel);
*/
static int perf_release(struct inode *inode, struct file *file)
{
- put_event(file->private_data);
+ perf_event_release_kernel(file->private_data);
return 0;
}
-/*
- * Remove all orphanes events from the context.
- */
-static void orphans_remove_work(struct work_struct *work)
-{
- struct perf_event_context *ctx;
- struct perf_event *event, *tmp;
-
- ctx = container_of(work, struct perf_event_context,
- orphans_remove.work);
-
- mutex_lock(&ctx->mutex);
- list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) {
- struct perf_event *parent_event = event->parent;
-
- if (!is_orphaned_child(event))
- continue;
-
- perf_remove_from_context(event, true);
-
- mutex_lock(&parent_event->child_mutex);
- list_del_init(&event->child_list);
- mutex_unlock(&parent_event->child_mutex);
-
- free_event(event);
- put_event(parent_event);
- }
-
- raw_spin_lock_irq(&ctx->lock);
- ctx->orphans_remove_sched = false;
- raw_spin_unlock_irq(&ctx->lock);
- mutex_unlock(&ctx->mutex);
-
- put_ctx(ctx);
-}
-
-u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
+static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
{
struct perf_event *child;
u64 total = 0;
@@ -3769,14 +5887,18 @@ u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
*running = 0;
mutex_lock(&event->child_mutex);
- total += perf_event_read(event);
+
+ (void)perf_event_read(event, false);
+ total += perf_event_count(event, false);
+
*enabled += event->total_time_enabled +
atomic64_read(&event->child_total_time_enabled);
*running += event->total_time_running +
atomic64_read(&event->child_total_time_running);
list_for_each_entry(child, &event->child_list, child_list) {
- total += perf_event_read(child);
+ (void)perf_event_read(child, false);
+ total += perf_event_count(child, false);
*enabled += child->total_time_enabled;
*running += child->total_time_running;
}
@@ -3784,70 +5906,157 @@ u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
return total;
}
+
+u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
+{
+ struct perf_event_context *ctx;
+ u64 count;
+
+ ctx = perf_event_ctx_lock(event);
+ count = __perf_event_read_value(event, enabled, running);
+ perf_event_ctx_unlock(event, ctx);
+
+ return count;
+}
EXPORT_SYMBOL_GPL(perf_event_read_value);
-static int perf_event_read_group(struct perf_event *event,
- u64 read_format, char __user *buf)
+static int __perf_read_group_add(struct perf_event *leader,
+ u64 read_format, u64 *values)
{
- struct perf_event *leader = event->group_leader, *sub;
struct perf_event_context *ctx = leader->ctx;
- int n = 0, size = 0, ret;
- u64 count, enabled, running;
- u64 values[5];
+ struct perf_event *sub, *parent;
+ unsigned long flags;
+ int n = 1; /* skip @nr */
+ int ret;
- lockdep_assert_held(&ctx->mutex);
+ ret = perf_event_read(leader, true);
+ if (ret)
+ return ret;
- count = perf_event_read_value(leader, &enabled, &running);
+ raw_spin_lock_irqsave(&ctx->lock, flags);
+ /*
+ * Verify the grouping between the parent and child (inherited)
+ * events is still in tact.
+ *
+ * Specifically:
+ * - leader->ctx->lock pins leader->sibling_list
+ * - parent->child_mutex pins parent->child_list
+ * - parent->ctx->mutex pins parent->sibling_list
+ *
+ * Because parent->ctx != leader->ctx (and child_list nests inside
+ * ctx->mutex), group destruction is not atomic between children, also
+ * see perf_event_release_kernel(). Additionally, parent can grow the
+ * group.
+ *
+ * Therefore it is possible to have parent and child groups in a
+ * different configuration and summing over such a beast makes no sense
+ * what so ever.
+ *
+ * Reject this.
+ */
+ parent = leader->parent;
+ if (parent &&
+ (parent->group_generation != leader->group_generation ||
+ parent->nr_siblings != leader->nr_siblings)) {
+ ret = -ECHILD;
+ goto unlock;
+ }
- values[n++] = 1 + leader->nr_siblings;
- if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
- values[n++] = enabled;
- if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
- values[n++] = running;
- values[n++] = count;
+ /*
+ * Since we co-schedule groups, {enabled,running} times of siblings
+ * will be identical to those of the leader, so we only publish one
+ * set.
+ */
+ if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
+ values[n++] += leader->total_time_enabled +
+ atomic64_read(&leader->child_total_time_enabled);
+ }
+
+ if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
+ values[n++] += leader->total_time_running +
+ atomic64_read(&leader->child_total_time_running);
+ }
+
+ /*
+ * Write {count,id} tuples for every sibling.
+ */
+ values[n++] += perf_event_count(leader, false);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(leader);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = atomic64_read(&leader->lost_samples);
- size = n * sizeof(u64);
+ for_each_sibling_event(sub, leader) {
+ values[n++] += perf_event_count(sub, false);
+ if (read_format & PERF_FORMAT_ID)
+ values[n++] = primary_event_id(sub);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = atomic64_read(&sub->lost_samples);
+ }
- if (copy_to_user(buf, values, size))
- return -EFAULT;
+unlock:
+ raw_spin_unlock_irqrestore(&ctx->lock, flags);
+ return ret;
+}
- ret = size;
+static int perf_read_group(struct perf_event *event,
+ u64 read_format, char __user *buf)
+{
+ struct perf_event *leader = event->group_leader, *child;
+ struct perf_event_context *ctx = leader->ctx;
+ int ret;
+ u64 *values;
- list_for_each_entry(sub, &leader->sibling_list, group_entry) {
- n = 0;
+ lockdep_assert_held(&ctx->mutex);
- values[n++] = perf_event_read_value(sub, &enabled, &running);
- if (read_format & PERF_FORMAT_ID)
- values[n++] = primary_event_id(sub);
+ values = kzalloc(event->read_size, GFP_KERNEL);
+ if (!values)
+ return -ENOMEM;
- size = n * sizeof(u64);
+ values[0] = 1 + leader->nr_siblings;
- if (copy_to_user(buf + ret, values, size)) {
- return -EFAULT;
- }
+ mutex_lock(&leader->child_mutex);
- ret += size;
+ ret = __perf_read_group_add(leader, read_format, values);
+ if (ret)
+ goto unlock;
+
+ list_for_each_entry(child, &leader->child_list, child_list) {
+ ret = __perf_read_group_add(child, read_format, values);
+ if (ret)
+ goto unlock;
}
+ mutex_unlock(&leader->child_mutex);
+
+ ret = event->read_size;
+ if (copy_to_user(buf, values, event->read_size))
+ ret = -EFAULT;
+ goto out;
+
+unlock:
+ mutex_unlock(&leader->child_mutex);
+out:
+ kfree(values);
return ret;
}
-static int perf_event_read_one(struct perf_event *event,
+static int perf_read_one(struct perf_event *event,
u64 read_format, char __user *buf)
{
u64 enabled, running;
- u64 values[4];
+ u64 values[5];
int n = 0;
- values[n++] = perf_event_read_value(event, &enabled, &running);
+ values[n++] = __perf_event_read_value(event, &enabled, &running);
if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
values[n++] = enabled;
if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
values[n++] = running;
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(event);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = atomic64_read(&event->lost_samples);
if (copy_to_user(buf, values, n * sizeof(u64)))
return -EFAULT;
@@ -3859,7 +6068,7 @@ static bool is_event_hup(struct perf_event *event)
{
bool no_children;
- if (event->state != PERF_EVENT_STATE_EXIT)
+ if (event->state > PERF_EVENT_STATE_EXIT)
return false;
mutex_lock(&event->child_mutex);
@@ -3872,13 +6081,13 @@ static bool is_event_hup(struct perf_event *event)
* Read the performance event - simple non blocking version for now
*/
static ssize_t
-perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
+__perf_read(struct perf_event *event, char __user *buf, size_t count)
{
u64 read_format = event->attr.read_format;
int ret;
/*
- * Return end-of-file for a read on a event that is in
+ * Return end-of-file for a read on an event that is in
* error state (i.e. because it was pinned but it couldn't be
* scheduled on to the CPU at some point).
*/
@@ -3890,9 +6099,9 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
WARN_ON_ONCE(event->ctx->parent_ctx);
if (read_format & PERF_FORMAT_GROUP)
- ret = perf_event_read_group(event, read_format, buf);
+ ret = perf_read_group(event, read_format, buf);
else
- ret = perf_event_read_one(event, read_format, buf);
+ ret = perf_read_one(event, read_format, buf);
return ret;
}
@@ -3904,24 +6113,38 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
struct perf_event_context *ctx;
int ret;
+ ret = security_perf_event_read(event);
+ if (ret)
+ return ret;
+
ctx = perf_event_ctx_lock(event);
- ret = perf_read_hw(event, buf, count);
+ ret = __perf_read(event, buf, count);
perf_event_ctx_unlock(event, ctx);
return ret;
}
-static unsigned int perf_poll(struct file *file, poll_table *wait)
+static __poll_t perf_poll(struct file *file, poll_table *wait)
{
struct perf_event *event = file->private_data;
- struct ring_buffer *rb;
- unsigned int events = POLLHUP;
+ struct perf_buffer *rb;
+ __poll_t events = EPOLLHUP;
+
+ if (event->state <= PERF_EVENT_STATE_REVOKED)
+ return EPOLLERR;
poll_wait(file, &event->waitq, wait);
+ if (event->state <= PERF_EVENT_STATE_REVOKED)
+ return EPOLLERR;
+
if (is_event_hup(event))
return events;
+ if (unlikely(READ_ONCE(event->state) == PERF_EVENT_STATE_ERROR &&
+ event->attr.pinned))
+ return EPOLLERR;
+
/*
* Pin the event->rb by taking event->mmap_mutex; otherwise
* perf_event_set_output() can swizzle our rb and make us miss wakeups.
@@ -3936,15 +6159,33 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
static void _perf_event_reset(struct perf_event *event)
{
- (void)perf_event_read(event);
+ (void)perf_event_read(event, false);
local64_set(&event->count, 0);
perf_event_update_userpage(event);
}
+/* Assume it's not an event with inherit set. */
+u64 perf_event_pause(struct perf_event *event, bool reset)
+{
+ struct perf_event_context *ctx;
+ u64 count;
+
+ ctx = perf_event_ctx_lock(event);
+ WARN_ON_ONCE(event->attr.inherit);
+ _perf_event_disable(event);
+ count = local64_read(&event->count);
+ if (reset)
+ local64_set(&event->count, 0);
+ perf_event_ctx_unlock(event, ctx);
+
+ return count;
+}
+EXPORT_SYMBOL_GPL(perf_event_pause);
+
/*
* Holding the top-level event's child_mutex means that any
* descendant process that has inherited this event will block
- * in sync_child_event if it goes to exit, thus satisfying the
+ * in perf_event_exit_event() if it goes to exit, thus satisfying the
* task existence requirements of perf_event_enable/disable.
*/
static void perf_event_for_each_child(struct perf_event *event,
@@ -3972,32 +6213,19 @@ static void perf_event_for_each(struct perf_event *event,
event = event->group_leader;
perf_event_for_each_child(event, func);
- list_for_each_entry(sibling, &event->sibling_list, group_entry)
+ for_each_sibling_event(sibling, event)
perf_event_for_each_child(sibling, func);
}
-static int perf_event_period(struct perf_event *event, u64 __user *arg)
+static void __perf_event_period(struct perf_event *event,
+ struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx,
+ void *info)
{
- struct perf_event_context *ctx = event->ctx;
- int ret = 0, active;
- u64 value;
-
- if (!is_sampling_event(event))
- return -EINVAL;
-
- if (copy_from_user(&value, arg, sizeof(value)))
- return -EFAULT;
+ u64 value = *((u64 *)info);
+ bool active;
- if (!value)
- return -EINVAL;
-
- raw_spin_lock_irq(&ctx->lock);
if (event->attr.freq) {
- if (value > sysctl_perf_event_sample_rate) {
- ret = -EINVAL;
- goto unlock;
- }
-
event->attr.sample_freq = value;
} else {
event->attr.sample_period = value;
@@ -4006,7 +6234,7 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
active = (event->state == PERF_EVENT_STATE_ACTIVE);
if (active) {
- perf_pmu_disable(ctx->pmu);
+ perf_pmu_disable(event->pmu);
event->pmu->stop(event, PERF_EF_UPDATE);
}
@@ -4014,41 +6242,83 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
if (active) {
event->pmu->start(event, PERF_EF_RELOAD);
- perf_pmu_enable(ctx->pmu);
+ /*
+ * Once the period is force-reset, the event starts immediately.
+ * But the event/group could be throttled. Unthrottle the
+ * event/group now to avoid the next tick trying to unthrottle
+ * while we already re-started the event/group.
+ */
+ if (event->hw.interrupts == MAX_INTERRUPTS)
+ perf_event_unthrottle_group(event, true);
+ perf_pmu_enable(event->pmu);
+ }
+}
+
+static int perf_event_check_period(struct perf_event *event, u64 value)
+{
+ return event->pmu->check_period(event, value);
+}
+
+static int _perf_event_period(struct perf_event *event, u64 value)
+{
+ if (!is_sampling_event(event))
+ return -EINVAL;
+
+ if (!value)
+ return -EINVAL;
+
+ if (event->attr.freq) {
+ if (value > sysctl_perf_event_sample_rate)
+ return -EINVAL;
+ } else {
+ if (perf_event_check_period(event, value))
+ return -EINVAL;
+ if (value & (1ULL << 63))
+ return -EINVAL;
}
-unlock:
- raw_spin_unlock_irq(&ctx->lock);
+ event_function_call(event, __perf_event_period, &value);
+
+ return 0;
+}
+
+int perf_event_period(struct perf_event *event, u64 value)
+{
+ struct perf_event_context *ctx;
+ int ret;
+
+ ctx = perf_event_ctx_lock(event);
+ ret = _perf_event_period(event, value);
+ perf_event_ctx_unlock(event, ctx);
return ret;
}
+EXPORT_SYMBOL_GPL(perf_event_period);
static const struct file_operations perf_fops;
-static inline int perf_fget_light(int fd, struct fd *p)
+static inline bool is_perf_file(struct fd f)
{
- struct fd f = fdget(fd);
- if (!f.file)
- return -EBADF;
-
- if (f.file->f_op != &perf_fops) {
- fdput(f);
- return -EBADF;
- }
- *p = f;
- return 0;
+ return !fd_empty(f) && fd_file(f)->f_op == &perf_fops;
}
static int perf_event_set_output(struct perf_event *event,
struct perf_event *output_event);
static int perf_event_set_filter(struct perf_event *event, void __user *arg);
-static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
+static int perf_copy_attr(struct perf_event_attr __user *uattr,
+ struct perf_event_attr *attr);
+static int __perf_event_set_bpf_prog(struct perf_event *event,
+ struct bpf_prog *prog,
+ u64 bpf_cookie);
static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
{
void (*func)(struct perf_event *);
u32 flags = arg;
+ if (event->state <= PERF_EVENT_STATE_REVOKED)
+ return -ENODEV;
+
switch (cmd) {
case PERF_EVENT_IOC_ENABLE:
func = _perf_event_enable;
@@ -4064,8 +6334,14 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
return _perf_event_refresh(event, arg);
case PERF_EVENT_IOC_PERIOD:
- return perf_event_period(event, (u64 __user *)arg);
+ {
+ u64 value;
+ if (copy_from_user(&value, (u64 __user *)arg, sizeof(value)))
+ return -EFAULT;
+
+ return _perf_event_period(event, value);
+ }
case PERF_EVENT_IOC_ID:
{
u64 id = primary_event_id(event);
@@ -4077,28 +6353,64 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
case PERF_EVENT_IOC_SET_OUTPUT:
{
- int ret;
+ CLASS(fd, output)(arg); // arg == -1 => empty
+ struct perf_event *output_event = NULL;
if (arg != -1) {
- struct perf_event *output_event;
- struct fd output;
- ret = perf_fget_light(arg, &output);
- if (ret)
- return ret;
- output_event = output.file->private_data;
- ret = perf_event_set_output(event, output_event);
- fdput(output);
- } else {
- ret = perf_event_set_output(event, NULL);
+ if (!is_perf_file(output))
+ return -EBADF;
+ output_event = fd_file(output)->private_data;
}
- return ret;
+ return perf_event_set_output(event, output_event);
}
case PERF_EVENT_IOC_SET_FILTER:
return perf_event_set_filter(event, (void __user *)arg);
case PERF_EVENT_IOC_SET_BPF:
- return perf_event_set_bpf_prog(event, arg);
+ {
+ struct bpf_prog *prog;
+ int err;
+
+ prog = bpf_prog_get(arg);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ err = __perf_event_set_bpf_prog(event, prog, 0);
+ if (err) {
+ bpf_prog_put(prog);
+ return err;
+ }
+
+ return 0;
+ }
+
+ case PERF_EVENT_IOC_PAUSE_OUTPUT: {
+ struct perf_buffer *rb;
+
+ rcu_read_lock();
+ rb = rcu_dereference(event->rb);
+ if (!rb || !rb->nr_pages) {
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+ rb_toggle_paused(rb, !!arg);
+ rcu_read_unlock();
+ return 0;
+ }
+
+ case PERF_EVENT_IOC_QUERY_BPF:
+ return perf_event_query_prog_array(event, (void __user *)arg);
+ case PERF_EVENT_IOC_MODIFY_ATTRIBUTES: {
+ struct perf_event_attr new_attr;
+ int err = perf_copy_attr((struct perf_event_attr __user *)arg,
+ &new_attr);
+
+ if (err)
+ return err;
+
+ return perf_event_modify_attr(event, &new_attr);
+ }
default:
return -ENOTTY;
}
@@ -4117,6 +6429,11 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
struct perf_event_context *ctx;
long ret;
+ /* Treat ioctl like writes as it is likely a mutating operation. */
+ ret = security_perf_event_write(event);
+ if (ret)
+ return ret;
+
ctx = perf_event_ctx_lock(event);
ret = _perf_ioctl(event, cmd, arg);
perf_event_ctx_unlock(event, ctx);
@@ -4131,6 +6448,8 @@ static long perf_compat_ioctl(struct file *file, unsigned int cmd,
switch (_IOC_NR(cmd)) {
case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
case _IOC_NR(PERF_EVENT_IOC_ID):
+ case _IOC_NR(PERF_EVENT_IOC_QUERY_BPF):
+ case _IOC_NR(PERF_EVENT_IOC_MODIFY_ATTRIBUTES):
/* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case */
if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
cmd &= ~IOCSIZE_MASK;
@@ -4187,23 +6506,10 @@ static int perf_event_index(struct perf_event *event)
return event->pmu->event_idx(event);
}
-static void calc_timer_values(struct perf_event *event,
- u64 *now,
- u64 *enabled,
- u64 *running)
-{
- u64 ctx_time;
-
- *now = perf_clock();
- ctx_time = event->shadow_ctx_time + *now;
- *enabled = ctx_time - event->tstamp_enabled;
- *running = ctx_time - event->tstamp_running;
-}
-
static void perf_event_init_userpage(struct perf_event *event)
{
struct perf_event_mmap_page *userpg;
- struct ring_buffer *rb;
+ struct perf_buffer *rb;
rcu_read_lock();
rb = rcu_dereference(event->rb);
@@ -4235,7 +6541,7 @@ void __weak arch_perf_update_userpage(
void perf_event_update_userpage(struct perf_event *event)
{
struct perf_event_mmap_page *userpg;
- struct ring_buffer *rb;
+ struct perf_buffer *rb;
u64 enabled, running, now;
rcu_read_lock();
@@ -4256,14 +6562,14 @@ void perf_event_update_userpage(struct perf_event *event)
userpg = rb->user_page;
/*
- * Disable preemption so as to not let the corresponding user-space
- * spin too long if we get preempted.
+ * Disable preemption to guarantee consistent time stamps are stored to
+ * the user page.
*/
preempt_disable();
++userpg->lock;
barrier();
userpg->index = perf_event_index(event);
- userpg->offset = perf_event_count(event);
+ userpg->offset = perf_event_count(event, false);
if (userpg->index)
userpg->offset -= local64_read(&event->hw.prev_count);
@@ -4281,48 +6587,16 @@ void perf_event_update_userpage(struct perf_event *event)
unlock:
rcu_read_unlock();
}
-
-static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
- struct perf_event *event = vma->vm_file->private_data;
- struct ring_buffer *rb;
- int ret = VM_FAULT_SIGBUS;
-
- if (vmf->flags & FAULT_FLAG_MKWRITE) {
- if (vmf->pgoff == 0)
- ret = 0;
- return ret;
- }
-
- rcu_read_lock();
- rb = rcu_dereference(event->rb);
- if (!rb)
- goto unlock;
-
- if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
- goto unlock;
-
- vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
- if (!vmf->page)
- goto unlock;
-
- get_page(vmf->page);
- vmf->page->mapping = vma->vm_file->f_mapping;
- vmf->page->index = vmf->pgoff;
-
- ret = 0;
-unlock:
- rcu_read_unlock();
-
- return ret;
-}
+EXPORT_SYMBOL_GPL(perf_event_update_userpage);
static void ring_buffer_attach(struct perf_event *event,
- struct ring_buffer *rb)
+ struct perf_buffer *rb)
{
- struct ring_buffer *old_rb = NULL;
+ struct perf_buffer *old_rb = NULL;
unsigned long flags;
+ WARN_ON_ONCE(event->parent);
+
if (event->rb) {
/*
* Should be impossible, we set this when removing
@@ -4350,6 +6624,19 @@ static void ring_buffer_attach(struct perf_event *event,
spin_unlock_irqrestore(&rb->event_lock, flags);
}
+ /*
+ * Avoid racing with perf_mmap_close(AUX): stop the event
+ * before swizzling the event::rb pointer; if it's getting
+ * unmapped, its aux_mmap_count will be 0 and it won't
+ * restart. See the comment in __perf_pmu_output_stop().
+ *
+ * Data will inevitably be lost when set_output is done in
+ * mid-air, but then again, whoever does it like this is
+ * not in for the data anyway.
+ */
+ if (has_aux(event))
+ perf_event_stop(event, 0);
+
rcu_assign_pointer(event->rb, rb);
if (old_rb) {
@@ -4365,7 +6652,10 @@ static void ring_buffer_attach(struct perf_event *event,
static void ring_buffer_wakeup(struct perf_event *event)
{
- struct ring_buffer *rb;
+ struct perf_buffer *rb;
+
+ if (event->parent)
+ event = event->parent;
rcu_read_lock();
rb = rcu_dereference(event->rb);
@@ -4376,22 +6666,17 @@ static void ring_buffer_wakeup(struct perf_event *event)
rcu_read_unlock();
}
-static void rb_free_rcu(struct rcu_head *rcu_head)
+struct perf_buffer *ring_buffer_get(struct perf_event *event)
{
- struct ring_buffer *rb;
+ struct perf_buffer *rb;
- rb = container_of(rcu_head, struct ring_buffer, rcu_head);
- rb_free(rb);
-}
-
-struct ring_buffer *ring_buffer_get(struct perf_event *event)
-{
- struct ring_buffer *rb;
+ if (event->parent)
+ event = event->parent;
rcu_read_lock();
rb = rcu_dereference(event->rb);
if (rb) {
- if (!atomic_inc_not_zero(&rb->refcount))
+ if (!refcount_inc_not_zero(&rb->refcount))
rb = NULL;
}
rcu_read_unlock();
@@ -4399,9 +6684,9 @@ struct ring_buffer *ring_buffer_get(struct perf_event *event)
return rb;
}
-void ring_buffer_put(struct ring_buffer *rb)
+void ring_buffer_put(struct perf_buffer *rb)
{
- if (!atomic_dec_and_test(&rb->refcount))
+ if (!refcount_dec_and_test(&rb->refcount))
return;
WARN_ON_ONCE(!list_empty(&rb->event_list));
@@ -4409,20 +6694,35 @@ void ring_buffer_put(struct ring_buffer *rb)
call_rcu(&rb->rcu_head, rb_free_rcu);
}
+typedef void (*mapped_f)(struct perf_event *event, struct mm_struct *mm);
+
+#define get_mapped(event, func) \
+({ struct pmu *pmu; \
+ mapped_f f = NULL; \
+ guard(rcu)(); \
+ pmu = READ_ONCE(event->pmu); \
+ if (pmu) \
+ f = pmu->func; \
+ f; \
+})
+
static void perf_mmap_open(struct vm_area_struct *vma)
{
struct perf_event *event = vma->vm_file->private_data;
+ mapped_f mapped = get_mapped(event, event_mapped);
- atomic_inc(&event->mmap_count);
- atomic_inc(&event->rb->mmap_count);
+ refcount_inc(&event->mmap_count);
+ refcount_inc(&event->rb->mmap_count);
if (vma->vm_pgoff)
- atomic_inc(&event->rb->aux_mmap_count);
+ refcount_inc(&event->rb->aux_mmap_count);
- if (event->pmu->event_mapped)
- event->pmu->event_mapped(event);
+ if (mapped)
+ mapped(event, vma->vm_mm);
}
+static void perf_pmu_output_stop(struct perf_event *event);
+
/*
* A buffer can be mmap()ed multiple times; either directly through the same
* event, or through other events by use of perf_event_set_output().
@@ -4434,39 +6734,53 @@ static void perf_mmap_open(struct vm_area_struct *vma)
static void perf_mmap_close(struct vm_area_struct *vma)
{
struct perf_event *event = vma->vm_file->private_data;
-
- struct ring_buffer *rb = ring_buffer_get(event);
+ mapped_f unmapped = get_mapped(event, event_unmapped);
+ struct perf_buffer *rb = ring_buffer_get(event);
struct user_struct *mmap_user = rb->mmap_user;
int mmap_locked = rb->mmap_locked;
unsigned long size = perf_data_size(rb);
+ bool detach_rest = false;
- if (event->pmu->event_unmapped)
- event->pmu->event_unmapped(event);
+ /* FIXIES vs perf_pmu_unregister() */
+ if (unmapped)
+ unmapped(event, vma->vm_mm);
/*
- * rb->aux_mmap_count will always drop before rb->mmap_count and
- * event->mmap_count, so it is ok to use event->mmap_mutex to
- * serialize with perf_mmap here.
+ * The AUX buffer is strictly a sub-buffer, serialize using aux_mutex
+ * to avoid complications.
*/
if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
- atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
- atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
- vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
+ refcount_dec_and_mutex_lock(&rb->aux_mmap_count, &rb->aux_mutex)) {
+ /*
+ * Stop all AUX events that are writing to this buffer,
+ * so that we can free its AUX pages and corresponding PMU
+ * data. Note that after rb::aux_mmap_count dropped to zero,
+ * they won't start any more (see perf_aux_output_begin()).
+ */
+ perf_pmu_output_stop(event);
+
+ /* now it's safe to free the pages */
+ atomic_long_sub(rb->aux_nr_pages - rb->aux_mmap_locked, &mmap_user->locked_vm);
+ atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm);
+ /* this has to be the last one */
rb_free_aux(rb);
- mutex_unlock(&event->mmap_mutex);
+ WARN_ON_ONCE(refcount_read(&rb->aux_refcount));
+
+ mutex_unlock(&rb->aux_mutex);
}
- atomic_dec(&rb->mmap_count);
+ if (refcount_dec_and_test(&rb->mmap_count))
+ detach_rest = true;
- if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
+ if (!refcount_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
goto out_put;
ring_buffer_attach(event, NULL);
mutex_unlock(&event->mmap_mutex);
/* If there's still other mmap()s of this buffer, we're done. */
- if (atomic_read(&rb->mmap_count))
+ if (!detach_rest)
goto out_put;
/*
@@ -4520,105 +6834,157 @@ again:
* undo the VM accounting.
*/
- atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
- vma->vm_mm->pinned_vm -= mmap_locked;
+ atomic_long_sub((size >> PAGE_SHIFT) + 1 - mmap_locked,
+ &mmap_user->locked_vm);
+ atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm);
free_uid(mmap_user);
out_put:
ring_buffer_put(rb); /* could be last */
}
+static vm_fault_t perf_mmap_pfn_mkwrite(struct vm_fault *vmf)
+{
+ /* The first page is the user control page, others are read-only. */
+ return vmf->pgoff == 0 ? 0 : VM_FAULT_SIGBUS;
+}
+
+static int perf_mmap_may_split(struct vm_area_struct *vma, unsigned long addr)
+{
+ /*
+ * Forbid splitting perf mappings to prevent refcount leaks due to
+ * the resulting non-matching offsets and sizes. See open()/close().
+ */
+ return -EINVAL;
+}
+
static const struct vm_operations_struct perf_mmap_vmops = {
.open = perf_mmap_open,
- .close = perf_mmap_close, /* non mergable */
- .fault = perf_mmap_fault,
- .page_mkwrite = perf_mmap_fault,
+ .close = perf_mmap_close, /* non mergeable */
+ .pfn_mkwrite = perf_mmap_pfn_mkwrite,
+ .may_split = perf_mmap_may_split,
};
-static int perf_mmap(struct file *file, struct vm_area_struct *vma)
+static int map_range(struct perf_buffer *rb, struct vm_area_struct *vma)
{
- struct perf_event *event = file->private_data;
- unsigned long user_locked, user_lock_limit;
- struct user_struct *user = current_user();
- unsigned long locked, lock_limit;
- struct ring_buffer *rb = NULL;
- unsigned long vma_size;
- unsigned long nr_pages;
- long user_extra = 0, extra = 0;
- int ret = 0, flags = 0;
+ unsigned long nr_pages = vma_pages(vma);
+ int err = 0;
+ unsigned long pagenum;
/*
- * Don't allow mmap() of inherited per-task counters. This would
- * create a performance issue due to all children writing to the
- * same rb.
+ * We map this as a VM_PFNMAP VMA.
+ *
+ * This is not ideal as this is designed broadly for mappings of PFNs
+ * referencing memory-mapped I/O ranges or non-system RAM i.e. for which
+ * !pfn_valid(pfn).
+ *
+ * We are mapping kernel-allocated memory (memory we manage ourselves)
+ * which would more ideally be mapped using vm_insert_page() or a
+ * similar mechanism, that is as a VM_MIXEDMAP mapping.
+ *
+ * However this won't work here, because:
+ *
+ * 1. It uses vma->vm_page_prot, but this field has not been completely
+ * setup at the point of the f_op->mmp() hook, so we are unable to
+ * indicate that this should be mapped CoW in order that the
+ * mkwrite() hook can be invoked to make the first page R/W and the
+ * rest R/O as desired.
+ *
+ * 2. Anything other than a VM_PFNMAP of valid PFNs will result in
+ * vm_normal_page() returning a struct page * pointer, which means
+ * vm_ops->page_mkwrite() will be invoked rather than
+ * vm_ops->pfn_mkwrite(), and this means we have to set page->mapping
+ * to work around retry logic in the fault handler, however this
+ * field is no longer allowed to be used within struct page.
+ *
+ * 3. Having a struct page * made available in the fault logic also
+ * means that the page gets put on the rmap and becomes
+ * inappropriately accessible and subject to map and ref counting.
+ *
+ * Ideally we would have a mechanism that could explicitly express our
+ * desires, but this is not currently the case, so we instead use
+ * VM_PFNMAP.
+ *
+ * We manage the lifetime of these mappings with internal refcounts (see
+ * perf_mmap_open() and perf_mmap_close()) so we ensure the lifetime of
+ * this mapping is maintained correctly.
*/
- if (event->cpu == -1 && event->attr.inherit)
- return -EINVAL;
-
- if (!(vma->vm_flags & VM_SHARED))
- return -EINVAL;
+ for (pagenum = 0; pagenum < nr_pages; pagenum++) {
+ unsigned long va = vma->vm_start + PAGE_SIZE * pagenum;
+ struct page *page = perf_mmap_to_page(rb, vma->vm_pgoff + pagenum);
- vma_size = vma->vm_end - vma->vm_start;
-
- if (vma->vm_pgoff == 0) {
- nr_pages = (vma_size / PAGE_SIZE) - 1;
- } else {
- /*
- * AUX area mapping: if rb->aux_nr_pages != 0, it's already
- * mapped, all subsequent mappings should have the same size
- * and offset. Must be above the normal perf buffer.
- */
- u64 aux_offset, aux_size;
-
- if (!event->rb)
- return -EINVAL;
+ if (page == NULL) {
+ err = -EINVAL;
+ break;
+ }
- nr_pages = vma_size / PAGE_SIZE;
+ /* Map readonly, perf_mmap_pfn_mkwrite() called on write fault. */
+ err = remap_pfn_range(vma, va, page_to_pfn(page), PAGE_SIZE,
+ vm_get_page_prot(vma->vm_flags & ~VM_SHARED));
+ if (err)
+ break;
+ }
- mutex_lock(&event->mmap_mutex);
- ret = -EINVAL;
+#ifdef CONFIG_MMU
+ /* Clear any partial mappings on error. */
+ if (err)
+ zap_page_range_single(vma, vma->vm_start, nr_pages * PAGE_SIZE, NULL);
+#endif
- rb = event->rb;
- if (!rb)
- goto aux_unlock;
+ return err;
+}
- aux_offset = ACCESS_ONCE(rb->user_page->aux_offset);
- aux_size = ACCESS_ONCE(rb->user_page->aux_size);
+static bool perf_mmap_calc_limits(struct vm_area_struct *vma, long *user_extra, long *extra)
+{
+ unsigned long user_locked, user_lock_limit, locked, lock_limit;
+ struct user_struct *user = current_user();
- if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
- goto aux_unlock;
+ user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
+ /* Increase the limit linearly with more CPUs */
+ user_lock_limit *= num_online_cpus();
- if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
- goto aux_unlock;
+ user_locked = atomic_long_read(&user->locked_vm);
- /* already mapped with a different offset */
- if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
- goto aux_unlock;
+ /*
+ * sysctl_perf_event_mlock may have changed, so that
+ * user->locked_vm > user_lock_limit
+ */
+ if (user_locked > user_lock_limit)
+ user_locked = user_lock_limit;
+ user_locked += *user_extra;
- if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
- goto aux_unlock;
+ if (user_locked > user_lock_limit) {
+ /*
+ * charge locked_vm until it hits user_lock_limit;
+ * charge the rest from pinned_vm
+ */
+ *extra = user_locked - user_lock_limit;
+ *user_extra -= *extra;
+ }
- /* already mapped with a different size */
- if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
- goto aux_unlock;
+ lock_limit = rlimit(RLIMIT_MEMLOCK);
+ lock_limit >>= PAGE_SHIFT;
+ locked = atomic64_read(&vma->vm_mm->pinned_vm) + *extra;
- if (!is_power_of_2(nr_pages))
- goto aux_unlock;
+ return locked <= lock_limit || !perf_is_paranoid() || capable(CAP_IPC_LOCK);
+}
- if (!atomic_inc_not_zero(&rb->mmap_count))
- goto aux_unlock;
+static void perf_mmap_account(struct vm_area_struct *vma, long user_extra, long extra)
+{
+ struct user_struct *user = current_user();
- if (rb_has_aux(rb)) {
- atomic_inc(&rb->aux_mmap_count);
- ret = 0;
- goto unlock;
- }
+ atomic_long_add(user_extra, &user->locked_vm);
+ atomic64_add(extra, &vma->vm_mm->pinned_vm);
+}
- atomic_set(&rb->aux_mmap_count, 1);
- user_extra = nr_pages;
+static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event,
+ unsigned long nr_pages)
+{
+ long extra = 0, user_extra = nr_pages;
+ struct perf_buffer *rb;
+ int rb_flags = 0;
- goto accounting;
- }
+ nr_pages -= 1;
/*
* If we have rb pages ensure they're a power-of-two number, so we
@@ -4627,107 +6993,203 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
if (nr_pages != 0 && !is_power_of_2(nr_pages))
return -EINVAL;
- if (vma_size != PAGE_SIZE * (1 + nr_pages))
- return -EINVAL;
-
WARN_ON_ONCE(event->ctx->parent_ctx);
-again:
- mutex_lock(&event->mmap_mutex);
+
if (event->rb) {
- if (event->rb->nr_pages != nr_pages) {
- ret = -EINVAL;
- goto unlock;
- }
+ if (data_page_nr(event->rb) != nr_pages)
+ return -EINVAL;
- if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
+ if (refcount_inc_not_zero(&event->rb->mmap_count)) {
/*
- * Raced against perf_mmap_close() through
- * perf_event_set_output(). Try again, hope for better
- * luck.
+ * Success -- managed to mmap() the same buffer
+ * multiple times.
*/
- mutex_unlock(&event->mmap_mutex);
- goto again;
+ perf_mmap_account(vma, user_extra, extra);
+ refcount_inc(&event->mmap_count);
+ return 0;
}
- goto unlock;
+ /*
+ * Raced against perf_mmap_close()'s
+ * refcount_dec_and_mutex_lock() remove the
+ * event and continue as if !event->rb
+ */
+ ring_buffer_attach(event, NULL);
}
- user_extra = nr_pages + 1;
+ if (!perf_mmap_calc_limits(vma, &user_extra, &extra))
+ return -EPERM;
-accounting:
- user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
+ if (vma->vm_flags & VM_WRITE)
+ rb_flags |= RING_BUFFER_WRITABLE;
+
+ rb = rb_alloc(nr_pages,
+ event->attr.watermark ? event->attr.wakeup_watermark : 0,
+ event->cpu, rb_flags);
+
+ if (!rb)
+ return -ENOMEM;
+
+ refcount_set(&rb->mmap_count, 1);
+ rb->mmap_user = get_current_user();
+ rb->mmap_locked = extra;
+
+ ring_buffer_attach(event, rb);
+
+ perf_event_update_time(event);
+ perf_event_init_userpage(event);
+ perf_event_update_userpage(event);
+
+ perf_mmap_account(vma, user_extra, extra);
+ refcount_set(&event->mmap_count, 1);
+
+ return 0;
+}
+
+static int perf_mmap_aux(struct vm_area_struct *vma, struct perf_event *event,
+ unsigned long nr_pages)
+{
+ long extra = 0, user_extra = nr_pages;
+ u64 aux_offset, aux_size;
+ struct perf_buffer *rb;
+ int ret, rb_flags = 0;
+
+ rb = event->rb;
+ if (!rb)
+ return -EINVAL;
+
+ guard(mutex)(&rb->aux_mutex);
/*
- * Increase the limit linearly with more CPUs:
+ * AUX area mapping: if rb->aux_nr_pages != 0, it's already
+ * mapped, all subsequent mappings should have the same size
+ * and offset. Must be above the normal perf buffer.
*/
- user_lock_limit *= num_online_cpus();
+ aux_offset = READ_ONCE(rb->user_page->aux_offset);
+ aux_size = READ_ONCE(rb->user_page->aux_size);
- user_locked = atomic_long_read(&user->locked_vm) + user_extra;
+ if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
+ return -EINVAL;
- if (user_locked > user_lock_limit)
- extra = user_locked - user_lock_limit;
+ if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
+ return -EINVAL;
- lock_limit = rlimit(RLIMIT_MEMLOCK);
- lock_limit >>= PAGE_SHIFT;
- locked = vma->vm_mm->pinned_vm + extra;
+ /* already mapped with a different offset */
+ if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
+ return -EINVAL;
- if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
- !capable(CAP_IPC_LOCK)) {
- ret = -EPERM;
- goto unlock;
- }
+ if (aux_size != nr_pages * PAGE_SIZE)
+ return -EINVAL;
- WARN_ON(!rb && event->rb);
+ /* already mapped with a different size */
+ if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
+ return -EINVAL;
- if (vma->vm_flags & VM_WRITE)
- flags |= RING_BUFFER_WRITABLE;
+ if (!is_power_of_2(nr_pages))
+ return -EINVAL;
- if (!rb) {
- rb = rb_alloc(nr_pages,
- event->attr.watermark ? event->attr.wakeup_watermark : 0,
- event->cpu, flags);
+ if (!refcount_inc_not_zero(&rb->mmap_count))
+ return -EINVAL;
- if (!rb) {
- ret = -ENOMEM;
- goto unlock;
+ if (rb_has_aux(rb)) {
+ refcount_inc(&rb->aux_mmap_count);
+
+ } else {
+ if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) {
+ refcount_dec(&rb->mmap_count);
+ return -EPERM;
}
- atomic_set(&rb->mmap_count, 1);
- rb->mmap_user = get_current_user();
- rb->mmap_locked = extra;
+ WARN_ON(!rb && event->rb);
- ring_buffer_attach(event, rb);
+ if (vma->vm_flags & VM_WRITE)
+ rb_flags |= RING_BUFFER_WRITABLE;
- perf_event_init_userpage(event);
- perf_event_update_userpage(event);
- } else {
ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
- event->attr.aux_watermark, flags);
- if (!ret)
- rb->aux_mmap_locked = extra;
+ event->attr.aux_watermark, rb_flags);
+ if (ret) {
+ refcount_dec(&rb->mmap_count);
+ return ret;
+ }
+
+ refcount_set(&rb->aux_mmap_count, 1);
+ rb->aux_mmap_locked = extra;
}
-unlock:
- if (!ret) {
- atomic_long_add(user_extra, &user->locked_vm);
- vma->vm_mm->pinned_vm += extra;
+ perf_mmap_account(vma, user_extra, extra);
+ refcount_inc(&event->mmap_count);
- atomic_inc(&event->mmap_count);
- } else if (rb) {
- atomic_dec(&rb->mmap_count);
+ return 0;
+}
+
+static int perf_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct perf_event *event = file->private_data;
+ unsigned long vma_size, nr_pages;
+ mapped_f mapped;
+ int ret;
+
+ /*
+ * Don't allow mmap() of inherited per-task counters. This would
+ * create a performance issue due to all children writing to the
+ * same rb.
+ */
+ if (event->cpu == -1 && event->attr.inherit)
+ return -EINVAL;
+
+ if (!(vma->vm_flags & VM_SHARED))
+ return -EINVAL;
+
+ ret = security_perf_event_read(event);
+ if (ret)
+ return ret;
+
+ vma_size = vma->vm_end - vma->vm_start;
+ nr_pages = vma_size / PAGE_SIZE;
+
+ if (nr_pages > INT_MAX)
+ return -ENOMEM;
+
+ if (vma_size != PAGE_SIZE * nr_pages)
+ return -EINVAL;
+
+ scoped_guard (mutex, &event->mmap_mutex) {
+ /*
+ * This relies on __pmu_detach_event() taking mmap_mutex after marking
+ * the event REVOKED. Either we observe the state, or __pmu_detach_event()
+ * will detach the rb created here.
+ */
+ if (event->state <= PERF_EVENT_STATE_REVOKED)
+ return -ENODEV;
+
+ if (vma->vm_pgoff == 0)
+ ret = perf_mmap_rb(vma, event, nr_pages);
+ else
+ ret = perf_mmap_aux(vma, event, nr_pages);
+ if (ret)
+ return ret;
}
-aux_unlock:
- mutex_unlock(&event->mmap_mutex);
/*
* Since pinned accounting is per vm we cannot allow fork() to copy our
* vma.
*/
- vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
+ vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP);
vma->vm_ops = &perf_mmap_vmops;
- if (event->pmu->event_mapped)
- event->pmu->event_mapped(event);
+ mapped = get_mapped(event, event_mapped);
+ if (mapped)
+ mapped(event, vma->vm_mm);
+
+ /*
+ * Try to map it into the page table. On fail, invoke
+ * perf_mmap_close() to undo the above, as the callsite expects
+ * full cleanup in this case and therefore does not invoke
+ * vmops::close().
+ */
+ ret = map_range(event->rb, vma);
+ if (ret)
+ perf_mmap_close(vma);
return ret;
}
@@ -4738,9 +7200,12 @@ static int perf_fasync(int fd, struct file *filp, int on)
struct perf_event *event = filp->private_data;
int retval;
- mutex_lock(&inode->i_mutex);
+ if (event->state <= PERF_EVENT_STATE_REVOKED)
+ return -ENODEV;
+
+ inode_lock(inode);
retval = fasync_helper(fd, filp, on, &event->fasync);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (retval < 0)
return retval;
@@ -4749,7 +7214,6 @@ static int perf_fasync(int fd, struct file *filp, int on)
}
static const struct file_operations perf_fops = {
- .llseek = no_llseek,
.release = perf_release,
.read = perf_read,
.poll = perf_poll,
@@ -4771,28 +7235,110 @@ void perf_event_wakeup(struct perf_event *event)
ring_buffer_wakeup(event);
if (event->pending_kill) {
- kill_fasync(&event->fasync, SIGIO, event->pending_kill);
+ kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill);
event->pending_kill = 0;
}
}
-static void perf_pending_event(struct irq_work *entry)
+static void perf_sigtrap(struct perf_event *event)
+{
+ /*
+ * Both perf_pending_task() and perf_pending_irq() can race with the
+ * task exiting.
+ */
+ if (current->flags & PF_EXITING)
+ return;
+
+ /*
+ * We'd expect this to only occur if the irq_work is delayed and either
+ * ctx->task or current has changed in the meantime. This can be the
+ * case on architectures that do not implement arch_irq_work_raise().
+ */
+ if (WARN_ON_ONCE(event->ctx->task != current))
+ return;
+
+ send_sig_perf((void __user *)event->pending_addr,
+ event->orig_type, event->attr.sig_data);
+}
+
+/*
+ * Deliver the pending work in-event-context or follow the context.
+ */
+static void __perf_pending_disable(struct perf_event *event)
{
- struct perf_event *event = container_of(entry,
- struct perf_event, pending);
+ int cpu = READ_ONCE(event->oncpu);
+
+ /*
+ * If the event isn't running; we done. event_sched_out() will have
+ * taken care of things.
+ */
+ if (cpu < 0)
+ return;
+
+ /*
+ * Yay, we hit home and are in the context of the event.
+ */
+ if (cpu == smp_processor_id()) {
+ if (event->pending_disable) {
+ event->pending_disable = 0;
+ perf_event_disable_local(event);
+ }
+ return;
+ }
+
+ /*
+ * CPU-A CPU-B
+ *
+ * perf_event_disable_inatomic()
+ * @pending_disable = 1;
+ * irq_work_queue();
+ *
+ * sched-out
+ * @pending_disable = 0;
+ *
+ * sched-in
+ * perf_event_disable_inatomic()
+ * @pending_disable = 1;
+ * irq_work_queue(); // FAILS
+ *
+ * irq_work_run()
+ * perf_pending_disable()
+ *
+ * But the event runs on CPU-B and wants disabling there.
+ */
+ irq_work_queue_on(&event->pending_disable_irq, cpu);
+}
+
+static void perf_pending_disable(struct irq_work *entry)
+{
+ struct perf_event *event = container_of(entry, struct perf_event, pending_disable_irq);
int rctx;
- rctx = perf_swevent_get_recursion_context();
/*
* If we 'fail' here, that's OK, it means recursion is already disabled
* and we won't recurse 'further'.
*/
+ rctx = perf_swevent_get_recursion_context();
+ __perf_pending_disable(event);
+ if (rctx >= 0)
+ perf_swevent_put_recursion_context(rctx);
+}
- if (event->pending_disable) {
- event->pending_disable = 0;
- __perf_event_disable(event);
- }
+static void perf_pending_irq(struct irq_work *entry)
+{
+ struct perf_event *event = container_of(entry, struct perf_event, pending_irq);
+ int rctx;
+
+ /*
+ * If we 'fail' here, that's OK, it means recursion is already disabled
+ * and we won't recurse 'further'.
+ */
+ rctx = perf_swevent_get_recursion_context();
+ /*
+ * The wakeup isn't bound to the context of the event -- it can happen
+ * irrespective of where the event is.
+ */
if (event->pending_wakeup) {
event->pending_wakeup = 0;
perf_event_wakeup(event);
@@ -4802,35 +7348,98 @@ static void perf_pending_event(struct irq_work *entry)
perf_swevent_put_recursion_context(rctx);
}
-/*
- * We assume there is only KVM supporting the callbacks.
- * Later on, we might change it to a list if there is
- * another virtualization implementation supporting the callbacks.
- */
-struct perf_guest_info_callbacks *perf_guest_cbs;
+static void perf_pending_task(struct callback_head *head)
+{
+ struct perf_event *event = container_of(head, struct perf_event, pending_task);
+ int rctx;
+
+ /*
+ * If we 'fail' here, that's OK, it means recursion is already disabled
+ * and we won't recurse 'further'.
+ */
+ rctx = perf_swevent_get_recursion_context();
+
+ if (event->pending_work) {
+ event->pending_work = 0;
+ perf_sigtrap(event);
+ local_dec(&event->ctx->nr_no_switch_fast);
+ }
+ put_event(event);
-int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
+ if (rctx >= 0)
+ perf_swevent_put_recursion_context(rctx);
+}
+
+#ifdef CONFIG_GUEST_PERF_EVENTS
+struct perf_guest_info_callbacks __rcu *perf_guest_cbs;
+
+DEFINE_STATIC_CALL_RET0(__perf_guest_state, *perf_guest_cbs->state);
+DEFINE_STATIC_CALL_RET0(__perf_guest_get_ip, *perf_guest_cbs->get_ip);
+DEFINE_STATIC_CALL_RET0(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr);
+
+void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
{
- perf_guest_cbs = cbs;
- return 0;
+ if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs)))
+ return;
+
+ rcu_assign_pointer(perf_guest_cbs, cbs);
+ static_call_update(__perf_guest_state, cbs->state);
+ static_call_update(__perf_guest_get_ip, cbs->get_ip);
+
+ /* Implementing ->handle_intel_pt_intr is optional. */
+ if (cbs->handle_intel_pt_intr)
+ static_call_update(__perf_guest_handle_intel_pt_intr,
+ cbs->handle_intel_pt_intr);
}
EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
-int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
+void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
{
- perf_guest_cbs = NULL;
- return 0;
+ if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs) != cbs))
+ return;
+
+ rcu_assign_pointer(perf_guest_cbs, NULL);
+ static_call_update(__perf_guest_state, (void *)&__static_call_return0);
+ static_call_update(__perf_guest_get_ip, (void *)&__static_call_return0);
+ static_call_update(__perf_guest_handle_intel_pt_intr,
+ (void *)&__static_call_return0);
+ synchronize_rcu();
}
EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
+#endif
+
+static bool should_sample_guest(struct perf_event *event)
+{
+ return !event->attr.exclude_guest && perf_guest_state();
+}
+
+unsigned long perf_misc_flags(struct perf_event *event,
+ struct pt_regs *regs)
+{
+ if (should_sample_guest(event))
+ return perf_arch_guest_misc_flags(regs);
+
+ return perf_arch_misc_flags(regs);
+}
+
+unsigned long perf_instruction_pointer(struct perf_event *event,
+ struct pt_regs *regs)
+{
+ if (should_sample_guest(event))
+ return perf_guest_get_ip();
+
+ return perf_arch_instruction_pointer(regs);
+}
static void
perf_output_sample_regs(struct perf_output_handle *handle,
struct pt_regs *regs, u64 mask)
{
int bit;
+ DECLARE_BITMAP(_mask, 64);
- for_each_set_bit(bit, (const unsigned long *) &mask,
- sizeof(mask) * BITS_PER_BYTE) {
+ bitmap_from_u64(_mask, mask);
+ for_each_set_bit(bit, _mask, sizeof(mask) * BITS_PER_BYTE) {
u64 val;
val = perf_reg_value(regs, bit);
@@ -4839,14 +7448,13 @@ perf_output_sample_regs(struct perf_output_handle *handle,
}
static void perf_sample_regs_user(struct perf_regs *regs_user,
- struct pt_regs *regs,
- struct pt_regs *regs_user_copy)
+ struct pt_regs *regs)
{
if (user_mode(regs)) {
regs_user->abi = perf_reg_abi(current);
regs_user->regs = regs;
- } else if (current->mm) {
- perf_get_regs_user(regs_user, regs, regs_user_copy);
+ } else if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER))) {
+ perf_get_regs_user(regs_user, regs);
} else {
regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
regs_user->regs = NULL;
@@ -4865,7 +7473,7 @@ static void perf_sample_regs_intr(struct perf_regs *regs_intr,
* Get remaining task size from user stack pointer.
*
* It'd be better to take stack vma map and limit this more
- * precisly, but there's no way to get it safely under interrupt,
+ * precisely, but there's no way to get it safely under interrupt,
* so using TASK_SIZE as limit.
*/
static u64 perf_ustack_task_size(struct pt_regs *regs)
@@ -4888,6 +7496,10 @@ perf_sample_ustack_size(u16 stack_size, u16 header_size,
if (!regs)
return 0;
+ /* No mm, no stack, no dump. */
+ if (!current->mm)
+ return 0;
+
/*
* Check if we fit in with the requested stack size into the:
* - TASK_SIZE
@@ -4956,14 +7568,136 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
}
}
-static void __perf_event_header__init_id(struct perf_event_header *header,
- struct perf_sample_data *data,
- struct perf_event *event)
+static unsigned long perf_prepare_sample_aux(struct perf_event *event,
+ struct perf_sample_data *data,
+ size_t size)
{
- u64 sample_type = event->attr.sample_type;
+ struct perf_event *sampler = event->aux_event;
+ struct perf_buffer *rb;
- data->type = sample_type;
- header->size += event->id_header_size;
+ data->aux_size = 0;
+
+ if (!sampler)
+ goto out;
+
+ if (WARN_ON_ONCE(READ_ONCE(sampler->state) != PERF_EVENT_STATE_ACTIVE))
+ goto out;
+
+ if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id()))
+ goto out;
+
+ rb = ring_buffer_get(sampler);
+ if (!rb)
+ goto out;
+
+ /*
+ * If this is an NMI hit inside sampling code, don't take
+ * the sample. See also perf_aux_sample_output().
+ */
+ if (READ_ONCE(rb->aux_in_sampling)) {
+ data->aux_size = 0;
+ } else {
+ size = min_t(size_t, size, perf_aux_size(rb));
+ data->aux_size = ALIGN(size, sizeof(u64));
+ }
+ ring_buffer_put(rb);
+
+out:
+ return data->aux_size;
+}
+
+static long perf_pmu_snapshot_aux(struct perf_buffer *rb,
+ struct perf_event *event,
+ struct perf_output_handle *handle,
+ unsigned long size)
+{
+ unsigned long flags;
+ long ret;
+
+ /*
+ * Normal ->start()/->stop() callbacks run in IRQ mode in scheduler
+ * paths. If we start calling them in NMI context, they may race with
+ * the IRQ ones, that is, for example, re-starting an event that's just
+ * been stopped, which is why we're using a separate callback that
+ * doesn't change the event state.
+ *
+ * IRQs need to be disabled to prevent IPIs from racing with us.
+ */
+ local_irq_save(flags);
+ /*
+ * Guard against NMI hits inside the critical section;
+ * see also perf_prepare_sample_aux().
+ */
+ WRITE_ONCE(rb->aux_in_sampling, 1);
+ barrier();
+
+ ret = event->pmu->snapshot_aux(event, handle, size);
+
+ barrier();
+ WRITE_ONCE(rb->aux_in_sampling, 0);
+ local_irq_restore(flags);
+
+ return ret;
+}
+
+static void perf_aux_sample_output(struct perf_event *event,
+ struct perf_output_handle *handle,
+ struct perf_sample_data *data)
+{
+ struct perf_event *sampler = event->aux_event;
+ struct perf_buffer *rb;
+ unsigned long pad;
+ long size;
+
+ if (WARN_ON_ONCE(!sampler || !data->aux_size))
+ return;
+
+ rb = ring_buffer_get(sampler);
+ if (!rb)
+ return;
+
+ size = perf_pmu_snapshot_aux(rb, sampler, handle, data->aux_size);
+
+ /*
+ * An error here means that perf_output_copy() failed (returned a
+ * non-zero surplus that it didn't copy), which in its current
+ * enlightened implementation is not possible. If that changes, we'd
+ * like to know.
+ */
+ if (WARN_ON_ONCE(size < 0))
+ goto out_put;
+
+ /*
+ * The pad comes from ALIGN()ing data->aux_size up to u64 in
+ * perf_prepare_sample_aux(), so should not be more than that.
+ */
+ pad = data->aux_size - size;
+ if (WARN_ON_ONCE(pad >= sizeof(u64)))
+ pad = 8;
+
+ if (pad) {
+ u64 zero = 0;
+ perf_output_copy(handle, &zero, pad);
+ }
+
+out_put:
+ ring_buffer_put(rb);
+}
+
+/*
+ * A set of common sample data types saved even for non-sample records
+ * when event->attr.sample_id_all is set.
+ */
+#define PERF_SAMPLE_ID_ALL (PERF_SAMPLE_TID | PERF_SAMPLE_TIME | \
+ PERF_SAMPLE_ID | PERF_SAMPLE_STREAM_ID | \
+ PERF_SAMPLE_CPU | PERF_SAMPLE_IDENTIFIER)
+
+static void __perf_event_header__init_id(struct perf_sample_data *data,
+ struct perf_event *event,
+ u64 sample_type)
+{
+ data->type = event->attr.sample_type;
+ data->sample_flags |= data->type & PERF_SAMPLE_ID_ALL;
if (sample_type & PERF_SAMPLE_TID) {
/* namespace issues */
@@ -4990,8 +7724,10 @@ void perf_event_header__init_id(struct perf_event_header *header,
struct perf_sample_data *data,
struct perf_event *event)
{
- if (event->attr.sample_id_all)
- __perf_event_header__init_id(header, data, event);
+ if (event->attr.sample_id_all) {
+ header->size += event->id_header_size;
+ __perf_event_header__init_id(data, event, event->attr.sample_type);
+ }
}
static void __perf_event__output_id_sample(struct perf_output_handle *handle,
@@ -5031,10 +7767,10 @@ static void perf_output_read_one(struct perf_output_handle *handle,
u64 enabled, u64 running)
{
u64 read_format = event->attr.read_format;
- u64 values[4];
+ u64 values[5];
int n = 0;
- values[n++] = perf_event_count(event);
+ values[n++] = perf_event_count(event, has_inherit_and_sample_read(&event->attr));
if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
values[n++] = enabled +
atomic64_read(&event->child_total_time_enabled);
@@ -5045,21 +7781,28 @@ static void perf_output_read_one(struct perf_output_handle *handle,
}
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(event);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = atomic64_read(&event->lost_samples);
__output_copy(handle, values, n * sizeof(u64));
}
-/*
- * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
- */
static void perf_output_read_group(struct perf_output_handle *handle,
- struct perf_event *event,
- u64 enabled, u64 running)
+ struct perf_event *event,
+ u64 enabled, u64 running)
{
struct perf_event *leader = event->group_leader, *sub;
u64 read_format = event->attr.read_format;
- u64 values[5];
+ unsigned long flags;
+ u64 values[6];
int n = 0;
+ bool self = has_inherit_and_sample_read(&event->attr);
+
+ /*
+ * Disabling interrupts avoids all counter scheduling
+ * (context switches, timer based rotation and IPIs).
+ */
+ local_irq_save(flags);
values[n++] = 1 + leader->nr_siblings;
@@ -5069,33 +7812,49 @@ static void perf_output_read_group(struct perf_output_handle *handle,
if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
values[n++] = running;
- if (leader != event)
- leader->pmu->read(leader);
+ if ((leader != event) && !handle->skip_read)
+ perf_pmu_read(leader);
- values[n++] = perf_event_count(leader);
+ values[n++] = perf_event_count(leader, self);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(leader);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = atomic64_read(&leader->lost_samples);
__output_copy(handle, values, n * sizeof(u64));
- list_for_each_entry(sub, &leader->sibling_list, group_entry) {
+ for_each_sibling_event(sub, leader) {
n = 0;
- if ((sub != event) &&
- (sub->state == PERF_EVENT_STATE_ACTIVE))
- sub->pmu->read(sub);
+ if ((sub != event) && !handle->skip_read)
+ perf_pmu_read(sub);
- values[n++] = perf_event_count(sub);
+ values[n++] = perf_event_count(sub, self);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(sub);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = atomic64_read(&sub->lost_samples);
__output_copy(handle, values, n * sizeof(u64));
}
+
+ local_irq_restore(flags);
}
#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
PERF_FORMAT_TOTAL_TIME_RUNNING)
+/*
+ * XXX PERF_SAMPLE_READ vs inherited events seems difficult.
+ *
+ * The problem is that its both hard and excessively expensive to iterate the
+ * child list, not to mention that its impossible to IPI the children running
+ * on another CPU, from interrupt/NMI context.
+ *
+ * Instead the combination of PERF_SAMPLE_READ and inherit will track per-thread
+ * counts rather than attempting to accumulate some value across all children on
+ * all cores.
+ */
static void perf_output_read(struct perf_output_handle *handle,
struct perf_event *event)
{
@@ -5127,6 +7886,9 @@ void perf_output_sample(struct perf_output_handle *handle,
{
u64 sample_type = data->type;
+ if (data->sample_flags & PERF_SAMPLE_READ)
+ handle->skip_read = 1;
+
perf_output_put(handle, *header);
if (sample_type & PERF_SAMPLE_IDENTIFIER)
@@ -5160,26 +7922,34 @@ void perf_output_sample(struct perf_output_handle *handle,
perf_output_read(handle, event);
if (sample_type & PERF_SAMPLE_CALLCHAIN) {
- if (data->callchain) {
- int size = 1;
-
- if (data->callchain)
- size += data->callchain->nr;
-
- size *= sizeof(u64);
+ int size = 1;
- __output_copy(handle, data->callchain, size);
- } else {
- u64 nr = 0;
- perf_output_put(handle, nr);
- }
+ size += data->callchain->nr;
+ size *= sizeof(u64);
+ __output_copy(handle, data->callchain, size);
}
if (sample_type & PERF_SAMPLE_RAW) {
- if (data->raw) {
- perf_output_put(handle, data->raw->size);
- __output_copy(handle, data->raw->data,
- data->raw->size);
+ struct perf_raw_record *raw = data->raw;
+
+ if (raw) {
+ struct perf_raw_frag *frag = &raw->frag;
+
+ perf_output_put(handle, raw->size);
+ do {
+ if (frag->copy) {
+ __output_custom(handle, frag->copy,
+ frag->data, frag->size);
+ } else {
+ __output_copy(handle, frag->data,
+ frag->size);
+ }
+ if (perf_raw_frag_last(frag))
+ break;
+ frag = frag->next;
+ } while (1);
+ if (frag->pad)
+ __output_skip(handle, NULL, frag->pad);
} else {
struct {
u32 size;
@@ -5200,7 +7970,17 @@ void perf_output_sample(struct perf_output_handle *handle,
* sizeof(struct perf_branch_entry);
perf_output_put(handle, data->br_stack->nr);
+ if (branch_sample_hw_index(event))
+ perf_output_put(handle, data->br_stack->hw_idx);
perf_output_copy(handle, data->br_stack->entries, size);
+ /*
+ * Add the extension space which is appended
+ * right after the struct perf_branch_stack.
+ */
+ if (data->br_stack_cntr) {
+ size = data->br_stack->nr * sizeof(u64);
+ perf_output_copy(handle, data->br_stack_cntr, size);
+ }
} else {
/*
* we always store at least the value of nr
@@ -5233,8 +8013,8 @@ void perf_output_sample(struct perf_output_handle *handle,
data->regs_user.regs);
}
- if (sample_type & PERF_SAMPLE_WEIGHT)
- perf_output_put(handle, data->weight);
+ if (sample_type & PERF_SAMPLE_WEIGHT_TYPE)
+ perf_output_put(handle, data->weight.full);
if (sample_type & PERF_SAMPLE_DATA_SRC)
perf_output_put(handle, data->data_src.val);
@@ -5259,11 +8039,30 @@ void perf_output_sample(struct perf_output_handle *handle,
}
}
+ if (sample_type & PERF_SAMPLE_PHYS_ADDR)
+ perf_output_put(handle, data->phys_addr);
+
+ if (sample_type & PERF_SAMPLE_CGROUP)
+ perf_output_put(handle, data->cgroup);
+
+ if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
+ perf_output_put(handle, data->data_page_size);
+
+ if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
+ perf_output_put(handle, data->code_page_size);
+
+ if (sample_type & PERF_SAMPLE_AUX) {
+ perf_output_put(handle, data->aux_size);
+
+ if (data->aux_size)
+ perf_aux_sample_output(event, handle, data);
+ }
+
if (!event->attr.watermark) {
int wakeup_events = event->attr.wakeup_events;
if (wakeup_events) {
- struct ring_buffer *rb = handle->rb;
+ struct perf_buffer *rb = handle->rb;
int events = local_inc_return(&rb->events);
if (events >= wakeup_events) {
@@ -5274,61 +8073,228 @@ void perf_output_sample(struct perf_output_handle *handle,
}
}
-void perf_prepare_sample(struct perf_event_header *header,
- struct perf_sample_data *data,
- struct perf_event *event,
- struct pt_regs *regs)
+static u64 perf_virt_to_phys(u64 virt)
{
- u64 sample_type = event->attr.sample_type;
+ u64 phys_addr = 0;
- header->type = PERF_RECORD_SAMPLE;
- header->size = sizeof(*header) + event->header_size;
+ if (!virt)
+ return 0;
- header->misc = 0;
- header->misc |= perf_misc_flags(regs);
+ if (virt >= TASK_SIZE) {
+ /* If it's vmalloc()d memory, leave phys_addr as 0 */
+ if (virt_addr_valid((void *)(uintptr_t)virt) &&
+ !(virt >= VMALLOC_START && virt < VMALLOC_END))
+ phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt);
+ } else {
+ /*
+ * Walking the pages tables for user address.
+ * Interrupts are disabled, so it prevents any tear down
+ * of the page tables.
+ * Try IRQ-safe get_user_page_fast_only first.
+ * If failed, leave phys_addr as 0.
+ */
+ if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER))) {
+ struct page *p;
- __perf_event_header__init_id(header, data, event);
+ pagefault_disable();
+ if (get_user_page_fast_only(virt, 0, &p)) {
+ phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
+ put_page(p);
+ }
+ pagefault_enable();
+ }
+ }
- if (sample_type & PERF_SAMPLE_IP)
- data->ip = perf_instruction_pointer(regs);
+ return phys_addr;
+}
- if (sample_type & PERF_SAMPLE_CALLCHAIN) {
- int size = 1;
+/*
+ * Return the pagetable size of a given virtual address.
+ */
+static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr)
+{
+ u64 size = 0;
+
+#ifdef CONFIG_HAVE_GUP_FAST
+ pgd_t *pgdp, pgd;
+ p4d_t *p4dp, p4d;
+ pud_t *pudp, pud;
+ pmd_t *pmdp, pmd;
+ pte_t *ptep, pte;
+
+ pgdp = pgd_offset(mm, addr);
+ pgd = READ_ONCE(*pgdp);
+ if (pgd_none(pgd))
+ return 0;
+
+ if (pgd_leaf(pgd))
+ return pgd_leaf_size(pgd);
+
+ p4dp = p4d_offset_lockless(pgdp, pgd, addr);
+ p4d = READ_ONCE(*p4dp);
+ if (!p4d_present(p4d))
+ return 0;
- data->callchain = perf_callchain(event, regs);
+ if (p4d_leaf(p4d))
+ return p4d_leaf_size(p4d);
- if (data->callchain)
- size += data->callchain->nr;
+ pudp = pud_offset_lockless(p4dp, p4d, addr);
+ pud = READ_ONCE(*pudp);
+ if (!pud_present(pud))
+ return 0;
+
+ if (pud_leaf(pud))
+ return pud_leaf_size(pud);
+
+ pmdp = pmd_offset_lockless(pudp, pud, addr);
+again:
+ pmd = pmdp_get_lockless(pmdp);
+ if (!pmd_present(pmd))
+ return 0;
+
+ if (pmd_leaf(pmd))
+ return pmd_leaf_size(pmd);
+
+ ptep = pte_offset_map(&pmd, addr);
+ if (!ptep)
+ goto again;
+
+ pte = ptep_get_lockless(ptep);
+ if (pte_present(pte))
+ size = __pte_leaf_size(pmd, pte);
+ pte_unmap(ptep);
+#endif /* CONFIG_HAVE_GUP_FAST */
+
+ return size;
+}
+
+static u64 perf_get_page_size(unsigned long addr)
+{
+ struct mm_struct *mm;
+ unsigned long flags;
+ u64 size;
+
+ if (!addr)
+ return 0;
+
+ /*
+ * Software page-table walkers must disable IRQs,
+ * which prevents any tear down of the page tables.
+ */
+ local_irq_save(flags);
- header->size += size * sizeof(u64);
+ mm = current->mm;
+ if (!mm) {
+ /*
+ * For kernel threads and the like, use init_mm so that
+ * we can find kernel memory.
+ */
+ mm = &init_mm;
}
- if (sample_type & PERF_SAMPLE_RAW) {
- int size = sizeof(u32);
+ size = perf_get_pgtable_size(mm, addr);
- if (data->raw)
- size += data->raw->size;
- else
- size += sizeof(u32);
+ local_irq_restore(flags);
+
+ return size;
+}
+
+static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
+
+static struct unwind_work perf_unwind_work;
+
+struct perf_callchain_entry *
+perf_callchain(struct perf_event *event, struct pt_regs *regs)
+{
+ bool kernel = !event->attr.exclude_callchain_kernel;
+ bool user = !event->attr.exclude_callchain_user &&
+ !(current->flags & (PF_KTHREAD | PF_USER_WORKER));
+ /* Disallow cross-task user callchains. */
+ bool crosstask = event->ctx->task && event->ctx->task != current;
+ bool defer_user = IS_ENABLED(CONFIG_UNWIND_USER) && user &&
+ event->attr.defer_callchain;
+ const u32 max_stack = event->attr.sample_max_stack;
+ struct perf_callchain_entry *callchain;
+ u64 defer_cookie;
+
+ if (!current->mm)
+ user = false;
+
+ if (!kernel && !user)
+ return &__empty_callchain;
+
+ if (!(user && defer_user && !crosstask &&
+ unwind_deferred_request(&perf_unwind_work, &defer_cookie) >= 0))
+ defer_cookie = 0;
+
+ callchain = get_perf_callchain(regs, kernel, user, max_stack,
+ crosstask, true, defer_cookie);
+
+ return callchain ?: &__empty_callchain;
+}
+
+static __always_inline u64 __cond_set(u64 flags, u64 s, u64 d)
+{
+ return d * !!(flags & s);
+}
+
+void perf_prepare_sample(struct perf_sample_data *data,
+ struct perf_event *event,
+ struct pt_regs *regs)
+{
+ u64 sample_type = event->attr.sample_type;
+ u64 filtered_sample_type;
+
+ /*
+ * Add the sample flags that are dependent to others. And clear the
+ * sample flags that have already been done by the PMU driver.
+ */
+ filtered_sample_type = sample_type;
+ filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_CODE_PAGE_SIZE,
+ PERF_SAMPLE_IP);
+ filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_DATA_PAGE_SIZE |
+ PERF_SAMPLE_PHYS_ADDR, PERF_SAMPLE_ADDR);
+ filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_STACK_USER,
+ PERF_SAMPLE_REGS_USER);
+ filtered_sample_type &= ~data->sample_flags;
- WARN_ON_ONCE(size & (sizeof(u64)-1));
- header->size += size;
+ if (filtered_sample_type == 0) {
+ /* Make sure it has the correct data->type for output */
+ data->type = event->attr.sample_type;
+ return;
}
- if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
- int size = sizeof(u64); /* nr */
- if (data->br_stack) {
- size += data->br_stack->nr
- * sizeof(struct perf_branch_entry);
- }
- header->size += size;
+ __perf_event_header__init_id(data, event, filtered_sample_type);
+
+ if (filtered_sample_type & PERF_SAMPLE_IP) {
+ data->ip = perf_instruction_pointer(event, regs);
+ data->sample_flags |= PERF_SAMPLE_IP;
}
- if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
- perf_sample_regs_user(&data->regs_user, regs,
- &data->regs_user_copy);
+ if (filtered_sample_type & PERF_SAMPLE_CALLCHAIN)
+ perf_sample_save_callchain(data, event, regs);
- if (sample_type & PERF_SAMPLE_REGS_USER) {
+ if (filtered_sample_type & PERF_SAMPLE_RAW) {
+ data->raw = NULL;
+ data->dyn_size += sizeof(u64);
+ data->sample_flags |= PERF_SAMPLE_RAW;
+ }
+
+ if (filtered_sample_type & PERF_SAMPLE_BRANCH_STACK) {
+ data->br_stack = NULL;
+ data->dyn_size += sizeof(u64);
+ data->sample_flags |= PERF_SAMPLE_BRANCH_STACK;
+ }
+
+ if (filtered_sample_type & PERF_SAMPLE_REGS_USER)
+ perf_sample_regs_user(&data->regs_user, regs);
+
+ /*
+ * It cannot use the filtered_sample_type here as REGS_USER can be set
+ * by STACK_USER (using __cond_set() above) and we don't want to update
+ * the dyn_size if it's not requested by users.
+ */
+ if ((sample_type & ~data->sample_flags) & PERF_SAMPLE_REGS_USER) {
/* regs dump ABI info */
int size = sizeof(u64);
@@ -5337,20 +8303,22 @@ void perf_prepare_sample(struct perf_event_header *header,
size += hweight64(mask) * sizeof(u64);
}
- header->size += size;
+ data->dyn_size += size;
+ data->sample_flags |= PERF_SAMPLE_REGS_USER;
}
- if (sample_type & PERF_SAMPLE_STACK_USER) {
+ if (filtered_sample_type & PERF_SAMPLE_STACK_USER) {
/*
- * Either we need PERF_SAMPLE_STACK_USER bit to be allways
+ * Either we need PERF_SAMPLE_STACK_USER bit to be always
* processed as the last one or have additional check added
* in case new sample type is added, because we could eat
* up the rest of the sample size.
*/
u16 stack_size = event->attr.sample_stack_user;
+ u16 header_size = perf_sample_data_size(data, event);
u16 size = sizeof(u64);
- stack_size = perf_sample_ustack_size(stack_size, header->size,
+ stack_size = perf_sample_ustack_size(stack_size, header_size,
data->regs_user.regs);
/*
@@ -5362,10 +8330,31 @@ void perf_prepare_sample(struct perf_event_header *header,
size += sizeof(u64) + stack_size;
data->stack_user_size = stack_size;
- header->size += size;
+ data->dyn_size += size;
+ data->sample_flags |= PERF_SAMPLE_STACK_USER;
}
- if (sample_type & PERF_SAMPLE_REGS_INTR) {
+ if (filtered_sample_type & PERF_SAMPLE_WEIGHT_TYPE) {
+ data->weight.full = 0;
+ data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE;
+ }
+
+ if (filtered_sample_type & PERF_SAMPLE_DATA_SRC) {
+ data->data_src.val = PERF_MEM_NA;
+ data->sample_flags |= PERF_SAMPLE_DATA_SRC;
+ }
+
+ if (filtered_sample_type & PERF_SAMPLE_TRANSACTION) {
+ data->txn = 0;
+ data->sample_flags |= PERF_SAMPLE_TRANSACTION;
+ }
+
+ if (filtered_sample_type & PERF_SAMPLE_ADDR) {
+ data->addr = 0;
+ data->sample_flags |= PERF_SAMPLE_ADDR;
+ }
+
+ if (filtered_sample_type & PERF_SAMPLE_REGS_INTR) {
/* regs dump ABI info */
int size = sizeof(u64);
@@ -5377,23 +8366,148 @@ void perf_prepare_sample(struct perf_event_header *header,
size += hweight64(mask) * sizeof(u64);
}
- header->size += size;
+ data->dyn_size += size;
+ data->sample_flags |= PERF_SAMPLE_REGS_INTR;
+ }
+
+ if (filtered_sample_type & PERF_SAMPLE_PHYS_ADDR) {
+ data->phys_addr = perf_virt_to_phys(data->addr);
+ data->sample_flags |= PERF_SAMPLE_PHYS_ADDR;
+ }
+
+#ifdef CONFIG_CGROUP_PERF
+ if (filtered_sample_type & PERF_SAMPLE_CGROUP) {
+ struct cgroup *cgrp;
+
+ /* protected by RCU */
+ cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup;
+ data->cgroup = cgroup_id(cgrp);
+ data->sample_flags |= PERF_SAMPLE_CGROUP;
+ }
+#endif
+
+ /*
+ * PERF_DATA_PAGE_SIZE requires PERF_SAMPLE_ADDR. If the user doesn't
+ * require PERF_SAMPLE_ADDR, kernel implicitly retrieve the data->addr,
+ * but the value will not dump to the userspace.
+ */
+ if (filtered_sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) {
+ data->data_page_size = perf_get_page_size(data->addr);
+ data->sample_flags |= PERF_SAMPLE_DATA_PAGE_SIZE;
+ }
+
+ if (filtered_sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) {
+ data->code_page_size = perf_get_page_size(data->ip);
+ data->sample_flags |= PERF_SAMPLE_CODE_PAGE_SIZE;
+ }
+
+ if (filtered_sample_type & PERF_SAMPLE_AUX) {
+ u64 size;
+ u16 header_size = perf_sample_data_size(data, event);
+
+ header_size += sizeof(u64); /* size */
+
+ /*
+ * Given the 16bit nature of header::size, an AUX sample can
+ * easily overflow it, what with all the preceding sample bits.
+ * Make sure this doesn't happen by using up to U16_MAX bytes
+ * per sample in total (rounded down to 8 byte boundary).
+ */
+ size = min_t(size_t, U16_MAX - header_size,
+ event->attr.aux_sample_size);
+ size = rounddown(size, 8);
+ size = perf_prepare_sample_aux(event, data, size);
+
+ WARN_ON_ONCE(size + header_size > U16_MAX);
+ data->dyn_size += size + sizeof(u64); /* size above */
+ data->sample_flags |= PERF_SAMPLE_AUX;
}
}
-static void perf_event_output(struct perf_event *event,
- struct perf_sample_data *data,
- struct pt_regs *regs)
+void perf_prepare_header(struct perf_event_header *header,
+ struct perf_sample_data *data,
+ struct perf_event *event,
+ struct pt_regs *regs)
+{
+ header->type = PERF_RECORD_SAMPLE;
+ header->size = perf_sample_data_size(data, event);
+ header->misc = perf_misc_flags(event, regs);
+
+ /*
+ * If you're adding more sample types here, you likely need to do
+ * something about the overflowing header::size, like repurpose the
+ * lowest 3 bits of size, which should be always zero at the moment.
+ * This raises a more important question, do we really need 512k sized
+ * samples and why, so good argumentation is in order for whatever you
+ * do here next.
+ */
+ WARN_ON_ONCE(header->size & 7);
+}
+
+static void __perf_event_aux_pause(struct perf_event *event, bool pause)
+{
+ if (pause) {
+ if (!event->hw.aux_paused) {
+ event->hw.aux_paused = 1;
+ event->pmu->stop(event, PERF_EF_PAUSE);
+ }
+ } else {
+ if (event->hw.aux_paused) {
+ event->hw.aux_paused = 0;
+ event->pmu->start(event, PERF_EF_RESUME);
+ }
+ }
+}
+
+static void perf_event_aux_pause(struct perf_event *event, bool pause)
+{
+ struct perf_buffer *rb;
+
+ if (WARN_ON_ONCE(!event))
+ return;
+
+ rb = ring_buffer_get(event);
+ if (!rb)
+ return;
+
+ scoped_guard (irqsave) {
+ /*
+ * Guard against self-recursion here. Another event could trip
+ * this same from NMI context.
+ */
+ if (READ_ONCE(rb->aux_in_pause_resume))
+ break;
+
+ WRITE_ONCE(rb->aux_in_pause_resume, 1);
+ barrier();
+ __perf_event_aux_pause(event, pause);
+ barrier();
+ WRITE_ONCE(rb->aux_in_pause_resume, 0);
+ }
+ ring_buffer_put(rb);
+}
+
+static __always_inline int
+__perf_event_output(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs,
+ int (*output_begin)(struct perf_output_handle *,
+ struct perf_sample_data *,
+ struct perf_event *,
+ unsigned int))
{
struct perf_output_handle handle;
struct perf_event_header header;
+ int err;
/* protect the callchain buffers */
rcu_read_lock();
- perf_prepare_sample(&header, data, event, regs);
+ perf_prepare_sample(data, event, regs);
+ perf_prepare_header(&header, data, event, regs);
- if (perf_output_begin(&handle, event, header.size))
+ err = output_begin(&handle, data, event, header.size);
+ if (err)
goto exit;
perf_output_sample(&handle, &header, data, event);
@@ -5402,6 +8516,31 @@ static void perf_event_output(struct perf_event *event,
exit:
rcu_read_unlock();
+ return err;
+}
+
+void
+perf_event_output_forward(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ __perf_event_output(event, data, regs, perf_output_begin_forward);
+}
+
+void
+perf_event_output_backward(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ __perf_event_output(event, data, regs, perf_output_begin_backward);
+}
+
+int
+perf_event_output(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ return __perf_event_output(event, data, regs, perf_output_begin);
}
/*
@@ -5433,7 +8572,7 @@ perf_event_read_event(struct perf_event *event,
int ret;
perf_event_header__init_id(&read_event.header, &sample, event);
- ret = perf_output_begin(&handle, event, read_event.header.size);
+ ret = perf_output_begin(&handle, &sample, event, read_event.header.size);
if (ret)
return;
@@ -5444,16 +8583,41 @@ perf_event_read_event(struct perf_event *event,
perf_output_end(&handle);
}
-typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data);
+typedef void (perf_iterate_f)(struct perf_event *event, void *data);
static void
-perf_event_aux_ctx(struct perf_event_context *ctx,
- perf_event_aux_output_cb output,
- void *data)
+perf_iterate_ctx(struct perf_event_context *ctx,
+ perf_iterate_f output,
+ void *data, bool all)
{
struct perf_event *event;
list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
+ if (!all) {
+ if (event->state < PERF_EVENT_STATE_INACTIVE)
+ continue;
+ if (!event_filter_match(event))
+ continue;
+ }
+
+ output(event, data);
+ }
+}
+
+static void perf_iterate_sb_cpu(perf_iterate_f output, void *data)
+{
+ struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events);
+ struct perf_event *event;
+
+ list_for_each_entry_rcu(event, &pel->list, sb_list) {
+ /*
+ * Skip events that are not fully formed yet; ensure that
+ * if we observe event->ctx, both event and ctx will be
+ * complete enough. See perf_install_in_context().
+ */
+ if (!smp_load_acquire(&event->ctx))
+ continue;
+
if (event->state < PERF_EVENT_STATE_INACTIVE)
continue;
if (!event_filter_match(event))
@@ -5462,37 +8626,169 @@ perf_event_aux_ctx(struct perf_event_context *ctx,
}
}
+/*
+ * Iterate all events that need to receive side-band events.
+ *
+ * For new callers; ensure that account_pmu_sb_event() includes
+ * your event, otherwise it might not get delivered.
+ */
static void
-perf_event_aux(perf_event_aux_output_cb output, void *data,
+perf_iterate_sb(perf_iterate_f output, void *data,
struct perf_event_context *task_ctx)
{
- struct perf_cpu_context *cpuctx;
struct perf_event_context *ctx;
- struct pmu *pmu;
- int ctxn;
rcu_read_lock();
- list_for_each_entry_rcu(pmu, &pmus, entry) {
- cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
- if (cpuctx->unique_pmu != pmu)
- goto next;
- perf_event_aux_ctx(&cpuctx->ctx, output, data);
- if (task_ctx)
- goto next;
- ctxn = pmu->task_ctx_nr;
- if (ctxn < 0)
- goto next;
- ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
- if (ctx)
- perf_event_aux_ctx(ctx, output, data);
-next:
- put_cpu_ptr(pmu->pmu_cpu_context);
- }
+ preempt_disable();
+ /*
+ * If we have task_ctx != NULL we only notify the task context itself.
+ * The task_ctx is set only for EXIT events before releasing task
+ * context.
+ */
if (task_ctx) {
- preempt_disable();
- perf_event_aux_ctx(task_ctx, output, data);
- preempt_enable();
+ perf_iterate_ctx(task_ctx, output, data, false);
+ goto done;
+ }
+
+ perf_iterate_sb_cpu(output, data);
+
+ ctx = rcu_dereference(current->perf_event_ctxp);
+ if (ctx)
+ perf_iterate_ctx(ctx, output, data, false);
+done:
+ preempt_enable();
+ rcu_read_unlock();
+}
+
+/*
+ * Clear all file-based filters at exec, they'll have to be
+ * re-instated when/if these objects are mmapped again.
+ */
+static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
+{
+ struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
+ struct perf_addr_filter *filter;
+ unsigned int restart = 0, count = 0;
+ unsigned long flags;
+
+ if (!has_addr_filter(event))
+ return;
+
+ raw_spin_lock_irqsave(&ifh->lock, flags);
+ list_for_each_entry(filter, &ifh->list, entry) {
+ if (filter->path.dentry) {
+ event->addr_filter_ranges[count].start = 0;
+ event->addr_filter_ranges[count].size = 0;
+ restart++;
+ }
+
+ count++;
+ }
+
+ if (restart)
+ event->addr_filters_gen++;
+ raw_spin_unlock_irqrestore(&ifh->lock, flags);
+
+ if (restart)
+ perf_event_stop(event, 1);
+}
+
+void perf_event_exec(void)
+{
+ struct perf_event_context *ctx;
+
+ ctx = perf_pin_task_context(current);
+ if (!ctx)
+ return;
+
+ perf_event_enable_on_exec(ctx);
+ perf_event_remove_on_exec(ctx);
+ scoped_guard(rcu)
+ perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, true);
+
+ perf_unpin_context(ctx);
+ put_ctx(ctx);
+}
+
+struct remote_output {
+ struct perf_buffer *rb;
+ int err;
+};
+
+static void __perf_event_output_stop(struct perf_event *event, void *data)
+{
+ struct perf_event *parent = event->parent;
+ struct remote_output *ro = data;
+ struct perf_buffer *rb = ro->rb;
+ struct stop_event_data sd = {
+ .event = event,
+ };
+
+ if (!has_aux(event))
+ return;
+
+ if (!parent)
+ parent = event;
+
+ /*
+ * In case of inheritance, it will be the parent that links to the
+ * ring-buffer, but it will be the child that's actually using it.
+ *
+ * We are using event::rb to determine if the event should be stopped,
+ * however this may race with ring_buffer_attach() (through set_output),
+ * which will make us skip the event that actually needs to be stopped.
+ * So ring_buffer_attach() has to stop an aux event before re-assigning
+ * its rb pointer.
+ */
+ if (rcu_dereference(parent->rb) == rb)
+ ro->err = __perf_event_stop(&sd);
+}
+
+static int __perf_pmu_output_stop(void *info)
+{
+ struct perf_event *event = info;
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct remote_output ro = {
+ .rb = event->rb,
+ };
+
+ rcu_read_lock();
+ perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false);
+ if (cpuctx->task_ctx)
+ perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop,
+ &ro, false);
+ rcu_read_unlock();
+
+ return ro.err;
+}
+
+static void perf_pmu_output_stop(struct perf_event *event)
+{
+ struct perf_event *iter;
+ int err, cpu;
+
+restart:
+ rcu_read_lock();
+ list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) {
+ /*
+ * For per-CPU events, we need to make sure that neither they
+ * nor their children are running; for cpu==-1 events it's
+ * sufficient to stop the event itself if it's active, since
+ * it can't have children.
+ */
+ cpu = iter->cpu;
+ if (cpu == -1)
+ cpu = READ_ONCE(iter->oncpu);
+
+ if (cpu == -1)
+ continue;
+
+ err = cpu_function_call(cpu, __perf_pmu_output_stop, event);
+ if (err == -EAGAIN) {
+ rcu_read_unlock();
+ goto restart;
+ }
}
rcu_read_unlock();
}
@@ -5539,16 +8835,23 @@ static void perf_event_task_output(struct perf_event *event,
perf_event_header__init_id(&task_event->event_id.header, &sample, event);
- ret = perf_output_begin(&handle, event,
+ ret = perf_output_begin(&handle, &sample, event,
task_event->event_id.header.size);
if (ret)
goto out;
task_event->event_id.pid = perf_event_pid(event, task);
- task_event->event_id.ppid = perf_event_pid(event, current);
-
task_event->event_id.tid = perf_event_tid(event, task);
- task_event->event_id.ptid = perf_event_tid(event, current);
+
+ if (task_event->event_id.header.type == PERF_RECORD_EXIT) {
+ task_event->event_id.ppid = perf_event_pid(event,
+ task->real_parent);
+ task_event->event_id.ptid = perf_event_pid(event,
+ task->real_parent);
+ } else { /* PERF_RECORD_FORK */
+ task_event->event_id.ppid = perf_event_pid(event, current);
+ task_event->event_id.ptid = perf_event_tid(event, current);
+ }
task_event->event_id.time = perf_event_clock(event);
@@ -5589,14 +8892,63 @@ static void perf_event_task(struct task_struct *task,
},
};
- perf_event_aux(perf_event_task_output,
+ perf_iterate_sb(perf_event_task_output,
&task_event,
task_ctx);
}
+/*
+ * Allocate data for a new task when profiling system-wide
+ * events which require PMU specific data
+ */
+static void
+perf_event_alloc_task_data(struct task_struct *child,
+ struct task_struct *parent)
+{
+ struct kmem_cache *ctx_cache = NULL;
+ struct perf_ctx_data *cd;
+
+ if (!refcount_read(&global_ctx_data_ref))
+ return;
+
+ scoped_guard (rcu) {
+ cd = rcu_dereference(parent->perf_ctx_data);
+ if (cd)
+ ctx_cache = cd->ctx_cache;
+ }
+
+ if (!ctx_cache)
+ return;
+
+ guard(percpu_read)(&global_ctx_data_rwsem);
+ scoped_guard (rcu) {
+ cd = rcu_dereference(child->perf_ctx_data);
+ if (!cd) {
+ /*
+ * A system-wide event may be unaccount,
+ * when attaching the perf_ctx_data.
+ */
+ if (!refcount_read(&global_ctx_data_ref))
+ return;
+ goto attach;
+ }
+
+ if (!cd->global) {
+ cd->global = 1;
+ refcount_inc(&cd->refcount);
+ }
+ }
+
+ return;
+attach:
+ attach_task_ctx_data(child, ctx_cache, true);
+}
+
void perf_event_fork(struct task_struct *task)
{
perf_event_task(task, NULL, 1);
+ perf_event_namespaces(task);
+ perf_event_alloc_task_data(task, current);
}
/*
@@ -5634,7 +8986,7 @@ static void perf_event_comm_output(struct perf_event *event,
return;
perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
- ret = perf_output_begin(&handle, event,
+ ret = perf_output_begin(&handle, &sample, event,
comm_event->event_id.header.size);
if (ret)
@@ -5660,7 +9012,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
unsigned int size;
memset(comm, 0, sizeof(comm));
- strlcpy(comm, comm_event->task->comm, sizeof(comm));
+ strscpy(comm, comm_event->task->comm);
size = ALIGN(strlen(comm)+1, sizeof(u64));
comm_event->comm = comm;
@@ -5668,7 +9020,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
- perf_event_aux(perf_event_comm_output,
+ perf_iterate_sb(perf_event_comm_output,
comm_event,
NULL);
}
@@ -5699,6 +9051,235 @@ void perf_event_comm(struct task_struct *task, bool exec)
}
/*
+ * namespaces tracking
+ */
+
+struct perf_namespaces_event {
+ struct task_struct *task;
+
+ struct {
+ struct perf_event_header header;
+
+ u32 pid;
+ u32 tid;
+ u64 nr_namespaces;
+ struct perf_ns_link_info link_info[NR_NAMESPACES];
+ } event_id;
+};
+
+static int perf_event_namespaces_match(struct perf_event *event)
+{
+ return event->attr.namespaces;
+}
+
+static void perf_event_namespaces_output(struct perf_event *event,
+ void *data)
+{
+ struct perf_namespaces_event *namespaces_event = data;
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ u16 header_size = namespaces_event->event_id.header.size;
+ int ret;
+
+ if (!perf_event_namespaces_match(event))
+ return;
+
+ perf_event_header__init_id(&namespaces_event->event_id.header,
+ &sample, event);
+ ret = perf_output_begin(&handle, &sample, event,
+ namespaces_event->event_id.header.size);
+ if (ret)
+ goto out;
+
+ namespaces_event->event_id.pid = perf_event_pid(event,
+ namespaces_event->task);
+ namespaces_event->event_id.tid = perf_event_tid(event,
+ namespaces_event->task);
+
+ perf_output_put(&handle, namespaces_event->event_id);
+
+ perf_event__output_id_sample(event, &handle, &sample);
+
+ perf_output_end(&handle);
+out:
+ namespaces_event->event_id.header.size = header_size;
+}
+
+static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info,
+ struct task_struct *task,
+ const struct proc_ns_operations *ns_ops)
+{
+ struct path ns_path;
+ struct inode *ns_inode;
+ int error;
+
+ error = ns_get_path(&ns_path, task, ns_ops);
+ if (!error) {
+ ns_inode = ns_path.dentry->d_inode;
+ ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev);
+ ns_link_info->ino = ns_inode->i_ino;
+ path_put(&ns_path);
+ }
+}
+
+void perf_event_namespaces(struct task_struct *task)
+{
+ struct perf_namespaces_event namespaces_event;
+ struct perf_ns_link_info *ns_link_info;
+
+ if (!atomic_read(&nr_namespaces_events))
+ return;
+
+ namespaces_event = (struct perf_namespaces_event){
+ .task = task,
+ .event_id = {
+ .header = {
+ .type = PERF_RECORD_NAMESPACES,
+ .misc = 0,
+ .size = sizeof(namespaces_event.event_id),
+ },
+ /* .pid */
+ /* .tid */
+ .nr_namespaces = NR_NAMESPACES,
+ /* .link_info[NR_NAMESPACES] */
+ },
+ };
+
+ ns_link_info = namespaces_event.event_id.link_info;
+
+ perf_fill_ns_link_info(&ns_link_info[MNT_NS_INDEX],
+ task, &mntns_operations);
+
+#ifdef CONFIG_USER_NS
+ perf_fill_ns_link_info(&ns_link_info[USER_NS_INDEX],
+ task, &userns_operations);
+#endif
+#ifdef CONFIG_NET_NS
+ perf_fill_ns_link_info(&ns_link_info[NET_NS_INDEX],
+ task, &netns_operations);
+#endif
+#ifdef CONFIG_UTS_NS
+ perf_fill_ns_link_info(&ns_link_info[UTS_NS_INDEX],
+ task, &utsns_operations);
+#endif
+#ifdef CONFIG_IPC_NS
+ perf_fill_ns_link_info(&ns_link_info[IPC_NS_INDEX],
+ task, &ipcns_operations);
+#endif
+#ifdef CONFIG_PID_NS
+ perf_fill_ns_link_info(&ns_link_info[PID_NS_INDEX],
+ task, &pidns_operations);
+#endif
+#ifdef CONFIG_CGROUPS
+ perf_fill_ns_link_info(&ns_link_info[CGROUP_NS_INDEX],
+ task, &cgroupns_operations);
+#endif
+
+ perf_iterate_sb(perf_event_namespaces_output,
+ &namespaces_event,
+ NULL);
+}
+
+/*
+ * cgroup tracking
+ */
+#ifdef CONFIG_CGROUP_PERF
+
+struct perf_cgroup_event {
+ char *path;
+ int path_size;
+ struct {
+ struct perf_event_header header;
+ u64 id;
+ char path[];
+ } event_id;
+};
+
+static int perf_event_cgroup_match(struct perf_event *event)
+{
+ return event->attr.cgroup;
+}
+
+static void perf_event_cgroup_output(struct perf_event *event, void *data)
+{
+ struct perf_cgroup_event *cgroup_event = data;
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ u16 header_size = cgroup_event->event_id.header.size;
+ int ret;
+
+ if (!perf_event_cgroup_match(event))
+ return;
+
+ perf_event_header__init_id(&cgroup_event->event_id.header,
+ &sample, event);
+ ret = perf_output_begin(&handle, &sample, event,
+ cgroup_event->event_id.header.size);
+ if (ret)
+ goto out;
+
+ perf_output_put(&handle, cgroup_event->event_id);
+ __output_copy(&handle, cgroup_event->path, cgroup_event->path_size);
+
+ perf_event__output_id_sample(event, &handle, &sample);
+
+ perf_output_end(&handle);
+out:
+ cgroup_event->event_id.header.size = header_size;
+}
+
+static void perf_event_cgroup(struct cgroup *cgrp)
+{
+ struct perf_cgroup_event cgroup_event;
+ char path_enomem[16] = "//enomem";
+ char *pathname;
+ size_t size;
+
+ if (!atomic_read(&nr_cgroup_events))
+ return;
+
+ cgroup_event = (struct perf_cgroup_event){
+ .event_id = {
+ .header = {
+ .type = PERF_RECORD_CGROUP,
+ .misc = 0,
+ .size = sizeof(cgroup_event.event_id),
+ },
+ .id = cgroup_id(cgrp),
+ },
+ };
+
+ pathname = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (pathname == NULL) {
+ cgroup_event.path = path_enomem;
+ } else {
+ /* just to be sure to have enough space for alignment */
+ cgroup_path(cgrp, pathname, PATH_MAX - sizeof(u64));
+ cgroup_event.path = pathname;
+ }
+
+ /*
+ * Since our buffer works in 8 byte units we need to align our string
+ * size to a multiple of 8. However, we must guarantee the tail end is
+ * zero'd out to avoid leaking random bits to userspace.
+ */
+ size = strlen(cgroup_event.path) + 1;
+ while (!IS_ALIGNED(size, sizeof(u64)))
+ cgroup_event.path[size++] = '\0';
+
+ cgroup_event.event_id.header.size += size;
+ cgroup_event.path_size = size;
+
+ perf_iterate_sb(perf_event_cgroup_output,
+ &cgroup_event,
+ NULL);
+
+ kfree(pathname);
+}
+
+#endif
+
+/*
* mmap tracking
*/
@@ -5711,6 +9292,8 @@ struct perf_mmap_event {
u64 ino;
u64 ino_generation;
u32 prot, flags;
+ u8 build_id[BUILD_ID_SIZE_MAX];
+ u32 build_id_size;
struct {
struct perf_event_header header;
@@ -5741,6 +9324,8 @@ static void perf_event_mmap_output(struct perf_event *event,
struct perf_output_handle handle;
struct perf_sample_data sample;
int size = mmap_event->event_id.header.size;
+ u32 type = mmap_event->event_id.header.type;
+ bool use_build_id;
int ret;
if (!perf_event_mmap_match(event, data))
@@ -5757,7 +9342,7 @@ static void perf_event_mmap_output(struct perf_event *event,
}
perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
- ret = perf_output_begin(&handle, event,
+ ret = perf_output_begin(&handle, &sample, event,
mmap_event->event_id.header.size);
if (ret)
goto out;
@@ -5765,13 +9350,25 @@ static void perf_event_mmap_output(struct perf_event *event,
mmap_event->event_id.pid = perf_event_pid(event, current);
mmap_event->event_id.tid = perf_event_tid(event, current);
+ use_build_id = event->attr.build_id && mmap_event->build_id_size;
+
+ if (event->attr.mmap2 && use_build_id)
+ mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_BUILD_ID;
+
perf_output_put(&handle, mmap_event->event_id);
if (event->attr.mmap2) {
- perf_output_put(&handle, mmap_event->maj);
- perf_output_put(&handle, mmap_event->min);
- perf_output_put(&handle, mmap_event->ino);
- perf_output_put(&handle, mmap_event->ino_generation);
+ if (use_build_id) {
+ u8 size[4] = { (u8) mmap_event->build_id_size, 0, 0, 0 };
+
+ __output_copy(&handle, size, 4);
+ __output_copy(&handle, mmap_event->build_id, BUILD_ID_SIZE_MAX);
+ } else {
+ perf_output_put(&handle, mmap_event->maj);
+ perf_output_put(&handle, mmap_event->min);
+ perf_output_put(&handle, mmap_event->ino);
+ perf_output_put(&handle, mmap_event->ino_generation);
+ }
perf_output_put(&handle, mmap_event->prot);
perf_output_put(&handle, mmap_event->flags);
}
@@ -5784,6 +9381,7 @@ static void perf_event_mmap_output(struct perf_event *event,
perf_output_end(&handle);
out:
mmap_event->event_id.header.size = size;
+ mmap_event->event_id.header.type = type;
}
static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
@@ -5796,10 +9394,27 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
unsigned int size;
char tmp[16];
char *buf = NULL;
- char *name;
+ char *name = NULL;
+
+ if (vma->vm_flags & VM_READ)
+ prot |= PROT_READ;
+ if (vma->vm_flags & VM_WRITE)
+ prot |= PROT_WRITE;
+ if (vma->vm_flags & VM_EXEC)
+ prot |= PROT_EXEC;
+
+ if (vma->vm_flags & VM_MAYSHARE)
+ flags = MAP_SHARED;
+ else
+ flags = MAP_PRIVATE;
+
+ if (vma->vm_flags & VM_LOCKED)
+ flags |= MAP_LOCKED;
+ if (is_vm_hugetlb_page(vma))
+ flags |= MAP_HUGETLB;
if (file) {
- struct inode *inode;
+ const struct inode *inode;
dev_t dev;
buf = kmalloc(PATH_MAX, GFP_KERNEL);
@@ -5812,68 +9427,36 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
* need to add enough zero bytes after the string to handle
* the 64bit alignment we do later.
*/
- name = d_path(&file->f_path, buf, PATH_MAX - sizeof(u64));
+ name = d_path(file_user_path(file), buf, PATH_MAX - sizeof(u64));
if (IS_ERR(name)) {
name = "//toolong";
goto cpy_name;
}
- inode = file_inode(vma->vm_file);
+ inode = file_user_inode(vma->vm_file);
dev = inode->i_sb->s_dev;
ino = inode->i_ino;
gen = inode->i_generation;
maj = MAJOR(dev);
min = MINOR(dev);
- if (vma->vm_flags & VM_READ)
- prot |= PROT_READ;
- if (vma->vm_flags & VM_WRITE)
- prot |= PROT_WRITE;
- if (vma->vm_flags & VM_EXEC)
- prot |= PROT_EXEC;
-
- if (vma->vm_flags & VM_MAYSHARE)
- flags = MAP_SHARED;
- else
- flags = MAP_PRIVATE;
-
- if (vma->vm_flags & VM_DENYWRITE)
- flags |= MAP_DENYWRITE;
- if (vma->vm_flags & VM_MAYEXEC)
- flags |= MAP_EXECUTABLE;
- if (vma->vm_flags & VM_LOCKED)
- flags |= MAP_LOCKED;
- if (vma->vm_flags & VM_HUGETLB)
- flags |= MAP_HUGETLB;
-
goto got_name;
} else {
- if (vma->vm_ops && vma->vm_ops->name) {
+ if (vma->vm_ops && vma->vm_ops->name)
name = (char *) vma->vm_ops->name(vma);
- if (name)
- goto cpy_name;
- }
-
- name = (char *)arch_vma_name(vma);
- if (name)
- goto cpy_name;
-
- if (vma->vm_start <= vma->vm_mm->start_brk &&
- vma->vm_end >= vma->vm_mm->brk) {
- name = "[heap]";
- goto cpy_name;
- }
- if (vma->vm_start <= vma->vm_mm->start_stack &&
- vma->vm_end >= vma->vm_mm->start_stack) {
- name = "[stack]";
- goto cpy_name;
+ if (!name)
+ name = (char *)arch_vma_name(vma);
+ if (!name) {
+ if (vma_is_initial_heap(vma))
+ name = "[heap]";
+ else if (vma_is_initial_stack(vma))
+ name = "[stack]";
+ else
+ name = "//anon";
}
-
- name = "//anon";
- goto cpy_name;
}
cpy_name:
- strlcpy(tmp, name, sizeof(tmp));
+ strscpy(tmp, name);
name = tmp;
got_name:
/*
@@ -5899,13 +9482,113 @@ got_name:
mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
- perf_event_aux(perf_event_mmap_output,
+ if (atomic_read(&nr_build_id_events))
+ build_id_parse_nofault(vma, mmap_event->build_id, &mmap_event->build_id_size);
+
+ perf_iterate_sb(perf_event_mmap_output,
mmap_event,
NULL);
kfree(buf);
}
+/*
+ * Check whether inode and address range match filter criteria.
+ */
+static bool perf_addr_filter_match(struct perf_addr_filter *filter,
+ struct file *file, unsigned long offset,
+ unsigned long size)
+{
+ /* d_inode(NULL) won't be equal to any mapped user-space file */
+ if (!filter->path.dentry)
+ return false;
+
+ if (d_inode(filter->path.dentry) != file_user_inode(file))
+ return false;
+
+ if (filter->offset > offset + size)
+ return false;
+
+ if (filter->offset + filter->size < offset)
+ return false;
+
+ return true;
+}
+
+static bool perf_addr_filter_vma_adjust(struct perf_addr_filter *filter,
+ struct vm_area_struct *vma,
+ struct perf_addr_filter_range *fr)
+{
+ unsigned long vma_size = vma->vm_end - vma->vm_start;
+ unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
+ struct file *file = vma->vm_file;
+
+ if (!perf_addr_filter_match(filter, file, off, vma_size))
+ return false;
+
+ if (filter->offset < off) {
+ fr->start = vma->vm_start;
+ fr->size = min(vma_size, filter->size - (off - filter->offset));
+ } else {
+ fr->start = vma->vm_start + filter->offset - off;
+ fr->size = min(vma->vm_end - fr->start, filter->size);
+ }
+
+ return true;
+}
+
+static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
+{
+ struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
+ struct vm_area_struct *vma = data;
+ struct perf_addr_filter *filter;
+ unsigned int restart = 0, count = 0;
+ unsigned long flags;
+
+ if (!has_addr_filter(event))
+ return;
+
+ if (!vma->vm_file)
+ return;
+
+ raw_spin_lock_irqsave(&ifh->lock, flags);
+ list_for_each_entry(filter, &ifh->list, entry) {
+ if (perf_addr_filter_vma_adjust(filter, vma,
+ &event->addr_filter_ranges[count]))
+ restart++;
+
+ count++;
+ }
+
+ if (restart)
+ event->addr_filters_gen++;
+ raw_spin_unlock_irqrestore(&ifh->lock, flags);
+
+ if (restart)
+ perf_event_stop(event, 1);
+}
+
+/*
+ * Adjust all task's events' filters to the new vma
+ */
+static void perf_addr_filters_adjust(struct vm_area_struct *vma)
+{
+ struct perf_event_context *ctx;
+
+ /*
+ * Data tracing isn't supported yet and as such there is no need
+ * to keep track of anything that isn't related to executable code:
+ */
+ if (!(vma->vm_flags & VM_EXEC))
+ return;
+
+ rcu_read_lock();
+ ctx = rcu_dereference(current->perf_event_ctxp);
+ if (ctx)
+ perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true);
+ rcu_read_unlock();
+}
+
void perf_event_mmap(struct vm_area_struct *vma)
{
struct perf_mmap_event mmap_event;
@@ -5937,6 +9620,7 @@ void perf_event_mmap(struct vm_area_struct *vma)
/* .flags (attr_mmap2 only) */
};
+ perf_addr_filters_adjust(vma);
perf_event_mmap_event(&mmap_event);
}
@@ -5963,7 +9647,7 @@ void perf_event_aux_event(struct perf_event *event, unsigned long head,
int ret;
perf_event_header__init_id(&rec.header, &sample, event);
- ret = perf_output_begin(&handle, event, rec.header.size);
+ ret = perf_output_begin(&handle, &sample, event, rec.header.size);
if (ret)
return;
@@ -5975,6 +9659,127 @@ void perf_event_aux_event(struct perf_event *event, unsigned long head,
}
/*
+ * Lost/dropped samples logging
+ */
+void perf_log_lost_samples(struct perf_event *event, u64 lost)
+{
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ int ret;
+
+ struct {
+ struct perf_event_header header;
+ u64 lost;
+ } lost_samples_event = {
+ .header = {
+ .type = PERF_RECORD_LOST_SAMPLES,
+ .misc = 0,
+ .size = sizeof(lost_samples_event),
+ },
+ .lost = lost,
+ };
+
+ perf_event_header__init_id(&lost_samples_event.header, &sample, event);
+
+ ret = perf_output_begin(&handle, &sample, event,
+ lost_samples_event.header.size);
+ if (ret)
+ return;
+
+ perf_output_put(&handle, lost_samples_event);
+ perf_event__output_id_sample(event, &handle, &sample);
+ perf_output_end(&handle);
+}
+
+/*
+ * context_switch tracking
+ */
+
+struct perf_switch_event {
+ struct task_struct *task;
+ struct task_struct *next_prev;
+
+ struct {
+ struct perf_event_header header;
+ u32 next_prev_pid;
+ u32 next_prev_tid;
+ } event_id;
+};
+
+static int perf_event_switch_match(struct perf_event *event)
+{
+ return event->attr.context_switch;
+}
+
+static void perf_event_switch_output(struct perf_event *event, void *data)
+{
+ struct perf_switch_event *se = data;
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ int ret;
+
+ if (!perf_event_switch_match(event))
+ return;
+
+ /* Only CPU-wide events are allowed to see next/prev pid/tid */
+ if (event->ctx->task) {
+ se->event_id.header.type = PERF_RECORD_SWITCH;
+ se->event_id.header.size = sizeof(se->event_id.header);
+ } else {
+ se->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE;
+ se->event_id.header.size = sizeof(se->event_id);
+ se->event_id.next_prev_pid =
+ perf_event_pid(event, se->next_prev);
+ se->event_id.next_prev_tid =
+ perf_event_tid(event, se->next_prev);
+ }
+
+ perf_event_header__init_id(&se->event_id.header, &sample, event);
+
+ ret = perf_output_begin(&handle, &sample, event, se->event_id.header.size);
+ if (ret)
+ return;
+
+ if (event->ctx->task)
+ perf_output_put(&handle, se->event_id.header);
+ else
+ perf_output_put(&handle, se->event_id);
+
+ perf_event__output_id_sample(event, &handle, &sample);
+
+ perf_output_end(&handle);
+}
+
+static void perf_event_switch(struct task_struct *task,
+ struct task_struct *next_prev, bool sched_in)
+{
+ struct perf_switch_event switch_event;
+
+ /* N.B. caller checks nr_switch_events != 0 */
+
+ switch_event = (struct perf_switch_event){
+ .task = task,
+ .next_prev = next_prev,
+ .event_id = {
+ .header = {
+ /* .type */
+ .misc = sched_in ? 0 : PERF_RECORD_MISC_SWITCH_OUT,
+ /* .size */
+ },
+ /* .next_prev_pid */
+ /* .next_prev_tid */
+ },
+ };
+
+ if (!sched_in && task_is_runnable(task)) {
+ switch_event.event_id.header.misc |=
+ PERF_RECORD_MISC_SWITCH_OUT_PREEMPT;
+ }
+
+ perf_iterate_sb(perf_event_switch_output, &switch_event, NULL);
+}
+
+/*
* IRQ throttle logging
*/
@@ -6005,7 +9810,7 @@ static void perf_log_throttle(struct perf_event *event, int enable)
perf_event_header__init_id(&throttle_event.header, &sample, event);
- ret = perf_output_begin(&handle, event,
+ ret = perf_output_begin(&handle, &sample, event,
throttle_event.header.size);
if (ret)
return;
@@ -6015,6 +9820,349 @@ static void perf_log_throttle(struct perf_event *event, int enable)
perf_output_end(&handle);
}
+/*
+ * ksymbol register/unregister tracking
+ */
+
+struct perf_ksymbol_event {
+ const char *name;
+ int name_len;
+ struct {
+ struct perf_event_header header;
+ u64 addr;
+ u32 len;
+ u16 ksym_type;
+ u16 flags;
+ } event_id;
+};
+
+static int perf_event_ksymbol_match(struct perf_event *event)
+{
+ return event->attr.ksymbol;
+}
+
+static void perf_event_ksymbol_output(struct perf_event *event, void *data)
+{
+ struct perf_ksymbol_event *ksymbol_event = data;
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ int ret;
+
+ if (!perf_event_ksymbol_match(event))
+ return;
+
+ perf_event_header__init_id(&ksymbol_event->event_id.header,
+ &sample, event);
+ ret = perf_output_begin(&handle, &sample, event,
+ ksymbol_event->event_id.header.size);
+ if (ret)
+ return;
+
+ perf_output_put(&handle, ksymbol_event->event_id);
+ __output_copy(&handle, ksymbol_event->name, ksymbol_event->name_len);
+ perf_event__output_id_sample(event, &handle, &sample);
+
+ perf_output_end(&handle);
+}
+
+void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister,
+ const char *sym)
+{
+ struct perf_ksymbol_event ksymbol_event;
+ char name[KSYM_NAME_LEN];
+ u16 flags = 0;
+ int name_len;
+
+ if (!atomic_read(&nr_ksymbol_events))
+ return;
+
+ if (ksym_type >= PERF_RECORD_KSYMBOL_TYPE_MAX ||
+ ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN)
+ goto err;
+
+ strscpy(name, sym);
+ name_len = strlen(name) + 1;
+ while (!IS_ALIGNED(name_len, sizeof(u64)))
+ name[name_len++] = '\0';
+ BUILD_BUG_ON(KSYM_NAME_LEN % sizeof(u64));
+
+ if (unregister)
+ flags |= PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER;
+
+ ksymbol_event = (struct perf_ksymbol_event){
+ .name = name,
+ .name_len = name_len,
+ .event_id = {
+ .header = {
+ .type = PERF_RECORD_KSYMBOL,
+ .size = sizeof(ksymbol_event.event_id) +
+ name_len,
+ },
+ .addr = addr,
+ .len = len,
+ .ksym_type = ksym_type,
+ .flags = flags,
+ },
+ };
+
+ perf_iterate_sb(perf_event_ksymbol_output, &ksymbol_event, NULL);
+ return;
+err:
+ WARN_ONCE(1, "%s: Invalid KSYMBOL type 0x%x\n", __func__, ksym_type);
+}
+
+/*
+ * bpf program load/unload tracking
+ */
+
+struct perf_bpf_event {
+ struct bpf_prog *prog;
+ struct {
+ struct perf_event_header header;
+ u16 type;
+ u16 flags;
+ u32 id;
+ u8 tag[BPF_TAG_SIZE];
+ } event_id;
+};
+
+static int perf_event_bpf_match(struct perf_event *event)
+{
+ return event->attr.bpf_event;
+}
+
+static void perf_event_bpf_output(struct perf_event *event, void *data)
+{
+ struct perf_bpf_event *bpf_event = data;
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ int ret;
+
+ if (!perf_event_bpf_match(event))
+ return;
+
+ perf_event_header__init_id(&bpf_event->event_id.header,
+ &sample, event);
+ ret = perf_output_begin(&handle, &sample, event,
+ bpf_event->event_id.header.size);
+ if (ret)
+ return;
+
+ perf_output_put(&handle, bpf_event->event_id);
+ perf_event__output_id_sample(event, &handle, &sample);
+
+ perf_output_end(&handle);
+}
+
+static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog,
+ enum perf_bpf_event_type type)
+{
+ bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD;
+ int i;
+
+ perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF,
+ (u64)(unsigned long)prog->bpf_func,
+ prog->jited_len, unregister,
+ prog->aux->ksym.name);
+
+ for (i = 1; i < prog->aux->func_cnt; i++) {
+ struct bpf_prog *subprog = prog->aux->func[i];
+
+ perf_event_ksymbol(
+ PERF_RECORD_KSYMBOL_TYPE_BPF,
+ (u64)(unsigned long)subprog->bpf_func,
+ subprog->jited_len, unregister,
+ subprog->aux->ksym.name);
+ }
+}
+
+void perf_event_bpf_event(struct bpf_prog *prog,
+ enum perf_bpf_event_type type,
+ u16 flags)
+{
+ struct perf_bpf_event bpf_event;
+
+ switch (type) {
+ case PERF_BPF_EVENT_PROG_LOAD:
+ case PERF_BPF_EVENT_PROG_UNLOAD:
+ if (atomic_read(&nr_ksymbol_events))
+ perf_event_bpf_emit_ksymbols(prog, type);
+ break;
+ default:
+ return;
+ }
+
+ if (!atomic_read(&nr_bpf_events))
+ return;
+
+ bpf_event = (struct perf_bpf_event){
+ .prog = prog,
+ .event_id = {
+ .header = {
+ .type = PERF_RECORD_BPF_EVENT,
+ .size = sizeof(bpf_event.event_id),
+ },
+ .type = type,
+ .flags = flags,
+ .id = prog->aux->id,
+ },
+ };
+
+ BUILD_BUG_ON(BPF_TAG_SIZE % sizeof(u64));
+
+ memcpy(bpf_event.event_id.tag, prog->tag, BPF_TAG_SIZE);
+ perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL);
+}
+
+struct perf_callchain_deferred_event {
+ struct unwind_stacktrace *trace;
+ struct {
+ struct perf_event_header header;
+ u64 cookie;
+ u64 nr;
+ u64 ips[];
+ } event;
+};
+
+static void perf_callchain_deferred_output(struct perf_event *event, void *data)
+{
+ struct perf_callchain_deferred_event *deferred_event = data;
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ int ret, size = deferred_event->event.header.size;
+
+ if (!event->attr.defer_output)
+ return;
+
+ /* XXX do we really need sample_id_all for this ??? */
+ perf_event_header__init_id(&deferred_event->event.header, &sample, event);
+
+ ret = perf_output_begin(&handle, &sample, event,
+ deferred_event->event.header.size);
+ if (ret)
+ goto out;
+
+ perf_output_put(&handle, deferred_event->event);
+ for (int i = 0; i < deferred_event->trace->nr; i++) {
+ u64 entry = deferred_event->trace->entries[i];
+ perf_output_put(&handle, entry);
+ }
+ perf_event__output_id_sample(event, &handle, &sample);
+
+ perf_output_end(&handle);
+out:
+ deferred_event->event.header.size = size;
+}
+
+static void perf_unwind_deferred_callback(struct unwind_work *work,
+ struct unwind_stacktrace *trace, u64 cookie)
+{
+ struct perf_callchain_deferred_event deferred_event = {
+ .trace = trace,
+ .event = {
+ .header = {
+ .type = PERF_RECORD_CALLCHAIN_DEFERRED,
+ .misc = PERF_RECORD_MISC_USER,
+ .size = sizeof(deferred_event.event) +
+ (trace->nr * sizeof(u64)),
+ },
+ .cookie = cookie,
+ .nr = trace->nr,
+ },
+ };
+
+ perf_iterate_sb(perf_callchain_deferred_output, &deferred_event, NULL);
+}
+
+struct perf_text_poke_event {
+ const void *old_bytes;
+ const void *new_bytes;
+ size_t pad;
+ u16 old_len;
+ u16 new_len;
+
+ struct {
+ struct perf_event_header header;
+
+ u64 addr;
+ } event_id;
+};
+
+static int perf_event_text_poke_match(struct perf_event *event)
+{
+ return event->attr.text_poke;
+}
+
+static void perf_event_text_poke_output(struct perf_event *event, void *data)
+{
+ struct perf_text_poke_event *text_poke_event = data;
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ u64 padding = 0;
+ int ret;
+
+ if (!perf_event_text_poke_match(event))
+ return;
+
+ perf_event_header__init_id(&text_poke_event->event_id.header, &sample, event);
+
+ ret = perf_output_begin(&handle, &sample, event,
+ text_poke_event->event_id.header.size);
+ if (ret)
+ return;
+
+ perf_output_put(&handle, text_poke_event->event_id);
+ perf_output_put(&handle, text_poke_event->old_len);
+ perf_output_put(&handle, text_poke_event->new_len);
+
+ __output_copy(&handle, text_poke_event->old_bytes, text_poke_event->old_len);
+ __output_copy(&handle, text_poke_event->new_bytes, text_poke_event->new_len);
+
+ if (text_poke_event->pad)
+ __output_copy(&handle, &padding, text_poke_event->pad);
+
+ perf_event__output_id_sample(event, &handle, &sample);
+
+ perf_output_end(&handle);
+}
+
+void perf_event_text_poke(const void *addr, const void *old_bytes,
+ size_t old_len, const void *new_bytes, size_t new_len)
+{
+ struct perf_text_poke_event text_poke_event;
+ size_t tot, pad;
+
+ if (!atomic_read(&nr_text_poke_events))
+ return;
+
+ tot = sizeof(text_poke_event.old_len) + old_len;
+ tot += sizeof(text_poke_event.new_len) + new_len;
+ pad = ALIGN(tot, sizeof(u64)) - tot;
+
+ text_poke_event = (struct perf_text_poke_event){
+ .old_bytes = old_bytes,
+ .new_bytes = new_bytes,
+ .pad = pad,
+ .old_len = old_len,
+ .new_len = new_len,
+ .event_id = {
+ .header = {
+ .type = PERF_RECORD_TEXT_POKE,
+ .misc = PERF_RECORD_MISC_KERNEL,
+ .size = sizeof(text_poke_event.event_id) + tot + pad,
+ },
+ .addr = (unsigned long)addr,
+ },
+ };
+
+ perf_iterate_sb(perf_event_text_poke_output, &text_poke_event, NULL);
+}
+
+void perf_event_itrace_started(struct perf_event *event)
+{
+ WRITE_ONCE(event->attach_state, event->attach_state | PERF_ATTACH_ITRACE);
+}
+
static void perf_log_itrace_start(struct perf_event *event)
{
struct perf_output_handle handle;
@@ -6030,11 +10178,9 @@ static void perf_log_itrace_start(struct perf_event *event)
event = event->parent;
if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) ||
- event->hw.itrace_started)
+ event->attach_state & PERF_ATTACH_ITRACE)
return;
- event->hw.itrace_started = 1;
-
rec.header.type = PERF_RECORD_ITRACE_START;
rec.header.misc = 0;
rec.header.size = sizeof(rec);
@@ -6042,7 +10188,7 @@ static void perf_log_itrace_start(struct perf_event *event)
rec.tid = perf_event_tid(event, current);
perf_event_header__init_id(&rec.header, &sample, event);
- ret = perf_output_begin(&handle, event, rec.header.size);
+ ret = perf_output_begin(&handle, &sample, event, rec.header.size);
if (ret)
return;
@@ -6053,25 +10199,43 @@ static void perf_log_itrace_start(struct perf_event *event)
perf_output_end(&handle);
}
-/*
- * Generic event overflow handling, sampling.
- */
+void perf_report_aux_output_id(struct perf_event *event, u64 hw_id)
+{
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ struct perf_aux_event {
+ struct perf_event_header header;
+ u64 hw_id;
+ } rec;
+ int ret;
-static int __perf_event_overflow(struct perf_event *event,
- int throttle, struct perf_sample_data *data,
- struct pt_regs *regs)
+ if (event->parent)
+ event = event->parent;
+
+ rec.header.type = PERF_RECORD_AUX_OUTPUT_HW_ID;
+ rec.header.misc = 0;
+ rec.header.size = sizeof(rec);
+ rec.hw_id = hw_id;
+
+ perf_event_header__init_id(&rec.header, &sample, event);
+ ret = perf_output_begin(&handle, &sample, event, rec.header.size);
+
+ if (ret)
+ return;
+
+ perf_output_put(&handle, rec);
+ perf_event__output_id_sample(event, &handle, &sample);
+
+ perf_output_end(&handle);
+}
+EXPORT_SYMBOL_GPL(perf_report_aux_output_id);
+
+static int
+__perf_event_account_interrupt(struct perf_event *event, int throttle)
{
- int events = atomic_read(&event->event_limit);
struct hw_perf_event *hwc = &event->hw;
- u64 seq;
int ret = 0;
-
- /*
- * Non-sampling counters might still use the PMI to fold short
- * hardware counters, ignore those.
- */
- if (unlikely(!is_sampling_event(event)))
- return 0;
+ u64 seq;
seq = __this_cpu_read(perf_throttled_seq);
if (seq != hwc->interrupts_seq) {
@@ -6079,14 +10243,13 @@ static int __perf_event_overflow(struct perf_event *event,
hwc->interrupts = 1;
} else {
hwc->interrupts++;
- if (unlikely(throttle
- && hwc->interrupts >= max_samples_per_tick)) {
- __this_cpu_inc(perf_throttled_count);
- hwc->interrupts = MAX_INTERRUPTS;
- perf_log_throttle(event, 0);
- tick_nohz_full_kick();
- ret = 1;
- }
+ }
+
+ if (unlikely(throttle && hwc->interrupts >= max_samples_per_tick)) {
+ __this_cpu_inc(perf_throttled_count);
+ tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
+ perf_event_throttle_group(event);
+ ret = 1;
}
if (event->attr.freq) {
@@ -6099,6 +10262,148 @@ static int __perf_event_overflow(struct perf_event *event,
perf_adjust_period(event, delta, hwc->last_period, true);
}
+ return ret;
+}
+
+int perf_event_account_interrupt(struct perf_event *event)
+{
+ return __perf_event_account_interrupt(event, 1);
+}
+
+static inline bool sample_is_allowed(struct perf_event *event, struct pt_regs *regs)
+{
+ /*
+ * Due to interrupt latency (AKA "skid"), we may enter the
+ * kernel before taking an overflow, even if the PMU is only
+ * counting user events.
+ */
+ if (event->attr.exclude_kernel && !user_mode(regs))
+ return false;
+
+ return true;
+}
+
+#ifdef CONFIG_BPF_SYSCALL
+static int bpf_overflow_handler(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ struct bpf_perf_event_data_kern ctx = {
+ .data = data,
+ .event = event,
+ };
+ struct bpf_prog *prog;
+ int ret = 0;
+
+ ctx.regs = perf_arch_bpf_user_pt_regs(regs);
+ if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
+ goto out;
+ rcu_read_lock();
+ prog = READ_ONCE(event->prog);
+ if (prog) {
+ perf_prepare_sample(data, event, regs);
+ ret = bpf_prog_run(prog, &ctx);
+ }
+ rcu_read_unlock();
+out:
+ __this_cpu_dec(bpf_prog_active);
+
+ return ret;
+}
+
+static inline int perf_event_set_bpf_handler(struct perf_event *event,
+ struct bpf_prog *prog,
+ u64 bpf_cookie)
+{
+ if (event->overflow_handler_context)
+ /* hw breakpoint or kernel counter */
+ return -EINVAL;
+
+ if (event->prog)
+ return -EEXIST;
+
+ if (prog->type != BPF_PROG_TYPE_PERF_EVENT)
+ return -EINVAL;
+
+ if (event->attr.precise_ip &&
+ prog->call_get_stack &&
+ (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) ||
+ event->attr.exclude_callchain_kernel ||
+ event->attr.exclude_callchain_user)) {
+ /*
+ * On perf_event with precise_ip, calling bpf_get_stack()
+ * may trigger unwinder warnings and occasional crashes.
+ * bpf_get_[stack|stackid] works around this issue by using
+ * callchain attached to perf_sample_data. If the
+ * perf_event does not full (kernel and user) callchain
+ * attached to perf_sample_data, do not allow attaching BPF
+ * program that calls bpf_get_[stack|stackid].
+ */
+ return -EPROTO;
+ }
+
+ event->prog = prog;
+ event->bpf_cookie = bpf_cookie;
+ return 0;
+}
+
+static inline void perf_event_free_bpf_handler(struct perf_event *event)
+{
+ struct bpf_prog *prog = event->prog;
+
+ if (!prog)
+ return;
+
+ event->prog = NULL;
+ bpf_prog_put(prog);
+}
+#else
+static inline int bpf_overflow_handler(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ return 1;
+}
+
+static inline int perf_event_set_bpf_handler(struct perf_event *event,
+ struct bpf_prog *prog,
+ u64 bpf_cookie)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void perf_event_free_bpf_handler(struct perf_event *event)
+{
+}
+#endif
+
+/*
+ * Generic event overflow handling, sampling.
+ */
+
+static int __perf_event_overflow(struct perf_event *event,
+ int throttle, struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ int events = atomic_read(&event->event_limit);
+ int ret = 0;
+
+ /*
+ * Non-sampling counters might still use the PMI to fold short
+ * hardware counters, ignore those.
+ */
+ if (unlikely(!is_sampling_event(event)))
+ return 0;
+
+ ret = __perf_event_account_interrupt(event, throttle);
+
+ if (event->attr.aux_pause)
+ perf_event_aux_pause(event->aux_event, true);
+
+ if (event->prog && event->prog->type == BPF_PROG_TYPE_PERF_EVENT &&
+ !bpf_overflow_handler(event, data, regs))
+ goto out;
+
/*
* XXX event_limit might not quite work as expected on inherited
* events
@@ -6108,26 +10413,69 @@ static int __perf_event_overflow(struct perf_event *event,
if (events && atomic_dec_and_test(&event->event_limit)) {
ret = 1;
event->pending_kill = POLL_HUP;
- event->pending_disable = 1;
- irq_work_queue(&event->pending);
+ perf_event_disable_inatomic(event);
+ event->pmu->stop(event, 0);
}
- if (event->overflow_handler)
- event->overflow_handler(event, data, regs);
- else
- perf_event_output(event, data, regs);
+ if (event->attr.sigtrap) {
+ /*
+ * The desired behaviour of sigtrap vs invalid samples is a bit
+ * tricky; on the one hand, one should not loose the SIGTRAP if
+ * it is the first event, on the other hand, we should also not
+ * trigger the WARN or override the data address.
+ */
+ bool valid_sample = sample_is_allowed(event, regs);
+ unsigned int pending_id = 1;
+ enum task_work_notify_mode notify_mode;
+
+ if (regs)
+ pending_id = hash32_ptr((void *)instruction_pointer(regs)) ?: 1;
- if (event->fasync && event->pending_kill) {
+ notify_mode = in_nmi() ? TWA_NMI_CURRENT : TWA_RESUME;
+
+ if (!event->pending_work &&
+ !task_work_add(current, &event->pending_task, notify_mode)) {
+ event->pending_work = pending_id;
+ local_inc(&event->ctx->nr_no_switch_fast);
+ WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount));
+
+ event->pending_addr = 0;
+ if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR))
+ event->pending_addr = data->addr;
+
+ } else if (event->attr.exclude_kernel && valid_sample) {
+ /*
+ * Should not be able to return to user space without
+ * consuming pending_work; with exceptions:
+ *
+ * 1. Where !exclude_kernel, events can overflow again
+ * in the kernel without returning to user space.
+ *
+ * 2. Events that can overflow again before the IRQ-
+ * work without user space progress (e.g. hrtimer).
+ * To approximate progress (with false negatives),
+ * check 32-bit hash of the current IP.
+ */
+ WARN_ON_ONCE(event->pending_work != pending_id);
+ }
+ }
+
+ READ_ONCE(event->overflow_handler)(event, data, regs);
+
+ if (*perf_event_fasync(event) && event->pending_kill) {
event->pending_wakeup = 1;
- irq_work_queue(&event->pending);
+ irq_work_queue(&event->pending_irq);
}
+out:
+ if (event->attr.aux_resume)
+ perf_event_aux_pause(event->aux_event, false);
return ret;
}
int perf_event_overflow(struct perf_event *event,
- struct perf_sample_data *data,
- struct pt_regs *regs)
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
{
return __perf_event_overflow(event, 1, data, regs);
}
@@ -6140,14 +10488,7 @@ struct swevent_htable {
struct swevent_hlist *swevent_hlist;
struct mutex hlist_mutex;
int hlist_refcount;
-
- /* Recursion avoidance in each contexts */
- int recursion[PERF_NR_CONTEXTS];
-
- /* Keeps track of cpu being initialized/exited */
- bool online;
};
-
static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
/*
@@ -6166,16 +10507,16 @@ u64 perf_swevent_set_period(struct perf_event *event)
hwc->last_period = hwc->sample_period;
-again:
- old = val = local64_read(&hwc->period_left);
- if (val < 0)
- return 0;
+ old = local64_read(&hwc->period_left);
+ do {
+ val = old;
+ if (val < 0)
+ return 0;
- nr = div64_u64(period + val, period);
- offset = nr * period;
- val -= offset;
- if (local64_cmpxchg(&hwc->period_left, old, val) != old)
- goto again;
+ nr = div64_u64(period + val, period);
+ offset = nr * period;
+ val -= offset;
+ } while (!local64_try_cmpxchg(&hwc->period_left, &old, val));
return nr;
}
@@ -6235,8 +10576,7 @@ static void perf_swevent_event(struct perf_event *event, u64 nr,
perf_swevent_overflow(event, 0, data, regs);
}
-static int perf_exclude_event(struct perf_event *event,
- struct pt_regs *regs)
+int perf_exclude_event(struct perf_event *event, struct pt_regs *regs)
{
if (event->hw.state & PERF_HES_STOPPED)
return 1;
@@ -6345,17 +10685,13 @@ DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);
int perf_swevent_get_recursion_context(void)
{
- struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
-
- return get_recursion_context(swhash->recursion);
+ return get_recursion_context(current->perf_recursion);
}
EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
-inline void perf_swevent_put_recursion_context(int rctx)
+void perf_swevent_put_recursion_context(int rctx)
{
- struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
-
- put_recursion_context(swhash->recursion, rctx);
+ put_recursion_context(current->perf_recursion, rctx);
}
void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
@@ -6403,14 +10739,8 @@ static int perf_swevent_add(struct perf_event *event, int flags)
hwc->state = !(flags & PERF_EF_START);
head = find_swevent_head(swhash, event);
- if (!head) {
- /*
- * We can race with cpu hotplug code. Do not
- * WARN if the cpu just got unplugged.
- */
- WARN_ON_ONCE(swhash->online);
+ if (WARN_ON_ONCE(!head))
return -EINVAL;
- }
hlist_add_head_rcu(&event->hlist_entry, head);
perf_event_update_userpage(event);
@@ -6452,7 +10782,7 @@ static void swevent_hlist_release(struct swevent_htable *swhash)
kfree_rcu(hlist, rcu_head);
}
-static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
+static void swevent_hlist_put_cpu(int cpu)
{
struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
@@ -6464,22 +10794,22 @@ static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
mutex_unlock(&swhash->hlist_mutex);
}
-static void swevent_hlist_put(struct perf_event *event)
+static void swevent_hlist_put(void)
{
int cpu;
for_each_possible_cpu(cpu)
- swevent_hlist_put_cpu(event, cpu);
+ swevent_hlist_put_cpu(cpu);
}
-static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
+static int swevent_hlist_get_cpu(int cpu)
{
struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
int err = 0;
mutex_lock(&swhash->hlist_mutex);
-
- if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
+ if (!swevent_hlist_deref(swhash) &&
+ cpumask_test_cpu(cpu, perf_online_mask)) {
struct swevent_hlist *hlist;
hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
@@ -6496,30 +10826,27 @@ exit:
return err;
}
-static int swevent_hlist_get(struct perf_event *event)
+static int swevent_hlist_get(void)
{
- int err;
- int cpu, failed_cpu;
+ int err, cpu, failed_cpu;
- get_online_cpus();
+ mutex_lock(&pmus_lock);
for_each_possible_cpu(cpu) {
- err = swevent_hlist_get_cpu(event, cpu);
+ err = swevent_hlist_get_cpu(cpu);
if (err) {
failed_cpu = cpu;
goto fail;
}
}
- put_online_cpus();
-
+ mutex_unlock(&pmus_lock);
return 0;
fail:
for_each_possible_cpu(cpu) {
if (cpu == failed_cpu)
break;
- swevent_hlist_put_cpu(event, cpu);
+ swevent_hlist_put_cpu(cpu);
}
-
- put_online_cpus();
+ mutex_unlock(&pmus_lock);
return err;
}
@@ -6532,9 +10859,12 @@ static void sw_perf_event_destroy(struct perf_event *event)
WARN_ON(event->parent);
static_key_slow_dec(&perf_swevent_enabled[event_id]);
- swevent_hlist_put(event);
+ swevent_hlist_put();
}
+static struct pmu perf_cpu_clock; /* fwd declaration */
+static struct pmu perf_task_clock;
+
static int perf_swevent_init(struct perf_event *event)
{
u64 event_id = event->attr.config;
@@ -6550,7 +10880,10 @@ static int perf_swevent_init(struct perf_event *event)
switch (event_id) {
case PERF_COUNT_SW_CPU_CLOCK:
+ event->attr.type = perf_cpu_clock.type;
+ return -ENOENT;
case PERF_COUNT_SW_TASK_CLOCK:
+ event->attr.type = perf_task_clock.type;
return -ENOENT;
default:
@@ -6563,7 +10896,7 @@ static int perf_swevent_init(struct perf_event *event)
if (!event->parent) {
int err;
- err = swevent_hlist_get(event);
+ err = swevent_hlist_get();
if (err)
return err;
@@ -6589,10 +10922,52 @@ static struct pmu perf_swevent = {
#ifdef CONFIG_EVENT_TRACING
+static void tp_perf_event_destroy(struct perf_event *event)
+{
+ perf_trace_destroy(event);
+}
+
+static int perf_tp_event_init(struct perf_event *event)
+{
+ int err;
+
+ if (event->attr.type != PERF_TYPE_TRACEPOINT)
+ return -ENOENT;
+
+ /*
+ * no branch sampling for tracepoint events
+ */
+ if (has_branch_stack(event))
+ return -EOPNOTSUPP;
+
+ err = perf_trace_init(event);
+ if (err)
+ return err;
+
+ event->destroy = tp_perf_event_destroy;
+
+ return 0;
+}
+
+static struct pmu perf_tracepoint = {
+ .task_ctx_nr = perf_sw_context,
+
+ .event_init = perf_tp_event_init,
+ .add = perf_trace_add,
+ .del = perf_trace_del,
+ .start = perf_swevent_start,
+ .stop = perf_swevent_stop,
+ .read = perf_swevent_read,
+};
+
static int perf_tp_filter_match(struct perf_event *event,
- struct perf_sample_data *data)
+ struct perf_raw_record *raw)
{
- void *record = data->raw->data;
+ void *record = raw->frag.data;
+
+ /* only top level events have filters set */
+ if (event->parent)
+ event = event->parent;
if (likely(!event->filter) || filter_match_preds(event->filter, record))
return 1;
@@ -6600,24 +10975,84 @@ static int perf_tp_filter_match(struct perf_event *event,
}
static int perf_tp_event_match(struct perf_event *event,
- struct perf_sample_data *data,
+ struct perf_raw_record *raw,
struct pt_regs *regs)
{
if (event->hw.state & PERF_HES_STOPPED)
return 0;
/*
- * All tracepoints are from kernel-space.
+ * If exclude_kernel, only trace user-space tracepoints (uprobes)
*/
- if (event->attr.exclude_kernel)
+ if (event->attr.exclude_kernel && !user_mode(regs))
return 0;
- if (!perf_tp_filter_match(event, data))
+ if (!perf_tp_filter_match(event, raw))
return 0;
return 1;
}
-void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
+void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
+ struct trace_event_call *call, u64 count,
+ struct pt_regs *regs, struct hlist_head *head,
+ struct task_struct *task)
+{
+ if (bpf_prog_array_valid(call)) {
+ *(struct pt_regs **)raw_data = regs;
+ if (!trace_call_bpf(call, raw_data) || hlist_empty(head)) {
+ perf_swevent_put_recursion_context(rctx);
+ return;
+ }
+ }
+ perf_tp_event(call->event.type, count, raw_data, size, regs, head,
+ rctx, task);
+}
+EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
+
+static void __perf_tp_event_target_task(u64 count, void *record,
+ struct pt_regs *regs,
+ struct perf_sample_data *data,
+ struct perf_raw_record *raw,
+ struct perf_event *event)
+{
+ struct trace_entry *entry = record;
+
+ if (event->attr.config != entry->type)
+ return;
+ /* Cannot deliver synchronous signal to other task. */
+ if (event->attr.sigtrap)
+ return;
+ if (perf_tp_event_match(event, raw, regs)) {
+ perf_sample_data_init(data, 0, 0);
+ perf_sample_save_raw_data(data, event, raw);
+ perf_swevent_event(event, count, data, regs);
+ }
+}
+
+static void perf_tp_event_target_task(u64 count, void *record,
+ struct pt_regs *regs,
+ struct perf_sample_data *data,
+ struct perf_raw_record *raw,
+ struct perf_event_context *ctx)
+{
+ unsigned int cpu = smp_processor_id();
+ struct pmu *pmu = &perf_tracepoint;
+ struct perf_event *event, *sibling;
+
+ perf_event_groups_for_cpu_pmu(event, &ctx->pinned_groups, cpu, pmu) {
+ __perf_tp_event_target_task(count, record, regs, data, raw, event);
+ for_each_sibling_event(sibling, event)
+ __perf_tp_event_target_task(count, record, regs, data, raw, sibling);
+ }
+
+ perf_event_groups_for_cpu_pmu(event, &ctx->flexible_groups, cpu, pmu) {
+ __perf_tp_event_target_task(count, record, regs, data, raw, event);
+ for_each_sibling_event(sibling, event)
+ __perf_tp_event_target_task(count, record, regs, data, raw, sibling);
+ }
+}
+
+void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
struct pt_regs *regs, struct hlist_head *head, int rctx,
struct task_struct *task)
{
@@ -6625,16 +11060,28 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
struct perf_event *event;
struct perf_raw_record raw = {
- .size = entry_size,
- .data = record,
+ .frag = {
+ .size = entry_size,
+ .data = record,
+ },
};
- perf_sample_data_init(&data, addr, 0);
- data.raw = &raw;
+ perf_trace_buf_update(record, event_type);
hlist_for_each_entry_rcu(event, head, hlist_entry) {
- if (perf_tp_event_match(event, &data, regs))
+ if (perf_tp_event_match(event, &raw, regs)) {
+ /*
+ * Here use the same on-stack perf_sample_data,
+ * some members in data are event-specific and
+ * need to be re-computed for different sweveents.
+ * Re-initialize data->sample_flags safely to avoid
+ * the problem that next event skips preparing data
+ * because data->sample_flags is set.
+ */
+ perf_sample_data_init(&data, 0, 0);
+ perf_sample_save_raw_data(&data, event, &raw);
perf_swevent_event(event, count, &data, regs);
+ }
}
/*
@@ -6643,21 +11090,15 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
*/
if (task && task != current) {
struct perf_event_context *ctx;
- struct trace_entry *entry = record;
rcu_read_lock();
- ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
+ ctx = rcu_dereference(task->perf_event_ctxp);
if (!ctx)
goto unlock;
- list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
- if (event->attr.type != PERF_TYPE_TRACEPOINT)
- continue;
- if (event->attr.config != entry->type)
- continue;
- if (perf_tp_event_match(event, &data, regs))
- perf_swevent_event(event, count, &data, regs);
- }
+ raw_spin_lock(&ctx->lock);
+ perf_tp_event_target_task(count, record, regs, &data, &raw, ctx);
+ raw_spin_unlock(&ctx->lock);
unlock:
rcu_read_unlock();
}
@@ -6666,65 +11107,156 @@ unlock:
}
EXPORT_SYMBOL_GPL(perf_tp_event);
-static void tp_perf_event_destroy(struct perf_event *event)
-{
- perf_trace_destroy(event);
-}
+#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS)
+/*
+ * Flags in config, used by dynamic PMU kprobe and uprobe
+ * The flags should match following PMU_FORMAT_ATTR().
+ *
+ * PERF_PROBE_CONFIG_IS_RETPROBE if set, create kretprobe/uretprobe
+ * if not set, create kprobe/uprobe
+ *
+ * The following values specify a reference counter (or semaphore in the
+ * terminology of tools like dtrace, systemtap, etc.) Userspace Statically
+ * Defined Tracepoints (USDT). Currently, we use 40 bit for the offset.
+ *
+ * PERF_UPROBE_REF_CTR_OFFSET_BITS # of bits in config as th offset
+ * PERF_UPROBE_REF_CTR_OFFSET_SHIFT # of bits to shift left
+ */
+enum perf_probe_config {
+ PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0, /* [k,u]retprobe */
+ PERF_UPROBE_REF_CTR_OFFSET_BITS = 32,
+ PERF_UPROBE_REF_CTR_OFFSET_SHIFT = 64 - PERF_UPROBE_REF_CTR_OFFSET_BITS,
+};
-static int perf_tp_event_init(struct perf_event *event)
+PMU_FORMAT_ATTR(retprobe, "config:0");
+#endif
+
+#ifdef CONFIG_KPROBE_EVENTS
+static struct attribute *kprobe_attrs[] = {
+ &format_attr_retprobe.attr,
+ NULL,
+};
+
+static struct attribute_group kprobe_format_group = {
+ .name = "format",
+ .attrs = kprobe_attrs,
+};
+
+static const struct attribute_group *kprobe_attr_groups[] = {
+ &kprobe_format_group,
+ NULL,
+};
+
+static int perf_kprobe_event_init(struct perf_event *event);
+static struct pmu perf_kprobe = {
+ .task_ctx_nr = perf_sw_context,
+ .event_init = perf_kprobe_event_init,
+ .add = perf_trace_add,
+ .del = perf_trace_del,
+ .start = perf_swevent_start,
+ .stop = perf_swevent_stop,
+ .read = perf_swevent_read,
+ .attr_groups = kprobe_attr_groups,
+};
+
+static int perf_kprobe_event_init(struct perf_event *event)
{
int err;
+ bool is_retprobe;
- if (event->attr.type != PERF_TYPE_TRACEPOINT)
+ if (event->attr.type != perf_kprobe.type)
return -ENOENT;
+ if (!perfmon_capable())
+ return -EACCES;
+
/*
- * no branch sampling for tracepoint events
+ * no branch sampling for probe events
*/
if (has_branch_stack(event))
return -EOPNOTSUPP;
- err = perf_trace_init(event);
+ is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
+ err = perf_kprobe_init(event, is_retprobe);
if (err)
return err;
- event->destroy = tp_perf_event_destroy;
+ event->destroy = perf_kprobe_destroy;
return 0;
}
+#endif /* CONFIG_KPROBE_EVENTS */
-static struct pmu perf_tracepoint = {
- .task_ctx_nr = perf_sw_context,
+#ifdef CONFIG_UPROBE_EVENTS
+PMU_FORMAT_ATTR(ref_ctr_offset, "config:32-63");
- .event_init = perf_tp_event_init,
+static struct attribute *uprobe_attrs[] = {
+ &format_attr_retprobe.attr,
+ &format_attr_ref_ctr_offset.attr,
+ NULL,
+};
+
+static struct attribute_group uprobe_format_group = {
+ .name = "format",
+ .attrs = uprobe_attrs,
+};
+
+static const struct attribute_group *uprobe_attr_groups[] = {
+ &uprobe_format_group,
+ NULL,
+};
+
+static int perf_uprobe_event_init(struct perf_event *event);
+static struct pmu perf_uprobe = {
+ .task_ctx_nr = perf_sw_context,
+ .event_init = perf_uprobe_event_init,
.add = perf_trace_add,
.del = perf_trace_del,
.start = perf_swevent_start,
.stop = perf_swevent_stop,
.read = perf_swevent_read,
+ .attr_groups = uprobe_attr_groups,
};
-static inline void perf_tp_register(void)
+static int perf_uprobe_event_init(struct perf_event *event)
{
- perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
-}
+ int err;
+ unsigned long ref_ctr_offset;
+ bool is_retprobe;
-static int perf_event_set_filter(struct perf_event *event, void __user *arg)
-{
- char *filter_str;
- int ret;
+ if (event->attr.type != perf_uprobe.type)
+ return -ENOENT;
- if (event->attr.type != PERF_TYPE_TRACEPOINT)
- return -EINVAL;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
- filter_str = strndup_user(arg, PAGE_SIZE);
- if (IS_ERR(filter_str))
- return PTR_ERR(filter_str);
+ /*
+ * no branch sampling for probe events
+ */
+ if (has_branch_stack(event))
+ return -EOPNOTSUPP;
- ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
+ is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
+ ref_ctr_offset = event->attr.config >> PERF_UPROBE_REF_CTR_OFFSET_SHIFT;
+ err = perf_uprobe_init(event, ref_ctr_offset, is_retprobe);
+ if (err)
+ return err;
- kfree(filter_str);
- return ret;
+ event->destroy = perf_uprobe_destroy;
+
+ return 0;
+}
+#endif /* CONFIG_UPROBE_EVENTS */
+
+static inline void perf_tp_register(void)
+{
+ perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
+#ifdef CONFIG_KPROBE_EVENTS
+ perf_pmu_register(&perf_kprobe, "kprobe", -1);
+#endif
+#ifdef CONFIG_UPROBE_EVENTS
+ perf_pmu_register(&perf_uprobe, "uprobe", -1);
+#endif
}
static void perf_event_free_filter(struct perf_event *event)
@@ -6732,47 +11264,96 @@ static void perf_event_free_filter(struct perf_event *event)
ftrace_profile_free_filter(event);
}
-static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
+/*
+ * returns true if the event is a tracepoint, or a kprobe/upprobe created
+ * with perf_event_open()
+ */
+static inline bool perf_event_is_tracing(struct perf_event *event)
{
- struct bpf_prog *prog;
+ if (event->pmu == &perf_tracepoint)
+ return true;
+#ifdef CONFIG_KPROBE_EVENTS
+ if (event->pmu == &perf_kprobe)
+ return true;
+#endif
+#ifdef CONFIG_UPROBE_EVENTS
+ if (event->pmu == &perf_uprobe)
+ return true;
+#endif
+ return false;
+}
- if (event->attr.type != PERF_TYPE_TRACEPOINT)
+static int __perf_event_set_bpf_prog(struct perf_event *event,
+ struct bpf_prog *prog,
+ u64 bpf_cookie)
+{
+ bool is_kprobe, is_uprobe, is_tracepoint, is_syscall_tp;
+
+ if (event->state <= PERF_EVENT_STATE_REVOKED)
+ return -ENODEV;
+
+ if (!perf_event_is_tracing(event))
+ return perf_event_set_bpf_handler(event, prog, bpf_cookie);
+
+ is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_KPROBE;
+ is_uprobe = event->tp_event->flags & TRACE_EVENT_FL_UPROBE;
+ is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
+ is_syscall_tp = is_syscall_trace_event(event->tp_event);
+ if (!is_kprobe && !is_uprobe && !is_tracepoint && !is_syscall_tp)
+ /* bpf programs can only be attached to u/kprobe or tracepoint */
return -EINVAL;
- if (event->tp_event->prog)
- return -EEXIST;
+ if (((is_kprobe || is_uprobe) && prog->type != BPF_PROG_TYPE_KPROBE) ||
+ (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) ||
+ (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT))
+ return -EINVAL;
- if (!(event->tp_event->flags & TRACE_EVENT_FL_KPROBE))
- /* bpf programs can only be attached to kprobes */
+ if (prog->type == BPF_PROG_TYPE_KPROBE && prog->sleepable && !is_uprobe)
+ /* only uprobe programs are allowed to be sleepable */
return -EINVAL;
- prog = bpf_prog_get(prog_fd);
- if (IS_ERR(prog))
- return PTR_ERR(prog);
+ /* Kprobe override only works for kprobes, not uprobes. */
+ if (prog->kprobe_override && !is_kprobe)
+ return -EINVAL;
- if (prog->type != BPF_PROG_TYPE_KPROBE) {
- /* valid fd, but invalid bpf program type */
- bpf_prog_put(prog);
+ /* Writing to context allowed only for uprobes. */
+ if (prog->aux->kprobe_write_ctx && !is_uprobe)
return -EINVAL;
- }
- event->tp_event->prog = prog;
+ if (is_tracepoint || is_syscall_tp) {
+ int off = trace_event_get_offsets(event->tp_event);
- return 0;
+ if (prog->aux->max_ctx_offset > off)
+ return -EACCES;
+ }
+
+ return perf_event_attach_bpf_prog(event, prog, bpf_cookie);
}
-static void perf_event_free_bpf_prog(struct perf_event *event)
+int perf_event_set_bpf_prog(struct perf_event *event,
+ struct bpf_prog *prog,
+ u64 bpf_cookie)
{
- struct bpf_prog *prog;
+ struct perf_event_context *ctx;
+ int ret;
- if (!event->tp_event)
+ ctx = perf_event_ctx_lock(event);
+ ret = __perf_event_set_bpf_prog(event, prog, bpf_cookie);
+ perf_event_ctx_unlock(event, ctx);
+
+ return ret;
+}
+
+void perf_event_free_bpf_prog(struct perf_event *event)
+{
+ if (!event->prog)
return;
- prog = event->tp_event->prog;
- if (prog) {
- event->tp_event->prog = NULL;
- bpf_prog_put(prog);
+ if (!perf_event_is_tracing(event)) {
+ perf_event_free_bpf_handler(event);
+ return;
}
+ perf_event_detach_bpf_prog(event);
}
#else
@@ -6781,21 +11362,25 @@ static inline void perf_tp_register(void)
{
}
-static int perf_event_set_filter(struct perf_event *event, void __user *arg)
+static void perf_event_free_filter(struct perf_event *event)
{
- return -ENOENT;
}
-static void perf_event_free_filter(struct perf_event *event)
+static int __perf_event_set_bpf_prog(struct perf_event *event,
+ struct bpf_prog *prog,
+ u64 bpf_cookie)
{
+ return -ENOENT;
}
-static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
+int perf_event_set_bpf_prog(struct perf_event *event,
+ struct bpf_prog *prog,
+ u64 bpf_cookie)
{
return -ENOENT;
}
-static void perf_event_free_bpf_prog(struct perf_event *event)
+void perf_event_free_bpf_prog(struct perf_event *event)
{
}
#endif /* CONFIG_EVENT_TRACING */
@@ -6814,6 +11399,438 @@ void perf_bp_event(struct perf_event *bp, void *data)
#endif
/*
+ * Allocate a new address filter
+ */
+static struct perf_addr_filter *
+perf_addr_filter_new(struct perf_event *event, struct list_head *filters)
+{
+ int node = cpu_to_node(event->cpu == -1 ? 0 : event->cpu);
+ struct perf_addr_filter *filter;
+
+ filter = kzalloc_node(sizeof(*filter), GFP_KERNEL, node);
+ if (!filter)
+ return NULL;
+
+ INIT_LIST_HEAD(&filter->entry);
+ list_add_tail(&filter->entry, filters);
+
+ return filter;
+}
+
+static void free_filters_list(struct list_head *filters)
+{
+ struct perf_addr_filter *filter, *iter;
+
+ list_for_each_entry_safe(filter, iter, filters, entry) {
+ path_put(&filter->path);
+ list_del(&filter->entry);
+ kfree(filter);
+ }
+}
+
+/*
+ * Free existing address filters and optionally install new ones
+ */
+static void perf_addr_filters_splice(struct perf_event *event,
+ struct list_head *head)
+{
+ unsigned long flags;
+ LIST_HEAD(list);
+
+ if (!has_addr_filter(event))
+ return;
+
+ /* don't bother with children, they don't have their own filters */
+ if (event->parent)
+ return;
+
+ raw_spin_lock_irqsave(&event->addr_filters.lock, flags);
+
+ list_splice_init(&event->addr_filters.list, &list);
+ if (head)
+ list_splice(head, &event->addr_filters.list);
+
+ raw_spin_unlock_irqrestore(&event->addr_filters.lock, flags);
+
+ free_filters_list(&list);
+}
+
+static void perf_free_addr_filters(struct perf_event *event)
+{
+ /*
+ * Used during free paths, there is no concurrency.
+ */
+ if (list_empty(&event->addr_filters.list))
+ return;
+
+ perf_addr_filters_splice(event, NULL);
+}
+
+/*
+ * Scan through mm's vmas and see if one of them matches the
+ * @filter; if so, adjust filter's address range.
+ * Called with mm::mmap_lock down for reading.
+ */
+static void perf_addr_filter_apply(struct perf_addr_filter *filter,
+ struct mm_struct *mm,
+ struct perf_addr_filter_range *fr)
+{
+ struct vm_area_struct *vma;
+ VMA_ITERATOR(vmi, mm, 0);
+
+ for_each_vma(vmi, vma) {
+ if (!vma->vm_file)
+ continue;
+
+ if (perf_addr_filter_vma_adjust(filter, vma, fr))
+ return;
+ }
+}
+
+/*
+ * Update event's address range filters based on the
+ * task's existing mappings, if any.
+ */
+static void perf_event_addr_filters_apply(struct perf_event *event)
+{
+ struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
+ struct task_struct *task = READ_ONCE(event->ctx->task);
+ struct perf_addr_filter *filter;
+ struct mm_struct *mm = NULL;
+ unsigned int count = 0;
+ unsigned long flags;
+
+ /*
+ * We may observe TASK_TOMBSTONE, which means that the event tear-down
+ * will stop on the parent's child_mutex that our caller is also holding
+ */
+ if (task == TASK_TOMBSTONE)
+ return;
+
+ if (ifh->nr_file_filters) {
+ mm = get_task_mm(task);
+ if (!mm)
+ goto restart;
+
+ mmap_read_lock(mm);
+ }
+
+ raw_spin_lock_irqsave(&ifh->lock, flags);
+ list_for_each_entry(filter, &ifh->list, entry) {
+ if (filter->path.dentry) {
+ /*
+ * Adjust base offset if the filter is associated to a
+ * binary that needs to be mapped:
+ */
+ event->addr_filter_ranges[count].start = 0;
+ event->addr_filter_ranges[count].size = 0;
+
+ perf_addr_filter_apply(filter, mm, &event->addr_filter_ranges[count]);
+ } else {
+ event->addr_filter_ranges[count].start = filter->offset;
+ event->addr_filter_ranges[count].size = filter->size;
+ }
+
+ count++;
+ }
+
+ event->addr_filters_gen++;
+ raw_spin_unlock_irqrestore(&ifh->lock, flags);
+
+ if (ifh->nr_file_filters) {
+ mmap_read_unlock(mm);
+
+ mmput(mm);
+ }
+
+restart:
+ perf_event_stop(event, 1);
+}
+
+/*
+ * Address range filtering: limiting the data to certain
+ * instruction address ranges. Filters are ioctl()ed to us from
+ * userspace as ascii strings.
+ *
+ * Filter string format:
+ *
+ * ACTION RANGE_SPEC
+ * where ACTION is one of the
+ * * "filter": limit the trace to this region
+ * * "start": start tracing from this address
+ * * "stop": stop tracing at this address/region;
+ * RANGE_SPEC is
+ * * for kernel addresses: <start address>[/<size>]
+ * * for object files: <start address>[/<size>]@</path/to/object/file>
+ *
+ * if <size> is not specified or is zero, the range is treated as a single
+ * address; not valid for ACTION=="filter".
+ */
+enum {
+ IF_ACT_NONE = -1,
+ IF_ACT_FILTER,
+ IF_ACT_START,
+ IF_ACT_STOP,
+ IF_SRC_FILE,
+ IF_SRC_KERNEL,
+ IF_SRC_FILEADDR,
+ IF_SRC_KERNELADDR,
+};
+
+enum {
+ IF_STATE_ACTION = 0,
+ IF_STATE_SOURCE,
+ IF_STATE_END,
+};
+
+static const match_table_t if_tokens = {
+ { IF_ACT_FILTER, "filter" },
+ { IF_ACT_START, "start" },
+ { IF_ACT_STOP, "stop" },
+ { IF_SRC_FILE, "%u/%u@%s" },
+ { IF_SRC_KERNEL, "%u/%u" },
+ { IF_SRC_FILEADDR, "%u@%s" },
+ { IF_SRC_KERNELADDR, "%u" },
+ { IF_ACT_NONE, NULL },
+};
+
+/*
+ * Address filter string parser
+ */
+static int
+perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
+ struct list_head *filters)
+{
+ struct perf_addr_filter *filter = NULL;
+ char *start, *orig, *filename = NULL;
+ substring_t args[MAX_OPT_ARGS];
+ int state = IF_STATE_ACTION, token;
+ unsigned int kernel = 0;
+ int ret = -EINVAL;
+
+ orig = fstr = kstrdup(fstr, GFP_KERNEL);
+ if (!fstr)
+ return -ENOMEM;
+
+ while ((start = strsep(&fstr, " ,\n")) != NULL) {
+ static const enum perf_addr_filter_action_t actions[] = {
+ [IF_ACT_FILTER] = PERF_ADDR_FILTER_ACTION_FILTER,
+ [IF_ACT_START] = PERF_ADDR_FILTER_ACTION_START,
+ [IF_ACT_STOP] = PERF_ADDR_FILTER_ACTION_STOP,
+ };
+ ret = -EINVAL;
+
+ if (!*start)
+ continue;
+
+ /* filter definition begins */
+ if (state == IF_STATE_ACTION) {
+ filter = perf_addr_filter_new(event, filters);
+ if (!filter)
+ goto fail;
+ }
+
+ token = match_token(start, if_tokens, args);
+ switch (token) {
+ case IF_ACT_FILTER:
+ case IF_ACT_START:
+ case IF_ACT_STOP:
+ if (state != IF_STATE_ACTION)
+ goto fail;
+
+ filter->action = actions[token];
+ state = IF_STATE_SOURCE;
+ break;
+
+ case IF_SRC_KERNELADDR:
+ case IF_SRC_KERNEL:
+ kernel = 1;
+ fallthrough;
+
+ case IF_SRC_FILEADDR:
+ case IF_SRC_FILE:
+ if (state != IF_STATE_SOURCE)
+ goto fail;
+
+ *args[0].to = 0;
+ ret = kstrtoul(args[0].from, 0, &filter->offset);
+ if (ret)
+ goto fail;
+
+ if (token == IF_SRC_KERNEL || token == IF_SRC_FILE) {
+ *args[1].to = 0;
+ ret = kstrtoul(args[1].from, 0, &filter->size);
+ if (ret)
+ goto fail;
+ }
+
+ if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) {
+ int fpos = token == IF_SRC_FILE ? 2 : 1;
+
+ kfree(filename);
+ filename = match_strdup(&args[fpos]);
+ if (!filename) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+ }
+
+ state = IF_STATE_END;
+ break;
+
+ default:
+ goto fail;
+ }
+
+ /*
+ * Filter definition is fully parsed, validate and install it.
+ * Make sure that it doesn't contradict itself or the event's
+ * attribute.
+ */
+ if (state == IF_STATE_END) {
+ ret = -EINVAL;
+
+ /*
+ * ACTION "filter" must have a non-zero length region
+ * specified.
+ */
+ if (filter->action == PERF_ADDR_FILTER_ACTION_FILTER &&
+ !filter->size)
+ goto fail;
+
+ if (!kernel) {
+ if (!filename)
+ goto fail;
+
+ /*
+ * For now, we only support file-based filters
+ * in per-task events; doing so for CPU-wide
+ * events requires additional context switching
+ * trickery, since same object code will be
+ * mapped at different virtual addresses in
+ * different processes.
+ */
+ ret = -EOPNOTSUPP;
+ if (!event->ctx->task)
+ goto fail;
+
+ /* look up the path and grab its inode */
+ ret = kern_path(filename, LOOKUP_FOLLOW,
+ &filter->path);
+ if (ret)
+ goto fail;
+
+ ret = -EINVAL;
+ if (!filter->path.dentry ||
+ !S_ISREG(d_inode(filter->path.dentry)
+ ->i_mode))
+ goto fail;
+
+ event->addr_filters.nr_file_filters++;
+ }
+
+ /* ready to consume more filters */
+ kfree(filename);
+ filename = NULL;
+ state = IF_STATE_ACTION;
+ filter = NULL;
+ kernel = 0;
+ }
+ }
+
+ if (state != IF_STATE_ACTION)
+ goto fail;
+
+ kfree(filename);
+ kfree(orig);
+
+ return 0;
+
+fail:
+ kfree(filename);
+ free_filters_list(filters);
+ kfree(orig);
+
+ return ret;
+}
+
+static int
+perf_event_set_addr_filter(struct perf_event *event, char *filter_str)
+{
+ LIST_HEAD(filters);
+ int ret;
+
+ /*
+ * Since this is called in perf_ioctl() path, we're already holding
+ * ctx::mutex.
+ */
+ lockdep_assert_held(&event->ctx->mutex);
+
+ if (WARN_ON_ONCE(event->parent))
+ return -EINVAL;
+
+ ret = perf_event_parse_addr_filter(event, filter_str, &filters);
+ if (ret)
+ goto fail_clear_files;
+
+ ret = event->pmu->addr_filters_validate(&filters);
+ if (ret)
+ goto fail_free_filters;
+
+ /* remove existing filters, if any */
+ perf_addr_filters_splice(event, &filters);
+
+ /* install new filters */
+ perf_event_for_each_child(event, perf_event_addr_filters_apply);
+
+ return ret;
+
+fail_free_filters:
+ free_filters_list(&filters);
+
+fail_clear_files:
+ event->addr_filters.nr_file_filters = 0;
+
+ return ret;
+}
+
+static int perf_event_set_filter(struct perf_event *event, void __user *arg)
+{
+ int ret = -EINVAL;
+ char *filter_str;
+
+ filter_str = strndup_user(arg, PAGE_SIZE);
+ if (IS_ERR(filter_str))
+ return PTR_ERR(filter_str);
+
+#ifdef CONFIG_EVENT_TRACING
+ if (perf_event_is_tracing(event)) {
+ struct perf_event_context *ctx = event->ctx;
+
+ /*
+ * Beware, here be dragons!!
+ *
+ * the tracepoint muck will deadlock against ctx->mutex, but
+ * the tracepoint stuff does not actually need it. So
+ * temporarily drop ctx->mutex. As per perf_event_ctx_lock() we
+ * already have a reference on ctx.
+ *
+ * This can result in event getting moved to a different ctx,
+ * but that does not affect the tracepoint state.
+ */
+ mutex_unlock(&ctx->mutex);
+ ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
+ mutex_lock(&ctx->mutex);
+ } else
+#endif
+ if (has_addr_filter(event))
+ ret = perf_event_set_addr_filter(event, filter_str);
+
+ kfree(filter_str);
+ return ret;
+}
+
+/*
* hrtimer based swevent callback
*/
@@ -6827,7 +11844,8 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
event = container_of(hrtimer, struct perf_event, hw.hrtimer);
- if (event->state != PERF_EVENT_STATE_ACTIVE)
+ if (event->state != PERF_EVENT_STATE_ACTIVE ||
+ event->hw.state & PERF_HES_STOPPED)
return HRTIMER_NORESTART;
event->pmu->read(event);
@@ -6864,20 +11882,29 @@ static void perf_swevent_start_hrtimer(struct perf_event *event)
} else {
period = max_t(u64, 10000, hwc->sample_period);
}
- __hrtimer_start_range_ns(&hwc->hrtimer,
- ns_to_ktime(period), 0,
- HRTIMER_MODE_REL_PINNED, 0);
+ hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
+ HRTIMER_MODE_REL_PINNED_HARD);
}
static void perf_swevent_cancel_hrtimer(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
- if (is_sampling_event(event)) {
+ /*
+ * Careful: this function can be triggered in the hrtimer handler,
+ * for cpu-clock events, so hrtimer_cancel() would cause a
+ * deadlock.
+ *
+ * So use hrtimer_try_to_cancel() to try to stop the hrtimer,
+ * and the cpu-clock handler also sets the PERF_HES_STOPPED flag,
+ * which guarantees that perf_swevent_hrtimer() will stop the
+ * hrtimer once it sees the PERF_HES_STOPPED flag.
+ */
+ if (is_sampling_event(event) && (hwc->interrupts != MAX_INTERRUPTS)) {
ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
local64_set(&hwc->period_left, ktime_to_ns(remaining));
- hrtimer_cancel(&hwc->hrtimer);
+ hrtimer_try_to_cancel(&hwc->hrtimer);
}
}
@@ -6888,8 +11915,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event)
if (!is_sampling_event(event))
return;
- hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- hwc->hrtimer.function = perf_swevent_hrtimer;
+ hrtimer_setup(&hwc->hrtimer, perf_swevent_hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
/*
* Since hrtimers have a fixed rate, we can do a static freq->period
@@ -6922,14 +11948,17 @@ static void cpu_clock_event_update(struct perf_event *event)
static void cpu_clock_event_start(struct perf_event *event, int flags)
{
+ event->hw.state = 0;
local64_set(&event->hw.prev_count, local_clock());
perf_swevent_start_hrtimer(event);
}
static void cpu_clock_event_stop(struct perf_event *event, int flags)
{
+ event->hw.state = PERF_HES_STOPPED;
perf_swevent_cancel_hrtimer(event);
- cpu_clock_event_update(event);
+ if (flags & PERF_EF_UPDATE)
+ cpu_clock_event_update(event);
}
static int cpu_clock_event_add(struct perf_event *event, int flags)
@@ -6943,7 +11972,7 @@ static int cpu_clock_event_add(struct perf_event *event, int flags)
static void cpu_clock_event_del(struct perf_event *event, int flags)
{
- cpu_clock_event_stop(event, flags);
+ cpu_clock_event_stop(event, PERF_EF_UPDATE);
}
static void cpu_clock_event_read(struct perf_event *event)
@@ -6953,7 +11982,7 @@ static void cpu_clock_event_read(struct perf_event *event)
static int cpu_clock_event_init(struct perf_event *event)
{
- if (event->attr.type != PERF_TYPE_SOFTWARE)
+ if (event->attr.type != perf_cpu_clock.type)
return -ENOENT;
if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
@@ -6974,6 +12003,7 @@ static struct pmu perf_cpu_clock = {
.task_ctx_nr = perf_sw_context,
.capabilities = PERF_PMU_CAP_NO_NMI,
+ .dev = PMU_NULL_DEV,
.event_init = cpu_clock_event_init,
.add = cpu_clock_event_add,
@@ -6999,14 +12029,17 @@ static void task_clock_event_update(struct perf_event *event, u64 now)
static void task_clock_event_start(struct perf_event *event, int flags)
{
+ event->hw.state = 0;
local64_set(&event->hw.prev_count, event->ctx->time);
perf_swevent_start_hrtimer(event);
}
static void task_clock_event_stop(struct perf_event *event, int flags)
{
+ event->hw.state = PERF_HES_STOPPED;
perf_swevent_cancel_hrtimer(event);
- task_clock_event_update(event, event->ctx->time);
+ if (flags & PERF_EF_UPDATE)
+ task_clock_event_update(event, event->ctx->time);
}
static int task_clock_event_add(struct perf_event *event, int flags)
@@ -7034,7 +12067,7 @@ static void task_clock_event_read(struct perf_event *event)
static int task_clock_event_init(struct perf_event *event)
{
- if (event->attr.type != PERF_TYPE_SOFTWARE)
+ if (event->attr.type != perf_task_clock.type)
return -ENOENT;
if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
@@ -7055,6 +12088,7 @@ static struct pmu perf_task_clock = {
.task_ctx_nr = perf_sw_context,
.capabilities = PERF_PMU_CAP_NO_NMI,
+ .dev = PMU_NULL_DEV,
.event_init = task_clock_event_init,
.add = task_clock_event_add,
@@ -7068,24 +12102,54 @@ static void perf_pmu_nop_void(struct pmu *pmu)
{
}
+static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags)
+{
+}
+
static int perf_pmu_nop_int(struct pmu *pmu)
{
return 0;
}
-static void perf_pmu_start_txn(struct pmu *pmu)
+static int perf_event_nop_int(struct perf_event *event, u64 value)
+{
+ return 0;
+}
+
+static DEFINE_PER_CPU(unsigned int, nop_txn_flags);
+
+static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags)
{
+ __this_cpu_write(nop_txn_flags, flags);
+
+ if (flags & ~PERF_PMU_TXN_ADD)
+ return;
+
perf_pmu_disable(pmu);
}
static int perf_pmu_commit_txn(struct pmu *pmu)
{
+ unsigned int flags = __this_cpu_read(nop_txn_flags);
+
+ __this_cpu_write(nop_txn_flags, 0);
+
+ if (flags & ~PERF_PMU_TXN_ADD)
+ return 0;
+
perf_pmu_enable(pmu);
return 0;
}
static void perf_pmu_cancel_txn(struct pmu *pmu)
{
+ unsigned int flags = __this_cpu_read(nop_txn_flags);
+
+ __this_cpu_write(nop_txn_flags, 0);
+
+ if (flags & ~PERF_PMU_TXN_ADD)
+ return;
+
perf_pmu_enable(pmu);
}
@@ -7095,57 +12159,18 @@ static int perf_event_idx_default(struct perf_event *event)
}
/*
- * Ensures all contexts with the same task_ctx_nr have the same
- * pmu_cpu_context too.
+ * Let userspace know that this PMU supports address range filtering:
*/
-static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
+static ssize_t nr_addr_filters_show(struct device *dev,
+ struct device_attribute *attr,
+ char *page)
{
- struct pmu *pmu;
-
- if (ctxn < 0)
- return NULL;
-
- list_for_each_entry(pmu, &pmus, entry) {
- if (pmu->task_ctx_nr == ctxn)
- return pmu->pmu_cpu_context;
- }
-
- return NULL;
-}
-
-static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
-{
- int cpu;
-
- for_each_possible_cpu(cpu) {
- struct perf_cpu_context *cpuctx;
-
- cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+ struct pmu *pmu = dev_get_drvdata(dev);
- if (cpuctx->unique_pmu == old_pmu)
- cpuctx->unique_pmu = pmu;
- }
+ return sysfs_emit(page, "%d\n", pmu->nr_addr_filters);
}
+DEVICE_ATTR_RO(nr_addr_filters);
-static void free_pmu_context(struct pmu *pmu)
-{
- struct pmu *i;
-
- mutex_lock(&pmus_lock);
- /*
- * Like a real lame refcount.
- */
- list_for_each_entry(i, &pmus, entry) {
- if (i->pmu_cpu_context == pmu->pmu_cpu_context) {
- update_pmu_context(i, pmu);
- goto out;
- }
- }
-
- free_percpu(pmu->pmu_cpu_context);
-out:
- mutex_unlock(&pmus_lock);
-}
static struct idr pmu_idr;
static ssize_t
@@ -7153,7 +12178,7 @@ type_show(struct device *dev, struct device_attribute *attr, char *page)
{
struct pmu *pmu = dev_get_drvdata(dev);
- return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
+ return sysfs_emit(page, "%d\n", pmu->type);
}
static DEVICE_ATTR_RO(type);
@@ -7164,9 +12189,11 @@ perf_event_mux_interval_ms_show(struct device *dev,
{
struct pmu *pmu = dev_get_drvdata(dev);
- return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
+ return sysfs_emit(page, "%d\n", pmu->hrtimer_interval_ms);
}
+static DEFINE_MUTEX(mux_interval_mutex);
+
static ssize_t
perf_event_mux_interval_ms_store(struct device *dev,
struct device_attribute *attr,
@@ -7186,31 +12213,109 @@ perf_event_mux_interval_ms_store(struct device *dev,
if (timer == pmu->hrtimer_interval_ms)
return count;
+ mutex_lock(&mux_interval_mutex);
pmu->hrtimer_interval_ms = timer;
/* update all cpuctx for this PMU */
- for_each_possible_cpu(cpu) {
- struct perf_cpu_context *cpuctx;
- cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
- cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
+ cpus_read_lock();
+ for_each_online_cpu(cpu) {
+ struct perf_cpu_pmu_context *cpc;
+ cpc = *per_cpu_ptr(pmu->cpu_pmu_context, cpu);
+ cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
- if (hrtimer_active(&cpuctx->hrtimer))
- hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval);
+ cpu_function_call(cpu, perf_mux_hrtimer_restart_ipi, cpc);
}
+ cpus_read_unlock();
+ mutex_unlock(&mux_interval_mutex);
return count;
}
static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
+static inline const struct cpumask *perf_scope_cpu_topology_cpumask(unsigned int scope, int cpu)
+{
+ switch (scope) {
+ case PERF_PMU_SCOPE_CORE:
+ return topology_sibling_cpumask(cpu);
+ case PERF_PMU_SCOPE_DIE:
+ return topology_die_cpumask(cpu);
+ case PERF_PMU_SCOPE_CLUSTER:
+ return topology_cluster_cpumask(cpu);
+ case PERF_PMU_SCOPE_PKG:
+ return topology_core_cpumask(cpu);
+ case PERF_PMU_SCOPE_SYS_WIDE:
+ return cpu_online_mask;
+ }
+
+ return NULL;
+}
+
+static inline struct cpumask *perf_scope_cpumask(unsigned int scope)
+{
+ switch (scope) {
+ case PERF_PMU_SCOPE_CORE:
+ return perf_online_core_mask;
+ case PERF_PMU_SCOPE_DIE:
+ return perf_online_die_mask;
+ case PERF_PMU_SCOPE_CLUSTER:
+ return perf_online_cluster_mask;
+ case PERF_PMU_SCOPE_PKG:
+ return perf_online_pkg_mask;
+ case PERF_PMU_SCOPE_SYS_WIDE:
+ return perf_online_sys_mask;
+ }
+
+ return NULL;
+}
+
+static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct pmu *pmu = dev_get_drvdata(dev);
+ struct cpumask *mask = perf_scope_cpumask(pmu->scope);
+
+ if (mask)
+ return cpumap_print_to_pagebuf(true, buf, mask);
+ return 0;
+}
+
+static DEVICE_ATTR_RO(cpumask);
+
static struct attribute *pmu_dev_attrs[] = {
&dev_attr_type.attr,
&dev_attr_perf_event_mux_interval_ms.attr,
+ &dev_attr_nr_addr_filters.attr,
+ &dev_attr_cpumask.attr,
+ NULL,
+};
+
+static umode_t pmu_dev_is_visible(struct kobject *kobj, struct attribute *a, int n)
+{
+ struct device *dev = kobj_to_dev(kobj);
+ struct pmu *pmu = dev_get_drvdata(dev);
+
+ if (n == 2 && !pmu->nr_addr_filters)
+ return 0;
+
+ /* cpumask */
+ if (n == 3 && pmu->scope == PERF_PMU_SCOPE_NONE)
+ return 0;
+
+ return a->mode;
+}
+
+static struct attribute_group pmu_dev_attr_group = {
+ .is_visible = pmu_dev_is_visible,
+ .attrs = pmu_dev_attrs,
+};
+
+static const struct attribute_group *pmu_dev_groups[] = {
+ &pmu_dev_attr_group,
NULL,
};
-ATTRIBUTE_GROUPS(pmu_dev);
static int pmu_bus_running;
-static struct bus_type pmu_bus = {
+static const struct bus_type pmu_bus = {
.name = "event_source",
.dev_groups = pmu_dev_groups,
};
@@ -7230,83 +12335,139 @@ static int pmu_dev_alloc(struct pmu *pmu)
pmu->dev->groups = pmu->attr_groups;
device_initialize(pmu->dev);
- ret = dev_set_name(pmu->dev, "%s", pmu->name);
- if (ret)
- goto free_dev;
dev_set_drvdata(pmu->dev, pmu);
pmu->dev->bus = &pmu_bus;
+ pmu->dev->parent = pmu->parent;
pmu->dev->release = pmu_dev_release;
+
+ ret = dev_set_name(pmu->dev, "%s", pmu->name);
+ if (ret)
+ goto free_dev;
+
ret = device_add(pmu->dev);
if (ret)
goto free_dev;
+ if (pmu->attr_update) {
+ ret = sysfs_update_groups(&pmu->dev->kobj, pmu->attr_update);
+ if (ret)
+ goto del_dev;
+ }
+
out:
return ret;
+del_dev:
+ device_del(pmu->dev);
+
free_dev:
put_device(pmu->dev);
+ pmu->dev = NULL;
goto out;
}
static struct lock_class_key cpuctx_mutex;
static struct lock_class_key cpuctx_lock;
-int perf_pmu_register(struct pmu *pmu, const char *name, int type)
+static bool idr_cmpxchg(struct idr *idr, unsigned long id, void *old, void *new)
{
- int cpu, ret;
+ void *tmp, *val = idr_find(idr, id);
- mutex_lock(&pmus_lock);
- ret = -ENOMEM;
- pmu->pmu_disable_count = alloc_percpu(int);
- if (!pmu->pmu_disable_count)
- goto unlock;
+ if (val != old)
+ return false;
- pmu->type = -1;
- if (!name)
- goto skip_type;
- pmu->name = name;
+ tmp = idr_replace(idr, new, id);
+ if (IS_ERR(tmp))
+ return false;
+
+ WARN_ON_ONCE(tmp != val);
+ return true;
+}
- if (type < 0) {
- type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
- if (type < 0) {
- ret = type;
- goto free_pdc;
+static void perf_pmu_free(struct pmu *pmu)
+{
+ if (pmu_bus_running && pmu->dev && pmu->dev != PMU_NULL_DEV) {
+ if (pmu->nr_addr_filters)
+ device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
+ device_del(pmu->dev);
+ put_device(pmu->dev);
+ }
+
+ if (pmu->cpu_pmu_context) {
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct perf_cpu_pmu_context *cpc;
+
+ cpc = *per_cpu_ptr(pmu->cpu_pmu_context, cpu);
+ if (!cpc)
+ continue;
+ if (cpc->epc.embedded) {
+ /* refcount managed */
+ put_pmu_ctx(&cpc->epc);
+ continue;
+ }
+ kfree(cpc);
}
+ free_percpu(pmu->cpu_pmu_context);
}
- pmu->type = type;
+}
- if (pmu_bus_running) {
- ret = pmu_dev_alloc(pmu);
+DEFINE_FREE(pmu_unregister, struct pmu *, if (_T) perf_pmu_free(_T))
+
+int perf_pmu_register(struct pmu *_pmu, const char *name, int type)
+{
+ int cpu, max = PERF_TYPE_MAX;
+
+ struct pmu *pmu __free(pmu_unregister) = _pmu;
+ guard(mutex)(&pmus_lock);
+
+ if (WARN_ONCE(!name, "Can not register anonymous pmu.\n"))
+ return -EINVAL;
+
+ if (WARN_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE,
+ "Can not register a pmu with an invalid scope.\n"))
+ return -EINVAL;
+
+ pmu->name = name;
+
+ if (type >= 0)
+ max = type;
+
+ CLASS(idr_alloc, pmu_type)(&pmu_idr, NULL, max, 0, GFP_KERNEL);
+ if (pmu_type.id < 0)
+ return pmu_type.id;
+
+ WARN_ON(type >= 0 && pmu_type.id != type);
+
+ pmu->type = pmu_type.id;
+ atomic_set(&pmu->exclusive_cnt, 0);
+
+ if (pmu_bus_running && !pmu->dev) {
+ int ret = pmu_dev_alloc(pmu);
if (ret)
- goto free_idr;
+ return ret;
}
-skip_type:
- pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
- if (pmu->pmu_cpu_context)
- goto got_cpu_context;
-
- ret = -ENOMEM;
- pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
- if (!pmu->pmu_cpu_context)
- goto free_dev;
+ pmu->cpu_pmu_context = alloc_percpu(struct perf_cpu_pmu_context *);
+ if (!pmu->cpu_pmu_context)
+ return -ENOMEM;
for_each_possible_cpu(cpu) {
- struct perf_cpu_context *cpuctx;
+ struct perf_cpu_pmu_context *cpc =
+ kmalloc_node(sizeof(struct perf_cpu_pmu_context),
+ GFP_KERNEL | __GFP_ZERO,
+ cpu_to_node(cpu));
- cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
- __perf_event_init_context(&cpuctx->ctx);
- lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
- lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
- cpuctx->ctx.pmu = pmu;
-
- __perf_cpu_hrtimer_init(cpuctx, cpu);
+ if (!cpc)
+ return -ENOMEM;
- cpuctx->unique_pmu = pmu;
+ *per_cpu_ptr(pmu->cpu_pmu_context, cpu) = cpc;
+ __perf_init_event_pmu_context(&cpc->epc, pmu);
+ __perf_mux_hrtimer_init(cpc, cpu);
}
-got_cpu_context:
if (!pmu->start_txn) {
if (pmu->pmu_enable) {
/*
@@ -7318,7 +12479,7 @@ got_cpu_context:
pmu->commit_txn = perf_pmu_commit_txn;
pmu->cancel_txn = perf_pmu_cancel_txn;
} else {
- pmu->start_txn = perf_pmu_nop_void;
+ pmu->start_txn = perf_pmu_nop_txn;
pmu->commit_txn = perf_pmu_nop_int;
pmu->cancel_txn = perf_pmu_nop_void;
}
@@ -7329,53 +12490,174 @@ got_cpu_context:
pmu->pmu_disable = perf_pmu_nop_void;
}
+ if (!pmu->check_period)
+ pmu->check_period = perf_event_nop_int;
+
if (!pmu->event_idx)
pmu->event_idx = perf_event_idx_default;
+ INIT_LIST_HEAD(&pmu->events);
+ spin_lock_init(&pmu->events_lock);
+
+ /*
+ * Now that the PMU is complete, make it visible to perf_try_init_event().
+ */
+ if (!idr_cmpxchg(&pmu_idr, pmu->type, NULL, pmu))
+ return -EINVAL;
list_add_rcu(&pmu->entry, &pmus);
- atomic_set(&pmu->exclusive_cnt, 0);
- ret = 0;
-unlock:
- mutex_unlock(&pmus_lock);
- return ret;
+ take_idr_id(pmu_type);
+ _pmu = no_free_ptr(pmu); // let it rip
+ return 0;
+}
+EXPORT_SYMBOL_GPL(perf_pmu_register);
-free_dev:
- device_del(pmu->dev);
- put_device(pmu->dev);
+static void __pmu_detach_event(struct pmu *pmu, struct perf_event *event,
+ struct perf_event_context *ctx)
+{
+ /*
+ * De-schedule the event and mark it REVOKED.
+ */
+ perf_event_exit_event(event, ctx, true);
-free_idr:
- if (pmu->type >= PERF_TYPE_MAX)
- idr_remove(&pmu_idr, pmu->type);
+ /*
+ * All _free_event() bits that rely on event->pmu:
+ *
+ * Notably, perf_mmap() relies on the ordering here.
+ */
+ scoped_guard (mutex, &event->mmap_mutex) {
+ WARN_ON_ONCE(pmu->event_unmapped);
+ /*
+ * Mostly an empty lock sequence, such that perf_mmap(), which
+ * relies on mmap_mutex, is sure to observe the state change.
+ */
+ }
+
+ perf_event_free_bpf_prog(event);
+ perf_free_addr_filters(event);
+
+ if (event->destroy) {
+ event->destroy(event);
+ event->destroy = NULL;
+ }
+
+ if (event->pmu_ctx) {
+ put_pmu_ctx(event->pmu_ctx);
+ event->pmu_ctx = NULL;
+ }
-free_pdc:
- free_percpu(pmu->pmu_disable_count);
- goto unlock;
+ exclusive_event_destroy(event);
+ module_put(pmu->module);
+
+ event->pmu = NULL; /* force fault instead of UAF */
}
-EXPORT_SYMBOL_GPL(perf_pmu_register);
-void perf_pmu_unregister(struct pmu *pmu)
+static void pmu_detach_event(struct pmu *pmu, struct perf_event *event)
{
- mutex_lock(&pmus_lock);
- list_del_rcu(&pmu->entry);
- mutex_unlock(&pmus_lock);
+ struct perf_event_context *ctx;
+
+ ctx = perf_event_ctx_lock(event);
+ __pmu_detach_event(pmu, event, ctx);
+ perf_event_ctx_unlock(event, ctx);
+
+ scoped_guard (spinlock, &pmu->events_lock)
+ list_del(&event->pmu_list);
+}
+
+static struct perf_event *pmu_get_event(struct pmu *pmu)
+{
+ struct perf_event *event;
+
+ guard(spinlock)(&pmu->events_lock);
+ list_for_each_entry(event, &pmu->events, pmu_list) {
+ if (atomic_long_inc_not_zero(&event->refcount))
+ return event;
+ }
+
+ return NULL;
+}
+
+static bool pmu_empty(struct pmu *pmu)
+{
+ guard(spinlock)(&pmu->events_lock);
+ return list_empty(&pmu->events);
+}
+
+static void pmu_detach_events(struct pmu *pmu)
+{
+ struct perf_event *event;
+
+ for (;;) {
+ event = pmu_get_event(pmu);
+ if (!event)
+ break;
+
+ pmu_detach_event(pmu, event);
+ put_event(event);
+ }
+
+ /*
+ * wait for pending _free_event()s
+ */
+ wait_var_event(pmu, pmu_empty(pmu));
+}
+
+int perf_pmu_unregister(struct pmu *pmu)
+{
+ scoped_guard (mutex, &pmus_lock) {
+ if (!idr_cmpxchg(&pmu_idr, pmu->type, pmu, NULL))
+ return -EINVAL;
+
+ list_del_rcu(&pmu->entry);
+ }
/*
* We dereference the pmu list under both SRCU and regular RCU, so
* synchronize against both of those.
+ *
+ * Notably, the entirety of event creation, from perf_init_event()
+ * (which will now fail, because of the above) until
+ * perf_install_in_context() should be under SRCU such that
+ * this synchronizes against event creation. This avoids trying to
+ * detach events that are not fully formed.
*/
synchronize_srcu(&pmus_srcu);
synchronize_rcu();
- free_percpu(pmu->pmu_disable_count);
- if (pmu->type >= PERF_TYPE_MAX)
+ if (pmu->event_unmapped && !pmu_empty(pmu)) {
+ /*
+ * Can't force remove events when pmu::event_unmapped()
+ * is used in perf_mmap_close().
+ */
+ guard(mutex)(&pmus_lock);
+ idr_cmpxchg(&pmu_idr, pmu->type, NULL, pmu);
+ list_add_rcu(&pmu->entry, &pmus);
+ return -EBUSY;
+ }
+
+ scoped_guard (mutex, &pmus_lock)
idr_remove(&pmu_idr, pmu->type);
- device_del(pmu->dev);
- put_device(pmu->dev);
- free_pmu_context(pmu);
+
+ /*
+ * PMU is removed from the pmus list, so no new events will
+ * be created, now take care of the existing ones.
+ */
+ pmu_detach_events(pmu);
+
+ /*
+ * PMU is unused, make it go away.
+ */
+ perf_pmu_free(pmu);
+ return 0;
}
EXPORT_SYMBOL_GPL(perf_pmu_unregister);
+static inline bool has_extended_regs(struct perf_event *event)
+{
+ return (event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK) ||
+ (event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK);
+}
+
static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
{
struct perf_event_context *ctx = NULL;
@@ -7384,7 +12666,13 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
if (!try_module_get(pmu->module))
return -ENODEV;
- if (event->group_leader != event) {
+ /*
+ * A number of pmu->event_init() methods iterate the sibling_list to,
+ * for example, validate if the group fits on the PMU. Therefore,
+ * if this is a sibling event, acquire the ctx->mutex to protect
+ * the sibling_list.
+ */
+ if (event->group_leader != event && pmu->task_ctx_nr != perf_sw_context) {
/*
* This ctx->mutex can nest when we're called through
* inheritance. See the perf_event_ctx_lock_nested() comment.
@@ -7401,82 +12689,236 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
perf_event_ctx_unlock(event->group_leader, ctx);
if (ret)
- module_put(pmu->module);
+ goto err_pmu;
+
+ if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) &&
+ has_extended_regs(event)) {
+ ret = -EOPNOTSUPP;
+ goto err_destroy;
+ }
+
+ if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE &&
+ event_has_any_exclude_flag(event)) {
+ ret = -EINVAL;
+ goto err_destroy;
+ }
+ if (pmu->scope != PERF_PMU_SCOPE_NONE && event->cpu >= 0) {
+ const struct cpumask *cpumask;
+ struct cpumask *pmu_cpumask;
+ int cpu;
+
+ cpumask = perf_scope_cpu_topology_cpumask(pmu->scope, event->cpu);
+ pmu_cpumask = perf_scope_cpumask(pmu->scope);
+
+ ret = -ENODEV;
+ if (!pmu_cpumask || !cpumask)
+ goto err_destroy;
+
+ cpu = cpumask_any_and(pmu_cpumask, cpumask);
+ if (cpu >= nr_cpu_ids)
+ goto err_destroy;
+
+ event->event_caps |= PERF_EV_CAP_READ_SCOPE;
+ }
+
+ return 0;
+
+err_destroy:
+ if (event->destroy) {
+ event->destroy(event);
+ event->destroy = NULL;
+ }
+
+err_pmu:
+ event->pmu = NULL;
+ module_put(pmu->module);
return ret;
}
-struct pmu *perf_init_event(struct perf_event *event)
+static struct pmu *perf_init_event(struct perf_event *event)
{
- struct pmu *pmu = NULL;
- int idx;
- int ret;
+ bool extended_type = false;
+ struct pmu *pmu;
+ int type, ret;
- idx = srcu_read_lock(&pmus_srcu);
+ guard(srcu)(&pmus_srcu); /* pmu idr/list access */
- rcu_read_lock();
- pmu = idr_find(&pmu_idr, event->attr.type);
- rcu_read_unlock();
+ /*
+ * Save original type before calling pmu->event_init() since certain
+ * pmus overwrites event->attr.type to forward event to another pmu.
+ */
+ event->orig_type = event->attr.type;
+
+ /* Try parent's PMU first: */
+ if (event->parent && event->parent->pmu) {
+ pmu = event->parent->pmu;
+ ret = perf_try_init_event(pmu, event);
+ if (!ret)
+ return pmu;
+ }
+
+ /*
+ * PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE
+ * are often aliases for PERF_TYPE_RAW.
+ */
+ type = event->attr.type;
+ if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE) {
+ type = event->attr.config >> PERF_PMU_TYPE_SHIFT;
+ if (!type) {
+ type = PERF_TYPE_RAW;
+ } else {
+ extended_type = true;
+ event->attr.config &= PERF_HW_EVENT_MASK;
+ }
+ }
+
+again:
+ scoped_guard (rcu)
+ pmu = idr_find(&pmu_idr, type);
if (pmu) {
+ if (event->attr.type != type && type != PERF_TYPE_RAW &&
+ !(pmu->capabilities & PERF_PMU_CAP_EXTENDED_HW_TYPE))
+ return ERR_PTR(-ENOENT);
+
ret = perf_try_init_event(pmu, event);
+ if (ret == -ENOENT && event->attr.type != type && !extended_type) {
+ type = event->attr.type;
+ goto again;
+ }
+
if (ret)
- pmu = ERR_PTR(ret);
- goto unlock;
+ return ERR_PTR(ret);
+
+ return pmu;
}
- list_for_each_entry_rcu(pmu, &pmus, entry) {
+ list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
ret = perf_try_init_event(pmu, event);
if (!ret)
- goto unlock;
+ return pmu;
- if (ret != -ENOENT) {
- pmu = ERR_PTR(ret);
- goto unlock;
- }
+ if (ret != -ENOENT)
+ return ERR_PTR(ret);
}
- pmu = ERR_PTR(-ENOENT);
-unlock:
- srcu_read_unlock(&pmus_srcu, idx);
- return pmu;
+ return ERR_PTR(-ENOENT);
}
-static void account_event_cpu(struct perf_event *event, int cpu)
+static void attach_sb_event(struct perf_event *event)
{
- if (event->parent)
- return;
+ struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
- if (is_cgroup_event(event))
- atomic_inc(&per_cpu(perf_cgroup_events, cpu));
+ raw_spin_lock(&pel->lock);
+ list_add_rcu(&event->sb_list, &pel->list);
+ raw_spin_unlock(&pel->lock);
+}
+
+/*
+ * We keep a list of all !task (and therefore per-cpu) events
+ * that need to receive side-band records.
+ *
+ * This avoids having to scan all the various PMU per-cpu contexts
+ * looking for them.
+ */
+static void account_pmu_sb_event(struct perf_event *event)
+{
+ if (is_sb_event(event))
+ attach_sb_event(event);
+}
+
+/* Freq events need the tick to stay alive (see perf_event_task_tick). */
+static void account_freq_event_nohz(void)
+{
+#ifdef CONFIG_NO_HZ_FULL
+ /* Lock so we don't race with concurrent unaccount */
+ spin_lock(&nr_freq_lock);
+ if (atomic_inc_return(&nr_freq_events) == 1)
+ tick_nohz_dep_set(TICK_DEP_BIT_PERF_EVENTS);
+ spin_unlock(&nr_freq_lock);
+#endif
+}
+
+static void account_freq_event(void)
+{
+ if (tick_nohz_full_enabled())
+ account_freq_event_nohz();
+ else
+ atomic_inc(&nr_freq_events);
}
+
static void account_event(struct perf_event *event)
{
+ bool inc = false;
+
if (event->parent)
return;
- if (event->attach_state & PERF_ATTACH_TASK)
- static_key_slow_inc(&perf_sched_events.key);
+ if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
+ inc = true;
if (event->attr.mmap || event->attr.mmap_data)
atomic_inc(&nr_mmap_events);
+ if (event->attr.build_id)
+ atomic_inc(&nr_build_id_events);
if (event->attr.comm)
atomic_inc(&nr_comm_events);
+ if (event->attr.namespaces)
+ atomic_inc(&nr_namespaces_events);
+ if (event->attr.cgroup)
+ atomic_inc(&nr_cgroup_events);
if (event->attr.task)
atomic_inc(&nr_task_events);
- if (event->attr.freq) {
- if (atomic_inc_return(&nr_freq_events) == 1)
- tick_nohz_full_kick_all();
+ if (event->attr.freq)
+ account_freq_event();
+ if (event->attr.context_switch) {
+ atomic_inc(&nr_switch_events);
+ inc = true;
}
if (has_branch_stack(event))
- static_key_slow_inc(&perf_sched_events.key);
+ inc = true;
if (is_cgroup_event(event))
- static_key_slow_inc(&perf_sched_events.key);
+ inc = true;
+ if (event->attr.ksymbol)
+ atomic_inc(&nr_ksymbol_events);
+ if (event->attr.bpf_event)
+ atomic_inc(&nr_bpf_events);
+ if (event->attr.text_poke)
+ atomic_inc(&nr_text_poke_events);
+
+ if (inc) {
+ /*
+ * We need the mutex here because static_branch_enable()
+ * must complete *before* the perf_sched_count increment
+ * becomes visible.
+ */
+ if (atomic_inc_not_zero(&perf_sched_count))
+ goto enabled;
- account_event_cpu(event, event->cpu);
+ mutex_lock(&perf_sched_mutex);
+ if (!atomic_read(&perf_sched_count)) {
+ static_branch_enable(&perf_sched_events);
+ /*
+ * Guarantee that all CPUs observe they key change and
+ * call the perf scheduling hooks before proceeding to
+ * install events that need them.
+ */
+ synchronize_rcu();
+ }
+ /*
+ * Now that we have waited for the sync_sched(), allow further
+ * increments to by-pass the mutex.
+ */
+ atomic_inc(&perf_sched_count);
+ mutex_unlock(&perf_sched_mutex);
+ }
+enabled:
+
+ account_pmu_sb_event(event);
}
/*
- * Allocate and initialize a event structure
+ * Allocate and initialize an event structure
*/
static struct perf_event *
perf_event_alloc(struct perf_event_attr *attr, int cpu,
@@ -7487,16 +12929,22 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
void *context, int cgroup_fd)
{
struct pmu *pmu;
- struct perf_event *event;
struct hw_perf_event *hwc;
long err = -EINVAL;
+ int node;
if ((unsigned)cpu >= nr_cpu_ids) {
if (!task || cpu != -1)
return ERR_PTR(-EINVAL);
}
+ if (attr->sigtrap && !task) {
+ /* Requires a task: avoid signalling random tasks. */
+ return ERR_PTR(-EINVAL);
+ }
- event = kzalloc(sizeof(*event), GFP_KERNEL);
+ node = (cpu >= 0) ? cpu_to_node(cpu) : -1;
+ struct perf_event *event __free(__free_event) =
+ kmem_cache_alloc_node(perf_event_cache, GFP_KERNEL | __GFP_ZERO, node);
if (!event)
return ERR_PTR(-ENOMEM);
@@ -7510,18 +12958,24 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
mutex_init(&event->child_mutex);
INIT_LIST_HEAD(&event->child_list);
- INIT_LIST_HEAD(&event->group_entry);
INIT_LIST_HEAD(&event->event_entry);
INIT_LIST_HEAD(&event->sibling_list);
+ INIT_LIST_HEAD(&event->active_list);
+ init_event_group(event);
INIT_LIST_HEAD(&event->rb_entry);
INIT_LIST_HEAD(&event->active_entry);
+ INIT_LIST_HEAD(&event->addr_filters.list);
INIT_HLIST_NODE(&event->hlist_entry);
+ INIT_LIST_HEAD(&event->pmu_list);
init_waitqueue_head(&event->waitq);
- init_irq_work(&event->pending, perf_pending_event);
+ init_irq_work(&event->pending_irq, perf_pending_irq);
+ event->pending_disable_irq = IRQ_WORK_INIT_HARD(perf_pending_disable);
+ init_task_work(&event->pending_task, perf_pending_task);
mutex_init(&event->mmap_mutex);
+ raw_spin_lock_init(&event->addr_filters.lock);
atomic_long_set(&event->refcount, 1);
event->cpu = cpu;
@@ -7537,6 +12991,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
event->state = PERF_EVENT_STATE_INACTIVE;
+ if (parent_event)
+ event->event_caps = parent_event->event_caps;
+
if (task) {
event->attach_state = PERF_ATTACH_TASK;
/*
@@ -7544,7 +13001,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
* and we cannot use the ctx information because we need the
* pmu before we get a ctx.
*/
- event->hw.target = task;
+ event->hw.target = get_task_struct(task);
}
event->clock = &local_clock;
@@ -7554,10 +13011,26 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
if (!overflow_handler && parent_event) {
overflow_handler = parent_event->overflow_handler;
context = parent_event->overflow_handler_context;
+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING)
+ if (parent_event->prog) {
+ struct bpf_prog *prog = parent_event->prog;
+
+ bpf_prog_inc(prog);
+ event->prog = prog;
+ }
+#endif
}
- event->overflow_handler = overflow_handler;
- event->overflow_handler_context = context;
+ if (overflow_handler) {
+ event->overflow_handler = overflow_handler;
+ event->overflow_handler_context = context;
+ } else if (is_write_backward(event)){
+ event->overflow_handler = perf_event_output_backward;
+ event->overflow_handler_context = NULL;
+ } else {
+ event->overflow_handler = perf_event_output_forward;
+ event->overflow_handler_context = NULL;
+ }
perf_event__state_init(event);
@@ -7565,64 +13038,121 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
hwc = &event->hw;
hwc->sample_period = attr->sample_period;
- if (attr->freq && attr->sample_freq)
+ if (is_event_in_freq_mode(event))
hwc->sample_period = 1;
hwc->last_period = hwc->sample_period;
local64_set(&hwc->period_left, hwc->sample_period);
/*
- * we currently do not support PERF_FORMAT_GROUP on inherited events
+ * We do not support PERF_SAMPLE_READ on inherited events unless
+ * PERF_SAMPLE_TID is also selected, which allows inherited events to
+ * collect per-thread samples.
+ * See perf_output_read().
*/
- if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
- goto err_ns;
+ if (has_inherit_and_sample_read(attr) && !(attr->sample_type & PERF_SAMPLE_TID))
+ return ERR_PTR(-EINVAL);
if (!has_branch_stack(event))
event->attr.branch_sample_type = 0;
- if (cgroup_fd != -1) {
- err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
+ pmu = perf_init_event(event);
+ if (IS_ERR(pmu))
+ return (void*)pmu;
+
+ /*
+ * The PERF_ATTACH_TASK_DATA is set in the event_init()->hw_config().
+ * The attach should be right after the perf_init_event().
+ * Otherwise, the __free_event() would mistakenly detach the non-exist
+ * perf_ctx_data because of the other errors between them.
+ */
+ if (event->attach_state & PERF_ATTACH_TASK_DATA) {
+ err = attach_perf_ctx_data(event);
if (err)
- goto err_ns;
+ return ERR_PTR(err);
}
- pmu = perf_init_event(event);
- if (!pmu)
- goto err_ns;
- else if (IS_ERR(pmu)) {
- err = PTR_ERR(pmu);
- goto err_ns;
+ /*
+ * Disallow uncore-task events. Similarly, disallow uncore-cgroup
+ * events (they don't make sense as the cgroup will be different
+ * on other CPUs in the uncore mask).
+ */
+ if (pmu->task_ctx_nr == perf_invalid_context && (task || cgroup_fd != -1))
+ return ERR_PTR(-EINVAL);
+
+ if (event->attr.aux_output &&
+ (!(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT) ||
+ event->attr.aux_pause || event->attr.aux_resume))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ if (event->attr.aux_pause && event->attr.aux_resume)
+ return ERR_PTR(-EINVAL);
+
+ if (event->attr.aux_start_paused) {
+ if (!(pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE))
+ return ERR_PTR(-EOPNOTSUPP);
+ event->hw.aux_paused = 1;
+ }
+
+ if (cgroup_fd != -1) {
+ err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
+ if (err)
+ return ERR_PTR(err);
}
err = exclusive_event_init(event);
if (err)
- goto err_pmu;
+ return ERR_PTR(err);
+
+ if (has_addr_filter(event)) {
+ event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters,
+ sizeof(struct perf_addr_filter_range),
+ GFP_KERNEL);
+ if (!event->addr_filter_ranges)
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * Clone the parent's vma offsets: they are valid until exec()
+ * even if the mm is not shared with the parent.
+ */
+ if (event->parent) {
+ struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
+
+ raw_spin_lock_irq(&ifh->lock);
+ memcpy(event->addr_filter_ranges,
+ event->parent->addr_filter_ranges,
+ pmu->nr_addr_filters * sizeof(struct perf_addr_filter_range));
+ raw_spin_unlock_irq(&ifh->lock);
+ }
+
+ /* force hw sync on the address filters */
+ event->addr_filters_gen = 1;
+ }
if (!event->parent) {
if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
- err = get_callchain_buffers();
+ err = get_callchain_buffers(attr->sample_max_stack);
if (err)
- goto err_per_task;
+ return ERR_PTR(err);
+ event->attach_state |= PERF_ATTACH_CALLCHAIN;
}
}
- return event;
+ err = security_perf_event_alloc(event);
+ if (err)
+ return ERR_PTR(err);
-err_per_task:
- exclusive_event_destroy(event);
+ /* symmetric to unaccount_event() in _free_event() */
+ account_event(event);
-err_pmu:
- if (event->destroy)
- event->destroy(event);
- module_put(pmu->module);
-err_ns:
- if (is_cgroup_event(event))
- perf_detach_cgroup(event);
- if (event->ns)
- put_pid_ns(event->ns);
- kfree(event);
+ /*
+ * Event creation should be under SRCU, see perf_pmu_unregister().
+ */
+ lockdep_assert_held(&pmus_srcu);
+ scoped_guard (spinlock, &pmu->events_lock)
+ list_add(&event->pmu_list, &pmu->events);
- return ERR_PTR(err);
+ return_ptr(event);
}
static int perf_copy_attr(struct perf_event_attr __user *uattr,
@@ -7631,56 +13161,29 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
u32 size;
int ret;
- if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
- return -EFAULT;
-
- /*
- * zero the full structure, so that a short copy will be nice.
- */
+ /* Zero the full structure, so that a short copy will be nice. */
memset(attr, 0, sizeof(*attr));
ret = get_user(size, &uattr->size);
if (ret)
return ret;
- if (size > PAGE_SIZE) /* silly large */
- goto err_size;
-
- if (!size) /* abi compat */
+ /* ABI compatibility quirk: */
+ if (!size)
size = PERF_ATTR_SIZE_VER0;
-
- if (size < PERF_ATTR_SIZE_VER0)
+ if (size < PERF_ATTR_SIZE_VER0 || size > PAGE_SIZE)
goto err_size;
- /*
- * If we're handed a bigger struct than we know of,
- * ensure all the unknown bits are 0 - i.e. new
- * user-space does not rely on any kernel feature
- * extensions we dont know about yet.
- */
- if (size > sizeof(*attr)) {
- unsigned char __user *addr;
- unsigned char __user *end;
- unsigned char val;
-
- addr = (void __user *)uattr + sizeof(*attr);
- end = (void __user *)uattr + size;
-
- for (; addr < end; addr++) {
- ret = get_user(val, addr);
- if (ret)
- return ret;
- if (val)
- goto err_size;
- }
- size = sizeof(*attr);
+ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
+ if (ret) {
+ if (ret == -E2BIG)
+ goto err_size;
+ return ret;
}
- ret = copy_from_user(attr, uattr, size);
- if (ret)
- return -EFAULT;
+ attr->size = size;
- if (attr->__reserved_1)
+ if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
return -EINVAL;
if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
@@ -7718,9 +13221,11 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
attr->branch_sample_type = mask;
}
/* privileged levels capture (kernel, hv): check permissions */
- if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
- && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
- return -EACCES;
+ if (mask & PERF_SAMPLE_BRANCH_PERM_PLM) {
+ ret = perf_allow_kernel();
+ if (ret)
+ return ret;
+ }
}
if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
@@ -7739,13 +13244,34 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
* __u16 sample size limit.
*/
if (attr->sample_stack_user >= USHRT_MAX)
- ret = -EINVAL;
+ return -EINVAL;
else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
- ret = -EINVAL;
+ return -EINVAL;
}
+ if (!attr->sample_max_stack)
+ attr->sample_max_stack = sysctl_perf_event_max_stack;
+
if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
ret = perf_reg_validate(attr->sample_regs_intr);
+
+#ifndef CONFIG_CGROUP_PERF
+ if (attr->sample_type & PERF_SAMPLE_CGROUP)
+ return -EINVAL;
+#endif
+ if ((attr->sample_type & PERF_SAMPLE_WEIGHT) &&
+ (attr->sample_type & PERF_SAMPLE_WEIGHT_STRUCT))
+ return -EINVAL;
+
+ if (!attr->inherit && attr->inherit_thread)
+ return -EINVAL;
+
+ if (attr->remove_on_exec && attr->enable_on_exec)
+ return -EINVAL;
+
+ if (attr->sigtrap && !attr->remove_on_exec)
+ return -EINVAL;
+
out:
return ret;
@@ -7755,14 +13281,25 @@ err_size:
goto out;
}
+static void mutex_lock_double(struct mutex *a, struct mutex *b)
+{
+ if (b < a)
+ swap(a, b);
+
+ mutex_lock(a);
+ mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
+}
+
static int
perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
{
- struct ring_buffer *rb = NULL;
+ struct perf_buffer *rb = NULL;
int ret = -EINVAL;
- if (!output_event)
+ if (!output_event) {
+ mutex_lock(&event->mmap_mutex);
goto set;
+ }
/* don't allow circular references */
if (event == output_event)
@@ -7777,7 +13314,7 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
/*
* If its not a per-cpu rb, it must be the same task.
*/
- if (output_event->cpu == -1 && output_event->ctx != event->ctx)
+ if (output_event->cpu == -1 && output_event->hw.target != event->hw.target)
goto out;
/*
@@ -7787,23 +13324,46 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
goto out;
/*
+ * Either writing ring buffer from beginning or from end.
+ * Mixing is not allowed.
+ */
+ if (is_write_backward(output_event) != is_write_backward(event))
+ goto out;
+
+ /*
* If both events generate aux data, they must be on the same PMU
*/
if (has_aux(event) && has_aux(output_event) &&
event->pmu != output_event->pmu)
goto out;
+ /*
+ * Hold both mmap_mutex to serialize against perf_mmap_close(). Since
+ * output_event is already on rb->event_list, and the list iteration
+ * restarts after every removal, it is guaranteed this new event is
+ * observed *OR* if output_event is already removed, it's guaranteed we
+ * observe !rb->mmap_count.
+ */
+ mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex);
set:
- mutex_lock(&event->mmap_mutex);
/* Can't redirect output if we've got an active mmap() */
- if (atomic_read(&event->mmap_count))
+ if (refcount_read(&event->mmap_count))
goto unlock;
if (output_event) {
+ if (output_event->state <= PERF_EVENT_STATE_REVOKED)
+ goto unlock;
+
/* get the rb we want to redirect to */
rb = ring_buffer_get(output_event);
if (!rb)
goto unlock;
+
+ /* did we race against perf_mmap_close() */
+ if (!refcount_read(&rb->mmap_count)) {
+ ring_buffer_put(rb);
+ goto unlock;
+ }
}
ring_buffer_attach(event, rb);
@@ -7811,20 +13371,13 @@ set:
ret = 0;
unlock:
mutex_unlock(&event->mmap_mutex);
+ if (output_event)
+ mutex_unlock(&output_event->mmap_mutex);
out:
return ret;
}
-static void mutex_lock_double(struct mutex *a, struct mutex *b)
-{
- if (b < a)
- swap(a, b);
-
- mutex_lock(a);
- mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
-}
-
static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
{
bool nmi_safe = false;
@@ -7845,11 +13398,11 @@ static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
break;
case CLOCK_BOOTTIME:
- event->clock = &ktime_get_boot_ns;
+ event->clock = &ktime_get_boottime_ns;
break;
case CLOCK_TAI:
- event->clock = &ktime_get_tai_ns;
+ event->clock = &ktime_get_clocktai_ns;
break;
default:
@@ -7862,6 +13415,37 @@ static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
return 0;
}
+static bool
+perf_check_permission(struct perf_event_attr *attr, struct task_struct *task)
+{
+ unsigned int ptrace_mode = PTRACE_MODE_READ_REALCREDS;
+ bool is_capable = perfmon_capable();
+
+ if (attr->sigtrap) {
+ /*
+ * perf_event_attr::sigtrap sends signals to the other task.
+ * Require the current task to also have CAP_KILL.
+ */
+ rcu_read_lock();
+ is_capable &= ns_capable(__task_cred(task)->user_ns, CAP_KILL);
+ rcu_read_unlock();
+
+ /*
+ * If the required capabilities aren't available, checks for
+ * ptrace permissions: upgrade to ATTACH, since sending signals
+ * can effectively change the target task.
+ */
+ ptrace_mode = PTRACE_MODE_ATTACH_REALCREDS;
+ }
+
+ /*
+ * Preserve ptrace permission check for backwards compatibility. The
+ * ptrace check also includes checks that the current task and other
+ * task have matching uids, and is therefore not done here explicitly.
+ */
+ return is_capable || ptrace_may_access(task, ptrace_mode);
+}
+
/**
* sys_perf_event_open - open a performance event, associate it to a task/cpu
*
@@ -7869,17 +13453,18 @@ static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
* @pid: target pid
* @cpu: target cpu
* @group_fd: group leader event fd
+ * @flags: perf event open flags
*/
SYSCALL_DEFINE5(perf_event_open,
struct perf_event_attr __user *, attr_uptr,
pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
{
struct perf_event *group_leader = NULL, *output_event = NULL;
+ struct perf_event_pmu_context *pmu_ctx;
struct perf_event *event, *sibling;
struct perf_event_attr attr;
- struct perf_event_context *ctx, *uninitialized_var(gctx);
+ struct perf_event_context *ctx;
struct file *event_file = NULL;
- struct fd group = {NULL, 0};
struct task_struct *task = NULL;
struct pmu *pmu;
int event_fd;
@@ -7896,8 +13481,19 @@ SYSCALL_DEFINE5(perf_event_open,
if (err)
return err;
+ /* Do we allow access to perf_event_open(2) ? */
+ err = security_perf_event_open(PERF_SECURITY_OPEN);
+ if (err)
+ return err;
+
if (!attr.exclude_kernel) {
- if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
+ err = perf_allow_kernel();
+ if (err)
+ return err;
+ }
+
+ if (attr.namespaces) {
+ if (!perfmon_capable())
return -EACCES;
}
@@ -7909,6 +13505,20 @@ SYSCALL_DEFINE5(perf_event_open,
return -EINVAL;
}
+ /* Only privileged users can get physical addresses */
+ if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR)) {
+ err = perf_allow_kernel();
+ if (err)
+ return err;
+ }
+
+ /* REGS_INTR can leak data, lockdown must prevent this */
+ if (attr.sample_type & PERF_SAMPLE_REGS_INTR) {
+ err = security_locked_down(LOCKDOWN_PERF);
+ if (err)
+ return err;
+ }
+
/*
* In cgroup mode, the pid argument is used to pass the fd
* opened to the cgroup directory in cgroupfs. The cpu argument
@@ -7925,11 +13535,22 @@ SYSCALL_DEFINE5(perf_event_open,
if (event_fd < 0)
return event_fd;
+ /*
+ * Event creation should be under SRCU, see perf_pmu_unregister().
+ */
+ guard(srcu)(&pmus_srcu);
+
+ CLASS(fd, group)(group_fd); // group_fd == -1 => empty
if (group_fd != -1) {
- err = perf_fget_light(group_fd, &group);
- if (err)
+ if (!is_perf_file(group)) {
+ err = -EBADF;
+ goto err_fd;
+ }
+ group_leader = fd_file(group)->private_data;
+ if (group_leader->state <= PERF_EVENT_STATE_REVOKED) {
+ err = -ENODEV;
goto err_fd;
- group_leader = group.file->private_data;
+ }
if (flags & PERF_FLAG_FD_OUTPUT)
output_event = group_leader;
if (flags & PERF_FLAG_FD_NO_GROUP)
@@ -7940,7 +13561,7 @@ SYSCALL_DEFINE5(perf_event_open,
task = find_lively_task_by_vpid(pid);
if (IS_ERR(task)) {
err = PTR_ERR(task);
- goto err_group_fd;
+ goto err_fd;
}
}
@@ -7950,8 +13571,6 @@ SYSCALL_DEFINE5(perf_event_open,
goto err_task;
}
- get_online_cpus();
-
if (flags & PERF_FLAG_PID_CGROUP)
cgroup_fd = pid;
@@ -7959,18 +13578,16 @@ SYSCALL_DEFINE5(perf_event_open,
NULL, NULL, cgroup_fd);
if (IS_ERR(event)) {
err = PTR_ERR(event);
- goto err_cpus;
+ goto err_task;
}
if (is_sampling_event(event)) {
if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
- err = -ENOTSUPP;
+ err = -EOPNOTSUPP;
goto err_alloc;
}
}
- account_event(event);
-
/*
* Special case software events and allow them to be part of
* any hardware group.
@@ -7983,51 +13600,56 @@ SYSCALL_DEFINE5(perf_event_open,
goto err_alloc;
}
- if (group_leader &&
- (is_software_event(event) != is_software_event(group_leader))) {
- if (is_software_event(event)) {
- /*
- * If event and group_leader are not both a software
- * event, and event is, then group leader is not.
- *
- * Allow the addition of software events to !software
- * groups, this is safe because software events never
- * fail to schedule.
- */
- pmu = group_leader->pmu;
- } else if (is_software_event(group_leader) &&
- (group_leader->group_flags & PERF_GROUP_SOFTWARE)) {
- /*
- * In case the group is a pure software group, and we
- * try to add a hardware event, move the whole group to
- * the hardware context.
- */
- move_group = 1;
- }
+ if (pmu->task_ctx_nr == perf_sw_context)
+ event->event_caps |= PERF_EV_CAP_SOFTWARE;
+
+ if (task) {
+ err = down_read_interruptible(&task->signal->exec_update_lock);
+ if (err)
+ goto err_alloc;
+
+ /*
+ * We must hold exec_update_lock across this and any potential
+ * perf_install_in_context() call for this new event to
+ * serialize against exec() altering our credentials (and the
+ * perf_event_exit_task() that could imply).
+ */
+ err = -EACCES;
+ if (!perf_check_permission(&attr, task))
+ goto err_cred;
}
/*
* Get the target context (task or percpu):
*/
- ctx = find_get_context(pmu, task, event);
+ ctx = find_get_context(task, event);
if (IS_ERR(ctx)) {
err = PTR_ERR(ctx);
- goto err_alloc;
+ goto err_cred;
}
- if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) {
- err = -EBUSY;
- goto err_context;
+ mutex_lock(&ctx->mutex);
+
+ if (ctx->task == TASK_TOMBSTONE) {
+ err = -ESRCH;
+ goto err_locked;
}
- if (task) {
- put_task_struct(task);
- task = NULL;
+ if (!task) {
+ /*
+ * Check if the @cpu we're creating an event for is online.
+ *
+ * We use the perf_cpu_context::ctx::mutex to serialize against
+ * the hotplug notifiers. See perf_event_{init,exit}_cpu().
+ */
+ struct perf_cpu_context *cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu);
+
+ if (!cpuctx->online) {
+ err = -ENODEV;
+ goto err_locked;
+ }
}
- /*
- * Look up the group leader (we will attach this event to it):
- */
if (group_leader) {
err = -EINVAL;
@@ -8036,42 +13658,73 @@ SYSCALL_DEFINE5(perf_event_open,
* becoming part of another group-sibling):
*/
if (group_leader->group_leader != group_leader)
- goto err_context;
+ goto err_locked;
/* All events in a group should have the same clock */
if (group_leader->clock != event->clock)
- goto err_context;
+ goto err_locked;
/*
- * Do not allow to attach to a group in a different
- * task or CPU context:
+ * Make sure we're both events for the same CPU;
+ * grouping events for different CPUs is broken; since
+ * you can never concurrently schedule them anyhow.
*/
- if (move_group) {
- /*
- * Make sure we're both on the same task, or both
- * per-cpu events.
- */
- if (group_leader->ctx->task != ctx->task)
- goto err_context;
+ if (group_leader->cpu != event->cpu)
+ goto err_locked;
- /*
- * Make sure we're both events for the same CPU;
- * grouping events for different CPUs is broken; since
- * you can never concurrently schedule them anyhow.
- */
- if (group_leader->cpu != event->cpu)
- goto err_context;
- } else {
- if (group_leader->ctx != ctx)
- goto err_context;
- }
+ /*
+ * Make sure we're both on the same context; either task or cpu.
+ */
+ if (group_leader->ctx != ctx)
+ goto err_locked;
/*
* Only a group leader can be exclusive or pinned
*/
if (attr.exclusive || attr.pinned)
- goto err_context;
+ goto err_locked;
+
+ if (is_software_event(event) &&
+ !in_software_context(group_leader)) {
+ /*
+ * If the event is a sw event, but the group_leader
+ * is on hw context.
+ *
+ * Allow the addition of software events to hw
+ * groups, this is safe because software events
+ * never fail to schedule.
+ *
+ * Note the comment that goes with struct
+ * perf_event_pmu_context.
+ */
+ pmu = group_leader->pmu_ctx->pmu;
+ } else if (!is_software_event(event)) {
+ if (is_software_event(group_leader) &&
+ (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
+ /*
+ * In case the group is a pure software group, and we
+ * try to add a hardware event, move the whole group to
+ * the hardware context.
+ */
+ move_group = 1;
+ }
+
+ /* Don't allow group of multiple hw events from different pmus */
+ if (!in_software_context(group_leader) &&
+ group_leader->pmu_ctx->pmu != pmu)
+ goto err_locked;
+ }
+ }
+
+ /*
+ * Now that we're certain of the pmu; find the pmu_ctx.
+ */
+ pmu_ctx = find_get_pmu_context(pmu, ctx, event);
+ if (IS_ERR(pmu_ctx)) {
+ err = PTR_ERR(pmu_ctx);
+ goto err_locked;
}
+ event->pmu_ctx = pmu_ctx;
if (output_event) {
err = perf_event_set_output(event, output_event);
@@ -8079,41 +13732,47 @@ SYSCALL_DEFINE5(perf_event_open,
goto err_context;
}
- event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
- f_flags);
- if (IS_ERR(event_file)) {
- err = PTR_ERR(event_file);
+ if (!perf_event_validate_size(event)) {
+ err = -E2BIG;
goto err_context;
}
- if (move_group) {
- gctx = group_leader->ctx;
+ if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) {
+ err = -EINVAL;
+ goto err_context;
+ }
- /*
- * See perf_event_ctx_lock() for comments on the details
- * of swizzling perf_event::ctx.
- */
- mutex_lock_double(&gctx->mutex, &ctx->mutex);
+ /*
+ * Must be under the same ctx::mutex as perf_install_in_context(),
+ * because we need to serialize with concurrent event creation.
+ */
+ if (!exclusive_event_installable(event, ctx)) {
+ err = -EBUSY;
+ goto err_context;
+ }
- perf_remove_from_context(group_leader, false);
+ WARN_ON_ONCE(ctx->parent_ctx);
- list_for_each_entry(sibling, &group_leader->sibling_list,
- group_entry) {
- perf_remove_from_context(sibling, false);
- put_ctx(gctx);
- }
- } else {
- mutex_lock(&ctx->mutex);
+ event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, f_flags);
+ if (IS_ERR(event_file)) {
+ err = PTR_ERR(event_file);
+ event_file = NULL;
+ goto err_context;
}
- WARN_ON_ONCE(ctx->parent_ctx);
+ /*
+ * This is the point on no return; we cannot fail hereafter. This is
+ * where we start modifying current state.
+ */
if (move_group) {
- /*
- * Wait for everybody to stop referencing the events through
- * the old lists, before installing it on new lists.
- */
- synchronize_rcu();
+ perf_remove_from_context(group_leader, 0);
+ put_pmu_ctx(group_leader->pmu_ctx);
+
+ for_each_sibling_event(sibling, group_leader) {
+ perf_remove_from_context(sibling, 0);
+ put_pmu_ctx(sibling->pmu_ctx);
+ }
/*
* Install the group siblings before the group leader.
@@ -8125,11 +13784,11 @@ SYSCALL_DEFINE5(perf_event_open,
* By installing siblings first we NO-OP because they're not
* reachable through the group lists.
*/
- list_for_each_entry(sibling, &group_leader->sibling_list,
- group_entry) {
+ for_each_sibling_event(sibling, group_leader) {
+ sibling->pmu_ctx = pmu_ctx;
+ get_pmu_ctx(pmu_ctx);
perf_event__state_init(sibling);
perf_install_in_context(ctx, sibling, sibling->cpu);
- get_ctx(ctx);
}
/*
@@ -8137,63 +13796,61 @@ SYSCALL_DEFINE5(perf_event_open,
* event. What we want here is event in the initial
* startup state, ready to be add into new context.
*/
+ group_leader->pmu_ctx = pmu_ctx;
+ get_pmu_ctx(pmu_ctx);
perf_event__state_init(group_leader);
perf_install_in_context(ctx, group_leader, group_leader->cpu);
- get_ctx(ctx);
}
- if (!exclusive_event_installable(event, ctx)) {
- err = -EBUSY;
- mutex_unlock(&ctx->mutex);
- fput(event_file);
- goto err_context;
- }
+ /*
+ * Precalculate sample_data sizes; do while holding ctx::mutex such
+ * that we're serialized against further additions and before
+ * perf_install_in_context() which is the point the event is active and
+ * can use these values.
+ */
+ perf_event__header_size(event);
+ perf_event__id_header_size(event);
+
+ event->owner = current;
perf_install_in_context(ctx, event, event->cpu);
perf_unpin_context(ctx);
- if (move_group) {
- mutex_unlock(&gctx->mutex);
- put_ctx(gctx);
- }
mutex_unlock(&ctx->mutex);
- put_online_cpus();
-
- event->owner = current;
+ if (task) {
+ up_read(&task->signal->exec_update_lock);
+ put_task_struct(task);
+ }
mutex_lock(&current->perf_event_mutex);
list_add_tail(&event->owner_entry, &current->perf_event_list);
mutex_unlock(&current->perf_event_mutex);
/*
- * Precalculate sample_data sizes
- */
- perf_event__header_size(event);
- perf_event__id_header_size(event);
-
- /*
- * Drop the reference on the group_event after placing the
- * new event on the sibling_list. This ensures destruction
- * of the group leader will find the pointer to itself in
- * perf_group_detach().
+ * File reference in group guarantees that group_leader has been
+ * kept alive until we place the new event on the sibling_list.
+ * This ensures destruction of the group leader will find
+ * the pointer to itself in perf_group_detach().
*/
- fdput(group);
fd_install(event_fd, event_file);
return event_fd;
err_context:
+ put_pmu_ctx(event->pmu_ctx);
+ event->pmu_ctx = NULL; /* _free_event() */
+err_locked:
+ mutex_unlock(&ctx->mutex);
perf_unpin_context(ctx);
put_ctx(ctx);
+err_cred:
+ if (task)
+ up_read(&task->signal->exec_update_lock);
err_alloc:
- free_event(event);
-err_cpus:
- put_online_cpus();
+ put_event(event);
err_task:
if (task)
put_task_struct(task);
-err_group_fd:
- fdput(group);
err_fd:
put_unused_fd(event_fd);
return err;
@@ -8205,6 +13862,8 @@ err_fd:
* @attr: attributes of the counter to create
* @cpu: cpu in which the counter is bound
* @task: task to profile (NULL for percpu)
+ * @overflow_handler: callback to trigger when we hit the event
+ * @context: context data could be used in overflow_handler callback
*/
struct perf_event *
perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
@@ -8212,13 +13871,23 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
perf_overflow_handler_t overflow_handler,
void *context)
{
+ struct perf_event_pmu_context *pmu_ctx;
struct perf_event_context *ctx;
struct perf_event *event;
+ struct pmu *pmu;
int err;
/*
- * Get the target context (task or percpu):
+ * Grouping is not supported for kernel events, neither is 'AUX',
+ * make sure the caller's intentions are adjusted.
*/
+ if (attr->aux_output || attr->aux_action)
+ return ERR_PTR(-EINVAL);
+
+ /*
+ * Event creation should be under SRCU, see perf_pmu_unregister().
+ */
+ guard(srcu)(&pmus_srcu);
event = perf_event_alloc(attr, cpu, task, NULL, NULL,
overflow_handler, context, -1);
@@ -8228,66 +13897,122 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
}
/* Mark owner so we could distinguish it from user events. */
- event->owner = EVENT_OWNER_KERNEL;
+ event->owner = TASK_TOMBSTONE;
+ pmu = event->pmu;
- account_event(event);
+ if (pmu->task_ctx_nr == perf_sw_context)
+ event->event_caps |= PERF_EV_CAP_SOFTWARE;
- ctx = find_get_context(event->pmu, task, event);
+ /*
+ * Get the target context (task or percpu):
+ */
+ ctx = find_get_context(task, event);
if (IS_ERR(ctx)) {
err = PTR_ERR(ctx);
- goto err_free;
+ goto err_alloc;
}
WARN_ON_ONCE(ctx->parent_ctx);
mutex_lock(&ctx->mutex);
+ if (ctx->task == TASK_TOMBSTONE) {
+ err = -ESRCH;
+ goto err_unlock;
+ }
+
+ pmu_ctx = find_get_pmu_context(pmu, ctx, event);
+ if (IS_ERR(pmu_ctx)) {
+ err = PTR_ERR(pmu_ctx);
+ goto err_unlock;
+ }
+ event->pmu_ctx = pmu_ctx;
+
+ if (!task) {
+ /*
+ * Check if the @cpu we're creating an event for is online.
+ *
+ * We use the perf_cpu_context::ctx::mutex to serialize against
+ * the hotplug notifiers. See perf_event_{init,exit}_cpu().
+ */
+ struct perf_cpu_context *cpuctx =
+ container_of(ctx, struct perf_cpu_context, ctx);
+ if (!cpuctx->online) {
+ err = -ENODEV;
+ goto err_pmu_ctx;
+ }
+ }
+
if (!exclusive_event_installable(event, ctx)) {
- mutex_unlock(&ctx->mutex);
- perf_unpin_context(ctx);
- put_ctx(ctx);
err = -EBUSY;
- goto err_free;
+ goto err_pmu_ctx;
}
- perf_install_in_context(ctx, event, cpu);
+ perf_install_in_context(ctx, event, event->cpu);
perf_unpin_context(ctx);
mutex_unlock(&ctx->mutex);
return event;
-err_free:
- free_event(event);
+err_pmu_ctx:
+ put_pmu_ctx(pmu_ctx);
+ event->pmu_ctx = NULL; /* _free_event() */
+err_unlock:
+ mutex_unlock(&ctx->mutex);
+ perf_unpin_context(ctx);
+ put_ctx(ctx);
+err_alloc:
+ put_event(event);
err:
return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
-void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
+static void __perf_pmu_remove(struct perf_event_context *ctx,
+ int cpu, struct pmu *pmu,
+ struct perf_event_groups *groups,
+ struct list_head *events)
{
- struct perf_event_context *src_ctx;
- struct perf_event_context *dst_ctx;
- struct perf_event *event, *tmp;
- LIST_HEAD(events);
+ struct perf_event *event, *sibling;
- src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
- dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
+ perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) {
+ perf_remove_from_context(event, 0);
+ put_pmu_ctx(event->pmu_ctx);
+ list_add(&event->migrate_entry, events);
- /*
- * See perf_event_ctx_lock() for comments on the details
- * of swizzling perf_event::ctx.
- */
- mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
- list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
- event_entry) {
- perf_remove_from_context(event, false);
- unaccount_event_cpu(event, src_cpu);
- put_ctx(src_ctx);
- list_add(&event->migrate_entry, &events);
+ for_each_sibling_event(sibling, event) {
+ perf_remove_from_context(sibling, 0);
+ put_pmu_ctx(sibling->pmu_ctx);
+ list_add(&sibling->migrate_entry, events);
+ }
}
+}
+
+static void __perf_pmu_install_event(struct pmu *pmu,
+ struct perf_event_context *ctx,
+ int cpu, struct perf_event *event)
+{
+ struct perf_event_pmu_context *epc;
+ struct perf_event_context *old_ctx = event->ctx;
+
+ get_ctx(ctx); /* normally find_get_context() */
+
+ event->cpu = cpu;
+ epc = find_get_pmu_context(pmu, ctx, event);
+ event->pmu_ctx = epc;
+
+ if (event->state >= PERF_EVENT_STATE_OFF)
+ event->state = PERF_EVENT_STATE_INACTIVE;
+ perf_install_in_context(ctx, event, cpu);
/*
- * Wait for the events to quiesce before re-instating them.
+ * Now that event->ctx is updated and visible, put the old ctx.
*/
- synchronize_rcu();
+ put_ctx(old_ctx);
+}
+
+static void __perf_pmu_install(struct perf_event_context *ctx,
+ int cpu, struct pmu *pmu, struct list_head *events)
+{
+ struct perf_event *event, *tmp;
/*
* Re-instate events in 2 passes.
@@ -8297,45 +14022,72 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
* leader will enable its siblings, even if those are still on the old
* context.
*/
- list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
+ list_for_each_entry_safe(event, tmp, events, migrate_entry) {
if (event->group_leader == event)
continue;
list_del(&event->migrate_entry);
- if (event->state >= PERF_EVENT_STATE_OFF)
- event->state = PERF_EVENT_STATE_INACTIVE;
- account_event_cpu(event, dst_cpu);
- perf_install_in_context(dst_ctx, event, dst_cpu);
- get_ctx(dst_ctx);
+ __perf_pmu_install_event(pmu, ctx, cpu, event);
}
/*
* Once all the siblings are setup properly, install the group leaders
* to make it go.
*/
- list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
+ list_for_each_entry_safe(event, tmp, events, migrate_entry) {
list_del(&event->migrate_entry);
- if (event->state >= PERF_EVENT_STATE_OFF)
- event->state = PERF_EVENT_STATE_INACTIVE;
- account_event_cpu(event, dst_cpu);
- perf_install_in_context(dst_ctx, event, dst_cpu);
- get_ctx(dst_ctx);
+ __perf_pmu_install_event(pmu, ctx, cpu, event);
+ }
+}
+
+void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
+{
+ struct perf_event_context *src_ctx, *dst_ctx;
+ LIST_HEAD(events);
+
+ /*
+ * Since per-cpu context is persistent, no need to grab an extra
+ * reference.
+ */
+ src_ctx = &per_cpu_ptr(&perf_cpu_context, src_cpu)->ctx;
+ dst_ctx = &per_cpu_ptr(&perf_cpu_context, dst_cpu)->ctx;
+
+ /*
+ * See perf_event_ctx_lock() for comments on the details
+ * of swizzling perf_event::ctx.
+ */
+ mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
+
+ __perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->pinned_groups, &events);
+ __perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->flexible_groups, &events);
+
+ if (!list_empty(&events)) {
+ /*
+ * Wait for the events to quiesce before re-instating them.
+ */
+ synchronize_rcu();
+
+ __perf_pmu_install(dst_ctx, dst_cpu, pmu, &events);
}
+
mutex_unlock(&dst_ctx->mutex);
mutex_unlock(&src_ctx->mutex);
}
EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
-static void sync_child_event(struct perf_event *child_event,
- struct task_struct *child)
+static void sync_child_event(struct perf_event *child_event)
{
struct perf_event *parent_event = child_event->parent;
u64 child_val;
- if (child_event->attr.inherit_stat)
- perf_event_read_event(child_event, child);
+ if (child_event->attr.inherit_stat) {
+ struct task_struct *task = child_event->ctx->task;
- child_val = perf_event_count(child_event);
+ if (task && task != TASK_TOMBSTONE)
+ perf_event_read_event(child_event, task);
+ }
+
+ child_val = perf_event_count(child_event, false);
/*
* Add back the child's count to the parent's count:
@@ -8345,98 +14097,111 @@ static void sync_child_event(struct perf_event *child_event,
&parent_event->child_total_time_enabled);
atomic64_add(child_event->total_time_running,
&parent_event->child_total_time_running);
-
- /*
- * Remove this event from the parent's list
- */
- WARN_ON_ONCE(parent_event->ctx->parent_ctx);
- mutex_lock(&parent_event->child_mutex);
- list_del_init(&child_event->child_list);
- mutex_unlock(&parent_event->child_mutex);
-
- /*
- * Make sure user/parent get notified, that we just
- * lost one event.
- */
- perf_event_wakeup(parent_event);
-
- /*
- * Release the parent event, if this was the last
- * reference to it.
- */
- put_event(parent_event);
}
static void
-__perf_event_exit_task(struct perf_event *child_event,
- struct perf_event_context *child_ctx,
- struct task_struct *child)
+perf_event_exit_event(struct perf_event *event,
+ struct perf_event_context *ctx, bool revoke)
{
+ struct perf_event *parent_event = event->parent;
+ unsigned long detach_flags = DETACH_EXIT;
+ unsigned int attach_state;
+
+ if (parent_event) {
+ /*
+ * Do not destroy the 'original' grouping; because of the
+ * context switch optimization the original events could've
+ * ended up in a random child task.
+ *
+ * If we were to destroy the original group, all group related
+ * operations would cease to function properly after this
+ * random child dies.
+ *
+ * Do destroy all inherited groups, we don't care about those
+ * and being thorough is better.
+ */
+ detach_flags |= DETACH_GROUP | DETACH_CHILD;
+ mutex_lock(&parent_event->child_mutex);
+ /* PERF_ATTACH_ITRACE might be set concurrently */
+ attach_state = READ_ONCE(event->attach_state);
+ }
+
+ if (revoke)
+ detach_flags |= DETACH_GROUP | DETACH_REVOKE;
+
+ perf_remove_from_context(event, detach_flags);
/*
- * Do not destroy the 'original' grouping; because of the context
- * switch optimization the original events could've ended up in a
- * random child task.
- *
- * If we were to destroy the original group, all group related
- * operations would cease to function properly after this random
- * child dies.
- *
- * Do destroy all inherited groups, we don't care about those
- * and being thorough is better.
+ * Child events can be freed.
*/
- perf_remove_from_context(child_event, !!child_event->parent);
+ if (parent_event) {
+ mutex_unlock(&parent_event->child_mutex);
+
+ /*
+ * Match the refcount initialization. Make sure it doesn't happen
+ * twice if pmu_detach_event() calls it on an already exited task.
+ */
+ if (attach_state & PERF_ATTACH_CHILD) {
+ /*
+ * Kick perf_poll() for is_event_hup();
+ */
+ perf_event_wakeup(parent_event);
+ /*
+ * pmu_detach_event() will have an extra refcount.
+ * perf_pending_task() might have one too.
+ */
+ put_event(event);
+ }
+
+ return;
+ }
/*
- * It can happen that the parent exits first, and has events
- * that are still around due to the child reference. These
- * events need to be zapped.
+ * Parent events are governed by their filedesc, retain them.
*/
- if (child_event->parent) {
- sync_child_event(child_event, child);
- free_event(child_event);
- } else {
- child_event->state = PERF_EVENT_STATE_EXIT;
- perf_event_wakeup(child_event);
- }
+ perf_event_wakeup(event);
}
-static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
+static void perf_event_exit_task_context(struct task_struct *task, bool exit)
{
+ struct perf_event_context *ctx, *clone_ctx = NULL;
struct perf_event *child_event, *next;
- struct perf_event_context *child_ctx, *clone_ctx = NULL;
- unsigned long flags;
- if (likely(!child->perf_event_ctxp[ctxn])) {
- perf_event_task(child, NULL, 0);
+ ctx = perf_pin_task_context(task);
+ if (!ctx)
return;
- }
- local_irq_save(flags);
/*
- * We can't reschedule here because interrupts are disabled,
- * and either child is current or it is a task that can't be
- * scheduled, so we are now safe from rescheduling changing
- * our context.
+ * In order to reduce the amount of tricky in ctx tear-down, we hold
+ * ctx::mutex over the entire thing. This serializes against almost
+ * everything that wants to access the ctx.
+ *
+ * The exception is sys_perf_event_open() /
+ * perf_event_create_kernel_count() which does find_get_context()
+ * without ctx::mutex (it cannot because of the move_group double mutex
+ * lock thing). See the comments in perf_install_in_context().
*/
- child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]);
+ mutex_lock(&ctx->mutex);
/*
- * Take the context lock here so that if find_get_context is
- * reading child->perf_event_ctxp, we wait until it has
- * incremented the context's refcount before we do put_ctx below.
+ * In a single ctx::lock section, de-schedule the events and detach the
+ * context from the task such that we cannot ever get it scheduled back
+ * in.
*/
- raw_spin_lock(&child_ctx->lock);
- task_ctx_sched_out(child_ctx);
- child->perf_event_ctxp[ctxn] = NULL;
+ raw_spin_lock_irq(&ctx->lock);
+ if (exit)
+ task_ctx_sched_out(ctx, NULL, EVENT_ALL);
/*
- * If this context is a clone; unclone it so it can't get
- * swapped to another process while we're removing all
- * the events from it.
+ * Now that the context is inactive, destroy the task <-> ctx relation
+ * and mark the context dead.
*/
- clone_ctx = unclone_ctx(child_ctx);
- update_context_time(child_ctx);
- raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
+ RCU_INIT_POINTER(task->perf_event_ctxp, NULL);
+ put_ctx(ctx); /* cannot be last */
+ WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
+ put_task_struct(task); /* cannot be last */
+
+ clone_ctx = unclone_ctx(ctx);
+ raw_spin_unlock_irq(&ctx->lock);
if (clone_ctx)
put_ctx(clone_ctx);
@@ -8446,38 +14211,48 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
* won't get any samples after PERF_RECORD_EXIT. We can however still
* get a few PERF_RECORD_READ events.
*/
- perf_event_task(child, child_ctx, 0);
-
- /*
- * We can recurse on the same lock type through:
- *
- * __perf_event_exit_task()
- * sync_child_event()
- * put_event()
- * mutex_lock(&ctx->mutex)
- *
- * But since its the parent context it won't be the same instance.
- */
- mutex_lock(&child_ctx->mutex);
+ if (exit)
+ perf_event_task(task, ctx, 0);
- list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
- __perf_event_exit_task(child_event, child_ctx, child);
+ list_for_each_entry_safe(child_event, next, &ctx->event_list, event_entry)
+ perf_event_exit_event(child_event, ctx, false);
- mutex_unlock(&child_ctx->mutex);
+ mutex_unlock(&ctx->mutex);
- put_ctx(child_ctx);
+ if (!exit) {
+ /*
+ * perf_event_release_kernel() could still have a reference on
+ * this context. In that case we must wait for these events to
+ * have been freed (in particular all their references to this
+ * task must've been dropped).
+ *
+ * Without this copy_process() will unconditionally free this
+ * task (irrespective of its reference count) and
+ * _free_event()'s put_task_struct(event->hw.target) will be a
+ * use-after-free.
+ *
+ * Wait for all events to drop their context reference.
+ */
+ wait_var_event(&ctx->refcount,
+ refcount_read(&ctx->refcount) == 1);
+ }
+ put_ctx(ctx);
}
/*
- * When a child task exits, feed back event values to parent events.
+ * When a task exits, feed back event values to parent events.
+ *
+ * Can be called with exec_update_lock held when called from
+ * setup_new_exec().
*/
-void perf_event_exit_task(struct task_struct *child)
+void perf_event_exit_task(struct task_struct *task)
{
struct perf_event *event, *tmp;
- int ctxn;
- mutex_lock(&child->perf_event_mutex);
- list_for_each_entry_safe(event, tmp, &child->perf_event_list,
+ WARN_ON_ONCE(task != current);
+
+ mutex_lock(&task->perf_event_mutex);
+ list_for_each_entry_safe(event, tmp, &task->perf_event_list,
owner_entry) {
list_del_init(&event->owner_entry);
@@ -8486,84 +14261,90 @@ void perf_event_exit_task(struct task_struct *child)
* the owner, closes a race against perf_release() where
* we need to serialize on the owner->perf_event_mutex.
*/
- smp_wmb();
- event->owner = NULL;
+ smp_store_release(&event->owner, NULL);
}
- mutex_unlock(&child->perf_event_mutex);
-
- for_each_task_context_nr(ctxn)
- perf_event_exit_task_context(child, ctxn);
-}
-
-static void perf_free_event(struct perf_event *event,
- struct perf_event_context *ctx)
-{
- struct perf_event *parent = event->parent;
+ mutex_unlock(&task->perf_event_mutex);
- if (WARN_ON_ONCE(!parent))
- return;
-
- mutex_lock(&parent->child_mutex);
- list_del_init(&event->child_list);
- mutex_unlock(&parent->child_mutex);
+ perf_event_exit_task_context(task, true);
- put_event(parent);
+ /*
+ * The perf_event_exit_task_context calls perf_event_task
+ * with task's task_ctx, which generates EXIT events for
+ * task contexts and sets task->perf_event_ctxp[] to NULL.
+ * At this point we need to send EXIT events to cpu contexts.
+ */
+ perf_event_task(task, NULL, 0);
- raw_spin_lock_irq(&ctx->lock);
- perf_group_detach(event);
- list_del_event(event, ctx);
- raw_spin_unlock_irq(&ctx->lock);
- free_event(event);
+ /*
+ * Detach the perf_ctx_data for the system-wide event.
+ */
+ guard(percpu_read)(&global_ctx_data_rwsem);
+ detach_task_ctx_data(task);
}
/*
- * Free an unexposed, unused context as created by inheritance by
- * perf_event_init_task below, used by fork() in case of fail.
+ * Free a context as created by inheritance by perf_event_init_task() below,
+ * used by fork() in case of fail.
*
- * Not all locks are strictly required, but take them anyway to be nice and
- * help out with the lockdep assertions.
+ * Even though the task has never lived, the context and events have been
+ * exposed through the child_list, so we must take care tearing it all down.
*/
void perf_event_free_task(struct task_struct *task)
{
- struct perf_event_context *ctx;
- struct perf_event *event, *tmp;
- int ctxn;
+ perf_event_exit_task_context(task, false);
+}
- for_each_task_context_nr(ctxn) {
- ctx = task->perf_event_ctxp[ctxn];
- if (!ctx)
- continue;
+void perf_event_delayed_put(struct task_struct *task)
+{
+ WARN_ON_ONCE(task->perf_event_ctxp);
+}
- mutex_lock(&ctx->mutex);
-again:
- list_for_each_entry_safe(event, tmp, &ctx->pinned_groups,
- group_entry)
- perf_free_event(event, ctx);
+struct file *perf_event_get(unsigned int fd)
+{
+ struct file *file = fget(fd);
+ if (!file)
+ return ERR_PTR(-EBADF);
- list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
- group_entry)
- perf_free_event(event, ctx);
+ if (file->f_op != &perf_fops) {
+ fput(file);
+ return ERR_PTR(-EBADF);
+ }
- if (!list_empty(&ctx->pinned_groups) ||
- !list_empty(&ctx->flexible_groups))
- goto again;
+ return file;
+}
- mutex_unlock(&ctx->mutex);
+const struct perf_event *perf_get_event(struct file *file)
+{
+ if (file->f_op != &perf_fops)
+ return ERR_PTR(-EINVAL);
- put_ctx(ctx);
- }
+ return file->private_data;
}
-void perf_event_delayed_put(struct task_struct *task)
+const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
{
- int ctxn;
+ if (!event)
+ return ERR_PTR(-EINVAL);
+
+ return &event->attr;
+}
+
+int perf_allow_kernel(void)
+{
+ if (sysctl_perf_event_paranoid > 1 && !perfmon_capable())
+ return -EACCES;
- for_each_task_context_nr(ctxn)
- WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
+ return security_perf_event_open(PERF_SECURITY_KERNEL);
}
+EXPORT_SYMBOL_GPL(perf_allow_kernel);
/*
- * inherit a event from parent task to child task:
+ * Inherit an event from parent task to child task.
+ *
+ * Returns:
+ * - valid pointer on success
+ * - NULL for orphaned events
+ * - IS_ERR() on error
*/
static struct perf_event *
inherit_event(struct perf_event *parent_event,
@@ -8573,7 +14354,8 @@ inherit_event(struct perf_event *parent_event,
struct perf_event *group_leader,
struct perf_event_context *child_ctx)
{
- enum perf_event_active_state parent_state = parent_event->state;
+ enum perf_event_state parent_state = parent_event->state;
+ struct perf_event_pmu_context *pmu_ctx;
struct perf_event *child_event;
unsigned long flags;
@@ -8586,6 +14368,14 @@ inherit_event(struct perf_event *parent_event,
if (parent_event->parent)
parent_event = parent_event->parent;
+ if (parent_event->state <= PERF_EVENT_STATE_REVOKED)
+ return NULL;
+
+ /*
+ * Event creation should be under SRCU, see perf_pmu_unregister().
+ */
+ guard(srcu)(&pmus_srcu);
+
child_event = perf_event_alloc(&parent_event->attr,
parent_event->cpu,
child,
@@ -8594,14 +14384,30 @@ inherit_event(struct perf_event *parent_event,
if (IS_ERR(child_event))
return child_event;
+ get_ctx(child_ctx);
+ child_event->ctx = child_ctx;
+
+ pmu_ctx = find_get_pmu_context(child_event->pmu, child_ctx, child_event);
+ if (IS_ERR(pmu_ctx)) {
+ free_event(child_event);
+ return ERR_CAST(pmu_ctx);
+ }
+ child_event->pmu_ctx = pmu_ctx;
+
+ /*
+ * is_orphaned_event() and list_add_tail(&parent_event->child_list)
+ * must be under the same lock in order to serialize against
+ * perf_event_release_kernel(), such that either we must observe
+ * is_orphaned_event() or they will observe us on the child_list.
+ */
+ mutex_lock(&parent_event->child_mutex);
if (is_orphaned_event(parent_event) ||
!atomic_long_inc_not_zero(&parent_event->refcount)) {
+ mutex_unlock(&parent_event->child_mutex);
free_event(child_event);
return NULL;
}
- get_ctx(child_ctx);
-
/*
* Make the child state follow the state of the parent event,
* not its attr.disabled bit. We hold the parent's mutex,
@@ -8622,7 +14428,6 @@ inherit_event(struct perf_event *parent_event,
local64_set(&hwc->period_left, sample_period);
}
- child_event->ctx = child_ctx;
child_event->overflow_handler = parent_event->overflow_handler;
child_event->overflow_handler_context
= parent_event->overflow_handler_context;
@@ -8638,19 +14443,28 @@ inherit_event(struct perf_event *parent_event,
*/
raw_spin_lock_irqsave(&child_ctx->lock, flags);
add_event_to_ctx(child_event, child_ctx);
+ child_event->attach_state |= PERF_ATTACH_CHILD;
raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
/*
* Link this into the parent event's child list
*/
- WARN_ON_ONCE(parent_event->ctx->parent_ctx);
- mutex_lock(&parent_event->child_mutex);
list_add_tail(&child_event->child_list, &parent_event->child_list);
mutex_unlock(&parent_event->child_mutex);
return child_event;
}
+/*
+ * Inherits an event group.
+ *
+ * This will quietly suppress orphaned events; !inherit_event() is not an error.
+ * This matches with perf_event_release_kernel() removing all child events.
+ *
+ * Returns:
+ * - 0 on success
+ * - <0 on error
+ */
static int inherit_group(struct perf_event *parent_event,
struct task_struct *parent,
struct perf_event_context *parent_ctx,
@@ -8665,30 +14479,55 @@ static int inherit_group(struct perf_event *parent_event,
child, NULL, child_ctx);
if (IS_ERR(leader))
return PTR_ERR(leader);
- list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
+ /*
+ * @leader can be NULL here because of is_orphaned_event(). In this
+ * case inherit_event() will create individual events, similar to what
+ * perf_group_detach() would do anyway.
+ */
+ for_each_sibling_event(sub, parent_event) {
child_ctr = inherit_event(sub, parent, parent_ctx,
child, leader, child_ctx);
if (IS_ERR(child_ctr))
return PTR_ERR(child_ctr);
+
+ if (sub->aux_event == parent_event && child_ctr &&
+ !perf_get_aux_event(child_ctr, leader))
+ return -EINVAL;
}
+ if (leader)
+ leader->group_generation = parent_event->group_generation;
return 0;
}
+/*
+ * Creates the child task context and tries to inherit the event-group.
+ *
+ * Clears @inherited_all on !attr.inherited or error. Note that we'll leave
+ * inherited_all set when we 'fail' to inherit an orphaned event; this is
+ * consistent with perf_event_release_kernel() removing all child events.
+ *
+ * Returns:
+ * - 0 on success
+ * - <0 on error
+ */
static int
inherit_task_group(struct perf_event *event, struct task_struct *parent,
struct perf_event_context *parent_ctx,
- struct task_struct *child, int ctxn,
- int *inherited_all)
+ struct task_struct *child,
+ u64 clone_flags, int *inherited_all)
{
- int ret;
struct perf_event_context *child_ctx;
+ int ret;
- if (!event->attr.inherit) {
+ if (!event->attr.inherit ||
+ (event->attr.inherit_thread && !(clone_flags & CLONE_THREAD)) ||
+ /* Do not inherit if sigtrap and signal handlers were cleared. */
+ (event->attr.sigtrap && (clone_flags & CLONE_CLEAR_SIGHAND))) {
*inherited_all = 0;
return 0;
}
- child_ctx = child->perf_event_ctxp[ctxn];
+ child_ctx = child->perf_event_ctxp;
if (!child_ctx) {
/*
* This is executed from the parent task context, so
@@ -8696,17 +14535,14 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
* First allocate and initialize a context for the
* child.
*/
-
- child_ctx = alloc_perf_context(parent_ctx->pmu, child);
+ child_ctx = alloc_perf_context(child);
if (!child_ctx)
return -ENOMEM;
- child->perf_event_ctxp[ctxn] = child_ctx;
+ child->perf_event_ctxp = child_ctx;
}
- ret = inherit_group(event, parent, parent_ctx,
- child, child_ctx);
-
+ ret = inherit_group(event, parent, parent_ctx, child, child_ctx);
if (ret)
*inherited_all = 0;
@@ -8716,7 +14552,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
/*
* Initialize the perf_event context in task_struct
*/
-static int perf_event_init_context(struct task_struct *child, int ctxn)
+static int perf_event_init_context(struct task_struct *child, u64 clone_flags)
{
struct perf_event_context *child_ctx, *parent_ctx;
struct perf_event_context *cloned_ctx;
@@ -8726,14 +14562,14 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
unsigned long flags;
int ret = 0;
- if (likely(!parent->perf_event_ctxp[ctxn]))
+ if (likely(!parent->perf_event_ctxp))
return 0;
/*
* If the parent's context is a clone, pin it so it won't get
* swapped under us.
*/
- parent_ctx = perf_pin_task_context(parent, ctxn);
+ parent_ctx = perf_pin_task_context(parent);
if (!parent_ctx)
return 0;
@@ -8754,11 +14590,11 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
* We dont have to disable NMIs - we are only looking at
* the list, not manipulating it:
*/
- list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
+ perf_event_groups_for_each(event, &parent_ctx->pinned_groups) {
ret = inherit_task_group(event, parent, parent_ctx,
- child, ctxn, &inherited_all);
+ child, clone_flags, &inherited_all);
if (ret)
- break;
+ goto out_unlock;
}
/*
@@ -8770,17 +14606,17 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
parent_ctx->rotate_disable = 1;
raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
- list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
+ perf_event_groups_for_each(event, &parent_ctx->flexible_groups) {
ret = inherit_task_group(event, parent, parent_ctx,
- child, ctxn, &inherited_all);
+ child, clone_flags, &inherited_all);
if (ret)
- break;
+ goto out_unlock;
}
raw_spin_lock_irqsave(&parent_ctx->lock, flags);
parent_ctx->rotate_disable = 0;
- child_ctx = child->perf_event_ctxp[ctxn];
+ child_ctx = child->perf_event_ctxp;
if (child_ctx && inherited_all) {
/*
@@ -8802,6 +14638,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
}
raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
+out_unlock:
mutex_unlock(&parent_ctx->mutex);
perf_unpin_context(parent_ctx);
@@ -8813,20 +14650,20 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
/*
* Initialize the perf_event context in task_struct
*/
-int perf_event_init_task(struct task_struct *child)
+int perf_event_init_task(struct task_struct *child, u64 clone_flags)
{
- int ctxn, ret;
+ int ret;
- memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
+ memset(child->perf_recursion, 0, sizeof(child->perf_recursion));
+ child->perf_event_ctxp = NULL;
mutex_init(&child->perf_event_mutex);
INIT_LIST_HEAD(&child->perf_event_list);
+ child->perf_ctx_data = NULL;
- for_each_task_context_nr(ctxn) {
- ret = perf_event_init_context(child, ctxn);
- if (ret) {
- perf_event_free_task(child);
- return ret;
- }
+ ret = perf_event_init_context(child, clone_flags);
+ if (ret) {
+ perf_event_free_task(child);
+ return ret;
}
return 0;
@@ -8835,22 +14672,42 @@ int perf_event_init_task(struct task_struct *child)
static void __init perf_event_init_all_cpus(void)
{
struct swevent_htable *swhash;
+ struct perf_cpu_context *cpuctx;
int cpu;
+ zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL);
+ zalloc_cpumask_var(&perf_online_core_mask, GFP_KERNEL);
+ zalloc_cpumask_var(&perf_online_die_mask, GFP_KERNEL);
+ zalloc_cpumask_var(&perf_online_cluster_mask, GFP_KERNEL);
+ zalloc_cpumask_var(&perf_online_pkg_mask, GFP_KERNEL);
+ zalloc_cpumask_var(&perf_online_sys_mask, GFP_KERNEL);
+
+
for_each_possible_cpu(cpu) {
swhash = &per_cpu(swevent_htable, cpu);
mutex_init(&swhash->hlist_mutex);
- INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
+
+ INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
+ raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
+
+ INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
+
+ cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
+ __perf_event_init_context(&cpuctx->ctx);
+ lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
+ lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
+ cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
+ cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default);
+ cpuctx->heap = cpuctx->heap_default;
}
}
-static void perf_event_init_cpu(int cpu)
+static void perf_swevent_init_cpu(unsigned int cpu)
{
struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
mutex_lock(&swhash->hlist_mutex);
- swhash->online = true;
- if (swhash->hlist_refcount > 0) {
+ if (swhash->hlist_refcount > 0 && !swevent_hlist_deref(swhash)) {
struct swevent_hlist *hlist;
hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
@@ -8860,50 +14717,143 @@ static void perf_event_init_cpu(int cpu)
mutex_unlock(&swhash->hlist_mutex);
}
-#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC
+#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
static void __perf_event_exit_context(void *__info)
{
- struct remove_event re = { .detach_group = true };
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct perf_event_context *ctx = __info;
+ struct perf_event *event;
- rcu_read_lock();
- list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry)
- __perf_remove_from_context(&re);
- rcu_read_unlock();
+ raw_spin_lock(&ctx->lock);
+ ctx_sched_out(ctx, NULL, EVENT_TIME);
+ list_for_each_entry(event, &ctx->event_list, event_entry)
+ __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
+ raw_spin_unlock(&ctx->lock);
}
-static void perf_event_exit_cpu_context(int cpu)
+static void perf_event_clear_cpumask(unsigned int cpu)
{
- struct perf_event_context *ctx;
+ int target[PERF_PMU_MAX_SCOPE];
+ unsigned int scope;
struct pmu *pmu;
- int idx;
- idx = srcu_read_lock(&pmus_srcu);
- list_for_each_entry_rcu(pmu, &pmus, entry) {
- ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;
+ cpumask_clear_cpu(cpu, perf_online_mask);
- mutex_lock(&ctx->mutex);
- smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
- mutex_unlock(&ctx->mutex);
+ for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) {
+ const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu);
+ struct cpumask *pmu_cpumask = perf_scope_cpumask(scope);
+
+ target[scope] = -1;
+ if (WARN_ON_ONCE(!pmu_cpumask || !cpumask))
+ continue;
+
+ if (!cpumask_test_and_clear_cpu(cpu, pmu_cpumask))
+ continue;
+ target[scope] = cpumask_any_but(cpumask, cpu);
+ if (target[scope] < nr_cpu_ids)
+ cpumask_set_cpu(target[scope], pmu_cpumask);
+ }
+
+ /* migrate */
+ list_for_each_entry(pmu, &pmus, entry) {
+ if (pmu->scope == PERF_PMU_SCOPE_NONE ||
+ WARN_ON_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE))
+ continue;
+
+ if (target[pmu->scope] >= 0 && target[pmu->scope] < nr_cpu_ids)
+ perf_pmu_migrate_context(pmu, cpu, target[pmu->scope]);
}
- srcu_read_unlock(&pmus_srcu, idx);
}
-static void perf_event_exit_cpu(int cpu)
+static void perf_event_exit_cpu_context(int cpu)
{
- struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
+ struct perf_cpu_context *cpuctx;
+ struct perf_event_context *ctx;
- perf_event_exit_cpu_context(cpu);
+ // XXX simplify cpuctx->online
+ mutex_lock(&pmus_lock);
+ /*
+ * Clear the cpumasks, and migrate to other CPUs if possible.
+ * Must be invoked before the __perf_event_exit_context.
+ */
+ perf_event_clear_cpumask(cpu);
+ cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
+ ctx = &cpuctx->ctx;
- mutex_lock(&swhash->hlist_mutex);
- swhash->online = false;
- swevent_hlist_release(swhash);
- mutex_unlock(&swhash->hlist_mutex);
+ mutex_lock(&ctx->mutex);
+ smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
+ cpuctx->online = 0;
+ mutex_unlock(&ctx->mutex);
+ mutex_unlock(&pmus_lock);
}
#else
-static inline void perf_event_exit_cpu(int cpu) { }
+
+static void perf_event_exit_cpu_context(int cpu) { }
+
#endif
+static void perf_event_setup_cpumask(unsigned int cpu)
+{
+ struct cpumask *pmu_cpumask;
+ unsigned int scope;
+
+ /*
+ * Early boot stage, the cpumask hasn't been set yet.
+ * The perf_online_<domain>_masks includes the first CPU of each domain.
+ * Always unconditionally set the boot CPU for the perf_online_<domain>_masks.
+ */
+ if (cpumask_empty(perf_online_mask)) {
+ for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) {
+ pmu_cpumask = perf_scope_cpumask(scope);
+ if (WARN_ON_ONCE(!pmu_cpumask))
+ continue;
+ cpumask_set_cpu(cpu, pmu_cpumask);
+ }
+ goto end;
+ }
+
+ for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) {
+ const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu);
+
+ pmu_cpumask = perf_scope_cpumask(scope);
+
+ if (WARN_ON_ONCE(!pmu_cpumask || !cpumask))
+ continue;
+
+ if (!cpumask_empty(cpumask) &&
+ cpumask_any_and(pmu_cpumask, cpumask) >= nr_cpu_ids)
+ cpumask_set_cpu(cpu, pmu_cpumask);
+ }
+end:
+ cpumask_set_cpu(cpu, perf_online_mask);
+}
+
+int perf_event_init_cpu(unsigned int cpu)
+{
+ struct perf_cpu_context *cpuctx;
+ struct perf_event_context *ctx;
+
+ perf_swevent_init_cpu(cpu);
+
+ mutex_lock(&pmus_lock);
+ perf_event_setup_cpumask(cpu);
+ cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
+ ctx = &cpuctx->ctx;
+
+ mutex_lock(&ctx->mutex);
+ cpuctx->online = 1;
+ mutex_unlock(&ctx->mutex);
+ mutex_unlock(&pmus_lock);
+
+ return 0;
+}
+
+int perf_event_exit_cpu(unsigned int cpu)
+{
+ perf_event_exit_cpu_context(cpu);
+ return 0;
+}
+
static int
perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
{
@@ -8924,49 +14874,28 @@ static struct notifier_block perf_reboot_notifier = {
.priority = INT_MIN,
};
-static int
-perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
-{
- unsigned int cpu = (long)hcpu;
-
- switch (action & ~CPU_TASKS_FROZEN) {
-
- case CPU_UP_PREPARE:
- case CPU_DOWN_FAILED:
- perf_event_init_cpu(cpu);
- break;
-
- case CPU_UP_CANCELED:
- case CPU_DOWN_PREPARE:
- perf_event_exit_cpu(cpu);
- break;
- default:
- break;
- }
-
- return NOTIFY_OK;
-}
-
void __init perf_event_init(void)
{
int ret;
idr_init(&pmu_idr);
+ unwind_deferred_init(&perf_unwind_work,
+ perf_unwind_deferred_callback);
+
perf_event_init_all_cpus();
init_srcu_struct(&pmus_srcu);
perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
- perf_pmu_register(&perf_cpu_clock, NULL, -1);
- perf_pmu_register(&perf_task_clock, NULL, -1);
+ perf_pmu_register(&perf_cpu_clock, "cpu_clock", -1);
+ perf_pmu_register(&perf_task_clock, "task_clock", -1);
perf_tp_register();
- perf_cpu_notifier(perf_cpu_notify);
+ perf_event_init_cpu(smp_processor_id());
register_reboot_notifier(&perf_reboot_notifier);
ret = init_hw_breakpoint();
WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
- /* do not patch jump label more than once per second */
- jump_label_rate_limit(&perf_sched_events, HZ);
+ perf_event_cache = KMEM_CACHE(perf_event, SLAB_PANIC);
/*
* Build time assertion that we keep the data_head at the intended
@@ -8987,6 +14916,7 @@ ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
return 0;
}
+EXPORT_SYMBOL_GPL(perf_event_sysfs_show);
static int __init perf_event_sysfs_init(void)
{
@@ -9000,7 +14930,7 @@ static int __init perf_event_sysfs_init(void)
goto unlock;
list_for_each_entry(pmu, &pmus, entry) {
- if (!pmu->name || pmu->type < 0)
+ if (pmu->dev)
continue;
ret = pmu_dev_alloc(pmu);
@@ -9043,41 +14973,45 @@ static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
kfree(jc);
}
-static int __perf_cgroup_move(void *info)
+static int perf_cgroup_css_online(struct cgroup_subsys_state *css)
{
- struct task_struct *task = info;
- perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
+ perf_event_cgroup(css->cgroup);
return 0;
}
-static void perf_cgroup_attach(struct cgroup_subsys_state *css,
- struct cgroup_taskset *tset)
+static int __perf_cgroup_move(void *info)
{
- struct task_struct *task;
+ struct task_struct *task = info;
- cgroup_taskset_for_each(task, tset)
- task_function_call(task, __perf_cgroup_move, task);
+ preempt_disable();
+ perf_cgroup_switch(task);
+ preempt_enable();
+
+ return 0;
}
-static void perf_cgroup_exit(struct cgroup_subsys_state *css,
- struct cgroup_subsys_state *old_css,
- struct task_struct *task)
+static void perf_cgroup_attach(struct cgroup_taskset *tset)
{
- /*
- * cgroup_exit() is called in the copy_process() failure path.
- * Ignore this case since the task hasn't ran yet, this avoids
- * trying to poke a half freed task state from generic code.
- */
- if (!(task->flags & PF_EXITING))
- return;
+ struct task_struct *task;
+ struct cgroup_subsys_state *css;
- task_function_call(task, __perf_cgroup_move, task);
+ cgroup_taskset_for_each(task, css, tset)
+ task_function_call(task, __perf_cgroup_move, task);
}
struct cgroup_subsys perf_event_cgrp_subsys = {
.css_alloc = perf_cgroup_css_alloc,
.css_free = perf_cgroup_css_free,
- .exit = perf_cgroup_exit,
+ .css_online = perf_cgroup_css_online,
.attach = perf_cgroup_attach,
+ /*
+ * Implicitly enable on dfl hierarchy so that perf events can
+ * always be filtered by cgroup2 path as long as perf_event
+ * controller is not mounted on a legacy hierarchy.
+ */
+ .implicit_on_dfl = true,
+ .threaded = true,
};
#endif /* CONFIG_CGROUP_PERF */
+
+DEFINE_STATIC_CALL_RET0(perf_snapshot_branch_stack, perf_snapshot_branch_stack_t);
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 92ce5f4ccc26..8ec2cb688903 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -1,18 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
* Copyright (C) 2007 Alan Stern
* Copyright (C) IBM Corporation, 2009
* Copyright (C) 2009, Frederic Weisbecker <fweisbec@gmail.com>
@@ -30,103 +17,341 @@
* This file contains the arch-independent routines.
*/
+#include <linux/hw_breakpoint.h>
+
+#include <linux/atomic.h>
+#include <linux/bug.h>
+#include <linux/cpu.h>
+#include <linux/export.h>
+#include <linux/init.h>
#include <linux/irqflags.h>
-#include <linux/kallsyms.h>
-#include <linux/notifier.h>
-#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/notifier.h>
+#include <linux/percpu-rwsem.h>
#include <linux/percpu.h>
+#include <linux/rhashtable.h>
#include <linux/sched.h>
-#include <linux/init.h>
#include <linux/slab.h>
-#include <linux/list.h>
-#include <linux/cpu.h>
-#include <linux/smp.h>
-#include <linux/hw_breakpoint.h>
/*
- * Constraints data
+ * Datastructure to track the total uses of N slots across tasks or CPUs;
+ * bp_slots_histogram::count[N] is the number of assigned N+1 breakpoint slots.
+ */
+struct bp_slots_histogram {
+#ifdef hw_breakpoint_slots
+ atomic_t count[hw_breakpoint_slots(0)];
+#else
+ atomic_t *count;
+#endif
+};
+
+/*
+ * Per-CPU constraints data.
*/
struct bp_cpuinfo {
- /* Number of pinned cpu breakpoints in a cpu */
- unsigned int cpu_pinned;
- /* tsk_pinned[n] is the number of tasks having n+1 breakpoints */
- unsigned int *tsk_pinned;
- /* Number of non-pinned cpu/task breakpoints in a cpu */
- unsigned int flexible; /* XXX: placeholder, see fetch_this_slot() */
+ /* Number of pinned CPU breakpoints in a CPU. */
+ unsigned int cpu_pinned;
+ /* Histogram of pinned task breakpoints in a CPU. */
+ struct bp_slots_histogram tsk_pinned;
};
static DEFINE_PER_CPU(struct bp_cpuinfo, bp_cpuinfo[TYPE_MAX]);
-static int nr_slots[TYPE_MAX];
static struct bp_cpuinfo *get_bp_info(int cpu, enum bp_type_idx type)
{
return per_cpu_ptr(bp_cpuinfo + type, cpu);
}
+/* Number of pinned CPU breakpoints globally. */
+static struct bp_slots_histogram cpu_pinned[TYPE_MAX];
+/* Number of pinned CPU-independent task breakpoints. */
+static struct bp_slots_histogram tsk_pinned_all[TYPE_MAX];
+
/* Keep track of the breakpoints attached to tasks */
-static LIST_HEAD(bp_task_head);
+static struct rhltable task_bps_ht;
+static const struct rhashtable_params task_bps_ht_params = {
+ .head_offset = offsetof(struct hw_perf_event, bp_list),
+ .key_offset = offsetof(struct hw_perf_event, target),
+ .key_len = sizeof_field(struct hw_perf_event, target),
+ .automatic_shrinking = true,
+};
-static int constraints_initialized;
+static bool constraints_initialized __ro_after_init;
-/* Gather the number of total pinned and un-pinned bp in a cpuset */
-struct bp_busy_slots {
- unsigned int pinned;
- unsigned int flexible;
-};
+/*
+ * Synchronizes accesses to the per-CPU constraints; the locking rules are:
+ *
+ * 1. Atomic updates to bp_cpuinfo::tsk_pinned only require a held read-lock
+ * (due to bp_slots_histogram::count being atomic, no update are lost).
+ *
+ * 2. Holding a write-lock is required for computations that require a
+ * stable snapshot of all bp_cpuinfo::tsk_pinned.
+ *
+ * 3. In all other cases, non-atomic accesses require the appropriately held
+ * lock (read-lock for read-only accesses; write-lock for reads/writes).
+ */
+DEFINE_STATIC_PERCPU_RWSEM(bp_cpuinfo_sem);
+
+/*
+ * Return mutex to serialize accesses to per-task lists in task_bps_ht. Since
+ * rhltable synchronizes concurrent insertions/deletions, independent tasks may
+ * insert/delete concurrently; therefore, a mutex per task is sufficient.
+ *
+ * Uses task_struct::perf_event_mutex, to avoid extending task_struct with a
+ * hw_breakpoint-only mutex, which may be infrequently used. The caveat here is
+ * that hw_breakpoint may contend with per-task perf event list management. The
+ * assumption is that perf usecases involving hw_breakpoints are very unlikely
+ * to result in unnecessary contention.
+ */
+static inline struct mutex *get_task_bps_mutex(struct perf_event *bp)
+{
+ struct task_struct *tsk = bp->hw.target;
-/* Serialize accesses to the above constraints */
-static DEFINE_MUTEX(nr_bp_mutex);
+ return tsk ? &tsk->perf_event_mutex : NULL;
+}
-__weak int hw_breakpoint_weight(struct perf_event *bp)
+static struct mutex *bp_constraints_lock(struct perf_event *bp)
{
- return 1;
+ struct mutex *tsk_mtx = get_task_bps_mutex(bp);
+
+ if (tsk_mtx) {
+ /*
+ * Fully analogous to the perf_try_init_event() nesting
+ * argument in the comment near perf_event_ctx_lock_nested();
+ * this child->perf_event_mutex cannot ever deadlock against
+ * the parent->perf_event_mutex usage from
+ * perf_event_task_{en,dis}able().
+ *
+ * Specifically, inherited events will never occur on
+ * ->perf_event_list.
+ */
+ mutex_lock_nested(tsk_mtx, SINGLE_DEPTH_NESTING);
+ percpu_down_read(&bp_cpuinfo_sem);
+ } else {
+ percpu_down_write(&bp_cpuinfo_sem);
+ }
+
+ return tsk_mtx;
}
-static inline enum bp_type_idx find_slot_idx(struct perf_event *bp)
+static void bp_constraints_unlock(struct mutex *tsk_mtx)
{
- if (bp->attr.bp_type & HW_BREAKPOINT_RW)
- return TYPE_DATA;
+ if (tsk_mtx) {
+ percpu_up_read(&bp_cpuinfo_sem);
+ mutex_unlock(tsk_mtx);
+ } else {
+ percpu_up_write(&bp_cpuinfo_sem);
+ }
+}
- return TYPE_INST;
+static bool bp_constraints_is_locked(struct perf_event *bp)
+{
+ struct mutex *tsk_mtx = get_task_bps_mutex(bp);
+
+ return percpu_is_write_locked(&bp_cpuinfo_sem) ||
+ (tsk_mtx ? mutex_is_locked(tsk_mtx) :
+ percpu_is_read_locked(&bp_cpuinfo_sem));
+}
+
+static inline void assert_bp_constraints_lock_held(struct perf_event *bp)
+{
+ struct mutex *tsk_mtx = get_task_bps_mutex(bp);
+
+ if (tsk_mtx)
+ lockdep_assert_held(tsk_mtx);
+ lockdep_assert_held(&bp_cpuinfo_sem);
}
+#ifdef hw_breakpoint_slots
/*
- * Report the maximum number of pinned breakpoints a task
- * have in this cpu
+ * Number of breakpoint slots is constant, and the same for all types.
*/
-static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
+static_assert(hw_breakpoint_slots(TYPE_INST) == hw_breakpoint_slots(TYPE_DATA));
+static inline int hw_breakpoint_slots_cached(int type) { return hw_breakpoint_slots(type); }
+static inline int init_breakpoint_slots(void) { return 0; }
+#else
+/*
+ * Dynamic number of breakpoint slots.
+ */
+static int __nr_bp_slots[TYPE_MAX] __ro_after_init;
+
+static inline int hw_breakpoint_slots_cached(int type)
+{
+ return __nr_bp_slots[type];
+}
+
+static __init bool
+bp_slots_histogram_alloc(struct bp_slots_histogram *hist, enum bp_type_idx type)
+{
+ hist->count = kcalloc(hw_breakpoint_slots_cached(type), sizeof(*hist->count), GFP_KERNEL);
+ return hist->count;
+}
+
+static __init void bp_slots_histogram_free(struct bp_slots_histogram *hist)
+{
+ kfree(hist->count);
+}
+
+static __init int init_breakpoint_slots(void)
{
- unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned;
- int i;
+ int i, cpu, err_cpu;
+
+ for (i = 0; i < TYPE_MAX; i++)
+ __nr_bp_slots[i] = hw_breakpoint_slots(i);
+
+ for_each_possible_cpu(cpu) {
+ for (i = 0; i < TYPE_MAX; i++) {
+ struct bp_cpuinfo *info = get_bp_info(cpu, i);
- for (i = nr_slots[type] - 1; i >= 0; i--) {
- if (tsk_pinned[i] > 0)
+ if (!bp_slots_histogram_alloc(&info->tsk_pinned, i))
+ goto err;
+ }
+ }
+ for (i = 0; i < TYPE_MAX; i++) {
+ if (!bp_slots_histogram_alloc(&cpu_pinned[i], i))
+ goto err;
+ if (!bp_slots_histogram_alloc(&tsk_pinned_all[i], i))
+ goto err;
+ }
+
+ return 0;
+err:
+ for_each_possible_cpu(err_cpu) {
+ for (i = 0; i < TYPE_MAX; i++)
+ bp_slots_histogram_free(&get_bp_info(err_cpu, i)->tsk_pinned);
+ if (err_cpu == cpu)
+ break;
+ }
+ for (i = 0; i < TYPE_MAX; i++) {
+ bp_slots_histogram_free(&cpu_pinned[i]);
+ bp_slots_histogram_free(&tsk_pinned_all[i]);
+ }
+
+ return -ENOMEM;
+}
+#endif
+
+static inline void
+bp_slots_histogram_add(struct bp_slots_histogram *hist, int old, int val)
+{
+ const int old_idx = old - 1;
+ const int new_idx = old_idx + val;
+
+ if (old_idx >= 0)
+ WARN_ON(atomic_dec_return_relaxed(&hist->count[old_idx]) < 0);
+ if (new_idx >= 0)
+ WARN_ON(atomic_inc_return_relaxed(&hist->count[new_idx]) < 0);
+}
+
+static int
+bp_slots_histogram_max(struct bp_slots_histogram *hist, enum bp_type_idx type)
+{
+ for (int i = hw_breakpoint_slots_cached(type) - 1; i >= 0; i--) {
+ const int count = atomic_read(&hist->count[i]);
+
+ /* Catch unexpected writers; we want a stable snapshot. */
+ ASSERT_EXCLUSIVE_WRITER(hist->count[i]);
+ if (count > 0)
+ return i + 1;
+ WARN(count < 0, "inconsistent breakpoint slots histogram");
+ }
+
+ return 0;
+}
+
+static int
+bp_slots_histogram_max_merge(struct bp_slots_histogram *hist1, struct bp_slots_histogram *hist2,
+ enum bp_type_idx type)
+{
+ for (int i = hw_breakpoint_slots_cached(type) - 1; i >= 0; i--) {
+ const int count1 = atomic_read(&hist1->count[i]);
+ const int count2 = atomic_read(&hist2->count[i]);
+
+ /* Catch unexpected writers; we want a stable snapshot. */
+ ASSERT_EXCLUSIVE_WRITER(hist1->count[i]);
+ ASSERT_EXCLUSIVE_WRITER(hist2->count[i]);
+ if (count1 + count2 > 0)
return i + 1;
+ WARN(count1 < 0, "inconsistent breakpoint slots histogram");
+ WARN(count2 < 0, "inconsistent breakpoint slots histogram");
}
return 0;
}
+#ifndef hw_breakpoint_weight
+static inline int hw_breakpoint_weight(struct perf_event *bp)
+{
+ return 1;
+}
+#endif
+
+static inline enum bp_type_idx find_slot_idx(u64 bp_type)
+{
+ if (bp_type & HW_BREAKPOINT_RW)
+ return TYPE_DATA;
+
+ return TYPE_INST;
+}
+
+/*
+ * Return the maximum number of pinned breakpoints a task has in this CPU.
+ */
+static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
+{
+ struct bp_slots_histogram *tsk_pinned = &get_bp_info(cpu, type)->tsk_pinned;
+
+ /*
+ * At this point we want to have acquired the bp_cpuinfo_sem as a
+ * writer to ensure that there are no concurrent writers in
+ * toggle_bp_task_slot() to tsk_pinned, and we get a stable snapshot.
+ */
+ lockdep_assert_held_write(&bp_cpuinfo_sem);
+ return bp_slots_histogram_max_merge(tsk_pinned, &tsk_pinned_all[type], type);
+}
+
/*
* Count the number of breakpoints of the same type and same task.
* The given event must be not on the list.
+ *
+ * If @cpu is -1, but the result of task_bp_pinned() is not CPU-independent,
+ * returns a negative value.
*/
static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type)
{
- struct task_struct *tsk = bp->hw.target;
+ struct rhlist_head *head, *pos;
struct perf_event *iter;
int count = 0;
- list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
- if (iter->hw.target == tsk &&
- find_slot_idx(iter) == type &&
- (iter->cpu < 0 || cpu == iter->cpu))
- count += hw_breakpoint_weight(iter);
+ /*
+ * We need a stable snapshot of the per-task breakpoint list.
+ */
+ assert_bp_constraints_lock_held(bp);
+
+ rcu_read_lock();
+ head = rhltable_lookup(&task_bps_ht, &bp->hw.target, task_bps_ht_params);
+ if (!head)
+ goto out;
+
+ rhl_for_each_entry_rcu(iter, pos, head, hw.bp_list) {
+ if (find_slot_idx(iter->attr.bp_type) != type)
+ continue;
+
+ if (iter->cpu >= 0) {
+ if (cpu == -1) {
+ count = -1;
+ goto out;
+ } else if (cpu != iter->cpu)
+ continue;
+ }
+
+ count += hw_breakpoint_weight(iter);
}
+out:
+ rcu_read_unlock();
return count;
}
@@ -138,16 +363,29 @@ static const struct cpumask *cpumask_of_bp(struct perf_event *bp)
}
/*
- * Report the number of pinned/un-pinned breakpoints we have in
- * a given cpu (cpu > -1) or in all of them (cpu = -1).
+ * Returns the max pinned breakpoint slots in a given
+ * CPU (cpu > -1) or across all of them (cpu = -1).
*/
-static void
-fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
- enum bp_type_idx type)
+static int
+max_bp_pinned_slots(struct perf_event *bp, enum bp_type_idx type)
{
const struct cpumask *cpumask = cpumask_of_bp(bp);
+ int pinned_slots = 0;
int cpu;
+ if (bp->hw.target && bp->cpu < 0) {
+ int max_pinned = task_bp_pinned(-1, bp, type);
+
+ if (max_pinned >= 0) {
+ /*
+ * Fast path: task_bp_pinned() is CPU-independent and
+ * returns the same value for any CPU.
+ */
+ max_pinned += bp_slots_histogram_max(&cpu_pinned[type], type);
+ return max_pinned;
+ }
+ }
+
for_each_cpu(cpu, cpumask) {
struct bp_cpuinfo *info = get_bp_info(cpu, type);
int nr;
@@ -158,86 +396,140 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
else
nr += task_bp_pinned(cpu, bp, type);
- if (nr > slots->pinned)
- slots->pinned = nr;
-
- nr = info->flexible;
- if (nr > slots->flexible)
- slots->flexible = nr;
+ pinned_slots = max(nr, pinned_slots);
}
-}
-/*
- * For now, continue to consider flexible as pinned, until we can
- * ensure no flexible event can ever be scheduled before a pinned event
- * in a same cpu.
- */
-static void
-fetch_this_slot(struct bp_busy_slots *slots, int weight)
-{
- slots->pinned += weight;
-}
-
-/*
- * Add a pinned breakpoint for the given task in our constraint table
- */
-static void toggle_bp_task_slot(struct perf_event *bp, int cpu,
- enum bp_type_idx type, int weight)
-{
- unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned;
- int old_idx, new_idx;
-
- old_idx = task_bp_pinned(cpu, bp, type) - 1;
- new_idx = old_idx + weight;
-
- if (old_idx >= 0)
- tsk_pinned[old_idx]--;
- if (new_idx >= 0)
- tsk_pinned[new_idx]++;
+ return pinned_slots;
}
/*
* Add/remove the given breakpoint in our constraint table
*/
-static void
-toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
- int weight)
+static int
+toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, int weight)
{
- const struct cpumask *cpumask = cpumask_of_bp(bp);
- int cpu;
+ int cpu, next_tsk_pinned;
if (!enable)
weight = -weight;
- /* Pinned counter cpu profiling */
if (!bp->hw.target) {
- get_bp_info(bp->cpu, type)->cpu_pinned += weight;
- return;
+ /*
+ * Update the pinned CPU slots, in per-CPU bp_cpuinfo and in the
+ * global histogram.
+ */
+ struct bp_cpuinfo *info = get_bp_info(bp->cpu, type);
+
+ lockdep_assert_held_write(&bp_cpuinfo_sem);
+ bp_slots_histogram_add(&cpu_pinned[type], info->cpu_pinned, weight);
+ info->cpu_pinned += weight;
+ return 0;
}
- /* Pinned counter task profiling */
- for_each_cpu(cpu, cpumask)
- toggle_bp_task_slot(bp, cpu, type, weight);
+ /*
+ * If bp->hw.target, tsk_pinned is only modified, but not used
+ * otherwise. We can permit concurrent updates as long as there are no
+ * other uses: having acquired bp_cpuinfo_sem as a reader allows
+ * concurrent updates here. Uses of tsk_pinned will require acquiring
+ * bp_cpuinfo_sem as a writer to stabilize tsk_pinned's value.
+ */
+ lockdep_assert_held_read(&bp_cpuinfo_sem);
- if (enable)
- list_add_tail(&bp->hw.bp_list, &bp_task_head);
- else
- list_del(&bp->hw.bp_list);
-}
+ /*
+ * Update the pinned task slots, in per-CPU bp_cpuinfo and in the global
+ * histogram. We need to take care of 4 cases:
+ *
+ * 1. This breakpoint targets all CPUs (cpu < 0), and there may only
+ * exist other task breakpoints targeting all CPUs. In this case we
+ * can simply update the global slots histogram.
+ *
+ * 2. This breakpoint targets a specific CPU (cpu >= 0), but there may
+ * only exist other task breakpoints targeting all CPUs.
+ *
+ * a. On enable: remove the existing breakpoints from the global
+ * slots histogram and use the per-CPU histogram.
+ *
+ * b. On disable: re-insert the existing breakpoints into the global
+ * slots histogram and remove from per-CPU histogram.
+ *
+ * 3. Some other existing task breakpoints target specific CPUs. Only
+ * update the per-CPU slots histogram.
+ */
+
+ if (!enable) {
+ /*
+ * Remove before updating histograms so we can determine if this
+ * was the last task breakpoint for a specific CPU.
+ */
+ int ret = rhltable_remove(&task_bps_ht, &bp->hw.bp_list, task_bps_ht_params);
+
+ if (ret)
+ return ret;
+ }
+ /*
+ * Note: If !enable, next_tsk_pinned will not count the to-be-removed breakpoint.
+ */
+ next_tsk_pinned = task_bp_pinned(-1, bp, type);
+
+ if (next_tsk_pinned >= 0) {
+ if (bp->cpu < 0) { /* Case 1: fast path */
+ if (!enable)
+ next_tsk_pinned += hw_breakpoint_weight(bp);
+ bp_slots_histogram_add(&tsk_pinned_all[type], next_tsk_pinned, weight);
+ } else if (enable) { /* Case 2.a: slow path */
+ /* Add existing to per-CPU histograms. */
+ for_each_possible_cpu(cpu) {
+ bp_slots_histogram_add(&get_bp_info(cpu, type)->tsk_pinned,
+ 0, next_tsk_pinned);
+ }
+ /* Add this first CPU-pinned task breakpoint. */
+ bp_slots_histogram_add(&get_bp_info(bp->cpu, type)->tsk_pinned,
+ next_tsk_pinned, weight);
+ /* Rebalance global task pinned histogram. */
+ bp_slots_histogram_add(&tsk_pinned_all[type], next_tsk_pinned,
+ -next_tsk_pinned);
+ } else { /* Case 2.b: slow path */
+ /* Remove this last CPU-pinned task breakpoint. */
+ bp_slots_histogram_add(&get_bp_info(bp->cpu, type)->tsk_pinned,
+ next_tsk_pinned + hw_breakpoint_weight(bp), weight);
+ /* Remove all from per-CPU histograms. */
+ for_each_possible_cpu(cpu) {
+ bp_slots_histogram_add(&get_bp_info(cpu, type)->tsk_pinned,
+ next_tsk_pinned, -next_tsk_pinned);
+ }
+ /* Rebalance global task pinned histogram. */
+ bp_slots_histogram_add(&tsk_pinned_all[type], 0, next_tsk_pinned);
+ }
+ } else { /* Case 3: slow path */
+ const struct cpumask *cpumask = cpumask_of_bp(bp);
+
+ for_each_cpu(cpu, cpumask) {
+ next_tsk_pinned = task_bp_pinned(cpu, bp, type);
+ if (!enable)
+ next_tsk_pinned += hw_breakpoint_weight(bp);
+ bp_slots_histogram_add(&get_bp_info(cpu, type)->tsk_pinned,
+ next_tsk_pinned, weight);
+ }
+ }
-/*
- * Function to perform processor-specific cleanup during unregistration
- */
-__weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
-{
/*
- * A weak stub function here for those archs that don't define
- * it inside arch/.../kernel/hw_breakpoint.c
+ * Readers want a stable snapshot of the per-task breakpoint list.
*/
+ assert_bp_constraints_lock_held(bp);
+
+ if (enable)
+ return rhltable_insert(&task_bps_ht, &bp->hw.bp_list, task_bps_ht_params);
+
+ return 0;
}
/*
- * Contraints to check before allowing this new breakpoint counter:
+ * Constraints to check before allowing this new breakpoint counter.
+ *
+ * Note: Flexible breakpoints are currently unimplemented, but outlined in the
+ * below algorithm for completeness. The implementation treats flexible as
+ * pinned due to no guarantee that we currently always schedule flexible events
+ * before a pinned event in a same CPU.
*
* == Non-pinned counter == (Considered as pinned for now)
*
@@ -277,10 +569,10 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
* ((per_cpu(info->flexible, *) > 1) + max(per_cpu(info->cpu_pinned, *))
* + max(per_cpu(info->tsk_pinned, *))) < HBP_NUM
*/
-static int __reserve_bp_slot(struct perf_event *bp)
+static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type)
{
- struct bp_busy_slots slots = {0};
enum bp_type_idx type;
+ int max_pinned_slots;
int weight;
/* We couldn't initialize breakpoint constraints on boot */
@@ -288,60 +580,77 @@ static int __reserve_bp_slot(struct perf_event *bp)
return -ENOMEM;
/* Basic checks */
- if (bp->attr.bp_type == HW_BREAKPOINT_EMPTY ||
- bp->attr.bp_type == HW_BREAKPOINT_INVALID)
+ if (bp_type == HW_BREAKPOINT_EMPTY ||
+ bp_type == HW_BREAKPOINT_INVALID)
return -EINVAL;
- type = find_slot_idx(bp);
+ type = find_slot_idx(bp_type);
weight = hw_breakpoint_weight(bp);
- fetch_bp_busy_slots(&slots, bp, type);
- /*
- * Simulate the addition of this breakpoint to the constraints
- * and see the result.
- */
- fetch_this_slot(&slots, weight);
-
- /* Flexible counters need to keep at least one slot */
- if (slots.pinned + (!!slots.flexible) > nr_slots[type])
+ /* Check if this new breakpoint can be satisfied across all CPUs. */
+ max_pinned_slots = max_bp_pinned_slots(bp, type) + weight;
+ if (max_pinned_slots > hw_breakpoint_slots_cached(type))
return -ENOSPC;
- toggle_bp_slot(bp, true, type, weight);
-
- return 0;
+ return toggle_bp_slot(bp, true, type, weight);
}
int reserve_bp_slot(struct perf_event *bp)
{
- int ret;
-
- mutex_lock(&nr_bp_mutex);
-
- ret = __reserve_bp_slot(bp);
-
- mutex_unlock(&nr_bp_mutex);
+ struct mutex *mtx = bp_constraints_lock(bp);
+ int ret = __reserve_bp_slot(bp, bp->attr.bp_type);
+ bp_constraints_unlock(mtx);
return ret;
}
-static void __release_bp_slot(struct perf_event *bp)
+static void __release_bp_slot(struct perf_event *bp, u64 bp_type)
{
enum bp_type_idx type;
int weight;
- type = find_slot_idx(bp);
+ type = find_slot_idx(bp_type);
weight = hw_breakpoint_weight(bp);
- toggle_bp_slot(bp, false, type, weight);
+ WARN_ON(toggle_bp_slot(bp, false, type, weight));
}
void release_bp_slot(struct perf_event *bp)
{
- mutex_lock(&nr_bp_mutex);
+ struct mutex *mtx = bp_constraints_lock(bp);
+
+ __release_bp_slot(bp, bp->attr.bp_type);
+ bp_constraints_unlock(mtx);
+}
+
+static int __modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type)
+{
+ int err;
+
+ __release_bp_slot(bp, old_type);
+
+ err = __reserve_bp_slot(bp, new_type);
+ if (err) {
+ /*
+ * Reserve the old_type slot back in case
+ * there's no space for the new type.
+ *
+ * This must succeed, because we just released
+ * the old_type slot in the __release_bp_slot
+ * call above. If not, something is broken.
+ */
+ WARN_ON(__reserve_bp_slot(bp, old_type));
+ }
- arch_unregister_hw_breakpoint(bp);
- __release_bp_slot(bp);
+ return err;
+}
+
+static int modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type)
+{
+ struct mutex *mtx = bp_constraints_lock(bp);
+ int ret = __modify_bp_slot(bp, old_type, new_type);
- mutex_unlock(&nr_bp_mutex);
+ bp_constraints_unlock(mtx);
+ return ret;
}
/*
@@ -351,32 +660,44 @@ void release_bp_slot(struct perf_event *bp)
*/
int dbg_reserve_bp_slot(struct perf_event *bp)
{
- if (mutex_is_locked(&nr_bp_mutex))
+ int ret;
+
+ if (bp_constraints_is_locked(bp))
return -1;
- return __reserve_bp_slot(bp);
+ /* Locks aren't held; disable lockdep assert checking. */
+ lockdep_off();
+ ret = __reserve_bp_slot(bp, bp->attr.bp_type);
+ lockdep_on();
+
+ return ret;
}
int dbg_release_bp_slot(struct perf_event *bp)
{
- if (mutex_is_locked(&nr_bp_mutex))
+ if (bp_constraints_is_locked(bp))
return -1;
- __release_bp_slot(bp);
+ /* Locks aren't held; disable lockdep assert checking. */
+ lockdep_off();
+ __release_bp_slot(bp, bp->attr.bp_type);
+ lockdep_on();
return 0;
}
-static int validate_hw_breakpoint(struct perf_event *bp)
+static int hw_breakpoint_parse(struct perf_event *bp,
+ const struct perf_event_attr *attr,
+ struct arch_hw_breakpoint *hw)
{
- int ret;
+ int err;
- ret = arch_validate_hwbkpt_settings(bp);
- if (ret)
- return ret;
+ err = hw_breakpoint_arch_parse(bp, attr, hw);
+ if (err)
+ return err;
- if (arch_check_bp_in_kernelspace(bp)) {
- if (bp->attr.exclude_kernel)
+ if (arch_check_bp_in_kernelspace(hw)) {
+ if (attr->exclude_kernel)
return -EINVAL;
/*
* Don't let unprivileged users set a breakpoint in the trap
@@ -391,25 +712,29 @@ static int validate_hw_breakpoint(struct perf_event *bp)
int register_perf_hw_breakpoint(struct perf_event *bp)
{
- int ret;
-
- ret = reserve_bp_slot(bp);
- if (ret)
- return ret;
+ struct arch_hw_breakpoint hw = { };
+ int err;
- ret = validate_hw_breakpoint(bp);
+ err = reserve_bp_slot(bp);
+ if (err)
+ return err;
- /* if arch_validate_hwbkpt_settings() fails then release bp slot */
- if (ret)
+ err = hw_breakpoint_parse(bp, &bp->attr, &hw);
+ if (err) {
release_bp_slot(bp);
+ return err;
+ }
- return ret;
+ bp->hw.info = hw;
+
+ return 0;
}
/**
* register_user_hw_breakpoint - register a hardware breakpoint for user space
* @attr: breakpoint attributes
* @triggered: callback to trigger when we hit the breakpoint
+ * @context: context data could be used in the triggered callback
* @tsk: pointer to 'task_struct' of the process to which the address belongs
*/
struct perf_event *
@@ -423,19 +748,55 @@ register_user_hw_breakpoint(struct perf_event_attr *attr,
}
EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
+static void hw_breakpoint_copy_attr(struct perf_event_attr *to,
+ struct perf_event_attr *from)
+{
+ to->bp_addr = from->bp_addr;
+ to->bp_type = from->bp_type;
+ to->bp_len = from->bp_len;
+ to->disabled = from->disabled;
+}
+
+int
+modify_user_hw_breakpoint_check(struct perf_event *bp, struct perf_event_attr *attr,
+ bool check)
+{
+ struct arch_hw_breakpoint hw = { };
+ int err;
+
+ err = hw_breakpoint_parse(bp, attr, &hw);
+ if (err)
+ return err;
+
+ if (check) {
+ struct perf_event_attr old_attr;
+
+ old_attr = bp->attr;
+ hw_breakpoint_copy_attr(&old_attr, attr);
+ if (memcmp(&old_attr, attr, sizeof(*attr)))
+ return -EINVAL;
+ }
+
+ if (bp->attr.bp_type != attr->bp_type) {
+ err = modify_bp_slot(bp, bp->attr.bp_type, attr->bp_type);
+ if (err)
+ return err;
+ }
+
+ hw_breakpoint_copy_attr(&bp->attr, attr);
+ bp->hw.info = hw;
+
+ return 0;
+}
+
/**
* modify_user_hw_breakpoint - modify a user-space hardware breakpoint
* @bp: the breakpoint structure to modify
* @attr: new breakpoint attributes
- * @triggered: callback to trigger when we hit the breakpoint
- * @tsk: pointer to 'task_struct' of the process to which the address belongs
*/
int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr)
{
- u64 old_addr = bp->attr.bp_addr;
- u64 old_len = bp->attr.bp_len;
- int old_type = bp->attr.bp_type;
- int err = 0;
+ int err;
/*
* modify_user_hw_breakpoint can be invoked with IRQs disabled and hence it
@@ -444,35 +805,16 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att
* current task.
*/
if (irqs_disabled() && bp->ctx && bp->ctx->task == current)
- __perf_event_disable(bp);
+ perf_event_disable_local(bp);
else
perf_event_disable(bp);
- bp->attr.bp_addr = attr->bp_addr;
- bp->attr.bp_type = attr->bp_type;
- bp->attr.bp_len = attr->bp_len;
+ err = modify_user_hw_breakpoint_check(bp, attr, false);
- if (attr->disabled)
- goto end;
-
- err = validate_hw_breakpoint(bp);
- if (!err)
+ if (!bp->attr.disabled)
perf_event_enable(bp);
- if (err) {
- bp->attr.bp_addr = old_addr;
- bp->attr.bp_type = old_type;
- bp->attr.bp_len = old_len;
- if (!bp->attr.disabled)
- perf_event_enable(bp);
-
- return err;
- }
-
-end:
- bp->attr.disabled = attr->disabled;
-
- return 0;
+ return err;
}
EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint);
@@ -492,6 +834,7 @@ EXPORT_SYMBOL_GPL(unregister_hw_breakpoint);
* register_wide_hw_breakpoint - register a wide breakpoint in the kernel
* @attr: breakpoint attributes
* @triggered: callback to trigger when we hit the breakpoint
+ * @context: context data could be used in the triggered callback
*
* @return a set of per_cpu pointers to perf events
*/
@@ -506,9 +849,9 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
cpu_events = alloc_percpu(typeof(*cpu_events));
if (!cpu_events)
- return (void __percpu __force *)ERR_PTR(-ENOMEM);
+ return ERR_PTR_PCPU(-ENOMEM);
- get_online_cpus();
+ cpus_read_lock();
for_each_online_cpu(cpu) {
bp = perf_event_create_kernel_counter(attr, cpu, NULL,
triggered, context);
@@ -519,13 +862,13 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
per_cpu(*cpu_events, cpu) = bp;
}
- put_online_cpus();
+ cpus_read_unlock();
if (likely(!err))
return cpu_events;
unregister_wide_hw_breakpoint(cpu_events);
- return (void __percpu __force *)ERR_PTR(err);
+ return ERR_PTR_PCPU(err);
}
EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
@@ -544,6 +887,50 @@ void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events)
}
EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint);
+/**
+ * hw_breakpoint_is_used - check if breakpoints are currently used
+ *
+ * Returns: true if breakpoints are used, false otherwise.
+ */
+bool hw_breakpoint_is_used(void)
+{
+ int cpu;
+
+ if (!constraints_initialized)
+ return false;
+
+ for_each_possible_cpu(cpu) {
+ for (int type = 0; type < TYPE_MAX; ++type) {
+ struct bp_cpuinfo *info = get_bp_info(cpu, type);
+
+ if (info->cpu_pinned)
+ return true;
+
+ for (int slot = 0; slot < hw_breakpoint_slots_cached(type); ++slot) {
+ if (atomic_read(&info->tsk_pinned.count[slot]))
+ return true;
+ }
+ }
+ }
+
+ for (int type = 0; type < TYPE_MAX; ++type) {
+ for (int slot = 0; slot < hw_breakpoint_slots_cached(type); ++slot) {
+ /*
+ * Warn, because if there are CPU pinned counters,
+ * should never get here; bp_cpuinfo::cpu_pinned should
+ * be consistent with the global cpu_pinned histogram.
+ */
+ if (WARN_ON(atomic_read(&cpu_pinned[type].count[slot])))
+ return true;
+
+ if (atomic_read(&tsk_pinned_all[type].count[slot]))
+ return true;
+ }
+ }
+
+ return false;
+}
+
static struct notifier_block hw_breakpoint_exceptions_nb = {
.notifier_call = hw_breakpoint_exceptions_notify,
/* we need to be notified first */
@@ -563,9 +950,10 @@ static int hw_breakpoint_event_init(struct perf_event *bp)
return -ENOENT;
/*
- * no branch sampling for breakpoint events
+ * Check if breakpoint type is supported before proceeding.
+ * Also, no branch sampling for breakpoint events.
*/
- if (has_branch_stack(bp))
+ if (!hw_breakpoint_slots_cached(find_slot_idx(bp->attr.bp_type)) || has_branch_stack(bp))
return -EOPNOTSUPP;
err = register_perf_hw_breakpoint(bp);
@@ -618,38 +1006,19 @@ static struct pmu perf_breakpoint = {
int __init init_hw_breakpoint(void)
{
- int cpu, err_cpu;
- int i;
-
- for (i = 0; i < TYPE_MAX; i++)
- nr_slots[i] = hw_breakpoint_slots(i);
+ int ret;
- for_each_possible_cpu(cpu) {
- for (i = 0; i < TYPE_MAX; i++) {
- struct bp_cpuinfo *info = get_bp_info(cpu, i);
+ ret = rhltable_init(&task_bps_ht, &task_bps_ht_params);
+ if (ret)
+ return ret;
- info->tsk_pinned = kcalloc(nr_slots[i], sizeof(int),
- GFP_KERNEL);
- if (!info->tsk_pinned)
- goto err_alloc;
- }
- }
+ ret = init_breakpoint_slots();
+ if (ret)
+ return ret;
- constraints_initialized = 1;
+ constraints_initialized = true;
perf_pmu_register(&perf_breakpoint, "breakpoint", PERF_TYPE_BREAKPOINT);
return register_die_notifier(&hw_breakpoint_exceptions_nb);
-
- err_alloc:
- for_each_possible_cpu(err_cpu) {
- for (i = 0; i < TYPE_MAX; i++)
- kfree(get_bp_info(err_cpu, i)->tsk_pinned);
- if (err_cpu == cpu)
- break;
- }
-
- return -ENOMEM;
}
-
-
diff --git a/kernel/events/hw_breakpoint_test.c b/kernel/events/hw_breakpoint_test.c
new file mode 100644
index 000000000000..2cfeeecf8de9
--- /dev/null
+++ b/kernel/events/hw_breakpoint_test.c
@@ -0,0 +1,332 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KUnit test for hw_breakpoint constraints accounting logic.
+ *
+ * Copyright (C) 2022, Google LLC.
+ */
+
+#include <kunit/test.h>
+#include <linux/cpumask.h>
+#include <linux/hw_breakpoint.h>
+#include <linux/kthread.h>
+#include <linux/perf_event.h>
+#include <asm/hw_breakpoint.h>
+
+#define TEST_REQUIRES_BP_SLOTS(test, slots) \
+ do { \
+ if ((slots) > get_test_bp_slots()) { \
+ kunit_skip((test), "Requires breakpoint slots: %d > %d", slots, \
+ get_test_bp_slots()); \
+ } \
+ } while (0)
+
+#define TEST_EXPECT_NOSPC(expr) KUNIT_EXPECT_EQ(test, -ENOSPC, PTR_ERR(expr))
+
+#define MAX_TEST_BREAKPOINTS 512
+
+static char break_vars[MAX_TEST_BREAKPOINTS];
+static struct perf_event *test_bps[MAX_TEST_BREAKPOINTS];
+static struct task_struct *__other_task;
+
+static struct perf_event *register_test_bp(int cpu, struct task_struct *tsk, int idx)
+{
+ struct perf_event_attr attr = {};
+
+ if (WARN_ON(idx < 0 || idx >= MAX_TEST_BREAKPOINTS))
+ return NULL;
+
+ hw_breakpoint_init(&attr);
+ attr.bp_addr = (unsigned long)&break_vars[idx];
+ attr.bp_len = HW_BREAKPOINT_LEN_1;
+ attr.bp_type = HW_BREAKPOINT_RW;
+ return perf_event_create_kernel_counter(&attr, cpu, tsk, NULL, NULL);
+}
+
+static void unregister_test_bp(struct perf_event **bp)
+{
+ if (WARN_ON(IS_ERR(*bp)))
+ return;
+ if (WARN_ON(!*bp))
+ return;
+ unregister_hw_breakpoint(*bp);
+ *bp = NULL;
+}
+
+static int get_test_bp_slots(void)
+{
+ static int slots;
+
+ if (!slots)
+ slots = hw_breakpoint_slots(TYPE_DATA);
+
+ return slots;
+}
+
+static void fill_one_bp_slot(struct kunit *test, int *id, int cpu, struct task_struct *tsk)
+{
+ struct perf_event *bp = register_test_bp(cpu, tsk, *id);
+
+ KUNIT_ASSERT_NOT_NULL(test, bp);
+ KUNIT_ASSERT_FALSE(test, IS_ERR(bp));
+ KUNIT_ASSERT_NULL(test, test_bps[*id]);
+ test_bps[(*id)++] = bp;
+}
+
+/*
+ * Fills up the given @cpu/@tsk with breakpoints, only leaving @skip slots free.
+ *
+ * Returns true if this can be called again, continuing at @id.
+ */
+static bool fill_bp_slots(struct kunit *test, int *id, int cpu, struct task_struct *tsk, int skip)
+{
+ for (int i = 0; i < get_test_bp_slots() - skip; ++i)
+ fill_one_bp_slot(test, id, cpu, tsk);
+
+ return *id + get_test_bp_slots() <= MAX_TEST_BREAKPOINTS;
+}
+
+static int dummy_kthread(void *arg)
+{
+ return 0;
+}
+
+static struct task_struct *get_other_task(struct kunit *test)
+{
+ struct task_struct *tsk;
+
+ if (__other_task)
+ return __other_task;
+
+ tsk = kthread_create(dummy_kthread, NULL, "hw_breakpoint_dummy_task");
+ KUNIT_ASSERT_FALSE(test, IS_ERR(tsk));
+ __other_task = tsk;
+ return __other_task;
+}
+
+static int get_test_cpu(int num)
+{
+ int cpu;
+
+ WARN_ON(num < 0);
+
+ for_each_online_cpu(cpu) {
+ if (num-- <= 0)
+ break;
+ }
+
+ return cpu;
+}
+
+/* ===== Test cases ===== */
+
+static void test_one_cpu(struct kunit *test)
+{
+ int idx = 0;
+
+ fill_bp_slots(test, &idx, get_test_cpu(0), NULL, 0);
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+}
+
+static void test_many_cpus(struct kunit *test)
+{
+ int idx = 0;
+ int cpu;
+
+ /* Test that CPUs are independent. */
+ for_each_online_cpu(cpu) {
+ bool do_continue = fill_bp_slots(test, &idx, cpu, NULL, 0);
+
+ TEST_EXPECT_NOSPC(register_test_bp(cpu, NULL, idx));
+ if (!do_continue)
+ break;
+ }
+}
+
+static void test_one_task_on_all_cpus(struct kunit *test)
+{
+ int idx = 0;
+
+ fill_bp_slots(test, &idx, -1, current, 0);
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+ /* Remove one and adding back CPU-target should work. */
+ unregister_test_bp(&test_bps[0]);
+ fill_one_bp_slot(test, &idx, get_test_cpu(0), NULL);
+}
+
+static void test_two_tasks_on_all_cpus(struct kunit *test)
+{
+ int idx = 0;
+
+ /* Test that tasks are independent. */
+ fill_bp_slots(test, &idx, -1, current, 0);
+ fill_bp_slots(test, &idx, -1, get_other_task(test), 0);
+
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(-1, get_other_task(test), idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), get_other_task(test), idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+ /* Remove one from first task and adding back CPU-target should not work. */
+ unregister_test_bp(&test_bps[0]);
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+}
+
+static void test_one_task_on_one_cpu(struct kunit *test)
+{
+ int idx = 0;
+
+ fill_bp_slots(test, &idx, get_test_cpu(0), current, 0);
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+ /*
+ * Remove one and adding back CPU-target should work; this case is
+ * special vs. above because the task's constraints are CPU-dependent.
+ */
+ unregister_test_bp(&test_bps[0]);
+ fill_one_bp_slot(test, &idx, get_test_cpu(0), NULL);
+}
+
+static void test_one_task_mixed(struct kunit *test)
+{
+ int idx = 0;
+
+ TEST_REQUIRES_BP_SLOTS(test, 3);
+
+ fill_one_bp_slot(test, &idx, get_test_cpu(0), current);
+ fill_bp_slots(test, &idx, -1, current, 1);
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+
+ /* Transition from CPU-dependent pinned count to CPU-independent. */
+ unregister_test_bp(&test_bps[0]);
+ unregister_test_bp(&test_bps[1]);
+ fill_one_bp_slot(test, &idx, get_test_cpu(0), NULL);
+ fill_one_bp_slot(test, &idx, get_test_cpu(0), NULL);
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+}
+
+static void test_two_tasks_on_one_cpu(struct kunit *test)
+{
+ int idx = 0;
+
+ fill_bp_slots(test, &idx, get_test_cpu(0), current, 0);
+ fill_bp_slots(test, &idx, get_test_cpu(0), get_other_task(test), 0);
+
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(-1, get_other_task(test), idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), get_other_task(test), idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+ /* Can still create breakpoints on some other CPU. */
+ fill_bp_slots(test, &idx, get_test_cpu(1), NULL, 0);
+}
+
+static void test_two_tasks_on_one_all_cpus(struct kunit *test)
+{
+ int idx = 0;
+
+ fill_bp_slots(test, &idx, get_test_cpu(0), current, 0);
+ fill_bp_slots(test, &idx, -1, get_other_task(test), 0);
+
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(-1, get_other_task(test), idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), get_other_task(test), idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+ /* Cannot create breakpoints on some other CPU either. */
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(1), NULL, idx));
+}
+
+static void test_task_on_all_and_one_cpu(struct kunit *test)
+{
+ int tsk_on_cpu_idx, cpu_idx;
+ int idx = 0;
+
+ TEST_REQUIRES_BP_SLOTS(test, 3);
+
+ fill_bp_slots(test, &idx, -1, current, 2);
+ /* Transitioning from only all CPU breakpoints to mixed. */
+ tsk_on_cpu_idx = idx;
+ fill_one_bp_slot(test, &idx, get_test_cpu(0), current);
+ fill_one_bp_slot(test, &idx, -1, current);
+
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+
+ /* We should still be able to use up another CPU's slots. */
+ cpu_idx = idx;
+ fill_one_bp_slot(test, &idx, get_test_cpu(1), NULL);
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(1), NULL, idx));
+
+ /* Transitioning back to task target on all CPUs. */
+ unregister_test_bp(&test_bps[tsk_on_cpu_idx]);
+ /* Still have a CPU target breakpoint in get_test_cpu(1). */
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ /* Remove it and try again. */
+ unregister_test_bp(&test_bps[cpu_idx]);
+ fill_one_bp_slot(test, &idx, -1, current);
+
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(1), NULL, idx));
+}
+
+static struct kunit_case hw_breakpoint_test_cases[] = {
+ KUNIT_CASE(test_one_cpu),
+ KUNIT_CASE(test_many_cpus),
+ KUNIT_CASE(test_one_task_on_all_cpus),
+ KUNIT_CASE(test_two_tasks_on_all_cpus),
+ KUNIT_CASE(test_one_task_on_one_cpu),
+ KUNIT_CASE(test_one_task_mixed),
+ KUNIT_CASE(test_two_tasks_on_one_cpu),
+ KUNIT_CASE(test_two_tasks_on_one_all_cpus),
+ KUNIT_CASE(test_task_on_all_and_one_cpu),
+ {},
+};
+
+static int test_init(struct kunit *test)
+{
+ /* Most test cases want 2 distinct CPUs. */
+ if (num_online_cpus() < 2)
+ kunit_skip(test, "not enough cpus");
+
+ /* Want the system to not use breakpoints elsewhere. */
+ if (hw_breakpoint_is_used())
+ kunit_skip(test, "hw breakpoint already in use");
+
+ return 0;
+}
+
+static void test_exit(struct kunit *test)
+{
+ for (int i = 0; i < MAX_TEST_BREAKPOINTS; ++i) {
+ if (test_bps[i])
+ unregister_test_bp(&test_bps[i]);
+ }
+
+ if (__other_task) {
+ kthread_stop(__other_task);
+ __other_task = NULL;
+ }
+
+ /* Verify that internal state agrees that no breakpoints are in use. */
+ KUNIT_EXPECT_FALSE(test, hw_breakpoint_is_used());
+}
+
+static struct kunit_suite hw_breakpoint_test_suite = {
+ .name = "hw_breakpoint",
+ .test_cases = hw_breakpoint_test_cases,
+ .init = test_init,
+ .exit = test_exit,
+};
+
+kunit_test_suites(&hw_breakpoint_test_suite);
+
+MODULE_AUTHOR("Marco Elver <elver@google.com>");
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 9f6ce9ba4a04..d9cc57083091 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -1,15 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _KERNEL_EVENTS_INTERNAL_H
#define _KERNEL_EVENTS_INTERNAL_H
#include <linux/hardirq.h>
#include <linux/uaccess.h>
+#include <linux/refcount.h>
/* Buffer handling */
#define RING_BUFFER_WRITABLE 0x01
-struct ring_buffer {
- atomic_t refcount;
+struct perf_buffer {
+ refcount_t refcount;
struct rcu_head rcu_head;
#ifdef CONFIG_PERF_USE_VMALLOC
struct work_struct work;
@@ -17,11 +19,12 @@ struct ring_buffer {
#endif
int nr_pages; /* nr of data pages */
int overwrite; /* can overwrite itself */
+ int paused; /* can write into ring buffer */
atomic_t poll; /* POLL_ for wakeups */
local_t head; /* write position */
- local_t nest; /* nested writers */
+ unsigned int nest; /* nested writers */
local_t events; /* event limit */
local_t wakeup; /* wakeup stamp */
local_t lost; /* nr records lost */
@@ -32,39 +35,59 @@ struct ring_buffer {
spinlock_t event_lock;
struct list_head event_list;
- atomic_t mmap_count;
+ refcount_t mmap_count;
unsigned long mmap_locked;
struct user_struct *mmap_user;
/* AUX area */
- local_t aux_head;
- local_t aux_nest;
- local_t aux_wakeup;
+ struct mutex aux_mutex;
+ long aux_head;
+ unsigned int aux_nest;
+ long aux_wakeup; /* last aux_watermark boundary crossed by aux_head */
unsigned long aux_pgoff;
int aux_nr_pages;
int aux_overwrite;
- atomic_t aux_mmap_count;
+ refcount_t aux_mmap_count;
unsigned long aux_mmap_locked;
void (*free_aux)(void *);
- atomic_t aux_refcount;
+ refcount_t aux_refcount;
+ int aux_in_sampling;
+ int aux_in_pause_resume;
void **aux_pages;
void *aux_priv;
struct perf_event_mmap_page *user_page;
- void *data_pages[0];
+ void *data_pages[];
};
-extern void rb_free(struct ring_buffer *rb);
-extern struct ring_buffer *
+extern void rb_free(struct perf_buffer *rb);
+
+static inline void rb_free_rcu(struct rcu_head *rcu_head)
+{
+ struct perf_buffer *rb;
+
+ rb = container_of(rcu_head, struct perf_buffer, rcu_head);
+ rb_free(rb);
+}
+
+static inline void rb_toggle_paused(struct perf_buffer *rb, bool pause)
+{
+ if (!pause && rb->nr_pages)
+ rb->paused = 0;
+ else
+ rb->paused = 1;
+}
+
+extern struct perf_buffer *
rb_alloc(int nr_pages, long watermark, int cpu, int flags);
extern void perf_event_wakeup(struct perf_event *event);
-extern int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
+extern int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event,
pgoff_t pgoff, int nr_pages, long watermark, int flags);
-extern void rb_free_aux(struct ring_buffer *rb);
-extern struct ring_buffer *ring_buffer_get(struct perf_event *event);
-extern void ring_buffer_put(struct ring_buffer *rb);
+extern void rb_free_aux(struct perf_buffer *rb);
+extern struct perf_buffer *ring_buffer_get(struct perf_event *event);
+extern void ring_buffer_put(struct perf_buffer *rb);
-static inline bool rb_has_aux(struct ring_buffer *rb)
+static inline bool rb_has_aux(struct perf_buffer *rb)
{
return !!rb->aux_nr_pages;
}
@@ -72,17 +95,8 @@ static inline bool rb_has_aux(struct ring_buffer *rb)
void perf_event_aux_event(struct perf_event *event, unsigned long head,
unsigned long size, u64 flags);
-extern void
-perf_event_header__init_id(struct perf_event_header *header,
- struct perf_sample_data *data,
- struct perf_event *event);
-extern void
-perf_event__output_id_sample(struct perf_event *event,
- struct perf_output_handle *handle,
- struct perf_sample_data *sample);
-
extern struct page *
-perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff);
+perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff);
#ifdef CONFIG_PERF_USE_VMALLOC
/*
@@ -91,47 +105,50 @@ perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff);
* Required for architectures that have d-cache aliasing issues.
*/
-static inline int page_order(struct ring_buffer *rb)
+static inline int page_order(struct perf_buffer *rb)
{
return rb->page_order;
}
#else
-static inline int page_order(struct ring_buffer *rb)
+static inline int page_order(struct perf_buffer *rb)
{
return 0;
}
#endif
-static inline unsigned long perf_data_size(struct ring_buffer *rb)
+static inline int data_page_nr(struct perf_buffer *rb)
+{
+ return rb->nr_pages << page_order(rb);
+}
+
+static inline unsigned long perf_data_size(struct perf_buffer *rb)
{
return rb->nr_pages << (PAGE_SHIFT + page_order(rb));
}
-static inline unsigned long perf_aux_size(struct ring_buffer *rb)
+static inline unsigned long perf_aux_size(struct perf_buffer *rb)
{
- return rb->aux_nr_pages << PAGE_SHIFT;
+ return (unsigned long)rb->aux_nr_pages << PAGE_SHIFT;
}
-#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \
-static inline unsigned long \
-func_name(struct perf_output_handle *handle, \
- const void *buf, unsigned long len) \
+#define __DEFINE_OUTPUT_COPY_BODY(advance_buf, memcpy_func, ...) \
{ \
unsigned long size, written; \
\
do { \
size = min(handle->size, len); \
- written = memcpy_func(handle->addr, buf, size); \
+ written = memcpy_func(__VA_ARGS__); \
written = size - written; \
\
len -= written; \
handle->addr += written; \
- buf += written; \
+ if (advance_buf) \
+ buf += written; \
handle->size -= written; \
if (!handle->size) { \
- struct ring_buffer *rb = handle->rb; \
+ struct perf_buffer *rb = handle->rb; \
\
handle->page++; \
handle->page &= rb->nr_pages - 1; \
@@ -143,6 +160,21 @@ func_name(struct perf_output_handle *handle, \
return len; \
}
+#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \
+static inline unsigned long \
+func_name(struct perf_output_handle *handle, \
+ const void *buf, unsigned long len) \
+__DEFINE_OUTPUT_COPY_BODY(true, memcpy_func, handle->addr, buf, size)
+
+static inline unsigned long
+__output_custom(struct perf_output_handle *handle, perf_copy_f copy_func,
+ const void *buf, unsigned long len)
+{
+ unsigned long orig_len = len;
+ __DEFINE_OUTPUT_COPY_BODY(false, copy_func, handle->addr, buf,
+ orig_len - len, size)
+}
+
static inline unsigned long
memcpy_common(void *dst, const void *src, unsigned long n)
{
@@ -178,24 +210,9 @@ arch_perf_out_copy_user(void *dst, const void *src, unsigned long n)
DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user)
-/* Callchain handling */
-extern struct perf_callchain_entry *
-perf_callchain(struct perf_event *event, struct pt_regs *regs);
-extern int get_callchain_buffers(void);
-extern void put_callchain_buffers(void);
-
-static inline int get_recursion_context(int *recursion)
+static inline int get_recursion_context(u8 *recursion)
{
- int rctx;
-
- if (in_nmi())
- rctx = 3;
- else if (in_irq())
- rctx = 2;
- else if (in_softirq())
- rctx = 1;
- else
- rctx = 0;
+ unsigned char rctx = interrupt_context_level();
if (recursion[rctx])
return -1;
@@ -206,7 +223,7 @@ static inline int get_recursion_context(int *recursion)
return rctx;
}
-static inline void put_recursion_context(int *recursion, int rctx)
+static inline void put_recursion_context(u8 *recursion, unsigned char rctx)
{
barrier();
recursion[rctx]--;
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 725c416085e3..20a905023736 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -1,12 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Performance events ring-buffer code:
*
* Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
* Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
- * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
* Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
- *
- * For licensing details see kernel-base/COPYING
*/
#include <linux/perf_event.h>
@@ -14,15 +13,20 @@
#include <linux/slab.h>
#include <linux/circ_buf.h>
#include <linux/poll.h>
+#include <linux/nospec.h>
#include "internal.h"
static void perf_output_wakeup(struct perf_output_handle *handle)
{
- atomic_set(&handle->rb->poll, POLLIN);
+ atomic_set(&handle->rb->poll, EPOLLIN | EPOLLRDNORM);
handle->event->pending_wakeup = 1;
- irq_work_queue(&handle->event->pending);
+
+ if (*perf_event_fasync(handle->event) && !handle->event->pending_kill)
+ handle->event->pending_kill = POLL_IN;
+
+ irq_work_queue(&handle->event->pending_irq);
}
/*
@@ -35,28 +39,51 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
*/
static void perf_output_get_handle(struct perf_output_handle *handle)
{
- struct ring_buffer *rb = handle->rb;
+ struct perf_buffer *rb = handle->rb;
preempt_disable();
- local_inc(&rb->nest);
+
+ /*
+ * Avoid an explicit LOAD/STORE such that architectures with memops
+ * can use them.
+ */
+ (*(volatile unsigned int *)&rb->nest)++;
handle->wakeup = local_read(&rb->wakeup);
}
static void perf_output_put_handle(struct perf_output_handle *handle)
{
- struct ring_buffer *rb = handle->rb;
+ struct perf_buffer *rb = handle->rb;
unsigned long head;
+ unsigned int nest;
+
+ /*
+ * If this isn't the outermost nesting, we don't have to update
+ * @rb->user_page->data_head.
+ */
+ nest = READ_ONCE(rb->nest);
+ if (nest > 1) {
+ WRITE_ONCE(rb->nest, nest - 1);
+ goto out;
+ }
again:
+ /*
+ * In order to avoid publishing a head value that goes backwards,
+ * we must ensure the load of @rb->head happens after we've
+ * incremented @rb->nest.
+ *
+ * Otherwise we can observe a @rb->head value before one published
+ * by an IRQ/NMI happening between the load and the increment.
+ */
+ barrier();
head = local_read(&rb->head);
/*
- * IRQ/NMI can happen here, which means we can miss a head update.
+ * IRQ/NMI can happen here and advance @rb->head, causing our
+ * load above to be stale.
*/
- if (!local_dec_and_test(&rb->nest))
- goto out;
-
/*
* Since the mmap() consumer (userspace) can run on a different CPU:
*
@@ -84,14 +111,23 @@ again:
* See perf_output_begin().
*/
smp_wmb(); /* B, matches C */
- rb->user_page->data_head = head;
+ WRITE_ONCE(rb->user_page->data_head, head);
/*
- * Now check if we missed an update -- rely on previous implied
- * compiler barriers to force a re-read.
+ * We must publish the head before decrementing the nest count,
+ * otherwise an IRQ/NMI can publish a more recent head value and our
+ * write will (temporarily) publish a stale value.
*/
+ barrier();
+ WRITE_ONCE(rb->nest, 0);
+
+ /*
+ * Ensure we decrement @rb->nest before we validate the @rb->head.
+ * Otherwise we cannot be sure we caught the 'last' nested update.
+ */
+ barrier();
if (unlikely(head != local_read(&rb->head))) {
- local_inc(&rb->nest);
+ WRITE_ONCE(rb->nest, 1);
goto again;
}
@@ -102,10 +138,24 @@ out:
preempt_enable();
}
-int perf_output_begin(struct perf_output_handle *handle,
- struct perf_event *event, unsigned int size)
+static __always_inline bool
+ring_buffer_has_space(unsigned long head, unsigned long tail,
+ unsigned long data_size, unsigned int size,
+ bool backward)
{
- struct ring_buffer *rb;
+ if (!backward)
+ return CIRC_SPACE(head, tail, data_size) >= size;
+ else
+ return CIRC_SPACE(tail, head, data_size) >= size;
+}
+
+static __always_inline int
+__perf_output_begin(struct perf_output_handle *handle,
+ struct perf_sample_data *data,
+ struct perf_event *event, unsigned int size,
+ bool backward)
+{
+ struct perf_buffer *rb;
unsigned long tail, offset, head;
int have_lost, page_shift;
struct {
@@ -125,11 +175,17 @@ int perf_output_begin(struct perf_output_handle *handle,
if (unlikely(!rb))
goto out;
- if (unlikely(!rb->nr_pages))
+ if (unlikely(rb->paused)) {
+ if (rb->nr_pages) {
+ local_inc(&rb->lost);
+ atomic64_inc(&event->lost_samples);
+ }
goto out;
+ }
handle->rb = rb;
handle->event = event;
+ handle->flags = 0;
have_lost = local_read(&rb->lost);
if (unlikely(have_lost)) {
@@ -140,12 +196,16 @@ int perf_output_begin(struct perf_output_handle *handle,
perf_output_get_handle(handle);
+ offset = local_read(&rb->head);
do {
- tail = ACCESS_ONCE(rb->user_page->data_tail);
- offset = head = local_read(&rb->head);
- if (!rb->overwrite &&
- unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size))
- goto fail;
+ head = offset;
+ tail = READ_ONCE(rb->user_page->data_tail);
+ if (!rb->overwrite) {
+ if (unlikely(!ring_buffer_has_space(head, tail,
+ perf_data_size(rb),
+ size, backward)))
+ goto fail;
+ }
/*
* The above forms a control dependency barrier separating the
@@ -159,8 +219,16 @@ int perf_output_begin(struct perf_output_handle *handle,
* See perf_output_put_handle().
*/
- head += size;
- } while (local_cmpxchg(&rb->head, offset, head) != offset);
+ if (!backward)
+ head += size;
+ else
+ head -= size;
+ } while (!local_try_cmpxchg(&rb->head, &offset, head));
+
+ if (backward) {
+ offset = head;
+ head = (u64)(-head);
+ }
/*
* We rely on the implied barrier() by local_cmpxchg() to ensure
@@ -178,24 +246,23 @@ int perf_output_begin(struct perf_output_handle *handle,
handle->size = (1UL << page_shift) - offset;
if (unlikely(have_lost)) {
- struct perf_sample_data sample_data;
-
lost_event.header.size = sizeof(lost_event);
lost_event.header.type = PERF_RECORD_LOST;
lost_event.header.misc = 0;
lost_event.id = event->id;
lost_event.lost = local_xchg(&rb->lost, 0);
- perf_event_header__init_id(&lost_event.header,
- &sample_data, event);
+ /* XXX mostly redundant; @data is already fully initializes */
+ perf_event_header__init_id(&lost_event.header, data, event);
perf_output_put(handle, lost_event);
- perf_event__output_id_sample(event, handle, &sample_data);
+ perf_event__output_id_sample(event, handle, data);
}
return 0;
fail:
local_inc(&rb->lost);
+ atomic64_inc(&event->lost_samples);
perf_output_put_handle(handle);
out:
rcu_read_unlock();
@@ -203,6 +270,29 @@ out:
return -ENOSPC;
}
+int perf_output_begin_forward(struct perf_output_handle *handle,
+ struct perf_sample_data *data,
+ struct perf_event *event, unsigned int size)
+{
+ return __perf_output_begin(handle, data, event, size, false);
+}
+
+int perf_output_begin_backward(struct perf_output_handle *handle,
+ struct perf_sample_data *data,
+ struct perf_event *event, unsigned int size)
+{
+ return __perf_output_begin(handle, data, event, size, true);
+}
+
+int perf_output_begin(struct perf_output_handle *handle,
+ struct perf_sample_data *data,
+ struct perf_event *event, unsigned int size)
+{
+
+ return __perf_output_begin(handle, data, event, size,
+ unlikely(is_write_backward(event)));
+}
+
unsigned int perf_output_copy(struct perf_output_handle *handle,
const void *buf, unsigned int len)
{
@@ -222,7 +312,7 @@ void perf_output_end(struct perf_output_handle *handle)
}
static void
-ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
+ring_buffer_init(struct perf_buffer *rb, long watermark, int flags)
{
long max_size = perf_data_size(rb);
@@ -237,11 +327,33 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
else
rb->overwrite = 1;
- atomic_set(&rb->refcount, 1);
+ refcount_set(&rb->refcount, 1);
INIT_LIST_HEAD(&rb->event_list);
spin_lock_init(&rb->event_lock);
+
+ /*
+ * perf_output_begin() only checks rb->paused, therefore
+ * rb->paused must be true if we have no pages for output.
+ */
+ if (!rb->nr_pages)
+ rb->paused = 1;
+
+ mutex_init(&rb->aux_mutex);
+}
+
+void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags)
+{
+ /*
+ * OVERWRITE is determined by perf_aux_output_end() and can't
+ * be passed in directly.
+ */
+ if (WARN_ON_ONCE(flags & PERF_AUX_FLAG_OVERWRITE))
+ return;
+
+ handle->aux_flags |= flags;
}
+EXPORT_SYMBOL_GPL(perf_aux_output_flag);
/*
* This is called before hardware starts writing to the AUX area to
@@ -252,13 +364,18 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
* The ordering is similar to that of perf_output_{begin,end}, with
* the exception of (B), which should be taken care of by the pmu
* driver, since ordering rules will differ depending on hardware.
+ *
+ * Call this from pmu::start(); see the comment in perf_aux_output_end()
+ * about its use in pmu callbacks. Both can also be called from the PMI
+ * handler if needed.
*/
void *perf_aux_output_begin(struct perf_output_handle *handle,
struct perf_event *event)
{
struct perf_event *output_event = event;
unsigned long aux_head, aux_tail;
- struct ring_buffer *rb;
+ struct perf_buffer *rb;
+ unsigned int nest;
if (output_event->parent)
output_event = output_event->parent;
@@ -272,22 +389,40 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
if (!rb)
return NULL;
- if (!rb_has_aux(rb) || !atomic_inc_not_zero(&rb->aux_refcount))
+ if (!rb_has_aux(rb))
+ goto err;
+
+ /*
+ * If aux_mmap_count is zero, the aux buffer is in perf_mmap_close(),
+ * about to get freed, so we leave immediately.
+ *
+ * Checking rb::aux_mmap_count and rb::refcount has to be done in
+ * the same order, see perf_mmap_close. Otherwise we end up freeing
+ * aux pages in this path, which is a bug, because in_atomic().
+ */
+ if (!refcount_read(&rb->aux_mmap_count))
+ goto err;
+
+ if (!refcount_inc_not_zero(&rb->aux_refcount))
goto err;
+ nest = READ_ONCE(rb->aux_nest);
/*
* Nesting is not supported for AUX area, make sure nested
* writers are caught early
*/
- if (WARN_ON_ONCE(local_xchg(&rb->aux_nest, 1)))
+ if (WARN_ON_ONCE(nest))
goto err_put;
- aux_head = local_read(&rb->aux_head);
+ WRITE_ONCE(rb->aux_nest, nest + 1);
+
+ aux_head = rb->aux_head;
handle->rb = rb;
handle->event = event;
handle->head = aux_head;
handle->size = 0;
+ handle->aux_flags = 0;
/*
* In overwrite mode, AUX data stores do not depend on aux_tail,
@@ -295,8 +430,8 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
* (B) <-> (C) ordering is still observed by the pmu driver.
*/
if (!rb->aux_overwrite) {
- aux_tail = ACCESS_ONCE(rb->user_page->aux_tail);
- handle->wakeup = local_read(&rb->aux_wakeup) + rb->aux_watermark;
+ aux_tail = READ_ONCE(rb->user_page->aux_tail);
+ handle->wakeup = rb->aux_wakeup + rb->aux_watermark;
if (aux_head - aux_tail < perf_aux_size(rb))
handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb));
@@ -306,9 +441,9 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
* store that will be enabled on successful return
*/
if (!handle->size) { /* A, matches D */
- event->pending_disable = 1;
+ perf_event_disable_inatomic(handle->event);
perf_output_wakeup(handle);
- local_set(&rb->aux_nest, 0);
+ WRITE_ONCE(rb->aux_nest, 0);
goto err_put;
}
}
@@ -316,6 +451,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
return handle->rb->aux_priv;
err_put:
+ /* can't be last */
rb_free_aux(rb);
err:
@@ -324,54 +460,84 @@ err:
return NULL;
}
+EXPORT_SYMBOL_GPL(perf_aux_output_begin);
+
+static __always_inline bool rb_need_aux_wakeup(struct perf_buffer *rb)
+{
+ if (rb->aux_overwrite)
+ return false;
+
+ if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) {
+ rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark);
+ return true;
+ }
+
+ return false;
+}
/*
* Commit the data written by hardware into the ring buffer by adjusting
* aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the
* pmu driver's responsibility to observe ordering rules of the hardware,
* so that all the data is externally visible before this is called.
+ *
+ * Note: this has to be called from pmu::stop() callback, as the assumption
+ * of the AUX buffer management code is that after pmu::stop(), the AUX
+ * transaction must be stopped and therefore drop the AUX reference count.
*/
-void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size,
- bool truncated)
+void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
{
- struct ring_buffer *rb = handle->rb;
+ bool wakeup = !!(handle->aux_flags & PERF_AUX_FLAG_TRUNCATED);
+ struct perf_buffer *rb = handle->rb;
unsigned long aux_head;
- u64 flags = 0;
-
- if (truncated)
- flags |= PERF_AUX_FLAG_TRUNCATED;
/* in overwrite mode, driver provides aux_head via handle */
if (rb->aux_overwrite) {
- flags |= PERF_AUX_FLAG_OVERWRITE;
+ handle->aux_flags |= PERF_AUX_FLAG_OVERWRITE;
aux_head = handle->head;
- local_set(&rb->aux_head, aux_head);
+ rb->aux_head = aux_head;
} else {
- aux_head = local_read(&rb->aux_head);
- local_add(size, &rb->aux_head);
- }
+ handle->aux_flags &= ~PERF_AUX_FLAG_OVERWRITE;
- if (size || flags) {
- /*
- * Only send RECORD_AUX if we have something useful to communicate
- */
-
- perf_event_aux_event(handle->event, aux_head, size, flags);
+ aux_head = rb->aux_head;
+ rb->aux_head += size;
}
- aux_head = rb->user_page->aux_head = local_read(&rb->aux_head);
+ /*
+ * Only send RECORD_AUX if we have something useful to communicate
+ *
+ * Note: the OVERWRITE records by themselves are not considered
+ * useful, as they don't communicate any *new* information,
+ * aside from the short-lived offset, that becomes history at
+ * the next event sched-in and therefore isn't useful.
+ * The userspace that needs to copy out AUX data in overwrite
+ * mode should know to use user_page::aux_head for the actual
+ * offset. So, from now on we don't output AUX records that
+ * have *only* OVERWRITE flag set.
+ */
+ if (size || (handle->aux_flags & ~(u64)PERF_AUX_FLAG_OVERWRITE))
+ perf_event_aux_event(handle->event, aux_head, size,
+ handle->aux_flags);
+
+ WRITE_ONCE(rb->user_page->aux_head, rb->aux_head);
+ if (rb_need_aux_wakeup(rb))
+ wakeup = true;
- if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) {
+ if (wakeup) {
+ if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED)
+ perf_event_disable_inatomic(handle->event);
perf_output_wakeup(handle);
- local_add(rb->aux_watermark, &rb->aux_wakeup);
}
+
handle->event = NULL;
- local_set(&rb->aux_nest, 0);
+ WRITE_ONCE(rb->aux_nest, 0);
+ /* can't be last */
rb_free_aux(rb);
ring_buffer_put(rb);
}
+EXPORT_SYMBOL_GPL(perf_aux_output_end);
/*
* Skip over a given number of bytes in the AUX buffer, due to, for example,
@@ -379,27 +545,25 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size,
*/
int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size)
{
- struct ring_buffer *rb = handle->rb;
- unsigned long aux_head;
+ struct perf_buffer *rb = handle->rb;
if (size > handle->size)
return -ENOSPC;
- local_add(size, &rb->aux_head);
+ rb->aux_head += size;
- aux_head = rb->user_page->aux_head = local_read(&rb->aux_head);
- if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) {
+ WRITE_ONCE(rb->user_page->aux_head, rb->aux_head);
+ if (rb_need_aux_wakeup(rb)) {
perf_output_wakeup(handle);
- local_add(rb->aux_watermark, &rb->aux_wakeup);
- handle->wakeup = local_read(&rb->aux_wakeup) +
- rb->aux_watermark;
+ handle->wakeup = rb->aux_wakeup + rb->aux_watermark;
}
- handle->head = aux_head;
+ handle->head = rb->aux_head;
handle->size -= size;
return 0;
}
+EXPORT_SYMBOL_GPL(perf_aux_output_skip);
void *perf_get_aux(struct perf_output_handle *handle)
{
@@ -409,6 +573,43 @@ void *perf_get_aux(struct perf_output_handle *handle)
return handle->rb->aux_priv;
}
+EXPORT_SYMBOL_GPL(perf_get_aux);
+
+/*
+ * Copy out AUX data from an AUX handle.
+ */
+long perf_output_copy_aux(struct perf_output_handle *aux_handle,
+ struct perf_output_handle *handle,
+ unsigned long from, unsigned long to)
+{
+ struct perf_buffer *rb = aux_handle->rb;
+ unsigned long tocopy, remainder, len = 0;
+ void *addr;
+
+ from &= (rb->aux_nr_pages << PAGE_SHIFT) - 1;
+ to &= (rb->aux_nr_pages << PAGE_SHIFT) - 1;
+
+ do {
+ tocopy = PAGE_SIZE - offset_in_page(from);
+ if (to > from)
+ tocopy = min(tocopy, to - from);
+ if (!tocopy)
+ break;
+
+ addr = rb->aux_pages[from >> PAGE_SHIFT];
+ addr += offset_in_page(from);
+
+ remainder = perf_output_copy(handle, addr, tocopy);
+ if (remainder)
+ return -EFAULT;
+
+ len += tocopy;
+ from += tocopy;
+ from &= (rb->aux_nr_pages << PAGE_SHIFT) - 1;
+ } while (to != from);
+
+ return len;
+}
#define PERF_AUX_GFP (GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY)
@@ -416,8 +617,8 @@ static struct page *rb_alloc_aux_page(int node, int order)
{
struct page *page;
- if (order > MAX_ORDER)
- order = MAX_ORDER;
+ if (order > MAX_PAGE_ORDER)
+ order = MAX_PAGE_ORDER;
do {
page = alloc_pages_node(node, PERF_AUX_GFP, order);
@@ -425,7 +626,10 @@ static struct page *rb_alloc_aux_page(int node, int order)
if (page && order) {
/*
- * Communicate the allocation size to the driver
+ * Communicate the allocation size to the driver:
+ * if we managed to secure a high-order allocation,
+ * set its first page's private to this order;
+ * !PagePrivate(page) means it's just a normal page.
*/
split_page(page, order);
SetPagePrivate(page);
@@ -435,46 +639,97 @@ static struct page *rb_alloc_aux_page(int node, int order)
return page;
}
-static void rb_free_aux_page(struct ring_buffer *rb, int idx)
+static void rb_free_aux_page(struct perf_buffer *rb, int idx)
{
struct page *page = virt_to_page(rb->aux_pages[idx]);
ClearPagePrivate(page);
- page->mapping = NULL;
__free_page(page);
}
-int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
+static void __rb_free_aux(struct perf_buffer *rb)
+{
+ int pg;
+
+ /*
+ * Should never happen, the last reference should be dropped from
+ * perf_mmap_close() path, which first stops aux transactions (which
+ * in turn are the atomic holders of aux_refcount) and then does the
+ * last rb_free_aux().
+ */
+ WARN_ON_ONCE(in_atomic());
+
+ if (rb->aux_priv) {
+ rb->free_aux(rb->aux_priv);
+ rb->free_aux = NULL;
+ rb->aux_priv = NULL;
+ }
+
+ if (rb->aux_nr_pages) {
+ for (pg = 0; pg < rb->aux_nr_pages; pg++)
+ rb_free_aux_page(rb, pg);
+
+ kfree(rb->aux_pages);
+ rb->aux_nr_pages = 0;
+ }
+}
+
+int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event,
pgoff_t pgoff, int nr_pages, long watermark, int flags)
{
bool overwrite = !(flags & RING_BUFFER_WRITABLE);
int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu);
- int ret = -ENOMEM, max_order = 0;
+ bool use_contiguous_pages = event->pmu->capabilities & (
+ PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_AUX_PREFER_LARGE);
+ /*
+ * Initialize max_order to 0 for page allocation. This allocates single
+ * pages to minimize memory fragmentation. This is overridden if the
+ * PMU needs or prefers contiguous pages (use_contiguous_pages = true).
+ */
+ int max_order = 0;
+ int ret = -ENOMEM;
if (!has_aux(event))
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
+
+ if (nr_pages <= 0)
+ return -EINVAL;
- if (event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) {
+ if (!overwrite) {
/*
- * We need to start with the max_order that fits in nr_pages,
- * not the other way around, hence ilog2() and not get_order.
+ * Watermark defaults to half the buffer, to aid PMU drivers
+ * in double buffering.
*/
- max_order = ilog2(nr_pages);
+ if (!watermark)
+ watermark = min_t(unsigned long,
+ U32_MAX,
+ (unsigned long)nr_pages << (PAGE_SHIFT - 1));
/*
- * PMU requests more than one contiguous chunks of memory
- * for SW double buffering
+ * If using contiguous pages, use aux_watermark as the basis
+ * for chunking to help PMU drivers honor the watermark.
*/
- if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_SW_DOUBLEBUF) &&
- !overwrite) {
- if (!max_order)
- return -EINVAL;
-
- max_order--;
- }
+ if (use_contiguous_pages)
+ max_order = get_order(watermark);
+ } else {
+ /*
+ * If using contiguous pages, we need to start with the
+ * max_order that fits in nr_pages, not the other way around,
+ * hence ilog2() and not get_order.
+ */
+ if (use_contiguous_pages)
+ max_order = ilog2(nr_pages);
+ watermark = 0;
}
- rb->aux_pages = kzalloc_node(nr_pages * sizeof(void *), GFP_KERNEL, node);
+ /*
+ * kcalloc_node() is unable to allocate buffer if the size is larger
+ * than: PAGE_SIZE << MAX_PAGE_ORDER; directly bail out in this case.
+ */
+ if (get_order((unsigned long)nr_pages * sizeof(void *)) > MAX_PAGE_ORDER)
+ return -ENOMEM;
+ rb->aux_pages = kcalloc_node(nr_pages, sizeof(void *), GFP_KERNEL,
+ node);
if (!rb->aux_pages)
return -ENOMEM;
@@ -507,7 +762,7 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
goto out;
}
- rb->aux_priv = event->pmu->setup_aux(event->cpu, rb->aux_pages, nr_pages,
+ rb->aux_priv = event->pmu->setup_aux(event, rb->aux_pages, nr_pages,
overwrite);
if (!rb->aux_priv)
goto out;
@@ -520,43 +775,23 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
* we keep a refcount here to make sure either of the two can
* reference them safely.
*/
- atomic_set(&rb->aux_refcount, 1);
+ refcount_set(&rb->aux_refcount, 1);
rb->aux_overwrite = overwrite;
rb->aux_watermark = watermark;
- if (!rb->aux_watermark && !rb->aux_overwrite)
- rb->aux_watermark = nr_pages << (PAGE_SHIFT - 1);
-
out:
if (!ret)
rb->aux_pgoff = pgoff;
else
- rb_free_aux(rb);
+ __rb_free_aux(rb);
return ret;
}
-static void __rb_free_aux(struct ring_buffer *rb)
-{
- int pg;
-
- if (rb->aux_priv) {
- rb->free_aux(rb->aux_priv);
- rb->free_aux = NULL;
- rb->aux_priv = NULL;
- }
-
- for (pg = 0; pg < rb->aux_nr_pages; pg++)
- rb_free_aux_page(rb, pg);
-
- kfree(rb->aux_pages);
- rb->aux_nr_pages = 0;
-}
-
-void rb_free_aux(struct ring_buffer *rb)
+void rb_free_aux(struct perf_buffer *rb)
{
- if (atomic_dec_and_test(&rb->aux_refcount))
+ if (refcount_dec_and_test(&rb->aux_refcount))
__rb_free_aux(rb);
}
@@ -567,7 +802,7 @@ void rb_free_aux(struct ring_buffer *rb)
*/
static struct page *
-__perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
+__perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff)
{
if (pgoff > rb->nr_pages)
return NULL;
@@ -591,16 +826,27 @@ static void *perf_mmap_alloc_page(int cpu)
return page_address(page);
}
-struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
+static void perf_mmap_free_page(void *addr)
+{
+ struct page *page = virt_to_page(addr);
+
+ __free_page(page);
+}
+
+struct perf_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
{
- struct ring_buffer *rb;
+ struct perf_buffer *rb;
unsigned long size;
- int i;
+ int i, node;
- size = sizeof(struct ring_buffer);
+ size = sizeof(struct perf_buffer);
size += nr_pages * sizeof(void *);
- rb = kzalloc(size, GFP_KERNEL);
+ if (order_base_2(size) > PAGE_SHIFT+MAX_PAGE_ORDER)
+ goto fail;
+
+ node = (cpu == -1) ? cpu : cpu_to_node(cpu);
+ rb = kzalloc_node(size, GFP_KERNEL, node);
if (!rb)
goto fail;
@@ -622,9 +868,9 @@ struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
fail_data_pages:
for (i--; i >= 0; i--)
- free_page((unsigned long)rb->data_pages[i]);
+ perf_mmap_free_page(rb->data_pages[i]);
- free_page((unsigned long)rb->user_page);
+ perf_mmap_free_page(rb->user_page);
fail_user_page:
kfree(rb);
@@ -633,32 +879,19 @@ fail:
return NULL;
}
-static void perf_mmap_free_page(unsigned long addr)
-{
- struct page *page = virt_to_page((void *)addr);
-
- page->mapping = NULL;
- __free_page(page);
-}
-
-void rb_free(struct ring_buffer *rb)
+void rb_free(struct perf_buffer *rb)
{
int i;
- perf_mmap_free_page((unsigned long)rb->user_page);
+ perf_mmap_free_page(rb->user_page);
for (i = 0; i < rb->nr_pages; i++)
- perf_mmap_free_page((unsigned long)rb->data_pages[i]);
+ perf_mmap_free_page(rb->data_pages[i]);
kfree(rb);
}
#else
-static int data_page_nr(struct ring_buffer *rb)
-{
- return rb->nr_pages << page_order(rb);
-}
-
static struct page *
-__perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
+__perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff)
{
/* The '>' counts in the user page. */
if (pgoff > data_page_nr(rb))
@@ -667,46 +900,33 @@ __perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE);
}
-static void perf_mmap_unmark_page(void *addr)
-{
- struct page *page = vmalloc_to_page(addr);
-
- page->mapping = NULL;
-}
-
static void rb_free_work(struct work_struct *work)
{
- struct ring_buffer *rb;
- void *base;
- int i, nr;
+ struct perf_buffer *rb;
- rb = container_of(work, struct ring_buffer, work);
- nr = data_page_nr(rb);
+ rb = container_of(work, struct perf_buffer, work);
- base = rb->user_page;
- /* The '<=' counts in the user page. */
- for (i = 0; i <= nr; i++)
- perf_mmap_unmark_page(base + (i * PAGE_SIZE));
-
- vfree(base);
+ vfree(rb->user_page);
kfree(rb);
}
-void rb_free(struct ring_buffer *rb)
+void rb_free(struct perf_buffer *rb)
{
schedule_work(&rb->work);
}
-struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
+struct perf_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
{
- struct ring_buffer *rb;
+ struct perf_buffer *rb;
unsigned long size;
void *all_buf;
+ int node;
- size = sizeof(struct ring_buffer);
+ size = sizeof(struct perf_buffer);
size += sizeof(void *);
- rb = kzalloc(size, GFP_KERNEL);
+ node = (cpu == -1) ? cpu : cpu_to_node(cpu);
+ rb = kzalloc_node(size, GFP_KERNEL, node);
if (!rb)
goto fail;
@@ -718,8 +938,10 @@ struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
rb->user_page = all_buf;
rb->data_pages[0] = all_buf + PAGE_SIZE;
- rb->page_order = ilog2(nr_pages);
- rb->nr_pages = !!nr_pages;
+ if (nr_pages) {
+ rb->nr_pages = 1;
+ rb->page_order = ilog2(nr_pages);
+ }
ring_buffer_init(rb, watermark, flags);
@@ -735,7 +957,7 @@ fail:
#endif
struct page *
-perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
+perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff)
{
if (rb->aux_nr_pages) {
/* above AUX space */
@@ -743,8 +965,10 @@ perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
return NULL;
/* AUX space */
- if (pgoff >= rb->aux_pgoff)
- return virt_to_page(rb->aux_pages[pgoff - rb->aux_pgoff]);
+ if (pgoff >= rb->aux_pgoff) {
+ int aux_pgoff = array_index_nospec(pgoff - rb->aux_pgoff, rb->aux_nr_pages);
+ return virt_to_page(rb->aux_pages[aux_pgoff]);
+ }
}
return __perf_mmap_to_page(rb, pgoff);
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index cb346f26a22d..f11ceb8be8c4 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1,25 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* User-space Probes (UProbes)
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
* Copyright (C) IBM Corporation, 2008-2012
* Authors:
* Srikar Dronamraju
* Jim Keniston
- * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra
*/
#include <linux/kernel.h>
@@ -27,16 +14,22 @@
#include <linux/pagemap.h> /* read_mapping_page */
#include <linux/slab.h>
#include <linux/sched.h>
+#include <linux/sched/mm.h>
#include <linux/export.h>
#include <linux/rmap.h> /* anon_vma_prepare */
-#include <linux/mmu_notifier.h> /* set_pte_at_notify */
-#include <linux/swap.h> /* try_to_free_swap */
+#include <linux/mmu_notifier.h>
+#include <linux/swap.h> /* folio_free_swap */
#include <linux/ptrace.h> /* user_enable_single_step */
#include <linux/kdebug.h> /* notifier mechanism */
-#include "../../mm/internal.h" /* munlock_vma_page */
#include <linux/percpu-rwsem.h>
#include <linux/task_work.h>
#include <linux/shmem_fs.h>
+#include <linux/khugepaged.h>
+#include <linux/rcupdate_trace.h>
+#include <linux/workqueue.h>
+#include <linux/srcu.h>
+#include <linux/oom.h> /* check_stable_address_space */
+#include <linux/pagewalk.h>
#include <linux/uprobes.h>
@@ -50,28 +43,37 @@ static struct rb_root uprobes_tree = RB_ROOT;
*/
#define no_uprobe_events() RB_EMPTY_ROOT(&uprobes_tree)
-static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */
+static DEFINE_RWLOCK(uprobes_treelock); /* serialize rbtree access */
+static seqcount_rwlock_t uprobes_seqcount = SEQCNT_RWLOCK_ZERO(uprobes_seqcount, &uprobes_treelock);
#define UPROBES_HASH_SZ 13
/* serialize uprobe->pending_list */
static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
#define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
-static struct percpu_rw_semaphore dup_mmap_sem;
+DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem);
+
+/* Covers return_instance's uprobe lifetime. */
+DEFINE_STATIC_SRCU(uretprobes_srcu);
/* Have a copy of original instruction */
#define UPROBE_COPY_INSN 0
struct uprobe {
struct rb_node rb_node; /* node in the rb tree */
- atomic_t ref;
+ refcount_t ref;
struct rw_semaphore register_rwsem;
struct rw_semaphore consumer_rwsem;
struct list_head pending_list;
- struct uprobe_consumer *consumers;
+ struct list_head consumers;
struct inode *inode; /* Also hold a ref to inode */
+ union {
+ struct rcu_head rcu;
+ struct work_struct work;
+ };
loff_t offset;
- unsigned long flags;
+ loff_t ref_ctr_offset;
+ unsigned long flags; /* "unsigned long" so bitops work */
/*
* The generic code assumes that it has two members of unknown type
@@ -86,15 +88,15 @@ struct uprobe {
struct arch_uprobe arch;
};
-struct return_instance {
- struct uprobe *uprobe;
- unsigned long func;
- unsigned long orig_ret_vaddr; /* original return address */
- bool chained; /* true, if instance is nested */
-
- struct return_instance *next; /* keep as stack */
+struct delayed_uprobe {
+ struct list_head list;
+ struct uprobe *uprobe;
+ struct mm_struct *mm;
};
+static DEFINE_MUTEX(delayed_uprobe_lock);
+static LIST_HEAD(delayed_uprobe_list);
+
/*
* Execute out of line area: anonymous executable mapping installed
* by the probed task to execute the copy of the original instruction
@@ -105,19 +107,23 @@ struct return_instance {
* allocated.
*/
struct xol_area {
- wait_queue_head_t wq; /* if all slots are busy */
- atomic_t slot_count; /* number of in-use slots */
- unsigned long *bitmap; /* 0 = free slot */
- struct page *page;
+ wait_queue_head_t wq; /* if all slots are busy */
+ unsigned long *bitmap; /* 0 = free slot */
+ struct page *page;
/*
* We keep the vma's vm_start rather than a pointer to the vma
* itself. The probed process or a naughty kernel module could make
* the vma go away, and we must handle that reasonably gracefully.
*/
- unsigned long vaddr; /* Page(s) of instruction slots */
+ unsigned long vaddr; /* Page(s) of instruction slots */
};
+static void uprobe_warn(struct task_struct *t, const char *msg)
+{
+ pr_warn("uprobe: %s:%d failed to %s\n", t->comm, t->pid, msg);
+}
+
/*
* valid_vma: Verify if the specified vma is an executable vma
* Relax restrictions while unregistering: vm_flags might have
@@ -147,73 +153,6 @@ static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr)
}
/**
- * __replace_page - replace page in vma by new page.
- * based on replace_page in mm/ksm.c
- *
- * @vma: vma that holds the pte pointing to page
- * @addr: address the old @page is mapped at
- * @page: the cowed page we are replacing by kpage
- * @kpage: the modified page we replace page by
- *
- * Returns 0 on success, -EFAULT on failure.
- */
-static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
- struct page *page, struct page *kpage)
-{
- struct mm_struct *mm = vma->vm_mm;
- spinlock_t *ptl;
- pte_t *ptep;
- int err;
- /* For mmu_notifiers */
- const unsigned long mmun_start = addr;
- const unsigned long mmun_end = addr + PAGE_SIZE;
- struct mem_cgroup *memcg;
-
- err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg);
- if (err)
- return err;
-
- /* For try_to_free_swap() and munlock_vma_page() below */
- lock_page(page);
-
- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
- err = -EAGAIN;
- ptep = page_check_address(page, mm, addr, &ptl, 0);
- if (!ptep)
- goto unlock;
-
- get_page(kpage);
- page_add_new_anon_rmap(kpage, vma, addr);
- mem_cgroup_commit_charge(kpage, memcg, false);
- lru_cache_add_active_or_unevictable(kpage, vma);
-
- if (!PageAnon(page)) {
- dec_mm_counter(mm, MM_FILEPAGES);
- inc_mm_counter(mm, MM_ANONPAGES);
- }
-
- flush_cache_page(vma, addr, pte_pfn(*ptep));
- ptep_clear_flush_notify(vma, addr, ptep);
- set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
-
- page_remove_rmap(page);
- if (!page_mapped(page))
- try_to_free_swap(page);
- pte_unmap_unlock(ptep, ptl);
-
- if (vma->vm_flags & VM_LOCKED)
- munlock_vma_page(page);
- put_page(page);
-
- err = 0;
- unlock:
- mem_cgroup_cancel_charge(kpage, memcg);
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
- unlock_page(page);
- return err;
-}
-
-/**
* is_swbp_insn - check if instruction is breakpoint instruction.
* @insn: instruction to be checked.
* Default implementation of is_swbp_insn
@@ -238,7 +177,7 @@ bool __weak is_trap_insn(uprobe_opcode_t *insn)
return is_swbp_insn(insn);
}
-static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len)
+void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len)
{
void *kaddr = kmap_atomic(page);
memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len);
@@ -252,7 +191,8 @@ static void copy_to_page(struct page *page, unsigned long vaddr, const void *src
kunmap_atomic(kaddr);
}
-static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode)
+static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *insn,
+ int nbytes, void *data)
{
uprobe_opcode_t old_opcode;
bool is_swbp;
@@ -266,10 +206,10 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
* is a trap variant; uprobes always wins over any other (gdb)
* breakpoint.
*/
- copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE);
+ uprobe_copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE);
is_swbp = is_swbp_insn(&old_opcode);
- if (is_swbp_insn(new_opcode)) {
+ if (is_swbp_insn(insn)) {
if (is_swbp) /* register: already installed? */
return 0;
} else {
@@ -280,6 +220,254 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
return 1;
}
+static struct delayed_uprobe *
+delayed_uprobe_check(struct uprobe *uprobe, struct mm_struct *mm)
+{
+ struct delayed_uprobe *du;
+
+ list_for_each_entry(du, &delayed_uprobe_list, list)
+ if (du->uprobe == uprobe && du->mm == mm)
+ return du;
+ return NULL;
+}
+
+static int delayed_uprobe_add(struct uprobe *uprobe, struct mm_struct *mm)
+{
+ struct delayed_uprobe *du;
+
+ if (delayed_uprobe_check(uprobe, mm))
+ return 0;
+
+ du = kzalloc(sizeof(*du), GFP_KERNEL);
+ if (!du)
+ return -ENOMEM;
+
+ du->uprobe = uprobe;
+ du->mm = mm;
+ list_add(&du->list, &delayed_uprobe_list);
+ return 0;
+}
+
+static void delayed_uprobe_delete(struct delayed_uprobe *du)
+{
+ if (WARN_ON(!du))
+ return;
+ list_del(&du->list);
+ kfree(du);
+}
+
+static void delayed_uprobe_remove(struct uprobe *uprobe, struct mm_struct *mm)
+{
+ struct list_head *pos, *q;
+ struct delayed_uprobe *du;
+
+ if (!uprobe && !mm)
+ return;
+
+ list_for_each_safe(pos, q, &delayed_uprobe_list) {
+ du = list_entry(pos, struct delayed_uprobe, list);
+
+ if (uprobe && du->uprobe != uprobe)
+ continue;
+ if (mm && du->mm != mm)
+ continue;
+
+ delayed_uprobe_delete(du);
+ }
+}
+
+static bool valid_ref_ctr_vma(struct uprobe *uprobe,
+ struct vm_area_struct *vma)
+{
+ unsigned long vaddr = offset_to_vaddr(vma, uprobe->ref_ctr_offset);
+
+ return uprobe->ref_ctr_offset &&
+ vma->vm_file &&
+ file_inode(vma->vm_file) == uprobe->inode &&
+ (vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE &&
+ vma->vm_start <= vaddr &&
+ vma->vm_end > vaddr;
+}
+
+static struct vm_area_struct *
+find_ref_ctr_vma(struct uprobe *uprobe, struct mm_struct *mm)
+{
+ VMA_ITERATOR(vmi, mm, 0);
+ struct vm_area_struct *tmp;
+
+ for_each_vma(vmi, tmp)
+ if (valid_ref_ctr_vma(uprobe, tmp))
+ return tmp;
+
+ return NULL;
+}
+
+static int
+__update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d)
+{
+ void *kaddr;
+ struct page *page;
+ int ret;
+ short *ptr;
+
+ if (!vaddr || !d)
+ return -EINVAL;
+
+ ret = get_user_pages_remote(mm, vaddr, 1,
+ FOLL_WRITE, &page, NULL);
+ if (unlikely(ret <= 0)) {
+ /*
+ * We are asking for 1 page. If get_user_pages_remote() fails,
+ * it may return 0, in that case we have to return error.
+ */
+ return ret == 0 ? -EBUSY : ret;
+ }
+
+ kaddr = kmap_atomic(page);
+ ptr = kaddr + (vaddr & ~PAGE_MASK);
+
+ if (unlikely(*ptr + d < 0)) {
+ pr_warn("ref_ctr going negative. vaddr: 0x%lx, "
+ "curr val: %d, delta: %d\n", vaddr, *ptr, d);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ *ptr += d;
+ ret = 0;
+out:
+ kunmap_atomic(kaddr);
+ put_page(page);
+ return ret;
+}
+
+static void update_ref_ctr_warn(struct uprobe *uprobe,
+ struct mm_struct *mm, short d)
+{
+ pr_warn("ref_ctr %s failed for inode: 0x%lx offset: "
+ "0x%llx ref_ctr_offset: 0x%llx of mm: 0x%p\n",
+ d > 0 ? "increment" : "decrement", uprobe->inode->i_ino,
+ (unsigned long long) uprobe->offset,
+ (unsigned long long) uprobe->ref_ctr_offset, mm);
+}
+
+static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm,
+ short d)
+{
+ struct vm_area_struct *rc_vma;
+ unsigned long rc_vaddr;
+ int ret = 0;
+
+ rc_vma = find_ref_ctr_vma(uprobe, mm);
+
+ if (rc_vma) {
+ rc_vaddr = offset_to_vaddr(rc_vma, uprobe->ref_ctr_offset);
+ ret = __update_ref_ctr(mm, rc_vaddr, d);
+ if (ret)
+ update_ref_ctr_warn(uprobe, mm, d);
+
+ if (d > 0)
+ return ret;
+ }
+
+ mutex_lock(&delayed_uprobe_lock);
+ if (d > 0)
+ ret = delayed_uprobe_add(uprobe, mm);
+ else
+ delayed_uprobe_remove(uprobe, mm);
+ mutex_unlock(&delayed_uprobe_lock);
+
+ return ret;
+}
+
+static bool orig_page_is_identical(struct vm_area_struct *vma,
+ unsigned long vaddr, struct page *page, bool *pmd_mappable)
+{
+ const pgoff_t index = vaddr_to_offset(vma, vaddr) >> PAGE_SHIFT;
+ struct folio *orig_folio = filemap_get_folio(vma->vm_file->f_mapping,
+ index);
+ struct page *orig_page;
+ bool identical;
+
+ if (IS_ERR(orig_folio))
+ return false;
+ orig_page = folio_file_page(orig_folio, index);
+
+ *pmd_mappable = folio_test_pmd_mappable(orig_folio);
+ identical = folio_test_uptodate(orig_folio) &&
+ pages_identical(page, orig_page);
+ folio_put(orig_folio);
+ return identical;
+}
+
+static int __uprobe_write(struct vm_area_struct *vma,
+ struct folio_walk *fw, struct folio *folio,
+ unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes,
+ bool is_register)
+{
+ const unsigned long vaddr = insn_vaddr & PAGE_MASK;
+ bool pmd_mappable;
+
+ /* For now, we'll only handle PTE-mapped folios. */
+ if (fw->level != FW_LEVEL_PTE)
+ return -EFAULT;
+
+ /*
+ * See can_follow_write_pte(): we'd actually prefer a writable PTE here,
+ * but the VMA might not be writable.
+ */
+ if (!pte_write(fw->pte)) {
+ if (!PageAnonExclusive(fw->page))
+ return -EFAULT;
+ if (unlikely(userfaultfd_pte_wp(vma, fw->pte)))
+ return -EFAULT;
+ /* SOFTDIRTY is handled via pte_mkdirty() below. */
+ }
+
+ /*
+ * We'll temporarily unmap the page and flush the TLB, such that we can
+ * modify the page atomically.
+ */
+ flush_cache_page(vma, vaddr, pte_pfn(fw->pte));
+ fw->pte = ptep_clear_flush(vma, vaddr, fw->ptep);
+ copy_to_page(fw->page, insn_vaddr, insn, nbytes);
+
+ /*
+ * When unregistering, we may only zap a PTE if uffd is disabled and
+ * there are no unexpected folio references ...
+ */
+ if (is_register || userfaultfd_missing(vma) ||
+ (folio_ref_count(folio) != folio_expected_ref_count(folio) + 1))
+ goto remap;
+
+ /*
+ * ... and the mapped page is identical to the original page that
+ * would get faulted in on next access.
+ */
+ if (!orig_page_is_identical(vma, vaddr, fw->page, &pmd_mappable))
+ goto remap;
+
+ dec_mm_counter(vma->vm_mm, MM_ANONPAGES);
+ folio_remove_rmap_pte(folio, fw->page, vma);
+ if (!folio_mapped(folio) && folio_test_swapcache(folio) &&
+ folio_trylock(folio)) {
+ folio_free_swap(folio);
+ folio_unlock(folio);
+ }
+ folio_put(folio);
+
+ return pmd_mappable;
+remap:
+ /*
+ * Make sure that our copy_to_page() changes become visible before the
+ * set_pte_at() write.
+ */
+ smp_wmb();
+ /* We modified the page. Make sure to mark the PTE dirty. */
+ set_pte_at(vma->vm_mm, vaddr, fw->ptep, pte_mkdirty(fw->pte));
+ return 0;
+}
+
/*
* NOTE:
* Expect the breakpoint instruction to be the smallest size instruction for
@@ -290,214 +478,547 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
* that have fixed length instructions.
*
* uprobe_write_opcode - write the opcode at a given virtual address.
- * @mm: the probed process address space.
- * @vaddr: the virtual address to store the opcode.
- * @opcode: opcode to be written at @vaddr.
+ * @auprobe: arch specific probepoint information.
+ * @vma: the probed virtual memory area.
+ * @opcode_vaddr: the virtual address to store the opcode.
+ * @opcode: opcode to be written at @opcode_vaddr.
*
- * Called with mm->mmap_sem held for write.
+ * Called with mm->mmap_lock held for write.
* Return 0 (success) or a negative errno.
*/
-int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr,
- uprobe_opcode_t opcode)
+int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+ const unsigned long opcode_vaddr, uprobe_opcode_t opcode,
+ bool is_register)
{
- struct page *old_page, *new_page;
- struct vm_area_struct *vma;
- int ret;
+ return uprobe_write(auprobe, vma, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE,
+ verify_opcode, is_register, true /* do_update_ref_ctr */, NULL);
+}
+
+int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+ const unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes,
+ uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr,
+ void *data)
+{
+ const unsigned long vaddr = insn_vaddr & PAGE_MASK;
+ struct mm_struct *mm = vma->vm_mm;
+ struct uprobe *uprobe;
+ int ret, ref_ctr_updated = 0;
+ unsigned int gup_flags = FOLL_FORCE;
+ struct mmu_notifier_range range;
+ struct folio_walk fw;
+ struct folio *folio;
+ struct page *page;
+
+ uprobe = container_of(auprobe, struct uprobe, arch);
+
+ if (WARN_ON_ONCE(!is_cow_mapping(vma->vm_flags)))
+ return -EINVAL;
+
+ /*
+ * When registering, we have to break COW to get an exclusive anonymous
+ * page that we can safely modify. Use FOLL_WRITE to trigger a write
+ * fault if required. When unregistering, we might be lucky and the
+ * anon page is already gone. So defer write faults until really
+ * required. Use FOLL_SPLIT_PMD, because __uprobe_write()
+ * cannot deal with PMDs yet.
+ */
+ if (is_register)
+ gup_flags |= FOLL_WRITE | FOLL_SPLIT_PMD;
retry:
- /* Read the page with vaddr into memory */
- ret = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &old_page, &vma);
+ ret = get_user_pages_remote(mm, vaddr, 1, gup_flags, &page, NULL);
if (ret <= 0)
- return ret;
+ goto out;
+ folio = page_folio(page);
- ret = verify_opcode(old_page, vaddr, &opcode);
- if (ret <= 0)
- goto put_old;
+ ret = verify(page, insn_vaddr, insn, nbytes, data);
+ if (ret <= 0) {
+ folio_put(folio);
+ goto out;
+ }
- ret = anon_vma_prepare(vma);
- if (ret)
- goto put_old;
+ /* We are going to replace instruction, update ref_ctr. */
+ if (do_update_ref_ctr && !ref_ctr_updated && uprobe->ref_ctr_offset) {
+ ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1);
+ if (ret) {
+ folio_put(folio);
+ goto out;
+ }
- ret = -ENOMEM;
- new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
- if (!new_page)
- goto put_old;
+ ref_ctr_updated = 1;
+ }
+
+ ret = 0;
+ if (unlikely(!folio_test_anon(folio) || folio_is_zone_device(folio))) {
+ VM_WARN_ON_ONCE(is_register);
+ folio_put(folio);
+ goto out;
+ }
+
+ if (!is_register) {
+ /*
+ * In the common case, we'll be able to zap the page when
+ * unregistering. So trigger MMU notifiers now, as we won't
+ * be able to do it under PTL.
+ */
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
+ vaddr, vaddr + PAGE_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
+ }
- __SetPageUptodate(new_page);
- copy_highpage(new_page, old_page);
- copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
+ ret = -EAGAIN;
+ /* Walk the page tables again, to perform the actual update. */
+ if (folio_walk_start(&fw, vma, vaddr, 0)) {
+ if (fw.page == page)
+ ret = __uprobe_write(vma, &fw, folio, insn_vaddr, insn, nbytes, is_register);
+ folio_walk_end(&fw, vma);
+ }
- ret = __replace_page(vma, vaddr, old_page, new_page);
- page_cache_release(new_page);
-put_old:
- put_page(old_page);
+ if (!is_register)
+ mmu_notifier_invalidate_range_end(&range);
- if (unlikely(ret == -EAGAIN))
+ folio_put(folio);
+ switch (ret) {
+ case -EFAULT:
+ gup_flags |= FOLL_WRITE | FOLL_SPLIT_PMD;
+ fallthrough;
+ case -EAGAIN:
goto retry;
- return ret;
+ default:
+ break;
+ }
+
+out:
+ /* Revert back reference counter if instruction update failed. */
+ if (do_update_ref_ctr && ret < 0 && ref_ctr_updated)
+ update_ref_ctr(uprobe, mm, is_register ? -1 : 1);
+
+ /* try collapse pmd for compound page */
+ if (ret > 0)
+ collapse_pte_mapped_thp(mm, vaddr, false);
+
+ return ret < 0 ? ret : 0;
}
/**
* set_swbp - store breakpoint at a given address.
* @auprobe: arch specific probepoint information.
- * @mm: the probed process address space.
+ * @vma: the probed virtual memory area.
* @vaddr: the virtual address to insert the opcode.
*
* For mm @mm, store the breakpoint instruction at @vaddr.
* Return 0 (success) or a negative errno.
*/
-int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
+int __weak set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+ unsigned long vaddr)
{
- return uprobe_write_opcode(mm, vaddr, UPROBE_SWBP_INSN);
+ return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN, true);
}
/**
* set_orig_insn - Restore the original instruction.
- * @mm: the probed process address space.
+ * @vma: the probed virtual memory area.
* @auprobe: arch specific probepoint information.
* @vaddr: the virtual address to insert the opcode.
*
* For mm @mm, restore the original opcode (opcode) at @vaddr.
* Return 0 (success) or a negative errno.
*/
-int __weak
-set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
+int __weak set_orig_insn(struct arch_uprobe *auprobe,
+ struct vm_area_struct *vma, unsigned long vaddr)
{
- return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)&auprobe->insn);
+ return uprobe_write_opcode(auprobe, vma, vaddr,
+ *(uprobe_opcode_t *)&auprobe->insn, false);
}
-static int match_uprobe(struct uprobe *l, struct uprobe *r)
+/* uprobe should have guaranteed positive refcount */
+static struct uprobe *get_uprobe(struct uprobe *uprobe)
{
- if (l->inode < r->inode)
+ refcount_inc(&uprobe->ref);
+ return uprobe;
+}
+
+/*
+ * uprobe should have guaranteed lifetime, which can be either of:
+ * - caller already has refcount taken (and wants an extra one);
+ * - uprobe is RCU protected and won't be freed until after grace period;
+ * - we are holding uprobes_treelock (for read or write, doesn't matter).
+ */
+static struct uprobe *try_get_uprobe(struct uprobe *uprobe)
+{
+ if (refcount_inc_not_zero(&uprobe->ref))
+ return uprobe;
+ return NULL;
+}
+
+static inline bool uprobe_is_active(struct uprobe *uprobe)
+{
+ return !RB_EMPTY_NODE(&uprobe->rb_node);
+}
+
+static void uprobe_free_rcu_tasks_trace(struct rcu_head *rcu)
+{
+ struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu);
+
+ kfree(uprobe);
+}
+
+static void uprobe_free_srcu(struct rcu_head *rcu)
+{
+ struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu);
+
+ call_rcu_tasks_trace(&uprobe->rcu, uprobe_free_rcu_tasks_trace);
+}
+
+static void uprobe_free_deferred(struct work_struct *work)
+{
+ struct uprobe *uprobe = container_of(work, struct uprobe, work);
+
+ write_lock(&uprobes_treelock);
+
+ if (uprobe_is_active(uprobe)) {
+ write_seqcount_begin(&uprobes_seqcount);
+ rb_erase(&uprobe->rb_node, &uprobes_tree);
+ write_seqcount_end(&uprobes_seqcount);
+ }
+
+ write_unlock(&uprobes_treelock);
+
+ /*
+ * If application munmap(exec_vma) before uprobe_unregister()
+ * gets called, we don't get a chance to remove uprobe from
+ * delayed_uprobe_list from remove_breakpoint(). Do it here.
+ */
+ mutex_lock(&delayed_uprobe_lock);
+ delayed_uprobe_remove(uprobe, NULL);
+ mutex_unlock(&delayed_uprobe_lock);
+
+ /* start srcu -> rcu_tasks_trace -> kfree chain */
+ call_srcu(&uretprobes_srcu, &uprobe->rcu, uprobe_free_srcu);
+}
+
+static void put_uprobe(struct uprobe *uprobe)
+{
+ if (!refcount_dec_and_test(&uprobe->ref))
+ return;
+
+ INIT_WORK(&uprobe->work, uprobe_free_deferred);
+ schedule_work(&uprobe->work);
+}
+
+/* Initialize hprobe as SRCU-protected "leased" uprobe */
+static void hprobe_init_leased(struct hprobe *hprobe, struct uprobe *uprobe, int srcu_idx)
+{
+ WARN_ON(!uprobe);
+ hprobe->state = HPROBE_LEASED;
+ hprobe->uprobe = uprobe;
+ hprobe->srcu_idx = srcu_idx;
+}
+
+/* Initialize hprobe as refcounted ("stable") uprobe (uprobe can be NULL). */
+static void hprobe_init_stable(struct hprobe *hprobe, struct uprobe *uprobe)
+{
+ hprobe->state = uprobe ? HPROBE_STABLE : HPROBE_GONE;
+ hprobe->uprobe = uprobe;
+ hprobe->srcu_idx = -1;
+}
+
+/*
+ * hprobe_consume() fetches hprobe's underlying uprobe and detects whether
+ * uprobe is SRCU protected or is refcounted. hprobe_consume() can be
+ * used only once for a given hprobe.
+ *
+ * Caller has to call hprobe_finalize() and pass previous hprobe_state, so
+ * that hprobe_finalize() can perform SRCU unlock or put uprobe, whichever
+ * is appropriate.
+ */
+static inline struct uprobe *hprobe_consume(struct hprobe *hprobe, enum hprobe_state *hstate)
+{
+ *hstate = xchg(&hprobe->state, HPROBE_CONSUMED);
+ switch (*hstate) {
+ case HPROBE_LEASED:
+ case HPROBE_STABLE:
+ return hprobe->uprobe;
+ case HPROBE_GONE: /* uprobe is NULL, no SRCU */
+ case HPROBE_CONSUMED: /* uprobe was finalized already, do nothing */
+ return NULL;
+ default:
+ WARN(1, "hprobe invalid state %d", *hstate);
+ return NULL;
+ }
+}
+
+/*
+ * Reset hprobe state and, if hprobe was LEASED, release SRCU lock.
+ * hprobe_finalize() can only be used from current context after
+ * hprobe_consume() call (which determines uprobe and hstate value).
+ */
+static void hprobe_finalize(struct hprobe *hprobe, enum hprobe_state hstate)
+{
+ switch (hstate) {
+ case HPROBE_LEASED:
+ __srcu_read_unlock(&uretprobes_srcu, hprobe->srcu_idx);
+ break;
+ case HPROBE_STABLE:
+ put_uprobe(hprobe->uprobe);
+ break;
+ case HPROBE_GONE:
+ case HPROBE_CONSUMED:
+ break;
+ default:
+ WARN(1, "hprobe invalid state %d", hstate);
+ break;
+ }
+}
+
+/*
+ * Attempt to switch (atomically) uprobe from being SRCU protected (LEASED)
+ * to refcounted (STABLE) state. Competes with hprobe_consume(); only one of
+ * them can win the race to perform SRCU unlocking. Whoever wins must perform
+ * SRCU unlock.
+ *
+ * Returns underlying valid uprobe or NULL, if there was no underlying uprobe
+ * to begin with or we failed to bump its refcount and it's going away.
+ *
+ * Returned non-NULL uprobe can be still safely used within an ongoing SRCU
+ * locked region. If `get` is true, it's guaranteed that non-NULL uprobe has
+ * an extra refcount for caller to assume and use. Otherwise, it's not
+ * guaranteed that returned uprobe has a positive refcount, so caller has to
+ * attempt try_get_uprobe(), if it needs to preserve uprobe beyond current
+ * SRCU lock region. See dup_utask().
+ */
+static struct uprobe *hprobe_expire(struct hprobe *hprobe, bool get)
+{
+ enum hprobe_state hstate;
+
+ /*
+ * Caller should guarantee that return_instance is not going to be
+ * freed from under us. This can be achieved either through holding
+ * rcu_read_lock() or by owning return_instance in the first place.
+ *
+ * Underlying uprobe is itself protected from reuse by SRCU, so ensure
+ * SRCU lock is held properly.
+ */
+ lockdep_assert(srcu_read_lock_held(&uretprobes_srcu));
+
+ hstate = READ_ONCE(hprobe->state);
+ switch (hstate) {
+ case HPROBE_STABLE:
+ /* uprobe has positive refcount, bump refcount, if necessary */
+ return get ? get_uprobe(hprobe->uprobe) : hprobe->uprobe;
+ case HPROBE_GONE:
+ /*
+ * SRCU was unlocked earlier and we didn't manage to take
+ * uprobe refcnt, so it's effectively NULL
+ */
+ return NULL;
+ case HPROBE_CONSUMED:
+ /*
+ * uprobe was consumed, so it's effectively NULL as far as
+ * uretprobe processing logic is concerned
+ */
+ return NULL;
+ case HPROBE_LEASED: {
+ struct uprobe *uprobe = try_get_uprobe(hprobe->uprobe);
+ /*
+ * Try to switch hprobe state, guarding against
+ * hprobe_consume() or another hprobe_expire() racing with us.
+ * Note, if we failed to get uprobe refcount, we use special
+ * HPROBE_GONE state to signal that hprobe->uprobe shouldn't
+ * be used as it will be freed after SRCU is unlocked.
+ */
+ if (try_cmpxchg(&hprobe->state, &hstate, uprobe ? HPROBE_STABLE : HPROBE_GONE)) {
+ /* We won the race, we are the ones to unlock SRCU */
+ __srcu_read_unlock(&uretprobes_srcu, hprobe->srcu_idx);
+ return get ? get_uprobe(uprobe) : uprobe;
+ }
+
+ /*
+ * We lost the race, undo refcount bump (if it ever happened),
+ * unless caller would like an extra refcount anyways.
+ */
+ if (uprobe && !get)
+ put_uprobe(uprobe);
+ /*
+ * Even if hprobe_consume() or another hprobe_expire() wins
+ * the state update race and unlocks SRCU from under us, we
+ * still have a guarantee that underyling uprobe won't be
+ * freed due to ongoing caller's SRCU lock region, so we can
+ * return it regardless. Also, if `get` was true, we also have
+ * an extra ref for the caller to own. This is used in dup_utask().
+ */
+ return uprobe;
+ }
+ default:
+ WARN(1, "unknown hprobe state %d", hstate);
+ return NULL;
+ }
+}
+
+static __always_inline
+int uprobe_cmp(const struct inode *l_inode, const loff_t l_offset,
+ const struct uprobe *r)
+{
+ if (l_inode < r->inode)
return -1;
- if (l->inode > r->inode)
+ if (l_inode > r->inode)
return 1;
- if (l->offset < r->offset)
+ if (l_offset < r->offset)
return -1;
- if (l->offset > r->offset)
+ if (l_offset > r->offset)
return 1;
return 0;
}
-static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset)
-{
- struct uprobe u = { .inode = inode, .offset = offset };
- struct rb_node *n = uprobes_tree.rb_node;
- struct uprobe *uprobe;
- int match;
+#define __node_2_uprobe(node) \
+ rb_entry((node), struct uprobe, rb_node)
- while (n) {
- uprobe = rb_entry(n, struct uprobe, rb_node);
- match = match_uprobe(&u, uprobe);
- if (!match) {
- atomic_inc(&uprobe->ref);
- return uprobe;
- }
+struct __uprobe_key {
+ struct inode *inode;
+ loff_t offset;
+};
- if (match < 0)
- n = n->rb_left;
- else
- n = n->rb_right;
- }
- return NULL;
+static inline int __uprobe_cmp_key(const void *key, const struct rb_node *b)
+{
+ const struct __uprobe_key *a = key;
+ return uprobe_cmp(a->inode, a->offset, __node_2_uprobe(b));
+}
+
+static inline int __uprobe_cmp(struct rb_node *a, const struct rb_node *b)
+{
+ struct uprobe *u = __node_2_uprobe(a);
+ return uprobe_cmp(u->inode, u->offset, __node_2_uprobe(b));
}
/*
- * Find a uprobe corresponding to a given inode:offset
- * Acquires uprobes_treelock
+ * Assumes being inside RCU protected region.
+ * No refcount is taken on returned uprobe.
*/
-static struct uprobe *find_uprobe(struct inode *inode, loff_t offset)
+static struct uprobe *find_uprobe_rcu(struct inode *inode, loff_t offset)
{
- struct uprobe *uprobe;
+ struct __uprobe_key key = {
+ .inode = inode,
+ .offset = offset,
+ };
+ struct rb_node *node;
+ unsigned int seq;
- spin_lock(&uprobes_treelock);
- uprobe = __find_uprobe(inode, offset);
- spin_unlock(&uprobes_treelock);
+ lockdep_assert(rcu_read_lock_trace_held());
- return uprobe;
+ do {
+ seq = read_seqcount_begin(&uprobes_seqcount);
+ node = rb_find_rcu(&key, &uprobes_tree, __uprobe_cmp_key);
+ /*
+ * Lockless RB-tree lookups can result only in false negatives.
+ * If the element is found, it is correct and can be returned
+ * under RCU protection. If we find nothing, we need to
+ * validate that seqcount didn't change. If it did, we have to
+ * try again as we might have missed the element (false
+ * negative). If seqcount is unchanged, search truly failed.
+ */
+ if (node)
+ return __node_2_uprobe(node);
+ } while (read_seqcount_retry(&uprobes_seqcount, seq));
+
+ return NULL;
}
+/*
+ * Attempt to insert a new uprobe into uprobes_tree.
+ *
+ * If uprobe already exists (for given inode+offset), we just increment
+ * refcount of previously existing uprobe.
+ *
+ * If not, a provided new instance of uprobe is inserted into the tree (with
+ * assumed initial refcount == 1).
+ *
+ * In any case, we return a uprobe instance that ends up being in uprobes_tree.
+ * Caller has to clean up new uprobe instance, if it ended up not being
+ * inserted into the tree.
+ *
+ * We assume that uprobes_treelock is held for writing.
+ */
static struct uprobe *__insert_uprobe(struct uprobe *uprobe)
{
- struct rb_node **p = &uprobes_tree.rb_node;
- struct rb_node *parent = NULL;
- struct uprobe *u;
- int match;
-
- while (*p) {
- parent = *p;
- u = rb_entry(parent, struct uprobe, rb_node);
- match = match_uprobe(uprobe, u);
- if (!match) {
- atomic_inc(&u->ref);
- return u;
- }
+ struct rb_node *node;
+again:
+ node = rb_find_add_rcu(&uprobe->rb_node, &uprobes_tree, __uprobe_cmp);
+ if (node) {
+ struct uprobe *u = __node_2_uprobe(node);
- if (match < 0)
- p = &parent->rb_left;
- else
- p = &parent->rb_right;
+ if (!try_get_uprobe(u)) {
+ rb_erase(node, &uprobes_tree);
+ RB_CLEAR_NODE(&u->rb_node);
+ goto again;
+ }
+ return u;
}
- u = NULL;
- rb_link_node(&uprobe->rb_node, parent, p);
- rb_insert_color(&uprobe->rb_node, &uprobes_tree);
- /* get access + creation ref */
- atomic_set(&uprobe->ref, 2);
-
- return u;
+ return uprobe;
}
/*
- * Acquire uprobes_treelock.
- * Matching uprobe already exists in rbtree;
- * increment (access refcount) and return the matching uprobe.
- *
- * No matching uprobe; insert the uprobe in rb_tree;
- * get a double refcount (access + creation) and return NULL.
+ * Acquire uprobes_treelock and insert uprobe into uprobes_tree
+ * (or reuse existing one, see __insert_uprobe() comments above).
*/
static struct uprobe *insert_uprobe(struct uprobe *uprobe)
{
struct uprobe *u;
- spin_lock(&uprobes_treelock);
+ write_lock(&uprobes_treelock);
+ write_seqcount_begin(&uprobes_seqcount);
u = __insert_uprobe(uprobe);
- spin_unlock(&uprobes_treelock);
+ write_seqcount_end(&uprobes_seqcount);
+ write_unlock(&uprobes_treelock);
return u;
}
-static void put_uprobe(struct uprobe *uprobe)
+static void
+ref_ctr_mismatch_warn(struct uprobe *cur_uprobe, struct uprobe *uprobe)
{
- if (atomic_dec_and_test(&uprobe->ref))
- kfree(uprobe);
+ pr_warn("ref_ctr_offset mismatch. inode: 0x%lx offset: 0x%llx "
+ "ref_ctr_offset(old): 0x%llx ref_ctr_offset(new): 0x%llx\n",
+ uprobe->inode->i_ino, (unsigned long long) uprobe->offset,
+ (unsigned long long) cur_uprobe->ref_ctr_offset,
+ (unsigned long long) uprobe->ref_ctr_offset);
}
-static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
+static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset,
+ loff_t ref_ctr_offset)
{
struct uprobe *uprobe, *cur_uprobe;
uprobe = kzalloc(sizeof(struct uprobe), GFP_KERNEL);
if (!uprobe)
- return NULL;
+ return ERR_PTR(-ENOMEM);
- uprobe->inode = igrab(inode);
+ uprobe->inode = inode;
uprobe->offset = offset;
+ uprobe->ref_ctr_offset = ref_ctr_offset;
+ INIT_LIST_HEAD(&uprobe->consumers);
init_rwsem(&uprobe->register_rwsem);
init_rwsem(&uprobe->consumer_rwsem);
+ RB_CLEAR_NODE(&uprobe->rb_node);
+ refcount_set(&uprobe->ref, 1);
/* add to uprobes_tree, sorted on inode:offset */
cur_uprobe = insert_uprobe(uprobe);
/* a uprobe exists for this inode:offset combination */
- if (cur_uprobe) {
+ if (cur_uprobe != uprobe) {
+ if (cur_uprobe->ref_ctr_offset != uprobe->ref_ctr_offset) {
+ ref_ctr_mismatch_warn(cur_uprobe, uprobe);
+ put_uprobe(cur_uprobe);
+ kfree(uprobe);
+ return ERR_PTR(-EINVAL);
+ }
kfree(uprobe);
uprobe = cur_uprobe;
- iput(inode);
}
return uprobe;
@@ -505,33 +1026,23 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
+ static atomic64_t id;
+
down_write(&uprobe->consumer_rwsem);
- uc->next = uprobe->consumers;
- uprobe->consumers = uc;
+ list_add_rcu(&uc->cons_node, &uprobe->consumers);
+ uc->id = (__u64) atomic64_inc_return(&id);
up_write(&uprobe->consumer_rwsem);
}
/*
* For uprobe @uprobe, delete the consumer @uc.
- * Return true if the @uc is deleted successfully
- * or return false.
+ * Should never be called with consumer that's not part of @uprobe->consumers.
*/
-static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
+static void consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
- struct uprobe_consumer **con;
- bool ret = false;
-
down_write(&uprobe->consumer_rwsem);
- for (con = &uprobe->consumers; *con; con = &(*con)->next) {
- if (*con == uc) {
- *con = uc->next;
- ret = true;
- break;
- }
- }
+ list_del_rcu(&uc->cons_node);
up_write(&uprobe->consumer_rwsem);
-
- return ret;
}
static int __copy_insn(struct address_space *mapping, struct file *filp,
@@ -540,18 +1051,18 @@ static int __copy_insn(struct address_space *mapping, struct file *filp,
struct page *page;
/*
* Ensure that the page that has the original instruction is populated
- * and in page-cache. If ->readpage == NULL it must be shmem_mapping(),
+ * and in page-cache. If ->read_folio == NULL it must be shmem_mapping(),
* see uprobe_register().
*/
- if (mapping->a_ops->readpage)
- page = read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT, filp);
+ if (mapping->a_ops->read_folio)
+ page = read_mapping_page(mapping, offset >> PAGE_SHIFT, filp);
else
- page = shmem_read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT);
+ page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT);
if (IS_ERR(page))
return PTR_ERR(page);
- copy_from_page(page, offset, insn, nbytes);
- page_cache_release(page);
+ uprobe_copy_from_page(page, offset, insn, nbytes);
+ put_page(page);
return 0;
}
@@ -607,11 +1118,7 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
if (ret)
goto out;
- /* uprobe_write_opcode() assumes we don't cross page boundary */
- BUG_ON((uprobe->offset & ~PAGE_MASK) +
- UPROBE_SWBP_INSN_SIZE > PAGE_SIZE);
-
- smp_wmb(); /* pairs with rmb() in find_active_uprobe() */
+ smp_wmb(); /* pairs with the smp_rmb() in handle_swbp() */
set_bit(UPROBE_COPY_INSN, &uprobe->flags);
out:
@@ -620,21 +1127,19 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
return ret;
}
-static inline bool consumer_filter(struct uprobe_consumer *uc,
- enum uprobe_filter_ctx ctx, struct mm_struct *mm)
+static inline bool consumer_filter(struct uprobe_consumer *uc, struct mm_struct *mm)
{
- return !uc->filter || uc->filter(uc, ctx, mm);
+ return !uc->filter || uc->filter(uc, mm);
}
-static bool filter_chain(struct uprobe *uprobe,
- enum uprobe_filter_ctx ctx, struct mm_struct *mm)
+static bool filter_chain(struct uprobe *uprobe, struct mm_struct *mm)
{
struct uprobe_consumer *uc;
bool ret = false;
down_read(&uprobe->consumer_rwsem);
- for (uc = uprobe->consumers; uc; uc = uc->next) {
- ret = consumer_filter(uc, ctx, mm);
+ list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) {
+ ret = consumer_filter(uc, mm);
if (ret)
break;
}
@@ -643,10 +1148,10 @@ static bool filter_chain(struct uprobe *uprobe,
return ret;
}
-static int
-install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
- struct vm_area_struct *vma, unsigned long vaddr)
+static int install_breakpoint(struct uprobe *uprobe, struct vm_area_struct *vma,
+ unsigned long vaddr)
{
+ struct mm_struct *mm = vma->vm_mm;
bool first_uprobe;
int ret;
@@ -658,46 +1163,26 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
* set MMF_HAS_UPROBES in advance for uprobe_pre_sstep_notifier(),
* the task can hit this breakpoint right after __replace_page().
*/
- first_uprobe = !test_bit(MMF_HAS_UPROBES, &mm->flags);
+ first_uprobe = !mm_flags_test(MMF_HAS_UPROBES, mm);
if (first_uprobe)
- set_bit(MMF_HAS_UPROBES, &mm->flags);
+ mm_flags_set(MMF_HAS_UPROBES, mm);
- ret = set_swbp(&uprobe->arch, mm, vaddr);
+ ret = set_swbp(&uprobe->arch, vma, vaddr);
if (!ret)
- clear_bit(MMF_RECALC_UPROBES, &mm->flags);
+ mm_flags_clear(MMF_RECALC_UPROBES, mm);
else if (first_uprobe)
- clear_bit(MMF_HAS_UPROBES, &mm->flags);
+ mm_flags_clear(MMF_HAS_UPROBES, mm);
return ret;
}
-static int
-remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr)
-{
- set_bit(MMF_RECALC_UPROBES, &mm->flags);
- return set_orig_insn(&uprobe->arch, mm, vaddr);
-}
-
-static inline bool uprobe_is_active(struct uprobe *uprobe)
-{
- return !RB_EMPTY_NODE(&uprobe->rb_node);
-}
-/*
- * There could be threads that have already hit the breakpoint. They
- * will recheck the current insn and restart if find_uprobe() fails.
- * See find_active_uprobe().
- */
-static void delete_uprobe(struct uprobe *uprobe)
+static int remove_breakpoint(struct uprobe *uprobe, struct vm_area_struct *vma,
+ unsigned long vaddr)
{
- if (WARN_ON(!uprobe_is_active(uprobe)))
- return;
+ struct mm_struct *mm = vma->vm_mm;
- spin_lock(&uprobes_treelock);
- rb_erase(&uprobe->rb_node, &uprobes_tree);
- spin_unlock(&uprobes_treelock);
- RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */
- iput(uprobe->inode);
- put_uprobe(uprobe);
+ mm_flags_set(MMF_RECALC_UPROBES, mm);
+ return set_orig_insn(&uprobe->arch, vma, vaddr);
}
struct map_info {
@@ -735,7 +1220,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
* reclaim. This is optimistic, no harm done if it fails.
*/
prev = kmalloc(sizeof(struct map_info),
- GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN);
+ GFP_NOWAIT | __GFP_NOMEMALLOC);
if (prev)
prev->next = NULL;
}
@@ -744,7 +1229,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
continue;
}
- if (!atomic_inc_not_zero(&vma->vm_mm->mm_users))
+ if (!mmget_not_zero(vma->vm_mm))
continue;
info = prev;
@@ -804,8 +1289,17 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
if (err && is_register)
goto free;
+ /*
+ * We take mmap_lock for writing to avoid the race with
+ * find_active_uprobe_rcu() which takes mmap_lock for reading.
+ * Thus this install_breakpoint() can not make
+ * is_trap_at_addr() true right after find_uprobe_rcu()
+ * returns NULL in find_active_uprobe_rcu().
+ */
+ mmap_write_lock(mm);
+ if (check_stable_address_space(mm))
+ goto unlock;
- down_write(&mm->mmap_sem);
vma = find_vma(mm, info->vaddr);
if (!vma || !valid_vma(vma, is_register) ||
file_inode(vma->vm_file) != uprobe->inode)
@@ -817,17 +1311,15 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
if (is_register) {
/* consult only the "caller", new consumer. */
- if (consumer_filter(new,
- UPROBE_FILTER_REGISTER, mm))
- err = install_breakpoint(uprobe, mm, vma, info->vaddr);
- } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) {
- if (!filter_chain(uprobe,
- UPROBE_FILTER_UNREGISTER, mm))
- err |= remove_breakpoint(uprobe, mm, info->vaddr);
+ if (consumer_filter(new, mm))
+ err = install_breakpoint(uprobe, vma, info->vaddr);
+ } else if (mm_flags_test(MMF_HAS_UPROBES, mm)) {
+ if (!filter_chain(uprobe, mm))
+ err |= remove_breakpoint(uprobe, vma, info->vaddr);
}
unlock:
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
free:
mmput(mm);
info = free_map_info(info);
@@ -837,29 +1329,51 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
return err;
}
-static int __uprobe_register(struct uprobe *uprobe, struct uprobe_consumer *uc)
-{
- consumer_add(uprobe, uc);
- return register_for_each_vma(uprobe, uc);
-}
-
-static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
+/**
+ * uprobe_unregister_nosync - unregister an already registered probe.
+ * @uprobe: uprobe to remove
+ * @uc: identify which probe if multiple probes are colocated.
+ */
+void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
int err;
- if (WARN_ON(!consumer_del(uprobe, uc)))
- return;
-
+ down_write(&uprobe->register_rwsem);
+ consumer_del(uprobe, uc);
err = register_for_each_vma(uprobe, NULL);
+ up_write(&uprobe->register_rwsem);
+
/* TODO : cant unregister? schedule a worker thread */
- if (!uprobe->consumers && !err)
- delete_uprobe(uprobe);
+ if (unlikely(err)) {
+ uprobe_warn(current, "unregister, leaking uprobe");
+ return;
+ }
+
+ put_uprobe(uprobe);
}
+EXPORT_SYMBOL_GPL(uprobe_unregister_nosync);
-/*
+void uprobe_unregister_sync(void)
+{
+ /*
+ * Now that handler_chain() and handle_uretprobe_chain() iterate over
+ * uprobe->consumers list under RCU protection without holding
+ * uprobe->register_rwsem, we need to wait for RCU grace period to
+ * make sure that we can't call into just unregistered
+ * uprobe_consumer's callbacks anymore. If we don't do that, fast and
+ * unlucky enough caller can free consumer's memory and cause
+ * handler_chain() or handle_uretprobe_chain() to do an use-after-free.
+ */
+ synchronize_rcu_tasks_trace();
+ synchronize_srcu(&uretprobes_srcu);
+}
+EXPORT_SYMBOL_GPL(uprobe_unregister_sync);
+
+/**
* uprobe_register - register a probe
* @inode: the file in which the probe has to be placed.
* @offset: offset from the start of the file.
+ * @ref_ctr_offset: offset of SDT marker / reference counter
* @uc: information on howto handle the probe..
*
* Apart from the access refcount, uprobe_register() takes a creation
@@ -868,108 +1382,99 @@ static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *u
* tuple). Creation refcount stops uprobe_unregister from freeing the
* @uprobe even before the register operation is complete. Creation
* refcount is released when the last @uc for the @uprobe
- * unregisters.
+ * unregisters. Caller of uprobe_register() is required to keep @inode
+ * (and the containing mount) referenced.
*
- * Return errno if it cannot successully install probes
- * else return 0 (success)
+ * Return: pointer to the new uprobe on success or an ERR_PTR on failure.
*/
-int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
+struct uprobe *uprobe_register(struct inode *inode,
+ loff_t offset, loff_t ref_ctr_offset,
+ struct uprobe_consumer *uc)
{
struct uprobe *uprobe;
int ret;
/* Uprobe must have at least one set consumer */
if (!uc->handler && !uc->ret_handler)
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
/* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */
- if (!inode->i_mapping->a_ops->readpage && !shmem_mapping(inode->i_mapping))
- return -EIO;
+ if (!inode->i_mapping->a_ops->read_folio &&
+ !shmem_mapping(inode->i_mapping))
+ return ERR_PTR(-EIO);
/* Racy, just to catch the obvious mistakes */
if (offset > i_size_read(inode))
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
- retry:
- uprobe = alloc_uprobe(inode, offset);
- if (!uprobe)
- return -ENOMEM;
/*
- * We can race with uprobe_unregister()->delete_uprobe().
- * Check uprobe_is_active() and retry if it is false.
+ * This ensures that uprobe_copy_from_page(), copy_to_page() and
+ * __update_ref_ctr() can't cross page boundary.
*/
+ if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE))
+ return ERR_PTR(-EINVAL);
+ if (!IS_ALIGNED(ref_ctr_offset, sizeof(short)))
+ return ERR_PTR(-EINVAL);
+
+ uprobe = alloc_uprobe(inode, offset, ref_ctr_offset);
+ if (IS_ERR(uprobe))
+ return uprobe;
+
down_write(&uprobe->register_rwsem);
- ret = -EAGAIN;
- if (likely(uprobe_is_active(uprobe))) {
- ret = __uprobe_register(uprobe, uc);
- if (ret)
- __uprobe_unregister(uprobe, uc);
- }
+ consumer_add(uprobe, uc);
+ ret = register_for_each_vma(uprobe, uc);
up_write(&uprobe->register_rwsem);
- put_uprobe(uprobe);
- if (unlikely(ret == -EAGAIN))
- goto retry;
- return ret;
+ if (ret) {
+ uprobe_unregister_nosync(uprobe, uc);
+ /*
+ * Registration might have partially succeeded, so we can have
+ * this consumer being called right at this time. We need to
+ * sync here. It's ok, it's unlikely slow path.
+ */
+ uprobe_unregister_sync();
+ return ERR_PTR(ret);
+ }
+
+ return uprobe;
}
EXPORT_SYMBOL_GPL(uprobe_register);
-/*
- * uprobe_apply - unregister a already registered probe.
- * @inode: the file in which the probe has to be removed.
- * @offset: offset from the start of the file.
+/**
+ * uprobe_apply - add or remove the breakpoints according to @uc->filter
+ * @uprobe: uprobe which "owns" the breakpoint
* @uc: consumer which wants to add more or remove some breakpoints
* @add: add or remove the breakpoints
+ * Return: 0 on success or negative error code.
*/
-int uprobe_apply(struct inode *inode, loff_t offset,
- struct uprobe_consumer *uc, bool add)
+int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool add)
{
- struct uprobe *uprobe;
struct uprobe_consumer *con;
int ret = -ENOENT;
- uprobe = find_uprobe(inode, offset);
- if (WARN_ON(!uprobe))
- return ret;
-
down_write(&uprobe->register_rwsem);
- for (con = uprobe->consumers; con && con != uc ; con = con->next)
- ;
- if (con)
- ret = register_for_each_vma(uprobe, add ? uc : NULL);
- up_write(&uprobe->register_rwsem);
- put_uprobe(uprobe);
-
- return ret;
-}
-
-/*
- * uprobe_unregister - unregister a already registered probe.
- * @inode: the file in which the probe has to be removed.
- * @offset: offset from the start of the file.
- * @uc: identify which probe if multiple probes are colocated.
- */
-void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
-{
- struct uprobe *uprobe;
- uprobe = find_uprobe(inode, offset);
- if (WARN_ON(!uprobe))
- return;
+ rcu_read_lock_trace();
+ list_for_each_entry_rcu(con, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) {
+ if (con == uc) {
+ ret = register_for_each_vma(uprobe, add ? uc : NULL);
+ break;
+ }
+ }
+ rcu_read_unlock_trace();
- down_write(&uprobe->register_rwsem);
- __uprobe_unregister(uprobe, uc);
up_write(&uprobe->register_rwsem);
- put_uprobe(uprobe);
+
+ return ret;
}
-EXPORT_SYMBOL_GPL(uprobe_unregister);
static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
{
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;
int err = 0;
- down_read(&mm->mmap_sem);
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ mmap_write_lock(mm);
+ for_each_vma(vmi, vma) {
unsigned long vaddr;
loff_t offset;
@@ -983,9 +1488,9 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
continue;
vaddr = offset_to_vaddr(vma, uprobe->offset);
- err |= remove_breakpoint(uprobe, mm, vaddr);
+ err |= remove_breakpoint(uprobe, vma, vaddr);
}
- up_read(&mm->mmap_sem);
+ mmap_write_unlock(mm);
return err;
}
@@ -1031,29 +1536,60 @@ static void build_probe_list(struct inode *inode,
min = vaddr_to_offset(vma, start);
max = min + (end - start) - 1;
- spin_lock(&uprobes_treelock);
+ read_lock(&uprobes_treelock);
n = find_node_in_range(inode, min, max);
if (n) {
for (t = n; t; t = rb_prev(t)) {
u = rb_entry(t, struct uprobe, rb_node);
if (u->inode != inode || u->offset < min)
break;
- list_add(&u->pending_list, head);
- atomic_inc(&u->ref);
+ /* if uprobe went away, it's safe to ignore it */
+ if (try_get_uprobe(u))
+ list_add(&u->pending_list, head);
}
for (t = n; (t = rb_next(t)); ) {
u = rb_entry(t, struct uprobe, rb_node);
if (u->inode != inode || u->offset > max)
break;
- list_add(&u->pending_list, head);
- atomic_inc(&u->ref);
+ /* if uprobe went away, it's safe to ignore it */
+ if (try_get_uprobe(u))
+ list_add(&u->pending_list, head);
+ }
+ }
+ read_unlock(&uprobes_treelock);
+}
+
+/* @vma contains reference counter, not the probed instruction. */
+static int delayed_ref_ctr_inc(struct vm_area_struct *vma)
+{
+ struct list_head *pos, *q;
+ struct delayed_uprobe *du;
+ unsigned long vaddr;
+ int ret = 0, err = 0;
+
+ mutex_lock(&delayed_uprobe_lock);
+ list_for_each_safe(pos, q, &delayed_uprobe_list) {
+ du = list_entry(pos, struct delayed_uprobe, list);
+
+ if (du->mm != vma->vm_mm ||
+ !valid_ref_ctr_vma(du->uprobe, vma))
+ continue;
+
+ vaddr = offset_to_vaddr(vma, du->uprobe->ref_ctr_offset);
+ ret = __update_ref_ctr(vma->vm_mm, vaddr, 1);
+ if (ret) {
+ update_ref_ctr_warn(du->uprobe, vma->vm_mm, 1);
+ if (!err)
+ err = ret;
}
+ delayed_uprobe_delete(du);
}
- spin_unlock(&uprobes_treelock);
+ mutex_unlock(&delayed_uprobe_lock);
+ return err;
}
/*
- * Called from mmap_region/vma_adjust with mm->mmap_sem acquired.
+ * Called from mmap_region/vma_merge with mm->mmap_lock acquired.
*
* Currently we ignore all errors and always return 0, the callers
* can't handle the failure anyway.
@@ -1064,7 +1600,15 @@ int uprobe_mmap(struct vm_area_struct *vma)
struct uprobe *uprobe, *u;
struct inode *inode;
- if (no_uprobe_events() || !valid_vma(vma, true))
+ if (no_uprobe_events())
+ return 0;
+
+ if (vma->vm_file &&
+ (vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE &&
+ mm_flags_test(MMF_HAS_UPROBES, vma->vm_mm))
+ delayed_ref_ctr_inc(vma);
+
+ if (!valid_vma(vma, true))
return 0;
inode = file_inode(vma->vm_file);
@@ -1080,9 +1624,9 @@ int uprobe_mmap(struct vm_area_struct *vma)
*/
list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
if (!fatal_signal_pending(current) &&
- filter_chain(uprobe, UPROBE_FILTER_MMAP, vma->vm_mm)) {
+ filter_chain(uprobe, vma->vm_mm)) {
unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
- install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
+ install_breakpoint(uprobe, vma, vaddr);
}
put_uprobe(uprobe);
}
@@ -1103,9 +1647,9 @@ vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long e
min = vaddr_to_offset(vma, start);
max = min + (end - start) - 1;
- spin_lock(&uprobes_treelock);
+ read_lock(&uprobes_treelock);
n = find_node_in_range(inode, min, max);
- spin_unlock(&uprobes_treelock);
+ read_unlock(&uprobes_treelock);
return !!n;
}
@@ -1121,61 +1665,102 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon
if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */
return;
- if (!test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags) ||
- test_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags))
+ if (!mm_flags_test(MMF_HAS_UPROBES, vma->vm_mm) ||
+ mm_flags_test(MMF_RECALC_UPROBES, vma->vm_mm))
return;
if (vma_has_uprobes(vma, start, end))
- set_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags);
+ mm_flags_set(MMF_RECALC_UPROBES, vma->vm_mm);
+}
+
+static vm_fault_t xol_fault(const struct vm_special_mapping *sm,
+ struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct xol_area *area = vma->vm_mm->uprobes_state.xol_area;
+
+ vmf->page = area->page;
+ get_page(vmf->page);
+ return 0;
+}
+
+static int xol_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma)
+{
+ return -EPERM;
}
+static const struct vm_special_mapping xol_mapping = {
+ .name = "[uprobes]",
+ .fault = xol_fault,
+ .mremap = xol_mremap,
+};
+
/* Slot allocation for XOL */
static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
{
- int ret = -EALREADY;
+ struct vm_area_struct *vma;
+ int ret;
+
+ if (mmap_write_lock_killable(mm))
+ return -EINTR;
- down_write(&mm->mmap_sem);
- if (mm->uprobes_state.xol_area)
+ if (mm->uprobes_state.xol_area) {
+ ret = -EALREADY;
goto fail;
+ }
if (!area->vaddr) {
/* Try to map as high as possible, this is only a hint. */
area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE,
PAGE_SIZE, 0, 0);
- if (area->vaddr & ~PAGE_MASK) {
+ if (IS_ERR_VALUE(area->vaddr)) {
ret = area->vaddr;
goto fail;
}
}
- ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE,
- VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO, &area->page);
- if (ret)
+ vma = _install_special_mapping(mm, area->vaddr, PAGE_SIZE,
+ VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO|
+ VM_SEALED_SYSMAP,
+ &xol_mapping);
+ if (IS_ERR(vma)) {
+ ret = PTR_ERR(vma);
goto fail;
+ }
- smp_wmb(); /* pairs with get_xol_area() */
- mm->uprobes_state.xol_area = area;
+ ret = 0;
+ /* pairs with get_xol_area() */
+ smp_store_release(&mm->uprobes_state.xol_area, area); /* ^^^ */
fail:
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
return ret;
}
+void * __weak arch_uretprobe_trampoline(unsigned long *psize)
+{
+ static uprobe_opcode_t insn = UPROBE_SWBP_INSN;
+
+ *psize = UPROBE_SWBP_INSN_SIZE;
+ return &insn;
+}
+
static struct xol_area *__create_xol_area(unsigned long vaddr)
{
struct mm_struct *mm = current->mm;
- uprobe_opcode_t insn = UPROBE_SWBP_INSN;
+ unsigned long insns_size;
struct xol_area *area;
+ void *insns;
- area = kmalloc(sizeof(*area), GFP_KERNEL);
+ area = kzalloc(sizeof(*area), GFP_KERNEL);
if (unlikely(!area))
goto out;
- area->bitmap = kzalloc(BITS_TO_LONGS(UINSNS_PER_PAGE) * sizeof(long), GFP_KERNEL);
+ area->bitmap = kcalloc(BITS_TO_LONGS(UINSNS_PER_PAGE), sizeof(long),
+ GFP_KERNEL);
if (!area->bitmap)
goto free_area;
- area->page = alloc_page(GFP_HIGHUSER);
+ area->page = alloc_page(GFP_HIGHUSER | __GFP_ZERO);
if (!area->page)
goto free_bitmap;
@@ -1183,8 +1768,8 @@ static struct xol_area *__create_xol_area(unsigned long vaddr)
init_waitqueue_head(&area->wq);
/* Reserve the 1st slot for get_trampoline_vaddr() */
set_bit(0, area->bitmap);
- atomic_set(&area->slot_count, 1);
- copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE);
+ insns = arch_uretprobe_trampoline(&insns_size);
+ arch_uprobe_copy_ixol(area->page, 0, insns, insns_size);
if (!xol_add_vma(mm, area))
return area;
@@ -1212,11 +1797,19 @@ static struct xol_area *get_xol_area(void)
if (!mm->uprobes_state.xol_area)
__create_xol_area(0);
- area = mm->uprobes_state.xol_area;
- smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */
+ /* Pairs with xol_add_vma() smp_store_release() */
+ area = READ_ONCE(mm->uprobes_state.xol_area); /* ^^^ */
return area;
}
+void __weak arch_uprobe_clear_state(struct mm_struct *mm)
+{
+}
+
+void __weak arch_uprobe_init_state(struct mm_struct *mm)
+{
+}
+
/*
* uprobe_clear_state - Free the area allocated for slots.
*/
@@ -1224,6 +1817,12 @@ void uprobe_clear_state(struct mm_struct *mm)
{
struct xol_area *area = mm->uprobes_state.xol_area;
+ mutex_lock(&delayed_uprobe_lock);
+ delayed_uprobe_remove(NULL, mm);
+ mutex_unlock(&delayed_uprobe_lock);
+
+ arch_uprobe_clear_state(mm);
+
if (!area)
return;
@@ -1244,100 +1843,64 @@ void uprobe_end_dup_mmap(void)
void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm)
{
- newmm->uprobes_state.xol_area = NULL;
-
- if (test_bit(MMF_HAS_UPROBES, &oldmm->flags)) {
- set_bit(MMF_HAS_UPROBES, &newmm->flags);
+ if (mm_flags_test(MMF_HAS_UPROBES, oldmm)) {
+ mm_flags_set(MMF_HAS_UPROBES, newmm);
/* unconditionally, dup_mmap() skips VM_DONTCOPY vmas */
- set_bit(MMF_RECALC_UPROBES, &newmm->flags);
+ mm_flags_set(MMF_RECALC_UPROBES, newmm);
}
}
-/*
- * - search for a free slot.
- */
-static unsigned long xol_take_insn_slot(struct xol_area *area)
+static unsigned long xol_get_slot_nr(struct xol_area *area)
{
- unsigned long slot_addr;
- int slot_nr;
-
- do {
- slot_nr = find_first_zero_bit(area->bitmap, UINSNS_PER_PAGE);
- if (slot_nr < UINSNS_PER_PAGE) {
- if (!test_and_set_bit(slot_nr, area->bitmap))
- break;
+ unsigned long slot_nr;
- slot_nr = UINSNS_PER_PAGE;
- continue;
- }
- wait_event(area->wq, (atomic_read(&area->slot_count) < UINSNS_PER_PAGE));
- } while (slot_nr >= UINSNS_PER_PAGE);
-
- slot_addr = area->vaddr + (slot_nr * UPROBE_XOL_SLOT_BYTES);
- atomic_inc(&area->slot_count);
+ slot_nr = find_first_zero_bit(area->bitmap, UINSNS_PER_PAGE);
+ if (slot_nr < UINSNS_PER_PAGE) {
+ if (!test_and_set_bit(slot_nr, area->bitmap))
+ return slot_nr;
+ }
- return slot_addr;
+ return UINSNS_PER_PAGE;
}
/*
* xol_get_insn_slot - allocate a slot for xol.
- * Returns the allocated slot address or 0.
*/
-static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
+static bool xol_get_insn_slot(struct uprobe *uprobe, struct uprobe_task *utask)
{
- struct xol_area *area;
- unsigned long xol_vaddr;
+ struct xol_area *area = get_xol_area();
+ unsigned long slot_nr;
- area = get_xol_area();
if (!area)
- return 0;
+ return false;
- xol_vaddr = xol_take_insn_slot(area);
- if (unlikely(!xol_vaddr))
- return 0;
+ wait_event(area->wq, (slot_nr = xol_get_slot_nr(area)) < UINSNS_PER_PAGE);
- arch_uprobe_copy_ixol(area->page, xol_vaddr,
+ utask->xol_vaddr = area->vaddr + slot_nr * UPROBE_XOL_SLOT_BYTES;
+ arch_uprobe_copy_ixol(area->page, utask->xol_vaddr,
&uprobe->arch.ixol, sizeof(uprobe->arch.ixol));
-
- return xol_vaddr;
+ return true;
}
/*
- * xol_free_insn_slot - If slot was earlier allocated by
- * @xol_get_insn_slot(), make the slot available for
- * subsequent requests.
+ * xol_free_insn_slot - free the slot allocated by xol_get_insn_slot()
*/
-static void xol_free_insn_slot(struct task_struct *tsk)
+static void xol_free_insn_slot(struct uprobe_task *utask)
{
- struct xol_area *area;
- unsigned long vma_end;
- unsigned long slot_addr;
+ struct xol_area *area = current->mm->uprobes_state.xol_area;
+ unsigned long offset = utask->xol_vaddr - area->vaddr;
+ unsigned int slot_nr;
- if (!tsk->mm || !tsk->mm->uprobes_state.xol_area || !tsk->utask)
+ utask->xol_vaddr = 0;
+ /* xol_vaddr must fit into [area->vaddr, area->vaddr + PAGE_SIZE) */
+ if (WARN_ON_ONCE(offset >= PAGE_SIZE))
return;
- slot_addr = tsk->utask->xol_vaddr;
- if (unlikely(!slot_addr))
- return;
-
- area = tsk->mm->uprobes_state.xol_area;
- vma_end = area->vaddr + PAGE_SIZE;
- if (area->vaddr <= slot_addr && slot_addr < vma_end) {
- unsigned long offset;
- int slot_nr;
-
- offset = slot_addr - area->vaddr;
- slot_nr = offset / UPROBE_XOL_SLOT_BYTES;
- if (slot_nr >= UINSNS_PER_PAGE)
- return;
-
- clear_bit(slot_nr, area->bitmap);
- atomic_dec(&area->slot_count);
- if (waitqueue_active(&area->wq))
- wake_up(&area->wq);
-
- tsk->utask->xol_vaddr = 0;
- }
+ slot_nr = offset / UPROBE_XOL_SLOT_BYTES;
+ clear_bit(slot_nr, area->bitmap);
+ smp_mb__after_atomic(); /* pairs with prepare_to_wait() */
+ if (waitqueue_active(&area->wq))
+ wake_up(&area->wq);
}
void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
@@ -1347,7 +1910,7 @@ void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
copy_to_page(page, vaddr, src, len);
/*
- * We probably need flush_icache_user_range() but it needs vma.
+ * We probably need flush_icache_user_page() but it needs vma.
* This should work on most of architectures by default. If
* architecture needs to do something different it can define
* its own version of the function.
@@ -1376,37 +1939,144 @@ unsigned long uprobe_get_trap_addr(struct pt_regs *regs)
return instruction_pointer(regs);
}
+static void ri_pool_push(struct uprobe_task *utask, struct return_instance *ri)
+{
+ ri->cons_cnt = 0;
+ ri->next = utask->ri_pool;
+ utask->ri_pool = ri;
+}
+
+static struct return_instance *ri_pool_pop(struct uprobe_task *utask)
+{
+ struct return_instance *ri = utask->ri_pool;
+
+ if (likely(ri))
+ utask->ri_pool = ri->next;
+
+ return ri;
+}
+
+static void ri_free(struct return_instance *ri)
+{
+ kfree(ri->extra_consumers);
+ kfree_rcu(ri, rcu);
+}
+
+static void free_ret_instance(struct uprobe_task *utask,
+ struct return_instance *ri, bool cleanup_hprobe)
+{
+ unsigned seq;
+
+ if (cleanup_hprobe) {
+ enum hprobe_state hstate;
+
+ (void)hprobe_consume(&ri->hprobe, &hstate);
+ hprobe_finalize(&ri->hprobe, hstate);
+ }
+
+ /*
+ * At this point return_instance is unlinked from utask's
+ * return_instances list and this has become visible to ri_timer().
+ * If seqcount now indicates that ri_timer's return instance
+ * processing loop isn't active, we can return ri into the pool of
+ * to-be-reused return instances for future uretprobes. If ri_timer()
+ * happens to be running right now, though, we fallback to safety and
+ * just perform RCU-delated freeing of ri.
+ * Admittedly, this is a rather simple use of seqcount, but it nicely
+ * abstracts away all the necessary memory barriers, so we use
+ * a well-supported kernel primitive here.
+ */
+ if (raw_seqcount_try_begin(&utask->ri_seqcount, seq)) {
+ /* immediate reuse of ri without RCU GP is OK */
+ ri_pool_push(utask, ri);
+ } else {
+ /* we might be racing with ri_timer(), so play it safe */
+ ri_free(ri);
+ }
+}
+
/*
* Called with no locks held.
- * Called in context of a exiting or a exec-ing thread.
+ * Called in context of an exiting or an exec-ing thread.
*/
void uprobe_free_utask(struct task_struct *t)
{
struct uprobe_task *utask = t->utask;
- struct return_instance *ri, *tmp;
+ struct return_instance *ri, *ri_next;
if (!utask)
return;
- if (utask->active_uprobe)
- put_uprobe(utask->active_uprobe);
+ t->utask = NULL;
+ WARN_ON_ONCE(utask->active_uprobe || utask->xol_vaddr);
+
+ timer_delete_sync(&utask->ri_timer);
ri = utask->return_instances;
while (ri) {
- tmp = ri;
- ri = ri->next;
+ ri_next = ri->next;
+ free_ret_instance(utask, ri, true /* cleanup_hprobe */);
+ ri = ri_next;
+ }
- put_uprobe(tmp->uprobe);
- kfree(tmp);
+ /* free_ret_instance() above might add to ri_pool, so this loop should come last */
+ ri = utask->ri_pool;
+ while (ri) {
+ ri_next = ri->next;
+ ri_free(ri);
+ ri = ri_next;
}
- xol_free_insn_slot(t);
kfree(utask);
- t->utask = NULL;
+}
+
+#define RI_TIMER_PERIOD (HZ / 10) /* 100 ms */
+
+#define for_each_ret_instance_rcu(pos, head) \
+ for (pos = rcu_dereference_raw(head); pos; pos = rcu_dereference_raw(pos->next))
+
+static void ri_timer(struct timer_list *timer)
+{
+ struct uprobe_task *utask = container_of(timer, struct uprobe_task, ri_timer);
+ struct return_instance *ri;
+
+ /* SRCU protects uprobe from reuse for the cmpxchg() inside hprobe_expire(). */
+ guard(srcu)(&uretprobes_srcu);
+ /* RCU protects return_instance from freeing. */
+ guard(rcu)();
+
+ /*
+ * See free_ret_instance() for notes on seqcount use.
+ * We also employ raw API variants to avoid lockdep false-positive
+ * warning complaining about enabled preemption. The timer can only be
+ * invoked once for a uprobe_task. Therefore there can only be one
+ * writer. The reader does not require an even sequence count to make
+ * progress, so it is OK to remain preemptible on PREEMPT_RT.
+ */
+ raw_write_seqcount_begin(&utask->ri_seqcount);
+
+ for_each_ret_instance_rcu(ri, utask->return_instances)
+ hprobe_expire(&ri->hprobe, false);
+
+ raw_write_seqcount_end(&utask->ri_seqcount);
+}
+
+static struct uprobe_task *alloc_utask(void)
+{
+ struct uprobe_task *utask;
+
+ utask = kzalloc(sizeof(*utask), GFP_KERNEL);
+ if (!utask)
+ return NULL;
+
+ timer_setup(&utask->ri_timer, ri_timer, 0);
+ seqcount_init(&utask->ri_seqcount);
+
+ return utask;
}
/*
- * Allocate a uprobe_task object for the task if if necessary.
+ * Allocate a uprobe_task object for the task if necessary.
* Called when the thread hits a breakpoint.
*
* Returns:
@@ -1416,57 +2086,101 @@ void uprobe_free_utask(struct task_struct *t)
static struct uprobe_task *get_utask(void)
{
if (!current->utask)
- current->utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
+ current->utask = alloc_utask();
return current->utask;
}
+static struct return_instance *alloc_return_instance(struct uprobe_task *utask)
+{
+ struct return_instance *ri;
+
+ ri = ri_pool_pop(utask);
+ if (ri)
+ return ri;
+
+ ri = kzalloc(sizeof(*ri), GFP_KERNEL);
+ if (!ri)
+ return ZERO_SIZE_PTR;
+
+ return ri;
+}
+
+static struct return_instance *dup_return_instance(struct return_instance *old)
+{
+ struct return_instance *ri;
+
+ ri = kmemdup(old, sizeof(*ri), GFP_KERNEL);
+ if (!ri)
+ return NULL;
+
+ if (unlikely(old->cons_cnt > 1)) {
+ ri->extra_consumers = kmemdup(old->extra_consumers,
+ sizeof(ri->extra_consumers[0]) * (old->cons_cnt - 1),
+ GFP_KERNEL);
+ if (!ri->extra_consumers) {
+ kfree(ri);
+ return NULL;
+ }
+ }
+
+ return ri;
+}
+
static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
{
struct uprobe_task *n_utask;
struct return_instance **p, *o, *n;
+ struct uprobe *uprobe;
- n_utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
+ n_utask = alloc_utask();
if (!n_utask)
return -ENOMEM;
t->utask = n_utask;
+ /* protect uprobes from freeing, we'll need try_get_uprobe() them */
+ guard(srcu)(&uretprobes_srcu);
+
p = &n_utask->return_instances;
for (o = o_utask->return_instances; o; o = o->next) {
- n = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
+ n = dup_return_instance(o);
if (!n)
return -ENOMEM;
- *n = *o;
- atomic_inc(&n->uprobe->ref);
- n->next = NULL;
+ /* if uprobe is non-NULL, we'll have an extra refcount for uprobe */
+ uprobe = hprobe_expire(&o->hprobe, true);
- *p = n;
+ /*
+ * New utask will have stable properly refcounted uprobe or
+ * NULL. Even if we failed to get refcounted uprobe, we still
+ * need to preserve full set of return_instances for proper
+ * uretprobe handling and nesting in forked task.
+ */
+ hprobe_init_stable(&n->hprobe, uprobe);
+
+ n->next = NULL;
+ rcu_assign_pointer(*p, n);
p = &n->next;
+
n_utask->depth++;
}
return 0;
}
-static void uprobe_warn(struct task_struct *t, const char *msg)
-{
- pr_warn("uprobe: %s:%d failed to %s\n",
- current->comm, current->pid, msg);
-}
-
static void dup_xol_work(struct callback_head *work)
{
if (current->flags & PF_EXITING)
return;
- if (!__create_xol_area(current->utask->dup_xol_addr))
+ if (!__create_xol_area(current->utask->dup_xol_addr) &&
+ !fatal_signal_pending(current))
uprobe_warn(current, "dup xol area");
}
/*
* Called in context of a new clone/fork from copy_process.
*/
-void uprobe_copy_process(struct task_struct *t, unsigned long flags)
+void uprobe_copy_process(struct task_struct *t, u64 flags)
{
struct uprobe_task *utask = current->utask;
struct mm_struct *mm = current->mm;
@@ -1493,7 +2207,7 @@ void uprobe_copy_process(struct task_struct *t, unsigned long flags)
t->utask->dup_xol_addr = area->vaddr;
init_task_work(&t->utask->dup_xol_work, dup_xol_work);
- task_work_add(t, &t->utask->dup_xol_work, true);
+ task_work_add(t, &t->utask->dup_xol_work, TWA_RESUME);
}
/*
@@ -1502,115 +2216,128 @@ void uprobe_copy_process(struct task_struct *t, unsigned long flags)
*
* Returns -1 in case the xol_area is not allocated.
*/
-static unsigned long get_trampoline_vaddr(void)
+unsigned long uprobe_get_trampoline_vaddr(void)
{
+ unsigned long trampoline_vaddr = UPROBE_NO_TRAMPOLINE_VADDR;
struct xol_area *area;
- unsigned long trampoline_vaddr = -1;
- area = current->mm->uprobes_state.xol_area;
- smp_read_barrier_depends();
+ /* Pairs with xol_add_vma() smp_store_release() */
+ area = READ_ONCE(current->mm->uprobes_state.xol_area); /* ^^^ */
if (area)
trampoline_vaddr = area->vaddr;
return trampoline_vaddr;
}
-static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
+static void cleanup_return_instances(struct uprobe_task *utask, bool chained,
+ struct pt_regs *regs)
{
- struct return_instance *ri;
- struct uprobe_task *utask;
+ struct return_instance *ri = utask->return_instances, *ri_next;
+ enum rp_check ctx = chained ? RP_CHECK_CHAIN_CALL : RP_CHECK_CALL;
+
+ while (ri && !arch_uretprobe_is_alive(ri, ctx, regs)) {
+ ri_next = ri->next;
+ rcu_assign_pointer(utask->return_instances, ri_next);
+ utask->depth--;
+
+ free_ret_instance(utask, ri, true /* cleanup_hprobe */);
+ ri = ri_next;
+ }
+}
+
+static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs,
+ struct return_instance *ri)
+{
+ struct uprobe_task *utask = current->utask;
unsigned long orig_ret_vaddr, trampoline_vaddr;
- bool chained = false;
+ bool chained;
+ int srcu_idx;
if (!get_xol_area())
- return;
-
- utask = get_utask();
- if (!utask)
- return;
+ goto free;
if (utask->depth >= MAX_URETPROBE_DEPTH) {
printk_ratelimited(KERN_INFO "uprobe: omit uretprobe due to"
" nestedness limit pid/tgid=%d/%d\n",
current->pid, current->tgid);
- return;
+ goto free;
}
- ri = kzalloc(sizeof(struct return_instance), GFP_KERNEL);
- if (!ri)
- goto fail;
-
- trampoline_vaddr = get_trampoline_vaddr();
+ trampoline_vaddr = uprobe_get_trampoline_vaddr();
orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs);
if (orig_ret_vaddr == -1)
- goto fail;
+ goto free;
+
+ /* drop the entries invalidated by longjmp() */
+ chained = (orig_ret_vaddr == trampoline_vaddr);
+ cleanup_return_instances(utask, chained, regs);
/*
* We don't want to keep trampoline address in stack, rather keep the
* original return address of first caller thru all the consequent
* instances. This also makes breakpoint unwrapping easier.
*/
- if (orig_ret_vaddr == trampoline_vaddr) {
+ if (chained) {
if (!utask->return_instances) {
/*
* This situation is not possible. Likely we have an
* attack from user-space.
*/
- pr_warn("uprobe: unable to set uretprobe pid/tgid=%d/%d\n",
- current->pid, current->tgid);
- goto fail;
+ uprobe_warn(current, "handle tail call");
+ goto free;
}
-
- chained = true;
orig_ret_vaddr = utask->return_instances->orig_ret_vaddr;
}
- atomic_inc(&uprobe->ref);
- ri->uprobe = uprobe;
+ /* __srcu_read_lock() because SRCU lock survives switch to user space */
+ srcu_idx = __srcu_read_lock(&uretprobes_srcu);
+
ri->func = instruction_pointer(regs);
+ ri->stack = user_stack_pointer(regs);
ri->orig_ret_vaddr = orig_ret_vaddr;
ri->chained = chained;
utask->depth++;
- /* add instance to the stack */
+ hprobe_init_leased(&ri->hprobe, uprobe, srcu_idx);
ri->next = utask->return_instances;
- utask->return_instances = ri;
+ rcu_assign_pointer(utask->return_instances, ri);
- return;
+ mod_timer(&utask->ri_timer, jiffies + RI_TIMER_PERIOD);
- fail:
- kfree(ri);
+ return;
+free:
+ ri_free(ri);
}
/* Prepare to single-step probed instruction out of line. */
static int
pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr)
{
- struct uprobe_task *utask;
- unsigned long xol_vaddr;
+ struct uprobe_task *utask = current->utask;
int err;
- utask = get_utask();
- if (!utask)
- return -ENOMEM;
+ if (!try_get_uprobe(uprobe))
+ return -EINVAL;
- xol_vaddr = xol_get_insn_slot(uprobe);
- if (!xol_vaddr)
- return -ENOMEM;
+ if (!xol_get_insn_slot(uprobe, utask)) {
+ err = -ENOMEM;
+ goto err_out;
+ }
- utask->xol_vaddr = xol_vaddr;
utask->vaddr = bp_vaddr;
-
err = arch_uprobe_pre_xol(&uprobe->arch, regs);
if (unlikely(err)) {
- xol_free_insn_slot(current);
- return err;
+ xol_free_insn_slot(utask);
+ goto err_out;
}
utask->active_uprobe = uprobe;
utask->state = UTASK_SSTEP;
return 0;
+err_out:
+ put_uprobe(uprobe);
+ return err;
}
/*
@@ -1632,10 +2359,9 @@ bool uprobe_deny_signal(void)
WARN_ON_ONCE(utask->state != UTASK_SSTEP);
- if (signal_pending(t)) {
- spin_lock_irq(&t->sighand->siglock);
+ if (task_sigpending(t)) {
+ utask->signal_denied = true;
clear_tsk_thread_flag(t, TIF_SIGPENDING);
- spin_unlock_irq(&t->sighand->siglock);
if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) {
utask->state = UTASK_SSTEP_TRAPPED;
@@ -1648,9 +2374,10 @@ bool uprobe_deny_signal(void)
static void mmf_recalc_uprobes(struct mm_struct *mm)
{
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ for_each_vma(vmi, vma) {
if (!valid_vma(vma, false))
continue;
/*
@@ -1663,7 +2390,7 @@ static void mmf_recalc_uprobes(struct mm_struct *mm)
return;
}
- clear_bit(MMF_HAS_UPROBES, &mm->flags);
+ mm_flags_clear(MMF_HAS_UPROBES, mm);
}
static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
@@ -1672,39 +2399,87 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
uprobe_opcode_t opcode;
int result;
+ if (WARN_ON_ONCE(!IS_ALIGNED(vaddr, UPROBE_SWBP_INSN_SIZE)))
+ return -EINVAL;
+
pagefault_disable();
- result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr,
- sizeof(opcode));
+ result = __get_user(opcode, (uprobe_opcode_t __user *)vaddr);
pagefault_enable();
if (likely(result == 0))
goto out;
- result = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &page, NULL);
+ result = get_user_pages(vaddr, 1, FOLL_FORCE, &page);
if (result < 0)
return result;
- copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
+ uprobe_copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
put_page(page);
out:
/* This needs to return true for any variant of the trap insn */
return is_trap_insn(&opcode);
}
-static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
+static struct uprobe *find_active_uprobe_speculative(unsigned long bp_vaddr)
+{
+ struct mm_struct *mm = current->mm;
+ struct uprobe *uprobe = NULL;
+ struct vm_area_struct *vma;
+ struct file *vm_file;
+ loff_t offset;
+ unsigned int seq;
+
+ guard(rcu)();
+
+ if (!mmap_lock_speculate_try_begin(mm, &seq))
+ return NULL;
+
+ vma = vma_lookup(mm, bp_vaddr);
+ if (!vma)
+ return NULL;
+
+ /*
+ * vm_file memory can be reused for another instance of struct file,
+ * but can't be freed from under us, so it's safe to read fields from
+ * it, even if the values are some garbage values; ultimately
+ * find_uprobe_rcu() + mmap_lock_speculation_end() check will ensure
+ * that whatever we speculatively found is correct
+ */
+ vm_file = READ_ONCE(vma->vm_file);
+ if (!vm_file)
+ return NULL;
+
+ offset = (loff_t)(vma->vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vma->vm_start);
+ uprobe = find_uprobe_rcu(vm_file->f_inode, offset);
+ if (!uprobe)
+ return NULL;
+
+ /* now double check that nothing about MM changed */
+ if (mmap_lock_speculate_retry(mm, seq))
+ return NULL;
+
+ return uprobe;
+}
+
+/* assumes being inside RCU protected region */
+static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swbp)
{
struct mm_struct *mm = current->mm;
struct uprobe *uprobe = NULL;
struct vm_area_struct *vma;
- down_read(&mm->mmap_sem);
- vma = find_vma(mm, bp_vaddr);
- if (vma && vma->vm_start <= bp_vaddr) {
- if (valid_vma(vma, false)) {
+ uprobe = find_active_uprobe_speculative(bp_vaddr);
+ if (uprobe)
+ return uprobe;
+
+ mmap_read_lock(mm);
+ vma = vma_lookup(mm, bp_vaddr);
+ if (vma) {
+ if (vma->vm_file) {
struct inode *inode = file_inode(vma->vm_file);
loff_t offset = vaddr_to_offset(vma, bp_vaddr);
- uprobe = find_uprobe(inode, offset);
+ uprobe = find_uprobe_rcu(inode, offset);
}
if (!uprobe)
@@ -1713,104 +2488,217 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
*is_swbp = -EFAULT;
}
- if (!uprobe && test_and_clear_bit(MMF_RECALC_UPROBES, &mm->flags))
+ if (!uprobe && mm_flags_test_and_clear(MMF_RECALC_UPROBES, mm))
mmf_recalc_uprobes(mm);
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
return uprobe;
}
+static struct return_instance *push_consumer(struct return_instance *ri, __u64 id, __u64 cookie)
+{
+ struct return_consumer *ric;
+
+ if (unlikely(ri == ZERO_SIZE_PTR))
+ return ri;
+
+ if (unlikely(ri->cons_cnt > 0)) {
+ ric = krealloc(ri->extra_consumers, sizeof(*ric) * ri->cons_cnt, GFP_KERNEL);
+ if (!ric) {
+ ri_free(ri);
+ return ZERO_SIZE_PTR;
+ }
+ ri->extra_consumers = ric;
+ }
+
+ ric = likely(ri->cons_cnt == 0) ? &ri->consumer : &ri->extra_consumers[ri->cons_cnt - 1];
+ ric->id = id;
+ ric->cookie = cookie;
+
+ ri->cons_cnt++;
+ return ri;
+}
+
+static struct return_consumer *
+return_consumer_find(struct return_instance *ri, int *iter, int id)
+{
+ struct return_consumer *ric;
+ int idx;
+
+ for (idx = *iter; idx < ri->cons_cnt; idx++)
+ {
+ ric = likely(idx == 0) ? &ri->consumer : &ri->extra_consumers[idx - 1];
+ if (ric->id == id) {
+ *iter = idx + 1;
+ return ric;
+ }
+ }
+
+ return NULL;
+}
+
+static bool ignore_ret_handler(int rc)
+{
+ return rc == UPROBE_HANDLER_REMOVE || rc == UPROBE_HANDLER_IGNORE;
+}
+
static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
{
struct uprobe_consumer *uc;
- int remove = UPROBE_HANDLER_REMOVE;
- bool need_prep = false; /* prepare return uprobe, when needed */
+ bool has_consumers = false, remove = true;
+ struct return_instance *ri = NULL;
+ struct uprobe_task *utask = current->utask;
- down_read(&uprobe->register_rwsem);
- for (uc = uprobe->consumers; uc; uc = uc->next) {
+ utask->auprobe = &uprobe->arch;
+
+ list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) {
+ bool session = uc->handler && uc->ret_handler;
+ __u64 cookie = 0;
int rc = 0;
if (uc->handler) {
- rc = uc->handler(uc, regs);
- WARN(rc & ~UPROBE_HANDLER_MASK,
- "bad rc=0x%x from %pf()\n", rc, uc->handler);
+ rc = uc->handler(uc, regs, &cookie);
+ WARN(rc < 0 || rc > 2,
+ "bad rc=0x%x from %ps()\n", rc, uc->handler);
}
- if (uc->ret_handler)
- need_prep = true;
+ remove &= rc == UPROBE_HANDLER_REMOVE;
+ has_consumers = true;
+
+ if (!uc->ret_handler || ignore_ret_handler(rc))
+ continue;
+
+ if (!ri)
+ ri = alloc_return_instance(utask);
- remove &= rc;
+ if (session)
+ ri = push_consumer(ri, uc->id, cookie);
}
+ utask->auprobe = NULL;
+
+ if (!ZERO_OR_NULL_PTR(ri))
+ prepare_uretprobe(uprobe, regs, ri);
- if (need_prep && !remove)
- prepare_uretprobe(uprobe, regs); /* put bp at return */
+ if (remove && has_consumers) {
+ down_read(&uprobe->register_rwsem);
- if (remove && uprobe->consumers) {
- WARN_ON(!uprobe_is_active(uprobe));
- unapply_uprobe(uprobe, current->mm);
+ /* re-check that removal is still required, this time under lock */
+ if (!filter_chain(uprobe, current->mm)) {
+ WARN_ON(!uprobe_is_active(uprobe));
+ unapply_uprobe(uprobe, current->mm);
+ }
+
+ up_read(&uprobe->register_rwsem);
}
- up_read(&uprobe->register_rwsem);
}
static void
-handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs)
+handle_uretprobe_chain(struct return_instance *ri, struct uprobe *uprobe, struct pt_regs *regs)
{
- struct uprobe *uprobe = ri->uprobe;
+ struct return_consumer *ric;
struct uprobe_consumer *uc;
+ int ric_idx = 0;
+
+ /* all consumers unsubscribed meanwhile */
+ if (unlikely(!uprobe))
+ return;
- down_read(&uprobe->register_rwsem);
- for (uc = uprobe->consumers; uc; uc = uc->next) {
- if (uc->ret_handler)
- uc->ret_handler(uc, ri->func, regs);
+ rcu_read_lock_trace();
+ list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) {
+ bool session = uc->handler && uc->ret_handler;
+
+ if (uc->ret_handler) {
+ ric = return_consumer_find(ri, &ric_idx, uc->id);
+ if (!session || ric)
+ uc->ret_handler(uc, ri->func, regs, ric ? &ric->cookie : NULL);
+ }
}
- up_read(&uprobe->register_rwsem);
+ rcu_read_unlock_trace();
}
-static bool handle_trampoline(struct pt_regs *regs)
+static struct return_instance *find_next_ret_chain(struct return_instance *ri)
{
- struct uprobe_task *utask;
- struct return_instance *ri, *tmp;
bool chained;
+ do {
+ chained = ri->chained;
+ ri = ri->next; /* can't be NULL if chained */
+ } while (chained);
+
+ return ri;
+}
+
+void uprobe_handle_trampoline(struct pt_regs *regs)
+{
+ struct uprobe_task *utask;
+ struct return_instance *ri, *ri_next, *next_chain;
+ struct uprobe *uprobe;
+ enum hprobe_state hstate;
+ bool valid;
+
utask = current->utask;
if (!utask)
- return false;
+ goto sigill;
ri = utask->return_instances;
if (!ri)
- return false;
+ goto sigill;
- /*
- * TODO: we should throw out return_instance's invalidated by
- * longjmp(), currently we assume that the probed function always
- * returns.
- */
- instruction_pointer_set(regs, ri->orig_ret_vaddr);
+ do {
+ /*
+ * We should throw out the frames invalidated by longjmp().
+ * If this chain is valid, then the next one should be alive
+ * or NULL; the latter case means that nobody but ri->func
+ * could hit this trampoline on return. TODO: sigaltstack().
+ */
+ next_chain = find_next_ret_chain(ri);
+ valid = !next_chain || arch_uretprobe_is_alive(next_chain, RP_CHECK_RET, regs);
+
+ instruction_pointer_set(regs, ri->orig_ret_vaddr);
+ do {
+ /* pop current instance from the stack of pending return instances,
+ * as it's not pending anymore: we just fixed up original
+ * instruction pointer in regs and are about to call handlers;
+ * this allows fixup_uretprobe_trampoline_entries() to properly fix up
+ * captured stack traces from uretprobe handlers, in which pending
+ * trampoline addresses on the stack are replaced with correct
+ * original return addresses
+ */
+ ri_next = ri->next;
+ rcu_assign_pointer(utask->return_instances, ri_next);
+ utask->depth--;
- for (;;) {
- handle_uretprobe_chain(ri, regs);
+ uprobe = hprobe_consume(&ri->hprobe, &hstate);
+ if (valid)
+ handle_uretprobe_chain(ri, uprobe, regs);
+ hprobe_finalize(&ri->hprobe, hstate);
- chained = ri->chained;
- put_uprobe(ri->uprobe);
+ /* We already took care of hprobe, no need to waste more time on that. */
+ free_ret_instance(utask, ri, false /* !cleanup_hprobe */);
+ ri = ri_next;
+ } while (ri != next_chain);
+ } while (!valid);
- tmp = ri;
- ri = ri->next;
- kfree(tmp);
- utask->depth--;
+ return;
- if (!chained)
- break;
- BUG_ON(!ri);
- }
+sigill:
+ uprobe_warn(current, "handle uretprobe, sending SIGILL.");
+ force_sig(SIGILL);
+}
- utask->return_instances = ri;
+bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs)
+{
+ return false;
+}
+bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx,
+ struct pt_regs *regs)
+{
return true;
}
-bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs)
+void __weak arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
{
- return false;
}
/*
@@ -1821,22 +2709,19 @@ static void handle_swbp(struct pt_regs *regs)
{
struct uprobe *uprobe;
unsigned long bp_vaddr;
- int uninitialized_var(is_swbp);
+ int is_swbp;
bp_vaddr = uprobe_get_swbp_addr(regs);
- if (bp_vaddr == get_trampoline_vaddr()) {
- if (handle_trampoline(regs))
- return;
+ if (bp_vaddr == uprobe_get_trampoline_vaddr())
+ return uprobe_handle_trampoline(regs);
- pr_warn("uprobe: unable to handle uretprobe pid/tgid=%d/%d\n",
- current->pid, current->tgid);
- }
+ rcu_read_lock_trace();
- uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
+ uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp);
if (!uprobe) {
if (is_swbp > 0) {
/* No matching uprobe; signal SIGTRAP. */
- send_sig(SIGTRAP, current, 0);
+ force_sig(SIGTRAP);
} else {
/*
* Either we raced with uprobe_unregister() or we can't
@@ -1848,7 +2733,7 @@ static void handle_swbp(struct pt_regs *regs)
*/
instruction_pointer_set(regs, bp_vaddr);
}
- return;
+ goto out;
}
/* change it in advance for ->handler() and restart */
@@ -1859,10 +2744,18 @@ static void handle_swbp(struct pt_regs *regs)
* After we hit the bp, _unregister + _register can install the
* new and not-yet-analyzed uprobe at the same address, restart.
*/
- smp_rmb(); /* pairs with wmb() in install_breakpoint() */
if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags)))
goto out;
+ /*
+ * Pairs with the smp_wmb() in prepare_uprobe().
+ *
+ * Guarantees that if we see the UPROBE_COPY_INSN bit set, then
+ * we must also see the stores to &uprobe->arch performed by the
+ * prepare_uprobe() call.
+ */
+ smp_rmb();
+
/* Tracing handlers use ->utask to communicate with fetch methods */
if (!get_utask())
goto out;
@@ -1872,15 +2765,42 @@ static void handle_swbp(struct pt_regs *regs)
handler_chain(uprobe, regs);
+ /* Try to optimize after first hit. */
+ arch_uprobe_optimize(&uprobe->arch, bp_vaddr);
+
+ /*
+ * If user decided to take execution elsewhere, it makes little sense
+ * to execute the original instruction, so let's skip it.
+ */
+ if (instruction_pointer(regs) != bp_vaddr)
+ goto out;
+
if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
goto out;
- if (!pre_ssout(uprobe, regs, bp_vaddr))
- return;
+ if (pre_ssout(uprobe, regs, bp_vaddr))
+ goto out;
- /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */
out:
- put_uprobe(uprobe);
+ /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */
+ rcu_read_unlock_trace();
+}
+
+void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr)
+{
+ struct uprobe *uprobe;
+ int is_swbp;
+
+ guard(rcu_tasks_trace)();
+
+ uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp);
+ if (!uprobe)
+ return;
+ if (!get_utask())
+ return;
+ if (arch_uprobe_ignore(&uprobe->arch, regs))
+ return;
+ handler_chain(uprobe, regs);
}
/*
@@ -1903,15 +2823,16 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
put_uprobe(uprobe);
utask->active_uprobe = NULL;
utask->state = UTASK_RUNNING;
- xol_free_insn_slot(current);
+ xol_free_insn_slot(utask);
- spin_lock_irq(&current->sighand->siglock);
- recalc_sigpending(); /* see uprobe_deny_signal() */
- spin_unlock_irq(&current->sighand->siglock);
+ if (utask->signal_denied) {
+ set_thread_flag(TIF_SIGPENDING);
+ utask->signal_denied = false;
+ }
if (unlikely(err)) {
uprobe_warn(current, "execute the probed insn, sending SIGILL.");
- force_sig_info(SIGILL, SEND_SIG_FORCED, current);
+ force_sig(SIGILL);
}
}
@@ -1948,7 +2869,7 @@ int uprobe_pre_sstep_notifier(struct pt_regs *regs)
if (!current->mm)
return 0;
- if (!test_bit(MMF_HAS_UPROBES, &current->mm->flags) &&
+ if (!mm_flags_test(MMF_HAS_UPROBES, current->mm) &&
(!current->utask || !current->utask->return_instances))
return 0;
@@ -1978,16 +2899,12 @@ static struct notifier_block uprobe_exception_nb = {
.priority = INT_MAX-1, /* notified after kprobes, kgdb */
};
-static int __init init_uprobes(void)
+void __init uprobes_init(void)
{
int i;
for (i = 0; i < UPROBES_HASH_SZ; i++)
mutex_init(&uprobes_mmap_mutex[i]);
- if (percpu_init_rwsem(&dup_mmap_sem))
- return -ENOMEM;
-
- return register_die_notifier(&uprobe_exception_nb);
+ BUG_ON(register_die_notifier(&uprobe_exception_nb));
}
-__initcall(init_uprobes);
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index 6873bb3e6b7e..33f07c5f2515 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Handling of different ABIs (personalities).
*
@@ -18,7 +19,6 @@
#include <linux/syscalls.h>
#include <linux/sysctl.h>
#include <linux/types.h>
-#include <linux/fs_struct.h>
#ifdef CONFIG_PROC_FS
static int execdomains_proc_show(struct seq_file *m, void *v)
@@ -27,21 +27,9 @@ static int execdomains_proc_show(struct seq_file *m, void *v)
return 0;
}
-static int execdomains_proc_open(struct inode *inode, struct file *file)
-{
- return single_open(file, execdomains_proc_show, NULL);
-}
-
-static const struct file_operations execdomains_proc_fops = {
- .open = execdomains_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
static int __init proc_execdomains_init(void)
{
- proc_create("execdomains", 0, NULL, &execdomains_proc_fops);
+ proc_create_single("execdomains", 0, NULL, execdomains_proc_show);
return 0;
}
module_init(proc_execdomains_init);
diff --git a/kernel/exit.c b/kernel/exit.c
index 22fcc05dec40..8a87021211ae 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/kernel/exit.c
*
@@ -6,6 +7,12 @@
#include <linux/mm.h>
#include <linux/slab.h>
+#include <linux/sched/autogroup.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/stat.h>
+#include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
+#include <linux/sched/cputime.h>
#include <linux/interrupt.h>
#include <linux/module.h>
#include <linux/capability.h>
@@ -14,12 +21,10 @@
#include <linux/tty.h>
#include <linux/iocontext.h>
#include <linux/key.h>
-#include <linux/security.h>
#include <linux/cpu.h>
#include <linux/acct.h>
#include <linux/tsacct_kern.h>
#include <linux/file.h>
-#include <linux/fdtable.h>
#include <linux/freezer.h>
#include <linux/binfmts.h>
#include <linux/nsproxy.h>
@@ -42,9 +47,9 @@
#include <linux/pipe_fs_i.h>
#include <linux/audit.h> /* for audit_free() */
#include <linux/resource.h>
-#include <linux/blkdev.h>
#include <linux/task_io_accounting_ops.h>
-#include <linux/tracehook.h>
+#include <linux/blkdev.h>
+#include <linux/task_work.h>
#include <linux/fs_struct.h>
#include <linux/init_task.h>
#include <linux/perf_event.h>
@@ -53,65 +58,132 @@
#include <linux/oom.h>
#include <linux/writeback.h>
#include <linux/shm.h>
+#include <linux/kcov.h>
+#include <linux/kmsan.h>
+#include <linux/random.h>
+#include <linux/rcuwait.h>
+#include <linux/compat.h>
+#include <linux/io_uring.h>
+#include <linux/kprobes.h>
+#include <linux/rethook.h>
+#include <linux/sysfs.h>
+#include <linux/user_events.h>
+#include <linux/unwind_deferred.h>
+#include <linux/uaccess.h>
+#include <linux/pidfs.h>
+
+#include <uapi/linux/wait.h>
-#include <asm/uaccess.h>
#include <asm/unistd.h>
-#include <asm/pgtable.h>
#include <asm/mmu_context.h>
-static void exit_mm(struct task_struct *tsk);
+#include "exit.h"
+
+/*
+ * The default value should be high enough to not crash a system that randomly
+ * crashes its kernel from time to time, but low enough to at least not permit
+ * overflowing 32-bit refcounts or the ldsem writer count.
+ */
+static unsigned int oops_limit = 10000;
+
+#ifdef CONFIG_SYSCTL
+static const struct ctl_table kern_exit_table[] = {
+ {
+ .procname = "oops_limit",
+ .data = &oops_limit,
+ .maxlen = sizeof(oops_limit),
+ .mode = 0644,
+ .proc_handler = proc_douintvec,
+ },
+};
+
+static __init int kernel_exit_sysctls_init(void)
+{
+ register_sysctl_init("kernel", kern_exit_table);
+ return 0;
+}
+late_initcall(kernel_exit_sysctls_init);
+#endif
+
+static atomic_t oops_count = ATOMIC_INIT(0);
+
+#ifdef CONFIG_SYSFS
+static ssize_t oops_count_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *page)
+{
+ return sysfs_emit(page, "%d\n", atomic_read(&oops_count));
+}
+
+static struct kobj_attribute oops_count_attr = __ATTR_RO(oops_count);
+
+static __init int kernel_exit_sysfs_init(void)
+{
+ sysfs_add_file_to_group(kernel_kobj, &oops_count_attr.attr, NULL);
+ return 0;
+}
+late_initcall(kernel_exit_sysfs_init);
+#endif
+
+/*
+ * For things release_task() would like to do *after* tasklist_lock is released.
+ */
+struct release_task_post {
+ struct pid *pids[PIDTYPE_MAX];
+};
-static void __unhash_process(struct task_struct *p, bool group_dead)
+static void __unhash_process(struct release_task_post *post, struct task_struct *p,
+ bool group_dead)
{
+ struct pid *pid = task_pid(p);
+
nr_threads--;
- detach_pid(p, PIDTYPE_PID);
+
+ detach_pid(post->pids, p, PIDTYPE_PID);
+ wake_up_all(&pid->wait_pidfd);
+
if (group_dead) {
- detach_pid(p, PIDTYPE_PGID);
- detach_pid(p, PIDTYPE_SID);
+ detach_pid(post->pids, p, PIDTYPE_TGID);
+ detach_pid(post->pids, p, PIDTYPE_PGID);
+ detach_pid(post->pids, p, PIDTYPE_SID);
list_del_rcu(&p->tasks);
list_del_init(&p->sibling);
__this_cpu_dec(process_counts);
}
- list_del_rcu(&p->thread_group);
list_del_rcu(&p->thread_node);
}
/*
* This function expects the tasklist_lock write-locked.
*/
-static void __exit_signal(struct task_struct *tsk)
+static void __exit_signal(struct release_task_post *post, struct task_struct *tsk)
{
struct signal_struct *sig = tsk->signal;
bool group_dead = thread_group_leader(tsk);
struct sighand_struct *sighand;
- struct tty_struct *uninitialized_var(tty);
- cputime_t utime, stime;
+ struct tty_struct *tty;
+ u64 utime, stime;
sighand = rcu_dereference_check(tsk->sighand,
lockdep_tasklist_lock_is_held());
spin_lock(&sighand->siglock);
+#ifdef CONFIG_POSIX_TIMERS
posix_cpu_timers_exit(tsk);
- if (group_dead) {
+ if (group_dead)
posix_cpu_timers_exit_group(tsk);
+#endif
+
+ if (group_dead) {
tty = sig->tty;
sig->tty = NULL;
} else {
/*
- * This can only happen if the caller is de_thread().
- * FIXME: this is the temporary hack, we should teach
- * posix-cpu-timers to handle this case correctly.
- */
- if (unlikely(has_group_leader_pid(tsk)))
- posix_cpu_timers_exit_group(tsk);
-
- /*
* If there is any task waiting for the group exit
* then notify it:
*/
if (sig->notify_count > 0 && !--sig->notify_count)
- wake_up_process(sig->group_exit_task);
+ wake_up_process(sig->group_exec_task);
if (tsk == sig->curr_target)
sig->curr_target = next_thread(tsk);
@@ -137,51 +209,60 @@ static void __exit_signal(struct task_struct *tsk)
task_io_accounting_add(&sig->ioac, &tsk->ioac);
sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
sig->nr_threads--;
- __unhash_process(tsk, group_dead);
+ __unhash_process(post, tsk, group_dead);
write_sequnlock(&sig->stats_lock);
- /*
- * Do this under ->siglock, we can race with another thread
- * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
- */
- flush_sigqueue(&tsk->pending);
tsk->sighand = NULL;
spin_unlock(&sighand->siglock);
__cleanup_sighand(sighand);
- clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
- if (group_dead) {
- flush_sigqueue(&sig->shared_pending);
+ if (group_dead)
tty_kref_put(tty);
- }
}
static void delayed_put_task_struct(struct rcu_head *rhp)
{
struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
+ kprobe_flush_task(tsk);
+ rethook_flush_task(tsk);
perf_event_delayed_put(tsk);
trace_sched_process_free(tsk);
put_task_struct(tsk);
}
+void put_task_struct_rcu_user(struct task_struct *task)
+{
+ if (refcount_dec_and_test(&task->rcu_users))
+ call_rcu(&task->rcu, delayed_put_task_struct);
+}
+
+void __weak release_thread(struct task_struct *dead_task)
+{
+}
void release_task(struct task_struct *p)
{
+ struct release_task_post post;
struct task_struct *leader;
+ struct pid *thread_pid;
int zap_leader;
repeat:
+ memset(&post, 0, sizeof(post));
+
/* don't need to get the RCU readlock here - the process is dead and
- * can't be modifying its own credentials. But shut RCU-lockdep up */
- rcu_read_lock();
- atomic_dec(&__task_cred(p)->user->processes);
- rcu_read_unlock();
+ * can't be modifying its own credentials. */
+ dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
- proc_flush_task(p);
+ pidfs_exit(p);
+ cgroup_task_release(p);
+
+ /* Retrieve @thread_pid before __unhash_process() may set it to NULL. */
+ thread_pid = task_pid(p);
write_lock_irq(&tasklist_lock);
ptrace_release_task(p);
- __exit_signal(p);
+ __exit_signal(&post, p);
/*
* If we are the last non-leader member of the thread
@@ -192,6 +273,9 @@ repeat:
leader = p->group_leader;
if (leader != p && thread_group_empty(leader)
&& leader->exit_state == EXIT_ZOMBIE) {
+ /* for pidfs_exit() and do_notify_parent() */
+ if (leader->signal->flags & SIGNAL_GROUP_EXIT)
+ leader->exit_code = leader->signal->group_exit_code;
/*
* If we were the last child thread and the leader has
* exited already, and the leader's parent ignores SIGCHLD,
@@ -203,14 +287,59 @@ repeat:
}
write_unlock_irq(&tasklist_lock);
+ /* @thread_pid can't go away until free_pids() below */
+ proc_flush_pid(thread_pid);
+ exit_cred_namespaces(p);
+ add_device_randomness(&p->se.sum_exec_runtime,
+ sizeof(p->se.sum_exec_runtime));
+ free_pids(post.pids);
release_thread(p);
- call_rcu(&p->rcu, delayed_put_task_struct);
+ /*
+ * This task was already removed from the process/thread/pid lists
+ * and lock_task_sighand(p) can't succeed. Nobody else can touch
+ * ->pending or, if group dead, signal->shared_pending. We can call
+ * flush_sigqueue() lockless.
+ */
+ flush_sigqueue(&p->pending);
+ if (thread_group_leader(p))
+ flush_sigqueue(&p->signal->shared_pending);
+
+ put_task_struct_rcu_user(p);
p = leader;
if (unlikely(zap_leader))
goto repeat;
}
+int rcuwait_wake_up(struct rcuwait *w)
+{
+ int ret = 0;
+ struct task_struct *task;
+
+ rcu_read_lock();
+
+ /*
+ * Order condition vs @task, such that everything prior to the load
+ * of @task is visible. This is the condition as to why the user called
+ * rcuwait_wake() in the first place. Pairs with set_current_state()
+ * barrier (A) in rcuwait_wait_event().
+ *
+ * WAIT WAKE
+ * [S] tsk = current [S] cond = true
+ * MB (A) MB (B)
+ * [L] cond [L] tsk
+ */
+ smp_mb(); /* (B) */
+
+ task = rcu_dereference(w->task);
+ if (task)
+ ret = wake_up_process(task);
+ rcu_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rcuwait_wake_up);
+
/*
* Determine if a process group is "orphaned", according to the POSIX
* definition in 2.2.2.52. Orphaned process groups are not to be affected
@@ -292,15 +421,73 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
}
}
+static void coredump_task_exit(struct task_struct *tsk,
+ struct core_state *core_state)
+{
+ struct core_thread self;
+
+ self.task = tsk;
+ if (self.task->flags & PF_SIGNALED)
+ self.next = xchg(&core_state->dumper.next, &self);
+ else
+ self.task = NULL;
+ /*
+ * Implies mb(), the result of xchg() must be visible
+ * to core_state->dumper.
+ */
+ if (atomic_dec_and_test(&core_state->nr_threads))
+ complete(&core_state->startup);
+
+ for (;;) {
+ set_current_state(TASK_IDLE|TASK_FREEZABLE);
+ if (!self.task) /* see coredump_finish() */
+ break;
+ schedule();
+ }
+ __set_current_state(TASK_RUNNING);
+}
+
#ifdef CONFIG_MEMCG
+/* drops tasklist_lock if succeeds */
+static bool __try_to_set_owner(struct task_struct *tsk, struct mm_struct *mm)
+{
+ bool ret = false;
+
+ task_lock(tsk);
+ if (likely(tsk->mm == mm)) {
+ /* tsk can't pass exit_mm/exec_mmap and exit */
+ read_unlock(&tasklist_lock);
+ WRITE_ONCE(mm->owner, tsk);
+ lru_gen_migrate_mm(mm);
+ ret = true;
+ }
+ task_unlock(tsk);
+ return ret;
+}
+
+static bool try_to_set_owner(struct task_struct *g, struct mm_struct *mm)
+{
+ struct task_struct *t;
+
+ for_each_thread(g, t) {
+ struct mm_struct *t_mm = READ_ONCE(t->mm);
+ if (t_mm == mm) {
+ if (__try_to_set_owner(t, mm))
+ return true;
+ } else if (t_mm)
+ break;
+ }
+
+ return false;
+}
+
/*
* A task is exiting. If it owned this mm, find a new owner for the mm.
*/
void mm_update_next_owner(struct mm_struct *mm)
{
- struct task_struct *c, *g, *p = current;
+ struct task_struct *g, *p = current;
-retry:
/*
* If the exiting or execing task is not the owner, it's
* someone else's problem.
@@ -313,7 +500,7 @@ retry:
* freed task structure.
*/
if (atomic_read(&mm->mm_users) <= 1) {
- mm->owner = NULL;
+ WRITE_ONCE(mm->owner, NULL);
return;
}
@@ -321,31 +508,27 @@ retry:
/*
* Search in the children
*/
- list_for_each_entry(c, &p->children, sibling) {
- if (c->mm == mm)
- goto assign_new_owner;
+ list_for_each_entry(g, &p->children, sibling) {
+ if (try_to_set_owner(g, mm))
+ goto ret;
}
-
/*
* Search in the siblings
*/
- list_for_each_entry(c, &p->real_parent->children, sibling) {
- if (c->mm == mm)
- goto assign_new_owner;
+ list_for_each_entry(g, &p->real_parent->children, sibling) {
+ if (try_to_set_owner(g, mm))
+ goto ret;
}
-
/*
* Search through everything else, we should not get here often.
*/
for_each_process(g) {
+ if (atomic_read(&mm->mm_users) <= 1)
+ break;
if (g->flags & PF_KTHREAD)
continue;
- for_each_thread(g, c) {
- if (c->mm == mm)
- goto assign_new_owner;
- if (c->mm)
- break;
- }
+ if (try_to_set_owner(g, mm))
+ goto ret;
}
read_unlock(&tasklist_lock);
/*
@@ -353,30 +536,10 @@ retry:
* most likely racing with swapoff (try_to_unuse()) or /proc or
* ptrace or page migration (get_task_mm()). Mark owner as NULL.
*/
- mm->owner = NULL;
+ WRITE_ONCE(mm->owner, NULL);
+ ret:
return;
-assign_new_owner:
- BUG_ON(c == p);
- get_task_struct(c);
- /*
- * The task_lock protects c->mm from changing.
- * We always want mm->owner->mm == mm
- */
- task_lock(c);
- /*
- * Delay read_unlock() till we have the task_lock()
- * to ensure that c does not slip away underneath us
- */
- read_unlock(&tasklist_lock);
- if (c->mm != mm) {
- task_unlock(c);
- put_task_struct(c);
- goto retry;
- }
- mm->owner = c;
- task_unlock(c);
- put_task_struct(c);
}
#endif /* CONFIG_MEMCG */
@@ -384,59 +547,40 @@ assign_new_owner:
* Turn us into a lazy TLB process if we
* aren't already..
*/
-static void exit_mm(struct task_struct *tsk)
+static void exit_mm(void)
{
- struct mm_struct *mm = tsk->mm;
- struct core_state *core_state;
+ struct mm_struct *mm = current->mm;
- mm_release(tsk, mm);
+ exit_mm_release(current, mm);
if (!mm)
return;
- sync_mm_rss(mm);
+ mmap_read_lock(mm);
+ mmgrab_lazy_tlb(mm);
+ BUG_ON(mm != current->active_mm);
+ /* more a memory barrier than a real lock */
+ task_lock(current);
/*
- * Serialize with any possible pending coredump.
- * We must hold mmap_sem around checking core_state
- * and clearing tsk->mm. The core-inducing thread
- * will increment ->nr_threads for each thread in the
- * group with ->mm != NULL.
+ * When a thread stops operating on an address space, the loop
+ * in membarrier_private_expedited() may not observe that
+ * tsk->mm, and the loop in membarrier_global_expedited() may
+ * not observe a MEMBARRIER_STATE_GLOBAL_EXPEDITED
+ * rq->membarrier_state, so those would not issue an IPI.
+ * Membarrier requires a memory barrier after accessing
+ * user-space memory, before clearing tsk->mm or the
+ * rq->membarrier_state.
*/
- down_read(&mm->mmap_sem);
- core_state = mm->core_state;
- if (core_state) {
- struct core_thread self;
-
- up_read(&mm->mmap_sem);
-
- self.task = tsk;
- self.next = xchg(&core_state->dumper.next, &self);
- /*
- * Implies mb(), the result of xchg() must be visible
- * to core_state->dumper.
- */
- if (atomic_dec_and_test(&core_state->nr_threads))
- complete(&core_state->startup);
-
- for (;;) {
- set_task_state(tsk, TASK_UNINTERRUPTIBLE);
- if (!self.task) /* see coredump_finish() */
- break;
- freezable_schedule();
- }
- __set_task_state(tsk, TASK_RUNNING);
- down_read(&mm->mmap_sem);
- }
- atomic_inc(&mm->mm_count);
- BUG_ON(mm != tsk->active_mm);
- /* more a memory barrier than a real lock */
- task_lock(tsk);
- tsk->mm = NULL;
- up_read(&mm->mmap_sem);
+ smp_mb__after_spinlock();
+ local_irq_disable();
+ current->mm = NULL;
+ membarrier_update_current_mm(NULL);
enter_lazy_tlb(mm, current);
- task_unlock(tsk);
+ local_irq_enable();
+ task_unlock(current);
+ mmap_read_unlock(mm);
mm_update_next_owner(mm);
mmput(mm);
if (test_thread_flag(TIF_MEMDIE))
- unmark_oom_victim();
+ exit_oom_victim();
}
static struct task_struct *find_alive_thread(struct task_struct *p)
@@ -450,12 +594,14 @@ static struct task_struct *find_alive_thread(struct task_struct *p)
return NULL;
}
-static struct task_struct *find_child_reaper(struct task_struct *father)
+static struct task_struct *find_child_reaper(struct task_struct *father,
+ struct list_head *dead)
__releases(&tasklist_lock)
__acquires(&tasklist_lock)
{
struct pid_namespace *pid_ns = task_active_pid_ns(father);
struct task_struct *reaper = pid_ns->child_reaper;
+ struct task_struct *p, *n;
if (likely(reaper != father))
return reaper;
@@ -467,10 +613,12 @@ static struct task_struct *find_child_reaper(struct task_struct *father)
}
write_unlock_irq(&tasklist_lock);
- if (unlikely(pid_ns == &init_pid_ns)) {
- panic("Attempted to kill init! exitcode=0x%08x\n",
- father->signal->group_exit_code ?: father->exit_code);
+
+ list_for_each_entry_safe(p, n, dead, ptrace_entry) {
+ list_del_init(&p->ptrace_entry);
+ release_task(p);
}
+
zap_pid_ns_processes(pid_ns);
write_lock_irq(&tasklist_lock);
@@ -494,15 +642,18 @@ static struct task_struct *find_new_reaper(struct task_struct *father,
return thread;
if (father->signal->has_child_subreaper) {
+ unsigned int ns_level = task_pid(father)->level;
/*
* Find the first ->is_child_subreaper ancestor in our pid_ns.
- * We start from father to ensure we can not look into another
- * namespace, this is safe because all its threads are dead.
+ * We can't check reaper != child_reaper to ensure we do not
+ * cross the namespaces, the exiting parent could be injected
+ * by setns() + fork().
+ * We check pid->level, this is slightly more efficient than
+ * task_active_pid_ns(reaper) != task_active_pid_ns(father).
*/
- for (reaper = father;
- !same_thread_group(reaper, child_reaper);
+ for (reaper = father->real_parent;
+ task_pid(reaper)->level == ns_level;
reaper = reaper->real_parent) {
- /* call_usermodehelper() descendants need this check */
if (reaper == &init_task)
break;
if (!reaper->signal->is_child_subreaper)
@@ -541,12 +692,7 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p,
}
/*
- * This does two things:
- *
- * A. Make init inherit all the child processes
- * B. Check to see if any process groups have become orphaned
- * as a result of our exiting, and if they have any stopped
- * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
+ * Make init inherit all the child processes
*/
static void forget_original_parent(struct task_struct *father,
struct list_head *dead)
@@ -557,20 +703,21 @@ static void forget_original_parent(struct task_struct *father,
exit_ptrace(father, dead);
/* Can drop and reacquire tasklist_lock */
- reaper = find_child_reaper(father);
+ reaper = find_child_reaper(father, dead);
if (list_empty(&father->children))
return;
reaper = find_new_reaper(father, reaper);
list_for_each_entry(p, &father->children, sibling) {
for_each_thread(p, t) {
- t->real_parent = reaper;
- BUG_ON((!t->ptrace) != (t->parent == father));
+ RCU_INIT_POINTER(t->real_parent, reaper);
+ BUG_ON((!t->ptrace) != (rcu_access_pointer(t->parent) == father));
if (likely(!t->ptrace))
t->parent = t->real_parent;
if (t->pdeath_signal)
group_send_sig_info(t->pdeath_signal,
- SEND_SIG_NOINFO, t);
+ SEND_SIG_NOINFO, t,
+ PIDTYPE_TGID);
}
/*
* If this is a threaded reparent there is no need to
@@ -598,6 +745,8 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
if (group_dead)
kill_orphaned_pgrp(tsk->group_leader, NULL);
+ tsk->exit_state = EXIT_ZOMBIE;
+
if (unlikely(tsk->ptrace)) {
int sig = thread_group_leader(tsk) &&
thread_group_empty(tsk) &&
@@ -609,15 +758,18 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
do_notify_parent(tsk, tsk->exit_signal);
} else {
autoreap = true;
+ /* untraced sub-thread */
+ do_notify_pidfd(tsk);
}
- tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE;
- if (tsk->exit_state == EXIT_DEAD)
+ if (autoreap) {
+ tsk->exit_state = EXIT_DEAD;
list_add(&tsk->ptrace_entry, &dead);
+ }
/* mt-exec, de_thread() is waiting for group leader */
if (unlikely(tsk->signal->notify_count < 0))
- wake_up_process(tsk->signal->group_exit_task);
+ wake_up_process(tsk->signal->group_exec_task);
write_unlock_irq(&tasklist_lock);
list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
@@ -627,6 +779,67 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
}
#ifdef CONFIG_DEBUG_STACK_USAGE
+#ifdef CONFIG_STACK_GROWSUP
+unsigned long stack_not_used(struct task_struct *p)
+{
+ unsigned long *n = end_of_stack(p);
+
+ do { /* Skip over canary */
+ n--;
+ } while (!*n);
+
+ return (unsigned long)end_of_stack(p) - (unsigned long)n;
+}
+#else /* !CONFIG_STACK_GROWSUP */
+unsigned long stack_not_used(struct task_struct *p)
+{
+ unsigned long *n = end_of_stack(p);
+
+ do { /* Skip over canary */
+ n++;
+ } while (!*n);
+
+ return (unsigned long)n - (unsigned long)end_of_stack(p);
+}
+#endif /* CONFIG_STACK_GROWSUP */
+
+/* Count the maximum pages reached in kernel stacks */
+static inline void kstack_histogram(unsigned long used_stack)
+{
+#ifdef CONFIG_VM_EVENT_COUNTERS
+ if (used_stack <= 1024)
+ count_vm_event(KSTACK_1K);
+#if THREAD_SIZE > 1024
+ else if (used_stack <= 2048)
+ count_vm_event(KSTACK_2K);
+#endif
+#if THREAD_SIZE > 2048
+ else if (used_stack <= 4096)
+ count_vm_event(KSTACK_4K);
+#endif
+#if THREAD_SIZE > 4096
+ else if (used_stack <= 8192)
+ count_vm_event(KSTACK_8K);
+#endif
+#if THREAD_SIZE > 8192
+ else if (used_stack <= 16384)
+ count_vm_event(KSTACK_16K);
+#endif
+#if THREAD_SIZE > 16384
+ else if (used_stack <= 32768)
+ count_vm_event(KSTACK_32K);
+#endif
+#if THREAD_SIZE > 32768
+ else if (used_stack <= 65536)
+ count_vm_event(KSTACK_64K);
+#endif
+#if THREAD_SIZE > 65536
+ else
+ count_vm_event(KSTACK_REST);
+#endif
+#endif /* CONFIG_VM_EVENT_COUNTERS */
+}
+
static void check_stack_usage(void)
{
static DEFINE_SPINLOCK(low_water_lock);
@@ -634,91 +847,88 @@ static void check_stack_usage(void)
unsigned long free;
free = stack_not_used(current);
+ kstack_histogram(THREAD_SIZE - free);
if (free >= lowest_to_date)
return;
spin_lock(&low_water_lock);
if (free < lowest_to_date) {
- pr_warn("%s (%d) used greatest stack depth: %lu bytes left\n",
+ pr_info("%s (%d) used greatest stack depth: %lu bytes left\n",
current->comm, task_pid_nr(current), free);
lowest_to_date = free;
}
spin_unlock(&low_water_lock);
}
-#else
+#else /* !CONFIG_DEBUG_STACK_USAGE */
static inline void check_stack_usage(void) {}
-#endif
+#endif /* CONFIG_DEBUG_STACK_USAGE */
-void do_exit(long code)
+static void synchronize_group_exit(struct task_struct *tsk, long code)
{
- struct task_struct *tsk = current;
- int group_dead;
- TASKS_RCU(int tasks_rcu_i);
-
- profile_task_exit(tsk);
-
- WARN_ON(blk_needs_flush_plug(tsk));
-
- if (unlikely(in_interrupt()))
- panic("Aiee, killing interrupt handler!");
- if (unlikely(!tsk->pid))
- panic("Attempted to kill the idle task!");
+ struct sighand_struct *sighand = tsk->sighand;
+ struct signal_struct *signal = tsk->signal;
+ struct core_state *core_state;
+ spin_lock_irq(&sighand->siglock);
+ signal->quick_threads--;
+ if ((signal->quick_threads == 0) &&
+ !(signal->flags & SIGNAL_GROUP_EXIT)) {
+ signal->flags = SIGNAL_GROUP_EXIT;
+ signal->group_exit_code = code;
+ signal->group_stop_count = 0;
+ }
/*
- * If do_exit is called because this processes oopsed, it's possible
- * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
- * continuing. Amongst other possible reasons, this is to prevent
- * mm_release()->clear_child_tid() from writing to a user-controlled
- * kernel address.
+ * Serialize with any possible pending coredump.
+ * We must hold siglock around checking core_state
+ * and setting PF_POSTCOREDUMP. The core-inducing thread
+ * will increment ->nr_threads for each thread in the
+ * group without PF_POSTCOREDUMP set.
*/
- set_fs(USER_DS);
+ tsk->flags |= PF_POSTCOREDUMP;
+ core_state = signal->core_state;
+ spin_unlock_irq(&sighand->siglock);
- ptrace_event(PTRACE_EVENT_EXIT, code);
+ if (unlikely(core_state))
+ coredump_task_exit(tsk, core_state);
+}
- validate_creds_for_do_exit(tsk);
+void __noreturn do_exit(long code)
+{
+ struct task_struct *tsk = current;
+ int group_dead;
- /*
- * We're taking recursive faults here in do_exit. Safest is to just
- * leave this task alone and wait for reboot.
- */
- if (unlikely(tsk->flags & PF_EXITING)) {
- pr_alert("Fixing recursive fault but reboot is needed!\n");
- /*
- * We can do this unlocked here. The futex code uses
- * this flag just to verify whether the pi state
- * cleanup has been done or not. In the worst case it
- * loops once more. We pretend that the cleanup was
- * done as there is no way to return. Either the
- * OWNER_DIED bit is set by now or we push the blocked
- * task into the wait for ever nirwana as well.
- */
- tsk->flags |= PF_EXITPIDONE;
- set_current_state(TASK_UNINTERRUPTIBLE);
- schedule();
- }
+ WARN_ON(irqs_disabled());
+ WARN_ON(tsk->plug);
+ kcov_task_exit(tsk);
+ kmsan_task_exit(tsk);
+
+ synchronize_group_exit(tsk, code);
+ ptrace_event(PTRACE_EVENT_EXIT, code);
+ user_events_exit(tsk);
+
+ io_uring_files_cancel();
+ sched_mm_cid_exit(tsk);
exit_signals(tsk); /* sets PF_EXITING */
- /*
- * tsk->flags are checked in the futex code to protect against
- * an exiting task cleaning up the robust pi futexes.
- */
- smp_mb();
- raw_spin_unlock_wait(&tsk->pi_lock);
- if (unlikely(in_atomic()))
- pr_info("note: %s[%d] exited with preempt_count %d\n",
- current->comm, task_pid_nr(current),
- preempt_count());
+ seccomp_filter_release(tsk);
acct_update_integrals(tsk);
- /* sync mm's RSS info before statistics gathering */
- if (tsk->mm)
- sync_mm_rss(tsk->mm);
group_dead = atomic_dec_and_test(&tsk->signal->live);
if (group_dead) {
+ /*
+ * If the last thread of global init has exited, panic
+ * immediately to get a useable coredump.
+ */
+ if (unlikely(is_global_init(tsk)))
+ panic("Attempted to kill init! exitcode=0x%08x\n",
+ tsk->signal->group_exit_code ?: (int)code);
+
+#ifdef CONFIG_POSIX_TIMERS
hrtimer_cancel(&tsk->signal->real_timer);
- exit_itimers(tsk->signal);
+ exit_itimers(tsk);
+#endif
if (tsk->mm)
setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
}
@@ -729,12 +939,27 @@ void do_exit(long code)
tsk->exit_code = code;
taskstats_exit(tsk, group_dead);
+ trace_sched_process_exit(tsk, group_dead);
+
+ /*
+ * Since sampling can touch ->mm, make sure to stop everything before we
+ * tear it down.
+ *
+ * Also flushes inherited counters to the parent - before the parent
+ * gets woken up by child-exit notifications.
+ */
+ perf_event_exit_task(tsk);
+ /*
+ * PF_EXITING (above) ensures unwind_deferred_request() will no
+ * longer add new unwinds. While exit_mm() (below) will destroy the
+ * abaility to do unwinds. So flush any pending unwinds here.
+ */
+ unwind_deferred_task_exit(tsk);
- exit_mm(tsk);
+ exit_mm();
if (group_dead)
acct_process();
- trace_sched_process_exit(tsk);
exit_sem(tsk);
exit_shm(tsk);
@@ -742,34 +967,22 @@ void do_exit(long code)
exit_fs(tsk);
if (group_dead)
disassociate_ctty(1);
- exit_task_namespaces(tsk);
+ exit_nsproxy_namespaces(tsk);
exit_task_work(tsk);
- exit_thread();
+ exit_thread(tsk);
- /*
- * Flush inherited counters to the parent - before the parent
- * gets woken up by child-exit notifications.
- *
- * because of cgroup mode, must be called before cgroup_exit()
- */
- perf_event_exit_task(tsk);
-
- cgroup_exit(tsk);
+ sched_autogroup_exit_task(tsk);
+ cgroup_task_exit(tsk);
/*
* FIXME: do that only when needed, using sched_exit tracepoint
*/
flush_ptrace_hw_breakpoint(tsk);
- TASKS_RCU(tasks_rcu_i = __srcu_read_lock(&tasks_rcu_exit_srcu));
+ exit_tasks_rcu_start();
exit_notify(tsk, group_dead);
proc_exit_connector(tsk);
-#ifdef CONFIG_NUMA
- task_lock(tsk);
- mpol_put(tsk->mempolicy);
- tsk->mempolicy = NULL;
- task_unlock(tsk);
-#endif
+ mpol_put_task_policy(tsk);
#ifdef CONFIG_FUTEX
if (unlikely(current->pi_state_cache))
kfree(current->pi_state_cache);
@@ -778,12 +991,6 @@ void do_exit(long code)
* Make sure we are holding no locks:
*/
debug_check_no_locks_held();
- /*
- * We can do this unlocked here. The futex code uses this flag
- * just to verify whether the pi state cleanup has been done
- * or not. In the worst case it loops once more.
- */
- tsk->flags |= PF_EXITPIDONE;
if (tsk->io_context)
exit_io_context(tsk);
@@ -794,49 +1001,78 @@ void do_exit(long code)
if (tsk->task_frag.page)
put_page(tsk->task_frag.page);
- validate_creds_for_do_exit(tsk);
+ exit_task_stack_account(tsk);
check_stack_usage();
preempt_disable();
if (tsk->nr_dirtied)
__this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
exit_rcu();
- TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i));
+ exit_tasks_rcu_finish();
+ lockdep_free_task(tsk);
+ do_task_dead();
+}
+
+void __noreturn make_task_dead(int signr)
+{
/*
- * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
- * when the following two conditions become true.
- * - There is race condition of mmap_sem (It is acquired by
- * exit_mm()), and
- * - SMI occurs before setting TASK_RUNINNG.
- * (or hypervisor of virtual machine switches to other guest)
- * As a result, we may become TASK_RUNNING after becoming TASK_DEAD
+ * Take the task off the cpu after something catastrophic has
+ * happened.
*
- * To avoid it, we have to wait for releasing tsk->pi_lock which
- * is held by try_to_wake_up()
+ * We can get here from a kernel oops, sometimes with preemption off.
+ * Start by checking for critical errors.
+ * Then fix up important state like USER_DS and preemption.
+ * Then do everything else.
*/
- smp_mb();
- raw_spin_unlock_wait(&tsk->pi_lock);
+ struct task_struct *tsk = current;
+ unsigned int limit;
- /* causes final put_task_struct in finish_task_switch(). */
- tsk->state = TASK_DEAD;
- tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */
- schedule();
- BUG();
- /* Avoid "noreturn function does return". */
- for (;;)
- cpu_relax(); /* For when BUG is null */
-}
-EXPORT_SYMBOL_GPL(do_exit);
+ if (unlikely(in_interrupt()))
+ panic("Aiee, killing interrupt handler!");
+ if (unlikely(!tsk->pid))
+ panic("Attempted to kill the idle task!");
-void complete_and_exit(struct completion *comp, long code)
-{
- if (comp)
- complete(comp);
+ if (unlikely(irqs_disabled())) {
+ pr_info("note: %s[%d] exited with irqs disabled\n",
+ current->comm, task_pid_nr(current));
+ local_irq_enable();
+ }
+ if (unlikely(in_atomic())) {
+ pr_info("note: %s[%d] exited with preempt_count %d\n",
+ current->comm, task_pid_nr(current),
+ preempt_count());
+ preempt_count_set(PREEMPT_ENABLED);
+ }
+
+ /*
+ * Every time the system oopses, if the oops happens while a reference
+ * to an object was held, the reference leaks.
+ * If the oops doesn't also leak memory, repeated oopsing can cause
+ * reference counters to wrap around (if they're not using refcount_t).
+ * This means that repeated oopsing can make unexploitable-looking bugs
+ * exploitable through repeated oopsing.
+ * To make sure this can't happen, place an upper bound on how often the
+ * kernel may oops without panic().
+ */
+ limit = READ_ONCE(oops_limit);
+ if (atomic_inc_return(&oops_count) >= limit && limit)
+ panic("Oopsed too often (kernel.oops_limit is %d)", limit);
+
+ /*
+ * We're taking recursive faults here in make_task_dead. Safest is to just
+ * leave this task alone and wait for reboot.
+ */
+ if (unlikely(tsk->flags & PF_EXITING)) {
+ pr_alert("Fixing recursive fault but reboot is needed!\n");
+ futex_exit_recursive(tsk);
+ tsk->exit_state = EXIT_DEAD;
+ refcount_inc(&tsk->rcu_users);
+ do_task_dead();
+ }
- do_exit(code);
+ do_exit(signr);
}
-EXPORT_SYMBOL(complete_and_exit);
SYSCALL_DEFINE1(exit, int, error_code)
{
@@ -847,22 +1083,24 @@ SYSCALL_DEFINE1(exit, int, error_code)
* Take down every thread in the group. This is called by fatal signals
* as well as by sys_exit_group (below).
*/
-void
+void __noreturn
do_group_exit(int exit_code)
{
struct signal_struct *sig = current->signal;
- BUG_ON(exit_code & 0x80); /* core dumps don't get here */
-
- if (signal_group_exit(sig))
+ if (sig->flags & SIGNAL_GROUP_EXIT)
exit_code = sig->group_exit_code;
- else if (!thread_group_empty(current)) {
+ else if (sig->group_exec_task)
+ exit_code = 0;
+ else {
struct sighand_struct *const sighand = current->sighand;
spin_lock_irq(&sighand->siglock);
- if (signal_group_exit(sig))
+ if (sig->flags & SIGNAL_GROUP_EXIT)
/* Another thread got here before we took the lock. */
exit_code = sig->group_exit_code;
+ else if (sig->group_exec_task)
+ exit_code = 0;
else {
sig->group_exit_code = exit_code;
sig->flags = SIGNAL_GROUP_EXIT;
@@ -887,75 +1125,37 @@ SYSCALL_DEFINE1(exit_group, int, error_code)
return 0;
}
-struct wait_opts {
- enum pid_type wo_type;
- int wo_flags;
- struct pid *wo_pid;
-
- struct siginfo __user *wo_info;
- int __user *wo_stat;
- struct rusage __user *wo_rusage;
-
- wait_queue_t child_wait;
- int notask_error;
-};
-
-static inline
-struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
-{
- if (type != PIDTYPE_PID)
- task = task->group_leader;
- return task->pids[type].pid;
-}
-
static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
{
return wo->wo_type == PIDTYPE_MAX ||
task_pid_type(p, wo->wo_type) == wo->wo_pid;
}
-static int eligible_child(struct wait_opts *wo, struct task_struct *p)
+static int
+eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p)
{
if (!eligible_pid(wo, p))
return 0;
- /* Wait for all children (clone and not) if __WALL is set;
- * otherwise, wait for clone children *only* if __WCLONE is
- * set; otherwise, wait for non-clone children *only*. (Note:
- * A "clone" child here is one that reports to its parent
- * using a signal other than SIGCHLD.) */
- if (((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))
- && !(wo->wo_flags & __WALL))
- return 0;
- return 1;
-}
+ /*
+ * Wait for all children (clone and not) if __WALL is set or
+ * if it is traced by us.
+ */
+ if (ptrace || (wo->wo_flags & __WALL))
+ return 1;
-static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
- pid_t pid, uid_t uid, int why, int status)
-{
- struct siginfo __user *infop;
- int retval = wo->wo_rusage
- ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
+ /*
+ * Otherwise, wait for clone children *only* if __WCLONE is set;
+ * otherwise, wait for non-clone children *only*.
+ *
+ * Note: a "clone" child here is one that reports to its parent
+ * using a signal other than SIGCHLD, or a non-leader thread which
+ * we can only see if it is traced by us.
+ */
+ if ((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))
+ return 0;
- put_task_struct(p);
- infop = wo->wo_info;
- if (infop) {
- if (!retval)
- retval = put_user(SIGCHLD, &infop->si_signo);
- if (!retval)
- retval = put_user(0, &infop->si_errno);
- if (!retval)
- retval = put_user((short)why, &infop->si_code);
- if (!retval)
- retval = put_user(pid, &infop->si_pid);
- if (!retval)
- retval = put_user(uid, &infop->si_uid);
- if (!retval)
- retval = put_user(status, &infop->si_status);
- }
- if (!retval)
- retval = pid;
- return retval;
+ return 1;
}
/*
@@ -966,30 +1166,24 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
*/
static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
{
- int state, retval, status;
+ int state, status;
pid_t pid = task_pid_vnr(p);
uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p));
- struct siginfo __user *infop;
+ struct waitid_info *infop;
if (!likely(wo->wo_flags & WEXITED))
return 0;
if (unlikely(wo->wo_flags & WNOWAIT)) {
- int exit_code = p->exit_code;
- int why;
-
+ status = (p->signal->flags & SIGNAL_GROUP_EXIT)
+ ? p->signal->group_exit_code : p->exit_code;
get_task_struct(p);
read_unlock(&tasklist_lock);
sched_annotate_sleep();
-
- if ((exit_code & 0x7f) == 0) {
- why = CLD_EXITED;
- status = exit_code >> 8;
- } else {
- why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED;
- status = exit_code & 0x7f;
- }
- return wait_noreap_copyout(wo, p, pid, uid, why, status);
+ if (wo->wo_rusage)
+ getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
+ put_task_struct(p);
+ goto out_info;
}
/*
* Move the task's state to DEAD/TRACE, only one thread can do this.
@@ -1011,7 +1205,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
struct signal_struct *sig = p->signal;
struct signal_struct *psig = current->signal;
unsigned long maxrss;
- cputime_t tgutime, tgstime;
+ u64 tgutime, tgstime;
/*
* The resource counters for the group leader are in its
@@ -1024,18 +1218,15 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
* p->signal fields because the whole thread group is dead
* and nobody can change them.
*
- * psig->stats_lock also protects us from our sub-theads
- * which can reap other children at the same time. Until
- * we change k_getrusage()-like users to rely on this lock
- * we have to take ->siglock as well.
+ * psig->stats_lock also protects us from our sub-threads
+ * which can reap other children at the same time.
*
* We use thread_group_cputime_adjusted() to get times for
* the thread group, which consolidates times for all threads
* in the group including the group leader.
*/
thread_group_cputime_adjusted(p, &tgutime, &tgstime);
- spin_lock_irq(&current->sighand->siglock);
- write_seqlock(&psig->stats_lock);
+ write_seqlock_irq(&psig->stats_lock);
psig->cutime += tgutime + sig->cutime;
psig->cstime += tgstime + sig->cstime;
psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
@@ -1058,42 +1249,14 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
psig->cmaxrss = maxrss;
task_io_accounting_add(&psig->ioac, &p->ioac);
task_io_accounting_add(&psig->ioac, &sig->ioac);
- write_sequnlock(&psig->stats_lock);
- spin_unlock_irq(&current->sighand->siglock);
+ write_sequnlock_irq(&psig->stats_lock);
}
- retval = wo->wo_rusage
- ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
+ if (wo->wo_rusage)
+ getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
status = (p->signal->flags & SIGNAL_GROUP_EXIT)
? p->signal->group_exit_code : p->exit_code;
- if (!retval && wo->wo_stat)
- retval = put_user(status, wo->wo_stat);
-
- infop = wo->wo_info;
- if (!retval && infop)
- retval = put_user(SIGCHLD, &infop->si_signo);
- if (!retval && infop)
- retval = put_user(0, &infop->si_errno);
- if (!retval && infop) {
- int why;
-
- if ((status & 0x7f) == 0) {
- why = CLD_EXITED;
- status >>= 8;
- } else {
- why = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;
- status &= 0x7f;
- }
- retval = put_user((short)why, &infop->si_code);
- if (!retval)
- retval = put_user(status, &infop->si_status);
- }
- if (!retval && infop)
- retval = put_user(pid, &infop->si_pid);
- if (!retval && infop)
- retval = put_user(uid, &infop->si_uid);
- if (!retval)
- retval = pid;
+ wo->wo_stat = status;
if (state == EXIT_TRACE) {
write_lock_irq(&tasklist_lock);
@@ -1110,14 +1273,27 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
if (state == EXIT_DEAD)
release_task(p);
- return retval;
+out_info:
+ infop = wo->wo_info;
+ if (infop) {
+ if ((status & 0x7f) == 0) {
+ infop->cause = CLD_EXITED;
+ infop->status = status >> 8;
+ } else {
+ infop->cause = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;
+ infop->status = status & 0x7f;
+ }
+ infop->pid = pid;
+ infop->uid = uid;
+ }
+
+ return pid;
}
static int *task_stopped_code(struct task_struct *p, bool ptrace)
{
if (ptrace) {
- if (task_is_stopped_or_traced(p) &&
- !(p->jobctl & JOBCTL_LISTENING))
+ if (task_is_traced(p) && !(p->jobctl & JOBCTL_LISTENING))
return &p->exit_code;
} else {
if (p->signal->flags & SIGNAL_STOP_STOPPED)
@@ -1147,8 +1323,8 @@ static int *task_stopped_code(struct task_struct *p, bool ptrace)
static int wait_task_stopped(struct wait_opts *wo,
int ptrace, struct task_struct *p)
{
- struct siginfo __user *infop;
- int retval, exit_code, *p_code, why;
+ struct waitid_info *infop;
+ int exit_code, *p_code, why;
uid_t uid = 0; /* unneeded, required by compiler */
pid_t pid;
@@ -1193,34 +1369,21 @@ unlock_sig:
why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
read_unlock(&tasklist_lock);
sched_annotate_sleep();
+ if (wo->wo_rusage)
+ getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
+ put_task_struct(p);
- if (unlikely(wo->wo_flags & WNOWAIT))
- return wait_noreap_copyout(wo, p, pid, uid, why, exit_code);
-
- retval = wo->wo_rusage
- ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
- if (!retval && wo->wo_stat)
- retval = put_user((exit_code << 8) | 0x7f, wo->wo_stat);
+ if (likely(!(wo->wo_flags & WNOWAIT)))
+ wo->wo_stat = (exit_code << 8) | 0x7f;
infop = wo->wo_info;
- if (!retval && infop)
- retval = put_user(SIGCHLD, &infop->si_signo);
- if (!retval && infop)
- retval = put_user(0, &infop->si_errno);
- if (!retval && infop)
- retval = put_user((short)why, &infop->si_code);
- if (!retval && infop)
- retval = put_user(exit_code, &infop->si_status);
- if (!retval && infop)
- retval = put_user(pid, &infop->si_pid);
- if (!retval && infop)
- retval = put_user(uid, &infop->si_uid);
- if (!retval)
- retval = pid;
- put_task_struct(p);
-
- BUG_ON(!retval);
- return retval;
+ if (infop) {
+ infop->cause = why;
+ infop->status = exit_code;
+ infop->pid = pid;
+ infop->uid = uid;
+ }
+ return pid;
}
/*
@@ -1231,7 +1394,7 @@ unlock_sig:
*/
static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
{
- int retval;
+ struct waitid_info *infop;
pid_t pid;
uid_t uid;
@@ -1256,22 +1419,20 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
get_task_struct(p);
read_unlock(&tasklist_lock);
sched_annotate_sleep();
+ if (wo->wo_rusage)
+ getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
+ put_task_struct(p);
- if (!wo->wo_info) {
- retval = wo->wo_rusage
- ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
- put_task_struct(p);
- if (!retval && wo->wo_stat)
- retval = put_user(0xffff, wo->wo_stat);
- if (!retval)
- retval = pid;
+ infop = wo->wo_info;
+ if (!infop) {
+ wo->wo_stat = 0xffff;
} else {
- retval = wait_noreap_copyout(wo, p, pid, uid,
- CLD_CONTINUED, SIGCONT);
- BUG_ON(retval == 0);
+ infop->cause = CLD_CONTINUED;
+ infop->pid = pid;
+ infop->uid = uid;
+ infop->status = SIGCONT;
}
-
- return retval;
+ return pid;
}
/*
@@ -1281,7 +1442,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
* Returns nonzero for a final return, when we have unlocked tasklist_lock.
* Returns zero if the search for a child should continue;
* then ->notask_error is 0 if @p is an eligible child,
- * or another error from security_task_wait(), or still -ECHILD.
+ * or still -ECHILD.
*/
static int wait_consider_task(struct wait_opts *wo, int ptrace,
struct task_struct *p)
@@ -1291,30 +1452,16 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
* Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition
* can't confuse the checks below.
*/
- int exit_state = ACCESS_ONCE(p->exit_state);
+ int exit_state = READ_ONCE(p->exit_state);
int ret;
if (unlikely(exit_state == EXIT_DEAD))
return 0;
- ret = eligible_child(wo, p);
+ ret = eligible_child(wo, ptrace, p);
if (!ret)
return ret;
- ret = security_task_wait(p);
- if (unlikely(ret < 0)) {
- /*
- * If we have not yet seen any eligible child,
- * then let this error code replace -ECHILD.
- * A permission error will give the user a clue
- * to look for security policy problems, rather
- * than for mysterious wait bugs.
- */
- if (wo->notask_error)
- wo->notask_error = ret;
- return 0;
- }
-
if (unlikely(exit_state == EXIT_TRACE)) {
/*
* ptrace == 0 means we are the natural parent. In this case
@@ -1407,7 +1554,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
* Returns nonzero for a final return, when we have unlocked tasklist_lock.
* Returns zero if the search for a child should continue; then
* ->notask_error is 0 if there were any eligible children,
- * or another error from security_task_wait(), or still -ECHILD.
+ * or still -ECHILD.
*/
static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
{
@@ -1437,91 +1584,158 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
return 0;
}
-static int child_wait_callback(wait_queue_t *wait, unsigned mode,
+bool pid_child_should_wake(struct wait_opts *wo, struct task_struct *p)
+{
+ if (!eligible_pid(wo, p))
+ return false;
+
+ if ((wo->wo_flags & __WNOTHREAD) && wo->child_wait.private != p->parent)
+ return false;
+
+ return true;
+}
+
+static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode,
int sync, void *key)
{
struct wait_opts *wo = container_of(wait, struct wait_opts,
child_wait);
struct task_struct *p = key;
- if (!eligible_pid(wo, p))
- return 0;
-
- if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent)
- return 0;
+ if (pid_child_should_wake(wo, p))
+ return default_wake_function(wait, mode, sync, key);
- return default_wake_function(wait, mode, sync, key);
+ return 0;
}
void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
{
__wake_up_sync_key(&parent->signal->wait_chldexit,
- TASK_INTERRUPTIBLE, 1, p);
+ TASK_INTERRUPTIBLE, p);
}
-static long do_wait(struct wait_opts *wo)
+static bool is_effectively_child(struct wait_opts *wo, bool ptrace,
+ struct task_struct *target)
{
- struct task_struct *tsk;
+ struct task_struct *parent =
+ !ptrace ? target->real_parent : target->parent;
+
+ return current == parent || (!(wo->wo_flags & __WNOTHREAD) &&
+ same_thread_group(current, parent));
+}
+
+/*
+ * Optimization for waiting on PIDTYPE_PID. No need to iterate through child
+ * and tracee lists to find the target task.
+ */
+static int do_wait_pid(struct wait_opts *wo)
+{
+ bool ptrace;
+ struct task_struct *target;
int retval;
- trace_sched_process_wait(wo->wo_pid);
+ ptrace = false;
+ target = pid_task(wo->wo_pid, PIDTYPE_TGID);
+ if (target && is_effectively_child(wo, ptrace, target)) {
+ retval = wait_consider_task(wo, ptrace, target);
+ if (retval)
+ return retval;
+ }
+
+ ptrace = true;
+ target = pid_task(wo->wo_pid, PIDTYPE_PID);
+ if (target && target->ptrace &&
+ is_effectively_child(wo, ptrace, target)) {
+ retval = wait_consider_task(wo, ptrace, target);
+ if (retval)
+ return retval;
+ }
+
+ return 0;
+}
+
+long __do_wait(struct wait_opts *wo)
+{
+ long retval;
- init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
- wo->child_wait.private = current;
- add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
-repeat:
/*
- * If there is nothing that can match our critiera just get out.
+ * If there is nothing that can match our criteria, just get out.
* We will clear ->notask_error to zero if we see any child that
* might later match our criteria, even if we are not able to reap
* it yet.
*/
wo->notask_error = -ECHILD;
if ((wo->wo_type < PIDTYPE_MAX) &&
- (!wo->wo_pid || hlist_empty(&wo->wo_pid->tasks[wo->wo_type])))
+ (!wo->wo_pid || !pid_has_task(wo->wo_pid, wo->wo_type)))
goto notask;
- set_current_state(TASK_INTERRUPTIBLE);
read_lock(&tasklist_lock);
- tsk = current;
- do {
- retval = do_wait_thread(wo, tsk);
- if (retval)
- goto end;
- retval = ptrace_do_wait(wo, tsk);
+ if (wo->wo_type == PIDTYPE_PID) {
+ retval = do_wait_pid(wo);
if (retval)
- goto end;
+ return retval;
+ } else {
+ struct task_struct *tsk = current;
- if (wo->wo_flags & __WNOTHREAD)
- break;
- } while_each_thread(current, tsk);
+ do {
+ retval = do_wait_thread(wo, tsk);
+ if (retval)
+ return retval;
+
+ retval = ptrace_do_wait(wo, tsk);
+ if (retval)
+ return retval;
+
+ if (wo->wo_flags & __WNOTHREAD)
+ break;
+ } while_each_thread(current, tsk);
+ }
read_unlock(&tasklist_lock);
notask:
retval = wo->notask_error;
- if (!retval && !(wo->wo_flags & WNOHANG)) {
- retval = -ERESTARTSYS;
- if (!signal_pending(current)) {
- schedule();
- goto repeat;
- }
- }
-end:
+ if (!retval && !(wo->wo_flags & WNOHANG))
+ return -ERESTARTSYS;
+
+ return retval;
+}
+
+static long do_wait(struct wait_opts *wo)
+{
+ int retval;
+
+ trace_sched_process_wait(wo->wo_pid);
+
+ init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
+ wo->child_wait.private = current;
+ add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
+
+ do {
+ set_current_state(TASK_INTERRUPTIBLE);
+ retval = __do_wait(wo);
+ if (retval != -ERESTARTSYS)
+ break;
+ if (signal_pending(current))
+ break;
+ schedule();
+ } while (1);
+
__set_current_state(TASK_RUNNING);
remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
return retval;
}
-SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
- infop, int, options, struct rusage __user *, ru)
+int kernel_waitid_prepare(struct wait_opts *wo, int which, pid_t upid,
+ struct waitid_info *infop, int options,
+ struct rusage *ru)
{
- struct wait_opts wo;
+ unsigned int f_flags = 0;
struct pid *pid = NULL;
enum pid_type type;
- long ret;
- if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED))
+ if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED|
+ __WNOTHREAD|__WCLONE|__WALL))
return -EINVAL;
if (!(options & (WEXITED|WSTOPPED|WCONTINUED)))
return -EINVAL;
@@ -1534,55 +1748,97 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
type = PIDTYPE_PID;
if (upid <= 0)
return -EINVAL;
+
+ pid = find_get_pid(upid);
break;
case P_PGID:
type = PIDTYPE_PGID;
- if (upid <= 0)
+ if (upid < 0)
return -EINVAL;
+
+ if (upid)
+ pid = find_get_pid(upid);
+ else
+ pid = get_task_pid(current, PIDTYPE_PGID);
+ break;
+ case P_PIDFD:
+ type = PIDTYPE_PID;
+ if (upid < 0)
+ return -EINVAL;
+
+ pid = pidfd_get_pid(upid, &f_flags);
+ if (IS_ERR(pid))
+ return PTR_ERR(pid);
+
break;
default:
return -EINVAL;
}
- if (type < PIDTYPE_MAX)
- pid = find_get_pid(upid);
+ wo->wo_type = type;
+ wo->wo_pid = pid;
+ wo->wo_flags = options;
+ wo->wo_info = infop;
+ wo->wo_rusage = ru;
+ if (f_flags & O_NONBLOCK)
+ wo->wo_flags |= WNOHANG;
- wo.wo_type = type;
- wo.wo_pid = pid;
- wo.wo_flags = options;
- wo.wo_info = infop;
- wo.wo_stat = NULL;
- wo.wo_rusage = ru;
- ret = do_wait(&wo);
+ return 0;
+}
- if (ret > 0) {
- ret = 0;
- } else if (infop) {
- /*
- * For a WNOHANG return, clear out all the fields
- * we would set so the user can easily tell the
- * difference.
- */
- if (!ret)
- ret = put_user(0, &infop->si_signo);
- if (!ret)
- ret = put_user(0, &infop->si_errno);
- if (!ret)
- ret = put_user(0, &infop->si_code);
- if (!ret)
- ret = put_user(0, &infop->si_pid);
- if (!ret)
- ret = put_user(0, &infop->si_uid);
- if (!ret)
- ret = put_user(0, &infop->si_status);
- }
+static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
+ int options, struct rusage *ru)
+{
+ struct wait_opts wo;
+ long ret;
- put_pid(pid);
+ ret = kernel_waitid_prepare(&wo, which, upid, infop, options, ru);
+ if (ret)
+ return ret;
+
+ ret = do_wait(&wo);
+ if (!ret && !(options & WNOHANG) && (wo.wo_flags & WNOHANG))
+ ret = -EAGAIN;
+
+ put_pid(wo.wo_pid);
return ret;
}
-SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
- int, options, struct rusage __user *, ru)
+SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
+ infop, int, options, struct rusage __user *, ru)
+{
+ struct rusage r;
+ struct waitid_info info = {.status = 0};
+ long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL);
+ int signo = 0;
+
+ if (err > 0) {
+ signo = SIGCHLD;
+ err = 0;
+ if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
+ return -EFAULT;
+ }
+ if (!infop)
+ return err;
+
+ if (!user_write_access_begin(infop, sizeof(*infop)))
+ return -EFAULT;
+
+ unsafe_put_user(signo, &infop->si_signo, Efault);
+ unsafe_put_user(0, &infop->si_errno, Efault);
+ unsafe_put_user(info.cause, &infop->si_code, Efault);
+ unsafe_put_user(info.pid, &infop->si_pid, Efault);
+ unsafe_put_user(info.uid, &infop->si_uid, Efault);
+ unsafe_put_user(info.status, &infop->si_status, Efault);
+ user_write_access_end();
+ return err;
+Efault:
+ user_write_access_end();
+ return -EFAULT;
+}
+
+long kernel_wait4(pid_t upid, int __user *stat_addr, int options,
+ struct rusage *ru)
{
struct wait_opts wo;
struct pid *pid = NULL;
@@ -1593,6 +1849,10 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
__WNOTHREAD|__WCLONE|__WALL))
return -EINVAL;
+ /* -INT_MIN is not defined */
+ if (upid == INT_MIN)
+ return -ESRCH;
+
if (upid == -1)
type = PIDTYPE_MAX;
else if (upid < 0) {
@@ -1610,14 +1870,45 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
wo.wo_pid = pid;
wo.wo_flags = options | WEXITED;
wo.wo_info = NULL;
- wo.wo_stat = stat_addr;
+ wo.wo_stat = 0;
wo.wo_rusage = ru;
ret = do_wait(&wo);
put_pid(pid);
+ if (ret > 0 && stat_addr && put_user(wo.wo_stat, stat_addr))
+ ret = -EFAULT;
+
+ return ret;
+}
+
+int kernel_wait(pid_t pid, int *stat)
+{
+ struct wait_opts wo = {
+ .wo_type = PIDTYPE_PID,
+ .wo_pid = find_get_pid(pid),
+ .wo_flags = WEXITED,
+ };
+ int ret;
+ ret = do_wait(&wo);
+ if (ret > 0 && wo.wo_stat)
+ *stat = wo.wo_stat;
+ put_pid(wo.wo_pid);
return ret;
}
+SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
+ int, options, struct rusage __user *, ru)
+{
+ struct rusage r;
+ long err = kernel_wait4(upid, stat_addr, options, ru ? &r : NULL);
+
+ if (err > 0) {
+ if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
+ return -EFAULT;
+ }
+ return err;
+}
+
#ifdef __ARCH_WANT_SYS_WAITPID
/*
@@ -1626,7 +1917,82 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
*/
SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options)
{
- return sys_wait4(pid, stat_addr, options, NULL);
+ return kernel_wait4(pid, stat_addr, options, NULL);
}
#endif
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE4(wait4,
+ compat_pid_t, pid,
+ compat_uint_t __user *, stat_addr,
+ int, options,
+ struct compat_rusage __user *, ru)
+{
+ struct rusage r;
+ long err = kernel_wait4(pid, stat_addr, options, ru ? &r : NULL);
+ if (err > 0) {
+ if (ru && put_compat_rusage(&r, ru))
+ return -EFAULT;
+ }
+ return err;
+}
+
+COMPAT_SYSCALL_DEFINE5(waitid,
+ int, which, compat_pid_t, pid,
+ struct compat_siginfo __user *, infop, int, options,
+ struct compat_rusage __user *, uru)
+{
+ struct rusage ru;
+ struct waitid_info info = {.status = 0};
+ long err = kernel_waitid(which, pid, &info, options, uru ? &ru : NULL);
+ int signo = 0;
+ if (err > 0) {
+ signo = SIGCHLD;
+ err = 0;
+ if (uru) {
+ /* kernel_waitid() overwrites everything in ru */
+ if (COMPAT_USE_64BIT_TIME)
+ err = copy_to_user(uru, &ru, sizeof(ru));
+ else
+ err = put_compat_rusage(&ru, uru);
+ if (err)
+ return -EFAULT;
+ }
+ }
+
+ if (!infop)
+ return err;
+
+ if (!user_write_access_begin(infop, sizeof(*infop)))
+ return -EFAULT;
+
+ unsafe_put_user(signo, &infop->si_signo, Efault);
+ unsafe_put_user(0, &infop->si_errno, Efault);
+ unsafe_put_user(info.cause, &infop->si_code, Efault);
+ unsafe_put_user(info.pid, &infop->si_pid, Efault);
+ unsafe_put_user(info.uid, &infop->si_uid, Efault);
+ unsafe_put_user(info.status, &infop->si_status, Efault);
+ user_write_access_end();
+ return err;
+Efault:
+ user_write_access_end();
+ return -EFAULT;
+}
+#endif
+
+/*
+ * This needs to be __function_aligned as GCC implicitly makes any
+ * implementation of abort() cold and drops alignment specified by
+ * -falign-functions=N.
+ *
+ * See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88345#c11
+ */
+__weak __function_aligned void abort(void)
+{
+ BUG();
+
+ /* if that doesn't kill us, halt */
+ panic("Oops failed to kill thread");
+}
+EXPORT_SYMBOL(abort);
diff --git a/kernel/exit.h b/kernel/exit.h
new file mode 100644
index 000000000000..278faa26a653
--- /dev/null
+++ b/kernel/exit.h
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#ifndef LINUX_WAITID_H
+#define LINUX_WAITID_H
+
+struct waitid_info {
+ pid_t pid;
+ uid_t uid;
+ int status;
+ int cause;
+};
+
+struct wait_opts {
+ enum pid_type wo_type;
+ int wo_flags;
+ struct pid *wo_pid;
+
+ struct waitid_info *wo_info;
+ int wo_stat;
+ struct rusage *wo_rusage;
+
+ wait_queue_entry_t child_wait;
+ int notask_error;
+};
+
+bool pid_child_should_wake(struct wait_opts *wo, struct task_struct *p);
+long __do_wait(struct wait_opts *wo);
+int kernel_waitid_prepare(struct wait_opts *wo, int which, pid_t upid,
+ struct waitid_info *infop, int options,
+ struct rusage *ru);
+#endif
diff --git a/kernel/extable.c b/kernel/extable.c
index c98f926277a8..71f482581cab 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -1,34 +1,27 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* Rewritten by Rusty Russell, on the backs of many others...
Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM.
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
+#include <linux/elf.h>
#include <linux/ftrace.h>
#include <linux/memory.h>
+#include <linux/extable.h>
#include <linux/module.h>
-#include <linux/ftrace.h>
#include <linux/mutex.h>
#include <linux/init.h>
+#include <linux/kprobes.h>
+#include <linux/filter.h>
#include <asm/sections.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
/*
* mutex protecting text section modification (dynamic code patching).
* some users need to sleep (allocating memory...) while they hold this lock.
*
+ * Note: Also protects SMP-alternatives modification on x86.
+ *
* NOT exported to modules - patching kernel text is a really delicate matter.
*/
DEFINE_MUTEX(text_mutex);
@@ -42,68 +35,48 @@ u32 __initdata __visible main_extable_sort_needed = 1;
/* Sort the kernel's built-in exception table */
void __init sort_main_extable(void)
{
- if (main_extable_sort_needed && __stop___ex_table > __start___ex_table) {
+ if (main_extable_sort_needed &&
+ &__stop___ex_table > &__start___ex_table) {
pr_notice("Sorting __ex_table...\n");
sort_extable(__start___ex_table, __stop___ex_table);
}
}
+/* Given an address, look for it in the kernel exception table */
+const
+struct exception_table_entry *search_kernel_exception_table(unsigned long addr)
+{
+ return search_extable(__start___ex_table,
+ __stop___ex_table - __start___ex_table, addr);
+}
+
/* Given an address, look for it in the exception tables. */
const struct exception_table_entry *search_exception_tables(unsigned long addr)
{
const struct exception_table_entry *e;
- e = search_extable(__start___ex_table, __stop___ex_table-1, addr);
+ e = search_kernel_exception_table(addr);
if (!e)
e = search_module_extables(addr);
+ if (!e)
+ e = search_bpf_extables(addr);
return e;
}
-static inline int init_kernel_text(unsigned long addr)
-{
- if (addr >= (unsigned long)_sinittext &&
- addr < (unsigned long)_einittext)
- return 1;
- return 0;
-}
-
-int core_kernel_text(unsigned long addr)
+int notrace core_kernel_text(unsigned long addr)
{
- if (addr >= (unsigned long)_stext &&
- addr < (unsigned long)_etext)
+ if (is_kernel_text(addr))
return 1;
- if (system_state == SYSTEM_BOOTING &&
- init_kernel_text(addr))
- return 1;
- return 0;
-}
-
-/**
- * core_kernel_data - tell if addr points to kernel data
- * @addr: address to test
- *
- * Returns true if @addr passed in is from the core kernel data
- * section.
- *
- * Note: On some archs it may return true for core RODATA, and false
- * for others. But will always be true for core RW data.
- */
-int core_kernel_data(unsigned long addr)
-{
- if (addr >= (unsigned long)_sdata &&
- addr < (unsigned long)_edata)
+ if (system_state < SYSTEM_FREEING_INITMEM &&
+ is_kernel_inittext(addr))
return 1;
return 0;
}
int __kernel_text_address(unsigned long addr)
{
- if (core_kernel_text(addr))
- return 1;
- if (is_module_text_address(addr))
- return 1;
- if (is_ftrace_trampoline(addr))
+ if (kernel_text_address(addr))
return 1;
/*
* There might be init symbols in saved stacktraces.
@@ -113,27 +86,80 @@ int __kernel_text_address(unsigned long addr)
* Since we are after the module-symbols check, there's
* no danger of address overlap:
*/
- if (init_kernel_text(addr))
+ if (is_kernel_inittext(addr))
return 1;
return 0;
}
int kernel_text_address(unsigned long addr)
{
+ bool no_rcu;
+ int ret = 1;
+
if (core_kernel_text(addr))
return 1;
+
+ /*
+ * If a stack dump happens while RCU is not watching, then
+ * RCU needs to be notified that it requires to start
+ * watching again. This can happen either by tracing that
+ * triggers a stack trace, or a WARN() that happens during
+ * coming back from idle, or cpu on or offlining.
+ *
+ * is_module_text_address() as well as the kprobe slots,
+ * is_bpf_text_address() and is_bpf_image_address require
+ * RCU to be watching.
+ */
+ no_rcu = !rcu_is_watching();
+
+ /* Treat this like an NMI as it can happen anywhere */
+ if (no_rcu)
+ ct_nmi_enter();
+
if (is_module_text_address(addr))
- return 1;
- return is_ftrace_trampoline(addr);
+ goto out;
+ if (is_ftrace_trampoline(addr))
+ goto out;
+ if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr))
+ goto out;
+ if (is_bpf_text_address(addr))
+ goto out;
+ ret = 0;
+out:
+ if (no_rcu)
+ ct_nmi_exit();
+
+ return ret;
}
/*
- * On some architectures (PPC64, IA64) function pointers
+ * On some architectures (PPC64, IA64, PARISC) function pointers
* are actually only tokens to some data that then holds the
* real function address. As a result, to find if a function
* pointer is part of the kernel text, we need to do some
* special dereferencing first.
*/
+#ifdef CONFIG_HAVE_FUNCTION_DESCRIPTORS
+void *dereference_function_descriptor(void *ptr)
+{
+ func_desc_t *desc = ptr;
+ void *p;
+
+ if (!get_kernel_nofault(p, (void *)&desc->addr))
+ ptr = p;
+ return ptr;
+}
+EXPORT_SYMBOL_GPL(dereference_function_descriptor);
+
+void *dereference_kernel_function_descriptor(void *ptr)
+{
+ if (ptr < (void *)__start_opd || ptr >= (void *)__end_opd)
+ return ptr;
+
+ return dereference_function_descriptor(ptr);
+}
+#endif
+
int func_ptr_is_kernel_text(void *ptr)
{
unsigned long addr;
diff --git a/kernel/fail_function.c b/kernel/fail_function.c
new file mode 100644
index 000000000000..d971a0189319
--- /dev/null
+++ b/kernel/fail_function.c
@@ -0,0 +1,334 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * fail_function.c: Function-based error injection
+ */
+#include <linux/error-injection.h>
+#include <linux/debugfs.h>
+#include <linux/fault-inject.h>
+#include <linux/kallsyms.h>
+#include <linux/kprobes.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+static int fei_kprobe_handler(struct kprobe *kp, struct pt_regs *regs);
+
+static void fei_post_handler(struct kprobe *kp, struct pt_regs *regs,
+ unsigned long flags)
+{
+ /*
+ * A dummy post handler is required to prohibit optimizing, because
+ * jump optimization does not support execution path overriding.
+ */
+}
+
+struct fei_attr {
+ struct list_head list;
+ struct kprobe kp;
+ unsigned long retval;
+};
+static DEFINE_MUTEX(fei_lock);
+static LIST_HEAD(fei_attr_list);
+static DECLARE_FAULT_ATTR(fei_fault_attr);
+static struct dentry *fei_debugfs_dir;
+
+static unsigned long adjust_error_retval(unsigned long addr, unsigned long retv)
+{
+ switch (get_injectable_error_type(addr)) {
+ case EI_ETYPE_NULL:
+ return 0;
+ case EI_ETYPE_ERRNO:
+ if (retv < (unsigned long)-MAX_ERRNO)
+ return (unsigned long)-EINVAL;
+ break;
+ case EI_ETYPE_ERRNO_NULL:
+ if (retv != 0 && retv < (unsigned long)-MAX_ERRNO)
+ return (unsigned long)-EINVAL;
+ break;
+ case EI_ETYPE_TRUE:
+ return 1;
+ }
+
+ return retv;
+}
+
+static struct fei_attr *fei_attr_new(const char *sym, unsigned long addr)
+{
+ struct fei_attr *attr;
+
+ attr = kzalloc(sizeof(*attr), GFP_KERNEL);
+ if (attr) {
+ attr->kp.symbol_name = kstrdup(sym, GFP_KERNEL);
+ if (!attr->kp.symbol_name) {
+ kfree(attr);
+ return NULL;
+ }
+ attr->kp.pre_handler = fei_kprobe_handler;
+ attr->kp.post_handler = fei_post_handler;
+ attr->retval = adjust_error_retval(addr, 0);
+ INIT_LIST_HEAD(&attr->list);
+ }
+ return attr;
+}
+
+static void fei_attr_free(struct fei_attr *attr)
+{
+ if (attr) {
+ kfree(attr->kp.symbol_name);
+ kfree(attr);
+ }
+}
+
+static struct fei_attr *fei_attr_lookup(const char *sym)
+{
+ struct fei_attr *attr;
+
+ list_for_each_entry(attr, &fei_attr_list, list) {
+ if (!strcmp(attr->kp.symbol_name, sym))
+ return attr;
+ }
+
+ return NULL;
+}
+
+static bool fei_attr_is_valid(struct fei_attr *_attr)
+{
+ struct fei_attr *attr;
+
+ list_for_each_entry(attr, &fei_attr_list, list) {
+ if (attr == _attr)
+ return true;
+ }
+
+ return false;
+}
+
+static int fei_retval_set(void *data, u64 val)
+{
+ struct fei_attr *attr = data;
+ unsigned long retv = (unsigned long)val;
+ int err = 0;
+
+ mutex_lock(&fei_lock);
+ /*
+ * Since this operation can be done after retval file is removed,
+ * It is safer to check the attr is still valid before accessing
+ * its member.
+ */
+ if (!fei_attr_is_valid(attr)) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ if (attr->kp.addr) {
+ if (adjust_error_retval((unsigned long)attr->kp.addr,
+ val) != retv)
+ err = -EINVAL;
+ }
+ if (!err)
+ attr->retval = val;
+out:
+ mutex_unlock(&fei_lock);
+
+ return err;
+}
+
+static int fei_retval_get(void *data, u64 *val)
+{
+ struct fei_attr *attr = data;
+ int err = 0;
+
+ mutex_lock(&fei_lock);
+ /* Here we also validate @attr to ensure it still exists. */
+ if (!fei_attr_is_valid(attr))
+ err = -ENOENT;
+ else
+ *val = attr->retval;
+ mutex_unlock(&fei_lock);
+
+ return err;
+}
+DEFINE_DEBUGFS_ATTRIBUTE(fei_retval_ops, fei_retval_get, fei_retval_set,
+ "%llx\n");
+
+static void fei_debugfs_add_attr(struct fei_attr *attr)
+{
+ struct dentry *dir;
+
+ dir = debugfs_create_dir(attr->kp.symbol_name, fei_debugfs_dir);
+
+ debugfs_create_file("retval", 0600, dir, attr, &fei_retval_ops);
+}
+
+static void fei_debugfs_remove_attr(struct fei_attr *attr)
+{
+ debugfs_lookup_and_remove(attr->kp.symbol_name, fei_debugfs_dir);
+}
+
+static int fei_kprobe_handler(struct kprobe *kp, struct pt_regs *regs)
+{
+ struct fei_attr *attr = container_of(kp, struct fei_attr, kp);
+
+ if (should_fail(&fei_fault_attr, 1)) {
+ regs_set_return_value(regs, attr->retval);
+ override_function_with_return(regs);
+ return 1;
+ }
+
+ return 0;
+}
+NOKPROBE_SYMBOL(fei_kprobe_handler)
+
+static void *fei_seq_start(struct seq_file *m, loff_t *pos)
+{
+ mutex_lock(&fei_lock);
+ return seq_list_start(&fei_attr_list, *pos);
+}
+
+static void fei_seq_stop(struct seq_file *m, void *v)
+{
+ mutex_unlock(&fei_lock);
+}
+
+static void *fei_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ return seq_list_next(v, &fei_attr_list, pos);
+}
+
+static int fei_seq_show(struct seq_file *m, void *v)
+{
+ struct fei_attr *attr = list_entry(v, struct fei_attr, list);
+
+ seq_printf(m, "%ps\n", attr->kp.addr);
+ return 0;
+}
+
+static const struct seq_operations fei_seq_ops = {
+ .start = fei_seq_start,
+ .next = fei_seq_next,
+ .stop = fei_seq_stop,
+ .show = fei_seq_show,
+};
+
+static int fei_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &fei_seq_ops);
+}
+
+static void fei_attr_remove(struct fei_attr *attr)
+{
+ fei_debugfs_remove_attr(attr);
+ unregister_kprobe(&attr->kp);
+ list_del(&attr->list);
+ fei_attr_free(attr);
+}
+
+static void fei_attr_remove_all(void)
+{
+ struct fei_attr *attr, *n;
+
+ list_for_each_entry_safe(attr, n, &fei_attr_list, list) {
+ fei_attr_remove(attr);
+ }
+}
+
+static ssize_t fei_write(struct file *file, const char __user *buffer,
+ size_t count, loff_t *ppos)
+{
+ struct fei_attr *attr;
+ unsigned long addr;
+ char *buf, *sym;
+ int ret;
+
+ /* cut off if it is too long */
+ if (count > KSYM_NAME_LEN)
+ count = KSYM_NAME_LEN;
+
+ buf = memdup_user_nul(buffer, count);
+ if (IS_ERR(buf))
+ return PTR_ERR(buf);
+
+ sym = strstrip(buf);
+
+ mutex_lock(&fei_lock);
+
+ /* Writing just spaces will remove all injection points */
+ if (sym[0] == '\0') {
+ fei_attr_remove_all();
+ ret = count;
+ goto out;
+ }
+ /* Writing !function will remove one injection point */
+ if (sym[0] == '!') {
+ attr = fei_attr_lookup(sym + 1);
+ if (!attr) {
+ ret = -ENOENT;
+ goto out;
+ }
+ fei_attr_remove(attr);
+ ret = count;
+ goto out;
+ }
+
+ addr = kallsyms_lookup_name(sym);
+ if (!addr) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!within_error_injection_list(addr)) {
+ ret = -ERANGE;
+ goto out;
+ }
+ if (fei_attr_lookup(sym)) {
+ ret = -EBUSY;
+ goto out;
+ }
+ attr = fei_attr_new(sym, addr);
+ if (!attr) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = register_kprobe(&attr->kp);
+ if (ret) {
+ fei_attr_free(attr);
+ goto out;
+ }
+ fei_debugfs_add_attr(attr);
+ list_add_tail(&attr->list, &fei_attr_list);
+ ret = count;
+out:
+ mutex_unlock(&fei_lock);
+ kfree(buf);
+ return ret;
+}
+
+static const struct file_operations fei_ops = {
+ .open = fei_open,
+ .read = seq_read,
+ .write = fei_write,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static int __init fei_debugfs_init(void)
+{
+ struct dentry *dir;
+
+ dir = fault_create_debugfs_attr("fail_function", NULL,
+ &fei_fault_attr);
+ if (IS_ERR(dir))
+ return PTR_ERR(dir);
+
+ /* injectable attribute is just a symlink of error_inject/list */
+ debugfs_create_symlink("injectable", dir, "../error_injection/list");
+
+ debugfs_create_file("inject", 0600, dir, NULL, &fei_ops);
+
+ fei_debugfs_dir = dir;
+
+ return 0;
+}
+
+late_initcall(fei_debugfs_init);
diff --git a/kernel/fork.c b/kernel/fork.c
index 03c1eaaa6ef5..b1f3915d5f8e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/kernel/fork.c
*
@@ -11,7 +12,19 @@
* management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
*/
+#include <linux/anon_inodes.h>
#include <linux/slab.h>
+#include <linux/sched/autogroup.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/user.h>
+#include <linux/sched/numa_balancing.h>
+#include <linux/sched/stat.h>
+#include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
+#include <linux/sched/cputime.h>
+#include <linux/sched/ext.h>
+#include <linux/seq_file.h>
+#include <linux/rtmutex.h>
#include <linux/init.h>
#include <linux/unistd.h>
#include <linux/module.h>
@@ -24,12 +37,14 @@
#include <linux/fdtable.h>
#include <linux/iocontext.h>
#include <linux/key.h>
+#include <linux/kmsan.h>
#include <linux/binfmts.h>
#include <linux/mman.h>
#include <linux/mmu_notifier.h>
#include <linux/fs.h>
#include <linux/mm.h>
-#include <linux/vmacache.h>
+#include <linux/mm_inline.h>
+#include <linux/memblock.h>
#include <linux/nsproxy.h>
#include <linux/capability.h>
#include <linux/cpu.h>
@@ -39,6 +54,7 @@
#include <linux/seccomp.h>
#include <linux/swap.h>
#include <linux/syscalls.h>
+#include <linux/syscall_user_dispatch.h>
#include <linux/jiffies.h>
#include <linux/futex.h>
#include <linux/compat.h>
@@ -55,14 +71,13 @@
#include <linux/rmap.h>
#include <linux/ksm.h>
#include <linux/acct.h>
+#include <linux/userfaultfd_k.h>
#include <linux/tsacct_kern.h>
#include <linux/cn_proc.h>
#include <linux/freezer.h>
#include <linux/delayacct.h>
#include <linux/taskstats_kern.h>
-#include <linux/random.h>
#include <linux/tty.h>
-#include <linux/blkdev.h>
#include <linux/fs_struct.h>
#include <linux/magic.h>
#include <linux/perf_event.h>
@@ -75,19 +90,39 @@
#include <linux/aio.h>
#include <linux/compiler.h>
#include <linux/sysctl.h>
+#include <linux/kcov.h>
+#include <linux/livepatch.h>
+#include <linux/thread_info.h>
+#include <linux/kstack_erase.h>
+#include <linux/kasan.h>
+#include <linux/scs.h>
+#include <linux/io_uring.h>
+#include <linux/bpf.h>
+#include <linux/stackprotector.h>
+#include <linux/user_events.h>
+#include <linux/iommu.h>
+#include <linux/rseq.h>
+#include <uapi/linux/pidfd.h>
+#include <linux/pidfs.h>
+#include <linux/tick.h>
+#include <linux/unwind_deferred.h>
+#include <linux/pgalloc.h>
+#include <linux/uaccess.h>
-#include <asm/pgtable.h>
-#include <asm/pgalloc.h>
-#include <asm/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
+/* For dup_mmap(). */
+#include "../mm/internal.h"
+
#include <trace/events/sched.h>
#define CREATE_TRACE_POINTS
#include <trace/events/task.h>
+#include <kunit/visibility.h>
+
/*
* Minimum number of threads to boot the kernel
*/
@@ -104,7 +139,16 @@
unsigned long total_forks; /* Handle normal Linux uptimes. */
int nr_threads; /* The idle threads do not count.. */
-int max_threads; /* tunable limit on nr_threads */
+static int max_threads; /* tunable limit on nr_threads */
+
+#define NAMED_ARRAY_INDEX(x) [x] = __stringify(x)
+
+static const char * const resident_page_types[] = {
+ NAMED_ARRAY_INDEX(MM_FILEPAGES),
+ NAMED_ARRAY_INDEX(MM_ANONPAGES),
+ NAMED_ARRAY_INDEX(MM_SWAPENTS),
+ NAMED_ARRAY_INDEX(MM_SHMEMPAGES),
+};
DEFINE_PER_CPU(unsigned long, process_counts) = 0;
@@ -133,7 +177,6 @@ void __weak arch_release_task_struct(struct task_struct *tsk)
{
}
-#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
static struct kmem_cache *task_struct_cachep;
static inline struct task_struct *alloc_task_struct_node(int node)
@@ -145,54 +188,279 @@ static inline void free_task_struct(struct task_struct *tsk)
{
kmem_cache_free(task_struct_cachep, tsk);
}
-#endif
-void __weak arch_release_thread_info(struct thread_info *ti)
+#ifdef CONFIG_VMAP_STACK
+/*
+ * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB
+ * flush. Try to minimize the number of calls by caching stacks.
+ */
+#define NR_CACHED_STACKS 2
+static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]);
+/*
+ * Allocated stacks are cached and later reused by new threads, so memcg
+ * accounting is performed by the code assigning/releasing stacks to tasks.
+ * We need a zeroed memory without __GFP_ACCOUNT.
+ */
+#define GFP_VMAP_STACK (GFP_KERNEL | __GFP_ZERO)
+
+struct vm_stack {
+ struct rcu_head rcu;
+ struct vm_struct *stack_vm_area;
+};
+
+static struct vm_struct *alloc_thread_stack_node_from_cache(struct task_struct *tsk, int node)
+{
+ struct vm_struct *vm_area;
+ unsigned int i;
+
+ /*
+ * If the node has memory, we are guaranteed the stacks are backed by local pages.
+ * Otherwise the pages are arbitrary.
+ *
+ * Note that depending on cpuset it is possible we will get migrated to a different
+ * node immediately after allocating here, so this does *not* guarantee locality for
+ * arbitrary callers.
+ */
+ scoped_guard(preempt) {
+ if (node != NUMA_NO_NODE && numa_node_id() != node)
+ return NULL;
+
+ for (i = 0; i < NR_CACHED_STACKS; i++) {
+ vm_area = this_cpu_xchg(cached_stacks[i], NULL);
+ if (vm_area)
+ return vm_area;
+ }
+ }
+
+ return NULL;
+}
+
+static bool try_release_thread_stack_to_cache(struct vm_struct *vm_area)
+{
+ unsigned int i;
+ int nid;
+
+ /*
+ * Don't cache stacks if any of the pages don't match the local domain, unless
+ * there is no local memory to begin with.
+ *
+ * Note that lack of local memory does not automatically mean it makes no difference
+ * performance-wise which other domain backs the stack. In this case we are merely
+ * trying to avoid constantly going to vmalloc.
+ */
+ scoped_guard(preempt) {
+ nid = numa_node_id();
+ if (node_state(nid, N_MEMORY)) {
+ for (i = 0; i < vm_area->nr_pages; i++) {
+ struct page *page = vm_area->pages[i];
+ if (page_to_nid(page) != nid)
+ return false;
+ }
+ }
+
+ for (i = 0; i < NR_CACHED_STACKS; i++) {
+ struct vm_struct *tmp = NULL;
+
+ if (this_cpu_try_cmpxchg(cached_stacks[i], &tmp, vm_area))
+ return true;
+ }
+ }
+ return false;
+}
+
+static void thread_stack_free_rcu(struct rcu_head *rh)
+{
+ struct vm_stack *vm_stack = container_of(rh, struct vm_stack, rcu);
+ struct vm_struct *vm_area = vm_stack->stack_vm_area;
+
+ if (try_release_thread_stack_to_cache(vm_stack->stack_vm_area))
+ return;
+
+ vfree(vm_area->addr);
+}
+
+static void thread_stack_delayed_free(struct task_struct *tsk)
+{
+ struct vm_stack *vm_stack = tsk->stack;
+
+ vm_stack->stack_vm_area = tsk->stack_vm_area;
+ call_rcu(&vm_stack->rcu, thread_stack_free_rcu);
+}
+
+static int free_vm_stack_cache(unsigned int cpu)
+{
+ struct vm_struct **cached_vm_stack_areas = per_cpu_ptr(cached_stacks, cpu);
+ int i;
+
+ for (i = 0; i < NR_CACHED_STACKS; i++) {
+ struct vm_struct *vm_area = cached_vm_stack_areas[i];
+
+ if (!vm_area)
+ continue;
+
+ vfree(vm_area->addr);
+ cached_vm_stack_areas[i] = NULL;
+ }
+
+ return 0;
+}
+
+static int memcg_charge_kernel_stack(struct vm_struct *vm_area)
+{
+ int i;
+ int ret;
+ int nr_charged = 0;
+
+ BUG_ON(vm_area->nr_pages != THREAD_SIZE / PAGE_SIZE);
+
+ for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
+ ret = memcg_kmem_charge_page(vm_area->pages[i], GFP_KERNEL, 0);
+ if (ret)
+ goto err;
+ nr_charged++;
+ }
+ return 0;
+err:
+ for (i = 0; i < nr_charged; i++)
+ memcg_kmem_uncharge_page(vm_area->pages[i], 0);
+ return ret;
+}
+
+static int alloc_thread_stack_node(struct task_struct *tsk, int node)
+{
+ struct vm_struct *vm_area;
+ void *stack;
+
+ vm_area = alloc_thread_stack_node_from_cache(tsk, node);
+ if (vm_area) {
+ if (memcg_charge_kernel_stack(vm_area)) {
+ vfree(vm_area->addr);
+ return -ENOMEM;
+ }
+
+ /* Reset stack metadata. */
+ kasan_unpoison_range(vm_area->addr, THREAD_SIZE);
+
+ stack = kasan_reset_tag(vm_area->addr);
+
+ /* Clear stale pointers from reused stack. */
+ memset(stack, 0, THREAD_SIZE);
+
+ tsk->stack_vm_area = vm_area;
+ tsk->stack = stack;
+ return 0;
+ }
+
+ stack = __vmalloc_node(THREAD_SIZE, THREAD_ALIGN,
+ GFP_VMAP_STACK,
+ node, __builtin_return_address(0));
+ if (!stack)
+ return -ENOMEM;
+
+ vm_area = find_vm_area(stack);
+ if (memcg_charge_kernel_stack(vm_area)) {
+ vfree(stack);
+ return -ENOMEM;
+ }
+ /*
+ * We can't call find_vm_area() in interrupt context, and
+ * free_thread_stack() can be called in interrupt context,
+ * so cache the vm_struct.
+ */
+ tsk->stack_vm_area = vm_area;
+ stack = kasan_reset_tag(stack);
+ tsk->stack = stack;
+ return 0;
+}
+
+static void free_thread_stack(struct task_struct *tsk)
{
+ if (!try_release_thread_stack_to_cache(tsk->stack_vm_area))
+ thread_stack_delayed_free(tsk);
+
+ tsk->stack = NULL;
+ tsk->stack_vm_area = NULL;
}
-#ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR
+#else /* !CONFIG_VMAP_STACK */
/*
* Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
* kmemcache based allocator.
*/
-# if THREAD_SIZE >= PAGE_SIZE
-static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
- int node)
+#if THREAD_SIZE >= PAGE_SIZE
+
+static void thread_stack_free_rcu(struct rcu_head *rh)
{
- struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP,
- THREAD_SIZE_ORDER);
+ __free_pages(virt_to_page(rh), THREAD_SIZE_ORDER);
+}
- return page ? page_address(page) : NULL;
+static void thread_stack_delayed_free(struct task_struct *tsk)
+{
+ struct rcu_head *rh = tsk->stack;
+
+ call_rcu(rh, thread_stack_free_rcu);
+}
+
+static int alloc_thread_stack_node(struct task_struct *tsk, int node)
+{
+ struct page *page = alloc_pages_node(node, THREADINFO_GFP,
+ THREAD_SIZE_ORDER);
+
+ if (likely(page)) {
+ tsk->stack = kasan_reset_tag(page_address(page));
+ return 0;
+ }
+ return -ENOMEM;
}
-static inline void free_thread_info(struct thread_info *ti)
+static void free_thread_stack(struct task_struct *tsk)
{
- free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER);
+ thread_stack_delayed_free(tsk);
+ tsk->stack = NULL;
}
-# else
-static struct kmem_cache *thread_info_cache;
-static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
- int node)
+#else /* !(THREAD_SIZE >= PAGE_SIZE) */
+
+static struct kmem_cache *thread_stack_cache;
+
+static void thread_stack_free_rcu(struct rcu_head *rh)
{
- return kmem_cache_alloc_node(thread_info_cache, THREADINFO_GFP, node);
+ kmem_cache_free(thread_stack_cache, rh);
}
-static void free_thread_info(struct thread_info *ti)
+static void thread_stack_delayed_free(struct task_struct *tsk)
{
- kmem_cache_free(thread_info_cache, ti);
+ struct rcu_head *rh = tsk->stack;
+
+ call_rcu(rh, thread_stack_free_rcu);
}
-void thread_info_cache_init(void)
+static int alloc_thread_stack_node(struct task_struct *tsk, int node)
{
- thread_info_cache = kmem_cache_create("thread_info", THREAD_SIZE,
- THREAD_SIZE, 0, NULL);
- BUG_ON(thread_info_cache == NULL);
+ unsigned long *stack;
+ stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
+ stack = kasan_reset_tag(stack);
+ tsk->stack = stack;
+ return stack ? 0 : -ENOMEM;
+}
+
+static void free_thread_stack(struct task_struct *tsk)
+{
+ thread_stack_delayed_free(tsk);
+ tsk->stack = NULL;
}
-# endif
-#endif
+
+void thread_stack_cache_init(void)
+{
+ thread_stack_cache = kmem_cache_create_usercopy("thread_stack",
+ THREAD_SIZE, THREAD_SIZE, 0, 0,
+ THREAD_SIZE, NULL);
+ BUG_ON(thread_stack_cache == NULL);
+}
+
+#endif /* THREAD_SIZE >= PAGE_SIZE */
+#endif /* CONFIG_VMAP_STACK */
/* SLAB cache for signal_struct structures (tsk->signal) */
static struct kmem_cache *signal_cachep;
@@ -206,79 +474,349 @@ struct kmem_cache *files_cachep;
/* SLAB cache for fs_struct structures (tsk->fs) */
struct kmem_cache *fs_cachep;
-/* SLAB cache for vm_area_struct structures */
-struct kmem_cache *vm_area_cachep;
-
/* SLAB cache for mm_struct structures (tsk->mm) */
static struct kmem_cache *mm_cachep;
-static void account_kernel_stack(struct thread_info *ti, int account)
+static void account_kernel_stack(struct task_struct *tsk, int account)
{
- struct zone *zone = page_zone(virt_to_page(ti));
+ if (IS_ENABLED(CONFIG_VMAP_STACK)) {
+ struct vm_struct *vm_area = task_stack_vm_area(tsk);
+ int i;
- mod_zone_page_state(zone, NR_KERNEL_STACK, account);
+ for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
+ mod_lruvec_page_state(vm_area->pages[i], NR_KERNEL_STACK_KB,
+ account * (PAGE_SIZE / 1024));
+ } else {
+ void *stack = task_stack_page(tsk);
+
+ /* All stack pages are in the same node. */
+ mod_lruvec_kmem_state(stack, NR_KERNEL_STACK_KB,
+ account * (THREAD_SIZE / 1024));
+ }
}
+void exit_task_stack_account(struct task_struct *tsk)
+{
+ account_kernel_stack(tsk, -1);
+
+ if (IS_ENABLED(CONFIG_VMAP_STACK)) {
+ struct vm_struct *vm_area;
+ int i;
+
+ vm_area = task_stack_vm_area(tsk);
+ for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
+ memcg_kmem_uncharge_page(vm_area->pages[i], 0);
+ }
+}
+
+static void release_task_stack(struct task_struct *tsk)
+{
+ if (WARN_ON(READ_ONCE(tsk->__state) != TASK_DEAD))
+ return; /* Better to leak the stack than to free prematurely */
+
+ free_thread_stack(tsk);
+}
+
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+void put_task_stack(struct task_struct *tsk)
+{
+ if (refcount_dec_and_test(&tsk->stack_refcount))
+ release_task_stack(tsk);
+}
+#endif
+
void free_task(struct task_struct *tsk)
{
- account_kernel_stack(tsk->stack, -1);
- arch_release_thread_info(tsk->stack);
- free_thread_info(tsk->stack);
+#ifdef CONFIG_SECCOMP
+ WARN_ON_ONCE(tsk->seccomp.filter);
+#endif
+ release_user_cpus_ptr(tsk);
+ scs_release(tsk);
+
+#ifndef CONFIG_THREAD_INFO_IN_TASK
+ /*
+ * The task is finally done with both the stack and thread_info,
+ * so free both.
+ */
+ release_task_stack(tsk);
+#else
+ /*
+ * If the task had a separate stack allocation, it should be gone
+ * by now.
+ */
+ WARN_ON_ONCE(refcount_read(&tsk->stack_refcount) != 0);
+#endif
rt_mutex_debug_task_free(tsk);
ftrace_graph_exit_task(tsk);
- put_seccomp_filter(tsk);
arch_release_task_struct(tsk);
+ if (tsk->flags & PF_KTHREAD)
+ free_kthread_struct(tsk);
+ bpf_task_storage_free(tsk);
free_task_struct(tsk);
}
EXPORT_SYMBOL(free_task);
+void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm)
+{
+ struct file *exe_file;
+
+ exe_file = get_mm_exe_file(oldmm);
+ RCU_INIT_POINTER(mm->exe_file, exe_file);
+ /*
+ * We depend on the oldmm having properly denied write access to the
+ * exe_file already.
+ */
+ if (exe_file && exe_file_deny_write_access(exe_file))
+ pr_warn_once("exe_file_deny_write_access() failed in %s\n", __func__);
+}
+
+#ifdef CONFIG_MMU
+static inline int mm_alloc_pgd(struct mm_struct *mm)
+{
+ mm->pgd = pgd_alloc(mm);
+ if (unlikely(!mm->pgd))
+ return -ENOMEM;
+ return 0;
+}
+
+static inline void mm_free_pgd(struct mm_struct *mm)
+{
+ pgd_free(mm, mm->pgd);
+}
+#else
+#define mm_alloc_pgd(mm) (0)
+#define mm_free_pgd(mm)
+#endif /* CONFIG_MMU */
+
+#ifdef CONFIG_MM_ID
+static DEFINE_IDA(mm_ida);
+
+static inline int mm_alloc_id(struct mm_struct *mm)
+{
+ int ret;
+
+ ret = ida_alloc_range(&mm_ida, MM_ID_MIN, MM_ID_MAX, GFP_KERNEL);
+ if (ret < 0)
+ return ret;
+ mm->mm_id = ret;
+ return 0;
+}
+
+static inline void mm_free_id(struct mm_struct *mm)
+{
+ const mm_id_t id = mm->mm_id;
+
+ mm->mm_id = MM_ID_DUMMY;
+ if (id == MM_ID_DUMMY)
+ return;
+ if (WARN_ON_ONCE(id < MM_ID_MIN || id > MM_ID_MAX))
+ return;
+ ida_free(&mm_ida, id);
+}
+#else /* !CONFIG_MM_ID */
+static inline int mm_alloc_id(struct mm_struct *mm) { return 0; }
+static inline void mm_free_id(struct mm_struct *mm) {}
+#endif /* CONFIG_MM_ID */
+
+static void check_mm(struct mm_struct *mm)
+{
+ int i;
+
+ BUILD_BUG_ON_MSG(ARRAY_SIZE(resident_page_types) != NR_MM_COUNTERS,
+ "Please make sure 'struct resident_page_types[]' is updated as well");
+
+ for (i = 0; i < NR_MM_COUNTERS; i++) {
+ long x = percpu_counter_sum(&mm->rss_stat[i]);
+
+ if (unlikely(x)) {
+ pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld Comm:%s Pid:%d\n",
+ mm, resident_page_types[i], x,
+ current->comm,
+ task_pid_nr(current));
+ }
+ }
+
+ if (mm_pgtables_bytes(mm))
+ pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
+ mm_pgtables_bytes(mm));
+
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS)
+ VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
+#endif
+}
+
+#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
+#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm)))
+
+static void do_check_lazy_tlb(void *arg)
+{
+ struct mm_struct *mm = arg;
+
+ WARN_ON_ONCE(current->active_mm == mm);
+}
+
+static void do_shoot_lazy_tlb(void *arg)
+{
+ struct mm_struct *mm = arg;
+
+ if (current->active_mm == mm) {
+ WARN_ON_ONCE(current->mm);
+ current->active_mm = &init_mm;
+ switch_mm(mm, &init_mm, current);
+ }
+}
+
+static void cleanup_lazy_tlbs(struct mm_struct *mm)
+{
+ if (!IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) {
+ /*
+ * In this case, lazy tlb mms are refounted and would not reach
+ * __mmdrop until all CPUs have switched away and mmdrop()ed.
+ */
+ return;
+ }
+
+ /*
+ * Lazy mm shootdown does not refcount "lazy tlb mm" usage, rather it
+ * requires lazy mm users to switch to another mm when the refcount
+ * drops to zero, before the mm is freed. This requires IPIs here to
+ * switch kernel threads to init_mm.
+ *
+ * archs that use IPIs to flush TLBs can piggy-back that lazy tlb mm
+ * switch with the final userspace teardown TLB flush which leaves the
+ * mm lazy on this CPU but no others, reducing the need for additional
+ * IPIs here. There are cases where a final IPI is still required here,
+ * such as the final mmdrop being performed on a different CPU than the
+ * one exiting, or kernel threads using the mm when userspace exits.
+ *
+ * IPI overheads have not found to be expensive, but they could be
+ * reduced in a number of possible ways, for example (roughly
+ * increasing order of complexity):
+ * - The last lazy reference created by exit_mm() could instead switch
+ * to init_mm, however it's probable this will run on the same CPU
+ * immediately afterwards, so this may not reduce IPIs much.
+ * - A batch of mms requiring IPIs could be gathered and freed at once.
+ * - CPUs store active_mm where it can be remotely checked without a
+ * lock, to filter out false-positives in the cpumask.
+ * - After mm_users or mm_count reaches zero, switching away from the
+ * mm could clear mm_cpumask to reduce some IPIs, perhaps together
+ * with some batching or delaying of the final IPIs.
+ * - A delayed freeing and RCU-like quiescing sequence based on mm
+ * switching to avoid IPIs completely.
+ */
+ on_each_cpu_mask(mm_cpumask(mm), do_shoot_lazy_tlb, (void *)mm, 1);
+ if (IS_ENABLED(CONFIG_DEBUG_VM_SHOOT_LAZIES))
+ on_each_cpu(do_check_lazy_tlb, (void *)mm, 1);
+}
+
+/*
+ * Called when the last reference to the mm
+ * is dropped: either by a lazy thread or by
+ * mmput. Free the page directory and the mm.
+ */
+void __mmdrop(struct mm_struct *mm)
+{
+ BUG_ON(mm == &init_mm);
+ WARN_ON_ONCE(mm == current->mm);
+
+ /* Ensure no CPUs are using this as their lazy tlb mm */
+ cleanup_lazy_tlbs(mm);
+
+ WARN_ON_ONCE(mm == current->active_mm);
+ mm_free_pgd(mm);
+ mm_free_id(mm);
+ destroy_context(mm);
+ mmu_notifier_subscriptions_destroy(mm);
+ check_mm(mm);
+ put_user_ns(mm->user_ns);
+ mm_pasid_drop(mm);
+ mm_destroy_cid(mm);
+ percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS);
+
+ free_mm(mm);
+}
+EXPORT_SYMBOL_GPL(__mmdrop);
+
+static void mmdrop_async_fn(struct work_struct *work)
+{
+ struct mm_struct *mm;
+
+ mm = container_of(work, struct mm_struct, async_put_work);
+ __mmdrop(mm);
+}
+
+static void mmdrop_async(struct mm_struct *mm)
+{
+ if (unlikely(atomic_dec_and_test(&mm->mm_count))) {
+ INIT_WORK(&mm->async_put_work, mmdrop_async_fn);
+ schedule_work(&mm->async_put_work);
+ }
+}
+
static inline void free_signal_struct(struct signal_struct *sig)
{
taskstats_tgid_free(sig);
sched_autogroup_exit(sig);
+ /*
+ * __mmdrop is not safe to call from softirq context on x86 due to
+ * pgd_dtor so postpone it to the async context
+ */
+ if (sig->oom_mm)
+ mmdrop_async(sig->oom_mm);
kmem_cache_free(signal_cachep, sig);
}
static inline void put_signal_struct(struct signal_struct *sig)
{
- if (atomic_dec_and_test(&sig->sigcnt))
+ if (refcount_dec_and_test(&sig->sigcnt))
free_signal_struct(sig);
}
void __put_task_struct(struct task_struct *tsk)
{
WARN_ON(!tsk->exit_state);
- WARN_ON(atomic_read(&tsk->usage));
+ WARN_ON(refcount_read(&tsk->usage));
WARN_ON(tsk == current);
- task_numa_free(tsk);
+ unwind_task_free(tsk);
+ io_uring_free(tsk);
+ cgroup_task_free(tsk);
+ task_numa_free(tsk, true);
security_task_free(tsk);
exit_creds(tsk);
delayacct_tsk_free(tsk);
put_signal_struct(tsk->signal);
-
- if (!profile_handoff_task(tsk))
- free_task(tsk);
+ sched_core_free(tsk);
+ free_task(tsk);
}
EXPORT_SYMBOL_GPL(__put_task_struct);
+void __put_task_struct_rcu_cb(struct rcu_head *rhp)
+{
+ struct task_struct *task = container_of(rhp, struct task_struct, rcu);
+
+ __put_task_struct(task);
+}
+EXPORT_SYMBOL_GPL(__put_task_struct_rcu_cb);
+
void __init __weak arch_task_cache_init(void) { }
/*
* set_max_threads
*/
-static void set_max_threads(unsigned int max_threads_suggested)
+static void __init set_max_threads(unsigned int max_threads_suggested)
{
u64 threads;
+ unsigned long nr_pages = memblock_estimated_nr_free_pages();
/*
* The number of threads shall be limited such that the thread
* structures may only consume a small part of the available memory.
*/
- if (fls64(totalram_pages) + fls64(PAGE_SIZE) > 64)
+ if (fls64(nr_pages) + fls64(PAGE_SIZE) > 64)
threads = MAX_THREADS;
else
- threads = div64_u64((u64) totalram_pages * (u64) PAGE_SIZE,
+ threads = div64_u64((u64) nr_pages * (u64) PAGE_SIZE,
(u64) THREAD_SIZE * 8UL);
if (threads > max_threads_suggested)
@@ -287,17 +825,41 @@ static void set_max_threads(unsigned int max_threads_suggested)
max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS);
}
+#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
+/* Initialized by the architecture: */
+int arch_task_struct_size __read_mostly;
+#endif
+
+static void __init task_struct_whitelist(unsigned long *offset, unsigned long *size)
+{
+ /* Fetch thread_struct whitelist for the architecture. */
+ arch_thread_struct_whitelist(offset, size);
+
+ /*
+ * Handle zero-sized whitelist or empty thread_struct, otherwise
+ * adjust offset to position of thread_struct in task_struct.
+ */
+ if (unlikely(*size == 0))
+ *offset = 0;
+ else
+ *offset += offsetof(struct task_struct, thread);
+}
+
void __init fork_init(void)
{
-#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
+ int i;
#ifndef ARCH_MIN_TASKALIGN
-#define ARCH_MIN_TASKALIGN L1_CACHE_BYTES
+#define ARCH_MIN_TASKALIGN 0
#endif
+ int align = max_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN);
+ unsigned long useroffset, usersize;
+
/* create a slab on which task_structs can be allocated */
- task_struct_cachep =
- kmem_cache_create("task_struct", sizeof(struct task_struct),
- ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL);
-#endif
+ task_struct_whitelist(&useroffset, &usersize);
+ task_struct_cachep = kmem_cache_create_usercopy("task_struct",
+ arch_task_struct_size, align,
+ SLAB_PANIC|SLAB_ACCOUNT,
+ useroffset, usersize, NULL);
/* do the arch specific task caches init */
arch_task_cache_init();
@@ -308,6 +870,24 @@ void __init fork_init(void)
init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
init_task.signal->rlim[RLIMIT_SIGPENDING] =
init_task.signal->rlim[RLIMIT_NPROC];
+
+ for (i = 0; i < UCOUNT_COUNTS; i++)
+ init_user_ns.ucount_max[i] = max_threads/2;
+
+ set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_NPROC, RLIM_INFINITY);
+ set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_MSGQUEUE, RLIM_INFINITY);
+ set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_SIGPENDING, RLIM_INFINITY);
+ set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_MEMLOCK, RLIM_INFINITY);
+
+#ifdef CONFIG_VMAP_STACK
+ cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
+ NULL, free_vm_stack_cache);
+#endif
+
+ scs_init();
+
+ lockdep_init_task(&init_task);
+ uprobes_init();
}
int __weak arch_dup_task_struct(struct task_struct *dst,
@@ -325,26 +905,34 @@ void set_task_stack_end_magic(struct task_struct *tsk)
*stackend = STACK_END_MAGIC; /* for overflow detection */
}
-static struct task_struct *dup_task_struct(struct task_struct *orig)
+static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
{
struct task_struct *tsk;
- struct thread_info *ti;
- int node = tsk_fork_get_node(orig);
int err;
+ if (node == NUMA_NO_NODE)
+ node = tsk_fork_get_node(orig);
tsk = alloc_task_struct_node(node);
if (!tsk)
return NULL;
- ti = alloc_thread_info_node(tsk, node);
- if (!ti)
+ err = arch_dup_task_struct(tsk, orig);
+ if (err)
goto free_tsk;
- err = arch_dup_task_struct(tsk, orig);
+ err = alloc_thread_stack_node(tsk, node);
+ if (err)
+ goto free_tsk;
+
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+ refcount_set(&tsk->stack_refcount, 1);
+#endif
+ account_kernel_stack(tsk, 1);
+
+ err = scs_prepare(tsk, node);
if (err)
- goto free_ti;
+ goto free_stack;
- tsk->stack = ti;
#ifdef CONFIG_SECCOMP
/*
* We must handle setting up seccomp filters once we're under
@@ -359,194 +947,71 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
clear_user_return_notifier(tsk);
clear_tsk_need_resched(tsk);
set_task_stack_end_magic(tsk);
+ clear_syscall_work_syscall_user_dispatch(tsk);
-#ifdef CONFIG_CC_STACKPROTECTOR
- tsk->stack_canary = get_random_int();
+#ifdef CONFIG_STACKPROTECTOR
+ tsk->stack_canary = get_random_canary();
#endif
+ if (orig->cpus_ptr == &orig->cpus_mask)
+ tsk->cpus_ptr = &tsk->cpus_mask;
+ dup_user_cpus_ptr(tsk, orig, node);
/*
- * One for us, one for whoever does the "release_task()" (usually
- * parent)
+ * One for the user space visible state that goes away when reaped.
+ * One for the scheduler.
*/
- atomic_set(&tsk->usage, 2);
+ refcount_set(&tsk->rcu_users, 2);
+ /* One for the rcu users */
+ refcount_set(&tsk->usage, 1);
#ifdef CONFIG_BLK_DEV_IO_TRACE
tsk->btrace_seq = 0;
#endif
tsk->splice_pipe = NULL;
tsk->task_frag.page = NULL;
+ tsk->wake_q.next = NULL;
+ tsk->worker_private = NULL;
- account_kernel_stack(ti, 1);
-
- return tsk;
-
-free_ti:
- free_thread_info(ti);
-free_tsk:
- free_task_struct(tsk);
- return NULL;
-}
-
-#ifdef CONFIG_MMU
-static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
-{
- struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
- struct rb_node **rb_link, *rb_parent;
- int retval;
- unsigned long charge;
-
- uprobe_start_dup_mmap();
- down_write(&oldmm->mmap_sem);
- flush_cache_dup_mm(oldmm);
- uprobe_dup_mmap(oldmm, mm);
- /*
- * Not linked in yet - no deadlock potential:
- */
- down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
-
- /* No ordering required: file already has been exposed. */
- RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
-
- mm->total_vm = oldmm->total_vm;
- mm->shared_vm = oldmm->shared_vm;
- mm->exec_vm = oldmm->exec_vm;
- mm->stack_vm = oldmm->stack_vm;
-
- rb_link = &mm->mm_rb.rb_node;
- rb_parent = NULL;
- pprev = &mm->mmap;
- retval = ksm_fork(mm, oldmm);
- if (retval)
- goto out;
- retval = khugepaged_fork(mm, oldmm);
- if (retval)
- goto out;
+ kcov_task_init(tsk);
+ kmsan_task_create(tsk);
+ kmap_local_fork(tsk);
- prev = NULL;
- for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
- struct file *file;
-
- if (mpnt->vm_flags & VM_DONTCOPY) {
- vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
- -vma_pages(mpnt));
- continue;
- }
- charge = 0;
- if (mpnt->vm_flags & VM_ACCOUNT) {
- unsigned long len = vma_pages(mpnt);
-
- if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
- goto fail_nomem;
- charge = len;
- }
- tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
- if (!tmp)
- goto fail_nomem;
- *tmp = *mpnt;
- INIT_LIST_HEAD(&tmp->anon_vma_chain);
- retval = vma_dup_policy(mpnt, tmp);
- if (retval)
- goto fail_nomem_policy;
- tmp->vm_mm = mm;
- if (anon_vma_fork(tmp, mpnt))
- goto fail_nomem_anon_vma_fork;
- tmp->vm_flags &= ~VM_LOCKED;
- tmp->vm_next = tmp->vm_prev = NULL;
- file = tmp->vm_file;
- if (file) {
- struct inode *inode = file_inode(file);
- struct address_space *mapping = file->f_mapping;
-
- get_file(file);
- if (tmp->vm_flags & VM_DENYWRITE)
- atomic_dec(&inode->i_writecount);
- i_mmap_lock_write(mapping);
- if (tmp->vm_flags & VM_SHARED)
- atomic_inc(&mapping->i_mmap_writable);
- flush_dcache_mmap_lock(mapping);
- /* insert tmp into the share list, just after mpnt */
- vma_interval_tree_insert_after(tmp, mpnt,
- &mapping->i_mmap);
- flush_dcache_mmap_unlock(mapping);
- i_mmap_unlock_write(mapping);
- }
-
- /*
- * Clear hugetlb-related page reserves for children. This only
- * affects MAP_PRIVATE mappings. Faults generated by the child
- * are not guaranteed to succeed, even if read-only
- */
- if (is_vm_hugetlb_page(tmp))
- reset_vma_resv_huge_pages(tmp);
-
- /*
- * Link in the new vma and copy the page table entries.
- */
- *pprev = tmp;
- pprev = &tmp->vm_next;
- tmp->vm_prev = prev;
- prev = tmp;
+#ifdef CONFIG_FAULT_INJECTION
+ tsk->fail_nth = 0;
+#endif
- __vma_link_rb(mm, tmp, rb_link, rb_parent);
- rb_link = &tmp->vm_rb.rb_right;
- rb_parent = &tmp->vm_rb;
+#ifdef CONFIG_BLK_CGROUP
+ tsk->throttle_disk = NULL;
+ tsk->use_memdelay = 0;
+#endif
- mm->map_count++;
- retval = copy_page_range(mm, oldmm, mpnt);
+#ifdef CONFIG_ARCH_HAS_CPU_PASID
+ tsk->pasid_activated = 0;
+#endif
- if (tmp->vm_ops && tmp->vm_ops->open)
- tmp->vm_ops->open(tmp);
+#ifdef CONFIG_MEMCG
+ tsk->active_memcg = NULL;
+#endif
- if (retval)
- goto out;
- }
- /* a new mm has just been created */
- arch_dup_mmap(oldmm, mm);
- retval = 0;
-out:
- up_write(&mm->mmap_sem);
- flush_tlb_mm(oldmm);
- up_write(&oldmm->mmap_sem);
- uprobe_end_dup_mmap();
- return retval;
-fail_nomem_anon_vma_fork:
- mpol_put(vma_policy(tmp));
-fail_nomem_policy:
- kmem_cache_free(vm_area_cachep, tmp);
-fail_nomem:
- retval = -ENOMEM;
- vm_unacct_memory(charge);
- goto out;
-}
+#ifdef CONFIG_X86_BUS_LOCK_DETECT
+ tsk->reported_split_lock = 0;
+#endif
-static inline int mm_alloc_pgd(struct mm_struct *mm)
-{
- mm->pgd = pgd_alloc(mm);
- if (unlikely(!mm->pgd))
- return -ENOMEM;
- return 0;
-}
+#ifdef CONFIG_SCHED_MM_CID
+ tsk->mm_cid.cid = MM_CID_UNSET;
+ tsk->mm_cid.active = 0;
+#endif
+ return tsk;
-static inline void mm_free_pgd(struct mm_struct *mm)
-{
- pgd_free(mm, mm->pgd);
-}
-#else
-static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
-{
- down_write(&oldmm->mmap_sem);
- RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
- up_write(&oldmm->mmap_sem);
- return 0;
+free_stack:
+ exit_task_stack_account(tsk);
+ free_thread_stack(tsk);
+free_tsk:
+ free_task_struct(tsk);
+ return NULL;
}
-#define mm_alloc_pgd(mm) (0)
-#define mm_free_pgd(mm)
-#endif /* CONFIG_MMU */
__cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
-#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
-#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm)))
-
static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT;
static int __init coredump_filter_setup(char *s)
@@ -569,6 +1034,15 @@ static void mm_init_aio(struct mm_struct *mm)
#endif
}
+static __always_inline void mm_clear_owner(struct mm_struct *mm,
+ struct task_struct *p)
+{
+#ifdef CONFIG_MEMCG
+ if (mm->owner == p)
+ WRITE_ONCE(mm->owner, NULL);
+#endif
+}
+
static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
{
#ifdef CONFIG_MEMCG
@@ -576,79 +1050,102 @@ static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
#endif
}
-static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
+static void mm_init_uprobes_state(struct mm_struct *mm)
+{
+#ifdef CONFIG_UPROBES
+ mm->uprobes_state.xol_area = NULL;
+ arch_uprobe_init_state(mm);
+#endif
+}
+
+static void mmap_init_lock(struct mm_struct *mm)
+{
+ init_rwsem(&mm->mmap_lock);
+ mm_lock_seqcount_init(mm);
+#ifdef CONFIG_PER_VMA_LOCK
+ rcuwait_init(&mm->vma_writer_wait);
+#endif
+}
+
+static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
+ struct user_namespace *user_ns)
{
- mm->mmap = NULL;
- mm->mm_rb = RB_ROOT;
- mm->vmacache_seqnum = 0;
+ mt_init_flags(&mm->mm_mt, MM_MT_FLAGS);
+ mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock);
atomic_set(&mm->mm_users, 1);
atomic_set(&mm->mm_count, 1);
- init_rwsem(&mm->mmap_sem);
+ seqcount_init(&mm->write_protect_seq);
+ mmap_init_lock(mm);
INIT_LIST_HEAD(&mm->mmlist);
- mm->core_state = NULL;
- atomic_long_set(&mm->nr_ptes, 0);
- mm_nr_pmds_init(mm);
+ mm_pgtables_bytes_init(mm);
mm->map_count = 0;
mm->locked_vm = 0;
- mm->pinned_vm = 0;
+ atomic64_set(&mm->pinned_vm, 0);
memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
spin_lock_init(&mm->page_table_lock);
+ spin_lock_init(&mm->arg_lock);
mm_init_cpumask(mm);
mm_init_aio(mm);
mm_init_owner(mm, p);
- mmu_notifier_mm_init(mm);
- clear_tlb_flush_pending(mm);
-#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
+ mm_pasid_init(mm);
+ RCU_INIT_POINTER(mm->exe_file, NULL);
+ mmu_notifier_subscriptions_init(mm);
+ init_tlb_flush_pending(mm);
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS)
mm->pmd_huge_pte = NULL;
#endif
+ mm_init_uprobes_state(mm);
+ hugetlb_count_init(mm);
+ mm_flags_clear_all(mm);
if (current->mm) {
- mm->flags = current->mm->flags & MMF_INIT_MASK;
+ unsigned long flags = __mm_flags_get_word(current->mm);
+
+ __mm_flags_overwrite_word(mm, mmf_init_legacy_flags(flags));
mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK;
} else {
- mm->flags = default_dump_filter;
+ __mm_flags_overwrite_word(mm, default_dump_filter);
mm->def_flags = 0;
}
+ if (futex_mm_init(mm))
+ goto fail_mm_init;
+
if (mm_alloc_pgd(mm))
goto fail_nopgd;
+ if (mm_alloc_id(mm))
+ goto fail_noid;
+
if (init_new_context(p, mm))
goto fail_nocontext;
+ if (mm_alloc_cid(mm, p))
+ goto fail_cid;
+
+ if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT,
+ NR_MM_COUNTERS))
+ goto fail_pcpu;
+
+ mm->user_ns = get_user_ns(user_ns);
+ lru_gen_init_mm(mm);
return mm;
+fail_pcpu:
+ mm_destroy_cid(mm);
+fail_cid:
+ destroy_context(mm);
fail_nocontext:
+ mm_free_id(mm);
+fail_noid:
mm_free_pgd(mm);
fail_nopgd:
+ futex_hash_free(mm);
+fail_mm_init:
free_mm(mm);
return NULL;
}
-static void check_mm(struct mm_struct *mm)
-{
- int i;
-
- for (i = 0; i < NR_MM_COUNTERS; i++) {
- long x = atomic_long_read(&mm->rss_stat.count[i]);
-
- if (unlikely(x))
- printk(KERN_ALERT "BUG: Bad rss-counter state "
- "mm:%p idx:%d val:%ld\n", mm, i, x);
- }
-
- if (atomic_long_read(&mm->nr_ptes))
- pr_alert("BUG: non-zero nr_ptes on freeing mm: %ld\n",
- atomic_long_read(&mm->nr_ptes));
- if (mm_nr_pmds(mm))
- pr_alert("BUG: non-zero nr_pmds on freeing mm: %ld\n",
- mm_nr_pmds(mm));
-
-#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
- VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
-#endif
-}
-
/*
* Allocate and initialize an mm_struct.
*/
@@ -661,24 +1158,32 @@ struct mm_struct *mm_alloc(void)
return NULL;
memset(mm, 0, sizeof(*mm));
- return mm_init(mm, current);
+ return mm_init(mm, current, current_user_ns());
}
+EXPORT_SYMBOL_IF_KUNIT(mm_alloc);
-/*
- * Called when the last reference to the mm
- * is dropped: either by a lazy thread or by
- * mmput. Free the page directory and the mm.
- */
-void __mmdrop(struct mm_struct *mm)
+static inline void __mmput(struct mm_struct *mm)
{
- BUG_ON(mm == &init_mm);
- mm_free_pgd(mm);
- destroy_context(mm);
- mmu_notifier_mm_destroy(mm);
- check_mm(mm);
- free_mm(mm);
+ VM_BUG_ON(atomic_read(&mm->mm_users));
+
+ uprobe_clear_state(mm);
+ exit_aio(mm);
+ ksm_exit(mm);
+ khugepaged_exit(mm); /* must run before exit_mmap */
+ exit_mmap(mm);
+ mm_put_huge_zero_folio(mm);
+ set_mm_exe_file(mm, NULL);
+ if (!list_empty(&mm->mmlist)) {
+ spin_lock(&mmlist_lock);
+ list_del(&mm->mmlist);
+ spin_unlock(&mmlist_lock);
+ }
+ if (mm->binfmt)
+ module_put(mm->binfmt->module);
+ lru_gen_del_mm(mm);
+ futex_hash_free(mm);
+ mmdrop(mm);
}
-EXPORT_SYMBOL_GPL(__mmdrop);
/*
* Decrement the use count and release all resources for an mm.
@@ -687,37 +1192,44 @@ void mmput(struct mm_struct *mm)
{
might_sleep();
+ if (atomic_dec_and_test(&mm->mm_users))
+ __mmput(mm);
+}
+EXPORT_SYMBOL_GPL(mmput);
+
+#if defined(CONFIG_MMU) || defined(CONFIG_FUTEX_PRIVATE_HASH)
+static void mmput_async_fn(struct work_struct *work)
+{
+ struct mm_struct *mm = container_of(work, struct mm_struct,
+ async_put_work);
+
+ __mmput(mm);
+}
+
+void mmput_async(struct mm_struct *mm)
+{
if (atomic_dec_and_test(&mm->mm_users)) {
- uprobe_clear_state(mm);
- exit_aio(mm);
- ksm_exit(mm);
- khugepaged_exit(mm); /* must run before exit_mmap */
- exit_mmap(mm);
- set_mm_exe_file(mm, NULL);
- if (!list_empty(&mm->mmlist)) {
- spin_lock(&mmlist_lock);
- list_del(&mm->mmlist);
- spin_unlock(&mmlist_lock);
- }
- if (mm->binfmt)
- module_put(mm->binfmt->module);
- mmdrop(mm);
+ INIT_WORK(&mm->async_put_work, mmput_async_fn);
+ schedule_work(&mm->async_put_work);
}
}
-EXPORT_SYMBOL_GPL(mmput);
+EXPORT_SYMBOL_GPL(mmput_async);
+#endif
/**
* set_mm_exe_file - change a reference to the mm's executable file
+ * @mm: The mm to change.
+ * @new_exe_file: The new file to use.
*
* This changes mm's executable file (shown as symlink /proc/[pid]/exe).
*
* Main users are mmput() and sys_execve(). Callers prevent concurrent
- * invocations: in mmput() nobody alive left, in execve task is single
- * threaded. sys_prctl(PR_SET_MM_MAP/EXE_FILE) also needs to set the
- * mm->exe_file, but does so without using set_mm_exe_file() in order
- * to do avoid the need for any locks.
+ * invocations: in mmput() nobody alive left, in execve it happens before
+ * the new mm is made visible to anyone.
+ *
+ * Can only fail if new_exe_file != NULL.
*/
-void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
+int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
{
struct file *old_exe_file;
@@ -728,15 +1240,79 @@ void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
*/
old_exe_file = rcu_dereference_raw(mm->exe_file);
- if (new_exe_file)
+ if (new_exe_file) {
+ /*
+ * We expect the caller (i.e., sys_execve) to already denied
+ * write access, so this is unlikely to fail.
+ */
+ if (unlikely(exe_file_deny_write_access(new_exe_file)))
+ return -EACCES;
get_file(new_exe_file);
+ }
rcu_assign_pointer(mm->exe_file, new_exe_file);
- if (old_exe_file)
+ if (old_exe_file) {
+ exe_file_allow_write_access(old_exe_file);
fput(old_exe_file);
+ }
+ return 0;
+}
+
+/**
+ * replace_mm_exe_file - replace a reference to the mm's executable file
+ * @mm: The mm to change.
+ * @new_exe_file: The new file to use.
+ *
+ * This changes mm's executable file (shown as symlink /proc/[pid]/exe).
+ *
+ * Main user is sys_prctl(PR_SET_MM_MAP/EXE_FILE).
+ */
+int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
+{
+ struct vm_area_struct *vma;
+ struct file *old_exe_file;
+ int ret = 0;
+
+ /* Forbid mm->exe_file change if old file still mapped. */
+ old_exe_file = get_mm_exe_file(mm);
+ if (old_exe_file) {
+ VMA_ITERATOR(vmi, mm, 0);
+ mmap_read_lock(mm);
+ for_each_vma(vmi, vma) {
+ if (!vma->vm_file)
+ continue;
+ if (path_equal(&vma->vm_file->f_path,
+ &old_exe_file->f_path)) {
+ ret = -EBUSY;
+ break;
+ }
+ }
+ mmap_read_unlock(mm);
+ fput(old_exe_file);
+ if (ret)
+ return ret;
+ }
+
+ ret = exe_file_deny_write_access(new_exe_file);
+ if (ret)
+ return -EACCES;
+ get_file(new_exe_file);
+
+ /* set the new file */
+ mmap_write_lock(mm);
+ old_exe_file = rcu_dereference_raw(mm->exe_file);
+ rcu_assign_pointer(mm->exe_file, new_exe_file);
+ mmap_write_unlock(mm);
+
+ if (old_exe_file) {
+ exe_file_allow_write_access(old_exe_file);
+ fput(old_exe_file);
+ }
+ return 0;
}
/**
* get_mm_exe_file - acquire a reference to the mm's executable file
+ * @mm: The mm of interest.
*
* Returns %NULL if mm has no associated executable file.
* User must release file via fput().
@@ -746,16 +1322,38 @@ struct file *get_mm_exe_file(struct mm_struct *mm)
struct file *exe_file;
rcu_read_lock();
- exe_file = rcu_dereference(mm->exe_file);
- if (exe_file && !get_file_rcu(exe_file))
- exe_file = NULL;
+ exe_file = get_file_rcu(&mm->exe_file);
rcu_read_unlock();
return exe_file;
}
-EXPORT_SYMBOL(get_mm_exe_file);
+
+/**
+ * get_task_exe_file - acquire a reference to the task's executable file
+ * @task: The task.
+ *
+ * Returns %NULL if task's mm (if any) has no associated executable file or
+ * this is a kernel thread with borrowed mm (see the comment above get_task_mm).
+ * User must release file via fput().
+ */
+struct file *get_task_exe_file(struct task_struct *task)
+{
+ struct file *exe_file = NULL;
+ struct mm_struct *mm;
+
+ if (task->flags & PF_KTHREAD)
+ return NULL;
+
+ task_lock(task);
+ mm = task->mm;
+ if (mm)
+ exe_file = get_mm_exe_file(mm);
+ task_unlock(task);
+ return exe_file;
+}
/**
* get_task_mm - acquire a reference to the task's mm
+ * @task: The task.
*
* Returns %NULL if the task has no mm. Checks PF_KTHREAD (meaning
* this kernel workthread has transiently adopted a user mm with use_mm,
@@ -767,35 +1365,46 @@ struct mm_struct *get_task_mm(struct task_struct *task)
{
struct mm_struct *mm;
+ if (task->flags & PF_KTHREAD)
+ return NULL;
+
task_lock(task);
mm = task->mm;
- if (mm) {
- if (task->flags & PF_KTHREAD)
- mm = NULL;
- else
- atomic_inc(&mm->mm_users);
- }
+ if (mm)
+ mmget(mm);
task_unlock(task);
return mm;
}
EXPORT_SYMBOL_GPL(get_task_mm);
+static bool may_access_mm(struct mm_struct *mm, struct task_struct *task, unsigned int mode)
+{
+ if (mm == current->mm)
+ return true;
+ if (ptrace_may_access(task, mode))
+ return true;
+ if ((mode & PTRACE_MODE_READ) && perfmon_capable())
+ return true;
+ return false;
+}
+
struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
{
struct mm_struct *mm;
int err;
- err = mutex_lock_killable(&task->signal->cred_guard_mutex);
+ err = down_read_killable(&task->signal->exec_update_lock);
if (err)
return ERR_PTR(err);
mm = get_task_mm(task);
- if (mm && mm != current->mm &&
- !ptrace_may_access(task, mode)) {
+ if (!mm) {
+ mm = ERR_PTR(-ESRCH);
+ } else if (!may_access_mm(mm, task, mode)) {
mmput(mm);
mm = ERR_PTR(-EACCES);
}
- mutex_unlock(&task->signal->cred_guard_mutex);
+ up_read(&task->signal->exec_update_lock);
return mm;
}
@@ -816,11 +1425,12 @@ static void complete_vfork_done(struct task_struct *tsk)
static int wait_for_vfork_done(struct task_struct *child,
struct completion *vfork)
{
+ unsigned int state = TASK_KILLABLE|TASK_FREEZABLE;
int killed;
- freezer_do_not_count();
- killed = wait_for_completion_killable(vfork);
- freezer_count();
+ cgroup_enter_frozen();
+ killed = wait_for_completion_state(vfork, state);
+ cgroup_leave_frozen(false);
if (killed) {
task_lock(child);
@@ -845,46 +1455,27 @@ static int wait_for_vfork_done(struct task_struct *child,
* restoring the old one. . .
* Eric Biederman 10 January 1998
*/
-void mm_release(struct task_struct *tsk, struct mm_struct *mm)
+static void mm_release(struct task_struct *tsk, struct mm_struct *mm)
{
- /* Get rid of any futexes when releasing the mm */
-#ifdef CONFIG_FUTEX
- if (unlikely(tsk->robust_list)) {
- exit_robust_list(tsk);
- tsk->robust_list = NULL;
- }
-#ifdef CONFIG_COMPAT
- if (unlikely(tsk->compat_robust_list)) {
- compat_exit_robust_list(tsk);
- tsk->compat_robust_list = NULL;
- }
-#endif
- if (unlikely(!list_empty(&tsk->pi_state_list)))
- exit_pi_state_list(tsk);
-#endif
-
uprobe_free_utask(tsk);
/* Get rid of any cached register state */
deactivate_mm(tsk, mm);
/*
- * If we're exiting normally, clear a user-space tid field if
- * requested. We leave this alone when dying by signal, to leave
- * the value intact in a core dump, and to save the unnecessary
- * trouble, say, a killed vfork parent shouldn't touch this mm.
- * Userland only wants this done for a sys_exit.
+ * Signal userspace if we're not exiting with a core dump
+ * because we want to leave the value intact for debugging
+ * purposes.
*/
if (tsk->clear_child_tid) {
- if (!(tsk->flags & PF_SIGNALED) &&
- atomic_read(&mm->mm_users) > 1) {
+ if (atomic_read(&mm->mm_users) > 1) {
/*
* We don't check the error code - if userspace has
* not set up a proper pointer then tough luck.
*/
put_user(0, tsk->clear_child_tid);
- sys_futex(tsk->clear_child_tid, FUTEX_WAKE,
- 1, NULL, NULL, 0);
+ do_futex(tsk->clear_child_tid, FUTEX_WAKE,
+ 1, NULL, NULL, 0, 0);
}
tsk->clear_child_tid = NULL;
}
@@ -897,13 +1488,32 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
complete_vfork_done(tsk);
}
-/*
- * Allocate a new mm structure and copy contents from the
- * mm structure of the passed in task structure.
+void exit_mm_release(struct task_struct *tsk, struct mm_struct *mm)
+{
+ futex_exit_release(tsk);
+ mm_release(tsk, mm);
+}
+
+void exec_mm_release(struct task_struct *tsk, struct mm_struct *mm)
+{
+ futex_exec_release(tsk);
+ mm_release(tsk, mm);
+}
+
+/**
+ * dup_mm() - duplicates an existing mm structure
+ * @tsk: the task_struct with which the new mm will be associated.
+ * @oldmm: the mm to duplicate.
+ *
+ * Allocates a new mm structure and duplicates the provided @oldmm structure
+ * content into it.
+ *
+ * Return: the duplicated mm or NULL on failure.
*/
-static struct mm_struct *dup_mm(struct task_struct *tsk)
+static struct mm_struct *dup_mm(struct task_struct *tsk,
+ struct mm_struct *oldmm)
{
- struct mm_struct *mm, *oldmm = current->mm;
+ struct mm_struct *mm;
int err;
mm = allocate_mm();
@@ -912,12 +1522,14 @@ static struct mm_struct *dup_mm(struct task_struct *tsk)
memcpy(mm, oldmm, sizeof(*mm));
- if (!mm_init(mm, tsk))
+ if (!mm_init(mm, tsk, mm->user_ns))
goto fail_nomem;
+ uprobe_start_dup_mmap();
err = dup_mmap(mm, oldmm);
if (err)
goto free_pt;
+ uprobe_end_dup_mmap();
mm->hiwater_rss = get_mm_rss(mm);
mm->hiwater_vm = mm->total_vm;
@@ -930,21 +1542,24 @@ static struct mm_struct *dup_mm(struct task_struct *tsk)
free_pt:
/* don't put binfmt in mmput, we haven't got module yet */
mm->binfmt = NULL;
+ mm_init_owner(mm, NULL);
mmput(mm);
+ if (err)
+ uprobe_end_dup_mmap();
fail_nomem:
return NULL;
}
-static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
+static int copy_mm(u64 clone_flags, struct task_struct *tsk)
{
struct mm_struct *mm, *oldmm;
- int retval;
tsk->min_flt = tsk->maj_flt = 0;
tsk->nvcsw = tsk->nivcsw = 0;
#ifdef CONFIG_DETECT_HUNG_TASK
tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw;
+ tsk->last_switch_time = 0;
#endif
tsk->mm = NULL;
@@ -959,41 +1574,34 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
if (!oldmm)
return 0;
- /* initialize the new vmacache entries */
- vmacache_flush(tsk);
-
if (clone_flags & CLONE_VM) {
- atomic_inc(&oldmm->mm_users);
+ mmget(oldmm);
mm = oldmm;
- goto good_mm;
+ } else {
+ mm = dup_mm(tsk, current->mm);
+ if (!mm)
+ return -ENOMEM;
}
- retval = -ENOMEM;
- mm = dup_mm(tsk);
- if (!mm)
- goto fail_nomem;
-
-good_mm:
tsk->mm = mm;
tsk->active_mm = mm;
+ sched_mm_cid_fork(tsk);
return 0;
-
-fail_nomem:
- return retval;
}
-static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
+static int copy_fs(u64 clone_flags, struct task_struct *tsk)
{
struct fs_struct *fs = current->fs;
if (clone_flags & CLONE_FS) {
/* tsk->fs is already what we want */
- spin_lock(&fs->lock);
+ read_seqlock_excl(&fs->seq);
+ /* "users" and "in_exec" locked for check_unsafe_exec() */
if (fs->in_exec) {
- spin_unlock(&fs->lock);
+ read_sequnlock_excl(&fs->seq);
return -EAGAIN;
}
fs->users++;
- spin_unlock(&fs->lock);
+ read_sequnlock_excl(&fs->seq);
return 0;
}
tsk->fs = copy_fs_struct(fs);
@@ -1002,82 +1610,67 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
return 0;
}
-static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
+static int copy_files(u64 clone_flags, struct task_struct *tsk,
+ int no_files)
{
struct files_struct *oldf, *newf;
- int error = 0;
/*
* A background process may not have any files ...
*/
oldf = current->files;
if (!oldf)
- goto out;
+ return 0;
+
+ if (no_files) {
+ tsk->files = NULL;
+ return 0;
+ }
if (clone_flags & CLONE_FILES) {
atomic_inc(&oldf->count);
- goto out;
+ return 0;
}
- newf = dup_fd(oldf, &error);
- if (!newf)
- goto out;
+ newf = dup_fd(oldf, NULL);
+ if (IS_ERR(newf))
+ return PTR_ERR(newf);
tsk->files = newf;
- error = 0;
-out:
- return error;
-}
-
-static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
-{
-#ifdef CONFIG_BLOCK
- struct io_context *ioc = current->io_context;
- struct io_context *new_ioc;
-
- if (!ioc)
- return 0;
- /*
- * Share io context with parent, if CLONE_IO is set
- */
- if (clone_flags & CLONE_IO) {
- ioc_task_link(ioc);
- tsk->io_context = ioc;
- } else if (ioprio_valid(ioc->ioprio)) {
- new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE);
- if (unlikely(!new_ioc))
- return -ENOMEM;
-
- new_ioc->ioprio = ioc->ioprio;
- put_io_context(new_ioc);
- }
-#endif
return 0;
}
-static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
+static int copy_sighand(u64 clone_flags, struct task_struct *tsk)
{
struct sighand_struct *sig;
if (clone_flags & CLONE_SIGHAND) {
- atomic_inc(&current->sighand->count);
+ refcount_inc(&current->sighand->count);
return 0;
}
sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
- rcu_assign_pointer(tsk->sighand, sig);
+ RCU_INIT_POINTER(tsk->sighand, sig);
if (!sig)
return -ENOMEM;
- atomic_set(&sig->count, 1);
+
+ refcount_set(&sig->count, 1);
+ spin_lock_irq(&current->sighand->siglock);
memcpy(sig->action, current->sighand->action, sizeof(sig->action));
+ spin_unlock_irq(&current->sighand->siglock);
+
+ /* Reset all signal handler not set to SIG_IGN to SIG_DFL. */
+ if (clone_flags & CLONE_CLEAR_SIGHAND)
+ flush_signal_handlers(tsk, 0);
+
return 0;
}
void __cleanup_sighand(struct sighand_struct *sighand)
{
- if (atomic_dec_and_test(&sighand->count)) {
+ if (refcount_dec_and_test(&sighand->count)) {
signalfd_cleanup(sighand);
/*
- * sighand_cachep is SLAB_DESTROY_BY_RCU so we can free it
+ * sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it
* without an RCU grace period, see __lock_task_sighand().
*/
kmem_cache_free(sighand_cachep, sighand);
@@ -1089,24 +1682,14 @@ void __cleanup_sighand(struct sighand_struct *sighand)
*/
static void posix_cpu_timers_init_group(struct signal_struct *sig)
{
+ struct posix_cputimers *pct = &sig->posix_cputimers;
unsigned long cpu_limit;
- /* Thread group counters. */
- thread_group_cputime_init(sig);
-
- cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
- if (cpu_limit != RLIM_INFINITY) {
- sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
- sig->cputimer.running = 1;
- }
-
- /* The timer lists. */
- INIT_LIST_HEAD(&sig->cpu_timers[0]);
- INIT_LIST_HEAD(&sig->cpu_timers[1]);
- INIT_LIST_HEAD(&sig->cpu_timers[2]);
+ cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
+ posix_cputimers_group_init(pct, cpu_limit);
}
-static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
+static int copy_signal(u64 clone_flags, struct task_struct *tsk)
{
struct signal_struct *sig;
@@ -1119,8 +1702,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
return -ENOMEM;
sig->nr_threads = 1;
+ sig->quick_threads = 1;
atomic_set(&sig->live, 1);
- atomic_set(&sig->sigcnt, 1);
+ refcount_set(&sig->sigcnt, 1);
/* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */
sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node);
@@ -1129,11 +1713,15 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
init_waitqueue_head(&sig->wait_chldexit);
sig->curr_target = tsk;
init_sigpending(&sig->shared_pending);
- INIT_LIST_HEAD(&sig->posix_timers);
+ INIT_HLIST_HEAD(&sig->multiprocess);
seqlock_init(&sig->stats_lock);
+ prev_cputime_init(&sig->prev_cputime);
- hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- sig->real_timer.function = it_real_fn;
+#ifdef CONFIG_POSIX_TIMERS
+ INIT_HLIST_HEAD(&sig->posix_timers);
+ INIT_HLIST_HEAD(&sig->ignored_posix_timers);
+ hrtimer_setup(&sig->real_timer, it_real_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+#endif
task_lock(current->group_leader);
memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
@@ -1145,16 +1733,14 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
sched_autogroup_fork(sig);
#ifdef CONFIG_CGROUPS
- init_rwsem(&sig->group_rwsem);
+ init_rwsem(&sig->cgroup_threadgroup_rwsem);
#endif
sig->oom_score_adj = current->signal->oom_score_adj;
sig->oom_score_adj_min = current->signal->oom_score_adj_min;
- sig->has_child_subreaper = current->signal->has_child_subreaper ||
- current->signal->is_child_subreaper;
-
mutex_init(&sig->cred_guard_mutex);
+ init_rwsem(&sig->exec_update_lock);
return 0;
}
@@ -1188,7 +1774,7 @@ static void copy_seccomp(struct task_struct *p)
* to manually enable the seccomp thread flag here.
*/
if (p->seccomp.mode != SECCOMP_MODE_DISABLED)
- set_tsk_thread_flag(p, TIF_SECCOMP);
+ set_task_syscall_work(p, SECCOMP);
#endif
}
@@ -1203,29 +1789,170 @@ static void rt_mutex_init_task(struct task_struct *p)
{
raw_spin_lock_init(&p->pi_lock);
#ifdef CONFIG_RT_MUTEXES
- p->pi_waiters = RB_ROOT;
- p->pi_waiters_leftmost = NULL;
+ p->pi_waiters = RB_ROOT_CACHED;
+ p->pi_top_task = NULL;
p->pi_blocked_on = NULL;
#endif
}
-/*
- * Initialize POSIX timer handling for a single task.
- */
-static void posix_cpu_timers_init(struct task_struct *tsk)
+static inline void init_task_pid_links(struct task_struct *task)
{
- tsk->cputime_expires.prof_exp = 0;
- tsk->cputime_expires.virt_exp = 0;
- tsk->cputime_expires.sched_exp = 0;
- INIT_LIST_HEAD(&tsk->cpu_timers[0]);
- INIT_LIST_HEAD(&tsk->cpu_timers[1]);
- INIT_LIST_HEAD(&tsk->cpu_timers[2]);
+ enum pid_type type;
+
+ for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type)
+ INIT_HLIST_NODE(&task->pid_links[type]);
}
static inline void
init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid)
{
- task->pids[type].pid = pid;
+ if (type == PIDTYPE_PID)
+ task->thread_pid = pid;
+ else
+ task->signal->pids[type] = pid;
+}
+
+static inline void rcu_copy_process(struct task_struct *p)
+{
+#ifdef CONFIG_PREEMPT_RCU
+ p->rcu_read_lock_nesting = 0;
+ p->rcu_read_unlock_special.s = 0;
+ p->rcu_blocked_node = NULL;
+ INIT_LIST_HEAD(&p->rcu_node_entry);
+#endif /* #ifdef CONFIG_PREEMPT_RCU */
+#ifdef CONFIG_TASKS_RCU
+ p->rcu_tasks_holdout = false;
+ INIT_LIST_HEAD(&p->rcu_tasks_holdout_list);
+ p->rcu_tasks_idle_cpu = -1;
+ INIT_LIST_HEAD(&p->rcu_tasks_exit_list);
+#endif /* #ifdef CONFIG_TASKS_RCU */
+#ifdef CONFIG_TASKS_TRACE_RCU
+ p->trc_reader_nesting = 0;
+ p->trc_reader_special.s = 0;
+ INIT_LIST_HEAD(&p->trc_holdout_list);
+ INIT_LIST_HEAD(&p->trc_blkd_node);
+#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
+}
+
+/**
+ * pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
+ * @pid: the struct pid for which to create a pidfd
+ * @flags: flags of the new @pidfd
+ * @ret_file: return the new pidfs file
+ *
+ * Allocate a new file that stashes @pid and reserve a new pidfd number in the
+ * caller's file descriptor table. The pidfd is reserved but not installed yet.
+ *
+ * The helper verifies that @pid is still in use, without PIDFD_THREAD the
+ * task identified by @pid must be a thread-group leader.
+ *
+ * If this function returns successfully the caller is responsible to either
+ * call fd_install() passing the returned pidfd and pidfd file as arguments in
+ * order to install the pidfd into its file descriptor table or they must use
+ * put_unused_fd() and fput() on the returned pidfd and pidfd file
+ * respectively.
+ *
+ * This function is useful when a pidfd must already be reserved but there
+ * might still be points of failure afterwards and the caller wants to ensure
+ * that no pidfd is leaked into its file descriptor table.
+ *
+ * Return: On success, a reserved pidfd is returned from the function and a new
+ * pidfd file is returned in the last argument to the function. On
+ * error, a negative error code is returned from the function and the
+ * last argument remains unchanged.
+ */
+int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret_file)
+{
+ struct file *pidfs_file;
+
+ /*
+ * PIDFD_STALE is only allowed to be passed if the caller knows
+ * that @pid is already registered in pidfs and thus
+ * PIDFD_INFO_EXIT information is guaranteed to be available.
+ */
+ if (!(flags & PIDFD_STALE)) {
+ /*
+ * While holding the pidfd waitqueue lock removing the
+ * task linkage for the thread-group leader pid
+ * (PIDTYPE_TGID) isn't possible. Thus, if there's still
+ * task linkage for PIDTYPE_PID not having thread-group
+ * leader linkage for the pid means it wasn't a
+ * thread-group leader in the first place.
+ */
+ guard(spinlock_irq)(&pid->wait_pidfd.lock);
+
+ /* Task has already been reaped. */
+ if (!pid_has_task(pid, PIDTYPE_PID))
+ return -ESRCH;
+ /*
+ * If this struct pid isn't used as a thread-group
+ * leader but the caller requested to create a
+ * thread-group leader pidfd then report ENOENT.
+ */
+ if (!(flags & PIDFD_THREAD) && !pid_has_task(pid, PIDTYPE_TGID))
+ return -ENOENT;
+ }
+
+ CLASS(get_unused_fd, pidfd)(O_CLOEXEC);
+ if (pidfd < 0)
+ return pidfd;
+
+ pidfs_file = pidfs_alloc_file(pid, flags | O_RDWR);
+ if (IS_ERR(pidfs_file))
+ return PTR_ERR(pidfs_file);
+
+ *ret_file = pidfs_file;
+ return take_fd(pidfd);
+}
+
+static void __delayed_free_task(struct rcu_head *rhp)
+{
+ struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
+
+ free_task(tsk);
+}
+
+static __always_inline void delayed_free_task(struct task_struct *tsk)
+{
+ if (IS_ENABLED(CONFIG_MEMCG))
+ call_rcu(&tsk->rcu, __delayed_free_task);
+ else
+ free_task(tsk);
+}
+
+static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk)
+{
+ /* Skip if kernel thread */
+ if (!tsk->mm)
+ return;
+
+ /* Skip if spawning a thread or using vfork */
+ if ((clone_flags & (CLONE_VM | CLONE_THREAD | CLONE_VFORK)) != CLONE_VM)
+ return;
+
+ /* We need to synchronize with __set_oom_adj */
+ mutex_lock(&oom_adj_mutex);
+ mm_flags_set(MMF_MULTIPROCESS, tsk->mm);
+ /* Update the values in case they were changed after copy_signal */
+ tsk->signal->oom_score_adj = current->signal->oom_score_adj;
+ tsk->signal->oom_score_adj_min = current->signal->oom_score_adj_min;
+ mutex_unlock(&oom_adj_mutex);
+}
+
+#ifdef CONFIG_RV
+static void rv_task_fork(struct task_struct *p)
+{
+ memset(&p->rv, 0, sizeof(p->rv));
+}
+#else
+#define rv_task_fork(p) do {} while (0)
+#endif
+
+static bool need_futex_hash_allocate_default(u64 clone_flags)
+{
+ if ((clone_flags & (CLONE_THREAD | CLONE_VM)) != (CLONE_THREAD | CLONE_VM))
+ return false;
+ return true;
}
/*
@@ -1236,16 +1963,23 @@ init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid)
* parts of the process environment (as per the clone
* flags). The actual kick-off is left to the caller.
*/
-static struct task_struct *copy_process(unsigned long clone_flags,
- unsigned long stack_start,
- unsigned long stack_size,
- int __user *child_tidptr,
+__latent_entropy struct task_struct *copy_process(
struct pid *pid,
- int trace)
+ int trace,
+ int node,
+ struct kernel_clone_args *args)
{
- int retval;
+ int pidfd = -1, retval;
struct task_struct *p;
+ struct multiprocess_signals delayed;
+ struct file *pidfile = NULL;
+ const u64 clone_flags = args->flags;
+ struct nsproxy *nsp = current->nsproxy;
+ /*
+ * Don't allow sharing the root directory with processes in a different
+ * namespace
+ */
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
return ERR_PTR(-EINVAL);
@@ -1279,57 +2013,99 @@ static struct task_struct *copy_process(unsigned long clone_flags,
/*
* If the new process will be in a different pid or user namespace
- * do not allow it to share a thread group or signal handlers or
- * parent with the forking task.
+ * do not allow it to share a thread group with the forking task.
*/
- if (clone_flags & CLONE_SIGHAND) {
+ if (clone_flags & CLONE_THREAD) {
if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
- (task_active_pid_ns(current) !=
- current->nsproxy->pid_ns_for_children))
+ (task_active_pid_ns(current) != nsp->pid_ns_for_children))
return ERR_PTR(-EINVAL);
}
- retval = security_task_create(clone_flags);
- if (retval)
+ if (clone_flags & CLONE_PIDFD) {
+ /*
+ * - CLONE_DETACHED is blocked so that we can potentially
+ * reuse it later for CLONE_PIDFD.
+ */
+ if (clone_flags & CLONE_DETACHED)
+ return ERR_PTR(-EINVAL);
+ }
+
+ /*
+ * Force any signals received before this point to be delivered
+ * before the fork happens. Collect up signals sent to multiple
+ * processes that happen during the fork and delay them so that
+ * they appear to happen after the fork.
+ */
+ sigemptyset(&delayed.signal);
+ INIT_HLIST_NODE(&delayed.node);
+
+ spin_lock_irq(&current->sighand->siglock);
+ if (!(clone_flags & CLONE_THREAD))
+ hlist_add_head(&delayed.node, &current->signal->multiprocess);
+ recalc_sigpending();
+ spin_unlock_irq(&current->sighand->siglock);
+ retval = -ERESTARTNOINTR;
+ if (task_sigpending(current))
goto fork_out;
retval = -ENOMEM;
- p = dup_task_struct(current);
+ p = dup_task_struct(current, node);
if (!p)
goto fork_out;
+ p->flags &= ~PF_KTHREAD;
+ if (args->kthread)
+ p->flags |= PF_KTHREAD;
+ if (args->user_worker) {
+ /*
+ * Mark us a user worker, and block any signal that isn't
+ * fatal or STOP
+ */
+ p->flags |= PF_USER_WORKER;
+ siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP));
+ }
+ if (args->io_thread)
+ p->flags |= PF_IO_WORKER;
+
+ if (args->name)
+ strscpy_pad(p->comm, args->name, sizeof(p->comm));
+
+ p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL;
+ /*
+ * Clear TID on mm_release()?
+ */
+ p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? args->child_tid : NULL;
ftrace_graph_init_task(p);
rt_mutex_init_task(p);
+ lockdep_assert_irqs_enabled();
#ifdef CONFIG_PROVE_LOCKING
- DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
+ retval = copy_creds(p, clone_flags);
+ if (retval < 0)
+ goto bad_fork_free;
+
retval = -EAGAIN;
- if (atomic_read(&p->real_cred->user->processes) >=
- task_rlimit(p, RLIMIT_NPROC)) {
+ if (is_rlimit_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
if (p->real_cred->user != INIT_USER &&
!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
- goto bad_fork_free;
+ goto bad_fork_cleanup_count;
}
current->flags &= ~PF_NPROC_EXCEEDED;
- retval = copy_creds(p, clone_flags);
- if (retval < 0)
- goto bad_fork_free;
-
/*
* If multiple threads are within copy_process(), then this check
* triggers too late. This doesn't hurt, the check is only there
* to stop root fork bombs.
*/
retval = -EAGAIN;
- if (nr_threads >= max_threads)
+ if (data_race(nr_threads >= max_threads))
goto bad_fork_cleanup_count;
delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
- p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
+ p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE | PF_NO_SETAFFINITY);
p->flags |= PF_FORKNOEXEC;
INIT_LIST_HEAD(&p->children);
INIT_LIST_HEAD(&p->sibling);
@@ -1340,93 +2116,97 @@ static struct task_struct *copy_process(unsigned long clone_flags,
init_sigpending(&p->pending);
p->utime = p->stime = p->gtime = 0;
+#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
p->utimescaled = p->stimescaled = 0;
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
- p->prev_cputime.utime = p->prev_cputime.stime = 0;
#endif
+ prev_cputime_init(&p->prev_cputime);
+
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
- seqlock_init(&p->vtime_seqlock);
- p->vtime_snap = 0;
- p->vtime_snap_whence = VTIME_SLEEPING;
+ seqcount_init(&p->vtime.seqcount);
+ p->vtime.starttime = 0;
+ p->vtime.state = VTIME_INACTIVE;
#endif
-#if defined(SPLIT_RSS_COUNTING)
- memset(&p->rss_stat, 0, sizeof(p->rss_stat));
+#ifdef CONFIG_IO_URING
+ p->io_uring = NULL;
#endif
p->default_timer_slack_ns = current->timer_slack_ns;
+#ifdef CONFIG_PSI
+ p->psi_flags = 0;
+#endif
+
task_io_accounting_init(&p->ioac);
acct_clear_integrals(p);
- posix_cpu_timers_init(p);
+ posix_cputimers_init(&p->posix_cputimers);
+ tick_dep_init_task(p);
- p->start_time = ktime_get_ns();
- p->real_start_time = ktime_get_boot_ns();
p->io_context = NULL;
- p->audit_context = NULL;
- if (clone_flags & CLONE_THREAD)
- threadgroup_change_begin(current);
+ audit_set_context(p, NULL);
cgroup_fork(p);
+ if (args->kthread) {
+ if (!set_kthread_struct(p))
+ goto bad_fork_cleanup_delayacct;
+ }
#ifdef CONFIG_NUMA
p->mempolicy = mpol_dup(p->mempolicy);
if (IS_ERR(p->mempolicy)) {
retval = PTR_ERR(p->mempolicy);
p->mempolicy = NULL;
- goto bad_fork_cleanup_threadgroup_lock;
+ goto bad_fork_cleanup_delayacct;
}
#endif
#ifdef CONFIG_CPUSETS
p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
- p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
- seqcount_init(&p->mems_allowed_seq);
+ seqcount_spinlock_init(&p->mems_allowed_seq, &p->alloc_lock);
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
- p->irq_events = 0;
- p->hardirqs_enabled = 0;
- p->hardirq_enable_ip = 0;
- p->hardirq_enable_event = 0;
- p->hardirq_disable_ip = _THIS_IP_;
- p->hardirq_disable_event = 0;
- p->softirqs_enabled = 1;
- p->softirq_enable_ip = _THIS_IP_;
- p->softirq_enable_event = 0;
- p->softirq_disable_ip = 0;
- p->softirq_disable_event = 0;
- p->hardirq_context = 0;
- p->softirq_context = 0;
-#endif
-#ifdef CONFIG_LOCKDEP
- p->lockdep_depth = 0; /* no locks held yet */
- p->curr_chain_key = 0;
- p->lockdep_recursion = 0;
+ memset(&p->irqtrace, 0, sizeof(p->irqtrace));
+ p->irqtrace.hardirq_disable_ip = _THIS_IP_;
+ p->irqtrace.softirq_enable_ip = _THIS_IP_;
+ p->softirqs_enabled = 1;
+ p->softirq_context = 0;
#endif
-#ifdef CONFIG_DEBUG_MUTEXES
+ p->pagefault_disabled = 0;
+
+ lockdep_init_task(p);
+
p->blocked_on = NULL; /* not blocked yet */
-#endif
+
#ifdef CONFIG_BCACHE
p->sequential_io = 0;
p->sequential_io_avg = 0;
#endif
+#ifdef CONFIG_BPF_SYSCALL
+ RCU_INIT_POINTER(p->bpf_storage, NULL);
+ p->bpf_ctx = NULL;
+#endif
+
+ unwind_task_init(p);
/* Perform scheduler related setup. Assign this task to a CPU. */
retval = sched_fork(clone_flags, p);
if (retval)
goto bad_fork_cleanup_policy;
- retval = perf_event_init_task(p);
+ retval = perf_event_init_task(p, clone_flags);
if (retval)
- goto bad_fork_cleanup_policy;
+ goto bad_fork_sched_cancel_fork;
retval = audit_alloc(p);
if (retval)
goto bad_fork_cleanup_perf;
/* copy all the process information */
shm_init_task(p);
- retval = copy_semundo(clone_flags, p);
+ retval = security_task_alloc(p, clone_flags);
if (retval)
goto bad_fork_cleanup_audit;
- retval = copy_files(clone_flags, p);
+ retval = copy_semundo(clone_flags, p);
+ if (retval)
+ goto bad_fork_cleanup_security;
+ retval = copy_files(clone_flags, p, args->no_files);
if (retval)
goto bad_fork_cleanup_semundo;
retval = copy_fs(clone_flags, p);
@@ -1447,62 +2227,71 @@ static struct task_struct *copy_process(unsigned long clone_flags,
retval = copy_io(clone_flags, p);
if (retval)
goto bad_fork_cleanup_namespaces;
- retval = copy_thread(clone_flags, stack_start, stack_size, p);
+ retval = copy_thread(p, args);
if (retval)
goto bad_fork_cleanup_io;
+ stackleak_task_init(p);
+
if (pid != &init_struct_pid) {
- pid = alloc_pid(p->nsproxy->pid_ns_for_children);
+ pid = alloc_pid(p->nsproxy->pid_ns_for_children, args->set_tid,
+ args->set_tid_size);
if (IS_ERR(pid)) {
retval = PTR_ERR(pid);
- goto bad_fork_cleanup_io;
+ goto bad_fork_cleanup_thread;
}
}
- p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
/*
- * Clear TID on mm_release()?
+ * This has to happen after we've potentially unshared the file
+ * descriptor table (so that the pidfd doesn't leak into the child
+ * if the fd table isn't shared).
*/
- p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;
+ if (clone_flags & CLONE_PIDFD) {
+ int flags = (clone_flags & CLONE_THREAD) ? PIDFD_THREAD : 0;
+
+ /*
+ * Note that no task has been attached to @pid yet indicate
+ * that via CLONE_PIDFD.
+ */
+ retval = pidfd_prepare(pid, flags | PIDFD_STALE, &pidfile);
+ if (retval < 0)
+ goto bad_fork_free_pid;
+ pidfd = retval;
+
+ retval = put_user(pidfd, args->pidfd);
+ if (retval)
+ goto bad_fork_put_pidfd;
+ }
+
#ifdef CONFIG_BLOCK
p->plug = NULL;
#endif
-#ifdef CONFIG_FUTEX
- p->robust_list = NULL;
-#ifdef CONFIG_COMPAT
- p->compat_robust_list = NULL;
-#endif
- INIT_LIST_HEAD(&p->pi_state_list);
- p->pi_state_cache = NULL;
-#endif
+ futex_init_task(p);
+
/*
* sigaltstack should be cleared when sharing the same VM
*/
if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
- p->sas_ss_sp = p->sas_ss_size = 0;
+ sas_ss_reset(p);
/*
* Syscall tracing and stepping should be turned off in the
* child regardless of CLONE_PTRACE.
*/
user_disable_single_step(p);
- clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
-#ifdef TIF_SYSCALL_EMU
- clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
+ clear_task_syscall_work(p, SYSCALL_TRACE);
+#if defined(CONFIG_GENERIC_ENTRY) || defined(TIF_SYSCALL_EMU)
+ clear_task_syscall_work(p, SYSCALL_EMU);
#endif
- clear_all_latency_tracing(p);
+ clear_tsk_latency_tracing(p);
/* ok, now we should be set up.. */
p->pid = pid_nr(pid);
if (clone_flags & CLONE_THREAD) {
- p->exit_signal = -1;
p->group_leader = current->group_leader;
p->tgid = current->tgid;
} else {
- if (clone_flags & CLONE_PARENT)
- p->exit_signal = current->group_leader->exit_signal;
- else
- p->exit_signal = (clone_flags & CSIGNAL);
p->group_leader = p;
p->tgid = p->pid;
}
@@ -1512,8 +2301,64 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->dirty_paused_when = 0;
p->pdeath_signal = 0;
- INIT_LIST_HEAD(&p->thread_group);
p->task_works = NULL;
+ clear_posix_cputimers_work(p);
+
+#ifdef CONFIG_KRETPROBES
+ p->kretprobe_instances.first = NULL;
+#endif
+#ifdef CONFIG_RETHOOK
+ p->rethooks.first = NULL;
+#endif
+
+ /*
+ * Ensure that the cgroup subsystem policies allow the new process to be
+ * forked. It should be noted that the new process's css_set can be changed
+ * between here and cgroup_post_fork() if an organisation operation is in
+ * progress.
+ */
+ retval = cgroup_can_fork(p, args);
+ if (retval)
+ goto bad_fork_put_pidfd;
+
+ /*
+ * Now that the cgroups are pinned, re-clone the parent cgroup and put
+ * the new task on the correct runqueue. All this *before* the task
+ * becomes visible.
+ *
+ * This isn't part of ->can_fork() because while the re-cloning is
+ * cgroup specific, it unconditionally needs to place the task on a
+ * runqueue.
+ */
+ retval = sched_cgroup_fork(p, args);
+ if (retval)
+ goto bad_fork_cancel_cgroup;
+
+ /*
+ * Allocate a default futex hash for the user process once the first
+ * thread spawns.
+ */
+ if (need_futex_hash_allocate_default(clone_flags)) {
+ retval = futex_hash_allocate_default();
+ if (retval)
+ goto bad_fork_cancel_cgroup;
+ /*
+ * If we fail beyond this point we don't free the allocated
+ * futex hash map. We assume that another thread will be created
+ * and makes use of it. The hash map will be freed once the main
+ * thread terminates.
+ */
+ }
+ /*
+ * From this point on we must avoid any synchronous user-space
+ * communication until we take the tasklist-lock. In particular, we do
+ * not want user-space to be able to predict the process start-time by
+ * stalling fork(2) after we recorded the start_time but before it is
+ * visible to the system.
+ */
+
+ p->start_time = ktime_get_ns();
+ p->start_boottime = ktime_get_boottime_ns();
/*
* Make it visible to the rest of the system, but dont wake it up yet.
@@ -1525,40 +2370,53 @@ static struct task_struct *copy_process(unsigned long clone_flags,
if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
p->real_parent = current->real_parent;
p->parent_exec_id = current->parent_exec_id;
+ if (clone_flags & CLONE_THREAD)
+ p->exit_signal = -1;
+ else
+ p->exit_signal = current->group_leader->exit_signal;
} else {
p->real_parent = current;
p->parent_exec_id = current->self_exec_id;
+ p->exit_signal = args->exit_signal;
}
+ klp_copy_process(p);
+
+ sched_core_fork(p);
+
spin_lock(&current->sighand->siglock);
+ rv_task_fork(p);
+
+ rseq_fork(p, clone_flags);
+
+ /* Don't start children in a dying pid namespace */
+ if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) {
+ retval = -ENOMEM;
+ goto bad_fork_core_free;
+ }
+
+ /* Let kill terminate clone/fork in the middle */
+ if (fatal_signal_pending(current)) {
+ retval = -EINTR;
+ goto bad_fork_core_free;
+ }
+
+ /* No more failure paths after this point. */
+
/*
* Copy seccomp details explicitly here, in case they were changed
* before holding sighand lock.
*/
copy_seccomp(p);
- /*
- * Process group and session signals need to be delivered to just the
- * parent before the fork or both the parent and the child after the
- * fork. Restart if a signal comes in before we add the new process to
- * it's process group.
- * A fatal signal pending means that current will exit, so the new
- * thread can't slip out of an OOM kill (or normal SIGKILL).
- */
- recalc_sigpending();
- if (signal_pending(current)) {
- spin_unlock(&current->sighand->siglock);
- write_unlock_irq(&tasklist_lock);
- retval = -ERESTARTNOINTR;
- goto bad_fork_free_pid;
- }
-
+ init_task_pid_links(p);
if (likely(p->pid)) {
ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
init_task_pid(p, PIDTYPE_PID, pid);
if (thread_group_leader(p)) {
+ init_task_pid(p, PIDTYPE_TGID, pid);
init_task_pid(p, PIDTYPE_PGID, task_pgrp(current));
init_task_pid(p, PIDTYPE_SID, task_session(current));
@@ -1566,54 +2424,82 @@ static struct task_struct *copy_process(unsigned long clone_flags,
ns_of_pid(pid)->child_reaper = p;
p->signal->flags |= SIGNAL_UNKILLABLE;
}
-
- p->signal->leader_pid = pid;
+ p->signal->shared_pending.signal = delayed.signal;
p->signal->tty = tty_kref_get(current->signal->tty);
+ /*
+ * Inherit has_child_subreaper flag under the same
+ * tasklist_lock with adding child to the process tree
+ * for propagate_has_child_subreaper optimization.
+ */
+ p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper ||
+ p->real_parent->signal->is_child_subreaper;
list_add_tail(&p->sibling, &p->real_parent->children);
list_add_tail_rcu(&p->tasks, &init_task.tasks);
+ attach_pid(p, PIDTYPE_TGID);
attach_pid(p, PIDTYPE_PGID);
attach_pid(p, PIDTYPE_SID);
__this_cpu_inc(process_counts);
} else {
current->signal->nr_threads++;
+ current->signal->quick_threads++;
atomic_inc(&current->signal->live);
- atomic_inc(&current->signal->sigcnt);
- list_add_tail_rcu(&p->thread_group,
- &p->group_leader->thread_group);
+ refcount_inc(&current->signal->sigcnt);
+ task_join_group_stop(p);
list_add_tail_rcu(&p->thread_node,
&p->signal->thread_head);
}
attach_pid(p, PIDTYPE_PID);
nr_threads++;
}
-
total_forks++;
+ hlist_del_init(&delayed.node);
spin_unlock(&current->sighand->siglock);
syscall_tracepoint_update(p);
write_unlock_irq(&tasklist_lock);
+ if (pidfile)
+ fd_install(pidfd, pidfile);
+
proc_fork_connector(p);
- cgroup_post_fork(p);
- if (clone_flags & CLONE_THREAD)
- threadgroup_change_end(current);
+ sched_post_fork(p);
+ cgroup_post_fork(p, args);
perf_event_fork(p);
trace_task_newtask(p, clone_flags);
uprobe_copy_process(p, clone_flags);
+ user_events_fork(p, clone_flags);
+
+ copy_oom_score_adj(clone_flags, p);
return p;
+bad_fork_core_free:
+ sched_core_free(p);
+ spin_unlock(&current->sighand->siglock);
+ write_unlock_irq(&tasklist_lock);
+bad_fork_cancel_cgroup:
+ cgroup_cancel_fork(p, args);
+bad_fork_put_pidfd:
+ if (clone_flags & CLONE_PIDFD) {
+ fput(pidfile);
+ put_unused_fd(pidfd);
+ }
bad_fork_free_pid:
if (pid != &init_struct_pid)
free_pid(pid);
+bad_fork_cleanup_thread:
+ exit_thread(p);
bad_fork_cleanup_io:
if (p->io_context)
exit_io_context(p);
bad_fork_cleanup_namespaces:
- exit_task_namespaces(p);
+ exit_nsproxy_namespaces(p);
bad_fork_cleanup_mm:
- if (p->mm)
+ if (p->mm) {
+ sched_mm_cid_exit(p);
+ mm_clear_owner(p->mm, p);
mmput(p->mm);
+ }
bad_fork_cleanup_signal:
if (!(clone_flags & CLONE_THREAD))
free_signal_struct(p->signal);
@@ -1625,43 +2511,67 @@ bad_fork_cleanup_files:
exit_files(p); /* blocking */
bad_fork_cleanup_semundo:
exit_sem(p);
+bad_fork_cleanup_security:
+ security_task_free(p);
bad_fork_cleanup_audit:
audit_free(p);
bad_fork_cleanup_perf:
perf_event_free_task(p);
+bad_fork_sched_cancel_fork:
+ sched_cancel_fork(p);
bad_fork_cleanup_policy:
+ lockdep_free_task(p);
#ifdef CONFIG_NUMA
mpol_put(p->mempolicy);
-bad_fork_cleanup_threadgroup_lock:
#endif
- if (clone_flags & CLONE_THREAD)
- threadgroup_change_end(current);
+bad_fork_cleanup_delayacct:
delayacct_tsk_free(p);
bad_fork_cleanup_count:
- atomic_dec(&p->cred->user->processes);
+ dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
+ exit_cred_namespaces(p);
exit_creds(p);
bad_fork_free:
- free_task(p);
+ WRITE_ONCE(p->__state, TASK_DEAD);
+ exit_task_stack_account(p);
+ put_task_stack(p);
+ delayed_free_task(p);
fork_out:
+ spin_lock_irq(&current->sighand->siglock);
+ hlist_del_init(&delayed.node);
+ spin_unlock_irq(&current->sighand->siglock);
return ERR_PTR(retval);
}
-static inline void init_idle_pids(struct pid_link *links)
+static inline void init_idle_pids(struct task_struct *idle)
{
enum pid_type type;
for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) {
- INIT_HLIST_NODE(&links[type].node); /* not really needed */
- links[type].pid = &init_struct_pid;
+ INIT_HLIST_NODE(&idle->pid_links[type]); /* not really needed */
+ init_task_pid(idle, type, &init_struct_pid);
}
}
-struct task_struct *fork_idle(int cpu)
+static int idle_dummy(void *dummy)
+{
+ /* This function is never called */
+ return 0;
+}
+
+struct task_struct * __init fork_idle(int cpu)
{
struct task_struct *task;
- task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0);
+ struct kernel_clone_args args = {
+ .flags = CLONE_VM,
+ .fn = &idle_dummy,
+ .fn_arg = NULL,
+ .kthread = 1,
+ .idle = 1,
+ };
+
+ task = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args);
if (!IS_ERR(task)) {
- init_idle_pids(task->pids);
+ init_idle_pids(task);
init_idle(task, cpu);
}
@@ -1669,20 +2579,56 @@ struct task_struct *fork_idle(int cpu)
}
/*
+ * This is like kernel_clone(), but shaved down and tailored to just
+ * creating io_uring workers. It returns a created task, or an error pointer.
+ * The returned task is inactive, and the caller must fire it up through
+ * wake_up_new_task(p). All signals are blocked in the created task.
+ */
+struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
+{
+ unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|
+ CLONE_IO|CLONE_VM|CLONE_UNTRACED;
+ struct kernel_clone_args args = {
+ .flags = flags,
+ .fn = fn,
+ .fn_arg = arg,
+ .io_thread = 1,
+ .user_worker = 1,
+ };
+
+ return copy_process(NULL, 0, node, &args);
+}
+
+/*
* Ok, this is the main fork-routine.
*
* It copies the process, and if successful kick-starts
* it and waits for it to finish using the VM if required.
+ *
+ * args->exit_signal is expected to be checked for sanity by the caller.
*/
-long do_fork(unsigned long clone_flags,
- unsigned long stack_start,
- unsigned long stack_size,
- int __user *parent_tidptr,
- int __user *child_tidptr)
+pid_t kernel_clone(struct kernel_clone_args *args)
{
+ u64 clone_flags = args->flags;
+ struct completion vfork;
+ struct pid *pid;
struct task_struct *p;
int trace = 0;
- long nr;
+ pid_t nr;
+
+ /*
+ * For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument
+ * to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are
+ * mutually exclusive. With clone3() CLONE_PIDFD has grown a separate
+ * field in struct clone_args and it still doesn't make sense to have
+ * them both point at the same memory location. Performing this check
+ * here has the advantage that we don't need to have a separate helper
+ * to check for legacy clone().
+ */
+ if ((clone_flags & CLONE_PIDFD) &&
+ (clone_flags & CLONE_PARENT_SETTID) &&
+ (args->pidfd == args->parent_tid))
+ return -EINVAL;
/*
* Determine whether and which event to report to ptracer. When
@@ -1693,7 +2639,7 @@ long do_fork(unsigned long clone_flags,
if (!(clone_flags & CLONE_UNTRACED)) {
if (clone_flags & CLONE_VFORK)
trace = PTRACE_EVENT_VFORK;
- else if ((clone_flags & CSIGNAL) != SIGCHLD)
+ else if (args->exit_signal != SIGCHLD)
trace = PTRACE_EVENT_CLONE;
else
trace = PTRACE_EVENT_FORK;
@@ -1702,62 +2648,94 @@ long do_fork(unsigned long clone_flags,
trace = 0;
}
- p = copy_process(clone_flags, stack_start, stack_size,
- child_tidptr, NULL, trace);
+ p = copy_process(NULL, trace, NUMA_NO_NODE, args);
+ add_latent_entropy();
+
+ if (IS_ERR(p))
+ return PTR_ERR(p);
+
/*
* Do this prior waking up the new thread - the thread pointer
* might get invalid after that point, if the thread exits quickly.
*/
- if (!IS_ERR(p)) {
- struct completion vfork;
- struct pid *pid;
+ trace_sched_process_fork(current, p);
- trace_sched_process_fork(current, p);
+ pid = get_task_pid(p, PIDTYPE_PID);
+ nr = pid_vnr(pid);
- pid = get_task_pid(p, PIDTYPE_PID);
- nr = pid_vnr(pid);
+ if (clone_flags & CLONE_PARENT_SETTID)
+ put_user(nr, args->parent_tid);
- if (clone_flags & CLONE_PARENT_SETTID)
- put_user(nr, parent_tidptr);
-
- if (clone_flags & CLONE_VFORK) {
- p->vfork_done = &vfork;
- init_completion(&vfork);
- get_task_struct(p);
- }
+ if (clone_flags & CLONE_VFORK) {
+ p->vfork_done = &vfork;
+ init_completion(&vfork);
+ get_task_struct(p);
+ }
- wake_up_new_task(p);
+ if (IS_ENABLED(CONFIG_LRU_GEN_WALKS_MMU) && !(clone_flags & CLONE_VM)) {
+ /* lock the task to synchronize with memcg migration */
+ task_lock(p);
+ lru_gen_add_mm(p->mm);
+ task_unlock(p);
+ }
- /* forking complete and child started to run, tell ptracer */
- if (unlikely(trace))
- ptrace_event_pid(trace, pid);
+ wake_up_new_task(p);
- if (clone_flags & CLONE_VFORK) {
- if (!wait_for_vfork_done(p, &vfork))
- ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
- }
+ /* forking complete and child started to run, tell ptracer */
+ if (unlikely(trace))
+ ptrace_event_pid(trace, pid);
- put_pid(pid);
- } else {
- nr = PTR_ERR(p);
+ if (clone_flags & CLONE_VFORK) {
+ if (!wait_for_vfork_done(p, &vfork))
+ ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
}
+
+ put_pid(pid);
return nr;
}
/*
* Create a kernel thread.
*/
-pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
+pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name,
+ unsigned long flags)
{
- return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
- (unsigned long)arg, NULL, NULL);
+ struct kernel_clone_args args = {
+ .flags = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL),
+ .exit_signal = (flags & CSIGNAL),
+ .fn = fn,
+ .fn_arg = arg,
+ .name = name,
+ .kthread = 1,
+ };
+
+ return kernel_clone(&args);
+}
+
+/*
+ * Create a user mode thread.
+ */
+pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags)
+{
+ struct kernel_clone_args args = {
+ .flags = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL),
+ .exit_signal = (flags & CSIGNAL),
+ .fn = fn,
+ .fn_arg = arg,
+ };
+
+ return kernel_clone(&args);
}
#ifdef __ARCH_WANT_SYS_FORK
SYSCALL_DEFINE0(fork)
{
#ifdef CONFIG_MMU
- return do_fork(SIGCHLD, 0, 0, NULL, NULL);
+ struct kernel_clone_args args = {
+ .exit_signal = SIGCHLD,
+ };
+
+ return kernel_clone(&args);
#else
/* can not support in nommu mode */
return -EINVAL;
@@ -1768,8 +2746,12 @@ SYSCALL_DEFINE0(fork)
#ifdef __ARCH_WANT_SYS_VFORK
SYSCALL_DEFINE0(vfork)
{
- return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
- 0, NULL, NULL);
+ struct kernel_clone_args args = {
+ .flags = CLONE_VFORK | CLONE_VM,
+ .exit_signal = SIGCHLD,
+ };
+
+ return kernel_clone(&args);
}
#endif
@@ -1777,30 +2759,232 @@ SYSCALL_DEFINE0(vfork)
#ifdef CONFIG_CLONE_BACKWARDS
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
int __user *, parent_tidptr,
- int, tls_val,
+ unsigned long, tls,
int __user *, child_tidptr)
#elif defined(CONFIG_CLONE_BACKWARDS2)
SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
int __user *, parent_tidptr,
int __user *, child_tidptr,
- int, tls_val)
+ unsigned long, tls)
#elif defined(CONFIG_CLONE_BACKWARDS3)
SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp,
int, stack_size,
int __user *, parent_tidptr,
int __user *, child_tidptr,
- int, tls_val)
+ unsigned long, tls)
#else
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
int __user *, parent_tidptr,
int __user *, child_tidptr,
- int, tls_val)
+ unsigned long, tls)
#endif
{
- return do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr);
+ struct kernel_clone_args args = {
+ .flags = (lower_32_bits(clone_flags) & ~CSIGNAL),
+ .pidfd = parent_tidptr,
+ .child_tid = child_tidptr,
+ .parent_tid = parent_tidptr,
+ .exit_signal = (lower_32_bits(clone_flags) & CSIGNAL),
+ .stack = newsp,
+ .tls = tls,
+ };
+
+ return kernel_clone(&args);
}
#endif
+static noinline int copy_clone_args_from_user(struct kernel_clone_args *kargs,
+ struct clone_args __user *uargs,
+ size_t usize)
+{
+ int err;
+ struct clone_args args;
+ pid_t *kset_tid = kargs->set_tid;
+
+ BUILD_BUG_ON(offsetofend(struct clone_args, tls) !=
+ CLONE_ARGS_SIZE_VER0);
+ BUILD_BUG_ON(offsetofend(struct clone_args, set_tid_size) !=
+ CLONE_ARGS_SIZE_VER1);
+ BUILD_BUG_ON(offsetofend(struct clone_args, cgroup) !=
+ CLONE_ARGS_SIZE_VER2);
+ BUILD_BUG_ON(sizeof(struct clone_args) != CLONE_ARGS_SIZE_VER2);
+
+ if (unlikely(usize > PAGE_SIZE))
+ return -E2BIG;
+ if (unlikely(usize < CLONE_ARGS_SIZE_VER0))
+ return -EINVAL;
+
+ err = copy_struct_from_user(&args, sizeof(args), uargs, usize);
+ if (err)
+ return err;
+
+ if (unlikely(args.set_tid_size > MAX_PID_NS_LEVEL))
+ return -EINVAL;
+
+ if (unlikely(!args.set_tid && args.set_tid_size > 0))
+ return -EINVAL;
+
+ if (unlikely(args.set_tid && args.set_tid_size == 0))
+ return -EINVAL;
+
+ /*
+ * Verify that higher 32bits of exit_signal are unset and that
+ * it is a valid signal
+ */
+ if (unlikely((args.exit_signal & ~((u64)CSIGNAL)) ||
+ !valid_signal(args.exit_signal)))
+ return -EINVAL;
+
+ if ((args.flags & CLONE_INTO_CGROUP) &&
+ (args.cgroup > INT_MAX || usize < CLONE_ARGS_SIZE_VER2))
+ return -EINVAL;
+
+ *kargs = (struct kernel_clone_args){
+ .flags = args.flags,
+ .pidfd = u64_to_user_ptr(args.pidfd),
+ .child_tid = u64_to_user_ptr(args.child_tid),
+ .parent_tid = u64_to_user_ptr(args.parent_tid),
+ .exit_signal = args.exit_signal,
+ .stack = args.stack,
+ .stack_size = args.stack_size,
+ .tls = args.tls,
+ .set_tid_size = args.set_tid_size,
+ .cgroup = args.cgroup,
+ };
+
+ if (args.set_tid &&
+ copy_from_user(kset_tid, u64_to_user_ptr(args.set_tid),
+ (kargs->set_tid_size * sizeof(pid_t))))
+ return -EFAULT;
+
+ kargs->set_tid = kset_tid;
+
+ return 0;
+}
+
+/**
+ * clone3_stack_valid - check and prepare stack
+ * @kargs: kernel clone args
+ *
+ * Verify that the stack arguments userspace gave us are sane.
+ * In addition, set the stack direction for userspace since it's easy for us to
+ * determine.
+ */
+static inline bool clone3_stack_valid(struct kernel_clone_args *kargs)
+{
+ if (kargs->stack == 0) {
+ if (kargs->stack_size > 0)
+ return false;
+ } else {
+ if (kargs->stack_size == 0)
+ return false;
+
+ if (!access_ok((void __user *)kargs->stack, kargs->stack_size))
+ return false;
+
+#if !defined(CONFIG_STACK_GROWSUP)
+ kargs->stack += kargs->stack_size;
+#endif
+ }
+
+ return true;
+}
+
+static bool clone3_args_valid(struct kernel_clone_args *kargs)
+{
+ /* Verify that no unknown flags are passed along. */
+ if (kargs->flags &
+ ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP))
+ return false;
+
+ /*
+ * - make the CLONE_DETACHED bit reusable for clone3
+ * - make the CSIGNAL bits reusable for clone3
+ */
+ if (kargs->flags & (CLONE_DETACHED | (CSIGNAL & (~CLONE_NEWTIME))))
+ return false;
+
+ if ((kargs->flags & (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND)) ==
+ (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND))
+ return false;
+
+ if ((kargs->flags & (CLONE_THREAD | CLONE_PARENT)) &&
+ kargs->exit_signal)
+ return false;
+
+ if (!clone3_stack_valid(kargs))
+ return false;
+
+ return true;
+}
+
+/**
+ * sys_clone3 - create a new process with specific properties
+ * @uargs: argument structure
+ * @size: size of @uargs
+ *
+ * clone3() is the extensible successor to clone()/clone2().
+ * It takes a struct as argument that is versioned by its size.
+ *
+ * Return: On success, a positive PID for the child process.
+ * On error, a negative errno number.
+ */
+SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size)
+{
+ int err;
+
+ struct kernel_clone_args kargs;
+ pid_t set_tid[MAX_PID_NS_LEVEL];
+
+#ifdef __ARCH_BROKEN_SYS_CLONE3
+#warning clone3() entry point is missing, please fix
+ return -ENOSYS;
+#endif
+
+ kargs.set_tid = set_tid;
+
+ err = copy_clone_args_from_user(&kargs, uargs, size);
+ if (err)
+ return err;
+
+ if (!clone3_args_valid(&kargs))
+ return -EINVAL;
+
+ return kernel_clone(&kargs);
+}
+
+void walk_process_tree(struct task_struct *top, proc_visitor visitor, void *data)
+{
+ struct task_struct *leader, *parent, *child;
+ int res;
+
+ read_lock(&tasklist_lock);
+ leader = top = top->group_leader;
+down:
+ for_each_thread(leader, parent) {
+ list_for_each_entry(child, &parent->children, sibling) {
+ res = visitor(child, data);
+ if (res) {
+ if (res < 0)
+ goto out;
+ leader = child;
+ goto down;
+ }
+up:
+ ;
+ }
+ }
+
+ if (leader != top) {
+ child = leader;
+ parent = child->real_parent;
+ leader = parent->group_leader;
+ goto up;
+ }
+out:
+ read_unlock(&tasklist_lock);
+}
+
#ifndef ARCH_MIN_MMSTRUCT_ALIGN
#define ARCH_MIN_MMSTRUCT_ALIGN 0
#endif
@@ -1813,32 +2997,43 @@ static void sighand_ctor(void *data)
init_waitqueue_head(&sighand->signalfd_wqh);
}
+void __init mm_cache_init(void)
+{
+ unsigned int mm_size;
+
+ /*
+ * The mm_cpumask is located at the end of mm_struct, and is
+ * dynamically sized based on the maximum CPU number this system
+ * can have, taking hotplug into account (nr_cpu_ids).
+ */
+ mm_size = sizeof(struct mm_struct) + cpumask_size() + mm_cid_size();
+
+ mm_cachep = kmem_cache_create_usercopy("mm_struct",
+ mm_size, ARCH_MIN_MMSTRUCT_ALIGN,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
+ offsetof(struct mm_struct, saved_auxv),
+ sizeof_field(struct mm_struct, saved_auxv),
+ NULL);
+}
+
void __init proc_caches_init(void)
{
sighand_cachep = kmem_cache_create("sighand_cache",
sizeof(struct sighand_struct), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU|
- SLAB_NOTRACK, sighand_ctor);
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
+ SLAB_ACCOUNT, sighand_ctor);
signal_cachep = kmem_cache_create("signal_cache",
sizeof(struct signal_struct), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
+ NULL);
files_cachep = kmem_cache_create("files_cache",
sizeof(struct files_struct), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
+ NULL);
fs_cachep = kmem_cache_create("fs_cache",
sizeof(struct fs_struct), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
- /*
- * FIXME! The "sizeof(struct mm_struct)" currently includes the
- * whole struct cpumask for the OFFSTACK case. We could change
- * this to *only* allocate as much of it as required by the
- * maximum number of CPU's we can ever have. The cpumask_allocation
- * is at the end of the structure, exactly for that reason.
- */
- mm_cachep = kmem_cache_create("mm_struct",
- sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
- vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC);
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
+ NULL);
mmap_init();
nsproxy_cache_init();
}
@@ -1851,16 +3046,25 @@ static int check_unshare_flags(unsigned long unshare_flags)
if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
- CLONE_NEWUSER|CLONE_NEWPID))
+ CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP|
+ CLONE_NEWTIME))
return -EINVAL;
/*
- * Not implemented, but pretend it works if there is nothing to
- * unshare. Note that unsharing CLONE_THREAD or CLONE_SIGHAND
- * needs to unshare vm.
+ * Not implemented, but pretend it works if there is nothing
+ * to unshare. Note that unsharing the address space or the
+ * signal handlers also need to unshare the signal queues (aka
+ * CLONE_THREAD).
*/
if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) {
- /* FIXME: get_task_mm() increments ->mm_users */
- if (atomic_read(&current->mm->mm_users) > 1)
+ if (!thread_group_empty(current))
+ return -EINVAL;
+ }
+ if (unshare_flags & (CLONE_SIGHAND | CLONE_VM)) {
+ if (refcount_read(&current->sighand->count) > 1)
+ return -EINVAL;
+ }
+ if (unshare_flags & CLONE_VM) {
+ if (!current_is_single_threaded())
return -EINVAL;
}
@@ -1894,13 +3098,13 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
{
struct files_struct *fd = current->files;
- int error = 0;
if ((unshare_flags & CLONE_FILES) &&
(fd && atomic_read(&fd->count) > 1)) {
- *new_fdp = dup_fd(fd, &error);
- if (!*new_fdp)
- return error;
+ fd = dup_fd(fd, NULL);
+ if (IS_ERR(fd))
+ return PTR_ERR(fd);
+ *new_fdp = fd;
}
return 0;
@@ -1909,36 +3113,37 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp
/*
* unshare allows a process to 'unshare' part of the process
* context which was originally shared using clone. copy_*
- * functions used by do_fork() cannot be used here directly
+ * functions used by kernel_clone() cannot be used here directly
* because they modify an inactive task_struct that is being
* constructed. Here we are modifying the current, active,
* task_struct.
*/
-SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
+int ksys_unshare(unsigned long unshare_flags)
{
struct fs_struct *fs, *new_fs = NULL;
- struct files_struct *fd, *new_fd = NULL;
+ struct files_struct *new_fd = NULL;
struct cred *new_cred = NULL;
struct nsproxy *new_nsproxy = NULL;
int do_sysvsem = 0;
int err;
/*
- * If unsharing a user namespace must also unshare the thread.
+ * If unsharing a user namespace must also unshare the thread group
+ * and unshare the filesystem root and working directories.
*/
if (unshare_flags & CLONE_NEWUSER)
unshare_flags |= CLONE_THREAD | CLONE_FS;
/*
- * If unsharing a thread from a thread group, must also unshare vm.
- */
- if (unshare_flags & CLONE_THREAD)
- unshare_flags |= CLONE_VM;
- /*
* If unsharing vm, must also unshare signal handlers.
*/
if (unshare_flags & CLONE_VM)
unshare_flags |= CLONE_SIGHAND;
/*
+ * If unsharing a signal handlers, must also unshare the signal queues.
+ */
+ if (unshare_flags & CLONE_SIGHAND)
+ unshare_flags |= CLONE_THREAD;
+ /*
* If unsharing namespace, must also unshare filesystem information.
*/
if (unshare_flags & CLONE_NEWNS)
@@ -1968,6 +3173,12 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
if (err)
goto bad_unshare_cleanup_cred;
+ if (new_cred) {
+ err = set_cred_ucounts(new_cred);
+ if (err)
+ goto bad_unshare_cleanup_cred;
+ }
+
if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) {
if (do_sysvsem) {
/*
@@ -1988,20 +3199,17 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
if (new_fs) {
fs = current->fs;
- spin_lock(&fs->lock);
+ read_seqlock_excl(&fs->seq);
current->fs = new_fs;
if (--fs->users)
new_fs = NULL;
else
new_fs = fs;
- spin_unlock(&fs->lock);
+ read_sequnlock_excl(&fs->seq);
}
- if (new_fd) {
- fd = current->files;
- current->files = new_fd;
- new_fd = fd;
- }
+ if (new_fd)
+ swap(current->files, new_fd);
task_unlock(current);
@@ -2012,6 +3220,8 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
}
}
+ perf_event_namespaces(current);
+
bad_unshare_cleanup_cred:
if (new_cred)
put_cred(new_cred);
@@ -2027,37 +3237,42 @@ bad_unshare_out:
return err;
}
+SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
+{
+ return ksys_unshare(unshare_flags);
+}
+
/*
* Helper to unshare the files of the current task.
* We don't want to expose copy_files internals to
* the exec layer of the kernel.
*/
-int unshare_files(struct files_struct **displaced)
+int unshare_files(void)
{
struct task_struct *task = current;
- struct files_struct *copy = NULL;
+ struct files_struct *old, *copy = NULL;
int error;
error = unshare_fd(CLONE_FILES, &copy);
- if (error || !copy) {
- *displaced = NULL;
+ if (error || !copy)
return error;
- }
- *displaced = task->files;
+
+ old = task->files;
task_lock(task);
task->files = copy;
task_unlock(task);
+ put_files_struct(old);
return 0;
}
-int sysctl_max_threads(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+static int sysctl_max_threads(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
int ret;
int threads = max_threads;
- int min = MIN_THREADS;
+ int min = 1;
int max = MAX_THREADS;
t = *table;
@@ -2069,7 +3284,25 @@ int sysctl_max_threads(struct ctl_table *table, int write,
if (ret || !write)
return ret;
- set_max_threads(threads);
+ max_threads = threads;
return 0;
}
+
+static const struct ctl_table fork_sysctl_table[] = {
+ {
+ .procname = "threads-max",
+ .data = NULL,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = sysctl_max_threads,
+ },
+};
+
+static int __init init_fork_sysctl(void)
+{
+ register_sysctl_init("kernel", fork_sysctl_table);
+ return 0;
+}
+
+subsys_initcall(init_fork_sysctl);
diff --git a/kernel/freezer.c b/kernel/freezer.c
index a8900a3bc27a..a76bf957fb32 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kernel/freezer.c - Function to freeze a process
*
@@ -9,21 +10,19 @@
#include <linux/export.h>
#include <linux/syscalls.h>
#include <linux/freezer.h>
+#include <linux/oom.h>
#include <linux/kthread.h>
/* total number of freezing conditions in effect */
-atomic_t system_freezing_cnt = ATOMIC_INIT(0);
-EXPORT_SYMBOL(system_freezing_cnt);
-
-/* indicate whether PM freezing is in effect, protected by pm_mutex */
-bool pm_freezing;
-bool pm_nosig_freezing;
+DEFINE_STATIC_KEY_FALSE(freezer_active);
+EXPORT_SYMBOL(freezer_active);
/*
- * Temporary export for the deadlock workaround in ata_scsi_hotplug().
- * Remove once the hack becomes unnecessary.
+ * indicate whether PM freezing is in effect, protected by
+ * system_transition_mutex
*/
-EXPORT_SYMBOL_GPL(pm_freezing);
+bool pm_freezing;
+bool pm_nosig_freezing;
/* protects freezing and frozen transitions */
static DEFINE_SPINLOCK(freezer_lock);
@@ -32,7 +31,7 @@ static DEFINE_SPINLOCK(freezer_lock);
* freezing_slow_path - slow path for testing whether a task needs to be frozen
* @p: task to be tested
*
- * This function is called by freezing() if system_freezing_cnt isn't zero
+ * This function is called by freezing() if freezer_active isn't zero
* and tests whether @p needs to enter and stay in frozen state. Can be
* called under any context. The freezers are responsible for ensuring the
* target tasks see the updated state.
@@ -42,10 +41,10 @@ bool freezing_slow_path(struct task_struct *p)
if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK))
return false;
- if (test_thread_flag(TIF_MEMDIE))
+ if (tsk_is_oom_victim(p))
return false;
- if (pm_nosig_freezing || cgroup_freezing(p))
+ if (pm_nosig_freezing || cgroup1_freezing(p))
return true;
if (pm_freezing && !(p->flags & PF_KTHREAD))
@@ -55,41 +54,44 @@ bool freezing_slow_path(struct task_struct *p)
}
EXPORT_SYMBOL(freezing_slow_path);
+bool frozen(struct task_struct *p)
+{
+ return READ_ONCE(p->__state) & TASK_FROZEN;
+}
+
/* Refrigerator is place where frozen processes are stored :-). */
bool __refrigerator(bool check_kthr_stop)
{
- /* Hmm, should we be allowed to suspend when there are realtime
- processes around? */
+ unsigned int state = get_current_state();
bool was_frozen = false;
- long save = current->state;
pr_debug("%s entered refrigerator\n", current->comm);
+ WARN_ON_ONCE(state && !(state & TASK_NORMAL));
+
for (;;) {
- set_current_state(TASK_UNINTERRUPTIBLE);
+ bool freeze;
+
+ raw_spin_lock_irq(&current->pi_lock);
+ WRITE_ONCE(current->__state, TASK_FROZEN);
+ /* unstale saved_state so that __thaw_task() will wake us up */
+ current->saved_state = TASK_RUNNING;
+ raw_spin_unlock_irq(&current->pi_lock);
spin_lock_irq(&freezer_lock);
- current->flags |= PF_FROZEN;
- if (!freezing(current) ||
- (check_kthr_stop && kthread_should_stop()))
- current->flags &= ~PF_FROZEN;
+ freeze = freezing(current) && !(check_kthr_stop && kthread_should_stop());
spin_unlock_irq(&freezer_lock);
- if (!(current->flags & PF_FROZEN))
+ if (!freeze)
break;
+
was_frozen = true;
schedule();
}
+ __set_current_state(TASK_RUNNING);
pr_debug("%s left refrigerator\n", current->comm);
- /*
- * Restore saved task state before returning. The mb'd version
- * needs to be used; otherwise, it might silently break
- * synchronization which depends on ordered task state change.
- */
- set_current_state(save);
-
return was_frozen;
}
EXPORT_SYMBOL(__refrigerator);
@@ -104,6 +106,50 @@ static void fake_signal_wake_up(struct task_struct *p)
}
}
+static int __set_task_frozen(struct task_struct *p, void *arg)
+{
+ unsigned int state = READ_ONCE(p->__state);
+
+ /*
+ * Allow freezing the sched_delayed tasks; they will not execute until
+ * ttwu() fixes them up, so it is safe to swap their state now, instead
+ * of waiting for them to get fully dequeued.
+ */
+ if (task_is_runnable(p))
+ return 0;
+
+ if (p != current && task_curr(p))
+ return 0;
+
+ if (!(state & (TASK_FREEZABLE | __TASK_STOPPED | __TASK_TRACED)))
+ return 0;
+
+ /*
+ * Only TASK_NORMAL can be augmented with TASK_FREEZABLE, since they
+ * can suffer spurious wakeups.
+ */
+ if (state & TASK_FREEZABLE)
+ WARN_ON_ONCE(!(state & TASK_NORMAL));
+
+#ifdef CONFIG_LOCKDEP
+ /*
+ * It's dangerous to freeze with locks held; there be dragons there.
+ */
+ if (!(state & __TASK_FREEZABLE_UNSAFE))
+ WARN_ON_ONCE(debug_locks && p->lockdep_depth);
+#endif
+
+ p->saved_state = p->__state;
+ WRITE_ONCE(p->__state, TASK_FROZEN);
+ return TASK_FROZEN;
+}
+
+static bool __freeze_task(struct task_struct *p)
+{
+ /* TASK_FREEZABLE|TASK_STOPPED|TASK_TRACED -> TASK_FROZEN */
+ return task_call_func(p, __set_task_frozen, NULL);
+}
+
/**
* freeze_task - send a freeze request to given task
* @p: task to send the request to
@@ -119,20 +165,8 @@ bool freeze_task(struct task_struct *p)
{
unsigned long flags;
- /*
- * This check can race with freezer_do_not_count, but worst case that
- * will result in an extra wakeup being sent to the task. It does not
- * race with freezer_count(), the barriers in freezer_count() and
- * freezer_should_skip() ensure that either freezer_count() sees
- * freezing == true in try_to_freeze() and freezes, or
- * freezer_should_skip() sees !PF_FREEZE_SKIP and freezes the task
- * normally.
- */
- if (freezer_should_skip(p))
- return false;
-
spin_lock_irqsave(&freezer_lock, flags);
- if (!freezing(p) || frozen(p)) {
+ if (!freezing(p) || frozen(p) || __freeze_task(p)) {
spin_unlock_irqrestore(&freezer_lock, flags);
return false;
}
@@ -140,20 +174,54 @@ bool freeze_task(struct task_struct *p)
if (!(p->flags & PF_KTHREAD))
fake_signal_wake_up(p);
else
- wake_up_state(p, TASK_INTERRUPTIBLE);
+ wake_up_state(p, TASK_NORMAL);
spin_unlock_irqrestore(&freezer_lock, flags);
return true;
}
+/*
+ * Restore the saved_state before the task entered freezer. For typical task
+ * in the __refrigerator(), saved_state == TASK_RUNNING so nothing happens
+ * here. For tasks which were TASK_NORMAL | TASK_FREEZABLE, their initial state
+ * is restored unless they got an expected wakeup (see ttwu_state_match()).
+ * Returns 1 if the task state was restored.
+ */
+static int __restore_freezer_state(struct task_struct *p, void *arg)
+{
+ unsigned int state = p->saved_state;
+
+ if (state != TASK_RUNNING) {
+ WRITE_ONCE(p->__state, state);
+ p->saved_state = TASK_RUNNING;
+ return 1;
+ }
+
+ return 0;
+}
+
void __thaw_task(struct task_struct *p)
{
- unsigned long flags;
+ guard(spinlock_irqsave)(&freezer_lock);
+ if (frozen(p) && !task_call_func(p, __restore_freezer_state, NULL))
+ wake_up_state(p, TASK_FROZEN);
+}
- spin_lock_irqsave(&freezer_lock, flags);
- if (frozen(p))
- wake_up_process(p);
- spin_unlock_irqrestore(&freezer_lock, flags);
+/*
+ * thaw_process - Thaw a frozen process
+ * @p: the process to be thawed
+ *
+ * Iterate over all threads of @p and call __thaw_task() on each.
+ */
+void thaw_process(struct task_struct *p)
+{
+ struct task_struct *t;
+
+ rcu_read_lock();
+ for_each_thread(p, t) {
+ __thaw_task(t);
+ }
+ rcu_read_unlock();
}
/**
diff --git a/kernel/futex.c b/kernel/futex.c
deleted file mode 100644
index 2579e407ff67..000000000000
--- a/kernel/futex.c
+++ /dev/null
@@ -1,3053 +0,0 @@
-/*
- * Fast Userspace Mutexes (which I call "Futexes!").
- * (C) Rusty Russell, IBM 2002
- *
- * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
- * (C) Copyright 2003 Red Hat Inc, All Rights Reserved
- *
- * Removed page pinning, fix privately mapped COW pages and other cleanups
- * (C) Copyright 2003, 2004 Jamie Lokier
- *
- * Robust futex support started by Ingo Molnar
- * (C) Copyright 2006 Red Hat Inc, All Rights Reserved
- * Thanks to Thomas Gleixner for suggestions, analysis and fixes.
- *
- * PI-futex support started by Ingo Molnar and Thomas Gleixner
- * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
- * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
- *
- * PRIVATE futexes by Eric Dumazet
- * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
- *
- * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
- * Copyright (C) IBM Corporation, 2009
- * Thanks to Thomas Gleixner for conceptual design and careful reviews.
- *
- * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
- * enough at me, Linus for the original (flawed) idea, Matthew
- * Kirkwood for proof-of-concept implementation.
- *
- * "The futexes are also cursed."
- * "But they come in a choice of three flavours!"
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-#include <linux/slab.h>
-#include <linux/poll.h>
-#include <linux/fs.h>
-#include <linux/file.h>
-#include <linux/jhash.h>
-#include <linux/init.h>
-#include <linux/futex.h>
-#include <linux/mount.h>
-#include <linux/pagemap.h>
-#include <linux/syscalls.h>
-#include <linux/signal.h>
-#include <linux/export.h>
-#include <linux/magic.h>
-#include <linux/pid.h>
-#include <linux/nsproxy.h>
-#include <linux/ptrace.h>
-#include <linux/sched/rt.h>
-#include <linux/hugetlb.h>
-#include <linux/freezer.h>
-#include <linux/bootmem.h>
-
-#include <asm/futex.h>
-
-#include "locking/rtmutex_common.h"
-
-/*
- * READ this before attempting to hack on futexes!
- *
- * Basic futex operation and ordering guarantees
- * =============================================
- *
- * The waiter reads the futex value in user space and calls
- * futex_wait(). This function computes the hash bucket and acquires
- * the hash bucket lock. After that it reads the futex user space value
- * again and verifies that the data has not changed. If it has not changed
- * it enqueues itself into the hash bucket, releases the hash bucket lock
- * and schedules.
- *
- * The waker side modifies the user space value of the futex and calls
- * futex_wake(). This function computes the hash bucket and acquires the
- * hash bucket lock. Then it looks for waiters on that futex in the hash
- * bucket and wakes them.
- *
- * In futex wake up scenarios where no tasks are blocked on a futex, taking
- * the hb spinlock can be avoided and simply return. In order for this
- * optimization to work, ordering guarantees must exist so that the waiter
- * being added to the list is acknowledged when the list is concurrently being
- * checked by the waker, avoiding scenarios like the following:
- *
- * CPU 0 CPU 1
- * val = *futex;
- * sys_futex(WAIT, futex, val);
- * futex_wait(futex, val);
- * uval = *futex;
- * *futex = newval;
- * sys_futex(WAKE, futex);
- * futex_wake(futex);
- * if (queue_empty())
- * return;
- * if (uval == val)
- * lock(hash_bucket(futex));
- * queue();
- * unlock(hash_bucket(futex));
- * schedule();
- *
- * This would cause the waiter on CPU 0 to wait forever because it
- * missed the transition of the user space value from val to newval
- * and the waker did not find the waiter in the hash bucket queue.
- *
- * The correct serialization ensures that a waiter either observes
- * the changed user space value before blocking or is woken by a
- * concurrent waker:
- *
- * CPU 0 CPU 1
- * val = *futex;
- * sys_futex(WAIT, futex, val);
- * futex_wait(futex, val);
- *
- * waiters++; (a)
- * mb(); (A) <-- paired with -.
- * |
- * lock(hash_bucket(futex)); |
- * |
- * uval = *futex; |
- * | *futex = newval;
- * | sys_futex(WAKE, futex);
- * | futex_wake(futex);
- * |
- * `-------> mb(); (B)
- * if (uval == val)
- * queue();
- * unlock(hash_bucket(futex));
- * schedule(); if (waiters)
- * lock(hash_bucket(futex));
- * else wake_waiters(futex);
- * waiters--; (b) unlock(hash_bucket(futex));
- *
- * Where (A) orders the waiters increment and the futex value read through
- * atomic operations (see hb_waiters_inc) and where (B) orders the write
- * to futex and the waiters read -- this is done by the barriers for both
- * shared and private futexes in get_futex_key_refs().
- *
- * This yields the following case (where X:=waiters, Y:=futex):
- *
- * X = Y = 0
- *
- * w[X]=1 w[Y]=1
- * MB MB
- * r[Y]=y r[X]=x
- *
- * Which guarantees that x==0 && y==0 is impossible; which translates back into
- * the guarantee that we cannot both miss the futex variable change and the
- * enqueue.
- *
- * Note that a new waiter is accounted for in (a) even when it is possible that
- * the wait call can return error, in which case we backtrack from it in (b).
- * Refer to the comment in queue_lock().
- *
- * Similarly, in order to account for waiters being requeued on another
- * address we always increment the waiters for the destination bucket before
- * acquiring the lock. It then decrements them again after releasing it -
- * the code that actually moves the futex(es) between hash buckets (requeue_futex)
- * will do the additional required waiter count housekeeping. This is done for
- * double_lock_hb() and double_unlock_hb(), respectively.
- */
-
-#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
-int __read_mostly futex_cmpxchg_enabled;
-#endif
-
-/*
- * Futex flags used to encode options to functions and preserve them across
- * restarts.
- */
-#define FLAGS_SHARED 0x01
-#define FLAGS_CLOCKRT 0x02
-#define FLAGS_HAS_TIMEOUT 0x04
-
-/*
- * Priority Inheritance state:
- */
-struct futex_pi_state {
- /*
- * list of 'owned' pi_state instances - these have to be
- * cleaned up in do_exit() if the task exits prematurely:
- */
- struct list_head list;
-
- /*
- * The PI object:
- */
- struct rt_mutex pi_mutex;
-
- struct task_struct *owner;
- atomic_t refcount;
-
- union futex_key key;
-};
-
-/**
- * struct futex_q - The hashed futex queue entry, one per waiting task
- * @list: priority-sorted list of tasks waiting on this futex
- * @task: the task waiting on the futex
- * @lock_ptr: the hash bucket lock
- * @key: the key the futex is hashed on
- * @pi_state: optional priority inheritance state
- * @rt_waiter: rt_waiter storage for use with requeue_pi
- * @requeue_pi_key: the requeue_pi target futex key
- * @bitset: bitset for the optional bitmasked wakeup
- *
- * We use this hashed waitqueue, instead of a normal wait_queue_t, so
- * we can wake only the relevant ones (hashed queues may be shared).
- *
- * A futex_q has a woken state, just like tasks have TASK_RUNNING.
- * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
- * The order of wakeup is always to make the first condition true, then
- * the second.
- *
- * PI futexes are typically woken before they are removed from the hash list via
- * the rt_mutex code. See unqueue_me_pi().
- */
-struct futex_q {
- struct plist_node list;
-
- struct task_struct *task;
- spinlock_t *lock_ptr;
- union futex_key key;
- struct futex_pi_state *pi_state;
- struct rt_mutex_waiter *rt_waiter;
- union futex_key *requeue_pi_key;
- u32 bitset;
-};
-
-static const struct futex_q futex_q_init = {
- /* list gets initialized in queue_me()*/
- .key = FUTEX_KEY_INIT,
- .bitset = FUTEX_BITSET_MATCH_ANY
-};
-
-/*
- * Hash buckets are shared by all the futex_keys that hash to the same
- * location. Each key may have multiple futex_q structures, one for each task
- * waiting on a futex.
- */
-struct futex_hash_bucket {
- atomic_t waiters;
- spinlock_t lock;
- struct plist_head chain;
-} ____cacheline_aligned_in_smp;
-
-static unsigned long __read_mostly futex_hashsize;
-
-static struct futex_hash_bucket *futex_queues;
-
-static inline void futex_get_mm(union futex_key *key)
-{
- atomic_inc(&key->private.mm->mm_count);
- /*
- * Ensure futex_get_mm() implies a full barrier such that
- * get_futex_key() implies a full barrier. This is relied upon
- * as full barrier (B), see the ordering comment above.
- */
- smp_mb__after_atomic();
-}
-
-/*
- * Reflects a new waiter being added to the waitqueue.
- */
-static inline void hb_waiters_inc(struct futex_hash_bucket *hb)
-{
-#ifdef CONFIG_SMP
- atomic_inc(&hb->waiters);
- /*
- * Full barrier (A), see the ordering comment above.
- */
- smp_mb__after_atomic();
-#endif
-}
-
-/*
- * Reflects a waiter being removed from the waitqueue by wakeup
- * paths.
- */
-static inline void hb_waiters_dec(struct futex_hash_bucket *hb)
-{
-#ifdef CONFIG_SMP
- atomic_dec(&hb->waiters);
-#endif
-}
-
-static inline int hb_waiters_pending(struct futex_hash_bucket *hb)
-{
-#ifdef CONFIG_SMP
- return atomic_read(&hb->waiters);
-#else
- return 1;
-#endif
-}
-
-/*
- * We hash on the keys returned from get_futex_key (see below).
- */
-static struct futex_hash_bucket *hash_futex(union futex_key *key)
-{
- u32 hash = jhash2((u32*)&key->both.word,
- (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
- key->both.offset);
- return &futex_queues[hash & (futex_hashsize - 1)];
-}
-
-/*
- * Return 1 if two futex_keys are equal, 0 otherwise.
- */
-static inline int match_futex(union futex_key *key1, union futex_key *key2)
-{
- return (key1 && key2
- && key1->both.word == key2->both.word
- && key1->both.ptr == key2->both.ptr
- && key1->both.offset == key2->both.offset);
-}
-
-/*
- * Take a reference to the resource addressed by a key.
- * Can be called while holding spinlocks.
- *
- */
-static void get_futex_key_refs(union futex_key *key)
-{
- if (!key->both.ptr)
- return;
-
- switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
- case FUT_OFF_INODE:
- ihold(key->shared.inode); /* implies MB (B) */
- break;
- case FUT_OFF_MMSHARED:
- futex_get_mm(key); /* implies MB (B) */
- break;
- default:
- /*
- * Private futexes do not hold reference on an inode or
- * mm, therefore the only purpose of calling get_futex_key_refs
- * is because we need the barrier for the lockless waiter check.
- */
- smp_mb(); /* explicit MB (B) */
- }
-}
-
-/*
- * Drop a reference to the resource addressed by a key.
- * The hash bucket spinlock must not be held. This is
- * a no-op for private futexes, see comment in the get
- * counterpart.
- */
-static void drop_futex_key_refs(union futex_key *key)
-{
- if (!key->both.ptr) {
- /* If we're here then we tried to put a key we failed to get */
- WARN_ON_ONCE(1);
- return;
- }
-
- switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
- case FUT_OFF_INODE:
- iput(key->shared.inode);
- break;
- case FUT_OFF_MMSHARED:
- mmdrop(key->private.mm);
- break;
- }
-}
-
-/**
- * get_futex_key() - Get parameters which are the keys for a futex
- * @uaddr: virtual address of the futex
- * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
- * @key: address where result is stored.
- * @rw: mapping needs to be read/write (values: VERIFY_READ,
- * VERIFY_WRITE)
- *
- * Return: a negative error code or 0
- *
- * The key words are stored in *key on success.
- *
- * For shared mappings, it's (page->index, file_inode(vma->vm_file),
- * offset_within_page). For private mappings, it's (uaddr, current->mm).
- * We can usually work out the index without swapping in the page.
- *
- * lock_page() might sleep, the caller should not hold a spinlock.
- */
-static int
-get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
-{
- unsigned long address = (unsigned long)uaddr;
- struct mm_struct *mm = current->mm;
- struct page *page, *page_head;
- int err, ro = 0;
-
- /*
- * The futex address must be "naturally" aligned.
- */
- key->both.offset = address % PAGE_SIZE;
- if (unlikely((address % sizeof(u32)) != 0))
- return -EINVAL;
- address -= key->both.offset;
-
- if (unlikely(!access_ok(rw, uaddr, sizeof(u32))))
- return -EFAULT;
-
- /*
- * PROCESS_PRIVATE futexes are fast.
- * As the mm cannot disappear under us and the 'key' only needs
- * virtual address, we dont even have to find the underlying vma.
- * Note : We do have to check 'uaddr' is a valid user address,
- * but access_ok() should be faster than find_vma()
- */
- if (!fshared) {
- key->private.mm = mm;
- key->private.address = address;
- get_futex_key_refs(key); /* implies MB (B) */
- return 0;
- }
-
-again:
- err = get_user_pages_fast(address, 1, 1, &page);
- /*
- * If write access is not required (eg. FUTEX_WAIT), try
- * and get read-only access.
- */
- if (err == -EFAULT && rw == VERIFY_READ) {
- err = get_user_pages_fast(address, 1, 0, &page);
- ro = 1;
- }
- if (err < 0)
- return err;
- else
- err = 0;
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- page_head = page;
- if (unlikely(PageTail(page))) {
- put_page(page);
- /* serialize against __split_huge_page_splitting() */
- local_irq_disable();
- if (likely(__get_user_pages_fast(address, 1, !ro, &page) == 1)) {
- page_head = compound_head(page);
- /*
- * page_head is valid pointer but we must pin
- * it before taking the PG_lock and/or
- * PG_compound_lock. The moment we re-enable
- * irqs __split_huge_page_splitting() can
- * return and the head page can be freed from
- * under us. We can't take the PG_lock and/or
- * PG_compound_lock on a page that could be
- * freed from under us.
- */
- if (page != page_head) {
- get_page(page_head);
- put_page(page);
- }
- local_irq_enable();
- } else {
- local_irq_enable();
- goto again;
- }
- }
-#else
- page_head = compound_head(page);
- if (page != page_head) {
- get_page(page_head);
- put_page(page);
- }
-#endif
-
- lock_page(page_head);
-
- /*
- * If page_head->mapping is NULL, then it cannot be a PageAnon
- * page; but it might be the ZERO_PAGE or in the gate area or
- * in a special mapping (all cases which we are happy to fail);
- * or it may have been a good file page when get_user_pages_fast
- * found it, but truncated or holepunched or subjected to
- * invalidate_complete_page2 before we got the page lock (also
- * cases which we are happy to fail). And we hold a reference,
- * so refcount care in invalidate_complete_page's remove_mapping
- * prevents drop_caches from setting mapping to NULL beneath us.
- *
- * The case we do have to guard against is when memory pressure made
- * shmem_writepage move it from filecache to swapcache beneath us:
- * an unlikely race, but we do need to retry for page_head->mapping.
- */
- if (!page_head->mapping) {
- int shmem_swizzled = PageSwapCache(page_head);
- unlock_page(page_head);
- put_page(page_head);
- if (shmem_swizzled)
- goto again;
- return -EFAULT;
- }
-
- /*
- * Private mappings are handled in a simple way.
- *
- * NOTE: When userspace waits on a MAP_SHARED mapping, even if
- * it's a read-only handle, it's expected that futexes attach to
- * the object not the particular process.
- */
- if (PageAnon(page_head)) {
- /*
- * A RO anonymous page will never change and thus doesn't make
- * sense for futex operations.
- */
- if (ro) {
- err = -EFAULT;
- goto out;
- }
-
- key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
- key->private.mm = mm;
- key->private.address = address;
- } else {
- key->both.offset |= FUT_OFF_INODE; /* inode-based key */
- key->shared.inode = page_head->mapping->host;
- key->shared.pgoff = basepage_index(page);
- }
-
- get_futex_key_refs(key); /* implies MB (B) */
-
-out:
- unlock_page(page_head);
- put_page(page_head);
- return err;
-}
-
-static inline void put_futex_key(union futex_key *key)
-{
- drop_futex_key_refs(key);
-}
-
-/**
- * fault_in_user_writeable() - Fault in user address and verify RW access
- * @uaddr: pointer to faulting user space address
- *
- * Slow path to fixup the fault we just took in the atomic write
- * access to @uaddr.
- *
- * We have no generic implementation of a non-destructive write to the
- * user address. We know that we faulted in the atomic pagefault
- * disabled section so we can as well avoid the #PF overhead by
- * calling get_user_pages() right away.
- */
-static int fault_in_user_writeable(u32 __user *uaddr)
-{
- struct mm_struct *mm = current->mm;
- int ret;
-
- down_read(&mm->mmap_sem);
- ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
- FAULT_FLAG_WRITE);
- up_read(&mm->mmap_sem);
-
- return ret < 0 ? ret : 0;
-}
-
-/**
- * futex_top_waiter() - Return the highest priority waiter on a futex
- * @hb: the hash bucket the futex_q's reside in
- * @key: the futex key (to distinguish it from other futex futex_q's)
- *
- * Must be called with the hb lock held.
- */
-static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
- union futex_key *key)
-{
- struct futex_q *this;
-
- plist_for_each_entry(this, &hb->chain, list) {
- if (match_futex(&this->key, key))
- return this;
- }
- return NULL;
-}
-
-static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr,
- u32 uval, u32 newval)
-{
- int ret;
-
- pagefault_disable();
- ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
- pagefault_enable();
-
- return ret;
-}
-
-static int get_futex_value_locked(u32 *dest, u32 __user *from)
-{
- int ret;
-
- pagefault_disable();
- ret = __copy_from_user_inatomic(dest, from, sizeof(u32));
- pagefault_enable();
-
- return ret ? -EFAULT : 0;
-}
-
-
-/*
- * PI code:
- */
-static int refill_pi_state_cache(void)
-{
- struct futex_pi_state *pi_state;
-
- if (likely(current->pi_state_cache))
- return 0;
-
- pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
-
- if (!pi_state)
- return -ENOMEM;
-
- INIT_LIST_HEAD(&pi_state->list);
- /* pi_mutex gets initialized later */
- pi_state->owner = NULL;
- atomic_set(&pi_state->refcount, 1);
- pi_state->key = FUTEX_KEY_INIT;
-
- current->pi_state_cache = pi_state;
-
- return 0;
-}
-
-static struct futex_pi_state * alloc_pi_state(void)
-{
- struct futex_pi_state *pi_state = current->pi_state_cache;
-
- WARN_ON(!pi_state);
- current->pi_state_cache = NULL;
-
- return pi_state;
-}
-
-/*
- * Must be called with the hb lock held.
- */
-static void free_pi_state(struct futex_pi_state *pi_state)
-{
- if (!pi_state)
- return;
-
- if (!atomic_dec_and_test(&pi_state->refcount))
- return;
-
- /*
- * If pi_state->owner is NULL, the owner is most probably dying
- * and has cleaned up the pi_state already
- */
- if (pi_state->owner) {
- raw_spin_lock_irq(&pi_state->owner->pi_lock);
- list_del_init(&pi_state->list);
- raw_spin_unlock_irq(&pi_state->owner->pi_lock);
-
- rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
- }
-
- if (current->pi_state_cache)
- kfree(pi_state);
- else {
- /*
- * pi_state->list is already empty.
- * clear pi_state->owner.
- * refcount is at 0 - put it back to 1.
- */
- pi_state->owner = NULL;
- atomic_set(&pi_state->refcount, 1);
- current->pi_state_cache = pi_state;
- }
-}
-
-/*
- * Look up the task based on what TID userspace gave us.
- * We dont trust it.
- */
-static struct task_struct * futex_find_get_task(pid_t pid)
-{
- struct task_struct *p;
-
- rcu_read_lock();
- p = find_task_by_vpid(pid);
- if (p)
- get_task_struct(p);
-
- rcu_read_unlock();
-
- return p;
-}
-
-/*
- * This task is holding PI mutexes at exit time => bad.
- * Kernel cleans up PI-state, but userspace is likely hosed.
- * (Robust-futex cleanup is separate and might save the day for userspace.)
- */
-void exit_pi_state_list(struct task_struct *curr)
-{
- struct list_head *next, *head = &curr->pi_state_list;
- struct futex_pi_state *pi_state;
- struct futex_hash_bucket *hb;
- union futex_key key = FUTEX_KEY_INIT;
-
- if (!futex_cmpxchg_enabled)
- return;
- /*
- * We are a ZOMBIE and nobody can enqueue itself on
- * pi_state_list anymore, but we have to be careful
- * versus waiters unqueueing themselves:
- */
- raw_spin_lock_irq(&curr->pi_lock);
- while (!list_empty(head)) {
-
- next = head->next;
- pi_state = list_entry(next, struct futex_pi_state, list);
- key = pi_state->key;
- hb = hash_futex(&key);
- raw_spin_unlock_irq(&curr->pi_lock);
-
- spin_lock(&hb->lock);
-
- raw_spin_lock_irq(&curr->pi_lock);
- /*
- * We dropped the pi-lock, so re-check whether this
- * task still owns the PI-state:
- */
- if (head->next != next) {
- spin_unlock(&hb->lock);
- continue;
- }
-
- WARN_ON(pi_state->owner != curr);
- WARN_ON(list_empty(&pi_state->list));
- list_del_init(&pi_state->list);
- pi_state->owner = NULL;
- raw_spin_unlock_irq(&curr->pi_lock);
-
- rt_mutex_unlock(&pi_state->pi_mutex);
-
- spin_unlock(&hb->lock);
-
- raw_spin_lock_irq(&curr->pi_lock);
- }
- raw_spin_unlock_irq(&curr->pi_lock);
-}
-
-/*
- * We need to check the following states:
- *
- * Waiter | pi_state | pi->owner | uTID | uODIED | ?
- *
- * [1] NULL | --- | --- | 0 | 0/1 | Valid
- * [2] NULL | --- | --- | >0 | 0/1 | Valid
- *
- * [3] Found | NULL | -- | Any | 0/1 | Invalid
- *
- * [4] Found | Found | NULL | 0 | 1 | Valid
- * [5] Found | Found | NULL | >0 | 1 | Invalid
- *
- * [6] Found | Found | task | 0 | 1 | Valid
- *
- * [7] Found | Found | NULL | Any | 0 | Invalid
- *
- * [8] Found | Found | task | ==taskTID | 0/1 | Valid
- * [9] Found | Found | task | 0 | 0 | Invalid
- * [10] Found | Found | task | !=taskTID | 0/1 | Invalid
- *
- * [1] Indicates that the kernel can acquire the futex atomically. We
- * came came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
- *
- * [2] Valid, if TID does not belong to a kernel thread. If no matching
- * thread is found then it indicates that the owner TID has died.
- *
- * [3] Invalid. The waiter is queued on a non PI futex
- *
- * [4] Valid state after exit_robust_list(), which sets the user space
- * value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
- *
- * [5] The user space value got manipulated between exit_robust_list()
- * and exit_pi_state_list()
- *
- * [6] Valid state after exit_pi_state_list() which sets the new owner in
- * the pi_state but cannot access the user space value.
- *
- * [7] pi_state->owner can only be NULL when the OWNER_DIED bit is set.
- *
- * [8] Owner and user space value match
- *
- * [9] There is no transient state which sets the user space TID to 0
- * except exit_robust_list(), but this is indicated by the
- * FUTEX_OWNER_DIED bit. See [4]
- *
- * [10] There is no transient state which leaves owner and user space
- * TID out of sync.
- */
-
-/*
- * Validate that the existing waiter has a pi_state and sanity check
- * the pi_state against the user space value. If correct, attach to
- * it.
- */
-static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
- struct futex_pi_state **ps)
-{
- pid_t pid = uval & FUTEX_TID_MASK;
-
- /*
- * Userspace might have messed up non-PI and PI futexes [3]
- */
- if (unlikely(!pi_state))
- return -EINVAL;
-
- WARN_ON(!atomic_read(&pi_state->refcount));
-
- /*
- * Handle the owner died case:
- */
- if (uval & FUTEX_OWNER_DIED) {
- /*
- * exit_pi_state_list sets owner to NULL and wakes the
- * topmost waiter. The task which acquires the
- * pi_state->rt_mutex will fixup owner.
- */
- if (!pi_state->owner) {
- /*
- * No pi state owner, but the user space TID
- * is not 0. Inconsistent state. [5]
- */
- if (pid)
- return -EINVAL;
- /*
- * Take a ref on the state and return success. [4]
- */
- goto out_state;
- }
-
- /*
- * If TID is 0, then either the dying owner has not
- * yet executed exit_pi_state_list() or some waiter
- * acquired the rtmutex in the pi state, but did not
- * yet fixup the TID in user space.
- *
- * Take a ref on the state and return success. [6]
- */
- if (!pid)
- goto out_state;
- } else {
- /*
- * If the owner died bit is not set, then the pi_state
- * must have an owner. [7]
- */
- if (!pi_state->owner)
- return -EINVAL;
- }
-
- /*
- * Bail out if user space manipulated the futex value. If pi
- * state exists then the owner TID must be the same as the
- * user space TID. [9/10]
- */
- if (pid != task_pid_vnr(pi_state->owner))
- return -EINVAL;
-out_state:
- atomic_inc(&pi_state->refcount);
- *ps = pi_state;
- return 0;
-}
-
-/*
- * Lookup the task for the TID provided from user space and attach to
- * it after doing proper sanity checks.
- */
-static int attach_to_pi_owner(u32 uval, union futex_key *key,
- struct futex_pi_state **ps)
-{
- pid_t pid = uval & FUTEX_TID_MASK;
- struct futex_pi_state *pi_state;
- struct task_struct *p;
-
- /*
- * We are the first waiter - try to look up the real owner and attach
- * the new pi_state to it, but bail out when TID = 0 [1]
- */
- if (!pid)
- return -ESRCH;
- p = futex_find_get_task(pid);
- if (!p)
- return -ESRCH;
-
- if (unlikely(p->flags & PF_KTHREAD)) {
- put_task_struct(p);
- return -EPERM;
- }
-
- /*
- * We need to look at the task state flags to figure out,
- * whether the task is exiting. To protect against the do_exit
- * change of the task flags, we do this protected by
- * p->pi_lock:
- */
- raw_spin_lock_irq(&p->pi_lock);
- if (unlikely(p->flags & PF_EXITING)) {
- /*
- * The task is on the way out. When PF_EXITPIDONE is
- * set, we know that the task has finished the
- * cleanup:
- */
- int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN;
-
- raw_spin_unlock_irq(&p->pi_lock);
- put_task_struct(p);
- return ret;
- }
-
- /*
- * No existing pi state. First waiter. [2]
- */
- pi_state = alloc_pi_state();
-
- /*
- * Initialize the pi_mutex in locked state and make @p
- * the owner of it:
- */
- rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
-
- /* Store the key for possible exit cleanups: */
- pi_state->key = *key;
-
- WARN_ON(!list_empty(&pi_state->list));
- list_add(&pi_state->list, &p->pi_state_list);
- pi_state->owner = p;
- raw_spin_unlock_irq(&p->pi_lock);
-
- put_task_struct(p);
-
- *ps = pi_state;
-
- return 0;
-}
-
-static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
- union futex_key *key, struct futex_pi_state **ps)
-{
- struct futex_q *match = futex_top_waiter(hb, key);
-
- /*
- * If there is a waiter on that futex, validate it and
- * attach to the pi_state when the validation succeeds.
- */
- if (match)
- return attach_to_pi_state(uval, match->pi_state, ps);
-
- /*
- * We are the first waiter - try to look up the owner based on
- * @uval and attach to it.
- */
- return attach_to_pi_owner(uval, key, ps);
-}
-
-static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
-{
- u32 uninitialized_var(curval);
-
- if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
- return -EFAULT;
-
- /*If user space value changed, let the caller retry */
- return curval != uval ? -EAGAIN : 0;
-}
-
-/**
- * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
- * @uaddr: the pi futex user address
- * @hb: the pi futex hash bucket
- * @key: the futex key associated with uaddr and hb
- * @ps: the pi_state pointer where we store the result of the
- * lookup
- * @task: the task to perform the atomic lock work for. This will
- * be "current" except in the case of requeue pi.
- * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
- *
- * Return:
- * 0 - ready to wait;
- * 1 - acquired the lock;
- * <0 - error
- *
- * The hb->lock and futex_key refs shall be held by the caller.
- */
-static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
- union futex_key *key,
- struct futex_pi_state **ps,
- struct task_struct *task, int set_waiters)
-{
- u32 uval, newval, vpid = task_pid_vnr(task);
- struct futex_q *match;
- int ret;
-
- /*
- * Read the user space value first so we can validate a few
- * things before proceeding further.
- */
- if (get_futex_value_locked(&uval, uaddr))
- return -EFAULT;
-
- /*
- * Detect deadlocks.
- */
- if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
- return -EDEADLK;
-
- /*
- * Lookup existing state first. If it exists, try to attach to
- * its pi_state.
- */
- match = futex_top_waiter(hb, key);
- if (match)
- return attach_to_pi_state(uval, match->pi_state, ps);
-
- /*
- * No waiter and user TID is 0. We are here because the
- * waiters or the owner died bit is set or called from
- * requeue_cmp_pi or for whatever reason something took the
- * syscall.
- */
- if (!(uval & FUTEX_TID_MASK)) {
- /*
- * We take over the futex. No other waiters and the user space
- * TID is 0. We preserve the owner died bit.
- */
- newval = uval & FUTEX_OWNER_DIED;
- newval |= vpid;
-
- /* The futex requeue_pi code can enforce the waiters bit */
- if (set_waiters)
- newval |= FUTEX_WAITERS;
-
- ret = lock_pi_update_atomic(uaddr, uval, newval);
- /* If the take over worked, return 1 */
- return ret < 0 ? ret : 1;
- }
-
- /*
- * First waiter. Set the waiters bit before attaching ourself to
- * the owner. If owner tries to unlock, it will be forced into
- * the kernel and blocked on hb->lock.
- */
- newval = uval | FUTEX_WAITERS;
- ret = lock_pi_update_atomic(uaddr, uval, newval);
- if (ret)
- return ret;
- /*
- * If the update of the user space value succeeded, we try to
- * attach to the owner. If that fails, no harm done, we only
- * set the FUTEX_WAITERS bit in the user space variable.
- */
- return attach_to_pi_owner(uval, key, ps);
-}
-
-/**
- * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket
- * @q: The futex_q to unqueue
- *
- * The q->lock_ptr must not be NULL and must be held by the caller.
- */
-static void __unqueue_futex(struct futex_q *q)
-{
- struct futex_hash_bucket *hb;
-
- if (WARN_ON_SMP(!q->lock_ptr || !spin_is_locked(q->lock_ptr))
- || WARN_ON(plist_node_empty(&q->list)))
- return;
-
- hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
- plist_del(&q->list, &hb->chain);
- hb_waiters_dec(hb);
-}
-
-/*
- * The hash bucket lock must be held when this is called.
- * Afterwards, the futex_q must not be accessed.
- */
-static void wake_futex(struct futex_q *q)
-{
- struct task_struct *p = q->task;
-
- if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
- return;
-
- /*
- * We set q->lock_ptr = NULL _before_ we wake up the task. If
- * a non-futex wake up happens on another CPU then the task
- * might exit and p would dereference a non-existing task
- * struct. Prevent this by holding a reference on p across the
- * wake up.
- */
- get_task_struct(p);
-
- __unqueue_futex(q);
- /*
- * The waiting task can free the futex_q as soon as
- * q->lock_ptr = NULL is written, without taking any locks. A
- * memory barrier is required here to prevent the following
- * store to lock_ptr from getting ahead of the plist_del.
- */
- smp_wmb();
- q->lock_ptr = NULL;
-
- wake_up_state(p, TASK_NORMAL);
- put_task_struct(p);
-}
-
-static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
-{
- struct task_struct *new_owner;
- struct futex_pi_state *pi_state = this->pi_state;
- u32 uninitialized_var(curval), newval;
- int ret = 0;
-
- if (!pi_state)
- return -EINVAL;
-
- /*
- * If current does not own the pi_state then the futex is
- * inconsistent and user space fiddled with the futex value.
- */
- if (pi_state->owner != current)
- return -EINVAL;
-
- raw_spin_lock(&pi_state->pi_mutex.wait_lock);
- new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
-
- /*
- * It is possible that the next waiter (the one that brought
- * this owner to the kernel) timed out and is no longer
- * waiting on the lock.
- */
- if (!new_owner)
- new_owner = this->task;
-
- /*
- * We pass it to the next owner. The WAITERS bit is always
- * kept enabled while there is PI state around. We cleanup the
- * owner died bit, because we are the owner.
- */
- newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
-
- if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
- ret = -EFAULT;
- else if (curval != uval)
- ret = -EINVAL;
- if (ret) {
- raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
- return ret;
- }
-
- raw_spin_lock_irq(&pi_state->owner->pi_lock);
- WARN_ON(list_empty(&pi_state->list));
- list_del_init(&pi_state->list);
- raw_spin_unlock_irq(&pi_state->owner->pi_lock);
-
- raw_spin_lock_irq(&new_owner->pi_lock);
- WARN_ON(!list_empty(&pi_state->list));
- list_add(&pi_state->list, &new_owner->pi_state_list);
- pi_state->owner = new_owner;
- raw_spin_unlock_irq(&new_owner->pi_lock);
-
- raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
- rt_mutex_unlock(&pi_state->pi_mutex);
-
- return 0;
-}
-
-/*
- * Express the locking dependencies for lockdep:
- */
-static inline void
-double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
-{
- if (hb1 <= hb2) {
- spin_lock(&hb1->lock);
- if (hb1 < hb2)
- spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING);
- } else { /* hb1 > hb2 */
- spin_lock(&hb2->lock);
- spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING);
- }
-}
-
-static inline void
-double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
-{
- spin_unlock(&hb1->lock);
- if (hb1 != hb2)
- spin_unlock(&hb2->lock);
-}
-
-/*
- * Wake up waiters matching bitset queued on this futex (uaddr).
- */
-static int
-futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
-{
- struct futex_hash_bucket *hb;
- struct futex_q *this, *next;
- union futex_key key = FUTEX_KEY_INIT;
- int ret;
-
- if (!bitset)
- return -EINVAL;
-
- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_READ);
- if (unlikely(ret != 0))
- goto out;
-
- hb = hash_futex(&key);
-
- /* Make sure we really have tasks to wakeup */
- if (!hb_waiters_pending(hb))
- goto out_put_key;
-
- spin_lock(&hb->lock);
-
- plist_for_each_entry_safe(this, next, &hb->chain, list) {
- if (match_futex (&this->key, &key)) {
- if (this->pi_state || this->rt_waiter) {
- ret = -EINVAL;
- break;
- }
-
- /* Check if one of the bits is set in both bitsets */
- if (!(this->bitset & bitset))
- continue;
-
- wake_futex(this);
- if (++ret >= nr_wake)
- break;
- }
- }
-
- spin_unlock(&hb->lock);
-out_put_key:
- put_futex_key(&key);
-out:
- return ret;
-}
-
-/*
- * Wake up all waiters hashed on the physical page that is mapped
- * to this virtual address:
- */
-static int
-futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
- int nr_wake, int nr_wake2, int op)
-{
- union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
- struct futex_hash_bucket *hb1, *hb2;
- struct futex_q *this, *next;
- int ret, op_ret;
-
-retry:
- ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
- if (unlikely(ret != 0))
- goto out;
- ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
- if (unlikely(ret != 0))
- goto out_put_key1;
-
- hb1 = hash_futex(&key1);
- hb2 = hash_futex(&key2);
-
-retry_private:
- double_lock_hb(hb1, hb2);
- op_ret = futex_atomic_op_inuser(op, uaddr2);
- if (unlikely(op_ret < 0)) {
-
- double_unlock_hb(hb1, hb2);
-
-#ifndef CONFIG_MMU
- /*
- * we don't get EFAULT from MMU faults if we don't have an MMU,
- * but we might get them from range checking
- */
- ret = op_ret;
- goto out_put_keys;
-#endif
-
- if (unlikely(op_ret != -EFAULT)) {
- ret = op_ret;
- goto out_put_keys;
- }
-
- ret = fault_in_user_writeable(uaddr2);
- if (ret)
- goto out_put_keys;
-
- if (!(flags & FLAGS_SHARED))
- goto retry_private;
-
- put_futex_key(&key2);
- put_futex_key(&key1);
- goto retry;
- }
-
- plist_for_each_entry_safe(this, next, &hb1->chain, list) {
- if (match_futex (&this->key, &key1)) {
- if (this->pi_state || this->rt_waiter) {
- ret = -EINVAL;
- goto out_unlock;
- }
- wake_futex(this);
- if (++ret >= nr_wake)
- break;
- }
- }
-
- if (op_ret > 0) {
- op_ret = 0;
- plist_for_each_entry_safe(this, next, &hb2->chain, list) {
- if (match_futex (&this->key, &key2)) {
- if (this->pi_state || this->rt_waiter) {
- ret = -EINVAL;
- goto out_unlock;
- }
- wake_futex(this);
- if (++op_ret >= nr_wake2)
- break;
- }
- }
- ret += op_ret;
- }
-
-out_unlock:
- double_unlock_hb(hb1, hb2);
-out_put_keys:
- put_futex_key(&key2);
-out_put_key1:
- put_futex_key(&key1);
-out:
- return ret;
-}
-
-/**
- * requeue_futex() - Requeue a futex_q from one hb to another
- * @q: the futex_q to requeue
- * @hb1: the source hash_bucket
- * @hb2: the target hash_bucket
- * @key2: the new key for the requeued futex_q
- */
-static inline
-void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
- struct futex_hash_bucket *hb2, union futex_key *key2)
-{
-
- /*
- * If key1 and key2 hash to the same bucket, no need to
- * requeue.
- */
- if (likely(&hb1->chain != &hb2->chain)) {
- plist_del(&q->list, &hb1->chain);
- hb_waiters_dec(hb1);
- plist_add(&q->list, &hb2->chain);
- hb_waiters_inc(hb2);
- q->lock_ptr = &hb2->lock;
- }
- get_futex_key_refs(key2);
- q->key = *key2;
-}
-
-/**
- * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
- * @q: the futex_q
- * @key: the key of the requeue target futex
- * @hb: the hash_bucket of the requeue target futex
- *
- * During futex_requeue, with requeue_pi=1, it is possible to acquire the
- * target futex if it is uncontended or via a lock steal. Set the futex_q key
- * to the requeue target futex so the waiter can detect the wakeup on the right
- * futex, but remove it from the hb and NULL the rt_waiter so it can detect
- * atomic lock acquisition. Set the q->lock_ptr to the requeue target hb->lock
- * to protect access to the pi_state to fixup the owner later. Must be called
- * with both q->lock_ptr and hb->lock held.
- */
-static inline
-void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
- struct futex_hash_bucket *hb)
-{
- get_futex_key_refs(key);
- q->key = *key;
-
- __unqueue_futex(q);
-
- WARN_ON(!q->rt_waiter);
- q->rt_waiter = NULL;
-
- q->lock_ptr = &hb->lock;
-
- wake_up_state(q->task, TASK_NORMAL);
-}
-
-/**
- * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter
- * @pifutex: the user address of the to futex
- * @hb1: the from futex hash bucket, must be locked by the caller
- * @hb2: the to futex hash bucket, must be locked by the caller
- * @key1: the from futex key
- * @key2: the to futex key
- * @ps: address to store the pi_state pointer
- * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
- *
- * Try and get the lock on behalf of the top waiter if we can do it atomically.
- * Wake the top waiter if we succeed. If the caller specified set_waiters,
- * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
- * hb1 and hb2 must be held by the caller.
- *
- * Return:
- * 0 - failed to acquire the lock atomically;
- * >0 - acquired the lock, return value is vpid of the top_waiter
- * <0 - error
- */
-static int futex_proxy_trylock_atomic(u32 __user *pifutex,
- struct futex_hash_bucket *hb1,
- struct futex_hash_bucket *hb2,
- union futex_key *key1, union futex_key *key2,
- struct futex_pi_state **ps, int set_waiters)
-{
- struct futex_q *top_waiter = NULL;
- u32 curval;
- int ret, vpid;
-
- if (get_futex_value_locked(&curval, pifutex))
- return -EFAULT;
-
- /*
- * Find the top_waiter and determine if there are additional waiters.
- * If the caller intends to requeue more than 1 waiter to pifutex,
- * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now,
- * as we have means to handle the possible fault. If not, don't set
- * the bit unecessarily as it will force the subsequent unlock to enter
- * the kernel.
- */
- top_waiter = futex_top_waiter(hb1, key1);
-
- /* There are no waiters, nothing for us to do. */
- if (!top_waiter)
- return 0;
-
- /* Ensure we requeue to the expected futex. */
- if (!match_futex(top_waiter->requeue_pi_key, key2))
- return -EINVAL;
-
- /*
- * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in
- * the contended case or if set_waiters is 1. The pi_state is returned
- * in ps in contended cases.
- */
- vpid = task_pid_vnr(top_waiter->task);
- ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
- set_waiters);
- if (ret == 1) {
- requeue_pi_wake_futex(top_waiter, key2, hb2);
- return vpid;
- }
- return ret;
-}
-
-/**
- * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
- * @uaddr1: source futex user address
- * @flags: futex flags (FLAGS_SHARED, etc.)
- * @uaddr2: target futex user address
- * @nr_wake: number of waiters to wake (must be 1 for requeue_pi)
- * @nr_requeue: number of waiters to requeue (0-INT_MAX)
- * @cmpval: @uaddr1 expected value (or %NULL)
- * @requeue_pi: if we are attempting to requeue from a non-pi futex to a
- * pi futex (pi to pi requeue is not supported)
- *
- * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
- * uaddr2 atomically on behalf of the top waiter.
- *
- * Return:
- * >=0 - on success, the number of tasks requeued or woken;
- * <0 - on error
- */
-static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
- u32 __user *uaddr2, int nr_wake, int nr_requeue,
- u32 *cmpval, int requeue_pi)
-{
- union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
- int drop_count = 0, task_count = 0, ret;
- struct futex_pi_state *pi_state = NULL;
- struct futex_hash_bucket *hb1, *hb2;
- struct futex_q *this, *next;
-
- if (requeue_pi) {
- /*
- * Requeue PI only works on two distinct uaddrs. This
- * check is only valid for private futexes. See below.
- */
- if (uaddr1 == uaddr2)
- return -EINVAL;
-
- /*
- * requeue_pi requires a pi_state, try to allocate it now
- * without any locks in case it fails.
- */
- if (refill_pi_state_cache())
- return -ENOMEM;
- /*
- * requeue_pi must wake as many tasks as it can, up to nr_wake
- * + nr_requeue, since it acquires the rt_mutex prior to
- * returning to userspace, so as to not leave the rt_mutex with
- * waiters and no owner. However, second and third wake-ups
- * cannot be predicted as they involve race conditions with the
- * first wake and a fault while looking up the pi_state. Both
- * pthread_cond_signal() and pthread_cond_broadcast() should
- * use nr_wake=1.
- */
- if (nr_wake != 1)
- return -EINVAL;
- }
-
-retry:
- ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
- if (unlikely(ret != 0))
- goto out;
- ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
- requeue_pi ? VERIFY_WRITE : VERIFY_READ);
- if (unlikely(ret != 0))
- goto out_put_key1;
-
- /*
- * The check above which compares uaddrs is not sufficient for
- * shared futexes. We need to compare the keys:
- */
- if (requeue_pi && match_futex(&key1, &key2)) {
- ret = -EINVAL;
- goto out_put_keys;
- }
-
- hb1 = hash_futex(&key1);
- hb2 = hash_futex(&key2);
-
-retry_private:
- hb_waiters_inc(hb2);
- double_lock_hb(hb1, hb2);
-
- if (likely(cmpval != NULL)) {
- u32 curval;
-
- ret = get_futex_value_locked(&curval, uaddr1);
-
- if (unlikely(ret)) {
- double_unlock_hb(hb1, hb2);
- hb_waiters_dec(hb2);
-
- ret = get_user(curval, uaddr1);
- if (ret)
- goto out_put_keys;
-
- if (!(flags & FLAGS_SHARED))
- goto retry_private;
-
- put_futex_key(&key2);
- put_futex_key(&key1);
- goto retry;
- }
- if (curval != *cmpval) {
- ret = -EAGAIN;
- goto out_unlock;
- }
- }
-
- if (requeue_pi && (task_count - nr_wake < nr_requeue)) {
- /*
- * Attempt to acquire uaddr2 and wake the top waiter. If we
- * intend to requeue waiters, force setting the FUTEX_WAITERS
- * bit. We force this here where we are able to easily handle
- * faults rather in the requeue loop below.
- */
- ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
- &key2, &pi_state, nr_requeue);
-
- /*
- * At this point the top_waiter has either taken uaddr2 or is
- * waiting on it. If the former, then the pi_state will not
- * exist yet, look it up one more time to ensure we have a
- * reference to it. If the lock was taken, ret contains the
- * vpid of the top waiter task.
- */
- if (ret > 0) {
- WARN_ON(pi_state);
- drop_count++;
- task_count++;
- /*
- * If we acquired the lock, then the user
- * space value of uaddr2 should be vpid. It
- * cannot be changed by the top waiter as it
- * is blocked on hb2 lock if it tries to do
- * so. If something fiddled with it behind our
- * back the pi state lookup might unearth
- * it. So we rather use the known value than
- * rereading and handing potential crap to
- * lookup_pi_state.
- */
- ret = lookup_pi_state(ret, hb2, &key2, &pi_state);
- }
-
- switch (ret) {
- case 0:
- break;
- case -EFAULT:
- free_pi_state(pi_state);
- pi_state = NULL;
- double_unlock_hb(hb1, hb2);
- hb_waiters_dec(hb2);
- put_futex_key(&key2);
- put_futex_key(&key1);
- ret = fault_in_user_writeable(uaddr2);
- if (!ret)
- goto retry;
- goto out;
- case -EAGAIN:
- /*
- * Two reasons for this:
- * - Owner is exiting and we just wait for the
- * exit to complete.
- * - The user space value changed.
- */
- free_pi_state(pi_state);
- pi_state = NULL;
- double_unlock_hb(hb1, hb2);
- hb_waiters_dec(hb2);
- put_futex_key(&key2);
- put_futex_key(&key1);
- cond_resched();
- goto retry;
- default:
- goto out_unlock;
- }
- }
-
- plist_for_each_entry_safe(this, next, &hb1->chain, list) {
- if (task_count - nr_wake >= nr_requeue)
- break;
-
- if (!match_futex(&this->key, &key1))
- continue;
-
- /*
- * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always
- * be paired with each other and no other futex ops.
- *
- * We should never be requeueing a futex_q with a pi_state,
- * which is awaiting a futex_unlock_pi().
- */
- if ((requeue_pi && !this->rt_waiter) ||
- (!requeue_pi && this->rt_waiter) ||
- this->pi_state) {
- ret = -EINVAL;
- break;
- }
-
- /*
- * Wake nr_wake waiters. For requeue_pi, if we acquired the
- * lock, we already woke the top_waiter. If not, it will be
- * woken by futex_unlock_pi().
- */
- if (++task_count <= nr_wake && !requeue_pi) {
- wake_futex(this);
- continue;
- }
-
- /* Ensure we requeue to the expected futex for requeue_pi. */
- if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) {
- ret = -EINVAL;
- break;
- }
-
- /*
- * Requeue nr_requeue waiters and possibly one more in the case
- * of requeue_pi if we couldn't acquire the lock atomically.
- */
- if (requeue_pi) {
- /* Prepare the waiter to take the rt_mutex. */
- atomic_inc(&pi_state->refcount);
- this->pi_state = pi_state;
- ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
- this->rt_waiter,
- this->task);
- if (ret == 1) {
- /* We got the lock. */
- requeue_pi_wake_futex(this, &key2, hb2);
- drop_count++;
- continue;
- } else if (ret) {
- /* -EDEADLK */
- this->pi_state = NULL;
- free_pi_state(pi_state);
- goto out_unlock;
- }
- }
- requeue_futex(this, hb1, hb2, &key2);
- drop_count++;
- }
-
-out_unlock:
- free_pi_state(pi_state);
- double_unlock_hb(hb1, hb2);
- hb_waiters_dec(hb2);
-
- /*
- * drop_futex_key_refs() must be called outside the spinlocks. During
- * the requeue we moved futex_q's from the hash bucket at key1 to the
- * one at key2 and updated their key pointer. We no longer need to
- * hold the references to key1.
- */
- while (--drop_count >= 0)
- drop_futex_key_refs(&key1);
-
-out_put_keys:
- put_futex_key(&key2);
-out_put_key1:
- put_futex_key(&key1);
-out:
- return ret ? ret : task_count;
-}
-
-/* The key must be already stored in q->key. */
-static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
- __acquires(&hb->lock)
-{
- struct futex_hash_bucket *hb;
-
- hb = hash_futex(&q->key);
-
- /*
- * Increment the counter before taking the lock so that
- * a potential waker won't miss a to-be-slept task that is
- * waiting for the spinlock. This is safe as all queue_lock()
- * users end up calling queue_me(). Similarly, for housekeeping,
- * decrement the counter at queue_unlock() when some error has
- * occurred and we don't end up adding the task to the list.
- */
- hb_waiters_inc(hb);
-
- q->lock_ptr = &hb->lock;
-
- spin_lock(&hb->lock); /* implies MB (A) */
- return hb;
-}
-
-static inline void
-queue_unlock(struct futex_hash_bucket *hb)
- __releases(&hb->lock)
-{
- spin_unlock(&hb->lock);
- hb_waiters_dec(hb);
-}
-
-/**
- * queue_me() - Enqueue the futex_q on the futex_hash_bucket
- * @q: The futex_q to enqueue
- * @hb: The destination hash bucket
- *
- * The hb->lock must be held by the caller, and is released here. A call to
- * queue_me() is typically paired with exactly one call to unqueue_me(). The
- * exceptions involve the PI related operations, which may use unqueue_me_pi()
- * or nothing if the unqueue is done as part of the wake process and the unqueue
- * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
- * an example).
- */
-static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
- __releases(&hb->lock)
-{
- int prio;
-
- /*
- * The priority used to register this element is
- * - either the real thread-priority for the real-time threads
- * (i.e. threads with a priority lower than MAX_RT_PRIO)
- * - or MAX_RT_PRIO for non-RT threads.
- * Thus, all RT-threads are woken first in priority order, and
- * the others are woken last, in FIFO order.
- */
- prio = min(current->normal_prio, MAX_RT_PRIO);
-
- plist_node_init(&q->list, prio);
- plist_add(&q->list, &hb->chain);
- q->task = current;
- spin_unlock(&hb->lock);
-}
-
-/**
- * unqueue_me() - Remove the futex_q from its futex_hash_bucket
- * @q: The futex_q to unqueue
- *
- * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
- * be paired with exactly one earlier call to queue_me().
- *
- * Return:
- * 1 - if the futex_q was still queued (and we removed unqueued it);
- * 0 - if the futex_q was already removed by the waking thread
- */
-static int unqueue_me(struct futex_q *q)
-{
- spinlock_t *lock_ptr;
- int ret = 0;
-
- /* In the common case we don't take the spinlock, which is nice. */
-retry:
- lock_ptr = q->lock_ptr;
- barrier();
- if (lock_ptr != NULL) {
- spin_lock(lock_ptr);
- /*
- * q->lock_ptr can change between reading it and
- * spin_lock(), causing us to take the wrong lock. This
- * corrects the race condition.
- *
- * Reasoning goes like this: if we have the wrong lock,
- * q->lock_ptr must have changed (maybe several times)
- * between reading it and the spin_lock(). It can
- * change again after the spin_lock() but only if it was
- * already changed before the spin_lock(). It cannot,
- * however, change back to the original value. Therefore
- * we can detect whether we acquired the correct lock.
- */
- if (unlikely(lock_ptr != q->lock_ptr)) {
- spin_unlock(lock_ptr);
- goto retry;
- }
- __unqueue_futex(q);
-
- BUG_ON(q->pi_state);
-
- spin_unlock(lock_ptr);
- ret = 1;
- }
-
- drop_futex_key_refs(&q->key);
- return ret;
-}
-
-/*
- * PI futexes can not be requeued and must remove themself from the
- * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry
- * and dropped here.
- */
-static void unqueue_me_pi(struct futex_q *q)
- __releases(q->lock_ptr)
-{
- __unqueue_futex(q);
-
- BUG_ON(!q->pi_state);
- free_pi_state(q->pi_state);
- q->pi_state = NULL;
-
- spin_unlock(q->lock_ptr);
-}
-
-/*
- * Fixup the pi_state owner with the new owner.
- *
- * Must be called with hash bucket lock held and mm->sem held for non
- * private futexes.
- */
-static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
- struct task_struct *newowner)
-{
- u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
- struct futex_pi_state *pi_state = q->pi_state;
- struct task_struct *oldowner = pi_state->owner;
- u32 uval, uninitialized_var(curval), newval;
- int ret;
-
- /* Owner died? */
- if (!pi_state->owner)
- newtid |= FUTEX_OWNER_DIED;
-
- /*
- * We are here either because we stole the rtmutex from the
- * previous highest priority waiter or we are the highest priority
- * waiter but failed to get the rtmutex the first time.
- * We have to replace the newowner TID in the user space variable.
- * This must be atomic as we have to preserve the owner died bit here.
- *
- * Note: We write the user space value _before_ changing the pi_state
- * because we can fault here. Imagine swapped out pages or a fork
- * that marked all the anonymous memory readonly for cow.
- *
- * Modifying pi_state _before_ the user space value would
- * leave the pi_state in an inconsistent state when we fault
- * here, because we need to drop the hash bucket lock to
- * handle the fault. This might be observed in the PID check
- * in lookup_pi_state.
- */
-retry:
- if (get_futex_value_locked(&uval, uaddr))
- goto handle_fault;
-
- while (1) {
- newval = (uval & FUTEX_OWNER_DIED) | newtid;
-
- if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
- goto handle_fault;
- if (curval == uval)
- break;
- uval = curval;
- }
-
- /*
- * We fixed up user space. Now we need to fix the pi_state
- * itself.
- */
- if (pi_state->owner != NULL) {
- raw_spin_lock_irq(&pi_state->owner->pi_lock);
- WARN_ON(list_empty(&pi_state->list));
- list_del_init(&pi_state->list);
- raw_spin_unlock_irq(&pi_state->owner->pi_lock);
- }
-
- pi_state->owner = newowner;
-
- raw_spin_lock_irq(&newowner->pi_lock);
- WARN_ON(!list_empty(&pi_state->list));
- list_add(&pi_state->list, &newowner->pi_state_list);
- raw_spin_unlock_irq(&newowner->pi_lock);
- return 0;
-
- /*
- * To handle the page fault we need to drop the hash bucket
- * lock here. That gives the other task (either the highest priority
- * waiter itself or the task which stole the rtmutex) the
- * chance to try the fixup of the pi_state. So once we are
- * back from handling the fault we need to check the pi_state
- * after reacquiring the hash bucket lock and before trying to
- * do another fixup. When the fixup has been done already we
- * simply return.
- */
-handle_fault:
- spin_unlock(q->lock_ptr);
-
- ret = fault_in_user_writeable(uaddr);
-
- spin_lock(q->lock_ptr);
-
- /*
- * Check if someone else fixed it for us:
- */
- if (pi_state->owner != oldowner)
- return 0;
-
- if (ret)
- return ret;
-
- goto retry;
-}
-
-static long futex_wait_restart(struct restart_block *restart);
-
-/**
- * fixup_owner() - Post lock pi_state and corner case management
- * @uaddr: user address of the futex
- * @q: futex_q (contains pi_state and access to the rt_mutex)
- * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0)
- *
- * After attempting to lock an rt_mutex, this function is called to cleanup
- * the pi_state owner as well as handle race conditions that may allow us to
- * acquire the lock. Must be called with the hb lock held.
- *
- * Return:
- * 1 - success, lock taken;
- * 0 - success, lock not taken;
- * <0 - on error (-EFAULT)
- */
-static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
-{
- struct task_struct *owner;
- int ret = 0;
-
- if (locked) {
- /*
- * Got the lock. We might not be the anticipated owner if we
- * did a lock-steal - fix up the PI-state in that case:
- */
- if (q->pi_state->owner != current)
- ret = fixup_pi_state_owner(uaddr, q, current);
- goto out;
- }
-
- /*
- * Catch the rare case, where the lock was released when we were on the
- * way back before we locked the hash bucket.
- */
- if (q->pi_state->owner == current) {
- /*
- * Try to get the rt_mutex now. This might fail as some other
- * task acquired the rt_mutex after we removed ourself from the
- * rt_mutex waiters list.
- */
- if (rt_mutex_trylock(&q->pi_state->pi_mutex)) {
- locked = 1;
- goto out;
- }
-
- /*
- * pi_state is incorrect, some other task did a lock steal and
- * we returned due to timeout or signal without taking the
- * rt_mutex. Too late.
- */
- raw_spin_lock(&q->pi_state->pi_mutex.wait_lock);
- owner = rt_mutex_owner(&q->pi_state->pi_mutex);
- if (!owner)
- owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
- raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock);
- ret = fixup_pi_state_owner(uaddr, q, owner);
- goto out;
- }
-
- /*
- * Paranoia check. If we did not take the lock, then we should not be
- * the owner of the rt_mutex.
- */
- if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
- printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
- "pi-state %p\n", ret,
- q->pi_state->pi_mutex.owner,
- q->pi_state->owner);
-
-out:
- return ret ? ret : locked;
-}
-
-/**
- * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal
- * @hb: the futex hash bucket, must be locked by the caller
- * @q: the futex_q to queue up on
- * @timeout: the prepared hrtimer_sleeper, or null for no timeout
- */
-static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
- struct hrtimer_sleeper *timeout)
-{
- /*
- * The task state is guaranteed to be set before another task can
- * wake it. set_current_state() is implemented using set_mb() and
- * queue_me() calls spin_unlock() upon completion, both serializing
- * access to the hash list and forcing another memory barrier.
- */
- set_current_state(TASK_INTERRUPTIBLE);
- queue_me(q, hb);
-
- /* Arm the timer */
- if (timeout) {
- hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
- if (!hrtimer_active(&timeout->timer))
- timeout->task = NULL;
- }
-
- /*
- * If we have been removed from the hash list, then another task
- * has tried to wake us, and we can skip the call to schedule().
- */
- if (likely(!plist_node_empty(&q->list))) {
- /*
- * If the timer has already expired, current will already be
- * flagged for rescheduling. Only call schedule if there
- * is no timeout, or if it has yet to expire.
- */
- if (!timeout || timeout->task)
- freezable_schedule();
- }
- __set_current_state(TASK_RUNNING);
-}
-
-/**
- * futex_wait_setup() - Prepare to wait on a futex
- * @uaddr: the futex userspace address
- * @val: the expected value
- * @flags: futex flags (FLAGS_SHARED, etc.)
- * @q: the associated futex_q
- * @hb: storage for hash_bucket pointer to be returned to caller
- *
- * Setup the futex_q and locate the hash_bucket. Get the futex value and
- * compare it with the expected value. Handle atomic faults internally.
- * Return with the hb lock held and a q.key reference on success, and unlocked
- * with no q.key reference on failure.
- *
- * Return:
- * 0 - uaddr contains val and hb has been locked;
- * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
- */
-static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
- struct futex_q *q, struct futex_hash_bucket **hb)
-{
- u32 uval;
- int ret;
-
- /*
- * Access the page AFTER the hash-bucket is locked.
- * Order is important:
- *
- * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val);
- * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); }
- *
- * The basic logical guarantee of a futex is that it blocks ONLY
- * if cond(var) is known to be true at the time of blocking, for
- * any cond. If we locked the hash-bucket after testing *uaddr, that
- * would open a race condition where we could block indefinitely with
- * cond(var) false, which would violate the guarantee.
- *
- * On the other hand, we insert q and release the hash-bucket only
- * after testing *uaddr. This guarantees that futex_wait() will NOT
- * absorb a wakeup if *uaddr does not match the desired values
- * while the syscall executes.
- */
-retry:
- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, VERIFY_READ);
- if (unlikely(ret != 0))
- return ret;
-
-retry_private:
- *hb = queue_lock(q);
-
- ret = get_futex_value_locked(&uval, uaddr);
-
- if (ret) {
- queue_unlock(*hb);
-
- ret = get_user(uval, uaddr);
- if (ret)
- goto out;
-
- if (!(flags & FLAGS_SHARED))
- goto retry_private;
-
- put_futex_key(&q->key);
- goto retry;
- }
-
- if (uval != val) {
- queue_unlock(*hb);
- ret = -EWOULDBLOCK;
- }
-
-out:
- if (ret)
- put_futex_key(&q->key);
- return ret;
-}
-
-static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
- ktime_t *abs_time, u32 bitset)
-{
- struct hrtimer_sleeper timeout, *to = NULL;
- struct restart_block *restart;
- struct futex_hash_bucket *hb;
- struct futex_q q = futex_q_init;
- int ret;
-
- if (!bitset)
- return -EINVAL;
- q.bitset = bitset;
-
- if (abs_time) {
- to = &timeout;
-
- hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
- CLOCK_REALTIME : CLOCK_MONOTONIC,
- HRTIMER_MODE_ABS);
- hrtimer_init_sleeper(to, current);
- hrtimer_set_expires_range_ns(&to->timer, *abs_time,
- current->timer_slack_ns);
- }
-
-retry:
- /*
- * Prepare to wait on uaddr. On success, holds hb lock and increments
- * q.key refs.
- */
- ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
- if (ret)
- goto out;
-
- /* queue_me and wait for wakeup, timeout, or a signal. */
- futex_wait_queue_me(hb, &q, to);
-
- /* If we were woken (and unqueued), we succeeded, whatever. */
- ret = 0;
- /* unqueue_me() drops q.key ref */
- if (!unqueue_me(&q))
- goto out;
- ret = -ETIMEDOUT;
- if (to && !to->task)
- goto out;
-
- /*
- * We expect signal_pending(current), but we might be the
- * victim of a spurious wakeup as well.
- */
- if (!signal_pending(current))
- goto retry;
-
- ret = -ERESTARTSYS;
- if (!abs_time)
- goto out;
-
- restart = &current->restart_block;
- restart->fn = futex_wait_restart;
- restart->futex.uaddr = uaddr;
- restart->futex.val = val;
- restart->futex.time = abs_time->tv64;
- restart->futex.bitset = bitset;
- restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
-
- ret = -ERESTART_RESTARTBLOCK;
-
-out:
- if (to) {
- hrtimer_cancel(&to->timer);
- destroy_hrtimer_on_stack(&to->timer);
- }
- return ret;
-}
-
-
-static long futex_wait_restart(struct restart_block *restart)
-{
- u32 __user *uaddr = restart->futex.uaddr;
- ktime_t t, *tp = NULL;
-
- if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
- t.tv64 = restart->futex.time;
- tp = &t;
- }
- restart->fn = do_no_restart_syscall;
-
- return (long)futex_wait(uaddr, restart->futex.flags,
- restart->futex.val, tp, restart->futex.bitset);
-}
-
-
-/*
- * Userspace tried a 0 -> TID atomic transition of the futex value
- * and failed. The kernel side here does the whole locking operation:
- * if there are waiters then it will block, it does PI, etc. (Due to
- * races the kernel might see a 0 value of the futex too.)
- */
-static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
- ktime_t *time, int trylock)
-{
- struct hrtimer_sleeper timeout, *to = NULL;
- struct futex_hash_bucket *hb;
- struct futex_q q = futex_q_init;
- int res, ret;
-
- if (refill_pi_state_cache())
- return -ENOMEM;
-
- if (time) {
- to = &timeout;
- hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME,
- HRTIMER_MODE_ABS);
- hrtimer_init_sleeper(to, current);
- hrtimer_set_expires(&to->timer, *time);
- }
-
-retry:
- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, VERIFY_WRITE);
- if (unlikely(ret != 0))
- goto out;
-
-retry_private:
- hb = queue_lock(&q);
-
- ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0);
- if (unlikely(ret)) {
- switch (ret) {
- case 1:
- /* We got the lock. */
- ret = 0;
- goto out_unlock_put_key;
- case -EFAULT:
- goto uaddr_faulted;
- case -EAGAIN:
- /*
- * Two reasons for this:
- * - Task is exiting and we just wait for the
- * exit to complete.
- * - The user space value changed.
- */
- queue_unlock(hb);
- put_futex_key(&q.key);
- cond_resched();
- goto retry;
- default:
- goto out_unlock_put_key;
- }
- }
-
- /*
- * Only actually queue now that the atomic ops are done:
- */
- queue_me(&q, hb);
-
- WARN_ON(!q.pi_state);
- /*
- * Block on the PI mutex:
- */
- if (!trylock) {
- ret = rt_mutex_timed_futex_lock(&q.pi_state->pi_mutex, to);
- } else {
- ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
- /* Fixup the trylock return value: */
- ret = ret ? 0 : -EWOULDBLOCK;
- }
-
- spin_lock(q.lock_ptr);
- /*
- * Fixup the pi_state owner and possibly acquire the lock if we
- * haven't already.
- */
- res = fixup_owner(uaddr, &q, !ret);
- /*
- * If fixup_owner() returned an error, proprogate that. If it acquired
- * the lock, clear our -ETIMEDOUT or -EINTR.
- */
- if (res)
- ret = (res < 0) ? res : 0;
-
- /*
- * If fixup_owner() faulted and was unable to handle the fault, unlock
- * it and return the fault to userspace.
- */
- if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))
- rt_mutex_unlock(&q.pi_state->pi_mutex);
-
- /* Unqueue and drop the lock */
- unqueue_me_pi(&q);
-
- goto out_put_key;
-
-out_unlock_put_key:
- queue_unlock(hb);
-
-out_put_key:
- put_futex_key(&q.key);
-out:
- if (to)
- destroy_hrtimer_on_stack(&to->timer);
- return ret != -EINTR ? ret : -ERESTARTNOINTR;
-
-uaddr_faulted:
- queue_unlock(hb);
-
- ret = fault_in_user_writeable(uaddr);
- if (ret)
- goto out_put_key;
-
- if (!(flags & FLAGS_SHARED))
- goto retry_private;
-
- put_futex_key(&q.key);
- goto retry;
-}
-
-/*
- * Userspace attempted a TID -> 0 atomic transition, and failed.
- * This is the in-kernel slowpath: we look up the PI state (if any),
- * and do the rt-mutex unlock.
- */
-static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
-{
- u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current);
- union futex_key key = FUTEX_KEY_INIT;
- struct futex_hash_bucket *hb;
- struct futex_q *match;
- int ret;
-
-retry:
- if (get_user(uval, uaddr))
- return -EFAULT;
- /*
- * We release only a lock we actually own:
- */
- if ((uval & FUTEX_TID_MASK) != vpid)
- return -EPERM;
-
- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE);
- if (ret)
- return ret;
-
- hb = hash_futex(&key);
- spin_lock(&hb->lock);
-
- /*
- * Check waiters first. We do not trust user space values at
- * all and we at least want to know if user space fiddled
- * with the futex value instead of blindly unlocking.
- */
- match = futex_top_waiter(hb, &key);
- if (match) {
- ret = wake_futex_pi(uaddr, uval, match);
- /*
- * The atomic access to the futex value generated a
- * pagefault, so retry the user-access and the wakeup:
- */
- if (ret == -EFAULT)
- goto pi_faulted;
- goto out_unlock;
- }
-
- /*
- * We have no kernel internal state, i.e. no waiters in the
- * kernel. Waiters which are about to queue themselves are stuck
- * on hb->lock. So we can safely ignore them. We do neither
- * preserve the WAITERS bit not the OWNER_DIED one. We are the
- * owner.
- */
- if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))
- goto pi_faulted;
-
- /*
- * If uval has changed, let user space handle it.
- */
- ret = (curval == uval) ? 0 : -EAGAIN;
-
-out_unlock:
- spin_unlock(&hb->lock);
- put_futex_key(&key);
- return ret;
-
-pi_faulted:
- spin_unlock(&hb->lock);
- put_futex_key(&key);
-
- ret = fault_in_user_writeable(uaddr);
- if (!ret)
- goto retry;
-
- return ret;
-}
-
-/**
- * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex
- * @hb: the hash_bucket futex_q was original enqueued on
- * @q: the futex_q woken while waiting to be requeued
- * @key2: the futex_key of the requeue target futex
- * @timeout: the timeout associated with the wait (NULL if none)
- *
- * Detect if the task was woken on the initial futex as opposed to the requeue
- * target futex. If so, determine if it was a timeout or a signal that caused
- * the wakeup and return the appropriate error code to the caller. Must be
- * called with the hb lock held.
- *
- * Return:
- * 0 = no early wakeup detected;
- * <0 = -ETIMEDOUT or -ERESTARTNOINTR
- */
-static inline
-int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
- struct futex_q *q, union futex_key *key2,
- struct hrtimer_sleeper *timeout)
-{
- int ret = 0;
-
- /*
- * With the hb lock held, we avoid races while we process the wakeup.
- * We only need to hold hb (and not hb2) to ensure atomicity as the
- * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb.
- * It can't be requeued from uaddr2 to something else since we don't
- * support a PI aware source futex for requeue.
- */
- if (!match_futex(&q->key, key2)) {
- WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr));
- /*
- * We were woken prior to requeue by a timeout or a signal.
- * Unqueue the futex_q and determine which it was.
- */
- plist_del(&q->list, &hb->chain);
- hb_waiters_dec(hb);
-
- /* Handle spurious wakeups gracefully */
- ret = -EWOULDBLOCK;
- if (timeout && !timeout->task)
- ret = -ETIMEDOUT;
- else if (signal_pending(current))
- ret = -ERESTARTNOINTR;
- }
- return ret;
-}
-
-/**
- * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
- * @uaddr: the futex we initially wait on (non-pi)
- * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be
- * the same type, no requeueing from private to shared, etc.
- * @val: the expected value of uaddr
- * @abs_time: absolute timeout
- * @bitset: 32 bit wakeup bitset set by userspace, defaults to all
- * @uaddr2: the pi futex we will take prior to returning to user-space
- *
- * The caller will wait on uaddr and will be requeued by futex_requeue() to
- * uaddr2 which must be PI aware and unique from uaddr. Normal wakeup will wake
- * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to
- * userspace. This ensures the rt_mutex maintains an owner when it has waiters;
- * without one, the pi logic would not know which task to boost/deboost, if
- * there was a need to.
- *
- * We call schedule in futex_wait_queue_me() when we enqueue and return there
- * via the following--
- * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
- * 2) wakeup on uaddr2 after a requeue
- * 3) signal
- * 4) timeout
- *
- * If 3, cleanup and return -ERESTARTNOINTR.
- *
- * If 2, we may then block on trying to take the rt_mutex and return via:
- * 5) successful lock
- * 6) signal
- * 7) timeout
- * 8) other lock acquisition failure
- *
- * If 6, return -EWOULDBLOCK (restarting the syscall would do the same).
- *
- * If 4 or 7, we cleanup and return with -ETIMEDOUT.
- *
- * Return:
- * 0 - On success;
- * <0 - On error
- */
-static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
- u32 val, ktime_t *abs_time, u32 bitset,
- u32 __user *uaddr2)
-{
- struct hrtimer_sleeper timeout, *to = NULL;
- struct rt_mutex_waiter rt_waiter;
- struct rt_mutex *pi_mutex = NULL;
- struct futex_hash_bucket *hb;
- union futex_key key2 = FUTEX_KEY_INIT;
- struct futex_q q = futex_q_init;
- int res, ret;
-
- if (uaddr == uaddr2)
- return -EINVAL;
-
- if (!bitset)
- return -EINVAL;
-
- if (abs_time) {
- to = &timeout;
- hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
- CLOCK_REALTIME : CLOCK_MONOTONIC,
- HRTIMER_MODE_ABS);
- hrtimer_init_sleeper(to, current);
- hrtimer_set_expires_range_ns(&to->timer, *abs_time,
- current->timer_slack_ns);
- }
-
- /*
- * The waiter is allocated on our stack, manipulated by the requeue
- * code while we sleep on uaddr.
- */
- debug_rt_mutex_init_waiter(&rt_waiter);
- RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
- RB_CLEAR_NODE(&rt_waiter.tree_entry);
- rt_waiter.task = NULL;
-
- ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
- if (unlikely(ret != 0))
- goto out;
-
- q.bitset = bitset;
- q.rt_waiter = &rt_waiter;
- q.requeue_pi_key = &key2;
-
- /*
- * Prepare to wait on uaddr. On success, increments q.key (key1) ref
- * count.
- */
- ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
- if (ret)
- goto out_key2;
-
- /*
- * The check above which compares uaddrs is not sufficient for
- * shared futexes. We need to compare the keys:
- */
- if (match_futex(&q.key, &key2)) {
- queue_unlock(hb);
- ret = -EINVAL;
- goto out_put_keys;
- }
-
- /* Queue the futex_q, drop the hb lock, wait for wakeup. */
- futex_wait_queue_me(hb, &q, to);
-
- spin_lock(&hb->lock);
- ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
- spin_unlock(&hb->lock);
- if (ret)
- goto out_put_keys;
-
- /*
- * In order for us to be here, we know our q.key == key2, and since
- * we took the hb->lock above, we also know that futex_requeue() has
- * completed and we no longer have to concern ourselves with a wakeup
- * race with the atomic proxy lock acquisition by the requeue code. The
- * futex_requeue dropped our key1 reference and incremented our key2
- * reference count.
- */
-
- /* Check if the requeue code acquired the second futex for us. */
- if (!q.rt_waiter) {
- /*
- * Got the lock. We might not be the anticipated owner if we
- * did a lock-steal - fix up the PI-state in that case.
- */
- if (q.pi_state && (q.pi_state->owner != current)) {
- spin_lock(q.lock_ptr);
- ret = fixup_pi_state_owner(uaddr2, &q, current);
- spin_unlock(q.lock_ptr);
- }
- } else {
- /*
- * We have been woken up by futex_unlock_pi(), a timeout, or a
- * signal. futex_unlock_pi() will not destroy the lock_ptr nor
- * the pi_state.
- */
- WARN_ON(!q.pi_state);
- pi_mutex = &q.pi_state->pi_mutex;
- ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter);
- debug_rt_mutex_free_waiter(&rt_waiter);
-
- spin_lock(q.lock_ptr);
- /*
- * Fixup the pi_state owner and possibly acquire the lock if we
- * haven't already.
- */
- res = fixup_owner(uaddr2, &q, !ret);
- /*
- * If fixup_owner() returned an error, proprogate that. If it
- * acquired the lock, clear -ETIMEDOUT or -EINTR.
- */
- if (res)
- ret = (res < 0) ? res : 0;
-
- /* Unqueue and drop the lock. */
- unqueue_me_pi(&q);
- }
-
- /*
- * If fixup_pi_state_owner() faulted and was unable to handle the
- * fault, unlock the rt_mutex and return the fault to userspace.
- */
- if (ret == -EFAULT) {
- if (pi_mutex && rt_mutex_owner(pi_mutex) == current)
- rt_mutex_unlock(pi_mutex);
- } else if (ret == -EINTR) {
- /*
- * We've already been requeued, but cannot restart by calling
- * futex_lock_pi() directly. We could restart this syscall, but
- * it would detect that the user space "val" changed and return
- * -EWOULDBLOCK. Save the overhead of the restart and return
- * -EWOULDBLOCK directly.
- */
- ret = -EWOULDBLOCK;
- }
-
-out_put_keys:
- put_futex_key(&q.key);
-out_key2:
- put_futex_key(&key2);
-
-out:
- if (to) {
- hrtimer_cancel(&to->timer);
- destroy_hrtimer_on_stack(&to->timer);
- }
- return ret;
-}
-
-/*
- * Support for robust futexes: the kernel cleans up held futexes at
- * thread exit time.
- *
- * Implementation: user-space maintains a per-thread list of locks it
- * is holding. Upon do_exit(), the kernel carefully walks this list,
- * and marks all locks that are owned by this thread with the
- * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
- * always manipulated with the lock held, so the list is private and
- * per-thread. Userspace also maintains a per-thread 'list_op_pending'
- * field, to allow the kernel to clean up if the thread dies after
- * acquiring the lock, but just before it could have added itself to
- * the list. There can only be one such pending lock.
- */
-
-/**
- * sys_set_robust_list() - Set the robust-futex list head of a task
- * @head: pointer to the list-head
- * @len: length of the list-head, as userspace expects
- */
-SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
- size_t, len)
-{
- if (!futex_cmpxchg_enabled)
- return -ENOSYS;
- /*
- * The kernel knows only one size for now:
- */
- if (unlikely(len != sizeof(*head)))
- return -EINVAL;
-
- current->robust_list = head;
-
- return 0;
-}
-
-/**
- * sys_get_robust_list() - Get the robust-futex list head of a task
- * @pid: pid of the process [zero for current task]
- * @head_ptr: pointer to a list-head pointer, the kernel fills it in
- * @len_ptr: pointer to a length field, the kernel fills in the header size
- */
-SYSCALL_DEFINE3(get_robust_list, int, pid,
- struct robust_list_head __user * __user *, head_ptr,
- size_t __user *, len_ptr)
-{
- struct robust_list_head __user *head;
- unsigned long ret;
- struct task_struct *p;
-
- if (!futex_cmpxchg_enabled)
- return -ENOSYS;
-
- rcu_read_lock();
-
- ret = -ESRCH;
- if (!pid)
- p = current;
- else {
- p = find_task_by_vpid(pid);
- if (!p)
- goto err_unlock;
- }
-
- ret = -EPERM;
- if (!ptrace_may_access(p, PTRACE_MODE_READ))
- goto err_unlock;
-
- head = p->robust_list;
- rcu_read_unlock();
-
- if (put_user(sizeof(*head), len_ptr))
- return -EFAULT;
- return put_user(head, head_ptr);
-
-err_unlock:
- rcu_read_unlock();
-
- return ret;
-}
-
-/*
- * Process a futex-list entry, check whether it's owned by the
- * dying task, and do notification if so:
- */
-int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
-{
- u32 uval, uninitialized_var(nval), mval;
-
-retry:
- if (get_user(uval, uaddr))
- return -1;
-
- if ((uval & FUTEX_TID_MASK) == task_pid_vnr(curr)) {
- /*
- * Ok, this dying thread is truly holding a futex
- * of interest. Set the OWNER_DIED bit atomically
- * via cmpxchg, and if the value had FUTEX_WAITERS
- * set, wake up a waiter (if any). (We have to do a
- * futex_wake() even if OWNER_DIED is already set -
- * to handle the rare but possible case of recursive
- * thread-death.) The rest of the cleanup is done in
- * userspace.
- */
- mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
- /*
- * We are not holding a lock here, but we want to have
- * the pagefault_disable/enable() protection because
- * we want to handle the fault gracefully. If the
- * access fails we try to fault in the futex with R/W
- * verification via get_user_pages. get_user() above
- * does not guarantee R/W access. If that fails we
- * give up and leave the futex locked.
- */
- if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) {
- if (fault_in_user_writeable(uaddr))
- return -1;
- goto retry;
- }
- if (nval != uval)
- goto retry;
-
- /*
- * Wake robust non-PI futexes here. The wakeup of
- * PI futexes happens in exit_pi_state():
- */
- if (!pi && (uval & FUTEX_WAITERS))
- futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
- }
- return 0;
-}
-
-/*
- * Fetch a robust-list pointer. Bit 0 signals PI futexes:
- */
-static inline int fetch_robust_entry(struct robust_list __user **entry,
- struct robust_list __user * __user *head,
- unsigned int *pi)
-{
- unsigned long uentry;
-
- if (get_user(uentry, (unsigned long __user *)head))
- return -EFAULT;
-
- *entry = (void __user *)(uentry & ~1UL);
- *pi = uentry & 1;
-
- return 0;
-}
-
-/*
- * Walk curr->robust_list (very carefully, it's a userspace list!)
- * and mark any locks found there dead, and notify any waiters.
- *
- * We silently return on any sign of list-walking problem.
- */
-void exit_robust_list(struct task_struct *curr)
-{
- struct robust_list_head __user *head = curr->robust_list;
- struct robust_list __user *entry, *next_entry, *pending;
- unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
- unsigned int uninitialized_var(next_pi);
- unsigned long futex_offset;
- int rc;
-
- if (!futex_cmpxchg_enabled)
- return;
-
- /*
- * Fetch the list head (which was registered earlier, via
- * sys_set_robust_list()):
- */
- if (fetch_robust_entry(&entry, &head->list.next, &pi))
- return;
- /*
- * Fetch the relative futex offset:
- */
- if (get_user(futex_offset, &head->futex_offset))
- return;
- /*
- * Fetch any possibly pending lock-add first, and handle it
- * if it exists:
- */
- if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
- return;
-
- next_entry = NULL; /* avoid warning with gcc */
- while (entry != &head->list) {
- /*
- * Fetch the next entry in the list before calling
- * handle_futex_death:
- */
- rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
- /*
- * A pending lock might already be on the list, so
- * don't process it twice:
- */
- if (entry != pending)
- if (handle_futex_death((void __user *)entry + futex_offset,
- curr, pi))
- return;
- if (rc)
- return;
- entry = next_entry;
- pi = next_pi;
- /*
- * Avoid excessively long or circular lists:
- */
- if (!--limit)
- break;
-
- cond_resched();
- }
-
- if (pending)
- handle_futex_death((void __user *)pending + futex_offset,
- curr, pip);
-}
-
-long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
- u32 __user *uaddr2, u32 val2, u32 val3)
-{
- int cmd = op & FUTEX_CMD_MASK;
- unsigned int flags = 0;
-
- if (!(op & FUTEX_PRIVATE_FLAG))
- flags |= FLAGS_SHARED;
-
- if (op & FUTEX_CLOCK_REALTIME) {
- flags |= FLAGS_CLOCKRT;
- if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
- return -ENOSYS;
- }
-
- switch (cmd) {
- case FUTEX_LOCK_PI:
- case FUTEX_UNLOCK_PI:
- case FUTEX_TRYLOCK_PI:
- case FUTEX_WAIT_REQUEUE_PI:
- case FUTEX_CMP_REQUEUE_PI:
- if (!futex_cmpxchg_enabled)
- return -ENOSYS;
- }
-
- switch (cmd) {
- case FUTEX_WAIT:
- val3 = FUTEX_BITSET_MATCH_ANY;
- case FUTEX_WAIT_BITSET:
- return futex_wait(uaddr, flags, val, timeout, val3);
- case FUTEX_WAKE:
- val3 = FUTEX_BITSET_MATCH_ANY;
- case FUTEX_WAKE_BITSET:
- return futex_wake(uaddr, flags, val, val3);
- case FUTEX_REQUEUE:
- return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
- case FUTEX_CMP_REQUEUE:
- return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
- case FUTEX_WAKE_OP:
- return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
- case FUTEX_LOCK_PI:
- return futex_lock_pi(uaddr, flags, timeout, 0);
- case FUTEX_UNLOCK_PI:
- return futex_unlock_pi(uaddr, flags);
- case FUTEX_TRYLOCK_PI:
- return futex_lock_pi(uaddr, flags, NULL, 1);
- case FUTEX_WAIT_REQUEUE_PI:
- val3 = FUTEX_BITSET_MATCH_ANY;
- return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
- uaddr2);
- case FUTEX_CMP_REQUEUE_PI:
- return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
- }
- return -ENOSYS;
-}
-
-
-SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
- struct timespec __user *, utime, u32 __user *, uaddr2,
- u32, val3)
-{
- struct timespec ts;
- ktime_t t, *tp = NULL;
- u32 val2 = 0;
- int cmd = op & FUTEX_CMD_MASK;
-
- if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
- cmd == FUTEX_WAIT_BITSET ||
- cmd == FUTEX_WAIT_REQUEUE_PI)) {
- if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
- return -EFAULT;
- if (!timespec_valid(&ts))
- return -EINVAL;
-
- t = timespec_to_ktime(ts);
- if (cmd == FUTEX_WAIT)
- t = ktime_add_safe(ktime_get(), t);
- tp = &t;
- }
- /*
- * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*.
- * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP.
- */
- if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
- cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
- val2 = (u32) (unsigned long) utime;
-
- return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
-}
-
-static void __init futex_detect_cmpxchg(void)
-{
-#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
- u32 curval;
-
- /*
- * This will fail and we want it. Some arch implementations do
- * runtime detection of the futex_atomic_cmpxchg_inatomic()
- * functionality. We want to know that before we call in any
- * of the complex code paths. Also we want to prevent
- * registration of robust lists in that case. NULL is
- * guaranteed to fault and we get -EFAULT on functional
- * implementation, the non-functional ones will return
- * -ENOSYS.
- */
- if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
- futex_cmpxchg_enabled = 1;
-#endif
-}
-
-static int __init futex_init(void)
-{
- unsigned int futex_shift;
- unsigned long i;
-
-#if CONFIG_BASE_SMALL
- futex_hashsize = 16;
-#else
- futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
-#endif
-
- futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
- futex_hashsize, 0,
- futex_hashsize < 256 ? HASH_SMALL : 0,
- &futex_shift, NULL,
- futex_hashsize, futex_hashsize);
- futex_hashsize = 1UL << futex_shift;
-
- futex_detect_cmpxchg();
-
- for (i = 0; i < futex_hashsize; i++) {
- atomic_set(&futex_queues[i].waiters, 0);
- plist_head_init(&futex_queues[i].chain);
- spin_lock_init(&futex_queues[i].lock);
- }
-
- return 0;
-}
-__initcall(futex_init);
diff --git a/kernel/futex/Makefile b/kernel/futex/Makefile
new file mode 100644
index 000000000000..b77188d1fa07
--- /dev/null
+++ b/kernel/futex/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-y += core.o syscalls.o pi.o requeue.o waitwake.o
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
new file mode 100644
index 000000000000..cf7e610eac42
--- /dev/null
+++ b/kernel/futex/core.c
@@ -0,0 +1,2012 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Fast Userspace Mutexes (which I call "Futexes!").
+ * (C) Rusty Russell, IBM 2002
+ *
+ * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
+ * (C) Copyright 2003 Red Hat Inc, All Rights Reserved
+ *
+ * Removed page pinning, fix privately mapped COW pages and other cleanups
+ * (C) Copyright 2003, 2004 Jamie Lokier
+ *
+ * Robust futex support started by Ingo Molnar
+ * (C) Copyright 2006 Red Hat Inc, All Rights Reserved
+ * Thanks to Thomas Gleixner for suggestions, analysis and fixes.
+ *
+ * PI-futex support started by Ingo Molnar and Thomas Gleixner
+ * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ * PRIVATE futexes by Eric Dumazet
+ * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
+ *
+ * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
+ * Copyright (C) IBM Corporation, 2009
+ * Thanks to Thomas Gleixner for conceptual design and careful reviews.
+ *
+ * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
+ * enough at me, Linus for the original (flawed) idea, Matthew
+ * Kirkwood for proof-of-concept implementation.
+ *
+ * "The futexes are also cursed."
+ * "But they come in a choice of three flavours!"
+ */
+#include <linux/compat.h>
+#include <linux/jhash.h>
+#include <linux/pagemap.h>
+#include <linux/debugfs.h>
+#include <linux/plist.h>
+#include <linux/gfp.h>
+#include <linux/vmalloc.h>
+#include <linux/memblock.h>
+#include <linux/fault-inject.h>
+#include <linux/slab.h>
+#include <linux/prctl.h>
+#include <linux/mempolicy.h>
+#include <linux/mmap_lock.h>
+
+#include "futex.h"
+#include "../locking/rtmutex_common.h"
+
+/*
+ * The base of the bucket array and its size are always used together
+ * (after initialization only in futex_hash()), so ensure that they
+ * reside in the same cacheline.
+ */
+static struct {
+ unsigned long hashmask;
+ unsigned int hashshift;
+ struct futex_hash_bucket *queues[MAX_NUMNODES];
+} __futex_data __read_mostly __aligned(2*sizeof(long));
+
+#define futex_hashmask (__futex_data.hashmask)
+#define futex_hashshift (__futex_data.hashshift)
+#define futex_queues (__futex_data.queues)
+
+struct futex_private_hash {
+ int state;
+ unsigned int hash_mask;
+ struct rcu_head rcu;
+ void *mm;
+ bool custom;
+ struct futex_hash_bucket queues[];
+};
+
+/*
+ * Fault injections for futexes.
+ */
+#ifdef CONFIG_FAIL_FUTEX
+
+static struct {
+ struct fault_attr attr;
+
+ bool ignore_private;
+} fail_futex = {
+ .attr = FAULT_ATTR_INITIALIZER,
+ .ignore_private = false,
+};
+
+static int __init setup_fail_futex(char *str)
+{
+ return setup_fault_attr(&fail_futex.attr, str);
+}
+__setup("fail_futex=", setup_fail_futex);
+
+bool should_fail_futex(bool fshared)
+{
+ if (fail_futex.ignore_private && !fshared)
+ return false;
+
+ return should_fail(&fail_futex.attr, 1);
+}
+
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+
+static int __init fail_futex_debugfs(void)
+{
+ umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
+ struct dentry *dir;
+
+ dir = fault_create_debugfs_attr("fail_futex", NULL,
+ &fail_futex.attr);
+ if (IS_ERR(dir))
+ return PTR_ERR(dir);
+
+ debugfs_create_bool("ignore-private", mode, dir,
+ &fail_futex.ignore_private);
+ return 0;
+}
+
+late_initcall(fail_futex_debugfs);
+
+#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
+
+#endif /* CONFIG_FAIL_FUTEX */
+
+static struct futex_hash_bucket *
+__futex_hash(union futex_key *key, struct futex_private_hash *fph);
+
+#ifdef CONFIG_FUTEX_PRIVATE_HASH
+static bool futex_ref_get(struct futex_private_hash *fph);
+static bool futex_ref_put(struct futex_private_hash *fph);
+static bool futex_ref_is_dead(struct futex_private_hash *fph);
+
+enum { FR_PERCPU = 0, FR_ATOMIC };
+
+static inline bool futex_key_is_private(union futex_key *key)
+{
+ /*
+ * Relies on get_futex_key() to set either bit for shared
+ * futexes -- see comment with union futex_key.
+ */
+ return !(key->both.offset & (FUT_OFF_INODE | FUT_OFF_MMSHARED));
+}
+
+static bool futex_private_hash_get(struct futex_private_hash *fph)
+{
+ return futex_ref_get(fph);
+}
+
+void futex_private_hash_put(struct futex_private_hash *fph)
+{
+ if (futex_ref_put(fph))
+ wake_up_var(fph->mm);
+}
+
+/**
+ * futex_hash_get - Get an additional reference for the local hash.
+ * @hb: ptr to the private local hash.
+ *
+ * Obtain an additional reference for the already obtained hash bucket. The
+ * caller must already own an reference.
+ */
+void futex_hash_get(struct futex_hash_bucket *hb)
+{
+ struct futex_private_hash *fph = hb->priv;
+
+ if (!fph)
+ return;
+ WARN_ON_ONCE(!futex_private_hash_get(fph));
+}
+
+void futex_hash_put(struct futex_hash_bucket *hb)
+{
+ struct futex_private_hash *fph = hb->priv;
+
+ if (!fph)
+ return;
+ futex_private_hash_put(fph);
+}
+
+static struct futex_hash_bucket *
+__futex_hash_private(union futex_key *key, struct futex_private_hash *fph)
+{
+ u32 hash;
+
+ if (!futex_key_is_private(key))
+ return NULL;
+
+ if (!fph)
+ fph = rcu_dereference(key->private.mm->futex_phash);
+ if (!fph || !fph->hash_mask)
+ return NULL;
+
+ hash = jhash2((void *)&key->private.address,
+ sizeof(key->private.address) / 4,
+ key->both.offset);
+ return &fph->queues[hash & fph->hash_mask];
+}
+
+static void futex_rehash_private(struct futex_private_hash *old,
+ struct futex_private_hash *new)
+{
+ struct futex_hash_bucket *hb_old, *hb_new;
+ unsigned int slots = old->hash_mask + 1;
+ unsigned int i;
+
+ for (i = 0; i < slots; i++) {
+ struct futex_q *this, *tmp;
+
+ hb_old = &old->queues[i];
+
+ spin_lock(&hb_old->lock);
+ plist_for_each_entry_safe(this, tmp, &hb_old->chain, list) {
+
+ plist_del(&this->list, &hb_old->chain);
+ futex_hb_waiters_dec(hb_old);
+
+ WARN_ON_ONCE(this->lock_ptr != &hb_old->lock);
+
+ hb_new = __futex_hash(&this->key, new);
+ futex_hb_waiters_inc(hb_new);
+ /*
+ * The new pointer isn't published yet but an already
+ * moved user can be unqueued due to timeout or signal.
+ */
+ spin_lock_nested(&hb_new->lock, SINGLE_DEPTH_NESTING);
+ plist_add(&this->list, &hb_new->chain);
+ this->lock_ptr = &hb_new->lock;
+ spin_unlock(&hb_new->lock);
+ }
+ spin_unlock(&hb_old->lock);
+ }
+}
+
+static bool __futex_pivot_hash(struct mm_struct *mm,
+ struct futex_private_hash *new)
+{
+ struct futex_private_hash *fph;
+
+ WARN_ON_ONCE(mm->futex_phash_new);
+
+ fph = rcu_dereference_protected(mm->futex_phash,
+ lockdep_is_held(&mm->futex_hash_lock));
+ if (fph) {
+ if (!futex_ref_is_dead(fph)) {
+ mm->futex_phash_new = new;
+ return false;
+ }
+
+ futex_rehash_private(fph, new);
+ }
+ new->state = FR_PERCPU;
+ scoped_guard(rcu) {
+ mm->futex_batches = get_state_synchronize_rcu();
+ rcu_assign_pointer(mm->futex_phash, new);
+ }
+ kvfree_rcu(fph, rcu);
+ return true;
+}
+
+static void futex_pivot_hash(struct mm_struct *mm)
+{
+ scoped_guard(mutex, &mm->futex_hash_lock) {
+ struct futex_private_hash *fph;
+
+ fph = mm->futex_phash_new;
+ if (fph) {
+ mm->futex_phash_new = NULL;
+ __futex_pivot_hash(mm, fph);
+ }
+ }
+}
+
+struct futex_private_hash *futex_private_hash(void)
+{
+ struct mm_struct *mm = current->mm;
+ /*
+ * Ideally we don't loop. If there is a replacement in progress
+ * then a new private hash is already prepared and a reference can't be
+ * obtained once the last user dropped it's.
+ * In that case we block on mm_struct::futex_hash_lock and either have
+ * to perform the replacement or wait while someone else is doing the
+ * job. Eitherway, on the second iteration we acquire a reference on the
+ * new private hash or loop again because a new replacement has been
+ * requested.
+ */
+again:
+ scoped_guard(rcu) {
+ struct futex_private_hash *fph;
+
+ fph = rcu_dereference(mm->futex_phash);
+ if (!fph)
+ return NULL;
+
+ if (futex_private_hash_get(fph))
+ return fph;
+ }
+ futex_pivot_hash(mm);
+ goto again;
+}
+
+struct futex_hash_bucket *futex_hash(union futex_key *key)
+{
+ struct futex_private_hash *fph;
+ struct futex_hash_bucket *hb;
+
+again:
+ scoped_guard(rcu) {
+ hb = __futex_hash(key, NULL);
+ fph = hb->priv;
+
+ if (!fph || futex_private_hash_get(fph))
+ return hb;
+ }
+ futex_pivot_hash(key->private.mm);
+ goto again;
+}
+
+#else /* !CONFIG_FUTEX_PRIVATE_HASH */
+
+static struct futex_hash_bucket *
+__futex_hash_private(union futex_key *key, struct futex_private_hash *fph)
+{
+ return NULL;
+}
+
+struct futex_hash_bucket *futex_hash(union futex_key *key)
+{
+ return __futex_hash(key, NULL);
+}
+
+#endif /* CONFIG_FUTEX_PRIVATE_HASH */
+
+#ifdef CONFIG_FUTEX_MPOL
+
+static int __futex_key_to_node(struct mm_struct *mm, unsigned long addr)
+{
+ struct vm_area_struct *vma = vma_lookup(mm, addr);
+ struct mempolicy *mpol;
+ int node = FUTEX_NO_NODE;
+
+ if (!vma)
+ return FUTEX_NO_NODE;
+
+ mpol = vma_policy(vma);
+ if (!mpol)
+ return FUTEX_NO_NODE;
+
+ switch (mpol->mode) {
+ case MPOL_PREFERRED:
+ node = first_node(mpol->nodes);
+ break;
+ case MPOL_PREFERRED_MANY:
+ case MPOL_BIND:
+ if (mpol->home_node != NUMA_NO_NODE)
+ node = mpol->home_node;
+ break;
+ default:
+ break;
+ }
+
+ return node;
+}
+
+static int futex_key_to_node_opt(struct mm_struct *mm, unsigned long addr)
+{
+ int seq, node;
+
+ guard(rcu)();
+
+ if (!mmap_lock_speculate_try_begin(mm, &seq))
+ return -EBUSY;
+
+ node = __futex_key_to_node(mm, addr);
+
+ if (mmap_lock_speculate_retry(mm, seq))
+ return -EAGAIN;
+
+ return node;
+}
+
+static int futex_mpol(struct mm_struct *mm, unsigned long addr)
+{
+ int node;
+
+ node = futex_key_to_node_opt(mm, addr);
+ if (node >= FUTEX_NO_NODE)
+ return node;
+
+ guard(mmap_read_lock)(mm);
+ return __futex_key_to_node(mm, addr);
+}
+
+#else /* !CONFIG_FUTEX_MPOL */
+
+static int futex_mpol(struct mm_struct *mm, unsigned long addr)
+{
+ return FUTEX_NO_NODE;
+}
+
+#endif /* CONFIG_FUTEX_MPOL */
+
+/**
+ * __futex_hash - Return the hash bucket
+ * @key: Pointer to the futex key for which the hash is calculated
+ * @fph: Pointer to private hash if known
+ *
+ * We hash on the keys returned from get_futex_key (see below) and return the
+ * corresponding hash bucket.
+ * If the FUTEX is PROCESS_PRIVATE then a per-process hash bucket (from the
+ * private hash) is returned if existing. Otherwise a hash bucket from the
+ * global hash is returned.
+ */
+static struct futex_hash_bucket *
+__futex_hash(union futex_key *key, struct futex_private_hash *fph)
+{
+ int node = key->both.node;
+ u32 hash;
+
+ if (node == FUTEX_NO_NODE) {
+ struct futex_hash_bucket *hb;
+
+ hb = __futex_hash_private(key, fph);
+ if (hb)
+ return hb;
+ }
+
+ hash = jhash2((u32 *)key,
+ offsetof(typeof(*key), both.offset) / sizeof(u32),
+ key->both.offset);
+
+ if (node == FUTEX_NO_NODE) {
+ /*
+ * In case of !FLAGS_NUMA, use some unused hash bits to pick a
+ * node -- this ensures regular futexes are interleaved across
+ * the nodes and avoids having to allocate multiple
+ * hash-tables.
+ *
+ * NOTE: this isn't perfectly uniform, but it is fast and
+ * handles sparse node masks.
+ */
+ node = (hash >> futex_hashshift) % nr_node_ids;
+ if (!node_possible(node)) {
+ node = find_next_bit_wrap(node_possible_map.bits,
+ nr_node_ids, node);
+ }
+ }
+
+ return &futex_queues[node][hash & futex_hashmask];
+}
+
+/**
+ * futex_setup_timer - set up the sleeping hrtimer.
+ * @time: ptr to the given timeout value
+ * @timeout: the hrtimer_sleeper structure to be set up
+ * @flags: futex flags
+ * @range_ns: optional range in ns
+ *
+ * Return: Initialized hrtimer_sleeper structure or NULL if no timeout
+ * value given
+ */
+struct hrtimer_sleeper *
+futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
+ int flags, u64 range_ns)
+{
+ if (!time)
+ return NULL;
+
+ hrtimer_setup_sleeper_on_stack(timeout,
+ (flags & FLAGS_CLOCKRT) ? CLOCK_REALTIME : CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS);
+ /*
+ * If range_ns is 0, calling hrtimer_set_expires_range_ns() is
+ * effectively the same as calling hrtimer_set_expires().
+ */
+ hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns);
+
+ return timeout;
+}
+
+/*
+ * Generate a machine wide unique identifier for this inode.
+ *
+ * This relies on u64 not wrapping in the life-time of the machine; which with
+ * 1ns resolution means almost 585 years.
+ *
+ * This further relies on the fact that a well formed program will not unmap
+ * the file while it has a (shared) futex waiting on it. This mapping will have
+ * a file reference which pins the mount and inode.
+ *
+ * If for some reason an inode gets evicted and read back in again, it will get
+ * a new sequence number and will _NOT_ match, even though it is the exact same
+ * file.
+ *
+ * It is important that futex_match() will never have a false-positive, esp.
+ * for PI futexes that can mess up the state. The above argues that false-negatives
+ * are only possible for malformed programs.
+ */
+static u64 get_inode_sequence_number(struct inode *inode)
+{
+ static atomic64_t i_seq;
+ u64 old;
+
+ /* Does the inode already have a sequence number? */
+ old = atomic64_read(&inode->i_sequence);
+ if (likely(old))
+ return old;
+
+ for (;;) {
+ u64 new = atomic64_inc_return(&i_seq);
+ if (WARN_ON_ONCE(!new))
+ continue;
+
+ old = 0;
+ if (!atomic64_try_cmpxchg_relaxed(&inode->i_sequence, &old, new))
+ return old;
+ return new;
+ }
+}
+
+/**
+ * get_futex_key() - Get parameters which are the keys for a futex
+ * @uaddr: virtual address of the futex
+ * @flags: FLAGS_*
+ * @key: address where result is stored.
+ * @rw: mapping needs to be read/write (values: FUTEX_READ,
+ * FUTEX_WRITE)
+ *
+ * Return: a negative error code or 0
+ *
+ * The key words are stored in @key on success.
+ *
+ * For shared mappings (when @fshared), the key is:
+ *
+ * ( inode->i_sequence, page offset within mapping, offset_within_page )
+ *
+ * [ also see get_inode_sequence_number() ]
+ *
+ * For private mappings (or when !@fshared), the key is:
+ *
+ * ( current->mm, address, 0 )
+ *
+ * This allows (cross process, where applicable) identification of the futex
+ * without keeping the page pinned for the duration of the FUTEX_WAIT.
+ *
+ * lock_page() might sleep, the caller should not hold a spinlock.
+ */
+int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key,
+ enum futex_access rw)
+{
+ unsigned long address = (unsigned long)uaddr;
+ struct mm_struct *mm = current->mm;
+ struct page *page;
+ struct folio *folio;
+ struct address_space *mapping;
+ int node, err, size, ro = 0;
+ bool node_updated = false;
+ bool fshared;
+
+ fshared = flags & FLAGS_SHARED;
+ size = futex_size(flags);
+ if (flags & FLAGS_NUMA)
+ size *= 2;
+
+ /*
+ * The futex address must be "naturally" aligned.
+ */
+ key->both.offset = address % PAGE_SIZE;
+ if (unlikely((address % size) != 0))
+ return -EINVAL;
+ address -= key->both.offset;
+
+ if (unlikely(!access_ok(uaddr, size)))
+ return -EFAULT;
+
+ if (unlikely(should_fail_futex(fshared)))
+ return -EFAULT;
+
+ node = FUTEX_NO_NODE;
+
+ if (flags & FLAGS_NUMA) {
+ u32 __user *naddr = (void *)uaddr + size / 2;
+
+ if (get_user_inline(node, naddr))
+ return -EFAULT;
+
+ if ((node != FUTEX_NO_NODE) &&
+ ((unsigned int)node >= MAX_NUMNODES || !node_possible(node)))
+ return -EINVAL;
+ }
+
+ if (node == FUTEX_NO_NODE && (flags & FLAGS_MPOL)) {
+ node = futex_mpol(mm, address);
+ node_updated = true;
+ }
+
+ if (flags & FLAGS_NUMA) {
+ u32 __user *naddr = (void *)uaddr + size / 2;
+
+ if (node == FUTEX_NO_NODE) {
+ node = numa_node_id();
+ node_updated = true;
+ }
+ if (node_updated && put_user_inline(node, naddr))
+ return -EFAULT;
+ }
+
+ key->both.node = node;
+
+ /*
+ * PROCESS_PRIVATE futexes are fast.
+ * As the mm cannot disappear under us and the 'key' only needs
+ * virtual address, we dont even have to find the underlying vma.
+ * Note : We do have to check 'uaddr' is a valid user address,
+ * but access_ok() should be faster than find_vma()
+ */
+ if (!fshared) {
+ /*
+ * On no-MMU, shared futexes are treated as private, therefore
+ * we must not include the current process in the key. Since
+ * there is only one address space, the address is a unique key
+ * on its own.
+ */
+ if (IS_ENABLED(CONFIG_MMU))
+ key->private.mm = mm;
+ else
+ key->private.mm = NULL;
+
+ key->private.address = address;
+ return 0;
+ }
+
+again:
+ /* Ignore any VERIFY_READ mapping (futex common case) */
+ if (unlikely(should_fail_futex(true)))
+ return -EFAULT;
+
+ err = get_user_pages_fast(address, 1, FOLL_WRITE, &page);
+ /*
+ * If write access is not required (eg. FUTEX_WAIT), try
+ * and get read-only access.
+ */
+ if (err == -EFAULT && rw == FUTEX_READ) {
+ err = get_user_pages_fast(address, 1, 0, &page);
+ ro = 1;
+ }
+ if (err < 0)
+ return err;
+ else
+ err = 0;
+
+ /*
+ * The treatment of mapping from this point on is critical. The folio
+ * lock protects many things but in this context the folio lock
+ * stabilizes mapping, prevents inode freeing in the shared
+ * file-backed region case and guards against movement to swap cache.
+ *
+ * Strictly speaking the folio lock is not needed in all cases being
+ * considered here and folio lock forces unnecessarily serialization.
+ * From this point on, mapping will be re-verified if necessary and
+ * folio lock will be acquired only if it is unavoidable
+ *
+ * Mapping checks require the folio so it is looked up now. For
+ * anonymous pages, it does not matter if the folio is split
+ * in the future as the key is based on the address. For
+ * filesystem-backed pages, the precise page is required as the
+ * index of the page determines the key.
+ */
+ folio = page_folio(page);
+ mapping = READ_ONCE(folio->mapping);
+
+ /*
+ * If folio->mapping is NULL, then it cannot be an anonymous
+ * page; but it might be the ZERO_PAGE or in the gate area or
+ * in a special mapping (all cases which we are happy to fail);
+ * or it may have been a good file page when get_user_pages_fast
+ * found it, but truncated or holepunched or subjected to
+ * invalidate_complete_page2 before we got the folio lock (also
+ * cases which we are happy to fail). And we hold a reference,
+ * so refcount care in invalidate_inode_page's remove_mapping
+ * prevents drop_caches from setting mapping to NULL beneath us.
+ *
+ * The case we do have to guard against is when memory pressure made
+ * shmem_writepage move it from filecache to swapcache beneath us:
+ * an unlikely race, but we do need to retry for folio->mapping.
+ */
+ if (unlikely(!mapping)) {
+ int shmem_swizzled;
+
+ /*
+ * Folio lock is required to identify which special case above
+ * applies. If this is really a shmem page then the folio lock
+ * will prevent unexpected transitions.
+ */
+ folio_lock(folio);
+ shmem_swizzled = folio_test_swapcache(folio) || folio->mapping;
+ folio_unlock(folio);
+ folio_put(folio);
+
+ if (shmem_swizzled)
+ goto again;
+
+ return -EFAULT;
+ }
+
+ /*
+ * Private mappings are handled in a simple way.
+ *
+ * If the futex key is stored in anonymous memory, then the associated
+ * object is the mm which is implicitly pinned by the calling process.
+ *
+ * NOTE: When userspace waits on a MAP_SHARED mapping, even if
+ * it's a read-only handle, it's expected that futexes attach to
+ * the object not the particular process.
+ */
+ if (folio_test_anon(folio)) {
+ /*
+ * A RO anonymous page will never change and thus doesn't make
+ * sense for futex operations.
+ */
+ if (unlikely(should_fail_futex(true)) || ro) {
+ err = -EFAULT;
+ goto out;
+ }
+
+ key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
+ key->private.mm = mm;
+ key->private.address = address;
+
+ } else {
+ struct inode *inode;
+
+ /*
+ * The associated futex object in this case is the inode and
+ * the folio->mapping must be traversed. Ordinarily this should
+ * be stabilised under folio lock but it's not strictly
+ * necessary in this case as we just want to pin the inode, not
+ * update i_pages or anything like that.
+ *
+ * The RCU read lock is taken as the inode is finally freed
+ * under RCU. If the mapping still matches expectations then the
+ * mapping->host can be safely accessed as being a valid inode.
+ */
+ rcu_read_lock();
+
+ if (READ_ONCE(folio->mapping) != mapping) {
+ rcu_read_unlock();
+ folio_put(folio);
+
+ goto again;
+ }
+
+ inode = READ_ONCE(mapping->host);
+ if (!inode) {
+ rcu_read_unlock();
+ folio_put(folio);
+
+ goto again;
+ }
+
+ key->both.offset |= FUT_OFF_INODE; /* inode-based key */
+ key->shared.i_seq = get_inode_sequence_number(inode);
+ key->shared.pgoff = page_pgoff(folio, page);
+ rcu_read_unlock();
+ }
+
+out:
+ folio_put(folio);
+ return err;
+}
+
+/**
+ * fault_in_user_writeable() - Fault in user address and verify RW access
+ * @uaddr: pointer to faulting user space address
+ *
+ * Slow path to fixup the fault we just took in the atomic write
+ * access to @uaddr.
+ *
+ * We have no generic implementation of a non-destructive write to the
+ * user address. We know that we faulted in the atomic pagefault
+ * disabled section so we can as well avoid the #PF overhead by
+ * calling get_user_pages() right away.
+ */
+int fault_in_user_writeable(u32 __user *uaddr)
+{
+ struct mm_struct *mm = current->mm;
+ int ret;
+
+ mmap_read_lock(mm);
+ ret = fixup_user_fault(mm, (unsigned long)uaddr,
+ FAULT_FLAG_WRITE, NULL);
+ mmap_read_unlock(mm);
+
+ return ret < 0 ? ret : 0;
+}
+
+/**
+ * futex_top_waiter() - Return the highest priority waiter on a futex
+ * @hb: the hash bucket the futex_q's reside in
+ * @key: the futex key (to distinguish it from other futex futex_q's)
+ *
+ * Must be called with the hb lock held.
+ */
+struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *key)
+{
+ struct futex_q *this;
+
+ plist_for_each_entry(this, &hb->chain, list) {
+ if (futex_match(&this->key, key))
+ return this;
+ }
+ return NULL;
+}
+
+/**
+ * wait_for_owner_exiting - Block until the owner has exited
+ * @ret: owner's current futex lock status
+ * @exiting: Pointer to the exiting task
+ *
+ * Caller must hold a refcount on @exiting.
+ */
+void wait_for_owner_exiting(int ret, struct task_struct *exiting)
+{
+ if (ret != -EBUSY) {
+ WARN_ON_ONCE(exiting);
+ return;
+ }
+
+ if (WARN_ON_ONCE(ret == -EBUSY && !exiting))
+ return;
+
+ mutex_lock(&exiting->futex_exit_mutex);
+ /*
+ * No point in doing state checking here. If the waiter got here
+ * while the task was in exec()->exec_futex_release() then it can
+ * have any FUTEX_STATE_* value when the waiter has acquired the
+ * mutex. OK, if running, EXITING or DEAD if it reached exit()
+ * already. Highly unlikely and not a problem. Just one more round
+ * through the futex maze.
+ */
+ mutex_unlock(&exiting->futex_exit_mutex);
+
+ put_task_struct(exiting);
+}
+
+/**
+ * __futex_unqueue() - Remove the futex_q from its futex_hash_bucket
+ * @q: The futex_q to unqueue
+ *
+ * The q->lock_ptr must not be NULL and must be held by the caller.
+ */
+void __futex_unqueue(struct futex_q *q)
+{
+ struct futex_hash_bucket *hb;
+
+ if (WARN_ON_SMP(!q->lock_ptr) || WARN_ON(plist_node_empty(&q->list)))
+ return;
+ lockdep_assert_held(q->lock_ptr);
+
+ hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
+ plist_del(&q->list, &hb->chain);
+ futex_hb_waiters_dec(hb);
+}
+
+/* The key must be already stored in q->key. */
+void futex_q_lock(struct futex_q *q, struct futex_hash_bucket *hb)
+ __acquires(&hb->lock)
+{
+ /*
+ * Increment the counter before taking the lock so that
+ * a potential waker won't miss a to-be-slept task that is
+ * waiting for the spinlock. This is safe as all futex_q_lock()
+ * users end up calling futex_queue(). Similarly, for housekeeping,
+ * decrement the counter at futex_q_unlock() when some error has
+ * occurred and we don't end up adding the task to the list.
+ */
+ futex_hb_waiters_inc(hb); /* implies smp_mb(); (A) */
+
+ q->lock_ptr = &hb->lock;
+
+ spin_lock(&hb->lock);
+}
+
+void futex_q_unlock(struct futex_hash_bucket *hb)
+ __releases(&hb->lock)
+{
+ futex_hb_waiters_dec(hb);
+ spin_unlock(&hb->lock);
+}
+
+void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb,
+ struct task_struct *task)
+{
+ int prio;
+
+ /*
+ * The priority used to register this element is
+ * - either the real thread-priority for the real-time threads
+ * (i.e. threads with a priority lower than MAX_RT_PRIO)
+ * - or MAX_RT_PRIO for non-RT threads.
+ * Thus, all RT-threads are woken first in priority order, and
+ * the others are woken last, in FIFO order.
+ */
+ prio = min(current->normal_prio, MAX_RT_PRIO);
+
+ plist_node_init(&q->list, prio);
+ plist_add(&q->list, &hb->chain);
+ q->task = task;
+}
+
+/**
+ * futex_unqueue() - Remove the futex_q from its futex_hash_bucket
+ * @q: The futex_q to unqueue
+ *
+ * The q->lock_ptr must not be held by the caller. A call to futex_unqueue() must
+ * be paired with exactly one earlier call to futex_queue().
+ *
+ * Return:
+ * - 1 - if the futex_q was still queued (and we removed unqueued it);
+ * - 0 - if the futex_q was already removed by the waking thread
+ */
+int futex_unqueue(struct futex_q *q)
+{
+ spinlock_t *lock_ptr;
+ int ret = 0;
+
+ /* RCU so lock_ptr is not going away during locking. */
+ guard(rcu)();
+ /* In the common case we don't take the spinlock, which is nice. */
+retry:
+ /*
+ * q->lock_ptr can change between this read and the following spin_lock.
+ * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and
+ * optimizing lock_ptr out of the logic below.
+ */
+ lock_ptr = READ_ONCE(q->lock_ptr);
+ if (lock_ptr != NULL) {
+ spin_lock(lock_ptr);
+ /*
+ * q->lock_ptr can change between reading it and
+ * spin_lock(), causing us to take the wrong lock. This
+ * corrects the race condition.
+ *
+ * Reasoning goes like this: if we have the wrong lock,
+ * q->lock_ptr must have changed (maybe several times)
+ * between reading it and the spin_lock(). It can
+ * change again after the spin_lock() but only if it was
+ * already changed before the spin_lock(). It cannot,
+ * however, change back to the original value. Therefore
+ * we can detect whether we acquired the correct lock.
+ */
+ if (unlikely(lock_ptr != q->lock_ptr)) {
+ spin_unlock(lock_ptr);
+ goto retry;
+ }
+ __futex_unqueue(q);
+
+ BUG_ON(q->pi_state);
+
+ spin_unlock(lock_ptr);
+ ret = 1;
+ }
+
+ return ret;
+}
+
+void futex_q_lockptr_lock(struct futex_q *q)
+{
+ spinlock_t *lock_ptr;
+
+ /*
+ * See futex_unqueue() why lock_ptr can change.
+ */
+ guard(rcu)();
+retry:
+ lock_ptr = READ_ONCE(q->lock_ptr);
+ spin_lock(lock_ptr);
+
+ if (unlikely(lock_ptr != q->lock_ptr)) {
+ spin_unlock(lock_ptr);
+ goto retry;
+ }
+}
+
+/*
+ * PI futexes can not be requeued and must remove themselves from the hash
+ * bucket. The hash bucket lock (i.e. lock_ptr) is held.
+ */
+void futex_unqueue_pi(struct futex_q *q)
+{
+ /*
+ * If the lock was not acquired (due to timeout or signal) then the
+ * rt_waiter is removed before futex_q is. If this is observed by
+ * an unlocker after dropping the rtmutex wait lock and before
+ * acquiring the hash bucket lock, then the unlocker dequeues the
+ * futex_q from the hash bucket list to guarantee consistent state
+ * vs. userspace. Therefore the dequeue here must be conditional.
+ */
+ if (!plist_node_empty(&q->list))
+ __futex_unqueue(q);
+
+ BUG_ON(!q->pi_state);
+ put_pi_state(q->pi_state);
+ q->pi_state = NULL;
+}
+
+/* Constants for the pending_op argument of handle_futex_death */
+#define HANDLE_DEATH_PENDING true
+#define HANDLE_DEATH_LIST false
+
+/*
+ * Process a futex-list entry, check whether it's owned by the
+ * dying task, and do notification if so:
+ */
+static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr,
+ bool pi, bool pending_op)
+{
+ u32 uval, nval, mval;
+ pid_t owner;
+ int err;
+
+ /* Futex address must be 32bit aligned */
+ if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0)
+ return -1;
+
+retry:
+ if (get_user(uval, uaddr))
+ return -1;
+
+ /*
+ * Special case for regular (non PI) futexes. The unlock path in
+ * user space has two race scenarios:
+ *
+ * 1. The unlock path releases the user space futex value and
+ * before it can execute the futex() syscall to wake up
+ * waiters it is killed.
+ *
+ * 2. A woken up waiter is killed before it can acquire the
+ * futex in user space.
+ *
+ * In the second case, the wake up notification could be generated
+ * by the unlock path in user space after setting the futex value
+ * to zero or by the kernel after setting the OWNER_DIED bit below.
+ *
+ * In both cases the TID validation below prevents a wakeup of
+ * potential waiters which can cause these waiters to block
+ * forever.
+ *
+ * In both cases the following conditions are met:
+ *
+ * 1) task->robust_list->list_op_pending != NULL
+ * @pending_op == true
+ * 2) The owner part of user space futex value == 0
+ * 3) Regular futex: @pi == false
+ *
+ * If these conditions are met, it is safe to attempt waking up a
+ * potential waiter without touching the user space futex value and
+ * trying to set the OWNER_DIED bit. If the futex value is zero,
+ * the rest of the user space mutex state is consistent, so a woken
+ * waiter will just take over the uncontended futex. Setting the
+ * OWNER_DIED bit would create inconsistent state and malfunction
+ * of the user space owner died handling. Otherwise, the OWNER_DIED
+ * bit is already set, and the woken waiter is expected to deal with
+ * this.
+ */
+ owner = uval & FUTEX_TID_MASK;
+
+ if (pending_op && !pi && !owner) {
+ futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1,
+ FUTEX_BITSET_MATCH_ANY);
+ return 0;
+ }
+
+ if (owner != task_pid_vnr(curr))
+ return 0;
+
+ /*
+ * Ok, this dying thread is truly holding a futex
+ * of interest. Set the OWNER_DIED bit atomically
+ * via cmpxchg, and if the value had FUTEX_WAITERS
+ * set, wake up a waiter (if any). (We have to do a
+ * futex_wake() even if OWNER_DIED is already set -
+ * to handle the rare but possible case of recursive
+ * thread-death.) The rest of the cleanup is done in
+ * userspace.
+ */
+ mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
+
+ /*
+ * We are not holding a lock here, but we want to have
+ * the pagefault_disable/enable() protection because
+ * we want to handle the fault gracefully. If the
+ * access fails we try to fault in the futex with R/W
+ * verification via get_user_pages. get_user() above
+ * does not guarantee R/W access. If that fails we
+ * give up and leave the futex locked.
+ */
+ if ((err = futex_cmpxchg_value_locked(&nval, uaddr, uval, mval))) {
+ switch (err) {
+ case -EFAULT:
+ if (fault_in_user_writeable(uaddr))
+ return -1;
+ goto retry;
+
+ case -EAGAIN:
+ cond_resched();
+ goto retry;
+
+ default:
+ WARN_ON_ONCE(1);
+ return err;
+ }
+ }
+
+ if (nval != uval)
+ goto retry;
+
+ /*
+ * Wake robust non-PI futexes here. The wakeup of
+ * PI futexes happens in exit_pi_state():
+ */
+ if (!pi && (uval & FUTEX_WAITERS)) {
+ futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1,
+ FUTEX_BITSET_MATCH_ANY);
+ }
+
+ return 0;
+}
+
+/*
+ * Fetch a robust-list pointer. Bit 0 signals PI futexes:
+ */
+static inline int fetch_robust_entry(struct robust_list __user **entry,
+ struct robust_list __user * __user *head,
+ unsigned int *pi)
+{
+ unsigned long uentry;
+
+ if (get_user(uentry, (unsigned long __user *)head))
+ return -EFAULT;
+
+ *entry = (void __user *)(uentry & ~1UL);
+ *pi = uentry & 1;
+
+ return 0;
+}
+
+/*
+ * Walk curr->robust_list (very carefully, it's a userspace list!)
+ * and mark any locks found there dead, and notify any waiters.
+ *
+ * We silently return on any sign of list-walking problem.
+ */
+static void exit_robust_list(struct task_struct *curr)
+{
+ struct robust_list_head __user *head = curr->robust_list;
+ struct robust_list __user *entry, *next_entry, *pending;
+ unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
+ unsigned int next_pi;
+ unsigned long futex_offset;
+ int rc;
+
+ /*
+ * Fetch the list head (which was registered earlier, via
+ * sys_set_robust_list()):
+ */
+ if (fetch_robust_entry(&entry, &head->list.next, &pi))
+ return;
+ /*
+ * Fetch the relative futex offset:
+ */
+ if (get_user(futex_offset, &head->futex_offset))
+ return;
+ /*
+ * Fetch any possibly pending lock-add first, and handle it
+ * if it exists:
+ */
+ if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
+ return;
+
+ next_entry = NULL; /* avoid warning with gcc */
+ while (entry != &head->list) {
+ /*
+ * Fetch the next entry in the list before calling
+ * handle_futex_death:
+ */
+ rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
+ /*
+ * A pending lock might already be on the list, so
+ * don't process it twice:
+ */
+ if (entry != pending) {
+ if (handle_futex_death((void __user *)entry + futex_offset,
+ curr, pi, HANDLE_DEATH_LIST))
+ return;
+ }
+ if (rc)
+ return;
+ entry = next_entry;
+ pi = next_pi;
+ /*
+ * Avoid excessively long or circular lists:
+ */
+ if (!--limit)
+ break;
+
+ cond_resched();
+ }
+
+ if (pending) {
+ handle_futex_death((void __user *)pending + futex_offset,
+ curr, pip, HANDLE_DEATH_PENDING);
+ }
+}
+
+#ifdef CONFIG_COMPAT
+static void __user *futex_uaddr(struct robust_list __user *entry,
+ compat_long_t futex_offset)
+{
+ compat_uptr_t base = ptr_to_compat(entry);
+ void __user *uaddr = compat_ptr(base + futex_offset);
+
+ return uaddr;
+}
+
+/*
+ * Fetch a robust-list pointer. Bit 0 signals PI futexes:
+ */
+static inline int
+compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
+ compat_uptr_t __user *head, unsigned int *pi)
+{
+ if (get_user(*uentry, head))
+ return -EFAULT;
+
+ *entry = compat_ptr((*uentry) & ~1);
+ *pi = (unsigned int)(*uentry) & 1;
+
+ return 0;
+}
+
+/*
+ * Walk curr->robust_list (very carefully, it's a userspace list!)
+ * and mark any locks found there dead, and notify any waiters.
+ *
+ * We silently return on any sign of list-walking problem.
+ */
+static void compat_exit_robust_list(struct task_struct *curr)
+{
+ struct compat_robust_list_head __user *head = curr->compat_robust_list;
+ struct robust_list __user *entry, *next_entry, *pending;
+ unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
+ unsigned int next_pi;
+ compat_uptr_t uentry, next_uentry, upending;
+ compat_long_t futex_offset;
+ int rc;
+
+ /*
+ * Fetch the list head (which was registered earlier, via
+ * sys_set_robust_list()):
+ */
+ if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
+ return;
+ /*
+ * Fetch the relative futex offset:
+ */
+ if (get_user(futex_offset, &head->futex_offset))
+ return;
+ /*
+ * Fetch any possibly pending lock-add first, and handle it
+ * if it exists:
+ */
+ if (compat_fetch_robust_entry(&upending, &pending,
+ &head->list_op_pending, &pip))
+ return;
+
+ next_entry = NULL; /* avoid warning with gcc */
+ while (entry != (struct robust_list __user *) &head->list) {
+ /*
+ * Fetch the next entry in the list before calling
+ * handle_futex_death:
+ */
+ rc = compat_fetch_robust_entry(&next_uentry, &next_entry,
+ (compat_uptr_t __user *)&entry->next, &next_pi);
+ /*
+ * A pending lock might already be on the list, so
+ * dont process it twice:
+ */
+ if (entry != pending) {
+ void __user *uaddr = futex_uaddr(entry, futex_offset);
+
+ if (handle_futex_death(uaddr, curr, pi,
+ HANDLE_DEATH_LIST))
+ return;
+ }
+ if (rc)
+ return;
+ uentry = next_uentry;
+ entry = next_entry;
+ pi = next_pi;
+ /*
+ * Avoid excessively long or circular lists:
+ */
+ if (!--limit)
+ break;
+
+ cond_resched();
+ }
+ if (pending) {
+ void __user *uaddr = futex_uaddr(pending, futex_offset);
+
+ handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING);
+ }
+}
+#endif
+
+#ifdef CONFIG_FUTEX_PI
+
+/*
+ * This task is holding PI mutexes at exit time => bad.
+ * Kernel cleans up PI-state, but userspace is likely hosed.
+ * (Robust-futex cleanup is separate and might save the day for userspace.)
+ */
+static void exit_pi_state_list(struct task_struct *curr)
+{
+ struct list_head *next, *head = &curr->pi_state_list;
+ struct futex_pi_state *pi_state;
+ union futex_key key = FUTEX_KEY_INIT;
+
+ /*
+ * The mutex mm_struct::futex_hash_lock might be acquired.
+ */
+ might_sleep();
+ /*
+ * Ensure the hash remains stable (no resize) during the while loop
+ * below. The hb pointer is acquired under the pi_lock so we can't block
+ * on the mutex.
+ */
+ WARN_ON(curr != current);
+ guard(private_hash)();
+ /*
+ * We are a ZOMBIE and nobody can enqueue itself on
+ * pi_state_list anymore, but we have to be careful
+ * versus waiters unqueueing themselves:
+ */
+ raw_spin_lock_irq(&curr->pi_lock);
+ while (!list_empty(head)) {
+ next = head->next;
+ pi_state = list_entry(next, struct futex_pi_state, list);
+ key = pi_state->key;
+ if (1) {
+ CLASS(hb, hb)(&key);
+
+ /*
+ * We can race against put_pi_state() removing itself from the
+ * list (a waiter going away). put_pi_state() will first
+ * decrement the reference count and then modify the list, so
+ * its possible to see the list entry but fail this reference
+ * acquire.
+ *
+ * In that case; drop the locks to let put_pi_state() make
+ * progress and retry the loop.
+ */
+ if (!refcount_inc_not_zero(&pi_state->refcount)) {
+ raw_spin_unlock_irq(&curr->pi_lock);
+ cpu_relax();
+ raw_spin_lock_irq(&curr->pi_lock);
+ continue;
+ }
+ raw_spin_unlock_irq(&curr->pi_lock);
+
+ spin_lock(&hb->lock);
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+ raw_spin_lock(&curr->pi_lock);
+ /*
+ * We dropped the pi-lock, so re-check whether this
+ * task still owns the PI-state:
+ */
+ if (head->next != next) {
+ /* retain curr->pi_lock for the loop invariant */
+ raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
+ spin_unlock(&hb->lock);
+ put_pi_state(pi_state);
+ continue;
+ }
+
+ WARN_ON(pi_state->owner != curr);
+ WARN_ON(list_empty(&pi_state->list));
+ list_del_init(&pi_state->list);
+ pi_state->owner = NULL;
+
+ raw_spin_unlock(&curr->pi_lock);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ spin_unlock(&hb->lock);
+ }
+
+ rt_mutex_futex_unlock(&pi_state->pi_mutex);
+ put_pi_state(pi_state);
+
+ raw_spin_lock_irq(&curr->pi_lock);
+ }
+ raw_spin_unlock_irq(&curr->pi_lock);
+}
+#else
+static inline void exit_pi_state_list(struct task_struct *curr) { }
+#endif
+
+static void futex_cleanup(struct task_struct *tsk)
+{
+ if (unlikely(tsk->robust_list)) {
+ exit_robust_list(tsk);
+ tsk->robust_list = NULL;
+ }
+
+#ifdef CONFIG_COMPAT
+ if (unlikely(tsk->compat_robust_list)) {
+ compat_exit_robust_list(tsk);
+ tsk->compat_robust_list = NULL;
+ }
+#endif
+
+ if (unlikely(!list_empty(&tsk->pi_state_list)))
+ exit_pi_state_list(tsk);
+}
+
+/**
+ * futex_exit_recursive - Set the tasks futex state to FUTEX_STATE_DEAD
+ * @tsk: task to set the state on
+ *
+ * Set the futex exit state of the task lockless. The futex waiter code
+ * observes that state when a task is exiting and loops until the task has
+ * actually finished the futex cleanup. The worst case for this is that the
+ * waiter runs through the wait loop until the state becomes visible.
+ *
+ * This is called from the recursive fault handling path in make_task_dead().
+ *
+ * This is best effort. Either the futex exit code has run already or
+ * not. If the OWNER_DIED bit has been set on the futex then the waiter can
+ * take it over. If not, the problem is pushed back to user space. If the
+ * futex exit code did not run yet, then an already queued waiter might
+ * block forever, but there is nothing which can be done about that.
+ */
+void futex_exit_recursive(struct task_struct *tsk)
+{
+ /* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */
+ if (tsk->futex_state == FUTEX_STATE_EXITING)
+ mutex_unlock(&tsk->futex_exit_mutex);
+ tsk->futex_state = FUTEX_STATE_DEAD;
+}
+
+static void futex_cleanup_begin(struct task_struct *tsk)
+{
+ /*
+ * Prevent various race issues against a concurrent incoming waiter
+ * including live locks by forcing the waiter to block on
+ * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in
+ * attach_to_pi_owner().
+ */
+ mutex_lock(&tsk->futex_exit_mutex);
+
+ /*
+ * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock.
+ *
+ * This ensures that all subsequent checks of tsk->futex_state in
+ * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with
+ * tsk->pi_lock held.
+ *
+ * It guarantees also that a pi_state which was queued right before
+ * the state change under tsk->pi_lock by a concurrent waiter must
+ * be observed in exit_pi_state_list().
+ */
+ raw_spin_lock_irq(&tsk->pi_lock);
+ tsk->futex_state = FUTEX_STATE_EXITING;
+ raw_spin_unlock_irq(&tsk->pi_lock);
+}
+
+static void futex_cleanup_end(struct task_struct *tsk, int state)
+{
+ /*
+ * Lockless store. The only side effect is that an observer might
+ * take another loop until it becomes visible.
+ */
+ tsk->futex_state = state;
+ /*
+ * Drop the exit protection. This unblocks waiters which observed
+ * FUTEX_STATE_EXITING to reevaluate the state.
+ */
+ mutex_unlock(&tsk->futex_exit_mutex);
+}
+
+void futex_exec_release(struct task_struct *tsk)
+{
+ /*
+ * The state handling is done for consistency, but in the case of
+ * exec() there is no way to prevent further damage as the PID stays
+ * the same. But for the unlikely and arguably buggy case that a
+ * futex is held on exec(), this provides at least as much state
+ * consistency protection which is possible.
+ */
+ futex_cleanup_begin(tsk);
+ futex_cleanup(tsk);
+ /*
+ * Reset the state to FUTEX_STATE_OK. The task is alive and about
+ * exec a new binary.
+ */
+ futex_cleanup_end(tsk, FUTEX_STATE_OK);
+}
+
+void futex_exit_release(struct task_struct *tsk)
+{
+ futex_cleanup_begin(tsk);
+ futex_cleanup(tsk);
+ futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
+}
+
+static void futex_hash_bucket_init(struct futex_hash_bucket *fhb,
+ struct futex_private_hash *fph)
+{
+#ifdef CONFIG_FUTEX_PRIVATE_HASH
+ fhb->priv = fph;
+#endif
+ atomic_set(&fhb->waiters, 0);
+ plist_head_init(&fhb->chain);
+ spin_lock_init(&fhb->lock);
+}
+
+#define FH_CUSTOM 0x01
+
+#ifdef CONFIG_FUTEX_PRIVATE_HASH
+
+/*
+ * futex-ref
+ *
+ * Heavily inspired by percpu-rwsem/percpu-refcount; not reusing any of that
+ * code because it just doesn't fit right.
+ *
+ * Dual counter, per-cpu / atomic approach like percpu-refcount, except it
+ * re-initializes the state automatically, such that the fph swizzle is also a
+ * transition back to per-cpu.
+ */
+
+static void futex_ref_rcu(struct rcu_head *head);
+
+static void __futex_ref_atomic_begin(struct futex_private_hash *fph)
+{
+ struct mm_struct *mm = fph->mm;
+
+ /*
+ * The counter we're about to switch to must have fully switched;
+ * otherwise it would be impossible for it to have reported success
+ * from futex_ref_is_dead().
+ */
+ WARN_ON_ONCE(atomic_long_read(&mm->futex_atomic) != 0);
+
+ /*
+ * Set the atomic to the bias value such that futex_ref_{get,put}()
+ * will never observe 0. Will be fixed up in __futex_ref_atomic_end()
+ * when folding in the percpu count.
+ */
+ atomic_long_set(&mm->futex_atomic, LONG_MAX);
+ smp_store_release(&fph->state, FR_ATOMIC);
+
+ call_rcu_hurry(&mm->futex_rcu, futex_ref_rcu);
+}
+
+static void __futex_ref_atomic_end(struct futex_private_hash *fph)
+{
+ struct mm_struct *mm = fph->mm;
+ unsigned int count = 0;
+ long ret;
+ int cpu;
+
+ /*
+ * Per __futex_ref_atomic_begin() the state of the fph must be ATOMIC
+ * and per this RCU callback, everybody must now observe this state and
+ * use the atomic variable.
+ */
+ WARN_ON_ONCE(fph->state != FR_ATOMIC);
+
+ /*
+ * Therefore the per-cpu counter is now stable, sum and reset.
+ */
+ for_each_possible_cpu(cpu) {
+ unsigned int *ptr = per_cpu_ptr(mm->futex_ref, cpu);
+ count += *ptr;
+ *ptr = 0;
+ }
+
+ /*
+ * Re-init for the next cycle.
+ */
+ this_cpu_inc(*mm->futex_ref); /* 0 -> 1 */
+
+ /*
+ * Add actual count, subtract bias and initial refcount.
+ *
+ * The moment this atomic operation happens, futex_ref_is_dead() can
+ * become true.
+ */
+ ret = atomic_long_add_return(count - LONG_MAX - 1, &mm->futex_atomic);
+ if (!ret)
+ wake_up_var(mm);
+
+ WARN_ON_ONCE(ret < 0);
+ mmput_async(mm);
+}
+
+static void futex_ref_rcu(struct rcu_head *head)
+{
+ struct mm_struct *mm = container_of(head, struct mm_struct, futex_rcu);
+ struct futex_private_hash *fph = rcu_dereference_raw(mm->futex_phash);
+
+ if (fph->state == FR_PERCPU) {
+ /*
+ * Per this extra grace-period, everybody must now observe
+ * fph as the current fph and no previously observed fph's
+ * are in-flight.
+ *
+ * Notably, nobody will now rely on the atomic
+ * futex_ref_is_dead() state anymore so we can begin the
+ * migration of the per-cpu counter into the atomic.
+ */
+ __futex_ref_atomic_begin(fph);
+ return;
+ }
+
+ __futex_ref_atomic_end(fph);
+}
+
+/*
+ * Drop the initial refcount and transition to atomics.
+ */
+static void futex_ref_drop(struct futex_private_hash *fph)
+{
+ struct mm_struct *mm = fph->mm;
+
+ /*
+ * Can only transition the current fph;
+ */
+ WARN_ON_ONCE(rcu_dereference_raw(mm->futex_phash) != fph);
+ /*
+ * We enqueue at least one RCU callback. Ensure mm stays if the task
+ * exits before the transition is completed.
+ */
+ mmget(mm);
+
+ /*
+ * In order to avoid the following scenario:
+ *
+ * futex_hash() __futex_pivot_hash()
+ * guard(rcu); guard(mm->futex_hash_lock);
+ * fph = mm->futex_phash;
+ * rcu_assign_pointer(&mm->futex_phash, new);
+ * futex_hash_allocate()
+ * futex_ref_drop()
+ * fph->state = FR_ATOMIC;
+ * atomic_set(, BIAS);
+ *
+ * futex_private_hash_get(fph); // OOPS
+ *
+ * Where an old fph (which is FR_ATOMIC) and should fail on
+ * inc_not_zero, will succeed because a new transition is started and
+ * the atomic is bias'ed away from 0.
+ *
+ * There must be at least one full grace-period between publishing a
+ * new fph and trying to replace it.
+ */
+ if (poll_state_synchronize_rcu(mm->futex_batches)) {
+ /*
+ * There was a grace-period, we can begin now.
+ */
+ __futex_ref_atomic_begin(fph);
+ return;
+ }
+
+ call_rcu_hurry(&mm->futex_rcu, futex_ref_rcu);
+}
+
+static bool futex_ref_get(struct futex_private_hash *fph)
+{
+ struct mm_struct *mm = fph->mm;
+
+ guard(preempt)();
+
+ if (READ_ONCE(fph->state) == FR_PERCPU) {
+ __this_cpu_inc(*mm->futex_ref);
+ return true;
+ }
+
+ return atomic_long_inc_not_zero(&mm->futex_atomic);
+}
+
+static bool futex_ref_put(struct futex_private_hash *fph)
+{
+ struct mm_struct *mm = fph->mm;
+
+ guard(preempt)();
+
+ if (READ_ONCE(fph->state) == FR_PERCPU) {
+ __this_cpu_dec(*mm->futex_ref);
+ return false;
+ }
+
+ return atomic_long_dec_and_test(&mm->futex_atomic);
+}
+
+static bool futex_ref_is_dead(struct futex_private_hash *fph)
+{
+ struct mm_struct *mm = fph->mm;
+
+ guard(rcu)();
+
+ if (smp_load_acquire(&fph->state) == FR_PERCPU)
+ return false;
+
+ return atomic_long_read(&mm->futex_atomic) == 0;
+}
+
+int futex_mm_init(struct mm_struct *mm)
+{
+ mutex_init(&mm->futex_hash_lock);
+ RCU_INIT_POINTER(mm->futex_phash, NULL);
+ mm->futex_phash_new = NULL;
+ /* futex-ref */
+ mm->futex_ref = NULL;
+ atomic_long_set(&mm->futex_atomic, 0);
+ mm->futex_batches = get_state_synchronize_rcu();
+ return 0;
+}
+
+void futex_hash_free(struct mm_struct *mm)
+{
+ struct futex_private_hash *fph;
+
+ free_percpu(mm->futex_ref);
+ kvfree(mm->futex_phash_new);
+ fph = rcu_dereference_raw(mm->futex_phash);
+ if (fph)
+ kvfree(fph);
+}
+
+static bool futex_pivot_pending(struct mm_struct *mm)
+{
+ struct futex_private_hash *fph;
+
+ guard(rcu)();
+
+ if (!mm->futex_phash_new)
+ return true;
+
+ fph = rcu_dereference(mm->futex_phash);
+ return futex_ref_is_dead(fph);
+}
+
+static bool futex_hash_less(struct futex_private_hash *a,
+ struct futex_private_hash *b)
+{
+ /* user provided always wins */
+ if (!a->custom && b->custom)
+ return true;
+ if (a->custom && !b->custom)
+ return false;
+
+ /* zero-sized hash wins */
+ if (!b->hash_mask)
+ return true;
+ if (!a->hash_mask)
+ return false;
+
+ /* keep the biggest */
+ if (a->hash_mask < b->hash_mask)
+ return true;
+ if (a->hash_mask > b->hash_mask)
+ return false;
+
+ return false; /* equal */
+}
+
+static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags)
+{
+ struct mm_struct *mm = current->mm;
+ struct futex_private_hash *fph;
+ bool custom = flags & FH_CUSTOM;
+ int i;
+
+ if (hash_slots && (hash_slots == 1 || !is_power_of_2(hash_slots)))
+ return -EINVAL;
+
+ /*
+ * Once we've disabled the global hash there is no way back.
+ */
+ scoped_guard(rcu) {
+ fph = rcu_dereference(mm->futex_phash);
+ if (fph && !fph->hash_mask) {
+ if (custom)
+ return -EBUSY;
+ return 0;
+ }
+ }
+
+ if (!mm->futex_ref) {
+ /*
+ * This will always be allocated by the first thread and
+ * therefore requires no locking.
+ */
+ mm->futex_ref = alloc_percpu(unsigned int);
+ if (!mm->futex_ref)
+ return -ENOMEM;
+ this_cpu_inc(*mm->futex_ref); /* 0 -> 1 */
+ }
+
+ fph = kvzalloc(struct_size(fph, queues, hash_slots),
+ GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
+ if (!fph)
+ return -ENOMEM;
+
+ fph->hash_mask = hash_slots ? hash_slots - 1 : 0;
+ fph->custom = custom;
+ fph->mm = mm;
+
+ for (i = 0; i < hash_slots; i++)
+ futex_hash_bucket_init(&fph->queues[i], fph);
+
+ if (custom) {
+ /*
+ * Only let prctl() wait / retry; don't unduly delay clone().
+ */
+again:
+ wait_var_event(mm, futex_pivot_pending(mm));
+ }
+
+ scoped_guard(mutex, &mm->futex_hash_lock) {
+ struct futex_private_hash *free __free(kvfree) = NULL;
+ struct futex_private_hash *cur, *new;
+
+ cur = rcu_dereference_protected(mm->futex_phash,
+ lockdep_is_held(&mm->futex_hash_lock));
+ new = mm->futex_phash_new;
+ mm->futex_phash_new = NULL;
+
+ if (fph) {
+ if (cur && !cur->hash_mask) {
+ /*
+ * If two threads simultaneously request the global
+ * hash then the first one performs the switch,
+ * the second one returns here.
+ */
+ free = fph;
+ mm->futex_phash_new = new;
+ return -EBUSY;
+ }
+ if (cur && !new) {
+ /*
+ * If we have an existing hash, but do not yet have
+ * allocated a replacement hash, drop the initial
+ * reference on the existing hash.
+ */
+ futex_ref_drop(cur);
+ }
+
+ if (new) {
+ /*
+ * Two updates raced; throw out the lesser one.
+ */
+ if (futex_hash_less(new, fph)) {
+ free = new;
+ new = fph;
+ } else {
+ free = fph;
+ }
+ } else {
+ new = fph;
+ }
+ fph = NULL;
+ }
+
+ if (new) {
+ /*
+ * Will set mm->futex_phash_new on failure;
+ * futex_private_hash_get() will try again.
+ */
+ if (!__futex_pivot_hash(mm, new) && custom)
+ goto again;
+ }
+ }
+ return 0;
+}
+
+int futex_hash_allocate_default(void)
+{
+ unsigned int threads, buckets, current_buckets = 0;
+ struct futex_private_hash *fph;
+
+ if (!current->mm)
+ return 0;
+
+ scoped_guard(rcu) {
+ threads = min_t(unsigned int,
+ get_nr_threads(current),
+ num_online_cpus());
+
+ fph = rcu_dereference(current->mm->futex_phash);
+ if (fph) {
+ if (fph->custom)
+ return 0;
+
+ current_buckets = fph->hash_mask + 1;
+ }
+ }
+
+ /*
+ * The default allocation will remain within
+ * 16 <= threads * 4 <= global hash size
+ */
+ buckets = roundup_pow_of_two(4 * threads);
+ buckets = clamp(buckets, 16, futex_hashmask + 1);
+
+ if (current_buckets >= buckets)
+ return 0;
+
+ return futex_hash_allocate(buckets, 0);
+}
+
+static int futex_hash_get_slots(void)
+{
+ struct futex_private_hash *fph;
+
+ guard(rcu)();
+ fph = rcu_dereference(current->mm->futex_phash);
+ if (fph && fph->hash_mask)
+ return fph->hash_mask + 1;
+ return 0;
+}
+
+#else
+
+static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags)
+{
+ return -EINVAL;
+}
+
+static int futex_hash_get_slots(void)
+{
+ return 0;
+}
+
+#endif
+
+int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4)
+{
+ unsigned int flags = FH_CUSTOM;
+ int ret;
+
+ switch (arg2) {
+ case PR_FUTEX_HASH_SET_SLOTS:
+ if (arg4)
+ return -EINVAL;
+ ret = futex_hash_allocate(arg3, flags);
+ break;
+
+ case PR_FUTEX_HASH_GET_SLOTS:
+ ret = futex_hash_get_slots();
+ break;
+
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ return ret;
+}
+
+static int __init futex_init(void)
+{
+ unsigned long hashsize, i;
+ unsigned int order, n;
+ unsigned long size;
+
+#ifdef CONFIG_BASE_SMALL
+ hashsize = 16;
+#else
+ hashsize = 256 * num_possible_cpus();
+ hashsize /= num_possible_nodes();
+ hashsize = max(4, hashsize);
+ hashsize = roundup_pow_of_two(hashsize);
+#endif
+ futex_hashshift = ilog2(hashsize);
+ size = sizeof(struct futex_hash_bucket) * hashsize;
+ order = get_order(size);
+
+ for_each_node(n) {
+ struct futex_hash_bucket *table;
+
+ if (order > MAX_PAGE_ORDER)
+ table = vmalloc_huge_node(size, GFP_KERNEL, n);
+ else
+ table = alloc_pages_exact_nid(n, size, GFP_KERNEL);
+
+ BUG_ON(!table);
+
+ for (i = 0; i < hashsize; i++)
+ futex_hash_bucket_init(&table[i], NULL);
+
+ futex_queues[n] = table;
+ }
+
+ futex_hashmask = hashsize - 1;
+ pr_info("futex hash table entries: %lu (%lu bytes on %d NUMA nodes, total %lu KiB, %s).\n",
+ hashsize, size, num_possible_nodes(), size * num_possible_nodes() / 1024,
+ order > MAX_PAGE_ORDER ? "vmalloc" : "linear");
+ return 0;
+}
+core_initcall(futex_init);
diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h
new file mode 100644
index 000000000000..30c2afa03889
--- /dev/null
+++ b/kernel/futex/futex.h
@@ -0,0 +1,450 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _FUTEX_H
+#define _FUTEX_H
+
+#include <linux/futex.h>
+#include <linux/rtmutex.h>
+#include <linux/sched/wake_q.h>
+#include <linux/compat.h>
+#include <linux/uaccess.h>
+#include <linux/cleanup.h>
+
+#ifdef CONFIG_PREEMPT_RT
+#include <linux/rcuwait.h>
+#endif
+
+#include <asm/futex.h>
+
+/*
+ * Futex flags used to encode options to functions and preserve them across
+ * restarts.
+ */
+#define FLAGS_SIZE_8 0x0000
+#define FLAGS_SIZE_16 0x0001
+#define FLAGS_SIZE_32 0x0002
+#define FLAGS_SIZE_64 0x0003
+
+#define FLAGS_SIZE_MASK 0x0003
+
+#ifdef CONFIG_MMU
+# define FLAGS_SHARED 0x0010
+#else
+/*
+ * NOMMU does not have per process address space. Let the compiler optimize
+ * code away.
+ */
+# define FLAGS_SHARED 0x0000
+#endif
+#define FLAGS_CLOCKRT 0x0020
+#define FLAGS_HAS_TIMEOUT 0x0040
+#define FLAGS_NUMA 0x0080
+#define FLAGS_STRICT 0x0100
+#define FLAGS_MPOL 0x0200
+
+/* FUTEX_ to FLAGS_ */
+static inline unsigned int futex_to_flags(unsigned int op)
+{
+ unsigned int flags = FLAGS_SIZE_32;
+
+ if (!(op & FUTEX_PRIVATE_FLAG))
+ flags |= FLAGS_SHARED;
+
+ if (op & FUTEX_CLOCK_REALTIME)
+ flags |= FLAGS_CLOCKRT;
+
+ return flags;
+}
+
+#define FUTEX2_VALID_MASK (FUTEX2_SIZE_MASK | FUTEX2_NUMA | FUTEX2_MPOL | FUTEX2_PRIVATE)
+
+/* FUTEX2_ to FLAGS_ */
+static inline unsigned int futex2_to_flags(unsigned int flags2)
+{
+ unsigned int flags = flags2 & FUTEX2_SIZE_MASK;
+
+ if (!(flags2 & FUTEX2_PRIVATE))
+ flags |= FLAGS_SHARED;
+
+ if (flags2 & FUTEX2_NUMA)
+ flags |= FLAGS_NUMA;
+
+ if (flags2 & FUTEX2_MPOL)
+ flags |= FLAGS_MPOL;
+
+ return flags;
+}
+
+static inline unsigned int futex_size(unsigned int flags)
+{
+ return 1 << (flags & FLAGS_SIZE_MASK);
+}
+
+static inline bool futex_flags_valid(unsigned int flags)
+{
+ /* Only 64bit futexes for 64bit code */
+ if (!IS_ENABLED(CONFIG_64BIT) || in_compat_syscall()) {
+ if ((flags & FLAGS_SIZE_MASK) == FLAGS_SIZE_64)
+ return false;
+ }
+
+ /* Only 32bit futexes are implemented -- for now */
+ if ((flags & FLAGS_SIZE_MASK) != FLAGS_SIZE_32)
+ return false;
+
+ /*
+ * Must be able to represent both FUTEX_NO_NODE and every valid nodeid
+ * in a futex word.
+ */
+ if (flags & FLAGS_NUMA) {
+ int bits = 8 * futex_size(flags);
+ u64 max = ~0ULL;
+
+ max >>= 64 - bits;
+ if (nr_node_ids >= max)
+ return false;
+ }
+
+ return true;
+}
+
+static inline bool futex_validate_input(unsigned int flags, u64 val)
+{
+ int bits = 8 * futex_size(flags);
+
+ if (bits < 64 && (val >> bits))
+ return false;
+
+ return true;
+}
+
+#ifdef CONFIG_FAIL_FUTEX
+extern bool should_fail_futex(bool fshared);
+#else
+static inline bool should_fail_futex(bool fshared)
+{
+ return false;
+}
+#endif
+
+/*
+ * Hash buckets are shared by all the futex_keys that hash to the same
+ * location. Each key may have multiple futex_q structures, one for each task
+ * waiting on a futex.
+ */
+struct futex_hash_bucket {
+ atomic_t waiters;
+ spinlock_t lock;
+ struct plist_head chain;
+ struct futex_private_hash *priv;
+} ____cacheline_aligned_in_smp;
+
+/*
+ * Priority Inheritance state:
+ */
+struct futex_pi_state {
+ /*
+ * list of 'owned' pi_state instances - these have to be
+ * cleaned up in do_exit() if the task exits prematurely:
+ */
+ struct list_head list;
+
+ /*
+ * The PI object:
+ */
+ struct rt_mutex_base pi_mutex;
+
+ struct task_struct *owner;
+ refcount_t refcount;
+
+ union futex_key key;
+} __randomize_layout;
+
+struct futex_q;
+typedef void (futex_wake_fn)(struct wake_q_head *wake_q, struct futex_q *q);
+
+/**
+ * struct futex_q - The hashed futex queue entry, one per waiting task
+ * @list: priority-sorted list of tasks waiting on this futex
+ * @task: the task waiting on the futex
+ * @lock_ptr: the hash bucket lock
+ * @wake: the wake handler for this queue
+ * @wake_data: data associated with the wake handler
+ * @key: the key the futex is hashed on
+ * @pi_state: optional priority inheritance state
+ * @rt_waiter: rt_waiter storage for use with requeue_pi
+ * @requeue_pi_key: the requeue_pi target futex key
+ * @bitset: bitset for the optional bitmasked wakeup
+ * @requeue_state: State field for futex_requeue_pi()
+ * @drop_hb_ref: Waiter should drop the extra hash bucket reference if true
+ * @requeue_wait: RCU wait for futex_requeue_pi() (RT only)
+ *
+ * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so
+ * we can wake only the relevant ones (hashed queues may be shared).
+ *
+ * A futex_q has a woken state, just like tasks have TASK_RUNNING.
+ * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
+ * The order of wakeup is always to make the first condition true, then
+ * the second.
+ *
+ * PI futexes are typically woken before they are removed from the hash list via
+ * the rt_mutex code. See futex_unqueue_pi().
+ */
+struct futex_q {
+ struct plist_node list;
+
+ struct task_struct *task;
+ spinlock_t *lock_ptr;
+ futex_wake_fn *wake;
+ void *wake_data;
+ union futex_key key;
+ struct futex_pi_state *pi_state;
+ struct rt_mutex_waiter *rt_waiter;
+ union futex_key *requeue_pi_key;
+ u32 bitset;
+ atomic_t requeue_state;
+ bool drop_hb_ref;
+#ifdef CONFIG_PREEMPT_RT
+ struct rcuwait requeue_wait;
+#endif
+} __randomize_layout;
+
+extern const struct futex_q futex_q_init;
+
+enum futex_access {
+ FUTEX_READ,
+ FUTEX_WRITE
+};
+
+extern int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key,
+ enum futex_access rw);
+extern void futex_q_lockptr_lock(struct futex_q *q);
+extern struct hrtimer_sleeper *
+futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
+ int flags, u64 range_ns);
+
+extern struct futex_hash_bucket *futex_hash(union futex_key *key);
+#ifdef CONFIG_FUTEX_PRIVATE_HASH
+extern void futex_hash_get(struct futex_hash_bucket *hb);
+extern void futex_hash_put(struct futex_hash_bucket *hb);
+
+extern struct futex_private_hash *futex_private_hash(void);
+extern void futex_private_hash_put(struct futex_private_hash *fph);
+
+#else /* !CONFIG_FUTEX_PRIVATE_HASH */
+static inline void futex_hash_get(struct futex_hash_bucket *hb) { }
+static inline void futex_hash_put(struct futex_hash_bucket *hb) { }
+static inline struct futex_private_hash *futex_private_hash(void) { return NULL; }
+static inline void futex_private_hash_put(struct futex_private_hash *fph) { }
+#endif
+
+DEFINE_CLASS(hb, struct futex_hash_bucket *,
+ if (_T) futex_hash_put(_T),
+ futex_hash(key), union futex_key *key);
+
+DEFINE_CLASS(private_hash, struct futex_private_hash *,
+ if (_T) futex_private_hash_put(_T),
+ futex_private_hash(), void);
+
+/**
+ * futex_match - Check whether two futex keys are equal
+ * @key1: Pointer to key1
+ * @key2: Pointer to key2
+ *
+ * Return 1 if two futex_keys are equal, 0 otherwise.
+ */
+static inline int futex_match(union futex_key *key1, union futex_key *key2)
+{
+ return (key1 && key2
+ && key1->both.word == key2->both.word
+ && key1->both.ptr == key2->both.ptr
+ && key1->both.offset == key2->both.offset);
+}
+
+extern int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
+ struct futex_q *q, union futex_key *key2,
+ struct task_struct *task);
+extern void futex_do_wait(struct futex_q *q, struct hrtimer_sleeper *timeout);
+extern bool __futex_wake_mark(struct futex_q *q);
+extern void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q);
+
+extern int fault_in_user_writeable(u32 __user *uaddr);
+extern struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *key);
+
+static inline int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval)
+{
+ int ret;
+
+ pagefault_disable();
+ ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
+ pagefault_enable();
+
+ return ret;
+}
+
+/* Read from user memory with pagefaults disabled */
+static inline int futex_get_value_locked(u32 *dest, u32 __user *from)
+{
+ guard(pagefault)();
+ return get_user_inline(*dest, from);
+}
+
+extern void __futex_unqueue(struct futex_q *q);
+extern void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb,
+ struct task_struct *task);
+extern int futex_unqueue(struct futex_q *q);
+
+/**
+ * futex_queue() - Enqueue the futex_q on the futex_hash_bucket
+ * @q: The futex_q to enqueue
+ * @hb: The destination hash bucket
+ * @task: Task queueing this futex
+ *
+ * The hb->lock must be held by the caller, and is released here. A call to
+ * futex_queue() is typically paired with exactly one call to futex_unqueue(). The
+ * exceptions involve the PI related operations, which may use futex_unqueue_pi()
+ * or nothing if the unqueue is done as part of the wake process and the unqueue
+ * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
+ * an example).
+ *
+ * Note that @task may be NULL, for async usage of futexes.
+ */
+static inline void futex_queue(struct futex_q *q, struct futex_hash_bucket *hb,
+ struct task_struct *task)
+ __releases(&hb->lock)
+{
+ __futex_queue(q, hb, task);
+ spin_unlock(&hb->lock);
+}
+
+extern void futex_unqueue_pi(struct futex_q *q);
+
+extern void wait_for_owner_exiting(int ret, struct task_struct *exiting);
+
+/*
+ * Reflects a new waiter being added to the waitqueue.
+ */
+static inline void futex_hb_waiters_inc(struct futex_hash_bucket *hb)
+{
+#ifdef CONFIG_SMP
+ atomic_inc(&hb->waiters);
+ /*
+ * Full barrier (A), see the ordering comment above.
+ */
+ smp_mb__after_atomic();
+#endif
+}
+
+/*
+ * Reflects a waiter being removed from the waitqueue by wakeup
+ * paths.
+ */
+static inline void futex_hb_waiters_dec(struct futex_hash_bucket *hb)
+{
+#ifdef CONFIG_SMP
+ atomic_dec(&hb->waiters);
+#endif
+}
+
+static inline int futex_hb_waiters_pending(struct futex_hash_bucket *hb)
+{
+#ifdef CONFIG_SMP
+ /*
+ * Full barrier (B), see the ordering comment above.
+ */
+ smp_mb();
+ return atomic_read(&hb->waiters);
+#else
+ return 1;
+#endif
+}
+
+extern void futex_q_lock(struct futex_q *q, struct futex_hash_bucket *hb);
+extern void futex_q_unlock(struct futex_hash_bucket *hb);
+
+
+extern int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
+ union futex_key *key,
+ struct futex_pi_state **ps,
+ struct task_struct *task,
+ struct task_struct **exiting,
+ int set_waiters);
+
+extern int refill_pi_state_cache(void);
+extern void get_pi_state(struct futex_pi_state *pi_state);
+extern void put_pi_state(struct futex_pi_state *pi_state);
+extern int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked);
+
+/*
+ * Express the locking dependencies for lockdep:
+ */
+static inline void
+double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
+{
+ if (hb1 > hb2)
+ swap(hb1, hb2);
+
+ spin_lock(&hb1->lock);
+ if (hb1 != hb2)
+ spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING);
+}
+
+static inline void
+double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
+{
+ spin_unlock(&hb1->lock);
+ if (hb1 != hb2)
+ spin_unlock(&hb2->lock);
+}
+
+/* syscalls */
+
+extern int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, u32
+ val, ktime_t *abs_time, u32 bitset, u32 __user
+ *uaddr2);
+
+extern int futex_requeue(u32 __user *uaddr1, unsigned int flags1,
+ u32 __user *uaddr2, unsigned int flags2,
+ int nr_wake, int nr_requeue,
+ u32 *cmpval, int requeue_pi);
+
+extern int __futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
+ struct hrtimer_sleeper *to, u32 bitset);
+
+extern int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
+ ktime_t *abs_time, u32 bitset);
+
+/**
+ * struct futex_vector - Auxiliary struct for futex_waitv()
+ * @w: Userspace provided data
+ * @q: Kernel side data
+ *
+ * Struct used to build an array with all data need for futex_waitv()
+ */
+struct futex_vector {
+ struct futex_waitv w;
+ struct futex_q q;
+};
+
+extern int futex_parse_waitv(struct futex_vector *futexv,
+ struct futex_waitv __user *uwaitv,
+ unsigned int nr_futexes, futex_wake_fn *wake,
+ void *wake_data);
+
+extern int futex_wait_multiple_setup(struct futex_vector *vs, int count,
+ int *woken);
+
+extern int futex_unqueue_multiple(struct futex_vector *v, int count);
+
+extern int futex_wait_multiple(struct futex_vector *vs, unsigned int count,
+ struct hrtimer_sleeper *to);
+
+extern int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset);
+
+extern int futex_wake_op(u32 __user *uaddr1, unsigned int flags,
+ u32 __user *uaddr2, int nr_wake, int nr_wake2, int op);
+
+extern int futex_unlock_pi(u32 __user *uaddr, unsigned int flags);
+
+extern int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock);
+
+#endif /* _FUTEX_H */
diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c
new file mode 100644
index 000000000000..dacb2330f1fb
--- /dev/null
+++ b/kernel/futex/pi.c
@@ -0,0 +1,1294 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/slab.h>
+#include <linux/sched/rt.h>
+#include <linux/sched/task.h>
+
+#include "futex.h"
+#include "../locking/rtmutex_common.h"
+
+/*
+ * PI code:
+ */
+int refill_pi_state_cache(void)
+{
+ struct futex_pi_state *pi_state;
+
+ if (likely(current->pi_state_cache))
+ return 0;
+
+ pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
+
+ if (!pi_state)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&pi_state->list);
+ /* pi_mutex gets initialized later */
+ pi_state->owner = NULL;
+ refcount_set(&pi_state->refcount, 1);
+ pi_state->key = FUTEX_KEY_INIT;
+
+ current->pi_state_cache = pi_state;
+
+ return 0;
+}
+
+static struct futex_pi_state *alloc_pi_state(void)
+{
+ struct futex_pi_state *pi_state = current->pi_state_cache;
+
+ WARN_ON(!pi_state);
+ current->pi_state_cache = NULL;
+
+ return pi_state;
+}
+
+static void pi_state_update_owner(struct futex_pi_state *pi_state,
+ struct task_struct *new_owner)
+{
+ struct task_struct *old_owner = pi_state->owner;
+
+ lockdep_assert_held(&pi_state->pi_mutex.wait_lock);
+
+ if (old_owner) {
+ raw_spin_lock(&old_owner->pi_lock);
+ WARN_ON(list_empty(&pi_state->list));
+ list_del_init(&pi_state->list);
+ raw_spin_unlock(&old_owner->pi_lock);
+ }
+
+ if (new_owner) {
+ raw_spin_lock(&new_owner->pi_lock);
+ WARN_ON(!list_empty(&pi_state->list));
+ list_add(&pi_state->list, &new_owner->pi_state_list);
+ pi_state->owner = new_owner;
+ raw_spin_unlock(&new_owner->pi_lock);
+ }
+}
+
+void get_pi_state(struct futex_pi_state *pi_state)
+{
+ WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount));
+}
+
+/*
+ * Drops a reference to the pi_state object and frees or caches it
+ * when the last reference is gone.
+ */
+void put_pi_state(struct futex_pi_state *pi_state)
+{
+ if (!pi_state)
+ return;
+
+ if (!refcount_dec_and_test(&pi_state->refcount))
+ return;
+
+ /*
+ * If pi_state->owner is NULL, the owner is most probably dying
+ * and has cleaned up the pi_state already
+ */
+ if (pi_state->owner) {
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags);
+ pi_state_update_owner(pi_state, NULL);
+ rt_mutex_proxy_unlock(&pi_state->pi_mutex);
+ raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags);
+ }
+
+ if (current->pi_state_cache) {
+ kfree(pi_state);
+ } else {
+ /*
+ * pi_state->list is already empty.
+ * clear pi_state->owner.
+ * refcount is at 0 - put it back to 1.
+ */
+ pi_state->owner = NULL;
+ refcount_set(&pi_state->refcount, 1);
+ current->pi_state_cache = pi_state;
+ }
+}
+
+/*
+ * We need to check the following states:
+ *
+ * Waiter | pi_state | pi->owner | uTID | uODIED | ?
+ *
+ * [1] NULL | --- | --- | 0 | 0/1 | Valid
+ * [2] NULL | --- | --- | >0 | 0/1 | Valid
+ *
+ * [3] Found | NULL | -- | Any | 0/1 | Invalid
+ *
+ * [4] Found | Found | NULL | 0 | 1 | Valid
+ * [5] Found | Found | NULL | >0 | 1 | Invalid
+ *
+ * [6] Found | Found | task | 0 | 1 | Valid
+ *
+ * [7] Found | Found | NULL | Any | 0 | Invalid
+ *
+ * [8] Found | Found | task | ==taskTID | 0/1 | Valid
+ * [9] Found | Found | task | 0 | 0 | Invalid
+ * [10] Found | Found | task | !=taskTID | 0/1 | Invalid
+ *
+ * [1] Indicates that the kernel can acquire the futex atomically. We
+ * came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
+ *
+ * [2] Valid, if TID does not belong to a kernel thread. If no matching
+ * thread is found then it indicates that the owner TID has died.
+ *
+ * [3] Invalid. The waiter is queued on a non PI futex
+ *
+ * [4] Valid state after exit_robust_list(), which sets the user space
+ * value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
+ *
+ * [5] The user space value got manipulated between exit_robust_list()
+ * and exit_pi_state_list()
+ *
+ * [6] Valid state after exit_pi_state_list() which sets the new owner in
+ * the pi_state but cannot access the user space value.
+ *
+ * [7] pi_state->owner can only be NULL when the OWNER_DIED bit is set.
+ *
+ * [8] Owner and user space value match
+ *
+ * [9] There is no transient state which sets the user space TID to 0
+ * except exit_robust_list(), but this is indicated by the
+ * FUTEX_OWNER_DIED bit. See [4]
+ *
+ * [10] There is no transient state which leaves owner and user space
+ * TID out of sync. Except one error case where the kernel is denied
+ * write access to the user address, see fixup_pi_state_owner().
+ *
+ *
+ * Serialization and lifetime rules:
+ *
+ * hb->lock:
+ *
+ * hb -> futex_q, relation
+ * futex_q -> pi_state, relation
+ *
+ * (cannot be raw because hb can contain arbitrary amount
+ * of futex_q's)
+ *
+ * pi_mutex->wait_lock:
+ *
+ * {uval, pi_state}
+ *
+ * (and pi_mutex 'obviously')
+ *
+ * p->pi_lock:
+ *
+ * p->pi_state_list -> pi_state->list, relation
+ * pi_mutex->owner -> pi_state->owner, relation
+ *
+ * pi_state->refcount:
+ *
+ * pi_state lifetime
+ *
+ *
+ * Lock order:
+ *
+ * hb->lock
+ * pi_mutex->wait_lock
+ * p->pi_lock
+ *
+ */
+
+/*
+ * Validate that the existing waiter has a pi_state and sanity check
+ * the pi_state against the user space value. If correct, attach to
+ * it.
+ */
+static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
+ struct futex_pi_state *pi_state,
+ struct futex_pi_state **ps)
+{
+ pid_t pid = uval & FUTEX_TID_MASK;
+ u32 uval2;
+ int ret;
+
+ /*
+ * Userspace might have messed up non-PI and PI futexes [3]
+ */
+ if (unlikely(!pi_state))
+ return -EINVAL;
+
+ /*
+ * We get here with hb->lock held, and having found a
+ * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
+ * has dropped the hb->lock in between futex_queue() and futex_unqueue_pi(),
+ * which in turn means that futex_lock_pi() still has a reference on
+ * our pi_state.
+ *
+ * The waiter holding a reference on @pi_state also protects against
+ * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
+ * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
+ * free pi_state before we can take a reference ourselves.
+ */
+ WARN_ON(!refcount_read(&pi_state->refcount));
+
+ /*
+ * Now that we have a pi_state, we can acquire wait_lock
+ * and do the state validation.
+ */
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+
+ /*
+ * Since {uval, pi_state} is serialized by wait_lock, and our current
+ * uval was read without holding it, it can have changed. Verify it
+ * still is what we expect it to be, otherwise retry the entire
+ * operation.
+ */
+ if (futex_get_value_locked(&uval2, uaddr))
+ goto out_efault;
+
+ if (uval != uval2)
+ goto out_eagain;
+
+ /*
+ * Handle the owner died case:
+ */
+ if (uval & FUTEX_OWNER_DIED) {
+ /*
+ * exit_pi_state_list sets owner to NULL and wakes the
+ * topmost waiter. The task which acquires the
+ * pi_state->rt_mutex will fixup owner.
+ */
+ if (!pi_state->owner) {
+ /*
+ * No pi state owner, but the user space TID
+ * is not 0. Inconsistent state. [5]
+ */
+ if (pid)
+ goto out_einval;
+ /*
+ * Take a ref on the state and return success. [4]
+ */
+ goto out_attach;
+ }
+
+ /*
+ * If TID is 0, then either the dying owner has not
+ * yet executed exit_pi_state_list() or some waiter
+ * acquired the rtmutex in the pi state, but did not
+ * yet fixup the TID in user space.
+ *
+ * Take a ref on the state and return success. [6]
+ */
+ if (!pid)
+ goto out_attach;
+ } else {
+ /*
+ * If the owner died bit is not set, then the pi_state
+ * must have an owner. [7]
+ */
+ if (!pi_state->owner)
+ goto out_einval;
+ }
+
+ /*
+ * Bail out if user space manipulated the futex value. If pi
+ * state exists then the owner TID must be the same as the
+ * user space TID. [9/10]
+ */
+ if (pid != task_pid_vnr(pi_state->owner))
+ goto out_einval;
+
+out_attach:
+ get_pi_state(pi_state);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ *ps = pi_state;
+ return 0;
+
+out_einval:
+ ret = -EINVAL;
+ goto out_error;
+
+out_eagain:
+ ret = -EAGAIN;
+ goto out_error;
+
+out_efault:
+ ret = -EFAULT;
+ goto out_error;
+
+out_error:
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ return ret;
+}
+
+static int handle_exit_race(u32 __user *uaddr, u32 uval,
+ struct task_struct *tsk)
+{
+ u32 uval2;
+
+ /*
+ * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the
+ * caller that the alleged owner is busy.
+ */
+ if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
+ return -EBUSY;
+
+ /*
+ * Reread the user space value to handle the following situation:
+ *
+ * CPU0 CPU1
+ *
+ * sys_exit() sys_futex()
+ * do_exit() futex_lock_pi()
+ * futex_lock_pi_atomic()
+ * exit_signals(tsk) No waiters:
+ * tsk->flags |= PF_EXITING; *uaddr == 0x00000PID
+ * mm_release(tsk) Set waiter bit
+ * exit_robust_list(tsk) { *uaddr = 0x80000PID;
+ * Set owner died attach_to_pi_owner() {
+ * *uaddr = 0xC0000000; tsk = get_task(PID);
+ * } if (!tsk->flags & PF_EXITING) {
+ * ... attach();
+ * tsk->futex_state = } else {
+ * FUTEX_STATE_DEAD; if (tsk->futex_state !=
+ * FUTEX_STATE_DEAD)
+ * return -EAGAIN;
+ * return -ESRCH; <--- FAIL
+ * }
+ *
+ * Returning ESRCH unconditionally is wrong here because the
+ * user space value has been changed by the exiting task.
+ *
+ * The same logic applies to the case where the exiting task is
+ * already gone.
+ */
+ if (futex_get_value_locked(&uval2, uaddr))
+ return -EFAULT;
+
+ /* If the user space value has changed, try again. */
+ if (uval2 != uval)
+ return -EAGAIN;
+
+ /*
+ * The exiting task did not have a robust list, the robust list was
+ * corrupted or the user space value in *uaddr is simply bogus.
+ * Give up and tell user space.
+ */
+ return -ESRCH;
+}
+
+static void __attach_to_pi_owner(struct task_struct *p, union futex_key *key,
+ struct futex_pi_state **ps)
+{
+ /*
+ * No existing pi state. First waiter. [2]
+ *
+ * This creates pi_state, we have hb->lock held, this means nothing can
+ * observe this state, wait_lock is irrelevant.
+ */
+ struct futex_pi_state *pi_state = alloc_pi_state();
+
+ /*
+ * Initialize the pi_mutex in locked state and make @p
+ * the owner of it:
+ */
+ rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
+
+ /* Store the key for possible exit cleanups: */
+ pi_state->key = *key;
+
+ WARN_ON(!list_empty(&pi_state->list));
+ list_add(&pi_state->list, &p->pi_state_list);
+ /*
+ * Assignment without holding pi_state->pi_mutex.wait_lock is safe
+ * because there is no concurrency as the object is not published yet.
+ */
+ pi_state->owner = p;
+
+ *ps = pi_state;
+}
+/*
+ * Lookup the task for the TID provided from user space and attach to
+ * it after doing proper sanity checks.
+ */
+static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
+ struct futex_pi_state **ps,
+ struct task_struct **exiting)
+{
+ pid_t pid = uval & FUTEX_TID_MASK;
+ struct task_struct *p;
+
+ /*
+ * We are the first waiter - try to look up the real owner and attach
+ * the new pi_state to it, but bail out when TID = 0 [1]
+ *
+ * The !pid check is paranoid. None of the call sites should end up
+ * with pid == 0, but better safe than sorry. Let the caller retry
+ */
+ if (!pid)
+ return -EAGAIN;
+ p = find_get_task_by_vpid(pid);
+ if (!p)
+ return handle_exit_race(uaddr, uval, NULL);
+
+ if (unlikely(p->flags & PF_KTHREAD)) {
+ put_task_struct(p);
+ return -EPERM;
+ }
+
+ /*
+ * We need to look at the task state to figure out, whether the
+ * task is exiting. To protect against the change of the task state
+ * in futex_exit_release(), we do this protected by p->pi_lock:
+ */
+ raw_spin_lock_irq(&p->pi_lock);
+ if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
+ /*
+ * The task is on the way out. When the futex state is
+ * FUTEX_STATE_DEAD, we know that the task has finished
+ * the cleanup:
+ */
+ int ret = handle_exit_race(uaddr, uval, p);
+
+ raw_spin_unlock_irq(&p->pi_lock);
+ /*
+ * If the owner task is between FUTEX_STATE_EXITING and
+ * FUTEX_STATE_DEAD then store the task pointer and keep
+ * the reference on the task struct. The calling code will
+ * drop all locks, wait for the task to reach
+ * FUTEX_STATE_DEAD and then drop the refcount. This is
+ * required to prevent a live lock when the current task
+ * preempted the exiting task between the two states.
+ */
+ if (ret == -EBUSY)
+ *exiting = p;
+ else
+ put_task_struct(p);
+ return ret;
+ }
+
+ __attach_to_pi_owner(p, key, ps);
+ raw_spin_unlock_irq(&p->pi_lock);
+
+ put_task_struct(p);
+
+ return 0;
+}
+
+static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
+{
+ int err;
+ u32 curval;
+
+ if (unlikely(should_fail_futex(true)))
+ return -EFAULT;
+
+ err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
+ if (unlikely(err))
+ return err;
+
+ /* If user space value changed, let the caller retry */
+ return curval != uval ? -EAGAIN : 0;
+}
+
+/**
+ * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
+ * @uaddr: the pi futex user address
+ * @hb: the pi futex hash bucket
+ * @key: the futex key associated with uaddr and hb
+ * @ps: the pi_state pointer where we store the result of the
+ * lookup
+ * @task: the task to perform the atomic lock work for. This will
+ * be "current" except in the case of requeue pi.
+ * @exiting: Pointer to store the task pointer of the owner task
+ * which is in the middle of exiting
+ * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
+ *
+ * Return:
+ * - 0 - ready to wait;
+ * - 1 - acquired the lock;
+ * - <0 - error
+ *
+ * The hb->lock must be held by the caller.
+ *
+ * @exiting is only set when the return value is -EBUSY. If so, this holds
+ * a refcount on the exiting task on return and the caller needs to drop it
+ * after waiting for the exit to complete.
+ */
+int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
+ union futex_key *key,
+ struct futex_pi_state **ps,
+ struct task_struct *task,
+ struct task_struct **exiting,
+ int set_waiters)
+{
+ u32 uval, newval, vpid = task_pid_vnr(task);
+ struct futex_q *top_waiter;
+ int ret;
+
+ /*
+ * Read the user space value first so we can validate a few
+ * things before proceeding further.
+ */
+ if (futex_get_value_locked(&uval, uaddr))
+ return -EFAULT;
+
+ if (unlikely(should_fail_futex(true)))
+ return -EFAULT;
+
+ /*
+ * Detect deadlocks.
+ */
+ if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
+ return -EDEADLK;
+
+ if ((unlikely(should_fail_futex(true))))
+ return -EDEADLK;
+
+ /*
+ * Lookup existing state first. If it exists, try to attach to
+ * its pi_state.
+ */
+ top_waiter = futex_top_waiter(hb, key);
+ if (top_waiter)
+ return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
+
+ /*
+ * No waiter and user TID is 0. We are here because the
+ * waiters or the owner died bit is set or called from
+ * requeue_cmp_pi or for whatever reason something took the
+ * syscall.
+ */
+ if (!(uval & FUTEX_TID_MASK)) {
+ /*
+ * We take over the futex. No other waiters and the user space
+ * TID is 0. We preserve the owner died bit.
+ */
+ newval = uval & FUTEX_OWNER_DIED;
+ newval |= vpid;
+
+ /* The futex requeue_pi code can enforce the waiters bit */
+ if (set_waiters)
+ newval |= FUTEX_WAITERS;
+
+ ret = lock_pi_update_atomic(uaddr, uval, newval);
+ if (ret)
+ return ret;
+
+ /*
+ * If the waiter bit was requested the caller also needs PI
+ * state attached to the new owner of the user space futex.
+ *
+ * @task is guaranteed to be alive and it cannot be exiting
+ * because it is either sleeping or waiting in
+ * futex_requeue_pi_wakeup_sync().
+ *
+ * No need to do the full attach_to_pi_owner() exercise
+ * because @task is known and valid.
+ */
+ if (set_waiters) {
+ raw_spin_lock_irq(&task->pi_lock);
+ __attach_to_pi_owner(task, key, ps);
+ raw_spin_unlock_irq(&task->pi_lock);
+ }
+ return 1;
+ }
+
+ /*
+ * First waiter. Set the waiters bit before attaching ourself to
+ * the owner. If owner tries to unlock, it will be forced into
+ * the kernel and blocked on hb->lock.
+ */
+ newval = uval | FUTEX_WAITERS;
+ ret = lock_pi_update_atomic(uaddr, uval, newval);
+ if (ret)
+ return ret;
+ /*
+ * If the update of the user space value succeeded, we try to
+ * attach to the owner. If that fails, no harm done, we only
+ * set the FUTEX_WAITERS bit in the user space variable.
+ */
+ return attach_to_pi_owner(uaddr, newval, key, ps, exiting);
+}
+
+/*
+ * Caller must hold a reference on @pi_state.
+ */
+static int wake_futex_pi(u32 __user *uaddr, u32 uval,
+ struct futex_pi_state *pi_state,
+ struct rt_mutex_waiter *top_waiter)
+{
+ struct task_struct *new_owner;
+ bool postunlock = false;
+ DEFINE_RT_WAKE_Q(wqh);
+ u32 curval, newval;
+ int ret = 0;
+
+ new_owner = top_waiter->task;
+
+ /*
+ * We pass it to the next owner. The WAITERS bit is always kept
+ * enabled while there is PI state around. We cleanup the owner
+ * died bit, because we are the owner.
+ */
+ newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
+
+ if (unlikely(should_fail_futex(true))) {
+ ret = -EFAULT;
+ goto out_unlock;
+ }
+
+ ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
+ if (!ret && (curval != uval)) {
+ /*
+ * If a unconditional UNLOCK_PI operation (user space did not
+ * try the TID->0 transition) raced with a waiter setting the
+ * FUTEX_WAITERS flag between get_user() and locking the hash
+ * bucket lock, retry the operation.
+ */
+ if ((FUTEX_TID_MASK & curval) == uval)
+ ret = -EAGAIN;
+ else
+ ret = -EINVAL;
+ }
+
+ if (!ret) {
+ /*
+ * This is a point of no return; once we modified the uval
+ * there is no going back and subsequent operations must
+ * not fail.
+ */
+ pi_state_update_owner(pi_state, new_owner);
+ postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wqh);
+ }
+
+out_unlock:
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+
+ if (postunlock)
+ rt_mutex_postunlock(&wqh);
+
+ return ret;
+}
+
+static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
+ struct task_struct *argowner)
+{
+ struct futex_pi_state *pi_state = q->pi_state;
+ struct task_struct *oldowner, *newowner;
+ u32 uval, curval, newval, newtid;
+ int err = 0;
+
+ oldowner = pi_state->owner;
+
+ /*
+ * We are here because either:
+ *
+ * - we stole the lock and pi_state->owner needs updating to reflect
+ * that (@argowner == current),
+ *
+ * or:
+ *
+ * - someone stole our lock and we need to fix things to point to the
+ * new owner (@argowner == NULL).
+ *
+ * Either way, we have to replace the TID in the user space variable.
+ * This must be atomic as we have to preserve the owner died bit here.
+ *
+ * Note: We write the user space value _before_ changing the pi_state
+ * because we can fault here. Imagine swapped out pages or a fork
+ * that marked all the anonymous memory readonly for cow.
+ *
+ * Modifying pi_state _before_ the user space value would leave the
+ * pi_state in an inconsistent state when we fault here, because we
+ * need to drop the locks to handle the fault. This might be observed
+ * in the PID checks when attaching to PI state .
+ */
+retry:
+ if (!argowner) {
+ if (oldowner != current) {
+ /*
+ * We raced against a concurrent self; things are
+ * already fixed up. Nothing to do.
+ */
+ return 0;
+ }
+
+ if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
+ /* We got the lock. pi_state is correct. Tell caller. */
+ return 1;
+ }
+
+ /*
+ * The trylock just failed, so either there is an owner or
+ * there is a higher priority waiter than this one.
+ */
+ newowner = rt_mutex_owner(&pi_state->pi_mutex);
+ /*
+ * If the higher priority waiter has not yet taken over the
+ * rtmutex then newowner is NULL. We can't return here with
+ * that state because it's inconsistent vs. the user space
+ * state. So drop the locks and try again. It's a valid
+ * situation and not any different from the other retry
+ * conditions.
+ */
+ if (unlikely(!newowner)) {
+ err = -EAGAIN;
+ goto handle_err;
+ }
+ } else {
+ WARN_ON_ONCE(argowner != current);
+ if (oldowner == current) {
+ /*
+ * We raced against a concurrent self; things are
+ * already fixed up. Nothing to do.
+ */
+ return 1;
+ }
+ newowner = argowner;
+ }
+
+ newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
+ /* Owner died? */
+ if (!pi_state->owner)
+ newtid |= FUTEX_OWNER_DIED;
+
+ err = futex_get_value_locked(&uval, uaddr);
+ if (err)
+ goto handle_err;
+
+ for (;;) {
+ newval = (uval & FUTEX_OWNER_DIED) | newtid;
+
+ err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
+ if (err)
+ goto handle_err;
+
+ if (curval == uval)
+ break;
+ uval = curval;
+ }
+
+ /*
+ * We fixed up user space. Now we need to fix the pi_state
+ * itself.
+ */
+ pi_state_update_owner(pi_state, newowner);
+
+ return argowner == current;
+
+ /*
+ * In order to reschedule or handle a page fault, we need to drop the
+ * locks here. In the case of a fault, this gives the other task
+ * (either the highest priority waiter itself or the task which stole
+ * the rtmutex) the chance to try the fixup of the pi_state. So once we
+ * are back from handling the fault we need to check the pi_state after
+ * reacquiring the locks and before trying to do another fixup. When
+ * the fixup has been done already we simply return.
+ *
+ * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
+ * drop hb->lock since the caller owns the hb -> futex_q relation.
+ * Dropping the pi_mutex->wait_lock requires the state revalidate.
+ */
+handle_err:
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ spin_unlock(q->lock_ptr);
+
+ switch (err) {
+ case -EFAULT:
+ err = fault_in_user_writeable(uaddr);
+ break;
+
+ case -EAGAIN:
+ cond_resched();
+ err = 0;
+ break;
+
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+
+ futex_q_lockptr_lock(q);
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+
+ /*
+ * Check if someone else fixed it for us:
+ */
+ if (pi_state->owner != oldowner)
+ return argowner == current;
+
+ /* Retry if err was -EAGAIN or the fault in succeeded */
+ if (!err)
+ goto retry;
+
+ /*
+ * fault_in_user_writeable() failed so user state is immutable. At
+ * best we can make the kernel state consistent but user state will
+ * be most likely hosed and any subsequent unlock operation will be
+ * rejected due to PI futex rule [10].
+ *
+ * Ensure that the rtmutex owner is also the pi_state owner despite
+ * the user space value claiming something different. There is no
+ * point in unlocking the rtmutex if current is the owner as it
+ * would need to wait until the next waiter has taken the rtmutex
+ * to guarantee consistent state. Keep it simple. Userspace asked
+ * for this wreckaged state.
+ *
+ * The rtmutex has an owner - either current or some other
+ * task. See the EAGAIN loop above.
+ */
+ pi_state_update_owner(pi_state, rt_mutex_owner(&pi_state->pi_mutex));
+
+ return err;
+}
+
+static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
+ struct task_struct *argowner)
+{
+ struct futex_pi_state *pi_state = q->pi_state;
+ int ret;
+
+ lockdep_assert_held(q->lock_ptr);
+
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+ ret = __fixup_pi_state_owner(uaddr, q, argowner);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ return ret;
+}
+
+/**
+ * fixup_pi_owner() - Post lock pi_state and corner case management
+ * @uaddr: user address of the futex
+ * @q: futex_q (contains pi_state and access to the rt_mutex)
+ * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0)
+ *
+ * After attempting to lock an rt_mutex, this function is called to cleanup
+ * the pi_state owner as well as handle race conditions that may allow us to
+ * acquire the lock. Must be called with the hb lock held.
+ *
+ * Return:
+ * - 1 - success, lock taken;
+ * - 0 - success, lock not taken;
+ * - <0 - on error (-EFAULT)
+ */
+int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked)
+{
+ if (locked) {
+ /*
+ * Got the lock. We might not be the anticipated owner if we
+ * did a lock-steal - fix up the PI-state in that case:
+ *
+ * Speculative pi_state->owner read (we don't hold wait_lock);
+ * since we own the lock pi_state->owner == current is the
+ * stable state, anything else needs more attention.
+ */
+ if (q->pi_state->owner != current)
+ return fixup_pi_state_owner(uaddr, q, current);
+ return 1;
+ }
+
+ /*
+ * If we didn't get the lock; check if anybody stole it from us. In
+ * that case, we need to fix up the uval to point to them instead of
+ * us, otherwise bad things happen. [10]
+ *
+ * Another speculative read; pi_state->owner == current is unstable
+ * but needs our attention.
+ */
+ if (q->pi_state->owner == current)
+ return fixup_pi_state_owner(uaddr, q, NULL);
+
+ /*
+ * Paranoia check. If we did not take the lock, then we should not be
+ * the owner of the rt_mutex. Warn and establish consistent state.
+ */
+ if (WARN_ON_ONCE(rt_mutex_owner(&q->pi_state->pi_mutex) == current))
+ return fixup_pi_state_owner(uaddr, q, current);
+
+ return 0;
+}
+
+/*
+ * Userspace tried a 0 -> TID atomic transition of the futex value
+ * and failed. The kernel side here does the whole locking operation:
+ * if there are waiters then it will block as a consequence of relying
+ * on rt-mutexes, it does PI, etc. (Due to races the kernel might see
+ * a 0 value of the futex too.).
+ *
+ * Also serves as futex trylock_pi()'ing, and due semantics.
+ */
+int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock)
+{
+ struct hrtimer_sleeper timeout, *to;
+ struct task_struct *exiting = NULL;
+ struct rt_mutex_waiter rt_waiter;
+ struct futex_q q = futex_q_init;
+ DEFINE_WAKE_Q(wake_q);
+ int res, ret;
+
+ if (!IS_ENABLED(CONFIG_FUTEX_PI))
+ return -ENOSYS;
+
+ if (refill_pi_state_cache())
+ return -ENOMEM;
+
+ to = futex_setup_timer(time, &timeout, flags, 0);
+
+retry:
+ ret = get_futex_key(uaddr, flags, &q.key, FUTEX_WRITE);
+ if (unlikely(ret != 0))
+ goto out;
+
+retry_private:
+ if (1) {
+ CLASS(hb, hb)(&q.key);
+
+ futex_q_lock(&q, hb);
+
+ ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
+ &exiting, 0);
+ if (unlikely(ret)) {
+ /*
+ * Atomic work succeeded and we got the lock,
+ * or failed. Either way, we do _not_ block.
+ */
+ switch (ret) {
+ case 1:
+ /* We got the lock. */
+ ret = 0;
+ goto out_unlock_put_key;
+ case -EFAULT:
+ goto uaddr_faulted;
+ case -EBUSY:
+ case -EAGAIN:
+ /*
+ * Two reasons for this:
+ * - EBUSY: Task is exiting and we just wait for the
+ * exit to complete.
+ * - EAGAIN: The user space value changed.
+ */
+ futex_q_unlock(hb);
+ /*
+ * Handle the case where the owner is in the middle of
+ * exiting. Wait for the exit to complete otherwise
+ * this task might loop forever, aka. live lock.
+ */
+ wait_for_owner_exiting(ret, exiting);
+ cond_resched();
+ goto retry;
+ default:
+ goto out_unlock_put_key;
+ }
+ }
+
+ WARN_ON(!q.pi_state);
+
+ /*
+ * Only actually queue now that the atomic ops are done:
+ */
+ __futex_queue(&q, hb, current);
+
+ if (trylock) {
+ ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
+ /* Fixup the trylock return value: */
+ ret = ret ? 0 : -EWOULDBLOCK;
+ goto no_block;
+ }
+
+ /*
+ * Caution; releasing @hb in-scope. The hb->lock is still locked
+ * while the reference is dropped. The reference can not be dropped
+ * after the unlock because if a user initiated resize is in progress
+ * then we might need to wake him. This can not be done after the
+ * rt_mutex_pre_schedule() invocation. The hb will remain valid because
+ * the thread, performing resize, will block on hb->lock during
+ * the requeue.
+ */
+ futex_hash_put(no_free_ptr(hb));
+ /*
+ * Must be done before we enqueue the waiter, here is unfortunately
+ * under the hb lock, but that *should* work because it does nothing.
+ */
+ rt_mutex_pre_schedule();
+
+ rt_mutex_init_waiter(&rt_waiter);
+
+ /*
+ * On PREEMPT_RT, when hb->lock becomes an rt_mutex, we must not
+ * hold it while doing rt_mutex_start_proxy(), because then it will
+ * include hb->lock in the blocking chain, even through we'll not in
+ * fact hold it while blocking. This will lead it to report -EDEADLK
+ * and BUG when futex_unlock_pi() interleaves with this.
+ *
+ * Therefore acquire wait_lock while holding hb->lock, but drop the
+ * latter before calling __rt_mutex_start_proxy_lock(). This
+ * interleaves with futex_unlock_pi() -- which does a similar lock
+ * handoff -- such that the latter can observe the futex_q::pi_state
+ * before __rt_mutex_start_proxy_lock() is done.
+ */
+ raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
+ spin_unlock(q.lock_ptr);
+ /*
+ * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
+ * such that futex_unlock_pi() is guaranteed to observe the waiter when
+ * it sees the futex_q::pi_state.
+ */
+ ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current, &wake_q);
+ raw_spin_unlock_irq_wake(&q.pi_state->pi_mutex.wait_lock, &wake_q);
+
+ if (ret) {
+ if (ret == 1)
+ ret = 0;
+ goto cleanup;
+ }
+
+ if (unlikely(to))
+ hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);
+
+ ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
+
+cleanup:
+ /*
+ * If we failed to acquire the lock (deadlock/signal/timeout), we must
+ * unwind the above, however we canont lock hb->lock because
+ * rt_mutex already has a waiter enqueued and hb->lock can itself try
+ * and enqueue an rt_waiter through rtlock.
+ *
+ * Doing the cleanup without holding hb->lock can cause inconsistent
+ * state between hb and pi_state, but only in the direction of not
+ * seeing a waiter that is leaving.
+ *
+ * See futex_unlock_pi(), it deals with this inconsistency.
+ *
+ * There be dragons here, since we must deal with the inconsistency on
+ * the way out (here), it is impossible to detect/warn about the race
+ * the other way around (missing an incoming waiter).
+ *
+ * What could possibly go wrong...
+ */
+ if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
+ ret = 0;
+
+ /*
+ * Now that the rt_waiter has been dequeued, it is safe to use
+ * spinlock/rtlock (which might enqueue its own rt_waiter) and fix up
+ * the
+ */
+ futex_q_lockptr_lock(&q);
+ /*
+ * Waiter is unqueued.
+ */
+ rt_mutex_post_schedule();
+no_block:
+ /*
+ * Fixup the pi_state owner and possibly acquire the lock if we
+ * haven't already.
+ */
+ res = fixup_pi_owner(uaddr, &q, !ret);
+ /*
+ * If fixup_pi_owner() returned an error, propagate that. If it acquired
+ * the lock, clear our -ETIMEDOUT or -EINTR.
+ */
+ if (res)
+ ret = (res < 0) ? res : 0;
+
+ futex_unqueue_pi(&q);
+ spin_unlock(q.lock_ptr);
+ if (q.drop_hb_ref) {
+ CLASS(hb, hb)(&q.key);
+ /* Additional reference from futex_unlock_pi() */
+ futex_hash_put(hb);
+ }
+ goto out;
+
+out_unlock_put_key:
+ futex_q_unlock(hb);
+ goto out;
+
+uaddr_faulted:
+ futex_q_unlock(hb);
+
+ ret = fault_in_user_writeable(uaddr);
+ if (ret)
+ goto out;
+
+ if (!(flags & FLAGS_SHARED))
+ goto retry_private;
+
+ goto retry;
+ }
+
+out:
+ if (to) {
+ hrtimer_cancel(&to->timer);
+ destroy_hrtimer_on_stack(&to->timer);
+ }
+ return ret != -EINTR ? ret : -ERESTARTNOINTR;
+}
+
+/*
+ * Userspace attempted a TID -> 0 atomic transition, and failed.
+ * This is the in-kernel slowpath: we look up the PI state (if any),
+ * and do the rt-mutex unlock.
+ */
+int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
+{
+ u32 curval, uval, vpid = task_pid_vnr(current);
+ union futex_key key = FUTEX_KEY_INIT;
+ struct futex_q *top_waiter;
+ int ret;
+
+ if (!IS_ENABLED(CONFIG_FUTEX_PI))
+ return -ENOSYS;
+
+retry:
+ if (get_user(uval, uaddr))
+ return -EFAULT;
+ /*
+ * We release only a lock we actually own:
+ */
+ if ((uval & FUTEX_TID_MASK) != vpid)
+ return -EPERM;
+
+ ret = get_futex_key(uaddr, flags, &key, FUTEX_WRITE);
+ if (ret)
+ return ret;
+
+ CLASS(hb, hb)(&key);
+ spin_lock(&hb->lock);
+retry_hb:
+
+ /*
+ * Check waiters first. We do not trust user space values at
+ * all and we at least want to know if user space fiddled
+ * with the futex value instead of blindly unlocking.
+ */
+ top_waiter = futex_top_waiter(hb, &key);
+ if (top_waiter) {
+ struct futex_pi_state *pi_state = top_waiter->pi_state;
+ struct rt_mutex_waiter *rt_waiter;
+
+ ret = -EINVAL;
+ if (!pi_state)
+ goto out_unlock;
+
+ /*
+ * If current does not own the pi_state then the futex is
+ * inconsistent and user space fiddled with the futex value.
+ */
+ if (pi_state->owner != current)
+ goto out_unlock;
+
+ /*
+ * By taking wait_lock while still holding hb->lock, we ensure
+ * there is no point where we hold neither; and thereby
+ * wake_futex_pi() must observe any new waiters.
+ *
+ * Since the cleanup: case in futex_lock_pi() removes the
+ * rt_waiter without holding hb->lock, it is possible for
+ * wake_futex_pi() to not find a waiter while the above does,
+ * in this case the waiter is on the way out and it can be
+ * ignored.
+ *
+ * In particular; this forces __rt_mutex_start_proxy() to
+ * complete such that we're guaranteed to observe the
+ * rt_waiter.
+ */
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+
+ /*
+ * Futex vs rt_mutex waiter state -- if there are no rt_mutex
+ * waiters even though futex thinks there are, then the waiter
+ * is leaving. The entry needs to be removed from the list so a
+ * new futex_lock_pi() is not using this stale PI-state while
+ * the futex is available in user space again.
+ * There can be more than one task on its way out so it needs
+ * to retry.
+ */
+ rt_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
+ if (!rt_waiter) {
+ /*
+ * Acquire a reference for the leaving waiter to ensure
+ * valid futex_q::lock_ptr.
+ */
+ futex_hash_get(hb);
+ top_waiter->drop_hb_ref = true;
+ __futex_unqueue(top_waiter);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ goto retry_hb;
+ }
+
+ get_pi_state(pi_state);
+ spin_unlock(&hb->lock);
+
+ /* drops pi_state->pi_mutex.wait_lock */
+ ret = wake_futex_pi(uaddr, uval, pi_state, rt_waiter);
+
+ put_pi_state(pi_state);
+
+ /*
+ * Success, we're done! No tricky corner cases.
+ */
+ if (!ret)
+ return ret;
+ /*
+ * The atomic access to the futex value generated a
+ * pagefault, so retry the user-access and the wakeup:
+ */
+ if (ret == -EFAULT)
+ goto pi_faulted;
+ /*
+ * A unconditional UNLOCK_PI op raced against a waiter
+ * setting the FUTEX_WAITERS bit. Try again.
+ */
+ if (ret == -EAGAIN)
+ goto pi_retry;
+ /*
+ * wake_futex_pi has detected invalid state. Tell user
+ * space.
+ */
+ return ret;
+ }
+
+ /*
+ * We have no kernel internal state, i.e. no waiters in the
+ * kernel. Waiters which are about to queue themselves are stuck
+ * on hb->lock. So we can safely ignore them. We do neither
+ * preserve the WAITERS bit not the OWNER_DIED one. We are the
+ * owner.
+ */
+ if ((ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, 0))) {
+ spin_unlock(&hb->lock);
+ switch (ret) {
+ case -EFAULT:
+ goto pi_faulted;
+
+ case -EAGAIN:
+ goto pi_retry;
+
+ default:
+ WARN_ON_ONCE(1);
+ return ret;
+ }
+ }
+
+ /*
+ * If uval has changed, let user space handle it.
+ */
+ ret = (curval == uval) ? 0 : -EAGAIN;
+
+out_unlock:
+ spin_unlock(&hb->lock);
+ return ret;
+
+pi_retry:
+ cond_resched();
+ goto retry;
+
+pi_faulted:
+
+ ret = fault_in_user_writeable(uaddr);
+ if (!ret)
+ goto retry;
+
+ return ret;
+}
+
diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c
new file mode 100644
index 000000000000..d818b4d47f1b
--- /dev/null
+++ b/kernel/futex/requeue.c
@@ -0,0 +1,913 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/plist.h>
+#include <linux/sched/signal.h>
+
+#include "futex.h"
+#include "../locking/rtmutex_common.h"
+
+/*
+ * On PREEMPT_RT, the hash bucket lock is a 'sleeping' spinlock with an
+ * underlying rtmutex. The task which is about to be requeued could have
+ * just woken up (timeout, signal). After the wake up the task has to
+ * acquire hash bucket lock, which is held by the requeue code. As a task
+ * can only be blocked on _ONE_ rtmutex at a time, the proxy lock blocking
+ * and the hash bucket lock blocking would collide and corrupt state.
+ *
+ * On !PREEMPT_RT this is not a problem and everything could be serialized
+ * on hash bucket lock, but aside of having the benefit of common code,
+ * this allows to avoid doing the requeue when the task is already on the
+ * way out and taking the hash bucket lock of the original uaddr1 when the
+ * requeue has been completed.
+ *
+ * The following state transitions are valid:
+ *
+ * On the waiter side:
+ * Q_REQUEUE_PI_NONE -> Q_REQUEUE_PI_IGNORE
+ * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_WAIT
+ *
+ * On the requeue side:
+ * Q_REQUEUE_PI_NONE -> Q_REQUEUE_PI_INPROGRESS
+ * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_DONE/LOCKED
+ * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_NONE (requeue failed)
+ * Q_REQUEUE_PI_WAIT -> Q_REQUEUE_PI_DONE/LOCKED
+ * Q_REQUEUE_PI_WAIT -> Q_REQUEUE_PI_IGNORE (requeue failed)
+ *
+ * The requeue side ignores a waiter with state Q_REQUEUE_PI_IGNORE as this
+ * signals that the waiter is already on the way out. It also means that
+ * the waiter is still on the 'wait' futex, i.e. uaddr1.
+ *
+ * The waiter side signals early wakeup to the requeue side either through
+ * setting state to Q_REQUEUE_PI_IGNORE or to Q_REQUEUE_PI_WAIT depending
+ * on the current state. In case of Q_REQUEUE_PI_IGNORE it can immediately
+ * proceed to take the hash bucket lock of uaddr1. If it set state to WAIT,
+ * which means the wakeup is interleaving with a requeue in progress it has
+ * to wait for the requeue side to change the state. Either to DONE/LOCKED
+ * or to IGNORE. DONE/LOCKED means the waiter q is now on the uaddr2 futex
+ * and either blocked (DONE) or has acquired it (LOCKED). IGNORE is set by
+ * the requeue side when the requeue attempt failed via deadlock detection
+ * and therefore the waiter q is still on the uaddr1 futex.
+ */
+enum {
+ Q_REQUEUE_PI_NONE = 0,
+ Q_REQUEUE_PI_IGNORE,
+ Q_REQUEUE_PI_IN_PROGRESS,
+ Q_REQUEUE_PI_WAIT,
+ Q_REQUEUE_PI_DONE,
+ Q_REQUEUE_PI_LOCKED,
+};
+
+const struct futex_q futex_q_init = {
+ /* list gets initialized in futex_queue()*/
+ .wake = futex_wake_mark,
+ .key = FUTEX_KEY_INIT,
+ .bitset = FUTEX_BITSET_MATCH_ANY,
+ .requeue_state = ATOMIC_INIT(Q_REQUEUE_PI_NONE),
+};
+
+/**
+ * requeue_futex() - Requeue a futex_q from one hb to another
+ * @q: the futex_q to requeue
+ * @hb1: the source hash_bucket
+ * @hb2: the target hash_bucket
+ * @key2: the new key for the requeued futex_q
+ */
+static inline
+void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
+ struct futex_hash_bucket *hb2, union futex_key *key2)
+{
+
+ /*
+ * If key1 and key2 hash to the same bucket, no need to
+ * requeue.
+ */
+ if (likely(&hb1->chain != &hb2->chain)) {
+ plist_del(&q->list, &hb1->chain);
+ futex_hb_waiters_dec(hb1);
+ futex_hb_waiters_inc(hb2);
+ plist_add(&q->list, &hb2->chain);
+ q->lock_ptr = &hb2->lock;
+ /*
+ * hb1 and hb2 belong to the same futex_hash_bucket_private
+ * because if we managed get a reference on hb1 then it can't be
+ * replaced. Therefore we avoid put(hb1)+get(hb2) here.
+ */
+ }
+ q->key = *key2;
+}
+
+static inline bool futex_requeue_pi_prepare(struct futex_q *q,
+ struct futex_pi_state *pi_state)
+{
+ int old, new;
+
+ /*
+ * Set state to Q_REQUEUE_PI_IN_PROGRESS unless an early wakeup has
+ * already set Q_REQUEUE_PI_IGNORE to signal that requeue should
+ * ignore the waiter.
+ */
+ old = atomic_read_acquire(&q->requeue_state);
+ do {
+ if (old == Q_REQUEUE_PI_IGNORE)
+ return false;
+
+ /*
+ * futex_proxy_trylock_atomic() might have set it to
+ * IN_PROGRESS and a interleaved early wake to WAIT.
+ *
+ * It was considered to have an extra state for that
+ * trylock, but that would just add more conditionals
+ * all over the place for a dubious value.
+ */
+ if (old != Q_REQUEUE_PI_NONE)
+ break;
+
+ new = Q_REQUEUE_PI_IN_PROGRESS;
+ } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new));
+
+ q->pi_state = pi_state;
+ return true;
+}
+
+static inline void futex_requeue_pi_complete(struct futex_q *q, int locked)
+{
+ int old, new;
+
+ old = atomic_read_acquire(&q->requeue_state);
+ do {
+ if (old == Q_REQUEUE_PI_IGNORE)
+ return;
+
+ if (locked >= 0) {
+ /* Requeue succeeded. Set DONE or LOCKED */
+ WARN_ON_ONCE(old != Q_REQUEUE_PI_IN_PROGRESS &&
+ old != Q_REQUEUE_PI_WAIT);
+ new = Q_REQUEUE_PI_DONE + locked;
+ } else if (old == Q_REQUEUE_PI_IN_PROGRESS) {
+ /* Deadlock, no early wakeup interleave */
+ new = Q_REQUEUE_PI_NONE;
+ } else {
+ /* Deadlock, early wakeup interleave. */
+ WARN_ON_ONCE(old != Q_REQUEUE_PI_WAIT);
+ new = Q_REQUEUE_PI_IGNORE;
+ }
+ } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new));
+
+#ifdef CONFIG_PREEMPT_RT
+ /* If the waiter interleaved with the requeue let it know */
+ if (unlikely(old == Q_REQUEUE_PI_WAIT))
+ rcuwait_wake_up(&q->requeue_wait);
+#endif
+}
+
+static inline int futex_requeue_pi_wakeup_sync(struct futex_q *q)
+{
+ int old, new;
+
+ old = atomic_read_acquire(&q->requeue_state);
+ do {
+ /* Is requeue done already? */
+ if (old >= Q_REQUEUE_PI_DONE)
+ return old;
+
+ /*
+ * If not done, then tell the requeue code to either ignore
+ * the waiter or to wake it up once the requeue is done.
+ */
+ new = Q_REQUEUE_PI_WAIT;
+ if (old == Q_REQUEUE_PI_NONE)
+ new = Q_REQUEUE_PI_IGNORE;
+ } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new));
+
+ /* If the requeue was in progress, wait for it to complete */
+ if (old == Q_REQUEUE_PI_IN_PROGRESS) {
+#ifdef CONFIG_PREEMPT_RT
+ rcuwait_wait_event(&q->requeue_wait,
+ atomic_read(&q->requeue_state) != Q_REQUEUE_PI_WAIT,
+ TASK_UNINTERRUPTIBLE);
+#else
+ (void)atomic_cond_read_relaxed(&q->requeue_state, VAL != Q_REQUEUE_PI_WAIT);
+#endif
+ }
+
+ /*
+ * Requeue is now either prohibited or complete. Reread state
+ * because during the wait above it might have changed. Nothing
+ * will modify q->requeue_state after this point.
+ */
+ return atomic_read(&q->requeue_state);
+}
+
+/**
+ * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
+ * @q: the futex_q
+ * @key: the key of the requeue target futex
+ * @hb: the hash_bucket of the requeue target futex
+ *
+ * During futex_requeue, with requeue_pi=1, it is possible to acquire the
+ * target futex if it is uncontended or via a lock steal.
+ *
+ * 1) Set @q::key to the requeue target futex key so the waiter can detect
+ * the wakeup on the right futex.
+ *
+ * 2) Dequeue @q from the hash bucket.
+ *
+ * 3) Set @q::rt_waiter to NULL so the woken up task can detect atomic lock
+ * acquisition.
+ *
+ * 4) Set the q->lock_ptr to the requeue target hb->lock for the case that
+ * the waiter has to fixup the pi state.
+ *
+ * 5) Complete the requeue state so the waiter can make progress. After
+ * this point the waiter task can return from the syscall immediately in
+ * case that the pi state does not have to be fixed up.
+ *
+ * 6) Wake the waiter task.
+ *
+ * Must be called with both q->lock_ptr and hb->lock held.
+ */
+static inline
+void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
+ struct futex_hash_bucket *hb)
+{
+ struct task_struct *task;
+
+ q->key = *key;
+ __futex_unqueue(q);
+
+ WARN_ON(!q->rt_waiter);
+ q->rt_waiter = NULL;
+ /*
+ * Acquire a reference for the waiter to ensure valid
+ * futex_q::lock_ptr.
+ */
+ futex_hash_get(hb);
+ q->drop_hb_ref = true;
+ q->lock_ptr = &hb->lock;
+ task = READ_ONCE(q->task);
+
+ /* Signal locked state to the waiter */
+ futex_requeue_pi_complete(q, 1);
+ wake_up_state(task, TASK_NORMAL);
+}
+
+/**
+ * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter
+ * @pifutex: the user address of the to futex
+ * @hb1: the from futex hash bucket, must be locked by the caller
+ * @hb2: the to futex hash bucket, must be locked by the caller
+ * @key1: the from futex key
+ * @key2: the to futex key
+ * @ps: address to store the pi_state pointer
+ * @exiting: Pointer to store the task pointer of the owner task
+ * which is in the middle of exiting
+ * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
+ *
+ * Try and get the lock on behalf of the top waiter if we can do it atomically.
+ * Wake the top waiter if we succeed. If the caller specified set_waiters,
+ * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
+ * hb1 and hb2 must be held by the caller.
+ *
+ * @exiting is only set when the return value is -EBUSY. If so, this holds
+ * a refcount on the exiting task on return and the caller needs to drop it
+ * after waiting for the exit to complete.
+ *
+ * Return:
+ * - 0 - failed to acquire the lock atomically;
+ * - >0 - acquired the lock, return value is vpid of the top_waiter
+ * - <0 - error
+ */
+static int
+futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
+ struct futex_hash_bucket *hb2, union futex_key *key1,
+ union futex_key *key2, struct futex_pi_state **ps,
+ struct task_struct **exiting, int set_waiters)
+{
+ struct futex_q *top_waiter;
+ u32 curval;
+ int ret;
+
+ if (futex_get_value_locked(&curval, pifutex))
+ return -EFAULT;
+
+ if (unlikely(should_fail_futex(true)))
+ return -EFAULT;
+
+ /*
+ * Find the top_waiter and determine if there are additional waiters.
+ * If the caller intends to requeue more than 1 waiter to pifutex,
+ * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now,
+ * as we have means to handle the possible fault. If not, don't set
+ * the bit unnecessarily as it will force the subsequent unlock to enter
+ * the kernel.
+ */
+ top_waiter = futex_top_waiter(hb1, key1);
+
+ /* There are no waiters, nothing for us to do. */
+ if (!top_waiter)
+ return 0;
+
+ /*
+ * Ensure that this is a waiter sitting in futex_wait_requeue_pi()
+ * and waiting on the 'waitqueue' futex which is always !PI.
+ */
+ if (!top_waiter->rt_waiter || top_waiter->pi_state)
+ return -EINVAL;
+
+ /* Ensure we requeue to the expected futex. */
+ if (!futex_match(top_waiter->requeue_pi_key, key2))
+ return -EINVAL;
+
+ /* Ensure that this does not race against an early wakeup */
+ if (!futex_requeue_pi_prepare(top_waiter, NULL))
+ return -EAGAIN;
+
+ /*
+ * Try to take the lock for top_waiter and set the FUTEX_WAITERS bit
+ * in the contended case or if @set_waiters is true.
+ *
+ * In the contended case PI state is attached to the lock owner. If
+ * the user space lock can be acquired then PI state is attached to
+ * the new owner (@top_waiter->task) when @set_waiters is true.
+ */
+ ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
+ exiting, set_waiters);
+ if (ret == 1) {
+ /*
+ * Lock was acquired in user space and PI state was
+ * attached to @top_waiter->task. That means state is fully
+ * consistent and the waiter can return to user space
+ * immediately after the wakeup.
+ */
+ requeue_pi_wake_futex(top_waiter, key2, hb2);
+ } else if (ret < 0) {
+ /* Rewind top_waiter::requeue_state */
+ futex_requeue_pi_complete(top_waiter, ret);
+ } else {
+ /*
+ * futex_lock_pi_atomic() did not acquire the user space
+ * futex, but managed to establish the proxy lock and pi
+ * state. top_waiter::requeue_state cannot be fixed up here
+ * because the waiter is not enqueued on the rtmutex
+ * yet. This is handled at the callsite depending on the
+ * result of rt_mutex_start_proxy_lock() which is
+ * guaranteed to be reached with this function returning 0.
+ */
+ }
+ return ret;
+}
+
+/**
+ * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
+ * @uaddr1: source futex user address
+ * @flags1: futex flags (FLAGS_SHARED, etc.)
+ * @uaddr2: target futex user address
+ * @flags2: futex flags (FLAGS_SHARED, etc.)
+ * @nr_wake: number of waiters to wake (must be 1 for requeue_pi)
+ * @nr_requeue: number of waiters to requeue (0-INT_MAX)
+ * @cmpval: @uaddr1 expected value (or %NULL)
+ * @requeue_pi: if we are attempting to requeue from a non-pi futex to a
+ * pi futex (pi to pi requeue is not supported)
+ *
+ * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
+ * uaddr2 atomically on behalf of the top waiter.
+ *
+ * Return:
+ * - >=0 - on success, the number of tasks requeued or woken;
+ * - <0 - on error
+ */
+int futex_requeue(u32 __user *uaddr1, unsigned int flags1,
+ u32 __user *uaddr2, unsigned int flags2,
+ int nr_wake, int nr_requeue, u32 *cmpval, int requeue_pi)
+{
+ union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
+ int task_count = 0, ret;
+ struct futex_pi_state *pi_state = NULL;
+ struct futex_q *this, *next;
+ DEFINE_WAKE_Q(wake_q);
+
+ if (nr_wake < 0 || nr_requeue < 0)
+ return -EINVAL;
+
+ /*
+ * When PI not supported: return -ENOSYS if requeue_pi is true,
+ * consequently the compiler knows requeue_pi is always false past
+ * this point which will optimize away all the conditional code
+ * further down.
+ */
+ if (!IS_ENABLED(CONFIG_FUTEX_PI) && requeue_pi)
+ return -ENOSYS;
+
+ if (requeue_pi) {
+ /*
+ * Requeue PI only works on two distinct uaddrs. This
+ * check is only valid for private futexes. See below.
+ */
+ if (uaddr1 == uaddr2)
+ return -EINVAL;
+
+ /*
+ * futex_requeue() allows the caller to define the number
+ * of waiters to wake up via the @nr_wake argument. With
+ * REQUEUE_PI, waking up more than one waiter is creating
+ * more problems than it solves. Waking up a waiter makes
+ * only sense if the PI futex @uaddr2 is uncontended as
+ * this allows the requeue code to acquire the futex
+ * @uaddr2 before waking the waiter. The waiter can then
+ * return to user space without further action. A secondary
+ * wakeup would just make the futex_wait_requeue_pi()
+ * handling more complex, because that code would have to
+ * look up pi_state and do more or less all the handling
+ * which the requeue code has to do for the to be requeued
+ * waiters. So restrict the number of waiters to wake to
+ * one, and only wake it up when the PI futex is
+ * uncontended. Otherwise requeue it and let the unlock of
+ * the PI futex handle the wakeup.
+ *
+ * All REQUEUE_PI users, e.g. pthread_cond_signal() and
+ * pthread_cond_broadcast() must use nr_wake=1.
+ */
+ if (nr_wake != 1)
+ return -EINVAL;
+
+ /*
+ * requeue_pi requires a pi_state, try to allocate it now
+ * without any locks in case it fails.
+ */
+ if (refill_pi_state_cache())
+ return -ENOMEM;
+ }
+
+retry:
+ ret = get_futex_key(uaddr1, flags1, &key1, FUTEX_READ);
+ if (unlikely(ret != 0))
+ return ret;
+ ret = get_futex_key(uaddr2, flags2, &key2,
+ requeue_pi ? FUTEX_WRITE : FUTEX_READ);
+ if (unlikely(ret != 0))
+ return ret;
+
+ /*
+ * The check above which compares uaddrs is not sufficient for
+ * shared futexes. We need to compare the keys:
+ */
+ if (requeue_pi && futex_match(&key1, &key2))
+ return -EINVAL;
+
+retry_private:
+ if (1) {
+ CLASS(hb, hb1)(&key1);
+ CLASS(hb, hb2)(&key2);
+
+ futex_hb_waiters_inc(hb2);
+ double_lock_hb(hb1, hb2);
+
+ if (likely(cmpval != NULL)) {
+ u32 curval;
+
+ ret = futex_get_value_locked(&curval, uaddr1);
+
+ if (unlikely(ret)) {
+ futex_hb_waiters_dec(hb2);
+ double_unlock_hb(hb1, hb2);
+
+ ret = get_user(curval, uaddr1);
+ if (ret)
+ return ret;
+
+ if (!(flags1 & FLAGS_SHARED))
+ goto retry_private;
+
+ goto retry;
+ }
+ if (curval != *cmpval) {
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
+ }
+
+ if (requeue_pi) {
+ struct task_struct *exiting = NULL;
+
+ /*
+ * Attempt to acquire uaddr2 and wake the top waiter. If we
+ * intend to requeue waiters, force setting the FUTEX_WAITERS
+ * bit. We force this here where we are able to easily handle
+ * faults rather in the requeue loop below.
+ *
+ * Updates topwaiter::requeue_state if a top waiter exists.
+ */
+ ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
+ &key2, &pi_state,
+ &exiting, nr_requeue);
+
+ /*
+ * At this point the top_waiter has either taken uaddr2 or
+ * is waiting on it. In both cases pi_state has been
+ * established and an initial refcount on it. In case of an
+ * error there's nothing.
+ *
+ * The top waiter's requeue_state is up to date:
+ *
+ * - If the lock was acquired atomically (ret == 1), then
+ * the state is Q_REQUEUE_PI_LOCKED.
+ *
+ * The top waiter has been dequeued and woken up and can
+ * return to user space immediately. The kernel/user
+ * space state is consistent. In case that there must be
+ * more waiters requeued the WAITERS bit in the user
+ * space futex is set so the top waiter task has to go
+ * into the syscall slowpath to unlock the futex. This
+ * will block until this requeue operation has been
+ * completed and the hash bucket locks have been
+ * dropped.
+ *
+ * - If the trylock failed with an error (ret < 0) then
+ * the state is either Q_REQUEUE_PI_NONE, i.e. "nothing
+ * happened", or Q_REQUEUE_PI_IGNORE when there was an
+ * interleaved early wakeup.
+ *
+ * - If the trylock did not succeed (ret == 0) then the
+ * state is either Q_REQUEUE_PI_IN_PROGRESS or
+ * Q_REQUEUE_PI_WAIT if an early wakeup interleaved.
+ * This will be cleaned up in the loop below, which
+ * cannot fail because futex_proxy_trylock_atomic() did
+ * the same sanity checks for requeue_pi as the loop
+ * below does.
+ */
+ switch (ret) {
+ case 0:
+ /* We hold a reference on the pi state. */
+ break;
+
+ case 1:
+ /*
+ * futex_proxy_trylock_atomic() acquired the user space
+ * futex. Adjust task_count.
+ */
+ task_count++;
+ ret = 0;
+ break;
+
+ /*
+ * If the above failed, then pi_state is NULL and
+ * waiter::requeue_state is correct.
+ */
+ case -EFAULT:
+ futex_hb_waiters_dec(hb2);
+ double_unlock_hb(hb1, hb2);
+ ret = fault_in_user_writeable(uaddr2);
+ if (!ret)
+ goto retry;
+ return ret;
+ case -EBUSY:
+ case -EAGAIN:
+ /*
+ * Two reasons for this:
+ * - EBUSY: Owner is exiting and we just wait for the
+ * exit to complete.
+ * - EAGAIN: The user space value changed.
+ */
+ futex_hb_waiters_dec(hb2);
+ double_unlock_hb(hb1, hb2);
+ /*
+ * Handle the case where the owner is in the middle of
+ * exiting. Wait for the exit to complete otherwise
+ * this task might loop forever, aka. live lock.
+ */
+ wait_for_owner_exiting(ret, exiting);
+ cond_resched();
+ goto retry;
+ default:
+ goto out_unlock;
+ }
+ }
+
+ plist_for_each_entry_safe(this, next, &hb1->chain, list) {
+ if (task_count - nr_wake >= nr_requeue)
+ break;
+
+ if (!futex_match(&this->key, &key1))
+ continue;
+
+ /*
+ * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always
+ * be paired with each other and no other futex ops.
+ *
+ * We should never be requeueing a futex_q with a pi_state,
+ * which is awaiting a futex_unlock_pi().
+ */
+ if ((requeue_pi && !this->rt_waiter) ||
+ (!requeue_pi && this->rt_waiter) ||
+ this->pi_state) {
+ ret = -EINVAL;
+ break;
+ }
+
+ /* Plain futexes just wake or requeue and are done */
+ if (!requeue_pi) {
+ if (++task_count <= nr_wake)
+ this->wake(&wake_q, this);
+ else
+ requeue_futex(this, hb1, hb2, &key2);
+ continue;
+ }
+
+ /* Ensure we requeue to the expected futex for requeue_pi. */
+ if (!futex_match(this->requeue_pi_key, &key2)) {
+ ret = -EINVAL;
+ break;
+ }
+
+ /*
+ * Requeue nr_requeue waiters and possibly one more in the case
+ * of requeue_pi if we couldn't acquire the lock atomically.
+ *
+ * Prepare the waiter to take the rt_mutex. Take a refcount
+ * on the pi_state and store the pointer in the futex_q
+ * object of the waiter.
+ */
+ get_pi_state(pi_state);
+
+ /* Don't requeue when the waiter is already on the way out. */
+ if (!futex_requeue_pi_prepare(this, pi_state)) {
+ /*
+ * Early woken waiter signaled that it is on the
+ * way out. Drop the pi_state reference and try the
+ * next waiter. @this->pi_state is still NULL.
+ */
+ put_pi_state(pi_state);
+ continue;
+ }
+
+ ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
+ this->rt_waiter,
+ this->task);
+
+ if (ret == 1) {
+ /*
+ * We got the lock. We do neither drop the refcount
+ * on pi_state nor clear this->pi_state because the
+ * waiter needs the pi_state for cleaning up the
+ * user space value. It will drop the refcount
+ * after doing so. this::requeue_state is updated
+ * in the wakeup as well.
+ */
+ requeue_pi_wake_futex(this, &key2, hb2);
+ task_count++;
+ } else if (!ret) {
+ /* Waiter is queued, move it to hb2 */
+ requeue_futex(this, hb1, hb2, &key2);
+ futex_requeue_pi_complete(this, 0);
+ task_count++;
+ } else {
+ /*
+ * rt_mutex_start_proxy_lock() detected a potential
+ * deadlock when we tried to queue that waiter.
+ * Drop the pi_state reference which we took above
+ * and remove the pointer to the state from the
+ * waiters futex_q object.
+ */
+ this->pi_state = NULL;
+ put_pi_state(pi_state);
+ futex_requeue_pi_complete(this, ret);
+ /*
+ * We stop queueing more waiters and let user space
+ * deal with the mess.
+ */
+ break;
+ }
+ }
+
+ /*
+ * We took an extra initial reference to the pi_state in
+ * futex_proxy_trylock_atomic(). We need to drop it here again.
+ */
+ put_pi_state(pi_state);
+
+out_unlock:
+ futex_hb_waiters_dec(hb2);
+ double_unlock_hb(hb1, hb2);
+ }
+ wake_up_q(&wake_q);
+ return ret ? ret : task_count;
+}
+
+/**
+ * handle_early_requeue_pi_wakeup() - Handle early wakeup on the initial futex
+ * @hb: the hash_bucket futex_q was original enqueued on
+ * @q: the futex_q woken while waiting to be requeued
+ * @timeout: the timeout associated with the wait (NULL if none)
+ *
+ * Determine the cause for the early wakeup.
+ *
+ * Return:
+ * -EWOULDBLOCK or -ETIMEDOUT or -ERESTARTNOINTR
+ */
+static inline
+int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
+ struct futex_q *q,
+ struct hrtimer_sleeper *timeout)
+{
+ int ret;
+
+ /*
+ * With the hb lock held, we avoid races while we process the wakeup.
+ * We only need to hold hb (and not hb2) to ensure atomicity as the
+ * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb.
+ * It can't be requeued from uaddr2 to something else since we don't
+ * support a PI aware source futex for requeue.
+ */
+ WARN_ON_ONCE(&hb->lock != q->lock_ptr);
+
+ /*
+ * We were woken prior to requeue by a timeout or a signal.
+ * Unqueue the futex_q and determine which it was.
+ */
+ plist_del(&q->list, &hb->chain);
+ futex_hb_waiters_dec(hb);
+
+ /* Handle spurious wakeups gracefully */
+ ret = -EWOULDBLOCK;
+ if (timeout && !timeout->task)
+ ret = -ETIMEDOUT;
+ else if (signal_pending(current))
+ ret = -ERESTARTNOINTR;
+ return ret;
+}
+
+/**
+ * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
+ * @uaddr: the futex we initially wait on (non-pi)
+ * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be
+ * the same type, no requeueing from private to shared, etc.
+ * @val: the expected value of uaddr
+ * @abs_time: absolute timeout
+ * @bitset: 32 bit wakeup bitset set by userspace, defaults to all
+ * @uaddr2: the pi futex we will take prior to returning to user-space
+ *
+ * The caller will wait on uaddr and will be requeued by futex_requeue() to
+ * uaddr2 which must be PI aware and unique from uaddr. Normal wakeup will wake
+ * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to
+ * userspace. This ensures the rt_mutex maintains an owner when it has waiters;
+ * without one, the pi logic would not know which task to boost/deboost, if
+ * there was a need to.
+ *
+ * We call schedule in futex_wait_queue() when we enqueue and return there
+ * via the following--
+ * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
+ * 2) wakeup on uaddr2 after a requeue
+ * 3) signal
+ * 4) timeout
+ *
+ * If 3, cleanup and return -ERESTARTNOINTR.
+ *
+ * If 2, we may then block on trying to take the rt_mutex and return via:
+ * 5) successful lock
+ * 6) signal
+ * 7) timeout
+ * 8) other lock acquisition failure
+ *
+ * If 6, return -EWOULDBLOCK (restarting the syscall would do the same).
+ *
+ * If 4 or 7, we cleanup and return with -ETIMEDOUT.
+ *
+ * Return:
+ * - 0 - On success;
+ * - <0 - On error
+ */
+int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
+ u32 val, ktime_t *abs_time, u32 bitset,
+ u32 __user *uaddr2)
+{
+ struct hrtimer_sleeper timeout, *to;
+ struct rt_mutex_waiter rt_waiter;
+ union futex_key key2 = FUTEX_KEY_INIT;
+ struct futex_q q = futex_q_init;
+ struct rt_mutex_base *pi_mutex;
+ int res, ret;
+
+ if (!IS_ENABLED(CONFIG_FUTEX_PI))
+ return -ENOSYS;
+
+ if (uaddr == uaddr2)
+ return -EINVAL;
+
+ if (!bitset)
+ return -EINVAL;
+
+ to = futex_setup_timer(abs_time, &timeout, flags,
+ current->timer_slack_ns);
+
+ /*
+ * The waiter is allocated on our stack, manipulated by the requeue
+ * code while we sleep on uaddr.
+ */
+ rt_mutex_init_waiter(&rt_waiter);
+
+ ret = get_futex_key(uaddr2, flags, &key2, FUTEX_WRITE);
+ if (unlikely(ret != 0))
+ goto out;
+
+ q.bitset = bitset;
+ q.rt_waiter = &rt_waiter;
+ q.requeue_pi_key = &key2;
+
+ /*
+ * Prepare to wait on uaddr. On success, it holds hb->lock and q
+ * is initialized.
+ */
+ ret = futex_wait_setup(uaddr, val, flags, &q, &key2, current);
+ if (ret)
+ goto out;
+
+ /* Queue the futex_q, drop the hb lock, wait for wakeup. */
+ futex_do_wait(&q, to);
+
+ switch (futex_requeue_pi_wakeup_sync(&q)) {
+ case Q_REQUEUE_PI_IGNORE:
+ {
+ CLASS(hb, hb)(&q.key);
+ /* The waiter is still on uaddr1 */
+ spin_lock(&hb->lock);
+ ret = handle_early_requeue_pi_wakeup(hb, &q, to);
+ spin_unlock(&hb->lock);
+ }
+ break;
+
+ case Q_REQUEUE_PI_LOCKED:
+ /* The requeue acquired the lock */
+ if (q.pi_state && (q.pi_state->owner != current)) {
+ futex_q_lockptr_lock(&q);
+ ret = fixup_pi_owner(uaddr2, &q, true);
+ /*
+ * Drop the reference to the pi state which the
+ * requeue_pi() code acquired for us.
+ */
+ put_pi_state(q.pi_state);
+ spin_unlock(q.lock_ptr);
+ /*
+ * Adjust the return value. It's either -EFAULT or
+ * success (1) but the caller expects 0 for success.
+ */
+ ret = ret < 0 ? ret : 0;
+ }
+ break;
+
+ case Q_REQUEUE_PI_DONE:
+ /* Requeue completed. Current is 'pi_blocked_on' the rtmutex */
+ pi_mutex = &q.pi_state->pi_mutex;
+ ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
+
+ /*
+ * See futex_unlock_pi()'s cleanup: comment.
+ */
+ if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
+ ret = 0;
+
+ futex_q_lockptr_lock(&q);
+ debug_rt_mutex_free_waiter(&rt_waiter);
+ /*
+ * Fixup the pi_state owner and possibly acquire the lock if we
+ * haven't already.
+ */
+ res = fixup_pi_owner(uaddr2, &q, !ret);
+ /*
+ * If fixup_pi_owner() returned an error, propagate that. If it
+ * acquired the lock, clear -ETIMEDOUT or -EINTR.
+ */
+ if (res)
+ ret = (res < 0) ? res : 0;
+
+ futex_unqueue_pi(&q);
+ spin_unlock(q.lock_ptr);
+
+ if (ret == -EINTR) {
+ /*
+ * We've already been requeued, but cannot restart
+ * by calling futex_lock_pi() directly. We could
+ * restart this syscall, but it would detect that
+ * the user space "val" changed and return
+ * -EWOULDBLOCK. Save the overhead of the restart
+ * and return -EWOULDBLOCK directly.
+ */
+ ret = -EWOULDBLOCK;
+ }
+ break;
+ default:
+ BUG();
+ }
+ if (q.drop_hb_ref) {
+ CLASS(hb, hb)(&q.key);
+ /* Additional reference from requeue_pi_wake_futex() */
+ futex_hash_put(hb);
+ }
+
+out:
+ if (to) {
+ hrtimer_cancel(&to->timer);
+ destroy_hrtimer_on_stack(&to->timer);
+ }
+ return ret;
+}
+
diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c
new file mode 100644
index 000000000000..880c9bf2f315
--- /dev/null
+++ b/kernel/futex/syscalls.c
@@ -0,0 +1,518 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/syscalls.h>
+#include <linux/time_namespace.h>
+
+#include "futex.h"
+
+/*
+ * Support for robust futexes: the kernel cleans up held futexes at
+ * thread exit time.
+ *
+ * Implementation: user-space maintains a per-thread list of locks it
+ * is holding. Upon do_exit(), the kernel carefully walks this list,
+ * and marks all locks that are owned by this thread with the
+ * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
+ * always manipulated with the lock held, so the list is private and
+ * per-thread. Userspace also maintains a per-thread 'list_op_pending'
+ * field, to allow the kernel to clean up if the thread dies after
+ * acquiring the lock, but just before it could have added itself to
+ * the list. There can only be one such pending lock.
+ */
+
+/**
+ * sys_set_robust_list() - Set the robust-futex list head of a task
+ * @head: pointer to the list-head
+ * @len: length of the list-head, as userspace expects
+ */
+SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
+ size_t, len)
+{
+ /*
+ * The kernel knows only one size for now:
+ */
+ if (unlikely(len != sizeof(*head)))
+ return -EINVAL;
+
+ current->robust_list = head;
+
+ return 0;
+}
+
+static inline void __user *futex_task_robust_list(struct task_struct *p, bool compat)
+{
+#ifdef CONFIG_COMPAT
+ if (compat)
+ return p->compat_robust_list;
+#endif
+ return p->robust_list;
+}
+
+static void __user *futex_get_robust_list_common(int pid, bool compat)
+{
+ struct task_struct *p = current;
+ void __user *head;
+ int ret;
+
+ scoped_guard(rcu) {
+ if (pid) {
+ p = find_task_by_vpid(pid);
+ if (!p)
+ return (void __user *)ERR_PTR(-ESRCH);
+ }
+ get_task_struct(p);
+ }
+
+ /*
+ * Hold exec_update_lock to serialize with concurrent exec()
+ * so ptrace_may_access() is checked against stable credentials
+ */
+ ret = down_read_killable(&p->signal->exec_update_lock);
+ if (ret)
+ goto err_put;
+
+ ret = -EPERM;
+ if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
+ goto err_unlock;
+
+ head = futex_task_robust_list(p, compat);
+
+ up_read(&p->signal->exec_update_lock);
+ put_task_struct(p);
+
+ return head;
+
+err_unlock:
+ up_read(&p->signal->exec_update_lock);
+err_put:
+ put_task_struct(p);
+ return (void __user *)ERR_PTR(ret);
+}
+
+/**
+ * sys_get_robust_list() - Get the robust-futex list head of a task
+ * @pid: pid of the process [zero for current task]
+ * @head_ptr: pointer to a list-head pointer, the kernel fills it in
+ * @len_ptr: pointer to a length field, the kernel fills in the header size
+ */
+SYSCALL_DEFINE3(get_robust_list, int, pid,
+ struct robust_list_head __user * __user *, head_ptr,
+ size_t __user *, len_ptr)
+{
+ struct robust_list_head __user *head = futex_get_robust_list_common(pid, false);
+
+ if (IS_ERR(head))
+ return PTR_ERR(head);
+
+ if (put_user(sizeof(*head), len_ptr))
+ return -EFAULT;
+ return put_user(head, head_ptr);
+}
+
+long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
+ u32 __user *uaddr2, u32 val2, u32 val3)
+{
+ unsigned int flags = futex_to_flags(op);
+ int cmd = op & FUTEX_CMD_MASK;
+
+ if (flags & FLAGS_CLOCKRT) {
+ if (cmd != FUTEX_WAIT_BITSET &&
+ cmd != FUTEX_WAIT_REQUEUE_PI &&
+ cmd != FUTEX_LOCK_PI2)
+ return -ENOSYS;
+ }
+
+ switch (cmd) {
+ case FUTEX_WAIT:
+ val3 = FUTEX_BITSET_MATCH_ANY;
+ fallthrough;
+ case FUTEX_WAIT_BITSET:
+ return futex_wait(uaddr, flags, val, timeout, val3);
+ case FUTEX_WAKE:
+ val3 = FUTEX_BITSET_MATCH_ANY;
+ fallthrough;
+ case FUTEX_WAKE_BITSET:
+ return futex_wake(uaddr, flags, val, val3);
+ case FUTEX_REQUEUE:
+ return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, NULL, 0);
+ case FUTEX_CMP_REQUEUE:
+ return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, &val3, 0);
+ case FUTEX_WAKE_OP:
+ return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
+ case FUTEX_LOCK_PI:
+ flags |= FLAGS_CLOCKRT;
+ fallthrough;
+ case FUTEX_LOCK_PI2:
+ return futex_lock_pi(uaddr, flags, timeout, 0);
+ case FUTEX_UNLOCK_PI:
+ return futex_unlock_pi(uaddr, flags);
+ case FUTEX_TRYLOCK_PI:
+ return futex_lock_pi(uaddr, flags, NULL, 1);
+ case FUTEX_WAIT_REQUEUE_PI:
+ val3 = FUTEX_BITSET_MATCH_ANY;
+ return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
+ uaddr2);
+ case FUTEX_CMP_REQUEUE_PI:
+ return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, &val3, 1);
+ }
+ return -ENOSYS;
+}
+
+static __always_inline bool futex_cmd_has_timeout(u32 cmd)
+{
+ switch (cmd) {
+ case FUTEX_WAIT:
+ case FUTEX_LOCK_PI:
+ case FUTEX_LOCK_PI2:
+ case FUTEX_WAIT_BITSET:
+ case FUTEX_WAIT_REQUEUE_PI:
+ return true;
+ }
+ return false;
+}
+
+static __always_inline int
+futex_init_timeout(u32 cmd, u32 op, struct timespec64 *ts, ktime_t *t)
+{
+ if (!timespec64_valid(ts))
+ return -EINVAL;
+
+ *t = timespec64_to_ktime(*ts);
+ if (cmd == FUTEX_WAIT)
+ *t = ktime_add_safe(ktime_get(), *t);
+ else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME))
+ *t = timens_ktime_to_host(CLOCK_MONOTONIC, *t);
+ return 0;
+}
+
+SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
+ const struct __kernel_timespec __user *, utime,
+ u32 __user *, uaddr2, u32, val3)
+{
+ int ret, cmd = op & FUTEX_CMD_MASK;
+ ktime_t t, *tp = NULL;
+ struct timespec64 ts;
+
+ if (utime && futex_cmd_has_timeout(cmd)) {
+ if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG))))
+ return -EFAULT;
+ if (get_timespec64(&ts, utime))
+ return -EFAULT;
+ ret = futex_init_timeout(cmd, op, &ts, &t);
+ if (ret)
+ return ret;
+ tp = &t;
+ }
+
+ return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3);
+}
+
+/**
+ * futex_parse_waitv - Parse a waitv array from userspace
+ * @futexv: Kernel side list of waiters to be filled
+ * @uwaitv: Userspace list to be parsed
+ * @nr_futexes: Length of futexv
+ * @wake: Wake to call when futex is woken
+ * @wake_data: Data for the wake handler
+ *
+ * Return: Error code on failure, 0 on success
+ */
+int futex_parse_waitv(struct futex_vector *futexv,
+ struct futex_waitv __user *uwaitv,
+ unsigned int nr_futexes, futex_wake_fn *wake,
+ void *wake_data)
+{
+ struct futex_waitv aux;
+ unsigned int i;
+
+ for (i = 0; i < nr_futexes; i++) {
+ unsigned int flags;
+
+ if (copy_from_user(&aux, &uwaitv[i], sizeof(aux)))
+ return -EFAULT;
+
+ if ((aux.flags & ~FUTEX2_VALID_MASK) || aux.__reserved)
+ return -EINVAL;
+
+ flags = futex2_to_flags(aux.flags);
+ if (!futex_flags_valid(flags))
+ return -EINVAL;
+
+ if (!futex_validate_input(flags, aux.val))
+ return -EINVAL;
+
+ futexv[i].w.flags = flags;
+ futexv[i].w.val = aux.val;
+ futexv[i].w.uaddr = aux.uaddr;
+ futexv[i].q = futex_q_init;
+ futexv[i].q.wake = wake;
+ futexv[i].q.wake_data = wake_data;
+ }
+
+ return 0;
+}
+
+static int futex2_setup_timeout(struct __kernel_timespec __user *timeout,
+ clockid_t clockid, struct hrtimer_sleeper *to)
+{
+ int flag_clkid = 0, flag_init = 0;
+ struct timespec64 ts;
+ ktime_t time;
+ int ret;
+
+ if (!timeout)
+ return 0;
+
+ if (clockid == CLOCK_REALTIME) {
+ flag_clkid = FLAGS_CLOCKRT;
+ flag_init = FUTEX_CLOCK_REALTIME;
+ }
+
+ if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC)
+ return -EINVAL;
+
+ if (get_timespec64(&ts, timeout))
+ return -EFAULT;
+
+ /*
+ * Since there's no opcode for futex_waitv, use
+ * FUTEX_WAIT_BITSET that uses absolute timeout as well
+ */
+ ret = futex_init_timeout(FUTEX_WAIT_BITSET, flag_init, &ts, &time);
+ if (ret)
+ return ret;
+
+ futex_setup_timer(&time, to, flag_clkid, 0);
+ return 0;
+}
+
+static inline void futex2_destroy_timeout(struct hrtimer_sleeper *to)
+{
+ hrtimer_cancel(&to->timer);
+ destroy_hrtimer_on_stack(&to->timer);
+}
+
+/**
+ * sys_futex_waitv - Wait on a list of futexes
+ * @waiters: List of futexes to wait on
+ * @nr_futexes: Length of futexv
+ * @flags: Flag for timeout (monotonic/realtime)
+ * @timeout: Optional absolute timeout.
+ * @clockid: Clock to be used for the timeout, realtime or monotonic.
+ *
+ * Given an array of `struct futex_waitv`, wait on each uaddr. The thread wakes
+ * if a futex_wake() is performed at any uaddr. The syscall returns immediately
+ * if any waiter has *uaddr != val. *timeout is an optional timeout value for
+ * the operation. Each waiter has individual flags. The `flags` argument for
+ * the syscall should be used solely for specifying the timeout as realtime, if
+ * needed. Flags for private futexes, sizes, etc. should be used on the
+ * individual flags of each waiter.
+ *
+ * Returns the array index of one of the woken futexes. No further information
+ * is provided: any number of other futexes may also have been woken by the
+ * same event, and if more than one futex was woken, the retrned index may
+ * refer to any one of them. (It is not necessaryily the futex with the
+ * smallest index, nor the one most recently woken, nor...)
+ */
+
+SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters,
+ unsigned int, nr_futexes, unsigned int, flags,
+ struct __kernel_timespec __user *, timeout, clockid_t, clockid)
+{
+ struct hrtimer_sleeper to;
+ struct futex_vector *futexv;
+ int ret;
+
+ /* This syscall supports no flags for now */
+ if (flags)
+ return -EINVAL;
+
+ if (!nr_futexes || nr_futexes > FUTEX_WAITV_MAX || !waiters)
+ return -EINVAL;
+
+ if (timeout && (ret = futex2_setup_timeout(timeout, clockid, &to)))
+ return ret;
+
+ futexv = kcalloc(nr_futexes, sizeof(*futexv), GFP_KERNEL);
+ if (!futexv) {
+ ret = -ENOMEM;
+ goto destroy_timer;
+ }
+
+ ret = futex_parse_waitv(futexv, waiters, nr_futexes, futex_wake_mark,
+ NULL);
+ if (!ret)
+ ret = futex_wait_multiple(futexv, nr_futexes, timeout ? &to : NULL);
+
+ kfree(futexv);
+
+destroy_timer:
+ if (timeout)
+ futex2_destroy_timeout(&to);
+ return ret;
+}
+
+/*
+ * sys_futex_wake - Wake a number of futexes
+ * @uaddr: Address of the futex(es) to wake
+ * @mask: bitmask
+ * @nr: Number of the futexes to wake
+ * @flags: FUTEX2 flags
+ *
+ * Identical to the traditional FUTEX_WAKE_BITSET op, except it is part of the
+ * futex2 family of calls.
+ */
+
+SYSCALL_DEFINE4(futex_wake,
+ void __user *, uaddr,
+ unsigned long, mask,
+ int, nr,
+ unsigned int, flags)
+{
+ if (flags & ~FUTEX2_VALID_MASK)
+ return -EINVAL;
+
+ flags = futex2_to_flags(flags);
+ if (!futex_flags_valid(flags))
+ return -EINVAL;
+
+ if (!futex_validate_input(flags, mask))
+ return -EINVAL;
+
+ return futex_wake(uaddr, FLAGS_STRICT | flags, nr, mask);
+}
+
+/*
+ * sys_futex_wait - Wait on a futex
+ * @uaddr: Address of the futex to wait on
+ * @val: Value of @uaddr
+ * @mask: bitmask
+ * @flags: FUTEX2 flags
+ * @timeout: Optional absolute timeout
+ * @clockid: Clock to be used for the timeout, realtime or monotonic
+ *
+ * Identical to the traditional FUTEX_WAIT_BITSET op, except it is part of the
+ * futex2 familiy of calls.
+ */
+
+SYSCALL_DEFINE6(futex_wait,
+ void __user *, uaddr,
+ unsigned long, val,
+ unsigned long, mask,
+ unsigned int, flags,
+ struct __kernel_timespec __user *, timeout,
+ clockid_t, clockid)
+{
+ struct hrtimer_sleeper to;
+ int ret;
+
+ if (flags & ~FUTEX2_VALID_MASK)
+ return -EINVAL;
+
+ flags = futex2_to_flags(flags);
+ if (!futex_flags_valid(flags))
+ return -EINVAL;
+
+ if (!futex_validate_input(flags, val) ||
+ !futex_validate_input(flags, mask))
+ return -EINVAL;
+
+ if (timeout && (ret = futex2_setup_timeout(timeout, clockid, &to)))
+ return ret;
+
+ ret = __futex_wait(uaddr, flags, val, timeout ? &to : NULL, mask);
+
+ if (timeout)
+ futex2_destroy_timeout(&to);
+
+ return ret;
+}
+
+/*
+ * sys_futex_requeue - Requeue a waiter from one futex to another
+ * @waiters: array describing the source and destination futex
+ * @flags: unused
+ * @nr_wake: number of futexes to wake
+ * @nr_requeue: number of futexes to requeue
+ *
+ * Identical to the traditional FUTEX_CMP_REQUEUE op, except it is part of the
+ * futex2 family of calls.
+ */
+
+SYSCALL_DEFINE4(futex_requeue,
+ struct futex_waitv __user *, waiters,
+ unsigned int, flags,
+ int, nr_wake,
+ int, nr_requeue)
+{
+ struct futex_vector futexes[2];
+ u32 cmpval;
+ int ret;
+
+ if (flags)
+ return -EINVAL;
+
+ if (!waiters)
+ return -EINVAL;
+
+ ret = futex_parse_waitv(futexes, waiters, 2, futex_wake_mark, NULL);
+ if (ret)
+ return ret;
+
+ cmpval = futexes[0].w.val;
+
+ return futex_requeue(u64_to_user_ptr(futexes[0].w.uaddr), futexes[0].w.flags,
+ u64_to_user_ptr(futexes[1].w.uaddr), futexes[1].w.flags,
+ nr_wake, nr_requeue, &cmpval, 0);
+}
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE2(set_robust_list,
+ struct compat_robust_list_head __user *, head,
+ compat_size_t, len)
+{
+ if (unlikely(len != sizeof(*head)))
+ return -EINVAL;
+
+ current->compat_robust_list = head;
+
+ return 0;
+}
+
+COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
+ compat_uptr_t __user *, head_ptr,
+ compat_size_t __user *, len_ptr)
+{
+ struct compat_robust_list_head __user *head = futex_get_robust_list_common(pid, true);
+
+ if (IS_ERR(head))
+ return PTR_ERR(head);
+
+ if (put_user(sizeof(*head), len_ptr))
+ return -EFAULT;
+ return put_user(ptr_to_compat(head), head_ptr);
+}
+#endif /* CONFIG_COMPAT */
+
+#ifdef CONFIG_COMPAT_32BIT_TIME
+SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val,
+ const struct old_timespec32 __user *, utime, u32 __user *, uaddr2,
+ u32, val3)
+{
+ int ret, cmd = op & FUTEX_CMD_MASK;
+ ktime_t t, *tp = NULL;
+ struct timespec64 ts;
+
+ if (utime && futex_cmd_has_timeout(cmd)) {
+ if (get_old_timespec32(&ts, utime))
+ return -EFAULT;
+ ret = futex_init_timeout(cmd, op, &ts, &t);
+ if (ret)
+ return ret;
+ tp = &t;
+ }
+
+ return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3);
+}
+#endif /* CONFIG_COMPAT_32BIT_TIME */
+
diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c
new file mode 100644
index 000000000000..e2bbe5509ec2
--- /dev/null
+++ b/kernel/futex/waitwake.c
@@ -0,0 +1,752 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/plist.h>
+#include <linux/sched/task.h>
+#include <linux/sched/signal.h>
+#include <linux/freezer.h>
+
+#include "futex.h"
+
+/*
+ * READ this before attempting to hack on futexes!
+ *
+ * Basic futex operation and ordering guarantees
+ * =============================================
+ *
+ * The waiter reads the futex value in user space and calls
+ * futex_wait(). This function computes the hash bucket and acquires
+ * the hash bucket lock. After that it reads the futex user space value
+ * again and verifies that the data has not changed. If it has not changed
+ * it enqueues itself into the hash bucket, releases the hash bucket lock
+ * and schedules.
+ *
+ * The waker side modifies the user space value of the futex and calls
+ * futex_wake(). This function computes the hash bucket and acquires the
+ * hash bucket lock. Then it looks for waiters on that futex in the hash
+ * bucket and wakes them.
+ *
+ * In futex wake up scenarios where no tasks are blocked on a futex, taking
+ * the hb spinlock can be avoided and simply return. In order for this
+ * optimization to work, ordering guarantees must exist so that the waiter
+ * being added to the list is acknowledged when the list is concurrently being
+ * checked by the waker, avoiding scenarios like the following:
+ *
+ * CPU 0 CPU 1
+ * val = *futex;
+ * sys_futex(WAIT, futex, val);
+ * futex_wait(futex, val);
+ * uval = *futex;
+ * *futex = newval;
+ * sys_futex(WAKE, futex);
+ * futex_wake(futex);
+ * if (queue_empty())
+ * return;
+ * if (uval == val)
+ * lock(hash_bucket(futex));
+ * queue();
+ * unlock(hash_bucket(futex));
+ * schedule();
+ *
+ * This would cause the waiter on CPU 0 to wait forever because it
+ * missed the transition of the user space value from val to newval
+ * and the waker did not find the waiter in the hash bucket queue.
+ *
+ * The correct serialization ensures that a waiter either observes
+ * the changed user space value before blocking or is woken by a
+ * concurrent waker:
+ *
+ * CPU 0 CPU 1
+ * val = *futex;
+ * sys_futex(WAIT, futex, val);
+ * futex_wait(futex, val);
+ *
+ * waiters++; (a)
+ * smp_mb(); (A) <-- paired with -.
+ * |
+ * lock(hash_bucket(futex)); |
+ * |
+ * uval = *futex; |
+ * | *futex = newval;
+ * | sys_futex(WAKE, futex);
+ * | futex_wake(futex);
+ * |
+ * `--------> smp_mb(); (B)
+ * if (uval == val)
+ * queue();
+ * unlock(hash_bucket(futex));
+ * schedule(); if (waiters)
+ * lock(hash_bucket(futex));
+ * else wake_waiters(futex);
+ * waiters--; (b) unlock(hash_bucket(futex));
+ *
+ * Where (A) orders the waiters increment and the futex value read through
+ * atomic operations (see futex_hb_waiters_inc) and where (B) orders the write
+ * to futex and the waiters read (see futex_hb_waiters_pending()).
+ *
+ * This yields the following case (where X:=waiters, Y:=futex):
+ *
+ * X = Y = 0
+ *
+ * w[X]=1 w[Y]=1
+ * MB MB
+ * r[Y]=y r[X]=x
+ *
+ * Which guarantees that x==0 && y==0 is impossible; which translates back into
+ * the guarantee that we cannot both miss the futex variable change and the
+ * enqueue.
+ *
+ * Note that a new waiter is accounted for in (a) even when it is possible that
+ * the wait call can return error, in which case we backtrack from it in (b).
+ * Refer to the comment in futex_q_lock().
+ *
+ * Similarly, in order to account for waiters being requeued on another
+ * address we always increment the waiters for the destination bucket before
+ * acquiring the lock. It then decrements them again after releasing it -
+ * the code that actually moves the futex(es) between hash buckets (requeue_futex)
+ * will do the additional required waiter count housekeeping. This is done for
+ * double_lock_hb() and double_unlock_hb(), respectively.
+ */
+
+bool __futex_wake_mark(struct futex_q *q)
+{
+ if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
+ return false;
+
+ __futex_unqueue(q);
+ /*
+ * The waiting task can free the futex_q as soon as q->lock_ptr = NULL
+ * is written, without taking any locks. This is possible in the event
+ * of a spurious wakeup, for example. A memory barrier is required here
+ * to prevent the following store to lock_ptr from getting ahead of the
+ * plist_del in __futex_unqueue().
+ */
+ smp_store_release(&q->lock_ptr, NULL);
+
+ return true;
+}
+
+/*
+ * The hash bucket lock must be held when this is called.
+ * Afterwards, the futex_q must not be accessed. Callers
+ * must ensure to later call wake_up_q() for the actual
+ * wakeups to occur.
+ */
+void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q)
+{
+ struct task_struct *p = q->task;
+
+ get_task_struct(p);
+
+ if (!__futex_wake_mark(q)) {
+ put_task_struct(p);
+ return;
+ }
+
+ /*
+ * Queue the task for later wakeup for after we've released
+ * the hb->lock.
+ */
+ wake_q_add_safe(wake_q, p);
+}
+
+/*
+ * Wake up waiters matching bitset queued on this futex (uaddr).
+ */
+int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
+{
+ struct futex_q *this, *next;
+ union futex_key key = FUTEX_KEY_INIT;
+ DEFINE_WAKE_Q(wake_q);
+ int ret;
+
+ if (!bitset)
+ return -EINVAL;
+
+ ret = get_futex_key(uaddr, flags, &key, FUTEX_READ);
+ if (unlikely(ret != 0))
+ return ret;
+
+ if ((flags & FLAGS_STRICT) && !nr_wake)
+ return 0;
+
+ CLASS(hb, hb)(&key);
+
+ /* Make sure we really have tasks to wakeup */
+ if (!futex_hb_waiters_pending(hb))
+ return ret;
+
+ spin_lock(&hb->lock);
+
+ plist_for_each_entry_safe(this, next, &hb->chain, list) {
+ if (futex_match (&this->key, &key)) {
+ if (this->pi_state || this->rt_waiter) {
+ ret = -EINVAL;
+ break;
+ }
+
+ /* Check if one of the bits is set in both bitsets */
+ if (!(this->bitset & bitset))
+ continue;
+
+ this->wake(&wake_q, this);
+ if (++ret >= nr_wake)
+ break;
+ }
+ }
+
+ spin_unlock(&hb->lock);
+ wake_up_q(&wake_q);
+ return ret;
+}
+
+static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr)
+{
+ unsigned int op = (encoded_op & 0x70000000) >> 28;
+ unsigned int cmp = (encoded_op & 0x0f000000) >> 24;
+ int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 11);
+ int cmparg = sign_extend32(encoded_op & 0x00000fff, 11);
+ int oldval, ret;
+
+ if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) {
+ if (oparg < 0 || oparg > 31) {
+ /*
+ * kill this print and return -EINVAL when userspace
+ * is sane again
+ */
+ pr_info_ratelimited("futex_wake_op: %s tries to shift op by %d; fix this program\n",
+ current->comm, oparg);
+ oparg &= 31;
+ }
+ oparg = 1 << oparg;
+ }
+
+ pagefault_disable();
+ ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr);
+ pagefault_enable();
+ if (ret)
+ return ret;
+
+ switch (cmp) {
+ case FUTEX_OP_CMP_EQ:
+ return oldval == cmparg;
+ case FUTEX_OP_CMP_NE:
+ return oldval != cmparg;
+ case FUTEX_OP_CMP_LT:
+ return oldval < cmparg;
+ case FUTEX_OP_CMP_GE:
+ return oldval >= cmparg;
+ case FUTEX_OP_CMP_LE:
+ return oldval <= cmparg;
+ case FUTEX_OP_CMP_GT:
+ return oldval > cmparg;
+ default:
+ return -ENOSYS;
+ }
+}
+
+/*
+ * Wake up all waiters hashed on the physical page that is mapped
+ * to this virtual address:
+ */
+int futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
+ int nr_wake, int nr_wake2, int op)
+{
+ union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
+ struct futex_q *this, *next;
+ int ret, op_ret;
+ DEFINE_WAKE_Q(wake_q);
+
+retry:
+ ret = get_futex_key(uaddr1, flags, &key1, FUTEX_READ);
+ if (unlikely(ret != 0))
+ return ret;
+ ret = get_futex_key(uaddr2, flags, &key2, FUTEX_WRITE);
+ if (unlikely(ret != 0))
+ return ret;
+
+retry_private:
+ if (1) {
+ CLASS(hb, hb1)(&key1);
+ CLASS(hb, hb2)(&key2);
+
+ double_lock_hb(hb1, hb2);
+ op_ret = futex_atomic_op_inuser(op, uaddr2);
+ if (unlikely(op_ret < 0)) {
+ double_unlock_hb(hb1, hb2);
+
+ if (!IS_ENABLED(CONFIG_MMU) ||
+ unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) {
+ /*
+ * we don't get EFAULT from MMU faults if we don't have
+ * an MMU, but we might get them from range checking
+ */
+ ret = op_ret;
+ return ret;
+ }
+
+ if (op_ret == -EFAULT) {
+ ret = fault_in_user_writeable(uaddr2);
+ if (ret)
+ return ret;
+ }
+
+ cond_resched();
+ if (!(flags & FLAGS_SHARED))
+ goto retry_private;
+ goto retry;
+ }
+
+ plist_for_each_entry_safe(this, next, &hb1->chain, list) {
+ if (futex_match(&this->key, &key1)) {
+ if (this->pi_state || this->rt_waiter) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+ this->wake(&wake_q, this);
+ if (++ret >= nr_wake)
+ break;
+ }
+ }
+
+ if (op_ret > 0) {
+ op_ret = 0;
+ plist_for_each_entry_safe(this, next, &hb2->chain, list) {
+ if (futex_match(&this->key, &key2)) {
+ if (this->pi_state || this->rt_waiter) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+ this->wake(&wake_q, this);
+ if (++op_ret >= nr_wake2)
+ break;
+ }
+ }
+ ret += op_ret;
+ }
+
+out_unlock:
+ double_unlock_hb(hb1, hb2);
+ }
+ wake_up_q(&wake_q);
+ return ret;
+}
+
+static long futex_wait_restart(struct restart_block *restart);
+
+/**
+ * futex_do_wait() - wait for wakeup, timeout, or signal
+ * @q: the futex_q to queue up on
+ * @timeout: the prepared hrtimer_sleeper, or null for no timeout
+ */
+void futex_do_wait(struct futex_q *q, struct hrtimer_sleeper *timeout)
+{
+ /* Arm the timer */
+ if (timeout)
+ hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS);
+
+ /*
+ * If we have been removed from the hash list, then another task
+ * has tried to wake us, and we can skip the call to schedule().
+ */
+ if (likely(!plist_node_empty(&q->list))) {
+ /*
+ * If the timer has already expired, current will already be
+ * flagged for rescheduling. Only call schedule if there
+ * is no timeout, or if it has yet to expire.
+ */
+ if (!timeout || timeout->task)
+ schedule();
+ }
+ __set_current_state(TASK_RUNNING);
+}
+
+/**
+ * futex_unqueue_multiple - Remove various futexes from their hash bucket
+ * @v: The list of futexes to unqueue
+ * @count: Number of futexes in the list
+ *
+ * Helper to unqueue a list of futexes. This can't fail.
+ *
+ * Return:
+ * - >=0 - Index of the last futex that was awoken;
+ * - -1 - No futex was awoken
+ */
+int futex_unqueue_multiple(struct futex_vector *v, int count)
+{
+ int ret = -1, i;
+
+ for (i = 0; i < count; i++) {
+ if (!futex_unqueue(&v[i].q))
+ ret = i;
+ }
+
+ return ret;
+}
+
+/**
+ * futex_wait_multiple_setup - Prepare to wait and enqueue multiple futexes
+ * @vs: The futex list to wait on
+ * @count: The size of the list
+ * @woken: Index of the last woken futex, if any. Used to notify the
+ * caller that it can return this index to userspace (return parameter)
+ *
+ * Prepare multiple futexes in a single step and enqueue them. This may fail if
+ * the futex list is invalid or if any futex was already awoken. On success the
+ * task is ready to interruptible sleep.
+ *
+ * Return:
+ * - 1 - One of the futexes was woken by another thread
+ * - 0 - Success
+ * - <0 - -EFAULT, -EWOULDBLOCK or -EINVAL
+ */
+int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken)
+{
+ bool retry = false;
+ int ret, i;
+ u32 uval;
+
+ /*
+ * Make sure to have a reference on the private_hash such that we
+ * don't block on rehash after changing the task state below.
+ */
+ guard(private_hash)();
+
+ /*
+ * Enqueuing multiple futexes is tricky, because we need to enqueue
+ * each futex on the list before dealing with the next one to avoid
+ * deadlocking on the hash bucket. But, before enqueuing, we need to
+ * make sure that current->state is TASK_INTERRUPTIBLE, so we don't
+ * lose any wake events, which cannot be done before the get_futex_key
+ * of the next key, because it calls get_user_pages, which can sleep.
+ * Thus, we fetch the list of futexes keys in two steps, by first
+ * pinning all the memory keys in the futex key, and only then we read
+ * each key and queue the corresponding futex.
+ *
+ * Private futexes doesn't need to recalculate hash in retry, so skip
+ * get_futex_key() when retrying.
+ */
+retry:
+ for (i = 0; i < count; i++) {
+ if (!(vs[i].w.flags & FLAGS_SHARED) && retry)
+ continue;
+
+ ret = get_futex_key(u64_to_user_ptr(vs[i].w.uaddr),
+ vs[i].w.flags,
+ &vs[i].q.key, FUTEX_READ);
+
+ if (unlikely(ret))
+ return ret;
+ }
+
+ set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
+
+ for (i = 0; i < count; i++) {
+ u32 __user *uaddr = (u32 __user *)(unsigned long)vs[i].w.uaddr;
+ struct futex_q *q = &vs[i].q;
+ u32 val = vs[i].w.val;
+
+ if (1) {
+ CLASS(hb, hb)(&q->key);
+
+ futex_q_lock(q, hb);
+ ret = futex_get_value_locked(&uval, uaddr);
+
+ if (!ret && uval == val) {
+ /*
+ * The bucket lock can't be held while dealing with the
+ * next futex. Queue each futex at this moment so hb can
+ * be unlocked.
+ */
+ futex_queue(q, hb, current);
+ continue;
+ }
+
+ futex_q_unlock(hb);
+ }
+ __set_current_state(TASK_RUNNING);
+
+ /*
+ * Even if something went wrong, if we find out that a futex
+ * was woken, we don't return error and return this index to
+ * userspace
+ */
+ *woken = futex_unqueue_multiple(vs, i);
+ if (*woken >= 0)
+ return 1;
+
+ if (ret) {
+ /*
+ * If we need to handle a page fault, we need to do so
+ * without any lock and any enqueued futex (otherwise
+ * we could lose some wakeup). So we do it here, after
+ * undoing all the work done so far. In success, we
+ * retry all the work.
+ */
+ if (get_user(uval, uaddr))
+ return -EFAULT;
+
+ retry = true;
+ goto retry;
+ }
+
+ if (uval != val)
+ return -EWOULDBLOCK;
+ }
+
+ return 0;
+}
+
+/**
+ * futex_sleep_multiple - Check sleeping conditions and sleep
+ * @vs: List of futexes to wait for
+ * @count: Length of vs
+ * @to: Timeout
+ *
+ * Sleep if and only if the timeout hasn't expired and no futex on the list has
+ * been woken up.
+ */
+static void futex_sleep_multiple(struct futex_vector *vs, unsigned int count,
+ struct hrtimer_sleeper *to)
+{
+ if (to && !to->task)
+ return;
+
+ for (; count; count--, vs++) {
+ if (!READ_ONCE(vs->q.lock_ptr))
+ return;
+ }
+
+ schedule();
+}
+
+/**
+ * futex_wait_multiple - Prepare to wait on and enqueue several futexes
+ * @vs: The list of futexes to wait on
+ * @count: The number of objects
+ * @to: Timeout before giving up and returning to userspace
+ *
+ * Entry point for the FUTEX_WAIT_MULTIPLE futex operation, this function
+ * sleeps on a group of futexes and returns on the first futex that is
+ * wake, or after the timeout has elapsed.
+ *
+ * Return:
+ * - >=0 - Hint to the futex that was awoken
+ * - <0 - On error
+ */
+int futex_wait_multiple(struct futex_vector *vs, unsigned int count,
+ struct hrtimer_sleeper *to)
+{
+ int ret, hint = 0;
+
+ if (to)
+ hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);
+
+ while (1) {
+ ret = futex_wait_multiple_setup(vs, count, &hint);
+ if (ret) {
+ if (ret > 0) {
+ /* A futex was woken during setup */
+ ret = hint;
+ }
+ return ret;
+ }
+
+ futex_sleep_multiple(vs, count, to);
+
+ __set_current_state(TASK_RUNNING);
+
+ ret = futex_unqueue_multiple(vs, count);
+ if (ret >= 0)
+ return ret;
+
+ if (to && !to->task)
+ return -ETIMEDOUT;
+ else if (signal_pending(current))
+ return -ERESTARTSYS;
+ /*
+ * The final case is a spurious wakeup, for
+ * which just retry.
+ */
+ }
+}
+
+/**
+ * futex_wait_setup() - Prepare to wait on a futex
+ * @uaddr: the futex userspace address
+ * @val: the expected value
+ * @flags: futex flags (FLAGS_SHARED, etc.)
+ * @q: the associated futex_q
+ * @key2: the second futex_key if used for requeue PI
+ * @task: Task queueing this futex
+ *
+ * Setup the futex_q and locate the hash_bucket. Get the futex value and
+ * compare it with the expected value. Handle atomic faults internally.
+ * Return with the hb lock held on success, and unlocked on failure.
+ *
+ * Return:
+ * - 0 - uaddr contains val and hb has been locked;
+ * - <0 - On error and the hb is unlocked. A possible reason: the uaddr can not
+ * be read, does not contain the expected value or is not properly aligned.
+ */
+int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
+ struct futex_q *q, union futex_key *key2,
+ struct task_struct *task)
+{
+ u32 uval;
+ int ret;
+
+ /*
+ * Access the page AFTER the hash-bucket is locked.
+ * Order is important:
+ *
+ * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val);
+ * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); }
+ *
+ * The basic logical guarantee of a futex is that it blocks ONLY
+ * if cond(var) is known to be true at the time of blocking, for
+ * any cond. If we locked the hash-bucket after testing *uaddr, that
+ * would open a race condition where we could block indefinitely with
+ * cond(var) false, which would violate the guarantee.
+ *
+ * On the other hand, we insert q and release the hash-bucket only
+ * after testing *uaddr. This guarantees that futex_wait() will NOT
+ * absorb a wakeup if *uaddr does not match the desired values
+ * while the syscall executes.
+ */
+retry:
+ ret = get_futex_key(uaddr, flags, &q->key, FUTEX_READ);
+ if (unlikely(ret != 0))
+ return ret;
+
+retry_private:
+ if (1) {
+ CLASS(hb, hb)(&q->key);
+
+ futex_q_lock(q, hb);
+
+ ret = futex_get_value_locked(&uval, uaddr);
+
+ if (ret) {
+ futex_q_unlock(hb);
+
+ ret = get_user(uval, uaddr);
+ if (ret)
+ return ret;
+
+ if (!(flags & FLAGS_SHARED))
+ goto retry_private;
+
+ goto retry;
+ }
+
+ if (uval != val) {
+ futex_q_unlock(hb);
+ return -EWOULDBLOCK;
+ }
+
+ if (key2 && futex_match(&q->key, key2)) {
+ futex_q_unlock(hb);
+ return -EINVAL;
+ }
+
+ /*
+ * The task state is guaranteed to be set before another task can
+ * wake it. set_current_state() is implemented using smp_store_mb() and
+ * futex_queue() calls spin_unlock() upon completion, both serializing
+ * access to the hash list and forcing another memory barrier.
+ */
+ if (task == current)
+ set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
+ futex_queue(q, hb, task);
+ }
+
+ return ret;
+}
+
+int __futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
+ struct hrtimer_sleeper *to, u32 bitset)
+{
+ struct futex_q q = futex_q_init;
+ int ret;
+
+ if (!bitset)
+ return -EINVAL;
+
+ q.bitset = bitset;
+
+retry:
+ /*
+ * Prepare to wait on uaddr. On success, it holds hb->lock and q
+ * is initialized.
+ */
+ ret = futex_wait_setup(uaddr, val, flags, &q, NULL, current);
+ if (ret)
+ return ret;
+
+ /* futex_queue and wait for wakeup, timeout, or a signal. */
+ futex_do_wait(&q, to);
+
+ /* If we were woken (and unqueued), we succeeded, whatever. */
+ if (!futex_unqueue(&q))
+ return 0;
+
+ if (to && !to->task)
+ return -ETIMEDOUT;
+
+ /*
+ * We expect signal_pending(current), but we might be the
+ * victim of a spurious wakeup as well.
+ */
+ if (!signal_pending(current))
+ goto retry;
+
+ return -ERESTARTSYS;
+}
+
+int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset)
+{
+ struct hrtimer_sleeper timeout, *to;
+ struct restart_block *restart;
+ int ret;
+
+ to = futex_setup_timer(abs_time, &timeout, flags,
+ current->timer_slack_ns);
+
+ ret = __futex_wait(uaddr, flags, val, to, bitset);
+
+ /* No timeout, nothing to clean up. */
+ if (!to)
+ return ret;
+
+ hrtimer_cancel(&to->timer);
+ destroy_hrtimer_on_stack(&to->timer);
+
+ if (ret == -ERESTARTSYS) {
+ restart = &current->restart_block;
+ restart->futex.uaddr = uaddr;
+ restart->futex.val = val;
+ restart->futex.time = *abs_time;
+ restart->futex.bitset = bitset;
+ restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
+
+ return set_restart_fn(restart, futex_wait_restart);
+ }
+
+ return ret;
+}
+
+static long futex_wait_restart(struct restart_block *restart)
+{
+ u32 __user *uaddr = restart->futex.uaddr;
+ ktime_t t, *tp = NULL;
+
+ if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
+ t = restart->futex.time;
+ tp = &t;
+ }
+ restart->fn = do_no_restart_syscall;
+
+ return (long)futex_wait(uaddr, restart->futex.flags,
+ restart->futex.val, tp, restart->futex.bitset);
+}
+
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
deleted file mode 100644
index 55c8c9349cfe..000000000000
--- a/kernel/futex_compat.c
+++ /dev/null
@@ -1,201 +0,0 @@
-/*
- * linux/kernel/futex_compat.c
- *
- * Futex compatibililty routines.
- *
- * Copyright 2006, Red Hat, Inc., Ingo Molnar
- */
-
-#include <linux/linkage.h>
-#include <linux/compat.h>
-#include <linux/nsproxy.h>
-#include <linux/futex.h>
-#include <linux/ptrace.h>
-#include <linux/syscalls.h>
-
-#include <asm/uaccess.h>
-
-
-/*
- * Fetch a robust-list pointer. Bit 0 signals PI futexes:
- */
-static inline int
-fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
- compat_uptr_t __user *head, unsigned int *pi)
-{
- if (get_user(*uentry, head))
- return -EFAULT;
-
- *entry = compat_ptr((*uentry) & ~1);
- *pi = (unsigned int)(*uentry) & 1;
-
- return 0;
-}
-
-static void __user *futex_uaddr(struct robust_list __user *entry,
- compat_long_t futex_offset)
-{
- compat_uptr_t base = ptr_to_compat(entry);
- void __user *uaddr = compat_ptr(base + futex_offset);
-
- return uaddr;
-}
-
-/*
- * Walk curr->robust_list (very carefully, it's a userspace list!)
- * and mark any locks found there dead, and notify any waiters.
- *
- * We silently return on any sign of list-walking problem.
- */
-void compat_exit_robust_list(struct task_struct *curr)
-{
- struct compat_robust_list_head __user *head = curr->compat_robust_list;
- struct robust_list __user *entry, *next_entry, *pending;
- unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
- unsigned int uninitialized_var(next_pi);
- compat_uptr_t uentry, next_uentry, upending;
- compat_long_t futex_offset;
- int rc;
-
- if (!futex_cmpxchg_enabled)
- return;
-
- /*
- * Fetch the list head (which was registered earlier, via
- * sys_set_robust_list()):
- */
- if (fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
- return;
- /*
- * Fetch the relative futex offset:
- */
- if (get_user(futex_offset, &head->futex_offset))
- return;
- /*
- * Fetch any possibly pending lock-add first, and handle it
- * if it exists:
- */
- if (fetch_robust_entry(&upending, &pending,
- &head->list_op_pending, &pip))
- return;
-
- next_entry = NULL; /* avoid warning with gcc */
- while (entry != (struct robust_list __user *) &head->list) {
- /*
- * Fetch the next entry in the list before calling
- * handle_futex_death:
- */
- rc = fetch_robust_entry(&next_uentry, &next_entry,
- (compat_uptr_t __user *)&entry->next, &next_pi);
- /*
- * A pending lock might already be on the list, so
- * dont process it twice:
- */
- if (entry != pending) {
- void __user *uaddr = futex_uaddr(entry, futex_offset);
-
- if (handle_futex_death(uaddr, curr, pi))
- return;
- }
- if (rc)
- return;
- uentry = next_uentry;
- entry = next_entry;
- pi = next_pi;
- /*
- * Avoid excessively long or circular lists:
- */
- if (!--limit)
- break;
-
- cond_resched();
- }
- if (pending) {
- void __user *uaddr = futex_uaddr(pending, futex_offset);
-
- handle_futex_death(uaddr, curr, pip);
- }
-}
-
-COMPAT_SYSCALL_DEFINE2(set_robust_list,
- struct compat_robust_list_head __user *, head,
- compat_size_t, len)
-{
- if (!futex_cmpxchg_enabled)
- return -ENOSYS;
-
- if (unlikely(len != sizeof(*head)))
- return -EINVAL;
-
- current->compat_robust_list = head;
-
- return 0;
-}
-
-COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
- compat_uptr_t __user *, head_ptr,
- compat_size_t __user *, len_ptr)
-{
- struct compat_robust_list_head __user *head;
- unsigned long ret;
- struct task_struct *p;
-
- if (!futex_cmpxchg_enabled)
- return -ENOSYS;
-
- rcu_read_lock();
-
- ret = -ESRCH;
- if (!pid)
- p = current;
- else {
- p = find_task_by_vpid(pid);
- if (!p)
- goto err_unlock;
- }
-
- ret = -EPERM;
- if (!ptrace_may_access(p, PTRACE_MODE_READ))
- goto err_unlock;
-
- head = p->compat_robust_list;
- rcu_read_unlock();
-
- if (put_user(sizeof(*head), len_ptr))
- return -EFAULT;
- return put_user(ptr_to_compat(head), head_ptr);
-
-err_unlock:
- rcu_read_unlock();
-
- return ret;
-}
-
-COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
- struct compat_timespec __user *, utime, u32 __user *, uaddr2,
- u32, val3)
-{
- struct timespec ts;
- ktime_t t, *tp = NULL;
- int val2 = 0;
- int cmd = op & FUTEX_CMD_MASK;
-
- if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
- cmd == FUTEX_WAIT_BITSET ||
- cmd == FUTEX_WAIT_REQUEUE_PI)) {
- if (compat_get_timespec(&ts, utime))
- return -EFAULT;
- if (!timespec_valid(&ts))
- return -EINVAL;
-
- t = timespec_to_ktime(ts);
- if (cmd == FUTEX_WAIT)
- t = ktime_add_safe(ktime_get(), t);
- tp = &t;
- }
- if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
- cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
- val2 = (int) (unsigned long) utime;
-
- return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
-}
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index c92e44855ddd..04f4ebdc3cf5 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -1,11 +1,13 @@
+# SPDX-License-Identifier: GPL-2.0-only
menu "GCOV-based kernel profiling"
config GCOV_KERNEL
bool "Enable gcov-based kernel profiling"
depends on DEBUG_FS
- select CONSTRUCTORS if !UML
+ depends on !ARCH_WANTS_NO_INSTR || CC_HAS_NO_PROFILE_FN_ATTR
+ select CONSTRUCTORS
default n
- ---help---
+ help
This option enables gcov-based code profiling (e.g. for code coverage
measurements).
@@ -37,10 +39,11 @@ config ARCH_HAS_GCOV_PROFILE_ALL
config GCOV_PROFILE_ALL
bool "Profile entire Kernel"
+ depends on !COMPILE_TEST
depends on GCOV_KERNEL
depends on ARCH_HAS_GCOV_PROFILE_ALL
default n
- ---help---
+ help
This options activates profiling for the entire kernel.
If unsure, say N.
@@ -49,34 +52,4 @@ config GCOV_PROFILE_ALL
larger and run slower. Also be sure to exclude files from profiling
which are not linked to the kernel image to prevent linker errors.
-choice
- prompt "Specify GCOV format"
- depends on GCOV_KERNEL
- default GCOV_FORMAT_AUTODETECT
- ---help---
- The gcov format is usually determined by the GCC version, but there are
- exceptions where format changes are integrated in lower-version GCCs.
- In such a case use this option to adjust the format used in the kernel
- accordingly.
-
- If unsure, choose "Autodetect".
-
-config GCOV_FORMAT_AUTODETECT
- bool "Autodetect"
- ---help---
- Select this option to use the format that corresponds to your GCC
- version.
-
-config GCOV_FORMAT_3_4
- bool "GCC 3.4 format"
- ---help---
- Select this option to use the format defined by GCC 3.4.
-
-config GCOV_FORMAT_4_7
- bool "GCC 4.7 format"
- ---help---
- Select this option to use the format defined by GCC 4.7.
-
-endchoice
-
endmenu
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile
index 752d6486b67e..ccd02afaeffb 100644
--- a/kernel/gcov/Makefile
+++ b/kernel/gcov/Makefile
@@ -1,7 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0
ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
obj-y := base.o fs.o
-obj-$(CONFIG_GCOV_FORMAT_3_4) += gcc_3_4.o
-obj-$(CONFIG_GCOV_FORMAT_4_7) += gcc_4_7.o
-obj-$(CONFIG_GCOV_FORMAT_AUTODETECT) += $(call cc-ifversion, -lt, 0407, \
- gcc_3_4.o, gcc_4_7.o)
+obj-$(CONFIG_CC_IS_GCC) += gcc_base.o gcc_4_7.o
+CFLAGS_gcc_base.o += -Wno-missing-prototypes -Wno-missing-declarations
+obj-$(CONFIG_CC_IS_CLANG) += clang.o
+CFLAGS_clang.o += -Wno-missing-prototypes -Wno-missing-declarations
diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c
index a744098e4eb7..073a3738c5e6 100644
--- a/kernel/gcov/base.c
+++ b/kernel/gcov/base.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* This code maintains a list of active profiling data structures.
*
@@ -21,76 +22,8 @@
#include <linux/sched.h>
#include "gcov.h"
-static int gcov_events_enabled;
-static DEFINE_MUTEX(gcov_lock);
-
-/*
- * __gcov_init is called by gcc-generated constructor code for each object
- * file compiled with -fprofile-arcs.
- */
-void __gcov_init(struct gcov_info *info)
-{
- static unsigned int gcov_version;
-
- mutex_lock(&gcov_lock);
- if (gcov_version == 0) {
- gcov_version = gcov_info_version(info);
- /*
- * Printing gcc's version magic may prove useful for debugging
- * incompatibility reports.
- */
- pr_info("version magic: 0x%x\n", gcov_version);
- }
- /*
- * Add new profiling data structure to list and inform event
- * listener.
- */
- gcov_info_link(info);
- if (gcov_events_enabled)
- gcov_event(GCOV_ADD, info);
- mutex_unlock(&gcov_lock);
-}
-EXPORT_SYMBOL(__gcov_init);
-
-/*
- * These functions may be referenced by gcc-generated profiling code but serve
- * no function for kernel profiling.
- */
-void __gcov_flush(void)
-{
- /* Unused. */
-}
-EXPORT_SYMBOL(__gcov_flush);
-
-void __gcov_merge_add(gcov_type *counters, unsigned int n_counters)
-{
- /* Unused. */
-}
-EXPORT_SYMBOL(__gcov_merge_add);
-
-void __gcov_merge_single(gcov_type *counters, unsigned int n_counters)
-{
- /* Unused. */
-}
-EXPORT_SYMBOL(__gcov_merge_single);
-
-void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters)
-{
- /* Unused. */
-}
-EXPORT_SYMBOL(__gcov_merge_delta);
-
-void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters)
-{
- /* Unused. */
-}
-EXPORT_SYMBOL(__gcov_merge_ior);
-
-void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters)
-{
- /* Unused. */
-}
-EXPORT_SYMBOL(__gcov_merge_time_profile);
+int gcov_events_enabled;
+DEFINE_MUTEX(gcov_lock);
/**
* gcov_enable_events - enable event reporting through gcov_event()
@@ -116,12 +49,56 @@ void gcov_enable_events(void)
mutex_unlock(&gcov_lock);
}
-#ifdef CONFIG_MODULES
-static inline int within(void *addr, void *start, unsigned long size)
+/**
+ * store_gcov_u32 - store 32 bit number in gcov format to buffer
+ * @buffer: target buffer or NULL
+ * @off: offset into the buffer
+ * @v: value to be stored
+ *
+ * Number format defined by gcc: numbers are recorded in the 32 bit
+ * unsigned binary form of the endianness of the machine generating the
+ * file. Returns the number of bytes stored. If @buffer is %NULL, doesn't
+ * store anything.
+ */
+size_t store_gcov_u32(void *buffer, size_t off, u32 v)
+{
+ u32 *data;
+
+ if (buffer) {
+ data = buffer + off;
+ *data = v;
+ }
+
+ return sizeof(*data);
+}
+
+/**
+ * store_gcov_u64 - store 64 bit number in gcov format to buffer
+ * @buffer: target buffer or NULL
+ * @off: offset into the buffer
+ * @v: value to be stored
+ *
+ * Number format defined by gcc: numbers are recorded in the 32 bit
+ * unsigned binary form of the endianness of the machine generating the
+ * file. 64 bit numbers are stored as two 32 bit numbers, the low part
+ * first. Returns the number of bytes stored. If @buffer is %NULL, doesn't store
+ * anything.
+ */
+size_t store_gcov_u64(void *buffer, size_t off, u64 v)
{
- return ((addr >= start) && (addr < start + size));
+ u32 *data;
+
+ if (buffer) {
+ data = buffer + off;
+
+ data[0] = (v & 0xffffffffUL);
+ data[1] = (v >> 32);
+ }
+
+ return sizeof(*data) * 2;
}
+#ifdef CONFIG_MODULES
/* Update list and generate events when modules are unloaded. */
static int gcov_module_notifier(struct notifier_block *nb, unsigned long event,
void *data)
@@ -136,7 +113,7 @@ static int gcov_module_notifier(struct notifier_block *nb, unsigned long event,
/* Remove entries located in module from linked list. */
while ((info = gcov_info_next(info))) {
- if (within(info, mod->module_core, mod->core_size)) {
+ if (gcov_info_within_module(info, mod)) {
gcov_info_unlink(prev, info);
if (gcov_events_enabled)
gcov_event(GCOV_REMOVE, info);
diff --git a/kernel/gcov/clang.c b/kernel/gcov/clang.c
new file mode 100644
index 000000000000..8b888a6193cc
--- /dev/null
+++ b/kernel/gcov/clang.c
@@ -0,0 +1,393 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 Google, Inc.
+ * modified from kernel/gcov/gcc_4_7.c
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ *
+ * LLVM uses profiling data that's deliberately similar to GCC, but has a
+ * very different way of exporting that data. LLVM calls llvm_gcov_init() once
+ * per module, and provides a couple of callbacks that we can use to ask for
+ * more data.
+ *
+ * We care about the "writeout" callback, which in turn calls back into
+ * compiler-rt/this module to dump all the gathered coverage data to disk:
+ *
+ * llvm_gcda_start_file()
+ * llvm_gcda_emit_function()
+ * llvm_gcda_emit_arcs()
+ * llvm_gcda_emit_function()
+ * llvm_gcda_emit_arcs()
+ * [... repeats for each function ...]
+ * llvm_gcda_summary_info()
+ * llvm_gcda_end_file()
+ *
+ * This design is much more stateless and unstructured than gcc's, and is
+ * intended to run at process exit. This forces us to keep some local state
+ * about which module we're dealing with at the moment. On the other hand, it
+ * also means we don't depend as much on how LLVM represents profiling data
+ * internally.
+ *
+ * See LLVM's lib/Transforms/Instrumentation/GCOVProfiling.cpp for more
+ * details on how this works, particularly GCOVProfiler::emitProfileArcs(),
+ * GCOVProfiler::insertCounterWriteout(), and
+ * GCOVProfiler::insertFlush().
+ */
+
+#define pr_fmt(fmt) "gcov: " fmt
+
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/printk.h>
+#include <linux/ratelimit.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include "gcov.h"
+
+typedef void (*llvm_gcov_callback)(void);
+
+struct gcov_info {
+ struct list_head head;
+
+ const char *filename;
+ unsigned int version;
+ u32 checksum;
+
+ struct list_head functions;
+};
+
+struct gcov_fn_info {
+ struct list_head head;
+
+ u32 ident;
+ u32 checksum;
+ u32 cfg_checksum;
+
+ u32 num_counters;
+ u64 *counters;
+};
+
+static struct gcov_info *current_info;
+
+static LIST_HEAD(clang_gcov_list);
+
+void llvm_gcov_init(llvm_gcov_callback writeout, llvm_gcov_callback flush)
+{
+ struct gcov_info *info = kzalloc(sizeof(*info), GFP_KERNEL);
+
+ if (!info)
+ return;
+
+ INIT_LIST_HEAD(&info->head);
+ INIT_LIST_HEAD(&info->functions);
+
+ mutex_lock(&gcov_lock);
+
+ list_add_tail(&info->head, &clang_gcov_list);
+ current_info = info;
+ writeout();
+ current_info = NULL;
+ if (gcov_events_enabled)
+ gcov_event(GCOV_ADD, info);
+
+ mutex_unlock(&gcov_lock);
+}
+EXPORT_SYMBOL(llvm_gcov_init);
+
+void llvm_gcda_start_file(const char *orig_filename, u32 version, u32 checksum)
+{
+ current_info->filename = orig_filename;
+ current_info->version = version;
+ current_info->checksum = checksum;
+}
+EXPORT_SYMBOL(llvm_gcda_start_file);
+
+void llvm_gcda_emit_function(u32 ident, u32 func_checksum, u32 cfg_checksum)
+{
+ struct gcov_fn_info *info = kzalloc(sizeof(*info), GFP_KERNEL);
+
+ if (!info)
+ return;
+
+ INIT_LIST_HEAD(&info->head);
+ info->ident = ident;
+ info->checksum = func_checksum;
+ info->cfg_checksum = cfg_checksum;
+ list_add_tail(&info->head, &current_info->functions);
+}
+EXPORT_SYMBOL(llvm_gcda_emit_function);
+
+void llvm_gcda_emit_arcs(u32 num_counters, u64 *counters)
+{
+ struct gcov_fn_info *info = list_last_entry(&current_info->functions,
+ struct gcov_fn_info, head);
+
+ info->num_counters = num_counters;
+ info->counters = counters;
+}
+EXPORT_SYMBOL(llvm_gcda_emit_arcs);
+
+void llvm_gcda_summary_info(void)
+{
+}
+EXPORT_SYMBOL(llvm_gcda_summary_info);
+
+void llvm_gcda_end_file(void)
+{
+}
+EXPORT_SYMBOL(llvm_gcda_end_file);
+
+/**
+ * gcov_info_filename - return info filename
+ * @info: profiling data set
+ */
+const char *gcov_info_filename(struct gcov_info *info)
+{
+ return info->filename;
+}
+
+/**
+ * gcov_info_version - return info version
+ * @info: profiling data set
+ */
+unsigned int gcov_info_version(struct gcov_info *info)
+{
+ return info->version;
+}
+
+/**
+ * gcov_info_next - return next profiling data set
+ * @info: profiling data set
+ *
+ * Returns next gcov_info following @info or first gcov_info in the chain if
+ * @info is %NULL.
+ */
+struct gcov_info *gcov_info_next(struct gcov_info *info)
+{
+ if (!info)
+ return list_first_entry_or_null(&clang_gcov_list,
+ struct gcov_info, head);
+ if (list_is_last(&info->head, &clang_gcov_list))
+ return NULL;
+ return list_next_entry(info, head);
+}
+
+/**
+ * gcov_info_link - link/add profiling data set to the list
+ * @info: profiling data set
+ */
+void gcov_info_link(struct gcov_info *info)
+{
+ list_add_tail(&info->head, &clang_gcov_list);
+}
+
+/**
+ * gcov_info_unlink - unlink/remove profiling data set from the list
+ * @prev: previous profiling data set
+ * @info: profiling data set
+ */
+void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info)
+{
+ /* Generic code unlinks while iterating. */
+ __list_del_entry(&info->head);
+}
+
+/**
+ * gcov_info_within_module - check if a profiling data set belongs to a module
+ * @info: profiling data set
+ * @mod: module
+ *
+ * Returns true if profiling data belongs module, false otherwise.
+ */
+bool gcov_info_within_module(struct gcov_info *info, struct module *mod)
+{
+ return within_module((unsigned long)info->filename, mod);
+}
+
+/* Symbolic links to be created for each profiling data file. */
+const struct gcov_link gcov_link[] = {
+ { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */
+ { 0, NULL},
+};
+
+/**
+ * gcov_info_reset - reset profiling data to zero
+ * @info: profiling data set
+ */
+void gcov_info_reset(struct gcov_info *info)
+{
+ struct gcov_fn_info *fn;
+
+ list_for_each_entry(fn, &info->functions, head)
+ memset(fn->counters, 0,
+ sizeof(fn->counters[0]) * fn->num_counters);
+}
+
+/**
+ * gcov_info_is_compatible - check if profiling data can be added
+ * @info1: first profiling data set
+ * @info2: second profiling data set
+ *
+ * Returns non-zero if profiling data can be added, zero otherwise.
+ */
+int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2)
+{
+ struct gcov_fn_info *fn_ptr1 = list_first_entry_or_null(
+ &info1->functions, struct gcov_fn_info, head);
+ struct gcov_fn_info *fn_ptr2 = list_first_entry_or_null(
+ &info2->functions, struct gcov_fn_info, head);
+
+ if (info1->checksum != info2->checksum)
+ return false;
+ if (!fn_ptr1)
+ return fn_ptr1 == fn_ptr2;
+ while (!list_is_last(&fn_ptr1->head, &info1->functions) &&
+ !list_is_last(&fn_ptr2->head, &info2->functions)) {
+ if (fn_ptr1->checksum != fn_ptr2->checksum)
+ return false;
+ if (fn_ptr1->cfg_checksum != fn_ptr2->cfg_checksum)
+ return false;
+ fn_ptr1 = list_next_entry(fn_ptr1, head);
+ fn_ptr2 = list_next_entry(fn_ptr2, head);
+ }
+ return list_is_last(&fn_ptr1->head, &info1->functions) &&
+ list_is_last(&fn_ptr2->head, &info2->functions);
+}
+
+/**
+ * gcov_info_add - add up profiling data
+ * @dst: profiling data set to which data is added
+ * @src: profiling data set which is added
+ *
+ * Adds profiling counts of @src to @dst.
+ */
+void gcov_info_add(struct gcov_info *dst, struct gcov_info *src)
+{
+ struct gcov_fn_info *dfn_ptr;
+ struct gcov_fn_info *sfn_ptr = list_first_entry_or_null(&src->functions,
+ struct gcov_fn_info, head);
+
+ list_for_each_entry(dfn_ptr, &dst->functions, head) {
+ u32 i;
+
+ for (i = 0; i < sfn_ptr->num_counters; i++)
+ dfn_ptr->counters[i] += sfn_ptr->counters[i];
+
+ sfn_ptr = list_next_entry(sfn_ptr, head);
+ }
+}
+
+static struct gcov_fn_info *gcov_fn_info_dup(struct gcov_fn_info *fn)
+{
+ size_t cv_size; /* counter values size */
+ struct gcov_fn_info *fn_dup = kmemdup(fn, sizeof(*fn),
+ GFP_KERNEL);
+ if (!fn_dup)
+ return NULL;
+ INIT_LIST_HEAD(&fn_dup->head);
+
+ cv_size = fn->num_counters * sizeof(fn->counters[0]);
+ fn_dup->counters = kvmalloc(cv_size, GFP_KERNEL);
+ if (!fn_dup->counters) {
+ kfree(fn_dup);
+ return NULL;
+ }
+
+ memcpy(fn_dup->counters, fn->counters, cv_size);
+
+ return fn_dup;
+}
+
+/**
+ * gcov_info_dup - duplicate profiling data set
+ * @info: profiling data set to duplicate
+ *
+ * Return newly allocated duplicate on success, %NULL on error.
+ */
+struct gcov_info *gcov_info_dup(struct gcov_info *info)
+{
+ struct gcov_info *dup;
+ struct gcov_fn_info *fn;
+
+ dup = kmemdup(info, sizeof(*dup), GFP_KERNEL);
+ if (!dup)
+ return NULL;
+ INIT_LIST_HEAD(&dup->head);
+ INIT_LIST_HEAD(&dup->functions);
+ dup->filename = kstrdup(info->filename, GFP_KERNEL);
+ if (!dup->filename)
+ goto err;
+
+ list_for_each_entry(fn, &info->functions, head) {
+ struct gcov_fn_info *fn_dup = gcov_fn_info_dup(fn);
+
+ if (!fn_dup)
+ goto err;
+ list_add_tail(&fn_dup->head, &dup->functions);
+ }
+
+ return dup;
+
+err:
+ gcov_info_free(dup);
+ return NULL;
+}
+
+/**
+ * gcov_info_free - release memory for profiling data set duplicate
+ * @info: profiling data set duplicate to free
+ */
+void gcov_info_free(struct gcov_info *info)
+{
+ struct gcov_fn_info *fn, *tmp;
+
+ list_for_each_entry_safe(fn, tmp, &info->functions, head) {
+ kvfree(fn->counters);
+ list_del(&fn->head);
+ kfree(fn);
+ }
+ kfree(info->filename);
+ kfree(info);
+}
+
+/**
+ * convert_to_gcda - convert profiling data set to gcda file format
+ * @buffer: the buffer to store file data or %NULL if no data should be stored
+ * @info: profiling data set to be converted
+ *
+ * Returns the number of bytes that were/would have been stored into the buffer.
+ */
+size_t convert_to_gcda(char *buffer, struct gcov_info *info)
+{
+ struct gcov_fn_info *fi_ptr;
+ size_t pos = 0;
+
+ /* File header. */
+ pos += store_gcov_u32(buffer, pos, GCOV_DATA_MAGIC);
+ pos += store_gcov_u32(buffer, pos, info->version);
+ pos += store_gcov_u32(buffer, pos, info->checksum);
+
+ list_for_each_entry(fi_ptr, &info->functions, head) {
+ u32 i;
+
+ pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION);
+ pos += store_gcov_u32(buffer, pos, 3);
+ pos += store_gcov_u32(buffer, pos, fi_ptr->ident);
+ pos += store_gcov_u32(buffer, pos, fi_ptr->checksum);
+ pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum);
+ pos += store_gcov_u32(buffer, pos, GCOV_TAG_COUNTER_BASE);
+ pos += store_gcov_u32(buffer, pos, fi_ptr->num_counters * 2);
+ for (i = 0; i < fi_ptr->num_counters; i++)
+ pos += store_gcov_u64(buffer, pos, fi_ptr->counters[i]);
+ }
+
+ return pos;
+}
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
index edf67c493a8e..01520689b57c 100644
--- a/kernel/gcov/fs.c
+++ b/kernel/gcov/fs.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* This code exports profiling data as debugfs files to userspace.
*
@@ -25,6 +26,7 @@
#include <linux/slab.h>
#include <linux/mutex.h>
#include <linux/seq_file.h>
+#include <linux/mm.h>
#include "gcov.h"
/**
@@ -57,13 +59,12 @@ struct gcov_node {
struct dentry *dentry;
struct dentry **links;
int num_loaded;
- char name[0];
+ char name[];
};
static const char objtree[] = OBJTREE;
static const char srctree[] = SRCTREE;
static struct gcov_node root_node;
-static struct dentry *reset_dentry;
static LIST_HEAD(all_head);
static DEFINE_MUTEX(node_lock);
@@ -85,6 +86,115 @@ static int __init gcov_persist_setup(char *str)
}
__setup("gcov_persist=", gcov_persist_setup);
+#define ITER_STRIDE PAGE_SIZE
+
+/**
+ * struct gcov_iterator - specifies current file position in logical records
+ * @info: associated profiling data
+ * @buffer: buffer containing file data
+ * @size: size of buffer
+ * @pos: current position in file
+ */
+struct gcov_iterator {
+ struct gcov_info *info;
+ size_t size;
+ loff_t pos;
+ char buffer[] __counted_by(size);
+};
+
+/**
+ * gcov_iter_new - allocate and initialize profiling data iterator
+ * @info: profiling data set to be iterated
+ *
+ * Return file iterator on success, %NULL otherwise.
+ */
+static struct gcov_iterator *gcov_iter_new(struct gcov_info *info)
+{
+ struct gcov_iterator *iter;
+ size_t size;
+
+ /* Dry-run to get the actual buffer size. */
+ size = convert_to_gcda(NULL, info);
+
+ iter = kvmalloc(struct_size(iter, buffer, size), GFP_KERNEL);
+ if (!iter)
+ return NULL;
+
+ iter->info = info;
+ iter->size = size;
+ convert_to_gcda(iter->buffer, info);
+
+ return iter;
+}
+
+
+/**
+ * gcov_iter_free - free iterator data
+ * @iter: file iterator
+ */
+static void gcov_iter_free(struct gcov_iterator *iter)
+{
+ kvfree(iter);
+}
+
+/**
+ * gcov_iter_get_info - return profiling data set for given file iterator
+ * @iter: file iterator
+ */
+static struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter)
+{
+ return iter->info;
+}
+
+/**
+ * gcov_iter_start - reset file iterator to starting position
+ * @iter: file iterator
+ */
+static void gcov_iter_start(struct gcov_iterator *iter)
+{
+ iter->pos = 0;
+}
+
+/**
+ * gcov_iter_next - advance file iterator to next logical record
+ * @iter: file iterator
+ *
+ * Return zero if new position is valid, non-zero if iterator has reached end.
+ */
+static int gcov_iter_next(struct gcov_iterator *iter)
+{
+ if (iter->pos < iter->size)
+ iter->pos += ITER_STRIDE;
+
+ if (iter->pos >= iter->size)
+ return -EINVAL;
+
+ return 0;
+}
+
+/**
+ * gcov_iter_write - write data for current pos to seq_file
+ * @iter: file iterator
+ * @seq: seq_file handle
+ *
+ * Return zero on success, non-zero otherwise.
+ */
+static int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq)
+{
+ size_t len;
+
+ if (iter->pos >= iter->size)
+ return -EINVAL;
+
+ len = ITER_STRIDE;
+ if (iter->pos + len > iter->size)
+ len = iter->size - iter->pos;
+
+ seq_write(seq, iter->buffer + iter->pos, len);
+
+ return 0;
+}
+
/*
* seq_file.start() implementation for gcov data files. Note that the
* gcov_iterator interface is designed to be more restrictive than seq_file
@@ -108,9 +218,9 @@ static void *gcov_seq_next(struct seq_file *seq, void *data, loff_t *pos)
{
struct gcov_iterator *iter = data;
+ (*pos)++;
if (gcov_iter_next(iter))
return NULL;
- (*pos)++;
return iter;
}
@@ -386,8 +496,6 @@ static void add_links(struct gcov_node *node, struct dentry *parent)
goto out_err;
node->links[i] = debugfs_create_symlink(deskew(basename),
parent, target);
- if (!node->links[i])
- goto out_err;
kfree(target);
}
@@ -449,11 +557,6 @@ static struct gcov_node *new_node(struct gcov_node *parent,
parent->dentry, node, &gcov_data_fops);
} else
node->dentry = debugfs_create_dir(node->name, parent->dentry);
- if (!node->dentry) {
- pr_warn("could not create file\n");
- kfree(node);
- return NULL;
- }
if (info)
add_links(node, parent->dentry);
list_add(&node->list, &parent->children);
@@ -760,32 +863,20 @@ void gcov_event(enum gcov_action action, struct gcov_info *info)
/* Create debugfs entries. */
static __init int gcov_fs_init(void)
{
- int rc = -EIO;
-
init_node(&root_node, NULL, NULL, NULL);
/*
* /sys/kernel/debug/gcov will be parent for the reset control file
* and all profiling files.
*/
root_node.dentry = debugfs_create_dir("gcov", NULL);
- if (!root_node.dentry)
- goto err_remove;
/*
* Create reset file which resets all profiling counts when written
* to.
*/
- reset_dentry = debugfs_create_file("reset", 0600, root_node.dentry,
- NULL, &gcov_reset_fops);
- if (!reset_dentry)
- goto err_remove;
+ debugfs_create_file("reset", 0600, root_node.dentry, NULL,
+ &gcov_reset_fops);
/* Replay previous events to get our fs hierarchy up-to-date. */
gcov_enable_events();
return 0;
-
-err_remove:
- pr_err("init failed\n");
- debugfs_remove(root_node.dentry);
-
- return rc;
}
device_initcall(gcov_fs_init);
diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c
deleted file mode 100644
index 27bc88a35013..000000000000
--- a/kernel/gcov/gcc_3_4.c
+++ /dev/null
@@ -1,562 +0,0 @@
-/*
- * This code provides functions to handle gcc's profiling data format
- * introduced with gcc 3.4. Future versions of gcc may change the gcov
- * format (as happened before), so all format-specific information needs
- * to be kept modular and easily exchangeable.
- *
- * This file is based on gcc-internal definitions. Functions and data
- * structures are defined to be compatible with gcc counterparts.
- * For a better understanding, refer to gcc source: gcc/gcov-io.h.
- *
- * Copyright IBM Corp. 2009
- * Author(s): Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
- *
- * Uses gcc-internal data definitions.
- */
-
-#include <linux/errno.h>
-#include <linux/slab.h>
-#include <linux/string.h>
-#include <linux/seq_file.h>
-#include <linux/vmalloc.h>
-#include "gcov.h"
-
-#define GCOV_COUNTERS 5
-
-static struct gcov_info *gcov_info_head;
-
-/**
- * struct gcov_fn_info - profiling meta data per function
- * @ident: object file-unique function identifier
- * @checksum: function checksum
- * @n_ctrs: number of values per counter type belonging to this function
- *
- * This data is generated by gcc during compilation and doesn't change
- * at run-time.
- */
-struct gcov_fn_info {
- unsigned int ident;
- unsigned int checksum;
- unsigned int n_ctrs[0];
-};
-
-/**
- * struct gcov_ctr_info - profiling data per counter type
- * @num: number of counter values for this type
- * @values: array of counter values for this type
- * @merge: merge function for counter values of this type (unused)
- *
- * This data is generated by gcc during compilation and doesn't change
- * at run-time with the exception of the values array.
- */
-struct gcov_ctr_info {
- unsigned int num;
- gcov_type *values;
- void (*merge)(gcov_type *, unsigned int);
-};
-
-/**
- * struct gcov_info - profiling data per object file
- * @version: gcov version magic indicating the gcc version used for compilation
- * @next: list head for a singly-linked list
- * @stamp: time stamp
- * @filename: name of the associated gcov data file
- * @n_functions: number of instrumented functions
- * @functions: function data
- * @ctr_mask: mask specifying which counter types are active
- * @counts: counter data per counter type
- *
- * This data is generated by gcc during compilation and doesn't change
- * at run-time with the exception of the next pointer.
- */
-struct gcov_info {
- unsigned int version;
- struct gcov_info *next;
- unsigned int stamp;
- const char *filename;
- unsigned int n_functions;
- const struct gcov_fn_info *functions;
- unsigned int ctr_mask;
- struct gcov_ctr_info counts[0];
-};
-
-/**
- * gcov_info_filename - return info filename
- * @info: profiling data set
- */
-const char *gcov_info_filename(struct gcov_info *info)
-{
- return info->filename;
-}
-
-/**
- * gcov_info_version - return info version
- * @info: profiling data set
- */
-unsigned int gcov_info_version(struct gcov_info *info)
-{
- return info->version;
-}
-
-/**
- * gcov_info_next - return next profiling data set
- * @info: profiling data set
- *
- * Returns next gcov_info following @info or first gcov_info in the chain if
- * @info is %NULL.
- */
-struct gcov_info *gcov_info_next(struct gcov_info *info)
-{
- if (!info)
- return gcov_info_head;
-
- return info->next;
-}
-
-/**
- * gcov_info_link - link/add profiling data set to the list
- * @info: profiling data set
- */
-void gcov_info_link(struct gcov_info *info)
-{
- info->next = gcov_info_head;
- gcov_info_head = info;
-}
-
-/**
- * gcov_info_unlink - unlink/remove profiling data set from the list
- * @prev: previous profiling data set
- * @info: profiling data set
- */
-void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info)
-{
- if (prev)
- prev->next = info->next;
- else
- gcov_info_head = info->next;
-}
-
-/* Symbolic links to be created for each profiling data file. */
-const struct gcov_link gcov_link[] = {
- { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */
- { 0, NULL},
-};
-
-/*
- * Determine whether a counter is active. Based on gcc magic. Doesn't change
- * at run-time.
- */
-static int counter_active(struct gcov_info *info, unsigned int type)
-{
- return (1 << type) & info->ctr_mask;
-}
-
-/* Determine number of active counters. Based on gcc magic. */
-static unsigned int num_counter_active(struct gcov_info *info)
-{
- unsigned int i;
- unsigned int result = 0;
-
- for (i = 0; i < GCOV_COUNTERS; i++) {
- if (counter_active(info, i))
- result++;
- }
- return result;
-}
-
-/**
- * gcov_info_reset - reset profiling data to zero
- * @info: profiling data set
- */
-void gcov_info_reset(struct gcov_info *info)
-{
- unsigned int active = num_counter_active(info);
- unsigned int i;
-
- for (i = 0; i < active; i++) {
- memset(info->counts[i].values, 0,
- info->counts[i].num * sizeof(gcov_type));
- }
-}
-
-/**
- * gcov_info_is_compatible - check if profiling data can be added
- * @info1: first profiling data set
- * @info2: second profiling data set
- *
- * Returns non-zero if profiling data can be added, zero otherwise.
- */
-int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2)
-{
- return (info1->stamp == info2->stamp);
-}
-
-/**
- * gcov_info_add - add up profiling data
- * @dest: profiling data set to which data is added
- * @source: profiling data set which is added
- *
- * Adds profiling counts of @source to @dest.
- */
-void gcov_info_add(struct gcov_info *dest, struct gcov_info *source)
-{
- unsigned int i;
- unsigned int j;
-
- for (i = 0; i < num_counter_active(dest); i++) {
- for (j = 0; j < dest->counts[i].num; j++) {
- dest->counts[i].values[j] +=
- source->counts[i].values[j];
- }
- }
-}
-
-/* Get size of function info entry. Based on gcc magic. */
-static size_t get_fn_size(struct gcov_info *info)
-{
- size_t size;
-
- size = sizeof(struct gcov_fn_info) + num_counter_active(info) *
- sizeof(unsigned int);
- if (__alignof__(struct gcov_fn_info) > sizeof(unsigned int))
- size = ALIGN(size, __alignof__(struct gcov_fn_info));
- return size;
-}
-
-/* Get address of function info entry. Based on gcc magic. */
-static struct gcov_fn_info *get_fn_info(struct gcov_info *info, unsigned int fn)
-{
- return (struct gcov_fn_info *)
- ((char *) info->functions + fn * get_fn_size(info));
-}
-
-/**
- * gcov_info_dup - duplicate profiling data set
- * @info: profiling data set to duplicate
- *
- * Return newly allocated duplicate on success, %NULL on error.
- */
-struct gcov_info *gcov_info_dup(struct gcov_info *info)
-{
- struct gcov_info *dup;
- unsigned int i;
- unsigned int active;
-
- /* Duplicate gcov_info. */
- active = num_counter_active(info);
- dup = kzalloc(sizeof(struct gcov_info) +
- sizeof(struct gcov_ctr_info) * active, GFP_KERNEL);
- if (!dup)
- return NULL;
- dup->version = info->version;
- dup->stamp = info->stamp;
- dup->n_functions = info->n_functions;
- dup->ctr_mask = info->ctr_mask;
- /* Duplicate filename. */
- dup->filename = kstrdup(info->filename, GFP_KERNEL);
- if (!dup->filename)
- goto err_free;
- /* Duplicate table of functions. */
- dup->functions = kmemdup(info->functions, info->n_functions *
- get_fn_size(info), GFP_KERNEL);
- if (!dup->functions)
- goto err_free;
- /* Duplicate counter arrays. */
- for (i = 0; i < active ; i++) {
- struct gcov_ctr_info *ctr = &info->counts[i];
- size_t size = ctr->num * sizeof(gcov_type);
-
- dup->counts[i].num = ctr->num;
- dup->counts[i].merge = ctr->merge;
- dup->counts[i].values = vmalloc(size);
- if (!dup->counts[i].values)
- goto err_free;
- memcpy(dup->counts[i].values, ctr->values, size);
- }
- return dup;
-
-err_free:
- gcov_info_free(dup);
- return NULL;
-}
-
-/**
- * gcov_info_free - release memory for profiling data set duplicate
- * @info: profiling data set duplicate to free
- */
-void gcov_info_free(struct gcov_info *info)
-{
- unsigned int active = num_counter_active(info);
- unsigned int i;
-
- for (i = 0; i < active ; i++)
- vfree(info->counts[i].values);
- kfree(info->functions);
- kfree(info->filename);
- kfree(info);
-}
-
-/**
- * struct type_info - iterator helper array
- * @ctr_type: counter type
- * @offset: index of the first value of the current function for this type
- *
- * This array is needed to convert the in-memory data format into the in-file
- * data format:
- *
- * In-memory:
- * for each counter type
- * for each function
- * values
- *
- * In-file:
- * for each function
- * for each counter type
- * values
- *
- * See gcc source gcc/gcov-io.h for more information on data organization.
- */
-struct type_info {
- int ctr_type;
- unsigned int offset;
-};
-
-/**
- * struct gcov_iterator - specifies current file position in logical records
- * @info: associated profiling data
- * @record: record type
- * @function: function number
- * @type: counter type
- * @count: index into values array
- * @num_types: number of counter types
- * @type_info: helper array to get values-array offset for current function
- */
-struct gcov_iterator {
- struct gcov_info *info;
-
- int record;
- unsigned int function;
- unsigned int type;
- unsigned int count;
-
- int num_types;
- struct type_info type_info[0];
-};
-
-static struct gcov_fn_info *get_func(struct gcov_iterator *iter)
-{
- return get_fn_info(iter->info, iter->function);
-}
-
-static struct type_info *get_type(struct gcov_iterator *iter)
-{
- return &iter->type_info[iter->type];
-}
-
-/**
- * gcov_iter_new - allocate and initialize profiling data iterator
- * @info: profiling data set to be iterated
- *
- * Return file iterator on success, %NULL otherwise.
- */
-struct gcov_iterator *gcov_iter_new(struct gcov_info *info)
-{
- struct gcov_iterator *iter;
-
- iter = kzalloc(sizeof(struct gcov_iterator) +
- num_counter_active(info) * sizeof(struct type_info),
- GFP_KERNEL);
- if (iter)
- iter->info = info;
-
- return iter;
-}
-
-/**
- * gcov_iter_free - release memory for iterator
- * @iter: file iterator to free
- */
-void gcov_iter_free(struct gcov_iterator *iter)
-{
- kfree(iter);
-}
-
-/**
- * gcov_iter_get_info - return profiling data set for given file iterator
- * @iter: file iterator
- */
-struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter)
-{
- return iter->info;
-}
-
-/**
- * gcov_iter_start - reset file iterator to starting position
- * @iter: file iterator
- */
-void gcov_iter_start(struct gcov_iterator *iter)
-{
- int i;
-
- iter->record = 0;
- iter->function = 0;
- iter->type = 0;
- iter->count = 0;
- iter->num_types = 0;
- for (i = 0; i < GCOV_COUNTERS; i++) {
- if (counter_active(iter->info, i)) {
- iter->type_info[iter->num_types].ctr_type = i;
- iter->type_info[iter->num_types++].offset = 0;
- }
- }
-}
-
-/* Mapping of logical record number to actual file content. */
-#define RECORD_FILE_MAGIC 0
-#define RECORD_GCOV_VERSION 1
-#define RECORD_TIME_STAMP 2
-#define RECORD_FUNCTION_TAG 3
-#define RECORD_FUNCTON_TAG_LEN 4
-#define RECORD_FUNCTION_IDENT 5
-#define RECORD_FUNCTION_CHECK 6
-#define RECORD_COUNT_TAG 7
-#define RECORD_COUNT_LEN 8
-#define RECORD_COUNT 9
-
-/**
- * gcov_iter_next - advance file iterator to next logical record
- * @iter: file iterator
- *
- * Return zero if new position is valid, non-zero if iterator has reached end.
- */
-int gcov_iter_next(struct gcov_iterator *iter)
-{
- switch (iter->record) {
- case RECORD_FILE_MAGIC:
- case RECORD_GCOV_VERSION:
- case RECORD_FUNCTION_TAG:
- case RECORD_FUNCTON_TAG_LEN:
- case RECORD_FUNCTION_IDENT:
- case RECORD_COUNT_TAG:
- /* Advance to next record */
- iter->record++;
- break;
- case RECORD_COUNT:
- /* Advance to next count */
- iter->count++;
- /* fall through */
- case RECORD_COUNT_LEN:
- if (iter->count < get_func(iter)->n_ctrs[iter->type]) {
- iter->record = 9;
- break;
- }
- /* Advance to next counter type */
- get_type(iter)->offset += iter->count;
- iter->count = 0;
- iter->type++;
- /* fall through */
- case RECORD_FUNCTION_CHECK:
- if (iter->type < iter->num_types) {
- iter->record = 7;
- break;
- }
- /* Advance to next function */
- iter->type = 0;
- iter->function++;
- /* fall through */
- case RECORD_TIME_STAMP:
- if (iter->function < iter->info->n_functions)
- iter->record = 3;
- else
- iter->record = -1;
- break;
- }
- /* Check for EOF. */
- if (iter->record == -1)
- return -EINVAL;
- else
- return 0;
-}
-
-/**
- * seq_write_gcov_u32 - write 32 bit number in gcov format to seq_file
- * @seq: seq_file handle
- * @v: value to be stored
- *
- * Number format defined by gcc: numbers are recorded in the 32 bit
- * unsigned binary form of the endianness of the machine generating the
- * file.
- */
-static int seq_write_gcov_u32(struct seq_file *seq, u32 v)
-{
- return seq_write(seq, &v, sizeof(v));
-}
-
-/**
- * seq_write_gcov_u64 - write 64 bit number in gcov format to seq_file
- * @seq: seq_file handle
- * @v: value to be stored
- *
- * Number format defined by gcc: numbers are recorded in the 32 bit
- * unsigned binary form of the endianness of the machine generating the
- * file. 64 bit numbers are stored as two 32 bit numbers, the low part
- * first.
- */
-static int seq_write_gcov_u64(struct seq_file *seq, u64 v)
-{
- u32 data[2];
-
- data[0] = (v & 0xffffffffUL);
- data[1] = (v >> 32);
- return seq_write(seq, data, sizeof(data));
-}
-
-/**
- * gcov_iter_write - write data for current pos to seq_file
- * @iter: file iterator
- * @seq: seq_file handle
- *
- * Return zero on success, non-zero otherwise.
- */
-int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq)
-{
- int rc = -EINVAL;
-
- switch (iter->record) {
- case RECORD_FILE_MAGIC:
- rc = seq_write_gcov_u32(seq, GCOV_DATA_MAGIC);
- break;
- case RECORD_GCOV_VERSION:
- rc = seq_write_gcov_u32(seq, iter->info->version);
- break;
- case RECORD_TIME_STAMP:
- rc = seq_write_gcov_u32(seq, iter->info->stamp);
- break;
- case RECORD_FUNCTION_TAG:
- rc = seq_write_gcov_u32(seq, GCOV_TAG_FUNCTION);
- break;
- case RECORD_FUNCTON_TAG_LEN:
- rc = seq_write_gcov_u32(seq, 2);
- break;
- case RECORD_FUNCTION_IDENT:
- rc = seq_write_gcov_u32(seq, get_func(iter)->ident);
- break;
- case RECORD_FUNCTION_CHECK:
- rc = seq_write_gcov_u32(seq, get_func(iter)->checksum);
- break;
- case RECORD_COUNT_TAG:
- rc = seq_write_gcov_u32(seq,
- GCOV_TAG_FOR_COUNTER(get_type(iter)->ctr_type));
- break;
- case RECORD_COUNT_LEN:
- rc = seq_write_gcov_u32(seq,
- get_func(iter)->n_ctrs[iter->type] * 2);
- break;
- case RECORD_COUNT:
- rc = seq_write_gcov_u64(seq,
- iter->info->counts[iter->type].
- values[iter->count + get_type(iter)->offset]);
- break;
- }
- return rc;
-}
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c
index 826ba9fb5e32..ffde93d051a4 100644
--- a/kernel/gcov/gcc_4_7.c
+++ b/kernel/gcov/gcc_4_7.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* This code provides functions to handle gcc's profiling data format
* introduced with gcc 4.7.
@@ -14,18 +15,28 @@
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/string.h>
-#include <linux/seq_file.h>
-#include <linux/vmalloc.h>
+#include <linux/mm.h>
#include "gcov.h"
-#if __GNUC__ == 4 && __GNUC_MINOR__ >= 9
+#if (__GNUC__ >= 15)
+#define GCOV_COUNTERS 10
+#elif (__GNUC__ >= 14)
#define GCOV_COUNTERS 9
-#else
+#elif (__GNUC__ >= 10)
#define GCOV_COUNTERS 8
+#else
+#define GCOV_COUNTERS 9
#endif
#define GCOV_TAG_FUNCTION_LENGTH 3
+/* Since GCC 12.1 sizes are in BYTES and not in WORDS (4B). */
+#if (__GNUC__ >= 12)
+#define GCOV_UNIT_SIZE 4
+#else
+#define GCOV_UNIT_SIZE 1
+#endif
+
static struct gcov_info *gcov_info_head;
/**
@@ -63,7 +74,7 @@ struct gcov_fn_info {
unsigned int ident;
unsigned int lineno_checksum;
unsigned int cfg_checksum;
- struct gcov_ctr_info ctrs[0];
+ struct gcov_ctr_info ctrs[];
};
/**
@@ -71,6 +82,7 @@ struct gcov_fn_info {
* @version: gcov version magic indicating the gcc version used for compilation
* @next: list head for a singly-linked list
* @stamp: uniquifying time stamp
+ * @checksum: unique object checksum
* @filename: name of the associated gcov data file
* @merge: merge functions (null for unused counter type)
* @n_functions: number of instrumented functions
@@ -83,6 +95,10 @@ struct gcov_info {
unsigned int version;
struct gcov_info *next;
unsigned int stamp;
+ /* Since GCC 12.1 a checksum field is added. */
+#if (__GNUC__ >= 12)
+ unsigned int checksum;
+#endif
const char *filename;
void (*merge[GCOV_COUNTERS])(gcov_type *, unsigned int);
unsigned int n_functions;
@@ -145,6 +161,18 @@ void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info)
gcov_info_head = info->next;
}
+/**
+ * gcov_info_within_module - check if a profiling data set belongs to a module
+ * @info: profiling data set
+ * @mod: module
+ *
+ * Returns true if profiling data belongs module, false otherwise.
+ */
+bool gcov_info_within_module(struct gcov_info *info, struct module *mod)
+{
+ return within_module((unsigned long)info, mod);
+}
+
/* Symbolic links to be created for each profiling data file. */
const struct gcov_link gcov_link[] = {
{ OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */
@@ -210,10 +238,10 @@ int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2)
/**
* gcov_info_add - add up profiling data
- * @dest: profiling data set to which data is added
- * @source: profiling data set which is added
+ * @dst: profiling data set to which data is added
+ * @src: profiling data set which is added
*
- * Adds profiling counts of @source to @dest.
+ * Adds profiling counts of @src to @dst.
*/
void gcov_info_add(struct gcov_info *dst, struct gcov_info *src)
{
@@ -293,7 +321,7 @@ struct gcov_info *gcov_info_dup(struct gcov_info *info)
cv_size = sizeof(gcov_type) * sci_ptr->num;
- dci_ptr->values = vmalloc(cv_size);
+ dci_ptr->values = kvmalloc(cv_size, GFP_KERNEL);
if (!dci_ptr->values)
goto err_free;
@@ -335,7 +363,7 @@ void gcov_info_free(struct gcov_info *info)
ci_ptr = info->functions[fi_idx]->ctrs;
for (ct_idx = 0; ct_idx < active; ct_idx++, ci_ptr++)
- vfree(ci_ptr->values);
+ kvfree(ci_ptr->values);
kfree(info->functions[fi_idx]);
}
@@ -346,71 +374,6 @@ free_info:
kfree(info);
}
-#define ITER_STRIDE PAGE_SIZE
-
-/**
- * struct gcov_iterator - specifies current file position in logical records
- * @info: associated profiling data
- * @buffer: buffer containing file data
- * @size: size of buffer
- * @pos: current position in file
- */
-struct gcov_iterator {
- struct gcov_info *info;
- void *buffer;
- size_t size;
- loff_t pos;
-};
-
-/**
- * store_gcov_u32 - store 32 bit number in gcov format to buffer
- * @buffer: target buffer or NULL
- * @off: offset into the buffer
- * @v: value to be stored
- *
- * Number format defined by gcc: numbers are recorded in the 32 bit
- * unsigned binary form of the endianness of the machine generating the
- * file. Returns the number of bytes stored. If @buffer is %NULL, doesn't
- * store anything.
- */
-static size_t store_gcov_u32(void *buffer, size_t off, u32 v)
-{
- u32 *data;
-
- if (buffer) {
- data = buffer + off;
- *data = v;
- }
-
- return sizeof(*data);
-}
-
-/**
- * store_gcov_u64 - store 64 bit number in gcov format to buffer
- * @buffer: target buffer or NULL
- * @off: offset into the buffer
- * @v: value to be stored
- *
- * Number format defined by gcc: numbers are recorded in the 32 bit
- * unsigned binary form of the endianness of the machine generating the
- * file. 64 bit numbers are stored as two 32 bit numbers, the low part
- * first. Returns the number of bytes stored. If @buffer is %NULL, doesn't store
- * anything.
- */
-static size_t store_gcov_u64(void *buffer, size_t off, u64 v)
-{
- u32 *data;
-
- if (buffer) {
- data = buffer + off;
-
- data[0] = (v & 0xffffffffUL);
- data[1] = (v >> 32);
- }
-
- return sizeof(*data) * 2;
-}
-
/**
* convert_to_gcda - convert profiling data set to gcda file format
* @buffer: the buffer to store file data or %NULL if no data should be stored
@@ -418,7 +381,7 @@ static size_t store_gcov_u64(void *buffer, size_t off, u64 v)
*
* Returns the number of bytes that were/would have been stored into the buffer.
*/
-static size_t convert_to_gcda(char *buffer, struct gcov_info *info)
+size_t convert_to_gcda(char *buffer, struct gcov_info *info)
{
struct gcov_fn_info *fi_ptr;
struct gcov_ctr_info *ci_ptr;
@@ -432,12 +395,18 @@ static size_t convert_to_gcda(char *buffer, struct gcov_info *info)
pos += store_gcov_u32(buffer, pos, info->version);
pos += store_gcov_u32(buffer, pos, info->stamp);
+#if (__GNUC__ >= 12)
+ /* Use zero as checksum of the compilation unit. */
+ pos += store_gcov_u32(buffer, pos, 0);
+#endif
+
for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) {
fi_ptr = info->functions[fi_idx];
/* Function record. */
pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION);
- pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION_LENGTH);
+ pos += store_gcov_u32(buffer, pos,
+ GCOV_TAG_FUNCTION_LENGTH * GCOV_UNIT_SIZE);
pos += store_gcov_u32(buffer, pos, fi_ptr->ident);
pos += store_gcov_u32(buffer, pos, fi_ptr->lineno_checksum);
pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum);
@@ -451,7 +420,8 @@ static size_t convert_to_gcda(char *buffer, struct gcov_info *info)
/* Counter record. */
pos += store_gcov_u32(buffer, pos,
GCOV_TAG_FOR_COUNTER(ct_idx));
- pos += store_gcov_u32(buffer, pos, ci_ptr->num * 2);
+ pos += store_gcov_u32(buffer, pos,
+ ci_ptr->num * 2 * GCOV_UNIT_SIZE);
for (cv_idx = 0; cv_idx < ci_ptr->num; cv_idx++) {
pos += store_gcov_u64(buffer, pos,
@@ -464,102 +434,3 @@ static size_t convert_to_gcda(char *buffer, struct gcov_info *info)
return pos;
}
-
-/**
- * gcov_iter_new - allocate and initialize profiling data iterator
- * @info: profiling data set to be iterated
- *
- * Return file iterator on success, %NULL otherwise.
- */
-struct gcov_iterator *gcov_iter_new(struct gcov_info *info)
-{
- struct gcov_iterator *iter;
-
- iter = kzalloc(sizeof(struct gcov_iterator), GFP_KERNEL);
- if (!iter)
- goto err_free;
-
- iter->info = info;
- /* Dry-run to get the actual buffer size. */
- iter->size = convert_to_gcda(NULL, info);
- iter->buffer = vmalloc(iter->size);
- if (!iter->buffer)
- goto err_free;
-
- convert_to_gcda(iter->buffer, info);
-
- return iter;
-
-err_free:
- kfree(iter);
- return NULL;
-}
-
-
-/**
- * gcov_iter_get_info - return profiling data set for given file iterator
- * @iter: file iterator
- */
-void gcov_iter_free(struct gcov_iterator *iter)
-{
- vfree(iter->buffer);
- kfree(iter);
-}
-
-/**
- * gcov_iter_get_info - return profiling data set for given file iterator
- * @iter: file iterator
- */
-struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter)
-{
- return iter->info;
-}
-
-/**
- * gcov_iter_start - reset file iterator to starting position
- * @iter: file iterator
- */
-void gcov_iter_start(struct gcov_iterator *iter)
-{
- iter->pos = 0;
-}
-
-/**
- * gcov_iter_next - advance file iterator to next logical record
- * @iter: file iterator
- *
- * Return zero if new position is valid, non-zero if iterator has reached end.
- */
-int gcov_iter_next(struct gcov_iterator *iter)
-{
- if (iter->pos < iter->size)
- iter->pos += ITER_STRIDE;
-
- if (iter->pos >= iter->size)
- return -EINVAL;
-
- return 0;
-}
-
-/**
- * gcov_iter_write - write data for current pos to seq_file
- * @iter: file iterator
- * @seq: seq_file handle
- *
- * Return zero on success, non-zero otherwise.
- */
-int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq)
-{
- size_t len;
-
- if (iter->pos >= iter->size)
- return -EINVAL;
-
- len = ITER_STRIDE;
- if (iter->pos + len > iter->size)
- len = iter->size - iter->pos;
-
- seq_write(seq, iter->buffer + iter->pos, len);
-
- return 0;
-}
diff --git a/kernel/gcov/gcc_base.c b/kernel/gcov/gcc_base.c
new file mode 100644
index 000000000000..3cf736b9f880
--- /dev/null
+++ b/kernel/gcov/gcc_base.c
@@ -0,0 +1,86 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/mutex.h>
+#include "gcov.h"
+
+/*
+ * __gcov_init is called by gcc-generated constructor code for each object
+ * file compiled with -fprofile-arcs.
+ */
+void __gcov_init(struct gcov_info *info)
+{
+ static unsigned int gcov_version;
+
+ mutex_lock(&gcov_lock);
+ if (gcov_version == 0) {
+ gcov_version = gcov_info_version(info);
+ /*
+ * Printing gcc's version magic may prove useful for debugging
+ * incompatibility reports.
+ */
+ pr_info("version magic: 0x%x\n", gcov_version);
+ }
+ /*
+ * Add new profiling data structure to list and inform event
+ * listener.
+ */
+ gcov_info_link(info);
+ if (gcov_events_enabled)
+ gcov_event(GCOV_ADD, info);
+ mutex_unlock(&gcov_lock);
+}
+EXPORT_SYMBOL(__gcov_init);
+
+/*
+ * These functions may be referenced by gcc-generated profiling code but serve
+ * no function for kernel profiling.
+ */
+void __gcov_flush(void)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_flush);
+
+void __gcov_merge_add(gcov_type *counters, unsigned int n_counters)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_add);
+
+void __gcov_merge_single(gcov_type *counters, unsigned int n_counters)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_single);
+
+void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_delta);
+
+void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_ior);
+
+void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_time_profile);
+
+void __gcov_merge_icall_topn(gcov_type *counters, unsigned int n_counters)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_icall_topn);
+
+void __gcov_exit(void)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_exit);
diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h
index 92c8e22a29ed..912b8ea01d33 100644
--- a/kernel/gcov/gcov.h
+++ b/kernel/gcov/gcov.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Profiling infrastructure declarations.
*
@@ -14,6 +15,7 @@
#ifndef GCOV_H
#define GCOV_H GCOV_H
+#include <linux/module.h>
#include <linux/types.h>
/*
@@ -45,6 +47,8 @@ unsigned int gcov_info_version(struct gcov_info *info);
struct gcov_info *gcov_info_next(struct gcov_info *info);
void gcov_info_link(struct gcov_info *info);
void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info);
+bool gcov_info_within_module(struct gcov_info *info, struct module *mod);
+size_t convert_to_gcda(char *buffer, struct gcov_info *info);
/* Base interface. */
enum gcov_action {
@@ -55,16 +59,9 @@ enum gcov_action {
void gcov_event(enum gcov_action action, struct gcov_info *info);
void gcov_enable_events(void);
-/* Iterator control. */
-struct seq_file;
-struct gcov_iterator;
-
-struct gcov_iterator *gcov_iter_new(struct gcov_info *info);
-void gcov_iter_free(struct gcov_iterator *iter);
-void gcov_iter_start(struct gcov_iterator *iter);
-int gcov_iter_next(struct gcov_iterator *iter);
-int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq);
-struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter);
+/* writing helpers */
+size_t store_gcov_u32(void *buffer, size_t off, u32 v);
+size_t store_gcov_u64(void *buffer, size_t off, u64 v);
/* gcov_info control. */
void gcov_info_reset(struct gcov_info *info);
@@ -82,4 +79,7 @@ struct gcov_link {
};
extern const struct gcov_link gcov_link[];
+extern int gcov_events_enabled;
+extern struct mutex gcov_lock;
+
#endif /* GCOV_H */
diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh
new file mode 100755
index 000000000000..896a503dfb29
--- /dev/null
+++ b/kernel/gen_kheaders.sh
@@ -0,0 +1,50 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+
+# This script generates an archive consisting of kernel headers
+# for CONFIG_IKHEADERS.
+set -e
+tarfile=$1
+srclist=$2
+objlist=$3
+timestamp=$4
+
+dir=$(dirname "${tarfile}")
+tmpdir=${dir}/.tmp_dir
+depfile=${dir}/.$(basename "${tarfile}").d
+
+# generate dependency list.
+{
+ echo
+ echo "deps_${tarfile} := \\"
+ sed 's:\(.*\): \1 \\:' "${srclist}"
+ sed -n '/^include\/generated\/autoconf\.h$/!s:\(.*\): \1 \\:p' "${objlist}"
+ echo
+ echo "${tarfile}: \$(deps_${tarfile})"
+ echo
+ echo "\$(deps_${tarfile}):"
+
+} > "${depfile}"
+
+rm -rf "${tmpdir}"
+mkdir "${tmpdir}"
+
+# shellcheck disable=SC2154 # srctree is passed as an env variable
+sed "s:^${srctree}/::" "${srclist}" | ${TAR} -c -f - -C "${srctree}" -T - | ${TAR} -xf - -C "${tmpdir}"
+${TAR} -c -f - -T "${objlist}" | ${TAR} -xf - -C "${tmpdir}"
+
+# Remove comments except SDPX lines
+# Use a temporary file to store directory contents to prevent find/xargs from
+# seeing temporary files created by perl.
+find "${tmpdir}" -type f -print0 > "${tmpdir}.contents.txt"
+xargs -0 -P8 -n1 \
+ perl -pi -e 'BEGIN {undef $/;}; s/\/\*((?!SPDX).)*?\*\///smg;' \
+ < "${tmpdir}.contents.txt"
+rm -f "${tmpdir}.contents.txt"
+
+# Create archive and try to normalize metadata for reproducibility.
+${TAR} "${timestamp:+--mtime=$timestamp}" \
+ --owner=0 --group=0 --sort=name --numeric-owner --mode=u=rw,go=r,a+X \
+ -I "${XZ}" -cf "${tarfile}" -C "${tmpdir}/" . > /dev/null
+
+rm -rf "${tmpdir}"
diff --git a/kernel/groups.c b/kernel/groups.c
index 74d431d25251..9b43da22647d 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Supplementary group IDs
*/
@@ -5,57 +6,29 @@
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/security.h>
+#include <linux/sort.h>
#include <linux/syscalls.h>
#include <linux/user_namespace.h>
-#include <asm/uaccess.h>
+#include <linux/vmalloc.h>
+#include <linux/uaccess.h>
struct group_info *groups_alloc(int gidsetsize)
{
- struct group_info *group_info;
- int nblocks;
- int i;
-
- nblocks = (gidsetsize + NGROUPS_PER_BLOCK - 1) / NGROUPS_PER_BLOCK;
- /* Make sure we always allocate at least one indirect block pointer */
- nblocks = nblocks ? : 1;
- group_info = kmalloc(sizeof(*group_info) + nblocks*sizeof(gid_t *), GFP_USER);
- if (!group_info)
+ struct group_info *gi;
+ gi = kvmalloc(struct_size(gi, gid, gidsetsize), GFP_KERNEL_ACCOUNT);
+ if (!gi)
return NULL;
- group_info->ngroups = gidsetsize;
- group_info->nblocks = nblocks;
- atomic_set(&group_info->usage, 1);
-
- if (gidsetsize <= NGROUPS_SMALL)
- group_info->blocks[0] = group_info->small_block;
- else {
- for (i = 0; i < nblocks; i++) {
- kgid_t *b;
- b = (void *)__get_free_page(GFP_USER);
- if (!b)
- goto out_undo_partial_alloc;
- group_info->blocks[i] = b;
- }
- }
- return group_info;
-out_undo_partial_alloc:
- while (--i >= 0) {
- free_page((unsigned long)group_info->blocks[i]);
- }
- kfree(group_info);
- return NULL;
+ refcount_set(&gi->usage, 1);
+ gi->ngroups = gidsetsize;
+ return gi;
}
EXPORT_SYMBOL(groups_alloc);
void groups_free(struct group_info *group_info)
{
- if (group_info->blocks[0] != group_info->small_block) {
- int i;
- for (i = 0; i < group_info->nblocks; i++)
- free_page((unsigned long)group_info->blocks[i]);
- }
- kfree(group_info);
+ kvfree(group_info);
}
EXPORT_SYMBOL(groups_free);
@@ -70,7 +43,7 @@ static int groups_to_user(gid_t __user *grouplist,
for (i = 0; i < count; i++) {
gid_t gid;
- gid = from_kgid_munged(user_ns, GROUP_AT(group_info, i));
+ gid = from_kgid_munged(user_ns, group_info->gid[i]);
if (put_user(gid, grouplist+i))
return -EFAULT;
}
@@ -95,40 +68,26 @@ static int groups_from_user(struct group_info *group_info,
if (!gid_valid(kgid))
return -EINVAL;
- GROUP_AT(group_info, i) = kgid;
+ group_info->gid[i] = kgid;
}
return 0;
}
-/* a simple Shell sort */
-static void groups_sort(struct group_info *group_info)
+static int gid_cmp(const void *_a, const void *_b)
{
- int base, max, stride;
- int gidsetsize = group_info->ngroups;
-
- for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1)
- ; /* nothing */
- stride /= 3;
-
- while (stride) {
- max = gidsetsize - stride;
- for (base = 0; base < max; base++) {
- int left = base;
- int right = left + stride;
- kgid_t tmp = GROUP_AT(group_info, right);
-
- while (left >= 0 && gid_gt(GROUP_AT(group_info, left), tmp)) {
- GROUP_AT(group_info, right) =
- GROUP_AT(group_info, left);
- right = left;
- left -= stride;
- }
- GROUP_AT(group_info, right) = tmp;
- }
- stride /= 3;
- }
+ kgid_t a = *(kgid_t *)_a;
+ kgid_t b = *(kgid_t *)_b;
+
+ return gid_gt(a, b) - gid_lt(a, b);
}
+void groups_sort(struct group_info *group_info)
+{
+ sort(group_info->gid, group_info->ngroups, sizeof(*group_info->gid),
+ gid_cmp, NULL);
+}
+EXPORT_SYMBOL(groups_sort);
+
/* a simple bsearch */
int groups_search(const struct group_info *group_info, kgid_t grp)
{
@@ -141,9 +100,9 @@ int groups_search(const struct group_info *group_info, kgid_t grp)
right = group_info->ngroups;
while (left < right) {
unsigned int mid = (left+right)/2;
- if (gid_gt(grp, GROUP_AT(group_info, mid)))
+ if (gid_gt(grp, group_info->gid[mid]))
left = mid + 1;
- else if (gid_lt(grp, GROUP_AT(group_info, mid)))
+ else if (gid_lt(grp, group_info->gid[mid]))
right = mid;
else
return 1;
@@ -159,7 +118,6 @@ int groups_search(const struct group_info *group_info, kgid_t grp)
void set_groups(struct cred *new, struct group_info *group_info)
{
put_group_info(new->group_info);
- groups_sort(group_info);
get_group_info(group_info);
new->group_info = group_info;
}
@@ -176,13 +134,26 @@ EXPORT_SYMBOL(set_groups);
int set_current_groups(struct group_info *group_info)
{
struct cred *new;
+ const struct cred *old;
+ int retval;
new = prepare_creds();
if (!new)
return -ENOMEM;
+ old = current_cred();
+
set_groups(new, group_info);
+
+ retval = security_task_fix_setgroups(new, old);
+ if (retval < 0)
+ goto error;
+
return commit_creds(new);
+
+error:
+ abort_creds(new);
+ return retval;
}
EXPORT_SYMBOL(set_current_groups);
@@ -215,7 +186,7 @@ bool may_setgroups(void)
{
struct user_namespace *user_ns = current_user_ns();
- return ns_capable(user_ns, CAP_SETGID) &&
+ return ns_capable_setid(user_ns, CAP_SETGID) &&
userns_may_setgroups(user_ns);
}
@@ -243,6 +214,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
return retval;
}
+ groups_sort(group_info);
retval = set_current_groups(group_info);
put_group_info(group_info);
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index e0f90c2b57aa..d2254c91450b 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Detect Hung Task
*
@@ -14,14 +15,28 @@
#include <linux/kthread.h>
#include <linux/lockdep.h>
#include <linux/export.h>
+#include <linux/panic_notifier.h>
#include <linux/sysctl.h>
+#include <linux/suspend.h>
#include <linux/utsname.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/sysctl.h>
+#include <linux/hung_task.h>
+#include <linux/rwsem.h>
+#include <linux/sys_info.h>
+
#include <trace/events/sched.h>
/*
* The number of tasks checked:
*/
-int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
+static int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
+
+/*
+ * Total number of tasks detected as hung since boot:
+ */
+static unsigned long __read_mostly sysctl_hung_task_detect_count;
/*
* Limit number of tasks checked in a batch.
@@ -30,35 +45,48 @@ int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
* is disabled during the critical section. It also controls the size of
* the RCU grace period. So it needs to be upper-bound.
*/
-#define HUNG_TASK_BATCHING 1024
+#define HUNG_TASK_LOCK_BREAK (HZ / 10)
/*
* Zero means infinite timeout - no checking done:
*/
unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT;
-int __read_mostly sysctl_hung_task_warnings = 10;
+/*
+ * Zero (default value) means use sysctl_hung_task_timeout_secs:
+ */
+static unsigned long __read_mostly sysctl_hung_task_check_interval_secs;
+
+static int __read_mostly sysctl_hung_task_warnings = 10;
static int __read_mostly did_panic;
+static bool hung_task_call_panic;
static struct task_struct *watchdog_task;
/*
- * Should we panic (and reboot, if panic_timeout= is set) when a
- * hung task is detected:
+ * A bitmask to control what kinds of system info to be printed when
+ * a hung task is detected, it could be task, memory, lock etc. Refer
+ * include/linux/sys_info.h for detailed bit definition.
*/
-unsigned int __read_mostly sysctl_hung_task_panic =
- CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE;
+static unsigned long hung_task_si_mask;
-static int __init hung_task_panic_setup(char *str)
-{
- int rc = kstrtouint(str, 0, &sysctl_hung_task_panic);
+#ifdef CONFIG_SMP
+/*
+ * Should we dump all CPUs backtraces in a hung task event?
+ * Defaults to 0, can be changed via sysctl.
+ */
+static unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace;
+#else
+#define sysctl_hung_task_all_cpu_backtrace 0
+#endif /* CONFIG_SMP */
- if (rc)
- return rc;
- return 1;
-}
-__setup("hung_task_panic=", hung_task_panic_setup);
+/*
+ * Should we panic (and reboot, if panic_timeout= is set) when a
+ * hung task is detected:
+ */
+static unsigned int __read_mostly sysctl_hung_task_panic =
+ CONFIG_BOOTPARAM_HUNG_TASK_PANIC;
static int
hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr)
@@ -72,16 +100,19 @@ static struct notifier_block panic_block = {
.notifier_call = hung_task_panic,
};
-static void check_hung_task(struct task_struct *t, unsigned long timeout)
+static bool task_is_hung(struct task_struct *t, unsigned long timeout)
{
unsigned long switch_count = t->nvcsw + t->nivcsw;
+ unsigned int state = READ_ONCE(t->__state);
/*
- * Ensure the task is not frozen.
- * Also, skip vfork and any other user process that freezer should skip.
+ * skip the TASK_KILLABLE tasks -- these can be killed
+ * skip the TASK_IDLE tasks -- those are genuinely idle
+ * skip the TASK_FROZEN task -- it reasonably stops scheduling by freezer
*/
- if (unlikely(t->flags & (PF_FROZEN | PF_FREEZER_SKIP)))
- return;
+ if (!(state & TASK_UNINTERRUPTIBLE) ||
+ (state & (TASK_WAKEKILL | TASK_NOLOAD | TASK_FROZEN)))
+ return false;
/*
* When a freshly created task is scheduled once, changes its state to
@@ -89,42 +120,156 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
* musn't be checked.
*/
if (unlikely(!switch_count))
- return;
+ return false;
if (switch_count != t->last_switch_count) {
t->last_switch_count = switch_count;
+ t->last_switch_time = jiffies;
+ return false;
+ }
+ if (time_is_after_jiffies(t->last_switch_time + timeout * HZ))
+ return false;
+
+ return true;
+}
+
+#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
+static void debug_show_blocker(struct task_struct *task, unsigned long timeout)
+{
+ struct task_struct *g, *t;
+ unsigned long owner, blocker, blocker_type;
+ const char *rwsem_blocked_by, *rwsem_blocked_as;
+
+ RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "No rcu lock held");
+
+ blocker = READ_ONCE(task->blocker);
+ if (!blocker)
+ return;
+
+ blocker_type = hung_task_get_blocker_type(blocker);
+
+ switch (blocker_type) {
+ case BLOCKER_TYPE_MUTEX:
+ owner = mutex_get_owner(hung_task_blocker_to_lock(blocker));
+ break;
+ case BLOCKER_TYPE_SEM:
+ owner = sem_last_holder(hung_task_blocker_to_lock(blocker));
+ break;
+ case BLOCKER_TYPE_RWSEM_READER:
+ case BLOCKER_TYPE_RWSEM_WRITER:
+ owner = (unsigned long)rwsem_owner(
+ hung_task_blocker_to_lock(blocker));
+ rwsem_blocked_as = (blocker_type == BLOCKER_TYPE_RWSEM_READER) ?
+ "reader" : "writer";
+ rwsem_blocked_by = is_rwsem_reader_owned(
+ hung_task_blocker_to_lock(blocker)) ?
+ "reader" : "writer";
+ break;
+ default:
+ WARN_ON_ONCE(1);
return;
}
- trace_sched_process_hang(t);
- if (!sysctl_hung_task_warnings)
+ if (unlikely(!owner)) {
+ switch (blocker_type) {
+ case BLOCKER_TYPE_MUTEX:
+ pr_err("INFO: task %s:%d is blocked on a mutex, but the owner is not found.\n",
+ task->comm, task->pid);
+ break;
+ case BLOCKER_TYPE_SEM:
+ pr_err("INFO: task %s:%d is blocked on a semaphore, but the last holder is not found.\n",
+ task->comm, task->pid);
+ break;
+ case BLOCKER_TYPE_RWSEM_READER:
+ case BLOCKER_TYPE_RWSEM_WRITER:
+ pr_err("INFO: task %s:%d is blocked on an rw-semaphore, but the owner is not found.\n",
+ task->comm, task->pid);
+ break;
+ }
+ return;
+ }
+
+ /* Ensure the owner information is correct. */
+ for_each_process_thread(g, t) {
+ if ((unsigned long)t != owner)
+ continue;
+
+ switch (blocker_type) {
+ case BLOCKER_TYPE_MUTEX:
+ pr_err("INFO: task %s:%d is blocked on a mutex likely owned by task %s:%d.\n",
+ task->comm, task->pid, t->comm, t->pid);
+ break;
+ case BLOCKER_TYPE_SEM:
+ pr_err("INFO: task %s:%d blocked on a semaphore likely last held by task %s:%d\n",
+ task->comm, task->pid, t->comm, t->pid);
+ break;
+ case BLOCKER_TYPE_RWSEM_READER:
+ case BLOCKER_TYPE_RWSEM_WRITER:
+ pr_err("INFO: task %s:%d <%s> blocked on an rw-semaphore likely owned by task %s:%d <%s>\n",
+ task->comm, task->pid, rwsem_blocked_as, t->comm,
+ t->pid, rwsem_blocked_by);
+ break;
+ }
+ /* Avoid duplicated task dump, skip if the task is also hung. */
+ if (!task_is_hung(t, timeout))
+ sched_show_task(t);
+ return;
+ }
+}
+#else
+static inline void debug_show_blocker(struct task_struct *task, unsigned long timeout)
+{
+}
+#endif
+
+static void check_hung_task(struct task_struct *t, unsigned long timeout,
+ unsigned long prev_detect_count)
+{
+ unsigned long total_hung_task;
+
+ if (!task_is_hung(t, timeout))
return;
- if (sysctl_hung_task_warnings > 0)
- sysctl_hung_task_warnings--;
+ /*
+ * This counter tracks the total number of tasks detected as hung
+ * since boot.
+ */
+ sysctl_hung_task_detect_count++;
+
+ total_hung_task = sysctl_hung_task_detect_count - prev_detect_count;
+ trace_sched_process_hang(t);
+
+ if (sysctl_hung_task_panic && total_hung_task >= sysctl_hung_task_panic) {
+ console_verbose();
+ hung_task_call_panic = true;
+ }
/*
* Ok, the task did not get scheduled for more than 2 minutes,
* complain:
*/
- pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n",
- t->comm, t->pid, timeout);
- pr_err(" %s %s %.*s\n",
- print_tainted(), init_utsname()->release,
- (int)strcspn(init_utsname()->version, " "),
- init_utsname()->version);
- pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
- " disables this message.\n");
- sched_show_task(t);
- debug_show_held_locks(t);
+ if (sysctl_hung_task_warnings || hung_task_call_panic) {
+ if (sysctl_hung_task_warnings > 0)
+ sysctl_hung_task_warnings--;
+ pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n",
+ t->comm, t->pid, (jiffies - t->last_switch_time) / HZ);
+ pr_err(" %s %s %.*s\n",
+ print_tainted(), init_utsname()->release,
+ (int)strcspn(init_utsname()->version, " "),
+ init_utsname()->version);
+ if (t->flags & PF_POSTCOREDUMP)
+ pr_err(" Blocked by coredump.\n");
+ pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
+ " disables this message.\n");
+ sched_show_task(t);
+ debug_show_blocker(t, timeout);
+
+ if (!sysctl_hung_task_warnings)
+ pr_info("Future hung task reports are suppressed, see sysctl kernel.hung_task_warnings\n");
+ }
touch_nmi_watchdog();
-
- if (sysctl_hung_task_panic) {
- trigger_all_cpu_backtrace();
- panic("hung_task: blocked tasks");
- }
}
/*
@@ -158,8 +303,11 @@ static bool rcu_lock_break(struct task_struct *g, struct task_struct *t)
static void check_hung_uninterruptible_tasks(unsigned long timeout)
{
int max_count = sysctl_hung_task_check_count;
- int batch_count = HUNG_TASK_BATCHING;
+ unsigned long last_break = jiffies;
struct task_struct *g, *t;
+ unsigned long prev_detect_count = sysctl_hung_task_detect_count;
+ int need_warning = sysctl_hung_task_warnings;
+ unsigned long si_mask = hung_task_si_mask;
/*
* If the system crashed already then all bets are off,
@@ -168,34 +316,53 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
if (test_taint(TAINT_DIE) || did_panic)
return;
+
rcu_read_lock();
for_each_process_thread(g, t) {
+
if (!max_count--)
goto unlock;
- if (!--batch_count) {
- batch_count = HUNG_TASK_BATCHING;
+ if (time_after(jiffies, last_break + HUNG_TASK_LOCK_BREAK)) {
if (!rcu_lock_break(g, t))
goto unlock;
+ last_break = jiffies;
}
- /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
- if (t->state == TASK_UNINTERRUPTIBLE)
- check_hung_task(t, timeout);
+
+ check_hung_task(t, timeout, prev_detect_count);
}
unlock:
rcu_read_unlock();
+
+ if (!(sysctl_hung_task_detect_count - prev_detect_count))
+ return;
+
+ if (need_warning || hung_task_call_panic) {
+ si_mask |= SYS_INFO_LOCKS;
+
+ if (sysctl_hung_task_all_cpu_backtrace)
+ si_mask |= SYS_INFO_ALL_BT;
+ }
+
+ sys_info(si_mask);
+
+ if (hung_task_call_panic)
+ panic("hung_task: blocked tasks");
}
-static unsigned long timeout_jiffies(unsigned long timeout)
+static long hung_timeout_jiffies(unsigned long last_checked,
+ unsigned long timeout)
{
/* timeout of 0 will disable the watchdog */
- return timeout ? timeout * HZ : MAX_SCHEDULE_TIMEOUT;
+ return timeout ? last_checked - jiffies + timeout * HZ :
+ MAX_SCHEDULE_TIMEOUT;
}
+#ifdef CONFIG_SYSCTL
/*
* Process updating of timeout sysctl
*/
-int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
- void __user *buffer,
+static int proc_dohung_task_timeout_secs(const struct ctl_table *table, int write,
+ void *buffer,
size_t *lenp, loff_t *ppos)
{
int ret;
@@ -211,6 +378,89 @@ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
return ret;
}
+/*
+ * This is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs
+ * and hung_task_check_interval_secs
+ */
+static const unsigned long hung_task_timeout_max = (LONG_MAX / HZ);
+static const struct ctl_table hung_task_sysctls[] = {
+#ifdef CONFIG_SMP
+ {
+ .procname = "hung_task_all_cpu_backtrace",
+ .data = &sysctl_hung_task_all_cpu_backtrace,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+#endif /* CONFIG_SMP */
+ {
+ .procname = "hung_task_panic",
+ .data = &sysctl_hung_task_panic,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_INT_MAX,
+ },
+ {
+ .procname = "hung_task_check_count",
+ .data = &sysctl_hung_task_check_count,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ },
+ {
+ .procname = "hung_task_timeout_secs",
+ .data = &sysctl_hung_task_timeout_secs,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = proc_dohung_task_timeout_secs,
+ .extra2 = (void *)&hung_task_timeout_max,
+ },
+ {
+ .procname = "hung_task_check_interval_secs",
+ .data = &sysctl_hung_task_check_interval_secs,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = proc_dohung_task_timeout_secs,
+ .extra2 = (void *)&hung_task_timeout_max,
+ },
+ {
+ .procname = "hung_task_warnings",
+ .data = &sysctl_hung_task_warnings,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_NEG_ONE,
+ },
+ {
+ .procname = "hung_task_detect_count",
+ .data = &sysctl_hung_task_detect_count,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0444,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+ {
+ .procname = "hung_task_sys_info",
+ .data = &hung_task_si_mask,
+ .maxlen = sizeof(hung_task_si_mask),
+ .mode = 0644,
+ .proc_handler = sysctl_sys_info_handler,
+ },
+};
+
+static void __init hung_task_sysctl_init(void)
+{
+ register_sysctl_init("kernel", hung_task_sysctls);
+}
+#else
+#define hung_task_sysctl_init() do { } while (0)
+#endif /* CONFIG_SYSCTL */
+
+
static atomic_t reset_hung_task = ATOMIC_INIT(0);
void reset_hung_task_detector(void)
@@ -219,23 +469,54 @@ void reset_hung_task_detector(void)
}
EXPORT_SYMBOL_GPL(reset_hung_task_detector);
+static bool hung_detector_suspended;
+
+static int hungtask_pm_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ switch (action) {
+ case PM_SUSPEND_PREPARE:
+ case PM_HIBERNATION_PREPARE:
+ case PM_RESTORE_PREPARE:
+ hung_detector_suspended = true;
+ break;
+ case PM_POST_SUSPEND:
+ case PM_POST_HIBERNATION:
+ case PM_POST_RESTORE:
+ hung_detector_suspended = false;
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
/*
* kthread which checks for tasks stuck in D state
*/
static int watchdog(void *dummy)
{
+ unsigned long hung_last_checked = jiffies;
+
set_user_nice(current, 0);
for ( ; ; ) {
unsigned long timeout = sysctl_hung_task_timeout_secs;
-
- while (schedule_timeout_interruptible(timeout_jiffies(timeout)))
- timeout = sysctl_hung_task_timeout_secs;
-
- if (atomic_xchg(&reset_hung_task, 0))
+ unsigned long interval = sysctl_hung_task_check_interval_secs;
+ long t;
+
+ if (interval == 0)
+ interval = timeout;
+ interval = min_t(unsigned long, interval, timeout);
+ t = hung_timeout_jiffies(hung_last_checked, interval);
+ if (t <= 0) {
+ if (!atomic_xchg(&reset_hung_task, 0) &&
+ !hung_detector_suspended)
+ check_hung_uninterruptible_tasks(timeout);
+ hung_last_checked = jiffies;
continue;
-
- check_hung_uninterruptible_tasks(timeout);
+ }
+ schedule_timeout_interruptible(t);
}
return 0;
@@ -244,7 +525,12 @@ static int watchdog(void *dummy)
static int __init hung_task_init(void)
{
atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
+
+ /* Disable hung task detector on suspend */
+ pm_notifier(hungtask_pm_notify, 0);
+
watchdog_task = kthread_run(watchdog, NULL, "khungtaskd");
+ hung_task_sysctl_init();
return 0;
}
diff --git a/kernel/iomem.c b/kernel/iomem.c
new file mode 100644
index 000000000000..75e61c1c6bc0
--- /dev/null
+++ b/kernel/iomem.c
@@ -0,0 +1,165 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/device.h>
+#include <linux/types.h>
+#include <linux/io.h>
+#include <linux/mm.h>
+#include <linux/ioremap.h>
+
+#ifndef arch_memremap_wb
+static void *arch_memremap_wb(resource_size_t offset, unsigned long size,
+ unsigned long flags)
+{
+#ifdef ioremap_cache
+ return (__force void *)ioremap_cache(offset, size);
+#else
+ return (__force void *)ioremap(offset, size);
+#endif
+}
+#endif
+
+#ifndef arch_memremap_can_ram_remap
+static bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size,
+ unsigned long flags)
+{
+ return true;
+}
+#endif
+
+static void *try_ram_remap(resource_size_t offset, size_t size,
+ unsigned long flags)
+{
+ unsigned long pfn = PHYS_PFN(offset);
+
+ /* In the simple case just return the existing linear address */
+ if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn)) &&
+ arch_memremap_can_ram_remap(offset, size, flags))
+ return __va(offset);
+
+ return NULL; /* fallback to arch_memremap_wb */
+}
+
+/**
+ * memremap() - remap an iomem_resource as cacheable memory
+ * @offset: iomem resource start address
+ * @size: size of remap
+ * @flags: any of MEMREMAP_WB, MEMREMAP_WT, MEMREMAP_WC,
+ * MEMREMAP_ENC, MEMREMAP_DEC
+ *
+ * memremap() is "ioremap" for cases where it is known that the resource
+ * being mapped does not have i/o side effects and the __iomem
+ * annotation is not applicable. In the case of multiple flags, the different
+ * mapping types will be attempted in the order listed below until one of
+ * them succeeds.
+ *
+ * MEMREMAP_WB - matches the default mapping for System RAM on
+ * the architecture. This is usually a read-allocate write-back cache.
+ * Moreover, if MEMREMAP_WB is specified and the requested remap region is RAM
+ * memremap() will bypass establishing a new mapping and instead return
+ * a pointer into the direct map.
+ *
+ * MEMREMAP_WT - establish a mapping whereby writes either bypass the
+ * cache or are written through to memory and never exist in a
+ * cache-dirty state with respect to program visibility. Attempts to
+ * map System RAM with this mapping type will fail.
+ *
+ * MEMREMAP_WC - establish a writecombine mapping, whereby writes may
+ * be coalesced together (e.g. in the CPU's write buffers), but is otherwise
+ * uncached. Attempts to map System RAM with this mapping type will fail.
+ */
+void *memremap(resource_size_t offset, size_t size, unsigned long flags)
+{
+ int is_ram = region_intersects(offset, size,
+ IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE);
+ void *addr = NULL;
+
+ if (!flags)
+ return NULL;
+
+ if (is_ram == REGION_MIXED) {
+ WARN_ONCE(1, "memremap attempted on mixed range %pa size: %#lx\n",
+ &offset, (unsigned long) size);
+ return NULL;
+ }
+
+ /* Try all mapping types requested until one returns non-NULL */
+ if (flags & MEMREMAP_WB) {
+ /*
+ * MEMREMAP_WB is special in that it can be satisfied
+ * from the direct map. Some archs depend on the
+ * capability of memremap() to autodetect cases where
+ * the requested range is potentially in System RAM.
+ */
+ if (is_ram == REGION_INTERSECTS)
+ addr = try_ram_remap(offset, size, flags);
+ if (!addr)
+ addr = arch_memremap_wb(offset, size, flags);
+ }
+
+ /*
+ * If we don't have a mapping yet and other request flags are
+ * present then we will be attempting to establish a new virtual
+ * address mapping. Enforce that this mapping is not aliasing
+ * System RAM.
+ */
+ if (!addr && is_ram == REGION_INTERSECTS && flags != MEMREMAP_WB) {
+ WARN_ONCE(1, "memremap attempted on ram %pa size: %#lx\n",
+ &offset, (unsigned long) size);
+ return NULL;
+ }
+
+ if (!addr && (flags & MEMREMAP_WT))
+ addr = ioremap_wt(offset, size);
+
+ if (!addr && (flags & MEMREMAP_WC))
+ addr = ioremap_wc(offset, size);
+
+ return addr;
+}
+EXPORT_SYMBOL(memremap);
+
+void memunmap(void *addr)
+{
+ if (is_ioremap_addr(addr))
+ iounmap((void __iomem *) addr);
+}
+EXPORT_SYMBOL(memunmap);
+
+static void devm_memremap_release(struct device *dev, void *res)
+{
+ memunmap(*(void **)res);
+}
+
+static int devm_memremap_match(struct device *dev, void *res, void *match_data)
+{
+ return *(void **)res == match_data;
+}
+
+void *devm_memremap(struct device *dev, resource_size_t offset,
+ size_t size, unsigned long flags)
+{
+ void **ptr, *addr;
+
+ ptr = devres_alloc_node(devm_memremap_release, sizeof(*ptr), GFP_KERNEL,
+ dev_to_node(dev));
+ if (!ptr)
+ return ERR_PTR(-ENOMEM);
+
+ addr = memremap(offset, size, flags);
+ if (addr) {
+ *ptr = addr;
+ devres_add(dev, ptr);
+ } else {
+ devres_free(ptr);
+ return ERR_PTR(-ENXIO);
+ }
+
+ return addr;
+}
+EXPORT_SYMBOL(devm_memremap);
+
+void devm_memunmap(struct device *dev, void *addr)
+{
+ WARN_ON(devres_release(dev, devm_memremap_release,
+ devm_memremap_match, addr));
+}
+EXPORT_SYMBOL(devm_memunmap);
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 9a76e3beda54..1b4254d19a73 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
menu "IRQ subsystem"
# Options selectable by the architecture code
@@ -5,10 +6,6 @@ menu "IRQ subsystem"
config MAY_HAVE_SPARSE_IRQ
bool
-# Legacy support, required for itanic
-config GENERIC_IRQ_LEGACY
- bool
-
# Enable the generic irq autoprobe mechanism
config GENERIC_IRQ_PROBE
bool
@@ -21,31 +18,31 @@ config GENERIC_IRQ_SHOW
config GENERIC_IRQ_SHOW_LEVEL
bool
-# Facility to allocate a hardware interrupt. This is legacy support
-# and should not be used in new code. Use irq domains instead.
-config GENERIC_IRQ_LEGACY_ALLOC_HWIRQ
+# Supports effective affinity mask
+config GENERIC_IRQ_EFFECTIVE_AFF_MASK
+ depends on SMP
bool
# Support for delayed migration from interrupt context
config GENERIC_PENDING_IRQ
bool
+# Support for generic irq migrating off cpu before the cpu is offline.
+config GENERIC_IRQ_MIGRATION
+ bool
+
# Alpha specific irq affinity mechanism
config AUTO_IRQ_AFFINITY
bool
+# Interrupt injection mechanism
+config GENERIC_IRQ_INJECTION
+ bool
+
# Tasklet based software resend for pending interrupts on enable_irq()
config HARDIRQS_SW_RESEND
bool
-# Preflow handler support for fasteoi (sparc64)
-config IRQ_PREFLOW_FASTEOI
- bool
-
-# Edge style eoi based handler (cell)
-config IRQ_EDGE_EOI_HANDLER
- bool
-
# Generic configurable interrupt chip implementation
config GENERIC_IRQ_CHIP
bool
@@ -55,33 +52,58 @@ config GENERIC_IRQ_CHIP
config IRQ_DOMAIN
bool
+# Support for simulated interrupts
+config IRQ_SIM
+ bool
+ select IRQ_WORK
+ select IRQ_DOMAIN
+
# Support for hierarchical irq domains
config IRQ_DOMAIN_HIERARCHY
bool
select IRQ_DOMAIN
-# Generic MSI interrupt support
-config GENERIC_MSI_IRQ
+# Support for obsolete non-mapping irq domains
+config IRQ_DOMAIN_NOMAP
+ bool
+ select IRQ_DOMAIN
+
+# Support for hierarchical fasteoi+edge and fasteoi+level handlers
+config IRQ_FASTEOI_HIERARCHY_HANDLERS
+ bool
+
+# Generic IRQ IPI support
+config GENERIC_IRQ_IPI
+ bool
+ depends on SMP
+ select IRQ_DOMAIN_HIERARCHY
+
+# Generic IRQ IPI Mux support
+config GENERIC_IRQ_IPI_MUX
bool
+ depends on SMP
# Generic MSI hierarchical interrupt domain support
-config GENERIC_MSI_IRQ_DOMAIN
+config GENERIC_MSI_IRQ
bool
select IRQ_DOMAIN_HIERARCHY
- select GENERIC_MSI_IRQ
-config HANDLE_DOMAIN_IRQ
+# irqchip drivers should select this if they call iommu_dma_prepare_msi()
+config IRQ_MSI_IOMMU
bool
-config IRQ_DOMAIN_DEBUG
- bool "Expose hardware/virtual IRQ mapping via debugfs"
- depends on IRQ_DOMAIN && DEBUG_FS
- help
- This option will show the mapping relationship between hardware irq
- numbers and Linux irq numbers. The mapping is exposed via debugfs
- in the file "irq_domain_mapping".
+config IRQ_TIMINGS
+ bool
+
+config GENERIC_IRQ_MATRIX_ALLOCATOR
+ bool
- If you don't know what this means you don't need it.
+config GENERIC_IRQ_RESERVATION_MODE
+ bool
+
+# Snapshot for interrupt statistics
+config GENERIC_IRQ_STAT_SNAPSHOT
+ bool
# Support forced irq threading
config IRQ_FORCED_THREADING
@@ -89,7 +111,7 @@ config IRQ_FORCED_THREADING
config SPARSE_IRQ
bool "Support sparse irq numbering" if MAY_HAVE_SPARSE_IRQ
- ---help---
+ help
Sparse irq numbering is useful for distro kernels that want
to define a high CONFIG_NR_CPUS value but still want to have
@@ -100,4 +122,47 @@ config SPARSE_IRQ
If you don't know what to do here, say N.
+config GENERIC_IRQ_DEBUGFS
+ bool "Expose irq internals in debugfs"
+ depends on DEBUG_FS
+ select GENERIC_IRQ_INJECTION
+ default n
+ help
+
+ Exposes internal state information through debugfs. Mostly for
+ developers and debugging of hard to diagnose interrupt problems.
+
+ If you don't know what to do here, say N.
+
+# Clear forwarded VM interrupts during kexec.
+# This option ensures the kernel clears active states for interrupts
+# forwarded to virtual machines (VMs) during a machine kexec.
+config GENERIC_IRQ_KEXEC_CLEAR_VM_FORWARD
+ bool
+
+config IRQ_KUNIT_TEST
+ bool "KUnit tests for IRQ management APIs" if !KUNIT_ALL_TESTS
+ depends on KUNIT=y
+ depends on SPARSE_IRQ
+ default KUNIT_ALL_TESTS
+ select IRQ_DOMAIN
+ imply SMP
+ help
+ This option enables KUnit tests for the IRQ subsystem API. These are
+ only for development and testing, not for regular kernel use cases.
+
+ If unsure, say N.
+
endmenu
+
+config GENERIC_IRQ_MULTI_HANDLER
+ bool
+ help
+ Allow to specify the low level IRQ handler at run time.
+
+# Cavium Octeon is the last system to use this deprecated option
+# Do not even think of enabling this on any new platform
+config DEPRECATED_IRQ_CPU_ONOFFLINE
+ bool
+ depends on CAVIUM_OCTEON_SOC
+ default CAVIUM_OCTEON_SOC
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index d12123526e2b..6ab3a4055667 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -1,9 +1,22 @@
+# SPDX-License-Identifier: GPL-2.0
-obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o
+obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o kexec.o
+obj-$(CONFIG_IRQ_TIMINGS) += timings.o
+ifeq ($(CONFIG_TEST_IRQ_TIMINGS),y)
+ CFLAGS_timings.o += -DDEBUG
+endif
obj-$(CONFIG_GENERIC_IRQ_CHIP) += generic-chip.o
obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o
+obj-$(CONFIG_IRQ_SIM) += irq_sim.o
obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
+obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o
obj-$(CONFIG_PM_SLEEP) += pm.o
obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o
+obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o
+obj-$(CONFIG_GENERIC_IRQ_IPI_MUX) += ipi-mux.o
+obj-$(CONFIG_SMP) += affinity.o
+obj-$(CONFIG_GENERIC_IRQ_DEBUGFS) += debugfs.o
+obj-$(CONFIG_GENERIC_IRQ_MATRIX_ALLOCATOR) += matrix.o
+obj-$(CONFIG_IRQ_KUNIT_TEST) += irq_test.o
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
new file mode 100644
index 000000000000..4013e6ad2b2f
--- /dev/null
+++ b/kernel/irq/affinity.c
@@ -0,0 +1,127 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2016 Thomas Gleixner.
+ * Copyright (C) 2016-2017 Christoph Hellwig.
+ */
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/cpu.h>
+#include <linux/group_cpus.h>
+
+static void default_calc_sets(struct irq_affinity *affd, unsigned int affvecs)
+{
+ affd->nr_sets = 1;
+ affd->set_size[0] = affvecs;
+}
+
+/**
+ * irq_create_affinity_masks - Create affinity masks for multiqueue spreading
+ * @nvecs: The total number of vectors
+ * @affd: Description of the affinity requirements
+ *
+ * Returns the irq_affinity_desc pointer or NULL if allocation failed.
+ */
+struct irq_affinity_desc *
+irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)
+{
+ unsigned int affvecs, curvec, usedvecs, i;
+ struct irq_affinity_desc *masks = NULL;
+
+ /*
+ * Determine the number of vectors which need interrupt affinities
+ * assigned. If the pre/post request exhausts the available vectors
+ * then nothing to do here except for invoking the calc_sets()
+ * callback so the device driver can adjust to the situation.
+ */
+ if (nvecs > affd->pre_vectors + affd->post_vectors)
+ affvecs = nvecs - affd->pre_vectors - affd->post_vectors;
+ else
+ affvecs = 0;
+
+ /*
+ * Simple invocations do not provide a calc_sets() callback. Install
+ * the generic one.
+ */
+ if (!affd->calc_sets)
+ affd->calc_sets = default_calc_sets;
+
+ /* Recalculate the sets */
+ affd->calc_sets(affd, affvecs);
+
+ if (WARN_ON_ONCE(affd->nr_sets > IRQ_AFFINITY_MAX_SETS))
+ return NULL;
+
+ /* Nothing to assign? */
+ if (!affvecs)
+ return NULL;
+
+ masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL);
+ if (!masks)
+ return NULL;
+
+ /* Fill out vectors at the beginning that don't need affinity */
+ for (curvec = 0; curvec < affd->pre_vectors; curvec++)
+ cpumask_copy(&masks[curvec].mask, irq_default_affinity);
+
+ /*
+ * Spread on present CPUs starting from affd->pre_vectors. If we
+ * have multiple sets, build each sets affinity mask separately.
+ */
+ for (i = 0, usedvecs = 0; i < affd->nr_sets; i++) {
+ unsigned int nr_masks, this_vecs = affd->set_size[i];
+ struct cpumask *result = group_cpus_evenly(this_vecs, &nr_masks);
+
+ if (!result) {
+ kfree(masks);
+ return NULL;
+ }
+
+ for (int j = 0; j < nr_masks; j++)
+ cpumask_copy(&masks[curvec + j].mask, &result[j]);
+ kfree(result);
+
+ curvec += nr_masks;
+ usedvecs += nr_masks;
+ }
+
+ /* Fill out vectors at the end that don't need affinity */
+ if (usedvecs >= affvecs)
+ curvec = affd->pre_vectors + affvecs;
+ else
+ curvec = affd->pre_vectors + usedvecs;
+ for (; curvec < nvecs; curvec++)
+ cpumask_copy(&masks[curvec].mask, irq_default_affinity);
+
+ /* Mark the managed interrupts */
+ for (i = affd->pre_vectors; i < nvecs - affd->post_vectors; i++)
+ masks[i].is_managed = 1;
+
+ return masks;
+}
+
+/**
+ * irq_calc_affinity_vectors - Calculate the optimal number of vectors
+ * @minvec: The minimum number of vectors available
+ * @maxvec: The maximum number of vectors available
+ * @affd: Description of the affinity requirements
+ */
+unsigned int irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec,
+ const struct irq_affinity *affd)
+{
+ unsigned int resv = affd->pre_vectors + affd->post_vectors;
+ unsigned int set_vecs;
+
+ if (resv > minvec)
+ return 0;
+
+ if (affd->calc_sets) {
+ set_vecs = maxvec - resv;
+ } else {
+ cpus_read_lock();
+ set_vecs = cpumask_weight(cpu_possible_mask);
+ cpus_read_unlock();
+ }
+
+ return resv + min(set_vecs, maxvec - resv);
+}
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 0119b9d467ae..d0af8a8b3ae6 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -1,6 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0
/*
- * linux/kernel/irq/autoprobe.c
- *
* Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
*
* This file contains the interrupt probing code and driver APIs.
@@ -44,18 +43,16 @@ unsigned long probe_irq_on(void)
* flush such a longstanding irq before considering it as spurious.
*/
for_each_irq_desc_reverse(i, desc) {
- raw_spin_lock_irq(&desc->lock);
+ guard(raw_spinlock_irq)(&desc->lock);
if (!desc->action && irq_settings_can_probe(desc)) {
/*
* Some chips need to know about probing in
* progress:
*/
if (desc->irq_data.chip->irq_set_type)
- desc->irq_data.chip->irq_set_type(&desc->irq_data,
- IRQ_TYPE_PROBE);
- irq_startup(desc, false);
+ desc->irq_data.chip->irq_set_type(&desc->irq_data, IRQ_TYPE_PROBE);
+ irq_activate_and_startup(desc, IRQ_NORESEND);
}
- raw_spin_unlock_irq(&desc->lock);
}
/* Wait for longstanding interrupts to trigger. */
@@ -67,13 +64,12 @@ unsigned long probe_irq_on(void)
* happened in the previous stage, it may have masked itself)
*/
for_each_irq_desc_reverse(i, desc) {
- raw_spin_lock_irq(&desc->lock);
+ guard(raw_spinlock_irq)(&desc->lock);
if (!desc->action && irq_settings_can_probe(desc)) {
desc->istate |= IRQS_AUTODETECT | IRQS_WAITING;
- if (irq_startup(desc, false))
+ if (irq_activate_and_startup(desc, IRQ_NORESEND))
desc->istate |= IRQS_PENDING;
}
- raw_spin_unlock_irq(&desc->lock);
}
/*
@@ -85,18 +81,16 @@ unsigned long probe_irq_on(void)
* Now filter out any obviously spurious interrupts
*/
for_each_irq_desc(i, desc) {
- raw_spin_lock_irq(&desc->lock);
-
+ guard(raw_spinlock_irq)(&desc->lock);
if (desc->istate & IRQS_AUTODETECT) {
/* It triggered already - consider it spurious. */
if (!(desc->istate & IRQS_WAITING)) {
desc->istate &= ~IRQS_AUTODETECT;
- irq_shutdown(desc);
- } else
- if (i < 32)
- mask |= 1 << i;
+ irq_shutdown_and_deactivate(desc);
+ } else if (i < 32) {
+ mask |= 1 << i;
+ }
}
- raw_spin_unlock_irq(&desc->lock);
}
return mask;
@@ -122,15 +116,14 @@ unsigned int probe_irq_mask(unsigned long val)
int i;
for_each_irq_desc(i, desc) {
- raw_spin_lock_irq(&desc->lock);
+ guard(raw_spinlock_irq)(&desc->lock);
if (desc->istate & IRQS_AUTODETECT) {
if (i < 16 && !(desc->istate & IRQS_WAITING))
mask |= 1 << i;
desc->istate &= ~IRQS_AUTODETECT;
- irq_shutdown(desc);
+ irq_shutdown_and_deactivate(desc);
}
- raw_spin_unlock_irq(&desc->lock);
}
mutex_unlock(&probing_active);
@@ -161,8 +154,7 @@ int probe_irq_off(unsigned long val)
struct irq_desc *desc;
for_each_irq_desc(i, desc) {
- raw_spin_lock_irq(&desc->lock);
-
+ guard(raw_spinlock_irq)(&desc->lock);
if (desc->istate & IRQS_AUTODETECT) {
if (!(desc->istate & IRQS_WAITING)) {
if (!nr_of_irqs)
@@ -170,9 +162,8 @@ int probe_irq_off(unsigned long val)
nr_of_irqs++;
}
desc->istate &= ~IRQS_AUTODETECT;
- irq_shutdown(desc);
+ irq_shutdown_and_deactivate(desc);
}
- raw_spin_unlock_irq(&desc->lock);
}
mutex_unlock(&probing_active);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index eb9a4ea394ab..678f094d261a 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -1,13 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0
/*
- * linux/kernel/irq/chip.c
- *
* Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
* Copyright (C) 2005-2006, Thomas Gleixner, Russell King
*
- * This file contains the core interrupt handling code, for irq-chip
- * based architectures.
- *
- * Detailed information is available in Documentation/DocBook/genericirq
+ * This file contains the core interrupt handling code, for irq-chip based
+ * architectures. Detailed information is available in
+ * Documentation/core-api/genericirq.rst
*/
#include <linux/irq.h>
@@ -21,103 +19,95 @@
#include "internals.h"
-/**
- * irq_set_chip - set the irq chip for an irq
- * @irq: irq number
- * @chip: pointer to irq chip description structure
- */
-int irq_set_chip(unsigned int irq, struct irq_chip *chip)
+static irqreturn_t bad_chained_irq(int irq, void *dev_id)
{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
+ WARN_ONCE(1, "Chained irq %d should not call an action\n", irq);
+ return IRQ_NONE;
+}
- if (!desc)
- return -EINVAL;
+/*
+ * Chained handlers should never call action on their IRQ. This default
+ * action will emit warning if such thing happens.
+ */
+struct irqaction chained_action = {
+ .handler = bad_chained_irq,
+};
- if (!chip)
- chip = &no_irq_chip;
+/**
+ * irq_set_chip - set the irq chip for an irq
+ * @irq: irq number
+ * @chip: pointer to irq chip description structure
+ */
+int irq_set_chip(unsigned int irq, const struct irq_chip *chip)
+{
+ int ret = -EINVAL;
- desc->irq_data.chip = chip;
- irq_put_desc_unlock(desc, flags);
- /*
- * For !CONFIG_SPARSE_IRQ make the irq show up in
- * allocated_irqs.
- */
- irq_mark_irq(irq);
- return 0;
+ scoped_irqdesc_get_and_lock(irq, 0) {
+ scoped_irqdesc->irq_data.chip = (struct irq_chip *)(chip ?: &no_irq_chip);
+ ret = 0;
+ }
+ /* For !CONFIG_SPARSE_IRQ make the irq show up in allocated_irqs. */
+ if (!ret)
+ irq_mark_irq(irq);
+ return ret;
}
EXPORT_SYMBOL(irq_set_chip);
/**
- * irq_set_type - set the irq trigger type for an irq
- * @irq: irq number
- * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h
+ * irq_set_irq_type - set the irq trigger type for an irq
+ * @irq: irq number
+ * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h
*/
int irq_set_irq_type(unsigned int irq, unsigned int type)
{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
- int ret = 0;
-
- if (!desc)
- return -EINVAL;
-
- type &= IRQ_TYPE_SENSE_MASK;
- ret = __irq_set_trigger(desc, irq, type);
- irq_put_desc_busunlock(desc, flags);
- return ret;
+ scoped_irqdesc_get_and_buslock(irq, IRQ_GET_DESC_CHECK_GLOBAL)
+ return __irq_set_trigger(scoped_irqdesc, type);
+ return -EINVAL;
}
EXPORT_SYMBOL(irq_set_irq_type);
/**
- * irq_set_handler_data - set irq handler data for an irq
- * @irq: Interrupt number
- * @data: Pointer to interrupt specific data
+ * irq_set_handler_data - set irq handler data for an irq
+ * @irq: Interrupt number
+ * @data: Pointer to interrupt specific data
*
- * Set the hardware irq controller data for an irq
+ * Set the hardware irq controller data for an irq
*/
int irq_set_handler_data(unsigned int irq, void *data)
{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
-
- if (!desc)
- return -EINVAL;
- desc->irq_data.handler_data = data;
- irq_put_desc_unlock(desc, flags);
- return 0;
+ scoped_irqdesc_get_and_lock(irq, 0) {
+ scoped_irqdesc->irq_common_data.handler_data = data;
+ return 0;
+ }
+ return -EINVAL;
}
EXPORT_SYMBOL(irq_set_handler_data);
/**
- * irq_set_msi_desc_off - set MSI descriptor data for an irq at offset
- * @irq_base: Interrupt number base
- * @irq_offset: Interrupt number offset
- * @entry: Pointer to MSI descriptor data
+ * irq_set_msi_desc_off - set MSI descriptor data for an irq at offset
+ * @irq_base: Interrupt number base
+ * @irq_offset: Interrupt number offset
+ * @entry: Pointer to MSI descriptor data
*
- * Set the MSI descriptor entry for an irq at offset
+ * Set the MSI descriptor entry for an irq at offset
*/
-int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset,
- struct msi_desc *entry)
+int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset, struct msi_desc *entry)
{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_lock(irq_base + irq_offset, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
-
- if (!desc)
- return -EINVAL;
- desc->irq_data.msi_desc = entry;
- if (entry && !irq_offset)
- entry->irq = irq_base;
- irq_put_desc_unlock(desc, flags);
- return 0;
+ scoped_irqdesc_get_and_lock(irq_base + irq_offset, IRQ_GET_DESC_CHECK_GLOBAL) {
+ scoped_irqdesc->irq_common_data.msi_desc = entry;
+ if (entry && !irq_offset)
+ entry->irq = irq_base;
+ return 0;
+ }
+ return -EINVAL;
}
/**
- * irq_set_msi_desc - set MSI descriptor data for an irq
- * @irq: Interrupt number
- * @entry: Pointer to MSI descriptor data
+ * irq_set_msi_desc - set MSI descriptor data for an irq
+ * @irq: Interrupt number
+ * @entry: Pointer to MSI descriptor data
*
- * Set the MSI descriptor entry for an irq
+ * Set the MSI descriptor entry for an irq
*/
int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
{
@@ -125,22 +115,19 @@ int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
}
/**
- * irq_set_chip_data - set irq chip data for an irq
- * @irq: Interrupt number
- * @data: Pointer to chip specific data
+ * irq_set_chip_data - set irq chip data for an irq
+ * @irq: Interrupt number
+ * @data: Pointer to chip specific data
*
- * Set the hardware irq chip data for an irq
+ * Set the hardware irq chip data for an irq
*/
int irq_set_chip_data(unsigned int irq, void *data)
{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
-
- if (!desc)
- return -EINVAL;
- desc->irq_data.chip_data = data;
- irq_put_desc_unlock(desc, flags);
- return 0;
+ scoped_irqdesc_get_and_lock(irq, 0) {
+ scoped_irqdesc->irq_data.chip_data = data;
+ return 0;
+ }
+ return -EINVAL;
}
EXPORT_SYMBOL(irq_set_chip_data);
@@ -157,62 +144,230 @@ static void irq_state_clr_disabled(struct irq_desc *desc)
irqd_clear(&desc->irq_data, IRQD_IRQ_DISABLED);
}
-static void irq_state_set_disabled(struct irq_desc *desc)
+static void irq_state_clr_masked(struct irq_desc *desc)
{
- irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED);
+ irqd_clear(&desc->irq_data, IRQD_IRQ_MASKED);
}
-static void irq_state_clr_masked(struct irq_desc *desc)
+static void irq_state_clr_started(struct irq_desc *desc)
{
- irqd_clear(&desc->irq_data, IRQD_IRQ_MASKED);
+ irqd_clear(&desc->irq_data, IRQD_IRQ_STARTED);
}
-static void irq_state_set_masked(struct irq_desc *desc)
+static void irq_state_set_started(struct irq_desc *desc)
{
- irqd_set(&desc->irq_data, IRQD_IRQ_MASKED);
+ irqd_set(&desc->irq_data, IRQD_IRQ_STARTED);
+}
+
+enum {
+ IRQ_STARTUP_NORMAL,
+ IRQ_STARTUP_MANAGED,
+ IRQ_STARTUP_ABORT,
+};
+
+#ifdef CONFIG_SMP
+static int
+__irq_startup_managed(struct irq_desc *desc, const struct cpumask *aff,
+ bool force)
+{
+ struct irq_data *d = irq_desc_get_irq_data(desc);
+
+ if (!irqd_affinity_is_managed(d))
+ return IRQ_STARTUP_NORMAL;
+
+ irqd_clr_managed_shutdown(d);
+
+ if (!cpumask_intersects(aff, cpu_online_mask)) {
+ /*
+ * Catch code which fiddles with enable_irq() on a managed
+ * and potentially shutdown IRQ. Chained interrupt
+ * installment or irq auto probing should not happen on
+ * managed irqs either.
+ */
+ if (WARN_ON_ONCE(force))
+ return IRQ_STARTUP_ABORT;
+ /*
+ * The interrupt was requested, but there is no online CPU
+ * in it's affinity mask. Put it into managed shutdown
+ * state and let the cpu hotplug mechanism start it up once
+ * a CPU in the mask becomes available.
+ */
+ return IRQ_STARTUP_ABORT;
+ }
+ /*
+ * Managed interrupts have reserved resources, so this should not
+ * happen.
+ */
+ if (WARN_ON(irq_domain_activate_irq(d, false)))
+ return IRQ_STARTUP_ABORT;
+ return IRQ_STARTUP_MANAGED;
+}
+
+void irq_startup_managed(struct irq_desc *desc)
+{
+ struct irq_data *d = irq_desc_get_irq_data(desc);
+
+ /*
+ * Clear managed-shutdown flag, so we don't repeat managed-startup for
+ * multiple hotplugs, and cause imbalanced disable depth.
+ */
+ irqd_clr_managed_shutdown(d);
+
+ /*
+ * Only start it up when the disable depth is 1, so that a disable,
+ * hotunplug, hotplug sequence does not end up enabling it during
+ * hotplug unconditionally.
+ */
+ desc->depth--;
+ if (!desc->depth)
+ irq_startup(desc, IRQ_RESEND, IRQ_START_COND);
+}
+
+#else
+static __always_inline int
+__irq_startup_managed(struct irq_desc *desc, const struct cpumask *aff,
+ bool force)
+{
+ return IRQ_STARTUP_NORMAL;
+}
+#endif
+
+static void irq_enable(struct irq_desc *desc)
+{
+ if (!irqd_irq_disabled(&desc->irq_data)) {
+ unmask_irq(desc);
+ } else {
+ irq_state_clr_disabled(desc);
+ if (desc->irq_data.chip->irq_enable) {
+ desc->irq_data.chip->irq_enable(&desc->irq_data);
+ irq_state_clr_masked(desc);
+ } else {
+ unmask_irq(desc);
+ }
+ }
}
-int irq_startup(struct irq_desc *desc, bool resend)
+static int __irq_startup(struct irq_desc *desc)
{
+ struct irq_data *d = irq_desc_get_irq_data(desc);
int ret = 0;
- irq_state_clr_disabled(desc);
- desc->depth = 0;
+ /* Warn if this interrupt is not activated but try nevertheless */
+ WARN_ON_ONCE(!irqd_is_activated(d));
- irq_domain_activate_irq(&desc->irq_data);
- if (desc->irq_data.chip->irq_startup) {
- ret = desc->irq_data.chip->irq_startup(&desc->irq_data);
+ if (d->chip->irq_startup) {
+ ret = d->chip->irq_startup(d);
+ irq_state_clr_disabled(desc);
irq_state_clr_masked(desc);
} else {
irq_enable(desc);
}
+ irq_state_set_started(desc);
+ return ret;
+}
+
+int irq_startup(struct irq_desc *desc, bool resend, bool force)
+{
+ struct irq_data *d = irq_desc_get_irq_data(desc);
+ const struct cpumask *aff = irq_data_get_affinity_mask(d);
+ int ret = 0;
+
+ desc->depth = 0;
+
+ if (irqd_is_started(d)) {
+ irq_enable(desc);
+ } else {
+ switch (__irq_startup_managed(desc, aff, force)) {
+ case IRQ_STARTUP_NORMAL:
+ if (d->chip->flags & IRQCHIP_AFFINITY_PRE_STARTUP)
+ irq_setup_affinity(desc);
+ ret = __irq_startup(desc);
+ if (!(d->chip->flags & IRQCHIP_AFFINITY_PRE_STARTUP))
+ irq_setup_affinity(desc);
+ break;
+ case IRQ_STARTUP_MANAGED:
+ irq_do_set_affinity(d, aff, false);
+ ret = __irq_startup(desc);
+ break;
+ case IRQ_STARTUP_ABORT:
+ desc->depth = 1;
+ irqd_set_managed_shutdown(d);
+ return 0;
+ }
+ }
if (resend)
- check_irq_resend(desc, desc->irq_data.irq);
+ check_irq_resend(desc, false);
+
return ret;
}
+int irq_activate(struct irq_desc *desc)
+{
+ struct irq_data *d = irq_desc_get_irq_data(desc);
+
+ if (!irqd_affinity_is_managed(d))
+ return irq_domain_activate_irq(d, false);
+ return 0;
+}
+
+int irq_activate_and_startup(struct irq_desc *desc, bool resend)
+{
+ if (WARN_ON(irq_activate(desc)))
+ return 0;
+ return irq_startup(desc, resend, IRQ_START_FORCE);
+}
+
+static void __irq_disable(struct irq_desc *desc, bool mask);
+
void irq_shutdown(struct irq_desc *desc)
{
- irq_state_set_disabled(desc);
- desc->depth = 1;
- if (desc->irq_data.chip->irq_shutdown)
- desc->irq_data.chip->irq_shutdown(&desc->irq_data);
- else if (desc->irq_data.chip->irq_disable)
- desc->irq_data.chip->irq_disable(&desc->irq_data);
- else
- desc->irq_data.chip->irq_mask(&desc->irq_data);
+ if (irqd_is_started(&desc->irq_data)) {
+ clear_irq_resend(desc);
+ /*
+ * Increment disable depth, so that a managed shutdown on
+ * CPU hotunplug preserves the actual disabled state when the
+ * CPU comes back online. See irq_startup_managed().
+ */
+ desc->depth++;
+
+ if (desc->irq_data.chip->irq_shutdown) {
+ desc->irq_data.chip->irq_shutdown(&desc->irq_data);
+ irq_state_set_disabled(desc);
+ irq_state_set_masked(desc);
+ } else {
+ __irq_disable(desc, true);
+ }
+ irq_state_clr_started(desc);
+ }
+}
+
+
+void irq_shutdown_and_deactivate(struct irq_desc *desc)
+{
+ irq_shutdown(desc);
+ /*
+ * This must be called even if the interrupt was never started up,
+ * because the activation can happen before the interrupt is
+ * available for request/startup. It has it's own state tracking so
+ * it's safe to call it unconditionally.
+ */
irq_domain_deactivate_irq(&desc->irq_data);
- irq_state_set_masked(desc);
}
-void irq_enable(struct irq_desc *desc)
+static void __irq_disable(struct irq_desc *desc, bool mask)
{
- irq_state_clr_disabled(desc);
- if (desc->irq_data.chip->irq_enable)
- desc->irq_data.chip->irq_enable(&desc->irq_data);
- else
- desc->irq_data.chip->irq_unmask(&desc->irq_data);
- irq_state_clr_masked(desc);
+ if (irqd_irq_disabled(&desc->irq_data)) {
+ if (mask)
+ mask_irq(desc);
+ } else {
+ irq_state_set_disabled(desc);
+ if (desc->irq_data.chip->irq_disable) {
+ desc->irq_data.chip->irq_disable(&desc->irq_data);
+ irq_state_set_masked(desc);
+ } else if (mask) {
+ mask_irq(desc);
+ }
+ }
}
/**
@@ -227,14 +382,17 @@ void irq_enable(struct irq_desc *desc)
* disabled. If an interrupt happens, then the interrupt flow
* handler masks the line at the hardware level and marks it
* pending.
+ *
+ * If the interrupt chip does not implement the irq_disable callback,
+ * a driver can disable the lazy approach for a particular irq line by
+ * calling 'irq_set_status_flags(irq, IRQ_DISABLE_UNLAZY)'. This can
+ * be used for devices which cannot disable the interrupt at the
+ * device level under certain circumstances and have to use
+ * disable_irq[_nosync] instead.
*/
void irq_disable(struct irq_desc *desc)
{
- irq_state_set_disabled(desc);
- if (desc->irq_data.chip->irq_disable) {
- desc->irq_data.chip->irq_disable(&desc->irq_data);
- irq_state_set_masked(desc);
- }
+ __irq_disable(desc, irq_settings_disable_unlazy(desc));
}
void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu)
@@ -257,18 +415,21 @@ void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu)
static inline void mask_ack_irq(struct irq_desc *desc)
{
- if (desc->irq_data.chip->irq_mask_ack)
+ if (desc->irq_data.chip->irq_mask_ack) {
desc->irq_data.chip->irq_mask_ack(&desc->irq_data);
- else {
- desc->irq_data.chip->irq_mask(&desc->irq_data);
+ irq_state_set_masked(desc);
+ } else {
+ mask_irq(desc);
if (desc->irq_data.chip->irq_ack)
desc->irq_data.chip->irq_ack(&desc->irq_data);
}
- irq_state_set_masked(desc);
}
void mask_irq(struct irq_desc *desc)
{
+ if (irqd_irq_masked(&desc->irq_data))
+ return;
+
if (desc->irq_data.chip->irq_mask) {
desc->irq_data.chip->irq_mask(&desc->irq_data);
irq_state_set_masked(desc);
@@ -277,6 +438,9 @@ void mask_irq(struct irq_desc *desc)
void unmask_irq(struct irq_desc *desc)
{
+ if (!irqd_irq_masked(&desc->irq_data))
+ return;
+
if (desc->irq_data.chip->irq_unmask) {
desc->irq_data.chip->irq_unmask(&desc->irq_data);
irq_state_clr_masked(desc);
@@ -290,70 +454,36 @@ void unmask_threaded_irq(struct irq_desc *desc)
if (chip->flags & IRQCHIP_EOI_THREADED)
chip->irq_eoi(&desc->irq_data);
- if (chip->irq_unmask) {
- chip->irq_unmask(&desc->irq_data);
- irq_state_clr_masked(desc);
- }
+ unmask_irq(desc);
}
-/*
- * handle_nested_irq - Handle a nested irq from a irq thread
- * @irq: the interrupt number
- *
- * Handle interrupts which are nested into a threaded interrupt
- * handler. The handler function is called inside the calling
- * threads context.
- */
-void handle_nested_irq(unsigned int irq)
+/* Busy wait until INPROGRESS is cleared */
+static bool irq_wait_on_inprogress(struct irq_desc *desc)
{
- struct irq_desc *desc = irq_to_desc(irq);
- struct irqaction *action;
- irqreturn_t action_ret;
-
- might_sleep();
-
- raw_spin_lock_irq(&desc->lock);
-
- desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
- kstat_incr_irqs_this_cpu(irq, desc);
-
- action = desc->action;
- if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) {
- desc->istate |= IRQS_PENDING;
- goto out_unlock;
+ if (IS_ENABLED(CONFIG_SMP)) {
+ do {
+ raw_spin_unlock(&desc->lock);
+ while (irqd_irq_inprogress(&desc->irq_data))
+ cpu_relax();
+ raw_spin_lock(&desc->lock);
+ } while (irqd_irq_inprogress(&desc->irq_data));
+
+ /* Might have been disabled in meantime */
+ return !irqd_irq_disabled(&desc->irq_data) && desc->action;
}
-
- irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
- raw_spin_unlock_irq(&desc->lock);
-
- action_ret = action->thread_fn(action->irq, action->dev_id);
- if (!noirqdebug)
- note_interrupt(irq, desc, action_ret);
-
- raw_spin_lock_irq(&desc->lock);
- irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
-
-out_unlock:
- raw_spin_unlock_irq(&desc->lock);
-}
-EXPORT_SYMBOL_GPL(handle_nested_irq);
-
-static bool irq_check_poll(struct irq_desc *desc)
-{
- if (!(desc->istate & IRQS_POLL_INPROGRESS))
- return false;
- return irq_wait_for_poll(desc);
+ return false;
}
-static bool irq_may_run(struct irq_desc *desc)
+static bool irq_can_handle_pm(struct irq_desc *desc)
{
- unsigned int mask = IRQD_IRQ_INPROGRESS | IRQD_WAKEUP_ARMED;
+ struct irq_data *irqd = &desc->irq_data;
+ const struct cpumask *aff;
/*
* If the interrupt is not in progress and is not an armed
* wakeup interrupt, proceed.
*/
- if (!irqd_has_set(&desc->irq_data, mask))
+ if (!irqd_has_set(irqd, IRQD_IRQ_INPROGRESS | IRQD_WAKEUP_ARMED))
return true;
/*
@@ -361,50 +491,170 @@ static bool irq_may_run(struct irq_desc *desc)
* and suspended, disable it and notify the pm core about the
* event.
*/
- if (irq_pm_check_wakeup(desc))
+ if (unlikely(irqd_has_set(irqd, IRQD_WAKEUP_ARMED))) {
+ irq_pm_handle_wakeup(desc);
+ return false;
+ }
+
+ /* Check whether the interrupt is polled on another CPU */
+ if (unlikely(desc->istate & IRQS_POLL_INPROGRESS)) {
+ if (WARN_ONCE(irq_poll_cpu == smp_processor_id(),
+ "irq poll in progress on cpu %d for irq %d\n",
+ smp_processor_id(), desc->irq_data.irq))
+ return false;
+ return irq_wait_on_inprogress(desc);
+ }
+
+ /* The below works only for single target interrupts */
+ if (!IS_ENABLED(CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK) ||
+ !irqd_is_single_target(irqd) || desc->handle_irq != handle_edge_irq)
return false;
/*
- * Handle a potential concurrent poll on a different core.
+ * If the interrupt affinity was moved to this CPU and the
+ * interrupt is currently handled on the previous target CPU, then
+ * busy wait for INPROGRESS to be cleared. Otherwise for edge type
+ * interrupts the handler might get stuck on the previous target:
+ *
+ * CPU 0 CPU 1 (new target)
+ * handle_edge_irq()
+ * repeat:
+ * handle_event() handle_edge_irq()
+ * if (INPROGESS) {
+ * set(PENDING);
+ * mask();
+ * return;
+ * }
+ * if (PENDING) {
+ * clear(PENDING);
+ * unmask();
+ * goto repeat;
+ * }
+ *
+ * This happens when the device raises interrupts with a high rate
+ * and always before handle_event() completes and the CPU0 handler
+ * can clear INPROGRESS. This has been observed in virtual machines.
*/
- return irq_check_poll(desc);
+ aff = irq_data_get_effective_affinity_mask(irqd);
+ if (cpumask_first(aff) != smp_processor_id())
+ return false;
+ return irq_wait_on_inprogress(desc);
+}
+
+static inline bool irq_can_handle_actions(struct irq_desc *desc)
+{
+ desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
+
+ if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
+ desc->istate |= IRQS_PENDING;
+ return false;
+ }
+ return true;
+}
+
+static inline bool irq_can_handle(struct irq_desc *desc)
+{
+ if (!irq_can_handle_pm(desc))
+ return false;
+
+ return irq_can_handle_actions(desc);
}
/**
- * handle_simple_irq - Simple and software-decoded IRQs.
- * @irq: the interrupt number
- * @desc: the interrupt description structure for this irq
- *
- * Simple interrupts are either sent from a demultiplexing interrupt
- * handler or come from hardware, where no interrupt hardware control
- * is necessary.
+ * handle_nested_irq - Handle a nested irq from a irq thread
+ * @irq: the interrupt number
*
- * Note: The caller is expected to handle the ack, clear, mask and
- * unmask issues if necessary.
+ * Handle interrupts which are nested into a threaded interrupt
+ * handler. The handler function is called inside the calling threads
+ * context.
*/
-void
-handle_simple_irq(unsigned int irq, struct irq_desc *desc)
+void handle_nested_irq(unsigned int irq)
{
- raw_spin_lock(&desc->lock);
+ struct irq_desc *desc = irq_to_desc(irq);
+ struct irqaction *action;
+ irqreturn_t action_ret;
- if (!irq_may_run(desc))
- goto out_unlock;
+ might_sleep();
- desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
- kstat_incr_irqs_this_cpu(irq, desc);
+ scoped_guard(raw_spinlock_irq, &desc->lock) {
+ if (!irq_can_handle_actions(desc))
+ return;
- if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
- desc->istate |= IRQS_PENDING;
- goto out_unlock;
+ action = desc->action;
+ kstat_incr_irqs_this_cpu(desc);
+ atomic_inc(&desc->threads_active);
}
- handle_irq_event(desc);
+ action_ret = IRQ_NONE;
+ for_each_action_of_desc(desc, action)
+ action_ret |= action->thread_fn(action->irq, action->dev_id);
+
+ if (!irq_settings_no_debug(desc))
+ note_interrupt(desc, action_ret);
-out_unlock:
- raw_spin_unlock(&desc->lock);
+ wake_threads_waitq(desc);
+}
+EXPORT_SYMBOL_GPL(handle_nested_irq);
+
+/**
+ * handle_simple_irq - Simple and software-decoded IRQs.
+ * @desc: the interrupt description structure for this irq
+ *
+ * Simple interrupts are either sent from a demultiplexing interrupt
+ * handler or come from hardware, where no interrupt hardware control is
+ * necessary.
+ *
+ * Note: The caller is expected to handle the ack, clear, mask and unmask
+ * issues if necessary.
+ */
+void handle_simple_irq(struct irq_desc *desc)
+{
+ guard(raw_spinlock)(&desc->lock);
+
+ if (!irq_can_handle_pm(desc)) {
+ if (irqd_needs_resend_when_in_progress(&desc->irq_data))
+ desc->istate |= IRQS_PENDING;
+ return;
+ }
+
+ if (!irq_can_handle_actions(desc))
+ return;
+
+ kstat_incr_irqs_this_cpu(desc);
+ handle_irq_event(desc);
}
EXPORT_SYMBOL_GPL(handle_simple_irq);
+/**
+ * handle_untracked_irq - Simple and software-decoded IRQs.
+ * @desc: the interrupt description structure for this irq
+ *
+ * Untracked interrupts are sent from a demultiplexing interrupt handler
+ * when the demultiplexer does not know which device it its multiplexed irq
+ * domain generated the interrupt. IRQ's handled through here are not
+ * subjected to stats tracking, randomness, or spurious interrupt
+ * detection.
+ *
+ * Note: Like handle_simple_irq, the caller is expected to handle the ack,
+ * clear, mask and unmask issues if necessary.
+ */
+void handle_untracked_irq(struct irq_desc *desc)
+{
+ scoped_guard(raw_spinlock, &desc->lock) {
+ if (!irq_can_handle(desc))
+ return;
+
+ desc->istate &= ~IRQS_PENDING;
+ irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
+ }
+
+ __handle_irq_event_percpu(desc);
+
+ scoped_guard(raw_spinlock, &desc->lock)
+ irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
+}
+EXPORT_SYMBOL_GPL(handle_untracked_irq);
+
/*
* Called unconditionally from handle_level_irq() and only for oneshot
* interrupts from handle_fasteoi_irq()
@@ -424,55 +674,29 @@ static void cond_unmask_irq(struct irq_desc *desc)
}
/**
- * handle_level_irq - Level type irq handler
- * @irq: the interrupt number
- * @desc: the interrupt description structure for this irq
+ * handle_level_irq - Level type irq handler
+ * @desc: the interrupt description structure for this irq
*
- * Level type interrupts are active as long as the hardware line has
- * the active level. This may require to mask the interrupt and unmask
- * it after the associated handler has acknowledged the device, so the
- * interrupt line is back to inactive.
+ * Level type interrupts are active as long as the hardware line has the
+ * active level. This may require to mask the interrupt and unmask it after
+ * the associated handler has acknowledged the device, so the interrupt
+ * line is back to inactive.
*/
-void
-handle_level_irq(unsigned int irq, struct irq_desc *desc)
+void handle_level_irq(struct irq_desc *desc)
{
- raw_spin_lock(&desc->lock);
+ guard(raw_spinlock)(&desc->lock);
mask_ack_irq(desc);
- if (!irq_may_run(desc))
- goto out_unlock;
-
- desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
- kstat_incr_irqs_this_cpu(irq, desc);
-
- /*
- * If its disabled or no action available
- * keep it masked and get out of here
- */
- if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
- desc->istate |= IRQS_PENDING;
- goto out_unlock;
- }
+ if (!irq_can_handle(desc))
+ return;
+ kstat_incr_irqs_this_cpu(desc);
handle_irq_event(desc);
cond_unmask_irq(desc);
-
-out_unlock:
- raw_spin_unlock(&desc->lock);
}
EXPORT_SYMBOL_GPL(handle_level_irq);
-#ifdef CONFIG_IRQ_PREFLOW_FASTEOI
-static inline void preflow_handler(struct irq_desc *desc)
-{
- if (desc->preflow_handler)
- desc->preflow_handler(&desc->irq_data);
-}
-#else
-static inline void preflow_handler(struct irq_desc *desc) { }
-#endif
-
static void cond_unmask_eoi_irq(struct irq_desc *desc, struct irq_chip *chip)
{
if (!(desc->istate & IRQS_ONESHOT)) {
@@ -494,96 +718,119 @@ static void cond_unmask_eoi_irq(struct irq_desc *desc, struct irq_chip *chip)
}
}
+static inline void cond_eoi_irq(struct irq_chip *chip, struct irq_data *data)
+{
+ if (!(chip->flags & IRQCHIP_EOI_IF_HANDLED))
+ chip->irq_eoi(data);
+}
+
/**
- * handle_fasteoi_irq - irq handler for transparent controllers
- * @irq: the interrupt number
- * @desc: the interrupt description structure for this irq
+ * handle_fasteoi_irq - irq handler for transparent controllers
+ * @desc: the interrupt description structure for this irq
*
- * Only a single callback will be issued to the chip: an ->eoi()
- * call when the interrupt has been serviced. This enables support
- * for modern forms of interrupt handlers, which handle the flow
- * details in hardware, transparently.
+ * Only a single callback will be issued to the chip: an ->eoi() call when
+ * the interrupt has been serviced. This enables support for modern forms
+ * of interrupt handlers, which handle the flow details in hardware,
+ * transparently.
*/
-void
-handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
+void handle_fasteoi_irq(struct irq_desc *desc)
{
struct irq_chip *chip = desc->irq_data.chip;
- raw_spin_lock(&desc->lock);
-
- if (!irq_may_run(desc))
- goto out;
-
- desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
- kstat_incr_irqs_this_cpu(irq, desc);
+ guard(raw_spinlock)(&desc->lock);
/*
- * If its disabled or no action available
- * then mask it and get out of here:
+ * When an affinity change races with IRQ handling, the next interrupt
+ * can arrive on the new CPU before the original CPU has completed
+ * handling the previous one - it may need to be resent.
*/
- if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
- desc->istate |= IRQS_PENDING;
+ if (!irq_can_handle_pm(desc)) {
+ if (irqd_needs_resend_when_in_progress(&desc->irq_data))
+ desc->istate |= IRQS_PENDING;
+ cond_eoi_irq(chip, &desc->irq_data);
+ return;
+ }
+
+ if (!irq_can_handle_actions(desc)) {
mask_irq(desc);
- goto out;
+ cond_eoi_irq(chip, &desc->irq_data);
+ return;
}
+ kstat_incr_irqs_this_cpu(desc);
if (desc->istate & IRQS_ONESHOT)
mask_irq(desc);
- preflow_handler(desc);
handle_irq_event(desc);
cond_unmask_eoi_irq(desc, chip);
- raw_spin_unlock(&desc->lock);
- return;
-out:
- if (!(chip->flags & IRQCHIP_EOI_IF_HANDLED))
- chip->irq_eoi(&desc->irq_data);
- raw_spin_unlock(&desc->lock);
+ /*
+ * When the race described above happens this will resend the interrupt.
+ */
+ if (unlikely(desc->istate & IRQS_PENDING))
+ check_irq_resend(desc, false);
}
EXPORT_SYMBOL_GPL(handle_fasteoi_irq);
/**
- * handle_edge_irq - edge type IRQ handler
- * @irq: the interrupt number
+ * handle_fasteoi_nmi - irq handler for NMI interrupt lines
* @desc: the interrupt description structure for this irq
*
- * Interrupt occures on the falling and/or rising edge of a hardware
- * signal. The occurrence is latched into the irq controller hardware
- * and must be acked in order to be reenabled. After the ack another
- * interrupt can happen on the same source even before the first one
- * is handled by the associated event handler. If this happens it
- * might be necessary to disable (mask) the interrupt depending on the
- * controller hardware. This requires to reenable the interrupt inside
- * of the loop which handles the interrupts which have arrived while
- * the handler was running. If all pending interrupts are handled, the
- * loop is left.
+ * A simple NMI-safe handler, considering the restrictions
+ * from request_nmi.
+ *
+ * Only a single callback will be issued to the chip: an ->eoi()
+ * call when the interrupt has been serviced. This enables support
+ * for modern forms of interrupt handlers, which handle the flow
+ * details in hardware, transparently.
*/
-void
-handle_edge_irq(unsigned int irq, struct irq_desc *desc)
+void handle_fasteoi_nmi(struct irq_desc *desc)
{
- raw_spin_lock(&desc->lock);
-
- desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
+ struct irq_chip *chip = irq_desc_get_chip(desc);
+ struct irqaction *action = desc->action;
+ unsigned int irq = irq_desc_get_irq(desc);
+ irqreturn_t res;
- if (!irq_may_run(desc)) {
- desc->istate |= IRQS_PENDING;
- mask_ack_irq(desc);
- goto out_unlock;
- }
+ __kstat_incr_irqs_this_cpu(desc);
+ trace_irq_handler_entry(irq, action);
/*
- * If its disabled or no action available then mask it and get
- * out of here.
+ * NMIs cannot be shared, there is only one action.
*/
- if (irqd_irq_disabled(&desc->irq_data) || !desc->action) {
+ res = action->handler(irq, action->dev_id);
+ trace_irq_handler_exit(irq, action, res);
+
+ if (chip->irq_eoi)
+ chip->irq_eoi(&desc->irq_data);
+}
+EXPORT_SYMBOL_GPL(handle_fasteoi_nmi);
+
+/**
+ * handle_edge_irq - edge type IRQ handler
+ * @desc: the interrupt description structure for this irq
+ *
+ * Interrupt occurs on the falling and/or rising edge of a hardware
+ * signal. The occurrence is latched into the irq controller hardware and
+ * must be acked in order to be reenabled. After the ack another interrupt
+ * can happen on the same source even before the first one is handled by
+ * the associated event handler. If this happens it might be necessary to
+ * disable (mask) the interrupt depending on the controller hardware. This
+ * requires to reenable the interrupt inside of the loop which handles the
+ * interrupts which have arrived while the handler was running. If all
+ * pending interrupts are handled, the loop is left.
+ */
+void handle_edge_irq(struct irq_desc *desc)
+{
+ guard(raw_spinlock)(&desc->lock);
+
+ if (!irq_can_handle(desc)) {
desc->istate |= IRQS_PENDING;
mask_ack_irq(desc);
- goto out_unlock;
+ return;
}
- kstat_incr_irqs_this_cpu(irq, desc);
+ kstat_incr_irqs_this_cpu(desc);
/* Start handling the irq */
desc->irq_data.chip->irq_ack(&desc->irq_data);
@@ -591,13 +838,13 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
do {
if (unlikely(!desc->action)) {
mask_irq(desc);
- goto out_unlock;
+ return;
}
/*
* When another irq arrived while we were handling
* one, we could have masked the irq.
- * Renable it, if it was not disabled in meantime.
+ * Reenable it, if it was not disabled in meantime.
*/
if (unlikely(desc->istate & IRQS_PENDING)) {
if (!irqd_irq_disabled(&desc->irq_data) &&
@@ -607,80 +854,30 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
handle_irq_event(desc);
- } while ((desc->istate & IRQS_PENDING) &&
- !irqd_irq_disabled(&desc->irq_data));
-
-out_unlock:
- raw_spin_unlock(&desc->lock);
+ } while ((desc->istate & IRQS_PENDING) && !irqd_irq_disabled(&desc->irq_data));
}
EXPORT_SYMBOL(handle_edge_irq);
-#ifdef CONFIG_IRQ_EDGE_EOI_HANDLER
-/**
- * handle_edge_eoi_irq - edge eoi type IRQ handler
- * @irq: the interrupt number
- * @desc: the interrupt description structure for this irq
- *
- * Similar as the above handle_edge_irq, but using eoi and w/o the
- * mask/unmask logic.
- */
-void handle_edge_eoi_irq(unsigned int irq, struct irq_desc *desc)
-{
- struct irq_chip *chip = irq_desc_get_chip(desc);
-
- raw_spin_lock(&desc->lock);
-
- desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
-
- if (!irq_may_run(desc)) {
- desc->istate |= IRQS_PENDING;
- goto out_eoi;
- }
-
- /*
- * If its disabled or no action available then mask it and get
- * out of here.
- */
- if (irqd_irq_disabled(&desc->irq_data) || !desc->action) {
- desc->istate |= IRQS_PENDING;
- goto out_eoi;
- }
-
- kstat_incr_irqs_this_cpu(irq, desc);
-
- do {
- if (unlikely(!desc->action))
- goto out_eoi;
-
- handle_irq_event(desc);
-
- } while ((desc->istate & IRQS_PENDING) &&
- !irqd_irq_disabled(&desc->irq_data));
-
-out_eoi:
- chip->irq_eoi(&desc->irq_data);
- raw_spin_unlock(&desc->lock);
-}
-#endif
-
/**
* handle_percpu_irq - Per CPU local irq handler
- * @irq: the interrupt number
* @desc: the interrupt description structure for this irq
*
* Per CPU interrupts on SMP machines without locking requirements
*/
-void
-handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
+void handle_percpu_irq(struct irq_desc *desc)
{
struct irq_chip *chip = irq_desc_get_chip(desc);
- kstat_incr_irqs_this_cpu(irq, desc);
+ /*
+ * PER CPU interrupts are not serialized. Do not touch
+ * desc->tot_count.
+ */
+ __kstat_incr_irqs_this_cpu(desc);
if (chip->irq_ack)
chip->irq_ack(&desc->irq_data);
- handle_irq_event_percpu(desc, desc->action);
+ handle_irq_event_percpu(desc);
if (chip->irq_eoi)
chip->irq_eoi(&desc->irq_data);
@@ -688,7 +885,6 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
/**
* handle_percpu_devid_irq - Per CPU local irq handler with per cpu dev ids
- * @irq: the interrupt number
* @desc: the interrupt description structure for this irq
*
* Per CPU interrupts on SMP machines without locking requirements. Same as
@@ -698,36 +894,49 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
* contain the real device id for the cpu on which this handler is
* called
*/
-void handle_percpu_devid_irq(unsigned int irq, struct irq_desc *desc)
+void handle_percpu_devid_irq(struct irq_desc *desc)
{
struct irq_chip *chip = irq_desc_get_chip(desc);
- struct irqaction *action = desc->action;
- void *dev_id = raw_cpu_ptr(action->percpu_dev_id);
+ unsigned int irq = irq_desc_get_irq(desc);
+ unsigned int cpu = smp_processor_id();
+ struct irqaction *action;
irqreturn_t res;
- kstat_incr_irqs_this_cpu(irq, desc);
+ /*
+ * PER CPU interrupts are not serialized. Do not touch
+ * desc->tot_count.
+ */
+ __kstat_incr_irqs_this_cpu(desc);
if (chip->irq_ack)
chip->irq_ack(&desc->irq_data);
- trace_irq_handler_entry(irq, action);
- res = action->handler(irq, dev_id);
- trace_irq_handler_exit(irq, action, res);
+ for (action = desc->action; action; action = action->next)
+ if (cpumask_test_cpu(cpu, action->affinity))
+ break;
+
+ if (likely(action)) {
+ trace_irq_handler_entry(irq, action);
+ res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id));
+ trace_irq_handler_exit(irq, action, res);
+ } else {
+ bool enabled = cpumask_test_cpu(cpu, desc->percpu_enabled);
+
+ if (enabled)
+ irq_percpu_disable(desc, cpu);
+
+ pr_err_once("Spurious%s percpu IRQ%u on CPU%u\n",
+ enabled ? " and unmasked" : "", irq, cpu);
+ }
if (chip->irq_eoi)
chip->irq_eoi(&desc->irq_data);
}
-void
-__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
- const char *name)
+static void
+__irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle,
+ int is_chained, const char *name)
{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, 0);
-
- if (!desc)
- return;
-
if (!handle) {
handle = handle_bad_irq;
} else {
@@ -745,17 +954,17 @@ __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
break;
/*
* Bail out if the outer chip is not set up
- * and the interrrupt supposed to be started
+ * and the interrupt supposed to be started
* right away.
*/
if (WARN_ON(is_chained))
- goto out;
+ return;
/* Try the parent */
irq_data = irq_data->parent_data;
}
#endif
if (WARN_ON(!irq_data || irq_data->chip == &no_irq_chip))
- goto out;
+ return;
}
/* Uninstall? */
@@ -763,24 +972,62 @@ __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
if (desc->irq_data.chip != &no_irq_chip)
mask_ack_irq(desc);
irq_state_set_disabled(desc);
+ if (is_chained) {
+ desc->action = NULL;
+ WARN_ON(irq_chip_pm_put(irq_desc_get_irq_data(desc)));
+ }
desc->depth = 1;
}
desc->handle_irq = handle;
desc->name = name;
if (handle != handle_bad_irq && is_chained) {
+ unsigned int type = irqd_get_trigger_type(&desc->irq_data);
+
+ /*
+ * We're about to start this interrupt immediately,
+ * hence the need to set the trigger configuration.
+ * But the .set_type callback may have overridden the
+ * flow handler, ignoring that we're dealing with a
+ * chained interrupt. Reset it immediately because we
+ * do know better.
+ */
+ if (type != IRQ_TYPE_NONE) {
+ __irq_set_trigger(desc, type);
+ desc->handle_irq = handle;
+ }
+
irq_settings_set_noprobe(desc);
irq_settings_set_norequest(desc);
irq_settings_set_nothread(desc);
- irq_startup(desc, true);
+ desc->action = &chained_action;
+ WARN_ON(irq_chip_pm_get(irq_desc_get_irq_data(desc)));
+ irq_activate_and_startup(desc, IRQ_RESEND);
}
-out:
- irq_put_desc_busunlock(desc, flags);
+}
+
+void __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
+ const char *name)
+{
+ scoped_irqdesc_get_and_buslock(irq, 0)
+ __irq_do_set_handler(scoped_irqdesc, handle, is_chained, name);
}
EXPORT_SYMBOL_GPL(__irq_set_handler);
+void irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle,
+ void *data)
+{
+ scoped_irqdesc_get_and_buslock(irq, 0) {
+ struct irq_desc *desc = scoped_irqdesc;
+
+ desc->irq_common_data.handler_data = data;
+ __irq_do_set_handler(desc, handle, 1, NULL);
+ }
+}
+EXPORT_SYMBOL_GPL(irq_set_chained_handler_and_data);
+
void
-irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
+irq_set_chip_and_handler_name(unsigned int irq, const struct irq_chip *chip,
irq_flow_handler_t handle, const char *name)
{
irq_set_chip(irq, chip);
@@ -790,30 +1037,38 @@ EXPORT_SYMBOL_GPL(irq_set_chip_and_handler_name);
void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
+ scoped_irqdesc_get_and_lock(irq, 0) {
+ struct irq_desc *desc = scoped_irqdesc;
+ unsigned long trigger, tmp;
+ /*
+ * Warn when a driver sets the no autoenable flag on an already
+ * active interrupt.
+ */
+ WARN_ON_ONCE(!desc->depth && (set & _IRQ_NOAUTOEN));
- if (!desc)
- return;
- irq_settings_clr_and_set(desc, clr, set);
+ irq_settings_clr_and_set(desc, clr, set);
- irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU |
- IRQD_TRIGGER_MASK | IRQD_LEVEL | IRQD_MOVE_PCNTXT);
- if (irq_settings_has_no_balance_set(desc))
- irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
- if (irq_settings_is_per_cpu(desc))
- irqd_set(&desc->irq_data, IRQD_PER_CPU);
- if (irq_settings_can_move_pcntxt(desc))
- irqd_set(&desc->irq_data, IRQD_MOVE_PCNTXT);
- if (irq_settings_is_level(desc))
- irqd_set(&desc->irq_data, IRQD_LEVEL);
+ trigger = irqd_get_trigger_type(&desc->irq_data);
- irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc));
+ irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU |
+ IRQD_TRIGGER_MASK | IRQD_LEVEL);
+ if (irq_settings_has_no_balance_set(desc))
+ irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
+ if (irq_settings_is_per_cpu(desc))
+ irqd_set(&desc->irq_data, IRQD_PER_CPU);
+ if (irq_settings_is_level(desc))
+ irqd_set(&desc->irq_data, IRQD_LEVEL);
- irq_put_desc_unlock(desc, flags);
+ tmp = irq_settings_get_trigger_mask(desc);
+ if (tmp != IRQ_TYPE_NONE)
+ trigger = tmp;
+
+ irqd_set(&desc->irq_data, trigger);
+ }
}
EXPORT_SYMBOL_GPL(irq_modify_status);
+#ifdef CONFIG_DEPRECATED_IRQ_CPU_ONOFFLINE
/**
* irq_cpu_online - Invoke all irq_cpu_online functions.
*
@@ -822,25 +1077,21 @@ EXPORT_SYMBOL_GPL(irq_modify_status);
*/
void irq_cpu_online(void)
{
- struct irq_desc *desc;
- struct irq_chip *chip;
- unsigned long flags;
unsigned int irq;
for_each_active_irq(irq) {
- desc = irq_to_desc(irq);
+ struct irq_desc *desc = irq_to_desc(irq);
+ struct irq_chip *chip;
+
if (!desc)
continue;
- raw_spin_lock_irqsave(&desc->lock, flags);
-
+ guard(raw_spinlock_irqsave)(&desc->lock);
chip = irq_data_get_irq_chip(&desc->irq_data);
if (chip && chip->irq_cpu_online &&
(!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) ||
!irqd_irq_disabled(&desc->irq_data)))
chip->irq_cpu_online(&desc->irq_data);
-
- raw_spin_unlock_irqrestore(&desc->lock, flags);
}
}
@@ -852,29 +1103,208 @@ void irq_cpu_online(void)
*/
void irq_cpu_offline(void)
{
- struct irq_desc *desc;
- struct irq_chip *chip;
- unsigned long flags;
unsigned int irq;
for_each_active_irq(irq) {
- desc = irq_to_desc(irq);
+ struct irq_desc *desc = irq_to_desc(irq);
+ struct irq_chip *chip;
+
if (!desc)
continue;
- raw_spin_lock_irqsave(&desc->lock, flags);
-
+ guard(raw_spinlock_irqsave)(&desc->lock);
chip = irq_data_get_irq_chip(&desc->irq_data);
if (chip && chip->irq_cpu_offline &&
(!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) ||
!irqd_irq_disabled(&desc->irq_data)))
chip->irq_cpu_offline(&desc->irq_data);
-
- raw_spin_unlock_irqrestore(&desc->lock, flags);
}
}
+#endif
#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+
+#ifdef CONFIG_IRQ_FASTEOI_HIERARCHY_HANDLERS
+/**
+ * handle_fasteoi_ack_irq - irq handler for edge hierarchy stacked on
+ * transparent controllers
+ *
+ * @desc: the interrupt description structure for this irq
+ *
+ * Like handle_fasteoi_irq(), but for use with hierarchy where the irq_chip
+ * also needs to have its ->irq_ack() function called.
+ */
+void handle_fasteoi_ack_irq(struct irq_desc *desc)
+{
+ struct irq_chip *chip = desc->irq_data.chip;
+
+ guard(raw_spinlock)(&desc->lock);
+
+ if (!irq_can_handle_pm(desc)) {
+ cond_eoi_irq(chip, &desc->irq_data);
+ return;
+ }
+
+ if (unlikely(!irq_can_handle_actions(desc))) {
+ mask_irq(desc);
+ cond_eoi_irq(chip, &desc->irq_data);
+ return;
+ }
+
+ kstat_incr_irqs_this_cpu(desc);
+ if (desc->istate & IRQS_ONESHOT)
+ mask_irq(desc);
+
+ desc->irq_data.chip->irq_ack(&desc->irq_data);
+
+ handle_irq_event(desc);
+
+ cond_unmask_eoi_irq(desc, chip);
+}
+EXPORT_SYMBOL_GPL(handle_fasteoi_ack_irq);
+
+/**
+ * handle_fasteoi_mask_irq - irq handler for level hierarchy stacked on
+ * transparent controllers
+ *
+ * @desc: the interrupt description structure for this irq
+ *
+ * Like handle_fasteoi_irq(), but for use with hierarchy where the irq_chip
+ * also needs to have its ->irq_mask_ack() function called.
+ */
+void handle_fasteoi_mask_irq(struct irq_desc *desc)
+{
+ struct irq_chip *chip = desc->irq_data.chip;
+
+ guard(raw_spinlock)(&desc->lock);
+ mask_ack_irq(desc);
+
+ if (!irq_can_handle(desc)) {
+ cond_eoi_irq(chip, &desc->irq_data);
+ return;
+ }
+
+ kstat_incr_irqs_this_cpu(desc);
+
+ handle_irq_event(desc);
+
+ cond_unmask_eoi_irq(desc, chip);
+}
+EXPORT_SYMBOL_GPL(handle_fasteoi_mask_irq);
+
+#endif /* CONFIG_IRQ_FASTEOI_HIERARCHY_HANDLERS */
+
+/**
+ * irq_chip_set_parent_state - set the state of a parent interrupt.
+ *
+ * @data: Pointer to interrupt specific data
+ * @which: State to be restored (one of IRQCHIP_STATE_*)
+ * @val: Value corresponding to @which
+ *
+ * Conditional success, if the underlying irqchip does not implement it.
+ */
+int irq_chip_set_parent_state(struct irq_data *data,
+ enum irqchip_irq_state which,
+ bool val)
+{
+ data = data->parent_data;
+
+ if (!data || !data->chip->irq_set_irqchip_state)
+ return 0;
+
+ return data->chip->irq_set_irqchip_state(data, which, val);
+}
+EXPORT_SYMBOL_GPL(irq_chip_set_parent_state);
+
+/**
+ * irq_chip_get_parent_state - get the state of a parent interrupt.
+ *
+ * @data: Pointer to interrupt specific data
+ * @which: one of IRQCHIP_STATE_* the caller wants to know
+ * @state: a pointer to a boolean where the state is to be stored
+ *
+ * Conditional success, if the underlying irqchip does not implement it.
+ */
+int irq_chip_get_parent_state(struct irq_data *data,
+ enum irqchip_irq_state which,
+ bool *state)
+{
+ data = data->parent_data;
+
+ if (!data || !data->chip->irq_get_irqchip_state)
+ return 0;
+
+ return data->chip->irq_get_irqchip_state(data, which, state);
+}
+EXPORT_SYMBOL_GPL(irq_chip_get_parent_state);
+
+/**
+ * irq_chip_shutdown_parent - Shutdown the parent interrupt
+ * @data: Pointer to interrupt specific data
+ *
+ * Invokes the irq_shutdown() callback of the parent if available or falls
+ * back to irq_chip_disable_parent().
+ */
+void irq_chip_shutdown_parent(struct irq_data *data)
+{
+ struct irq_data *parent = data->parent_data;
+
+ if (parent->chip->irq_shutdown)
+ parent->chip->irq_shutdown(parent);
+ else
+ irq_chip_disable_parent(data);
+}
+EXPORT_SYMBOL_GPL(irq_chip_shutdown_parent);
+
+/**
+ * irq_chip_startup_parent - Startup the parent interrupt
+ * @data: Pointer to interrupt specific data
+ *
+ * Invokes the irq_startup() callback of the parent if available or falls
+ * back to irq_chip_enable_parent().
+ */
+unsigned int irq_chip_startup_parent(struct irq_data *data)
+{
+ struct irq_data *parent = data->parent_data;
+
+ if (parent->chip->irq_startup)
+ return parent->chip->irq_startup(parent);
+
+ irq_chip_enable_parent(data);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(irq_chip_startup_parent);
+
+/**
+ * irq_chip_enable_parent - Enable the parent interrupt (defaults to unmask if
+ * NULL)
+ * @data: Pointer to interrupt specific data
+ */
+void irq_chip_enable_parent(struct irq_data *data)
+{
+ data = data->parent_data;
+ if (data->chip->irq_enable)
+ data->chip->irq_enable(data);
+ else
+ data->chip->irq_unmask(data);
+}
+EXPORT_SYMBOL_GPL(irq_chip_enable_parent);
+
+/**
+ * irq_chip_disable_parent - Disable the parent interrupt (defaults to mask if
+ * NULL)
+ * @data: Pointer to interrupt specific data
+ */
+void irq_chip_disable_parent(struct irq_data *data)
+{
+ data = data->parent_data;
+ if (data->chip->irq_disable)
+ data->chip->irq_disable(data);
+ else
+ data->chip->irq_mask(data);
+}
+EXPORT_SYMBOL_GPL(irq_chip_disable_parent);
+
/**
* irq_chip_ack_parent - Acknowledge the parent interrupt
* @data: Pointer to interrupt specific data
@@ -884,6 +1314,7 @@ void irq_chip_ack_parent(struct irq_data *data)
data = data->parent_data;
data->chip->irq_ack(data);
}
+EXPORT_SYMBOL_GPL(irq_chip_ack_parent);
/**
* irq_chip_mask_parent - Mask the parent interrupt
@@ -894,6 +1325,18 @@ void irq_chip_mask_parent(struct irq_data *data)
data = data->parent_data;
data->chip->irq_mask(data);
}
+EXPORT_SYMBOL_GPL(irq_chip_mask_parent);
+
+/**
+ * irq_chip_mask_ack_parent - Mask and acknowledge the parent interrupt
+ * @data: Pointer to interrupt specific data
+ */
+void irq_chip_mask_ack_parent(struct irq_data *data)
+{
+ data = data->parent_data;
+ data->chip->irq_mask_ack(data);
+}
+EXPORT_SYMBOL_GPL(irq_chip_mask_ack_parent);
/**
* irq_chip_unmask_parent - Unmask the parent interrupt
@@ -904,6 +1347,7 @@ void irq_chip_unmask_parent(struct irq_data *data)
data = data->parent_data;
data->chip->irq_unmask(data);
}
+EXPORT_SYMBOL_GPL(irq_chip_unmask_parent);
/**
* irq_chip_eoi_parent - Invoke EOI on the parent interrupt
@@ -914,6 +1358,7 @@ void irq_chip_eoi_parent(struct irq_data *data)
data = data->parent_data;
data->chip->irq_eoi(data);
}
+EXPORT_SYMBOL_GPL(irq_chip_eoi_parent);
/**
* irq_chip_set_affinity_parent - Set affinity on the parent interrupt
@@ -921,7 +1366,7 @@ void irq_chip_eoi_parent(struct irq_data *data)
* @dest: The affinity mask to set
* @force: Flag to enforce setting (disable online checks)
*
- * Conditinal, as the underlying parent chip might not implement it.
+ * Conditional, as the underlying parent chip might not implement it.
*/
int irq_chip_set_affinity_parent(struct irq_data *data,
const struct cpumask *dest, bool force)
@@ -932,6 +1377,25 @@ int irq_chip_set_affinity_parent(struct irq_data *data,
return -ENOSYS;
}
+EXPORT_SYMBOL_GPL(irq_chip_set_affinity_parent);
+
+/**
+ * irq_chip_set_type_parent - Set IRQ type on the parent interrupt
+ * @data: Pointer to interrupt specific data
+ * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h
+ *
+ * Conditional, as the underlying parent chip might not implement it.
+ */
+int irq_chip_set_type_parent(struct irq_data *data, unsigned int type)
+{
+ data = data->parent_data;
+
+ if (data->chip->irq_set_type)
+ return data->chip->irq_set_type(data, type);
+
+ return -ENOSYS;
+}
+EXPORT_SYMBOL_GPL(irq_chip_set_type_parent);
/**
* irq_chip_retrigger_hierarchy - Retrigger an interrupt in hardware
@@ -946,10 +1410,25 @@ int irq_chip_retrigger_hierarchy(struct irq_data *data)
if (data->chip && data->chip->irq_retrigger)
return data->chip->irq_retrigger(data);
- return -ENOSYS;
+ return 0;
}
+EXPORT_SYMBOL_GPL(irq_chip_retrigger_hierarchy);
/**
+ * irq_chip_set_vcpu_affinity_parent - Set vcpu affinity on the parent interrupt
+ * @data: Pointer to interrupt specific data
+ * @vcpu_info: The vcpu affinity information
+ */
+int irq_chip_set_vcpu_affinity_parent(struct irq_data *data, void *vcpu_info)
+{
+ data = data->parent_data;
+ if (data->chip->irq_set_vcpu_affinity)
+ return data->chip->irq_set_vcpu_affinity(data, vcpu_info);
+
+ return -ENOSYS;
+}
+EXPORT_SYMBOL_GPL(irq_chip_set_vcpu_affinity_parent);
+/**
* irq_chip_set_wake_parent - Set/reset wake-up on the parent interrupt
* @data: Pointer to interrupt specific data
* @on: Whether to set or reset the wake-up capability of this irq
@@ -959,15 +1438,48 @@ int irq_chip_retrigger_hierarchy(struct irq_data *data)
int irq_chip_set_wake_parent(struct irq_data *data, unsigned int on)
{
data = data->parent_data;
+
+ if (data->chip->flags & IRQCHIP_SKIP_SET_WAKE)
+ return 0;
+
if (data->chip->irq_set_wake)
return data->chip->irq_set_wake(data, on);
return -ENOSYS;
}
+EXPORT_SYMBOL_GPL(irq_chip_set_wake_parent);
+
+/**
+ * irq_chip_request_resources_parent - Request resources on the parent interrupt
+ * @data: Pointer to interrupt specific data
+ */
+int irq_chip_request_resources_parent(struct irq_data *data)
+{
+ data = data->parent_data;
+
+ if (data->chip->irq_request_resources)
+ return data->chip->irq_request_resources(data);
+
+ /* no error on missing optional irq_chip::irq_request_resources */
+ return 0;
+}
+EXPORT_SYMBOL_GPL(irq_chip_request_resources_parent);
+
+/**
+ * irq_chip_release_resources_parent - Release resources on the parent interrupt
+ * @data: Pointer to interrupt specific data
+ */
+void irq_chip_release_resources_parent(struct irq_data *data)
+{
+ data = data->parent_data;
+ if (data->chip->irq_release_resources)
+ data->chip->irq_release_resources(data);
+}
+EXPORT_SYMBOL_GPL(irq_chip_release_resources_parent);
#endif
/**
- * irq_chip_compose_msi_msg - Componse msi message for a irq chip
+ * irq_chip_compose_msi_msg - Compose msi message for a irq chip
* @data: Pointer to interrupt specific data
* @msg: Pointer to the MSI message
*
@@ -977,17 +1489,61 @@ int irq_chip_set_wake_parent(struct irq_data *data, unsigned int on)
*/
int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
{
- struct irq_data *pos = NULL;
+ struct irq_data *pos;
-#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
- for (; data; data = data->parent_data)
-#endif
+ for (pos = NULL; !pos && data; data = irqd_get_parent_data(data)) {
if (data->chip && data->chip->irq_compose_msi_msg)
pos = data;
+ }
+
if (!pos)
return -ENOSYS;
pos->chip->irq_compose_msi_msg(pos, msg);
-
return 0;
}
+
+static struct device *irq_get_pm_device(struct irq_data *data)
+{
+ if (data->domain)
+ return data->domain->pm_dev;
+
+ return NULL;
+}
+
+/**
+ * irq_chip_pm_get - Enable power for an IRQ chip
+ * @data: Pointer to interrupt specific data
+ *
+ * Enable the power to the IRQ chip referenced by the interrupt data
+ * structure.
+ */
+int irq_chip_pm_get(struct irq_data *data)
+{
+ struct device *dev = irq_get_pm_device(data);
+ int retval = 0;
+
+ if (IS_ENABLED(CONFIG_PM) && dev)
+ retval = pm_runtime_resume_and_get(dev);
+
+ return retval;
+}
+
+/**
+ * irq_chip_pm_put - Disable power for an IRQ chip
+ * @data: Pointer to interrupt specific data
+ *
+ * Disable the power to the IRQ chip referenced by the interrupt data
+ * structure, belongs. Note that power will only be disabled, once this
+ * function has been called for all IRQs that have called irq_chip_pm_get().
+ */
+int irq_chip_pm_put(struct irq_data *data)
+{
+ struct device *dev = irq_get_pm_device(data);
+ int retval = 0;
+
+ if (IS_ENABLED(CONFIG_PM) && dev)
+ retval = pm_runtime_put(dev);
+
+ return (retval < 0) ? retval : 0;
+}
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
new file mode 100644
index 000000000000..755346ea9819
--- /dev/null
+++ b/kernel/irq/cpuhotplug.c
@@ -0,0 +1,245 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Generic cpu hotunplug interrupt migration code copied from the
+ * arch/arm implementation
+ *
+ * Copyright (C) Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/interrupt.h>
+#include <linux/ratelimit.h>
+#include <linux/irq.h>
+#include <linux/sched/isolation.h>
+
+#include "internals.h"
+
+/* For !GENERIC_IRQ_EFFECTIVE_AFF_MASK this looks at general affinity mask */
+static inline bool irq_needs_fixup(struct irq_data *d)
+{
+ const struct cpumask *m = irq_data_get_effective_affinity_mask(d);
+ unsigned int cpu = smp_processor_id();
+
+#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
+ /*
+ * The cpumask_empty() check is a workaround for interrupt chips,
+ * which do not implement effective affinity, but the architecture has
+ * enabled the config switch. Use the general affinity mask instead.
+ */
+ if (cpumask_empty(m))
+ m = irq_data_get_affinity_mask(d);
+
+ /*
+ * Sanity check. If the mask is not empty when excluding the outgoing
+ * CPU then it must contain at least one online CPU. The outgoing CPU
+ * has been removed from the online mask already.
+ */
+ if (cpumask_any_but(m, cpu) < nr_cpu_ids &&
+ !cpumask_intersects(m, cpu_online_mask)) {
+ /*
+ * If this happens then there was a missed IRQ fixup at some
+ * point. Warn about it and enforce fixup.
+ */
+ pr_warn("Eff. affinity %*pbl of IRQ %u contains only offline CPUs after offlining CPU %u\n",
+ cpumask_pr_args(m), d->irq, cpu);
+ return true;
+ }
+#endif
+ return cpumask_test_cpu(cpu, m);
+}
+
+static bool migrate_one_irq(struct irq_desc *desc)
+{
+ struct irq_data *d = irq_desc_get_irq_data(desc);
+ struct irq_chip *chip = irq_data_get_irq_chip(d);
+ bool maskchip = !irq_can_move_pcntxt(d) && !irqd_irq_masked(d);
+ const struct cpumask *affinity;
+ bool brokeaff = false;
+ int err;
+
+ /*
+ * IRQ chip might be already torn down, but the irq descriptor is
+ * still in the radix tree. Also if the chip has no affinity setter,
+ * nothing can be done here.
+ */
+ if (!chip || !chip->irq_set_affinity) {
+ pr_debug("IRQ %u: Unable to migrate away\n", d->irq);
+ return false;
+ }
+
+ /*
+ * Complete an eventually pending irq move cleanup. If this
+ * interrupt was moved in hard irq context, then the vectors need
+ * to be cleaned up. It can't wait until this interrupt actually
+ * happens and this CPU was involved.
+ */
+ irq_force_complete_move(desc);
+
+ /*
+ * No move required, if:
+ * - Interrupt is per cpu
+ * - Interrupt is not started
+ * - Affinity mask does not include this CPU.
+ *
+ * Note: Do not check desc->action as this might be a chained
+ * interrupt.
+ */
+ if (irqd_is_per_cpu(d) || !irqd_is_started(d) || !irq_needs_fixup(d)) {
+ /*
+ * If an irq move is pending, abort it if the dying CPU is
+ * the sole target.
+ */
+ irq_fixup_move_pending(desc, false);
+ return false;
+ }
+
+ /*
+ * If there is a setaffinity pending, then try to reuse the pending
+ * mask, so the last change of the affinity does not get lost. If
+ * there is no move pending or the pending mask does not contain
+ * any online CPU, use the current affinity mask.
+ */
+ if (irq_fixup_move_pending(desc, true))
+ affinity = irq_desc_get_pending_mask(desc);
+ else
+ affinity = irq_data_get_affinity_mask(d);
+
+ /* Mask the chip for interrupts which cannot move in process context */
+ if (maskchip && chip->irq_mask)
+ chip->irq_mask(d);
+
+ if (!cpumask_intersects(affinity, cpu_online_mask)) {
+ /*
+ * If the interrupt is managed, then shut it down and leave
+ * the affinity untouched.
+ */
+ if (irqd_affinity_is_managed(d)) {
+ irqd_set_managed_shutdown(d);
+ irq_shutdown_and_deactivate(desc);
+ return false;
+ }
+ affinity = cpu_online_mask;
+ brokeaff = true;
+ }
+ /*
+ * Do not set the force argument of irq_do_set_affinity() as this
+ * disables the masking of offline CPUs from the supplied affinity
+ * mask and therefore might keep/reassign the irq to the outgoing
+ * CPU.
+ */
+ err = irq_do_set_affinity(d, affinity, false);
+
+ /*
+ * If there are online CPUs in the affinity mask, but they have no
+ * vectors left to make the migration work, try to break the
+ * affinity by migrating to any online CPU.
+ */
+ if (err == -ENOSPC && !irqd_affinity_is_managed(d) && affinity != cpu_online_mask) {
+ pr_debug("IRQ%u: set affinity failed for %*pbl, re-try with online CPUs\n",
+ d->irq, cpumask_pr_args(affinity));
+
+ affinity = cpu_online_mask;
+ brokeaff = true;
+
+ err = irq_do_set_affinity(d, affinity, false);
+ }
+
+ if (err) {
+ pr_warn_ratelimited("IRQ%u: set affinity failed(%d).\n",
+ d->irq, err);
+ brokeaff = false;
+ }
+
+ if (maskchip && chip->irq_unmask)
+ chip->irq_unmask(d);
+
+ return brokeaff;
+}
+
+/**
+ * irq_migrate_all_off_this_cpu - Migrate irqs away from offline cpu
+ *
+ * The current CPU has been marked offline. Migrate IRQs off this CPU.
+ * If the affinity settings do not allow other CPUs, force them onto any
+ * available CPU.
+ *
+ * Note: we must iterate over all IRQs, whether they have an attached
+ * action structure or not, as we need to get chained interrupts too.
+ */
+void irq_migrate_all_off_this_cpu(void)
+{
+ struct irq_desc *desc;
+ unsigned int irq;
+
+ for_each_active_irq(irq) {
+ bool affinity_broken;
+
+ desc = irq_to_desc(irq);
+ scoped_guard(raw_spinlock, &desc->lock)
+ affinity_broken = migrate_one_irq(desc);
+
+ if (affinity_broken) {
+ pr_debug_ratelimited("IRQ %u: no longer affine to CPU%u\n",
+ irq, smp_processor_id());
+ }
+ }
+}
+
+static bool hk_should_isolate(struct irq_data *data, unsigned int cpu)
+{
+ const struct cpumask *hk_mask;
+
+ if (!housekeeping_enabled(HK_TYPE_MANAGED_IRQ))
+ return false;
+
+ hk_mask = housekeeping_cpumask(HK_TYPE_MANAGED_IRQ);
+ if (cpumask_subset(irq_data_get_effective_affinity_mask(data), hk_mask))
+ return false;
+
+ return cpumask_test_cpu(cpu, hk_mask);
+}
+
+static void irq_restore_affinity_of_irq(struct irq_desc *desc, unsigned int cpu)
+{
+ struct irq_data *data = irq_desc_get_irq_data(desc);
+ const struct cpumask *affinity = irq_data_get_affinity_mask(data);
+
+ if (!irqd_affinity_is_managed(data) || !desc->action ||
+ !irq_data_get_irq_chip(data) || !cpumask_test_cpu(cpu, affinity))
+ return;
+
+ if (irqd_is_managed_and_shutdown(data))
+ irq_startup_managed(desc);
+
+ /*
+ * If the interrupt can only be directed to a single target
+ * CPU then it is already assigned to a CPU in the affinity
+ * mask. No point in trying to move it around unless the
+ * isolation mechanism requests to move it to an upcoming
+ * housekeeping CPU.
+ */
+ if (!irqd_is_single_target(data) || hk_should_isolate(data, cpu))
+ irq_set_affinity_locked(data, affinity, false);
+}
+
+/**
+ * irq_affinity_online_cpu - Restore affinity for managed interrupts
+ * @cpu: Upcoming CPU for which interrupts should be restored
+ */
+int irq_affinity_online_cpu(unsigned int cpu)
+{
+ struct irq_desc *desc;
+ unsigned int irq;
+
+ irq_lock_sparse();
+ for_each_active_irq(irq) {
+ desc = irq_to_desc(irq);
+ scoped_guard(raw_spinlock_irq, &desc->lock)
+ irq_restore_affinity_of_irq(desc, cpu);
+ }
+ irq_unlock_sparse();
+
+ return 0;
+}
diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h
index e75e29e4434a..8ccb326d2977 100644
--- a/kernel/irq/debug.h
+++ b/kernel/irq/debug.h
@@ -1,9 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Debugging printout:
*/
-#include <linux/kallsyms.h>
-
#define ___P(f) if (desc->status_use_accessors & f) printk("%14s set\n", #f)
#define ___PS(f) if (desc->istate & f) printk("%14s set\n", #f)
/* FIXME */
@@ -11,16 +10,21 @@
static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
{
+ static DEFINE_RATELIMIT_STATE(ratelimit, 5 * HZ, 5);
+
+ if (!__ratelimit(&ratelimit))
+ return;
+
printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n",
irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled);
- printk("->handle_irq(): %p, ", desc->handle_irq);
- print_symbol("%s\n", (unsigned long)desc->handle_irq);
- printk("->irq_data.chip(): %p, ", desc->irq_data.chip);
- print_symbol("%s\n", (unsigned long)desc->irq_data.chip);
+ printk("->handle_irq(): %p, %pS\n",
+ desc->handle_irq, desc->handle_irq);
+ printk("->irq_data.chip(): %p, %pS\n",
+ desc->irq_data.chip, desc->irq_data.chip);
printk("->action(): %p\n", desc->action);
if (desc->action) {
- printk("->action->handler(): %p, ", desc->action->handler);
- print_symbol("%s\n", (unsigned long)desc->action->handler);
+ printk("->action->handler(): %p, %pS\n",
+ desc->action->handler, desc->action->handler);
}
___P(IRQ_LEVEL);
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
new file mode 100644
index 000000000000..3527defd2890
--- /dev/null
+++ b/kernel/irq/debugfs.c
@@ -0,0 +1,256 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright 2017 Thomas Gleixner <tglx@linutronix.de>
+
+#include <linux/irqdomain.h>
+#include <linux/irq.h>
+#include <linux/uaccess.h>
+
+#include "internals.h"
+
+static struct dentry *irq_dir;
+
+void irq_debug_show_bits(struct seq_file *m, int ind, unsigned int state,
+ const struct irq_bit_descr *sd, int size)
+{
+ int i;
+
+ for (i = 0; i < size; i++, sd++) {
+ if (state & sd->mask)
+ seq_printf(m, "%*s%s\n", ind + 12, "", sd->name);
+ }
+}
+
+#ifdef CONFIG_SMP
+static void irq_debug_show_masks(struct seq_file *m, struct irq_desc *desc)
+{
+ struct irq_data *data = irq_desc_get_irq_data(desc);
+ const struct cpumask *msk;
+
+ msk = irq_data_get_affinity_mask(data);
+ seq_printf(m, "affinity: %*pbl\n", cpumask_pr_args(msk));
+#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
+ msk = irq_data_get_effective_affinity_mask(data);
+ seq_printf(m, "effectiv: %*pbl\n", cpumask_pr_args(msk));
+#endif
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+ msk = desc->pending_mask;
+ seq_printf(m, "pending: %*pbl\n", cpumask_pr_args(msk));
+#endif
+}
+#else
+static void irq_debug_show_masks(struct seq_file *m, struct irq_desc *desc) { }
+#endif
+
+static const struct irq_bit_descr irqchip_flags[] = {
+ BIT_MASK_DESCR(IRQCHIP_SET_TYPE_MASKED),
+ BIT_MASK_DESCR(IRQCHIP_EOI_IF_HANDLED),
+ BIT_MASK_DESCR(IRQCHIP_MASK_ON_SUSPEND),
+ BIT_MASK_DESCR(IRQCHIP_ONOFFLINE_ENABLED),
+ BIT_MASK_DESCR(IRQCHIP_SKIP_SET_WAKE),
+ BIT_MASK_DESCR(IRQCHIP_ONESHOT_SAFE),
+ BIT_MASK_DESCR(IRQCHIP_EOI_THREADED),
+ BIT_MASK_DESCR(IRQCHIP_SUPPORTS_LEVEL_MSI),
+ BIT_MASK_DESCR(IRQCHIP_SUPPORTS_NMI),
+ BIT_MASK_DESCR(IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND),
+ BIT_MASK_DESCR(IRQCHIP_IMMUTABLE),
+ BIT_MASK_DESCR(IRQCHIP_MOVE_DEFERRED),
+};
+
+static void
+irq_debug_show_chip(struct seq_file *m, struct irq_data *data, int ind)
+{
+ struct irq_chip *chip = data->chip;
+
+ if (!chip) {
+ seq_printf(m, "chip: None\n");
+ return;
+ }
+ seq_printf(m, "%*schip: ", ind, "");
+ if (chip->irq_print_chip)
+ chip->irq_print_chip(data, m);
+ else
+ seq_printf(m, "%s", chip->name);
+ seq_printf(m, "\n%*sflags: 0x%lx\n", ind + 1, "", chip->flags);
+ irq_debug_show_bits(m, ind, chip->flags, irqchip_flags,
+ ARRAY_SIZE(irqchip_flags));
+}
+
+static void
+irq_debug_show_data(struct seq_file *m, struct irq_data *data, int ind)
+{
+ seq_printf(m, "%*sdomain: %s\n", ind, "",
+ data->domain ? data->domain->name : "");
+ seq_printf(m, "%*shwirq: 0x%lx\n", ind + 1, "", data->hwirq);
+ irq_debug_show_chip(m, data, ind + 1);
+ if (data->domain && data->domain->ops && data->domain->ops->debug_show)
+ data->domain->ops->debug_show(m, NULL, data, ind + 1);
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+ if (!data->parent_data)
+ return;
+ seq_printf(m, "%*sparent:\n", ind + 1, "");
+ irq_debug_show_data(m, data->parent_data, ind + 4);
+#endif
+}
+
+static const struct irq_bit_descr irqdata_states[] = {
+ BIT_MASK_DESCR(IRQ_TYPE_EDGE_RISING),
+ BIT_MASK_DESCR(IRQ_TYPE_EDGE_FALLING),
+ BIT_MASK_DESCR(IRQ_TYPE_LEVEL_HIGH),
+ BIT_MASK_DESCR(IRQ_TYPE_LEVEL_LOW),
+ BIT_MASK_DESCR(IRQD_LEVEL),
+
+ BIT_MASK_DESCR(IRQD_ACTIVATED),
+ BIT_MASK_DESCR(IRQD_IRQ_STARTED),
+ BIT_MASK_DESCR(IRQD_IRQ_DISABLED),
+ BIT_MASK_DESCR(IRQD_IRQ_MASKED),
+ BIT_MASK_DESCR(IRQD_IRQ_INPROGRESS),
+
+ BIT_MASK_DESCR(IRQD_PER_CPU),
+ BIT_MASK_DESCR(IRQD_NO_BALANCING),
+
+ BIT_MASK_DESCR(IRQD_SINGLE_TARGET),
+ BIT_MASK_DESCR(IRQD_AFFINITY_SET),
+ BIT_MASK_DESCR(IRQD_SETAFFINITY_PENDING),
+ BIT_MASK_DESCR(IRQD_AFFINITY_MANAGED),
+ BIT_MASK_DESCR(IRQD_AFFINITY_ON_ACTIVATE),
+ BIT_MASK_DESCR(IRQD_MANAGED_SHUTDOWN),
+ BIT_MASK_DESCR(IRQD_CAN_RESERVE),
+
+ BIT_MASK_DESCR(IRQD_FORWARDED_TO_VCPU),
+
+ BIT_MASK_DESCR(IRQD_WAKEUP_STATE),
+ BIT_MASK_DESCR(IRQD_WAKEUP_ARMED),
+
+ BIT_MASK_DESCR(IRQD_DEFAULT_TRIGGER_SET),
+
+ BIT_MASK_DESCR(IRQD_HANDLE_ENFORCE_IRQCTX),
+
+ BIT_MASK_DESCR(IRQD_IRQ_ENABLED_ON_SUSPEND),
+
+ BIT_MASK_DESCR(IRQD_RESEND_WHEN_IN_PROGRESS),
+};
+
+static const struct irq_bit_descr irqdesc_states[] = {
+ BIT_MASK_DESCR(_IRQ_NOPROBE),
+ BIT_MASK_DESCR(_IRQ_NOREQUEST),
+ BIT_MASK_DESCR(_IRQ_NOTHREAD),
+ BIT_MASK_DESCR(_IRQ_NOAUTOEN),
+ BIT_MASK_DESCR(_IRQ_NESTED_THREAD),
+ BIT_MASK_DESCR(_IRQ_PER_CPU_DEVID),
+ BIT_MASK_DESCR(_IRQ_IS_POLLED),
+ BIT_MASK_DESCR(_IRQ_DISABLE_UNLAZY),
+ BIT_MASK_DESCR(_IRQ_HIDDEN),
+};
+
+static const struct irq_bit_descr irqdesc_istates[] = {
+ BIT_MASK_DESCR(IRQS_AUTODETECT),
+ BIT_MASK_DESCR(IRQS_SPURIOUS_DISABLED),
+ BIT_MASK_DESCR(IRQS_POLL_INPROGRESS),
+ BIT_MASK_DESCR(IRQS_ONESHOT),
+ BIT_MASK_DESCR(IRQS_REPLAY),
+ BIT_MASK_DESCR(IRQS_WAITING),
+ BIT_MASK_DESCR(IRQS_PENDING),
+ BIT_MASK_DESCR(IRQS_SUSPENDED),
+ BIT_MASK_DESCR(IRQS_NMI),
+};
+
+
+static int irq_debug_show(struct seq_file *m, void *p)
+{
+ struct irq_desc *desc = m->private;
+ struct irq_data *data;
+
+ guard(raw_spinlock_irq)(&desc->lock);
+ data = irq_desc_get_irq_data(desc);
+ seq_printf(m, "handler: %ps\n", desc->handle_irq);
+ seq_printf(m, "device: %s\n", desc->dev_name);
+ seq_printf(m, "status: 0x%08x\n", desc->status_use_accessors);
+ irq_debug_show_bits(m, 0, desc->status_use_accessors, irqdesc_states,
+ ARRAY_SIZE(irqdesc_states));
+ seq_printf(m, "istate: 0x%08x\n", desc->istate);
+ irq_debug_show_bits(m, 0, desc->istate, irqdesc_istates,
+ ARRAY_SIZE(irqdesc_istates));
+ seq_printf(m, "ddepth: %u\n", desc->depth);
+ seq_printf(m, "wdepth: %u\n", desc->wake_depth);
+ seq_printf(m, "dstate: 0x%08x\n", irqd_get(data));
+ irq_debug_show_bits(m, 0, irqd_get(data), irqdata_states,
+ ARRAY_SIZE(irqdata_states));
+ seq_printf(m, "node: %d\n", irq_data_get_node(data));
+ irq_debug_show_masks(m, desc);
+ irq_debug_show_data(m, data, 0);
+ return 0;
+}
+
+static int irq_debug_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, irq_debug_show, inode->i_private);
+}
+
+static ssize_t irq_debug_write(struct file *file, const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct irq_desc *desc = file_inode(file)->i_private;
+ char buf[8] = { 0, };
+ size_t size;
+
+ size = min(sizeof(buf) - 1, count);
+ if (copy_from_user(buf, user_buf, size))
+ return -EFAULT;
+
+ if (!strncmp(buf, "trigger", size)) {
+ int err = irq_inject_interrupt(irq_desc_get_irq(desc));
+
+ return err ? err : count;
+ }
+
+ return count;
+}
+
+static const struct file_operations dfs_irq_ops = {
+ .open = irq_debug_open,
+ .write = irq_debug_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+void irq_debugfs_copy_devname(int irq, struct device *dev)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ const char *name = dev_name(dev);
+
+ if (name)
+ desc->dev_name = kstrdup(name, GFP_KERNEL);
+}
+
+void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc)
+{
+ char name [12];
+
+ if (!irq_dir || !desc || desc->debugfs_file)
+ return;
+
+ sprintf(name, "%u", irq);
+ desc->debugfs_file = debugfs_create_file(name, 0644, irq_dir, desc,
+ &dfs_irq_ops);
+}
+
+static int __init irq_debugfs_init(void)
+{
+ struct dentry *root_dir;
+ int irq;
+
+ root_dir = debugfs_create_dir("irq", NULL);
+
+ irq_domain_debugfs_init(root_dir);
+
+ irq_dir = debugfs_create_dir("irqs", root_dir);
+
+ irq_lock_sparse();
+ for_each_active_irq(irq)
+ irq_add_debugfs_entry(irq, irq_to_desc(irq));
+ irq_unlock_sparse();
+
+ return 0;
+}
+__initcall(irq_debugfs_init);
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index d5d0f7345c54..b41188698622 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -1,7 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/module.h>
#include <linux/interrupt.h>
+#include <linux/irqdomain.h>
#include <linux/device.h>
#include <linux/gfp.h>
+#include <linux/irq.h>
+
+#include "internals.h"
/*
* Device resource management aware IRQ request/free implementation.
@@ -25,29 +30,22 @@ static int devm_irq_match(struct device *dev, void *res, void *data)
return this->irq == match->irq && this->dev_id == match->dev_id;
}
-/**
- * devm_request_threaded_irq - allocate an interrupt line for a managed device
- * @dev: device to request interrupt for
- * @irq: Interrupt line to allocate
- * @handler: Function to be called when the IRQ occurs
- * @thread_fn: function to be called in a threaded interrupt context. NULL
- * for devices which handle everything in @handler
- * @irqflags: Interrupt type flags
- * @devname: An ascii name for the claiming device
- * @dev_id: A cookie passed back to the handler function
- *
- * Except for the extra @dev argument, this function takes the
- * same arguments and performs the same function as
- * request_threaded_irq(). IRQs requested with this function will be
- * automatically freed on driver detach.
- *
- * If an IRQ allocated with this function needs to be freed
- * separately, devm_free_irq() must be used.
- */
-int devm_request_threaded_irq(struct device *dev, unsigned int irq,
- irq_handler_t handler, irq_handler_t thread_fn,
- unsigned long irqflags, const char *devname,
- void *dev_id)
+static int devm_request_result(struct device *dev, int rc, unsigned int irq,
+ irq_handler_t handler, irq_handler_t thread_fn,
+ const char *devname)
+{
+ if (rc >= 0)
+ return rc;
+
+ return dev_err_probe(dev, rc, "request_irq(%u) %ps %ps %s\n",
+ irq, handler, thread_fn, devname ? : "");
+}
+
+static int __devm_request_threaded_irq(struct device *dev, unsigned int irq,
+ irq_handler_t handler,
+ irq_handler_t thread_fn,
+ unsigned long irqflags,
+ const char *devname, void *dev_id)
{
struct irq_devres *dr;
int rc;
@@ -57,6 +55,9 @@ int devm_request_threaded_irq(struct device *dev, unsigned int irq,
if (!dr)
return -ENOMEM;
+ if (!devname)
+ devname = dev_name(dev);
+
rc = request_threaded_irq(irq, handler, thread_fn, irqflags, devname,
dev_id);
if (rc) {
@@ -70,30 +71,48 @@ int devm_request_threaded_irq(struct device *dev, unsigned int irq,
return 0;
}
-EXPORT_SYMBOL(devm_request_threaded_irq);
/**
- * devm_request_any_context_irq - allocate an interrupt line for a managed device
- * @dev: device to request interrupt for
- * @irq: Interrupt line to allocate
- * @handler: Function to be called when the IRQ occurs
- * @thread_fn: function to be called in a threaded interrupt context. NULL
- * for devices which handle everything in @handler
- * @irqflags: Interrupt type flags
- * @devname: An ascii name for the claiming device
- * @dev_id: A cookie passed back to the handler function
+ * devm_request_threaded_irq - allocate an interrupt line for a managed device with error logging
+ * @dev: Device to request interrupt for
+ * @irq: Interrupt line to allocate
+ * @handler: Function to be called when the interrupt occurs
+ * @thread_fn: Function to be called in a threaded interrupt context. NULL
+ * for devices which handle everything in @handler
+ * @irqflags: Interrupt type flags
+ * @devname: An ascii name for the claiming device, dev_name(dev) if NULL
+ * @dev_id: A cookie passed back to the handler function
*
- * Except for the extra @dev argument, this function takes the
- * same arguments and performs the same function as
- * request_any_context_irq(). IRQs requested with this function will be
- * automatically freed on driver detach.
+ * Except for the extra @dev argument, this function takes the same
+ * arguments and performs the same function as request_threaded_irq().
+ * Interrupts requested with this function will be automatically freed on
+ * driver detach.
*
- * If an IRQ allocated with this function needs to be freed
- * separately, devm_free_irq() must be used.
+ * If an interrupt allocated with this function needs to be freed
+ * separately, devm_free_irq() must be used.
+ *
+ * When the request fails, an error message is printed with contextual
+ * information (device name, interrupt number, handler functions and
+ * error code). Don't add extra error messages at the call sites.
+ *
+ * Return: 0 on success or a negative error number.
*/
-int devm_request_any_context_irq(struct device *dev, unsigned int irq,
- irq_handler_t handler, unsigned long irqflags,
- const char *devname, void *dev_id)
+int devm_request_threaded_irq(struct device *dev, unsigned int irq,
+ irq_handler_t handler, irq_handler_t thread_fn,
+ unsigned long irqflags, const char *devname,
+ void *dev_id)
+{
+ int rc = __devm_request_threaded_irq(dev, irq, handler, thread_fn,
+ irqflags, devname, dev_id);
+
+ return devm_request_result(dev, rc, irq, handler, thread_fn, devname);
+}
+EXPORT_SYMBOL(devm_request_threaded_irq);
+
+static int __devm_request_any_context_irq(struct device *dev, unsigned int irq,
+ irq_handler_t handler,
+ unsigned long irqflags,
+ const char *devname, void *dev_id)
{
struct irq_devres *dr;
int rc;
@@ -103,8 +122,11 @@ int devm_request_any_context_irq(struct device *dev, unsigned int irq,
if (!dr)
return -ENOMEM;
+ if (!devname)
+ devname = dev_name(dev);
+
rc = request_any_context_irq(irq, handler, irqflags, devname, dev_id);
- if (rc) {
+ if (rc < 0) {
devres_free(dr);
return rc;
}
@@ -113,7 +135,41 @@ int devm_request_any_context_irq(struct device *dev, unsigned int irq,
dr->dev_id = dev_id;
devres_add(dev, dr);
- return 0;
+ return rc;
+}
+
+/**
+ * devm_request_any_context_irq - allocate an interrupt line for a managed device with error logging
+ * @dev: Device to request interrupt for
+ * @irq: Interrupt line to allocate
+ * @handler: Function to be called when the interrupt occurs
+ * @irqflags: Interrupt type flags
+ * @devname: An ascii name for the claiming device, dev_name(dev) if NULL
+ * @dev_id: A cookie passed back to the handler function
+ *
+ * Except for the extra @dev argument, this function takes the same
+ * arguments and performs the same function as request_any_context_irq().
+ * Interrupts requested with this function will be automatically freed on
+ * driver detach.
+ *
+ * If an interrupt allocated with this function needs to be freed
+ * separately, devm_free_irq() must be used.
+ *
+ * When the request fails, an error message is printed with contextual
+ * information (device name, interrupt number, handler functions and
+ * error code). Don't add extra error messages at the call sites.
+ *
+ * Return: IRQC_IS_HARDIRQ or IRQC_IS_NESTED on success, or a negative error
+ * number.
+ */
+int devm_request_any_context_irq(struct device *dev, unsigned int irq,
+ irq_handler_t handler, unsigned long irqflags,
+ const char *devname, void *dev_id)
+{
+ int rc = __devm_request_any_context_irq(dev, irq, handler, irqflags,
+ devname, dev_id);
+
+ return devm_request_result(dev, rc, irq, handler, NULL, devname);
}
EXPORT_SYMBOL(devm_request_any_context_irq);
@@ -132,8 +188,184 @@ void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id)
{
struct irq_devres match_data = { irq, dev_id };
- WARN_ON(devres_destroy(dev, devm_irq_release, devm_irq_match,
+ WARN_ON(devres_release(dev, devm_irq_release, devm_irq_match,
&match_data));
- free_irq(irq, dev_id);
}
EXPORT_SYMBOL(devm_free_irq);
+
+struct irq_desc_devres {
+ unsigned int from;
+ unsigned int cnt;
+};
+
+static void devm_irq_desc_release(struct device *dev, void *res)
+{
+ struct irq_desc_devres *this = res;
+
+ irq_free_descs(this->from, this->cnt);
+}
+
+/**
+ * __devm_irq_alloc_descs - Allocate and initialize a range of irq descriptors
+ * for a managed device
+ * @dev: Device to allocate the descriptors for
+ * @irq: Allocate for specific irq number if irq >= 0
+ * @from: Start the search from this irq number
+ * @cnt: Number of consecutive irqs to allocate
+ * @node: Preferred node on which the irq descriptor should be allocated
+ * @owner: Owning module (can be NULL)
+ * @affinity: Optional pointer to an irq_affinity_desc array of size @cnt
+ * which hints where the irq descriptors should be allocated
+ * and which default affinities to use
+ *
+ * Returns the first irq number or error code.
+ *
+ * Note: Use the provided wrappers (devm_irq_alloc_desc*) for simplicity.
+ */
+int __devm_irq_alloc_descs(struct device *dev, int irq, unsigned int from,
+ unsigned int cnt, int node, struct module *owner,
+ const struct irq_affinity_desc *affinity)
+{
+ struct irq_desc_devres *dr;
+ int base;
+
+ dr = devres_alloc(devm_irq_desc_release, sizeof(*dr), GFP_KERNEL);
+ if (!dr)
+ return -ENOMEM;
+
+ base = __irq_alloc_descs(irq, from, cnt, node, owner, affinity);
+ if (base < 0) {
+ devres_free(dr);
+ return base;
+ }
+
+ dr->from = base;
+ dr->cnt = cnt;
+ devres_add(dev, dr);
+
+ return base;
+}
+EXPORT_SYMBOL_GPL(__devm_irq_alloc_descs);
+
+#ifdef CONFIG_GENERIC_IRQ_CHIP
+/**
+ * devm_irq_alloc_generic_chip - Allocate and initialize a generic chip
+ * for a managed device
+ * @dev: Device to allocate the generic chip for
+ * @name: Name of the irq chip
+ * @num_ct: Number of irq_chip_type instances associated with this
+ * @irq_base: Interrupt base nr for this chip
+ * @reg_base: Register base address (virtual)
+ * @handler: Default flow handler associated with this chip
+ *
+ * Returns an initialized irq_chip_generic structure. The chip defaults
+ * to the primary (index 0) irq_chip_type and @handler
+ */
+struct irq_chip_generic *
+devm_irq_alloc_generic_chip(struct device *dev, const char *name, int num_ct,
+ unsigned int irq_base, void __iomem *reg_base,
+ irq_flow_handler_t handler)
+{
+ struct irq_chip_generic *gc;
+
+ gc = devm_kzalloc(dev, struct_size(gc, chip_types, num_ct), GFP_KERNEL);
+ if (gc)
+ irq_init_generic_chip(gc, name, num_ct,
+ irq_base, reg_base, handler);
+
+ return gc;
+}
+EXPORT_SYMBOL_GPL(devm_irq_alloc_generic_chip);
+
+struct irq_generic_chip_devres {
+ struct irq_chip_generic *gc;
+ u32 msk;
+ unsigned int clr;
+ unsigned int set;
+};
+
+static void devm_irq_remove_generic_chip(struct device *dev, void *res)
+{
+ struct irq_generic_chip_devres *this = res;
+
+ irq_remove_generic_chip(this->gc, this->msk, this->clr, this->set);
+}
+
+/**
+ * devm_irq_setup_generic_chip - Setup a range of interrupts with a generic
+ * chip for a managed device
+ *
+ * @dev: Device to setup the generic chip for
+ * @gc: Generic irq chip holding all data
+ * @msk: Bitmask holding the irqs to initialize relative to gc->irq_base
+ * @flags: Flags for initialization
+ * @clr: IRQ_* bits to clear
+ * @set: IRQ_* bits to set
+ *
+ * Set up max. 32 interrupts starting from gc->irq_base. Note, this
+ * initializes all interrupts to the primary irq_chip_type and its
+ * associated handler.
+ */
+int devm_irq_setup_generic_chip(struct device *dev, struct irq_chip_generic *gc,
+ u32 msk, enum irq_gc_flags flags,
+ unsigned int clr, unsigned int set)
+{
+ struct irq_generic_chip_devres *dr;
+
+ dr = devres_alloc(devm_irq_remove_generic_chip,
+ sizeof(*dr), GFP_KERNEL);
+ if (!dr)
+ return -ENOMEM;
+
+ irq_setup_generic_chip(gc, msk, flags, clr, set);
+
+ dr->gc = gc;
+ dr->msk = msk;
+ dr->clr = clr;
+ dr->set = set;
+ devres_add(dev, dr);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(devm_irq_setup_generic_chip);
+#endif /* CONFIG_GENERIC_IRQ_CHIP */
+
+#ifdef CONFIG_IRQ_DOMAIN
+static void devm_irq_domain_remove(struct device *dev, void *res)
+{
+ struct irq_domain **domain = res;
+
+ irq_domain_remove(*domain);
+}
+
+/**
+ * devm_irq_domain_instantiate() - Instantiate a new irq domain data for a
+ * managed device.
+ * @dev: Device to instantiate the domain for
+ * @info: Domain information pointer pointing to the information for this
+ * domain
+ *
+ * Return: A pointer to the instantiated irq domain or an ERR_PTR value.
+ */
+struct irq_domain *devm_irq_domain_instantiate(struct device *dev,
+ const struct irq_domain_info *info)
+{
+ struct irq_domain *domain;
+ struct irq_domain **dr;
+
+ dr = devres_alloc(devm_irq_domain_remove, sizeof(*dr), GFP_KERNEL);
+ if (!dr)
+ return ERR_PTR(-ENOMEM);
+
+ domain = irq_domain_instantiate(info);
+ if (!IS_ERR(domain)) {
+ *dr = domain;
+ devres_add(dev, dr);
+ } else {
+ devres_free(dr);
+ }
+
+ return domain;
+}
+EXPORT_SYMBOL_GPL(devm_irq_domain_instantiate);
+#endif /* CONFIG_IRQ_DOMAIN */
diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c
index 2feb6feca0cc..7fe6cffe7d0d 100644
--- a/kernel/irq/dummychip.c
+++ b/kernel/irq/dummychip.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
* Copyright (C) 2005-2006, Thomas Gleixner, Russell King
@@ -12,7 +13,7 @@
/*
* What should we do if we get a hw irq event on an illegal vector?
- * Each architecture has to answer this themself.
+ * Each architecture has to answer this themselves.
*/
static void ack_bad(struct irq_data *data)
{
@@ -42,6 +43,7 @@ struct irq_chip no_irq_chip = {
.irq_enable = noop,
.irq_disable = noop,
.irq_ack = ack_bad,
+ .flags = IRQCHIP_SKIP_SET_WAKE,
};
/*
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index 61024e8abdef..3cd0c40282c0 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Library implementing the most common irq chip callback functions
*
@@ -24,6 +25,7 @@ static DEFINE_RAW_SPINLOCK(gc_lock);
void irq_gc_noop(struct irq_data *d)
{
}
+EXPORT_SYMBOL_GPL(irq_gc_noop);
/**
* irq_gc_mask_disable_reg - Mask chip via disable register
@@ -38,11 +40,11 @@ void irq_gc_mask_disable_reg(struct irq_data *d)
struct irq_chip_type *ct = irq_data_get_chip_type(d);
u32 mask = d->mask;
- irq_gc_lock(gc);
+ guard(raw_spinlock)(&gc->lock);
irq_reg_writel(gc, mask, ct->regs.disable);
*ct->mask_cache &= ~mask;
- irq_gc_unlock(gc);
}
+EXPORT_SYMBOL_GPL(irq_gc_mask_disable_reg);
/**
* irq_gc_mask_set_bit - Mask chip via setting bit in mask register
@@ -57,10 +59,9 @@ void irq_gc_mask_set_bit(struct irq_data *d)
struct irq_chip_type *ct = irq_data_get_chip_type(d);
u32 mask = d->mask;
- irq_gc_lock(gc);
+ guard(raw_spinlock)(&gc->lock);
*ct->mask_cache |= mask;
irq_reg_writel(gc, *ct->mask_cache, ct->regs.mask);
- irq_gc_unlock(gc);
}
EXPORT_SYMBOL_GPL(irq_gc_mask_set_bit);
@@ -77,10 +78,9 @@ void irq_gc_mask_clr_bit(struct irq_data *d)
struct irq_chip_type *ct = irq_data_get_chip_type(d);
u32 mask = d->mask;
- irq_gc_lock(gc);
+ guard(raw_spinlock)(&gc->lock);
*ct->mask_cache &= ~mask;
irq_reg_writel(gc, *ct->mask_cache, ct->regs.mask);
- irq_gc_unlock(gc);
}
EXPORT_SYMBOL_GPL(irq_gc_mask_clr_bit);
@@ -97,11 +97,11 @@ void irq_gc_unmask_enable_reg(struct irq_data *d)
struct irq_chip_type *ct = irq_data_get_chip_type(d);
u32 mask = d->mask;
- irq_gc_lock(gc);
+ guard(raw_spinlock)(&gc->lock);
irq_reg_writel(gc, mask, ct->regs.enable);
*ct->mask_cache |= mask;
- irq_gc_unlock(gc);
}
+EXPORT_SYMBOL_GPL(irq_gc_unmask_enable_reg);
/**
* irq_gc_ack_set_bit - Ack pending interrupt via setting bit
@@ -113,9 +113,8 @@ void irq_gc_ack_set_bit(struct irq_data *d)
struct irq_chip_type *ct = irq_data_get_chip_type(d);
u32 mask = d->mask;
- irq_gc_lock(gc);
+ guard(raw_spinlock)(&gc->lock);
irq_reg_writel(gc, mask, ct->regs.ack);
- irq_gc_unlock(gc);
}
EXPORT_SYMBOL_GPL(irq_gc_ack_set_bit);
@@ -129,26 +128,34 @@ void irq_gc_ack_clr_bit(struct irq_data *d)
struct irq_chip_type *ct = irq_data_get_chip_type(d);
u32 mask = ~d->mask;
- irq_gc_lock(gc);
+ guard(raw_spinlock)(&gc->lock);
irq_reg_writel(gc, mask, ct->regs.ack);
- irq_gc_unlock(gc);
}
/**
- * irq_gc_mask_disable_reg_and_ack - Mask and ack pending interrupt
+ * irq_gc_mask_disable_and_ack_set - Mask and ack pending interrupt
* @d: irq_data
+ *
+ * This generic implementation of the irq_mask_ack method is for chips
+ * with separate enable/disable registers instead of a single mask
+ * register and where a pending interrupt is acknowledged by setting a
+ * bit.
+ *
+ * Note: This is the only permutation currently used. Similar generic
+ * functions should be added here if other permutations are required.
*/
-void irq_gc_mask_disable_reg_and_ack(struct irq_data *d)
+void irq_gc_mask_disable_and_ack_set(struct irq_data *d)
{
struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
struct irq_chip_type *ct = irq_data_get_chip_type(d);
u32 mask = d->mask;
- irq_gc_lock(gc);
- irq_reg_writel(gc, mask, ct->regs.mask);
+ guard(raw_spinlock)(&gc->lock);
+ irq_reg_writel(gc, mask, ct->regs.disable);
+ *ct->mask_cache &= ~mask;
irq_reg_writel(gc, mask, ct->regs.ack);
- irq_gc_unlock(gc);
}
+EXPORT_SYMBOL_GPL(irq_gc_mask_disable_and_ack_set);
/**
* irq_gc_eoi - EOI interrupt
@@ -160,9 +167,8 @@ void irq_gc_eoi(struct irq_data *d)
struct irq_chip_type *ct = irq_data_get_chip_type(d);
u32 mask = d->mask;
- irq_gc_lock(gc);
+ guard(raw_spinlock)(&gc->lock);
irq_reg_writel(gc, mask, ct->regs.eoi);
- irq_gc_unlock(gc);
}
/**
@@ -182,14 +188,14 @@ int irq_gc_set_wake(struct irq_data *d, unsigned int on)
if (!(mask & gc->wake_enabled))
return -EINVAL;
- irq_gc_lock(gc);
+ guard(raw_spinlock)(&gc->lock);
if (on)
gc->wake_active |= mask;
else
gc->wake_active &= ~mask;
- irq_gc_unlock(gc);
return 0;
}
+EXPORT_SYMBOL_GPL(irq_gc_set_wake);
static u32 irq_readl_be(void __iomem *addr)
{
@@ -201,16 +207,19 @@ static void irq_writel_be(u32 val, void __iomem *addr)
iowrite32be(val, addr);
}
-static void
-irq_init_generic_chip(struct irq_chip_generic *gc, const char *name,
- int num_ct, unsigned int irq_base,
- void __iomem *reg_base, irq_flow_handler_t handler)
+void irq_init_generic_chip(struct irq_chip_generic *gc, const char *name,
+ int num_ct, unsigned int irq_base,
+ void __iomem *reg_base, irq_flow_handler_t handler)
{
+ struct irq_chip_type *ct = gc->chip_types;
+ int i;
+
raw_spin_lock_init(&gc->lock);
gc->num_ct = num_ct;
gc->irq_base = irq_base;
gc->reg_base = reg_base;
- gc->chip_types->chip.name = name;
+ for (i = 0; i < num_ct; i++)
+ ct[i].chip.name = name;
gc->chip_types->handler = handler;
}
@@ -230,9 +239,8 @@ irq_alloc_generic_chip(const char *name, int num_ct, unsigned int irq_base,
void __iomem *reg_base, irq_flow_handler_t handler)
{
struct irq_chip_generic *gc;
- unsigned long sz = sizeof(*gc) + num_ct * sizeof(struct irq_chip_type);
- gc = kzalloc(sz, GFP_KERNEL);
+ gc = kzalloc(struct_size(gc, chip_types, num_ct), GFP_KERNEL);
if (gc) {
irq_init_generic_chip(gc, name, num_ct, irq_base, reg_base,
handler);
@@ -260,99 +268,174 @@ irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags)
}
/**
- * irq_alloc_domain_generic_chip - Allocate generic chips for an irq domain
- * @d: irq domain for which to allocate chips
- * @irqs_per_chip: Number of interrupts each chip handles
- * @num_ct: Number of irq_chip_type instances associated with this
- * @name: Name of the irq chip
- * @handler: Default flow handler associated with these chips
- * @clr: IRQ_* bits to clear in the mapping function
- * @set: IRQ_* bits to set in the mapping function
- * @gcflags: Generic chip specific setup flags
+ * irq_domain_alloc_generic_chips - Allocate generic chips for an irq domain
+ * @d: irq domain for which to allocate chips
+ * @info: Generic chip information
+ *
+ * Return: 0 on success, negative error code on failure
*/
-int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
- int num_ct, const char *name,
- irq_flow_handler_t handler,
- unsigned int clr, unsigned int set,
- enum irq_gc_flags gcflags)
+int irq_domain_alloc_generic_chips(struct irq_domain *d,
+ const struct irq_domain_chip_generic_info *info)
{
struct irq_domain_chip_generic *dgc;
struct irq_chip_generic *gc;
- int numchips, sz, i;
- unsigned long flags;
+ int numchips, i;
+ size_t dgc_sz;
+ size_t gc_sz;
+ size_t sz;
void *tmp;
+ int ret;
if (d->gc)
return -EBUSY;
- numchips = DIV_ROUND_UP(d->revmap_size, irqs_per_chip);
+ numchips = DIV_ROUND_UP(d->revmap_size, info->irqs_per_chip);
if (!numchips)
return -EINVAL;
/* Allocate a pointer, generic chip and chiptypes for each chip */
- sz = sizeof(*dgc) + numchips * sizeof(gc);
- sz += numchips * (sizeof(*gc) + num_ct * sizeof(struct irq_chip_type));
+ gc_sz = struct_size(gc, chip_types, info->num_ct);
+ dgc_sz = struct_size(dgc, gc, numchips);
+ sz = dgc_sz + numchips * gc_sz;
tmp = dgc = kzalloc(sz, GFP_KERNEL);
if (!dgc)
return -ENOMEM;
- dgc->irqs_per_chip = irqs_per_chip;
+ dgc->irqs_per_chip = info->irqs_per_chip;
dgc->num_chips = numchips;
- dgc->irq_flags_to_set = set;
- dgc->irq_flags_to_clear = clr;
- dgc->gc_flags = gcflags;
+ dgc->irq_flags_to_set = info->irq_flags_to_set;
+ dgc->irq_flags_to_clear = info->irq_flags_to_clear;
+ dgc->gc_flags = info->gc_flags;
+ dgc->exit = info->exit;
d->gc = dgc;
/* Calc pointer to the first generic chip */
- tmp += sizeof(*dgc) + numchips * sizeof(gc);
+ tmp += dgc_sz;
for (i = 0; i < numchips; i++) {
/* Store the pointer to the generic chip */
dgc->gc[i] = gc = tmp;
- irq_init_generic_chip(gc, name, num_ct, i * irqs_per_chip,
- NULL, handler);
+ irq_init_generic_chip(gc, info->name, info->num_ct,
+ i * dgc->irqs_per_chip, NULL,
+ info->handler);
gc->domain = d;
- if (gcflags & IRQ_GC_BE_IO) {
+ if (dgc->gc_flags & IRQ_GC_BE_IO) {
gc->reg_readl = &irq_readl_be;
gc->reg_writel = &irq_writel_be;
}
- raw_spin_lock_irqsave(&gc_lock, flags);
- list_add_tail(&gc->list, &gc_list);
- raw_spin_unlock_irqrestore(&gc_lock, flags);
+ if (info->init) {
+ ret = info->init(gc);
+ if (ret)
+ goto err;
+ }
+
+ scoped_guard (raw_spinlock_irqsave, &gc_lock)
+ list_add_tail(&gc->list, &gc_list);
/* Calc pointer to the next generic chip */
- tmp += sizeof(*gc) + num_ct * sizeof(struct irq_chip_type);
+ tmp += gc_sz;
}
- d->name = name;
return 0;
+
+err:
+ while (i--) {
+ if (dgc->exit)
+ dgc->exit(dgc->gc[i]);
+ irq_remove_generic_chip(dgc->gc[i], ~0U, 0, 0);
+ }
+ d->gc = NULL;
+ kfree(dgc);
+ return ret;
}
-EXPORT_SYMBOL_GPL(irq_alloc_domain_generic_chips);
+EXPORT_SYMBOL_GPL(irq_domain_alloc_generic_chips);
/**
- * irq_get_domain_generic_chip - Get a pointer to the generic chip of a hw_irq
- * @d: irq domain pointer
- * @hw_irq: Hardware interrupt number
+ * irq_domain_remove_generic_chips - Remove generic chips from an irq domain
+ * @d: irq domain for which generic chips are to be removed
*/
-struct irq_chip_generic *
-irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq)
+void irq_domain_remove_generic_chips(struct irq_domain *d)
+{
+ struct irq_domain_chip_generic *dgc = d->gc;
+ unsigned int i;
+
+ if (!dgc)
+ return;
+
+ for (i = 0; i < dgc->num_chips; i++) {
+ if (dgc->exit)
+ dgc->exit(dgc->gc[i]);
+ irq_remove_generic_chip(dgc->gc[i], ~0U, 0, 0);
+ }
+ d->gc = NULL;
+ kfree(dgc);
+}
+EXPORT_SYMBOL_GPL(irq_domain_remove_generic_chips);
+
+/**
+ * __irq_alloc_domain_generic_chips - Allocate generic chips for an irq domain
+ * @d: irq domain for which to allocate chips
+ * @irqs_per_chip: Number of interrupts each chip handles (max 32)
+ * @num_ct: Number of irq_chip_type instances associated with this
+ * @name: Name of the irq chip
+ * @handler: Default flow handler associated with these chips
+ * @clr: IRQ_* bits to clear in the mapping function
+ * @set: IRQ_* bits to set in the mapping function
+ * @gcflags: Generic chip specific setup flags
+ */
+int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
+ int num_ct, const char *name,
+ irq_flow_handler_t handler,
+ unsigned int clr, unsigned int set,
+ enum irq_gc_flags gcflags)
+{
+ struct irq_domain_chip_generic_info info = {
+ .irqs_per_chip = irqs_per_chip,
+ .num_ct = num_ct,
+ .name = name,
+ .handler = handler,
+ .irq_flags_to_clear = clr,
+ .irq_flags_to_set = set,
+ .gc_flags = gcflags,
+ };
+
+ return irq_domain_alloc_generic_chips(d, &info);
+}
+EXPORT_SYMBOL_GPL(__irq_alloc_domain_generic_chips);
+
+static struct irq_chip_generic *
+__irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq)
{
struct irq_domain_chip_generic *dgc = d->gc;
int idx;
if (!dgc)
- return NULL;
+ return ERR_PTR(-ENODEV);
idx = hw_irq / dgc->irqs_per_chip;
if (idx >= dgc->num_chips)
- return NULL;
+ return ERR_PTR(-EINVAL);
return dgc->gc[idx];
}
+
+/**
+ * irq_get_domain_generic_chip - Get a pointer to the generic chip of a hw_irq
+ * @d: irq domain pointer
+ * @hw_irq: Hardware interrupt number
+ */
+struct irq_chip_generic *
+irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq)
+{
+ struct irq_chip_generic *gc = __irq_get_domain_generic_chip(d, hw_irq);
+
+ return !IS_ERR(gc) ? gc : NULL;
+}
EXPORT_SYMBOL_GPL(irq_get_domain_generic_chip);
/*
- * Separate lockdep class for interrupt chip which can nest irq_desc
- * lock.
+ * Separate lockdep classes for interrupt chip which can nest irq_desc
+ * lock and request mutex.
*/
static struct lock_class_key irq_nested_lock_class;
+static struct lock_class_key irq_nested_request_class;
/*
* irq_map_generic_chip - Map a generic chip for an irq domain
@@ -360,21 +443,16 @@ static struct lock_class_key irq_nested_lock_class;
int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
irq_hw_number_t hw_irq)
{
- struct irq_data *data = irq_get_irq_data(virq);
+ struct irq_data *data = irq_domain_get_irq_data(d, virq);
struct irq_domain_chip_generic *dgc = d->gc;
struct irq_chip_generic *gc;
struct irq_chip_type *ct;
struct irq_chip *chip;
- unsigned long flags;
int idx;
- if (!d->gc)
- return -ENODEV;
-
- idx = hw_irq / dgc->irqs_per_chip;
- if (idx >= dgc->num_chips)
- return -EINVAL;
- gc = dgc->gc[idx];
+ gc = __irq_get_domain_generic_chip(d, hw_irq);
+ if (IS_ERR(gc))
+ return PTR_ERR(gc);
idx = hw_irq % dgc->irqs_per_chip;
@@ -389,31 +467,50 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
/* We only init the cache for the first mapping of a generic chip */
if (!gc->installed) {
- raw_spin_lock_irqsave(&gc->lock, flags);
+ guard(raw_spinlock_irqsave)(&gc->lock);
irq_gc_init_mask_cache(gc, dgc->gc_flags);
- raw_spin_unlock_irqrestore(&gc->lock, flags);
}
/* Mark the interrupt as installed */
set_bit(idx, &gc->installed);
if (dgc->gc_flags & IRQ_GC_INIT_NESTED_LOCK)
- irq_set_lockdep_class(virq, &irq_nested_lock_class);
+ irq_set_lockdep_class(virq, &irq_nested_lock_class,
+ &irq_nested_request_class);
if (chip->irq_calc_mask)
chip->irq_calc_mask(data);
else
data->mask = 1 << idx;
- irq_set_chip_and_handler(virq, chip, ct->handler);
- irq_set_chip_data(virq, gc);
+ irq_domain_set_info(d, virq, hw_irq, chip, gc, ct->handler, NULL, NULL);
irq_modify_status(virq, dgc->irq_flags_to_clear, dgc->irq_flags_to_set);
return 0;
}
-EXPORT_SYMBOL_GPL(irq_map_generic_chip);
-struct irq_domain_ops irq_generic_chip_ops = {
+void irq_unmap_generic_chip(struct irq_domain *d, unsigned int virq)
+{
+ struct irq_data *data = irq_domain_get_irq_data(d, virq);
+ struct irq_domain_chip_generic *dgc = d->gc;
+ unsigned int hw_irq = data->hwirq;
+ struct irq_chip_generic *gc;
+ int irq_idx;
+
+ gc = irq_get_domain_generic_chip(d, hw_irq);
+ if (!gc)
+ return;
+
+ irq_idx = hw_irq % dgc->irqs_per_chip;
+
+ clear_bit(irq_idx, &gc->installed);
+ irq_domain_set_info(d, virq, hw_irq, &no_irq_chip, NULL, NULL, NULL,
+ NULL);
+
+}
+
+const struct irq_domain_ops irq_generic_chip_ops = {
.map = irq_map_generic_chip,
+ .unmap = irq_unmap_generic_chip,
.xlate = irq_domain_xlate_onetwocell,
};
EXPORT_SYMBOL_GPL(irq_generic_chip_ops);
@@ -438,9 +535,8 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk,
struct irq_chip *chip = &ct->chip;
unsigned int i;
- raw_spin_lock(&gc_lock);
- list_add_tail(&gc->list, &gc_list);
- raw_spin_unlock(&gc_lock);
+ scoped_guard (raw_spinlock, &gc_lock)
+ list_add_tail(&gc->list, &gc_list);
irq_gc_init_mask_cache(gc, flags);
@@ -449,7 +545,8 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk,
continue;
if (flags & IRQ_GC_INIT_NESTED_LOCK)
- irq_set_lockdep_class(i, &irq_nested_lock_class);
+ irq_set_lockdep_class(i, &irq_nested_lock_class,
+ &irq_nested_request_class);
if (!(flags & IRQ_GC_NO_MASK)) {
struct irq_data *d = irq_get_irq_data(i);
@@ -503,21 +600,33 @@ EXPORT_SYMBOL_GPL(irq_setup_alt_chip);
void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk,
unsigned int clr, unsigned int set)
{
- unsigned int i = gc->irq_base;
+ unsigned int i, virq;
- raw_spin_lock(&gc_lock);
- list_del(&gc->list);
- raw_spin_unlock(&gc_lock);
+ scoped_guard (raw_spinlock, &gc_lock)
+ list_del(&gc->list);
- for (; msk; msk >>= 1, i++) {
+ for (i = 0; msk; msk >>= 1, i++) {
if (!(msk & 0x01))
continue;
+ /*
+ * Interrupt domain based chips store the base hardware
+ * interrupt number in gc::irq_base. Otherwise gc::irq_base
+ * contains the base Linux interrupt number.
+ */
+ if (gc->domain) {
+ virq = irq_find_mapping(gc->domain, gc->irq_base + i);
+ if (!virq)
+ continue;
+ } else {
+ virq = gc->irq_base + i;
+ }
+
/* Remove handler first. That will mask the irq line */
- irq_set_handler(i, NULL);
- irq_set_chip(i, &no_irq_chip);
- irq_set_chip_data(i, NULL);
- irq_modify_status(i, clr, set);
+ irq_set_handler(virq, NULL);
+ irq_set_chip(virq, &no_irq_chip);
+ irq_set_chip_data(virq, NULL);
+ irq_modify_status(virq, clr, set);
}
}
EXPORT_SYMBOL_GPL(irq_remove_generic_chip);
@@ -541,7 +650,7 @@ static struct irq_data *irq_gc_get_irq_data(struct irq_chip_generic *gc)
}
#ifdef CONFIG_PM
-static int irq_gc_suspend(void)
+static int irq_gc_suspend(void *data)
{
struct irq_chip_generic *gc;
@@ -554,17 +663,23 @@ static int irq_gc_suspend(void)
if (data)
ct->chip.irq_suspend(data);
}
+
+ if (gc->suspend)
+ gc->suspend(gc);
}
return 0;
}
-static void irq_gc_resume(void)
+static void irq_gc_resume(void *data)
{
struct irq_chip_generic *gc;
list_for_each_entry(gc, &gc_list, list) {
struct irq_chip_type *ct = gc->chip_types;
+ if (gc->resume)
+ gc->resume(gc);
+
if (ct->chip.irq_resume) {
struct irq_data *data = irq_gc_get_irq_data(gc);
@@ -578,7 +693,7 @@ static void irq_gc_resume(void)
#define irq_gc_resume NULL
#endif
-static void irq_gc_shutdown(void)
+static void irq_gc_shutdown(void *data)
{
struct irq_chip_generic *gc;
@@ -594,15 +709,19 @@ static void irq_gc_shutdown(void)
}
}
-static struct syscore_ops irq_gc_syscore_ops = {
+static const struct syscore_ops irq_gc_syscore_ops = {
.suspend = irq_gc_suspend,
.resume = irq_gc_resume,
.shutdown = irq_gc_shutdown,
};
+static struct syscore irq_gc_syscore = {
+ .ops = &irq_gc_syscore_ops,
+};
+
static int __init irq_gc_init_ops(void)
{
- register_syscore_ops(&irq_gc_syscore_ops);
+ register_syscore(&irq_gc_syscore);
return 0;
}
device_initcall(irq_gc_init_ops);
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 635480270858..786f5570a640 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -1,12 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0
/*
- * linux/kernel/irq/handle.c
- *
* Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
* Copyright (C) 2005-2006, Thomas Gleixner, Russell King
*
- * This file contains the core interrupt handling code.
- *
- * Detailed information is available in Documentation/DocBook/genericirq
+ * This file contains the core interrupt handling code. Detailed
+ * information is available in Documentation/core-api/genericirq.rst
*
*/
@@ -16,23 +14,31 @@
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
+#include <asm/irq_regs.h>
+
#include <trace/events/irq.h>
#include "internals.h"
+#ifdef CONFIG_GENERIC_IRQ_MULTI_HANDLER
+void (*handle_arch_irq)(struct pt_regs *) __ro_after_init;
+#endif
+
/**
* handle_bad_irq - handle spurious and unhandled irqs
- * @irq: the interrupt number
* @desc: description of the interrupt
*
* Handles spurious and unhandled IRQ's. It also prints a debugmessage.
*/
-void handle_bad_irq(unsigned int irq, struct irq_desc *desc)
+void handle_bad_irq(struct irq_desc *desc)
{
+ unsigned int irq = irq_desc_get_irq(desc);
+
print_irq_desc(irq, desc);
- kstat_incr_irqs_this_cpu(irq, desc);
+ kstat_incr_irqs_this_cpu(desc);
ack_bad_irq(irq);
}
+EXPORT_SYMBOL_GPL(handle_bad_irq);
/*
* Special, empty irq handler:
@@ -127,23 +133,87 @@ void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
*/
atomic_inc(&desc->threads_active);
- wake_up_process(action->thread);
+ /*
+ * This might be a premature wakeup before the thread reached the
+ * thread function and set the IRQTF_READY bit. It's waiting in
+ * kthread code with state UNINTERRUPTIBLE. Once it reaches the
+ * thread function it waits with INTERRUPTIBLE. The wakeup is not
+ * lost in that case because the thread is guaranteed to observe
+ * the RUN flag before it goes to sleep in wait_for_interrupt().
+ */
+ wake_up_state(action->thread, TASK_INTERRUPTIBLE);
}
-irqreturn_t
-handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
+static DEFINE_STATIC_KEY_FALSE(irqhandler_duration_check_enabled);
+static u64 irqhandler_duration_threshold_ns __ro_after_init;
+
+static int __init irqhandler_duration_check_setup(char *arg)
+{
+ unsigned long val;
+ int ret;
+
+ ret = kstrtoul(arg, 0, &val);
+ if (ret) {
+ pr_err("Unable to parse irqhandler.duration_warn_us setting: ret=%d\n", ret);
+ return 0;
+ }
+
+ if (!val) {
+ pr_err("Invalid irqhandler.duration_warn_us setting, must be > 0\n");
+ return 0;
+ }
+
+ irqhandler_duration_threshold_ns = val * 1000;
+ static_branch_enable(&irqhandler_duration_check_enabled);
+
+ return 1;
+}
+__setup("irqhandler.duration_warn_us=", irqhandler_duration_check_setup);
+
+static inline void irqhandler_duration_check(u64 ts_start, unsigned int irq,
+ const struct irqaction *action)
+{
+ u64 delta_ns = local_clock() - ts_start;
+
+ if (unlikely(delta_ns > irqhandler_duration_threshold_ns)) {
+ pr_warn_ratelimited("[CPU%u] long duration of IRQ[%u:%ps], took: %llu us\n",
+ smp_processor_id(), irq, action->handler,
+ div_u64(delta_ns, NSEC_PER_USEC));
+ }
+}
+
+irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc)
{
irqreturn_t retval = IRQ_NONE;
- unsigned int flags = 0, irq = desc->irq_data.irq;
+ unsigned int irq = desc->irq_data.irq;
+ struct irqaction *action;
- do {
+ record_irq_time(desc);
+
+ for_each_action_of_desc(desc, action) {
irqreturn_t res;
+ /*
+ * If this IRQ would be threaded under force_irqthreads, mark it so.
+ */
+ if (irq_settings_can_thread(desc) &&
+ !(action->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT)))
+ lockdep_hardirq_threaded();
+
trace_irq_handler_entry(irq, action);
- res = action->handler(irq, action->dev_id);
+
+ if (static_branch_unlikely(&irqhandler_duration_check_enabled)) {
+ u64 ts_start = local_clock();
+
+ res = action->handler(irq, action->dev_id);
+ irqhandler_duration_check(ts_start, irq, action);
+ } else {
+ res = action->handler(irq, action->dev_id);
+ }
+
trace_irq_handler_exit(irq, action, res);
- if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pF enabled interrupts\n",
+ if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pS enabled interrupts\n",
irq, action->handler))
local_irq_disable();
@@ -159,10 +229,6 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
}
__irq_wake_thread(desc, action);
-
- /* Fall through to add to randomness */
- case IRQ_HANDLED:
- flags |= action->flags;
break;
default:
@@ -170,28 +236,62 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
}
retval |= res;
- action = action->next;
- } while (action);
+ }
+
+ return retval;
+}
+
+irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
+{
+ irqreturn_t retval;
+
+ retval = __handle_irq_event_percpu(desc);
- add_interrupt_randomness(irq, flags);
+ add_interrupt_randomness(desc->irq_data.irq);
- if (!noirqdebug)
- note_interrupt(irq, desc, retval);
+ if (!irq_settings_no_debug(desc))
+ note_interrupt(desc, retval);
return retval;
}
irqreturn_t handle_irq_event(struct irq_desc *desc)
{
- struct irqaction *action = desc->action;
irqreturn_t ret;
desc->istate &= ~IRQS_PENDING;
irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
raw_spin_unlock(&desc->lock);
- ret = handle_irq_event_percpu(desc, action);
+ ret = handle_irq_event_percpu(desc);
raw_spin_lock(&desc->lock);
irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
return ret;
}
+
+#ifdef CONFIG_GENERIC_IRQ_MULTI_HANDLER
+int __init set_handle_irq(void (*handle_irq)(struct pt_regs *))
+{
+ if (handle_arch_irq)
+ return -EBUSY;
+
+ handle_arch_irq = handle_irq;
+ return 0;
+}
+
+/**
+ * generic_handle_arch_irq - root irq handler for architectures which do no
+ * entry accounting themselves
+ * @regs: Register file coming from the low-level handling code
+ */
+asmlinkage void noinstr generic_handle_arch_irq(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs;
+
+ irq_enter();
+ old_regs = set_irq_regs(regs);
+ handle_arch_irq(regs);
+ set_irq_regs(old_regs);
+ irq_exit();
+}
+#endif
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index df553b0af936..0164ca48da59 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* IRQ subsystem internal functions and variables:
*
@@ -7,16 +8,21 @@
*/
#include <linux/irqdesc.h>
#include <linux/kernel_stat.h>
+#include <linux/pm_runtime.h>
+#include <linux/sched/clock.h>
#ifdef CONFIG_SPARSE_IRQ
-# define IRQ_BITMAP_BITS (NR_IRQS + 8196)
+# define MAX_SPARSE_IRQS INT_MAX
#else
-# define IRQ_BITMAP_BITS NR_IRQS
+# define MAX_SPARSE_IRQS NR_IRQS
#endif
#define istate core_internal_state__do_not_mess_with_it
extern bool noirqdebug;
+extern int irq_poll_cpu;
+
+extern struct irqaction chained_action;
/*
* Bits used by threaded handlers:
@@ -24,12 +30,14 @@ extern bool noirqdebug;
* IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed
* IRQTF_AFFINITY - irq thread is requested to adjust affinity
* IRQTF_FORCED_THREAD - irq action is force threaded
+ * IRQTF_READY - signals that irq thread is ready
*/
enum {
IRQTF_RUNTHREAD,
IRQTF_WARNED,
IRQTF_AFFINITY,
IRQTF_FORCED_THREAD,
+ IRQTF_READY,
};
/*
@@ -40,10 +48,15 @@ enum {
* detection
* IRQS_POLL_INPROGRESS - polling in progress
* IRQS_ONESHOT - irq is not unmasked in primary handler
- * IRQS_REPLAY - irq is replayed
+ * IRQS_REPLAY - irq has been resent and will not be resent
+ * again until the handler has run and cleared
+ * this flag.
* IRQS_WAITING - irq is waiting
- * IRQS_PENDING - irq is pending and replayed later
+ * IRQS_PENDING - irq needs to be resent and should be resent
+ * at the next available opportunity.
* IRQS_SUSPENDED - irq is suspended
+ * IRQS_NMI - irq line is used to deliver NMIs
+ * IRQS_SYSFS - descriptor has been added to sysfs
*/
enum {
IRQS_AUTODETECT = 0x00000001,
@@ -54,21 +67,31 @@ enum {
IRQS_WAITING = 0x00000080,
IRQS_PENDING = 0x00000200,
IRQS_SUSPENDED = 0x00000800,
+ IRQS_TIMINGS = 0x00001000,
+ IRQS_NMI = 0x00002000,
+ IRQS_SYSFS = 0x00004000,
};
#include "debug.h"
#include "settings.h"
-#define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data)
+extern int __irq_set_trigger(struct irq_desc *desc, unsigned long flags);
+extern void __disable_irq(struct irq_desc *desc);
+extern void __enable_irq(struct irq_desc *desc);
+
+#define IRQ_RESEND true
+#define IRQ_NORESEND false
+
+#define IRQ_START_FORCE true
+#define IRQ_START_COND false
-extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
- unsigned long flags);
-extern void __disable_irq(struct irq_desc *desc, unsigned int irq);
-extern void __enable_irq(struct irq_desc *desc, unsigned int irq);
+extern int irq_activate(struct irq_desc *desc);
+extern int irq_activate_and_startup(struct irq_desc *desc, bool resend);
+extern int irq_startup(struct irq_desc *desc, bool resend, bool force);
+extern void irq_startup_managed(struct irq_desc *desc);
-extern int irq_startup(struct irq_desc *desc, bool resend);
extern void irq_shutdown(struct irq_desc *desc);
-extern void irq_enable(struct irq_desc *desc);
+extern void irq_shutdown_and_deactivate(struct irq_desc *desc);
extern void irq_disable(struct irq_desc *desc);
extern void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu);
extern void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu);
@@ -78,24 +101,22 @@ extern void unmask_threaded_irq(struct irq_desc *desc);
#ifdef CONFIG_SPARSE_IRQ
static inline void irq_mark_irq(unsigned int irq) { }
-extern void irq_lock_sparse(void);
-extern void irq_unlock_sparse(void);
#else
extern void irq_mark_irq(unsigned int irq);
-static inline void irq_lock_sparse(void) { }
-static inline void irq_unlock_sparse(void) { }
#endif
-extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
-
-irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action);
+irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc);
+irqreturn_t handle_irq_event_percpu(struct irq_desc *desc);
irqreturn_t handle_irq_event(struct irq_desc *desc);
/* Resending of interrupts :*/
-void check_irq_resend(struct irq_desc *desc, unsigned int irq);
-bool irq_wait_for_poll(struct irq_desc *desc);
+int check_irq_resend(struct irq_desc *desc, bool inject);
+void clear_irq_resend(struct irq_desc *desc);
+void irq_resend_init(struct irq_desc *desc);
void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action);
+void wake_threads_waitq(struct irq_desc *desc);
+
#ifdef CONFIG_PROC_FS
extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);
extern void unregister_irq_proc(unsigned int irq, struct irq_desc *desc);
@@ -110,13 +131,21 @@ static inline void unregister_handler_proc(unsigned int irq,
struct irqaction *action) { }
#endif
-extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask);
-
-extern void irq_set_thread_affinity(struct irq_desc *desc);
+extern bool irq_can_set_affinity_usr(unsigned int irq);
extern int irq_do_set_affinity(struct irq_data *data,
const struct cpumask *dest, bool force);
+#ifdef CONFIG_SMP
+extern int irq_setup_affinity(struct irq_desc *desc);
+#else
+static inline int irq_setup_affinity(struct irq_desc *desc) { return 0; }
+#endif
+
+
+#define for_each_action_of_desc(desc, act) \
+ for (act = desc->action; act; act = act->next)
+
/* Inline functions for support of irq chips on slow busses */
static inline void chip_bus_lock(struct irq_desc *desc)
{
@@ -136,33 +165,38 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc)
#define IRQ_GET_DESC_CHECK_GLOBAL (_IRQ_DESC_CHECK)
#define IRQ_GET_DESC_CHECK_PERCPU (_IRQ_DESC_CHECK | _IRQ_DESC_PERCPU)
-struct irq_desc *
-__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus,
- unsigned int check);
+struct irq_desc *__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus,
+ unsigned int check);
void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus);
-static inline struct irq_desc *
-irq_get_desc_buslock(unsigned int irq, unsigned long *flags, unsigned int check)
-{
- return __irq_get_desc_lock(irq, flags, true, check);
-}
+__DEFINE_CLASS_IS_CONDITIONAL(irqdesc_lock, true);
+__DEFINE_UNLOCK_GUARD(irqdesc_lock, struct irq_desc,
+ __irq_put_desc_unlock(_T->lock, _T->flags, _T->bus),
+ unsigned long flags; bool bus);
-static inline void
-irq_put_desc_busunlock(struct irq_desc *desc, unsigned long flags)
+static inline class_irqdesc_lock_t class_irqdesc_lock_constructor(unsigned int irq, bool bus,
+ unsigned int check)
{
- __irq_put_desc_unlock(desc, flags, true);
-}
+ class_irqdesc_lock_t _t = { .bus = bus, };
-static inline struct irq_desc *
-irq_get_desc_lock(unsigned int irq, unsigned long *flags, unsigned int check)
-{
- return __irq_get_desc_lock(irq, flags, false, check);
+ _t.lock = __irq_get_desc_lock(irq, &_t.flags, bus, check);
+
+ return _t;
}
-static inline void
-irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags)
+#define scoped_irqdesc_get_and_lock(_irq, _check) \
+ scoped_guard(irqdesc_lock, _irq, false, _check)
+
+#define scoped_irqdesc_get_and_buslock(_irq, _check) \
+ scoped_guard(irqdesc_lock, _irq, true, _check)
+
+#define scoped_irqdesc ((struct irq_desc *)(__guard_ptr(irqdesc_lock)(&scope)))
+
+#define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors)
+
+static inline unsigned int irqd_get(struct irq_data *d)
{
- __irq_put_desc_unlock(desc, flags, false);
+ return __irqd_to_state(d);
}
/*
@@ -170,43 +204,320 @@ irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags)
*/
static inline void irqd_set_move_pending(struct irq_data *d)
{
- d->state_use_accessors |= IRQD_SETAFFINITY_PENDING;
+ __irqd_to_state(d) |= IRQD_SETAFFINITY_PENDING;
}
static inline void irqd_clr_move_pending(struct irq_data *d)
{
- d->state_use_accessors &= ~IRQD_SETAFFINITY_PENDING;
+ __irqd_to_state(d) &= ~IRQD_SETAFFINITY_PENDING;
+}
+
+static inline void irqd_set_managed_shutdown(struct irq_data *d)
+{
+ __irqd_to_state(d) |= IRQD_MANAGED_SHUTDOWN;
+}
+
+static inline void irqd_clr_managed_shutdown(struct irq_data *d)
+{
+ __irqd_to_state(d) &= ~IRQD_MANAGED_SHUTDOWN;
}
static inline void irqd_clear(struct irq_data *d, unsigned int mask)
{
- d->state_use_accessors &= ~mask;
+ __irqd_to_state(d) &= ~mask;
}
static inline void irqd_set(struct irq_data *d, unsigned int mask)
{
- d->state_use_accessors |= mask;
+ __irqd_to_state(d) |= mask;
}
static inline bool irqd_has_set(struct irq_data *d, unsigned int mask)
{
- return d->state_use_accessors & mask;
+ return __irqd_to_state(d) & mask;
}
-static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *desc)
+static inline void irq_state_set_disabled(struct irq_desc *desc)
{
- __this_cpu_inc(*desc->kstat_irqs);
+ irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED);
+}
+
+static inline void irq_state_set_masked(struct irq_desc *desc)
+{
+ irqd_set(&desc->irq_data, IRQD_IRQ_MASKED);
+}
+
+#undef __irqd_to_state
+
+static inline void __kstat_incr_irqs_this_cpu(struct irq_desc *desc)
+{
+ __this_cpu_inc(desc->kstat_irqs->cnt);
__this_cpu_inc(kstat.irqs_sum);
}
+static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc)
+{
+ __kstat_incr_irqs_this_cpu(desc);
+ desc->tot_count++;
+}
+
+static inline int irq_desc_get_node(struct irq_desc *desc)
+{
+ return irq_common_data_get_node(&desc->irq_common_data);
+}
+
+static inline int irq_desc_is_chained(struct irq_desc *desc)
+{
+ return (desc->action && desc->action == &chained_action);
+}
+
+static inline bool irq_is_nmi(struct irq_desc *desc)
+{
+ return desc->istate & IRQS_NMI;
+}
+
#ifdef CONFIG_PM_SLEEP
-bool irq_pm_check_wakeup(struct irq_desc *desc);
+void irq_pm_handle_wakeup(struct irq_desc *desc);
void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action);
void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action);
#else
-static inline bool irq_pm_check_wakeup(struct irq_desc *desc) { return false; }
+static inline void irq_pm_handle_wakeup(struct irq_desc *desc) { }
static inline void
irq_pm_install_action(struct irq_desc *desc, struct irqaction *action) { }
static inline void
irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) { }
#endif
+
+#ifdef CONFIG_IRQ_TIMINGS
+
+#define IRQ_TIMINGS_SHIFT 5
+#define IRQ_TIMINGS_SIZE (1 << IRQ_TIMINGS_SHIFT)
+#define IRQ_TIMINGS_MASK (IRQ_TIMINGS_SIZE - 1)
+
+/**
+ * struct irq_timings - irq timings storing structure
+ * @values: a circular buffer of u64 encoded <timestamp,irq> values
+ * @count: the number of elements in the array
+ */
+struct irq_timings {
+ u64 values[IRQ_TIMINGS_SIZE];
+ int count;
+};
+
+DECLARE_PER_CPU(struct irq_timings, irq_timings);
+
+extern void irq_timings_free(int irq);
+extern int irq_timings_alloc(int irq);
+
+static inline void irq_remove_timings(struct irq_desc *desc)
+{
+ desc->istate &= ~IRQS_TIMINGS;
+
+ irq_timings_free(irq_desc_get_irq(desc));
+}
+
+static inline void irq_setup_timings(struct irq_desc *desc, struct irqaction *act)
+{
+ int irq = irq_desc_get_irq(desc);
+ int ret;
+
+ /*
+ * We don't need the measurement because the idle code already
+ * knows the next expiry event.
+ */
+ if (act->flags & __IRQF_TIMER)
+ return;
+
+ /*
+ * In case the timing allocation fails, we just want to warn,
+ * not fail, so letting the system boot anyway.
+ */
+ ret = irq_timings_alloc(irq);
+ if (ret) {
+ pr_warn("Failed to allocate irq timing stats for irq%d (%d)",
+ irq, ret);
+ return;
+ }
+
+ desc->istate |= IRQS_TIMINGS;
+}
+
+extern void irq_timings_enable(void);
+extern void irq_timings_disable(void);
+
+DECLARE_STATIC_KEY_FALSE(irq_timing_enabled);
+
+/*
+ * The interrupt number and the timestamp are encoded into a single
+ * u64 variable to optimize the size.
+ * 48 bit time stamp and 16 bit IRQ number is way sufficient.
+ * Who cares an IRQ after 78 hours of idle time?
+ */
+static inline u64 irq_timing_encode(u64 timestamp, int irq)
+{
+ return (timestamp << 16) | irq;
+}
+
+static inline int irq_timing_decode(u64 value, u64 *timestamp)
+{
+ *timestamp = value >> 16;
+ return value & U16_MAX;
+}
+
+static __always_inline void irq_timings_push(u64 ts, int irq)
+{
+ struct irq_timings *timings = this_cpu_ptr(&irq_timings);
+
+ timings->values[timings->count & IRQ_TIMINGS_MASK] =
+ irq_timing_encode(ts, irq);
+
+ timings->count++;
+}
+
+/*
+ * The function record_irq_time is only called in one place in the
+ * interrupts handler. We want this function always inline so the code
+ * inside is embedded in the function and the static key branching
+ * code can act at the higher level. Without the explicit
+ * __always_inline we can end up with a function call and a small
+ * overhead in the hotpath for nothing.
+ */
+static __always_inline void record_irq_time(struct irq_desc *desc)
+{
+ if (!static_branch_likely(&irq_timing_enabled))
+ return;
+
+ if (desc->istate & IRQS_TIMINGS)
+ irq_timings_push(local_clock(), irq_desc_get_irq(desc));
+}
+#else
+static inline void irq_remove_timings(struct irq_desc *desc) {}
+static inline void irq_setup_timings(struct irq_desc *desc,
+ struct irqaction *act) {};
+static inline void record_irq_time(struct irq_desc *desc) {}
+#endif /* CONFIG_IRQ_TIMINGS */
+
+
+#ifdef CONFIG_GENERIC_IRQ_CHIP
+void irq_init_generic_chip(struct irq_chip_generic *gc, const char *name,
+ int num_ct, unsigned int irq_base,
+ void __iomem *reg_base, irq_flow_handler_t handler);
+#else
+static inline void
+irq_init_generic_chip(struct irq_chip_generic *gc, const char *name,
+ int num_ct, unsigned int irq_base,
+ void __iomem *reg_base, irq_flow_handler_t handler) { }
+#endif /* CONFIG_GENERIC_IRQ_CHIP */
+
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+static inline bool irq_can_move_pcntxt(struct irq_data *data)
+{
+ return !(data->chip->flags & IRQCHIP_MOVE_DEFERRED);
+}
+static inline bool irq_move_pending(struct irq_data *data)
+{
+ return irqd_is_setaffinity_pending(data);
+}
+static inline void
+irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask)
+{
+ cpumask_copy(desc->pending_mask, mask);
+}
+static inline void
+irq_get_pending(struct cpumask *mask, struct irq_desc *desc)
+{
+ cpumask_copy(mask, desc->pending_mask);
+}
+static inline struct cpumask *irq_desc_get_pending_mask(struct irq_desc *desc)
+{
+ return desc->pending_mask;
+}
+bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear);
+void irq_force_complete_move(struct irq_desc *desc);
+#else /* CONFIG_GENERIC_PENDING_IRQ */
+static inline bool irq_can_move_pcntxt(struct irq_data *data)
+{
+ return true;
+}
+static inline bool irq_move_pending(struct irq_data *data)
+{
+ return false;
+}
+static inline void
+irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask)
+{
+}
+static inline void
+irq_get_pending(struct cpumask *mask, struct irq_desc *desc)
+{
+}
+static inline struct cpumask *irq_desc_get_pending_mask(struct irq_desc *desc)
+{
+ return NULL;
+}
+static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear)
+{
+ return false;
+}
+static inline void irq_force_complete_move(struct irq_desc *desc) { }
+#endif /* !CONFIG_GENERIC_PENDING_IRQ */
+
+#if !defined(CONFIG_IRQ_DOMAIN) || !defined(CONFIG_IRQ_DOMAIN_HIERARCHY)
+static inline int irq_domain_activate_irq(struct irq_data *data, bool reserve)
+{
+ irqd_set_activated(data);
+ return 0;
+}
+static inline void irq_domain_deactivate_irq(struct irq_data *data)
+{
+ irqd_clr_activated(data);
+}
+#endif
+
+static inline struct irq_data *irqd_get_parent_data(struct irq_data *irqd)
+{
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+ return irqd->parent_data;
+#else
+ return NULL;
+#endif
+}
+
+#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
+#include <linux/debugfs.h>
+
+struct irq_bit_descr {
+ unsigned int mask;
+ char *name;
+};
+
+#define BIT_MASK_DESCR(m) { .mask = m, .name = #m }
+
+void irq_debug_show_bits(struct seq_file *m, int ind, unsigned int state,
+ const struct irq_bit_descr *sd, int size);
+
+void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc);
+static inline void irq_remove_debugfs_entry(struct irq_desc *desc)
+{
+ debugfs_remove(desc->debugfs_file);
+ kfree(desc->dev_name);
+}
+void irq_debugfs_copy_devname(int irq, struct device *dev);
+# ifdef CONFIG_IRQ_DOMAIN
+void irq_domain_debugfs_init(struct dentry *root);
+# else
+static inline void irq_domain_debugfs_init(struct dentry *root)
+{
+}
+# endif
+#else /* CONFIG_GENERIC_IRQ_DEBUGFS */
+static inline void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *d)
+{
+}
+static inline void irq_remove_debugfs_entry(struct irq_desc *d)
+{
+}
+static inline void irq_debugfs_copy_devname(int irq, struct device *dev)
+{
+}
+#endif /* CONFIG_GENERIC_IRQ_DEBUGFS */
diff --git a/kernel/irq/ipi-mux.c b/kernel/irq/ipi-mux.c
new file mode 100644
index 000000000000..fa4fc18c6131
--- /dev/null
+++ b/kernel/irq/ipi-mux.c
@@ -0,0 +1,206 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Multiplex several virtual IPIs over a single HW IPI.
+ *
+ * Copyright The Asahi Linux Contributors
+ * Copyright (c) 2022 Ventana Micro Systems Inc.
+ */
+
+#define pr_fmt(fmt) "ipi-mux: " fmt
+#include <linux/cpu.h>
+#include <linux/init.h>
+#include <linux/irq.h>
+#include <linux/irqchip.h>
+#include <linux/irqchip/chained_irq.h>
+#include <linux/irqdomain.h>
+#include <linux/jump_label.h>
+#include <linux/percpu.h>
+#include <linux/smp.h>
+
+struct ipi_mux_cpu {
+ atomic_t enable;
+ atomic_t bits;
+};
+
+static struct ipi_mux_cpu __percpu *ipi_mux_pcpu;
+static struct irq_domain *ipi_mux_domain;
+static void (*ipi_mux_send)(unsigned int cpu);
+
+static void ipi_mux_mask(struct irq_data *d)
+{
+ struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu);
+
+ atomic_andnot(BIT(irqd_to_hwirq(d)), &icpu->enable);
+}
+
+static void ipi_mux_unmask(struct irq_data *d)
+{
+ struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu);
+ u32 ibit = BIT(irqd_to_hwirq(d));
+
+ atomic_or(ibit, &icpu->enable);
+
+ /*
+ * The atomic_or() above must complete before the atomic_read()
+ * below to avoid racing ipi_mux_send_mask().
+ */
+ smp_mb__after_atomic();
+
+ /* If a pending IPI was unmasked, raise a parent IPI immediately. */
+ if (atomic_read(&icpu->bits) & ibit)
+ ipi_mux_send(smp_processor_id());
+}
+
+static void ipi_mux_send_mask(struct irq_data *d, const struct cpumask *mask)
+{
+ struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu);
+ u32 ibit = BIT(irqd_to_hwirq(d));
+ unsigned long pending;
+ int cpu;
+
+ for_each_cpu(cpu, mask) {
+ icpu = per_cpu_ptr(ipi_mux_pcpu, cpu);
+
+ /*
+ * This sequence is the mirror of the one in ipi_mux_unmask();
+ * see the comment there. Additionally, release semantics
+ * ensure that the vIPI flag set is ordered after any shared
+ * memory accesses that precede it. This therefore also pairs
+ * with the atomic_fetch_andnot in ipi_mux_process().
+ */
+ pending = atomic_fetch_or_release(ibit, &icpu->bits);
+
+ /*
+ * The atomic_fetch_or_release() above must complete
+ * before the atomic_read() below to avoid racing with
+ * ipi_mux_unmask().
+ */
+ smp_mb__after_atomic();
+
+ /*
+ * The flag writes must complete before the physical IPI is
+ * issued to another CPU. This is implied by the control
+ * dependency on the result of atomic_read() below, which is
+ * itself already ordered after the vIPI flag write.
+ */
+ if (!(pending & ibit) && (atomic_read(&icpu->enable) & ibit))
+ ipi_mux_send(cpu);
+ }
+}
+
+static const struct irq_chip ipi_mux_chip = {
+ .name = "IPI Mux",
+ .irq_mask = ipi_mux_mask,
+ .irq_unmask = ipi_mux_unmask,
+ .ipi_send_mask = ipi_mux_send_mask,
+};
+
+static int ipi_mux_domain_alloc(struct irq_domain *d, unsigned int virq,
+ unsigned int nr_irqs, void *arg)
+{
+ int i;
+
+ for (i = 0; i < nr_irqs; i++) {
+ irq_set_percpu_devid(virq + i);
+ irq_domain_set_info(d, virq + i, i, &ipi_mux_chip, NULL,
+ handle_percpu_devid_irq, NULL, NULL);
+ }
+
+ return 0;
+}
+
+static const struct irq_domain_ops ipi_mux_domain_ops = {
+ .alloc = ipi_mux_domain_alloc,
+ .free = irq_domain_free_irqs_top,
+};
+
+/**
+ * ipi_mux_process - Process multiplexed virtual IPIs
+ */
+void ipi_mux_process(void)
+{
+ struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu);
+ irq_hw_number_t hwirq;
+ unsigned long ipis;
+ unsigned int en;
+
+ /*
+ * Reading enable mask does not need to be ordered as long as
+ * this function is called from interrupt handler because only
+ * the CPU itself can change it's own enable mask.
+ */
+ en = atomic_read(&icpu->enable);
+
+ /*
+ * Clear the IPIs we are about to handle. This pairs with the
+ * atomic_fetch_or_release() in ipi_mux_send_mask().
+ */
+ ipis = atomic_fetch_andnot(en, &icpu->bits) & en;
+
+ for_each_set_bit(hwirq, &ipis, BITS_PER_TYPE(int))
+ generic_handle_domain_irq(ipi_mux_domain, hwirq);
+}
+
+/**
+ * ipi_mux_create - Create virtual IPIs multiplexed on top of a single
+ * parent IPI.
+ * @nr_ipi: number of virtual IPIs to create. This should
+ * be <= BITS_PER_TYPE(int)
+ * @mux_send: callback to trigger parent IPI for a particular CPU
+ *
+ * Returns first virq of the newly created virtual IPIs upon success
+ * or <=0 upon failure
+ */
+int ipi_mux_create(unsigned int nr_ipi, void (*mux_send)(unsigned int cpu))
+{
+ struct fwnode_handle *fwnode;
+ struct irq_domain *domain;
+ int rc;
+
+ if (ipi_mux_domain)
+ return -EEXIST;
+
+ if (BITS_PER_TYPE(int) < nr_ipi || !mux_send)
+ return -EINVAL;
+
+ ipi_mux_pcpu = alloc_percpu(typeof(*ipi_mux_pcpu));
+ if (!ipi_mux_pcpu)
+ return -ENOMEM;
+
+ fwnode = irq_domain_alloc_named_fwnode("IPI-Mux");
+ if (!fwnode) {
+ pr_err("unable to create IPI Mux fwnode\n");
+ rc = -ENOMEM;
+ goto fail_free_cpu;
+ }
+
+ domain = irq_domain_create_linear(fwnode, nr_ipi,
+ &ipi_mux_domain_ops, NULL);
+ if (!domain) {
+ pr_err("unable to add IPI Mux domain\n");
+ rc = -ENOMEM;
+ goto fail_free_fwnode;
+ }
+
+ domain->flags |= IRQ_DOMAIN_FLAG_IPI_SINGLE;
+ irq_domain_update_bus_token(domain, DOMAIN_BUS_IPI);
+
+ rc = irq_domain_alloc_irqs(domain, nr_ipi, NUMA_NO_NODE, NULL);
+ if (rc <= 0) {
+ pr_err("unable to alloc IRQs from IPI Mux domain\n");
+ goto fail_free_domain;
+ }
+
+ ipi_mux_domain = domain;
+ ipi_mux_send = mux_send;
+
+ return rc;
+
+fail_free_domain:
+ irq_domain_remove(domain);
+fail_free_fwnode:
+ irq_domain_free_fwnode(fwnode);
+fail_free_cpu:
+ free_percpu(ipi_mux_pcpu);
+ return rc;
+}
diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c
new file mode 100644
index 000000000000..961d4af76af3
--- /dev/null
+++ b/kernel/irq/ipi.c
@@ -0,0 +1,345 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2015 Imagination Technologies Ltd
+ * Author: Qais Yousef <qais.yousef@imgtec.com>
+ *
+ * This file contains driver APIs to the IPI subsystem.
+ */
+
+#define pr_fmt(fmt) "genirq/ipi: " fmt
+
+#include <linux/irqdomain.h>
+#include <linux/irq.h>
+
+/**
+ * irq_reserve_ipi() - Setup an IPI to destination cpumask
+ * @domain: IPI domain
+ * @dest: cpumask of CPUs which can receive the IPI
+ *
+ * Allocate a virq that can be used to send IPI to any CPU in dest mask.
+ *
+ * Return: Linux IRQ number on success or error code on failure
+ */
+int irq_reserve_ipi(struct irq_domain *domain,
+ const struct cpumask *dest)
+{
+ unsigned int nr_irqs, offset;
+ struct irq_data *data;
+ int virq, i;
+
+ if (!domain ||!irq_domain_is_ipi(domain)) {
+ pr_warn("Reservation on a non IPI domain\n");
+ return -EINVAL;
+ }
+
+ if (!cpumask_subset(dest, cpu_possible_mask)) {
+ pr_warn("Reservation is not in possible_cpu_mask\n");
+ return -EINVAL;
+ }
+
+ nr_irqs = cpumask_weight(dest);
+ if (!nr_irqs) {
+ pr_warn("Reservation for empty destination mask\n");
+ return -EINVAL;
+ }
+
+ if (irq_domain_is_ipi_single(domain)) {
+ /*
+ * If the underlying implementation uses a single HW irq on
+ * all cpus then we only need a single Linux irq number for
+ * it. We have no restrictions vs. the destination mask. The
+ * underlying implementation can deal with holes nicely.
+ */
+ nr_irqs = 1;
+ offset = 0;
+ } else {
+ unsigned int next;
+
+ /*
+ * The IPI requires a separate HW irq on each CPU. We require
+ * that the destination mask is consecutive. If an
+ * implementation needs to support holes, it can reserve
+ * several IPI ranges.
+ */
+ offset = cpumask_first(dest);
+ /*
+ * Find a hole and if found look for another set bit after the
+ * hole. For now we don't support this scenario.
+ */
+ next = cpumask_next_zero(offset, dest);
+ if (next < nr_cpu_ids)
+ next = cpumask_next(next, dest);
+ if (next < nr_cpu_ids) {
+ pr_warn("Destination mask has holes\n");
+ return -EINVAL;
+ }
+ }
+
+ virq = irq_domain_alloc_descs(-1, nr_irqs, 0, NUMA_NO_NODE, NULL);
+ if (virq <= 0) {
+ pr_warn("Can't reserve IPI, failed to alloc descs\n");
+ return -ENOMEM;
+ }
+
+ virq = __irq_domain_alloc_irqs(domain, virq, nr_irqs, NUMA_NO_NODE,
+ (void *) dest, true, NULL);
+
+ if (virq <= 0) {
+ pr_warn("Can't reserve IPI, failed to alloc hw irqs\n");
+ goto free_descs;
+ }
+
+ for (i = 0; i < nr_irqs; i++) {
+ data = irq_get_irq_data(virq + i);
+ cpumask_copy(data->common->affinity, dest);
+ data->common->ipi_offset = offset;
+ irq_set_status_flags(virq + i, IRQ_NO_BALANCING);
+ }
+ return virq;
+
+free_descs:
+ irq_free_descs(virq, nr_irqs);
+ return -EBUSY;
+}
+
+/**
+ * irq_destroy_ipi() - unreserve an IPI that was previously allocated
+ * @irq: Linux IRQ number to be destroyed
+ * @dest: cpumask of CPUs which should have the IPI removed
+ *
+ * The IPIs allocated with irq_reserve_ipi() are returned to the system
+ * destroying all virqs associated with them.
+ *
+ * Return: %0 on success or error code on failure.
+ */
+int irq_destroy_ipi(unsigned int irq, const struct cpumask *dest)
+{
+ struct irq_data *data = irq_get_irq_data(irq);
+ const struct cpumask *ipimask;
+ struct irq_domain *domain;
+ unsigned int nr_irqs;
+
+ if (!irq || !data)
+ return -EINVAL;
+
+ domain = data->domain;
+ if (WARN_ON(domain == NULL))
+ return -EINVAL;
+
+ if (!irq_domain_is_ipi(domain)) {
+ pr_warn("Trying to destroy a non IPI domain!\n");
+ return -EINVAL;
+ }
+
+ ipimask = irq_data_get_affinity_mask(data);
+ if (!ipimask || WARN_ON(!cpumask_subset(dest, ipimask)))
+ /*
+ * Must be destroying a subset of CPUs to which this IPI
+ * was set up to target
+ */
+ return -EINVAL;
+
+ if (irq_domain_is_ipi_per_cpu(domain)) {
+ irq = irq + cpumask_first(dest) - data->common->ipi_offset;
+ nr_irqs = cpumask_weight(dest);
+ } else {
+ nr_irqs = 1;
+ }
+
+ irq_domain_free_irqs(irq, nr_irqs);
+ return 0;
+}
+
+/**
+ * ipi_get_hwirq - Get the hwirq associated with an IPI to a CPU
+ * @irq: Linux IRQ number
+ * @cpu: the target CPU
+ *
+ * When dealing with coprocessors IPI, we need to inform the coprocessor of
+ * the hwirq it needs to use to receive and send IPIs.
+ *
+ * Return: hwirq value on success or INVALID_HWIRQ on failure.
+ */
+irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu)
+{
+ struct irq_data *data = irq_get_irq_data(irq);
+ const struct cpumask *ipimask;
+
+ if (!data || cpu >= nr_cpu_ids)
+ return INVALID_HWIRQ;
+
+ ipimask = irq_data_get_affinity_mask(data);
+ if (!ipimask || !cpumask_test_cpu(cpu, ipimask))
+ return INVALID_HWIRQ;
+
+ /*
+ * Get the real hardware irq number if the underlying implementation
+ * uses a separate irq per cpu. If the underlying implementation uses
+ * a single hardware irq for all cpus then the IPI send mechanism
+ * needs to take care of the cpu destinations.
+ */
+ if (irq_domain_is_ipi_per_cpu(data->domain))
+ data = irq_get_irq_data(irq + cpu - data->common->ipi_offset);
+
+ return data ? irqd_to_hwirq(data) : INVALID_HWIRQ;
+}
+EXPORT_SYMBOL_GPL(ipi_get_hwirq);
+
+static int ipi_send_verify(struct irq_chip *chip, struct irq_data *data,
+ const struct cpumask *dest, unsigned int cpu)
+{
+ const struct cpumask *ipimask;
+
+ if (!chip || !data)
+ return -EINVAL;
+
+ if (!chip->ipi_send_single && !chip->ipi_send_mask)
+ return -EINVAL;
+
+ if (cpu >= nr_cpu_ids)
+ return -EINVAL;
+
+ ipimask = irq_data_get_affinity_mask(data);
+ if (!ipimask)
+ return -EINVAL;
+
+ if (dest) {
+ if (!cpumask_subset(dest, ipimask))
+ return -EINVAL;
+ } else {
+ if (!cpumask_test_cpu(cpu, ipimask))
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/**
+ * __ipi_send_single - send an IPI to a target Linux SMP CPU
+ * @desc: pointer to irq_desc of the IRQ
+ * @cpu: destination CPU, must in the destination mask passed to
+ * irq_reserve_ipi()
+ *
+ * This function is for architecture or core code to speed up IPI sending. Not
+ * usable from driver code.
+ *
+ * Return: %0 on success or negative error number on failure.
+ */
+int __ipi_send_single(struct irq_desc *desc, unsigned int cpu)
+{
+ struct irq_data *data = irq_desc_get_irq_data(desc);
+ struct irq_chip *chip = irq_data_get_irq_chip(data);
+
+#ifdef DEBUG
+ /*
+ * Minimise the overhead by omitting the checks for Linux SMP IPIs.
+ * Since the callers should be arch or core code which is generally
+ * trusted, only check for errors when debugging.
+ */
+ if (WARN_ON_ONCE(ipi_send_verify(chip, data, NULL, cpu)))
+ return -EINVAL;
+#endif
+ if (!chip->ipi_send_single) {
+ chip->ipi_send_mask(data, cpumask_of(cpu));
+ return 0;
+ }
+
+ /* FIXME: Store this information in irqdata flags */
+ if (irq_domain_is_ipi_per_cpu(data->domain) &&
+ cpu != data->common->ipi_offset) {
+ /* use the correct data for that cpu */
+ unsigned irq = data->irq + cpu - data->common->ipi_offset;
+
+ data = irq_get_irq_data(irq);
+ }
+ chip->ipi_send_single(data, cpu);
+ return 0;
+}
+
+/**
+ * __ipi_send_mask - send an IPI to target Linux SMP CPU(s)
+ * @desc: pointer to irq_desc of the IRQ
+ * @dest: dest CPU(s), must be a subset of the mask passed to
+ * irq_reserve_ipi()
+ *
+ * This function is for architecture or core code to speed up IPI sending. Not
+ * usable from driver code.
+ *
+ * Return: %0 on success or negative error number on failure.
+ */
+int __ipi_send_mask(struct irq_desc *desc, const struct cpumask *dest)
+{
+ struct irq_data *data = irq_desc_get_irq_data(desc);
+ struct irq_chip *chip = irq_data_get_irq_chip(data);
+ unsigned int cpu;
+
+#ifdef DEBUG
+ /*
+ * Minimise the overhead by omitting the checks for Linux SMP IPIs.
+ * Since the callers should be arch or core code which is generally
+ * trusted, only check for errors when debugging.
+ */
+ if (WARN_ON_ONCE(ipi_send_verify(chip, data, dest, 0)))
+ return -EINVAL;
+#endif
+ if (chip->ipi_send_mask) {
+ chip->ipi_send_mask(data, dest);
+ return 0;
+ }
+
+ if (irq_domain_is_ipi_per_cpu(data->domain)) {
+ unsigned int base = data->irq;
+
+ for_each_cpu(cpu, dest) {
+ unsigned irq = base + cpu - data->common->ipi_offset;
+
+ data = irq_get_irq_data(irq);
+ chip->ipi_send_single(data, cpu);
+ }
+ } else {
+ for_each_cpu(cpu, dest)
+ chip->ipi_send_single(data, cpu);
+ }
+ return 0;
+}
+
+/**
+ * ipi_send_single - Send an IPI to a single CPU
+ * @virq: Linux IRQ number from irq_reserve_ipi()
+ * @cpu: destination CPU, must in the destination mask passed to
+ * irq_reserve_ipi()
+ *
+ * Return: %0 on success or negative error number on failure.
+ */
+int ipi_send_single(unsigned int virq, unsigned int cpu)
+{
+ struct irq_desc *desc = irq_to_desc(virq);
+ struct irq_data *data = desc ? irq_desc_get_irq_data(desc) : NULL;
+ struct irq_chip *chip = data ? irq_data_get_irq_chip(data) : NULL;
+
+ if (WARN_ON_ONCE(ipi_send_verify(chip, data, NULL, cpu)))
+ return -EINVAL;
+
+ return __ipi_send_single(desc, cpu);
+}
+EXPORT_SYMBOL_GPL(ipi_send_single);
+
+/**
+ * ipi_send_mask - Send an IPI to target CPU(s)
+ * @virq: Linux IRQ number from irq_reserve_ipi()
+ * @dest: dest CPU(s), must be a subset of the mask passed to
+ * irq_reserve_ipi()
+ *
+ * Return: %0 on success or negative error number on failure.
+ */
+int ipi_send_mask(unsigned int virq, const struct cpumask *dest)
+{
+ struct irq_desc *desc = irq_to_desc(virq);
+ struct irq_data *data = desc ? irq_desc_get_irq_data(desc) : NULL;
+ struct irq_chip *chip = data ? irq_data_get_irq_chip(data) : NULL;
+
+ if (WARN_ON_ONCE(ipi_send_verify(chip, data, dest, 0)))
+ return -EINVAL;
+
+ return __ipi_send_mask(desc, dest);
+}
+EXPORT_SYMBOL_GPL(ipi_send_mask);
diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c
new file mode 100644
index 000000000000..ae4c9cbd1b4b
--- /dev/null
+++ b/kernel/irq/irq_sim.c
@@ -0,0 +1,297 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2017-2018 Bartosz Golaszewski <brgl@bgdev.pl>
+ * Copyright (C) 2020 Bartosz Golaszewski <bgolaszewski@baylibre.com>
+ */
+
+#include <linux/cleanup.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/irq_sim.h>
+#include <linux/irq_work.h>
+#include <linux/slab.h>
+
+struct irq_sim_work_ctx {
+ struct irq_work work;
+ unsigned int irq_count;
+ unsigned long *pending;
+ struct irq_domain *domain;
+ struct irq_sim_ops ops;
+ void *user_data;
+};
+
+struct irq_sim_irq_ctx {
+ bool enabled;
+ struct irq_sim_work_ctx *work_ctx;
+};
+
+static void irq_sim_irqmask(struct irq_data *data)
+{
+ struct irq_sim_irq_ctx *irq_ctx = irq_data_get_irq_chip_data(data);
+
+ irq_ctx->enabled = false;
+}
+
+static void irq_sim_irqunmask(struct irq_data *data)
+{
+ struct irq_sim_irq_ctx *irq_ctx = irq_data_get_irq_chip_data(data);
+
+ irq_ctx->enabled = true;
+}
+
+static int irq_sim_set_type(struct irq_data *data, unsigned int type)
+{
+ /* We only support rising and falling edge trigger types. */
+ if (type & ~IRQ_TYPE_EDGE_BOTH)
+ return -EINVAL;
+
+ irqd_set_trigger_type(data, type);
+
+ return 0;
+}
+
+static int irq_sim_get_irqchip_state(struct irq_data *data,
+ enum irqchip_irq_state which, bool *state)
+{
+ struct irq_sim_irq_ctx *irq_ctx = irq_data_get_irq_chip_data(data);
+ irq_hw_number_t hwirq = irqd_to_hwirq(data);
+
+ switch (which) {
+ case IRQCHIP_STATE_PENDING:
+ if (irq_ctx->enabled)
+ *state = test_bit(hwirq, irq_ctx->work_ctx->pending);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int irq_sim_set_irqchip_state(struct irq_data *data,
+ enum irqchip_irq_state which, bool state)
+{
+ struct irq_sim_irq_ctx *irq_ctx = irq_data_get_irq_chip_data(data);
+ irq_hw_number_t hwirq = irqd_to_hwirq(data);
+
+ switch (which) {
+ case IRQCHIP_STATE_PENDING:
+ if (irq_ctx->enabled) {
+ assign_bit(hwirq, irq_ctx->work_ctx->pending, state);
+ if (state)
+ irq_work_queue(&irq_ctx->work_ctx->work);
+ }
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int irq_sim_request_resources(struct irq_data *data)
+{
+ struct irq_sim_irq_ctx *irq_ctx = irq_data_get_irq_chip_data(data);
+ struct irq_sim_work_ctx *work_ctx = irq_ctx->work_ctx;
+ irq_hw_number_t hwirq = irqd_to_hwirq(data);
+
+ if (work_ctx->ops.irq_sim_irq_requested)
+ return work_ctx->ops.irq_sim_irq_requested(work_ctx->domain,
+ hwirq,
+ work_ctx->user_data);
+
+ return 0;
+}
+
+static void irq_sim_release_resources(struct irq_data *data)
+{
+ struct irq_sim_irq_ctx *irq_ctx = irq_data_get_irq_chip_data(data);
+ struct irq_sim_work_ctx *work_ctx = irq_ctx->work_ctx;
+ irq_hw_number_t hwirq = irqd_to_hwirq(data);
+
+ if (work_ctx->ops.irq_sim_irq_released)
+ work_ctx->ops.irq_sim_irq_released(work_ctx->domain, hwirq,
+ work_ctx->user_data);
+}
+
+static struct irq_chip irq_sim_irqchip = {
+ .name = "irq_sim",
+ .irq_mask = irq_sim_irqmask,
+ .irq_unmask = irq_sim_irqunmask,
+ .irq_set_type = irq_sim_set_type,
+ .irq_get_irqchip_state = irq_sim_get_irqchip_state,
+ .irq_set_irqchip_state = irq_sim_set_irqchip_state,
+ .irq_request_resources = irq_sim_request_resources,
+ .irq_release_resources = irq_sim_release_resources,
+};
+
+static void irq_sim_handle_irq(struct irq_work *work)
+{
+ struct irq_sim_work_ctx *work_ctx;
+ unsigned int offset = 0;
+ int irqnum;
+
+ work_ctx = container_of(work, struct irq_sim_work_ctx, work);
+
+ while (!bitmap_empty(work_ctx->pending, work_ctx->irq_count)) {
+ offset = find_next_bit(work_ctx->pending,
+ work_ctx->irq_count, offset);
+ clear_bit(offset, work_ctx->pending);
+ irqnum = irq_find_mapping(work_ctx->domain, offset);
+ handle_simple_irq(irq_to_desc(irqnum));
+ }
+}
+
+static int irq_sim_domain_map(struct irq_domain *domain,
+ unsigned int virq, irq_hw_number_t hw)
+{
+ struct irq_sim_work_ctx *work_ctx = domain->host_data;
+ struct irq_sim_irq_ctx *irq_ctx;
+
+ irq_ctx = kzalloc(sizeof(*irq_ctx), GFP_KERNEL);
+ if (!irq_ctx)
+ return -ENOMEM;
+
+ irq_set_chip(virq, &irq_sim_irqchip);
+ irq_set_chip_data(virq, irq_ctx);
+ irq_set_handler(virq, handle_simple_irq);
+ irq_modify_status(virq, IRQ_NOREQUEST | IRQ_NOAUTOEN, IRQ_NOPROBE);
+ irq_ctx->work_ctx = work_ctx;
+
+ return 0;
+}
+
+static void irq_sim_domain_unmap(struct irq_domain *domain, unsigned int virq)
+{
+ struct irq_sim_irq_ctx *irq_ctx;
+ struct irq_data *irqd;
+
+ irqd = irq_domain_get_irq_data(domain, virq);
+ irq_ctx = irq_data_get_irq_chip_data(irqd);
+
+ irq_set_handler(virq, NULL);
+ irq_domain_reset_irq_data(irqd);
+ kfree(irq_ctx);
+}
+
+static const struct irq_domain_ops irq_sim_domain_ops = {
+ .map = irq_sim_domain_map,
+ .unmap = irq_sim_domain_unmap,
+};
+
+/**
+ * irq_domain_create_sim - Create a new interrupt simulator irq_domain and
+ * allocate a range of dummy interrupts.
+ *
+ * @fwnode: struct fwnode_handle to be associated with this domain.
+ * @num_irqs: Number of interrupts to allocate.
+ *
+ * On success: return a new irq_domain object.
+ * On failure: a negative errno wrapped with ERR_PTR().
+ */
+struct irq_domain *irq_domain_create_sim(struct fwnode_handle *fwnode,
+ unsigned int num_irqs)
+{
+ return irq_domain_create_sim_full(fwnode, num_irqs, NULL, NULL);
+}
+EXPORT_SYMBOL_GPL(irq_domain_create_sim);
+
+struct irq_domain *irq_domain_create_sim_full(struct fwnode_handle *fwnode,
+ unsigned int num_irqs,
+ const struct irq_sim_ops *ops,
+ void *data)
+{
+ struct irq_sim_work_ctx *work_ctx __free(kfree) =
+ kzalloc(sizeof(*work_ctx), GFP_KERNEL);
+
+ if (!work_ctx)
+ return ERR_PTR(-ENOMEM);
+
+ unsigned long *pending __free(bitmap) = bitmap_zalloc(num_irqs, GFP_KERNEL);
+ if (!pending)
+ return ERR_PTR(-ENOMEM);
+
+ work_ctx->domain = irq_domain_create_linear(fwnode, num_irqs,
+ &irq_sim_domain_ops,
+ work_ctx);
+ if (!work_ctx->domain)
+ return ERR_PTR(-ENOMEM);
+
+ work_ctx->irq_count = num_irqs;
+ work_ctx->work = IRQ_WORK_INIT_HARD(irq_sim_handle_irq);
+ work_ctx->pending = no_free_ptr(pending);
+ work_ctx->user_data = data;
+
+ if (ops)
+ memcpy(&work_ctx->ops, ops, sizeof(*ops));
+
+ return no_free_ptr(work_ctx)->domain;
+}
+EXPORT_SYMBOL_GPL(irq_domain_create_sim_full);
+
+/**
+ * irq_domain_remove_sim - Deinitialize the interrupt simulator domain: free
+ * the interrupt descriptors and allocated memory.
+ *
+ * @domain: The interrupt simulator domain to tear down.
+ */
+void irq_domain_remove_sim(struct irq_domain *domain)
+{
+ struct irq_sim_work_ctx *work_ctx = domain->host_data;
+
+ irq_work_sync(&work_ctx->work);
+ bitmap_free(work_ctx->pending);
+ kfree(work_ctx);
+
+ irq_domain_remove(domain);
+}
+EXPORT_SYMBOL_GPL(irq_domain_remove_sim);
+
+static void devm_irq_domain_remove_sim(void *data)
+{
+ struct irq_domain *domain = data;
+
+ irq_domain_remove_sim(domain);
+}
+
+/**
+ * devm_irq_domain_create_sim - Create a new interrupt simulator for
+ * a managed device.
+ *
+ * @dev: Device to initialize the simulator object for.
+ * @fwnode: struct fwnode_handle to be associated with this domain.
+ * @num_irqs: Number of interrupts to allocate
+ *
+ * On success: return a new irq_domain object.
+ * On failure: a negative errno wrapped with ERR_PTR().
+ */
+struct irq_domain *devm_irq_domain_create_sim(struct device *dev,
+ struct fwnode_handle *fwnode,
+ unsigned int num_irqs)
+{
+ return devm_irq_domain_create_sim_full(dev, fwnode, num_irqs,
+ NULL, NULL);
+}
+EXPORT_SYMBOL_GPL(devm_irq_domain_create_sim);
+
+struct irq_domain *
+devm_irq_domain_create_sim_full(struct device *dev,
+ struct fwnode_handle *fwnode,
+ unsigned int num_irqs,
+ const struct irq_sim_ops *ops,
+ void *data)
+{
+ struct irq_domain *domain;
+ int ret;
+
+ domain = irq_domain_create_sim_full(fwnode, num_irqs, ops, data);
+ if (IS_ERR(domain))
+ return domain;
+
+ ret = devm_add_action_or_reset(dev, devm_irq_domain_remove_sim, domain);
+ if (ret)
+ return ERR_PTR(ret);
+
+ return domain;
+}
+EXPORT_SYMBOL_GPL(devm_irq_domain_create_sim_full);
diff --git a/kernel/irq/irq_test.c b/kernel/irq/irq_test.c
new file mode 100644
index 000000000000..e2d31914b3c4
--- /dev/null
+++ b/kernel/irq/irq_test.c
@@ -0,0 +1,236 @@
+// SPDX-License-Identifier: LGPL-2.1+
+
+#include <linux/cleanup.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/irqdesc.h>
+#include <linux/irqdomain.h>
+#include <linux/nodemask.h>
+#include <kunit/test.h>
+
+#include "internals.h"
+
+static irqreturn_t noop_handler(int irq, void *data)
+{
+ return IRQ_HANDLED;
+}
+
+static void noop(struct irq_data *data) { }
+static unsigned int noop_ret(struct irq_data *data) { return 0; }
+
+static int noop_affinity(struct irq_data *data, const struct cpumask *dest,
+ bool force)
+{
+ irq_data_update_effective_affinity(data, dest);
+
+ return 0;
+}
+
+static struct irq_chip fake_irq_chip = {
+ .name = "fake",
+ .irq_startup = noop_ret,
+ .irq_shutdown = noop,
+ .irq_enable = noop,
+ .irq_disable = noop,
+ .irq_ack = noop,
+ .irq_mask = noop,
+ .irq_unmask = noop,
+ .irq_set_affinity = noop_affinity,
+ .flags = IRQCHIP_SKIP_SET_WAKE,
+};
+
+static int irq_test_setup_fake_irq(struct kunit *test, struct irq_affinity_desc *affd)
+{
+ struct irq_desc *desc;
+ int virq;
+
+ virq = irq_domain_alloc_descs(-1, 1, 0, NUMA_NO_NODE, affd);
+ KUNIT_ASSERT_GE(test, virq, 0);
+
+ irq_set_chip_and_handler(virq, &fake_irq_chip, handle_simple_irq);
+
+ desc = irq_to_desc(virq);
+ KUNIT_ASSERT_PTR_NE(test, desc, NULL);
+
+ /* On some architectures, IRQs are NOREQUEST | NOPROBE by default. */
+ irq_settings_clr_norequest(desc);
+
+ return virq;
+}
+
+static void irq_disable_depth_test(struct kunit *test)
+{
+ struct irq_desc *desc;
+ int virq, ret;
+
+ virq = irq_test_setup_fake_irq(test, NULL);
+
+ desc = irq_to_desc(virq);
+ KUNIT_ASSERT_PTR_NE(test, desc, NULL);
+
+ ret = request_irq(virq, noop_handler, 0, "test_irq", NULL);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
+ KUNIT_EXPECT_EQ(test, desc->depth, 0);
+
+ disable_irq(virq);
+ KUNIT_EXPECT_EQ(test, desc->depth, 1);
+
+ enable_irq(virq);
+ KUNIT_EXPECT_EQ(test, desc->depth, 0);
+
+ free_irq(virq, NULL);
+}
+
+static void irq_free_disabled_test(struct kunit *test)
+{
+ struct irq_desc *desc;
+ int virq, ret;
+
+ virq = irq_test_setup_fake_irq(test, NULL);
+
+ desc = irq_to_desc(virq);
+ KUNIT_ASSERT_PTR_NE(test, desc, NULL);
+
+ ret = request_irq(virq, noop_handler, 0, "test_irq", NULL);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
+ KUNIT_EXPECT_EQ(test, desc->depth, 0);
+
+ disable_irq(virq);
+ KUNIT_EXPECT_EQ(test, desc->depth, 1);
+
+ free_irq(virq, NULL);
+ KUNIT_EXPECT_GE(test, desc->depth, 1);
+
+ ret = request_irq(virq, noop_handler, 0, "test_irq", NULL);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+ KUNIT_EXPECT_EQ(test, desc->depth, 0);
+
+ free_irq(virq, NULL);
+}
+
+static void irq_shutdown_depth_test(struct kunit *test)
+{
+ struct irq_desc *desc;
+ struct irq_data *data;
+ int virq, ret;
+ struct irq_affinity_desc affinity = {
+ .is_managed = 1,
+ .mask = CPU_MASK_ALL,
+ };
+
+ if (!IS_ENABLED(CONFIG_SMP))
+ kunit_skip(test, "requires CONFIG_SMP for managed shutdown");
+
+ virq = irq_test_setup_fake_irq(test, &affinity);
+
+ desc = irq_to_desc(virq);
+ KUNIT_ASSERT_PTR_NE(test, desc, NULL);
+
+ data = irq_desc_get_irq_data(desc);
+ KUNIT_ASSERT_PTR_NE(test, data, NULL);
+
+ ret = request_irq(virq, noop_handler, 0, "test_irq", NULL);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
+ KUNIT_EXPECT_TRUE(test, irqd_is_activated(data));
+ KUNIT_EXPECT_TRUE(test, irqd_is_started(data));
+ KUNIT_EXPECT_TRUE(test, irqd_affinity_is_managed(data));
+
+ KUNIT_EXPECT_EQ(test, desc->depth, 0);
+
+ disable_irq(virq);
+ KUNIT_EXPECT_EQ(test, desc->depth, 1);
+
+ scoped_guard(raw_spinlock_irqsave, &desc->lock)
+ irq_shutdown_and_deactivate(desc);
+
+ KUNIT_EXPECT_FALSE(test, irqd_is_activated(data));
+ KUNIT_EXPECT_FALSE(test, irqd_is_started(data));
+
+ KUNIT_EXPECT_EQ(test, irq_activate(desc), 0);
+#ifdef CONFIG_SMP
+ irq_startup_managed(desc);
+#endif
+
+ KUNIT_EXPECT_EQ(test, desc->depth, 1);
+
+ enable_irq(virq);
+ KUNIT_EXPECT_EQ(test, desc->depth, 0);
+
+ free_irq(virq, NULL);
+}
+
+static void irq_cpuhotplug_test(struct kunit *test)
+{
+ struct irq_desc *desc;
+ struct irq_data *data;
+ int virq, ret;
+ struct irq_affinity_desc affinity = {
+ .is_managed = 1,
+ };
+
+ if (!IS_ENABLED(CONFIG_SMP))
+ kunit_skip(test, "requires CONFIG_SMP for CPU hotplug");
+ if (!get_cpu_device(1))
+ kunit_skip(test, "requires more than 1 CPU for CPU hotplug");
+ if (!cpu_is_hotpluggable(1))
+ kunit_skip(test, "CPU 1 must be hotpluggable");
+ if (!cpu_online(1))
+ kunit_skip(test, "CPU 1 must be online");
+
+ cpumask_copy(&affinity.mask, cpumask_of(1));
+
+ virq = irq_test_setup_fake_irq(test, &affinity);
+
+ desc = irq_to_desc(virq);
+ KUNIT_ASSERT_PTR_NE(test, desc, NULL);
+
+ data = irq_desc_get_irq_data(desc);
+ KUNIT_ASSERT_PTR_NE(test, data, NULL);
+
+ ret = request_irq(virq, noop_handler, 0, "test_irq", NULL);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
+ KUNIT_EXPECT_TRUE(test, irqd_is_activated(data));
+ KUNIT_EXPECT_TRUE(test, irqd_is_started(data));
+ KUNIT_EXPECT_TRUE(test, irqd_affinity_is_managed(data));
+
+ KUNIT_EXPECT_EQ(test, desc->depth, 0);
+
+ disable_irq(virq);
+ KUNIT_EXPECT_EQ(test, desc->depth, 1);
+
+ KUNIT_EXPECT_EQ(test, remove_cpu(1), 0);
+ KUNIT_EXPECT_GE(test, desc->depth, 1);
+ KUNIT_EXPECT_EQ(test, add_cpu(1), 0);
+
+ KUNIT_EXPECT_EQ(test, desc->depth, 1);
+
+ enable_irq(virq);
+ KUNIT_EXPECT_TRUE(test, irqd_is_activated(data));
+ KUNIT_EXPECT_TRUE(test, irqd_is_started(data));
+ KUNIT_EXPECT_EQ(test, desc->depth, 0);
+
+ free_irq(virq, NULL);
+}
+
+static struct kunit_case irq_test_cases[] = {
+ KUNIT_CASE(irq_disable_depth_test),
+ KUNIT_CASE(irq_free_disabled_test),
+ KUNIT_CASE(irq_shutdown_depth_test),
+ KUNIT_CASE(irq_cpuhotplug_test),
+ {}
+};
+
+static struct kunit_suite irq_test_suite = {
+ .name = "irq_test_cases",
+ .test_cases = irq_test_cases,
+};
+
+kunit_test_suite(irq_test_suite);
+MODULE_DESCRIPTION("IRQ unit test suite");
+MODULE_LICENSE("GPL");
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 99793b9b6d23..6acf268f005b 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -1,10 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
* Copyright (C) 2005-2006, Thomas Gleixner, Russell King
*
- * This file contains the interrupt descriptor management code
- *
- * Detailed information is available in Documentation/DocBook/genericirq
+ * This file contains the interrupt descriptor management code. Detailed
+ * information is available in Documentation/core-api/genericirq.rst
*
*/
#include <linux/irq.h>
@@ -12,9 +12,10 @@
#include <linux/export.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
-#include <linux/radix-tree.h>
-#include <linux/bitmap.h>
+#include <linux/maple_tree.h>
#include <linux/irqdomain.h>
+#include <linux/sysfs.h>
+#include <linux/string_choices.h>
#include "internals.h"
@@ -24,10 +25,25 @@
static struct lock_class_key irq_desc_lock_class;
#if defined(CONFIG_SMP)
+static int __init irq_affinity_setup(char *str)
+{
+ alloc_bootmem_cpumask_var(&irq_default_affinity);
+ cpulist_parse(str, irq_default_affinity);
+ /*
+ * Set at least the boot cpu. We don't want to end up with
+ * bugreports caused by random commandline masks
+ */
+ cpumask_set_cpu(smp_processor_id(), irq_default_affinity);
+ return 1;
+}
+__setup("irqaffinity=", irq_affinity_setup);
+
static void __init init_irq_default_affinity(void)
{
- alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
- cpumask_setall(irq_default_affinity);
+ if (!cpumask_available(irq_default_affinity))
+ zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
+ if (cpumask_empty(irq_default_affinity))
+ cpumask_setall(irq_default_affinity);
}
#else
static void __init init_irq_default_affinity(void)
@@ -36,100 +52,371 @@ static void __init init_irq_default_affinity(void)
#endif
#ifdef CONFIG_SMP
-static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node)
+static int alloc_masks(struct irq_desc *desc, int node)
{
- if (!zalloc_cpumask_var_node(&desc->irq_data.affinity, gfp, node))
+ if (!zalloc_cpumask_var_node(&desc->irq_common_data.affinity,
+ GFP_KERNEL, node))
return -ENOMEM;
+#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
+ if (!zalloc_cpumask_var_node(&desc->irq_common_data.effective_affinity,
+ GFP_KERNEL, node)) {
+ free_cpumask_var(desc->irq_common_data.affinity);
+ return -ENOMEM;
+ }
+#endif
+
#ifdef CONFIG_GENERIC_PENDING_IRQ
- if (!zalloc_cpumask_var_node(&desc->pending_mask, gfp, node)) {
- free_cpumask_var(desc->irq_data.affinity);
+ if (!zalloc_cpumask_var_node(&desc->pending_mask, GFP_KERNEL, node)) {
+#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
+ free_cpumask_var(desc->irq_common_data.effective_affinity);
+#endif
+ free_cpumask_var(desc->irq_common_data.affinity);
return -ENOMEM;
}
#endif
return 0;
}
-static void desc_smp_init(struct irq_desc *desc, int node)
+static void desc_smp_init(struct irq_desc *desc, int node,
+ const struct cpumask *affinity)
{
- desc->irq_data.node = node;
- cpumask_copy(desc->irq_data.affinity, irq_default_affinity);
+ if (!affinity)
+ affinity = irq_default_affinity;
+ cpumask_copy(desc->irq_common_data.affinity, affinity);
+
#ifdef CONFIG_GENERIC_PENDING_IRQ
cpumask_clear(desc->pending_mask);
#endif
+#ifdef CONFIG_NUMA
+ desc->irq_common_data.node = node;
+#endif
}
-static inline int desc_node(struct irq_desc *desc)
+static void free_masks(struct irq_desc *desc)
{
- return desc->irq_data.node;
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+ free_cpumask_var(desc->pending_mask);
+#endif
+ free_cpumask_var(desc->irq_common_data.affinity);
+#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
+ free_cpumask_var(desc->irq_common_data.effective_affinity);
+#endif
}
#else
static inline int
-alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) { return 0; }
-static inline void desc_smp_init(struct irq_desc *desc, int node) { }
-static inline int desc_node(struct irq_desc *desc) { return 0; }
+alloc_masks(struct irq_desc *desc, int node) { return 0; }
+static inline void
+desc_smp_init(struct irq_desc *desc, int node, const struct cpumask *affinity) { }
+static inline void free_masks(struct irq_desc *desc) { }
#endif
static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node,
- struct module *owner)
+ const struct cpumask *affinity, struct module *owner)
{
int cpu;
+ desc->irq_common_data.handler_data = NULL;
+ desc->irq_common_data.msi_desc = NULL;
+
+ desc->irq_data.common = &desc->irq_common_data;
desc->irq_data.irq = irq;
desc->irq_data.chip = &no_irq_chip;
desc->irq_data.chip_data = NULL;
- desc->irq_data.handler_data = NULL;
- desc->irq_data.msi_desc = NULL;
irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS);
irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED);
+ irqd_set(&desc->irq_data, IRQD_IRQ_MASKED);
desc->handle_irq = handle_bad_irq;
desc->depth = 1;
desc->irq_count = 0;
desc->irqs_unhandled = 0;
+ desc->tot_count = 0;
desc->name = NULL;
desc->owner = owner;
for_each_possible_cpu(cpu)
- *per_cpu_ptr(desc->kstat_irqs, cpu) = 0;
- desc_smp_init(desc, node);
+ *per_cpu_ptr(desc->kstat_irqs, cpu) = (struct irqstat) { };
+ desc_smp_init(desc, node, affinity);
+}
+
+static unsigned int nr_irqs = NR_IRQS;
+
+/**
+ * irq_get_nr_irqs() - Number of interrupts supported by the system.
+ */
+unsigned int irq_get_nr_irqs(void)
+{
+ return nr_irqs;
}
+EXPORT_SYMBOL_GPL(irq_get_nr_irqs);
-int nr_irqs = NR_IRQS;
-EXPORT_SYMBOL_GPL(nr_irqs);
+/**
+ * irq_set_nr_irqs() - Set the number of interrupts supported by the system.
+ * @nr: New number of interrupts.
+ *
+ * Return: @nr.
+ */
+unsigned int irq_set_nr_irqs(unsigned int nr)
+{
+ nr_irqs = nr;
+
+ return nr;
+}
+EXPORT_SYMBOL_GPL(irq_set_nr_irqs);
static DEFINE_MUTEX(sparse_irq_lock);
-static DECLARE_BITMAP(allocated_irqs, IRQ_BITMAP_BITS);
+static struct maple_tree sparse_irqs = MTREE_INIT_EXT(sparse_irqs,
+ MT_FLAGS_ALLOC_RANGE |
+ MT_FLAGS_LOCK_EXTERN |
+ MT_FLAGS_USE_RCU,
+ sparse_irq_lock);
-#ifdef CONFIG_SPARSE_IRQ
+static int irq_find_free_area(unsigned int from, unsigned int cnt)
+{
+ MA_STATE(mas, &sparse_irqs, 0, 0);
-static RADIX_TREE(irq_desc_tree, GFP_KERNEL);
+ if (mas_empty_area(&mas, from, MAX_SPARSE_IRQS, cnt))
+ return -ENOSPC;
+ return mas.index;
+}
-static void irq_insert_desc(unsigned int irq, struct irq_desc *desc)
+static unsigned int irq_find_at_or_after(unsigned int offset)
{
- radix_tree_insert(&irq_desc_tree, irq, desc);
+ unsigned long index = offset;
+ struct irq_desc *desc;
+
+ guard(rcu)();
+ desc = mt_find(&sparse_irqs, &index, nr_irqs);
+
+ return desc ? irq_desc_get_irq(desc) : nr_irqs;
}
-struct irq_desc *irq_to_desc(unsigned int irq)
+static void irq_insert_desc(unsigned int irq, struct irq_desc *desc)
{
- return radix_tree_lookup(&irq_desc_tree, irq);
+ MA_STATE(mas, &sparse_irqs, irq, irq);
+ WARN_ON(mas_store_gfp(&mas, desc, GFP_KERNEL) != 0);
}
-EXPORT_SYMBOL(irq_to_desc);
static void delete_irq_desc(unsigned int irq)
{
- radix_tree_delete(&irq_desc_tree, irq);
+ MA_STATE(mas, &sparse_irqs, irq, irq);
+ mas_erase(&mas);
}
-#ifdef CONFIG_SMP
-static void free_masks(struct irq_desc *desc)
+#ifdef CONFIG_SPARSE_IRQ
+static const struct kobj_type irq_kobj_type;
+#endif
+
+static int init_desc(struct irq_desc *desc, int irq, int node,
+ unsigned int flags,
+ const struct cpumask *affinity,
+ struct module *owner)
{
-#ifdef CONFIG_GENERIC_PENDING_IRQ
- free_cpumask_var(desc->pending_mask);
+ desc->kstat_irqs = alloc_percpu(struct irqstat);
+ if (!desc->kstat_irqs)
+ return -ENOMEM;
+
+ if (alloc_masks(desc, node)) {
+ free_percpu(desc->kstat_irqs);
+ return -ENOMEM;
+ }
+
+ raw_spin_lock_init(&desc->lock);
+ lockdep_set_class(&desc->lock, &irq_desc_lock_class);
+ mutex_init(&desc->request_mutex);
+ init_waitqueue_head(&desc->wait_for_threads);
+ desc_set_defaults(irq, desc, node, affinity, owner);
+ irqd_set(&desc->irq_data, flags);
+ irq_resend_init(desc);
+#ifdef CONFIG_SPARSE_IRQ
+ kobject_init(&desc->kobj, &irq_kobj_type);
+ init_rcu_head(&desc->rcu);
#endif
- free_cpumask_var(desc->irq_data.affinity);
+
+ return 0;
}
-#else
-static inline void free_masks(struct irq_desc *desc) { }
+
+#ifdef CONFIG_SPARSE_IRQ
+
+static void irq_kobj_release(struct kobject *kobj);
+
+#ifdef CONFIG_SYSFS
+static struct kobject *irq_kobj_base;
+
+#define IRQ_ATTR_RO(_name) \
+static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
+
+static ssize_t per_cpu_count_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+ struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
+ ssize_t ret = 0;
+ char *p = "";
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ unsigned int c = irq_desc_kstat_cpu(desc, cpu);
+
+ ret += sysfs_emit_at(buf, ret, "%s%u", p, c);
+ p = ",";
+ }
+
+ ret += sysfs_emit_at(buf, ret, "\n");
+ return ret;
+}
+IRQ_ATTR_RO(per_cpu_count);
+
+static ssize_t chip_name_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+ struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
+
+ guard(raw_spinlock_irq)(&desc->lock);
+ if (desc->irq_data.chip && desc->irq_data.chip->name)
+ return sysfs_emit(buf, "%s\n", desc->irq_data.chip->name);
+ return 0;
+}
+IRQ_ATTR_RO(chip_name);
+
+static ssize_t hwirq_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+ struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
+
+ guard(raw_spinlock_irq)(&desc->lock);
+ if (desc->irq_data.domain)
+ return sysfs_emit(buf, "%lu\n", desc->irq_data.hwirq);
+ return 0;
+}
+IRQ_ATTR_RO(hwirq);
+
+static ssize_t type_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+ struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
+
+ guard(raw_spinlock_irq)(&desc->lock);
+ return sysfs_emit(buf, "%s\n", irqd_is_level_type(&desc->irq_data) ? "level" : "edge");
+
+}
+IRQ_ATTR_RO(type);
+
+static ssize_t wakeup_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+ struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
+
+ guard(raw_spinlock_irq)(&desc->lock);
+ return sysfs_emit(buf, "%s\n", str_enabled_disabled(irqd_is_wakeup_set(&desc->irq_data)));
+}
+IRQ_ATTR_RO(wakeup);
+
+static ssize_t name_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+ struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
+
+ guard(raw_spinlock_irq)(&desc->lock);
+ if (desc->name)
+ return sysfs_emit(buf, "%s\n", desc->name);
+ return 0;
+}
+IRQ_ATTR_RO(name);
+
+static ssize_t actions_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+ struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
+ struct irqaction *action;
+ ssize_t ret = 0;
+ char *p = "";
+
+ scoped_guard(raw_spinlock_irq, &desc->lock) {
+ for_each_action_of_desc(desc, action) {
+ ret += sysfs_emit_at(buf, ret, "%s%s", p, action->name);
+ p = ",";
+ }
+ }
+
+ if (ret)
+ ret += sysfs_emit_at(buf, ret, "\n");
+ return ret;
+}
+IRQ_ATTR_RO(actions);
+
+static struct attribute *irq_attrs[] = {
+ &per_cpu_count_attr.attr,
+ &chip_name_attr.attr,
+ &hwirq_attr.attr,
+ &type_attr.attr,
+ &wakeup_attr.attr,
+ &name_attr.attr,
+ &actions_attr.attr,
+ NULL
+};
+ATTRIBUTE_GROUPS(irq);
+
+static const struct kobj_type irq_kobj_type = {
+ .release = irq_kobj_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = irq_groups,
+};
+
+static void irq_sysfs_add(int irq, struct irq_desc *desc)
+{
+ if (irq_kobj_base) {
+ /*
+ * Continue even in case of failure as this is nothing
+ * crucial and failures in the late irq_sysfs_init()
+ * cannot be rolled back.
+ */
+ if (kobject_add(&desc->kobj, irq_kobj_base, "%d", irq))
+ pr_warn("Failed to add kobject for irq %d\n", irq);
+ else
+ desc->istate |= IRQS_SYSFS;
+ }
+}
+
+static void irq_sysfs_del(struct irq_desc *desc)
+{
+ /*
+ * Only invoke kobject_del() when kobject_add() was successfully
+ * invoked for the descriptor. This covers both early boot, where
+ * sysfs is not initialized yet, and the case of a failed
+ * kobject_add() invocation.
+ */
+ if (desc->istate & IRQS_SYSFS)
+ kobject_del(&desc->kobj);
+}
+
+static int __init irq_sysfs_init(void)
+{
+ struct irq_desc *desc;
+ int irq;
+
+ /* Prevent concurrent irq alloc/free */
+ guard(mutex)(&sparse_irq_lock);
+ irq_kobj_base = kobject_create_and_add("irq", kernel_kobj);
+ if (!irq_kobj_base)
+ return -ENOMEM;
+
+ /* Add the already allocated interrupts */
+ for_each_irq_desc(irq, desc)
+ irq_sysfs_add(irq, desc);
+ return 0;
+}
+postcore_initcall(irq_sysfs_init);
+
+#else /* !CONFIG_SYSFS */
+
+static const struct kobj_type irq_kobj_type = {
+ .release = irq_kobj_release,
+};
+
+static void irq_sysfs_add(int irq, struct irq_desc *desc) {}
+static void irq_sysfs_del(struct irq_desc *desc) {}
+
+#endif /* CONFIG_SYSFS */
+
+struct irq_desc *irq_to_desc(unsigned int irq)
+{
+ return mtree_load(&sparse_irqs, irq);
+}
+#ifdef CONFIG_KVM_BOOK3S_64_HV_MODULE
+EXPORT_SYMBOL_GPL(irq_to_desc);
#endif
void irq_lock_sparse(void)
@@ -142,40 +429,47 @@ void irq_unlock_sparse(void)
mutex_unlock(&sparse_irq_lock);
}
-static struct irq_desc *alloc_desc(int irq, int node, struct module *owner)
+static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags,
+ const struct cpumask *affinity,
+ struct module *owner)
{
struct irq_desc *desc;
- gfp_t gfp = GFP_KERNEL;
+ int ret;
- desc = kzalloc_node(sizeof(*desc), gfp, node);
+ desc = kzalloc_node(sizeof(*desc), GFP_KERNEL, node);
if (!desc)
return NULL;
- /* allocate based on nr_cpu_ids */
- desc->kstat_irqs = alloc_percpu(unsigned int);
- if (!desc->kstat_irqs)
- goto err_desc;
- if (alloc_masks(desc, gfp, node))
- goto err_kstat;
-
- raw_spin_lock_init(&desc->lock);
- lockdep_set_class(&desc->lock, &irq_desc_lock_class);
-
- desc_set_defaults(irq, desc, node, owner);
+ ret = init_desc(desc, irq, node, flags, affinity, owner);
+ if (unlikely(ret)) {
+ kfree(desc);
+ return NULL;
+ }
return desc;
+}
-err_kstat:
+static void irq_kobj_release(struct kobject *kobj)
+{
+ struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
+
+ free_masks(desc);
free_percpu(desc->kstat_irqs);
-err_desc:
kfree(desc);
- return NULL;
+}
+
+static void delayed_free_desc(struct rcu_head *rhp)
+{
+ struct irq_desc *desc = container_of(rhp, struct irq_desc, rcu);
+
+ kobject_put(&desc->kobj);
}
static void free_desc(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
+ irq_remove_debugfs_entry(desc);
unregister_irq_proc(irq, desc);
/*
@@ -183,48 +477,73 @@ static void free_desc(unsigned int irq)
* kstat_irq_usr(). Once we deleted the descriptor from the
* sparse tree we can free it. Access in proc will fail to
* lookup the descriptor.
+ *
+ * The sysfs entry must be serialized against a concurrent
+ * irq_sysfs_init() as well.
*/
- mutex_lock(&sparse_irq_lock);
+ irq_sysfs_del(desc);
delete_irq_desc(irq);
- mutex_unlock(&sparse_irq_lock);
- free_masks(desc);
- free_percpu(desc->kstat_irqs);
- kfree(desc);
+ /*
+ * We free the descriptor, masks and stat fields via RCU. That
+ * allows demultiplex interrupts to do rcu based management of
+ * the child interrupts.
+ * This also allows us to use rcu in kstat_irqs_usr().
+ */
+ call_rcu(&desc->rcu, delayed_free_desc);
}
static int alloc_descs(unsigned int start, unsigned int cnt, int node,
+ const struct irq_affinity_desc *affinity,
struct module *owner)
{
struct irq_desc *desc;
int i;
+ /* Validate affinity mask(s) */
+ if (affinity) {
+ for (i = 0; i < cnt; i++) {
+ if (cpumask_empty(&affinity[i].mask))
+ return -EINVAL;
+ }
+ }
+
for (i = 0; i < cnt; i++) {
- desc = alloc_desc(start + i, node, owner);
+ const struct cpumask *mask = NULL;
+ unsigned int flags = 0;
+
+ if (affinity) {
+ if (affinity->is_managed) {
+ flags = IRQD_AFFINITY_MANAGED |
+ IRQD_MANAGED_SHUTDOWN;
+ }
+ flags |= IRQD_AFFINITY_SET;
+ mask = &affinity->mask;
+ node = cpu_to_node(cpumask_first(mask));
+ affinity++;
+ }
+
+ desc = alloc_desc(start + i, node, flags, mask, owner);
if (!desc)
goto err;
- mutex_lock(&sparse_irq_lock);
irq_insert_desc(start + i, desc);
- mutex_unlock(&sparse_irq_lock);
+ irq_sysfs_add(start + i, desc);
+ irq_add_debugfs_entry(start + i, desc);
}
return start;
err:
for (i--; i >= 0; i--)
free_desc(start + i);
-
- mutex_lock(&sparse_irq_lock);
- bitmap_clear(allocated_irqs, start, cnt);
- mutex_unlock(&sparse_irq_lock);
return -ENOMEM;
}
-static int irq_expand_nr_irqs(unsigned int nr)
+static bool irq_expand_nr_irqs(unsigned int nr)
{
- if (nr > IRQ_BITMAP_BITS)
- return -ENOMEM;
+ if (nr > MAX_SPARSE_IRQS)
+ return false;
nr_irqs = nr;
- return 0;
+ return true;
}
int __init early_irq_init(void)
@@ -236,20 +555,20 @@ int __init early_irq_init(void)
/* Let arch update nr_irqs and return the nr of preallocated irqs */
initcnt = arch_probe_nr_irqs();
- printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d %d\n", NR_IRQS, nr_irqs, initcnt);
+ printk(KERN_INFO "NR_IRQS: %d, nr_irqs: %d, preallocated irqs: %d\n",
+ NR_IRQS, nr_irqs, initcnt);
- if (WARN_ON(nr_irqs > IRQ_BITMAP_BITS))
- nr_irqs = IRQ_BITMAP_BITS;
+ if (WARN_ON(nr_irqs > MAX_SPARSE_IRQS))
+ nr_irqs = MAX_SPARSE_IRQS;
- if (WARN_ON(initcnt > IRQ_BITMAP_BITS))
- initcnt = IRQ_BITMAP_BITS;
+ if (WARN_ON(initcnt > MAX_SPARSE_IRQS))
+ initcnt = MAX_SPARSE_IRQS;
if (initcnt > nr_irqs)
nr_irqs = initcnt;
for (i = 0; i < initcnt; i++) {
- desc = alloc_desc(i, node, NULL);
- set_bit(i, allocated_irqs);
+ desc = alloc_desc(i, node, 0, NULL, NULL);
irq_insert_desc(i, desc);
}
return arch_early_irq_init();
@@ -268,23 +587,29 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
int __init early_irq_init(void)
{
int count, i, node = first_online_node;
- struct irq_desc *desc;
+ int ret;
init_irq_default_affinity();
- printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS);
+ printk(KERN_INFO "NR_IRQS: %d\n", NR_IRQS);
- desc = irq_desc;
count = ARRAY_SIZE(irq_desc);
for (i = 0; i < count; i++) {
- desc[i].kstat_irqs = alloc_percpu(unsigned int);
- alloc_masks(&desc[i], GFP_KERNEL, node);
- raw_spin_lock_init(&desc[i].lock);
- lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
- desc_set_defaults(i, &desc[i], node, NULL);
+ ret = init_desc(irq_desc + i, i, node, 0, NULL, NULL);
+ if (unlikely(ret))
+ goto __free_desc_res;
}
+
return arch_early_irq_init();
+
+__free_desc_res:
+ while (--i >= 0) {
+ free_masks(irq_desc + i);
+ free_percpu(irq_desc[i].kstat_irqs);
+ }
+
+ return ret;
}
struct irq_desc *irq_to_desc(unsigned int irq)
@@ -296,14 +621,14 @@ EXPORT_SYMBOL(irq_to_desc);
static void free_desc(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
- unsigned long flags;
- raw_spin_lock_irqsave(&desc->lock, flags);
- desc_set_defaults(irq, desc, desc_node(desc), NULL);
- raw_spin_unlock_irqrestore(&desc->lock, flags);
+ scoped_guard(raw_spinlock_irqsave, &desc->lock)
+ desc_set_defaults(irq, desc, irq_desc_get_node(desc), NULL, NULL);
+ delete_irq_desc(irq);
}
static inline int alloc_descs(unsigned int start, unsigned int cnt, int node,
+ const struct irq_affinity_desc *affinity,
struct module *owner)
{
u32 i;
@@ -312,86 +637,135 @@ static inline int alloc_descs(unsigned int start, unsigned int cnt, int node,
struct irq_desc *desc = irq_to_desc(start + i);
desc->owner = owner;
+ irq_insert_desc(start + i, desc);
}
return start;
}
-static int irq_expand_nr_irqs(unsigned int nr)
+static inline bool irq_expand_nr_irqs(unsigned int nr)
{
- return -ENOMEM;
+ return false;
}
void irq_mark_irq(unsigned int irq)
{
- mutex_lock(&sparse_irq_lock);
- bitmap_set(allocated_irqs, irq, 1);
- mutex_unlock(&sparse_irq_lock);
+ guard(mutex)(&sparse_irq_lock);
+ irq_insert_desc(irq, irq_desc + irq);
}
-#ifdef CONFIG_GENERIC_IRQ_LEGACY
-void irq_init_desc(unsigned int irq)
+#endif /* !CONFIG_SPARSE_IRQ */
+
+int handle_irq_desc(struct irq_desc *desc)
{
- free_desc(irq);
-}
-#endif
+ struct irq_data *data;
-#endif /* !CONFIG_SPARSE_IRQ */
+ if (!desc)
+ return -EINVAL;
+
+ data = irq_desc_get_irq_data(desc);
+ if (WARN_ON_ONCE(!in_hardirq() && irqd_is_handle_enforce_irqctx(data)))
+ return -EPERM;
+
+ generic_handle_irq_desc(desc);
+ return 0;
+}
/**
* generic_handle_irq - Invoke the handler for a particular irq
* @irq: The irq number to handle
*
- */
+ * Returns: 0 on success, or -EINVAL if conversion has failed
+ *
+ * This function must be called from an IRQ context with irq regs
+ * initialized.
+ */
int generic_handle_irq(unsigned int irq)
{
- struct irq_desc *desc = irq_to_desc(irq);
-
- if (!desc)
- return -EINVAL;
- generic_handle_irq_desc(irq, desc);
- return 0;
+ return handle_irq_desc(irq_to_desc(irq));
}
EXPORT_SYMBOL_GPL(generic_handle_irq);
-#ifdef CONFIG_HANDLE_DOMAIN_IRQ
/**
- * __handle_domain_irq - Invoke the handler for a HW irq belonging to a domain
- * @domain: The domain where to perform the lookup
- * @hwirq: The HW irq number to convert to a logical one
- * @lookup: Whether to perform the domain lookup or not
- * @regs: Register file coming from the low-level handling code
+ * generic_handle_irq_safe - Invoke the handler for a particular irq from any
+ * context.
+ * @irq: The irq number to handle
*
- * Returns: 0 on success, or -EINVAL if conversion has failed
+ * Returns: 0 on success, a negative value on error.
+ *
+ * This function can be called from any context (IRQ or process context). It
+ * will report an error if not invoked from IRQ context and the irq has been
+ * marked to enforce IRQ-context only.
*/
-int __handle_domain_irq(struct irq_domain *domain, unsigned int hwirq,
- bool lookup, struct pt_regs *regs)
+int generic_handle_irq_safe(unsigned int irq)
{
- struct pt_regs *old_regs = set_irq_regs(regs);
- unsigned int irq = hwirq;
- int ret = 0;
+ unsigned long flags;
+ int ret;
- irq_enter();
+ local_irq_save(flags);
+ ret = handle_irq_desc(irq_to_desc(irq));
+ local_irq_restore(flags);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(generic_handle_irq_safe);
#ifdef CONFIG_IRQ_DOMAIN
- if (lookup)
- irq = irq_find_mapping(domain, hwirq);
-#endif
+/**
+ * generic_handle_domain_irq - Invoke the handler for a HW irq belonging
+ * to a domain.
+ * @domain: The domain where to perform the lookup
+ * @hwirq: The HW irq number to convert to a logical one
+ *
+ * Returns: 0 on success, or -EINVAL if conversion has failed
+ *
+ * This function must be called from an IRQ context with irq regs
+ * initialized.
+ */
+int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq)
+{
+ return handle_irq_desc(irq_resolve_mapping(domain, hwirq));
+}
+EXPORT_SYMBOL_GPL(generic_handle_domain_irq);
- /*
- * Some hardware gives randomly wrong interrupts. Rather
- * than crashing, do something sensible.
- */
- if (unlikely(!irq || irq >= nr_irqs)) {
- ack_bad_irq(irq);
- ret = -EINVAL;
- } else {
- generic_handle_irq(irq);
- }
+ /**
+ * generic_handle_irq_safe - Invoke the handler for a HW irq belonging
+ * to a domain from any context.
+ * @domain: The domain where to perform the lookup
+ * @hwirq: The HW irq number to convert to a logical one
+ *
+ * Returns: 0 on success, a negative value on error.
+ *
+ * This function can be called from any context (IRQ or process
+ * context). If the interrupt is marked as 'enforce IRQ-context only' then
+ * the function must be invoked from hard interrupt context.
+ */
+int generic_handle_domain_irq_safe(struct irq_domain *domain, unsigned int hwirq)
+{
+ unsigned long flags;
+ int ret;
- irq_exit();
- set_irq_regs(old_regs);
+ local_irq_save(flags);
+ ret = handle_irq_desc(irq_resolve_mapping(domain, hwirq));
+ local_irq_restore(flags);
return ret;
}
+EXPORT_SYMBOL_GPL(generic_handle_domain_irq_safe);
+
+/**
+ * generic_handle_domain_nmi - Invoke the handler for a HW nmi belonging
+ * to a domain.
+ * @domain: The domain where to perform the lookup
+ * @hwirq: The HW irq number to convert to a logical one
+ *
+ * Returns: 0 on success, or -EINVAL if conversion has failed
+ *
+ * This function must be called from an NMI context with irq regs
+ * initialized.
+ **/
+int generic_handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq)
+{
+ WARN_ON_ONCE(!in_nmi());
+ return handle_irq_desc(irq_resolve_mapping(domain, hwirq));
+}
#endif
/* Dynamic interrupt handling */
@@ -408,30 +782,29 @@ void irq_free_descs(unsigned int from, unsigned int cnt)
if (from >= nr_irqs || (from + cnt) > nr_irqs)
return;
+ guard(mutex)(&sparse_irq_lock);
for (i = 0; i < cnt; i++)
free_desc(from + i);
-
- mutex_lock(&sparse_irq_lock);
- bitmap_clear(allocated_irqs, from, cnt);
- mutex_unlock(&sparse_irq_lock);
}
EXPORT_SYMBOL_GPL(irq_free_descs);
/**
- * irq_alloc_descs - allocate and initialize a range of irq descriptors
+ * __irq_alloc_descs - allocate and initialize a range of irq descriptors
* @irq: Allocate for specific irq number if irq >= 0
* @from: Start the search from this irq number
* @cnt: Number of consecutive irqs to allocate.
* @node: Preferred node on which the irq descriptor should be allocated
* @owner: Owning module (can be NULL)
+ * @affinity: Optional pointer to an affinity mask array of size @cnt which
+ * hints where the irq descriptors should be allocated and which
+ * default affinities to use
*
* Returns the first irq number or error code
*/
-int __ref
-__irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
- struct module *owner)
+int __ref __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
+ struct module *owner, const struct irq_affinity_desc *affinity)
{
- int start, ret;
+ int start;
if (!cnt)
return -EINVAL;
@@ -449,81 +822,20 @@ __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
from = arch_dynirq_lower_bound(from);
}
- mutex_lock(&sparse_irq_lock);
+ guard(mutex)(&sparse_irq_lock);
- start = bitmap_find_next_zero_area(allocated_irqs, IRQ_BITMAP_BITS,
- from, cnt, 0);
- ret = -EEXIST;
+ start = irq_find_free_area(from, cnt);
if (irq >=0 && start != irq)
- goto err;
+ return -EEXIST;
if (start + cnt > nr_irqs) {
- ret = irq_expand_nr_irqs(start + cnt);
- if (ret)
- goto err;
+ if (!irq_expand_nr_irqs(start + cnt))
+ return -ENOMEM;
}
-
- bitmap_set(allocated_irqs, start, cnt);
- mutex_unlock(&sparse_irq_lock);
- return alloc_descs(start, cnt, node, owner);
-
-err:
- mutex_unlock(&sparse_irq_lock);
- return ret;
+ return alloc_descs(start, cnt, node, affinity, owner);
}
EXPORT_SYMBOL_GPL(__irq_alloc_descs);
-#ifdef CONFIG_GENERIC_IRQ_LEGACY_ALLOC_HWIRQ
-/**
- * irq_alloc_hwirqs - Allocate an irq descriptor and initialize the hardware
- * @cnt: number of interrupts to allocate
- * @node: node on which to allocate
- *
- * Returns an interrupt number > 0 or 0, if the allocation fails.
- */
-unsigned int irq_alloc_hwirqs(int cnt, int node)
-{
- int i, irq = __irq_alloc_descs(-1, 0, cnt, node, NULL);
-
- if (irq < 0)
- return 0;
-
- for (i = irq; cnt > 0; i++, cnt--) {
- if (arch_setup_hwirq(i, node))
- goto err;
- irq_clear_status_flags(i, _IRQ_NOREQUEST);
- }
- return irq;
-
-err:
- for (i--; i >= irq; i--) {
- irq_set_status_flags(i, _IRQ_NOREQUEST | _IRQ_NOPROBE);
- arch_teardown_hwirq(i);
- }
- irq_free_descs(irq, cnt);
- return 0;
-}
-EXPORT_SYMBOL_GPL(irq_alloc_hwirqs);
-
-/**
- * irq_free_hwirqs - Free irq descriptor and cleanup the hardware
- * @from: Free from irq number
- * @cnt: number of interrupts to free
- *
- */
-void irq_free_hwirqs(unsigned int from, int cnt)
-{
- int i, j;
-
- for (i = from, j = cnt; j > 0; i++, j--) {
- irq_set_status_flags(i, _IRQ_NOREQUEST | _IRQ_NOPROBE);
- arch_teardown_hwirq(i);
- }
- irq_free_descs(from, cnt);
-}
-EXPORT_SYMBOL_GPL(irq_free_hwirqs);
-#endif
-
/**
* irq_get_next_irq - get next allocated irq number
* @offset: where to start the search
@@ -532,34 +844,35 @@ EXPORT_SYMBOL_GPL(irq_free_hwirqs);
*/
unsigned int irq_get_next_irq(unsigned int offset)
{
- return find_next_bit(allocated_irqs, nr_irqs, offset);
+ return irq_find_at_or_after(offset);
}
-struct irq_desc *
-__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus,
- unsigned int check)
+struct irq_desc *__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus,
+ unsigned int check)
{
- struct irq_desc *desc = irq_to_desc(irq);
+ struct irq_desc *desc;
- if (desc) {
- if (check & _IRQ_DESC_CHECK) {
- if ((check & _IRQ_DESC_PERCPU) &&
- !irq_settings_is_per_cpu_devid(desc))
- return NULL;
-
- if (!(check & _IRQ_DESC_PERCPU) &&
- irq_settings_is_per_cpu_devid(desc))
- return NULL;
- }
+ desc = irq_to_desc(irq);
+ if (!desc)
+ return NULL;
- if (bus)
- chip_bus_lock(desc);
- raw_spin_lock_irqsave(&desc->lock, *flags);
+ if (check & _IRQ_DESC_CHECK) {
+ if ((check & _IRQ_DESC_PERCPU) && !irq_settings_is_per_cpu_devid(desc))
+ return NULL;
+
+ if (!(check & _IRQ_DESC_PERCPU) && irq_settings_is_per_cpu_devid(desc))
+ return NULL;
}
+
+ if (bus)
+ chip_bus_lock(desc);
+ raw_spin_lock_irqsave(&desc->lock, *flags);
+
return desc;
}
void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus)
+ __releases(&desc->lock)
{
raw_spin_unlock_irqrestore(&desc->lock, flags);
if (bus)
@@ -570,10 +883,7 @@ int irq_set_percpu_devid(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
- if (!desc)
- return -EINVAL;
-
- if (desc->percpu_enabled)
+ if (!desc || desc->percpu_enabled)
return -EINVAL;
desc->percpu_enabled = kzalloc(sizeof(*desc->percpu_enabled), GFP_KERNEL);
@@ -587,7 +897,7 @@ int irq_set_percpu_devid(unsigned int irq)
void kstat_incr_irq_this_cpu(unsigned int irq)
{
- kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
+ kstat_incr_irqs_this_cpu(irq_to_desc(irq));
}
/**
@@ -603,46 +913,88 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
{
struct irq_desc *desc = irq_to_desc(irq);
- return desc && desc->kstat_irqs ?
- *per_cpu_ptr(desc->kstat_irqs, cpu) : 0;
+ return desc && desc->kstat_irqs ? per_cpu(desc->kstat_irqs->cnt, cpu) : 0;
}
-/**
- * kstat_irqs - Get the statistics for an interrupt
- * @irq: The interrupt number
- *
- * Returns the sum of interrupt counts on all cpus since boot for
- * @irq. The caller must ensure that the interrupt is not removed
- * concurrently.
- */
-unsigned int kstat_irqs(unsigned int irq)
+static unsigned int kstat_irqs_desc(struct irq_desc *desc, const struct cpumask *cpumask)
{
- struct irq_desc *desc = irq_to_desc(irq);
+ unsigned int sum = 0;
int cpu;
- int sum = 0;
+
+ if (!irq_settings_is_per_cpu_devid(desc) &&
+ !irq_settings_is_per_cpu(desc) &&
+ !irq_is_nmi(desc))
+ return data_race(desc->tot_count);
+
+ for_each_cpu(cpu, cpumask)
+ sum += data_race(per_cpu(desc->kstat_irqs->cnt, cpu));
+ return sum;
+}
+
+static unsigned int kstat_irqs(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
if (!desc || !desc->kstat_irqs)
return 0;
- for_each_possible_cpu(cpu)
- sum += *per_cpu_ptr(desc->kstat_irqs, cpu);
- return sum;
+ return kstat_irqs_desc(desc, cpu_possible_mask);
}
+#ifdef CONFIG_GENERIC_IRQ_STAT_SNAPSHOT
+
+void kstat_snapshot_irqs(void)
+{
+ struct irq_desc *desc;
+ unsigned int irq;
+
+ for_each_irq_desc(irq, desc) {
+ if (!desc->kstat_irqs)
+ continue;
+ this_cpu_write(desc->kstat_irqs->ref, this_cpu_read(desc->kstat_irqs->cnt));
+ }
+}
+
+unsigned int kstat_get_irq_since_snapshot(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ if (!desc || !desc->kstat_irqs)
+ return 0;
+ return this_cpu_read(desc->kstat_irqs->cnt) - this_cpu_read(desc->kstat_irqs->ref);
+}
+
+#endif
+
/**
- * kstat_irqs_usr - Get the statistics for an interrupt
+ * kstat_irqs_usr - Get the statistics for an interrupt from thread context
* @irq: The interrupt number
*
- * Returns the sum of interrupt counts on all cpus since boot for
- * @irq. Contrary to kstat_irqs() this can be called from any
- * preemptible context. It's protected against concurrent removal of
- * an interrupt descriptor when sparse irqs are enabled.
+ * Returns the sum of interrupt counts on all cpus since boot for @irq.
+ *
+ * It uses rcu to protect the access since a concurrent removal of an
+ * interrupt descriptor is observing an rcu grace period before
+ * delayed_free_desc()/irq_kobj_release().
*/
unsigned int kstat_irqs_usr(unsigned int irq)
{
- int sum;
+ unsigned int sum;
- irq_lock_sparse();
+ rcu_read_lock();
sum = kstat_irqs(irq);
- irq_unlock_sparse();
+ rcu_read_unlock();
return sum;
}
+
+#ifdef CONFIG_LOCKDEP
+void __irq_set_lockdep_class(unsigned int irq, struct lock_class_key *lock_class,
+ struct lock_class_key *request_class)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ if (desc) {
+ lockdep_set_class(&desc->lock, lock_class);
+ lockdep_set_class(&desc->request_mutex, request_class);
+ }
+}
+EXPORT_SYMBOL_GPL(__irq_set_lockdep_class);
+#endif
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 7fac311057b8..2652c4cfd877 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1,5 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0
+
#define pr_fmt(fmt) "irq: " fmt
+#include <linux/acpi.h>
#include <linux/debugfs.h>
#include <linux/hardirq.h>
#include <linux/interrupt.h>
@@ -20,56 +23,357 @@
static LIST_HEAD(irq_domain_list);
static DEFINE_MUTEX(irq_domain_mutex);
-static DEFINE_MUTEX(revmap_trees_mutex);
static struct irq_domain *irq_default_domain;
-static int irq_domain_alloc_descs(int virq, unsigned int nr_irqs,
- irq_hw_number_t hwirq, int node);
+static int irq_domain_alloc_irqs_locked(struct irq_domain *domain, int irq_base,
+ unsigned int nr_irqs, int node, void *arg,
+ bool realloc, const struct irq_affinity_desc *affinity);
static void irq_domain_check_hierarchy(struct irq_domain *domain);
+static void irq_domain_free_one_irq(struct irq_domain *domain, unsigned int virq);
+
+struct irqchip_fwid {
+ struct fwnode_handle fwnode;
+ unsigned int type;
+ char *name;
+ phys_addr_t *pa;
+};
+
+#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
+static void debugfs_add_domain_dir(struct irq_domain *d);
+static void debugfs_remove_domain_dir(struct irq_domain *d);
+#else
+static inline void debugfs_add_domain_dir(struct irq_domain *d) { }
+static inline void debugfs_remove_domain_dir(struct irq_domain *d) { }
+#endif
+
+static const char *irqchip_fwnode_get_name(const struct fwnode_handle *fwnode)
+{
+ struct irqchip_fwid *fwid = container_of(fwnode, struct irqchip_fwid, fwnode);
+
+ return fwid->name;
+}
+
+const struct fwnode_operations irqchip_fwnode_ops = {
+ .get_name = irqchip_fwnode_get_name,
+};
+EXPORT_SYMBOL_GPL(irqchip_fwnode_ops);
/**
- * __irq_domain_add() - Allocate a new irq_domain data structure
- * @of_node: optional device-tree node of the interrupt controller
- * @size: Size of linear map; 0 for radix mapping only
- * @hwirq_max: Maximum number of interrupts supported by controller
- * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no
- * direct mapping
- * @ops: domain callbacks
- * @host_data: Controller private data pointer
+ * __irq_domain_alloc_fwnode - Allocate a fwnode_handle suitable for
+ * identifying an irq domain
+ * @type: Type of irqchip_fwnode. See linux/irqdomain.h
+ * @id: Optional user provided id if name != NULL
+ * @name: Optional user provided domain name
+ * @pa: Optional user-provided physical address
*
- * Allocates and initialize and irq_domain structure.
- * Returns pointer to IRQ domain, or NULL on failure.
+ * Allocate a struct irqchip_fwid, and return a pointer to the embedded
+ * fwnode_handle (or NULL on failure).
+ *
+ * Note: The types IRQCHIP_FWNODE_NAMED and IRQCHIP_FWNODE_NAMED_ID are
+ * solely to transport name information to irqdomain creation code. The
+ * node is not stored. For other types the pointer is kept in the irq
+ * domain struct.
*/
-struct irq_domain *__irq_domain_add(struct device_node *of_node, int size,
- irq_hw_number_t hwirq_max, int direct_max,
- const struct irq_domain_ops *ops,
- void *host_data)
+struct fwnode_handle *__irq_domain_alloc_fwnode(unsigned int type, int id,
+ const char *name,
+ phys_addr_t *pa)
{
- struct irq_domain *domain;
+ struct irqchip_fwid *fwid;
+ char *n;
+
+ fwid = kzalloc(sizeof(*fwid), GFP_KERNEL);
+
+ switch (type) {
+ case IRQCHIP_FWNODE_NAMED:
+ n = kasprintf(GFP_KERNEL, "%s", name);
+ break;
+ case IRQCHIP_FWNODE_NAMED_ID:
+ n = kasprintf(GFP_KERNEL, "%s-%d", name, id);
+ break;
+ default:
+ n = kasprintf(GFP_KERNEL, "irqchip@%pa", pa);
+ break;
+ }
- domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size),
- GFP_KERNEL, of_node_to_nid(of_node));
- if (WARN_ON(!domain))
+ if (!fwid || !n) {
+ kfree(fwid);
+ kfree(n);
return NULL;
+ }
+
+ fwid->type = type;
+ fwid->name = n;
+ fwid->pa = pa;
+ fwnode_init(&fwid->fwnode, &irqchip_fwnode_ops);
+ return &fwid->fwnode;
+}
+EXPORT_SYMBOL_GPL(__irq_domain_alloc_fwnode);
+
+/**
+ * irq_domain_free_fwnode - Free a non-OF-backed fwnode_handle
+ * @fwnode: fwnode_handle to free
+ *
+ * Free a fwnode_handle allocated with irq_domain_alloc_fwnode.
+ */
+void irq_domain_free_fwnode(struct fwnode_handle *fwnode)
+{
+ struct irqchip_fwid *fwid;
+
+ if (!fwnode || WARN_ON(!is_fwnode_irqchip(fwnode)))
+ return;
+
+ fwid = container_of(fwnode, struct irqchip_fwid, fwnode);
+ kfree(fwid->name);
+ kfree(fwid);
+}
+EXPORT_SYMBOL_GPL(irq_domain_free_fwnode);
+
+static int alloc_name(struct irq_domain *domain, char *base, enum irq_domain_bus_token bus_token)
+{
+ if (bus_token == DOMAIN_BUS_ANY)
+ domain->name = kasprintf(GFP_KERNEL, "%s", base);
+ else
+ domain->name = kasprintf(GFP_KERNEL, "%s-%d", base, bus_token);
+ if (!domain->name)
+ return -ENOMEM;
+
+ domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED;
+ return 0;
+}
+
+static int alloc_fwnode_name(struct irq_domain *domain, const struct fwnode_handle *fwnode,
+ enum irq_domain_bus_token bus_token, const char *suffix)
+{
+ const char *sep = suffix ? "-" : "";
+ const char *suf = suffix ? : "";
+ char *name;
+
+ if (bus_token == DOMAIN_BUS_ANY)
+ name = kasprintf(GFP_KERNEL, "%pfw%s%s", fwnode, sep, suf);
+ else
+ name = kasprintf(GFP_KERNEL, "%pfw%s%s-%d", fwnode, sep, suf, bus_token);
+ if (!name)
+ return -ENOMEM;
+
+ /*
+ * fwnode paths contain '/', which debugfs is legitimately unhappy
+ * about. Replace them with ':', which does the trick and is not as
+ * offensive as '\'...
+ */
+ domain->name = strreplace(name, '/', ':');
+ domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED;
+ return 0;
+}
+
+static int alloc_unknown_name(struct irq_domain *domain, enum irq_domain_bus_token bus_token)
+{
+ static atomic_t unknown_domains;
+ int id = atomic_inc_return(&unknown_domains);
+
+ if (bus_token == DOMAIN_BUS_ANY)
+ domain->name = kasprintf(GFP_KERNEL, "unknown-%d", id);
+ else
+ domain->name = kasprintf(GFP_KERNEL, "unknown-%d-%d", id, bus_token);
+ if (!domain->name)
+ return -ENOMEM;
+
+ domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED;
+ return 0;
+}
+
+static int irq_domain_set_name(struct irq_domain *domain, const struct irq_domain_info *info)
+{
+ enum irq_domain_bus_token bus_token = info->bus_token;
+ const struct fwnode_handle *fwnode = info->fwnode;
+
+ if (is_fwnode_irqchip(fwnode)) {
+ struct irqchip_fwid *fwid = container_of(fwnode, struct irqchip_fwid, fwnode);
+
+ /*
+ * The name_suffix is only intended to be used to avoid a name
+ * collision when multiple domains are created for a single
+ * device and the name is picked using a real device node.
+ * (Typical use-case is regmap-IRQ controllers for devices
+ * providing more than one physical IRQ.) There should be no
+ * need to use name_suffix with irqchip-fwnode.
+ */
+ if (info->name_suffix)
+ return -EINVAL;
+
+ switch (fwid->type) {
+ case IRQCHIP_FWNODE_NAMED:
+ case IRQCHIP_FWNODE_NAMED_ID:
+ return alloc_name(domain, fwid->name, bus_token);
+ default:
+ domain->name = fwid->name;
+ if (bus_token != DOMAIN_BUS_ANY)
+ return alloc_name(domain, fwid->name, bus_token);
+ }
+
+ } else if (is_of_node(fwnode) || is_acpi_device_node(fwnode) || is_software_node(fwnode)) {
+ return alloc_fwnode_name(domain, fwnode, bus_token, info->name_suffix);
+ }
+
+ if (domain->name)
+ return 0;
+
+ if (fwnode)
+ pr_err("Invalid fwnode type for irqdomain\n");
+ return alloc_unknown_name(domain, bus_token);
+}
+
+static struct irq_domain *__irq_domain_create(const struct irq_domain_info *info)
+{
+ struct irq_domain *domain;
+ int err;
+
+ if (WARN_ON((info->size && info->direct_max) ||
+ (!IS_ENABLED(CONFIG_IRQ_DOMAIN_NOMAP) && info->direct_max) ||
+ (info->direct_max && info->direct_max != info->hwirq_max)))
+ return ERR_PTR(-EINVAL);
+
+ domain = kzalloc_node(struct_size(domain, revmap, info->size),
+ GFP_KERNEL, of_node_to_nid(to_of_node(info->fwnode)));
+ if (!domain)
+ return ERR_PTR(-ENOMEM);
+
+ err = irq_domain_set_name(domain, info);
+ if (err) {
+ kfree(domain);
+ return ERR_PTR(err);
+ }
+
+ domain->fwnode = fwnode_handle_get(info->fwnode);
+ fwnode_dev_initialized(domain->fwnode, true);
/* Fill structure */
INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL);
- domain->ops = ops;
- domain->host_data = host_data;
- domain->of_node = of_node_get(of_node);
- domain->hwirq_max = hwirq_max;
- domain->revmap_size = size;
- domain->revmap_direct_max_irq = direct_max;
+ domain->ops = info->ops;
+ domain->host_data = info->host_data;
+ domain->bus_token = info->bus_token;
+ domain->hwirq_max = info->hwirq_max;
+
+ if (info->direct_max)
+ domain->flags |= IRQ_DOMAIN_FLAG_NO_MAP;
+
+ domain->revmap_size = info->size;
+
+ /*
+ * Hierarchical domains use the domain lock of the root domain
+ * (innermost domain).
+ *
+ * For non-hierarchical domains (as for root domains), the root
+ * pointer is set to the domain itself so that &domain->root->mutex
+ * always points to the right lock.
+ */
+ mutex_init(&domain->mutex);
+ domain->root = domain;
+
irq_domain_check_hierarchy(domain);
+ return domain;
+}
+
+static void __irq_domain_publish(struct irq_domain *domain)
+{
mutex_lock(&irq_domain_mutex);
+ debugfs_add_domain_dir(domain);
list_add(&domain->link, &irq_domain_list);
mutex_unlock(&irq_domain_mutex);
pr_debug("Added domain %s\n", domain->name);
+}
+
+static void irq_domain_free(struct irq_domain *domain)
+{
+ fwnode_dev_initialized(domain->fwnode, false);
+ fwnode_handle_put(domain->fwnode);
+ if (domain->flags & IRQ_DOMAIN_NAME_ALLOCATED)
+ kfree(domain->name);
+ kfree(domain);
+}
+
+static void irq_domain_instantiate_descs(const struct irq_domain_info *info)
+{
+ if (!IS_ENABLED(CONFIG_SPARSE_IRQ))
+ return;
+
+ if (irq_alloc_descs(info->virq_base, info->virq_base, info->size,
+ of_node_to_nid(to_of_node(info->fwnode))) < 0) {
+ pr_info("Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n",
+ info->virq_base);
+ }
+}
+
+static struct irq_domain *__irq_domain_instantiate(const struct irq_domain_info *info,
+ bool cond_alloc_descs, bool force_associate)
+{
+ struct irq_domain *domain;
+ int err;
+
+ domain = __irq_domain_create(info);
+ if (IS_ERR(domain))
+ return domain;
+
+ domain->flags |= info->domain_flags;
+ domain->exit = info->exit;
+ domain->dev = info->dev;
+
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+ if (info->parent) {
+ domain->root = info->parent->root;
+ domain->parent = info->parent;
+ }
+#endif
+
+ if (info->dgc_info) {
+ err = irq_domain_alloc_generic_chips(domain, info->dgc_info);
+ if (err)
+ goto err_domain_free;
+ }
+
+ if (info->init) {
+ err = info->init(domain);
+ if (err)
+ goto err_domain_gc_remove;
+ }
+
+ __irq_domain_publish(domain);
+
+ if (cond_alloc_descs && info->virq_base > 0)
+ irq_domain_instantiate_descs(info);
+
+ /*
+ * Legacy interrupt domains have a fixed Linux interrupt number
+ * associated. Other interrupt domains can request association by
+ * providing a Linux interrupt number > 0.
+ */
+ if (force_associate || info->virq_base > 0) {
+ irq_domain_associate_many(domain, info->virq_base, info->hwirq_base,
+ info->size - info->hwirq_base);
+ }
+
return domain;
+
+err_domain_gc_remove:
+ if (info->dgc_info)
+ irq_domain_remove_generic_chips(domain);
+err_domain_free:
+ irq_domain_free(domain);
+ return ERR_PTR(err);
+}
+
+/**
+ * irq_domain_instantiate() - Instantiate a new irq domain data structure
+ * @info: Domain information pointer pointing to the information for this domain
+ *
+ * Return: A pointer to the instantiated irq domain or an ERR_PTR value.
+ */
+struct irq_domain *irq_domain_instantiate(const struct irq_domain_info *info)
+{
+ return __irq_domain_instantiate(info, false, false);
}
-EXPORT_SYMBOL_GPL(__irq_domain_add);
+EXPORT_SYMBOL_GPL(irq_domain_instantiate);
/**
* irq_domain_remove() - Remove an irq domain.
@@ -81,14 +385,13 @@ EXPORT_SYMBOL_GPL(__irq_domain_add);
*/
void irq_domain_remove(struct irq_domain *domain)
{
+ if (domain->exit)
+ domain->exit(domain);
+
mutex_lock(&irq_domain_mutex);
+ debugfs_remove_domain_dir(domain);
- /*
- * radix_tree_delete() takes care of destroying the root
- * node when all entries are removed. Shout if there are
- * any mappings left.
- */
- WARN_ON(domain->revmap_tree.height);
+ WARN_ON(!radix_tree_empty(&domain->revmap_tree));
list_del(&domain->link);
@@ -96,20 +399,53 @@ void irq_domain_remove(struct irq_domain *domain)
* If the going away domain is the default one, reset it.
*/
if (unlikely(irq_default_domain == domain))
- irq_set_default_host(NULL);
+ irq_set_default_domain(NULL);
mutex_unlock(&irq_domain_mutex);
- pr_debug("Removed domain %s\n", domain->name);
+ if (domain->flags & IRQ_DOMAIN_FLAG_DESTROY_GC)
+ irq_domain_remove_generic_chips(domain);
- of_node_put(domain->of_node);
- kfree(domain);
+ pr_debug("Removed domain %s\n", domain->name);
+ irq_domain_free(domain);
}
EXPORT_SYMBOL_GPL(irq_domain_remove);
+void irq_domain_update_bus_token(struct irq_domain *domain,
+ enum irq_domain_bus_token bus_token)
+{
+ char *name;
+
+ if (domain->bus_token == bus_token)
+ return;
+
+ mutex_lock(&irq_domain_mutex);
+
+ domain->bus_token = bus_token;
+
+ name = kasprintf(GFP_KERNEL, "%s-%d", domain->name, bus_token);
+ if (!name) {
+ mutex_unlock(&irq_domain_mutex);
+ return;
+ }
+
+ debugfs_remove_domain_dir(domain);
+
+ if (domain->flags & IRQ_DOMAIN_NAME_ALLOCATED)
+ kfree(domain->name);
+ else
+ domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED;
+
+ domain->name = name;
+ debugfs_add_domain_dir(domain);
+
+ mutex_unlock(&irq_domain_mutex);
+}
+EXPORT_SYMBOL_GPL(irq_domain_update_bus_token);
+
/**
- * irq_domain_add_simple() - Register an irq_domain and optionally map a range of irqs
- * @of_node: pointer to interrupt controller's device tree node.
+ * irq_domain_create_simple() - Register an irq_domain and optionally map a range of irqs
+ * @fwnode: firmware node for the interrupt controller
* @size: total number of irqs in mapping
* @first_irq: first number of irq block assigned to the domain,
* pass zero to assign irqs on-the-fly. If first_irq is non-zero, then
@@ -125,87 +461,80 @@ EXPORT_SYMBOL_GPL(irq_domain_remove);
* irqs get mapped dynamically on the fly. However, if the controller requires
* static virq assignments (non-DT boot) then it will set that up correctly.
*/
-struct irq_domain *irq_domain_add_simple(struct device_node *of_node,
- unsigned int size,
- unsigned int first_irq,
- const struct irq_domain_ops *ops,
- void *host_data)
+struct irq_domain *irq_domain_create_simple(struct fwnode_handle *fwnode,
+ unsigned int size,
+ unsigned int first_irq,
+ const struct irq_domain_ops *ops,
+ void *host_data)
{
- struct irq_domain *domain;
-
- domain = __irq_domain_add(of_node, size, size, 0, ops, host_data);
- if (!domain)
- return NULL;
-
- if (first_irq > 0) {
- if (IS_ENABLED(CONFIG_SPARSE_IRQ)) {
- /* attempt to allocated irq_descs */
- int rc = irq_alloc_descs(first_irq, first_irq, size,
- of_node_to_nid(of_node));
- if (rc < 0)
- pr_info("Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n",
- first_irq);
- }
- irq_domain_associate_many(domain, first_irq, 0, size);
- }
-
- return domain;
+ struct irq_domain_info info = {
+ .fwnode = fwnode,
+ .size = size,
+ .hwirq_max = size,
+ .virq_base = first_irq,
+ .ops = ops,
+ .host_data = host_data,
+ };
+ struct irq_domain *domain = __irq_domain_instantiate(&info, true, false);
+
+ return IS_ERR(domain) ? NULL : domain;
}
-EXPORT_SYMBOL_GPL(irq_domain_add_simple);
+EXPORT_SYMBOL_GPL(irq_domain_create_simple);
-/**
- * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain.
- * @of_node: pointer to interrupt controller's device tree node.
- * @size: total number of irqs in legacy mapping
- * @first_irq: first number of irq block assigned to the domain
- * @first_hwirq: first hwirq number to use for the translation. Should normally
- * be '0', but a positive integer can be used if the effective
- * hwirqs numbering does not begin at zero.
- * @ops: map/unmap domain callbacks
- * @host_data: Controller private data pointer
- *
- * Note: the map() callback will be called before this function returns
- * for all legacy interrupts except 0 (which is always the invalid irq for
- * a legacy controller).
- */
-struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
+struct irq_domain *irq_domain_create_legacy(struct fwnode_handle *fwnode,
unsigned int size,
unsigned int first_irq,
irq_hw_number_t first_hwirq,
const struct irq_domain_ops *ops,
void *host_data)
{
- struct irq_domain *domain;
-
- domain = __irq_domain_add(of_node, first_hwirq + size,
- first_hwirq + size, 0, ops, host_data);
- if (domain)
- irq_domain_associate_many(domain, first_irq, first_hwirq, size);
-
- return domain;
+ struct irq_domain_info info = {
+ .fwnode = fwnode,
+ .size = first_hwirq + size,
+ .hwirq_max = first_hwirq + size,
+ .hwirq_base = first_hwirq,
+ .virq_base = first_irq,
+ .ops = ops,
+ .host_data = host_data,
+ };
+ struct irq_domain *domain = __irq_domain_instantiate(&info, false, true);
+
+ return IS_ERR(domain) ? NULL : domain;
}
-EXPORT_SYMBOL_GPL(irq_domain_add_legacy);
+EXPORT_SYMBOL_GPL(irq_domain_create_legacy);
/**
- * irq_find_host() - Locates a domain for a given device node
- * @node: device-tree node of the interrupt controller
+ * irq_find_matching_fwspec() - Locates a domain for a given fwspec
+ * @fwspec: FW specifier for an interrupt
+ * @bus_token: domain-specific data
*/
-struct irq_domain *irq_find_host(struct device_node *node)
+struct irq_domain *irq_find_matching_fwspec(struct irq_fwspec *fwspec,
+ enum irq_domain_bus_token bus_token)
{
struct irq_domain *h, *found = NULL;
+ struct fwnode_handle *fwnode = fwspec->fwnode;
int rc;
- /* We might want to match the legacy controller last since
+ /*
+ * We might want to match the legacy controller last since
* it might potentially be set to match all interrupts in
* the absence of a device node. This isn't a problem so far
* yet though...
+ *
+ * bus_token == DOMAIN_BUS_ANY matches any domain, any other
+ * values must generate an exact match for the domain to be
+ * selected.
*/
mutex_lock(&irq_domain_mutex);
list_for_each_entry(h, &irq_domain_list, link) {
- if (h->ops->match)
- rc = h->ops->match(h, node);
+ if (h->ops->select && bus_token != DOMAIN_BUS_ANY)
+ rc = h->ops->select(h, fwspec, bus_token);
+ else if (h->ops->match)
+ rc = h->ops->match(h, to_of_node(fwnode), bus_token);
else
- rc = (h->of_node != NULL) && (h->of_node == node);
+ rc = ((fwnode != NULL) && (h->fwnode == fwnode) &&
+ ((bus_token == DOMAIN_BUS_ANY) ||
+ (h->bus_token == bus_token)));
if (rc) {
found = h;
@@ -215,10 +544,10 @@ struct irq_domain *irq_find_host(struct device_node *node)
mutex_unlock(&irq_domain_mutex);
return found;
}
-EXPORT_SYMBOL_GPL(irq_find_host);
+EXPORT_SYMBOL_GPL(irq_find_matching_fwspec);
/**
- * irq_set_default_host() - Set a "default" irq domain
+ * irq_set_default_domain() - Set a "default" irq domain
* @domain: default domain pointer
*
* For convenience, it's possible to set a "default" domain that will be used
@@ -226,15 +555,69 @@ EXPORT_SYMBOL_GPL(irq_find_host);
* platforms that want to manipulate a few hard coded interrupt numbers that
* aren't properly represented in the device-tree.
*/
-void irq_set_default_host(struct irq_domain *domain)
+void irq_set_default_domain(struct irq_domain *domain)
{
pr_debug("Default domain set to @0x%p\n", domain);
irq_default_domain = domain;
}
-EXPORT_SYMBOL_GPL(irq_set_default_host);
+EXPORT_SYMBOL_GPL(irq_set_default_domain);
+
+/**
+ * irq_get_default_domain() - Retrieve the "default" irq domain
+ *
+ * Returns: the default domain, if any.
+ *
+ * Modern code should never use this. This should only be used on
+ * systems that cannot implement a firmware->fwnode mapping (which
+ * both DT and ACPI provide).
+ */
+struct irq_domain *irq_get_default_domain(void)
+{
+ return irq_default_domain;
+}
+EXPORT_SYMBOL_GPL(irq_get_default_domain);
+
+static bool irq_domain_is_nomap(struct irq_domain *domain)
+{
+ return IS_ENABLED(CONFIG_IRQ_DOMAIN_NOMAP) &&
+ (domain->flags & IRQ_DOMAIN_FLAG_NO_MAP);
+}
+
+static void irq_domain_clear_mapping(struct irq_domain *domain,
+ irq_hw_number_t hwirq)
+{
+ lockdep_assert_held(&domain->root->mutex);
+
+ if (irq_domain_is_nomap(domain))
+ return;
+
+ if (hwirq < domain->revmap_size)
+ rcu_assign_pointer(domain->revmap[hwirq], NULL);
+ else
+ radix_tree_delete(&domain->revmap_tree, hwirq);
+}
-void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)
+static void irq_domain_set_mapping(struct irq_domain *domain,
+ irq_hw_number_t hwirq,
+ struct irq_data *irq_data)
+{
+ /*
+ * This also makes sure that all domains point to the same root when
+ * called from irq_domain_insert_irq() for each domain in a hierarchy.
+ */
+ lockdep_assert_held(&domain->root->mutex);
+
+ if (irq_domain_is_nomap(domain))
+ return;
+
+ if (hwirq < domain->revmap_size)
+ rcu_assign_pointer(domain->revmap[hwirq], irq_data);
+ else
+ radix_tree_insert(&domain->revmap_tree, hwirq, irq_data);
+}
+
+static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)
{
struct irq_data *irq_data = irq_get_irq_data(irq);
irq_hw_number_t hwirq;
@@ -244,6 +627,9 @@ void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)
return;
hwirq = irq_data->hwirq;
+
+ mutex_lock(&domain->root->mutex);
+
irq_set_status_flags(irq, IRQ_NOREQUEST);
/* remove chip and handler */
@@ -259,19 +645,16 @@ void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)
irq_data->domain = NULL;
irq_data->hwirq = 0;
+ domain->mapcount--;
/* Clear reverse map for this hwirq */
- if (hwirq < domain->revmap_size) {
- domain->linear_revmap[hwirq] = 0;
- } else {
- mutex_lock(&revmap_trees_mutex);
- radix_tree_delete(&domain->revmap_tree, hwirq);
- mutex_unlock(&revmap_trees_mutex);
- }
+ irq_domain_clear_mapping(domain, hwirq);
+
+ mutex_unlock(&domain->root->mutex);
}
-int irq_domain_associate(struct irq_domain *domain, unsigned int virq,
- irq_hw_number_t hwirq)
+static int irq_domain_associate_locked(struct irq_domain *domain, unsigned int virq,
+ irq_hw_number_t hwirq)
{
struct irq_data *irq_data = irq_get_irq_data(virq);
int ret;
@@ -284,7 +667,6 @@ int irq_domain_associate(struct irq_domain *domain, unsigned int virq,
if (WARN(irq_data->domain, "error: virq%i is already associated", virq))
return -EINVAL;
- mutex_lock(&irq_domain_mutex);
irq_data->hwirq = hwirq;
irq_data->domain = domain;
if (domain->ops->map) {
@@ -301,44 +683,47 @@ int irq_domain_associate(struct irq_domain *domain, unsigned int virq,
}
irq_data->domain = NULL;
irq_data->hwirq = 0;
- mutex_unlock(&irq_domain_mutex);
return ret;
}
-
- /* If not already assigned, give the domain the chip's name */
- if (!domain->name && irq_data->chip)
- domain->name = irq_data->chip->name;
}
- if (hwirq < domain->revmap_size) {
- domain->linear_revmap[hwirq] = virq;
- } else {
- mutex_lock(&revmap_trees_mutex);
- radix_tree_insert(&domain->revmap_tree, hwirq, irq_data);
- mutex_unlock(&revmap_trees_mutex);
- }
- mutex_unlock(&irq_domain_mutex);
+ domain->mapcount++;
+ irq_domain_set_mapping(domain, hwirq, irq_data);
irq_clear_status_flags(virq, IRQ_NOREQUEST);
return 0;
}
+
+int irq_domain_associate(struct irq_domain *domain, unsigned int virq,
+ irq_hw_number_t hwirq)
+{
+ int ret;
+
+ mutex_lock(&domain->root->mutex);
+ ret = irq_domain_associate_locked(domain, virq, hwirq);
+ mutex_unlock(&domain->root->mutex);
+
+ return ret;
+}
EXPORT_SYMBOL_GPL(irq_domain_associate);
void irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base,
irq_hw_number_t hwirq_base, int count)
{
+ struct device_node *of_node;
int i;
+ of_node = irq_domain_get_of_node(domain);
pr_debug("%s(%s, irqbase=%i, hwbase=%i, count=%i)\n", __func__,
- of_node_full_name(domain->of_node), irq_base, (int)hwirq_base, count);
+ of_node_full_name(of_node), irq_base, (int)hwirq_base, count);
- for (i = 0; i < count; i++) {
+ for (i = 0; i < count; i++)
irq_domain_associate(domain, irq_base + i, hwirq_base + i);
- }
}
EXPORT_SYMBOL_GPL(irq_domain_associate_many);
+#ifdef CONFIG_IRQ_DOMAIN_NOMAP
/**
* irq_create_direct_mapping() - Allocate an irq for direct mapping
* @domain: domain to allocate the irq for or NULL for default domain
@@ -351,19 +736,21 @@ EXPORT_SYMBOL_GPL(irq_domain_associate_many);
*/
unsigned int irq_create_direct_mapping(struct irq_domain *domain)
{
+ struct device_node *of_node;
unsigned int virq;
if (domain == NULL)
domain = irq_default_domain;
- virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node));
+ of_node = irq_domain_get_of_node(domain);
+ virq = irq_alloc_desc_from(1, of_node_to_nid(of_node));
if (!virq) {
pr_debug("create_direct virq allocation failed\n");
return 0;
}
- if (virq >= domain->revmap_direct_max_irq) {
- pr_err("ERROR: no free irqs available below %i maximum\n",
- domain->revmap_direct_max_irq);
+ if (virq >= domain->hwirq_max) {
+ pr_err("ERROR: no free irqs available below %lu maximum\n",
+ domain->hwirq_max);
irq_free_desc(virq);
return 0;
}
@@ -377,141 +764,243 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain)
return virq;
}
EXPORT_SYMBOL_GPL(irq_create_direct_mapping);
+#endif
+
+static unsigned int irq_create_mapping_affinity_locked(struct irq_domain *domain,
+ irq_hw_number_t hwirq,
+ const struct irq_affinity_desc *affinity)
+{
+ struct device_node *of_node = irq_domain_get_of_node(domain);
+ int virq;
+
+ pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq);
+
+ /* Allocate a virtual interrupt number */
+ virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node),
+ affinity);
+ if (virq <= 0) {
+ pr_debug("-> virq allocation failed\n");
+ return 0;
+ }
+
+ if (irq_domain_associate_locked(domain, virq, hwirq)) {
+ irq_free_desc(virq);
+ return 0;
+ }
+
+ pr_debug("irq %lu on domain %s mapped to virtual irq %u\n",
+ hwirq, of_node_full_name(of_node), virq);
+
+ return virq;
+}
/**
- * irq_create_mapping() - Map a hardware interrupt into linux irq space
+ * irq_create_mapping_affinity() - Map a hardware interrupt into linux irq space
* @domain: domain owning this hardware interrupt or NULL for default domain
* @hwirq: hardware irq number in that domain space
+ * @affinity: irq affinity
*
* Only one mapping per hardware interrupt is permitted. Returns a linux
* irq number.
* If the sense/trigger is to be specified, set_irq_type() should be called
* on the number returned from that call.
*/
-unsigned int irq_create_mapping(struct irq_domain *domain,
- irq_hw_number_t hwirq)
+unsigned int irq_create_mapping_affinity(struct irq_domain *domain,
+ irq_hw_number_t hwirq,
+ const struct irq_affinity_desc *affinity)
{
int virq;
- pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq);
-
- /* Look for default domain if nececssary */
+ /* Look for default domain if necessary */
if (domain == NULL)
domain = irq_default_domain;
if (domain == NULL) {
WARN(1, "%s(, %lx) called with NULL domain\n", __func__, hwirq);
return 0;
}
- pr_debug("-> using domain @%p\n", domain);
+
+ mutex_lock(&domain->root->mutex);
/* Check if mapping already exists */
virq = irq_find_mapping(domain, hwirq);
if (virq) {
- pr_debug("-> existing mapping on virq %d\n", virq);
- return virq;
+ pr_debug("existing mapping on virq %d\n", virq);
+ goto out;
}
- /* Allocate a virtual interrupt number */
- virq = irq_domain_alloc_descs(-1, 1, hwirq,
- of_node_to_nid(domain->of_node));
- if (virq <= 0) {
- pr_debug("-> virq allocation failed\n");
- return 0;
- }
+ virq = irq_create_mapping_affinity_locked(domain, hwirq, affinity);
+out:
+ mutex_unlock(&domain->root->mutex);
- if (irq_domain_associate(domain, virq, hwirq)) {
- irq_free_desc(virq);
- return 0;
- }
+ return virq;
+}
+EXPORT_SYMBOL_GPL(irq_create_mapping_affinity);
- pr_debug("irq %lu on domain %s mapped to virtual irq %u\n",
- hwirq, of_node_full_name(domain->of_node), virq);
+static int irq_domain_translate(struct irq_domain *d,
+ struct irq_fwspec *fwspec,
+ irq_hw_number_t *hwirq, unsigned int *type)
+{
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+ if (d->ops->translate)
+ return d->ops->translate(d, fwspec, hwirq, type);
+#endif
+ if (d->ops->xlate)
+ return d->ops->xlate(d, to_of_node(fwspec->fwnode),
+ fwspec->param, fwspec->param_count,
+ hwirq, type);
- return virq;
+ /* If domain has no translation, then we assume interrupt line */
+ *hwirq = fwspec->param[0];
+ return 0;
}
-EXPORT_SYMBOL_GPL(irq_create_mapping);
-/**
- * irq_create_strict_mappings() - Map a range of hw irqs to fixed linux irqs
- * @domain: domain owning the interrupt range
- * @irq_base: beginning of linux IRQ range
- * @hwirq_base: beginning of hardware IRQ range
- * @count: Number of interrupts to map
- *
- * This routine is used for allocating and mapping a range of hardware
- * irqs to linux irqs where the linux irq numbers are at pre-defined
- * locations. For use by controllers that already have static mappings
- * to insert in to the domain.
- *
- * Non-linear users can use irq_create_identity_mapping() for IRQ-at-a-time
- * domain insertion.
- *
- * 0 is returned upon success, while any failure to establish a static
- * mapping is treated as an error.
- */
-int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base,
- irq_hw_number_t hwirq_base, int count)
+void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args,
+ unsigned int count, struct irq_fwspec *fwspec)
{
- int ret;
+ int i;
- ret = irq_alloc_descs(irq_base, irq_base, count,
- of_node_to_nid(domain->of_node));
- if (unlikely(ret < 0))
- return ret;
+ fwspec->fwnode = of_fwnode_handle(np);
+ fwspec->param_count = count;
- irq_domain_associate_many(domain, irq_base, hwirq_base, count);
- return 0;
+ for (i = 0; i < count; i++)
+ fwspec->param[i] = args[i];
}
-EXPORT_SYMBOL_GPL(irq_create_strict_mappings);
+EXPORT_SYMBOL_GPL(of_phandle_args_to_fwspec);
-unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data)
+static struct irq_domain *fwspec_to_domain(struct irq_fwspec *fwspec)
{
struct irq_domain *domain;
- irq_hw_number_t hwirq;
+
+ if (fwspec->fwnode) {
+ domain = irq_find_matching_fwspec(fwspec, DOMAIN_BUS_WIRED);
+ if (!domain)
+ domain = irq_find_matching_fwspec(fwspec, DOMAIN_BUS_ANY);
+ } else {
+ domain = irq_default_domain;
+ }
+
+ return domain;
+}
+
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+int irq_populate_fwspec_info(struct irq_fwspec *fwspec, struct irq_fwspec_info *info)
+{
+ struct irq_domain *domain = fwspec_to_domain(fwspec);
+
+ memset(info, 0, sizeof(*info));
+
+ if (!domain || !domain->ops->get_fwspec_info)
+ return 0;
+
+ return domain->ops->get_fwspec_info(fwspec, info);
+}
+#endif
+
+unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
+{
unsigned int type = IRQ_TYPE_NONE;
+ struct irq_domain *domain;
+ struct irq_data *irq_data;
+ irq_hw_number_t hwirq;
int virq;
- domain = irq_data->np ? irq_find_host(irq_data->np) : irq_default_domain;
+ domain = fwspec_to_domain(fwspec);
if (!domain) {
pr_warn("no irq domain found for %s !\n",
- of_node_full_name(irq_data->np));
+ of_node_full_name(to_of_node(fwspec->fwnode)));
return 0;
}
- /* If domain has no translation, then we assume interrupt line */
- if (domain->ops->xlate == NULL)
- hwirq = irq_data->args[0];
- else {
- if (domain->ops->xlate(domain, irq_data->np, irq_data->args,
- irq_data->args_count, &hwirq, &type))
- return 0;
- }
+ if (irq_domain_translate(domain, fwspec, &hwirq, &type))
+ return 0;
- if (irq_domain_is_hierarchy(domain)) {
+ /*
+ * WARN if the irqchip returns a type with bits
+ * outside the sense mask set and clear these bits.
+ */
+ if (WARN_ON(type & ~IRQ_TYPE_SENSE_MASK))
+ type &= IRQ_TYPE_SENSE_MASK;
+
+ mutex_lock(&domain->root->mutex);
+
+ /*
+ * If we've already configured this interrupt,
+ * don't do it again, or hell will break loose.
+ */
+ virq = irq_find_mapping(domain, hwirq);
+ if (virq) {
/*
- * If we've already configured this interrupt,
- * don't do it again, or hell will break loose.
+ * If the trigger type is not specified or matches the
+ * current trigger type then we are done so return the
+ * interrupt number.
*/
- virq = irq_find_mapping(domain, hwirq);
- if (virq)
- return virq;
+ if (type == IRQ_TYPE_NONE || type == irq_get_trigger_type(virq))
+ goto out;
- virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, irq_data);
- if (virq <= 0)
- return 0;
+ /*
+ * If the trigger type has not been set yet, then set
+ * it now and return the interrupt number.
+ */
+ if (irq_get_trigger_type(virq) == IRQ_TYPE_NONE) {
+ irq_data = irq_get_irq_data(virq);
+ if (!irq_data) {
+ virq = 0;
+ goto out;
+ }
+
+ irqd_set_trigger_type(irq_data, type);
+ goto out;
+ }
+
+ pr_warn("type mismatch, failed to map hwirq-%lu for %s!\n",
+ hwirq, of_node_full_name(to_of_node(fwspec->fwnode)));
+ virq = 0;
+ goto out;
+ }
+
+ if (irq_domain_is_hierarchy(domain)) {
+ if (irq_domain_is_msi_device(domain)) {
+ mutex_unlock(&domain->root->mutex);
+ virq = msi_device_domain_alloc_wired(domain, hwirq, type);
+ mutex_lock(&domain->root->mutex);
+ } else
+ virq = irq_domain_alloc_irqs_locked(domain, -1, 1, NUMA_NO_NODE,
+ fwspec, false, NULL);
+ if (virq <= 0) {
+ virq = 0;
+ goto out;
+ }
} else {
/* Create mapping */
- virq = irq_create_mapping(domain, hwirq);
+ virq = irq_create_mapping_affinity_locked(domain, hwirq, NULL);
if (!virq)
- return virq;
+ goto out;
+ }
+
+ irq_data = irq_get_irq_data(virq);
+ if (WARN_ON(!irq_data)) {
+ virq = 0;
+ goto out;
}
- /* Set type if specified and different than the current one */
- if (type != IRQ_TYPE_NONE &&
- type != irq_get_trigger_type(virq))
- irq_set_irq_type(virq, type);
+ /* Store trigger type */
+ irqd_set_trigger_type(irq_data, type);
+out:
+ mutex_unlock(&domain->root->mutex);
+
return virq;
}
+EXPORT_SYMBOL_GPL(irq_create_fwspec_mapping);
+
+unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data)
+{
+ struct irq_fwspec fwspec;
+
+ of_phandle_args_to_fwspec(irq_data->np, irq_data->args,
+ irq_data->args_count, &fwspec);
+
+ return irq_create_fwspec_mapping(&fwspec);
+}
EXPORT_SYMBOL_GPL(irq_create_of_mapping);
/**
@@ -520,144 +1009,85 @@ EXPORT_SYMBOL_GPL(irq_create_of_mapping);
*/
void irq_dispose_mapping(unsigned int virq)
{
- struct irq_data *irq_data = irq_get_irq_data(virq);
+ struct irq_data *irq_data;
struct irq_domain *domain;
- if (!virq || !irq_data)
+ irq_data = virq ? irq_get_irq_data(virq) : NULL;
+ if (!irq_data)
return;
domain = irq_data->domain;
if (WARN_ON(domain == NULL))
return;
- irq_domain_disassociate(domain, virq);
- irq_free_desc(virq);
+ if (irq_domain_is_hierarchy(domain)) {
+ irq_domain_free_one_irq(domain, virq);
+ } else {
+ irq_domain_disassociate(domain, virq);
+ irq_free_desc(virq);
+ }
}
EXPORT_SYMBOL_GPL(irq_dispose_mapping);
/**
- * irq_find_mapping() - Find a linux irq from an hw irq number.
+ * __irq_resolve_mapping() - Find a linux irq from a hw irq number.
* @domain: domain owning this hardware interrupt
* @hwirq: hardware irq number in that domain space
+ * @irq: optional pointer to return the Linux irq if required
+ *
+ * Returns the interrupt descriptor.
*/
-unsigned int irq_find_mapping(struct irq_domain *domain,
- irq_hw_number_t hwirq)
+struct irq_desc *__irq_resolve_mapping(struct irq_domain *domain,
+ irq_hw_number_t hwirq,
+ unsigned int *irq)
{
+ struct irq_desc *desc = NULL;
struct irq_data *data;
- /* Look for default domain if nececssary */
+ /* Look for default domain if necessary */
if (domain == NULL)
domain = irq_default_domain;
if (domain == NULL)
- return 0;
+ return desc;
+
+ if (irq_domain_is_nomap(domain)) {
+ if (hwirq < domain->hwirq_max) {
+ data = irq_domain_get_irq_data(domain, hwirq);
+ if (data && data->hwirq == hwirq)
+ desc = irq_data_to_desc(data);
+ if (irq && desc)
+ *irq = hwirq;
+ }
- if (hwirq < domain->revmap_direct_max_irq) {
- data = irq_domain_get_irq_data(domain, hwirq);
- if (data && data->hwirq == hwirq)
- return hwirq;
+ return desc;
}
+ rcu_read_lock();
/* Check if the hwirq is in the linear revmap. */
if (hwirq < domain->revmap_size)
- return domain->linear_revmap[hwirq];
-
- rcu_read_lock();
- data = radix_tree_lookup(&domain->revmap_tree, hwirq);
- rcu_read_unlock();
- return data ? data->irq : 0;
-}
-EXPORT_SYMBOL_GPL(irq_find_mapping);
-
-#ifdef CONFIG_IRQ_DOMAIN_DEBUG
-static int virq_debug_show(struct seq_file *m, void *private)
-{
- unsigned long flags;
- struct irq_desc *desc;
- struct irq_domain *domain;
- struct radix_tree_iter iter;
- void *data, **slot;
- int i;
-
- seq_printf(m, " %-16s %-6s %-10s %-10s %s\n",
- "name", "mapped", "linear-max", "direct-max", "devtree-node");
- mutex_lock(&irq_domain_mutex);
- list_for_each_entry(domain, &irq_domain_list, link) {
- int count = 0;
- radix_tree_for_each_slot(slot, &domain->revmap_tree, &iter, 0)
- count++;
- seq_printf(m, "%c%-16s %6u %10u %10u %s\n",
- domain == irq_default_domain ? '*' : ' ', domain->name,
- domain->revmap_size + count, domain->revmap_size,
- domain->revmap_direct_max_irq,
- domain->of_node ? of_node_full_name(domain->of_node) : "");
- }
- mutex_unlock(&irq_domain_mutex);
-
- seq_printf(m, "%-5s %-7s %-15s %-*s %6s %-14s %s\n", "irq", "hwirq",
- "chip name", (int)(2 * sizeof(void *) + 2), "chip data",
- "active", "type", "domain");
-
- for (i = 1; i < nr_irqs; i++) {
- desc = irq_to_desc(i);
- if (!desc)
- continue;
-
- raw_spin_lock_irqsave(&desc->lock, flags);
- domain = desc->irq_data.domain;
-
- if (domain) {
- struct irq_chip *chip;
- int hwirq = desc->irq_data.hwirq;
- bool direct;
-
- seq_printf(m, "%5d ", i);
- seq_printf(m, "0x%05x ", hwirq);
-
- chip = irq_desc_get_chip(desc);
- seq_printf(m, "%-15s ", (chip && chip->name) ? chip->name : "none");
-
- data = irq_desc_get_chip_data(desc);
- seq_printf(m, data ? "0x%p " : " %p ", data);
-
- seq_printf(m, " %c ", (desc->action && desc->action->handler) ? '*' : ' ');
- direct = (i == hwirq) && (i < domain->revmap_direct_max_irq);
- seq_printf(m, "%6s%-8s ",
- (hwirq < domain->revmap_size) ? "LINEAR" : "RADIX",
- direct ? "(DIRECT)" : "");
- seq_printf(m, "%s\n", desc->irq_data.domain->name);
- }
+ data = rcu_dereference(domain->revmap[hwirq]);
+ else
+ data = radix_tree_lookup(&domain->revmap_tree, hwirq);
- raw_spin_unlock_irqrestore(&desc->lock, flags);
+ if (likely(data)) {
+ desc = irq_data_to_desc(data);
+ if (irq)
+ *irq = data->irq;
}
- return 0;
-}
-
-static int virq_debug_open(struct inode *inode, struct file *file)
-{
- return single_open(file, virq_debug_show, inode->i_private);
-}
-
-static const struct file_operations virq_debug_fops = {
- .open = virq_debug_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
-static int __init irq_debugfs_init(void)
-{
- if (debugfs_create_file("irq_domain_mapping", S_IRUGO, NULL,
- NULL, &virq_debug_fops) == NULL)
- return -ENOMEM;
-
- return 0;
+ rcu_read_unlock();
+ return desc;
}
-__initcall(irq_debugfs_init);
-#endif /* CONFIG_IRQ_DOMAIN_DEBUG */
+EXPORT_SYMBOL_GPL(__irq_resolve_mapping);
/**
* irq_domain_xlate_onecell() - Generic xlate for direct one cell bindings
+ * @d: Interrupt domain involved in the translation
+ * @ctrlr: The device tree node for the device whose interrupt is translated
+ * @intspec: The interrupt specifier data from the device tree
+ * @intsize: The number of entries in @intspec
+ * @out_hwirq: Pointer to storage for the hardware interrupt number
+ * @out_type: Pointer to storage for the interrupt type
*
* Device Tree IRQ specifier translation function which works with one cell
* bindings where the cell value maps directly to the hwirq number.
@@ -676,6 +1106,12 @@ EXPORT_SYMBOL_GPL(irq_domain_xlate_onecell);
/**
* irq_domain_xlate_twocell() - Generic xlate for direct two cell bindings
+ * @d: Interrupt domain involved in the translation
+ * @ctrlr: The device tree node for the device whose interrupt is translated
+ * @intspec: The interrupt specifier data from the device tree
+ * @intsize: The number of entries in @intspec
+ * @out_hwirq: Pointer to storage for the hardware interrupt number
+ * @out_type: Pointer to storage for the interrupt type
*
* Device Tree IRQ specifier translation function which works with two cell
* bindings where the cell values map directly to the hwirq number
@@ -685,16 +1121,46 @@ int irq_domain_xlate_twocell(struct irq_domain *d, struct device_node *ctrlr,
const u32 *intspec, unsigned int intsize,
irq_hw_number_t *out_hwirq, unsigned int *out_type)
{
- if (WARN_ON(intsize < 2))
- return -EINVAL;
- *out_hwirq = intspec[0];
- *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK;
- return 0;
+ struct irq_fwspec fwspec;
+
+ of_phandle_args_to_fwspec(ctrlr, intspec, intsize, &fwspec);
+ return irq_domain_translate_twocell(d, &fwspec, out_hwirq, out_type);
}
EXPORT_SYMBOL_GPL(irq_domain_xlate_twocell);
/**
+ * irq_domain_xlate_twothreecell() - Generic xlate for direct two or three cell bindings
+ * @d: Interrupt domain involved in the translation
+ * @ctrlr: The device tree node for the device whose interrupt is translated
+ * @intspec: The interrupt specifier data from the device tree
+ * @intsize: The number of entries in @intspec
+ * @out_hwirq: Pointer to storage for the hardware interrupt number
+ * @out_type: Pointer to storage for the interrupt type
+ *
+ * Device Tree interrupt specifier translation function for two or three
+ * cell bindings, where the cell values map directly to the hardware
+ * interrupt number and the type specifier.
+ */
+int irq_domain_xlate_twothreecell(struct irq_domain *d, struct device_node *ctrlr,
+ const u32 *intspec, unsigned int intsize,
+ irq_hw_number_t *out_hwirq, unsigned int *out_type)
+{
+ struct irq_fwspec fwspec;
+
+ of_phandle_args_to_fwspec(ctrlr, intspec, intsize, &fwspec);
+
+ return irq_domain_translate_twothreecell(d, &fwspec, out_hwirq, out_type);
+}
+EXPORT_SYMBOL_GPL(irq_domain_xlate_twothreecell);
+
+/**
* irq_domain_xlate_onetwocell() - Generic xlate for one or two cell bindings
+ * @d: Interrupt domain involved in the translation
+ * @ctrlr: The device tree node for the device whose interrupt is translated
+ * @intspec: The interrupt specifier data from the device tree
+ * @intsize: The number of entries in @intspec
+ * @out_hwirq: Pointer to storage for the hardware interrupt number
+ * @out_type: Pointer to storage for the interrupt type
*
* Device Tree IRQ specifier translation function which works with either one
* or two cell bindings where the cell values map directly to the hwirq number
@@ -712,7 +1178,10 @@ int irq_domain_xlate_onetwocell(struct irq_domain *d,
if (WARN_ON(intsize < 1))
return -EINVAL;
*out_hwirq = intspec[0];
- *out_type = (intsize > 1) ? intspec[1] : IRQ_TYPE_NONE;
+ if (intsize > 1)
+ *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK;
+ else
+ *out_type = IRQ_TYPE_NONE;
return 0;
}
EXPORT_SYMBOL_GPL(irq_domain_xlate_onetwocell);
@@ -722,81 +1191,128 @@ const struct irq_domain_ops irq_domain_simple_ops = {
};
EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
-static int irq_domain_alloc_descs(int virq, unsigned int cnt,
- irq_hw_number_t hwirq, int node)
+/**
+ * irq_domain_translate_onecell() - Generic translate for direct one cell
+ * bindings
+ * @d: Interrupt domain involved in the translation
+ * @fwspec: The firmware interrupt specifier to translate
+ * @out_hwirq: Pointer to storage for the hardware interrupt number
+ * @out_type: Pointer to storage for the interrupt type
+ */
+int irq_domain_translate_onecell(struct irq_domain *d,
+ struct irq_fwspec *fwspec,
+ unsigned long *out_hwirq,
+ unsigned int *out_type)
+{
+ if (WARN_ON(fwspec->param_count < 1))
+ return -EINVAL;
+ *out_hwirq = fwspec->param[0];
+ *out_type = IRQ_TYPE_NONE;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(irq_domain_translate_onecell);
+
+/**
+ * irq_domain_translate_twocell() - Generic translate for direct two cell
+ * bindings
+ * @d: Interrupt domain involved in the translation
+ * @fwspec: The firmware interrupt specifier to translate
+ * @out_hwirq: Pointer to storage for the hardware interrupt number
+ * @out_type: Pointer to storage for the interrupt type
+ *
+ * Device Tree IRQ specifier translation function which works with two cell
+ * bindings where the cell values map directly to the hwirq number
+ * and linux irq flags.
+ */
+int irq_domain_translate_twocell(struct irq_domain *d,
+ struct irq_fwspec *fwspec,
+ unsigned long *out_hwirq,
+ unsigned int *out_type)
+{
+ if (WARN_ON(fwspec->param_count < 2))
+ return -EINVAL;
+ *out_hwirq = fwspec->param[0];
+ *out_type = fwspec->param[1] & IRQ_TYPE_SENSE_MASK;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(irq_domain_translate_twocell);
+
+/**
+ * irq_domain_translate_twothreecell() - Generic translate for direct two or three cell
+ * bindings
+ * @d: Interrupt domain involved in the translation
+ * @fwspec: The firmware interrupt specifier to translate
+ * @out_hwirq: Pointer to storage for the hardware interrupt number
+ * @out_type: Pointer to storage for the interrupt type
+ *
+ * Firmware interrupt specifier translation function for two or three cell
+ * specifications, where the parameter values map directly to the hardware
+ * interrupt number and the type specifier.
+ */
+int irq_domain_translate_twothreecell(struct irq_domain *d, struct irq_fwspec *fwspec,
+ unsigned long *out_hwirq, unsigned int *out_type)
+{
+ if (fwspec->param_count == 2) {
+ *out_hwirq = fwspec->param[0];
+ *out_type = fwspec->param[1] & IRQ_TYPE_SENSE_MASK;
+ return 0;
+ }
+
+ if (fwspec->param_count == 3) {
+ *out_hwirq = fwspec->param[1];
+ *out_type = fwspec->param[2] & IRQ_TYPE_SENSE_MASK;
+ return 0;
+ }
+
+ return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(irq_domain_translate_twothreecell);
+
+int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq,
+ int node, const struct irq_affinity_desc *affinity)
{
unsigned int hint;
if (virq >= 0) {
- virq = irq_alloc_descs(virq, virq, cnt, node);
+ virq = __irq_alloc_descs(virq, virq, cnt, node, THIS_MODULE,
+ affinity);
} else {
- hint = hwirq % nr_irqs;
+ hint = hwirq % irq_get_nr_irqs();
if (hint == 0)
hint++;
- virq = irq_alloc_descs_from(hint, cnt, node);
- if (virq <= 0 && hint > 1)
- virq = irq_alloc_descs_from(1, cnt, node);
+ virq = __irq_alloc_descs(-1, hint, cnt, node, THIS_MODULE,
+ affinity);
+ if (virq <= 0 && hint > 1) {
+ virq = __irq_alloc_descs(-1, 1, cnt, node, THIS_MODULE,
+ affinity);
+ }
}
return virq;
}
-#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
/**
- * irq_domain_add_hierarchy - Add a irqdomain into the hierarchy
- * @parent: Parent irq domain to associate with the new domain
- * @flags: Irq domain flags associated to the domain
- * @size: Size of the domain. See below
- * @node: Optional device-tree node of the interrupt controller
- * @ops: Pointer to the interrupt domain callbacks
- * @host_data: Controller private data pointer
- *
- * If @size is 0 a tree domain is created, otherwise a linear domain.
- *
- * If successful the parent is associated to the new domain and the
- * domain flags are set.
- * Returns pointer to IRQ domain, or NULL on failure.
+ * irq_domain_reset_irq_data - Clear hwirq, chip and chip_data in @irq_data
+ * @irq_data: The pointer to irq_data
*/
-struct irq_domain *irq_domain_add_hierarchy(struct irq_domain *parent,
- unsigned int flags,
- unsigned int size,
- struct device_node *node,
- const struct irq_domain_ops *ops,
- void *host_data)
+void irq_domain_reset_irq_data(struct irq_data *irq_data)
{
- struct irq_domain *domain;
-
- if (size)
- domain = irq_domain_add_linear(node, size, ops, host_data);
- else
- domain = irq_domain_add_tree(node, ops, host_data);
- if (domain) {
- domain->parent = parent;
- domain->flags |= flags;
- }
-
- return domain;
+ irq_data->hwirq = 0;
+ irq_data->chip = &no_irq_chip;
+ irq_data->chip_data = NULL;
}
+EXPORT_SYMBOL_GPL(irq_domain_reset_irq_data);
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
static void irq_domain_insert_irq(int virq)
{
struct irq_data *data;
for (data = irq_get_irq_data(virq); data; data = data->parent_data) {
struct irq_domain *domain = data->domain;
- irq_hw_number_t hwirq = data->hwirq;
-
- if (hwirq < domain->revmap_size) {
- domain->linear_revmap[hwirq] = virq;
- } else {
- mutex_lock(&revmap_trees_mutex);
- radix_tree_insert(&domain->revmap_tree, hwirq, data);
- mutex_unlock(&revmap_trees_mutex);
- }
- /* If not already assigned, give the domain the chip's name */
- if (!domain->name && data->chip)
- domain->name = data->chip->name;
+ domain->mapcount++;
+ irq_domain_set_mapping(domain, data->hwirq, data);
}
irq_clear_status_flags(virq, IRQ_NOREQUEST);
@@ -815,13 +1331,8 @@ static void irq_domain_remove_irq(int virq)
struct irq_domain *domain = data->domain;
irq_hw_number_t hwirq = data->hwirq;
- if (hwirq < domain->revmap_size) {
- domain->linear_revmap[hwirq] = 0;
- } else {
- mutex_lock(&revmap_trees_mutex);
- radix_tree_delete(&domain->revmap_tree, hwirq);
- mutex_unlock(&revmap_trees_mutex);
- }
+ domain->mapcount--;
+ irq_domain_clear_mapping(domain, hwirq);
}
}
@@ -830,17 +1341,29 @@ static struct irq_data *irq_domain_insert_irq_data(struct irq_domain *domain,
{
struct irq_data *irq_data;
- irq_data = kzalloc_node(sizeof(*irq_data), GFP_KERNEL, child->node);
+ irq_data = kzalloc_node(sizeof(*irq_data), GFP_KERNEL,
+ irq_data_get_node(child));
if (irq_data) {
child->parent_data = irq_data;
irq_data->irq = child->irq;
- irq_data->node = child->node;
+ irq_data->common = child->common;
irq_data->domain = domain;
}
return irq_data;
}
+static void __irq_domain_free_hierarchy(struct irq_data *irq_data)
+{
+ struct irq_data *tmp;
+
+ while (irq_data) {
+ tmp = irq_data;
+ irq_data = irq_data->parent_data;
+ kfree(tmp);
+ }
+}
+
static void irq_domain_free_irq_data(unsigned int virq, unsigned int nr_irqs)
{
struct irq_data *irq_data, *tmp;
@@ -852,12 +1375,84 @@ static void irq_domain_free_irq_data(unsigned int virq, unsigned int nr_irqs)
irq_data->parent_data = NULL;
irq_data->domain = NULL;
- while (tmp) {
- irq_data = tmp;
- tmp = tmp->parent_data;
- kfree(irq_data);
+ __irq_domain_free_hierarchy(tmp);
+ }
+}
+
+/**
+ * irq_domain_disconnect_hierarchy - Mark the first unused level of a hierarchy
+ * @domain: IRQ domain from which the hierarchy is to be disconnected
+ * @virq: IRQ number where the hierarchy is to be trimmed
+ *
+ * Marks the @virq level belonging to @domain as disconnected.
+ * Returns -EINVAL if @virq doesn't have a valid irq_data pointing
+ * to @domain.
+ *
+ * Its only use is to be able to trim levels of hierarchy that do not
+ * have any real meaning for this interrupt, and that the driver marks
+ * as such from its .alloc() callback.
+ */
+int irq_domain_disconnect_hierarchy(struct irq_domain *domain,
+ unsigned int virq)
+{
+ struct irq_data *irqd;
+
+ irqd = irq_domain_get_irq_data(domain, virq);
+ if (!irqd)
+ return -EINVAL;
+
+ irqd->chip = ERR_PTR(-ENOTCONN);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(irq_domain_disconnect_hierarchy);
+
+static int irq_domain_trim_hierarchy(unsigned int virq)
+{
+ struct irq_data *tail, *irqd, *irq_data;
+
+ irq_data = irq_get_irq_data(virq);
+ tail = NULL;
+
+ /* The first entry must have a valid irqchip */
+ if (IS_ERR_OR_NULL(irq_data->chip))
+ return -EINVAL;
+
+ /*
+ * Validate that the irq_data chain is sane in the presence of
+ * a hierarchy trimming marker.
+ */
+ for (irqd = irq_data->parent_data; irqd; irq_data = irqd, irqd = irqd->parent_data) {
+ /* Can't have a valid irqchip after a trim marker */
+ if (irqd->chip && tail)
+ return -EINVAL;
+
+ /* Can't have an empty irqchip before a trim marker */
+ if (!irqd->chip && !tail)
+ return -EINVAL;
+
+ if (IS_ERR(irqd->chip)) {
+ /* Only -ENOTCONN is a valid trim marker */
+ if (PTR_ERR(irqd->chip) != -ENOTCONN)
+ return -EINVAL;
+
+ tail = irq_data;
}
}
+
+ /* No trim marker, nothing to do */
+ if (!tail)
+ return 0;
+
+ pr_info("IRQ%d: trimming hierarchy from %s\n",
+ virq, tail->parent_data->domain->name);
+
+ /* Sever the inner part of the hierarchy... */
+ irqd = tail;
+ tail = tail->parent_data;
+ irqd->parent_data = NULL;
+ __irq_domain_free_hierarchy(tail);
+
+ return 0;
}
static int irq_domain_alloc_irq_data(struct irq_domain *domain,
@@ -901,6 +1496,7 @@ struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain,
return NULL;
}
+EXPORT_SYMBOL_GPL(irq_domain_get_irq_data);
/**
* irq_domain_set_hwirq_and_chip - Set hwirq and irqchip of @virq at @domain
@@ -911,7 +1507,8 @@ struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain,
* @chip_data: The associated chip data
*/
int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, unsigned int virq,
- irq_hw_number_t hwirq, struct irq_chip *chip,
+ irq_hw_number_t hwirq,
+ const struct irq_chip *chip,
void *chip_data)
{
struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq);
@@ -920,11 +1517,12 @@ int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, unsigned int virq,
return -ENOENT;
irq_data->hwirq = hwirq;
- irq_data->chip = chip ? chip : &no_irq_chip;
+ irq_data->chip = (struct irq_chip *)(chip ? chip : &no_irq_chip);
irq_data->chip_data = chip_data;
return 0;
}
+EXPORT_SYMBOL_GPL(irq_domain_set_hwirq_and_chip);
/**
* irq_domain_set_info - Set the complete data for a @virq in @domain
@@ -938,7 +1536,7 @@ int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, unsigned int virq,
* @handler_name: The interrupt handler name
*/
void irq_domain_set_info(struct irq_domain *domain, unsigned int virq,
- irq_hw_number_t hwirq, struct irq_chip *chip,
+ irq_hw_number_t hwirq, const struct irq_chip *chip,
void *chip_data, irq_flow_handler_t handler,
void *handler_data, const char *handler_name)
{
@@ -946,17 +1544,7 @@ void irq_domain_set_info(struct irq_domain *domain, unsigned int virq,
__irq_set_handler(virq, handler, 0, handler_name);
irq_set_handler_data(virq, handler_data);
}
-
-/**
- * irq_domain_reset_irq_data - Clear hwirq, chip and chip_data in @irq_data
- * @irq_data: The pointer to irq_data
- */
-void irq_domain_reset_irq_data(struct irq_data *irq_data)
-{
- irq_data->hwirq = 0;
- irq_data->chip = &no_irq_chip;
- irq_data->chip_data = NULL;
-}
+EXPORT_SYMBOL(irq_domain_set_info);
/**
* irq_domain_free_irqs_common - Clear irq_data and free the parent
@@ -977,6 +1565,7 @@ void irq_domain_free_irqs_common(struct irq_domain *domain, unsigned int virq,
}
irq_domain_free_irqs_parent(domain, virq, nr_irqs);
}
+EXPORT_SYMBOL_GPL(irq_domain_free_irqs_common);
/**
* irq_domain_free_irqs_top - Clear handler and handler data, clear irqdata and free parent
@@ -995,52 +1584,89 @@ void irq_domain_free_irqs_top(struct irq_domain *domain, unsigned int virq,
}
irq_domain_free_irqs_common(domain, virq, nr_irqs);
}
+EXPORT_SYMBOL_GPL(irq_domain_free_irqs_top);
-static bool irq_domain_is_auto_recursive(struct irq_domain *domain)
-{
- return domain->flags & IRQ_DOMAIN_FLAG_AUTO_RECURSIVE;
-}
-
-static void irq_domain_free_irqs_recursive(struct irq_domain *domain,
+static void irq_domain_free_irqs_hierarchy(struct irq_domain *domain,
unsigned int irq_base,
unsigned int nr_irqs)
{
- domain->ops->free(domain, irq_base, nr_irqs);
- if (irq_domain_is_auto_recursive(domain)) {
- BUG_ON(!domain->parent);
- irq_domain_free_irqs_recursive(domain->parent, irq_base,
- nr_irqs);
+ unsigned int i;
+
+ if (!domain->ops->free)
+ return;
+
+ for (i = 0; i < nr_irqs; i++) {
+ if (irq_domain_get_irq_data(domain, irq_base + i))
+ domain->ops->free(domain, irq_base + i, 1);
}
}
-static int irq_domain_alloc_irqs_recursive(struct irq_domain *domain,
- unsigned int irq_base,
+static int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain, unsigned int irq_base,
unsigned int nr_irqs, void *arg)
{
- int ret = 0;
- struct irq_domain *parent = domain->parent;
- bool recursive = irq_domain_is_auto_recursive(domain);
-
- BUG_ON(recursive && !parent);
- if (recursive)
- ret = irq_domain_alloc_irqs_recursive(parent, irq_base,
- nr_irqs, arg);
- if (ret >= 0)
- ret = domain->ops->alloc(domain, irq_base, nr_irqs, arg);
- if (ret < 0 && recursive)
- irq_domain_free_irqs_recursive(parent, irq_base, nr_irqs);
+ if (!domain->ops->alloc) {
+ pr_debug("domain->ops->alloc() is NULL\n");
+ return -ENOSYS;
+ }
+
+ return domain->ops->alloc(domain, irq_base, nr_irqs, arg);
+}
+
+static int irq_domain_alloc_irqs_locked(struct irq_domain *domain, int irq_base,
+ unsigned int nr_irqs, int node, void *arg,
+ bool realloc, const struct irq_affinity_desc *affinity)
+{
+ int i, ret, virq;
+
+ if (realloc && irq_base >= 0) {
+ virq = irq_base;
+ } else {
+ virq = irq_domain_alloc_descs(irq_base, nr_irqs, 0, node,
+ affinity);
+ if (virq < 0) {
+ pr_debug("cannot allocate IRQ(base %d, count %d)\n",
+ irq_base, nr_irqs);
+ return virq;
+ }
+ }
+
+ if (irq_domain_alloc_irq_data(domain, virq, nr_irqs)) {
+ pr_debug("cannot allocate memory for IRQ%d\n", virq);
+ ret = -ENOMEM;
+ goto out_free_desc;
+ }
+
+ ret = irq_domain_alloc_irqs_hierarchy(domain, virq, nr_irqs, arg);
+ if (ret < 0)
+ goto out_free_irq_data;
+
+ for (i = 0; i < nr_irqs; i++) {
+ ret = irq_domain_trim_hierarchy(virq + i);
+ if (ret)
+ goto out_free_irq_data;
+ }
+
+ for (i = 0; i < nr_irqs; i++)
+ irq_domain_insert_irq(virq + i);
+
+ return virq;
+out_free_irq_data:
+ irq_domain_free_irq_data(virq, nr_irqs);
+out_free_desc:
+ irq_free_descs(virq, nr_irqs);
return ret;
}
/**
* __irq_domain_alloc_irqs - Allocate IRQs from domain
* @domain: domain to allocate from
- * @irq_base: allocate specified IRQ nubmer if irq_base >= 0
+ * @irq_base: allocate specified IRQ number if irq_base >= 0
* @nr_irqs: number of IRQs to allocate
* @node: NUMA node id for memory allocation
* @arg: domain specific argument
* @realloc: IRQ descriptors have already been allocated if true
+ * @affinity: Optional irq affinity mask for multiqueue devices
*
* Allocate IRQ numbers and initialized all data structures to support
* hierarchy IRQ domains.
@@ -1050,15 +1676,15 @@ static int irq_domain_alloc_irqs_recursive(struct irq_domain *domain,
* The whole process to setup an IRQ has been split into two steps.
* The first step, __irq_domain_alloc_irqs(), is to allocate IRQ
* descriptor and required hardware resources. The second step,
- * irq_domain_activate_irq(), is to program hardwares with preallocated
+ * irq_domain_activate_irq(), is to program the hardware with preallocated
* resources. In this way, it's easier to rollback when failing to
* allocate resources.
*/
int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
unsigned int nr_irqs, int node, void *arg,
- bool realloc)
+ bool realloc, const struct irq_affinity_desc *affinity)
{
- int i, ret, virq;
+ int ret;
if (domain == NULL) {
domain = irq_default_domain;
@@ -1066,46 +1692,188 @@ int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
return -EINVAL;
}
- if (!domain->ops->alloc) {
- pr_debug("domain->ops->alloc() is NULL\n");
- return -ENOSYS;
- }
+ mutex_lock(&domain->root->mutex);
+ ret = irq_domain_alloc_irqs_locked(domain, irq_base, nr_irqs, node, arg,
+ realloc, affinity);
+ mutex_unlock(&domain->root->mutex);
- if (realloc && irq_base >= 0) {
- virq = irq_base;
+ return ret;
+}
+EXPORT_SYMBOL_GPL(__irq_domain_alloc_irqs);
+
+/* The irq_data was moved, fix the revmap to refer to the new location */
+static void irq_domain_fix_revmap(struct irq_data *d)
+{
+ void __rcu **slot;
+
+ lockdep_assert_held(&d->domain->root->mutex);
+
+ if (irq_domain_is_nomap(d->domain))
+ return;
+
+ /* Fix up the revmap. */
+ if (d->hwirq < d->domain->revmap_size) {
+ /* Not using radix tree */
+ rcu_assign_pointer(d->domain->revmap[d->hwirq], d);
} else {
- virq = irq_domain_alloc_descs(irq_base, nr_irqs, 0, node);
- if (virq < 0) {
- pr_debug("cannot allocate IRQ(base %d, count %d)\n",
- irq_base, nr_irqs);
- return virq;
- }
+ slot = radix_tree_lookup_slot(&d->domain->revmap_tree, d->hwirq);
+ if (slot)
+ radix_tree_replace_slot(&d->domain->revmap_tree, slot, d);
}
+}
- if (irq_domain_alloc_irq_data(domain, virq, nr_irqs)) {
- pr_debug("cannot allocate memory for IRQ%d\n", virq);
- ret = -ENOMEM;
- goto out_free_desc;
- }
+/**
+ * irq_domain_push_irq() - Push a domain in to the top of a hierarchy.
+ * @domain: Domain to push.
+ * @virq: Irq to push the domain in to.
+ * @arg: Passed to the irq_domain_ops alloc() function.
+ *
+ * For an already existing irqdomain hierarchy, as might be obtained
+ * via a call to pci_enable_msix(), add an additional domain to the
+ * head of the processing chain. Must be called before request_irq()
+ * has been called.
+ */
+int irq_domain_push_irq(struct irq_domain *domain, int virq, void *arg)
+{
+ struct irq_data *irq_data = irq_get_irq_data(virq);
+ struct irq_data *parent_irq_data;
+ struct irq_desc *desc;
+ int rv = 0;
- mutex_lock(&irq_domain_mutex);
- ret = irq_domain_alloc_irqs_recursive(domain, virq, nr_irqs, arg);
- if (ret < 0) {
- mutex_unlock(&irq_domain_mutex);
- goto out_free_irq_data;
+ /*
+ * Check that no action has been set, which indicates the virq
+ * is in a state where this function doesn't have to deal with
+ * races between interrupt handling and maintaining the
+ * hierarchy. This will catch gross misuse. Attempting to
+ * make the check race free would require holding locks across
+ * calls to struct irq_domain_ops->alloc(), which could lead
+ * to deadlock, so we just do a simple check before starting.
+ */
+ desc = irq_to_desc(virq);
+ if (!desc)
+ return -EINVAL;
+ if (WARN_ON(desc->action))
+ return -EBUSY;
+
+ if (domain == NULL)
+ return -EINVAL;
+
+ if (WARN_ON(!irq_domain_is_hierarchy(domain)))
+ return -EINVAL;
+
+ if (!irq_data)
+ return -EINVAL;
+
+ if (domain->parent != irq_data->domain)
+ return -EINVAL;
+
+ parent_irq_data = kzalloc_node(sizeof(*parent_irq_data), GFP_KERNEL,
+ irq_data_get_node(irq_data));
+ if (!parent_irq_data)
+ return -ENOMEM;
+
+ mutex_lock(&domain->root->mutex);
+
+ /* Copy the original irq_data. */
+ *parent_irq_data = *irq_data;
+
+ /*
+ * Overwrite the irq_data, which is embedded in struct irq_desc, with
+ * values for this domain.
+ */
+ irq_data->parent_data = parent_irq_data;
+ irq_data->domain = domain;
+ irq_data->mask = 0;
+ irq_data->hwirq = 0;
+ irq_data->chip = NULL;
+ irq_data->chip_data = NULL;
+
+ /* May (probably does) set hwirq, chip, etc. */
+ rv = irq_domain_alloc_irqs_hierarchy(domain, virq, 1, arg);
+ if (rv) {
+ /* Restore the original irq_data. */
+ *irq_data = *parent_irq_data;
+ kfree(parent_irq_data);
+ goto error;
}
- for (i = 0; i < nr_irqs; i++)
- irq_domain_insert_irq(virq + i);
- mutex_unlock(&irq_domain_mutex);
- return virq;
+ irq_domain_fix_revmap(parent_irq_data);
+ irq_domain_set_mapping(domain, irq_data->hwirq, irq_data);
+error:
+ mutex_unlock(&domain->root->mutex);
-out_free_irq_data:
- irq_domain_free_irq_data(virq, nr_irqs);
-out_free_desc:
- irq_free_descs(virq, nr_irqs);
- return ret;
+ return rv;
+}
+EXPORT_SYMBOL_GPL(irq_domain_push_irq);
+
+/**
+ * irq_domain_pop_irq() - Remove a domain from the top of a hierarchy.
+ * @domain: Domain to remove.
+ * @virq: Irq to remove the domain from.
+ *
+ * Undo the effects of a call to irq_domain_push_irq(). Must be
+ * called either before request_irq() or after free_irq().
+ */
+int irq_domain_pop_irq(struct irq_domain *domain, int virq)
+{
+ struct irq_data *irq_data = irq_get_irq_data(virq);
+ struct irq_data *parent_irq_data;
+ struct irq_data *tmp_irq_data;
+ struct irq_desc *desc;
+
+ /*
+ * Check that no action is set, which indicates the virq is in
+ * a state where this function doesn't have to deal with races
+ * between interrupt handling and maintaining the hierarchy.
+ * This will catch gross misuse. Attempting to make the check
+ * race free would require holding locks across calls to
+ * struct irq_domain_ops->free(), which could lead to
+ * deadlock, so we just do a simple check before starting.
+ */
+ desc = irq_to_desc(virq);
+ if (!desc)
+ return -EINVAL;
+ if (WARN_ON(desc->action))
+ return -EBUSY;
+
+ if (domain == NULL)
+ return -EINVAL;
+
+ if (!irq_data)
+ return -EINVAL;
+
+ tmp_irq_data = irq_domain_get_irq_data(domain, virq);
+
+ /* We can only "pop" if this domain is at the top of the list */
+ if (WARN_ON(irq_data != tmp_irq_data))
+ return -EINVAL;
+
+ if (WARN_ON(irq_data->domain != domain))
+ return -EINVAL;
+
+ parent_irq_data = irq_data->parent_data;
+ if (WARN_ON(!parent_irq_data))
+ return -EINVAL;
+
+ mutex_lock(&domain->root->mutex);
+
+ irq_data->parent_data = NULL;
+
+ irq_domain_clear_mapping(domain, irq_data->hwirq);
+ irq_domain_free_irqs_hierarchy(domain, virq, 1);
+
+ /* Restore the original irq_data. */
+ *irq_data = *parent_irq_data;
+
+ irq_domain_fix_revmap(irq_data);
+
+ mutex_unlock(&domain->root->mutex);
+
+ kfree(parent_irq_data);
+
+ return 0;
}
+EXPORT_SYMBOL_GPL(irq_domain_pop_irq);
/**
* irq_domain_free_irqs - Free IRQ number and associated data structures
@@ -1115,81 +1883,118 @@ out_free_desc:
void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs)
{
struct irq_data *data = irq_get_irq_data(virq);
+ struct irq_domain *domain;
int i;
if (WARN(!data || !data->domain || !data->domain->ops->free,
"NULL pointer, cannot free irq\n"))
return;
- mutex_lock(&irq_domain_mutex);
+ domain = data->domain;
+
+ mutex_lock(&domain->root->mutex);
for (i = 0; i < nr_irqs; i++)
irq_domain_remove_irq(virq + i);
- irq_domain_free_irqs_recursive(data->domain, virq, nr_irqs);
- mutex_unlock(&irq_domain_mutex);
+ irq_domain_free_irqs_hierarchy(domain, virq, nr_irqs);
+ mutex_unlock(&domain->root->mutex);
irq_domain_free_irq_data(virq, nr_irqs);
irq_free_descs(virq, nr_irqs);
}
+static void irq_domain_free_one_irq(struct irq_domain *domain, unsigned int virq)
+{
+ if (irq_domain_is_msi_device(domain))
+ msi_device_domain_free_wired(domain, virq);
+ else
+ irq_domain_free_irqs(virq, 1);
+}
+
/**
* irq_domain_alloc_irqs_parent - Allocate interrupts from parent domain
+ * @domain: Domain below which interrupts must be allocated
* @irq_base: Base IRQ number
* @nr_irqs: Number of IRQs to allocate
* @arg: Allocation data (arch/domain specific)
- *
- * Check whether the domain has been setup recursive. If not allocate
- * through the parent domain.
*/
int irq_domain_alloc_irqs_parent(struct irq_domain *domain,
unsigned int irq_base, unsigned int nr_irqs,
void *arg)
{
- /* irq_domain_alloc_irqs_recursive() has called parent's alloc() */
- if (irq_domain_is_auto_recursive(domain))
- return 0;
+ if (!domain->parent)
+ return -ENOSYS;
- domain = domain->parent;
- if (domain)
- return irq_domain_alloc_irqs_recursive(domain, irq_base,
- nr_irqs, arg);
- return -ENOSYS;
+ return irq_domain_alloc_irqs_hierarchy(domain->parent, irq_base,
+ nr_irqs, arg);
}
+EXPORT_SYMBOL_GPL(irq_domain_alloc_irqs_parent);
/**
* irq_domain_free_irqs_parent - Free interrupts from parent domain
+ * @domain: Domain below which interrupts must be freed
* @irq_base: Base IRQ number
* @nr_irqs: Number of IRQs to free
- *
- * Check whether the domain has been setup recursive. If not free
- * through the parent domain.
*/
void irq_domain_free_irqs_parent(struct irq_domain *domain,
unsigned int irq_base, unsigned int nr_irqs)
{
- /* irq_domain_free_irqs_recursive() will call parent's free */
- if (!irq_domain_is_auto_recursive(domain) && domain->parent)
- irq_domain_free_irqs_recursive(domain->parent, irq_base,
- nr_irqs);
+ if (!domain->parent)
+ return;
+
+ irq_domain_free_irqs_hierarchy(domain->parent, irq_base, nr_irqs);
+}
+EXPORT_SYMBOL_GPL(irq_domain_free_irqs_parent);
+
+static void __irq_domain_deactivate_irq(struct irq_data *irq_data)
+{
+ if (irq_data && irq_data->domain) {
+ struct irq_domain *domain = irq_data->domain;
+
+ if (domain->ops->deactivate)
+ domain->ops->deactivate(domain, irq_data);
+ if (irq_data->parent_data)
+ __irq_domain_deactivate_irq(irq_data->parent_data);
+ }
+}
+
+static int __irq_domain_activate_irq(struct irq_data *irqd, bool reserve)
+{
+ int ret = 0;
+
+ if (irqd && irqd->domain) {
+ struct irq_domain *domain = irqd->domain;
+
+ if (irqd->parent_data)
+ ret = __irq_domain_activate_irq(irqd->parent_data,
+ reserve);
+ if (!ret && domain->ops->activate) {
+ ret = domain->ops->activate(domain, irqd, reserve);
+ /* Rollback in case of error */
+ if (ret && irqd->parent_data)
+ __irq_domain_deactivate_irq(irqd->parent_data);
+ }
+ }
+ return ret;
}
/**
* irq_domain_activate_irq - Call domain_ops->activate recursively to activate
* interrupt
- * @irq_data: outermost irq_data associated with interrupt
+ * @irq_data: Outermost irq_data associated with interrupt
+ * @reserve: If set only reserve an interrupt vector instead of assigning one
*
* This is the second step to call domain_ops->activate to program interrupt
* controllers, so the interrupt could actually get delivered.
*/
-void irq_domain_activate_irq(struct irq_data *irq_data)
+int irq_domain_activate_irq(struct irq_data *irq_data, bool reserve)
{
- if (irq_data && irq_data->domain) {
- struct irq_domain *domain = irq_data->domain;
+ int ret = 0;
- if (irq_data->parent_data)
- irq_domain_activate_irq(irq_data->parent_data);
- if (domain->ops->activate)
- domain->ops->activate(domain, irq_data);
- }
+ if (!irqd_is_activated(irq_data))
+ ret = __irq_domain_activate_irq(irq_data, reserve);
+ if (!ret)
+ irqd_set_activated(irq_data);
+ return ret;
}
/**
@@ -1202,13 +2007,9 @@ void irq_domain_activate_irq(struct irq_data *irq_data)
*/
void irq_domain_deactivate_irq(struct irq_data *irq_data)
{
- if (irq_data && irq_data->domain) {
- struct irq_domain *domain = irq_data->domain;
-
- if (domain->ops->deactivate)
- domain->ops->deactivate(domain, irq_data);
- if (irq_data->parent_data)
- irq_domain_deactivate_irq(irq_data->parent_data);
+ if (irqd_is_activated(irq_data)) {
+ __irq_domain_deactivate_irq(irq_data);
+ irqd_clr_activated(irq_data);
}
}
@@ -1219,7 +2020,7 @@ static void irq_domain_check_hierarchy(struct irq_domain *domain)
domain->flags |= IRQ_DOMAIN_FLAG_HIERARCHY;
}
#else /* CONFIG_IRQ_DOMAIN_HIERARCHY */
-/**
+/*
* irq_domain_get_irq_data - Get irq_data associated with @virq and @domain
* @domain: domain to match
* @virq: IRQ number to get irq_data
@@ -1231,8 +2032,115 @@ struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain,
return (irq_data && irq_data->domain == domain) ? irq_data : NULL;
}
+EXPORT_SYMBOL_GPL(irq_domain_get_irq_data);
-static void irq_domain_check_hierarchy(struct irq_domain *domain)
+/*
+ * irq_domain_set_info - Set the complete data for a @virq in @domain
+ * @domain: Interrupt domain to match
+ * @virq: IRQ number
+ * @hwirq: The hardware interrupt number
+ * @chip: The associated interrupt chip
+ * @chip_data: The associated interrupt chip data
+ * @handler: The interrupt flow handler
+ * @handler_data: The interrupt flow handler data
+ * @handler_name: The interrupt handler name
+ */
+void irq_domain_set_info(struct irq_domain *domain, unsigned int virq,
+ irq_hw_number_t hwirq, const struct irq_chip *chip,
+ void *chip_data, irq_flow_handler_t handler,
+ void *handler_data, const char *handler_name)
{
+ irq_set_chip_and_handler_name(virq, chip, handler, handler_name);
+ irq_set_chip_data(virq, chip_data);
+ irq_set_handler_data(virq, handler_data);
+}
+
+static int irq_domain_alloc_irqs_locked(struct irq_domain *domain, int irq_base,
+ unsigned int nr_irqs, int node, void *arg,
+ bool realloc, const struct irq_affinity_desc *affinity)
+{
+ return -EINVAL;
}
+
+static void irq_domain_check_hierarchy(struct irq_domain *domain) { }
+static void irq_domain_free_one_irq(struct irq_domain *domain, unsigned int virq) { }
+
#endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */
+
+#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
+#include "internals.h"
+
+static struct dentry *domain_dir;
+
+static const struct irq_bit_descr irqdomain_flags[] = {
+ BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_HIERARCHY),
+ BIT_MASK_DESCR(IRQ_DOMAIN_NAME_ALLOCATED),
+ BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_IPI_PER_CPU),
+ BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_IPI_SINGLE),
+ BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_MSI),
+ BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_ISOLATED_MSI),
+ BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_NO_MAP),
+ BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_MSI_PARENT),
+ BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_MSI_DEVICE),
+ BIT_MASK_DESCR(IRQ_DOMAIN_FLAG_NONCORE),
+};
+
+static void irq_domain_debug_show_one(struct seq_file *m, struct irq_domain *d, int ind)
+{
+ seq_printf(m, "%*sname: %s\n", ind, "", d->name);
+ seq_printf(m, "%*ssize: %u\n", ind + 1, "", d->revmap_size);
+ seq_printf(m, "%*smapped: %u\n", ind + 1, "", d->mapcount);
+ seq_printf(m, "%*sflags: 0x%08x\n", ind +1 , "", d->flags);
+ irq_debug_show_bits(m, ind, d->flags, irqdomain_flags, ARRAY_SIZE(irqdomain_flags));
+ if (d->ops && d->ops->debug_show)
+ d->ops->debug_show(m, d, NULL, ind + 1);
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+ if (!d->parent)
+ return;
+ seq_printf(m, "%*sparent: %s\n", ind + 1, "", d->parent->name);
+ irq_domain_debug_show_one(m, d->parent, ind + 4);
+#endif
+}
+
+static int irq_domain_debug_show(struct seq_file *m, void *p)
+{
+ struct irq_domain *d = m->private;
+
+ /* Default domain? Might be NULL */
+ if (!d) {
+ if (!irq_default_domain)
+ return 0;
+ d = irq_default_domain;
+ }
+ irq_domain_debug_show_one(m, d, 0);
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(irq_domain_debug);
+
+static void debugfs_add_domain_dir(struct irq_domain *d)
+{
+ if (!d->name || !domain_dir)
+ return;
+ debugfs_create_file(d->name, 0444, domain_dir, d,
+ &irq_domain_debug_fops);
+}
+
+static void debugfs_remove_domain_dir(struct irq_domain *d)
+{
+ debugfs_lookup_and_remove(d->name, domain_dir);
+}
+
+void __init irq_domain_debugfs_init(struct dentry *root)
+{
+ struct irq_domain *d;
+
+ domain_dir = debugfs_create_dir("domains", root);
+
+ debugfs_create_file("default", 0444, domain_dir, NULL,
+ &irq_domain_debug_fops);
+ mutex_lock(&irq_domain_mutex);
+ list_for_each_entry(d, &irq_domain_list, link)
+ debugfs_add_domain_dir(d);
+ mutex_unlock(&irq_domain_mutex);
+}
+#endif
diff --git a/kernel/irq/kexec.c b/kernel/irq/kexec.c
new file mode 100644
index 000000000000..1a3deffe6b5b
--- /dev/null
+++ b/kernel/irq/kexec.c
@@ -0,0 +1,36 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/irqdesc.h>
+#include <linux/irqnr.h>
+
+#include "internals.h"
+
+void machine_kexec_mask_interrupts(void)
+{
+ struct irq_desc *desc;
+ unsigned int i;
+
+ for_each_irq_desc(i, desc) {
+ struct irq_chip *chip;
+ int check_eoi = 1;
+
+ chip = irq_desc_get_chip(desc);
+ if (!chip || !irqd_is_started(&desc->irq_data))
+ continue;
+
+ if (IS_ENABLED(CONFIG_GENERIC_IRQ_KEXEC_CLEAR_VM_FORWARD)) {
+ /*
+ * First try to remove the active state from an interrupt which is forwarded
+ * to a VM. If the interrupt is not forwarded, try to EOI the interrupt.
+ */
+ check_eoi = irq_set_irqchip_state(i, IRQCHIP_STATE_ACTIVE, false);
+ }
+
+ if (check_eoi && chip->irq_eoi && irqd_irq_inprogress(&desc->irq_data))
+ chip->irq_eoi(&desc->irq_data);
+
+ irq_shutdown(desc);
+ }
+}
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index e68932bb308e..0bb29316b436 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1,6 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0
/*
- * linux/kernel/irq/manage.c
- *
* Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
* Copyright (C) 2005-2006 Thomas Gleixner
*
@@ -14,31 +13,36 @@
#include <linux/module.h>
#include <linux/random.h>
#include <linux/interrupt.h>
+#include <linux/irqdomain.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sched/rt.h>
+#include <linux/sched/task.h>
+#include <linux/sched/isolation.h>
+#include <uapi/linux/sched/types.h>
#include <linux/task_work.h>
#include "internals.h"
-#ifdef CONFIG_IRQ_FORCED_THREADING
-__read_mostly bool force_irqthreads;
+#if defined(CONFIG_IRQ_FORCED_THREADING) && !defined(CONFIG_PREEMPT_RT)
+DEFINE_STATIC_KEY_FALSE(force_irqthreads_key);
static int __init setup_forced_irqthreads(char *arg)
{
- force_irqthreads = true;
+ static_branch_enable(&force_irqthreads_key);
return 0;
}
early_param("threadirqs", setup_forced_irqthreads);
#endif
-static void __synchronize_hardirq(struct irq_desc *desc)
+static int __irq_get_irqchip_state(struct irq_data *d, enum irqchip_irq_state which, bool *state);
+
+static void __synchronize_hardirq(struct irq_desc *desc, bool sync_chip)
{
+ struct irq_data *irqd = irq_desc_get_irq_data(desc);
bool inprogress;
do {
- unsigned long flags;
-
/*
* Wait until we're out of the critical section. This might
* give the wrong answer due to the lack of memory barriers.
@@ -47,37 +51,53 @@ static void __synchronize_hardirq(struct irq_desc *desc)
cpu_relax();
/* Ok, that indicated we're done: double-check carefully. */
- raw_spin_lock_irqsave(&desc->lock, flags);
+ guard(raw_spinlock_irqsave)(&desc->lock);
inprogress = irqd_irq_inprogress(&desc->irq_data);
- raw_spin_unlock_irqrestore(&desc->lock, flags);
+ /*
+ * If requested and supported, check at the chip whether it
+ * is in flight at the hardware level, i.e. already pending
+ * in a CPU and waiting for service and acknowledge.
+ */
+ if (!inprogress && sync_chip) {
+ /*
+ * Ignore the return code. inprogress is only updated
+ * when the chip supports it.
+ */
+ __irq_get_irqchip_state(irqd, IRQCHIP_STATE_ACTIVE,
+ &inprogress);
+ }
/* Oops, that failed? */
} while (inprogress);
}
/**
- * synchronize_hardirq - wait for pending hard IRQ handlers (on other CPUs)
- * @irq: interrupt number to wait for
+ * synchronize_hardirq - wait for pending hard IRQ handlers (on other CPUs)
+ * @irq: interrupt number to wait for
*
- * This function waits for any pending hard IRQ handlers for this
- * interrupt to complete before returning. If you use this
- * function while holding a resource the IRQ handler may need you
- * will deadlock. It does not take associated threaded handlers
- * into account.
+ * This function waits for any pending hard IRQ handlers for this interrupt
+ * to complete before returning. If you use this function while holding a
+ * resource the IRQ handler may need you will deadlock. It does not take
+ * associated threaded handlers into account.
*
- * Do not use this for shutdown scenarios where you must be sure
- * that all parts (hardirq and threaded handler) have completed.
+ * Do not use this for shutdown scenarios where you must be sure that all
+ * parts (hardirq and threaded handler) have completed.
*
- * Returns: false if a threaded handler is active.
+ * Returns: false if a threaded handler is active.
*
- * This function may be called - with care - from IRQ context.
+ * This function may be called - with care - from IRQ context.
+ *
+ * It does not check whether there is an interrupt in flight at the
+ * hardware level, but not serviced yet, as this might deadlock when called
+ * with interrupts disabled and the target CPU of the interrupt is the
+ * current CPU.
*/
bool synchronize_hardirq(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
if (desc) {
- __synchronize_hardirq(desc);
+ __synchronize_hardirq(desc, false);
return !atomic_read(&desc->threads_active);
}
@@ -85,113 +105,184 @@ bool synchronize_hardirq(unsigned int irq)
}
EXPORT_SYMBOL(synchronize_hardirq);
+static void __synchronize_irq(struct irq_desc *desc)
+{
+ __synchronize_hardirq(desc, true);
+ /*
+ * We made sure that no hardirq handler is running. Now verify that no
+ * threaded handlers are active.
+ */
+ wait_event(desc->wait_for_threads, !atomic_read(&desc->threads_active));
+}
+
/**
- * synchronize_irq - wait for pending IRQ handlers (on other CPUs)
- * @irq: interrupt number to wait for
+ * synchronize_irq - wait for pending IRQ handlers (on other CPUs)
+ * @irq: interrupt number to wait for
+ *
+ * This function waits for any pending IRQ handlers for this interrupt to
+ * complete before returning. If you use this function while holding a
+ * resource the IRQ handler may need you will deadlock.
*
- * This function waits for any pending IRQ handlers for this interrupt
- * to complete before returning. If you use this function while
- * holding a resource the IRQ handler may need you will deadlock.
+ * Can only be called from preemptible code as it might sleep when
+ * an interrupt thread is associated to @irq.
*
- * This function may be called - with care - from IRQ context.
+ * It optionally makes sure (when the irq chip supports that method)
+ * that the interrupt is not pending in any CPU and waiting for
+ * service.
*/
void synchronize_irq(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
- if (desc) {
- __synchronize_hardirq(desc);
- /*
- * We made sure that no hardirq handler is
- * running. Now verify that no threaded handlers are
- * active.
- */
- wait_event(desc->wait_for_threads,
- !atomic_read(&desc->threads_active));
- }
+ if (desc)
+ __synchronize_irq(desc);
}
EXPORT_SYMBOL(synchronize_irq);
#ifdef CONFIG_SMP
cpumask_var_t irq_default_affinity;
+static bool __irq_can_set_affinity(struct irq_desc *desc)
+{
+ if (!desc || !irqd_can_balance(&desc->irq_data) ||
+ !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity)
+ return false;
+ return true;
+}
+
/**
- * irq_can_set_affinity - Check if the affinity of a given irq can be set
- * @irq: Interrupt to check
+ * irq_can_set_affinity - Check if the affinity of a given irq can be set
+ * @irq: Interrupt to check
*
*/
int irq_can_set_affinity(unsigned int irq)
{
- struct irq_desc *desc = irq_to_desc(irq);
+ return __irq_can_set_affinity(irq_to_desc(irq));
+}
- if (!desc || !irqd_can_balance(&desc->irq_data) ||
- !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity)
- return 0;
+/**
+ * irq_can_set_affinity_usr - Check if affinity of a irq can be set from user space
+ * @irq: Interrupt to check
+ *
+ * Like irq_can_set_affinity() above, but additionally checks for the
+ * AFFINITY_MANAGED flag.
+ */
+bool irq_can_set_affinity_usr(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
- return 1;
+ return __irq_can_set_affinity(desc) &&
+ !irqd_affinity_is_managed(&desc->irq_data);
}
/**
- * irq_set_thread_affinity - Notify irq threads to adjust affinity
- * @desc: irq descriptor which has affitnity changed
+ * irq_set_thread_affinity - Notify irq threads to adjust affinity
+ * @desc: irq descriptor which has affinity changed
*
- * We just set IRQTF_AFFINITY and delegate the affinity setting
- * to the interrupt thread itself. We can not call
- * set_cpus_allowed_ptr() here as we hold desc->lock and this
- * code can be called from hard interrupt context.
+ * Just set IRQTF_AFFINITY and delegate the affinity setting to the
+ * interrupt thread itself. We can not call set_cpus_allowed_ptr() here as
+ * we hold desc->lock and this code can be called from hard interrupt
+ * context.
*/
-void irq_set_thread_affinity(struct irq_desc *desc)
+static void irq_set_thread_affinity(struct irq_desc *desc)
{
- struct irqaction *action = desc->action;
+ struct irqaction *action;
- while (action) {
- if (action->thread)
+ for_each_action_of_desc(desc, action) {
+ if (action->thread) {
set_bit(IRQTF_AFFINITY, &action->thread_flags);
- action = action->next;
+ wake_up_process(action->thread);
+ }
+ if (action->secondary && action->secondary->thread) {
+ set_bit(IRQTF_AFFINITY, &action->secondary->thread_flags);
+ wake_up_process(action->secondary->thread);
+ }
}
}
-#ifdef CONFIG_GENERIC_PENDING_IRQ
-static inline bool irq_can_move_pcntxt(struct irq_data *data)
+#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
+static void irq_validate_effective_affinity(struct irq_data *data)
{
- return irqd_can_move_in_process_context(data);
-}
-static inline bool irq_move_pending(struct irq_data *data)
-{
- return irqd_is_setaffinity_pending(data);
-}
-static inline void
-irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask)
-{
- cpumask_copy(desc->pending_mask, mask);
-}
-static inline void
-irq_get_pending(struct cpumask *mask, struct irq_desc *desc)
-{
- cpumask_copy(mask, desc->pending_mask);
+ const struct cpumask *m = irq_data_get_effective_affinity_mask(data);
+ struct irq_chip *chip = irq_data_get_irq_chip(data);
+
+ if (!cpumask_empty(m))
+ return;
+ pr_warn_once("irq_chip %s did not update eff. affinity mask of irq %u\n",
+ chip->name, data->irq);
}
#else
-static inline bool irq_can_move_pcntxt(struct irq_data *data) { return true; }
-static inline bool irq_move_pending(struct irq_data *data) { return false; }
-static inline void
-irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) { }
-static inline void
-irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { }
+static inline void irq_validate_effective_affinity(struct irq_data *data) { }
#endif
+static DEFINE_PER_CPU(struct cpumask, __tmp_mask);
+
int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
bool force)
{
+ struct cpumask *tmp_mask = this_cpu_ptr(&__tmp_mask);
struct irq_desc *desc = irq_data_to_desc(data);
struct irq_chip *chip = irq_data_get_irq_chip(data);
+ const struct cpumask *prog_mask;
int ret;
- ret = chip->irq_set_affinity(data, mask, force);
+ if (!chip || !chip->irq_set_affinity)
+ return -EINVAL;
+
+ /*
+ * If this is a managed interrupt and housekeeping is enabled on
+ * it check whether the requested affinity mask intersects with
+ * a housekeeping CPU. If so, then remove the isolated CPUs from
+ * the mask and just keep the housekeeping CPU(s). This prevents
+ * the affinity setter from routing the interrupt to an isolated
+ * CPU to avoid that I/O submitted from a housekeeping CPU causes
+ * interrupts on an isolated one.
+ *
+ * If the masks do not intersect or include online CPU(s) then
+ * keep the requested mask. The isolated target CPUs are only
+ * receiving interrupts when the I/O operation was submitted
+ * directly from them.
+ *
+ * If all housekeeping CPUs in the affinity mask are offline, the
+ * interrupt will be migrated by the CPU hotplug code once a
+ * housekeeping CPU which belongs to the affinity mask comes
+ * online.
+ */
+ if (irqd_affinity_is_managed(data) &&
+ housekeeping_enabled(HK_TYPE_MANAGED_IRQ)) {
+ const struct cpumask *hk_mask;
+
+ hk_mask = housekeeping_cpumask(HK_TYPE_MANAGED_IRQ);
+
+ cpumask_and(tmp_mask, mask, hk_mask);
+ if (!cpumask_intersects(tmp_mask, cpu_online_mask))
+ prog_mask = mask;
+ else
+ prog_mask = tmp_mask;
+ } else {
+ prog_mask = mask;
+ }
+
+ /*
+ * Make sure we only provide online CPUs to the irqchip,
+ * unless we are being asked to force the affinity (in which
+ * case we do as we are told).
+ */
+ cpumask_and(tmp_mask, prog_mask, cpu_online_mask);
+ if (!force && !cpumask_empty(tmp_mask))
+ ret = chip->irq_set_affinity(data, tmp_mask, force);
+ else if (force)
+ ret = chip->irq_set_affinity(data, mask, force);
+ else
+ ret = -EINVAL;
+
switch (ret) {
case IRQ_SET_MASK_OK:
case IRQ_SET_MASK_OK_DONE:
- cpumask_copy(data->affinity, mask);
+ cpumask_copy(desc->irq_common_data.affinity, mask);
+ fallthrough;
case IRQ_SET_MASK_OK_NOCOPY:
+ irq_validate_effective_affinity(data);
irq_set_thread_affinity(desc);
ret = 0;
}
@@ -199,6 +290,63 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
return ret;
}
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+static inline int irq_set_affinity_pending(struct irq_data *data,
+ const struct cpumask *dest)
+{
+ struct irq_desc *desc = irq_data_to_desc(data);
+
+ irqd_set_move_pending(data);
+ irq_copy_pending(desc, dest);
+ return 0;
+}
+#else
+static inline int irq_set_affinity_pending(struct irq_data *data,
+ const struct cpumask *dest)
+{
+ return -EBUSY;
+}
+#endif
+
+static int irq_try_set_affinity(struct irq_data *data,
+ const struct cpumask *dest, bool force)
+{
+ int ret = irq_do_set_affinity(data, dest, force);
+
+ /*
+ * In case that the underlying vector management is busy and the
+ * architecture supports the generic pending mechanism then utilize
+ * this to avoid returning an error to user space.
+ */
+ if (ret == -EBUSY && !force)
+ ret = irq_set_affinity_pending(data, dest);
+ return ret;
+}
+
+static bool irq_set_affinity_deactivated(struct irq_data *data,
+ const struct cpumask *mask)
+{
+ struct irq_desc *desc = irq_data_to_desc(data);
+
+ /*
+ * Handle irq chips which can handle affinity only in activated
+ * state correctly
+ *
+ * If the interrupt is not yet activated, just store the affinity
+ * mask and do not call the chip driver at all. On activation the
+ * driver has to make sure anyway that the interrupt is in a
+ * usable state so startup works.
+ */
+ if (!IS_ENABLED(CONFIG_IRQ_DOMAIN_HIERARCHY) ||
+ irqd_is_activated(data) || !irqd_affinity_on_activate(data))
+ return false;
+
+ cpumask_copy(desc->irq_common_data.affinity, mask);
+ irq_data_update_effective_affinity(data, mask);
+ irqd_set(data, IRQD_AFFINITY_SET);
+ return true;
+}
+
int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask,
bool force)
{
@@ -209,8 +357,11 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask,
if (!chip || !chip->irq_set_affinity)
return -EINVAL;
- if (irq_can_move_pcntxt(data)) {
- ret = irq_do_set_affinity(data, mask, force);
+ if (irq_set_affinity_deactivated(data, mask))
+ return 0;
+
+ if (irq_can_move_pcntxt(data) && !irqd_is_setaffinity_pending(data)) {
+ ret = irq_try_set_affinity(data, mask, force);
} else {
irqd_set_move_pending(data);
irq_copy_pending(desc, mask);
@@ -218,61 +369,147 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask,
if (desc->affinity_notify) {
kref_get(&desc->affinity_notify->kref);
- schedule_work(&desc->affinity_notify->work);
+ if (!schedule_work(&desc->affinity_notify->work)) {
+ /* Work was already scheduled, drop our extra ref */
+ kref_put(&desc->affinity_notify->kref,
+ desc->affinity_notify->release);
+ }
}
irqd_set(data, IRQD_AFFINITY_SET);
return ret;
}
-int __irq_set_affinity(unsigned int irq, const struct cpumask *mask, bool force)
+/**
+ * irq_update_affinity_desc - Update affinity management for an interrupt
+ * @irq: The interrupt number to update
+ * @affinity: Pointer to the affinity descriptor
+ *
+ * This interface can be used to configure the affinity management of
+ * interrupts which have been allocated already.
+ *
+ * There are certain limitations on when it may be used - attempts to use it
+ * for when the kernel is configured for generic IRQ reservation mode (in
+ * config GENERIC_IRQ_RESERVATION_MODE) will fail, as it may conflict with
+ * managed/non-managed interrupt accounting. In addition, attempts to use it on
+ * an interrupt which is already started or which has already been configured
+ * as managed will also fail, as these mean invalid init state or double init.
+ */
+int irq_update_affinity_desc(unsigned int irq, struct irq_affinity_desc *affinity)
+{
+ /*
+ * Supporting this with the reservation scheme used by x86 needs
+ * some more thought. Fail it for now.
+ */
+ if (IS_ENABLED(CONFIG_GENERIC_IRQ_RESERVATION_MODE))
+ return -EOPNOTSUPP;
+
+ scoped_irqdesc_get_and_buslock(irq, 0) {
+ struct irq_desc *desc = scoped_irqdesc;
+ bool activated;
+
+ /* Requires the interrupt to be shut down */
+ if (irqd_is_started(&desc->irq_data))
+ return -EBUSY;
+
+ /* Interrupts which are already managed cannot be modified */
+ if (irqd_affinity_is_managed(&desc->irq_data))
+ return -EBUSY;
+ /*
+ * Deactivate the interrupt. That's required to undo
+ * anything an earlier activation has established.
+ */
+ activated = irqd_is_activated(&desc->irq_data);
+ if (activated)
+ irq_domain_deactivate_irq(&desc->irq_data);
+
+ if (affinity->is_managed) {
+ irqd_set(&desc->irq_data, IRQD_AFFINITY_MANAGED);
+ irqd_set(&desc->irq_data, IRQD_MANAGED_SHUTDOWN);
+ }
+
+ cpumask_copy(desc->irq_common_data.affinity, &affinity->mask);
+
+ /* Restore the activation state */
+ if (activated)
+ irq_domain_activate_irq(&desc->irq_data, false);
+ return 0;
+ }
+ return -EINVAL;
+}
+
+static int __irq_set_affinity(unsigned int irq, const struct cpumask *mask,
+ bool force)
{
struct irq_desc *desc = irq_to_desc(irq);
- unsigned long flags;
- int ret;
if (!desc)
return -EINVAL;
- raw_spin_lock_irqsave(&desc->lock, flags);
- ret = irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask, force);
- raw_spin_unlock_irqrestore(&desc->lock, flags);
- return ret;
+ guard(raw_spinlock_irqsave)(&desc->lock);
+ return irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask, force);
}
-int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
+/**
+ * irq_set_affinity - Set the irq affinity of a given irq
+ * @irq: Interrupt to set affinity
+ * @cpumask: cpumask
+ *
+ * Fails if cpumask does not contain an online CPU
+ */
+int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
+ return __irq_set_affinity(irq, cpumask, false);
+}
+EXPORT_SYMBOL_GPL(irq_set_affinity);
- if (!desc)
- return -EINVAL;
- desc->affinity_hint = m;
- irq_put_desc_unlock(desc, flags);
- /* set the initial affinity to prevent every interrupt being on CPU0 */
- if (m)
+/**
+ * irq_force_affinity - Force the irq affinity of a given irq
+ * @irq: Interrupt to set affinity
+ * @cpumask: cpumask
+ *
+ * Same as irq_set_affinity, but without checking the mask against
+ * online cpus.
+ *
+ * Solely for low level cpu hotplug code, where we need to make per
+ * cpu interrupts affine before the cpu becomes online.
+ */
+int irq_force_affinity(unsigned int irq, const struct cpumask *cpumask)
+{
+ return __irq_set_affinity(irq, cpumask, true);
+}
+EXPORT_SYMBOL_GPL(irq_force_affinity);
+
+int __irq_apply_affinity_hint(unsigned int irq, const struct cpumask *m, bool setaffinity)
+{
+ int ret = -EINVAL;
+
+ scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_GLOBAL) {
+ scoped_irqdesc->affinity_hint = m;
+ ret = 0;
+ }
+
+ if (!ret && m && setaffinity)
__irq_set_affinity(irq, m, false);
- return 0;
+ return ret;
}
-EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
+EXPORT_SYMBOL_GPL(__irq_apply_affinity_hint);
static void irq_affinity_notify(struct work_struct *work)
{
- struct irq_affinity_notify *notify =
- container_of(work, struct irq_affinity_notify, work);
+ struct irq_affinity_notify *notify = container_of(work, struct irq_affinity_notify, work);
struct irq_desc *desc = irq_to_desc(notify->irq);
cpumask_var_t cpumask;
- unsigned long flags;
if (!desc || !alloc_cpumask_var(&cpumask, GFP_KERNEL))
goto out;
- raw_spin_lock_irqsave(&desc->lock, flags);
- if (irq_move_pending(&desc->irq_data))
- irq_get_pending(cpumask, desc);
- else
- cpumask_copy(cpumask, desc->irq_data.affinity);
- raw_spin_unlock_irqrestore(&desc->lock, flags);
+ scoped_guard(raw_spinlock_irqsave, &desc->lock) {
+ if (irq_move_pending(&desc->irq_data))
+ irq_get_pending(cpumask, desc);
+ else
+ cpumask_copy(cpumask, desc->irq_common_data.affinity);
+ }
notify->notify(notify, cpumask);
@@ -282,27 +519,25 @@ out:
}
/**
- * irq_set_affinity_notifier - control notification of IRQ affinity changes
- * @irq: Interrupt for which to enable/disable notification
- * @notify: Context for notification, or %NULL to disable
- * notification. Function pointers must be initialised;
- * the other fields will be initialised by this function.
- *
- * Must be called in process context. Notification may only be enabled
- * after the IRQ is allocated and must be disabled before the IRQ is
- * freed using free_irq().
+ * irq_set_affinity_notifier - control notification of IRQ affinity changes
+ * @irq: Interrupt for which to enable/disable notification
+ * @notify: Context for notification, or %NULL to disable
+ * notification. Function pointers must be initialised;
+ * the other fields will be initialised by this function.
+ *
+ * Must be called in process context. Notification may only be enabled
+ * after the IRQ is allocated and must be disabled before the IRQ is freed
+ * using free_irq().
*/
-int
-irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
+int irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
{
struct irq_desc *desc = irq_to_desc(irq);
struct irq_affinity_notify *old_notify;
- unsigned long flags;
/* The release function is promised process context */
might_sleep();
- if (!desc)
+ if (!desc || irq_is_nmi(desc))
return -EINVAL;
/* Complete initialisation of *notify */
@@ -312,13 +547,18 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
INIT_WORK(&notify->work, irq_affinity_notify);
}
- raw_spin_lock_irqsave(&desc->lock, flags);
- old_notify = desc->affinity_notify;
- desc->affinity_notify = notify;
- raw_spin_unlock_irqrestore(&desc->lock, flags);
+ scoped_guard(raw_spinlock_irq, &desc->lock) {
+ old_notify = desc->affinity_notify;
+ desc->affinity_notify = notify;
+ }
- if (old_notify)
+ if (old_notify) {
+ if (cancel_work_sync(&old_notify->work)) {
+ /* Pending work had a ref, put that one too */
+ kref_put(&old_notify->kref, old_notify->release);
+ }
kref_put(&old_notify->kref, old_notify->release);
+ }
return 0;
}
@@ -328,71 +568,90 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier);
/*
* Generic version of the affinity autoselector.
*/
-static int
-setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
+int irq_setup_affinity(struct irq_desc *desc)
{
struct cpumask *set = irq_default_affinity;
- int node = desc->irq_data.node;
+ int node = irq_desc_get_node(desc);
+
+ static DEFINE_RAW_SPINLOCK(mask_lock);
+ static struct cpumask mask;
/* Excludes PER_CPU and NO_BALANCE interrupts */
- if (!irq_can_set_affinity(irq))
+ if (!__irq_can_set_affinity(desc))
return 0;
+ guard(raw_spinlock)(&mask_lock);
/*
- * Preserve an userspace affinity setup, but make sure that
- * one of the targets is online.
+ * Preserve the managed affinity setting and a userspace affinity
+ * setup, but make sure that one of the targets is online.
*/
- if (irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) {
- if (cpumask_intersects(desc->irq_data.affinity,
+ if (irqd_affinity_is_managed(&desc->irq_data) ||
+ irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) {
+ if (cpumask_intersects(desc->irq_common_data.affinity,
cpu_online_mask))
- set = desc->irq_data.affinity;
+ set = desc->irq_common_data.affinity;
else
irqd_clear(&desc->irq_data, IRQD_AFFINITY_SET);
}
- cpumask_and(mask, cpu_online_mask, set);
+ cpumask_and(&mask, cpu_online_mask, set);
+ if (cpumask_empty(&mask))
+ cpumask_copy(&mask, cpu_online_mask);
+
if (node != NUMA_NO_NODE) {
const struct cpumask *nodemask = cpumask_of_node(node);
/* make sure at least one of the cpus in nodemask is online */
- if (cpumask_intersects(mask, nodemask))
- cpumask_and(mask, mask, nodemask);
+ if (cpumask_intersects(&mask, nodemask))
+ cpumask_and(&mask, &mask, nodemask);
}
- irq_do_set_affinity(&desc->irq_data, mask, false);
- return 0;
+ return irq_do_set_affinity(&desc->irq_data, &mask, false);
}
#else
-static inline int
-setup_affinity(unsigned int irq, struct irq_desc *d, struct cpumask *mask)
+/* Wrapper for ALPHA specific affinity selector magic */
+int irq_setup_affinity(struct irq_desc *desc)
{
- return irq_select_affinity(irq);
+ return irq_select_affinity(irq_desc_get_irq(desc));
}
-#endif
+#endif /* CONFIG_AUTO_IRQ_AFFINITY */
+#endif /* CONFIG_SMP */
-/*
- * Called when affinity is set via /proc/irq
+
+/**
+ * irq_set_vcpu_affinity - Set vcpu affinity for the interrupt
+ * @irq: interrupt number to set affinity
+ * @vcpu_info: vCPU specific data or pointer to a percpu array of vCPU
+ * specific data for percpu_devid interrupts
+ *
+ * This function uses the vCPU specific data to set the vCPU affinity for
+ * an irq. The vCPU specific data is passed from outside, such as KVM. One
+ * example code path is as below: KVM -> IOMMU -> irq_set_vcpu_affinity().
*/
-int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask)
+int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info)
{
- struct irq_desc *desc = irq_to_desc(irq);
- unsigned long flags;
- int ret;
+ scoped_irqdesc_get_and_lock(irq, 0) {
+ struct irq_desc *desc = scoped_irqdesc;
+ struct irq_data *data;
+ struct irq_chip *chip;
- raw_spin_lock_irqsave(&desc->lock, flags);
- ret = setup_affinity(irq, desc, mask);
- raw_spin_unlock_irqrestore(&desc->lock, flags);
- return ret;
-}
+ data = irq_desc_get_irq_data(desc);
+ do {
+ chip = irq_data_get_irq_chip(data);
+ if (chip && chip->irq_set_vcpu_affinity)
+ break;
-#else
-static inline int
-setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
-{
- return 0;
+ data = irqd_get_parent_data(data);
+ } while (data);
+
+ if (!data)
+ return -ENOSYS;
+ return chip->irq_set_vcpu_affinity(data, vcpu_info);
+ }
+ return -EINVAL;
}
-#endif
+EXPORT_SYMBOL_GPL(irq_set_vcpu_affinity);
-void __disable_irq(struct irq_desc *desc, unsigned int irq)
+void __disable_irq(struct irq_desc *desc)
{
if (!desc->depth++)
irq_disable(desc);
@@ -400,26 +659,23 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq)
static int __disable_irq_nosync(unsigned int irq)
{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
-
- if (!desc)
- return -EINVAL;
- __disable_irq(desc, irq);
- irq_put_desc_busunlock(desc, flags);
- return 0;
+ scoped_irqdesc_get_and_buslock(irq, IRQ_GET_DESC_CHECK_GLOBAL) {
+ __disable_irq(scoped_irqdesc);
+ return 0;
+ }
+ return -EINVAL;
}
/**
- * disable_irq_nosync - disable an irq without waiting
- * @irq: Interrupt to disable
+ * disable_irq_nosync - disable an irq without waiting
+ * @irq: Interrupt to disable
*
- * Disable the selected interrupt line. Disables and Enables are
- * nested.
- * Unlike disable_irq(), this function does not ensure existing
- * instances of the IRQ handler have completed before returning.
+ * Disable the selected interrupt line. Disables and Enables are
+ * nested.
+ * Unlike disable_irq(), this function does not ensure existing
+ * instances of the IRQ handler have completed before returning.
*
- * This function may be called from IRQ context.
+ * This function may be called from IRQ context.
*/
void disable_irq_nosync(unsigned int irq)
{
@@ -428,65 +684,93 @@ void disable_irq_nosync(unsigned int irq)
EXPORT_SYMBOL(disable_irq_nosync);
/**
- * disable_irq - disable an irq and wait for completion
- * @irq: Interrupt to disable
+ * disable_irq - disable an irq and wait for completion
+ * @irq: Interrupt to disable
+ *
+ * Disable the selected interrupt line. Enables and Disables are nested.
+ *
+ * This function waits for any pending IRQ handlers for this interrupt to
+ * complete before returning. If you use this function while holding a
+ * resource the IRQ handler may need you will deadlock.
*
- * Disable the selected interrupt line. Enables and Disables are
- * nested.
- * This function waits for any pending IRQ handlers for this interrupt
- * to complete before returning. If you use this function while
- * holding a resource the IRQ handler may need you will deadlock.
+ * Can only be called from preemptible code as it might sleep when an
+ * interrupt thread is associated to @irq.
*
- * This function may be called - with care - from IRQ context.
*/
void disable_irq(unsigned int irq)
{
+ might_sleep();
if (!__disable_irq_nosync(irq))
synchronize_irq(irq);
}
EXPORT_SYMBOL(disable_irq);
/**
- * disable_hardirq - disables an irq and waits for hardirq completion
- * @irq: Interrupt to disable
+ * disable_hardirq - disables an irq and waits for hardirq completion
+ * @irq: Interrupt to disable
+ *
+ * Disable the selected interrupt line. Enables and Disables are nested.
*
- * Disable the selected interrupt line. Enables and Disables are
- * nested.
- * This function waits for any pending hard IRQ handlers for this
- * interrupt to complete before returning. If you use this function while
- * holding a resource the hard IRQ handler may need you will deadlock.
+ * This function waits for any pending hard IRQ handlers for this interrupt
+ * to complete before returning. If you use this function while holding a
+ * resource the hard IRQ handler may need you will deadlock.
*
- * When used to optimistically disable an interrupt from atomic context
- * the return value must be checked.
+ * When used to optimistically disable an interrupt from atomic context the
+ * return value must be checked.
*
- * Returns: false if a threaded handler is active.
+ * Returns: false if a threaded handler is active.
*
- * This function may be called - with care - from IRQ context.
+ * This function may be called - with care - from IRQ context.
*/
bool disable_hardirq(unsigned int irq)
{
if (!__disable_irq_nosync(irq))
return synchronize_hardirq(irq);
-
return false;
}
EXPORT_SYMBOL_GPL(disable_hardirq);
-void __enable_irq(struct irq_desc *desc, unsigned int irq)
+/**
+ * disable_nmi_nosync - disable an nmi without waiting
+ * @irq: Interrupt to disable
+ *
+ * Disable the selected interrupt line. Disables and enables are nested.
+ *
+ * The interrupt to disable must have been requested through request_nmi.
+ * Unlike disable_nmi(), this function does not ensure existing
+ * instances of the IRQ handler have completed before returning.
+ */
+void disable_nmi_nosync(unsigned int irq)
+{
+ disable_irq_nosync(irq);
+}
+
+void __enable_irq(struct irq_desc *desc)
{
switch (desc->depth) {
case 0:
err_out:
- WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq);
+ WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n",
+ irq_desc_get_irq(desc));
break;
case 1: {
if (desc->istate & IRQS_SUSPENDED)
goto err_out;
/* Prevent probing on this irq: */
irq_settings_set_noprobe(desc);
- irq_enable(desc);
- check_irq_resend(desc, irq);
- /* fall-through */
+ /*
+ * Call irq_startup() not irq_enable() here because the
+ * interrupt might be marked NOAUTOEN so irq_startup()
+ * needs to be invoked when it gets enabled the first time.
+ * This is also required when __enable_irq() is invoked for
+ * a managed and shutdown interrupt from the S3 resume
+ * path.
+ *
+ * If it was already started up, then irq_startup() will
+ * invoke irq_enable() under the hood.
+ */
+ irq_startup(desc, IRQ_RESEND, IRQ_START_FORCE);
+ break;
}
default:
desc->depth--;
@@ -494,33 +778,40 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq)
}
/**
- * enable_irq - enable handling of an irq
- * @irq: Interrupt to enable
+ * enable_irq - enable handling of an irq
+ * @irq: Interrupt to enable
*
- * Undoes the effect of one call to disable_irq(). If this
- * matches the last disable, processing of interrupts on this
- * IRQ line is re-enabled.
+ * Undoes the effect of one call to disable_irq(). If this matches the
+ * last disable, processing of interrupts on this IRQ line is re-enabled.
*
- * This function may be called from IRQ context only when
- * desc->irq_data.chip->bus_lock and desc->chip->bus_sync_unlock are NULL !
+ * This function may be called from IRQ context only when
+ * desc->irq_data.chip->bus_lock and desc->chip->bus_sync_unlock are NULL !
*/
void enable_irq(unsigned int irq)
{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
-
- if (!desc)
- return;
- if (WARN(!desc->irq_data.chip,
- KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq))
- goto out;
+ scoped_irqdesc_get_and_buslock(irq, IRQ_GET_DESC_CHECK_GLOBAL) {
+ struct irq_desc *desc = scoped_irqdesc;
- __enable_irq(desc, irq);
-out:
- irq_put_desc_busunlock(desc, flags);
+ if (WARN(!desc->irq_data.chip, "enable_irq before setup/request_irq: irq %u\n", irq))
+ return;
+ __enable_irq(desc);
+ }
}
EXPORT_SYMBOL(enable_irq);
+/**
+ * enable_nmi - enable handling of an nmi
+ * @irq: Interrupt to enable
+ *
+ * The interrupt to enable must have been requested through request_nmi.
+ * Undoes the effect of one call to disable_nmi(). If this matches the last
+ * disable, processing of interrupts on this IRQ line is re-enabled.
+ */
+void enable_nmi(unsigned int irq)
+{
+ enable_irq(irq);
+}
+
static int set_irq_wake_real(unsigned int irq, unsigned int on)
{
struct irq_desc *desc = irq_to_desc(irq);
@@ -536,50 +827,59 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on)
}
/**
- * irq_set_irq_wake - control irq power management wakeup
- * @irq: interrupt to control
- * @on: enable/disable power management wakeup
+ * irq_set_irq_wake - control irq power management wakeup
+ * @irq: interrupt to control
+ * @on: enable/disable power management wakeup
+ *
+ * Enable/disable power management wakeup mode, which is disabled by
+ * default. Enables and disables must match, just as they match for
+ * non-wakeup mode support.
*
- * Enable/disable power management wakeup mode, which is
- * disabled by default. Enables and disables must match,
- * just as they match for non-wakeup mode support.
+ * Wakeup mode lets this IRQ wake the system from sleep states like
+ * "suspend to RAM".
*
- * Wakeup mode lets this IRQ wake the system from sleep
- * states like "suspend to RAM".
+ * Note: irq enable/disable state is completely orthogonal to the
+ * enable/disable state of irq wake. An irq can be disabled with
+ * disable_irq() and still wake the system as long as the irq has wake
+ * enabled. If this does not hold, then the underlying irq chip and the
+ * related driver need to be investigated.
*/
int irq_set_irq_wake(unsigned int irq, unsigned int on)
{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
- int ret = 0;
+ scoped_irqdesc_get_and_buslock(irq, IRQ_GET_DESC_CHECK_GLOBAL) {
+ struct irq_desc *desc = scoped_irqdesc;
+ int ret = 0;
- if (!desc)
- return -EINVAL;
+ /* Don't use NMIs as wake up interrupts please */
+ if (irq_is_nmi(desc))
+ return -EINVAL;
- /* wakeup-capable irqs can be shared between drivers that
- * don't need to have the same sleep mode behaviors.
- */
- if (on) {
- if (desc->wake_depth++ == 0) {
- ret = set_irq_wake_real(irq, on);
- if (ret)
- desc->wake_depth = 0;
- else
- irqd_set(&desc->irq_data, IRQD_WAKEUP_STATE);
- }
- } else {
- if (desc->wake_depth == 0) {
- WARN(1, "Unbalanced IRQ %d wake disable\n", irq);
- } else if (--desc->wake_depth == 0) {
- ret = set_irq_wake_real(irq, on);
- if (ret)
- desc->wake_depth = 1;
- else
- irqd_clear(&desc->irq_data, IRQD_WAKEUP_STATE);
+ /*
+ * wakeup-capable irqs can be shared between drivers that
+ * don't need to have the same sleep mode behaviors.
+ */
+ if (on) {
+ if (desc->wake_depth++ == 0) {
+ ret = set_irq_wake_real(irq, on);
+ if (ret)
+ desc->wake_depth = 0;
+ else
+ irqd_set(&desc->irq_data, IRQD_WAKEUP_STATE);
+ }
+ } else {
+ if (desc->wake_depth == 0) {
+ WARN(1, "Unbalanced IRQ %d wake disable\n", irq);
+ } else if (--desc->wake_depth == 0) {
+ ret = set_irq_wake_real(irq, on);
+ if (ret)
+ desc->wake_depth = 1;
+ else
+ irqd_clear(&desc->irq_data, IRQD_WAKEUP_STATE);
+ }
}
+ return ret;
}
- irq_put_desc_busunlock(desc, flags);
- return ret;
+ return -EINVAL;
}
EXPORT_SYMBOL(irq_set_irq_wake);
@@ -588,26 +888,20 @@ EXPORT_SYMBOL(irq_set_irq_wake);
* particular irq has been exclusively allocated or is available
* for driver use.
*/
-int can_request_irq(unsigned int irq, unsigned long irqflags)
+bool can_request_irq(unsigned int irq, unsigned long irqflags)
{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
- int canrequest = 0;
-
- if (!desc)
- return 0;
+ scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_GLOBAL) {
+ struct irq_desc *desc = scoped_irqdesc;
- if (irq_settings_can_request(desc)) {
- if (!desc->action ||
- irqflags & desc->action->flags & IRQF_SHARED)
- canrequest = 1;
+ if (irq_settings_can_request(desc)) {
+ if (!desc->action || irqflags & desc->action->flags & IRQF_SHARED)
+ return true;
+ }
}
- irq_put_desc_unlock(desc, flags);
- return canrequest;
+ return false;
}
-int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
- unsigned long flags)
+int __irq_set_trigger(struct irq_desc *desc, unsigned long flags)
{
struct irq_chip *chip = desc->irq_data.chip;
int ret, unmask = 0;
@@ -617,13 +911,12 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
* IRQF_TRIGGER_* but the PIC does not support multiple
* flow-types?
*/
- pr_debug("No set_type function for IRQ %d (%s)\n", irq,
+ pr_debug("No set_type function for IRQ %d (%s)\n",
+ irq_desc_get_irq(desc),
chip ? (chip->name ? : "unknown") : "unknown");
return 0;
}
- flags &= IRQ_TYPE_SENSE_MASK;
-
if (chip->flags & IRQCHIP_SET_TYPE_MASKED) {
if (!irqd_irq_masked(&desc->irq_data))
mask_irq(desc);
@@ -631,7 +924,8 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
unmask = 1;
}
- /* caller masked out all except trigger mode flags */
+ /* Mask all flags except trigger mode */
+ flags &= IRQ_TYPE_SENSE_MASK;
ret = chip->irq_set_type(&desc->irq_data, flags);
switch (ret) {
@@ -639,6 +933,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
case IRQ_SET_MASK_OK_DONE:
irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK);
irqd_set(&desc->irq_data, flags);
+ fallthrough;
case IRQ_SET_MASK_OK_NOCOPY:
flags = irqd_get_trigger_type(&desc->irq_data);
@@ -653,8 +948,8 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
ret = 0;
break;
default:
- pr_err("Setting trigger mode %lu for irq %u failed (%pF)\n",
- flags, irq, chip->irq_set_type);
+ pr_err("Setting trigger mode %lu for irq %u failed (%pS)\n",
+ flags, irq_desc_get_irq(desc), chip->irq_set_type);
}
if (unmask)
unmask_irq(desc);
@@ -664,17 +959,13 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
#ifdef CONFIG_HARDIRQS_SW_RESEND
int irq_set_parent(int irq, int parent_irq)
{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
-
- if (!desc)
- return -EINVAL;
-
- desc->parent_irq = parent_irq;
-
- irq_put_desc_unlock(desc, flags);
- return 0;
+ scoped_irqdesc_get_and_lock(irq, 0) {
+ scoped_irqdesc->parent_irq = parent_irq;
+ return 0;
+ }
+ return -EINVAL;
}
+EXPORT_SYMBOL_GPL(irq_set_parent);
#endif
/*
@@ -697,11 +988,65 @@ static irqreturn_t irq_nested_primary_handler(int irq, void *dev_id)
return IRQ_NONE;
}
-static int irq_wait_for_interrupt(struct irqaction *action)
+static irqreturn_t irq_forced_secondary_handler(int irq, void *dev_id)
+{
+ WARN(1, "Secondary action handler called for irq %d\n", irq);
+ return IRQ_NONE;
+}
+
+#ifdef CONFIG_SMP
+/*
+ * Check whether we need to change the affinity of the interrupt thread.
+ */
+static void irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
+{
+ cpumask_var_t mask;
+
+ if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags))
+ return;
+
+ __set_current_state(TASK_RUNNING);
+
+ /*
+ * In case we are out of memory we set IRQTF_AFFINITY again and
+ * try again next time
+ */
+ if (!alloc_cpumask_var(&mask, GFP_KERNEL)) {
+ set_bit(IRQTF_AFFINITY, &action->thread_flags);
+ return;
+ }
+
+ scoped_guard(raw_spinlock_irq, &desc->lock) {
+ const struct cpumask *m;
+
+ m = irq_data_get_effective_affinity_mask(&desc->irq_data);
+ cpumask_copy(mask, m);
+ }
+
+ set_cpus_allowed_ptr(current, mask);
+ free_cpumask_var(mask);
+}
+#else
+static inline void irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
+#endif
+
+static int irq_wait_for_interrupt(struct irq_desc *desc,
+ struct irqaction *action)
{
- set_current_state(TASK_INTERRUPTIBLE);
+ for (;;) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ irq_thread_check_affinity(desc, action);
- while (!kthread_should_stop()) {
+ if (kthread_should_stop()) {
+ /* may need to run one last time */
+ if (test_and_clear_bit(IRQTF_RUNTHREAD,
+ &action->thread_flags)) {
+ __set_current_state(TASK_RUNNING);
+ return 0;
+ }
+ __set_current_state(TASK_RUNNING);
+ return -1;
+ }
if (test_and_clear_bit(IRQTF_RUNTHREAD,
&action->thread_flags)) {
@@ -709,10 +1054,7 @@ static int irq_wait_for_interrupt(struct irqaction *action)
return 0;
}
schedule();
- set_current_state(TASK_INTERRUPTIBLE);
}
- __set_current_state(TASK_RUNNING);
- return -1;
}
/*
@@ -723,7 +1065,8 @@ static int irq_wait_for_interrupt(struct irqaction *action)
static void irq_finalize_oneshot(struct irq_desc *desc,
struct irqaction *action)
{
- if (!(desc->istate & IRQS_ONESHOT))
+ if (!(desc->istate & IRQS_ONESHOT) ||
+ action->handler == irq_forced_secondary_handler)
return;
again:
chip_bus_lock(desc);
@@ -739,7 +1082,7 @@ again:
* to IRQS_INPROGRESS and the irq line is masked forever.
*
* This also serializes the state of shared oneshot handlers
- * versus "desc->threads_onehsot |= action->thread_mask;" in
+ * versus "desc->threads_oneshot |= action->thread_mask;" in
* irq_wake_thread(). See the comment there which explains the
* serialization.
*/
@@ -769,82 +1112,43 @@ out_unlock:
chip_bus_sync_unlock(desc);
}
-#ifdef CONFIG_SMP
/*
- * Check whether we need to change the affinity of the interrupt thread.
+ * Interrupts explicitly requested as threaded interrupts want to be
+ * preemptible - many of them need to sleep and wait for slow busses to
+ * complete.
*/
-static void
-irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
+static irqreturn_t irq_thread_fn(struct irq_desc *desc, struct irqaction *action)
{
- cpumask_var_t mask;
- bool valid = true;
-
- if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags))
- return;
-
- /*
- * In case we are out of memory we set IRQTF_AFFINITY again and
- * try again next time
- */
- if (!alloc_cpumask_var(&mask, GFP_KERNEL)) {
- set_bit(IRQTF_AFFINITY, &action->thread_flags);
- return;
- }
+ irqreturn_t ret = action->thread_fn(action->irq, action->dev_id);
- raw_spin_lock_irq(&desc->lock);
- /*
- * This code is triggered unconditionally. Check the affinity
- * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out.
- */
- if (desc->irq_data.affinity)
- cpumask_copy(mask, desc->irq_data.affinity);
- else
- valid = false;
- raw_spin_unlock_irq(&desc->lock);
+ if (ret == IRQ_HANDLED)
+ atomic_inc(&desc->threads_handled);
- if (valid)
- set_cpus_allowed_ptr(current, mask);
- free_cpumask_var(mask);
+ irq_finalize_oneshot(desc, action);
+ return ret;
}
-#else
-static inline void
-irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
-#endif
/*
- * Interrupts which are not explicitely requested as threaded
+ * Interrupts which are not explicitly requested as threaded
* interrupts rely on the implicit bh/preempt disable of the hard irq
* context. So we need to disable bh here to avoid deadlocks and other
* side effects.
*/
-static irqreturn_t
-irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
+static irqreturn_t irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
{
irqreturn_t ret;
local_bh_disable();
- ret = action->thread_fn(action->irq, action->dev_id);
- irq_finalize_oneshot(desc, action);
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_disable();
+ ret = irq_thread_fn(desc, action);
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_enable();
local_bh_enable();
return ret;
}
-/*
- * Interrupts explicitly requested as threaded interrupts want to be
- * preemtible - many of them need to sleep and wait for slow busses to
- * complete.
- */
-static irqreturn_t irq_thread_fn(struct irq_desc *desc,
- struct irqaction *action)
-{
- irqreturn_t ret;
-
- ret = action->thread_fn(action->irq, action->dev_id);
- irq_finalize_oneshot(desc, action);
- return ret;
-}
-
-static void wake_threads_waitq(struct irq_desc *desc)
+void wake_threads_waitq(struct irq_desc *desc)
{
if (atomic_dec_and_test(&desc->threads_active))
wake_up(&desc->wait_for_threads);
@@ -877,6 +1181,42 @@ static void irq_thread_dtor(struct callback_head *unused)
irq_finalize_oneshot(desc, action);
}
+static void irq_wake_secondary(struct irq_desc *desc, struct irqaction *action)
+{
+ struct irqaction *secondary = action->secondary;
+
+ if (WARN_ON_ONCE(!secondary))
+ return;
+
+ guard(raw_spinlock_irq)(&desc->lock);
+ __irq_wake_thread(desc, secondary);
+}
+
+/*
+ * Internal function to notify that a interrupt thread is ready.
+ */
+static void irq_thread_set_ready(struct irq_desc *desc,
+ struct irqaction *action)
+{
+ set_bit(IRQTF_READY, &action->thread_flags);
+ wake_up(&desc->wait_for_threads);
+}
+
+/*
+ * Internal function to wake up a interrupt thread and wait until it is
+ * ready.
+ */
+static void wake_up_and_wait_for_irq_thread_ready(struct irq_desc *desc,
+ struct irqaction *action)
+{
+ if (!action || !action->thread)
+ return;
+
+ wake_up_process(action->thread);
+ wait_event(desc->wait_for_threads,
+ test_bit(IRQTF_READY, &action->thread_flags));
+}
+
/*
* Interrupt handler thread
*/
@@ -888,25 +1228,28 @@ static int irq_thread(void *data)
irqreturn_t (*handler_fn)(struct irq_desc *desc,
struct irqaction *action);
- if (force_irqthreads && test_bit(IRQTF_FORCED_THREAD,
- &action->thread_flags))
+ irq_thread_set_ready(desc, action);
+
+ if (action->handler == irq_forced_secondary_handler)
+ sched_set_fifo_secondary(current);
+ else
+ sched_set_fifo(current);
+
+ if (force_irqthreads() && test_bit(IRQTF_FORCED_THREAD,
+ &action->thread_flags))
handler_fn = irq_forced_thread_fn;
else
handler_fn = irq_thread_fn;
init_task_work(&on_exit_work, irq_thread_dtor);
- task_work_add(current, &on_exit_work, false);
-
- irq_thread_check_affinity(desc, action);
+ task_work_add(current, &on_exit_work, TWA_NONE);
- while (!irq_wait_for_interrupt(action)) {
+ while (!irq_wait_for_interrupt(desc, action)) {
irqreturn_t action_ret;
- irq_thread_check_affinity(desc, action);
-
action_ret = handler_fn(desc, action);
- if (action_ret == IRQ_HANDLED)
- atomic_inc(&desc->threads_handled);
+ if (action_ret == IRQ_WAKE_THREAD)
+ irq_wake_secondary(desc, action);
wake_threads_waitq(desc);
}
@@ -914,57 +1257,74 @@ static int irq_thread(void *data)
/*
* This is the regular exit path. __free_irq() is stopping the
* thread via kthread_stop() after calling
- * synchronize_irq(). So neither IRQTF_RUNTHREAD nor the
- * oneshot mask bit can be set. We cannot verify that as we
- * cannot touch the oneshot mask at this point anymore as
- * __setup_irq() might have given out currents thread_mask
- * again.
+ * synchronize_hardirq(). So neither IRQTF_RUNTHREAD nor the
+ * oneshot mask bit can be set.
*/
- task_work_cancel(current, irq_thread_dtor);
+ task_work_cancel_func(current, irq_thread_dtor);
return 0;
}
/**
- * irq_wake_thread - wake the irq thread for the action identified by dev_id
- * @irq: Interrupt line
- * @dev_id: Device identity for which the thread should be woken
- *
+ * irq_wake_thread - wake the irq thread for the action identified by dev_id
+ * @irq: Interrupt line
+ * @dev_id: Device identity for which the thread should be woken
*/
void irq_wake_thread(unsigned int irq, void *dev_id)
{
struct irq_desc *desc = irq_to_desc(irq);
struct irqaction *action;
- unsigned long flags;
if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc)))
return;
- raw_spin_lock_irqsave(&desc->lock, flags);
- for (action = desc->action; action; action = action->next) {
+ guard(raw_spinlock_irqsave)(&desc->lock);
+ for_each_action_of_desc(desc, action) {
if (action->dev_id == dev_id) {
if (action->thread)
__irq_wake_thread(desc, action);
break;
}
}
- raw_spin_unlock_irqrestore(&desc->lock, flags);
}
EXPORT_SYMBOL_GPL(irq_wake_thread);
-static void irq_setup_forced_threading(struct irqaction *new)
+static int irq_setup_forced_threading(struct irqaction *new)
{
- if (!force_irqthreads)
- return;
+ if (!force_irqthreads())
+ return 0;
if (new->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT))
- return;
+ return 0;
+
+ /*
+ * No further action required for interrupts which are requested as
+ * threaded interrupts already
+ */
+ if (new->handler == irq_default_primary_handler)
+ return 0;
new->flags |= IRQF_ONESHOT;
- if (!new->thread_fn) {
- set_bit(IRQTF_FORCED_THREAD, &new->thread_flags);
- new->thread_fn = new->handler;
- new->handler = irq_default_primary_handler;
+ /*
+ * Handle the case where we have a real primary handler and a
+ * thread handler. We force thread them as well by creating a
+ * secondary action.
+ */
+ if (new->handler && new->thread_fn) {
+ /* Allocate the secondary action */
+ new->secondary = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
+ if (!new->secondary)
+ return -ENOMEM;
+ new->secondary->handler = irq_forced_secondary_handler;
+ new->secondary->thread_fn = new->thread_fn;
+ new->secondary->dev_id = new->dev_id;
+ new->secondary->irq = new->irq;
+ new->secondary->name = new->name;
}
+ /* Deal with the primary handler */
+ set_bit(IRQTF_FORCED_THREAD, &new->thread_flags);
+ new->thread_fn = new->handler;
+ new->handler = irq_default_primary_handler;
+ return 0;
}
static int irq_request_resources(struct irq_desc *desc)
@@ -984,9 +1344,107 @@ static void irq_release_resources(struct irq_desc *desc)
c->irq_release_resources(d);
}
+static bool irq_supports_nmi(struct irq_desc *desc)
+{
+ struct irq_data *d = irq_desc_get_irq_data(desc);
+
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+ /* Only IRQs directly managed by the root irqchip can be set as NMI */
+ if (d->parent_data)
+ return false;
+#endif
+ /* Don't support NMIs for chips behind a slow bus */
+ if (d->chip->irq_bus_lock || d->chip->irq_bus_sync_unlock)
+ return false;
+
+ return d->chip->flags & IRQCHIP_SUPPORTS_NMI;
+}
+
+static int irq_nmi_setup(struct irq_desc *desc)
+{
+ struct irq_data *d = irq_desc_get_irq_data(desc);
+ struct irq_chip *c = d->chip;
+
+ return c->irq_nmi_setup ? c->irq_nmi_setup(d) : -EINVAL;
+}
+
+static void irq_nmi_teardown(struct irq_desc *desc)
+{
+ struct irq_data *d = irq_desc_get_irq_data(desc);
+ struct irq_chip *c = d->chip;
+
+ if (c->irq_nmi_teardown)
+ c->irq_nmi_teardown(d);
+}
+
+static int
+setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary)
+{
+ struct task_struct *t;
+
+ if (!secondary) {
+ t = kthread_create(irq_thread, new, "irq/%d-%s", irq,
+ new->name);
+ } else {
+ t = kthread_create(irq_thread, new, "irq/%d-s-%s", irq,
+ new->name);
+ }
+
+ if (IS_ERR(t))
+ return PTR_ERR(t);
+
+ /*
+ * We keep the reference to the task struct even if
+ * the thread dies to avoid that the interrupt code
+ * references an already freed task_struct.
+ */
+ new->thread = get_task_struct(t);
+
+ /*
+ * The affinity can not be established yet, but it will be once the
+ * interrupt is enabled. Delay and defer the actual setting to the
+ * thread itself once it is ready to run. In the meantime, prevent
+ * it from ever being re-affined directly by cpuset or
+ * housekeeping. The proper way to do it is to re-affine the whole
+ * vector.
+ */
+ kthread_bind_mask(t, cpu_possible_mask);
+
+ /*
+ * Ensure the thread adjusts the affinity once it reaches the
+ * thread function.
+ */
+ new->thread_flags = BIT(IRQTF_AFFINITY);
+
+ return 0;
+}
+
+static bool valid_percpu_irqaction(struct irqaction *old, struct irqaction *new)
+{
+ do {
+ if (cpumask_intersects(old->affinity, new->affinity) ||
+ old->percpu_dev_id == new->percpu_dev_id)
+ return false;
+
+ old = old->next;
+ } while (old);
+
+ return true;
+}
+
/*
* Internal function to register an irqaction - typically used to
* allocate special interrupts that are part of the architecture.
+ *
+ * Locking rules:
+ *
+ * desc->request_mutex Provides serialization against a concurrent free_irq()
+ * chip_bus_lock Provides serialization for slow bus operations
+ * desc->lock Provides serialization against hard interrupts
+ *
+ * chip_bus_lock and desc->lock are sufficient for all other management and
+ * interrupt related functions. desc->request_mutex solely serializes
+ * request/free_irq().
*/
static int
__setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
@@ -994,7 +1452,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
struct irqaction *old, **old_ptr;
unsigned long flags, thread_mask = 0;
int ret, nested, shared = 0;
- cpumask_var_t mask;
+ bool per_cpu_devid;
if (!desc)
return -EINVAL;
@@ -1004,6 +1462,17 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
if (!try_module_get(desc->owner))
return -ENODEV;
+ per_cpu_devid = irq_settings_is_per_cpu_devid(desc);
+
+ new->irq = irq;
+
+ /*
+ * If the trigger type is not specified by the caller,
+ * then use the default for this interrupt.
+ */
+ if (!(new->flags & IRQF_TRIGGER_MASK))
+ new->flags |= irqd_get_trigger_type(&desc->irq_data);
+
/*
* Check whether the interrupt nests into another interrupt
* thread.
@@ -1021,8 +1490,11 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
*/
new->handler = irq_nested_primary_handler;
} else {
- if (irq_settings_can_thread(desc))
- irq_setup_forced_threading(new);
+ if (irq_settings_can_thread(desc)) {
+ ret = irq_setup_forced_threading(new);
+ if (ret)
+ goto out_mput;
+ }
}
/*
@@ -1031,42 +1503,14 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
* thread.
*/
if (new->thread_fn && !nested) {
- struct task_struct *t;
- static const struct sched_param param = {
- .sched_priority = MAX_USER_RT_PRIO/2,
- };
-
- t = kthread_create(irq_thread, new, "irq/%d-%s", irq,
- new->name);
- if (IS_ERR(t)) {
- ret = PTR_ERR(t);
+ ret = setup_irq_thread(new, irq, false);
+ if (ret)
goto out_mput;
+ if (new->secondary) {
+ ret = setup_irq_thread(new->secondary, irq, true);
+ if (ret)
+ goto out_thread;
}
-
- sched_setscheduler_nocheck(t, SCHED_FIFO, &param);
-
- /*
- * We keep the reference to the task struct even if
- * the thread dies to avoid that the interrupt code
- * references an already freed task_struct.
- */
- get_task_struct(t);
- new->thread = t;
- /*
- * Tell the thread to set its affinity. This is
- * important for shared interrupt handlers as we do
- * not invoke setup_affinity() for the secondary
- * handlers as everything is already set up. Even for
- * interrupts marked with IRQF_NO_BALANCE this is
- * correct as we want the thread to move to the cpu(s)
- * on which the requesting code placed the interrupt.
- */
- set_bit(IRQTF_AFFINITY, &new->thread_flags);
- }
-
- if (!alloc_cpumask_var(&mask, GFP_KERNEL)) {
- ret = -ENOMEM;
- goto out_thread;
}
/*
@@ -1082,7 +1526,36 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
new->flags &= ~IRQF_ONESHOT;
/*
+ * Protects against a concurrent __free_irq() call which might wait
+ * for synchronize_hardirq() to complete without holding the optional
+ * chip bus lock and desc->lock. Also protects against handing out
+ * a recycled oneshot thread_mask bit while it's still in use by
+ * its previous owner.
+ */
+ mutex_lock(&desc->request_mutex);
+
+ /*
+ * Acquire bus lock as the irq_request_resources() callback below
+ * might rely on the serialization or the magic power management
+ * functions which are abusing the irq_bus_lock() callback,
+ */
+ chip_bus_lock(desc);
+
+ /* First installed action requests resources. */
+ if (!desc->action) {
+ ret = irq_request_resources(desc);
+ if (ret) {
+ pr_err("Failed to request resources for %s (irq %d) on irqchip %s\n",
+ new->name, irq, desc->irq_data.chip->name);
+ goto out_bus_unlock;
+ }
+ }
+
+ /*
* The following block of code has to be executed atomically
+ * protected against a concurrent interrupt and any of the other
+ * management calls which are not serialized via
+ * desc->request_mutex or the optional bus lock.
*/
raw_spin_lock_irqsave(&desc->lock, flags);
old_ptr = &desc->action;
@@ -1094,10 +1567,43 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
* fields must have IRQF_SHARED set and the bits which
* set the trigger type must match. Also all must
* agree on ONESHOT.
+ * Interrupt lines used for NMIs cannot be shared.
+ */
+ unsigned int oldtype;
+
+ if (irq_is_nmi(desc) && !per_cpu_devid) {
+ pr_err("Invalid attempt to share NMI for %s (irq %d) on irqchip %s.\n",
+ new->name, irq, desc->irq_data.chip->name);
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ if (per_cpu_devid && !valid_percpu_irqaction(old, new)) {
+ pr_err("Overlapping affinities for %s (irq %d) on irqchip %s.\n",
+ new->name, irq, desc->irq_data.chip->name);
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ /*
+ * If nobody did set the configuration before, inherit
+ * the one provided by the requester.
*/
+ if (irqd_trigger_type_was_set(&desc->irq_data)) {
+ oldtype = irqd_get_trigger_type(&desc->irq_data);
+ } else {
+ oldtype = new->flags & IRQF_TRIGGER_MASK;
+ irqd_set_trigger_type(&desc->irq_data, oldtype);
+ }
+
if (!((old->flags & new->flags) & IRQF_SHARED) ||
- ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) ||
- ((old->flags ^ new->flags) & IRQF_ONESHOT))
+ (oldtype != (new->flags & IRQF_TRIGGER_MASK)))
+ goto mismatch;
+
+ if ((old->flags & IRQF_ONESHOT) &&
+ (new->flags & IRQF_COND_ONESHOT))
+ new->flags |= IRQF_ONESHOT;
+ else if ((old->flags ^ new->flags) & IRQF_ONESHOT)
goto mismatch;
/* All handlers must agree on per-cpuness */
@@ -1131,7 +1637,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
*/
if (thread_mask == ~0UL) {
ret = -EBUSY;
- goto out_mask;
+ goto out_unlock;
}
/*
* The thread_mask for the action is or'ed to
@@ -1153,7 +1659,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
* thread_mask assigned. See the loop above which or's
* all existing action->thread_mask bits.
*/
- new->thread_mask = 1 << ffz(thread_mask);
+ new->thread_mask = 1UL << ffz(thread_mask);
} else if (new->handler == irq_default_primary_handler &&
!(desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE)) {
@@ -1172,31 +1678,37 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
* has. The type flags are unreliable as the
* underlying chip implementation can override them.
*/
- pr_err("Threaded irq requested with handler=NULL and !ONESHOT for irq %d\n",
- irq);
+ pr_err("Threaded irq requested with handler=NULL and !ONESHOT for %s (irq %d)\n",
+ new->name, irq);
ret = -EINVAL;
- goto out_mask;
+ goto out_unlock;
}
if (!shared) {
- ret = irq_request_resources(desc);
- if (ret) {
- pr_err("Failed to request resources for %s (irq %d) on irqchip %s\n",
- new->name, irq, desc->irq_data.chip->name);
- goto out_mask;
- }
-
- init_waitqueue_head(&desc->wait_for_threads);
-
/* Setup the type (level, edge polarity) if configured: */
if (new->flags & IRQF_TRIGGER_MASK) {
- ret = __irq_set_trigger(desc, irq,
- new->flags & IRQF_TRIGGER_MASK);
+ ret = __irq_set_trigger(desc,
+ new->flags & IRQF_TRIGGER_MASK);
if (ret)
- goto out_mask;
+ goto out_unlock;
}
+ /*
+ * Activate the interrupt. That activation must happen
+ * independently of IRQ_NOAUTOEN. request_irq() can fail
+ * and the callers are supposed to handle
+ * that. enable_irq() of an interrupt requested with
+ * IRQ_NOAUTOEN is not supposed to fail. The activation
+ * keeps it in shutdown mode, it merily associates
+ * resources if necessary and if that's not possible it
+ * fails. Interrupts which are in managed shutdown mode
+ * will simply ignore that activation request.
+ */
+ ret = irq_activate(desc);
+ if (ret)
+ goto out_unlock;
+
desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \
IRQS_ONESHOT | IRQS_WAITING);
irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
@@ -1204,37 +1716,47 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
if (new->flags & IRQF_PERCPU) {
irqd_set(&desc->irq_data, IRQD_PER_CPU);
irq_settings_set_per_cpu(desc);
+ if (new->flags & IRQF_NO_DEBUG)
+ irq_settings_set_no_debug(desc);
}
+ if (noirqdebug)
+ irq_settings_set_no_debug(desc);
+
if (new->flags & IRQF_ONESHOT)
desc->istate |= IRQS_ONESHOT;
- if (irq_settings_can_autoenable(desc))
- irq_startup(desc, true);
- else
- /* Undo nested disables: */
- desc->depth = 1;
-
/* Exclude IRQ from balancing if requested */
if (new->flags & IRQF_NOBALANCING) {
irq_settings_set_no_balancing(desc);
irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
}
- /* Set default affinity mask once everything is setup */
- setup_affinity(irq, desc, mask);
+ if (!(new->flags & IRQF_NO_AUTOEN) &&
+ irq_settings_can_autoenable(desc)) {
+ irq_startup(desc, IRQ_RESEND, IRQ_START_COND);
+ } else if (!per_cpu_devid) {
+ /*
+ * Shared interrupts do not go well with disabling
+ * auto enable. The sharing interrupt might request
+ * it while it's still disabled and then wait for
+ * interrupts forever.
+ */
+ WARN_ON_ONCE(new->flags & IRQF_SHARED);
+ /* Undo nested disables: */
+ desc->depth = 1;
+ }
} else if (new->flags & IRQF_TRIGGER_MASK) {
unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK;
- unsigned int omsk = irq_settings_get_trigger_mask(desc);
+ unsigned int omsk = irqd_get_trigger_type(&desc->irq_data);
if (nmsk != omsk)
/* hope the handler works with current trigger mode */
- pr_warning("irq %d uses trigger mode %u; requested %u\n",
- irq, nmsk, omsk);
+ pr_warn("irq %d uses trigger mode %u; requested %u\n",
+ irq, omsk, nmsk);
}
- new->irq = irq;
*old_ptr = new;
irq_pm_install_action(desc, new);
@@ -1249,23 +1771,21 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
*/
if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) {
desc->istate &= ~IRQS_SPURIOUS_DISABLED;
- __enable_irq(desc, irq);
+ __enable_irq(desc);
}
raw_spin_unlock_irqrestore(&desc->lock, flags);
+ chip_bus_sync_unlock(desc);
+ mutex_unlock(&desc->request_mutex);
- /*
- * Strictly no need to wake it up, but hung_task complains
- * when no hard interrupt wakes the thread up.
- */
- if (new->thread)
- wake_up_process(new->thread);
+ irq_setup_timings(desc, new);
+
+ wake_up_and_wait_for_irq_thread_ready(desc, new);
+ wake_up_and_wait_for_irq_thread_ready(desc, new->secondary);
register_irq_proc(irq, desc);
new->dir = NULL;
register_handler_proc(irq, new);
- free_cpumask_var(mask);
-
return 0;
mismatch:
@@ -1278,60 +1798,47 @@ mismatch:
}
ret = -EBUSY;
-out_mask:
+out_unlock:
raw_spin_unlock_irqrestore(&desc->lock, flags);
- free_cpumask_var(mask);
+
+ if (!desc->action)
+ irq_release_resources(desc);
+out_bus_unlock:
+ chip_bus_sync_unlock(desc);
+ mutex_unlock(&desc->request_mutex);
out_thread:
if (new->thread) {
struct task_struct *t = new->thread;
new->thread = NULL;
- kthread_stop(t);
- put_task_struct(t);
+ kthread_stop_put(t);
+ }
+ if (new->secondary && new->secondary->thread) {
+ struct task_struct *t = new->secondary->thread;
+
+ new->secondary->thread = NULL;
+ kthread_stop_put(t);
}
out_mput:
module_put(desc->owner);
return ret;
}
-/**
- * setup_irq - setup an interrupt
- * @irq: Interrupt line to setup
- * @act: irqaction for the interrupt
- *
- * Used to statically setup interrupts in the early boot process.
- */
-int setup_irq(unsigned int irq, struct irqaction *act)
-{
- int retval;
- struct irq_desc *desc = irq_to_desc(irq);
-
- if (WARN_ON(irq_settings_is_per_cpu_devid(desc)))
- return -EINVAL;
- chip_bus_lock(desc);
- retval = __setup_irq(irq, desc, act);
- chip_bus_sync_unlock(desc);
-
- return retval;
-}
-EXPORT_SYMBOL_GPL(setup_irq);
-
/*
* Internal function to unregister an irqaction - used to free
* regular and special interrupts that are part of the architecture.
*/
-static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
+static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id)
{
- struct irq_desc *desc = irq_to_desc(irq);
+ unsigned irq = desc->irq_data.irq;
struct irqaction *action, **action_ptr;
unsigned long flags;
WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq);
- if (!desc)
- return NULL;
-
+ mutex_lock(&desc->request_mutex);
+ chip_bus_lock(desc);
raw_spin_lock_irqsave(&desc->lock, flags);
/*
@@ -1345,7 +1852,8 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
if (!action) {
WARN(1, "Trying to free already-free IRQ %d\n", irq);
raw_spin_unlock_irqrestore(&desc->lock, flags);
-
+ chip_bus_sync_unlock(desc);
+ mutex_unlock(&desc->request_mutex);
return NULL;
}
@@ -1361,8 +1869,9 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
/* If this was the last handler, shut down the IRQ line: */
if (!desc->action) {
+ irq_settings_clr_disable_unlazy(desc);
+ /* Only shutdown. Deactivate after synchronize_hardirq() */
irq_shutdown(desc);
- irq_release_resources(desc);
}
#ifdef CONFIG_SMP
@@ -1372,11 +1881,30 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
#endif
raw_spin_unlock_irqrestore(&desc->lock, flags);
+ /*
+ * Drop bus_lock here so the changes which were done in the chip
+ * callbacks above are synced out to the irq chips which hang
+ * behind a slow bus (I2C, SPI) before calling synchronize_hardirq().
+ *
+ * Aside of that the bus_lock can also be taken from the threaded
+ * handler in irq_finalize_oneshot() which results in a deadlock
+ * because kthread_stop() would wait forever for the thread to
+ * complete, which is blocked on the bus lock.
+ *
+ * The still held desc->request_mutex() protects against a
+ * concurrent request_irq() of this irq so the release of resources
+ * and timing data is properly serialized.
+ */
+ chip_bus_sync_unlock(desc);
unregister_handler_proc(irq, action);
- /* Make sure it's not being used on another CPU: */
- synchronize_irq(irq);
+ /*
+ * Make sure it's not being used on another CPU and if the chip
+ * supports it also make sure that there is no (not yet serviced)
+ * interrupt in flight at the hardware level.
+ */
+ __synchronize_irq(desc);
#ifdef CONFIG_DEBUG_SHIRQ
/*
@@ -1385,7 +1913,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
* is so by doing an extra call to the handler ....
*
* ( We do this after actually deregistering it, to make sure that a
- * 'real' IRQ doesn't run in * parallel with our fake. )
+ * 'real' IRQ doesn't run in parallel with our fake. )
*/
if (action->flags & IRQF_SHARED) {
local_irq_save(flags);
@@ -1394,104 +1922,170 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
}
#endif
+ /*
+ * The action has already been removed above, but the thread writes
+ * its oneshot mask bit when it completes. Though request_mutex is
+ * held across this which prevents __setup_irq() from handing out
+ * the same bit to a newly requested action.
+ */
if (action->thread) {
- kthread_stop(action->thread);
- put_task_struct(action->thread);
+ kthread_stop_put(action->thread);
+ if (action->secondary && action->secondary->thread)
+ kthread_stop_put(action->secondary->thread);
}
+ /* Last action releases resources */
+ if (!desc->action) {
+ /*
+ * Reacquire bus lock as irq_release_resources() might
+ * require it to deallocate resources over the slow bus.
+ */
+ chip_bus_lock(desc);
+ /*
+ * There is no interrupt on the fly anymore. Deactivate it
+ * completely.
+ */
+ scoped_guard(raw_spinlock_irqsave, &desc->lock)
+ irq_domain_deactivate_irq(&desc->irq_data);
+
+ irq_release_resources(desc);
+ chip_bus_sync_unlock(desc);
+ irq_remove_timings(desc);
+ }
+
+ mutex_unlock(&desc->request_mutex);
+
+ irq_chip_pm_put(&desc->irq_data);
module_put(desc->owner);
+ kfree(action->secondary);
return action;
}
/**
- * remove_irq - free an interrupt
- * @irq: Interrupt line to free
- * @act: irqaction for the interrupt
+ * free_irq - free an interrupt allocated with request_irq
+ * @irq: Interrupt line to free
+ * @dev_id: Device identity to free
*
- * Used to remove interrupts statically setup by the early boot process.
- */
-void remove_irq(unsigned int irq, struct irqaction *act)
-{
- struct irq_desc *desc = irq_to_desc(irq);
-
- if (desc && !WARN_ON(irq_settings_is_per_cpu_devid(desc)))
- __free_irq(irq, act->dev_id);
-}
-EXPORT_SYMBOL_GPL(remove_irq);
-
-/**
- * free_irq - free an interrupt allocated with request_irq
- * @irq: Interrupt line to free
- * @dev_id: Device identity to free
- *
- * Remove an interrupt handler. The handler is removed and if the
- * interrupt line is no longer in use by any driver it is disabled.
- * On a shared IRQ the caller must ensure the interrupt is disabled
- * on the card it drives before calling this function. The function
- * does not return until any executing interrupts for this IRQ
- * have completed.
- *
- * This function must not be called from interrupt context.
+ * Remove an interrupt handler. The handler is removed and if the interrupt
+ * line is no longer in use by any driver it is disabled. On a shared IRQ
+ * the caller must ensure the interrupt is disabled on the card it drives
+ * before calling this function. The function does not return until any
+ * executing interrupts for this IRQ have completed.
+ *
+ * This function must not be called from interrupt context.
+ *
+ * Returns the devname argument passed to request_irq.
*/
-void free_irq(unsigned int irq, void *dev_id)
+const void *free_irq(unsigned int irq, void *dev_id)
{
struct irq_desc *desc = irq_to_desc(irq);
+ struct irqaction *action;
+ const char *devname;
if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc)))
- return;
+ return NULL;
#ifdef CONFIG_SMP
if (WARN_ON(desc->affinity_notify))
desc->affinity_notify = NULL;
#endif
- chip_bus_lock(desc);
- kfree(__free_irq(irq, dev_id));
- chip_bus_sync_unlock(desc);
+ action = __free_irq(desc, dev_id);
+
+ if (!action)
+ return NULL;
+
+ devname = action->name;
+ kfree(action);
+ return devname;
}
EXPORT_SYMBOL(free_irq);
+/* This function must be called with desc->lock held */
+static const void *__cleanup_nmi(unsigned int irq, struct irq_desc *desc)
+{
+ const char *devname = NULL;
+
+ desc->istate &= ~IRQS_NMI;
+
+ if (!WARN_ON(desc->action == NULL)) {
+ irq_pm_remove_action(desc, desc->action);
+ devname = desc->action->name;
+ unregister_handler_proc(irq, desc->action);
+
+ kfree(desc->action);
+ desc->action = NULL;
+ }
+
+ irq_settings_clr_disable_unlazy(desc);
+ irq_shutdown_and_deactivate(desc);
+
+ irq_release_resources(desc);
+
+ irq_chip_pm_put(&desc->irq_data);
+ module_put(desc->owner);
+
+ return devname;
+}
+
+const void *free_nmi(unsigned int irq, void *dev_id)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ if (!desc || WARN_ON(!irq_is_nmi(desc)))
+ return NULL;
+
+ if (WARN_ON(irq_settings_is_per_cpu_devid(desc)))
+ return NULL;
+
+ /* NMI still enabled */
+ if (WARN_ON(desc->depth == 0))
+ disable_nmi_nosync(irq);
+
+ guard(raw_spinlock_irqsave)(&desc->lock);
+ irq_nmi_teardown(desc);
+ return __cleanup_nmi(irq, desc);
+}
+
/**
- * request_threaded_irq - allocate an interrupt line
- * @irq: Interrupt line to allocate
- * @handler: Function to be called when the IRQ occurs.
- * Primary handler for threaded interrupts
- * If NULL and thread_fn != NULL the default
- * primary handler is installed
- * @thread_fn: Function called from the irq handler thread
- * If NULL, no irq thread is created
- * @irqflags: Interrupt type flags
- * @devname: An ascii name for the claiming device
- * @dev_id: A cookie passed back to the handler function
- *
- * This call allocates interrupt resources and enables the
- * interrupt line and IRQ handling. From the point this
- * call is made your handler function may be invoked. Since
- * your handler function must clear any interrupt the board
- * raises, you must take care both to initialise your hardware
- * and to set up the interrupt handler in the right order.
- *
- * If you want to set up a threaded irq handler for your device
- * then you need to supply @handler and @thread_fn. @handler is
- * still called in hard interrupt context and has to check
- * whether the interrupt originates from the device. If yes it
- * needs to disable the interrupt on the device and return
- * IRQ_WAKE_THREAD which will wake up the handler thread and run
- * @thread_fn. This split handler design is necessary to support
- * shared interrupts.
- *
- * Dev_id must be globally unique. Normally the address of the
- * device data structure is used as the cookie. Since the handler
- * receives this value it makes sense to use it.
- *
- * If your interrupt is shared you must pass a non NULL dev_id
- * as this is required when freeing the interrupt.
- *
- * Flags:
+ * request_threaded_irq - allocate an interrupt line
+ * @irq: Interrupt line to allocate
+ * @handler: Function to be called when the IRQ occurs.
+ * Primary handler for threaded interrupts.
+ * If handler is NULL and thread_fn != NULL
+ * the default primary handler is installed.
+ * @thread_fn: Function called from the irq handler thread
+ * If NULL, no irq thread is created
+ * @irqflags: Interrupt type flags
+ * @devname: An ascii name for the claiming device
+ * @dev_id: A cookie passed back to the handler function
+ *
+ * This call allocates interrupt resources and enables the interrupt line
+ * and IRQ handling. From the point this call is made your handler function
+ * may be invoked. Since your handler function must clear any interrupt the
+ * board raises, you must take care both to initialise your hardware and to
+ * set up the interrupt handler in the right order.
+ *
+ * If you want to set up a threaded irq handler for your device then you
+ * need to supply @handler and @thread_fn. @handler is still called in hard
+ * interrupt context and has to check whether the interrupt originates from
+ * the device. If yes it needs to disable the interrupt on the device and
+ * return IRQ_WAKE_THREAD which will wake up the handler thread and run
+ * @thread_fn. This split handler design is necessary to support shared
+ * interrupts.
+ *
+ * @dev_id must be globally unique. Normally the address of the device data
+ * structure is used as the cookie. Since the handler receives this value
+ * it makes sense to use it.
+ *
+ * If your interrupt is shared you must pass a non NULL dev_id as this is
+ * required when freeing the interrupt.
+ *
+ * Flags:
*
* IRQF_SHARED Interrupt is shared
* IRQF_TRIGGER_* Specify active edge(s) or level
- *
+ * IRQF_ONESHOT Run thread_fn with interrupt line masked
*/
int request_threaded_irq(unsigned int irq, irq_handler_t handler,
irq_handler_t thread_fn, unsigned long irqflags,
@@ -1501,16 +2095,24 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
struct irq_desc *desc;
int retval;
+ if (irq == IRQ_NOTCONNECTED)
+ return -ENOTCONN;
+
/*
* Sanity-check: shared interrupts must pass in a real dev-ID,
* otherwise we'll have trouble later trying to figure out
* which interrupt is which (messes up the interrupt freeing
* logic etc).
*
+ * Also shared interrupts do not go well with disabling auto enable.
+ * The sharing interrupt might request it while it's still disabled
+ * and then wait for interrupts forever.
+ *
* Also IRQF_COND_SUSPEND only makes sense for shared interrupts and
* it cannot be set along with IRQF_NO_SUSPEND.
*/
if (((irqflags & IRQF_SHARED) && !dev_id) ||
+ ((irqflags & IRQF_SHARED) && (irqflags & IRQF_NO_AUTOEN)) ||
(!(irqflags & IRQF_SHARED) && (irqflags & IRQF_COND_SUSPEND)) ||
((irqflags & IRQF_NO_SUSPEND) && (irqflags & IRQF_COND_SUSPEND)))
return -EINVAL;
@@ -1539,12 +2141,19 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
action->name = devname;
action->dev_id = dev_id;
- chip_bus_lock(desc);
+ retval = irq_chip_pm_get(&desc->irq_data);
+ if (retval < 0) {
+ kfree(action);
+ return retval;
+ }
+
retval = __setup_irq(irq, desc, action);
- chip_bus_sync_unlock(desc);
- if (retval)
+ if (retval) {
+ irq_chip_pm_put(&desc->irq_data);
+ kfree(action->secondary);
kfree(action);
+ }
#ifdef CONFIG_DEBUG_SHIRQ_FIXME
if (!retval && (irqflags & IRQF_SHARED)) {
@@ -1570,28 +2179,31 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
EXPORT_SYMBOL(request_threaded_irq);
/**
- * request_any_context_irq - allocate an interrupt line
- * @irq: Interrupt line to allocate
- * @handler: Function to be called when the IRQ occurs.
- * Threaded handler for threaded interrupts.
- * @flags: Interrupt type flags
- * @name: An ascii name for the claiming device
- * @dev_id: A cookie passed back to the handler function
- *
- * This call allocates interrupt resources and enables the
- * interrupt line and IRQ handling. It selects either a
- * hardirq or threaded handling method depending on the
- * context.
- *
- * On failure, it returns a negative value. On success,
- * it returns either IRQC_IS_HARDIRQ or IRQC_IS_NESTED.
+ * request_any_context_irq - allocate an interrupt line
+ * @irq: Interrupt line to allocate
+ * @handler: Function to be called when the IRQ occurs.
+ * Threaded handler for threaded interrupts.
+ * @flags: Interrupt type flags
+ * @name: An ascii name for the claiming device
+ * @dev_id: A cookie passed back to the handler function
+ *
+ * This call allocates interrupt resources and enables the interrupt line
+ * and IRQ handling. It selects either a hardirq or threaded handling
+ * method depending on the context.
+ *
+ * Returns: On failure, it returns a negative value. On success, it returns either
+ * IRQC_IS_HARDIRQ or IRQC_IS_NESTED.
*/
int request_any_context_irq(unsigned int irq, irq_handler_t handler,
unsigned long flags, const char *name, void *dev_id)
{
- struct irq_desc *desc = irq_to_desc(irq);
+ struct irq_desc *desc;
int ret;
+ if (irq == IRQ_NOTCONNECTED)
+ return -ENOTCONN;
+
+ desc = irq_to_desc(irq);
if (!desc)
return -EINVAL;
@@ -1606,133 +2218,243 @@ int request_any_context_irq(unsigned int irq, irq_handler_t handler,
}
EXPORT_SYMBOL_GPL(request_any_context_irq);
-void enable_percpu_irq(unsigned int irq, unsigned int type)
+/**
+ * request_nmi - allocate an interrupt line for NMI delivery
+ * @irq: Interrupt line to allocate
+ * @handler: Function to be called when the IRQ occurs.
+ * Threaded handler for threaded interrupts.
+ * @irqflags: Interrupt type flags
+ * @name: An ascii name for the claiming device
+ * @dev_id: A cookie passed back to the handler function
+ *
+ * This call allocates interrupt resources and enables the interrupt line
+ * and IRQ handling. It sets up the IRQ line to be handled as an NMI.
+ *
+ * An interrupt line delivering NMIs cannot be shared and IRQ handling
+ * cannot be threaded.
+ *
+ * Interrupt lines requested for NMI delivering must produce per cpu
+ * interrupts and have auto enabling setting disabled.
+ *
+ * @dev_id must be globally unique. Normally the address of the device data
+ * structure is used as the cookie. Since the handler receives this value
+ * it makes sense to use it.
+ *
+ * If the interrupt line cannot be used to deliver NMIs, function will fail
+ * and return a negative value.
+ */
+int request_nmi(unsigned int irq, irq_handler_t handler,
+ unsigned long irqflags, const char *name, void *dev_id)
{
- unsigned int cpu = smp_processor_id();
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU);
+ struct irqaction *action;
+ struct irq_desc *desc;
+ int retval;
- if (!desc)
- return;
+ if (irq == IRQ_NOTCONNECTED)
+ return -ENOTCONN;
- type &= IRQ_TYPE_SENSE_MASK;
- if (type != IRQ_TYPE_NONE) {
- int ret;
+ /* NMI cannot be shared, used for Polling */
+ if (irqflags & (IRQF_SHARED | IRQF_COND_SUSPEND | IRQF_IRQPOLL))
+ return -EINVAL;
- ret = __irq_set_trigger(desc, irq, type);
+ if (!(irqflags & IRQF_PERCPU))
+ return -EINVAL;
- if (ret) {
- WARN(1, "failed to set type for IRQ%d\n", irq);
- goto out;
+ if (!handler)
+ return -EINVAL;
+
+ desc = irq_to_desc(irq);
+
+ if (!desc || (irq_settings_can_autoenable(desc) &&
+ !(irqflags & IRQF_NO_AUTOEN)) ||
+ !irq_settings_can_request(desc) ||
+ WARN_ON(irq_settings_is_per_cpu_devid(desc)) ||
+ !irq_supports_nmi(desc))
+ return -EINVAL;
+
+ action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
+ if (!action)
+ return -ENOMEM;
+
+ action->handler = handler;
+ action->flags = irqflags | IRQF_NO_THREAD | IRQF_NOBALANCING;
+ action->name = name;
+ action->dev_id = dev_id;
+
+ retval = irq_chip_pm_get(&desc->irq_data);
+ if (retval < 0)
+ goto err_out;
+
+ retval = __setup_irq(irq, desc, action);
+ if (retval)
+ goto err_irq_setup;
+
+ scoped_guard(raw_spinlock_irqsave, &desc->lock) {
+ /* Setup NMI state */
+ desc->istate |= IRQS_NMI;
+ retval = irq_nmi_setup(desc);
+ if (retval) {
+ __cleanup_nmi(irq, desc);
+ return -EINVAL;
}
+ return 0;
}
- irq_percpu_enable(desc, cpu);
-out:
- irq_put_desc_unlock(desc, flags);
+err_irq_setup:
+ irq_chip_pm_put(&desc->irq_data);
+err_out:
+ kfree(action);
+
+ return retval;
+}
+
+void enable_percpu_irq(unsigned int irq, unsigned int type)
+{
+ scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_PERCPU) {
+ struct irq_desc *desc = scoped_irqdesc;
+
+ /*
+ * If the trigger type is not specified by the caller, then
+ * use the default for this interrupt.
+ */
+ type &= IRQ_TYPE_SENSE_MASK;
+ if (type == IRQ_TYPE_NONE)
+ type = irqd_get_trigger_type(&desc->irq_data);
+
+ if (type != IRQ_TYPE_NONE) {
+ if (__irq_set_trigger(desc, type)) {
+ WARN(1, "failed to set type for IRQ%d\n", irq);
+ return;
+ }
+ }
+ irq_percpu_enable(desc, smp_processor_id());
+ }
}
EXPORT_SYMBOL_GPL(enable_percpu_irq);
-void disable_percpu_irq(unsigned int irq)
+void enable_percpu_nmi(unsigned int irq, unsigned int type)
{
- unsigned int cpu = smp_processor_id();
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU);
+ enable_percpu_irq(irq, type);
+}
- if (!desc)
- return;
+/**
+ * irq_percpu_is_enabled - Check whether the per cpu irq is enabled
+ * @irq: Linux irq number to check for
+ *
+ * Must be called from a non migratable context. Returns the enable
+ * state of a per cpu interrupt on the current cpu.
+ */
+bool irq_percpu_is_enabled(unsigned int irq)
+{
+ scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_PERCPU)
+ return cpumask_test_cpu(smp_processor_id(), scoped_irqdesc->percpu_enabled);
+ return false;
+}
+EXPORT_SYMBOL_GPL(irq_percpu_is_enabled);
- irq_percpu_disable(desc, cpu);
- irq_put_desc_unlock(desc, flags);
+void disable_percpu_irq(unsigned int irq)
+{
+ scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_PERCPU)
+ irq_percpu_disable(scoped_irqdesc, smp_processor_id());
}
EXPORT_SYMBOL_GPL(disable_percpu_irq);
+void disable_percpu_nmi(unsigned int irq)
+{
+ disable_percpu_irq(irq);
+}
+
/*
* Internal function to unregister a percpu irqaction.
*/
static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_id)
{
struct irq_desc *desc = irq_to_desc(irq);
- struct irqaction *action;
- unsigned long flags;
+ struct irqaction *action, **action_ptr;
WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq);
if (!desc)
return NULL;
- raw_spin_lock_irqsave(&desc->lock, flags);
+ scoped_guard(raw_spinlock_irqsave, &desc->lock) {
+ action_ptr = &desc->action;
+ for (;;) {
+ action = *action_ptr;
- action = desc->action;
- if (!action || action->percpu_dev_id != dev_id) {
- WARN(1, "Trying to free already-free IRQ %d\n", irq);
- goto bad;
- }
+ if (!action) {
+ WARN(1, "Trying to free already-free IRQ %d\n", irq);
+ return NULL;
+ }
- if (!cpumask_empty(desc->percpu_enabled)) {
- WARN(1, "percpu IRQ %d still enabled on CPU%d!\n",
- irq, cpumask_first(desc->percpu_enabled));
- goto bad;
- }
+ if (action->percpu_dev_id == dev_id)
+ break;
- /* Found it - now remove it from the list of entries: */
- desc->action = NULL;
+ action_ptr = &action->next;
+ }
- raw_spin_unlock_irqrestore(&desc->lock, flags);
+ if (cpumask_intersects(desc->percpu_enabled, action->affinity)) {
+ WARN(1, "percpu IRQ %d still enabled on CPU%d!\n", irq,
+ cpumask_first_and(desc->percpu_enabled, action->affinity));
+ return NULL;
+ }
- unregister_handler_proc(irq, action);
+ /* Found it - now remove it from the list of entries: */
+ *action_ptr = action->next;
+
+ /* Demote from NMI if we killed the last action */
+ if (!desc->action)
+ desc->istate &= ~IRQS_NMI;
+ }
+ unregister_handler_proc(irq, action);
+ irq_chip_pm_put(&desc->irq_data);
module_put(desc->owner);
return action;
-
-bad:
- raw_spin_unlock_irqrestore(&desc->lock, flags);
- return NULL;
}
/**
- * remove_percpu_irq - free a per-cpu interrupt
- * @irq: Interrupt line to free
- * @act: irqaction for the interrupt
+ * free_percpu_irq - free an interrupt allocated with request_percpu_irq
+ * @irq: Interrupt line to free
+ * @dev_id: Device identity to free
+ *
+ * Remove a percpu interrupt handler. The handler is removed, but the
+ * interrupt line is not disabled. This must be done on each CPU before
+ * calling this function. The function does not return until any executing
+ * interrupts for this IRQ have completed.
*
- * Used to remove interrupts statically setup by the early boot process.
+ * This function must not be called from interrupt context.
*/
-void remove_percpu_irq(unsigned int irq, struct irqaction *act)
+void free_percpu_irq(unsigned int irq, void __percpu *dev_id)
{
struct irq_desc *desc = irq_to_desc(irq);
- if (desc && irq_settings_is_per_cpu_devid(desc))
- __free_percpu_irq(irq, act->percpu_dev_id);
+ if (!desc || !irq_settings_is_per_cpu_devid(desc))
+ return;
+
+ chip_bus_lock(desc);
+ kfree(__free_percpu_irq(irq, dev_id));
+ chip_bus_sync_unlock(desc);
}
+EXPORT_SYMBOL_GPL(free_percpu_irq);
-/**
- * free_percpu_irq - free an interrupt allocated with request_percpu_irq
- * @irq: Interrupt line to free
- * @dev_id: Device identity to free
- *
- * Remove a percpu interrupt handler. The handler is removed, but
- * the interrupt line is not disabled. This must be done on each
- * CPU before calling this function. The function does not return
- * until any executing interrupts for this IRQ have completed.
- *
- * This function must not be called from interrupt context.
- */
-void free_percpu_irq(unsigned int irq, void __percpu *dev_id)
+void free_percpu_nmi(unsigned int irq, void __percpu *dev_id)
{
struct irq_desc *desc = irq_to_desc(irq);
if (!desc || !irq_settings_is_per_cpu_devid(desc))
return;
- chip_bus_lock(desc);
+ if (WARN_ON(!irq_is_nmi(desc)))
+ return;
+
kfree(__free_percpu_irq(irq, dev_id));
- chip_bus_sync_unlock(desc);
}
/**
- * setup_percpu_irq - setup a per-cpu interrupt
- * @irq: Interrupt line to setup
- * @act: irqaction for the interrupt
+ * setup_percpu_irq - setup a per-cpu interrupt
+ * @irq: Interrupt line to setup
+ * @act: irqaction for the interrupt
*
* Used to statically setup per-cpu interrupts in the early boot process.
*/
@@ -1743,30 +2465,70 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act)
if (!desc || !irq_settings_is_per_cpu_devid(desc))
return -EINVAL;
- chip_bus_lock(desc);
+
+ retval = irq_chip_pm_get(&desc->irq_data);
+ if (retval < 0)
+ return retval;
+
retval = __setup_irq(irq, desc, act);
- chip_bus_sync_unlock(desc);
+
+ if (retval)
+ irq_chip_pm_put(&desc->irq_data);
return retval;
}
+static
+struct irqaction *create_percpu_irqaction(irq_handler_t handler, unsigned long flags,
+ const char *devname, const cpumask_t *affinity,
+ void __percpu *dev_id)
+{
+ struct irqaction *action;
+
+ if (!affinity)
+ affinity = cpu_possible_mask;
+
+ action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
+ if (!action)
+ return NULL;
+
+ action->handler = handler;
+ action->flags = flags | IRQF_PERCPU | IRQF_NO_SUSPEND;
+ action->name = devname;
+ action->percpu_dev_id = dev_id;
+ action->affinity = affinity;
+
+ /*
+ * We allow some form of sharing for non-overlapping affinity
+ * masks. Obviously, covering all CPUs prevents any sharing in
+ * the first place.
+ */
+ if (!cpumask_equal(affinity, cpu_possible_mask))
+ action->flags |= IRQF_SHARED;
+
+ return action;
+}
+
/**
- * request_percpu_irq - allocate a percpu interrupt line
- * @irq: Interrupt line to allocate
- * @handler: Function to be called when the IRQ occurs.
- * @devname: An ascii name for the claiming device
- * @dev_id: A percpu cookie passed back to the handler function
- *
- * This call allocates interrupt resources, but doesn't
- * automatically enable the interrupt. It has to be done on each
- * CPU using enable_percpu_irq().
- *
- * Dev_id must be globally unique. It is a per-cpu variable, and
- * the handler gets called with the interrupted CPU's instance of
- * that variable.
+ * __request_percpu_irq - allocate a percpu interrupt line
+ * @irq: Interrupt line to allocate
+ * @handler: Function to be called when the IRQ occurs.
+ * @flags: Interrupt type flags (IRQF_TIMER only)
+ * @devname: An ascii name for the claiming device
+ * @affinity: A cpumask describing the target CPUs for this interrupt
+ * @dev_id: A percpu cookie passed back to the handler function
+ *
+ * This call allocates interrupt resources, but doesn't enable the interrupt
+ * on any CPU, as all percpu-devid interrupts are flagged with IRQ_NOAUTOEN.
+ * It has to be done on each CPU using enable_percpu_irq().
+ *
+ * @dev_id must be globally unique. It is a per-cpu variable, and
+ * the handler gets called with the interrupted CPU's instance of
+ * that variable.
*/
-int request_percpu_irq(unsigned int irq, irq_handler_t handler,
- const char *devname, void __percpu *dev_id)
+int __request_percpu_irq(unsigned int irq, irq_handler_t handler,
+ unsigned long flags, const char *devname,
+ const cpumask_t *affinity, void __percpu *dev_id)
{
struct irqaction *action;
struct irq_desc *desc;
@@ -1780,55 +2542,161 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler,
!irq_settings_is_per_cpu_devid(desc))
return -EINVAL;
- action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
+ if (flags && flags != IRQF_TIMER)
+ return -EINVAL;
+
+ action = create_percpu_irqaction(handler, flags, devname, affinity, dev_id);
if (!action)
return -ENOMEM;
- action->handler = handler;
- action->flags = IRQF_PERCPU | IRQF_NO_SUSPEND;
- action->name = devname;
- action->percpu_dev_id = dev_id;
+ retval = irq_chip_pm_get(&desc->irq_data);
+ if (retval < 0) {
+ kfree(action);
+ return retval;
+ }
- chip_bus_lock(desc);
retval = __setup_irq(irq, desc, action);
- chip_bus_sync_unlock(desc);
- if (retval)
+ if (retval) {
+ irq_chip_pm_put(&desc->irq_data);
kfree(action);
+ }
return retval;
}
+EXPORT_SYMBOL_GPL(__request_percpu_irq);
/**
- * irq_get_irqchip_state - returns the irqchip state of a interrupt.
- * @irq: Interrupt line that is forwarded to a VM
- * @which: One of IRQCHIP_STATE_* the caller wants to know about
- * @state: a pointer to a boolean where the state is to be storeed
+ * request_percpu_nmi - allocate a percpu interrupt line for NMI delivery
+ * @irq: Interrupt line to allocate
+ * @handler: Function to be called when the IRQ occurs.
+ * @name: An ascii name for the claiming device
+ * @affinity: A cpumask describing the target CPUs for this interrupt
+ * @dev_id: A percpu cookie passed back to the handler function
+ *
+ * This call allocates interrupt resources for a per CPU NMI. Per CPU NMIs
+ * have to be setup on each CPU by calling prepare_percpu_nmi() before
+ * being enabled on the same CPU by using enable_percpu_nmi().
*
- * This call snapshots the internal irqchip state of an
- * interrupt, returning into @state the bit corresponding to
- * stage @which
+ * @dev_id must be globally unique. It is a per-cpu variable, and the
+ * handler gets called with the interrupted CPU's instance of that
+ * variable.
*
- * This function should be called with preemption disabled if the
- * interrupt controller has per-cpu registers.
+ * Interrupt lines requested for NMI delivering should have auto enabling
+ * setting disabled.
+ *
+ * If the interrupt line cannot be used to deliver NMIs, function
+ * will fail returning a negative value.
*/
-int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
- bool *state)
+int request_percpu_nmi(unsigned int irq, irq_handler_t handler, const char *name,
+ const struct cpumask *affinity, void __percpu *dev_id)
{
+ struct irqaction *action;
struct irq_desc *desc;
- struct irq_data *data;
- struct irq_chip *chip;
- unsigned long flags;
- int err = -EINVAL;
+ int retval;
- desc = irq_get_desc_buslock(irq, &flags, 0);
- if (!desc)
- return err;
+ if (!handler)
+ return -EINVAL;
+
+ desc = irq_to_desc(irq);
- data = irq_desc_get_irq_data(desc);
+ if (!desc || !irq_settings_can_request(desc) ||
+ !irq_settings_is_per_cpu_devid(desc) ||
+ irq_settings_can_autoenable(desc) ||
+ !irq_supports_nmi(desc))
+ return -EINVAL;
+
+ /* The line cannot be NMI already if the new request covers all CPUs */
+ if (irq_is_nmi(desc) &&
+ (!affinity || cpumask_equal(affinity, cpu_possible_mask)))
+ return -EINVAL;
+
+ action = create_percpu_irqaction(handler, IRQF_NO_THREAD | IRQF_NOBALANCING,
+ name, affinity, dev_id);
+ if (!action)
+ return -ENOMEM;
+
+ retval = irq_chip_pm_get(&desc->irq_data);
+ if (retval < 0)
+ goto err_out;
+
+ retval = __setup_irq(irq, desc, action);
+ if (retval)
+ goto err_irq_setup;
+
+ scoped_guard(raw_spinlock_irqsave, &desc->lock)
+ desc->istate |= IRQS_NMI;
+ return 0;
+
+err_irq_setup:
+ irq_chip_pm_put(&desc->irq_data);
+err_out:
+ kfree(action);
+
+ return retval;
+}
+
+/**
+ * prepare_percpu_nmi - performs CPU local setup for NMI delivery
+ * @irq: Interrupt line to prepare for NMI delivery
+ *
+ * This call prepares an interrupt line to deliver NMI on the current CPU,
+ * before that interrupt line gets enabled with enable_percpu_nmi().
+ *
+ * As a CPU local operation, this should be called from non-preemptible
+ * context.
+ *
+ * If the interrupt line cannot be used to deliver NMIs, function will fail
+ * returning a negative value.
+ */
+int prepare_percpu_nmi(unsigned int irq)
+{
+ int ret = -EINVAL;
+
+ WARN_ON(preemptible());
+
+ scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_PERCPU) {
+ if (WARN(!irq_is_nmi(scoped_irqdesc),
+ "prepare_percpu_nmi called for a non-NMI interrupt: irq %u\n", irq))
+ return -EINVAL;
+
+ ret = irq_nmi_setup(scoped_irqdesc);
+ if (ret)
+ pr_err("Failed to setup NMI delivery: irq %u\n", irq);
+ }
+ return ret;
+}
+
+/**
+ * teardown_percpu_nmi - undoes NMI setup of IRQ line
+ * @irq: Interrupt line from which CPU local NMI configuration should be removed
+ *
+ * This call undoes the setup done by prepare_percpu_nmi().
+ *
+ * IRQ line should not be enabled for the current CPU.
+ * As a CPU local operation, this should be called from non-preemptible
+ * context.
+ */
+void teardown_percpu_nmi(unsigned int irq)
+{
+ WARN_ON(preemptible());
+
+ scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_PERCPU) {
+ if (WARN_ON(!irq_is_nmi(scoped_irqdesc)))
+ return;
+ irq_nmi_teardown(scoped_irqdesc);
+ }
+}
+
+static int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which, bool *state)
+{
+ struct irq_chip *chip;
+ int err = -EINVAL;
do {
chip = irq_data_get_irq_chip(data);
+ if (WARN_ON_ONCE(!chip))
+ return -ENODEV;
if (chip->irq_get_irqchip_state)
break;
#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
@@ -1840,52 +2708,103 @@ int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
if (data)
err = chip->irq_get_irqchip_state(data, which, state);
-
- irq_put_desc_busunlock(desc, flags);
return err;
}
/**
- * irq_set_irqchip_state - set the state of a forwarded interrupt.
- * @irq: Interrupt line that is forwarded to a VM
- * @which: State to be restored (one of IRQCHIP_STATE_*)
- * @val: Value corresponding to @which
+ * irq_get_irqchip_state - returns the irqchip state of a interrupt.
+ * @irq: Interrupt line that is forwarded to a VM
+ * @which: One of IRQCHIP_STATE_* the caller wants to know about
+ * @state: a pointer to a boolean where the state is to be stored
*
- * This call sets the internal irqchip state of an interrupt,
- * depending on the value of @which.
+ * This call snapshots the internal irqchip state of an interrupt,
+ * returning into @state the bit corresponding to stage @which
*
- * This function should be called with preemption disabled if the
- * interrupt controller has per-cpu registers.
+ * This function should be called with preemption disabled if the interrupt
+ * controller has per-cpu registers.
*/
-int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
- bool val)
+int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which, bool *state)
{
- struct irq_desc *desc;
- struct irq_data *data;
- struct irq_chip *chip;
- unsigned long flags;
- int err = -EINVAL;
+ scoped_irqdesc_get_and_buslock(irq, 0) {
+ struct irq_data *data = irq_desc_get_irq_data(scoped_irqdesc);
- desc = irq_get_desc_buslock(irq, &flags, 0);
- if (!desc)
- return err;
+ return __irq_get_irqchip_state(data, which, state);
+ }
+ return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(irq_get_irqchip_state);
- data = irq_desc_get_irq_data(desc);
+/**
+ * irq_set_irqchip_state - set the state of a forwarded interrupt.
+ * @irq: Interrupt line that is forwarded to a VM
+ * @which: State to be restored (one of IRQCHIP_STATE_*)
+ * @val: Value corresponding to @which
+ *
+ * This call sets the internal irqchip state of an interrupt, depending on
+ * the value of @which.
+ *
+ * This function should be called with migration disabled if the interrupt
+ * controller has per-cpu registers.
+ */
+int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, bool val)
+{
+ scoped_irqdesc_get_and_buslock(irq, 0) {
+ struct irq_data *data = irq_desc_get_irq_data(scoped_irqdesc);
+ struct irq_chip *chip;
- do {
- chip = irq_data_get_irq_chip(data);
- if (chip->irq_set_irqchip_state)
- break;
-#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
- data = data->parent_data;
-#else
- data = NULL;
-#endif
- } while (data);
+ do {
+ chip = irq_data_get_irq_chip(data);
- if (data)
- err = chip->irq_set_irqchip_state(data, which, val);
+ if (WARN_ON_ONCE(!chip))
+ return -ENODEV;
- irq_put_desc_busunlock(desc, flags);
- return err;
+ if (chip->irq_set_irqchip_state)
+ break;
+
+ data = irqd_get_parent_data(data);
+ } while (data);
+
+ if (data)
+ return chip->irq_set_irqchip_state(data, which, val);
+ }
+ return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(irq_set_irqchip_state);
+
+/**
+ * irq_has_action - Check whether an interrupt is requested
+ * @irq: The linux irq number
+ *
+ * Returns: A snapshot of the current state
+ */
+bool irq_has_action(unsigned int irq)
+{
+ bool res;
+
+ rcu_read_lock();
+ res = irq_desc_has_action(irq_to_desc(irq));
+ rcu_read_unlock();
+ return res;
+}
+EXPORT_SYMBOL_GPL(irq_has_action);
+
+/**
+ * irq_check_status_bit - Check whether bits in the irq descriptor status are set
+ * @irq: The linux irq number
+ * @bitmask: The bitmask to evaluate
+ *
+ * Returns: True if one of the bits in @bitmask is set
+ */
+bool irq_check_status_bit(unsigned int irq, unsigned int bitmask)
+{
+ struct irq_desc *desc;
+ bool res = false;
+
+ rcu_read_lock();
+ desc = irq_to_desc(irq);
+ if (desc)
+ res = !!(desc->status_use_accessors & bitmask);
+ rcu_read_unlock();
+ return res;
}
+EXPORT_SYMBOL_GPL(irq_check_status_bit);
diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c
new file mode 100644
index 000000000000..8f222d1cccec
--- /dev/null
+++ b/kernel/irq/matrix.c
@@ -0,0 +1,519 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2017 Thomas Gleixner <tglx@linutronix.de>
+
+#include <linux/spinlock.h>
+#include <linux/seq_file.h>
+#include <linux/bitmap.h>
+#include <linux/percpu.h>
+#include <linux/cpu.h>
+#include <linux/irq.h>
+
+struct cpumap {
+ unsigned int available;
+ unsigned int allocated;
+ unsigned int managed;
+ unsigned int managed_allocated;
+ bool initialized;
+ bool online;
+ unsigned long *managed_map;
+ unsigned long alloc_map[];
+};
+
+struct irq_matrix {
+ unsigned int matrix_bits;
+ unsigned int alloc_start;
+ unsigned int alloc_end;
+ unsigned int alloc_size;
+ unsigned int global_available;
+ unsigned int global_reserved;
+ unsigned int systembits_inalloc;
+ unsigned int total_allocated;
+ unsigned int online_maps;
+ struct cpumap __percpu *maps;
+ unsigned long *system_map;
+ unsigned long scratch_map[];
+};
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/irq_matrix.h>
+
+/**
+ * irq_alloc_matrix - Allocate a irq_matrix structure and initialize it
+ * @matrix_bits: Number of matrix bits must be <= IRQ_MATRIX_BITS
+ * @alloc_start: From which bit the allocation search starts
+ * @alloc_end: At which bit the allocation search ends, i.e first
+ * invalid bit
+ */
+__init struct irq_matrix *irq_alloc_matrix(unsigned int matrix_bits,
+ unsigned int alloc_start,
+ unsigned int alloc_end)
+{
+ unsigned int cpu, matrix_size = BITS_TO_LONGS(matrix_bits);
+ struct irq_matrix *m;
+
+ m = kzalloc(struct_size(m, scratch_map, matrix_size * 2), GFP_KERNEL);
+ if (!m)
+ return NULL;
+
+ m->system_map = &m->scratch_map[matrix_size];
+
+ m->matrix_bits = matrix_bits;
+ m->alloc_start = alloc_start;
+ m->alloc_end = alloc_end;
+ m->alloc_size = alloc_end - alloc_start;
+ m->maps = __alloc_percpu(struct_size(m->maps, alloc_map, matrix_size * 2),
+ __alignof__(*m->maps));
+ if (!m->maps) {
+ kfree(m);
+ return NULL;
+ }
+
+ for_each_possible_cpu(cpu) {
+ struct cpumap *cm = per_cpu_ptr(m->maps, cpu);
+
+ cm->managed_map = &cm->alloc_map[matrix_size];
+ }
+
+ return m;
+}
+
+/**
+ * irq_matrix_online - Bring the local CPU matrix online
+ * @m: Matrix pointer
+ */
+void irq_matrix_online(struct irq_matrix *m)
+{
+ struct cpumap *cm = this_cpu_ptr(m->maps);
+
+ BUG_ON(cm->online);
+
+ if (!cm->initialized) {
+ cm->available = m->alloc_size;
+ cm->available -= cm->managed + m->systembits_inalloc;
+ cm->initialized = true;
+ }
+ m->global_available += cm->available;
+ cm->online = true;
+ m->online_maps++;
+ trace_irq_matrix_online(m);
+}
+
+/**
+ * irq_matrix_offline - Bring the local CPU matrix offline
+ * @m: Matrix pointer
+ */
+void irq_matrix_offline(struct irq_matrix *m)
+{
+ struct cpumap *cm = this_cpu_ptr(m->maps);
+
+ /* Update the global available size */
+ m->global_available -= cm->available;
+ cm->online = false;
+ m->online_maps--;
+ trace_irq_matrix_offline(m);
+}
+
+static unsigned int matrix_alloc_area(struct irq_matrix *m, struct cpumap *cm,
+ unsigned int num, bool managed)
+{
+ unsigned int area, start = m->alloc_start;
+ unsigned int end = m->alloc_end;
+
+ bitmap_or(m->scratch_map, cm->managed_map, m->system_map, end);
+ bitmap_or(m->scratch_map, m->scratch_map, cm->alloc_map, end);
+ area = bitmap_find_next_zero_area(m->scratch_map, end, start, num, 0);
+ if (area >= end)
+ return area;
+ if (managed)
+ bitmap_set(cm->managed_map, area, num);
+ else
+ bitmap_set(cm->alloc_map, area, num);
+ return area;
+}
+
+/* Find the best CPU which has the lowest vector allocation count */
+static unsigned int matrix_find_best_cpu(struct irq_matrix *m,
+ const struct cpumask *msk)
+{
+ unsigned int cpu, best_cpu, maxavl = 0;
+ struct cpumap *cm;
+
+ best_cpu = UINT_MAX;
+
+ for_each_cpu(cpu, msk) {
+ cm = per_cpu_ptr(m->maps, cpu);
+
+ if (!cm->online || cm->available <= maxavl)
+ continue;
+
+ best_cpu = cpu;
+ maxavl = cm->available;
+ }
+ return best_cpu;
+}
+
+/* Find the best CPU which has the lowest number of managed IRQs allocated */
+static unsigned int matrix_find_best_cpu_managed(struct irq_matrix *m,
+ const struct cpumask *msk)
+{
+ unsigned int cpu, best_cpu, allocated = UINT_MAX;
+ struct cpumap *cm;
+
+ best_cpu = UINT_MAX;
+
+ for_each_cpu(cpu, msk) {
+ cm = per_cpu_ptr(m->maps, cpu);
+
+ if (!cm->online || cm->managed_allocated > allocated)
+ continue;
+
+ best_cpu = cpu;
+ allocated = cm->managed_allocated;
+ }
+ return best_cpu;
+}
+
+/**
+ * irq_matrix_assign_system - Assign system wide entry in the matrix
+ * @m: Matrix pointer
+ * @bit: Which bit to reserve
+ * @replace: Replace an already allocated vector with a system
+ * vector at the same bit position.
+ *
+ * The BUG_ON()s below are on purpose. If this goes wrong in the
+ * early boot process, then the chance to survive is about zero.
+ * If this happens when the system is life, it's not much better.
+ */
+void irq_matrix_assign_system(struct irq_matrix *m, unsigned int bit,
+ bool replace)
+{
+ struct cpumap *cm = this_cpu_ptr(m->maps);
+
+ BUG_ON(bit > m->matrix_bits);
+ BUG_ON(m->online_maps > 1 || (m->online_maps && !replace));
+
+ set_bit(bit, m->system_map);
+ if (replace) {
+ BUG_ON(!test_and_clear_bit(bit, cm->alloc_map));
+ cm->allocated--;
+ m->total_allocated--;
+ }
+ if (bit >= m->alloc_start && bit < m->alloc_end)
+ m->systembits_inalloc++;
+
+ trace_irq_matrix_assign_system(bit, m);
+}
+
+/**
+ * irq_matrix_reserve_managed - Reserve a managed interrupt in a CPU map
+ * @m: Matrix pointer
+ * @msk: On which CPUs the bits should be reserved.
+ *
+ * Can be called for offline CPUs. Note, this will only reserve one bit
+ * on all CPUs in @msk, but it's not guaranteed that the bits are at the
+ * same offset on all CPUs
+ */
+int irq_matrix_reserve_managed(struct irq_matrix *m, const struct cpumask *msk)
+{
+ unsigned int cpu, failed_cpu;
+
+ for_each_cpu(cpu, msk) {
+ struct cpumap *cm = per_cpu_ptr(m->maps, cpu);
+ unsigned int bit;
+
+ bit = matrix_alloc_area(m, cm, 1, true);
+ if (bit >= m->alloc_end)
+ goto cleanup;
+ cm->managed++;
+ if (cm->online) {
+ cm->available--;
+ m->global_available--;
+ }
+ trace_irq_matrix_reserve_managed(bit, cpu, m, cm);
+ }
+ return 0;
+cleanup:
+ failed_cpu = cpu;
+ for_each_cpu(cpu, msk) {
+ if (cpu == failed_cpu)
+ break;
+ irq_matrix_remove_managed(m, cpumask_of(cpu));
+ }
+ return -ENOSPC;
+}
+
+/**
+ * irq_matrix_remove_managed - Remove managed interrupts in a CPU map
+ * @m: Matrix pointer
+ * @msk: On which CPUs the bits should be removed
+ *
+ * Can be called for offline CPUs
+ *
+ * This removes not allocated managed interrupts from the map. It does
+ * not matter which one because the managed interrupts free their
+ * allocation when they shut down. If not, the accounting is screwed,
+ * but all what can be done at this point is warn about it.
+ */
+void irq_matrix_remove_managed(struct irq_matrix *m, const struct cpumask *msk)
+{
+ unsigned int cpu;
+
+ for_each_cpu(cpu, msk) {
+ struct cpumap *cm = per_cpu_ptr(m->maps, cpu);
+ unsigned int bit, end = m->alloc_end;
+
+ if (WARN_ON_ONCE(!cm->managed))
+ continue;
+
+ /* Get managed bit which are not allocated */
+ bitmap_andnot(m->scratch_map, cm->managed_map, cm->alloc_map, end);
+
+ bit = find_first_bit(m->scratch_map, end);
+ if (WARN_ON_ONCE(bit >= end))
+ continue;
+
+ clear_bit(bit, cm->managed_map);
+
+ cm->managed--;
+ if (cm->online) {
+ cm->available++;
+ m->global_available++;
+ }
+ trace_irq_matrix_remove_managed(bit, cpu, m, cm);
+ }
+}
+
+/**
+ * irq_matrix_alloc_managed - Allocate a managed interrupt in a CPU map
+ * @m: Matrix pointer
+ * @msk: Which CPUs to search in
+ * @mapped_cpu: Pointer to store the CPU for which the irq was allocated
+ */
+int irq_matrix_alloc_managed(struct irq_matrix *m, const struct cpumask *msk,
+ unsigned int *mapped_cpu)
+{
+ unsigned int bit, cpu, end;
+ struct cpumap *cm;
+
+ if (cpumask_empty(msk))
+ return -EINVAL;
+
+ cpu = matrix_find_best_cpu_managed(m, msk);
+ if (cpu == UINT_MAX)
+ return -ENOSPC;
+
+ cm = per_cpu_ptr(m->maps, cpu);
+ end = m->alloc_end;
+ /* Get managed bit which are not allocated */
+ bitmap_andnot(m->scratch_map, cm->managed_map, cm->alloc_map, end);
+ bit = find_first_bit(m->scratch_map, end);
+ if (bit >= end)
+ return -ENOSPC;
+ set_bit(bit, cm->alloc_map);
+ cm->allocated++;
+ cm->managed_allocated++;
+ m->total_allocated++;
+ *mapped_cpu = cpu;
+ trace_irq_matrix_alloc_managed(bit, cpu, m, cm);
+ return bit;
+}
+
+/**
+ * irq_matrix_assign - Assign a preallocated interrupt in the local CPU map
+ * @m: Matrix pointer
+ * @bit: Which bit to mark
+ *
+ * This should only be used to mark preallocated vectors
+ */
+void irq_matrix_assign(struct irq_matrix *m, unsigned int bit)
+{
+ struct cpumap *cm = this_cpu_ptr(m->maps);
+
+ if (WARN_ON_ONCE(bit < m->alloc_start || bit >= m->alloc_end))
+ return;
+ if (WARN_ON_ONCE(test_and_set_bit(bit, cm->alloc_map)))
+ return;
+ cm->allocated++;
+ m->total_allocated++;
+ cm->available--;
+ m->global_available--;
+ trace_irq_matrix_assign(bit, smp_processor_id(), m, cm);
+}
+
+/**
+ * irq_matrix_reserve - Reserve interrupts
+ * @m: Matrix pointer
+ *
+ * This is merely a book keeping call. It increments the number of globally
+ * reserved interrupt bits w/o actually allocating them. This allows to
+ * setup interrupt descriptors w/o assigning low level resources to it.
+ * The actual allocation happens when the interrupt gets activated.
+ */
+void irq_matrix_reserve(struct irq_matrix *m)
+{
+ if (m->global_reserved == m->global_available)
+ pr_warn("Interrupt reservation exceeds available resources\n");
+
+ m->global_reserved++;
+ trace_irq_matrix_reserve(m);
+}
+
+/**
+ * irq_matrix_remove_reserved - Remove interrupt reservation
+ * @m: Matrix pointer
+ *
+ * This is merely a book keeping call. It decrements the number of globally
+ * reserved interrupt bits. This is used to undo irq_matrix_reserve() when the
+ * interrupt was never in use and a real vector allocated, which undid the
+ * reservation.
+ */
+void irq_matrix_remove_reserved(struct irq_matrix *m)
+{
+ m->global_reserved--;
+ trace_irq_matrix_remove_reserved(m);
+}
+
+/**
+ * irq_matrix_alloc - Allocate a regular interrupt in a CPU map
+ * @m: Matrix pointer
+ * @msk: Which CPUs to search in
+ * @reserved: Allocate previously reserved interrupts
+ * @mapped_cpu: Pointer to store the CPU for which the irq was allocated
+ */
+int irq_matrix_alloc(struct irq_matrix *m, const struct cpumask *msk,
+ bool reserved, unsigned int *mapped_cpu)
+{
+ unsigned int cpu, bit;
+ struct cpumap *cm;
+
+ /*
+ * Not required in theory, but matrix_find_best_cpu() uses
+ * for_each_cpu() which ignores the cpumask on UP .
+ */
+ if (cpumask_empty(msk))
+ return -EINVAL;
+
+ cpu = matrix_find_best_cpu(m, msk);
+ if (cpu == UINT_MAX)
+ return -ENOSPC;
+
+ cm = per_cpu_ptr(m->maps, cpu);
+ bit = matrix_alloc_area(m, cm, 1, false);
+ if (bit >= m->alloc_end)
+ return -ENOSPC;
+ cm->allocated++;
+ cm->available--;
+ m->total_allocated++;
+ m->global_available--;
+ if (reserved)
+ m->global_reserved--;
+ *mapped_cpu = cpu;
+ trace_irq_matrix_alloc(bit, cpu, m, cm);
+ return bit;
+
+}
+
+/**
+ * irq_matrix_free - Free allocated interrupt in the matrix
+ * @m: Matrix pointer
+ * @cpu: Which CPU map needs be updated
+ * @bit: The bit to remove
+ * @managed: If true, the interrupt is managed and not accounted
+ * as available.
+ */
+void irq_matrix_free(struct irq_matrix *m, unsigned int cpu,
+ unsigned int bit, bool managed)
+{
+ struct cpumap *cm = per_cpu_ptr(m->maps, cpu);
+
+ if (WARN_ON_ONCE(bit < m->alloc_start || bit >= m->alloc_end))
+ return;
+
+ if (WARN_ON_ONCE(!test_and_clear_bit(bit, cm->alloc_map)))
+ return;
+
+ cm->allocated--;
+ if(managed)
+ cm->managed_allocated--;
+
+ if (cm->online)
+ m->total_allocated--;
+
+ if (!managed) {
+ cm->available++;
+ if (cm->online)
+ m->global_available++;
+ }
+ trace_irq_matrix_free(bit, cpu, m, cm);
+}
+
+/**
+ * irq_matrix_available - Get the number of globally available irqs
+ * @m: Pointer to the matrix to query
+ * @cpudown: If true, the local CPU is about to go down, adjust
+ * the number of available irqs accordingly
+ */
+unsigned int irq_matrix_available(struct irq_matrix *m, bool cpudown)
+{
+ struct cpumap *cm = this_cpu_ptr(m->maps);
+
+ if (!cpudown)
+ return m->global_available;
+ return m->global_available - cm->available;
+}
+
+/**
+ * irq_matrix_reserved - Get the number of globally reserved irqs
+ * @m: Pointer to the matrix to query
+ */
+unsigned int irq_matrix_reserved(struct irq_matrix *m)
+{
+ return m->global_reserved;
+}
+
+/**
+ * irq_matrix_allocated - Get the number of allocated non-managed irqs on the local CPU
+ * @m: Pointer to the matrix to search
+ *
+ * This returns number of allocated non-managed interrupts.
+ */
+unsigned int irq_matrix_allocated(struct irq_matrix *m)
+{
+ struct cpumap *cm = this_cpu_ptr(m->maps);
+
+ return cm->allocated - cm->managed_allocated;
+}
+
+#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
+/**
+ * irq_matrix_debug_show - Show detailed allocation information
+ * @sf: Pointer to the seq_file to print to
+ * @m: Pointer to the matrix allocator
+ * @ind: Indentation for the print format
+ *
+ * Note, this is a lockless snapshot.
+ */
+void irq_matrix_debug_show(struct seq_file *sf, struct irq_matrix *m, int ind)
+{
+ unsigned int nsys = bitmap_weight(m->system_map, m->matrix_bits);
+ int cpu;
+
+ seq_printf(sf, "Online bitmaps: %6u\n", m->online_maps);
+ seq_printf(sf, "Global available: %6u\n", m->global_available);
+ seq_printf(sf, "Global reserved: %6u\n", m->global_reserved);
+ seq_printf(sf, "Total allocated: %6u\n", m->total_allocated);
+ seq_printf(sf, "System: %u: %*pbl\n", nsys, m->matrix_bits,
+ m->system_map);
+ seq_printf(sf, "%*s| CPU | avl | man | mac | act | vectors\n", ind, " ");
+ cpus_read_lock();
+ for_each_online_cpu(cpu) {
+ struct cpumap *cm = per_cpu_ptr(m->maps, cpu);
+
+ seq_printf(sf, "%*s %4d %4u %4u %4u %4u %*pbl\n", ind, " ",
+ cpu, cm->available, cm->managed,
+ cm->managed_allocated, cm->allocated,
+ m->matrix_bits, cm->alloc_map);
+ }
+ cpus_read_unlock();
+}
+#endif
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index ca3f4aaff707..f2b2929986ff 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -1,27 +1,69 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/irq.h>
#include <linux/interrupt.h>
#include "internals.h"
+/**
+ * irq_fixup_move_pending - Cleanup irq move pending from a dying CPU
+ * @desc: Interrupt descriptor to clean up
+ * @force_clear: If set clear the move pending bit unconditionally.
+ * If not set, clear it only when the dying CPU is the
+ * last one in the pending mask.
+ *
+ * Returns true if the pending bit was set and the pending mask contains an
+ * online CPU other than the dying CPU.
+ */
+bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear)
+{
+ struct irq_data *data = irq_desc_get_irq_data(desc);
+
+ if (!irqd_is_setaffinity_pending(data))
+ return false;
+
+ /*
+ * The outgoing CPU might be the last online target in a pending
+ * interrupt move. If that's the case clear the pending move bit.
+ */
+ if (!cpumask_intersects(desc->pending_mask, cpu_online_mask)) {
+ irqd_clr_move_pending(data);
+ return false;
+ }
+ if (force_clear)
+ irqd_clr_move_pending(data);
+ return true;
+}
+
+void irq_force_complete_move(struct irq_desc *desc)
+{
+ for (struct irq_data *d = irq_desc_get_irq_data(desc); d; d = irqd_get_parent_data(d)) {
+ if (d->chip && d->chip->irq_force_complete_move) {
+ d->chip->irq_force_complete_move(d);
+ return;
+ }
+ }
+}
+
void irq_move_masked_irq(struct irq_data *idata)
{
struct irq_desc *desc = irq_data_to_desc(idata);
- struct irq_chip *chip = idata->chip;
+ struct irq_data *data = &desc->irq_data;
+ struct irq_chip *chip = data->chip;
- if (likely(!irqd_is_setaffinity_pending(&desc->irq_data)))
+ if (likely(!irqd_is_setaffinity_pending(data)))
return;
+ irqd_clr_move_pending(data);
+
/*
* Paranoia: cpu-local interrupts shouldn't be calling in here anyway.
*/
- if (!irqd_can_balance(&desc->irq_data)) {
+ if (irqd_is_per_cpu(data)) {
WARN_ON(1);
return;
}
- irqd_clr_move_pending(&desc->irq_data);
-
if (unlikely(cpumask_empty(desc->pending_mask)))
return;
@@ -42,18 +84,33 @@ void irq_move_masked_irq(struct irq_data *idata)
* For correct operation this depends on the caller
* masking the irqs.
*/
- if (cpumask_any_and(desc->pending_mask, cpu_online_mask) < nr_cpu_ids)
- irq_do_set_affinity(&desc->irq_data, desc->pending_mask, false);
+ if (cpumask_intersects(desc->pending_mask, cpu_online_mask)) {
+ int ret;
+ ret = irq_do_set_affinity(data, desc->pending_mask, false);
+ /*
+ * If the there is a cleanup pending in the underlying
+ * vector management, reschedule the move for the next
+ * interrupt. Leave desc->pending_mask intact.
+ */
+ if (ret == -EBUSY) {
+ irqd_set_move_pending(data);
+ return;
+ }
+ }
cpumask_clear(desc->pending_mask);
}
-void irq_move_irq(struct irq_data *idata)
+void __irq_move_irq(struct irq_data *idata)
{
bool masked;
- if (likely(!irqd_is_setaffinity_pending(idata)))
- return;
+ /*
+ * Get top level irq_data when CONFIG_IRQ_DOMAIN_HIERARCHY is enabled,
+ * and it should be optimized away when CONFIG_IRQ_DOMAIN_HIERARCHY is
+ * disabled. So we avoid an "#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY" here.
+ */
+ idata = irq_desc_get_irq_data(irq_data_to_desc(idata));
if (unlikely(irqd_irq_disabled(idata)))
return;
@@ -70,3 +127,13 @@ void irq_move_irq(struct irq_data *idata)
if (!masked)
idata->chip->irq_unmask(idata);
}
+
+bool irq_can_move_in_process_context(struct irq_data *data)
+{
+ /*
+ * Get the top level irq_data in the hierarchy, which is optimized
+ * away when CONFIG_IRQ_DOMAIN_HIERARCHY is disabled.
+ */
+ data = irq_desc_get_irq_data(irq_data_to_desc(data));
+ return irq_can_move_pcntxt(data);
+}
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 474de5cb394d..68886881fe10 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -1,22 +1,275 @@
+// SPDX-License-Identifier: GPL-2.0
/*
- * linux/kernel/irq/msi.c
- *
* Copyright (C) 2014 Intel Corp.
* Author: Jiang Liu <jiang.liu@linux.intel.com>
*
* This file is licensed under GPLv2.
*
- * This file contains common code to support Message Signalled Interrupt for
+ * This file contains common code to support Message Signaled Interrupts for
* PCI compatible and non PCI compatible devices.
*/
-#include <linux/types.h>
#include <linux/device.h>
#include <linux/irq.h>
#include <linux/irqdomain.h>
#include <linux/msi.h>
-
-/* Temparory solution for building, will be removed later */
+#include <linux/mutex.h>
#include <linux/pci.h>
+#include <linux/slab.h>
+#include <linux/seq_file.h>
+#include <linux/sysfs.h>
+#include <linux/types.h>
+#include <linux/xarray.h>
+
+#include "internals.h"
+
+/**
+ * struct msi_device_data - MSI per device data
+ * @properties: MSI properties which are interesting to drivers
+ * @mutex: Mutex protecting the MSI descriptor store
+ * @__domains: Internal data for per device MSI domains
+ * @__iter_idx: Index to search the next entry for iterators
+ */
+struct msi_device_data {
+ unsigned long properties;
+ struct mutex mutex;
+ struct msi_dev_domain __domains[MSI_MAX_DEVICE_IRQDOMAINS];
+ unsigned long __iter_idx;
+};
+
+/**
+ * struct msi_ctrl - MSI internal management control structure
+ * @domid: ID of the domain on which management operations should be done
+ * @first: First (hardware) slot index to operate on
+ * @last: Last (hardware) slot index to operate on
+ * @nirqs: The number of Linux interrupts to allocate. Can be larger
+ * than the range due to PCI/multi-MSI.
+ */
+struct msi_ctrl {
+ unsigned int domid;
+ unsigned int first;
+ unsigned int last;
+ unsigned int nirqs;
+};
+
+/* Invalid Xarray index which is outside of any searchable range */
+#define MSI_XA_MAX_INDEX (ULONG_MAX - 1)
+/* The maximum domain size */
+#define MSI_XA_DOMAIN_SIZE (MSI_MAX_INDEX + 1)
+
+static void msi_domain_free_locked(struct device *dev, struct msi_ctrl *ctrl);
+static unsigned int msi_domain_get_hwsize(struct device *dev, unsigned int domid);
+static inline int msi_sysfs_create_group(struct device *dev);
+static int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev,
+ int nvec, msi_alloc_info_t *arg);
+
+/**
+ * msi_alloc_desc - Allocate an initialized msi_desc
+ * @dev: Pointer to the device for which this is allocated
+ * @nvec: The number of vectors used in this entry
+ * @affinity: Optional pointer to an affinity mask array size of @nvec
+ *
+ * If @affinity is not %NULL then an affinity array[@nvec] is allocated
+ * and the affinity masks and flags from @affinity are copied.
+ *
+ * Return: pointer to allocated &msi_desc on success or %NULL on failure
+ */
+static struct msi_desc *msi_alloc_desc(struct device *dev, int nvec,
+ const struct irq_affinity_desc *affinity)
+{
+ struct msi_desc *desc = kzalloc(sizeof(*desc), GFP_KERNEL);
+
+ if (!desc)
+ return NULL;
+
+ desc->dev = dev;
+ desc->nvec_used = nvec;
+ if (affinity) {
+ desc->affinity = kmemdup_array(affinity, nvec, sizeof(*desc->affinity), GFP_KERNEL);
+ if (!desc->affinity) {
+ kfree(desc);
+ return NULL;
+ }
+ }
+ return desc;
+}
+
+static void msi_free_desc(struct msi_desc *desc)
+{
+ kfree(desc->affinity);
+ kfree(desc);
+}
+
+static int msi_insert_desc(struct device *dev, struct msi_desc *desc,
+ unsigned int domid, unsigned int index)
+{
+ struct msi_device_data *md = dev->msi.data;
+ struct xarray *xa = &md->__domains[domid].store;
+ unsigned int hwsize;
+ int ret;
+
+ hwsize = msi_domain_get_hwsize(dev, domid);
+
+ if (index == MSI_ANY_INDEX) {
+ struct xa_limit limit = { .min = 0, .max = hwsize - 1 };
+ unsigned int index;
+
+ /* Let the xarray allocate a free index within the limit */
+ ret = xa_alloc(xa, &index, desc, limit, GFP_KERNEL);
+ if (ret)
+ goto fail;
+
+ desc->msi_index = index;
+ return 0;
+ } else {
+ if (index >= hwsize) {
+ ret = -ERANGE;
+ goto fail;
+ }
+
+ desc->msi_index = index;
+ ret = xa_insert(xa, index, desc, GFP_KERNEL);
+ if (ret)
+ goto fail;
+ return 0;
+ }
+fail:
+ msi_free_desc(desc);
+ return ret;
+}
+
+/**
+ * msi_domain_insert_msi_desc - Allocate and initialize a MSI descriptor and
+ * insert it at @init_desc->msi_index
+ *
+ * @dev: Pointer to the device for which the descriptor is allocated
+ * @domid: The id of the interrupt domain to which the desriptor is added
+ * @init_desc: Pointer to an MSI descriptor to initialize the new descriptor
+ *
+ * Return: 0 on success or an appropriate failure code.
+ */
+int msi_domain_insert_msi_desc(struct device *dev, unsigned int domid,
+ struct msi_desc *init_desc)
+{
+ struct msi_desc *desc;
+
+ lockdep_assert_held(&dev->msi.data->mutex);
+
+ desc = msi_alloc_desc(dev, init_desc->nvec_used, init_desc->affinity);
+ if (!desc)
+ return -ENOMEM;
+
+ /* Copy type specific data to the new descriptor. */
+ desc->pci = init_desc->pci;
+
+ return msi_insert_desc(dev, desc, domid, init_desc->msi_index);
+}
+
+static bool msi_desc_match(struct msi_desc *desc, enum msi_desc_filter filter)
+{
+ switch (filter) {
+ case MSI_DESC_ALL:
+ return true;
+ case MSI_DESC_NOTASSOCIATED:
+ return !desc->irq;
+ case MSI_DESC_ASSOCIATED:
+ return !!desc->irq;
+ }
+ WARN_ON_ONCE(1);
+ return false;
+}
+
+static bool msi_ctrl_valid(struct device *dev, struct msi_ctrl *ctrl)
+{
+ unsigned int hwsize;
+
+ if (WARN_ON_ONCE(ctrl->domid >= MSI_MAX_DEVICE_IRQDOMAINS ||
+ (dev->msi.domain &&
+ !dev->msi.data->__domains[ctrl->domid].domain)))
+ return false;
+
+ hwsize = msi_domain_get_hwsize(dev, ctrl->domid);
+ if (WARN_ON_ONCE(ctrl->first > ctrl->last ||
+ ctrl->first >= hwsize ||
+ ctrl->last >= hwsize))
+ return false;
+ return true;
+}
+
+static void msi_domain_free_descs(struct device *dev, struct msi_ctrl *ctrl)
+{
+ struct msi_desc *desc;
+ struct xarray *xa;
+ unsigned long idx;
+
+ lockdep_assert_held(&dev->msi.data->mutex);
+
+ if (!msi_ctrl_valid(dev, ctrl))
+ return;
+
+ xa = &dev->msi.data->__domains[ctrl->domid].store;
+ xa_for_each_range(xa, idx, desc, ctrl->first, ctrl->last) {
+ xa_erase(xa, idx);
+
+ /* Leak the descriptor when it is still referenced */
+ if (WARN_ON_ONCE(msi_desc_match(desc, MSI_DESC_ASSOCIATED)))
+ continue;
+ msi_free_desc(desc);
+ }
+}
+
+/**
+ * msi_domain_free_msi_descs_range - Free a range of MSI descriptors of a device in an irqdomain
+ * @dev: Device for which to free the descriptors
+ * @domid: Id of the domain to operate on
+ * @first: Index to start freeing from (inclusive)
+ * @last: Last index to be freed (inclusive)
+ */
+void msi_domain_free_msi_descs_range(struct device *dev, unsigned int domid,
+ unsigned int first, unsigned int last)
+{
+ struct msi_ctrl ctrl = {
+ .domid = domid,
+ .first = first,
+ .last = last,
+ };
+
+ msi_domain_free_descs(dev, &ctrl);
+}
+
+/**
+ * msi_domain_add_simple_msi_descs - Allocate and initialize MSI descriptors
+ * @dev: Pointer to the device for which the descriptors are allocated
+ * @ctrl: Allocation control struct
+ *
+ * Return: 0 on success or an appropriate failure code.
+ */
+static int msi_domain_add_simple_msi_descs(struct device *dev, struct msi_ctrl *ctrl)
+{
+ struct msi_desc *desc;
+ unsigned int idx;
+ int ret;
+
+ lockdep_assert_held(&dev->msi.data->mutex);
+
+ if (!msi_ctrl_valid(dev, ctrl))
+ return -EINVAL;
+
+ for (idx = ctrl->first; idx <= ctrl->last; idx++) {
+ desc = msi_alloc_desc(dev, 1, NULL);
+ if (!desc)
+ goto fail_mem;
+ ret = msi_insert_desc(dev, desc, ctrl->domid, idx);
+ if (ret)
+ goto fail;
+ }
+ return 0;
+
+fail_mem:
+ ret = -ENOMEM;
+fail:
+ msi_domain_free_descs(dev, ctrl);
+ return ret;
+}
void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
{
@@ -31,13 +284,372 @@ void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg)
}
EXPORT_SYMBOL_GPL(get_cached_msi_msg);
-#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN
+static void msi_device_data_release(struct device *dev, void *res)
+{
+ struct msi_device_data *md = res;
+ int i;
+
+ for (i = 0; i < MSI_MAX_DEVICE_IRQDOMAINS; i++) {
+ msi_remove_device_irq_domain(dev, i);
+ WARN_ON_ONCE(!xa_empty(&md->__domains[i].store));
+ xa_destroy(&md->__domains[i].store);
+ }
+ dev->msi.data = NULL;
+}
+
+/**
+ * msi_setup_device_data - Setup MSI device data
+ * @dev: Device for which MSI device data should be set up
+ *
+ * Return: 0 on success, appropriate error code otherwise
+ *
+ * This can be called more than once for @dev. If the MSI device data is
+ * already allocated the call succeeds. The allocated memory is
+ * automatically released when the device is destroyed.
+ */
+int msi_setup_device_data(struct device *dev)
+{
+ struct msi_device_data *md;
+ int ret, i;
+
+ if (dev->msi.data)
+ return 0;
+
+ md = devres_alloc(msi_device_data_release, sizeof(*md), GFP_KERNEL);
+ if (!md)
+ return -ENOMEM;
+
+ ret = msi_sysfs_create_group(dev);
+ if (ret) {
+ devres_free(md);
+ return ret;
+ }
+
+ for (i = 0; i < MSI_MAX_DEVICE_IRQDOMAINS; i++)
+ xa_init_flags(&md->__domains[i].store, XA_FLAGS_ALLOC);
+
+ /*
+ * If @dev::msi::domain is set and is a global MSI domain, copy the
+ * pointer into the domain array so all code can operate on domain
+ * ids. The NULL pointer check is required to keep the legacy
+ * architecture specific PCI/MSI support working.
+ */
+ if (dev->msi.domain && !irq_domain_is_msi_parent(dev->msi.domain))
+ md->__domains[MSI_DEFAULT_DOMAIN].domain = dev->msi.domain;
+
+ mutex_init(&md->mutex);
+ dev->msi.data = md;
+ devres_add(dev, md);
+ return 0;
+}
+
+/**
+ * __msi_lock_descs - Lock the MSI descriptor storage of a device
+ * @dev: Device to operate on
+ *
+ * Internal function for guard(msi_descs_lock). Don't use in code.
+ */
+void __msi_lock_descs(struct device *dev)
+{
+ mutex_lock(&dev->msi.data->mutex);
+}
+EXPORT_SYMBOL_GPL(__msi_lock_descs);
+
+/**
+ * __msi_unlock_descs - Unlock the MSI descriptor storage of a device
+ * @dev: Device to operate on
+ *
+ * Internal function for guard(msi_descs_lock). Don't use in code.
+ */
+void __msi_unlock_descs(struct device *dev)
+{
+ /* Invalidate the index which was cached by the iterator */
+ dev->msi.data->__iter_idx = MSI_XA_MAX_INDEX;
+ mutex_unlock(&dev->msi.data->mutex);
+}
+EXPORT_SYMBOL_GPL(__msi_unlock_descs);
+
+static struct msi_desc *msi_find_desc(struct msi_device_data *md, unsigned int domid,
+ enum msi_desc_filter filter)
+{
+ struct xarray *xa = &md->__domains[domid].store;
+ struct msi_desc *desc;
+
+ xa_for_each_start(xa, md->__iter_idx, desc, md->__iter_idx) {
+ if (msi_desc_match(desc, filter))
+ return desc;
+ }
+ md->__iter_idx = MSI_XA_MAX_INDEX;
+ return NULL;
+}
+
+/**
+ * msi_domain_first_desc - Get the first MSI descriptor of an irqdomain associated to a device
+ * @dev: Device to operate on
+ * @domid: The id of the interrupt domain which should be walked.
+ * @filter: Descriptor state filter
+ *
+ * Must be called with the MSI descriptor mutex held, i.e. msi_lock_descs()
+ * must be invoked before the call.
+ *
+ * Return: Pointer to the first MSI descriptor matching the search
+ * criteria, NULL if none found.
+ */
+struct msi_desc *msi_domain_first_desc(struct device *dev, unsigned int domid,
+ enum msi_desc_filter filter)
+{
+ struct msi_device_data *md = dev->msi.data;
+
+ if (WARN_ON_ONCE(!md || domid >= MSI_MAX_DEVICE_IRQDOMAINS))
+ return NULL;
+
+ lockdep_assert_held(&md->mutex);
+
+ md->__iter_idx = 0;
+ return msi_find_desc(md, domid, filter);
+}
+EXPORT_SYMBOL_GPL(msi_domain_first_desc);
+
+/**
+ * msi_next_desc - Get the next MSI descriptor of a device
+ * @dev: Device to operate on
+ * @domid: The id of the interrupt domain which should be walked.
+ * @filter: Descriptor state filter
+ *
+ * The first invocation of msi_next_desc() has to be preceeded by a
+ * successful invocation of __msi_first_desc(). Consecutive invocations are
+ * only valid if the previous one was successful. All these operations have
+ * to be done within the same MSI mutex held region.
+ *
+ * Return: Pointer to the next MSI descriptor matching the search
+ * criteria, NULL if none found.
+ */
+struct msi_desc *msi_next_desc(struct device *dev, unsigned int domid,
+ enum msi_desc_filter filter)
+{
+ struct msi_device_data *md = dev->msi.data;
+
+ if (WARN_ON_ONCE(!md || domid >= MSI_MAX_DEVICE_IRQDOMAINS))
+ return NULL;
+
+ lockdep_assert_held(&md->mutex);
+
+ if (md->__iter_idx >= (unsigned long)MSI_MAX_INDEX)
+ return NULL;
+
+ md->__iter_idx++;
+ return msi_find_desc(md, domid, filter);
+}
+EXPORT_SYMBOL_GPL(msi_next_desc);
+
+/**
+ * msi_domain_get_virq - Lookup the Linux interrupt number for a MSI index on a interrupt domain
+ * @dev: Device to operate on
+ * @domid: Domain ID of the interrupt domain associated to the device
+ * @index: MSI interrupt index to look for (0-based)
+ *
+ * Return: The Linux interrupt number on success (> 0), 0 if not found
+ */
+unsigned int msi_domain_get_virq(struct device *dev, unsigned int domid, unsigned int index)
+{
+ struct msi_desc *desc;
+ bool pcimsi = false;
+ struct xarray *xa;
+
+ if (!dev->msi.data)
+ return 0;
+
+ if (WARN_ON_ONCE(index > MSI_MAX_INDEX || domid >= MSI_MAX_DEVICE_IRQDOMAINS))
+ return 0;
+
+ /* This check is only valid for the PCI default MSI domain */
+ if (dev_is_pci(dev) && domid == MSI_DEFAULT_DOMAIN)
+ pcimsi = to_pci_dev(dev)->msi_enabled;
+
+ guard(msi_descs_lock)(dev);
+ xa = &dev->msi.data->__domains[domid].store;
+ desc = xa_load(xa, pcimsi ? 0 : index);
+ if (desc && desc->irq) {
+ /*
+ * PCI-MSI has only one descriptor for multiple interrupts.
+ * PCI-MSIX and platform MSI use a descriptor per
+ * interrupt.
+ */
+ if (!pcimsi)
+ return desc->irq;
+ if (index < desc->nvec_used)
+ return desc->irq + index;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(msi_domain_get_virq);
+
+#ifdef CONFIG_SYSFS
+static struct attribute *msi_dev_attrs[] = {
+ NULL
+};
+
+static const struct attribute_group msi_irqs_group = {
+ .name = "msi_irqs",
+ .attrs = msi_dev_attrs,
+};
+
+static inline int msi_sysfs_create_group(struct device *dev)
+{
+ return devm_device_add_group(dev, &msi_irqs_group);
+}
+
+static ssize_t msi_mode_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ /* MSI vs. MSIX is per device not per interrupt */
+ bool is_msix = dev_is_pci(dev) ? to_pci_dev(dev)->msix_enabled : false;
+
+ return sysfs_emit(buf, "%s\n", is_msix ? "msix" : "msi");
+}
+
+static void msi_sysfs_remove_desc(struct device *dev, struct msi_desc *desc)
+{
+ struct device_attribute *attrs = desc->sysfs_attrs;
+ int i;
+
+ if (!attrs)
+ return;
+
+ desc->sysfs_attrs = NULL;
+ for (i = 0; i < desc->nvec_used; i++) {
+ if (attrs[i].show)
+ sysfs_remove_file_from_group(&dev->kobj, &attrs[i].attr, msi_irqs_group.name);
+ kfree(attrs[i].attr.name);
+ }
+ kfree(attrs);
+}
+
+static int msi_sysfs_populate_desc(struct device *dev, struct msi_desc *desc)
+{
+ struct device_attribute *attrs;
+ int ret, i;
+
+ attrs = kcalloc(desc->nvec_used, sizeof(*attrs), GFP_KERNEL);
+ if (!attrs)
+ return -ENOMEM;
+
+ desc->sysfs_attrs = attrs;
+ for (i = 0; i < desc->nvec_used; i++) {
+ sysfs_attr_init(&attrs[i].attr);
+ attrs[i].attr.name = kasprintf(GFP_KERNEL, "%d", desc->irq + i);
+ if (!attrs[i].attr.name) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ attrs[i].attr.mode = 0444;
+ attrs[i].show = msi_mode_show;
+
+ ret = sysfs_add_file_to_group(&dev->kobj, &attrs[i].attr, msi_irqs_group.name);
+ if (ret) {
+ attrs[i].show = NULL;
+ goto fail;
+ }
+ }
+ return 0;
+
+fail:
+ msi_sysfs_remove_desc(dev, desc);
+ return ret;
+}
+
+#if defined(CONFIG_PCI_MSI_ARCH_FALLBACKS) || defined(CONFIG_PCI_XEN)
+/**
+ * msi_device_populate_sysfs - Populate msi_irqs sysfs entries for a device
+ * @dev: The device (PCI, platform etc) which will get sysfs entries
+ */
+int msi_device_populate_sysfs(struct device *dev)
+{
+ struct msi_desc *desc;
+ int ret;
+
+ msi_for_each_desc(desc, dev, MSI_DESC_ASSOCIATED) {
+ if (desc->sysfs_attrs)
+ continue;
+ ret = msi_sysfs_populate_desc(dev, desc);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+/**
+ * msi_device_destroy_sysfs - Destroy msi_irqs sysfs entries for a device
+ * @dev: The device (PCI, platform etc) for which to remove
+ * sysfs entries
+ */
+void msi_device_destroy_sysfs(struct device *dev)
+{
+ struct msi_desc *desc;
+
+ msi_for_each_desc(desc, dev, MSI_DESC_ALL)
+ msi_sysfs_remove_desc(dev, desc);
+}
+#endif /* CONFIG_PCI_MSI_ARCH_FALLBACK || CONFIG_PCI_XEN */
+#else /* CONFIG_SYSFS */
+static inline int msi_sysfs_create_group(struct device *dev) { return 0; }
+static inline int msi_sysfs_populate_desc(struct device *dev, struct msi_desc *desc) { return 0; }
+static inline void msi_sysfs_remove_desc(struct device *dev, struct msi_desc *desc) { }
+#endif /* !CONFIG_SYSFS */
+
+static struct irq_domain *msi_get_device_domain(struct device *dev, unsigned int domid)
+{
+ struct irq_domain *domain;
+
+ lockdep_assert_held(&dev->msi.data->mutex);
+
+ if (WARN_ON_ONCE(domid >= MSI_MAX_DEVICE_IRQDOMAINS))
+ return NULL;
+
+ domain = dev->msi.data->__domains[domid].domain;
+ if (!domain)
+ return NULL;
+
+ if (WARN_ON_ONCE(irq_domain_is_msi_parent(domain)))
+ return NULL;
+
+ return domain;
+}
+
+static unsigned int msi_domain_get_hwsize(struct device *dev, unsigned int domid)
+{
+ struct msi_domain_info *info;
+ struct irq_domain *domain;
+
+ domain = msi_get_device_domain(dev, domid);
+ if (domain) {
+ info = domain->host_data;
+ return info->hwsize;
+ }
+ /* No domain, default to MSI_XA_DOMAIN_SIZE */
+ return MSI_XA_DOMAIN_SIZE;
+}
+
static inline void irq_chip_write_msi_msg(struct irq_data *data,
struct msi_msg *msg)
{
data->chip->irq_write_msi_msg(data, msg);
}
+static void msi_check_level(struct irq_domain *domain, struct msi_msg *msg)
+{
+ struct msi_domain_info *info = domain->host_data;
+
+ /*
+ * If the MSI provider has messed with the second message and
+ * not advertized that it is level-capable, signal the breakage.
+ */
+ WARN_ON(!((info->flags & MSI_FLAG_LEVEL_CAPABLE) &&
+ (info->chip->flags & IRQCHIP_SUPPORTS_LEVEL_MSI)) &&
+ (msg[1].address_lo || msg[1].address_hi || msg[1].data));
+}
+
/**
* msi_domain_set_affinity - Generic affinity setter function for MSI domains
* @irq_data: The irq data associated to the interrupt
@@ -46,39 +658,44 @@ static inline void irq_chip_write_msi_msg(struct irq_data *data,
*
* Intended to be used by MSI interrupt controllers which are
* implemented with hierarchical domains.
+ *
+ * Return: IRQ_SET_MASK_* result code
*/
int msi_domain_set_affinity(struct irq_data *irq_data,
const struct cpumask *mask, bool force)
{
struct irq_data *parent = irq_data->parent_data;
- struct msi_msg msg;
+ struct msi_msg msg[2] = { [1] = { }, };
int ret;
ret = parent->chip->irq_set_affinity(parent, mask, force);
if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE) {
- BUG_ON(irq_chip_compose_msi_msg(irq_data, &msg));
- irq_chip_write_msi_msg(irq_data, &msg);
+ BUG_ON(irq_chip_compose_msi_msg(irq_data, msg));
+ msi_check_level(irq_data->domain, msg);
+ irq_chip_write_msi_msg(irq_data, msg);
}
return ret;
}
-static void msi_domain_activate(struct irq_domain *domain,
- struct irq_data *irq_data)
+static int msi_domain_activate(struct irq_domain *domain,
+ struct irq_data *irq_data, bool early)
{
- struct msi_msg msg;
+ struct msi_msg msg[2] = { [1] = { }, };
- BUG_ON(irq_chip_compose_msi_msg(irq_data, &msg));
- irq_chip_write_msi_msg(irq_data, &msg);
+ BUG_ON(irq_chip_compose_msi_msg(irq_data, msg));
+ msi_check_level(irq_data->domain, msg);
+ irq_chip_write_msi_msg(irq_data, msg);
+ return 0;
}
static void msi_domain_deactivate(struct irq_domain *domain,
struct irq_data *irq_data)
{
- struct msi_msg msg;
+ struct msi_msg msg[2];
- memset(&msg, 0, sizeof(msg));
- irq_chip_write_msi_msg(irq_data, &msg);
+ memset(msg, 0, sizeof(msg));
+ irq_chip_write_msi_msg(irq_data, msg);
}
static int msi_domain_alloc(struct irq_domain *domain, unsigned int virq,
@@ -89,18 +706,20 @@ static int msi_domain_alloc(struct irq_domain *domain, unsigned int virq,
irq_hw_number_t hwirq = ops->get_hwirq(info, arg);
int i, ret;
- if (irq_find_mapping(domain, hwirq) > 0)
+ if (irq_resolve_mapping(domain, hwirq))
return -EEXIST;
- ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
- if (ret < 0)
- return ret;
+ if (domain->parent) {
+ ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
+ if (ret < 0)
+ return ret;
+ }
for (i = 0; i < nr_irqs; i++) {
ret = ops->msi_init(domain, info, virq + i, hwirq + i, arg);
if (ret < 0) {
if (ops->msi_free) {
- for (i--; i > 0; i--)
+ for (i--; i >= 0; i--)
ops->msi_free(domain, info, virq + i);
}
irq_domain_free_irqs_top(domain, virq, nr_irqs);
@@ -124,14 +743,46 @@ static void msi_domain_free(struct irq_domain *domain, unsigned int virq,
irq_domain_free_irqs_top(domain, virq, nr_irqs);
}
-static struct irq_domain_ops msi_domain_ops = {
+static int msi_domain_translate(struct irq_domain *domain, struct irq_fwspec *fwspec,
+ irq_hw_number_t *hwirq, unsigned int *type)
+{
+ struct msi_domain_info *info = domain->host_data;
+
+ /*
+ * This will catch allocations through the regular irqdomain path except
+ * for MSI domains which really support this, e.g. MBIGEN.
+ */
+ if (!info->ops->msi_translate)
+ return -ENOTSUPP;
+ return info->ops->msi_translate(domain, fwspec, hwirq, type);
+}
+
+#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
+static void msi_domain_debug_show(struct seq_file *m, struct irq_domain *d,
+ struct irq_data *irqd, int ind)
+{
+ struct msi_desc *desc = irqd ? irq_data_get_msi_desc(irqd) : NULL;
+
+ if (!desc)
+ return;
+
+ seq_printf(m, "\n%*saddress_hi: 0x%08x", ind + 1, "", desc->msg.address_hi);
+ seq_printf(m, "\n%*saddress_lo: 0x%08x", ind + 1, "", desc->msg.address_lo);
+ seq_printf(m, "\n%*smsg_data: 0x%08x\n", ind + 1, "", desc->msg.data);
+}
+#endif
+
+static const struct irq_domain_ops msi_domain_ops = {
.alloc = msi_domain_alloc,
.free = msi_domain_free,
.activate = msi_domain_activate,
.deactivate = msi_domain_deactivate,
+ .translate = msi_domain_translate,
+#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
+ .debug_show = msi_domain_debug_show,
+#endif
};
-#ifdef GENERIC_MSI_DOMAIN_OPS
static irq_hw_number_t msi_domain_ops_get_hwirq(struct msi_domain_info *info,
msi_alloc_info_t *arg)
{
@@ -145,16 +796,15 @@ static int msi_domain_ops_prepare(struct irq_domain *domain, struct device *dev,
return 0;
}
+static void msi_domain_ops_teardown(struct irq_domain *domain, msi_alloc_info_t *arg)
+{
+}
+
static void msi_domain_ops_set_desc(msi_alloc_info_t *arg,
struct msi_desc *desc)
{
arg->desc = desc;
}
-#else
-#define msi_domain_ops_get_hwirq NULL
-#define msi_domain_ops_prepare NULL
-#define msi_domain_ops_set_desc NULL
-#endif /* !GENERIC_MSI_DOMAIN_OPS */
static int msi_domain_ops_init(struct irq_domain *domain,
struct msi_domain_info *info,
@@ -171,19 +821,12 @@ static int msi_domain_ops_init(struct irq_domain *domain,
return 0;
}
-static int msi_domain_ops_check(struct irq_domain *domain,
- struct msi_domain_info *info,
- struct device *dev)
-{
- return 0;
-}
-
static struct msi_domain_ops msi_domain_ops_default = {
- .get_hwirq = msi_domain_ops_get_hwirq,
- .msi_init = msi_domain_ops_init,
- .msi_check = msi_domain_ops_check,
- .msi_prepare = msi_domain_ops_prepare,
- .set_desc = msi_domain_ops_set_desc,
+ .get_hwirq = msi_domain_ops_get_hwirq,
+ .msi_init = msi_domain_ops_init,
+ .msi_prepare = msi_domain_ops_prepare,
+ .msi_teardown = msi_domain_ops_teardown,
+ .set_desc = msi_domain_ops_set_desc,
};
static void msi_domain_update_dom_ops(struct msi_domain_info *info)
@@ -195,14 +838,17 @@ static void msi_domain_update_dom_ops(struct msi_domain_info *info)
return;
}
+ if (!(info->flags & MSI_FLAG_USE_DEF_DOM_OPS))
+ return;
+
if (ops->get_hwirq == NULL)
ops->get_hwirq = msi_domain_ops_default.get_hwirq;
if (ops->msi_init == NULL)
ops->msi_init = msi_domain_ops_default.msi_init;
- if (ops->msi_check == NULL)
- ops->msi_check = msi_domain_ops_default.msi_check;
if (ops->msi_prepare == NULL)
ops->msi_prepare = msi_domain_ops_default.msi_prepare;
+ if (ops->msi_teardown == NULL)
+ ops->msi_teardown = msi_domain_ops_default.msi_teardown;
if (ops->set_desc == NULL)
ops->set_desc = msi_domain_ops_default.set_desc;
}
@@ -211,127 +857,921 @@ static void msi_domain_update_chip_ops(struct msi_domain_info *info)
{
struct irq_chip *chip = info->chip;
- BUG_ON(!chip);
- if (!chip->irq_mask)
- chip->irq_mask = pci_msi_mask_irq;
- if (!chip->irq_unmask)
- chip->irq_unmask = pci_msi_unmask_irq;
- if (!chip->irq_set_affinity)
+ BUG_ON(!chip || !chip->irq_mask || !chip->irq_unmask);
+ if (!chip->irq_set_affinity && !(info->flags & MSI_FLAG_NO_AFFINITY))
chip->irq_set_affinity = msi_domain_set_affinity;
}
+static struct irq_domain *__msi_create_irq_domain(struct fwnode_handle *fwnode,
+ struct msi_domain_info *info,
+ unsigned int flags,
+ struct irq_domain *parent)
+{
+ struct irq_domain *domain;
+
+ if (info->hwsize > MSI_XA_DOMAIN_SIZE)
+ return NULL;
+
+ /*
+ * Hardware size 0 is valid for backwards compatibility and for
+ * domains which are not backed by a hardware table. Grant the
+ * maximum index space.
+ */
+ if (!info->hwsize)
+ info->hwsize = MSI_XA_DOMAIN_SIZE;
+
+ msi_domain_update_dom_ops(info);
+ if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS)
+ msi_domain_update_chip_ops(info);
+
+ domain = irq_domain_create_hierarchy(parent, flags | IRQ_DOMAIN_FLAG_MSI, 0,
+ fwnode, &msi_domain_ops, info);
+
+ if (domain) {
+ irq_domain_update_bus_token(domain, info->bus_token);
+ domain->dev = info->dev;
+ if (info->flags & MSI_FLAG_PARENT_PM_DEV)
+ domain->pm_dev = parent->pm_dev;
+ }
+
+ return domain;
+}
+
/**
- * msi_create_irq_domain - Create a MSI interrupt domain
- * @of_node: Optional device-tree node of the interrupt controller
+ * msi_create_irq_domain - Create an MSI interrupt domain
+ * @fwnode: Optional fwnode of the interrupt controller
* @info: MSI domain info
* @parent: Parent irq domain
+ *
+ * Return: pointer to the created &struct irq_domain or %NULL on failure
*/
-struct irq_domain *msi_create_irq_domain(struct device_node *node,
+struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode,
struct msi_domain_info *info,
struct irq_domain *parent)
{
- if (info->flags & MSI_FLAG_USE_DEF_DOM_OPS)
- msi_domain_update_dom_ops(info);
- if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS)
- msi_domain_update_chip_ops(info);
+ return __msi_create_irq_domain(fwnode, info, 0, parent);
+}
- return irq_domain_add_hierarchy(parent, 0, 0, node, &msi_domain_ops,
- info);
+/**
+ * msi_create_parent_irq_domain - Create an MSI-parent interrupt domain
+ * @info: MSI irqdomain creation info
+ * @msi_parent_ops: MSI parent callbacks and configuration
+ *
+ * Return: pointer to the created &struct irq_domain or %NULL on failure
+ */
+struct irq_domain *msi_create_parent_irq_domain(struct irq_domain_info *info,
+ const struct msi_parent_ops *msi_parent_ops)
+{
+ struct irq_domain *d;
+
+ info->hwirq_max = max(info->hwirq_max, info->size);
+ info->size = info->hwirq_max;
+ info->domain_flags |= IRQ_DOMAIN_FLAG_MSI_PARENT;
+ info->bus_token = msi_parent_ops->bus_select_token;
+
+ d = irq_domain_instantiate(info);
+ if (IS_ERR(d))
+ return NULL;
+
+ d->msi_parent_ops = msi_parent_ops;
+ return d;
}
+EXPORT_SYMBOL_GPL(msi_create_parent_irq_domain);
/**
- * msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain
- * @domain: The domain to allocate from
- * @dev: Pointer to device struct of the device for which the interrupts
- * are allocated
- * @nvec: The number of interrupts to allocate
+ * msi_parent_init_dev_msi_info - Delegate initialization of device MSI info down
+ * in the domain hierarchy
+ * @dev: The device for which the domain should be created
+ * @domain: The domain in the hierarchy this op is being called on
+ * @msi_parent_domain: The IRQ_DOMAIN_FLAG_MSI_PARENT domain for the child to
+ * be created
+ * @msi_child_info: The MSI domain info of the IRQ_DOMAIN_FLAG_MSI_DEVICE
+ * domain to be created
+ *
+ * Return: true on success, false otherwise
+ *
+ * This is the most complex problem of per device MSI domains and the
+ * underlying interrupt domain hierarchy:
+ *
+ * The device domain to be initialized requests the broadest feature set
+ * possible and the underlying domain hierarchy puts restrictions on it.
+ *
+ * That's trivial for a simple parent->child relationship, but it gets
+ * interesting with an intermediate domain: root->parent->child. The
+ * intermediate 'parent' can expand the capabilities which the 'root'
+ * domain is providing. So that creates a classic hen and egg problem:
+ * Which entity is doing the restrictions/expansions?
+ *
+ * One solution is to let the root domain handle the initialization that's
+ * why there is the @domain and the @msi_parent_domain pointer.
+ */
+bool msi_parent_init_dev_msi_info(struct device *dev, struct irq_domain *domain,
+ struct irq_domain *msi_parent_domain,
+ struct msi_domain_info *msi_child_info)
+{
+ struct irq_domain *parent = domain->parent;
+
+ if (WARN_ON_ONCE(!parent || !parent->msi_parent_ops ||
+ !parent->msi_parent_ops->init_dev_msi_info))
+ return false;
+
+ return parent->msi_parent_ops->init_dev_msi_info(dev, parent, msi_parent_domain,
+ msi_child_info);
+}
+
+/**
+ * msi_create_device_irq_domain - Create a device MSI interrupt domain
+ * @dev: Pointer to the device
+ * @domid: Domain id
+ * @template: MSI domain info bundle used as template
+ * @hwsize: Maximum number of MSI table entries (0 if unknown or unlimited)
+ * @domain_data: Optional pointer to domain specific data which is set in
+ * msi_domain_info::data
+ * @chip_data: Optional pointer to chip specific data which is set in
+ * msi_domain_info::chip_data
+ *
+ * Return: True on success, false otherwise
+ *
+ * There is no firmware node required for this interface because the per
+ * device domains are software constructs which are actually closer to the
+ * hardware reality than any firmware can describe them.
+ *
+ * The domain name and the irq chip name for a MSI device domain are
+ * composed by: "$(PREFIX)$(CHIPNAME)-$(DEVNAME)"
+ *
+ * $PREFIX: Optional prefix provided by the underlying MSI parent domain
+ * via msi_parent_ops::prefix. If that pointer is NULL the prefix
+ * is empty.
+ * $CHIPNAME: The name of the irq_chip in @template
+ * $DEVNAME: The name of the device
+ *
+ * This results in understandable chip names and hardware interrupt numbers
+ * in e.g. /proc/interrupts
+ *
+ * PCI-MSI-0000:00:1c.0 0-edge Parent domain has no prefix
+ * IR-PCI-MSI-0000:00:1c.4 0-edge Same with interrupt remapping prefix 'IR-'
+ *
+ * IR-PCI-MSIX-0000:3d:00.0 0-edge Hardware interrupt numbers reflect
+ * IR-PCI-MSIX-0000:3d:00.0 1-edge the real MSI-X index on that device
+ * IR-PCI-MSIX-0000:3d:00.0 2-edge
+ *
+ * On IMS domains the hardware interrupt number is either a table entry
+ * index or a purely software managed index but it is guaranteed to be
+ * unique.
+ *
+ * The domain pointer is stored in @dev::msi::data::__irqdomains[]. All
+ * subsequent operations on the domain depend on the domain id.
+ *
+ * The domain is automatically freed when the device is removed via devres
+ * in the context of @dev::msi::data freeing, but it can also be
+ * independently removed via @msi_remove_device_irq_domain().
+ */
+bool msi_create_device_irq_domain(struct device *dev, unsigned int domid,
+ const struct msi_domain_template *template,
+ unsigned int hwsize, void *domain_data,
+ void *chip_data)
+{
+ struct irq_domain *domain, *parent = dev->msi.domain;
+ const struct msi_parent_ops *pops;
+ struct fwnode_handle *fwnode;
+
+ if (!irq_domain_is_msi_parent(parent))
+ return false;
+
+ if (domid >= MSI_MAX_DEVICE_IRQDOMAINS)
+ return false;
+
+ struct msi_domain_template *bundle __free(kfree) =
+ kmemdup(template, sizeof(*bundle), GFP_KERNEL);
+ if (!bundle)
+ return false;
+
+ bundle->info.hwsize = hwsize;
+ bundle->info.chip = &bundle->chip;
+ bundle->info.ops = &bundle->ops;
+ bundle->info.data = domain_data;
+ bundle->info.chip_data = chip_data;
+ bundle->info.alloc_data = &bundle->alloc_info;
+ bundle->info.dev = dev;
+
+ pops = parent->msi_parent_ops;
+ snprintf(bundle->name, sizeof(bundle->name), "%s%s-%s",
+ pops->prefix ? : "", bundle->chip.name, dev_name(dev));
+ bundle->chip.name = bundle->name;
+
+ /*
+ * Using the device firmware node is required for wire to MSI
+ * device domains so that the existing firmware results in a domain
+ * match.
+ * All other device domains like PCI/MSI use the named firmware
+ * node as they are not guaranteed to have a fwnode. They are never
+ * looked up and always handled in the context of the device.
+ */
+ struct fwnode_handle *fwnode_alloced __free(irq_domain_free_fwnode) = NULL;
+
+ if (!(bundle->info.flags & MSI_FLAG_USE_DEV_FWNODE))
+ fwnode = fwnode_alloced = irq_domain_alloc_named_fwnode(bundle->name);
+ else
+ fwnode = dev->fwnode;
+
+ if (!fwnode)
+ return false;
+
+ if (msi_setup_device_data(dev))
+ return false;
+
+ guard(msi_descs_lock)(dev);
+ if (WARN_ON_ONCE(msi_get_device_domain(dev, domid)))
+ return false;
+
+ if (!pops->init_dev_msi_info(dev, parent, parent, &bundle->info))
+ return false;
+
+ domain = __msi_create_irq_domain(fwnode, &bundle->info, IRQ_DOMAIN_FLAG_MSI_DEVICE, parent);
+ if (!domain)
+ return false;
+
+ dev->msi.data->__domains[domid].domain = domain;
+
+ if (msi_domain_prepare_irqs(domain, dev, hwsize, &bundle->alloc_info)) {
+ dev->msi.data->__domains[domid].domain = NULL;
+ irq_domain_remove(domain);
+ return false;
+ }
+
+ /* @bundle and @fwnode_alloced are now in use. Prevent cleanup */
+ retain_and_null_ptr(bundle);
+ retain_and_null_ptr(fwnode_alloced);
+ return true;
+}
+
+/**
+ * msi_remove_device_irq_domain - Free a device MSI interrupt domain
+ * @dev: Pointer to the device
+ * @domid: Domain id
+ */
+void msi_remove_device_irq_domain(struct device *dev, unsigned int domid)
+{
+ struct fwnode_handle *fwnode = NULL;
+ struct msi_domain_info *info;
+ struct irq_domain *domain;
+
+ guard(msi_descs_lock)(dev);
+ domain = msi_get_device_domain(dev, domid);
+ if (!domain || !irq_domain_is_msi_device(domain))
+ return;
+
+ dev->msi.data->__domains[domid].domain = NULL;
+ info = domain->host_data;
+
+ info->ops->msi_teardown(domain, info->alloc_data);
+
+ if (irq_domain_is_msi_device(domain))
+ fwnode = domain->fwnode;
+ irq_domain_remove(domain);
+ irq_domain_free_fwnode(fwnode);
+ kfree(container_of(info, struct msi_domain_template, info));
+}
+
+/**
+ * msi_match_device_irq_domain - Match a device irq domain against a bus token
+ * @dev: Pointer to the device
+ * @domid: Domain id
+ * @bus_token: Bus token to match against the domain bus token
*
- * Returns 0 on success or an error code.
+ * Return: True if device domain exists and bus tokens match.
*/
-int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
- int nvec)
+bool msi_match_device_irq_domain(struct device *dev, unsigned int domid,
+ enum irq_domain_bus_token bus_token)
+{
+ struct msi_domain_info *info;
+ struct irq_domain *domain;
+
+ guard(msi_descs_lock)(dev);
+ domain = msi_get_device_domain(dev, domid);
+ if (domain && irq_domain_is_msi_device(domain)) {
+ info = domain->host_data;
+ return info->bus_token == bus_token;
+ }
+ return false;
+}
+
+static int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev,
+ int nvec, msi_alloc_info_t *arg)
+{
+ struct msi_domain_info *info = domain->host_data;
+ struct msi_domain_ops *ops = info->ops;
+
+ return ops->msi_prepare(domain, dev, nvec, arg);
+}
+
+/*
+ * Carefully check whether the device can use reservation mode. If
+ * reservation mode is enabled then the early activation will assign a
+ * dummy vector to the device. If the PCI/MSI device does not support
+ * masking of the entry then this can result in spurious interrupts when
+ * the device driver is not absolutely careful. But even then a malfunction
+ * of the hardware could result in a spurious interrupt on the dummy vector
+ * and render the device unusable. If the entry can be masked then the core
+ * logic will prevent the spurious interrupt and reservation mode can be
+ * used. For now reservation mode is restricted to PCI/MSI.
+ */
+static bool msi_check_reservation_mode(struct irq_domain *domain,
+ struct msi_domain_info *info,
+ struct device *dev)
+{
+ struct msi_desc *desc;
+
+ switch(domain->bus_token) {
+ case DOMAIN_BUS_PCI_MSI:
+ case DOMAIN_BUS_PCI_DEVICE_MSI:
+ case DOMAIN_BUS_PCI_DEVICE_MSIX:
+ case DOMAIN_BUS_VMD_MSI:
+ break;
+ default:
+ return false;
+ }
+
+ if (!(info->flags & MSI_FLAG_MUST_REACTIVATE))
+ return false;
+
+ if (info->flags & MSI_FLAG_NO_MASK)
+ return false;
+
+ /*
+ * Checking the first MSI descriptor is sufficient. MSIX supports
+ * masking and MSI does so when the can_mask attribute is set.
+ */
+ desc = msi_first_desc(dev, MSI_DESC_ALL);
+ return desc->pci.msi_attrib.is_msix || desc->pci.msi_attrib.can_mask;
+}
+
+static int msi_handle_pci_fail(struct irq_domain *domain, struct msi_desc *desc,
+ int allocated)
+{
+ switch(domain->bus_token) {
+ case DOMAIN_BUS_PCI_MSI:
+ case DOMAIN_BUS_PCI_DEVICE_MSI:
+ case DOMAIN_BUS_PCI_DEVICE_MSIX:
+ case DOMAIN_BUS_VMD_MSI:
+ if (IS_ENABLED(CONFIG_PCI_MSI))
+ break;
+ fallthrough;
+ default:
+ return -ENOSPC;
+ }
+
+ /* Let a failed PCI multi MSI allocation retry */
+ if (desc->nvec_used > 1)
+ return 1;
+
+ /* If there was a successful allocation let the caller know */
+ return allocated ? allocated : -ENOSPC;
+}
+
+#define VIRQ_CAN_RESERVE 0x01
+#define VIRQ_ACTIVATE 0x02
+
+static int msi_init_virq(struct irq_domain *domain, int virq, unsigned int vflags)
+{
+ struct irq_data *irqd = irq_domain_get_irq_data(domain, virq);
+ int ret;
+
+ if (!(vflags & VIRQ_CAN_RESERVE)) {
+ irqd_clr_can_reserve(irqd);
+
+ /*
+ * If the interrupt is managed but no CPU is available to
+ * service it, shut it down until better times. Note that
+ * we only do this on the !RESERVE path as x86 (the only
+ * architecture using this flag) deals with this in a
+ * different way by using a catch-all vector.
+ */
+ if ((vflags & VIRQ_ACTIVATE) &&
+ irqd_affinity_is_managed(irqd) &&
+ !cpumask_intersects(irq_data_get_affinity_mask(irqd),
+ cpu_online_mask)) {
+ irqd_set_managed_shutdown(irqd);
+ return 0;
+ }
+ }
+
+ if (!(vflags & VIRQ_ACTIVATE))
+ return 0;
+
+ ret = irq_domain_activate_irq(irqd, vflags & VIRQ_CAN_RESERVE);
+ if (ret)
+ return ret;
+ /*
+ * If the interrupt uses reservation mode, clear the activated bit
+ * so request_irq() will assign the final vector.
+ */
+ if (vflags & VIRQ_CAN_RESERVE)
+ irqd_clr_activated(irqd);
+ return 0;
+}
+
+static int populate_alloc_info(struct irq_domain *domain, struct device *dev,
+ unsigned int nirqs, msi_alloc_info_t *arg)
{
struct msi_domain_info *info = domain->host_data;
+
+ /*
+ * If the caller has provided a template alloc info, use that. Once
+ * all users of msi_create_irq_domain() have been eliminated, this
+ * should be the only source of allocation information, and the
+ * prepare call below should be finally removed.
+ */
+ if (!info->alloc_data)
+ return msi_domain_prepare_irqs(domain, dev, nirqs, arg);
+
+ *arg = *info->alloc_data;
+ return 0;
+}
+
+static int __msi_domain_alloc_irqs(struct device *dev, struct irq_domain *domain,
+ struct msi_ctrl *ctrl)
+{
+ struct xarray *xa = &dev->msi.data->__domains[ctrl->domid].store;
+ struct msi_domain_info *info = domain->host_data;
struct msi_domain_ops *ops = info->ops;
- msi_alloc_info_t arg;
+ unsigned int vflags = 0, allocated = 0;
+ msi_alloc_info_t arg = { };
struct msi_desc *desc;
- int i, ret, virq = -1;
+ unsigned long idx;
+ int i, ret, virq;
- ret = ops->msi_check(domain, info, dev);
- if (ret == 0)
- ret = ops->msi_prepare(domain, dev, nvec, &arg);
+ ret = populate_alloc_info(domain, dev, ctrl->nirqs, &arg);
if (ret)
return ret;
- for_each_msi_entry(desc, dev) {
+ /*
+ * This flag is set by the PCI layer as we need to activate
+ * the MSI entries before the PCI layer enables MSI in the
+ * card. Otherwise the card latches a random msi message.
+ */
+ if (info->flags & MSI_FLAG_ACTIVATE_EARLY)
+ vflags |= VIRQ_ACTIVATE;
+
+ /*
+ * Interrupt can use a reserved vector and will not occupy
+ * a real device vector until the interrupt is requested.
+ */
+ if (msi_check_reservation_mode(domain, info, dev))
+ vflags |= VIRQ_CAN_RESERVE;
+
+ xa_for_each_range(xa, idx, desc, ctrl->first, ctrl->last) {
+ if (!msi_desc_match(desc, MSI_DESC_NOTASSOCIATED))
+ continue;
+
+ /* This should return -ECONFUSED... */
+ if (WARN_ON_ONCE(allocated >= ctrl->nirqs))
+ return -EINVAL;
+
+ if (ops->prepare_desc)
+ ops->prepare_desc(domain, &arg, desc);
+
ops->set_desc(&arg, desc);
- if (info->flags & MSI_FLAG_IDENTITY_MAP)
- virq = (int)ops->get_hwirq(info, &arg);
- else
- virq = -1;
-
- virq = __irq_domain_alloc_irqs(domain, virq, desc->nvec_used,
- dev_to_node(dev), &arg, false);
- if (virq < 0) {
- ret = -ENOSPC;
- if (ops->handle_error)
- ret = ops->handle_error(domain, desc, ret);
- if (ops->msi_finish)
- ops->msi_finish(&arg, ret);
- return ret;
- }
- for (i = 0; i < desc->nvec_used; i++)
+ virq = __irq_domain_alloc_irqs(domain, -1, desc->nvec_used,
+ dev_to_node(dev), &arg, false,
+ desc->affinity);
+ if (virq < 0)
+ return msi_handle_pci_fail(domain, desc, allocated);
+
+ for (i = 0; i < desc->nvec_used; i++) {
irq_set_msi_desc_off(virq, i, desc);
+ irq_debugfs_copy_devname(virq + i, dev);
+ ret = msi_init_virq(domain, virq + i, vflags);
+ if (ret)
+ return ret;
+ }
+ if (info->flags & MSI_FLAG_DEV_SYSFS) {
+ ret = msi_sysfs_populate_desc(dev, desc);
+ if (ret)
+ return ret;
+ }
+ allocated++;
+ }
+ return 0;
+}
+
+static int msi_domain_alloc_simple_msi_descs(struct device *dev,
+ struct msi_domain_info *info,
+ struct msi_ctrl *ctrl)
+{
+ if (!(info->flags & MSI_FLAG_ALLOC_SIMPLE_MSI_DESCS))
+ return 0;
+
+ return msi_domain_add_simple_msi_descs(dev, ctrl);
+}
+
+static int __msi_domain_alloc_locked(struct device *dev, struct msi_ctrl *ctrl)
+{
+ struct msi_domain_info *info;
+ struct msi_domain_ops *ops;
+ struct irq_domain *domain;
+ int ret;
+
+ if (!msi_ctrl_valid(dev, ctrl))
+ return -EINVAL;
+
+ domain = msi_get_device_domain(dev, ctrl->domid);
+ if (!domain)
+ return -ENODEV;
+
+ info = domain->host_data;
+
+ ret = msi_domain_alloc_simple_msi_descs(dev, info, ctrl);
+ if (ret)
+ return ret;
+
+ ops = info->ops;
+ if (ops->domain_alloc_irqs)
+ return ops->domain_alloc_irqs(domain, dev, ctrl->nirqs);
+
+ return __msi_domain_alloc_irqs(dev, domain, ctrl);
+}
+
+static int msi_domain_alloc_locked(struct device *dev, struct msi_ctrl *ctrl)
+{
+ int ret = __msi_domain_alloc_locked(dev, ctrl);
+
+ if (ret)
+ msi_domain_free_locked(dev, ctrl);
+ return ret;
+}
+
+/**
+ * msi_domain_alloc_irqs_range_locked - Allocate interrupts from a MSI interrupt domain
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are allocated
+ * @domid: Id of the interrupt domain to operate on
+ * @first: First index to allocate (inclusive)
+ * @last: Last index to allocate (inclusive)
+ *
+ * Must be invoked from within a msi_lock_descs() / msi_unlock_descs()
+ * pair. Use this for MSI irqdomains which implement their own descriptor
+ * allocation/free.
+ *
+ * Return: %0 on success or an error code.
+ */
+int msi_domain_alloc_irqs_range_locked(struct device *dev, unsigned int domid,
+ unsigned int first, unsigned int last)
+{
+ struct msi_ctrl ctrl = {
+ .domid = domid,
+ .first = first,
+ .last = last,
+ .nirqs = last + 1 - first,
+ };
+
+ return msi_domain_alloc_locked(dev, &ctrl);
+}
+
+/**
+ * msi_domain_alloc_irqs_range - Allocate interrupts from a MSI interrupt domain
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are allocated
+ * @domid: Id of the interrupt domain to operate on
+ * @first: First index to allocate (inclusive)
+ * @last: Last index to allocate (inclusive)
+ *
+ * Return: %0 on success or an error code.
+ */
+int msi_domain_alloc_irqs_range(struct device *dev, unsigned int domid,
+ unsigned int first, unsigned int last)
+{
+
+ guard(msi_descs_lock)(dev);
+ return msi_domain_alloc_irqs_range_locked(dev, domid, first, last);
+}
+EXPORT_SYMBOL_GPL(msi_domain_alloc_irqs_range);
+
+/**
+ * msi_domain_alloc_irqs_all_locked - Allocate all interrupts from a MSI interrupt domain
+ *
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are allocated
+ * @domid: Id of the interrupt domain to operate on
+ * @nirqs: The number of interrupts to allocate
+ *
+ * This function scans all MSI descriptors of the MSI domain and allocates interrupts
+ * for all unassigned ones. That function is to be used for MSI domain usage where
+ * the descriptor allocation is handled at the call site, e.g. PCI/MSI[X].
+ *
+ * Return: %0 on success or an error code.
+ */
+int msi_domain_alloc_irqs_all_locked(struct device *dev, unsigned int domid, int nirqs)
+{
+ struct msi_ctrl ctrl = {
+ .domid = domid,
+ .first = 0,
+ .last = msi_domain_get_hwsize(dev, domid) - 1,
+ .nirqs = nirqs,
+ };
+
+ return msi_domain_alloc_locked(dev, &ctrl);
+}
+
+static struct msi_map __msi_domain_alloc_irq_at(struct device *dev, unsigned int domid,
+ unsigned int index,
+ const struct irq_affinity_desc *affdesc,
+ union msi_instance_cookie *icookie)
+{
+ struct msi_ctrl ctrl = { .domid = domid, .nirqs = 1, };
+ struct irq_domain *domain;
+ struct msi_map map = { };
+ struct msi_desc *desc;
+ int ret;
+
+ domain = msi_get_device_domain(dev, domid);
+ if (!domain) {
+ map.index = -ENODEV;
+ return map;
+ }
+
+ desc = msi_alloc_desc(dev, 1, affdesc);
+ if (!desc) {
+ map.index = -ENOMEM;
+ return map;
}
- if (ops->msi_finish)
- ops->msi_finish(&arg, 0);
+ if (icookie)
+ desc->data.icookie = *icookie;
- for_each_msi_entry(desc, dev) {
- if (desc->nvec_used == 1)
- dev_dbg(dev, "irq %d for MSI\n", virq);
- else
- dev_dbg(dev, "irq [%d-%d] for MSI\n",
- virq, virq + desc->nvec_used - 1);
+ ret = msi_insert_desc(dev, desc, domid, index);
+ if (ret) {
+ map.index = ret;
+ return map;
}
- return 0;
+ ctrl.first = ctrl.last = desc->msi_index;
+
+ ret = __msi_domain_alloc_irqs(dev, domain, &ctrl);
+ if (ret) {
+ map.index = ret;
+ msi_domain_free_locked(dev, &ctrl);
+ } else {
+ map.index = desc->msi_index;
+ map.virq = desc->irq;
+ }
+ return map;
}
/**
- * msi_domain_free_irqs - Free interrupts from a MSI interrupt @domain associated tp @dev
- * @domain: The domain to managing the interrupts
+ * msi_domain_alloc_irq_at - Allocate an interrupt from a MSI interrupt domain at
+ * a given index - or at the next free index
+ *
* @dev: Pointer to device struct of the device for which the interrupts
- * are free
+ * are allocated
+ * @domid: Id of the interrupt domain to operate on
+ * @index: Index for allocation. If @index == %MSI_ANY_INDEX the allocation
+ * uses the next free index.
+ * @affdesc: Optional pointer to an interrupt affinity descriptor structure
+ * @icookie: Optional pointer to a domain specific per instance cookie. If
+ * non-NULL the content of the cookie is stored in msi_desc::data.
+ * Must be NULL for MSI-X allocations
+ *
+ * This requires a MSI interrupt domain which lets the core code manage the
+ * MSI descriptors.
+ *
+ * Return: struct msi_map
+ *
+ * On success msi_map::index contains the allocated index number and
+ * msi_map::virq the corresponding Linux interrupt number
+ *
+ * On failure msi_map::index contains the error code and msi_map::virq
+ * is %0.
+ */
+struct msi_map msi_domain_alloc_irq_at(struct device *dev, unsigned int domid, unsigned int index,
+ const struct irq_affinity_desc *affdesc,
+ union msi_instance_cookie *icookie)
+{
+ guard(msi_descs_lock)(dev);
+ return __msi_domain_alloc_irq_at(dev, domid, index, affdesc, icookie);
+}
+
+/**
+ * msi_device_domain_alloc_wired - Allocate a "wired" interrupt on @domain
+ * @domain: The domain to allocate on
+ * @hwirq: The hardware interrupt number to allocate for
+ * @type: The interrupt type
+ *
+ * This weirdness supports wire to MSI controllers like MBIGEN.
+ *
+ * @hwirq is the hardware interrupt number which is handed in from
+ * irq_create_fwspec_mapping(). As the wire to MSI domain is sparse, but
+ * sized in firmware, the hardware interrupt number cannot be used as MSI
+ * index. For the underlying irq chip the MSI index is irrelevant and
+ * all it needs is the hardware interrupt number.
+ *
+ * To handle this the MSI index is allocated with MSI_ANY_INDEX and the
+ * hardware interrupt number is stored along with the type information in
+ * msi_desc::cookie so the underlying interrupt chip and domain code can
+ * retrieve it.
+ *
+ * Return: The Linux interrupt number (> 0) or an error code
*/
-void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
+int msi_device_domain_alloc_wired(struct irq_domain *domain, unsigned int hwirq,
+ unsigned int type)
{
+ unsigned int domid = MSI_DEFAULT_DOMAIN;
+ union msi_instance_cookie icookie = { };
+ struct device *dev = domain->dev;
+ struct msi_map map = { };
+
+ if (WARN_ON_ONCE(!dev || domain->bus_token != DOMAIN_BUS_WIRED_TO_MSI))
+ return -EINVAL;
+
+ icookie.value = ((u64)type << 32) | hwirq;
+
+ guard(msi_descs_lock)(dev);
+ if (WARN_ON_ONCE(msi_get_device_domain(dev, domid) != domain))
+ map.index = -EINVAL;
+ else
+ map = __msi_domain_alloc_irq_at(dev, domid, MSI_ANY_INDEX, NULL, &icookie);
+ return map.index >= 0 ? map.virq : map.index;
+}
+
+static void __msi_domain_free_irqs(struct device *dev, struct irq_domain *domain,
+ struct msi_ctrl *ctrl)
+{
+ struct xarray *xa = &dev->msi.data->__domains[ctrl->domid].store;
+ struct msi_domain_info *info = domain->host_data;
+ struct irq_data *irqd;
struct msi_desc *desc;
+ unsigned long idx;
+ int i;
- for_each_msi_entry(desc, dev) {
- /*
- * We might have failed to allocate an MSI early
- * enough that there is no IRQ associated to this
- * entry. If that's the case, don't do anything.
- */
- if (desc->irq) {
- irq_domain_free_irqs(desc->irq, desc->nvec_used);
- desc->irq = 0;
+ xa_for_each_range(xa, idx, desc, ctrl->first, ctrl->last) {
+ /* Only handle MSI entries which have an interrupt associated */
+ if (!msi_desc_match(desc, MSI_DESC_ASSOCIATED))
+ continue;
+
+ /* Make sure all interrupts are deactivated */
+ for (i = 0; i < desc->nvec_used; i++) {
+ irqd = irq_domain_get_irq_data(domain, desc->irq + i);
+ if (irqd && irqd_is_activated(irqd))
+ irq_domain_deactivate_irq(irqd);
}
+
+ irq_domain_free_irqs(desc->irq, desc->nvec_used);
+ if (info->flags & MSI_FLAG_DEV_SYSFS)
+ msi_sysfs_remove_desc(dev, desc);
+ desc->irq = 0;
}
}
+static void msi_domain_free_locked(struct device *dev, struct msi_ctrl *ctrl)
+{
+ struct msi_domain_info *info;
+ struct msi_domain_ops *ops;
+ struct irq_domain *domain;
+
+ if (!msi_ctrl_valid(dev, ctrl))
+ return;
+
+ domain = msi_get_device_domain(dev, ctrl->domid);
+ if (!domain)
+ return;
+
+ info = domain->host_data;
+ ops = info->ops;
+
+ if (ops->domain_free_irqs)
+ ops->domain_free_irqs(domain, dev);
+ else
+ __msi_domain_free_irqs(dev, domain, ctrl);
+
+ if (info->flags & MSI_FLAG_FREE_MSI_DESCS)
+ msi_domain_free_descs(dev, ctrl);
+}
+
+/**
+ * msi_domain_free_irqs_range_locked - Free a range of interrupts from a MSI interrupt domain
+ * associated to @dev with msi_lock held
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are freed
+ * @domid: Id of the interrupt domain to operate on
+ * @first: First index to free (inclusive)
+ * @last: Last index to free (inclusive)
+ */
+void msi_domain_free_irqs_range_locked(struct device *dev, unsigned int domid,
+ unsigned int first, unsigned int last)
+{
+ struct msi_ctrl ctrl = {
+ .domid = domid,
+ .first = first,
+ .last = last,
+ };
+ msi_domain_free_locked(dev, &ctrl);
+}
+
+/**
+ * msi_domain_free_irqs_range - Free a range of interrupts from a MSI interrupt domain
+ * associated to @dev
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are freed
+ * @domid: Id of the interrupt domain to operate on
+ * @first: First index to free (inclusive)
+ * @last: Last index to free (inclusive)
+ */
+void msi_domain_free_irqs_range(struct device *dev, unsigned int domid,
+ unsigned int first, unsigned int last)
+{
+ guard(msi_descs_lock)(dev);
+ msi_domain_free_irqs_range_locked(dev, domid, first, last);
+}
+EXPORT_SYMBOL_GPL(msi_domain_free_irqs_all);
+
+/**
+ * msi_domain_free_irqs_all_locked - Free all interrupts from a MSI interrupt domain
+ * associated to a device
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are freed
+ * @domid: The id of the domain to operate on
+ *
+ * Must be invoked from within a msi_lock_descs() / msi_unlock_descs()
+ * pair. Use this for MSI irqdomains which implement their own vector
+ * allocation.
+ */
+void msi_domain_free_irqs_all_locked(struct device *dev, unsigned int domid)
+{
+ msi_domain_free_irqs_range_locked(dev, domid, 0,
+ msi_domain_get_hwsize(dev, domid) - 1);
+}
+
+/**
+ * msi_domain_free_irqs_all - Free all interrupts from a MSI interrupt domain
+ * associated to a device
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are freed
+ * @domid: The id of the domain to operate on
+ */
+void msi_domain_free_irqs_all(struct device *dev, unsigned int domid)
+{
+ guard(msi_descs_lock)(dev);
+ msi_domain_free_irqs_all_locked(dev, domid);
+}
+
+/**
+ * msi_device_domain_free_wired - Free a wired interrupt in @domain
+ * @domain: The domain to free the interrupt on
+ * @virq: The Linux interrupt number to free
+ *
+ * This is the counterpart of msi_device_domain_alloc_wired() for the
+ * weird wired to MSI converting domains.
+ */
+void msi_device_domain_free_wired(struct irq_domain *domain, unsigned int virq)
+{
+ struct msi_desc *desc = irq_get_msi_desc(virq);
+ struct device *dev = domain->dev;
+
+ if (WARN_ON_ONCE(!dev || !desc || domain->bus_token != DOMAIN_BUS_WIRED_TO_MSI))
+ return;
+
+ guard(msi_descs_lock)(dev);
+ if (WARN_ON_ONCE(msi_get_device_domain(dev, MSI_DEFAULT_DOMAIN) != domain))
+ return;
+ msi_domain_free_irqs_range_locked(dev, MSI_DEFAULT_DOMAIN, desc->msi_index,
+ desc->msi_index);
+}
+
/**
* msi_get_domain_info - Get the MSI interrupt domain info for @domain
* @domain: The interrupt domain to retrieve data from
*
- * Returns the pointer to the msi_domain_info stored in
- * @domain->host_data.
+ * Return: the pointer to the msi_domain_info stored in @domain->host_data.
*/
struct msi_domain_info *msi_get_domain_info(struct irq_domain *domain)
{
return (struct msi_domain_info *)domain->host_data;
}
-#endif /* CONFIG_GENERIC_MSI_IRQ_DOMAIN */
+/**
+ * msi_device_has_isolated_msi - True if the device has isolated MSI
+ * @dev: The device to check
+ *
+ * Isolated MSI means that HW modeled by an irq_domain on the path from the
+ * initiating device to the CPU will validate that the MSI message specifies an
+ * interrupt number that the device is authorized to trigger. This must block
+ * devices from triggering interrupts they are not authorized to trigger.
+ * Currently authorization means the MSI vector is one assigned to the device.
+ *
+ * This is interesting for securing VFIO use cases where a rouge MSI (eg created
+ * by abusing a normal PCI MemWr DMA) must not allow the VFIO userspace to
+ * impact outside its security domain, eg userspace triggering interrupts on
+ * kernel drivers, a VM triggering interrupts on the hypervisor, or a VM
+ * triggering interrupts on another VM.
+ */
+bool msi_device_has_isolated_msi(struct device *dev)
+{
+ struct irq_domain *domain = dev_get_msi_domain(dev);
+
+ for (; domain; domain = domain->parent)
+ if (domain->flags & IRQ_DOMAIN_FLAG_ISOLATED_MSI)
+ return true;
+ return arch_is_isolated_msi();
+}
+EXPORT_SYMBOL_GPL(msi_device_has_isolated_msi);
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index 5204a6d1b985..99ff65466d87 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -1,6 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0
/*
- * linux/kernel/irq/pm.c
- *
* Copyright (C) 2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
*
* This file contains power management functions related to interrupts.
@@ -14,17 +13,13 @@
#include "internals.h"
-bool irq_pm_check_wakeup(struct irq_desc *desc)
+void irq_pm_handle_wakeup(struct irq_desc *desc)
{
- if (irqd_is_wakeup_armed(&desc->irq_data)) {
- irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED);
- desc->istate |= IRQS_SUSPENDED | IRQS_PENDING;
- desc->depth++;
- irq_disable(desc);
- pm_system_wakeup();
- return true;
- }
- return false;
+ irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED);
+ desc->istate |= IRQS_SUSPENDED | IRQS_PENDING;
+ desc->depth++;
+ irq_disable(desc);
+ pm_system_irq_wakeup(irq_desc_get_irq(desc));
}
/*
@@ -47,8 +42,7 @@ void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action)
desc->cond_suspend_depth++;
WARN_ON_ONCE(desc->no_suspend_depth &&
- (desc->no_suspend_depth +
- desc->cond_suspend_depth) != desc->nr_actions);
+ (desc->no_suspend_depth + desc->cond_suspend_depth) != desc->nr_actions);
}
/*
@@ -68,13 +62,28 @@ void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action)
desc->cond_suspend_depth--;
}
-static bool suspend_device_irq(struct irq_desc *desc, int irq)
+static bool suspend_device_irq(struct irq_desc *desc)
{
- if (!desc->action || desc->no_suspend_depth)
+ unsigned long chipflags = irq_desc_get_chip(desc)->flags;
+ struct irq_data *irqd = &desc->irq_data;
+
+ if (!desc->action || irq_desc_is_chained(desc) ||
+ desc->no_suspend_depth)
return false;
- if (irqd_is_wakeup_set(&desc->irq_data)) {
- irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED);
+ if (irqd_is_wakeup_set(irqd)) {
+ irqd_set(irqd, IRQD_WAKEUP_ARMED);
+
+ if ((chipflags & IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND) &&
+ irqd_irq_disabled(irqd)) {
+ /*
+ * Interrupt marked for wakeup is in disabled state.
+ * Enable interrupt here to unmask/enable in irqchip
+ * to be able to resume with such interrupts.
+ */
+ __enable_irq(desc);
+ irqd_set(irqd, IRQD_IRQ_ENABLED_ON_SUSPEND);
+ }
/*
* We return true here to force the caller to issue
* synchronize_irq(). We need to make sure that the
@@ -85,7 +94,7 @@ static bool suspend_device_irq(struct irq_desc *desc, int irq)
}
desc->istate |= IRQS_SUSPENDED;
- __disable_irq(desc, irq);
+ __disable_irq(desc);
/*
* Hardware which has no wakeup source configuration facility
@@ -93,7 +102,7 @@ static bool suspend_device_irq(struct irq_desc *desc, int irq)
* chip level. The chip implementation indicates that with
* IRQCHIP_MASK_ON_SUSPEND.
*/
- if (irq_desc_get_chip(desc)->flags & IRQCHIP_MASK_ON_SUSPEND)
+ if (chipflags & IRQCHIP_MASK_ON_SUSPEND)
mask_irq(desc);
return true;
}
@@ -120,22 +129,33 @@ void suspend_device_irqs(void)
int irq;
for_each_irq_desc(irq, desc) {
- unsigned long flags;
bool sync;
- raw_spin_lock_irqsave(&desc->lock, flags);
- sync = suspend_device_irq(desc, irq);
- raw_spin_unlock_irqrestore(&desc->lock, flags);
+ if (irq_settings_is_nested_thread(desc))
+ continue;
+ scoped_guard(raw_spinlock_irqsave, &desc->lock)
+ sync = suspend_device_irq(desc);
if (sync)
synchronize_irq(irq);
}
}
-EXPORT_SYMBOL_GPL(suspend_device_irqs);
-static void resume_irq(struct irq_desc *desc, int irq)
+static void resume_irq(struct irq_desc *desc)
{
- irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED);
+ struct irq_data *irqd = &desc->irq_data;
+
+ irqd_clear(irqd, IRQD_WAKEUP_ARMED);
+
+ if (irqd_is_enabled_on_suspend(irqd)) {
+ /*
+ * Interrupt marked for wakeup was enabled during suspend
+ * entry. Disable such interrupts to restore them back to
+ * original state.
+ */
+ __disable_irq(desc);
+ irqd_clear(irqd, IRQD_IRQ_ENABLED_ON_SUSPEND);
+ }
if (desc->istate & IRQS_SUSPENDED)
goto resume;
@@ -146,9 +166,11 @@ static void resume_irq(struct irq_desc *desc, int irq)
/* Pretend that it got disabled ! */
desc->depth++;
+ irq_state_set_disabled(desc);
+ irq_state_set_masked(desc);
resume:
desc->istate &= ~IRQS_SUSPENDED;
- __enable_irq(desc, irq);
+ __enable_irq(desc);
}
static void resume_irqs(bool want_early)
@@ -157,36 +179,58 @@ static void resume_irqs(bool want_early)
int irq;
for_each_irq_desc(irq, desc) {
- unsigned long flags;
- bool is_early = desc->action &&
- desc->action->flags & IRQF_EARLY_RESUME;
+ bool is_early = desc->action && desc->action->flags & IRQF_EARLY_RESUME;
if (!is_early && want_early)
continue;
+ if (irq_settings_is_nested_thread(desc))
+ continue;
- raw_spin_lock_irqsave(&desc->lock, flags);
- resume_irq(desc, irq);
- raw_spin_unlock_irqrestore(&desc->lock, flags);
+ guard(raw_spinlock_irqsave)(&desc->lock);
+ resume_irq(desc);
}
}
/**
- * irq_pm_syscore_ops - enable interrupt lines early
+ * rearm_wake_irq - rearm a wakeup interrupt line after signaling wakeup
+ * @irq: Interrupt to rearm
+ */
+void rearm_wake_irq(unsigned int irq)
+{
+ scoped_irqdesc_get_and_buslock(irq, IRQ_GET_DESC_CHECK_GLOBAL) {
+ struct irq_desc *desc = scoped_irqdesc;
+
+ if (!(desc->istate & IRQS_SUSPENDED) || !irqd_is_wakeup_set(&desc->irq_data))
+ return;
+
+ desc->istate &= ~IRQS_SUSPENDED;
+ irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED);
+ __enable_irq(desc);
+ }
+}
+
+/**
+ * irq_pm_syscore_resume - enable interrupt lines early
+ * @data: syscore context
*
* Enable all interrupt lines with %IRQF_EARLY_RESUME set.
*/
-static void irq_pm_syscore_resume(void)
+static void irq_pm_syscore_resume(void *data)
{
resume_irqs(true);
}
-static struct syscore_ops irq_pm_syscore_ops = {
+static const struct syscore_ops irq_pm_syscore_ops = {
.resume = irq_pm_syscore_resume,
};
+static struct syscore irq_pm_syscore = {
+ .ops = &irq_pm_syscore_ops,
+};
+
static int __init irq_pm_init_ops(void)
{
- register_syscore_ops(&irq_pm_syscore_ops);
+ register_syscore(&irq_pm_syscore);
return 0;
}
@@ -203,4 +247,3 @@ void resume_device_irqs(void)
{
resume_irqs(false);
}
-EXPORT_SYMBOL_GPL(resume_device_irqs);
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index df2f4642d1e7..77258eafbf63 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -1,6 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0
/*
- * linux/kernel/irq/proc.c
- *
* Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
*
* This file contains the /proc/irq/ handling code.
@@ -12,6 +11,7 @@
#include <linux/seq_file.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
+#include <linux/mutex.h>
#include "internals.h"
@@ -36,69 +36,113 @@ static struct proc_dir_entry *root_irq_dir;
#ifdef CONFIG_SMP
-static int show_irq_affinity(int type, struct seq_file *m, void *v)
+enum {
+ AFFINITY,
+ AFFINITY_LIST,
+ EFFECTIVE,
+ EFFECTIVE_LIST,
+};
+
+static int show_irq_affinity(int type, struct seq_file *m)
{
struct irq_desc *desc = irq_to_desc((long)m->private);
- const struct cpumask *mask = desc->irq_data.affinity;
-
-#ifdef CONFIG_GENERIC_PENDING_IRQ
- if (irqd_is_setaffinity_pending(&desc->irq_data))
- mask = desc->pending_mask;
+ const struct cpumask *mask;
+
+ guard(raw_spinlock_irq)(&desc->lock);
+
+ switch (type) {
+ case AFFINITY:
+ case AFFINITY_LIST:
+ mask = desc->irq_common_data.affinity;
+ if (irq_move_pending(&desc->irq_data))
+ mask = irq_desc_get_pending_mask(desc);
+ break;
+ case EFFECTIVE:
+ case EFFECTIVE_LIST:
+#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
+ mask = irq_data_get_effective_affinity_mask(&desc->irq_data);
+ break;
#endif
- if (type)
+ default:
+ return -EINVAL;
+ }
+
+ switch (type) {
+ case AFFINITY_LIST:
+ case EFFECTIVE_LIST:
seq_printf(m, "%*pbl\n", cpumask_pr_args(mask));
- else
+ break;
+ case AFFINITY:
+ case EFFECTIVE:
seq_printf(m, "%*pb\n", cpumask_pr_args(mask));
+ break;
+ }
return 0;
}
static int irq_affinity_hint_proc_show(struct seq_file *m, void *v)
{
struct irq_desc *desc = irq_to_desc((long)m->private);
- unsigned long flags;
cpumask_var_t mask;
if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
return -ENOMEM;
- raw_spin_lock_irqsave(&desc->lock, flags);
- if (desc->affinity_hint)
- cpumask_copy(mask, desc->affinity_hint);
- raw_spin_unlock_irqrestore(&desc->lock, flags);
+ scoped_guard(raw_spinlock_irq, &desc->lock) {
+ if (desc->affinity_hint)
+ cpumask_copy(mask, desc->affinity_hint);
+ }
seq_printf(m, "%*pb\n", cpumask_pr_args(mask));
free_cpumask_var(mask);
-
return 0;
}
-#ifndef is_affinity_mask_valid
-#define is_affinity_mask_valid(val) 1
-#endif
-
int no_irq_affinity;
static int irq_affinity_proc_show(struct seq_file *m, void *v)
{
- return show_irq_affinity(0, m, v);
+ return show_irq_affinity(AFFINITY, m);
}
static int irq_affinity_list_proc_show(struct seq_file *m, void *v)
{
- return show_irq_affinity(1, m, v);
+ return show_irq_affinity(AFFINITY_LIST, m);
}
+#ifndef CONFIG_AUTO_IRQ_AFFINITY
+static inline int irq_select_affinity_usr(unsigned int irq)
+{
+ /*
+ * If the interrupt is started up already then this fails. The
+ * interrupt is assigned to an online CPU already. There is no
+ * point to move it around randomly. Tell user space that the
+ * selected mask is bogus.
+ *
+ * If not then any change to the affinity is pointless because the
+ * startup code invokes irq_setup_affinity() which will select
+ * a online CPU anyway.
+ */
+ return -EINVAL;
+}
+#else
+/* ALPHA magic affinity auto selector. Keep it for historical reasons. */
+static inline int irq_select_affinity_usr(unsigned int irq)
+{
+ return irq_select_affinity(irq);
+}
+#endif
static ssize_t write_irq_affinity(int type, struct file *file,
const char __user *buffer, size_t count, loff_t *pos)
{
- unsigned int irq = (int)(long)PDE_DATA(file_inode(file));
+ unsigned int irq = (int)(long)pde_data(file_inode(file));
cpumask_var_t new_value;
int err;
- if (!irq_can_set_affinity(irq) || no_irq_affinity)
- return -EIO;
+ if (!irq_can_set_affinity_usr(irq) || no_irq_affinity)
+ return -EPERM;
- if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
+ if (!zalloc_cpumask_var(&new_value, GFP_KERNEL))
return -ENOMEM;
if (type)
@@ -108,23 +152,21 @@ static ssize_t write_irq_affinity(int type, struct file *file,
if (err)
goto free_cpumask;
- if (!is_affinity_mask_valid(new_value)) {
- err = -EINVAL;
- goto free_cpumask;
- }
-
/*
* Do not allow disabling IRQs completely - it's a too easy
* way to make the system unusable accidentally :-) At least
* one online CPU still has to be targeted.
*/
if (!cpumask_intersects(new_value, cpu_online_mask)) {
- /* Special case for empty set - allow the architecture
- code to set default SMP affinity. */
- err = irq_select_affinity_usr(irq, new_value) ? -EINVAL : count;
+ /*
+ * Special case for empty set - allow the architecture code
+ * to set default SMP affinity.
+ */
+ err = irq_select_affinity_usr(irq) ? -EINVAL : count;
} else {
- irq_set_affinity(irq, new_value);
- err = count;
+ err = irq_set_affinity(irq, new_value);
+ if (!err)
+ err = count;
}
free_cpumask:
@@ -146,41 +188,41 @@ static ssize_t irq_affinity_list_proc_write(struct file *file,
static int irq_affinity_proc_open(struct inode *inode, struct file *file)
{
- return single_open(file, irq_affinity_proc_show, PDE_DATA(inode));
+ return single_open(file, irq_affinity_proc_show, pde_data(inode));
}
static int irq_affinity_list_proc_open(struct inode *inode, struct file *file)
{
- return single_open(file, irq_affinity_list_proc_show, PDE_DATA(inode));
+ return single_open(file, irq_affinity_list_proc_show, pde_data(inode));
}
-static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file)
-{
- return single_open(file, irq_affinity_hint_proc_show, PDE_DATA(inode));
-}
-
-static const struct file_operations irq_affinity_proc_fops = {
- .open = irq_affinity_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
- .write = irq_affinity_proc_write,
+static const struct proc_ops irq_affinity_proc_ops = {
+ .proc_open = irq_affinity_proc_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = single_release,
+ .proc_write = irq_affinity_proc_write,
};
-static const struct file_operations irq_affinity_hint_proc_fops = {
- .open = irq_affinity_hint_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
+static const struct proc_ops irq_affinity_list_proc_ops = {
+ .proc_open = irq_affinity_list_proc_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = single_release,
+ .proc_write = irq_affinity_list_proc_write,
};
-static const struct file_operations irq_affinity_list_proc_fops = {
- .open = irq_affinity_list_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
- .write = irq_affinity_list_proc_write,
-};
+#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
+static int irq_effective_aff_proc_show(struct seq_file *m, void *v)
+{
+ return show_irq_affinity(EFFECTIVE, m);
+}
+
+static int irq_effective_aff_list_proc_show(struct seq_file *m, void *v)
+{
+ return show_irq_affinity(EFFECTIVE_LIST, m);
+}
+#endif
static int default_affinity_show(struct seq_file *m, void *v)
{
@@ -194,18 +236,13 @@ static ssize_t default_affinity_write(struct file *file,
cpumask_var_t new_value;
int err;
- if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
+ if (!zalloc_cpumask_var(&new_value, GFP_KERNEL))
return -ENOMEM;
err = cpumask_parse_user(buffer, count, new_value);
if (err)
goto out;
- if (!is_affinity_mask_valid(new_value)) {
- err = -EINVAL;
- goto out;
- }
-
/*
* Do not allow disabling IRQs completely - it's a too easy
* way to make the system unusable accidentally :-) At least
@@ -226,36 +263,24 @@ out:
static int default_affinity_open(struct inode *inode, struct file *file)
{
- return single_open(file, default_affinity_show, PDE_DATA(inode));
+ return single_open(file, default_affinity_show, pde_data(inode));
}
-static const struct file_operations default_affinity_proc_fops = {
- .open = default_affinity_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
- .write = default_affinity_write,
+static const struct proc_ops default_affinity_proc_ops = {
+ .proc_open = default_affinity_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = single_release,
+ .proc_write = default_affinity_write,
};
static int irq_node_proc_show(struct seq_file *m, void *v)
{
struct irq_desc *desc = irq_to_desc((long) m->private);
- seq_printf(m, "%d\n", desc->irq_data.node);
+ seq_printf(m, "%d\n", irq_desc_get_node(desc));
return 0;
}
-
-static int irq_node_proc_open(struct inode *inode, struct file *file)
-{
- return single_open(file, irq_node_proc_show, PDE_DATA(inode));
-}
-
-static const struct file_operations irq_node_proc_fops = {
- .open = irq_node_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
#endif
static int irq_spurious_proc_show(struct seq_file *m, void *v)
@@ -268,49 +293,30 @@ static int irq_spurious_proc_show(struct seq_file *m, void *v)
return 0;
}
-static int irq_spurious_proc_open(struct inode *inode, struct file *file)
-{
- return single_open(file, irq_spurious_proc_show, PDE_DATA(inode));
-}
-
-static const struct file_operations irq_spurious_proc_fops = {
- .open = irq_spurious_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
#define MAX_NAMELEN 128
-static int name_unique(unsigned int irq, struct irqaction *new_action)
+static bool name_unique(unsigned int irq, struct irqaction *new_action)
{
struct irq_desc *desc = irq_to_desc(irq);
struct irqaction *action;
- unsigned long flags;
- int ret = 1;
- raw_spin_lock_irqsave(&desc->lock, flags);
- for (action = desc->action ; action; action = action->next) {
+ guard(raw_spinlock_irq)(&desc->lock);
+ for_each_action_of_desc(desc, action) {
if ((action != new_action) && action->name &&
- !strcmp(new_action->name, action->name)) {
- ret = 0;
- break;
- }
+ !strcmp(new_action->name, action->name))
+ return false;
}
- raw_spin_unlock_irqrestore(&desc->lock, flags);
- return ret;
+ return true;
}
void register_handler_proc(unsigned int irq, struct irqaction *action)
{
- char name [MAX_NAMELEN];
+ char name[MAX_NAMELEN];
struct irq_desc *desc = irq_to_desc(irq);
- if (!desc->dir || action->dir || !action->name ||
- !name_unique(irq, action))
+ if (!desc->dir || action->dir || !action->name || !name_unique(irq, action))
return;
- memset(name, 0, MAX_NAMELEN);
snprintf(name, MAX_NAMELEN, "%s", action->name);
/* create /proc/irq/1234/handler/ */
@@ -323,38 +329,57 @@ void register_handler_proc(unsigned int irq, struct irqaction *action)
void register_irq_proc(unsigned int irq, struct irq_desc *desc)
{
+ static DEFINE_MUTEX(register_lock);
+ void __maybe_unused *irqp = (void *)(unsigned long) irq;
char name [MAX_NAMELEN];
- if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip) || desc->dir)
+ if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip))
return;
- memset(name, 0, MAX_NAMELEN);
- sprintf(name, "%d", irq);
+ /*
+ * irq directories are registered only when a handler is
+ * added, not when the descriptor is created, so multiple
+ * tasks might try to register at the same time.
+ */
+ guard(mutex)(&register_lock);
+
+ if (desc->dir)
+ return;
/* create /proc/irq/1234 */
+ sprintf(name, "%u", irq);
desc->dir = proc_mkdir(name, root_irq_dir);
if (!desc->dir)
return;
#ifdef CONFIG_SMP
+ umode_t umode = S_IRUGO;
+
+ if (irq_can_set_affinity_usr(desc->irq_data.irq))
+ umode |= S_IWUSR;
+
/* create /proc/irq/<irq>/smp_affinity */
- proc_create_data("smp_affinity", 0644, desc->dir,
- &irq_affinity_proc_fops, (void *)(long)irq);
+ proc_create_data("smp_affinity", umode, desc->dir, &irq_affinity_proc_ops, irqp);
/* create /proc/irq/<irq>/affinity_hint */
- proc_create_data("affinity_hint", 0444, desc->dir,
- &irq_affinity_hint_proc_fops, (void *)(long)irq);
+ proc_create_single_data("affinity_hint", 0444, desc->dir,
+ irq_affinity_hint_proc_show, irqp);
/* create /proc/irq/<irq>/smp_affinity_list */
- proc_create_data("smp_affinity_list", 0644, desc->dir,
- &irq_affinity_list_proc_fops, (void *)(long)irq);
-
- proc_create_data("node", 0444, desc->dir,
- &irq_node_proc_fops, (void *)(long)irq);
+ proc_create_data("smp_affinity_list", umode, desc->dir,
+ &irq_affinity_list_proc_ops, irqp);
+
+ proc_create_single_data("node", 0444, desc->dir, irq_node_proc_show, irqp);
+# ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
+ proc_create_single_data("effective_affinity", 0444, desc->dir,
+ irq_effective_aff_proc_show, irqp);
+ proc_create_single_data("effective_affinity_list", 0444, desc->dir,
+ irq_effective_aff_list_proc_show, irqp);
+# endif
#endif
+ proc_create_single_data("spurious", 0444, desc->dir,
+ irq_spurious_proc_show, (void *)(long)irq);
- proc_create_data("spurious", 0444, desc->dir,
- &irq_spurious_proc_fops, (void *)(long)irq);
}
void unregister_irq_proc(unsigned int irq, struct irq_desc *desc)
@@ -368,10 +393,13 @@ void unregister_irq_proc(unsigned int irq, struct irq_desc *desc)
remove_proc_entry("affinity_hint", desc->dir);
remove_proc_entry("smp_affinity_list", desc->dir);
remove_proc_entry("node", desc->dir);
+# ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
+ remove_proc_entry("effective_affinity", desc->dir);
+ remove_proc_entry("effective_affinity_list", desc->dir);
+# endif
#endif
remove_proc_entry("spurious", desc->dir);
- memset(name, 0, MAX_NAMELEN);
sprintf(name, "%u", irq);
remove_proc_entry(name, root_irq_dir);
}
@@ -387,7 +415,7 @@ static void register_default_affinity_proc(void)
{
#ifdef CONFIG_SMP
proc_create("irq/default_smp_affinity", 0644, NULL,
- &default_affinity_proc_fops);
+ &default_affinity_proc_ops);
#endif
}
@@ -406,12 +434,8 @@ void init_irq_proc(void)
/*
* Create entries for all existing IRQs.
*/
- for_each_irq_desc(irq, desc) {
- if (!desc)
- continue;
-
+ for_each_irq_desc(irq, desc)
register_irq_proc(irq, desc);
- }
}
#ifdef CONFIG_GENERIC_IRQ_SHOW
@@ -422,14 +446,14 @@ int __weak arch_show_interrupts(struct seq_file *p, int prec)
}
#ifndef ACTUAL_NR_IRQS
-# define ACTUAL_NR_IRQS nr_irqs
+# define ACTUAL_NR_IRQS irq_get_nr_irqs()
#endif
int show_interrupts(struct seq_file *p, void *v)
{
+ const unsigned int nr_irqs = irq_get_nr_irqs();
static int prec;
- unsigned long flags, any_count = 0;
int i = *(loff_t *) v, j;
struct irqaction *action;
struct irq_desc *desc;
@@ -451,40 +475,44 @@ int show_interrupts(struct seq_file *p, void *v)
seq_putc(p, '\n');
}
- irq_lock_sparse();
+ guard(rcu)();
desc = irq_to_desc(i);
- if (!desc)
- goto outsparse;
+ if (!desc || irq_settings_is_hidden(desc))
+ return 0;
- raw_spin_lock_irqsave(&desc->lock, flags);
- for_each_online_cpu(j)
- any_count |= kstat_irqs_cpu(i, j);
- action = desc->action;
- if (!action && !any_count)
- goto out;
+ if (!desc->action || irq_desc_is_chained(desc) || !desc->kstat_irqs)
+ return 0;
+
+ seq_printf(p, "%*d:", prec, i);
+ for_each_online_cpu(j) {
+ unsigned int cnt = desc->kstat_irqs ? per_cpu(desc->kstat_irqs->cnt, j) : 0;
- seq_printf(p, "%*d: ", prec, i);
- for_each_online_cpu(j)
- seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
+ seq_put_decimal_ull_width(p, " ", cnt, 10);
+ }
+ seq_putc(p, ' ');
+ guard(raw_spinlock_irq)(&desc->lock);
if (desc->irq_data.chip) {
if (desc->irq_data.chip->irq_print_chip)
desc->irq_data.chip->irq_print_chip(&desc->irq_data, p);
else if (desc->irq_data.chip->name)
- seq_printf(p, " %8s", desc->irq_data.chip->name);
+ seq_printf(p, "%8s", desc->irq_data.chip->name);
else
- seq_printf(p, " %8s", "-");
+ seq_printf(p, "%8s", "-");
} else {
- seq_printf(p, " %8s", "None");
+ seq_printf(p, "%8s", "None");
}
if (desc->irq_data.domain)
- seq_printf(p, " %*d", prec, (int) desc->irq_data.hwirq);
+ seq_printf(p, " %*lu", prec, desc->irq_data.hwirq);
+ else
+ seq_printf(p, " %*s", prec, "");
#ifdef CONFIG_GENERIC_IRQ_SHOW_LEVEL
seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? "Level" : "Edge");
#endif
if (desc->name)
seq_printf(p, "-%-8s", desc->name);
+ action = desc->action;
if (action) {
seq_printf(p, " %s", action->name);
while ((action = action->next) != NULL)
@@ -492,10 +520,6 @@ int show_interrupts(struct seq_file *p, void *v)
}
seq_putc(p, '\n');
-out:
- raw_spin_unlock_irqrestore(&desc->lock, flags);
-outsparse:
- irq_unlock_sparse();
return 0;
}
#endif
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 9065107f083e..ca9cc1b806a9 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -1,6 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0
/*
- * linux/kernel/irq/resend.c
- *
* Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
* Copyright (C) 2005-2006, Thomas Gleixner
*
@@ -22,70 +21,174 @@
#ifdef CONFIG_HARDIRQS_SW_RESEND
-/* Bitmap to handle software resend of interrupts: */
-static DECLARE_BITMAP(irqs_resend, IRQ_BITMAP_BITS);
+/* hlist_head to handle software resend of interrupts: */
+static HLIST_HEAD(irq_resend_list);
+static DEFINE_RAW_SPINLOCK(irq_resend_lock);
/*
* Run software resends of IRQ's
*/
-static void resend_irqs(unsigned long arg)
+static void resend_irqs(struct tasklet_struct *unused)
{
- struct irq_desc *desc;
- int irq;
-
- while (!bitmap_empty(irqs_resend, nr_irqs)) {
- irq = find_first_bit(irqs_resend, nr_irqs);
- clear_bit(irq, irqs_resend);
- desc = irq_to_desc(irq);
- local_irq_disable();
- desc->handle_irq(irq, desc);
- local_irq_enable();
+ guard(raw_spinlock_irq)(&irq_resend_lock);
+ while (!hlist_empty(&irq_resend_list)) {
+ struct irq_desc *desc;
+
+ desc = hlist_entry(irq_resend_list.first, struct irq_desc, resend_node);
+ hlist_del_init(&desc->resend_node);
+
+ raw_spin_unlock(&irq_resend_lock);
+ desc->handle_irq(desc);
+ raw_spin_lock(&irq_resend_lock);
}
}
/* Tasklet to handle resend: */
-static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0);
+static DECLARE_TASKLET(resend_tasklet, resend_irqs);
+
+static int irq_sw_resend(struct irq_desc *desc)
+{
+ /*
+ * Validate whether this interrupt can be safely injected from
+ * non interrupt context
+ */
+ if (irqd_is_handle_enforce_irqctx(&desc->irq_data))
+ return -EINVAL;
+
+ /*
+ * If the interrupt is running in the thread context of the parent
+ * irq we need to be careful, because we cannot trigger it
+ * directly.
+ */
+ if (irq_settings_is_nested_thread(desc)) {
+ /*
+ * If the parent_irq is valid, we retrigger the parent,
+ * otherwise we do nothing.
+ */
+ if (!desc->parent_irq)
+ return -EINVAL;
+
+ desc = irq_to_desc(desc->parent_irq);
+ if (!desc)
+ return -EINVAL;
+ }
+ /* Add to resend_list and activate the softirq: */
+ scoped_guard(raw_spinlock, &irq_resend_lock) {
+ if (hlist_unhashed(&desc->resend_node))
+ hlist_add_head(&desc->resend_node, &irq_resend_list);
+ }
+ tasklet_schedule(&resend_tasklet);
+ return 0;
+}
+
+void clear_irq_resend(struct irq_desc *desc)
+{
+ guard(raw_spinlock)(&irq_resend_lock);
+ hlist_del_init(&desc->resend_node);
+}
+
+void irq_resend_init(struct irq_desc *desc)
+{
+ INIT_HLIST_NODE(&desc->resend_node);
+}
+#else
+void clear_irq_resend(struct irq_desc *desc) {}
+void irq_resend_init(struct irq_desc *desc) {}
+
+static int irq_sw_resend(struct irq_desc *desc)
+{
+ return -EINVAL;
+}
#endif
+static int try_retrigger(struct irq_desc *desc)
+{
+ if (desc->irq_data.chip->irq_retrigger)
+ return desc->irq_data.chip->irq_retrigger(&desc->irq_data);
+
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+ return irq_chip_retrigger_hierarchy(&desc->irq_data);
+#else
+ return 0;
+#endif
+}
+
/*
* IRQ resend
*
* Is called with interrupts disabled and desc->lock held.
*/
-void check_irq_resend(struct irq_desc *desc, unsigned int irq)
+int check_irq_resend(struct irq_desc *desc, bool inject)
{
+ int err = 0;
+
/*
- * We do not resend level type interrupts. Level type
- * interrupts are resent by hardware when they are still
- * active. Clear the pending bit so suspend/resume does not
- * get confused.
+ * We do not resend level type interrupts. Level type interrupts
+ * are resent by hardware when they are still active. Clear the
+ * pending bit so suspend/resume does not get confused.
*/
if (irq_settings_is_level(desc)) {
desc->istate &= ~IRQS_PENDING;
- return;
+ return -EINVAL;
}
+
if (desc->istate & IRQS_REPLAY)
- return;
- if (desc->istate & IRQS_PENDING) {
- desc->istate &= ~IRQS_PENDING;
+ return -EBUSY;
+
+ if (!(desc->istate & IRQS_PENDING) && !inject)
+ return 0;
+
+ desc->istate &= ~IRQS_PENDING;
+
+ if (!try_retrigger(desc))
+ err = irq_sw_resend(desc);
+
+ /* If the retrigger was successful, mark it with the REPLAY bit */
+ if (!err)
desc->istate |= IRQS_REPLAY;
+ return err;
+}
- if (!desc->irq_data.chip->irq_retrigger ||
- !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) {
-#ifdef CONFIG_HARDIRQS_SW_RESEND
- /*
- * If the interrupt has a parent irq and runs
- * in the thread context of the parent irq,
- * retrigger the parent.
- */
- if (desc->parent_irq &&
- irq_settings_is_nested_thread(desc))
- irq = desc->parent_irq;
- /* Set it pending and activate the softirq: */
- set_bit(irq, irqs_resend);
- tasklet_schedule(&resend_tasklet);
-#endif
- }
+#ifdef CONFIG_GENERIC_IRQ_INJECTION
+/**
+ * irq_inject_interrupt - Inject an interrupt for testing/error injection
+ * @irq: The interrupt number
+ *
+ * This function must only be used for debug and testing purposes!
+ *
+ * Especially on x86 this can cause a premature completion of an interrupt
+ * affinity change causing the interrupt line to become stale. Very
+ * unlikely, but possible.
+ *
+ * The injection can fail for various reasons:
+ * - Interrupt is not activated
+ * - Interrupt is NMI type or currently replaying
+ * - Interrupt is level type
+ * - Interrupt does not support hardware retrigger and software resend is
+ * either not enabled or not possible for the interrupt.
+ */
+int irq_inject_interrupt(unsigned int irq)
+{
+ int err = -EINVAL;
+
+ /* Try the state injection hardware interface first */
+ if (!irq_set_irqchip_state(irq, IRQCHIP_STATE_PENDING, true))
+ return 0;
+
+ /* That failed, try via the resend mechanism */
+ scoped_irqdesc_get_and_buslock(irq, 0) {
+ struct irq_desc *desc = scoped_irqdesc;
+
+ /*
+ * Only try to inject when the interrupt is:
+ * - not NMI type
+ * - activated
+ */
+ if (!irq_is_nmi(desc) && irqd_is_activated(&desc->irq_data))
+ err = check_irq_resend(desc, true);
}
+ return err;
}
+EXPORT_SYMBOL_GPL(irq_inject_interrupt);
+#endif
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
index 3320b84cc60f..00b3bd127692 100644
--- a/kernel/irq/settings.h
+++ b/kernel/irq/settings.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Internal header to deal with irq_desc->status which will be renamed
* to irq_desc->settings.
@@ -10,11 +11,13 @@ enum {
_IRQ_NOREQUEST = IRQ_NOREQUEST,
_IRQ_NOTHREAD = IRQ_NOTHREAD,
_IRQ_NOAUTOEN = IRQ_NOAUTOEN,
- _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT,
_IRQ_NO_BALANCING = IRQ_NO_BALANCING,
_IRQ_NESTED_THREAD = IRQ_NESTED_THREAD,
_IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID,
_IRQ_IS_POLLED = IRQ_IS_POLLED,
+ _IRQ_DISABLE_UNLAZY = IRQ_DISABLE_UNLAZY,
+ _IRQ_HIDDEN = IRQ_HIDDEN,
+ _IRQ_NO_DEBUG = IRQ_NO_DEBUG,
_IRQF_MODIFY_MASK = IRQF_MODIFY_MASK,
};
@@ -28,6 +31,9 @@ enum {
#define IRQ_NESTED_THREAD GOT_YOU_MORON
#define IRQ_PER_CPU_DEVID GOT_YOU_MORON
#define IRQ_IS_POLLED GOT_YOU_MORON
+#define IRQ_DISABLE_UNLAZY GOT_YOU_MORON
+#define IRQ_HIDDEN GOT_YOU_MORON
+#define IRQ_NO_DEBUG GOT_YOU_MORON
#undef IRQF_MODIFY_MASK
#define IRQF_MODIFY_MASK GOT_YOU_MORON
@@ -135,11 +141,6 @@ static inline void irq_settings_set_noprobe(struct irq_desc *desc)
desc->status_use_accessors |= _IRQ_NOPROBE;
}
-static inline bool irq_settings_can_move_pcntxt(struct irq_desc *desc)
-{
- return desc->status_use_accessors & _IRQ_MOVE_PCNTXT;
-}
-
static inline bool irq_settings_can_autoenable(struct irq_desc *desc)
{
return !(desc->status_use_accessors & _IRQ_NOAUTOEN);
@@ -154,3 +155,28 @@ static inline bool irq_settings_is_polled(struct irq_desc *desc)
{
return desc->status_use_accessors & _IRQ_IS_POLLED;
}
+
+static inline bool irq_settings_disable_unlazy(struct irq_desc *desc)
+{
+ return desc->status_use_accessors & _IRQ_DISABLE_UNLAZY;
+}
+
+static inline void irq_settings_clr_disable_unlazy(struct irq_desc *desc)
+{
+ desc->status_use_accessors &= ~_IRQ_DISABLE_UNLAZY;
+}
+
+static inline bool irq_settings_is_hidden(struct irq_desc *desc)
+{
+ return desc->status_use_accessors & _IRQ_HIDDEN;
+}
+
+static inline void irq_settings_set_no_debug(struct irq_desc *desc)
+{
+ desc->status_use_accessors |= _IRQ_NO_DEBUG;
+}
+
+static inline bool irq_settings_no_debug(struct irq_desc *desc)
+{
+ return desc->status_use_accessors & _IRQ_NO_DEBUG;
+}
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index e2514b0e439e..73280ccb74b0 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -1,6 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0
/*
- * linux/kernel/irq/spurious.c
- *
* Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
*
* This file contains spurious interrupt handling.
@@ -9,7 +8,6 @@
#include <linux/jiffies.h>
#include <linux/irq.h>
#include <linux/module.h>
-#include <linux/kallsyms.h>
#include <linux/interrupt.h>
#include <linux/moduleparam.h>
#include <linux/timer.h>
@@ -19,78 +17,43 @@
static int irqfixup __read_mostly;
#define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10)
-static void poll_spurious_irqs(unsigned long dummy);
-static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0);
-static int irq_poll_cpu;
+static void poll_spurious_irqs(struct timer_list *unused);
+static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs);
+int irq_poll_cpu;
static atomic_t irq_poll_active;
/*
- * We wait here for a poller to finish.
- *
- * If the poll runs on this CPU, then we yell loudly and return
- * false. That will leave the interrupt line disabled in the worst
- * case, but it should never happen.
- *
- * We wait until the poller is done and then recheck disabled and
- * action (about to be disabled). Only if it's still active, we return
- * true and let the handler run.
- */
-bool irq_wait_for_poll(struct irq_desc *desc)
-{
- if (WARN_ONCE(irq_poll_cpu == smp_processor_id(),
- "irq poll in progress on cpu %d for irq %d\n",
- smp_processor_id(), desc->irq_data.irq))
- return false;
-
-#ifdef CONFIG_SMP
- do {
- raw_spin_unlock(&desc->lock);
- while (irqd_irq_inprogress(&desc->irq_data))
- cpu_relax();
- raw_spin_lock(&desc->lock);
- } while (irqd_irq_inprogress(&desc->irq_data));
- /* Might have been disabled in meantime */
- return !irqd_irq_disabled(&desc->irq_data) && desc->action;
-#else
- return false;
-#endif
-}
-
-
-/*
* Recovery handler for misrouted interrupts.
*/
-static int try_one_irq(int irq, struct irq_desc *desc, bool force)
+static bool try_one_irq(struct irq_desc *desc, bool force)
{
- irqreturn_t ret = IRQ_NONE;
struct irqaction *action;
+ bool ret = false;
- raw_spin_lock(&desc->lock);
+ guard(raw_spinlock)(&desc->lock);
/*
- * PER_CPU, nested thread interrupts and interrupts explicitely
+ * PER_CPU, nested thread interrupts and interrupts explicitly
* marked polled are excluded from polling.
*/
- if (irq_settings_is_per_cpu(desc) ||
- irq_settings_is_nested_thread(desc) ||
+ if (irq_settings_is_per_cpu(desc) || irq_settings_is_nested_thread(desc) ||
irq_settings_is_polled(desc))
- goto out;
+ return false;
/*
* Do not poll disabled interrupts unless the spurious
- * disabled poller asks explicitely.
+ * disabled poller asks explicitly.
*/
if (irqd_irq_disabled(&desc->irq_data) && !force)
- goto out;
+ return false;
/*
* All handlers must agree on IRQF_SHARED, so we test just the
* first.
*/
action = desc->action;
- if (!action || !(action->flags & IRQF_SHARED) ||
- (action->flags & __IRQF_TIMER))
- goto out;
+ if (!action || !(action->flags & IRQF_SHARED) || (action->flags & __IRQF_TIMER))
+ return false;
/* Already running on another processor */
if (irqd_irq_inprogress(&desc->irq_data)) {
@@ -99,21 +62,19 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force)
* CPU to go looking for our mystery interrupt too
*/
desc->istate |= IRQS_PENDING;
- goto out;
+ return false;
}
/* Mark it poll in progress */
desc->istate |= IRQS_POLL_INPROGRESS;
do {
if (handle_irq_event(desc) == IRQ_HANDLED)
- ret = IRQ_HANDLED;
+ ret = true;
/* Make sure that there is still a valid action */
action = desc->action;
} while ((desc->istate & IRQS_PENDING) && action);
desc->istate &= ~IRQS_POLL_INPROGRESS;
-out:
- raw_spin_unlock(&desc->lock);
- return ret == IRQ_HANDLED;
+ return ret;
}
static int misrouted_irq(int irq)
@@ -133,7 +94,7 @@ static int misrouted_irq(int irq)
if (i == irq) /* Already tried */
continue;
- if (try_one_irq(i, desc, false))
+ if (try_one_irq(desc, false))
ok = 1;
}
out:
@@ -142,7 +103,7 @@ out:
return ok;
}
-static void poll_spurious_irqs(unsigned long dummy)
+static void poll_spurious_irqs(struct timer_list *unused)
{
struct irq_desc *desc;
int i;
@@ -158,24 +119,24 @@ static void poll_spurious_irqs(unsigned long dummy)
continue;
/* Racy but it doesn't matter */
- state = desc->istate;
- barrier();
+ state = READ_ONCE(desc->istate);
if (!(state & IRQS_SPURIOUS_DISABLED))
continue;
local_irq_disable();
- try_one_irq(i, desc, true);
+ try_one_irq(desc, true);
local_irq_enable();
}
out:
atomic_dec(&irq_poll_active);
- mod_timer(&poll_spurious_irq_timer,
- jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
+ mod_timer(&poll_spurious_irq_timer, jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
}
static inline int bad_action_ret(irqreturn_t action_ret)
{
- if (likely(action_ret <= (IRQ_HANDLED | IRQ_WAKE_THREAD)))
+ unsigned int r = action_ret;
+
+ if (likely(r <= (IRQ_HANDLED | IRQ_WAKE_THREAD)))
return 0;
return 1;
}
@@ -188,22 +149,17 @@ static inline int bad_action_ret(irqreturn_t action_ret)
* (The other 100-of-100,000 interrupts may have been a correctly
* functioning device sharing an IRQ with the failing one)
*/
-static void
-__report_bad_irq(unsigned int irq, struct irq_desc *desc,
- irqreturn_t action_ret)
+static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret)
{
+ unsigned int irq = irq_desc_get_irq(desc);
struct irqaction *action;
- unsigned long flags;
- if (bad_action_ret(action_ret)) {
- printk(KERN_ERR "irq event %d: bogus return value %x\n",
- irq, action_ret);
- } else {
- printk(KERN_ERR "irq %d: nobody cared (try booting with "
- "the \"irqpoll\" option)\n", irq);
- }
+ if (bad_action_ret(action_ret))
+ pr_err("irq event %d: bogus return value %x\n", irq, action_ret);
+ else
+ pr_err("irq %d: nobody cared (try booting with the \"irqpoll\" option)\n", irq);
dump_stack();
- printk(KERN_ERR "handlers:\n");
+ pr_err("handlers:\n");
/*
* We need to take desc->lock here. note_interrupt() is called
@@ -211,42 +167,36 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc,
* with something else removing an action. It's ok to take
* desc->lock here. See synchronize_irq().
*/
- raw_spin_lock_irqsave(&desc->lock, flags);
- action = desc->action;
- while (action) {
- printk(KERN_ERR "[<%p>] %pf", action->handler, action->handler);
+ guard(raw_spinlock_irqsave)(&desc->lock);
+ for_each_action_of_desc(desc, action) {
+ pr_err("[<%p>] %ps", action->handler, action->handler);
if (action->thread_fn)
- printk(KERN_CONT " threaded [<%p>] %pf",
- action->thread_fn, action->thread_fn);
- printk(KERN_CONT "\n");
- action = action->next;
+ pr_cont(" threaded [<%p>] %ps", action->thread_fn, action->thread_fn);
+ pr_cont("\n");
}
- raw_spin_unlock_irqrestore(&desc->lock, flags);
}
-static void
-report_bad_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret)
+static void report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret)
{
static int count = 100;
if (count > 0) {
count--;
- __report_bad_irq(irq, desc, action_ret);
+ __report_bad_irq(desc, action_ret);
}
}
-static inline int
-try_misrouted_irq(unsigned int irq, struct irq_desc *desc,
- irqreturn_t action_ret)
+static inline bool try_misrouted_irq(unsigned int irq, struct irq_desc *desc,
+ irqreturn_t action_ret)
{
struct irqaction *action;
if (!irqfixup)
- return 0;
+ return false;
/* We didn't actually handle the IRQ - see if it was misrouted? */
if (action_ret == IRQ_NONE)
- return 1;
+ return true;
/*
* But for 'irqfixup == 2' we also do it for handled interrupts if
@@ -254,33 +204,30 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc,
* traditional PC timer interrupt.. Legacy)
*/
if (irqfixup < 2)
- return 0;
+ return false;
if (!irq)
- return 1;
+ return true;
/*
* Since we don't get the descriptor lock, "action" can
- * change under us. We don't really care, but we don't
- * want to follow a NULL pointer. So tell the compiler to
- * just load it once by using a barrier.
+ * change under us.
*/
- action = desc->action;
- barrier();
+ action = READ_ONCE(desc->action);
return action && (action->flags & IRQF_IRQPOLL);
}
#define SPURIOUS_DEFERRED 0x80000000
-void note_interrupt(unsigned int irq, struct irq_desc *desc,
- irqreturn_t action_ret)
+void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret)
{
- if (desc->istate & IRQS_POLL_INPROGRESS ||
- irq_settings_is_polled(desc))
+ unsigned int irq;
+
+ if (desc->istate & IRQS_POLL_INPROGRESS || irq_settings_is_polled(desc))
return;
if (bad_action_ret(action_ret)) {
- report_bad_irq(irq, desc, action_ret);
+ report_bad_irq(desc, action_ret);
return;
}
@@ -295,7 +242,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
* So in case a thread is woken, we just note the fact and
* defer the analysis to the next hardware interrupt.
*
- * The threaded handlers store whether they sucessfully
+ * The threaded handlers store whether they successfully
* handled an interrupt and we check whether that number
* changed versus the last invocation.
*
@@ -398,12 +345,17 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
desc->last_unhandled = jiffies;
}
+ irq = irq_desc_get_irq(desc);
if (unlikely(try_misrouted_irq(irq, desc, action_ret))) {
int ok = misrouted_irq(irq);
if (action_ret == IRQ_NONE)
desc->irqs_unhandled -= ok;
}
+ if (likely(!desc->irqs_unhandled))
+ return;
+
+ /* Now getting into unhandled irq detection */
desc->irq_count++;
if (likely(desc->irq_count < 100000))
return;
@@ -413,17 +365,16 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
/*
* The interrupt is stuck
*/
- __report_bad_irq(irq, desc, action_ret);
+ __report_bad_irq(desc, action_ret);
/*
* Now kill the IRQ
*/
- printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
+ pr_emerg("Disabling IRQ #%d\n", irq);
desc->istate |= IRQS_SPURIOUS_DISABLED;
desc->depth++;
irq_disable(desc);
- mod_timer(&poll_spurious_irq_timer,
- jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
+ mod_timer(&poll_spurious_irq_timer, jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
}
desc->irqs_unhandled = 0;
}
@@ -433,35 +384,36 @@ bool noirqdebug __read_mostly;
int noirqdebug_setup(char *str)
{
noirqdebug = 1;
- printk(KERN_INFO "IRQ lockup detection disabled\n");
-
+ pr_info("IRQ lockup detection disabled\n");
return 1;
}
-
__setup("noirqdebug", noirqdebug_setup);
module_param(noirqdebug, bool, 0644);
MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true");
static int __init irqfixup_setup(char *str)
{
+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ pr_warn("irqfixup boot option not supported with PREEMPT_RT\n");
+ return 1;
+ }
irqfixup = 1;
- printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n");
- printk(KERN_WARNING "This may impact system performance.\n");
-
+ pr_warn("Misrouted IRQ fixup support enabled.\n");
+ pr_warn("This may impact system performance.\n");
return 1;
}
-
__setup("irqfixup", irqfixup_setup);
module_param(irqfixup, int, 0644);
static int __init irqpoll_setup(char *str)
{
+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ pr_warn("irqpoll boot option not supported with PREEMPT_RT\n");
+ return 1;
+ }
irqfixup = 2;
- printk(KERN_WARNING "Misrouted IRQ fixup and polling support "
- "enabled\n");
- printk(KERN_WARNING "This may significantly impact system "
- "performance\n");
+ pr_warn("Misrouted IRQ fixup and polling support enabled\n");
+ pr_warn("This may significantly impact system performance\n");
return 1;
}
-
__setup("irqpoll", irqpoll_setup);
diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c
new file mode 100644
index 000000000000..4b7315e99bd6
--- /dev/null
+++ b/kernel/irq/timings.c
@@ -0,0 +1,959 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2016, Linaro Ltd - Daniel Lezcano <daniel.lezcano@linaro.org>
+#define pr_fmt(fmt) "irq_timings: " fmt
+
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/slab.h>
+#include <linux/static_key.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/idr.h>
+#include <linux/irq.h>
+#include <linux/math64.h>
+#include <linux/log2.h>
+
+#include <trace/events/irq.h>
+
+#include "internals.h"
+
+DEFINE_STATIC_KEY_FALSE(irq_timing_enabled);
+
+DEFINE_PER_CPU(struct irq_timings, irq_timings);
+
+static DEFINE_IDR(irqt_stats);
+
+void irq_timings_enable(void)
+{
+ static_branch_enable(&irq_timing_enabled);
+}
+
+void irq_timings_disable(void)
+{
+ static_branch_disable(&irq_timing_enabled);
+}
+
+/*
+ * The main goal of this algorithm is to predict the next interrupt
+ * occurrence on the current CPU.
+ *
+ * Currently, the interrupt timings are stored in a circular array
+ * buffer every time there is an interrupt, as a tuple: the interrupt
+ * number and the associated timestamp when the event occurred <irq,
+ * timestamp>.
+ *
+ * For every interrupt occurring in a short period of time, we can
+ * measure the elapsed time between the occurrences for the same
+ * interrupt and we end up with a suite of intervals. The experience
+ * showed the interrupts are often coming following a periodic
+ * pattern.
+ *
+ * The objective of the algorithm is to find out this periodic pattern
+ * in a fastest way and use its period to predict the next irq event.
+ *
+ * When the next interrupt event is requested, we are in the situation
+ * where the interrupts are disabled and the circular buffer
+ * containing the timings is filled with the events which happened
+ * after the previous next-interrupt-event request.
+ *
+ * At this point, we read the circular buffer and we fill the irq
+ * related statistics structure. After this step, the circular array
+ * containing the timings is empty because all the values are
+ * dispatched in their corresponding buffers.
+ *
+ * Now for each interrupt, we can predict the next event by using the
+ * suffix array, log interval and exponential moving average
+ *
+ * 1. Suffix array
+ *
+ * Suffix array is an array of all the suffixes of a string. It is
+ * widely used as a data structure for compression, text search, ...
+ * For instance for the word 'banana', the suffixes will be: 'banana'
+ * 'anana' 'nana' 'ana' 'na' 'a'
+ *
+ * Usually, the suffix array is sorted but for our purpose it is
+ * not necessary and won't provide any improvement in the context of
+ * the solved problem where we clearly define the boundaries of the
+ * search by a max period and min period.
+ *
+ * The suffix array will build a suite of intervals of different
+ * length and will look for the repetition of each suite. If the suite
+ * is repeating then we have the period because it is the length of
+ * the suite whatever its position in the buffer.
+ *
+ * 2. Log interval
+ *
+ * We saw the irq timings allow to compute the interval of the
+ * occurrences for a specific interrupt. We can reasonably assume the
+ * longer is the interval, the higher is the error for the next event
+ * and we can consider storing those interval values into an array
+ * where each slot in the array correspond to an interval at the power
+ * of 2 of the index. For example, index 12 will contain values
+ * between 2^11 and 2^12.
+ *
+ * At the end we have an array of values where at each index defines a
+ * [2^index - 1, 2 ^ index] interval values allowing to store a large
+ * number of values inside a small array.
+ *
+ * For example, if we have the value 1123, then we store it at
+ * ilog2(1123) = 10 index value.
+ *
+ * Storing those value at the specific index is done by computing an
+ * exponential moving average for this specific slot. For instance,
+ * for values 1800, 1123, 1453, ... fall under the same slot (10) and
+ * the exponential moving average is computed every time a new value
+ * is stored at this slot.
+ *
+ * 3. Exponential Moving Average
+ *
+ * The EMA is largely used to track a signal for stocks or as a low
+ * pass filter. The magic of the formula, is it is very simple and the
+ * reactivity of the average can be tuned with the factors called
+ * alpha.
+ *
+ * The higher the alphas are, the faster the average respond to the
+ * signal change. In our case, if a slot in the array is a big
+ * interval, we can have numbers with a big difference between
+ * them. The impact of those differences in the average computation
+ * can be tuned by changing the alpha value.
+ *
+ *
+ * -- The algorithm --
+ *
+ * We saw the different processing above, now let's see how they are
+ * used together.
+ *
+ * For each interrupt:
+ * For each interval:
+ * Compute the index = ilog2(interval)
+ * Compute a new_ema(buffer[index], interval)
+ * Store the index in a circular buffer
+ *
+ * Compute the suffix array of the indexes
+ *
+ * For each suffix:
+ * If the suffix is reverse-found 3 times
+ * Return suffix
+ *
+ * Return Not found
+ *
+ * However we can not have endless suffix array to be build, it won't
+ * make sense and it will add an extra overhead, so we can restrict
+ * this to a maximum suffix length of 5 and a minimum suffix length of
+ * 2. The experience showed 5 is the majority of the maximum pattern
+ * period found for different devices.
+ *
+ * The result is a pattern finding less than 1us for an interrupt.
+ *
+ * Example based on real values:
+ *
+ * Example 1 : MMC write/read interrupt interval:
+ *
+ * 223947, 1240, 1384, 1386, 1386,
+ * 217416, 1236, 1384, 1386, 1387,
+ * 214719, 1241, 1386, 1387, 1384,
+ * 213696, 1234, 1384, 1386, 1388,
+ * 219904, 1240, 1385, 1389, 1385,
+ * 212240, 1240, 1386, 1386, 1386,
+ * 214415, 1236, 1384, 1386, 1387,
+ * 214276, 1234, 1384, 1388, ?
+ *
+ * For each element, apply ilog2(value)
+ *
+ * 15, 8, 8, 8, 8,
+ * 15, 8, 8, 8, 8,
+ * 15, 8, 8, 8, 8,
+ * 15, 8, 8, 8, 8,
+ * 15, 8, 8, 8, 8,
+ * 15, 8, 8, 8, 8,
+ * 15, 8, 8, 8, 8,
+ * 15, 8, 8, 8, ?
+ *
+ * Max period of 5, we take the last (max_period * 3) 15 elements as
+ * we can be confident if the pattern repeats itself three times it is
+ * a repeating pattern.
+ *
+ * 8,
+ * 15, 8, 8, 8, 8,
+ * 15, 8, 8, 8, 8,
+ * 15, 8, 8, 8, ?
+ *
+ * Suffixes are:
+ *
+ * 1) 8, 15, 8, 8, 8 <- max period
+ * 2) 8, 15, 8, 8
+ * 3) 8, 15, 8
+ * 4) 8, 15 <- min period
+ *
+ * From there we search the repeating pattern for each suffix.
+ *
+ * buffer: 8, 15, 8, 8, 8, 8, 15, 8, 8, 8, 8, 15, 8, 8, 8
+ * | | | | | | | | | | | | | | |
+ * 8, 15, 8, 8, 8 | | | | | | | | | |
+ * 8, 15, 8, 8, 8 | | | | |
+ * 8, 15, 8, 8, 8
+ *
+ * When moving the suffix, we found exactly 3 matches.
+ *
+ * The first suffix with period 5 is repeating.
+ *
+ * The next event is (3 * max_period) % suffix_period
+ *
+ * In this example, the result 0, so the next event is suffix[0] => 8
+ *
+ * However, 8 is the index in the array of exponential moving average
+ * which was calculated on the fly when storing the values, so the
+ * interval is ema[8] = 1366
+ *
+ *
+ * Example 2:
+ *
+ * 4, 3, 5, 100,
+ * 3, 3, 5, 117,
+ * 4, 4, 5, 112,
+ * 4, 3, 4, 110,
+ * 3, 5, 3, 117,
+ * 4, 4, 5, 112,
+ * 4, 3, 4, 110,
+ * 3, 4, 5, 112,
+ * 4, 3, 4, 110
+ *
+ * ilog2
+ *
+ * 0, 0, 0, 4,
+ * 0, 0, 0, 4,
+ * 0, 0, 0, 4,
+ * 0, 0, 0, 4,
+ * 0, 0, 0, 4,
+ * 0, 0, 0, 4,
+ * 0, 0, 0, 4,
+ * 0, 0, 0, 4,
+ * 0, 0, 0, 4
+ *
+ * Max period 5:
+ * 0, 0, 4,
+ * 0, 0, 0, 4,
+ * 0, 0, 0, 4,
+ * 0, 0, 0, 4
+ *
+ * Suffixes:
+ *
+ * 1) 0, 0, 4, 0, 0
+ * 2) 0, 0, 4, 0
+ * 3) 0, 0, 4
+ * 4) 0, 0
+ *
+ * buffer: 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4
+ * | | | | | | X
+ * 0, 0, 4, 0, 0, | X
+ * 0, 0
+ *
+ * buffer: 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4
+ * | | | | | | | | | | | | | | |
+ * 0, 0, 4, 0, | | | | | | | | | | |
+ * 0, 0, 4, 0, | | | | | | |
+ * 0, 0, 4, 0, | | |
+ * 0 0 4
+ *
+ * Pattern is found 3 times, the remaining is 1 which results from
+ * (max_period * 3) % suffix_period. This value is the index in the
+ * suffix arrays. The suffix array for a period 4 has the value 4
+ * at index 1.
+ */
+#define EMA_ALPHA_VAL 64
+#define EMA_ALPHA_SHIFT 7
+
+#define PREDICTION_PERIOD_MIN 3
+#define PREDICTION_PERIOD_MAX 5
+#define PREDICTION_FACTOR 4
+#define PREDICTION_MAX 10 /* 2 ^ PREDICTION_MAX useconds */
+#define PREDICTION_BUFFER_SIZE 16 /* slots for EMAs, hardly more than 16 */
+
+/*
+ * Number of elements in the circular buffer: If it happens it was
+ * flushed before, then the number of elements could be smaller than
+ * IRQ_TIMINGS_SIZE, so the count is used, otherwise the array size is
+ * used as we wrapped. The index begins from zero when we did not
+ * wrap. That could be done in a nicer way with the proper circular
+ * array structure type but with the cost of extra computation in the
+ * interrupt handler hot path. We choose efficiency.
+ */
+#define for_each_irqts(i, irqts) \
+ for (i = irqts->count < IRQ_TIMINGS_SIZE ? \
+ 0 : irqts->count & IRQ_TIMINGS_MASK, \
+ irqts->count = min(IRQ_TIMINGS_SIZE, \
+ irqts->count); \
+ irqts->count > 0; irqts->count--, \
+ i = (i + 1) & IRQ_TIMINGS_MASK)
+
+struct irqt_stat {
+ u64 last_ts;
+ u64 ema_time[PREDICTION_BUFFER_SIZE];
+ int timings[IRQ_TIMINGS_SIZE];
+ int circ_timings[IRQ_TIMINGS_SIZE];
+ int count;
+};
+
+/*
+ * Exponential moving average computation
+ */
+static u64 irq_timings_ema_new(u64 value, u64 ema_old)
+{
+ s64 diff;
+
+ if (unlikely(!ema_old))
+ return value;
+
+ diff = (value - ema_old) * EMA_ALPHA_VAL;
+ /*
+ * We can use a s64 type variable to be added with the u64
+ * ema_old variable as this one will never have its topmost
+ * bit set, it will be always smaller than 2^63 nanosec
+ * interrupt interval (292 years).
+ */
+ return ema_old + (diff >> EMA_ALPHA_SHIFT);
+}
+
+static int irq_timings_next_event_index(int *buffer, size_t len, int period_max)
+{
+ int period;
+
+ /*
+ * Move the beginning pointer to the end minus the max period x 3.
+ * We are at the point we can begin searching the pattern
+ */
+ buffer = &buffer[len - (period_max * 3)];
+
+ /* Adjust the length to the maximum allowed period x 3 */
+ len = period_max * 3;
+
+ /*
+ * The buffer contains the suite of intervals, in a ilog2
+ * basis, we are looking for a repetition. We point the
+ * beginning of the search three times the length of the
+ * period beginning at the end of the buffer. We do that for
+ * each suffix.
+ */
+ for (period = period_max; period >= PREDICTION_PERIOD_MIN; period--) {
+
+ /*
+ * The first comparison always succeed because the
+ * suffix is deduced from the first n-period bytes of
+ * the buffer and we compare the initial suffix with
+ * itself, so we can skip the first iteration.
+ */
+ int idx = period;
+ size_t size = period;
+
+ /*
+ * We look if the suite with period 'i' repeat
+ * itself. If it is truncated at the end, as it
+ * repeats we can use the period to find out the next
+ * element with the modulo.
+ */
+ while (!memcmp(buffer, &buffer[idx], size * sizeof(int))) {
+
+ /*
+ * Move the index in a period basis
+ */
+ idx += size;
+
+ /*
+ * If this condition is reached, all previous
+ * memcmp were successful, so the period is
+ * found.
+ */
+ if (idx == len)
+ return buffer[len % period];
+
+ /*
+ * If the remaining elements to compare are
+ * smaller than the period, readjust the size
+ * of the comparison for the last iteration.
+ */
+ if (len - idx < period)
+ size = len - idx;
+ }
+ }
+
+ return -1;
+}
+
+static u64 __irq_timings_next_event(struct irqt_stat *irqs, int irq, u64 now)
+{
+ int index, i, period_max, count, start, min = INT_MAX;
+
+ if ((now - irqs->last_ts) >= NSEC_PER_SEC) {
+ irqs->count = irqs->last_ts = 0;
+ return U64_MAX;
+ }
+
+ /*
+ * As we want to find three times the repetition, we need a
+ * number of intervals greater or equal to three times the
+ * maximum period, otherwise we truncate the max period.
+ */
+ period_max = irqs->count > (3 * PREDICTION_PERIOD_MAX) ?
+ PREDICTION_PERIOD_MAX : irqs->count / 3;
+
+ /*
+ * If we don't have enough irq timings for this prediction,
+ * just bail out.
+ */
+ if (period_max <= PREDICTION_PERIOD_MIN)
+ return U64_MAX;
+
+ /*
+ * 'count' will depends if the circular buffer wrapped or not
+ */
+ count = irqs->count < IRQ_TIMINGS_SIZE ?
+ irqs->count : IRQ_TIMINGS_SIZE;
+
+ start = irqs->count < IRQ_TIMINGS_SIZE ?
+ 0 : (irqs->count & IRQ_TIMINGS_MASK);
+
+ /*
+ * Copy the content of the circular buffer into another buffer
+ * in order to linearize the buffer instead of dealing with
+ * wrapping indexes and shifted array which will be prone to
+ * error and extremely difficult to debug.
+ */
+ for (i = 0; i < count; i++) {
+ int index = (start + i) & IRQ_TIMINGS_MASK;
+
+ irqs->timings[i] = irqs->circ_timings[index];
+ min = min_t(int, irqs->timings[i], min);
+ }
+
+ index = irq_timings_next_event_index(irqs->timings, count, period_max);
+ if (index < 0)
+ return irqs->last_ts + irqs->ema_time[min];
+
+ return irqs->last_ts + irqs->ema_time[index];
+}
+
+static __always_inline int irq_timings_interval_index(u64 interval)
+{
+ /*
+ * The PREDICTION_FACTOR increase the interval size for the
+ * array of exponential average.
+ */
+ u64 interval_us = (interval >> 10) / PREDICTION_FACTOR;
+
+ return likely(interval_us) ? ilog2(interval_us) : 0;
+}
+
+static __always_inline void __irq_timings_store(int irq, struct irqt_stat *irqs,
+ u64 interval)
+{
+ int index;
+
+ /*
+ * Get the index in the ema table for this interrupt.
+ */
+ index = irq_timings_interval_index(interval);
+
+ if (index > PREDICTION_BUFFER_SIZE - 1) {
+ irqs->count = 0;
+ return;
+ }
+
+ /*
+ * Store the index as an element of the pattern in another
+ * circular array.
+ */
+ irqs->circ_timings[irqs->count & IRQ_TIMINGS_MASK] = index;
+
+ irqs->ema_time[index] = irq_timings_ema_new(interval,
+ irqs->ema_time[index]);
+
+ irqs->count++;
+}
+
+static inline void irq_timings_store(int irq, struct irqt_stat *irqs, u64 ts)
+{
+ u64 old_ts = irqs->last_ts;
+ u64 interval;
+
+ /*
+ * The timestamps are absolute time values, we need to compute
+ * the timing interval between two interrupts.
+ */
+ irqs->last_ts = ts;
+
+ /*
+ * The interval type is u64 in order to deal with the same
+ * type in our computation, that prevent mindfuck issues with
+ * overflow, sign and division.
+ */
+ interval = ts - old_ts;
+
+ /*
+ * The interrupt triggered more than one second apart, that
+ * ends the sequence as predictable for our purpose. In this
+ * case, assume we have the beginning of a sequence and the
+ * timestamp is the first value. As it is impossible to
+ * predict anything at this point, return.
+ *
+ * Note the first timestamp of the sequence will always fall
+ * in this test because the old_ts is zero. That is what we
+ * want as we need another timestamp to compute an interval.
+ */
+ if (interval >= NSEC_PER_SEC) {
+ irqs->count = 0;
+ return;
+ }
+
+ __irq_timings_store(irq, irqs, interval);
+}
+
+/**
+ * irq_timings_next_event - Return when the next event is supposed to arrive
+ * @now: current time
+ *
+ * During the last busy cycle, the number of interrupts is incremented
+ * and stored in the irq_timings structure. This information is
+ * necessary to:
+ *
+ * - know if the index in the table wrapped up:
+ *
+ * If more than the array size interrupts happened during the
+ * last busy/idle cycle, the index wrapped up and we have to
+ * begin with the next element in the array which is the last one
+ * in the sequence, otherwise it is at the index 0.
+ *
+ * - have an indication of the interrupts activity on this CPU
+ * (eg. irq/sec)
+ *
+ * The values are 'consumed' after inserting in the statistical model,
+ * thus the count is reinitialized.
+ *
+ * The array of values **must** be browsed in the time direction, the
+ * timestamp must increase between an element and the next one.
+ *
+ * Returns a nanosec time based estimation of the earliest interrupt,
+ * U64_MAX otherwise.
+ */
+u64 irq_timings_next_event(u64 now)
+{
+ struct irq_timings *irqts = this_cpu_ptr(&irq_timings);
+ struct irqt_stat *irqs;
+ struct irqt_stat __percpu *s;
+ u64 ts, next_evt = U64_MAX;
+ int i, irq = 0;
+
+ /*
+ * This function must be called with the local irq disabled in
+ * order to prevent the timings circular buffer to be updated
+ * while we are reading it.
+ */
+ lockdep_assert_irqs_disabled();
+
+ if (!irqts->count)
+ return next_evt;
+
+ /*
+ * Number of elements in the circular buffer: If it happens it
+ * was flushed before, then the number of elements could be
+ * smaller than IRQ_TIMINGS_SIZE, so the count is used,
+ * otherwise the array size is used as we wrapped. The index
+ * begins from zero when we did not wrap. That could be done
+ * in a nicer way with the proper circular array structure
+ * type but with the cost of extra computation in the
+ * interrupt handler hot path. We choose efficiency.
+ *
+ * Inject measured irq/timestamp to the pattern prediction
+ * model while decrementing the counter because we consume the
+ * data from our circular buffer.
+ */
+ for_each_irqts(i, irqts) {
+ irq = irq_timing_decode(irqts->values[i], &ts);
+ s = idr_find(&irqt_stats, irq);
+ if (s)
+ irq_timings_store(irq, this_cpu_ptr(s), ts);
+ }
+
+ /*
+ * Look in the list of interrupts' statistics, the earliest
+ * next event.
+ */
+ idr_for_each_entry(&irqt_stats, s, i) {
+
+ irqs = this_cpu_ptr(s);
+
+ ts = __irq_timings_next_event(irqs, i, now);
+ if (ts <= now)
+ return now;
+
+ if (ts < next_evt)
+ next_evt = ts;
+ }
+
+ return next_evt;
+}
+
+void irq_timings_free(int irq)
+{
+ struct irqt_stat __percpu *s;
+
+ s = idr_find(&irqt_stats, irq);
+ if (s) {
+ free_percpu(s);
+ idr_remove(&irqt_stats, irq);
+ }
+}
+
+int irq_timings_alloc(int irq)
+{
+ struct irqt_stat __percpu *s;
+ int id;
+
+ /*
+ * Some platforms can have the same private interrupt per cpu,
+ * so this function may be called several times with the
+ * same interrupt number. Just bail out in case the per cpu
+ * stat structure is already allocated.
+ */
+ s = idr_find(&irqt_stats, irq);
+ if (s)
+ return 0;
+
+ s = alloc_percpu(*s);
+ if (!s)
+ return -ENOMEM;
+
+ idr_preload(GFP_KERNEL);
+ id = idr_alloc(&irqt_stats, s, irq, irq + 1, GFP_NOWAIT);
+ idr_preload_end();
+
+ if (id < 0) {
+ free_percpu(s);
+ return id;
+ }
+
+ return 0;
+}
+
+#ifdef CONFIG_TEST_IRQ_TIMINGS
+struct timings_intervals {
+ u64 *intervals;
+ size_t count;
+};
+
+/*
+ * Intervals are given in nanosecond base
+ */
+static u64 intervals0[] __initdata = {
+ 10000, 50000, 200000, 500000,
+ 10000, 50000, 200000, 500000,
+ 10000, 50000, 200000, 500000,
+ 10000, 50000, 200000, 500000,
+ 10000, 50000, 200000, 500000,
+ 10000, 50000, 200000, 500000,
+ 10000, 50000, 200000, 500000,
+ 10000, 50000, 200000, 500000,
+ 10000, 50000, 200000,
+};
+
+static u64 intervals1[] __initdata = {
+ 223947000, 1240000, 1384000, 1386000, 1386000,
+ 217416000, 1236000, 1384000, 1386000, 1387000,
+ 214719000, 1241000, 1386000, 1387000, 1384000,
+ 213696000, 1234000, 1384000, 1386000, 1388000,
+ 219904000, 1240000, 1385000, 1389000, 1385000,
+ 212240000, 1240000, 1386000, 1386000, 1386000,
+ 214415000, 1236000, 1384000, 1386000, 1387000,
+ 214276000, 1234000,
+};
+
+static u64 intervals2[] __initdata = {
+ 4000, 3000, 5000, 100000,
+ 3000, 3000, 5000, 117000,
+ 4000, 4000, 5000, 112000,
+ 4000, 3000, 4000, 110000,
+ 3000, 5000, 3000, 117000,
+ 4000, 4000, 5000, 112000,
+ 4000, 3000, 4000, 110000,
+ 3000, 4000, 5000, 112000,
+ 4000,
+};
+
+static u64 intervals3[] __initdata = {
+ 1385000, 212240000, 1240000,
+ 1386000, 214415000, 1236000,
+ 1384000, 214276000, 1234000,
+ 1386000, 214415000, 1236000,
+ 1385000, 212240000, 1240000,
+ 1386000, 214415000, 1236000,
+ 1384000, 214276000, 1234000,
+ 1386000, 214415000, 1236000,
+ 1385000, 212240000, 1240000,
+};
+
+static u64 intervals4[] __initdata = {
+ 10000, 50000, 10000, 50000,
+ 10000, 50000, 10000, 50000,
+ 10000, 50000, 10000, 50000,
+ 10000, 50000, 10000, 50000,
+ 10000, 50000, 10000, 50000,
+ 10000, 50000, 10000, 50000,
+ 10000, 50000, 10000, 50000,
+ 10000, 50000, 10000, 50000,
+ 10000,
+};
+
+static struct timings_intervals tis[] __initdata = {
+ { intervals0, ARRAY_SIZE(intervals0) },
+ { intervals1, ARRAY_SIZE(intervals1) },
+ { intervals2, ARRAY_SIZE(intervals2) },
+ { intervals3, ARRAY_SIZE(intervals3) },
+ { intervals4, ARRAY_SIZE(intervals4) },
+};
+
+static int __init irq_timings_test_next_index(struct timings_intervals *ti)
+{
+ int _buffer[IRQ_TIMINGS_SIZE];
+ int buffer[IRQ_TIMINGS_SIZE];
+ int index, start, i, count, period_max;
+
+ count = ti->count - 1;
+
+ period_max = count > (3 * PREDICTION_PERIOD_MAX) ?
+ PREDICTION_PERIOD_MAX : count / 3;
+
+ /*
+ * Inject all values except the last one which will be used
+ * to compare with the next index result.
+ */
+ pr_debug("index suite: ");
+
+ for (i = 0; i < count; i++) {
+ index = irq_timings_interval_index(ti->intervals[i]);
+ _buffer[i & IRQ_TIMINGS_MASK] = index;
+ pr_cont("%d ", index);
+ }
+
+ start = count < IRQ_TIMINGS_SIZE ? 0 :
+ count & IRQ_TIMINGS_MASK;
+
+ count = min_t(int, count, IRQ_TIMINGS_SIZE);
+
+ for (i = 0; i < count; i++) {
+ int index = (start + i) & IRQ_TIMINGS_MASK;
+ buffer[i] = _buffer[index];
+ }
+
+ index = irq_timings_next_event_index(buffer, count, period_max);
+ i = irq_timings_interval_index(ti->intervals[ti->count - 1]);
+
+ if (index != i) {
+ pr_err("Expected (%d) and computed (%d) next indexes differ\n",
+ i, index);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int __init irq_timings_next_index_selftest(void)
+{
+ int i, ret;
+
+ for (i = 0; i < ARRAY_SIZE(tis); i++) {
+
+ pr_info("---> Injecting intervals number #%d (count=%zd)\n",
+ i, tis[i].count);
+
+ ret = irq_timings_test_next_index(&tis[i]);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+static int __init irq_timings_test_irqs(struct timings_intervals *ti)
+{
+ struct irqt_stat __percpu *s;
+ struct irqt_stat *irqs;
+ int i, index, ret, irq = 0xACE5;
+
+ ret = irq_timings_alloc(irq);
+ if (ret) {
+ pr_err("Failed to allocate irq timings\n");
+ return ret;
+ }
+
+ s = idr_find(&irqt_stats, irq);
+ if (!s) {
+ ret = -EIDRM;
+ goto out;
+ }
+
+ irqs = this_cpu_ptr(s);
+
+ for (i = 0; i < ti->count; i++) {
+
+ index = irq_timings_interval_index(ti->intervals[i]);
+ pr_debug("%d: interval=%llu ema_index=%d\n",
+ i, ti->intervals[i], index);
+
+ __irq_timings_store(irq, irqs, ti->intervals[i]);
+ if (irqs->circ_timings[i & IRQ_TIMINGS_MASK] != index) {
+ ret = -EBADSLT;
+ pr_err("Failed to store in the circular buffer\n");
+ goto out;
+ }
+ }
+
+ if (irqs->count != ti->count) {
+ ret = -ERANGE;
+ pr_err("Count differs\n");
+ goto out;
+ }
+
+ ret = 0;
+out:
+ irq_timings_free(irq);
+
+ return ret;
+}
+
+static int __init irq_timings_irqs_selftest(void)
+{
+ int i, ret;
+
+ for (i = 0; i < ARRAY_SIZE(tis); i++) {
+ pr_info("---> Injecting intervals number #%d (count=%zd)\n",
+ i, tis[i].count);
+ ret = irq_timings_test_irqs(&tis[i]);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+static int __init irq_timings_test_irqts(struct irq_timings *irqts,
+ unsigned count)
+{
+ int start = count >= IRQ_TIMINGS_SIZE ? count - IRQ_TIMINGS_SIZE : 0;
+ int i, irq, oirq = 0xBEEF;
+ u64 ots = 0xDEAD, ts;
+
+ /*
+ * Fill the circular buffer by using the dedicated function.
+ */
+ for (i = 0; i < count; i++) {
+ pr_debug("%d: index=%d, ts=%llX irq=%X\n",
+ i, i & IRQ_TIMINGS_MASK, ots + i, oirq + i);
+
+ irq_timings_push(ots + i, oirq + i);
+ }
+
+ /*
+ * Compute the first elements values after the index wrapped
+ * up or not.
+ */
+ ots += start;
+ oirq += start;
+
+ /*
+ * Test the circular buffer count is correct.
+ */
+ pr_debug("---> Checking timings array count (%d) is right\n", count);
+ if (WARN_ON(irqts->count != count))
+ return -EINVAL;
+
+ /*
+ * Test the macro allowing to browse all the irqts.
+ */
+ pr_debug("---> Checking the for_each_irqts() macro\n");
+ for_each_irqts(i, irqts) {
+
+ irq = irq_timing_decode(irqts->values[i], &ts);
+
+ pr_debug("index=%d, ts=%llX / %llX, irq=%X / %X\n",
+ i, ts, ots, irq, oirq);
+
+ if (WARN_ON(ts != ots || irq != oirq))
+ return -EINVAL;
+
+ ots++; oirq++;
+ }
+
+ /*
+ * The circular buffer should have be flushed when browsed
+ * with for_each_irqts
+ */
+ pr_debug("---> Checking timings array is empty after browsing it\n");
+ if (WARN_ON(irqts->count))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int __init irq_timings_irqts_selftest(void)
+{
+ struct irq_timings *irqts = this_cpu_ptr(&irq_timings);
+ int i, ret;
+
+ /*
+ * Test the circular buffer with different number of
+ * elements. The purpose is to test at the limits (empty, half
+ * full, full, wrapped with the cursor at the boundaries,
+ * wrapped several times, etc ...
+ */
+ int count[] = { 0,
+ IRQ_TIMINGS_SIZE >> 1,
+ IRQ_TIMINGS_SIZE,
+ IRQ_TIMINGS_SIZE + (IRQ_TIMINGS_SIZE >> 1),
+ 2 * IRQ_TIMINGS_SIZE,
+ (2 * IRQ_TIMINGS_SIZE) + 3,
+ };
+
+ for (i = 0; i < ARRAY_SIZE(count); i++) {
+
+ pr_info("---> Checking the timings with %d/%d values\n",
+ count[i], IRQ_TIMINGS_SIZE);
+
+ ret = irq_timings_test_irqts(irqts, count[i]);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+static int __init irq_timings_selftest(void)
+{
+ int ret;
+
+ pr_info("------------------- selftest start -----------------\n");
+
+ /*
+ * At this point, we don't except any subsystem to use the irq
+ * timings but us, so it should not be enabled.
+ */
+ if (static_branch_unlikely(&irq_timing_enabled)) {
+ pr_warn("irq timings already initialized, skipping selftest\n");
+ return 0;
+ }
+
+ ret = irq_timings_irqts_selftest();
+ if (ret)
+ goto out;
+
+ ret = irq_timings_irqs_selftest();
+ if (ret)
+ goto out;
+
+ ret = irq_timings_next_index_selftest();
+out:
+ pr_info("---------- selftest end with %s -----------\n",
+ ret ? "failure" : "success");
+
+ return ret;
+}
+early_initcall(irq_timings_selftest);
+#endif
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index cbf9fb899d92..73f7e1fd4ab4 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
- * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra
*
* Provides a framework for enqueueing and running callbacks from hardirq
* context. The enqueueing is NMI-safe.
@@ -17,35 +18,54 @@
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/smp.h>
+#include <linux/smpboot.h>
#include <asm/processor.h>
+#include <linux/kasan.h>
+#include <trace/events/ipi.h>
static DEFINE_PER_CPU(struct llist_head, raised_list);
static DEFINE_PER_CPU(struct llist_head, lazy_list);
+static DEFINE_PER_CPU(struct task_struct *, irq_workd);
+
+static void wake_irq_workd(void)
+{
+ struct task_struct *tsk = __this_cpu_read(irq_workd);
+
+ if (!llist_empty(this_cpu_ptr(&lazy_list)) && tsk)
+ wake_up_process(tsk);
+}
+
+#ifdef CONFIG_SMP
+static void irq_work_wake(struct irq_work *entry)
+{
+ wake_irq_workd();
+}
+
+static DEFINE_PER_CPU(struct irq_work, irq_work_wakeup) =
+ IRQ_WORK_INIT_HARD(irq_work_wake);
+#endif
+
+static int irq_workd_should_run(unsigned int cpu)
+{
+ return !llist_empty(this_cpu_ptr(&lazy_list));
+}
/*
* Claim the entry so that no one else will poke at it.
*/
static bool irq_work_claim(struct irq_work *work)
{
- unsigned long flags, oflags, nflags;
+ int oflags;
+ oflags = atomic_fetch_or(IRQ_WORK_CLAIMED | CSD_TYPE_IRQ_WORK, &work->node.a_flags);
/*
- * Start with our best wish as a premise but only trust any
- * flag value after cmpxchg() result.
+ * If the work is already pending, no need to raise the IPI.
+ * The pairing smp_mb() in irq_work_single() makes sure
+ * everything we did before is visible.
*/
- flags = work->flags & ~IRQ_WORK_PENDING;
- for (;;) {
- nflags = flags | IRQ_WORK_FLAGS;
- oflags = cmpxchg(&work->flags, flags, nflags);
- if (oflags == flags)
- break;
- if (oflags & IRQ_WORK_PENDING)
- return false;
- flags = oflags;
- cpu_relax();
- }
-
+ if (oflags & IRQ_WORK_PENDING)
+ return false;
return true;
}
@@ -56,7 +76,58 @@ void __weak arch_irq_work_raise(void)
*/
}
-#ifdef CONFIG_SMP
+static __always_inline void irq_work_raise(struct irq_work *work)
+{
+ if (trace_ipi_send_cpu_enabled() && arch_irq_work_has_interrupt())
+ trace_ipi_send_cpu(smp_processor_id(), _RET_IP_, work->func);
+
+ arch_irq_work_raise();
+}
+
+/* Enqueue on current CPU, work must already be claimed and preempt disabled */
+static void __irq_work_queue_local(struct irq_work *work)
+{
+ struct llist_head *list;
+ bool rt_lazy_work = false;
+ bool lazy_work = false;
+ int work_flags;
+
+ work_flags = atomic_read(&work->node.a_flags);
+ if (work_flags & IRQ_WORK_LAZY)
+ lazy_work = true;
+ else if (IS_ENABLED(CONFIG_PREEMPT_RT) &&
+ !(work_flags & IRQ_WORK_HARD_IRQ))
+ rt_lazy_work = true;
+
+ if (lazy_work || rt_lazy_work)
+ list = this_cpu_ptr(&lazy_list);
+ else
+ list = this_cpu_ptr(&raised_list);
+
+ if (!llist_add(&work->node.llist, list))
+ return;
+
+ /* If the work is "lazy", handle it from next tick if any */
+ if (!lazy_work || tick_nohz_tick_stopped())
+ irq_work_raise(work);
+}
+
+/* Enqueue the irq work @work on the current CPU */
+bool irq_work_queue(struct irq_work *work)
+{
+ /* Only queue if not already pending */
+ if (!irq_work_claim(work))
+ return false;
+
+ /* Queue the entry and raise the IPI if needed. */
+ preempt_disable();
+ __irq_work_queue_local(work);
+ preempt_enable();
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(irq_work_queue);
+
/*
* Enqueue the irq_work @work on @cpu unless it's already pending
* somewhere.
@@ -65,49 +136,50 @@ void __weak arch_irq_work_raise(void)
*/
bool irq_work_queue_on(struct irq_work *work, int cpu)
{
+#ifndef CONFIG_SMP
+ return irq_work_queue(work);
+
+#else /* CONFIG_SMP: */
/* All work should have been flushed before going offline */
WARN_ON_ONCE(cpu_is_offline(cpu));
- /* Arch remote IPI send/receive backend aren't NMI safe */
- WARN_ON_ONCE(in_nmi());
-
/* Only queue if not already pending */
if (!irq_work_claim(work))
return false;
- if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
- arch_send_call_function_single_ipi(cpu);
+ kasan_record_aux_stack(work);
- return true;
-}
-EXPORT_SYMBOL_GPL(irq_work_queue_on);
-#endif
+ preempt_disable();
+ if (cpu != smp_processor_id()) {
+ /* Arch remote IPI send/receive backend aren't NMI safe */
+ WARN_ON_ONCE(in_nmi());
-/* Enqueue the irq work @work on the current CPU */
-bool irq_work_queue(struct irq_work *work)
-{
- /* Only queue if not already pending */
- if (!irq_work_claim(work))
- return false;
+ /*
+ * On PREEMPT_RT the items which are not marked as
+ * IRQ_WORK_HARD_IRQ are added to the lazy list and a HARD work
+ * item is used on the remote CPU to wake the thread.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) &&
+ !(atomic_read(&work->node.a_flags) & IRQ_WORK_HARD_IRQ)) {
- /* Queue the entry and raise the IPI if needed. */
- preempt_disable();
+ if (!llist_add(&work->node.llist, &per_cpu(lazy_list, cpu)))
+ goto out;
- /* If the work is "lazy", handle it from next tick if any */
- if (work->flags & IRQ_WORK_LAZY) {
- if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
- tick_nohz_tick_stopped())
- arch_irq_work_raise();
+ work = &per_cpu(irq_work_wakeup, cpu);
+ if (!irq_work_claim(work))
+ goto out;
+ }
+
+ __smp_call_single_queue(cpu, &work->node.llist);
} else {
- if (llist_add(&work->llnode, this_cpu_ptr(&raised_list)))
- arch_irq_work_raise();
+ __irq_work_queue_local(work);
}
-
+out:
preempt_enable();
return true;
+#endif /* CONFIG_SMP */
}
-EXPORT_SYMBOL_GPL(irq_work_queue);
bool irq_work_needs_cpu(void)
{
@@ -126,40 +198,58 @@ bool irq_work_needs_cpu(void)
return true;
}
+void irq_work_single(void *arg)
+{
+ struct irq_work *work = arg;
+ int flags;
+
+ /*
+ * Clear the PENDING bit, after this point the @work can be re-used.
+ * The PENDING bit acts as a lock, and we own it, so we can clear it
+ * without atomic ops.
+ */
+ flags = atomic_read(&work->node.a_flags);
+ flags &= ~IRQ_WORK_PENDING;
+ atomic_set(&work->node.a_flags, flags);
+
+ /*
+ * See irq_work_claim().
+ */
+ smp_mb();
+
+ lockdep_irq_work_enter(flags);
+ work->func(work);
+ lockdep_irq_work_exit(flags);
+
+ /*
+ * Clear the BUSY bit, if set, and return to the free state if no-one
+ * else claimed it meanwhile.
+ */
+ (void)atomic_cmpxchg(&work->node.a_flags, flags, flags & ~IRQ_WORK_BUSY);
+
+ if ((IS_ENABLED(CONFIG_PREEMPT_RT) && !irq_work_is_hard(work)) ||
+ !arch_irq_work_has_interrupt())
+ rcuwait_wake_up(&work->irqwait);
+}
+
static void irq_work_run_list(struct llist_head *list)
{
- unsigned long flags;
- struct irq_work *work;
+ struct irq_work *work, *tmp;
struct llist_node *llnode;
- BUG_ON(!irqs_disabled());
+ /*
+ * On PREEMPT_RT IRQ-work which is not marked as HARD will be processed
+ * in a per-CPU thread in preemptible context. Only the items which are
+ * marked as IRQ_WORK_HARD_IRQ will be processed in hardirq context.
+ */
+ BUG_ON(!irqs_disabled() && !IS_ENABLED(CONFIG_PREEMPT_RT));
if (llist_empty(list))
return;
llnode = llist_del_all(list);
- while (llnode != NULL) {
- work = llist_entry(llnode, struct irq_work, llnode);
-
- llnode = llist_next(llnode);
-
- /*
- * Clear the PENDING bit, after this point the @work
- * can be re-used.
- * Make it immediately visible so that other CPUs trying
- * to claim that work don't rely on us to handle their data
- * while we are in the middle of the func.
- */
- flags = work->flags & ~IRQ_WORK_PENDING;
- xchg(&work->flags, flags);
-
- work->func(work);
- /*
- * Clear the BUSY bit and return to the free state if
- * no-one else claimed it meanwhile.
- */
- (void)cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY);
- }
+ llist_for_each_entry_safe(work, tmp, llnode, node.llist)
+ irq_work_single(work);
}
/*
@@ -169,7 +259,10 @@ static void irq_work_run_list(struct llist_head *list)
void irq_work_run(void)
{
irq_work_run_list(this_cpu_ptr(&raised_list));
- irq_work_run_list(this_cpu_ptr(&lazy_list));
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ irq_work_run_list(this_cpu_ptr(&lazy_list));
+ else
+ wake_irq_workd();
}
EXPORT_SYMBOL_GPL(irq_work_run);
@@ -179,7 +272,11 @@ void irq_work_tick(void)
if (!llist_empty(raised) && !arch_irq_work_has_interrupt())
irq_work_run_list(raised);
- irq_work_run_list(this_cpu_ptr(&lazy_list));
+
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ irq_work_run_list(this_cpu_ptr(&lazy_list));
+ else
+ wake_irq_workd();
}
/*
@@ -188,9 +285,43 @@ void irq_work_tick(void)
*/
void irq_work_sync(struct irq_work *work)
{
- WARN_ON_ONCE(irqs_disabled());
+ lockdep_assert_irqs_enabled();
+ might_sleep();
+
+ if ((IS_ENABLED(CONFIG_PREEMPT_RT) && !irq_work_is_hard(work)) ||
+ !arch_irq_work_has_interrupt()) {
+ rcuwait_wait_event(&work->irqwait, !irq_work_is_busy(work),
+ TASK_UNINTERRUPTIBLE);
+ return;
+ }
- while (work->flags & IRQ_WORK_BUSY)
+ while (irq_work_is_busy(work))
cpu_relax();
}
EXPORT_SYMBOL_GPL(irq_work_sync);
+
+static void run_irq_workd(unsigned int cpu)
+{
+ irq_work_run_list(this_cpu_ptr(&lazy_list));
+}
+
+static void irq_workd_setup(unsigned int cpu)
+{
+ sched_set_fifo_low(current);
+}
+
+static struct smp_hotplug_thread irqwork_threads = {
+ .store = &irq_workd,
+ .setup = irq_workd_setup,
+ .thread_should_run = irq_workd_should_run,
+ .thread_fn = run_irq_workd,
+ .thread_comm = "irq_work/%u",
+};
+
+static __init int irq_work_init_threads(void)
+{
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ BUG_ON(smpboot_register_percpu_thread(&irqwork_threads));
+ return 0;
+}
+early_initcall(irq_work_init_threads);
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 9019f15deab2..7cb19e601426 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -1,8 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* jump label support
*
* Copyright (C) 2009 Jason Baron <jbaron@redhat.com>
- * Copyright (C) 2011 Peter Zijlstra <pzijlstr@redhat.com>
+ * Copyright (C) 2011 Peter Zijlstra
*
*/
#include <linux/memory.h>
@@ -14,10 +15,11 @@
#include <linux/err.h>
#include <linux/static_key.h>
#include <linux/jump_label_ratelimit.h>
+#include <linux/bug.h>
+#include <linux/cpu.h>
+#include <asm/sections.h>
-#ifdef HAVE_JUMP_LABEL
-
-/* mutex to protect coming/going of the the jump_label table */
+/* mutex to protect coming/going of the jump_label table */
static DEFINE_MUTEX(jump_label_mutex);
void jump_label_lock(void)
@@ -35,91 +37,336 @@ static int jump_label_cmp(const void *a, const void *b)
const struct jump_entry *jea = a;
const struct jump_entry *jeb = b;
- if (jea->key < jeb->key)
+ /*
+ * Entrires are sorted by key.
+ */
+ if (jump_entry_key(jea) < jump_entry_key(jeb))
+ return -1;
+
+ if (jump_entry_key(jea) > jump_entry_key(jeb))
+ return 1;
+
+ /*
+ * In the batching mode, entries should also be sorted by the code
+ * inside the already sorted list of entries, enabling a bsearch in
+ * the vector.
+ */
+ if (jump_entry_code(jea) < jump_entry_code(jeb))
return -1;
- if (jea->key > jeb->key)
+ if (jump_entry_code(jea) > jump_entry_code(jeb))
return 1;
return 0;
}
+static void jump_label_swap(void *a, void *b, int size)
+{
+ long delta = (unsigned long)a - (unsigned long)b;
+ struct jump_entry *jea = a;
+ struct jump_entry *jeb = b;
+ struct jump_entry tmp = *jea;
+
+ jea->code = jeb->code - delta;
+ jea->target = jeb->target - delta;
+ jea->key = jeb->key - delta;
+
+ jeb->code = tmp.code + delta;
+ jeb->target = tmp.target + delta;
+ jeb->key = tmp.key + delta;
+}
+
static void
jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop)
{
unsigned long size;
+ void *swapfn = NULL;
+
+ if (IS_ENABLED(CONFIG_HAVE_ARCH_JUMP_LABEL_RELATIVE))
+ swapfn = jump_label_swap;
size = (((unsigned long)stop - (unsigned long)start)
/ sizeof(struct jump_entry));
- sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL);
+ sort(start, size, sizeof(struct jump_entry), jump_label_cmp, swapfn);
+}
+
+static void jump_label_update(struct static_key *key);
+
+/*
+ * There are similar definitions for the !CONFIG_JUMP_LABEL case in jump_label.h.
+ * The use of 'atomic_read()' requires atomic.h and its problematic for some
+ * kernel headers such as kernel.h and others. Since static_key_count() is not
+ * used in the branch statements as it is for the !CONFIG_JUMP_LABEL case its ok
+ * to have it be a function here. Similarly, for 'static_key_enable()' and
+ * 'static_key_disable()', which require bug.h. This should allow jump_label.h
+ * to be included from most/all places for CONFIG_JUMP_LABEL.
+ */
+int static_key_count(struct static_key *key)
+{
+ /*
+ * -1 means the first static_key_slow_inc() is in progress.
+ * static_key_enabled() must return true, so return 1 here.
+ */
+ int n = atomic_read(&key->enabled);
+
+ return n >= 0 ? n : 1;
}
+EXPORT_SYMBOL_GPL(static_key_count);
-static void jump_label_update(struct static_key *key, int enable);
+/*
+ * static_key_fast_inc_not_disabled - adds a user for a static key
+ * @key: static key that must be already enabled
+ *
+ * The caller must make sure that the static key can't get disabled while
+ * in this function. It doesn't patch jump labels, only adds a user to
+ * an already enabled static key.
+ *
+ * Returns true if the increment was done. Unlike refcount_t the ref counter
+ * is not saturated, but will fail to increment on overflow.
+ */
+bool static_key_fast_inc_not_disabled(struct static_key *key)
+{
+ int v;
+
+ STATIC_KEY_CHECK_USE(key);
+ /*
+ * Negative key->enabled has a special meaning: it sends
+ * static_key_slow_inc/dec() down the slow path, and it is non-zero
+ * so it counts as "enabled" in jump_label_update().
+ *
+ * The INT_MAX overflow condition is either used by the networking
+ * code to reset or detected in the slow path of
+ * static_key_slow_inc_cpuslocked().
+ */
+ v = atomic_read(&key->enabled);
+ do {
+ if (v <= 0 || v == INT_MAX)
+ return false;
+ } while (!likely(atomic_try_cmpxchg(&key->enabled, &v, v + 1)));
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(static_key_fast_inc_not_disabled);
+
+bool static_key_slow_inc_cpuslocked(struct static_key *key)
+{
+ lockdep_assert_cpus_held();
+
+ /*
+ * Careful if we get concurrent static_key_slow_inc/dec() calls;
+ * later calls must wait for the first one to _finish_ the
+ * jump_label_update() process. At the same time, however,
+ * the jump_label_update() call below wants to see
+ * static_key_enabled(&key) for jumps to be updated properly.
+ */
+ if (static_key_fast_inc_not_disabled(key))
+ return true;
+
+ guard(mutex)(&jump_label_mutex);
+ /* Try to mark it as 'enabling in progress. */
+ if (!atomic_cmpxchg(&key->enabled, 0, -1)) {
+ jump_label_update(key);
+ /*
+ * Ensure that when static_key_fast_inc_not_disabled() or
+ * static_key_dec_not_one() observe the positive value,
+ * they must also observe all the text changes.
+ */
+ atomic_set_release(&key->enabled, 1);
+ } else {
+ /*
+ * While holding the mutex this should never observe
+ * anything else than a value >= 1 and succeed
+ */
+ if (WARN_ON_ONCE(!static_key_fast_inc_not_disabled(key)))
+ return false;
+ }
+ return true;
+}
-void static_key_slow_inc(struct static_key *key)
+bool static_key_slow_inc(struct static_key *key)
{
- STATIC_KEY_CHECK_USE();
- if (atomic_inc_not_zero(&key->enabled))
+ bool ret;
+
+ cpus_read_lock();
+ ret = static_key_slow_inc_cpuslocked(key);
+ cpus_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(static_key_slow_inc);
+
+void static_key_enable_cpuslocked(struct static_key *key)
+{
+ STATIC_KEY_CHECK_USE(key);
+ lockdep_assert_cpus_held();
+
+ if (atomic_read(&key->enabled) > 0) {
+ WARN_ON_ONCE(atomic_read(&key->enabled) != 1);
return;
+ }
jump_label_lock();
if (atomic_read(&key->enabled) == 0) {
- if (!jump_label_get_branch_default(key))
- jump_label_update(key, JUMP_LABEL_ENABLE);
- else
- jump_label_update(key, JUMP_LABEL_DISABLE);
+ atomic_set(&key->enabled, -1);
+ jump_label_update(key);
+ /*
+ * See static_key_slow_inc().
+ */
+ atomic_set_release(&key->enabled, 1);
}
- atomic_inc(&key->enabled);
jump_label_unlock();
}
-EXPORT_SYMBOL_GPL(static_key_slow_inc);
+EXPORT_SYMBOL_GPL(static_key_enable_cpuslocked);
+
+void static_key_enable(struct static_key *key)
+{
+ cpus_read_lock();
+ static_key_enable_cpuslocked(key);
+ cpus_read_unlock();
+}
+EXPORT_SYMBOL_GPL(static_key_enable);
-static void __static_key_slow_dec(struct static_key *key,
- unsigned long rate_limit, struct delayed_work *work)
+void static_key_disable_cpuslocked(struct static_key *key)
{
- if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) {
- WARN(atomic_read(&key->enabled) < 0,
- "jump label: negative count!\n");
+ STATIC_KEY_CHECK_USE(key);
+ lockdep_assert_cpus_held();
+
+ if (atomic_read(&key->enabled) != 1) {
+ WARN_ON_ONCE(atomic_read(&key->enabled) != 0);
return;
}
- if (rate_limit) {
- atomic_inc(&key->enabled);
- schedule_delayed_work(work, rate_limit);
- } else {
- if (!jump_label_get_branch_default(key))
- jump_label_update(key, JUMP_LABEL_DISABLE);
- else
- jump_label_update(key, JUMP_LABEL_ENABLE);
- }
+ jump_label_lock();
+ if (atomic_cmpxchg(&key->enabled, 1, 0) == 1)
+ jump_label_update(key);
jump_label_unlock();
}
+EXPORT_SYMBOL_GPL(static_key_disable_cpuslocked);
+
+void static_key_disable(struct static_key *key)
+{
+ cpus_read_lock();
+ static_key_disable_cpuslocked(key);
+ cpus_read_unlock();
+}
+EXPORT_SYMBOL_GPL(static_key_disable);
+
+static bool static_key_dec_not_one(struct static_key *key)
+{
+ int v;
+
+ /*
+ * Go into the slow path if key::enabled is less than or equal than
+ * one. One is valid to shut down the key, anything less than one
+ * is an imbalance, which is handled at the call site.
+ *
+ * That includes the special case of '-1' which is set in
+ * static_key_slow_inc_cpuslocked(), but that's harmless as it is
+ * fully serialized in the slow path below. By the time this task
+ * acquires the jump label lock the value is back to one and the
+ * retry under the lock must succeed.
+ */
+ v = atomic_read(&key->enabled);
+ do {
+ /*
+ * Warn about the '-1' case though; since that means a
+ * decrement is concurrent with a first (0->1) increment. IOW
+ * people are trying to disable something that wasn't yet fully
+ * enabled. This suggests an ordering problem on the user side.
+ */
+ WARN_ON_ONCE(v < 0);
+
+ /*
+ * Warn about underflow, and lie about success in an attempt to
+ * not make things worse.
+ */
+ if (WARN_ON_ONCE(v == 0))
+ return true;
+
+ if (v <= 1)
+ return false;
+ } while (!likely(atomic_try_cmpxchg(&key->enabled, &v, v - 1)));
+
+ return true;
+}
+
+static void __static_key_slow_dec_cpuslocked(struct static_key *key)
+{
+ lockdep_assert_cpus_held();
+ int val;
+
+ if (static_key_dec_not_one(key))
+ return;
+
+ guard(mutex)(&jump_label_mutex);
+ val = atomic_read(&key->enabled);
+ /*
+ * It should be impossible to observe -1 with jump_label_mutex held,
+ * see static_key_slow_inc_cpuslocked().
+ */
+ if (WARN_ON_ONCE(val == -1))
+ return;
+ /*
+ * Cannot already be 0, something went sideways.
+ */
+ if (WARN_ON_ONCE(val == 0))
+ return;
+
+ if (atomic_dec_and_test(&key->enabled))
+ jump_label_update(key);
+}
+
+static void __static_key_slow_dec(struct static_key *key)
+{
+ cpus_read_lock();
+ __static_key_slow_dec_cpuslocked(key);
+ cpus_read_unlock();
+}
-static void jump_label_update_timeout(struct work_struct *work)
+void jump_label_update_timeout(struct work_struct *work)
{
struct static_key_deferred *key =
container_of(work, struct static_key_deferred, work.work);
- __static_key_slow_dec(&key->key, 0, NULL);
+ __static_key_slow_dec(&key->key);
}
+EXPORT_SYMBOL_GPL(jump_label_update_timeout);
void static_key_slow_dec(struct static_key *key)
{
- STATIC_KEY_CHECK_USE();
- __static_key_slow_dec(key, 0, NULL);
+ STATIC_KEY_CHECK_USE(key);
+ __static_key_slow_dec(key);
}
EXPORT_SYMBOL_GPL(static_key_slow_dec);
-void static_key_slow_dec_deferred(struct static_key_deferred *key)
+void static_key_slow_dec_cpuslocked(struct static_key *key)
+{
+ STATIC_KEY_CHECK_USE(key);
+ __static_key_slow_dec_cpuslocked(key);
+}
+
+void __static_key_slow_dec_deferred(struct static_key *key,
+ struct delayed_work *work,
+ unsigned long timeout)
{
- STATIC_KEY_CHECK_USE();
- __static_key_slow_dec(&key->key, key->timeout, &key->work);
+ STATIC_KEY_CHECK_USE(key);
+
+ if (static_key_dec_not_one(key))
+ return;
+
+ schedule_delayed_work(work, timeout);
+}
+EXPORT_SYMBOL_GPL(__static_key_slow_dec_deferred);
+
+void __static_key_deferred_flush(void *key, struct delayed_work *work)
+{
+ STATIC_KEY_CHECK_USE(key);
+ flush_delayed_work(work);
}
-EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred);
+EXPORT_SYMBOL_GPL(__static_key_deferred_flush);
void jump_label_rate_limit(struct static_key_deferred *key,
unsigned long rl)
{
- STATIC_KEY_CHECK_USE();
+ STATIC_KEY_CHECK_USE(key);
key->timeout = rl;
INIT_DELAYED_WORK(&key->work, jump_label_update_timeout);
}
@@ -127,67 +374,153 @@ EXPORT_SYMBOL_GPL(jump_label_rate_limit);
static int addr_conflict(struct jump_entry *entry, void *start, void *end)
{
- if (entry->code <= (unsigned long)end &&
- entry->code + JUMP_LABEL_NOP_SIZE > (unsigned long)start)
+ if (jump_entry_code(entry) <= (unsigned long)end &&
+ jump_entry_code(entry) + jump_entry_size(entry) > (unsigned long)start)
return 1;
return 0;
}
static int __jump_label_text_reserved(struct jump_entry *iter_start,
- struct jump_entry *iter_stop, void *start, void *end)
+ struct jump_entry *iter_stop, void *start, void *end, bool init)
{
struct jump_entry *iter;
iter = iter_start;
while (iter < iter_stop) {
- if (addr_conflict(iter, start, end))
- return 1;
+ if (init || !jump_entry_is_init(iter)) {
+ if (addr_conflict(iter, start, end))
+ return 1;
+ }
iter++;
}
return 0;
}
-/*
- * Update code which is definitely not currently executing.
- * Architectures which need heavyweight synchronization to modify
- * running code can override this to make the non-live update case
- * cheaper.
+#ifndef arch_jump_label_transform_static
+static void arch_jump_label_transform_static(struct jump_entry *entry,
+ enum jump_label_type type)
+{
+ /* nothing to do on most architectures */
+}
+#endif
+
+static inline struct jump_entry *static_key_entries(struct static_key *key)
+{
+ WARN_ON_ONCE(key->type & JUMP_TYPE_LINKED);
+ return (struct jump_entry *)(key->type & ~JUMP_TYPE_MASK);
+}
+
+static inline bool static_key_type(struct static_key *key)
+{
+ return key->type & JUMP_TYPE_TRUE;
+}
+
+static inline bool static_key_linked(struct static_key *key)
+{
+ return key->type & JUMP_TYPE_LINKED;
+}
+
+static inline void static_key_clear_linked(struct static_key *key)
+{
+ key->type &= ~JUMP_TYPE_LINKED;
+}
+
+static inline void static_key_set_linked(struct static_key *key)
+{
+ key->type |= JUMP_TYPE_LINKED;
+}
+
+/***
+ * A 'struct static_key' uses a union such that it either points directly
+ * to a table of 'struct jump_entry' or to a linked list of modules which in
+ * turn point to 'struct jump_entry' tables.
+ *
+ * The two lower bits of the pointer are used to keep track of which pointer
+ * type is in use and to store the initial branch direction, we use an access
+ * function which preserves these bits.
*/
-void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry *entry,
- enum jump_label_type type)
+static void static_key_set_entries(struct static_key *key,
+ struct jump_entry *entries)
{
- arch_jump_label_transform(entry, type);
+ unsigned long type;
+
+ WARN_ON_ONCE((unsigned long)entries & JUMP_TYPE_MASK);
+ type = key->type & JUMP_TYPE_MASK;
+ key->entries = entries;
+ key->type |= type;
}
-static void __jump_label_update(struct static_key *key,
- struct jump_entry *entry,
- struct jump_entry *stop, int enable)
+static enum jump_label_type jump_label_type(struct jump_entry *entry)
{
- for (; (entry < stop) &&
- (entry->key == (jump_label_t)(unsigned long)key);
- entry++) {
+ struct static_key *key = jump_entry_key(entry);
+ bool enabled = static_key_enabled(key);
+ bool branch = jump_entry_is_branch(entry);
+
+ /* See the comment in linux/jump_label.h */
+ return enabled ^ branch;
+}
+
+static bool jump_label_can_update(struct jump_entry *entry, bool init)
+{
+ /*
+ * Cannot update code that was in an init text area.
+ */
+ if (!init && jump_entry_is_init(entry))
+ return false;
+
+ if (!kernel_text_address(jump_entry_code(entry))) {
/*
- * entry->code set to 0 invalidates module init text sections
- * kernel_text_address() verifies we are not in core kernel
- * init code, see jump_label_invalidate_module_init().
+ * This skips patching built-in __exit, which
+ * is part of init_section_contains() but is
+ * not part of kernel_text_address().
+ *
+ * Skipping built-in __exit is fine since it
+ * will never be executed.
*/
- if (entry->code && kernel_text_address(entry->code))
- arch_jump_label_transform(entry, enable);
+ WARN_ONCE(!jump_entry_is_init(entry),
+ "can't patch jump_label at %pS",
+ (void *)jump_entry_code(entry));
+ return false;
}
+
+ return true;
}
-static enum jump_label_type jump_label_type(struct static_key *key)
+#ifndef HAVE_JUMP_LABEL_BATCH
+static void __jump_label_update(struct static_key *key,
+ struct jump_entry *entry,
+ struct jump_entry *stop,
+ bool init)
{
- bool true_branch = jump_label_get_branch_default(key);
- bool state = static_key_enabled(key);
+ for (; (entry < stop) && (jump_entry_key(entry) == key); entry++) {
+ if (jump_label_can_update(entry, init))
+ arch_jump_label_transform(entry, jump_label_type(entry));
+ }
+}
+#else
+static void __jump_label_update(struct static_key *key,
+ struct jump_entry *entry,
+ struct jump_entry *stop,
+ bool init)
+{
+ for (; (entry < stop) && (jump_entry_key(entry) == key); entry++) {
- if ((!true_branch && state) || (true_branch && !state))
- return JUMP_LABEL_ENABLE;
+ if (!jump_label_can_update(entry, init))
+ continue;
- return JUMP_LABEL_DISABLE;
+ if (!arch_jump_label_transform_queue(entry, jump_label_type(entry))) {
+ /*
+ * Queue is full: Apply the current queue and try again.
+ */
+ arch_jump_label_transform_apply();
+ BUG_ON(!arch_jump_label_transform_queue(entry, jump_label_type(entry)));
+ }
+ }
+ arch_jump_label_transform_apply();
}
+#endif
void __init jump_label_init(void)
{
@@ -196,87 +529,170 @@ void __init jump_label_init(void)
struct static_key *key = NULL;
struct jump_entry *iter;
+ /*
+ * Since we are initializing the static_key.enabled field with
+ * with the 'raw' int values (to avoid pulling in atomic.h) in
+ * jump_label.h, let's make sure that is safe. There are only two
+ * cases to check since we initialize to 0 or 1.
+ */
+ BUILD_BUG_ON((int)ATOMIC_INIT(0) != 0);
+ BUILD_BUG_ON((int)ATOMIC_INIT(1) != 1);
+
+ if (static_key_initialized)
+ return;
+
+ cpus_read_lock();
jump_label_lock();
jump_label_sort_entries(iter_start, iter_stop);
for (iter = iter_start; iter < iter_stop; iter++) {
struct static_key *iterk;
+ bool in_init;
+
+ /* rewrite NOPs */
+ if (jump_label_type(iter) == JUMP_LABEL_NOP)
+ arch_jump_label_transform_static(iter, JUMP_LABEL_NOP);
+
+ in_init = init_section_contains((void *)jump_entry_code(iter), 1);
+ jump_entry_set_init(iter, in_init);
- iterk = (struct static_key *)(unsigned long)iter->key;
- arch_jump_label_transform_static(iter, jump_label_type(iterk));
+ iterk = jump_entry_key(iter);
if (iterk == key)
continue;
key = iterk;
- /*
- * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH.
- */
- *((unsigned long *)&key->entries) += (unsigned long)iter;
-#ifdef CONFIG_MODULES
- key->next = NULL;
-#endif
+ static_key_set_entries(key, iter);
}
static_key_initialized = true;
jump_label_unlock();
+ cpus_read_unlock();
+}
+
+static inline bool static_key_sealed(struct static_key *key)
+{
+ return (key->type & JUMP_TYPE_LINKED) && !(key->type & ~JUMP_TYPE_MASK);
+}
+
+static inline void static_key_seal(struct static_key *key)
+{
+ unsigned long type = key->type & JUMP_TYPE_TRUE;
+ key->type = JUMP_TYPE_LINKED | type;
+}
+
+void jump_label_init_ro(void)
+{
+ struct jump_entry *iter_start = __start___jump_table;
+ struct jump_entry *iter_stop = __stop___jump_table;
+ struct jump_entry *iter;
+
+ if (WARN_ON_ONCE(!static_key_initialized))
+ return;
+
+ cpus_read_lock();
+ jump_label_lock();
+
+ for (iter = iter_start; iter < iter_stop; iter++) {
+ struct static_key *iterk = jump_entry_key(iter);
+
+ if (!is_kernel_ro_after_init((unsigned long)iterk))
+ continue;
+
+ if (static_key_sealed(iterk))
+ continue;
+
+ static_key_seal(iterk);
+ }
+
+ jump_label_unlock();
+ cpus_read_unlock();
}
#ifdef CONFIG_MODULES
+enum jump_label_type jump_label_init_type(struct jump_entry *entry)
+{
+ struct static_key *key = jump_entry_key(entry);
+ bool type = static_key_type(key);
+ bool branch = jump_entry_is_branch(entry);
+
+ /* See the comment in linux/jump_label.h */
+ return type ^ branch;
+}
+
struct static_key_mod {
struct static_key_mod *next;
struct jump_entry *entries;
struct module *mod;
};
+static inline struct static_key_mod *static_key_mod(struct static_key *key)
+{
+ WARN_ON_ONCE(!static_key_linked(key));
+ return (struct static_key_mod *)(key->type & ~JUMP_TYPE_MASK);
+}
+
+/***
+ * key->type and key->next are the same via union.
+ * This sets key->next and preserves the type bits.
+ *
+ * See additional comments above static_key_set_entries().
+ */
+static void static_key_set_mod(struct static_key *key,
+ struct static_key_mod *mod)
+{
+ unsigned long type;
+
+ WARN_ON_ONCE((unsigned long)mod & JUMP_TYPE_MASK);
+ type = key->type & JUMP_TYPE_MASK;
+ key->next = mod;
+ key->type |= type;
+}
+
static int __jump_label_mod_text_reserved(void *start, void *end)
{
struct module *mod;
+ int ret;
- mod = __module_text_address((unsigned long)start);
+ scoped_guard(rcu) {
+ mod = __module_text_address((unsigned long)start);
+ WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod);
+ if (!try_module_get(mod))
+ mod = NULL;
+ }
if (!mod)
return 0;
- WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod);
-
- return __jump_label_text_reserved(mod->jump_entries,
+ ret = __jump_label_text_reserved(mod->jump_entries,
mod->jump_entries + mod->num_jump_entries,
- start, end);
-}
+ start, end, mod->state == MODULE_STATE_COMING);
-static void __jump_label_mod_update(struct static_key *key, int enable)
-{
- struct static_key_mod *mod = key->next;
+ module_put(mod);
- while (mod) {
- struct module *m = mod->mod;
-
- __jump_label_update(key, mod->entries,
- m->jump_entries + m->num_jump_entries,
- enable);
- mod = mod->next;
- }
+ return ret;
}
-/***
- * apply_jump_label_nops - patch module jump labels with arch_get_jump_label_nop()
- * @mod: module to patch
- *
- * Allow for run-time selection of the optimal nops. Before the module
- * loads patch these with arch_get_jump_label_nop(), which is specified by
- * the arch specific jump label code.
- */
-void jump_label_apply_nops(struct module *mod)
+static void __jump_label_mod_update(struct static_key *key)
{
- struct jump_entry *iter_start = mod->jump_entries;
- struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
- struct jump_entry *iter;
+ struct static_key_mod *mod;
- /* if the module doesn't have jump label entries, just return */
- if (iter_start == iter_stop)
- return;
+ for (mod = static_key_mod(key); mod; mod = mod->next) {
+ struct jump_entry *stop;
+ struct module *m;
- for (iter = iter_start; iter < iter_stop; iter++) {
- arch_jump_label_transform_static(iter, JUMP_LABEL_DISABLE);
+ /*
+ * NULL if the static_key is defined in a module
+ * that does not use it
+ */
+ if (!mod->entries)
+ continue;
+
+ m = mod->mod;
+ if (!m)
+ stop = __stop___jump_table;
+ else
+ stop = m->jump_entries + m->num_jump_entries;
+ __jump_label_update(key, mod->entries, stop,
+ m && m->state == MODULE_STATE_COMING);
}
}
@@ -286,7 +702,7 @@ static int jump_label_add_module(struct module *mod)
struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
struct jump_entry *iter;
struct static_key *key = NULL;
- struct static_key_mod *jlm;
+ struct static_key_mod *jlm, *jlm2;
/* if the module doesn't have jump label entries, just return */
if (iter_start == iter_stop)
@@ -296,30 +712,57 @@ static int jump_label_add_module(struct module *mod)
for (iter = iter_start; iter < iter_stop; iter++) {
struct static_key *iterk;
+ bool in_init;
+
+ in_init = within_module_init(jump_entry_code(iter), mod);
+ jump_entry_set_init(iter, in_init);
- iterk = (struct static_key *)(unsigned long)iter->key;
+ iterk = jump_entry_key(iter);
if (iterk == key)
continue;
key = iterk;
- if (__module_address(iter->key) == mod) {
- /*
- * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH.
- */
- *((unsigned long *)&key->entries) += (unsigned long)iter;
- key->next = NULL;
+ if (within_module((unsigned long)key, mod)) {
+ static_key_set_entries(key, iter);
continue;
}
+
+ /*
+ * If the key was sealed at init, then there's no need to keep a
+ * reference to its module entries - just patch them now and be
+ * done with it.
+ */
+ if (static_key_sealed(key))
+ goto do_poke;
+
jlm = kzalloc(sizeof(struct static_key_mod), GFP_KERNEL);
if (!jlm)
return -ENOMEM;
+ if (!static_key_linked(key)) {
+ jlm2 = kzalloc(sizeof(struct static_key_mod),
+ GFP_KERNEL);
+ if (!jlm2) {
+ kfree(jlm);
+ return -ENOMEM;
+ }
+ scoped_guard(rcu)
+ jlm2->mod = __module_address((unsigned long)key);
+
+ jlm2->entries = static_key_entries(key);
+ jlm2->next = NULL;
+ static_key_set_mod(key, jlm2);
+ static_key_set_linked(key);
+ }
jlm->mod = mod;
jlm->entries = iter;
- jlm->next = key->next;
- key->next = jlm;
-
- if (jump_label_type(key) == JUMP_LABEL_ENABLE)
- __jump_label_update(key, iter, iter_stop, JUMP_LABEL_ENABLE);
+ jlm->next = static_key_mod(key);
+ static_key_set_mod(key, jlm);
+ static_key_set_linked(key);
+
+ /* Only update if we've changed from our initial state */
+do_poke:
+ if (jump_label_type(iter) != jump_label_init_type(iter))
+ __jump_label_update(key, iter, iter_stop, true);
}
return 0;
@@ -334,38 +777,48 @@ static void jump_label_del_module(struct module *mod)
struct static_key_mod *jlm, **prev;
for (iter = iter_start; iter < iter_stop; iter++) {
- if (iter->key == (jump_label_t)(unsigned long)key)
+ if (jump_entry_key(iter) == key)
continue;
- key = (struct static_key *)(unsigned long)iter->key;
+ key = jump_entry_key(iter);
+
+ if (within_module((unsigned long)key, mod))
+ continue;
+
+ /* No @jlm allocated because key was sealed at init. */
+ if (static_key_sealed(key))
+ continue;
- if (__module_address(iter->key) == mod)
+ /* No memory during module load */
+ if (WARN_ON(!static_key_linked(key)))
continue;
prev = &key->next;
- jlm = key->next;
+ jlm = static_key_mod(key);
while (jlm && jlm->mod != mod) {
prev = &jlm->next;
jlm = jlm->next;
}
- if (jlm) {
+ /* No memory during module load */
+ if (WARN_ON(!jlm))
+ continue;
+
+ if (prev == &key->next)
+ static_key_set_mod(key, jlm->next);
+ else
*prev = jlm->next;
- kfree(jlm);
- }
- }
-}
-static void jump_label_invalidate_module_init(struct module *mod)
-{
- struct jump_entry *iter_start = mod->jump_entries;
- struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
- struct jump_entry *iter;
+ kfree(jlm);
- for (iter = iter_start; iter < iter_stop; iter++) {
- if (within_module_init(iter->code, mod))
- iter->code = 0;
+ jlm = static_key_mod(key);
+ /* if only one etry is left, fold it back into the static_key */
+ if (jlm->next == NULL) {
+ static_key_set_entries(key, jlm->entries);
+ static_key_clear_linked(key);
+ kfree(jlm);
+ }
}
}
@@ -376,30 +829,29 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val,
struct module *mod = data;
int ret = 0;
+ cpus_read_lock();
+ jump_label_lock();
+
switch (val) {
case MODULE_STATE_COMING:
- jump_label_lock();
ret = jump_label_add_module(mod);
- if (ret)
+ if (ret) {
+ WARN(1, "Failed to allocate memory: jump_label may not work properly.\n");
jump_label_del_module(mod);
- jump_label_unlock();
+ }
break;
case MODULE_STATE_GOING:
- jump_label_lock();
jump_label_del_module(mod);
- jump_label_unlock();
- break;
- case MODULE_STATE_LIVE:
- jump_label_lock();
- jump_label_invalidate_module_init(mod);
- jump_label_unlock();
break;
}
+ jump_label_unlock();
+ cpus_read_unlock();
+
return notifier_from_errno(ret);
}
-struct notifier_block jump_label_module_nb = {
+static struct notifier_block jump_label_module_nb = {
.notifier_call = jump_label_module_notify,
.priority = 1, /* higher than tracepoints */
};
@@ -427,8 +879,9 @@ early_initcall(jump_label_init_module);
*/
int jump_label_text_reserved(void *start, void *end)
{
+ bool init = system_state < SYSTEM_RUNNING;
int ret = __jump_label_text_reserved(__start___jump_table,
- __stop___jump_table, start, end);
+ __stop___jump_table, start, end, init);
if (ret)
return ret;
@@ -439,22 +892,66 @@ int jump_label_text_reserved(void *start, void *end)
return ret;
}
-static void jump_label_update(struct static_key *key, int enable)
+static void jump_label_update(struct static_key *key)
{
struct jump_entry *stop = __stop___jump_table;
- struct jump_entry *entry = jump_label_get_entries(key);
-
+ bool init = system_state < SYSTEM_RUNNING;
+ struct jump_entry *entry;
#ifdef CONFIG_MODULES
- struct module *mod = __module_address((unsigned long)key);
+ struct module *mod;
- __jump_label_mod_update(key, enable);
+ if (static_key_linked(key)) {
+ __jump_label_mod_update(key);
+ return;
+ }
- if (mod)
- stop = mod->jump_entries + mod->num_jump_entries;
+ scoped_guard(rcu) {
+ mod = __module_address((unsigned long)key);
+ if (mod) {
+ stop = mod->jump_entries + mod->num_jump_entries;
+ init = mod->state == MODULE_STATE_COMING;
+ }
+ }
#endif
+ entry = static_key_entries(key);
/* if there are no users, entry can be NULL */
if (entry)
- __jump_label_update(key, entry, stop, enable);
+ __jump_label_update(key, entry, stop, init);
}
-#endif
+#ifdef CONFIG_STATIC_KEYS_SELFTEST
+static DEFINE_STATIC_KEY_TRUE(sk_true);
+static DEFINE_STATIC_KEY_FALSE(sk_false);
+
+static __init int jump_label_test(void)
+{
+ int i;
+
+ for (i = 0; i < 2; i++) {
+ WARN_ON(static_key_enabled(&sk_true.key) != true);
+ WARN_ON(static_key_enabled(&sk_false.key) != false);
+
+ WARN_ON(!static_branch_likely(&sk_true));
+ WARN_ON(!static_branch_unlikely(&sk_true));
+ WARN_ON(static_branch_likely(&sk_false));
+ WARN_ON(static_branch_unlikely(&sk_false));
+
+ static_branch_disable(&sk_true);
+ static_branch_enable(&sk_false);
+
+ WARN_ON(static_key_enabled(&sk_true.key) == true);
+ WARN_ON(static_key_enabled(&sk_false.key) == false);
+
+ WARN_ON(static_branch_likely(&sk_true));
+ WARN_ON(static_branch_unlikely(&sk_true));
+ WARN_ON(!static_branch_likely(&sk_false));
+ WARN_ON(!static_branch_unlikely(&sk_false));
+
+ static_branch_enable(&sk_true);
+ static_branch_disable(&sk_false);
+ }
+
+ return 0;
+}
+early_initcall(jump_label_test);
+#endif /* STATIC_KEYS_SELFTEST */
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 5c5987f10819..049e296f586c 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kallsyms.c: in-kernel printing of symbolic oopses and stack traces.
*
@@ -12,7 +13,6 @@
* compression (see scripts/kallsyms.c for a more complete description)
*/
#include <linux/kallsyms.h>
-#include <linux/module.h>
#include <linux/init.h>
#include <linux/seq_file.h>
#include <linux/fs.h>
@@ -20,68 +20,19 @@
#include <linux/err.h>
#include <linux/proc_fs.h>
#include <linux/sched.h> /* for cond_resched */
-#include <linux/mm.h>
#include <linux/ctype.h>
#include <linux/slab.h>
+#include <linux/filter.h>
+#include <linux/ftrace.h>
+#include <linux/kprobes.h>
+#include <linux/build_bug.h>
#include <linux/compiler.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/bsearch.h>
+#include <linux/btf_ids.h>
-#include <asm/sections.h>
-
-#ifdef CONFIG_KALLSYMS_ALL
-#define all_var 1
-#else
-#define all_var 0
-#endif
-
-/*
- * These will be re-linked against their real values
- * during the second link stage.
- */
-extern const unsigned long kallsyms_addresses[] __weak;
-extern const u8 kallsyms_names[] __weak;
-
-/*
- * Tell the compiler that the count isn't in the small data section if the arch
- * has one (eg: FRV).
- */
-extern const unsigned long kallsyms_num_syms
-__attribute__((weak, section(".rodata")));
-
-extern const u8 kallsyms_token_table[] __weak;
-extern const u16 kallsyms_token_index[] __weak;
-
-extern const unsigned long kallsyms_markers[] __weak;
-
-static inline int is_kernel_inittext(unsigned long addr)
-{
- if (addr >= (unsigned long)_sinittext
- && addr <= (unsigned long)_einittext)
- return 1;
- return 0;
-}
-
-static inline int is_kernel_text(unsigned long addr)
-{
- if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) ||
- arch_is_kernel_text(addr))
- return 1;
- return in_gate_area_no_mm(addr);
-}
-
-static inline int is_kernel(unsigned long addr)
-{
- if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end)
- return 1;
- return in_gate_area_no_mm(addr);
-}
-
-static int is_ksym_addr(unsigned long addr)
-{
- if (all_var)
- return is_kernel(addr);
-
- return is_kernel_text(addr) || is_kernel_inittext(addr);
-}
+#include "kallsyms_internal.h"
/*
* Expand a compressed symbol data into the resulting uncompressed string,
@@ -92,18 +43,27 @@ static unsigned int kallsyms_expand_symbol(unsigned int off,
char *result, size_t maxlen)
{
int len, skipped_first = 0;
- const u8 *tptr, *data;
+ const char *tptr;
+ const u8 *data;
/* Get the compressed symbol length from the first symbol byte. */
data = &kallsyms_names[off];
len = *data;
data++;
+ off++;
+
+ /* If MSB is 1, it is a "big" symbol, so needs an additional byte. */
+ if ((len & 0x80) != 0) {
+ len = (len & 0x7F) | (*data << 7);
+ data++;
+ off++;
+ }
/*
* Update the offset to return the offset for the next symbol on
* the compressed stream.
*/
- off += len + 1;
+ off += len;
/*
* For every byte on the compressed symbol data, copy the table
@@ -143,8 +103,11 @@ static char kallsyms_get_symbol_type(unsigned int off)
{
/*
* Get just the first code, look it up in the token table,
- * and return the first char from this token.
+ * and return the first char from this token. If MSB of length
+ * is 1, it is a "big" symbol, so needs an additional byte.
*/
+ if (kallsyms_names[off] & 0x80)
+ off++;
return kallsyms_token_table[kallsyms_token_index[kallsyms_names[off + 1]]];
}
@@ -156,7 +119,7 @@ static char kallsyms_get_symbol_type(unsigned int off)
static unsigned int get_symbol_offset(unsigned long pos)
{
const u8 *name;
- int i;
+ int i, len;
/*
* Use the closest marker we have. We have markers every 256 positions,
@@ -170,31 +133,116 @@ static unsigned int get_symbol_offset(unsigned long pos)
* so we just need to add the len to the current pointer for every
* symbol we wish to skip.
*/
- for (i = 0; i < (pos & 0xFF); i++)
- name = name + (*name) + 1;
+ for (i = 0; i < (pos & 0xFF); i++) {
+ len = *name;
+
+ /*
+ * If MSB is 1, it is a "big" symbol, so we need to look into
+ * the next byte (and skip it, too).
+ */
+ if ((len & 0x80) != 0)
+ len = ((len & 0x7F) | (name[1] << 7)) + 1;
+
+ name = name + len + 1;
+ }
return name - kallsyms_names;
}
-/* Lookup the address for this symbol. Returns 0 if not found. */
-unsigned long kallsyms_lookup_name(const char *name)
+unsigned long kallsyms_sym_address(int idx)
+{
+ /* values are unsigned offsets */
+ return kallsyms_relative_base + (u32)kallsyms_offsets[idx];
+}
+
+static unsigned int get_symbol_seq(int index)
+{
+ unsigned int i, seq = 0;
+
+ for (i = 0; i < 3; i++)
+ seq = (seq << 8) | kallsyms_seqs_of_names[3 * index + i];
+
+ return seq;
+}
+
+static int kallsyms_lookup_names(const char *name,
+ unsigned int *start,
+ unsigned int *end)
{
+ int ret;
+ int low, mid, high;
+ unsigned int seq, off;
char namebuf[KSYM_NAME_LEN];
- unsigned long i;
- unsigned int off;
- for (i = 0, off = 0; i < kallsyms_num_syms; i++) {
- off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
+ low = 0;
+ high = kallsyms_num_syms - 1;
- if (strcmp(namebuf, name) == 0)
- return kallsyms_addresses[i];
+ while (low <= high) {
+ mid = low + (high - low) / 2;
+ seq = get_symbol_seq(mid);
+ off = get_symbol_offset(seq);
+ kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
+ ret = strcmp(name, namebuf);
+ if (ret > 0)
+ low = mid + 1;
+ else if (ret < 0)
+ high = mid - 1;
+ else
+ break;
+ }
+
+ if (low > high)
+ return -ESRCH;
+
+ low = mid;
+ while (low) {
+ seq = get_symbol_seq(low - 1);
+ off = get_symbol_offset(seq);
+ kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
+ if (strcmp(name, namebuf))
+ break;
+ low--;
+ }
+ *start = low;
+
+ if (end) {
+ high = mid;
+ while (high < kallsyms_num_syms - 1) {
+ seq = get_symbol_seq(high + 1);
+ off = get_symbol_offset(seq);
+ kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
+ if (strcmp(name, namebuf))
+ break;
+ high++;
+ }
+ *end = high;
}
+
+ return 0;
+}
+
+/* Lookup the address for this symbol. Returns 0 if not found. */
+unsigned long kallsyms_lookup_name(const char *name)
+{
+ int ret;
+ unsigned int i;
+
+ /* Skip the search for empty string. */
+ if (!*name)
+ return 0;
+
+ ret = kallsyms_lookup_names(name, &i, NULL);
+ if (!ret)
+ return kallsyms_sym_address(get_symbol_seq(i));
+
return module_kallsyms_lookup_name(name);
}
-EXPORT_SYMBOL_GPL(kallsyms_lookup_name);
-int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
- unsigned long),
+/*
+ * Iterate over all symbols in vmlinux. For symbols from modules use
+ * module_kallsyms_on_each_symbol instead.
+ */
+int kallsyms_on_each_symbol(int (*fn)(void *, const char *, unsigned long),
void *data)
{
char namebuf[KSYM_NAME_LEN];
@@ -204,13 +252,31 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
for (i = 0, off = 0; i < kallsyms_num_syms; i++) {
off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
- ret = fn(data, namebuf, NULL, kallsyms_addresses[i]);
+ ret = fn(data, namebuf, kallsyms_sym_address(i));
if (ret != 0)
return ret;
+ cond_resched();
}
- return module_kallsyms_on_each_symbol(fn, data);
+ return 0;
+}
+
+int kallsyms_on_each_match_symbol(int (*fn)(void *, unsigned long),
+ const char *name, void *data)
+{
+ int ret;
+ unsigned int i, start, end;
+
+ ret = kallsyms_lookup_names(name, &start, &end);
+ if (ret)
+ return 0;
+
+ for (i = start; !ret && i <= end; i++) {
+ ret = fn(data, kallsyms_sym_address(get_symbol_seq(i)));
+ cond_resched();
+ }
+
+ return ret;
}
-EXPORT_SYMBOL_GPL(kallsyms_on_each_symbol);
static unsigned long get_symbol_pos(unsigned long addr,
unsigned long *symbolsize,
@@ -219,16 +285,13 @@ static unsigned long get_symbol_pos(unsigned long addr,
unsigned long symbol_start = 0, symbol_end = 0;
unsigned long i, low, high, mid;
- /* This kernel should never had been booted. */
- BUG_ON(!kallsyms_addresses);
-
- /* Do a binary search on the sorted kallsyms_addresses array. */
+ /* Do a binary search on the sorted kallsyms_offsets array. */
low = 0;
high = kallsyms_num_syms;
while (high - low > 1) {
mid = low + (high - low) / 2;
- if (kallsyms_addresses[mid] <= addr)
+ if (kallsyms_sym_address(mid) <= addr)
low = mid;
else
high = mid;
@@ -238,15 +301,15 @@ static unsigned long get_symbol_pos(unsigned long addr,
* Search for the first aliased symbol. Aliased
* symbols are symbols with the same address.
*/
- while (low && kallsyms_addresses[low-1] == kallsyms_addresses[low])
+ while (low && kallsyms_sym_address(low-1) == kallsyms_sym_address(low))
--low;
- symbol_start = kallsyms_addresses[low];
+ symbol_start = kallsyms_sym_address(low);
/* Search for next non-aliased symbol. */
for (i = low + 1; i < kallsyms_num_syms; i++) {
- if (kallsyms_addresses[i] > symbol_start) {
- symbol_end = kallsyms_addresses[i];
+ if (kallsyms_sym_address(i) > symbol_start) {
+ symbol_end = kallsyms_sym_address(i);
break;
}
}
@@ -255,7 +318,7 @@ static unsigned long get_symbol_pos(unsigned long addr,
if (!symbol_end) {
if (is_kernel_inittext(addr))
symbol_end = (unsigned long)_einittext;
- else if (all_var)
+ else if (IS_ENABLED(CONFIG_KALLSYMS_ALL))
symbol_end = (unsigned long)_end;
else
symbol_end = (unsigned long)_etext;
@@ -276,24 +339,22 @@ int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize,
unsigned long *offset)
{
char namebuf[KSYM_NAME_LEN];
- if (is_ksym_addr(addr))
- return !!get_symbol_pos(addr, symbolsize, offset);
- return !!module_address_lookup(addr, symbolsize, offset, NULL, namebuf);
+ if (is_ksym_addr(addr)) {
+ get_symbol_pos(addr, symbolsize, offset);
+ return 1;
+ }
+ return !!module_address_lookup(addr, symbolsize, offset, NULL, NULL, namebuf) ||
+ !!__bpf_address_lookup(addr, symbolsize, offset, namebuf);
}
-/*
- * Lookup an address
- * - modname is set to NULL if it's in the kernel.
- * - We guarantee that the returned name is valid until we reschedule even if.
- * It resides in a module.
- * - We also guarantee that modname will be valid until rescheduled.
- */
-const char *kallsyms_lookup(unsigned long addr,
- unsigned long *symbolsize,
- unsigned long *offset,
- char **modname, char *namebuf)
+static int kallsyms_lookup_buildid(unsigned long addr,
+ unsigned long *symbolsize,
+ unsigned long *offset, char **modname,
+ const unsigned char **modbuildid, char *namebuf)
{
+ int ret;
+
namebuf[KSYM_NAME_LEN - 1] = 0;
namebuf[0] = 0;
@@ -306,12 +367,45 @@ const char *kallsyms_lookup(unsigned long addr,
namebuf, KSYM_NAME_LEN);
if (modname)
*modname = NULL;
- return namebuf;
+ if (modbuildid)
+ *modbuildid = NULL;
+
+ return strlen(namebuf);
}
- /* See if it's in a module. */
- return module_address_lookup(addr, symbolsize, offset, modname,
- namebuf);
+ /* See if it's in a module or a BPF JITed image. */
+ ret = module_address_lookup(addr, symbolsize, offset,
+ modname, modbuildid, namebuf);
+ if (!ret)
+ ret = bpf_address_lookup(addr, symbolsize,
+ offset, modname, namebuf);
+
+ if (!ret)
+ ret = ftrace_mod_address_lookup(addr, symbolsize,
+ offset, modname, namebuf);
+
+ return ret;
+}
+
+/*
+ * Lookup an address
+ * - modname is set to NULL if it's in the kernel.
+ * - We guarantee that the returned name is valid until we reschedule even if.
+ * It resides in a module.
+ * - We also guarantee that modname will be valid until rescheduled.
+ */
+const char *kallsyms_lookup(unsigned long addr,
+ unsigned long *symbolsize,
+ unsigned long *offset,
+ char **modname, char *namebuf)
+{
+ int ret = kallsyms_lookup_buildid(addr, symbolsize, offset, modname,
+ NULL, namebuf);
+
+ if (!ret)
+ return NULL;
+
+ return namebuf;
}
int lookup_symbol_name(unsigned long addr, char *symname)
@@ -332,50 +426,39 @@ int lookup_symbol_name(unsigned long addr, char *symname)
return lookup_module_symbol_name(addr, symname);
}
-int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
- unsigned long *offset, char *modname, char *name)
-{
- name[0] = '\0';
- name[KSYM_NAME_LEN - 1] = '\0';
-
- if (is_ksym_addr(addr)) {
- unsigned long pos;
-
- pos = get_symbol_pos(addr, size, offset);
- /* Grab name */
- kallsyms_expand_symbol(get_symbol_offset(pos),
- name, KSYM_NAME_LEN);
- modname[0] = '\0';
- return 0;
- }
- /* See if it's in a module. */
- return lookup_module_symbol_attrs(addr, size, offset, modname, name);
-}
-
/* Look up a kernel symbol and return it in a text buffer. */
static int __sprint_symbol(char *buffer, unsigned long address,
- int symbol_offset, int add_offset)
+ int symbol_offset, int add_offset, int add_buildid)
{
char *modname;
- const char *name;
+ const unsigned char *buildid;
unsigned long offset, size;
int len;
address += symbol_offset;
- name = kallsyms_lookup(address, &size, &offset, &modname, buffer);
- if (!name)
+ len = kallsyms_lookup_buildid(address, &size, &offset, &modname, &buildid,
+ buffer);
+ if (!len)
return sprintf(buffer, "0x%lx", address - symbol_offset);
- if (name != buffer)
- strcpy(buffer, name);
- len = strlen(buffer);
offset -= symbol_offset;
if (add_offset)
len += sprintf(buffer + len, "+%#lx/%#lx", offset, size);
- if (modname)
- len += sprintf(buffer + len, " [%s]", modname);
+ if (modname) {
+ len += sprintf(buffer + len, " [%s", modname);
+#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID)
+ if (add_buildid && buildid) {
+ /* build ID should match length of sprintf */
+#if IS_ENABLED(CONFIG_MODULES)
+ static_assert(sizeof(typeof_member(struct module, build_id)) == 20);
+#endif
+ len += sprintf(buffer + len, " %20phN", buildid);
+ }
+#endif
+ len += sprintf(buffer + len, "]");
+ }
return len;
}
@@ -393,11 +476,28 @@ static int __sprint_symbol(char *buffer, unsigned long address,
*/
int sprint_symbol(char *buffer, unsigned long address)
{
- return __sprint_symbol(buffer, address, 0, 1);
+ return __sprint_symbol(buffer, address, 0, 1, 0);
}
EXPORT_SYMBOL_GPL(sprint_symbol);
/**
+ * sprint_symbol_build_id - Look up a kernel symbol and return it in a text buffer
+ * @buffer: buffer to be stored
+ * @address: address to lookup
+ *
+ * This function looks up a kernel symbol with @address and stores its name,
+ * offset, size, module name and module build ID to @buffer if possible. If no
+ * symbol was found, just saves its @address as is.
+ *
+ * This function returns the number of bytes stored in @buffer.
+ */
+int sprint_symbol_build_id(char *buffer, unsigned long address)
+{
+ return __sprint_symbol(buffer, address, 0, 1, 1);
+}
+EXPORT_SYMBOL_GPL(sprint_symbol_build_id);
+
+/**
* sprint_symbol_no_offset - Look up a kernel symbol and return it in a text buffer
* @buffer: buffer to be stored
* @address: address to lookup
@@ -410,7 +510,7 @@ EXPORT_SYMBOL_GPL(sprint_symbol);
*/
int sprint_symbol_no_offset(char *buffer, unsigned long address)
{
- return __sprint_symbol(buffer, address, 0, 0);
+ return __sprint_symbol(buffer, address, 0, 0, 0);
}
EXPORT_SYMBOL_GPL(sprint_symbol_no_offset);
@@ -430,47 +530,115 @@ EXPORT_SYMBOL_GPL(sprint_symbol_no_offset);
*/
int sprint_backtrace(char *buffer, unsigned long address)
{
- return __sprint_symbol(buffer, address, -1, 1);
+ return __sprint_symbol(buffer, address, -1, 1, 0);
}
-/* Look up a kernel symbol and print it to the kernel messages. */
-void __print_symbol(const char *fmt, unsigned long address)
+/**
+ * sprint_backtrace_build_id - Look up a backtrace symbol and return it in a text buffer
+ * @buffer: buffer to be stored
+ * @address: address to lookup
+ *
+ * This function is for stack backtrace and does the same thing as
+ * sprint_symbol() but with modified/decreased @address. If there is a
+ * tail-call to the function marked "noreturn", gcc optimized out code after
+ * the call so that the stack-saved return address could point outside of the
+ * caller. This function ensures that kallsyms will find the original caller
+ * by decreasing @address. This function also appends the module build ID to
+ * the @buffer if @address is within a kernel module.
+ *
+ * This function returns the number of bytes stored in @buffer.
+ */
+int sprint_backtrace_build_id(char *buffer, unsigned long address)
{
- char buffer[KSYM_SYMBOL_LEN];
-
- sprint_symbol(buffer, address);
-
- printk(fmt, buffer);
+ return __sprint_symbol(buffer, address, -1, 1, 1);
}
-EXPORT_SYMBOL(__print_symbol);
/* To avoid using get_symbol_offset for every symbol, we carry prefix along. */
struct kallsym_iter {
loff_t pos;
+ loff_t pos_mod_end;
+ loff_t pos_ftrace_mod_end;
+ loff_t pos_bpf_end;
unsigned long value;
unsigned int nameoff; /* If iterating in core kernel symbols. */
char type;
char name[KSYM_NAME_LEN];
char module_name[MODULE_NAME_LEN];
int exported;
+ int show_value;
};
static int get_ksymbol_mod(struct kallsym_iter *iter)
{
- if (module_get_kallsym(iter->pos - kallsyms_num_syms, &iter->value,
- &iter->type, iter->name, iter->module_name,
- &iter->exported) < 0)
+ int ret = module_get_kallsym(iter->pos - kallsyms_num_syms,
+ &iter->value, &iter->type,
+ iter->name, iter->module_name,
+ &iter->exported);
+ if (ret < 0) {
+ iter->pos_mod_end = iter->pos;
return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * ftrace_mod_get_kallsym() may also get symbols for pages allocated for ftrace
+ * purposes. In that case "__builtin__ftrace" is used as a module name, even
+ * though "__builtin__ftrace" is not a module.
+ */
+static int get_ksymbol_ftrace_mod(struct kallsym_iter *iter)
+{
+ int ret = ftrace_mod_get_kallsym(iter->pos - iter->pos_mod_end,
+ &iter->value, &iter->type,
+ iter->name, iter->module_name,
+ &iter->exported);
+ if (ret < 0) {
+ iter->pos_ftrace_mod_end = iter->pos;
+ return 0;
+ }
+
+ return 1;
+}
+
+static int get_ksymbol_bpf(struct kallsym_iter *iter)
+{
+ int ret;
+
+ strscpy(iter->module_name, "bpf", MODULE_NAME_LEN);
+ iter->exported = 0;
+ ret = bpf_get_kallsym(iter->pos - iter->pos_ftrace_mod_end,
+ &iter->value, &iter->type,
+ iter->name);
+ if (ret < 0) {
+ iter->pos_bpf_end = iter->pos;
+ return 0;
+ }
+
return 1;
}
+/*
+ * This uses "__builtin__kprobes" as a module name for symbols for pages
+ * allocated for kprobes' purposes, even though "__builtin__kprobes" is not a
+ * module.
+ */
+static int get_ksymbol_kprobe(struct kallsym_iter *iter)
+{
+ strscpy(iter->module_name, "__builtin__kprobes", MODULE_NAME_LEN);
+ iter->exported = 0;
+ return kprobe_get_kallsym(iter->pos - iter->pos_bpf_end,
+ &iter->value, &iter->type,
+ iter->name) < 0 ? 0 : 1;
+}
+
/* Returns space to next name. */
static unsigned long get_ksymbol_core(struct kallsym_iter *iter)
{
unsigned off = iter->nameoff;
iter->module_name[0] = '\0';
- iter->value = kallsyms_addresses[iter->pos];
+ iter->value = kallsyms_sym_address(iter->pos);
iter->type = kallsyms_get_symbol_type(off);
@@ -484,16 +652,43 @@ static void reset_iter(struct kallsym_iter *iter, loff_t new_pos)
iter->name[0] = '\0';
iter->nameoff = get_symbol_offset(new_pos);
iter->pos = new_pos;
+ if (new_pos == 0) {
+ iter->pos_mod_end = 0;
+ iter->pos_ftrace_mod_end = 0;
+ iter->pos_bpf_end = 0;
+ }
+}
+
+/*
+ * The end position (last + 1) of each additional kallsyms section is recorded
+ * in iter->pos_..._end as each section is added, and so can be used to
+ * determine which get_ksymbol_...() function to call next.
+ */
+static int update_iter_mod(struct kallsym_iter *iter, loff_t pos)
+{
+ iter->pos = pos;
+
+ if ((!iter->pos_mod_end || iter->pos_mod_end > pos) &&
+ get_ksymbol_mod(iter))
+ return 1;
+
+ if ((!iter->pos_ftrace_mod_end || iter->pos_ftrace_mod_end > pos) &&
+ get_ksymbol_ftrace_mod(iter))
+ return 1;
+
+ if ((!iter->pos_bpf_end || iter->pos_bpf_end > pos) &&
+ get_ksymbol_bpf(iter))
+ return 1;
+
+ return get_ksymbol_kprobe(iter);
}
/* Returns false if pos at or past end of file. */
static int update_iter(struct kallsym_iter *iter, loff_t pos)
{
/* Module symbols can be accessed randomly. */
- if (pos >= kallsyms_num_syms) {
- iter->pos = pos;
- return get_ksymbol_mod(iter);
- }
+ if (pos >= kallsyms_num_syms)
+ return update_iter_mod(iter, pos);
/* If we're not on the desired position, reset to new position. */
if (pos != iter->pos)
@@ -527,12 +722,15 @@ static void s_stop(struct seq_file *m, void *p)
static int s_show(struct seq_file *m, void *p)
{
+ void *value;
struct kallsym_iter *iter = m->private;
/* Some debugging symbols have no name. Ignore them. */
if (!iter->name[0])
return 0;
+ value = iter->show_value ? (void *)iter->value : NULL;
+
if (iter->module_name[0]) {
char type;
@@ -542,10 +740,10 @@ static int s_show(struct seq_file *m, void *p)
*/
type = iter->exported ? toupper(iter->type) :
tolower(iter->type);
- seq_printf(m, "%pK %c %s\t[%s]\n", (void *)iter->value,
+ seq_printf(m, "%px %c %s\t[%s]\n", value,
type, iter->name, iter->module_name);
} else
- seq_printf(m, "%pK %c %s\n", (void *)iter->value,
+ seq_printf(m, "%px %c %s\n", value,
iter->type, iter->name);
return 0;
}
@@ -557,6 +755,95 @@ static const struct seq_operations kallsyms_op = {
.show = s_show
};
+#ifdef CONFIG_BPF_SYSCALL
+
+struct bpf_iter__ksym {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct kallsym_iter *, ksym);
+};
+
+static int ksym_prog_seq_show(struct seq_file *m, bool in_stop)
+{
+ struct bpf_iter__ksym ctx;
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+
+ meta.seq = m;
+ prog = bpf_iter_get_info(&meta, in_stop);
+ if (!prog)
+ return 0;
+
+ ctx.meta = &meta;
+ ctx.ksym = m ? m->private : NULL;
+ return bpf_iter_run_prog(prog, &ctx);
+}
+
+static int bpf_iter_ksym_seq_show(struct seq_file *m, void *p)
+{
+ return ksym_prog_seq_show(m, false);
+}
+
+static void bpf_iter_ksym_seq_stop(struct seq_file *m, void *p)
+{
+ if (!p)
+ (void) ksym_prog_seq_show(m, true);
+ else
+ s_stop(m, p);
+}
+
+static const struct seq_operations bpf_iter_ksym_ops = {
+ .start = s_start,
+ .next = s_next,
+ .stop = bpf_iter_ksym_seq_stop,
+ .show = bpf_iter_ksym_seq_show,
+};
+
+static int bpf_iter_ksym_init(void *priv_data, struct bpf_iter_aux_info *aux)
+{
+ struct kallsym_iter *iter = priv_data;
+
+ reset_iter(iter, 0);
+
+ /* cache here as in kallsyms_open() case; use current process
+ * credentials to tell BPF iterators if values should be shown.
+ */
+ iter->show_value = kallsyms_show_value(current_cred());
+
+ return 0;
+}
+
+DEFINE_BPF_ITER_FUNC(ksym, struct bpf_iter_meta *meta, struct kallsym_iter *ksym)
+
+static const struct bpf_iter_seq_info ksym_iter_seq_info = {
+ .seq_ops = &bpf_iter_ksym_ops,
+ .init_seq_private = bpf_iter_ksym_init,
+ .fini_seq_private = NULL,
+ .seq_priv_size = sizeof(struct kallsym_iter),
+};
+
+static struct bpf_iter_reg ksym_iter_reg_info = {
+ .target = "ksym",
+ .feature = BPF_ITER_RESCHED,
+ .ctx_arg_info_size = 1,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__ksym, ksym),
+ PTR_TO_BTF_ID_OR_NULL },
+ },
+ .seq_info = &ksym_iter_seq_info,
+};
+
+BTF_ID_LIST_SINGLE(btf_ksym_iter_id, struct, kallsym_iter)
+
+static int __init bpf_ksym_iter_register(void)
+{
+ ksym_iter_reg_info.ctx_arg_info[0].btf_id = *btf_ksym_iter_id;
+ return bpf_iter_reg_target(&ksym_iter_reg_info);
+}
+
+late_initcall(bpf_ksym_iter_register);
+
+#endif /* CONFIG_BPF_SYSCALL */
+
static int kallsyms_open(struct inode *inode, struct file *file)
{
/*
@@ -570,6 +857,11 @@ static int kallsyms_open(struct inode *inode, struct file *file)
return -ENOMEM;
reset_iter(iter, 0);
+ /*
+ * Instead of checking this on every s_show() call, cache
+ * the result here at open time.
+ */
+ iter->show_value = kallsyms_show_value(file->f_cred);
return 0;
}
@@ -593,16 +885,16 @@ const char *kdb_walk_kallsyms(loff_t *pos)
}
#endif /* CONFIG_KGDB_KDB */
-static const struct file_operations kallsyms_operations = {
- .open = kallsyms_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release_private,
+static const struct proc_ops kallsyms_proc_ops = {
+ .proc_open = kallsyms_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = seq_release_private,
};
static int __init kallsyms_init(void)
{
- proc_create("kallsyms", 0444, NULL, &kallsyms_operations);
+ proc_create("kallsyms", 0444, NULL, &kallsyms_proc_ops);
return 0;
}
device_initcall(kallsyms_init);
diff --git a/kernel/kallsyms_internal.h b/kernel/kallsyms_internal.h
new file mode 100644
index 000000000000..9633782f8250
--- /dev/null
+++ b/kernel/kallsyms_internal.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef LINUX_KALLSYMS_INTERNAL_H_
+#define LINUX_KALLSYMS_INTERNAL_H_
+
+#include <linux/types.h>
+
+extern const int kallsyms_offsets[];
+extern const u8 kallsyms_names[];
+
+extern const unsigned int kallsyms_num_syms;
+extern const unsigned long kallsyms_relative_base;
+
+extern const char kallsyms_token_table[];
+extern const u16 kallsyms_token_index[];
+
+extern const unsigned int kallsyms_markers[];
+extern const u8 kallsyms_seqs_of_names[];
+
+#endif // LINUX_KALLSYMS_INTERNAL_H_
diff --git a/kernel/kallsyms_selftest.c b/kernel/kallsyms_selftest.c
new file mode 100644
index 000000000000..2b082a7e24a2
--- /dev/null
+++ b/kernel/kallsyms_selftest.c
@@ -0,0 +1,446 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Test the function and performance of kallsyms
+ *
+ * Copyright (C) Huawei Technologies Co., Ltd., 2022
+ *
+ * Authors: Zhen Lei <thunder.leizhen@huawei.com> Huawei
+ */
+
+#define pr_fmt(fmt) "kallsyms_selftest: " fmt
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/random.h>
+#include <linux/sched/clock.h>
+#include <linux/kthread.h>
+#include <linux/vmalloc.h>
+
+#include "kallsyms_internal.h"
+#include "kallsyms_selftest.h"
+
+
+#define MAX_NUM_OF_RECORDS 64
+
+struct test_stat {
+ int min;
+ int max;
+ int save_cnt;
+ int real_cnt;
+ int perf;
+ u64 sum;
+ char *name;
+ unsigned long addr;
+ unsigned long addrs[MAX_NUM_OF_RECORDS];
+};
+
+struct test_item {
+ char *name;
+ unsigned long addr;
+};
+
+#define ITEM_FUNC(s) \
+ { \
+ .name = #s, \
+ .addr = (unsigned long)s, \
+ }
+
+#define ITEM_DATA(s) \
+ { \
+ .name = #s, \
+ .addr = (unsigned long)&s, \
+ }
+
+
+static int kallsyms_test_var_bss_static;
+static int kallsyms_test_var_data_static = 1;
+int kallsyms_test_var_bss;
+int kallsyms_test_var_data = 1;
+
+static int kallsyms_test_func_static(void)
+{
+ kallsyms_test_var_bss_static++;
+ kallsyms_test_var_data_static++;
+
+ return 0;
+}
+
+int kallsyms_test_func(void)
+{
+ return kallsyms_test_func_static();
+}
+
+__weak int kallsyms_test_func_weak(void)
+{
+ kallsyms_test_var_bss++;
+ kallsyms_test_var_data++;
+ return 0;
+}
+
+static struct test_item test_items[] = {
+ ITEM_FUNC(kallsyms_test_func_static),
+ ITEM_FUNC(kallsyms_test_func),
+ ITEM_FUNC(kallsyms_test_func_weak),
+ ITEM_FUNC(vmalloc_noprof),
+ ITEM_FUNC(vfree),
+#ifdef CONFIG_KALLSYMS_ALL
+ ITEM_DATA(kallsyms_test_var_bss_static),
+ ITEM_DATA(kallsyms_test_var_data_static),
+ ITEM_DATA(kallsyms_test_var_bss),
+ ITEM_DATA(kallsyms_test_var_data),
+#endif
+};
+
+static char stub_name[KSYM_NAME_LEN];
+
+static int stat_symbol_len(void *data, const char *name, unsigned long addr)
+{
+ *(u32 *)data += strlen(name);
+
+ return 0;
+}
+
+static void test_kallsyms_compression_ratio(void)
+{
+ u32 pos, off, len, num;
+ u32 ratio, total_size, total_len = 0;
+
+ kallsyms_on_each_symbol(stat_symbol_len, &total_len);
+
+ /*
+ * A symbol name cannot start with a number. This stub name helps us
+ * traverse the entire symbol table without finding a match. It's used
+ * for subsequent performance tests, and its length is the average
+ * length of all symbol names.
+ */
+ memset(stub_name, '4', sizeof(stub_name));
+ pos = total_len / kallsyms_num_syms;
+ stub_name[pos] = 0;
+
+ pos = 0;
+ num = 0;
+ off = 0;
+ while (pos < kallsyms_num_syms) {
+ len = kallsyms_names[off];
+ num++;
+ off++;
+ pos++;
+ if ((len & 0x80) != 0) {
+ len = (len & 0x7f) | (kallsyms_names[off] << 7);
+ num++;
+ off++;
+ }
+ off += len;
+ }
+
+ /*
+ * 1. The length fields is not counted
+ * 2. The memory occupied by array kallsyms_token_table[] and
+ * kallsyms_token_index[] needs to be counted.
+ */
+ total_size = off - num;
+ pos = kallsyms_token_index[0xff];
+ total_size += pos + strlen(&kallsyms_token_table[pos]) + 1;
+ total_size += 0x100 * sizeof(u16);
+
+ pr_info(" ---------------------------------------------------------\n");
+ pr_info("| nr_symbols | compressed size | original size | ratio(%%) |\n");
+ pr_info("|---------------------------------------------------------|\n");
+ ratio = (u32)div_u64(10000ULL * total_size, total_len);
+ pr_info("| %10d | %10d | %10d | %2d.%-2d |\n",
+ kallsyms_num_syms, total_size, total_len, ratio / 100, ratio % 100);
+ pr_info(" ---------------------------------------------------------\n");
+}
+
+static int lookup_name(void *data, const char *name, unsigned long addr)
+{
+ u64 t0, t1, t;
+ struct test_stat *stat = (struct test_stat *)data;
+
+ t0 = ktime_get_ns();
+ (void)kallsyms_lookup_name(name);
+ t1 = ktime_get_ns();
+
+ t = t1 - t0;
+ if (t < stat->min)
+ stat->min = t;
+
+ if (t > stat->max)
+ stat->max = t;
+
+ stat->real_cnt++;
+ stat->sum += t;
+
+ return 0;
+}
+
+static void test_perf_kallsyms_lookup_name(void)
+{
+ struct test_stat stat;
+
+ memset(&stat, 0, sizeof(stat));
+ stat.min = INT_MAX;
+ kallsyms_on_each_symbol(lookup_name, &stat);
+ pr_info("kallsyms_lookup_name() looked up %d symbols\n", stat.real_cnt);
+ pr_info("The time spent on each symbol is (ns): min=%d, max=%d, avg=%lld\n",
+ stat.min, stat.max, div_u64(stat.sum, stat.real_cnt));
+}
+
+static int find_symbol(void *data, const char *name, unsigned long addr)
+{
+ struct test_stat *stat = (struct test_stat *)data;
+
+ if (!strcmp(name, stat->name)) {
+ stat->real_cnt++;
+ stat->addr = addr;
+
+ if (stat->save_cnt < MAX_NUM_OF_RECORDS) {
+ stat->addrs[stat->save_cnt] = addr;
+ stat->save_cnt++;
+ }
+
+ if (stat->real_cnt == stat->max)
+ return 1;
+ }
+
+ return 0;
+}
+
+static void test_perf_kallsyms_on_each_symbol(void)
+{
+ u64 t0, t1;
+ struct test_stat stat;
+
+ memset(&stat, 0, sizeof(stat));
+ stat.max = INT_MAX;
+ stat.name = stub_name;
+ stat.perf = 1;
+ t0 = ktime_get_ns();
+ kallsyms_on_each_symbol(find_symbol, &stat);
+ t1 = ktime_get_ns();
+ pr_info("kallsyms_on_each_symbol() traverse all: %lld ns\n", t1 - t0);
+}
+
+static int match_symbol(void *data, unsigned long addr)
+{
+ struct test_stat *stat = (struct test_stat *)data;
+
+ stat->real_cnt++;
+ stat->addr = addr;
+
+ if (stat->save_cnt < MAX_NUM_OF_RECORDS) {
+ stat->addrs[stat->save_cnt] = addr;
+ stat->save_cnt++;
+ }
+
+ if (stat->real_cnt == stat->max)
+ return 1;
+
+ return 0;
+}
+
+static void test_perf_kallsyms_on_each_match_symbol(void)
+{
+ u64 t0, t1;
+ struct test_stat stat;
+
+ memset(&stat, 0, sizeof(stat));
+ stat.max = INT_MAX;
+ stat.name = stub_name;
+ t0 = ktime_get_ns();
+ kallsyms_on_each_match_symbol(match_symbol, stat.name, &stat);
+ t1 = ktime_get_ns();
+ pr_info("kallsyms_on_each_match_symbol() traverse all: %lld ns\n", t1 - t0);
+}
+
+static int test_kallsyms_basic_function(void)
+{
+ int i, j, ret;
+ int next = 0, nr_failed = 0;
+ char *prefix;
+ unsigned short rand;
+ unsigned long addr, lookup_addr;
+ char namebuf[KSYM_NAME_LEN];
+ struct test_stat *stat, *stat2;
+
+ stat = kmalloc_array(2, sizeof(*stat), GFP_KERNEL);
+ if (!stat)
+ return -ENOMEM;
+ stat2 = stat + 1;
+
+ prefix = "kallsyms_lookup_name() for";
+ for (i = 0; i < ARRAY_SIZE(test_items); i++) {
+ addr = kallsyms_lookup_name(test_items[i].name);
+ if (addr != test_items[i].addr) {
+ nr_failed++;
+ pr_info("%s %s failed: addr=%lx, expect %lx\n",
+ prefix, test_items[i].name, addr, test_items[i].addr);
+ }
+ }
+
+ prefix = "kallsyms_on_each_symbol() for";
+ for (i = 0; i < ARRAY_SIZE(test_items); i++) {
+ memset(stat, 0, sizeof(*stat));
+ stat->max = INT_MAX;
+ stat->name = test_items[i].name;
+ kallsyms_on_each_symbol(find_symbol, stat);
+ if (stat->addr != test_items[i].addr || stat->real_cnt != 1) {
+ nr_failed++;
+ pr_info("%s %s failed: count=%d, addr=%lx, expect %lx\n",
+ prefix, test_items[i].name,
+ stat->real_cnt, stat->addr, test_items[i].addr);
+ }
+ }
+
+ prefix = "kallsyms_on_each_match_symbol() for";
+ for (i = 0; i < ARRAY_SIZE(test_items); i++) {
+ memset(stat, 0, sizeof(*stat));
+ stat->max = INT_MAX;
+ stat->name = test_items[i].name;
+ kallsyms_on_each_match_symbol(match_symbol, test_items[i].name, stat);
+ if (stat->addr != test_items[i].addr || stat->real_cnt != 1) {
+ nr_failed++;
+ pr_info("%s %s failed: count=%d, addr=%lx, expect %lx\n",
+ prefix, test_items[i].name,
+ stat->real_cnt, stat->addr, test_items[i].addr);
+ }
+ }
+
+ if (nr_failed) {
+ kfree(stat);
+ return -ESRCH;
+ }
+
+ for (i = 0; i < kallsyms_num_syms; i++) {
+ addr = kallsyms_sym_address(i);
+ if (!is_ksym_addr(addr))
+ continue;
+
+ ret = lookup_symbol_name(addr, namebuf);
+ if (unlikely(ret)) {
+ namebuf[0] = 0;
+ pr_info("%d: lookup_symbol_name(%lx) failed\n", i, addr);
+ goto failed;
+ }
+
+ lookup_addr = kallsyms_lookup_name(namebuf);
+
+ memset(stat, 0, sizeof(*stat));
+ stat->max = INT_MAX;
+ kallsyms_on_each_match_symbol(match_symbol, namebuf, stat);
+
+ /*
+ * kallsyms_on_each_symbol() is too slow, randomly select some
+ * symbols for test.
+ */
+ if (i >= next) {
+ memset(stat2, 0, sizeof(*stat2));
+ stat2->max = INT_MAX;
+ stat2->name = namebuf;
+ kallsyms_on_each_symbol(find_symbol, stat2);
+
+ /*
+ * kallsyms_on_each_symbol() and kallsyms_on_each_match_symbol()
+ * need to get the same traversal result.
+ */
+ if (stat->addr != stat2->addr ||
+ stat->real_cnt != stat2->real_cnt ||
+ memcmp(stat->addrs, stat2->addrs,
+ stat->save_cnt * sizeof(stat->addrs[0]))) {
+ pr_info("%s: mismatch between kallsyms_on_each_symbol() and kallsyms_on_each_match_symbol()\n",
+ namebuf);
+ goto failed;
+ }
+
+ /*
+ * The average of random increments is 128, that is, one of
+ * them is tested every 128 symbols.
+ */
+ get_random_bytes(&rand, sizeof(rand));
+ next = i + (rand & 0xff) + 1;
+ }
+
+ /* Need to be found at least once */
+ if (!stat->real_cnt) {
+ pr_info("%s: Never found\n", namebuf);
+ goto failed;
+ }
+
+ /*
+ * kallsyms_lookup_name() returns the address of the first
+ * symbol found and cannot be NULL.
+ */
+ if (!lookup_addr) {
+ pr_info("%s: NULL lookup_addr?!\n", namebuf);
+ goto failed;
+ }
+ if (lookup_addr != stat->addrs[0]) {
+ pr_info("%s: lookup_addr != stat->addrs[0]\n", namebuf);
+ goto failed;
+ }
+
+ /*
+ * If the addresses of all matching symbols are recorded, the
+ * target address needs to be exist.
+ */
+ if (stat->real_cnt <= MAX_NUM_OF_RECORDS) {
+ for (j = 0; j < stat->save_cnt; j++) {
+ if (stat->addrs[j] == addr)
+ break;
+ }
+
+ if (j == stat->save_cnt) {
+ pr_info("%s: j == save_cnt?!\n", namebuf);
+ goto failed;
+ }
+ }
+ }
+
+ kfree(stat);
+
+ return 0;
+
+failed:
+ pr_info("Test for %dth symbol failed: (%s) addr=%lx", i, namebuf, addr);
+ kfree(stat);
+ return -ESRCH;
+}
+
+static int test_entry(void *p)
+{
+ int ret;
+
+ do {
+ schedule_timeout(5 * HZ);
+ } while (system_state != SYSTEM_RUNNING);
+
+ pr_info("start\n");
+ ret = test_kallsyms_basic_function();
+ if (ret) {
+ pr_info("abort\n");
+ return 0;
+ }
+
+ test_kallsyms_compression_ratio();
+ test_perf_kallsyms_lookup_name();
+ test_perf_kallsyms_on_each_symbol();
+ test_perf_kallsyms_on_each_match_symbol();
+ pr_info("finish\n");
+
+ return 0;
+}
+
+static int __init kallsyms_test_init(void)
+{
+ struct task_struct *t;
+
+ t = kthread_run_on_cpu(test_entry, NULL, 0, "kallsyms_test");
+ if (IS_ERR(t)) {
+ pr_info("Create kallsyms selftest task failed\n");
+ return PTR_ERR(t);
+ }
+
+ return 0;
+}
+late_initcall(kallsyms_test_init);
diff --git a/kernel/kallsyms_selftest.h b/kernel/kallsyms_selftest.h
new file mode 100644
index 000000000000..c0ca548e2a22
--- /dev/null
+++ b/kernel/kallsyms_selftest.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef LINUX_KALLSYMS_SELFTEST_H_
+#define LINUX_KALLSYMS_SELFTEST_H_
+
+#include <linux/types.h>
+
+extern int kallsyms_test_var_bss;
+extern int kallsyms_test_var_data;
+
+extern int kallsyms_test_func(void);
+extern int kallsyms_test_func_weak(void);
+
+#endif // LINUX_KALLSYMS_SELFTEST_H_
diff --git a/kernel/kcmp.c b/kernel/kcmp.c
index 0aa69ea1d8fd..7c1a65bd5f8d 100644
--- a/kernel/kcmp.c
+++ b/kernel/kcmp.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/syscalls.h>
#include <linux/fdtable.h>
@@ -11,6 +12,10 @@
#include <linux/bug.h>
#include <linux/err.h>
#include <linux/kcmp.h>
+#include <linux/capability.h>
+#include <linux/list.h>
+#include <linux/eventpoll.h>
+#include <linux/file.h>
#include <asm/unistd.h>
@@ -56,44 +61,77 @@ static int kcmp_ptr(void *v1, void *v2, enum kcmp_type type)
static struct file *
get_file_raw_ptr(struct task_struct *task, unsigned int idx)
{
- struct file *file = NULL;
+ struct file *file;
- task_lock(task);
- rcu_read_lock();
-
- if (task->files)
- file = fcheck_files(task->files, idx);
-
- rcu_read_unlock();
- task_unlock(task);
+ file = fget_task(task, idx);
+ if (file)
+ fput(file);
return file;
}
-static void kcmp_unlock(struct mutex *m1, struct mutex *m2)
+static void kcmp_unlock(struct rw_semaphore *l1, struct rw_semaphore *l2)
{
- if (likely(m2 != m1))
- mutex_unlock(m2);
- mutex_unlock(m1);
+ if (likely(l2 != l1))
+ up_read(l2);
+ up_read(l1);
}
-static int kcmp_lock(struct mutex *m1, struct mutex *m2)
+static int kcmp_lock(struct rw_semaphore *l1, struct rw_semaphore *l2)
{
int err;
- if (m2 > m1)
- swap(m1, m2);
+ if (l2 > l1)
+ swap(l1, l2);
- err = mutex_lock_killable(m1);
- if (!err && likely(m1 != m2)) {
- err = mutex_lock_killable_nested(m2, SINGLE_DEPTH_NESTING);
+ err = down_read_killable(l1);
+ if (!err && likely(l1 != l2)) {
+ err = down_read_killable_nested(l2, SINGLE_DEPTH_NESTING);
if (err)
- mutex_unlock(m1);
+ up_read(l1);
}
return err;
}
+#ifdef CONFIG_EPOLL
+static int kcmp_epoll_target(struct task_struct *task1,
+ struct task_struct *task2,
+ unsigned long idx1,
+ struct kcmp_epoll_slot __user *uslot)
+{
+ struct file *filp, *filp_epoll, *filp_tgt;
+ struct kcmp_epoll_slot slot;
+
+ if (copy_from_user(&slot, uslot, sizeof(slot)))
+ return -EFAULT;
+
+ filp = get_file_raw_ptr(task1, idx1);
+ if (!filp)
+ return -EBADF;
+
+ filp_epoll = fget_task(task2, slot.efd);
+ if (!filp_epoll)
+ return -EBADF;
+
+ filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff);
+ fput(filp_epoll);
+
+ if (IS_ERR(filp_tgt))
+ return PTR_ERR(filp_tgt);
+
+ return kcmp_ptr(filp, filp_tgt, KCMP_FILE);
+}
+#else
+static int kcmp_epoll_target(struct task_struct *task1,
+ struct task_struct *task2,
+ unsigned long idx1,
+ struct kcmp_epoll_slot __user *uslot)
+{
+ return -EOPNOTSUPP;
+}
+#endif
+
SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type,
unsigned long, idx1, unsigned long, idx2)
{
@@ -107,7 +145,7 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type,
*/
task1 = find_task_by_vpid(pid1);
task2 = find_task_by_vpid(pid2);
- if (!task1 || !task2)
+ if (unlikely(!task1 || !task2))
goto err_no_task;
get_task_struct(task1);
@@ -118,12 +156,12 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type,
/*
* One should have enough rights to inspect task details.
*/
- ret = kcmp_lock(&task1->signal->cred_guard_mutex,
- &task2->signal->cred_guard_mutex);
+ ret = kcmp_lock(&task1->signal->exec_update_lock,
+ &task2->signal->exec_update_lock);
if (ret)
goto err;
- if (!ptrace_may_access(task1, PTRACE_MODE_READ) ||
- !ptrace_may_access(task2, PTRACE_MODE_READ)) {
+ if (!ptrace_may_access(task1, PTRACE_MODE_READ_REALCREDS) ||
+ !ptrace_may_access(task2, PTRACE_MODE_READ_REALCREDS)) {
ret = -EPERM;
goto err_unlock;
}
@@ -165,14 +203,17 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type,
ret = -EOPNOTSUPP;
#endif
break;
+ case KCMP_EPOLL_TFD:
+ ret = kcmp_epoll_target(task1, task2, idx1, (void *)idx2);
+ break;
default:
ret = -EINVAL;
break;
}
err_unlock:
- kcmp_unlock(&task1->signal->cred_guard_mutex,
- &task2->signal->cred_guard_mutex);
+ kcmp_unlock(&task1->signal->exec_update_lock,
+ &task2->signal->exec_update_lock);
err:
put_task_struct(task1);
put_task_struct(task2);
diff --git a/kernel/kcov.c b/kernel/kcov.c
new file mode 100644
index 000000000000..6563141f5de9
--- /dev/null
+++ b/kernel/kcov.c
@@ -0,0 +1,1132 @@
+// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) "kcov: " fmt
+
+#define DISABLE_BRANCH_PROFILING
+#include <linux/atomic.h>
+#include <linux/compiler.h>
+#include <linux/errno.h>
+#include <linux/export.h>
+#include <linux/types.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/hashtable.h>
+#include <linux/init.h>
+#include <linux/jiffies.h>
+#include <linux/kmsan-checks.h>
+#include <linux/mm.h>
+#include <linux/preempt.h>
+#include <linux/printk.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/vmalloc.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/kcov.h>
+#include <linux/refcount.h>
+#include <linux/log2.h>
+#include <asm/setup.h>
+
+#define kcov_debug(fmt, ...) pr_debug("%s: " fmt, __func__, ##__VA_ARGS__)
+
+/* Number of 64-bit words written per one comparison: */
+#define KCOV_WORDS_PER_CMP 4
+
+/*
+ * kcov descriptor (one per opened debugfs file).
+ * State transitions of the descriptor:
+ * - initial state after open()
+ * - then there must be a single ioctl(KCOV_INIT_TRACE) call
+ * - then, mmap() call (several calls are allowed but not useful)
+ * - then, ioctl(KCOV_ENABLE, arg), where arg is
+ * KCOV_TRACE_PC - to trace only the PCs
+ * or
+ * KCOV_TRACE_CMP - to trace only the comparison operands
+ * - then, ioctl(KCOV_DISABLE) to disable the task.
+ * Enabling/disabling ioctls can be repeated (only one task a time allowed).
+ */
+struct kcov {
+ /*
+ * Reference counter. We keep one for:
+ * - opened file descriptor
+ * - task with enabled coverage (we can't unwire it from another task)
+ * - each code section for remote coverage collection
+ */
+ refcount_t refcount;
+ /* The lock protects mode, size, area and t. */
+ spinlock_t lock;
+ enum kcov_mode mode;
+ /* Size of arena (in long's). */
+ unsigned int size;
+ /* Coverage buffer shared with user space. */
+ void *area;
+ /* Task for which we collect coverage, or NULL. */
+ struct task_struct *t;
+ /* Collecting coverage from remote (background) threads. */
+ bool remote;
+ /* Size of remote area (in long's). */
+ unsigned int remote_size;
+ /*
+ * Sequence is incremented each time kcov is reenabled, used by
+ * kcov_remote_stop(), see the comment there.
+ */
+ int sequence;
+};
+
+struct kcov_remote_area {
+ struct list_head list;
+ unsigned int size;
+};
+
+struct kcov_remote {
+ u64 handle;
+ struct kcov *kcov;
+ struct hlist_node hnode;
+};
+
+static DEFINE_SPINLOCK(kcov_remote_lock);
+static DEFINE_HASHTABLE(kcov_remote_map, 4);
+static struct list_head kcov_remote_areas = LIST_HEAD_INIT(kcov_remote_areas);
+
+struct kcov_percpu_data {
+ void *irq_area;
+ local_lock_t lock;
+
+ unsigned int saved_mode;
+ unsigned int saved_size;
+ void *saved_area;
+ struct kcov *saved_kcov;
+ int saved_sequence;
+};
+
+static DEFINE_PER_CPU(struct kcov_percpu_data, kcov_percpu_data) = {
+ .lock = INIT_LOCAL_LOCK(lock),
+};
+
+/* Must be called with kcov_remote_lock locked. */
+static struct kcov_remote *kcov_remote_find(u64 handle)
+{
+ struct kcov_remote *remote;
+
+ hash_for_each_possible(kcov_remote_map, remote, hnode, handle) {
+ if (remote->handle == handle)
+ return remote;
+ }
+ return NULL;
+}
+
+/* Must be called with kcov_remote_lock locked. */
+static struct kcov_remote *kcov_remote_add(struct kcov *kcov, u64 handle)
+{
+ struct kcov_remote *remote;
+
+ if (kcov_remote_find(handle))
+ return ERR_PTR(-EEXIST);
+ remote = kmalloc(sizeof(*remote), GFP_ATOMIC);
+ if (!remote)
+ return ERR_PTR(-ENOMEM);
+ remote->handle = handle;
+ remote->kcov = kcov;
+ hash_add(kcov_remote_map, &remote->hnode, handle);
+ return remote;
+}
+
+/* Must be called with kcov_remote_lock locked. */
+static struct kcov_remote_area *kcov_remote_area_get(unsigned int size)
+{
+ struct kcov_remote_area *area;
+ struct list_head *pos;
+
+ list_for_each(pos, &kcov_remote_areas) {
+ area = list_entry(pos, struct kcov_remote_area, list);
+ if (area->size == size) {
+ list_del(&area->list);
+ return area;
+ }
+ }
+ return NULL;
+}
+
+/* Must be called with kcov_remote_lock locked. */
+static void kcov_remote_area_put(struct kcov_remote_area *area,
+ unsigned int size)
+{
+ INIT_LIST_HEAD(&area->list);
+ area->size = size;
+ list_add(&area->list, &kcov_remote_areas);
+ /*
+ * KMSAN doesn't instrument this file, so it may not know area->list
+ * is initialized. Unpoison it explicitly to avoid reports in
+ * kcov_remote_area_get().
+ */
+ kmsan_unpoison_memory(&area->list, sizeof(area->list));
+}
+
+/*
+ * Unlike in_serving_softirq(), this function returns false when called during
+ * a hardirq or an NMI that happened in the softirq context.
+ */
+static __always_inline bool in_softirq_really(void)
+{
+ return in_serving_softirq() && !in_hardirq() && !in_nmi();
+}
+
+static notrace bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t)
+{
+ unsigned int mode;
+
+ /*
+ * We are interested in code coverage as a function of a syscall inputs,
+ * so we ignore code executed in interrupts, unless we are in a remote
+ * coverage collection section in a softirq.
+ */
+ if (!in_task() && !(in_softirq_really() && t->kcov_softirq))
+ return false;
+ mode = READ_ONCE(t->kcov_mode);
+ /*
+ * There is some code that runs in interrupts but for which
+ * in_interrupt() returns false (e.g. preempt_schedule_irq()).
+ * READ_ONCE()/barrier() effectively provides load-acquire wrt
+ * interrupts, there are paired barrier()/WRITE_ONCE() in
+ * kcov_start().
+ */
+ barrier();
+ return mode == needed_mode;
+}
+
+static notrace unsigned long canonicalize_ip(unsigned long ip)
+{
+#ifdef CONFIG_RANDOMIZE_BASE
+ ip -= kaslr_offset();
+#endif
+ return ip;
+}
+
+/*
+ * Entry point from instrumented code.
+ * This is called once per basic-block/edge.
+ */
+void notrace __sanitizer_cov_trace_pc(void)
+{
+ struct task_struct *t;
+ unsigned long *area;
+ unsigned long ip = canonicalize_ip(_RET_IP_);
+ unsigned long pos;
+
+ t = current;
+ if (!check_kcov_mode(KCOV_MODE_TRACE_PC, t))
+ return;
+
+ area = t->kcov_area;
+ /* The first 64-bit word is the number of subsequent PCs. */
+ pos = READ_ONCE(area[0]) + 1;
+ if (likely(pos < t->kcov_size)) {
+ /* Previously we write pc before updating pos. However, some
+ * early interrupt code could bypass check_kcov_mode() check
+ * and invoke __sanitizer_cov_trace_pc(). If such interrupt is
+ * raised between writing pc and updating pos, the pc could be
+ * overitten by the recursive __sanitizer_cov_trace_pc().
+ * Update pos before writing pc to avoid such interleaving.
+ */
+ WRITE_ONCE(area[0], pos);
+ barrier();
+ area[pos] = ip;
+ }
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_pc);
+
+#ifdef CONFIG_KCOV_ENABLE_COMPARISONS
+static void notrace write_comp_data(u64 type, u64 arg1, u64 arg2, u64 ip)
+{
+ struct task_struct *t;
+ u64 *area;
+ u64 count, start_index, end_pos, max_pos;
+
+ t = current;
+ if (!check_kcov_mode(KCOV_MODE_TRACE_CMP, t))
+ return;
+
+ ip = canonicalize_ip(ip);
+
+ /*
+ * We write all comparison arguments and types as u64.
+ * The buffer was allocated for t->kcov_size unsigned longs.
+ */
+ area = (u64 *)t->kcov_area;
+ max_pos = t->kcov_size * sizeof(unsigned long);
+
+ count = READ_ONCE(area[0]);
+
+ /* Every record is KCOV_WORDS_PER_CMP 64-bit words. */
+ start_index = 1 + count * KCOV_WORDS_PER_CMP;
+ end_pos = (start_index + KCOV_WORDS_PER_CMP) * sizeof(u64);
+ if (likely(end_pos <= max_pos)) {
+ /* See comment in __sanitizer_cov_trace_pc(). */
+ WRITE_ONCE(area[0], count + 1);
+ barrier();
+ area[start_index] = type;
+ area[start_index + 1] = arg1;
+ area[start_index + 2] = arg2;
+ area[start_index + 3] = ip;
+ }
+}
+
+void notrace __sanitizer_cov_trace_cmp1(u8 arg1, u8 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(0), arg1, arg2, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_cmp1);
+
+void notrace __sanitizer_cov_trace_cmp2(u16 arg1, u16 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(1), arg1, arg2, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_cmp2);
+
+void notrace __sanitizer_cov_trace_cmp4(u32 arg1, u32 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(2), arg1, arg2, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_cmp4);
+
+void notrace __sanitizer_cov_trace_cmp8(kcov_u64 arg1, kcov_u64 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(3), arg1, arg2, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_cmp8);
+
+void notrace __sanitizer_cov_trace_const_cmp1(u8 arg1, u8 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(0) | KCOV_CMP_CONST, arg1, arg2,
+ _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp1);
+
+void notrace __sanitizer_cov_trace_const_cmp2(u16 arg1, u16 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(1) | KCOV_CMP_CONST, arg1, arg2,
+ _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp2);
+
+void notrace __sanitizer_cov_trace_const_cmp4(u32 arg1, u32 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(2) | KCOV_CMP_CONST, arg1, arg2,
+ _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp4);
+
+void notrace __sanitizer_cov_trace_const_cmp8(kcov_u64 arg1, kcov_u64 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(3) | KCOV_CMP_CONST, arg1, arg2,
+ _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp8);
+
+void notrace __sanitizer_cov_trace_switch(kcov_u64 val, void *arg)
+{
+ u64 i;
+ u64 *cases = arg;
+ u64 count = cases[0];
+ u64 size = cases[1];
+ u64 type = KCOV_CMP_CONST;
+
+ switch (size) {
+ case 8:
+ type |= KCOV_CMP_SIZE(0);
+ break;
+ case 16:
+ type |= KCOV_CMP_SIZE(1);
+ break;
+ case 32:
+ type |= KCOV_CMP_SIZE(2);
+ break;
+ case 64:
+ type |= KCOV_CMP_SIZE(3);
+ break;
+ default:
+ return;
+ }
+ for (i = 0; i < count; i++)
+ write_comp_data(type, cases[i + 2], val, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_switch);
+#endif /* ifdef CONFIG_KCOV_ENABLE_COMPARISONS */
+
+static void kcov_start(struct task_struct *t, struct kcov *kcov,
+ unsigned int size, void *area, enum kcov_mode mode,
+ int sequence)
+{
+ kcov_debug("t = %px, size = %u, area = %px\n", t, size, area);
+ t->kcov = kcov;
+ /* Cache in task struct for performance. */
+ t->kcov_size = size;
+ t->kcov_area = area;
+ t->kcov_sequence = sequence;
+ /* See comment in check_kcov_mode(). */
+ barrier();
+ WRITE_ONCE(t->kcov_mode, mode);
+}
+
+static void kcov_stop(struct task_struct *t)
+{
+ WRITE_ONCE(t->kcov_mode, KCOV_MODE_DISABLED);
+ barrier();
+ t->kcov = NULL;
+ t->kcov_size = 0;
+ t->kcov_area = NULL;
+}
+
+static void kcov_task_reset(struct task_struct *t)
+{
+ kcov_stop(t);
+ t->kcov_sequence = 0;
+ t->kcov_handle = 0;
+}
+
+void kcov_task_init(struct task_struct *t)
+{
+ kcov_task_reset(t);
+ t->kcov_handle = current->kcov_handle;
+}
+
+static void kcov_reset(struct kcov *kcov)
+{
+ kcov->t = NULL;
+ kcov->mode = KCOV_MODE_INIT;
+ kcov->remote = false;
+ kcov->remote_size = 0;
+ kcov->sequence++;
+}
+
+static void kcov_remote_reset(struct kcov *kcov)
+{
+ int bkt;
+ struct kcov_remote *remote;
+ struct hlist_node *tmp;
+ unsigned long flags;
+
+ spin_lock_irqsave(&kcov_remote_lock, flags);
+ hash_for_each_safe(kcov_remote_map, bkt, tmp, remote, hnode) {
+ if (remote->kcov != kcov)
+ continue;
+ hash_del(&remote->hnode);
+ kfree(remote);
+ }
+ /* Do reset before unlock to prevent races with kcov_remote_start(). */
+ kcov_reset(kcov);
+ spin_unlock_irqrestore(&kcov_remote_lock, flags);
+}
+
+static void kcov_disable(struct task_struct *t, struct kcov *kcov)
+{
+ kcov_task_reset(t);
+ if (kcov->remote)
+ kcov_remote_reset(kcov);
+ else
+ kcov_reset(kcov);
+}
+
+static void kcov_get(struct kcov *kcov)
+{
+ refcount_inc(&kcov->refcount);
+}
+
+static void kcov_put(struct kcov *kcov)
+{
+ if (refcount_dec_and_test(&kcov->refcount)) {
+ kcov_remote_reset(kcov);
+ vfree(kcov->area);
+ kfree(kcov);
+ }
+}
+
+void kcov_task_exit(struct task_struct *t)
+{
+ struct kcov *kcov;
+ unsigned long flags;
+
+ kcov = t->kcov;
+ if (kcov == NULL)
+ return;
+
+ spin_lock_irqsave(&kcov->lock, flags);
+ kcov_debug("t = %px, kcov->t = %px\n", t, kcov->t);
+ /*
+ * For KCOV_ENABLE devices we want to make sure that t->kcov->t == t,
+ * which comes down to:
+ * WARN_ON(!kcov->remote && kcov->t != t);
+ *
+ * For KCOV_REMOTE_ENABLE devices, the exiting task is either:
+ *
+ * 1. A remote task between kcov_remote_start() and kcov_remote_stop().
+ * In this case we should print a warning right away, since a task
+ * shouldn't be exiting when it's in a kcov coverage collection
+ * section. Here t points to the task that is collecting remote
+ * coverage, and t->kcov->t points to the thread that created the
+ * kcov device. Which means that to detect this case we need to
+ * check that t != t->kcov->t, and this gives us the following:
+ * WARN_ON(kcov->remote && kcov->t != t);
+ *
+ * 2. The task that created kcov exiting without calling KCOV_DISABLE,
+ * and then again we make sure that t->kcov->t == t:
+ * WARN_ON(kcov->remote && kcov->t != t);
+ *
+ * By combining all three checks into one we get:
+ */
+ if (WARN_ON(kcov->t != t)) {
+ spin_unlock_irqrestore(&kcov->lock, flags);
+ return;
+ }
+ /* Just to not leave dangling references behind. */
+ kcov_disable(t, kcov);
+ spin_unlock_irqrestore(&kcov->lock, flags);
+ kcov_put(kcov);
+}
+
+static int kcov_mmap(struct file *filep, struct vm_area_struct *vma)
+{
+ int res = 0;
+ struct kcov *kcov = vma->vm_file->private_data;
+ unsigned long size, off;
+ struct page *page;
+ unsigned long flags;
+
+ spin_lock_irqsave(&kcov->lock, flags);
+ size = kcov->size * sizeof(unsigned long);
+ if (kcov->area == NULL || vma->vm_pgoff != 0 ||
+ vma->vm_end - vma->vm_start != size) {
+ res = -EINVAL;
+ goto exit;
+ }
+ spin_unlock_irqrestore(&kcov->lock, flags);
+ vm_flags_set(vma, VM_DONTEXPAND);
+ for (off = 0; off < size; off += PAGE_SIZE) {
+ page = vmalloc_to_page(kcov->area + off);
+ res = vm_insert_page(vma, vma->vm_start + off, page);
+ if (res) {
+ pr_warn_once("kcov: vm_insert_page() failed\n");
+ return res;
+ }
+ }
+ return 0;
+exit:
+ spin_unlock_irqrestore(&kcov->lock, flags);
+ return res;
+}
+
+static int kcov_open(struct inode *inode, struct file *filep)
+{
+ struct kcov *kcov;
+
+ kcov = kzalloc(sizeof(*kcov), GFP_KERNEL);
+ if (!kcov)
+ return -ENOMEM;
+ kcov->mode = KCOV_MODE_DISABLED;
+ kcov->sequence = 1;
+ refcount_set(&kcov->refcount, 1);
+ spin_lock_init(&kcov->lock);
+ filep->private_data = kcov;
+ return nonseekable_open(inode, filep);
+}
+
+static int kcov_close(struct inode *inode, struct file *filep)
+{
+ kcov_put(filep->private_data);
+ return 0;
+}
+
+static int kcov_get_mode(unsigned long arg)
+{
+ if (arg == KCOV_TRACE_PC)
+ return KCOV_MODE_TRACE_PC;
+ else if (arg == KCOV_TRACE_CMP)
+#ifdef CONFIG_KCOV_ENABLE_COMPARISONS
+ return KCOV_MODE_TRACE_CMP;
+#else
+ return -ENOTSUPP;
+#endif
+ else
+ return -EINVAL;
+}
+
+/*
+ * Fault in a lazily-faulted vmalloc area before it can be used by
+ * __sanitizer_cov_trace_pc(), to avoid recursion issues if any code on the
+ * vmalloc fault handling path is instrumented.
+ */
+static void kcov_fault_in_area(struct kcov *kcov)
+{
+ unsigned long stride = PAGE_SIZE / sizeof(unsigned long);
+ unsigned long *area = kcov->area;
+ unsigned long offset;
+
+ for (offset = 0; offset < kcov->size; offset += stride)
+ READ_ONCE(area[offset]);
+}
+
+static inline bool kcov_check_handle(u64 handle, bool common_valid,
+ bool uncommon_valid, bool zero_valid)
+{
+ if (handle & ~(KCOV_SUBSYSTEM_MASK | KCOV_INSTANCE_MASK))
+ return false;
+ switch (handle & KCOV_SUBSYSTEM_MASK) {
+ case KCOV_SUBSYSTEM_COMMON:
+ return (handle & KCOV_INSTANCE_MASK) ?
+ common_valid : zero_valid;
+ case KCOV_SUBSYSTEM_USB:
+ return uncommon_valid;
+ default:
+ return false;
+ }
+ return false;
+}
+
+static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
+ unsigned long arg)
+{
+ struct task_struct *t;
+ unsigned long flags, unused;
+ int mode, i;
+ struct kcov_remote_arg *remote_arg;
+ struct kcov_remote *remote;
+
+ switch (cmd) {
+ case KCOV_ENABLE:
+ /*
+ * Enable coverage for the current task.
+ * At this point user must have been enabled trace mode,
+ * and mmapped the file. Coverage collection is disabled only
+ * at task exit or voluntary by KCOV_DISABLE. After that it can
+ * be enabled for another task.
+ */
+ if (kcov->mode != KCOV_MODE_INIT || !kcov->area)
+ return -EINVAL;
+ t = current;
+ if (kcov->t != NULL || t->kcov != NULL)
+ return -EBUSY;
+ mode = kcov_get_mode(arg);
+ if (mode < 0)
+ return mode;
+ kcov_fault_in_area(kcov);
+ kcov->mode = mode;
+ kcov_start(t, kcov, kcov->size, kcov->area, kcov->mode,
+ kcov->sequence);
+ kcov->t = t;
+ /* Put either in kcov_task_exit() or in KCOV_DISABLE. */
+ kcov_get(kcov);
+ return 0;
+ case KCOV_DISABLE:
+ /* Disable coverage for the current task. */
+ unused = arg;
+ if (unused != 0 || current->kcov != kcov)
+ return -EINVAL;
+ t = current;
+ if (WARN_ON(kcov->t != t))
+ return -EINVAL;
+ kcov_disable(t, kcov);
+ kcov_put(kcov);
+ return 0;
+ case KCOV_REMOTE_ENABLE:
+ if (kcov->mode != KCOV_MODE_INIT || !kcov->area)
+ return -EINVAL;
+ t = current;
+ if (kcov->t != NULL || t->kcov != NULL)
+ return -EBUSY;
+ remote_arg = (struct kcov_remote_arg *)arg;
+ mode = kcov_get_mode(remote_arg->trace_mode);
+ if (mode < 0)
+ return mode;
+ if ((unsigned long)remote_arg->area_size >
+ LONG_MAX / sizeof(unsigned long))
+ return -EINVAL;
+ kcov->mode = mode;
+ t->kcov = kcov;
+ t->kcov_mode = KCOV_MODE_REMOTE;
+ kcov->t = t;
+ kcov->remote = true;
+ kcov->remote_size = remote_arg->area_size;
+ spin_lock_irqsave(&kcov_remote_lock, flags);
+ for (i = 0; i < remote_arg->num_handles; i++) {
+ if (!kcov_check_handle(remote_arg->handles[i],
+ false, true, false)) {
+ spin_unlock_irqrestore(&kcov_remote_lock,
+ flags);
+ kcov_disable(t, kcov);
+ return -EINVAL;
+ }
+ remote = kcov_remote_add(kcov, remote_arg->handles[i]);
+ if (IS_ERR(remote)) {
+ spin_unlock_irqrestore(&kcov_remote_lock,
+ flags);
+ kcov_disable(t, kcov);
+ return PTR_ERR(remote);
+ }
+ }
+ if (remote_arg->common_handle) {
+ if (!kcov_check_handle(remote_arg->common_handle,
+ true, false, false)) {
+ spin_unlock_irqrestore(&kcov_remote_lock,
+ flags);
+ kcov_disable(t, kcov);
+ return -EINVAL;
+ }
+ remote = kcov_remote_add(kcov,
+ remote_arg->common_handle);
+ if (IS_ERR(remote)) {
+ spin_unlock_irqrestore(&kcov_remote_lock,
+ flags);
+ kcov_disable(t, kcov);
+ return PTR_ERR(remote);
+ }
+ t->kcov_handle = remote_arg->common_handle;
+ }
+ spin_unlock_irqrestore(&kcov_remote_lock, flags);
+ /* Put either in kcov_task_exit() or in KCOV_DISABLE. */
+ kcov_get(kcov);
+ return 0;
+ default:
+ return -ENOTTY;
+ }
+}
+
+static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
+{
+ struct kcov *kcov;
+ int res;
+ struct kcov_remote_arg *remote_arg = NULL;
+ unsigned int remote_num_handles;
+ unsigned long remote_arg_size;
+ unsigned long size, flags;
+ void *area;
+
+ kcov = filep->private_data;
+ switch (cmd) {
+ case KCOV_INIT_TRACE:
+ /*
+ * Enable kcov in trace mode and setup buffer size.
+ * Must happen before anything else.
+ *
+ * First check the size argument - it must be at least 2
+ * to hold the current position and one PC.
+ */
+ size = arg;
+ if (size < 2 || size > INT_MAX / sizeof(unsigned long))
+ return -EINVAL;
+ area = vmalloc_user(size * sizeof(unsigned long));
+ if (area == NULL)
+ return -ENOMEM;
+ spin_lock_irqsave(&kcov->lock, flags);
+ if (kcov->mode != KCOV_MODE_DISABLED) {
+ spin_unlock_irqrestore(&kcov->lock, flags);
+ vfree(area);
+ return -EBUSY;
+ }
+ kcov->area = area;
+ kcov->size = size;
+ kcov->mode = KCOV_MODE_INIT;
+ spin_unlock_irqrestore(&kcov->lock, flags);
+ return 0;
+ case KCOV_REMOTE_ENABLE:
+ if (get_user(remote_num_handles, (unsigned __user *)(arg +
+ offsetof(struct kcov_remote_arg, num_handles))))
+ return -EFAULT;
+ if (remote_num_handles > KCOV_REMOTE_MAX_HANDLES)
+ return -EINVAL;
+ remote_arg_size = struct_size(remote_arg, handles,
+ remote_num_handles);
+ remote_arg = memdup_user((void __user *)arg, remote_arg_size);
+ if (IS_ERR(remote_arg))
+ return PTR_ERR(remote_arg);
+ if (remote_arg->num_handles != remote_num_handles) {
+ kfree(remote_arg);
+ return -EINVAL;
+ }
+ arg = (unsigned long)remote_arg;
+ fallthrough;
+ default:
+ /*
+ * All other commands can be normally executed under a spin lock, so we
+ * obtain and release it here in order to simplify kcov_ioctl_locked().
+ */
+ spin_lock_irqsave(&kcov->lock, flags);
+ res = kcov_ioctl_locked(kcov, cmd, arg);
+ spin_unlock_irqrestore(&kcov->lock, flags);
+ kfree(remote_arg);
+ return res;
+ }
+}
+
+static const struct file_operations kcov_fops = {
+ .open = kcov_open,
+ .unlocked_ioctl = kcov_ioctl,
+ .compat_ioctl = kcov_ioctl,
+ .mmap = kcov_mmap,
+ .release = kcov_close,
+};
+
+/*
+ * kcov_remote_start() and kcov_remote_stop() can be used to annotate a section
+ * of code in a kernel background thread or in a softirq to allow kcov to be
+ * used to collect coverage from that part of code.
+ *
+ * The handle argument of kcov_remote_start() identifies a code section that is
+ * used for coverage collection. A userspace process passes this handle to
+ * KCOV_REMOTE_ENABLE ioctl to make the used kcov device start collecting
+ * coverage for the code section identified by this handle.
+ *
+ * The usage of these annotations in the kernel code is different depending on
+ * the type of the kernel thread whose code is being annotated.
+ *
+ * For global kernel threads that are spawned in a limited number of instances
+ * (e.g. one USB hub_event() worker thread is spawned per USB HCD) and for
+ * softirqs, each instance must be assigned a unique 4-byte instance id. The
+ * instance id is then combined with a 1-byte subsystem id to get a handle via
+ * kcov_remote_handle(subsystem_id, instance_id).
+ *
+ * For local kernel threads that are spawned from system calls handler when a
+ * user interacts with some kernel interface (e.g. vhost workers), a handle is
+ * passed from a userspace process as the common_handle field of the
+ * kcov_remote_arg struct (note, that the user must generate a handle by using
+ * kcov_remote_handle() with KCOV_SUBSYSTEM_COMMON as the subsystem id and an
+ * arbitrary 4-byte non-zero number as the instance id). This common handle
+ * then gets saved into the task_struct of the process that issued the
+ * KCOV_REMOTE_ENABLE ioctl. When this process issues system calls that spawn
+ * kernel threads, the common handle must be retrieved via kcov_common_handle()
+ * and passed to the spawned threads via custom annotations. Those kernel
+ * threads must in turn be annotated with kcov_remote_start(common_handle) and
+ * kcov_remote_stop(). All of the threads that are spawned by the same process
+ * obtain the same handle, hence the name "common".
+ *
+ * See Documentation/dev-tools/kcov.rst for more details.
+ *
+ * Internally, kcov_remote_start() looks up the kcov device associated with the
+ * provided handle, allocates an area for coverage collection, and saves the
+ * pointers to kcov and area into the current task_struct to allow coverage to
+ * be collected via __sanitizer_cov_trace_pc().
+ * In turns kcov_remote_stop() clears those pointers from task_struct to stop
+ * collecting coverage and copies all collected coverage into the kcov area.
+ */
+
+static inline bool kcov_mode_enabled(unsigned int mode)
+{
+ return (mode & ~KCOV_IN_CTXSW) != KCOV_MODE_DISABLED;
+}
+
+static void kcov_remote_softirq_start(struct task_struct *t)
+{
+ struct kcov_percpu_data *data = this_cpu_ptr(&kcov_percpu_data);
+ unsigned int mode;
+
+ mode = READ_ONCE(t->kcov_mode);
+ barrier();
+ if (kcov_mode_enabled(mode)) {
+ data->saved_mode = mode;
+ data->saved_size = t->kcov_size;
+ data->saved_area = t->kcov_area;
+ data->saved_sequence = t->kcov_sequence;
+ data->saved_kcov = t->kcov;
+ kcov_stop(t);
+ }
+}
+
+static void kcov_remote_softirq_stop(struct task_struct *t)
+{
+ struct kcov_percpu_data *data = this_cpu_ptr(&kcov_percpu_data);
+
+ if (data->saved_kcov) {
+ kcov_start(t, data->saved_kcov, data->saved_size,
+ data->saved_area, data->saved_mode,
+ data->saved_sequence);
+ data->saved_mode = 0;
+ data->saved_size = 0;
+ data->saved_area = NULL;
+ data->saved_sequence = 0;
+ data->saved_kcov = NULL;
+ }
+}
+
+void kcov_remote_start(u64 handle)
+{
+ struct task_struct *t = current;
+ struct kcov_remote *remote;
+ struct kcov *kcov;
+ unsigned int mode;
+ void *area;
+ unsigned int size;
+ int sequence;
+ unsigned long flags;
+
+ if (WARN_ON(!kcov_check_handle(handle, true, true, true)))
+ return;
+ if (!in_task() && !in_softirq_really())
+ return;
+
+ local_lock_irqsave(&kcov_percpu_data.lock, flags);
+
+ /*
+ * Check that kcov_remote_start() is not called twice in background
+ * threads nor called by user tasks (with enabled kcov).
+ */
+ mode = READ_ONCE(t->kcov_mode);
+ if (WARN_ON(in_task() && kcov_mode_enabled(mode))) {
+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
+ return;
+ }
+ /*
+ * Check that kcov_remote_start() is not called twice in softirqs.
+ * Note, that kcov_remote_start() can be called from a softirq that
+ * happened while collecting coverage from a background thread.
+ */
+ if (WARN_ON(in_serving_softirq() && t->kcov_softirq)) {
+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
+ return;
+ }
+
+ spin_lock(&kcov_remote_lock);
+ remote = kcov_remote_find(handle);
+ if (!remote) {
+ spin_unlock(&kcov_remote_lock);
+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
+ return;
+ }
+ kcov_debug("handle = %llx, context: %s\n", handle,
+ in_task() ? "task" : "softirq");
+ kcov = remote->kcov;
+ /* Put in kcov_remote_stop(). */
+ kcov_get(kcov);
+ /*
+ * Read kcov fields before unlock to prevent races with
+ * KCOV_DISABLE / kcov_remote_reset().
+ */
+ mode = kcov->mode;
+ sequence = kcov->sequence;
+ if (in_task()) {
+ size = kcov->remote_size;
+ area = kcov_remote_area_get(size);
+ } else {
+ size = CONFIG_KCOV_IRQ_AREA_SIZE;
+ area = this_cpu_ptr(&kcov_percpu_data)->irq_area;
+ }
+ spin_unlock(&kcov_remote_lock);
+
+ /* Can only happen when in_task(). */
+ if (!area) {
+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
+ area = vmalloc(size * sizeof(unsigned long));
+ if (!area) {
+ kcov_put(kcov);
+ return;
+ }
+ local_lock_irqsave(&kcov_percpu_data.lock, flags);
+ }
+
+ /* Reset coverage size. */
+ *(u64 *)area = 0;
+
+ if (in_serving_softirq()) {
+ kcov_remote_softirq_start(t);
+ t->kcov_softirq = 1;
+ }
+ kcov_start(t, kcov, size, area, mode, sequence);
+
+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
+
+}
+EXPORT_SYMBOL(kcov_remote_start);
+
+static void kcov_move_area(enum kcov_mode mode, void *dst_area,
+ unsigned int dst_area_size, void *src_area)
+{
+ u64 word_size = sizeof(unsigned long);
+ u64 count_size, entry_size_log;
+ u64 dst_len, src_len;
+ void *dst_entries, *src_entries;
+ u64 dst_occupied, dst_free, bytes_to_move, entries_moved;
+
+ kcov_debug("%px %u <= %px %lu\n",
+ dst_area, dst_area_size, src_area, *(unsigned long *)src_area);
+
+ switch (mode) {
+ case KCOV_MODE_TRACE_PC:
+ dst_len = READ_ONCE(*(unsigned long *)dst_area);
+ src_len = *(unsigned long *)src_area;
+ count_size = sizeof(unsigned long);
+ entry_size_log = __ilog2_u64(sizeof(unsigned long));
+ break;
+ case KCOV_MODE_TRACE_CMP:
+ dst_len = READ_ONCE(*(u64 *)dst_area);
+ src_len = *(u64 *)src_area;
+ count_size = sizeof(u64);
+ BUILD_BUG_ON(!is_power_of_2(KCOV_WORDS_PER_CMP));
+ entry_size_log = __ilog2_u64(sizeof(u64) * KCOV_WORDS_PER_CMP);
+ break;
+ default:
+ WARN_ON(1);
+ return;
+ }
+
+ /* As arm can't divide u64 integers use log of entry size. */
+ if (dst_len > ((dst_area_size * word_size - count_size) >>
+ entry_size_log))
+ return;
+ dst_occupied = count_size + (dst_len << entry_size_log);
+ dst_free = dst_area_size * word_size - dst_occupied;
+ bytes_to_move = min(dst_free, src_len << entry_size_log);
+ dst_entries = dst_area + dst_occupied;
+ src_entries = src_area + count_size;
+ memcpy(dst_entries, src_entries, bytes_to_move);
+ entries_moved = bytes_to_move >> entry_size_log;
+
+ /*
+ * A write memory barrier is required here, to ensure
+ * that the writes from the memcpy() are visible before
+ * the count is updated. Without this, it is possible for
+ * a user to observe a new count value but stale
+ * coverage data.
+ */
+ smp_wmb();
+
+ switch (mode) {
+ case KCOV_MODE_TRACE_PC:
+ WRITE_ONCE(*(unsigned long *)dst_area, dst_len + entries_moved);
+ break;
+ case KCOV_MODE_TRACE_CMP:
+ WRITE_ONCE(*(u64 *)dst_area, dst_len + entries_moved);
+ break;
+ default:
+ break;
+ }
+}
+
+/* See the comment before kcov_remote_start() for usage details. */
+void kcov_remote_stop(void)
+{
+ struct task_struct *t = current;
+ struct kcov *kcov;
+ unsigned int mode;
+ void *area;
+ unsigned int size;
+ int sequence;
+ unsigned long flags;
+
+ if (!in_task() && !in_softirq_really())
+ return;
+
+ local_lock_irqsave(&kcov_percpu_data.lock, flags);
+
+ mode = READ_ONCE(t->kcov_mode);
+ barrier();
+ if (!kcov_mode_enabled(mode)) {
+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
+ return;
+ }
+ /*
+ * When in softirq, check if the corresponding kcov_remote_start()
+ * actually found the remote handle and started collecting coverage.
+ */
+ if (in_serving_softirq() && !t->kcov_softirq) {
+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
+ return;
+ }
+ /* Make sure that kcov_softirq is only set when in softirq. */
+ if (WARN_ON(!in_serving_softirq() && t->kcov_softirq)) {
+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
+ return;
+ }
+
+ kcov = t->kcov;
+ area = t->kcov_area;
+ size = t->kcov_size;
+ sequence = t->kcov_sequence;
+
+ kcov_stop(t);
+ if (in_serving_softirq()) {
+ t->kcov_softirq = 0;
+ kcov_remote_softirq_stop(t);
+ }
+
+ spin_lock(&kcov->lock);
+ /*
+ * KCOV_DISABLE could have been called between kcov_remote_start()
+ * and kcov_remote_stop(), hence the sequence check.
+ */
+ if (sequence == kcov->sequence && kcov->remote)
+ kcov_move_area(kcov->mode, kcov->area, kcov->size, area);
+ spin_unlock(&kcov->lock);
+
+ if (in_task()) {
+ spin_lock(&kcov_remote_lock);
+ kcov_remote_area_put(area, size);
+ spin_unlock(&kcov_remote_lock);
+ }
+
+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
+
+ /* Get in kcov_remote_start(). */
+ kcov_put(kcov);
+}
+EXPORT_SYMBOL(kcov_remote_stop);
+
+/* See the comment before kcov_remote_start() for usage details. */
+u64 kcov_common_handle(void)
+{
+ if (!in_task())
+ return 0;
+ return current->kcov_handle;
+}
+EXPORT_SYMBOL(kcov_common_handle);
+
+#ifdef CONFIG_KCOV_SELFTEST
+static void __init selftest(void)
+{
+ unsigned long start;
+
+ pr_err("running self test\n");
+ /*
+ * Test that interrupts don't produce spurious coverage.
+ * The coverage callback filters out interrupt code, but only
+ * after the handler updates preempt count. Some code periodically
+ * leaks out of that section and leads to spurious coverage.
+ * It's hard to call the actual interrupt handler directly,
+ * so we just loop here for a bit waiting for a timer interrupt.
+ * We set kcov_mode to enable tracing, but don't setup the area,
+ * so any attempt to trace will crash. Note: we must not call any
+ * potentially traced functions in this region.
+ */
+ start = jiffies;
+ current->kcov_mode = KCOV_MODE_TRACE_PC;
+ while ((jiffies - start) * MSEC_PER_SEC / HZ < 300)
+ ;
+ current->kcov_mode = 0;
+ pr_err("done running self test\n");
+}
+#endif
+
+static int __init kcov_init(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ void *area = vmalloc_node(CONFIG_KCOV_IRQ_AREA_SIZE *
+ sizeof(unsigned long), cpu_to_node(cpu));
+ if (!area)
+ return -ENOMEM;
+ per_cpu_ptr(&kcov_percpu_data, cpu)->irq_area = area;
+ }
+
+ /*
+ * The kcov debugfs file won't ever get removed and thus,
+ * there is no need to protect it against removal races. The
+ * use of debugfs_create_file_unsafe() is actually safe here.
+ */
+ debugfs_create_file_unsafe("kcov", 0600, NULL, NULL, &kcov_fops);
+
+#ifdef CONFIG_KCOV_SELFTEST
+ selftest();
+#endif
+
+ return 0;
+}
+
+device_initcall(kcov_init);
diff --git a/kernel/kcsan/.kunitconfig b/kernel/kcsan/.kunitconfig
new file mode 100644
index 000000000000..e82f0f52ab0a
--- /dev/null
+++ b/kernel/kcsan/.kunitconfig
@@ -0,0 +1,24 @@
+# Note that the KCSAN tests need to run on an SMP setup.
+# Under kunit_tool, this can be done by using the --qemu_args
+# option to configure a machine with several cores. For example:
+# ./tools/testing/kunit/kunit.py run --kunitconfig=kernel/kcsan \
+# --arch=x86_64 --qemu_args="-smp 8"
+
+CONFIG_KUNIT=y
+
+CONFIG_DEBUG_KERNEL=y
+
+# Need some level of concurrency to test a concurrency sanitizer.
+CONFIG_SMP=y
+
+CONFIG_KCSAN=y
+CONFIG_KCSAN_KUNIT_TEST=y
+
+# Set these if you want to run test_barrier_nothreads
+#CONFIG_KCSAN_STRICT=y
+#CONFIG_KCSAN_WEAK_MEMORY=y
+
+# This prevents the test from timing out on many setups. Feel free to remove
+# (or alter) this, in conjunction with setting a different test timeout with,
+# for example, the --timeout kunit_tool option.
+CONFIG_KCSAN_REPORT_ONCE_IN_MS=100
diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile
new file mode 100644
index 000000000000..a45f3dfc8d14
--- /dev/null
+++ b/kernel/kcsan/Makefile
@@ -0,0 +1,21 @@
+# SPDX-License-Identifier: GPL-2.0
+KCSAN_SANITIZE := n
+KCOV_INSTRUMENT := n
+UBSAN_SANITIZE := n
+
+CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_debugfs.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_report.o = $(CC_FLAGS_FTRACE)
+
+CFLAGS_core.o := $(call cc-option,-fno-conserve-stack) \
+ $(call cc-option,-mno-outline-atomics) \
+ -fno-stack-protector -DDISABLE_BRANCH_PROFILING
+
+obj-y := core.o debugfs.o report.o
+
+KCSAN_INSTRUMENT_BARRIERS_selftest.o := y
+obj-$(CONFIG_KCSAN_SELFTEST) += selftest.o
+
+CFLAGS_kcsan_test.o := $(CFLAGS_KCSAN) -fno-omit-frame-pointer
+CFLAGS_kcsan_test.o += $(DISABLE_STRUCTLEAK_PLUGIN)
+obj-$(CONFIG_KCSAN_KUNIT_TEST) += kcsan_test.o
diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c
new file mode 100644
index 000000000000..8a7baf4e332e
--- /dev/null
+++ b/kernel/kcsan/core.c
@@ -0,0 +1,1371 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KCSAN core runtime.
+ *
+ * Copyright (C) 2019, Google LLC.
+ */
+
+#define pr_fmt(fmt) "kcsan: " fmt
+
+#include <linux/atomic.h>
+#include <linux/bug.h>
+#include <linux/delay.h>
+#include <linux/export.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/minmax.h>
+#include <linux/moduleparam.h>
+#include <linux/percpu.h>
+#include <linux/preempt.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/uaccess.h>
+
+#include "encoding.h"
+#include "kcsan.h"
+#include "permissive.h"
+
+static bool kcsan_early_enable = IS_ENABLED(CONFIG_KCSAN_EARLY_ENABLE);
+unsigned int kcsan_udelay_task = CONFIG_KCSAN_UDELAY_TASK;
+unsigned int kcsan_udelay_interrupt = CONFIG_KCSAN_UDELAY_INTERRUPT;
+static long kcsan_skip_watch = CONFIG_KCSAN_SKIP_WATCH;
+static bool kcsan_interrupt_watcher = IS_ENABLED(CONFIG_KCSAN_INTERRUPT_WATCHER);
+
+#ifdef MODULE_PARAM_PREFIX
+#undef MODULE_PARAM_PREFIX
+#endif
+#define MODULE_PARAM_PREFIX "kcsan."
+module_param_named(early_enable, kcsan_early_enable, bool, 0);
+module_param_named(udelay_task, kcsan_udelay_task, uint, 0644);
+module_param_named(udelay_interrupt, kcsan_udelay_interrupt, uint, 0644);
+module_param_named(skip_watch, kcsan_skip_watch, long, 0644);
+module_param_named(interrupt_watcher, kcsan_interrupt_watcher, bool, 0444);
+
+#ifdef CONFIG_KCSAN_WEAK_MEMORY
+static bool kcsan_weak_memory = true;
+module_param_named(weak_memory, kcsan_weak_memory, bool, 0644);
+#else
+#define kcsan_weak_memory false
+#endif
+
+bool kcsan_enabled;
+
+/* Per-CPU kcsan_ctx for interrupts */
+static DEFINE_PER_CPU(struct kcsan_ctx, kcsan_cpu_ctx) = {
+ .scoped_accesses = {LIST_POISON1, NULL},
+};
+
+/*
+ * Helper macros to index into adjacent slots, starting from address slot
+ * itself, followed by the right and left slots.
+ *
+ * The purpose is 2-fold:
+ *
+ * 1. if during insertion the address slot is already occupied, check if
+ * any adjacent slots are free;
+ * 2. accesses that straddle a slot boundary due to size that exceeds a
+ * slot's range may check adjacent slots if any watchpoint matches.
+ *
+ * Note that accesses with very large size may still miss a watchpoint; however,
+ * given this should be rare, this is a reasonable trade-off to make, since this
+ * will avoid:
+ *
+ * 1. excessive contention between watchpoint checks and setup;
+ * 2. larger number of simultaneous watchpoints without sacrificing
+ * performance.
+ *
+ * Example: SLOT_IDX values for KCSAN_CHECK_ADJACENT=1, where i is [0, 1, 2]:
+ *
+ * slot=0: [ 1, 2, 0]
+ * slot=9: [10, 11, 9]
+ * slot=63: [64, 65, 63]
+ */
+#define SLOT_IDX(slot, i) (slot + ((i + KCSAN_CHECK_ADJACENT) % NUM_SLOTS))
+
+/*
+ * SLOT_IDX_FAST is used in the fast-path. Not first checking the address's primary
+ * slot (middle) is fine if we assume that races occur rarely. The set of
+ * indices {SLOT_IDX(slot, i) | i in [0, NUM_SLOTS)} is equivalent to
+ * {SLOT_IDX_FAST(slot, i) | i in [0, NUM_SLOTS)}.
+ */
+#define SLOT_IDX_FAST(slot, i) (slot + i)
+
+/*
+ * Watchpoints, with each entry encoded as defined in encoding.h: in order to be
+ * able to safely update and access a watchpoint without introducing locking
+ * overhead, we encode each watchpoint as a single atomic long. The initial
+ * zero-initialized state matches INVALID_WATCHPOINT.
+ *
+ * Add NUM_SLOTS-1 entries to account for overflow; this helps avoid having to
+ * use more complicated SLOT_IDX_FAST calculation with modulo in the fast-path.
+ */
+static atomic_long_t watchpoints[CONFIG_KCSAN_NUM_WATCHPOINTS + NUM_SLOTS-1];
+
+/*
+ * Instructions to skip watching counter, used in should_watch(). We use a
+ * per-CPU counter to avoid excessive contention.
+ */
+static DEFINE_PER_CPU(long, kcsan_skip);
+
+/* For kcsan_prandom_u32_max(). */
+static DEFINE_PER_CPU(u32, kcsan_rand_state);
+
+static __always_inline atomic_long_t *find_watchpoint(unsigned long addr,
+ size_t size,
+ bool expect_write,
+ long *encoded_watchpoint)
+{
+ const int slot = watchpoint_slot(addr);
+ const unsigned long addr_masked = addr & WATCHPOINT_ADDR_MASK;
+ atomic_long_t *watchpoint;
+ unsigned long wp_addr_masked;
+ size_t wp_size;
+ bool is_write;
+ int i;
+
+ BUILD_BUG_ON(CONFIG_KCSAN_NUM_WATCHPOINTS < NUM_SLOTS);
+
+ for (i = 0; i < NUM_SLOTS; ++i) {
+ watchpoint = &watchpoints[SLOT_IDX_FAST(slot, i)];
+ *encoded_watchpoint = atomic_long_read(watchpoint);
+ if (!decode_watchpoint(*encoded_watchpoint, &wp_addr_masked,
+ &wp_size, &is_write))
+ continue;
+
+ if (expect_write && !is_write)
+ continue;
+
+ /* Check if the watchpoint matches the access. */
+ if (matching_access(wp_addr_masked, wp_size, addr_masked, size))
+ return watchpoint;
+ }
+
+ return NULL;
+}
+
+static inline atomic_long_t *
+insert_watchpoint(unsigned long addr, size_t size, bool is_write)
+{
+ const int slot = watchpoint_slot(addr);
+ const long encoded_watchpoint = encode_watchpoint(addr, size, is_write);
+ atomic_long_t *watchpoint;
+ int i;
+
+ /* Check slot index logic, ensuring we stay within array bounds. */
+ BUILD_BUG_ON(SLOT_IDX(0, 0) != KCSAN_CHECK_ADJACENT);
+ BUILD_BUG_ON(SLOT_IDX(0, KCSAN_CHECK_ADJACENT+1) != 0);
+ BUILD_BUG_ON(SLOT_IDX(CONFIG_KCSAN_NUM_WATCHPOINTS-1, KCSAN_CHECK_ADJACENT) != ARRAY_SIZE(watchpoints)-1);
+ BUILD_BUG_ON(SLOT_IDX(CONFIG_KCSAN_NUM_WATCHPOINTS-1, KCSAN_CHECK_ADJACENT+1) != ARRAY_SIZE(watchpoints) - NUM_SLOTS);
+
+ for (i = 0; i < NUM_SLOTS; ++i) {
+ long expect_val = INVALID_WATCHPOINT;
+
+ /* Try to acquire this slot. */
+ watchpoint = &watchpoints[SLOT_IDX(slot, i)];
+ if (atomic_long_try_cmpxchg_relaxed(watchpoint, &expect_val, encoded_watchpoint))
+ return watchpoint;
+ }
+
+ return NULL;
+}
+
+/*
+ * Return true if watchpoint was successfully consumed, false otherwise.
+ *
+ * This may return false if:
+ *
+ * 1. another thread already consumed the watchpoint;
+ * 2. the thread that set up the watchpoint already removed it;
+ * 3. the watchpoint was removed and then re-used.
+ */
+static __always_inline bool
+try_consume_watchpoint(atomic_long_t *watchpoint, long encoded_watchpoint)
+{
+ return atomic_long_try_cmpxchg_relaxed(watchpoint, &encoded_watchpoint, CONSUMED_WATCHPOINT);
+}
+
+/* Return true if watchpoint was not touched, false if already consumed. */
+static inline bool consume_watchpoint(atomic_long_t *watchpoint)
+{
+ return atomic_long_xchg_relaxed(watchpoint, CONSUMED_WATCHPOINT) != CONSUMED_WATCHPOINT;
+}
+
+/* Remove the watchpoint -- its slot may be reused after. */
+static inline void remove_watchpoint(atomic_long_t *watchpoint)
+{
+ atomic_long_set(watchpoint, INVALID_WATCHPOINT);
+}
+
+static __always_inline struct kcsan_ctx *get_ctx(void)
+{
+ /*
+ * In interrupts, use raw_cpu_ptr to avoid unnecessary checks, that would
+ * also result in calls that generate warnings in uaccess regions.
+ */
+ return in_task() ? &current->kcsan_ctx : raw_cpu_ptr(&kcsan_cpu_ctx);
+}
+
+static __always_inline void
+check_access(const volatile void *ptr, size_t size, int type, unsigned long ip);
+
+/* Check scoped accesses; never inline because this is a slow-path! */
+static noinline void kcsan_check_scoped_accesses(void)
+{
+ struct kcsan_ctx *ctx = get_ctx();
+ struct kcsan_scoped_access *scoped_access;
+
+ if (ctx->disable_scoped)
+ return;
+
+ ctx->disable_scoped++;
+ list_for_each_entry(scoped_access, &ctx->scoped_accesses, list) {
+ check_access(scoped_access->ptr, scoped_access->size,
+ scoped_access->type, scoped_access->ip);
+ }
+ ctx->disable_scoped--;
+}
+
+/* Rules for generic atomic accesses. Called from fast-path. */
+static __always_inline bool
+is_atomic(struct kcsan_ctx *ctx, const volatile void *ptr, size_t size, int type)
+{
+ if (type & KCSAN_ACCESS_ATOMIC)
+ return true;
+
+ /*
+ * Unless explicitly declared atomic, never consider an assertion access
+ * as atomic. This allows using them also in atomic regions, such as
+ * seqlocks, without implicitly changing their semantics.
+ */
+ if (type & KCSAN_ACCESS_ASSERT)
+ return false;
+
+ if (IS_ENABLED(CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC) &&
+ (type & KCSAN_ACCESS_WRITE) && size <= sizeof(long) &&
+ !(type & KCSAN_ACCESS_COMPOUND) && IS_ALIGNED((unsigned long)ptr, size))
+ return true; /* Assume aligned writes up to word size are atomic. */
+
+ if (ctx->atomic_next > 0) {
+ /*
+ * Because we do not have separate contexts for nested
+ * interrupts, in case atomic_next is set, we simply assume that
+ * the outer interrupt set atomic_next. In the worst case, we
+ * will conservatively consider operations as atomic. This is a
+ * reasonable trade-off to make, since this case should be
+ * extremely rare; however, even if extremely rare, it could
+ * lead to false positives otherwise.
+ */
+ if ((hardirq_count() >> HARDIRQ_SHIFT) < 2)
+ --ctx->atomic_next; /* in task, or outer interrupt */
+ return true;
+ }
+
+ return ctx->atomic_nest_count > 0 || ctx->in_flat_atomic;
+}
+
+static __always_inline bool
+should_watch(struct kcsan_ctx *ctx, const volatile void *ptr, size_t size, int type)
+{
+ /*
+ * Never set up watchpoints when memory operations are atomic.
+ *
+ * Need to check this first, before kcsan_skip check below: (1) atomics
+ * should not count towards skipped instructions, and (2) to actually
+ * decrement kcsan_atomic_next for consecutive instruction stream.
+ */
+ if (is_atomic(ctx, ptr, size, type))
+ return false;
+
+ if (this_cpu_dec_return(kcsan_skip) >= 0)
+ return false;
+
+ /*
+ * NOTE: If we get here, kcsan_skip must always be reset in slow path
+ * via reset_kcsan_skip() to avoid underflow.
+ */
+
+ /* this operation should be watched */
+ return true;
+}
+
+/*
+ * Returns a pseudo-random number in interval [0, ep_ro). Simple linear
+ * congruential generator, using constants from "Numerical Recipes".
+ */
+static u32 kcsan_prandom_u32_max(u32 ep_ro)
+{
+ u32 state = this_cpu_read(kcsan_rand_state);
+
+ state = 1664525 * state + 1013904223;
+ this_cpu_write(kcsan_rand_state, state);
+
+ return state % ep_ro;
+}
+
+static inline void reset_kcsan_skip(void)
+{
+ long skip_count = kcsan_skip_watch -
+ (IS_ENABLED(CONFIG_KCSAN_SKIP_WATCH_RANDOMIZE) ?
+ kcsan_prandom_u32_max(kcsan_skip_watch) :
+ 0);
+ this_cpu_write(kcsan_skip, skip_count);
+}
+
+static __always_inline bool kcsan_is_enabled(struct kcsan_ctx *ctx)
+{
+ return READ_ONCE(kcsan_enabled) && !ctx->disable_count;
+}
+
+/* Introduce delay depending on context and configuration. */
+static void delay_access(int type)
+{
+ unsigned int delay = in_task() ? kcsan_udelay_task : kcsan_udelay_interrupt;
+ /* For certain access types, skew the random delay to be longer. */
+ unsigned int skew_delay_order =
+ (type & (KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_ASSERT)) ? 1 : 0;
+
+ delay -= IS_ENABLED(CONFIG_KCSAN_DELAY_RANDOMIZE) ?
+ kcsan_prandom_u32_max(delay >> skew_delay_order) :
+ 0;
+ udelay(delay);
+}
+
+/*
+ * Reads the instrumented memory for value change detection; value change
+ * detection is currently done for accesses up to a size of 8 bytes.
+ */
+static __always_inline u64 read_instrumented_memory(const volatile void *ptr, size_t size)
+{
+ /*
+ * In the below we don't necessarily need the read of the location to
+ * be atomic, and we don't use READ_ONCE(), since all we need for race
+ * detection is to observe 2 different values.
+ *
+ * Furthermore, on certain architectures (such as arm64), READ_ONCE()
+ * may turn into more complex instructions than a plain load that cannot
+ * do unaligned accesses.
+ */
+ switch (size) {
+ case 1: return *(const volatile u8 *)ptr;
+ case 2: return *(const volatile u16 *)ptr;
+ case 4: return *(const volatile u32 *)ptr;
+ case 8: return *(const volatile u64 *)ptr;
+ default: return 0; /* Ignore; we do not diff the values. */
+ }
+}
+
+void kcsan_save_irqtrace(struct task_struct *task)
+{
+#ifdef CONFIG_TRACE_IRQFLAGS
+ task->kcsan_save_irqtrace = task->irqtrace;
+#endif
+}
+
+void kcsan_restore_irqtrace(struct task_struct *task)
+{
+#ifdef CONFIG_TRACE_IRQFLAGS
+ task->irqtrace = task->kcsan_save_irqtrace;
+#endif
+}
+
+static __always_inline int get_kcsan_stack_depth(void)
+{
+#ifdef CONFIG_KCSAN_WEAK_MEMORY
+ return current->kcsan_stack_depth;
+#else
+ BUILD_BUG();
+ return 0;
+#endif
+}
+
+static __always_inline void add_kcsan_stack_depth(int val)
+{
+#ifdef CONFIG_KCSAN_WEAK_MEMORY
+ current->kcsan_stack_depth += val;
+#else
+ BUILD_BUG();
+#endif
+}
+
+static __always_inline struct kcsan_scoped_access *get_reorder_access(struct kcsan_ctx *ctx)
+{
+#ifdef CONFIG_KCSAN_WEAK_MEMORY
+ return ctx->disable_scoped ? NULL : &ctx->reorder_access;
+#else
+ return NULL;
+#endif
+}
+
+static __always_inline bool
+find_reorder_access(struct kcsan_ctx *ctx, const volatile void *ptr, size_t size,
+ int type, unsigned long ip)
+{
+ struct kcsan_scoped_access *reorder_access = get_reorder_access(ctx);
+
+ if (!reorder_access)
+ return false;
+
+ /*
+ * Note: If accesses are repeated while reorder_access is identical,
+ * never matches the new access, because !(type & KCSAN_ACCESS_SCOPED).
+ */
+ return reorder_access->ptr == ptr && reorder_access->size == size &&
+ reorder_access->type == type && reorder_access->ip == ip;
+}
+
+static inline void
+set_reorder_access(struct kcsan_ctx *ctx, const volatile void *ptr, size_t size,
+ int type, unsigned long ip)
+{
+ struct kcsan_scoped_access *reorder_access = get_reorder_access(ctx);
+
+ if (!reorder_access || !kcsan_weak_memory)
+ return;
+
+ /*
+ * To avoid nested interrupts or scheduler (which share kcsan_ctx)
+ * reading an inconsistent reorder_access, ensure that the below has
+ * exclusive access to reorder_access by disallowing concurrent use.
+ */
+ ctx->disable_scoped++;
+ barrier();
+ reorder_access->ptr = ptr;
+ reorder_access->size = size;
+ reorder_access->type = type | KCSAN_ACCESS_SCOPED;
+ reorder_access->ip = ip;
+ reorder_access->stack_depth = get_kcsan_stack_depth();
+ barrier();
+ ctx->disable_scoped--;
+}
+
+/*
+ * Pull everything together: check_access() below contains the performance
+ * critical operations; the fast-path (including check_access) functions should
+ * all be inlinable by the instrumentation functions.
+ *
+ * The slow-path (kcsan_found_watchpoint, kcsan_setup_watchpoint) are
+ * non-inlinable -- note that, we prefix these with "kcsan_" to ensure they can
+ * be filtered from the stacktrace, as well as give them unique names for the
+ * UACCESS whitelist of objtool. Each function uses user_access_save/restore(),
+ * since they do not access any user memory, but instrumentation is still
+ * emitted in UACCESS regions.
+ */
+
+static noinline void kcsan_found_watchpoint(const volatile void *ptr,
+ size_t size,
+ int type,
+ unsigned long ip,
+ atomic_long_t *watchpoint,
+ long encoded_watchpoint)
+{
+ const bool is_assert = (type & KCSAN_ACCESS_ASSERT) != 0;
+ struct kcsan_ctx *ctx = get_ctx();
+ unsigned long flags;
+ bool consumed;
+
+ /*
+ * We know a watchpoint exists. Let's try to keep the race-window
+ * between here and finally consuming the watchpoint below as small as
+ * possible -- avoid unneccessarily complex code until consumed.
+ */
+
+ if (!kcsan_is_enabled(ctx))
+ return;
+
+ /*
+ * The access_mask check relies on value-change comparison. To avoid
+ * reporting a race where e.g. the writer set up the watchpoint, but the
+ * reader has access_mask!=0, we have to ignore the found watchpoint.
+ *
+ * reorder_access is never created from an access with access_mask set.
+ */
+ if (ctx->access_mask && !find_reorder_access(ctx, ptr, size, type, ip))
+ return;
+
+ /*
+ * If the other thread does not want to ignore the access, and there was
+ * a value change as a result of this thread's operation, we will still
+ * generate a report of unknown origin.
+ *
+ * Use CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN=n to filter.
+ */
+ if (!is_assert && kcsan_ignore_address(ptr))
+ return;
+
+ /*
+ * Consuming the watchpoint must be guarded by kcsan_is_enabled() to
+ * avoid erroneously triggering reports if the context is disabled.
+ */
+ consumed = try_consume_watchpoint(watchpoint, encoded_watchpoint);
+
+ /* keep this after try_consume_watchpoint */
+ flags = user_access_save();
+
+ if (consumed) {
+ kcsan_save_irqtrace(current);
+ kcsan_report_set_info(ptr, size, type, ip, watchpoint - watchpoints);
+ kcsan_restore_irqtrace(current);
+ } else {
+ /*
+ * The other thread may not print any diagnostics, as it has
+ * already removed the watchpoint, or another thread consumed
+ * the watchpoint before this thread.
+ */
+ atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_REPORT_RACES]);
+ }
+
+ if (is_assert)
+ atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_ASSERT_FAILURES]);
+ else
+ atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_DATA_RACES]);
+
+ user_access_restore(flags);
+}
+
+static noinline void
+kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type, unsigned long ip)
+{
+ const bool is_write = (type & KCSAN_ACCESS_WRITE) != 0;
+ const bool is_assert = (type & KCSAN_ACCESS_ASSERT) != 0;
+ atomic_long_t *watchpoint;
+ u64 old, new, diff;
+ enum kcsan_value_change value_change = KCSAN_VALUE_CHANGE_MAYBE;
+ bool interrupt_watcher = kcsan_interrupt_watcher;
+ unsigned long ua_flags = user_access_save();
+ struct kcsan_ctx *ctx = get_ctx();
+ unsigned long access_mask = ctx->access_mask;
+ unsigned long irq_flags = 0;
+ bool is_reorder_access;
+
+ /*
+ * Always reset kcsan_skip counter in slow-path to avoid underflow; see
+ * should_watch().
+ */
+ reset_kcsan_skip();
+
+ if (!kcsan_is_enabled(ctx))
+ goto out;
+
+ /*
+ * Check to-ignore addresses after kcsan_is_enabled(), as we may access
+ * memory that is not yet initialized during early boot.
+ */
+ if (!is_assert && kcsan_ignore_address(ptr))
+ goto out;
+
+ if (!check_encodable((unsigned long)ptr, size)) {
+ atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_UNENCODABLE_ACCESSES]);
+ goto out;
+ }
+
+ /*
+ * The local CPU cannot observe reordering of its own accesses, and
+ * therefore we need to take care of 2 cases to avoid false positives:
+ *
+ * 1. Races of the reordered access with interrupts. To avoid, if
+ * the current access is reorder_access, disable interrupts.
+ * 2. Avoid races of scoped accesses from nested interrupts (below).
+ */
+ is_reorder_access = find_reorder_access(ctx, ptr, size, type, ip);
+ if (is_reorder_access)
+ interrupt_watcher = false;
+ /*
+ * Avoid races of scoped accesses from nested interrupts (or scheduler).
+ * Assume setting up a watchpoint for a non-scoped (normal) access that
+ * also conflicts with a current scoped access. In a nested interrupt,
+ * which shares the context, it would check a conflicting scoped access.
+ * To avoid, disable scoped access checking.
+ */
+ ctx->disable_scoped++;
+
+ /*
+ * Save and restore the IRQ state trace touched by KCSAN, since KCSAN's
+ * runtime is entered for every memory access, and potentially useful
+ * information is lost if dirtied by KCSAN.
+ */
+ kcsan_save_irqtrace(current);
+ if (!interrupt_watcher)
+ local_irq_save(irq_flags);
+
+ watchpoint = insert_watchpoint((unsigned long)ptr, size, is_write);
+ if (watchpoint == NULL) {
+ /*
+ * Out of capacity: the size of 'watchpoints', and the frequency
+ * with which should_watch() returns true should be tweaked so
+ * that this case happens very rarely.
+ */
+ atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_NO_CAPACITY]);
+ goto out_unlock;
+ }
+
+ atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_SETUP_WATCHPOINTS]);
+ atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_USED_WATCHPOINTS]);
+
+ /*
+ * Read the current value, to later check and infer a race if the data
+ * was modified via a non-instrumented access, e.g. from a device.
+ */
+ old = is_reorder_access ? 0 : read_instrumented_memory(ptr, size);
+
+ /*
+ * Delay this thread, to increase probability of observing a racy
+ * conflicting access.
+ */
+ delay_access(type);
+
+ /*
+ * Re-read value, and check if it is as expected; if not, we infer a
+ * racy access.
+ */
+ if (!is_reorder_access) {
+ new = read_instrumented_memory(ptr, size);
+ } else {
+ /*
+ * Reordered accesses cannot be used for value change detection,
+ * because the memory location may no longer be accessible and
+ * could result in a fault.
+ */
+ new = 0;
+ access_mask = 0;
+ }
+
+ diff = old ^ new;
+ if (access_mask)
+ diff &= access_mask;
+
+ /*
+ * Check if we observed a value change.
+ *
+ * Also check if the data race should be ignored (the rules depend on
+ * non-zero diff); if it is to be ignored, the below rules for
+ * KCSAN_VALUE_CHANGE_MAYBE apply.
+ */
+ if (diff && !kcsan_ignore_data_race(size, type, old, new, diff))
+ value_change = KCSAN_VALUE_CHANGE_TRUE;
+
+ /* Check if this access raced with another. */
+ if (!consume_watchpoint(watchpoint)) {
+ /*
+ * Depending on the access type, map a value_change of MAYBE to
+ * TRUE (always report) or FALSE (never report).
+ */
+ if (value_change == KCSAN_VALUE_CHANGE_MAYBE) {
+ if (access_mask != 0) {
+ /*
+ * For access with access_mask, we require a
+ * value-change, as it is likely that races on
+ * ~access_mask bits are expected.
+ */
+ value_change = KCSAN_VALUE_CHANGE_FALSE;
+ } else if (size > 8 || is_assert) {
+ /* Always assume a value-change. */
+ value_change = KCSAN_VALUE_CHANGE_TRUE;
+ }
+ }
+
+ /*
+ * No need to increment 'data_races' counter, as the racing
+ * thread already did.
+ *
+ * Count 'assert_failures' for each failed ASSERT access,
+ * therefore both this thread and the racing thread may
+ * increment this counter.
+ */
+ if (is_assert && value_change == KCSAN_VALUE_CHANGE_TRUE)
+ atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_ASSERT_FAILURES]);
+
+ kcsan_report_known_origin(ptr, size, type, ip,
+ value_change, watchpoint - watchpoints,
+ old, new, access_mask);
+ } else if (value_change == KCSAN_VALUE_CHANGE_TRUE) {
+ /* Inferring a race, since the value should not have changed. */
+
+ atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN]);
+ if (is_assert)
+ atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_ASSERT_FAILURES]);
+
+ if (IS_ENABLED(CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN) || is_assert) {
+ kcsan_report_unknown_origin(ptr, size, type, ip,
+ old, new, access_mask);
+ }
+ }
+
+ /*
+ * Remove watchpoint; must be after reporting, since the slot may be
+ * reused after this point.
+ */
+ remove_watchpoint(watchpoint);
+ atomic_long_dec(&kcsan_counters[KCSAN_COUNTER_USED_WATCHPOINTS]);
+
+out_unlock:
+ if (!interrupt_watcher)
+ local_irq_restore(irq_flags);
+ kcsan_restore_irqtrace(current);
+ ctx->disable_scoped--;
+
+ /*
+ * Reordered accesses cannot be used for value change detection,
+ * therefore never consider for reordering if access_mask is set.
+ * ASSERT_EXCLUSIVE are not real accesses, ignore them as well.
+ */
+ if (!access_mask && !is_assert)
+ set_reorder_access(ctx, ptr, size, type, ip);
+out:
+ user_access_restore(ua_flags);
+}
+
+static __always_inline void
+check_access(const volatile void *ptr, size_t size, int type, unsigned long ip)
+{
+ atomic_long_t *watchpoint;
+ long encoded_watchpoint;
+
+ /*
+ * Do nothing for 0 sized check; this comparison will be optimized out
+ * for constant sized instrumentation (__tsan_{read,write}N).
+ */
+ if (unlikely(size == 0))
+ return;
+
+again:
+ /*
+ * Avoid user_access_save in fast-path: find_watchpoint is safe without
+ * user_access_save, as the address that ptr points to is only used to
+ * check if a watchpoint exists; ptr is never dereferenced.
+ */
+ watchpoint = find_watchpoint((unsigned long)ptr, size,
+ !(type & KCSAN_ACCESS_WRITE),
+ &encoded_watchpoint);
+ /*
+ * It is safe to check kcsan_is_enabled() after find_watchpoint in the
+ * slow-path, as long as no state changes that cause a race to be
+ * detected and reported have occurred until kcsan_is_enabled() is
+ * checked.
+ */
+
+ if (unlikely(watchpoint != NULL))
+ kcsan_found_watchpoint(ptr, size, type, ip, watchpoint, encoded_watchpoint);
+ else {
+ struct kcsan_ctx *ctx = get_ctx(); /* Call only once in fast-path. */
+
+ if (unlikely(should_watch(ctx, ptr, size, type))) {
+ kcsan_setup_watchpoint(ptr, size, type, ip);
+ return;
+ }
+
+ if (!(type & KCSAN_ACCESS_SCOPED)) {
+ struct kcsan_scoped_access *reorder_access = get_reorder_access(ctx);
+
+ if (reorder_access) {
+ /*
+ * reorder_access check: simulates reordering of
+ * the access after subsequent operations.
+ */
+ ptr = reorder_access->ptr;
+ type = reorder_access->type;
+ ip = reorder_access->ip;
+ /*
+ * Upon a nested interrupt, this context's
+ * reorder_access can be modified (shared ctx).
+ * We know that upon return, reorder_access is
+ * always invalidated by setting size to 0 via
+ * __tsan_func_exit(). Therefore we must read
+ * and check size after the other fields.
+ */
+ barrier();
+ size = READ_ONCE(reorder_access->size);
+ if (size)
+ goto again;
+ }
+ }
+
+ /*
+ * Always checked last, right before returning from runtime;
+ * if reorder_access is valid, checked after it was checked.
+ */
+ if (unlikely(ctx->scoped_accesses.prev))
+ kcsan_check_scoped_accesses();
+ }
+}
+
+/* === Public interface ===================================================== */
+
+void __init kcsan_init(void)
+{
+ int cpu;
+
+ BUG_ON(!in_task());
+
+ for_each_possible_cpu(cpu)
+ per_cpu(kcsan_rand_state, cpu) = (u32)get_cycles();
+
+ /*
+ * We are in the init task, and no other tasks should be running;
+ * WRITE_ONCE without memory barrier is sufficient.
+ */
+ if (kcsan_early_enable) {
+ pr_info("enabled early\n");
+ WRITE_ONCE(kcsan_enabled, true);
+ }
+
+ if (IS_ENABLED(CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY) ||
+ IS_ENABLED(CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC) ||
+ IS_ENABLED(CONFIG_KCSAN_PERMISSIVE) ||
+ IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) {
+ pr_warn("non-strict mode configured - use CONFIG_KCSAN_STRICT=y to see all data races\n");
+ } else {
+ pr_info("strict mode configured\n");
+ }
+}
+
+/* === Exported interface =================================================== */
+
+void kcsan_disable_current(void)
+{
+ ++get_ctx()->disable_count;
+}
+EXPORT_SYMBOL(kcsan_disable_current);
+
+void kcsan_enable_current(void)
+{
+ if (get_ctx()->disable_count-- == 0) {
+ /*
+ * Warn if kcsan_enable_current() calls are unbalanced with
+ * kcsan_disable_current() calls, which causes disable_count to
+ * become negative and should not happen.
+ */
+ kcsan_disable_current(); /* restore to 0, KCSAN still enabled */
+ kcsan_disable_current(); /* disable to generate warning */
+ WARN(1, "Unbalanced %s()", __func__);
+ kcsan_enable_current();
+ }
+}
+EXPORT_SYMBOL(kcsan_enable_current);
+
+void kcsan_enable_current_nowarn(void)
+{
+ if (get_ctx()->disable_count-- == 0)
+ kcsan_disable_current();
+}
+EXPORT_SYMBOL(kcsan_enable_current_nowarn);
+
+void kcsan_nestable_atomic_begin(void)
+{
+ /*
+ * Do *not* check and warn if we are in a flat atomic region: nestable
+ * and flat atomic regions are independent from each other.
+ * See include/linux/kcsan.h: struct kcsan_ctx comments for more
+ * comments.
+ */
+
+ ++get_ctx()->atomic_nest_count;
+}
+EXPORT_SYMBOL(kcsan_nestable_atomic_begin);
+
+void kcsan_nestable_atomic_end(void)
+{
+ if (get_ctx()->atomic_nest_count-- == 0) {
+ /*
+ * Warn if kcsan_nestable_atomic_end() calls are unbalanced with
+ * kcsan_nestable_atomic_begin() calls, which causes
+ * atomic_nest_count to become negative and should not happen.
+ */
+ kcsan_nestable_atomic_begin(); /* restore to 0 */
+ kcsan_disable_current(); /* disable to generate warning */
+ WARN(1, "Unbalanced %s()", __func__);
+ kcsan_enable_current();
+ }
+}
+EXPORT_SYMBOL(kcsan_nestable_atomic_end);
+
+void kcsan_flat_atomic_begin(void)
+{
+ get_ctx()->in_flat_atomic = true;
+}
+EXPORT_SYMBOL(kcsan_flat_atomic_begin);
+
+void kcsan_flat_atomic_end(void)
+{
+ get_ctx()->in_flat_atomic = false;
+}
+EXPORT_SYMBOL(kcsan_flat_atomic_end);
+
+void kcsan_atomic_next(int n)
+{
+ get_ctx()->atomic_next = n;
+}
+EXPORT_SYMBOL(kcsan_atomic_next);
+
+void kcsan_set_access_mask(unsigned long mask)
+{
+ get_ctx()->access_mask = mask;
+}
+EXPORT_SYMBOL(kcsan_set_access_mask);
+
+struct kcsan_scoped_access *
+kcsan_begin_scoped_access(const volatile void *ptr, size_t size, int type,
+ struct kcsan_scoped_access *sa)
+{
+ struct kcsan_ctx *ctx = get_ctx();
+
+ check_access(ptr, size, type, _RET_IP_);
+
+ ctx->disable_count++; /* Disable KCSAN, in case list debugging is on. */
+
+ INIT_LIST_HEAD(&sa->list);
+ sa->ptr = ptr;
+ sa->size = size;
+ sa->type = type;
+ sa->ip = _RET_IP_;
+
+ if (!ctx->scoped_accesses.prev) /* Lazy initialize list head. */
+ INIT_LIST_HEAD(&ctx->scoped_accesses);
+ list_add(&sa->list, &ctx->scoped_accesses);
+
+ ctx->disable_count--;
+ return sa;
+}
+EXPORT_SYMBOL(kcsan_begin_scoped_access);
+
+void kcsan_end_scoped_access(struct kcsan_scoped_access *sa)
+{
+ struct kcsan_ctx *ctx = get_ctx();
+
+ if (WARN(!ctx->scoped_accesses.prev, "Unbalanced %s()?", __func__))
+ return;
+
+ ctx->disable_count++; /* Disable KCSAN, in case list debugging is on. */
+
+ list_del(&sa->list);
+ if (list_empty(&ctx->scoped_accesses))
+ /*
+ * Ensure we do not enter kcsan_check_scoped_accesses()
+ * slow-path if unnecessary, and avoids requiring list_empty()
+ * in the fast-path (to avoid a READ_ONCE() and potential
+ * uaccess warning).
+ */
+ ctx->scoped_accesses.prev = NULL;
+
+ ctx->disable_count--;
+
+ check_access(sa->ptr, sa->size, sa->type, sa->ip);
+}
+EXPORT_SYMBOL(kcsan_end_scoped_access);
+
+void __kcsan_check_access(const volatile void *ptr, size_t size, int type)
+{
+ check_access(ptr, size, type, _RET_IP_);
+}
+EXPORT_SYMBOL(__kcsan_check_access);
+
+#define DEFINE_MEMORY_BARRIER(name, order_before_cond) \
+ void __kcsan_##name(void) \
+ { \
+ struct kcsan_scoped_access *sa = get_reorder_access(get_ctx()); \
+ if (!sa) \
+ return; \
+ if (order_before_cond) \
+ sa->size = 0; \
+ } \
+ EXPORT_SYMBOL(__kcsan_##name)
+
+DEFINE_MEMORY_BARRIER(mb, true);
+DEFINE_MEMORY_BARRIER(wmb, sa->type & (KCSAN_ACCESS_WRITE | KCSAN_ACCESS_COMPOUND));
+DEFINE_MEMORY_BARRIER(rmb, !(sa->type & KCSAN_ACCESS_WRITE) || (sa->type & KCSAN_ACCESS_COMPOUND));
+DEFINE_MEMORY_BARRIER(release, true);
+
+/*
+ * KCSAN uses the same instrumentation that is emitted by supported compilers
+ * for ThreadSanitizer (TSAN).
+ *
+ * When enabled, the compiler emits instrumentation calls (the functions
+ * prefixed with "__tsan" below) for all loads and stores that it generated;
+ * inline asm is not instrumented.
+ *
+ * Note that, not all supported compiler versions distinguish aligned/unaligned
+ * accesses, but e.g. recent versions of Clang do. We simply alias the unaligned
+ * version to the generic version, which can handle both.
+ */
+
+#define DEFINE_TSAN_READ_WRITE(size) \
+ void __tsan_read##size(void *ptr); \
+ void __tsan_read##size(void *ptr) \
+ { \
+ check_access(ptr, size, 0, _RET_IP_); \
+ } \
+ EXPORT_SYMBOL(__tsan_read##size); \
+ void __tsan_unaligned_read##size(void *ptr) \
+ __alias(__tsan_read##size); \
+ EXPORT_SYMBOL(__tsan_unaligned_read##size); \
+ void __tsan_write##size(void *ptr); \
+ void __tsan_write##size(void *ptr) \
+ { \
+ check_access(ptr, size, KCSAN_ACCESS_WRITE, _RET_IP_); \
+ } \
+ EXPORT_SYMBOL(__tsan_write##size); \
+ void __tsan_unaligned_write##size(void *ptr) \
+ __alias(__tsan_write##size); \
+ EXPORT_SYMBOL(__tsan_unaligned_write##size); \
+ void __tsan_read_write##size(void *ptr); \
+ void __tsan_read_write##size(void *ptr) \
+ { \
+ check_access(ptr, size, \
+ KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE, \
+ _RET_IP_); \
+ } \
+ EXPORT_SYMBOL(__tsan_read_write##size); \
+ void __tsan_unaligned_read_write##size(void *ptr) \
+ __alias(__tsan_read_write##size); \
+ EXPORT_SYMBOL(__tsan_unaligned_read_write##size)
+
+DEFINE_TSAN_READ_WRITE(1);
+DEFINE_TSAN_READ_WRITE(2);
+DEFINE_TSAN_READ_WRITE(4);
+DEFINE_TSAN_READ_WRITE(8);
+DEFINE_TSAN_READ_WRITE(16);
+
+void __tsan_read_range(void *ptr, size_t size);
+void __tsan_read_range(void *ptr, size_t size)
+{
+ check_access(ptr, size, 0, _RET_IP_);
+}
+EXPORT_SYMBOL(__tsan_read_range);
+
+void __tsan_write_range(void *ptr, size_t size);
+void __tsan_write_range(void *ptr, size_t size)
+{
+ check_access(ptr, size, KCSAN_ACCESS_WRITE, _RET_IP_);
+}
+EXPORT_SYMBOL(__tsan_write_range);
+
+/*
+ * Use of explicit volatile is generally disallowed [1], however, volatile is
+ * still used in various concurrent context, whether in low-level
+ * synchronization primitives or for legacy reasons.
+ * [1] https://lwn.net/Articles/233479/
+ *
+ * We only consider volatile accesses atomic if they are aligned and would pass
+ * the size-check of compiletime_assert_rwonce_type().
+ */
+#define DEFINE_TSAN_VOLATILE_READ_WRITE(size) \
+ void __tsan_volatile_read##size(void *ptr); \
+ void __tsan_volatile_read##size(void *ptr) \
+ { \
+ const bool is_atomic = size <= sizeof(long long) && \
+ IS_ALIGNED((unsigned long)ptr, size); \
+ if (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS) && is_atomic) \
+ return; \
+ check_access(ptr, size, is_atomic ? KCSAN_ACCESS_ATOMIC : 0, \
+ _RET_IP_); \
+ } \
+ EXPORT_SYMBOL(__tsan_volatile_read##size); \
+ void __tsan_unaligned_volatile_read##size(void *ptr) \
+ __alias(__tsan_volatile_read##size); \
+ EXPORT_SYMBOL(__tsan_unaligned_volatile_read##size); \
+ void __tsan_volatile_write##size(void *ptr); \
+ void __tsan_volatile_write##size(void *ptr) \
+ { \
+ const bool is_atomic = size <= sizeof(long long) && \
+ IS_ALIGNED((unsigned long)ptr, size); \
+ if (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS) && is_atomic) \
+ return; \
+ check_access(ptr, size, \
+ KCSAN_ACCESS_WRITE | \
+ (is_atomic ? KCSAN_ACCESS_ATOMIC : 0), \
+ _RET_IP_); \
+ } \
+ EXPORT_SYMBOL(__tsan_volatile_write##size); \
+ void __tsan_unaligned_volatile_write##size(void *ptr) \
+ __alias(__tsan_volatile_write##size); \
+ EXPORT_SYMBOL(__tsan_unaligned_volatile_write##size)
+
+DEFINE_TSAN_VOLATILE_READ_WRITE(1);
+DEFINE_TSAN_VOLATILE_READ_WRITE(2);
+DEFINE_TSAN_VOLATILE_READ_WRITE(4);
+DEFINE_TSAN_VOLATILE_READ_WRITE(8);
+DEFINE_TSAN_VOLATILE_READ_WRITE(16);
+
+/*
+ * Function entry and exit are used to determine the validty of reorder_access.
+ * Reordering of the access ends at the end of the function scope where the
+ * access happened. This is done for two reasons:
+ *
+ * 1. Artificially limits the scope where missing barriers are detected.
+ * This minimizes false positives due to uninstrumented functions that
+ * contain the required barriers but were missed.
+ *
+ * 2. Simplifies generating the stack trace of the access.
+ */
+void __tsan_func_entry(void *call_pc);
+noinline void __tsan_func_entry(void *call_pc)
+{
+ if (!IS_ENABLED(CONFIG_KCSAN_WEAK_MEMORY))
+ return;
+
+ add_kcsan_stack_depth(1);
+}
+EXPORT_SYMBOL(__tsan_func_entry);
+
+void __tsan_func_exit(void);
+noinline void __tsan_func_exit(void)
+{
+ struct kcsan_scoped_access *reorder_access;
+
+ if (!IS_ENABLED(CONFIG_KCSAN_WEAK_MEMORY))
+ return;
+
+ reorder_access = get_reorder_access(get_ctx());
+ if (!reorder_access)
+ goto out;
+
+ if (get_kcsan_stack_depth() <= reorder_access->stack_depth) {
+ /*
+ * Access check to catch cases where write without a barrier
+ * (supposed release) was last access in function: because
+ * instrumentation is inserted before the real access, a data
+ * race due to the write giving up a c-s would only be caught if
+ * we do the conflicting access after.
+ */
+ check_access(reorder_access->ptr, reorder_access->size,
+ reorder_access->type, reorder_access->ip);
+ reorder_access->size = 0;
+ reorder_access->stack_depth = INT_MIN;
+ }
+out:
+ add_kcsan_stack_depth(-1);
+}
+EXPORT_SYMBOL(__tsan_func_exit);
+
+void __tsan_init(void);
+void __tsan_init(void)
+{
+}
+EXPORT_SYMBOL(__tsan_init);
+
+/*
+ * Instrumentation for atomic builtins (__atomic_*, __sync_*).
+ *
+ * Normal kernel code _should not_ be using them directly, but some
+ * architectures may implement some or all atomics using the compilers'
+ * builtins.
+ *
+ * Note: If an architecture decides to fully implement atomics using the
+ * builtins, because they are implicitly instrumented by KCSAN (and KASAN,
+ * etc.), implementing the ARCH_ATOMIC interface (to get instrumentation via
+ * atomic-instrumented) is no longer necessary.
+ *
+ * TSAN instrumentation replaces atomic accesses with calls to any of the below
+ * functions, whose job is to also execute the operation itself.
+ */
+
+static __always_inline void kcsan_atomic_builtin_memorder(int memorder)
+{
+ if (memorder == __ATOMIC_RELEASE ||
+ memorder == __ATOMIC_SEQ_CST ||
+ memorder == __ATOMIC_ACQ_REL)
+ __kcsan_release();
+}
+
+#define DEFINE_TSAN_ATOMIC_LOAD_STORE(bits) \
+ u##bits __tsan_atomic##bits##_load(const u##bits *ptr, int memorder); \
+ u##bits __tsan_atomic##bits##_load(const u##bits *ptr, int memorder) \
+ { \
+ kcsan_atomic_builtin_memorder(memorder); \
+ if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \
+ check_access(ptr, bits / BITS_PER_BYTE, KCSAN_ACCESS_ATOMIC, _RET_IP_); \
+ } \
+ return __atomic_load_n(ptr, memorder); \
+ } \
+ EXPORT_SYMBOL(__tsan_atomic##bits##_load); \
+ void __tsan_atomic##bits##_store(u##bits *ptr, u##bits v, int memorder); \
+ void __tsan_atomic##bits##_store(u##bits *ptr, u##bits v, int memorder) \
+ { \
+ kcsan_atomic_builtin_memorder(memorder); \
+ if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \
+ check_access(ptr, bits / BITS_PER_BYTE, \
+ KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC, _RET_IP_); \
+ } \
+ __atomic_store_n(ptr, v, memorder); \
+ } \
+ EXPORT_SYMBOL(__tsan_atomic##bits##_store)
+
+#define DEFINE_TSAN_ATOMIC_RMW(op, bits, suffix) \
+ u##bits __tsan_atomic##bits##_##op(u##bits *ptr, u##bits v, int memorder); \
+ u##bits __tsan_atomic##bits##_##op(u##bits *ptr, u##bits v, int memorder) \
+ { \
+ kcsan_atomic_builtin_memorder(memorder); \
+ if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \
+ check_access(ptr, bits / BITS_PER_BYTE, \
+ KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | \
+ KCSAN_ACCESS_ATOMIC, _RET_IP_); \
+ } \
+ return __atomic_##op##suffix(ptr, v, memorder); \
+ } \
+ EXPORT_SYMBOL(__tsan_atomic##bits##_##op)
+
+/*
+ * Note: CAS operations are always classified as write, even in case they
+ * fail. We cannot perform check_access() after a write, as it might lead to
+ * false positives, in cases such as:
+ *
+ * T0: __atomic_compare_exchange_n(&p->flag, &old, 1, ...)
+ *
+ * T1: if (__atomic_load_n(&p->flag, ...)) {
+ * modify *p;
+ * p->flag = 0;
+ * }
+ *
+ * The only downside is that, if there are 3 threads, with one CAS that
+ * succeeds, another CAS that fails, and an unmarked racing operation, we may
+ * point at the wrong CAS as the source of the race. However, if we assume that
+ * all CAS can succeed in some other execution, the data race is still valid.
+ */
+#define DEFINE_TSAN_ATOMIC_CMPXCHG(bits, strength, weak) \
+ int __tsan_atomic##bits##_compare_exchange_##strength(u##bits *ptr, u##bits *exp, \
+ u##bits val, int mo, int fail_mo); \
+ int __tsan_atomic##bits##_compare_exchange_##strength(u##bits *ptr, u##bits *exp, \
+ u##bits val, int mo, int fail_mo) \
+ { \
+ kcsan_atomic_builtin_memorder(mo); \
+ if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \
+ check_access(ptr, bits / BITS_PER_BYTE, \
+ KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | \
+ KCSAN_ACCESS_ATOMIC, _RET_IP_); \
+ } \
+ return __atomic_compare_exchange_n(ptr, exp, val, weak, mo, fail_mo); \
+ } \
+ EXPORT_SYMBOL(__tsan_atomic##bits##_compare_exchange_##strength)
+
+#define DEFINE_TSAN_ATOMIC_CMPXCHG_VAL(bits) \
+ u##bits __tsan_atomic##bits##_compare_exchange_val(u##bits *ptr, u##bits exp, u##bits val, \
+ int mo, int fail_mo); \
+ u##bits __tsan_atomic##bits##_compare_exchange_val(u##bits *ptr, u##bits exp, u##bits val, \
+ int mo, int fail_mo) \
+ { \
+ kcsan_atomic_builtin_memorder(mo); \
+ if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \
+ check_access(ptr, bits / BITS_PER_BYTE, \
+ KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | \
+ KCSAN_ACCESS_ATOMIC, _RET_IP_); \
+ } \
+ __atomic_compare_exchange_n(ptr, &exp, val, 0, mo, fail_mo); \
+ return exp; \
+ } \
+ EXPORT_SYMBOL(__tsan_atomic##bits##_compare_exchange_val)
+
+#define DEFINE_TSAN_ATOMIC_OPS(bits) \
+ DEFINE_TSAN_ATOMIC_LOAD_STORE(bits); \
+ DEFINE_TSAN_ATOMIC_RMW(exchange, bits, _n); \
+ DEFINE_TSAN_ATOMIC_RMW(fetch_add, bits, ); \
+ DEFINE_TSAN_ATOMIC_RMW(fetch_sub, bits, ); \
+ DEFINE_TSAN_ATOMIC_RMW(fetch_and, bits, ); \
+ DEFINE_TSAN_ATOMIC_RMW(fetch_or, bits, ); \
+ DEFINE_TSAN_ATOMIC_RMW(fetch_xor, bits, ); \
+ DEFINE_TSAN_ATOMIC_RMW(fetch_nand, bits, ); \
+ DEFINE_TSAN_ATOMIC_CMPXCHG(bits, strong, 0); \
+ DEFINE_TSAN_ATOMIC_CMPXCHG(bits, weak, 1); \
+ DEFINE_TSAN_ATOMIC_CMPXCHG_VAL(bits)
+
+DEFINE_TSAN_ATOMIC_OPS(8);
+DEFINE_TSAN_ATOMIC_OPS(16);
+DEFINE_TSAN_ATOMIC_OPS(32);
+#ifdef CONFIG_64BIT
+DEFINE_TSAN_ATOMIC_OPS(64);
+#endif
+
+void __tsan_atomic_thread_fence(int memorder);
+void __tsan_atomic_thread_fence(int memorder)
+{
+ kcsan_atomic_builtin_memorder(memorder);
+ __atomic_thread_fence(memorder);
+}
+EXPORT_SYMBOL(__tsan_atomic_thread_fence);
+
+/*
+ * In instrumented files, we emit instrumentation for barriers by mapping the
+ * kernel barriers to an __atomic_signal_fence(), which is interpreted specially
+ * and otherwise has no relation to a real __atomic_signal_fence(). No known
+ * kernel code uses __atomic_signal_fence().
+ *
+ * Since fsanitize=thread instrumentation handles __atomic_signal_fence(), which
+ * are turned into calls to __tsan_atomic_signal_fence(), such instrumentation
+ * can be disabled via the __no_kcsan function attribute (vs. an explicit call
+ * which could not). When __no_kcsan is requested, __atomic_signal_fence()
+ * generates no code.
+ *
+ * Note: The result of using __atomic_signal_fence() with KCSAN enabled is
+ * potentially limiting the compiler's ability to reorder operations; however,
+ * if barriers were instrumented with explicit calls (without LTO), the compiler
+ * couldn't optimize much anyway. The result of a hypothetical architecture
+ * using __atomic_signal_fence() in normal code would be KCSAN false negatives.
+ */
+void __tsan_atomic_signal_fence(int memorder);
+noinline void __tsan_atomic_signal_fence(int memorder)
+{
+ switch (memorder) {
+ case __KCSAN_BARRIER_TO_SIGNAL_FENCE_mb:
+ __kcsan_mb();
+ break;
+ case __KCSAN_BARRIER_TO_SIGNAL_FENCE_wmb:
+ __kcsan_wmb();
+ break;
+ case __KCSAN_BARRIER_TO_SIGNAL_FENCE_rmb:
+ __kcsan_rmb();
+ break;
+ case __KCSAN_BARRIER_TO_SIGNAL_FENCE_release:
+ __kcsan_release();
+ break;
+ default:
+ break;
+ }
+}
+EXPORT_SYMBOL(__tsan_atomic_signal_fence);
+
+#ifdef __HAVE_ARCH_MEMSET
+void *__tsan_memset(void *s, int c, size_t count);
+noinline void *__tsan_memset(void *s, int c, size_t count)
+{
+ /*
+ * Instead of not setting up watchpoints where accessed size is greater
+ * than MAX_ENCODABLE_SIZE, truncate checked size to MAX_ENCODABLE_SIZE.
+ */
+ size_t check_len = min_t(size_t, count, MAX_ENCODABLE_SIZE);
+
+ check_access(s, check_len, KCSAN_ACCESS_WRITE, _RET_IP_);
+ return memset(s, c, count);
+}
+#else
+void *__tsan_memset(void *s, int c, size_t count) __alias(memset);
+#endif
+EXPORT_SYMBOL(__tsan_memset);
+
+#ifdef __HAVE_ARCH_MEMMOVE
+void *__tsan_memmove(void *dst, const void *src, size_t len);
+noinline void *__tsan_memmove(void *dst, const void *src, size_t len)
+{
+ size_t check_len = min_t(size_t, len, MAX_ENCODABLE_SIZE);
+
+ check_access(dst, check_len, KCSAN_ACCESS_WRITE, _RET_IP_);
+ check_access(src, check_len, 0, _RET_IP_);
+ return memmove(dst, src, len);
+}
+#else
+void *__tsan_memmove(void *dst, const void *src, size_t len) __alias(memmove);
+#endif
+EXPORT_SYMBOL(__tsan_memmove);
+
+#ifdef __HAVE_ARCH_MEMCPY
+void *__tsan_memcpy(void *dst, const void *src, size_t len);
+noinline void *__tsan_memcpy(void *dst, const void *src, size_t len)
+{
+ size_t check_len = min_t(size_t, len, MAX_ENCODABLE_SIZE);
+
+ check_access(dst, check_len, KCSAN_ACCESS_WRITE, _RET_IP_);
+ check_access(src, check_len, 0, _RET_IP_);
+ return memcpy(dst, src, len);
+}
+#else
+void *__tsan_memcpy(void *dst, const void *src, size_t len) __alias(memcpy);
+#endif
+EXPORT_SYMBOL(__tsan_memcpy);
diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c
new file mode 100644
index 000000000000..2af39ba5b70b
--- /dev/null
+++ b/kernel/kcsan/debugfs.c
@@ -0,0 +1,272 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KCSAN debugfs interface.
+ *
+ * Copyright (C) 2019, Google LLC.
+ */
+
+#define pr_fmt(fmt) "kcsan: " fmt
+
+#include <linux/atomic.h>
+#include <linux/bsearch.h>
+#include <linux/bug.h>
+#include <linux/debugfs.h>
+#include <linux/init.h>
+#include <linux/kallsyms.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/sort.h>
+#include <linux/string.h>
+#include <linux/uaccess.h>
+
+#include "kcsan.h"
+
+atomic_long_t kcsan_counters[KCSAN_COUNTER_COUNT];
+static const char *const counter_names[] = {
+ [KCSAN_COUNTER_USED_WATCHPOINTS] = "used_watchpoints",
+ [KCSAN_COUNTER_SETUP_WATCHPOINTS] = "setup_watchpoints",
+ [KCSAN_COUNTER_DATA_RACES] = "data_races",
+ [KCSAN_COUNTER_ASSERT_FAILURES] = "assert_failures",
+ [KCSAN_COUNTER_NO_CAPACITY] = "no_capacity",
+ [KCSAN_COUNTER_REPORT_RACES] = "report_races",
+ [KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN] = "races_unknown_origin",
+ [KCSAN_COUNTER_UNENCODABLE_ACCESSES] = "unencodable_accesses",
+ [KCSAN_COUNTER_ENCODING_FALSE_POSITIVES] = "encoding_false_positives",
+};
+static_assert(ARRAY_SIZE(counter_names) == KCSAN_COUNTER_COUNT);
+
+/*
+ * Addresses for filtering functions from reporting. This list can be used as a
+ * whitelist or blacklist.
+ */
+static struct {
+ unsigned long *addrs; /* array of addresses */
+ size_t size; /* current size */
+ int used; /* number of elements used */
+ bool sorted; /* if elements are sorted */
+ bool whitelist; /* if list is a blacklist or whitelist */
+} report_filterlist;
+static DEFINE_RAW_SPINLOCK(report_filterlist_lock);
+
+/*
+ * The microbenchmark allows benchmarking KCSAN core runtime only. To run
+ * multiple threads, pipe 'microbench=<iters>' from multiple tasks into the
+ * debugfs file. This will not generate any conflicts, and tests fast-path only.
+ */
+static noinline void microbenchmark(unsigned long iters)
+{
+ const struct kcsan_ctx ctx_save = current->kcsan_ctx;
+ const bool was_enabled = READ_ONCE(kcsan_enabled);
+ u64 cycles;
+
+ /* We may have been called from an atomic region; reset context. */
+ memset(&current->kcsan_ctx, 0, sizeof(current->kcsan_ctx));
+ /*
+ * Disable to benchmark fast-path for all accesses, and (expected
+ * negligible) call into slow-path, but never set up watchpoints.
+ */
+ WRITE_ONCE(kcsan_enabled, false);
+
+ pr_info("%s begin | iters: %lu\n", __func__, iters);
+
+ cycles = get_cycles();
+ while (iters--) {
+ unsigned long addr = iters & ((PAGE_SIZE << 8) - 1);
+ int type = !(iters & 0x7f) ? KCSAN_ACCESS_ATOMIC :
+ (!(iters & 0xf) ? KCSAN_ACCESS_WRITE : 0);
+ __kcsan_check_access((void *)addr, sizeof(long), type);
+ }
+ cycles = get_cycles() - cycles;
+
+ pr_info("%s end | cycles: %llu\n", __func__, cycles);
+
+ WRITE_ONCE(kcsan_enabled, was_enabled);
+ /* restore context */
+ current->kcsan_ctx = ctx_save;
+}
+
+static int cmp_filterlist_addrs(const void *rhs, const void *lhs)
+{
+ const unsigned long a = *(const unsigned long *)rhs;
+ const unsigned long b = *(const unsigned long *)lhs;
+
+ return a < b ? -1 : a == b ? 0 : 1;
+}
+
+bool kcsan_skip_report_debugfs(unsigned long func_addr)
+{
+ unsigned long symbolsize, offset;
+ unsigned long flags;
+ bool ret = false;
+
+ if (!kallsyms_lookup_size_offset(func_addr, &symbolsize, &offset))
+ return false;
+ func_addr -= offset; /* Get function start */
+
+ raw_spin_lock_irqsave(&report_filterlist_lock, flags);
+ if (report_filterlist.used == 0)
+ goto out;
+
+ /* Sort array if it is unsorted, and then do a binary search. */
+ if (!report_filterlist.sorted) {
+ sort(report_filterlist.addrs, report_filterlist.used,
+ sizeof(unsigned long), cmp_filterlist_addrs, NULL);
+ report_filterlist.sorted = true;
+ }
+ ret = !!bsearch(&func_addr, report_filterlist.addrs,
+ report_filterlist.used, sizeof(unsigned long),
+ cmp_filterlist_addrs);
+ if (report_filterlist.whitelist)
+ ret = !ret;
+
+out:
+ raw_spin_unlock_irqrestore(&report_filterlist_lock, flags);
+ return ret;
+}
+
+static void set_report_filterlist_whitelist(bool whitelist)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&report_filterlist_lock, flags);
+ report_filterlist.whitelist = whitelist;
+ raw_spin_unlock_irqrestore(&report_filterlist_lock, flags);
+}
+
+/* Returns 0 on success, error-code otherwise. */
+static ssize_t insert_report_filterlist(const char *func)
+{
+ unsigned long flags;
+ unsigned long addr = kallsyms_lookup_name(func);
+ unsigned long *delay_free = NULL;
+ unsigned long *new_addrs = NULL;
+ size_t new_size = 0;
+ ssize_t ret = 0;
+
+ if (!addr) {
+ pr_err("could not find function: '%s'\n", func);
+ return -ENOENT;
+ }
+
+retry_alloc:
+ /*
+ * Check if we need an allocation, and re-validate under the lock. Since
+ * the report_filterlist_lock is a raw, cannot allocate under the lock.
+ */
+ if (data_race(report_filterlist.used == report_filterlist.size)) {
+ new_size = (report_filterlist.size ?: 4) * 2;
+ delay_free = new_addrs = kmalloc_array(new_size, sizeof(unsigned long), GFP_KERNEL);
+ if (!new_addrs)
+ return -ENOMEM;
+ }
+
+ raw_spin_lock_irqsave(&report_filterlist_lock, flags);
+ if (report_filterlist.used == report_filterlist.size) {
+ /* Check we pre-allocated enough, and retry if not. */
+ if (report_filterlist.used >= new_size) {
+ raw_spin_unlock_irqrestore(&report_filterlist_lock, flags);
+ kfree(new_addrs); /* kfree(NULL) is safe */
+ delay_free = new_addrs = NULL;
+ goto retry_alloc;
+ }
+
+ if (report_filterlist.used)
+ memcpy(new_addrs, report_filterlist.addrs, report_filterlist.used * sizeof(unsigned long));
+ delay_free = report_filterlist.addrs; /* free the old list */
+ report_filterlist.addrs = new_addrs; /* switch to the new list */
+ report_filterlist.size = new_size;
+ }
+
+ /* Note: deduplicating should be done in userspace. */
+ report_filterlist.addrs[report_filterlist.used++] = addr;
+ report_filterlist.sorted = false;
+
+ raw_spin_unlock_irqrestore(&report_filterlist_lock, flags);
+
+ kfree(delay_free);
+ return ret;
+}
+
+static int show_info(struct seq_file *file, void *v)
+{
+ int i;
+ unsigned long flags;
+
+ /* show stats */
+ seq_printf(file, "enabled: %i\n", READ_ONCE(kcsan_enabled));
+ for (i = 0; i < KCSAN_COUNTER_COUNT; ++i) {
+ seq_printf(file, "%s: %ld\n", counter_names[i],
+ atomic_long_read(&kcsan_counters[i]));
+ }
+
+ /* show filter functions, and filter type */
+ raw_spin_lock_irqsave(&report_filterlist_lock, flags);
+ seq_printf(file, "\n%s functions: %s\n",
+ report_filterlist.whitelist ? "whitelisted" : "blacklisted",
+ report_filterlist.used == 0 ? "none" : "");
+ for (i = 0; i < report_filterlist.used; ++i)
+ seq_printf(file, " %ps\n", (void *)report_filterlist.addrs[i]);
+ raw_spin_unlock_irqrestore(&report_filterlist_lock, flags);
+
+ return 0;
+}
+
+static int debugfs_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, show_info, NULL);
+}
+
+static ssize_t
+debugfs_write(struct file *file, const char __user *buf, size_t count, loff_t *off)
+{
+ char kbuf[KSYM_NAME_LEN];
+ char *arg;
+ const size_t read_len = min(count, sizeof(kbuf) - 1);
+
+ if (copy_from_user(kbuf, buf, read_len))
+ return -EFAULT;
+ kbuf[read_len] = '\0';
+ arg = strstrip(kbuf);
+
+ if (!strcmp(arg, "on")) {
+ WRITE_ONCE(kcsan_enabled, true);
+ } else if (!strcmp(arg, "off")) {
+ WRITE_ONCE(kcsan_enabled, false);
+ } else if (str_has_prefix(arg, "microbench=")) {
+ unsigned long iters;
+
+ if (kstrtoul(&arg[strlen("microbench=")], 0, &iters))
+ return -EINVAL;
+ microbenchmark(iters);
+ } else if (!strcmp(arg, "whitelist")) {
+ set_report_filterlist_whitelist(true);
+ } else if (!strcmp(arg, "blacklist")) {
+ set_report_filterlist_whitelist(false);
+ } else if (arg[0] == '!') {
+ ssize_t ret = insert_report_filterlist(&arg[1]);
+
+ if (ret < 0)
+ return ret;
+ } else {
+ return -EINVAL;
+ }
+
+ return count;
+}
+
+static const struct file_operations debugfs_ops =
+{
+ .read = seq_read,
+ .open = debugfs_open,
+ .write = debugfs_write,
+ .release = single_release
+};
+
+static int __init kcsan_debugfs_init(void)
+{
+ debugfs_create_file("kcsan", 0644, NULL, NULL, &debugfs_ops);
+ return 0;
+}
+
+late_initcall(kcsan_debugfs_init);
diff --git a/kernel/kcsan/encoding.h b/kernel/kcsan/encoding.h
new file mode 100644
index 000000000000..170a2bb22f53
--- /dev/null
+++ b/kernel/kcsan/encoding.h
@@ -0,0 +1,102 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * KCSAN watchpoint encoding.
+ *
+ * Copyright (C) 2019, Google LLC.
+ */
+
+#ifndef _KERNEL_KCSAN_ENCODING_H
+#define _KERNEL_KCSAN_ENCODING_H
+
+#include <linux/bits.h>
+#include <linux/log2.h>
+#include <linux/mm.h>
+
+#include "kcsan.h"
+
+#define SLOT_RANGE PAGE_SIZE
+
+#define INVALID_WATCHPOINT 0
+#define CONSUMED_WATCHPOINT 1
+
+/*
+ * The maximum useful size of accesses for which we set up watchpoints is the
+ * max range of slots we check on an access.
+ */
+#define MAX_ENCODABLE_SIZE (SLOT_RANGE * (1 + KCSAN_CHECK_ADJACENT))
+
+/*
+ * Number of bits we use to store size info.
+ */
+#define WATCHPOINT_SIZE_BITS bits_per(MAX_ENCODABLE_SIZE)
+/*
+ * This encoding for addresses discards the upper (1 for is-write + SIZE_BITS);
+ * however, most 64-bit architectures do not use the full 64-bit address space.
+ * Also, in order for a false positive to be observable 2 things need to happen:
+ *
+ * 1. different addresses but with the same encoded address race;
+ * 2. and both map onto the same watchpoint slots;
+ *
+ * Both these are assumed to be very unlikely. However, in case it still
+ * happens, the report logic will filter out the false positive (see report.c).
+ */
+#define WATCHPOINT_ADDR_BITS (BITS_PER_LONG-1 - WATCHPOINT_SIZE_BITS)
+
+/* Bitmasks for the encoded watchpoint access information. */
+#define WATCHPOINT_WRITE_MASK BIT(BITS_PER_LONG-1)
+#define WATCHPOINT_SIZE_MASK GENMASK(BITS_PER_LONG-2, WATCHPOINT_ADDR_BITS)
+#define WATCHPOINT_ADDR_MASK GENMASK(WATCHPOINT_ADDR_BITS-1, 0)
+static_assert(WATCHPOINT_ADDR_MASK == (1UL << WATCHPOINT_ADDR_BITS) - 1);
+static_assert((WATCHPOINT_WRITE_MASK ^ WATCHPOINT_SIZE_MASK ^ WATCHPOINT_ADDR_MASK) == ~0UL);
+
+static inline bool check_encodable(unsigned long addr, size_t size)
+{
+ /*
+ * While we can encode addrs<PAGE_SIZE, avoid crashing with a NULL
+ * pointer deref inside KCSAN.
+ */
+ return addr >= PAGE_SIZE && size <= MAX_ENCODABLE_SIZE;
+}
+
+static inline long
+encode_watchpoint(unsigned long addr, size_t size, bool is_write)
+{
+ return (long)((is_write ? WATCHPOINT_WRITE_MASK : 0) |
+ (size << WATCHPOINT_ADDR_BITS) |
+ (addr & WATCHPOINT_ADDR_MASK));
+}
+
+static __always_inline bool decode_watchpoint(long watchpoint,
+ unsigned long *addr_masked,
+ size_t *size,
+ bool *is_write)
+{
+ if (watchpoint == INVALID_WATCHPOINT ||
+ watchpoint == CONSUMED_WATCHPOINT)
+ return false;
+
+ *addr_masked = (unsigned long)watchpoint & WATCHPOINT_ADDR_MASK;
+ *size = ((unsigned long)watchpoint & WATCHPOINT_SIZE_MASK) >> WATCHPOINT_ADDR_BITS;
+ *is_write = !!((unsigned long)watchpoint & WATCHPOINT_WRITE_MASK);
+
+ return true;
+}
+
+/*
+ * Return watchpoint slot for an address.
+ */
+static __always_inline int watchpoint_slot(unsigned long addr)
+{
+ return (addr / PAGE_SIZE) % CONFIG_KCSAN_NUM_WATCHPOINTS;
+}
+
+static __always_inline bool matching_access(unsigned long addr1, size_t size1,
+ unsigned long addr2, size_t size2)
+{
+ unsigned long end_range1 = addr1 + size1 - 1;
+ unsigned long end_range2 = addr2 + size2 - 1;
+
+ return addr1 <= end_range2 && addr2 <= end_range1;
+}
+
+#endif /* _KERNEL_KCSAN_ENCODING_H */
diff --git a/kernel/kcsan/kcsan.h b/kernel/kcsan/kcsan.h
new file mode 100644
index 000000000000..ae33c2a7f07e
--- /dev/null
+++ b/kernel/kcsan/kcsan.h
@@ -0,0 +1,142 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * The Kernel Concurrency Sanitizer (KCSAN) infrastructure. For more info please
+ * see Documentation/dev-tools/kcsan.rst.
+ *
+ * Copyright (C) 2019, Google LLC.
+ */
+
+#ifndef _KERNEL_KCSAN_KCSAN_H
+#define _KERNEL_KCSAN_KCSAN_H
+
+#include <linux/atomic.h>
+#include <linux/kcsan.h>
+#include <linux/sched.h>
+
+/* The number of adjacent watchpoints to check. */
+#define KCSAN_CHECK_ADJACENT 1
+#define NUM_SLOTS (1 + 2*KCSAN_CHECK_ADJACENT)
+
+extern unsigned int kcsan_udelay_task;
+extern unsigned int kcsan_udelay_interrupt;
+
+/*
+ * Globally enable and disable KCSAN.
+ */
+extern bool kcsan_enabled;
+
+/*
+ * Save/restore IRQ flags state trace dirtied by KCSAN.
+ */
+void kcsan_save_irqtrace(struct task_struct *task);
+void kcsan_restore_irqtrace(struct task_struct *task);
+
+/*
+ * Statistics counters displayed via debugfs; should only be modified in
+ * slow-paths.
+ */
+enum kcsan_counter_id {
+ /*
+ * Number of watchpoints currently in use.
+ */
+ KCSAN_COUNTER_USED_WATCHPOINTS,
+
+ /*
+ * Total number of watchpoints set up.
+ */
+ KCSAN_COUNTER_SETUP_WATCHPOINTS,
+
+ /*
+ * Total number of data races.
+ */
+ KCSAN_COUNTER_DATA_RACES,
+
+ /*
+ * Total number of ASSERT failures due to races. If the observed race is
+ * due to two conflicting ASSERT type accesses, then both will be
+ * counted.
+ */
+ KCSAN_COUNTER_ASSERT_FAILURES,
+
+ /*
+ * Number of times no watchpoints were available.
+ */
+ KCSAN_COUNTER_NO_CAPACITY,
+
+ /*
+ * A thread checking a watchpoint raced with another checking thread;
+ * only one will be reported.
+ */
+ KCSAN_COUNTER_REPORT_RACES,
+
+ /*
+ * Observed data value change, but writer thread unknown.
+ */
+ KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN,
+
+ /*
+ * The access cannot be encoded to a valid watchpoint.
+ */
+ KCSAN_COUNTER_UNENCODABLE_ACCESSES,
+
+ /*
+ * Watchpoint encoding caused a watchpoint to fire on mismatching
+ * accesses.
+ */
+ KCSAN_COUNTER_ENCODING_FALSE_POSITIVES,
+
+ KCSAN_COUNTER_COUNT, /* number of counters */
+};
+extern atomic_long_t kcsan_counters[KCSAN_COUNTER_COUNT];
+
+/*
+ * Returns true if data races in the function symbol that maps to func_addr
+ * (offsets are ignored) should *not* be reported.
+ */
+extern bool kcsan_skip_report_debugfs(unsigned long func_addr);
+
+/*
+ * Value-change states.
+ */
+enum kcsan_value_change {
+ /*
+ * Did not observe a value-change, however, it is valid to report the
+ * race, depending on preferences.
+ */
+ KCSAN_VALUE_CHANGE_MAYBE,
+
+ /*
+ * Did not observe a value-change, and it is invalid to report the race.
+ */
+ KCSAN_VALUE_CHANGE_FALSE,
+
+ /*
+ * The value was observed to change, and the race should be reported.
+ */
+ KCSAN_VALUE_CHANGE_TRUE,
+};
+
+/*
+ * The calling thread hit and consumed a watchpoint: set the access information
+ * to be consumed by the reporting thread. No report is printed yet.
+ */
+void kcsan_report_set_info(const volatile void *ptr, size_t size, int access_type,
+ unsigned long ip, int watchpoint_idx);
+
+/*
+ * The calling thread observed that the watchpoint it set up was hit and
+ * consumed: print the full report based on information set by the racing
+ * thread.
+ */
+void kcsan_report_known_origin(const volatile void *ptr, size_t size, int access_type,
+ unsigned long ip, enum kcsan_value_change value_change,
+ int watchpoint_idx, u64 old, u64 new, u64 mask);
+
+/*
+ * No other thread was observed to race with the access, but the data value
+ * before and after the stall differs. Reports a race of "unknown origin".
+ */
+void kcsan_report_unknown_origin(const volatile void *ptr, size_t size, int access_type,
+ unsigned long ip, u64 old, u64 new, u64 mask);
+
+#endif /* _KERNEL_KCSAN_KCSAN_H */
diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c
new file mode 100644
index 000000000000..219d22857c98
--- /dev/null
+++ b/kernel/kcsan/kcsan_test.c
@@ -0,0 +1,1625 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KCSAN test with various race scenarious to test runtime behaviour. Since the
+ * interface with which KCSAN's reports are obtained is via the console, this is
+ * the output we should verify. For each test case checks the presence (or
+ * absence) of generated reports. Relies on 'console' tracepoint to capture
+ * reports as they appear in the kernel log.
+ *
+ * Makes use of KUnit for test organization, and the Torture framework for test
+ * thread control.
+ *
+ * Copyright (C) 2020, Google LLC.
+ * Author: Marco Elver <elver@google.com>
+ */
+
+#define pr_fmt(fmt) "kcsan_test: " fmt
+
+#include <kunit/test.h>
+#include <linux/atomic.h>
+#include <linux/bitops.h>
+#include <linux/jiffies.h>
+#include <linux/kcsan-checks.h>
+#include <linux/kernel.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/seqlock.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/timer.h>
+#include <linux/torture.h>
+#include <linux/tracepoint.h>
+#include <linux/types.h>
+#include <trace/events/printk.h>
+
+#define KCSAN_TEST_REQUIRES(test, cond) do { \
+ if (!(cond)) \
+ kunit_skip((test), "Test requires: " #cond); \
+} while (0)
+
+#ifdef CONFIG_CC_HAS_TSAN_COMPOUND_READ_BEFORE_WRITE
+#define __KCSAN_ACCESS_RW(alt) (KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE)
+#else
+#define __KCSAN_ACCESS_RW(alt) (alt)
+#endif
+
+/* Points to current test-case memory access "kernels". */
+static void (*access_kernels[2])(void);
+
+static struct task_struct **threads; /* Lists of threads. */
+static unsigned long end_time; /* End time of test. */
+
+/* Report as observed from console. */
+static struct {
+ spinlock_t lock;
+ int nlines;
+ char lines[3][512];
+} observed = {
+ .lock = __SPIN_LOCK_UNLOCKED(observed.lock),
+};
+
+/* Setup test checking loop. */
+static __no_kcsan inline void
+begin_test_checks(void (*func1)(void), void (*func2)(void))
+{
+ kcsan_disable_current();
+
+ /*
+ * Require at least as long as KCSAN_REPORT_ONCE_IN_MS, to ensure at
+ * least one race is reported.
+ */
+ end_time = jiffies + msecs_to_jiffies(CONFIG_KCSAN_REPORT_ONCE_IN_MS + 500);
+
+ /* Signal start; release potential initialization of shared data. */
+ smp_store_release(&access_kernels[0], func1);
+ smp_store_release(&access_kernels[1], func2);
+}
+
+/* End test checking loop. */
+static __no_kcsan inline bool
+end_test_checks(bool stop)
+{
+ if (!stop && time_before(jiffies, end_time)) {
+ /* Continue checking */
+ might_sleep();
+ return false;
+ }
+
+ kcsan_enable_current();
+ return true;
+}
+
+/*
+ * Probe for console output: checks if a race was reported, and obtains observed
+ * lines of interest.
+ */
+__no_kcsan
+static void probe_console(void *ignore, const char *buf, size_t len)
+{
+ unsigned long flags;
+ int nlines;
+
+ /*
+ * Note that KCSAN reports under a global lock, so we do not risk the
+ * possibility of having multiple reports interleaved. If that were the
+ * case, we'd expect tests to fail.
+ */
+
+ spin_lock_irqsave(&observed.lock, flags);
+ nlines = observed.nlines;
+
+ if (strnstr(buf, "BUG: KCSAN: ", len) && strnstr(buf, "test_", len)) {
+ /*
+ * KCSAN report and related to the test.
+ *
+ * The provided @buf is not NUL-terminated; copy no more than
+ * @len bytes and let strscpy() add the missing NUL-terminator.
+ */
+ strscpy(observed.lines[0], buf, min(len + 1, sizeof(observed.lines[0])));
+ nlines = 1;
+ } else if ((nlines == 1 || nlines == 2) && strnstr(buf, "bytes by", len)) {
+ strscpy(observed.lines[nlines++], buf, min(len + 1, sizeof(observed.lines[0])));
+
+ if (strnstr(buf, "race at unknown origin", len)) {
+ if (WARN_ON(nlines != 2))
+ goto out;
+
+ /* No second line of interest. */
+ strscpy(observed.lines[nlines++], "<none>");
+ }
+ }
+
+out:
+ WRITE_ONCE(observed.nlines, nlines); /* Publish new nlines. */
+ spin_unlock_irqrestore(&observed.lock, flags);
+}
+
+/* Check if a report related to the test exists. */
+__no_kcsan
+static bool report_available(void)
+{
+ return READ_ONCE(observed.nlines) == ARRAY_SIZE(observed.lines);
+}
+
+/* Report information we expect in a report. */
+struct expect_report {
+ /* Access information of both accesses. */
+ struct {
+ void *fn; /* Function pointer to expected function of top frame. */
+ void *addr; /* Address of access; unchecked if NULL. */
+ size_t size; /* Size of access; unchecked if @addr is NULL. */
+ int type; /* Access type, see KCSAN_ACCESS definitions. */
+ } access[2];
+};
+
+/* Check observed report matches information in @r. */
+__no_kcsan
+static bool __report_matches(const struct expect_report *r)
+{
+ const bool is_assert = (r->access[0].type | r->access[1].type) & KCSAN_ACCESS_ASSERT;
+ bool ret = false;
+ unsigned long flags;
+ typeof(*observed.lines) *expect;
+ const char *end;
+ char *cur;
+ int i;
+
+ /* Doubled-checked locking. */
+ if (!report_available())
+ return false;
+
+ expect = kmalloc(sizeof(observed.lines), GFP_KERNEL);
+ if (WARN_ON(!expect))
+ return false;
+
+ /* Generate expected report contents. */
+
+ /* Title */
+ cur = expect[0];
+ end = &expect[0][sizeof(expect[0]) - 1];
+ cur += scnprintf(cur, end - cur, "BUG: KCSAN: %s in ",
+ is_assert ? "assert: race" : "data-race");
+ if (r->access[1].fn) {
+ char tmp[2][64];
+ int cmp;
+
+ /* Expect lexographically sorted function names in title. */
+ scnprintf(tmp[0], sizeof(tmp[0]), "%pS", r->access[0].fn);
+ scnprintf(tmp[1], sizeof(tmp[1]), "%pS", r->access[1].fn);
+ cmp = strcmp(tmp[0], tmp[1]);
+ cur += scnprintf(cur, end - cur, "%ps / %ps",
+ cmp < 0 ? r->access[0].fn : r->access[1].fn,
+ cmp < 0 ? r->access[1].fn : r->access[0].fn);
+ } else {
+ scnprintf(cur, end - cur, "%pS", r->access[0].fn);
+ /* The exact offset won't match, remove it. */
+ cur = strchr(expect[0], '+');
+ if (cur)
+ *cur = '\0';
+ }
+
+ /* Access 1 */
+ cur = expect[1];
+ end = &expect[1][sizeof(expect[1]) - 1];
+ if (!r->access[1].fn)
+ cur += scnprintf(cur, end - cur, "race at unknown origin, with ");
+
+ /* Access 1 & 2 */
+ for (i = 0; i < 2; ++i) {
+ const int ty = r->access[i].type;
+ const char *const access_type =
+ (ty & KCSAN_ACCESS_ASSERT) ?
+ ((ty & KCSAN_ACCESS_WRITE) ?
+ "assert no accesses" :
+ "assert no writes") :
+ ((ty & KCSAN_ACCESS_WRITE) ?
+ ((ty & KCSAN_ACCESS_COMPOUND) ?
+ "read-write" :
+ "write") :
+ "read");
+ const bool is_atomic = (ty & KCSAN_ACCESS_ATOMIC);
+ const bool is_scoped = (ty & KCSAN_ACCESS_SCOPED);
+ const char *const access_type_aux =
+ (is_atomic && is_scoped) ? " (marked, reordered)"
+ : (is_atomic ? " (marked)"
+ : (is_scoped ? " (reordered)" : ""));
+
+ if (i == 1) {
+ /* Access 2 */
+ cur = expect[2];
+ end = &expect[2][sizeof(expect[2]) - 1];
+
+ if (!r->access[1].fn) {
+ /* Dummy string if no second access is available. */
+ strscpy(expect[2], "<none>");
+ break;
+ }
+ }
+
+ cur += scnprintf(cur, end - cur, "%s%s to ", access_type,
+ access_type_aux);
+
+ if (r->access[i].addr) /* Address is optional. */
+ cur += scnprintf(cur, end - cur, "0x%px of %zu bytes",
+ r->access[i].addr, r->access[i].size);
+ }
+
+ spin_lock_irqsave(&observed.lock, flags);
+ if (!report_available())
+ goto out; /* A new report is being captured. */
+
+ /* Finally match expected output to what we actually observed. */
+ ret = strstr(observed.lines[0], expect[0]) &&
+ /* Access info may appear in any order. */
+ ((strstr(observed.lines[1], expect[1]) &&
+ strstr(observed.lines[2], expect[2])) ||
+ (strstr(observed.lines[1], expect[2]) &&
+ strstr(observed.lines[2], expect[1])));
+out:
+ spin_unlock_irqrestore(&observed.lock, flags);
+ kfree(expect);
+ return ret;
+}
+
+static __always_inline const struct expect_report *
+__report_set_scoped(struct expect_report *r, int accesses)
+{
+ BUILD_BUG_ON(accesses > 3);
+
+ if (accesses & 1)
+ r->access[0].type |= KCSAN_ACCESS_SCOPED;
+ else
+ r->access[0].type &= ~KCSAN_ACCESS_SCOPED;
+
+ if (accesses & 2)
+ r->access[1].type |= KCSAN_ACCESS_SCOPED;
+ else
+ r->access[1].type &= ~KCSAN_ACCESS_SCOPED;
+
+ return r;
+}
+
+__no_kcsan
+static bool report_matches_any_reordered(struct expect_report *r)
+{
+ return __report_matches(__report_set_scoped(r, 0)) ||
+ __report_matches(__report_set_scoped(r, 1)) ||
+ __report_matches(__report_set_scoped(r, 2)) ||
+ __report_matches(__report_set_scoped(r, 3));
+}
+
+#ifdef CONFIG_KCSAN_WEAK_MEMORY
+/* Due to reordering accesses, any access may appear as "(reordered)". */
+#define report_matches report_matches_any_reordered
+#else
+#define report_matches __report_matches
+#endif
+
+/* ===== Test kernels ===== */
+
+static long test_sink;
+static long test_var;
+/* @test_array should be large enough to fall into multiple watchpoint slots. */
+static long test_array[3 * PAGE_SIZE / sizeof(long)];
+static struct {
+ long val[8];
+} test_struct;
+static long __data_racy test_data_racy;
+static DEFINE_SEQLOCK(test_seqlock);
+static DEFINE_SPINLOCK(test_spinlock);
+static DEFINE_MUTEX(test_mutex);
+
+/*
+ * Helper to avoid compiler optimizing out reads, and to generate source values
+ * for writes.
+ */
+__no_kcsan
+static noinline void sink_value(long v) { WRITE_ONCE(test_sink, v); }
+
+/*
+ * Generates a delay and some accesses that enter the runtime but do not produce
+ * data races.
+ */
+static noinline void test_delay(int iter)
+{
+ while (iter--)
+ sink_value(READ_ONCE(test_sink));
+}
+
+static noinline void test_kernel_read(void) { sink_value(test_var); }
+
+static noinline void test_kernel_write(void)
+{
+ test_var = READ_ONCE_NOCHECK(test_sink) + 1;
+}
+
+static noinline void test_kernel_write_nochange(void) { test_var = 42; }
+
+/* Suffixed by value-change exception filter. */
+static noinline void test_kernel_write_nochange_rcu(void) { test_var = 42; }
+
+static noinline void test_kernel_read_atomic(void)
+{
+ sink_value(READ_ONCE(test_var));
+}
+
+static noinline void test_kernel_write_atomic(void)
+{
+ WRITE_ONCE(test_var, READ_ONCE_NOCHECK(test_sink) + 1);
+}
+
+static noinline void test_kernel_atomic_rmw(void)
+{
+ /* Use builtin, so we can set up the "bad" atomic/non-atomic scenario. */
+ __atomic_fetch_add(&test_var, 1, __ATOMIC_RELAXED);
+}
+
+__no_kcsan
+static noinline void test_kernel_write_uninstrumented(void) { test_var++; }
+
+static noinline void test_kernel_data_race(void) { data_race(test_var++); }
+
+static noinline void test_kernel_data_racy_qualifier(void) { test_data_racy++; }
+
+static noinline void test_kernel_assert_writer(void)
+{
+ ASSERT_EXCLUSIVE_WRITER(test_var);
+}
+
+static noinline void test_kernel_assert_access(void)
+{
+ ASSERT_EXCLUSIVE_ACCESS(test_var);
+}
+
+#define TEST_CHANGE_BITS 0xff00ff00
+
+static noinline void test_kernel_change_bits(void)
+{
+ if (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) {
+ /*
+ * Avoid race of unknown origin for this test, just pretend they
+ * are atomic.
+ */
+ kcsan_nestable_atomic_begin();
+ test_var ^= TEST_CHANGE_BITS;
+ kcsan_nestable_atomic_end();
+ } else
+ WRITE_ONCE(test_var, READ_ONCE(test_var) ^ TEST_CHANGE_BITS);
+}
+
+static noinline void test_kernel_assert_bits_change(void)
+{
+ ASSERT_EXCLUSIVE_BITS(test_var, TEST_CHANGE_BITS);
+}
+
+static noinline void test_kernel_assert_bits_nochange(void)
+{
+ ASSERT_EXCLUSIVE_BITS(test_var, ~TEST_CHANGE_BITS);
+}
+
+/*
+ * Scoped assertions do trigger anywhere in scope. However, the report should
+ * still only point at the start of the scope.
+ */
+static noinline void test_enter_scope(void)
+{
+ int x = 0;
+
+ /* Unrelated accesses to scoped assert. */
+ READ_ONCE(test_sink);
+ kcsan_check_read(&x, sizeof(x));
+}
+
+static noinline void test_kernel_assert_writer_scoped(void)
+{
+ ASSERT_EXCLUSIVE_WRITER_SCOPED(test_var);
+ test_enter_scope();
+}
+
+static noinline void test_kernel_assert_access_scoped(void)
+{
+ ASSERT_EXCLUSIVE_ACCESS_SCOPED(test_var);
+ test_enter_scope();
+}
+
+static noinline void test_kernel_rmw_array(void)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(test_array); ++i)
+ test_array[i]++;
+}
+
+static noinline void test_kernel_write_struct(void)
+{
+ kcsan_check_write(&test_struct, sizeof(test_struct));
+ kcsan_disable_current();
+ test_struct.val[3]++; /* induce value change */
+ kcsan_enable_current();
+}
+
+static noinline void test_kernel_write_struct_part(void)
+{
+ test_struct.val[3] = 42;
+}
+
+static noinline void test_kernel_read_struct_zero_size(void)
+{
+ kcsan_check_read(&test_struct.val[3], 0);
+}
+
+static noinline void test_kernel_jiffies_reader(void)
+{
+ sink_value((long)jiffies);
+}
+
+static noinline void test_kernel_seqlock_reader(void)
+{
+ unsigned int seq;
+
+ do {
+ seq = read_seqbegin(&test_seqlock);
+ sink_value(test_var);
+ } while (read_seqretry(&test_seqlock, seq));
+}
+
+static noinline void test_kernel_seqlock_writer(void)
+{
+ unsigned long flags;
+
+ write_seqlock_irqsave(&test_seqlock, flags);
+ test_var++;
+ write_sequnlock_irqrestore(&test_seqlock, flags);
+}
+
+static noinline void test_kernel_atomic_builtins(void)
+{
+ /*
+ * Generate concurrent accesses, expecting no reports, ensuring KCSAN
+ * treats builtin atomics as actually atomic.
+ */
+ __atomic_load_n(&test_var, __ATOMIC_RELAXED);
+}
+
+static noinline void test_kernel_xor_1bit(void)
+{
+ /* Do not report data races between the read-writes. */
+ kcsan_nestable_atomic_begin();
+ test_var ^= 0x10000;
+ kcsan_nestable_atomic_end();
+}
+
+#define TEST_KERNEL_LOCKED(name, acquire, release) \
+ static noinline void test_kernel_##name(void) \
+ { \
+ long *flag = &test_struct.val[0]; \
+ long v = 0; \
+ if (!(acquire)) \
+ return; \
+ while (v++ < 100) { \
+ test_var++; \
+ barrier(); \
+ } \
+ release; \
+ test_delay(10); \
+ }
+
+TEST_KERNEL_LOCKED(with_memorder,
+ cmpxchg_acquire(flag, 0, 1) == 0,
+ smp_store_release(flag, 0));
+TEST_KERNEL_LOCKED(wrong_memorder,
+ cmpxchg_relaxed(flag, 0, 1) == 0,
+ WRITE_ONCE(*flag, 0));
+TEST_KERNEL_LOCKED(atomic_builtin_with_memorder,
+ __atomic_compare_exchange_n(flag, &v, 1, 0, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED),
+ __atomic_store_n(flag, 0, __ATOMIC_RELEASE));
+TEST_KERNEL_LOCKED(atomic_builtin_wrong_memorder,
+ __atomic_compare_exchange_n(flag, &v, 1, 0, __ATOMIC_RELAXED, __ATOMIC_RELAXED),
+ __atomic_store_n(flag, 0, __ATOMIC_RELAXED));
+
+/* ===== Test cases ===== */
+
+/*
+ * Tests that various barriers have the expected effect on internal state. Not
+ * exhaustive on atomic_t operations. Unlike the selftest, also checks for
+ * too-strict barrier instrumentation; these can be tolerated, because it does
+ * not cause false positives, but at least we should be aware of such cases.
+ */
+static void test_barrier_nothreads(struct kunit *test)
+{
+#ifdef CONFIG_KCSAN_WEAK_MEMORY
+ struct kcsan_scoped_access *reorder_access = &current->kcsan_ctx.reorder_access;
+#else
+ struct kcsan_scoped_access *reorder_access = NULL;
+#endif
+ arch_spinlock_t arch_spinlock = __ARCH_SPIN_LOCK_UNLOCKED;
+ atomic_t dummy = ATOMIC_INIT(0);
+
+ KCSAN_TEST_REQUIRES(test, reorder_access != NULL);
+ KCSAN_TEST_REQUIRES(test, IS_ENABLED(CONFIG_SMP));
+
+#define __KCSAN_EXPECT_BARRIER(access_type, barrier, order_before, name) \
+ do { \
+ reorder_access->type = (access_type) | KCSAN_ACCESS_SCOPED; \
+ reorder_access->size = sizeof(test_var); \
+ barrier; \
+ KUNIT_EXPECT_EQ_MSG(test, reorder_access->size, \
+ order_before ? 0 : sizeof(test_var), \
+ "improperly instrumented type=(" #access_type "): " name); \
+ } while (0)
+#define KCSAN_EXPECT_READ_BARRIER(b, o) __KCSAN_EXPECT_BARRIER(0, b, o, #b)
+#define KCSAN_EXPECT_WRITE_BARRIER(b, o) __KCSAN_EXPECT_BARRIER(KCSAN_ACCESS_WRITE, b, o, #b)
+#define KCSAN_EXPECT_RW_BARRIER(b, o) __KCSAN_EXPECT_BARRIER(KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE, b, o, #b)
+
+ /*
+ * Lockdep initialization can strengthen certain locking operations due
+ * to calling into instrumented files; "warm up" our locks.
+ */
+ spin_lock(&test_spinlock);
+ spin_unlock(&test_spinlock);
+ mutex_lock(&test_mutex);
+ mutex_unlock(&test_mutex);
+
+ /* Force creating a valid entry in reorder_access first. */
+ test_var = 0;
+ while (test_var++ < 1000000 && reorder_access->size != sizeof(test_var))
+ __kcsan_check_read(&test_var, sizeof(test_var));
+ KUNIT_ASSERT_EQ(test, reorder_access->size, sizeof(test_var));
+
+ kcsan_nestable_atomic_begin(); /* No watchpoints in called functions. */
+
+ KCSAN_EXPECT_READ_BARRIER(mb(), true);
+ KCSAN_EXPECT_READ_BARRIER(wmb(), false);
+ KCSAN_EXPECT_READ_BARRIER(rmb(), true);
+ KCSAN_EXPECT_READ_BARRIER(smp_mb(), true);
+ KCSAN_EXPECT_READ_BARRIER(smp_wmb(), false);
+ KCSAN_EXPECT_READ_BARRIER(smp_rmb(), true);
+ KCSAN_EXPECT_READ_BARRIER(dma_wmb(), false);
+ KCSAN_EXPECT_READ_BARRIER(dma_rmb(), true);
+ KCSAN_EXPECT_READ_BARRIER(smp_mb__before_atomic(), true);
+ KCSAN_EXPECT_READ_BARRIER(smp_mb__after_atomic(), true);
+ KCSAN_EXPECT_READ_BARRIER(smp_mb__after_spinlock(), true);
+ KCSAN_EXPECT_READ_BARRIER(smp_store_mb(test_var, 0), true);
+ KCSAN_EXPECT_READ_BARRIER(smp_load_acquire(&test_var), false);
+ KCSAN_EXPECT_READ_BARRIER(smp_store_release(&test_var, 0), true);
+ KCSAN_EXPECT_READ_BARRIER(xchg(&test_var, 0), true);
+ KCSAN_EXPECT_READ_BARRIER(xchg_release(&test_var, 0), true);
+ KCSAN_EXPECT_READ_BARRIER(xchg_relaxed(&test_var, 0), false);
+ KCSAN_EXPECT_READ_BARRIER(cmpxchg(&test_var, 0, 0), true);
+ KCSAN_EXPECT_READ_BARRIER(cmpxchg_release(&test_var, 0, 0), true);
+ KCSAN_EXPECT_READ_BARRIER(cmpxchg_relaxed(&test_var, 0, 0), false);
+ KCSAN_EXPECT_READ_BARRIER(atomic_read(&dummy), false);
+ KCSAN_EXPECT_READ_BARRIER(atomic_read_acquire(&dummy), false);
+ KCSAN_EXPECT_READ_BARRIER(atomic_set(&dummy, 0), false);
+ KCSAN_EXPECT_READ_BARRIER(atomic_set_release(&dummy, 0), true);
+ KCSAN_EXPECT_READ_BARRIER(atomic_add(1, &dummy), false);
+ KCSAN_EXPECT_READ_BARRIER(atomic_add_return(1, &dummy), true);
+ KCSAN_EXPECT_READ_BARRIER(atomic_add_return_acquire(1, &dummy), false);
+ KCSAN_EXPECT_READ_BARRIER(atomic_add_return_release(1, &dummy), true);
+ KCSAN_EXPECT_READ_BARRIER(atomic_add_return_relaxed(1, &dummy), false);
+ KCSAN_EXPECT_READ_BARRIER(atomic_fetch_add(1, &dummy), true);
+ KCSAN_EXPECT_READ_BARRIER(atomic_fetch_add_acquire(1, &dummy), false);
+ KCSAN_EXPECT_READ_BARRIER(atomic_fetch_add_release(1, &dummy), true);
+ KCSAN_EXPECT_READ_BARRIER(atomic_fetch_add_relaxed(1, &dummy), false);
+ KCSAN_EXPECT_READ_BARRIER(test_and_set_bit(0, &test_var), true);
+ KCSAN_EXPECT_READ_BARRIER(test_and_clear_bit(0, &test_var), true);
+ KCSAN_EXPECT_READ_BARRIER(test_and_change_bit(0, &test_var), true);
+ KCSAN_EXPECT_READ_BARRIER(clear_bit_unlock(0, &test_var), true);
+ KCSAN_EXPECT_READ_BARRIER(__clear_bit_unlock(0, &test_var), true);
+ KCSAN_EXPECT_READ_BARRIER(arch_spin_lock(&arch_spinlock), false);
+ KCSAN_EXPECT_READ_BARRIER(arch_spin_unlock(&arch_spinlock), true);
+ KCSAN_EXPECT_READ_BARRIER(spin_lock(&test_spinlock), false);
+ KCSAN_EXPECT_READ_BARRIER(spin_unlock(&test_spinlock), true);
+ KCSAN_EXPECT_READ_BARRIER(mutex_lock(&test_mutex), false);
+ KCSAN_EXPECT_READ_BARRIER(mutex_unlock(&test_mutex), true);
+
+ KCSAN_EXPECT_WRITE_BARRIER(mb(), true);
+ KCSAN_EXPECT_WRITE_BARRIER(wmb(), true);
+ KCSAN_EXPECT_WRITE_BARRIER(rmb(), false);
+ KCSAN_EXPECT_WRITE_BARRIER(smp_mb(), true);
+ KCSAN_EXPECT_WRITE_BARRIER(smp_wmb(), true);
+ KCSAN_EXPECT_WRITE_BARRIER(smp_rmb(), false);
+ KCSAN_EXPECT_WRITE_BARRIER(dma_wmb(), true);
+ KCSAN_EXPECT_WRITE_BARRIER(dma_rmb(), false);
+ KCSAN_EXPECT_WRITE_BARRIER(smp_mb__before_atomic(), true);
+ KCSAN_EXPECT_WRITE_BARRIER(smp_mb__after_atomic(), true);
+ KCSAN_EXPECT_WRITE_BARRIER(smp_mb__after_spinlock(), true);
+ KCSAN_EXPECT_WRITE_BARRIER(smp_store_mb(test_var, 0), true);
+ KCSAN_EXPECT_WRITE_BARRIER(smp_load_acquire(&test_var), false);
+ KCSAN_EXPECT_WRITE_BARRIER(smp_store_release(&test_var, 0), true);
+ KCSAN_EXPECT_WRITE_BARRIER(xchg(&test_var, 0), true);
+ KCSAN_EXPECT_WRITE_BARRIER(xchg_release(&test_var, 0), true);
+ KCSAN_EXPECT_WRITE_BARRIER(xchg_relaxed(&test_var, 0), false);
+ KCSAN_EXPECT_WRITE_BARRIER(cmpxchg(&test_var, 0, 0), true);
+ KCSAN_EXPECT_WRITE_BARRIER(cmpxchg_release(&test_var, 0, 0), true);
+ KCSAN_EXPECT_WRITE_BARRIER(cmpxchg_relaxed(&test_var, 0, 0), false);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_read(&dummy), false);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_read_acquire(&dummy), false);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_set(&dummy, 0), false);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_set_release(&dummy, 0), true);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_add(1, &dummy), false);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_add_return(1, &dummy), true);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_add_return_acquire(1, &dummy), false);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_add_return_release(1, &dummy), true);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_add_return_relaxed(1, &dummy), false);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_fetch_add(1, &dummy), true);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_fetch_add_acquire(1, &dummy), false);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_fetch_add_release(1, &dummy), true);
+ KCSAN_EXPECT_WRITE_BARRIER(atomic_fetch_add_relaxed(1, &dummy), false);
+ KCSAN_EXPECT_WRITE_BARRIER(test_and_set_bit(0, &test_var), true);
+ KCSAN_EXPECT_WRITE_BARRIER(test_and_clear_bit(0, &test_var), true);
+ KCSAN_EXPECT_WRITE_BARRIER(test_and_change_bit(0, &test_var), true);
+ KCSAN_EXPECT_WRITE_BARRIER(clear_bit_unlock(0, &test_var), true);
+ KCSAN_EXPECT_WRITE_BARRIER(__clear_bit_unlock(0, &test_var), true);
+ KCSAN_EXPECT_WRITE_BARRIER(arch_spin_lock(&arch_spinlock), false);
+ KCSAN_EXPECT_WRITE_BARRIER(arch_spin_unlock(&arch_spinlock), true);
+ KCSAN_EXPECT_WRITE_BARRIER(spin_lock(&test_spinlock), false);
+ KCSAN_EXPECT_WRITE_BARRIER(spin_unlock(&test_spinlock), true);
+ KCSAN_EXPECT_WRITE_BARRIER(mutex_lock(&test_mutex), false);
+ KCSAN_EXPECT_WRITE_BARRIER(mutex_unlock(&test_mutex), true);
+
+ KCSAN_EXPECT_RW_BARRIER(mb(), true);
+ KCSAN_EXPECT_RW_BARRIER(wmb(), true);
+ KCSAN_EXPECT_RW_BARRIER(rmb(), true);
+ KCSAN_EXPECT_RW_BARRIER(smp_mb(), true);
+ KCSAN_EXPECT_RW_BARRIER(smp_wmb(), true);
+ KCSAN_EXPECT_RW_BARRIER(smp_rmb(), true);
+ KCSAN_EXPECT_RW_BARRIER(dma_wmb(), true);
+ KCSAN_EXPECT_RW_BARRIER(dma_rmb(), true);
+ KCSAN_EXPECT_RW_BARRIER(smp_mb__before_atomic(), true);
+ KCSAN_EXPECT_RW_BARRIER(smp_mb__after_atomic(), true);
+ KCSAN_EXPECT_RW_BARRIER(smp_mb__after_spinlock(), true);
+ KCSAN_EXPECT_RW_BARRIER(smp_store_mb(test_var, 0), true);
+ KCSAN_EXPECT_RW_BARRIER(smp_load_acquire(&test_var), false);
+ KCSAN_EXPECT_RW_BARRIER(smp_store_release(&test_var, 0), true);
+ KCSAN_EXPECT_RW_BARRIER(xchg(&test_var, 0), true);
+ KCSAN_EXPECT_RW_BARRIER(xchg_release(&test_var, 0), true);
+ KCSAN_EXPECT_RW_BARRIER(xchg_relaxed(&test_var, 0), false);
+ KCSAN_EXPECT_RW_BARRIER(cmpxchg(&test_var, 0, 0), true);
+ KCSAN_EXPECT_RW_BARRIER(cmpxchg_release(&test_var, 0, 0), true);
+ KCSAN_EXPECT_RW_BARRIER(cmpxchg_relaxed(&test_var, 0, 0), false);
+ KCSAN_EXPECT_RW_BARRIER(atomic_read(&dummy), false);
+ KCSAN_EXPECT_RW_BARRIER(atomic_read_acquire(&dummy), false);
+ KCSAN_EXPECT_RW_BARRIER(atomic_set(&dummy, 0), false);
+ KCSAN_EXPECT_RW_BARRIER(atomic_set_release(&dummy, 0), true);
+ KCSAN_EXPECT_RW_BARRIER(atomic_add(1, &dummy), false);
+ KCSAN_EXPECT_RW_BARRIER(atomic_add_return(1, &dummy), true);
+ KCSAN_EXPECT_RW_BARRIER(atomic_add_return_acquire(1, &dummy), false);
+ KCSAN_EXPECT_RW_BARRIER(atomic_add_return_release(1, &dummy), true);
+ KCSAN_EXPECT_RW_BARRIER(atomic_add_return_relaxed(1, &dummy), false);
+ KCSAN_EXPECT_RW_BARRIER(atomic_fetch_add(1, &dummy), true);
+ KCSAN_EXPECT_RW_BARRIER(atomic_fetch_add_acquire(1, &dummy), false);
+ KCSAN_EXPECT_RW_BARRIER(atomic_fetch_add_release(1, &dummy), true);
+ KCSAN_EXPECT_RW_BARRIER(atomic_fetch_add_relaxed(1, &dummy), false);
+ KCSAN_EXPECT_RW_BARRIER(test_and_set_bit(0, &test_var), true);
+ KCSAN_EXPECT_RW_BARRIER(test_and_clear_bit(0, &test_var), true);
+ KCSAN_EXPECT_RW_BARRIER(test_and_change_bit(0, &test_var), true);
+ KCSAN_EXPECT_RW_BARRIER(clear_bit_unlock(0, &test_var), true);
+ KCSAN_EXPECT_RW_BARRIER(__clear_bit_unlock(0, &test_var), true);
+ KCSAN_EXPECT_RW_BARRIER(arch_spin_lock(&arch_spinlock), false);
+ KCSAN_EXPECT_RW_BARRIER(arch_spin_unlock(&arch_spinlock), true);
+ KCSAN_EXPECT_RW_BARRIER(spin_lock(&test_spinlock), false);
+ KCSAN_EXPECT_RW_BARRIER(spin_unlock(&test_spinlock), true);
+ KCSAN_EXPECT_RW_BARRIER(mutex_lock(&test_mutex), false);
+ KCSAN_EXPECT_RW_BARRIER(mutex_unlock(&test_mutex), true);
+ KCSAN_EXPECT_READ_BARRIER(xor_unlock_is_negative_byte(1, &test_var), true);
+ KCSAN_EXPECT_WRITE_BARRIER(xor_unlock_is_negative_byte(1, &test_var), true);
+ KCSAN_EXPECT_RW_BARRIER(xor_unlock_is_negative_byte(1, &test_var), true);
+ kcsan_nestable_atomic_end();
+}
+
+/* Simple test with normal data race. */
+__no_kcsan
+static void test_basic(struct kunit *test)
+{
+ struct expect_report expect = {
+ .access = {
+ { test_kernel_write, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+ { test_kernel_read, &test_var, sizeof(test_var), 0 },
+ },
+ };
+ struct expect_report never = {
+ .access = {
+ { test_kernel_read, &test_var, sizeof(test_var), 0 },
+ { test_kernel_read, &test_var, sizeof(test_var), 0 },
+ },
+ };
+ bool match_expect = false;
+ bool match_never = false;
+
+ begin_test_checks(test_kernel_write, test_kernel_read);
+ do {
+ match_expect |= report_matches(&expect);
+ match_never = report_matches(&never);
+ } while (!end_test_checks(match_never));
+ KUNIT_EXPECT_TRUE(test, match_expect);
+ KUNIT_EXPECT_FALSE(test, match_never);
+}
+
+/*
+ * Stress KCSAN with lots of concurrent races on different addresses until
+ * timeout.
+ */
+__no_kcsan
+static void test_concurrent_races(struct kunit *test)
+{
+ struct expect_report expect = {
+ .access = {
+ /* NULL will match any address. */
+ { test_kernel_rmw_array, NULL, 0, __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) },
+ { test_kernel_rmw_array, NULL, 0, __KCSAN_ACCESS_RW(0) },
+ },
+ };
+ struct expect_report never = {
+ .access = {
+ { test_kernel_rmw_array, NULL, 0, 0 },
+ { test_kernel_rmw_array, NULL, 0, 0 },
+ },
+ };
+ bool match_expect = false;
+ bool match_never = false;
+
+ begin_test_checks(test_kernel_rmw_array, test_kernel_rmw_array);
+ do {
+ match_expect |= report_matches(&expect);
+ match_never |= report_matches(&never);
+ } while (!end_test_checks(false));
+ KUNIT_EXPECT_TRUE(test, match_expect); /* Sanity check matches exist. */
+ KUNIT_EXPECT_FALSE(test, match_never);
+}
+
+/* Test the KCSAN_REPORT_VALUE_CHANGE_ONLY option. */
+__no_kcsan
+static void test_novalue_change(struct kunit *test)
+{
+ struct expect_report expect_rw = {
+ .access = {
+ { test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+ { test_kernel_read, &test_var, sizeof(test_var), 0 },
+ },
+ };
+ struct expect_report expect_ww = {
+ .access = {
+ { test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+ { test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+ },
+ };
+ bool match_expect = false;
+
+ test_kernel_write_nochange(); /* Reset value. */
+ begin_test_checks(test_kernel_write_nochange, test_kernel_read);
+ do {
+ match_expect = report_matches(&expect_rw) || report_matches(&expect_ww);
+ } while (!end_test_checks(match_expect));
+ if (IS_ENABLED(CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY))
+ KUNIT_EXPECT_FALSE(test, match_expect);
+ else
+ KUNIT_EXPECT_TRUE(test, match_expect);
+}
+
+/*
+ * Test that the rules where the KCSAN_REPORT_VALUE_CHANGE_ONLY option should
+ * never apply work.
+ */
+__no_kcsan
+static void test_novalue_change_exception(struct kunit *test)
+{
+ struct expect_report expect_rw = {
+ .access = {
+ { test_kernel_write_nochange_rcu, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+ { test_kernel_read, &test_var, sizeof(test_var), 0 },
+ },
+ };
+ struct expect_report expect_ww = {
+ .access = {
+ { test_kernel_write_nochange_rcu, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+ { test_kernel_write_nochange_rcu, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+ },
+ };
+ bool match_expect = false;
+
+ test_kernel_write_nochange_rcu(); /* Reset value. */
+ begin_test_checks(test_kernel_write_nochange_rcu, test_kernel_read);
+ do {
+ match_expect = report_matches(&expect_rw) || report_matches(&expect_ww);
+ } while (!end_test_checks(match_expect));
+ KUNIT_EXPECT_TRUE(test, match_expect);
+}
+
+/* Test that data races of unknown origin are reported. */
+__no_kcsan
+static void test_unknown_origin(struct kunit *test)
+{
+ struct expect_report expect = {
+ .access = {
+ { test_kernel_read, &test_var, sizeof(test_var), 0 },
+ { NULL },
+ },
+ };
+ bool match_expect = false;
+
+ begin_test_checks(test_kernel_write_uninstrumented, test_kernel_read);
+ do {
+ match_expect = report_matches(&expect);
+ } while (!end_test_checks(match_expect));
+ if (IS_ENABLED(CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN))
+ KUNIT_EXPECT_TRUE(test, match_expect);
+ else
+ KUNIT_EXPECT_FALSE(test, match_expect);
+}
+
+/* Test KCSAN_ASSUME_PLAIN_WRITES_ATOMIC if it is selected. */
+__no_kcsan
+static void test_write_write_assume_atomic(struct kunit *test)
+{
+ struct expect_report expect = {
+ .access = {
+ { test_kernel_write, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+ { test_kernel_write, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+ },
+ };
+ bool match_expect = false;
+
+ begin_test_checks(test_kernel_write, test_kernel_write);
+ do {
+ sink_value(READ_ONCE(test_var)); /* induce value-change */
+ match_expect = report_matches(&expect);
+ } while (!end_test_checks(match_expect));
+ if (IS_ENABLED(CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC))
+ KUNIT_EXPECT_FALSE(test, match_expect);
+ else
+ KUNIT_EXPECT_TRUE(test, match_expect);
+}
+
+/*
+ * Test that data races with writes larger than word-size are always reported,
+ * even if KCSAN_ASSUME_PLAIN_WRITES_ATOMIC is selected.
+ */
+__no_kcsan
+static void test_write_write_struct(struct kunit *test)
+{
+ struct expect_report expect = {
+ .access = {
+ { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
+ { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
+ },
+ };
+ bool match_expect = false;
+
+ begin_test_checks(test_kernel_write_struct, test_kernel_write_struct);
+ do {
+ match_expect = report_matches(&expect);
+ } while (!end_test_checks(match_expect));
+ KUNIT_EXPECT_TRUE(test, match_expect);
+}
+
+/*
+ * Test that data races where only one write is larger than word-size are always
+ * reported, even if KCSAN_ASSUME_PLAIN_WRITES_ATOMIC is selected.
+ */
+__no_kcsan
+static void test_write_write_struct_part(struct kunit *test)
+{
+ struct expect_report expect = {
+ .access = {
+ { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
+ { test_kernel_write_struct_part, &test_struct.val[3], sizeof(test_struct.val[3]), KCSAN_ACCESS_WRITE },
+ },
+ };
+ bool match_expect = false;
+
+ begin_test_checks(test_kernel_write_struct, test_kernel_write_struct_part);
+ do {
+ match_expect = report_matches(&expect);
+ } while (!end_test_checks(match_expect));
+ KUNIT_EXPECT_TRUE(test, match_expect);
+}
+
+/* Test that races with atomic accesses never result in reports. */
+__no_kcsan
+static void test_read_atomic_write_atomic(struct kunit *test)
+{
+ bool match_never = false;
+
+ begin_test_checks(test_kernel_read_atomic, test_kernel_write_atomic);
+ do {
+ match_never = report_available();
+ } while (!end_test_checks(match_never));
+ KUNIT_EXPECT_FALSE(test, match_never);
+}
+
+/* Test that a race with an atomic and plain access result in reports. */
+__no_kcsan
+static void test_read_plain_atomic_write(struct kunit *test)
+{
+ struct expect_report expect = {
+ .access = {
+ { test_kernel_read, &test_var, sizeof(test_var), 0 },
+ { test_kernel_write_atomic, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC },
+ },
+ };
+ bool match_expect = false;
+
+ KCSAN_TEST_REQUIRES(test, !IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS));
+
+ begin_test_checks(test_kernel_read, test_kernel_write_atomic);
+ do {
+ match_expect = report_matches(&expect);
+ } while (!end_test_checks(match_expect));
+ KUNIT_EXPECT_TRUE(test, match_expect);
+}
+
+/* Test that atomic RMWs generate correct report. */
+__no_kcsan
+static void test_read_plain_atomic_rmw(struct kunit *test)
+{
+ struct expect_report expect = {
+ .access = {
+ { test_kernel_read, &test_var, sizeof(test_var), 0 },
+ { test_kernel_atomic_rmw, &test_var, sizeof(test_var),
+ KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC },
+ },
+ };
+ bool match_expect = false;
+
+ KCSAN_TEST_REQUIRES(test, !IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS));
+
+ begin_test_checks(test_kernel_read, test_kernel_atomic_rmw);
+ do {
+ match_expect = report_matches(&expect);
+ } while (!end_test_checks(match_expect));
+ KUNIT_EXPECT_TRUE(test, match_expect);
+}
+
+/* Zero-sized accesses should never cause data race reports. */
+__no_kcsan
+static void test_zero_size_access(struct kunit *test)
+{
+ struct expect_report expect = {
+ .access = {
+ { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
+ { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
+ },
+ };
+ struct expect_report never = {
+ .access = {
+ { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
+ { test_kernel_read_struct_zero_size, &test_struct.val[3], 0, 0 },
+ },
+ };
+ bool match_expect = false;
+ bool match_never = false;
+
+ begin_test_checks(test_kernel_write_struct, test_kernel_read_struct_zero_size);
+ do {
+ match_expect |= report_matches(&expect);
+ match_never = report_matches(&never);
+ } while (!end_test_checks(match_never));
+ KUNIT_EXPECT_TRUE(test, match_expect); /* Sanity check. */
+ KUNIT_EXPECT_FALSE(test, match_never);
+}
+
+/* Test the data_race() macro. */
+__no_kcsan
+static void test_data_race(struct kunit *test)
+{
+ bool match_never = false;
+
+ begin_test_checks(test_kernel_data_race, test_kernel_data_race);
+ do {
+ match_never = report_available();
+ } while (!end_test_checks(match_never));
+ KUNIT_EXPECT_FALSE(test, match_never);
+}
+
+/* Test the __data_racy type qualifier. */
+__no_kcsan
+static void test_data_racy_qualifier(struct kunit *test)
+{
+ bool match_never = false;
+
+ begin_test_checks(test_kernel_data_racy_qualifier, test_kernel_data_racy_qualifier);
+ do {
+ match_never = report_available();
+ } while (!end_test_checks(match_never));
+ KUNIT_EXPECT_FALSE(test, match_never);
+}
+
+__no_kcsan
+static void test_assert_exclusive_writer(struct kunit *test)
+{
+ struct expect_report expect = {
+ .access = {
+ { test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT },
+ { test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+ },
+ };
+ bool match_expect = false;
+
+ begin_test_checks(test_kernel_assert_writer, test_kernel_write_nochange);
+ do {
+ match_expect = report_matches(&expect);
+ } while (!end_test_checks(match_expect));
+ KUNIT_EXPECT_TRUE(test, match_expect);
+}
+
+__no_kcsan
+static void test_assert_exclusive_access(struct kunit *test)
+{
+ struct expect_report expect = {
+ .access = {
+ { test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE },
+ { test_kernel_read, &test_var, sizeof(test_var), 0 },
+ },
+ };
+ bool match_expect = false;
+
+ begin_test_checks(test_kernel_assert_access, test_kernel_read);
+ do {
+ match_expect = report_matches(&expect);
+ } while (!end_test_checks(match_expect));
+ KUNIT_EXPECT_TRUE(test, match_expect);
+}
+
+__no_kcsan
+static void test_assert_exclusive_access_writer(struct kunit *test)
+{
+ struct expect_report expect_access_writer = {
+ .access = {
+ { test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE },
+ { test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT },
+ },
+ };
+ struct expect_report expect_access_access = {
+ .access = {
+ { test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE },
+ { test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE },
+ },
+ };
+ struct expect_report never = {
+ .access = {
+ { test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT },
+ { test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT },
+ },
+ };
+ bool match_expect_access_writer = false;
+ bool match_expect_access_access = false;
+ bool match_never = false;
+
+ begin_test_checks(test_kernel_assert_access, test_kernel_assert_writer);
+ do {
+ match_expect_access_writer |= report_matches(&expect_access_writer);
+ match_expect_access_access |= report_matches(&expect_access_access);
+ match_never |= report_matches(&never);
+ } while (!end_test_checks(match_never));
+ KUNIT_EXPECT_TRUE(test, match_expect_access_writer);
+ KUNIT_EXPECT_TRUE(test, match_expect_access_access);
+ KUNIT_EXPECT_FALSE(test, match_never);
+}
+
+__no_kcsan
+static void test_assert_exclusive_bits_change(struct kunit *test)
+{
+ struct expect_report expect = {
+ .access = {
+ { test_kernel_assert_bits_change, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT },
+ { test_kernel_change_bits, &test_var, sizeof(test_var),
+ KCSAN_ACCESS_WRITE | (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS) ? 0 : KCSAN_ACCESS_ATOMIC) },
+ },
+ };
+ bool match_expect = false;
+
+ begin_test_checks(test_kernel_assert_bits_change, test_kernel_change_bits);
+ do {
+ match_expect = report_matches(&expect);
+ } while (!end_test_checks(match_expect));
+ KUNIT_EXPECT_TRUE(test, match_expect);
+}
+
+__no_kcsan
+static void test_assert_exclusive_bits_nochange(struct kunit *test)
+{
+ bool match_never = false;
+
+ begin_test_checks(test_kernel_assert_bits_nochange, test_kernel_change_bits);
+ do {
+ match_never = report_available();
+ } while (!end_test_checks(match_never));
+ KUNIT_EXPECT_FALSE(test, match_never);
+}
+
+__no_kcsan
+static void test_assert_exclusive_writer_scoped(struct kunit *test)
+{
+ struct expect_report expect_start = {
+ .access = {
+ { test_kernel_assert_writer_scoped, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_SCOPED },
+ { test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+ },
+ };
+ struct expect_report expect_inscope = {
+ .access = {
+ { test_enter_scope, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_SCOPED },
+ { test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+ },
+ };
+ bool match_expect_start = false;
+ bool match_expect_inscope = false;
+
+ begin_test_checks(test_kernel_assert_writer_scoped, test_kernel_write_nochange);
+ do {
+ match_expect_start |= report_matches(&expect_start);
+ match_expect_inscope |= report_matches(&expect_inscope);
+ } while (!end_test_checks(match_expect_inscope));
+ KUNIT_EXPECT_TRUE(test, match_expect_start);
+ KUNIT_EXPECT_FALSE(test, match_expect_inscope);
+}
+
+__no_kcsan
+static void test_assert_exclusive_access_scoped(struct kunit *test)
+{
+ struct expect_report expect_start1 = {
+ .access = {
+ { test_kernel_assert_access_scoped, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_SCOPED },
+ { test_kernel_read, &test_var, sizeof(test_var), 0 },
+ },
+ };
+ struct expect_report expect_start2 = {
+ .access = { expect_start1.access[0], expect_start1.access[0] },
+ };
+ struct expect_report expect_inscope = {
+ .access = {
+ { test_enter_scope, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_SCOPED },
+ { test_kernel_read, &test_var, sizeof(test_var), 0 },
+ },
+ };
+ bool match_expect_start = false;
+ bool match_expect_inscope = false;
+
+ begin_test_checks(test_kernel_assert_access_scoped, test_kernel_read);
+ end_time += msecs_to_jiffies(1000); /* This test requires a bit more time. */
+ do {
+ match_expect_start |= report_matches(&expect_start1) || report_matches(&expect_start2);
+ match_expect_inscope |= report_matches(&expect_inscope);
+ } while (!end_test_checks(match_expect_inscope));
+ KUNIT_EXPECT_TRUE(test, match_expect_start);
+ KUNIT_EXPECT_FALSE(test, match_expect_inscope);
+}
+
+/*
+ * jiffies is special (declared to be volatile) and its accesses are typically
+ * not marked; this test ensures that the compiler nor KCSAN gets confused about
+ * jiffies's declaration on different architectures.
+ */
+__no_kcsan
+static void test_jiffies_noreport(struct kunit *test)
+{
+ bool match_never = false;
+
+ begin_test_checks(test_kernel_jiffies_reader, test_kernel_jiffies_reader);
+ do {
+ match_never = report_available();
+ } while (!end_test_checks(match_never));
+ KUNIT_EXPECT_FALSE(test, match_never);
+}
+
+/* Test that racing accesses in seqlock critical sections are not reported. */
+__no_kcsan
+static void test_seqlock_noreport(struct kunit *test)
+{
+ bool match_never = false;
+
+ begin_test_checks(test_kernel_seqlock_reader, test_kernel_seqlock_writer);
+ do {
+ match_never = report_available();
+ } while (!end_test_checks(match_never));
+ KUNIT_EXPECT_FALSE(test, match_never);
+}
+
+/*
+ * Test atomic builtins work and required instrumentation functions exist. We
+ * also test that KCSAN understands they're atomic by racing with them via
+ * test_kernel_atomic_builtins(), and expect no reports.
+ *
+ * The atomic builtins _SHOULD NOT_ be used in normal kernel code!
+ */
+static void test_atomic_builtins(struct kunit *test)
+{
+ bool match_never = false;
+
+ begin_test_checks(test_kernel_atomic_builtins, test_kernel_atomic_builtins);
+ do {
+ long tmp;
+
+ kcsan_enable_current();
+
+ __atomic_store_n(&test_var, 42L, __ATOMIC_RELAXED);
+ KUNIT_EXPECT_EQ(test, 42L, __atomic_load_n(&test_var, __ATOMIC_RELAXED));
+
+ KUNIT_EXPECT_EQ(test, 42L, __atomic_exchange_n(&test_var, 20, __ATOMIC_RELAXED));
+ KUNIT_EXPECT_EQ(test, 20L, test_var);
+
+ tmp = 20L;
+ KUNIT_EXPECT_TRUE(test, __atomic_compare_exchange_n(&test_var, &tmp, 30L,
+ 0, __ATOMIC_RELAXED,
+ __ATOMIC_RELAXED));
+ KUNIT_EXPECT_EQ(test, tmp, 20L);
+ KUNIT_EXPECT_EQ(test, test_var, 30L);
+ KUNIT_EXPECT_FALSE(test, __atomic_compare_exchange_n(&test_var, &tmp, 40L,
+ 1, __ATOMIC_RELAXED,
+ __ATOMIC_RELAXED));
+ KUNIT_EXPECT_EQ(test, tmp, 30L);
+ KUNIT_EXPECT_EQ(test, test_var, 30L);
+
+ KUNIT_EXPECT_EQ(test, 30L, __atomic_fetch_add(&test_var, 1, __ATOMIC_RELAXED));
+ KUNIT_EXPECT_EQ(test, 31L, __atomic_fetch_sub(&test_var, 1, __ATOMIC_RELAXED));
+ KUNIT_EXPECT_EQ(test, 30L, __atomic_fetch_and(&test_var, 0xf, __ATOMIC_RELAXED));
+ KUNIT_EXPECT_EQ(test, 14L, __atomic_fetch_xor(&test_var, 0xf, __ATOMIC_RELAXED));
+ KUNIT_EXPECT_EQ(test, 1L, __atomic_fetch_or(&test_var, 0xf0, __ATOMIC_RELAXED));
+ KUNIT_EXPECT_EQ(test, 241L, __atomic_fetch_nand(&test_var, 0xf, __ATOMIC_RELAXED));
+ KUNIT_EXPECT_EQ(test, -2L, test_var);
+
+ __atomic_thread_fence(__ATOMIC_SEQ_CST);
+ __atomic_signal_fence(__ATOMIC_SEQ_CST);
+
+ kcsan_disable_current();
+
+ match_never = report_available();
+ } while (!end_test_checks(match_never));
+ KUNIT_EXPECT_FALSE(test, match_never);
+}
+
+__no_kcsan
+static void test_1bit_value_change(struct kunit *test)
+{
+ struct expect_report expect = {
+ .access = {
+ { test_kernel_read, &test_var, sizeof(test_var), 0 },
+ { test_kernel_xor_1bit, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) },
+ },
+ };
+ bool match = false;
+
+ begin_test_checks(test_kernel_read, test_kernel_xor_1bit);
+ do {
+ match = IS_ENABLED(CONFIG_KCSAN_PERMISSIVE)
+ ? report_available()
+ : report_matches(&expect);
+ } while (!end_test_checks(match));
+ if (IS_ENABLED(CONFIG_KCSAN_PERMISSIVE))
+ KUNIT_EXPECT_FALSE(test, match);
+ else
+ KUNIT_EXPECT_TRUE(test, match);
+}
+
+__no_kcsan
+static void test_correct_barrier(struct kunit *test)
+{
+ struct expect_report expect = {
+ .access = {
+ { test_kernel_with_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) },
+ { test_kernel_with_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(0) },
+ },
+ };
+ bool match_expect = false;
+
+ test_struct.val[0] = 0; /* init unlocked */
+ begin_test_checks(test_kernel_with_memorder, test_kernel_with_memorder);
+ do {
+ match_expect = report_matches_any_reordered(&expect);
+ } while (!end_test_checks(match_expect));
+ KUNIT_EXPECT_FALSE(test, match_expect);
+}
+
+__no_kcsan
+static void test_missing_barrier(struct kunit *test)
+{
+ struct expect_report expect = {
+ .access = {
+ { test_kernel_wrong_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) },
+ { test_kernel_wrong_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(0) },
+ },
+ };
+ bool match_expect = false;
+
+ test_struct.val[0] = 0; /* init unlocked */
+ begin_test_checks(test_kernel_wrong_memorder, test_kernel_wrong_memorder);
+ do {
+ match_expect = report_matches_any_reordered(&expect);
+ } while (!end_test_checks(match_expect));
+ if (IS_ENABLED(CONFIG_KCSAN_WEAK_MEMORY))
+ KUNIT_EXPECT_TRUE(test, match_expect);
+ else
+ KUNIT_EXPECT_FALSE(test, match_expect);
+}
+
+__no_kcsan
+static void test_atomic_builtins_correct_barrier(struct kunit *test)
+{
+ struct expect_report expect = {
+ .access = {
+ { test_kernel_atomic_builtin_with_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) },
+ { test_kernel_atomic_builtin_with_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(0) },
+ },
+ };
+ bool match_expect = false;
+
+ test_struct.val[0] = 0; /* init unlocked */
+ begin_test_checks(test_kernel_atomic_builtin_with_memorder,
+ test_kernel_atomic_builtin_with_memorder);
+ do {
+ match_expect = report_matches_any_reordered(&expect);
+ } while (!end_test_checks(match_expect));
+ KUNIT_EXPECT_FALSE(test, match_expect);
+}
+
+__no_kcsan
+static void test_atomic_builtins_missing_barrier(struct kunit *test)
+{
+ struct expect_report expect = {
+ .access = {
+ { test_kernel_atomic_builtin_wrong_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) },
+ { test_kernel_atomic_builtin_wrong_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(0) },
+ },
+ };
+ bool match_expect = false;
+
+ test_struct.val[0] = 0; /* init unlocked */
+ begin_test_checks(test_kernel_atomic_builtin_wrong_memorder,
+ test_kernel_atomic_builtin_wrong_memorder);
+ do {
+ match_expect = report_matches_any_reordered(&expect);
+ } while (!end_test_checks(match_expect));
+ if (IS_ENABLED(CONFIG_KCSAN_WEAK_MEMORY))
+ KUNIT_EXPECT_TRUE(test, match_expect);
+ else
+ KUNIT_EXPECT_FALSE(test, match_expect);
+}
+
+/*
+ * Generate thread counts for all test cases. Values generated are in interval
+ * [2, 5] followed by exponentially increasing thread counts from 8 to 32.
+ *
+ * The thread counts are chosen to cover potentially interesting boundaries and
+ * corner cases (2 to 5), and then stress the system with larger counts.
+ */
+static const void *nthreads_gen_params(struct kunit *test, const void *prev, char *desc)
+{
+ long nthreads = (long)prev;
+
+ if (nthreads < 0 || nthreads >= 32)
+ nthreads = 0; /* stop */
+ else if (!nthreads)
+ nthreads = 2; /* initial value */
+ else if (nthreads < 5)
+ nthreads++;
+ else if (nthreads == 5)
+ nthreads = 8;
+ else
+ nthreads *= 2;
+
+ if (!preempt_model_preemptible() ||
+ !IS_ENABLED(CONFIG_KCSAN_INTERRUPT_WATCHER)) {
+ /*
+ * Without any preemption, keep 2 CPUs free for other tasks, one
+ * of which is the main test case function checking for
+ * completion or failure.
+ */
+ const long min_unused_cpus = preempt_model_none() ? 2 : 0;
+ const long min_required_cpus = 2 + min_unused_cpus;
+
+ if (num_online_cpus() < min_required_cpus) {
+ pr_err_once("Too few online CPUs (%u < %ld) for test\n",
+ num_online_cpus(), min_required_cpus);
+ nthreads = 0;
+ } else if (nthreads >= num_online_cpus() - min_unused_cpus) {
+ /* Use negative value to indicate last param. */
+ nthreads = -(num_online_cpus() - min_unused_cpus);
+ pr_warn_once("Limiting number of threads to %ld (only %d online CPUs)\n",
+ -nthreads, num_online_cpus());
+ }
+ }
+
+ snprintf(desc, KUNIT_PARAM_DESC_SIZE, "threads=%ld", abs(nthreads));
+ return (void *)nthreads;
+}
+
+#define KCSAN_KUNIT_CASE(test_name) KUNIT_CASE_PARAM(test_name, nthreads_gen_params)
+static struct kunit_case kcsan_test_cases[] = {
+ KUNIT_CASE(test_barrier_nothreads),
+ KCSAN_KUNIT_CASE(test_basic),
+ KCSAN_KUNIT_CASE(test_concurrent_races),
+ KCSAN_KUNIT_CASE(test_novalue_change),
+ KCSAN_KUNIT_CASE(test_novalue_change_exception),
+ KCSAN_KUNIT_CASE(test_unknown_origin),
+ KCSAN_KUNIT_CASE(test_write_write_assume_atomic),
+ KCSAN_KUNIT_CASE(test_write_write_struct),
+ KCSAN_KUNIT_CASE(test_write_write_struct_part),
+ KCSAN_KUNIT_CASE(test_read_atomic_write_atomic),
+ KCSAN_KUNIT_CASE(test_read_plain_atomic_write),
+ KCSAN_KUNIT_CASE(test_read_plain_atomic_rmw),
+ KCSAN_KUNIT_CASE(test_zero_size_access),
+ KCSAN_KUNIT_CASE(test_data_race),
+ KCSAN_KUNIT_CASE(test_data_racy_qualifier),
+ KCSAN_KUNIT_CASE(test_assert_exclusive_writer),
+ KCSAN_KUNIT_CASE(test_assert_exclusive_access),
+ KCSAN_KUNIT_CASE(test_assert_exclusive_access_writer),
+ KCSAN_KUNIT_CASE(test_assert_exclusive_bits_change),
+ KCSAN_KUNIT_CASE(test_assert_exclusive_bits_nochange),
+ KCSAN_KUNIT_CASE(test_assert_exclusive_writer_scoped),
+ KCSAN_KUNIT_CASE(test_assert_exclusive_access_scoped),
+ KCSAN_KUNIT_CASE(test_jiffies_noreport),
+ KCSAN_KUNIT_CASE(test_seqlock_noreport),
+ KCSAN_KUNIT_CASE(test_atomic_builtins),
+ KCSAN_KUNIT_CASE(test_1bit_value_change),
+ KCSAN_KUNIT_CASE(test_correct_barrier),
+ KCSAN_KUNIT_CASE(test_missing_barrier),
+ KCSAN_KUNIT_CASE(test_atomic_builtins_correct_barrier),
+ KCSAN_KUNIT_CASE(test_atomic_builtins_missing_barrier),
+ {},
+};
+
+/* ===== End test cases ===== */
+
+/* Concurrent accesses from interrupts. */
+__no_kcsan
+static void access_thread_timer(struct timer_list *timer)
+{
+ static atomic_t cnt = ATOMIC_INIT(0);
+ unsigned int idx;
+ void (*func)(void);
+
+ idx = (unsigned int)atomic_inc_return(&cnt) % ARRAY_SIZE(access_kernels);
+ /* Acquire potential initialization. */
+ func = smp_load_acquire(&access_kernels[idx]);
+ if (func)
+ func();
+}
+
+/* The main loop for each thread. */
+__no_kcsan
+static int access_thread(void *arg)
+{
+ struct timer_list timer;
+ unsigned int cnt = 0;
+ unsigned int idx;
+ void (*func)(void);
+
+ timer_setup_on_stack(&timer, access_thread_timer, 0);
+ do {
+ might_sleep();
+
+ if (!timer_pending(&timer))
+ mod_timer(&timer, jiffies + 1);
+ else {
+ /* Iterate through all kernels. */
+ idx = cnt++ % ARRAY_SIZE(access_kernels);
+ /* Acquire potential initialization. */
+ func = smp_load_acquire(&access_kernels[idx]);
+ if (func)
+ func();
+ }
+ } while (!torture_must_stop());
+ timer_delete_sync(&timer);
+ timer_destroy_on_stack(&timer);
+
+ torture_kthread_stopping("access_thread");
+ return 0;
+}
+
+__no_kcsan
+static int test_init(struct kunit *test)
+{
+ unsigned long flags;
+ int nthreads;
+ int i;
+
+ spin_lock_irqsave(&observed.lock, flags);
+ for (i = 0; i < ARRAY_SIZE(observed.lines); ++i)
+ observed.lines[i][0] = '\0';
+ observed.nlines = 0;
+ spin_unlock_irqrestore(&observed.lock, flags);
+
+ if (strstr(test->name, "nothreads"))
+ return 0;
+
+ if (!torture_init_begin((char *)test->name, 1))
+ return -EBUSY;
+
+ if (WARN_ON(threads))
+ goto err;
+
+ for (i = 0; i < ARRAY_SIZE(access_kernels); ++i) {
+ if (WARN_ON(access_kernels[i]))
+ goto err;
+ }
+
+ nthreads = abs((long)test->param_value);
+ if (WARN_ON(!nthreads))
+ goto err;
+
+ threads = kcalloc(nthreads + 1, sizeof(struct task_struct *), GFP_KERNEL);
+ if (WARN_ON(!threads))
+ goto err;
+
+ threads[nthreads] = NULL;
+ for (i = 0; i < nthreads; ++i) {
+ if (torture_create_kthread(access_thread, NULL, threads[i]))
+ goto err;
+ }
+
+ torture_init_end();
+
+ return 0;
+
+err:
+ kfree(threads);
+ threads = NULL;
+ torture_init_end();
+ return -EINVAL;
+}
+
+__no_kcsan
+static void test_exit(struct kunit *test)
+{
+ struct task_struct **stop_thread;
+ int i;
+
+ if (strstr(test->name, "nothreads"))
+ return;
+
+ if (torture_cleanup_begin())
+ return;
+
+ for (i = 0; i < ARRAY_SIZE(access_kernels); ++i)
+ WRITE_ONCE(access_kernels[i], NULL);
+
+ if (threads) {
+ for (stop_thread = threads; *stop_thread; stop_thread++)
+ torture_stop_kthread(reader_thread, *stop_thread);
+
+ kfree(threads);
+ threads = NULL;
+ }
+
+ torture_cleanup_end();
+}
+
+__no_kcsan
+static void register_tracepoints(void)
+{
+ register_trace_console(probe_console, NULL);
+}
+
+__no_kcsan
+static void unregister_tracepoints(void)
+{
+ unregister_trace_console(probe_console, NULL);
+}
+
+static int kcsan_suite_init(struct kunit_suite *suite)
+{
+ register_tracepoints();
+ return 0;
+}
+
+static void kcsan_suite_exit(struct kunit_suite *suite)
+{
+ unregister_tracepoints();
+ tracepoint_synchronize_unregister();
+}
+
+static struct kunit_suite kcsan_test_suite = {
+ .name = "kcsan",
+ .test_cases = kcsan_test_cases,
+ .init = test_init,
+ .exit = test_exit,
+ .suite_init = kcsan_suite_init,
+ .suite_exit = kcsan_suite_exit,
+};
+
+kunit_test_suites(&kcsan_test_suite);
+
+MODULE_DESCRIPTION("KCSAN test suite");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Marco Elver <elver@google.com>");
diff --git a/kernel/kcsan/permissive.h b/kernel/kcsan/permissive.h
new file mode 100644
index 000000000000..2c01fe4a59ee
--- /dev/null
+++ b/kernel/kcsan/permissive.h
@@ -0,0 +1,94 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Special rules for ignoring entire classes of data-racy memory accesses. None
+ * of the rules here imply that such data races are generally safe!
+ *
+ * All rules in this file can be configured via CONFIG_KCSAN_PERMISSIVE. Keep
+ * them separate from core code to make it easier to audit.
+ *
+ * Copyright (C) 2019, Google LLC.
+ */
+
+#ifndef _KERNEL_KCSAN_PERMISSIVE_H
+#define _KERNEL_KCSAN_PERMISSIVE_H
+
+#include <linux/bitops.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+
+/*
+ * Access ignore rules based on address.
+ */
+static __always_inline bool kcsan_ignore_address(const volatile void *ptr)
+{
+ if (!IS_ENABLED(CONFIG_KCSAN_PERMISSIVE))
+ return false;
+
+ /*
+ * Data-racy bitops on current->flags are too common, ignore completely
+ * for now.
+ */
+ return ptr == &current->flags;
+}
+
+/*
+ * Data race ignore rules based on access type and value change patterns.
+ */
+static bool
+kcsan_ignore_data_race(size_t size, int type, u64 old, u64 new, u64 diff)
+{
+ if (!IS_ENABLED(CONFIG_KCSAN_PERMISSIVE))
+ return false;
+
+ /*
+ * Rules here are only for plain read accesses, so that we still report
+ * data races between plain read-write accesses.
+ */
+ if (type || size > sizeof(long))
+ return false;
+
+ /*
+ * A common pattern is checking/setting just 1 bit in a variable; for
+ * example:
+ *
+ * if (flags & SOME_FLAG) { ... }
+ *
+ * and elsewhere flags is updated concurrently:
+ *
+ * flags |= SOME_OTHER_FLAG; // just 1 bit
+ *
+ * While it is still recommended that such accesses be marked
+ * appropriately, in many cases these types of data races are so common
+ * that marking them all is often unrealistic and left to maintainer
+ * preference.
+ *
+ * The assumption in all cases is that with all known compiler
+ * optimizations (including those that tear accesses), because no more
+ * than 1 bit changed, the plain accesses are safe despite the presence
+ * of data races.
+ *
+ * The rules here will ignore the data races if we observe no more than
+ * 1 bit changed.
+ *
+ * Of course many operations can effecively change just 1 bit, but the
+ * general assuption that data races involving 1-bit changes can be
+ * tolerated still applies.
+ *
+ * And in case a true bug is missed, the bug likely manifests as a
+ * reportable data race elsewhere.
+ */
+ if (hweight64(diff) == 1) {
+ /*
+ * Exception: Report data races where the values look like
+ * ordinary booleans (one of them was 0 and the 0th bit was
+ * changed) More often than not, they come with interesting
+ * memory ordering requirements, so let's report them.
+ */
+ if (!((!old || !new) && diff == 1))
+ return true;
+ }
+
+ return false;
+}
+
+#endif /* _KERNEL_KCSAN_PERMISSIVE_H */
diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c
new file mode 100644
index 000000000000..e95ce7d7a76e
--- /dev/null
+++ b/kernel/kcsan/report.c
@@ -0,0 +1,715 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KCSAN reporting.
+ *
+ * Copyright (C) 2019, Google LLC.
+ */
+
+#include <linux/debug_locks.h>
+#include <linux/delay.h>
+#include <linux/jiffies.h>
+#include <linux/kallsyms.h>
+#include <linux/kernel.h>
+#include <linux/lockdep.h>
+#include <linux/preempt.h>
+#include <linux/printk.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/stacktrace.h>
+
+#include "kcsan.h"
+#include "encoding.h"
+
+/*
+ * Max. number of stack entries to show in the report.
+ */
+#define NUM_STACK_ENTRIES 64
+
+/* Common access info. */
+struct access_info {
+ const volatile void *ptr;
+ size_t size;
+ int access_type;
+ int task_pid;
+ int cpu_id;
+ unsigned long ip;
+};
+
+/*
+ * Other thread info: communicated from other racing thread to thread that set
+ * up the watchpoint, which then prints the complete report atomically.
+ */
+struct other_info {
+ struct access_info ai;
+ unsigned long stack_entries[NUM_STACK_ENTRIES];
+ int num_stack_entries;
+
+ /*
+ * Optionally pass @current. Typically we do not need to pass @current
+ * via @other_info since just @task_pid is sufficient. Passing @current
+ * has additional overhead.
+ *
+ * To safely pass @current, we must either use get_task_struct/
+ * put_task_struct, or stall the thread that populated @other_info.
+ *
+ * We cannot rely on get_task_struct/put_task_struct in case
+ * release_report() races with a task being released, and would have to
+ * free it in release_report(). This may result in deadlock if we want
+ * to use KCSAN on the allocators.
+ *
+ * Since we also want to reliably print held locks for
+ * CONFIG_KCSAN_VERBOSE, the current implementation stalls the thread
+ * that populated @other_info until it has been consumed.
+ */
+ struct task_struct *task;
+};
+
+/*
+ * To never block any producers of struct other_info, we need as many elements
+ * as we have watchpoints (upper bound on concurrent races to report).
+ */
+static struct other_info other_infos[CONFIG_KCSAN_NUM_WATCHPOINTS + NUM_SLOTS-1];
+
+/*
+ * Information about reported races; used to rate limit reporting.
+ */
+struct report_time {
+ /*
+ * The last time the race was reported.
+ */
+ unsigned long time;
+
+ /*
+ * The frames of the 2 threads; if only 1 thread is known, one frame
+ * will be 0.
+ */
+ unsigned long frame1;
+ unsigned long frame2;
+};
+
+/*
+ * Since we also want to be able to debug allocators with KCSAN, to avoid
+ * deadlock, report_times cannot be dynamically resized with krealloc in
+ * rate_limit_report.
+ *
+ * Therefore, we use a fixed-size array, which at most will occupy a page. This
+ * still adequately rate limits reports, assuming that a) number of unique data
+ * races is not excessive, and b) occurrence of unique races within the
+ * same time window is limited.
+ */
+#define REPORT_TIMES_MAX (PAGE_SIZE / sizeof(struct report_time))
+#define REPORT_TIMES_SIZE \
+ (CONFIG_KCSAN_REPORT_ONCE_IN_MS > REPORT_TIMES_MAX ? \
+ REPORT_TIMES_MAX : \
+ CONFIG_KCSAN_REPORT_ONCE_IN_MS)
+static struct report_time report_times[REPORT_TIMES_SIZE];
+
+/*
+ * Spinlock serializing report generation, and access to @other_infos. Although
+ * it could make sense to have a finer-grained locking story for @other_infos,
+ * report generation needs to be serialized either way, so not much is gained.
+ */
+static DEFINE_RAW_SPINLOCK(report_lock);
+
+/*
+ * Checks if the race identified by thread frames frame1 and frame2 has
+ * been reported since (now - KCSAN_REPORT_ONCE_IN_MS).
+ */
+static bool rate_limit_report(unsigned long frame1, unsigned long frame2)
+{
+ struct report_time *use_entry = &report_times[0];
+ unsigned long invalid_before;
+ int i;
+
+ BUILD_BUG_ON(CONFIG_KCSAN_REPORT_ONCE_IN_MS != 0 && REPORT_TIMES_SIZE == 0);
+
+ if (CONFIG_KCSAN_REPORT_ONCE_IN_MS == 0)
+ return false;
+
+ invalid_before = jiffies - msecs_to_jiffies(CONFIG_KCSAN_REPORT_ONCE_IN_MS);
+
+ /* Check if a matching race report exists. */
+ for (i = 0; i < REPORT_TIMES_SIZE; ++i) {
+ struct report_time *rt = &report_times[i];
+
+ /*
+ * Must always select an entry for use to store info as we
+ * cannot resize report_times; at the end of the scan, use_entry
+ * will be the oldest entry, which ideally also happened before
+ * KCSAN_REPORT_ONCE_IN_MS ago.
+ */
+ if (time_before(rt->time, use_entry->time))
+ use_entry = rt;
+
+ /*
+ * Initially, no need to check any further as this entry as well
+ * as following entries have never been used.
+ */
+ if (rt->time == 0)
+ break;
+
+ /* Check if entry expired. */
+ if (time_before(rt->time, invalid_before))
+ continue; /* before KCSAN_REPORT_ONCE_IN_MS ago */
+
+ /* Reported recently, check if race matches. */
+ if ((rt->frame1 == frame1 && rt->frame2 == frame2) ||
+ (rt->frame1 == frame2 && rt->frame2 == frame1))
+ return true;
+ }
+
+ use_entry->time = jiffies;
+ use_entry->frame1 = frame1;
+ use_entry->frame2 = frame2;
+ return false;
+}
+
+/*
+ * Special rules to skip reporting.
+ */
+static bool
+skip_report(enum kcsan_value_change value_change, unsigned long top_frame)
+{
+ /* Should never get here if value_change==FALSE. */
+ WARN_ON_ONCE(value_change == KCSAN_VALUE_CHANGE_FALSE);
+
+ /*
+ * The first call to skip_report always has value_change==TRUE, since we
+ * cannot know the value written of an instrumented access. For the 2nd
+ * call there are 6 cases with CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY:
+ *
+ * 1. read watchpoint, conflicting write (value_change==TRUE): report;
+ * 2. read watchpoint, conflicting write (value_change==MAYBE): skip;
+ * 3. write watchpoint, conflicting write (value_change==TRUE): report;
+ * 4. write watchpoint, conflicting write (value_change==MAYBE): skip;
+ * 5. write watchpoint, conflicting read (value_change==MAYBE): skip;
+ * 6. write watchpoint, conflicting read (value_change==TRUE): report;
+ *
+ * Cases 1-4 are intuitive and expected; case 5 ensures we do not report
+ * data races where the write may have rewritten the same value; case 6
+ * is possible either if the size is larger than what we check value
+ * changes for or the access type is KCSAN_ACCESS_ASSERT.
+ */
+ if (IS_ENABLED(CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY) &&
+ value_change == KCSAN_VALUE_CHANGE_MAYBE) {
+ /*
+ * The access is a write, but the data value did not change.
+ *
+ * We opt-out of this filter for certain functions at request of
+ * maintainers.
+ */
+ char buf[64];
+ int len = scnprintf(buf, sizeof(buf), "%ps", (void *)top_frame);
+
+ if (!strnstr(buf, "rcu_", len) &&
+ !strnstr(buf, "_rcu", len) &&
+ !strnstr(buf, "_srcu", len))
+ return true;
+ }
+
+ return kcsan_skip_report_debugfs(top_frame);
+}
+
+static const char *get_access_type(int type)
+{
+ if (type & KCSAN_ACCESS_ASSERT) {
+ if (type & KCSAN_ACCESS_SCOPED) {
+ if (type & KCSAN_ACCESS_WRITE)
+ return "assert no accesses (reordered)";
+ else
+ return "assert no writes (reordered)";
+ } else {
+ if (type & KCSAN_ACCESS_WRITE)
+ return "assert no accesses";
+ else
+ return "assert no writes";
+ }
+ }
+
+ switch (type) {
+ case 0:
+ return "read";
+ case KCSAN_ACCESS_ATOMIC:
+ return "read (marked)";
+ case KCSAN_ACCESS_WRITE:
+ return "write";
+ case KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC:
+ return "write (marked)";
+ case KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE:
+ return "read-write";
+ case KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC:
+ return "read-write (marked)";
+ case KCSAN_ACCESS_SCOPED:
+ return "read (reordered)";
+ case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_ATOMIC:
+ return "read (marked, reordered)";
+ case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_WRITE:
+ return "write (reordered)";
+ case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC:
+ return "write (marked, reordered)";
+ case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE:
+ return "read-write (reordered)";
+ case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC:
+ return "read-write (marked, reordered)";
+ default:
+ BUG();
+ }
+}
+
+static const char *get_bug_type(int type)
+{
+ return (type & KCSAN_ACCESS_ASSERT) != 0 ? "assert: race" : "data-race";
+}
+
+/* Return thread description: in task or interrupt. */
+static const char *get_thread_desc(int task_id)
+{
+ if (task_id != -1) {
+ static char buf[32]; /* safe: protected by report_lock */
+
+ snprintf(buf, sizeof(buf), "task %i", task_id);
+ return buf;
+ }
+ return "interrupt";
+}
+
+/* Helper to skip KCSAN-related functions in stack-trace. */
+static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries)
+{
+ char buf[64];
+ char *cur;
+ int len, skip;
+
+ for (skip = 0; skip < num_entries; ++skip) {
+ len = scnprintf(buf, sizeof(buf), "%ps", (void *)stack_entries[skip]);
+
+ /* Never show tsan_* or {read,write}_once_size. */
+ if (strnstr(buf, "tsan_", len) ||
+ strnstr(buf, "_once_size", len))
+ continue;
+
+ cur = strnstr(buf, "kcsan_", len);
+ if (cur) {
+ cur += strlen("kcsan_");
+ if (!str_has_prefix(cur, "test"))
+ continue; /* KCSAN runtime function. */
+ /* KCSAN related test. */
+ }
+
+ /*
+ * No match for runtime functions -- @skip entries to skip to
+ * get to first frame of interest.
+ */
+ break;
+ }
+
+ return skip;
+}
+
+/*
+ * Skips to the first entry that matches the function of @ip, and then replaces
+ * that entry with @ip, returning the entries to skip with @replaced containing
+ * the replaced entry.
+ */
+static int
+replace_stack_entry(unsigned long stack_entries[], int num_entries, unsigned long ip,
+ unsigned long *replaced)
+{
+ unsigned long symbolsize, offset;
+ unsigned long target_func;
+ int skip;
+
+ if (kallsyms_lookup_size_offset(ip, &symbolsize, &offset))
+ target_func = ip - offset;
+ else
+ goto fallback;
+
+ for (skip = 0; skip < num_entries; ++skip) {
+ unsigned long func = stack_entries[skip];
+
+ if (!kallsyms_lookup_size_offset(func, &symbolsize, &offset))
+ goto fallback;
+ func -= offset;
+
+ if (func == target_func) {
+ *replaced = stack_entries[skip];
+ stack_entries[skip] = ip;
+ return skip;
+ }
+ }
+
+fallback:
+ /* Should not happen; the resulting stack trace is likely misleading. */
+ WARN_ONCE(1, "Cannot find frame for %pS in stack trace", (void *)ip);
+ return get_stack_skipnr(stack_entries, num_entries);
+}
+
+static int
+sanitize_stack_entries(unsigned long stack_entries[], int num_entries, unsigned long ip,
+ unsigned long *replaced)
+{
+ return ip ? replace_stack_entry(stack_entries, num_entries, ip, replaced) :
+ get_stack_skipnr(stack_entries, num_entries);
+}
+
+/* Compares symbolized strings of addr1 and addr2. */
+static int sym_strcmp(void *addr1, void *addr2)
+{
+ char buf1[64];
+ char buf2[64];
+
+ snprintf(buf1, sizeof(buf1), "%pS", addr1);
+ snprintf(buf2, sizeof(buf2), "%pS", addr2);
+
+ return strncmp(buf1, buf2, sizeof(buf1));
+}
+
+static void
+print_stack_trace(unsigned long stack_entries[], int num_entries, unsigned long reordered_to)
+{
+ stack_trace_print(stack_entries, num_entries, 0);
+ if (reordered_to)
+ pr_err(" |\n +-> reordered to: %pS\n", (void *)reordered_to);
+}
+
+static void print_verbose_info(struct task_struct *task)
+{
+ if (!task)
+ return;
+
+ /* Restore IRQ state trace for printing. */
+ kcsan_restore_irqtrace(task);
+
+ pr_err("\n");
+ debug_show_held_locks(task);
+ print_irqtrace_events(task);
+}
+
+static void print_report(enum kcsan_value_change value_change,
+ const struct access_info *ai,
+ struct other_info *other_info,
+ u64 old, u64 new, u64 mask)
+{
+ unsigned long reordered_to = 0;
+ unsigned long stack_entries[NUM_STACK_ENTRIES] = { 0 };
+ int num_stack_entries = stack_trace_save(stack_entries, NUM_STACK_ENTRIES, 1);
+ int skipnr = sanitize_stack_entries(stack_entries, num_stack_entries, ai->ip, &reordered_to);
+ unsigned long this_frame = stack_entries[skipnr];
+ unsigned long other_reordered_to = 0;
+ unsigned long other_frame = 0;
+ int other_skipnr = 0; /* silence uninit warnings */
+
+ /*
+ * Must check report filter rules before starting to print.
+ */
+ if (skip_report(KCSAN_VALUE_CHANGE_TRUE, stack_entries[skipnr]))
+ return;
+
+ if (other_info) {
+ other_skipnr = sanitize_stack_entries(other_info->stack_entries,
+ other_info->num_stack_entries,
+ other_info->ai.ip, &other_reordered_to);
+ other_frame = other_info->stack_entries[other_skipnr];
+
+ /* @value_change is only known for the other thread */
+ if (skip_report(value_change, other_frame))
+ return;
+ }
+
+ if (rate_limit_report(this_frame, other_frame))
+ return;
+
+ /* Print report header. */
+ pr_err("==================================================================\n");
+ if (other_info) {
+ int cmp;
+
+ /*
+ * Order functions lexographically for consistent bug titles.
+ * Do not print offset of functions to keep title short.
+ */
+ cmp = sym_strcmp((void *)other_frame, (void *)this_frame);
+ pr_err("BUG: KCSAN: %s in %ps / %ps\n",
+ get_bug_type(ai->access_type | other_info->ai.access_type),
+ (void *)(cmp < 0 ? other_frame : this_frame),
+ (void *)(cmp < 0 ? this_frame : other_frame));
+ } else {
+ pr_err("BUG: KCSAN: %s in %pS\n", get_bug_type(ai->access_type),
+ (void *)this_frame);
+ }
+
+ pr_err("\n");
+
+ /* Print information about the racing accesses. */
+ if (other_info) {
+ pr_err("%s to 0x%px of %zu bytes by %s on cpu %i:\n",
+ get_access_type(other_info->ai.access_type), other_info->ai.ptr,
+ other_info->ai.size, get_thread_desc(other_info->ai.task_pid),
+ other_info->ai.cpu_id);
+
+ /* Print the other thread's stack trace. */
+ print_stack_trace(other_info->stack_entries + other_skipnr,
+ other_info->num_stack_entries - other_skipnr,
+ other_reordered_to);
+ if (IS_ENABLED(CONFIG_KCSAN_VERBOSE))
+ print_verbose_info(other_info->task);
+
+ pr_err("\n");
+ pr_err("%s to 0x%px of %zu bytes by %s on cpu %i:\n",
+ get_access_type(ai->access_type), ai->ptr, ai->size,
+ get_thread_desc(ai->task_pid), ai->cpu_id);
+ } else {
+ pr_err("race at unknown origin, with %s to 0x%px of %zu bytes by %s on cpu %i:\n",
+ get_access_type(ai->access_type), ai->ptr, ai->size,
+ get_thread_desc(ai->task_pid), ai->cpu_id);
+ }
+ /* Print stack trace of this thread. */
+ print_stack_trace(stack_entries + skipnr, num_stack_entries - skipnr, reordered_to);
+ if (IS_ENABLED(CONFIG_KCSAN_VERBOSE))
+ print_verbose_info(current);
+
+ /* Print observed value change. */
+ if (ai->size <= 8) {
+ int hex_len = ai->size * 2;
+ u64 diff = old ^ new;
+
+ if (mask)
+ diff &= mask;
+ if (diff) {
+ pr_err("\n");
+ pr_err("value changed: 0x%0*llx -> 0x%0*llx\n",
+ hex_len, old, hex_len, new);
+ if (mask) {
+ pr_err(" bits changed: 0x%0*llx with mask 0x%0*llx\n",
+ hex_len, diff, hex_len, mask);
+ }
+ }
+ }
+
+ /* Print report footer. */
+ pr_err("\n");
+ pr_err("Reported by Kernel Concurrency Sanitizer on:\n");
+ dump_stack_print_info(KERN_DEFAULT);
+ pr_err("==================================================================\n");
+
+ check_panic_on_warn("KCSAN");
+}
+
+static void release_report(unsigned long *flags, struct other_info *other_info)
+{
+ /*
+ * Use size to denote valid/invalid, since KCSAN entirely ignores
+ * 0-sized accesses.
+ */
+ other_info->ai.size = 0;
+ raw_spin_unlock_irqrestore(&report_lock, *flags);
+}
+
+/*
+ * Sets @other_info->task and awaits consumption of @other_info.
+ *
+ * Precondition: report_lock is held.
+ * Postcondition: report_lock is held.
+ */
+static void set_other_info_task_blocking(unsigned long *flags,
+ const struct access_info *ai,
+ struct other_info *other_info)
+{
+ /*
+ * We may be instrumenting a code-path where current->state is already
+ * something other than TASK_RUNNING.
+ */
+ const bool is_running = task_is_running(current);
+ /*
+ * To avoid deadlock in case we are in an interrupt here and this is a
+ * race with a task on the same CPU (KCSAN_INTERRUPT_WATCHER), provide a
+ * timeout to ensure this works in all contexts.
+ *
+ * Await approximately the worst case delay of the reporting thread (if
+ * we are not interrupted).
+ */
+ int timeout = max(kcsan_udelay_task, kcsan_udelay_interrupt);
+
+ other_info->task = current;
+ do {
+ if (is_running) {
+ /*
+ * Let lockdep know the real task is sleeping, to print
+ * the held locks (recall we turned lockdep off, so
+ * locking/unlocking @report_lock won't be recorded).
+ */
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ }
+ raw_spin_unlock_irqrestore(&report_lock, *flags);
+ /*
+ * We cannot call schedule() since we also cannot reliably
+ * determine if sleeping here is permitted -- see in_atomic().
+ */
+
+ udelay(1);
+ raw_spin_lock_irqsave(&report_lock, *flags);
+ if (timeout-- < 0) {
+ /*
+ * Abort. Reset @other_info->task to NULL, since it
+ * appears the other thread is still going to consume
+ * it. It will result in no verbose info printed for
+ * this task.
+ */
+ other_info->task = NULL;
+ break;
+ }
+ /*
+ * If invalid, or @ptr nor @current matches, then @other_info
+ * has been consumed and we may continue. If not, retry.
+ */
+ } while (other_info->ai.size && other_info->ai.ptr == ai->ptr &&
+ other_info->task == current);
+ if (is_running)
+ set_current_state(TASK_RUNNING);
+}
+
+/* Populate @other_info; requires that the provided @other_info not in use. */
+static void prepare_report_producer(unsigned long *flags,
+ const struct access_info *ai,
+ struct other_info *other_info)
+{
+ raw_spin_lock_irqsave(&report_lock, *flags);
+
+ /*
+ * The same @other_infos entry cannot be used concurrently, because
+ * there is a one-to-one mapping to watchpoint slots (@watchpoints in
+ * core.c), and a watchpoint is only released for reuse after reporting
+ * is done by the consumer of @other_info. Therefore, it is impossible
+ * for another concurrent prepare_report_producer() to set the same
+ * @other_info, and are guaranteed exclusivity for the @other_infos
+ * entry pointed to by @other_info.
+ *
+ * To check this property holds, size should never be non-zero here,
+ * because every consumer of struct other_info resets size to 0 in
+ * release_report().
+ */
+ WARN_ON(other_info->ai.size);
+
+ other_info->ai = *ai;
+ other_info->num_stack_entries = stack_trace_save(other_info->stack_entries, NUM_STACK_ENTRIES, 2);
+
+ if (IS_ENABLED(CONFIG_KCSAN_VERBOSE))
+ set_other_info_task_blocking(flags, ai, other_info);
+
+ raw_spin_unlock_irqrestore(&report_lock, *flags);
+}
+
+/* Awaits producer to fill @other_info and then returns. */
+static bool prepare_report_consumer(unsigned long *flags,
+ const struct access_info *ai,
+ struct other_info *other_info)
+{
+
+ raw_spin_lock_irqsave(&report_lock, *flags);
+ while (!other_info->ai.size) { /* Await valid @other_info. */
+ raw_spin_unlock_irqrestore(&report_lock, *flags);
+ cpu_relax();
+ raw_spin_lock_irqsave(&report_lock, *flags);
+ }
+
+ /* Should always have a matching access based on watchpoint encoding. */
+ if (WARN_ON(!matching_access((unsigned long)other_info->ai.ptr & WATCHPOINT_ADDR_MASK, other_info->ai.size,
+ (unsigned long)ai->ptr & WATCHPOINT_ADDR_MASK, ai->size)))
+ goto discard;
+
+ if (!matching_access((unsigned long)other_info->ai.ptr, other_info->ai.size,
+ (unsigned long)ai->ptr, ai->size)) {
+ /*
+ * If the actual accesses to not match, this was a false
+ * positive due to watchpoint encoding.
+ */
+ atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_ENCODING_FALSE_POSITIVES]);
+ goto discard;
+ }
+
+ return true;
+
+discard:
+ release_report(flags, other_info);
+ return false;
+}
+
+static struct access_info prepare_access_info(const volatile void *ptr, size_t size,
+ int access_type, unsigned long ip)
+{
+ return (struct access_info) {
+ .ptr = ptr,
+ .size = size,
+ .access_type = access_type,
+ .task_pid = in_task() ? task_pid_nr(current) : -1,
+ .cpu_id = raw_smp_processor_id(),
+ /* Only replace stack entry with @ip if scoped access. */
+ .ip = (access_type & KCSAN_ACCESS_SCOPED) ? ip : 0,
+ };
+}
+
+void kcsan_report_set_info(const volatile void *ptr, size_t size, int access_type,
+ unsigned long ip, int watchpoint_idx)
+{
+ const struct access_info ai = prepare_access_info(ptr, size, access_type, ip);
+ unsigned long flags;
+
+ kcsan_disable_current();
+ lockdep_off(); /* See kcsan_report_known_origin(). */
+
+ prepare_report_producer(&flags, &ai, &other_infos[watchpoint_idx]);
+
+ lockdep_on();
+ kcsan_enable_current();
+}
+
+void kcsan_report_known_origin(const volatile void *ptr, size_t size, int access_type,
+ unsigned long ip, enum kcsan_value_change value_change,
+ int watchpoint_idx, u64 old, u64 new, u64 mask)
+{
+ const struct access_info ai = prepare_access_info(ptr, size, access_type, ip);
+ struct other_info *other_info = &other_infos[watchpoint_idx];
+ unsigned long flags = 0;
+
+ kcsan_disable_current();
+ /*
+ * Because we may generate reports when we're in scheduler code, the use
+ * of printk() could deadlock. Until such time that all printing code
+ * called in print_report() is scheduler-safe, accept the risk, and just
+ * get our message out. As such, also disable lockdep to hide the
+ * warning, and avoid disabling lockdep for the rest of the kernel.
+ */
+ lockdep_off();
+
+ if (!prepare_report_consumer(&flags, &ai, other_info))
+ goto out;
+ /*
+ * Never report if value_change is FALSE, only when it is
+ * either TRUE or MAYBE. In case of MAYBE, further filtering may
+ * be done once we know the full stack trace in print_report().
+ */
+ if (value_change != KCSAN_VALUE_CHANGE_FALSE)
+ print_report(value_change, &ai, other_info, old, new, mask);
+
+ release_report(&flags, other_info);
+out:
+ lockdep_on();
+ kcsan_enable_current();
+}
+
+void kcsan_report_unknown_origin(const volatile void *ptr, size_t size, int access_type,
+ unsigned long ip, u64 old, u64 new, u64 mask)
+{
+ const struct access_info ai = prepare_access_info(ptr, size, access_type, ip);
+ unsigned long flags;
+
+ kcsan_disable_current();
+ lockdep_off(); /* See kcsan_report_known_origin(). */
+
+ raw_spin_lock_irqsave(&report_lock, flags);
+ print_report(KCSAN_VALUE_CHANGE_TRUE, &ai, NULL, old, new, mask);
+ raw_spin_unlock_irqrestore(&report_lock, flags);
+
+ lockdep_on();
+ kcsan_enable_current();
+}
diff --git a/kernel/kcsan/selftest.c b/kernel/kcsan/selftest.c
new file mode 100644
index 000000000000..84a1200271af
--- /dev/null
+++ b/kernel/kcsan/selftest.c
@@ -0,0 +1,261 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KCSAN short boot-time selftests.
+ *
+ * Copyright (C) 2019, Google LLC.
+ */
+
+#define pr_fmt(fmt) "kcsan: " fmt
+
+#include <linux/atomic.h>
+#include <linux/bitops.h>
+#include <linux/init.h>
+#include <linux/kcsan-checks.h>
+#include <linux/kernel.h>
+#include <linux/printk.h>
+#include <linux/random.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+
+#include "encoding.h"
+
+#define ITERS_PER_TEST 2000
+
+/*
+ * Test watchpoint encode and decode: check that encoding some access's info,
+ * and then subsequent decode preserves the access's info.
+ */
+static bool __init test_encode_decode(void)
+{
+ int i;
+
+ for (i = 0; i < ITERS_PER_TEST; ++i) {
+ size_t size = get_random_u32_inclusive(1, MAX_ENCODABLE_SIZE);
+ bool is_write = !!get_random_u32_below(2);
+ unsigned long verif_masked_addr;
+ long encoded_watchpoint;
+ bool verif_is_write;
+ unsigned long addr;
+ size_t verif_size;
+
+ get_random_bytes(&addr, sizeof(addr));
+ if (addr < PAGE_SIZE)
+ addr = PAGE_SIZE;
+
+ if (WARN_ON(!check_encodable(addr, size)))
+ return false;
+
+ encoded_watchpoint = encode_watchpoint(addr, size, is_write);
+
+ /* Check special watchpoints */
+ if (WARN_ON(decode_watchpoint(INVALID_WATCHPOINT, &verif_masked_addr, &verif_size, &verif_is_write)))
+ return false;
+ if (WARN_ON(decode_watchpoint(CONSUMED_WATCHPOINT, &verif_masked_addr, &verif_size, &verif_is_write)))
+ return false;
+
+ /* Check decoding watchpoint returns same data */
+ if (WARN_ON(!decode_watchpoint(encoded_watchpoint, &verif_masked_addr, &verif_size, &verif_is_write)))
+ return false;
+ if (WARN_ON(verif_masked_addr != (addr & WATCHPOINT_ADDR_MASK)))
+ goto fail;
+ if (WARN_ON(verif_size != size))
+ goto fail;
+ if (WARN_ON(is_write != verif_is_write))
+ goto fail;
+
+ continue;
+fail:
+ pr_err("%s fail: %s %zu bytes @ %lx -> encoded: %lx -> %s %zu bytes @ %lx\n",
+ __func__, is_write ? "write" : "read", size, addr, encoded_watchpoint,
+ verif_is_write ? "write" : "read", verif_size, verif_masked_addr);
+ return false;
+ }
+
+ return true;
+}
+
+/* Test access matching function. */
+static bool __init test_matching_access(void)
+{
+ if (WARN_ON(!matching_access(10, 1, 10, 1)))
+ return false;
+ if (WARN_ON(!matching_access(10, 2, 11, 1)))
+ return false;
+ if (WARN_ON(!matching_access(10, 1, 9, 2)))
+ return false;
+ if (WARN_ON(matching_access(10, 1, 11, 1)))
+ return false;
+ if (WARN_ON(matching_access(9, 1, 10, 1)))
+ return false;
+
+ /*
+ * An access of size 0 could match another access, as demonstrated here.
+ * Rather than add more comparisons to 'matching_access()', which would
+ * end up in the fast-path for *all* checks, check_access() simply
+ * returns for all accesses of size 0.
+ */
+ if (WARN_ON(!matching_access(8, 8, 12, 0)))
+ return false;
+
+ return true;
+}
+
+/*
+ * Correct memory barrier instrumentation is critical to avoiding false
+ * positives: simple test to check at boot certain barriers are always properly
+ * instrumented. See kcsan_test for a more complete test.
+ */
+static DEFINE_SPINLOCK(test_spinlock);
+static bool __init test_barrier(void)
+{
+#ifdef CONFIG_KCSAN_WEAK_MEMORY
+ struct kcsan_scoped_access *reorder_access = &current->kcsan_ctx.reorder_access;
+#else
+ struct kcsan_scoped_access *reorder_access = NULL;
+#endif
+ bool ret = true;
+ arch_spinlock_t arch_spinlock = __ARCH_SPIN_LOCK_UNLOCKED;
+ atomic_t dummy;
+ long test_var;
+
+ if (!reorder_access || !IS_ENABLED(CONFIG_SMP))
+ return true;
+
+#define __KCSAN_CHECK_BARRIER(access_type, barrier, name) \
+ do { \
+ reorder_access->type = (access_type) | KCSAN_ACCESS_SCOPED; \
+ reorder_access->size = 1; \
+ barrier; \
+ if (reorder_access->size != 0) { \
+ pr_err("improperly instrumented type=(" #access_type "): " name "\n"); \
+ ret = false; \
+ } \
+ } while (0)
+#define KCSAN_CHECK_READ_BARRIER(b) __KCSAN_CHECK_BARRIER(0, b, #b)
+#define KCSAN_CHECK_WRITE_BARRIER(b) __KCSAN_CHECK_BARRIER(KCSAN_ACCESS_WRITE, b, #b)
+#define KCSAN_CHECK_RW_BARRIER(b) __KCSAN_CHECK_BARRIER(KCSAN_ACCESS_WRITE | KCSAN_ACCESS_COMPOUND, b, #b)
+
+ kcsan_nestable_atomic_begin(); /* No watchpoints in called functions. */
+
+ KCSAN_CHECK_READ_BARRIER(mb());
+ KCSAN_CHECK_READ_BARRIER(rmb());
+ KCSAN_CHECK_READ_BARRIER(smp_mb());
+ KCSAN_CHECK_READ_BARRIER(smp_rmb());
+ KCSAN_CHECK_READ_BARRIER(dma_rmb());
+ KCSAN_CHECK_READ_BARRIER(smp_mb__before_atomic());
+ KCSAN_CHECK_READ_BARRIER(smp_mb__after_atomic());
+ KCSAN_CHECK_READ_BARRIER(smp_mb__after_spinlock());
+ KCSAN_CHECK_READ_BARRIER(smp_store_mb(test_var, 0));
+ KCSAN_CHECK_READ_BARRIER(smp_store_release(&test_var, 0));
+ KCSAN_CHECK_READ_BARRIER(xchg(&test_var, 0));
+ KCSAN_CHECK_READ_BARRIER(xchg_release(&test_var, 0));
+ KCSAN_CHECK_READ_BARRIER(cmpxchg(&test_var, 0, 0));
+ KCSAN_CHECK_READ_BARRIER(cmpxchg_release(&test_var, 0, 0));
+ KCSAN_CHECK_READ_BARRIER(atomic_set_release(&dummy, 0));
+ KCSAN_CHECK_READ_BARRIER(atomic_add_return(1, &dummy));
+ KCSAN_CHECK_READ_BARRIER(atomic_add_return_release(1, &dummy));
+ KCSAN_CHECK_READ_BARRIER(atomic_fetch_add(1, &dummy));
+ KCSAN_CHECK_READ_BARRIER(atomic_fetch_add_release(1, &dummy));
+ KCSAN_CHECK_READ_BARRIER(test_and_set_bit(0, &test_var));
+ KCSAN_CHECK_READ_BARRIER(test_and_clear_bit(0, &test_var));
+ KCSAN_CHECK_READ_BARRIER(test_and_change_bit(0, &test_var));
+ KCSAN_CHECK_READ_BARRIER(clear_bit_unlock(0, &test_var));
+ KCSAN_CHECK_READ_BARRIER(__clear_bit_unlock(0, &test_var));
+ arch_spin_lock(&arch_spinlock);
+ KCSAN_CHECK_READ_BARRIER(arch_spin_unlock(&arch_spinlock));
+ spin_lock(&test_spinlock);
+ KCSAN_CHECK_READ_BARRIER(spin_unlock(&test_spinlock));
+
+ KCSAN_CHECK_WRITE_BARRIER(mb());
+ KCSAN_CHECK_WRITE_BARRIER(wmb());
+ KCSAN_CHECK_WRITE_BARRIER(smp_mb());
+ KCSAN_CHECK_WRITE_BARRIER(smp_wmb());
+ KCSAN_CHECK_WRITE_BARRIER(dma_wmb());
+ KCSAN_CHECK_WRITE_BARRIER(smp_mb__before_atomic());
+ KCSAN_CHECK_WRITE_BARRIER(smp_mb__after_atomic());
+ KCSAN_CHECK_WRITE_BARRIER(smp_mb__after_spinlock());
+ KCSAN_CHECK_WRITE_BARRIER(smp_store_mb(test_var, 0));
+ KCSAN_CHECK_WRITE_BARRIER(smp_store_release(&test_var, 0));
+ KCSAN_CHECK_WRITE_BARRIER(xchg(&test_var, 0));
+ KCSAN_CHECK_WRITE_BARRIER(xchg_release(&test_var, 0));
+ KCSAN_CHECK_WRITE_BARRIER(cmpxchg(&test_var, 0, 0));
+ KCSAN_CHECK_WRITE_BARRIER(cmpxchg_release(&test_var, 0, 0));
+ KCSAN_CHECK_WRITE_BARRIER(atomic_set_release(&dummy, 0));
+ KCSAN_CHECK_WRITE_BARRIER(atomic_add_return(1, &dummy));
+ KCSAN_CHECK_WRITE_BARRIER(atomic_add_return_release(1, &dummy));
+ KCSAN_CHECK_WRITE_BARRIER(atomic_fetch_add(1, &dummy));
+ KCSAN_CHECK_WRITE_BARRIER(atomic_fetch_add_release(1, &dummy));
+ KCSAN_CHECK_WRITE_BARRIER(test_and_set_bit(0, &test_var));
+ KCSAN_CHECK_WRITE_BARRIER(test_and_clear_bit(0, &test_var));
+ KCSAN_CHECK_WRITE_BARRIER(test_and_change_bit(0, &test_var));
+ KCSAN_CHECK_WRITE_BARRIER(clear_bit_unlock(0, &test_var));
+ KCSAN_CHECK_WRITE_BARRIER(__clear_bit_unlock(0, &test_var));
+ arch_spin_lock(&arch_spinlock);
+ KCSAN_CHECK_WRITE_BARRIER(arch_spin_unlock(&arch_spinlock));
+ spin_lock(&test_spinlock);
+ KCSAN_CHECK_WRITE_BARRIER(spin_unlock(&test_spinlock));
+
+ KCSAN_CHECK_RW_BARRIER(mb());
+ KCSAN_CHECK_RW_BARRIER(wmb());
+ KCSAN_CHECK_RW_BARRIER(rmb());
+ KCSAN_CHECK_RW_BARRIER(smp_mb());
+ KCSAN_CHECK_RW_BARRIER(smp_wmb());
+ KCSAN_CHECK_RW_BARRIER(smp_rmb());
+ KCSAN_CHECK_RW_BARRIER(dma_wmb());
+ KCSAN_CHECK_RW_BARRIER(dma_rmb());
+ KCSAN_CHECK_RW_BARRIER(smp_mb__before_atomic());
+ KCSAN_CHECK_RW_BARRIER(smp_mb__after_atomic());
+ KCSAN_CHECK_RW_BARRIER(smp_mb__after_spinlock());
+ KCSAN_CHECK_RW_BARRIER(smp_store_mb(test_var, 0));
+ KCSAN_CHECK_RW_BARRIER(smp_store_release(&test_var, 0));
+ KCSAN_CHECK_RW_BARRIER(xchg(&test_var, 0));
+ KCSAN_CHECK_RW_BARRIER(xchg_release(&test_var, 0));
+ KCSAN_CHECK_RW_BARRIER(cmpxchg(&test_var, 0, 0));
+ KCSAN_CHECK_RW_BARRIER(cmpxchg_release(&test_var, 0, 0));
+ KCSAN_CHECK_RW_BARRIER(atomic_set_release(&dummy, 0));
+ KCSAN_CHECK_RW_BARRIER(atomic_add_return(1, &dummy));
+ KCSAN_CHECK_RW_BARRIER(atomic_add_return_release(1, &dummy));
+ KCSAN_CHECK_RW_BARRIER(atomic_fetch_add(1, &dummy));
+ KCSAN_CHECK_RW_BARRIER(atomic_fetch_add_release(1, &dummy));
+ KCSAN_CHECK_RW_BARRIER(test_and_set_bit(0, &test_var));
+ KCSAN_CHECK_RW_BARRIER(test_and_clear_bit(0, &test_var));
+ KCSAN_CHECK_RW_BARRIER(test_and_change_bit(0, &test_var));
+ KCSAN_CHECK_RW_BARRIER(clear_bit_unlock(0, &test_var));
+ KCSAN_CHECK_RW_BARRIER(__clear_bit_unlock(0, &test_var));
+ arch_spin_lock(&arch_spinlock);
+ KCSAN_CHECK_RW_BARRIER(arch_spin_unlock(&arch_spinlock));
+ spin_lock(&test_spinlock);
+ KCSAN_CHECK_RW_BARRIER(spin_unlock(&test_spinlock));
+ KCSAN_CHECK_RW_BARRIER(xor_unlock_is_negative_byte(1, &test_var));
+ KCSAN_CHECK_READ_BARRIER(xor_unlock_is_negative_byte(1, &test_var));
+ KCSAN_CHECK_WRITE_BARRIER(xor_unlock_is_negative_byte(1, &test_var));
+ kcsan_nestable_atomic_end();
+
+ return ret;
+}
+
+static int __init kcsan_selftest(void)
+{
+ int passed = 0;
+ int total = 0;
+
+#define RUN_TEST(do_test) \
+ do { \
+ ++total; \
+ if (do_test()) \
+ ++passed; \
+ else \
+ pr_err("selftest: " #do_test " failed"); \
+ } while (0)
+
+ RUN_TEST(test_encode_decode);
+ RUN_TEST(test_matching_access);
+ RUN_TEST(test_barrier);
+
+ pr_info("selftest: %d/%d tests passed\n", passed, total);
+ if (passed != total)
+ panic("selftests failed");
+ return 0;
+}
+postcore_initcall(kcsan_selftest);
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 7a36fdcca5bf..28008e3d462e 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1,296 +1,41 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
- * kexec.c - kexec system call
+ * kexec.c - kexec_load system call
* Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
- *
- * This source code is licensed under the GNU General Public License,
- * Version 2. See the file COPYING for more details.
*/
-#define pr_fmt(fmt) "kexec: " fmt
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/capability.h>
#include <linux/mm.h>
#include <linux/file.h>
-#include <linux/slab.h>
-#include <linux/fs.h>
+#include <linux/security.h>
#include <linux/kexec.h>
#include <linux/mutex.h>
#include <linux/list.h>
-#include <linux/highmem.h>
#include <linux/syscalls.h>
-#include <linux/reboot.h>
-#include <linux/ioport.h>
-#include <linux/hardirq.h>
-#include <linux/elf.h>
-#include <linux/elfcore.h>
-#include <linux/utsname.h>
-#include <linux/numa.h>
-#include <linux/suspend.h>
-#include <linux/device.h>
-#include <linux/freezer.h>
-#include <linux/pm.h>
-#include <linux/cpu.h>
-#include <linux/console.h>
#include <linux/vmalloc.h>
-#include <linux/swap.h>
-#include <linux/syscore_ops.h>
-#include <linux/compiler.h>
-#include <linux/hugetlb.h>
-
-#include <asm/page.h>
-#include <asm/uaccess.h>
-#include <asm/io.h>
-#include <asm/sections.h>
-
-#include <crypto/hash.h>
-#include <crypto/sha.h>
-
-/* Per cpu memory for storing cpu states in case of system crash. */
-note_buf_t __percpu *crash_notes;
-
-/* vmcoreinfo stuff */
-static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
-u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
-size_t vmcoreinfo_size;
-size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
-
-/* Flag to indicate we are going to kexec a new kernel */
-bool kexec_in_progress = false;
-
-/*
- * Declare these symbols weak so that if architecture provides a purgatory,
- * these will be overridden.
- */
-char __weak kexec_purgatory[0];
-size_t __weak kexec_purgatory_size = 0;
-
-#ifdef CONFIG_KEXEC_FILE
-static int kexec_calculate_store_digests(struct kimage *image);
-#endif
-
-/* Location of the reserved area for the crash kernel */
-struct resource crashk_res = {
- .name = "Crash kernel",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
-};
-struct resource crashk_low_res = {
- .name = "Crash kernel",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
-};
-
-int kexec_should_crash(struct task_struct *p)
-{
- if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
- return 1;
- return 0;
-}
-
-/*
- * When kexec transitions to the new kernel there is a one-to-one
- * mapping between physical and virtual addresses. On processors
- * where you can disable the MMU this is trivial, and easy. For
- * others it is still a simple predictable page table to setup.
- *
- * In that environment kexec copies the new kernel to its final
- * resting place. This means I can only support memory whose
- * physical address can fit in an unsigned long. In particular
- * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
- * If the assembly stub has more restrictive requirements
- * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
- * defined more restrictively in <asm/kexec.h>.
- *
- * The code for the transition from the current kernel to the
- * the new kernel is placed in the control_code_buffer, whose size
- * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single
- * page of memory is necessary, but some architectures require more.
- * Because this memory must be identity mapped in the transition from
- * virtual to physical addresses it must live in the range
- * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
- * modifiable.
- *
- * The assembly stub in the control code buffer is passed a linked list
- * of descriptor pages detailing the source pages of the new kernel,
- * and the destination addresses of those source pages. As this data
- * structure is not used in the context of the current OS, it must
- * be self-contained.
- *
- * The code has been made to work with highmem pages and will use a
- * destination page in its final resting place (if it happens
- * to allocate it). The end product of this is that most of the
- * physical address space, and most of RAM can be used.
- *
- * Future directions include:
- * - allocating a page table with the control code buffer identity
- * mapped, to simplify machine_kexec and make kexec_on_panic more
- * reliable.
- */
-
-/*
- * KIMAGE_NO_DEST is an impossible destination address..., for
- * allocating pages whose destination address we do not care about.
- */
-#define KIMAGE_NO_DEST (-1UL)
-
-static int kimage_is_destination_range(struct kimage *image,
- unsigned long start, unsigned long end);
-static struct page *kimage_alloc_page(struct kimage *image,
- gfp_t gfp_mask,
- unsigned long dest);
-
-static int copy_user_segment_list(struct kimage *image,
- unsigned long nr_segments,
- struct kexec_segment __user *segments)
-{
- int ret;
- size_t segment_bytes;
-
- /* Read in the segments */
- image->nr_segments = nr_segments;
- segment_bytes = nr_segments * sizeof(*segments);
- ret = copy_from_user(image->segment, segments, segment_bytes);
- if (ret)
- ret = -EFAULT;
-
- return ret;
-}
-
-static int sanity_check_segment_list(struct kimage *image)
-{
- int result, i;
- unsigned long nr_segments = image->nr_segments;
-
- /*
- * Verify we have good destination addresses. The caller is
- * responsible for making certain we don't attempt to load
- * the new image into invalid or reserved areas of RAM. This
- * just verifies it is an address we can use.
- *
- * Since the kernel does everything in page size chunks ensure
- * the destination addresses are page aligned. Too many
- * special cases crop of when we don't do this. The most
- * insidious is getting overlapping destination addresses
- * simply because addresses are changed to page size
- * granularity.
- */
- result = -EADDRNOTAVAIL;
- for (i = 0; i < nr_segments; i++) {
- unsigned long mstart, mend;
-
- mstart = image->segment[i].mem;
- mend = mstart + image->segment[i].memsz;
- if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
- return result;
- if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
- return result;
- }
-
- /* Verify our destination addresses do not overlap.
- * If we alloed overlapping destination addresses
- * through very weird things can happen with no
- * easy explanation as one segment stops on another.
- */
- result = -EINVAL;
- for (i = 0; i < nr_segments; i++) {
- unsigned long mstart, mend;
- unsigned long j;
-
- mstart = image->segment[i].mem;
- mend = mstart + image->segment[i].memsz;
- for (j = 0; j < i; j++) {
- unsigned long pstart, pend;
- pstart = image->segment[j].mem;
- pend = pstart + image->segment[j].memsz;
- /* Do the segments overlap ? */
- if ((mend > pstart) && (mstart < pend))
- return result;
- }
- }
-
- /* Ensure our buffer sizes are strictly less than
- * our memory sizes. This should always be the case,
- * and it is easier to check up front than to be surprised
- * later on.
- */
- result = -EINVAL;
- for (i = 0; i < nr_segments; i++) {
- if (image->segment[i].bufsz > image->segment[i].memsz)
- return result;
- }
-
- /*
- * Verify we have good destination addresses. Normally
- * the caller is responsible for making certain we don't
- * attempt to load the new image into invalid or reserved
- * areas of RAM. But crash kernels are preloaded into a
- * reserved area of ram. We must ensure the addresses
- * are in the reserved area otherwise preloading the
- * kernel could corrupt things.
- */
-
- if (image->type == KEXEC_TYPE_CRASH) {
- result = -EADDRNOTAVAIL;
- for (i = 0; i < nr_segments; i++) {
- unsigned long mstart, mend;
-
- mstart = image->segment[i].mem;
- mend = mstart + image->segment[i].memsz - 1;
- /* Ensure we are within the crash kernel limits */
- if ((mstart < crashk_res.start) ||
- (mend > crashk_res.end))
- return result;
- }
- }
-
- return 0;
-}
-
-static struct kimage *do_kimage_alloc_init(void)
-{
- struct kimage *image;
-
- /* Allocate a controlling structure */
- image = kzalloc(sizeof(*image), GFP_KERNEL);
- if (!image)
- return NULL;
-
- image->head = 0;
- image->entry = &image->head;
- image->last_entry = &image->head;
- image->control_page = ~0; /* By default this does not apply */
- image->type = KEXEC_TYPE_DEFAULT;
-
- /* Initialize the list of control pages */
- INIT_LIST_HEAD(&image->control_pages);
-
- /* Initialize the list of destination pages */
- INIT_LIST_HEAD(&image->dest_pages);
-
- /* Initialize the list of unusable pages */
- INIT_LIST_HEAD(&image->unusable_pages);
-
- return image;
-}
+#include <linux/slab.h>
-static void kimage_free_page_list(struct list_head *list);
+#include "kexec_internal.h"
static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
unsigned long nr_segments,
- struct kexec_segment __user *segments,
+ struct kexec_segment *segments,
unsigned long flags)
{
int ret;
struct kimage *image;
bool kexec_on_panic = flags & KEXEC_ON_CRASH;
+#ifdef CONFIG_CRASH_DUMP
if (kexec_on_panic) {
/* Verify we have a valid entry point */
- if ((entry < crashk_res.start) || (entry > crashk_res.end))
+ if ((entry < phys_to_boot_phys(crashk_res.start)) ||
+ (entry > phys_to_boot_phys(crashk_res.end)))
return -EADDRNOTAVAIL;
}
+#endif
/* Allocate and initialize a controlling structure */
image = do_kimage_alloc_init();
@@ -298,20 +43,20 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
return -ENOMEM;
image->start = entry;
+ image->nr_segments = nr_segments;
+ memcpy(image->segment, segments, nr_segments * sizeof(*segments));
- ret = copy_user_segment_list(image, nr_segments, segments);
- if (ret)
- goto out_free_image;
-
- ret = sanity_check_segment_list(image);
- if (ret)
- goto out_free_image;
-
- /* Enable the special crash kernel control page allocation policy. */
+#ifdef CONFIG_CRASH_DUMP
if (kexec_on_panic) {
+ /* Enable special crash kernel control page alloc policy. */
image->control_page = crashk_res.start;
image->type = KEXEC_TYPE_CRASH;
}
+#endif
+
+ ret = sanity_check_segment_list(image);
+ if (ret)
+ goto out_free_image;
/*
* Find a location for the control code buffer, and add it
@@ -343,871 +88,94 @@ out_free_image:
return ret;
}
-#ifdef CONFIG_KEXEC_FILE
-static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len)
+static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
+ struct kexec_segment *segments, unsigned long flags)
{
- struct fd f = fdget(fd);
+ struct kimage **dest_image, *image;
+ unsigned long i;
int ret;
- struct kstat stat;
- loff_t pos;
- ssize_t bytes = 0;
-
- if (!f.file)
- return -EBADF;
-
- ret = vfs_getattr(&f.file->f_path, &stat);
- if (ret)
- goto out;
-
- if (stat.size > INT_MAX) {
- ret = -EFBIG;
- goto out;
- }
-
- /* Don't hand 0 to vmalloc, it whines. */
- if (stat.size == 0) {
- ret = -EINVAL;
- goto out;
- }
-
- *buf = vmalloc(stat.size);
- if (!*buf) {
- ret = -ENOMEM;
- goto out;
- }
-
- pos = 0;
- while (pos < stat.size) {
- bytes = kernel_read(f.file, pos, (char *)(*buf) + pos,
- stat.size - pos);
- if (bytes < 0) {
- vfree(*buf);
- ret = bytes;
- goto out;
- }
-
- if (bytes == 0)
- break;
- pos += bytes;
- }
-
- if (pos != stat.size) {
- ret = -EBADF;
- vfree(*buf);
- goto out;
- }
-
- *buf_len = pos;
-out:
- fdput(f);
- return ret;
-}
-
-/* Architectures can provide this probe function */
-int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
- unsigned long buf_len)
-{
- return -ENOEXEC;
-}
-
-void * __weak arch_kexec_kernel_image_load(struct kimage *image)
-{
- return ERR_PTR(-ENOEXEC);
-}
-
-void __weak arch_kimage_file_post_load_cleanup(struct kimage *image)
-{
-}
-
-int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
- unsigned long buf_len)
-{
- return -EKEYREJECTED;
-}
-
-/* Apply relocations of type RELA */
-int __weak
-arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
- unsigned int relsec)
-{
- pr_err("RELA relocation unsupported.\n");
- return -ENOEXEC;
-}
-
-/* Apply relocations of type REL */
-int __weak
-arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
- unsigned int relsec)
-{
- pr_err("REL relocation unsupported.\n");
- return -ENOEXEC;
-}
-
-/*
- * Free up memory used by kernel, initrd, and command line. This is temporary
- * memory allocation which is not needed any more after these buffers have
- * been loaded into separate segments and have been copied elsewhere.
- */
-static void kimage_file_post_load_cleanup(struct kimage *image)
-{
- struct purgatory_info *pi = &image->purgatory_info;
-
- vfree(image->kernel_buf);
- image->kernel_buf = NULL;
-
- vfree(image->initrd_buf);
- image->initrd_buf = NULL;
-
- kfree(image->cmdline_buf);
- image->cmdline_buf = NULL;
-
- vfree(pi->purgatory_buf);
- pi->purgatory_buf = NULL;
-
- vfree(pi->sechdrs);
- pi->sechdrs = NULL;
-
- /* See if architecture has anything to cleanup post load */
- arch_kimage_file_post_load_cleanup(image);
/*
- * Above call should have called into bootloader to free up
- * any data stored in kimage->image_loader_data. It should
- * be ok now to free it up.
+ * Because we write directly to the reserved memory region when loading
+ * crash kernels we need a serialization here to prevent multiple crash
+ * kernels from attempting to load simultaneously.
*/
- kfree(image->image_loader_data);
- image->image_loader_data = NULL;
-}
-
-/*
- * In file mode list of segments is prepared by kernel. Copy relevant
- * data from user space, do error checking, prepare segment list
- */
-static int
-kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
- const char __user *cmdline_ptr,
- unsigned long cmdline_len, unsigned flags)
-{
- int ret = 0;
- void *ldata;
-
- ret = copy_file_from_fd(kernel_fd, &image->kernel_buf,
- &image->kernel_buf_len);
- if (ret)
- return ret;
-
- /* Call arch image probe handlers */
- ret = arch_kexec_kernel_image_probe(image, image->kernel_buf,
- image->kernel_buf_len);
-
- if (ret)
- goto out;
+ if (!kexec_trylock())
+ return -EBUSY;
-#ifdef CONFIG_KEXEC_VERIFY_SIG
- ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf,
- image->kernel_buf_len);
- if (ret) {
- pr_debug("kernel signature verification failed.\n");
- goto out;
- }
- pr_debug("kernel signature verification successful.\n");
+#ifdef CONFIG_CRASH_DUMP
+ if (flags & KEXEC_ON_CRASH) {
+ dest_image = &kexec_crash_image;
+ if (kexec_crash_image)
+ arch_kexec_unprotect_crashkres();
+ } else
#endif
- /* It is possible that there no initramfs is being loaded */
- if (!(flags & KEXEC_FILE_NO_INITRAMFS)) {
- ret = copy_file_from_fd(initrd_fd, &image->initrd_buf,
- &image->initrd_buf_len);
- if (ret)
- goto out;
- }
-
- if (cmdline_len) {
- image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL);
- if (!image->cmdline_buf) {
- ret = -ENOMEM;
- goto out;
- }
+ dest_image = &kexec_image;
- ret = copy_from_user(image->cmdline_buf, cmdline_ptr,
- cmdline_len);
- if (ret) {
- ret = -EFAULT;
- goto out;
- }
-
- image->cmdline_buf_len = cmdline_len;
-
- /* command line should be a string with last byte null */
- if (image->cmdline_buf[cmdline_len - 1] != '\0') {
- ret = -EINVAL;
- goto out;
- }
+ if (nr_segments == 0) {
+ /* Uninstall image */
+ kimage_free(xchg(dest_image, NULL));
+ ret = 0;
+ goto out_unlock;
}
-
- /* Call arch image load handlers */
- ldata = arch_kexec_kernel_image_load(image);
-
- if (IS_ERR(ldata)) {
- ret = PTR_ERR(ldata);
- goto out;
+ if (flags & KEXEC_ON_CRASH) {
+ /*
+ * Loading another kernel to switch to if this one
+ * crashes. Free any current crash dump kernel before
+ * we corrupt it.
+ */
+ kimage_free(xchg(&kexec_crash_image, NULL));
}
- image->image_loader_data = ldata;
-out:
- /* In case of error, free up all allocated memory in this function */
+ ret = kimage_alloc_init(&image, entry, nr_segments, segments, flags);
if (ret)
- kimage_file_post_load_cleanup(image);
- return ret;
-}
+ goto out_unlock;
-static int
-kimage_file_alloc_init(struct kimage **rimage, int kernel_fd,
- int initrd_fd, const char __user *cmdline_ptr,
- unsigned long cmdline_len, unsigned long flags)
-{
- int ret;
- struct kimage *image;
- bool kexec_on_panic = flags & KEXEC_FILE_ON_CRASH;
-
- image = do_kimage_alloc_init();
- if (!image)
- return -ENOMEM;
+ if (flags & KEXEC_PRESERVE_CONTEXT)
+ image->preserve_context = 1;
- image->file_mode = 1;
-
- if (kexec_on_panic) {
- /* Enable special crash kernel control page alloc policy. */
- image->control_page = crashk_res.start;
- image->type = KEXEC_TYPE_CRASH;
- }
-
- ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd,
- cmdline_ptr, cmdline_len, flags);
- if (ret)
- goto out_free_image;
+#ifdef CONFIG_CRASH_HOTPLUG
+ if ((flags & KEXEC_ON_CRASH) && arch_crash_hotplug_support(image, flags))
+ image->hotplug_support = 1;
+#endif
- ret = sanity_check_segment_list(image);
+ ret = machine_kexec_prepare(image);
if (ret)
- goto out_free_post_load_bufs;
-
- ret = -ENOMEM;
- image->control_code_page = kimage_alloc_control_pages(image,
- get_order(KEXEC_CONTROL_PAGE_SIZE));
- if (!image->control_code_page) {
- pr_err("Could not allocate control_code_buffer\n");
- goto out_free_post_load_bufs;
- }
-
- if (!kexec_on_panic) {
- image->swap_page = kimage_alloc_control_pages(image, 0);
- if (!image->swap_page) {
- pr_err("Could not allocate swap buffer\n");
- goto out_free_control_pages;
- }
- }
-
- *rimage = image;
- return 0;
-out_free_control_pages:
- kimage_free_page_list(&image->control_pages);
-out_free_post_load_bufs:
- kimage_file_post_load_cleanup(image);
-out_free_image:
- kfree(image);
- return ret;
-}
-#else /* CONFIG_KEXEC_FILE */
-static inline void kimage_file_post_load_cleanup(struct kimage *image) { }
-#endif /* CONFIG_KEXEC_FILE */
-
-static int kimage_is_destination_range(struct kimage *image,
- unsigned long start,
- unsigned long end)
-{
- unsigned long i;
-
- for (i = 0; i < image->nr_segments; i++) {
- unsigned long mstart, mend;
-
- mstart = image->segment[i].mem;
- mend = mstart + image->segment[i].memsz;
- if ((end > mstart) && (start < mend))
- return 1;
- }
-
- return 0;
-}
-
-static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
-{
- struct page *pages;
-
- pages = alloc_pages(gfp_mask, order);
- if (pages) {
- unsigned int count, i;
- pages->mapping = NULL;
- set_page_private(pages, order);
- count = 1 << order;
- for (i = 0; i < count; i++)
- SetPageReserved(pages + i);
- }
-
- return pages;
-}
-
-static void kimage_free_pages(struct page *page)
-{
- unsigned int order, count, i;
-
- order = page_private(page);
- count = 1 << order;
- for (i = 0; i < count; i++)
- ClearPageReserved(page + i);
- __free_pages(page, order);
-}
-
-static void kimage_free_page_list(struct list_head *list)
-{
- struct list_head *pos, *next;
-
- list_for_each_safe(pos, next, list) {
- struct page *page;
-
- page = list_entry(pos, struct page, lru);
- list_del(&page->lru);
- kimage_free_pages(page);
- }
-}
-
-static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
- unsigned int order)
-{
- /* Control pages are special, they are the intermediaries
- * that are needed while we copy the rest of the pages
- * to their final resting place. As such they must
- * not conflict with either the destination addresses
- * or memory the kernel is already using.
- *
- * The only case where we really need more than one of
- * these are for architectures where we cannot disable
- * the MMU and must instead generate an identity mapped
- * page table for all of the memory.
- *
- * At worst this runs in O(N) of the image size.
- */
- struct list_head extra_pages;
- struct page *pages;
- unsigned int count;
-
- count = 1 << order;
- INIT_LIST_HEAD(&extra_pages);
-
- /* Loop while I can allocate a page and the page allocated
- * is a destination page.
- */
- do {
- unsigned long pfn, epfn, addr, eaddr;
-
- pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order);
- if (!pages)
- break;
- pfn = page_to_pfn(pages);
- epfn = pfn + count;
- addr = pfn << PAGE_SHIFT;
- eaddr = epfn << PAGE_SHIFT;
- if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
- kimage_is_destination_range(image, addr, eaddr)) {
- list_add(&pages->lru, &extra_pages);
- pages = NULL;
- }
- } while (!pages);
-
- if (pages) {
- /* Remember the allocated page... */
- list_add(&pages->lru, &image->control_pages);
-
- /* Because the page is already in it's destination
- * location we will never allocate another page at
- * that address. Therefore kimage_alloc_pages
- * will not return it (again) and we don't need
- * to give it an entry in image->segment[].
- */
- }
- /* Deal with the destination pages I have inadvertently allocated.
- *
- * Ideally I would convert multi-page allocations into single
- * page allocations, and add everything to image->dest_pages.
- *
- * For now it is simpler to just free the pages.
- */
- kimage_free_page_list(&extra_pages);
-
- return pages;
-}
-
-static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
- unsigned int order)
-{
- /* Control pages are special, they are the intermediaries
- * that are needed while we copy the rest of the pages
- * to their final resting place. As such they must
- * not conflict with either the destination addresses
- * or memory the kernel is already using.
- *
- * Control pages are also the only pags we must allocate
- * when loading a crash kernel. All of the other pages
- * are specified by the segments and we just memcpy
- * into them directly.
- *
- * The only case where we really need more than one of
- * these are for architectures where we cannot disable
- * the MMU and must instead generate an identity mapped
- * page table for all of the memory.
- *
- * Given the low demand this implements a very simple
- * allocator that finds the first hole of the appropriate
- * size in the reserved memory region, and allocates all
- * of the memory up to and including the hole.
- */
- unsigned long hole_start, hole_end, size;
- struct page *pages;
-
- pages = NULL;
- size = (1 << order) << PAGE_SHIFT;
- hole_start = (image->control_page + (size - 1)) & ~(size - 1);
- hole_end = hole_start + size - 1;
- while (hole_end <= crashk_res.end) {
- unsigned long i;
-
- if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
- break;
- /* See if I overlap any of the segments */
- for (i = 0; i < image->nr_segments; i++) {
- unsigned long mstart, mend;
-
- mstart = image->segment[i].mem;
- mend = mstart + image->segment[i].memsz - 1;
- if ((hole_end >= mstart) && (hole_start <= mend)) {
- /* Advance the hole to the end of the segment */
- hole_start = (mend + (size - 1)) & ~(size - 1);
- hole_end = hole_start + size - 1;
- break;
- }
- }
- /* If I don't overlap any segments I have found my hole! */
- if (i == image->nr_segments) {
- pages = pfn_to_page(hole_start >> PAGE_SHIFT);
- break;
- }
- }
- if (pages)
- image->control_page = hole_end;
-
- return pages;
-}
-
-
-struct page *kimage_alloc_control_pages(struct kimage *image,
- unsigned int order)
-{
- struct page *pages = NULL;
-
- switch (image->type) {
- case KEXEC_TYPE_DEFAULT:
- pages = kimage_alloc_normal_control_pages(image, order);
- break;
- case KEXEC_TYPE_CRASH:
- pages = kimage_alloc_crash_control_pages(image, order);
- break;
- }
-
- return pages;
-}
-
-static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
-{
- if (*image->entry != 0)
- image->entry++;
-
- if (image->entry == image->last_entry) {
- kimage_entry_t *ind_page;
- struct page *page;
-
- page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
- if (!page)
- return -ENOMEM;
-
- ind_page = page_address(page);
- *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
- image->entry = ind_page;
- image->last_entry = ind_page +
- ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
- }
- *image->entry = entry;
- image->entry++;
- *image->entry = 0;
-
- return 0;
-}
-
-static int kimage_set_destination(struct kimage *image,
- unsigned long destination)
-{
- int result;
-
- destination &= PAGE_MASK;
- result = kimage_add_entry(image, destination | IND_DESTINATION);
-
- return result;
-}
-
-
-static int kimage_add_page(struct kimage *image, unsigned long page)
-{
- int result;
-
- page &= PAGE_MASK;
- result = kimage_add_entry(image, page | IND_SOURCE);
-
- return result;
-}
-
-
-static void kimage_free_extra_pages(struct kimage *image)
-{
- /* Walk through and free any extra destination pages I may have */
- kimage_free_page_list(&image->dest_pages);
-
- /* Walk through and free any unusable pages I have cached */
- kimage_free_page_list(&image->unusable_pages);
-
-}
-static void kimage_terminate(struct kimage *image)
-{
- if (*image->entry != 0)
- image->entry++;
-
- *image->entry = IND_DONE;
-}
-
-#define for_each_kimage_entry(image, ptr, entry) \
- for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
- ptr = (entry & IND_INDIRECTION) ? \
- phys_to_virt((entry & PAGE_MASK)) : ptr + 1)
-
-static void kimage_free_entry(kimage_entry_t entry)
-{
- struct page *page;
-
- page = pfn_to_page(entry >> PAGE_SHIFT);
- kimage_free_pages(page);
-}
-
-static void kimage_free(struct kimage *image)
-{
- kimage_entry_t *ptr, entry;
- kimage_entry_t ind = 0;
-
- if (!image)
- return;
-
- kimage_free_extra_pages(image);
- for_each_kimage_entry(image, ptr, entry) {
- if (entry & IND_INDIRECTION) {
- /* Free the previous indirection page */
- if (ind & IND_INDIRECTION)
- kimage_free_entry(ind);
- /* Save this indirection page until we are
- * done with it.
- */
- ind = entry;
- } else if (entry & IND_SOURCE)
- kimage_free_entry(entry);
- }
- /* Free the final indirection page */
- if (ind & IND_INDIRECTION)
- kimage_free_entry(ind);
-
- /* Handle any machine specific cleanup */
- machine_kexec_cleanup(image);
-
- /* Free the kexec control pages... */
- kimage_free_page_list(&image->control_pages);
-
- /*
- * Free up any temporary buffers allocated. This might hit if
- * error occurred much later after buffer allocation.
- */
- if (image->file_mode)
- kimage_file_post_load_cleanup(image);
-
- kfree(image);
-}
-
-static kimage_entry_t *kimage_dst_used(struct kimage *image,
- unsigned long page)
-{
- kimage_entry_t *ptr, entry;
- unsigned long destination = 0;
-
- for_each_kimage_entry(image, ptr, entry) {
- if (entry & IND_DESTINATION)
- destination = entry & PAGE_MASK;
- else if (entry & IND_SOURCE) {
- if (page == destination)
- return ptr;
- destination += PAGE_SIZE;
- }
- }
-
- return NULL;
-}
-
-static struct page *kimage_alloc_page(struct kimage *image,
- gfp_t gfp_mask,
- unsigned long destination)
-{
- /*
- * Here we implement safeguards to ensure that a source page
- * is not copied to its destination page before the data on
- * the destination page is no longer useful.
- *
- * To do this we maintain the invariant that a source page is
- * either its own destination page, or it is not a
- * destination page at all.
- *
- * That is slightly stronger than required, but the proof
- * that no problems will not occur is trivial, and the
- * implementation is simply to verify.
- *
- * When allocating all pages normally this algorithm will run
- * in O(N) time, but in the worst case it will run in O(N^2)
- * time. If the runtime is a problem the data structures can
- * be fixed.
- */
- struct page *page;
- unsigned long addr;
+ goto out;
/*
- * Walk through the list of destination pages, and see if I
- * have a match.
+ * Some architecture(like S390) may touch the crash memory before
+ * machine_kexec_prepare(), we must copy vmcoreinfo data after it.
*/
- list_for_each_entry(page, &image->dest_pages, lru) {
- addr = page_to_pfn(page) << PAGE_SHIFT;
- if (addr == destination) {
- list_del(&page->lru);
- return page;
- }
- }
- page = NULL;
- while (1) {
- kimage_entry_t *old;
-
- /* Allocate a page, if we run out of memory give up */
- page = kimage_alloc_pages(gfp_mask, 0);
- if (!page)
- return NULL;
- /* If the page cannot be used file it away */
- if (page_to_pfn(page) >
- (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
- list_add(&page->lru, &image->unusable_pages);
- continue;
- }
- addr = page_to_pfn(page) << PAGE_SHIFT;
-
- /* If it is the destination page we want use it */
- if (addr == destination)
- break;
-
- /* If the page is not a destination page use it */
- if (!kimage_is_destination_range(image, addr,
- addr + PAGE_SIZE))
- break;
-
- /*
- * I know that the page is someones destination page.
- * See if there is already a source page for this
- * destination page. And if so swap the source pages.
- */
- old = kimage_dst_used(image, addr);
- if (old) {
- /* If so move it */
- unsigned long old_addr;
- struct page *old_page;
-
- old_addr = *old & PAGE_MASK;
- old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
- copy_highpage(page, old_page);
- *old = addr | (*old & ~PAGE_MASK);
-
- /* The old page I have found cannot be a
- * destination page, so return it if it's
- * gfp_flags honor the ones passed in.
- */
- if (!(gfp_mask & __GFP_HIGHMEM) &&
- PageHighMem(old_page)) {
- kimage_free_pages(old_page);
- continue;
- }
- addr = old_addr;
- page = old_page;
- break;
- } else {
- /* Place the page on the destination list I
- * will use it later.
- */
- list_add(&page->lru, &image->dest_pages);
- }
- }
-
- return page;
-}
-
-static int kimage_load_normal_segment(struct kimage *image,
- struct kexec_segment *segment)
-{
- unsigned long maddr;
- size_t ubytes, mbytes;
- int result;
- unsigned char __user *buf = NULL;
- unsigned char *kbuf = NULL;
-
- result = 0;
- if (image->file_mode)
- kbuf = segment->kbuf;
- else
- buf = segment->buf;
- ubytes = segment->bufsz;
- mbytes = segment->memsz;
- maddr = segment->mem;
-
- result = kimage_set_destination(image, maddr);
- if (result < 0)
+ ret = kimage_crash_copy_vmcoreinfo(image);
+ if (ret)
goto out;
- while (mbytes) {
- struct page *page;
- char *ptr;
- size_t uchunk, mchunk;
-
- page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
- if (!page) {
- result = -ENOMEM;
- goto out;
- }
- result = kimage_add_page(image, page_to_pfn(page)
- << PAGE_SHIFT);
- if (result < 0)
- goto out;
-
- ptr = kmap(page);
- /* Start with a clear page */
- clear_page(ptr);
- ptr += maddr & ~PAGE_MASK;
- mchunk = min_t(size_t, mbytes,
- PAGE_SIZE - (maddr & ~PAGE_MASK));
- uchunk = min(ubytes, mchunk);
-
- /* For file based kexec, source pages are in kernel memory */
- if (image->file_mode)
- memcpy(ptr, kbuf, uchunk);
- else
- result = copy_from_user(ptr, buf, uchunk);
- kunmap(page);
- if (result) {
- result = -EFAULT;
+ for (i = 0; i < nr_segments; i++) {
+ ret = kimage_load_segment(image, i);
+ if (ret)
goto out;
- }
- ubytes -= uchunk;
- maddr += mchunk;
- if (image->file_mode)
- kbuf += mchunk;
- else
- buf += mchunk;
- mbytes -= mchunk;
}
-out:
- return result;
-}
-static int kimage_load_crash_segment(struct kimage *image,
- struct kexec_segment *segment)
-{
- /* For crash dumps kernels we simply copy the data from
- * user space to it's destination.
- * We do things a page at a time for the sake of kmap.
- */
- unsigned long maddr;
- size_t ubytes, mbytes;
- int result;
- unsigned char __user *buf = NULL;
- unsigned char *kbuf = NULL;
+ kimage_terminate(image);
- result = 0;
- if (image->file_mode)
- kbuf = segment->kbuf;
- else
- buf = segment->buf;
- ubytes = segment->bufsz;
- mbytes = segment->memsz;
- maddr = segment->mem;
- while (mbytes) {
- struct page *page;
- char *ptr;
- size_t uchunk, mchunk;
+ ret = machine_kexec_post_load(image);
+ if (ret)
+ goto out;
- page = pfn_to_page(maddr >> PAGE_SHIFT);
- if (!page) {
- result = -ENOMEM;
- goto out;
- }
- ptr = kmap(page);
- ptr += maddr & ~PAGE_MASK;
- mchunk = min_t(size_t, mbytes,
- PAGE_SIZE - (maddr & ~PAGE_MASK));
- uchunk = min(ubytes, mchunk);
- if (mchunk > uchunk) {
- /* Zero the trailing part of the page */
- memset(ptr + uchunk, 0, mchunk - uchunk);
- }
+ /* Install the new kernel and uninstall the old */
+ image = xchg(dest_image, image);
- /* For file based kexec, source pages are in kernel memory */
- if (image->file_mode)
- memcpy(ptr, kbuf, uchunk);
- else
- result = copy_from_user(ptr, buf, uchunk);
- kexec_flush_icache_page(page);
- kunmap(page);
- if (result) {
- result = -EFAULT;
- goto out;
- }
- ubytes -= uchunk;
- maddr += mchunk;
- if (image->file_mode)
- kbuf += mchunk;
- else
- buf += mchunk;
- mbytes -= mchunk;
- }
out:
- return result;
-}
-
-static int kimage_load_segment(struct kimage *image,
- struct kexec_segment *segment)
-{
- int result = -ENOMEM;
-
- switch (image->type) {
- case KEXEC_TYPE_DEFAULT:
- result = kimage_load_normal_segment(image, segment);
- break;
- case KEXEC_TYPE_CRASH:
- result = kimage_load_crash_segment(image, segment);
- break;
- }
+#ifdef CONFIG_CRASH_DUMP
+ if ((flags & KEXEC_ON_CRASH) && kexec_crash_image)
+ arch_kexec_protect_crashkres();
+#endif
- return result;
+ kimage_free(image);
+out_unlock:
+ kexec_unlock();
+ return ret;
}
/*
@@ -1230,22 +198,31 @@ static int kimage_load_segment(struct kimage *image,
* kexec does not sync, or unmount filesystems so if you need
* that to happen you need to do that yourself.
*/
-struct kimage *kexec_image;
-struct kimage *kexec_crash_image;
-int kexec_load_disabled;
-static DEFINE_MUTEX(kexec_mutex);
-
-SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
- struct kexec_segment __user *, segments, unsigned long, flags)
+static inline int kexec_load_check(unsigned long nr_segments,
+ unsigned long flags)
{
- struct kimage **dest_image, *image;
+ int image_type = (flags & KEXEC_ON_CRASH) ?
+ KEXEC_TYPE_CRASH : KEXEC_TYPE_DEFAULT;
int result;
/* We only trust the superuser with rebooting the system. */
- if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
+ if (!kexec_load_permitted(image_type))
return -EPERM;
+ /* Permit LSMs and IMA to fail the kexec */
+ result = security_kernel_load_data(LOADING_KEXEC_IMAGE, false);
+ if (result < 0)
+ return result;
+
+ /*
+ * kexec can be used to circumvent module loading restrictions, so
+ * prevent loading in that case
+ */
+ result = security_locked_down(LOCKDOWN_KEXEC);
+ if (result)
+ return result;
+
/*
* Verify we have a legal set of flags
* This leaves us room for future extensions.
@@ -1253,94 +230,40 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK))
return -EINVAL;
- /* Verify we are on the appropriate architecture */
- if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
- ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
- return -EINVAL;
-
/* Put an artificial cap on the number
* of segments passed to kexec_load.
*/
if (nr_segments > KEXEC_SEGMENT_MAX)
return -EINVAL;
- image = NULL;
- result = 0;
-
- /* Because we write directly to the reserved memory
- * region when loading crash kernels we need a mutex here to
- * prevent multiple crash kernels from attempting to load
- * simultaneously, and to prevent a crash kernel from loading
- * over the top of a in use crash kernel.
- *
- * KISS: always take the mutex.
- */
- if (!mutex_trylock(&kexec_mutex))
- return -EBUSY;
-
- dest_image = &kexec_image;
- if (flags & KEXEC_ON_CRASH)
- dest_image = &kexec_crash_image;
- if (nr_segments > 0) {
- unsigned long i;
-
- if (flags & KEXEC_ON_CRASH) {
- /*
- * Loading another kernel to switch to if this one
- * crashes. Free any current crash dump kernel before
- * we corrupt it.
- */
+ return 0;
+}
- kimage_free(xchg(&kexec_crash_image, NULL));
- result = kimage_alloc_init(&image, entry, nr_segments,
- segments, flags);
- crash_map_reserved_pages();
- } else {
- /* Loading another kernel to reboot into. */
+SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
+ struct kexec_segment __user *, segments, unsigned long, flags)
+{
+ struct kexec_segment *ksegments;
+ unsigned long result;
- result = kimage_alloc_init(&image, entry, nr_segments,
- segments, flags);
- }
- if (result)
- goto out;
+ result = kexec_load_check(nr_segments, flags);
+ if (result)
+ return result;
- if (flags & KEXEC_PRESERVE_CONTEXT)
- image->preserve_context = 1;
- result = machine_kexec_prepare(image);
- if (result)
- goto out;
+ /* Verify we are on the appropriate architecture */
+ if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
+ ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
+ return -EINVAL;
- for (i = 0; i < nr_segments; i++) {
- result = kimage_load_segment(image, &image->segment[i]);
- if (result)
- goto out;
- }
- kimage_terminate(image);
- if (flags & KEXEC_ON_CRASH)
- crash_unmap_reserved_pages();
- }
- /* Install the new kernel, and Uninstall the old */
- image = xchg(dest_image, image);
+ ksegments = memdup_array_user(segments, nr_segments, sizeof(ksegments[0]));
+ if (IS_ERR(ksegments))
+ return PTR_ERR(ksegments);
-out:
- mutex_unlock(&kexec_mutex);
- kimage_free(image);
+ result = do_kexec_load(entry, nr_segments, ksegments, flags);
+ kfree(ksegments);
return result;
}
-/*
- * Add and remove page tables for crashkernel memory
- *
- * Provide an empty default implementation here -- architecture
- * code may override this
- */
-void __weak crash_map_reserved_pages(void)
-{}
-
-void __weak crash_unmap_reserved_pages(void)
-{}
-
#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
compat_ulong_t, nr_segments,
@@ -1348,1422 +271,39 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
compat_ulong_t, flags)
{
struct compat_kexec_segment in;
- struct kexec_segment out, __user *ksegments;
+ struct kexec_segment *ksegments;
unsigned long i, result;
+ result = kexec_load_check(nr_segments, flags);
+ if (result)
+ return result;
+
/* Don't allow clients that don't understand the native
* architecture to do anything.
*/
if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
return -EINVAL;
- if (nr_segments > KEXEC_SEGMENT_MAX)
- return -EINVAL;
+ ksegments = kmalloc_array(nr_segments, sizeof(ksegments[0]),
+ GFP_KERNEL);
+ if (!ksegments)
+ return -ENOMEM;
- ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
for (i = 0; i < nr_segments; i++) {
result = copy_from_user(&in, &segments[i], sizeof(in));
if (result)
- return -EFAULT;
-
- out.buf = compat_ptr(in.buf);
- out.bufsz = in.bufsz;
- out.mem = in.mem;
- out.memsz = in.memsz;
-
- result = copy_to_user(&ksegments[i], &out, sizeof(out));
- if (result)
- return -EFAULT;
- }
-
- return sys_kexec_load(entry, nr_segments, ksegments, flags);
-}
-#endif
-
-#ifdef CONFIG_KEXEC_FILE
-SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
- unsigned long, cmdline_len, const char __user *, cmdline_ptr,
- unsigned long, flags)
-{
- int ret = 0, i;
- struct kimage **dest_image, *image;
-
- /* We only trust the superuser with rebooting the system. */
- if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
- return -EPERM;
+ goto fail;
- /* Make sure we have a legal set of flags */
- if (flags != (flags & KEXEC_FILE_FLAGS))
- return -EINVAL;
-
- image = NULL;
-
- if (!mutex_trylock(&kexec_mutex))
- return -EBUSY;
-
- dest_image = &kexec_image;
- if (flags & KEXEC_FILE_ON_CRASH)
- dest_image = &kexec_crash_image;
-
- if (flags & KEXEC_FILE_UNLOAD)
- goto exchange;
-
- /*
- * In case of crash, new kernel gets loaded in reserved region. It is
- * same memory where old crash kernel might be loaded. Free any
- * current crash dump kernel before we corrupt it.
- */
- if (flags & KEXEC_FILE_ON_CRASH)
- kimage_free(xchg(&kexec_crash_image, NULL));
-
- ret = kimage_file_alloc_init(&image, kernel_fd, initrd_fd, cmdline_ptr,
- cmdline_len, flags);
- if (ret)
- goto out;
-
- ret = machine_kexec_prepare(image);
- if (ret)
- goto out;
-
- ret = kexec_calculate_store_digests(image);
- if (ret)
- goto out;
-
- for (i = 0; i < image->nr_segments; i++) {
- struct kexec_segment *ksegment;
-
- ksegment = &image->segment[i];
- pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n",
- i, ksegment->buf, ksegment->bufsz, ksegment->mem,
- ksegment->memsz);
-
- ret = kimage_load_segment(image, &image->segment[i]);
- if (ret)
- goto out;
- }
-
- kimage_terminate(image);
-
- /*
- * Free up any temporary buffers allocated which are not needed
- * after image has been loaded
- */
- kimage_file_post_load_cleanup(image);
-exchange:
- image = xchg(dest_image, image);
-out:
- mutex_unlock(&kexec_mutex);
- kimage_free(image);
- return ret;
-}
-
-#endif /* CONFIG_KEXEC_FILE */
-
-void crash_kexec(struct pt_regs *regs)
-{
- /* Take the kexec_mutex here to prevent sys_kexec_load
- * running on one cpu from replacing the crash kernel
- * we are using after a panic on a different cpu.
- *
- * If the crash kernel was not located in a fixed area
- * of memory the xchg(&kexec_crash_image) would be
- * sufficient. But since I reuse the memory...
- */
- if (mutex_trylock(&kexec_mutex)) {
- if (kexec_crash_image) {
- struct pt_regs fixed_regs;
-
- crash_setup_regs(&fixed_regs, regs);
- crash_save_vmcoreinfo();
- machine_crash_shutdown(&fixed_regs);
- machine_kexec(kexec_crash_image);
- }
- mutex_unlock(&kexec_mutex);
- }
-}
-
-size_t crash_get_memory_size(void)
-{
- size_t size = 0;
- mutex_lock(&kexec_mutex);
- if (crashk_res.end != crashk_res.start)
- size = resource_size(&crashk_res);
- mutex_unlock(&kexec_mutex);
- return size;
-}
-
-void __weak crash_free_reserved_phys_range(unsigned long begin,
- unsigned long end)
-{
- unsigned long addr;
-
- for (addr = begin; addr < end; addr += PAGE_SIZE)
- free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT));
-}
-
-int crash_shrink_memory(unsigned long new_size)
-{
- int ret = 0;
- unsigned long start, end;
- unsigned long old_size;
- struct resource *ram_res;
-
- mutex_lock(&kexec_mutex);
-
- if (kexec_crash_image) {
- ret = -ENOENT;
- goto unlock;
- }
- start = crashk_res.start;
- end = crashk_res.end;
- old_size = (end == 0) ? 0 : end - start + 1;
- if (new_size >= old_size) {
- ret = (new_size == old_size) ? 0 : -EINVAL;
- goto unlock;
- }
-
- ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
- if (!ram_res) {
- ret = -ENOMEM;
- goto unlock;
+ ksegments[i].buf = compat_ptr(in.buf);
+ ksegments[i].bufsz = in.bufsz;
+ ksegments[i].mem = in.mem;
+ ksegments[i].memsz = in.memsz;
}
- start = roundup(start, KEXEC_CRASH_MEM_ALIGN);
- end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN);
-
- crash_map_reserved_pages();
- crash_free_reserved_phys_range(end, crashk_res.end);
-
- if ((start == end) && (crashk_res.parent != NULL))
- release_resource(&crashk_res);
-
- ram_res->start = end;
- ram_res->end = crashk_res.end;
- ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
- ram_res->name = "System RAM";
-
- crashk_res.end = end - 1;
-
- insert_resource(&iomem_resource, ram_res);
- crash_unmap_reserved_pages();
-
-unlock:
- mutex_unlock(&kexec_mutex);
- return ret;
-}
-
-static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
- size_t data_len)
-{
- struct elf_note note;
-
- note.n_namesz = strlen(name) + 1;
- note.n_descsz = data_len;
- note.n_type = type;
- memcpy(buf, &note, sizeof(note));
- buf += (sizeof(note) + 3)/4;
- memcpy(buf, name, note.n_namesz);
- buf += (note.n_namesz + 3)/4;
- memcpy(buf, data, note.n_descsz);
- buf += (note.n_descsz + 3)/4;
-
- return buf;
-}
-
-static void final_note(u32 *buf)
-{
- struct elf_note note;
-
- note.n_namesz = 0;
- note.n_descsz = 0;
- note.n_type = 0;
- memcpy(buf, &note, sizeof(note));
-}
-
-void crash_save_cpu(struct pt_regs *regs, int cpu)
-{
- struct elf_prstatus prstatus;
- u32 *buf;
-
- if ((cpu < 0) || (cpu >= nr_cpu_ids))
- return;
-
- /* Using ELF notes here is opportunistic.
- * I need a well defined structure format
- * for the data I pass, and I need tags
- * on the data to indicate what information I have
- * squirrelled away. ELF notes happen to provide
- * all of that, so there is no need to invent something new.
- */
- buf = (u32 *)per_cpu_ptr(crash_notes, cpu);
- if (!buf)
- return;
- memset(&prstatus, 0, sizeof(prstatus));
- prstatus.pr_pid = current->pid;
- elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
- buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
- &prstatus, sizeof(prstatus));
- final_note(buf);
-}
-
-static int __init crash_notes_memory_init(void)
-{
- /* Allocate memory for saving cpu registers. */
- crash_notes = alloc_percpu(note_buf_t);
- if (!crash_notes) {
- pr_warn("Kexec: Memory allocation for saving cpu register states failed\n");
- return -ENOMEM;
- }
- return 0;
-}
-subsys_initcall(crash_notes_memory_init);
-
-
-/*
- * parsing the "crashkernel" commandline
- *
- * this code is intended to be called from architecture specific code
- */
-
-
-/*
- * This function parses command lines in the format
- *
- * crashkernel=ramsize-range:size[,...][@offset]
- *
- * The function returns 0 on success and -EINVAL on failure.
- */
-static int __init parse_crashkernel_mem(char *cmdline,
- unsigned long long system_ram,
- unsigned long long *crash_size,
- unsigned long long *crash_base)
-{
- char *cur = cmdline, *tmp;
-
- /* for each entry of the comma-separated list */
- do {
- unsigned long long start, end = ULLONG_MAX, size;
-
- /* get the start of the range */
- start = memparse(cur, &tmp);
- if (cur == tmp) {
- pr_warn("crashkernel: Memory value expected\n");
- return -EINVAL;
- }
- cur = tmp;
- if (*cur != '-') {
- pr_warn("crashkernel: '-' expected\n");
- return -EINVAL;
- }
- cur++;
-
- /* if no ':' is here, than we read the end */
- if (*cur != ':') {
- end = memparse(cur, &tmp);
- if (cur == tmp) {
- pr_warn("crashkernel: Memory value expected\n");
- return -EINVAL;
- }
- cur = tmp;
- if (end <= start) {
- pr_warn("crashkernel: end <= start\n");
- return -EINVAL;
- }
- }
-
- if (*cur != ':') {
- pr_warn("crashkernel: ':' expected\n");
- return -EINVAL;
- }
- cur++;
-
- size = memparse(cur, &tmp);
- if (cur == tmp) {
- pr_warn("Memory value expected\n");
- return -EINVAL;
- }
- cur = tmp;
- if (size >= system_ram) {
- pr_warn("crashkernel: invalid size\n");
- return -EINVAL;
- }
-
- /* match ? */
- if (system_ram >= start && system_ram < end) {
- *crash_size = size;
- break;
- }
- } while (*cur++ == ',');
-
- if (*crash_size > 0) {
- while (*cur && *cur != ' ' && *cur != '@')
- cur++;
- if (*cur == '@') {
- cur++;
- *crash_base = memparse(cur, &tmp);
- if (cur == tmp) {
- pr_warn("Memory value expected after '@'\n");
- return -EINVAL;
- }
- }
- }
-
- return 0;
-}
-
-/*
- * That function parses "simple" (old) crashkernel command lines like
- *
- * crashkernel=size[@offset]
- *
- * It returns 0 on success and -EINVAL on failure.
- */
-static int __init parse_crashkernel_simple(char *cmdline,
- unsigned long long *crash_size,
- unsigned long long *crash_base)
-{
- char *cur = cmdline;
-
- *crash_size = memparse(cmdline, &cur);
- if (cmdline == cur) {
- pr_warn("crashkernel: memory value expected\n");
- return -EINVAL;
- }
-
- if (*cur == '@')
- *crash_base = memparse(cur+1, &cur);
- else if (*cur != ' ' && *cur != '\0') {
- pr_warn("crashkernel: unrecognized char\n");
- return -EINVAL;
- }
-
- return 0;
-}
-
-#define SUFFIX_HIGH 0
-#define SUFFIX_LOW 1
-#define SUFFIX_NULL 2
-static __initdata char *suffix_tbl[] = {
- [SUFFIX_HIGH] = ",high",
- [SUFFIX_LOW] = ",low",
- [SUFFIX_NULL] = NULL,
-};
-
-/*
- * That function parses "suffix" crashkernel command lines like
- *
- * crashkernel=size,[high|low]
- *
- * It returns 0 on success and -EINVAL on failure.
- */
-static int __init parse_crashkernel_suffix(char *cmdline,
- unsigned long long *crash_size,
- const char *suffix)
-{
- char *cur = cmdline;
-
- *crash_size = memparse(cmdline, &cur);
- if (cmdline == cur) {
- pr_warn("crashkernel: memory value expected\n");
- return -EINVAL;
- }
-
- /* check with suffix */
- if (strncmp(cur, suffix, strlen(suffix))) {
- pr_warn("crashkernel: unrecognized char\n");
- return -EINVAL;
- }
- cur += strlen(suffix);
- if (*cur != ' ' && *cur != '\0') {
- pr_warn("crashkernel: unrecognized char\n");
- return -EINVAL;
- }
-
- return 0;
-}
-
-static __init char *get_last_crashkernel(char *cmdline,
- const char *name,
- const char *suffix)
-{
- char *p = cmdline, *ck_cmdline = NULL;
-
- /* find crashkernel and use the last one if there are more */
- p = strstr(p, name);
- while (p) {
- char *end_p = strchr(p, ' ');
- char *q;
-
- if (!end_p)
- end_p = p + strlen(p);
-
- if (!suffix) {
- int i;
-
- /* skip the one with any known suffix */
- for (i = 0; suffix_tbl[i]; i++) {
- q = end_p - strlen(suffix_tbl[i]);
- if (!strncmp(q, suffix_tbl[i],
- strlen(suffix_tbl[i])))
- goto next;
- }
- ck_cmdline = p;
- } else {
- q = end_p - strlen(suffix);
- if (!strncmp(q, suffix, strlen(suffix)))
- ck_cmdline = p;
- }
-next:
- p = strstr(p+1, name);
- }
-
- if (!ck_cmdline)
- return NULL;
-
- return ck_cmdline;
-}
-
-static int __init __parse_crashkernel(char *cmdline,
- unsigned long long system_ram,
- unsigned long long *crash_size,
- unsigned long long *crash_base,
- const char *name,
- const char *suffix)
-{
- char *first_colon, *first_space;
- char *ck_cmdline;
-
- BUG_ON(!crash_size || !crash_base);
- *crash_size = 0;
- *crash_base = 0;
-
- ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
-
- if (!ck_cmdline)
- return -EINVAL;
-
- ck_cmdline += strlen(name);
-
- if (suffix)
- return parse_crashkernel_suffix(ck_cmdline, crash_size,
- suffix);
- /*
- * if the commandline contains a ':', then that's the extended
- * syntax -- if not, it must be the classic syntax
- */
- first_colon = strchr(ck_cmdline, ':');
- first_space = strchr(ck_cmdline, ' ');
- if (first_colon && (!first_space || first_colon < first_space))
- return parse_crashkernel_mem(ck_cmdline, system_ram,
- crash_size, crash_base);
-
- return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base);
-}
-
-/*
- * That function is the entry point for command line parsing and should be
- * called from the arch-specific code.
- */
-int __init parse_crashkernel(char *cmdline,
- unsigned long long system_ram,
- unsigned long long *crash_size,
- unsigned long long *crash_base)
-{
- return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
- "crashkernel=", NULL);
-}
-
-int __init parse_crashkernel_high(char *cmdline,
- unsigned long long system_ram,
- unsigned long long *crash_size,
- unsigned long long *crash_base)
-{
- return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
- "crashkernel=", suffix_tbl[SUFFIX_HIGH]);
-}
-
-int __init parse_crashkernel_low(char *cmdline,
- unsigned long long system_ram,
- unsigned long long *crash_size,
- unsigned long long *crash_base)
-{
- return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
- "crashkernel=", suffix_tbl[SUFFIX_LOW]);
-}
-
-static void update_vmcoreinfo_note(void)
-{
- u32 *buf = vmcoreinfo_note;
-
- if (!vmcoreinfo_size)
- return;
- buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
- vmcoreinfo_size);
- final_note(buf);
-}
-
-void crash_save_vmcoreinfo(void)
-{
- vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
- update_vmcoreinfo_note();
-}
-
-void vmcoreinfo_append_str(const char *fmt, ...)
-{
- va_list args;
- char buf[0x50];
- size_t r;
-
- va_start(args, fmt);
- r = vscnprintf(buf, sizeof(buf), fmt, args);
- va_end(args);
-
- r = min(r, vmcoreinfo_max_size - vmcoreinfo_size);
-
- memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
-
- vmcoreinfo_size += r;
-}
-
-/*
- * provide an empty default implementation here -- architecture
- * code may override this
- */
-void __weak arch_crash_save_vmcoreinfo(void)
-{}
-
-unsigned long __weak paddr_vmcoreinfo_note(void)
-{
- return __pa((unsigned long)(char *)&vmcoreinfo_note);
-}
-
-static int __init crash_save_vmcoreinfo_init(void)
-{
- VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
- VMCOREINFO_PAGESIZE(PAGE_SIZE);
-
- VMCOREINFO_SYMBOL(init_uts_ns);
- VMCOREINFO_SYMBOL(node_online_map);
-#ifdef CONFIG_MMU
- VMCOREINFO_SYMBOL(swapper_pg_dir);
-#endif
- VMCOREINFO_SYMBOL(_stext);
- VMCOREINFO_SYMBOL(vmap_area_list);
-
-#ifndef CONFIG_NEED_MULTIPLE_NODES
- VMCOREINFO_SYMBOL(mem_map);
- VMCOREINFO_SYMBOL(contig_page_data);
-#endif
-#ifdef CONFIG_SPARSEMEM
- VMCOREINFO_SYMBOL(mem_section);
- VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
- VMCOREINFO_STRUCT_SIZE(mem_section);
- VMCOREINFO_OFFSET(mem_section, section_mem_map);
-#endif
- VMCOREINFO_STRUCT_SIZE(page);
- VMCOREINFO_STRUCT_SIZE(pglist_data);
- VMCOREINFO_STRUCT_SIZE(zone);
- VMCOREINFO_STRUCT_SIZE(free_area);
- VMCOREINFO_STRUCT_SIZE(list_head);
- VMCOREINFO_SIZE(nodemask_t);
- VMCOREINFO_OFFSET(page, flags);
- VMCOREINFO_OFFSET(page, _count);
- VMCOREINFO_OFFSET(page, mapping);
- VMCOREINFO_OFFSET(page, lru);
- VMCOREINFO_OFFSET(page, _mapcount);
- VMCOREINFO_OFFSET(page, private);
- VMCOREINFO_OFFSET(pglist_data, node_zones);
- VMCOREINFO_OFFSET(pglist_data, nr_zones);
-#ifdef CONFIG_FLAT_NODE_MEM_MAP
- VMCOREINFO_OFFSET(pglist_data, node_mem_map);
-#endif
- VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
- VMCOREINFO_OFFSET(pglist_data, node_spanned_pages);
- VMCOREINFO_OFFSET(pglist_data, node_id);
- VMCOREINFO_OFFSET(zone, free_area);
- VMCOREINFO_OFFSET(zone, vm_stat);
- VMCOREINFO_OFFSET(zone, spanned_pages);
- VMCOREINFO_OFFSET(free_area, free_list);
- VMCOREINFO_OFFSET(list_head, next);
- VMCOREINFO_OFFSET(list_head, prev);
- VMCOREINFO_OFFSET(vmap_area, va_start);
- VMCOREINFO_OFFSET(vmap_area, list);
- VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
- log_buf_kexec_setup();
- VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
- VMCOREINFO_NUMBER(NR_FREE_PAGES);
- VMCOREINFO_NUMBER(PG_lru);
- VMCOREINFO_NUMBER(PG_private);
- VMCOREINFO_NUMBER(PG_swapcache);
- VMCOREINFO_NUMBER(PG_slab);
-#ifdef CONFIG_MEMORY_FAILURE
- VMCOREINFO_NUMBER(PG_hwpoison);
-#endif
- VMCOREINFO_NUMBER(PG_head_mask);
- VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
-#ifdef CONFIG_HUGETLBFS
- VMCOREINFO_SYMBOL(free_huge_page);
-#endif
-
- arch_crash_save_vmcoreinfo();
- update_vmcoreinfo_note();
-
- return 0;
-}
-
-subsys_initcall(crash_save_vmcoreinfo_init);
-
-#ifdef CONFIG_KEXEC_FILE
-static int locate_mem_hole_top_down(unsigned long start, unsigned long end,
- struct kexec_buf *kbuf)
-{
- struct kimage *image = kbuf->image;
- unsigned long temp_start, temp_end;
-
- temp_end = min(end, kbuf->buf_max);
- temp_start = temp_end - kbuf->memsz;
-
- do {
- /* align down start */
- temp_start = temp_start & (~(kbuf->buf_align - 1));
-
- if (temp_start < start || temp_start < kbuf->buf_min)
- return 0;
-
- temp_end = temp_start + kbuf->memsz - 1;
-
- /*
- * Make sure this does not conflict with any of existing
- * segments
- */
- if (kimage_is_destination_range(image, temp_start, temp_end)) {
- temp_start = temp_start - PAGE_SIZE;
- continue;
- }
-
- /* We found a suitable memory range */
- break;
- } while (1);
-
- /* If we are here, we found a suitable memory range */
- kbuf->mem = temp_start;
-
- /* Success, stop navigating through remaining System RAM ranges */
- return 1;
-}
+ result = do_kexec_load(entry, nr_segments, ksegments, flags);
-static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end,
- struct kexec_buf *kbuf)
-{
- struct kimage *image = kbuf->image;
- unsigned long temp_start, temp_end;
-
- temp_start = max(start, kbuf->buf_min);
-
- do {
- temp_start = ALIGN(temp_start, kbuf->buf_align);
- temp_end = temp_start + kbuf->memsz - 1;
-
- if (temp_end > end || temp_end > kbuf->buf_max)
- return 0;
- /*
- * Make sure this does not conflict with any of existing
- * segments
- */
- if (kimage_is_destination_range(image, temp_start, temp_end)) {
- temp_start = temp_start + PAGE_SIZE;
- continue;
- }
-
- /* We found a suitable memory range */
- break;
- } while (1);
-
- /* If we are here, we found a suitable memory range */
- kbuf->mem = temp_start;
-
- /* Success, stop navigating through remaining System RAM ranges */
- return 1;
-}
-
-static int locate_mem_hole_callback(u64 start, u64 end, void *arg)
-{
- struct kexec_buf *kbuf = (struct kexec_buf *)arg;
- unsigned long sz = end - start + 1;
-
- /* Returning 0 will take to next memory range */
- if (sz < kbuf->memsz)
- return 0;
-
- if (end < kbuf->buf_min || start > kbuf->buf_max)
- return 0;
-
- /*
- * Allocate memory top down with-in ram range. Otherwise bottom up
- * allocation.
- */
- if (kbuf->top_down)
- return locate_mem_hole_top_down(start, end, kbuf);
- return locate_mem_hole_bottom_up(start, end, kbuf);
-}
-
-/*
- * Helper function for placing a buffer in a kexec segment. This assumes
- * that kexec_mutex is held.
- */
-int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz,
- unsigned long memsz, unsigned long buf_align,
- unsigned long buf_min, unsigned long buf_max,
- bool top_down, unsigned long *load_addr)
-{
-
- struct kexec_segment *ksegment;
- struct kexec_buf buf, *kbuf;
- int ret;
-
- /* Currently adding segment this way is allowed only in file mode */
- if (!image->file_mode)
- return -EINVAL;
-
- if (image->nr_segments >= KEXEC_SEGMENT_MAX)
- return -EINVAL;
-
- /*
- * Make sure we are not trying to add buffer after allocating
- * control pages. All segments need to be placed first before
- * any control pages are allocated. As control page allocation
- * logic goes through list of segments to make sure there are
- * no destination overlaps.
- */
- if (!list_empty(&image->control_pages)) {
- WARN_ON(1);
- return -EINVAL;
- }
-
- memset(&buf, 0, sizeof(struct kexec_buf));
- kbuf = &buf;
- kbuf->image = image;
- kbuf->buffer = buffer;
- kbuf->bufsz = bufsz;
-
- kbuf->memsz = ALIGN(memsz, PAGE_SIZE);
- kbuf->buf_align = max(buf_align, PAGE_SIZE);
- kbuf->buf_min = buf_min;
- kbuf->buf_max = buf_max;
- kbuf->top_down = top_down;
-
- /* Walk the RAM ranges and allocate a suitable range for the buffer */
- if (image->type == KEXEC_TYPE_CRASH)
- ret = walk_iomem_res("Crash kernel",
- IORESOURCE_MEM | IORESOURCE_BUSY,
- crashk_res.start, crashk_res.end, kbuf,
- locate_mem_hole_callback);
- else
- ret = walk_system_ram_res(0, -1, kbuf,
- locate_mem_hole_callback);
- if (ret != 1) {
- /* A suitable memory range could not be found for buffer */
- return -EADDRNOTAVAIL;
- }
-
- /* Found a suitable memory range */
- ksegment = &image->segment[image->nr_segments];
- ksegment->kbuf = kbuf->buffer;
- ksegment->bufsz = kbuf->bufsz;
- ksegment->mem = kbuf->mem;
- ksegment->memsz = kbuf->memsz;
- image->nr_segments++;
- *load_addr = ksegment->mem;
- return 0;
-}
-
-/* Calculate and store the digest of segments */
-static int kexec_calculate_store_digests(struct kimage *image)
-{
- struct crypto_shash *tfm;
- struct shash_desc *desc;
- int ret = 0, i, j, zero_buf_sz, sha_region_sz;
- size_t desc_size, nullsz;
- char *digest;
- void *zero_buf;
- struct kexec_sha_region *sha_regions;
- struct purgatory_info *pi = &image->purgatory_info;
-
- zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT);
- zero_buf_sz = PAGE_SIZE;
-
- tfm = crypto_alloc_shash("sha256", 0, 0);
- if (IS_ERR(tfm)) {
- ret = PTR_ERR(tfm);
- goto out;
- }
-
- desc_size = crypto_shash_descsize(tfm) + sizeof(*desc);
- desc = kzalloc(desc_size, GFP_KERNEL);
- if (!desc) {
- ret = -ENOMEM;
- goto out_free_tfm;
- }
-
- sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region);
- sha_regions = vzalloc(sha_region_sz);
- if (!sha_regions)
- goto out_free_desc;
-
- desc->tfm = tfm;
- desc->flags = 0;
-
- ret = crypto_shash_init(desc);
- if (ret < 0)
- goto out_free_sha_regions;
-
- digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL);
- if (!digest) {
- ret = -ENOMEM;
- goto out_free_sha_regions;
- }
-
- for (j = i = 0; i < image->nr_segments; i++) {
- struct kexec_segment *ksegment;
-
- ksegment = &image->segment[i];
- /*
- * Skip purgatory as it will be modified once we put digest
- * info in purgatory.
- */
- if (ksegment->kbuf == pi->purgatory_buf)
- continue;
-
- ret = crypto_shash_update(desc, ksegment->kbuf,
- ksegment->bufsz);
- if (ret)
- break;
-
- /*
- * Assume rest of the buffer is filled with zero and
- * update digest accordingly.
- */
- nullsz = ksegment->memsz - ksegment->bufsz;
- while (nullsz) {
- unsigned long bytes = nullsz;
-
- if (bytes > zero_buf_sz)
- bytes = zero_buf_sz;
- ret = crypto_shash_update(desc, zero_buf, bytes);
- if (ret)
- break;
- nullsz -= bytes;
- }
-
- if (ret)
- break;
-
- sha_regions[j].start = ksegment->mem;
- sha_regions[j].len = ksegment->memsz;
- j++;
- }
-
- if (!ret) {
- ret = crypto_shash_final(desc, digest);
- if (ret)
- goto out_free_digest;
- ret = kexec_purgatory_get_set_symbol(image, "sha_regions",
- sha_regions, sha_region_sz, 0);
- if (ret)
- goto out_free_digest;
-
- ret = kexec_purgatory_get_set_symbol(image, "sha256_digest",
- digest, SHA256_DIGEST_SIZE, 0);
- if (ret)
- goto out_free_digest;
- }
-
-out_free_digest:
- kfree(digest);
-out_free_sha_regions:
- vfree(sha_regions);
-out_free_desc:
- kfree(desc);
-out_free_tfm:
- kfree(tfm);
-out:
- return ret;
-}
-
-/* Actually load purgatory. Lot of code taken from kexec-tools */
-static int __kexec_load_purgatory(struct kimage *image, unsigned long min,
- unsigned long max, int top_down)
-{
- struct purgatory_info *pi = &image->purgatory_info;
- unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad;
- unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset;
- unsigned char *buf_addr, *src;
- int i, ret = 0, entry_sidx = -1;
- const Elf_Shdr *sechdrs_c;
- Elf_Shdr *sechdrs = NULL;
- void *purgatory_buf = NULL;
-
- /*
- * sechdrs_c points to section headers in purgatory and are read
- * only. No modifications allowed.
- */
- sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff;
-
- /*
- * We can not modify sechdrs_c[] and its fields. It is read only.
- * Copy it over to a local copy where one can store some temporary
- * data and free it at the end. We need to modify ->sh_addr and
- * ->sh_offset fields to keep track of permanent and temporary
- * locations of sections.
- */
- sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr));
- if (!sechdrs)
- return -ENOMEM;
-
- memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr));
-
- /*
- * We seem to have multiple copies of sections. First copy is which
- * is embedded in kernel in read only section. Some of these sections
- * will be copied to a temporary buffer and relocated. And these
- * sections will finally be copied to their final destination at
- * segment load time.
- *
- * Use ->sh_offset to reflect section address in memory. It will
- * point to original read only copy if section is not allocatable.
- * Otherwise it will point to temporary copy which will be relocated.
- *
- * Use ->sh_addr to contain final address of the section where it
- * will go during execution time.
- */
- for (i = 0; i < pi->ehdr->e_shnum; i++) {
- if (sechdrs[i].sh_type == SHT_NOBITS)
- continue;
-
- sechdrs[i].sh_offset = (unsigned long)pi->ehdr +
- sechdrs[i].sh_offset;
- }
-
- /*
- * Identify entry point section and make entry relative to section
- * start.
- */
- entry = pi->ehdr->e_entry;
- for (i = 0; i < pi->ehdr->e_shnum; i++) {
- if (!(sechdrs[i].sh_flags & SHF_ALLOC))
- continue;
-
- if (!(sechdrs[i].sh_flags & SHF_EXECINSTR))
- continue;
-
- /* Make entry section relative */
- if (sechdrs[i].sh_addr <= pi->ehdr->e_entry &&
- ((sechdrs[i].sh_addr + sechdrs[i].sh_size) >
- pi->ehdr->e_entry)) {
- entry_sidx = i;
- entry -= sechdrs[i].sh_addr;
- break;
- }
- }
-
- /* Determine how much memory is needed to load relocatable object. */
- buf_align = 1;
- bss_align = 1;
- buf_sz = 0;
- bss_sz = 0;
-
- for (i = 0; i < pi->ehdr->e_shnum; i++) {
- if (!(sechdrs[i].sh_flags & SHF_ALLOC))
- continue;
-
- align = sechdrs[i].sh_addralign;
- if (sechdrs[i].sh_type != SHT_NOBITS) {
- if (buf_align < align)
- buf_align = align;
- buf_sz = ALIGN(buf_sz, align);
- buf_sz += sechdrs[i].sh_size;
- } else {
- /* bss section */
- if (bss_align < align)
- bss_align = align;
- bss_sz = ALIGN(bss_sz, align);
- bss_sz += sechdrs[i].sh_size;
- }
- }
-
- /* Determine the bss padding required to align bss properly */
- bss_pad = 0;
- if (buf_sz & (bss_align - 1))
- bss_pad = bss_align - (buf_sz & (bss_align - 1));
-
- memsz = buf_sz + bss_pad + bss_sz;
-
- /* Allocate buffer for purgatory */
- purgatory_buf = vzalloc(buf_sz);
- if (!purgatory_buf) {
- ret = -ENOMEM;
- goto out;
- }
-
- if (buf_align < bss_align)
- buf_align = bss_align;
-
- /* Add buffer to segment list */
- ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz,
- buf_align, min, max, top_down,
- &pi->purgatory_load_addr);
- if (ret)
- goto out;
-
- /* Load SHF_ALLOC sections */
- buf_addr = purgatory_buf;
- load_addr = curr_load_addr = pi->purgatory_load_addr;
- bss_addr = load_addr + buf_sz + bss_pad;
-
- for (i = 0; i < pi->ehdr->e_shnum; i++) {
- if (!(sechdrs[i].sh_flags & SHF_ALLOC))
- continue;
-
- align = sechdrs[i].sh_addralign;
- if (sechdrs[i].sh_type != SHT_NOBITS) {
- curr_load_addr = ALIGN(curr_load_addr, align);
- offset = curr_load_addr - load_addr;
- /* We already modifed ->sh_offset to keep src addr */
- src = (char *) sechdrs[i].sh_offset;
- memcpy(buf_addr + offset, src, sechdrs[i].sh_size);
-
- /* Store load address and source address of section */
- sechdrs[i].sh_addr = curr_load_addr;
-
- /*
- * This section got copied to temporary buffer. Update
- * ->sh_offset accordingly.
- */
- sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset);
-
- /* Advance to the next address */
- curr_load_addr += sechdrs[i].sh_size;
- } else {
- bss_addr = ALIGN(bss_addr, align);
- sechdrs[i].sh_addr = bss_addr;
- bss_addr += sechdrs[i].sh_size;
- }
- }
-
- /* Update entry point based on load address of text section */
- if (entry_sidx >= 0)
- entry += sechdrs[entry_sidx].sh_addr;
-
- /* Make kernel jump to purgatory after shutdown */
- image->start = entry;
-
- /* Used later to get/set symbol values */
- pi->sechdrs = sechdrs;
-
- /*
- * Used later to identify which section is purgatory and skip it
- * from checksumming.
- */
- pi->purgatory_buf = purgatory_buf;
- return ret;
-out:
- vfree(sechdrs);
- vfree(purgatory_buf);
- return ret;
-}
-
-static int kexec_apply_relocations(struct kimage *image)
-{
- int i, ret;
- struct purgatory_info *pi = &image->purgatory_info;
- Elf_Shdr *sechdrs = pi->sechdrs;
-
- /* Apply relocations */
- for (i = 0; i < pi->ehdr->e_shnum; i++) {
- Elf_Shdr *section, *symtab;
-
- if (sechdrs[i].sh_type != SHT_RELA &&
- sechdrs[i].sh_type != SHT_REL)
- continue;
-
- /*
- * For section of type SHT_RELA/SHT_REL,
- * ->sh_link contains section header index of associated
- * symbol table. And ->sh_info contains section header
- * index of section to which relocations apply.
- */
- if (sechdrs[i].sh_info >= pi->ehdr->e_shnum ||
- sechdrs[i].sh_link >= pi->ehdr->e_shnum)
- return -ENOEXEC;
-
- section = &sechdrs[sechdrs[i].sh_info];
- symtab = &sechdrs[sechdrs[i].sh_link];
-
- if (!(section->sh_flags & SHF_ALLOC))
- continue;
-
- /*
- * symtab->sh_link contain section header index of associated
- * string table.
- */
- if (symtab->sh_link >= pi->ehdr->e_shnum)
- /* Invalid section number? */
- continue;
-
- /*
- * Respective architecture needs to provide support for applying
- * relocations of type SHT_RELA/SHT_REL.
- */
- if (sechdrs[i].sh_type == SHT_RELA)
- ret = arch_kexec_apply_relocations_add(pi->ehdr,
- sechdrs, i);
- else if (sechdrs[i].sh_type == SHT_REL)
- ret = arch_kexec_apply_relocations(pi->ehdr,
- sechdrs, i);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-/* Load relocatable purgatory object and relocate it appropriately */
-int kexec_load_purgatory(struct kimage *image, unsigned long min,
- unsigned long max, int top_down,
- unsigned long *load_addr)
-{
- struct purgatory_info *pi = &image->purgatory_info;
- int ret;
-
- if (kexec_purgatory_size <= 0)
- return -EINVAL;
-
- if (kexec_purgatory_size < sizeof(Elf_Ehdr))
- return -ENOEXEC;
-
- pi->ehdr = (Elf_Ehdr *)kexec_purgatory;
-
- if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0
- || pi->ehdr->e_type != ET_REL
- || !elf_check_arch(pi->ehdr)
- || pi->ehdr->e_shentsize != sizeof(Elf_Shdr))
- return -ENOEXEC;
-
- if (pi->ehdr->e_shoff >= kexec_purgatory_size
- || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) >
- kexec_purgatory_size - pi->ehdr->e_shoff))
- return -ENOEXEC;
-
- ret = __kexec_load_purgatory(image, min, max, top_down);
- if (ret)
- return ret;
-
- ret = kexec_apply_relocations(image);
- if (ret)
- goto out;
-
- *load_addr = pi->purgatory_load_addr;
- return 0;
-out:
- vfree(pi->sechdrs);
- vfree(pi->purgatory_buf);
- return ret;
-}
-
-static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi,
- const char *name)
-{
- Elf_Sym *syms;
- Elf_Shdr *sechdrs;
- Elf_Ehdr *ehdr;
- int i, k;
- const char *strtab;
-
- if (!pi->sechdrs || !pi->ehdr)
- return NULL;
-
- sechdrs = pi->sechdrs;
- ehdr = pi->ehdr;
-
- for (i = 0; i < ehdr->e_shnum; i++) {
- if (sechdrs[i].sh_type != SHT_SYMTAB)
- continue;
-
- if (sechdrs[i].sh_link >= ehdr->e_shnum)
- /* Invalid strtab section number */
- continue;
- strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset;
- syms = (Elf_Sym *)sechdrs[i].sh_offset;
-
- /* Go through symbols for a match */
- for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) {
- if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL)
- continue;
-
- if (strcmp(strtab + syms[k].st_name, name) != 0)
- continue;
-
- if (syms[k].st_shndx == SHN_UNDEF ||
- syms[k].st_shndx >= ehdr->e_shnum) {
- pr_debug("Symbol: %s has bad section index %d.\n",
- name, syms[k].st_shndx);
- return NULL;
- }
-
- /* Found the symbol we are looking for */
- return &syms[k];
- }
- }
-
- return NULL;
-}
-
-void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name)
-{
- struct purgatory_info *pi = &image->purgatory_info;
- Elf_Sym *sym;
- Elf_Shdr *sechdr;
-
- sym = kexec_purgatory_find_symbol(pi, name);
- if (!sym)
- return ERR_PTR(-EINVAL);
-
- sechdr = &pi->sechdrs[sym->st_shndx];
-
- /*
- * Returns the address where symbol will finally be loaded after
- * kexec_load_segment()
- */
- return (void *)(sechdr->sh_addr + sym->st_value);
-}
-
-/*
- * Get or set value of a symbol. If "get_value" is true, symbol value is
- * returned in buf otherwise symbol value is set based on value in buf.
- */
-int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
- void *buf, unsigned int size, bool get_value)
-{
- Elf_Sym *sym;
- Elf_Shdr *sechdrs;
- struct purgatory_info *pi = &image->purgatory_info;
- char *sym_buf;
-
- sym = kexec_purgatory_find_symbol(pi, name);
- if (!sym)
- return -EINVAL;
-
- if (sym->st_size != size) {
- pr_err("symbol %s size mismatch: expected %lu actual %u\n",
- name, (unsigned long)sym->st_size, size);
- return -EINVAL;
- }
-
- sechdrs = pi->sechdrs;
-
- if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) {
- pr_err("symbol %s is in a bss section. Cannot %s\n", name,
- get_value ? "get" : "set");
- return -EINVAL;
- }
-
- sym_buf = (unsigned char *)sechdrs[sym->st_shndx].sh_offset +
- sym->st_value;
-
- if (get_value)
- memcpy((void *)buf, sym_buf, size);
- else
- memcpy((void *)sym_buf, buf, size);
-
- return 0;
+fail:
+ kfree(ksegments);
+ return result;
}
-#endif /* CONFIG_KEXEC_FILE */
-
-/*
- * Move into place and start executing a preloaded standalone
- * executable. If nothing was preloaded return an error.
- */
-int kernel_kexec(void)
-{
- int error = 0;
-
- if (!mutex_trylock(&kexec_mutex))
- return -EBUSY;
- if (!kexec_image) {
- error = -EINVAL;
- goto Unlock;
- }
-
-#ifdef CONFIG_KEXEC_JUMP
- if (kexec_image->preserve_context) {
- lock_system_sleep();
- pm_prepare_console();
- error = freeze_processes();
- if (error) {
- error = -EBUSY;
- goto Restore_console;
- }
- suspend_console();
- error = dpm_suspend_start(PMSG_FREEZE);
- if (error)
- goto Resume_console;
- /* At this point, dpm_suspend_start() has been called,
- * but *not* dpm_suspend_end(). We *must* call
- * dpm_suspend_end() now. Otherwise, drivers for
- * some devices (e.g. interrupt controllers) become
- * desynchronized with the actual state of the
- * hardware at resume time, and evil weirdness ensues.
- */
- error = dpm_suspend_end(PMSG_FREEZE);
- if (error)
- goto Resume_devices;
- error = disable_nonboot_cpus();
- if (error)
- goto Enable_cpus;
- local_irq_disable();
- error = syscore_suspend();
- if (error)
- goto Enable_irqs;
- } else
-#endif
- {
- kexec_in_progress = true;
- kernel_restart_prepare(NULL);
- migrate_to_reboot_cpu();
-
- /*
- * migrate_to_reboot_cpu() disables CPU hotplug assuming that
- * no further code needs to use CPU hotplug (which is true in
- * the reboot case). However, the kexec path depends on using
- * CPU hotplug again; so re-enable it here.
- */
- cpu_hotplug_enable();
- pr_emerg("Starting new kernel\n");
- machine_shutdown();
- }
-
- machine_kexec(kexec_image);
-
-#ifdef CONFIG_KEXEC_JUMP
- if (kexec_image->preserve_context) {
- syscore_resume();
- Enable_irqs:
- local_irq_enable();
- Enable_cpus:
- enable_nonboot_cpus();
- dpm_resume_start(PMSG_RESTORE);
- Resume_devices:
- dpm_resume_end(PMSG_RESTORE);
- Resume_console:
- resume_console();
- thaw_processes();
- Restore_console:
- pm_restore_console();
- unlock_system_sleep();
- }
#endif
-
- Unlock:
- mutex_unlock(&kexec_mutex);
- return error;
-}
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
new file mode 100644
index 000000000000..0f92acdd354d
--- /dev/null
+++ b/kernel/kexec_core.c
@@ -0,0 +1,1368 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * kexec.c - kexec system call core code.
+ * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/btf.h>
+#include <linux/capability.h>
+#include <linux/mm.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/kexec.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/liveupdate.h>
+#include <linux/highmem.h>
+#include <linux/syscalls.h>
+#include <linux/reboot.h>
+#include <linux/ioport.h>
+#include <linux/hardirq.h>
+#include <linux/elf.h>
+#include <linux/elfcore.h>
+#include <linux/utsname.h>
+#include <linux/numa.h>
+#include <linux/suspend.h>
+#include <linux/device.h>
+#include <linux/freezer.h>
+#include <linux/panic_notifier.h>
+#include <linux/pm.h>
+#include <linux/cpu.h>
+#include <linux/uaccess.h>
+#include <linux/io.h>
+#include <linux/console.h>
+#include <linux/vmalloc.h>
+#include <linux/swap.h>
+#include <linux/syscore_ops.h>
+#include <linux/compiler.h>
+#include <linux/hugetlb.h>
+#include <linux/objtool.h>
+#include <linux/kmsg_dump.h>
+#include <linux/dma-map-ops.h>
+#include <linux/sysfs.h>
+
+#include <asm/page.h>
+#include <asm/sections.h>
+
+#include <crypto/hash.h>
+#include "kexec_internal.h"
+
+atomic_t __kexec_lock = ATOMIC_INIT(0);
+
+/* Flag to indicate we are going to kexec a new kernel */
+bool kexec_in_progress = false;
+
+bool kexec_file_dbg_print;
+
+/*
+ * When kexec transitions to the new kernel there is a one-to-one
+ * mapping between physical and virtual addresses. On processors
+ * where you can disable the MMU this is trivial, and easy. For
+ * others it is still a simple predictable page table to setup.
+ *
+ * In that environment kexec copies the new kernel to its final
+ * resting place. This means I can only support memory whose
+ * physical address can fit in an unsigned long. In particular
+ * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
+ * If the assembly stub has more restrictive requirements
+ * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
+ * defined more restrictively in <asm/kexec.h>.
+ *
+ * The code for the transition from the current kernel to the
+ * new kernel is placed in the control_code_buffer, whose size
+ * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single
+ * page of memory is necessary, but some architectures require more.
+ * Because this memory must be identity mapped in the transition from
+ * virtual to physical addresses it must live in the range
+ * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
+ * modifiable.
+ *
+ * The assembly stub in the control code buffer is passed a linked list
+ * of descriptor pages detailing the source pages of the new kernel,
+ * and the destination addresses of those source pages. As this data
+ * structure is not used in the context of the current OS, it must
+ * be self-contained.
+ *
+ * The code has been made to work with highmem pages and will use a
+ * destination page in its final resting place (if it happens
+ * to allocate it). The end product of this is that most of the
+ * physical address space, and most of RAM can be used.
+ *
+ * Future directions include:
+ * - allocating a page table with the control code buffer identity
+ * mapped, to simplify machine_kexec and make kexec_on_panic more
+ * reliable.
+ */
+
+/*
+ * KIMAGE_NO_DEST is an impossible destination address..., for
+ * allocating pages whose destination address we do not care about.
+ */
+#define KIMAGE_NO_DEST (-1UL)
+#define PAGE_COUNT(x) (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)
+
+static struct page *kimage_alloc_page(struct kimage *image,
+ gfp_t gfp_mask,
+ unsigned long dest);
+
+int sanity_check_segment_list(struct kimage *image)
+{
+ int i;
+ unsigned long nr_segments = image->nr_segments;
+ unsigned long total_pages = 0;
+ unsigned long nr_pages = totalram_pages();
+
+ /*
+ * Verify we have good destination addresses. The caller is
+ * responsible for making certain we don't attempt to load
+ * the new image into invalid or reserved areas of RAM. This
+ * just verifies it is an address we can use.
+ *
+ * Since the kernel does everything in page size chunks ensure
+ * the destination addresses are page aligned. Too many
+ * special cases crop of when we don't do this. The most
+ * insidious is getting overlapping destination addresses
+ * simply because addresses are changed to page size
+ * granularity.
+ */
+ for (i = 0; i < nr_segments; i++) {
+ unsigned long mstart, mend;
+
+ mstart = image->segment[i].mem;
+ mend = mstart + image->segment[i].memsz;
+ if (mstart > mend)
+ return -EADDRNOTAVAIL;
+ if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
+ return -EADDRNOTAVAIL;
+ if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
+ return -EADDRNOTAVAIL;
+ }
+
+ /* Verify our destination addresses do not overlap.
+ * If we alloed overlapping destination addresses
+ * through very weird things can happen with no
+ * easy explanation as one segment stops on another.
+ */
+ for (i = 0; i < nr_segments; i++) {
+ unsigned long mstart, mend;
+ unsigned long j;
+
+ mstart = image->segment[i].mem;
+ mend = mstart + image->segment[i].memsz;
+ for (j = 0; j < i; j++) {
+ unsigned long pstart, pend;
+
+ pstart = image->segment[j].mem;
+ pend = pstart + image->segment[j].memsz;
+ /* Do the segments overlap ? */
+ if ((mend > pstart) && (mstart < pend))
+ return -EINVAL;
+ }
+ }
+
+ /* Ensure our buffer sizes are strictly less than
+ * our memory sizes. This should always be the case,
+ * and it is easier to check up front than to be surprised
+ * later on.
+ */
+ for (i = 0; i < nr_segments; i++) {
+ if (image->segment[i].bufsz > image->segment[i].memsz)
+ return -EINVAL;
+ }
+
+ /*
+ * Verify that no more than half of memory will be consumed. If the
+ * request from userspace is too large, a large amount of time will be
+ * wasted allocating pages, which can cause a soft lockup.
+ */
+ for (i = 0; i < nr_segments; i++) {
+ if (PAGE_COUNT(image->segment[i].memsz) > nr_pages / 2)
+ return -EINVAL;
+
+ total_pages += PAGE_COUNT(image->segment[i].memsz);
+ }
+
+ if (total_pages > nr_pages / 2)
+ return -EINVAL;
+
+#ifdef CONFIG_CRASH_DUMP
+ /*
+ * Verify we have good destination addresses. Normally
+ * the caller is responsible for making certain we don't
+ * attempt to load the new image into invalid or reserved
+ * areas of RAM. But crash kernels are preloaded into a
+ * reserved area of ram. We must ensure the addresses
+ * are in the reserved area otherwise preloading the
+ * kernel could corrupt things.
+ */
+
+ if (image->type == KEXEC_TYPE_CRASH) {
+ for (i = 0; i < nr_segments; i++) {
+ unsigned long mstart, mend;
+
+ mstart = image->segment[i].mem;
+ mend = mstart + image->segment[i].memsz - 1;
+ /* Ensure we are within the crash kernel limits */
+ if ((mstart < phys_to_boot_phys(crashk_res.start)) ||
+ (mend > phys_to_boot_phys(crashk_res.end)))
+ return -EADDRNOTAVAIL;
+ }
+ }
+#endif
+
+ /*
+ * The destination addresses are searched from system RAM rather than
+ * being allocated from the buddy allocator, so they are not guaranteed
+ * to be accepted by the current kernel. Accept the destination
+ * addresses before kexec swaps their content with the segments' source
+ * pages to avoid accessing memory before it is accepted.
+ */
+ for (i = 0; i < nr_segments; i++)
+ accept_memory(image->segment[i].mem, image->segment[i].memsz);
+
+ return 0;
+}
+
+struct kimage *do_kimage_alloc_init(void)
+{
+ struct kimage *image;
+
+ /* Allocate a controlling structure */
+ image = kzalloc(sizeof(*image), GFP_KERNEL);
+ if (!image)
+ return NULL;
+
+ image->entry = &image->head;
+ image->last_entry = &image->head;
+ image->control_page = ~0; /* By default this does not apply */
+ image->type = KEXEC_TYPE_DEFAULT;
+
+ /* Initialize the list of control pages */
+ INIT_LIST_HEAD(&image->control_pages);
+
+ /* Initialize the list of destination pages */
+ INIT_LIST_HEAD(&image->dest_pages);
+
+ /* Initialize the list of unusable pages */
+ INIT_LIST_HEAD(&image->unusable_pages);
+
+#ifdef CONFIG_CRASH_HOTPLUG
+ image->hp_action = KEXEC_CRASH_HP_NONE;
+ image->elfcorehdr_index = -1;
+ image->elfcorehdr_updated = false;
+#endif
+
+ return image;
+}
+
+int kimage_is_destination_range(struct kimage *image,
+ unsigned long start,
+ unsigned long end)
+{
+ unsigned long i;
+
+ for (i = 0; i < image->nr_segments; i++) {
+ unsigned long mstart, mend;
+
+ mstart = image->segment[i].mem;
+ mend = mstart + image->segment[i].memsz - 1;
+ if ((end >= mstart) && (start <= mend))
+ return 1;
+ }
+
+ return 0;
+}
+
+static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
+{
+ struct page *pages;
+
+ if (fatal_signal_pending(current))
+ return NULL;
+ pages = alloc_pages(gfp_mask & ~__GFP_ZERO, order);
+ if (pages) {
+ unsigned int count, i;
+
+ pages->mapping = NULL;
+ set_page_private(pages, order);
+ count = 1 << order;
+ for (i = 0; i < count; i++)
+ SetPageReserved(pages + i);
+
+ arch_kexec_post_alloc_pages(page_address(pages), count,
+ gfp_mask);
+
+ if (gfp_mask & __GFP_ZERO)
+ for (i = 0; i < count; i++)
+ clear_highpage(pages + i);
+ }
+
+ return pages;
+}
+
+static void kimage_free_pages(struct page *page)
+{
+ unsigned int order, count, i;
+
+ order = page_private(page);
+ count = 1 << order;
+
+ arch_kexec_pre_free_pages(page_address(page), count);
+
+ for (i = 0; i < count; i++)
+ ClearPageReserved(page + i);
+ __free_pages(page, order);
+}
+
+void kimage_free_page_list(struct list_head *list)
+{
+ struct page *page, *next;
+
+ list_for_each_entry_safe(page, next, list, lru) {
+ list_del(&page->lru);
+ kimage_free_pages(page);
+ }
+}
+
+static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
+ unsigned int order)
+{
+ /* Control pages are special, they are the intermediaries
+ * that are needed while we copy the rest of the pages
+ * to their final resting place. As such they must
+ * not conflict with either the destination addresses
+ * or memory the kernel is already using.
+ *
+ * The only case where we really need more than one of
+ * these are for architectures where we cannot disable
+ * the MMU and must instead generate an identity mapped
+ * page table for all of the memory.
+ *
+ * At worst this runs in O(N) of the image size.
+ */
+ struct list_head extra_pages;
+ struct page *pages;
+ unsigned int count;
+
+ count = 1 << order;
+ INIT_LIST_HEAD(&extra_pages);
+
+ /* Loop while I can allocate a page and the page allocated
+ * is a destination page.
+ */
+ do {
+ unsigned long pfn, epfn, addr, eaddr;
+
+ pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order);
+ if (!pages)
+ break;
+ pfn = page_to_boot_pfn(pages);
+ epfn = pfn + count;
+ addr = pfn << PAGE_SHIFT;
+ eaddr = (epfn << PAGE_SHIFT) - 1;
+ if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
+ kimage_is_destination_range(image, addr, eaddr)) {
+ list_add(&pages->lru, &extra_pages);
+ pages = NULL;
+ }
+ } while (!pages);
+
+ if (pages) {
+ /* Remember the allocated page... */
+ list_add(&pages->lru, &image->control_pages);
+
+ /* Because the page is already in it's destination
+ * location we will never allocate another page at
+ * that address. Therefore kimage_alloc_pages
+ * will not return it (again) and we don't need
+ * to give it an entry in image->segment[].
+ */
+ }
+ /* Deal with the destination pages I have inadvertently allocated.
+ *
+ * Ideally I would convert multi-page allocations into single
+ * page allocations, and add everything to image->dest_pages.
+ *
+ * For now it is simpler to just free the pages.
+ */
+ kimage_free_page_list(&extra_pages);
+
+ return pages;
+}
+
+#ifdef CONFIG_CRASH_DUMP
+static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
+ unsigned int order)
+{
+ /* Control pages are special, they are the intermediaries
+ * that are needed while we copy the rest of the pages
+ * to their final resting place. As such they must
+ * not conflict with either the destination addresses
+ * or memory the kernel is already using.
+ *
+ * Control pages are also the only pags we must allocate
+ * when loading a crash kernel. All of the other pages
+ * are specified by the segments and we just memcpy
+ * into them directly.
+ *
+ * The only case where we really need more than one of
+ * these are for architectures where we cannot disable
+ * the MMU and must instead generate an identity mapped
+ * page table for all of the memory.
+ *
+ * Given the low demand this implements a very simple
+ * allocator that finds the first hole of the appropriate
+ * size in the reserved memory region, and allocates all
+ * of the memory up to and including the hole.
+ */
+ unsigned long hole_start, hole_end, size;
+ struct page *pages;
+
+ pages = NULL;
+ size = (1 << order) << PAGE_SHIFT;
+ hole_start = ALIGN(image->control_page, size);
+ hole_end = hole_start + size - 1;
+ while (hole_end <= crashk_res.end) {
+ unsigned long i;
+
+ cond_resched();
+
+ if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
+ break;
+ /* See if I overlap any of the segments */
+ for (i = 0; i < image->nr_segments; i++) {
+ unsigned long mstart, mend;
+
+ mstart = image->segment[i].mem;
+ mend = mstart + image->segment[i].memsz - 1;
+ if ((hole_end >= mstart) && (hole_start <= mend)) {
+ /* Advance the hole to the end of the segment */
+ hole_start = ALIGN(mend, size);
+ hole_end = hole_start + size - 1;
+ break;
+ }
+ }
+ /* If I don't overlap any segments I have found my hole! */
+ if (i == image->nr_segments) {
+ pages = pfn_to_page(hole_start >> PAGE_SHIFT);
+ image->control_page = hole_end + 1;
+ break;
+ }
+ }
+
+ /* Ensure that these pages are decrypted if SME is enabled. */
+ if (pages)
+ arch_kexec_post_alloc_pages(page_address(pages), 1 << order, 0);
+
+ return pages;
+}
+#endif
+
+
+struct page *kimage_alloc_control_pages(struct kimage *image,
+ unsigned int order)
+{
+ struct page *pages = NULL;
+
+ switch (image->type) {
+ case KEXEC_TYPE_DEFAULT:
+ pages = kimage_alloc_normal_control_pages(image, order);
+ break;
+#ifdef CONFIG_CRASH_DUMP
+ case KEXEC_TYPE_CRASH:
+ pages = kimage_alloc_crash_control_pages(image, order);
+ break;
+#endif
+ }
+
+ return pages;
+}
+
+static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
+{
+ if (*image->entry != 0)
+ image->entry++;
+
+ if (image->entry == image->last_entry) {
+ kimage_entry_t *ind_page;
+ struct page *page;
+
+ page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
+ if (!page)
+ return -ENOMEM;
+
+ ind_page = page_address(page);
+ *image->entry = virt_to_boot_phys(ind_page) | IND_INDIRECTION;
+ image->entry = ind_page;
+ image->last_entry = ind_page +
+ ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
+ }
+ *image->entry = entry;
+ image->entry++;
+ *image->entry = 0;
+
+ return 0;
+}
+
+static int kimage_set_destination(struct kimage *image,
+ unsigned long destination)
+{
+ destination &= PAGE_MASK;
+
+ return kimage_add_entry(image, destination | IND_DESTINATION);
+}
+
+
+static int kimage_add_page(struct kimage *image, unsigned long page)
+{
+ page &= PAGE_MASK;
+
+ return kimage_add_entry(image, page | IND_SOURCE);
+}
+
+
+static void kimage_free_extra_pages(struct kimage *image)
+{
+ /* Walk through and free any extra destination pages I may have */
+ kimage_free_page_list(&image->dest_pages);
+
+ /* Walk through and free any unusable pages I have cached */
+ kimage_free_page_list(&image->unusable_pages);
+
+}
+
+void kimage_terminate(struct kimage *image)
+{
+ if (*image->entry != 0)
+ image->entry++;
+
+ *image->entry = IND_DONE;
+}
+
+#define for_each_kimage_entry(image, ptr, entry) \
+ for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
+ ptr = (entry & IND_INDIRECTION) ? \
+ boot_phys_to_virt((entry & PAGE_MASK)) : ptr + 1)
+
+static void kimage_free_entry(kimage_entry_t entry)
+{
+ struct page *page;
+
+ page = boot_pfn_to_page(entry >> PAGE_SHIFT);
+ kimage_free_pages(page);
+}
+
+static void kimage_free_cma(struct kimage *image)
+{
+ unsigned long i;
+
+ for (i = 0; i < image->nr_segments; i++) {
+ struct page *cma = image->segment_cma[i];
+ u32 nr_pages = image->segment[i].memsz >> PAGE_SHIFT;
+
+ if (!cma)
+ continue;
+
+ arch_kexec_pre_free_pages(page_address(cma), nr_pages);
+ dma_release_from_contiguous(NULL, cma, nr_pages);
+ image->segment_cma[i] = NULL;
+ }
+
+}
+
+void kimage_free(struct kimage *image)
+{
+ kimage_entry_t *ptr, entry;
+ kimage_entry_t ind = 0;
+
+ if (!image)
+ return;
+
+#ifdef CONFIG_CRASH_DUMP
+ if (image->vmcoreinfo_data_copy) {
+ crash_update_vmcoreinfo_safecopy(NULL);
+ vunmap(image->vmcoreinfo_data_copy);
+ }
+#endif
+
+ kimage_free_extra_pages(image);
+ for_each_kimage_entry(image, ptr, entry) {
+ if (entry & IND_INDIRECTION) {
+ /* Free the previous indirection page */
+ if (ind & IND_INDIRECTION)
+ kimage_free_entry(ind);
+ /* Save this indirection page until we are
+ * done with it.
+ */
+ ind = entry;
+ } else if (entry & IND_SOURCE)
+ kimage_free_entry(entry);
+ }
+ /* Free the final indirection page */
+ if (ind & IND_INDIRECTION)
+ kimage_free_entry(ind);
+
+ /* Handle any machine specific cleanup */
+ machine_kexec_cleanup(image);
+
+ /* Free the kexec control pages... */
+ kimage_free_page_list(&image->control_pages);
+
+ /* Free CMA allocations */
+ kimage_free_cma(image);
+
+ /*
+ * Free up any temporary buffers allocated. This might hit if
+ * error occurred much later after buffer allocation.
+ */
+ if (image->file_mode)
+ kimage_file_post_load_cleanup(image);
+
+ kfree(image);
+}
+
+static kimage_entry_t *kimage_dst_used(struct kimage *image,
+ unsigned long page)
+{
+ kimage_entry_t *ptr, entry;
+ unsigned long destination = 0;
+
+ for_each_kimage_entry(image, ptr, entry) {
+ if (entry & IND_DESTINATION)
+ destination = entry & PAGE_MASK;
+ else if (entry & IND_SOURCE) {
+ if (page == destination)
+ return ptr;
+ destination += PAGE_SIZE;
+ }
+ }
+
+ return NULL;
+}
+
+static struct page *kimage_alloc_page(struct kimage *image,
+ gfp_t gfp_mask,
+ unsigned long destination)
+{
+ /*
+ * Here we implement safeguards to ensure that a source page
+ * is not copied to its destination page before the data on
+ * the destination page is no longer useful.
+ *
+ * To do this we maintain the invariant that a source page is
+ * either its own destination page, or it is not a
+ * destination page at all.
+ *
+ * That is slightly stronger than required, but the proof
+ * that no problems will not occur is trivial, and the
+ * implementation is simply to verify.
+ *
+ * When allocating all pages normally this algorithm will run
+ * in O(N) time, but in the worst case it will run in O(N^2)
+ * time. If the runtime is a problem the data structures can
+ * be fixed.
+ */
+ struct page *page;
+ unsigned long addr;
+
+ /*
+ * Walk through the list of destination pages, and see if I
+ * have a match.
+ */
+ list_for_each_entry(page, &image->dest_pages, lru) {
+ addr = page_to_boot_pfn(page) << PAGE_SHIFT;
+ if (addr == destination) {
+ list_del(&page->lru);
+ return page;
+ }
+ }
+ page = NULL;
+ while (1) {
+ kimage_entry_t *old;
+
+ /* Allocate a page, if we run out of memory give up */
+ page = kimage_alloc_pages(gfp_mask, 0);
+ if (!page)
+ return NULL;
+ /* If the page cannot be used file it away */
+ if (page_to_boot_pfn(page) >
+ (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
+ list_add(&page->lru, &image->unusable_pages);
+ continue;
+ }
+ addr = page_to_boot_pfn(page) << PAGE_SHIFT;
+
+ /* If it is the destination page we want use it */
+ if (addr == destination)
+ break;
+
+ /* If the page is not a destination page use it */
+ if (!kimage_is_destination_range(image, addr,
+ addr + PAGE_SIZE - 1))
+ break;
+
+ /*
+ * I know that the page is someones destination page.
+ * See if there is already a source page for this
+ * destination page. And if so swap the source pages.
+ */
+ old = kimage_dst_used(image, addr);
+ if (old) {
+ /* If so move it */
+ unsigned long old_addr;
+ struct page *old_page;
+
+ old_addr = *old & PAGE_MASK;
+ old_page = boot_pfn_to_page(old_addr >> PAGE_SHIFT);
+ copy_highpage(page, old_page);
+ *old = addr | (*old & ~PAGE_MASK);
+
+ /* The old page I have found cannot be a
+ * destination page, so return it if it's
+ * gfp_flags honor the ones passed in.
+ */
+ if (!(gfp_mask & __GFP_HIGHMEM) &&
+ PageHighMem(old_page)) {
+ kimage_free_pages(old_page);
+ continue;
+ }
+ page = old_page;
+ break;
+ }
+ /* Place the page on the destination list, to be used later */
+ list_add(&page->lru, &image->dest_pages);
+ }
+
+ return page;
+}
+
+static int kimage_load_cma_segment(struct kimage *image, int idx)
+{
+ struct kexec_segment *segment = &image->segment[idx];
+ struct page *cma = image->segment_cma[idx];
+ char *ptr = page_address(cma);
+ size_t ubytes, mbytes;
+ int result = 0;
+ unsigned char __user *buf = NULL;
+ unsigned char *kbuf = NULL;
+
+ if (image->file_mode)
+ kbuf = segment->kbuf;
+ else
+ buf = segment->buf;
+ ubytes = segment->bufsz;
+ mbytes = segment->memsz;
+
+ /* Then copy from source buffer to the CMA one */
+ while (mbytes) {
+ size_t uchunk, mchunk;
+
+ mchunk = min_t(size_t, mbytes, PAGE_SIZE);
+ uchunk = min(ubytes, mchunk);
+
+ if (uchunk) {
+ /* For file based kexec, source pages are in kernel memory */
+ if (image->file_mode)
+ memcpy(ptr, kbuf, uchunk);
+ else
+ result = copy_from_user(ptr, buf, uchunk);
+ ubytes -= uchunk;
+ if (image->file_mode)
+ kbuf += uchunk;
+ else
+ buf += uchunk;
+ }
+
+ if (result) {
+ result = -EFAULT;
+ goto out;
+ }
+
+ ptr += mchunk;
+ mbytes -= mchunk;
+
+ cond_resched();
+ }
+
+ /* Clear any remainder */
+ memset(ptr, 0, mbytes);
+
+out:
+ return result;
+}
+
+static int kimage_load_normal_segment(struct kimage *image, int idx)
+{
+ struct kexec_segment *segment = &image->segment[idx];
+ unsigned long maddr;
+ size_t ubytes, mbytes;
+ int result;
+ unsigned char __user *buf = NULL;
+ unsigned char *kbuf = NULL;
+
+ if (image->file_mode)
+ kbuf = segment->kbuf;
+ else
+ buf = segment->buf;
+ ubytes = segment->bufsz;
+ mbytes = segment->memsz;
+ maddr = segment->mem;
+
+ if (image->segment_cma[idx])
+ return kimage_load_cma_segment(image, idx);
+
+ result = kimage_set_destination(image, maddr);
+ if (result < 0)
+ goto out;
+
+ while (mbytes) {
+ struct page *page;
+ char *ptr;
+ size_t uchunk, mchunk;
+
+ page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
+ if (!page) {
+ result = -ENOMEM;
+ goto out;
+ }
+ result = kimage_add_page(image, page_to_boot_pfn(page)
+ << PAGE_SHIFT);
+ if (result < 0)
+ goto out;
+
+ ptr = kmap_local_page(page);
+ /* Start with a clear page */
+ clear_page(ptr);
+ mchunk = min_t(size_t, mbytes, PAGE_SIZE);
+ uchunk = min(ubytes, mchunk);
+
+ if (uchunk) {
+ /* For file based kexec, source pages are in kernel memory */
+ if (image->file_mode)
+ memcpy(ptr, kbuf, uchunk);
+ else
+ result = copy_from_user(ptr, buf, uchunk);
+ ubytes -= uchunk;
+ if (image->file_mode)
+ kbuf += uchunk;
+ else
+ buf += uchunk;
+ }
+ kunmap_local(ptr);
+ if (result) {
+ result = -EFAULT;
+ goto out;
+ }
+ maddr += mchunk;
+ mbytes -= mchunk;
+
+ cond_resched();
+ }
+out:
+ return result;
+}
+
+#ifdef CONFIG_CRASH_DUMP
+static int kimage_load_crash_segment(struct kimage *image, int idx)
+{
+ /* For crash dumps kernels we simply copy the data from
+ * user space to it's destination.
+ * We do things a page at a time for the sake of kmap.
+ */
+ struct kexec_segment *segment = &image->segment[idx];
+ unsigned long maddr;
+ size_t ubytes, mbytes;
+ int result;
+ unsigned char __user *buf = NULL;
+ unsigned char *kbuf = NULL;
+
+ result = 0;
+ if (image->file_mode)
+ kbuf = segment->kbuf;
+ else
+ buf = segment->buf;
+ ubytes = segment->bufsz;
+ mbytes = segment->memsz;
+ maddr = segment->mem;
+ while (mbytes) {
+ struct page *page;
+ char *ptr;
+ size_t uchunk, mchunk;
+
+ page = boot_pfn_to_page(maddr >> PAGE_SHIFT);
+ if (!page) {
+ result = -ENOMEM;
+ goto out;
+ }
+ arch_kexec_post_alloc_pages(page_address(page), 1, 0);
+ ptr = kmap_local_page(page);
+ mchunk = min_t(size_t, mbytes, PAGE_SIZE);
+ uchunk = min(ubytes, mchunk);
+ if (mchunk > uchunk) {
+ /* Zero the trailing part of the page */
+ memset(ptr + uchunk, 0, mchunk - uchunk);
+ }
+
+ if (uchunk) {
+ /* For file based kexec, source pages are in kernel memory */
+ if (image->file_mode)
+ memcpy(ptr, kbuf, uchunk);
+ else
+ result = copy_from_user(ptr, buf, uchunk);
+ ubytes -= uchunk;
+ if (image->file_mode)
+ kbuf += uchunk;
+ else
+ buf += uchunk;
+ }
+ kexec_flush_icache_page(page);
+ kunmap_local(ptr);
+ arch_kexec_pre_free_pages(page_address(page), 1);
+ if (result) {
+ result = -EFAULT;
+ goto out;
+ }
+ maddr += mchunk;
+ mbytes -= mchunk;
+
+ cond_resched();
+ }
+out:
+ return result;
+}
+#endif
+
+int kimage_load_segment(struct kimage *image, int idx)
+{
+ int result = -ENOMEM;
+
+ switch (image->type) {
+ case KEXEC_TYPE_DEFAULT:
+ result = kimage_load_normal_segment(image, idx);
+ break;
+#ifdef CONFIG_CRASH_DUMP
+ case KEXEC_TYPE_CRASH:
+ result = kimage_load_crash_segment(image, idx);
+ break;
+#endif
+ }
+
+ return result;
+}
+
+void *kimage_map_segment(struct kimage *image,
+ unsigned long addr, unsigned long size)
+{
+ unsigned long src_page_addr, dest_page_addr = 0;
+ unsigned long eaddr = addr + size;
+ kimage_entry_t *ptr, entry;
+ struct page **src_pages;
+ unsigned int npages;
+ void *vaddr = NULL;
+ int i;
+
+ /*
+ * Collect the source pages and map them in a contiguous VA range.
+ */
+ npages = PFN_UP(eaddr) - PFN_DOWN(addr);
+ src_pages = kmalloc_array(npages, sizeof(*src_pages), GFP_KERNEL);
+ if (!src_pages) {
+ pr_err("Could not allocate ima pages array.\n");
+ return NULL;
+ }
+
+ i = 0;
+ for_each_kimage_entry(image, ptr, entry) {
+ if (entry & IND_DESTINATION) {
+ dest_page_addr = entry & PAGE_MASK;
+ } else if (entry & IND_SOURCE) {
+ if (dest_page_addr >= addr && dest_page_addr < eaddr) {
+ src_page_addr = entry & PAGE_MASK;
+ src_pages[i++] =
+ virt_to_page(__va(src_page_addr));
+ if (i == npages)
+ break;
+ dest_page_addr += PAGE_SIZE;
+ }
+ }
+ }
+
+ /* Sanity check. */
+ WARN_ON(i < npages);
+
+ vaddr = vmap(src_pages, npages, VM_MAP, PAGE_KERNEL);
+ kfree(src_pages);
+
+ if (!vaddr)
+ pr_err("Could not map ima buffer.\n");
+
+ return vaddr;
+}
+
+void kimage_unmap_segment(void *segment_buffer)
+{
+ vunmap(segment_buffer);
+}
+
+struct kexec_load_limit {
+ /* Mutex protects the limit count. */
+ struct mutex mutex;
+ int limit;
+};
+
+static struct kexec_load_limit load_limit_reboot = {
+ .mutex = __MUTEX_INITIALIZER(load_limit_reboot.mutex),
+ .limit = -1,
+};
+
+static struct kexec_load_limit load_limit_panic = {
+ .mutex = __MUTEX_INITIALIZER(load_limit_panic.mutex),
+ .limit = -1,
+};
+
+struct kimage *kexec_image;
+struct kimage *kexec_crash_image;
+static int kexec_load_disabled;
+
+#ifdef CONFIG_SYSCTL
+static int kexec_limit_handler(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct kexec_load_limit *limit = table->data;
+ int val;
+ struct ctl_table tmp = {
+ .data = &val,
+ .maxlen = sizeof(val),
+ .mode = table->mode,
+ };
+ int ret;
+
+ if (write) {
+ ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);
+ if (ret)
+ return ret;
+
+ if (val < 0)
+ return -EINVAL;
+
+ mutex_lock(&limit->mutex);
+ if (limit->limit != -1 && val >= limit->limit)
+ ret = -EINVAL;
+ else
+ limit->limit = val;
+ mutex_unlock(&limit->mutex);
+
+ return ret;
+ }
+
+ mutex_lock(&limit->mutex);
+ val = limit->limit;
+ mutex_unlock(&limit->mutex);
+
+ return proc_dointvec(&tmp, write, buffer, lenp, ppos);
+}
+
+static const struct ctl_table kexec_core_sysctls[] = {
+ {
+ .procname = "kexec_load_disabled",
+ .data = &kexec_load_disabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ /* only handle a transition from default "0" to "1" */
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
+ .procname = "kexec_load_limit_panic",
+ .data = &load_limit_panic,
+ .mode = 0644,
+ .proc_handler = kexec_limit_handler,
+ },
+ {
+ .procname = "kexec_load_limit_reboot",
+ .data = &load_limit_reboot,
+ .mode = 0644,
+ .proc_handler = kexec_limit_handler,
+ },
+};
+
+static int __init kexec_core_sysctl_init(void)
+{
+ register_sysctl_init("kernel", kexec_core_sysctls);
+ return 0;
+}
+late_initcall(kexec_core_sysctl_init);
+#endif
+
+bool kexec_load_permitted(int kexec_image_type)
+{
+ struct kexec_load_limit *limit;
+
+ /*
+ * Only the superuser can use the kexec syscall and if it has not
+ * been disabled.
+ */
+ if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
+ return false;
+
+ /* Check limit counter and decrease it.*/
+ limit = (kexec_image_type == KEXEC_TYPE_CRASH) ?
+ &load_limit_panic : &load_limit_reboot;
+ mutex_lock(&limit->mutex);
+ if (!limit->limit) {
+ mutex_unlock(&limit->mutex);
+ return false;
+ }
+ if (limit->limit != -1)
+ limit->limit--;
+ mutex_unlock(&limit->mutex);
+
+ return true;
+}
+
+/*
+ * Move into place and start executing a preloaded standalone
+ * executable. If nothing was preloaded return an error.
+ */
+int kernel_kexec(void)
+{
+ int error = 0;
+
+ if (!kexec_trylock())
+ return -EBUSY;
+ if (!kexec_image) {
+ error = -EINVAL;
+ goto Unlock;
+ }
+
+ error = liveupdate_reboot();
+ if (error)
+ goto Unlock;
+
+#ifdef CONFIG_KEXEC_JUMP
+ if (kexec_image->preserve_context) {
+ /*
+ * This flow is analogous to hibernation flows that occur
+ * before creating an image and before jumping from the
+ * restore kernel to the image one, so it uses the same
+ * device callbacks as those two flows.
+ */
+ pm_prepare_console();
+ error = freeze_processes();
+ if (error) {
+ error = -EBUSY;
+ goto Restore_console;
+ }
+ console_suspend_all();
+ error = dpm_suspend_start(PMSG_FREEZE);
+ if (error)
+ goto Resume_devices;
+ /*
+ * dpm_suspend_end() must be called after dpm_suspend_start()
+ * to complete the transition, like in the hibernation flows
+ * mentioned above.
+ */
+ error = dpm_suspend_end(PMSG_FREEZE);
+ if (error)
+ goto Resume_devices;
+ error = suspend_disable_secondary_cpus();
+ if (error)
+ goto Enable_cpus;
+ local_irq_disable();
+ error = syscore_suspend();
+ if (error)
+ goto Enable_irqs;
+ } else
+#endif
+ {
+ kexec_in_progress = true;
+ kernel_restart_prepare("kexec reboot");
+ migrate_to_reboot_cpu();
+ syscore_shutdown();
+
+ /*
+ * migrate_to_reboot_cpu() disables CPU hotplug assuming that
+ * no further code needs to use CPU hotplug (which is true in
+ * the reboot case). However, the kexec path depends on using
+ * CPU hotplug again; so re-enable it here.
+ */
+ cpu_hotplug_enable();
+ pr_notice("Starting new kernel\n");
+ machine_shutdown();
+ }
+
+ kmsg_dump(KMSG_DUMP_SHUTDOWN);
+ machine_kexec(kexec_image);
+
+#ifdef CONFIG_KEXEC_JUMP
+ if (kexec_image->preserve_context) {
+ /*
+ * This flow is analogous to hibernation flows that occur after
+ * creating an image and after the image kernel has got control
+ * back, and in case the devices have been reset or otherwise
+ * manipulated in the meantime, it uses the device callbacks
+ * used by the latter.
+ */
+ syscore_resume();
+ Enable_irqs:
+ local_irq_enable();
+ Enable_cpus:
+ suspend_enable_secondary_cpus();
+ dpm_resume_start(PMSG_RESTORE);
+ Resume_devices:
+ dpm_resume_end(PMSG_RESTORE);
+ console_resume_all();
+ thaw_processes();
+ Restore_console:
+ pm_restore_console();
+ }
+#endif
+
+ Unlock:
+ kexec_unlock();
+ return error;
+}
+
+static ssize_t loaded_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", !!kexec_image);
+}
+static struct kobj_attribute loaded_attr = __ATTR_RO(loaded);
+
+#ifdef CONFIG_CRASH_DUMP
+static ssize_t crash_loaded_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", kexec_crash_loaded());
+}
+static struct kobj_attribute crash_loaded_attr = __ATTR_RO(crash_loaded);
+
+#ifdef CONFIG_CRASH_RESERVE
+static ssize_t crash_cma_ranges_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+
+ ssize_t len = 0;
+ int i;
+
+ for (i = 0; i < crashk_cma_cnt; ++i) {
+ len += sysfs_emit_at(buf, len, "%08llx-%08llx\n",
+ crashk_cma_ranges[i].start,
+ crashk_cma_ranges[i].end);
+ }
+ return len;
+}
+static struct kobj_attribute crash_cma_ranges_attr = __ATTR_RO(crash_cma_ranges);
+#endif
+
+static ssize_t crash_size_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ ssize_t size = crash_get_memory_size();
+
+ if (size < 0)
+ return size;
+
+ return sysfs_emit(buf, "%zd\n", size);
+}
+static ssize_t crash_size_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned long cnt;
+ int ret;
+
+ if (kstrtoul(buf, 0, &cnt))
+ return -EINVAL;
+
+ ret = crash_shrink_memory(cnt);
+ return ret < 0 ? ret : count;
+}
+static struct kobj_attribute crash_size_attr = __ATTR_RW(crash_size);
+
+#ifdef CONFIG_CRASH_HOTPLUG
+static ssize_t crash_elfcorehdr_size_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ unsigned int sz = crash_get_elfcorehdr_size();
+
+ return sysfs_emit(buf, "%u\n", sz);
+}
+static struct kobj_attribute crash_elfcorehdr_size_attr = __ATTR_RO(crash_elfcorehdr_size);
+
+#endif /* CONFIG_CRASH_HOTPLUG */
+#endif /* CONFIG_CRASH_DUMP */
+
+static struct attribute *kexec_attrs[] = {
+ &loaded_attr.attr,
+#ifdef CONFIG_CRASH_DUMP
+ &crash_loaded_attr.attr,
+ &crash_size_attr.attr,
+#ifdef CONFIG_CRASH_RESERVE
+ &crash_cma_ranges_attr.attr,
+#endif
+#ifdef CONFIG_CRASH_HOTPLUG
+ &crash_elfcorehdr_size_attr.attr,
+#endif
+#endif
+ NULL
+};
+
+struct kexec_link_entry {
+ const char *target;
+ const char *name;
+};
+
+static struct kexec_link_entry kexec_links[] = {
+ { "loaded", "kexec_loaded" },
+#ifdef CONFIG_CRASH_DUMP
+ { "crash_loaded", "kexec_crash_loaded" },
+ { "crash_size", "kexec_crash_size" },
+#ifdef CONFIG_CRASH_RESERVE
+ {"crash_cma_ranges", "kexec_crash_cma_ranges"},
+#endif
+#ifdef CONFIG_CRASH_HOTPLUG
+ { "crash_elfcorehdr_size", "crash_elfcorehdr_size" },
+#endif
+#endif
+};
+
+static struct kobject *kexec_kobj;
+ATTRIBUTE_GROUPS(kexec);
+
+static int __init init_kexec_sysctl(void)
+{
+ int error;
+ int i;
+
+ kexec_kobj = kobject_create_and_add("kexec", kernel_kobj);
+ if (!kexec_kobj) {
+ pr_err("failed to create kexec kobject\n");
+ return -ENOMEM;
+ }
+
+ error = sysfs_create_groups(kexec_kobj, kexec_groups);
+ if (error)
+ goto kset_exit;
+
+ for (i = 0; i < ARRAY_SIZE(kexec_links); i++) {
+ error = compat_only_sysfs_link_entry_to_kobj(kernel_kobj, kexec_kobj,
+ kexec_links[i].target,
+ kexec_links[i].name);
+ if (error)
+ pr_err("Unable to create %s symlink (%d)", kexec_links[i].name, error);
+ }
+
+ return 0;
+
+kset_exit:
+ kobject_put(kexec_kobj);
+ return error;
+}
+
+subsys_initcall(init_kexec_sysctl);
diff --git a/kernel/kexec_elf.c b/kernel/kexec_elf.c
new file mode 100644
index 000000000000..3a5c25b2adc9
--- /dev/null
+++ b/kernel/kexec_elf.c
@@ -0,0 +1,430 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Load ELF vmlinux file for the kexec_file_load syscall.
+ *
+ * Copyright (C) 2004 Adam Litke (agl@us.ibm.com)
+ * Copyright (C) 2004 IBM Corp.
+ * Copyright (C) 2005 R Sharada (sharada@in.ibm.com)
+ * Copyright (C) 2006 Mohan Kumar M (mohan@in.ibm.com)
+ * Copyright (C) 2016 IBM Corporation
+ *
+ * Based on kexec-tools' kexec-elf-exec.c and kexec-elf-ppc64.c.
+ * Heavily modified for the kernel by
+ * Thiago Jung Bauermann <bauerman@linux.vnet.ibm.com>.
+ */
+
+#define pr_fmt(fmt) "kexec_elf: " fmt
+
+#include <linux/elf.h>
+#include <linux/kexec.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+
+static inline bool elf_is_elf_file(const struct elfhdr *ehdr)
+{
+ return memcmp(ehdr->e_ident, ELFMAG, SELFMAG) == 0;
+}
+
+static uint64_t elf64_to_cpu(const struct elfhdr *ehdr, uint64_t value)
+{
+ if (ehdr->e_ident[EI_DATA] == ELFDATA2LSB)
+ value = le64_to_cpu(value);
+ else if (ehdr->e_ident[EI_DATA] == ELFDATA2MSB)
+ value = be64_to_cpu(value);
+
+ return value;
+}
+
+static uint32_t elf32_to_cpu(const struct elfhdr *ehdr, uint32_t value)
+{
+ if (ehdr->e_ident[EI_DATA] == ELFDATA2LSB)
+ value = le32_to_cpu(value);
+ else if (ehdr->e_ident[EI_DATA] == ELFDATA2MSB)
+ value = be32_to_cpu(value);
+
+ return value;
+}
+
+static uint16_t elf16_to_cpu(const struct elfhdr *ehdr, uint16_t value)
+{
+ if (ehdr->e_ident[EI_DATA] == ELFDATA2LSB)
+ value = le16_to_cpu(value);
+ else if (ehdr->e_ident[EI_DATA] == ELFDATA2MSB)
+ value = be16_to_cpu(value);
+
+ return value;
+}
+
+/**
+ * elf_is_ehdr_sane - check that it is safe to use the ELF header
+ * @buf_len: size of the buffer in which the ELF file is loaded.
+ */
+static bool elf_is_ehdr_sane(const struct elfhdr *ehdr, size_t buf_len)
+{
+ if (ehdr->e_phnum > 0 && ehdr->e_phentsize != sizeof(struct elf_phdr)) {
+ pr_debug("Bad program header size.\n");
+ return false;
+ } else if (ehdr->e_shnum > 0 &&
+ ehdr->e_shentsize != sizeof(struct elf_shdr)) {
+ pr_debug("Bad section header size.\n");
+ return false;
+ } else if (ehdr->e_ident[EI_VERSION] != EV_CURRENT ||
+ ehdr->e_version != EV_CURRENT) {
+ pr_debug("Unknown ELF version.\n");
+ return false;
+ }
+
+ if (ehdr->e_phoff > 0 && ehdr->e_phnum > 0) {
+ size_t phdr_size;
+
+ /*
+ * e_phnum is at most 65535 so calculating the size of the
+ * program header cannot overflow.
+ */
+ phdr_size = sizeof(struct elf_phdr) * ehdr->e_phnum;
+
+ /* Sanity check the program header table location. */
+ if (ehdr->e_phoff + phdr_size < ehdr->e_phoff) {
+ pr_debug("Program headers at invalid location.\n");
+ return false;
+ } else if (ehdr->e_phoff + phdr_size > buf_len) {
+ pr_debug("Program headers truncated.\n");
+ return false;
+ }
+ }
+
+ if (ehdr->e_shoff > 0 && ehdr->e_shnum > 0) {
+ size_t shdr_size;
+
+ /*
+ * e_shnum is at most 65536 so calculating
+ * the size of the section header cannot overflow.
+ */
+ shdr_size = sizeof(struct elf_shdr) * ehdr->e_shnum;
+
+ /* Sanity check the section header table location. */
+ if (ehdr->e_shoff + shdr_size < ehdr->e_shoff) {
+ pr_debug("Section headers at invalid location.\n");
+ return false;
+ } else if (ehdr->e_shoff + shdr_size > buf_len) {
+ pr_debug("Section headers truncated.\n");
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static int elf_read_ehdr(const char *buf, size_t len, struct elfhdr *ehdr)
+{
+ struct elfhdr *buf_ehdr;
+
+ if (len < sizeof(*buf_ehdr)) {
+ pr_debug("Buffer is too small to hold ELF header.\n");
+ return -ENOEXEC;
+ }
+
+ memset(ehdr, 0, sizeof(*ehdr));
+ memcpy(ehdr->e_ident, buf, sizeof(ehdr->e_ident));
+ if (!elf_is_elf_file(ehdr)) {
+ pr_debug("No ELF header magic.\n");
+ return -ENOEXEC;
+ }
+
+ if (ehdr->e_ident[EI_CLASS] != ELF_CLASS) {
+ pr_debug("Not a supported ELF class.\n");
+ return -ENOEXEC;
+ } else if (ehdr->e_ident[EI_DATA] != ELFDATA2LSB &&
+ ehdr->e_ident[EI_DATA] != ELFDATA2MSB) {
+ pr_debug("Not a supported ELF data format.\n");
+ return -ENOEXEC;
+ }
+
+ buf_ehdr = (struct elfhdr *) buf;
+ if (elf16_to_cpu(ehdr, buf_ehdr->e_ehsize) != sizeof(*buf_ehdr)) {
+ pr_debug("Bad ELF header size.\n");
+ return -ENOEXEC;
+ }
+
+ ehdr->e_type = elf16_to_cpu(ehdr, buf_ehdr->e_type);
+ ehdr->e_machine = elf16_to_cpu(ehdr, buf_ehdr->e_machine);
+ ehdr->e_version = elf32_to_cpu(ehdr, buf_ehdr->e_version);
+ ehdr->e_flags = elf32_to_cpu(ehdr, buf_ehdr->e_flags);
+ ehdr->e_phentsize = elf16_to_cpu(ehdr, buf_ehdr->e_phentsize);
+ ehdr->e_phnum = elf16_to_cpu(ehdr, buf_ehdr->e_phnum);
+ ehdr->e_shentsize = elf16_to_cpu(ehdr, buf_ehdr->e_shentsize);
+ ehdr->e_shnum = elf16_to_cpu(ehdr, buf_ehdr->e_shnum);
+ ehdr->e_shstrndx = elf16_to_cpu(ehdr, buf_ehdr->e_shstrndx);
+
+ switch (ehdr->e_ident[EI_CLASS]) {
+ case ELFCLASS64:
+ ehdr->e_entry = elf64_to_cpu(ehdr, buf_ehdr->e_entry);
+ ehdr->e_phoff = elf64_to_cpu(ehdr, buf_ehdr->e_phoff);
+ ehdr->e_shoff = elf64_to_cpu(ehdr, buf_ehdr->e_shoff);
+ break;
+
+ case ELFCLASS32:
+ ehdr->e_entry = elf32_to_cpu(ehdr, buf_ehdr->e_entry);
+ ehdr->e_phoff = elf32_to_cpu(ehdr, buf_ehdr->e_phoff);
+ ehdr->e_shoff = elf32_to_cpu(ehdr, buf_ehdr->e_shoff);
+ break;
+
+ default:
+ pr_debug("Unknown ELF class.\n");
+ return -EINVAL;
+ }
+
+ return elf_is_ehdr_sane(ehdr, len) ? 0 : -ENOEXEC;
+}
+
+/**
+ * elf_is_phdr_sane - check that it is safe to use the program header
+ * @buf_len: size of the buffer in which the ELF file is loaded.
+ */
+static bool elf_is_phdr_sane(const struct elf_phdr *phdr, size_t buf_len)
+{
+
+ if (phdr->p_offset + phdr->p_filesz < phdr->p_offset) {
+ pr_debug("ELF segment location wraps around.\n");
+ return false;
+ } else if (phdr->p_offset + phdr->p_filesz > buf_len) {
+ pr_debug("ELF segment not in file.\n");
+ return false;
+ } else if (phdr->p_paddr + phdr->p_memsz < phdr->p_paddr) {
+ pr_debug("ELF segment address wraps around.\n");
+ return false;
+ }
+
+ return true;
+}
+
+static int elf_read_phdr(const char *buf, size_t len,
+ struct kexec_elf_info *elf_info,
+ int idx)
+{
+ /* Override the const in proghdrs, we are the ones doing the loading. */
+ struct elf_phdr *phdr = (struct elf_phdr *) &elf_info->proghdrs[idx];
+ const struct elfhdr *ehdr = elf_info->ehdr;
+ const char *pbuf;
+ struct elf_phdr *buf_phdr;
+
+ pbuf = buf + elf_info->ehdr->e_phoff + (idx * sizeof(*buf_phdr));
+ buf_phdr = (struct elf_phdr *) pbuf;
+
+ phdr->p_type = elf32_to_cpu(elf_info->ehdr, buf_phdr->p_type);
+ phdr->p_flags = elf32_to_cpu(elf_info->ehdr, buf_phdr->p_flags);
+
+ switch (ehdr->e_ident[EI_CLASS]) {
+ case ELFCLASS64:
+ phdr->p_offset = elf64_to_cpu(ehdr, buf_phdr->p_offset);
+ phdr->p_paddr = elf64_to_cpu(ehdr, buf_phdr->p_paddr);
+ phdr->p_vaddr = elf64_to_cpu(ehdr, buf_phdr->p_vaddr);
+ phdr->p_filesz = elf64_to_cpu(ehdr, buf_phdr->p_filesz);
+ phdr->p_memsz = elf64_to_cpu(ehdr, buf_phdr->p_memsz);
+ phdr->p_align = elf64_to_cpu(ehdr, buf_phdr->p_align);
+ break;
+
+ case ELFCLASS32:
+ phdr->p_offset = elf32_to_cpu(ehdr, buf_phdr->p_offset);
+ phdr->p_paddr = elf32_to_cpu(ehdr, buf_phdr->p_paddr);
+ phdr->p_vaddr = elf32_to_cpu(ehdr, buf_phdr->p_vaddr);
+ phdr->p_filesz = elf32_to_cpu(ehdr, buf_phdr->p_filesz);
+ phdr->p_memsz = elf32_to_cpu(ehdr, buf_phdr->p_memsz);
+ phdr->p_align = elf32_to_cpu(ehdr, buf_phdr->p_align);
+ break;
+
+ default:
+ pr_debug("Unknown ELF class.\n");
+ return -EINVAL;
+ }
+
+ return elf_is_phdr_sane(phdr, len) ? 0 : -ENOEXEC;
+}
+
+/**
+ * elf_read_phdrs - read the program headers from the buffer
+ *
+ * This function assumes that the program header table was checked for sanity.
+ * Use elf_is_ehdr_sane() if it wasn't.
+ */
+static int elf_read_phdrs(const char *buf, size_t len,
+ struct kexec_elf_info *elf_info)
+{
+ size_t phdr_size, i;
+ const struct elfhdr *ehdr = elf_info->ehdr;
+
+ /*
+ * e_phnum is at most 65535 so calculating the size of the
+ * program header cannot overflow.
+ */
+ phdr_size = sizeof(struct elf_phdr) * ehdr->e_phnum;
+
+ elf_info->proghdrs = kzalloc(phdr_size, GFP_KERNEL);
+ if (!elf_info->proghdrs)
+ return -ENOMEM;
+
+ for (i = 0; i < ehdr->e_phnum; i++) {
+ int ret;
+
+ ret = elf_read_phdr(buf, len, elf_info, i);
+ if (ret) {
+ kfree(elf_info->proghdrs);
+ elf_info->proghdrs = NULL;
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * elf_read_from_buffer - read ELF file and sets up ELF header and ELF info
+ * @buf: Buffer to read ELF file from.
+ * @len: Size of @buf.
+ * @ehdr: Pointer to existing struct which will be populated.
+ * @elf_info: Pointer to existing struct which will be populated.
+ *
+ * This function allows reading ELF files with different byte order than
+ * the kernel, byte-swapping the fields as needed.
+ *
+ * Return:
+ * On success returns 0, and the caller should call
+ * kexec_free_elf_info(elf_info) to free the memory allocated for the section
+ * and program headers.
+ */
+static int elf_read_from_buffer(const char *buf, size_t len,
+ struct elfhdr *ehdr,
+ struct kexec_elf_info *elf_info)
+{
+ int ret;
+
+ ret = elf_read_ehdr(buf, len, ehdr);
+ if (ret)
+ return ret;
+
+ elf_info->buffer = buf;
+ elf_info->ehdr = ehdr;
+ if (ehdr->e_phoff > 0 && ehdr->e_phnum > 0) {
+ ret = elf_read_phdrs(buf, len, elf_info);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+/**
+ * kexec_free_elf_info - free memory allocated by elf_read_from_buffer
+ */
+void kexec_free_elf_info(struct kexec_elf_info *elf_info)
+{
+ kfree(elf_info->proghdrs);
+ memset(elf_info, 0, sizeof(*elf_info));
+}
+/**
+ * kexec_build_elf_info - read ELF executable and check that we can use it
+ */
+int kexec_build_elf_info(const char *buf, size_t len, struct elfhdr *ehdr,
+ struct kexec_elf_info *elf_info)
+{
+ int i;
+ int ret;
+
+ ret = elf_read_from_buffer(buf, len, ehdr, elf_info);
+ if (ret)
+ return ret;
+
+ /* Big endian vmlinux has type ET_DYN. */
+ if (ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN) {
+ pr_err("Not an ELF executable.\n");
+ goto error;
+ } else if (!elf_info->proghdrs) {
+ pr_err("No ELF program header.\n");
+ goto error;
+ }
+
+ for (i = 0; i < ehdr->e_phnum; i++) {
+ /*
+ * Kexec does not support loading interpreters.
+ * In addition this check keeps us from attempting
+ * to kexec ordinay executables.
+ */
+ if (elf_info->proghdrs[i].p_type == PT_INTERP) {
+ pr_err("Requires an ELF interpreter.\n");
+ goto error;
+ }
+ }
+
+ return 0;
+error:
+ kexec_free_elf_info(elf_info);
+ return -ENOEXEC;
+}
+
+
+int kexec_elf_probe(const char *buf, unsigned long len)
+{
+ struct elfhdr ehdr;
+ struct kexec_elf_info elf_info;
+ int ret;
+
+ ret = kexec_build_elf_info(buf, len, &ehdr, &elf_info);
+ if (ret)
+ return ret;
+
+ kexec_free_elf_info(&elf_info);
+
+ return elf_check_arch(&ehdr) ? 0 : -ENOEXEC;
+}
+
+/**
+ * kexec_elf_load - load ELF executable image
+ * @lowest_load_addr: On return, will be the address where the first PT_LOAD
+ * section will be loaded in memory.
+ *
+ * Return:
+ * 0 on success, negative value on failure.
+ */
+int kexec_elf_load(struct kimage *image, struct elfhdr *ehdr,
+ struct kexec_elf_info *elf_info,
+ struct kexec_buf *kbuf,
+ unsigned long *lowest_load_addr)
+{
+ unsigned long lowest_addr = ULONG_MAX;
+ int ret;
+ size_t i;
+
+ /* Read in the PT_LOAD segments. */
+ for (i = 0; i < ehdr->e_phnum; i++) {
+ unsigned long load_addr;
+ size_t size;
+ const struct elf_phdr *phdr;
+
+ phdr = &elf_info->proghdrs[i];
+ if (phdr->p_type != PT_LOAD)
+ continue;
+
+ size = phdr->p_filesz;
+ if (size > phdr->p_memsz)
+ size = phdr->p_memsz;
+
+ kbuf->buffer = (void *) elf_info->buffer + phdr->p_offset;
+ kbuf->bufsz = size;
+ kbuf->memsz = phdr->p_memsz;
+ kbuf->buf_align = phdr->p_align;
+ kbuf->buf_min = phdr->p_paddr;
+ kbuf->mem = KEXEC_BUF_MEM_UNKNOWN;
+ ret = kexec_add_buffer(kbuf);
+ if (ret)
+ goto out;
+ load_addr = kbuf->mem;
+
+ if (load_addr < lowest_addr)
+ lowest_addr = load_addr;
+ }
+
+ *lowest_load_addr = lowest_addr;
+ ret = 0;
+ out:
+ return ret;
+}
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
new file mode 100644
index 000000000000..eb62a9794242
--- /dev/null
+++ b/kernel/kexec_file.c
@@ -0,0 +1,1244 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * kexec: kexec_file_load system call
+ *
+ * Copyright (C) 2014 Red Hat Inc.
+ * Authors:
+ * Vivek Goyal <vgoyal@redhat.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/capability.h>
+#include <linux/mm.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/kexec.h>
+#include <linux/memblock.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/fs.h>
+#include <linux/ima.h>
+#include <crypto/sha2.h>
+#include <linux/elf.h>
+#include <linux/elfcore.h>
+#include <linux/kernel.h>
+#include <linux/kernel_read_file.h>
+#include <linux/syscalls.h>
+#include <linux/vmalloc.h>
+#include <linux/dma-map-ops.h>
+#include "kexec_internal.h"
+
+#ifdef CONFIG_KEXEC_SIG
+static bool sig_enforce = IS_ENABLED(CONFIG_KEXEC_SIG_FORCE);
+
+void set_kexec_sig_enforced(void)
+{
+ sig_enforce = true;
+}
+#endif
+
+#ifdef CONFIG_IMA_KEXEC
+static bool check_ima_segment_index(struct kimage *image, int i)
+{
+ if (image->is_ima_segment_index_set && i == image->ima_segment_index)
+ return true;
+ else
+ return false;
+}
+#else
+static bool check_ima_segment_index(struct kimage *image, int i)
+{
+ return false;
+}
+#endif
+
+static int kexec_calculate_store_digests(struct kimage *image);
+
+/* Maximum size in bytes for kernel/initrd files. */
+#define KEXEC_FILE_SIZE_MAX min_t(s64, 4LL << 30, SSIZE_MAX)
+
+/*
+ * Currently this is the only default function that is exported as some
+ * architectures need it to do additional handlings.
+ * In the future, other default functions may be exported too if required.
+ */
+int kexec_image_probe_default(struct kimage *image, void *buf,
+ unsigned long buf_len)
+{
+ const struct kexec_file_ops * const *fops;
+ int ret = -ENOEXEC;
+
+ for (fops = &kexec_file_loaders[0]; *fops && (*fops)->probe; ++fops) {
+ ret = (*fops)->probe(buf, buf_len);
+ if (!ret) {
+ image->fops = *fops;
+ return ret;
+ }
+ }
+
+ return ret;
+}
+
+static void *kexec_image_load_default(struct kimage *image)
+{
+ if (!image->fops || !image->fops->load)
+ return ERR_PTR(-ENOEXEC);
+
+ return image->fops->load(image, image->kernel_buf,
+ image->kernel_buf_len, image->initrd_buf,
+ image->initrd_buf_len, image->cmdline_buf,
+ image->cmdline_buf_len);
+}
+
+int kexec_image_post_load_cleanup_default(struct kimage *image)
+{
+ if (!image->fops || !image->fops->cleanup)
+ return 0;
+
+ return image->fops->cleanup(image->image_loader_data);
+}
+
+/*
+ * Free up memory used by kernel, initrd, and command line. This is temporary
+ * memory allocation which is not needed any more after these buffers have
+ * been loaded into separate segments and have been copied elsewhere.
+ */
+void kimage_file_post_load_cleanup(struct kimage *image)
+{
+ struct purgatory_info *pi = &image->purgatory_info;
+
+ vfree(image->kernel_buf);
+ image->kernel_buf = NULL;
+
+ vfree(image->initrd_buf);
+ image->initrd_buf = NULL;
+
+ kfree(image->cmdline_buf);
+ image->cmdline_buf = NULL;
+
+ vfree(pi->purgatory_buf);
+ pi->purgatory_buf = NULL;
+
+ vfree(pi->sechdrs);
+ pi->sechdrs = NULL;
+
+#ifdef CONFIG_IMA_KEXEC
+ vfree(image->ima_buffer);
+ image->ima_buffer = NULL;
+#endif /* CONFIG_IMA_KEXEC */
+
+ /* See if architecture has anything to cleanup post load */
+ arch_kimage_file_post_load_cleanup(image);
+
+ /*
+ * Above call should have called into bootloader to free up
+ * any data stored in kimage->image_loader_data. It should
+ * be ok now to free it up.
+ */
+ kfree(image->image_loader_data);
+ image->image_loader_data = NULL;
+
+ kexec_file_dbg_print = false;
+}
+
+#ifdef CONFIG_KEXEC_SIG
+#ifdef CONFIG_SIGNED_PE_FILE_VERIFICATION
+int kexec_kernel_verify_pe_sig(const char *kernel, unsigned long kernel_len)
+{
+ int ret;
+
+ ret = verify_pefile_signature(kernel, kernel_len,
+ VERIFY_USE_SECONDARY_KEYRING,
+ VERIFYING_KEXEC_PE_SIGNATURE);
+ if (ret == -ENOKEY && IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING)) {
+ ret = verify_pefile_signature(kernel, kernel_len,
+ VERIFY_USE_PLATFORM_KEYRING,
+ VERIFYING_KEXEC_PE_SIGNATURE);
+ }
+ return ret;
+}
+#endif
+
+static int kexec_image_verify_sig(struct kimage *image, void *buf,
+ unsigned long buf_len)
+{
+ if (!image->fops || !image->fops->verify_sig) {
+ pr_debug("kernel loader does not support signature verification.\n");
+ return -EKEYREJECTED;
+ }
+
+ return image->fops->verify_sig(buf, buf_len);
+}
+
+static int
+kimage_validate_signature(struct kimage *image)
+{
+ int ret;
+
+ ret = kexec_image_verify_sig(image, image->kernel_buf,
+ image->kernel_buf_len);
+ if (ret) {
+
+ if (sig_enforce) {
+ pr_notice("Enforced kernel signature verification failed (%d).\n", ret);
+ return ret;
+ }
+
+ /*
+ * If IMA is guaranteed to appraise a signature on the kexec
+ * image, permit it even if the kernel is otherwise locked
+ * down.
+ */
+ if (!ima_appraise_signature(READING_KEXEC_IMAGE) &&
+ security_locked_down(LOCKDOWN_KEXEC))
+ return -EPERM;
+
+ pr_debug("kernel signature verification failed (%d).\n", ret);
+ }
+
+ return 0;
+}
+#endif
+
+static int kexec_post_load(struct kimage *image, unsigned long flags)
+{
+#ifdef CONFIG_IMA_KEXEC
+ if (!(flags & KEXEC_FILE_ON_CRASH))
+ ima_kexec_post_load(image);
+#endif
+ return machine_kexec_post_load(image);
+}
+
+/*
+ * In file mode list of segments is prepared by kernel. Copy relevant
+ * data from user space, do error checking, prepare segment list
+ */
+static int
+kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
+ const char __user *cmdline_ptr,
+ unsigned long cmdline_len, unsigned flags)
+{
+ ssize_t ret;
+ void *ldata;
+
+ ret = kernel_read_file_from_fd(kernel_fd, 0, &image->kernel_buf,
+ KEXEC_FILE_SIZE_MAX, NULL,
+ READING_KEXEC_IMAGE);
+ if (ret < 0)
+ return ret;
+ image->kernel_buf_len = ret;
+ kexec_dprintk("kernel: %p kernel_size: %#lx\n",
+ image->kernel_buf, image->kernel_buf_len);
+
+ /* Call arch image probe handlers */
+ ret = arch_kexec_kernel_image_probe(image, image->kernel_buf,
+ image->kernel_buf_len);
+ if (ret)
+ goto out;
+
+#ifdef CONFIG_KEXEC_SIG
+ ret = kimage_validate_signature(image);
+
+ if (ret)
+ goto out;
+#endif
+ /* It is possible that there no initramfs is being loaded */
+ if (!(flags & KEXEC_FILE_NO_INITRAMFS)) {
+ ret = kernel_read_file_from_fd(initrd_fd, 0, &image->initrd_buf,
+ KEXEC_FILE_SIZE_MAX, NULL,
+ READING_KEXEC_INITRAMFS);
+ if (ret < 0)
+ goto out;
+ image->initrd_buf_len = ret;
+ ret = 0;
+ }
+
+ image->no_cma = !!(flags & KEXEC_FILE_NO_CMA);
+ image->force_dtb = flags & KEXEC_FILE_FORCE_DTB;
+
+ if (cmdline_len) {
+ image->cmdline_buf = memdup_user(cmdline_ptr, cmdline_len);
+ if (IS_ERR(image->cmdline_buf)) {
+ ret = PTR_ERR(image->cmdline_buf);
+ image->cmdline_buf = NULL;
+ goto out;
+ }
+
+ image->cmdline_buf_len = cmdline_len;
+
+ /* command line should be a string with last byte null */
+ if (image->cmdline_buf[cmdline_len - 1] != '\0') {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ima_kexec_cmdline(kernel_fd, image->cmdline_buf,
+ image->cmdline_buf_len - 1);
+ }
+
+ /* IMA needs to pass the measurement list to the next kernel. */
+ ima_add_kexec_buffer(image);
+
+ /* If KHO is active, add its images to the list */
+ ret = kho_fill_kimage(image);
+ if (ret)
+ goto out;
+
+ /* Call image load handler */
+ ldata = kexec_image_load_default(image);
+
+ if (IS_ERR(ldata)) {
+ ret = PTR_ERR(ldata);
+ goto out;
+ }
+
+ image->image_loader_data = ldata;
+out:
+ /* In case of error, free up all allocated memory in this function */
+ if (ret)
+ kimage_file_post_load_cleanup(image);
+ return ret;
+}
+
+static int
+kimage_file_alloc_init(struct kimage **rimage, int kernel_fd,
+ int initrd_fd, const char __user *cmdline_ptr,
+ unsigned long cmdline_len, unsigned long flags)
+{
+ int ret;
+ struct kimage *image;
+ bool kexec_on_panic = flags & KEXEC_FILE_ON_CRASH;
+
+ image = do_kimage_alloc_init();
+ if (!image)
+ return -ENOMEM;
+
+ kexec_file_dbg_print = !!(flags & KEXEC_FILE_DEBUG);
+ image->file_mode = 1;
+
+#ifdef CONFIG_CRASH_DUMP
+ if (kexec_on_panic) {
+ /* Enable special crash kernel control page alloc policy. */
+ image->control_page = crashk_res.start;
+ image->type = KEXEC_TYPE_CRASH;
+ }
+#endif
+
+ ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd,
+ cmdline_ptr, cmdline_len, flags);
+ if (ret)
+ goto out_free_image;
+
+ ret = sanity_check_segment_list(image);
+ if (ret)
+ goto out_free_post_load_bufs;
+
+ ret = -ENOMEM;
+ image->control_code_page = kimage_alloc_control_pages(image,
+ get_order(KEXEC_CONTROL_PAGE_SIZE));
+ if (!image->control_code_page) {
+ pr_err("Could not allocate control_code_buffer\n");
+ goto out_free_post_load_bufs;
+ }
+
+ if (!kexec_on_panic) {
+ image->swap_page = kimage_alloc_control_pages(image, 0);
+ if (!image->swap_page) {
+ pr_err("Could not allocate swap buffer\n");
+ goto out_free_control_pages;
+ }
+ }
+
+ *rimage = image;
+ return 0;
+out_free_control_pages:
+ kimage_free_page_list(&image->control_pages);
+out_free_post_load_bufs:
+ kimage_file_post_load_cleanup(image);
+out_free_image:
+ kfree(image);
+ return ret;
+}
+
+SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
+ unsigned long, cmdline_len, const char __user *, cmdline_ptr,
+ unsigned long, flags)
+{
+ int image_type = (flags & KEXEC_FILE_ON_CRASH) ?
+ KEXEC_TYPE_CRASH : KEXEC_TYPE_DEFAULT;
+ struct kimage **dest_image, *image;
+ int ret = 0, i;
+
+ /* We only trust the superuser with rebooting the system. */
+ if (!kexec_load_permitted(image_type))
+ return -EPERM;
+
+ /* Make sure we have a legal set of flags */
+ if (flags != (flags & KEXEC_FILE_FLAGS))
+ return -EINVAL;
+
+ image = NULL;
+
+ if (!kexec_trylock())
+ return -EBUSY;
+
+#ifdef CONFIG_CRASH_DUMP
+ if (image_type == KEXEC_TYPE_CRASH) {
+ dest_image = &kexec_crash_image;
+ if (kexec_crash_image)
+ arch_kexec_unprotect_crashkres();
+ } else
+#endif
+ dest_image = &kexec_image;
+
+ if (flags & KEXEC_FILE_UNLOAD)
+ goto exchange;
+
+ /*
+ * In case of crash, new kernel gets loaded in reserved region. It is
+ * same memory where old crash kernel might be loaded. Free any
+ * current crash dump kernel before we corrupt it.
+ */
+ if (flags & KEXEC_FILE_ON_CRASH)
+ kimage_free(xchg(&kexec_crash_image, NULL));
+
+ ret = kimage_file_alloc_init(&image, kernel_fd, initrd_fd, cmdline_ptr,
+ cmdline_len, flags);
+ if (ret)
+ goto out;
+
+#ifdef CONFIG_CRASH_HOTPLUG
+ if ((flags & KEXEC_FILE_ON_CRASH) && arch_crash_hotplug_support(image, flags))
+ image->hotplug_support = 1;
+#endif
+
+ ret = machine_kexec_prepare(image);
+ if (ret)
+ goto out;
+
+ /*
+ * Some architecture(like S390) may touch the crash memory before
+ * machine_kexec_prepare(), we must copy vmcoreinfo data after it.
+ */
+ ret = kimage_crash_copy_vmcoreinfo(image);
+ if (ret)
+ goto out;
+
+ ret = kexec_calculate_store_digests(image);
+ if (ret)
+ goto out;
+
+ kexec_dprintk("nr_segments = %lu\n", image->nr_segments);
+ for (i = 0; i < image->nr_segments; i++) {
+ struct kexec_segment *ksegment;
+
+ ksegment = &image->segment[i];
+ kexec_dprintk("segment[%d]: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n",
+ i, ksegment->buf, ksegment->bufsz, ksegment->mem,
+ ksegment->memsz);
+
+ ret = kimage_load_segment(image, i);
+ if (ret)
+ goto out;
+ }
+
+ kimage_terminate(image);
+
+ ret = kexec_post_load(image, flags);
+ if (ret)
+ goto out;
+
+ kexec_dprintk("kexec_file_load: type:%u, start:0x%lx head:0x%lx flags:0x%lx\n",
+ image->type, image->start, image->head, flags);
+ /*
+ * Free up any temporary buffers allocated which are not needed
+ * after image has been loaded
+ */
+ kimage_file_post_load_cleanup(image);
+exchange:
+ image = xchg(dest_image, image);
+out:
+#ifdef CONFIG_CRASH_DUMP
+ if ((flags & KEXEC_FILE_ON_CRASH) && kexec_crash_image)
+ arch_kexec_protect_crashkres();
+#endif
+
+ kexec_unlock();
+ kimage_free(image);
+ return ret;
+}
+
+static int locate_mem_hole_top_down(unsigned long start, unsigned long end,
+ struct kexec_buf *kbuf)
+{
+ struct kimage *image = kbuf->image;
+ unsigned long temp_start, temp_end;
+
+ temp_end = min(end, kbuf->buf_max);
+ temp_start = temp_end - kbuf->memsz + 1;
+ kexec_random_range_start(temp_start, temp_end, kbuf, &temp_start);
+
+ do {
+ /* align down start */
+ temp_start = ALIGN_DOWN(temp_start, kbuf->buf_align);
+
+ if (temp_start < start || temp_start < kbuf->buf_min)
+ return 0;
+
+ temp_end = temp_start + kbuf->memsz - 1;
+
+ /*
+ * Make sure this does not conflict with any of existing
+ * segments
+ */
+ if (kimage_is_destination_range(image, temp_start, temp_end)) {
+ temp_start = temp_start - PAGE_SIZE;
+ continue;
+ }
+
+ /* Make sure this does not conflict with exclude range */
+ if (arch_check_excluded_range(image, temp_start, temp_end)) {
+ temp_start = temp_start - PAGE_SIZE;
+ continue;
+ }
+
+ /* We found a suitable memory range */
+ break;
+ } while (1);
+
+ /* If we are here, we found a suitable memory range */
+ kbuf->mem = temp_start;
+
+ /* Success, stop navigating through remaining System RAM ranges */
+ return 1;
+}
+
+static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end,
+ struct kexec_buf *kbuf)
+{
+ struct kimage *image = kbuf->image;
+ unsigned long temp_start, temp_end;
+
+ temp_start = max(start, kbuf->buf_min);
+
+ kexec_random_range_start(temp_start, end, kbuf, &temp_start);
+
+ do {
+ temp_start = ALIGN(temp_start, kbuf->buf_align);
+ temp_end = temp_start + kbuf->memsz - 1;
+
+ if (temp_end > end || temp_end > kbuf->buf_max)
+ return 0;
+ /*
+ * Make sure this does not conflict with any of existing
+ * segments
+ */
+ if (kimage_is_destination_range(image, temp_start, temp_end)) {
+ temp_start = temp_start + PAGE_SIZE;
+ continue;
+ }
+
+ /* Make sure this does not conflict with exclude range */
+ if (arch_check_excluded_range(image, temp_start, temp_end)) {
+ temp_start = temp_start + PAGE_SIZE;
+ continue;
+ }
+
+ /* We found a suitable memory range */
+ break;
+ } while (1);
+
+ /* If we are here, we found a suitable memory range */
+ kbuf->mem = temp_start;
+
+ /* Success, stop navigating through remaining System RAM ranges */
+ return 1;
+}
+
+static int locate_mem_hole_callback(struct resource *res, void *arg)
+{
+ struct kexec_buf *kbuf = (struct kexec_buf *)arg;
+ u64 start = res->start, end = res->end;
+ unsigned long sz = end - start + 1;
+
+ /* Returning 0 will take to next memory range */
+
+ /* Don't use memory that will be detected and handled by a driver. */
+ if (res->flags & IORESOURCE_SYSRAM_DRIVER_MANAGED)
+ return 0;
+
+ if (sz < kbuf->memsz)
+ return 0;
+
+ if (end < kbuf->buf_min || start > kbuf->buf_max)
+ return 0;
+
+ /*
+ * Allocate memory top down with-in ram range. Otherwise bottom up
+ * allocation.
+ */
+ if (kbuf->top_down)
+ return locate_mem_hole_top_down(start, end, kbuf);
+ return locate_mem_hole_bottom_up(start, end, kbuf);
+}
+
+#ifdef CONFIG_ARCH_KEEP_MEMBLOCK
+static int kexec_walk_memblock(struct kexec_buf *kbuf,
+ int (*func)(struct resource *, void *))
+{
+ int ret = 0;
+ u64 i;
+ phys_addr_t mstart, mend;
+ struct resource res = { };
+
+#ifdef CONFIG_CRASH_DUMP
+ if (kbuf->image->type == KEXEC_TYPE_CRASH)
+ return func(&crashk_res, kbuf);
+#endif
+
+ /*
+ * Using MEMBLOCK_NONE will properly skip MEMBLOCK_DRIVER_MANAGED. See
+ * IORESOURCE_SYSRAM_DRIVER_MANAGED handling in
+ * locate_mem_hole_callback().
+ */
+ if (kbuf->top_down) {
+ for_each_free_mem_range_reverse(i, NUMA_NO_NODE, MEMBLOCK_NONE,
+ &mstart, &mend, NULL) {
+ /*
+ * In memblock, end points to the first byte after the
+ * range while in kexec, end points to the last byte
+ * in the range.
+ */
+ res.start = mstart;
+ res.end = mend - 1;
+ ret = func(&res, kbuf);
+ if (ret)
+ break;
+ }
+ } else {
+ for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE,
+ &mstart, &mend, NULL) {
+ /*
+ * In memblock, end points to the first byte after the
+ * range while in kexec, end points to the last byte
+ * in the range.
+ */
+ res.start = mstart;
+ res.end = mend - 1;
+ ret = func(&res, kbuf);
+ if (ret)
+ break;
+ }
+ }
+
+ return ret;
+}
+#else
+static int kexec_walk_memblock(struct kexec_buf *kbuf,
+ int (*func)(struct resource *, void *))
+{
+ return 0;
+}
+#endif
+
+/**
+ * kexec_walk_resources - call func(data) on free memory regions
+ * @kbuf: Context info for the search. Also passed to @func.
+ * @func: Function to call for each memory region.
+ *
+ * Return: The memory walk will stop when func returns a non-zero value
+ * and that value will be returned. If all free regions are visited without
+ * func returning non-zero, then zero will be returned.
+ */
+static int kexec_walk_resources(struct kexec_buf *kbuf,
+ int (*func)(struct resource *, void *))
+{
+#ifdef CONFIG_CRASH_DUMP
+ if (kbuf->image->type == KEXEC_TYPE_CRASH)
+ return walk_iomem_res_desc(crashk_res.desc,
+ IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
+ crashk_res.start, crashk_res.end,
+ kbuf, func);
+#endif
+ if (kbuf->top_down)
+ return walk_system_ram_res_rev(0, ULONG_MAX, kbuf, func);
+ else
+ return walk_system_ram_res(0, ULONG_MAX, kbuf, func);
+}
+
+static int kexec_alloc_contig(struct kexec_buf *kbuf)
+{
+ size_t nr_pages = kbuf->memsz >> PAGE_SHIFT;
+ unsigned long mem;
+ struct page *p;
+
+ /* User space disabled CMA allocations, bail out. */
+ if (kbuf->image->no_cma)
+ return -EPERM;
+
+ /* Skip CMA logic for crash kernel */
+ if (kbuf->image->type == KEXEC_TYPE_CRASH)
+ return -EPERM;
+
+ p = dma_alloc_from_contiguous(NULL, nr_pages, get_order(kbuf->buf_align), true);
+ if (!p)
+ return -ENOMEM;
+
+ pr_debug("allocated %zu DMA pages at 0x%lx", nr_pages, page_to_boot_pfn(p));
+
+ mem = page_to_boot_pfn(p) << PAGE_SHIFT;
+
+ if (kimage_is_destination_range(kbuf->image, mem, mem + kbuf->memsz)) {
+ /* Our region is already in use by a statically defined one. Bail out. */
+ pr_debug("CMA overlaps existing mem: 0x%lx+0x%lx\n", mem, kbuf->memsz);
+ dma_release_from_contiguous(NULL, p, nr_pages);
+ return -EBUSY;
+ }
+
+ kbuf->mem = page_to_boot_pfn(p) << PAGE_SHIFT;
+ kbuf->cma = p;
+
+ arch_kexec_post_alloc_pages(page_address(p), (int)nr_pages, 0);
+
+ return 0;
+}
+
+/**
+ * kexec_locate_mem_hole - find free memory for the purgatory or the next kernel
+ * @kbuf: Parameters for the memory search.
+ *
+ * On success, kbuf->mem will have the start address of the memory region found.
+ *
+ * Return: 0 on success, negative errno on error.
+ */
+int kexec_locate_mem_hole(struct kexec_buf *kbuf)
+{
+ int ret;
+
+ /* Arch knows where to place */
+ if (kbuf->mem != KEXEC_BUF_MEM_UNKNOWN)
+ return 0;
+
+ /*
+ * If KHO is active, only use KHO scratch memory. All other memory
+ * could potentially be handed over.
+ */
+ ret = kho_locate_mem_hole(kbuf, locate_mem_hole_callback);
+ if (ret <= 0)
+ return ret;
+
+ /*
+ * Try to find a free physically contiguous block of memory first. With that, we
+ * can avoid any copying at kexec time.
+ */
+ if (!kexec_alloc_contig(kbuf))
+ return 0;
+
+ if (!IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
+ ret = kexec_walk_resources(kbuf, locate_mem_hole_callback);
+ else
+ ret = kexec_walk_memblock(kbuf, locate_mem_hole_callback);
+
+ return ret == 1 ? 0 : -EADDRNOTAVAIL;
+}
+
+/**
+ * kexec_add_buffer - place a buffer in a kexec segment
+ * @kbuf: Buffer contents and memory parameters.
+ *
+ * This function assumes that kexec_lock is held.
+ * On successful return, @kbuf->mem will have the physical address of
+ * the buffer in memory.
+ *
+ * Return: 0 on success, negative errno on error.
+ */
+int kexec_add_buffer(struct kexec_buf *kbuf)
+{
+ struct kexec_segment *ksegment;
+ int ret;
+
+ /* Currently adding segment this way is allowed only in file mode */
+ if (!kbuf->image->file_mode)
+ return -EINVAL;
+
+ if (kbuf->image->nr_segments >= KEXEC_SEGMENT_MAX)
+ return -EINVAL;
+
+ /*
+ * Make sure we are not trying to add buffer after allocating
+ * control pages. All segments need to be placed first before
+ * any control pages are allocated. As control page allocation
+ * logic goes through list of segments to make sure there are
+ * no destination overlaps.
+ */
+ if (!list_empty(&kbuf->image->control_pages)) {
+ WARN_ON(1);
+ return -EINVAL;
+ }
+
+ /* Ensure minimum alignment needed for segments. */
+ kbuf->memsz = ALIGN(kbuf->memsz, PAGE_SIZE);
+ kbuf->buf_align = max(kbuf->buf_align, PAGE_SIZE);
+ kbuf->cma = NULL;
+
+ /* Walk the RAM ranges and allocate a suitable range for the buffer */
+ ret = arch_kexec_locate_mem_hole(kbuf);
+ if (ret)
+ return ret;
+
+ /* Found a suitable memory range */
+ ksegment = &kbuf->image->segment[kbuf->image->nr_segments];
+ ksegment->kbuf = kbuf->buffer;
+ ksegment->bufsz = kbuf->bufsz;
+ ksegment->mem = kbuf->mem;
+ ksegment->memsz = kbuf->memsz;
+ kbuf->image->segment_cma[kbuf->image->nr_segments] = kbuf->cma;
+ kbuf->image->nr_segments++;
+ return 0;
+}
+
+/* Calculate and store the digest of segments */
+static int kexec_calculate_store_digests(struct kimage *image)
+{
+ struct sha256_ctx sctx;
+ int ret = 0, i, j, zero_buf_sz, sha_region_sz;
+ size_t nullsz;
+ u8 digest[SHA256_DIGEST_SIZE];
+ void *zero_buf;
+ struct kexec_sha_region *sha_regions;
+ struct purgatory_info *pi = &image->purgatory_info;
+
+ if (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_KEXEC_PURGATORY))
+ return 0;
+
+ zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT);
+ zero_buf_sz = PAGE_SIZE;
+
+ sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region);
+ sha_regions = vzalloc(sha_region_sz);
+ if (!sha_regions)
+ return -ENOMEM;
+
+ sha256_init(&sctx);
+
+ for (j = i = 0; i < image->nr_segments; i++) {
+ struct kexec_segment *ksegment;
+
+#ifdef CONFIG_CRASH_HOTPLUG
+ /* Exclude elfcorehdr segment to allow future changes via hotplug */
+ if (i == image->elfcorehdr_index)
+ continue;
+#endif
+
+ ksegment = &image->segment[i];
+ /*
+ * Skip purgatory as it will be modified once we put digest
+ * info in purgatory.
+ */
+ if (ksegment->kbuf == pi->purgatory_buf)
+ continue;
+
+ /*
+ * Skip the segment if ima_segment_index is set and matches
+ * the current index
+ */
+ if (check_ima_segment_index(image, i))
+ continue;
+
+ sha256_update(&sctx, ksegment->kbuf, ksegment->bufsz);
+
+ /*
+ * Assume rest of the buffer is filled with zero and
+ * update digest accordingly.
+ */
+ nullsz = ksegment->memsz - ksegment->bufsz;
+ while (nullsz) {
+ unsigned long bytes = nullsz;
+
+ if (bytes > zero_buf_sz)
+ bytes = zero_buf_sz;
+ sha256_update(&sctx, zero_buf, bytes);
+ nullsz -= bytes;
+ }
+
+ sha_regions[j].start = ksegment->mem;
+ sha_regions[j].len = ksegment->memsz;
+ j++;
+ }
+
+ sha256_final(&sctx, digest);
+
+ ret = kexec_purgatory_get_set_symbol(image, "purgatory_sha_regions",
+ sha_regions, sha_region_sz, 0);
+ if (ret)
+ goto out_free_sha_regions;
+
+ ret = kexec_purgatory_get_set_symbol(image, "purgatory_sha256_digest",
+ digest, SHA256_DIGEST_SIZE, 0);
+out_free_sha_regions:
+ vfree(sha_regions);
+ return ret;
+}
+
+#ifdef CONFIG_ARCH_SUPPORTS_KEXEC_PURGATORY
+/*
+ * kexec_purgatory_setup_kbuf - prepare buffer to load purgatory.
+ * @pi: Purgatory to be loaded.
+ * @kbuf: Buffer to setup.
+ *
+ * Allocates the memory needed for the buffer. Caller is responsible to free
+ * the memory after use.
+ *
+ * Return: 0 on success, negative errno on error.
+ */
+static int kexec_purgatory_setup_kbuf(struct purgatory_info *pi,
+ struct kexec_buf *kbuf)
+{
+ const Elf_Shdr *sechdrs;
+ unsigned long bss_align;
+ unsigned long bss_sz;
+ unsigned long align;
+ int i, ret;
+
+ sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff;
+ kbuf->buf_align = bss_align = 1;
+ kbuf->bufsz = bss_sz = 0;
+
+ for (i = 0; i < pi->ehdr->e_shnum; i++) {
+ if (!(sechdrs[i].sh_flags & SHF_ALLOC))
+ continue;
+
+ align = sechdrs[i].sh_addralign;
+ if (sechdrs[i].sh_type != SHT_NOBITS) {
+ if (kbuf->buf_align < align)
+ kbuf->buf_align = align;
+ kbuf->bufsz = ALIGN(kbuf->bufsz, align);
+ kbuf->bufsz += sechdrs[i].sh_size;
+ } else {
+ if (bss_align < align)
+ bss_align = align;
+ bss_sz = ALIGN(bss_sz, align);
+ bss_sz += sechdrs[i].sh_size;
+ }
+ }
+ kbuf->bufsz = ALIGN(kbuf->bufsz, bss_align);
+ kbuf->memsz = kbuf->bufsz + bss_sz;
+ if (kbuf->buf_align < bss_align)
+ kbuf->buf_align = bss_align;
+
+ kbuf->buffer = vzalloc(kbuf->bufsz);
+ if (!kbuf->buffer)
+ return -ENOMEM;
+ pi->purgatory_buf = kbuf->buffer;
+
+ ret = kexec_add_buffer(kbuf);
+ if (ret)
+ goto out;
+
+ return 0;
+out:
+ vfree(pi->purgatory_buf);
+ pi->purgatory_buf = NULL;
+ return ret;
+}
+
+/*
+ * kexec_purgatory_setup_sechdrs - prepares the pi->sechdrs buffer.
+ * @pi: Purgatory to be loaded.
+ * @kbuf: Buffer prepared to store purgatory.
+ *
+ * Allocates the memory needed for the buffer. Caller is responsible to free
+ * the memory after use.
+ *
+ * Return: 0 on success, negative errno on error.
+ */
+static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi,
+ struct kexec_buf *kbuf)
+{
+ unsigned long bss_addr;
+ unsigned long offset;
+ size_t sechdrs_size;
+ Elf_Shdr *sechdrs;
+ int i;
+
+ /*
+ * The section headers in kexec_purgatory are read-only. In order to
+ * have them modifiable make a temporary copy.
+ */
+ sechdrs_size = array_size(sizeof(Elf_Shdr), pi->ehdr->e_shnum);
+ sechdrs = vzalloc(sechdrs_size);
+ if (!sechdrs)
+ return -ENOMEM;
+ memcpy(sechdrs, (void *)pi->ehdr + pi->ehdr->e_shoff, sechdrs_size);
+ pi->sechdrs = sechdrs;
+
+ offset = 0;
+ bss_addr = kbuf->mem + kbuf->bufsz;
+ kbuf->image->start = pi->ehdr->e_entry;
+
+ for (i = 0; i < pi->ehdr->e_shnum; i++) {
+ unsigned long align;
+ void *src, *dst;
+
+ if (!(sechdrs[i].sh_flags & SHF_ALLOC))
+ continue;
+
+ align = sechdrs[i].sh_addralign;
+ if (sechdrs[i].sh_type == SHT_NOBITS) {
+ bss_addr = ALIGN(bss_addr, align);
+ sechdrs[i].sh_addr = bss_addr;
+ bss_addr += sechdrs[i].sh_size;
+ continue;
+ }
+
+ offset = ALIGN(offset, align);
+
+ /*
+ * Check if the segment contains the entry point, if so,
+ * calculate the value of image->start based on it.
+ * If the compiler has produced more than one .text section
+ * (Eg: .text.hot), they are generally after the main .text
+ * section, and they shall not be used to calculate
+ * image->start. So do not re-calculate image->start if it
+ * is not set to the initial value, and warn the user so they
+ * have a chance to fix their purgatory's linker script.
+ */
+ if (sechdrs[i].sh_flags & SHF_EXECINSTR &&
+ pi->ehdr->e_entry >= sechdrs[i].sh_addr &&
+ pi->ehdr->e_entry < (sechdrs[i].sh_addr
+ + sechdrs[i].sh_size) &&
+ !WARN_ON(kbuf->image->start != pi->ehdr->e_entry)) {
+ kbuf->image->start -= sechdrs[i].sh_addr;
+ kbuf->image->start += kbuf->mem + offset;
+ }
+
+ src = (void *)pi->ehdr + sechdrs[i].sh_offset;
+ dst = pi->purgatory_buf + offset;
+ memcpy(dst, src, sechdrs[i].sh_size);
+
+ sechdrs[i].sh_addr = kbuf->mem + offset;
+ sechdrs[i].sh_offset = offset;
+ offset += sechdrs[i].sh_size;
+ }
+
+ return 0;
+}
+
+static int kexec_apply_relocations(struct kimage *image)
+{
+ int i, ret;
+ struct purgatory_info *pi = &image->purgatory_info;
+ const Elf_Shdr *sechdrs;
+
+ sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff;
+
+ for (i = 0; i < pi->ehdr->e_shnum; i++) {
+ const Elf_Shdr *relsec;
+ const Elf_Shdr *symtab;
+ Elf_Shdr *section;
+
+ relsec = sechdrs + i;
+
+ if (relsec->sh_type != SHT_RELA &&
+ relsec->sh_type != SHT_REL)
+ continue;
+
+ /*
+ * For section of type SHT_RELA/SHT_REL,
+ * ->sh_link contains section header index of associated
+ * symbol table. And ->sh_info contains section header
+ * index of section to which relocations apply.
+ */
+ if (relsec->sh_info >= pi->ehdr->e_shnum ||
+ relsec->sh_link >= pi->ehdr->e_shnum)
+ return -ENOEXEC;
+
+ section = pi->sechdrs + relsec->sh_info;
+ symtab = sechdrs + relsec->sh_link;
+
+ if (!(section->sh_flags & SHF_ALLOC))
+ continue;
+
+ /*
+ * symtab->sh_link contain section header index of associated
+ * string table.
+ */
+ if (symtab->sh_link >= pi->ehdr->e_shnum)
+ /* Invalid section number? */
+ continue;
+
+ /*
+ * Respective architecture needs to provide support for applying
+ * relocations of type SHT_RELA/SHT_REL.
+ */
+ if (relsec->sh_type == SHT_RELA)
+ ret = arch_kexec_apply_relocations_add(pi, section,
+ relsec, symtab);
+ else if (relsec->sh_type == SHT_REL)
+ ret = arch_kexec_apply_relocations(pi, section,
+ relsec, symtab);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+/*
+ * kexec_load_purgatory - Load and relocate the purgatory object.
+ * @image: Image to add the purgatory to.
+ * @kbuf: Memory parameters to use.
+ *
+ * Allocates the memory needed for image->purgatory_info.sechdrs and
+ * image->purgatory_info.purgatory_buf/kbuf->buffer. Caller is responsible
+ * to free the memory after use.
+ *
+ * Return: 0 on success, negative errno on error.
+ */
+int kexec_load_purgatory(struct kimage *image, struct kexec_buf *kbuf)
+{
+ struct purgatory_info *pi = &image->purgatory_info;
+ int ret;
+
+ if (kexec_purgatory_size <= 0)
+ return -EINVAL;
+
+ pi->ehdr = (const Elf_Ehdr *)kexec_purgatory;
+
+ ret = kexec_purgatory_setup_kbuf(pi, kbuf);
+ if (ret)
+ return ret;
+
+ ret = kexec_purgatory_setup_sechdrs(pi, kbuf);
+ if (ret)
+ goto out_free_kbuf;
+
+ ret = kexec_apply_relocations(image);
+ if (ret)
+ goto out;
+
+ return 0;
+out:
+ vfree(pi->sechdrs);
+ pi->sechdrs = NULL;
+out_free_kbuf:
+ vfree(pi->purgatory_buf);
+ pi->purgatory_buf = NULL;
+ return ret;
+}
+
+/*
+ * kexec_purgatory_find_symbol - find a symbol in the purgatory
+ * @pi: Purgatory to search in.
+ * @name: Name of the symbol.
+ *
+ * Return: pointer to symbol in read-only symtab on success, NULL on error.
+ */
+static const Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi,
+ const char *name)
+{
+ const Elf_Shdr *sechdrs;
+ const Elf_Ehdr *ehdr;
+ const Elf_Sym *syms;
+ const char *strtab;
+ int i, k;
+
+ if (!pi->ehdr)
+ return NULL;
+
+ ehdr = pi->ehdr;
+ sechdrs = (void *)ehdr + ehdr->e_shoff;
+
+ for (i = 0; i < ehdr->e_shnum; i++) {
+ if (sechdrs[i].sh_type != SHT_SYMTAB)
+ continue;
+
+ if (sechdrs[i].sh_link >= ehdr->e_shnum)
+ /* Invalid strtab section number */
+ continue;
+ strtab = (void *)ehdr + sechdrs[sechdrs[i].sh_link].sh_offset;
+ syms = (void *)ehdr + sechdrs[i].sh_offset;
+
+ /* Go through symbols for a match */
+ for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) {
+ if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL)
+ continue;
+
+ if (strcmp(strtab + syms[k].st_name, name) != 0)
+ continue;
+
+ if (syms[k].st_shndx == SHN_UNDEF ||
+ syms[k].st_shndx >= ehdr->e_shnum) {
+ pr_debug("Symbol: %s has bad section index %d.\n",
+ name, syms[k].st_shndx);
+ return NULL;
+ }
+
+ /* Found the symbol we are looking for */
+ return &syms[k];
+ }
+ }
+
+ return NULL;
+}
+
+void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name)
+{
+ struct purgatory_info *pi = &image->purgatory_info;
+ const Elf_Sym *sym;
+ Elf_Shdr *sechdr;
+
+ sym = kexec_purgatory_find_symbol(pi, name);
+ if (!sym)
+ return ERR_PTR(-EINVAL);
+
+ sechdr = &pi->sechdrs[sym->st_shndx];
+
+ /*
+ * Returns the address where symbol will finally be loaded after
+ * kexec_load_segment()
+ */
+ return (void *)(sechdr->sh_addr + sym->st_value);
+}
+
+/*
+ * Get or set value of a symbol. If "get_value" is true, symbol value is
+ * returned in buf otherwise symbol value is set based on value in buf.
+ */
+int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
+ void *buf, unsigned int size, bool get_value)
+{
+ struct purgatory_info *pi = &image->purgatory_info;
+ const Elf_Sym *sym;
+ Elf_Shdr *sec;
+ char *sym_buf;
+
+ sym = kexec_purgatory_find_symbol(pi, name);
+ if (!sym)
+ return -EINVAL;
+
+ if (sym->st_size != size) {
+ pr_err("symbol %s size mismatch: expected %lu actual %u\n",
+ name, (unsigned long)sym->st_size, size);
+ return -EINVAL;
+ }
+
+ sec = pi->sechdrs + sym->st_shndx;
+
+ if (sec->sh_type == SHT_NOBITS) {
+ pr_err("symbol %s is in a bss section. Cannot %s\n", name,
+ get_value ? "get" : "set");
+ return -EINVAL;
+ }
+
+ sym_buf = (char *)pi->purgatory_buf + sec->sh_offset + sym->st_value;
+
+ if (get_value)
+ memcpy((void *)buf, sym_buf, size);
+ else
+ memcpy((void *)sym_buf, buf, size);
+
+ return 0;
+}
+#endif /* CONFIG_ARCH_SUPPORTS_KEXEC_PURGATORY */
diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h
new file mode 100644
index 000000000000..228bb88c018b
--- /dev/null
+++ b/kernel/kexec_internal.h
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef LINUX_KEXEC_INTERNAL_H
+#define LINUX_KEXEC_INTERNAL_H
+
+#include <linux/kexec.h>
+
+struct kexec_segment;
+
+struct kimage *do_kimage_alloc_init(void);
+int sanity_check_segment_list(struct kimage *image);
+void kimage_free_page_list(struct list_head *list);
+void kimage_free(struct kimage *image);
+int kimage_load_segment(struct kimage *image, int idx);
+void kimage_terminate(struct kimage *image);
+int kimage_is_destination_range(struct kimage *image,
+ unsigned long start, unsigned long end);
+
+/*
+ * Whatever is used to serialize accesses to the kexec_crash_image needs to be
+ * NMI safe, as __crash_kexec() can happen during nmi_panic(), so here we use a
+ * "simple" atomic variable that is acquired with a cmpxchg().
+ */
+extern atomic_t __kexec_lock;
+static inline bool kexec_trylock(void)
+{
+ int old = 0;
+ return atomic_try_cmpxchg_acquire(&__kexec_lock, &old, 1);
+}
+static inline void kexec_unlock(void)
+{
+ atomic_set_release(&__kexec_lock, 0);
+}
+
+#ifdef CONFIG_KEXEC_FILE
+#include <linux/purgatory.h>
+void kimage_file_post_load_cleanup(struct kimage *image);
+extern char kexec_purgatory[];
+extern size_t kexec_purgatory_size;
+#else /* CONFIG_KEXEC_FILE */
+static inline void kimage_file_post_load_cleanup(struct kimage *image) { }
+#endif /* CONFIG_KEXEC_FILE */
+
+struct kexec_buf;
+
+#ifdef CONFIG_KEXEC_HANDOVER
+int kho_locate_mem_hole(struct kexec_buf *kbuf,
+ int (*func)(struct resource *, void *));
+int kho_fill_kimage(struct kimage *image);
+#else
+static inline int kho_locate_mem_hole(struct kexec_buf *kbuf,
+ int (*func)(struct resource *, void *))
+{
+ return 1;
+}
+
+static inline int kho_fill_kimage(struct kimage *image) { return 0; }
+#endif /* CONFIG_KEXEC_HANDOVER */
+#endif /* LINUX_KEXEC_INTERNAL_H */
diff --git a/kernel/kheaders.c b/kernel/kheaders.c
new file mode 100644
index 000000000000..378088b07f46
--- /dev/null
+++ b/kernel/kheaders.c
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Provide kernel headers useful to build tracing programs
+ * such as for running eBPF tracing tools.
+ *
+ * (Borrowed code from kernel/configs.c)
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/kobject.h>
+#include <linux/init.h>
+
+/*
+ * Define kernel_headers_data and kernel_headers_data_end, within which the
+ * compressed kernel headers are stored. The file is first compressed with xz.
+ */
+
+asm (
+" .pushsection .rodata, \"a\" \n"
+" .global kernel_headers_data \n"
+"kernel_headers_data: \n"
+" .incbin \"kernel/kheaders_data.tar.xz\" \n"
+" .global kernel_headers_data_end \n"
+"kernel_headers_data_end: \n"
+" .popsection \n"
+);
+
+extern char kernel_headers_data[];
+extern char kernel_headers_data_end[];
+
+static struct bin_attribute kheaders_attr __ro_after_init =
+ __BIN_ATTR_SIMPLE_RO(kheaders.tar.xz, 0444);
+
+static int __init ikheaders_init(void)
+{
+ kheaders_attr.private = kernel_headers_data;
+ kheaders_attr.size = (kernel_headers_data_end -
+ kernel_headers_data);
+ return sysfs_create_bin_file(kernel_kobj, &kheaders_attr);
+}
+
+static void __exit ikheaders_cleanup(void)
+{
+ sysfs_remove_bin_file(kernel_kobj, &kheaders_attr);
+}
+
+module_init(ikheaders_init);
+module_exit(ikheaders_cleanup);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Joel Fernandes");
+MODULE_DESCRIPTION("Echo the kernel header artifacts used to build the kernel");
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index c90e417bb963..ab8f9fc1f0d1 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1,20 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Kernel Probes (KProbes)
- * kernel/kprobes.c
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
* Copyright (C) IBM Corporation, 2002, 2004
*
@@ -31,13 +17,15 @@
* <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
* <prasanna@in.ibm.com> added function-return probes.
*/
+
+#define pr_fmt(fmt) "kprobes: " fmt
+
#include <linux/kprobes.h>
#include <linux/hash.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/stddef.h>
#include <linux/export.h>
-#include <linux/moduleloader.h>
#include <linux/kallsyms.h>
#include <linux/freezer.h>
#include <linux/seq_file.h>
@@ -48,50 +36,53 @@
#include <linux/ftrace.h>
#include <linux/cpu.h>
#include <linux/jump_label.h>
+#include <linux/static_call.h>
+#include <linux/perf_event.h>
+#include <linux/execmem.h>
+#include <linux/cleanup.h>
-#include <asm-generic/sections.h>
+#include <asm/sections.h>
#include <asm/cacheflush.h>
#include <asm/errno.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#define KPROBE_HASH_BITS 6
#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS)
-
-/*
- * Some oddball architectures like 64bit powerpc have function descriptors
- * so this must be overridable.
- */
-#ifndef kprobe_lookup_name
-#define kprobe_lookup_name(name, addr) \
- addr = ((kprobe_opcode_t *)(kallsyms_lookup_name(name)))
+#if !defined(CONFIG_OPTPROBES) || !defined(CONFIG_SYSCTL)
+#define kprobe_sysctls_init() do { } while (0)
#endif
static int kprobes_initialized;
+/* kprobe_table can be accessed by
+ * - Normal hlist traversal and RCU add/del under 'kprobe_mutex' is held.
+ * Or
+ * - RCU hlist traversal under disabling preempt (breakpoint handlers)
+ */
static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
-static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
-/* NOTE: change this value only with kprobe_mutex held */
+/* NOTE: change this value only with 'kprobe_mutex' held */
static bool kprobes_all_disarmed;
-/* This protects kprobe_table and optimizing_list */
+/* This protects 'kprobe_table' and 'optimizing_list' */
static DEFINE_MUTEX(kprobe_mutex);
-static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
-static struct {
- raw_spinlock_t lock ____cacheline_aligned_in_smp;
-} kretprobe_table_locks[KPROBE_TABLE_SIZE];
+static DEFINE_PER_CPU(struct kprobe *, kprobe_instance);
-static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
+kprobe_opcode_t * __weak kprobe_lookup_name(const char *name,
+ unsigned int __unused)
{
- return &(kretprobe_table_locks[hash].lock);
+ return ((kprobe_opcode_t *)(kallsyms_lookup_name(name)));
}
-/* Blacklist -- list of struct kprobe_blacklist_entry */
+/*
+ * Blacklist -- list of 'struct kprobe_blacklist_entry' to store info where
+ * kprobes can not probe.
+ */
static LIST_HEAD(kprobe_blacklist);
#ifdef __ARCH_WANT_KPROBES_INSN_SLOT
/*
- * kprobe->ainsn.insn points to the copy of the instruction to be
+ * 'kprobe::ainsn.insn' points to the copy of the instruction to be
* single-stepped. x86_64, POWER4 and above have no-exec support and
* stepping on the instruction on a vmalloced/kmalloced/data page
* is a recipe for disaster
@@ -105,10 +96,6 @@ struct kprobe_insn_page {
char slot_used[];
};
-#define KPROBE_INSN_PAGE_SIZE(slots) \
- (offsetof(struct kprobe_insn_page, slot_used) + \
- (sizeof(char) * (slots)))
-
static int slots_per_page(struct kprobe_insn_cache *c)
{
return PAGE_SIZE/(c->insn_size * sizeof(kprobe_opcode_t));
@@ -120,20 +107,27 @@ enum kprobe_slot_state {
SLOT_USED = 2,
};
-static void *alloc_insn_page(void)
+void __weak *alloc_insn_page(void)
{
- return module_alloc(PAGE_SIZE);
+ /*
+ * Use execmem_alloc() so this page is within +/- 2GB of where the
+ * kernel image and loaded module images reside. This is required
+ * for most of the architectures.
+ * (e.g. x86-64 needs this to handle the %rip-relative fixups.)
+ */
+ return execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE);
}
static void free_insn_page(void *page)
{
- module_memfree(page);
+ execmem_free(page);
}
struct kprobe_insn_cache kprobe_insn_slots = {
.mutex = __MUTEX_INITIALIZER(kprobe_insn_slots.mutex),
.alloc = alloc_insn_page,
.free = free_insn_page,
+ .sym = KPROBE_INSN_PAGE_SYM,
.pages = LIST_HEAD_INIT(kprobe_insn_slots.pages),
.insn_size = MAX_INSN_SIZE,
.nr_garbage = 0,
@@ -141,51 +135,49 @@ struct kprobe_insn_cache kprobe_insn_slots = {
static int collect_garbage_slots(struct kprobe_insn_cache *c);
/**
- * __get_insn_slot() - Find a slot on an executable page for an instruction.
- * We allocate an executable page if there's no room on existing ones.
+ * __get_insn_slot - Find a slot on an executable page for an instruction.
+ * @c: Pointer to kprobe instruction cache
+ *
+ * Description: Locates available slot on existing executable pages,
+ * allocates an executable page if there's no room on existing ones.
+ * Return: Pointer to instruction slot on success, NULL on failure.
*/
kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
{
struct kprobe_insn_page *kip;
- kprobe_opcode_t *slot = NULL;
-
- mutex_lock(&c->mutex);
- retry:
- list_for_each_entry(kip, &c->pages, list) {
- if (kip->nused < slots_per_page(c)) {
- int i;
- for (i = 0; i < slots_per_page(c); i++) {
- if (kip->slot_used[i] == SLOT_CLEAN) {
- kip->slot_used[i] = SLOT_USED;
- kip->nused++;
- slot = kip->insns + (i * c->insn_size);
- goto out;
+
+ /* Since the slot array is not protected by rcu, we need a mutex */
+ guard(mutex)(&c->mutex);
+ do {
+ guard(rcu)();
+ list_for_each_entry_rcu(kip, &c->pages, list) {
+ if (kip->nused < slots_per_page(c)) {
+ int i;
+
+ for (i = 0; i < slots_per_page(c); i++) {
+ if (kip->slot_used[i] == SLOT_CLEAN) {
+ kip->slot_used[i] = SLOT_USED;
+ kip->nused++;
+ return kip->insns + (i * c->insn_size);
+ }
}
+ /* kip->nused is broken. Fix it. */
+ kip->nused = slots_per_page(c);
+ WARN_ON(1);
}
- /* kip->nused is broken. Fix it. */
- kip->nused = slots_per_page(c);
- WARN_ON(1);
}
- }
-
/* If there are any garbage slots, collect it and try again. */
- if (c->nr_garbage && collect_garbage_slots(c) == 0)
- goto retry;
+ } while (c->nr_garbage && collect_garbage_slots(c) == 0);
/* All out of space. Need to allocate a new page. */
- kip = kmalloc(KPROBE_INSN_PAGE_SIZE(slots_per_page(c)), GFP_KERNEL);
+ kip = kmalloc(struct_size(kip, slot_used, slots_per_page(c)), GFP_KERNEL);
if (!kip)
- goto out;
+ return NULL;
- /*
- * Use module_alloc so this page is within +/- 2GB of where the
- * kernel image and loaded module images reside. This is required
- * so x86_64 can correctly handle the %rip-relative fixups.
- */
kip->insns = c->alloc();
if (!kip->insns) {
kfree(kip);
- goto out;
+ return NULL;
}
INIT_LIST_HEAD(&kip->list);
memset(kip->slot_used, SLOT_CLEAN, slots_per_page(c));
@@ -193,33 +185,43 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
kip->nused = 1;
kip->ngarbage = 0;
kip->cache = c;
- list_add(&kip->list, &c->pages);
- slot = kip->insns;
-out:
- mutex_unlock(&c->mutex);
- return slot;
+ list_add_rcu(&kip->list, &c->pages);
+
+ /* Record the perf ksymbol register event after adding the page */
+ perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, (unsigned long)kip->insns,
+ PAGE_SIZE, false, c->sym);
+
+ return kip->insns;
}
-/* Return 1 if all garbages are collected, otherwise 0. */
-static int collect_one_slot(struct kprobe_insn_page *kip, int idx)
+/* Return true if all garbages are collected, otherwise false. */
+static bool collect_one_slot(struct kprobe_insn_page *kip, int idx)
{
kip->slot_used[idx] = SLOT_CLEAN;
kip->nused--;
- if (kip->nused == 0) {
+ if (kip->nused != 0)
+ return false;
+
+ /*
+ * Page is no longer in use. Free it unless
+ * it's the last one. We keep the last one
+ * so as not to have to set it up again the
+ * next time somebody inserts a probe.
+ */
+ if (!list_is_singular(&kip->list)) {
/*
- * Page is no longer in use. Free it unless
- * it's the last one. We keep the last one
- * so as not to have to set it up again the
- * next time somebody inserts a probe.
+ * Record perf ksymbol unregister event before removing
+ * the page.
*/
- if (!list_is_singular(&kip->list)) {
- list_del(&kip->list);
- kip->cache->free(kip->insns);
- kfree(kip);
- }
- return 1;
+ perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL,
+ (unsigned long)kip->insns, PAGE_SIZE, true,
+ kip->cache->sym);
+ list_del_rcu(&kip->list);
+ synchronize_rcu();
+ kip->cache->free(kip->insns);
+ kfree(kip);
}
- return 0;
+ return true;
}
static int collect_garbage_slots(struct kprobe_insn_cache *c)
@@ -227,16 +229,16 @@ static int collect_garbage_slots(struct kprobe_insn_cache *c)
struct kprobe_insn_page *kip, *next;
/* Ensure no-one is interrupted on the garbages */
- synchronize_sched();
+ synchronize_rcu();
list_for_each_entry_safe(kip, next, &c->pages, list) {
int i;
+
if (kip->ngarbage == 0)
continue;
kip->ngarbage = 0; /* we will collect all garbages */
for (i = 0; i < slots_per_page(c); i++) {
- if (kip->slot_used[i] == SLOT_DIRTY &&
- collect_one_slot(kip, i))
+ if (kip->slot_used[i] == SLOT_DIRTY && collect_one_slot(kip, i))
break;
}
}
@@ -244,45 +246,117 @@ static int collect_garbage_slots(struct kprobe_insn_cache *c)
return 0;
}
+static long __find_insn_page(struct kprobe_insn_cache *c,
+ kprobe_opcode_t *slot, struct kprobe_insn_page **pkip)
+{
+ struct kprobe_insn_page *kip = NULL;
+ long idx;
+
+ guard(rcu)();
+ list_for_each_entry_rcu(kip, &c->pages, list) {
+ idx = ((long)slot - (long)kip->insns) /
+ (c->insn_size * sizeof(kprobe_opcode_t));
+ if (idx >= 0 && idx < slots_per_page(c)) {
+ *pkip = kip;
+ return idx;
+ }
+ }
+ /* Could not find this slot. */
+ WARN_ON(1);
+ *pkip = NULL;
+ return -1;
+}
+
void __free_insn_slot(struct kprobe_insn_cache *c,
kprobe_opcode_t *slot, int dirty)
{
+ struct kprobe_insn_page *kip = NULL;
+ long idx;
+
+ guard(mutex)(&c->mutex);
+ idx = __find_insn_page(c, slot, &kip);
+ /* Mark and sweep: this may sleep */
+ if (kip) {
+ /* Check double free */
+ WARN_ON(kip->slot_used[idx] != SLOT_USED);
+ if (dirty) {
+ kip->slot_used[idx] = SLOT_DIRTY;
+ kip->ngarbage++;
+ if (++c->nr_garbage > slots_per_page(c))
+ collect_garbage_slots(c);
+ } else {
+ collect_one_slot(kip, idx);
+ }
+ }
+}
+
+/*
+ * Check given address is on the page of kprobe instruction slots.
+ * This will be used for checking whether the address on a stack
+ * is on a text area or not.
+ */
+bool __is_insn_slot_addr(struct kprobe_insn_cache *c, unsigned long addr)
+{
struct kprobe_insn_page *kip;
+ bool ret = false;
- mutex_lock(&c->mutex);
- list_for_each_entry(kip, &c->pages, list) {
- long idx = ((long)slot - (long)kip->insns) /
- (c->insn_size * sizeof(kprobe_opcode_t));
- if (idx >= 0 && idx < slots_per_page(c)) {
- WARN_ON(kip->slot_used[idx] != SLOT_USED);
- if (dirty) {
- kip->slot_used[idx] = SLOT_DIRTY;
- kip->ngarbage++;
- if (++c->nr_garbage > slots_per_page(c))
- collect_garbage_slots(c);
- } else
- collect_one_slot(kip, idx);
- goto out;
+ rcu_read_lock();
+ list_for_each_entry_rcu(kip, &c->pages, list) {
+ if (addr >= (unsigned long)kip->insns &&
+ addr < (unsigned long)kip->insns + PAGE_SIZE) {
+ ret = true;
+ break;
}
}
- /* Could not free this slot. */
- WARN_ON(1);
-out:
- mutex_unlock(&c->mutex);
+ rcu_read_unlock();
+
+ return ret;
+}
+
+int kprobe_cache_get_kallsym(struct kprobe_insn_cache *c, unsigned int *symnum,
+ unsigned long *value, char *type, char *sym)
+{
+ struct kprobe_insn_page *kip;
+ int ret = -ERANGE;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(kip, &c->pages, list) {
+ if ((*symnum)--)
+ continue;
+ strscpy(sym, c->sym, KSYM_NAME_LEN);
+ *type = 't';
+ *value = (unsigned long)kip->insns;
+ ret = 0;
+ break;
+ }
+ rcu_read_unlock();
+
+ return ret;
}
#ifdef CONFIG_OPTPROBES
+void __weak *alloc_optinsn_page(void)
+{
+ return alloc_insn_page();
+}
+
+void __weak free_optinsn_page(void *page)
+{
+ free_insn_page(page);
+}
+
/* For optimized_kprobe buffer */
struct kprobe_insn_cache kprobe_optinsn_slots = {
.mutex = __MUTEX_INITIALIZER(kprobe_optinsn_slots.mutex),
- .alloc = alloc_insn_page,
- .free = free_insn_page,
+ .alloc = alloc_optinsn_page,
+ .free = free_optinsn_page,
+ .sym = KPROBE_OPTINSN_PAGE_SYM,
.pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages),
/* .insn_size is initialized later */
.nr_garbage = 0,
};
-#endif
-#endif
+#endif /* CONFIG_OPTPROBES */
+#endif /* __ARCH_WANT_KPROBES_INSN_SLOT */
/* We have preemption disabled.. so it is safe to use __ versions */
static inline void set_kprobe_instance(struct kprobe *kp)
@@ -297,9 +371,9 @@ static inline void reset_kprobe_instance(void)
/*
* This routine is called either:
- * - under the kprobe_mutex - during kprobe_[un]register()
- * OR
- * - with preemption disabled - from arch/xxx/kernel/kprobes.c
+ * - under the 'kprobe_mutex' - during kprobe_[un]register().
+ * OR
+ * - with preemption disabled - from architecture specific code.
*/
struct kprobe *get_kprobe(void *addr)
{
@@ -307,7 +381,8 @@ struct kprobe *get_kprobe(void *addr)
struct kprobe *p;
head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)];
- hlist_for_each_entry_rcu(p, head, hlist) {
+ hlist_for_each_entry_rcu(p, head, hlist,
+ lockdep_is_held(&kprobe_mutex)) {
if (p->addr == addr)
return p;
}
@@ -318,22 +393,20 @@ NOKPROBE_SYMBOL(get_kprobe);
static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs);
-/* Return true if the kprobe is an aggregator */
-static inline int kprobe_aggrprobe(struct kprobe *p)
+/* Return true if 'p' is an aggregator */
+static inline bool kprobe_aggrprobe(struct kprobe *p)
{
return p->pre_handler == aggr_pre_handler;
}
-/* Return true(!0) if the kprobe is unused */
-static inline int kprobe_unused(struct kprobe *p)
+/* Return true if 'p' is unused */
+static inline bool kprobe_unused(struct kprobe *p)
{
return kprobe_aggrprobe(p) && kprobe_disabled(p) &&
list_empty(&p->list);
}
-/*
- * Keep all fields in the kprobe consistent
- */
+/* Keep all fields in the kprobe consistent. */
static inline void copy_kprobe(struct kprobe *ap, struct kprobe *p)
{
memcpy(&p->opcode, &ap->opcode, sizeof(kprobe_opcode_t));
@@ -341,11 +414,11 @@ static inline void copy_kprobe(struct kprobe *ap, struct kprobe *p)
}
#ifdef CONFIG_OPTPROBES
-/* NOTE: change this value only with kprobe_mutex held */
+/* NOTE: This is protected by 'kprobe_mutex'. */
static bool kprobes_allow_optimization;
/*
- * Call all pre_handler on the list, but ignores its return value.
+ * Call all 'kprobe::pre_handler' on the list, but ignores its return value.
* This must be called from arch-dep optimized caller.
*/
void opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
@@ -373,7 +446,7 @@ static void free_aggr_kprobe(struct kprobe *p)
kfree(op);
}
-/* Return true(!0) if the kprobe is ready for optimization. */
+/* Return true if the kprobe is ready for optimization. */
static inline int kprobe_optready(struct kprobe *p)
{
struct optimized_kprobe *op;
@@ -386,8 +459,8 @@ static inline int kprobe_optready(struct kprobe *p)
return 0;
}
-/* Return true(!0) if the kprobe is disarmed. Note: p must be on hash list */
-static inline int kprobe_disarmed(struct kprobe *p)
+/* Return true if the kprobe is disarmed. Note: p must be on hash list */
+bool kprobe_disarmed(struct kprobe *p)
{
struct optimized_kprobe *op;
@@ -400,32 +473,32 @@ static inline int kprobe_disarmed(struct kprobe *p)
return kprobe_disabled(p) && list_empty(&op->list);
}
-/* Return true(!0) if the probe is queued on (un)optimizing lists */
-static int kprobe_queued(struct kprobe *p)
+/* Return true if the probe is queued on (un)optimizing lists */
+static bool kprobe_queued(struct kprobe *p)
{
struct optimized_kprobe *op;
if (kprobe_aggrprobe(p)) {
op = container_of(p, struct optimized_kprobe, kp);
if (!list_empty(&op->list))
- return 1;
+ return true;
}
- return 0;
+ return false;
}
/*
* Return an optimized kprobe whose optimizing code replaces
- * instructions including addr (exclude breakpoint).
+ * instructions including 'addr' (exclude breakpoint).
*/
-static struct kprobe *get_optimized_kprobe(unsigned long addr)
+static struct kprobe *get_optimized_kprobe(kprobe_opcode_t *addr)
{
int i;
struct kprobe *p = NULL;
struct optimized_kprobe *op;
/* Don't check i == 0, since that is a breakpoint case. */
- for (i = 1; !p && i < MAX_OPTIMIZED_LENGTH; i++)
- p = get_kprobe((void *)(addr - i));
+ for (i = 1; !p && i < MAX_OPTIMIZED_LENGTH / sizeof(kprobe_opcode_t); i++)
+ p = get_kprobe(addr - i);
if (p && kprobe_optready(p)) {
op = container_of(p, struct optimized_kprobe, kp);
@@ -436,7 +509,7 @@ static struct kprobe *get_optimized_kprobe(unsigned long addr)
return NULL;
}
-/* Optimization staging list, protected by kprobe_mutex */
+/* Optimization staging list, protected by 'kprobe_mutex' */
static LIST_HEAD(optimizing_list);
static LIST_HEAD(unoptimizing_list);
static LIST_HEAD(freeing_list);
@@ -447,75 +520,79 @@ static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
/*
* Optimize (replace a breakpoint with a jump) kprobes listed on
- * optimizing_list.
+ * 'optimizing_list'.
*/
static void do_optimize_kprobes(void)
{
+ lockdep_assert_held(&text_mutex);
+ /*
+ * The optimization/unoptimization refers 'online_cpus' via
+ * stop_machine() and cpu-hotplug modifies the 'online_cpus'.
+ * And same time, 'text_mutex' will be held in cpu-hotplug and here.
+ * This combination can cause a deadlock (cpu-hotplug tries to lock
+ * 'text_mutex' but stop_machine() can not be done because
+ * the 'online_cpus' has been changed)
+ * To avoid this deadlock, caller must have locked cpu-hotplug
+ * for preventing cpu-hotplug outside of 'text_mutex' locking.
+ */
+ lockdep_assert_cpus_held();
+
/* Optimization never be done when disarmed */
if (kprobes_all_disarmed || !kprobes_allow_optimization ||
list_empty(&optimizing_list))
return;
- /*
- * The optimization/unoptimization refers online_cpus via
- * stop_machine() and cpu-hotplug modifies online_cpus.
- * And same time, text_mutex will be held in cpu-hotplug and here.
- * This combination can cause a deadlock (cpu-hotplug try to lock
- * text_mutex but stop_machine can not be done because online_cpus
- * has been changed)
- * To avoid this deadlock, we need to call get_online_cpus()
- * for preventing cpu-hotplug outside of text_mutex locking.
- */
- get_online_cpus();
- mutex_lock(&text_mutex);
arch_optimize_kprobes(&optimizing_list);
- mutex_unlock(&text_mutex);
- put_online_cpus();
}
/*
* Unoptimize (replace a jump with a breakpoint and remove the breakpoint
- * if need) kprobes listed on unoptimizing_list.
+ * if need) kprobes listed on 'unoptimizing_list'.
*/
static void do_unoptimize_kprobes(void)
{
struct optimized_kprobe *op, *tmp;
- /* Unoptimization must be done anytime */
- if (list_empty(&unoptimizing_list))
- return;
+ lockdep_assert_held(&text_mutex);
+ /* See comment in do_optimize_kprobes() */
+ lockdep_assert_cpus_held();
+
+ if (!list_empty(&unoptimizing_list))
+ arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list);
- /* Ditto to do_optimize_kprobes */
- get_online_cpus();
- mutex_lock(&text_mutex);
- arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list);
- /* Loop free_list for disarming */
+ /* Loop on 'freeing_list' for disarming and removing from kprobe hash list */
list_for_each_entry_safe(op, tmp, &freeing_list, list) {
- /* Disarm probes if marked disabled */
- if (kprobe_disabled(&op->kp))
+ /* Switching from detour code to origin */
+ op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
+ /* Disarm probes if marked disabled and not gone */
+ if (kprobe_disabled(&op->kp) && !kprobe_gone(&op->kp))
arch_disarm_kprobe(&op->kp);
if (kprobe_unused(&op->kp)) {
/*
* Remove unused probes from hash list. After waiting
* for synchronization, these probes are reclaimed.
- * (reclaiming is done by do_free_cleaned_kprobes.)
+ * (reclaiming is done by do_free_cleaned_kprobes().)
*/
hlist_del_rcu(&op->kp.hlist);
} else
list_del_init(&op->list);
}
- mutex_unlock(&text_mutex);
- put_online_cpus();
}
-/* Reclaim all kprobes on the free_list */
+/* Reclaim all kprobes on the 'freeing_list' */
static void do_free_cleaned_kprobes(void)
{
struct optimized_kprobe *op, *tmp;
list_for_each_entry_safe(op, tmp, &freeing_list, list) {
- BUG_ON(!kprobe_unused(&op->kp));
list_del_init(&op->list);
+ if (WARN_ON_ONCE(!kprobe_unused(&op->kp))) {
+ /*
+ * This must not happen, but if there is a kprobe
+ * still in use, keep it on kprobes hash list.
+ */
+ continue;
+ }
free_aggr_kprobe(&op->kp);
}
}
@@ -529,56 +606,74 @@ static void kick_kprobe_optimizer(void)
/* Kprobe jump optimizer */
static void kprobe_optimizer(struct work_struct *work)
{
- mutex_lock(&kprobe_mutex);
- /* Lock modules while optimizing kprobes */
- mutex_lock(&module_mutex);
+ guard(mutex)(&kprobe_mutex);
- /*
- * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed)
- * kprobes before waiting for quiesence period.
- */
- do_unoptimize_kprobes();
+ scoped_guard(cpus_read_lock) {
+ guard(mutex)(&text_mutex);
- /*
- * Step 2: Wait for quiesence period to ensure all running interrupts
- * are done. Because optprobe may modify multiple instructions
- * there is a chance that Nth instruction is interrupted. In that
- * case, running interrupt can return to 2nd-Nth byte of jump
- * instruction. This wait is for avoiding it.
- */
- synchronize_sched();
+ /*
+ * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed)
+ * kprobes before waiting for quiesence period.
+ */
+ do_unoptimize_kprobes();
- /* Step 3: Optimize kprobes after quiesence period */
- do_optimize_kprobes();
+ /*
+ * Step 2: Wait for quiesence period to ensure all potentially
+ * preempted tasks to have normally scheduled. Because optprobe
+ * may modify multiple instructions, there is a chance that Nth
+ * instruction is preempted. In that case, such tasks can return
+ * to 2nd-Nth byte of jump instruction. This wait is for avoiding it.
+ * Note that on non-preemptive kernel, this is transparently converted
+ * to synchronoze_sched() to wait for all interrupts to have completed.
+ */
+ synchronize_rcu_tasks();
- /* Step 4: Free cleaned kprobes after quiesence period */
- do_free_cleaned_kprobes();
+ /* Step 3: Optimize kprobes after quiesence period */
+ do_optimize_kprobes();
- mutex_unlock(&module_mutex);
- mutex_unlock(&kprobe_mutex);
+ /* Step 4: Free cleaned kprobes after quiesence period */
+ do_free_cleaned_kprobes();
+ }
/* Step 5: Kick optimizer again if needed */
if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list))
kick_kprobe_optimizer();
}
-/* Wait for completing optimization and unoptimization */
-static void wait_for_kprobe_optimizer(void)
+static void wait_for_kprobe_optimizer_locked(void)
{
- mutex_lock(&kprobe_mutex);
+ lockdep_assert_held(&kprobe_mutex);
while (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) {
mutex_unlock(&kprobe_mutex);
- /* this will also make optimizing_work execute immmediately */
+ /* This will also make 'optimizing_work' execute immmediately */
flush_delayed_work(&optimizing_work);
- /* @optimizing_work might not have been queued yet, relax */
+ /* 'optimizing_work' might not have been queued yet, relax */
cpu_relax();
mutex_lock(&kprobe_mutex);
}
+}
- mutex_unlock(&kprobe_mutex);
+/* Wait for completing optimization and unoptimization */
+void wait_for_kprobe_optimizer(void)
+{
+ guard(mutex)(&kprobe_mutex);
+
+ wait_for_kprobe_optimizer_locked();
+}
+
+bool optprobe_queued_unopt(struct optimized_kprobe *op)
+{
+ struct optimized_kprobe *_op;
+
+ list_for_each_entry(_op, &unoptimizing_list, list) {
+ if (op == _op)
+ return true;
+ }
+
+ return false;
}
/* Optimize kprobe if p is ready to be optimized */
@@ -591,8 +686,8 @@ static void optimize_kprobe(struct kprobe *p)
(kprobe_disabled(p) || kprobes_all_disarmed))
return;
- /* Both of break_handler and post_handler are not supported. */
- if (p->break_handler || p->post_handler)
+ /* kprobes with 'post_handler' can not be optimized */
+ if (p->post_handler)
return;
op = container_of(p, struct optimized_kprobe, kp);
@@ -602,27 +697,32 @@ static void optimize_kprobe(struct kprobe *p)
return;
/* Check if it is already optimized. */
- if (op->kp.flags & KPROBE_FLAG_OPTIMIZED)
+ if (op->kp.flags & KPROBE_FLAG_OPTIMIZED) {
+ if (optprobe_queued_unopt(op)) {
+ /* This is under unoptimizing. Just dequeue the probe */
+ list_del_init(&op->list);
+ }
return;
+ }
op->kp.flags |= KPROBE_FLAG_OPTIMIZED;
- if (!list_empty(&op->list))
- /* This is under unoptimizing. Just dequeue the probe */
- list_del_init(&op->list);
- else {
- list_add(&op->list, &optimizing_list);
- kick_kprobe_optimizer();
- }
+ /*
+ * On the 'unoptimizing_list' and 'optimizing_list',
+ * 'op' must have OPTIMIZED flag
+ */
+ if (WARN_ON_ONCE(!list_empty(&op->list)))
+ return;
+
+ list_add(&op->list, &optimizing_list);
+ kick_kprobe_optimizer();
}
/* Short cut to direct unoptimizing */
static void force_unoptimize_kprobe(struct optimized_kprobe *op)
{
- get_online_cpus();
+ lockdep_assert_cpus_held();
arch_unoptimize_kprobe(op);
- put_online_cpus();
- if (kprobe_disabled(&op->kp))
- arch_disarm_kprobe(&op->kp);
+ op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
}
/* Unoptimize a kprobe if p is optimized */
@@ -634,55 +734,57 @@ static void unoptimize_kprobe(struct kprobe *p, bool force)
return; /* This is not an optprobe nor optimized */
op = container_of(p, struct optimized_kprobe, kp);
- if (!kprobe_optimized(p)) {
- /* Unoptimized or unoptimizing case */
- if (force && !list_empty(&op->list)) {
- /*
- * Only if this is unoptimizing kprobe and forced,
- * forcibly unoptimize it. (No need to unoptimize
- * unoptimized kprobe again :)
- */
- list_del_init(&op->list);
- force_unoptimize_kprobe(op);
- }
+ if (!kprobe_optimized(p))
return;
- }
- op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
if (!list_empty(&op->list)) {
- /* Dequeue from the optimization queue */
- list_del_init(&op->list);
+ if (optprobe_queued_unopt(op)) {
+ /* Queued in unoptimizing queue */
+ if (force) {
+ /*
+ * Forcibly unoptimize the kprobe here, and queue it
+ * in the freeing list for release afterwards.
+ */
+ force_unoptimize_kprobe(op);
+ list_move(&op->list, &freeing_list);
+ }
+ } else {
+ /* Dequeue from the optimizing queue */
+ list_del_init(&op->list);
+ op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
+ }
return;
}
+
/* Optimized kprobe case */
- if (force)
+ if (force) {
/* Forcibly update the code: this is a special case */
force_unoptimize_kprobe(op);
- else {
+ } else {
list_add(&op->list, &unoptimizing_list);
kick_kprobe_optimizer();
}
}
/* Cancel unoptimizing for reusing */
-static void reuse_unused_kprobe(struct kprobe *ap)
+static int reuse_unused_kprobe(struct kprobe *ap)
{
struct optimized_kprobe *op;
- BUG_ON(!kprobe_unused(ap));
/*
* Unused kprobe MUST be on the way of delayed unoptimizing (means
* there is still a relative jump) and disabled.
*/
op = container_of(ap, struct optimized_kprobe, kp);
- if (unlikely(list_empty(&op->list)))
- printk(KERN_WARNING "Warning: found a stray unused "
- "aggrprobe@%p\n", ap->addr);
+ WARN_ON_ONCE(list_empty(&op->list));
/* Enable the probe again */
ap->flags &= ~KPROBE_FLAG_DISABLED;
- /* Optimize it again (remove from op->list) */
- BUG_ON(!kprobe_optready(ap));
+ /* Optimize it again. (remove from 'op->list') */
+ if (!kprobe_optready(ap))
+ return -EINVAL;
+
optimize_kprobe(ap);
+ return 0;
}
/* Remove optimized instructions */
@@ -697,30 +799,36 @@ static void kill_optimized_kprobe(struct kprobe *p)
op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
if (kprobe_unused(p)) {
- /* Enqueue if it is unused */
- list_add(&op->list, &freeing_list);
/*
- * Remove unused probes from the hash list. After waiting
- * for synchronization, this probe is reclaimed.
- * (reclaiming is done by do_free_cleaned_kprobes().)
+ * Unused kprobe is on unoptimizing or freeing list. We move it
+ * to freeing_list and let the kprobe_optimizer() remove it from
+ * the kprobe hash list and free it.
*/
- hlist_del_rcu(&op->kp.hlist);
+ if (optprobe_queued_unopt(op))
+ list_move(&op->list, &freeing_list);
}
/* Don't touch the code, because it is already freed. */
arch_remove_optimized_kprobe(op);
}
+static inline
+void __prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p)
+{
+ if (!kprobe_ftrace(p))
+ arch_prepare_optimized_kprobe(op, p);
+}
+
/* Try to prepare optimized instructions */
static void prepare_optimized_kprobe(struct kprobe *p)
{
struct optimized_kprobe *op;
op = container_of(p, struct optimized_kprobe, kp);
- arch_prepare_optimized_kprobe(op, p);
+ __prepare_optimized_kprobe(op, p);
}
-/* Allocate new optimized_kprobe and try to prepare optimized instructions */
+/* Allocate new optimized_kprobe and try to prepare optimized instructions. */
static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
{
struct optimized_kprobe *op;
@@ -731,7 +839,7 @@ static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
INIT_LIST_HEAD(&op->list);
op->kp.addr = p->addr;
- arch_prepare_optimized_kprobe(op, p);
+ __prepare_optimized_kprobe(op, p);
return &op->kp;
}
@@ -739,103 +847,98 @@ static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p);
/*
- * Prepare an optimized_kprobe and optimize it
- * NOTE: p must be a normal registered kprobe
+ * Prepare an optimized_kprobe and optimize it.
+ * NOTE: 'p' must be a normal registered kprobe.
*/
static void try_to_optimize_kprobe(struct kprobe *p)
{
struct kprobe *ap;
struct optimized_kprobe *op;
- /* Impossible to optimize ftrace-based kprobe */
+ /* Impossible to optimize ftrace-based kprobe. */
if (kprobe_ftrace(p))
return;
- /* For preparing optimization, jump_label_text_reserved() is called */
- jump_label_lock();
- mutex_lock(&text_mutex);
+ /* For preparing optimization, jump_label_text_reserved() is called. */
+ guard(cpus_read_lock)();
+ guard(jump_label_lock)();
+ guard(mutex)(&text_mutex);
ap = alloc_aggr_kprobe(p);
if (!ap)
- goto out;
+ return;
op = container_of(ap, struct optimized_kprobe, kp);
if (!arch_prepared_optinsn(&op->optinsn)) {
- /* If failed to setup optimizing, fallback to kprobe */
+ /* If failed to setup optimizing, fallback to kprobe. */
arch_remove_optimized_kprobe(op);
kfree(op);
- goto out;
+ return;
}
init_aggr_kprobe(ap, p);
- optimize_kprobe(ap); /* This just kicks optimizer thread */
-
-out:
- mutex_unlock(&text_mutex);
- jump_label_unlock();
+ optimize_kprobe(ap); /* This just kicks optimizer thread. */
}
-#ifdef CONFIG_SYSCTL
static void optimize_all_kprobes(void)
{
struct hlist_head *head;
struct kprobe *p;
unsigned int i;
- mutex_lock(&kprobe_mutex);
- /* If optimization is already allowed, just return */
+ guard(mutex)(&kprobe_mutex);
+ /* If optimization is already allowed, just return. */
if (kprobes_allow_optimization)
- goto out;
+ return;
+ cpus_read_lock();
kprobes_allow_optimization = true;
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
- hlist_for_each_entry_rcu(p, head, hlist)
+ hlist_for_each_entry(p, head, hlist)
if (!kprobe_disabled(p))
optimize_kprobe(p);
}
- printk(KERN_INFO "Kprobes globally optimized\n");
-out:
- mutex_unlock(&kprobe_mutex);
+ cpus_read_unlock();
+ pr_info("kprobe jump-optimization is enabled. All kprobes are optimized if possible.\n");
}
+#ifdef CONFIG_SYSCTL
static void unoptimize_all_kprobes(void)
{
struct hlist_head *head;
struct kprobe *p;
unsigned int i;
- mutex_lock(&kprobe_mutex);
- /* If optimization is already prohibited, just return */
- if (!kprobes_allow_optimization) {
- mutex_unlock(&kprobe_mutex);
+ guard(mutex)(&kprobe_mutex);
+ /* If optimization is already prohibited, just return. */
+ if (!kprobes_allow_optimization)
return;
- }
+ cpus_read_lock();
kprobes_allow_optimization = false;
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
- hlist_for_each_entry_rcu(p, head, hlist) {
+ hlist_for_each_entry(p, head, hlist) {
if (!kprobe_disabled(p))
unoptimize_kprobe(p, false);
}
}
- mutex_unlock(&kprobe_mutex);
-
- /* Wait for unoptimizing completion */
- wait_for_kprobe_optimizer();
- printk(KERN_INFO "Kprobes globally unoptimized\n");
+ cpus_read_unlock();
+ /* Wait for unoptimizing completion. */
+ wait_for_kprobe_optimizer_locked();
+ pr_info("kprobe jump-optimization is disabled. All kprobes are based on software breakpoint.\n");
}
static DEFINE_MUTEX(kprobe_sysctl_mutex);
-int sysctl_kprobes_optimization;
-int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length,
- loff_t *ppos)
+static int sysctl_kprobes_optimization;
+static int proc_kprobes_optimization_handler(const struct ctl_table *table,
+ int write, void *buffer,
+ size_t *length, loff_t *ppos)
{
int ret;
- mutex_lock(&kprobe_sysctl_mutex);
+ guard(mutex)(&kprobe_sysctl_mutex);
sysctl_kprobes_optimization = kprobes_allow_optimization ? 1 : 0;
ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
@@ -843,19 +946,37 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
optimize_all_kprobes();
else
unoptimize_all_kprobes();
- mutex_unlock(&kprobe_sysctl_mutex);
return ret;
}
+
+static const struct ctl_table kprobe_sysctls[] = {
+ {
+ .procname = "kprobes-optimization",
+ .data = &sysctl_kprobes_optimization,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_kprobes_optimization_handler,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+};
+
+static void __init kprobe_sysctls_init(void)
+{
+ register_sysctl_init("debug", kprobe_sysctls);
+}
#endif /* CONFIG_SYSCTL */
-/* Put a breakpoint for a probe. Must be called with text_mutex locked */
+/* Put a breakpoint for a probe. */
static void __arm_kprobe(struct kprobe *p)
{
struct kprobe *_p;
- /* Check collision with other optimized kprobes */
- _p = get_optimized_kprobe((unsigned long)p->addr);
+ lockdep_assert_held(&text_mutex);
+
+ /* Find the overlapping optimized kprobes. */
+ _p = get_optimized_kprobe(p->addr);
if (unlikely(_p))
/* Fallback to unoptimized kprobe */
unoptimize_kprobe(_p, true);
@@ -864,22 +985,29 @@ static void __arm_kprobe(struct kprobe *p)
optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */
}
-/* Remove the breakpoint of a probe. Must be called with text_mutex locked */
+/* Remove the breakpoint of a probe. */
static void __disarm_kprobe(struct kprobe *p, bool reopt)
{
struct kprobe *_p;
+ lockdep_assert_held(&text_mutex);
+
/* Try to unoptimize */
unoptimize_kprobe(p, kprobes_all_disarmed);
if (!kprobe_queued(p)) {
arch_disarm_kprobe(p);
- /* If another kprobe was blocked, optimize it. */
- _p = get_optimized_kprobe((unsigned long)p->addr);
+ /* If another kprobe was blocked, re-optimize it. */
+ _p = get_optimized_kprobe(p->addr);
if (unlikely(_p) && reopt)
optimize_kprobe(_p);
}
- /* TODO: reoptimize others after unoptimized this probe */
+ /*
+ * TODO: Since unoptimization and real disarming will be done by
+ * the worker thread, we can not check whether another probe are
+ * unoptimized because of this probe here. It should be re-optimized
+ * by the worker thread.
+ */
}
#else /* !CONFIG_OPTPROBES */
@@ -892,13 +1020,19 @@ static void __disarm_kprobe(struct kprobe *p, bool reopt)
#define __arm_kprobe(p) arch_arm_kprobe(p)
#define __disarm_kprobe(p, o) arch_disarm_kprobe(p)
#define kprobe_disarmed(p) kprobe_disabled(p)
-#define wait_for_kprobe_optimizer() do {} while (0)
+#define wait_for_kprobe_optimizer_locked() \
+ lockdep_assert_held(&kprobe_mutex)
-/* There should be no unused kprobes can be reused without optimization */
-static void reuse_unused_kprobe(struct kprobe *ap)
+static int reuse_unused_kprobe(struct kprobe *ap)
{
- printk(KERN_ERR "Error: There should be no unused kprobe here.\n");
- BUG_ON(kprobe_unused(ap));
+ /*
+ * If the optimized kprobe is NOT supported, the aggr kprobe is
+ * released at the same time that the last aggregated kprobe is
+ * unregistered.
+ * Thus there should be no chance to reuse unused kprobe.
+ */
+ WARN_ON_ONCE(1);
+ return -EINVAL;
}
static void free_aggr_kprobe(struct kprobe *p)
@@ -916,82 +1050,129 @@ static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
#ifdef CONFIG_KPROBES_ON_FTRACE
static struct ftrace_ops kprobe_ftrace_ops __read_mostly = {
.func = kprobe_ftrace_handler,
+ .flags = FTRACE_OPS_FL_SAVE_REGS,
+};
+
+static struct ftrace_ops kprobe_ipmodify_ops __read_mostly = {
+ .func = kprobe_ftrace_handler,
.flags = FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_IPMODIFY,
};
+
+static int kprobe_ipmodify_enabled;
static int kprobe_ftrace_enabled;
+bool kprobe_ftrace_disabled;
-/* Must ensure p->addr is really on ftrace */
-static int prepare_kprobe(struct kprobe *p)
+static int __arm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops,
+ int *cnt)
{
- if (!kprobe_ftrace(p))
- return arch_prepare_kprobe(p);
+ int ret;
+
+ lockdep_assert_held(&kprobe_mutex);
+
+ ret = ftrace_set_filter_ip(ops, (unsigned long)p->addr, 0, 0);
+ if (WARN_ONCE(ret < 0, "Failed to arm kprobe-ftrace at %pS (error %d)\n", p->addr, ret))
+ return ret;
+
+ if (*cnt == 0) {
+ ret = register_ftrace_function(ops);
+ if (WARN(ret < 0, "Failed to register kprobe-ftrace (error %d)\n", ret)) {
+ /*
+ * At this point, sinec ops is not registered, we should be sefe from
+ * registering empty filter.
+ */
+ ftrace_set_filter_ip(ops, (unsigned long)p->addr, 1, 0);
+ return ret;
+ }
+ }
- return arch_prepare_kprobe_ftrace(p);
+ (*cnt)++;
+ return ret;
}
-/* Caller must lock kprobe_mutex */
-static void arm_kprobe_ftrace(struct kprobe *p)
+static int arm_kprobe_ftrace(struct kprobe *p)
{
- int ret;
+ bool ipmodify = (p->post_handler != NULL);
- ret = ftrace_set_filter_ip(&kprobe_ftrace_ops,
- (unsigned long)p->addr, 0, 0);
- WARN(ret < 0, "Failed to arm kprobe-ftrace at %p (%d)\n", p->addr, ret);
- kprobe_ftrace_enabled++;
- if (kprobe_ftrace_enabled == 1) {
- ret = register_ftrace_function(&kprobe_ftrace_ops);
- WARN(ret < 0, "Failed to init kprobe-ftrace (%d)\n", ret);
- }
+ return __arm_kprobe_ftrace(p,
+ ipmodify ? &kprobe_ipmodify_ops : &kprobe_ftrace_ops,
+ ipmodify ? &kprobe_ipmodify_enabled : &kprobe_ftrace_enabled);
}
-/* Caller must lock kprobe_mutex */
-static void disarm_kprobe_ftrace(struct kprobe *p)
+static int __disarm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops,
+ int *cnt)
{
int ret;
- kprobe_ftrace_enabled--;
- if (kprobe_ftrace_enabled == 0) {
- ret = unregister_ftrace_function(&kprobe_ftrace_ops);
- WARN(ret < 0, "Failed to init kprobe-ftrace (%d)\n", ret);
+ lockdep_assert_held(&kprobe_mutex);
+
+ if (*cnt == 1) {
+ ret = unregister_ftrace_function(ops);
+ if (WARN(ret < 0, "Failed to unregister kprobe-ftrace (error %d)\n", ret))
+ return ret;
}
- ret = ftrace_set_filter_ip(&kprobe_ftrace_ops,
- (unsigned long)p->addr, 1, 0);
- WARN(ret < 0, "Failed to disarm kprobe-ftrace at %p (%d)\n", p->addr, ret);
+
+ (*cnt)--;
+
+ ret = ftrace_set_filter_ip(ops, (unsigned long)p->addr, 1, 0);
+ WARN_ONCE(ret < 0, "Failed to disarm kprobe-ftrace at %pS (error %d)\n",
+ p->addr, ret);
+ return ret;
+}
+
+static int disarm_kprobe_ftrace(struct kprobe *p)
+{
+ bool ipmodify = (p->post_handler != NULL);
+
+ return __disarm_kprobe_ftrace(p,
+ ipmodify ? &kprobe_ipmodify_ops : &kprobe_ftrace_ops,
+ ipmodify ? &kprobe_ipmodify_enabled : &kprobe_ftrace_enabled);
+}
+
+void kprobe_ftrace_kill(void)
+{
+ kprobe_ftrace_disabled = true;
}
#else /* !CONFIG_KPROBES_ON_FTRACE */
-#define prepare_kprobe(p) arch_prepare_kprobe(p)
-#define arm_kprobe_ftrace(p) do {} while (0)
-#define disarm_kprobe_ftrace(p) do {} while (0)
+static inline int arm_kprobe_ftrace(struct kprobe *p)
+{
+ return -ENODEV;
+}
+
+static inline int disarm_kprobe_ftrace(struct kprobe *p)
+{
+ return -ENODEV;
+}
#endif
-/* Arm a kprobe with text_mutex */
-static void arm_kprobe(struct kprobe *kp)
+static int prepare_kprobe(struct kprobe *p)
{
- if (unlikely(kprobe_ftrace(kp))) {
- arm_kprobe_ftrace(kp);
- return;
- }
- /*
- * Here, since __arm_kprobe() doesn't use stop_machine(),
- * this doesn't cause deadlock on text_mutex. So, we don't
- * need get_online_cpus().
- */
- mutex_lock(&text_mutex);
+ /* Must ensure p->addr is really on ftrace */
+ if (kprobe_ftrace(p))
+ return arch_prepare_kprobe_ftrace(p);
+
+ return arch_prepare_kprobe(p);
+}
+
+static int arm_kprobe(struct kprobe *kp)
+{
+ if (unlikely(kprobe_ftrace(kp)))
+ return arm_kprobe_ftrace(kp);
+
+ guard(cpus_read_lock)();
+ guard(mutex)(&text_mutex);
__arm_kprobe(kp);
- mutex_unlock(&text_mutex);
+ return 0;
}
-/* Disarm a kprobe with text_mutex */
-static void disarm_kprobe(struct kprobe *kp, bool reopt)
+static int disarm_kprobe(struct kprobe *kp, bool reopt)
{
- if (unlikely(kprobe_ftrace(kp))) {
- disarm_kprobe_ftrace(kp);
- return;
- }
- /* Ditto */
- mutex_lock(&text_mutex);
+ if (unlikely(kprobe_ftrace(kp)))
+ return disarm_kprobe_ftrace(kp);
+
+ guard(cpus_read_lock)();
+ guard(mutex)(&text_mutex);
__disarm_kprobe(kp, reopt);
- mutex_unlock(&text_mutex);
+ return 0;
}
/*
@@ -1029,195 +1210,47 @@ static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
}
NOKPROBE_SYMBOL(aggr_post_handler);
-static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
- int trapnr)
-{
- struct kprobe *cur = __this_cpu_read(kprobe_instance);
-
- /*
- * if we faulted "during" the execution of a user specified
- * probe handler, invoke just that probe's fault handler
- */
- if (cur && cur->fault_handler) {
- if (cur->fault_handler(cur, regs, trapnr))
- return 1;
- }
- return 0;
-}
-NOKPROBE_SYMBOL(aggr_fault_handler);
-
-static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
-{
- struct kprobe *cur = __this_cpu_read(kprobe_instance);
- int ret = 0;
-
- if (cur && cur->break_handler) {
- if (cur->break_handler(cur, regs))
- ret = 1;
- }
- reset_kprobe_instance();
- return ret;
-}
-NOKPROBE_SYMBOL(aggr_break_handler);
-
-/* Walks the list and increments nmissed count for multiprobe case */
+/* Walks the list and increments 'nmissed' if 'p' has child probes. */
void kprobes_inc_nmissed_count(struct kprobe *p)
{
struct kprobe *kp;
+
if (!kprobe_aggrprobe(p)) {
p->nmissed++;
} else {
list_for_each_entry_rcu(kp, &p->list, list)
kp->nmissed++;
}
- return;
}
NOKPROBE_SYMBOL(kprobes_inc_nmissed_count);
-void recycle_rp_inst(struct kretprobe_instance *ri,
- struct hlist_head *head)
-{
- struct kretprobe *rp = ri->rp;
-
- /* remove rp inst off the rprobe_inst_table */
- hlist_del(&ri->hlist);
- INIT_HLIST_NODE(&ri->hlist);
- if (likely(rp)) {
- raw_spin_lock(&rp->lock);
- hlist_add_head(&ri->hlist, &rp->free_instances);
- raw_spin_unlock(&rp->lock);
- } else
- /* Unregistering */
- hlist_add_head(&ri->hlist, head);
-}
-NOKPROBE_SYMBOL(recycle_rp_inst);
-
-void kretprobe_hash_lock(struct task_struct *tsk,
- struct hlist_head **head, unsigned long *flags)
-__acquires(hlist_lock)
-{
- unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
- raw_spinlock_t *hlist_lock;
-
- *head = &kretprobe_inst_table[hash];
- hlist_lock = kretprobe_table_lock_ptr(hash);
- raw_spin_lock_irqsave(hlist_lock, *flags);
-}
-NOKPROBE_SYMBOL(kretprobe_hash_lock);
-
-static void kretprobe_table_lock(unsigned long hash,
- unsigned long *flags)
-__acquires(hlist_lock)
-{
- raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
- raw_spin_lock_irqsave(hlist_lock, *flags);
-}
-NOKPROBE_SYMBOL(kretprobe_table_lock);
-
-void kretprobe_hash_unlock(struct task_struct *tsk,
- unsigned long *flags)
-__releases(hlist_lock)
-{
- unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
- raw_spinlock_t *hlist_lock;
-
- hlist_lock = kretprobe_table_lock_ptr(hash);
- raw_spin_unlock_irqrestore(hlist_lock, *flags);
-}
-NOKPROBE_SYMBOL(kretprobe_hash_unlock);
+static struct kprobe kprobe_busy = {
+ .addr = (void *) get_kprobe,
+};
-static void kretprobe_table_unlock(unsigned long hash,
- unsigned long *flags)
-__releases(hlist_lock)
+void kprobe_busy_begin(void)
{
- raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
- raw_spin_unlock_irqrestore(hlist_lock, *flags);
-}
-NOKPROBE_SYMBOL(kretprobe_table_unlock);
+ struct kprobe_ctlblk *kcb;
-/*
- * This function is called from finish_task_switch when task tk becomes dead,
- * so that we can recycle any function-return probe instances associated
- * with this task. These left over instances represent probed functions
- * that have been called but will never return.
- */
-void kprobe_flush_task(struct task_struct *tk)
-{
- struct kretprobe_instance *ri;
- struct hlist_head *head, empty_rp;
- struct hlist_node *tmp;
- unsigned long hash, flags = 0;
-
- if (unlikely(!kprobes_initialized))
- /* Early boot. kretprobe_table_locks not yet initialized. */
- return;
-
- INIT_HLIST_HEAD(&empty_rp);
- hash = hash_ptr(tk, KPROBE_HASH_BITS);
- head = &kretprobe_inst_table[hash];
- kretprobe_table_lock(hash, &flags);
- hlist_for_each_entry_safe(ri, tmp, head, hlist) {
- if (ri->task == tk)
- recycle_rp_inst(ri, &empty_rp);
- }
- kretprobe_table_unlock(hash, &flags);
- hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) {
- hlist_del(&ri->hlist);
- kfree(ri);
- }
-}
-NOKPROBE_SYMBOL(kprobe_flush_task);
-
-static inline void free_rp_inst(struct kretprobe *rp)
-{
- struct kretprobe_instance *ri;
- struct hlist_node *next;
-
- hlist_for_each_entry_safe(ri, next, &rp->free_instances, hlist) {
- hlist_del(&ri->hlist);
- kfree(ri);
- }
+ preempt_disable();
+ __this_cpu_write(current_kprobe, &kprobe_busy);
+ kcb = get_kprobe_ctlblk();
+ kcb->kprobe_status = KPROBE_HIT_ACTIVE;
}
-static void cleanup_rp_inst(struct kretprobe *rp)
+void kprobe_busy_end(void)
{
- unsigned long flags, hash;
- struct kretprobe_instance *ri;
- struct hlist_node *next;
- struct hlist_head *head;
-
- /* No race here */
- for (hash = 0; hash < KPROBE_TABLE_SIZE; hash++) {
- kretprobe_table_lock(hash, &flags);
- head = &kretprobe_inst_table[hash];
- hlist_for_each_entry_safe(ri, next, head, hlist) {
- if (ri->rp == rp)
- ri->rp = NULL;
- }
- kretprobe_table_unlock(hash, &flags);
- }
- free_rp_inst(rp);
+ __this_cpu_write(current_kprobe, NULL);
+ preempt_enable();
}
-NOKPROBE_SYMBOL(cleanup_rp_inst);
-/*
-* Add the new probe to ap->list. Fail if this is the
-* second jprobe at the address - two jprobes can't coexist
-*/
+/* Add the new probe to 'ap->list'. */
static int add_new_kprobe(struct kprobe *ap, struct kprobe *p)
{
- BUG_ON(kprobe_gone(ap) || kprobe_gone(p));
-
- if (p->break_handler || p->post_handler)
+ if (p->post_handler)
unoptimize_kprobe(ap, true); /* Fall back to normal kprobe */
- if (p->break_handler) {
- if (ap->break_handler)
- return -EEXIST;
- list_add_tail_rcu(&p->list, &ap->list);
- ap->break_handler = aggr_break_handler;
- } else
- list_add_rcu(&p->list, &ap->list);
+ list_add_rcu(&p->list, &ap->list);
if (p->post_handler && !ap->post_handler)
ap->post_handler = aggr_post_handler;
@@ -1225,23 +1258,20 @@ static int add_new_kprobe(struct kprobe *ap, struct kprobe *p)
}
/*
- * Fill in the required fields of the "manager kprobe". Replace the
- * earlier kprobe in the hlist with the manager kprobe
+ * Fill in the required fields of the aggregator kprobe. Replace the
+ * earlier kprobe in the hlist with the aggregator kprobe.
*/
static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
{
- /* Copy p's insn slot to ap */
+ /* Copy the insn slot of 'p' to 'ap'. */
copy_kprobe(p, ap);
flush_insn_slot(ap);
ap->addr = p->addr;
ap->flags = p->flags & ~KPROBE_FLAG_OPTIMIZED;
ap->pre_handler = aggr_pre_handler;
- ap->fault_handler = aggr_fault_handler;
/* We don't care the kprobe which has gone. */
if (p->post_handler && !kprobe_gone(p))
ap->post_handler = aggr_post_handler;
- if (p->break_handler && !kprobe_gone(p))
- ap->break_handler = aggr_break_handler;
INIT_LIST_HEAD(&ap->list);
INIT_HLIST_NODE(&ap->hlist);
@@ -1251,252 +1281,373 @@ static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
}
/*
- * This is the second or subsequent kprobe at the address - handle
- * the intricacies
+ * This registers the second or subsequent kprobe at the same address.
*/
static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p)
{
int ret = 0;
struct kprobe *ap = orig_p;
- /* For preparing optimization, jump_label_text_reserved() is called */
- jump_label_lock();
- /*
- * Get online CPUs to avoid text_mutex deadlock.with stop machine,
- * which is invoked by unoptimize_kprobe() in add_new_kprobe()
- */
- get_online_cpus();
- mutex_lock(&text_mutex);
-
- if (!kprobe_aggrprobe(orig_p)) {
- /* If orig_p is not an aggr_kprobe, create new aggr_kprobe. */
- ap = alloc_aggr_kprobe(orig_p);
- if (!ap) {
- ret = -ENOMEM;
- goto out;
+ scoped_guard(cpus_read_lock) {
+ /* For preparing optimization, jump_label_text_reserved() is called */
+ guard(jump_label_lock)();
+ guard(mutex)(&text_mutex);
+
+ if (!kprobe_aggrprobe(orig_p)) {
+ /* If 'orig_p' is not an 'aggr_kprobe', create new one. */
+ ap = alloc_aggr_kprobe(orig_p);
+ if (!ap)
+ return -ENOMEM;
+ init_aggr_kprobe(ap, orig_p);
+ } else if (kprobe_unused(ap)) {
+ /* This probe is going to die. Rescue it */
+ ret = reuse_unused_kprobe(ap);
+ if (ret)
+ return ret;
}
- init_aggr_kprobe(ap, orig_p);
- } else if (kprobe_unused(ap))
- /* This probe is going to die. Rescue it */
- reuse_unused_kprobe(ap);
- if (kprobe_gone(ap)) {
- /*
- * Attempting to insert new probe at the same location that
- * had a probe in the module vaddr area which already
- * freed. So, the instruction slot has already been
- * released. We need a new slot for the new probe.
- */
- ret = arch_prepare_kprobe(ap);
- if (ret)
+ if (kprobe_gone(ap)) {
/*
- * Even if fail to allocate new slot, don't need to
- * free aggr_probe. It will be used next time, or
- * freed by unregister_kprobe.
+ * Attempting to insert new probe at the same location that
+ * had a probe in the module vaddr area which already
+ * freed. So, the instruction slot has already been
+ * released. We need a new slot for the new probe.
*/
- goto out;
-
- /* Prepare optimized instructions if possible. */
- prepare_optimized_kprobe(ap);
+ ret = arch_prepare_kprobe(ap);
+ if (ret)
+ /*
+ * Even if fail to allocate new slot, don't need to
+ * free the 'ap'. It will be used next time, or
+ * freed by unregister_kprobe().
+ */
+ return ret;
- /*
- * Clear gone flag to prevent allocating new slot again, and
- * set disabled flag because it is not armed yet.
- */
- ap->flags = (ap->flags & ~KPROBE_FLAG_GONE)
- | KPROBE_FLAG_DISABLED;
- }
+ /* Prepare optimized instructions if possible. */
+ prepare_optimized_kprobe(ap);
- /* Copy ap's insn slot to p */
- copy_kprobe(ap, p);
- ret = add_new_kprobe(ap, p);
+ /*
+ * Clear gone flag to prevent allocating new slot again, and
+ * set disabled flag because it is not armed yet.
+ */
+ ap->flags = (ap->flags & ~KPROBE_FLAG_GONE)
+ | KPROBE_FLAG_DISABLED;
+ }
-out:
- mutex_unlock(&text_mutex);
- put_online_cpus();
- jump_label_unlock();
+ /* Copy the insn slot of 'p' to 'ap'. */
+ copy_kprobe(ap, p);
+ ret = add_new_kprobe(ap, p);
+ }
if (ret == 0 && kprobe_disabled(ap) && !kprobe_disabled(p)) {
ap->flags &= ~KPROBE_FLAG_DISABLED;
- if (!kprobes_all_disarmed)
+ if (!kprobes_all_disarmed) {
/* Arm the breakpoint again. */
- arm_kprobe(ap);
+ ret = arm_kprobe(ap);
+ if (ret) {
+ ap->flags |= KPROBE_FLAG_DISABLED;
+ list_del_rcu(&p->list);
+ synchronize_rcu();
+ }
+ }
}
return ret;
}
bool __weak arch_within_kprobe_blacklist(unsigned long addr)
{
- /* The __kprobes marked functions and entry code must not be probed */
+ /* The '__kprobes' functions and entry code must not be probed. */
return addr >= (unsigned long)__kprobes_text_start &&
addr < (unsigned long)__kprobes_text_end;
}
-static bool within_kprobe_blacklist(unsigned long addr)
+static bool __within_kprobe_blacklist(unsigned long addr)
{
struct kprobe_blacklist_entry *ent;
if (arch_within_kprobe_blacklist(addr))
return true;
/*
- * If there exists a kprobe_blacklist, verify and
- * fail any probe registration in the prohibited area
+ * If 'kprobe_blacklist' is defined, check the address and
+ * reject any probe registration in the prohibited area.
*/
list_for_each_entry(ent, &kprobe_blacklist, list) {
if (addr >= ent->start_addr && addr < ent->end_addr)
return true;
}
+ return false;
+}
+
+bool within_kprobe_blacklist(unsigned long addr)
+{
+ char symname[KSYM_NAME_LEN], *p;
+ if (__within_kprobe_blacklist(addr))
+ return true;
+
+ /* Check if the address is on a suffixed-symbol */
+ if (!lookup_symbol_name(addr, symname)) {
+ p = strchr(symname, '.');
+ if (!p)
+ return false;
+ *p = '\0';
+ addr = (unsigned long)kprobe_lookup_name(symname, 0);
+ if (addr)
+ return __within_kprobe_blacklist(addr);
+ }
return false;
}
/*
- * If we have a symbol_name argument, look it up and add the offset field
+ * arch_adjust_kprobe_addr - adjust the address
+ * @addr: symbol base address
+ * @offset: offset within the symbol
+ * @on_func_entry: was this @addr+@offset on the function entry
+ *
+ * Typically returns @addr + @offset, except for special cases where the
+ * function might be prefixed by a CFI landing pad, in that case any offset
+ * inside the landing pad is mapped to the first 'real' instruction of the
+ * symbol.
+ *
+ * Specifically, for things like IBT/BTI, skip the resp. ENDBR/BTI.C
+ * instruction at +0.
+ */
+kprobe_opcode_t *__weak arch_adjust_kprobe_addr(unsigned long addr,
+ unsigned long offset,
+ bool *on_func_entry)
+{
+ *on_func_entry = !offset;
+ return (kprobe_opcode_t *)(addr + offset);
+}
+
+/*
+ * If 'symbol_name' is specified, look it up and add the 'offset'
* to it. This way, we can specify a relative address to a symbol.
* This returns encoded errors if it fails to look up symbol or invalid
* combination of parameters.
*/
-static kprobe_opcode_t *kprobe_addr(struct kprobe *p)
+static kprobe_opcode_t *
+_kprobe_addr(kprobe_opcode_t *addr, const char *symbol_name,
+ unsigned long offset, bool *on_func_entry)
{
- kprobe_opcode_t *addr = p->addr;
-
- if ((p->symbol_name && p->addr) ||
- (!p->symbol_name && !p->addr))
- goto invalid;
+ if ((symbol_name && addr) || (!symbol_name && !addr))
+ return ERR_PTR(-EINVAL);
- if (p->symbol_name) {
- kprobe_lookup_name(p->symbol_name, addr);
+ if (symbol_name) {
+ /*
+ * Input: @sym + @offset
+ * Output: @addr + @offset
+ *
+ * NOTE: kprobe_lookup_name() does *NOT* fold the offset
+ * argument into it's output!
+ */
+ addr = kprobe_lookup_name(symbol_name, offset);
if (!addr)
return ERR_PTR(-ENOENT);
}
- addr = (kprobe_opcode_t *)(((char *)addr) + p->offset);
- if (addr)
- return addr;
+ /*
+ * So here we have @addr + @offset, displace it into a new
+ * @addr' + @offset' where @addr' is the symbol start address.
+ */
+ addr = (void *)addr + offset;
+ if (!kallsyms_lookup_size_offset((unsigned long)addr, NULL, &offset))
+ return ERR_PTR(-ENOENT);
+ addr = (void *)addr - offset;
+
+ /*
+ * Then ask the architecture to re-combine them, taking care of
+ * magical function entry details while telling us if this was indeed
+ * at the start of the function.
+ */
+ addr = arch_adjust_kprobe_addr((unsigned long)addr, offset, on_func_entry);
+ if (!addr)
+ return ERR_PTR(-EINVAL);
-invalid:
- return ERR_PTR(-EINVAL);
+ return addr;
}
-/* Check passed kprobe is valid and return kprobe in kprobe_table. */
+static kprobe_opcode_t *kprobe_addr(struct kprobe *p)
+{
+ bool on_func_entry;
+
+ return _kprobe_addr(p->addr, p->symbol_name, p->offset, &on_func_entry);
+}
+
+/*
+ * Check the 'p' is valid and return the aggregator kprobe
+ * at the same address.
+ */
static struct kprobe *__get_valid_kprobe(struct kprobe *p)
{
struct kprobe *ap, *list_p;
+ lockdep_assert_held(&kprobe_mutex);
+
ap = get_kprobe(p->addr);
if (unlikely(!ap))
return NULL;
- if (p != ap) {
- list_for_each_entry_rcu(list_p, &ap->list, list)
- if (list_p == p)
- /* kprobe p is a valid probe */
- goto valid;
- return NULL;
- }
-valid:
- return ap;
+ if (p == ap)
+ return ap;
+
+ list_for_each_entry(list_p, &ap->list, list)
+ if (list_p == p)
+ /* kprobe p is a valid probe */
+ return ap;
+
+ return NULL;
}
-/* Return error if the kprobe is being re-registered */
-static inline int check_kprobe_rereg(struct kprobe *p)
+/*
+ * Warn and return error if the kprobe is being re-registered since
+ * there must be a software bug.
+ */
+static inline int warn_kprobe_rereg(struct kprobe *p)
{
- int ret = 0;
+ guard(mutex)(&kprobe_mutex);
- mutex_lock(&kprobe_mutex);
- if (__get_valid_kprobe(p))
- ret = -EINVAL;
- mutex_unlock(&kprobe_mutex);
+ if (WARN_ON_ONCE(__get_valid_kprobe(p)))
+ return -EINVAL;
- return ret;
+ return 0;
}
-int __weak arch_check_ftrace_location(struct kprobe *p)
+static int check_ftrace_location(struct kprobe *p)
{
- unsigned long ftrace_addr;
+ unsigned long addr = (unsigned long)p->addr;
- ftrace_addr = ftrace_location((unsigned long)p->addr);
- if (ftrace_addr) {
+ if (ftrace_location(addr) == addr) {
#ifdef CONFIG_KPROBES_ON_FTRACE
- /* Given address is not on the instruction boundary */
- if ((unsigned long)p->addr != ftrace_addr)
- return -EILSEQ;
p->flags |= KPROBE_FLAG_FTRACE;
-#else /* !CONFIG_KPROBES_ON_FTRACE */
+#else
return -EINVAL;
#endif
}
return 0;
}
+static bool is_cfi_preamble_symbol(unsigned long addr)
+{
+ char symbuf[KSYM_NAME_LEN];
+
+ if (lookup_symbol_name(addr, symbuf))
+ return false;
+
+ return str_has_prefix(symbuf, "__cfi_") ||
+ str_has_prefix(symbuf, "__pfx_");
+}
+
static int check_kprobe_address_safe(struct kprobe *p,
struct module **probed_mod)
{
int ret;
- ret = arch_check_ftrace_location(p);
+ ret = check_ftrace_location(p);
if (ret)
return ret;
- jump_label_lock();
- preempt_disable();
- /* Ensure it is not in reserved area nor out of text */
- if (!kernel_text_address((unsigned long) p->addr) ||
- within_kprobe_blacklist((unsigned long) p->addr) ||
- jump_label_text_reserved(p->addr, p->addr)) {
- ret = -EINVAL;
- goto out;
- }
+ guard(jump_label_lock)();
+
+ /* Ensure the address is in a text area, and find a module if exists. */
+ *probed_mod = NULL;
+ if (!core_kernel_text((unsigned long) p->addr)) {
+ guard(rcu)();
+ *probed_mod = __module_text_address((unsigned long) p->addr);
+ if (!(*probed_mod))
+ return -EINVAL;
- /* Check if are we probing a module */
- *probed_mod = __module_text_address((unsigned long) p->addr);
- if (*probed_mod) {
/*
* We must hold a refcount of the probed module while updating
* its code to prohibit unexpected unloading.
*/
- if (unlikely(!try_module_get(*probed_mod))) {
- ret = -ENOENT;
- goto out;
- }
+ if (unlikely(!try_module_get(*probed_mod)))
+ return -ENOENT;
+ }
+ /* Ensure it is not in reserved area. */
+ if (in_gate_area_no_mm((unsigned long) p->addr) ||
+ within_kprobe_blacklist((unsigned long) p->addr) ||
+ jump_label_text_reserved(p->addr, p->addr) ||
+ static_call_text_reserved(p->addr, p->addr) ||
+ find_bug((unsigned long)p->addr) ||
+ is_cfi_preamble_symbol((unsigned long)p->addr)) {
+ module_put(*probed_mod);
+ return -EINVAL;
+ }
+ /* Get module refcount and reject __init functions for loaded modules. */
+ if (IS_ENABLED(CONFIG_MODULES) && *probed_mod) {
/*
- * If the module freed .init.text, we couldn't insert
+ * If the module freed '.init.text', we couldn't insert
* kprobes in there.
*/
if (within_module_init((unsigned long)p->addr, *probed_mod) &&
- (*probed_mod)->state != MODULE_STATE_COMING) {
+ !module_is_coming(*probed_mod)) {
module_put(*probed_mod);
- *probed_mod = NULL;
- ret = -ENOENT;
+ return -ENOENT;
}
}
-out:
- preempt_enable();
- jump_label_unlock();
- return ret;
+ return 0;
}
-int register_kprobe(struct kprobe *p)
+static int __register_kprobe(struct kprobe *p)
{
int ret;
struct kprobe *old_p;
+
+ guard(mutex)(&kprobe_mutex);
+
+ old_p = get_kprobe(p->addr);
+ if (old_p)
+ /* Since this may unoptimize 'old_p', locking 'text_mutex'. */
+ return register_aggr_kprobe(old_p, p);
+
+ scoped_guard(cpus_read_lock) {
+ /* Prevent text modification */
+ guard(mutex)(&text_mutex);
+ ret = prepare_kprobe(p);
+ if (ret)
+ return ret;
+ }
+
+ INIT_HLIST_NODE(&p->hlist);
+ hlist_add_head_rcu(&p->hlist,
+ &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
+
+ if (!kprobes_all_disarmed && !kprobe_disabled(p)) {
+ ret = arm_kprobe(p);
+ if (ret) {
+ hlist_del_rcu(&p->hlist);
+ synchronize_rcu();
+ }
+ }
+
+ /* Try to optimize kprobe */
+ try_to_optimize_kprobe(p);
+ return 0;
+}
+
+int register_kprobe(struct kprobe *p)
+{
+ int ret;
struct module *probed_mod;
kprobe_opcode_t *addr;
+ bool on_func_entry;
- /* Adjust probe address from symbol */
- addr = kprobe_addr(p);
+ /* Canonicalize probe address from symbol */
+ addr = _kprobe_addr(p->addr, p->symbol_name, p->offset, &on_func_entry);
if (IS_ERR(addr))
return PTR_ERR(addr);
p->addr = addr;
- ret = check_kprobe_rereg(p);
+ ret = warn_kprobe_rereg(p);
if (ret)
return ret;
/* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */
p->flags &= KPROBE_FLAG_DISABLED;
+ if (on_func_entry)
+ p->flags |= KPROBE_FLAG_ON_FUNC_ENTRY;
p->nmissed = 0;
INIT_LIST_HEAD(&p->list);
@@ -1504,33 +1655,7 @@ int register_kprobe(struct kprobe *p)
if (ret)
return ret;
- mutex_lock(&kprobe_mutex);
-
- old_p = get_kprobe(p->addr);
- if (old_p) {
- /* Since this may unoptimize old_p, locking text_mutex. */
- ret = register_aggr_kprobe(old_p, p);
- goto out;
- }
-
- mutex_lock(&text_mutex); /* Avoiding text modification */
- ret = prepare_kprobe(p);
- mutex_unlock(&text_mutex);
- if (ret)
- goto out;
-
- INIT_HLIST_NODE(&p->hlist);
- hlist_add_head_rcu(&p->hlist,
- &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
-
- if (!kprobes_all_disarmed && !kprobe_disabled(p))
- arm_kprobe(p);
-
- /* Try to optimize kprobe */
- try_to_optimize_kprobe(p);
-
-out:
- mutex_unlock(&kprobe_mutex);
+ ret = __register_kprobe(p);
if (probed_mod)
module_put(probed_mod);
@@ -1539,48 +1664,59 @@ out:
}
EXPORT_SYMBOL_GPL(register_kprobe);
-/* Check if all probes on the aggrprobe are disabled */
-static int aggr_kprobe_disabled(struct kprobe *ap)
+/* Check if all probes on the 'ap' are disabled. */
+static bool aggr_kprobe_disabled(struct kprobe *ap)
{
struct kprobe *kp;
- list_for_each_entry_rcu(kp, &ap->list, list)
+ lockdep_assert_held(&kprobe_mutex);
+
+ list_for_each_entry(kp, &ap->list, list)
if (!kprobe_disabled(kp))
/*
- * There is an active probe on the list.
- * We can't disable this ap.
+ * Since there is an active probe on the list,
+ * we can't disable this 'ap'.
*/
- return 0;
+ return false;
- return 1;
+ return true;
}
-/* Disable one kprobe: Make sure called under kprobe_mutex is locked */
static struct kprobe *__disable_kprobe(struct kprobe *p)
{
struct kprobe *orig_p;
+ int ret;
+
+ lockdep_assert_held(&kprobe_mutex);
/* Get an original kprobe for return */
orig_p = __get_valid_kprobe(p);
if (unlikely(orig_p == NULL))
- return NULL;
+ return ERR_PTR(-EINVAL);
- if (!kprobe_disabled(p)) {
- /* Disable probe if it is a child probe */
- if (p != orig_p)
- p->flags |= KPROBE_FLAG_DISABLED;
+ if (kprobe_disabled(p))
+ return orig_p;
- /* Try to disarm and disable this/parent probe */
- if (p == orig_p || aggr_kprobe_disabled(orig_p)) {
- /*
- * If kprobes_all_disarmed is set, orig_p
- * should have already been disarmed, so
- * skip unneed disarming process.
- */
- if (!kprobes_all_disarmed)
- disarm_kprobe(orig_p, true);
- orig_p->flags |= KPROBE_FLAG_DISABLED;
+ /* Disable probe if it is a child probe */
+ if (p != orig_p)
+ p->flags |= KPROBE_FLAG_DISABLED;
+
+ /* Try to disarm and disable this/parent probe */
+ if (p == orig_p || aggr_kprobe_disabled(orig_p)) {
+ /*
+ * Don't be lazy here. Even if 'kprobes_all_disarmed'
+ * is false, 'orig_p' might not have been armed yet.
+ * Note arm_all_kprobes() __tries__ to arm all kprobes
+ * on the best effort basis.
+ */
+ if (!kprobes_all_disarmed && !kprobe_disabled(orig_p)) {
+ ret = disarm_kprobe(orig_p, true);
+ if (ret) {
+ p->flags &= ~KPROBE_FLAG_DISABLED;
+ return ERR_PTR(ret);
+ }
}
+ orig_p->flags |= KPROBE_FLAG_DISABLED;
}
return orig_p;
@@ -1595,55 +1731,57 @@ static int __unregister_kprobe_top(struct kprobe *p)
/* Disable kprobe. This will disarm it if needed. */
ap = __disable_kprobe(p);
- if (ap == NULL)
- return -EINVAL;
-
- if (ap == p)
- /*
- * This probe is an independent(and non-optimized) kprobe
- * (not an aggrprobe). Remove from the hash list.
- */
- goto disarmed;
+ if (IS_ERR(ap))
+ return PTR_ERR(ap);
- /* Following process expects this probe is an aggrprobe */
- WARN_ON(!kprobe_aggrprobe(ap));
+ WARN_ON(ap != p && !kprobe_aggrprobe(ap));
- if (list_is_singular(&ap->list) && kprobe_disarmed(ap))
+ /*
+ * If the probe is an independent(and non-optimized) kprobe
+ * (not an aggrprobe), the last kprobe on the aggrprobe, or
+ * kprobe is already disarmed, just remove from the hash list.
+ */
+ if (ap == p ||
+ (list_is_singular(&ap->list) && kprobe_disarmed(ap))) {
/*
* !disarmed could be happen if the probe is under delayed
* unoptimizing.
*/
- goto disarmed;
- else {
- /* If disabling probe has special handlers, update aggrprobe */
- if (p->break_handler && !kprobe_gone(p))
- ap->break_handler = NULL;
- if (p->post_handler && !kprobe_gone(p)) {
- list_for_each_entry_rcu(list_p, &ap->list, list) {
- if ((list_p != p) && (list_p->post_handler))
- goto noclean;
- }
- ap->post_handler = NULL;
+ hlist_del_rcu(&ap->hlist);
+ return 0;
+ }
+
+ /* If disabling probe has special handlers, update aggrprobe */
+ if (p->post_handler && !kprobe_gone(p)) {
+ list_for_each_entry(list_p, &ap->list, list) {
+ if ((list_p != p) && (list_p->post_handler))
+ break;
}
-noclean:
- /*
- * Remove from the aggrprobe: this path will do nothing in
- * __unregister_kprobe_bottom().
- */
- list_del_rcu(&p->list);
- if (!kprobe_disabled(ap) && !kprobes_all_disarmed)
+ /* No other probe has post_handler */
+ if (list_entry_is_head(list_p, &ap->list, list)) {
/*
- * Try to optimize this probe again, because post
- * handler may have been changed.
+ * For the kprobe-on-ftrace case, we keep the
+ * post_handler setting to identify this aggrprobe
+ * armed with kprobe_ipmodify_ops.
*/
- optimize_kprobe(ap);
+ if (!kprobe_ftrace(ap))
+ ap->post_handler = NULL;
+ }
}
- return 0;
-disarmed:
- BUG_ON(!kprobe_disarmed(ap));
- hlist_del_rcu(&ap->hlist);
+ /*
+ * Remove from the aggrprobe: this path will do nothing in
+ * __unregister_kprobe_bottom().
+ */
+ list_del_rcu(&p->list);
+ if (!kprobe_disabled(ap) && !kprobes_all_disarmed)
+ /*
+ * Try to optimize this probe again, because post
+ * handler may have been changed.
+ */
+ optimize_kprobe(ap);
return 0;
+
}
static void __unregister_kprobe_bottom(struct kprobe *p)
@@ -1692,93 +1830,239 @@ void unregister_kprobes(struct kprobe **kps, int num)
if (num <= 0)
return;
- mutex_lock(&kprobe_mutex);
- for (i = 0; i < num; i++)
- if (__unregister_kprobe_top(kps[i]) < 0)
- kps[i]->addr = NULL;
- mutex_unlock(&kprobe_mutex);
-
- synchronize_sched();
+ scoped_guard(mutex, &kprobe_mutex) {
+ for (i = 0; i < num; i++)
+ if (__unregister_kprobe_top(kps[i]) < 0)
+ kps[i]->addr = NULL;
+ }
+ synchronize_rcu();
for (i = 0; i < num; i++)
if (kps[i]->addr)
__unregister_kprobe_bottom(kps[i]);
}
EXPORT_SYMBOL_GPL(unregister_kprobes);
+int __weak kprobe_exceptions_notify(struct notifier_block *self,
+ unsigned long val, void *data)
+{
+ return NOTIFY_DONE;
+}
+NOKPROBE_SYMBOL(kprobe_exceptions_notify);
+
static struct notifier_block kprobe_exceptions_nb = {
.notifier_call = kprobe_exceptions_notify,
.priority = 0x7fffffff /* we need to be notified first */
};
-unsigned long __weak arch_deref_entry_point(void *entry)
+#ifdef CONFIG_KRETPROBES
+
+#if !defined(CONFIG_KRETPROBE_ON_RETHOOK)
+
+/* callbacks for objpool of kretprobe instances */
+static int kretprobe_init_inst(void *nod, void *context)
{
- return (unsigned long)entry;
+ struct kretprobe_instance *ri = nod;
+
+ ri->rph = context;
+ return 0;
+}
+static int kretprobe_fini_pool(struct objpool_head *head, void *context)
+{
+ kfree(context);
+ return 0;
}
-int register_jprobes(struct jprobe **jps, int num)
+static void free_rp_inst_rcu(struct rcu_head *head)
{
- struct jprobe *jp;
- int ret = 0, i;
+ struct kretprobe_instance *ri = container_of(head, struct kretprobe_instance, rcu);
+ struct kretprobe_holder *rph = ri->rph;
- if (num <= 0)
- return -EINVAL;
- for (i = 0; i < num; i++) {
- unsigned long addr, offset;
- jp = jps[i];
- addr = arch_deref_entry_point(jp->entry);
-
- /* Verify probepoint is a function entry point */
- if (kallsyms_lookup_size_offset(addr, NULL, &offset) &&
- offset == 0) {
- jp->kp.pre_handler = setjmp_pre_handler;
- jp->kp.break_handler = longjmp_break_handler;
- ret = register_kprobe(&jp->kp);
- } else
- ret = -EINVAL;
+ objpool_drop(ri, &rph->pool);
+}
+NOKPROBE_SYMBOL(free_rp_inst_rcu);
- if (ret < 0) {
- if (i > 0)
- unregister_jprobes(jps, i);
- break;
+static void recycle_rp_inst(struct kretprobe_instance *ri)
+{
+ struct kretprobe *rp = get_kretprobe(ri);
+
+ if (likely(rp))
+ objpool_push(ri, &rp->rph->pool);
+ else
+ call_rcu(&ri->rcu, free_rp_inst_rcu);
+}
+NOKPROBE_SYMBOL(recycle_rp_inst);
+
+/*
+ * This function is called from delayed_put_task_struct() when a task is
+ * dead and cleaned up to recycle any kretprobe instances associated with
+ * this task. These left over instances represent probed functions that
+ * have been called but will never return.
+ */
+void kprobe_flush_task(struct task_struct *tk)
+{
+ struct kretprobe_instance *ri;
+ struct llist_node *node;
+
+ /* Early boot, not yet initialized. */
+ if (unlikely(!kprobes_initialized))
+ return;
+
+ kprobe_busy_begin();
+
+ node = __llist_del_all(&tk->kretprobe_instances);
+ while (node) {
+ ri = container_of(node, struct kretprobe_instance, llist);
+ node = node->next;
+
+ recycle_rp_inst(ri);
+ }
+
+ kprobe_busy_end();
+}
+NOKPROBE_SYMBOL(kprobe_flush_task);
+
+static inline void free_rp_inst(struct kretprobe *rp)
+{
+ struct kretprobe_holder *rph = rp->rph;
+
+ if (!rph)
+ return;
+ rp->rph = NULL;
+ objpool_fini(&rph->pool);
+}
+
+/* This assumes the 'tsk' is the current task or the is not running. */
+static kprobe_opcode_t *__kretprobe_find_ret_addr(struct task_struct *tsk,
+ struct llist_node **cur)
+{
+ struct kretprobe_instance *ri = NULL;
+ struct llist_node *node = *cur;
+
+ if (!node)
+ node = tsk->kretprobe_instances.first;
+ else
+ node = node->next;
+
+ while (node) {
+ ri = container_of(node, struct kretprobe_instance, llist);
+ if (ri->ret_addr != kretprobe_trampoline_addr()) {
+ *cur = node;
+ return ri->ret_addr;
}
+ node = node->next;
}
- return ret;
+ return NULL;
}
-EXPORT_SYMBOL_GPL(register_jprobes);
+NOKPROBE_SYMBOL(__kretprobe_find_ret_addr);
-int register_jprobe(struct jprobe *jp)
+/**
+ * kretprobe_find_ret_addr -- Find correct return address modified by kretprobe
+ * @tsk: Target task
+ * @fp: A frame pointer
+ * @cur: a storage of the loop cursor llist_node pointer for next call
+ *
+ * Find the correct return address modified by a kretprobe on @tsk in unsigned
+ * long type. If it finds the return address, this returns that address value,
+ * or this returns 0.
+ * The @tsk must be 'current' or a task which is not running. @fp is a hint
+ * to get the currect return address - which is compared with the
+ * kretprobe_instance::fp field. The @cur is a loop cursor for searching the
+ * kretprobe return addresses on the @tsk. The '*@cur' should be NULL at the
+ * first call, but '@cur' itself must NOT NULL.
+ */
+unsigned long kretprobe_find_ret_addr(struct task_struct *tsk, void *fp,
+ struct llist_node **cur)
{
- return register_jprobes(&jp, 1);
+ struct kretprobe_instance *ri;
+ kprobe_opcode_t *ret;
+
+ if (WARN_ON_ONCE(!cur))
+ return 0;
+
+ do {
+ ret = __kretprobe_find_ret_addr(tsk, cur);
+ if (!ret)
+ break;
+ ri = container_of(*cur, struct kretprobe_instance, llist);
+ } while (ri->fp != fp);
+
+ return (unsigned long)ret;
}
-EXPORT_SYMBOL_GPL(register_jprobe);
+NOKPROBE_SYMBOL(kretprobe_find_ret_addr);
-void unregister_jprobe(struct jprobe *jp)
+void __weak arch_kretprobe_fixup_return(struct pt_regs *regs,
+ kprobe_opcode_t *correct_ret_addr)
{
- unregister_jprobes(&jp, 1);
+ /*
+ * Do nothing by default. Please fill this to update the fake return
+ * address on the stack with the correct one on each arch if possible.
+ */
}
-EXPORT_SYMBOL_GPL(unregister_jprobe);
-void unregister_jprobes(struct jprobe **jps, int num)
+unsigned long __kretprobe_trampoline_handler(struct pt_regs *regs,
+ void *frame_pointer)
{
- int i;
+ struct kretprobe_instance *ri = NULL;
+ struct llist_node *first, *node = NULL;
+ kprobe_opcode_t *correct_ret_addr;
+ struct kretprobe *rp;
- if (num <= 0)
- return;
- mutex_lock(&kprobe_mutex);
- for (i = 0; i < num; i++)
- if (__unregister_kprobe_top(&jps[i]->kp) < 0)
- jps[i]->kp.addr = NULL;
- mutex_unlock(&kprobe_mutex);
+ /* Find correct address and all nodes for this frame. */
+ correct_ret_addr = __kretprobe_find_ret_addr(current, &node);
+ if (!correct_ret_addr) {
+ pr_err("kretprobe: Return address not found, not execute handler. Maybe there is a bug in the kernel.\n");
+ BUG_ON(1);
+ }
- synchronize_sched();
- for (i = 0; i < num; i++) {
- if (jps[i]->kp.addr)
- __unregister_kprobe_bottom(&jps[i]->kp);
+ /*
+ * Set the return address as the instruction pointer, because if the
+ * user handler calls stack_trace_save_regs() with this 'regs',
+ * the stack trace will start from the instruction pointer.
+ */
+ instruction_pointer_set(regs, (unsigned long)correct_ret_addr);
+
+ /* Run the user handler of the nodes. */
+ first = current->kretprobe_instances.first;
+ while (first) {
+ ri = container_of(first, struct kretprobe_instance, llist);
+
+ if (WARN_ON_ONCE(ri->fp != frame_pointer))
+ break;
+
+ rp = get_kretprobe(ri);
+ if (rp && rp->handler) {
+ struct kprobe *prev = kprobe_running();
+
+ __this_cpu_write(current_kprobe, &rp->kp);
+ ri->ret_addr = correct_ret_addr;
+ rp->handler(ri, regs);
+ __this_cpu_write(current_kprobe, prev);
+ }
+ if (first == node)
+ break;
+
+ first = first->next;
}
+
+ arch_kretprobe_fixup_return(regs, correct_ret_addr);
+
+ /* Unlink all nodes for this frame. */
+ first = current->kretprobe_instances.first;
+ current->kretprobe_instances.first = node->next;
+ node->next = NULL;
+
+ /* Recycle free instances. */
+ while (first) {
+ ri = container_of(first, struct kretprobe_instance, llist);
+ first = first->next;
+
+ recycle_rp_inst(ri);
+ }
+
+ return (unsigned long)correct_ret_addr;
}
-EXPORT_SYMBOL_GPL(unregister_jprobes);
+NOKPROBE_SYMBOL(__kretprobe_trampoline_handler)
-#ifdef CONFIG_KRETPROBES
/*
* This kprobe pre_handler is registered with every kretprobe. When probe
* hits it will set up the return probe.
@@ -1786,61 +2070,121 @@ EXPORT_SYMBOL_GPL(unregister_jprobes);
static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
{
struct kretprobe *rp = container_of(p, struct kretprobe, kp);
- unsigned long hash, flags = 0;
+ struct kretprobe_holder *rph = rp->rph;
struct kretprobe_instance *ri;
- /*
- * To avoid deadlocks, prohibit return probing in NMI contexts,
- * just skip the probe and increase the (inexact) 'nmissed'
- * statistical counter, so that the user is informed that
- * something happened:
- */
- if (unlikely(in_nmi())) {
+ ri = objpool_pop(&rph->pool);
+ if (!ri) {
rp->nmissed++;
return 0;
}
- /* TODO: consider to only swap the RA after the last pre_handler fired */
- hash = hash_ptr(current, KPROBE_HASH_BITS);
- raw_spin_lock_irqsave(&rp->lock, flags);
- if (!hlist_empty(&rp->free_instances)) {
- ri = hlist_entry(rp->free_instances.first,
- struct kretprobe_instance, hlist);
- hlist_del(&ri->hlist);
- raw_spin_unlock_irqrestore(&rp->lock, flags);
+ if (rp->entry_handler && rp->entry_handler(ri, regs)) {
+ objpool_push(ri, &rph->pool);
+ return 0;
+ }
- ri->rp = rp;
- ri->task = current;
+ arch_prepare_kretprobe(ri, regs);
- if (rp->entry_handler && rp->entry_handler(ri, regs)) {
- raw_spin_lock_irqsave(&rp->lock, flags);
- hlist_add_head(&ri->hlist, &rp->free_instances);
- raw_spin_unlock_irqrestore(&rp->lock, flags);
- return 0;
- }
+ __llist_add(&ri->llist, &current->kretprobe_instances);
- arch_prepare_kretprobe(ri, regs);
+ return 0;
+}
+NOKPROBE_SYMBOL(pre_handler_kretprobe);
+#else /* CONFIG_KRETPROBE_ON_RETHOOK */
+/*
+ * This kprobe pre_handler is registered with every kretprobe. When probe
+ * hits it will set up the return probe.
+ */
+static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
+{
+ struct kretprobe *rp = container_of(p, struct kretprobe, kp);
+ struct kretprobe_instance *ri;
+ struct rethook_node *rhn;
- /* XXX(hch): why is there no hlist_move_head? */
- INIT_HLIST_NODE(&ri->hlist);
- kretprobe_table_lock(hash, &flags);
- hlist_add_head(&ri->hlist, &kretprobe_inst_table[hash]);
- kretprobe_table_unlock(hash, &flags);
- } else {
+ rhn = rethook_try_get(rp->rh);
+ if (!rhn) {
rp->nmissed++;
- raw_spin_unlock_irqrestore(&rp->lock, flags);
+ return 0;
}
+
+ ri = container_of(rhn, struct kretprobe_instance, node);
+
+ if (rp->entry_handler && rp->entry_handler(ri, regs))
+ rethook_recycle(rhn);
+ else
+ rethook_hook(rhn, regs, kprobe_ftrace(p));
+
return 0;
}
NOKPROBE_SYMBOL(pre_handler_kretprobe);
+static void kretprobe_rethook_handler(struct rethook_node *rh, void *data,
+ unsigned long ret_addr,
+ struct pt_regs *regs)
+{
+ struct kretprobe *rp = (struct kretprobe *)data;
+ struct kretprobe_instance *ri;
+ struct kprobe_ctlblk *kcb;
+
+ /* The data must NOT be null. This means rethook data structure is broken. */
+ if (WARN_ON_ONCE(!data) || !rp->handler)
+ return;
+
+ __this_cpu_write(current_kprobe, &rp->kp);
+ kcb = get_kprobe_ctlblk();
+ kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+
+ ri = container_of(rh, struct kretprobe_instance, node);
+ rp->handler(ri, regs);
+
+ __this_cpu_write(current_kprobe, NULL);
+}
+NOKPROBE_SYMBOL(kretprobe_rethook_handler);
+
+#endif /* !CONFIG_KRETPROBE_ON_RETHOOK */
+
+/**
+ * kprobe_on_func_entry() -- check whether given address is function entry
+ * @addr: Target address
+ * @sym: Target symbol name
+ * @offset: The offset from the symbol or the address
+ *
+ * This checks whether the given @addr+@offset or @sym+@offset is on the
+ * function entry address or not.
+ * This returns 0 if it is the function entry, or -EINVAL if it is not.
+ * And also it returns -ENOENT if it fails the symbol or address lookup.
+ * Caller must pass @addr or @sym (either one must be NULL), or this
+ * returns -EINVAL.
+ */
+int kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset)
+{
+ bool on_func_entry;
+ kprobe_opcode_t *kp_addr = _kprobe_addr(addr, sym, offset, &on_func_entry);
+
+ if (IS_ERR(kp_addr))
+ return PTR_ERR(kp_addr);
+
+ if (!on_func_entry)
+ return -EINVAL;
+
+ return 0;
+}
+
int register_kretprobe(struct kretprobe *rp)
{
- int ret = 0;
- struct kretprobe_instance *inst;
+ int ret;
int i;
void *addr;
+ ret = kprobe_on_func_entry(rp->kp.addr, rp->kp.symbol_name, rp->kp.offset);
+ if (ret)
+ return ret;
+
+ /* If only 'rp->kp.addr' is specified, check reregistering kprobes */
+ if (rp->kp.addr && warn_kprobe_rereg(&rp->kp))
+ return -EINVAL;
+
if (kretprobe_blacklist_size) {
addr = kprobe_addr(&rp->kp);
if (IS_ERR(addr))
@@ -1852,37 +2196,49 @@ int register_kretprobe(struct kretprobe *rp)
}
}
+ if (rp->data_size > KRETPROBE_MAX_DATA_SIZE)
+ return -E2BIG;
+
rp->kp.pre_handler = pre_handler_kretprobe;
rp->kp.post_handler = NULL;
- rp->kp.fault_handler = NULL;
- rp->kp.break_handler = NULL;
/* Pre-allocate memory for max kretprobe instances */
- if (rp->maxactive <= 0) {
-#ifdef CONFIG_PREEMPT
+ if (rp->maxactive <= 0)
rp->maxactive = max_t(unsigned int, 10, 2*num_possible_cpus());
-#else
- rp->maxactive = num_possible_cpus();
-#endif
- }
- raw_spin_lock_init(&rp->lock);
- INIT_HLIST_HEAD(&rp->free_instances);
- for (i = 0; i < rp->maxactive; i++) {
- inst = kmalloc(sizeof(struct kretprobe_instance) +
- rp->data_size, GFP_KERNEL);
- if (inst == NULL) {
- free_rp_inst(rp);
- return -ENOMEM;
- }
- INIT_HLIST_NODE(&inst->hlist);
- hlist_add_head(&inst->hlist, &rp->free_instances);
+
+#ifdef CONFIG_KRETPROBE_ON_RETHOOK
+ rp->rh = rethook_alloc((void *)rp, kretprobe_rethook_handler,
+ sizeof(struct kretprobe_instance) +
+ rp->data_size, rp->maxactive);
+ if (IS_ERR(rp->rh))
+ return PTR_ERR(rp->rh);
+
+ rp->nmissed = 0;
+ /* Establish function entry probe point */
+ ret = register_kprobe(&rp->kp);
+ if (ret != 0) {
+ rethook_free(rp->rh);
+ rp->rh = NULL;
}
+#else /* !CONFIG_KRETPROBE_ON_RETHOOK */
+ rp->rph = kzalloc(sizeof(struct kretprobe_holder), GFP_KERNEL);
+ if (!rp->rph)
+ return -ENOMEM;
+ if (objpool_init(&rp->rph->pool, rp->maxactive, rp->data_size +
+ sizeof(struct kretprobe_instance), GFP_KERNEL,
+ rp->rph, kretprobe_init_inst, kretprobe_fini_pool)) {
+ kfree(rp->rph);
+ rp->rph = NULL;
+ return -ENOMEM;
+ }
+ rcu_assign_pointer(rp->rph->rp, rp);
rp->nmissed = 0;
/* Establish function entry probe point */
ret = register_kprobe(&rp->kp);
if (ret != 0)
free_rp_inst(rp);
+#endif
return ret;
}
EXPORT_SYMBOL_GPL(register_kretprobe);
@@ -1917,17 +2273,25 @@ void unregister_kretprobes(struct kretprobe **rps, int num)
if (num <= 0)
return;
- mutex_lock(&kprobe_mutex);
- for (i = 0; i < num; i++)
+ for (i = 0; i < num; i++) {
+ guard(mutex)(&kprobe_mutex);
+
if (__unregister_kprobe_top(&rps[i]->kp) < 0)
rps[i]->kp.addr = NULL;
- mutex_unlock(&kprobe_mutex);
+#ifdef CONFIG_KRETPROBE_ON_RETHOOK
+ rethook_free(rps[i]->rh);
+#else
+ rcu_assign_pointer(rps[i]->rph->rp, NULL);
+#endif
+ }
- synchronize_sched();
+ synchronize_rcu();
for (i = 0; i < num; i++) {
if (rps[i]->kp.addr) {
__unregister_kprobe_bottom(&rps[i]->kp);
- cleanup_rp_inst(rps[i]);
+#ifndef CONFIG_KRETPROBE_ON_RETHOOK
+ free_rp_inst(rps[i]);
+#endif
}
}
}
@@ -1936,13 +2300,13 @@ EXPORT_SYMBOL_GPL(unregister_kretprobes);
#else /* CONFIG_KRETPROBES */
int register_kretprobe(struct kretprobe *rp)
{
- return -ENOSYS;
+ return -EOPNOTSUPP;
}
EXPORT_SYMBOL_GPL(register_kretprobe);
int register_kretprobes(struct kretprobe **rps, int num)
{
- return -ENOSYS;
+ return -EOPNOTSUPP;
}
EXPORT_SYMBOL_GPL(register_kretprobes);
@@ -1969,16 +2333,25 @@ static void kill_kprobe(struct kprobe *p)
{
struct kprobe *kp;
+ lockdep_assert_held(&kprobe_mutex);
+
+ /*
+ * The module is going away. We should disarm the kprobe which
+ * is using ftrace, because ftrace framework is still available at
+ * 'MODULE_STATE_GOING' notification.
+ */
+ if (kprobe_ftrace(p) && !kprobe_disabled(p) && !kprobes_all_disarmed)
+ disarm_kprobe_ftrace(p);
+
p->flags |= KPROBE_FLAG_GONE;
if (kprobe_aggrprobe(p)) {
/*
* If this is an aggr_kprobe, we have to list all the
* chained probes and mark them GONE.
*/
- list_for_each_entry_rcu(kp, &p->list, list)
+ list_for_each_entry(kp, &p->list, list)
kp->flags |= KPROBE_FLAG_GONE;
p->post_handler = NULL;
- p->break_handler = NULL;
kill_optimized_kprobe(p);
}
/*
@@ -1991,16 +2364,14 @@ static void kill_kprobe(struct kprobe *p)
/* Disable one kprobe */
int disable_kprobe(struct kprobe *kp)
{
- int ret = 0;
+ struct kprobe *p;
- mutex_lock(&kprobe_mutex);
+ guard(mutex)(&kprobe_mutex);
/* Disable this kprobe */
- if (__disable_kprobe(kp) == NULL)
- ret = -EINVAL;
+ p = __disable_kprobe(kp);
- mutex_unlock(&kprobe_mutex);
- return ret;
+ return IS_ERR(p) ? PTR_ERR(p) : 0;
}
EXPORT_SYMBOL_GPL(disable_kprobe);
@@ -2010,42 +2381,104 @@ int enable_kprobe(struct kprobe *kp)
int ret = 0;
struct kprobe *p;
- mutex_lock(&kprobe_mutex);
+ guard(mutex)(&kprobe_mutex);
/* Check whether specified probe is valid. */
p = __get_valid_kprobe(kp);
- if (unlikely(p == NULL)) {
- ret = -EINVAL;
- goto out;
- }
+ if (unlikely(p == NULL))
+ return -EINVAL;
- if (kprobe_gone(kp)) {
+ if (kprobe_gone(kp))
/* This kprobe has gone, we couldn't enable it. */
- ret = -EINVAL;
- goto out;
- }
+ return -EINVAL;
if (p != kp)
kp->flags &= ~KPROBE_FLAG_DISABLED;
if (!kprobes_all_disarmed && kprobe_disabled(p)) {
p->flags &= ~KPROBE_FLAG_DISABLED;
- arm_kprobe(p);
+ ret = arm_kprobe(p);
+ if (ret) {
+ p->flags |= KPROBE_FLAG_DISABLED;
+ if (p != kp)
+ kp->flags |= KPROBE_FLAG_DISABLED;
+ }
}
-out:
- mutex_unlock(&kprobe_mutex);
return ret;
}
EXPORT_SYMBOL_GPL(enable_kprobe);
+/* Caller must NOT call this in usual path. This is only for critical case */
void dump_kprobe(struct kprobe *kp)
{
- printk(KERN_WARNING "Dumping kprobe:\n");
- printk(KERN_WARNING "Name: %s\nAddress: %p\nOffset: %x\n",
- kp->symbol_name, kp->addr, kp->offset);
+ pr_err("Dump kprobe:\n.symbol_name = %s, .offset = %x, .addr = %pS\n",
+ kp->symbol_name, kp->offset, kp->addr);
}
NOKPROBE_SYMBOL(dump_kprobe);
+int kprobe_add_ksym_blacklist(unsigned long entry)
+{
+ struct kprobe_blacklist_entry *ent;
+ unsigned long offset = 0, size = 0;
+
+ if (!kernel_text_address(entry) ||
+ !kallsyms_lookup_size_offset(entry, &size, &offset))
+ return -EINVAL;
+
+ ent = kmalloc(sizeof(*ent), GFP_KERNEL);
+ if (!ent)
+ return -ENOMEM;
+ ent->start_addr = entry;
+ ent->end_addr = entry + size;
+ INIT_LIST_HEAD(&ent->list);
+ list_add_tail(&ent->list, &kprobe_blacklist);
+
+ return (int)size;
+}
+
+/* Add all symbols in given area into kprobe blacklist */
+int kprobe_add_area_blacklist(unsigned long start, unsigned long end)
+{
+ unsigned long entry;
+ int ret = 0;
+
+ for (entry = start; entry < end; entry += ret) {
+ ret = kprobe_add_ksym_blacklist(entry);
+ if (ret < 0)
+ return ret;
+ if (ret == 0) /* In case of alias symbol */
+ ret = 1;
+ }
+ return 0;
+}
+
+int __weak arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value,
+ char *type, char *sym)
+{
+ return -ERANGE;
+}
+
+int kprobe_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
+ char *sym)
+{
+#ifdef __ARCH_WANT_KPROBES_INSN_SLOT
+ if (!kprobe_cache_get_kallsym(&kprobe_insn_slots, &symnum, value, type, sym))
+ return 0;
+#ifdef CONFIG_OPTPROBES
+ if (!kprobe_cache_get_kallsym(&kprobe_optinsn_slots, &symnum, value, type, sym))
+ return 0;
+#endif
+#endif
+ if (!arch_kprobe_get_kallsym(&symnum, value, type, sym))
+ return 0;
+ return -ERANGE;
+}
+
+int __init __weak arch_populate_kprobe_blacklist(void)
+{
+ return 0;
+}
+
/*
* Lookup and populate the kprobe_blacklist.
*
@@ -2057,29 +2490,95 @@ NOKPROBE_SYMBOL(dump_kprobe);
static int __init populate_kprobe_blacklist(unsigned long *start,
unsigned long *end)
{
+ unsigned long entry;
unsigned long *iter;
- struct kprobe_blacklist_entry *ent;
- unsigned long entry, offset = 0, size = 0;
+ int ret;
for (iter = start; iter < end; iter++) {
- entry = arch_deref_entry_point((void *)*iter);
+ entry = (unsigned long)dereference_symbol_descriptor((void *)*iter);
+ ret = kprobe_add_ksym_blacklist(entry);
+ if (ret == -EINVAL)
+ continue;
+ if (ret < 0)
+ return ret;
+ }
+
+ /* Symbols in '__kprobes_text' are blacklisted */
+ ret = kprobe_add_area_blacklist((unsigned long)__kprobes_text_start,
+ (unsigned long)__kprobes_text_end);
+ if (ret)
+ return ret;
- if (!kernel_text_address(entry) ||
- !kallsyms_lookup_size_offset(entry, &size, &offset)) {
- pr_err("Failed to find blacklist at %p\n",
- (void *)entry);
+ /* Symbols in 'noinstr' section are blacklisted */
+ ret = kprobe_add_area_blacklist((unsigned long)__noinstr_text_start,
+ (unsigned long)__noinstr_text_end);
+
+ return ret ? : arch_populate_kprobe_blacklist();
+}
+
+#ifdef CONFIG_MODULES
+/* Remove all symbols in given area from kprobe blacklist */
+static void kprobe_remove_area_blacklist(unsigned long start, unsigned long end)
+{
+ struct kprobe_blacklist_entry *ent, *n;
+
+ list_for_each_entry_safe(ent, n, &kprobe_blacklist, list) {
+ if (ent->start_addr < start || ent->start_addr >= end)
continue;
- }
+ list_del(&ent->list);
+ kfree(ent);
+ }
+}
+
+static void kprobe_remove_ksym_blacklist(unsigned long entry)
+{
+ kprobe_remove_area_blacklist(entry, entry + 1);
+}
+
+static void add_module_kprobe_blacklist(struct module *mod)
+{
+ unsigned long start, end;
+ int i;
- ent = kmalloc(sizeof(*ent), GFP_KERNEL);
- if (!ent)
- return -ENOMEM;
- ent->start_addr = entry;
- ent->end_addr = entry + size;
- INIT_LIST_HEAD(&ent->list);
- list_add_tail(&ent->list, &kprobe_blacklist);
+ if (mod->kprobe_blacklist) {
+ for (i = 0; i < mod->num_kprobe_blacklist; i++)
+ kprobe_add_ksym_blacklist(mod->kprobe_blacklist[i]);
+ }
+
+ start = (unsigned long)mod->kprobes_text_start;
+ if (start) {
+ end = start + mod->kprobes_text_size;
+ kprobe_add_area_blacklist(start, end);
+ }
+
+ start = (unsigned long)mod->noinstr_text_start;
+ if (start) {
+ end = start + mod->noinstr_text_size;
+ kprobe_add_area_blacklist(start, end);
+ }
+}
+
+static void remove_module_kprobe_blacklist(struct module *mod)
+{
+ unsigned long start, end;
+ int i;
+
+ if (mod->kprobe_blacklist) {
+ for (i = 0; i < mod->num_kprobe_blacklist; i++)
+ kprobe_remove_ksym_blacklist(mod->kprobe_blacklist[i]);
+ }
+
+ start = (unsigned long)mod->kprobes_text_start;
+ if (start) {
+ end = start + mod->kprobes_text_size;
+ kprobe_remove_area_blacklist(start, end);
+ }
+
+ start = (unsigned long)mod->noinstr_text_start;
+ if (start) {
+ end = start + mod->noinstr_text_size;
+ kprobe_remove_area_blacklist(start, end);
}
- return 0;
}
/* Module notifier call back, checking kprobes on the module */
@@ -2092,19 +2591,23 @@ static int kprobes_module_callback(struct notifier_block *nb,
unsigned int i;
int checkcore = (val == MODULE_STATE_GOING);
+ guard(mutex)(&kprobe_mutex);
+
+ if (val == MODULE_STATE_COMING)
+ add_module_kprobe_blacklist(mod);
+
if (val != MODULE_STATE_GOING && val != MODULE_STATE_LIVE)
return NOTIFY_DONE;
/*
- * When MODULE_STATE_GOING was notified, both of module .text and
- * .init.text sections would be freed. When MODULE_STATE_LIVE was
- * notified, only .init.text section would be freed. We need to
+ * When 'MODULE_STATE_GOING' was notified, both of module '.text' and
+ * '.init.text' sections would be freed. When 'MODULE_STATE_LIVE' was
+ * notified, only '.init.text' section would be freed. We need to
* disable kprobes which have been inserted in the sections.
*/
- mutex_lock(&kprobe_mutex);
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
- hlist_for_each_entry_rcu(p, head, hlist)
+ hlist_for_each_entry(p, head, hlist)
if (within_module_init((unsigned long)p->addr, mod) ||
(checkcore &&
within_module_core((unsigned long)p->addr, mod))) {
@@ -2112,11 +2615,18 @@ static int kprobes_module_callback(struct notifier_block *nb,
* The vaddr this probe is installed will soon
* be vfreed buy not synced to disk. Hence,
* disarming the breakpoint isn't needed.
+ *
+ * Note, this will also move any optimized probes
+ * that are pending to be removed from their
+ * corresponding lists to the 'freeing_list' and
+ * will not be touched by the delayed
+ * kprobe_optimizer() work handler.
*/
kill_kprobe(p);
}
}
- mutex_unlock(&kprobe_mutex);
+ if (val == MODULE_STATE_GOING)
+ remove_module_kprobe_blacklist(mod);
return NOTIFY_DONE;
}
@@ -2125,85 +2635,119 @@ static struct notifier_block kprobe_module_nb = {
.priority = 0
};
-/* Markers of _kprobe_blacklist section */
-extern unsigned long __start_kprobe_blacklist[];
-extern unsigned long __stop_kprobe_blacklist[];
+static int kprobe_register_module_notifier(void)
+{
+ return register_module_notifier(&kprobe_module_nb);
+}
+#else
+static int kprobe_register_module_notifier(void)
+{
+ return 0;
+}
+#endif /* CONFIG_MODULES */
+
+void kprobe_free_init_mem(void)
+{
+ void *start = (void *)(&__init_begin);
+ void *end = (void *)(&__init_end);
+ struct hlist_head *head;
+ struct kprobe *p;
+ int i;
+
+ guard(mutex)(&kprobe_mutex);
+
+ /* Kill all kprobes on initmem because the target code has been freed. */
+ for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
+ head = &kprobe_table[i];
+ hlist_for_each_entry(p, head, hlist) {
+ if (start <= (void *)p->addr && (void *)p->addr < end)
+ kill_kprobe(p);
+ }
+ }
+}
static int __init init_kprobes(void)
{
- int i, err = 0;
+ int i, err;
/* FIXME allocate the probe table, currently defined statically */
/* initialize all list heads */
- for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
+ for (i = 0; i < KPROBE_TABLE_SIZE; i++)
INIT_HLIST_HEAD(&kprobe_table[i]);
- INIT_HLIST_HEAD(&kretprobe_inst_table[i]);
- raw_spin_lock_init(&(kretprobe_table_locks[i].lock));
- }
err = populate_kprobe_blacklist(__start_kprobe_blacklist,
__stop_kprobe_blacklist);
- if (err) {
- pr_err("kprobes: failed to populate blacklist: %d\n", err);
- pr_err("Please take care of using kprobes.\n");
- }
+ if (err)
+ pr_err("Failed to populate blacklist (error %d), kprobes not restricted, be careful using them!\n", err);
if (kretprobe_blacklist_size) {
/* lookup the function address from its name */
for (i = 0; kretprobe_blacklist[i].name != NULL; i++) {
- kprobe_lookup_name(kretprobe_blacklist[i].name,
- kretprobe_blacklist[i].addr);
+ kretprobe_blacklist[i].addr =
+ kprobe_lookup_name(kretprobe_blacklist[i].name, 0);
if (!kretprobe_blacklist[i].addr)
- printk("kretprobe: lookup failed: %s\n",
+ pr_err("Failed to lookup symbol '%s' for kretprobe blacklist. Maybe the target function is removed or renamed.\n",
kretprobe_blacklist[i].name);
}
}
-#if defined(CONFIG_OPTPROBES)
-#if defined(__ARCH_WANT_KPROBES_INSN_SLOT)
- /* Init kprobe_optinsn_slots */
- kprobe_optinsn_slots.insn_size = MAX_OPTINSN_SIZE;
-#endif
- /* By default, kprobes can be optimized */
- kprobes_allow_optimization = true;
-#endif
-
/* By default, kprobes are armed */
kprobes_all_disarmed = false;
+#if defined(CONFIG_OPTPROBES) && defined(__ARCH_WANT_KPROBES_INSN_SLOT)
+ /* Init 'kprobe_optinsn_slots' for allocation */
+ kprobe_optinsn_slots.insn_size = MAX_OPTINSN_SIZE;
+#endif
+
err = arch_init_kprobes();
if (!err)
err = register_die_notifier(&kprobe_exceptions_nb);
if (!err)
- err = register_module_notifier(&kprobe_module_nb);
+ err = kprobe_register_module_notifier();
kprobes_initialized = (err == 0);
-
- if (!err)
- init_test_probes();
+ kprobe_sysctls_init();
return err;
}
+early_initcall(init_kprobes);
+
+#if defined(CONFIG_OPTPROBES)
+static int __init init_optprobes(void)
+{
+ /*
+ * Enable kprobe optimization - this kicks the optimizer which
+ * depends on synchronize_rcu_tasks() and ksoftirqd, that is
+ * not spawned in early initcall. So delay the optimization.
+ */
+ optimize_all_kprobes();
+
+ return 0;
+}
+subsys_initcall(init_optprobes);
+#endif
#ifdef CONFIG_DEBUG_FS
static void report_probe(struct seq_file *pi, struct kprobe *p,
const char *sym, int offset, char *modname, struct kprobe *pp)
{
char *kprobe_type;
+ void *addr = p->addr;
if (p->pre_handler == pre_handler_kretprobe)
kprobe_type = "r";
- else if (p->pre_handler == setjmp_pre_handler)
- kprobe_type = "j";
else
kprobe_type = "k";
+ if (!kallsyms_show_value(pi->file->f_cred))
+ addr = NULL;
+
if (sym)
- seq_printf(pi, "%p %s %s+0x%x %s ",
- p->addr, kprobe_type, sym, offset,
+ seq_printf(pi, "%px %s %s+0x%x %s ",
+ addr, kprobe_type, sym, offset,
(modname ? modname : " "));
- else
- seq_printf(pi, "%p %s %p ",
- p->addr, kprobe_type, p->addr);
+ else /* try to use %pS */
+ seq_printf(pi, "%px %s %pS ",
+ addr, kprobe_type, p->addr);
if (!pp)
pp = p;
@@ -2236,7 +2780,7 @@ static int show_kprobe_addr(struct seq_file *pi, void *v)
{
struct hlist_head *head;
struct kprobe *p, *kp;
- const char *sym = NULL;
+ const char *sym;
unsigned int i = *(loff_t *) v;
unsigned long offset = 0;
char *modname, namebuf[KSYM_NAME_LEN];
@@ -2256,28 +2800,19 @@ static int show_kprobe_addr(struct seq_file *pi, void *v)
return 0;
}
-static const struct seq_operations kprobes_seq_ops = {
+static const struct seq_operations kprobes_sops = {
.start = kprobe_seq_start,
.next = kprobe_seq_next,
.stop = kprobe_seq_stop,
.show = show_kprobe_addr
};
-static int kprobes_open(struct inode *inode, struct file *filp)
-{
- return seq_open(filp, &kprobes_seq_ops);
-}
-
-static const struct file_operations debugfs_kprobes_operations = {
- .open = kprobes_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
+DEFINE_SEQ_ATTRIBUTE(kprobes);
/* kprobes/blacklist -- shows which functions can not be probed */
static void *kprobe_blacklist_seq_start(struct seq_file *m, loff_t *pos)
{
+ mutex_lock(&kprobe_mutex);
return seq_list_start(&kprobe_blacklist, *pos);
}
@@ -2291,41 +2826,44 @@ static int kprobe_blacklist_seq_show(struct seq_file *m, void *v)
struct kprobe_blacklist_entry *ent =
list_entry(v, struct kprobe_blacklist_entry, list);
- seq_printf(m, "0x%p-0x%p\t%ps\n", (void *)ent->start_addr,
- (void *)ent->end_addr, (void *)ent->start_addr);
+ /*
+ * If '/proc/kallsyms' is not showing kernel address, we won't
+ * show them here either.
+ */
+ if (!kallsyms_show_value(m->file->f_cred))
+ seq_printf(m, "0x%px-0x%px\t%ps\n", NULL, NULL,
+ (void *)ent->start_addr);
+ else
+ seq_printf(m, "0x%px-0x%px\t%ps\n", (void *)ent->start_addr,
+ (void *)ent->end_addr, (void *)ent->start_addr);
return 0;
}
-static const struct seq_operations kprobe_blacklist_seq_ops = {
- .start = kprobe_blacklist_seq_start,
- .next = kprobe_blacklist_seq_next,
- .stop = kprobe_seq_stop, /* Reuse void function */
- .show = kprobe_blacklist_seq_show,
-};
-
-static int kprobe_blacklist_open(struct inode *inode, struct file *filp)
+static void kprobe_blacklist_seq_stop(struct seq_file *f, void *v)
{
- return seq_open(filp, &kprobe_blacklist_seq_ops);
+ mutex_unlock(&kprobe_mutex);
}
-static const struct file_operations debugfs_kprobe_blacklist_ops = {
- .open = kprobe_blacklist_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
+static const struct seq_operations kprobe_blacklist_sops = {
+ .start = kprobe_blacklist_seq_start,
+ .next = kprobe_blacklist_seq_next,
+ .stop = kprobe_blacklist_seq_stop,
+ .show = kprobe_blacklist_seq_show,
};
+DEFINE_SEQ_ATTRIBUTE(kprobe_blacklist);
-static void arm_all_kprobes(void)
+static int arm_all_kprobes(void)
{
struct hlist_head *head;
struct kprobe *p;
- unsigned int i;
+ unsigned int i, total = 0, errors = 0;
+ int err, ret = 0;
- mutex_lock(&kprobe_mutex);
+ guard(mutex)(&kprobe_mutex);
/* If kprobes are armed, just return */
if (!kprobes_all_disarmed)
- goto already_enabled;
+ return 0;
/*
* optimize_kprobe() called by arm_kprobe() checks
@@ -2336,46 +2874,67 @@ static void arm_all_kprobes(void)
/* Arming kprobes doesn't optimize kprobe itself */
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
- hlist_for_each_entry_rcu(p, head, hlist)
- if (!kprobe_disabled(p))
- arm_kprobe(p);
+ /* Arm all kprobes on a best-effort basis */
+ hlist_for_each_entry(p, head, hlist) {
+ if (!kprobe_disabled(p)) {
+ err = arm_kprobe(p);
+ if (err) {
+ errors++;
+ ret = err;
+ }
+ total++;
+ }
+ }
}
- printk(KERN_INFO "Kprobes globally enabled\n");
+ if (errors)
+ pr_warn("Kprobes globally enabled, but failed to enable %d out of %d probes. Please check which kprobes are kept disabled via debugfs.\n",
+ errors, total);
+ else
+ pr_info("Kprobes globally enabled\n");
-already_enabled:
- mutex_unlock(&kprobe_mutex);
- return;
+ return ret;
}
-static void disarm_all_kprobes(void)
+static int disarm_all_kprobes(void)
{
struct hlist_head *head;
struct kprobe *p;
- unsigned int i;
+ unsigned int i, total = 0, errors = 0;
+ int err, ret = 0;
- mutex_lock(&kprobe_mutex);
+ guard(mutex)(&kprobe_mutex);
/* If kprobes are already disarmed, just return */
- if (kprobes_all_disarmed) {
- mutex_unlock(&kprobe_mutex);
- return;
- }
+ if (kprobes_all_disarmed)
+ return 0;
kprobes_all_disarmed = true;
- printk(KERN_INFO "Kprobes globally disabled\n");
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
- hlist_for_each_entry_rcu(p, head, hlist) {
- if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p))
- disarm_kprobe(p, false);
+ /* Disarm all kprobes on a best-effort basis */
+ hlist_for_each_entry(p, head, hlist) {
+ if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) {
+ err = disarm_kprobe(p, false);
+ if (err) {
+ errors++;
+ ret = err;
+ }
+ total++;
+ }
}
}
- mutex_unlock(&kprobe_mutex);
+
+ if (errors)
+ pr_warn("Kprobes globally disabled, but failed to disable %d out of %d probes. Please check which kprobes are kept enabled via debugfs.\n",
+ errors, total);
+ else
+ pr_info("Kprobes globally disabled\n");
/* Wait for disarming all kprobes by optimizer */
- wait_for_kprobe_optimizer();
+ wait_for_kprobe_optimizer_locked();
+ return ret;
}
/*
@@ -2400,28 +2959,16 @@ static ssize_t read_enabled_file_bool(struct file *file,
static ssize_t write_enabled_file_bool(struct file *file,
const char __user *user_buf, size_t count, loff_t *ppos)
{
- char buf[32];
- size_t buf_size;
+ bool enable;
+ int ret;
- buf_size = min(count, (sizeof(buf)-1));
- if (copy_from_user(buf, user_buf, buf_size))
- return -EFAULT;
+ ret = kstrtobool_from_user(user_buf, count, &enable);
+ if (ret)
+ return ret;
- buf[buf_size] = '\0';
- switch (buf[0]) {
- case 'y':
- case 'Y':
- case '1':
- arm_all_kprobes();
- break;
- case 'n':
- case 'N':
- case '0':
- disarm_all_kprobes();
- break;
- default:
- return -EINVAL;
- }
+ ret = enable ? arm_all_kprobes() : disarm_all_kprobes();
+ if (ret)
+ return ret;
return count;
}
@@ -2434,39 +2981,19 @@ static const struct file_operations fops_kp = {
static int __init debugfs_kprobe_init(void)
{
- struct dentry *dir, *file;
- unsigned int value = 1;
+ struct dentry *dir;
dir = debugfs_create_dir("kprobes", NULL);
- if (!dir)
- return -ENOMEM;
- file = debugfs_create_file("list", 0444, dir, NULL,
- &debugfs_kprobes_operations);
- if (!file)
- goto error;
+ debugfs_create_file("list", 0400, dir, NULL, &kprobes_fops);
- file = debugfs_create_file("enabled", 0600, dir,
- &value, &fops_kp);
- if (!file)
- goto error;
+ debugfs_create_file("enabled", 0600, dir, NULL, &fops_kp);
- file = debugfs_create_file("blacklist", 0444, dir, NULL,
- &debugfs_kprobe_blacklist_ops);
- if (!file)
- goto error;
+ debugfs_create_file("blacklist", 0400, dir, NULL,
+ &kprobe_blacklist_fops);
return 0;
-
-error:
- debugfs_remove(dir);
- return -ENOMEM;
}
late_initcall(debugfs_kprobe_init);
#endif /* CONFIG_DEBUG_FS */
-
-module_init(init_kprobes);
-
-/* defined in arch/.../kernel/kprobes.c */
-EXPORT_SYMBOL_GPL(jprobe_return);
diff --git a/kernel/kstack_erase.c b/kernel/kstack_erase.c
new file mode 100644
index 000000000000..d4449884084c
--- /dev/null
+++ b/kernel/kstack_erase.c
@@ -0,0 +1,177 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This code fills the used part of the kernel stack with a poison value
+ * before returning to userspace. It's part of the STACKLEAK feature
+ * ported from grsecurity/PaX.
+ *
+ * Author: Alexander Popov <alex.popov@linux.com>
+ *
+ * KSTACK_ERASE reduces the information which kernel stack leak bugs can
+ * reveal and blocks some uninitialized stack variable attacks.
+ */
+
+#include <linux/kstack_erase.h>
+#include <linux/kprobes.h>
+
+#ifdef CONFIG_KSTACK_ERASE_RUNTIME_DISABLE
+#include <linux/jump_label.h>
+#include <linux/string_choices.h>
+#include <linux/sysctl.h>
+#include <linux/init.h>
+
+static DEFINE_STATIC_KEY_FALSE(stack_erasing_bypass);
+
+#ifdef CONFIG_SYSCTL
+static int stack_erasing_sysctl(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret = 0;
+ int state = !static_branch_unlikely(&stack_erasing_bypass);
+ int prev_state = state;
+ struct ctl_table table_copy = *table;
+
+ table_copy.data = &state;
+ ret = proc_dointvec_minmax(&table_copy, write, buffer, lenp, ppos);
+ state = !!state;
+ if (ret || !write || state == prev_state)
+ return ret;
+
+ if (state)
+ static_branch_disable(&stack_erasing_bypass);
+ else
+ static_branch_enable(&stack_erasing_bypass);
+
+ pr_warn("stackleak: kernel stack erasing is %s\n",
+ str_enabled_disabled(state));
+ return ret;
+}
+static const struct ctl_table stackleak_sysctls[] = {
+ {
+ .procname = "stack_erasing",
+ .data = NULL,
+ .maxlen = sizeof(int),
+ .mode = 0600,
+ .proc_handler = stack_erasing_sysctl,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+};
+
+static int __init stackleak_sysctls_init(void)
+{
+ register_sysctl_init("kernel", stackleak_sysctls);
+ return 0;
+}
+late_initcall(stackleak_sysctls_init);
+#endif /* CONFIG_SYSCTL */
+
+#define skip_erasing() static_branch_unlikely(&stack_erasing_bypass)
+#else
+#define skip_erasing() false
+#endif /* CONFIG_KSTACK_ERASE_RUNTIME_DISABLE */
+
+#ifndef __stackleak_poison
+static __always_inline void __stackleak_poison(unsigned long erase_low,
+ unsigned long erase_high,
+ unsigned long poison)
+{
+ while (erase_low < erase_high) {
+ *(unsigned long *)erase_low = poison;
+ erase_low += sizeof(unsigned long);
+ }
+}
+#endif
+
+static __always_inline void __stackleak_erase(bool on_task_stack)
+{
+ const unsigned long task_stack_low = stackleak_task_low_bound(current);
+ const unsigned long task_stack_high = stackleak_task_high_bound(current);
+ unsigned long erase_low, erase_high;
+
+ erase_low = stackleak_find_top_of_poison(task_stack_low,
+ current->lowest_stack);
+
+#ifdef CONFIG_KSTACK_ERASE_METRICS
+ current->prev_lowest_stack = erase_low;
+#endif
+
+ /*
+ * Write poison to the task's stack between 'erase_low' and
+ * 'erase_high'.
+ *
+ * If we're running on a different stack (e.g. an entry trampoline
+ * stack) we can erase everything below the pt_regs at the top of the
+ * task stack.
+ *
+ * If we're running on the task stack itself, we must not clobber any
+ * stack used by this function and its caller. We assume that this
+ * function has a fixed-size stack frame, and the current stack pointer
+ * doesn't change while we write poison.
+ */
+ if (on_task_stack)
+ erase_high = current_stack_pointer;
+ else
+ erase_high = task_stack_high;
+
+ __stackleak_poison(erase_low, erase_high, KSTACK_ERASE_POISON);
+
+ /* Reset the 'lowest_stack' value for the next syscall */
+ current->lowest_stack = task_stack_high;
+}
+
+/*
+ * Erase and poison the portion of the task stack used since the last erase.
+ * Can be called from the task stack or an entry stack when the task stack is
+ * no longer in use.
+ */
+asmlinkage void noinstr stackleak_erase(void)
+{
+ if (skip_erasing())
+ return;
+
+ __stackleak_erase(on_thread_stack());
+}
+
+/*
+ * Erase and poison the portion of the task stack used since the last erase.
+ * Can only be called from the task stack.
+ */
+asmlinkage void noinstr stackleak_erase_on_task_stack(void)
+{
+ if (skip_erasing())
+ return;
+
+ __stackleak_erase(true);
+}
+
+/*
+ * Erase and poison the portion of the task stack used since the last erase.
+ * Can only be called from a stack other than the task stack.
+ */
+asmlinkage void noinstr stackleak_erase_off_task_stack(void)
+{
+ if (skip_erasing())
+ return;
+
+ __stackleak_erase(false);
+}
+
+void __used __no_caller_saved_registers noinstr __sanitizer_cov_stack_depth(void)
+{
+ unsigned long sp = current_stack_pointer;
+
+ /*
+ * Having CONFIG_KSTACK_ERASE_TRACK_MIN_SIZE larger than
+ * KSTACK_ERASE_SEARCH_DEPTH makes the poison search in
+ * stackleak_erase() unreliable. Let's prevent that.
+ */
+ BUILD_BUG_ON(CONFIG_KSTACK_ERASE_TRACK_MIN_SIZE > KSTACK_ERASE_SEARCH_DEPTH);
+
+ /* 'lowest_stack' should be aligned on the register width boundary */
+ sp = ALIGN(sp, sizeof(unsigned long));
+ if (sp < current->lowest_stack &&
+ sp >= stackleak_task_low_bound(current)) {
+ current->lowest_stack = sp;
+ }
+}
+EXPORT_SYMBOL(__sanitizer_cov_stack_depth);
diff --git a/kernel/ksyms_common.c b/kernel/ksyms_common.c
new file mode 100644
index 000000000000..cf1a73cbf2f6
--- /dev/null
+++ b/kernel/ksyms_common.c
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * ksyms_common.c: A split of kernel/kallsyms.c
+ * Contains a few generic function definations independent of config KALLSYMS.
+ */
+#include <linux/kallsyms.h>
+#include <linux/security.h>
+
+static inline int kallsyms_for_perf(void)
+{
+#ifdef CONFIG_PERF_EVENTS
+ extern int sysctl_perf_event_paranoid;
+
+ if (sysctl_perf_event_paranoid <= 1)
+ return 1;
+#endif
+ return 0;
+}
+
+/*
+ * We show kallsyms information even to normal users if we've enabled
+ * kernel profiling and are explicitly not paranoid (so kptr_restrict
+ * is clear, and sysctl_perf_event_paranoid isn't set).
+ *
+ * Otherwise, require CAP_SYSLOG (assuming kptr_restrict isn't set to
+ * block even that).
+ */
+bool kallsyms_show_value(const struct cred *cred)
+{
+ switch (kptr_restrict) {
+ case 0:
+ if (kallsyms_for_perf())
+ return true;
+ fallthrough;
+ case 1:
+ if (security_capable(cred, &init_user_ns, CAP_SYSLOG,
+ CAP_OPT_NOAUDIT) == 0)
+ return true;
+ fallthrough;
+ default:
+ return false;
+ }
+}
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 6683ccef9fff..a9e6354d9e25 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -1,48 +1,70 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kernel/ksysfs.c - sysfs attributes in /sys/kernel, which
* are not related to any other subsystem
*
* Copyright (C) 2004 Kay Sievers <kay.sievers@vrfy.org>
- *
- * This file is release under the GPLv2
- *
*/
+#include <asm/byteorder.h>
#include <linux/kobject.h>
#include <linux/string.h>
#include <linux/sysfs.h>
#include <linux/export.h>
#include <linux/init.h>
-#include <linux/kexec.h>
+#include <linux/vmcore_info.h>
#include <linux/profile.h>
#include <linux/stat.h>
#include <linux/sched.h>
#include <linux/capability.h>
#include <linux/compiler.h>
-#include <linux/rcupdate.h> /* rcu_expedited */
+#include <linux/rcupdate.h> /* rcu_expedited and rcu_normal */
+
+#if defined(__LITTLE_ENDIAN)
+#define CPU_BYTEORDER_STRING "little"
+#elif defined(__BIG_ENDIAN)
+#define CPU_BYTEORDER_STRING "big"
+#else
+#error Unknown byteorder
+#endif
#define KERNEL_ATTR_RO(_name) \
static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
#define KERNEL_ATTR_RW(_name) \
-static struct kobj_attribute _name##_attr = \
- __ATTR(_name, 0644, _name##_show, _name##_store)
+static struct kobj_attribute _name##_attr = __ATTR_RW(_name)
/* current uevent sequence number */
static ssize_t uevent_seqnum_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%llu\n", (unsigned long long)uevent_seqnum);
+ return sysfs_emit(buf, "%llu\n", (u64)atomic64_read(&uevent_seqnum));
}
KERNEL_ATTR_RO(uevent_seqnum);
+/* cpu byteorder */
+static ssize_t cpu_byteorder_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%s\n", CPU_BYTEORDER_STRING);
+}
+KERNEL_ATTR_RO(cpu_byteorder);
+
+/* address bits */
+static ssize_t address_bits_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%zu\n", sizeof(void *) * 8 /* CHAR_BIT */);
+}
+KERNEL_ATTR_RO(address_bits);
+
#ifdef CONFIG_UEVENT_HELPER
/* uevent helper program, used during early boot */
static ssize_t uevent_helper_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%s\n", uevent_helper);
+ return sysfs_emit(buf, "%s\n", uevent_helper);
}
static ssize_t uevent_helper_store(struct kobject *kobj,
struct kobj_attribute *attr,
@@ -63,14 +85,21 @@ KERNEL_ATTR_RW(uevent_helper);
static ssize_t profiling_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%d\n", prof_on);
+ return sysfs_emit(buf, "%d\n", prof_on);
}
static ssize_t profiling_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
int ret;
+ static DEFINE_MUTEX(lock);
+ /*
+ * We need serialization, for profile_setup() initializes prof_on
+ * value and profile_init() must not reallocate prof_buffer after
+ * once allocated.
+ */
+ guard(mutex)(&lock);
if (prof_on)
return -EEXIST;
/*
@@ -90,65 +119,33 @@ static ssize_t profiling_store(struct kobject *kobj,
KERNEL_ATTR_RW(profiling);
#endif
-#ifdef CONFIG_KEXEC
-static ssize_t kexec_loaded_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- return sprintf(buf, "%d\n", !!kexec_image);
-}
-KERNEL_ATTR_RO(kexec_loaded);
-
-static ssize_t kexec_crash_loaded_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- return sprintf(buf, "%d\n", !!kexec_crash_image);
-}
-KERNEL_ATTR_RO(kexec_crash_loaded);
-
-static ssize_t kexec_crash_size_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- return sprintf(buf, "%zu\n", crash_get_memory_size());
-}
-static ssize_t kexec_crash_size_store(struct kobject *kobj,
- struct kobj_attribute *attr,
- const char *buf, size_t count)
-{
- unsigned long cnt;
- int ret;
-
- if (kstrtoul(buf, 0, &cnt))
- return -EINVAL;
-
- ret = crash_shrink_memory(cnt);
- return ret < 0 ? ret : count;
-}
-KERNEL_ATTR_RW(kexec_crash_size);
+#ifdef CONFIG_VMCORE_INFO
static ssize_t vmcoreinfo_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%lx %x\n",
- paddr_vmcoreinfo_note(),
- (unsigned int)sizeof(vmcoreinfo_note));
+ phys_addr_t vmcore_base = paddr_vmcoreinfo_note();
+ return sysfs_emit(buf, "%pa %x\n", &vmcore_base,
+ (unsigned int)VMCOREINFO_NOTE_SIZE);
}
KERNEL_ATTR_RO(vmcoreinfo);
-#endif /* CONFIG_KEXEC */
+#endif /* CONFIG_VMCORE_INFO */
/* whether file capabilities are enabled */
static ssize_t fscaps_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%d\n", file_caps_enabled);
+ return sysfs_emit(buf, "%d\n", file_caps_enabled);
}
KERNEL_ATTR_RO(fscaps);
+#ifndef CONFIG_TINY_RCU
int rcu_expedited;
static ssize_t rcu_expedited_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%d\n", rcu_expedited);
+ return sysfs_emit(buf, "%d\n", READ_ONCE(rcu_expedited));
}
static ssize_t rcu_expedited_store(struct kobject *kobj,
struct kobj_attribute *attr,
@@ -161,28 +158,32 @@ static ssize_t rcu_expedited_store(struct kobject *kobj,
}
KERNEL_ATTR_RW(rcu_expedited);
+int rcu_normal;
+static ssize_t rcu_normal_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", READ_ONCE(rcu_normal));
+}
+static ssize_t rcu_normal_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ if (kstrtoint(buf, 0, &rcu_normal))
+ return -EINVAL;
+
+ return count;
+}
+KERNEL_ATTR_RW(rcu_normal);
+#endif /* #ifndef CONFIG_TINY_RCU */
+
/*
* Make /sys/kernel/notes give the raw contents of our kernel .notes section.
*/
-extern const void __start_notes __weak;
-extern const void __stop_notes __weak;
+extern const void __start_notes;
+extern const void __stop_notes;
#define notes_size (&__stop_notes - &__start_notes)
-static ssize_t notes_read(struct file *filp, struct kobject *kobj,
- struct bin_attribute *bin_attr,
- char *buf, loff_t off, size_t count)
-{
- memcpy(buf, &__start_notes + off, count);
- return count;
-}
-
-static struct bin_attribute notes_attr = {
- .attr = {
- .name = "notes",
- .mode = S_IRUGO,
- },
- .read = &notes_read,
-};
+static __ro_after_init BIN_ATTR_SIMPLE_RO(notes);
struct kobject *kernel_kobj;
EXPORT_SYMBOL_GPL(kernel_kobj);
@@ -190,23 +191,25 @@ EXPORT_SYMBOL_GPL(kernel_kobj);
static struct attribute * kernel_attrs[] = {
&fscaps_attr.attr,
&uevent_seqnum_attr.attr,
+ &cpu_byteorder_attr.attr,
+ &address_bits_attr.attr,
#ifdef CONFIG_UEVENT_HELPER
&uevent_helper_attr.attr,
#endif
#ifdef CONFIG_PROFILING
&profiling_attr.attr,
#endif
-#ifdef CONFIG_KEXEC
- &kexec_loaded_attr.attr,
- &kexec_crash_loaded_attr.attr,
- &kexec_crash_size_attr.attr,
+#ifdef CONFIG_VMCORE_INFO
&vmcoreinfo_attr.attr,
#endif
+#ifndef CONFIG_TINY_RCU
&rcu_expedited_attr.attr,
+ &rcu_normal_attr.attr,
+#endif
NULL
};
-static struct attribute_group kernel_attr_group = {
+static const struct attribute_group kernel_attr_group = {
.attrs = kernel_attrs,
};
@@ -224,8 +227,9 @@ static int __init ksysfs_init(void)
goto kset_exit;
if (notes_size > 0) {
- notes_attr.size = notes_size;
- error = sysfs_create_bin_file(kernel_kobj, &notes_attr);
+ bin_attr_notes.private = (void *)&__start_notes;
+ bin_attr_notes.size = notes_size;
+ error = sysfs_create_bin_file(kernel_kobj, &bin_attr_notes);
if (error)
goto group_exit;
}
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 10e489c448fe..99a3808d086f 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -1,14 +1,22 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Kernel thread helper functions.
* Copyright (C) 2004 IBM Corporation, Rusty Russell.
+ * Copyright (C) 2009 Red Hat, Inc.
*
* Creation is done via kthreadd, so that we get a clean environment
* even if we're invoked from userspace (think modprobe, hotplug cpu,
* etc.).
*/
+#include <uapi/linux/sched/types.h>
+#include <linux/mm.h>
+#include <linux/mmu_context.h>
#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/task.h>
#include <linux/kthread.h>
#include <linux/completion.h>
#include <linux/err.h>
+#include <linux/cgroup.h>
#include <linux/cpuset.h>
#include <linux/unistd.h>
#include <linux/file.h>
@@ -18,15 +26,22 @@
#include <linux/freezer.h>
#include <linux/ptrace.h>
#include <linux/uaccess.h>
+#include <linux/numa.h>
+#include <linux/sched/isolation.h>
#include <trace/events/sched.h>
+
static DEFINE_SPINLOCK(kthread_create_lock);
static LIST_HEAD(kthread_create_list);
struct task_struct *kthreadd_task;
+static LIST_HEAD(kthreads_hotplug);
+static DEFINE_MUTEX(kthreads_hotplug_lock);
+
struct kthread_create_info
{
/* Information passed to kthread() from kthreadd. */
+ char *full_name;
int (*threadfn)(void *data);
void *data;
int node;
@@ -41,32 +56,104 @@ struct kthread_create_info
struct kthread {
unsigned long flags;
unsigned int cpu;
+ unsigned int node;
+ int started;
+ int result;
+ int (*threadfn)(void *);
void *data;
struct completion parked;
struct completion exited;
+#ifdef CONFIG_BLK_CGROUP
+ struct cgroup_subsys_state *blkcg_css;
+#endif
+ /* To store the full name if task comm is truncated. */
+ char *full_name;
+ struct task_struct *task;
+ struct list_head hotplug_node;
+ struct cpumask *preferred_affinity;
};
enum KTHREAD_BITS {
KTHREAD_IS_PER_CPU = 0,
KTHREAD_SHOULD_STOP,
KTHREAD_SHOULD_PARK,
- KTHREAD_IS_PARKED,
};
-#define __to_kthread(vfork) \
- container_of(vfork, struct kthread, exited)
-
static inline struct kthread *to_kthread(struct task_struct *k)
{
- return __to_kthread(k->vfork_done);
+ WARN_ON(!(k->flags & PF_KTHREAD));
+ return k->worker_private;
+}
+
+/*
+ * Variant of to_kthread() that doesn't assume @p is a kthread.
+ *
+ * When "(p->flags & PF_KTHREAD)" is set the task is a kthread and will
+ * always remain a kthread. For kthreads p->worker_private always
+ * points to a struct kthread. For tasks that are not kthreads
+ * p->worker_private is used to point to other things.
+ *
+ * Return NULL for any task that is not a kthread.
+ */
+static inline struct kthread *__to_kthread(struct task_struct *p)
+{
+ void *kthread = p->worker_private;
+ if (kthread && !(p->flags & PF_KTHREAD))
+ kthread = NULL;
+ return kthread;
}
-static struct kthread *to_live_kthread(struct task_struct *k)
+void get_kthread_comm(char *buf, size_t buf_size, struct task_struct *tsk)
{
- struct completion *vfork = ACCESS_ONCE(k->vfork_done);
- if (likely(vfork))
- return __to_kthread(vfork);
- return NULL;
+ struct kthread *kthread = to_kthread(tsk);
+
+ if (!kthread || !kthread->full_name) {
+ strscpy(buf, tsk->comm, buf_size);
+ return;
+ }
+
+ strscpy_pad(buf, kthread->full_name, buf_size);
+}
+
+bool set_kthread_struct(struct task_struct *p)
+{
+ struct kthread *kthread;
+
+ if (WARN_ON_ONCE(to_kthread(p)))
+ return false;
+
+ kthread = kzalloc(sizeof(*kthread), GFP_KERNEL);
+ if (!kthread)
+ return false;
+
+ init_completion(&kthread->exited);
+ init_completion(&kthread->parked);
+ INIT_LIST_HEAD(&kthread->hotplug_node);
+ p->vfork_done = &kthread->exited;
+
+ kthread->task = p;
+ kthread->node = tsk_fork_get_node(current);
+ p->worker_private = kthread;
+ return true;
+}
+
+void free_kthread_struct(struct task_struct *k)
+{
+ struct kthread *kthread;
+
+ /*
+ * Can be NULL if kmalloc() in set_kthread_struct() failed.
+ */
+ kthread = to_kthread(k);
+ if (!kthread)
+ return;
+
+#ifdef CONFIG_BLK_CGROUP
+ WARN_ON_ONCE(kthread->blkcg_css);
+#endif
+ k->worker_private = NULL;
+ kfree(kthread->full_name);
+ kfree(kthread);
}
/**
@@ -82,6 +169,11 @@ bool kthread_should_stop(void)
}
EXPORT_SYMBOL(kthread_should_stop);
+static bool __kthread_should_park(struct task_struct *k)
+{
+ return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(k)->flags);
+}
+
/**
* kthread_should_park - should this kthread park now?
*
@@ -95,7 +187,18 @@ EXPORT_SYMBOL(kthread_should_stop);
*/
bool kthread_should_park(void)
{
- return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(current)->flags);
+ return __kthread_should_park(current);
+}
+EXPORT_SYMBOL_GPL(kthread_should_park);
+
+bool kthread_should_stop_or_park(void)
+{
+ struct kthread *kthread = __to_kthread(current);
+
+ if (!kthread)
+ return false;
+
+ return kthread->flags & (BIT(KTHREAD_SHOULD_STOP) | BIT(KTHREAD_SHOULD_PARK));
}
/**
@@ -124,6 +227,21 @@ bool kthread_freezable_should_stop(bool *was_frozen)
EXPORT_SYMBOL_GPL(kthread_freezable_should_stop);
/**
+ * kthread_func - return the function specified on kthread creation
+ * @task: kthread task in question
+ *
+ * Returns NULL if the task is not a kthread.
+ */
+void *kthread_func(struct task_struct *task)
+{
+ struct kthread *kthread = __to_kthread(task);
+ if (kthread)
+ return kthread->threadfn;
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(kthread_func);
+
+/**
* kthread_data - return data value specified on kthread creation
* @task: kthread task in question
*
@@ -135,9 +253,10 @@ void *kthread_data(struct task_struct *task)
{
return to_kthread(task)->data;
}
+EXPORT_SYMBOL_GPL(kthread_data);
/**
- * probe_kthread_data - speculative version of kthread_data()
+ * kthread_probe_data - speculative version of kthread_data()
* @task: possible kthread task in question
*
* @task could be a kthread task. Return the data value specified when it
@@ -145,25 +264,42 @@ void *kthread_data(struct task_struct *task)
* inaccessible for any reason, %NULL is returned. This function requires
* that @task itself is safe to dereference.
*/
-void *probe_kthread_data(struct task_struct *task)
+void *kthread_probe_data(struct task_struct *task)
{
- struct kthread *kthread = to_kthread(task);
+ struct kthread *kthread = __to_kthread(task);
void *data = NULL;
- probe_kernel_read(&data, &kthread->data, sizeof(data));
+ if (kthread)
+ copy_from_kernel_nofault(&data, &kthread->data, sizeof(data));
return data;
}
static void __kthread_parkme(struct kthread *self)
{
- __set_current_state(TASK_PARKED);
- while (test_bit(KTHREAD_SHOULD_PARK, &self->flags)) {
- if (!test_and_set_bit(KTHREAD_IS_PARKED, &self->flags))
- complete(&self->parked);
- schedule();
- __set_current_state(TASK_PARKED);
+ for (;;) {
+ /*
+ * TASK_PARKED is a special state; we must serialize against
+ * possible pending wakeups to avoid store-store collisions on
+ * task->state.
+ *
+ * Such a collision might possibly result in the task state
+ * changin from TASK_PARKED and us failing the
+ * wait_task_inactive() in kthread_park().
+ */
+ set_special_state(TASK_PARKED);
+ if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags))
+ break;
+
+ /*
+ * Thread is going to call schedule(), do not preempt it,
+ * or the caller of kthread_park() may spend more time in
+ * wait_task_inactive().
+ */
+ preempt_disable();
+ complete(&self->parked);
+ schedule_preempt_disabled();
+ preempt_enable();
}
- clear_bit(KTHREAD_IS_PARKED, &self->flags);
__set_current_state(TASK_RUNNING);
}
@@ -171,46 +307,165 @@ void kthread_parkme(void)
{
__kthread_parkme(to_kthread(current));
}
+EXPORT_SYMBOL_GPL(kthread_parkme);
+
+/**
+ * kthread_exit - Cause the current kthread return @result to kthread_stop().
+ * @result: The integer value to return to kthread_stop().
+ *
+ * While kthread_exit can be called directly, it exists so that
+ * functions which do some additional work in non-modular code such as
+ * module_put_and_kthread_exit can be implemented.
+ *
+ * Does not return.
+ */
+void __noreturn kthread_exit(long result)
+{
+ struct kthread *kthread = to_kthread(current);
+ kthread->result = result;
+ if (!list_empty(&kthread->hotplug_node)) {
+ mutex_lock(&kthreads_hotplug_lock);
+ list_del(&kthread->hotplug_node);
+ mutex_unlock(&kthreads_hotplug_lock);
+
+ if (kthread->preferred_affinity) {
+ kfree(kthread->preferred_affinity);
+ kthread->preferred_affinity = NULL;
+ }
+ }
+ do_exit(0);
+}
+EXPORT_SYMBOL(kthread_exit);
+
+/**
+ * kthread_complete_and_exit - Exit the current kthread.
+ * @comp: Completion to complete
+ * @code: The integer value to return to kthread_stop().
+ *
+ * If present, complete @comp and then return code to kthread_stop().
+ *
+ * A kernel thread whose module may be removed after the completion of
+ * @comp can use this function to exit safely.
+ *
+ * Does not return.
+ */
+void __noreturn kthread_complete_and_exit(struct completion *comp, long code)
+{
+ if (comp)
+ complete(comp);
+
+ kthread_exit(code);
+}
+EXPORT_SYMBOL(kthread_complete_and_exit);
+
+static void kthread_fetch_affinity(struct kthread *kthread, struct cpumask *cpumask)
+{
+ const struct cpumask *pref;
+
+ if (kthread->preferred_affinity) {
+ pref = kthread->preferred_affinity;
+ } else {
+ if (WARN_ON_ONCE(kthread->node == NUMA_NO_NODE))
+ return;
+ pref = cpumask_of_node(kthread->node);
+ }
+
+ cpumask_and(cpumask, pref, housekeeping_cpumask(HK_TYPE_KTHREAD));
+ if (cpumask_empty(cpumask))
+ cpumask_copy(cpumask, housekeeping_cpumask(HK_TYPE_KTHREAD));
+}
+
+static void kthread_affine_node(void)
+{
+ struct kthread *kthread = to_kthread(current);
+ cpumask_var_t affinity;
+
+ WARN_ON_ONCE(kthread_is_per_cpu(current));
+
+ if (kthread->node == NUMA_NO_NODE) {
+ housekeeping_affine(current, HK_TYPE_KTHREAD);
+ } else {
+ if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) {
+ WARN_ON_ONCE(1);
+ return;
+ }
+
+ mutex_lock(&kthreads_hotplug_lock);
+ WARN_ON_ONCE(!list_empty(&kthread->hotplug_node));
+ list_add_tail(&kthread->hotplug_node, &kthreads_hotplug);
+ /*
+ * The node cpumask is racy when read from kthread() but:
+ * - a racing CPU going down will either fail on the subsequent
+ * call to set_cpus_allowed_ptr() or be migrated to housekeepers
+ * afterwards by the scheduler.
+ * - a racing CPU going up will be handled by kthreads_online_cpu()
+ */
+ kthread_fetch_affinity(kthread, affinity);
+ set_cpus_allowed_ptr(current, affinity);
+ mutex_unlock(&kthreads_hotplug_lock);
+
+ free_cpumask_var(affinity);
+ }
+}
static int kthread(void *_create)
{
+ static const struct sched_param param = { .sched_priority = 0 };
/* Copy data: it's on kthread's stack */
struct kthread_create_info *create = _create;
int (*threadfn)(void *data) = create->threadfn;
void *data = create->data;
struct completion *done;
- struct kthread self;
+ struct kthread *self;
int ret;
- self.flags = 0;
- self.data = data;
- init_completion(&self.exited);
- init_completion(&self.parked);
- current->vfork_done = &self.exited;
+ self = to_kthread(current);
- /* If user was SIGKILLed, I release the structure. */
+ /* Release the structure when caller killed by a fatal signal. */
done = xchg(&create->done, NULL);
if (!done) {
+ kfree(create->full_name);
kfree(create);
- do_exit(-EINTR);
+ kthread_exit(-EINTR);
}
+
+ self->full_name = create->full_name;
+ self->threadfn = threadfn;
+ self->data = data;
+
+ /*
+ * The new thread inherited kthreadd's priority and CPU mask. Reset
+ * back to default in case they have been changed.
+ */
+ sched_setscheduler_nocheck(current, SCHED_NORMAL, &param);
+
/* OK, tell user we're spawned, wait for stop or wakeup */
__set_current_state(TASK_UNINTERRUPTIBLE);
create->result = current;
+ /*
+ * Thread is going to call schedule(), do not preempt it,
+ * or the creator may spend more time in wait_task_inactive().
+ */
+ preempt_disable();
complete(done);
- schedule();
+ schedule_preempt_disabled();
+ preempt_enable();
- ret = -EINTR;
+ self->started = 1;
- if (!test_bit(KTHREAD_SHOULD_STOP, &self.flags)) {
- __kthread_parkme(&self);
+ if (!(current->flags & PF_NO_SETAFFINITY) && !self->preferred_affinity)
+ kthread_affine_node();
+
+ ret = -EINTR;
+ if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) {
+ cgroup_kthread_ready();
+ __kthread_parkme(self);
ret = threadfn(data);
}
- /* we can't just return, we must preserve "self" on stack */
- do_exit(ret);
+ kthread_exit(ret);
}
-/* called from do_fork() to get node information for about to be created task */
+/* called from kernel_clone() to get node information for about to be created task */
int tsk_fork_get_node(struct task_struct *tsk)
{
#ifdef CONFIG_NUMA
@@ -228,11 +483,13 @@ static void create_kthread(struct kthread_create_info *create)
current->pref_node_fork = create->node;
#endif
/* We want our own signal handler (we take no signals by default). */
- pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
+ pid = kernel_thread(kthread, create, create->full_name,
+ CLONE_FS | CLONE_FILES | SIGCHLD);
if (pid < 0) {
- /* If user was SIGKILLed, I release the structure. */
+ /* Release the structure when caller killed by a fatal signal. */
struct completion *done = xchg(&create->done, NULL);
+ kfree(create->full_name);
if (!done) {
kfree(create);
return;
@@ -242,32 +499,11 @@ static void create_kthread(struct kthread_create_info *create)
}
}
-/**
- * kthread_create_on_node - create a kthread.
- * @threadfn: the function to run until signal_pending(current).
- * @data: data ptr for @threadfn.
- * @node: memory node number.
- * @namefmt: printf-style name for the thread.
- *
- * Description: This helper function creates and names a kernel
- * thread. The thread will be stopped: use wake_up_process() to start
- * it. See also kthread_run().
- *
- * If thread is going to be bound on a particular cpu, give its node
- * in @node, to get NUMA affinity for kthread stack, or else give -1.
- * When woken, the thread will run @threadfn() with @data as its
- * argument. @threadfn() can either call do_exit() directly if it is a
- * standalone thread for which no one will call kthread_stop(), or
- * return when 'kthread_should_stop()' is true (which means
- * kthread_stop() has been called). The return value should be zero
- * or a negative error number; it will be passed to kthread_stop().
- *
- * Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR).
- */
-struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
- void *data, int node,
- const char namefmt[],
- ...)
+static __printf(4, 0)
+struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
+ void *data, int node,
+ const char namefmt[],
+ va_list args)
{
DECLARE_COMPLETION_ONSTACK(done);
struct task_struct *task;
@@ -280,6 +516,11 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
create->data = data;
create->node = node;
create->done = &done;
+ create->full_name = kvasprintf(GFP_KERNEL, namefmt, args);
+ if (!create->full_name) {
+ task = ERR_PTR(-ENOMEM);
+ goto free_create;
+ }
spin_lock(&kthread_create_lock);
list_add_tail(&create->list, &kthread_create_list);
@@ -293,9 +534,9 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
*/
if (unlikely(wait_for_completion_killable(&done))) {
/*
- * If I was SIGKILLed before kthreadd (or new kernel thread)
- * calls complete(), leave the cleanup of this structure to
- * that thread.
+ * If I was killed by a fatal signal before kthreadd (or new
+ * kernel thread) calls complete(), leave the cleanup of this
+ * structure to that thread.
*/
if (xchg(&create->done, NULL))
return ERR_PTR(-EINTR);
@@ -306,37 +547,76 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
wait_for_completion(&done);
}
task = create->result;
- if (!IS_ERR(task)) {
- static const struct sched_param param = { .sched_priority = 0 };
- va_list args;
-
- va_start(args, namefmt);
- vsnprintf(task->comm, sizeof(task->comm), namefmt, args);
- va_end(args);
- /*
- * root may have changed our (kthreadd's) priority or CPU mask.
- * The kernel thread should not inherit these properties.
- */
- sched_setscheduler_nocheck(task, SCHED_NORMAL, &param);
- set_cpus_allowed_ptr(task, cpu_all_mask);
- }
+free_create:
kfree(create);
return task;
}
+
+/**
+ * kthread_create_on_node - create a kthread.
+ * @threadfn: the function to run until signal_pending(current).
+ * @data: data ptr for @threadfn.
+ * @node: task and thread structures for the thread are allocated on this node
+ * @namefmt: printf-style name for the thread.
+ *
+ * Description: This helper function creates and names a kernel
+ * thread. The thread will be stopped: use wake_up_process() to start
+ * it. See also kthread_run(). The new thread has SCHED_NORMAL policy and
+ * is affine to all CPUs.
+ *
+ * If thread is going to be bound on a particular cpu, give its node
+ * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE.
+ * When woken, the thread will run @threadfn() with @data as its
+ * argument. @threadfn() can either return directly if it is a
+ * standalone thread for which no one will call kthread_stop(), or
+ * return when 'kthread_should_stop()' is true (which means
+ * kthread_stop() has been called). The return value should be zero
+ * or a negative error number; it will be passed to kthread_stop().
+ *
+ * Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR).
+ */
+struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
+ void *data, int node,
+ const char namefmt[],
+ ...)
+{
+ struct task_struct *task;
+ va_list args;
+
+ va_start(args, namefmt);
+ task = __kthread_create_on_node(threadfn, data, node, namefmt, args);
+ va_end(args);
+
+ return task;
+}
EXPORT_SYMBOL(kthread_create_on_node);
-static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state)
+static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, unsigned int state)
{
- /* Must have done schedule() in kthread() before we set_task_cpu */
if (!wait_task_inactive(p, state)) {
WARN_ON(1);
return;
}
+
+ scoped_guard (raw_spinlock_irqsave, &p->pi_lock)
+ set_cpus_allowed_force(p, mask);
+
/* It's safe because the task is inactive. */
- do_set_cpus_allowed(p, cpumask_of(cpu));
p->flags |= PF_NO_SETAFFINITY;
}
+static void __kthread_bind(struct task_struct *p, unsigned int cpu, unsigned int state)
+{
+ __kthread_bind_mask(p, cpumask_of(cpu), state);
+}
+
+void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask)
+{
+ struct kthread *kthread = to_kthread(p);
+ __kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE);
+ WARN_ON_ONCE(kthread->started);
+}
+
/**
* kthread_bind - bind a just-created kthread to a cpu.
* @p: thread created by kthread_create().
@@ -348,7 +628,9 @@ static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state)
*/
void kthread_bind(struct task_struct *p, unsigned int cpu)
{
+ struct kthread *kthread = to_kthread(p);
__kthread_bind(p, cpu, TASK_UNINTERRUPTIBLE);
+ WARN_ON_ONCE(kthread->started);
}
EXPORT_SYMBOL(kthread_bind);
@@ -361,7 +643,6 @@ EXPORT_SYMBOL(kthread_bind);
* to "name.*%u". Code fills in cpu number.
*
* Description: This helper function creates and names a kernel thread
- * The thread will be woken and put into park mode.
*/
struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
void *data, unsigned int cpu,
@@ -373,27 +654,37 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
cpu);
if (IS_ERR(p))
return p;
- set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags);
+ kthread_bind(p, cpu);
+ /* CPU hotplug need to bind once again when unparking the thread. */
to_kthread(p)->cpu = cpu;
- /* Park the thread to get it out of TASK_UNINTERRUPTIBLE state */
- kthread_park(p);
return p;
}
+EXPORT_SYMBOL(kthread_create_on_cpu);
-static void __kthread_unpark(struct task_struct *k, struct kthread *kthread)
+void kthread_set_per_cpu(struct task_struct *k, int cpu)
{
- clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
- /*
- * We clear the IS_PARKED bit here as we don't wait
- * until the task has left the park code. So if we'd
- * park before that happens we'd see the IS_PARKED bit
- * which might be about to be cleared.
- */
- if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
- if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
- __kthread_bind(k, kthread->cpu, TASK_PARKED);
- wake_up_state(k, TASK_PARKED);
+ struct kthread *kthread = to_kthread(k);
+ if (!kthread)
+ return;
+
+ WARN_ON_ONCE(!(k->flags & PF_NO_SETAFFINITY));
+
+ if (cpu < 0) {
+ clear_bit(KTHREAD_IS_PER_CPU, &kthread->flags);
+ return;
}
+
+ kthread->cpu = cpu;
+ set_bit(KTHREAD_IS_PER_CPU, &kthread->flags);
+}
+
+bool kthread_is_per_cpu(struct task_struct *p)
+{
+ struct kthread *kthread = __to_kthread(p);
+ if (!kthread)
+ return false;
+
+ return test_bit(KTHREAD_IS_PER_CPU, &kthread->flags);
}
/**
@@ -406,11 +697,24 @@ static void __kthread_unpark(struct task_struct *k, struct kthread *kthread)
*/
void kthread_unpark(struct task_struct *k)
{
- struct kthread *kthread = to_live_kthread(k);
+ struct kthread *kthread = to_kthread(k);
- if (kthread)
- __kthread_unpark(k, kthread);
+ if (!test_bit(KTHREAD_SHOULD_PARK, &kthread->flags))
+ return;
+ /*
+ * Newly created kthread was parked when the CPU was offline.
+ * The binding was lost and we need to set it again.
+ */
+ if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
+ __kthread_bind(k, kthread->cpu, TASK_PARKED);
+
+ clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
+ /*
+ * __kthread_parkme() will either see !SHOULD_PARK or get the wakeup.
+ */
+ wake_up_state(k, TASK_PARKED);
}
+EXPORT_SYMBOL_GPL(kthread_unpark);
/**
* kthread_park - park a thread created by kthread_create().
@@ -426,21 +730,32 @@ void kthread_unpark(struct task_struct *k)
*/
int kthread_park(struct task_struct *k)
{
- struct kthread *kthread = to_live_kthread(k);
- int ret = -ENOSYS;
+ struct kthread *kthread = to_kthread(k);
- if (kthread) {
- if (!test_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
- set_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
- if (k != current) {
- wake_up_process(k);
- wait_for_completion(&kthread->parked);
- }
- }
- ret = 0;
+ if (WARN_ON(k->flags & PF_EXITING))
+ return -ENOSYS;
+
+ if (WARN_ON_ONCE(test_bit(KTHREAD_SHOULD_PARK, &kthread->flags)))
+ return -EBUSY;
+
+ set_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
+ if (k != current) {
+ wake_up_process(k);
+ /*
+ * Wait for __kthread_parkme() to complete(), this means we
+ * _will_ have TASK_PARKED and are about to call schedule().
+ */
+ wait_for_completion(&kthread->parked);
+ /*
+ * Now wait for that schedule() to complete and the task to
+ * get scheduled out.
+ */
+ WARN_ON_ONCE(!wait_task_inactive(k, TASK_PARKED));
}
- return ret;
+
+ return 0;
}
+EXPORT_SYMBOL_GPL(kthread_park);
/**
* kthread_stop - stop a thread created by kthread_create().
@@ -451,7 +766,7 @@ int kthread_park(struct task_struct *k)
* instead of calling wake_up_process(): the thread will exit without
* calling threadfn().
*
- * If threadfn() may call do_exit() itself, the caller must ensure
+ * If threadfn() may call kthread_exit() itself, the caller must ensure
* task_struct can't go away.
*
* Returns the result of threadfn(), or %-EINTR if wake_up_process()
@@ -465,14 +780,13 @@ int kthread_stop(struct task_struct *k)
trace_sched_kthread_stop(k);
get_task_struct(k);
- kthread = to_live_kthread(k);
- if (kthread) {
- set_bit(KTHREAD_SHOULD_STOP, &kthread->flags);
- __kthread_unpark(k, kthread);
- wake_up_process(k);
- wait_for_completion(&kthread->exited);
- }
- ret = k->exit_code;
+ kthread = to_kthread(k);
+ set_bit(KTHREAD_SHOULD_STOP, &kthread->flags);
+ kthread_unpark(k);
+ set_tsk_thread_flag(k, TIF_NOTIFY_SIGNAL);
+ wake_up_process(k);
+ wait_for_completion(&kthread->exited);
+ ret = kthread->result;
put_task_struct(k);
trace_sched_kthread_stop_ret(ret);
@@ -480,17 +794,37 @@ int kthread_stop(struct task_struct *k)
}
EXPORT_SYMBOL(kthread_stop);
+/**
+ * kthread_stop_put - stop a thread and put its task struct
+ * @k: thread created by kthread_create().
+ *
+ * Stops a thread created by kthread_create() and put its task_struct.
+ * Only use when holding an extra task struct reference obtained by
+ * calling get_task_struct().
+ */
+int kthread_stop_put(struct task_struct *k)
+{
+ int ret;
+
+ ret = kthread_stop(k);
+ put_task_struct(k);
+ return ret;
+}
+EXPORT_SYMBOL(kthread_stop_put);
+
int kthreadd(void *unused)
{
+ static const char comm[TASK_COMM_LEN] = "kthreadd";
struct task_struct *tsk = current;
/* Setup a clean context for our children to inherit. */
- set_task_comm(tsk, "kthreadd");
+ set_task_comm(tsk, comm);
ignore_signals(tsk);
- set_cpus_allowed_ptr(tsk, cpu_all_mask);
+ set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_TYPE_KTHREAD));
set_mems_allowed(node_states[N_MEMORY]);
current->flags |= PF_NOFREEZE;
+ cgroup_init_kthreadd();
for (;;) {
set_current_state(TASK_INTERRUPTIBLE);
@@ -517,78 +851,313 @@ int kthreadd(void *unused)
return 0;
}
-void __init_kthread_worker(struct kthread_worker *worker,
+int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask)
+{
+ struct kthread *kthread = to_kthread(p);
+ cpumask_var_t affinity;
+ int ret = 0;
+
+ if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE) || kthread->started) {
+ WARN_ON(1);
+ return -EINVAL;
+ }
+
+ WARN_ON_ONCE(kthread->preferred_affinity);
+
+ if (!zalloc_cpumask_var(&affinity, GFP_KERNEL))
+ return -ENOMEM;
+
+ kthread->preferred_affinity = kzalloc(sizeof(struct cpumask), GFP_KERNEL);
+ if (!kthread->preferred_affinity) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ mutex_lock(&kthreads_hotplug_lock);
+ cpumask_copy(kthread->preferred_affinity, mask);
+ WARN_ON_ONCE(!list_empty(&kthread->hotplug_node));
+ list_add_tail(&kthread->hotplug_node, &kthreads_hotplug);
+ kthread_fetch_affinity(kthread, affinity);
+
+ scoped_guard (raw_spinlock_irqsave, &p->pi_lock)
+ set_cpus_allowed_force(p, affinity);
+
+ mutex_unlock(&kthreads_hotplug_lock);
+out:
+ free_cpumask_var(affinity);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(kthread_affine_preferred);
+
+/*
+ * Re-affine kthreads according to their preferences
+ * and the newly online CPU. The CPU down part is handled
+ * by select_fallback_rq() which default re-affines to
+ * housekeepers from other nodes in case the preferred
+ * affinity doesn't apply anymore.
+ */
+static int kthreads_online_cpu(unsigned int cpu)
+{
+ cpumask_var_t affinity;
+ struct kthread *k;
+ int ret;
+
+ guard(mutex)(&kthreads_hotplug_lock);
+
+ if (list_empty(&kthreads_hotplug))
+ return 0;
+
+ if (!zalloc_cpumask_var(&affinity, GFP_KERNEL))
+ return -ENOMEM;
+
+ ret = 0;
+
+ list_for_each_entry(k, &kthreads_hotplug, hotplug_node) {
+ if (WARN_ON_ONCE((k->task->flags & PF_NO_SETAFFINITY) ||
+ kthread_is_per_cpu(k->task))) {
+ ret = -EINVAL;
+ continue;
+ }
+ kthread_fetch_affinity(k, affinity);
+ set_cpus_allowed_ptr(k->task, affinity);
+ }
+
+ free_cpumask_var(affinity);
+
+ return ret;
+}
+
+static int kthreads_init(void)
+{
+ return cpuhp_setup_state(CPUHP_AP_KTHREADS_ONLINE, "kthreads:online",
+ kthreads_online_cpu, NULL);
+}
+early_initcall(kthreads_init);
+
+void __kthread_init_worker(struct kthread_worker *worker,
const char *name,
struct lock_class_key *key)
{
- spin_lock_init(&worker->lock);
+ memset(worker, 0, sizeof(struct kthread_worker));
+ raw_spin_lock_init(&worker->lock);
lockdep_set_class_and_name(&worker->lock, key, name);
INIT_LIST_HEAD(&worker->work_list);
- worker->task = NULL;
+ INIT_LIST_HEAD(&worker->delayed_work_list);
}
-EXPORT_SYMBOL_GPL(__init_kthread_worker);
+EXPORT_SYMBOL_GPL(__kthread_init_worker);
/**
* kthread_worker_fn - kthread function to process kthread_worker
* @worker_ptr: pointer to initialized kthread_worker
*
- * This function can be used as @threadfn to kthread_create() or
- * kthread_run() with @worker_ptr argument pointing to an initialized
- * kthread_worker. The started kthread will process work_list until
- * the it is stopped with kthread_stop(). A kthread can also call
- * this function directly after extra initialization.
+ * This function implements the main cycle of kthread worker. It processes
+ * work_list until it is stopped with kthread_stop(). It sleeps when the queue
+ * is empty.
*
- * Different kthreads can be used for the same kthread_worker as long
- * as there's only one kthread attached to it at any given time. A
- * kthread_worker without an attached kthread simply collects queued
- * kthread_works.
+ * The works are not allowed to keep any locks, disable preemption or interrupts
+ * when they finish. There is defined a safe point for freezing when one work
+ * finishes and before a new one is started.
+ *
+ * Also the works must not be handled by more than one worker at the same time,
+ * see also kthread_queue_work().
*/
int kthread_worker_fn(void *worker_ptr)
{
struct kthread_worker *worker = worker_ptr;
struct kthread_work *work;
- WARN_ON(worker->task);
+ /*
+ * FIXME: Update the check and remove the assignment when all kthread
+ * worker users are created using kthread_create_worker*() functions.
+ */
+ WARN_ON(worker->task && worker->task != current);
worker->task = current;
+
+ if (worker->flags & KTW_FREEZABLE)
+ set_freezable();
+
repeat:
set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */
if (kthread_should_stop()) {
__set_current_state(TASK_RUNNING);
- spin_lock_irq(&worker->lock);
+ raw_spin_lock_irq(&worker->lock);
worker->task = NULL;
- spin_unlock_irq(&worker->lock);
+ raw_spin_unlock_irq(&worker->lock);
return 0;
}
work = NULL;
- spin_lock_irq(&worker->lock);
+ raw_spin_lock_irq(&worker->lock);
if (!list_empty(&worker->work_list)) {
work = list_first_entry(&worker->work_list,
struct kthread_work, node);
list_del_init(&work->node);
}
worker->current_work = work;
- spin_unlock_irq(&worker->lock);
+ raw_spin_unlock_irq(&worker->lock);
if (work) {
+ kthread_work_func_t func = work->func;
__set_current_state(TASK_RUNNING);
+ trace_sched_kthread_work_execute_start(work);
work->func(work);
- } else if (!freezing(current))
+ /*
+ * Avoid dereferencing work after this point. The trace
+ * event only cares about the address.
+ */
+ trace_sched_kthread_work_execute_end(work, func);
+ } else if (!freezing(current)) {
schedule();
+ } else {
+ /*
+ * Handle the case where the current remains
+ * TASK_INTERRUPTIBLE. try_to_freeze() expects
+ * the current to be TASK_RUNNING.
+ */
+ __set_current_state(TASK_RUNNING);
+ }
try_to_freeze();
+ cond_resched();
goto repeat;
}
EXPORT_SYMBOL_GPL(kthread_worker_fn);
-/* insert @work before @pos in @worker */
-static void insert_kthread_work(struct kthread_worker *worker,
- struct kthread_work *work,
- struct list_head *pos)
+static __printf(3, 0) struct kthread_worker *
+__kthread_create_worker_on_node(unsigned int flags, int node,
+ const char namefmt[], va_list args)
+{
+ struct kthread_worker *worker;
+ struct task_struct *task;
+
+ worker = kzalloc(sizeof(*worker), GFP_KERNEL);
+ if (!worker)
+ return ERR_PTR(-ENOMEM);
+
+ kthread_init_worker(worker);
+
+ task = __kthread_create_on_node(kthread_worker_fn, worker,
+ node, namefmt, args);
+ if (IS_ERR(task))
+ goto fail_task;
+
+ worker->flags = flags;
+ worker->task = task;
+
+ return worker;
+
+fail_task:
+ kfree(worker);
+ return ERR_CAST(task);
+}
+
+/**
+ * kthread_create_worker_on_node - create a kthread worker
+ * @flags: flags modifying the default behavior of the worker
+ * @node: task structure for the thread is allocated on this node
+ * @namefmt: printf-style name for the kthread worker (task).
+ *
+ * Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM)
+ * when the needed structures could not get allocated, and ERR_PTR(-EINTR)
+ * when the caller was killed by a fatal signal.
+ */
+struct kthread_worker *
+kthread_create_worker_on_node(unsigned int flags, int node, const char namefmt[], ...)
+{
+ struct kthread_worker *worker;
+ va_list args;
+
+ va_start(args, namefmt);
+ worker = __kthread_create_worker_on_node(flags, node, namefmt, args);
+ va_end(args);
+
+ return worker;
+}
+EXPORT_SYMBOL(kthread_create_worker_on_node);
+
+/**
+ * kthread_create_worker_on_cpu - create a kthread worker and bind it
+ * to a given CPU and the associated NUMA node.
+ * @cpu: CPU number
+ * @flags: flags modifying the default behavior of the worker
+ * @namefmt: printf-style name for the thread. Format is restricted
+ * to "name.*%u". Code fills in cpu number.
+ *
+ * Use a valid CPU number if you want to bind the kthread worker
+ * to the given CPU and the associated NUMA node.
+ *
+ * A good practice is to add the cpu number also into the worker name.
+ * For example, use kthread_create_worker_on_cpu(cpu, "helper/%d", cpu).
+ *
+ * CPU hotplug:
+ * The kthread worker API is simple and generic. It just provides a way
+ * to create, use, and destroy workers.
+ *
+ * It is up to the API user how to handle CPU hotplug. They have to decide
+ * how to handle pending work items, prevent queuing new ones, and
+ * restore the functionality when the CPU goes off and on. There are a
+ * few catches:
+ *
+ * - CPU affinity gets lost when it is scheduled on an offline CPU.
+ *
+ * - The worker might not exist when the CPU was off when the user
+ * created the workers.
+ *
+ * Good practice is to implement two CPU hotplug callbacks and to
+ * destroy/create the worker when the CPU goes down/up.
+ *
+ * Return:
+ * The pointer to the allocated worker on success, ERR_PTR(-ENOMEM)
+ * when the needed structures could not get allocated, and ERR_PTR(-EINTR)
+ * when the caller was killed by a fatal signal.
+ */
+struct kthread_worker *
+kthread_create_worker_on_cpu(int cpu, unsigned int flags,
+ const char namefmt[])
+{
+ struct kthread_worker *worker;
+
+ worker = kthread_create_worker_on_node(flags, cpu_to_node(cpu), namefmt, cpu);
+ if (!IS_ERR(worker))
+ kthread_bind(worker->task, cpu);
+
+ return worker;
+}
+EXPORT_SYMBOL(kthread_create_worker_on_cpu);
+
+/*
+ * Returns true when the work could not be queued at the moment.
+ * It happens when it is already pending in a worker list
+ * or when it is being cancelled.
+ */
+static inline bool queuing_blocked(struct kthread_worker *worker,
+ struct kthread_work *work)
{
lockdep_assert_held(&worker->lock);
+ return !list_empty(&work->node) || work->canceling;
+}
+
+static void kthread_insert_work_sanity_check(struct kthread_worker *worker,
+ struct kthread_work *work)
+{
+ lockdep_assert_held(&worker->lock);
+ WARN_ON_ONCE(!list_empty(&work->node));
+ /* Do not use a work with >1 worker, see kthread_queue_work() */
+ WARN_ON_ONCE(work->worker && work->worker != worker);
+}
+
+/* insert @work before @pos in @worker */
+static void kthread_insert_work(struct kthread_worker *worker,
+ struct kthread_work *work,
+ struct list_head *pos)
+{
+ kthread_insert_work_sanity_check(worker, work);
+
+ trace_sched_kthread_work_queue_work(worker, work);
+
list_add_tail(&work->node, pos);
work->worker = worker;
if (!worker->current_work && likely(worker->task))
@@ -596,29 +1165,133 @@ static void insert_kthread_work(struct kthread_worker *worker,
}
/**
- * queue_kthread_work - queue a kthread_work
+ * kthread_queue_work - queue a kthread_work
* @worker: target kthread_worker
* @work: kthread_work to queue
*
* Queue @work to work processor @task for async execution. @task
- * must have been created with kthread_worker_create(). Returns %true
+ * must have been created with kthread_create_worker(). Returns %true
* if @work was successfully queued, %false if it was already pending.
+ *
+ * Reinitialize the work if it needs to be used by another worker.
+ * For example, when the worker was stopped and started again.
*/
-bool queue_kthread_work(struct kthread_worker *worker,
+bool kthread_queue_work(struct kthread_worker *worker,
struct kthread_work *work)
{
bool ret = false;
unsigned long flags;
- spin_lock_irqsave(&worker->lock, flags);
- if (list_empty(&work->node)) {
- insert_kthread_work(worker, work, &worker->work_list);
+ raw_spin_lock_irqsave(&worker->lock, flags);
+ if (!queuing_blocked(worker, work)) {
+ kthread_insert_work(worker, work, &worker->work_list);
+ ret = true;
+ }
+ raw_spin_unlock_irqrestore(&worker->lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(kthread_queue_work);
+
+/**
+ * kthread_delayed_work_timer_fn - callback that queues the associated kthread
+ * delayed work when the timer expires.
+ * @t: pointer to the expired timer
+ *
+ * The format of the function is defined by struct timer_list.
+ * It should have been called from irqsafe timer with irq already off.
+ */
+void kthread_delayed_work_timer_fn(struct timer_list *t)
+{
+ struct kthread_delayed_work *dwork = timer_container_of(dwork, t,
+ timer);
+ struct kthread_work *work = &dwork->work;
+ struct kthread_worker *worker = work->worker;
+ unsigned long flags;
+
+ /*
+ * This might happen when a pending work is reinitialized.
+ * It means that it is used a wrong way.
+ */
+ if (WARN_ON_ONCE(!worker))
+ return;
+
+ raw_spin_lock_irqsave(&worker->lock, flags);
+ /* Work must not be used with >1 worker, see kthread_queue_work(). */
+ WARN_ON_ONCE(work->worker != worker);
+
+ /* Move the work from worker->delayed_work_list. */
+ WARN_ON_ONCE(list_empty(&work->node));
+ list_del_init(&work->node);
+ if (!work->canceling)
+ kthread_insert_work(worker, work, &worker->work_list);
+
+ raw_spin_unlock_irqrestore(&worker->lock, flags);
+}
+EXPORT_SYMBOL(kthread_delayed_work_timer_fn);
+
+static void __kthread_queue_delayed_work(struct kthread_worker *worker,
+ struct kthread_delayed_work *dwork,
+ unsigned long delay)
+{
+ struct timer_list *timer = &dwork->timer;
+ struct kthread_work *work = &dwork->work;
+
+ WARN_ON_ONCE(timer->function != kthread_delayed_work_timer_fn);
+
+ /*
+ * If @delay is 0, queue @dwork->work immediately. This is for
+ * both optimization and correctness. The earliest @timer can
+ * expire is on the closest next tick and delayed_work users depend
+ * on that there's no such delay when @delay is 0.
+ */
+ if (!delay) {
+ kthread_insert_work(worker, work, &worker->work_list);
+ return;
+ }
+
+ /* Be paranoid and try to detect possible races already now. */
+ kthread_insert_work_sanity_check(worker, work);
+
+ list_add(&work->node, &worker->delayed_work_list);
+ work->worker = worker;
+ timer->expires = jiffies + delay;
+ add_timer(timer);
+}
+
+/**
+ * kthread_queue_delayed_work - queue the associated kthread work
+ * after a delay.
+ * @worker: target kthread_worker
+ * @dwork: kthread_delayed_work to queue
+ * @delay: number of jiffies to wait before queuing
+ *
+ * If the work has not been pending it starts a timer that will queue
+ * the work after the given @delay. If @delay is zero, it queues the
+ * work immediately.
+ *
+ * Return: %false if the @work has already been pending. It means that
+ * either the timer was running or the work was queued. It returns %true
+ * otherwise.
+ */
+bool kthread_queue_delayed_work(struct kthread_worker *worker,
+ struct kthread_delayed_work *dwork,
+ unsigned long delay)
+{
+ struct kthread_work *work = &dwork->work;
+ unsigned long flags;
+ bool ret = false;
+
+ raw_spin_lock_irqsave(&worker->lock, flags);
+
+ if (!queuing_blocked(worker, work)) {
+ __kthread_queue_delayed_work(worker, dwork, delay);
ret = true;
}
- spin_unlock_irqrestore(&worker->lock, flags);
+
+ raw_spin_unlock_irqrestore(&worker->lock, flags);
return ret;
}
-EXPORT_SYMBOL_GPL(queue_kthread_work);
+EXPORT_SYMBOL_GPL(kthread_queue_delayed_work);
struct kthread_flush_work {
struct kthread_work work;
@@ -633,12 +1306,12 @@ static void kthread_flush_work_fn(struct kthread_work *work)
}
/**
- * flush_kthread_work - flush a kthread_work
+ * kthread_flush_work - flush a kthread_work
* @work: work to flush
*
* If @work is queued or executing, wait for it to finish execution.
*/
-void flush_kthread_work(struct kthread_work *work)
+void kthread_flush_work(struct kthread_work *work)
{
struct kthread_flush_work fwork = {
KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn),
@@ -647,46 +1320,399 @@ void flush_kthread_work(struct kthread_work *work)
struct kthread_worker *worker;
bool noop = false;
-retry:
worker = work->worker;
if (!worker)
return;
- spin_lock_irq(&worker->lock);
- if (work->worker != worker) {
- spin_unlock_irq(&worker->lock);
- goto retry;
- }
+ raw_spin_lock_irq(&worker->lock);
+ /* Work must not be used with >1 worker, see kthread_queue_work(). */
+ WARN_ON_ONCE(work->worker != worker);
if (!list_empty(&work->node))
- insert_kthread_work(worker, &fwork.work, work->node.next);
+ kthread_insert_work(worker, &fwork.work, work->node.next);
else if (worker->current_work == work)
- insert_kthread_work(worker, &fwork.work, worker->work_list.next);
+ kthread_insert_work(worker, &fwork.work,
+ worker->work_list.next);
else
noop = true;
- spin_unlock_irq(&worker->lock);
+ raw_spin_unlock_irq(&worker->lock);
if (!noop)
wait_for_completion(&fwork.done);
}
-EXPORT_SYMBOL_GPL(flush_kthread_work);
+EXPORT_SYMBOL_GPL(kthread_flush_work);
+
+/*
+ * Make sure that the timer is neither set nor running and could
+ * not manipulate the work list_head any longer.
+ *
+ * The function is called under worker->lock. The lock is temporary
+ * released but the timer can't be set again in the meantime.
+ */
+static void kthread_cancel_delayed_work_timer(struct kthread_work *work,
+ unsigned long *flags)
+{
+ struct kthread_delayed_work *dwork =
+ container_of(work, struct kthread_delayed_work, work);
+ struct kthread_worker *worker = work->worker;
+
+ /*
+ * timer_delete_sync() must be called to make sure that the timer
+ * callback is not running. The lock must be temporary released
+ * to avoid a deadlock with the callback. In the meantime,
+ * any queuing is blocked by setting the canceling counter.
+ */
+ work->canceling++;
+ raw_spin_unlock_irqrestore(&worker->lock, *flags);
+ timer_delete_sync(&dwork->timer);
+ raw_spin_lock_irqsave(&worker->lock, *flags);
+ work->canceling--;
+}
+
+/*
+ * This function removes the work from the worker queue.
+ *
+ * It is called under worker->lock. The caller must make sure that
+ * the timer used by delayed work is not running, e.g. by calling
+ * kthread_cancel_delayed_work_timer().
+ *
+ * The work might still be in use when this function finishes. See the
+ * current_work proceed by the worker.
+ *
+ * Return: %true if @work was pending and successfully canceled,
+ * %false if @work was not pending
+ */
+static bool __kthread_cancel_work(struct kthread_work *work)
+{
+ /*
+ * Try to remove the work from a worker list. It might either
+ * be from worker->work_list or from worker->delayed_work_list.
+ */
+ if (!list_empty(&work->node)) {
+ list_del_init(&work->node);
+ return true;
+ }
+
+ return false;
+}
/**
- * flush_kthread_worker - flush all current works on a kthread_worker
+ * kthread_mod_delayed_work - modify delay of or queue a kthread delayed work
+ * @worker: kthread worker to use
+ * @dwork: kthread delayed work to queue
+ * @delay: number of jiffies to wait before queuing
+ *
+ * If @dwork is idle, equivalent to kthread_queue_delayed_work(). Otherwise,
+ * modify @dwork's timer so that it expires after @delay. If @delay is zero,
+ * @work is guaranteed to be queued immediately.
+ *
+ * Return: %false if @dwork was idle and queued, %true otherwise.
+ *
+ * A special case is when the work is being canceled in parallel.
+ * It might be caused either by the real kthread_cancel_delayed_work_sync()
+ * or yet another kthread_mod_delayed_work() call. We let the other command
+ * win and return %true here. The return value can be used for reference
+ * counting and the number of queued works stays the same. Anyway, the caller
+ * is supposed to synchronize these operations a reasonable way.
+ *
+ * This function is safe to call from any context including IRQ handler.
+ * See __kthread_cancel_work() and kthread_delayed_work_timer_fn()
+ * for details.
+ */
+bool kthread_mod_delayed_work(struct kthread_worker *worker,
+ struct kthread_delayed_work *dwork,
+ unsigned long delay)
+{
+ struct kthread_work *work = &dwork->work;
+ unsigned long flags;
+ int ret;
+
+ raw_spin_lock_irqsave(&worker->lock, flags);
+
+ /* Do not bother with canceling when never queued. */
+ if (!work->worker) {
+ ret = false;
+ goto fast_queue;
+ }
+
+ /* Work must not be used with >1 worker, see kthread_queue_work() */
+ WARN_ON_ONCE(work->worker != worker);
+
+ /*
+ * Temporary cancel the work but do not fight with another command
+ * that is canceling the work as well.
+ *
+ * It is a bit tricky because of possible races with another
+ * mod_delayed_work() and cancel_delayed_work() callers.
+ *
+ * The timer must be canceled first because worker->lock is released
+ * when doing so. But the work can be removed from the queue (list)
+ * only when it can be queued again so that the return value can
+ * be used for reference counting.
+ */
+ kthread_cancel_delayed_work_timer(work, &flags);
+ if (work->canceling) {
+ /* The number of works in the queue does not change. */
+ ret = true;
+ goto out;
+ }
+ ret = __kthread_cancel_work(work);
+
+fast_queue:
+ __kthread_queue_delayed_work(worker, dwork, delay);
+out:
+ raw_spin_unlock_irqrestore(&worker->lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(kthread_mod_delayed_work);
+
+static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork)
+{
+ struct kthread_worker *worker = work->worker;
+ unsigned long flags;
+ int ret = false;
+
+ if (!worker)
+ goto out;
+
+ raw_spin_lock_irqsave(&worker->lock, flags);
+ /* Work must not be used with >1 worker, see kthread_queue_work(). */
+ WARN_ON_ONCE(work->worker != worker);
+
+ if (is_dwork)
+ kthread_cancel_delayed_work_timer(work, &flags);
+
+ ret = __kthread_cancel_work(work);
+
+ if (worker->current_work != work)
+ goto out_fast;
+
+ /*
+ * The work is in progress and we need to wait with the lock released.
+ * In the meantime, block any queuing by setting the canceling counter.
+ */
+ work->canceling++;
+ raw_spin_unlock_irqrestore(&worker->lock, flags);
+ kthread_flush_work(work);
+ raw_spin_lock_irqsave(&worker->lock, flags);
+ work->canceling--;
+
+out_fast:
+ raw_spin_unlock_irqrestore(&worker->lock, flags);
+out:
+ return ret;
+}
+
+/**
+ * kthread_cancel_work_sync - cancel a kthread work and wait for it to finish
+ * @work: the kthread work to cancel
+ *
+ * Cancel @work and wait for its execution to finish. This function
+ * can be used even if the work re-queues itself. On return from this
+ * function, @work is guaranteed to be not pending or executing on any CPU.
+ *
+ * kthread_cancel_work_sync(&delayed_work->work) must not be used for
+ * delayed_work's. Use kthread_cancel_delayed_work_sync() instead.
+ *
+ * The caller must ensure that the worker on which @work was last
+ * queued can't be destroyed before this function returns.
+ *
+ * Return: %true if @work was pending, %false otherwise.
+ */
+bool kthread_cancel_work_sync(struct kthread_work *work)
+{
+ return __kthread_cancel_work_sync(work, false);
+}
+EXPORT_SYMBOL_GPL(kthread_cancel_work_sync);
+
+/**
+ * kthread_cancel_delayed_work_sync - cancel a kthread delayed work and
+ * wait for it to finish.
+ * @dwork: the kthread delayed work to cancel
+ *
+ * This is kthread_cancel_work_sync() for delayed works.
+ *
+ * Return: %true if @dwork was pending, %false otherwise.
+ */
+bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *dwork)
+{
+ return __kthread_cancel_work_sync(&dwork->work, true);
+}
+EXPORT_SYMBOL_GPL(kthread_cancel_delayed_work_sync);
+
+/**
+ * kthread_flush_worker - flush all current works on a kthread_worker
* @worker: worker to flush
*
* Wait until all currently executing or pending works on @worker are
* finished.
*/
-void flush_kthread_worker(struct kthread_worker *worker)
+void kthread_flush_worker(struct kthread_worker *worker)
{
struct kthread_flush_work fwork = {
KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn),
COMPLETION_INITIALIZER_ONSTACK(fwork.done),
};
- queue_kthread_work(worker, &fwork.work);
+ kthread_queue_work(worker, &fwork.work);
wait_for_completion(&fwork.done);
}
-EXPORT_SYMBOL_GPL(flush_kthread_worker);
+EXPORT_SYMBOL_GPL(kthread_flush_worker);
+
+/**
+ * kthread_destroy_worker - destroy a kthread worker
+ * @worker: worker to be destroyed
+ *
+ * Flush and destroy @worker. The simple flush is enough because the kthread
+ * worker API is used only in trivial scenarios. There are no multi-step state
+ * machines needed.
+ *
+ * Note that this function is not responsible for handling delayed work, so
+ * caller should be responsible for queuing or canceling all delayed work items
+ * before invoke this function.
+ */
+void kthread_destroy_worker(struct kthread_worker *worker)
+{
+ struct task_struct *task;
+
+ task = worker->task;
+ if (WARN_ON(!task))
+ return;
+
+ kthread_flush_worker(worker);
+ kthread_stop(task);
+ WARN_ON(!list_empty(&worker->delayed_work_list));
+ WARN_ON(!list_empty(&worker->work_list));
+ kfree(worker);
+}
+EXPORT_SYMBOL(kthread_destroy_worker);
+
+/**
+ * kthread_use_mm - make the calling kthread operate on an address space
+ * @mm: address space to operate on
+ */
+void kthread_use_mm(struct mm_struct *mm)
+{
+ struct mm_struct *active_mm;
+ struct task_struct *tsk = current;
+
+ WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD));
+ WARN_ON_ONCE(tsk->mm);
+
+ /*
+ * It is possible for mm to be the same as tsk->active_mm, but
+ * we must still mmgrab(mm) and mmdrop_lazy_tlb(active_mm),
+ * because these references are not equivalent.
+ */
+ mmgrab(mm);
+
+ task_lock(tsk);
+ /* Hold off tlb flush IPIs while switching mm's */
+ local_irq_disable();
+ active_mm = tsk->active_mm;
+ tsk->active_mm = mm;
+ tsk->mm = mm;
+ membarrier_update_current_mm(mm);
+ switch_mm_irqs_off(active_mm, mm, tsk);
+ local_irq_enable();
+ task_unlock(tsk);
+#ifdef finish_arch_post_lock_switch
+ finish_arch_post_lock_switch();
+#endif
+
+ /*
+ * When a kthread starts operating on an address space, the loop
+ * in membarrier_{private,global}_expedited() may not observe
+ * that tsk->mm, and not issue an IPI. Membarrier requires a
+ * memory barrier after storing to tsk->mm, before accessing
+ * user-space memory. A full memory barrier for membarrier
+ * {PRIVATE,GLOBAL}_EXPEDITED is implicitly provided by
+ * mmdrop_lazy_tlb().
+ */
+ mmdrop_lazy_tlb(active_mm);
+}
+EXPORT_SYMBOL_GPL(kthread_use_mm);
+
+/**
+ * kthread_unuse_mm - reverse the effect of kthread_use_mm()
+ * @mm: address space to operate on
+ */
+void kthread_unuse_mm(struct mm_struct *mm)
+{
+ struct task_struct *tsk = current;
+
+ WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD));
+ WARN_ON_ONCE(!tsk->mm);
+
+ task_lock(tsk);
+ /*
+ * When a kthread stops operating on an address space, the loop
+ * in membarrier_{private,global}_expedited() may not observe
+ * that tsk->mm, and not issue an IPI. Membarrier requires a
+ * memory barrier after accessing user-space memory, before
+ * clearing tsk->mm.
+ */
+ smp_mb__after_spinlock();
+ local_irq_disable();
+ tsk->mm = NULL;
+ membarrier_update_current_mm(NULL);
+ mmgrab_lazy_tlb(mm);
+ /* active_mm is still 'mm' */
+ enter_lazy_tlb(mm, tsk);
+ local_irq_enable();
+ task_unlock(tsk);
+
+ mmdrop(mm);
+}
+EXPORT_SYMBOL_GPL(kthread_unuse_mm);
+
+#ifdef CONFIG_BLK_CGROUP
+/**
+ * kthread_associate_blkcg - associate blkcg to current kthread
+ * @css: the cgroup info
+ *
+ * Current thread must be a kthread. The thread is running jobs on behalf of
+ * other threads. In some cases, we expect the jobs attach cgroup info of
+ * original threads instead of that of current thread. This function stores
+ * original thread's cgroup info in current kthread context for later
+ * retrieval.
+ */
+void kthread_associate_blkcg(struct cgroup_subsys_state *css)
+{
+ struct kthread *kthread;
+
+ if (!(current->flags & PF_KTHREAD))
+ return;
+ kthread = to_kthread(current);
+ if (!kthread)
+ return;
+
+ if (kthread->blkcg_css) {
+ css_put(kthread->blkcg_css);
+ kthread->blkcg_css = NULL;
+ }
+ if (css) {
+ css_get(css);
+ kthread->blkcg_css = css;
+ }
+}
+EXPORT_SYMBOL(kthread_associate_blkcg);
+
+/**
+ * kthread_blkcg - get associated blkcg css of current kthread
+ *
+ * Current thread must be a kthread.
+ */
+struct cgroup_subsys_state *kthread_blkcg(void)
+{
+ struct kthread *kthread;
+
+ if (current->flags & PF_KTHREAD) {
+ kthread = to_kthread(current);
+ if (kthread)
+ return kthread->blkcg_css;
+ }
+ return NULL;
+}
+#endif
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index a02812743a7e..d4281d1e13a6 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -1,13 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* latencytop.c: Latency display infrastructure
*
* (C) Copyright 2008 Intel Corporation
* Author: Arjan van de Ven <arjan@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
*/
/*
@@ -47,16 +43,19 @@
* of times)
*/
-#include <linux/latencytop.h>
#include <linux/kallsyms.h>
#include <linux/seq_file.h>
#include <linux/notifier.h>
#include <linux/spinlock.h>
#include <linux/proc_fs.h>
+#include <linux/latencytop.h>
#include <linux/export.h>
#include <linux/sched.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/stat.h>
#include <linux/list.h>
#include <linux/stacktrace.h>
+#include <linux/sysctl.h>
static DEFINE_RAW_SPINLOCK(latency_lock);
@@ -65,12 +64,33 @@ static struct latency_record latency_record[MAXLR];
int latencytop_enabled;
-void clear_all_latency_tracing(struct task_struct *p)
+#ifdef CONFIG_SYSCTL
+static int sysctl_latencytop(const struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
{
- unsigned long flags;
+ int err;
- if (!latencytop_enabled)
- return;
+ err = proc_dointvec(table, write, buffer, lenp, ppos);
+ if (latencytop_enabled)
+ force_schedstat_enabled();
+
+ return err;
+}
+
+static const struct ctl_table latencytop_sysctl[] = {
+ {
+ .procname = "latencytop",
+ .data = &latencytop_enabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = sysctl_latencytop,
+ },
+};
+#endif
+
+void clear_tsk_latency_tracing(struct task_struct *p)
+{
+ unsigned long flags;
raw_spin_lock_irqsave(&latency_lock, flags);
memset(&p->latency_record, 0, sizeof(p->latency_record));
@@ -91,12 +111,9 @@ static void __sched
account_global_scheduler_latency(struct task_struct *tsk,
struct latency_record *lat)
{
- int firstnonnull = MAXLR + 1;
+ int firstnonnull = MAXLR;
int i;
- if (!latencytop_enabled)
- return;
-
/* skip kernel threads for now */
if (!tsk->mm)
return;
@@ -118,8 +135,8 @@ account_global_scheduler_latency(struct task_struct *tsk,
break;
}
- /* 0 and ULONG_MAX entries mean end of backtrace: */
- if (record == 0 || record == ULONG_MAX)
+ /* 0 entry marks end of backtrace: */
+ if (!record)
break;
}
if (same) {
@@ -132,32 +149,18 @@ account_global_scheduler_latency(struct task_struct *tsk,
}
i = firstnonnull;
- if (i >= MAXLR - 1)
+ if (i >= MAXLR)
return;
/* Allocted a new one: */
memcpy(&latency_record[i], lat, sizeof(struct latency_record));
}
-/*
- * Iterator to store a backtrace into a latency record entry
- */
-static inline void store_stacktrace(struct task_struct *tsk,
- struct latency_record *lat)
-{
- struct stack_trace trace;
-
- memset(&trace, 0, sizeof(trace));
- trace.max_entries = LT_BACKTRACEDEPTH;
- trace.entries = &lat->backtrace[0];
- save_stack_trace_tsk(tsk, &trace);
-}
-
/**
* __account_scheduler_latency - record an occurred latency
- * @tsk - the task struct of the task hitting the latency
- * @usecs - the duration of the latency in microseconds
- * @inter - 1 if the sleep was interruptible, 0 if uninterruptible
+ * @tsk: the task struct of the task hitting the latency
+ * @usecs: the duration of the latency in microseconds
+ * @inter: 1 if the sleep was interruptible, 0 if uninterruptible
*
* This function is the main entry point for recording latency entries
* as called by the scheduler.
@@ -189,7 +192,8 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
lat.count = 1;
lat.time = usecs;
lat.max = usecs;
- store_stacktrace(tsk, &lat);
+
+ stack_trace_save_tsk(tsk, lat.backtrace, LT_BACKTRACEDEPTH, 0);
raw_spin_lock_irqsave(&latency_lock, flags);
@@ -208,8 +212,8 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
break;
}
- /* 0 and ULONG_MAX entries mean end of backtrace: */
- if (record == 0 || record == ULONG_MAX)
+ /* 0 entry is end of backtrace */
+ if (!record)
break;
}
if (same) {
@@ -250,10 +254,10 @@ static int lstats_show(struct seq_file *m, void *v)
lr->count, lr->time, lr->max);
for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
unsigned long bt = lr->backtrace[q];
+
if (!bt)
break;
- if (bt == ULONG_MAX)
- break;
+
seq_printf(m, " %ps", (void *)bt);
}
seq_puts(m, "\n");
@@ -276,17 +280,20 @@ static int lstats_open(struct inode *inode, struct file *filp)
return single_open(filp, lstats_show, NULL);
}
-static const struct file_operations lstats_fops = {
- .open = lstats_open,
- .read = seq_read,
- .write = lstats_write,
- .llseek = seq_lseek,
- .release = single_release,
+static const struct proc_ops lstats_proc_ops = {
+ .proc_open = lstats_open,
+ .proc_read = seq_read,
+ .proc_write = lstats_write,
+ .proc_lseek = seq_lseek,
+ .proc_release = single_release,
};
static int __init init_lstats_procfs(void)
{
- proc_create("latency_stats", 0644, NULL, &lstats_fops);
+ proc_create("latency_stats", 0644, NULL, &lstats_proc_ops);
+#ifdef CONFIG_SYSCTL
+ register_sysctl_init("kernel", latencytop_sysctl);
+#endif
return 0;
}
device_initcall(init_lstats_procfs);
diff --git a/kernel/livepatch/Kconfig b/kernel/livepatch/Kconfig
index 045022557936..4c0a9c18d0b2 100644
--- a/kernel/livepatch/Kconfig
+++ b/kernel/livepatch/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
config HAVE_LIVEPATCH
bool
help
@@ -5,14 +6,27 @@ config HAVE_LIVEPATCH
config LIVEPATCH
bool "Kernel Live Patching"
- depends on DYNAMIC_FTRACE_WITH_REGS
+ depends on DYNAMIC_FTRACE_WITH_REGS || DYNAMIC_FTRACE_WITH_ARGS
depends on MODULES
depends on SYSFS
depends on KALLSYMS_ALL
depends on HAVE_LIVEPATCH
+ depends on !TRIM_UNUSED_KSYMS
help
Say Y here if you want to support kernel live patching.
This option has no runtime impact until a kernel "patch"
module uses the interface provided by this option to register
a patch, causing calls to patched functions to be redirected
to new function code contained in the patch module.
+
+config HAVE_KLP_BUILD
+ bool
+ help
+ Arch supports klp-build
+
+config KLP_BUILD
+ def_bool y
+ depends on LIVEPATCH && HAVE_KLP_BUILD
+ select OBJTOOL
+ help
+ Enable klp-build support
diff --git a/kernel/livepatch/Makefile b/kernel/livepatch/Makefile
index e8780c0901d9..cf03d4bdfc66 100644
--- a/kernel/livepatch/Makefile
+++ b/kernel/livepatch/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
obj-$(CONFIG_LIVEPATCH) += livepatch.o
-livepatch-objs := core.o
+livepatch-objs := core.o patch.o shadow.o state.o transition.o
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index 284e2691e380..9917756dae46 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -1,21 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* core.c - Kernel Live Patching Core
*
* Copyright (C) 2014 Seth Jennings <sjenning@redhat.com>
* Copyright (C) 2014 SUSE
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -24,68 +12,45 @@
#include <linux/kernel.h>
#include <linux/mutex.h>
#include <linux/slab.h>
-#include <linux/ftrace.h>
#include <linux/list.h>
#include <linux/kallsyms.h>
#include <linux/livepatch.h>
+#include <linux/elf.h>
+#include <linux/moduleloader.h>
+#include <linux/completion.h>
+#include <linux/memory.h>
+#include <linux/rcupdate.h>
+#include <asm/cacheflush.h>
+#include "core.h"
+#include "patch.h"
+#include "state.h"
+#include "transition.h"
-/**
- * struct klp_ops - structure for tracking registered ftrace ops structs
- *
- * A single ftrace_ops is shared between all enabled replacement functions
- * (klp_func structs) which have the same old_addr. This allows the switch
- * between function versions to happen instantaneously by updating the klp_ops
- * struct's func_stack list. The winner is the klp_func at the top of the
- * func_stack (front of the list).
+/*
+ * klp_mutex is a coarse lock which serializes access to klp data. All
+ * accesses to klp-related variables and structures must have mutex protection,
+ * except within the following functions which carefully avoid the need for it:
*
- * @node: node for the global klp_ops list
- * @func_stack: list head for the stack of klp_func's (active func is on top)
- * @fops: registered ftrace ops struct
+ * - klp_ftrace_handler()
+ * - klp_update_patch_state()
+ * - __klp_sched_try_switch()
*/
-struct klp_ops {
- struct list_head node;
- struct list_head func_stack;
- struct ftrace_ops fops;
-};
+DEFINE_MUTEX(klp_mutex);
/*
- * The klp_mutex protects the global lists and state transitions of any
- * structure reachable from them. References to any structure must be obtained
- * under mutex protection (except in klp_ftrace_handler(), which uses RCU to
- * ensure it gets consistent data).
+ * Actively used patches: enabled or in transition. Note that replaced
+ * or disabled patches are not listed even though the related kernel
+ * module still can be loaded.
*/
-static DEFINE_MUTEX(klp_mutex);
-
-static LIST_HEAD(klp_patches);
-static LIST_HEAD(klp_ops);
+LIST_HEAD(klp_patches);
static struct kobject *klp_root_kobj;
-static struct klp_ops *klp_find_ops(unsigned long old_addr)
-{
- struct klp_ops *ops;
- struct klp_func *func;
-
- list_for_each_entry(ops, &klp_ops, node) {
- func = list_first_entry(&ops->func_stack, struct klp_func,
- stack_node);
- if (func->old_addr == old_addr)
- return ops;
- }
-
- return NULL;
-}
-
static bool klp_is_module(struct klp_object *obj)
{
return obj->name;
}
-static bool klp_is_object_loaded(struct klp_object *obj)
-{
- return !obj->name || obj->mod;
-}
-
/* sets obj->mod if object is not vmlinux and module is found */
static void klp_find_object_module(struct klp_object *obj)
{
@@ -94,99 +59,132 @@ static void klp_find_object_module(struct klp_object *obj)
if (!klp_is_module(obj))
return;
- mutex_lock(&module_mutex);
+ guard(rcu)();
/*
* We do not want to block removal of patched modules and therefore
* we do not take a reference here. The patches are removed by
- * a going module handler instead.
+ * klp_module_going() instead.
*/
mod = find_module(obj->name);
/*
- * Do not mess work of the module coming and going notifiers.
- * Note that the patch might still be needed before the going handler
+ * Do not mess work of klp_module_coming() and klp_module_going().
+ * Note that the patch might still be needed before klp_module_going()
* is called. Module functions can be called even in the GOING state
* until mod->exit() finishes. This is especially important for
* patches that modify semantic of the functions.
*/
if (mod && mod->klp_alive)
obj->mod = mod;
+}
- mutex_unlock(&module_mutex);
+static bool klp_initialized(void)
+{
+ return !!klp_root_kobj;
}
-/* klp_mutex must be held by caller */
-static bool klp_is_patch_registered(struct klp_patch *patch)
+static struct klp_func *klp_find_func(struct klp_object *obj,
+ struct klp_func *old_func)
{
- struct klp_patch *mypatch;
+ struct klp_func *func;
- list_for_each_entry(mypatch, &klp_patches, list)
- if (mypatch == patch)
- return true;
+ klp_for_each_func(obj, func) {
+ /*
+ * Besides identical old_sympos, also consider old_sympos
+ * of 0 and 1 are identical.
+ */
+ if ((strcmp(old_func->old_name, func->old_name) == 0) &&
+ ((old_func->old_sympos == func->old_sympos) ||
+ (old_func->old_sympos == 0 && func->old_sympos == 1) ||
+ (old_func->old_sympos == 1 && func->old_sympos == 0))) {
+ return func;
+ }
+ }
- return false;
+ return NULL;
}
-static bool klp_initialized(void)
+static struct klp_object *klp_find_object(struct klp_patch *patch,
+ struct klp_object *old_obj)
{
- return klp_root_kobj;
+ struct klp_object *obj;
+
+ klp_for_each_object(patch, obj) {
+ if (klp_is_module(old_obj)) {
+ if (klp_is_module(obj) &&
+ strcmp(old_obj->name, obj->name) == 0) {
+ return obj;
+ }
+ } else if (!klp_is_module(obj)) {
+ return obj;
+ }
+ }
+
+ return NULL;
}
struct klp_find_arg {
- const char *objname;
const char *name;
unsigned long addr;
- /*
- * If count == 0, the symbol was not found. If count == 1, a unique
- * match was found and addr is set. If count > 1, there is
- * unresolvable ambiguity among "count" number of symbols with the same
- * name in the same object.
- */
unsigned long count;
+ unsigned long pos;
};
-static int klp_find_callback(void *data, const char *name,
- struct module *mod, unsigned long addr)
+static int klp_match_callback(void *data, unsigned long addr)
{
struct klp_find_arg *args = data;
- if ((mod && !args->objname) || (!mod && args->objname))
- return 0;
-
- if (strcmp(args->name, name))
- return 0;
-
- if (args->objname && strcmp(args->objname, mod->name))
- return 0;
+ args->addr = addr;
+ args->count++;
/*
- * args->addr might be overwritten if another match is found
- * but klp_find_object_symbol() handles this and only returns the
- * addr if count == 1.
+ * Finish the search when the symbol is found for the desired position
+ * or the position is not defined for a non-unique symbol.
*/
- args->addr = addr;
- args->count++;
+ if ((args->pos && (args->count == args->pos)) ||
+ (!args->pos && (args->count > 1)))
+ return 1;
return 0;
}
+static int klp_find_callback(void *data, const char *name, unsigned long addr)
+{
+ struct klp_find_arg *args = data;
+
+ if (strcmp(args->name, name))
+ return 0;
+
+ return klp_match_callback(data, addr);
+}
+
static int klp_find_object_symbol(const char *objname, const char *name,
- unsigned long *addr)
+ unsigned long sympos, unsigned long *addr)
{
struct klp_find_arg args = {
- .objname = objname,
.name = name,
.addr = 0,
- .count = 0
+ .count = 0,
+ .pos = sympos,
};
- kallsyms_on_each_symbol(klp_find_callback, &args);
+ if (objname)
+ module_kallsyms_on_each_symbol(objname, klp_find_callback, &args);
+ else
+ kallsyms_on_each_match_symbol(klp_match_callback, name, &args);
- if (args.count == 0)
+ /*
+ * Ensure an address was found. If sympos is 0, ensure symbol is unique;
+ * otherwise ensure the symbol position count matches sympos.
+ */
+ if (args.addr == 0)
pr_err("symbol '%s' not found in symbol table\n", name);
- else if (args.count > 1)
- pr_err("unresolvable ambiguity (%lu matches) on symbol '%s' in object '%s'\n",
- args.count, name, objname);
- else {
+ else if (args.count > 1 && sympos == 0) {
+ pr_err("unresolvable ambiguity for symbol '%s' in object '%s'\n",
+ name, objname);
+ } else if (sympos != args.count && sympos > 0) {
+ pr_err("symbol position %lu for symbol '%s' in object '%s' not found\n",
+ sympos, name, objname ? objname : "vmlinux");
+ } else {
*addr = args.addr;
return 0;
}
@@ -195,482 +193,499 @@ static int klp_find_object_symbol(const char *objname, const char *name,
return -EINVAL;
}
-struct klp_verify_args {
- const char *name;
- const unsigned long addr;
-};
-
-static int klp_verify_callback(void *data, const char *name,
- struct module *mod, unsigned long addr)
+static int klp_resolve_symbols(Elf_Shdr *sechdrs, const char *strtab,
+ unsigned int symndx, Elf_Shdr *relasec,
+ const char *sec_objname)
{
- struct klp_verify_args *args = data;
+ int i, cnt, ret;
+ char sym_objname[MODULE_NAME_LEN];
+ char sym_name[KSYM_NAME_LEN];
+ Elf_Rela *relas;
+ Elf_Sym *sym;
+ unsigned long sympos, addr;
+ bool sym_vmlinux;
+ bool sec_vmlinux = !strcmp(sec_objname, "vmlinux");
- if (!mod &&
- !strcmp(args->name, name) &&
- args->addr == addr)
- return 1;
+ /*
+ * Since the field widths for sym_objname and sym_name in the sscanf()
+ * call are hard-coded and correspond to MODULE_NAME_LEN and
+ * KSYM_NAME_LEN respectively, we must make sure that MODULE_NAME_LEN
+ * and KSYM_NAME_LEN have the values we expect them to have.
+ *
+ * Because the value of MODULE_NAME_LEN can differ among architectures,
+ * we use the smallest/strictest upper bound possible (56, based on
+ * the current definition of MODULE_NAME_LEN) to prevent overflows.
+ */
+ BUILD_BUG_ON(MODULE_NAME_LEN < 56 || KSYM_NAME_LEN != 512);
+
+ relas = (Elf_Rela *) relasec->sh_addr;
+ /* For each rela in this klp relocation section */
+ for (i = 0; i < relasec->sh_size / sizeof(Elf_Rela); i++) {
+ sym = (Elf_Sym *)sechdrs[symndx].sh_addr + ELF_R_SYM(relas[i].r_info);
+ if (sym->st_shndx != SHN_LIVEPATCH) {
+ pr_err("symbol %s at rela sec %u idx %d is not marked as a livepatch symbol\n",
+ strtab + sym->st_name, symndx, i);
+ return -EINVAL;
+ }
- return 0;
-}
+ /* Format: .klp.sym.sym_objname.sym_name,sympos */
+ cnt = sscanf(strtab + sym->st_name,
+ KLP_SYM_PREFIX "%55[^.].%511[^,],%lu",
+ sym_objname, sym_name, &sympos);
+ if (cnt != 3) {
+ pr_err("symbol %s has an incorrectly formatted name\n",
+ strtab + sym->st_name);
+ return -EINVAL;
+ }
-static int klp_verify_vmlinux_symbol(const char *name, unsigned long addr)
-{
- struct klp_verify_args args = {
- .name = name,
- .addr = addr,
- };
+ sym_vmlinux = !strcmp(sym_objname, "vmlinux");
+
+ /*
+ * Prevent module-specific KLP rela sections from referencing
+ * vmlinux symbols. This helps prevent ordering issues with
+ * module special section initializations. Presumably such
+ * symbols are exported and normal relas can be used instead.
+ */
+ if (!sec_vmlinux && sym_vmlinux) {
+ pr_err("invalid access to vmlinux symbol '%s' from module-specific livepatch relocation section\n",
+ sym_name);
+ return -EINVAL;
+ }
- if (kallsyms_on_each_symbol(klp_verify_callback, &args))
- return 0;
+ /* klp_find_object_symbol() treats a NULL objname as vmlinux */
+ ret = klp_find_object_symbol(sym_vmlinux ? NULL : sym_objname,
+ sym_name, sympos, &addr);
+ if (ret)
+ return ret;
- pr_err("symbol '%s' not found at specified address 0x%016lx, kernel mismatch?\n",
- name, addr);
- return -EINVAL;
+ sym->st_value = addr;
+ }
+
+ return 0;
}
-static int klp_find_verify_func_addr(struct klp_object *obj,
- struct klp_func *func)
+void __weak clear_relocate_add(Elf_Shdr *sechdrs,
+ const char *strtab,
+ unsigned int symindex,
+ unsigned int relsec,
+ struct module *me)
{
- int ret;
-
-#if defined(CONFIG_RANDOMIZE_BASE)
- /* KASLR is enabled, disregard old_addr from user */
- func->old_addr = 0;
-#endif
-
- if (!func->old_addr || klp_is_module(obj))
- ret = klp_find_object_symbol(obj->name, func->old_name,
- &func->old_addr);
- else
- ret = klp_verify_vmlinux_symbol(func->old_name,
- func->old_addr);
-
- return ret;
}
/*
- * external symbols are located outside the parent object (where the parent
- * object is either vmlinux or the kmod being patched).
+ * At a high-level, there are two types of klp relocation sections: those which
+ * reference symbols which live in vmlinux; and those which reference symbols
+ * which live in other modules. This function is called for both types:
+ *
+ * 1) When a klp module itself loads, the module code calls this function to
+ * write vmlinux-specific klp relocations (.klp.rela.vmlinux.* sections).
+ * These relocations are written to the klp module text to allow the patched
+ * code/data to reference unexported vmlinux symbols. They're written as
+ * early as possible to ensure that other module init code (.e.g.,
+ * jump_label_apply_nops) can access any unexported vmlinux symbols which
+ * might be referenced by the klp module's special sections.
+ *
+ * 2) When a to-be-patched module loads -- or is already loaded when a
+ * corresponding klp module loads -- klp code calls this function to write
+ * module-specific klp relocations (.klp.rela.{module}.* sections). These
+ * are written to the klp module text to allow the patched code/data to
+ * reference symbols which live in the to-be-patched module or one of its
+ * module dependencies. Exported symbols are supported, in addition to
+ * unexported symbols, in order to enable late module patching, which allows
+ * the to-be-patched module to be loaded and patched sometime *after* the
+ * klp module is loaded.
*/
-static int klp_find_external_symbol(struct module *pmod, const char *name,
- unsigned long *addr)
-{
- const struct kernel_symbol *sym;
-
- /* first, check if it's an exported symbol */
- preempt_disable();
- sym = find_symbol(name, NULL, NULL, true, true);
- if (sym) {
- *addr = sym->value;
- preempt_enable();
- return 0;
- }
- preempt_enable();
-
- /* otherwise check if it's in another .o within the patch module */
- return klp_find_object_symbol(pmod->name, name, addr);
-}
-
-static int klp_write_object_relocations(struct module *pmod,
- struct klp_object *obj)
+static int klp_write_section_relocs(struct module *pmod, Elf_Shdr *sechdrs,
+ const char *shstrtab, const char *strtab,
+ unsigned int symndx, unsigned int secndx,
+ const char *objname, bool apply)
{
- int ret;
- struct klp_reloc *reloc;
+ int cnt, ret;
+ char sec_objname[MODULE_NAME_LEN];
+ Elf_Shdr *sec = sechdrs + secndx;
- if (WARN_ON(!klp_is_object_loaded(obj)))
+ /*
+ * Format: .klp.rela.sec_objname.section_name
+ * See comment in klp_resolve_symbols() for an explanation
+ * of the selected field width value.
+ */
+ cnt = sscanf(shstrtab + sec->sh_name, KLP_RELOC_SEC_PREFIX "%55[^.]",
+ sec_objname);
+ if (cnt != 1) {
+ pr_err("section %s has an incorrectly formatted name\n",
+ shstrtab + sec->sh_name);
return -EINVAL;
+ }
- if (WARN_ON(!obj->relocs))
- return -EINVAL;
+ if (strcmp(objname ? objname : "vmlinux", sec_objname))
+ return 0;
- for (reloc = obj->relocs; reloc->name; reloc++) {
- if (!klp_is_module(obj)) {
- ret = klp_verify_vmlinux_symbol(reloc->name,
- reloc->val);
- if (ret)
- return ret;
- } else {
- /* module, reloc->val needs to be discovered */
- if (reloc->external)
- ret = klp_find_external_symbol(pmod,
- reloc->name,
- &reloc->val);
- else
- ret = klp_find_object_symbol(obj->mod->name,
- reloc->name,
- &reloc->val);
- if (ret)
- return ret;
- }
- ret = klp_write_module_reloc(pmod, reloc->type, reloc->loc,
- reloc->val + reloc->addend);
- if (ret) {
- pr_err("relocation failed for symbol '%s' at 0x%016lx (%d)\n",
- reloc->name, reloc->val, ret);
+ if (apply) {
+ ret = klp_resolve_symbols(sechdrs, strtab, symndx,
+ sec, sec_objname);
+ if (ret)
return ret;
- }
+
+ return apply_relocate_add(sechdrs, strtab, symndx, secndx, pmod);
}
+ clear_relocate_add(sechdrs, strtab, symndx, secndx, pmod);
return 0;
}
-static void notrace klp_ftrace_handler(unsigned long ip,
- unsigned long parent_ip,
- struct ftrace_ops *fops,
- struct pt_regs *regs)
+int klp_apply_section_relocs(struct module *pmod, Elf_Shdr *sechdrs,
+ const char *shstrtab, const char *strtab,
+ unsigned int symndx, unsigned int secndx,
+ const char *objname)
{
- struct klp_ops *ops;
- struct klp_func *func;
-
- ops = container_of(fops, struct klp_ops, fops);
-
- rcu_read_lock();
- func = list_first_or_null_rcu(&ops->func_stack, struct klp_func,
- stack_node);
- if (WARN_ON_ONCE(!func))
- goto unlock;
-
- klp_arch_set_pc(regs, (unsigned long)func->new_func);
-unlock:
- rcu_read_unlock();
+ return klp_write_section_relocs(pmod, sechdrs, shstrtab, strtab, symndx,
+ secndx, objname, true);
}
-static void klp_disable_func(struct klp_func *func)
+/*
+ * Sysfs Interface
+ *
+ * /sys/kernel/livepatch
+ * /sys/kernel/livepatch/<patch>
+ * /sys/kernel/livepatch/<patch>/enabled
+ * /sys/kernel/livepatch/<patch>/transition
+ * /sys/kernel/livepatch/<patch>/force
+ * /sys/kernel/livepatch/<patch>/replace
+ * /sys/kernel/livepatch/<patch>/stack_order
+ * /sys/kernel/livepatch/<patch>/<object>
+ * /sys/kernel/livepatch/<patch>/<object>/patched
+ * /sys/kernel/livepatch/<patch>/<object>/<function,sympos>
+ */
+static int __klp_disable_patch(struct klp_patch *patch);
+
+static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
{
- struct klp_ops *ops;
+ struct klp_patch *patch;
+ int ret;
+ bool enabled;
- WARN_ON(func->state != KLP_ENABLED);
- WARN_ON(!func->old_addr);
+ ret = kstrtobool(buf, &enabled);
+ if (ret)
+ return ret;
- ops = klp_find_ops(func->old_addr);
- if (WARN_ON(!ops))
- return;
+ patch = container_of(kobj, struct klp_patch, kobj);
- if (list_is_singular(&ops->func_stack)) {
- WARN_ON(unregister_ftrace_function(&ops->fops));
- WARN_ON(ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0));
+ mutex_lock(&klp_mutex);
- list_del_rcu(&func->stack_node);
- list_del(&ops->node);
- kfree(ops);
- } else {
- list_del_rcu(&func->stack_node);
+ if (patch->enabled == enabled) {
+ /* already in requested state */
+ ret = -EINVAL;
+ goto out;
}
- func->state = KLP_DISABLED;
-}
+ /*
+ * Allow to reverse a pending transition in both ways. It might be
+ * necessary to complete the transition without forcing and breaking
+ * the system integrity.
+ *
+ * Do not allow to re-enable a disabled patch.
+ */
+ if (patch == klp_transition_patch)
+ klp_reverse_transition();
+ else if (!enabled)
+ ret = __klp_disable_patch(patch);
+ else
+ ret = -EINVAL;
-static int klp_enable_func(struct klp_func *func)
-{
- struct klp_ops *ops;
- int ret;
+out:
+ mutex_unlock(&klp_mutex);
- if (WARN_ON(!func->old_addr))
- return -EINVAL;
+ if (ret)
+ return ret;
+ return count;
+}
- if (WARN_ON(func->state != KLP_DISABLED))
- return -EINVAL;
+static ssize_t enabled_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct klp_patch *patch;
- ops = klp_find_ops(func->old_addr);
- if (!ops) {
- ops = kzalloc(sizeof(*ops), GFP_KERNEL);
- if (!ops)
- return -ENOMEM;
+ patch = container_of(kobj, struct klp_patch, kobj);
+ return sysfs_emit(buf, "%d\n", patch->enabled);
+}
- ops->fops.func = klp_ftrace_handler;
- ops->fops.flags = FTRACE_OPS_FL_SAVE_REGS |
- FTRACE_OPS_FL_DYNAMIC |
- FTRACE_OPS_FL_IPMODIFY;
+static ssize_t transition_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct klp_patch *patch;
- list_add(&ops->node, &klp_ops);
+ patch = container_of(kobj, struct klp_patch, kobj);
+ return sysfs_emit(buf, "%d\n", patch == klp_transition_patch);
+}
- INIT_LIST_HEAD(&ops->func_stack);
- list_add_rcu(&func->stack_node, &ops->func_stack);
+static ssize_t force_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct klp_patch *patch;
+ int ret;
+ bool val;
- ret = ftrace_set_filter_ip(&ops->fops, func->old_addr, 0, 0);
- if (ret) {
- pr_err("failed to set ftrace filter for function '%s' (%d)\n",
- func->old_name, ret);
- goto err;
- }
+ ret = kstrtobool(buf, &val);
+ if (ret)
+ return ret;
- ret = register_ftrace_function(&ops->fops);
- if (ret) {
- pr_err("failed to register ftrace handler for function '%s' (%d)\n",
- func->old_name, ret);
- ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0);
- goto err;
- }
+ if (!val)
+ return count;
+ mutex_lock(&klp_mutex);
- } else {
- list_add_rcu(&func->stack_node, &ops->func_stack);
+ patch = container_of(kobj, struct klp_patch, kobj);
+ if (patch != klp_transition_patch) {
+ mutex_unlock(&klp_mutex);
+ return -EINVAL;
}
- func->state = KLP_ENABLED;
+ klp_force_transition();
- return 0;
+ mutex_unlock(&klp_mutex);
-err:
- list_del_rcu(&func->stack_node);
- list_del(&ops->node);
- kfree(ops);
- return ret;
+ return count;
}
-static void klp_disable_object(struct klp_object *obj)
+static ssize_t replace_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
{
- struct klp_func *func;
-
- for (func = obj->funcs; func->old_name; func++)
- if (func->state == KLP_ENABLED)
- klp_disable_func(func);
+ struct klp_patch *patch;
- obj->state = KLP_DISABLED;
+ patch = container_of(kobj, struct klp_patch, kobj);
+ return sysfs_emit(buf, "%d\n", patch->replace);
}
-static int klp_enable_object(struct klp_object *obj)
+static ssize_t stack_order_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
{
- struct klp_func *func;
- int ret;
+ struct klp_patch *patch, *this_patch;
+ int stack_order = 0;
- if (WARN_ON(obj->state != KLP_DISABLED))
- return -EINVAL;
+ this_patch = container_of(kobj, struct klp_patch, kobj);
- if (WARN_ON(!klp_is_object_loaded(obj)))
- return -EINVAL;
+ mutex_lock(&klp_mutex);
- for (func = obj->funcs; func->old_name; func++) {
- ret = klp_enable_func(func);
- if (ret) {
- klp_disable_object(obj);
- return ret;
- }
+ klp_for_each_patch(patch) {
+ stack_order++;
+ if (patch == this_patch)
+ break;
}
- obj->state = KLP_ENABLED;
- return 0;
+ mutex_unlock(&klp_mutex);
+
+ return sysfs_emit(buf, "%d\n", stack_order);
}
-static int __klp_disable_patch(struct klp_patch *patch)
+static struct kobj_attribute enabled_kobj_attr = __ATTR_RW(enabled);
+static struct kobj_attribute transition_kobj_attr = __ATTR_RO(transition);
+static struct kobj_attribute force_kobj_attr = __ATTR_WO(force);
+static struct kobj_attribute replace_kobj_attr = __ATTR_RO(replace);
+static struct kobj_attribute stack_order_kobj_attr = __ATTR_RO(stack_order);
+static struct attribute *klp_patch_attrs[] = {
+ &enabled_kobj_attr.attr,
+ &transition_kobj_attr.attr,
+ &force_kobj_attr.attr,
+ &replace_kobj_attr.attr,
+ &stack_order_kobj_attr.attr,
+ NULL
+};
+ATTRIBUTE_GROUPS(klp_patch);
+
+static ssize_t patched_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
{
struct klp_object *obj;
- /* enforce stacking: only the last enabled patch can be disabled */
- if (!list_is_last(&patch->list, &klp_patches) &&
- list_next_entry(patch, list)->state == KLP_ENABLED)
- return -EBUSY;
-
- pr_notice("disabling patch '%s'\n", patch->mod->name);
-
- for (obj = patch->objs; obj->funcs; obj++) {
- if (obj->state == KLP_ENABLED)
- klp_disable_object(obj);
- }
+ obj = container_of(kobj, struct klp_object, kobj);
+ return sysfs_emit(buf, "%d\n", obj->patched);
+}
- patch->state = KLP_DISABLED;
+static struct kobj_attribute patched_kobj_attr = __ATTR_RO(patched);
+static struct attribute *klp_object_attrs[] = {
+ &patched_kobj_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(klp_object);
- return 0;
+static void klp_free_object_dynamic(struct klp_object *obj)
+{
+ kfree(obj->name);
+ kfree(obj);
}
-/**
- * klp_disable_patch() - disables a registered patch
- * @patch: The registered, enabled patch to be disabled
- *
- * Unregisters the patched functions from ftrace.
- *
- * Return: 0 on success, otherwise error
- */
-int klp_disable_patch(struct klp_patch *patch)
+static void klp_init_func_early(struct klp_object *obj,
+ struct klp_func *func);
+static void klp_init_object_early(struct klp_patch *patch,
+ struct klp_object *obj);
+
+static struct klp_object *klp_alloc_object_dynamic(const char *name,
+ struct klp_patch *patch)
{
- int ret;
+ struct klp_object *obj;
- mutex_lock(&klp_mutex);
+ obj = kzalloc(sizeof(*obj), GFP_KERNEL);
+ if (!obj)
+ return NULL;
- if (!klp_is_patch_registered(patch)) {
- ret = -EINVAL;
- goto err;
- }
-
- if (patch->state == KLP_DISABLED) {
- ret = -EINVAL;
- goto err;
+ if (name) {
+ obj->name = kstrdup(name, GFP_KERNEL);
+ if (!obj->name) {
+ kfree(obj);
+ return NULL;
+ }
}
- ret = __klp_disable_patch(patch);
+ klp_init_object_early(patch, obj);
+ obj->dynamic = true;
-err:
- mutex_unlock(&klp_mutex);
- return ret;
+ return obj;
}
-EXPORT_SYMBOL_GPL(klp_disable_patch);
-static int __klp_enable_patch(struct klp_patch *patch)
+static void klp_free_func_nop(struct klp_func *func)
{
- struct klp_object *obj;
- int ret;
-
- if (WARN_ON(patch->state != KLP_DISABLED))
- return -EINVAL;
-
- /* enforce stacking: only the first disabled patch can be enabled */
- if (patch->list.prev != &klp_patches &&
- list_prev_entry(patch, list)->state == KLP_DISABLED)
- return -EBUSY;
-
- pr_notice_once("tainting kernel with TAINT_LIVEPATCH\n");
- add_taint(TAINT_LIVEPATCH, LOCKDEP_STILL_OK);
+ kfree(func->old_name);
+ kfree(func);
+}
- pr_notice("enabling patch '%s'\n", patch->mod->name);
+static struct klp_func *klp_alloc_func_nop(struct klp_func *old_func,
+ struct klp_object *obj)
+{
+ struct klp_func *func;
- for (obj = patch->objs; obj->funcs; obj++) {
- if (!klp_is_object_loaded(obj))
- continue;
+ func = kzalloc(sizeof(*func), GFP_KERNEL);
+ if (!func)
+ return NULL;
- ret = klp_enable_object(obj);
- if (ret)
- goto unregister;
+ if (old_func->old_name) {
+ func->old_name = kstrdup(old_func->old_name, GFP_KERNEL);
+ if (!func->old_name) {
+ kfree(func);
+ return NULL;
+ }
}
- patch->state = KLP_ENABLED;
-
- return 0;
+ klp_init_func_early(obj, func);
+ /*
+ * func->new_func is same as func->old_func. These addresses are
+ * set when the object is loaded, see klp_init_object_loaded().
+ */
+ func->old_sympos = old_func->old_sympos;
+ func->nop = true;
-unregister:
- WARN_ON(__klp_disable_patch(patch));
- return ret;
+ return func;
}
-/**
- * klp_enable_patch() - enables a registered patch
- * @patch: The registered, disabled patch to be enabled
- *
- * Performs the needed symbol lookups and code relocations,
- * then registers the patched functions with ftrace.
- *
- * Return: 0 on success, otherwise error
- */
-int klp_enable_patch(struct klp_patch *patch)
+static int klp_add_object_nops(struct klp_patch *patch,
+ struct klp_object *old_obj)
{
- int ret;
+ struct klp_object *obj;
+ struct klp_func *func, *old_func;
- mutex_lock(&klp_mutex);
+ obj = klp_find_object(patch, old_obj);
- if (!klp_is_patch_registered(patch)) {
- ret = -EINVAL;
- goto err;
+ if (!obj) {
+ obj = klp_alloc_object_dynamic(old_obj->name, patch);
+ if (!obj)
+ return -ENOMEM;
}
- ret = __klp_enable_patch(patch);
+ klp_for_each_func(old_obj, old_func) {
+ func = klp_find_func(obj, old_func);
+ if (func)
+ continue;
-err:
- mutex_unlock(&klp_mutex);
- return ret;
+ func = klp_alloc_func_nop(old_func, obj);
+ if (!func)
+ return -ENOMEM;
+ }
+
+ return 0;
}
-EXPORT_SYMBOL_GPL(klp_enable_patch);
/*
- * Sysfs Interface
+ * Add 'nop' functions which simply return to the caller to run the
+ * original function.
*
- * /sys/kernel/livepatch
- * /sys/kernel/livepatch/<patch>
- * /sys/kernel/livepatch/<patch>/enabled
- * /sys/kernel/livepatch/<patch>/<object>
- * /sys/kernel/livepatch/<patch>/<object>/<func>
+ * They are added only when the atomic replace mode is used and only for
+ * functions which are currently livepatched but are no longer included
+ * in the new livepatch.
*/
-
-static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr,
- const char *buf, size_t count)
+static int klp_add_nops(struct klp_patch *patch)
{
- struct klp_patch *patch;
- int ret;
- unsigned long val;
-
- ret = kstrtoul(buf, 10, &val);
- if (ret)
- return -EINVAL;
-
- if (val != KLP_DISABLED && val != KLP_ENABLED)
- return -EINVAL;
-
- patch = container_of(kobj, struct klp_patch, kobj);
-
- mutex_lock(&klp_mutex);
+ struct klp_patch *old_patch;
+ struct klp_object *old_obj;
- if (val == patch->state) {
- /* already in requested state */
- ret = -EINVAL;
- goto err;
- }
+ klp_for_each_patch(old_patch) {
+ klp_for_each_object(old_patch, old_obj) {
+ int err;
- if (val == KLP_ENABLED) {
- ret = __klp_enable_patch(patch);
- if (ret)
- goto err;
- } else {
- ret = __klp_disable_patch(patch);
- if (ret)
- goto err;
+ err = klp_add_object_nops(patch, old_obj);
+ if (err)
+ return err;
+ }
}
- mutex_unlock(&klp_mutex);
-
- return count;
-
-err:
- mutex_unlock(&klp_mutex);
- return ret;
+ return 0;
}
-static ssize_t enabled_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
+static void klp_kobj_release_patch(struct kobject *kobj)
{
struct klp_patch *patch;
patch = container_of(kobj, struct klp_patch, kobj);
- return snprintf(buf, PAGE_SIZE-1, "%d\n", patch->state);
+ complete(&patch->finish);
}
-static struct kobj_attribute enabled_kobj_attr = __ATTR_RW(enabled);
-static struct attribute *klp_patch_attrs[] = {
- &enabled_kobj_attr.attr,
- NULL
+static const struct kobj_type klp_ktype_patch = {
+ .release = klp_kobj_release_patch,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = klp_patch_groups,
};
-static void klp_kobj_release_patch(struct kobject *kobj)
+static void klp_kobj_release_object(struct kobject *kobj)
{
- /*
- * Once we have a consistency model we'll need to module_put() the
- * patch module here. See klp_register_patch() for more details.
- */
+ struct klp_object *obj;
+
+ obj = container_of(kobj, struct klp_object, kobj);
+
+ if (obj->dynamic)
+ klp_free_object_dynamic(obj);
}
-static struct kobj_type klp_ktype_patch = {
- .release = klp_kobj_release_patch,
+static const struct kobj_type klp_ktype_object = {
+ .release = klp_kobj_release_object,
.sysfs_ops = &kobj_sysfs_ops,
- .default_attrs = klp_patch_attrs,
+ .default_groups = klp_object_groups,
};
static void klp_kobj_release_func(struct kobject *kobj)
{
+ struct klp_func *func;
+
+ func = container_of(kobj, struct klp_func, kobj);
+
+ if (func->nop)
+ klp_free_func_nop(func);
}
-static struct kobj_type klp_ktype_func = {
+static const struct kobj_type klp_ktype_func = {
.release = klp_kobj_release_func,
.sysfs_ops = &kobj_sysfs_ops,
};
-/*
- * Free all functions' kobjects in the array up to some limit. When limit is
- * NULL, all kobjects are freed.
- */
-static void klp_free_funcs_limited(struct klp_object *obj,
- struct klp_func *limit)
+static void __klp_free_funcs(struct klp_object *obj, bool nops_only)
{
- struct klp_func *func;
+ struct klp_func *func, *tmp_func;
- for (func = obj->funcs; func->old_name && func != limit; func++)
+ klp_for_each_func_safe(obj, func, tmp_func) {
+ if (nops_only && !func->nop)
+ continue;
+
+ list_del(&func->node);
kobject_put(&func->kobj);
+ }
}
/* Clean up when a patched object is unloaded */
@@ -680,40 +695,171 @@ static void klp_free_object_loaded(struct klp_object *obj)
obj->mod = NULL;
- for (func = obj->funcs; func->old_name; func++)
- func->old_addr = 0;
+ klp_for_each_func(obj, func) {
+ func->old_func = NULL;
+
+ if (func->nop)
+ func->new_func = NULL;
+ }
}
-/*
- * Free all objects' kobjects in the array up to some limit. When limit is
- * NULL, all kobjects are freed.
- */
-static void klp_free_objects_limited(struct klp_patch *patch,
- struct klp_object *limit)
+static void __klp_free_objects(struct klp_patch *patch, bool nops_only)
{
- struct klp_object *obj;
+ struct klp_object *obj, *tmp_obj;
- for (obj = patch->objs; obj->funcs && obj != limit; obj++) {
- klp_free_funcs_limited(obj, NULL);
- kobject_put(obj->kobj);
+ klp_for_each_object_safe(patch, obj, tmp_obj) {
+ __klp_free_funcs(obj, nops_only);
+
+ if (nops_only && !obj->dynamic)
+ continue;
+
+ list_del(&obj->node);
+ kobject_put(&obj->kobj);
}
}
-static void klp_free_patch(struct klp_patch *patch)
+static void klp_free_objects(struct klp_patch *patch)
+{
+ __klp_free_objects(patch, false);
+}
+
+static void klp_free_objects_dynamic(struct klp_patch *patch)
+{
+ __klp_free_objects(patch, true);
+}
+
+/*
+ * This function implements the free operations that can be called safely
+ * under klp_mutex.
+ *
+ * The operation must be completed by calling klp_free_patch_finish()
+ * outside klp_mutex.
+ */
+static void klp_free_patch_start(struct klp_patch *patch)
{
- klp_free_objects_limited(patch, NULL);
if (!list_empty(&patch->list))
list_del(&patch->list);
+
+ klp_free_objects(patch);
+}
+
+/*
+ * This function implements the free part that must be called outside
+ * klp_mutex.
+ *
+ * It must be called after klp_free_patch_start(). And it has to be
+ * the last function accessing the livepatch structures when the patch
+ * gets disabled.
+ */
+static void klp_free_patch_finish(struct klp_patch *patch)
+{
+ /*
+ * Avoid deadlock with enabled_store() sysfs callback by
+ * calling this outside klp_mutex. It is safe because
+ * this is called when the patch gets disabled and it
+ * cannot get enabled again.
+ */
kobject_put(&patch->kobj);
+ wait_for_completion(&patch->finish);
+
+ /* Put the module after the last access to struct klp_patch. */
+ if (!patch->forced)
+ module_put(patch->mod);
+}
+
+/*
+ * The livepatch might be freed from sysfs interface created by the patch.
+ * This work allows to wait until the interface is destroyed in a separate
+ * context.
+ */
+static void klp_free_patch_work_fn(struct work_struct *work)
+{
+ struct klp_patch *patch =
+ container_of(work, struct klp_patch, free_work);
+
+ klp_free_patch_finish(patch);
+}
+
+void klp_free_patch_async(struct klp_patch *patch)
+{
+ klp_free_patch_start(patch);
+ schedule_work(&patch->free_work);
+}
+
+void klp_free_replaced_patches_async(struct klp_patch *new_patch)
+{
+ struct klp_patch *old_patch, *tmp_patch;
+
+ klp_for_each_patch_safe(old_patch, tmp_patch) {
+ if (old_patch == new_patch)
+ return;
+ klp_free_patch_async(old_patch);
+ }
}
static int klp_init_func(struct klp_object *obj, struct klp_func *func)
{
+ if (!func->old_name)
+ return -EINVAL;
+
+ /*
+ * NOPs get the address later. The patched module must be loaded,
+ * see klp_init_object_loaded().
+ */
+ if (!func->new_func && !func->nop)
+ return -EINVAL;
+
+ if (strlen(func->old_name) >= KSYM_NAME_LEN)
+ return -EINVAL;
+
INIT_LIST_HEAD(&func->stack_node);
- func->state = KLP_DISABLED;
+ func->patched = false;
+ func->transition = false;
+
+ /* The format for the sysfs directory is <function,sympos> where sympos
+ * is the nth occurrence of this symbol in kallsyms for the patched
+ * object. If the user selects 0 for old_sympos, then 1 will be used
+ * since a unique symbol will be the first occurrence.
+ */
+ return kobject_add(&func->kobj, &obj->kobj, "%s,%lu",
+ func->old_name,
+ func->old_sympos ? func->old_sympos : 1);
+}
+
+static int klp_write_object_relocs(struct klp_patch *patch,
+ struct klp_object *obj,
+ bool apply)
+{
+ int i, ret;
+ struct klp_modinfo *info = patch->mod->klp_info;
+
+ for (i = 1; i < info->hdr.e_shnum; i++) {
+ Elf_Shdr *sec = info->sechdrs + i;
+
+ if (!(sec->sh_flags & SHF_RELA_LIVEPATCH))
+ continue;
+
+ ret = klp_write_section_relocs(patch->mod, info->sechdrs,
+ info->secstrings,
+ patch->mod->core_kallsyms.strtab,
+ info->symndx, i, obj->name, apply);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int klp_apply_object_relocs(struct klp_patch *patch,
+ struct klp_object *obj)
+{
+ return klp_write_object_relocs(patch, obj, true);
+}
- return kobject_init_and_add(&func->kobj, &klp_ktype_func,
- obj->kobj, "%s", func->old_name);
+static void klp_clear_object_relocs(struct klp_patch *patch,
+ struct klp_object *obj)
+{
+ klp_write_object_relocs(patch, obj, false);
}
/* parts of the initialization that is done only when the object is loaded */
@@ -723,16 +869,43 @@ static int klp_init_object_loaded(struct klp_patch *patch,
struct klp_func *func;
int ret;
- if (obj->relocs) {
- ret = klp_write_object_relocations(patch->mod, obj);
+ if (klp_is_module(obj)) {
+ /*
+ * Only write module-specific relocations here
+ * (.klp.rela.{module}.*). vmlinux-specific relocations were
+ * written earlier during the initialization of the klp module
+ * itself.
+ */
+ ret = klp_apply_object_relocs(patch, obj);
if (ret)
return ret;
}
- for (func = obj->funcs; func->old_name; func++) {
- ret = klp_find_verify_func_addr(obj, func);
+ klp_for_each_func(obj, func) {
+ ret = klp_find_object_symbol(obj->name, func->old_name,
+ func->old_sympos,
+ (unsigned long *)&func->old_func);
if (ret)
return ret;
+
+ ret = kallsyms_lookup_size_offset((unsigned long)func->old_func,
+ &func->old_size, NULL);
+ if (!ret) {
+ pr_err("kallsyms size lookup failed for '%s'\n",
+ func->old_name);
+ return -ENOENT;
+ }
+
+ if (func->nop)
+ func->new_func = func->old_func;
+
+ ret = kallsyms_lookup_size_offset((unsigned long)func->new_func,
+ &func->new_size, NULL);
+ if (!ret) {
+ pr_err("kallsyms size lookup failed for '%s' replacement\n",
+ func->old_name);
+ return -ENOENT;
+ }
}
return 0;
@@ -744,220 +917,402 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj)
int ret;
const char *name;
- if (!obj->funcs)
+ if (klp_is_module(obj) && strlen(obj->name) >= MODULE_NAME_LEN)
return -EINVAL;
- obj->state = KLP_DISABLED;
+ obj->patched = false;
obj->mod = NULL;
klp_find_object_module(obj);
name = klp_is_module(obj) ? obj->name : "vmlinux";
- obj->kobj = kobject_create_and_add(name, &patch->kobj);
- if (!obj->kobj)
- return -ENOMEM;
+ ret = kobject_add(&obj->kobj, &patch->kobj, "%s", name);
+ if (ret)
+ return ret;
- for (func = obj->funcs; func->old_name; func++) {
+ klp_for_each_func(obj, func) {
ret = klp_init_func(obj, func);
if (ret)
- goto free;
+ return ret;
}
- if (klp_is_object_loaded(obj)) {
+ if (klp_is_object_loaded(obj))
ret = klp_init_object_loaded(patch, obj);
- if (ret)
- goto free;
- }
-
- return 0;
-free:
- klp_free_funcs_limited(obj, func);
- kobject_put(obj->kobj);
return ret;
}
-static int klp_init_patch(struct klp_patch *patch)
+static void klp_init_func_early(struct klp_object *obj,
+ struct klp_func *func)
+{
+ kobject_init(&func->kobj, &klp_ktype_func);
+ list_add_tail(&func->node, &obj->func_list);
+}
+
+static void klp_init_object_early(struct klp_patch *patch,
+ struct klp_object *obj)
+{
+ INIT_LIST_HEAD(&obj->func_list);
+ kobject_init(&obj->kobj, &klp_ktype_object);
+ list_add_tail(&obj->node, &patch->obj_list);
+}
+
+static void klp_init_patch_early(struct klp_patch *patch)
{
struct klp_object *obj;
- int ret;
+ struct klp_func *func;
- if (!patch->objs)
- return -EINVAL;
+ INIT_LIST_HEAD(&patch->list);
+ INIT_LIST_HEAD(&patch->obj_list);
+ kobject_init(&patch->kobj, &klp_ktype_patch);
+ patch->enabled = false;
+ patch->forced = false;
+ INIT_WORK(&patch->free_work, klp_free_patch_work_fn);
+ init_completion(&patch->finish);
- mutex_lock(&klp_mutex);
+ klp_for_each_object_static(patch, obj) {
+ klp_init_object_early(patch, obj);
+
+ klp_for_each_func_static(obj, func) {
+ klp_init_func_early(obj, func);
+ }
+ }
+}
- patch->state = KLP_DISABLED;
+static int klp_init_patch(struct klp_patch *patch)
+{
+ struct klp_object *obj;
+ int ret;
- ret = kobject_init_and_add(&patch->kobj, &klp_ktype_patch,
- klp_root_kobj, "%s", patch->mod->name);
+ ret = kobject_add(&patch->kobj, klp_root_kobj, "%s", patch->mod->name);
if (ret)
- goto unlock;
+ return ret;
+
+ if (patch->replace) {
+ ret = klp_add_nops(patch);
+ if (ret)
+ return ret;
+ }
- for (obj = patch->objs; obj->funcs; obj++) {
+ klp_for_each_object(patch, obj) {
ret = klp_init_object(patch, obj);
if (ret)
- goto free;
+ return ret;
}
list_add_tail(&patch->list, &klp_patches);
- mutex_unlock(&klp_mutex);
-
return 0;
+}
-free:
- klp_free_objects_limited(patch, obj);
- kobject_put(&patch->kobj);
-unlock:
- mutex_unlock(&klp_mutex);
- return ret;
+static int __klp_disable_patch(struct klp_patch *patch)
+{
+ struct klp_object *obj;
+
+ if (WARN_ON(!patch->enabled))
+ return -EINVAL;
+
+ if (klp_transition_patch)
+ return -EBUSY;
+
+ klp_init_transition(patch, KLP_TRANSITION_UNPATCHED);
+
+ klp_for_each_object(patch, obj)
+ if (obj->patched)
+ klp_pre_unpatch_callback(obj);
+
+ /*
+ * Enforce the order of the func->transition writes in
+ * klp_init_transition() and the TIF_PATCH_PENDING writes in
+ * klp_start_transition(). In the rare case where klp_ftrace_handler()
+ * is called shortly after klp_update_patch_state() switches the task,
+ * this ensures the handler sees that func->transition is set.
+ */
+ smp_wmb();
+
+ klp_start_transition();
+ patch->enabled = false;
+ klp_try_complete_transition();
+
+ return 0;
}
-/**
- * klp_unregister_patch() - unregisters a patch
- * @patch: Disabled patch to be unregistered
- *
- * Frees the data structures and removes the sysfs interface.
- *
- * Return: 0 on success, otherwise error
- */
-int klp_unregister_patch(struct klp_patch *patch)
+static int __klp_enable_patch(struct klp_patch *patch)
{
- int ret = 0;
+ struct klp_object *obj;
+ int ret;
- mutex_lock(&klp_mutex);
+ if (klp_transition_patch)
+ return -EBUSY;
- if (!klp_is_patch_registered(patch)) {
- ret = -EINVAL;
- goto out;
- }
+ if (WARN_ON(patch->enabled))
+ return -EINVAL;
- if (patch->state == KLP_ENABLED) {
- ret = -EBUSY;
- goto out;
+ pr_notice("enabling patch '%s'\n", patch->mod->name);
+
+ klp_init_transition(patch, KLP_TRANSITION_PATCHED);
+
+ /*
+ * Enforce the order of the func->transition writes in
+ * klp_init_transition() and the ops->func_stack writes in
+ * klp_patch_object(), so that klp_ftrace_handler() will see the
+ * func->transition updates before the handler is registered and the
+ * new funcs become visible to the handler.
+ */
+ smp_wmb();
+
+ klp_for_each_object(patch, obj) {
+ if (!klp_is_object_loaded(obj))
+ continue;
+
+ ret = klp_pre_patch_callback(obj);
+ if (ret) {
+ pr_warn("pre-patch callback failed for object '%s'\n",
+ klp_is_module(obj) ? obj->name : "vmlinux");
+ goto err;
+ }
+
+ ret = klp_patch_object(obj);
+ if (ret) {
+ pr_warn("failed to patch object '%s'\n",
+ klp_is_module(obj) ? obj->name : "vmlinux");
+ goto err;
+ }
}
- klp_free_patch(patch);
+ klp_start_transition();
+ patch->enabled = true;
+ klp_try_complete_transition();
-out:
- mutex_unlock(&klp_mutex);
+ return 0;
+err:
+ pr_warn("failed to enable patch '%s'\n", patch->mod->name);
+
+ klp_cancel_transition();
return ret;
}
-EXPORT_SYMBOL_GPL(klp_unregister_patch);
/**
- * klp_register_patch() - registers a patch
- * @patch: Patch to be registered
+ * klp_enable_patch() - enable the livepatch
+ * @patch: patch to be enabled
*
- * Initializes the data structure associated with the patch and
- * creates the sysfs interface.
+ * Initializes the data structure associated with the patch, creates the sysfs
+ * interface, performs the needed symbol lookups and code relocations,
+ * registers the patched functions with ftrace.
+ *
+ * This function is supposed to be called from the livepatch module_init()
+ * callback.
*
* Return: 0 on success, otherwise error
*/
-int klp_register_patch(struct klp_patch *patch)
+int klp_enable_patch(struct klp_patch *patch)
{
int ret;
+ struct klp_object *obj;
+
+ if (!patch || !patch->mod || !patch->objs)
+ return -EINVAL;
+
+ klp_for_each_object_static(patch, obj) {
+ if (!obj->funcs)
+ return -EINVAL;
+ }
+
+
+ if (!is_livepatch_module(patch->mod)) {
+ pr_err("module %s is not marked as a livepatch module\n",
+ patch->mod->name);
+ return -EINVAL;
+ }
if (!klp_initialized())
return -ENODEV;
- if (!patch || !patch->mod)
+ if (!klp_have_reliable_stack()) {
+ pr_warn("This architecture doesn't have support for the livepatch consistency model.\n");
+ pr_warn("The livepatch transition may never complete.\n");
+ }
+
+ mutex_lock(&klp_mutex);
+
+ if (!klp_is_patch_compatible(patch)) {
+ pr_err("Livepatch patch (%s) is not compatible with the already installed livepatches.\n",
+ patch->mod->name);
+ mutex_unlock(&klp_mutex);
return -EINVAL;
+ }
- /*
- * A reference is taken on the patch module to prevent it from being
- * unloaded. Right now, we don't allow patch modules to unload since
- * there is currently no method to determine if a thread is still
- * running in the patched code contained in the patch module once
- * the ftrace registration is successful.
- */
- if (!try_module_get(patch->mod))
+ if (!try_module_get(patch->mod)) {
+ mutex_unlock(&klp_mutex);
return -ENODEV;
+ }
+
+ klp_init_patch_early(patch);
ret = klp_init_patch(patch);
if (ret)
- module_put(patch->mod);
+ goto err;
+
+ ret = __klp_enable_patch(patch);
+ if (ret)
+ goto err;
+
+ mutex_unlock(&klp_mutex);
+
+ return 0;
+
+err:
+ klp_free_patch_start(patch);
+
+ mutex_unlock(&klp_mutex);
+
+ klp_free_patch_finish(patch);
return ret;
}
-EXPORT_SYMBOL_GPL(klp_register_patch);
+EXPORT_SYMBOL_GPL(klp_enable_patch);
-static void klp_module_notify_coming(struct klp_patch *patch,
- struct klp_object *obj)
+/*
+ * This function unpatches objects from the replaced livepatches.
+ *
+ * We could be pretty aggressive here. It is called in the situation where
+ * these structures are no longer accessed from the ftrace handler.
+ * All functions are redirected by the klp_transition_patch. They
+ * use either a new code or they are in the original code because
+ * of the special nop function patches.
+ *
+ * The only exception is when the transition was forced. In this case,
+ * klp_ftrace_handler() might still see the replaced patch on the stack.
+ * Fortunately, it is carefully designed to work with removed functions
+ * thanks to RCU. We only have to keep the patches on the system. Also
+ * this is handled transparently by patch->module_put.
+ */
+void klp_unpatch_replaced_patches(struct klp_patch *new_patch)
{
- struct module *pmod = patch->mod;
- struct module *mod = obj->mod;
- int ret;
+ struct klp_patch *old_patch;
- ret = klp_init_object_loaded(patch, obj);
- if (ret)
- goto err;
+ klp_for_each_patch(old_patch) {
+ if (old_patch == new_patch)
+ return;
- if (patch->state == KLP_DISABLED)
- return;
-
- pr_notice("applying patch '%s' to loading module '%s'\n",
- pmod->name, mod->name);
-
- ret = klp_enable_object(obj);
- if (!ret)
- return;
+ old_patch->enabled = false;
+ klp_unpatch_objects(old_patch);
+ }
+}
-err:
- pr_warn("failed to apply patch '%s' to module '%s' (%d)\n",
- pmod->name, mod->name, ret);
+/*
+ * This function removes the dynamically allocated 'nop' functions.
+ *
+ * We could be pretty aggressive. NOPs do not change the existing
+ * behavior except for adding unnecessary delay by the ftrace handler.
+ *
+ * It is safe even when the transition was forced. The ftrace handler
+ * will see a valid ops->func_stack entry thanks to RCU.
+ *
+ * We could even free the NOPs structures. They must be the last entry
+ * in ops->func_stack. Therefore unregister_ftrace_function() is called.
+ * It does the same as klp_synchronize_transition() to make sure that
+ * nobody is inside the ftrace handler once the operation finishes.
+ *
+ * IMPORTANT: It must be called right after removing the replaced patches!
+ */
+void klp_discard_nops(struct klp_patch *new_patch)
+{
+ klp_unpatch_objects_dynamic(klp_transition_patch);
+ klp_free_objects_dynamic(klp_transition_patch);
}
-static void klp_module_notify_going(struct klp_patch *patch,
- struct klp_object *obj)
+/*
+ * Remove parts of patches that touch a given kernel module. The list of
+ * patches processed might be limited. When limit is NULL, all patches
+ * will be handled.
+ */
+static void klp_cleanup_module_patches_limited(struct module *mod,
+ struct klp_patch *limit)
{
- struct module *pmod = patch->mod;
- struct module *mod = obj->mod;
+ struct klp_patch *patch;
+ struct klp_object *obj;
- if (patch->state == KLP_DISABLED)
- goto disabled;
+ klp_for_each_patch(patch) {
+ if (patch == limit)
+ break;
- pr_notice("reverting patch '%s' on unloading module '%s'\n",
- pmod->name, mod->name);
+ klp_for_each_object(patch, obj) {
+ if (!klp_is_module(obj) || strcmp(obj->name, mod->name))
+ continue;
- klp_disable_object(obj);
+ if (patch != klp_transition_patch)
+ klp_pre_unpatch_callback(obj);
-disabled:
- klp_free_object_loaded(obj);
+ pr_notice("reverting patch '%s' on unloading module '%s'\n",
+ patch->mod->name, obj->mod->name);
+ klp_unpatch_object(obj);
+
+ klp_post_unpatch_callback(obj);
+ klp_clear_object_relocs(patch, obj);
+ klp_free_object_loaded(obj);
+ break;
+ }
+ }
}
-static int klp_module_notify(struct notifier_block *nb, unsigned long action,
- void *data)
+int klp_module_coming(struct module *mod)
{
- struct module *mod = data;
+ int ret;
struct klp_patch *patch;
struct klp_object *obj;
- if (action != MODULE_STATE_COMING && action != MODULE_STATE_GOING)
- return 0;
+ if (WARN_ON(mod->state != MODULE_STATE_COMING))
+ return -EINVAL;
- mutex_lock(&klp_mutex);
+ if (!strcmp(mod->name, "vmlinux")) {
+ pr_err("vmlinux.ko: invalid module name\n");
+ return -EINVAL;
+ }
+ mutex_lock(&klp_mutex);
/*
- * Each module has to know that the notifier has been called.
- * We never know what module will get patched by a new patch.
+ * Each module has to know that klp_module_coming()
+ * has been called. We never know what module will
+ * get patched by a new patch.
*/
- if (action == MODULE_STATE_COMING)
- mod->klp_alive = true;
- else /* MODULE_STATE_GOING */
- mod->klp_alive = false;
+ mod->klp_alive = true;
- list_for_each_entry(patch, &klp_patches, list) {
- for (obj = patch->objs; obj->funcs; obj++) {
+ klp_for_each_patch(patch) {
+ klp_for_each_object(patch, obj) {
if (!klp_is_module(obj) || strcmp(obj->name, mod->name))
continue;
- if (action == MODULE_STATE_COMING) {
- obj->mod = mod;
- klp_module_notify_coming(patch, obj);
- } else /* MODULE_STATE_GOING */
- klp_module_notify_going(patch, obj);
+ obj->mod = mod;
+
+ ret = klp_init_object_loaded(patch, obj);
+ if (ret) {
+ pr_warn("failed to initialize patch '%s' for module '%s' (%d)\n",
+ patch->mod->name, obj->mod->name, ret);
+ goto err;
+ }
+
+ pr_notice("applying patch '%s' to loading module '%s'\n",
+ patch->mod->name, obj->mod->name);
+
+ ret = klp_pre_patch_callback(obj);
+ if (ret) {
+ pr_warn("pre-patch callback failed for object '%s'\n",
+ obj->name);
+ goto err;
+ }
+
+ ret = klp_patch_object(obj);
+ if (ret) {
+ pr_warn("failed to apply patch '%s' to module '%s' (%d)\n",
+ patch->mod->name, obj->mod->name, ret);
+
+ klp_post_unpatch_callback(obj);
+ goto err;
+ }
+
+ if (patch != klp_transition_patch)
+ klp_post_patch_callback(obj);
break;
}
@@ -966,38 +1321,48 @@ static int klp_module_notify(struct notifier_block *nb, unsigned long action,
mutex_unlock(&klp_mutex);
return 0;
-}
-static struct notifier_block klp_module_nb = {
- .notifier_call = klp_module_notify,
- .priority = INT_MIN+1, /* called late but before ftrace notifier */
-};
+err:
+ /*
+ * If a patch is unsuccessfully applied, return
+ * error to the module loader.
+ */
+ pr_warn("patch '%s' failed for module '%s', refusing to load module '%s'\n",
+ patch->mod->name, obj->mod->name, obj->mod->name);
+ mod->klp_alive = false;
+ obj->mod = NULL;
+ klp_cleanup_module_patches_limited(mod, patch);
+ mutex_unlock(&klp_mutex);
+
+ return ret;
+}
-static int klp_init(void)
+void klp_module_going(struct module *mod)
{
- int ret;
+ if (WARN_ON(mod->state != MODULE_STATE_GOING &&
+ mod->state != MODULE_STATE_COMING))
+ return;
- ret = klp_check_compiler_support();
- if (ret) {
- pr_info("Your compiler is too old; turning off.\n");
- return -EINVAL;
- }
+ mutex_lock(&klp_mutex);
+ /*
+ * Each module has to know that klp_module_going()
+ * has been called. We never know what module will
+ * get patched by a new patch.
+ */
+ mod->klp_alive = false;
- ret = register_module_notifier(&klp_module_nb);
- if (ret)
- return ret;
+ klp_cleanup_module_patches_limited(mod, NULL);
+ mutex_unlock(&klp_mutex);
+}
+
+static int __init klp_init(void)
+{
klp_root_kobj = kobject_create_and_add("livepatch", kernel_kobj);
- if (!klp_root_kobj) {
- ret = -ENOMEM;
- goto unregister;
- }
+ if (!klp_root_kobj)
+ return -ENOMEM;
return 0;
-
-unregister:
- unregister_module_notifier(&klp_module_nb);
- return ret;
}
module_init(klp_init);
diff --git a/kernel/livepatch/core.h b/kernel/livepatch/core.h
new file mode 100644
index 000000000000..38209c7361b6
--- /dev/null
+++ b/kernel/livepatch/core.h
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LIVEPATCH_CORE_H
+#define _LIVEPATCH_CORE_H
+
+#include <linux/livepatch.h>
+
+extern struct mutex klp_mutex;
+extern struct list_head klp_patches;
+
+#define klp_for_each_patch_safe(patch, tmp_patch) \
+ list_for_each_entry_safe(patch, tmp_patch, &klp_patches, list)
+
+#define klp_for_each_patch(patch) \
+ list_for_each_entry(patch, &klp_patches, list)
+
+void klp_free_patch_async(struct klp_patch *patch);
+void klp_free_replaced_patches_async(struct klp_patch *new_patch);
+void klp_unpatch_replaced_patches(struct klp_patch *new_patch);
+void klp_discard_nops(struct klp_patch *new_patch);
+
+static inline bool klp_is_object_loaded(struct klp_object *obj)
+{
+ return !obj->name || obj->mod;
+}
+
+static inline int klp_pre_patch_callback(struct klp_object *obj)
+{
+ int ret = 0;
+
+ if (obj->callbacks.pre_patch)
+ ret = (*obj->callbacks.pre_patch)(obj);
+
+ obj->callbacks.post_unpatch_enabled = !ret;
+
+ return ret;
+}
+
+static inline void klp_post_patch_callback(struct klp_object *obj)
+{
+ if (obj->callbacks.post_patch)
+ (*obj->callbacks.post_patch)(obj);
+}
+
+static inline void klp_pre_unpatch_callback(struct klp_object *obj)
+{
+ if (obj->callbacks.pre_unpatch)
+ (*obj->callbacks.pre_unpatch)(obj);
+}
+
+static inline void klp_post_unpatch_callback(struct klp_object *obj)
+{
+ if (obj->callbacks.post_unpatch_enabled &&
+ obj->callbacks.post_unpatch)
+ (*obj->callbacks.post_unpatch)(obj);
+
+ obj->callbacks.post_unpatch_enabled = false;
+}
+
+#endif /* _LIVEPATCH_CORE_H */
diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c
new file mode 100644
index 000000000000..90408500e5a3
--- /dev/null
+++ b/kernel/livepatch/patch.c
@@ -0,0 +1,289 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * patch.c - livepatch patching functions
+ *
+ * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com>
+ * Copyright (C) 2014 SUSE
+ * Copyright (C) 2015 Josh Poimboeuf <jpoimboe@redhat.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/livepatch.h>
+#include <linux/list.h>
+#include <linux/ftrace.h>
+#include <linux/rculist.h>
+#include <linux/slab.h>
+#include <linux/bug.h>
+#include <linux/printk.h>
+#include "core.h"
+#include "patch.h"
+#include "transition.h"
+
+static LIST_HEAD(klp_ops);
+
+struct klp_ops *klp_find_ops(void *old_func)
+{
+ struct klp_ops *ops;
+ struct klp_func *func;
+
+ list_for_each_entry(ops, &klp_ops, node) {
+ func = list_first_entry(&ops->func_stack, struct klp_func,
+ stack_node);
+ if (func->old_func == old_func)
+ return ops;
+ }
+
+ return NULL;
+}
+
+static void notrace klp_ftrace_handler(unsigned long ip,
+ unsigned long parent_ip,
+ struct ftrace_ops *fops,
+ struct ftrace_regs *fregs)
+{
+ struct klp_ops *ops;
+ struct klp_func *func;
+ int patch_state;
+ int bit;
+
+ ops = container_of(fops, struct klp_ops, fops);
+
+ /*
+ * The ftrace_test_recursion_trylock() will disable preemption,
+ * which is required for the variant of synchronize_rcu() that is
+ * used to allow patching functions where RCU is not watching.
+ * See klp_synchronize_transition() for more details.
+ */
+ bit = ftrace_test_recursion_trylock(ip, parent_ip);
+ if (WARN_ON_ONCE(bit < 0))
+ return;
+
+ func = list_first_or_null_rcu(&ops->func_stack, struct klp_func,
+ stack_node);
+
+ /*
+ * func should never be NULL because preemption should be disabled here
+ * and unregister_ftrace_function() does the equivalent of a
+ * synchronize_rcu() before the func_stack removal.
+ */
+ if (WARN_ON_ONCE(!func))
+ goto unlock;
+
+ /*
+ * In the enable path, enforce the order of the ops->func_stack and
+ * func->transition reads. The corresponding write barrier is in
+ * __klp_enable_patch().
+ *
+ * (Note that this barrier technically isn't needed in the disable
+ * path. In the rare case where klp_update_patch_state() runs before
+ * this handler, its TIF_PATCH_PENDING read and this func->transition
+ * read need to be ordered. But klp_update_patch_state() already
+ * enforces that.)
+ */
+ smp_rmb();
+
+ if (unlikely(func->transition)) {
+
+ /*
+ * Enforce the order of the func->transition and
+ * current->patch_state reads. Otherwise we could read an
+ * out-of-date task state and pick the wrong function. The
+ * corresponding write barrier is in klp_init_transition().
+ */
+ smp_rmb();
+
+ patch_state = current->patch_state;
+
+ WARN_ON_ONCE(patch_state == KLP_TRANSITION_IDLE);
+
+ if (patch_state == KLP_TRANSITION_UNPATCHED) {
+ /*
+ * Use the previously patched version of the function.
+ * If no previous patches exist, continue with the
+ * original function.
+ */
+ func = list_entry_rcu(func->stack_node.next,
+ struct klp_func, stack_node);
+
+ if (&func->stack_node == &ops->func_stack)
+ goto unlock;
+ }
+ }
+
+ /*
+ * NOPs are used to replace existing patches with original code.
+ * Do nothing! Setting pc would cause an infinite loop.
+ */
+ if (func->nop)
+ goto unlock;
+
+ ftrace_regs_set_instruction_pointer(fregs, (unsigned long)func->new_func);
+
+unlock:
+ ftrace_test_recursion_unlock(bit);
+}
+
+static void klp_unpatch_func(struct klp_func *func)
+{
+ struct klp_ops *ops;
+
+ if (WARN_ON(!func->patched))
+ return;
+ if (WARN_ON(!func->old_func))
+ return;
+
+ ops = klp_find_ops(func->old_func);
+ if (WARN_ON(!ops))
+ return;
+
+ if (list_is_singular(&ops->func_stack)) {
+ unsigned long ftrace_loc;
+
+ ftrace_loc = ftrace_location((unsigned long)func->old_func);
+ if (WARN_ON(!ftrace_loc))
+ return;
+
+ WARN_ON(unregister_ftrace_function(&ops->fops));
+ WARN_ON(ftrace_set_filter_ip(&ops->fops, ftrace_loc, 1, 0));
+
+ list_del_rcu(&func->stack_node);
+ list_del(&ops->node);
+ kfree(ops);
+ } else {
+ list_del_rcu(&func->stack_node);
+ }
+
+ func->patched = false;
+}
+
+static int klp_patch_func(struct klp_func *func)
+{
+ struct klp_ops *ops;
+ int ret;
+
+ if (WARN_ON(!func->old_func))
+ return -EINVAL;
+
+ if (WARN_ON(func->patched))
+ return -EINVAL;
+
+ ops = klp_find_ops(func->old_func);
+ if (!ops) {
+ unsigned long ftrace_loc;
+
+ ftrace_loc = ftrace_location((unsigned long)func->old_func);
+ if (!ftrace_loc) {
+ pr_err("failed to find location for function '%s'\n",
+ func->old_name);
+ return -EINVAL;
+ }
+
+ ops = kzalloc(sizeof(*ops), GFP_KERNEL);
+ if (!ops)
+ return -ENOMEM;
+
+ ops->fops.func = klp_ftrace_handler;
+ ops->fops.flags = FTRACE_OPS_FL_DYNAMIC |
+#ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS
+ FTRACE_OPS_FL_SAVE_REGS |
+#endif
+ FTRACE_OPS_FL_IPMODIFY |
+ FTRACE_OPS_FL_PERMANENT;
+
+ list_add(&ops->node, &klp_ops);
+
+ INIT_LIST_HEAD(&ops->func_stack);
+ list_add_rcu(&func->stack_node, &ops->func_stack);
+
+ ret = ftrace_set_filter_ip(&ops->fops, ftrace_loc, 0, 0);
+ if (ret) {
+ pr_err("failed to set ftrace filter for function '%s' (%d)\n",
+ func->old_name, ret);
+ goto err;
+ }
+
+ ret = register_ftrace_function(&ops->fops);
+ if (ret) {
+ pr_err("failed to register ftrace handler for function '%s' (%d)\n",
+ func->old_name, ret);
+ ftrace_set_filter_ip(&ops->fops, ftrace_loc, 1, 0);
+ goto err;
+ }
+
+
+ } else {
+ list_add_rcu(&func->stack_node, &ops->func_stack);
+ }
+
+ func->patched = true;
+
+ return 0;
+
+err:
+ list_del_rcu(&func->stack_node);
+ list_del(&ops->node);
+ kfree(ops);
+ return ret;
+}
+
+static void __klp_unpatch_object(struct klp_object *obj, bool nops_only)
+{
+ struct klp_func *func;
+
+ klp_for_each_func(obj, func) {
+ if (nops_only && !func->nop)
+ continue;
+
+ if (func->patched)
+ klp_unpatch_func(func);
+ }
+
+ if (obj->dynamic || !nops_only)
+ obj->patched = false;
+}
+
+
+void klp_unpatch_object(struct klp_object *obj)
+{
+ __klp_unpatch_object(obj, false);
+}
+
+int klp_patch_object(struct klp_object *obj)
+{
+ struct klp_func *func;
+ int ret;
+
+ if (WARN_ON(obj->patched))
+ return -EINVAL;
+
+ klp_for_each_func(obj, func) {
+ ret = klp_patch_func(func);
+ if (ret) {
+ klp_unpatch_object(obj);
+ return ret;
+ }
+ }
+ obj->patched = true;
+
+ return 0;
+}
+
+static void __klp_unpatch_objects(struct klp_patch *patch, bool nops_only)
+{
+ struct klp_object *obj;
+
+ klp_for_each_object(patch, obj)
+ if (obj->patched)
+ __klp_unpatch_object(obj, nops_only);
+}
+
+void klp_unpatch_objects(struct klp_patch *patch)
+{
+ __klp_unpatch_objects(patch, false);
+}
+
+void klp_unpatch_objects_dynamic(struct klp_patch *patch)
+{
+ __klp_unpatch_objects(patch, true);
+}
diff --git a/kernel/livepatch/patch.h b/kernel/livepatch/patch.h
new file mode 100644
index 000000000000..d5f2fbe373e0
--- /dev/null
+++ b/kernel/livepatch/patch.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LIVEPATCH_PATCH_H
+#define _LIVEPATCH_PATCH_H
+
+#include <linux/livepatch.h>
+#include <linux/list.h>
+#include <linux/ftrace.h>
+
+/**
+ * struct klp_ops - structure for tracking registered ftrace ops structs
+ *
+ * A single ftrace_ops is shared between all enabled replacement functions
+ * (klp_func structs) which have the same old_func. This allows the switch
+ * between function versions to happen instantaneously by updating the klp_ops
+ * struct's func_stack list. The winner is the klp_func at the top of the
+ * func_stack (front of the list).
+ *
+ * @node: node for the global klp_ops list
+ * @func_stack: list head for the stack of klp_func's (active func is on top)
+ * @fops: registered ftrace ops struct
+ */
+struct klp_ops {
+ struct list_head node;
+ struct list_head func_stack;
+ struct ftrace_ops fops;
+};
+
+struct klp_ops *klp_find_ops(void *old_func);
+
+int klp_patch_object(struct klp_object *obj);
+void klp_unpatch_object(struct klp_object *obj);
+void klp_unpatch_objects(struct klp_patch *patch);
+void klp_unpatch_objects_dynamic(struct klp_patch *patch);
+
+#endif /* _LIVEPATCH_PATCH_H */
diff --git a/kernel/livepatch/shadow.c b/kernel/livepatch/shadow.c
new file mode 100644
index 000000000000..c2e724d97ddf
--- /dev/null
+++ b/kernel/livepatch/shadow.c
@@ -0,0 +1,299 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * shadow.c - Shadow Variables
+ *
+ * Copyright (C) 2014 Josh Poimboeuf <jpoimboe@redhat.com>
+ * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com>
+ * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com>
+ */
+
+/**
+ * DOC: Shadow variable API concurrency notes:
+ *
+ * The shadow variable API provides a simple relationship between an
+ * <obj, id> pair and a pointer value. It is the responsibility of the
+ * caller to provide any mutual exclusion required of the shadow data.
+ *
+ * Once a shadow variable is attached to its parent object via the
+ * klp_shadow_*alloc() API calls, it is considered live: any subsequent
+ * call to klp_shadow_get() may then return the shadow variable's data
+ * pointer. Callers of klp_shadow_*alloc() should prepare shadow data
+ * accordingly.
+ *
+ * The klp_shadow_*alloc() API calls may allocate memory for new shadow
+ * variable structures. Their implementation does not call kmalloc
+ * inside any spinlocks, but API callers should pass GFP flags according
+ * to their specific needs.
+ *
+ * The klp_shadow_hash is an RCU-enabled hashtable and is safe against
+ * concurrent klp_shadow_free() and klp_shadow_get() operations.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/hashtable.h>
+#include <linux/slab.h>
+#include <linux/livepatch.h>
+
+static DEFINE_HASHTABLE(klp_shadow_hash, 12);
+
+/*
+ * klp_shadow_lock provides exclusive access to the klp_shadow_hash and
+ * the shadow variables it references.
+ */
+static DEFINE_SPINLOCK(klp_shadow_lock);
+
+/**
+ * struct klp_shadow - shadow variable structure
+ * @node: klp_shadow_hash hash table node
+ * @rcu_head: RCU is used to safely free this structure
+ * @obj: pointer to parent object
+ * @id: data identifier
+ * @data: data area
+ */
+struct klp_shadow {
+ struct hlist_node node;
+ struct rcu_head rcu_head;
+ void *obj;
+ unsigned long id;
+ char data[];
+};
+
+/**
+ * klp_shadow_match() - verify a shadow variable matches given <obj, id>
+ * @shadow: shadow variable to match
+ * @obj: pointer to parent object
+ * @id: data identifier
+ *
+ * Return: true if the shadow variable matches.
+ */
+static inline bool klp_shadow_match(struct klp_shadow *shadow, void *obj,
+ unsigned long id)
+{
+ return shadow->obj == obj && shadow->id == id;
+}
+
+/**
+ * klp_shadow_get() - retrieve a shadow variable data pointer
+ * @obj: pointer to parent object
+ * @id: data identifier
+ *
+ * Return: the shadow variable data element, NULL on failure.
+ */
+void *klp_shadow_get(void *obj, unsigned long id)
+{
+ struct klp_shadow *shadow;
+
+ rcu_read_lock();
+
+ hash_for_each_possible_rcu(klp_shadow_hash, shadow, node,
+ (unsigned long)obj) {
+
+ if (klp_shadow_match(shadow, obj, id)) {
+ rcu_read_unlock();
+ return shadow->data;
+ }
+ }
+
+ rcu_read_unlock();
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(klp_shadow_get);
+
+static void *__klp_shadow_get_or_alloc(void *obj, unsigned long id,
+ size_t size, gfp_t gfp_flags,
+ klp_shadow_ctor_t ctor, void *ctor_data,
+ bool warn_on_exist)
+{
+ struct klp_shadow *new_shadow;
+ void *shadow_data;
+ unsigned long flags;
+
+ /* Check if the shadow variable already exists */
+ shadow_data = klp_shadow_get(obj, id);
+ if (shadow_data)
+ goto exists;
+
+ /*
+ * Allocate a new shadow variable. Fill it with zeroes by default.
+ * More complex setting can be done by @ctor function. But it is
+ * called only when the buffer is really used (under klp_shadow_lock).
+ */
+ new_shadow = kzalloc(size + sizeof(*new_shadow), gfp_flags);
+ if (!new_shadow)
+ return NULL;
+
+ /* Look for <obj, id> again under the lock */
+ spin_lock_irqsave(&klp_shadow_lock, flags);
+ shadow_data = klp_shadow_get(obj, id);
+ if (unlikely(shadow_data)) {
+ /*
+ * Shadow variable was found, throw away speculative
+ * allocation.
+ */
+ spin_unlock_irqrestore(&klp_shadow_lock, flags);
+ kfree(new_shadow);
+ goto exists;
+ }
+
+ new_shadow->obj = obj;
+ new_shadow->id = id;
+
+ if (ctor) {
+ int err;
+
+ err = ctor(obj, new_shadow->data, ctor_data);
+ if (err) {
+ spin_unlock_irqrestore(&klp_shadow_lock, flags);
+ kfree(new_shadow);
+ pr_err("Failed to construct shadow variable <%p, %lx> (%d)\n",
+ obj, id, err);
+ return NULL;
+ }
+ }
+
+ /* No <obj, id> found, so attach the newly allocated one */
+ hash_add_rcu(klp_shadow_hash, &new_shadow->node,
+ (unsigned long)new_shadow->obj);
+ spin_unlock_irqrestore(&klp_shadow_lock, flags);
+
+ return new_shadow->data;
+
+exists:
+ if (warn_on_exist) {
+ WARN(1, "Duplicate shadow variable <%p, %lx>\n", obj, id);
+ return NULL;
+ }
+
+ return shadow_data;
+}
+
+/**
+ * klp_shadow_alloc() - allocate and add a new shadow variable
+ * @obj: pointer to parent object
+ * @id: data identifier
+ * @size: size of attached data
+ * @gfp_flags: GFP mask for allocation
+ * @ctor: custom constructor to initialize the shadow data (optional)
+ * @ctor_data: pointer to any data needed by @ctor (optional)
+ *
+ * Allocates @size bytes for new shadow variable data using @gfp_flags.
+ * The data are zeroed by default. They are further initialized by @ctor
+ * function if it is not NULL. The new shadow variable is then added
+ * to the global hashtable.
+ *
+ * If an existing <obj, id> shadow variable can be found, this routine will
+ * issue a WARN, exit early and return NULL.
+ *
+ * This function guarantees that the constructor function is called only when
+ * the variable did not exist before. The cost is that @ctor is called
+ * in atomic context under a spin lock.
+ *
+ * Return: the shadow variable data element, NULL on duplicate or
+ * failure.
+ */
+void *klp_shadow_alloc(void *obj, unsigned long id,
+ size_t size, gfp_t gfp_flags,
+ klp_shadow_ctor_t ctor, void *ctor_data)
+{
+ return __klp_shadow_get_or_alloc(obj, id, size, gfp_flags,
+ ctor, ctor_data, true);
+}
+EXPORT_SYMBOL_GPL(klp_shadow_alloc);
+
+/**
+ * klp_shadow_get_or_alloc() - get existing or allocate a new shadow variable
+ * @obj: pointer to parent object
+ * @id: data identifier
+ * @size: size of attached data
+ * @gfp_flags: GFP mask for allocation
+ * @ctor: custom constructor to initialize the shadow data (optional)
+ * @ctor_data: pointer to any data needed by @ctor (optional)
+ *
+ * Returns a pointer to existing shadow data if an <obj, id> shadow
+ * variable is already present. Otherwise, it creates a new shadow
+ * variable like klp_shadow_alloc().
+ *
+ * This function guarantees that only one shadow variable exists with the given
+ * @id for the given @obj. It also guarantees that the constructor function
+ * will be called only when the variable did not exist before. The cost is
+ * that @ctor is called in atomic context under a spin lock.
+ *
+ * Return: the shadow variable data element, NULL on failure.
+ */
+void *klp_shadow_get_or_alloc(void *obj, unsigned long id,
+ size_t size, gfp_t gfp_flags,
+ klp_shadow_ctor_t ctor, void *ctor_data)
+{
+ return __klp_shadow_get_or_alloc(obj, id, size, gfp_flags,
+ ctor, ctor_data, false);
+}
+EXPORT_SYMBOL_GPL(klp_shadow_get_or_alloc);
+
+static void klp_shadow_free_struct(struct klp_shadow *shadow,
+ klp_shadow_dtor_t dtor)
+{
+ hash_del_rcu(&shadow->node);
+ if (dtor)
+ dtor(shadow->obj, shadow->data);
+ kfree_rcu(shadow, rcu_head);
+}
+
+/**
+ * klp_shadow_free() - detach and free a <obj, id> shadow variable
+ * @obj: pointer to parent object
+ * @id: data identifier
+ * @dtor: custom callback that can be used to unregister the variable
+ * and/or free data that the shadow variable points to (optional)
+ *
+ * This function releases the memory for this <obj, id> shadow variable
+ * instance, callers should stop referencing it accordingly.
+ */
+void klp_shadow_free(void *obj, unsigned long id, klp_shadow_dtor_t dtor)
+{
+ struct klp_shadow *shadow;
+ unsigned long flags;
+
+ spin_lock_irqsave(&klp_shadow_lock, flags);
+
+ /* Delete <obj, id> from hash */
+ hash_for_each_possible(klp_shadow_hash, shadow, node,
+ (unsigned long)obj) {
+
+ if (klp_shadow_match(shadow, obj, id)) {
+ klp_shadow_free_struct(shadow, dtor);
+ break;
+ }
+ }
+
+ spin_unlock_irqrestore(&klp_shadow_lock, flags);
+}
+EXPORT_SYMBOL_GPL(klp_shadow_free);
+
+/**
+ * klp_shadow_free_all() - detach and free all <_, id> shadow variables
+ * @id: data identifier
+ * @dtor: custom callback that can be used to unregister the variable
+ * and/or free data that the shadow variable points to (optional)
+ *
+ * This function releases the memory for all <_, id> shadow variable
+ * instances, callers should stop referencing them accordingly.
+ */
+void klp_shadow_free_all(unsigned long id, klp_shadow_dtor_t dtor)
+{
+ struct klp_shadow *shadow;
+ unsigned long flags;
+ int i;
+
+ spin_lock_irqsave(&klp_shadow_lock, flags);
+
+ /* Delete all <_, id> from hash */
+ hash_for_each(klp_shadow_hash, i, shadow, node) {
+ if (klp_shadow_match(shadow, shadow->obj, id))
+ klp_shadow_free_struct(shadow, dtor);
+ }
+
+ spin_unlock_irqrestore(&klp_shadow_lock, flags);
+}
+EXPORT_SYMBOL_GPL(klp_shadow_free_all);
diff --git a/kernel/livepatch/state.c b/kernel/livepatch/state.c
new file mode 100644
index 000000000000..2565d039ade0
--- /dev/null
+++ b/kernel/livepatch/state.c
@@ -0,0 +1,119 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * system_state.c - State of the system modified by livepatches
+ *
+ * Copyright (C) 2019 SUSE
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/livepatch.h>
+#include "core.h"
+#include "state.h"
+#include "transition.h"
+
+#define klp_for_each_state(patch, state) \
+ for (state = patch->states; state && state->id; state++)
+
+/**
+ * klp_get_state() - get information about system state modified by
+ * the given patch
+ * @patch: livepatch that modifies the given system state
+ * @id: custom identifier of the modified system state
+ *
+ * Checks whether the given patch modifies the given system state.
+ *
+ * The function can be called either from pre/post (un)patch
+ * callbacks or from the kernel code added by the livepatch.
+ *
+ * Return: pointer to struct klp_state when found, otherwise NULL.
+ */
+struct klp_state *klp_get_state(struct klp_patch *patch, unsigned long id)
+{
+ struct klp_state *state;
+
+ klp_for_each_state(patch, state) {
+ if (state->id == id)
+ return state;
+ }
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(klp_get_state);
+
+/**
+ * klp_get_prev_state() - get information about system state modified by
+ * the already installed livepatches
+ * @id: custom identifier of the modified system state
+ *
+ * Checks whether already installed livepatches modify the given
+ * system state.
+ *
+ * The same system state can be modified by more non-cumulative
+ * livepatches. It is expected that the latest livepatch has
+ * the most up-to-date information.
+ *
+ * The function can be called only during transition when a new
+ * livepatch is being enabled or when such a transition is reverted.
+ * It is typically called only from pre/post (un)patch
+ * callbacks.
+ *
+ * Return: pointer to the latest struct klp_state from already
+ * installed livepatches, NULL when not found.
+ */
+struct klp_state *klp_get_prev_state(unsigned long id)
+{
+ struct klp_patch *patch;
+ struct klp_state *state, *last_state = NULL;
+
+ if (WARN_ON_ONCE(!klp_transition_patch))
+ return NULL;
+
+ klp_for_each_patch(patch) {
+ if (patch == klp_transition_patch)
+ goto out;
+
+ state = klp_get_state(patch, id);
+ if (state)
+ last_state = state;
+ }
+
+out:
+ return last_state;
+}
+EXPORT_SYMBOL_GPL(klp_get_prev_state);
+
+/* Check if the patch is able to deal with the existing system state. */
+static bool klp_is_state_compatible(struct klp_patch *patch,
+ struct klp_state *old_state)
+{
+ struct klp_state *state;
+
+ state = klp_get_state(patch, old_state->id);
+
+ /* A cumulative livepatch must handle all already modified states. */
+ if (!state)
+ return !patch->replace;
+
+ return state->version >= old_state->version;
+}
+
+/*
+ * Check that the new livepatch will not break the existing system states.
+ * Cumulative patches must handle all already modified states.
+ * Non-cumulative patches can touch already modified states.
+ */
+bool klp_is_patch_compatible(struct klp_patch *patch)
+{
+ struct klp_patch *old_patch;
+ struct klp_state *old_state;
+
+ klp_for_each_patch(old_patch) {
+ klp_for_each_state(old_patch, old_state) {
+ if (!klp_is_state_compatible(patch, old_state))
+ return false;
+ }
+ }
+
+ return true;
+}
diff --git a/kernel/livepatch/state.h b/kernel/livepatch/state.h
new file mode 100644
index 000000000000..49d9c16e8762
--- /dev/null
+++ b/kernel/livepatch/state.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LIVEPATCH_STATE_H
+#define _LIVEPATCH_STATE_H
+
+#include <linux/livepatch.h>
+
+bool klp_is_patch_compatible(struct klp_patch *patch);
+
+#endif /* _LIVEPATCH_STATE_H */
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
new file mode 100644
index 000000000000..2351a19ac2a9
--- /dev/null
+++ b/kernel/livepatch/transition.c
@@ -0,0 +1,731 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * transition.c - Kernel Live Patching transition functions
+ *
+ * Copyright (C) 2015-2016 Josh Poimboeuf <jpoimboe@redhat.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/cpu.h>
+#include <linux/stacktrace.h>
+#include <linux/static_call.h>
+#include "core.h"
+#include "patch.h"
+#include "transition.h"
+
+#define MAX_STACK_ENTRIES 100
+static DEFINE_PER_CPU(unsigned long[MAX_STACK_ENTRIES], klp_stack_entries);
+
+#define STACK_ERR_BUF_SIZE 128
+
+#define SIGNALS_TIMEOUT 15
+
+struct klp_patch *klp_transition_patch;
+
+static int klp_target_state = KLP_TRANSITION_IDLE;
+
+static unsigned int klp_signals_cnt;
+
+/*
+ * When a livepatch is in progress, enable klp stack checking in
+ * schedule(). This helps CPU-bound kthreads get patched.
+ */
+
+DEFINE_STATIC_KEY_FALSE(klp_sched_try_switch_key);
+
+#define klp_resched_enable() static_branch_enable(&klp_sched_try_switch_key)
+#define klp_resched_disable() static_branch_disable(&klp_sched_try_switch_key)
+
+/*
+ * This work can be performed periodically to finish patching or unpatching any
+ * "straggler" tasks which failed to transition in the first attempt.
+ */
+static void klp_transition_work_fn(struct work_struct *work)
+{
+ mutex_lock(&klp_mutex);
+
+ if (klp_transition_patch)
+ klp_try_complete_transition();
+
+ mutex_unlock(&klp_mutex);
+}
+static DECLARE_DELAYED_WORK(klp_transition_work, klp_transition_work_fn);
+
+/*
+ * This function is just a stub to implement a hard force
+ * of synchronize_rcu(). This requires synchronizing
+ * tasks even in userspace and idle.
+ */
+static void klp_sync(struct work_struct *work)
+{
+}
+
+/*
+ * We allow to patch also functions where RCU is not watching,
+ * e.g. before user_exit(). We can not rely on the RCU infrastructure
+ * to do the synchronization. Instead hard force the sched synchronization.
+ *
+ * This approach allows to use RCU functions for manipulating func_stack
+ * safely.
+ */
+static void klp_synchronize_transition(void)
+{
+ schedule_on_each_cpu(klp_sync);
+}
+
+/*
+ * The transition to the target patch state is complete. Clean up the data
+ * structures.
+ */
+static void klp_complete_transition(void)
+{
+ struct klp_object *obj;
+ struct klp_func *func;
+ struct task_struct *g, *task;
+ unsigned int cpu;
+
+ pr_debug("'%s': completing %s transition\n",
+ klp_transition_patch->mod->name,
+ klp_target_state == KLP_TRANSITION_PATCHED ? "patching" : "unpatching");
+
+ if (klp_transition_patch->replace && klp_target_state == KLP_TRANSITION_PATCHED) {
+ klp_unpatch_replaced_patches(klp_transition_patch);
+ klp_discard_nops(klp_transition_patch);
+ }
+
+ if (klp_target_state == KLP_TRANSITION_UNPATCHED) {
+ /*
+ * All tasks have transitioned to KLP_TRANSITION_UNPATCHED so we can now
+ * remove the new functions from the func_stack.
+ */
+ klp_unpatch_objects(klp_transition_patch);
+
+ /*
+ * Make sure klp_ftrace_handler() can no longer see functions
+ * from this patch on the ops->func_stack. Otherwise, after
+ * func->transition gets cleared, the handler may choose a
+ * removed function.
+ */
+ klp_synchronize_transition();
+ }
+
+ klp_for_each_object(klp_transition_patch, obj)
+ klp_for_each_func(obj, func)
+ func->transition = false;
+
+ /* Prevent klp_ftrace_handler() from seeing KLP_TRANSITION_IDLE state */
+ if (klp_target_state == KLP_TRANSITION_PATCHED)
+ klp_synchronize_transition();
+
+ read_lock(&tasklist_lock);
+ for_each_process_thread(g, task) {
+ WARN_ON_ONCE(test_tsk_thread_flag(task, TIF_PATCH_PENDING));
+ task->patch_state = KLP_TRANSITION_IDLE;
+ }
+ read_unlock(&tasklist_lock);
+
+ for_each_possible_cpu(cpu) {
+ task = idle_task(cpu);
+ WARN_ON_ONCE(test_tsk_thread_flag(task, TIF_PATCH_PENDING));
+ task->patch_state = KLP_TRANSITION_IDLE;
+ }
+
+ klp_for_each_object(klp_transition_patch, obj) {
+ if (!klp_is_object_loaded(obj))
+ continue;
+ if (klp_target_state == KLP_TRANSITION_PATCHED)
+ klp_post_patch_callback(obj);
+ else if (klp_target_state == KLP_TRANSITION_UNPATCHED)
+ klp_post_unpatch_callback(obj);
+ }
+
+ pr_notice("'%s': %s complete\n", klp_transition_patch->mod->name,
+ klp_target_state == KLP_TRANSITION_PATCHED ? "patching" : "unpatching");
+
+ klp_target_state = KLP_TRANSITION_IDLE;
+ klp_transition_patch = NULL;
+}
+
+/*
+ * This is called in the error path, to cancel a transition before it has
+ * started, i.e. klp_init_transition() has been called but
+ * klp_start_transition() hasn't. If the transition *has* been started,
+ * klp_reverse_transition() should be used instead.
+ */
+void klp_cancel_transition(void)
+{
+ if (WARN_ON_ONCE(klp_target_state != KLP_TRANSITION_PATCHED))
+ return;
+
+ pr_debug("'%s': canceling patching transition, going to unpatch\n",
+ klp_transition_patch->mod->name);
+
+ klp_target_state = KLP_TRANSITION_UNPATCHED;
+ klp_complete_transition();
+}
+
+/*
+ * Switch the patched state of the task to the set of functions in the target
+ * patch state.
+ *
+ * NOTE: If task is not 'current', the caller must ensure the task is inactive.
+ * Otherwise klp_ftrace_handler() might read the wrong 'patch_state' value.
+ */
+void klp_update_patch_state(struct task_struct *task)
+{
+ /*
+ * A variant of synchronize_rcu() is used to allow patching functions
+ * where RCU is not watching, see klp_synchronize_transition().
+ */
+ preempt_disable_notrace();
+
+ /*
+ * This test_and_clear_tsk_thread_flag() call also serves as a read
+ * barrier (smp_rmb) for two cases:
+ *
+ * 1) Enforce the order of the TIF_PATCH_PENDING read and the
+ * klp_target_state read. The corresponding write barriers are in
+ * klp_init_transition() and klp_reverse_transition().
+ *
+ * 2) Enforce the order of the TIF_PATCH_PENDING read and a future read
+ * of func->transition, if klp_ftrace_handler() is called later on
+ * the same CPU. See __klp_disable_patch().
+ */
+ if (test_and_clear_tsk_thread_flag(task, TIF_PATCH_PENDING))
+ task->patch_state = READ_ONCE(klp_target_state);
+
+ preempt_enable_notrace();
+}
+
+/*
+ * Determine whether the given stack trace includes any references to a
+ * to-be-patched or to-be-unpatched function.
+ */
+static int klp_check_stack_func(struct klp_func *func, unsigned long *entries,
+ unsigned int nr_entries)
+{
+ unsigned long func_addr, func_size, address;
+ struct klp_ops *ops;
+ int i;
+
+ if (klp_target_state == KLP_TRANSITION_UNPATCHED) {
+ /*
+ * Check for the to-be-unpatched function
+ * (the func itself).
+ */
+ func_addr = (unsigned long)func->new_func;
+ func_size = func->new_size;
+ } else {
+ /*
+ * Check for the to-be-patched function
+ * (the previous func).
+ */
+ ops = klp_find_ops(func->old_func);
+
+ if (list_is_singular(&ops->func_stack)) {
+ /* original function */
+ func_addr = (unsigned long)func->old_func;
+ func_size = func->old_size;
+ } else {
+ /* previously patched function */
+ struct klp_func *prev;
+
+ prev = list_next_entry(func, stack_node);
+ func_addr = (unsigned long)prev->new_func;
+ func_size = prev->new_size;
+ }
+ }
+
+ for (i = 0; i < nr_entries; i++) {
+ address = entries[i];
+
+ if (address >= func_addr && address < func_addr + func_size)
+ return -EAGAIN;
+ }
+
+ return 0;
+}
+
+/*
+ * Determine whether it's safe to transition the task to the target patch state
+ * by looking for any to-be-patched or to-be-unpatched functions on its stack.
+ */
+static int klp_check_stack(struct task_struct *task, const char **oldname)
+{
+ unsigned long *entries = this_cpu_ptr(klp_stack_entries);
+ struct klp_object *obj;
+ struct klp_func *func;
+ int ret, nr_entries;
+
+ /* Protect 'klp_stack_entries' */
+ lockdep_assert_preemption_disabled();
+
+ ret = stack_trace_save_tsk_reliable(task, entries, MAX_STACK_ENTRIES);
+ if (ret < 0)
+ return -EINVAL;
+ nr_entries = ret;
+
+ klp_for_each_object(klp_transition_patch, obj) {
+ if (!obj->patched)
+ continue;
+ klp_for_each_func(obj, func) {
+ ret = klp_check_stack_func(func, entries, nr_entries);
+ if (ret) {
+ *oldname = func->old_name;
+ return -EADDRINUSE;
+ }
+ }
+ }
+
+ return 0;
+}
+
+static int klp_check_and_switch_task(struct task_struct *task, void *arg)
+{
+ int ret;
+
+ if (task_curr(task) && task != current)
+ return -EBUSY;
+
+ ret = klp_check_stack(task, arg);
+ if (ret)
+ return ret;
+
+ clear_tsk_thread_flag(task, TIF_PATCH_PENDING);
+ task->patch_state = klp_target_state;
+ return 0;
+}
+
+/*
+ * Try to safely switch a task to the target patch state. If it's currently
+ * running, or it's sleeping on a to-be-patched or to-be-unpatched function, or
+ * if the stack is unreliable, return false.
+ */
+static bool klp_try_switch_task(struct task_struct *task)
+{
+ const char *old_name;
+ int ret;
+
+ /* check if this task has already switched over */
+ if (task->patch_state == klp_target_state)
+ return true;
+
+ /*
+ * For arches which don't have reliable stack traces, we have to rely
+ * on other methods (e.g., switching tasks at kernel exit).
+ */
+ if (!klp_have_reliable_stack())
+ return false;
+
+ /*
+ * Now try to check the stack for any to-be-patched or to-be-unpatched
+ * functions. If all goes well, switch the task to the target patch
+ * state.
+ */
+ if (task == current)
+ ret = klp_check_and_switch_task(current, &old_name);
+ else
+ ret = task_call_func(task, klp_check_and_switch_task, &old_name);
+
+ switch (ret) {
+ case 0: /* success */
+ break;
+
+ case -EBUSY: /* klp_check_and_switch_task() */
+ pr_debug("%s: %s:%d is running\n",
+ __func__, task->comm, task->pid);
+ break;
+ case -EINVAL: /* klp_check_and_switch_task() */
+ pr_debug("%s: %s:%d has an unreliable stack\n",
+ __func__, task->comm, task->pid);
+ break;
+ case -EADDRINUSE: /* klp_check_and_switch_task() */
+ pr_debug("%s: %s:%d is sleeping on function %s\n",
+ __func__, task->comm, task->pid, old_name);
+ break;
+
+ default:
+ pr_debug("%s: Unknown error code (%d) when trying to switch %s:%d\n",
+ __func__, ret, task->comm, task->pid);
+ break;
+ }
+
+ return !ret;
+}
+
+void __klp_sched_try_switch(void)
+{
+ /*
+ * This function is called from __schedule() while a context switch is
+ * about to happen. Preemption is already disabled and klp_mutex
+ * can't be acquired.
+ * Disabled preemption is used to prevent racing with other callers of
+ * klp_try_switch_task(). Thanks to task_call_func() they won't be
+ * able to switch to this task while it's running.
+ */
+ lockdep_assert_preemption_disabled();
+
+ if (likely(!klp_patch_pending(current)))
+ return;
+
+ /*
+ * Enforce the order of the TIF_PATCH_PENDING read above and the
+ * klp_target_state read in klp_try_switch_task(). The corresponding
+ * write barriers are in klp_init_transition() and
+ * klp_reverse_transition().
+ */
+ smp_rmb();
+
+ klp_try_switch_task(current);
+}
+
+/*
+ * Sends a fake signal to all non-kthread tasks with TIF_PATCH_PENDING set.
+ * Kthreads with TIF_PATCH_PENDING set are woken up.
+ */
+static void klp_send_signals(void)
+{
+ struct task_struct *g, *task;
+
+ if (klp_signals_cnt == SIGNALS_TIMEOUT)
+ pr_notice("signaling remaining tasks\n");
+
+ read_lock(&tasklist_lock);
+ for_each_process_thread(g, task) {
+ if (!klp_patch_pending(task))
+ continue;
+
+ /*
+ * There is a small race here. We could see TIF_PATCH_PENDING
+ * set and decide to wake up a kthread or send a fake signal.
+ * Meanwhile the task could migrate itself and the action
+ * would be meaningless. It is not serious though.
+ */
+ if (task->flags & PF_KTHREAD) {
+ /*
+ * Wake up a kthread which sleeps interruptedly and
+ * still has not been migrated.
+ */
+ wake_up_state(task, TASK_INTERRUPTIBLE);
+ } else {
+ /*
+ * Send fake signal to all non-kthread tasks which are
+ * still not migrated.
+ */
+ set_notify_signal(task);
+ }
+ }
+ read_unlock(&tasklist_lock);
+}
+
+/*
+ * Try to switch all remaining tasks to the target patch state by walking the
+ * stacks of sleeping tasks and looking for any to-be-patched or
+ * to-be-unpatched functions. If such functions are found, the task can't be
+ * switched yet.
+ *
+ * If any tasks are still stuck in the initial patch state, schedule a retry.
+ */
+void klp_try_complete_transition(void)
+{
+ unsigned int cpu;
+ struct task_struct *g, *task;
+ struct klp_patch *patch;
+ bool complete = true;
+
+ WARN_ON_ONCE(klp_target_state == KLP_TRANSITION_IDLE);
+
+ /*
+ * Try to switch the tasks to the target patch state by walking their
+ * stacks and looking for any to-be-patched or to-be-unpatched
+ * functions. If such functions are found on a stack, or if the stack
+ * is deemed unreliable, the task can't be switched yet.
+ *
+ * Usually this will transition most (or all) of the tasks on a system
+ * unless the patch includes changes to a very common function.
+ */
+ read_lock(&tasklist_lock);
+ for_each_process_thread(g, task)
+ if (!klp_try_switch_task(task))
+ complete = false;
+ read_unlock(&tasklist_lock);
+
+ /*
+ * Ditto for the idle "swapper" tasks.
+ */
+ cpus_read_lock();
+ for_each_possible_cpu(cpu) {
+ task = idle_task(cpu);
+ if (cpu_online(cpu)) {
+ if (!klp_try_switch_task(task)) {
+ complete = false;
+ /* Make idle task go through the main loop. */
+ wake_up_if_idle(cpu);
+ }
+ } else if (task->patch_state != klp_target_state) {
+ /* offline idle tasks can be switched immediately */
+ clear_tsk_thread_flag(task, TIF_PATCH_PENDING);
+ task->patch_state = klp_target_state;
+ }
+ }
+ cpus_read_unlock();
+
+ if (!complete) {
+ if (klp_signals_cnt && !(klp_signals_cnt % SIGNALS_TIMEOUT))
+ klp_send_signals();
+ klp_signals_cnt++;
+
+ /*
+ * Some tasks weren't able to be switched over. Try again
+ * later and/or wait for other methods like kernel exit
+ * switching.
+ */
+ schedule_delayed_work(&klp_transition_work,
+ round_jiffies_relative(HZ));
+ return;
+ }
+
+ /* Done! Now cleanup the data structures. */
+ klp_resched_disable();
+ patch = klp_transition_patch;
+ klp_complete_transition();
+
+ /*
+ * It would make more sense to free the unused patches in
+ * klp_complete_transition() but it is called also
+ * from klp_cancel_transition().
+ */
+ if (!patch->enabled)
+ klp_free_patch_async(patch);
+ else if (patch->replace)
+ klp_free_replaced_patches_async(patch);
+}
+
+/*
+ * Start the transition to the specified target patch state so tasks can begin
+ * switching to it.
+ */
+void klp_start_transition(void)
+{
+ struct task_struct *g, *task;
+ unsigned int cpu;
+
+ WARN_ON_ONCE(klp_target_state == KLP_TRANSITION_IDLE);
+
+ pr_notice("'%s': starting %s transition\n",
+ klp_transition_patch->mod->name,
+ klp_target_state == KLP_TRANSITION_PATCHED ? "patching" : "unpatching");
+
+ /*
+ * Mark all normal tasks as needing a patch state update. They'll
+ * switch either in klp_try_complete_transition() or as they exit the
+ * kernel.
+ */
+ read_lock(&tasklist_lock);
+ for_each_process_thread(g, task)
+ if (task->patch_state != klp_target_state)
+ set_tsk_thread_flag(task, TIF_PATCH_PENDING);
+ read_unlock(&tasklist_lock);
+
+ /*
+ * Mark all idle tasks as needing a patch state update. They'll switch
+ * either in klp_try_complete_transition() or at the idle loop switch
+ * point.
+ */
+ for_each_possible_cpu(cpu) {
+ task = idle_task(cpu);
+ if (task->patch_state != klp_target_state)
+ set_tsk_thread_flag(task, TIF_PATCH_PENDING);
+ }
+
+ klp_resched_enable();
+
+ klp_signals_cnt = 0;
+}
+
+/*
+ * Initialize the global target patch state and all tasks to the initial patch
+ * state, and initialize all function transition states to true in preparation
+ * for patching or unpatching.
+ */
+void klp_init_transition(struct klp_patch *patch, int state)
+{
+ struct task_struct *g, *task;
+ unsigned int cpu;
+ struct klp_object *obj;
+ struct klp_func *func;
+ int initial_state = !state;
+
+ WARN_ON_ONCE(klp_target_state != KLP_TRANSITION_IDLE);
+
+ klp_transition_patch = patch;
+
+ /*
+ * Set the global target patch state which tasks will switch to. This
+ * has no effect until the TIF_PATCH_PENDING flags get set later.
+ */
+ klp_target_state = state;
+
+ pr_debug("'%s': initializing %s transition\n", patch->mod->name,
+ klp_target_state == KLP_TRANSITION_PATCHED ? "patching" : "unpatching");
+
+ /*
+ * Initialize all tasks to the initial patch state to prepare them for
+ * switching to the target state.
+ */
+ read_lock(&tasklist_lock);
+ for_each_process_thread(g, task) {
+ WARN_ON_ONCE(task->patch_state != KLP_TRANSITION_IDLE);
+ task->patch_state = initial_state;
+ }
+ read_unlock(&tasklist_lock);
+
+ /*
+ * Ditto for the idle "swapper" tasks.
+ */
+ for_each_possible_cpu(cpu) {
+ task = idle_task(cpu);
+ WARN_ON_ONCE(task->patch_state != KLP_TRANSITION_IDLE);
+ task->patch_state = initial_state;
+ }
+
+ /*
+ * Enforce the order of the task->patch_state initializations and the
+ * func->transition updates to ensure that klp_ftrace_handler() doesn't
+ * see a func in transition with a task->patch_state of KLP_TRANSITION_IDLE.
+ *
+ * Also enforce the order of the klp_target_state write and future
+ * TIF_PATCH_PENDING writes to ensure klp_update_patch_state() and
+ * __klp_sched_try_switch() don't set a task->patch_state to
+ * KLP_TRANSITION_IDLE.
+ */
+ smp_wmb();
+
+ /*
+ * Set the func transition states so klp_ftrace_handler() will know to
+ * switch to the transition logic.
+ *
+ * When patching, the funcs aren't yet in the func_stack and will be
+ * made visible to the ftrace handler shortly by the calls to
+ * klp_patch_object().
+ *
+ * When unpatching, the funcs are already in the func_stack and so are
+ * already visible to the ftrace handler.
+ */
+ klp_for_each_object(patch, obj)
+ klp_for_each_func(obj, func)
+ func->transition = true;
+}
+
+/*
+ * This function can be called in the middle of an existing transition to
+ * reverse the direction of the target patch state. This can be done to
+ * effectively cancel an existing enable or disable operation if there are any
+ * tasks which are stuck in the initial patch state.
+ */
+void klp_reverse_transition(void)
+{
+ unsigned int cpu;
+ struct task_struct *g, *task;
+
+ pr_debug("'%s': reversing transition from %s\n",
+ klp_transition_patch->mod->name,
+ klp_target_state == KLP_TRANSITION_PATCHED ? "patching to unpatching" :
+ "unpatching to patching");
+
+ /*
+ * Clear all TIF_PATCH_PENDING flags to prevent races caused by
+ * klp_update_patch_state() or __klp_sched_try_switch() running in
+ * parallel with the reverse transition.
+ */
+ read_lock(&tasklist_lock);
+ for_each_process_thread(g, task)
+ clear_tsk_thread_flag(task, TIF_PATCH_PENDING);
+ read_unlock(&tasklist_lock);
+
+ for_each_possible_cpu(cpu)
+ clear_tsk_thread_flag(idle_task(cpu), TIF_PATCH_PENDING);
+
+ /*
+ * Make sure all existing invocations of klp_update_patch_state() and
+ * __klp_sched_try_switch() see the cleared TIF_PATCH_PENDING before
+ * starting the reverse transition.
+ */
+ klp_synchronize_transition();
+
+ /*
+ * All patching has stopped, now re-initialize the global variables to
+ * prepare for the reverse transition.
+ */
+ klp_transition_patch->enabled = !klp_transition_patch->enabled;
+ klp_target_state = !klp_target_state;
+
+ /*
+ * Enforce the order of the klp_target_state write and the
+ * TIF_PATCH_PENDING writes in klp_start_transition() to ensure
+ * klp_update_patch_state() and __klp_sched_try_switch() don't set
+ * task->patch_state to the wrong value.
+ */
+ smp_wmb();
+
+ klp_start_transition();
+}
+
+/* Called from copy_process() during fork */
+void klp_copy_process(struct task_struct *child)
+{
+
+ /*
+ * The parent process may have gone through a KLP transition since
+ * the thread flag was copied in setup_thread_stack earlier. Bring
+ * the task flag up to date with the parent here.
+ *
+ * The operation is serialized against all klp_*_transition()
+ * operations by the tasklist_lock. The only exceptions are
+ * klp_update_patch_state(current) and __klp_sched_try_switch(), but we
+ * cannot race with them because we are current.
+ */
+ if (test_tsk_thread_flag(current, TIF_PATCH_PENDING))
+ set_tsk_thread_flag(child, TIF_PATCH_PENDING);
+ else
+ clear_tsk_thread_flag(child, TIF_PATCH_PENDING);
+
+ child->patch_state = current->patch_state;
+}
+
+/*
+ * Drop TIF_PATCH_PENDING of all tasks on admin's request. This forces an
+ * existing transition to finish.
+ *
+ * NOTE: klp_update_patch_state(task) requires the task to be inactive or
+ * 'current'. This is not the case here and the consistency model could be
+ * broken. Administrator, who is the only one to execute the
+ * klp_force_transitions(), has to be aware of this.
+ */
+void klp_force_transition(void)
+{
+ struct klp_patch *patch;
+ struct task_struct *g, *task;
+ unsigned int cpu;
+
+ pr_warn("forcing remaining tasks to the patched state\n");
+
+ read_lock(&tasklist_lock);
+ for_each_process_thread(g, task)
+ klp_update_patch_state(task);
+ read_unlock(&tasklist_lock);
+
+ for_each_possible_cpu(cpu)
+ klp_update_patch_state(idle_task(cpu));
+
+ /* Set forced flag for patches being removed. */
+ if (klp_target_state == KLP_TRANSITION_UNPATCHED)
+ klp_transition_patch->forced = true;
+ else if (klp_transition_patch->replace) {
+ klp_for_each_patch(patch) {
+ if (patch != klp_transition_patch)
+ patch->forced = true;
+ }
+ }
+}
diff --git a/kernel/livepatch/transition.h b/kernel/livepatch/transition.h
new file mode 100644
index 000000000000..322db16233de
--- /dev/null
+++ b/kernel/livepatch/transition.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LIVEPATCH_TRANSITION_H
+#define _LIVEPATCH_TRANSITION_H
+
+#include <linux/livepatch.h>
+
+extern struct klp_patch *klp_transition_patch;
+
+void klp_init_transition(struct klp_patch *patch, int state);
+void klp_cancel_transition(void);
+void klp_start_transition(void);
+void klp_try_complete_transition(void);
+void klp_reverse_transition(void);
+void klp_force_transition(void);
+
+#endif /* _LIVEPATCH_TRANSITION_H */
diff --git a/kernel/liveupdate/Kconfig b/kernel/liveupdate/Kconfig
new file mode 100644
index 000000000000..9b2515f31afb
--- /dev/null
+++ b/kernel/liveupdate/Kconfig
@@ -0,0 +1,75 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+menu "Live Update and Kexec HandOver"
+ depends on !DEFERRED_STRUCT_PAGE_INIT
+
+config KEXEC_HANDOVER
+ bool "kexec handover"
+ depends on ARCH_SUPPORTS_KEXEC_HANDOVER && ARCH_SUPPORTS_KEXEC_FILE
+ depends on !DEFERRED_STRUCT_PAGE_INIT
+ select MEMBLOCK_KHO_SCRATCH
+ select KEXEC_FILE
+ select LIBFDT
+ select CMA
+ help
+ Allow kexec to hand over state across kernels by generating and
+ passing additional metadata to the target kernel. This is useful
+ to keep data or state alive across the kexec. For this to work,
+ both source and target kernels need to have this option enabled.
+
+config KEXEC_HANDOVER_DEBUG
+ bool "Enable Kexec Handover debug checks"
+ depends on KEXEC_HANDOVER
+ help
+ This option enables extra sanity checks for the Kexec Handover
+ subsystem. Since, KHO performance is crucial in live update
+ scenarios and the extra code might be adding overhead it is
+ only optionally enabled.
+
+config KEXEC_HANDOVER_DEBUGFS
+ bool "kexec handover debugfs interface"
+ default KEXEC_HANDOVER
+ depends on KEXEC_HANDOVER
+ select DEBUG_FS
+ help
+ Allow to control kexec handover device tree via debugfs
+ interface, i.e. finalize the state or aborting the finalization.
+ Also, enables inspecting the KHO fdt trees with the debugfs binary
+ blobs.
+
+config KEXEC_HANDOVER_ENABLE_DEFAULT
+ bool "Enable kexec handover by default"
+ depends on KEXEC_HANDOVER
+ help
+ Enable Kexec Handover by default. This avoids the need to
+ explicitly pass 'kho=on' on the kernel command line.
+
+ This is useful for systems where KHO is a prerequisite for other
+ features, such as Live Update, ensuring the mechanism is always
+ active.
+
+ The default behavior can still be overridden at boot time by
+ passing 'kho=off'.
+
+config LIVEUPDATE
+ bool "Live Update Orchestrator"
+ depends on KEXEC_HANDOVER
+ help
+ Enable the Live Update Orchestrator. Live Update is a mechanism,
+ typically based on kexec, that allows the kernel to be updated
+ while keeping selected devices operational across the transition.
+ These devices are intended to be reclaimed by the new kernel and
+ re-attached to their original workload without requiring a device
+ reset.
+
+ Ability to handover a device from current to the next kernel depends
+ on specific support within device drivers and related kernel
+ subsystems.
+
+ This feature primarily targets virtual machine hosts to quickly update
+ the kernel hypervisor with minimal disruption to the running virtual
+ machines.
+
+ If unsure, say N.
+
+endmenu
diff --git a/kernel/liveupdate/Makefile b/kernel/liveupdate/Makefile
new file mode 100644
index 000000000000..7cad2eece32d
--- /dev/null
+++ b/kernel/liveupdate/Makefile
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0
+
+luo-y := \
+ luo_core.o \
+ luo_file.o \
+ luo_session.o
+
+obj-$(CONFIG_KEXEC_HANDOVER) += kexec_handover.o
+obj-$(CONFIG_KEXEC_HANDOVER_DEBUG) += kexec_handover_debug.o
+obj-$(CONFIG_KEXEC_HANDOVER_DEBUGFS) += kexec_handover_debugfs.o
+
+obj-$(CONFIG_LIVEUPDATE) += luo.o
diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c
new file mode 100644
index 000000000000..9dc51fab604f
--- /dev/null
+++ b/kernel/liveupdate/kexec_handover.c
@@ -0,0 +1,1594 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * kexec_handover.c - kexec handover metadata processing
+ * Copyright (C) 2023 Alexander Graf <graf@amazon.com>
+ * Copyright (C) 2025 Microsoft Corporation, Mike Rapoport <rppt@kernel.org>
+ * Copyright (C) 2025 Google LLC, Changyuan Lyu <changyuanl@google.com>
+ * Copyright (C) 2025 Pasha Tatashin <pasha.tatashin@soleen.com>
+ */
+
+#define pr_fmt(fmt) "KHO: " fmt
+
+#include <linux/cleanup.h>
+#include <linux/cma.h>
+#include <linux/kmemleak.h>
+#include <linux/count_zeros.h>
+#include <linux/kexec.h>
+#include <linux/kexec_handover.h>
+#include <linux/libfdt.h>
+#include <linux/list.h>
+#include <linux/memblock.h>
+#include <linux/page-isolation.h>
+#include <linux/unaligned.h>
+#include <linux/vmalloc.h>
+
+#include <asm/early_ioremap.h>
+
+#include "kexec_handover_internal.h"
+/*
+ * KHO is tightly coupled with mm init and needs access to some of mm
+ * internal APIs.
+ */
+#include "../../mm/internal.h"
+#include "../kexec_internal.h"
+#include "kexec_handover_internal.h"
+
+#define KHO_FDT_COMPATIBLE "kho-v1"
+#define PROP_PRESERVED_MEMORY_MAP "preserved-memory-map"
+#define PROP_SUB_FDT "fdt"
+
+#define KHO_PAGE_MAGIC 0x4b484f50U /* ASCII for 'KHOP' */
+
+/*
+ * KHO uses page->private, which is an unsigned long, to store page metadata.
+ * Use it to store both the magic and the order.
+ */
+union kho_page_info {
+ unsigned long page_private;
+ struct {
+ unsigned int order;
+ unsigned int magic;
+ };
+};
+
+static_assert(sizeof(union kho_page_info) == sizeof(((struct page *)0)->private));
+
+static bool kho_enable __ro_after_init = IS_ENABLED(CONFIG_KEXEC_HANDOVER_ENABLE_DEFAULT);
+
+bool kho_is_enabled(void)
+{
+ return kho_enable;
+}
+EXPORT_SYMBOL_GPL(kho_is_enabled);
+
+static int __init kho_parse_enable(char *p)
+{
+ return kstrtobool(p, &kho_enable);
+}
+early_param("kho", kho_parse_enable);
+
+/*
+ * Keep track of memory that is to be preserved across KHO.
+ *
+ * The serializing side uses two levels of xarrays to manage chunks of per-order
+ * PAGE_SIZE byte bitmaps. For instance if PAGE_SIZE = 4096, the entire 1G order
+ * of a 8TB system would fit inside a single 4096 byte bitmap. For order 0
+ * allocations each bitmap will cover 128M of address space. Thus, for 16G of
+ * memory at most 512K of bitmap memory will be needed for order 0.
+ *
+ * This approach is fully incremental, as the serialization progresses folios
+ * can continue be aggregated to the tracker. The final step, immediately prior
+ * to kexec would serialize the xarray information into a linked list for the
+ * successor kernel to parse.
+ */
+
+#define PRESERVE_BITS (PAGE_SIZE * 8)
+
+struct kho_mem_phys_bits {
+ DECLARE_BITMAP(preserve, PRESERVE_BITS);
+};
+
+static_assert(sizeof(struct kho_mem_phys_bits) == PAGE_SIZE);
+
+struct kho_mem_phys {
+ /*
+ * Points to kho_mem_phys_bits, a sparse bitmap array. Each bit is sized
+ * to order.
+ */
+ struct xarray phys_bits;
+};
+
+struct kho_mem_track {
+ /* Points to kho_mem_phys, each order gets its own bitmap tree */
+ struct xarray orders;
+};
+
+struct khoser_mem_chunk;
+
+struct kho_out {
+ void *fdt;
+ bool finalized;
+ struct mutex lock; /* protects KHO FDT finalization */
+
+ struct kho_mem_track track;
+ struct kho_debugfs dbg;
+};
+
+static struct kho_out kho_out = {
+ .lock = __MUTEX_INITIALIZER(kho_out.lock),
+ .track = {
+ .orders = XARRAY_INIT(kho_out.track.orders, 0),
+ },
+ .finalized = false,
+};
+
+static void *xa_load_or_alloc(struct xarray *xa, unsigned long index)
+{
+ void *res = xa_load(xa, index);
+
+ if (res)
+ return res;
+
+ void *elm __free(free_page) = (void *)get_zeroed_page(GFP_KERNEL);
+
+ if (!elm)
+ return ERR_PTR(-ENOMEM);
+
+ if (WARN_ON(kho_scratch_overlap(virt_to_phys(elm), PAGE_SIZE)))
+ return ERR_PTR(-EINVAL);
+
+ res = xa_cmpxchg(xa, index, NULL, elm, GFP_KERNEL);
+ if (xa_is_err(res))
+ return ERR_PTR(xa_err(res));
+ else if (res)
+ return res;
+
+ return no_free_ptr(elm);
+}
+
+static void __kho_unpreserve_order(struct kho_mem_track *track, unsigned long pfn,
+ unsigned int order)
+{
+ struct kho_mem_phys_bits *bits;
+ struct kho_mem_phys *physxa;
+ const unsigned long pfn_high = pfn >> order;
+
+ physxa = xa_load(&track->orders, order);
+ if (WARN_ON_ONCE(!physxa))
+ return;
+
+ bits = xa_load(&physxa->phys_bits, pfn_high / PRESERVE_BITS);
+ if (WARN_ON_ONCE(!bits))
+ return;
+
+ clear_bit(pfn_high % PRESERVE_BITS, bits->preserve);
+}
+
+static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn,
+ unsigned long end_pfn)
+{
+ unsigned int order;
+
+ while (pfn < end_pfn) {
+ order = min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
+
+ __kho_unpreserve_order(track, pfn, order);
+
+ pfn += 1 << order;
+ }
+}
+
+static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn,
+ unsigned int order)
+{
+ struct kho_mem_phys_bits *bits;
+ struct kho_mem_phys *physxa, *new_physxa;
+ const unsigned long pfn_high = pfn >> order;
+
+ might_sleep();
+ physxa = xa_load(&track->orders, order);
+ if (!physxa) {
+ int err;
+
+ new_physxa = kzalloc(sizeof(*physxa), GFP_KERNEL);
+ if (!new_physxa)
+ return -ENOMEM;
+
+ xa_init(&new_physxa->phys_bits);
+ physxa = xa_cmpxchg(&track->orders, order, NULL, new_physxa,
+ GFP_KERNEL);
+
+ err = xa_err(physxa);
+ if (err || physxa) {
+ xa_destroy(&new_physxa->phys_bits);
+ kfree(new_physxa);
+
+ if (err)
+ return err;
+ } else {
+ physxa = new_physxa;
+ }
+ }
+
+ bits = xa_load_or_alloc(&physxa->phys_bits, pfn_high / PRESERVE_BITS);
+ if (IS_ERR(bits))
+ return PTR_ERR(bits);
+
+ set_bit(pfn_high % PRESERVE_BITS, bits->preserve);
+
+ return 0;
+}
+
+static struct page *kho_restore_page(phys_addr_t phys, bool is_folio)
+{
+ struct page *page = pfn_to_online_page(PHYS_PFN(phys));
+ unsigned int nr_pages, ref_cnt;
+ union kho_page_info info;
+
+ if (!page)
+ return NULL;
+
+ info.page_private = page->private;
+ /*
+ * deserialize_bitmap() only sets the magic on the head page. This magic
+ * check also implicitly makes sure phys is order-aligned since for
+ * non-order-aligned phys addresses, magic will never be set.
+ */
+ if (WARN_ON_ONCE(info.magic != KHO_PAGE_MAGIC || info.order > MAX_PAGE_ORDER))
+ return NULL;
+ nr_pages = (1 << info.order);
+
+ /* Clear private to make sure later restores on this page error out. */
+ page->private = 0;
+ /* Head page gets refcount of 1. */
+ set_page_count(page, 1);
+
+ /*
+ * For higher order folios, tail pages get a page count of zero.
+ * For physically contiguous order-0 pages every pages gets a page
+ * count of 1
+ */
+ ref_cnt = is_folio ? 0 : 1;
+ for (unsigned int i = 1; i < nr_pages; i++)
+ set_page_count(page + i, ref_cnt);
+
+ if (is_folio && info.order)
+ prep_compound_page(page, info.order);
+
+ adjust_managed_page_count(page, nr_pages);
+ return page;
+}
+
+/**
+ * kho_restore_folio - recreates the folio from the preserved memory.
+ * @phys: physical address of the folio.
+ *
+ * Return: pointer to the struct folio on success, NULL on failure.
+ */
+struct folio *kho_restore_folio(phys_addr_t phys)
+{
+ struct page *page = kho_restore_page(phys, true);
+
+ return page ? page_folio(page) : NULL;
+}
+EXPORT_SYMBOL_GPL(kho_restore_folio);
+
+/**
+ * kho_restore_pages - restore list of contiguous order 0 pages.
+ * @phys: physical address of the first page.
+ * @nr_pages: number of pages.
+ *
+ * Restore a contiguous list of order 0 pages that was preserved with
+ * kho_preserve_pages().
+ *
+ * Return: 0 on success, error code on failure
+ */
+struct page *kho_restore_pages(phys_addr_t phys, unsigned int nr_pages)
+{
+ const unsigned long start_pfn = PHYS_PFN(phys);
+ const unsigned long end_pfn = start_pfn + nr_pages;
+ unsigned long pfn = start_pfn;
+
+ while (pfn < end_pfn) {
+ const unsigned int order =
+ min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
+ struct page *page = kho_restore_page(PFN_PHYS(pfn), false);
+
+ if (!page)
+ return NULL;
+ pfn += 1 << order;
+ }
+
+ return pfn_to_page(start_pfn);
+}
+EXPORT_SYMBOL_GPL(kho_restore_pages);
+
+/* Serialize and deserialize struct kho_mem_phys across kexec
+ *
+ * Record all the bitmaps in a linked list of pages for the next kernel to
+ * process. Each chunk holds bitmaps of the same order and each block of bitmaps
+ * starts at a given physical address. This allows the bitmaps to be sparse. The
+ * xarray is used to store them in a tree while building up the data structure,
+ * but the KHO successor kernel only needs to process them once in order.
+ *
+ * All of this memory is normal kmalloc() memory and is not marked for
+ * preservation. The successor kernel will remain isolated to the scratch space
+ * until it completes processing this list. Once processed all the memory
+ * storing these ranges will be marked as free.
+ */
+
+struct khoser_mem_bitmap_ptr {
+ phys_addr_t phys_start;
+ DECLARE_KHOSER_PTR(bitmap, struct kho_mem_phys_bits *);
+};
+
+struct khoser_mem_chunk_hdr {
+ DECLARE_KHOSER_PTR(next, struct khoser_mem_chunk *);
+ unsigned int order;
+ unsigned int num_elms;
+};
+
+#define KHOSER_BITMAP_SIZE \
+ ((PAGE_SIZE - sizeof(struct khoser_mem_chunk_hdr)) / \
+ sizeof(struct khoser_mem_bitmap_ptr))
+
+struct khoser_mem_chunk {
+ struct khoser_mem_chunk_hdr hdr;
+ struct khoser_mem_bitmap_ptr bitmaps[KHOSER_BITMAP_SIZE];
+};
+
+static_assert(sizeof(struct khoser_mem_chunk) == PAGE_SIZE);
+
+static struct khoser_mem_chunk *new_chunk(struct khoser_mem_chunk *cur_chunk,
+ unsigned long order)
+{
+ struct khoser_mem_chunk *chunk __free(free_page) = NULL;
+
+ chunk = (void *)get_zeroed_page(GFP_KERNEL);
+ if (!chunk)
+ return ERR_PTR(-ENOMEM);
+
+ if (WARN_ON(kho_scratch_overlap(virt_to_phys(chunk), PAGE_SIZE)))
+ return ERR_PTR(-EINVAL);
+
+ chunk->hdr.order = order;
+ if (cur_chunk)
+ KHOSER_STORE_PTR(cur_chunk->hdr.next, chunk);
+ return no_free_ptr(chunk);
+}
+
+static void kho_mem_ser_free(struct khoser_mem_chunk *first_chunk)
+{
+ struct khoser_mem_chunk *chunk = first_chunk;
+
+ while (chunk) {
+ struct khoser_mem_chunk *tmp = chunk;
+
+ chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
+ free_page((unsigned long)tmp);
+ }
+}
+
+/*
+ * Update memory map property, if old one is found discard it via
+ * kho_mem_ser_free().
+ */
+static void kho_update_memory_map(struct khoser_mem_chunk *first_chunk)
+{
+ void *ptr;
+ u64 phys;
+
+ ptr = fdt_getprop_w(kho_out.fdt, 0, PROP_PRESERVED_MEMORY_MAP, NULL);
+
+ /* Check and discard previous memory map */
+ phys = get_unaligned((u64 *)ptr);
+ if (phys)
+ kho_mem_ser_free((struct khoser_mem_chunk *)phys_to_virt(phys));
+
+ /* Update with the new value */
+ phys = first_chunk ? (u64)virt_to_phys(first_chunk) : 0;
+ put_unaligned(phys, (u64 *)ptr);
+}
+
+static int kho_mem_serialize(struct kho_out *kho_out)
+{
+ struct khoser_mem_chunk *first_chunk = NULL;
+ struct khoser_mem_chunk *chunk = NULL;
+ struct kho_mem_phys *physxa;
+ unsigned long order;
+ int err = -ENOMEM;
+
+ xa_for_each(&kho_out->track.orders, order, physxa) {
+ struct kho_mem_phys_bits *bits;
+ unsigned long phys;
+
+ chunk = new_chunk(chunk, order);
+ if (IS_ERR(chunk)) {
+ err = PTR_ERR(chunk);
+ goto err_free;
+ }
+
+ if (!first_chunk)
+ first_chunk = chunk;
+
+ xa_for_each(&physxa->phys_bits, phys, bits) {
+ struct khoser_mem_bitmap_ptr *elm;
+
+ if (chunk->hdr.num_elms == ARRAY_SIZE(chunk->bitmaps)) {
+ chunk = new_chunk(chunk, order);
+ if (IS_ERR(chunk)) {
+ err = PTR_ERR(chunk);
+ goto err_free;
+ }
+ }
+
+ elm = &chunk->bitmaps[chunk->hdr.num_elms];
+ chunk->hdr.num_elms++;
+ elm->phys_start = (phys * PRESERVE_BITS)
+ << (order + PAGE_SHIFT);
+ KHOSER_STORE_PTR(elm->bitmap, bits);
+ }
+ }
+
+ kho_update_memory_map(first_chunk);
+
+ return 0;
+
+err_free:
+ kho_mem_ser_free(first_chunk);
+ return err;
+}
+
+static void __init deserialize_bitmap(unsigned int order,
+ struct khoser_mem_bitmap_ptr *elm)
+{
+ struct kho_mem_phys_bits *bitmap = KHOSER_LOAD_PTR(elm->bitmap);
+ unsigned long bit;
+
+ for_each_set_bit(bit, bitmap->preserve, PRESERVE_BITS) {
+ int sz = 1 << (order + PAGE_SHIFT);
+ phys_addr_t phys =
+ elm->phys_start + (bit << (order + PAGE_SHIFT));
+ struct page *page = phys_to_page(phys);
+ union kho_page_info info;
+
+ memblock_reserve(phys, sz);
+ memblock_reserved_mark_noinit(phys, sz);
+ info.magic = KHO_PAGE_MAGIC;
+ info.order = order;
+ page->private = info.page_private;
+ }
+}
+
+/* Return true if memory was deserizlied */
+static bool __init kho_mem_deserialize(const void *fdt)
+{
+ struct khoser_mem_chunk *chunk;
+ const void *mem_ptr;
+ u64 mem;
+ int len;
+
+ mem_ptr = fdt_getprop(fdt, 0, PROP_PRESERVED_MEMORY_MAP, &len);
+ if (!mem_ptr || len != sizeof(u64)) {
+ pr_err("failed to get preserved memory bitmaps\n");
+ return false;
+ }
+
+ mem = get_unaligned((const u64 *)mem_ptr);
+ chunk = mem ? phys_to_virt(mem) : NULL;
+
+ /* No preserved physical pages were passed, no deserialization */
+ if (!chunk)
+ return false;
+
+ while (chunk) {
+ unsigned int i;
+
+ for (i = 0; i != chunk->hdr.num_elms; i++)
+ deserialize_bitmap(chunk->hdr.order,
+ &chunk->bitmaps[i]);
+ chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
+ }
+
+ return true;
+}
+
+/*
+ * With KHO enabled, memory can become fragmented because KHO regions may
+ * be anywhere in physical address space. The scratch regions give us a
+ * safe zones that we will never see KHO allocations from. This is where we
+ * can later safely load our new kexec images into and then use the scratch
+ * area for early allocations that happen before page allocator is
+ * initialized.
+ */
+struct kho_scratch *kho_scratch;
+unsigned int kho_scratch_cnt;
+
+/*
+ * The scratch areas are scaled by default as percent of memory allocated from
+ * memblock. A user can override the scale with command line parameter:
+ *
+ * kho_scratch=N%
+ *
+ * It is also possible to explicitly define size for a lowmem, a global and
+ * per-node scratch areas:
+ *
+ * kho_scratch=l[KMG],n[KMG],m[KMG]
+ *
+ * The explicit size definition takes precedence over scale definition.
+ */
+static unsigned int scratch_scale __initdata = 200;
+static phys_addr_t scratch_size_global __initdata;
+static phys_addr_t scratch_size_pernode __initdata;
+static phys_addr_t scratch_size_lowmem __initdata;
+
+static int __init kho_parse_scratch_size(char *p)
+{
+ size_t len;
+ unsigned long sizes[3];
+ size_t total_size = 0;
+ int i;
+
+ if (!p)
+ return -EINVAL;
+
+ len = strlen(p);
+ if (!len)
+ return -EINVAL;
+
+ /* parse nn% */
+ if (p[len - 1] == '%') {
+ /* unsigned int max is 4,294,967,295, 10 chars */
+ char s_scale[11] = {};
+ int ret = 0;
+
+ if (len > ARRAY_SIZE(s_scale))
+ return -EINVAL;
+
+ memcpy(s_scale, p, len - 1);
+ ret = kstrtouint(s_scale, 10, &scratch_scale);
+ if (!ret)
+ pr_notice("scratch scale is %d%%\n", scratch_scale);
+ return ret;
+ }
+
+ /* parse ll[KMG],mm[KMG],nn[KMG] */
+ for (i = 0; i < ARRAY_SIZE(sizes); i++) {
+ char *endp = p;
+
+ if (i > 0) {
+ if (*p != ',')
+ return -EINVAL;
+ p += 1;
+ }
+
+ sizes[i] = memparse(p, &endp);
+ if (endp == p)
+ return -EINVAL;
+ p = endp;
+ total_size += sizes[i];
+ }
+
+ if (!total_size)
+ return -EINVAL;
+
+ /* The string should be fully consumed by now. */
+ if (*p)
+ return -EINVAL;
+
+ scratch_size_lowmem = sizes[0];
+ scratch_size_global = sizes[1];
+ scratch_size_pernode = sizes[2];
+ scratch_scale = 0;
+
+ pr_notice("scratch areas: lowmem: %lluMiB global: %lluMiB pernode: %lldMiB\n",
+ (u64)(scratch_size_lowmem >> 20),
+ (u64)(scratch_size_global >> 20),
+ (u64)(scratch_size_pernode >> 20));
+
+ return 0;
+}
+early_param("kho_scratch", kho_parse_scratch_size);
+
+static void __init scratch_size_update(void)
+{
+ phys_addr_t size;
+
+ if (!scratch_scale)
+ return;
+
+ size = memblock_reserved_kern_size(ARCH_LOW_ADDRESS_LIMIT,
+ NUMA_NO_NODE);
+ size = size * scratch_scale / 100;
+ scratch_size_lowmem = round_up(size, CMA_MIN_ALIGNMENT_BYTES);
+
+ size = memblock_reserved_kern_size(MEMBLOCK_ALLOC_ANYWHERE,
+ NUMA_NO_NODE);
+ size = size * scratch_scale / 100 - scratch_size_lowmem;
+ scratch_size_global = round_up(size, CMA_MIN_ALIGNMENT_BYTES);
+}
+
+static phys_addr_t __init scratch_size_node(int nid)
+{
+ phys_addr_t size;
+
+ if (scratch_scale) {
+ size = memblock_reserved_kern_size(MEMBLOCK_ALLOC_ANYWHERE,
+ nid);
+ size = size * scratch_scale / 100;
+ } else {
+ size = scratch_size_pernode;
+ }
+
+ return round_up(size, CMA_MIN_ALIGNMENT_BYTES);
+}
+
+/**
+ * kho_reserve_scratch - Reserve a contiguous chunk of memory for kexec
+ *
+ * With KHO we can preserve arbitrary pages in the system. To ensure we still
+ * have a large contiguous region of memory when we search the physical address
+ * space for target memory, let's make sure we always have a large CMA region
+ * active. This CMA region will only be used for movable pages which are not a
+ * problem for us during KHO because we can just move them somewhere else.
+ */
+static void __init kho_reserve_scratch(void)
+{
+ phys_addr_t addr, size;
+ int nid, i = 0;
+
+ if (!kho_enable)
+ return;
+
+ scratch_size_update();
+
+ /* FIXME: deal with node hot-plug/remove */
+ kho_scratch_cnt = num_online_nodes() + 2;
+ size = kho_scratch_cnt * sizeof(*kho_scratch);
+ kho_scratch = memblock_alloc(size, PAGE_SIZE);
+ if (!kho_scratch)
+ goto err_disable_kho;
+
+ /*
+ * reserve scratch area in low memory for lowmem allocations in the
+ * next kernel
+ */
+ size = scratch_size_lowmem;
+ addr = memblock_phys_alloc_range(size, CMA_MIN_ALIGNMENT_BYTES, 0,
+ ARCH_LOW_ADDRESS_LIMIT);
+ if (!addr)
+ goto err_free_scratch_desc;
+
+ kho_scratch[i].addr = addr;
+ kho_scratch[i].size = size;
+ i++;
+
+ /* reserve large contiguous area for allocations without nid */
+ size = scratch_size_global;
+ addr = memblock_phys_alloc(size, CMA_MIN_ALIGNMENT_BYTES);
+ if (!addr)
+ goto err_free_scratch_areas;
+
+ kho_scratch[i].addr = addr;
+ kho_scratch[i].size = size;
+ i++;
+
+ for_each_online_node(nid) {
+ size = scratch_size_node(nid);
+ addr = memblock_alloc_range_nid(size, CMA_MIN_ALIGNMENT_BYTES,
+ 0, MEMBLOCK_ALLOC_ACCESSIBLE,
+ nid, true);
+ if (!addr)
+ goto err_free_scratch_areas;
+
+ kho_scratch[i].addr = addr;
+ kho_scratch[i].size = size;
+ i++;
+ }
+
+ return;
+
+err_free_scratch_areas:
+ for (i--; i >= 0; i--)
+ memblock_phys_free(kho_scratch[i].addr, kho_scratch[i].size);
+err_free_scratch_desc:
+ memblock_free(kho_scratch, kho_scratch_cnt * sizeof(*kho_scratch));
+err_disable_kho:
+ pr_warn("Failed to reserve scratch area, disabling kexec handover\n");
+ kho_enable = false;
+}
+
+/**
+ * kho_add_subtree - record the physical address of a sub FDT in KHO root tree.
+ * @name: name of the sub tree.
+ * @fdt: the sub tree blob.
+ *
+ * Creates a new child node named @name in KHO root FDT and records
+ * the physical address of @fdt. The pages of @fdt must also be preserved
+ * by KHO for the new kernel to retrieve it after kexec.
+ *
+ * A debugfs blob entry is also created at
+ * ``/sys/kernel/debug/kho/out/sub_fdts/@name`` when kernel is configured with
+ * CONFIG_KEXEC_HANDOVER_DEBUGFS
+ *
+ * Return: 0 on success, error code on failure
+ */
+int kho_add_subtree(const char *name, void *fdt)
+{
+ phys_addr_t phys = virt_to_phys(fdt);
+ void *root_fdt = kho_out.fdt;
+ int err = -ENOMEM;
+ int off, fdt_err;
+
+ guard(mutex)(&kho_out.lock);
+
+ fdt_err = fdt_open_into(root_fdt, root_fdt, PAGE_SIZE);
+ if (fdt_err < 0)
+ return err;
+
+ off = fdt_add_subnode(root_fdt, 0, name);
+ if (off < 0) {
+ if (off == -FDT_ERR_EXISTS)
+ err = -EEXIST;
+ goto out_pack;
+ }
+
+ err = fdt_setprop(root_fdt, off, PROP_SUB_FDT, &phys, sizeof(phys));
+ if (err < 0)
+ goto out_pack;
+
+ WARN_ON_ONCE(kho_debugfs_fdt_add(&kho_out.dbg, name, fdt, false));
+
+out_pack:
+ fdt_pack(root_fdt);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(kho_add_subtree);
+
+void kho_remove_subtree(void *fdt)
+{
+ phys_addr_t target_phys = virt_to_phys(fdt);
+ void *root_fdt = kho_out.fdt;
+ int off;
+ int err;
+
+ guard(mutex)(&kho_out.lock);
+
+ err = fdt_open_into(root_fdt, root_fdt, PAGE_SIZE);
+ if (err < 0)
+ return;
+
+ for (off = fdt_first_subnode(root_fdt, 0); off >= 0;
+ off = fdt_next_subnode(root_fdt, off)) {
+ const u64 *val;
+ int len;
+
+ val = fdt_getprop(root_fdt, off, PROP_SUB_FDT, &len);
+ if (!val || len != sizeof(phys_addr_t))
+ continue;
+
+ if ((phys_addr_t)*val == target_phys) {
+ fdt_del_node(root_fdt, off);
+ kho_debugfs_fdt_remove(&kho_out.dbg, fdt);
+ break;
+ }
+ }
+
+ fdt_pack(root_fdt);
+}
+EXPORT_SYMBOL_GPL(kho_remove_subtree);
+
+/**
+ * kho_preserve_folio - preserve a folio across kexec.
+ * @folio: folio to preserve.
+ *
+ * Instructs KHO to preserve the whole folio across kexec. The order
+ * will be preserved as well.
+ *
+ * Return: 0 on success, error code on failure
+ */
+int kho_preserve_folio(struct folio *folio)
+{
+ const unsigned long pfn = folio_pfn(folio);
+ const unsigned int order = folio_order(folio);
+ struct kho_mem_track *track = &kho_out.track;
+
+ if (WARN_ON(kho_scratch_overlap(pfn << PAGE_SHIFT, PAGE_SIZE << order)))
+ return -EINVAL;
+
+ return __kho_preserve_order(track, pfn, order);
+}
+EXPORT_SYMBOL_GPL(kho_preserve_folio);
+
+/**
+ * kho_unpreserve_folio - unpreserve a folio.
+ * @folio: folio to unpreserve.
+ *
+ * Instructs KHO to unpreserve a folio that was preserved by
+ * kho_preserve_folio() before. The provided @folio (pfn and order)
+ * must exactly match a previously preserved folio.
+ */
+void kho_unpreserve_folio(struct folio *folio)
+{
+ const unsigned long pfn = folio_pfn(folio);
+ const unsigned int order = folio_order(folio);
+ struct kho_mem_track *track = &kho_out.track;
+
+ __kho_unpreserve_order(track, pfn, order);
+}
+EXPORT_SYMBOL_GPL(kho_unpreserve_folio);
+
+/**
+ * kho_preserve_pages - preserve contiguous pages across kexec
+ * @page: first page in the list.
+ * @nr_pages: number of pages.
+ *
+ * Preserve a contiguous list of order 0 pages. Must be restored using
+ * kho_restore_pages() to ensure the pages are restored properly as order 0.
+ *
+ * Return: 0 on success, error code on failure
+ */
+int kho_preserve_pages(struct page *page, unsigned int nr_pages)
+{
+ struct kho_mem_track *track = &kho_out.track;
+ const unsigned long start_pfn = page_to_pfn(page);
+ const unsigned long end_pfn = start_pfn + nr_pages;
+ unsigned long pfn = start_pfn;
+ unsigned long failed_pfn = 0;
+ int err = 0;
+
+ if (WARN_ON(kho_scratch_overlap(start_pfn << PAGE_SHIFT,
+ nr_pages << PAGE_SHIFT))) {
+ return -EINVAL;
+ }
+
+ while (pfn < end_pfn) {
+ const unsigned int order =
+ min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
+
+ err = __kho_preserve_order(track, pfn, order);
+ if (err) {
+ failed_pfn = pfn;
+ break;
+ }
+
+ pfn += 1 << order;
+ }
+
+ if (err)
+ __kho_unpreserve(track, start_pfn, failed_pfn);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(kho_preserve_pages);
+
+/**
+ * kho_unpreserve_pages - unpreserve contiguous pages.
+ * @page: first page in the list.
+ * @nr_pages: number of pages.
+ *
+ * Instructs KHO to unpreserve @nr_pages contiguous pages starting from @page.
+ * This must be called with the same @page and @nr_pages as the corresponding
+ * kho_preserve_pages() call. Unpreserving arbitrary sub-ranges of larger
+ * preserved blocks is not supported.
+ */
+void kho_unpreserve_pages(struct page *page, unsigned int nr_pages)
+{
+ struct kho_mem_track *track = &kho_out.track;
+ const unsigned long start_pfn = page_to_pfn(page);
+ const unsigned long end_pfn = start_pfn + nr_pages;
+
+ __kho_unpreserve(track, start_pfn, end_pfn);
+}
+EXPORT_SYMBOL_GPL(kho_unpreserve_pages);
+
+struct kho_vmalloc_hdr {
+ DECLARE_KHOSER_PTR(next, struct kho_vmalloc_chunk *);
+};
+
+#define KHO_VMALLOC_SIZE \
+ ((PAGE_SIZE - sizeof(struct kho_vmalloc_hdr)) / \
+ sizeof(phys_addr_t))
+
+struct kho_vmalloc_chunk {
+ struct kho_vmalloc_hdr hdr;
+ phys_addr_t phys[KHO_VMALLOC_SIZE];
+};
+
+static_assert(sizeof(struct kho_vmalloc_chunk) == PAGE_SIZE);
+
+/* vmalloc flags KHO supports */
+#define KHO_VMALLOC_SUPPORTED_FLAGS (VM_ALLOC | VM_ALLOW_HUGE_VMAP)
+
+/* KHO internal flags for vmalloc preservations */
+#define KHO_VMALLOC_ALLOC 0x0001
+#define KHO_VMALLOC_HUGE_VMAP 0x0002
+
+static unsigned short vmalloc_flags_to_kho(unsigned int vm_flags)
+{
+ unsigned short kho_flags = 0;
+
+ if (vm_flags & VM_ALLOC)
+ kho_flags |= KHO_VMALLOC_ALLOC;
+ if (vm_flags & VM_ALLOW_HUGE_VMAP)
+ kho_flags |= KHO_VMALLOC_HUGE_VMAP;
+
+ return kho_flags;
+}
+
+static unsigned int kho_flags_to_vmalloc(unsigned short kho_flags)
+{
+ unsigned int vm_flags = 0;
+
+ if (kho_flags & KHO_VMALLOC_ALLOC)
+ vm_flags |= VM_ALLOC;
+ if (kho_flags & KHO_VMALLOC_HUGE_VMAP)
+ vm_flags |= VM_ALLOW_HUGE_VMAP;
+
+ return vm_flags;
+}
+
+static struct kho_vmalloc_chunk *new_vmalloc_chunk(struct kho_vmalloc_chunk *cur)
+{
+ struct kho_vmalloc_chunk *chunk;
+ int err;
+
+ chunk = (struct kho_vmalloc_chunk *)get_zeroed_page(GFP_KERNEL);
+ if (!chunk)
+ return NULL;
+
+ err = kho_preserve_pages(virt_to_page(chunk), 1);
+ if (err)
+ goto err_free;
+ if (cur)
+ KHOSER_STORE_PTR(cur->hdr.next, chunk);
+ return chunk;
+
+err_free:
+ free_page((unsigned long)chunk);
+ return NULL;
+}
+
+static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk,
+ unsigned short order)
+{
+ struct kho_mem_track *track = &kho_out.track;
+ unsigned long pfn = PHYS_PFN(virt_to_phys(chunk));
+
+ __kho_unpreserve(track, pfn, pfn + 1);
+
+ for (int i = 0; i < ARRAY_SIZE(chunk->phys) && chunk->phys[i]; i++) {
+ pfn = PHYS_PFN(chunk->phys[i]);
+ __kho_unpreserve(track, pfn, pfn + (1 << order));
+ }
+}
+
+/**
+ * kho_preserve_vmalloc - preserve memory allocated with vmalloc() across kexec
+ * @ptr: pointer to the area in vmalloc address space
+ * @preservation: placeholder for preservation metadata
+ *
+ * Instructs KHO to preserve the area in vmalloc address space at @ptr. The
+ * physical pages mapped at @ptr will be preserved and on successful return
+ * @preservation will hold the physical address of a structure that describes
+ * the preservation.
+ *
+ * NOTE: The memory allocated with vmalloc_node() variants cannot be reliably
+ * restored on the same node
+ *
+ * Return: 0 on success, error code on failure
+ */
+int kho_preserve_vmalloc(void *ptr, struct kho_vmalloc *preservation)
+{
+ struct kho_vmalloc_chunk *chunk;
+ struct vm_struct *vm = find_vm_area(ptr);
+ unsigned int order, flags, nr_contig_pages;
+ unsigned int idx = 0;
+ int err;
+
+ if (!vm)
+ return -EINVAL;
+
+ if (vm->flags & ~KHO_VMALLOC_SUPPORTED_FLAGS)
+ return -EOPNOTSUPP;
+
+ flags = vmalloc_flags_to_kho(vm->flags);
+ order = get_vm_area_page_order(vm);
+
+ chunk = new_vmalloc_chunk(NULL);
+ if (!chunk)
+ return -ENOMEM;
+ KHOSER_STORE_PTR(preservation->first, chunk);
+
+ nr_contig_pages = (1 << order);
+ for (int i = 0; i < vm->nr_pages; i += nr_contig_pages) {
+ phys_addr_t phys = page_to_phys(vm->pages[i]);
+
+ err = kho_preserve_pages(vm->pages[i], nr_contig_pages);
+ if (err)
+ goto err_free;
+
+ chunk->phys[idx++] = phys;
+ if (idx == ARRAY_SIZE(chunk->phys)) {
+ chunk = new_vmalloc_chunk(chunk);
+ if (!chunk)
+ goto err_free;
+ idx = 0;
+ }
+ }
+
+ preservation->total_pages = vm->nr_pages;
+ preservation->flags = flags;
+ preservation->order = order;
+
+ return 0;
+
+err_free:
+ kho_unpreserve_vmalloc(preservation);
+ return err;
+}
+EXPORT_SYMBOL_GPL(kho_preserve_vmalloc);
+
+/**
+ * kho_unpreserve_vmalloc - unpreserve memory allocated with vmalloc()
+ * @preservation: preservation metadata returned by kho_preserve_vmalloc()
+ *
+ * Instructs KHO to unpreserve the area in vmalloc address space that was
+ * previously preserved with kho_preserve_vmalloc().
+ */
+void kho_unpreserve_vmalloc(struct kho_vmalloc *preservation)
+{
+ struct kho_vmalloc_chunk *chunk = KHOSER_LOAD_PTR(preservation->first);
+
+ while (chunk) {
+ struct kho_vmalloc_chunk *tmp = chunk;
+
+ kho_vmalloc_unpreserve_chunk(chunk, preservation->order);
+
+ chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
+ free_page((unsigned long)tmp);
+ }
+}
+EXPORT_SYMBOL_GPL(kho_unpreserve_vmalloc);
+
+/**
+ * kho_restore_vmalloc - recreates and populates an area in vmalloc address
+ * space from the preserved memory.
+ * @preservation: preservation metadata.
+ *
+ * Recreates an area in vmalloc address space and populates it with memory that
+ * was preserved using kho_preserve_vmalloc().
+ *
+ * Return: pointer to the area in the vmalloc address space, NULL on failure.
+ */
+void *kho_restore_vmalloc(const struct kho_vmalloc *preservation)
+{
+ struct kho_vmalloc_chunk *chunk = KHOSER_LOAD_PTR(preservation->first);
+ unsigned int align, order, shift, vm_flags;
+ unsigned long total_pages, contig_pages;
+ unsigned long addr, size;
+ struct vm_struct *area;
+ struct page **pages;
+ unsigned int idx = 0;
+ int err;
+
+ vm_flags = kho_flags_to_vmalloc(preservation->flags);
+ if (vm_flags & ~KHO_VMALLOC_SUPPORTED_FLAGS)
+ return NULL;
+
+ total_pages = preservation->total_pages;
+ pages = kvmalloc_array(total_pages, sizeof(*pages), GFP_KERNEL);
+ if (!pages)
+ return NULL;
+ order = preservation->order;
+ contig_pages = (1 << order);
+ shift = PAGE_SHIFT + order;
+ align = 1 << shift;
+
+ while (chunk) {
+ struct page *page;
+
+ for (int i = 0; i < ARRAY_SIZE(chunk->phys) && chunk->phys[i]; i++) {
+ phys_addr_t phys = chunk->phys[i];
+
+ if (idx + contig_pages > total_pages)
+ goto err_free_pages_array;
+
+ page = kho_restore_pages(phys, contig_pages);
+ if (!page)
+ goto err_free_pages_array;
+
+ for (int j = 0; j < contig_pages; j++)
+ pages[idx++] = page + j;
+
+ phys += contig_pages * PAGE_SIZE;
+ }
+
+ page = kho_restore_pages(virt_to_phys(chunk), 1);
+ if (!page)
+ goto err_free_pages_array;
+ chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
+ __free_page(page);
+ }
+
+ if (idx != total_pages)
+ goto err_free_pages_array;
+
+ area = __get_vm_area_node(total_pages * PAGE_SIZE, align, shift,
+ vm_flags, VMALLOC_START, VMALLOC_END,
+ NUMA_NO_NODE, GFP_KERNEL,
+ __builtin_return_address(0));
+ if (!area)
+ goto err_free_pages_array;
+
+ addr = (unsigned long)area->addr;
+ size = get_vm_area_size(area);
+ err = vmap_pages_range(addr, addr + size, PAGE_KERNEL, pages, shift);
+ if (err)
+ goto err_free_vm_area;
+
+ area->nr_pages = total_pages;
+ area->pages = pages;
+
+ return area->addr;
+
+err_free_vm_area:
+ free_vm_area(area);
+err_free_pages_array:
+ kvfree(pages);
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(kho_restore_vmalloc);
+
+/**
+ * kho_alloc_preserve - Allocate, zero, and preserve memory.
+ * @size: The number of bytes to allocate.
+ *
+ * Allocates a physically contiguous block of zeroed pages that is large
+ * enough to hold @size bytes. The allocated memory is then registered with
+ * KHO for preservation across a kexec.
+ *
+ * Note: The actual allocated size will be rounded up to the nearest
+ * power-of-two page boundary.
+ *
+ * @return A virtual pointer to the allocated and preserved memory on success,
+ * or an ERR_PTR() encoded error on failure.
+ */
+void *kho_alloc_preserve(size_t size)
+{
+ struct folio *folio;
+ int order, ret;
+
+ if (!size)
+ return ERR_PTR(-EINVAL);
+
+ order = get_order(size);
+ if (order > MAX_PAGE_ORDER)
+ return ERR_PTR(-E2BIG);
+
+ folio = folio_alloc(GFP_KERNEL | __GFP_ZERO, order);
+ if (!folio)
+ return ERR_PTR(-ENOMEM);
+
+ ret = kho_preserve_folio(folio);
+ if (ret) {
+ folio_put(folio);
+ return ERR_PTR(ret);
+ }
+
+ return folio_address(folio);
+}
+EXPORT_SYMBOL_GPL(kho_alloc_preserve);
+
+/**
+ * kho_unpreserve_free - Unpreserve and free memory.
+ * @mem: Pointer to the memory allocated by kho_alloc_preserve().
+ *
+ * Unregisters the memory from KHO preservation and frees the underlying
+ * pages back to the system. This function should be called to clean up
+ * memory allocated with kho_alloc_preserve().
+ */
+void kho_unpreserve_free(void *mem)
+{
+ struct folio *folio;
+
+ if (!mem)
+ return;
+
+ folio = virt_to_folio(mem);
+ kho_unpreserve_folio(folio);
+ folio_put(folio);
+}
+EXPORT_SYMBOL_GPL(kho_unpreserve_free);
+
+/**
+ * kho_restore_free - Restore and free memory after kexec.
+ * @mem: Pointer to the memory (in the new kernel's address space)
+ * that was allocated by the old kernel.
+ *
+ * This function is intended to be called in the new kernel (post-kexec)
+ * to take ownership of and free a memory region that was preserved by the
+ * old kernel using kho_alloc_preserve().
+ *
+ * It first restores the pages from KHO (using their physical address)
+ * and then frees the pages back to the new kernel's page allocator.
+ */
+void kho_restore_free(void *mem)
+{
+ struct folio *folio;
+
+ if (!mem)
+ return;
+
+ folio = kho_restore_folio(__pa(mem));
+ if (!WARN_ON(!folio))
+ folio_put(folio);
+}
+EXPORT_SYMBOL_GPL(kho_restore_free);
+
+int kho_finalize(void)
+{
+ int ret;
+
+ if (!kho_enable)
+ return -EOPNOTSUPP;
+
+ guard(mutex)(&kho_out.lock);
+ ret = kho_mem_serialize(&kho_out);
+ if (ret)
+ return ret;
+
+ kho_out.finalized = true;
+
+ return 0;
+}
+
+bool kho_finalized(void)
+{
+ guard(mutex)(&kho_out.lock);
+ return kho_out.finalized;
+}
+
+struct kho_in {
+ phys_addr_t fdt_phys;
+ phys_addr_t scratch_phys;
+ struct kho_debugfs dbg;
+};
+
+static struct kho_in kho_in = {
+};
+
+static const void *kho_get_fdt(void)
+{
+ return kho_in.fdt_phys ? phys_to_virt(kho_in.fdt_phys) : NULL;
+}
+
+/**
+ * is_kho_boot - check if current kernel was booted via KHO-enabled
+ * kexec
+ *
+ * This function checks if the current kernel was loaded through a kexec
+ * operation with KHO enabled, by verifying that a valid KHO FDT
+ * was passed.
+ *
+ * Note: This function returns reliable results only after
+ * kho_populate() has been called during early boot. Before that,
+ * it may return false even if KHO data is present.
+ *
+ * Return: true if booted via KHO-enabled kexec, false otherwise
+ */
+bool is_kho_boot(void)
+{
+ return !!kho_get_fdt();
+}
+EXPORT_SYMBOL_GPL(is_kho_boot);
+
+/**
+ * kho_retrieve_subtree - retrieve a preserved sub FDT by its name.
+ * @name: the name of the sub FDT passed to kho_add_subtree().
+ * @phys: if found, the physical address of the sub FDT is stored in @phys.
+ *
+ * Retrieve a preserved sub FDT named @name and store its physical
+ * address in @phys.
+ *
+ * Return: 0 on success, error code on failure
+ */
+int kho_retrieve_subtree(const char *name, phys_addr_t *phys)
+{
+ const void *fdt = kho_get_fdt();
+ const u64 *val;
+ int offset, len;
+
+ if (!fdt)
+ return -ENOENT;
+
+ if (!phys)
+ return -EINVAL;
+
+ offset = fdt_subnode_offset(fdt, 0, name);
+ if (offset < 0)
+ return -ENOENT;
+
+ val = fdt_getprop(fdt, offset, PROP_SUB_FDT, &len);
+ if (!val || len != sizeof(*val))
+ return -EINVAL;
+
+ *phys = (phys_addr_t)*val;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kho_retrieve_subtree);
+
+static __init int kho_out_fdt_setup(void)
+{
+ void *root = kho_out.fdt;
+ u64 empty_mem_map = 0;
+ int err;
+
+ err = fdt_create(root, PAGE_SIZE);
+ err |= fdt_finish_reservemap(root);
+ err |= fdt_begin_node(root, "");
+ err |= fdt_property_string(root, "compatible", KHO_FDT_COMPATIBLE);
+ err |= fdt_property(root, PROP_PRESERVED_MEMORY_MAP, &empty_mem_map,
+ sizeof(empty_mem_map));
+ err |= fdt_end_node(root);
+ err |= fdt_finish(root);
+
+ return err;
+}
+
+static __init int kho_init(void)
+{
+ const void *fdt = kho_get_fdt();
+ int err = 0;
+
+ if (!kho_enable)
+ return 0;
+
+ kho_out.fdt = kho_alloc_preserve(PAGE_SIZE);
+ if (IS_ERR(kho_out.fdt)) {
+ err = PTR_ERR(kho_out.fdt);
+ goto err_free_scratch;
+ }
+
+ err = kho_debugfs_init();
+ if (err)
+ goto err_free_fdt;
+
+ err = kho_out_debugfs_init(&kho_out.dbg);
+ if (err)
+ goto err_free_fdt;
+
+ err = kho_out_fdt_setup();
+ if (err)
+ goto err_free_fdt;
+
+ if (fdt) {
+ kho_in_debugfs_init(&kho_in.dbg, fdt);
+ return 0;
+ }
+
+ for (int i = 0; i < kho_scratch_cnt; i++) {
+ unsigned long base_pfn = PHYS_PFN(kho_scratch[i].addr);
+ unsigned long count = kho_scratch[i].size >> PAGE_SHIFT;
+ unsigned long pfn;
+
+ /*
+ * When debug_pagealloc is enabled, __free_pages() clears the
+ * corresponding PRESENT bit in the kernel page table.
+ * Subsequent kmemleak scans of these pages cause the
+ * non-PRESENT page faults.
+ * Mark scratch areas with kmemleak_ignore_phys() to exclude
+ * them from kmemleak scanning.
+ */
+ kmemleak_ignore_phys(kho_scratch[i].addr);
+ for (pfn = base_pfn; pfn < base_pfn + count;
+ pfn += pageblock_nr_pages)
+ init_cma_reserved_pageblock(pfn_to_page(pfn));
+ }
+
+ WARN_ON_ONCE(kho_debugfs_fdt_add(&kho_out.dbg, "fdt",
+ kho_out.fdt, true));
+
+ return 0;
+
+err_free_fdt:
+ kho_unpreserve_free(kho_out.fdt);
+err_free_scratch:
+ kho_out.fdt = NULL;
+ for (int i = 0; i < kho_scratch_cnt; i++) {
+ void *start = __va(kho_scratch[i].addr);
+ void *end = start + kho_scratch[i].size;
+
+ free_reserved_area(start, end, -1, "");
+ }
+ kho_enable = false;
+ return err;
+}
+fs_initcall(kho_init);
+
+static void __init kho_release_scratch(void)
+{
+ phys_addr_t start, end;
+ u64 i;
+
+ memmap_init_kho_scratch_pages();
+
+ /*
+ * Mark scratch mem as CMA before we return it. That way we
+ * ensure that no kernel allocations happen on it. That means
+ * we can reuse it as scratch memory again later.
+ */
+ __for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE,
+ MEMBLOCK_KHO_SCRATCH, &start, &end, NULL) {
+ ulong start_pfn = pageblock_start_pfn(PFN_DOWN(start));
+ ulong end_pfn = pageblock_align(PFN_UP(end));
+ ulong pfn;
+
+ for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages)
+ init_pageblock_migratetype(pfn_to_page(pfn),
+ MIGRATE_CMA, false);
+ }
+}
+
+void __init kho_memory_init(void)
+{
+ if (kho_in.scratch_phys) {
+ kho_scratch = phys_to_virt(kho_in.scratch_phys);
+ kho_release_scratch();
+
+ if (!kho_mem_deserialize(kho_get_fdt()))
+ kho_in.fdt_phys = 0;
+ } else {
+ kho_reserve_scratch();
+ }
+}
+
+void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len,
+ phys_addr_t scratch_phys, u64 scratch_len)
+{
+ void *fdt = NULL;
+ struct kho_scratch *scratch = NULL;
+ int err = 0;
+ unsigned int scratch_cnt = scratch_len / sizeof(*kho_scratch);
+
+ /* Validate the input FDT */
+ fdt = early_memremap(fdt_phys, fdt_len);
+ if (!fdt) {
+ pr_warn("setup: failed to memremap FDT (0x%llx)\n", fdt_phys);
+ err = -EFAULT;
+ goto out;
+ }
+ err = fdt_check_header(fdt);
+ if (err) {
+ pr_warn("setup: handover FDT (0x%llx) is invalid: %d\n",
+ fdt_phys, err);
+ err = -EINVAL;
+ goto out;
+ }
+ err = fdt_node_check_compatible(fdt, 0, KHO_FDT_COMPATIBLE);
+ if (err) {
+ pr_warn("setup: handover FDT (0x%llx) is incompatible with '%s': %d\n",
+ fdt_phys, KHO_FDT_COMPATIBLE, err);
+ err = -EINVAL;
+ goto out;
+ }
+
+ scratch = early_memremap(scratch_phys, scratch_len);
+ if (!scratch) {
+ pr_warn("setup: failed to memremap scratch (phys=0x%llx, len=%lld)\n",
+ scratch_phys, scratch_len);
+ err = -EFAULT;
+ goto out;
+ }
+
+ /*
+ * We pass a safe contiguous blocks of memory to use for early boot
+ * purporses from the previous kernel so that we can resize the
+ * memblock array as needed.
+ */
+ for (int i = 0; i < scratch_cnt; i++) {
+ struct kho_scratch *area = &scratch[i];
+ u64 size = area->size;
+
+ memblock_add(area->addr, size);
+ err = memblock_mark_kho_scratch(area->addr, size);
+ if (WARN_ON(err)) {
+ pr_warn("failed to mark the scratch region 0x%pa+0x%pa: %pe",
+ &area->addr, &size, ERR_PTR(err));
+ goto out;
+ }
+ pr_debug("Marked 0x%pa+0x%pa as scratch", &area->addr, &size);
+ }
+
+ memblock_reserve(scratch_phys, scratch_len);
+
+ /*
+ * Now that we have a viable region of scratch memory, let's tell
+ * the memblocks allocator to only use that for any allocations.
+ * That way we ensure that nothing scribbles over in use data while
+ * we initialize the page tables which we will need to ingest all
+ * memory reservations from the previous kernel.
+ */
+ memblock_set_kho_scratch_only();
+
+ kho_in.fdt_phys = fdt_phys;
+ kho_in.scratch_phys = scratch_phys;
+ kho_scratch_cnt = scratch_cnt;
+ pr_info("found kexec handover data.\n");
+
+out:
+ if (fdt)
+ early_memunmap(fdt, fdt_len);
+ if (scratch)
+ early_memunmap(scratch, scratch_len);
+ if (err)
+ pr_warn("disabling KHO revival: %d\n", err);
+}
+
+/* Helper functions for kexec_file_load */
+
+int kho_fill_kimage(struct kimage *image)
+{
+ ssize_t scratch_size;
+ int err = 0;
+ struct kexec_buf scratch;
+
+ if (!kho_enable)
+ return 0;
+
+ image->kho.fdt = virt_to_phys(kho_out.fdt);
+
+ scratch_size = sizeof(*kho_scratch) * kho_scratch_cnt;
+ scratch = (struct kexec_buf){
+ .image = image,
+ .buffer = kho_scratch,
+ .bufsz = scratch_size,
+ .mem = KEXEC_BUF_MEM_UNKNOWN,
+ .memsz = scratch_size,
+ .buf_align = SZ_64K, /* Makes it easier to map */
+ .buf_max = ULONG_MAX,
+ .top_down = true,
+ };
+ err = kexec_add_buffer(&scratch);
+ if (err)
+ return err;
+ image->kho.scratch = &image->segment[image->nr_segments - 1];
+
+ return 0;
+}
+
+static int kho_walk_scratch(struct kexec_buf *kbuf,
+ int (*func)(struct resource *, void *))
+{
+ int ret = 0;
+ int i;
+
+ for (i = 0; i < kho_scratch_cnt; i++) {
+ struct resource res = {
+ .start = kho_scratch[i].addr,
+ .end = kho_scratch[i].addr + kho_scratch[i].size - 1,
+ };
+
+ /* Try to fit the kimage into our KHO scratch region */
+ ret = func(&res, kbuf);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+int kho_locate_mem_hole(struct kexec_buf *kbuf,
+ int (*func)(struct resource *, void *))
+{
+ int ret;
+
+ if (!kho_enable || kbuf->image->type == KEXEC_TYPE_CRASH)
+ return 1;
+
+ ret = kho_walk_scratch(kbuf, func);
+
+ return ret == 1 ? 0 : -EADDRNOTAVAIL;
+}
diff --git a/kernel/liveupdate/kexec_handover_debug.c b/kernel/liveupdate/kexec_handover_debug.c
new file mode 100644
index 000000000000..6efb696f5426
--- /dev/null
+++ b/kernel/liveupdate/kexec_handover_debug.c
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * kexec_handover_debug.c - kexec handover optional debug functionality
+ * Copyright (C) 2025 Google LLC, Pasha Tatashin <pasha.tatashin@soleen.com>
+ */
+
+#define pr_fmt(fmt) "KHO: " fmt
+
+#include "kexec_handover_internal.h"
+
+bool kho_scratch_overlap(phys_addr_t phys, size_t size)
+{
+ phys_addr_t scratch_start, scratch_end;
+ unsigned int i;
+
+ for (i = 0; i < kho_scratch_cnt; i++) {
+ scratch_start = kho_scratch[i].addr;
+ scratch_end = kho_scratch[i].addr + kho_scratch[i].size;
+
+ if (phys < scratch_end && (phys + size) > scratch_start)
+ return true;
+ }
+
+ return false;
+}
diff --git a/kernel/liveupdate/kexec_handover_debugfs.c b/kernel/liveupdate/kexec_handover_debugfs.c
new file mode 100644
index 000000000000..2abbf62ba942
--- /dev/null
+++ b/kernel/liveupdate/kexec_handover_debugfs.c
@@ -0,0 +1,221 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * kexec_handover_debugfs.c - kexec handover debugfs interfaces
+ * Copyright (C) 2023 Alexander Graf <graf@amazon.com>
+ * Copyright (C) 2025 Microsoft Corporation, Mike Rapoport <rppt@kernel.org>
+ * Copyright (C) 2025 Google LLC, Changyuan Lyu <changyuanl@google.com>
+ * Copyright (C) 2025 Google LLC, Pasha Tatashin <pasha.tatashin@soleen.com>
+ */
+
+#define pr_fmt(fmt) "KHO: " fmt
+
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/libfdt.h>
+#include <linux/mm.h>
+#include "kexec_handover_internal.h"
+
+static struct dentry *debugfs_root;
+
+struct fdt_debugfs {
+ struct list_head list;
+ struct debugfs_blob_wrapper wrapper;
+ struct dentry *file;
+};
+
+static int __kho_debugfs_fdt_add(struct list_head *list, struct dentry *dir,
+ const char *name, const void *fdt)
+{
+ struct fdt_debugfs *f;
+ struct dentry *file;
+
+ f = kmalloc(sizeof(*f), GFP_KERNEL);
+ if (!f)
+ return -ENOMEM;
+
+ f->wrapper.data = (void *)fdt;
+ f->wrapper.size = fdt_totalsize(fdt);
+
+ file = debugfs_create_blob(name, 0400, dir, &f->wrapper);
+ if (IS_ERR(file)) {
+ kfree(f);
+ return PTR_ERR(file);
+ }
+
+ f->file = file;
+ list_add(&f->list, list);
+
+ return 0;
+}
+
+int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name,
+ const void *fdt, bool root)
+{
+ struct dentry *dir;
+
+ if (root)
+ dir = dbg->dir;
+ else
+ dir = dbg->sub_fdt_dir;
+
+ return __kho_debugfs_fdt_add(&dbg->fdt_list, dir, name, fdt);
+}
+
+void kho_debugfs_fdt_remove(struct kho_debugfs *dbg, void *fdt)
+{
+ struct fdt_debugfs *ff;
+
+ list_for_each_entry(ff, &dbg->fdt_list, list) {
+ if (ff->wrapper.data == fdt) {
+ debugfs_remove(ff->file);
+ list_del(&ff->list);
+ kfree(ff);
+ break;
+ }
+ }
+}
+
+static int kho_out_finalize_get(void *data, u64 *val)
+{
+ *val = kho_finalized();
+
+ return 0;
+}
+
+static int kho_out_finalize_set(void *data, u64 val)
+{
+ if (val)
+ return kho_finalize();
+ else
+ return -EINVAL;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(kho_out_finalize_fops, kho_out_finalize_get,
+ kho_out_finalize_set, "%llu\n");
+
+static int scratch_phys_show(struct seq_file *m, void *v)
+{
+ for (int i = 0; i < kho_scratch_cnt; i++)
+ seq_printf(m, "0x%llx\n", kho_scratch[i].addr);
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(scratch_phys);
+
+static int scratch_len_show(struct seq_file *m, void *v)
+{
+ for (int i = 0; i < kho_scratch_cnt; i++)
+ seq_printf(m, "0x%llx\n", kho_scratch[i].size);
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(scratch_len);
+
+__init void kho_in_debugfs_init(struct kho_debugfs *dbg, const void *fdt)
+{
+ struct dentry *dir, *sub_fdt_dir;
+ int err, child;
+
+ INIT_LIST_HEAD(&dbg->fdt_list);
+
+ dir = debugfs_create_dir("in", debugfs_root);
+ if (IS_ERR(dir)) {
+ err = PTR_ERR(dir);
+ goto err_out;
+ }
+
+ sub_fdt_dir = debugfs_create_dir("sub_fdts", dir);
+ if (IS_ERR(sub_fdt_dir)) {
+ err = PTR_ERR(sub_fdt_dir);
+ goto err_rmdir;
+ }
+
+ err = __kho_debugfs_fdt_add(&dbg->fdt_list, dir, "fdt", fdt);
+ if (err)
+ goto err_rmdir;
+
+ fdt_for_each_subnode(child, fdt, 0) {
+ int len = 0;
+ const char *name = fdt_get_name(fdt, child, NULL);
+ const u64 *fdt_phys;
+
+ fdt_phys = fdt_getprop(fdt, child, "fdt", &len);
+ if (!fdt_phys)
+ continue;
+ if (len != sizeof(*fdt_phys)) {
+ pr_warn("node %s prop fdt has invalid length: %d\n",
+ name, len);
+ continue;
+ }
+ err = __kho_debugfs_fdt_add(&dbg->fdt_list, sub_fdt_dir, name,
+ phys_to_virt(*fdt_phys));
+ if (err) {
+ pr_warn("failed to add fdt %s to debugfs: %pe\n", name,
+ ERR_PTR(err));
+ continue;
+ }
+ }
+
+ dbg->dir = dir;
+ dbg->sub_fdt_dir = sub_fdt_dir;
+
+ return;
+err_rmdir:
+ debugfs_remove_recursive(dir);
+err_out:
+ /*
+ * Failure to create /sys/kernel/debug/kho/in does not prevent
+ * reviving state from KHO and setting up KHO for the next
+ * kexec.
+ */
+ if (err) {
+ pr_err("failed exposing handover FDT in debugfs: %pe\n",
+ ERR_PTR(err));
+ }
+}
+
+__init int kho_out_debugfs_init(struct kho_debugfs *dbg)
+{
+ struct dentry *dir, *f, *sub_fdt_dir;
+
+ INIT_LIST_HEAD(&dbg->fdt_list);
+
+ dir = debugfs_create_dir("out", debugfs_root);
+ if (IS_ERR(dir))
+ return -ENOMEM;
+
+ sub_fdt_dir = debugfs_create_dir("sub_fdts", dir);
+ if (IS_ERR(sub_fdt_dir))
+ goto err_rmdir;
+
+ f = debugfs_create_file("scratch_phys", 0400, dir, NULL,
+ &scratch_phys_fops);
+ if (IS_ERR(f))
+ goto err_rmdir;
+
+ f = debugfs_create_file("scratch_len", 0400, dir, NULL,
+ &scratch_len_fops);
+ if (IS_ERR(f))
+ goto err_rmdir;
+
+ f = debugfs_create_file("finalize", 0600, dir, NULL,
+ &kho_out_finalize_fops);
+ if (IS_ERR(f))
+ goto err_rmdir;
+
+ dbg->dir = dir;
+ dbg->sub_fdt_dir = sub_fdt_dir;
+ return 0;
+
+err_rmdir:
+ debugfs_remove_recursive(dir);
+ return -ENOENT;
+}
+
+__init int kho_debugfs_init(void)
+{
+ debugfs_root = debugfs_create_dir("kho", NULL);
+ if (IS_ERR(debugfs_root))
+ return -ENOENT;
+ return 0;
+}
diff --git a/kernel/liveupdate/kexec_handover_internal.h b/kernel/liveupdate/kexec_handover_internal.h
new file mode 100644
index 000000000000..0202c85ad14f
--- /dev/null
+++ b/kernel/liveupdate/kexec_handover_internal.h
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef LINUX_KEXEC_HANDOVER_INTERNAL_H
+#define LINUX_KEXEC_HANDOVER_INTERNAL_H
+
+#include <linux/kexec_handover.h>
+#include <linux/list.h>
+#include <linux/types.h>
+
+#ifdef CONFIG_KEXEC_HANDOVER_DEBUGFS
+#include <linux/debugfs.h>
+
+struct kho_debugfs {
+ struct dentry *dir;
+ struct dentry *sub_fdt_dir;
+ struct list_head fdt_list;
+};
+
+#else
+struct kho_debugfs {};
+#endif
+
+extern struct kho_scratch *kho_scratch;
+extern unsigned int kho_scratch_cnt;
+
+bool kho_finalized(void);
+int kho_finalize(void);
+
+#ifdef CONFIG_KEXEC_HANDOVER_DEBUGFS
+int kho_debugfs_init(void);
+void kho_in_debugfs_init(struct kho_debugfs *dbg, const void *fdt);
+int kho_out_debugfs_init(struct kho_debugfs *dbg);
+int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name,
+ const void *fdt, bool root);
+void kho_debugfs_fdt_remove(struct kho_debugfs *dbg, void *fdt);
+#else
+static inline int kho_debugfs_init(void) { return 0; }
+static inline void kho_in_debugfs_init(struct kho_debugfs *dbg,
+ const void *fdt) { }
+static inline int kho_out_debugfs_init(struct kho_debugfs *dbg) { return 0; }
+static inline int kho_debugfs_fdt_add(struct kho_debugfs *dbg, const char *name,
+ const void *fdt, bool root) { return 0; }
+static inline void kho_debugfs_fdt_remove(struct kho_debugfs *dbg,
+ void *fdt) { }
+#endif /* CONFIG_KEXEC_HANDOVER_DEBUGFS */
+
+#ifdef CONFIG_KEXEC_HANDOVER_DEBUG
+bool kho_scratch_overlap(phys_addr_t phys, size_t size);
+#else
+static inline bool kho_scratch_overlap(phys_addr_t phys, size_t size)
+{
+ return false;
+}
+#endif /* CONFIG_KEXEC_HANDOVER_DEBUG */
+
+#endif /* LINUX_KEXEC_HANDOVER_INTERNAL_H */
diff --git a/kernel/liveupdate/luo_core.c b/kernel/liveupdate/luo_core.c
new file mode 100644
index 000000000000..f7ecaf7740d1
--- /dev/null
+++ b/kernel/liveupdate/luo_core.c
@@ -0,0 +1,450 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (c) 2025, Google LLC.
+ * Pasha Tatashin <pasha.tatashin@soleen.com>
+ */
+
+/**
+ * DOC: Live Update Orchestrator (LUO)
+ *
+ * Live Update is a specialized, kexec-based reboot process that allows a
+ * running kernel to be updated from one version to another while preserving
+ * the state of selected resources and keeping designated hardware devices
+ * operational. For these devices, DMA activity may continue throughout the
+ * kernel transition.
+ *
+ * While the primary use case driving this work is supporting live updates of
+ * the Linux kernel when it is used as a hypervisor in cloud environments, the
+ * LUO framework itself is designed to be workload-agnostic. Live Update
+ * facilitates a full kernel version upgrade for any type of system.
+ *
+ * For example, a non-hypervisor system running an in-memory cache like
+ * memcached with many gigabytes of data can use LUO. The userspace service
+ * can place its cache into a memfd, have its state preserved by LUO, and
+ * restore it immediately after the kernel kexec.
+ *
+ * Whether the system is running virtual machines, containers, a
+ * high-performance database, or networking services, LUO's primary goal is to
+ * enable a full kernel update by preserving critical userspace state and
+ * keeping essential devices operational.
+ *
+ * The core of LUO is a mechanism that tracks the progress of a live update,
+ * along with a callback API that allows other kernel subsystems to participate
+ * in the process. Example subsystems that can hook into LUO include: kvm,
+ * iommu, interrupts, vfio, participating filesystems, and memory management.
+ *
+ * LUO uses Kexec Handover to transfer memory state from the current kernel to
+ * the next kernel. For more details see
+ * Documentation/core-api/kho/concepts.rst.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/atomic.h>
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/kexec_handover.h>
+#include <linux/kho/abi/luo.h>
+#include <linux/kobject.h>
+#include <linux/libfdt.h>
+#include <linux/liveupdate.h>
+#include <linux/miscdevice.h>
+#include <linux/mm.h>
+#include <linux/sizes.h>
+#include <linux/string.h>
+#include <linux/unaligned.h>
+
+#include "kexec_handover_internal.h"
+#include "luo_internal.h"
+
+static struct {
+ bool enabled;
+ void *fdt_out;
+ void *fdt_in;
+ u64 liveupdate_num;
+} luo_global;
+
+static int __init early_liveupdate_param(char *buf)
+{
+ return kstrtobool(buf, &luo_global.enabled);
+}
+early_param("liveupdate", early_liveupdate_param);
+
+static int __init luo_early_startup(void)
+{
+ phys_addr_t fdt_phys;
+ int err, ln_size;
+ const void *ptr;
+
+ if (!kho_is_enabled()) {
+ if (liveupdate_enabled())
+ pr_warn("Disabling liveupdate because KHO is disabled\n");
+ luo_global.enabled = false;
+ return 0;
+ }
+
+ /* Retrieve LUO subtree, and verify its format. */
+ err = kho_retrieve_subtree(LUO_FDT_KHO_ENTRY_NAME, &fdt_phys);
+ if (err) {
+ if (err != -ENOENT) {
+ pr_err("failed to retrieve FDT '%s' from KHO: %pe\n",
+ LUO_FDT_KHO_ENTRY_NAME, ERR_PTR(err));
+ return err;
+ }
+
+ return 0;
+ }
+
+ luo_global.fdt_in = phys_to_virt(fdt_phys);
+ err = fdt_node_check_compatible(luo_global.fdt_in, 0,
+ LUO_FDT_COMPATIBLE);
+ if (err) {
+ pr_err("FDT '%s' is incompatible with '%s' [%d]\n",
+ LUO_FDT_KHO_ENTRY_NAME, LUO_FDT_COMPATIBLE, err);
+
+ return -EINVAL;
+ }
+
+ ln_size = 0;
+ ptr = fdt_getprop(luo_global.fdt_in, 0, LUO_FDT_LIVEUPDATE_NUM,
+ &ln_size);
+ if (!ptr || ln_size != sizeof(luo_global.liveupdate_num)) {
+ pr_err("Unable to get live update number '%s' [%d]\n",
+ LUO_FDT_LIVEUPDATE_NUM, ln_size);
+
+ return -EINVAL;
+ }
+
+ luo_global.liveupdate_num = get_unaligned((u64 *)ptr);
+ pr_info("Retrieved live update data, liveupdate number: %lld\n",
+ luo_global.liveupdate_num);
+
+ err = luo_session_setup_incoming(luo_global.fdt_in);
+ if (err)
+ return err;
+
+ return 0;
+}
+
+static int __init liveupdate_early_init(void)
+{
+ int err;
+
+ err = luo_early_startup();
+ if (err) {
+ luo_global.enabled = false;
+ luo_restore_fail("The incoming tree failed to initialize properly [%pe], disabling live update\n",
+ ERR_PTR(err));
+ }
+
+ return err;
+}
+early_initcall(liveupdate_early_init);
+
+/* Called during boot to create outgoing LUO fdt tree */
+static int __init luo_fdt_setup(void)
+{
+ const u64 ln = luo_global.liveupdate_num + 1;
+ void *fdt_out;
+ int err;
+
+ fdt_out = kho_alloc_preserve(LUO_FDT_SIZE);
+ if (IS_ERR(fdt_out)) {
+ pr_err("failed to allocate/preserve FDT memory\n");
+ return PTR_ERR(fdt_out);
+ }
+
+ err = fdt_create(fdt_out, LUO_FDT_SIZE);
+ err |= fdt_finish_reservemap(fdt_out);
+ err |= fdt_begin_node(fdt_out, "");
+ err |= fdt_property_string(fdt_out, "compatible", LUO_FDT_COMPATIBLE);
+ err |= fdt_property(fdt_out, LUO_FDT_LIVEUPDATE_NUM, &ln, sizeof(ln));
+ err |= luo_session_setup_outgoing(fdt_out);
+ err |= fdt_end_node(fdt_out);
+ err |= fdt_finish(fdt_out);
+ if (err)
+ goto exit_free;
+
+ err = kho_add_subtree(LUO_FDT_KHO_ENTRY_NAME, fdt_out);
+ if (err)
+ goto exit_free;
+ luo_global.fdt_out = fdt_out;
+
+ return 0;
+
+exit_free:
+ kho_unpreserve_free(fdt_out);
+ pr_err("failed to prepare LUO FDT: %d\n", err);
+
+ return err;
+}
+
+/*
+ * late initcall because it initializes the outgoing tree that is needed only
+ * once userspace starts using /dev/liveupdate.
+ */
+static int __init luo_late_startup(void)
+{
+ int err;
+
+ if (!liveupdate_enabled())
+ return 0;
+
+ err = luo_fdt_setup();
+ if (err)
+ luo_global.enabled = false;
+
+ return err;
+}
+late_initcall(luo_late_startup);
+
+/* Public Functions */
+
+/**
+ * liveupdate_reboot() - Kernel reboot notifier for live update final
+ * serialization.
+ *
+ * This function is invoked directly from the reboot() syscall pathway
+ * if kexec is in progress.
+ *
+ * If any callback fails, this function aborts KHO, undoes the freeze()
+ * callbacks, and returns an error.
+ */
+int liveupdate_reboot(void)
+{
+ int err;
+
+ if (!liveupdate_enabled())
+ return 0;
+
+ err = luo_session_serialize();
+ if (err)
+ return err;
+
+ err = kho_finalize();
+ if (err) {
+ pr_err("kho_finalize failed %d\n", err);
+ /*
+ * kho_finalize() may return libfdt errors, to aboid passing to
+ * userspace unknown errors, change this to EAGAIN.
+ */
+ err = -EAGAIN;
+ }
+
+ return err;
+}
+
+/**
+ * liveupdate_enabled - Check if the live update feature is enabled.
+ *
+ * This function returns the state of the live update feature flag, which
+ * can be controlled via the ``liveupdate`` kernel command-line parameter.
+ *
+ * @return true if live update is enabled, false otherwise.
+ */
+bool liveupdate_enabled(void)
+{
+ return luo_global.enabled;
+}
+
+/**
+ * DOC: LUO ioctl Interface
+ *
+ * The IOCTL user-space control interface for the LUO subsystem.
+ * It registers a character device, typically found at ``/dev/liveupdate``,
+ * which allows a userspace agent to manage the LUO state machine and its
+ * associated resources, such as preservable file descriptors.
+ *
+ * To ensure that the state machine is controlled by a single entity, access
+ * to this device is exclusive: only one process is permitted to have
+ * ``/dev/liveupdate`` open at any given time. Subsequent open attempts will
+ * fail with -EBUSY until the first process closes its file descriptor.
+ * This singleton model simplifies state management by preventing conflicting
+ * commands from multiple userspace agents.
+ */
+
+struct luo_device_state {
+ struct miscdevice miscdev;
+ atomic_t in_use;
+};
+
+static int luo_ioctl_create_session(struct luo_ucmd *ucmd)
+{
+ struct liveupdate_ioctl_create_session *argp = ucmd->cmd;
+ struct file *file;
+ int err;
+
+ argp->fd = get_unused_fd_flags(O_CLOEXEC);
+ if (argp->fd < 0)
+ return argp->fd;
+
+ err = luo_session_create(argp->name, &file);
+ if (err)
+ goto err_put_fd;
+
+ err = luo_ucmd_respond(ucmd, sizeof(*argp));
+ if (err)
+ goto err_put_file;
+
+ fd_install(argp->fd, file);
+
+ return 0;
+
+err_put_file:
+ fput(file);
+err_put_fd:
+ put_unused_fd(argp->fd);
+
+ return err;
+}
+
+static int luo_ioctl_retrieve_session(struct luo_ucmd *ucmd)
+{
+ struct liveupdate_ioctl_retrieve_session *argp = ucmd->cmd;
+ struct file *file;
+ int err;
+
+ argp->fd = get_unused_fd_flags(O_CLOEXEC);
+ if (argp->fd < 0)
+ return argp->fd;
+
+ err = luo_session_retrieve(argp->name, &file);
+ if (err < 0)
+ goto err_put_fd;
+
+ err = luo_ucmd_respond(ucmd, sizeof(*argp));
+ if (err)
+ goto err_put_file;
+
+ fd_install(argp->fd, file);
+
+ return 0;
+
+err_put_file:
+ fput(file);
+err_put_fd:
+ put_unused_fd(argp->fd);
+
+ return err;
+}
+
+static int luo_open(struct inode *inodep, struct file *filep)
+{
+ struct luo_device_state *ldev = container_of(filep->private_data,
+ struct luo_device_state,
+ miscdev);
+
+ if (atomic_cmpxchg(&ldev->in_use, 0, 1))
+ return -EBUSY;
+
+ /* Always return -EIO to user if deserialization fail */
+ if (luo_session_deserialize()) {
+ atomic_set(&ldev->in_use, 0);
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static int luo_release(struct inode *inodep, struct file *filep)
+{
+ struct luo_device_state *ldev = container_of(filep->private_data,
+ struct luo_device_state,
+ miscdev);
+ atomic_set(&ldev->in_use, 0);
+
+ return 0;
+}
+
+union ucmd_buffer {
+ struct liveupdate_ioctl_create_session create;
+ struct liveupdate_ioctl_retrieve_session retrieve;
+};
+
+struct luo_ioctl_op {
+ unsigned int size;
+ unsigned int min_size;
+ unsigned int ioctl_num;
+ int (*execute)(struct luo_ucmd *ucmd);
+};
+
+#define IOCTL_OP(_ioctl, _fn, _struct, _last) \
+ [_IOC_NR(_ioctl) - LIVEUPDATE_CMD_BASE] = { \
+ .size = sizeof(_struct) + \
+ BUILD_BUG_ON_ZERO(sizeof(union ucmd_buffer) < \
+ sizeof(_struct)), \
+ .min_size = offsetofend(_struct, _last), \
+ .ioctl_num = _ioctl, \
+ .execute = _fn, \
+ }
+
+static const struct luo_ioctl_op luo_ioctl_ops[] = {
+ IOCTL_OP(LIVEUPDATE_IOCTL_CREATE_SESSION, luo_ioctl_create_session,
+ struct liveupdate_ioctl_create_session, name),
+ IOCTL_OP(LIVEUPDATE_IOCTL_RETRIEVE_SESSION, luo_ioctl_retrieve_session,
+ struct liveupdate_ioctl_retrieve_session, name),
+};
+
+static long luo_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
+{
+ const struct luo_ioctl_op *op;
+ struct luo_ucmd ucmd = {};
+ union ucmd_buffer buf;
+ unsigned int nr;
+ int err;
+
+ nr = _IOC_NR(cmd);
+ if (nr < LIVEUPDATE_CMD_BASE ||
+ (nr - LIVEUPDATE_CMD_BASE) >= ARRAY_SIZE(luo_ioctl_ops)) {
+ return -EINVAL;
+ }
+
+ ucmd.ubuffer = (void __user *)arg;
+ err = get_user(ucmd.user_size, (u32 __user *)ucmd.ubuffer);
+ if (err)
+ return err;
+
+ op = &luo_ioctl_ops[nr - LIVEUPDATE_CMD_BASE];
+ if (op->ioctl_num != cmd)
+ return -ENOIOCTLCMD;
+ if (ucmd.user_size < op->min_size)
+ return -EINVAL;
+
+ ucmd.cmd = &buf;
+ err = copy_struct_from_user(ucmd.cmd, op->size, ucmd.ubuffer,
+ ucmd.user_size);
+ if (err)
+ return err;
+
+ return op->execute(&ucmd);
+}
+
+static const struct file_operations luo_fops = {
+ .owner = THIS_MODULE,
+ .open = luo_open,
+ .release = luo_release,
+ .unlocked_ioctl = luo_ioctl,
+};
+
+static struct luo_device_state luo_dev = {
+ .miscdev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "liveupdate",
+ .fops = &luo_fops,
+ },
+ .in_use = ATOMIC_INIT(0),
+};
+
+static int __init liveupdate_ioctl_init(void)
+{
+ if (!liveupdate_enabled())
+ return 0;
+
+ return misc_register(&luo_dev.miscdev);
+}
+late_initcall(liveupdate_ioctl_init);
diff --git a/kernel/liveupdate/luo_file.c b/kernel/liveupdate/luo_file.c
new file mode 100644
index 000000000000..ddff87917b21
--- /dev/null
+++ b/kernel/liveupdate/luo_file.c
@@ -0,0 +1,889 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (c) 2025, Google LLC.
+ * Pasha Tatashin <pasha.tatashin@soleen.com>
+ */
+
+/**
+ * DOC: LUO File Descriptors
+ *
+ * LUO provides the infrastructure to preserve specific, stateful file
+ * descriptors across a kexec-based live update. The primary goal is to allow
+ * workloads, such as virtual machines using vfio, memfd, or iommufd, to
+ * retain access to their essential resources without interruption.
+ *
+ * The framework is built around a callback-based handler model and a well-
+ * defined lifecycle for each preserved file.
+ *
+ * Handler Registration:
+ * Kernel modules responsible for a specific file type (e.g., memfd, vfio)
+ * register a &struct liveupdate_file_handler. This handler provides a set of
+ * callbacks that LUO invokes at different stages of the update process, most
+ * notably:
+ *
+ * - can_preserve(): A lightweight check to determine if the handler is
+ * compatible with a given 'struct file'.
+ * - preserve(): The heavyweight operation that saves the file's state and
+ * returns an opaque u64 handle. This is typically performed while the
+ * workload is still active to minimize the downtime during the
+ * actual reboot transition.
+ * - unpreserve(): Cleans up any resources allocated by .preserve(), called
+ * if the preservation process is aborted before the reboot (i.e. session is
+ * closed).
+ * - freeze(): A final pre-reboot opportunity to prepare the state for kexec.
+ * We are already in reboot syscall, and therefore userspace cannot mutate
+ * the file anymore.
+ * - unfreeze(): Undoes the actions of .freeze(), called if the live update
+ * is aborted after the freeze phase.
+ * - retrieve(): Reconstructs the file in the new kernel from the preserved
+ * handle.
+ * - finish(): Performs final check and cleanup in the new kernel. After
+ * succesul finish call, LUO gives up ownership to this file.
+ *
+ * File Preservation Lifecycle happy path:
+ *
+ * 1. Preserve (Normal Operation): A userspace agent preserves files one by one
+ * via an ioctl. For each file, luo_preserve_file() finds a compatible
+ * handler, calls its .preserve() operation, and creates an internal &struct
+ * luo_file to track the live state.
+ *
+ * 2. Freeze (Pre-Reboot): Just before the kexec, luo_file_freeze() is called.
+ * It iterates through all preserved files, calls their respective .freeze()
+ * operation, and serializes their final metadata (compatible string, token,
+ * and data handle) into a contiguous memory block for KHO.
+ *
+ * 3. Deserialize: After kexec, luo_file_deserialize() runs when session gets
+ * deserialized (which is when /dev/liveupdate is first opened). It reads the
+ * serialized data from the KHO memory region and reconstructs the in-memory
+ * list of &struct luo_file instances for the new kernel, linking them to
+ * their corresponding handlers.
+ *
+ * 4. Retrieve (New Kernel - Userspace Ready): The userspace agent can now
+ * restore file descriptors by providing a token. luo_retrieve_file()
+ * searches for the matching token, calls the handler's .retrieve() op to
+ * re-create the 'struct file', and returns a new FD. Files can be
+ * retrieved in ANY order.
+ *
+ * 5. Finish (New Kernel - Cleanup): Once a session retrival is complete,
+ * luo_file_finish() is called. It iterates through all files, invokes their
+ * .finish() operations for final cleanup, and releases all associated kernel
+ * resources.
+ *
+ * File Preservation Lifecycle unhappy paths:
+ *
+ * 1. Abort Before Reboot: If the userspace agent aborts the live update
+ * process before calling reboot (e.g., by closing the session file
+ * descriptor), the session's release handler calls
+ * luo_file_unpreserve_files(). This invokes the .unpreserve() callback on
+ * all preserved files, ensuring all allocated resources are cleaned up and
+ * returning the system to a clean state.
+ *
+ * 2. Freeze Failure: During the reboot() syscall, if any handler's .freeze()
+ * op fails, the .unfreeze() op is invoked on all previously *successful*
+ * freezes to roll back their state. The reboot() syscall then returns an
+ * error to userspace, canceling the live update.
+ *
+ * 3. Finish Failure: In the new kernel, if a handler's .finish() op fails,
+ * the luo_file_finish() operation is aborted. LUO retains ownership of
+ * all files within that session, including those that were not yet
+ * processed. The userspace agent can attempt to call the finish operation
+ * again later. If the issue cannot be resolved, these resources will be held
+ * by LUO until the next live update cycle, at which point they will be
+ * discarded.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/cleanup.h>
+#include <linux/compiler.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/io.h>
+#include <linux/kexec_handover.h>
+#include <linux/kho/abi/luo.h>
+#include <linux/liveupdate.h>
+#include <linux/module.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include "luo_internal.h"
+
+static LIST_HEAD(luo_file_handler_list);
+
+/* 2 4K pages, give space for 128 files per file_set */
+#define LUO_FILE_PGCNT 2ul
+#define LUO_FILE_MAX \
+ ((LUO_FILE_PGCNT << PAGE_SHIFT) / sizeof(struct luo_file_ser))
+
+/**
+ * struct luo_file - Represents a single preserved file instance.
+ * @fh: Pointer to the &struct liveupdate_file_handler that manages
+ * this type of file.
+ * @file: Pointer to the kernel's &struct file that is being preserved.
+ * This is NULL in the new kernel until the file is successfully
+ * retrieved.
+ * @serialized_data: The opaque u64 handle to the serialized state of the file.
+ * This handle is passed back to the handler's .freeze(),
+ * .retrieve(), and .finish() callbacks, allowing it to track
+ * and update its serialized state across phases.
+ * @private_data: Pointer to the private data for the file used to hold runtime
+ * state that is not preserved. Set by the handler's .preserve()
+ * callback, and must be freed in the handler's .unpreserve()
+ * callback.
+ * @retrieved: A flag indicating whether a user/kernel in the new kernel has
+ * successfully called retrieve() on this file. This prevents
+ * multiple retrieval attempts.
+ * @mutex: A mutex that protects the fields of this specific instance
+ * (e.g., @retrieved, @file), ensuring that operations like
+ * retrieving or finishing a file are atomic.
+ * @list: The list_head linking this instance into its parent
+ * file_set's list of preserved files.
+ * @token: The user-provided unique token used to identify this file.
+ *
+ * This structure is the core in-kernel representation of a single file being
+ * managed through a live update. An instance is created by luo_preserve_file()
+ * to link a 'struct file' to its corresponding handler, a user-provided token,
+ * and the serialized state handle returned by the handler's .preserve()
+ * operation.
+ *
+ * These instances are tracked in a per-file_set list. The @serialized_data
+ * field, which holds a handle to the file's serialized state, may be updated
+ * during the .freeze() callback before being serialized for the next kernel.
+ * After reboot, these structures are recreated by luo_file_deserialize() and
+ * are finally cleaned up by luo_file_finish().
+ */
+struct luo_file {
+ struct liveupdate_file_handler *fh;
+ struct file *file;
+ u64 serialized_data;
+ void *private_data;
+ bool retrieved;
+ struct mutex mutex;
+ struct list_head list;
+ u64 token;
+};
+
+static int luo_alloc_files_mem(struct luo_file_set *file_set)
+{
+ size_t size;
+ void *mem;
+
+ if (file_set->files)
+ return 0;
+
+ WARN_ON_ONCE(file_set->count);
+
+ size = LUO_FILE_PGCNT << PAGE_SHIFT;
+ mem = kho_alloc_preserve(size);
+ if (IS_ERR(mem))
+ return PTR_ERR(mem);
+
+ file_set->files = mem;
+
+ return 0;
+}
+
+static void luo_free_files_mem(struct luo_file_set *file_set)
+{
+ /* If file_set has files, no need to free preservation memory */
+ if (file_set->count)
+ return;
+
+ if (!file_set->files)
+ return;
+
+ kho_unpreserve_free(file_set->files);
+ file_set->files = NULL;
+}
+
+static bool luo_token_is_used(struct luo_file_set *file_set, u64 token)
+{
+ struct luo_file *iter;
+
+ list_for_each_entry(iter, &file_set->files_list, list) {
+ if (iter->token == token)
+ return true;
+ }
+
+ return false;
+}
+
+/**
+ * luo_preserve_file - Initiate the preservation of a file descriptor.
+ * @file_set: The file_set to which the preserved file will be added.
+ * @token: A unique, user-provided identifier for the file.
+ * @fd: The file descriptor to be preserved.
+ *
+ * This function orchestrates the first phase of preserving a file. Upon entry,
+ * it takes a reference to the 'struct file' via fget(), effectively making LUO
+ * a co-owner of the file. This reference is held until the file is either
+ * unpreserved or successfully finished in the next kernel, preventing the file
+ * from being prematurely destroyed.
+ *
+ * This function orchestrates the first phase of preserving a file. It performs
+ * the following steps:
+ *
+ * 1. Validates that the @token is not already in use within the file_set.
+ * 2. Ensures the file_set's memory for files serialization is allocated
+ * (allocates if needed).
+ * 3. Iterates through registered handlers, calling can_preserve() to find one
+ * compatible with the given @fd.
+ * 4. Calls the handler's .preserve() operation, which saves the file's state
+ * and returns an opaque private data handle.
+ * 5. Adds the new instance to the file_set's internal list.
+ *
+ * On success, LUO takes a reference to the 'struct file' and considers it
+ * under its management until it is unpreserved or finished.
+ *
+ * In case of any failure, all intermediate allocations (file reference, memory
+ * for the 'luo_file' struct, etc.) are cleaned up before returning an error.
+ *
+ * Context: Can be called from an ioctl handler during normal system operation.
+ * Return: 0 on success. Returns a negative errno on failure:
+ * -EEXIST if the token is already used.
+ * -EBADF if the file descriptor is invalid.
+ * -ENOSPC if the file_set is full.
+ * -ENOENT if no compatible handler is found.
+ * -ENOMEM on memory allocation failure.
+ * Other erros might be returned by .preserve().
+ */
+int luo_preserve_file(struct luo_file_set *file_set, u64 token, int fd)
+{
+ struct liveupdate_file_op_args args = {0};
+ struct liveupdate_file_handler *fh;
+ struct luo_file *luo_file;
+ struct file *file;
+ int err;
+
+ if (luo_token_is_used(file_set, token))
+ return -EEXIST;
+
+ if (file_set->count == LUO_FILE_MAX)
+ return -ENOSPC;
+
+ file = fget(fd);
+ if (!file)
+ return -EBADF;
+
+ err = luo_alloc_files_mem(file_set);
+ if (err)
+ goto err_fput;
+
+ err = -ENOENT;
+ luo_list_for_each_private(fh, &luo_file_handler_list, list) {
+ if (fh->ops->can_preserve(fh, file)) {
+ err = 0;
+ break;
+ }
+ }
+
+ /* err is still -ENOENT if no handler was found */
+ if (err)
+ goto err_free_files_mem;
+
+ luo_file = kzalloc(sizeof(*luo_file), GFP_KERNEL);
+ if (!luo_file) {
+ err = -ENOMEM;
+ goto err_free_files_mem;
+ }
+
+ luo_file->file = file;
+ luo_file->fh = fh;
+ luo_file->token = token;
+ luo_file->retrieved = false;
+ mutex_init(&luo_file->mutex);
+
+ args.handler = fh;
+ args.file = file;
+ err = fh->ops->preserve(&args);
+ if (err)
+ goto err_kfree;
+
+ luo_file->serialized_data = args.serialized_data;
+ luo_file->private_data = args.private_data;
+ list_add_tail(&luo_file->list, &file_set->files_list);
+ file_set->count++;
+
+ return 0;
+
+err_kfree:
+ kfree(luo_file);
+err_free_files_mem:
+ luo_free_files_mem(file_set);
+err_fput:
+ fput(file);
+
+ return err;
+}
+
+/**
+ * luo_file_unpreserve_files - Unpreserves all files from a file_set.
+ * @file_set: The files to be cleaned up.
+ *
+ * This function serves as the primary cleanup path for a file_set. It is
+ * invoked when the userspace agent closes the file_set's file descriptor.
+ *
+ * For each file, it performs the following cleanup actions:
+ * 1. Calls the handler's .unpreserve() callback to allow the handler to
+ * release any resources it allocated.
+ * 2. Removes the file from the file_set's internal tracking list.
+ * 3. Releases the reference to the 'struct file' that was taken by
+ * luo_preserve_file() via fput(), returning ownership.
+ * 4. Frees the memory associated with the internal 'struct luo_file'.
+ *
+ * After all individual files are unpreserved, it frees the contiguous memory
+ * block that was allocated to hold their serialization data.
+ */
+void luo_file_unpreserve_files(struct luo_file_set *file_set)
+{
+ struct luo_file *luo_file;
+
+ while (!list_empty(&file_set->files_list)) {
+ struct liveupdate_file_op_args args = {0};
+
+ luo_file = list_last_entry(&file_set->files_list,
+ struct luo_file, list);
+
+ args.handler = luo_file->fh;
+ args.file = luo_file->file;
+ args.serialized_data = luo_file->serialized_data;
+ args.private_data = luo_file->private_data;
+ luo_file->fh->ops->unpreserve(&args);
+
+ list_del(&luo_file->list);
+ file_set->count--;
+
+ fput(luo_file->file);
+ mutex_destroy(&luo_file->mutex);
+ kfree(luo_file);
+ }
+
+ luo_free_files_mem(file_set);
+}
+
+static int luo_file_freeze_one(struct luo_file_set *file_set,
+ struct luo_file *luo_file)
+{
+ int err = 0;
+
+ guard(mutex)(&luo_file->mutex);
+
+ if (luo_file->fh->ops->freeze) {
+ struct liveupdate_file_op_args args = {0};
+
+ args.handler = luo_file->fh;
+ args.file = luo_file->file;
+ args.serialized_data = luo_file->serialized_data;
+ args.private_data = luo_file->private_data;
+
+ err = luo_file->fh->ops->freeze(&args);
+ if (!err)
+ luo_file->serialized_data = args.serialized_data;
+ }
+
+ return err;
+}
+
+static void luo_file_unfreeze_one(struct luo_file_set *file_set,
+ struct luo_file *luo_file)
+{
+ guard(mutex)(&luo_file->mutex);
+
+ if (luo_file->fh->ops->unfreeze) {
+ struct liveupdate_file_op_args args = {0};
+
+ args.handler = luo_file->fh;
+ args.file = luo_file->file;
+ args.serialized_data = luo_file->serialized_data;
+ args.private_data = luo_file->private_data;
+
+ luo_file->fh->ops->unfreeze(&args);
+ }
+
+ luo_file->serialized_data = 0;
+}
+
+static void __luo_file_unfreeze(struct luo_file_set *file_set,
+ struct luo_file *failed_entry)
+{
+ struct list_head *files_list = &file_set->files_list;
+ struct luo_file *luo_file;
+
+ list_for_each_entry(luo_file, files_list, list) {
+ if (luo_file == failed_entry)
+ break;
+
+ luo_file_unfreeze_one(file_set, luo_file);
+ }
+
+ memset(file_set->files, 0, LUO_FILE_PGCNT << PAGE_SHIFT);
+}
+
+/**
+ * luo_file_freeze - Freezes all preserved files and serializes their metadata.
+ * @file_set: The file_set whose files are to be frozen.
+ * @file_set_ser: Where to put the serialized file_set.
+ *
+ * This function is called from the reboot() syscall path, just before the
+ * kernel transitions to the new image via kexec. Its purpose is to perform the
+ * final preparation and serialization of all preserved files in the file_set.
+ *
+ * It iterates through each preserved file in FIFO order (the order of
+ * preservation) and performs two main actions:
+ *
+ * 1. Freezes the File: It calls the handler's .freeze() callback for each
+ * file. This gives the handler a final opportunity to quiesce the device or
+ * prepare its state for the upcoming reboot. The handler may update its
+ * private data handle during this step.
+ *
+ * 2. Serializes Metadata: After a successful freeze, it copies the final file
+ * metadata—the handler's compatible string, the user token, and the final
+ * private data handle—into the pre-allocated contiguous memory buffer
+ * (file_set->files) that will be handed over to the next kernel via KHO.
+ *
+ * Error Handling (Rollback):
+ * This function is atomic. If any handler's .freeze() operation fails, the
+ * entire live update is aborted. The __luo_file_unfreeze() helper is
+ * immediately called to invoke the .unfreeze() op on all files that were
+ * successfully frozen before the point of failure, rolling them back to a
+ * running state. The function then returns an error, causing the reboot()
+ * syscall to fail.
+ *
+ * Context: Called only from the liveupdate_reboot() path.
+ * Return: 0 on success, or a negative errno on failure.
+ */
+int luo_file_freeze(struct luo_file_set *file_set,
+ struct luo_file_set_ser *file_set_ser)
+{
+ struct luo_file_ser *file_ser = file_set->files;
+ struct luo_file *luo_file;
+ int err;
+ int i;
+
+ if (!file_set->count)
+ return 0;
+
+ if (WARN_ON(!file_ser))
+ return -EINVAL;
+
+ i = 0;
+ list_for_each_entry(luo_file, &file_set->files_list, list) {
+ err = luo_file_freeze_one(file_set, luo_file);
+ if (err < 0) {
+ pr_warn("Freeze failed for token[%#0llx] handler[%s] err[%pe]\n",
+ luo_file->token, luo_file->fh->compatible,
+ ERR_PTR(err));
+ goto err_unfreeze;
+ }
+
+ strscpy(file_ser[i].compatible, luo_file->fh->compatible,
+ sizeof(file_ser[i].compatible));
+ file_ser[i].data = luo_file->serialized_data;
+ file_ser[i].token = luo_file->token;
+ i++;
+ }
+
+ file_set_ser->count = file_set->count;
+ if (file_set->files)
+ file_set_ser->files = virt_to_phys(file_set->files);
+
+ return 0;
+
+err_unfreeze:
+ __luo_file_unfreeze(file_set, luo_file);
+
+ return err;
+}
+
+/**
+ * luo_file_unfreeze - Unfreezes all files in a file_set and clear serialization
+ * @file_set: The file_set whose files are to be unfrozen.
+ * @file_set_ser: Serialized file_set.
+ *
+ * This function rolls back the state of all files in a file_set after the
+ * freeze phase has begun but must be aborted. It is the counterpart to
+ * luo_file_freeze().
+ *
+ * It invokes the __luo_file_unfreeze() helper with a NULL argument, which
+ * signals the helper to iterate through all files in the file_set and call
+ * their respective .unfreeze() handler callbacks.
+ *
+ * Context: This is called when the live update is aborted during
+ * the reboot() syscall, after luo_file_freeze() has been called.
+ */
+void luo_file_unfreeze(struct luo_file_set *file_set,
+ struct luo_file_set_ser *file_set_ser)
+{
+ if (!file_set->count)
+ return;
+
+ __luo_file_unfreeze(file_set, NULL);
+ memset(file_set_ser, 0, sizeof(*file_set_ser));
+}
+
+/**
+ * luo_retrieve_file - Restores a preserved file from a file_set by its token.
+ * @file_set: The file_set from which to retrieve the file.
+ * @token: The unique token identifying the file to be restored.
+ * @filep: Output parameter; on success, this is populated with a pointer
+ * to the newly retrieved 'struct file'.
+ *
+ * This function is the primary mechanism for recreating a file in the new
+ * kernel after a live update. It searches the file_set's list of deserialized
+ * files for an entry matching the provided @token.
+ *
+ * The operation is idempotent: if a file has already been successfully
+ * retrieved, this function will simply return a pointer to the existing
+ * 'struct file' and report success without re-executing the retrieve
+ * operation. This is handled by checking the 'retrieved' flag under a lock.
+ *
+ * File retrieval can happen in any order; it is not bound by the order of
+ * preservation.
+ *
+ * Context: Can be called from an ioctl or other in-kernel code in the new
+ * kernel.
+ * Return: 0 on success. Returns a negative errno on failure:
+ * -ENOENT if no file with the matching token is found.
+ * Any error code returned by the handler's .retrieve() op.
+ */
+int luo_retrieve_file(struct luo_file_set *file_set, u64 token,
+ struct file **filep)
+{
+ struct liveupdate_file_op_args args = {0};
+ struct luo_file *luo_file;
+ int err;
+
+ if (list_empty(&file_set->files_list))
+ return -ENOENT;
+
+ list_for_each_entry(luo_file, &file_set->files_list, list) {
+ if (luo_file->token == token)
+ break;
+ }
+
+ if (luo_file->token != token)
+ return -ENOENT;
+
+ guard(mutex)(&luo_file->mutex);
+ if (luo_file->retrieved) {
+ /*
+ * Someone is asking for this file again, so get a reference
+ * for them.
+ */
+ get_file(luo_file->file);
+ *filep = luo_file->file;
+ return 0;
+ }
+
+ args.handler = luo_file->fh;
+ args.serialized_data = luo_file->serialized_data;
+ err = luo_file->fh->ops->retrieve(&args);
+ if (!err) {
+ luo_file->file = args.file;
+
+ /* Get reference so we can keep this file in LUO until finish */
+ get_file(luo_file->file);
+ *filep = luo_file->file;
+ luo_file->retrieved = true;
+ }
+
+ return err;
+}
+
+static int luo_file_can_finish_one(struct luo_file_set *file_set,
+ struct luo_file *luo_file)
+{
+ bool can_finish = true;
+
+ guard(mutex)(&luo_file->mutex);
+
+ if (luo_file->fh->ops->can_finish) {
+ struct liveupdate_file_op_args args = {0};
+
+ args.handler = luo_file->fh;
+ args.file = luo_file->file;
+ args.serialized_data = luo_file->serialized_data;
+ args.retrieved = luo_file->retrieved;
+ can_finish = luo_file->fh->ops->can_finish(&args);
+ }
+
+ return can_finish ? 0 : -EBUSY;
+}
+
+static void luo_file_finish_one(struct luo_file_set *file_set,
+ struct luo_file *luo_file)
+{
+ struct liveupdate_file_op_args args = {0};
+
+ guard(mutex)(&luo_file->mutex);
+
+ args.handler = luo_file->fh;
+ args.file = luo_file->file;
+ args.serialized_data = luo_file->serialized_data;
+ args.retrieved = luo_file->retrieved;
+
+ luo_file->fh->ops->finish(&args);
+}
+
+/**
+ * luo_file_finish - Completes the lifecycle for all files in a file_set.
+ * @file_set: The file_set to be finalized.
+ *
+ * This function orchestrates the final teardown of a live update file_set in
+ * the new kernel. It should be called after all necessary files have been
+ * retrieved and the userspace agent is ready to release the preserved state.
+ *
+ * The function iterates through all tracked files. For each file, it performs
+ * the following sequence of cleanup actions:
+ *
+ * 1. If file is not yet retrieved, retrieves it, and calls can_finish() on
+ * every file in the file_set. If all can_finish return true, continue to
+ * finish.
+ * 2. Calls the handler's .finish() callback (via luo_file_finish_one) to
+ * allow for final resource cleanup within the handler.
+ * 3. Releases LUO's ownership reference on the 'struct file' via fput(). This
+ * is the counterpart to the get_file() call in luo_retrieve_file().
+ * 4. Removes the 'struct luo_file' from the file_set's internal list.
+ * 5. Frees the memory for the 'struct luo_file' instance itself.
+ *
+ * After successfully finishing all individual files, it frees the
+ * contiguous memory block that was used to transfer the serialized metadata
+ * from the previous kernel.
+ *
+ * Error Handling (Atomic Failure):
+ * This operation is atomic. If any handler's .can_finish() op fails, the entire
+ * function aborts immediately and returns an error.
+ *
+ * Context: Can be called from an ioctl handler in the new kernel.
+ * Return: 0 on success, or a negative errno on failure.
+ */
+int luo_file_finish(struct luo_file_set *file_set)
+{
+ struct list_head *files_list = &file_set->files_list;
+ struct luo_file *luo_file;
+ int err;
+
+ if (!file_set->count)
+ return 0;
+
+ list_for_each_entry(luo_file, files_list, list) {
+ err = luo_file_can_finish_one(file_set, luo_file);
+ if (err)
+ return err;
+ }
+
+ while (!list_empty(&file_set->files_list)) {
+ luo_file = list_last_entry(&file_set->files_list,
+ struct luo_file, list);
+
+ luo_file_finish_one(file_set, luo_file);
+
+ if (luo_file->file)
+ fput(luo_file->file);
+ list_del(&luo_file->list);
+ file_set->count--;
+ mutex_destroy(&luo_file->mutex);
+ kfree(luo_file);
+ }
+
+ if (file_set->files) {
+ kho_restore_free(file_set->files);
+ file_set->files = NULL;
+ }
+
+ return 0;
+}
+
+/**
+ * luo_file_deserialize - Reconstructs the list of preserved files in the new kernel.
+ * @file_set: The incoming file_set to fill with deserialized data.
+ * @file_set_ser: Serialized KHO file_set data from the previous kernel.
+ *
+ * This function is called during the early boot process of the new kernel. It
+ * takes the raw, contiguous memory block of 'struct luo_file_ser' entries,
+ * provided by the previous kernel, and transforms it back into a live,
+ * in-memory linked list of 'struct luo_file' instances.
+ *
+ * For each serialized entry, it performs the following steps:
+ * 1. Reads the 'compatible' string.
+ * 2. Searches the global list of registered file handlers for one that
+ * matches the compatible string.
+ * 3. Allocates a new 'struct luo_file'.
+ * 4. Populates the new structure with the deserialized data (token, private
+ * data handle) and links it to the found handler. The 'file' pointer is
+ * initialized to NULL, as the file has not been retrieved yet.
+ * 5. Adds the new 'struct luo_file' to the file_set's files_list.
+ *
+ * This prepares the file_set for userspace, which can later call
+ * luo_retrieve_file() to restore the actual file descriptors.
+ *
+ * Context: Called from session deserialization.
+ */
+int luo_file_deserialize(struct luo_file_set *file_set,
+ struct luo_file_set_ser *file_set_ser)
+{
+ struct luo_file_ser *file_ser;
+ u64 i;
+
+ if (!file_set_ser->files) {
+ WARN_ON(file_set_ser->count);
+ return 0;
+ }
+
+ file_set->count = file_set_ser->count;
+ file_set->files = phys_to_virt(file_set_ser->files);
+
+ /*
+ * Note on error handling:
+ *
+ * If deserialization fails (e.g., allocation failure or corrupt data),
+ * we intentionally skip cleanup of files that were already restored.
+ *
+ * A partial failure leaves the preserved state inconsistent.
+ * Implementing a safe "undo" to unwind complex dependencies (sessions,
+ * files, hardware state) is error-prone and provides little value, as
+ * the system is effectively in a broken state.
+ *
+ * We treat these resources as leaked. The expected recovery path is for
+ * userspace to detect the failure and trigger a reboot, which will
+ * reliably reset devices and reclaim memory.
+ */
+ file_ser = file_set->files;
+ for (i = 0; i < file_set->count; i++) {
+ struct liveupdate_file_handler *fh;
+ bool handler_found = false;
+ struct luo_file *luo_file;
+
+ luo_list_for_each_private(fh, &luo_file_handler_list, list) {
+ if (!strcmp(fh->compatible, file_ser[i].compatible)) {
+ handler_found = true;
+ break;
+ }
+ }
+
+ if (!handler_found) {
+ pr_warn("No registered handler for compatible '%s'\n",
+ file_ser[i].compatible);
+ return -ENOENT;
+ }
+
+ luo_file = kzalloc(sizeof(*luo_file), GFP_KERNEL);
+ if (!luo_file)
+ return -ENOMEM;
+
+ luo_file->fh = fh;
+ luo_file->file = NULL;
+ luo_file->serialized_data = file_ser[i].data;
+ luo_file->token = file_ser[i].token;
+ luo_file->retrieved = false;
+ mutex_init(&luo_file->mutex);
+ list_add_tail(&luo_file->list, &file_set->files_list);
+ }
+
+ return 0;
+}
+
+void luo_file_set_init(struct luo_file_set *file_set)
+{
+ INIT_LIST_HEAD(&file_set->files_list);
+}
+
+void luo_file_set_destroy(struct luo_file_set *file_set)
+{
+ WARN_ON(file_set->count);
+ WARN_ON(!list_empty(&file_set->files_list));
+}
+
+/**
+ * liveupdate_register_file_handler - Register a file handler with LUO.
+ * @fh: Pointer to a caller-allocated &struct liveupdate_file_handler.
+ * The caller must initialize this structure, including a unique
+ * 'compatible' string and a valid 'fh' callbacks. This function adds the
+ * handler to the global list of supported file handlers.
+ *
+ * Context: Typically called during module initialization for file types that
+ * support live update preservation.
+ *
+ * Return: 0 on success. Negative errno on failure.
+ */
+int liveupdate_register_file_handler(struct liveupdate_file_handler *fh)
+{
+ struct liveupdate_file_handler *fh_iter;
+ int err;
+
+ if (!liveupdate_enabled())
+ return -EOPNOTSUPP;
+
+ /* Sanity check that all required callbacks are set */
+ if (!fh->ops->preserve || !fh->ops->unpreserve || !fh->ops->retrieve ||
+ !fh->ops->finish || !fh->ops->can_preserve) {
+ return -EINVAL;
+ }
+
+ /*
+ * Ensure the system is quiescent (no active sessions).
+ * This prevents registering new handlers while sessions are active or
+ * while deserialization is in progress.
+ */
+ if (!luo_session_quiesce())
+ return -EBUSY;
+
+ /* Check for duplicate compatible strings */
+ luo_list_for_each_private(fh_iter, &luo_file_handler_list, list) {
+ if (!strcmp(fh_iter->compatible, fh->compatible)) {
+ pr_err("File handler registration failed: Compatible string '%s' already registered.\n",
+ fh->compatible);
+ err = -EEXIST;
+ goto err_resume;
+ }
+ }
+
+ /* Pin the module implementing the handler */
+ if (!try_module_get(fh->ops->owner)) {
+ err = -EAGAIN;
+ goto err_resume;
+ }
+
+ INIT_LIST_HEAD(&ACCESS_PRIVATE(fh, list));
+ list_add_tail(&ACCESS_PRIVATE(fh, list), &luo_file_handler_list);
+ luo_session_resume();
+
+ return 0;
+
+err_resume:
+ luo_session_resume();
+ return err;
+}
+
+/**
+ * liveupdate_unregister_file_handler - Unregister a liveupdate file handler
+ * @fh: The file handler to unregister
+ *
+ * Unregisters the file handler from the liveupdate core. This function
+ * reverses the operations of liveupdate_register_file_handler().
+ *
+ * It ensures safe removal by checking that:
+ * No live update session is currently in progress.
+ *
+ * If the unregistration fails, the internal test state is reverted.
+ *
+ * Return: 0 Success. -EOPNOTSUPP when live update is not enabled. -EBUSY A live
+ * update is in progress, can't quiesce live update.
+ */
+int liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh)
+{
+ if (!liveupdate_enabled())
+ return -EOPNOTSUPP;
+
+ if (!luo_session_quiesce())
+ return -EBUSY;
+
+ list_del(&ACCESS_PRIVATE(fh, list));
+ module_put(fh->ops->owner);
+ luo_session_resume();
+
+ return 0;
+}
diff --git a/kernel/liveupdate/luo_internal.h b/kernel/liveupdate/luo_internal.h
new file mode 100644
index 000000000000..c8973b543d1d
--- /dev/null
+++ b/kernel/liveupdate/luo_internal.h
@@ -0,0 +1,110 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Copyright (c) 2025, Google LLC.
+ * Pasha Tatashin <pasha.tatashin@soleen.com>
+ */
+
+#ifndef _LINUX_LUO_INTERNAL_H
+#define _LINUX_LUO_INTERNAL_H
+
+#include <linux/liveupdate.h>
+#include <linux/uaccess.h>
+
+struct luo_ucmd {
+ void __user *ubuffer;
+ u32 user_size;
+ void *cmd;
+};
+
+static inline int luo_ucmd_respond(struct luo_ucmd *ucmd,
+ size_t kernel_cmd_size)
+{
+ /*
+ * Copy the minimum of what the user provided and what we actually
+ * have.
+ */
+ if (copy_to_user(ucmd->ubuffer, ucmd->cmd,
+ min_t(size_t, ucmd->user_size, kernel_cmd_size))) {
+ return -EFAULT;
+ }
+ return 0;
+}
+
+/*
+ * Handles a deserialization failure: devices and memory is in unpredictable
+ * state.
+ *
+ * Continuing the boot process after a failure is dangerous because it could
+ * lead to leaks of private data.
+ */
+#define luo_restore_fail(__fmt, ...) panic(__fmt, ##__VA_ARGS__)
+
+/* Mimics list_for_each_entry() but for private list head entries */
+#define luo_list_for_each_private(pos, head, member) \
+ for (struct list_head *__iter = (head)->next; \
+ __iter != (head) && \
+ ({ pos = container_of(__iter, typeof(*(pos)), member); 1; }); \
+ __iter = __iter->next)
+
+/**
+ * struct luo_file_set - A set of files that belong to the same sessions.
+ * @files_list: An ordered list of files associated with this session, it is
+ * ordered by preservation time.
+ * @files: The physically contiguous memory block that holds the serialized
+ * state of files.
+ * @count: A counter tracking the number of files currently stored in the
+ * @files_list for this session.
+ */
+struct luo_file_set {
+ struct list_head files_list;
+ struct luo_file_ser *files;
+ long count;
+};
+
+/**
+ * struct luo_session - Represents an active or incoming Live Update session.
+ * @name: A unique name for this session, used for identification and
+ * retrieval.
+ * @ser: Pointer to the serialized data for this session.
+ * @list: A list_head member used to link this session into a global list
+ * of either outgoing (to be preserved) or incoming (restored from
+ * previous kernel) sessions.
+ * @retrieved: A boolean flag indicating whether this session has been
+ * retrieved by a consumer in the new kernel.
+ * @file_set: A set of files that belong to this session.
+ * @mutex: protects fields in the luo_session.
+ */
+struct luo_session {
+ char name[LIVEUPDATE_SESSION_NAME_LENGTH];
+ struct luo_session_ser *ser;
+ struct list_head list;
+ bool retrieved;
+ struct luo_file_set file_set;
+ struct mutex mutex;
+};
+
+int luo_session_create(const char *name, struct file **filep);
+int luo_session_retrieve(const char *name, struct file **filep);
+int __init luo_session_setup_outgoing(void *fdt);
+int __init luo_session_setup_incoming(void *fdt);
+int luo_session_serialize(void);
+int luo_session_deserialize(void);
+bool luo_session_quiesce(void);
+void luo_session_resume(void);
+
+int luo_preserve_file(struct luo_file_set *file_set, u64 token, int fd);
+void luo_file_unpreserve_files(struct luo_file_set *file_set);
+int luo_file_freeze(struct luo_file_set *file_set,
+ struct luo_file_set_ser *file_set_ser);
+void luo_file_unfreeze(struct luo_file_set *file_set,
+ struct luo_file_set_ser *file_set_ser);
+int luo_retrieve_file(struct luo_file_set *file_set, u64 token,
+ struct file **filep);
+int luo_file_finish(struct luo_file_set *file_set);
+int luo_file_deserialize(struct luo_file_set *file_set,
+ struct luo_file_set_ser *file_set_ser);
+void luo_file_set_init(struct luo_file_set *file_set);
+void luo_file_set_destroy(struct luo_file_set *file_set);
+
+#endif /* _LINUX_LUO_INTERNAL_H */
diff --git a/kernel/liveupdate/luo_session.c b/kernel/liveupdate/luo_session.c
new file mode 100644
index 000000000000..dbdbc3bd7929
--- /dev/null
+++ b/kernel/liveupdate/luo_session.c
@@ -0,0 +1,646 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (c) 2025, Google LLC.
+ * Pasha Tatashin <pasha.tatashin@soleen.com>
+ */
+
+/**
+ * DOC: LUO Sessions
+ *
+ * LUO Sessions provide the core mechanism for grouping and managing `struct
+ * file *` instances that need to be preserved across a kexec-based live
+ * update. Each session acts as a named container for a set of file objects,
+ * allowing a userspace agent to manage the lifecycle of resources critical to a
+ * workload.
+ *
+ * Core Concepts:
+ *
+ * - Named Containers: Sessions are identified by a unique, user-provided name,
+ * which is used for both creation in the current kernel and retrieval in the
+ * next kernel.
+ *
+ * - Userspace Interface: Session management is driven from userspace via
+ * ioctls on /dev/liveupdate.
+ *
+ * - Serialization: Session metadata is preserved using the KHO framework. When
+ * a live update is triggered via kexec, an array of `struct luo_session_ser`
+ * is populated and placed in a preserved memory region. An FDT node is also
+ * created, containing the count of sessions and the physical address of this
+ * array.
+ *
+ * Session Lifecycle:
+ *
+ * 1. Creation: A userspace agent calls `luo_session_create()` to create a
+ * new, empty session and receives a file descriptor for it.
+ *
+ * 2. Serialization: When the `reboot(LINUX_REBOOT_CMD_KEXEC)` syscall is
+ * made, `luo_session_serialize()` is called. It iterates through all
+ * active sessions and writes their metadata into a memory area preserved
+ * by KHO.
+ *
+ * 3. Deserialization (in new kernel): After kexec, `luo_session_deserialize()`
+ * runs, reading the serialized data and creating a list of `struct
+ * luo_session` objects representing the preserved sessions.
+ *
+ * 4. Retrieval: A userspace agent in the new kernel can then call
+ * `luo_session_retrieve()` with a session name to get a new file
+ * descriptor and access the preserved state.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/anon_inodes.h>
+#include <linux/cleanup.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/io.h>
+#include <linux/kexec_handover.h>
+#include <linux/kho/abi/luo.h>
+#include <linux/libfdt.h>
+#include <linux/list.h>
+#include <linux/liveupdate.h>
+#include <linux/mutex.h>
+#include <linux/rwsem.h>
+#include <linux/slab.h>
+#include <linux/unaligned.h>
+#include <uapi/linux/liveupdate.h>
+#include "luo_internal.h"
+
+/* 16 4K pages, give space for 744 sessions */
+#define LUO_SESSION_PGCNT 16ul
+#define LUO_SESSION_MAX (((LUO_SESSION_PGCNT << PAGE_SHIFT) - \
+ sizeof(struct luo_session_header_ser)) / \
+ sizeof(struct luo_session_ser))
+
+/**
+ * struct luo_session_header - Header struct for managing LUO sessions.
+ * @count: The number of sessions currently tracked in the @list.
+ * @list: The head of the linked list of `struct luo_session` instances.
+ * @rwsem: A read-write semaphore providing synchronized access to the
+ * session list and other fields in this structure.
+ * @header_ser: The header data of serialization array.
+ * @ser: The serialized session data (an array of
+ * `struct luo_session_ser`).
+ * @active: Set to true when first initialized. If previous kernel did not
+ * send session data, active stays false for incoming.
+ */
+struct luo_session_header {
+ long count;
+ struct list_head list;
+ struct rw_semaphore rwsem;
+ struct luo_session_header_ser *header_ser;
+ struct luo_session_ser *ser;
+ bool active;
+};
+
+/**
+ * struct luo_session_global - Global container for managing LUO sessions.
+ * @incoming: The sessions passed from the previous kernel.
+ * @outgoing: The sessions that are going to be passed to the next kernel.
+ */
+struct luo_session_global {
+ struct luo_session_header incoming;
+ struct luo_session_header outgoing;
+};
+
+static struct luo_session_global luo_session_global = {
+ .incoming = {
+ .list = LIST_HEAD_INIT(luo_session_global.incoming.list),
+ .rwsem = __RWSEM_INITIALIZER(luo_session_global.incoming.rwsem),
+ },
+ .outgoing = {
+ .list = LIST_HEAD_INIT(luo_session_global.outgoing.list),
+ .rwsem = __RWSEM_INITIALIZER(luo_session_global.outgoing.rwsem),
+ },
+};
+
+static struct luo_session *luo_session_alloc(const char *name)
+{
+ struct luo_session *session = kzalloc(sizeof(*session), GFP_KERNEL);
+
+ if (!session)
+ return ERR_PTR(-ENOMEM);
+
+ strscpy(session->name, name, sizeof(session->name));
+ INIT_LIST_HEAD(&session->file_set.files_list);
+ luo_file_set_init(&session->file_set);
+ INIT_LIST_HEAD(&session->list);
+ mutex_init(&session->mutex);
+
+ return session;
+}
+
+static void luo_session_free(struct luo_session *session)
+{
+ luo_file_set_destroy(&session->file_set);
+ mutex_destroy(&session->mutex);
+ kfree(session);
+}
+
+static int luo_session_insert(struct luo_session_header *sh,
+ struct luo_session *session)
+{
+ struct luo_session *it;
+
+ guard(rwsem_write)(&sh->rwsem);
+
+ /*
+ * For outgoing we should make sure there is room in serialization array
+ * for new session.
+ */
+ if (sh == &luo_session_global.outgoing) {
+ if (sh->count == LUO_SESSION_MAX)
+ return -ENOMEM;
+ }
+
+ /*
+ * For small number of sessions this loop won't hurt performance
+ * but if we ever start using a lot of sessions, this might
+ * become a bottle neck during deserialization time, as it would
+ * cause O(n*n) complexity.
+ */
+ list_for_each_entry(it, &sh->list, list) {
+ if (!strncmp(it->name, session->name, sizeof(it->name)))
+ return -EEXIST;
+ }
+ list_add_tail(&session->list, &sh->list);
+ sh->count++;
+
+ return 0;
+}
+
+static void luo_session_remove(struct luo_session_header *sh,
+ struct luo_session *session)
+{
+ guard(rwsem_write)(&sh->rwsem);
+ list_del(&session->list);
+ sh->count--;
+}
+
+static int luo_session_finish_one(struct luo_session *session)
+{
+ guard(mutex)(&session->mutex);
+ return luo_file_finish(&session->file_set);
+}
+
+static void luo_session_unfreeze_one(struct luo_session *session,
+ struct luo_session_ser *ser)
+{
+ guard(mutex)(&session->mutex);
+ luo_file_unfreeze(&session->file_set, &ser->file_set_ser);
+}
+
+static int luo_session_freeze_one(struct luo_session *session,
+ struct luo_session_ser *ser)
+{
+ guard(mutex)(&session->mutex);
+ return luo_file_freeze(&session->file_set, &ser->file_set_ser);
+}
+
+static int luo_session_release(struct inode *inodep, struct file *filep)
+{
+ struct luo_session *session = filep->private_data;
+ struct luo_session_header *sh;
+
+ /* If retrieved is set, it means this session is from incoming list */
+ if (session->retrieved) {
+ int err = luo_session_finish_one(session);
+
+ if (err) {
+ pr_warn("Unable to finish session [%s] on release\n",
+ session->name);
+ return err;
+ }
+ sh = &luo_session_global.incoming;
+ } else {
+ scoped_guard(mutex, &session->mutex)
+ luo_file_unpreserve_files(&session->file_set);
+ sh = &luo_session_global.outgoing;
+ }
+
+ luo_session_remove(sh, session);
+ luo_session_free(session);
+
+ return 0;
+}
+
+static int luo_session_preserve_fd(struct luo_session *session,
+ struct luo_ucmd *ucmd)
+{
+ struct liveupdate_session_preserve_fd *argp = ucmd->cmd;
+ int err;
+
+ guard(mutex)(&session->mutex);
+ err = luo_preserve_file(&session->file_set, argp->token, argp->fd);
+ if (err)
+ return err;
+
+ err = luo_ucmd_respond(ucmd, sizeof(*argp));
+ if (err)
+ pr_warn("The file was successfully preserved, but response to user failed\n");
+
+ return err;
+}
+
+static int luo_session_retrieve_fd(struct luo_session *session,
+ struct luo_ucmd *ucmd)
+{
+ struct liveupdate_session_retrieve_fd *argp = ucmd->cmd;
+ struct file *file;
+ int err;
+
+ argp->fd = get_unused_fd_flags(O_CLOEXEC);
+ if (argp->fd < 0)
+ return argp->fd;
+
+ guard(mutex)(&session->mutex);
+ err = luo_retrieve_file(&session->file_set, argp->token, &file);
+ if (err < 0)
+ goto err_put_fd;
+
+ err = luo_ucmd_respond(ucmd, sizeof(*argp));
+ if (err)
+ goto err_put_file;
+
+ fd_install(argp->fd, file);
+
+ return 0;
+
+err_put_file:
+ fput(file);
+err_put_fd:
+ put_unused_fd(argp->fd);
+
+ return err;
+}
+
+static int luo_session_finish(struct luo_session *session,
+ struct luo_ucmd *ucmd)
+{
+ struct liveupdate_session_finish *argp = ucmd->cmd;
+ int err = luo_session_finish_one(session);
+
+ if (err)
+ return err;
+
+ return luo_ucmd_respond(ucmd, sizeof(*argp));
+}
+
+union ucmd_buffer {
+ struct liveupdate_session_finish finish;
+ struct liveupdate_session_preserve_fd preserve;
+ struct liveupdate_session_retrieve_fd retrieve;
+};
+
+struct luo_ioctl_op {
+ unsigned int size;
+ unsigned int min_size;
+ unsigned int ioctl_num;
+ int (*execute)(struct luo_session *session, struct luo_ucmd *ucmd);
+};
+
+#define IOCTL_OP(_ioctl, _fn, _struct, _last) \
+ [_IOC_NR(_ioctl) - LIVEUPDATE_CMD_SESSION_BASE] = { \
+ .size = sizeof(_struct) + \
+ BUILD_BUG_ON_ZERO(sizeof(union ucmd_buffer) < \
+ sizeof(_struct)), \
+ .min_size = offsetofend(_struct, _last), \
+ .ioctl_num = _ioctl, \
+ .execute = _fn, \
+ }
+
+static const struct luo_ioctl_op luo_session_ioctl_ops[] = {
+ IOCTL_OP(LIVEUPDATE_SESSION_FINISH, luo_session_finish,
+ struct liveupdate_session_finish, reserved),
+ IOCTL_OP(LIVEUPDATE_SESSION_PRESERVE_FD, luo_session_preserve_fd,
+ struct liveupdate_session_preserve_fd, token),
+ IOCTL_OP(LIVEUPDATE_SESSION_RETRIEVE_FD, luo_session_retrieve_fd,
+ struct liveupdate_session_retrieve_fd, token),
+};
+
+static long luo_session_ioctl(struct file *filep, unsigned int cmd,
+ unsigned long arg)
+{
+ struct luo_session *session = filep->private_data;
+ const struct luo_ioctl_op *op;
+ struct luo_ucmd ucmd = {};
+ union ucmd_buffer buf;
+ unsigned int nr;
+ int ret;
+
+ nr = _IOC_NR(cmd);
+ if (nr < LIVEUPDATE_CMD_SESSION_BASE || (nr - LIVEUPDATE_CMD_SESSION_BASE) >=
+ ARRAY_SIZE(luo_session_ioctl_ops)) {
+ return -EINVAL;
+ }
+
+ ucmd.ubuffer = (void __user *)arg;
+ ret = get_user(ucmd.user_size, (u32 __user *)ucmd.ubuffer);
+ if (ret)
+ return ret;
+
+ op = &luo_session_ioctl_ops[nr - LIVEUPDATE_CMD_SESSION_BASE];
+ if (op->ioctl_num != cmd)
+ return -ENOIOCTLCMD;
+ if (ucmd.user_size < op->min_size)
+ return -EINVAL;
+
+ ucmd.cmd = &buf;
+ ret = copy_struct_from_user(ucmd.cmd, op->size, ucmd.ubuffer,
+ ucmd.user_size);
+ if (ret)
+ return ret;
+
+ return op->execute(session, &ucmd);
+}
+
+static const struct file_operations luo_session_fops = {
+ .owner = THIS_MODULE,
+ .release = luo_session_release,
+ .unlocked_ioctl = luo_session_ioctl,
+};
+
+/* Create a "struct file" for session */
+static int luo_session_getfile(struct luo_session *session, struct file **filep)
+{
+ char name_buf[128];
+ struct file *file;
+
+ lockdep_assert_held(&session->mutex);
+ snprintf(name_buf, sizeof(name_buf), "[luo_session] %s", session->name);
+ file = anon_inode_getfile(name_buf, &luo_session_fops, session, O_RDWR);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+
+ *filep = file;
+
+ return 0;
+}
+
+int luo_session_create(const char *name, struct file **filep)
+{
+ struct luo_session *session;
+ int err;
+
+ session = luo_session_alloc(name);
+ if (IS_ERR(session))
+ return PTR_ERR(session);
+
+ err = luo_session_insert(&luo_session_global.outgoing, session);
+ if (err)
+ goto err_free;
+
+ scoped_guard(mutex, &session->mutex)
+ err = luo_session_getfile(session, filep);
+ if (err)
+ goto err_remove;
+
+ return 0;
+
+err_remove:
+ luo_session_remove(&luo_session_global.outgoing, session);
+err_free:
+ luo_session_free(session);
+
+ return err;
+}
+
+int luo_session_retrieve(const char *name, struct file **filep)
+{
+ struct luo_session_header *sh = &luo_session_global.incoming;
+ struct luo_session *session = NULL;
+ struct luo_session *it;
+ int err;
+
+ scoped_guard(rwsem_read, &sh->rwsem) {
+ list_for_each_entry(it, &sh->list, list) {
+ if (!strncmp(it->name, name, sizeof(it->name))) {
+ session = it;
+ break;
+ }
+ }
+ }
+
+ if (!session)
+ return -ENOENT;
+
+ guard(mutex)(&session->mutex);
+ if (session->retrieved)
+ return -EINVAL;
+
+ err = luo_session_getfile(session, filep);
+ if (!err)
+ session->retrieved = true;
+
+ return err;
+}
+
+int __init luo_session_setup_outgoing(void *fdt_out)
+{
+ struct luo_session_header_ser *header_ser;
+ u64 header_ser_pa;
+ int err;
+
+ header_ser = kho_alloc_preserve(LUO_SESSION_PGCNT << PAGE_SHIFT);
+ if (IS_ERR(header_ser))
+ return PTR_ERR(header_ser);
+ header_ser_pa = virt_to_phys(header_ser);
+
+ err = fdt_begin_node(fdt_out, LUO_FDT_SESSION_NODE_NAME);
+ err |= fdt_property_string(fdt_out, "compatible",
+ LUO_FDT_SESSION_COMPATIBLE);
+ err |= fdt_property(fdt_out, LUO_FDT_SESSION_HEADER, &header_ser_pa,
+ sizeof(header_ser_pa));
+ err |= fdt_end_node(fdt_out);
+
+ if (err)
+ goto err_unpreserve;
+
+ luo_session_global.outgoing.header_ser = header_ser;
+ luo_session_global.outgoing.ser = (void *)(header_ser + 1);
+ luo_session_global.outgoing.active = true;
+
+ return 0;
+
+err_unpreserve:
+ kho_unpreserve_free(header_ser);
+ return err;
+}
+
+int __init luo_session_setup_incoming(void *fdt_in)
+{
+ struct luo_session_header_ser *header_ser;
+ int err, header_size, offset;
+ u64 header_ser_pa;
+ const void *ptr;
+
+ offset = fdt_subnode_offset(fdt_in, 0, LUO_FDT_SESSION_NODE_NAME);
+ if (offset < 0) {
+ pr_err("Unable to get session node: [%s]\n",
+ LUO_FDT_SESSION_NODE_NAME);
+ return -EINVAL;
+ }
+
+ err = fdt_node_check_compatible(fdt_in, offset,
+ LUO_FDT_SESSION_COMPATIBLE);
+ if (err) {
+ pr_err("Session node incompatible [%s]\n",
+ LUO_FDT_SESSION_COMPATIBLE);
+ return -EINVAL;
+ }
+
+ header_size = 0;
+ ptr = fdt_getprop(fdt_in, offset, LUO_FDT_SESSION_HEADER, &header_size);
+ if (!ptr || header_size != sizeof(u64)) {
+ pr_err("Unable to get session header '%s' [%d]\n",
+ LUO_FDT_SESSION_HEADER, header_size);
+ return -EINVAL;
+ }
+
+ header_ser_pa = get_unaligned((u64 *)ptr);
+ header_ser = phys_to_virt(header_ser_pa);
+
+ luo_session_global.incoming.header_ser = header_ser;
+ luo_session_global.incoming.ser = (void *)(header_ser + 1);
+ luo_session_global.incoming.active = true;
+
+ return 0;
+}
+
+int luo_session_deserialize(void)
+{
+ struct luo_session_header *sh = &luo_session_global.incoming;
+ static bool is_deserialized;
+ static int err;
+
+ /* If has been deserialized, always return the same error code */
+ if (is_deserialized)
+ return err;
+
+ is_deserialized = true;
+ if (!sh->active)
+ return 0;
+
+ /*
+ * Note on error handling:
+ *
+ * If deserialization fails (e.g., allocation failure or corrupt data),
+ * we intentionally skip cleanup of sessions that were already restored.
+ *
+ * A partial failure leaves the preserved state inconsistent.
+ * Implementing a safe "undo" to unwind complex dependencies (sessions,
+ * files, hardware state) is error-prone and provides little value, as
+ * the system is effectively in a broken state.
+ *
+ * We treat these resources as leaked. The expected recovery path is for
+ * userspace to detect the failure and trigger a reboot, which will
+ * reliably reset devices and reclaim memory.
+ */
+ for (int i = 0; i < sh->header_ser->count; i++) {
+ struct luo_session *session;
+
+ session = luo_session_alloc(sh->ser[i].name);
+ if (IS_ERR(session)) {
+ pr_warn("Failed to allocate session [%s] during deserialization %pe\n",
+ sh->ser[i].name, session);
+ return PTR_ERR(session);
+ }
+
+ err = luo_session_insert(sh, session);
+ if (err) {
+ pr_warn("Failed to insert session [%s] %pe\n",
+ session->name, ERR_PTR(err));
+ luo_session_free(session);
+ return err;
+ }
+
+ scoped_guard(mutex, &session->mutex) {
+ luo_file_deserialize(&session->file_set,
+ &sh->ser[i].file_set_ser);
+ }
+ }
+
+ kho_restore_free(sh->header_ser);
+ sh->header_ser = NULL;
+ sh->ser = NULL;
+
+ return 0;
+}
+
+int luo_session_serialize(void)
+{
+ struct luo_session_header *sh = &luo_session_global.outgoing;
+ struct luo_session *session;
+ int i = 0;
+ int err;
+
+ guard(rwsem_write)(&sh->rwsem);
+ list_for_each_entry(session, &sh->list, list) {
+ err = luo_session_freeze_one(session, &sh->ser[i]);
+ if (err)
+ goto err_undo;
+
+ strscpy(sh->ser[i].name, session->name,
+ sizeof(sh->ser[i].name));
+ i++;
+ }
+ sh->header_ser->count = sh->count;
+
+ return 0;
+
+err_undo:
+ list_for_each_entry_continue_reverse(session, &sh->list, list) {
+ i--;
+ luo_session_unfreeze_one(session, &sh->ser[i]);
+ memset(sh->ser[i].name, 0, sizeof(sh->ser[i].name));
+ }
+
+ return err;
+}
+
+/**
+ * luo_session_quiesce - Ensure no active sessions exist and lock session lists.
+ *
+ * Acquires exclusive write locks on both incoming and outgoing session lists.
+ * It then validates no sessions exist in either list.
+ *
+ * This mechanism is used during file handler un/registration to ensure that no
+ * sessions are currently using the handler, and no new sessions can be created
+ * while un/registration is in progress.
+ *
+ * This prevents registering new handlers while sessions are active or
+ * while deserialization is in progress.
+ *
+ * Return:
+ * true - System is quiescent (0 sessions) and locked.
+ * false - Active sessions exist. The locks are released internally.
+ */
+bool luo_session_quiesce(void)
+{
+ down_write(&luo_session_global.incoming.rwsem);
+ down_write(&luo_session_global.outgoing.rwsem);
+
+ if (luo_session_global.incoming.count ||
+ luo_session_global.outgoing.count) {
+ up_write(&luo_session_global.outgoing.rwsem);
+ up_write(&luo_session_global.incoming.rwsem);
+ return false;
+ }
+
+ return true;
+}
+
+/**
+ * luo_session_resume - Unlock session lists and resume normal activity.
+ *
+ * Releases the exclusive locks acquired by a successful call to
+ * luo_session_quiesce().
+ */
+void luo_session_resume(void)
+{
+ up_write(&luo_session_global.outgoing.rwsem);
+ up_write(&luo_session_global.incoming.rwsem);
+}
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index de7a416cca2a..a114949eeed5 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -1,13 +1,21 @@
+# SPDX-License-Identifier: GPL-2.0
+# Any varying coverage in these files is non-deterministic
+# and is generally not a function of system call inputs.
+KCOV_INSTRUMENT := n
-obj-y += mutex.o semaphore.o rwsem.o
+obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
+
+# Avoid recursion lockdep -> sanitizer -> ... -> lockdep & improve performance.
+KASAN_SANITIZE_lockdep.o := n
+KCSAN_SANITIZE_lockdep.o := n
ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_lockdep_proc.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_mutex-debug.o = $(CC_FLAGS_FTRACE)
-CFLAGS_REMOVE_rtmutex-debug.o = $(CC_FLAGS_FTRACE)
endif
+obj-$(CONFIG_DEBUG_IRQFLAGS) += irqflag-debug.o
obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
obj-$(CONFIG_LOCKDEP) += lockdep.o
ifeq ($(CONFIG_PROC_FS),y)
@@ -15,15 +23,13 @@ obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
endif
obj-$(CONFIG_SMP) += spinlock.o
obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o
-obj-$(CONFIG_SMP) += lglock.o
obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
-obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
-obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
-obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
+obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o
+obj-$(CONFIG_RT_MUTEXES) += rtmutex_api.o
+obj-$(CONFIG_PREEMPT_RT) += spinlock_rt.o ww_rt_mutex.o
obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
-obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
-obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
-obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o
-obj-$(CONFIG_QUEUE_RWLOCK) += qrwlock.o
+obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o
obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
+obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o
+obj-$(CONFIG_LOCK_EVENT_COUNTS) += lock_events.o
diff --git a/kernel/locking/irqflag-debug.c b/kernel/locking/irqflag-debug.c
new file mode 100644
index 000000000000..810b50344d35
--- /dev/null
+++ b/kernel/locking/irqflag-debug.c
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/bug.h>
+#include <linux/export.h>
+#include <linux/irqflags.h>
+
+noinstr void warn_bogus_irq_restore(void)
+{
+ instrumentation_begin();
+ WARN_ONCE(1, "raw_local_irq_restore() called with IRQs enabled\n");
+ instrumentation_end();
+}
+EXPORT_SYMBOL(warn_bogus_irq_restore);
diff --git a/kernel/locking/lglock.c b/kernel/locking/lglock.c
deleted file mode 100644
index 86ae2aebf004..000000000000
--- a/kernel/locking/lglock.c
+++ /dev/null
@@ -1,89 +0,0 @@
-/* See include/linux/lglock.h for description */
-#include <linux/module.h>
-#include <linux/lglock.h>
-#include <linux/cpu.h>
-#include <linux/string.h>
-
-/*
- * Note there is no uninit, so lglocks cannot be defined in
- * modules (but it's fine to use them from there)
- * Could be added though, just undo lg_lock_init
- */
-
-void lg_lock_init(struct lglock *lg, char *name)
-{
- LOCKDEP_INIT_MAP(&lg->lock_dep_map, name, &lg->lock_key, 0);
-}
-EXPORT_SYMBOL(lg_lock_init);
-
-void lg_local_lock(struct lglock *lg)
-{
- arch_spinlock_t *lock;
-
- preempt_disable();
- lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
- lock = this_cpu_ptr(lg->lock);
- arch_spin_lock(lock);
-}
-EXPORT_SYMBOL(lg_local_lock);
-
-void lg_local_unlock(struct lglock *lg)
-{
- arch_spinlock_t *lock;
-
- lock_release(&lg->lock_dep_map, 1, _RET_IP_);
- lock = this_cpu_ptr(lg->lock);
- arch_spin_unlock(lock);
- preempt_enable();
-}
-EXPORT_SYMBOL(lg_local_unlock);
-
-void lg_local_lock_cpu(struct lglock *lg, int cpu)
-{
- arch_spinlock_t *lock;
-
- preempt_disable();
- lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
- lock = per_cpu_ptr(lg->lock, cpu);
- arch_spin_lock(lock);
-}
-EXPORT_SYMBOL(lg_local_lock_cpu);
-
-void lg_local_unlock_cpu(struct lglock *lg, int cpu)
-{
- arch_spinlock_t *lock;
-
- lock_release(&lg->lock_dep_map, 1, _RET_IP_);
- lock = per_cpu_ptr(lg->lock, cpu);
- arch_spin_unlock(lock);
- preempt_enable();
-}
-EXPORT_SYMBOL(lg_local_unlock_cpu);
-
-void lg_global_lock(struct lglock *lg)
-{
- int i;
-
- preempt_disable();
- lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
- for_each_possible_cpu(i) {
- arch_spinlock_t *lock;
- lock = per_cpu_ptr(lg->lock, i);
- arch_spin_lock(lock);
- }
-}
-EXPORT_SYMBOL(lg_global_lock);
-
-void lg_global_unlock(struct lglock *lg)
-{
- int i;
-
- lock_release(&lg->lock_dep_map, 1, _RET_IP_);
- for_each_possible_cpu(i) {
- arch_spinlock_t *lock;
- lock = per_cpu_ptr(lg->lock, i);
- arch_spin_unlock(lock);
- }
- preempt_enable();
-}
-EXPORT_SYMBOL(lg_global_unlock);
diff --git a/kernel/locking/lock_events.c b/kernel/locking/lock_events.c
new file mode 100644
index 000000000000..e68d82099558
--- /dev/null
+++ b/kernel/locking/lock_events.c
@@ -0,0 +1,179 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Authors: Waiman Long <waiman.long@hpe.com>
+ */
+
+/*
+ * Collect locking event counts
+ */
+#include <linux/debugfs.h>
+#include <linux/sched.h>
+#include <linux/sched/clock.h>
+#include <linux/fs.h>
+
+#include "lock_events.h"
+
+#undef LOCK_EVENT
+#define LOCK_EVENT(name) [LOCKEVENT_ ## name] = #name,
+
+#define LOCK_EVENTS_DIR "lock_event_counts"
+
+/*
+ * When CONFIG_LOCK_EVENT_COUNTS is enabled, event counts of different
+ * types of locks will be reported under the <debugfs>/lock_event_counts/
+ * directory. See lock_events_list.h for the list of available locking
+ * events.
+ *
+ * Writing to the special ".reset_counts" file will reset all the above
+ * locking event counts. This is a very slow operation and so should not
+ * be done frequently.
+ *
+ * These event counts are implemented as per-cpu variables which are
+ * summed and computed whenever the corresponding debugfs files are read. This
+ * minimizes added overhead making the counts usable even in a production
+ * environment.
+ */
+static const char * const lockevent_names[lockevent_num + 1] = {
+
+#include "lock_events_list.h"
+
+ [LOCKEVENT_reset_cnts] = ".reset_counts",
+};
+
+/*
+ * Per-cpu counts
+ */
+DEFINE_PER_CPU(unsigned long, lockevents[lockevent_num]);
+
+/*
+ * The lockevent_read() function can be overridden.
+ */
+ssize_t __weak lockevent_read(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ char buf[64];
+ int cpu, id, len;
+ u64 sum = 0;
+
+ /*
+ * Get the counter ID stored in file->f_inode->i_private
+ */
+ id = (long)file_inode(file)->i_private;
+
+ if (id >= lockevent_num)
+ return -EBADF;
+
+ for_each_possible_cpu(cpu)
+ sum += per_cpu(lockevents[id], cpu);
+ len = snprintf(buf, sizeof(buf) - 1, "%llu\n", sum);
+
+ return simple_read_from_buffer(user_buf, count, ppos, buf, len);
+}
+
+/*
+ * Function to handle write request
+ *
+ * When idx = reset_cnts, reset all the counts.
+ */
+static ssize_t lockevent_write(struct file *file, const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ int cpu;
+
+ /*
+ * Get the counter ID stored in file->f_inode->i_private
+ */
+ if ((long)file_inode(file)->i_private != LOCKEVENT_reset_cnts)
+ return count;
+
+ for_each_possible_cpu(cpu) {
+ int i;
+ unsigned long *ptr = per_cpu_ptr(lockevents, cpu);
+
+ for (i = 0 ; i < lockevent_num; i++)
+ WRITE_ONCE(ptr[i], 0);
+ }
+ return count;
+}
+
+/*
+ * Debugfs data structures
+ */
+static const struct file_operations fops_lockevent = {
+ .read = lockevent_read,
+ .write = lockevent_write,
+ .llseek = default_llseek,
+};
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+#include <asm/paravirt.h>
+
+static bool __init skip_lockevent(const char *name)
+{
+ static int pv_on __initdata = -1;
+
+ if (pv_on < 0)
+ pv_on = !pv_is_native_spin_unlock();
+ /*
+ * Skip PV qspinlock events on bare metal.
+ */
+ if (!pv_on && !memcmp(name, "pv_", 3))
+ return true;
+ return false;
+}
+#else
+static inline bool skip_lockevent(const char *name)
+{
+ return false;
+}
+#endif
+
+/*
+ * Initialize debugfs for the locking event counts.
+ */
+static int __init init_lockevent_counts(void)
+{
+ struct dentry *d_counts = debugfs_create_dir(LOCK_EVENTS_DIR, NULL);
+ int i;
+
+ if (IS_ERR(d_counts))
+ goto out;
+
+ /*
+ * Create the debugfs files
+ *
+ * As reading from and writing to the stat files can be slow, only
+ * root is allowed to do the read/write to limit impact to system
+ * performance.
+ */
+ for (i = 0; i < lockevent_num; i++) {
+ if (skip_lockevent(lockevent_names[i]))
+ continue;
+ if (IS_ERR(debugfs_create_file(lockevent_names[i], 0400, d_counts,
+ (void *)(long)i, &fops_lockevent)))
+ goto fail_undo;
+ }
+
+ if (IS_ERR(debugfs_create_file(lockevent_names[LOCKEVENT_reset_cnts], 0200,
+ d_counts, (void *)(long)LOCKEVENT_reset_cnts,
+ &fops_lockevent)))
+ goto fail_undo;
+
+ return 0;
+fail_undo:
+ debugfs_remove_recursive(d_counts);
+out:
+ pr_warn("Could not create '%s' debugfs entries\n", LOCK_EVENTS_DIR);
+ return -ENOMEM;
+}
+fs_initcall(init_lockevent_counts);
diff --git a/kernel/locking/lock_events.h b/kernel/locking/lock_events.h
new file mode 100644
index 000000000000..d2345e9c0190
--- /dev/null
+++ b/kernel/locking/lock_events.h
@@ -0,0 +1,64 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Authors: Waiman Long <longman@redhat.com>
+ */
+
+#ifndef __LOCKING_LOCK_EVENTS_H
+#define __LOCKING_LOCK_EVENTS_H
+
+enum lock_events {
+
+#include "lock_events_list.h"
+
+ lockevent_num, /* Total number of lock event counts */
+ LOCKEVENT_reset_cnts = lockevent_num,
+};
+
+#ifdef CONFIG_LOCK_EVENT_COUNTS
+/*
+ * Per-cpu counters
+ */
+DECLARE_PER_CPU(unsigned long, lockevents[lockevent_num]);
+
+/*
+ * Increment the statistical counters. use raw_cpu_inc() because of lower
+ * overhead and we don't care if we loose the occasional update.
+ */
+static inline void __lockevent_inc(enum lock_events event, bool cond)
+{
+ if (cond)
+ raw_cpu_inc(lockevents[event]);
+}
+
+#define lockevent_inc(ev) __lockevent_inc(LOCKEVENT_ ##ev, true)
+#define lockevent_cond_inc(ev, c) __lockevent_inc(LOCKEVENT_ ##ev, c)
+
+static inline void __lockevent_add(enum lock_events event, int inc)
+{
+ raw_cpu_add(lockevents[event], inc);
+}
+
+#define lockevent_add(ev, c) __lockevent_add(LOCKEVENT_ ##ev, c)
+
+#else /* CONFIG_LOCK_EVENT_COUNTS */
+
+#define lockevent_inc(ev)
+#define lockevent_add(ev, c) do { (void)(c); } while (0)
+#define lockevent_cond_inc(ev, c) do { (void)(c); } while (0)
+
+#endif /* CONFIG_LOCK_EVENT_COUNTS */
+
+ssize_t lockevent_read(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos);
+
+#endif /* __LOCKING_LOCK_EVENTS_H */
diff --git a/kernel/locking/lock_events_list.h b/kernel/locking/lock_events_list.h
new file mode 100644
index 000000000000..4e36258cc34f
--- /dev/null
+++ b/kernel/locking/lock_events_list.h
@@ -0,0 +1,102 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Authors: Waiman Long <longman@redhat.com>
+ */
+
+#ifndef LOCK_EVENT
+#define LOCK_EVENT(name) LOCKEVENT_ ## name,
+#endif
+
+#ifdef CONFIG_QUEUED_SPINLOCKS
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+/*
+ * Locking events for PV qspinlock.
+ */
+LOCK_EVENT(pv_hash_hops) /* Average # of hops per hashing operation */
+LOCK_EVENT(pv_kick_unlock) /* # of vCPU kicks issued at unlock time */
+LOCK_EVENT(pv_kick_wake) /* # of vCPU kicks for pv_latency_wake */
+LOCK_EVENT(pv_latency_kick) /* Average latency (ns) of vCPU kick */
+LOCK_EVENT(pv_latency_wake) /* Average latency (ns) of kick-to-wakeup */
+LOCK_EVENT(pv_lock_stealing) /* # of lock stealing operations */
+LOCK_EVENT(pv_spurious_wakeup) /* # of spurious wakeups in non-head vCPUs */
+LOCK_EVENT(pv_wait_again) /* # of wait's after queue head vCPU kick */
+LOCK_EVENT(pv_wait_early) /* # of early vCPU wait's */
+LOCK_EVENT(pv_wait_head) /* # of vCPU wait's at the queue head */
+LOCK_EVENT(pv_wait_node) /* # of vCPU wait's at non-head queue node */
+#endif /* CONFIG_PARAVIRT_SPINLOCKS */
+
+/*
+ * Locking events for qspinlock
+ *
+ * Subtracting lock_use_node[234] from lock_slowpath will give you
+ * lock_use_node1.
+ */
+LOCK_EVENT(lock_pending) /* # of locking ops via pending code */
+LOCK_EVENT(lock_slowpath) /* # of locking ops via MCS lock queue */
+LOCK_EVENT(lock_use_node2) /* # of locking ops that use 2nd percpu node */
+LOCK_EVENT(lock_use_node3) /* # of locking ops that use 3rd percpu node */
+LOCK_EVENT(lock_use_node4) /* # of locking ops that use 4th percpu node */
+LOCK_EVENT(lock_no_node) /* # of locking ops w/o using percpu node */
+#endif /* CONFIG_QUEUED_SPINLOCKS */
+
+/*
+ * Locking events for Resilient Queued Spin Lock
+ */
+LOCK_EVENT(rqspinlock_lock_timeout) /* # of locking ops that timeout */
+
+/*
+ * Locking events for rwsem
+ */
+LOCK_EVENT(rwsem_sleep_reader) /* # of reader sleeps */
+LOCK_EVENT(rwsem_sleep_writer) /* # of writer sleeps */
+LOCK_EVENT(rwsem_wake_reader) /* # of reader wakeups */
+LOCK_EVENT(rwsem_wake_writer) /* # of writer wakeups */
+LOCK_EVENT(rwsem_opt_lock) /* # of opt-acquired write locks */
+LOCK_EVENT(rwsem_opt_fail) /* # of failed optspins */
+LOCK_EVENT(rwsem_opt_nospin) /* # of disabled optspins */
+LOCK_EVENT(rwsem_rlock) /* # of read locks acquired */
+LOCK_EVENT(rwsem_rlock_steal) /* # of read locks by lock stealing */
+LOCK_EVENT(rwsem_rlock_fast) /* # of fast read locks acquired */
+LOCK_EVENT(rwsem_rlock_fail) /* # of failed read lock acquisitions */
+LOCK_EVENT(rwsem_rlock_handoff) /* # of read lock handoffs */
+LOCK_EVENT(rwsem_wlock) /* # of write locks acquired */
+LOCK_EVENT(rwsem_wlock_fail) /* # of failed write lock acquisitions */
+LOCK_EVENT(rwsem_wlock_handoff) /* # of write lock handoffs */
+
+/*
+ * Locking events for rtlock_slowlock()
+ */
+LOCK_EVENT(rtlock_slowlock) /* # of rtlock_slowlock() calls */
+LOCK_EVENT(rtlock_slow_acq1) /* # of locks acquired after wait_lock */
+LOCK_EVENT(rtlock_slow_acq2) /* # of locks acquired in for loop */
+LOCK_EVENT(rtlock_slow_sleep) /* # of sleeps */
+LOCK_EVENT(rtlock_slow_wake) /* # of wakeup's */
+
+/*
+ * Locking events for rt_mutex_slowlock()
+ */
+LOCK_EVENT(rtmutex_slowlock) /* # of rt_mutex_slowlock() calls */
+LOCK_EVENT(rtmutex_slow_block) /* # of rt_mutex_slowlock_block() calls */
+LOCK_EVENT(rtmutex_slow_acq1) /* # of locks acquired after wait_lock */
+LOCK_EVENT(rtmutex_slow_acq2) /* # of locks acquired at the end */
+LOCK_EVENT(rtmutex_slow_acq3) /* # of locks acquired in *block() */
+LOCK_EVENT(rtmutex_slow_sleep) /* # of sleeps */
+LOCK_EVENT(rtmutex_slow_wake) /* # of wakeup's */
+LOCK_EVENT(rtmutex_deadlock) /* # of rt_mutex_handle_deadlock()'s */
+
+/*
+ * Locking events for lockdep
+ */
+LOCK_EVENT(lockdep_acquire)
+LOCK_EVENT(lockdep_lock)
+LOCK_EVENT(lockdep_nocheck)
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index aaeae885d9af..2d4c5bab5af8 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kernel/lockdep.c
*
@@ -6,7 +7,7 @@
* Started by Ingo Molnar:
*
* Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
- * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
*
* this code maps all the lock dependencies as they occur in a live kernel
* and will warn about the following classes of locking bugs:
@@ -28,6 +29,9 @@
#define DISABLE_BRANCH_PROFILING
#include <linux/mutex.h>
#include <linux/sched.h>
+#include <linux/sched/clock.h>
+#include <linux/sched/task.h>
+#include <linux/sched/mm.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
@@ -42,31 +46,87 @@
#include <linux/hash.h>
#include <linux/ftrace.h>
#include <linux/stringify.h>
+#include <linux/bitmap.h>
#include <linux/bitops.h>
#include <linux/gfp.h>
-#include <linux/kmemcheck.h>
+#include <linux/random.h>
+#include <linux/jhash.h>
+#include <linux/nmi.h>
+#include <linux/rcupdate.h>
+#include <linux/kprobes.h>
+#include <linux/lockdep.h>
+#include <linux/context_tracking.h>
+#include <linux/console.h>
+#include <linux/kasan.h>
#include <asm/sections.h>
#include "lockdep_internals.h"
+#include "lock_events.h"
-#define CREATE_TRACE_POINTS
#include <trace/events/lock.h>
#ifdef CONFIG_PROVE_LOCKING
-int prove_locking = 1;
+static int prove_locking = 1;
module_param(prove_locking, int, 0644);
#else
#define prove_locking 0
#endif
#ifdef CONFIG_LOCK_STAT
-int lock_stat = 1;
+static int lock_stat = 1;
module_param(lock_stat, int, 0644);
#else
#define lock_stat 0
#endif
+#ifdef CONFIG_SYSCTL
+static const struct ctl_table kern_lockdep_table[] = {
+#ifdef CONFIG_PROVE_LOCKING
+ {
+ .procname = "prove_locking",
+ .data = &prove_locking,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif /* CONFIG_PROVE_LOCKING */
+#ifdef CONFIG_LOCK_STAT
+ {
+ .procname = "lock_stat",
+ .data = &lock_stat,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif /* CONFIG_LOCK_STAT */
+};
+
+static __init int kernel_lockdep_sysctls_init(void)
+{
+ register_sysctl_init("kernel", kern_lockdep_table);
+ return 0;
+}
+late_initcall(kernel_lockdep_sysctls_init);
+#endif /* CONFIG_SYSCTL */
+
+DEFINE_PER_CPU(unsigned int, lockdep_recursion);
+EXPORT_PER_CPU_SYMBOL_GPL(lockdep_recursion);
+
+static __always_inline bool lockdep_enabled(void)
+{
+ if (!debug_locks)
+ return false;
+
+ if (this_cpu_read(lockdep_recursion))
+ return false;
+
+ if (current->lockdep_recursion)
+ return false;
+
+ return true;
+}
+
/*
* lockdep_lock: protects the lockdep graph, the hashes and the
* class/list/hash allocators.
@@ -75,11 +135,44 @@ module_param(lock_stat, int, 0644);
* to use a raw spinlock - we really dont want the spinlock
* code to recurse back into the lockdep code...
*/
-static arch_spinlock_t lockdep_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
+static arch_spinlock_t __lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
+static struct task_struct *__owner;
+
+static inline void lockdep_lock(void)
+{
+ DEBUG_LOCKS_WARN_ON(!irqs_disabled());
+
+ __this_cpu_inc(lockdep_recursion);
+ arch_spin_lock(&__lock);
+ __owner = current;
+}
+
+static inline void lockdep_unlock(void)
+{
+ DEBUG_LOCKS_WARN_ON(!irqs_disabled());
+
+ if (debug_locks && DEBUG_LOCKS_WARN_ON(__owner != current))
+ return;
+
+ __owner = NULL;
+ arch_spin_unlock(&__lock);
+ __this_cpu_dec(lockdep_recursion);
+}
+
+#ifdef CONFIG_PROVE_LOCKING
+static inline bool lockdep_assert_locked(void)
+{
+ return DEBUG_LOCKS_WARN_ON(__owner != current);
+}
+#endif
+
+static struct task_struct *lockdep_selftest_task_struct;
+
static int graph_lock(void)
{
- arch_spin_lock(&lockdep_lock);
+ lockdep_lock();
+ lockevent_inc(lockdep_lock);
/*
* Make sure that if another CPU detected a bug while
* walking the graph we dont change it (while the other
@@ -87,27 +180,15 @@ static int graph_lock(void)
* dropped already)
*/
if (!debug_locks) {
- arch_spin_unlock(&lockdep_lock);
+ lockdep_unlock();
return 0;
}
- /* prevent any recursions within lockdep from causing deadlocks */
- current->lockdep_recursion++;
return 1;
}
-static inline int graph_unlock(void)
+static inline void graph_unlock(void)
{
- if (debug_locks && !arch_spin_is_locked(&lockdep_lock)) {
- /*
- * The lockdep graph lock isn't locked while we expect it to
- * be, we're confused now, bye!
- */
- return DEBUG_LOCKS_WARN_ON(1);
- }
-
- current->lockdep_recursion--;
- arch_spin_unlock(&lockdep_lock);
- return 0;
+ lockdep_unlock();
}
/*
@@ -118,40 +199,55 @@ static inline int debug_locks_off_graph_unlock(void)
{
int ret = debug_locks_off();
- arch_spin_unlock(&lockdep_lock);
+ lockdep_unlock();
return ret;
}
-static int lockdep_initialized;
-
unsigned long nr_list_entries;
static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES];
+static DECLARE_BITMAP(list_entries_in_use, MAX_LOCKDEP_ENTRIES);
/*
* All data structures here are protected by the global debug_lock.
*
- * Mutex key structs only get allocated, once during bootup, and never
- * get freed - this significantly simplifies the debugging code.
+ * nr_lock_classes is the number of elements of lock_classes[] that is
+ * in use.
*/
+#define KEYHASH_BITS (MAX_LOCKDEP_KEYS_BITS - 1)
+#define KEYHASH_SIZE (1UL << KEYHASH_BITS)
+static struct hlist_head lock_keys_hash[KEYHASH_SIZE];
unsigned long nr_lock_classes;
-static struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
+unsigned long nr_zapped_classes;
+unsigned long nr_dynamic_keys;
+unsigned long max_lock_class_idx;
+struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
+DECLARE_BITMAP(lock_classes_in_use, MAX_LOCKDEP_KEYS);
static inline struct lock_class *hlock_class(struct held_lock *hlock)
{
- if (!hlock->class_idx) {
+ unsigned int class_idx = hlock->class_idx;
+
+ /* Don't re-read hlock->class_idx, can't use READ_ONCE() on bitfield */
+ barrier();
+
+ if (!test_bit(class_idx, lock_classes_in_use)) {
/*
* Someone passed in garbage, we give up.
*/
DEBUG_LOCKS_WARN_ON(1);
return NULL;
}
- return lock_classes + hlock->class_idx - 1;
+
+ /*
+ * At this point, if the passed hlock->class_idx is still garbage,
+ * we just have to live with it
+ */
+ return lock_classes + class_idx;
}
#ifdef CONFIG_LOCK_STAT
-static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS],
- cpu_lock_stats);
+static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], cpu_lock_stats);
static inline u64 lockstat_clock(void)
{
@@ -201,33 +297,30 @@ static inline void lock_time_add(struct lock_time *src, struct lock_time *dst)
dst->nr += src->nr;
}
-struct lock_class_stats lock_stats(struct lock_class *class)
+void lock_stats(struct lock_class *class, struct lock_class_stats *stats)
{
- struct lock_class_stats stats;
int cpu, i;
- memset(&stats, 0, sizeof(struct lock_class_stats));
+ memset(stats, 0, sizeof(struct lock_class_stats));
for_each_possible_cpu(cpu) {
struct lock_class_stats *pcs =
&per_cpu(cpu_lock_stats, cpu)[class - lock_classes];
- for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++)
- stats.contention_point[i] += pcs->contention_point[i];
+ for (i = 0; i < ARRAY_SIZE(stats->contention_point); i++)
+ stats->contention_point[i] += pcs->contention_point[i];
- for (i = 0; i < ARRAY_SIZE(stats.contending_point); i++)
- stats.contending_point[i] += pcs->contending_point[i];
+ for (i = 0; i < ARRAY_SIZE(stats->contending_point); i++)
+ stats->contending_point[i] += pcs->contending_point[i];
- lock_time_add(&pcs->read_waittime, &stats.read_waittime);
- lock_time_add(&pcs->write_waittime, &stats.write_waittime);
+ lock_time_add(&pcs->read_waittime, &stats->read_waittime);
+ lock_time_add(&pcs->write_waittime, &stats->write_waittime);
- lock_time_add(&pcs->read_holdtime, &stats.read_holdtime);
- lock_time_add(&pcs->write_holdtime, &stats.write_holdtime);
+ lock_time_add(&pcs->read_holdtime, &stats->read_holdtime);
+ lock_time_add(&pcs->write_holdtime, &stats->write_holdtime);
- for (i = 0; i < ARRAY_SIZE(stats.bounces); i++)
- stats.bounces[i] += pcs->bounces[i];
+ for (i = 0; i < ARRAY_SIZE(stats->bounces); i++)
+ stats->bounces[i] += pcs->bounces[i];
}
-
- return stats;
}
void clear_lock_stats(struct lock_class *class)
@@ -246,12 +339,7 @@ void clear_lock_stats(struct lock_class *class)
static struct lock_class_stats *get_lock_stats(struct lock_class *class)
{
- return &get_cpu_var(cpu_lock_stats)[class - lock_classes];
-}
-
-static void put_lock_stats(struct lock_class_stats *stats)
-{
- put_cpu_var(cpu_lock_stats);
+ return &this_cpu_ptr(cpu_lock_stats)[class - lock_classes];
}
static void lock_release_holdtime(struct held_lock *hlock)
@@ -269,7 +357,6 @@ static void lock_release_holdtime(struct held_lock *hlock)
lock_time_inc(&stats->read_holdtime, holdtime);
else
lock_time_inc(&stats->write_holdtime, holdtime);
- put_lock_stats(stats);
}
#else
static inline void lock_release_holdtime(struct held_lock *hlock)
@@ -278,11 +365,42 @@ static inline void lock_release_holdtime(struct held_lock *hlock)
#endif
/*
- * We keep a global list of all lock classes. The list only grows,
- * never shrinks. The list is only accessed with the lockdep
- * spinlock lock held.
+ * We keep a global list of all lock classes. The list is only accessed with
+ * the lockdep spinlock lock held. free_lock_classes is a list with free
+ * elements. These elements are linked together by the lock_entry member in
+ * struct lock_class.
*/
-LIST_HEAD(all_lock_classes);
+static LIST_HEAD(all_lock_classes);
+static LIST_HEAD(free_lock_classes);
+
+/**
+ * struct pending_free - information about data structures about to be freed
+ * @zapped: Head of a list with struct lock_class elements.
+ * @lock_chains_being_freed: Bitmap that indicates which lock_chains[] elements
+ * are about to be freed.
+ */
+struct pending_free {
+ struct list_head zapped;
+ DECLARE_BITMAP(lock_chains_being_freed, MAX_LOCKDEP_CHAINS);
+};
+
+/**
+ * struct delayed_free - data structures used for delayed freeing
+ *
+ * A data structure for delayed freeing of data structures that may be
+ * accessed by RCU readers at the time these were freed.
+ *
+ * @rcu_head: Used to schedule an RCU callback for freeing data structures.
+ * @index: Index of @pf to which freed data structures are added.
+ * @scheduled: Whether or not an RCU callback has been scheduled.
+ * @pf: Array with information about data structures about to be freed.
+ */
+static struct delayed_free {
+ struct rcu_head rcu_head;
+ int index;
+ int scheduled;
+ struct pending_free pf[2];
+} delayed_free;
/*
* The lockdep classes are in a hash-table as well, for fast lookup:
@@ -292,7 +410,7 @@ LIST_HEAD(all_lock_classes);
#define __classhashfn(key) hash_long((unsigned long)key, CLASSHASH_BITS)
#define classhashentry(key) (classhash_table + __classhashfn((key)))
-static struct list_head classhash_table[CLASSHASH_SIZE];
+static struct hlist_head classhash_table[CLASSHASH_SIZE];
/*
* We put the lock dependency chains into a hash-table as well, to cache
@@ -303,7 +421,22 @@ static struct list_head classhash_table[CLASSHASH_SIZE];
#define __chainhashfn(chain) hash_long(chain, CHAINHASH_BITS)
#define chainhashentry(chain) (chainhash_table + __chainhashfn((chain)))
-static struct list_head chainhash_table[CHAINHASH_SIZE];
+static struct hlist_head chainhash_table[CHAINHASH_SIZE];
+
+/*
+ * the id of held_lock
+ */
+static inline u16 hlock_id(struct held_lock *hlock)
+{
+ BUILD_BUG_ON(MAX_LOCKDEP_KEYS_BITS + 2 > 16);
+
+ return (hlock->class_idx | (hlock->read << MAX_LOCKDEP_KEYS_BITS));
+}
+
+static inline __maybe_unused unsigned int chain_hlock_class_idx(u16 hlock_id)
+{
+ return hlock_id & (MAX_LOCKDEP_KEYS - 1);
+}
/*
* The hash key of the lock dependency chains is a hash itself too:
@@ -311,22 +444,37 @@ static struct list_head chainhash_table[CHAINHASH_SIZE];
* It's a 64-bit hash, because it's important for the keys to be
* unique.
*/
-#define iterate_chain_key(key1, key2) \
- (((key1) << MAX_LOCKDEP_KEYS_BITS) ^ \
- ((key1) >> (64-MAX_LOCKDEP_KEYS_BITS)) ^ \
- (key2))
+static inline u64 iterate_chain_key(u64 key, u32 idx)
+{
+ u32 k0 = key, k1 = key >> 32;
+
+ __jhash_mix(idx, k0, k1); /* Macro that modifies arguments! */
+
+ return k0 | (u64)k1 << 32;
+}
-void lockdep_off(void)
+void lockdep_init_task(struct task_struct *task)
{
- current->lockdep_recursion++;
+ task->lockdep_depth = 0; /* no locks held yet */
+ task->curr_chain_key = INITIAL_CHAIN_KEY;
+ task->lockdep_recursion = 0;
}
-EXPORT_SYMBOL(lockdep_off);
-void lockdep_on(void)
+static __always_inline void lockdep_recursion_inc(void)
{
- current->lockdep_recursion--;
+ __this_cpu_inc(lockdep_recursion);
+}
+
+static __always_inline void lockdep_recursion_finish(void)
+{
+ if (WARN_ON_ONCE(__this_cpu_dec_return(lockdep_recursion)))
+ __this_cpu_write(lockdep_recursion, 0);
+}
+
+void lockdep_set_selftest_task(struct task_struct *task)
+{
+ lockdep_selftest_task_struct = task;
}
-EXPORT_SYMBOL(lockdep_on);
/*
* Debugging switches:
@@ -338,14 +486,12 @@ EXPORT_SYMBOL(lockdep_on);
#if VERBOSE
# define HARDIRQ_VERBOSE 1
# define SOFTIRQ_VERBOSE 1
-# define RECLAIM_VERBOSE 1
#else
# define HARDIRQ_VERBOSE 0
# define SOFTIRQ_VERBOSE 0
-# define RECLAIM_VERBOSE 0
#endif
-#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE || RECLAIM_VERBOSE
+#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE
/*
* Quick filtering for interesting events:
*/
@@ -373,13 +519,6 @@ static int verbose(struct lock_class *class)
return 0;
}
-/*
- * Stack-trace: tightly packed array of stack backtrace
- * addresses. Protected by the graph_lock.
- */
-unsigned long nr_stack_trace_entries;
-static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES];
-
static void print_lockdep_off(const char *bug_msg)
{
printk(KERN_DEBUG "%s\n", bug_msg);
@@ -389,44 +528,108 @@ static void print_lockdep_off(const char *bug_msg)
#endif
}
-static int save_trace(struct stack_trace *trace)
-{
- trace->nr_entries = 0;
- trace->max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries;
- trace->entries = stack_trace + nr_stack_trace_entries;
+unsigned long nr_stack_trace_entries;
- trace->skip = 3;
+#ifdef CONFIG_PROVE_LOCKING
+/**
+ * struct lock_trace - single stack backtrace
+ * @hash_entry: Entry in a stack_trace_hash[] list.
+ * @hash: jhash() of @entries.
+ * @nr_entries: Number of entries in @entries.
+ * @entries: Actual stack backtrace.
+ */
+struct lock_trace {
+ struct hlist_node hash_entry;
+ u32 hash;
+ u32 nr_entries;
+ unsigned long entries[] __aligned(sizeof(unsigned long));
+};
+#define LOCK_TRACE_SIZE_IN_LONGS \
+ (sizeof(struct lock_trace) / sizeof(unsigned long))
+/*
+ * Stack-trace: sequence of lock_trace structures. Protected by the graph_lock.
+ */
+static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES];
+static struct hlist_head stack_trace_hash[STACK_TRACE_HASH_SIZE];
- save_stack_trace(trace);
+static bool traces_identical(struct lock_trace *t1, struct lock_trace *t2)
+{
+ return t1->hash == t2->hash && t1->nr_entries == t2->nr_entries &&
+ memcmp(t1->entries, t2->entries,
+ t1->nr_entries * sizeof(t1->entries[0])) == 0;
+}
- /*
- * Some daft arches put -1 at the end to indicate its a full trace.
- *
- * <rant> this is buggy anyway, since it takes a whole extra entry so a
- * complete trace that maxes out the entries provided will be reported
- * as incomplete, friggin useless </rant>
- */
- if (trace->nr_entries != 0 &&
- trace->entries[trace->nr_entries-1] == ULONG_MAX)
- trace->nr_entries--;
+static struct lock_trace *save_trace(void)
+{
+ struct lock_trace *trace, *t2;
+ struct hlist_head *hash_head;
+ u32 hash;
+ int max_entries;
- trace->max_entries = trace->nr_entries;
+ BUILD_BUG_ON_NOT_POWER_OF_2(STACK_TRACE_HASH_SIZE);
+ BUILD_BUG_ON(LOCK_TRACE_SIZE_IN_LONGS >= MAX_STACK_TRACE_ENTRIES);
- nr_stack_trace_entries += trace->nr_entries;
+ trace = (struct lock_trace *)(stack_trace + nr_stack_trace_entries);
+ max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries -
+ LOCK_TRACE_SIZE_IN_LONGS;
- if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) {
+ if (max_entries <= 0) {
if (!debug_locks_off_graph_unlock())
- return 0;
+ return NULL;
+ nbcon_cpu_emergency_enter();
print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!");
dump_stack();
+ nbcon_cpu_emergency_exit();
- return 0;
+ return NULL;
}
+ trace->nr_entries = stack_trace_save(trace->entries, max_entries, 3);
+
+ hash = jhash(trace->entries, trace->nr_entries *
+ sizeof(trace->entries[0]), 0);
+ trace->hash = hash;
+ hash_head = stack_trace_hash + (hash & (STACK_TRACE_HASH_SIZE - 1));
+ hlist_for_each_entry(t2, hash_head, hash_entry) {
+ if (traces_identical(trace, t2))
+ return t2;
+ }
+ nr_stack_trace_entries += LOCK_TRACE_SIZE_IN_LONGS + trace->nr_entries;
+ hlist_add_head(&trace->hash_entry, hash_head);
- return 1;
+ return trace;
}
+/* Return the number of stack traces in the stack_trace[] array. */
+u64 lockdep_stack_trace_count(void)
+{
+ struct lock_trace *trace;
+ u64 c = 0;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(stack_trace_hash); i++) {
+ hlist_for_each_entry(trace, &stack_trace_hash[i], hash_entry) {
+ c++;
+ }
+ }
+
+ return c;
+}
+
+/* Return the number of stack hash chains that have at least one stack trace. */
+u64 lockdep_stack_hash_count(void)
+{
+ u64 c = 0;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(stack_trace_hash); i++)
+ if (!hlist_empty(&stack_trace_hash[i]))
+ c++;
+
+ return c;
+}
+#endif
+
unsigned int nr_hardirq_chains;
unsigned int nr_softirq_chains;
unsigned int nr_process_chains;
@@ -434,24 +637,12 @@ unsigned int max_lockdep_depth;
#ifdef CONFIG_DEBUG_LOCKDEP
/*
- * We cannot printk in early bootup code. Not even early_printk()
- * might work. So we mark any initialization errors and printk
- * about it later on, in lockdep_info().
- */
-static int lockdep_init_error;
-static const char *lock_init_error;
-static unsigned long lockdep_init_trace_data[20];
-static struct stack_trace lockdep_init_trace = {
- .max_entries = ARRAY_SIZE(lockdep_init_trace_data),
- .entries = lockdep_init_trace_data,
-};
-
-/*
* Various lockdep statistics:
*/
DEFINE_PER_CPU(struct lockdep_stats, lockdep_stats);
#endif
+#ifdef CONFIG_PROVE_LOCKING
/*
* Locking printouts:
*/
@@ -468,9 +659,13 @@ static const char *usage_str[] =
#include "lockdep_states.h"
#undef LOCKDEP_STATE
[LOCK_USED] = "INITIAL USE",
+ [LOCK_USED_READ] = "INITIAL READ USE",
+ /* abused as string storage for verify_lock_unused() */
+ [LOCK_USAGE_STATES] = "IN-NMI",
};
+#endif
-const char * __get_key_name(struct lockdep_subclass_key *key, char *str)
+const char *__get_key_name(const struct lockdep_subclass_key *key, char *str)
{
return kallsyms_lookup((unsigned long)key, NULL, NULL, NULL, str);
}
@@ -482,15 +677,26 @@ static inline unsigned long lock_flag(enum lock_usage_bit bit)
static char get_usage_char(struct lock_class *class, enum lock_usage_bit bit)
{
+ /*
+ * The usage character defaults to '.' (i.e., irqs disabled and not in
+ * irq context), which is the safest usage category.
+ */
char c = '.';
- if (class->usage_mask & lock_flag(bit + 2))
+ /*
+ * The order of the following usage checks matters, which will
+ * result in the outcome character as follows:
+ *
+ * - '+': irq is enabled and not in irq context
+ * - '-': in irq context and irq is disabled
+ * - '?': in irq context and irq is enabled
+ */
+ if (class->usage_mask & lock_flag(bit + LOCK_USAGE_DIR_MASK)) {
c = '+';
- if (class->usage_mask & lock_flag(bit)) {
- c = '-';
- if (class->usage_mask & lock_flag(bit + 2))
+ if (class->usage_mask & lock_flag(bit))
c = '?';
- }
+ } else if (class->usage_mask & lock_flag(bit))
+ c = '-';
return c;
}
@@ -508,7 +714,7 @@ void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS])
usage[i] = '\0';
}
-static void __print_lock_name(struct lock_class *class)
+static void __print_lock_name(struct held_lock *hlock, struct lock_class *class)
{
char str[KSYM_NAME_LEN];
const char *name;
@@ -516,25 +722,29 @@ static void __print_lock_name(struct lock_class *class)
name = class->name;
if (!name) {
name = __get_key_name(class->key, str);
- printk("%s", name);
+ printk(KERN_CONT "%s", name);
} else {
- printk("%s", name);
+ printk(KERN_CONT "%s", name);
if (class->name_version > 1)
- printk("#%d", class->name_version);
+ printk(KERN_CONT "#%d", class->name_version);
if (class->subclass)
- printk("/%d", class->subclass);
+ printk(KERN_CONT "/%d", class->subclass);
+ if (hlock && class->print_fn)
+ class->print_fn(hlock->instance);
}
}
-static void print_lock_name(struct lock_class *class)
+static void print_lock_name(struct held_lock *hlock, struct lock_class *class)
{
char usage[LOCK_USAGE_CHARS];
get_usage_chars(class, usage);
- printk(" (");
- __print_lock_name(class);
- printk("){%s}", usage);
+ printk(KERN_CONT " (");
+ __print_lock_name(hlock, class);
+ printk(KERN_CONT "){%s}-{%d:%d}", usage,
+ class->wait_type_outer ?: class->wait_type_inner,
+ class->wait_type_inner);
}
static void print_lockdep_cache(struct lockdep_map *lock)
@@ -546,7 +756,7 @@ static void print_lockdep_cache(struct lockdep_map *lock)
if (!name)
name = __get_key_name(lock->key->subkeys, str);
- printk("%s", name);
+ printk(KERN_CONT "%s", name);
}
static void print_lock(struct held_lock *hlock)
@@ -554,36 +764,43 @@ static void print_lock(struct held_lock *hlock)
/*
* We can be called locklessly through debug_show_all_locks() so be
* extra careful, the hlock might have been released and cleared.
+ *
+ * If this indeed happens, lets pretend it does not hurt to continue
+ * to print the lock unless the hlock class_idx does not point to a
+ * registered class. The rationale here is: since we don't attempt
+ * to distinguish whether we are in this situation, if it just
+ * happened we can't count on class_idx to tell either.
*/
- unsigned int class_idx = hlock->class_idx;
-
- /* Don't re-read hlock->class_idx, can't use READ_ONCE() on bitfields: */
- barrier();
+ struct lock_class *lock = hlock_class(hlock);
- if (!class_idx || (class_idx - 1) >= MAX_LOCKDEP_KEYS) {
- printk("<RELEASED>\n");
+ if (!lock) {
+ printk(KERN_CONT "<RELEASED>\n");
return;
}
- print_lock_name(lock_classes + class_idx - 1);
- printk(", at: ");
- print_ip_sym(hlock->acquire_ip);
+ printk(KERN_CONT "%px", hlock->instance);
+ print_lock_name(hlock, lock);
+ printk(KERN_CONT ", at: %pS\n", (void *)hlock->acquire_ip);
}
-static void lockdep_print_held_locks(struct task_struct *curr)
+static void lockdep_print_held_locks(struct task_struct *p)
{
- int i, depth = curr->lockdep_depth;
+ int i, depth = READ_ONCE(p->lockdep_depth);
- if (!depth) {
- printk("no locks held by %s/%d.\n", curr->comm, task_pid_nr(curr));
+ if (!depth)
+ printk("no locks held by %s/%d.\n", p->comm, task_pid_nr(p));
+ else
+ printk("%d lock%s held by %s/%d:\n", depth,
+ str_plural(depth), p->comm, task_pid_nr(p));
+ /*
+ * It's not reliable to print a task's held locks if it's not sleeping
+ * and it's not the current task.
+ */
+ if (p != current && task_is_running(p))
return;
- }
- printk("%d lock%s held by %s/%d:\n",
- depth, depth > 1 ? "s" : "", curr->comm, task_pid_nr(curr));
-
for (i = 0; i < depth; i++) {
printk(" #%d: ", i);
- print_lock(curr->held_locks + i);
+ print_lock(p->held_locks + i);
}
}
@@ -607,19 +824,26 @@ static int very_verbose(struct lock_class *class)
* Is this the address of a static object:
*/
#ifdef __KERNEL__
-static int static_obj(void *obj)
+static int static_obj(const void *obj)
{
- unsigned long start = (unsigned long) &_stext,
- end = (unsigned long) &_end,
- addr = (unsigned long) obj;
+ unsigned long addr = (unsigned long) obj;
+
+ if (is_kernel_core_data(addr))
+ return 1;
/*
- * static variable?
+ * keys are allowed in the __ro_after_init section.
*/
- if ((addr >= start) && (addr < end))
+ if (is_kernel_rodata(addr))
return 1;
- if (arch_is_kernel_data(addr))
+ /*
+ * in initdata section and used during bootup only?
+ * NOTE: On some platforms the initdata section is
+ * outside of the _stext ... _end range.
+ */
+ if (system_state < SYSTEM_FREEING_INITMEM &&
+ init_section_contains((void *)addr, 1))
return 1;
/*
@@ -637,7 +861,8 @@ static int static_obj(void *obj)
/*
* To make lock name printouts unique, we calculate a unique
- * class->name_version generation counter:
+ * class->name_version generation counter. The caller must hold the graph
+ * lock.
*/
static int count_matching_names(struct lock_class *new_class)
{
@@ -647,7 +872,7 @@ static int count_matching_names(struct lock_class *new_class)
if (!new_class->name)
return 0;
- list_for_each_entry_rcu(class, &all_lock_classes, lock_entry) {
+ list_for_each_entry(class, &all_lock_classes, lock_entry) {
if (new_class->key - new_class->subclass == class->key)
return class->name_version;
if (class->name && !strcmp(class->name, new_class->name))
@@ -657,48 +882,34 @@ static int count_matching_names(struct lock_class *new_class)
return count + 1;
}
-/*
- * Register a lock's class in the hash-table, if the class is not present
- * yet. Otherwise we look it up. We cache the result in the lock object
- * itself, so actual lookup of the hash should be once per lock object.
- */
-static inline struct lock_class *
-look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
+/* used from NMI context -- must be lockless */
+static noinstr struct lock_class *
+look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass)
{
struct lockdep_subclass_key *key;
- struct list_head *hash_head;
+ struct hlist_head *hash_head;
struct lock_class *class;
-#ifdef CONFIG_DEBUG_LOCKDEP
- /*
- * If the architecture calls into lockdep before initializing
- * the hashes then we'll warn about it later. (we cannot printk
- * right now)
- */
- if (unlikely(!lockdep_initialized)) {
- lockdep_init();
- lockdep_init_error = 1;
- lock_init_error = lock->name;
- save_stack_trace(&lockdep_init_trace);
- }
-#endif
-
if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
+ instrumentation_begin();
debug_locks_off();
+ nbcon_cpu_emergency_enter();
printk(KERN_ERR
"BUG: looking up invalid subclass: %u\n", subclass);
printk(KERN_ERR
"turning off the locking correctness validator.\n");
dump_stack();
+ nbcon_cpu_emergency_exit();
+ instrumentation_end();
return NULL;
}
/*
- * Static locks do not have their class-keys yet - for them the key
- * is the lock object itself:
+ * If it is not initialised then it has never been locked,
+ * so it won't be present in the hash table.
*/
if (unlikely(!lock->key))
- lock->key = (void *)lock;
+ return NULL;
/*
* NOTE: the class-key must be unique. For dynamic locks, a static
@@ -719,13 +930,16 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
return NULL;
- list_for_each_entry_rcu(class, hash_head, hash_entry) {
+ hlist_for_each_entry_rcu_notrace(class, hash_head, hash_entry) {
if (class->key == key) {
/*
* Huh! same key, different name? Did someone trample
* on some memory? We're most confused.
*/
- WARN_ON_ONCE(class->name != lock->name);
+ WARN_ONCE(class->name != lock->name &&
+ lock->key != &__lockdep_no_validate__,
+ "Looking for class \"%s\" with key %ps, but found a different class \"%s\" with the same key\n",
+ lock->name, lock->key, class->name);
return class;
}
}
@@ -734,16 +948,346 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
}
/*
+ * Static locks do not have their class-keys yet - for them the key is
+ * the lock object itself. If the lock is in the per cpu area, the
+ * canonical address of the lock (per cpu offset removed) is used.
+ */
+static bool assign_lock_key(struct lockdep_map *lock)
+{
+ unsigned long can_addr, addr = (unsigned long)lock;
+
+#ifdef __KERNEL__
+ /*
+ * lockdep_free_key_range() assumes that struct lock_class_key
+ * objects do not overlap. Since we use the address of lock
+ * objects as class key for static objects, check whether the
+ * size of lock_class_key objects does not exceed the size of
+ * the smallest lock object.
+ */
+ BUILD_BUG_ON(sizeof(struct lock_class_key) > sizeof(raw_spinlock_t));
+#endif
+
+ if (__is_kernel_percpu_address(addr, &can_addr))
+ lock->key = (void *)can_addr;
+ else if (__is_module_percpu_address(addr, &can_addr))
+ lock->key = (void *)can_addr;
+ else if (static_obj(lock))
+ lock->key = (void *)lock;
+ else {
+ /* Debug-check: all keys must be persistent! */
+ debug_locks_off();
+ nbcon_cpu_emergency_enter();
+ pr_err("INFO: trying to register non-static key.\n");
+ pr_err("The code is fine but needs lockdep annotation, or maybe\n");
+ pr_err("you didn't initialize this object before use?\n");
+ pr_err("turning off the locking correctness validator.\n");
+ dump_stack();
+ nbcon_cpu_emergency_exit();
+ return false;
+ }
+
+ return true;
+}
+
+#ifdef CONFIG_DEBUG_LOCKDEP
+
+/* Check whether element @e occurs in list @h */
+static bool in_list(struct list_head *e, struct list_head *h)
+{
+ struct list_head *f;
+
+ list_for_each(f, h) {
+ if (e == f)
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Check whether entry @e occurs in any of the locks_after or locks_before
+ * lists.
+ */
+static bool in_any_class_list(struct list_head *e)
+{
+ struct lock_class *class;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(lock_classes); i++) {
+ class = &lock_classes[i];
+ if (in_list(e, &class->locks_after) ||
+ in_list(e, &class->locks_before))
+ return true;
+ }
+ return false;
+}
+
+static bool class_lock_list_valid(struct lock_class *c, struct list_head *h)
+{
+ struct lock_list *e;
+
+ list_for_each_entry(e, h, entry) {
+ if (e->links_to != c) {
+ printk(KERN_INFO "class %s: mismatch for lock entry %ld; class %s <> %s",
+ c->name ? : "(?)",
+ (unsigned long)(e - list_entries),
+ e->links_to && e->links_to->name ?
+ e->links_to->name : "(?)",
+ e->class && e->class->name ? e->class->name :
+ "(?)");
+ return false;
+ }
+ }
+ return true;
+}
+
+#ifdef CONFIG_PROVE_LOCKING
+static u16 chain_hlocks[MAX_LOCKDEP_CHAIN_HLOCKS];
+#endif
+
+static bool check_lock_chain_key(struct lock_chain *chain)
+{
+#ifdef CONFIG_PROVE_LOCKING
+ u64 chain_key = INITIAL_CHAIN_KEY;
+ int i;
+
+ for (i = chain->base; i < chain->base + chain->depth; i++)
+ chain_key = iterate_chain_key(chain_key, chain_hlocks[i]);
+ /*
+ * The 'unsigned long long' casts avoid that a compiler warning
+ * is reported when building tools/lib/lockdep.
+ */
+ if (chain->chain_key != chain_key) {
+ printk(KERN_INFO "chain %lld: key %#llx <> %#llx\n",
+ (unsigned long long)(chain - lock_chains),
+ (unsigned long long)chain->chain_key,
+ (unsigned long long)chain_key);
+ return false;
+ }
+#endif
+ return true;
+}
+
+static bool in_any_zapped_class_list(struct lock_class *class)
+{
+ struct pending_free *pf;
+ int i;
+
+ for (i = 0, pf = delayed_free.pf; i < ARRAY_SIZE(delayed_free.pf); i++, pf++) {
+ if (in_list(&class->lock_entry, &pf->zapped))
+ return true;
+ }
+
+ return false;
+}
+
+static bool __check_data_structures(void)
+{
+ struct lock_class *class;
+ struct lock_chain *chain;
+ struct hlist_head *head;
+ struct lock_list *e;
+ int i;
+
+ /* Check whether all classes occur in a lock list. */
+ for (i = 0; i < ARRAY_SIZE(lock_classes); i++) {
+ class = &lock_classes[i];
+ if (!in_list(&class->lock_entry, &all_lock_classes) &&
+ !in_list(&class->lock_entry, &free_lock_classes) &&
+ !in_any_zapped_class_list(class)) {
+ printk(KERN_INFO "class %px/%s is not in any class list\n",
+ class, class->name ? : "(?)");
+ return false;
+ }
+ }
+
+ /* Check whether all classes have valid lock lists. */
+ for (i = 0; i < ARRAY_SIZE(lock_classes); i++) {
+ class = &lock_classes[i];
+ if (!class_lock_list_valid(class, &class->locks_before))
+ return false;
+ if (!class_lock_list_valid(class, &class->locks_after))
+ return false;
+ }
+
+ /* Check the chain_key of all lock chains. */
+ for (i = 0; i < ARRAY_SIZE(chainhash_table); i++) {
+ head = chainhash_table + i;
+ hlist_for_each_entry_rcu(chain, head, entry) {
+ if (!check_lock_chain_key(chain))
+ return false;
+ }
+ }
+
+ /*
+ * Check whether all list entries that are in use occur in a class
+ * lock list.
+ */
+ for_each_set_bit(i, list_entries_in_use, ARRAY_SIZE(list_entries)) {
+ e = list_entries + i;
+ if (!in_any_class_list(&e->entry)) {
+ printk(KERN_INFO "list entry %d is not in any class list; class %s <> %s\n",
+ (unsigned int)(e - list_entries),
+ e->class->name ? : "(?)",
+ e->links_to->name ? : "(?)");
+ return false;
+ }
+ }
+
+ /*
+ * Check whether all list entries that are not in use do not occur in
+ * a class lock list.
+ */
+ for_each_clear_bit(i, list_entries_in_use, ARRAY_SIZE(list_entries)) {
+ e = list_entries + i;
+ if (in_any_class_list(&e->entry)) {
+ printk(KERN_INFO "list entry %d occurs in a class list; class %s <> %s\n",
+ (unsigned int)(e - list_entries),
+ e->class && e->class->name ? e->class->name :
+ "(?)",
+ e->links_to && e->links_to->name ?
+ e->links_to->name : "(?)");
+ return false;
+ }
+ }
+
+ return true;
+}
+
+int check_consistency = 0;
+module_param(check_consistency, int, 0644);
+
+static void check_data_structures(void)
+{
+ static bool once = false;
+
+ if (check_consistency && !once) {
+ if (!__check_data_structures()) {
+ once = true;
+ WARN_ON(once);
+ }
+ }
+}
+
+#else /* CONFIG_DEBUG_LOCKDEP */
+
+static inline void check_data_structures(void) { }
+
+#endif /* CONFIG_DEBUG_LOCKDEP */
+
+static void init_chain_block_buckets(void);
+
+/*
+ * Initialize the lock_classes[] array elements, the free_lock_classes list
+ * and also the delayed_free structure.
+ */
+static void init_data_structures_once(void)
+{
+ static bool __read_mostly ds_initialized, rcu_head_initialized;
+ int i;
+
+ if (likely(rcu_head_initialized))
+ return;
+
+ if (system_state >= SYSTEM_SCHEDULING) {
+ init_rcu_head(&delayed_free.rcu_head);
+ rcu_head_initialized = true;
+ }
+
+ if (ds_initialized)
+ return;
+
+ ds_initialized = true;
+
+ INIT_LIST_HEAD(&delayed_free.pf[0].zapped);
+ INIT_LIST_HEAD(&delayed_free.pf[1].zapped);
+
+ for (i = 0; i < ARRAY_SIZE(lock_classes); i++) {
+ list_add_tail(&lock_classes[i].lock_entry, &free_lock_classes);
+ INIT_LIST_HEAD(&lock_classes[i].locks_after);
+ INIT_LIST_HEAD(&lock_classes[i].locks_before);
+ }
+ init_chain_block_buckets();
+}
+
+static inline struct hlist_head *keyhashentry(const struct lock_class_key *key)
+{
+ unsigned long hash = hash_long((uintptr_t)key, KEYHASH_BITS);
+
+ return lock_keys_hash + hash;
+}
+
+/* Register a dynamically allocated key. */
+void lockdep_register_key(struct lock_class_key *key)
+{
+ struct hlist_head *hash_head;
+ struct lock_class_key *k;
+ unsigned long flags;
+
+ if (WARN_ON_ONCE(static_obj(key)))
+ return;
+ hash_head = keyhashentry(key);
+
+ raw_local_irq_save(flags);
+ if (!graph_lock())
+ goto restore_irqs;
+ hlist_for_each_entry_rcu(k, hash_head, hash_entry) {
+ if (WARN_ON_ONCE(k == key))
+ goto out_unlock;
+ }
+ hlist_add_head_rcu(&key->hash_entry, hash_head);
+ nr_dynamic_keys++;
+out_unlock:
+ graph_unlock();
+restore_irqs:
+ raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lockdep_register_key);
+
+/* Check whether a key has been registered as a dynamic key. */
+static bool is_dynamic_key(const struct lock_class_key *key)
+{
+ struct hlist_head *hash_head;
+ struct lock_class_key *k;
+ bool found = false;
+
+ if (WARN_ON_ONCE(static_obj(key)))
+ return false;
+
+ /*
+ * If lock debugging is disabled lock_keys_hash[] may contain
+ * pointers to memory that has already been freed. Avoid triggering
+ * a use-after-free in that case by returning early.
+ */
+ if (!debug_locks)
+ return true;
+
+ hash_head = keyhashentry(key);
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(k, hash_head, hash_entry) {
+ if (k == key) {
+ found = true;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return found;
+}
+
+/*
* Register a lock's class in the hash-table, if the class is not present
* yet. Otherwise we look it up. We cache the result in the lock object
* itself, so actual lookup of the hash should be once per lock object.
*/
-static inline struct lock_class *
+static struct lock_class *
register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
{
struct lockdep_subclass_key *key;
- struct list_head *hash_head;
+ struct hlist_head *hash_head;
struct lock_class *class;
+ int idx;
DEBUG_LOCKS_WARN_ON(!irqs_disabled());
@@ -751,16 +1295,10 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
if (likely(class))
goto out_set_class_cache;
- /*
- * Debug-check: all keys must be persistent!
- */
- if (!static_obj(lock->key)) {
- debug_locks_off();
- printk("INFO: trying to register non-static key.\n");
- printk("the code is fine but needs lockdep annotation.\n");
- printk("turning off the locking correctness validator.\n");
- dump_stack();
-
+ if (!lock->key) {
+ if (!assign_lock_key(lock))
+ return NULL;
+ } else if (!static_obj(lock->key) && !is_dynamic_key(lock->key)) {
return NULL;
}
@@ -774,51 +1312,63 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
* We have to do the hash-walk again, to avoid races
* with another CPU:
*/
- list_for_each_entry_rcu(class, hash_head, hash_entry) {
+ hlist_for_each_entry_rcu(class, hash_head, hash_entry) {
if (class->key == key)
goto out_unlock_set;
}
- /*
- * Allocate a new key from the static array, and add it to
- * the hash:
- */
- if (nr_lock_classes >= MAX_LOCKDEP_KEYS) {
+ init_data_structures_once();
+
+ /* Allocate a new lock class and add it to the hash. */
+ class = list_first_entry_or_null(&free_lock_classes, typeof(*class),
+ lock_entry);
+ if (!class) {
if (!debug_locks_off_graph_unlock()) {
return NULL;
}
+ nbcon_cpu_emergency_enter();
print_lockdep_off("BUG: MAX_LOCKDEP_KEYS too low!");
dump_stack();
+ nbcon_cpu_emergency_exit();
return NULL;
}
- class = lock_classes + nr_lock_classes++;
+ nr_lock_classes++;
+ __set_bit(class - lock_classes, lock_classes_in_use);
debug_atomic_inc(nr_unused_locks);
class->key = key;
class->name = lock->name;
class->subclass = subclass;
- INIT_LIST_HEAD(&class->lock_entry);
- INIT_LIST_HEAD(&class->locks_before);
- INIT_LIST_HEAD(&class->locks_after);
+ WARN_ON_ONCE(!list_empty(&class->locks_before));
+ WARN_ON_ONCE(!list_empty(&class->locks_after));
class->name_version = count_matching_names(class);
+ class->wait_type_inner = lock->wait_type_inner;
+ class->wait_type_outer = lock->wait_type_outer;
+ class->lock_type = lock->lock_type;
/*
* We use RCU's safe list-add method to make
* parallel walking of the hash-list safe:
*/
- list_add_tail_rcu(&class->hash_entry, hash_head);
+ hlist_add_head_rcu(&class->hash_entry, hash_head);
/*
- * Add it to the global list of classes:
+ * Remove the class from the free list and add it to the global list
+ * of classes.
*/
- list_add_tail_rcu(&class->lock_entry, &all_lock_classes);
+ list_move_tail(&class->lock_entry, &all_lock_classes);
+ idx = class - lock_classes;
+ if (idx > max_lock_class_idx)
+ max_lock_class_idx = idx;
if (verbose(class)) {
graph_unlock();
- printk("\nnew class %p: %s", class->key, class->name);
+ nbcon_cpu_emergency_enter();
+ printk("\nnew class %px: %s", class->key, class->name);
if (class->name_version > 1)
- printk("#%d", class->name_version);
- printk("\n");
+ printk(KERN_CONT "#%d", class->name_version);
+ printk(KERN_CONT "\n");
dump_stack();
+ nbcon_cpu_emergency_exit();
if (!graph_lock()) {
return NULL;
@@ -850,23 +1400,31 @@ out_set_class_cache:
*/
static struct lock_list *alloc_list_entry(void)
{
- if (nr_list_entries >= MAX_LOCKDEP_ENTRIES) {
+ int idx = find_first_zero_bit(list_entries_in_use,
+ ARRAY_SIZE(list_entries));
+
+ if (idx >= ARRAY_SIZE(list_entries)) {
if (!debug_locks_off_graph_unlock())
return NULL;
+ nbcon_cpu_emergency_enter();
print_lockdep_off("BUG: MAX_LOCKDEP_ENTRIES too low!");
dump_stack();
+ nbcon_cpu_emergency_exit();
return NULL;
}
- return list_entries + nr_list_entries++;
+ nr_list_entries++;
+ __set_bit(idx, list_entries_in_use);
+ return list_entries + idx;
}
/*
* Add a new dependency to the head of the list:
*/
-static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
- struct list_head *head, unsigned long ip,
- int distance, struct stack_trace *trace)
+static int add_lock_to_list(struct lock_class *this,
+ struct lock_class *links_to, struct list_head *head,
+ u16 distance, u8 dep,
+ const struct lock_trace *trace)
{
struct lock_list *entry;
/*
@@ -878,8 +1436,10 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
return 0;
entry->class = this;
+ entry->links_to = links_to;
+ entry->dep = dep;
entry->distance = distance;
- entry->trace = *trace;
+ entry->trace = trace;
/*
* Both allocation and removal are done under the graph lock; but
* iteration is under RCU-sched; see look_up_lock_class() and
@@ -893,17 +1453,21 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
/*
* For good efficiency of modular, we use power of 2
*/
-#define MAX_CIRCULAR_QUEUE_SIZE 4096UL
+#define MAX_CIRCULAR_QUEUE_SIZE (1UL << CONFIG_LOCKDEP_CIRCULAR_QUEUE_BITS)
#define CQ_MASK (MAX_CIRCULAR_QUEUE_SIZE-1)
/*
- * The circular_queue and helpers is used to implement the
- * breadth-first search(BFS)algorithem, by which we can build
- * the shortest path from the next lock to be acquired to the
- * previous held lock if there is a circular between them.
+ * The circular_queue and helpers are used to implement graph
+ * breadth-first search (BFS) algorithm, by which we can determine
+ * whether there is a path from a lock to another. In deadlock checks,
+ * a path from the next lock to be acquired to a previous held lock
+ * indicates that adding the <prev> -> <next> lock dependency will
+ * produce a circle in the graph. Breadth-first search instead of
+ * depth-first search is used in order to find the shortest (circular)
+ * path.
*/
struct circular_queue {
- unsigned long element[MAX_CIRCULAR_QUEUE_SIZE];
+ struct lock_list *element[MAX_CIRCULAR_QUEUE_SIZE];
unsigned int front, rear;
};
@@ -929,7 +1493,7 @@ static inline int __cq_full(struct circular_queue *cq)
return ((cq->rear + 1) & CQ_MASK) == cq->front;
}
-static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem)
+static inline int __cq_enqueue(struct circular_queue *cq, struct lock_list *elem)
{
if (__cq_full(cq))
return -1;
@@ -939,14 +1503,21 @@ static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem)
return 0;
}
-static inline int __cq_dequeue(struct circular_queue *cq, unsigned long *elem)
+/*
+ * Dequeue an element from the circular_queue, return a lock_list if
+ * the queue is not empty, or NULL if otherwise.
+ */
+static inline struct lock_list * __cq_dequeue(struct circular_queue *cq)
{
+ struct lock_list * lock;
+
if (__cq_empty(cq))
- return -1;
+ return NULL;
- *elem = cq->element[cq->front];
+ lock = cq->element[cq->front];
cq->front = (cq->front + 1) & CQ_MASK;
- return 0;
+
+ return lock;
}
static inline unsigned int __cq_get_elem_count(struct circular_queue *cq)
@@ -954,23 +1525,19 @@ static inline unsigned int __cq_get_elem_count(struct circular_queue *cq)
return (cq->rear - cq->front) & CQ_MASK;
}
-static inline void mark_lock_accessed(struct lock_list *lock,
- struct lock_list *parent)
+static inline void mark_lock_accessed(struct lock_list *lock)
{
- unsigned long nr;
+ lock->class->dep_gen_id = lockdep_dependency_gen_id;
+}
- nr = lock - list_entries;
- WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */
+static inline void visit_lock_entry(struct lock_list *lock,
+ struct lock_list *parent)
+{
lock->parent = parent;
- lock->class->dep_gen_id = lockdep_dependency_gen_id;
}
static inline unsigned long lock_accessed(struct lock_list *lock)
{
- unsigned long nr;
-
- nr = lock - list_entries;
- WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */
return lock->class->dep_gen_id == lockdep_dependency_gen_id;
}
@@ -991,114 +1558,329 @@ static inline int get_lock_depth(struct lock_list *child)
return depth;
}
-static int __bfs(struct lock_list *source_entry,
- void *data,
- int (*match)(struct lock_list *entry, void *data),
- struct lock_list **target_entry,
- int forward)
+/*
+ * Return the forward or backward dependency list.
+ *
+ * @lock: the lock_list to get its class's dependency list
+ * @offset: the offset to struct lock_class to determine whether it is
+ * locks_after or locks_before
+ */
+static inline struct list_head *get_dep_list(struct lock_list *lock, int offset)
+{
+ void *lock_class = lock->class;
+
+ return lock_class + offset;
+}
+/*
+ * Return values of a bfs search:
+ *
+ * BFS_E* indicates an error
+ * BFS_R* indicates a result (match or not)
+ *
+ * BFS_EINVALIDNODE: Find a invalid node in the graph.
+ *
+ * BFS_EQUEUEFULL: The queue is full while doing the bfs.
+ *
+ * BFS_RMATCH: Find the matched node in the graph, and put that node into
+ * *@target_entry.
+ *
+ * BFS_RNOMATCH: Haven't found the matched node and keep *@target_entry
+ * _unchanged_.
+ */
+enum bfs_result {
+ BFS_EINVALIDNODE = -2,
+ BFS_EQUEUEFULL = -1,
+ BFS_RMATCH = 0,
+ BFS_RNOMATCH = 1,
+};
+
+/*
+ * bfs_result < 0 means error
+ */
+static inline bool bfs_error(enum bfs_result res)
+{
+ return res < 0;
+}
+
+/*
+ * DEP_*_BIT in lock_list::dep
+ *
+ * For dependency @prev -> @next:
+ *
+ * SR: @prev is shared reader (->read != 0) and @next is recursive reader
+ * (->read == 2)
+ * ER: @prev is exclusive locker (->read == 0) and @next is recursive reader
+ * SN: @prev is shared reader and @next is non-recursive locker (->read != 2)
+ * EN: @prev is exclusive locker and @next is non-recursive locker
+ *
+ * Note that we define the value of DEP_*_BITs so that:
+ * bit0 is prev->read == 0
+ * bit1 is next->read != 2
+ */
+#define DEP_SR_BIT (0 + (0 << 1)) /* 0 */
+#define DEP_ER_BIT (1 + (0 << 1)) /* 1 */
+#define DEP_SN_BIT (0 + (1 << 1)) /* 2 */
+#define DEP_EN_BIT (1 + (1 << 1)) /* 3 */
+
+#define DEP_SR_MASK (1U << (DEP_SR_BIT))
+#define DEP_ER_MASK (1U << (DEP_ER_BIT))
+#define DEP_SN_MASK (1U << (DEP_SN_BIT))
+#define DEP_EN_MASK (1U << (DEP_EN_BIT))
+
+static inline unsigned int
+__calc_dep_bit(struct held_lock *prev, struct held_lock *next)
+{
+ return (prev->read == 0) + ((next->read != 2) << 1);
+}
+
+static inline u8 calc_dep(struct held_lock *prev, struct held_lock *next)
+{
+ return 1U << __calc_dep_bit(prev, next);
+}
+
+/*
+ * calculate the dep_bit for backwards edges. We care about whether @prev is
+ * shared and whether @next is recursive.
+ */
+static inline unsigned int
+__calc_dep_bitb(struct held_lock *prev, struct held_lock *next)
+{
+ return (next->read != 2) + ((prev->read == 0) << 1);
+}
+
+static inline u8 calc_depb(struct held_lock *prev, struct held_lock *next)
+{
+ return 1U << __calc_dep_bitb(prev, next);
+}
+
+/*
+ * Initialize a lock_list entry @lock belonging to @class as the root for a BFS
+ * search.
+ */
+static inline void __bfs_init_root(struct lock_list *lock,
+ struct lock_class *class)
+{
+ lock->class = class;
+ lock->parent = NULL;
+ lock->only_xr = 0;
+}
+
+/*
+ * Initialize a lock_list entry @lock based on a lock acquisition @hlock as the
+ * root for a BFS search.
+ *
+ * ->only_xr of the initial lock node is set to @hlock->read == 2, to make sure
+ * that <prev> -> @hlock and @hlock -> <whatever __bfs() found> is not -(*R)->
+ * and -(S*)->.
+ */
+static inline void bfs_init_root(struct lock_list *lock,
+ struct held_lock *hlock)
+{
+ __bfs_init_root(lock, hlock_class(hlock));
+ lock->only_xr = (hlock->read == 2);
+}
+
+/*
+ * Similar to bfs_init_root() but initialize the root for backwards BFS.
+ *
+ * ->only_xr of the initial lock node is set to @hlock->read != 0, to make sure
+ * that <next> -> @hlock and @hlock -> <whatever backwards BFS found> is not
+ * -(*S)-> and -(R*)-> (reverse order of -(*R)-> and -(S*)->).
+ */
+static inline void bfs_init_rootb(struct lock_list *lock,
+ struct held_lock *hlock)
+{
+ __bfs_init_root(lock, hlock_class(hlock));
+ lock->only_xr = (hlock->read != 0);
+}
+
+static inline struct lock_list *__bfs_next(struct lock_list *lock, int offset)
+{
+ if (!lock || !lock->parent)
+ return NULL;
+
+ return list_next_or_null_rcu(get_dep_list(lock->parent, offset),
+ &lock->entry, struct lock_list, entry);
+}
+
+/*
+ * Breadth-First Search to find a strong path in the dependency graph.
+ *
+ * @source_entry: the source of the path we are searching for.
+ * @data: data used for the second parameter of @match function
+ * @match: match function for the search
+ * @target_entry: pointer to the target of a matched path
+ * @offset: the offset to struct lock_class to determine whether it is
+ * locks_after or locks_before
+ *
+ * We may have multiple edges (considering different kinds of dependencies,
+ * e.g. ER and SN) between two nodes in the dependency graph. But
+ * only the strong dependency path in the graph is relevant to deadlocks. A
+ * strong dependency path is a dependency path that doesn't have two adjacent
+ * dependencies as -(*R)-> -(S*)->, please see:
+ *
+ * Documentation/locking/lockdep-design.rst
+ *
+ * for more explanation of the definition of strong dependency paths
+ *
+ * In __bfs(), we only traverse in the strong dependency path:
+ *
+ * In lock_list::only_xr, we record whether the previous dependency only
+ * has -(*R)-> in the search, and if it does (prev only has -(*R)->), we
+ * filter out any -(S*)-> in the current dependency and after that, the
+ * ->only_xr is set according to whether we only have -(*R)-> left.
+ */
+static enum bfs_result __bfs(struct lock_list *source_entry,
+ void *data,
+ bool (*match)(struct lock_list *entry, void *data),
+ bool (*skip)(struct lock_list *entry, void *data),
+ struct lock_list **target_entry,
+ int offset)
{
+ struct circular_queue *cq = &lock_cq;
+ struct lock_list *lock = NULL;
struct lock_list *entry;
struct list_head *head;
- struct circular_queue *cq = &lock_cq;
- int ret = 1;
+ unsigned int cq_depth;
+ bool first;
- if (match(source_entry, data)) {
- *target_entry = source_entry;
- ret = 0;
- goto exit;
- }
+ lockdep_assert_locked();
- if (forward)
- head = &source_entry->class->locks_after;
- else
- head = &source_entry->class->locks_before;
+ __cq_init(cq);
+ __cq_enqueue(cq, source_entry);
- if (list_empty(head))
- goto exit;
+ while ((lock = __bfs_next(lock, offset)) || (lock = __cq_dequeue(cq))) {
+ if (!lock->class)
+ return BFS_EINVALIDNODE;
- __cq_init(cq);
- __cq_enqueue(cq, (unsigned long)source_entry);
+ /*
+ * Step 1: check whether we already finish on this one.
+ *
+ * If we have visited all the dependencies from this @lock to
+ * others (iow, if we have visited all lock_list entries in
+ * @lock->class->locks_{after,before}) we skip, otherwise go
+ * and visit all the dependencies in the list and mark this
+ * list accessed.
+ */
+ if (lock_accessed(lock))
+ continue;
+ else
+ mark_lock_accessed(lock);
+
+ /*
+ * Step 2: check whether prev dependency and this form a strong
+ * dependency path.
+ */
+ if (lock->parent) { /* Parent exists, check prev dependency */
+ u8 dep = lock->dep;
+ bool prev_only_xr = lock->parent->only_xr;
- while (!__cq_empty(cq)) {
- struct lock_list *lock;
+ /*
+ * Mask out all -(S*)-> if we only have *R in previous
+ * step, because -(*R)-> -(S*)-> don't make up a strong
+ * dependency.
+ */
+ if (prev_only_xr)
+ dep &= ~(DEP_SR_MASK | DEP_SN_MASK);
- __cq_dequeue(cq, (unsigned long *)&lock);
+ /* If nothing left, we skip */
+ if (!dep)
+ continue;
- if (!lock->class) {
- ret = -2;
- goto exit;
+ /* If there are only -(*R)-> left, set that for the next step */
+ lock->only_xr = !(dep & (DEP_SN_MASK | DEP_EN_MASK));
}
- if (forward)
- head = &lock->class->locks_after;
- else
- head = &lock->class->locks_before;
+ /*
+ * Step 3: we haven't visited this and there is a strong
+ * dependency path to this, so check with @match.
+ * If @skip is provide and returns true, we skip this
+ * lock (and any path this lock is in).
+ */
+ if (skip && skip(lock, data))
+ continue;
- DEBUG_LOCKS_WARN_ON(!irqs_disabled());
+ if (match(lock, data)) {
+ *target_entry = lock;
+ return BFS_RMATCH;
+ }
+ /*
+ * Step 4: if not match, expand the path by adding the
+ * forward or backwards dependencies in the search
+ *
+ */
+ first = true;
+ head = get_dep_list(lock, offset);
list_for_each_entry_rcu(entry, head, entry) {
- if (!lock_accessed(entry)) {
- unsigned int cq_depth;
- mark_lock_accessed(entry, lock);
- if (match(entry, data)) {
- *target_entry = entry;
- ret = 0;
- goto exit;
- }
+ visit_lock_entry(entry, lock);
- if (__cq_enqueue(cq, (unsigned long)entry)) {
- ret = -1;
- goto exit;
- }
- cq_depth = __cq_get_elem_count(cq);
- if (max_bfs_queue_depth < cq_depth)
- max_bfs_queue_depth = cq_depth;
- }
+ /*
+ * Note we only enqueue the first of the list into the
+ * queue, because we can always find a sibling
+ * dependency from one (see __bfs_next()), as a result
+ * the space of queue is saved.
+ */
+ if (!first)
+ continue;
+
+ first = false;
+
+ if (__cq_enqueue(cq, entry))
+ return BFS_EQUEUEFULL;
+
+ cq_depth = __cq_get_elem_count(cq);
+ if (max_bfs_queue_depth < cq_depth)
+ max_bfs_queue_depth = cq_depth;
}
}
-exit:
- return ret;
+
+ return BFS_RNOMATCH;
}
-static inline int __bfs_forwards(struct lock_list *src_entry,
- void *data,
- int (*match)(struct lock_list *entry, void *data),
- struct lock_list **target_entry)
+static inline enum bfs_result
+__bfs_forwards(struct lock_list *src_entry,
+ void *data,
+ bool (*match)(struct lock_list *entry, void *data),
+ bool (*skip)(struct lock_list *entry, void *data),
+ struct lock_list **target_entry)
{
- return __bfs(src_entry, data, match, target_entry, 1);
+ return __bfs(src_entry, data, match, skip, target_entry,
+ offsetof(struct lock_class, locks_after));
}
-static inline int __bfs_backwards(struct lock_list *src_entry,
- void *data,
- int (*match)(struct lock_list *entry, void *data),
- struct lock_list **target_entry)
+static inline enum bfs_result
+__bfs_backwards(struct lock_list *src_entry,
+ void *data,
+ bool (*match)(struct lock_list *entry, void *data),
+ bool (*skip)(struct lock_list *entry, void *data),
+ struct lock_list **target_entry)
{
- return __bfs(src_entry, data, match, target_entry, 0);
+ return __bfs(src_entry, data, match, skip, target_entry,
+ offsetof(struct lock_class, locks_before));
}
-/*
- * Recursive, forwards-direction lock-dependency checking, used for
- * both noncyclic checking and for hardirq-unsafe/softirq-unsafe
- * checking.
- */
+static void print_lock_trace(const struct lock_trace *trace,
+ unsigned int spaces)
+{
+ stack_trace_print(trace->entries, trace->nr_entries, spaces);
+}
/*
* Print a dependency chain entry (this is only done when a deadlock
* has been detected):
*/
-static noinline int
+static noinline void
print_circular_bug_entry(struct lock_list *target, int depth)
{
if (debug_locks_silent)
- return 0;
+ return;
printk("\n-> #%u", depth);
- print_lock_name(target->class);
- printk(":\n");
- print_stack_trace(&target->trace, 6);
-
- return 0;
+ print_lock_name(NULL, target->class);
+ printk(KERN_CONT ":\n");
+ print_lock_trace(target->trace, 6);
}
static void
@@ -1109,6 +1891,8 @@ print_circular_lock_scenario(struct held_lock *src,
struct lock_class *source = hlock_class(src);
struct lock_class *target = hlock_class(tgt);
struct lock_class *parent = prt->class;
+ int src_read = src->read;
+ int tgt_read = tgt->read;
/*
* A direct locking problem where unsafe_class lock is taken
@@ -1125,29 +1909,37 @@ print_circular_lock_scenario(struct held_lock *src,
*/
if (parent != source) {
printk("Chain exists of:\n ");
- __print_lock_name(source);
- printk(" --> ");
- __print_lock_name(parent);
- printk(" --> ");
- __print_lock_name(target);
- printk("\n\n");
+ __print_lock_name(src, source);
+ printk(KERN_CONT " --> ");
+ __print_lock_name(NULL, parent);
+ printk(KERN_CONT " --> ");
+ __print_lock_name(tgt, target);
+ printk(KERN_CONT "\n\n");
}
printk(" Possible unsafe locking scenario:\n\n");
printk(" CPU0 CPU1\n");
printk(" ---- ----\n");
- printk(" lock(");
- __print_lock_name(target);
- printk(");\n");
+ if (tgt_read != 0)
+ printk(" rlock(");
+ else
+ printk(" lock(");
+ __print_lock_name(tgt, target);
+ printk(KERN_CONT ");\n");
printk(" lock(");
- __print_lock_name(parent);
- printk(");\n");
+ __print_lock_name(NULL, parent);
+ printk(KERN_CONT ");\n");
printk(" lock(");
- __print_lock_name(target);
- printk(");\n");
- printk(" lock(");
- __print_lock_name(source);
- printk(");\n");
+ __print_lock_name(tgt, target);
+ printk(KERN_CONT ");\n");
+ if (src_read != 0)
+ printk(" rlock(");
+ else if (src->sync)
+ printk(" sync(");
+ else
+ printk(" lock(");
+ __print_lock_name(src, source);
+ printk(KERN_CONT ");\n");
printk("\n *** DEADLOCK ***\n\n");
}
@@ -1155,7 +1947,7 @@ print_circular_lock_scenario(struct held_lock *src,
* When a circular dependency is detected, print the
* header first:
*/
-static noinline int
+static noinline void
print_circular_bug_header(struct lock_list *entry, unsigned int depth,
struct held_lock *check_src,
struct held_lock *check_tgt)
@@ -1163,32 +1955,54 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
struct task_struct *curr = current;
if (debug_locks_silent)
- return 0;
+ return;
- printk("\n");
- printk("======================================================\n");
- printk("[ INFO: possible circular locking dependency detected ]\n");
+ pr_warn("\n");
+ pr_warn("======================================================\n");
+ pr_warn("WARNING: possible circular locking dependency detected\n");
print_kernel_ident();
- printk("-------------------------------------------------------\n");
- printk("%s/%d is trying to acquire lock:\n",
+ pr_warn("------------------------------------------------------\n");
+ pr_warn("%s/%d is trying to acquire lock:\n",
curr->comm, task_pid_nr(curr));
print_lock(check_src);
- printk("\nbut task is already holding lock:\n");
+
+ pr_warn("\nbut task is already holding lock:\n");
+
print_lock(check_tgt);
- printk("\nwhich lock already depends on the new lock.\n\n");
- printk("\nthe existing dependency chain (in reverse order) is:\n");
+ pr_warn("\nwhich lock already depends on the new lock.\n\n");
+ pr_warn("\nthe existing dependency chain (in reverse order) is:\n");
print_circular_bug_entry(entry, depth);
-
- return 0;
}
-static inline int class_equal(struct lock_list *entry, void *data)
+/*
+ * We are about to add B -> A into the dependency graph, and in __bfs() a
+ * strong dependency path A -> .. -> B is found: hlock_class equals
+ * entry->class.
+ *
+ * We will have a deadlock case (conflict) if A -> .. -> B -> A is a strong
+ * dependency cycle, that means:
+ *
+ * Either
+ *
+ * a) B -> A is -(E*)->
+ *
+ * or
+ *
+ * b) A -> .. -> B is -(*N)-> (i.e. A -> .. -(*N)-> B)
+ *
+ * as then we don't have -(*R)-> -(S*)-> in the cycle.
+ */
+static inline bool hlock_conflict(struct lock_list *entry, void *data)
{
- return entry->class == data;
+ struct held_lock *hlock = (struct held_lock *)data;
+
+ return hlock_class(hlock) == entry->class && /* Found A -> .. -> B */
+ (hlock->read == 0 || /* B -> A is -(E*)-> */
+ !entry->only_xr); /* A -> .. -> B is -(*N)-> */
}
-static noinline int print_circular_bug(struct lock_list *this,
+static noinline void print_circular_bug(struct lock_list *this,
struct lock_list *target,
struct held_lock *check_src,
struct held_lock *check_tgt)
@@ -1199,13 +2013,16 @@ static noinline int print_circular_bug(struct lock_list *this,
int depth;
if (!debug_locks_off_graph_unlock() || debug_locks_silent)
- return 0;
+ return;
- if (!save_trace(&this->trace))
- return 0;
+ this->trace = save_trace();
+ if (!this->trace)
+ return;
depth = get_lock_depth(target);
+ nbcon_cpu_emergency_enter();
+
print_circular_bug_header(target, depth, check_src, check_tgt);
parent = get_lock_parent(target);
@@ -1225,34 +2042,35 @@ static noinline int print_circular_bug(struct lock_list *this,
printk("\nstack backtrace:\n");
dump_stack();
- return 0;
+ nbcon_cpu_emergency_exit();
}
-static noinline int print_bfs_bug(int ret)
+static noinline void print_bfs_bug(int ret)
{
if (!debug_locks_off_graph_unlock())
- return 0;
+ return;
/*
* Breadth-first-search failed, graph got corrupted?
*/
- WARN(1, "lockdep bfs error:%d\n", ret);
+ if (ret == BFS_EQUEUEFULL)
+ pr_warn("Increase LOCKDEP_CIRCULAR_QUEUE_BITS to avoid this warning:\n");
- return 0;
+ WARN(1, "lockdep bfs error:%d\n", ret);
}
-static int noop_count(struct lock_list *entry, void *data)
+static bool noop_count(struct lock_list *entry, void *data)
{
(*(unsigned long *)data)++;
- return 0;
+ return false;
}
static unsigned long __lockdep_count_forward_deps(struct lock_list *this)
{
unsigned long count = 0;
- struct lock_list *uninitialized_var(target_entry);
+ struct lock_list *target_entry;
- __bfs_forwards(this, (void *)&count, noop_count, &target_entry);
+ __bfs_forwards(this, (void *)&count, noop_count, NULL, &target_entry);
return count;
}
@@ -1261,14 +2079,13 @@ unsigned long lockdep_count_forward_deps(struct lock_class *class)
unsigned long ret, flags;
struct lock_list this;
- this.parent = NULL;
- this.class = class;
+ __bfs_init_root(&this, class);
- local_irq_save(flags);
- arch_spin_lock(&lockdep_lock);
+ raw_local_irq_save(flags);
+ lockdep_lock();
ret = __lockdep_count_forward_deps(&this);
- arch_spin_unlock(&lockdep_lock);
- local_irq_restore(flags);
+ lockdep_unlock();
+ raw_local_irq_restore(flags);
return ret;
}
@@ -1276,9 +2093,9 @@ unsigned long lockdep_count_forward_deps(struct lock_class *class)
static unsigned long __lockdep_count_backward_deps(struct lock_list *this)
{
unsigned long count = 0;
- struct lock_list *uninitialized_var(target_entry);
+ struct lock_list *target_entry;
- __bfs_backwards(this, (void *)&count, noop_count, &target_entry);
+ __bfs_backwards(this, (void *)&count, noop_count, NULL, &target_entry);
return count;
}
@@ -1288,68 +2105,208 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class)
unsigned long ret, flags;
struct lock_list this;
- this.parent = NULL;
- this.class = class;
+ __bfs_init_root(&this, class);
- local_irq_save(flags);
- arch_spin_lock(&lockdep_lock);
+ raw_local_irq_save(flags);
+ lockdep_lock();
ret = __lockdep_count_backward_deps(&this);
- arch_spin_unlock(&lockdep_lock);
- local_irq_restore(flags);
+ lockdep_unlock();
+ raw_local_irq_restore(flags);
return ret;
}
/*
- * Prove that the dependency graph starting at <entry> can not
- * lead to <target>. Print an error and return 0 if it does.
+ * Check that the dependency graph starting at <src> can lead to
+ * <target> or not.
*/
-static noinline int
-check_noncircular(struct lock_list *root, struct lock_class *target,
- struct lock_list **target_entry)
+static noinline enum bfs_result
+check_path(struct held_lock *target, struct lock_list *src_entry,
+ bool (*match)(struct lock_list *entry, void *data),
+ bool (*skip)(struct lock_list *entry, void *data),
+ struct lock_list **target_entry)
{
- int result;
+ enum bfs_result ret;
+
+ ret = __bfs_forwards(src_entry, target, match, skip, target_entry);
+
+ if (unlikely(bfs_error(ret)))
+ print_bfs_bug(ret);
+
+ return ret;
+}
+
+static void print_deadlock_bug(struct task_struct *, struct held_lock *, struct held_lock *);
+
+/*
+ * Prove that the dependency graph starting at <src> can not
+ * lead to <target>. If it can, there is a circle when adding
+ * <target> -> <src> dependency.
+ *
+ * Print an error and return BFS_RMATCH if it does.
+ */
+static noinline enum bfs_result
+check_noncircular(struct held_lock *src, struct held_lock *target,
+ struct lock_trace **const trace)
+{
+ enum bfs_result ret;
+ struct lock_list *target_entry;
+ struct lock_list src_entry;
+
+ bfs_init_root(&src_entry, src);
debug_atomic_inc(nr_cyclic_checks);
- result = __bfs_forwards(root, target, class_equal, target_entry);
+ ret = check_path(target, &src_entry, hlock_conflict, NULL, &target_entry);
- return result;
+ if (unlikely(ret == BFS_RMATCH)) {
+ if (!*trace) {
+ /*
+ * If save_trace fails here, the printing might
+ * trigger a WARN but because of the !nr_entries it
+ * should not do bad things.
+ */
+ *trace = save_trace();
+ }
+
+ if (src->class_idx == target->class_idx)
+ print_deadlock_bug(current, src, target);
+ else
+ print_circular_bug(&src_entry, target_entry, src, target);
+ }
+
+ return ret;
}
-#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
+#ifdef CONFIG_TRACE_IRQFLAGS
+
/*
* Forwards and backwards subgraph searching, for the purposes of
* proving that two subgraphs can be connected by a new dependency
* without creating any illegal irq-safe -> irq-unsafe lock dependency.
+ *
+ * A irq safe->unsafe deadlock happens with the following conditions:
+ *
+ * 1) We have a strong dependency path A -> ... -> B
+ *
+ * 2) and we have ENABLED_IRQ usage of B and USED_IN_IRQ usage of A, therefore
+ * irq can create a new dependency B -> A (consider the case that a holder
+ * of B gets interrupted by an irq whose handler will try to acquire A).
+ *
+ * 3) the dependency circle A -> ... -> B -> A we get from 1) and 2) is a
+ * strong circle:
+ *
+ * For the usage bits of B:
+ * a) if A -> B is -(*N)->, then B -> A could be any type, so any
+ * ENABLED_IRQ usage suffices.
+ * b) if A -> B is -(*R)->, then B -> A must be -(E*)->, so only
+ * ENABLED_IRQ_*_READ usage suffices.
+ *
+ * For the usage bits of A:
+ * c) if A -> B is -(E*)->, then B -> A could be any type, so any
+ * USED_IN_IRQ usage suffices.
+ * d) if A -> B is -(S*)->, then B -> A must be -(*N)->, so only
+ * USED_IN_IRQ_*_READ usage suffices.
+ */
+
+/*
+ * There is a strong dependency path in the dependency graph: A -> B, and now
+ * we need to decide which usage bit of A should be accumulated to detect
+ * safe->unsafe bugs.
+ *
+ * Note that usage_accumulate() is used in backwards search, so ->only_xr
+ * stands for whether A -> B only has -(S*)-> (in this case ->only_xr is true).
+ *
+ * As above, if only_xr is false, which means A -> B has -(E*)-> dependency
+ * path, any usage of A should be considered. Otherwise, we should only
+ * consider _READ usage.
*/
+static inline bool usage_accumulate(struct lock_list *entry, void *mask)
+{
+ if (!entry->only_xr)
+ *(unsigned long *)mask |= entry->class->usage_mask;
+ else /* Mask out _READ usage bits */
+ *(unsigned long *)mask |= (entry->class->usage_mask & LOCKF_IRQ);
+
+ return false;
+}
-static inline int usage_match(struct lock_list *entry, void *bit)
+/*
+ * There is a strong dependency path in the dependency graph: A -> B, and now
+ * we need to decide which usage bit of B conflicts with the usage bits of A,
+ * i.e. which usage bit of B may introduce safe->unsafe deadlocks.
+ *
+ * As above, if only_xr is false, which means A -> B has -(*N)-> dependency
+ * path, any usage of B should be considered. Otherwise, we should only
+ * consider _READ usage.
+ */
+static inline bool usage_match(struct lock_list *entry, void *mask)
{
- return entry->class->usage_mask & (1 << (enum lock_usage_bit)bit);
+ if (!entry->only_xr)
+ return !!(entry->class->usage_mask & *(unsigned long *)mask);
+ else /* Mask out _READ usage bits */
+ return !!((entry->class->usage_mask & LOCKF_IRQ) & *(unsigned long *)mask);
}
+static inline bool usage_skip(struct lock_list *entry, void *mask)
+{
+ if (entry->class->lock_type == LD_LOCK_NORMAL)
+ return false;
+
+ /*
+ * Skip local_lock() for irq inversion detection.
+ *
+ * For !RT, local_lock() is not a real lock, so it won't carry any
+ * dependency.
+ *
+ * For RT, an irq inversion happens when we have lock A and B, and on
+ * some CPU we can have:
+ *
+ * lock(A);
+ * <interrupted>
+ * lock(B);
+ *
+ * where lock(B) cannot sleep, and we have a dependency B -> ... -> A.
+ *
+ * Now we prove local_lock() cannot exist in that dependency. First we
+ * have the observation for any lock chain L1 -> ... -> Ln, for any
+ * 1 <= i <= n, Li.inner_wait_type <= L1.inner_wait_type, otherwise
+ * wait context check will complain. And since B is not a sleep lock,
+ * therefore B.inner_wait_type >= 2, and since the inner_wait_type of
+ * local_lock() is 3, which is greater than 2, therefore there is no
+ * way the local_lock() exists in the dependency B -> ... -> A.
+ *
+ * As a result, we will skip local_lock(), when we search for irq
+ * inversion bugs.
+ */
+ if (entry->class->lock_type == LD_LOCK_PERCPU &&
+ DEBUG_LOCKS_WARN_ON(entry->class->wait_type_inner < LD_WAIT_CONFIG))
+ return false;
+
+ /*
+ * Skip WAIT_OVERRIDE for irq inversion detection -- it's not actually
+ * a lock and only used to override the wait_type.
+ */
+ return true;
+}
/*
* Find a node in the forwards-direction dependency sub-graph starting
* at @root->class that matches @bit.
*
- * Return 0 if such a node exists in the subgraph, and put that node
+ * Return BFS_MATCH if such a node exists in the subgraph, and put that node
* into *@target_entry.
- *
- * Return 1 otherwise and keep *@target_entry unchanged.
- * Return <0 on error.
*/
-static int
-find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit,
+static enum bfs_result
+find_usage_forwards(struct lock_list *root, unsigned long usage_mask,
struct lock_list **target_entry)
{
- int result;
+ enum bfs_result result;
debug_atomic_inc(nr_find_usage_forwards_checks);
- result = __bfs_forwards(root, (void *)bit, usage_match, target_entry);
+ result = __bfs_forwards(root, &usage_mask, usage_match, usage_skip, target_entry);
return result;
}
@@ -1357,22 +2314,16 @@ find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit,
/*
* Find a node in the backwards-direction dependency sub-graph starting
* at @root->class that matches @bit.
- *
- * Return 0 if such a node exists in the subgraph, and put that node
- * into *@target_entry.
- *
- * Return 1 otherwise and keep *@target_entry unchanged.
- * Return <0 on error.
*/
-static int
-find_usage_backwards(struct lock_list *root, enum lock_usage_bit bit,
+static enum bfs_result
+find_usage_backwards(struct lock_list *root, unsigned long usage_mask,
struct lock_list **target_entry)
{
- int result;
+ enum bfs_result result;
debug_atomic_inc(nr_find_usage_backwards_checks);
- result = __bfs_backwards(root, (void *)bit, usage_match, target_entry);
+ result = __bfs_backwards(root, &usage_mask, usage_match, usage_skip, target_entry);
return result;
}
@@ -1382,31 +2333,82 @@ static void print_lock_class_header(struct lock_class *class, int depth)
int bit;
printk("%*s->", depth, "");
- print_lock_name(class);
- printk(" ops: %lu", class->ops);
- printk(" {\n");
+ print_lock_name(NULL, class);
+#ifdef CONFIG_DEBUG_LOCKDEP
+ printk(KERN_CONT " ops: %lu", debug_class_ops_read(class));
+#endif
+ printk(KERN_CONT " {\n");
- for (bit = 0; bit < LOCK_USAGE_STATES; bit++) {
+ for (bit = 0; bit < LOCK_TRACE_STATES; bit++) {
if (class->usage_mask & (1 << bit)) {
int len = depth;
len += printk("%*s %s", depth, "", usage_str[bit]);
- len += printk(" at:\n");
- print_stack_trace(class->usage_traces + bit, len);
+ len += printk(KERN_CONT " at:\n");
+ print_lock_trace(class->usage_traces[bit], len);
}
}
printk("%*s }\n", depth, "");
- printk("%*s ... key at: ",depth,"");
- print_ip_sym((unsigned long)class->key);
+ printk("%*s ... key at: [<%px>] %pS\n",
+ depth, "", class->key, class->key);
}
/*
- * printk the shortest lock dependencies from @start to @end in reverse order:
+ * Dependency path printing:
+ *
+ * After BFS we get a lock dependency path (linked via ->parent of lock_list),
+ * printing out each lock in the dependency path will help on understanding how
+ * the deadlock could happen. Here are some details about dependency path
+ * printing:
+ *
+ * 1) A lock_list can be either forwards or backwards for a lock dependency,
+ * for a lock dependency A -> B, there are two lock_lists:
+ *
+ * a) lock_list in the ->locks_after list of A, whose ->class is B and
+ * ->links_to is A. In this case, we can say the lock_list is
+ * "A -> B" (forwards case).
+ *
+ * b) lock_list in the ->locks_before list of B, whose ->class is A
+ * and ->links_to is B. In this case, we can say the lock_list is
+ * "B <- A" (bacwards case).
+ *
+ * The ->trace of both a) and b) point to the call trace where B was
+ * acquired with A held.
+ *
+ * 2) A "helper" lock_list is introduced during BFS, this lock_list doesn't
+ * represent a certain lock dependency, it only provides an initial entry
+ * for BFS. For example, BFS may introduce a "helper" lock_list whose
+ * ->class is A, as a result BFS will search all dependencies starting with
+ * A, e.g. A -> B or A -> C.
+ *
+ * The notation of a forwards helper lock_list is like "-> A", which means
+ * we should search the forwards dependencies starting with "A", e.g A -> B
+ * or A -> C.
+ *
+ * The notation of a bacwards helper lock_list is like "<- B", which means
+ * we should search the backwards dependencies ending with "B", e.g.
+ * B <- A or B <- C.
+ */
+
+/*
+ * printk the shortest lock dependencies from @root to @leaf in reverse order.
+ *
+ * We have a lock dependency path as follow:
+ *
+ * @root @leaf
+ * | |
+ * V V
+ * ->parent ->parent
+ * | lock_list | <--------- | lock_list | ... | lock_list | <--------- | lock_list |
+ * | -> L1 | | L1 -> L2 | ... |Ln-2 -> Ln-1| | Ln-1 -> Ln|
+ *
+ * , so it's natural that we start from @leaf and print every ->class and
+ * ->trace until we reach the @root.
*/
static void __used
print_shortest_lock_dependencies(struct lock_list *leaf,
- struct lock_list *root)
+ struct lock_list *root)
{
struct lock_list *entry = leaf;
int depth;
@@ -1417,7 +2419,7 @@ print_shortest_lock_dependencies(struct lock_list *leaf,
do {
print_lock_class_header(entry->class, depth);
printk("%*s ... acquired at:\n", depth, "");
- print_stack_trace(&entry->trace, 2);
+ print_lock_trace(entry->trace, 2);
printk("\n");
if (depth == 0 && (entry != root)) {
@@ -1428,8 +2430,61 @@ print_shortest_lock_dependencies(struct lock_list *leaf,
entry = get_lock_parent(entry);
depth--;
} while (entry && (depth >= 0));
+}
- return;
+/*
+ * printk the shortest lock dependencies from @leaf to @root.
+ *
+ * We have a lock dependency path (from a backwards search) as follow:
+ *
+ * @leaf @root
+ * | |
+ * V V
+ * ->parent ->parent
+ * | lock_list | ---------> | lock_list | ... | lock_list | ---------> | lock_list |
+ * | L2 <- L1 | | L3 <- L2 | ... | Ln <- Ln-1 | | <- Ln |
+ *
+ * , so when we iterate from @leaf to @root, we actually print the lock
+ * dependency path L1 -> L2 -> .. -> Ln in the non-reverse order.
+ *
+ * Another thing to notice here is that ->class of L2 <- L1 is L1, while the
+ * ->trace of L2 <- L1 is the call trace of L2, in fact we don't have the call
+ * trace of L1 in the dependency path, which is alright, because most of the
+ * time we can figure out where L1 is held from the call trace of L2.
+ */
+static void __used
+print_shortest_lock_dependencies_backwards(struct lock_list *leaf,
+ struct lock_list *root)
+{
+ struct lock_list *entry = leaf;
+ const struct lock_trace *trace = NULL;
+ int depth;
+
+ /*compute depth from generated tree by BFS*/
+ depth = get_lock_depth(leaf);
+
+ do {
+ print_lock_class_header(entry->class, depth);
+ if (trace) {
+ printk("%*s ... acquired at:\n", depth, "");
+ print_lock_trace(trace, 2);
+ printk("\n");
+ }
+
+ /*
+ * Record the pointer to the trace for the next lock_list
+ * entry, see the comments for the function.
+ */
+ trace = entry->trace;
+
+ if (depth == 0 && (entry != root)) {
+ printk("lockdep:%s bad path found in chain graph\n", __func__);
+ break;
+ }
+
+ entry = get_lock_parent(entry);
+ depth--;
+ } while (entry && (depth >= 0));
}
static void
@@ -1460,35 +2515,35 @@ print_irq_lock_scenario(struct lock_list *safe_entry,
*/
if (middle_class != unsafe_class) {
printk("Chain exists of:\n ");
- __print_lock_name(safe_class);
- printk(" --> ");
- __print_lock_name(middle_class);
- printk(" --> ");
- __print_lock_name(unsafe_class);
- printk("\n\n");
+ __print_lock_name(NULL, safe_class);
+ printk(KERN_CONT " --> ");
+ __print_lock_name(NULL, middle_class);
+ printk(KERN_CONT " --> ");
+ __print_lock_name(NULL, unsafe_class);
+ printk(KERN_CONT "\n\n");
}
printk(" Possible interrupt unsafe locking scenario:\n\n");
printk(" CPU0 CPU1\n");
printk(" ---- ----\n");
printk(" lock(");
- __print_lock_name(unsafe_class);
- printk(");\n");
+ __print_lock_name(NULL, unsafe_class);
+ printk(KERN_CONT ");\n");
printk(" local_irq_disable();\n");
printk(" lock(");
- __print_lock_name(safe_class);
- printk(");\n");
+ __print_lock_name(NULL, safe_class);
+ printk(KERN_CONT ");\n");
printk(" lock(");
- __print_lock_name(middle_class);
- printk(");\n");
+ __print_lock_name(NULL, middle_class);
+ printk(KERN_CONT ");\n");
printk(" <Interrupt>\n");
printk(" lock(");
- __print_lock_name(safe_class);
- printk(");\n");
+ __print_lock_name(NULL, safe_class);
+ printk(KERN_CONT ");\n");
printk("\n *** DEADLOCK ***\n\n");
}
-static int
+static void
print_bad_irq_dependency(struct task_struct *curr,
struct lock_list *prev_root,
struct lock_list *next_root,
@@ -1501,99 +2556,66 @@ print_bad_irq_dependency(struct task_struct *curr,
const char *irqclass)
{
if (!debug_locks_off_graph_unlock() || debug_locks_silent)
- return 0;
+ return;
- printk("\n");
- printk("======================================================\n");
- printk("[ INFO: %s-safe -> %s-unsafe lock order detected ]\n",
+ nbcon_cpu_emergency_enter();
+
+ pr_warn("\n");
+ pr_warn("=====================================================\n");
+ pr_warn("WARNING: %s-safe -> %s-unsafe lock order detected\n",
irqclass, irqclass);
print_kernel_ident();
- printk("------------------------------------------------------\n");
- printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n",
+ pr_warn("-----------------------------------------------------\n");
+ pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n",
curr->comm, task_pid_nr(curr),
- curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT,
+ lockdep_hardirq_context(), hardirq_count() >> HARDIRQ_SHIFT,
curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT,
- curr->hardirqs_enabled,
+ lockdep_hardirqs_enabled(),
curr->softirqs_enabled);
print_lock(next);
- printk("\nand this task is already holding:\n");
+ pr_warn("\nand this task is already holding:\n");
print_lock(prev);
- printk("which would create a new lock dependency:\n");
- print_lock_name(hlock_class(prev));
- printk(" ->");
- print_lock_name(hlock_class(next));
- printk("\n");
+ pr_warn("which would create a new lock dependency:\n");
+ print_lock_name(prev, hlock_class(prev));
+ pr_cont(" ->");
+ print_lock_name(next, hlock_class(next));
+ pr_cont("\n");
- printk("\nbut this new dependency connects a %s-irq-safe lock:\n",
+ pr_warn("\nbut this new dependency connects a %s-irq-safe lock:\n",
irqclass);
- print_lock_name(backwards_entry->class);
- printk("\n... which became %s-irq-safe at:\n", irqclass);
+ print_lock_name(NULL, backwards_entry->class);
+ pr_warn("\n... which became %s-irq-safe at:\n", irqclass);
- print_stack_trace(backwards_entry->class->usage_traces + bit1, 1);
+ print_lock_trace(backwards_entry->class->usage_traces[bit1], 1);
- printk("\nto a %s-irq-unsafe lock:\n", irqclass);
- print_lock_name(forwards_entry->class);
- printk("\n... which became %s-irq-unsafe at:\n", irqclass);
- printk("...");
+ pr_warn("\nto a %s-irq-unsafe lock:\n", irqclass);
+ print_lock_name(NULL, forwards_entry->class);
+ pr_warn("\n... which became %s-irq-unsafe at:\n", irqclass);
+ pr_warn("...");
- print_stack_trace(forwards_entry->class->usage_traces + bit2, 1);
+ print_lock_trace(forwards_entry->class->usage_traces[bit2], 1);
- printk("\nother info that might help us debug this:\n\n");
+ pr_warn("\nother info that might help us debug this:\n\n");
print_irq_lock_scenario(backwards_entry, forwards_entry,
hlock_class(prev), hlock_class(next));
lockdep_print_held_locks(curr);
- printk("\nthe dependencies between %s-irq-safe lock", irqclass);
- printk(" and the holding lock:\n");
- if (!save_trace(&prev_root->trace))
- return 0;
- print_shortest_lock_dependencies(backwards_entry, prev_root);
+ pr_warn("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass);
+ print_shortest_lock_dependencies_backwards(backwards_entry, prev_root);
- printk("\nthe dependencies between the lock to be acquired");
- printk(" and %s-irq-unsafe lock:\n", irqclass);
- if (!save_trace(&next_root->trace))
- return 0;
+ pr_warn("\nthe dependencies between the lock to be acquired");
+ pr_warn(" and %s-irq-unsafe lock:\n", irqclass);
+ next_root->trace = save_trace();
+ if (!next_root->trace)
+ goto out;
print_shortest_lock_dependencies(forwards_entry, next_root);
- printk("\nstack backtrace:\n");
+ pr_warn("\nstack backtrace:\n");
dump_stack();
-
- return 0;
-}
-
-static int
-check_usage(struct task_struct *curr, struct held_lock *prev,
- struct held_lock *next, enum lock_usage_bit bit_backwards,
- enum lock_usage_bit bit_forwards, const char *irqclass)
-{
- int ret;
- struct lock_list this, that;
- struct lock_list *uninitialized_var(target_entry);
- struct lock_list *uninitialized_var(target_entry1);
-
- this.parent = NULL;
-
- this.class = hlock_class(prev);
- ret = find_usage_backwards(&this, bit_backwards, &target_entry);
- if (ret < 0)
- return print_bfs_bug(ret);
- if (ret == 1)
- return ret;
-
- that.parent = NULL;
- that.class = hlock_class(next);
- ret = find_usage_forwards(&that, bit_forwards, &target_entry1);
- if (ret < 0)
- return print_bfs_bug(ret);
- if (ret == 1)
- return ret;
-
- return print_bad_irq_dependency(curr, &this, &that,
- target_entry, target_entry1,
- prev, next,
- bit_backwards, bit_forwards, irqclass);
+out:
+ nbcon_cpu_emergency_exit();
}
static const char *state_names[] = {
@@ -1612,103 +2634,361 @@ static const char *state_rnames[] = {
static inline const char *state_name(enum lock_usage_bit bit)
{
- return (bit & 1) ? state_rnames[bit >> 2] : state_names[bit >> 2];
+ if (bit & LOCK_USAGE_READ_MASK)
+ return state_rnames[bit >> LOCK_USAGE_DIR_MASK];
+ else
+ return state_names[bit >> LOCK_USAGE_DIR_MASK];
}
+/*
+ * The bit number is encoded like:
+ *
+ * bit0: 0 exclusive, 1 read lock
+ * bit1: 0 used in irq, 1 irq enabled
+ * bit2-n: state
+ */
static int exclusive_bit(int new_bit)
{
- /*
- * USED_IN
- * USED_IN_READ
- * ENABLED
- * ENABLED_READ
- *
- * bit 0 - write/read
- * bit 1 - used_in/enabled
- * bit 2+ state
- */
-
- int state = new_bit & ~3;
- int dir = new_bit & 2;
+ int state = new_bit & LOCK_USAGE_STATE_MASK;
+ int dir = new_bit & LOCK_USAGE_DIR_MASK;
/*
* keep state, bit flip the direction and strip read.
*/
- return state | (dir ^ 2);
+ return state | (dir ^ LOCK_USAGE_DIR_MASK);
+}
+
+/*
+ * Observe that when given a bitmask where each bitnr is encoded as above, a
+ * right shift of the mask transforms the individual bitnrs as -1 and
+ * conversely, a left shift transforms into +1 for the individual bitnrs.
+ *
+ * So for all bits whose number have LOCK_ENABLED_* set (bitnr1 == 1), we can
+ * create the mask with those bit numbers using LOCK_USED_IN_* (bitnr1 == 0)
+ * instead by subtracting the bit number by 2, or shifting the mask right by 2.
+ *
+ * Similarly, bitnr1 == 0 becomes bitnr1 == 1 by adding 2, or shifting left 2.
+ *
+ * So split the mask (note that LOCKF_ENABLED_IRQ_ALL|LOCKF_USED_IN_IRQ_ALL is
+ * all bits set) and recompose with bitnr1 flipped.
+ */
+static unsigned long invert_dir_mask(unsigned long mask)
+{
+ unsigned long excl = 0;
+
+ /* Invert dir */
+ excl |= (mask & LOCKF_ENABLED_IRQ_ALL) >> LOCK_USAGE_DIR_MASK;
+ excl |= (mask & LOCKF_USED_IN_IRQ_ALL) << LOCK_USAGE_DIR_MASK;
+
+ return excl;
+}
+
+/*
+ * Note that a LOCK_ENABLED_IRQ_*_READ usage and a LOCK_USED_IN_IRQ_*_READ
+ * usage may cause deadlock too, for example:
+ *
+ * P1 P2
+ * <irq disabled>
+ * write_lock(l1); <irq enabled>
+ * read_lock(l2);
+ * write_lock(l2);
+ * <in irq>
+ * read_lock(l1);
+ *
+ * , in above case, l1 will be marked as LOCK_USED_IN_IRQ_HARDIRQ_READ and l2
+ * will marked as LOCK_ENABLE_IRQ_HARDIRQ_READ, and this is a possible
+ * deadlock.
+ *
+ * In fact, all of the following cases may cause deadlocks:
+ *
+ * LOCK_USED_IN_IRQ_* -> LOCK_ENABLED_IRQ_*
+ * LOCK_USED_IN_IRQ_*_READ -> LOCK_ENABLED_IRQ_*
+ * LOCK_USED_IN_IRQ_* -> LOCK_ENABLED_IRQ_*_READ
+ * LOCK_USED_IN_IRQ_*_READ -> LOCK_ENABLED_IRQ_*_READ
+ *
+ * As a result, to calculate the "exclusive mask", first we invert the
+ * direction (USED_IN/ENABLED) of the original mask, and 1) for all bits with
+ * bitnr0 set (LOCK_*_READ), add those with bitnr0 cleared (LOCK_*). 2) for all
+ * bits with bitnr0 cleared (LOCK_*_READ), add those with bitnr0 set (LOCK_*).
+ */
+static unsigned long exclusive_mask(unsigned long mask)
+{
+ unsigned long excl = invert_dir_mask(mask);
+
+ excl |= (excl & LOCKF_IRQ_READ) >> LOCK_USAGE_READ_MASK;
+ excl |= (excl & LOCKF_IRQ) << LOCK_USAGE_READ_MASK;
+
+ return excl;
+}
+
+/*
+ * Retrieve the _possible_ original mask to which @mask is
+ * exclusive. Ie: this is the opposite of exclusive_mask().
+ * Note that 2 possible original bits can match an exclusive
+ * bit: one has LOCK_USAGE_READ_MASK set, the other has it
+ * cleared. So both are returned for each exclusive bit.
+ */
+static unsigned long original_mask(unsigned long mask)
+{
+ unsigned long excl = invert_dir_mask(mask);
+
+ /* Include read in existing usages */
+ excl |= (excl & LOCKF_IRQ_READ) >> LOCK_USAGE_READ_MASK;
+ excl |= (excl & LOCKF_IRQ) << LOCK_USAGE_READ_MASK;
+
+ return excl;
+}
+
+/*
+ * Find the first pair of bit match between an original
+ * usage mask and an exclusive usage mask.
+ */
+static int find_exclusive_match(unsigned long mask,
+ unsigned long excl_mask,
+ enum lock_usage_bit *bitp,
+ enum lock_usage_bit *excl_bitp)
+{
+ int bit, excl, excl_read;
+
+ for_each_set_bit(bit, &mask, LOCK_USED) {
+ /*
+ * exclusive_bit() strips the read bit, however,
+ * LOCK_ENABLED_IRQ_*_READ may cause deadlocks too, so we need
+ * to search excl | LOCK_USAGE_READ_MASK as well.
+ */
+ excl = exclusive_bit(bit);
+ excl_read = excl | LOCK_USAGE_READ_MASK;
+ if (excl_mask & lock_flag(excl)) {
+ *bitp = bit;
+ *excl_bitp = excl;
+ return 0;
+ } else if (excl_mask & lock_flag(excl_read)) {
+ *bitp = bit;
+ *excl_bitp = excl_read;
+ return 0;
+ }
+ }
+ return -1;
}
+/*
+ * Prove that the new dependency does not connect a hardirq-safe(-read)
+ * lock with a hardirq-unsafe lock - to achieve this we search
+ * the backwards-subgraph starting at <prev>, and the
+ * forwards-subgraph starting at <next>:
+ */
static int check_irq_usage(struct task_struct *curr, struct held_lock *prev,
- struct held_lock *next, enum lock_usage_bit bit)
+ struct held_lock *next)
{
+ unsigned long usage_mask = 0, forward_mask, backward_mask;
+ enum lock_usage_bit forward_bit = 0, backward_bit = 0;
+ struct lock_list *target_entry1;
+ struct lock_list *target_entry;
+ struct lock_list this, that;
+ enum bfs_result ret;
+
/*
- * Prove that the new dependency does not connect a hardirq-safe
- * lock with a hardirq-unsafe lock - to achieve this we search
- * the backwards-subgraph starting at <prev>, and the
- * forwards-subgraph starting at <next>:
+ * Step 1: gather all hard/soft IRQs usages backward in an
+ * accumulated usage mask.
*/
- if (!check_usage(curr, prev, next, bit,
- exclusive_bit(bit), state_name(bit)))
+ bfs_init_rootb(&this, prev);
+
+ ret = __bfs_backwards(&this, &usage_mask, usage_accumulate, usage_skip, NULL);
+ if (bfs_error(ret)) {
+ print_bfs_bug(ret);
return 0;
+ }
- bit++; /* _READ */
+ usage_mask &= LOCKF_USED_IN_IRQ_ALL;
+ if (!usage_mask)
+ return 1;
/*
- * Prove that the new dependency does not connect a hardirq-safe-read
- * lock with a hardirq-unsafe lock - to achieve this we search
- * the backwards-subgraph starting at <prev>, and the
- * forwards-subgraph starting at <next>:
+ * Step 2: find exclusive uses forward that match the previous
+ * backward accumulated mask.
*/
- if (!check_usage(curr, prev, next, bit,
- exclusive_bit(bit), state_name(bit)))
+ forward_mask = exclusive_mask(usage_mask);
+
+ bfs_init_root(&that, next);
+
+ ret = find_usage_forwards(&that, forward_mask, &target_entry1);
+ if (bfs_error(ret)) {
+ print_bfs_bug(ret);
return 0;
+ }
+ if (ret == BFS_RNOMATCH)
+ return 1;
+ /*
+ * Step 3: we found a bad match! Now retrieve a lock from the backward
+ * list whose usage mask matches the exclusive usage mask from the
+ * lock found on the forward list.
+ *
+ * Note, we should only keep the LOCKF_ENABLED_IRQ_ALL bits, considering
+ * the follow case:
+ *
+ * When trying to add A -> B to the graph, we find that there is a
+ * hardirq-safe L, that L -> ... -> A, and another hardirq-unsafe M,
+ * that B -> ... -> M. However M is **softirq-safe**, if we use exact
+ * invert bits of M's usage_mask, we will find another lock N that is
+ * **softirq-unsafe** and N -> ... -> A, however N -> .. -> M will not
+ * cause a inversion deadlock.
+ */
+ backward_mask = original_mask(target_entry1->class->usage_mask & LOCKF_ENABLED_IRQ_ALL);
+
+ ret = find_usage_backwards(&this, backward_mask, &target_entry);
+ if (bfs_error(ret)) {
+ print_bfs_bug(ret);
+ return 0;
+ }
+ if (DEBUG_LOCKS_WARN_ON(ret == BFS_RNOMATCH))
+ return 1;
+
+ /*
+ * Step 4: narrow down to a pair of incompatible usage bits
+ * and report it.
+ */
+ ret = find_exclusive_match(target_entry->class->usage_mask,
+ target_entry1->class->usage_mask,
+ &backward_bit, &forward_bit);
+ if (DEBUG_LOCKS_WARN_ON(ret == -1))
+ return 1;
+
+ print_bad_irq_dependency(curr, &this, &that,
+ target_entry, target_entry1,
+ prev, next,
+ backward_bit, forward_bit,
+ state_name(backward_bit));
+
+ return 0;
+}
+
+#else
+
+static inline int check_irq_usage(struct task_struct *curr,
+ struct held_lock *prev, struct held_lock *next)
+{
return 1;
}
-static int
-check_prev_add_irq(struct task_struct *curr, struct held_lock *prev,
- struct held_lock *next)
+static inline bool usage_skip(struct lock_list *entry, void *mask)
{
-#define LOCKDEP_STATE(__STATE) \
- if (!check_irq_usage(curr, prev, next, LOCK_USED_IN_##__STATE)) \
- return 0;
-#include "lockdep_states.h"
-#undef LOCKDEP_STATE
+ return false;
+}
- return 1;
+#endif /* CONFIG_TRACE_IRQFLAGS */
+
+#ifdef CONFIG_LOCKDEP_SMALL
+/*
+ * We are about to add A -> B into the dependency graph, and in __bfs() a
+ * strong dependency path A -> .. -> B is found: hlock_class equals
+ * entry->class.
+ *
+ * If A -> .. -> B can replace A -> B in any __bfs() search (means the former
+ * is _stronger_ than or equal to the latter), we consider A -> B as redundant.
+ * For example if A -> .. -> B is -(EN)-> (i.e. A -(E*)-> .. -(*N)-> B), and A
+ * -> B is -(ER)-> or -(EN)->, then we don't need to add A -> B into the
+ * dependency graph, as any strong path ..-> A -> B ->.. we can get with
+ * having dependency A -> B, we could already get a equivalent path ..-> A ->
+ * .. -> B -> .. with A -> .. -> B. Therefore A -> B is redundant.
+ *
+ * We need to make sure both the start and the end of A -> .. -> B is not
+ * weaker than A -> B. For the start part, please see the comment in
+ * check_redundant(). For the end part, we need:
+ *
+ * Either
+ *
+ * a) A -> B is -(*R)-> (everything is not weaker than that)
+ *
+ * or
+ *
+ * b) A -> .. -> B is -(*N)-> (nothing is stronger than this)
+ *
+ */
+static inline bool hlock_equal(struct lock_list *entry, void *data)
+{
+ struct held_lock *hlock = (struct held_lock *)data;
+
+ return hlock_class(hlock) == entry->class && /* Found A -> .. -> B */
+ (hlock->read == 2 || /* A -> B is -(*R)-> */
+ !entry->only_xr); /* A -> .. -> B is -(*N)-> */
}
-static void inc_chains(void)
+/*
+ * Check that the dependency graph starting at <src> can lead to
+ * <target> or not. If it can, <src> -> <target> dependency is already
+ * in the graph.
+ *
+ * Return BFS_RMATCH if it does, or BFS_RNOMATCH if it does not, return BFS_E* if
+ * any error appears in the bfs search.
+ */
+static noinline enum bfs_result
+check_redundant(struct held_lock *src, struct held_lock *target)
{
- if (current->hardirq_context)
- nr_hardirq_chains++;
- else {
- if (current->softirq_context)
- nr_softirq_chains++;
- else
- nr_process_chains++;
- }
+ enum bfs_result ret;
+ struct lock_list *target_entry;
+ struct lock_list src_entry;
+
+ bfs_init_root(&src_entry, src);
+ /*
+ * Special setup for check_redundant().
+ *
+ * To report redundant, we need to find a strong dependency path that
+ * is equal to or stronger than <src> -> <target>. So if <src> is E,
+ * we need to let __bfs() only search for a path starting at a -(E*)->,
+ * we achieve this by setting the initial node's ->only_xr to true in
+ * that case. And if <prev> is S, we set initial ->only_xr to false
+ * because both -(S*)-> (equal) and -(E*)-> (stronger) are redundant.
+ */
+ src_entry.only_xr = src->read == 0;
+
+ debug_atomic_inc(nr_redundant_checks);
+
+ /*
+ * Note: we skip local_lock() for redundant check, because as the
+ * comment in usage_skip(), A -> local_lock() -> B and A -> B are not
+ * the same.
+ */
+ ret = check_path(target, &src_entry, hlock_equal, usage_skip, &target_entry);
+
+ if (ret == BFS_RMATCH)
+ debug_atomic_inc(nr_redundant);
+
+ return ret;
}
#else
-static inline int
-check_prev_add_irq(struct task_struct *curr, struct held_lock *prev,
- struct held_lock *next)
+static inline enum bfs_result
+check_redundant(struct held_lock *src, struct held_lock *target)
{
- return 1;
+ return BFS_RNOMATCH;
}
-static inline void inc_chains(void)
+#endif
+
+static void inc_chains(int irq_context)
{
- nr_process_chains++;
+ if (irq_context & LOCK_CHAIN_HARDIRQ_CONTEXT)
+ nr_hardirq_chains++;
+ else if (irq_context & LOCK_CHAIN_SOFTIRQ_CONTEXT)
+ nr_softirq_chains++;
+ else
+ nr_process_chains++;
}
-#endif
+static void dec_chains(int irq_context)
+{
+ if (irq_context & LOCK_CHAIN_HARDIRQ_CONTEXT)
+ nr_hardirq_chains--;
+ else if (irq_context & LOCK_CHAIN_SOFTIRQ_CONTEXT)
+ nr_softirq_chains--;
+ else
+ nr_process_chains--;
+}
static void
-print_deadlock_scenario(struct held_lock *nxt,
- struct held_lock *prv)
+print_deadlock_scenario(struct held_lock *nxt, struct held_lock *prv)
{
struct lock_class *next = hlock_class(nxt);
struct lock_class *prev = hlock_class(prv);
@@ -1717,41 +2997,50 @@ print_deadlock_scenario(struct held_lock *nxt,
printk(" CPU0\n");
printk(" ----\n");
printk(" lock(");
- __print_lock_name(prev);
- printk(");\n");
+ __print_lock_name(prv, prev);
+ printk(KERN_CONT ");\n");
printk(" lock(");
- __print_lock_name(next);
- printk(");\n");
+ __print_lock_name(nxt, next);
+ printk(KERN_CONT ");\n");
printk("\n *** DEADLOCK ***\n\n");
printk(" May be due to missing lock nesting notation\n\n");
}
-static int
+static void
print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
struct held_lock *next)
{
+ struct lock_class *class = hlock_class(prev);
+
if (!debug_locks_off_graph_unlock() || debug_locks_silent)
- return 0;
+ return;
+
+ nbcon_cpu_emergency_enter();
- printk("\n");
- printk("=============================================\n");
- printk("[ INFO: possible recursive locking detected ]\n");
+ pr_warn("\n");
+ pr_warn("============================================\n");
+ pr_warn("WARNING: possible recursive locking detected\n");
print_kernel_ident();
- printk("---------------------------------------------\n");
- printk("%s/%d is trying to acquire lock:\n",
+ pr_warn("--------------------------------------------\n");
+ pr_warn("%s/%d is trying to acquire lock:\n",
curr->comm, task_pid_nr(curr));
print_lock(next);
- printk("\nbut task is already holding lock:\n");
+ pr_warn("\nbut task is already holding lock:\n");
print_lock(prev);
- printk("\nother info that might help us debug this:\n");
+ if (class->cmp_fn) {
+ pr_warn("and the lock comparison function returns %i:\n",
+ class->cmp_fn(prev->instance, next->instance));
+ }
+
+ pr_warn("\nother info that might help us debug this:\n");
print_deadlock_scenario(next, prev);
lockdep_print_held_locks(curr);
- printk("\nstack backtrace:\n");
+ pr_warn("\nstack backtrace:\n");
dump_stack();
- return 0;
+ nbcon_cpu_emergency_exit();
}
/*
@@ -1760,12 +3049,14 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
* (Note that this has to be done separately, because the graph cannot
* detect such classes of deadlocks.)
*
- * Returns: 0 on deadlock detected, 1 on OK, 2 on recursive read
+ * Returns: 0 on deadlock detected, 1 on OK, 2 if another lock with the same
+ * lock class is held but nest_lock is also held, i.e. we rely on the
+ * nest_lock to avoid the deadlock.
*/
static int
-check_deadlock(struct task_struct *curr, struct held_lock *next,
- struct lockdep_map *next_instance, int read)
+check_deadlock(struct task_struct *curr, struct held_lock *next)
{
+ struct lock_class *class;
struct held_lock *prev;
struct held_lock *nest = NULL;
int i;
@@ -1783,8 +3074,14 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
* Allow read-after-read recursion of the same
* lock class (i.e. read_lock(lock)+read_lock(lock)):
*/
- if ((read == 2) && prev->read)
- return 2;
+ if ((next->read == 2) && prev->read)
+ continue;
+
+ class = hlock_class(prev);
+
+ if (class->cmp_fn &&
+ class->cmp_fn(prev->instance, next->instance) < 0)
+ continue;
/*
* We're holding the nest_lock, which serializes this lock's
@@ -1793,14 +3090,15 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
if (nest)
return 2;
- return print_deadlock_bug(curr, prev, next);
+ print_deadlock_bug(curr, prev, next);
+ return 0;
}
return 1;
}
/*
* There was a chain-cache miss, and we are about to add a new dependency
- * to a previous lock. We recursively validate the following rules:
+ * to a previous lock. We validate the following rules:
*
* - would the adding of the <prev> -> <next> dependency create a
* circular dependency in the graph? [== circular deadlock]
@@ -1822,51 +3120,55 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
*/
static int
check_prev_add(struct task_struct *curr, struct held_lock *prev,
- struct held_lock *next, int distance, int trylock_loop)
+ struct held_lock *next, u16 distance,
+ struct lock_trace **const trace)
{
struct lock_list *entry;
- int ret;
- struct lock_list this;
- struct lock_list *uninitialized_var(target_entry);
- /*
- * Static variable, serialized by the graph_lock().
- *
- * We use this static variable to save the stack trace in case
- * we call into this function multiple times due to encountering
- * trylocks in the held lock stack.
- */
- static struct stack_trace trace;
+ enum bfs_result ret;
+
+ if (!hlock_class(prev)->key || !hlock_class(next)->key) {
+ /*
+ * The warning statements below may trigger a use-after-free
+ * of the class name. It is better to trigger a use-after free
+ * and to have the class name most of the time instead of not
+ * having the class name available.
+ */
+ WARN_ONCE(!debug_locks_silent && !hlock_class(prev)->key,
+ "Detected use-after-free of lock class %px/%s\n",
+ hlock_class(prev),
+ hlock_class(prev)->name);
+ WARN_ONCE(!debug_locks_silent && !hlock_class(next)->key,
+ "Detected use-after-free of lock class %px/%s\n",
+ hlock_class(next),
+ hlock_class(next)->name);
+ return 2;
+ }
+
+ if (prev->class_idx == next->class_idx) {
+ struct lock_class *class = hlock_class(prev);
+
+ if (class->cmp_fn &&
+ class->cmp_fn(prev->instance, next->instance) < 0)
+ return 2;
+ }
/*
* Prove that the new <prev> -> <next> dependency would not
* create a circular dependency in the graph. (We do this by
- * forward-recursing into the graph starting at <next>, and
- * checking whether we can reach <prev>.)
+ * a breadth-first search into the graph starting at <next>,
+ * and check whether we can reach <prev>.)
*
- * We are using global variables to control the recursion, to
- * keep the stackframe size of the recursive functions low:
+ * The search is limited by the size of the circular queue (i.e.,
+ * MAX_CIRCULAR_QUEUE_SIZE) which keeps track of a breadth of nodes
+ * in the graph whose neighbours are to be checked.
*/
- this.class = hlock_class(next);
- this.parent = NULL;
- ret = check_noncircular(&this, hlock_class(prev), &target_entry);
- if (unlikely(!ret))
- return print_circular_bug(&this, target_entry, next, prev);
- else if (unlikely(ret < 0))
- return print_bfs_bug(ret);
-
- if (!check_prev_add_irq(curr, prev, next))
+ ret = check_noncircular(next, prev, trace);
+ if (unlikely(bfs_error(ret) || ret == BFS_RMATCH))
+ return 0;
+
+ if (!check_irq_usage(curr, prev, next))
return 0;
- /*
- * For recursive read-locks we do all the dependency checks,
- * but we dont store read-triggered dependencies (only
- * write-triggered dependencies). This ensures that only the
- * write-side dependencies matter, and that if for example a
- * write-lock never takes any other locks, then the reads are
- * equivalent to a NOP.
- */
- if (next->read == 2 || prev->read == 2)
- return 1;
/*
* Is the <prev> -> <next> dependency already present?
*
@@ -1879,44 +3181,71 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
if (entry->class == hlock_class(next)) {
if (distance == 1)
entry->distance = 1;
- return 2;
+ entry->dep |= calc_dep(prev, next);
+
+ /*
+ * Also, update the reverse dependency in @next's
+ * ->locks_before list.
+ *
+ * Here we reuse @entry as the cursor, which is fine
+ * because we won't go to the next iteration of the
+ * outer loop:
+ *
+ * For normal cases, we return in the inner loop.
+ *
+ * If we fail to return, we have inconsistency, i.e.
+ * <prev>::locks_after contains <next> while
+ * <next>::locks_before doesn't contain <prev>. In
+ * that case, we return after the inner and indicate
+ * something is wrong.
+ */
+ list_for_each_entry(entry, &hlock_class(next)->locks_before, entry) {
+ if (entry->class == hlock_class(prev)) {
+ if (distance == 1)
+ entry->distance = 1;
+ entry->dep |= calc_depb(prev, next);
+ return 1;
+ }
+ }
+
+ /* <prev> is not found in <next>::locks_before */
+ return 0;
}
}
- if (!trylock_loop && !save_trace(&trace))
+ /*
+ * Is the <prev> -> <next> link redundant?
+ */
+ ret = check_redundant(prev, next);
+ if (bfs_error(ret))
return 0;
+ else if (ret == BFS_RMATCH)
+ return 2;
+
+ if (!*trace) {
+ *trace = save_trace();
+ if (!*trace)
+ return 0;
+ }
/*
* Ok, all validations passed, add the new lock
* to the previous lock's dependency list:
*/
- ret = add_lock_to_list(hlock_class(prev), hlock_class(next),
- &hlock_class(prev)->locks_after,
- next->acquire_ip, distance, &trace);
+ ret = add_lock_to_list(hlock_class(next), hlock_class(prev),
+ &hlock_class(prev)->locks_after, distance,
+ calc_dep(prev, next), *trace);
if (!ret)
return 0;
- ret = add_lock_to_list(hlock_class(next), hlock_class(prev),
- &hlock_class(next)->locks_before,
- next->acquire_ip, distance, &trace);
+ ret = add_lock_to_list(hlock_class(prev), hlock_class(next),
+ &hlock_class(next)->locks_before, distance,
+ calc_depb(prev, next), *trace);
if (!ret)
return 0;
- /*
- * Debugging printouts:
- */
- if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) {
- graph_unlock();
- printk("\n new dependency: ");
- print_lock_name(hlock_class(prev));
- printk(" => ");
- print_lock_name(hlock_class(next));
- printk("\n");
- dump_stack();
- return graph_lock();
- }
- return 1;
+ return 2;
}
/*
@@ -1928,8 +3257,8 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
static int
check_prevs_add(struct task_struct *curr, struct held_lock *next)
{
+ struct lock_trace *trace = NULL;
int depth = curr->lockdep_depth;
- int trylock_loop = 0;
struct held_lock *hlock;
/*
@@ -1948,16 +3277,14 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
goto out_bug;
for (;;) {
- int distance = curr->lockdep_depth - depth + 1;
+ u16 distance = curr->lockdep_depth - depth + 1;
hlock = curr->held_locks + depth - 1;
- /*
- * Only non-recursive-read entries get new dependencies
- * added:
- */
- if (hlock->read != 2 && hlock->check) {
- if (!check_prev_add(curr, hlock, next,
- distance, trylock_loop))
+
+ if (hlock->check) {
+ int ret = check_prev_add(curr, hlock, next, distance, &trace);
+ if (!ret)
return 0;
+
/*
* Stop after the first non-trylock entry,
* as non-trylock entries have added their
@@ -1967,6 +3294,7 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
if (!hlock->trylock)
break;
}
+
depth--;
/*
* End of lock-stack?
@@ -1979,7 +3307,6 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
if (curr->held_locks[depth].irq_context !=
curr->held_locks[depth-1].irq_context)
break;
- trylock_loop = 1;
}
return 1;
out_bug:
@@ -1996,110 +3323,544 @@ out_bug:
return 0;
}
-unsigned long nr_lock_chains;
struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS];
-int nr_chain_hlocks;
+static DECLARE_BITMAP(lock_chains_in_use, MAX_LOCKDEP_CHAINS);
static u16 chain_hlocks[MAX_LOCKDEP_CHAIN_HLOCKS];
+unsigned long nr_zapped_lock_chains;
+unsigned int nr_free_chain_hlocks; /* Free chain_hlocks in buckets */
+unsigned int nr_lost_chain_hlocks; /* Lost chain_hlocks */
+unsigned int nr_large_chain_blocks; /* size > MAX_CHAIN_BUCKETS */
-struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i)
+/*
+ * The first 2 chain_hlocks entries in the chain block in the bucket
+ * list contains the following meta data:
+ *
+ * entry[0]:
+ * Bit 15 - always set to 1 (it is not a class index)
+ * Bits 0-14 - upper 15 bits of the next block index
+ * entry[1] - lower 16 bits of next block index
+ *
+ * A next block index of all 1 bits means it is the end of the list.
+ *
+ * On the unsized bucket (bucket-0), the 3rd and 4th entries contain
+ * the chain block size:
+ *
+ * entry[2] - upper 16 bits of the chain block size
+ * entry[3] - lower 16 bits of the chain block size
+ */
+#define MAX_CHAIN_BUCKETS 16
+#define CHAIN_BLK_FLAG (1U << 15)
+#define CHAIN_BLK_LIST_END 0xFFFFU
+
+static int chain_block_buckets[MAX_CHAIN_BUCKETS];
+
+static inline int size_to_bucket(int size)
{
- return lock_classes + chain_hlocks[chain->base + i];
+ if (size > MAX_CHAIN_BUCKETS)
+ return 0;
+
+ return size - 1;
}
/*
- * Look up a dependency chain. If the key is not present yet then
- * add it and return 1 - in this case the new dependency chain is
- * validated. If the key is already hashed, return 0.
- * (On return with 1 graph_lock is held.)
+ * Iterate all the chain blocks in a bucket.
+ */
+#define for_each_chain_block(bucket, prev, curr) \
+ for ((prev) = -1, (curr) = chain_block_buckets[bucket]; \
+ (curr) >= 0; \
+ (prev) = (curr), (curr) = chain_block_next(curr))
+
+/*
+ * next block or -1
*/
-static inline int lookup_chain_cache(struct task_struct *curr,
- struct held_lock *hlock,
- u64 chain_key)
+static inline int chain_block_next(int offset)
{
- struct lock_class *class = hlock_class(hlock);
- struct list_head *hash_head = chainhashentry(chain_key);
- struct lock_chain *chain;
- struct held_lock *hlock_curr;
- int i, j;
+ int next = chain_hlocks[offset];
+
+ WARN_ON_ONCE(!(next & CHAIN_BLK_FLAG));
+
+ if (next == CHAIN_BLK_LIST_END)
+ return -1;
+
+ next &= ~CHAIN_BLK_FLAG;
+ next <<= 16;
+ next |= chain_hlocks[offset + 1];
+ return next;
+}
+
+/*
+ * bucket-0 only
+ */
+static inline int chain_block_size(int offset)
+{
+ return (chain_hlocks[offset + 2] << 16) | chain_hlocks[offset + 3];
+}
+
+static inline void init_chain_block(int offset, int next, int bucket, int size)
+{
+ chain_hlocks[offset] = (next >> 16) | CHAIN_BLK_FLAG;
+ chain_hlocks[offset + 1] = (u16)next;
+
+ if (size && !bucket) {
+ chain_hlocks[offset + 2] = size >> 16;
+ chain_hlocks[offset + 3] = (u16)size;
+ }
+}
+
+static inline void add_chain_block(int offset, int size)
+{
+ int bucket = size_to_bucket(size);
+ int next = chain_block_buckets[bucket];
+ int prev, curr;
+
+ if (unlikely(size < 2)) {
+ /*
+ * We can't store single entries on the freelist. Leak them.
+ *
+ * One possible way out would be to uniquely mark them, other
+ * than with CHAIN_BLK_FLAG, such that we can recover them when
+ * the block before it is re-added.
+ */
+ if (size)
+ nr_lost_chain_hlocks++;
+ return;
+ }
+
+ nr_free_chain_hlocks += size;
+ if (!bucket) {
+ nr_large_chain_blocks++;
+
+ /*
+ * Variable sized, sort large to small.
+ */
+ for_each_chain_block(0, prev, curr) {
+ if (size >= chain_block_size(curr))
+ break;
+ }
+ init_chain_block(offset, curr, 0, size);
+ if (prev < 0)
+ chain_block_buckets[0] = offset;
+ else
+ init_chain_block(prev, offset, 0, 0);
+ return;
+ }
/*
- * We might need to take the graph lock, ensure we've got IRQs
- * disabled to make this an IRQ-safe lock.. for recursion reasons
- * lockdep won't complain about its own locking errors.
+ * Fixed size, add to head.
*/
- if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
- return 0;
+ init_chain_block(offset, next, bucket, size);
+ chain_block_buckets[bucket] = offset;
+}
+
+/*
+ * Only the first block in the list can be deleted.
+ *
+ * For the variable size bucket[0], the first block (the largest one) is
+ * returned, broken up and put back into the pool. So if a chain block of
+ * length > MAX_CHAIN_BUCKETS is ever used and zapped, it will just be
+ * queued up after the primordial chain block and never be used until the
+ * hlock entries in the primordial chain block is almost used up. That
+ * causes fragmentation and reduce allocation efficiency. That can be
+ * monitored by looking at the "large chain blocks" number in lockdep_stats.
+ */
+static inline void del_chain_block(int bucket, int size, int next)
+{
+ nr_free_chain_hlocks -= size;
+ chain_block_buckets[bucket] = next;
+
+ if (!bucket)
+ nr_large_chain_blocks--;
+}
+
+static void init_chain_block_buckets(void)
+{
+ int i;
+
+ for (i = 0; i < MAX_CHAIN_BUCKETS; i++)
+ chain_block_buckets[i] = -1;
+
+ add_chain_block(0, ARRAY_SIZE(chain_hlocks));
+}
+
+/*
+ * Return offset of a chain block of the right size or -1 if not found.
+ *
+ * Fairly simple worst-fit allocator with the addition of a number of size
+ * specific free lists.
+ */
+static int alloc_chain_hlocks(int req)
+{
+ int bucket, curr, size;
+
/*
- * We can walk it lock-free, because entries only get added
- * to the hash:
+ * We rely on the MSB to act as an escape bit to denote freelist
+ * pointers. Make sure this bit isn't set in 'normal' class_idx usage.
*/
- list_for_each_entry_rcu(chain, hash_head, entry) {
- if (chain->chain_key == chain_key) {
-cache_hit:
- debug_atomic_inc(chain_lookup_hits);
- if (very_verbose(class))
- printk("\nhash chain already cached, key: "
- "%016Lx tail class: [%p] %s\n",
- (unsigned long long)chain_key,
- class->key, class->name);
- return 0;
+ BUILD_BUG_ON((MAX_LOCKDEP_KEYS-1) & CHAIN_BLK_FLAG);
+
+ init_data_structures_once();
+
+ if (nr_free_chain_hlocks < req)
+ return -1;
+
+ /*
+ * We require a minimum of 2 (u16) entries to encode a freelist
+ * 'pointer'.
+ */
+ req = max(req, 2);
+ bucket = size_to_bucket(req);
+ curr = chain_block_buckets[bucket];
+
+ if (bucket) {
+ if (curr >= 0) {
+ del_chain_block(bucket, req, chain_block_next(curr));
+ return curr;
}
+ /* Try bucket 0 */
+ curr = chain_block_buckets[0];
}
- if (very_verbose(class))
- printk("\nnew hash chain, key: %016Lx tail class: [%p] %s\n",
- (unsigned long long)chain_key, class->key, class->name);
+
/*
- * Allocate a new chain entry from the static array, and add
- * it to the hash:
+ * The variable sized freelist is sorted by size; the first entry is
+ * the largest. Use it if it fits.
*/
- if (!graph_lock())
- return 0;
+ if (curr >= 0) {
+ size = chain_block_size(curr);
+ if (likely(size >= req)) {
+ del_chain_block(0, size, chain_block_next(curr));
+ if (size > req)
+ add_chain_block(curr + req, size - req);
+ return curr;
+ }
+ }
+
/*
- * We have to walk the chain again locked - to avoid duplicates:
+ * Last resort, split a block in a larger sized bucket.
*/
- list_for_each_entry(chain, hash_head, entry) {
- if (chain->chain_key == chain_key) {
- graph_unlock();
- goto cache_hit;
+ for (size = MAX_CHAIN_BUCKETS; size > req; size--) {
+ bucket = size_to_bucket(size);
+ curr = chain_block_buckets[bucket];
+ if (curr < 0)
+ continue;
+
+ del_chain_block(bucket, size, chain_block_next(curr));
+ add_chain_block(curr + req, size - req);
+ return curr;
+ }
+
+ return -1;
+}
+
+static inline void free_chain_hlocks(int base, int size)
+{
+ add_chain_block(base, max(size, 2));
+}
+
+struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i)
+{
+ u16 chain_hlock = chain_hlocks[chain->base + i];
+ unsigned int class_idx = chain_hlock_class_idx(chain_hlock);
+
+ return lock_classes + class_idx;
+}
+
+/*
+ * Returns the index of the first held_lock of the current chain
+ */
+static inline int get_first_held_lock(struct task_struct *curr,
+ struct held_lock *hlock)
+{
+ int i;
+ struct held_lock *hlock_curr;
+
+ for (i = curr->lockdep_depth - 1; i >= 0; i--) {
+ hlock_curr = curr->held_locks + i;
+ if (hlock_curr->irq_context != hlock->irq_context)
+ break;
+
+ }
+
+ return ++i;
+}
+
+#ifdef CONFIG_DEBUG_LOCKDEP
+/*
+ * Returns the next chain_key iteration
+ */
+static u64 print_chain_key_iteration(u16 hlock_id, u64 chain_key)
+{
+ u64 new_chain_key = iterate_chain_key(chain_key, hlock_id);
+
+ printk(" hlock_id:%d -> chain_key:%016Lx",
+ (unsigned int)hlock_id,
+ (unsigned long long)new_chain_key);
+ return new_chain_key;
+}
+
+static void
+print_chain_keys_held_locks(struct task_struct *curr, struct held_lock *hlock_next)
+{
+ struct held_lock *hlock;
+ u64 chain_key = INITIAL_CHAIN_KEY;
+ int depth = curr->lockdep_depth;
+ int i = get_first_held_lock(curr, hlock_next);
+
+ printk("depth: %u (irq_context %u)\n", depth - i + 1,
+ hlock_next->irq_context);
+ for (; i < depth; i++) {
+ hlock = curr->held_locks + i;
+ chain_key = print_chain_key_iteration(hlock_id(hlock), chain_key);
+
+ print_lock(hlock);
+ }
+
+ print_chain_key_iteration(hlock_id(hlock_next), chain_key);
+ print_lock(hlock_next);
+}
+
+static void print_chain_keys_chain(struct lock_chain *chain)
+{
+ int i;
+ u64 chain_key = INITIAL_CHAIN_KEY;
+ u16 hlock_id;
+
+ printk("depth: %u\n", chain->depth);
+ for (i = 0; i < chain->depth; i++) {
+ hlock_id = chain_hlocks[chain->base + i];
+ chain_key = print_chain_key_iteration(hlock_id, chain_key);
+
+ print_lock_name(NULL, lock_classes + chain_hlock_class_idx(hlock_id));
+ printk("\n");
+ }
+}
+
+static void print_collision(struct task_struct *curr,
+ struct held_lock *hlock_next,
+ struct lock_chain *chain)
+{
+ nbcon_cpu_emergency_enter();
+
+ pr_warn("\n");
+ pr_warn("============================\n");
+ pr_warn("WARNING: chain_key collision\n");
+ print_kernel_ident();
+ pr_warn("----------------------------\n");
+ pr_warn("%s/%d: ", current->comm, task_pid_nr(current));
+ pr_warn("Hash chain already cached but the contents don't match!\n");
+
+ pr_warn("Held locks:");
+ print_chain_keys_held_locks(curr, hlock_next);
+
+ pr_warn("Locks in cached chain:");
+ print_chain_keys_chain(chain);
+
+ pr_warn("\nstack backtrace:\n");
+ dump_stack();
+
+ nbcon_cpu_emergency_exit();
+}
+#endif
+
+/*
+ * Checks whether the chain and the current held locks are consistent
+ * in depth and also in content. If they are not it most likely means
+ * that there was a collision during the calculation of the chain_key.
+ * Returns: 0 not passed, 1 passed
+ */
+static int check_no_collision(struct task_struct *curr,
+ struct held_lock *hlock,
+ struct lock_chain *chain)
+{
+#ifdef CONFIG_DEBUG_LOCKDEP
+ int i, j, id;
+
+ i = get_first_held_lock(curr, hlock);
+
+ if (DEBUG_LOCKS_WARN_ON(chain->depth != curr->lockdep_depth - (i - 1))) {
+ print_collision(curr, hlock, chain);
+ return 0;
+ }
+
+ for (j = 0; j < chain->depth - 1; j++, i++) {
+ id = hlock_id(&curr->held_locks[i]);
+
+ if (DEBUG_LOCKS_WARN_ON(chain_hlocks[chain->base + j] != id)) {
+ print_collision(curr, hlock, chain);
+ return 0;
}
}
- if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) {
+#endif
+ return 1;
+}
+
+/*
+ * Given an index that is >= -1, return the index of the next lock chain.
+ * Return -2 if there is no next lock chain.
+ */
+long lockdep_next_lockchain(long i)
+{
+ i = find_next_bit(lock_chains_in_use, ARRAY_SIZE(lock_chains), i + 1);
+ return i < ARRAY_SIZE(lock_chains) ? i : -2;
+}
+
+unsigned long lock_chain_count(void)
+{
+ return bitmap_weight(lock_chains_in_use, ARRAY_SIZE(lock_chains));
+}
+
+/* Must be called with the graph lock held. */
+static struct lock_chain *alloc_lock_chain(void)
+{
+ int idx = find_first_zero_bit(lock_chains_in_use,
+ ARRAY_SIZE(lock_chains));
+
+ if (unlikely(idx >= ARRAY_SIZE(lock_chains)))
+ return NULL;
+ __set_bit(idx, lock_chains_in_use);
+ return lock_chains + idx;
+}
+
+/*
+ * Adds a dependency chain into chain hashtable. And must be called with
+ * graph_lock held.
+ *
+ * Return 0 if fail, and graph_lock is released.
+ * Return 1 if succeed, with graph_lock held.
+ */
+static inline int add_chain_cache(struct task_struct *curr,
+ struct held_lock *hlock,
+ u64 chain_key)
+{
+ struct hlist_head *hash_head = chainhashentry(chain_key);
+ struct lock_chain *chain;
+ int i, j;
+
+ /*
+ * The caller must hold the graph lock, ensure we've got IRQs
+ * disabled to make this an IRQ-safe lock.. for recursion reasons
+ * lockdep won't complain about its own locking errors.
+ */
+ if (lockdep_assert_locked())
+ return 0;
+
+ chain = alloc_lock_chain();
+ if (!chain) {
if (!debug_locks_off_graph_unlock())
return 0;
+ nbcon_cpu_emergency_enter();
print_lockdep_off("BUG: MAX_LOCKDEP_CHAINS too low!");
dump_stack();
+ nbcon_cpu_emergency_exit();
return 0;
}
- chain = lock_chains + nr_lock_chains++;
chain->chain_key = chain_key;
chain->irq_context = hlock->irq_context;
- /* Find the first held_lock of current chain */
- for (i = curr->lockdep_depth - 1; i >= 0; i--) {
- hlock_curr = curr->held_locks + i;
- if (hlock_curr->irq_context != hlock->irq_context)
- break;
- }
- i++;
+ i = get_first_held_lock(curr, hlock);
chain->depth = curr->lockdep_depth + 1 - i;
- if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
- chain->base = nr_chain_hlocks;
- nr_chain_hlocks += chain->depth;
- for (j = 0; j < chain->depth - 1; j++, i++) {
- int lock_id = curr->held_locks[i].class_idx - 1;
- chain_hlocks[chain->base + j] = lock_id;
- }
- chain_hlocks[chain->base + j] = class - lock_classes;
+
+ BUILD_BUG_ON((1UL << 24) <= ARRAY_SIZE(chain_hlocks));
+ BUILD_BUG_ON((1UL << 6) <= ARRAY_SIZE(curr->held_locks));
+ BUILD_BUG_ON((1UL << 8*sizeof(chain_hlocks[0])) <= ARRAY_SIZE(lock_classes));
+
+ j = alloc_chain_hlocks(chain->depth);
+ if (j < 0) {
+ if (!debug_locks_off_graph_unlock())
+ return 0;
+
+ nbcon_cpu_emergency_enter();
+ print_lockdep_off("BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!");
+ dump_stack();
+ nbcon_cpu_emergency_exit();
+ return 0;
+ }
+
+ chain->base = j;
+ for (j = 0; j < chain->depth - 1; j++, i++) {
+ int lock_id = hlock_id(curr->held_locks + i);
+
+ chain_hlocks[chain->base + j] = lock_id;
}
- list_add_tail_rcu(&chain->entry, hash_head);
+ chain_hlocks[chain->base + j] = hlock_id(hlock);
+ hlist_add_head_rcu(&chain->entry, hash_head);
debug_atomic_inc(chain_lookup_misses);
- inc_chains();
+ inc_chains(chain->irq_context);
+
+ return 1;
+}
+
+/*
+ * Look up a dependency chain. Must be called with either the graph lock or
+ * the RCU read lock held.
+ */
+static inline struct lock_chain *lookup_chain_cache(u64 chain_key)
+{
+ struct hlist_head *hash_head = chainhashentry(chain_key);
+ struct lock_chain *chain;
+
+ hlist_for_each_entry_rcu(chain, hash_head, entry) {
+ if (READ_ONCE(chain->chain_key) == chain_key) {
+ debug_atomic_inc(chain_lookup_hits);
+ return chain;
+ }
+ }
+ return NULL;
+}
+
+/*
+ * If the key is not present yet in dependency chain cache then
+ * add it and return 1 - in this case the new dependency chain is
+ * validated. If the key is already hashed, return 0.
+ * (On return with 1 graph_lock is held.)
+ */
+static inline int lookup_chain_cache_add(struct task_struct *curr,
+ struct held_lock *hlock,
+ u64 chain_key)
+{
+ struct lock_class *class = hlock_class(hlock);
+ struct lock_chain *chain = lookup_chain_cache(chain_key);
+
+ if (chain) {
+cache_hit:
+ if (!check_no_collision(curr, hlock, chain))
+ return 0;
+
+ if (very_verbose(class)) {
+ printk("\nhash chain already cached, key: "
+ "%016Lx tail class: [%px] %s\n",
+ (unsigned long long)chain_key,
+ class->key, class->name);
+ }
+
+ return 0;
+ }
+
+ if (very_verbose(class)) {
+ printk("\nnew hash chain, key: %016Lx tail class: [%px] %s\n",
+ (unsigned long long)chain_key, class->key, class->name);
+ }
+
+ if (!graph_lock())
+ return 0;
+
+ /*
+ * We have to walk the chain again locked - to avoid duplicates:
+ */
+ chain = lookup_chain_cache(chain_key);
+ if (chain) {
+ graph_unlock();
+ goto cache_hit;
+ }
+
+ if (!add_chain_cache(curr, hlock, chain_key))
+ return 0;
return 1;
}
-static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
- struct held_lock *hlock, int chain_head, u64 chain_key)
+static int validate_chain(struct task_struct *curr,
+ struct held_lock *hlock,
+ int chain_head, u64 chain_key)
{
/*
* Trylock needs to maintain the stack of held locks, but it
@@ -2108,11 +3869,11 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
*
* We look up the chain_key and do the O(N^2) check and update of
* the dependencies only if this is a new dependency chain.
- * (If lookup_chain_cache() returns with 1 it acquires
+ * (If lookup_chain_cache_add() return with 1 it acquires
* graph_lock for us)
*/
if (!hlock->trylock && hlock->check &&
- lookup_chain_cache(curr, hlock, chain_key)) {
+ lookup_chain_cache_add(curr, hlock, chain_key)) {
/*
* Check whether last held lock:
*
@@ -2120,45 +3881,53 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
* - is softirq-safe, if this lock is hardirq-unsafe
*
* And check whether the new lock's dependency graph
- * could lead back to the previous lock.
+ * could lead back to the previous lock:
*
- * any of these scenarios could lead to a deadlock. If
- * All validations
+ * - within the current held-lock stack
+ * - across our accumulated lock dependency records
+ *
+ * any of these scenarios could lead to a deadlock.
+ */
+ /*
+ * The simple case: does the current hold the same lock
+ * already?
*/
- int ret = check_deadlock(curr, hlock, lock, hlock->read);
+ int ret = check_deadlock(curr, hlock);
if (!ret)
return 0;
/*
- * Mark recursive read, as we jump over it when
- * building dependencies (just like we jump over
- * trylock entries):
- */
- if (ret == 2)
- hlock->read = 2;
- /*
* Add dependency only if this lock is not the head
- * of the chain, and if it's not a secondary read-lock:
+ * of the chain, and if the new lock introduces no more
+ * lock dependency (because we already hold a lock with the
+ * same lock class) nor deadlock (because the nest_lock
+ * serializes nesting locks), see the comments for
+ * check_deadlock().
*/
- if (!chain_head && ret != 2)
+ if (!chain_head && ret != 2) {
if (!check_prevs_add(curr, hlock))
return 0;
+ }
+
graph_unlock();
- } else
- /* after lookup_chain_cache(): */
+ } else {
+ /* after lookup_chain_cache_add(): */
if (unlikely(!debug_locks))
return 0;
+ }
return 1;
}
#else
static inline int validate_chain(struct task_struct *curr,
- struct lockdep_map *lock, struct held_lock *hlock,
- int chain_head, u64 chain_key)
+ struct held_lock *hlock,
+ int chain_head, u64 chain_key)
{
return 1;
}
-#endif
+
+static void init_chain_block_buckets(void) { }
+#endif /* CONFIG_PROVE_LOCKING */
/*
* We are building curr_chain_key incrementally, so double-check
@@ -2168,8 +3937,8 @@ static void check_chain_key(struct task_struct *curr)
{
#ifdef CONFIG_DEBUG_LOCKDEP
struct held_lock *hlock, *prev_hlock = NULL;
- unsigned int i, id;
- u64 chain_key = 0;
+ unsigned int i;
+ u64 chain_key = INITIAL_CHAIN_KEY;
for (i = 0; i < curr->lockdep_depth; i++) {
hlock = curr->held_locks + i;
@@ -2185,17 +3954,18 @@ static void check_chain_key(struct task_struct *curr)
(unsigned long long)hlock->prev_chain_key);
return;
}
- id = hlock->class_idx - 1;
+
/*
- * Whoops ran out of static storage again?
+ * hlock->class_idx can't go beyond MAX_LOCKDEP_KEYS, but is
+ * it registered lock class index?
*/
- if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
+ if (DEBUG_LOCKS_WARN_ON(!test_bit(hlock->class_idx, lock_classes_in_use)))
return;
if (prev_hlock && (prev_hlock->irq_context !=
hlock->irq_context))
- chain_key = 0;
- chain_key = iterate_chain_key(chain_key, id);
+ chain_key = INITIAL_CHAIN_KEY;
+ chain_key = iterate_chain_key(chain_key, hlock_id(hlock));
prev_hlock = hlock;
}
if (chain_key != curr->curr_chain_key) {
@@ -2212,8 +3982,11 @@ static void check_chain_key(struct task_struct *curr)
#endif
}
-static void
-print_usage_bug_scenario(struct held_lock *lock)
+#ifdef CONFIG_PROVE_LOCKING
+static int mark_lock(struct task_struct *curr, struct held_lock *this,
+ enum lock_usage_bit new_bit);
+
+static void print_usage_bug_scenario(struct held_lock *lock)
{
struct lock_class *class = hlock_class(lock);
@@ -2221,52 +3994,54 @@ print_usage_bug_scenario(struct held_lock *lock)
printk(" CPU0\n");
printk(" ----\n");
printk(" lock(");
- __print_lock_name(class);
- printk(");\n");
+ __print_lock_name(lock, class);
+ printk(KERN_CONT ");\n");
printk(" <Interrupt>\n");
printk(" lock(");
- __print_lock_name(class);
- printk(");\n");
+ __print_lock_name(lock, class);
+ printk(KERN_CONT ");\n");
printk("\n *** DEADLOCK ***\n\n");
}
-static int
+static void
print_usage_bug(struct task_struct *curr, struct held_lock *this,
enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit)
{
- if (!debug_locks_off_graph_unlock() || debug_locks_silent)
- return 0;
+ if (!debug_locks_off() || debug_locks_silent)
+ return;
+
+ nbcon_cpu_emergency_enter();
- printk("\n");
- printk("=================================\n");
- printk("[ INFO: inconsistent lock state ]\n");
+ pr_warn("\n");
+ pr_warn("================================\n");
+ pr_warn("WARNING: inconsistent lock state\n");
print_kernel_ident();
- printk("---------------------------------\n");
+ pr_warn("--------------------------------\n");
- printk("inconsistent {%s} -> {%s} usage.\n",
+ pr_warn("inconsistent {%s} -> {%s} usage.\n",
usage_str[prev_bit], usage_str[new_bit]);
- printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n",
+ pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n",
curr->comm, task_pid_nr(curr),
- trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT,
- trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT,
- trace_hardirqs_enabled(curr),
- trace_softirqs_enabled(curr));
+ lockdep_hardirq_context(), hardirq_count() >> HARDIRQ_SHIFT,
+ lockdep_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT,
+ lockdep_hardirqs_enabled(),
+ lockdep_softirqs_enabled(curr));
print_lock(this);
- printk("{%s} state was registered at:\n", usage_str[prev_bit]);
- print_stack_trace(hlock_class(this)->usage_traces + prev_bit, 1);
+ pr_warn("{%s} state was registered at:\n", usage_str[prev_bit]);
+ print_lock_trace(hlock_class(this)->usage_traces[prev_bit], 1);
print_irqtrace_events(curr);
- printk("\nother info that might help us debug this:\n");
+ pr_warn("\nother info that might help us debug this:\n");
print_usage_bug_scenario(this);
lockdep_print_held_locks(curr);
- printk("\nstack backtrace:\n");
+ pr_warn("\nstack backtrace:\n");
dump_stack();
- return 0;
+ nbcon_cpu_emergency_exit();
}
/*
@@ -2276,20 +4051,19 @@ static inline int
valid_state(struct task_struct *curr, struct held_lock *this,
enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit)
{
- if (unlikely(hlock_class(this)->usage_mask & (1 << bad_bit)))
- return print_usage_bug(curr, this, bad_bit, new_bit);
+ if (unlikely(hlock_class(this)->usage_mask & (1 << bad_bit))) {
+ graph_unlock();
+ print_usage_bug(curr, this, bad_bit, new_bit);
+ return 0;
+ }
return 1;
}
-static int mark_lock(struct task_struct *curr, struct held_lock *this,
- enum lock_usage_bit new_bit);
-
-#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
/*
* print irq inversion bug:
*/
-static int
+static void
print_irq_inversion_bug(struct task_struct *curr,
struct lock_list *root, struct lock_list *other,
struct held_lock *this, int forwards,
@@ -2300,30 +4074,32 @@ print_irq_inversion_bug(struct task_struct *curr,
int depth;
if (!debug_locks_off_graph_unlock() || debug_locks_silent)
- return 0;
+ return;
- printk("\n");
- printk("=========================================================\n");
- printk("[ INFO: possible irq lock inversion dependency detected ]\n");
+ nbcon_cpu_emergency_enter();
+
+ pr_warn("\n");
+ pr_warn("========================================================\n");
+ pr_warn("WARNING: possible irq lock inversion dependency detected\n");
print_kernel_ident();
- printk("---------------------------------------------------------\n");
- printk("%s/%d just changed the state of lock:\n",
+ pr_warn("--------------------------------------------------------\n");
+ pr_warn("%s/%d just changed the state of lock:\n",
curr->comm, task_pid_nr(curr));
print_lock(this);
if (forwards)
- printk("but this lock took another, %s-unsafe lock in the past:\n", irqclass);
+ pr_warn("but this lock took another, %s-unsafe lock in the past:\n", irqclass);
else
- printk("but this lock was taken by another, %s-safe lock in the past:\n", irqclass);
- print_lock_name(other->class);
- printk("\n\nand interrupts could create inverse lock ordering between them.\n\n");
+ pr_warn("but this lock was taken by another, %s-safe lock in the past:\n", irqclass);
+ print_lock_name(NULL, other->class);
+ pr_warn("\n\nand interrupts could create inverse lock ordering between them.\n\n");
- printk("\nother info that might help us debug this:\n");
+ pr_warn("\nother info that might help us debug this:\n");
/* Find a middle lock (if one exists) */
depth = get_lock_depth(other);
do {
if (depth == 0 && (entry != root)) {
- printk("lockdep:%s bad path found in chain graph\n", __func__);
+ pr_warn("lockdep:%s bad path found in chain graph\n", __func__);
break;
}
middle = entry;
@@ -2339,15 +4115,16 @@ print_irq_inversion_bug(struct task_struct *curr,
lockdep_print_held_locks(curr);
- printk("\nthe shortest dependencies between 2nd lock and 1st lock:\n");
- if (!save_trace(&root->trace))
- return 0;
+ pr_warn("\nthe shortest dependencies between 2nd lock and 1st lock:\n");
+ root->trace = save_trace();
+ if (!root->trace)
+ goto out;
print_shortest_lock_dependencies(other, root);
- printk("\nstack backtrace:\n");
+ pr_warn("\nstack backtrace:\n");
dump_stack();
-
- return 0;
+out:
+ nbcon_cpu_emergency_exit();
}
/*
@@ -2356,22 +4133,33 @@ print_irq_inversion_bug(struct task_struct *curr,
*/
static int
check_usage_forwards(struct task_struct *curr, struct held_lock *this,
- enum lock_usage_bit bit, const char *irqclass)
+ enum lock_usage_bit bit)
{
- int ret;
+ enum bfs_result ret;
struct lock_list root;
- struct lock_list *uninitialized_var(target_entry);
+ struct lock_list *target_entry;
+ enum lock_usage_bit read_bit = bit + LOCK_USAGE_READ_MASK;
+ unsigned usage_mask = lock_flag(bit) | lock_flag(read_bit);
+
+ bfs_init_root(&root, this);
+ ret = find_usage_forwards(&root, usage_mask, &target_entry);
+ if (bfs_error(ret)) {
+ print_bfs_bug(ret);
+ return 0;
+ }
+ if (ret == BFS_RNOMATCH)
+ return 1;
- root.parent = NULL;
- root.class = hlock_class(this);
- ret = find_usage_forwards(&root, bit, &target_entry);
- if (ret < 0)
- return print_bfs_bug(ret);
- if (ret == 1)
- return ret;
+ /* Check whether write or read usage is the match */
+ if (target_entry->class->usage_mask & lock_flag(bit)) {
+ print_irq_inversion_bug(curr, &root, target_entry,
+ this, 1, state_name(bit));
+ } else {
+ print_irq_inversion_bug(curr, &root, target_entry,
+ this, 1, state_name(read_bit));
+ }
- return print_irq_inversion_bug(curr, &root, target_entry,
- this, 1, irqclass);
+ return 0;
}
/*
@@ -2380,35 +4168,56 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this,
*/
static int
check_usage_backwards(struct task_struct *curr, struct held_lock *this,
- enum lock_usage_bit bit, const char *irqclass)
+ enum lock_usage_bit bit)
{
- int ret;
+ enum bfs_result ret;
struct lock_list root;
- struct lock_list *uninitialized_var(target_entry);
+ struct lock_list *target_entry;
+ enum lock_usage_bit read_bit = bit + LOCK_USAGE_READ_MASK;
+ unsigned usage_mask = lock_flag(bit) | lock_flag(read_bit);
+
+ bfs_init_rootb(&root, this);
+ ret = find_usage_backwards(&root, usage_mask, &target_entry);
+ if (bfs_error(ret)) {
+ print_bfs_bug(ret);
+ return 0;
+ }
+ if (ret == BFS_RNOMATCH)
+ return 1;
- root.parent = NULL;
- root.class = hlock_class(this);
- ret = find_usage_backwards(&root, bit, &target_entry);
- if (ret < 0)
- return print_bfs_bug(ret);
- if (ret == 1)
- return ret;
+ /* Check whether write or read usage is the match */
+ if (target_entry->class->usage_mask & lock_flag(bit)) {
+ print_irq_inversion_bug(curr, &root, target_entry,
+ this, 0, state_name(bit));
+ } else {
+ print_irq_inversion_bug(curr, &root, target_entry,
+ this, 0, state_name(read_bit));
+ }
- return print_irq_inversion_bug(curr, &root, target_entry,
- this, 0, irqclass);
+ return 0;
}
void print_irqtrace_events(struct task_struct *curr)
{
- printk("irq event stamp: %u\n", curr->irq_events);
- printk("hardirqs last enabled at (%u): ", curr->hardirq_enable_event);
- print_ip_sym(curr->hardirq_enable_ip);
- printk("hardirqs last disabled at (%u): ", curr->hardirq_disable_event);
- print_ip_sym(curr->hardirq_disable_ip);
- printk("softirqs last enabled at (%u): ", curr->softirq_enable_event);
- print_ip_sym(curr->softirq_enable_ip);
- printk("softirqs last disabled at (%u): ", curr->softirq_disable_event);
- print_ip_sym(curr->softirq_disable_ip);
+ const struct irqtrace_events *trace = &curr->irqtrace;
+
+ nbcon_cpu_emergency_enter();
+
+ printk("irq event stamp: %u\n", trace->irq_events);
+ printk("hardirqs last enabled at (%u): [<%px>] %pS\n",
+ trace->hardirq_enable_event, (void *)trace->hardirq_enable_ip,
+ (void *)trace->hardirq_enable_ip);
+ printk("hardirqs last disabled at (%u): [<%px>] %pS\n",
+ trace->hardirq_disable_event, (void *)trace->hardirq_disable_ip,
+ (void *)trace->hardirq_disable_ip);
+ printk("softirqs last enabled at (%u): [<%px>] %pS\n",
+ trace->softirq_enable_event, (void *)trace->softirq_enable_ip,
+ (void *)trace->softirq_enable_ip);
+ printk("softirqs last disabled at (%u): [<%px>] %pS\n",
+ trace->softirq_disable_event, (void *)trace->softirq_disable_ip,
+ (void *)trace->softirq_disable_ip);
+
+ nbcon_cpu_emergency_exit();
}
static int HARDIRQ_verbose(struct lock_class *class)
@@ -2427,16 +4236,6 @@ static int SOFTIRQ_verbose(struct lock_class *class)
return 0;
}
-static int RECLAIM_FS_verbose(struct lock_class *class)
-{
-#if RECLAIM_VERBOSE
- return class_filter(class);
-#endif
- return 0;
-}
-
-#define STRICT_READ_CHECKS 1
-
static int (*state_verbose_f[])(struct lock_class *class) = {
#define LOCKDEP_STATE(__STATE) \
__STATE##_verbose,
@@ -2447,7 +4246,7 @@ static int (*state_verbose_f[])(struct lock_class *class) = {
static inline int state_verbose(enum lock_usage_bit bit,
struct lock_class *class)
{
- return state_verbose_f[bit >> 2](class);
+ return state_verbose_f[bit >> LOCK_USAGE_DIR_MASK](class);
}
typedef int (*check_usage_f)(struct task_struct *, struct held_lock *,
@@ -2458,18 +4257,8 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this,
enum lock_usage_bit new_bit)
{
int excl_bit = exclusive_bit(new_bit);
- int read = new_bit & 1;
- int dir = new_bit & 2;
-
- /*
- * mark USED_IN has to look forwards -- to ensure no dependency
- * has ENABLED state, which would allow recursion deadlocks.
- *
- * mark ENABLED has to look backwards -- to ensure no dependee
- * has USED_IN state, which, again, would allow recursion deadlocks.
- */
- check_usage_f usage = dir ?
- check_usage_backwards : check_usage_forwards;
+ int read = new_bit & LOCK_USAGE_READ_MASK;
+ int dir = new_bit & LOCK_USAGE_DIR_MASK;
/*
* Validate that this particular lock does not have conflicting
@@ -2479,23 +4268,30 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this,
return 0;
/*
- * Validate that the lock dependencies don't have conflicting usage
- * states.
+ * Check for read in write conflicts
*/
- if ((!read || !dir || STRICT_READ_CHECKS) &&
- !usage(curr, this, excl_bit, state_name(new_bit & ~1)))
+ if (!read && !valid_state(curr, this, new_bit,
+ excl_bit + LOCK_USAGE_READ_MASK))
return 0;
+
/*
- * Check for read in write conflicts
+ * Validate that the lock dependencies don't have conflicting usage
+ * states.
*/
- if (!read) {
- if (!valid_state(curr, this, new_bit, excl_bit + 1))
+ if (dir) {
+ /*
+ * mark ENABLED has to look backwards -- to ensure no dependee
+ * has USED_IN state, which, again, would allow recursion deadlocks.
+ */
+ if (!check_usage_backwards(curr, this, excl_bit))
return 0;
-
- if (STRICT_READ_CHECKS &&
- !usage(curr, this, excl_bit + 1,
- state_name(new_bit + 1)))
+ } else {
+ /*
+ * mark USED_IN has to look forwards -- to ensure no dependency
+ * has ENABLED state, which would allow recursion deadlocks.
+ */
+ if (!check_usage_forwards(curr, this, excl_bit))
return 0;
}
@@ -2505,35 +4301,28 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this,
return 1;
}
-enum mark_type {
-#define LOCKDEP_STATE(__STATE) __STATE,
-#include "lockdep_states.h"
-#undef LOCKDEP_STATE
-};
-
/*
* Mark all held locks with a usage bit:
*/
static int
-mark_held_locks(struct task_struct *curr, enum mark_type mark)
+mark_held_locks(struct task_struct *curr, enum lock_usage_bit base_bit)
{
- enum lock_usage_bit usage_bit;
struct held_lock *hlock;
int i;
for (i = 0; i < curr->lockdep_depth; i++) {
+ enum lock_usage_bit hlock_bit = base_bit;
hlock = curr->held_locks + i;
- usage_bit = 2 + (mark << 2); /* ENABLED */
if (hlock->read)
- usage_bit += 1; /* READ */
+ hlock_bit += LOCK_USAGE_READ_MASK;
- BUG_ON(usage_bit >= LOCK_USAGE_STATES);
+ BUG_ON(hlock_bit >= LOCK_USAGE_STATES);
if (!hlock->check)
continue;
- if (!mark_lock(curr, hlock, usage_bit))
+ if (!mark_lock(curr, hlock, hlock_bit))
return 0;
}
@@ -2543,18 +4332,15 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark)
/*
* Hardirqs will be enabled:
*/
-static void __trace_hardirqs_on_caller(unsigned long ip)
+static void __trace_hardirqs_on_caller(void)
{
struct task_struct *curr = current;
- /* we'll do an OFF -> ON transition: */
- curr->hardirqs_enabled = 1;
-
/*
* We are going to turn hardirqs on, so set the
* usage bit for all held locks:
*/
- if (!mark_held_locks(curr, HARDIRQ))
+ if (!mark_held_locks(curr, LOCK_ENABLED_HARDIRQ))
return;
/*
* If we have softirqs enabled, then set the usage
@@ -2562,22 +4348,32 @@ static void __trace_hardirqs_on_caller(unsigned long ip)
* this bit from being set before)
*/
if (curr->softirqs_enabled)
- if (!mark_held_locks(curr, SOFTIRQ))
- return;
-
- curr->hardirq_enable_ip = ip;
- curr->hardirq_enable_event = ++curr->irq_events;
- debug_atomic_inc(hardirqs_on_events);
+ mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ);
}
-__visible void trace_hardirqs_on_caller(unsigned long ip)
+/**
+ * lockdep_hardirqs_on_prepare - Prepare for enabling interrupts
+ *
+ * Invoked before a possible transition to RCU idle from exit to user or
+ * guest mode. This ensures that all RCU operations are done before RCU
+ * stops watching. After the RCU transition lockdep_hardirqs_on() has to be
+ * invoked to set the final state.
+ */
+void lockdep_hardirqs_on_prepare(void)
{
- time_hardirqs_on(CALLER_ADDR0, ip);
+ if (unlikely(!debug_locks))
+ return;
+
+ /*
+ * NMIs do not (and cannot) track lock dependencies, nothing to do.
+ */
+ if (unlikely(in_nmi()))
+ return;
- if (unlikely(!debug_locks || current->lockdep_recursion))
+ if (unlikely(this_cpu_read(lockdep_recursion)))
return;
- if (unlikely(current->hardirqs_enabled)) {
+ if (unlikely(lockdep_hardirqs_enabled())) {
/*
* Neither irq nor preemption are disabled here
* so this is racy by nature but losing one hit
@@ -2598,38 +4394,105 @@ __visible void trace_hardirqs_on_caller(unsigned long ip)
/*
* See the fine text that goes along with this variable definition.
*/
- if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled)))
+ if (DEBUG_LOCKS_WARN_ON(early_boot_irqs_disabled))
return;
/*
* Can't allow enabling interrupts while in an interrupt handler,
* that's general bad form and such. Recursion, limited stack etc..
*/
- if (DEBUG_LOCKS_WARN_ON(current->hardirq_context))
+ if (DEBUG_LOCKS_WARN_ON(lockdep_hardirq_context()))
return;
- current->lockdep_recursion = 1;
- __trace_hardirqs_on_caller(ip);
- current->lockdep_recursion = 0;
+ current->hardirq_chain_key = current->curr_chain_key;
+
+ lockdep_recursion_inc();
+ __trace_hardirqs_on_caller();
+ lockdep_recursion_finish();
}
-EXPORT_SYMBOL(trace_hardirqs_on_caller);
+EXPORT_SYMBOL_GPL(lockdep_hardirqs_on_prepare);
-void trace_hardirqs_on(void)
+void noinstr lockdep_hardirqs_on(unsigned long ip)
{
- trace_hardirqs_on_caller(CALLER_ADDR0);
+ struct irqtrace_events *trace = &current->irqtrace;
+
+ if (unlikely(!debug_locks))
+ return;
+
+ /*
+ * NMIs can happen in the middle of local_irq_{en,dis}able() where the
+ * tracking state and hardware state are out of sync.
+ *
+ * NMIs must save lockdep_hardirqs_enabled() to restore IRQ state from,
+ * and not rely on hardware state like normal interrupts.
+ */
+ if (unlikely(in_nmi())) {
+ if (!IS_ENABLED(CONFIG_TRACE_IRQFLAGS_NMI))
+ return;
+
+ /*
+ * Skip:
+ * - recursion check, because NMI can hit lockdep;
+ * - hardware state check, because above;
+ * - chain_key check, see lockdep_hardirqs_on_prepare().
+ */
+ goto skip_checks;
+ }
+
+ if (unlikely(this_cpu_read(lockdep_recursion)))
+ return;
+
+ if (lockdep_hardirqs_enabled()) {
+ /*
+ * Neither irq nor preemption are disabled here
+ * so this is racy by nature but losing one hit
+ * in a stat is not a big deal.
+ */
+ __debug_atomic_inc(redundant_hardirqs_on);
+ return;
+ }
+
+ /*
+ * We're enabling irqs and according to our state above irqs weren't
+ * already enabled, yet we find the hardware thinks they are in fact
+ * enabled.. someone messed up their IRQ state tracing.
+ */
+ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+ return;
+
+ /*
+ * Ensure the lock stack remained unchanged between
+ * lockdep_hardirqs_on_prepare() and lockdep_hardirqs_on().
+ */
+ DEBUG_LOCKS_WARN_ON(current->hardirq_chain_key !=
+ current->curr_chain_key);
+
+skip_checks:
+ /* we'll do an OFF -> ON transition: */
+ __this_cpu_write(hardirqs_enabled, 1);
+ trace->hardirq_enable_ip = ip;
+ trace->hardirq_enable_event = ++trace->irq_events;
+ debug_atomic_inc(hardirqs_on_events);
}
-EXPORT_SYMBOL(trace_hardirqs_on);
+EXPORT_SYMBOL_GPL(lockdep_hardirqs_on);
/*
* Hardirqs were disabled:
*/
-__visible void trace_hardirqs_off_caller(unsigned long ip)
+void noinstr lockdep_hardirqs_off(unsigned long ip)
{
- struct task_struct *curr = current;
-
- time_hardirqs_off(CALLER_ADDR0, ip);
+ if (unlikely(!debug_locks))
+ return;
- if (unlikely(!debug_locks || current->lockdep_recursion))
+ /*
+ * Matching lockdep_hardirqs_on(), allow NMIs in the middle of lockdep;
+ * they will restore the software state. This ensures the software
+ * state is consistent inside NMIs as well.
+ */
+ if (in_nmi()) {
+ if (!IS_ENABLED(CONFIG_TRACE_IRQFLAGS_NMI))
+ return;
+ } else if (__this_cpu_read(lockdep_recursion))
return;
/*
@@ -2639,33 +4502,30 @@ __visible void trace_hardirqs_off_caller(unsigned long ip)
if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
return;
- if (curr->hardirqs_enabled) {
+ if (lockdep_hardirqs_enabled()) {
+ struct irqtrace_events *trace = &current->irqtrace;
+
/*
* We have done an ON -> OFF transition:
*/
- curr->hardirqs_enabled = 0;
- curr->hardirq_disable_ip = ip;
- curr->hardirq_disable_event = ++curr->irq_events;
+ __this_cpu_write(hardirqs_enabled, 0);
+ trace->hardirq_disable_ip = ip;
+ trace->hardirq_disable_event = ++trace->irq_events;
debug_atomic_inc(hardirqs_off_events);
- } else
+ } else {
debug_atomic_inc(redundant_hardirqs_off);
+ }
}
-EXPORT_SYMBOL(trace_hardirqs_off_caller);
-
-void trace_hardirqs_off(void)
-{
- trace_hardirqs_off_caller(CALLER_ADDR0);
-}
-EXPORT_SYMBOL(trace_hardirqs_off);
+EXPORT_SYMBOL_GPL(lockdep_hardirqs_off);
/*
* Softirqs will be enabled:
*/
-void trace_softirqs_on(unsigned long ip)
+void lockdep_softirqs_on(unsigned long ip)
{
- struct task_struct *curr = current;
+ struct irqtrace_events *trace = &current->irqtrace;
- if (unlikely(!debug_locks || current->lockdep_recursion))
+ if (unlikely(!lockdep_enabled()))
return;
/*
@@ -2675,37 +4535,35 @@ void trace_softirqs_on(unsigned long ip)
if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
return;
- if (curr->softirqs_enabled) {
+ if (current->softirqs_enabled) {
debug_atomic_inc(redundant_softirqs_on);
return;
}
- current->lockdep_recursion = 1;
+ lockdep_recursion_inc();
/*
* We'll do an OFF -> ON transition:
*/
- curr->softirqs_enabled = 1;
- curr->softirq_enable_ip = ip;
- curr->softirq_enable_event = ++curr->irq_events;
+ current->softirqs_enabled = 1;
+ trace->softirq_enable_ip = ip;
+ trace->softirq_enable_event = ++trace->irq_events;
debug_atomic_inc(softirqs_on_events);
/*
* We are going to turn softirqs on, so set the
* usage bit for all held locks, if hardirqs are
* enabled too:
*/
- if (curr->hardirqs_enabled)
- mark_held_locks(curr, SOFTIRQ);
- current->lockdep_recursion = 0;
+ if (lockdep_hardirqs_enabled())
+ mark_held_locks(current, LOCK_ENABLED_SOFTIRQ);
+ lockdep_recursion_finish();
}
/*
* Softirqs were disabled:
*/
-void trace_softirqs_off(unsigned long ip)
+void lockdep_softirqs_off(unsigned long ip)
{
- struct task_struct *curr = current;
-
- if (unlikely(!debug_locks || current->lockdep_recursion))
+ if (unlikely(!lockdep_enabled()))
return;
/*
@@ -2714,13 +4572,15 @@ void trace_softirqs_off(unsigned long ip)
if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
return;
- if (curr->softirqs_enabled) {
+ if (current->softirqs_enabled) {
+ struct irqtrace_events *trace = &current->irqtrace;
+
/*
* We have done an ON -> OFF transition:
*/
- curr->softirqs_enabled = 0;
- curr->softirq_disable_ip = ip;
- curr->softirq_disable_event = ++curr->irq_events;
+ current->softirqs_enabled = 0;
+ trace->softirq_disable_ip = ip;
+ trace->softirq_disable_event = ++trace->irq_events;
debug_atomic_inc(softirqs_off_events);
/*
* Whoops, we wanted softirqs off, so why aren't they?
@@ -2730,60 +4590,43 @@ void trace_softirqs_off(unsigned long ip)
debug_atomic_inc(redundant_softirqs_off);
}
-static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
+/**
+ * lockdep_cleanup_dead_cpu - Ensure CPU lockdep state is cleanly stopped
+ *
+ * @cpu: index of offlined CPU
+ * @idle: task pointer for offlined CPU's idle thread
+ *
+ * Invoked after the CPU is dead. Ensures that the tracing infrastructure
+ * is left in a suitable state for the CPU to be subsequently brought
+ * online again.
+ */
+void lockdep_cleanup_dead_cpu(unsigned int cpu, struct task_struct *idle)
{
- struct task_struct *curr = current;
-
if (unlikely(!debug_locks))
return;
- /* no reclaim without waiting on it */
- if (!(gfp_mask & __GFP_WAIT))
- return;
-
- /* this guy won't enter reclaim */
- if ((curr->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC))
- return;
-
- /* We're only interested __GFP_FS allocations for now */
- if (!(gfp_mask & __GFP_FS))
- return;
-
- /*
- * Oi! Can't be having __GFP_FS allocations with IRQs disabled.
- */
- if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags)))
- return;
-
- mark_held_locks(curr, RECLAIM_FS);
+ if (unlikely(per_cpu(hardirqs_enabled, cpu))) {
+ pr_warn("CPU %u left hardirqs enabled!", cpu);
+ if (idle)
+ print_irqtrace_events(idle);
+ /* Clean it up for when the CPU comes online again. */
+ per_cpu(hardirqs_enabled, cpu) = 0;
+ }
}
-static void check_flags(unsigned long flags);
-
-void lockdep_trace_alloc(gfp_t gfp_mask)
+static int
+mark_usage(struct task_struct *curr, struct held_lock *hlock, int check)
{
- unsigned long flags;
-
- if (unlikely(current->lockdep_recursion))
- return;
+ if (!check)
+ goto lock_used;
- raw_local_irq_save(flags);
- check_flags(flags);
- current->lockdep_recursion = 1;
- __lockdep_trace_alloc(gfp_mask, flags);
- current->lockdep_recursion = 0;
- raw_local_irq_restore(flags);
-}
-
-static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
-{
/*
* If non-trylock use in a hardirq or softirq context, then
* mark the lock as used in these contexts:
*/
if (!hlock->trylock) {
if (hlock->read) {
- if (curr->hardirq_context)
+ if (lockdep_hardirq_context())
if (!mark_lock(curr, hlock,
LOCK_USED_IN_HARDIRQ_READ))
return 0;
@@ -2792,7 +4635,7 @@ static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
LOCK_USED_IN_SOFTIRQ_READ))
return 0;
} else {
- if (curr->hardirq_context)
+ if (lockdep_hardirq_context())
if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ))
return 0;
if (curr->softirq_context)
@@ -2800,7 +4643,13 @@ static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
return 0;
}
}
- if (!hlock->hardirqs_off) {
+
+ /*
+ * For lock_sync(), don't mark the ENABLED usage, since lock_sync()
+ * creates no critical section and no extra dependency can be introduced
+ * by interrupts
+ */
+ if (!hlock->hardirqs_off && !hlock->sync) {
if (hlock->read) {
if (!mark_lock(curr, hlock,
LOCK_ENABLED_HARDIRQ_READ))
@@ -2820,25 +4669,20 @@ static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
}
}
- /*
- * We reuse the irq context infrastructure more broadly as a general
- * context checking code. This tests GFP_FS recursion (a lock taken
- * during reclaim for a GFP_FS allocation is held over a GFP_FS
- * allocation).
- */
- if (!hlock->trylock && (curr->lockdep_reclaim_gfp & __GFP_FS)) {
- if (hlock->read) {
- if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS_READ))
- return 0;
- } else {
- if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS))
- return 0;
- }
- }
+lock_used:
+ /* mark it as used: */
+ if (!mark_lock(curr, hlock, LOCK_USED))
+ return 0;
return 1;
}
+static inline unsigned int task_irq_context(struct task_struct *task)
+{
+ return LOCK_CHAIN_HARDIRQ_CONTEXT * !!lockdep_hardirq_context() +
+ LOCK_CHAIN_SOFTIRQ_CONTEXT * !!task->softirq_context;
+}
+
static int separate_irq_context(struct task_struct *curr,
struct held_lock *hlock)
{
@@ -2847,8 +4691,6 @@ static int separate_irq_context(struct task_struct *curr,
/*
* Keep track of points where we cross into an interrupt context:
*/
- hlock->irq_context = 2*(curr->hardirq_context ? 1 : 0) +
- curr->softirq_context;
if (depth) {
struct held_lock *prev_hlock;
@@ -2864,41 +4706,23 @@ static int separate_irq_context(struct task_struct *curr,
return 0;
}
-#else /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
-
-static inline
-int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
- enum lock_usage_bit new_bit)
-{
- WARN_ON(1); /* Impossible innit? when we don't have TRACE_IRQFLAG */
- return 1;
-}
-
-static inline int mark_irqflags(struct task_struct *curr,
- struct held_lock *hlock)
-{
- return 1;
-}
-
-static inline int separate_irq_context(struct task_struct *curr,
- struct held_lock *hlock)
-{
- return 0;
-}
-
-void lockdep_trace_alloc(gfp_t gfp_mask)
-{
-}
-
-#endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
-
/*
* Mark a lock with a usage bit, and validate the state transition:
*/
static int mark_lock(struct task_struct *curr, struct held_lock *this,
enum lock_usage_bit new_bit)
{
- unsigned int new_mask = 1 << new_bit, ret = 1;
+ unsigned int new_mask, ret = 1;
+
+ if (new_bit >= LOCK_USAGE_STATES) {
+ DEBUG_LOCKS_WARN_ON(1);
+ return 0;
+ }
+
+ if (new_bit == LOCK_USED && this->read)
+ new_bit = LOCK_USED_READ;
+
+ new_mask = 1 << new_bit;
/*
* If already set then do not dirty the cacheline,
@@ -2912,63 +4736,210 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
/*
* Make sure we didn't race:
*/
- if (unlikely(hlock_class(this)->usage_mask & new_mask)) {
- graph_unlock();
- return 1;
- }
+ if (unlikely(hlock_class(this)->usage_mask & new_mask))
+ goto unlock;
+
+ if (!hlock_class(this)->usage_mask)
+ debug_atomic_dec(nr_unused_locks);
hlock_class(this)->usage_mask |= new_mask;
- if (!save_trace(hlock_class(this)->usage_traces + new_bit))
- return 0;
+ if (new_bit < LOCK_TRACE_STATES) {
+ if (!(hlock_class(this)->usage_traces[new_bit] = save_trace()))
+ return 0;
+ }
- switch (new_bit) {
-#define LOCKDEP_STATE(__STATE) \
- case LOCK_USED_IN_##__STATE: \
- case LOCK_USED_IN_##__STATE##_READ: \
- case LOCK_ENABLED_##__STATE: \
- case LOCK_ENABLED_##__STATE##_READ:
-#include "lockdep_states.h"
-#undef LOCKDEP_STATE
+ if (new_bit < LOCK_USED) {
ret = mark_lock_irq(curr, this, new_bit);
if (!ret)
return 0;
- break;
- case LOCK_USED:
- debug_atomic_dec(nr_unused_locks);
- break;
- default:
- if (!debug_locks_off_graph_unlock())
- return 0;
- WARN_ON(1);
- return 0;
}
+unlock:
graph_unlock();
/*
* We must printk outside of the graph_lock:
*/
if (ret == 2) {
+ nbcon_cpu_emergency_enter();
printk("\nmarked lock as {%s}:\n", usage_str[new_bit]);
print_lock(this);
print_irqtrace_events(curr);
dump_stack();
+ nbcon_cpu_emergency_exit();
}
return ret;
}
+static inline short task_wait_context(struct task_struct *curr)
+{
+ /*
+ * Set appropriate wait type for the context; for IRQs we have to take
+ * into account force_irqthread as that is implied by PREEMPT_RT.
+ */
+ if (lockdep_hardirq_context()) {
+ /*
+ * Check if force_irqthreads will run us threaded.
+ */
+ if (curr->hardirq_threaded || curr->irq_config)
+ return LD_WAIT_CONFIG;
+
+ return LD_WAIT_SPIN;
+ } else if (curr->softirq_context) {
+ /*
+ * Softirqs are always threaded.
+ */
+ return LD_WAIT_CONFIG;
+ }
+
+ return LD_WAIT_MAX;
+}
+
+static int
+print_lock_invalid_wait_context(struct task_struct *curr,
+ struct held_lock *hlock)
+{
+ short curr_inner;
+
+ if (!debug_locks_off())
+ return 0;
+ if (debug_locks_silent)
+ return 0;
+
+ nbcon_cpu_emergency_enter();
+
+ pr_warn("\n");
+ pr_warn("=============================\n");
+ pr_warn("[ BUG: Invalid wait context ]\n");
+ print_kernel_ident();
+ pr_warn("-----------------------------\n");
+
+ pr_warn("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr));
+ print_lock(hlock);
+
+ pr_warn("other info that might help us debug this:\n");
+
+ curr_inner = task_wait_context(curr);
+ pr_warn("context-{%d:%d}\n", curr_inner, curr_inner);
+
+ lockdep_print_held_locks(curr);
+
+ pr_warn("stack backtrace:\n");
+ dump_stack();
+
+ nbcon_cpu_emergency_exit();
+
+ return 0;
+}
+
+/*
+ * Verify the wait_type context.
+ *
+ * This check validates we take locks in the right wait-type order; that is it
+ * ensures that we do not take mutexes inside spinlocks and do not attempt to
+ * acquire spinlocks inside raw_spinlocks and the sort.
+ *
+ * The entire thing is slightly more complex because of RCU, RCU is a lock that
+ * can be taken from (pretty much) any context but also has constraints.
+ * However when taken in a stricter environment the RCU lock does not loosen
+ * the constraints.
+ *
+ * Therefore we must look for the strictest environment in the lock stack and
+ * compare that to the lock we're trying to acquire.
+ */
+static int check_wait_context(struct task_struct *curr, struct held_lock *next)
+{
+ u8 next_inner = hlock_class(next)->wait_type_inner;
+ u8 next_outer = hlock_class(next)->wait_type_outer;
+ u8 curr_inner;
+ int depth;
+
+ if (!next_inner || next->trylock)
+ return 0;
+
+ if (!next_outer)
+ next_outer = next_inner;
+
+ /*
+ * Find start of current irq_context..
+ */
+ for (depth = curr->lockdep_depth - 1; depth >= 0; depth--) {
+ struct held_lock *prev = curr->held_locks + depth;
+ if (prev->irq_context != next->irq_context)
+ break;
+ }
+ depth++;
+
+ curr_inner = task_wait_context(curr);
+
+ for (; depth < curr->lockdep_depth; depth++) {
+ struct held_lock *prev = curr->held_locks + depth;
+ struct lock_class *class = hlock_class(prev);
+ u8 prev_inner = class->wait_type_inner;
+
+ if (prev_inner) {
+ /*
+ * We can have a bigger inner than a previous one
+ * when outer is smaller than inner, as with RCU.
+ *
+ * Also due to trylocks.
+ */
+ curr_inner = min(curr_inner, prev_inner);
+
+ /*
+ * Allow override for annotations -- this is typically
+ * only valid/needed for code that only exists when
+ * CONFIG_PREEMPT_RT=n.
+ */
+ if (unlikely(class->lock_type == LD_LOCK_WAIT_OVERRIDE))
+ curr_inner = prev_inner;
+ }
+ }
+
+ if (next_outer > curr_inner)
+ return print_lock_invalid_wait_context(curr, next);
+
+ return 0;
+}
+
+#else /* CONFIG_PROVE_LOCKING */
+
+static inline int
+mark_usage(struct task_struct *curr, struct held_lock *hlock, int check)
+{
+ return 1;
+}
+
+static inline unsigned int task_irq_context(struct task_struct *task)
+{
+ return 0;
+}
+
+static inline int separate_irq_context(struct task_struct *curr,
+ struct held_lock *hlock)
+{
+ return 0;
+}
+
+static inline int check_wait_context(struct task_struct *curr,
+ struct held_lock *next)
+{
+ return 0;
+}
+
+#endif /* CONFIG_PROVE_LOCKING */
+
/*
* Initialize a lock instance's lock-class mapping info:
*/
-void lockdep_init_map(struct lockdep_map *lock, const char *name,
- struct lock_class_key *key, int subclass)
+void lockdep_init_map_type(struct lockdep_map *lock, const char *name,
+ struct lock_class_key *key, int subclass,
+ u8 inner, u8 outer, u8 lock_type)
{
int i;
- kmemcheck_mark_initialized(lock, sizeof(*lock));
-
for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++)
lock->class_cache[i] = NULL;
@@ -2986,19 +4957,22 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
lock->name = name;
+ lock->wait_type_outer = outer;
+ lock->wait_type_inner = inner;
+ lock->lock_type = lock_type;
+
/*
* No key, no joy, we need to hash something.
*/
if (DEBUG_LOCKS_WARN_ON(!key))
return;
/*
- * Sanity check, the lock-class key must be persistent:
+ * Sanity check, the lock-class key must either have been allocated
+ * statically or must have been registered as a dynamic key.
*/
- if (!static_obj(key)) {
- printk("BUG: key %p not in .data!\n", key);
- /*
- * What it says above ^^^^^, I suggest you read it.
- */
+ if (!static_obj(key) && !is_dynamic_key(key)) {
+ if (debug_locks)
+ printk(KERN_ERR "BUG: key %px has not been registered!\n", key);
DEBUG_LOCKS_WARN_ON(1);
return;
}
@@ -3010,70 +4984,105 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
if (subclass) {
unsigned long flags;
- if (DEBUG_LOCKS_WARN_ON(current->lockdep_recursion))
+ if (DEBUG_LOCKS_WARN_ON(!lockdep_enabled()))
return;
raw_local_irq_save(flags);
- current->lockdep_recursion = 1;
+ lockdep_recursion_inc();
register_lock_class(lock, subclass, 1);
- current->lockdep_recursion = 0;
+ lockdep_recursion_finish();
raw_local_irq_restore(flags);
}
}
-EXPORT_SYMBOL_GPL(lockdep_init_map);
+EXPORT_SYMBOL_GPL(lockdep_init_map_type);
struct lock_class_key __lockdep_no_validate__;
EXPORT_SYMBOL_GPL(__lockdep_no_validate__);
-static int
+struct lock_class_key __lockdep_no_track__;
+EXPORT_SYMBOL_GPL(__lockdep_no_track__);
+
+#ifdef CONFIG_PROVE_LOCKING
+void lockdep_set_lock_cmp_fn(struct lockdep_map *lock, lock_cmp_fn cmp_fn,
+ lock_print_fn print_fn)
+{
+ struct lock_class *class = lock->class_cache[0];
+ unsigned long flags;
+
+ raw_local_irq_save(flags);
+ lockdep_recursion_inc();
+
+ if (!class)
+ class = register_lock_class(lock, 0, 0);
+
+ if (class) {
+ WARN_ON(class->cmp_fn && class->cmp_fn != cmp_fn);
+ WARN_ON(class->print_fn && class->print_fn != print_fn);
+
+ class->cmp_fn = cmp_fn;
+ class->print_fn = print_fn;
+ }
+
+ lockdep_recursion_finish();
+ raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lockdep_set_lock_cmp_fn);
+#endif
+
+static void
print_lock_nested_lock_not_held(struct task_struct *curr,
- struct held_lock *hlock,
- unsigned long ip)
+ struct held_lock *hlock)
{
if (!debug_locks_off())
- return 0;
+ return;
if (debug_locks_silent)
- return 0;
+ return;
+
+ nbcon_cpu_emergency_enter();
- printk("\n");
- printk("==================================\n");
- printk("[ BUG: Nested lock was not taken ]\n");
+ pr_warn("\n");
+ pr_warn("==================================\n");
+ pr_warn("WARNING: Nested lock was not taken\n");
print_kernel_ident();
- printk("----------------------------------\n");
+ pr_warn("----------------------------------\n");
- printk("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr));
+ pr_warn("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr));
print_lock(hlock);
- printk("\nbut this task is not holding:\n");
- printk("%s\n", hlock->nest_lock->name);
+ pr_warn("\nbut this task is not holding:\n");
+ pr_warn("%s\n", hlock->nest_lock->name);
- printk("\nstack backtrace:\n");
+ pr_warn("\nstack backtrace:\n");
dump_stack();
- printk("\nother info that might help us debug this:\n");
+ pr_warn("\nother info that might help us debug this:\n");
lockdep_print_held_locks(curr);
- printk("\nstack backtrace:\n");
+ pr_warn("\nstack backtrace:\n");
dump_stack();
- return 0;
+ nbcon_cpu_emergency_exit();
}
-static int __lock_is_held(struct lockdep_map *lock);
+static int __lock_is_held(const struct lockdep_map *lock, int read);
/*
* This gets called for every mutex_lock*()/spin_lock*() operation.
* We maintain the dependency maps and validate the locking attempt:
+ *
+ * The callers must make sure that IRQs are disabled before calling it,
+ * otherwise we could get an interrupt which would want to take locks,
+ * which would end up in lockdep again.
*/
static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
int trylock, int read, int check, int hardirqs_off,
struct lockdep_map *nest_lock, unsigned long ip,
- int references)
+ int references, int pin_count, int sync)
{
struct task_struct *curr = current;
struct lock_class *class = NULL;
struct held_lock *hlock;
- unsigned int depth, id;
+ unsigned int depth;
int chain_head = 0;
int class_idx;
u64 chain_key;
@@ -3081,16 +5090,18 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
if (unlikely(!debug_locks))
return 0;
- /*
- * Lockdep should run with IRQs disabled, otherwise we could
- * get an interrupt which would want to take locks, which would
- * end up in lockdep and have you got a head-ache already?
- */
- if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+ if (unlikely(lock->key == &__lockdep_no_track__))
return 0;
- if (!prove_locking || lock->key == &__lockdep_no_validate__)
+ lockevent_inc(lockdep_acquire);
+
+ if (!prove_locking || lock->key == &__lockdep_no_validate__) {
check = 0;
+ lockevent_inc(lockdep_nocheck);
+ }
+
+ if (DEBUG_LOCKS_WARN_ON(subclass >= MAX_LOCKDEP_SUBCLASSES))
+ return 0;
if (subclass < NR_LOCKDEP_CACHING_CLASSES)
class = lock->class_cache[subclass];
@@ -3102,13 +5113,17 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
if (!class)
return 0;
}
- atomic_inc((atomic_t *)&class->ops);
+
+ debug_class_ops_inc(class);
+
if (very_verbose(class)) {
- printk("\nacquire class [%p] %s", class->key, class->name);
+ nbcon_cpu_emergency_enter();
+ printk("\nacquire class [%px] %s", class->key, class->name);
if (class->name_version > 1)
- printk("#%d", class->name_version);
- printk("\n");
+ printk(KERN_CONT "#%d", class->name_version);
+ printk(KERN_CONT "\n");
dump_stack();
+ nbcon_cpu_emergency_exit();
}
/*
@@ -3123,17 +5138,25 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH))
return 0;
- class_idx = class - lock_classes + 1;
+ class_idx = class - lock_classes;
- if (depth) {
+ if (depth && !sync) {
+ /* we're holding locks and the new held lock is not a sync */
hlock = curr->held_locks + depth - 1;
if (hlock->class_idx == class_idx && nest_lock) {
- if (hlock->references)
+ if (!references)
+ references++;
+
+ if (!hlock->references)
hlock->references++;
- else
- hlock->references = 2;
- return 1;
+ hlock->references += references;
+
+ /* Overflow */
+ if (DEBUG_LOCKS_WARN_ON(hlock->references < references))
+ return 0;
+
+ return 2;
}
}
@@ -3148,21 +5171,24 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
hlock->acquire_ip = ip;
hlock->instance = lock;
hlock->nest_lock = nest_lock;
+ hlock->irq_context = task_irq_context(curr);
hlock->trylock = trylock;
hlock->read = read;
hlock->check = check;
+ hlock->sync = !!sync;
hlock->hardirqs_off = !!hardirqs_off;
hlock->references = references;
#ifdef CONFIG_LOCK_STAT
hlock->waittime_stamp = 0;
hlock->holdtime_stamp = lockstat_clock();
#endif
+ hlock->pin_count = pin_count;
- if (check && !mark_irqflags(curr, hlock))
+ if (check_wait_context(curr, hlock))
return 0;
- /* mark it as used: */
- if (!mark_lock(curr, hlock, LOCK_USED))
+ /* Initialize the lock usage bit */
+ if (!mark_usage(curr, hlock, check))
return 0;
/*
@@ -3175,11 +5201,10 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
* The 'key ID' is what is the most compact key value to drive
* the hash, not class->key.
*/
- id = class - lock_classes;
/*
- * Whoops, we did it again.. ran straight out of our static allocation.
+ * Whoops, we did it again.. class_idx is invalid.
*/
- if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
+ if (DEBUG_LOCKS_WARN_ON(!test_bit(class_idx, lock_classes_in_use)))
return 0;
chain_key = curr->curr_chain_key;
@@ -3187,24 +5212,35 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
/*
* How can we have a chain hash when we ain't got no keys?!
*/
- if (DEBUG_LOCKS_WARN_ON(chain_key != 0))
+ if (DEBUG_LOCKS_WARN_ON(chain_key != INITIAL_CHAIN_KEY))
return 0;
chain_head = 1;
}
hlock->prev_chain_key = chain_key;
if (separate_irq_context(curr, hlock)) {
- chain_key = 0;
+ chain_key = INITIAL_CHAIN_KEY;
chain_head = 1;
}
- chain_key = iterate_chain_key(chain_key, id);
+ chain_key = iterate_chain_key(chain_key, hlock_id(hlock));
+
+ if (nest_lock && !__lock_is_held(nest_lock, -1)) {
+ print_lock_nested_lock_not_held(curr, hlock);
+ return 0;
+ }
- if (nest_lock && !__lock_is_held(nest_lock))
- return print_lock_nested_lock_not_held(curr, hlock, ip);
+ if (!debug_locks_silent) {
+ WARN_ON_ONCE(depth && !hlock_class(hlock - 1)->key);
+ WARN_ON_ONCE(!hlock_class(hlock)->key);
+ }
- if (!validate_chain(curr, lock, hlock, chain_head, chain_key))
+ if (!validate_chain(curr, hlock, chain_head, chain_key))
return 0;
+ /* For lock_sync(), we are done here since no actual critical section */
+ if (hlock->sync)
+ return 1;
+
curr->curr_chain_key = chain_key;
curr->lockdep_depth++;
check_chain_key(curr);
@@ -3214,6 +5250,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
#endif
if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) {
debug_locks_off();
+ nbcon_cpu_emergency_enter();
print_lockdep_off("BUG: MAX_LOCK_DEPTH too low!");
printk(KERN_DEBUG "depth: %i max: %lu!\n",
curr->lockdep_depth, MAX_LOCK_DEPTH);
@@ -3221,6 +5258,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
lockdep_print_held_locks(current);
debug_show_all_locks();
dump_stack();
+ nbcon_cpu_emergency_exit();
return 0;
}
@@ -3231,62 +5269,45 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
return 1;
}
-static int
-print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
- unsigned long ip)
+static void print_unlock_imbalance_bug(struct task_struct *curr,
+ struct lockdep_map *lock,
+ unsigned long ip)
{
if (!debug_locks_off())
- return 0;
+ return;
if (debug_locks_silent)
- return 0;
+ return;
- printk("\n");
- printk("=====================================\n");
- printk("[ BUG: bad unlock balance detected! ]\n");
+ nbcon_cpu_emergency_enter();
+
+ pr_warn("\n");
+ pr_warn("=====================================\n");
+ pr_warn("WARNING: bad unlock balance detected!\n");
print_kernel_ident();
- printk("-------------------------------------\n");
- printk("%s/%d is trying to release lock (",
+ pr_warn("-------------------------------------\n");
+ pr_warn("%s/%d is trying to release lock (",
curr->comm, task_pid_nr(curr));
print_lockdep_cache(lock);
- printk(") at:\n");
- print_ip_sym(ip);
- printk("but there are no more locks to release!\n");
- printk("\nother info that might help us debug this:\n");
+ pr_cont(") at:\n");
+ print_ip_sym(KERN_WARNING, ip);
+ pr_warn("but there are no more locks to release!\n");
+ pr_warn("\nother info that might help us debug this:\n");
lockdep_print_held_locks(curr);
- printk("\nstack backtrace:\n");
+ pr_warn("\nstack backtrace:\n");
dump_stack();
- return 0;
-}
-
-/*
- * Common debugging checks for both nested and non-nested unlock:
- */
-static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
- unsigned long ip)
-{
- if (unlikely(!debug_locks))
- return 0;
- /*
- * Lockdep should run with IRQs disabled, recursion, head-ache, etc..
- */
- if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
- return 0;
-
- if (curr->lockdep_depth <= 0)
- return print_unlock_imbalance_bug(curr, lock, ip);
-
- return 1;
+ nbcon_cpu_emergency_exit();
}
-static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
+static noinstr int match_held_lock(const struct held_lock *hlock,
+ const struct lockdep_map *lock)
{
if (hlock->instance == lock)
return 1;
if (hlock->references) {
- struct lock_class *class = lock->class_cache[0];
+ const struct lock_class *class = lock->class_cache[0];
if (!class)
class = look_up_lock_class(lock, 0);
@@ -3308,24 +5329,95 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock))
return 0;
- if (hlock->class_idx == class - lock_classes + 1)
+ if (hlock->class_idx == class - lock_classes)
return 1;
}
return 0;
}
+/* @depth must not be zero */
+static struct held_lock *find_held_lock(struct task_struct *curr,
+ struct lockdep_map *lock,
+ unsigned int depth, int *idx)
+{
+ struct held_lock *ret, *hlock, *prev_hlock;
+ int i;
+
+ i = depth - 1;
+ hlock = curr->held_locks + i;
+ ret = hlock;
+ if (match_held_lock(hlock, lock))
+ goto out;
+
+ ret = NULL;
+ for (i--, prev_hlock = hlock--;
+ i >= 0;
+ i--, prev_hlock = hlock--) {
+ /*
+ * We must not cross into another context:
+ */
+ if (prev_hlock->irq_context != hlock->irq_context) {
+ ret = NULL;
+ break;
+ }
+ if (match_held_lock(hlock, lock)) {
+ ret = hlock;
+ break;
+ }
+ }
+
+out:
+ *idx = i;
+ return ret;
+}
+
+static int reacquire_held_locks(struct task_struct *curr, unsigned int depth,
+ int idx, unsigned int *merged)
+{
+ struct held_lock *hlock;
+ int first_idx = idx;
+
+ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+ return 0;
+
+ for (hlock = curr->held_locks + idx; idx < depth; idx++, hlock++) {
+ switch (__lock_acquire(hlock->instance,
+ hlock_class(hlock)->subclass,
+ hlock->trylock,
+ hlock->read, hlock->check,
+ hlock->hardirqs_off,
+ hlock->nest_lock, hlock->acquire_ip,
+ hlock->references, hlock->pin_count, 0)) {
+ case 0:
+ return 1;
+ case 1:
+ break;
+ case 2:
+ *merged += (idx == first_idx);
+ break;
+ default:
+ WARN_ON(1);
+ return 0;
+ }
+ }
+ return 0;
+}
+
static int
__lock_set_class(struct lockdep_map *lock, const char *name,
struct lock_class_key *key, unsigned int subclass,
unsigned long ip)
{
struct task_struct *curr = current;
- struct held_lock *hlock, *prev_hlock;
+ unsigned int depth, merged = 0;
+ struct held_lock *hlock;
struct lock_class *class;
- unsigned int depth;
int i;
+ if (unlikely(!debug_locks))
+ return 0;
+
depth = curr->lockdep_depth;
/*
* This function is about (re)setting the class of a held lock,
@@ -3334,91 +5426,123 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
if (DEBUG_LOCKS_WARN_ON(!depth))
return 0;
- prev_hlock = NULL;
- for (i = depth-1; i >= 0; i--) {
- hlock = curr->held_locks + i;
- /*
- * We must not cross into another context:
- */
- if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
- break;
- if (match_held_lock(hlock, lock))
- goto found_it;
- prev_hlock = hlock;
+ hlock = find_held_lock(curr, lock, depth, &i);
+ if (!hlock) {
+ print_unlock_imbalance_bug(curr, lock, ip);
+ return 0;
}
- return print_unlock_imbalance_bug(curr, lock, ip);
-found_it:
- lockdep_init_map(lock, name, key, 0);
+ lockdep_init_map_type(lock, name, key, 0,
+ lock->wait_type_inner,
+ lock->wait_type_outer,
+ lock->lock_type);
class = register_lock_class(lock, subclass, 0);
- hlock->class_idx = class - lock_classes + 1;
+ hlock->class_idx = class - lock_classes;
curr->lockdep_depth = i;
curr->curr_chain_key = hlock->prev_chain_key;
- for (; i < depth; i++) {
- hlock = curr->held_locks + i;
- if (!__lock_acquire(hlock->instance,
- hlock_class(hlock)->subclass, hlock->trylock,
- hlock->read, hlock->check, hlock->hardirqs_off,
- hlock->nest_lock, hlock->acquire_ip,
- hlock->references))
- return 0;
+ if (reacquire_held_locks(curr, depth, i, &merged))
+ return 0;
+
+ /*
+ * I took it apart and put it back together again, except now I have
+ * these 'spare' parts.. where shall I put them.
+ */
+ if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - merged))
+ return 0;
+ return 1;
+}
+
+static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip)
+{
+ struct task_struct *curr = current;
+ unsigned int depth, merged = 0;
+ struct held_lock *hlock;
+ int i;
+
+ if (unlikely(!debug_locks))
+ return 0;
+
+ depth = curr->lockdep_depth;
+ /*
+ * This function is about (re)setting the class of a held lock,
+ * yet we're not actually holding any locks. Naughty user!
+ */
+ if (DEBUG_LOCKS_WARN_ON(!depth))
+ return 0;
+
+ hlock = find_held_lock(curr, lock, depth, &i);
+ if (!hlock) {
+ print_unlock_imbalance_bug(curr, lock, ip);
+ return 0;
}
+ curr->lockdep_depth = i;
+ curr->curr_chain_key = hlock->prev_chain_key;
+
+ WARN(hlock->read, "downgrading a read lock");
+ hlock->read = 1;
+ hlock->acquire_ip = ip;
+
+ if (reacquire_held_locks(curr, depth, i, &merged))
+ return 0;
+
+ /* Merging can't happen with unchanged classes.. */
+ if (DEBUG_LOCKS_WARN_ON(merged))
+ return 0;
+
/*
* I took it apart and put it back together again, except now I have
* these 'spare' parts.. where shall I put them.
*/
if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth))
return 0;
+
return 1;
}
/*
- * Remove the lock to the list of currently held locks in a
- * potentially non-nested (out of order) manner. This is a
- * relatively rare operation, as all the unlock APIs default
- * to nested mode (which uses lock_release()):
+ * Remove the lock from the list of currently held locks - this gets
+ * called on mutex_unlock()/spin_unlock*() (or on a failed
+ * mutex_lock_interruptible()).
*/
static int
-lock_release_non_nested(struct task_struct *curr,
- struct lockdep_map *lock, unsigned long ip)
+__lock_release(struct lockdep_map *lock, unsigned long ip)
{
- struct held_lock *hlock, *prev_hlock;
- unsigned int depth;
+ struct task_struct *curr = current;
+ unsigned int depth, merged = 1;
+ struct held_lock *hlock;
int i;
- /*
- * Check whether the lock exists in the current stack
- * of held locks:
- */
+ if (unlikely(!debug_locks))
+ return 0;
+
depth = curr->lockdep_depth;
/*
* So we're all set to release this lock.. wait what lock? We don't
* own any locks, you've been drinking again?
*/
- if (DEBUG_LOCKS_WARN_ON(!depth))
+ if (depth <= 0) {
+ print_unlock_imbalance_bug(curr, lock, ip);
return 0;
+ }
- prev_hlock = NULL;
- for (i = depth-1; i >= 0; i--) {
- hlock = curr->held_locks + i;
- /*
- * We must not cross into another context:
- */
- if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
- break;
- if (match_held_lock(hlock, lock))
- goto found_it;
- prev_hlock = hlock;
+ /*
+ * Check whether the lock exists in the current stack
+ * of held locks:
+ */
+ hlock = find_held_lock(curr, lock, depth, &i);
+ if (!hlock) {
+ print_unlock_imbalance_bug(curr, lock, ip);
+ return 0;
}
- return print_unlock_imbalance_bug(curr, lock, ip);
-found_it:
if (hlock->instance == lock)
lock_release_holdtime(hlock);
+ WARN(hlock->pin_count, "releasing a pinned lock\n");
+
if (hlock->references) {
hlock->references--;
if (hlock->references) {
@@ -3440,129 +5564,149 @@ found_it:
curr->lockdep_depth = i;
curr->curr_chain_key = hlock->prev_chain_key;
- for (i++; i < depth; i++) {
- hlock = curr->held_locks + i;
- if (!__lock_acquire(hlock->instance,
- hlock_class(hlock)->subclass, hlock->trylock,
- hlock->read, hlock->check, hlock->hardirqs_off,
- hlock->nest_lock, hlock->acquire_ip,
- hlock->references))
- return 0;
- }
+ /*
+ * The most likely case is when the unlock is on the innermost
+ * lock. In this case, we are done!
+ */
+ if (i == depth-1)
+ return 1;
+
+ if (reacquire_held_locks(curr, depth, i + 1, &merged))
+ return 0;
/*
* We had N bottles of beer on the wall, we drank one, but now
* there's not N-1 bottles of beer left on the wall...
+ * Pouring two of the bottles together is acceptable.
*/
- if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - 1))
- return 0;
- return 1;
+ DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - merged);
+
+ /*
+ * Since reacquire_held_locks() would have called check_chain_key()
+ * indirectly via __lock_acquire(), we don't need to do it again
+ * on return.
+ */
+ return 0;
}
-/*
- * Remove the lock to the list of currently held locks - this gets
- * called on mutex_unlock()/spin_unlock*() (or on a failed
- * mutex_lock_interruptible()). This is done for unlocks that nest
- * perfectly. (i.e. the current top of the lock-stack is unlocked)
- */
-static int lock_release_nested(struct task_struct *curr,
- struct lockdep_map *lock, unsigned long ip)
+static __always_inline
+int __lock_is_held(const struct lockdep_map *lock, int read)
{
- struct held_lock *hlock;
- unsigned int depth;
+ struct task_struct *curr = current;
+ int i;
- /*
- * Pop off the top of the lock stack:
- */
- depth = curr->lockdep_depth - 1;
- hlock = curr->held_locks + depth;
+ for (i = 0; i < curr->lockdep_depth; i++) {
+ struct held_lock *hlock = curr->held_locks + i;
- /*
- * Is the unlock non-nested:
- */
- if (hlock->instance != lock || hlock->references)
- return lock_release_non_nested(curr, lock, ip);
- curr->lockdep_depth--;
+ if (match_held_lock(hlock, lock)) {
+ if (read == -1 || !!hlock->read == read)
+ return LOCK_STATE_HELD;
- /*
- * No more locks, but somehow we've got hash left over, who left it?
- */
- if (DEBUG_LOCKS_WARN_ON(!depth && (hlock->prev_chain_key != 0)))
- return 0;
+ return LOCK_STATE_NOT_HELD;
+ }
+ }
- curr->curr_chain_key = hlock->prev_chain_key;
+ return LOCK_STATE_NOT_HELD;
+}
- lock_release_holdtime(hlock);
+static struct pin_cookie __lock_pin_lock(struct lockdep_map *lock)
+{
+ struct pin_cookie cookie = NIL_COOKIE;
+ struct task_struct *curr = current;
+ int i;
-#ifdef CONFIG_DEBUG_LOCKDEP
- hlock->prev_chain_key = 0;
- hlock->class_idx = 0;
- hlock->acquire_ip = 0;
- hlock->irq_context = 0;
-#endif
- return 1;
+ if (unlikely(!debug_locks))
+ return cookie;
+
+ for (i = 0; i < curr->lockdep_depth; i++) {
+ struct held_lock *hlock = curr->held_locks + i;
+
+ if (match_held_lock(hlock, lock)) {
+ /*
+ * Grab 16bits of randomness; this is sufficient to not
+ * be guessable and still allows some pin nesting in
+ * our u32 pin_count.
+ */
+ cookie.val = 1 + (sched_clock() & 0xffff);
+ hlock->pin_count += cookie.val;
+ return cookie;
+ }
+ }
+
+ WARN(1, "pinning an unheld lock\n");
+ return cookie;
}
-/*
- * Remove the lock to the list of currently held locks - this gets
- * called on mutex_unlock()/spin_unlock*() (or on a failed
- * mutex_lock_interruptible()). This is done for unlocks that nest
- * perfectly. (i.e. the current top of the lock-stack is unlocked)
- */
-static void
-__lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
+static void __lock_repin_lock(struct lockdep_map *lock, struct pin_cookie cookie)
{
struct task_struct *curr = current;
+ int i;
- if (!check_unlock(curr, lock, ip))
+ if (unlikely(!debug_locks))
return;
- if (nested) {
- if (!lock_release_nested(curr, lock, ip))
- return;
- } else {
- if (!lock_release_non_nested(curr, lock, ip))
+ for (i = 0; i < curr->lockdep_depth; i++) {
+ struct held_lock *hlock = curr->held_locks + i;
+
+ if (match_held_lock(hlock, lock)) {
+ hlock->pin_count += cookie.val;
return;
+ }
}
- check_chain_key(curr);
+ WARN(1, "pinning an unheld lock\n");
}
-static int __lock_is_held(struct lockdep_map *lock)
+static void __lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie)
{
struct task_struct *curr = current;
int i;
+ if (unlikely(!debug_locks))
+ return;
+
for (i = 0; i < curr->lockdep_depth; i++) {
struct held_lock *hlock = curr->held_locks + i;
- if (match_held_lock(hlock, lock))
- return 1;
+ if (match_held_lock(hlock, lock)) {
+ if (WARN(!hlock->pin_count, "unpinning an unpinned lock\n"))
+ return;
+
+ hlock->pin_count -= cookie.val;
+
+ if (WARN((int)hlock->pin_count < 0, "pin count corrupted\n"))
+ hlock->pin_count = 0;
+
+ return;
+ }
}
- return 0;
+ WARN(1, "unpinning an unheld lock\n");
}
/*
* Check whether we follow the irq-flags state precisely:
*/
-static void check_flags(unsigned long flags)
+static noinstr void check_flags(unsigned long flags)
{
-#if defined(CONFIG_PROVE_LOCKING) && defined(CONFIG_DEBUG_LOCKDEP) && \
- defined(CONFIG_TRACE_IRQFLAGS)
+#if defined(CONFIG_PROVE_LOCKING) && defined(CONFIG_DEBUG_LOCKDEP)
if (!debug_locks)
return;
+ /* Get the warning out.. */
+ instrumentation_begin();
+
if (irqs_disabled_flags(flags)) {
- if (DEBUG_LOCKS_WARN_ON(current->hardirqs_enabled)) {
+ if (DEBUG_LOCKS_WARN_ON(lockdep_hardirqs_enabled())) {
printk("possible reason: unannotated irqs-off.\n");
}
} else {
- if (DEBUG_LOCKS_WARN_ON(!current->hardirqs_enabled)) {
+ if (DEBUG_LOCKS_WARN_ON(!lockdep_hardirqs_enabled())) {
printk("possible reason: unannotated irqs-on.\n");
}
}
+#ifndef CONFIG_PREEMPT_RT
/*
* We dont accurately track softirq state in e.g.
* hardirq contexts (such as on 4KSTACKS), so only
@@ -3577,9 +5721,12 @@ static void check_flags(unsigned long flags)
DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled);
}
}
+#endif
if (!debug_locks)
print_irqtrace_events(current);
+
+ instrumentation_end();
#endif
}
@@ -3589,19 +5736,88 @@ void lock_set_class(struct lockdep_map *lock, const char *name,
{
unsigned long flags;
- if (unlikely(current->lockdep_recursion))
+ if (unlikely(!lockdep_enabled()))
return;
raw_local_irq_save(flags);
- current->lockdep_recursion = 1;
+ lockdep_recursion_inc();
check_flags(flags);
if (__lock_set_class(lock, name, key, subclass, ip))
check_chain_key(current);
- current->lockdep_recursion = 0;
+ lockdep_recursion_finish();
raw_local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(lock_set_class);
+void lock_downgrade(struct lockdep_map *lock, unsigned long ip)
+{
+ unsigned long flags;
+
+ if (unlikely(!lockdep_enabled()))
+ return;
+
+ raw_local_irq_save(flags);
+ lockdep_recursion_inc();
+ check_flags(flags);
+ if (__lock_downgrade(lock, ip))
+ check_chain_key(current);
+ lockdep_recursion_finish();
+ raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_downgrade);
+
+/* NMI context !!! */
+static void verify_lock_unused(struct lockdep_map *lock, struct held_lock *hlock, int subclass)
+{
+#ifdef CONFIG_PROVE_LOCKING
+ struct lock_class *class = look_up_lock_class(lock, subclass);
+ unsigned long mask = LOCKF_USED;
+
+ /* if it doesn't have a class (yet), it certainly hasn't been used yet */
+ if (!class)
+ return;
+
+ /*
+ * READ locks only conflict with USED, such that if we only ever use
+ * READ locks, there is no deadlock possible -- RCU.
+ */
+ if (!hlock->read)
+ mask |= LOCKF_USED_READ;
+
+ if (!(class->usage_mask & mask))
+ return;
+
+ hlock->class_idx = class - lock_classes;
+
+ print_usage_bug(current, hlock, LOCK_USED, LOCK_USAGE_STATES);
+#endif
+}
+
+static bool lockdep_nmi(void)
+{
+ if (raw_cpu_read(lockdep_recursion))
+ return false;
+
+ if (!in_nmi())
+ return false;
+
+ return true;
+}
+
+/*
+ * read_lock() is recursive if:
+ * 1. We force lockdep think this way in selftests or
+ * 2. The implementation is not queued read/write lock or
+ * 3. The locker is at an in_interrupt() context.
+ */
+bool read_lock_is_recursive(void)
+{
+ return force_read_lock_recursive ||
+ !IS_ENABLED(CONFIG_QUEUED_RWLOCKS) ||
+ in_interrupt();
+}
+EXPORT_SYMBOL_GPL(read_lock_is_recursive);
+
/*
* We are not always called with irqs disabled - do that here,
* and also avoid lockdep recursion:
@@ -3612,104 +5828,215 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
{
unsigned long flags;
- if (unlikely(current->lockdep_recursion))
+ trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip);
+
+ if (!debug_locks)
+ return;
+
+ /*
+ * As KASAN instrumentation is disabled and lock_acquire() is usually
+ * the first lockdep call when a task tries to acquire a lock, add
+ * kasan_check_byte() here to check for use-after-free and other
+ * memory errors.
+ */
+ kasan_check_byte(lock);
+
+ if (unlikely(!lockdep_enabled())) {
+ /* XXX allow trylock from NMI ?!? */
+ if (lockdep_nmi() && !trylock) {
+ struct held_lock hlock;
+
+ hlock.acquire_ip = ip;
+ hlock.instance = lock;
+ hlock.nest_lock = nest_lock;
+ hlock.irq_context = 2; // XXX
+ hlock.trylock = trylock;
+ hlock.read = read;
+ hlock.check = check;
+ hlock.hardirqs_off = true;
+ hlock.references = 0;
+
+ verify_lock_unused(lock, &hlock, subclass);
+ }
return;
+ }
raw_local_irq_save(flags);
check_flags(flags);
- current->lockdep_recursion = 1;
- trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip);
+ lockdep_recursion_inc();
__lock_acquire(lock, subclass, trylock, read, check,
- irqs_disabled_flags(flags), nest_lock, ip, 0);
- current->lockdep_recursion = 0;
+ irqs_disabled_flags(flags), nest_lock, ip, 0, 0, 0);
+ lockdep_recursion_finish();
raw_local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(lock_acquire);
-void lock_release(struct lockdep_map *lock, int nested,
- unsigned long ip)
+void lock_release(struct lockdep_map *lock, unsigned long ip)
{
unsigned long flags;
- if (unlikely(current->lockdep_recursion))
+ trace_lock_release(lock, ip);
+
+ if (unlikely(!lockdep_enabled() ||
+ lock->key == &__lockdep_no_track__))
return;
raw_local_irq_save(flags);
check_flags(flags);
- current->lockdep_recursion = 1;
- trace_lock_release(lock, ip);
- __lock_release(lock, nested, ip);
- current->lockdep_recursion = 0;
+
+ lockdep_recursion_inc();
+ if (__lock_release(lock, ip))
+ check_chain_key(current);
+ lockdep_recursion_finish();
raw_local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(lock_release);
-int lock_is_held(struct lockdep_map *lock)
+/*
+ * lock_sync() - A special annotation for synchronize_{s,}rcu()-like API.
+ *
+ * No actual critical section is created by the APIs annotated with this: these
+ * APIs are used to wait for one or multiple critical sections (on other CPUs
+ * or threads), and it means that calling these APIs inside these critical
+ * sections is potential deadlock.
+ */
+void lock_sync(struct lockdep_map *lock, unsigned subclass, int read,
+ int check, struct lockdep_map *nest_lock, unsigned long ip)
{
unsigned long flags;
- int ret = 0;
- if (unlikely(current->lockdep_recursion))
- return 1; /* avoid false negative lockdep_assert_held() */
+ if (unlikely(!lockdep_enabled()))
+ return;
+
+ raw_local_irq_save(flags);
+ check_flags(flags);
+
+ lockdep_recursion_inc();
+ __lock_acquire(lock, subclass, 0, read, check,
+ irqs_disabled_flags(flags), nest_lock, ip, 0, 0, 1);
+ check_chain_key(current);
+ lockdep_recursion_finish();
+ raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_sync);
+
+noinstr int lock_is_held_type(const struct lockdep_map *lock, int read)
+{
+ unsigned long flags;
+ int ret = LOCK_STATE_NOT_HELD;
+
+ /*
+ * Avoid false negative lockdep_assert_held() and
+ * lockdep_assert_not_held().
+ */
+ if (unlikely(!lockdep_enabled()))
+ return LOCK_STATE_UNKNOWN;
raw_local_irq_save(flags);
check_flags(flags);
- current->lockdep_recursion = 1;
- ret = __lock_is_held(lock);
- current->lockdep_recursion = 0;
+ lockdep_recursion_inc();
+ ret = __lock_is_held(lock, read);
+ lockdep_recursion_finish();
raw_local_irq_restore(flags);
return ret;
}
-EXPORT_SYMBOL_GPL(lock_is_held);
+EXPORT_SYMBOL_GPL(lock_is_held_type);
+NOKPROBE_SYMBOL(lock_is_held_type);
-void lockdep_set_current_reclaim_state(gfp_t gfp_mask)
+struct pin_cookie lock_pin_lock(struct lockdep_map *lock)
{
- current->lockdep_reclaim_gfp = gfp_mask;
+ struct pin_cookie cookie = NIL_COOKIE;
+ unsigned long flags;
+
+ if (unlikely(!lockdep_enabled()))
+ return cookie;
+
+ raw_local_irq_save(flags);
+ check_flags(flags);
+
+ lockdep_recursion_inc();
+ cookie = __lock_pin_lock(lock);
+ lockdep_recursion_finish();
+ raw_local_irq_restore(flags);
+
+ return cookie;
}
+EXPORT_SYMBOL_GPL(lock_pin_lock);
-void lockdep_clear_current_reclaim_state(void)
+void lock_repin_lock(struct lockdep_map *lock, struct pin_cookie cookie)
{
- current->lockdep_reclaim_gfp = 0;
+ unsigned long flags;
+
+ if (unlikely(!lockdep_enabled()))
+ return;
+
+ raw_local_irq_save(flags);
+ check_flags(flags);
+
+ lockdep_recursion_inc();
+ __lock_repin_lock(lock, cookie);
+ lockdep_recursion_finish();
+ raw_local_irq_restore(flags);
}
+EXPORT_SYMBOL_GPL(lock_repin_lock);
+
+void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie)
+{
+ unsigned long flags;
+
+ if (unlikely(!lockdep_enabled()))
+ return;
+
+ raw_local_irq_save(flags);
+ check_flags(flags);
+
+ lockdep_recursion_inc();
+ __lock_unpin_lock(lock, cookie);
+ lockdep_recursion_finish();
+ raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_unpin_lock);
#ifdef CONFIG_LOCK_STAT
-static int
-print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
- unsigned long ip)
+static void print_lock_contention_bug(struct task_struct *curr,
+ struct lockdep_map *lock,
+ unsigned long ip)
{
if (!debug_locks_off())
- return 0;
+ return;
if (debug_locks_silent)
- return 0;
+ return;
- printk("\n");
- printk("=================================\n");
- printk("[ BUG: bad contention detected! ]\n");
+ nbcon_cpu_emergency_enter();
+
+ pr_warn("\n");
+ pr_warn("=================================\n");
+ pr_warn("WARNING: bad contention detected!\n");
print_kernel_ident();
- printk("---------------------------------\n");
- printk("%s/%d is trying to contend lock (",
+ pr_warn("---------------------------------\n");
+ pr_warn("%s/%d is trying to contend lock (",
curr->comm, task_pid_nr(curr));
print_lockdep_cache(lock);
- printk(") at:\n");
- print_ip_sym(ip);
- printk("but there are no locks held!\n");
- printk("\nother info that might help us debug this:\n");
+ pr_cont(") at:\n");
+ print_ip_sym(KERN_WARNING, ip);
+ pr_warn("but there are no locks held!\n");
+ pr_warn("\nother info that might help us debug this:\n");
lockdep_print_held_locks(curr);
- printk("\nstack backtrace:\n");
+ pr_warn("\nstack backtrace:\n");
dump_stack();
- return 0;
+ nbcon_cpu_emergency_exit();
}
static void
__lock_contended(struct lockdep_map *lock, unsigned long ip)
{
struct task_struct *curr = current;
- struct held_lock *hlock, *prev_hlock;
+ struct held_lock *hlock;
struct lock_class_stats *stats;
unsigned int depth;
int i, contention_point, contending_point;
@@ -3722,22 +6049,15 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
if (DEBUG_LOCKS_WARN_ON(!depth))
return;
- prev_hlock = NULL;
- for (i = depth-1; i >= 0; i--) {
- hlock = curr->held_locks + i;
- /*
- * We must not cross into another context:
- */
- if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
- break;
- if (match_held_lock(hlock, lock))
- goto found_it;
- prev_hlock = hlock;
+ if (unlikely(lock->key == &__lockdep_no_track__))
+ return;
+
+ hlock = find_held_lock(curr, lock, depth, &i);
+ if (!hlock) {
+ print_lock_contention_bug(curr, lock, ip);
+ return;
}
- print_lock_contention_bug(curr, lock, ip);
- return;
-found_it:
if (hlock->instance != lock)
return;
@@ -3754,14 +6074,13 @@ found_it:
stats->contending_point[contending_point]++;
if (lock->cpu != smp_processor_id())
stats->bounces[bounce_contended + !!hlock->read]++;
- put_lock_stats(stats);
}
static void
__lock_acquired(struct lockdep_map *lock, unsigned long ip)
{
struct task_struct *curr = current;
- struct held_lock *hlock, *prev_hlock;
+ struct held_lock *hlock;
struct lock_class_stats *stats;
unsigned int depth;
u64 now, waittime = 0;
@@ -3775,22 +6094,15 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip)
if (DEBUG_LOCKS_WARN_ON(!depth))
return;
- prev_hlock = NULL;
- for (i = depth-1; i >= 0; i--) {
- hlock = curr->held_locks + i;
- /*
- * We must not cross into another context:
- */
- if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
- break;
- if (match_held_lock(hlock, lock))
- goto found_it;
- prev_hlock = hlock;
+ if (unlikely(lock->key == &__lockdep_no_track__))
+ return;
+
+ hlock = find_held_lock(curr, lock, depth, &i);
+ if (!hlock) {
+ print_lock_contention_bug(curr, lock, _RET_IP_);
+ return;
}
- print_lock_contention_bug(curr, lock, _RET_IP_);
- return;
-found_it:
if (hlock->instance != lock)
return;
@@ -3801,8 +6113,6 @@ found_it:
hlock->holdtime_stamp = now;
}
- trace_lock_acquired(lock, ip);
-
stats = get_lock_stats(hlock_class(hlock));
if (waittime) {
if (hlock->read)
@@ -3812,7 +6122,6 @@ found_it:
}
if (lock->cpu != cpu)
stats->bounces[bounce_acquired + !!hlock->read]++;
- put_lock_stats(stats);
lock->cpu = cpu;
lock->ip = ip;
@@ -3822,18 +6131,16 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
{
unsigned long flags;
- if (unlikely(!lock_stat))
- return;
+ trace_lock_contended(lock, ip);
- if (unlikely(current->lockdep_recursion))
+ if (unlikely(!lock_stat || !lockdep_enabled()))
return;
raw_local_irq_save(flags);
check_flags(flags);
- current->lockdep_recursion = 1;
- trace_lock_contended(lock, ip);
+ lockdep_recursion_inc();
__lock_contended(lock, ip);
- current->lockdep_recursion = 0;
+ lockdep_recursion_finish();
raw_local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(lock_contended);
@@ -3842,17 +6149,16 @@ void lock_acquired(struct lockdep_map *lock, unsigned long ip)
{
unsigned long flags;
- if (unlikely(!lock_stat))
- return;
+ trace_lock_acquired(lock, ip);
- if (unlikely(current->lockdep_recursion))
+ if (unlikely(!lock_stat || !lockdep_enabled()))
return;
raw_local_irq_save(flags);
check_flags(flags);
- current->lockdep_recursion = 1;
+ lockdep_recursion_inc();
__lock_acquired(lock, ip);
- current->lockdep_recursion = 0;
+ lockdep_recursion_finish();
raw_local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(lock_acquired);
@@ -3869,39 +6175,122 @@ void lockdep_reset(void)
int i;
raw_local_irq_save(flags);
- current->curr_chain_key = 0;
- current->lockdep_depth = 0;
- current->lockdep_recursion = 0;
+ lockdep_init_task(current);
memset(current->held_locks, 0, MAX_LOCK_DEPTH*sizeof(struct held_lock));
nr_hardirq_chains = 0;
nr_softirq_chains = 0;
nr_process_chains = 0;
debug_locks = 1;
for (i = 0; i < CHAINHASH_SIZE; i++)
- INIT_LIST_HEAD(chainhash_table + i);
+ INIT_HLIST_HEAD(chainhash_table + i);
raw_local_irq_restore(flags);
}
-static void zap_class(struct lock_class *class)
+/* Remove a class from a lock chain. Must be called with the graph lock held. */
+static void remove_class_from_lock_chain(struct pending_free *pf,
+ struct lock_chain *chain,
+ struct lock_class *class)
{
+#ifdef CONFIG_PROVE_LOCKING
int i;
+ for (i = chain->base; i < chain->base + chain->depth; i++) {
+ if (chain_hlock_class_idx(chain_hlocks[i]) != class - lock_classes)
+ continue;
+ /*
+ * Each lock class occurs at most once in a lock chain so once
+ * we found a match we can break out of this loop.
+ */
+ goto free_lock_chain;
+ }
+ /* Since the chain has not been modified, return. */
+ return;
+
+free_lock_chain:
+ free_chain_hlocks(chain->base, chain->depth);
+ /* Overwrite the chain key for concurrent RCU readers. */
+ WRITE_ONCE(chain->chain_key, INITIAL_CHAIN_KEY);
+ dec_chains(chain->irq_context);
+
/*
- * Remove all dependencies this lock is
- * involved in:
+ * Note: calling hlist_del_rcu() from inside a
+ * hlist_for_each_entry_rcu() loop is safe.
*/
- for (i = 0; i < nr_list_entries; i++) {
- if (list_entries[i].class == class)
- list_del_rcu(&list_entries[i].entry);
+ hlist_del_rcu(&chain->entry);
+ __set_bit(chain - lock_chains, pf->lock_chains_being_freed);
+ nr_zapped_lock_chains++;
+#endif
+}
+
+/* Must be called with the graph lock held. */
+static void remove_class_from_lock_chains(struct pending_free *pf,
+ struct lock_class *class)
+{
+ struct lock_chain *chain;
+ struct hlist_head *head;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(chainhash_table); i++) {
+ head = chainhash_table + i;
+ hlist_for_each_entry_rcu(chain, head, entry) {
+ remove_class_from_lock_chain(pf, chain, class);
+ }
}
+}
+
+/*
+ * Remove all references to a lock class. The caller must hold the graph lock.
+ */
+static void zap_class(struct pending_free *pf, struct lock_class *class)
+{
+ struct lock_list *entry;
+ int i;
+
+ WARN_ON_ONCE(!class->key);
+
/*
- * Unhash the class and remove it from the all_lock_classes list:
+ * Remove all dependencies this lock is
+ * involved in:
*/
- list_del_rcu(&class->hash_entry);
- list_del_rcu(&class->lock_entry);
+ for_each_set_bit(i, list_entries_in_use, ARRAY_SIZE(list_entries)) {
+ entry = list_entries + i;
+ if (entry->class != class && entry->links_to != class)
+ continue;
+ __clear_bit(i, list_entries_in_use);
+ nr_list_entries--;
+ list_del_rcu(&entry->entry);
+ }
+ if (list_empty(&class->locks_after) &&
+ list_empty(&class->locks_before)) {
+ list_move_tail(&class->lock_entry, &pf->zapped);
+ hlist_del_rcu(&class->hash_entry);
+ WRITE_ONCE(class->key, NULL);
+ WRITE_ONCE(class->name, NULL);
+ /* Class allocated but not used, -1 in nr_unused_locks */
+ if (class->usage_mask == 0)
+ debug_atomic_dec(nr_unused_locks);
+ nr_lock_classes--;
+ __clear_bit(class - lock_classes, lock_classes_in_use);
+ if (class - lock_classes == max_lock_class_idx)
+ max_lock_class_idx--;
+ } else {
+ WARN_ONCE(true, "%s() failed for class %s\n", __func__,
+ class->name);
+ }
- RCU_INIT_POINTER(class->key, NULL);
- RCU_INIT_POINTER(class->name, NULL);
+ remove_class_from_lock_chains(pf, class);
+ nr_zapped_classes++;
+}
+
+static void reinit_class(struct lock_class *class)
+{
+ WARN_ON_ONCE(!class->lock_entry.next);
+ WARN_ON_ONCE(!list_empty(&class->locks_after));
+ WARN_ON_ONCE(!list_empty(&class->locks_before));
+ memset_startat(class, 0, key);
+ WARN_ON_ONCE(!class->lock_entry.next);
+ WARN_ON_ONCE(!list_empty(&class->locks_after));
+ WARN_ON_ONCE(!list_empty(&class->locks_before));
}
static inline int within(const void *addr, void *start, unsigned long size)
@@ -3909,68 +6298,206 @@ static inline int within(const void *addr, void *start, unsigned long size)
return addr >= start && addr < start + size;
}
+static bool inside_selftest(void)
+{
+ return current == lockdep_selftest_task_struct;
+}
+
+/* The caller must hold the graph lock. */
+static struct pending_free *get_pending_free(void)
+{
+ return delayed_free.pf + delayed_free.index;
+}
+
+static void free_zapped_rcu(struct rcu_head *cb);
+
/*
- * Used in module.c to remove lock classes from memory that is going to be
- * freed; and possibly re-used by other modules.
- *
- * We will have had one sync_sched() before getting here, so we're guaranteed
- * nobody will look up these exact classes -- they're properly dead but still
- * allocated.
- */
-void lockdep_free_key_range(void *start, unsigned long size)
+* See if we need to queue an RCU callback, must called with
+* the lockdep lock held, returns false if either we don't have
+* any pending free or the callback is already scheduled.
+* Otherwise, a call_rcu() must follow this function call.
+*/
+static bool prepare_call_rcu_zapped(struct pending_free *pf)
+{
+ WARN_ON_ONCE(inside_selftest());
+
+ if (list_empty(&pf->zapped))
+ return false;
+
+ if (delayed_free.scheduled)
+ return false;
+
+ delayed_free.scheduled = true;
+
+ WARN_ON_ONCE(delayed_free.pf + delayed_free.index != pf);
+ delayed_free.index ^= 1;
+
+ return true;
+}
+
+/* The caller must hold the graph lock. May be called from RCU context. */
+static void __free_zapped_classes(struct pending_free *pf)
{
struct lock_class *class;
- struct list_head *head;
+
+ check_data_structures();
+
+ list_for_each_entry(class, &pf->zapped, lock_entry)
+ reinit_class(class);
+
+ list_splice_init(&pf->zapped, &free_lock_classes);
+
+#ifdef CONFIG_PROVE_LOCKING
+ bitmap_andnot(lock_chains_in_use, lock_chains_in_use,
+ pf->lock_chains_being_freed, ARRAY_SIZE(lock_chains));
+ bitmap_clear(pf->lock_chains_being_freed, 0, ARRAY_SIZE(lock_chains));
+#endif
+}
+
+static void free_zapped_rcu(struct rcu_head *ch)
+{
+ struct pending_free *pf;
unsigned long flags;
- int i;
- int locked;
+ bool need_callback;
+
+ if (WARN_ON_ONCE(ch != &delayed_free.rcu_head))
+ return;
raw_local_irq_save(flags);
- locked = graph_lock();
+ lockdep_lock();
+
+ /* closed head */
+ pf = delayed_free.pf + (delayed_free.index ^ 1);
+ __free_zapped_classes(pf);
+ delayed_free.scheduled = false;
+ need_callback =
+ prepare_call_rcu_zapped(delayed_free.pf + delayed_free.index);
+ lockdep_unlock();
+ raw_local_irq_restore(flags);
/*
- * Unhash all classes that were created by this module:
- */
+ * If there's pending free and its callback has not been scheduled,
+ * queue an RCU callback.
+ */
+ if (need_callback)
+ call_rcu(&delayed_free.rcu_head, free_zapped_rcu);
+
+}
+
+/*
+ * Remove all lock classes from the class hash table and from the
+ * all_lock_classes list whose key or name is in the address range [start,
+ * start + size). Move these lock classes to the zapped_classes list. Must
+ * be called with the graph lock held.
+ */
+static void __lockdep_free_key_range(struct pending_free *pf, void *start,
+ unsigned long size)
+{
+ struct lock_class *class;
+ struct hlist_head *head;
+ int i;
+
+ /* Unhash all classes that were created by a module. */
for (i = 0; i < CLASSHASH_SIZE; i++) {
head = classhash_table + i;
- if (list_empty(head))
- continue;
- list_for_each_entry_rcu(class, head, hash_entry) {
- if (within(class->key, start, size))
- zap_class(class);
- else if (within(class->name, start, size))
- zap_class(class);
+ hlist_for_each_entry_rcu(class, head, hash_entry) {
+ if (!within(class->key, start, size) &&
+ !within(class->name, start, size))
+ continue;
+ zap_class(pf, class);
}
}
+}
- if (locked)
- graph_unlock();
- raw_local_irq_restore(flags);
+/*
+ * Used in module.c to remove lock classes from memory that is going to be
+ * freed; and possibly re-used by other modules.
+ *
+ * We will have had one synchronize_rcu() before getting here, so we're
+ * guaranteed nobody will look up these exact classes -- they're properly dead
+ * but still allocated.
+ */
+static void lockdep_free_key_range_reg(void *start, unsigned long size)
+{
+ struct pending_free *pf;
+ unsigned long flags;
+ bool need_callback;
+ init_data_structures_once();
+
+ raw_local_irq_save(flags);
+ lockdep_lock();
+ pf = get_pending_free();
+ __lockdep_free_key_range(pf, start, size);
+ need_callback = prepare_call_rcu_zapped(pf);
+ lockdep_unlock();
+ raw_local_irq_restore(flags);
+ if (need_callback)
+ call_rcu(&delayed_free.rcu_head, free_zapped_rcu);
/*
* Wait for any possible iterators from look_up_lock_class() to pass
* before continuing to free the memory they refer to.
- *
- * sync_sched() is sufficient because the read-side is IRQ disable.
*/
- synchronize_sched();
+ synchronize_rcu();
+}
- /*
- * XXX at this point we could return the resources to the pool;
- * instead we leak them. We would need to change to bitmap allocators
- * instead of the linear allocators we have now.
- */
+/*
+ * Free all lockdep keys in the range [start, start+size). Does not sleep.
+ * Ignores debug_locks. Must only be used by the lockdep selftests.
+ */
+static void lockdep_free_key_range_imm(void *start, unsigned long size)
+{
+ struct pending_free *pf = delayed_free.pf;
+ unsigned long flags;
+
+ init_data_structures_once();
+
+ raw_local_irq_save(flags);
+ lockdep_lock();
+ __lockdep_free_key_range(pf, start, size);
+ __free_zapped_classes(pf);
+ lockdep_unlock();
+ raw_local_irq_restore(flags);
}
-void lockdep_reset_lock(struct lockdep_map *lock)
+void lockdep_free_key_range(void *start, unsigned long size)
+{
+ init_data_structures_once();
+
+ if (inside_selftest())
+ lockdep_free_key_range_imm(start, size);
+ else
+ lockdep_free_key_range_reg(start, size);
+}
+
+/*
+ * Check whether any element of the @lock->class_cache[] array refers to a
+ * registered lock class. The caller must hold either the graph lock or the
+ * RCU read lock.
+ */
+static bool lock_class_cache_is_registered(struct lockdep_map *lock)
{
struct lock_class *class;
- struct list_head *head;
- unsigned long flags;
+ struct hlist_head *head;
int i, j;
- int locked;
- raw_local_irq_save(flags);
+ for (i = 0; i < CLASSHASH_SIZE; i++) {
+ head = classhash_table + i;
+ hlist_for_each_entry_rcu(class, head, hash_entry) {
+ for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++)
+ if (lock->class_cache[j] == class)
+ return true;
+ }
+ }
+ return false;
+}
+
+/* The caller must hold the graph lock. Does not sleep. */
+static void __lockdep_reset_lock(struct pending_free *pf,
+ struct lockdep_map *lock)
+{
+ struct lock_class *class;
+ int j;
/*
* Remove all classes this lock might have:
@@ -3981,98 +6508,164 @@ void lockdep_reset_lock(struct lockdep_map *lock)
*/
class = look_up_lock_class(lock, j);
if (class)
- zap_class(class);
+ zap_class(pf, class);
}
/*
* Debug check: in the end all mapped classes should
* be gone.
*/
+ if (WARN_ON_ONCE(lock_class_cache_is_registered(lock)))
+ debug_locks_off();
+}
+
+/*
+ * Remove all information lockdep has about a lock if debug_locks == 1. Free
+ * released data structures from RCU context.
+ */
+static void lockdep_reset_lock_reg(struct lockdep_map *lock)
+{
+ struct pending_free *pf;
+ unsigned long flags;
+ int locked;
+ bool need_callback = false;
+
+ raw_local_irq_save(flags);
locked = graph_lock();
- for (i = 0; i < CLASSHASH_SIZE; i++) {
- head = classhash_table + i;
- if (list_empty(head))
- continue;
- list_for_each_entry_rcu(class, head, hash_entry) {
- int match = 0;
+ if (!locked)
+ goto out_irq;
- for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++)
- match |= class == lock->class_cache[j];
-
- if (unlikely(match)) {
- if (debug_locks_off_graph_unlock()) {
- /*
- * We all just reset everything, how did it match?
- */
- WARN_ON(1);
- }
- goto out_restore;
- }
- }
- }
- if (locked)
- graph_unlock();
+ pf = get_pending_free();
+ __lockdep_reset_lock(pf, lock);
+ need_callback = prepare_call_rcu_zapped(pf);
-out_restore:
+ graph_unlock();
+out_irq:
raw_local_irq_restore(flags);
+ if (need_callback)
+ call_rcu(&delayed_free.rcu_head, free_zapped_rcu);
}
-void lockdep_init(void)
+/*
+ * Reset a lock. Does not sleep. Ignores debug_locks. Must only be used by the
+ * lockdep selftests.
+ */
+static void lockdep_reset_lock_imm(struct lockdep_map *lock)
{
- int i;
-
- /*
- * Some architectures have their own start_kernel()
- * code which calls lockdep_init(), while we also
- * call lockdep_init() from the start_kernel() itself,
- * and we want to initialize the hashes only once:
- */
- if (lockdep_initialized)
- return;
+ struct pending_free *pf = delayed_free.pf;
+ unsigned long flags;
- for (i = 0; i < CLASSHASH_SIZE; i++)
- INIT_LIST_HEAD(classhash_table + i);
+ raw_local_irq_save(flags);
+ lockdep_lock();
+ __lockdep_reset_lock(pf, lock);
+ __free_zapped_classes(pf);
+ lockdep_unlock();
+ raw_local_irq_restore(flags);
+}
- for (i = 0; i < CHAINHASH_SIZE; i++)
- INIT_LIST_HEAD(chainhash_table + i);
+void lockdep_reset_lock(struct lockdep_map *lock)
+{
+ init_data_structures_once();
- lockdep_initialized = 1;
+ if (inside_selftest())
+ lockdep_reset_lock_imm(lock);
+ else
+ lockdep_reset_lock_reg(lock);
}
-void __init lockdep_info(void)
+/*
+ * Unregister a dynamically allocated key.
+ *
+ * Unlike lockdep_register_key(), a search is always done to find a matching
+ * key irrespective of debug_locks to avoid potential invalid access to freed
+ * memory in lock_class entry.
+ */
+void lockdep_unregister_key(struct lock_class_key *key)
{
- printk("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n");
+ struct hlist_head *hash_head = keyhashentry(key);
+ struct lock_class_key *k;
+ struct pending_free *pf;
+ unsigned long flags;
+ bool found = false;
+ bool need_callback = false;
- printk("... MAX_LOCKDEP_SUBCLASSES: %lu\n", MAX_LOCKDEP_SUBCLASSES);
- printk("... MAX_LOCK_DEPTH: %lu\n", MAX_LOCK_DEPTH);
- printk("... MAX_LOCKDEP_KEYS: %lu\n", MAX_LOCKDEP_KEYS);
- printk("... CLASSHASH_SIZE: %lu\n", CLASSHASH_SIZE);
- printk("... MAX_LOCKDEP_ENTRIES: %lu\n", MAX_LOCKDEP_ENTRIES);
- printk("... MAX_LOCKDEP_CHAINS: %lu\n", MAX_LOCKDEP_CHAINS);
- printk("... CHAINHASH_SIZE: %lu\n", CHAINHASH_SIZE);
+ might_sleep();
- printk(" memory used by lock dependency info: %lu kB\n",
- (sizeof(struct lock_class) * MAX_LOCKDEP_KEYS +
- sizeof(struct list_head) * CLASSHASH_SIZE +
- sizeof(struct lock_list) * MAX_LOCKDEP_ENTRIES +
- sizeof(struct lock_chain) * MAX_LOCKDEP_CHAINS +
- sizeof(struct list_head) * CHAINHASH_SIZE
+ if (WARN_ON_ONCE(static_obj(key)))
+ return;
+
+ raw_local_irq_save(flags);
+ lockdep_lock();
+
+ hlist_for_each_entry_rcu(k, hash_head, hash_entry) {
+ if (k == key) {
+ hlist_del_rcu(&k->hash_entry);
+ found = true;
+ break;
+ }
+ }
+ WARN_ON_ONCE(!found && debug_locks);
+ if (found) {
+ pf = get_pending_free();
+ __lockdep_free_key_range(pf, key, 1);
+ need_callback = prepare_call_rcu_zapped(pf);
+ nr_dynamic_keys--;
+ }
+ lockdep_unlock();
+ raw_local_irq_restore(flags);
+
+ if (need_callback)
+ call_rcu(&delayed_free.rcu_head, free_zapped_rcu);
+
+ /*
+ * Wait until is_dynamic_key() has finished accessing k->hash_entry.
+ *
+ * Some operations like __qdisc_destroy() will call this in a debug
+ * kernel, and the network traffic is disabled while waiting, hence
+ * the delay of the wait matters in debugging cases. Currently use a
+ * synchronize_rcu_expedited() to speed up the wait at the cost of
+ * system IPIs. TODO: Replace RCU with hazptr for this.
+ */
+ synchronize_rcu_expedited();
+}
+EXPORT_SYMBOL_GPL(lockdep_unregister_key);
+
+void __init lockdep_init(void)
+{
+ pr_info("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n");
+
+ pr_info("... MAX_LOCKDEP_SUBCLASSES: %lu\n", MAX_LOCKDEP_SUBCLASSES);
+ pr_info("... MAX_LOCK_DEPTH: %lu\n", MAX_LOCK_DEPTH);
+ pr_info("... MAX_LOCKDEP_KEYS: %lu\n", MAX_LOCKDEP_KEYS);
+ pr_info("... CLASSHASH_SIZE: %lu\n", CLASSHASH_SIZE);
+ pr_info("... MAX_LOCKDEP_ENTRIES: %lu\n", MAX_LOCKDEP_ENTRIES);
+ pr_info("... MAX_LOCKDEP_CHAINS: %lu\n", MAX_LOCKDEP_CHAINS);
+ pr_info("... CHAINHASH_SIZE: %lu\n", CHAINHASH_SIZE);
+
+ pr_info(" memory used by lock dependency info: %zu kB\n",
+ (sizeof(lock_classes) +
+ sizeof(lock_classes_in_use) +
+ sizeof(classhash_table) +
+ sizeof(list_entries) +
+ sizeof(list_entries_in_use) +
+ sizeof(chainhash_table) +
+ sizeof(delayed_free)
#ifdef CONFIG_PROVE_LOCKING
- + sizeof(struct circular_queue)
+ + sizeof(lock_cq)
+ + sizeof(lock_chains)
+ + sizeof(lock_chains_in_use)
+ + sizeof(chain_hlocks)
#endif
) / 1024
);
- printk(" per task-struct memory footprint: %lu bytes\n",
- sizeof(struct held_lock) * MAX_LOCK_DEPTH);
-
-#ifdef CONFIG_DEBUG_LOCKDEP
- if (lockdep_init_error) {
- printk("WARNING: lockdep init error! lock-%s was acquired"
- "before lockdep_init\n", lock_init_error);
- printk("Call stack leading to lockdep invocation was:\n");
- print_stack_trace(&lockdep_init_trace, 0);
- }
+#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
+ pr_info(" memory used for stack traces: %zu kB\n",
+ (sizeof(stack_trace) + sizeof(stack_trace_hash)) / 1024
+ );
#endif
+
+ pr_info(" per task-struct memory footprint: %zu bytes\n",
+ sizeof(((struct task_struct *)NULL)->held_locks));
}
static void
@@ -4084,18 +6677,22 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from,
if (debug_locks_silent)
return;
- printk("\n");
- printk("=========================\n");
- printk("[ BUG: held lock freed! ]\n");
+ nbcon_cpu_emergency_enter();
+
+ pr_warn("\n");
+ pr_warn("=========================\n");
+ pr_warn("WARNING: held lock freed!\n");
print_kernel_ident();
- printk("-------------------------\n");
- printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n",
+ pr_warn("-------------------------\n");
+ pr_warn("%s/%d is freeing memory %px-%px, with a lock still held there!\n",
curr->comm, task_pid_nr(curr), mem_from, mem_to-1);
print_lock(hlock);
lockdep_print_held_locks(curr);
- printk("\nstack backtrace:\n");
+ pr_warn("\nstack backtrace:\n");
dump_stack();
+
+ nbcon_cpu_emergency_exit();
}
static inline int not_in_range(const void* mem_from, unsigned long mem_len,
@@ -4120,7 +6717,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len)
if (unlikely(!debug_locks))
return;
- local_irq_save(flags);
+ raw_local_irq_save(flags);
for (i = 0; i < curr->lockdep_depth; i++) {
hlock = curr->held_locks + i;
@@ -4131,7 +6728,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len)
print_freed_lock_bug(curr, mem_from, mem_from + mem_len, hlock);
break;
}
- local_irq_restore(flags);
+ raw_local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(debug_check_no_locks_freed);
@@ -4142,15 +6739,19 @@ static void print_held_locks_bug(void)
if (debug_locks_silent)
return;
- printk("\n");
- printk("=====================================\n");
- printk("[ BUG: %s/%d still has locks held! ]\n",
+ nbcon_cpu_emergency_enter();
+
+ pr_warn("\n");
+ pr_warn("====================================\n");
+ pr_warn("WARNING: %s/%d still has locks held!\n",
current->comm, task_pid_nr(current));
print_kernel_ident();
- printk("-------------------------------------\n");
+ pr_warn("------------------------------------\n");
lockdep_print_held_locks(current);
- printk("\nstack backtrace:\n");
+ pr_warn("\nstack backtrace:\n");
dump_stack();
+
+ nbcon_cpu_emergency_exit();
}
void debug_check_no_locks_held(void)
@@ -4164,58 +6765,25 @@ EXPORT_SYMBOL_GPL(debug_check_no_locks_held);
void debug_show_all_locks(void)
{
struct task_struct *g, *p;
- int count = 10;
- int unlock = 1;
if (unlikely(!debug_locks)) {
- printk("INFO: lockdep is turned off.\n");
+ pr_warn("INFO: lockdep is turned off.\n");
return;
}
- printk("\nShowing all locks held in the system:\n");
+ pr_warn("\nShowing all locks held in the system:\n");
- /*
- * Here we try to get the tasklist_lock as hard as possible,
- * if not successful after 2 seconds we ignore it (but keep
- * trying). This is to enable a debug printout even if a
- * tasklist_lock-holding task deadlocks or crashes.
- */
-retry:
- if (!read_trylock(&tasklist_lock)) {
- if (count == 10)
- printk("hm, tasklist_lock locked, retrying... ");
- if (count) {
- count--;
- printk(" #%d", 10-count);
- mdelay(200);
- goto retry;
- }
- printk(" ignoring it.\n");
- unlock = 0;
- } else {
- if (count != 10)
- printk(KERN_CONT " locked it.\n");
- }
-
- do_each_thread(g, p) {
- /*
- * It's not reliable to print a task's held locks
- * if it's not sleeping (or if it's not the current
- * task):
- */
- if (p->state == TASK_RUNNING && p != current)
+ rcu_read_lock();
+ for_each_process_thread(g, p) {
+ if (!p->lockdep_depth)
continue;
- if (p->lockdep_depth)
- lockdep_print_held_locks(p);
- if (!unlock)
- if (read_trylock(&tasklist_lock))
- unlock = 1;
- } while_each_thread(g, p);
-
- printk("\n");
- printk("=============================================\n\n");
+ lockdep_print_held_locks(p);
+ touch_nmi_watchdog();
+ touch_all_softlockup_watchdogs();
+ }
+ rcu_read_unlock();
- if (unlock)
- read_unlock(&tasklist_lock);
+ pr_warn("\n");
+ pr_warn("=============================================\n\n");
}
EXPORT_SYMBOL_GPL(debug_show_all_locks);
#endif
@@ -4241,44 +6809,50 @@ asmlinkage __visible void lockdep_sys_exit(void)
if (unlikely(curr->lockdep_depth)) {
if (!debug_locks_off())
return;
- printk("\n");
- printk("================================================\n");
- printk("[ BUG: lock held when returning to user space! ]\n");
+ nbcon_cpu_emergency_enter();
+ pr_warn("\n");
+ pr_warn("================================================\n");
+ pr_warn("WARNING: lock held when returning to user space!\n");
print_kernel_ident();
- printk("------------------------------------------------\n");
- printk("%s/%d is leaving the kernel with locks still held!\n",
+ pr_warn("------------------------------------------------\n");
+ pr_warn("%s/%d is leaving the kernel with locks still held!\n",
curr->comm, curr->pid);
lockdep_print_held_locks(curr);
+ nbcon_cpu_emergency_exit();
}
+
+ /*
+ * The lock history for each syscall should be independent. So wipe the
+ * slate clean on return to userspace.
+ */
+ lockdep_invariant_state(false);
}
void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
{
struct task_struct *curr = current;
+ int dl = READ_ONCE(debug_locks);
+ bool rcu = warn_rcu_enter();
-#ifndef CONFIG_PROVE_RCU_REPEATEDLY
- if (!debug_locks_off())
- return;
-#endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */
/* Note: the following can be executed concurrently, so be careful. */
- printk("\n");
- printk("===============================\n");
- printk("[ INFO: suspicious RCU usage. ]\n");
+ nbcon_cpu_emergency_enter();
+ pr_warn("\n");
+ pr_warn("=============================\n");
+ pr_warn("WARNING: suspicious RCU usage\n");
print_kernel_ident();
- printk("-------------------------------\n");
- printk("%s:%d %s!\n", file, line, s);
- printk("\nother info that might help us debug this:\n\n");
- printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n",
+ pr_warn("-----------------------------\n");
+ pr_warn("%s:%d %s!\n", file, line, s);
+ pr_warn("\nother info that might help us debug this:\n\n");
+ pr_warn("\n%srcu_scheduler_active = %d, debug_locks = %d\n%s",
!rcu_lockdep_current_cpu_online()
? "RCU used illegally from offline CPU!\n"
- : !rcu_is_watching()
- ? "RCU used illegally from idle CPU!\n"
- : "",
- rcu_scheduler_active, debug_locks);
+ : "",
+ rcu_scheduler_active, dl,
+ dl ? "" : "Possible false positive due to lockdep disabling via debug_locks = 0\n");
/*
* If a CPU is in the RCU-free window in idle (ie: in the section
- * between rcu_idle_enter() and rcu_idle_exit(), then RCU
+ * between ct_idle_enter() and ct_idle_exit(), then RCU
* considers that CPU to be in an "extended quiescent state",
* which means that RCU will be completely ignoring that CPU.
* Therefore, rcu_read_lock() and friends have absolutely no
@@ -4295,10 +6869,12 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
* rcu_read_lock_bh() and so on from extended quiescent states.
*/
if (!rcu_is_watching())
- printk("RCU used illegally from extended quiescent state!\n");
+ pr_warn("RCU used illegally from extended quiescent state!\n");
lockdep_print_held_locks(curr);
- printk("\nstack backtrace:\n");
+ pr_warn("\nstack backtrace:\n");
dump_stack();
+ nbcon_cpu_emergency_exit();
+ warn_rcu_exit(rcu);
}
EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious);
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h
index 51c4b24b6328..0e5e6ffe91a3 100644
--- a/kernel/locking/lockdep_internals.h
+++ b/kernel/locking/lockdep_internals.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* kernel/lockdep_internals.h
*
@@ -18,9 +19,17 @@ enum lock_usage_bit {
#include "lockdep_states.h"
#undef LOCKDEP_STATE
LOCK_USED,
- LOCK_USAGE_STATES
+ LOCK_USED_READ,
+ LOCK_USAGE_STATES,
};
+/* states after LOCK_USED_READ are not traced and printed */
+static_assert(LOCK_TRACE_STATES == LOCK_USAGE_STATES);
+
+#define LOCK_USAGE_READ_MASK 1
+#define LOCK_USAGE_DIR_MASK 2
+#define LOCK_USAGE_STATE_MASK (~(LOCK_USAGE_READ_MASK | LOCK_USAGE_DIR_MASK))
+
/*
* Usage-state bitmasks:
*/
@@ -35,17 +44,50 @@ enum {
#include "lockdep_states.h"
#undef LOCKDEP_STATE
__LOCKF(USED)
+ __LOCKF(USED_READ)
};
-#define LOCKF_ENABLED_IRQ (LOCKF_ENABLED_HARDIRQ | LOCKF_ENABLED_SOFTIRQ)
-#define LOCKF_USED_IN_IRQ (LOCKF_USED_IN_HARDIRQ | LOCKF_USED_IN_SOFTIRQ)
+enum {
+#define LOCKDEP_STATE(__STATE) LOCKF_ENABLED_##__STATE |
+ LOCKF_ENABLED_IRQ =
+#include "lockdep_states.h"
+ 0,
+#undef LOCKDEP_STATE
-#define LOCKF_ENABLED_IRQ_READ \
- (LOCKF_ENABLED_HARDIRQ_READ | LOCKF_ENABLED_SOFTIRQ_READ)
-#define LOCKF_USED_IN_IRQ_READ \
- (LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ)
+#define LOCKDEP_STATE(__STATE) LOCKF_USED_IN_##__STATE |
+ LOCKF_USED_IN_IRQ =
+#include "lockdep_states.h"
+ 0,
+#undef LOCKDEP_STATE
+
+#define LOCKDEP_STATE(__STATE) LOCKF_ENABLED_##__STATE##_READ |
+ LOCKF_ENABLED_IRQ_READ =
+#include "lockdep_states.h"
+ 0,
+#undef LOCKDEP_STATE
+
+#define LOCKDEP_STATE(__STATE) LOCKF_USED_IN_##__STATE##_READ |
+ LOCKF_USED_IN_IRQ_READ =
+#include "lockdep_states.h"
+ 0,
+#undef LOCKDEP_STATE
+};
+
+#define LOCKF_ENABLED_IRQ_ALL (LOCKF_ENABLED_IRQ | LOCKF_ENABLED_IRQ_READ)
+#define LOCKF_USED_IN_IRQ_ALL (LOCKF_USED_IN_IRQ | LOCKF_USED_IN_IRQ_READ)
+
+#define LOCKF_IRQ (LOCKF_ENABLED_IRQ | LOCKF_USED_IN_IRQ)
+#define LOCKF_IRQ_READ (LOCKF_ENABLED_IRQ_READ | LOCKF_USED_IN_IRQ_READ)
/*
+ * CONFIG_LOCKDEP_SMALL is defined for sparc. Sparc requires .text,
+ * .data and .bss to fit in required 32MB limit for the kernel. With
+ * CONFIG_LOCKDEP we could go over this limit and cause system boot-up problems.
+ * So, reduce the static allocations for lockdeps related structures so that
+ * everything fits in current required size limit.
+ */
+#ifdef CONFIG_LOCKDEP_SMALL
+/*
* MAX_LOCKDEP_ENTRIES is the maximum number of lock dependencies
* we track.
*
@@ -54,48 +96,76 @@ enum {
* table (if it's not there yet), and we check it for lock order
* conflicts and deadlocks.
*/
-#define MAX_LOCKDEP_ENTRIES 32768UL
-
-#define MAX_LOCKDEP_CHAINS_BITS 16
-#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS)
+#define MAX_LOCKDEP_ENTRIES 16384UL
+#define MAX_LOCKDEP_CHAINS_BITS 15
+#define MAX_STACK_TRACE_ENTRIES 262144UL
+#define STACK_TRACE_HASH_SIZE 8192
+#else
+#define MAX_LOCKDEP_ENTRIES (1UL << CONFIG_LOCKDEP_BITS)
-#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5)
+#define MAX_LOCKDEP_CHAINS_BITS CONFIG_LOCKDEP_CHAINS_BITS
/*
* Stack-trace: tightly packed array of stack backtrace
* addresses. Protected by the hash_lock.
*/
-#define MAX_STACK_TRACE_ENTRIES 524288UL
+#define MAX_STACK_TRACE_ENTRIES (1UL << CONFIG_LOCKDEP_STACK_TRACE_BITS)
+#define STACK_TRACE_HASH_SIZE (1 << CONFIG_LOCKDEP_STACK_TRACE_HASH_BITS)
+#endif
+
+/*
+ * Bit definitions for lock_chain.irq_context
+ */
+#define LOCK_CHAIN_SOFTIRQ_CONTEXT (1 << 0)
+#define LOCK_CHAIN_HARDIRQ_CONTEXT (1 << 1)
+
+#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS)
+
+#define AVG_LOCKDEP_CHAIN_DEPTH 5
+#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS * AVG_LOCKDEP_CHAIN_DEPTH)
-extern struct list_head all_lock_classes;
extern struct lock_chain lock_chains[];
-#define LOCK_USAGE_CHARS (1+LOCK_USAGE_STATES/2)
+#define LOCK_USAGE_CHARS (2*XXX_LOCK_USAGE_STATES + 1)
extern void get_usage_chars(struct lock_class *class,
char usage[LOCK_USAGE_CHARS]);
-extern const char * __get_key_name(struct lockdep_subclass_key *key, char *str);
+extern const char *__get_key_name(const struct lockdep_subclass_key *key,
+ char *str);
struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i);
extern unsigned long nr_lock_classes;
+extern unsigned long nr_zapped_classes;
+extern unsigned long nr_zapped_lock_chains;
extern unsigned long nr_list_entries;
-extern unsigned long nr_lock_chains;
-extern int nr_chain_hlocks;
+extern unsigned long nr_dynamic_keys;
+long lockdep_next_lockchain(long i);
+unsigned long lock_chain_count(void);
extern unsigned long nr_stack_trace_entries;
extern unsigned int nr_hardirq_chains;
extern unsigned int nr_softirq_chains;
extern unsigned int nr_process_chains;
-extern unsigned int max_lockdep_depth;
-extern unsigned int max_recursion_depth;
+extern unsigned int nr_free_chain_hlocks;
+extern unsigned int nr_lost_chain_hlocks;
+extern unsigned int nr_large_chain_blocks;
+extern unsigned int max_lockdep_depth;
extern unsigned int max_bfs_queue_depth;
+extern unsigned long max_lock_class_idx;
+
+extern struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
+extern unsigned long lock_classes_in_use[];
#ifdef CONFIG_PROVE_LOCKING
extern unsigned long lockdep_count_forward_deps(struct lock_class *);
extern unsigned long lockdep_count_backward_deps(struct lock_class *);
+#ifdef CONFIG_TRACE_IRQFLAGS
+u64 lockdep_stack_trace_count(void);
+u64 lockdep_stack_hash_count(void);
+#endif
#else
static inline unsigned long
lockdep_count_forward_deps(struct lock_class *class)
@@ -118,23 +188,27 @@ lockdep_count_backward_deps(struct lock_class *class)
* and we want to avoid too much cache bouncing.
*/
struct lockdep_stats {
- int chain_lookup_hits;
- int chain_lookup_misses;
- int hardirqs_on_events;
- int hardirqs_off_events;
- int redundant_hardirqs_on;
- int redundant_hardirqs_off;
- int softirqs_on_events;
- int softirqs_off_events;
- int redundant_softirqs_on;
- int redundant_softirqs_off;
- int nr_unused_locks;
- int nr_cyclic_checks;
- int nr_cyclic_check_recursions;
- int nr_find_usage_forwards_checks;
- int nr_find_usage_forwards_recursions;
- int nr_find_usage_backwards_checks;
- int nr_find_usage_backwards_recursions;
+ unsigned long chain_lookup_hits;
+ unsigned int chain_lookup_misses;
+ unsigned long hardirqs_on_events;
+ unsigned long hardirqs_off_events;
+ unsigned long redundant_hardirqs_on;
+ unsigned long redundant_hardirqs_off;
+ unsigned long softirqs_on_events;
+ unsigned long softirqs_off_events;
+ unsigned long redundant_softirqs_on;
+ unsigned long redundant_softirqs_off;
+ int nr_unused_locks;
+ unsigned int nr_redundant_checks;
+ unsigned int nr_redundant;
+ unsigned int nr_cyclic_checks;
+ unsigned int nr_find_usage_forwards_checks;
+ unsigned int nr_find_usage_backwards_checks;
+
+ /*
+ * Per lock class locking operation stat counts
+ */
+ unsigned long lock_class_ops[MAX_LOCKDEP_KEYS];
};
DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats);
@@ -162,9 +236,30 @@ DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats);
} \
__total; \
})
+
+static inline void debug_class_ops_inc(struct lock_class *class)
+{
+ int idx;
+
+ idx = class - lock_classes;
+ __debug_atomic_inc(lock_class_ops[idx]);
+}
+
+static inline unsigned long debug_class_ops_read(struct lock_class *class)
+{
+ int idx, cpu;
+ unsigned long ops = 0;
+
+ idx = class - lock_classes;
+ for_each_possible_cpu(cpu)
+ ops += per_cpu(lockdep_stats.lock_class_ops[idx], cpu);
+ return ops;
+}
+
#else
# define __debug_atomic_inc(ptr) do { } while (0)
# define debug_atomic_inc(ptr) do { } while (0)
# define debug_atomic_dec(ptr) do { } while (0)
# define debug_atomic_read(ptr) 0
+# define debug_class_ops_inc(ptr) do { } while (0)
#endif
diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index d83d798bef95..1916db9aa46b 100644
--- a/kernel/locking/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* kernel/lockdep_proc.c
*
@@ -6,7 +7,7 @@
* Started by Ingo Molnar:
*
* Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
- * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
*
* Code for /proc/lockdep and /proc/lockdep_stats:
*
@@ -18,19 +19,38 @@
#include <linux/debug_locks.h>
#include <linux/vmalloc.h>
#include <linux/sort.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/div64.h>
#include "lockdep_internals.h"
+/*
+ * Since iteration of lock_classes is done without holding the lockdep lock,
+ * it is not safe to iterate all_lock_classes list directly as the iteration
+ * may branch off to free_lock_classes or the zapped list. Iteration is done
+ * directly on the lock_classes array by checking the lock_classes_in_use
+ * bitmap and max_lock_class_idx.
+ */
+#define iterate_lock_classes(idx, class) \
+ for (idx = 0, class = lock_classes; idx <= max_lock_class_idx; \
+ idx++, class++)
+
static void *l_next(struct seq_file *m, void *v, loff_t *pos)
{
- return seq_list_next(v, &all_lock_classes, pos);
+ struct lock_class *class = v;
+
+ ++class;
+ *pos = class - lock_classes;
+ return (*pos > max_lock_class_idx) ? NULL : class;
}
static void *l_start(struct seq_file *m, loff_t *pos)
{
- return seq_list_start_head(&all_lock_classes, *pos);
+ unsigned long idx = *pos;
+
+ if (idx > max_lock_class_idx)
+ return NULL;
+ return lock_classes + idx;
}
static void l_stop(struct seq_file *m, void *v)
@@ -56,39 +76,43 @@ static void print_name(struct seq_file *m, struct lock_class *class)
static int l_show(struct seq_file *m, void *v)
{
- struct lock_class *class = list_entry(v, struct lock_class, lock_entry);
+ struct lock_class *class = v;
struct lock_list *entry;
char usage[LOCK_USAGE_CHARS];
+ int idx = class - lock_classes;
- if (v == &all_lock_classes) {
+ if (v == lock_classes)
seq_printf(m, "all lock classes:\n");
+
+ if (!test_bit(idx, lock_classes_in_use))
return 0;
- }
seq_printf(m, "%p", class->key);
#ifdef CONFIG_DEBUG_LOCKDEP
- seq_printf(m, " OPS:%8ld", class->ops);
-#endif
-#ifdef CONFIG_PROVE_LOCKING
- seq_printf(m, " FD:%5ld", lockdep_count_forward_deps(class));
- seq_printf(m, " BD:%5ld", lockdep_count_backward_deps(class));
+ seq_printf(m, " OPS:%8ld", debug_class_ops_read(class));
#endif
+ if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
+ seq_printf(m, " FD:%5ld", lockdep_count_forward_deps(class));
+ seq_printf(m, " BD:%5ld", lockdep_count_backward_deps(class));
- get_usage_chars(class, usage);
- seq_printf(m, " %s", usage);
+ get_usage_chars(class, usage);
+ seq_printf(m, " %s", usage);
+ }
seq_printf(m, ": ");
print_name(m, class);
seq_puts(m, "\n");
- list_for_each_entry(entry, &class->locks_after, entry) {
- if (entry->distance == 1) {
- seq_printf(m, " -> [%p] ", entry->class->key);
- print_name(m, entry->class);
- seq_puts(m, "\n");
+ if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
+ list_for_each_entry(entry, &class->locks_after, entry) {
+ if (entry->distance == 1) {
+ seq_printf(m, " -> [%p] ", entry->class->key);
+ print_name(m, entry->class);
+ seq_puts(m, "\n");
+ }
}
+ seq_puts(m, "\n");
}
- seq_puts(m, "\n");
return 0;
}
@@ -100,33 +124,21 @@ static const struct seq_operations lockdep_ops = {
.show = l_show,
};
-static int lockdep_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &lockdep_ops);
-}
-
-static const struct file_operations proc_lockdep_operations = {
- .open = lockdep_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
#ifdef CONFIG_PROVE_LOCKING
static void *lc_start(struct seq_file *m, loff_t *pos)
{
+ if (*pos < 0)
+ return NULL;
+
if (*pos == 0)
return SEQ_START_TOKEN;
- if (*pos - 1 < nr_lock_chains)
- return lock_chains + (*pos - 1);
-
- return NULL;
+ return lock_chains + (*pos - 1);
}
static void *lc_next(struct seq_file *m, void *v, loff_t *pos)
{
- (*pos)++;
+ *pos = lockdep_next_lockchain(*pos - 1) + 1;
return lc_start(m, pos);
}
@@ -139,13 +151,22 @@ static int lc_show(struct seq_file *m, void *v)
struct lock_chain *chain = v;
struct lock_class *class;
int i;
+ static const char * const irq_strs[] = {
+ [0] = "0",
+ [LOCK_CHAIN_HARDIRQ_CONTEXT] = "hardirq",
+ [LOCK_CHAIN_SOFTIRQ_CONTEXT] = "softirq",
+ [LOCK_CHAIN_SOFTIRQ_CONTEXT|
+ LOCK_CHAIN_HARDIRQ_CONTEXT] = "hardirq|softirq",
+ };
if (v == SEQ_START_TOKEN) {
+ if (!nr_free_chain_hlocks)
+ seq_printf(m, "(buggered) ");
seq_printf(m, "all lock chains:\n");
return 0;
}
- seq_printf(m, "irq_context: %d\n", chain->irq_context);
+ seq_printf(m, "irq_context: %s\n", irq_strs[chain->irq_context]);
for (i = 0; i < chain->depth; i++) {
class = lock_chain_get_class(chain, i);
@@ -167,18 +188,6 @@ static const struct seq_operations lockdep_chains_ops = {
.stop = lc_stop,
.show = lc_show,
};
-
-static int lockdep_chains_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &lockdep_chains_ops);
-}
-
-static const struct file_operations proc_lockdep_chains_operations = {
- .open = lockdep_chains_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
#endif /* CONFIG_PROVE_LOCKING */
static void lockdep_stats_debug_show(struct seq_file *m)
@@ -199,6 +208,10 @@ static void lockdep_stats_debug_show(struct seq_file *m)
debug_atomic_read(chain_lookup_hits));
seq_printf(m, " cyclic checks: %11llu\n",
debug_atomic_read(nr_cyclic_checks));
+ seq_printf(m, " redundant checks: %11llu\n",
+ debug_atomic_read(nr_redundant_checks));
+ seq_printf(m, " redundant links: %11llu\n",
+ debug_atomic_read(nr_redundant));
seq_printf(m, " find-mask forwards checks: %11llu\n",
debug_atomic_read(nr_find_usage_forwards_checks));
seq_printf(m, " find-mask backwards checks: %11llu\n",
@@ -217,7 +230,6 @@ static void lockdep_stats_debug_show(struct seq_file *m)
static int lockdep_stats_show(struct seq_file *m, void *v)
{
- struct lock_class *class;
unsigned long nr_unused = 0, nr_uncategorized = 0,
nr_irq_safe = 0, nr_irq_unsafe = 0,
nr_softirq_safe = 0, nr_softirq_unsafe = 0,
@@ -227,7 +239,13 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
nr_hardirq_read_safe = 0, nr_hardirq_read_unsafe = 0,
sum_forward_deps = 0;
- list_for_each_entry(class, &all_lock_classes, lock_entry) {
+#ifdef CONFIG_PROVE_LOCKING
+ struct lock_class *class;
+ unsigned long idx;
+
+ iterate_lock_classes(idx, class) {
+ if (!test_bit(idx, lock_classes_in_use))
+ continue;
if (class->usage_mask == 0)
nr_unused++;
@@ -258,15 +276,18 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
if (class->usage_mask & LOCKF_ENABLED_HARDIRQ_READ)
nr_hardirq_read_unsafe++;
-#ifdef CONFIG_PROVE_LOCKING
sum_forward_deps += lockdep_count_forward_deps(class);
-#endif
}
+
#ifdef CONFIG_DEBUG_LOCKDEP
DEBUG_LOCKS_WARN_ON(debug_atomic_read(nr_unused_locks) != nr_unused);
#endif
+
+#endif
seq_printf(m, " lock-classes: %11lu [max: %lu]\n",
nr_lock_classes, MAX_LOCKDEP_KEYS);
+ seq_printf(m, " dynamic-keys: %11lu\n",
+ nr_dynamic_keys);
seq_printf(m, " direct dependencies: %11lu [max: %lu]\n",
nr_list_entries, MAX_LOCKDEP_ENTRIES);
seq_printf(m, " indirect dependencies: %11lu\n",
@@ -285,9 +306,13 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
#ifdef CONFIG_PROVE_LOCKING
seq_printf(m, " dependency chains: %11lu [max: %lu]\n",
- nr_lock_chains, MAX_LOCKDEP_CHAINS);
- seq_printf(m, " dependency chain hlocks: %11d [max: %lu]\n",
- nr_chain_hlocks, MAX_LOCKDEP_CHAIN_HLOCKS);
+ lock_chain_count(), MAX_LOCKDEP_CHAINS);
+ seq_printf(m, " dependency chain hlocks used: %11lu [max: %lu]\n",
+ MAX_LOCKDEP_CHAIN_HLOCKS -
+ (nr_free_chain_hlocks + nr_lost_chain_hlocks),
+ MAX_LOCKDEP_CHAIN_HLOCKS);
+ seq_printf(m, " dependency chain hlocks lost: %11u\n",
+ nr_lost_chain_hlocks);
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
@@ -300,6 +325,12 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
nr_process_chains);
seq_printf(m, " stack-trace entries: %11lu [max: %lu]\n",
nr_stack_trace_entries, MAX_STACK_TRACE_ENTRIES);
+#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
+ seq_printf(m, " number of stack traces: %11llu\n",
+ lockdep_stack_trace_count());
+ seq_printf(m, " number of stack hash chains: %11llu\n",
+ lockdep_stack_hash_count());
+#endif
seq_printf(m, " combined max dependencies: %11u\n",
(nr_hardirq_chains + 1) *
(nr_softirq_chains + 1) *
@@ -341,25 +372,27 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
seq_printf(m, " max bfs queue depth: %11u\n",
max_bfs_queue_depth);
#endif
+ seq_printf(m, " max lock class index: %11lu\n",
+ max_lock_class_idx);
lockdep_stats_debug_show(m);
seq_printf(m, " debug_locks: %11u\n",
debug_locks);
+ /*
+ * Zapped classes and lockdep data buffers reuse statistics.
+ */
+ seq_puts(m, "\n");
+ seq_printf(m, " zapped classes: %11lu\n",
+ nr_zapped_classes);
+#ifdef CONFIG_PROVE_LOCKING
+ seq_printf(m, " zapped lock chains: %11lu\n",
+ nr_zapped_lock_chains);
+ seq_printf(m, " large chain blocks: %11u\n",
+ nr_large_chain_blocks);
+#endif
return 0;
}
-static int lockdep_stats_open(struct inode *inode, struct file *file)
-{
- return single_open(file, lockdep_stats_show, NULL);
-}
-
-static const struct file_operations proc_lockdep_stats_operations = {
- .open = lockdep_stats_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
#ifdef CONFIG_LOCK_STAT
struct lock_stat_data {
@@ -393,7 +426,7 @@ static void seq_line(struct seq_file *m, char c, int offset, int length)
for (i = 0; i < offset; i++)
seq_puts(m, " ");
for (i = 0; i < length; i++)
- seq_printf(m, "%c", c);
+ seq_putc(m, c);
seq_puts(m, "\n");
}
@@ -409,7 +442,7 @@ static void snprint_time(char *buf, size_t bufsiz, s64 nr)
static void seq_time(struct seq_file *m, s64 time)
{
- char num[15];
+ char num[22];
snprint_time(num, sizeof(num), time);
seq_printf(m, " %14s", num);
@@ -421,12 +454,12 @@ static void seq_lock_time(struct seq_file *m, struct lock_time *lt)
seq_time(m, lt->min);
seq_time(m, lt->max);
seq_time(m, lt->total);
- seq_time(m, lt->nr ? div_s64(lt->total, lt->nr) : 0);
+ seq_time(m, lt->nr ? div64_u64(lt->total, lt->nr) : 0);
}
static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
{
- struct lockdep_subclass_key *ckey;
+ const struct lockdep_subclass_key *ckey;
struct lock_class_stats *stats;
struct lock_class *class;
const char *cname;
@@ -618,12 +651,16 @@ static int lock_stat_open(struct inode *inode, struct file *file)
if (!res) {
struct lock_stat_data *iter = data->stats;
struct seq_file *m = file->private_data;
+ unsigned long idx;
- list_for_each_entry(class, &all_lock_classes, lock_entry) {
+ iterate_lock_classes(idx, class) {
+ if (!test_bit(idx, lock_classes_in_use))
+ continue;
iter->class = class;
- iter->stats = lock_stats(class);
+ lock_stats(class, &iter->stats);
iter++;
}
+
data->iter_end = iter;
sort(data->stats, data->iter_end - data->stats,
@@ -641,6 +678,7 @@ static ssize_t lock_stat_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
struct lock_class *class;
+ unsigned long idx;
char c;
if (count) {
@@ -650,8 +688,11 @@ static ssize_t lock_stat_write(struct file *file, const char __user *buf,
if (c != '0')
return count;
- list_for_each_entry(class, &all_lock_classes, lock_entry)
+ iterate_lock_classes(idx, class) {
+ if (!test_bit(idx, lock_classes_in_use))
+ continue;
clear_lock_stats(class);
+ }
}
return count;
}
@@ -664,28 +705,24 @@ static int lock_stat_release(struct inode *inode, struct file *file)
return seq_release(inode, file);
}
-static const struct file_operations proc_lock_stat_operations = {
- .open = lock_stat_open,
- .write = lock_stat_write,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = lock_stat_release,
+static const struct proc_ops lock_stat_proc_ops = {
+ .proc_open = lock_stat_open,
+ .proc_write = lock_stat_write,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = lock_stat_release,
};
#endif /* CONFIG_LOCK_STAT */
static int __init lockdep_proc_init(void)
{
- proc_create("lockdep", S_IRUSR, NULL, &proc_lockdep_operations);
+ proc_create_seq("lockdep", S_IRUSR, NULL, &lockdep_ops);
#ifdef CONFIG_PROVE_LOCKING
- proc_create("lockdep_chains", S_IRUSR, NULL,
- &proc_lockdep_chains_operations);
+ proc_create_seq("lockdep_chains", S_IRUSR, NULL, &lockdep_chains_ops);
#endif
- proc_create("lockdep_stats", S_IRUSR, NULL,
- &proc_lockdep_stats_operations);
-
+ proc_create_single("lockdep_stats", S_IRUSR, NULL, lockdep_stats_show);
#ifdef CONFIG_LOCK_STAT
- proc_create("lock_stat", S_IRUSR | S_IWUSR, NULL,
- &proc_lock_stat_operations);
+ proc_create("lock_stat", S_IRUSR | S_IWUSR, NULL, &lock_stat_proc_ops);
#endif
return 0;
diff --git a/kernel/locking/lockdep_states.h b/kernel/locking/lockdep_states.h
index 995b0cc2b84c..35ca09f2ed0b 100644
--- a/kernel/locking/lockdep_states.h
+++ b/kernel/locking/lockdep_states.h
@@ -6,4 +6,3 @@
*/
LOCKDEP_STATE(HARDIRQ)
LOCKDEP_STATE(SOFTIRQ)
-LOCKDEP_STATE(RECLAIM_FS)
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index ec8cce259779..6567e5eeacc0 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -1,85 +1,131 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* Module-based torture test facility for locking
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
* Copyright (C) IBM Corporation, 2014
*
- * Author: Paul E. McKenney <paulmck@us.ibm.com>
+ * Authors: Paul E. McKenney <paulmck@linux.ibm.com>
+ * Davidlohr Bueso <dave@stgolabs.net>
* Based on kernel/rcu/torture.c.
*/
+
+#define pr_fmt(fmt) fmt
+
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kthread.h>
+#include <linux/sched/rt.h>
#include <linux/spinlock.h>
-#include <linux/rwlock.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/sched.h>
+#include <uapi/linux/sched/types.h>
+#include <linux/rtmutex.h>
#include <linux/atomic.h>
#include <linux/moduleparam.h>
#include <linux/delay.h>
#include <linux/slab.h>
#include <linux/torture.h>
+#include <linux/reboot.h>
+MODULE_DESCRIPTION("torture test facility for locking");
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com>");
-
-torture_param(int, nwriters_stress, -1,
- "Number of write-locking stress-test threads");
-torture_param(int, nreaders_stress, -1,
- "Number of read-locking stress-test threads");
+MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>");
+
+torture_param(int, acq_writer_lim, 0, "Write_acquisition time limit (jiffies).");
+torture_param(int, call_rcu_chains, 0, "Self-propagate call_rcu() chains during test (0=disable).");
+torture_param(int, long_hold, 100, "Do occasional long hold of lock (ms), 0=disable");
+torture_param(int, nested_locks, 0, "Number of nested locks (max = 8)");
+torture_param(int, nreaders_stress, -1, "Number of read-locking stress-test threads");
+torture_param(int, nwriters_stress, -1, "Number of write-locking stress-test threads");
torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)");
-torture_param(int, onoff_interval, 0,
- "Time between CPU hotplugs (s), 0=disable");
-torture_param(int, shuffle_interval, 3,
- "Number of jiffies between shuffles, 0=disable");
+torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (s), 0=disable");
+torture_param(int, rt_boost, 2,
+ "Do periodic rt-boost. 0=Disable, 1=Only for rt_mutex, 2=For all lock types.");
+torture_param(int, rt_boost_factor, 50, "A factor determining how often rt-boost happens.");
+torture_param(int, shuffle_interval, 3, "Number of jiffies between shuffles, 0=disable");
torture_param(int, shutdown_secs, 0, "Shutdown time (j), <= zero to disable.");
-torture_param(int, stat_interval, 60,
- "Number of seconds between stats printk()s");
+torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s");
torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable");
-torture_param(bool, verbose, true,
- "Enable verbose debugging printk()s");
+torture_param(int, verbose, 1, "Enable verbose debugging printk()s");
+torture_param(int, writer_fifo, 0, "Run writers at sched_set_fifo() priority");
+/* Going much higher trips "BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!" errors */
+#define MAX_NESTED_LOCKS 8
-static char *torture_type = "spin_lock";
+static char *torture_type = IS_ENABLED(CONFIG_PREEMPT_RT) ? "raw_spin_lock" : "spin_lock";
module_param(torture_type, charp, 0444);
MODULE_PARM_DESC(torture_type,
"Type of lock to torture (spin_lock, spin_lock_irq, mutex_lock, ...)");
+static cpumask_var_t bind_readers; // Bind the readers to the specified set of CPUs.
+static cpumask_var_t bind_writers; // Bind the writers to the specified set of CPUs.
+
+// Parse a cpumask kernel parameter. If there are more users later on,
+// this might need to got to a more central location.
+static int param_set_cpumask(const char *val, const struct kernel_param *kp)
+{
+ cpumask_var_t *cm_bind = kp->arg;
+ int ret;
+ char *s;
+
+ if (!alloc_cpumask_var(cm_bind, GFP_KERNEL)) {
+ s = "Out of memory";
+ ret = -ENOMEM;
+ goto out_err;
+ }
+ ret = cpulist_parse(val, *cm_bind);
+ if (!ret)
+ return ret;
+ s = "Bad CPU range";
+out_err:
+ pr_warn("%s: %s, all CPUs set\n", kp->name, s);
+ cpumask_setall(*cm_bind);
+ return ret;
+}
+
+// Output a cpumask kernel parameter.
+static int param_get_cpumask(char *buffer, const struct kernel_param *kp)
+{
+ cpumask_var_t *cm_bind = kp->arg;
+
+ return sprintf(buffer, "%*pbl", cpumask_pr_args(*cm_bind));
+}
+
+static bool cpumask_nonempty(cpumask_var_t mask)
+{
+ return cpumask_available(mask) && !cpumask_empty(mask);
+}
+
+static const struct kernel_param_ops lt_bind_ops = {
+ .set = param_set_cpumask,
+ .get = param_get_cpumask,
+};
+
+module_param_cb(bind_readers, &lt_bind_ops, &bind_readers, 0444);
+module_param_cb(bind_writers, &lt_bind_ops, &bind_writers, 0444);
+
+long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask, bool dowarn);
+
static struct task_struct *stats_task;
static struct task_struct **writer_tasks;
static struct task_struct **reader_tasks;
static bool lock_is_write_held;
-static bool lock_is_read_held;
+static atomic_t lock_is_read_held;
+static unsigned long last_lock_release;
struct lock_stress_stats {
long n_lock_fail;
long n_lock_acquired;
};
-#if defined(MODULE)
-#define LOCKTORTURE_RUNNABLE_INIT 1
-#else
-#define LOCKTORTURE_RUNNABLE_INIT 0
-#endif
-int torture_runnable = LOCKTORTURE_RUNNABLE_INIT;
-module_param(torture_runnable, int, 0444);
-MODULE_PARM_DESC(torture_runnable, "Start locktorture at module init");
+struct call_rcu_chain {
+ struct rcu_head crc_rh;
+ bool crc_stop;
+};
+struct call_rcu_chain *call_rcu_chain_list;
/* Forward reference. */
static void lock_torture_cleanup(void);
@@ -89,13 +135,18 @@ static void lock_torture_cleanup(void);
*/
struct lock_torture_ops {
void (*init)(void);
- int (*writelock)(void);
+ void (*exit)(void);
+ int (*nested_lock)(int tid, u32 lockset);
+ int (*writelock)(int tid);
void (*write_delay)(struct torture_random_state *trsp);
- void (*writeunlock)(void);
- int (*readlock)(void);
+ void (*task_boost)(struct torture_random_state *trsp);
+ void (*writeunlock)(int tid);
+ void (*nested_unlock)(int tid, u32 lockset);
+ int (*readlock)(int tid);
void (*read_delay)(struct torture_random_state *trsp);
- void (*readunlock)(void);
- unsigned long flags;
+ void (*readunlock)(int tid);
+
+ unsigned long flags; /* for irq spinlocks */
const char *name;
};
@@ -103,45 +154,82 @@ struct lock_torture_cxt {
int nrealwriters_stress;
int nrealreaders_stress;
bool debug_lock;
+ bool init_called;
atomic_t n_lock_torture_errors;
struct lock_torture_ops *cur_ops;
struct lock_stress_stats *lwsa; /* writer statistics */
struct lock_stress_stats *lrsa; /* reader statistics */
};
-static struct lock_torture_cxt cxt = { 0, 0, false,
+static struct lock_torture_cxt cxt = { 0, 0, false, false,
ATOMIC_INIT(0),
NULL, NULL};
/*
* Definitions for lock torture testing.
*/
-static int torture_lock_busted_write_lock(void)
+static int torture_lock_busted_write_lock(int tid __maybe_unused)
{
return 0; /* BUGGY, do not use in real life!!! */
}
static void torture_lock_busted_write_delay(struct torture_random_state *trsp)
{
- const unsigned long longdelay_us = 100;
-
/* We want a long delay occasionally to force massive contention. */
- if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2000 * longdelay_us)))
- mdelay(longdelay_us);
-#ifdef CONFIG_PREEMPT
+ if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold)))
+ mdelay(long_hold);
if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
- preempt_schedule(); /* Allow test to be preempted. */
-#endif
+ torture_preempt_schedule(); /* Allow test to be preempted. */
}
-static void torture_lock_busted_write_unlock(void)
+static void torture_lock_busted_write_unlock(int tid __maybe_unused)
{
/* BUGGY, do not use in real life!!! */
}
+static void __torture_rt_boost(struct torture_random_state *trsp)
+{
+ const unsigned int factor = rt_boost_factor;
+
+ if (!rt_task(current)) {
+ /*
+ * Boost priority once every rt_boost_factor operations. When
+ * the task tries to take the lock, the rtmutex it will account
+ * for the new priority, and do any corresponding pi-dance.
+ */
+ if (trsp && !(torture_random(trsp) %
+ (cxt.nrealwriters_stress * factor))) {
+ sched_set_fifo(current);
+ } else /* common case, do nothing */
+ return;
+ } else {
+ /*
+ * The task will remain boosted for another 10 * rt_boost_factor
+ * operations, then restored back to its original prio, and so
+ * forth.
+ *
+ * When @trsp is nil, we want to force-reset the task for
+ * stopping the kthread.
+ */
+ if (!trsp || !(torture_random(trsp) %
+ (cxt.nrealwriters_stress * factor * 2))) {
+ sched_set_normal(current, 0);
+ } else /* common case, do nothing */
+ return;
+ }
+}
+
+static void torture_rt_boost(struct torture_random_state *trsp)
+{
+ if (rt_boost != 2)
+ return;
+
+ __torture_rt_boost(trsp);
+}
+
static struct lock_torture_ops lock_busted_ops = {
.writelock = torture_lock_busted_write_lock,
.write_delay = torture_lock_busted_write_delay,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_lock_busted_write_unlock,
.readlock = NULL,
.read_delay = NULL,
@@ -151,7 +239,8 @@ static struct lock_torture_ops lock_busted_ops = {
static DEFINE_SPINLOCK(torture_spinlock);
-static int torture_spin_lock_write_lock(void) __acquires(torture_spinlock)
+static int torture_spin_lock_write_lock(int tid __maybe_unused)
+__acquires(torture_spinlock)
{
spin_lock(&torture_spinlock);
return 0;
@@ -160,24 +249,24 @@ static int torture_spin_lock_write_lock(void) __acquires(torture_spinlock)
static void torture_spin_lock_write_delay(struct torture_random_state *trsp)
{
const unsigned long shortdelay_us = 2;
- const unsigned long longdelay_us = 100;
+ unsigned long j;
/* We want a short delay mostly to emulate likely code, and
* we want a long delay occasionally to force massive contention.
*/
- if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2000 * longdelay_us)))
- mdelay(longdelay_us);
- if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2 * shortdelay_us)))
+ if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold))) {
+ j = jiffies;
+ mdelay(long_hold);
+ pr_alert("%s: delay = %lu jiffies.\n", __func__, jiffies - j);
+ }
+ if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 200 * shortdelay_us)))
udelay(shortdelay_us);
-#ifdef CONFIG_PREEMPT
if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
- preempt_schedule(); /* Allow test to be preempted. */
-#endif
+ torture_preempt_schedule(); /* Allow test to be preempted. */
}
-static void torture_spin_lock_write_unlock(void) __releases(torture_spinlock)
+static void torture_spin_lock_write_unlock(int tid __maybe_unused)
+__releases(torture_spinlock)
{
spin_unlock(&torture_spinlock);
}
@@ -185,6 +274,7 @@ static void torture_spin_lock_write_unlock(void) __releases(torture_spinlock)
static struct lock_torture_ops spin_lock_ops = {
.writelock = torture_spin_lock_write_lock,
.write_delay = torture_spin_lock_write_delay,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_spin_lock_write_unlock,
.readlock = NULL,
.read_delay = NULL,
@@ -192,7 +282,7 @@ static struct lock_torture_ops spin_lock_ops = {
.name = "spin_lock"
};
-static int torture_spin_lock_write_lock_irq(void)
+static int torture_spin_lock_write_lock_irq(int tid __maybe_unused)
__acquires(torture_spinlock)
{
unsigned long flags;
@@ -202,7 +292,7 @@ __acquires(torture_spinlock)
return 0;
}
-static void torture_lock_spin_write_unlock_irq(void)
+static void torture_lock_spin_write_unlock_irq(int tid __maybe_unused)
__releases(torture_spinlock)
{
spin_unlock_irqrestore(&torture_spinlock, cxt.cur_ops->flags);
@@ -211,6 +301,7 @@ __releases(torture_spinlock)
static struct lock_torture_ops spin_lock_irq_ops = {
.writelock = torture_spin_lock_write_lock_irq,
.write_delay = torture_spin_lock_write_delay,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_lock_spin_write_unlock_irq,
.readlock = NULL,
.read_delay = NULL,
@@ -218,9 +309,117 @@ static struct lock_torture_ops spin_lock_irq_ops = {
.name = "spin_lock_irq"
};
+static DEFINE_RAW_SPINLOCK(torture_raw_spinlock);
+
+static int torture_raw_spin_lock_write_lock(int tid __maybe_unused)
+__acquires(torture_raw_spinlock)
+{
+ raw_spin_lock(&torture_raw_spinlock);
+ return 0;
+}
+
+static void torture_raw_spin_lock_write_unlock(int tid __maybe_unused)
+__releases(torture_raw_spinlock)
+{
+ raw_spin_unlock(&torture_raw_spinlock);
+}
+
+static struct lock_torture_ops raw_spin_lock_ops = {
+ .writelock = torture_raw_spin_lock_write_lock,
+ .write_delay = torture_spin_lock_write_delay,
+ .task_boost = torture_rt_boost,
+ .writeunlock = torture_raw_spin_lock_write_unlock,
+ .readlock = NULL,
+ .read_delay = NULL,
+ .readunlock = NULL,
+ .name = "raw_spin_lock"
+};
+
+static int torture_raw_spin_lock_write_lock_irq(int tid __maybe_unused)
+__acquires(torture_raw_spinlock)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&torture_raw_spinlock, flags);
+ cxt.cur_ops->flags = flags;
+ return 0;
+}
+
+static void torture_raw_spin_lock_write_unlock_irq(int tid __maybe_unused)
+__releases(torture_raw_spinlock)
+{
+ raw_spin_unlock_irqrestore(&torture_raw_spinlock, cxt.cur_ops->flags);
+}
+
+static struct lock_torture_ops raw_spin_lock_irq_ops = {
+ .writelock = torture_raw_spin_lock_write_lock_irq,
+ .write_delay = torture_spin_lock_write_delay,
+ .task_boost = torture_rt_boost,
+ .writeunlock = torture_raw_spin_lock_write_unlock_irq,
+ .readlock = NULL,
+ .read_delay = NULL,
+ .readunlock = NULL,
+ .name = "raw_spin_lock_irq"
+};
+
+#ifdef CONFIG_BPF_SYSCALL
+
+#include <asm/rqspinlock.h>
+static rqspinlock_t rqspinlock;
+
+static int torture_raw_res_spin_write_lock(int tid __maybe_unused)
+{
+ raw_res_spin_lock(&rqspinlock);
+ return 0;
+}
+
+static void torture_raw_res_spin_write_unlock(int tid __maybe_unused)
+{
+ raw_res_spin_unlock(&rqspinlock);
+}
+
+static struct lock_torture_ops raw_res_spin_lock_ops = {
+ .writelock = torture_raw_res_spin_write_lock,
+ .write_delay = torture_spin_lock_write_delay,
+ .task_boost = torture_rt_boost,
+ .writeunlock = torture_raw_res_spin_write_unlock,
+ .readlock = NULL,
+ .read_delay = NULL,
+ .readunlock = NULL,
+ .name = "raw_res_spin_lock"
+};
+
+static int torture_raw_res_spin_write_lock_irq(int tid __maybe_unused)
+{
+ unsigned long flags;
+
+ raw_res_spin_lock_irqsave(&rqspinlock, flags);
+ cxt.cur_ops->flags = flags;
+ return 0;
+}
+
+static void torture_raw_res_spin_write_unlock_irq(int tid __maybe_unused)
+{
+ raw_res_spin_unlock_irqrestore(&rqspinlock, cxt.cur_ops->flags);
+}
+
+static struct lock_torture_ops raw_res_spin_lock_irq_ops = {
+ .writelock = torture_raw_res_spin_write_lock_irq,
+ .write_delay = torture_spin_lock_write_delay,
+ .task_boost = torture_rt_boost,
+ .writeunlock = torture_raw_res_spin_write_unlock_irq,
+ .readlock = NULL,
+ .read_delay = NULL,
+ .readunlock = NULL,
+ .name = "raw_res_spin_lock_irq"
+};
+
+#endif
+
static DEFINE_RWLOCK(torture_rwlock);
-static int torture_rwlock_write_lock(void) __acquires(torture_rwlock)
+static int torture_rwlock_write_lock(int tid __maybe_unused)
+__acquires(torture_rwlock)
{
write_lock(&torture_rwlock);
return 0;
@@ -229,24 +428,24 @@ static int torture_rwlock_write_lock(void) __acquires(torture_rwlock)
static void torture_rwlock_write_delay(struct torture_random_state *trsp)
{
const unsigned long shortdelay_us = 2;
- const unsigned long longdelay_ms = 100;
/* We want a short delay mostly to emulate likely code, and
* we want a long delay occasionally to force massive contention.
*/
- if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
- mdelay(longdelay_ms);
+ if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold)))
+ mdelay(long_hold);
else
udelay(shortdelay_us);
}
-static void torture_rwlock_write_unlock(void) __releases(torture_rwlock)
+static void torture_rwlock_write_unlock(int tid __maybe_unused)
+__releases(torture_rwlock)
{
write_unlock(&torture_rwlock);
}
-static int torture_rwlock_read_lock(void) __acquires(torture_rwlock)
+static int torture_rwlock_read_lock(int tid __maybe_unused)
+__acquires(torture_rwlock)
{
read_lock(&torture_rwlock);
return 0;
@@ -255,19 +454,18 @@ static int torture_rwlock_read_lock(void) __acquires(torture_rwlock)
static void torture_rwlock_read_delay(struct torture_random_state *trsp)
{
const unsigned long shortdelay_us = 10;
- const unsigned long longdelay_ms = 100;
/* We want a short delay mostly to emulate likely code, and
* we want a long delay occasionally to force massive contention.
*/
- if (!(torture_random(trsp) %
- (cxt.nrealreaders_stress * 2000 * longdelay_ms)))
- mdelay(longdelay_ms);
+ if (long_hold && !(torture_random(trsp) % (cxt.nrealreaders_stress * 2000 * long_hold)))
+ mdelay(long_hold);
else
udelay(shortdelay_us);
}
-static void torture_rwlock_read_unlock(void) __releases(torture_rwlock)
+static void torture_rwlock_read_unlock(int tid __maybe_unused)
+__releases(torture_rwlock)
{
read_unlock(&torture_rwlock);
}
@@ -275,6 +473,7 @@ static void torture_rwlock_read_unlock(void) __releases(torture_rwlock)
static struct lock_torture_ops rw_lock_ops = {
.writelock = torture_rwlock_write_lock,
.write_delay = torture_rwlock_write_delay,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_rwlock_write_unlock,
.readlock = torture_rwlock_read_lock,
.read_delay = torture_rwlock_read_delay,
@@ -282,7 +481,8 @@ static struct lock_torture_ops rw_lock_ops = {
.name = "rw_lock"
};
-static int torture_rwlock_write_lock_irq(void) __acquires(torture_rwlock)
+static int torture_rwlock_write_lock_irq(int tid __maybe_unused)
+__acquires(torture_rwlock)
{
unsigned long flags;
@@ -291,13 +491,14 @@ static int torture_rwlock_write_lock_irq(void) __acquires(torture_rwlock)
return 0;
}
-static void torture_rwlock_write_unlock_irq(void)
+static void torture_rwlock_write_unlock_irq(int tid __maybe_unused)
__releases(torture_rwlock)
{
write_unlock_irqrestore(&torture_rwlock, cxt.cur_ops->flags);
}
-static int torture_rwlock_read_lock_irq(void) __acquires(torture_rwlock)
+static int torture_rwlock_read_lock_irq(int tid __maybe_unused)
+__acquires(torture_rwlock)
{
unsigned long flags;
@@ -306,15 +507,16 @@ static int torture_rwlock_read_lock_irq(void) __acquires(torture_rwlock)
return 0;
}
-static void torture_rwlock_read_unlock_irq(void)
+static void torture_rwlock_read_unlock_irq(int tid __maybe_unused)
__releases(torture_rwlock)
{
- write_unlock_irqrestore(&torture_rwlock, cxt.cur_ops->flags);
+ read_unlock_irqrestore(&torture_rwlock, cxt.cur_ops->flags);
}
static struct lock_torture_ops rw_lock_irq_ops = {
.writelock = torture_rwlock_write_lock_irq,
.write_delay = torture_rwlock_write_delay,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_rwlock_write_unlock_irq,
.readlock = torture_rwlock_read_lock_irq,
.read_delay = torture_rwlock_read_delay,
@@ -323,8 +525,31 @@ static struct lock_torture_ops rw_lock_irq_ops = {
};
static DEFINE_MUTEX(torture_mutex);
+static struct mutex torture_nested_mutexes[MAX_NESTED_LOCKS];
+static struct lock_class_key nested_mutex_keys[MAX_NESTED_LOCKS];
+
+static void torture_mutex_init(void)
+{
+ int i;
+
+ for (i = 0; i < MAX_NESTED_LOCKS; i++)
+ __mutex_init(&torture_nested_mutexes[i], __func__,
+ &nested_mutex_keys[i]);
+}
+
+static int torture_mutex_nested_lock(int tid __maybe_unused,
+ u32 lockset)
+{
+ int i;
-static int torture_mutex_lock(void) __acquires(torture_mutex)
+ for (i = 0; i < nested_locks; i++)
+ if (lockset & (1 << i))
+ mutex_lock(&torture_nested_mutexes[i]);
+ return 0;
+}
+
+static int torture_mutex_lock(int tid __maybe_unused)
+__acquires(torture_mutex)
{
mutex_lock(&torture_mutex);
return 0;
@@ -332,37 +557,232 @@ static int torture_mutex_lock(void) __acquires(torture_mutex)
static void torture_mutex_delay(struct torture_random_state *trsp)
{
- const unsigned long longdelay_ms = 100;
-
/* We want a long delay occasionally to force massive contention. */
- if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
- mdelay(longdelay_ms * 5);
- else
- mdelay(longdelay_ms / 5);
-#ifdef CONFIG_PREEMPT
+ if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold)))
+ mdelay(long_hold * 5);
if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
- preempt_schedule(); /* Allow test to be preempted. */
-#endif
+ torture_preempt_schedule(); /* Allow test to be preempted. */
}
-static void torture_mutex_unlock(void) __releases(torture_mutex)
+static void torture_mutex_unlock(int tid __maybe_unused)
+__releases(torture_mutex)
{
mutex_unlock(&torture_mutex);
}
+static void torture_mutex_nested_unlock(int tid __maybe_unused,
+ u32 lockset)
+{
+ int i;
+
+ for (i = nested_locks - 1; i >= 0; i--)
+ if (lockset & (1 << i))
+ mutex_unlock(&torture_nested_mutexes[i]);
+}
+
static struct lock_torture_ops mutex_lock_ops = {
+ .init = torture_mutex_init,
+ .nested_lock = torture_mutex_nested_lock,
.writelock = torture_mutex_lock,
.write_delay = torture_mutex_delay,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_mutex_unlock,
+ .nested_unlock = torture_mutex_nested_unlock,
.readlock = NULL,
.read_delay = NULL,
.readunlock = NULL,
.name = "mutex_lock"
};
+#include <linux/ww_mutex.h>
+/*
+ * The torture ww_mutexes should belong to the same lock class as
+ * torture_ww_class to avoid lockdep problem. The ww_mutex_init()
+ * function is called for initialization to ensure that.
+ */
+static DEFINE_WD_CLASS(torture_ww_class);
+static struct ww_mutex torture_ww_mutex_0, torture_ww_mutex_1, torture_ww_mutex_2;
+static struct ww_acquire_ctx *ww_acquire_ctxs;
+
+static void torture_ww_mutex_init(void)
+{
+ ww_mutex_init(&torture_ww_mutex_0, &torture_ww_class);
+ ww_mutex_init(&torture_ww_mutex_1, &torture_ww_class);
+ ww_mutex_init(&torture_ww_mutex_2, &torture_ww_class);
+
+ ww_acquire_ctxs = kmalloc_array(cxt.nrealwriters_stress,
+ sizeof(*ww_acquire_ctxs),
+ GFP_KERNEL);
+ if (!ww_acquire_ctxs)
+ VERBOSE_TOROUT_STRING("ww_acquire_ctx: Out of memory");
+}
+
+static void torture_ww_mutex_exit(void)
+{
+ kfree(ww_acquire_ctxs);
+}
+
+static int torture_ww_mutex_lock(int tid)
+__acquires(torture_ww_mutex_0)
+__acquires(torture_ww_mutex_1)
+__acquires(torture_ww_mutex_2)
+{
+ LIST_HEAD(list);
+ struct reorder_lock {
+ struct list_head link;
+ struct ww_mutex *lock;
+ } locks[3], *ll, *ln;
+ struct ww_acquire_ctx *ctx = &ww_acquire_ctxs[tid];
+
+ locks[0].lock = &torture_ww_mutex_0;
+ list_add(&locks[0].link, &list);
+
+ locks[1].lock = &torture_ww_mutex_1;
+ list_add(&locks[1].link, &list);
+
+ locks[2].lock = &torture_ww_mutex_2;
+ list_add(&locks[2].link, &list);
+
+ ww_acquire_init(ctx, &torture_ww_class);
+
+ list_for_each_entry(ll, &list, link) {
+ int err;
+
+ err = ww_mutex_lock(ll->lock, ctx);
+ if (!err)
+ continue;
+
+ ln = ll;
+ list_for_each_entry_continue_reverse(ln, &list, link)
+ ww_mutex_unlock(ln->lock);
+
+ if (err != -EDEADLK)
+ return err;
+
+ ww_mutex_lock_slow(ll->lock, ctx);
+ list_move(&ll->link, &list);
+ }
+
+ return 0;
+}
+
+static void torture_ww_mutex_unlock(int tid)
+__releases(torture_ww_mutex_0)
+__releases(torture_ww_mutex_1)
+__releases(torture_ww_mutex_2)
+{
+ struct ww_acquire_ctx *ctx = &ww_acquire_ctxs[tid];
+
+ ww_mutex_unlock(&torture_ww_mutex_0);
+ ww_mutex_unlock(&torture_ww_mutex_1);
+ ww_mutex_unlock(&torture_ww_mutex_2);
+ ww_acquire_fini(ctx);
+}
+
+static struct lock_torture_ops ww_mutex_lock_ops = {
+ .init = torture_ww_mutex_init,
+ .exit = torture_ww_mutex_exit,
+ .writelock = torture_ww_mutex_lock,
+ .write_delay = torture_mutex_delay,
+ .task_boost = torture_rt_boost,
+ .writeunlock = torture_ww_mutex_unlock,
+ .readlock = NULL,
+ .read_delay = NULL,
+ .readunlock = NULL,
+ .name = "ww_mutex_lock"
+};
+
+#ifdef CONFIG_RT_MUTEXES
+static DEFINE_RT_MUTEX(torture_rtmutex);
+static struct rt_mutex torture_nested_rtmutexes[MAX_NESTED_LOCKS];
+static struct lock_class_key nested_rtmutex_keys[MAX_NESTED_LOCKS];
+
+static void torture_rtmutex_init(void)
+{
+ int i;
+
+ for (i = 0; i < MAX_NESTED_LOCKS; i++)
+ __rt_mutex_init(&torture_nested_rtmutexes[i], __func__,
+ &nested_rtmutex_keys[i]);
+}
+
+static int torture_rtmutex_nested_lock(int tid __maybe_unused,
+ u32 lockset)
+{
+ int i;
+
+ for (i = 0; i < nested_locks; i++)
+ if (lockset & (1 << i))
+ rt_mutex_lock(&torture_nested_rtmutexes[i]);
+ return 0;
+}
+
+static int torture_rtmutex_lock(int tid __maybe_unused)
+__acquires(torture_rtmutex)
+{
+ rt_mutex_lock(&torture_rtmutex);
+ return 0;
+}
+
+static void torture_rtmutex_delay(struct torture_random_state *trsp)
+{
+ const unsigned long shortdelay_us = 2;
+
+ /*
+ * We want a short delay mostly to emulate likely code, and
+ * we want a long delay occasionally to force massive contention.
+ */
+ if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold)))
+ mdelay(long_hold);
+ if (!(torture_random(trsp) %
+ (cxt.nrealwriters_stress * 200 * shortdelay_us)))
+ udelay(shortdelay_us);
+ if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
+ torture_preempt_schedule(); /* Allow test to be preempted. */
+}
+
+static void torture_rtmutex_unlock(int tid __maybe_unused)
+__releases(torture_rtmutex)
+{
+ rt_mutex_unlock(&torture_rtmutex);
+}
+
+static void torture_rt_boost_rtmutex(struct torture_random_state *trsp)
+{
+ if (!rt_boost)
+ return;
+
+ __torture_rt_boost(trsp);
+}
+
+static void torture_rtmutex_nested_unlock(int tid __maybe_unused,
+ u32 lockset)
+{
+ int i;
+
+ for (i = nested_locks - 1; i >= 0; i--)
+ if (lockset & (1 << i))
+ rt_mutex_unlock(&torture_nested_rtmutexes[i]);
+}
+
+static struct lock_torture_ops rtmutex_lock_ops = {
+ .init = torture_rtmutex_init,
+ .nested_lock = torture_rtmutex_nested_lock,
+ .writelock = torture_rtmutex_lock,
+ .write_delay = torture_rtmutex_delay,
+ .task_boost = torture_rt_boost_rtmutex,
+ .writeunlock = torture_rtmutex_unlock,
+ .nested_unlock = torture_rtmutex_nested_unlock,
+ .readlock = NULL,
+ .read_delay = NULL,
+ .readunlock = NULL,
+ .name = "rtmutex_lock"
+};
+#endif
+
static DECLARE_RWSEM(torture_rwsem);
-static int torture_rwsem_down_write(void) __acquires(torture_rwsem)
+static int torture_rwsem_down_write(int tid __maybe_unused)
+__acquires(torture_rwsem)
{
down_write(&torture_rwsem);
return 0;
@@ -370,26 +790,21 @@ static int torture_rwsem_down_write(void) __acquires(torture_rwsem)
static void torture_rwsem_write_delay(struct torture_random_state *trsp)
{
- const unsigned long longdelay_ms = 100;
-
/* We want a long delay occasionally to force massive contention. */
- if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
- mdelay(longdelay_ms * 10);
- else
- mdelay(longdelay_ms / 10);
-#ifdef CONFIG_PREEMPT
+ if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold)))
+ mdelay(long_hold * 10);
if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
- preempt_schedule(); /* Allow test to be preempted. */
-#endif
+ torture_preempt_schedule(); /* Allow test to be preempted. */
}
-static void torture_rwsem_up_write(void) __releases(torture_rwsem)
+static void torture_rwsem_up_write(int tid __maybe_unused)
+__releases(torture_rwsem)
{
up_write(&torture_rwsem);
}
-static int torture_rwsem_down_read(void) __acquires(torture_rwsem)
+static int torture_rwsem_down_read(int tid __maybe_unused)
+__acquires(torture_rwsem)
{
down_read(&torture_rwsem);
return 0;
@@ -397,21 +812,17 @@ static int torture_rwsem_down_read(void) __acquires(torture_rwsem)
static void torture_rwsem_read_delay(struct torture_random_state *trsp)
{
- const unsigned long longdelay_ms = 100;
-
/* We want a long delay occasionally to force massive contention. */
- if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
- mdelay(longdelay_ms * 2);
+ if (long_hold && !(torture_random(trsp) % (cxt.nrealreaders_stress * 2000 * long_hold)))
+ mdelay(long_hold * 2);
else
- mdelay(longdelay_ms / 2);
-#ifdef CONFIG_PREEMPT
+ mdelay(long_hold / 2);
if (!(torture_random(trsp) % (cxt.nrealreaders_stress * 20000)))
- preempt_schedule(); /* Allow test to be preempted. */
-#endif
+ torture_preempt_schedule(); /* Allow test to be preempted. */
}
-static void torture_rwsem_up_read(void) __releases(torture_rwsem)
+static void torture_rwsem_up_read(int tid __maybe_unused)
+__releases(torture_rwsem)
{
up_read(&torture_rwsem);
}
@@ -419,6 +830,7 @@ static void torture_rwsem_up_read(void) __releases(torture_rwsem)
static struct lock_torture_ops rwsem_lock_ops = {
.writelock = torture_rwsem_down_write,
.write_delay = torture_rwsem_write_delay,
+ .task_boost = torture_rt_boost,
.writeunlock = torture_rwsem_up_write,
.readlock = torture_rwsem_down_read,
.read_delay = torture_rwsem_read_delay,
@@ -426,36 +838,126 @@ static struct lock_torture_ops rwsem_lock_ops = {
.name = "rwsem_lock"
};
+#include <linux/percpu-rwsem.h>
+static struct percpu_rw_semaphore pcpu_rwsem;
+
+static void torture_percpu_rwsem_init(void)
+{
+ BUG_ON(percpu_init_rwsem(&pcpu_rwsem));
+}
+
+static void torture_percpu_rwsem_exit(void)
+{
+ percpu_free_rwsem(&pcpu_rwsem);
+}
+
+static int torture_percpu_rwsem_down_write(int tid __maybe_unused)
+__acquires(pcpu_rwsem)
+{
+ percpu_down_write(&pcpu_rwsem);
+ return 0;
+}
+
+static void torture_percpu_rwsem_up_write(int tid __maybe_unused)
+__releases(pcpu_rwsem)
+{
+ percpu_up_write(&pcpu_rwsem);
+}
+
+static int torture_percpu_rwsem_down_read(int tid __maybe_unused)
+__acquires(pcpu_rwsem)
+{
+ percpu_down_read(&pcpu_rwsem);
+ return 0;
+}
+
+static void torture_percpu_rwsem_up_read(int tid __maybe_unused)
+__releases(pcpu_rwsem)
+{
+ percpu_up_read(&pcpu_rwsem);
+}
+
+static struct lock_torture_ops percpu_rwsem_lock_ops = {
+ .init = torture_percpu_rwsem_init,
+ .exit = torture_percpu_rwsem_exit,
+ .writelock = torture_percpu_rwsem_down_write,
+ .write_delay = torture_rwsem_write_delay,
+ .task_boost = torture_rt_boost,
+ .writeunlock = torture_percpu_rwsem_up_write,
+ .readlock = torture_percpu_rwsem_down_read,
+ .read_delay = torture_rwsem_read_delay,
+ .readunlock = torture_percpu_rwsem_up_read,
+ .name = "percpu_rwsem_lock"
+};
+
/*
* Lock torture writer kthread. Repeatedly acquires and releases
* the lock, checking for duplicate acquisitions.
*/
static int lock_torture_writer(void *arg)
{
+ unsigned long j;
+ unsigned long j1;
+ u32 lockset_mask;
struct lock_stress_stats *lwsp = arg;
- static DEFINE_TORTURE_RANDOM(rand);
+ DEFINE_TORTURE_RANDOM(rand);
+ bool skip_main_lock;
+ int tid = lwsp - cxt.lwsa;
VERBOSE_TOROUT_STRING("lock_torture_writer task started");
- set_user_nice(current, MAX_NICE);
+ if (!rt_task(current))
+ set_user_nice(current, MAX_NICE);
do {
if ((torture_random(&rand) & 0xfffff) == 0)
schedule_timeout_uninterruptible(1);
- cxt.cur_ops->writelock();
- if (WARN_ON_ONCE(lock_is_write_held))
- lwsp->n_lock_fail++;
- lock_is_write_held = 1;
- if (WARN_ON_ONCE(lock_is_read_held))
- lwsp->n_lock_fail++; /* rare, but... */
-
- lwsp->n_lock_acquired++;
- cxt.cur_ops->write_delay(&rand);
- lock_is_write_held = 0;
- cxt.cur_ops->writeunlock();
+ lockset_mask = torture_random(&rand);
+ /*
+ * When using nested_locks, we want to occasionally
+ * skip the main lock so we can avoid always serializing
+ * the lock chains on that central lock. By skipping the
+ * main lock occasionally, we can create different
+ * contention patterns (allowing for multiple disjoint
+ * blocked trees)
+ */
+ skip_main_lock = (nested_locks &&
+ !(torture_random(&rand) % 100));
+
+ cxt.cur_ops->task_boost(&rand);
+ if (cxt.cur_ops->nested_lock)
+ cxt.cur_ops->nested_lock(tid, lockset_mask);
+
+ if (!skip_main_lock) {
+ if (acq_writer_lim > 0)
+ j = jiffies;
+ cxt.cur_ops->writelock(tid);
+ if (WARN_ON_ONCE(lock_is_write_held))
+ lwsp->n_lock_fail++;
+ lock_is_write_held = true;
+ if (WARN_ON_ONCE(atomic_read(&lock_is_read_held)))
+ lwsp->n_lock_fail++; /* rare, but... */
+ if (acq_writer_lim > 0) {
+ j1 = jiffies;
+ WARN_ONCE(time_after(j1, j + acq_writer_lim),
+ "%s: Lock acquisition took %lu jiffies.\n",
+ __func__, j1 - j);
+ }
+ lwsp->n_lock_acquired++;
+
+ cxt.cur_ops->write_delay(&rand);
+
+ lock_is_write_held = false;
+ WRITE_ONCE(last_lock_release, jiffies);
+ cxt.cur_ops->writeunlock(tid);
+ }
+ if (cxt.cur_ops->nested_unlock)
+ cxt.cur_ops->nested_unlock(tid, lockset_mask);
stutter_wait("lock_torture_writer");
} while (!torture_must_stop());
+
+ cxt.cur_ops->task_boost(NULL); /* reset prio */
torture_kthread_stopping("lock_torture_writer");
return 0;
}
@@ -467,7 +969,8 @@ static int lock_torture_writer(void *arg)
static int lock_torture_reader(void *arg)
{
struct lock_stress_stats *lrsp = arg;
- static DEFINE_TORTURE_RANDOM(rand);
+ int tid = lrsp - cxt.lrsa;
+ DEFINE_TORTURE_RANDOM(rand);
VERBOSE_TOROUT_STRING("lock_torture_reader task started");
set_user_nice(current, MAX_NICE);
@@ -476,15 +979,15 @@ static int lock_torture_reader(void *arg)
if ((torture_random(&rand) & 0xfffff) == 0)
schedule_timeout_uninterruptible(1);
- cxt.cur_ops->readlock();
- lock_is_read_held = 1;
+ cxt.cur_ops->readlock(tid);
+ atomic_inc(&lock_is_read_held);
if (WARN_ON_ONCE(lock_is_write_held))
lrsp->n_lock_fail++; /* rare, but... */
lrsp->n_lock_acquired++;
cxt.cur_ops->read_delay(&rand);
- lock_is_read_held = 0;
- cxt.cur_ops->readunlock();
+ atomic_dec(&lock_is_read_held);
+ cxt.cur_ops->readunlock(tid);
stutter_wait("lock_torture_reader");
} while (!torture_must_stop());
@@ -498,26 +1001,28 @@ static int lock_torture_reader(void *arg)
static void __torture_print_stats(char *page,
struct lock_stress_stats *statp, bool write)
{
- bool fail = 0;
+ long cur;
+ bool fail = false;
int i, n_stress;
- long max = 0;
- long min = statp[0].n_lock_acquired;
+ long max = 0, min = statp ? data_race(statp[0].n_lock_acquired) : 0;
long long sum = 0;
n_stress = write ? cxt.nrealwriters_stress : cxt.nrealreaders_stress;
for (i = 0; i < n_stress; i++) {
- if (statp[i].n_lock_fail)
+ if (data_race(statp[i].n_lock_fail))
fail = true;
- sum += statp[i].n_lock_acquired;
- if (max < statp[i].n_lock_fail)
- max = statp[i].n_lock_fail;
- if (min > statp[i].n_lock_fail)
- min = statp[i].n_lock_fail;
+ cur = data_race(statp[i].n_lock_acquired);
+ sum += cur;
+ if (max < cur)
+ max = cur;
+ if (min > cur)
+ min = cur;
}
page += sprintf(page,
"%s: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n",
write ? "Writes" : "Reads ",
- sum, max, min, max / 2 > min ? "???" : "",
+ sum, max, min,
+ !onoff_interval && max / 2 > min ? "???" : "",
fail, fail ? "!!!" : "");
if (fail)
atomic_inc(&cxt.n_lock_torture_errors);
@@ -583,16 +1088,69 @@ static int lock_torture_stats(void *arg)
return 0;
}
+
static inline void
lock_torture_print_module_parms(struct lock_torture_ops *cur_ops,
const char *tag)
{
+ static cpumask_t cpumask_all;
+ cpumask_t *rcmp = cpumask_nonempty(bind_readers) ? bind_readers : &cpumask_all;
+ cpumask_t *wcmp = cpumask_nonempty(bind_writers) ? bind_writers : &cpumask_all;
+
+ cpumask_setall(&cpumask_all);
pr_alert("%s" TORTURE_FLAG
- "--- %s%s: nwriters_stress=%d nreaders_stress=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n",
+ "--- %s%s: acq_writer_lim=%d bind_readers=%*pbl bind_writers=%*pbl call_rcu_chains=%d long_hold=%d nested_locks=%d nreaders_stress=%d nwriters_stress=%d onoff_holdoff=%d onoff_interval=%d rt_boost=%d rt_boost_factor=%d shuffle_interval=%d shutdown_secs=%d stat_interval=%d stutter=%d verbose=%d writer_fifo=%d\n",
torture_type, tag, cxt.debug_lock ? " [debug]": "",
- cxt.nrealwriters_stress, cxt.nrealreaders_stress, stat_interval,
- verbose, shuffle_interval, stutter, shutdown_secs,
- onoff_interval, onoff_holdoff);
+ acq_writer_lim, cpumask_pr_args(rcmp), cpumask_pr_args(wcmp),
+ call_rcu_chains, long_hold, nested_locks, cxt.nrealreaders_stress,
+ cxt.nrealwriters_stress, onoff_holdoff, onoff_interval, rt_boost,
+ rt_boost_factor, shuffle_interval, shutdown_secs, stat_interval, stutter,
+ verbose, writer_fifo);
+}
+
+// If requested, maintain call_rcu() chains to keep a grace period always
+// in flight. These increase the probability of getting an RCU CPU stall
+// warning and associated diagnostics when a locking primitive stalls.
+
+static void call_rcu_chain_cb(struct rcu_head *rhp)
+{
+ struct call_rcu_chain *crcp = container_of(rhp, struct call_rcu_chain, crc_rh);
+
+ if (!smp_load_acquire(&crcp->crc_stop)) {
+ (void)start_poll_synchronize_rcu(); // Start one grace period...
+ call_rcu(&crcp->crc_rh, call_rcu_chain_cb); // ... and later start another.
+ }
+}
+
+// Start the requested number of call_rcu() chains.
+static int call_rcu_chain_init(void)
+{
+ int i;
+
+ if (call_rcu_chains <= 0)
+ return 0;
+ call_rcu_chain_list = kcalloc(call_rcu_chains, sizeof(*call_rcu_chain_list), GFP_KERNEL);
+ if (!call_rcu_chain_list)
+ return -ENOMEM;
+ for (i = 0; i < call_rcu_chains; i++) {
+ call_rcu_chain_list[i].crc_stop = false;
+ call_rcu(&call_rcu_chain_list[i].crc_rh, call_rcu_chain_cb);
+ }
+ return 0;
+}
+
+// Stop all of the call_rcu() chains.
+static void call_rcu_chain_cleanup(void)
+{
+ int i;
+
+ if (!call_rcu_chain_list)
+ return;
+ for (i = 0; i < call_rcu_chains; i++)
+ smp_store_release(&call_rcu_chain_list[i].crc_stop, true);
+ rcu_barrier();
+ kfree(call_rcu_chain_list);
+ call_rcu_chain_list = NULL;
}
static void lock_torture_cleanup(void)
@@ -602,10 +1160,19 @@ static void lock_torture_cleanup(void)
if (torture_cleanup_begin())
return;
+ /*
+ * Indicates early cleanup, meaning that the test has not run,
+ * such as when passing bogus args when loading the module.
+ * However cxt->cur_ops.init() may have been invoked, so beside
+ * perform the underlying torture-specific cleanups, cur_ops.exit()
+ * will be invoked if needed.
+ */
+ if (!cxt.lwsa && !cxt.lrsa)
+ goto end;
+
if (writer_tasks) {
for (i = 0; i < cxt.nrealwriters_stress; i++)
- torture_stop_kthread(lock_torture_writer,
- writer_tasks[i]);
+ torture_stop_kthread(lock_torture_writer, writer_tasks[i]);
kfree(writer_tasks);
writer_tasks = NULL;
}
@@ -630,6 +1197,24 @@ static void lock_torture_cleanup(void)
else
lock_torture_print_module_parms(cxt.cur_ops,
"End of test: SUCCESS");
+
+ kfree(cxt.lwsa);
+ cxt.lwsa = NULL;
+ kfree(cxt.lrsa);
+ cxt.lrsa = NULL;
+
+ call_rcu_chain_cleanup();
+
+end:
+ if (cxt.init_called) {
+ if (cxt.cur_ops->exit)
+ cxt.cur_ops->exit();
+ cxt.init_called = false;
+ }
+
+ free_cpumask_var(bind_readers);
+ free_cpumask_var(bind_writers);
+
torture_cleanup_end();
}
@@ -640,12 +1225,21 @@ static int __init lock_torture_init(void)
static struct lock_torture_ops *torture_ops[] = {
&lock_busted_ops,
&spin_lock_ops, &spin_lock_irq_ops,
+ &raw_spin_lock_ops, &raw_spin_lock_irq_ops,
+#ifdef CONFIG_BPF_SYSCALL
+ &raw_res_spin_lock_ops, &raw_res_spin_lock_irq_ops,
+#endif
&rw_lock_ops, &rw_lock_irq_ops,
&mutex_lock_ops,
+ &ww_mutex_lock_ops,
+#ifdef CONFIG_RT_MUTEXES
+ &rtmutex_lock_ops,
+#endif
&rwsem_lock_ops,
+ &percpu_rwsem_lock_ops,
};
- if (!torture_init_begin(torture_type, verbose, &torture_runnable))
+ if (!torture_init_begin(torture_type, verbose))
return -EBUSY;
/* Process args and tell the world that the torturer is on the job. */
@@ -661,39 +1255,57 @@ static int __init lock_torture_init(void)
for (i = 0; i < ARRAY_SIZE(torture_ops); i++)
pr_alert(" %s", torture_ops[i]->name);
pr_alert("\n");
- torture_init_end();
- return -EINVAL;
+ firsterr = -EINVAL;
+ goto unwind;
+ }
+
+ if (nwriters_stress == 0 &&
+ (!cxt.cur_ops->readlock || nreaders_stress == 0)) {
+ pr_alert("lock-torture: must run at least one locking thread\n");
+ firsterr = -EINVAL;
+ goto unwind;
}
- if (cxt.cur_ops->init)
- cxt.cur_ops->init(); /* no "goto unwind" prior to this point!!! */
if (nwriters_stress >= 0)
cxt.nrealwriters_stress = nwriters_stress;
else
cxt.nrealwriters_stress = 2 * num_online_cpus();
+ if (cxt.cur_ops->init) {
+ cxt.cur_ops->init();
+ cxt.init_called = true;
+ }
+
#ifdef CONFIG_DEBUG_MUTEXES
- if (strncmp(torture_type, "mutex", 5) == 0)
+ if (str_has_prefix(torture_type, "mutex"))
+ cxt.debug_lock = true;
+#endif
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+ if (str_has_prefix(torture_type, "rtmutex"))
cxt.debug_lock = true;
#endif
#ifdef CONFIG_DEBUG_SPINLOCK
- if ((strncmp(torture_type, "spin", 4) == 0) ||
- (strncmp(torture_type, "rw_lock", 7) == 0))
+ if ((str_has_prefix(torture_type, "spin")) ||
+ (str_has_prefix(torture_type, "rw_lock")))
cxt.debug_lock = true;
#endif
/* Initialize the statistics so that each run gets its own numbers. */
+ if (nwriters_stress) {
+ lock_is_write_held = false;
+ cxt.lwsa = kmalloc_array(cxt.nrealwriters_stress,
+ sizeof(*cxt.lwsa),
+ GFP_KERNEL);
+ if (cxt.lwsa == NULL) {
+ VERBOSE_TOROUT_STRING("cxt.lwsa: Out of memory");
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
- lock_is_write_held = 0;
- cxt.lwsa = kmalloc(sizeof(*cxt.lwsa) * cxt.nrealwriters_stress, GFP_KERNEL);
- if (cxt.lwsa == NULL) {
- VERBOSE_TOROUT_STRING("cxt.lwsa: Out of memory");
- firsterr = -ENOMEM;
- goto unwind;
- }
- for (i = 0; i < cxt.nrealwriters_stress; i++) {
- cxt.lwsa[i].n_lock_fail = 0;
- cxt.lwsa[i].n_lock_acquired = 0;
+ for (i = 0; i < cxt.nrealwriters_stress; i++) {
+ cxt.lwsa[i].n_lock_fail = 0;
+ cxt.lwsa[i].n_lock_acquired = 0;
+ }
}
if (cxt.cur_ops->readlock) {
@@ -710,59 +1322,78 @@ static int __init lock_torture_init(void)
cxt.nrealreaders_stress = cxt.nrealwriters_stress;
}
- lock_is_read_held = 0;
- cxt.lrsa = kmalloc(sizeof(*cxt.lrsa) * cxt.nrealreaders_stress, GFP_KERNEL);
- if (cxt.lrsa == NULL) {
- VERBOSE_TOROUT_STRING("cxt.lrsa: Out of memory");
- firsterr = -ENOMEM;
- kfree(cxt.lwsa);
- goto unwind;
- }
-
- for (i = 0; i < cxt.nrealreaders_stress; i++) {
- cxt.lrsa[i].n_lock_fail = 0;
- cxt.lrsa[i].n_lock_acquired = 0;
+ if (nreaders_stress) {
+ cxt.lrsa = kmalloc_array(cxt.nrealreaders_stress,
+ sizeof(*cxt.lrsa),
+ GFP_KERNEL);
+ if (cxt.lrsa == NULL) {
+ VERBOSE_TOROUT_STRING("cxt.lrsa: Out of memory");
+ firsterr = -ENOMEM;
+ kfree(cxt.lwsa);
+ cxt.lwsa = NULL;
+ goto unwind;
+ }
+
+ for (i = 0; i < cxt.nrealreaders_stress; i++) {
+ cxt.lrsa[i].n_lock_fail = 0;
+ cxt.lrsa[i].n_lock_acquired = 0;
+ }
}
}
+
+ firsterr = call_rcu_chain_init();
+ if (torture_init_error(firsterr))
+ goto unwind;
+
lock_torture_print_module_parms(cxt.cur_ops, "Start of test");
/* Prepare torture context. */
if (onoff_interval > 0) {
firsterr = torture_onoff_init(onoff_holdoff * HZ,
- onoff_interval * HZ);
- if (firsterr)
+ onoff_interval * HZ, NULL);
+ if (torture_init_error(firsterr))
goto unwind;
}
if (shuffle_interval > 0) {
firsterr = torture_shuffle_init(shuffle_interval);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
if (shutdown_secs > 0) {
firsterr = torture_shutdown_init(shutdown_secs,
lock_torture_cleanup);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
if (stutter > 0) {
- firsterr = torture_stutter_init(stutter);
- if (firsterr)
+ firsterr = torture_stutter_init(stutter, stutter);
+ if (torture_init_error(firsterr))
goto unwind;
}
- writer_tasks = kzalloc(cxt.nrealwriters_stress * sizeof(writer_tasks[0]),
- GFP_KERNEL);
- if (writer_tasks == NULL) {
- VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory");
- firsterr = -ENOMEM;
- goto unwind;
+ if (nwriters_stress) {
+ writer_tasks = kcalloc(cxt.nrealwriters_stress,
+ sizeof(writer_tasks[0]),
+ GFP_KERNEL);
+ if (writer_tasks == NULL) {
+ TOROUT_ERRSTRING("writer_tasks: Out of memory");
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
}
+ /* cap nested_locks to MAX_NESTED_LOCKS */
+ if (nested_locks > MAX_NESTED_LOCKS)
+ nested_locks = MAX_NESTED_LOCKS;
+
if (cxt.cur_ops->readlock) {
- reader_tasks = kzalloc(cxt.nrealreaders_stress * sizeof(reader_tasks[0]),
+ reader_tasks = kcalloc(cxt.nrealreaders_stress,
+ sizeof(reader_tasks[0]),
GFP_KERNEL);
if (reader_tasks == NULL) {
- VERBOSE_TOROUT_ERRSTRING("reader_tasks: Out of memory");
+ TOROUT_ERRSTRING("reader_tasks: Out of memory");
+ kfree(writer_tasks);
+ writer_tasks = NULL;
firsterr = -ENOMEM;
goto unwind;
}
@@ -782,10 +1413,13 @@ static int __init lock_torture_init(void)
goto create_reader;
/* Create writer. */
- firsterr = torture_create_kthread(lock_torture_writer, &cxt.lwsa[i],
- writer_tasks[i]);
- if (firsterr)
+ firsterr = torture_create_kthread_cb(lock_torture_writer, &cxt.lwsa[i],
+ writer_tasks[i],
+ writer_fifo ? sched_set_fifo : NULL);
+ if (torture_init_error(firsterr))
goto unwind;
+ if (cpumask_nonempty(bind_writers))
+ torture_sched_setaffinity(writer_tasks[i]->pid, bind_writers, true);
create_reader:
if (cxt.cur_ops->readlock == NULL || (j >= cxt.nrealreaders_stress))
@@ -793,13 +1427,15 @@ static int __init lock_torture_init(void)
/* Create reader. */
firsterr = torture_create_kthread(lock_torture_reader, &cxt.lrsa[j],
reader_tasks[j]);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
+ if (cpumask_nonempty(bind_readers))
+ torture_sched_setaffinity(reader_tasks[j]->pid, bind_readers, true);
}
if (stat_interval > 0) {
firsterr = torture_create_kthread(lock_torture_stats, NULL,
stats_task);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
torture_init_end();
@@ -808,6 +1444,10 @@ static int __init lock_torture_init(void)
unwind:
torture_init_end();
lock_torture_cleanup();
+ if (shutdown_secs) {
+ WARN_ON(!IS_MODULE(CONFIG_LOCK_TORTURE_TEST));
+ kernel_power_off();
+ }
return firsterr;
}
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index 75e114bdf3f2..5c92ba199b90 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* MCS lock defines
*
@@ -6,7 +7,7 @@
* The MCS lock (proposed by Mellor-Crummey and Scott) is a simple spin-lock
* with the desirable properties of being fair, and with each cpu trying
* to acquire the lock spinning on a local variable.
- * It avoids expensive cache bouncings that common test-and-set spin-lock
+ * It avoids expensive cache bounces that common test-and-set spin-lock
* implementations incur.
*/
#ifndef __LINUX_MCS_SPINLOCK_H
@@ -14,21 +15,16 @@
#include <asm/mcs_spinlock.h>
-struct mcs_spinlock {
- struct mcs_spinlock *next;
- int locked; /* 1 if lock acquired */
-};
-
#ifndef arch_mcs_spin_lock_contended
/*
- * Using smp_load_acquire() provides a memory barrier that ensures
- * subsequent operations happen after the lock is acquired.
+ * Using smp_cond_load_acquire() provides the acquire semantics
+ * required so that subsequent operations happen after the
+ * lock is acquired. Additionally, some architectures such as
+ * ARM64 would like to do spin-waiting instead of purely
+ * spinning, and smp_cond_load_acquire() provides that behavior.
*/
#define arch_mcs_spin_lock_contended(l) \
-do { \
- while (!(smp_load_acquire(l))) \
- cpu_relax_lowlatency(); \
-} while (0)
+ smp_cond_load_acquire(l, VAL)
#endif
#ifndef arch_mcs_spin_unlock_contended
@@ -66,6 +62,12 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
node->locked = 0;
node->next = NULL;
+ /*
+ * We rely on the full barrier with global transitivity implied by the
+ * below xchg() to order the initialization stores above against any
+ * observation of @node. And to provide the ACQUIRE ordering associated
+ * with a LOCK primitive.
+ */
prev = xchg(lock, node);
if (likely(prev == NULL)) {
/*
@@ -97,11 +99,11 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
/*
* Release the lock by setting it to NULL
*/
- if (likely(cmpxchg(lock, node, NULL) == node))
+ if (likely(cmpxchg_release(lock, node, NULL) == node))
return;
/* Wait until the next pointer is set */
while (!(next = READ_ONCE(node->next)))
- cpu_relax_lowlatency();
+ cpu_relax();
}
/* Pass lock to next waiter. */
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
index 3ef3736002d8..2c6b02d4699b 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -1,6 +1,4 @@
/*
- * kernel/mutex-debug.c
- *
* Debugging code for mutexes
*
* Started by Ingo Molnar:
@@ -14,6 +12,7 @@
*/
#include <linux/mutex.h>
#include <linux/delay.h>
+#include <linux/device.h>
#include <linux/export.h>
#include <linux/poison.h>
#include <linux/sched.h>
@@ -22,7 +21,7 @@
#include <linux/interrupt.h>
#include <linux/debug_locks.h>
-#include "mutex-debug.h"
+#include "mutex.h"
/*
* Must be called with lock->wait_lock held.
@@ -32,11 +31,12 @@ void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter)
memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter));
waiter->magic = waiter;
INIT_LIST_HEAD(&waiter->list);
+ waiter->ww_ctx = MUTEX_POISON_WW_CTX;
}
void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter)
{
- SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock));
+ lockdep_assert_held(&lock->wait_lock);
DEBUG_LOCKS_WARN_ON(list_empty(&lock->wait_list));
DEBUG_LOCKS_WARN_ON(waiter->magic != waiter);
DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list));
@@ -49,23 +49,24 @@ void debug_mutex_free_waiter(struct mutex_waiter *waiter)
}
void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
- struct thread_info *ti)
+ struct task_struct *task)
{
- SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock));
+ lockdep_assert_held(&lock->wait_lock);
- /* Mark the current thread as blocked on the lock: */
- ti->task->blocked_on = waiter;
+ /* Current thread can't be already blocked (since it's executing!) */
+ DEBUG_LOCKS_WARN_ON(__get_task_blocked_on(task));
}
-void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
- struct thread_info *ti)
+void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
+ struct task_struct *task)
{
+ struct mutex *blocked_on = __get_task_blocked_on(task);
+
DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list));
- DEBUG_LOCKS_WARN_ON(waiter->task != ti->task);
- DEBUG_LOCKS_WARN_ON(ti->task->blocked_on != waiter);
- ti->task->blocked_on = NULL;
+ DEBUG_LOCKS_WARN_ON(waiter->task != task);
+ DEBUG_LOCKS_WARN_ON(blocked_on && blocked_on != lock);
- list_del_init(&waiter->list);
+ INIT_LIST_HEAD(&waiter->list);
waiter->task = NULL;
}
@@ -73,36 +74,26 @@ void debug_mutex_unlock(struct mutex *lock)
{
if (likely(debug_locks)) {
DEBUG_LOCKS_WARN_ON(lock->magic != lock);
-
- if (!lock->owner)
- DEBUG_LOCKS_WARN_ON(!lock->owner);
- else
- DEBUG_LOCKS_WARN_ON(lock->owner != current);
-
DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
}
-
- /*
- * __mutex_slowpath_needs_to_unlock() is explicitly 0 for debug
- * mutexes so that we can do it here after we've verified state.
- */
- mutex_clear_owner(lock);
- atomic_set(&lock->count, 1);
}
-void debug_mutex_init(struct mutex *lock, const char *name,
- struct lock_class_key *key)
+void debug_mutex_init(struct mutex *lock)
{
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- /*
- * Make sure we are not reinitializing a held lock:
- */
- debug_check_no_locks_freed((void *)lock, sizeof(*lock));
- lockdep_init_map(&lock->dep_map, name, key, 0);
-#endif
lock->magic = lock;
}
+static void devm_mutex_release(void *res)
+{
+ mutex_destroy(res);
+}
+
+int __devm_mutex_init(struct device *dev, struct mutex *lock)
+{
+ return devm_add_action_or_reset(dev, devm_mutex_release, lock);
+}
+EXPORT_SYMBOL_GPL(__devm_mutex_init);
+
/***
* mutex_destroy - mark a mutex unusable
* @lock: the mutex to be destroyed
diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h
deleted file mode 100644
index 0799fd3e4cfa..000000000000
--- a/kernel/locking/mutex-debug.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Mutexes: blocking mutual exclusion locks
- *
- * started by Ingo Molnar:
- *
- * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
- *
- * This file contains mutex debugging related internal declarations,
- * prototypes and inline functions, for the CONFIG_DEBUG_MUTEXES case.
- * More details are in kernel/mutex-debug.c.
- */
-
-/*
- * This must be called with lock->wait_lock held.
- */
-extern void debug_mutex_lock_common(struct mutex *lock,
- struct mutex_waiter *waiter);
-extern void debug_mutex_wake_waiter(struct mutex *lock,
- struct mutex_waiter *waiter);
-extern void debug_mutex_free_waiter(struct mutex_waiter *waiter);
-extern void debug_mutex_add_waiter(struct mutex *lock,
- struct mutex_waiter *waiter,
- struct thread_info *ti);
-extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
- struct thread_info *ti);
-extern void debug_mutex_unlock(struct mutex *lock);
-extern void debug_mutex_init(struct mutex *lock, const char *name,
- struct lock_class_key *key);
-
-static inline void mutex_set_owner(struct mutex *lock)
-{
- lock->owner = current;
-}
-
-static inline void mutex_clear_owner(struct mutex *lock)
-{
- lock->owner = NULL;
-}
-
-#define spin_lock_mutex(lock, flags) \
- do { \
- struct mutex *l = container_of(lock, struct mutex, wait_lock); \
- \
- DEBUG_LOCKS_WARN_ON(in_interrupt()); \
- local_irq_save(flags); \
- arch_spin_lock(&(lock)->rlock.raw_lock);\
- DEBUG_LOCKS_WARN_ON(l->magic != l); \
- } while (0)
-
-#define spin_unlock_mutex(lock, flags) \
- do { \
- arch_spin_unlock(&(lock)->rlock.raw_lock); \
- local_irq_restore(flags); \
- preempt_check_resched(); \
- } while (0)
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 4cccea6b8934..2a1d165b3167 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kernel/locking/mutex.c
*
@@ -15,52 +16,241 @@
* by Steven Rostedt, based on work by Gregory Haskins, Peter Morreale
* and Sven Dietrich.
*
- * Also see Documentation/locking/mutex-design.txt.
+ * Also see Documentation/locking/mutex-design.rst.
*/
#include <linux/mutex.h>
#include <linux/ww_mutex.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
#include <linux/sched/rt.h>
+#include <linux/sched/wake_q.h>
+#include <linux/sched/debug.h>
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/interrupt.h>
#include <linux/debug_locks.h>
#include <linux/osq_lock.h>
+#include <linux/hung_task.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/lock.h>
+
+#ifndef CONFIG_PREEMPT_RT
+#include "mutex.h"
-/*
- * In the DEBUG case we are using the "NULL fastpath" for mutexes,
- * which forces all calls into the slowpath:
- */
#ifdef CONFIG_DEBUG_MUTEXES
-# include "mutex-debug.h"
-# include <asm-generic/mutex-null.h>
-/*
- * Must be 0 for the debug case so we do not do the unlock outside of the
- * wait_lock region. debug_mutex_unlock() will do the actual unlock in this
- * case.
- */
-# undef __mutex_slowpath_needs_to_unlock
-# define __mutex_slowpath_needs_to_unlock() 0
+# define MUTEX_WARN_ON(cond) DEBUG_LOCKS_WARN_ON(cond)
#else
-# include "mutex.h"
-# include <asm/mutex.h>
+# define MUTEX_WARN_ON(cond)
#endif
-void
-__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
+static void __mutex_init_generic(struct mutex *lock)
{
- atomic_set(&lock->count, 1);
- spin_lock_init(&lock->wait_lock);
+ atomic_long_set(&lock->owner, 0);
+ raw_spin_lock_init(&lock->wait_lock);
INIT_LIST_HEAD(&lock->wait_list);
- mutex_clear_owner(lock);
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
osq_lock_init(&lock->osq);
#endif
+ debug_mutex_init(lock);
+}
+
+static inline struct task_struct *__owner_task(unsigned long owner)
+{
+ return (struct task_struct *)(owner & ~MUTEX_FLAGS);
+}
+
+bool mutex_is_locked(struct mutex *lock)
+{
+ return __mutex_owner(lock) != NULL;
+}
+EXPORT_SYMBOL(mutex_is_locked);
+
+static inline unsigned long __owner_flags(unsigned long owner)
+{
+ return owner & MUTEX_FLAGS;
+}
+
+/* Do not use the return value as a pointer directly. */
+unsigned long mutex_get_owner(struct mutex *lock)
+{
+ unsigned long owner = atomic_long_read(&lock->owner);
+
+ return (unsigned long)__owner_task(owner);
+}
+
+/*
+ * Returns: __mutex_owner(lock) on failure or NULL on success.
+ */
+static inline struct task_struct *__mutex_trylock_common(struct mutex *lock, bool handoff)
+{
+ unsigned long owner, curr = (unsigned long)current;
+
+ owner = atomic_long_read(&lock->owner);
+ for (;;) { /* must loop, can race against a flag */
+ unsigned long flags = __owner_flags(owner);
+ unsigned long task = owner & ~MUTEX_FLAGS;
+
+ if (task) {
+ if (flags & MUTEX_FLAG_PICKUP) {
+ if (task != curr)
+ break;
+ flags &= ~MUTEX_FLAG_PICKUP;
+ } else if (handoff) {
+ if (flags & MUTEX_FLAG_HANDOFF)
+ break;
+ flags |= MUTEX_FLAG_HANDOFF;
+ } else {
+ break;
+ }
+ } else {
+ MUTEX_WARN_ON(flags & (MUTEX_FLAG_HANDOFF | MUTEX_FLAG_PICKUP));
+ task = curr;
+ }
+
+ if (atomic_long_try_cmpxchg_acquire(&lock->owner, &owner, task | flags)) {
+ if (task == curr)
+ return NULL;
+ break;
+ }
+ }
+
+ return __owner_task(owner);
+}
+
+/*
+ * Trylock or set HANDOFF
+ */
+static inline bool __mutex_trylock_or_handoff(struct mutex *lock, bool handoff)
+{
+ return !__mutex_trylock_common(lock, handoff);
+}
+
+/*
+ * Actual trylock that will work on any unlocked state.
+ */
+static inline bool __mutex_trylock(struct mutex *lock)
+{
+ return !__mutex_trylock_common(lock, false);
+}
- debug_mutex_init(lock, name, key);
+#ifndef CONFIG_DEBUG_LOCK_ALLOC
+/*
+ * Lockdep annotations are contained to the slow paths for simplicity.
+ * There is nothing that would stop spreading the lockdep annotations outwards
+ * except more code.
+ */
+void mutex_init_generic(struct mutex *lock)
+{
+ __mutex_init_generic(lock);
}
+EXPORT_SYMBOL(mutex_init_generic);
+
+/*
+ * Optimistic trylock that only works in the uncontended case. Make sure to
+ * follow with a __mutex_trylock() before failing.
+ */
+static __always_inline bool __mutex_trylock_fast(struct mutex *lock)
+{
+ unsigned long curr = (unsigned long)current;
+ unsigned long zero = 0UL;
+
+ MUTEX_WARN_ON(lock->magic != lock);
+
+ if (atomic_long_try_cmpxchg_acquire(&lock->owner, &zero, curr))
+ return true;
+
+ return false;
+}
+
+static __always_inline bool __mutex_unlock_fast(struct mutex *lock)
+{
+ unsigned long curr = (unsigned long)current;
+
+ return atomic_long_try_cmpxchg_release(&lock->owner, &curr, 0UL);
+}
+
+#else /* !CONFIG_DEBUG_LOCK_ALLOC */
+
+void mutex_init_lockep(struct mutex *lock, const char *name, struct lock_class_key *key)
+{
+ __mutex_init_generic(lock);
+
+ /*
+ * Make sure we are not reinitializing a held lock:
+ */
+ debug_check_no_locks_freed((void *)lock, sizeof(*lock));
+ lockdep_init_map_wait(&lock->dep_map, name, key, 0, LD_WAIT_SLEEP);
+}
+EXPORT_SYMBOL(mutex_init_lockep);
+#endif /* !CONFIG_DEBUG_LOCK_ALLOC */
+
+static inline void __mutex_set_flag(struct mutex *lock, unsigned long flag)
+{
+ atomic_long_or(flag, &lock->owner);
+}
+
+static inline void __mutex_clear_flag(struct mutex *lock, unsigned long flag)
+{
+ atomic_long_andnot(flag, &lock->owner);
+}
+
+static inline bool __mutex_waiter_is_first(struct mutex *lock, struct mutex_waiter *waiter)
+{
+ return list_first_entry(&lock->wait_list, struct mutex_waiter, list) == waiter;
+}
+
+/*
+ * Add @waiter to a given location in the lock wait_list and set the
+ * FLAG_WAITERS flag if it's the first waiter.
+ */
+static void
+__mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
+ struct list_head *list)
+{
+ hung_task_set_blocker(lock, BLOCKER_TYPE_MUTEX);
+ debug_mutex_add_waiter(lock, waiter, current);
+
+ list_add_tail(&waiter->list, list);
+ if (__mutex_waiter_is_first(lock, waiter))
+ __mutex_set_flag(lock, MUTEX_FLAG_WAITERS);
+}
+
+static void
+__mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter)
+{
+ list_del(&waiter->list);
+ if (likely(list_empty(&lock->wait_list)))
+ __mutex_clear_flag(lock, MUTEX_FLAGS);
+
+ debug_mutex_remove_waiter(lock, waiter, current);
+ hung_task_clear_blocker();
+}
+
+/*
+ * Give up ownership to a specific task, when @task = NULL, this is equivalent
+ * to a regular unlock. Sets PICKUP on a handoff, clears HANDOFF, preserves
+ * WAITERS. Provides RELEASE semantics like a regular unlock, the
+ * __mutex_trylock() provides a matching ACQUIRE semantics for the handoff.
+ */
+static void __mutex_handoff(struct mutex *lock, struct task_struct *task)
+{
+ unsigned long owner = atomic_long_read(&lock->owner);
+
+ for (;;) {
+ unsigned long new;
+
+ MUTEX_WARN_ON(__owner_task(owner) != current);
+ MUTEX_WARN_ON(owner & MUTEX_FLAG_PICKUP);
+
+ new = (owner & MUTEX_FLAG_WAITERS);
+ new |= (unsigned long)task;
+ if (task)
+ new |= MUTEX_FLAG_PICKUP;
-EXPORT_SYMBOL(__mutex_init);
+ if (atomic_long_try_cmpxchg_release(&lock->owner, &owner, new))
+ break;
+ }
+}
#ifndef CONFIG_DEBUG_LOCK_ALLOC
/*
@@ -69,7 +259,7 @@ EXPORT_SYMBOL(__mutex_init);
* We also put the fastpath first in the kernel image, to make sure the
* branch is predicted by the CPU as default-untaken.
*/
-__visible void __sched __mutex_lock_slowpath(atomic_t *lock_count);
+static void __sched __mutex_lock_slowpath(struct mutex *lock);
/**
* mutex_lock - acquire the mutex
@@ -86,164 +276,116 @@ __visible void __sched __mutex_lock_slowpath(atomic_t *lock_count);
* (or statically defined) before it can be locked. memset()-ing
* the mutex to 0 is not allowed.
*
- * ( The CONFIG_DEBUG_MUTEXES .config option turns on debugging
- * checks that will enforce the restrictions and will also do
- * deadlock debugging. )
+ * (The CONFIG_DEBUG_MUTEXES .config option turns on debugging
+ * checks that will enforce the restrictions and will also do
+ * deadlock debugging)
*
* This function is similar to (but not equivalent to) down().
*/
void __sched mutex_lock(struct mutex *lock)
{
might_sleep();
- /*
- * The locking fastpath is the 1->0 transition from
- * 'unlocked' into 'locked' state.
- */
- __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath);
- mutex_set_owner(lock);
-}
+ if (!__mutex_trylock_fast(lock))
+ __mutex_lock_slowpath(lock);
+}
EXPORT_SYMBOL(mutex_lock);
#endif
-static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
- struct ww_acquire_ctx *ww_ctx)
-{
-#ifdef CONFIG_DEBUG_MUTEXES
- /*
- * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
- * but released with a normal mutex_unlock in this call.
- *
- * This should never happen, always use ww_mutex_unlock.
- */
- DEBUG_LOCKS_WARN_ON(ww->ctx);
-
- /*
- * Not quite done after calling ww_acquire_done() ?
- */
- DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
-
- if (ww_ctx->contending_lock) {
- /*
- * After -EDEADLK you tried to
- * acquire a different ww_mutex? Bad!
- */
- DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
-
- /*
- * You called ww_mutex_lock after receiving -EDEADLK,
- * but 'forgot' to unlock everything else first?
- */
- DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
- ww_ctx->contending_lock = NULL;
- }
+#include "ww_mutex.h"
- /*
- * Naughty, using a different class will lead to undefined behavior!
- */
- DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
-#endif
- ww_ctx->acquired++;
-}
+#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
/*
- * After acquiring lock with fastpath or when we lost out in contested
- * slowpath, set ctx and wake up any waiters so they can recheck.
- *
- * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set,
- * as the fastpath and opportunistic spinning are disabled in that case.
+ * Trylock variant that returns the owning task on failure.
*/
-static __always_inline void
-ww_mutex_set_context_fastpath(struct ww_mutex *lock,
- struct ww_acquire_ctx *ctx)
+static inline struct task_struct *__mutex_trylock_or_owner(struct mutex *lock)
{
- unsigned long flags;
- struct mutex_waiter *cur;
+ return __mutex_trylock_common(lock, false);
+}
- ww_mutex_lock_acquired(lock, ctx);
+static inline
+bool ww_mutex_spin_on_owner(struct mutex *lock, struct ww_acquire_ctx *ww_ctx,
+ struct mutex_waiter *waiter)
+{
+ struct ww_mutex *ww;
- lock->ctx = ctx;
+ ww = container_of(lock, struct ww_mutex, base);
/*
- * The lock->ctx update should be visible on all cores before
- * the atomic read is done, otherwise contended waiters might be
- * missed. The contended waiters will either see ww_ctx == NULL
- * and keep spinning, or it will acquire wait_lock, add itself
- * to waiter list and sleep.
+ * If ww->ctx is set the contents are undefined, only
+ * by acquiring wait_lock there is a guarantee that
+ * they are not invalid when reading.
+ *
+ * As such, when deadlock detection needs to be
+ * performed the optimistic spinning cannot be done.
+ *
+ * Check this in every inner iteration because we may
+ * be racing against another thread's ww_mutex_lock.
*/
- smp_mb(); /* ^^^ */
+ if (ww_ctx->acquired > 0 && READ_ONCE(ww->ctx))
+ return false;
/*
- * Check if lock is contended, if not there is nobody to wake up
+ * If we aren't on the wait list yet, cancel the spin
+ * if there are waiters. We want to avoid stealing the
+ * lock from a waiter with an earlier stamp, since the
+ * other thread may already own a lock that we also
+ * need.
*/
- if (likely(atomic_read(&lock->base.count) == 0))
- return;
+ if (!waiter && (atomic_long_read(&lock->owner) & MUTEX_FLAG_WAITERS))
+ return false;
/*
- * Uh oh, we raced in fastpath, wake up everyone in this case,
- * so they can see the new lock->ctx.
+ * Similarly, stop spinning if we are no longer the
+ * first waiter.
*/
- spin_lock_mutex(&lock->base.wait_lock, flags);
- list_for_each_entry(cur, &lock->base.wait_list, list) {
- debug_mutex_wake_waiter(&lock->base, cur);
- wake_up_process(cur->task);
- }
- spin_unlock_mutex(&lock->base.wait_lock, flags);
-}
+ if (waiter && !__mutex_waiter_is_first(lock, waiter))
+ return false;
-/*
- * After acquiring lock in the slowpath set ctx and wake up any
- * waiters so they can recheck.
- *
- * Callers must hold the mutex wait_lock.
- */
-static __always_inline void
-ww_mutex_set_context_slowpath(struct ww_mutex *lock,
- struct ww_acquire_ctx *ctx)
-{
- struct mutex_waiter *cur;
-
- ww_mutex_lock_acquired(lock, ctx);
- lock->ctx = ctx;
-
- /*
- * Give any possible sleeping processes the chance to wake up,
- * so they can recheck if they have to back off.
- */
- list_for_each_entry(cur, &lock->base.wait_list, list) {
- debug_mutex_wake_waiter(&lock->base, cur);
- wake_up_process(cur->task);
- }
+ return true;
}
-#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
/*
- * Look out! "owner" is an entirely speculative pointer
- * access and not reliable.
+ * Look out! "owner" is an entirely speculative pointer access and not
+ * reliable.
+ *
+ * "noinline" so that this function shows up on perf profiles.
*/
static noinline
-bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
+bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner,
+ struct ww_acquire_ctx *ww_ctx, struct mutex_waiter *waiter)
{
bool ret = true;
- rcu_read_lock();
- while (lock->owner == owner) {
+ lockdep_assert_preemption_disabled();
+
+ while (__mutex_owner(lock) == owner) {
/*
* Ensure we emit the owner->on_cpu, dereference _after_
- * checking lock->owner still matches owner. If that fails,
- * owner might point to freed memory. If it still matches,
- * the rcu_read_lock() ensures the memory stays valid.
+ * checking lock->owner still matches owner. And we already
+ * disabled preemption which is equal to the RCU read-side
+ * crital section in optimistic spinning code. Thus the
+ * task_strcut structure won't go away during the spinning
+ * period
*/
barrier();
- if (!owner->on_cpu || need_resched()) {
+ /*
+ * Use vcpu_is_preempted to detect lock holder preemption issue.
+ */
+ if (!owner_on_cpu(owner) || need_resched()) {
+ ret = false;
+ break;
+ }
+
+ if (ww_ctx && !ww_mutex_spin_on_owner(lock, ww_ctx, waiter)) {
ret = false;
break;
}
- cpu_relax_lowlatency();
+ cpu_relax();
}
- rcu_read_unlock();
return ret;
}
@@ -256,31 +398,29 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock)
struct task_struct *owner;
int retval = 1;
+ lockdep_assert_preemption_disabled();
+
if (need_resched())
return 0;
- rcu_read_lock();
- owner = READ_ONCE(lock->owner);
+ /*
+ * We already disabled preemption which is equal to the RCU read-side
+ * crital section in optimistic spinning code. Thus the task_strcut
+ * structure won't go away during the spinning period.
+ */
+ owner = __mutex_owner(lock);
if (owner)
- retval = owner->on_cpu;
- rcu_read_unlock();
+ retval = owner_on_cpu(owner);
+
/*
- * if lock->owner is not set, the mutex owner may have just acquired
- * it and not set the owner yet or the mutex has been released.
+ * If lock->owner is not set, the mutex has been released. Return true
+ * such that we'll trylock in the spin path, which is a faster option
+ * than the blocking slow path.
*/
return retval;
}
/*
- * Atomically try to take the lock when it is available
- */
-static inline bool mutex_try_to_acquire(struct mutex *lock)
-{
- return !mutex_is_locked(lock) &&
- (atomic_cmpxchg(&lock->count, 1, 0) == 1);
-}
-
-/*
* Optimistic spinning.
*
* We try to spin for acquisition when we find that the lock owner
@@ -288,13 +428,6 @@ static inline bool mutex_try_to_acquire(struct mutex *lock)
* need to reschedule. The rationale is that if the lock owner is
* running, it is likely to release the lock soon.
*
- * Since this needs the lock owner, and this mutex implementation
- * doesn't track the owner atomically in the lock field, we need to
- * track it non-atomically.
- *
- * We can't do this for DEBUG_MUTEXES because that relies on wait_lock
- * to serialize everything.
- *
* The mutex spinners are queued up using MCS lock so that only one
* spinner can compete for the mutex. However, if mutex spinning isn't
* going to happen, there is no point in going through the lock/unlock
@@ -302,74 +435,50 @@ static inline bool mutex_try_to_acquire(struct mutex *lock)
*
* Returns true when the lock was taken, otherwise false, indicating
* that we need to jump to the slowpath and sleep.
+ *
+ * The waiter flag is set to true if the spinner is a waiter in the wait
+ * queue. The waiter-spinner will spin on the lock directly and concurrently
+ * with the spinner at the head of the OSQ, if present, until the owner is
+ * changed to itself.
*/
-static bool mutex_optimistic_spin(struct mutex *lock,
- struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx)
+static __always_inline bool
+mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx,
+ struct mutex_waiter *waiter)
{
- struct task_struct *task = current;
-
- if (!mutex_can_spin_on_owner(lock))
- goto done;
-
- /*
- * In order to avoid a stampede of mutex spinners trying to
- * acquire the mutex all at once, the spinners need to take a
- * MCS (queued) lock first before spinning on the owner field.
- */
- if (!osq_lock(&lock->osq))
- goto done;
-
- while (true) {
- struct task_struct *owner;
-
- if (use_ww_ctx && ww_ctx->acquired > 0) {
- struct ww_mutex *ww;
-
- ww = container_of(lock, struct ww_mutex, base);
- /*
- * If ww->ctx is set the contents are undefined, only
- * by acquiring wait_lock there is a guarantee that
- * they are not invalid when reading.
- *
- * As such, when deadlock detection needs to be
- * performed the optimistic spinning cannot be done.
- */
- if (READ_ONCE(ww->ctx))
- break;
- }
-
+ if (!waiter) {
/*
- * If there's an owner, wait for it to either
- * release the lock or go to sleep.
+ * The purpose of the mutex_can_spin_on_owner() function is
+ * to eliminate the overhead of osq_lock() and osq_unlock()
+ * in case spinning isn't possible. As a waiter-spinner
+ * is not going to take OSQ lock anyway, there is no need
+ * to call mutex_can_spin_on_owner().
*/
- owner = READ_ONCE(lock->owner);
- if (owner && !mutex_spin_on_owner(lock, owner))
- break;
+ if (!mutex_can_spin_on_owner(lock))
+ goto fail;
- /* Try to acquire the mutex if it is unlocked. */
- if (mutex_try_to_acquire(lock)) {
- lock_acquired(&lock->dep_map, ip);
-
- if (use_ww_ctx) {
- struct ww_mutex *ww;
- ww = container_of(lock, struct ww_mutex, base);
+ /*
+ * In order to avoid a stampede of mutex spinners trying to
+ * acquire the mutex all at once, the spinners need to take a
+ * MCS (queued) lock first before spinning on the owner field.
+ */
+ if (!osq_lock(&lock->osq))
+ goto fail;
+ }
- ww_mutex_set_context_fastpath(ww, ww_ctx);
- }
+ for (;;) {
+ struct task_struct *owner;
- mutex_set_owner(lock);
- osq_unlock(&lock->osq);
- return true;
- }
+ /* Try to acquire the mutex... */
+ owner = __mutex_trylock_or_owner(lock);
+ if (!owner)
+ break;
/*
- * When there's no owner, we might have preempted between the
- * owner acquiring the lock and setting the owner field. If
- * we're an RT task that will live-lock because we won't let
- * the owner complete.
+ * There's an owner, wait for it to either
+ * release the lock or go to sleep.
*/
- if (!owner && (need_resched() || rt_task(task)))
- break;
+ if (!mutex_spin_on_owner(lock, owner, ww_ctx, waiter))
+ goto fail_unlock;
/*
* The cpu_relax() call is a compiler barrier which forces
@@ -377,11 +486,20 @@ static bool mutex_optimistic_spin(struct mutex *lock,
* memory barriers as we'll eventually observe the right
* values at the cost of a few extra spins.
*/
- cpu_relax_lowlatency();
+ cpu_relax();
}
- osq_unlock(&lock->osq);
-done:
+ if (!waiter)
+ osq_unlock(&lock->osq);
+
+ return true;
+
+
+fail_unlock:
+ if (!waiter)
+ osq_unlock(&lock->osq);
+
+fail:
/*
* If we fell out of the spin path because of need_resched(),
* reschedule now, before we try-lock the mutex. This avoids getting
@@ -399,15 +517,15 @@ done:
return false;
}
#else
-static bool mutex_optimistic_spin(struct mutex *lock,
- struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx)
+static __always_inline bool
+mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx,
+ struct mutex_waiter *waiter)
{
return false;
}
#endif
-__visible __used noinline
-void __sched __mutex_unlock_slowpath(atomic_t *lock_count);
+static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigned long ip);
/**
* mutex_unlock - release the mutex
@@ -418,25 +536,21 @@ void __sched __mutex_unlock_slowpath(atomic_t *lock_count);
* This function must not be used in interrupt context. Unlocking
* of a not locked mutex is not allowed.
*
+ * The caller must ensure that the mutex stays alive until this function has
+ * returned - mutex_unlock() can NOT directly be used to release an object such
+ * that another concurrent task can free it.
+ * Mutexes are different from spinlocks & refcounts in this aspect.
+ *
* This function is similar to (but not equivalent to) up().
*/
void __sched mutex_unlock(struct mutex *lock)
{
- /*
- * The unlocking fastpath is the 0->1 transition from 'locked'
- * into 'unlocked' state:
- */
-#ifndef CONFIG_DEBUG_MUTEXES
- /*
- * When debugging is enabled we must not clear the owner before time,
- * the slow path will always be taken, and that clears the owner field
- * after verifying that it was indeed current.
- */
- mutex_clear_owner(lock);
+#ifndef CONFIG_DEBUG_LOCK_ALLOC
+ if (__mutex_unlock_fast(lock))
+ return;
#endif
- __mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath);
+ __mutex_unlock_slowpath(lock, _RET_IP_);
}
-
EXPORT_SYMBOL(mutex_unlock);
/**
@@ -452,170 +566,266 @@ EXPORT_SYMBOL(mutex_unlock);
*/
void __sched ww_mutex_unlock(struct ww_mutex *lock)
{
- /*
- * The unlocking fastpath is the 0->1 transition from 'locked'
- * into 'unlocked' state:
- */
- if (lock->ctx) {
-#ifdef CONFIG_DEBUG_MUTEXES
- DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired);
-#endif
- if (lock->ctx->acquired > 0)
- lock->ctx->acquired--;
- lock->ctx = NULL;
- }
-
-#ifndef CONFIG_DEBUG_MUTEXES
- /*
- * When debugging is enabled we must not clear the owner before time,
- * the slow path will always be taken, and that clears the owner field
- * after verifying that it was indeed current.
- */
- mutex_clear_owner(&lock->base);
-#endif
- __mutex_fastpath_unlock(&lock->base.count, __mutex_unlock_slowpath);
+ __ww_mutex_unlock(lock);
+ mutex_unlock(&lock->base);
}
EXPORT_SYMBOL(ww_mutex_unlock);
-static inline int __sched
-__ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx)
-{
- struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
- struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx);
-
- if (!hold_ctx)
- return 0;
-
- if (unlikely(ctx == hold_ctx))
- return -EALREADY;
-
- if (ctx->stamp - hold_ctx->stamp <= LONG_MAX &&
- (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) {
-#ifdef CONFIG_DEBUG_MUTEXES
- DEBUG_LOCKS_WARN_ON(ctx->contending_lock);
- ctx->contending_lock = ww;
-#endif
- return -EDEADLK;
- }
-
- return 0;
-}
-
/*
* Lock a mutex (possibly interruptible), slowpath:
*/
static __always_inline int __sched
-__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
+__mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclass,
struct lockdep_map *nest_lock, unsigned long ip,
struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx)
{
- struct task_struct *task = current;
+ DEFINE_WAKE_Q(wake_q);
struct mutex_waiter waiter;
+ struct ww_mutex *ww;
unsigned long flags;
int ret;
+ if (!use_ww_ctx)
+ ww_ctx = NULL;
+
+ might_sleep();
+
+ MUTEX_WARN_ON(lock->magic != lock);
+
+ ww = container_of(lock, struct ww_mutex, base);
+ if (ww_ctx) {
+ if (unlikely(ww_ctx == READ_ONCE(ww->ctx)))
+ return -EALREADY;
+
+ /*
+ * Reset the wounded flag after a kill. No other process can
+ * race and wound us here since they can't have a valid owner
+ * pointer if we don't have any locks held.
+ */
+ if (ww_ctx->acquired == 0)
+ ww_ctx->wounded = 0;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ nest_lock = &ww_ctx->dep_map;
+#endif
+ }
+
preempt_disable();
mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
- if (mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx)) {
+ trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN);
+ if (__mutex_trylock(lock) ||
+ mutex_optimistic_spin(lock, ww_ctx, NULL)) {
/* got the lock, yay! */
+ lock_acquired(&lock->dep_map, ip);
+ if (ww_ctx)
+ ww_mutex_set_context_fastpath(ww, ww_ctx);
+ trace_contention_end(lock, 0);
preempt_enable();
return 0;
}
- spin_lock_mutex(&lock->wait_lock, flags);
-
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
/*
- * Once more, try to acquire the lock. Only try-lock the mutex if
- * it is unlocked to reduce unnecessary xchg() operations.
+ * After waiting to acquire the wait_lock, try again.
*/
- if (!mutex_is_locked(lock) && (atomic_xchg(&lock->count, 0) == 1))
+ if (__mutex_trylock(lock)) {
+ if (ww_ctx)
+ __ww_mutex_check_waiters(lock, ww_ctx, &wake_q);
+
goto skip_wait;
+ }
debug_mutex_lock_common(lock, &waiter);
- debug_mutex_add_waiter(lock, &waiter, task_thread_info(task));
-
- /* add waiting tasks to the end of the waitqueue (FIFO): */
- list_add_tail(&waiter.list, &lock->wait_list);
- waiter.task = task;
+ waiter.task = current;
+ if (use_ww_ctx)
+ waiter.ww_ctx = ww_ctx;
lock_contended(&lock->dep_map, ip);
+ if (!use_ww_ctx) {
+ /* add waiting tasks to the end of the waitqueue (FIFO): */
+ __mutex_add_waiter(lock, &waiter, &lock->wait_list);
+ } else {
+ /*
+ * Add in stamp order, waking up waiters that must kill
+ * themselves.
+ */
+ ret = __ww_mutex_add_waiter(&waiter, lock, ww_ctx, &wake_q);
+ if (ret)
+ goto err_early_kill;
+ }
+
+ __set_task_blocked_on(current, lock);
+ set_current_state(state);
+ trace_contention_begin(lock, LCB_F_MUTEX);
for (;;) {
+ bool first;
+
/*
- * Lets try to take the lock again - this is needed even if
- * we get here for the first time (shortly after failing to
- * acquire the lock), to make sure that we get a wakeup once
- * it's unlocked. Later on, if we sleep, this is the
- * operation that gives us the lock. We xchg it to -1, so
- * that when we release the lock, we properly wake up the
- * other waiters. We only attempt the xchg if the count is
- * non-negative in order to avoid unnecessary xchg operations:
+ * Once we hold wait_lock, we're serialized against
+ * mutex_unlock() handing the lock off to us, do a trylock
+ * before testing the error conditions to make sure we pick up
+ * the handoff.
*/
- if (atomic_read(&lock->count) >= 0 &&
- (atomic_xchg(&lock->count, -1) == 1))
- break;
+ if (__mutex_trylock(lock))
+ goto acquired;
/*
- * got a signal? (This code gets eliminated in the
- * TASK_UNINTERRUPTIBLE case.)
+ * Check for signals and kill conditions while holding
+ * wait_lock. This ensures the lock cancellation is ordered
+ * against mutex_unlock() and wake-ups do not go missing.
*/
- if (unlikely(signal_pending_state(state, task))) {
+ if (signal_pending_state(state, current)) {
ret = -EINTR;
goto err;
}
- if (use_ww_ctx && ww_ctx->acquired > 0) {
- ret = __ww_mutex_lock_check_stamp(lock, ww_ctx);
+ if (ww_ctx) {
+ ret = __ww_mutex_check_kill(lock, &waiter, ww_ctx);
if (ret)
goto err;
}
- __set_task_state(task, state);
+ raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q);
- /* didn't get the lock, go to sleep: */
- spin_unlock_mutex(&lock->wait_lock, flags);
schedule_preempt_disabled();
- spin_lock_mutex(&lock->wait_lock, flags);
+
+ first = __mutex_waiter_is_first(lock, &waiter);
+
+ /*
+ * As we likely have been woken up by task
+ * that has cleared our blocked_on state, re-set
+ * it to the lock we are trying to acquire.
+ */
+ set_task_blocked_on(current, lock);
+ set_current_state(state);
+ /*
+ * Here we order against unlock; we must either see it change
+ * state back to RUNNING and fall through the next schedule(),
+ * or we must see its unlock and acquire.
+ */
+ if (__mutex_trylock_or_handoff(lock, first))
+ break;
+
+ if (first) {
+ trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN);
+ /*
+ * mutex_optimistic_spin() can call schedule(), so
+ * clear blocked on so we don't become unselectable
+ * to run.
+ */
+ clear_task_blocked_on(current, lock);
+ if (mutex_optimistic_spin(lock, ww_ctx, &waiter))
+ break;
+ set_task_blocked_on(current, lock);
+ trace_contention_begin(lock, LCB_F_MUTEX);
+ }
+
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
}
- __set_task_state(task, TASK_RUNNING);
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
+acquired:
+ __clear_task_blocked_on(current, lock);
+ __set_current_state(TASK_RUNNING);
+
+ if (ww_ctx) {
+ /*
+ * Wound-Wait; we stole the lock (!first_waiter), check the
+ * waiters as anyone might want to wound us.
+ */
+ if (!ww_ctx->is_wait_die &&
+ !__mutex_waiter_is_first(lock, &waiter))
+ __ww_mutex_check_waiters(lock, ww_ctx, &wake_q);
+ }
+
+ __mutex_remove_waiter(lock, &waiter);
- mutex_remove_waiter(lock, &waiter, current_thread_info());
- /* set it to 0 if there are no waiters left: */
- if (likely(list_empty(&lock->wait_list)))
- atomic_set(&lock->count, 0);
debug_mutex_free_waiter(&waiter);
skip_wait:
/* got the lock - cleanup and rejoice! */
lock_acquired(&lock->dep_map, ip);
- mutex_set_owner(lock);
+ trace_contention_end(lock, 0);
- if (use_ww_ctx) {
- struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
- ww_mutex_set_context_slowpath(ww, ww_ctx);
- }
+ if (ww_ctx)
+ ww_mutex_lock_acquired(ww, ww_ctx);
- spin_unlock_mutex(&lock->wait_lock, flags);
+ raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q);
preempt_enable();
return 0;
err:
- mutex_remove_waiter(lock, &waiter, task_thread_info(task));
- spin_unlock_mutex(&lock->wait_lock, flags);
+ __clear_task_blocked_on(current, lock);
+ __set_current_state(TASK_RUNNING);
+ __mutex_remove_waiter(lock, &waiter);
+err_early_kill:
+ WARN_ON(__get_task_blocked_on(current));
+ trace_contention_end(lock, ret);
+ raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q);
debug_mutex_free_waiter(&waiter);
- mutex_release(&lock->dep_map, 1, ip);
+ mutex_release(&lock->dep_map, ip);
preempt_enable();
return ret;
}
+static int __sched
+__mutex_lock(struct mutex *lock, unsigned int state, unsigned int subclass,
+ struct lockdep_map *nest_lock, unsigned long ip)
+{
+ return __mutex_lock_common(lock, state, subclass, nest_lock, ip, NULL, false);
+}
+
+static int __sched
+__ww_mutex_lock(struct mutex *lock, unsigned int state, unsigned int subclass,
+ unsigned long ip, struct ww_acquire_ctx *ww_ctx)
+{
+ return __mutex_lock_common(lock, state, subclass, NULL, ip, ww_ctx, true);
+}
+
+/**
+ * ww_mutex_trylock - tries to acquire the w/w mutex with optional acquire context
+ * @ww: mutex to lock
+ * @ww_ctx: optional w/w acquire context
+ *
+ * Trylocks a mutex with the optional acquire context; no deadlock detection is
+ * possible. Returns 1 if the mutex has been acquired successfully, 0 otherwise.
+ *
+ * Unlike ww_mutex_lock, no deadlock handling is performed. However, if a @ctx is
+ * specified, -EALREADY handling may happen in calls to ww_mutex_trylock.
+ *
+ * A mutex acquired with this function must be released with ww_mutex_unlock.
+ */
+int ww_mutex_trylock(struct ww_mutex *ww, struct ww_acquire_ctx *ww_ctx)
+{
+ if (!ww_ctx)
+ return mutex_trylock(&ww->base);
+
+ MUTEX_WARN_ON(ww->base.magic != &ww->base);
+
+ /*
+ * Reset the wounded flag after a kill. No other process can
+ * race and wound us here, since they can't have a valid owner
+ * pointer if we don't have any locks held.
+ */
+ if (ww_ctx->acquired == 0)
+ ww_ctx->wounded = 0;
+
+ if (__mutex_trylock(&ww->base)) {
+ ww_mutex_set_context_fastpath(ww, ww_ctx);
+ mutex_acquire_nest(&ww->base.dep_map, 0, 1, &ww_ctx->dep_map, _RET_IP_);
+ return 1;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(ww_mutex_trylock);
+
#ifdef CONFIG_DEBUG_LOCK_ALLOC
void __sched
mutex_lock_nested(struct mutex *lock, unsigned int subclass)
{
- might_sleep();
- __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE,
- subclass, NULL, _RET_IP_, NULL, 0);
+ __mutex_lock(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_);
}
EXPORT_SYMBOL_GPL(mutex_lock_nested);
@@ -623,32 +833,39 @@ EXPORT_SYMBOL_GPL(mutex_lock_nested);
void __sched
_mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
{
- might_sleep();
- __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE,
- 0, nest, _RET_IP_, NULL, 0);
+ __mutex_lock(lock, TASK_UNINTERRUPTIBLE, 0, nest, _RET_IP_);
}
-
EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock);
int __sched
-mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass)
+_mutex_lock_killable(struct mutex *lock, unsigned int subclass,
+ struct lockdep_map *nest)
{
- might_sleep();
- return __mutex_lock_common(lock, TASK_KILLABLE,
- subclass, NULL, _RET_IP_, NULL, 0);
+ return __mutex_lock(lock, TASK_KILLABLE, subclass, nest, _RET_IP_);
}
-EXPORT_SYMBOL_GPL(mutex_lock_killable_nested);
+EXPORT_SYMBOL_GPL(_mutex_lock_killable);
int __sched
mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
{
- might_sleep();
- return __mutex_lock_common(lock, TASK_INTERRUPTIBLE,
- subclass, NULL, _RET_IP_, NULL, 0);
+ return __mutex_lock(lock, TASK_INTERRUPTIBLE, subclass, NULL, _RET_IP_);
}
-
EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
+void __sched
+mutex_lock_io_nested(struct mutex *lock, unsigned int subclass)
+{
+ int token;
+
+ might_sleep();
+
+ token = io_schedule_prepare();
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE,
+ subclass, NULL, _RET_IP_, NULL, 0);
+ io_schedule_finish(token);
+}
+EXPORT_SYMBOL_GPL(mutex_lock_io_nested);
+
static inline int
ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
{
@@ -676,88 +893,92 @@ ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
}
int __sched
-__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
{
int ret;
might_sleep();
- ret = __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE,
- 0, &ctx->dep_map, _RET_IP_, ctx, 1);
- if (!ret && ctx->acquired > 1)
+ ret = __ww_mutex_lock(&lock->base, TASK_UNINTERRUPTIBLE,
+ 0, _RET_IP_, ctx);
+ if (!ret && ctx && ctx->acquired > 1)
return ww_mutex_deadlock_injection(lock, ctx);
return ret;
}
-EXPORT_SYMBOL_GPL(__ww_mutex_lock);
+EXPORT_SYMBOL_GPL(ww_mutex_lock);
int __sched
-__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
{
int ret;
might_sleep();
- ret = __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE,
- 0, &ctx->dep_map, _RET_IP_, ctx, 1);
+ ret = __ww_mutex_lock(&lock->base, TASK_INTERRUPTIBLE,
+ 0, _RET_IP_, ctx);
- if (!ret && ctx->acquired > 1)
+ if (!ret && ctx && ctx->acquired > 1)
return ww_mutex_deadlock_injection(lock, ctx);
return ret;
}
-EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible);
+EXPORT_SYMBOL_GPL(ww_mutex_lock_interruptible);
#endif
/*
* Release the lock, slowpath:
*/
-static inline void
-__mutex_unlock_common_slowpath(struct mutex *lock, int nested)
+static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigned long ip)
{
+ struct task_struct *next = NULL;
+ DEFINE_WAKE_Q(wake_q);
+ unsigned long owner;
unsigned long flags;
+ mutex_release(&lock->dep_map, ip);
+
/*
- * As a performance measurement, release the lock before doing other
- * wakeup related duties to follow. This allows other tasks to acquire
- * the lock sooner, while still handling cleanups in past unlock calls.
- * This can be done as we do not enforce strict equivalence between the
- * mutex counter and wait_list.
- *
+ * Release the lock before (potentially) taking the spinlock such that
+ * other contenders can get on with things ASAP.
*
- * Some architectures leave the lock unlocked in the fastpath failure
- * case, others need to leave it locked. In the later case we have to
- * unlock it here - as the lock counter is currently 0 or negative.
+ * Except when HANDOFF, in that case we must not clear the owner field,
+ * but instead set it to the top waiter.
*/
- if (__mutex_slowpath_needs_to_unlock())
- atomic_set(&lock->count, 1);
+ owner = atomic_long_read(&lock->owner);
+ for (;;) {
+ MUTEX_WARN_ON(__owner_task(owner) != current);
+ MUTEX_WARN_ON(owner & MUTEX_FLAG_PICKUP);
- spin_lock_mutex(&lock->wait_lock, flags);
- mutex_release(&lock->dep_map, nested, _RET_IP_);
- debug_mutex_unlock(lock);
+ if (owner & MUTEX_FLAG_HANDOFF)
+ break;
+ if (atomic_long_try_cmpxchg_release(&lock->owner, &owner, __owner_flags(owner))) {
+ if (owner & MUTEX_FLAG_WAITERS)
+ break;
+
+ return;
+ }
+ }
+
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
+ debug_mutex_unlock(lock);
if (!list_empty(&lock->wait_list)) {
/* get the first entry from the wait-list: */
struct mutex_waiter *waiter =
- list_entry(lock->wait_list.next,
- struct mutex_waiter, list);
+ list_first_entry(&lock->wait_list,
+ struct mutex_waiter, list);
- debug_mutex_wake_waiter(lock, waiter);
+ next = waiter->task;
- wake_up_process(waiter->task);
+ debug_mutex_wake_waiter(lock, waiter);
+ __clear_task_blocked_on(next, lock);
+ wake_q_add(&wake_q, next);
}
- spin_unlock_mutex(&lock->wait_lock, flags);
-}
+ if (owner & MUTEX_FLAG_HANDOFF)
+ __mutex_handoff(lock, next);
-/*
- * Release the lock, slowpath:
- */
-__visible void
-__mutex_unlock_slowpath(atomic_t *lock_count)
-{
- struct mutex *lock = container_of(lock_count, struct mutex, count);
-
- __mutex_unlock_common_slowpath(lock, 1);
+ raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q);
}
#ifndef CONFIG_DEBUG_LOCK_ALLOC
@@ -772,116 +993,108 @@ static noinline int __sched
__mutex_lock_interruptible_slowpath(struct mutex *lock);
/**
- * mutex_lock_interruptible - acquire the mutex, interruptible
- * @lock: the mutex to be acquired
+ * mutex_lock_interruptible() - Acquire the mutex, interruptible by signals.
+ * @lock: The mutex to be acquired.
*
- * Lock the mutex like mutex_lock(), and return 0 if the mutex has
- * been acquired or sleep until the mutex becomes available. If a
- * signal arrives while waiting for the lock then this function
- * returns -EINTR.
+ * Lock the mutex like mutex_lock(). If a signal is delivered while the
+ * process is sleeping, this function will return without acquiring the
+ * mutex.
*
- * This function is similar to (but not equivalent to) down_interruptible().
+ * Context: Process context.
+ * Return: 0 if the lock was successfully acquired or %-EINTR if a
+ * signal arrived.
*/
int __sched mutex_lock_interruptible(struct mutex *lock)
{
- int ret;
-
might_sleep();
- ret = __mutex_fastpath_lock_retval(&lock->count);
- if (likely(!ret)) {
- mutex_set_owner(lock);
+
+ if (__mutex_trylock_fast(lock))
return 0;
- } else
- return __mutex_lock_interruptible_slowpath(lock);
+
+ return __mutex_lock_interruptible_slowpath(lock);
}
EXPORT_SYMBOL(mutex_lock_interruptible);
+/**
+ * mutex_lock_killable() - Acquire the mutex, interruptible by fatal signals.
+ * @lock: The mutex to be acquired.
+ *
+ * Lock the mutex like mutex_lock(). If a signal which will be fatal to
+ * the current process is delivered while the process is sleeping, this
+ * function will return without acquiring the mutex.
+ *
+ * Context: Process context.
+ * Return: 0 if the lock was successfully acquired or %-EINTR if a
+ * fatal signal arrived.
+ */
int __sched mutex_lock_killable(struct mutex *lock)
{
- int ret;
-
might_sleep();
- ret = __mutex_fastpath_lock_retval(&lock->count);
- if (likely(!ret)) {
- mutex_set_owner(lock);
+
+ if (__mutex_trylock_fast(lock))
return 0;
- } else
- return __mutex_lock_killable_slowpath(lock);
+
+ return __mutex_lock_killable_slowpath(lock);
}
EXPORT_SYMBOL(mutex_lock_killable);
-__visible void __sched
-__mutex_lock_slowpath(atomic_t *lock_count)
+/**
+ * mutex_lock_io() - Acquire the mutex and mark the process as waiting for I/O
+ * @lock: The mutex to be acquired.
+ *
+ * Lock the mutex like mutex_lock(). While the task is waiting for this
+ * mutex, it will be accounted as being in the IO wait state by the
+ * scheduler.
+ *
+ * Context: Process context.
+ */
+void __sched mutex_lock_io(struct mutex *lock)
{
- struct mutex *lock = container_of(lock_count, struct mutex, count);
+ int token;
+
+ token = io_schedule_prepare();
+ mutex_lock(lock);
+ io_schedule_finish(token);
+}
+EXPORT_SYMBOL_GPL(mutex_lock_io);
- __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0,
- NULL, _RET_IP_, NULL, 0);
+static noinline void __sched
+__mutex_lock_slowpath(struct mutex *lock)
+{
+ __mutex_lock(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_);
}
static noinline int __sched
__mutex_lock_killable_slowpath(struct mutex *lock)
{
- return __mutex_lock_common(lock, TASK_KILLABLE, 0,
- NULL, _RET_IP_, NULL, 0);
+ return __mutex_lock(lock, TASK_KILLABLE, 0, NULL, _RET_IP_);
}
static noinline int __sched
__mutex_lock_interruptible_slowpath(struct mutex *lock)
{
- return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0,
- NULL, _RET_IP_, NULL, 0);
+ return __mutex_lock(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_);
}
static noinline int __sched
__ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
{
- return __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, 0,
- NULL, _RET_IP_, ctx, 1);
+ return __ww_mutex_lock(&lock->base, TASK_UNINTERRUPTIBLE, 0,
+ _RET_IP_, ctx);
}
static noinline int __sched
__ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock,
struct ww_acquire_ctx *ctx)
{
- return __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, 0,
- NULL, _RET_IP_, ctx, 1);
+ return __ww_mutex_lock(&lock->base, TASK_INTERRUPTIBLE, 0,
+ _RET_IP_, ctx);
}
#endif
-/*
- * Spinlock based trylock, we take the spinlock and check whether we
- * can get the lock:
- */
-static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
-{
- struct mutex *lock = container_of(lock_count, struct mutex, count);
- unsigned long flags;
- int prev;
-
- /* No need to trylock if the mutex is locked. */
- if (mutex_is_locked(lock))
- return 0;
-
- spin_lock_mutex(&lock->wait_lock, flags);
-
- prev = atomic_xchg(&lock->count, -1);
- if (likely(prev == 1)) {
- mutex_set_owner(lock);
- mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
- }
-
- /* Set it back to 0 if there are no waiters: */
- if (likely(list_empty(&lock->wait_list)))
- atomic_set(&lock->count, 0);
-
- spin_unlock_mutex(&lock->wait_lock, flags);
-
- return prev == 1;
-}
-
+#ifndef CONFIG_DEBUG_LOCK_ALLOC
/**
* mutex_trylock - try to acquire the mutex, without waiting
* @lock: the mutex to be acquired
@@ -898,54 +1111,61 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
*/
int __sched mutex_trylock(struct mutex *lock)
{
- int ret;
+ MUTEX_WARN_ON(lock->magic != lock);
+ return __mutex_trylock(lock);
+}
+EXPORT_SYMBOL(mutex_trylock);
+#else
+int __sched _mutex_trylock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock)
+{
+ bool locked;
- ret = __mutex_fastpath_trylock(&lock->count, __mutex_trylock_slowpath);
- if (ret)
- mutex_set_owner(lock);
+ MUTEX_WARN_ON(lock->magic != lock);
+ locked = __mutex_trylock(lock);
+ if (locked)
+ mutex_acquire_nest(&lock->dep_map, 0, 1, nest_lock, _RET_IP_);
- return ret;
+ return locked;
}
-EXPORT_SYMBOL(mutex_trylock);
+EXPORT_SYMBOL(_mutex_trylock_nest_lock);
+#endif
#ifndef CONFIG_DEBUG_LOCK_ALLOC
int __sched
-__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
{
- int ret;
-
might_sleep();
- ret = __mutex_fastpath_lock_retval(&lock->base.count);
+ if (__mutex_trylock_fast(&lock->base)) {
+ if (ctx)
+ ww_mutex_set_context_fastpath(lock, ctx);
+ return 0;
+ }
- if (likely(!ret)) {
- ww_mutex_set_context_fastpath(lock, ctx);
- mutex_set_owner(&lock->base);
- } else
- ret = __ww_mutex_lock_slowpath(lock, ctx);
- return ret;
+ return __ww_mutex_lock_slowpath(lock, ctx);
}
-EXPORT_SYMBOL(__ww_mutex_lock);
+EXPORT_SYMBOL(ww_mutex_lock);
int __sched
-__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
{
- int ret;
-
might_sleep();
- ret = __mutex_fastpath_lock_retval(&lock->base.count);
+ if (__mutex_trylock_fast(&lock->base)) {
+ if (ctx)
+ ww_mutex_set_context_fastpath(lock, ctx);
+ return 0;
+ }
- if (likely(!ret)) {
- ww_mutex_set_context_fastpath(lock, ctx);
- mutex_set_owner(&lock->base);
- } else
- ret = __ww_mutex_lock_interruptible_slowpath(lock, ctx);
- return ret;
+ return __ww_mutex_lock_interruptible_slowpath(lock, ctx);
}
-EXPORT_SYMBOL(__ww_mutex_lock_interruptible);
+EXPORT_SYMBOL(ww_mutex_lock_interruptible);
-#endif
+#endif /* !CONFIG_DEBUG_LOCK_ALLOC */
+#endif /* !CONFIG_PREEMPT_RT */
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(contention_begin);
+EXPORT_TRACEPOINT_SYMBOL_GPL(contention_end);
/**
* atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h
index 5cda397607f2..9ad4da8cea00 100644
--- a/kernel/locking/mutex.h
+++ b/kernel/locking/mutex.h
@@ -1,48 +1,72 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Mutexes: blocking mutual exclusion locks
*
* started by Ingo Molnar:
*
* Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
- *
- * This file contains mutex debugging related internal prototypes, for the
- * !CONFIG_DEBUG_MUTEXES case. Most of them are NOPs:
*/
+#ifndef CONFIG_PREEMPT_RT
+/*
+ * This is the control structure for tasks blocked on mutex, which resides
+ * on the blocked task's kernel stack:
+ */
+struct mutex_waiter {
+ struct list_head list;
+ struct task_struct *task;
+ struct ww_acquire_ctx *ww_ctx;
+#ifdef CONFIG_DEBUG_MUTEXES
+ void *magic;
+#endif
+};
-#define spin_lock_mutex(lock, flags) \
- do { spin_lock(lock); (void)(flags); } while (0)
-#define spin_unlock_mutex(lock, flags) \
- do { spin_unlock(lock); (void)(flags); } while (0)
-#define mutex_remove_waiter(lock, waiter, ti) \
- __list_del((waiter)->list.prev, (waiter)->list.next)
+/*
+ * @owner: contains: 'struct task_struct *' to the current lock owner,
+ * NULL means not owned. Since task_struct pointers are aligned at
+ * at least L1_CACHE_BYTES, we have low bits to store extra state.
+ *
+ * Bit0 indicates a non-empty waiter list; unlock must issue a wakeup.
+ * Bit1 indicates unlock needs to hand the lock to the top-waiter
+ * Bit2 indicates handoff has been done and we're waiting for pickup.
+ */
+#define MUTEX_FLAG_WAITERS 0x01
+#define MUTEX_FLAG_HANDOFF 0x02
+#define MUTEX_FLAG_PICKUP 0x04
-#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
-static inline void mutex_set_owner(struct mutex *lock)
-{
- lock->owner = current;
-}
+#define MUTEX_FLAGS 0x07
-static inline void mutex_clear_owner(struct mutex *lock)
-{
- lock->owner = NULL;
-}
-#else
-static inline void mutex_set_owner(struct mutex *lock)
-{
-}
-
-static inline void mutex_clear_owner(struct mutex *lock)
+/*
+ * Internal helper function; C doesn't allow us to hide it :/
+ *
+ * DO NOT USE (outside of mutex & scheduler code).
+ */
+static inline struct task_struct *__mutex_owner(struct mutex *lock)
{
+ if (!lock)
+ return NULL;
+ return (struct task_struct *)(atomic_long_read(&lock->owner) & ~MUTEX_FLAGS);
}
-#endif
-
-#define debug_mutex_wake_waiter(lock, waiter) do { } while (0)
-#define debug_mutex_free_waiter(waiter) do { } while (0)
-#define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0)
-#define debug_mutex_unlock(lock) do { } while (0)
-#define debug_mutex_init(lock, name, key) do { } while (0)
-static inline void
-debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter)
-{
-}
+#ifdef CONFIG_DEBUG_MUTEXES
+extern void debug_mutex_lock_common(struct mutex *lock,
+ struct mutex_waiter *waiter);
+extern void debug_mutex_wake_waiter(struct mutex *lock,
+ struct mutex_waiter *waiter);
+extern void debug_mutex_free_waiter(struct mutex_waiter *waiter);
+extern void debug_mutex_add_waiter(struct mutex *lock,
+ struct mutex_waiter *waiter,
+ struct task_struct *task);
+extern void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
+ struct task_struct *task);
+extern void debug_mutex_unlock(struct mutex *lock);
+extern void debug_mutex_init(struct mutex *lock);
+#else /* CONFIG_DEBUG_MUTEXES */
+# define debug_mutex_lock_common(lock, waiter) do { } while (0)
+# define debug_mutex_wake_waiter(lock, waiter) do { } while (0)
+# define debug_mutex_free_waiter(waiter) do { } while (0)
+# define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0)
+# define debug_mutex_remove_waiter(lock, waiter, ti) do { } while (0)
+# define debug_mutex_unlock(lock) do { } while (0)
+# define debug_mutex_init(lock) do { } while (0)
+#endif /* !CONFIG_DEBUG_MUTEXES */
+#endif /* CONFIG_PREEMPT_RT */
diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
index dc85ee23a26f..b4233dc2c2b0 100644
--- a/kernel/locking/osq_lock.c
+++ b/kernel/locking/osq_lock.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/osq_lock.h>
@@ -10,6 +11,13 @@
* called from interrupt context and we have preemption disabled while
* spinning.
*/
+
+struct optimistic_spin_node {
+ struct optimistic_spin_node *next, *prev;
+ int locked; /* 1 if lock acquired */
+ int cpu; /* encoded CPU # + 1 value */
+};
+
static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node);
/*
@@ -21,6 +29,11 @@ static inline int encode_cpu(int cpu_nr)
return cpu_nr + 1;
}
+static inline int node_cpu(struct optimistic_spin_node *node)
+{
+ return node->cpu - 1;
+}
+
static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val)
{
int cpu_nr = encoded_cpu_val - 1;
@@ -31,32 +44,28 @@ static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val)
/*
* Get a stable @node->next pointer, either for unlock() or unqueue() purposes.
* Can return NULL in case we were the last queued and we updated @lock instead.
+ *
+ * If osq_lock() is being cancelled there must be a previous node
+ * and 'old_cpu' is its CPU #.
+ * For osq_unlock() there is never a previous node and old_cpu is
+ * set to OSQ_UNLOCKED_VAL.
*/
static inline struct optimistic_spin_node *
osq_wait_next(struct optimistic_spin_queue *lock,
struct optimistic_spin_node *node,
- struct optimistic_spin_node *prev)
+ int old_cpu)
{
- struct optimistic_spin_node *next = NULL;
int curr = encode_cpu(smp_processor_id());
- int old;
-
- /*
- * If there is a prev node in queue, then the 'old' value will be
- * the prev node's CPU #, else it's set to OSQ_UNLOCKED_VAL since if
- * we're currently last in queue, then the queue will then become empty.
- */
- old = prev ? prev->cpu : OSQ_UNLOCKED_VAL;
for (;;) {
if (atomic_read(&lock->tail) == curr &&
- atomic_cmpxchg(&lock->tail, curr, old) == curr) {
+ atomic_cmpxchg_acquire(&lock->tail, curr, old_cpu) == curr) {
/*
* We were the last queued, we moved @lock back. @prev
* will now observe @lock and will complete its
* unlock()/unqueue().
*/
- break;
+ return NULL;
}
/*
@@ -70,15 +79,15 @@ osq_wait_next(struct optimistic_spin_queue *lock,
* wait for a new @node->next from its Step-C.
*/
if (node->next) {
+ struct optimistic_spin_node *next;
+
next = xchg(&node->next, NULL);
if (next)
- break;
+ return next;
}
- cpu_relax_lowlatency();
+ cpu_relax();
}
-
- return next;
}
bool osq_lock(struct optimistic_spin_queue *lock)
@@ -92,12 +101,31 @@ bool osq_lock(struct optimistic_spin_queue *lock)
node->next = NULL;
node->cpu = curr;
+ /*
+ * We need both ACQUIRE (pairs with corresponding RELEASE in
+ * unlock() uncontended, or fastpath) and RELEASE (to publish
+ * the node fields we just initialised) semantics when updating
+ * the lock tail.
+ */
old = atomic_xchg(&lock->tail, curr);
if (old == OSQ_UNLOCKED_VAL)
return true;
prev = decode_cpu(old);
node->prev = prev;
+
+ /*
+ * osq_lock() unqueue
+ *
+ * node->prev = prev osq_wait_next()
+ * WMB MB
+ * prev->next = node next->prev = prev // unqueue-C
+ *
+ * Here 'node->prev' and 'next->prev' are the same variable and we need
+ * to ensure these stores happen in-order to avoid corrupting the list.
+ */
+ smp_wmb();
+
WRITE_ONCE(prev->next, node);
/*
@@ -109,18 +137,17 @@ bool osq_lock(struct optimistic_spin_queue *lock)
* cmpxchg in an attempt to undo our queueing.
*/
- while (!READ_ONCE(node->locked)) {
- /*
- * If we need to reschedule bail... so we can block.
- */
- if (need_resched())
- goto unqueue;
-
- cpu_relax_lowlatency();
- }
- return true;
+ /*
+ * Wait to acquire the lock or cancellation. Note that need_resched()
+ * will come with an IPI, which will wake smp_cond_load_relaxed() if it
+ * is implemented with a monitor-wait. vcpu_is_preempted() relies on
+ * polling, be careful.
+ */
+ if (smp_cond_load_relaxed(&node->locked, VAL || need_resched() ||
+ vcpu_is_preempted(node_cpu(node->prev))))
+ return true;
-unqueue:
+ /* unqueue */
/*
* Step - A -- stabilize @prev
*
@@ -130,19 +157,23 @@ unqueue:
*/
for (;;) {
- if (prev->next == node &&
+ /*
+ * cpu_relax() below implies a compiler barrier which would
+ * prevent this comparison being optimized away.
+ */
+ if (data_race(prev->next) == node &&
cmpxchg(&prev->next, node, NULL) == node)
break;
/*
* We can only fail the cmpxchg() racing against an unlock(),
- * in which case we should observe @node->locked becomming
+ * in which case we should observe @node->locked becoming
* true.
*/
if (smp_load_acquire(&node->locked))
return true;
- cpu_relax_lowlatency();
+ cpu_relax();
/*
* Or we race against a concurrent unqueue()'s step-B, in which
@@ -158,7 +189,7 @@ unqueue:
* back to @prev.
*/
- next = osq_wait_next(lock, node, prev);
+ next = osq_wait_next(lock, node, prev->cpu);
if (!next)
return false;
@@ -184,7 +215,7 @@ void osq_unlock(struct optimistic_spin_queue *lock)
/*
* Fast path for the uncontended case.
*/
- if (likely(atomic_cmpxchg(&lock->tail, curr, OSQ_UNLOCKED_VAL) == curr))
+ if (atomic_try_cmpxchg_release(&lock->tail, &curr, OSQ_UNLOCKED_VAL))
return;
/*
@@ -197,7 +228,7 @@ void osq_unlock(struct optimistic_spin_queue *lock)
return;
}
- next = osq_wait_next(lock, node, NULL);
+ next = osq_wait_next(lock, node, OSQ_UNLOCKED_VAL);
if (next)
WRITE_ONCE(next->locked, 1);
}
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index 652a8ee8efe9..ef234469baac 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -1,165 +1,290 @@
+// SPDX-License-Identifier: GPL-2.0-only
#include <linux/atomic.h>
-#include <linux/rwsem.h>
#include <linux/percpu.h>
#include <linux/wait.h>
#include <linux/lockdep.h>
#include <linux/percpu-rwsem.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
+#include <linux/sched/task.h>
+#include <linux/sched/debug.h>
#include <linux/errno.h>
+#include <trace/events/lock.h>
-int __percpu_init_rwsem(struct percpu_rw_semaphore *brw,
- const char *name, struct lock_class_key *rwsem_key)
+int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
+ const char *name, struct lock_class_key *key)
{
- brw->fast_read_ctr = alloc_percpu(int);
- if (unlikely(!brw->fast_read_ctr))
+ sem->read_count = alloc_percpu(int);
+ if (unlikely(!sem->read_count))
return -ENOMEM;
- /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
- __init_rwsem(&brw->rw_sem, name, rwsem_key);
- atomic_set(&brw->write_ctr, 0);
- atomic_set(&brw->slow_read_ctr, 0);
- init_waitqueue_head(&brw->write_waitq);
+ rcu_sync_init(&sem->rss);
+ rcuwait_init(&sem->writer);
+ init_waitqueue_head(&sem->waiters);
+ atomic_set(&sem->block, 0);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ debug_check_no_locks_freed((void *)sem, sizeof(*sem));
+ lockdep_init_map(&sem->dep_map, name, key, 0);
+#endif
return 0;
}
+EXPORT_SYMBOL_GPL(__percpu_init_rwsem);
-void percpu_free_rwsem(struct percpu_rw_semaphore *brw)
+void percpu_free_rwsem(struct percpu_rw_semaphore *sem)
{
- free_percpu(brw->fast_read_ctr);
- brw->fast_read_ctr = NULL; /* catch use after free bugs */
+ /*
+ * XXX: temporary kludge. The error path in alloc_super()
+ * assumes that percpu_free_rwsem() is safe after kzalloc().
+ */
+ if (!sem->read_count)
+ return;
+
+ rcu_sync_dtor(&sem->rss);
+ free_percpu(sem->read_count);
+ sem->read_count = NULL; /* catch use after free bugs */
+}
+EXPORT_SYMBOL_GPL(percpu_free_rwsem);
+
+static bool __percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
+{
+ this_cpu_inc(*sem->read_count);
+
+ /*
+ * Due to having preemption disabled the decrement happens on
+ * the same CPU as the increment, avoiding the
+ * increment-on-one-CPU-and-decrement-on-another problem.
+ *
+ * If the reader misses the writer's assignment of sem->block, then the
+ * writer is guaranteed to see the reader's increment.
+ *
+ * Conversely, any readers that increment their sem->read_count after
+ * the writer looks are guaranteed to see the sem->block value, which
+ * in turn means that they are guaranteed to immediately decrement
+ * their sem->read_count, so that it doesn't matter that the writer
+ * missed them.
+ */
+
+ smp_mb(); /* A matches D */
+
+ /*
+ * If !sem->block the critical section starts here, matched by the
+ * release in percpu_up_write().
+ */
+ if (likely(!atomic_read_acquire(&sem->block)))
+ return true;
+
+ this_cpu_dec(*sem->read_count);
+
+ /* Prod writer to re-evaluate readers_active_check() */
+ rcuwait_wake_up(&sem->writer);
+
+ return false;
+}
+
+static inline bool __percpu_down_write_trylock(struct percpu_rw_semaphore *sem)
+{
+ if (atomic_read(&sem->block))
+ return false;
+
+ return atomic_xchg(&sem->block, 1) == 0;
+}
+
+static bool __percpu_rwsem_trylock(struct percpu_rw_semaphore *sem, bool reader)
+{
+ if (reader) {
+ bool ret;
+
+ preempt_disable();
+ ret = __percpu_down_read_trylock(sem);
+ preempt_enable();
+
+ return ret;
+ }
+ return __percpu_down_write_trylock(sem);
}
/*
- * This is the fast-path for down_read/up_read, it only needs to ensure
- * there is no pending writer (atomic_read(write_ctr) == 0) and inc/dec the
- * fast per-cpu counter. The writer uses synchronize_sched_expedited() to
- * serialize with the preempt-disabled section below.
- *
- * The nontrivial part is that we should guarantee acquire/release semantics
- * in case when
+ * The return value of wait_queue_entry::func means:
*
- * R_W: down_write() comes after up_read(), the writer should see all
- * changes done by the reader
- * or
- * W_R: down_read() comes after up_write(), the reader should see all
- * changes done by the writer
+ * <0 - error, wakeup is terminated and the error is returned
+ * 0 - no wakeup, a next waiter is tried
+ * >0 - woken, if EXCLUSIVE, counted towards @nr_exclusive.
*
- * If this helper fails the callers rely on the normal rw_semaphore and
- * atomic_dec_and_test(), so in this case we have the necessary barriers.
+ * We use EXCLUSIVE for both readers and writers to preserve FIFO order,
+ * and play games with the return value to allow waking multiple readers.
*
- * But if it succeeds we do not have any barriers, atomic_read(write_ctr) or
- * __this_cpu_add() below can be reordered with any LOAD/STORE done by the
- * reader inside the critical section. See the comments in down_write and
- * up_write below.
+ * Specifically, we wake readers until we've woken a single writer, or until a
+ * trylock fails.
*/
-static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val)
+static int percpu_rwsem_wake_function(struct wait_queue_entry *wq_entry,
+ unsigned int mode, int wake_flags,
+ void *key)
{
- bool success = false;
+ bool reader = wq_entry->flags & WQ_FLAG_CUSTOM;
+ struct percpu_rw_semaphore *sem = key;
+ struct task_struct *p;
- preempt_disable();
- if (likely(!atomic_read(&brw->write_ctr))) {
- __this_cpu_add(*brw->fast_read_ctr, val);
- success = true;
- }
- preempt_enable();
+ /* concurrent against percpu_down_write(), can get stolen */
+ if (!__percpu_rwsem_trylock(sem, reader))
+ return 1;
+
+ p = get_task_struct(wq_entry->private);
+ list_del_init(&wq_entry->entry);
+ smp_store_release(&wq_entry->private, NULL);
+
+ wake_up_process(p);
+ put_task_struct(p);
- return success;
+ return !reader; /* wake (readers until) 1 writer */
}
-/*
- * Like the normal down_read() this is not recursive, the writer can
- * come after the first percpu_down_read() and create the deadlock.
- *
- * Note: returns with lock_is_held(brw->rw_sem) == T for lockdep,
- * percpu_up_read() does rwsem_release(). This pairs with the usage
- * of ->rw_sem in percpu_down/up_write().
- */
-void percpu_down_read(struct percpu_rw_semaphore *brw)
+static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader,
+ bool freeze)
{
- might_sleep();
- if (likely(update_fast_ctr(brw, +1))) {
- rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_);
- return;
+ DEFINE_WAIT_FUNC(wq_entry, percpu_rwsem_wake_function);
+ bool wait;
+
+ spin_lock_irq(&sem->waiters.lock);
+ /*
+ * Serialize against the wakeup in percpu_up_write(), if we fail
+ * the trylock, the wakeup must see us on the list.
+ */
+ wait = !__percpu_rwsem_trylock(sem, reader);
+ if (wait) {
+ wq_entry.flags |= WQ_FLAG_EXCLUSIVE | reader * WQ_FLAG_CUSTOM;
+ __add_wait_queue_entry_tail(&sem->waiters, &wq_entry);
}
+ spin_unlock_irq(&sem->waiters.lock);
- down_read(&brw->rw_sem);
- atomic_inc(&brw->slow_read_ctr);
- /* avoid up_read()->rwsem_release() */
- __up_read(&brw->rw_sem);
+ while (wait) {
+ set_current_state(TASK_UNINTERRUPTIBLE |
+ (freeze ? TASK_FREEZABLE : 0));
+ if (!smp_load_acquire(&wq_entry.private))
+ break;
+ schedule();
+ }
+ __set_current_state(TASK_RUNNING);
}
-void percpu_up_read(struct percpu_rw_semaphore *brw)
+bool __sched __percpu_down_read(struct percpu_rw_semaphore *sem, bool try,
+ bool freeze)
{
- rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_);
+ if (__percpu_down_read_trylock(sem))
+ return true;
- if (likely(update_fast_ctr(brw, -1)))
- return;
+ if (try)
+ return false;
- /* false-positive is possible but harmless */
- if (atomic_dec_and_test(&brw->slow_read_ctr))
- wake_up_all(&brw->write_waitq);
-}
+ trace_contention_begin(sem, LCB_F_PERCPU | LCB_F_READ);
+ preempt_enable();
+ percpu_rwsem_wait(sem, /* .reader = */ true, freeze);
+ preempt_disable();
+ trace_contention_end(sem, 0);
-static int clear_fast_ctr(struct percpu_rw_semaphore *brw)
-{
- unsigned int sum = 0;
- int cpu;
+ return true;
+}
+EXPORT_SYMBOL_GPL(__percpu_down_read);
- for_each_possible_cpu(cpu) {
- sum += per_cpu(*brw->fast_read_ctr, cpu);
- per_cpu(*brw->fast_read_ctr, cpu) = 0;
- }
+#define per_cpu_sum(var) \
+({ \
+ TYPEOF_UNQUAL(var) __sum = 0; \
+ int cpu; \
+ compiletime_assert_atomic_type(__sum); \
+ for_each_possible_cpu(cpu) \
+ __sum += per_cpu(var, cpu); \
+ __sum; \
+})
- return sum;
+bool percpu_is_read_locked(struct percpu_rw_semaphore *sem)
+{
+ return per_cpu_sum(*sem->read_count) != 0 && !atomic_read(&sem->block);
}
+EXPORT_SYMBOL_GPL(percpu_is_read_locked);
/*
- * A writer increments ->write_ctr to force the readers to switch to the
- * slow mode, note the atomic_read() check in update_fast_ctr().
- *
- * After that the readers can only inc/dec the slow ->slow_read_ctr counter,
- * ->fast_read_ctr is stable. Once the writer moves its sum into the slow
- * counter it represents the number of active readers.
+ * Return true if the modular sum of the sem->read_count per-CPU variable is
+ * zero. If this sum is zero, then it is stable due to the fact that if any
+ * newly arriving readers increment a given counter, they will immediately
+ * decrement that same counter.
*
- * Finally the writer takes ->rw_sem for writing and blocks the new readers,
- * then waits until the slow counter becomes zero.
+ * Assumes sem->block is set.
*/
-void percpu_down_write(struct percpu_rw_semaphore *brw)
+static bool readers_active_check(struct percpu_rw_semaphore *sem)
{
- /* tell update_fast_ctr() there is a pending writer */
- atomic_inc(&brw->write_ctr);
+ if (per_cpu_sum(*sem->read_count) != 0)
+ return false;
+
/*
- * 1. Ensures that write_ctr != 0 is visible to any down_read/up_read
- * so that update_fast_ctr() can't succeed.
- *
- * 2. Ensures we see the result of every previous this_cpu_add() in
- * update_fast_ctr().
- *
- * 3. Ensures that if any reader has exited its critical section via
- * fast-path, it executes a full memory barrier before we return.
- * See R_W case in the comment above update_fast_ctr().
+ * If we observed the decrement; ensure we see the entire critical
+ * section.
*/
- synchronize_sched_expedited();
- /* exclude other writers, and block the new readers completely */
- down_write(&brw->rw_sem);
+ smp_mb(); /* C matches B */
- /* nobody can use fast_read_ctr, move its sum into slow_read_ctr */
- atomic_add(clear_fast_ctr(brw), &brw->slow_read_ctr);
+ return true;
+}
- /* wait for all readers to complete their percpu_up_read() */
- wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr));
+void __sched percpu_down_write(struct percpu_rw_semaphore *sem)
+{
+ bool contended = false;
+
+ might_sleep();
+ rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
+
+ /* Notify readers to take the slow path. */
+ rcu_sync_enter(&sem->rss);
+
+ /*
+ * Try set sem->block; this provides writer-writer exclusion.
+ * Having sem->block set makes new readers block.
+ */
+ if (!__percpu_down_write_trylock(sem)) {
+ trace_contention_begin(sem, LCB_F_PERCPU | LCB_F_WRITE);
+ percpu_rwsem_wait(sem, /* .reader = */ false, false);
+ contended = true;
+ }
+
+ /* smp_mb() implied by __percpu_down_write_trylock() on success -- D matches A */
+
+ /*
+ * If they don't see our store of sem->block, then we are guaranteed to
+ * see their sem->read_count increment, and therefore will wait for
+ * them.
+ */
+
+ /* Wait for all active readers to complete. */
+ rcuwait_wait_event(&sem->writer, readers_active_check(sem), TASK_UNINTERRUPTIBLE);
+ if (contended)
+ trace_contention_end(sem, 0);
}
+EXPORT_SYMBOL_GPL(percpu_down_write);
-void percpu_up_write(struct percpu_rw_semaphore *brw)
+void percpu_up_write(struct percpu_rw_semaphore *sem)
{
- /* release the lock, but the readers can't use the fast-path */
- up_write(&brw->rw_sem);
+ rwsem_release(&sem->dep_map, _RET_IP_);
+
+ /*
+ * Signal the writer is done, no fast path yet.
+ *
+ * One reason that we cannot just immediately flip to readers_fast is
+ * that new readers might fail to see the results of this writer's
+ * critical section.
+ *
+ * Therefore we force it through the slow path which guarantees an
+ * acquire and thereby guarantees the critical section's consistency.
+ */
+ atomic_set_release(&sem->block, 0);
+
+ /*
+ * Prod any pending reader/writer to make progress.
+ */
+ __wake_up(&sem->waiters, TASK_NORMAL, 1, sem);
+
/*
- * Insert the barrier before the next fast-path in down_read,
- * see W_R case in the comment above update_fast_ctr().
+ * Once this completes (at least one RCU-sched grace period hence) the
+ * reader fast path will be available again. Safe to use outside the
+ * exclusive write lock because its counting.
*/
- synchronize_sched_expedited();
- /* the last writer unblocks update_fast_ctr() */
- atomic_dec(&brw->write_ctr);
+ rcu_sync_exit(&sem->rss);
}
+EXPORT_SYMBOL_GPL(percpu_up_write);
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index f956ede7f90d..d2ef312a8611 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -1,15 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * Queue read/write lock
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
+ * Queued read/write locks
*
* (C) Copyright 2013-2014 Hewlett-Packard Development Company, L.P.
*
@@ -20,113 +11,82 @@
#include <linux/cpumask.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
-#include <asm/qrwlock.h>
+#include <linux/spinlock.h>
+#include <trace/events/lock.h>
/**
- * rspin_until_writer_unlock - inc reader count & spin until writer is gone
- * @lock : Pointer to queue rwlock structure
- * @writer: Current queue rwlock writer status byte
- *
- * In interrupt context or at the head of the queue, the reader will just
- * increment the reader count & wait until the writer releases the lock.
+ * queued_read_lock_slowpath - acquire read lock of a queued rwlock
+ * @lock: Pointer to queued rwlock structure
*/
-static __always_inline void
-rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts)
+void __lockfunc queued_read_lock_slowpath(struct qrwlock *lock)
{
- while ((cnts & _QW_WMASK) == _QW_LOCKED) {
- cpu_relax_lowlatency();
- cnts = smp_load_acquire((u32 *)&lock->cnts);
- }
-}
-
-/**
- * queue_read_lock_slowpath - acquire read lock of a queue rwlock
- * @lock: Pointer to queue rwlock structure
- */
-void queue_read_lock_slowpath(struct qrwlock *lock)
-{
- u32 cnts;
-
/*
* Readers come here when they cannot get the lock without waiting
*/
if (unlikely(in_interrupt())) {
/*
- * Readers in interrupt context will spin until the lock is
- * available without waiting in the queue.
+ * Readers in interrupt context will get the lock immediately
+ * if the writer is just waiting (not holding the lock yet),
+ * so spin with ACQUIRE semantics until the lock is available
+ * without waiting in the queue.
*/
- cnts = smp_load_acquire((u32 *)&lock->cnts);
- rspin_until_writer_unlock(lock, cnts);
+ atomic_cond_read_acquire(&lock->cnts, !(VAL & _QW_LOCKED));
return;
}
atomic_sub(_QR_BIAS, &lock->cnts);
+ trace_contention_begin(lock, LCB_F_SPIN | LCB_F_READ);
+
/*
* Put the reader into the wait queue
*/
- arch_spin_lock(&lock->lock);
+ arch_spin_lock(&lock->wait_lock);
+ atomic_add(_QR_BIAS, &lock->cnts);
/*
- * At the head of the wait queue now, wait until the writer state
- * goes to 0 and then try to increment the reader count and get
- * the lock. It is possible that an incoming writer may steal the
- * lock in the interim, so it is necessary to check the writer byte
- * to make sure that the write lock isn't taken.
+ * The ACQUIRE semantics of the following spinning code ensure
+ * that accesses can't leak upwards out of our subsequent critical
+ * section in the case that the lock is currently held for write.
*/
- while (atomic_read(&lock->cnts) & _QW_WMASK)
- cpu_relax_lowlatency();
-
- cnts = atomic_add_return(_QR_BIAS, &lock->cnts) - _QR_BIAS;
- rspin_until_writer_unlock(lock, cnts);
+ atomic_cond_read_acquire(&lock->cnts, !(VAL & _QW_LOCKED));
/*
* Signal the next one in queue to become queue head
*/
- arch_spin_unlock(&lock->lock);
+ arch_spin_unlock(&lock->wait_lock);
+
+ trace_contention_end(lock, 0);
}
-EXPORT_SYMBOL(queue_read_lock_slowpath);
+EXPORT_SYMBOL(queued_read_lock_slowpath);
/**
- * queue_write_lock_slowpath - acquire write lock of a queue rwlock
- * @lock : Pointer to queue rwlock structure
+ * queued_write_lock_slowpath - acquire write lock of a queued rwlock
+ * @lock : Pointer to queued rwlock structure
*/
-void queue_write_lock_slowpath(struct qrwlock *lock)
+void __lockfunc queued_write_lock_slowpath(struct qrwlock *lock)
{
- u32 cnts;
+ int cnts;
+
+ trace_contention_begin(lock, LCB_F_SPIN | LCB_F_WRITE);
/* Put the writer into the wait queue */
- arch_spin_lock(&lock->lock);
+ arch_spin_lock(&lock->wait_lock);
/* Try to acquire the lock directly if no reader is present */
- if (!atomic_read(&lock->cnts) &&
- (atomic_cmpxchg(&lock->cnts, 0, _QW_LOCKED) == 0))
+ if (!(cnts = atomic_read(&lock->cnts)) &&
+ atomic_try_cmpxchg_acquire(&lock->cnts, &cnts, _QW_LOCKED))
goto unlock;
- /*
- * Set the waiting flag to notify readers that a writer is pending,
- * or wait for a previous writer to go away.
- */
- for (;;) {
- cnts = atomic_read(&lock->cnts);
- if (!(cnts & _QW_WMASK) &&
- (atomic_cmpxchg(&lock->cnts, cnts,
- cnts | _QW_WAITING) == cnts))
- break;
-
- cpu_relax_lowlatency();
- }
-
- /* When no more readers, set the locked flag */
- for (;;) {
- cnts = atomic_read(&lock->cnts);
- if ((cnts == _QW_WAITING) &&
- (atomic_cmpxchg(&lock->cnts, _QW_WAITING,
- _QW_LOCKED) == _QW_WAITING))
- break;
+ /* Set the waiting flag to notify readers that a writer is pending */
+ atomic_or(_QW_WAITING, &lock->cnts);
- cpu_relax_lowlatency();
- }
+ /* When no more readers or writers, set the locked flag */
+ do {
+ cnts = atomic_cond_read_relaxed(&lock->cnts, VAL == _QW_WAITING);
+ } while (!atomic_try_cmpxchg_acquire(&lock->cnts, &cnts, _QW_LOCKED));
unlock:
- arch_spin_unlock(&lock->lock);
+ arch_spin_unlock(&lock->wait_lock);
+
+ trace_contention_end(lock, 0);
}
-EXPORT_SYMBOL(queue_write_lock_slowpath);
+EXPORT_SYMBOL(queued_write_lock_slowpath);
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
new file mode 100644
index 000000000000..af8d122bb649
--- /dev/null
+++ b/kernel/locking/qspinlock.c
@@ -0,0 +1,410 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Queued spinlock
+ *
+ * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
+ * (C) Copyright 2013-2014,2018 Red Hat, Inc.
+ * (C) Copyright 2015 Intel Corp.
+ * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP
+ *
+ * Authors: Waiman Long <longman@redhat.com>
+ * Peter Zijlstra <peterz@infradead.org>
+ */
+
+#ifndef _GEN_PV_LOCK_SLOWPATH
+
+#include <linux/smp.h>
+#include <linux/bug.h>
+#include <linux/cpumask.h>
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <linux/mutex.h>
+#include <linux/prefetch.h>
+#include <asm/byteorder.h>
+#include <asm/qspinlock.h>
+#include <trace/events/lock.h>
+
+/*
+ * Include queued spinlock definitions and statistics code
+ */
+#include "qspinlock.h"
+#include "qspinlock_stat.h"
+
+/*
+ * The basic principle of a queue-based spinlock can best be understood
+ * by studying a classic queue-based spinlock implementation called the
+ * MCS lock. A copy of the original MCS lock paper ("Algorithms for Scalable
+ * Synchronization on Shared-Memory Multiprocessors by Mellor-Crummey and
+ * Scott") is available at
+ *
+ * https://bugzilla.kernel.org/show_bug.cgi?id=206115
+ *
+ * This queued spinlock implementation is based on the MCS lock, however to
+ * make it fit the 4 bytes we assume spinlock_t to be, and preserve its
+ * existing API, we must modify it somehow.
+ *
+ * In particular; where the traditional MCS lock consists of a tail pointer
+ * (8 bytes) and needs the next pointer (another 8 bytes) of its own node to
+ * unlock the next pending (next->locked), we compress both these: {tail,
+ * next->locked} into a single u32 value.
+ *
+ * Since a spinlock disables recursion of its own context and there is a limit
+ * to the contexts that can nest; namely: task, softirq, hardirq, nmi. As there
+ * are at most 4 nesting levels, it can be encoded by a 2-bit number. Now
+ * we can encode the tail by combining the 2-bit nesting level with the cpu
+ * number. With one byte for the lock value and 3 bytes for the tail, only a
+ * 32-bit word is now needed. Even though we only need 1 bit for the lock,
+ * we extend it to a full byte to achieve better performance for architectures
+ * that support atomic byte write.
+ *
+ * We also change the first spinner to spin on the lock bit instead of its
+ * node; whereby avoiding the need to carry a node from lock to unlock, and
+ * preserving existing lock API. This also makes the unlock code simpler and
+ * faster.
+ *
+ * N.B. The current implementation only supports architectures that allow
+ * atomic operations on smaller 8-bit and 16-bit data types.
+ *
+ */
+
+#include "mcs_spinlock.h"
+
+/*
+ * Per-CPU queue node structures; we can never have more than 4 nested
+ * contexts: task, softirq, hardirq, nmi.
+ *
+ * Exactly fits one 64-byte cacheline on a 64-bit architecture.
+ *
+ * PV doubles the storage and uses the second cacheline for PV state.
+ */
+static DEFINE_PER_CPU_ALIGNED(struct qnode, qnodes[_Q_MAX_NODES]);
+
+/*
+ * Generate the native code for queued_spin_unlock_slowpath(); provide NOPs for
+ * all the PV callbacks.
+ */
+
+static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
+static __always_inline void __pv_wait_node(struct mcs_spinlock *node,
+ struct mcs_spinlock *prev) { }
+static __always_inline void __pv_kick_node(struct qspinlock *lock,
+ struct mcs_spinlock *node) { }
+static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock,
+ struct mcs_spinlock *node)
+ { return 0; }
+
+#define pv_enabled() false
+
+#define pv_init_node __pv_init_node
+#define pv_wait_node __pv_wait_node
+#define pv_kick_node __pv_kick_node
+#define pv_wait_head_or_lock __pv_wait_head_or_lock
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+#define queued_spin_lock_slowpath native_queued_spin_lock_slowpath
+#endif
+
+#endif /* _GEN_PV_LOCK_SLOWPATH */
+
+/**
+ * queued_spin_lock_slowpath - acquire the queued spinlock
+ * @lock: Pointer to queued spinlock structure
+ * @val: Current value of the queued spinlock 32-bit word
+ *
+ * (queue tail, pending bit, lock value)
+ *
+ * fast : slow : unlock
+ * : :
+ * uncontended (0,0,0) -:--> (0,0,1) ------------------------------:--> (*,*,0)
+ * : | ^--------.------. / :
+ * : v \ \ | :
+ * pending : (0,1,1) +--> (0,1,0) \ | :
+ * : | ^--' | | :
+ * : v | | :
+ * uncontended : (n,x,y) +--> (n,0,0) --' | :
+ * queue : | ^--' | :
+ * : v | :
+ * contended : (*,x,y) +--> (*,0,0) ---> (*,0,1) -' :
+ * queue : ^--' :
+ */
+void __lockfunc queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
+{
+ struct mcs_spinlock *prev, *next, *node;
+ u32 old, tail;
+ int idx;
+
+ BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));
+
+ if (pv_enabled())
+ goto pv_queue;
+
+ if (virt_spin_lock(lock))
+ return;
+
+ /*
+ * Wait for in-progress pending->locked hand-overs with a bounded
+ * number of spins so that we guarantee forward progress.
+ *
+ * 0,1,0 -> 0,0,1
+ */
+ if (val == _Q_PENDING_VAL) {
+ int cnt = _Q_PENDING_LOOPS;
+ val = atomic_cond_read_relaxed(&lock->val,
+ (VAL != _Q_PENDING_VAL) || !cnt--);
+ }
+
+ /*
+ * If we observe any contention; queue.
+ */
+ if (val & ~_Q_LOCKED_MASK)
+ goto queue;
+
+ /*
+ * trylock || pending
+ *
+ * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock
+ */
+ val = queued_fetch_set_pending_acquire(lock);
+
+ /*
+ * If we observe contention, there is a concurrent locker.
+ *
+ * Undo and queue; our setting of PENDING might have made the
+ * n,0,0 -> 0,0,0 transition fail and it will now be waiting
+ * on @next to become !NULL.
+ */
+ if (unlikely(val & ~_Q_LOCKED_MASK)) {
+
+ /* Undo PENDING if we set it. */
+ if (!(val & _Q_PENDING_MASK))
+ clear_pending(lock);
+
+ goto queue;
+ }
+
+ /*
+ * We're pending, wait for the owner to go away.
+ *
+ * 0,1,1 -> *,1,0
+ *
+ * this wait loop must be a load-acquire such that we match the
+ * store-release that clears the locked bit and create lock
+ * sequentiality; this is because not all
+ * clear_pending_set_locked() implementations imply full
+ * barriers.
+ */
+ if (val & _Q_LOCKED_MASK)
+ smp_cond_load_acquire(&lock->locked, !VAL);
+
+ /*
+ * take ownership and clear the pending bit.
+ *
+ * 0,1,0 -> 0,0,1
+ */
+ clear_pending_set_locked(lock);
+ lockevent_inc(lock_pending);
+ return;
+
+ /*
+ * End of pending bit optimistic spinning and beginning of MCS
+ * queuing.
+ */
+queue:
+ lockevent_inc(lock_slowpath);
+pv_queue:
+ node = this_cpu_ptr(&qnodes[0].mcs);
+ idx = node->count++;
+ tail = encode_tail(smp_processor_id(), idx);
+
+ trace_contention_begin(lock, LCB_F_SPIN);
+
+ /*
+ * 4 nodes are allocated based on the assumption that there will
+ * not be nested NMIs taking spinlocks. That may not be true in
+ * some architectures even though the chance of needing more than
+ * 4 nodes will still be extremely unlikely. When that happens,
+ * we fall back to spinning on the lock directly without using
+ * any MCS node. This is not the most elegant solution, but is
+ * simple enough.
+ */
+ if (unlikely(idx >= _Q_MAX_NODES)) {
+ lockevent_inc(lock_no_node);
+ while (!queued_spin_trylock(lock))
+ cpu_relax();
+ goto release;
+ }
+
+ node = grab_mcs_node(node, idx);
+
+ /*
+ * Keep counts of non-zero index values:
+ */
+ lockevent_cond_inc(lock_use_node2 + idx - 1, idx);
+
+ /*
+ * Ensure that we increment the head node->count before initialising
+ * the actual node. If the compiler is kind enough to reorder these
+ * stores, then an IRQ could overwrite our assignments.
+ */
+ barrier();
+
+ node->locked = 0;
+ node->next = NULL;
+ pv_init_node(node);
+
+ /*
+ * We touched a (possibly) cold cacheline in the per-cpu queue node;
+ * attempt the trylock once more in the hope someone let go while we
+ * weren't watching.
+ */
+ if (queued_spin_trylock(lock))
+ goto release;
+
+ /*
+ * Ensure that the initialisation of @node is complete before we
+ * publish the updated tail via xchg_tail() and potentially link
+ * @node into the waitqueue via WRITE_ONCE(prev->next, node) below.
+ */
+ smp_wmb();
+
+ /*
+ * Publish the updated tail.
+ * We have already touched the queueing cacheline; don't bother with
+ * pending stuff.
+ *
+ * p,*,* -> n,*,*
+ */
+ old = xchg_tail(lock, tail);
+ next = NULL;
+
+ /*
+ * if there was a previous node; link it and wait until reaching the
+ * head of the waitqueue.
+ */
+ if (old & _Q_TAIL_MASK) {
+ prev = decode_tail(old, qnodes);
+
+ /* Link @node into the waitqueue. */
+ WRITE_ONCE(prev->next, node);
+
+ pv_wait_node(node, prev);
+ arch_mcs_spin_lock_contended(&node->locked);
+
+ /*
+ * While waiting for the MCS lock, the next pointer may have
+ * been set by another lock waiter. We optimistically load
+ * the next pointer & prefetch the cacheline for writing
+ * to reduce latency in the upcoming MCS unlock operation.
+ */
+ next = READ_ONCE(node->next);
+ if (next)
+ prefetchw(next);
+ }
+
+ /*
+ * we're at the head of the waitqueue, wait for the owner & pending to
+ * go away.
+ *
+ * *,x,y -> *,0,0
+ *
+ * this wait loop must use a load-acquire such that we match the
+ * store-release that clears the locked bit and create lock
+ * sequentiality; this is because the set_locked() function below
+ * does not imply a full barrier.
+ *
+ * The PV pv_wait_head_or_lock function, if active, will acquire
+ * the lock and return a non-zero value. So we have to skip the
+ * atomic_cond_read_acquire() call. As the next PV queue head hasn't
+ * been designated yet, there is no way for the locked value to become
+ * _Q_SLOW_VAL. So both the set_locked() and the
+ * atomic_cmpxchg_relaxed() calls will be safe.
+ *
+ * If PV isn't active, 0 will be returned instead.
+ *
+ */
+ if ((val = pv_wait_head_or_lock(lock, node)))
+ goto locked;
+
+ val = atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK));
+
+locked:
+ /*
+ * claim the lock:
+ *
+ * n,0,0 -> 0,0,1 : lock, uncontended
+ * *,*,0 -> *,*,1 : lock, contended
+ *
+ * If the queue head is the only one in the queue (lock value == tail)
+ * and nobody is pending, clear the tail code and grab the lock.
+ * Otherwise, we only need to grab the lock.
+ */
+
+ /*
+ * In the PV case we might already have _Q_LOCKED_VAL set, because
+ * of lock stealing; therefore we must also allow:
+ *
+ * n,0,1 -> 0,0,1
+ *
+ * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the
+ * above wait condition, therefore any concurrent setting of
+ * PENDING will make the uncontended transition fail.
+ */
+ if ((val & _Q_TAIL_MASK) == tail) {
+ if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL))
+ goto release; /* No contention */
+ }
+
+ /*
+ * Either somebody is queued behind us or _Q_PENDING_VAL got set
+ * which will then detect the remaining tail and queue behind us
+ * ensuring we'll see a @next.
+ */
+ set_locked(lock);
+
+ /*
+ * contended path; wait for next if not observed yet, release.
+ */
+ if (!next)
+ next = smp_cond_load_relaxed(&node->next, (VAL));
+
+ arch_mcs_spin_unlock_contended(&next->locked);
+ pv_kick_node(lock, next);
+
+release:
+ trace_contention_end(lock, 0);
+
+ /*
+ * release the node
+ */
+ __this_cpu_dec(qnodes[0].mcs.count);
+}
+EXPORT_SYMBOL(queued_spin_lock_slowpath);
+
+/*
+ * Generate the paravirt code for queued_spin_unlock_slowpath().
+ */
+#if !defined(_GEN_PV_LOCK_SLOWPATH) && defined(CONFIG_PARAVIRT_SPINLOCKS)
+#define _GEN_PV_LOCK_SLOWPATH
+
+#undef pv_enabled
+#define pv_enabled() true
+
+#undef pv_init_node
+#undef pv_wait_node
+#undef pv_kick_node
+#undef pv_wait_head_or_lock
+
+#undef queued_spin_lock_slowpath
+#define queued_spin_lock_slowpath __pv_queued_spin_lock_slowpath
+
+#include "qspinlock_paravirt.h"
+#include "qspinlock.c"
+
+bool nopvspin;
+static __init int parse_nopvspin(char *arg)
+{
+ nopvspin = true;
+ return 0;
+}
+early_param("nopvspin", parse_nopvspin);
+#endif
diff --git a/kernel/locking/qspinlock.h b/kernel/locking/qspinlock.h
new file mode 100644
index 000000000000..d69958a844f7
--- /dev/null
+++ b/kernel/locking/qspinlock.h
@@ -0,0 +1,201 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Queued spinlock defines
+ *
+ * This file contains macro definitions and functions shared between different
+ * qspinlock slow path implementations.
+ */
+#ifndef __LINUX_QSPINLOCK_H
+#define __LINUX_QSPINLOCK_H
+
+#include <asm-generic/percpu.h>
+#include <linux/percpu-defs.h>
+#include <asm-generic/qspinlock.h>
+#include <asm-generic/mcs_spinlock.h>
+
+#define _Q_MAX_NODES 4
+
+/*
+ * The pending bit spinning loop count.
+ * This heuristic is used to limit the number of lockword accesses
+ * made by atomic_cond_read_relaxed when waiting for the lock to
+ * transition out of the "== _Q_PENDING_VAL" state. We don't spin
+ * indefinitely because there's no guarantee that we'll make forward
+ * progress.
+ */
+#ifndef _Q_PENDING_LOOPS
+#define _Q_PENDING_LOOPS 1
+#endif
+
+/*
+ * On 64-bit architectures, the mcs_spinlock structure will be 16 bytes in
+ * size and four of them will fit nicely in one 64-byte cacheline. For
+ * pvqspinlock, however, we need more space for extra data. To accommodate
+ * that, we insert two more long words to pad it up to 32 bytes. IOW, only
+ * two of them can fit in a cacheline in this case. That is OK as it is rare
+ * to have more than 2 levels of slowpath nesting in actual use. We don't
+ * want to penalize pvqspinlocks to optimize for a rare case in native
+ * qspinlocks.
+ */
+struct qnode {
+ struct mcs_spinlock mcs;
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+ long reserved[2];
+#endif
+};
+
+/*
+ * We must be able to distinguish between no-tail and the tail at 0:0,
+ * therefore increment the cpu number by one.
+ */
+
+static inline __pure u32 encode_tail(int cpu, int idx)
+{
+ u32 tail;
+
+ tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET;
+ tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */
+
+ return tail;
+}
+
+static inline __pure struct mcs_spinlock *decode_tail(u32 tail,
+ struct qnode __percpu *qnodes)
+{
+ int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;
+ int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;
+
+ return per_cpu_ptr(&qnodes[idx].mcs, cpu);
+}
+
+static inline __pure
+struct mcs_spinlock *grab_mcs_node(struct mcs_spinlock *base, int idx)
+{
+ return &((struct qnode *)base + idx)->mcs;
+}
+
+#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK)
+
+#if _Q_PENDING_BITS == 8
+/**
+ * clear_pending - clear the pending bit.
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,1,* -> *,0,*
+ */
+static __always_inline void clear_pending(struct qspinlock *lock)
+{
+ WRITE_ONCE(lock->pending, 0);
+}
+
+/**
+ * clear_pending_set_locked - take ownership and clear the pending bit.
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,1,0 -> *,0,1
+ *
+ * Lock stealing is not allowed if this function is used.
+ */
+static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
+{
+ WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL);
+}
+
+/*
+ * xchg_tail - Put in the new queue tail code word & retrieve previous one
+ * @lock : Pointer to queued spinlock structure
+ * @tail : The new queue tail code word
+ * Return: The previous queue tail code word
+ *
+ * xchg(lock, tail), which heads an address dependency
+ *
+ * p,*,* -> n,*,* ; prev = xchg(lock, node)
+ */
+static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
+{
+ /*
+ * We can use relaxed semantics since the caller ensures that the
+ * MCS node is properly initialized before updating the tail.
+ */
+ return (u32)xchg_relaxed(&lock->tail,
+ tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
+}
+
+#else /* _Q_PENDING_BITS == 8 */
+
+/**
+ * clear_pending - clear the pending bit.
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,1,* -> *,0,*
+ */
+static __always_inline void clear_pending(struct qspinlock *lock)
+{
+ atomic_andnot(_Q_PENDING_VAL, &lock->val);
+}
+
+/**
+ * clear_pending_set_locked - take ownership and clear the pending bit.
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,1,0 -> *,0,1
+ */
+static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
+{
+ atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val);
+}
+
+/**
+ * xchg_tail - Put in the new queue tail code word & retrieve previous one
+ * @lock : Pointer to queued spinlock structure
+ * @tail : The new queue tail code word
+ * Return: The previous queue tail code word
+ *
+ * xchg(lock, tail)
+ *
+ * p,*,* -> n,*,* ; prev = xchg(lock, node)
+ */
+static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
+{
+ u32 old, new;
+
+ old = atomic_read(&lock->val);
+ do {
+ new = (old & _Q_LOCKED_PENDING_MASK) | tail;
+ /*
+ * We can use relaxed semantics since the caller ensures that
+ * the MCS node is properly initialized before updating the
+ * tail.
+ */
+ } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new));
+
+ return old;
+}
+#endif /* _Q_PENDING_BITS == 8 */
+
+/**
+ * queued_fetch_set_pending_acquire - fetch the whole lock value and set pending
+ * @lock : Pointer to queued spinlock structure
+ * Return: The previous lock value
+ *
+ * *,*,* -> *,1,*
+ */
+#ifndef queued_fetch_set_pending_acquire
+static __always_inline u32 queued_fetch_set_pending_acquire(struct qspinlock *lock)
+{
+ return atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val);
+}
+#endif
+
+/**
+ * set_locked - Set the lock bit and own the lock
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,*,0 -> *,0,1
+ */
+static __always_inline void set_locked(struct qspinlock *lock)
+{
+ WRITE_ONCE(lock->locked, _Q_LOCKED_VAL);
+}
+
+#endif /* __LINUX_QSPINLOCK_H */
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
new file mode 100644
index 000000000000..dc1cb90e3644
--- /dev/null
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -0,0 +1,557 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _GEN_PV_LOCK_SLOWPATH
+#error "do not include this file"
+#endif
+
+#include <linux/hash.h>
+#include <linux/memblock.h>
+#include <linux/debug_locks.h>
+
+/*
+ * Implement paravirt qspinlocks; the general idea is to halt the vcpus instead
+ * of spinning them.
+ *
+ * This relies on the architecture to provide two paravirt hypercalls:
+ *
+ * pv_wait(u8 *ptr, u8 val) -- suspends the vcpu if *ptr == val
+ * pv_kick(cpu) -- wakes a suspended vcpu
+ *
+ * Using these we implement __pv_queued_spin_lock_slowpath() and
+ * __pv_queued_spin_unlock() to replace native_queued_spin_lock_slowpath() and
+ * native_queued_spin_unlock().
+ */
+
+#define _Q_SLOW_VAL (3U << _Q_LOCKED_OFFSET)
+
+/*
+ * Queue Node Adaptive Spinning
+ *
+ * A queue node vCPU will stop spinning if the vCPU in the previous node is
+ * not running. The one lock stealing attempt allowed at slowpath entry
+ * mitigates the slight slowdown for non-overcommitted guest with this
+ * aggressive wait-early mechanism.
+ *
+ * The status of the previous node will be checked at fixed interval
+ * controlled by PV_PREV_CHECK_MASK. This is to ensure that we won't
+ * pound on the cacheline of the previous node too heavily.
+ */
+#define PV_PREV_CHECK_MASK 0xff
+
+/*
+ * Queue node uses: VCPU_RUNNING & VCPU_HALTED.
+ * Queue head uses: VCPU_RUNNING & VCPU_HASHED.
+ */
+enum vcpu_state {
+ VCPU_RUNNING = 0,
+ VCPU_HALTED, /* Used only in pv_wait_node */
+ VCPU_HASHED, /* = pv_hash'ed + VCPU_HALTED */
+};
+
+struct pv_node {
+ struct mcs_spinlock mcs;
+ int cpu;
+ u8 state;
+};
+
+/*
+ * Hybrid PV queued/unfair lock
+ *
+ * By replacing the regular queued_spin_trylock() with the function below,
+ * it will be called once when a lock waiter enter the PV slowpath before
+ * being queued.
+ *
+ * The pending bit is set by the queue head vCPU of the MCS wait queue in
+ * pv_wait_head_or_lock() to signal that it is ready to spin on the lock.
+ * When that bit becomes visible to the incoming waiters, no lock stealing
+ * is allowed. The function will return immediately to make the waiters
+ * enter the MCS wait queue. So lock starvation shouldn't happen as long
+ * as the queued mode vCPUs are actively running to set the pending bit
+ * and hence disabling lock stealing.
+ *
+ * When the pending bit isn't set, the lock waiters will stay in the unfair
+ * mode spinning on the lock unless the MCS wait queue is empty. In this
+ * case, the lock waiters will enter the queued mode slowpath trying to
+ * become the queue head and set the pending bit.
+ *
+ * This hybrid PV queued/unfair lock combines the best attributes of a
+ * queued lock (no lock starvation) and an unfair lock (good performance
+ * on not heavily contended locks).
+ */
+#define queued_spin_trylock(l) pv_hybrid_queued_unfair_trylock(l)
+static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock)
+{
+ /*
+ * Stay in unfair lock mode as long as queued mode waiters are
+ * present in the MCS wait queue but the pending bit isn't set.
+ */
+ for (;;) {
+ int val = atomic_read(&lock->val);
+ u8 old = 0;
+
+ if (!(val & _Q_LOCKED_PENDING_MASK) &&
+ try_cmpxchg_acquire(&lock->locked, &old, _Q_LOCKED_VAL)) {
+ lockevent_inc(pv_lock_stealing);
+ return true;
+ }
+ if (!(val & _Q_TAIL_MASK) || (val & _Q_PENDING_MASK))
+ break;
+
+ cpu_relax();
+ }
+
+ return false;
+}
+
+/*
+ * The pending bit is used by the queue head vCPU to indicate that it
+ * is actively spinning on the lock and no lock stealing is allowed.
+ */
+#if _Q_PENDING_BITS == 8
+static __always_inline void set_pending(struct qspinlock *lock)
+{
+ WRITE_ONCE(lock->pending, 1);
+}
+
+/*
+ * The pending bit check in pv_queued_spin_steal_lock() isn't a memory
+ * barrier. Therefore, an atomic cmpxchg_acquire() is used to acquire the
+ * lock just to be sure that it will get it.
+ */
+static __always_inline bool trylock_clear_pending(struct qspinlock *lock)
+{
+ u16 old = _Q_PENDING_VAL;
+
+ return !READ_ONCE(lock->locked) &&
+ try_cmpxchg_acquire(&lock->locked_pending, &old, _Q_LOCKED_VAL);
+}
+#else /* _Q_PENDING_BITS == 8 */
+static __always_inline void set_pending(struct qspinlock *lock)
+{
+ atomic_or(_Q_PENDING_VAL, &lock->val);
+}
+
+static __always_inline bool trylock_clear_pending(struct qspinlock *lock)
+{
+ int old, new;
+
+ old = atomic_read(&lock->val);
+ do {
+ if (old & _Q_LOCKED_MASK)
+ return false;
+ /*
+ * Try to clear pending bit & set locked bit
+ */
+ new = (old & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL;
+ } while (!atomic_try_cmpxchg_acquire (&lock->val, &old, new));
+
+ return true;
+}
+#endif /* _Q_PENDING_BITS == 8 */
+
+/*
+ * Lock and MCS node addresses hash table for fast lookup
+ *
+ * Hashing is done on a per-cacheline basis to minimize the need to access
+ * more than one cacheline.
+ *
+ * Dynamically allocate a hash table big enough to hold at least 4X the
+ * number of possible cpus in the system. Allocation is done on page
+ * granularity. So the minimum number of hash buckets should be at least
+ * 256 (64-bit) or 512 (32-bit) to fully utilize a 4k page.
+ *
+ * Since we should not be holding locks from NMI context (very rare indeed) the
+ * max load factor is 0.75, which is around the point where open addressing
+ * breaks down.
+ *
+ */
+struct pv_hash_entry {
+ struct qspinlock *lock;
+ struct pv_node *node;
+};
+
+#define PV_HE_PER_LINE (SMP_CACHE_BYTES / sizeof(struct pv_hash_entry))
+#define PV_HE_MIN (PAGE_SIZE / sizeof(struct pv_hash_entry))
+
+static struct pv_hash_entry *pv_lock_hash;
+static unsigned int pv_lock_hash_bits __read_mostly;
+
+/*
+ * Allocate memory for the PV qspinlock hash buckets
+ *
+ * This function should be called from the paravirt spinlock initialization
+ * routine.
+ */
+void __init __pv_init_lock_hash(void)
+{
+ int pv_hash_size = ALIGN(4 * num_possible_cpus(), PV_HE_PER_LINE);
+
+ if (pv_hash_size < PV_HE_MIN)
+ pv_hash_size = PV_HE_MIN;
+
+ /*
+ * Allocate space from bootmem which should be page-size aligned
+ * and hence cacheline aligned.
+ */
+ pv_lock_hash = alloc_large_system_hash("PV qspinlock",
+ sizeof(struct pv_hash_entry),
+ pv_hash_size, 0,
+ HASH_EARLY | HASH_ZERO,
+ &pv_lock_hash_bits, NULL,
+ pv_hash_size, pv_hash_size);
+}
+
+#define for_each_hash_entry(he, offset, hash) \
+ for (hash &= ~(PV_HE_PER_LINE - 1), he = &pv_lock_hash[hash], offset = 0; \
+ offset < (1 << pv_lock_hash_bits); \
+ offset++, he = &pv_lock_hash[(hash + offset) & ((1 << pv_lock_hash_bits) - 1)])
+
+static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)
+{
+ unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
+ struct pv_hash_entry *he;
+ int hopcnt = 0;
+
+ for_each_hash_entry(he, offset, hash) {
+ struct qspinlock *old = NULL;
+ hopcnt++;
+ if (try_cmpxchg(&he->lock, &old, lock)) {
+ WRITE_ONCE(he->node, node);
+ lockevent_pv_hop(hopcnt);
+ return &he->lock;
+ }
+ }
+ /*
+ * Hard assume there is a free entry for us.
+ *
+ * This is guaranteed by ensuring every blocked lock only ever consumes
+ * a single entry, and since we only have 4 nesting levels per CPU
+ * and allocated 4*nr_possible_cpus(), this must be so.
+ *
+ * The single entry is guaranteed by having the lock owner unhash
+ * before it releases.
+ */
+ BUG();
+}
+
+static struct pv_node *pv_unhash(struct qspinlock *lock)
+{
+ unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
+ struct pv_hash_entry *he;
+ struct pv_node *node;
+
+ for_each_hash_entry(he, offset, hash) {
+ if (READ_ONCE(he->lock) == lock) {
+ node = READ_ONCE(he->node);
+ WRITE_ONCE(he->lock, NULL);
+ return node;
+ }
+ }
+ /*
+ * Hard assume we'll find an entry.
+ *
+ * This guarantees a limited lookup time and is itself guaranteed by
+ * having the lock owner do the unhash -- IFF the unlock sees the
+ * SLOW flag, there MUST be a hash entry.
+ */
+ BUG();
+}
+
+/*
+ * Return true if when it is time to check the previous node which is not
+ * in a running state.
+ */
+static inline bool
+pv_wait_early(struct pv_node *prev, int loop)
+{
+ if ((loop & PV_PREV_CHECK_MASK) != 0)
+ return false;
+
+ return READ_ONCE(prev->state) != VCPU_RUNNING;
+}
+
+/*
+ * Initialize the PV part of the mcs_spinlock node.
+ */
+static void pv_init_node(struct mcs_spinlock *node)
+{
+ struct pv_node *pn = (struct pv_node *)node;
+
+ BUILD_BUG_ON(sizeof(struct pv_node) > sizeof(struct qnode));
+
+ pn->cpu = smp_processor_id();
+ pn->state = VCPU_RUNNING;
+}
+
+/*
+ * Wait for node->locked to become true, halt the vcpu after a short spin.
+ * pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its
+ * behalf.
+ */
+static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
+{
+ struct pv_node *pn = (struct pv_node *)node;
+ struct pv_node *pp = (struct pv_node *)prev;
+ bool wait_early;
+ int loop;
+
+ for (;;) {
+ for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) {
+ if (READ_ONCE(node->locked))
+ return;
+ if (pv_wait_early(pp, loop)) {
+ wait_early = true;
+ break;
+ }
+ cpu_relax();
+ }
+
+ /*
+ * Order pn->state vs pn->locked thusly:
+ *
+ * [S] pn->state = VCPU_HALTED [S] next->locked = 1
+ * MB MB
+ * [L] pn->locked [RmW] pn->state = VCPU_HASHED
+ *
+ * Matches the cmpxchg() from pv_kick_node().
+ */
+ smp_store_mb(pn->state, VCPU_HALTED);
+
+ if (!READ_ONCE(node->locked)) {
+ lockevent_inc(pv_wait_node);
+ lockevent_cond_inc(pv_wait_early, wait_early);
+ pv_wait(&pn->state, VCPU_HALTED);
+ }
+
+ /*
+ * If pv_kick_node() changed us to VCPU_HASHED, retain that
+ * value so that pv_wait_head_or_lock() knows to not also try
+ * to hash this lock.
+ */
+ cmpxchg(&pn->state, VCPU_HALTED, VCPU_RUNNING);
+
+ /*
+ * If the locked flag is still not set after wakeup, it is a
+ * spurious wakeup and the vCPU should wait again. However,
+ * there is a pretty high overhead for CPU halting and kicking.
+ * So it is better to spin for a while in the hope that the
+ * MCS lock will be released soon.
+ */
+ lockevent_cond_inc(pv_spurious_wakeup,
+ !READ_ONCE(node->locked));
+ }
+
+ /*
+ * By now our node->locked should be 1 and our caller will not actually
+ * spin-wait for it. We do however rely on our caller to do a
+ * load-acquire for us.
+ */
+}
+
+/*
+ * Called after setting next->locked = 1 when we're the lock owner.
+ *
+ * Instead of waking the waiters stuck in pv_wait_node() advance their state
+ * such that they're waiting in pv_wait_head_or_lock(), this avoids a
+ * wake/sleep cycle.
+ */
+static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
+{
+ struct pv_node *pn = (struct pv_node *)node;
+ u8 old = VCPU_HALTED;
+ /*
+ * If the vCPU is indeed halted, advance its state to match that of
+ * pv_wait_node(). If OTOH this fails, the vCPU was running and will
+ * observe its next->locked value and advance itself.
+ *
+ * Matches with smp_store_mb() and cmpxchg() in pv_wait_node()
+ *
+ * The write to next->locked in arch_mcs_spin_unlock_contended()
+ * must be ordered before the read of pn->state in the cmpxchg()
+ * below for the code to work correctly. To guarantee full ordering
+ * irrespective of the success or failure of the cmpxchg(),
+ * a relaxed version with explicit barrier is used. The control
+ * dependency will order the reading of pn->state before any
+ * subsequent writes.
+ */
+ smp_mb__before_atomic();
+ if (!try_cmpxchg_relaxed(&pn->state, &old, VCPU_HASHED))
+ return;
+
+ /*
+ * Put the lock into the hash table and set the _Q_SLOW_VAL.
+ *
+ * As this is the same vCPU that will check the _Q_SLOW_VAL value and
+ * the hash table later on at unlock time, no atomic instruction is
+ * needed.
+ */
+ WRITE_ONCE(lock->locked, _Q_SLOW_VAL);
+ (void)pv_hash(lock, pn);
+}
+
+/*
+ * Wait for l->locked to become clear and acquire the lock;
+ * halt the vcpu after a short spin.
+ * __pv_queued_spin_unlock() will wake us.
+ *
+ * The current value of the lock will be returned for additional processing.
+ */
+static u32
+pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
+{
+ struct pv_node *pn = (struct pv_node *)node;
+ struct qspinlock **lp = NULL;
+ int waitcnt = 0;
+ int loop;
+
+ /*
+ * If pv_kick_node() already advanced our state, we don't need to
+ * insert ourselves into the hash table anymore.
+ */
+ if (READ_ONCE(pn->state) == VCPU_HASHED)
+ lp = (struct qspinlock **)1;
+
+ /*
+ * Tracking # of slowpath locking operations
+ */
+ lockevent_inc(lock_slowpath);
+
+ for (;; waitcnt++) {
+ /*
+ * Set correct vCPU state to be used by queue node wait-early
+ * mechanism.
+ */
+ WRITE_ONCE(pn->state, VCPU_RUNNING);
+
+ /*
+ * Set the pending bit in the active lock spinning loop to
+ * disable lock stealing before attempting to acquire the lock.
+ */
+ set_pending(lock);
+ for (loop = SPIN_THRESHOLD; loop; loop--) {
+ if (trylock_clear_pending(lock))
+ goto gotlock;
+ cpu_relax();
+ }
+ clear_pending(lock);
+
+
+ if (!lp) { /* ONCE */
+ lp = pv_hash(lock, pn);
+
+ /*
+ * We must hash before setting _Q_SLOW_VAL, such that
+ * when we observe _Q_SLOW_VAL in __pv_queued_spin_unlock()
+ * we'll be sure to be able to observe our hash entry.
+ *
+ * [S] <hash> [Rmw] l->locked == _Q_SLOW_VAL
+ * MB RMB
+ * [RmW] l->locked = _Q_SLOW_VAL [L] <unhash>
+ *
+ * Matches the smp_rmb() in __pv_queued_spin_unlock().
+ */
+ if (xchg(&lock->locked, _Q_SLOW_VAL) == 0) {
+ /*
+ * The lock was free and now we own the lock.
+ * Change the lock value back to _Q_LOCKED_VAL
+ * and unhash the table.
+ */
+ WRITE_ONCE(lock->locked, _Q_LOCKED_VAL);
+ WRITE_ONCE(*lp, NULL);
+ goto gotlock;
+ }
+ }
+ WRITE_ONCE(pn->state, VCPU_HASHED);
+ lockevent_inc(pv_wait_head);
+ lockevent_cond_inc(pv_wait_again, waitcnt);
+ pv_wait(&lock->locked, _Q_SLOW_VAL);
+
+ /*
+ * Because of lock stealing, the queue head vCPU may not be
+ * able to acquire the lock before it has to wait again.
+ */
+ }
+
+ /*
+ * The cmpxchg() or xchg() call before coming here provides the
+ * acquire semantics for locking. The dummy ORing of _Q_LOCKED_VAL
+ * here is to indicate to the compiler that the value will always
+ * be nozero to enable better code optimization.
+ */
+gotlock:
+ return (u32)(atomic_read(&lock->val) | _Q_LOCKED_VAL);
+}
+
+/*
+ * Include the architecture specific callee-save thunk of the
+ * __pv_queued_spin_unlock(). This thunk is put together with
+ * __pv_queued_spin_unlock() to make the callee-save thunk and the real unlock
+ * function close to each other sharing consecutive instruction cachelines.
+ * Alternatively, architecture specific version of __pv_queued_spin_unlock()
+ * can be defined.
+ */
+#include <asm/qspinlock_paravirt.h>
+
+/*
+ * PV versions of the unlock fastpath and slowpath functions to be used
+ * instead of queued_spin_unlock().
+ */
+__visible __lockfunc void
+__pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
+{
+ struct pv_node *node;
+
+ if (unlikely(locked != _Q_SLOW_VAL)) {
+ WARN(!debug_locks_silent,
+ "pvqspinlock: lock 0x%lx has corrupted value 0x%x!\n",
+ (unsigned long)lock, atomic_read(&lock->val));
+ return;
+ }
+
+ /*
+ * A failed cmpxchg doesn't provide any memory-ordering guarantees,
+ * so we need a barrier to order the read of the node data in
+ * pv_unhash *after* we've read the lock being _Q_SLOW_VAL.
+ *
+ * Matches the cmpxchg() in pv_wait_head_or_lock() setting _Q_SLOW_VAL.
+ */
+ smp_rmb();
+
+ /*
+ * Since the above failed to release, this must be the SLOW path.
+ * Therefore start by looking up the blocked node and unhashing it.
+ */
+ node = pv_unhash(lock);
+
+ /*
+ * Now that we have a reference to the (likely) blocked pv_node,
+ * release the lock.
+ */
+ smp_store_release(&lock->locked, 0);
+
+ /*
+ * At this point the memory pointed at by lock can be freed/reused,
+ * however we can still use the pv_node to kick the CPU.
+ * The other vCPU may not really be halted, but kicking an active
+ * vCPU is harmless other than the additional latency in completing
+ * the unlock.
+ */
+ lockevent_inc(pv_kick_unlock);
+ pv_kick(node->cpu);
+}
+
+#ifndef __pv_queued_spin_unlock
+__visible __lockfunc void __pv_queued_spin_unlock(struct qspinlock *lock)
+{
+ u8 locked = _Q_LOCKED_VAL;
+
+ /*
+ * We must not unlock if SLOW, because in that case we must first
+ * unhash. Otherwise it would be possible to have multiple @lock
+ * entries, which would be BAD.
+ */
+ if (try_cmpxchg_release(&lock->locked, &locked, 0))
+ return;
+
+ __pv_queued_spin_unlock_slowpath(lock, locked);
+}
+#endif /* __pv_queued_spin_unlock */
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h
new file mode 100644
index 000000000000..e625bb410aa2
--- /dev/null
+++ b/kernel/locking/qspinlock_stat.h
@@ -0,0 +1,142 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ *
+ * Authors: Waiman Long <longman@redhat.com>
+ */
+
+#include "lock_events.h"
+
+#ifdef CONFIG_LOCK_EVENT_COUNTS
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+/*
+ * Collect pvqspinlock locking event counts
+ */
+#include <linux/sched.h>
+#include <linux/sched/clock.h>
+#include <linux/fs.h>
+
+#define EVENT_COUNT(ev) lockevents[LOCKEVENT_ ## ev]
+
+/*
+ * PV specific per-cpu counter
+ */
+static DEFINE_PER_CPU(u64, pv_kick_time);
+
+/*
+ * Function to read and return the PV qspinlock counts.
+ *
+ * The following counters are handled specially:
+ * 1. pv_latency_kick
+ * Average kick latency (ns) = pv_latency_kick/pv_kick_unlock
+ * 2. pv_latency_wake
+ * Average wake latency (ns) = pv_latency_wake/pv_kick_wake
+ * 3. pv_hash_hops
+ * Average hops/hash = pv_hash_hops/pv_kick_unlock
+ */
+ssize_t lockevent_read(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ char buf[64];
+ int cpu, id, len;
+ u64 sum = 0, kicks = 0;
+
+ /*
+ * Get the counter ID stored in file->f_inode->i_private
+ */
+ id = (long)file_inode(file)->i_private;
+
+ if (id >= lockevent_num)
+ return -EBADF;
+
+ for_each_possible_cpu(cpu) {
+ sum += per_cpu(lockevents[id], cpu);
+ /*
+ * Need to sum additional counters for some of them
+ */
+ switch (id) {
+
+ case LOCKEVENT_pv_latency_kick:
+ case LOCKEVENT_pv_hash_hops:
+ kicks += per_cpu(EVENT_COUNT(pv_kick_unlock), cpu);
+ break;
+
+ case LOCKEVENT_pv_latency_wake:
+ kicks += per_cpu(EVENT_COUNT(pv_kick_wake), cpu);
+ break;
+ }
+ }
+
+ if (id == LOCKEVENT_pv_hash_hops) {
+ u64 frac = 0;
+
+ if (kicks) {
+ frac = 100ULL * do_div(sum, kicks);
+ frac = DIV_ROUND_CLOSEST_ULL(frac, kicks);
+ }
+
+ /*
+ * Return a X.XX decimal number
+ */
+ len = snprintf(buf, sizeof(buf) - 1, "%llu.%02llu\n",
+ sum, frac);
+ } else {
+ /*
+ * Round to the nearest ns
+ */
+ if ((id == LOCKEVENT_pv_latency_kick) ||
+ (id == LOCKEVENT_pv_latency_wake)) {
+ if (kicks)
+ sum = DIV_ROUND_CLOSEST_ULL(sum, kicks);
+ }
+ len = snprintf(buf, sizeof(buf) - 1, "%llu\n", sum);
+ }
+
+ return simple_read_from_buffer(user_buf, count, ppos, buf, len);
+}
+
+/*
+ * PV hash hop count
+ */
+static inline void lockevent_pv_hop(int hopcnt)
+{
+ this_cpu_add(EVENT_COUNT(pv_hash_hops), hopcnt);
+}
+
+/*
+ * Replacement function for pv_kick()
+ */
+static inline void __pv_kick(int cpu)
+{
+ u64 start = sched_clock();
+
+ per_cpu(pv_kick_time, cpu) = start;
+ pv_kick(cpu);
+ this_cpu_add(EVENT_COUNT(pv_latency_kick), sched_clock() - start);
+}
+
+/*
+ * Replacement function for pv_wait()
+ */
+static inline void __pv_wait(u8 *ptr, u8 val)
+{
+ u64 *pkick_time = this_cpu_ptr(&pv_kick_time);
+
+ *pkick_time = 0;
+ pv_wait(ptr, val);
+ if (*pkick_time) {
+ this_cpu_add(EVENT_COUNT(pv_latency_wake),
+ sched_clock() - *pkick_time);
+ lockevent_inc(pv_kick_wake);
+ }
+}
+
+#define pv_kick(c) __pv_kick(c)
+#define pv_wait(p, v) __pv_wait(p, v)
+
+#endif /* CONFIG_PARAVIRT_SPINLOCKS */
+
+#else /* CONFIG_LOCK_EVENT_COUNTS */
+
+static inline void lockevent_pv_hop(int hopcnt) { }
+
+#endif /* CONFIG_LOCK_EVENT_COUNTS */
diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c
deleted file mode 100644
index 62b6cee8ea7f..000000000000
--- a/kernel/locking/rtmutex-debug.c
+++ /dev/null
@@ -1,184 +0,0 @@
-/*
- * RT-Mutexes: blocking mutual exclusion locks with PI support
- *
- * started by Ingo Molnar and Thomas Gleixner:
- *
- * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
- * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
- *
- * This code is based on the rt.c implementation in the preempt-rt tree.
- * Portions of said code are
- *
- * Copyright (C) 2004 LynuxWorks, Inc., Igor Manyilov, Bill Huey
- * Copyright (C) 2006 Esben Nielsen
- * Copyright (C) 2006 Kihon Technologies Inc.,
- * Steven Rostedt <rostedt@goodmis.org>
- *
- * See rt.c in preempt-rt for proper credits and further information
- */
-#include <linux/sched.h>
-#include <linux/sched/rt.h>
-#include <linux/delay.h>
-#include <linux/export.h>
-#include <linux/spinlock.h>
-#include <linux/kallsyms.h>
-#include <linux/syscalls.h>
-#include <linux/interrupt.h>
-#include <linux/rbtree.h>
-#include <linux/fs.h>
-#include <linux/debug_locks.h>
-
-#include "rtmutex_common.h"
-
-static void printk_task(struct task_struct *p)
-{
- if (p)
- printk("%16s:%5d [%p, %3d]", p->comm, task_pid_nr(p), p, p->prio);
- else
- printk("<none>");
-}
-
-static void printk_lock(struct rt_mutex *lock, int print_owner)
-{
- if (lock->name)
- printk(" [%p] {%s}\n",
- lock, lock->name);
- else
- printk(" [%p] {%s:%d}\n",
- lock, lock->file, lock->line);
-
- if (print_owner && rt_mutex_owner(lock)) {
- printk(".. ->owner: %p\n", lock->owner);
- printk(".. held by: ");
- printk_task(rt_mutex_owner(lock));
- printk("\n");
- }
-}
-
-void rt_mutex_debug_task_free(struct task_struct *task)
-{
- DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters));
- DEBUG_LOCKS_WARN_ON(task->pi_blocked_on);
-}
-
-/*
- * We fill out the fields in the waiter to store the information about
- * the deadlock. We print when we return. act_waiter can be NULL in
- * case of a remove waiter operation.
- */
-void debug_rt_mutex_deadlock(enum rtmutex_chainwalk chwalk,
- struct rt_mutex_waiter *act_waiter,
- struct rt_mutex *lock)
-{
- struct task_struct *task;
-
- if (!debug_locks || chwalk == RT_MUTEX_FULL_CHAINWALK || !act_waiter)
- return;
-
- task = rt_mutex_owner(act_waiter->lock);
- if (task && task != current) {
- act_waiter->deadlock_task_pid = get_pid(task_pid(task));
- act_waiter->deadlock_lock = lock;
- }
-}
-
-void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
-{
- struct task_struct *task;
-
- if (!waiter->deadlock_lock || !debug_locks)
- return;
-
- rcu_read_lock();
- task = pid_task(waiter->deadlock_task_pid, PIDTYPE_PID);
- if (!task) {
- rcu_read_unlock();
- return;
- }
-
- if (!debug_locks_off()) {
- rcu_read_unlock();
- return;
- }
-
- printk("\n============================================\n");
- printk( "[ BUG: circular locking deadlock detected! ]\n");
- printk("%s\n", print_tainted());
- printk( "--------------------------------------------\n");
- printk("%s/%d is deadlocking current task %s/%d\n\n",
- task->comm, task_pid_nr(task),
- current->comm, task_pid_nr(current));
-
- printk("\n1) %s/%d is trying to acquire this lock:\n",
- current->comm, task_pid_nr(current));
- printk_lock(waiter->lock, 1);
-
- printk("\n2) %s/%d is blocked on this lock:\n",
- task->comm, task_pid_nr(task));
- printk_lock(waiter->deadlock_lock, 1);
-
- debug_show_held_locks(current);
- debug_show_held_locks(task);
-
- printk("\n%s/%d's [blocked] stackdump:\n\n",
- task->comm, task_pid_nr(task));
- show_stack(task, NULL);
- printk("\n%s/%d's [current] stackdump:\n\n",
- current->comm, task_pid_nr(current));
- dump_stack();
- debug_show_all_locks();
- rcu_read_unlock();
-
- printk("[ turning off deadlock detection."
- "Please report this trace. ]\n\n");
-}
-
-void debug_rt_mutex_lock(struct rt_mutex *lock)
-{
-}
-
-void debug_rt_mutex_unlock(struct rt_mutex *lock)
-{
- DEBUG_LOCKS_WARN_ON(rt_mutex_owner(lock) != current);
-}
-
-void
-debug_rt_mutex_proxy_lock(struct rt_mutex *lock, struct task_struct *powner)
-{
-}
-
-void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock)
-{
- DEBUG_LOCKS_WARN_ON(!rt_mutex_owner(lock));
-}
-
-void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
-{
- memset(waiter, 0x11, sizeof(*waiter));
- waiter->deadlock_task_pid = NULL;
-}
-
-void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
-{
- put_pid(waiter->deadlock_task_pid);
- memset(waiter, 0x22, sizeof(*waiter));
-}
-
-void debug_rt_mutex_init(struct rt_mutex *lock, const char *name)
-{
- /*
- * Make sure we are not reinitializing a held lock:
- */
- debug_check_no_locks_freed((void *)lock, sizeof(*lock));
- lock->name = name;
-}
-
-void
-rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task)
-{
-}
-
-void rt_mutex_deadlock_account_unlock(struct task_struct *task)
-{
-}
-
diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h
deleted file mode 100644
index d0519c3432b6..000000000000
--- a/kernel/locking/rtmutex-debug.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * RT-Mutexes: blocking mutual exclusion locks with PI support
- *
- * started by Ingo Molnar and Thomas Gleixner:
- *
- * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
- * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
- *
- * This file contains macros used solely by rtmutex.c. Debug version.
- */
-
-extern void
-rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task);
-extern void rt_mutex_deadlock_account_unlock(struct task_struct *task);
-extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
-extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter);
-extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name);
-extern void debug_rt_mutex_lock(struct rt_mutex *lock);
-extern void debug_rt_mutex_unlock(struct rt_mutex *lock);
-extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock,
- struct task_struct *powner);
-extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock);
-extern void debug_rt_mutex_deadlock(enum rtmutex_chainwalk chwalk,
- struct rt_mutex_waiter *waiter,
- struct rt_mutex *lock);
-extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter);
-# define debug_rt_mutex_reset_waiter(w) \
- do { (w)->deadlock_lock = NULL; } while (0)
-
-static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter,
- enum rtmutex_chainwalk walk)
-{
- return (waiter != NULL);
-}
-
-static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w)
-{
- debug_rt_mutex_print_deadlock(w);
-}
diff --git a/kernel/locking/rtmutex-tester.c b/kernel/locking/rtmutex-tester.c
deleted file mode 100644
index 1d96dd0d93c1..000000000000
--- a/kernel/locking/rtmutex-tester.c
+++ /dev/null
@@ -1,420 +0,0 @@
-/*
- * RT-Mutex-tester: scriptable tester for rt mutexes
- *
- * started by Thomas Gleixner:
- *
- * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
- *
- */
-#include <linux/device.h>
-#include <linux/kthread.h>
-#include <linux/export.h>
-#include <linux/sched.h>
-#include <linux/sched/rt.h>
-#include <linux/spinlock.h>
-#include <linux/timer.h>
-#include <linux/freezer.h>
-#include <linux/stat.h>
-
-#include "rtmutex.h"
-
-#define MAX_RT_TEST_THREADS 8
-#define MAX_RT_TEST_MUTEXES 8
-
-static spinlock_t rttest_lock;
-static atomic_t rttest_event;
-
-struct test_thread_data {
- int opcode;
- int opdata;
- int mutexes[MAX_RT_TEST_MUTEXES];
- int event;
- struct device dev;
-};
-
-static struct test_thread_data thread_data[MAX_RT_TEST_THREADS];
-static struct task_struct *threads[MAX_RT_TEST_THREADS];
-static struct rt_mutex mutexes[MAX_RT_TEST_MUTEXES];
-
-enum test_opcodes {
- RTTEST_NOP = 0,
- RTTEST_SCHEDOT, /* 1 Sched other, data = nice */
- RTTEST_SCHEDRT, /* 2 Sched fifo, data = prio */
- RTTEST_LOCK, /* 3 Lock uninterruptible, data = lockindex */
- RTTEST_LOCKNOWAIT, /* 4 Lock uninterruptible no wait in wakeup, data = lockindex */
- RTTEST_LOCKINT, /* 5 Lock interruptible, data = lockindex */
- RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */
- RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */
- RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */
- /* 9, 10 - reserved for BKL commemoration */
- RTTEST_SIGNAL = 11, /* 11 Signal other test thread, data = thread id */
- RTTEST_RESETEVENT = 98, /* 98 Reset event counter */
- RTTEST_RESET = 99, /* 99 Reset all pending operations */
-};
-
-static int handle_op(struct test_thread_data *td, int lockwakeup)
-{
- int i, id, ret = -EINVAL;
-
- switch(td->opcode) {
-
- case RTTEST_NOP:
- return 0;
-
- case RTTEST_LOCKCONT:
- td->mutexes[td->opdata] = 1;
- td->event = atomic_add_return(1, &rttest_event);
- return 0;
-
- case RTTEST_RESET:
- for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) {
- if (td->mutexes[i] == 4) {
- rt_mutex_unlock(&mutexes[i]);
- td->mutexes[i] = 0;
- }
- }
- return 0;
-
- case RTTEST_RESETEVENT:
- atomic_set(&rttest_event, 0);
- return 0;
-
- default:
- if (lockwakeup)
- return ret;
- }
-
- switch(td->opcode) {
-
- case RTTEST_LOCK:
- case RTTEST_LOCKNOWAIT:
- id = td->opdata;
- if (id < 0 || id >= MAX_RT_TEST_MUTEXES)
- return ret;
-
- td->mutexes[id] = 1;
- td->event = atomic_add_return(1, &rttest_event);
- rt_mutex_lock(&mutexes[id]);
- td->event = atomic_add_return(1, &rttest_event);
- td->mutexes[id] = 4;
- return 0;
-
- case RTTEST_LOCKINT:
- case RTTEST_LOCKINTNOWAIT:
- id = td->opdata;
- if (id < 0 || id >= MAX_RT_TEST_MUTEXES)
- return ret;
-
- td->mutexes[id] = 1;
- td->event = atomic_add_return(1, &rttest_event);
- ret = rt_mutex_lock_interruptible(&mutexes[id], 0);
- td->event = atomic_add_return(1, &rttest_event);
- td->mutexes[id] = ret ? 0 : 4;
- return ret ? -EINTR : 0;
-
- case RTTEST_UNLOCK:
- id = td->opdata;
- if (id < 0 || id >= MAX_RT_TEST_MUTEXES || td->mutexes[id] != 4)
- return ret;
-
- td->event = atomic_add_return(1, &rttest_event);
- rt_mutex_unlock(&mutexes[id]);
- td->event = atomic_add_return(1, &rttest_event);
- td->mutexes[id] = 0;
- return 0;
-
- default:
- break;
- }
- return ret;
-}
-
-/*
- * Schedule replacement for rtsem_down(). Only called for threads with
- * PF_MUTEX_TESTER set.
- *
- * This allows us to have finegrained control over the event flow.
- *
- */
-void schedule_rt_mutex_test(struct rt_mutex *mutex)
-{
- int tid, op, dat;
- struct test_thread_data *td;
-
- /* We have to lookup the task */
- for (tid = 0; tid < MAX_RT_TEST_THREADS; tid++) {
- if (threads[tid] == current)
- break;
- }
-
- BUG_ON(tid == MAX_RT_TEST_THREADS);
-
- td = &thread_data[tid];
-
- op = td->opcode;
- dat = td->opdata;
-
- switch (op) {
- case RTTEST_LOCK:
- case RTTEST_LOCKINT:
- case RTTEST_LOCKNOWAIT:
- case RTTEST_LOCKINTNOWAIT:
- if (mutex != &mutexes[dat])
- break;
-
- if (td->mutexes[dat] != 1)
- break;
-
- td->mutexes[dat] = 2;
- td->event = atomic_add_return(1, &rttest_event);
- break;
-
- default:
- break;
- }
-
- schedule();
-
-
- switch (op) {
- case RTTEST_LOCK:
- case RTTEST_LOCKINT:
- if (mutex != &mutexes[dat])
- return;
-
- if (td->mutexes[dat] != 2)
- return;
-
- td->mutexes[dat] = 3;
- td->event = atomic_add_return(1, &rttest_event);
- break;
-
- case RTTEST_LOCKNOWAIT:
- case RTTEST_LOCKINTNOWAIT:
- if (mutex != &mutexes[dat])
- return;
-
- if (td->mutexes[dat] != 2)
- return;
-
- td->mutexes[dat] = 1;
- td->event = atomic_add_return(1, &rttest_event);
- return;
-
- default:
- return;
- }
-
- td->opcode = 0;
-
- for (;;) {
- set_current_state(TASK_INTERRUPTIBLE);
-
- if (td->opcode > 0) {
- int ret;
-
- set_current_state(TASK_RUNNING);
- ret = handle_op(td, 1);
- set_current_state(TASK_INTERRUPTIBLE);
- if (td->opcode == RTTEST_LOCKCONT)
- break;
- td->opcode = ret;
- }
-
- /* Wait for the next command to be executed */
- schedule();
- }
-
- /* Restore previous command and data */
- td->opcode = op;
- td->opdata = dat;
-}
-
-static int test_func(void *data)
-{
- struct test_thread_data *td = data;
- int ret;
-
- current->flags |= PF_MUTEX_TESTER;
- set_freezable();
- allow_signal(SIGHUP);
-
- for(;;) {
-
- set_current_state(TASK_INTERRUPTIBLE);
-
- if (td->opcode > 0) {
- set_current_state(TASK_RUNNING);
- ret = handle_op(td, 0);
- set_current_state(TASK_INTERRUPTIBLE);
- td->opcode = ret;
- }
-
- /* Wait for the next command to be executed */
- schedule();
- try_to_freeze();
-
- if (signal_pending(current))
- flush_signals(current);
-
- if(kthread_should_stop())
- break;
- }
- return 0;
-}
-
-/**
- * sysfs_test_command - interface for test commands
- * @dev: thread reference
- * @buf: command for actual step
- * @count: length of buffer
- *
- * command syntax:
- *
- * opcode:data
- */
-static ssize_t sysfs_test_command(struct device *dev, struct device_attribute *attr,
- const char *buf, size_t count)
-{
- struct sched_param schedpar;
- struct test_thread_data *td;
- char cmdbuf[32];
- int op, dat, tid, ret;
-
- td = container_of(dev, struct test_thread_data, dev);
- tid = td->dev.id;
-
- /* strings from sysfs write are not 0 terminated! */
- if (count >= sizeof(cmdbuf))
- return -EINVAL;
-
- /* strip of \n: */
- if (buf[count-1] == '\n')
- count--;
- if (count < 1)
- return -EINVAL;
-
- memcpy(cmdbuf, buf, count);
- cmdbuf[count] = 0;
-
- if (sscanf(cmdbuf, "%d:%d", &op, &dat) != 2)
- return -EINVAL;
-
- switch (op) {
- case RTTEST_SCHEDOT:
- schedpar.sched_priority = 0;
- ret = sched_setscheduler(threads[tid], SCHED_NORMAL, &schedpar);
- if (ret)
- return ret;
- set_user_nice(current, 0);
- break;
-
- case RTTEST_SCHEDRT:
- schedpar.sched_priority = dat;
- ret = sched_setscheduler(threads[tid], SCHED_FIFO, &schedpar);
- if (ret)
- return ret;
- break;
-
- case RTTEST_SIGNAL:
- send_sig(SIGHUP, threads[tid], 0);
- break;
-
- default:
- if (td->opcode > 0)
- return -EBUSY;
- td->opdata = dat;
- td->opcode = op;
- wake_up_process(threads[tid]);
- }
-
- return count;
-}
-
-/**
- * sysfs_test_status - sysfs interface for rt tester
- * @dev: thread to query
- * @buf: char buffer to be filled with thread status info
- */
-static ssize_t sysfs_test_status(struct device *dev, struct device_attribute *attr,
- char *buf)
-{
- struct test_thread_data *td;
- struct task_struct *tsk;
- char *curr = buf;
- int i;
-
- td = container_of(dev, struct test_thread_data, dev);
- tsk = threads[td->dev.id];
-
- spin_lock(&rttest_lock);
-
- curr += sprintf(curr,
- "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, M:",
- td->opcode, td->event, tsk->state,
- (MAX_RT_PRIO - 1) - tsk->prio,
- (MAX_RT_PRIO - 1) - tsk->normal_prio,
- tsk->pi_blocked_on);
-
- for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--)
- curr += sprintf(curr, "%d", td->mutexes[i]);
-
- spin_unlock(&rttest_lock);
-
- curr += sprintf(curr, ", T: %p, R: %p\n", tsk,
- mutexes[td->dev.id].owner);
-
- return curr - buf;
-}
-
-static DEVICE_ATTR(status, S_IRUSR, sysfs_test_status, NULL);
-static DEVICE_ATTR(command, S_IWUSR, NULL, sysfs_test_command);
-
-static struct bus_type rttest_subsys = {
- .name = "rttest",
- .dev_name = "rttest",
-};
-
-static int init_test_thread(int id)
-{
- thread_data[id].dev.bus = &rttest_subsys;
- thread_data[id].dev.id = id;
-
- threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id);
- if (IS_ERR(threads[id]))
- return PTR_ERR(threads[id]);
-
- return device_register(&thread_data[id].dev);
-}
-
-static int init_rttest(void)
-{
- int ret, i;
-
- spin_lock_init(&rttest_lock);
-
- for (i = 0; i < MAX_RT_TEST_MUTEXES; i++)
- rt_mutex_init(&mutexes[i]);
-
- ret = subsys_system_register(&rttest_subsys, NULL);
- if (ret)
- return ret;
-
- for (i = 0; i < MAX_RT_TEST_THREADS; i++) {
- ret = init_test_thread(i);
- if (ret)
- break;
- ret = device_create_file(&thread_data[i].dev, &dev_attr_status);
- if (ret)
- break;
- ret = device_create_file(&thread_data[i].dev, &dev_attr_command);
- if (ret)
- break;
- }
-
- printk("Initializing RT-Tester: %s\n", ret ? "Failed" : "OK" );
-
- return ret;
-}
-
-device_initcall(init_rttest);
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index b025295f4966..c80902eacd79 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* RT-Mutexes: simple blocking mutual exclusion locks with PI support
*
@@ -7,17 +8,62 @@
* Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
* Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
* Copyright (C) 2006 Esben Nielsen
+ * Adaptive Spinlocks:
+ * Copyright (C) 2008 Novell, Inc., Gregory Haskins, Sven Dietrich,
+ * and Peter Morreale,
+ * Adaptive Spinlocks simplification:
+ * Copyright (C) 2008 Red Hat, Inc., Steven Rostedt <srostedt@redhat.com>
*
- * See Documentation/locking/rt-mutex-design.txt for details.
+ * See Documentation/locking/rt-mutex-design.rst for details.
*/
-#include <linux/spinlock.h>
-#include <linux/export.h>
#include <linux/sched.h>
-#include <linux/sched/rt.h>
+#include <linux/sched/debug.h>
#include <linux/sched/deadline.h>
-#include <linux/timer.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/rt.h>
+#include <linux/sched/wake_q.h>
+#include <linux/ww_mutex.h>
+
+#include <trace/events/lock.h>
#include "rtmutex_common.h"
+#include "lock_events.h"
+
+#ifndef WW_RT
+# define build_ww_mutex() (false)
+# define ww_container_of(rtm) NULL
+
+static inline int __ww_mutex_add_waiter(struct rt_mutex_waiter *waiter,
+ struct rt_mutex *lock,
+ struct ww_acquire_ctx *ww_ctx,
+ struct wake_q_head *wake_q)
+{
+ return 0;
+}
+
+static inline void __ww_mutex_check_waiters(struct rt_mutex *lock,
+ struct ww_acquire_ctx *ww_ctx,
+ struct wake_q_head *wake_q)
+{
+}
+
+static inline void ww_mutex_lock_acquired(struct ww_mutex *lock,
+ struct ww_acquire_ctx *ww_ctx)
+{
+}
+
+static inline int __ww_mutex_check_kill(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter,
+ struct ww_acquire_ctx *ww_ctx)
+{
+ return 0;
+}
+
+#else
+# define build_ww_mutex() (true)
+# define ww_container_of(rtm) container_of(rtm, struct ww_mutex, base)
+# include "ww_mutex.h"
+#endif
/*
* lock->owner state tracking:
@@ -46,42 +92,168 @@
* set this bit before looking at the lock.
*/
-static void
-rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner)
+static __always_inline struct task_struct *
+rt_mutex_owner_encode(struct rt_mutex_base *lock, struct task_struct *owner)
{
unsigned long val = (unsigned long)owner;
if (rt_mutex_has_waiters(lock))
val |= RT_MUTEX_HAS_WAITERS;
- lock->owner = (struct task_struct *)val;
+ return (struct task_struct *)val;
+}
+
+static __always_inline void
+rt_mutex_set_owner(struct rt_mutex_base *lock, struct task_struct *owner)
+{
+ /*
+ * lock->wait_lock is held but explicit acquire semantics are needed
+ * for a new lock owner so WRITE_ONCE is insufficient.
+ */
+ xchg_acquire(&lock->owner, rt_mutex_owner_encode(lock, owner));
+}
+
+static __always_inline void rt_mutex_clear_owner(struct rt_mutex_base *lock)
+{
+ /* lock->wait_lock is held so the unlock provides release semantics. */
+ WRITE_ONCE(lock->owner, rt_mutex_owner_encode(lock, NULL));
}
-static inline void clear_rt_mutex_waiters(struct rt_mutex *lock)
+static __always_inline void clear_rt_mutex_waiters(struct rt_mutex_base *lock)
{
lock->owner = (struct task_struct *)
((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
}
-static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
+static __always_inline void
+fixup_rt_mutex_waiters(struct rt_mutex_base *lock, bool acquire_lock)
+{
+ unsigned long owner, *p = (unsigned long *) &lock->owner;
+
+ if (rt_mutex_has_waiters(lock))
+ return;
+
+ /*
+ * The rbtree has no waiters enqueued, now make sure that the
+ * lock->owner still has the waiters bit set, otherwise the
+ * following can happen:
+ *
+ * CPU 0 CPU 1 CPU2
+ * l->owner=T1
+ * rt_mutex_lock(l)
+ * lock(l->lock)
+ * l->owner = T1 | HAS_WAITERS;
+ * enqueue(T2)
+ * boost()
+ * unlock(l->lock)
+ * block()
+ *
+ * rt_mutex_lock(l)
+ * lock(l->lock)
+ * l->owner = T1 | HAS_WAITERS;
+ * enqueue(T3)
+ * boost()
+ * unlock(l->lock)
+ * block()
+ * signal(->T2) signal(->T3)
+ * lock(l->lock)
+ * dequeue(T2)
+ * deboost()
+ * unlock(l->lock)
+ * lock(l->lock)
+ * dequeue(T3)
+ * ==> wait list is empty
+ * deboost()
+ * unlock(l->lock)
+ * lock(l->lock)
+ * fixup_rt_mutex_waiters()
+ * if (wait_list_empty(l) {
+ * l->owner = owner
+ * owner = l->owner & ~HAS_WAITERS;
+ * ==> l->owner = T1
+ * }
+ * lock(l->lock)
+ * rt_mutex_unlock(l) fixup_rt_mutex_waiters()
+ * if (wait_list_empty(l) {
+ * owner = l->owner & ~HAS_WAITERS;
+ * cmpxchg(l->owner, T1, NULL)
+ * ===> Success (l->owner = NULL)
+ *
+ * l->owner = owner
+ * ==> l->owner = T1
+ * }
+ *
+ * With the check for the waiter bit in place T3 on CPU2 will not
+ * overwrite. All tasks fiddling with the waiters bit are
+ * serialized by l->lock, so nothing else can modify the waiters
+ * bit. If the bit is set then nothing can change l->owner either
+ * so the simple RMW is safe. The cmpxchg() will simply fail if it
+ * happens in the middle of the RMW because the waiters bit is
+ * still set.
+ */
+ owner = READ_ONCE(*p);
+ if (owner & RT_MUTEX_HAS_WAITERS) {
+ /*
+ * See rt_mutex_set_owner() and rt_mutex_clear_owner() on
+ * why xchg_acquire() is used for updating owner for
+ * locking and WRITE_ONCE() for unlocking.
+ *
+ * WRITE_ONCE() would work for the acquire case too, but
+ * in case that the lock acquisition failed it might
+ * force other lockers into the slow path unnecessarily.
+ */
+ if (acquire_lock)
+ xchg_acquire(p, owner & ~RT_MUTEX_HAS_WAITERS);
+ else
+ WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS);
+ }
+}
+
+/*
+ * We can speed up the acquire/release, if there's no debugging state to be
+ * set up.
+ */
+#ifndef CONFIG_DEBUG_RT_MUTEXES
+static __always_inline bool rt_mutex_cmpxchg_acquire(struct rt_mutex_base *lock,
+ struct task_struct *old,
+ struct task_struct *new)
+{
+ return try_cmpxchg_acquire(&lock->owner, &old, new);
+}
+
+static __always_inline bool rt_mutex_try_acquire(struct rt_mutex_base *lock)
+{
+ return rt_mutex_cmpxchg_acquire(lock, NULL, current);
+}
+
+static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex_base *lock,
+ struct task_struct *old,
+ struct task_struct *new)
{
- if (!rt_mutex_has_waiters(lock))
- clear_rt_mutex_waiters(lock);
+ return try_cmpxchg_release(&lock->owner, &old, new);
}
/*
- * We can speed up the acquire/release, if the architecture
- * supports cmpxchg and if there's no debugging state to be set up
+ * Callers must hold the ->wait_lock -- which is the whole purpose as we force
+ * all future threads that attempt to [Rmw] the lock to the slowpath. As such
+ * relaxed semantics suffice.
*/
-#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES)
-# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c)
-static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
+static __always_inline void mark_rt_mutex_waiters(struct rt_mutex_base *lock)
{
- unsigned long owner, *p = (unsigned long *) &lock->owner;
+ unsigned long *p = (unsigned long *) &lock->owner;
+ unsigned long owner, new;
+ owner = READ_ONCE(*p);
do {
- owner = *p;
- } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
+ new = owner | RT_MUTEX_HAS_WAITERS;
+ } while (!try_cmpxchg_relaxed(p, &owner, new));
+
+ /*
+ * The cmpxchg loop above is relaxed to avoid back-to-back ACQUIRE
+ * operations in the event of contention. Ensure the successful
+ * cmpxchg is visible.
+ */
+ smp_mb__after_atomic();
}
/*
@@ -90,13 +262,14 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
* 2) Drop lock->wait_lock
* 3) Try to unlock the lock with cmpxchg
*/
-static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
+static __always_inline bool unlock_rt_mutex_safe(struct rt_mutex_base *lock,
+ unsigned long flags)
__releases(lock->wait_lock)
{
struct task_struct *owner = rt_mutex_owner(lock);
clear_rt_mutex_waiters(lock);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
/*
* If a new waiter comes in between the unlock and the cmpxchg
* we have two situations:
@@ -121,12 +294,40 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
* lock(wait_lock);
* acquire(lock);
*/
- return rt_mutex_cmpxchg(lock, owner, NULL);
+ return rt_mutex_cmpxchg_release(lock, owner, NULL);
}
#else
-# define rt_mutex_cmpxchg(l,c,n) (0)
-static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
+static __always_inline bool rt_mutex_cmpxchg_acquire(struct rt_mutex_base *lock,
+ struct task_struct *old,
+ struct task_struct *new)
+{
+ return false;
+
+}
+
+static int __sched rt_mutex_slowtrylock(struct rt_mutex_base *lock);
+
+static __always_inline bool rt_mutex_try_acquire(struct rt_mutex_base *lock)
+{
+ /*
+ * With debug enabled rt_mutex_cmpxchg trylock() will always fail.
+ *
+ * Avoid unconditionally taking the slow path by using
+ * rt_mutex_slow_trylock() which is covered by the debug code and can
+ * acquire a non-contended rtmutex.
+ */
+ return rt_mutex_slowtrylock(lock);
+}
+
+static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex_base *lock,
+ struct task_struct *old,
+ struct task_struct *new)
+{
+ return false;
+}
+
+static __always_inline void mark_rt_mutex_waiters(struct rt_mutex_base *lock)
{
lock->owner = (struct task_struct *)
((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
@@ -135,18 +336,63 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
/*
* Simple slow path only version: lock->owner is protected by lock->wait_lock.
*/
-static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
+static __always_inline bool unlock_rt_mutex_safe(struct rt_mutex_base *lock,
+ unsigned long flags)
__releases(lock->wait_lock)
{
lock->owner = NULL;
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
return true;
}
#endif
-static inline int
-rt_mutex_waiter_less(struct rt_mutex_waiter *left,
- struct rt_mutex_waiter *right)
+static __always_inline int __waiter_prio(struct task_struct *task)
+{
+ int prio = task->prio;
+
+ if (!rt_or_dl_prio(prio))
+ return DEFAULT_PRIO;
+
+ return prio;
+}
+
+/*
+ * Update the waiter->tree copy of the sort keys.
+ */
+static __always_inline void
+waiter_update_prio(struct rt_mutex_waiter *waiter, struct task_struct *task)
+{
+ lockdep_assert_held(&waiter->lock->wait_lock);
+ lockdep_assert(RB_EMPTY_NODE(&waiter->tree.entry));
+
+ waiter->tree.prio = __waiter_prio(task);
+ waiter->tree.deadline = task->dl.deadline;
+}
+
+/*
+ * Update the waiter->pi_tree copy of the sort keys (from the tree copy).
+ */
+static __always_inline void
+waiter_clone_prio(struct rt_mutex_waiter *waiter, struct task_struct *task)
+{
+ lockdep_assert_held(&waiter->lock->wait_lock);
+ lockdep_assert_held(&task->pi_lock);
+ lockdep_assert(RB_EMPTY_NODE(&waiter->pi_tree.entry));
+
+ waiter->pi_tree.prio = waiter->tree.prio;
+ waiter->pi_tree.deadline = waiter->tree.deadline;
+}
+
+/*
+ * Only use with rt_waiter_node_{less,equal}()
+ */
+#define task_to_waiter_node(p) \
+ &(struct rt_waiter_node){ .prio = __waiter_prio(p), .deadline = (p)->dl.deadline }
+#define task_to_waiter(p) \
+ &(struct rt_mutex_waiter){ .tree = *task_to_waiter_node(p) }
+
+static __always_inline int rt_waiter_node_less(struct rt_waiter_node *left,
+ struct rt_waiter_node *right)
{
if (left->prio < right->prio)
return 1;
@@ -158,155 +404,175 @@ rt_mutex_waiter_less(struct rt_mutex_waiter *left,
* then right waiter has a dl_prio() too.
*/
if (dl_prio(left->prio))
- return (left->task->dl.deadline < right->task->dl.deadline);
+ return dl_time_before(left->deadline, right->deadline);
return 0;
}
-static void
-rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
+static __always_inline int rt_waiter_node_equal(struct rt_waiter_node *left,
+ struct rt_waiter_node *right)
{
- struct rb_node **link = &lock->waiters.rb_node;
- struct rb_node *parent = NULL;
- struct rt_mutex_waiter *entry;
- int leftmost = 1;
-
- while (*link) {
- parent = *link;
- entry = rb_entry(parent, struct rt_mutex_waiter, tree_entry);
- if (rt_mutex_waiter_less(waiter, entry)) {
- link = &parent->rb_left;
- } else {
- link = &parent->rb_right;
- leftmost = 0;
- }
- }
+ if (left->prio != right->prio)
+ return 0;
- if (leftmost)
- lock->waiters_leftmost = &waiter->tree_entry;
+ /*
+ * If both waiters have dl_prio(), we check the deadlines of the
+ * associated tasks.
+ * If left waiter has a dl_prio(), and we didn't return 0 above,
+ * then right waiter has a dl_prio() too.
+ */
+ if (dl_prio(left->prio))
+ return left->deadline == right->deadline;
- rb_link_node(&waiter->tree_entry, parent, link);
- rb_insert_color(&waiter->tree_entry, &lock->waiters);
+ return 1;
}
-static void
-rt_mutex_dequeue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
+static inline bool rt_mutex_steal(struct rt_mutex_waiter *waiter,
+ struct rt_mutex_waiter *top_waiter)
{
- if (RB_EMPTY_NODE(&waiter->tree_entry))
- return;
+ if (rt_waiter_node_less(&waiter->tree, &top_waiter->tree))
+ return true;
- if (lock->waiters_leftmost == &waiter->tree_entry)
- lock->waiters_leftmost = rb_next(&waiter->tree_entry);
+#ifdef RT_MUTEX_BUILD_SPINLOCKS
+ /*
+ * Note that RT tasks are excluded from same priority (lateral)
+ * steals to prevent the introduction of an unbounded latency.
+ */
+ if (rt_or_dl_prio(waiter->tree.prio))
+ return false;
- rb_erase(&waiter->tree_entry, &lock->waiters);
- RB_CLEAR_NODE(&waiter->tree_entry);
+ return rt_waiter_node_equal(&waiter->tree, &top_waiter->tree);
+#else
+ return false;
+#endif
}
-static void
-rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
+#define __node_2_waiter(node) \
+ rb_entry((node), struct rt_mutex_waiter, tree.entry)
+
+static __always_inline bool __waiter_less(struct rb_node *a, const struct rb_node *b)
{
- struct rb_node **link = &task->pi_waiters.rb_node;
- struct rb_node *parent = NULL;
- struct rt_mutex_waiter *entry;
- int leftmost = 1;
-
- while (*link) {
- parent = *link;
- entry = rb_entry(parent, struct rt_mutex_waiter, pi_tree_entry);
- if (rt_mutex_waiter_less(waiter, entry)) {
- link = &parent->rb_left;
- } else {
- link = &parent->rb_right;
- leftmost = 0;
- }
+ struct rt_mutex_waiter *aw = __node_2_waiter(a);
+ struct rt_mutex_waiter *bw = __node_2_waiter(b);
+
+ if (rt_waiter_node_less(&aw->tree, &bw->tree))
+ return 1;
+
+ if (!build_ww_mutex())
+ return 0;
+
+ if (rt_waiter_node_less(&bw->tree, &aw->tree))
+ return 0;
+
+ /* NOTE: relies on waiter->ww_ctx being set before insertion */
+ if (aw->ww_ctx) {
+ if (!bw->ww_ctx)
+ return 1;
+
+ return (signed long)(aw->ww_ctx->stamp -
+ bw->ww_ctx->stamp) < 0;
}
- if (leftmost)
- task->pi_waiters_leftmost = &waiter->pi_tree_entry;
+ return 0;
+}
+
+static __always_inline void
+rt_mutex_enqueue(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter)
+{
+ lockdep_assert_held(&lock->wait_lock);
- rb_link_node(&waiter->pi_tree_entry, parent, link);
- rb_insert_color(&waiter->pi_tree_entry, &task->pi_waiters);
+ rb_add_cached(&waiter->tree.entry, &lock->waiters, __waiter_less);
}
-static void
-rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
+static __always_inline void
+rt_mutex_dequeue(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter)
{
- if (RB_EMPTY_NODE(&waiter->pi_tree_entry))
+ lockdep_assert_held(&lock->wait_lock);
+
+ if (RB_EMPTY_NODE(&waiter->tree.entry))
return;
- if (task->pi_waiters_leftmost == &waiter->pi_tree_entry)
- task->pi_waiters_leftmost = rb_next(&waiter->pi_tree_entry);
+ rb_erase_cached(&waiter->tree.entry, &lock->waiters);
+ RB_CLEAR_NODE(&waiter->tree.entry);
+}
+
+#define __node_2_rt_node(node) \
+ rb_entry((node), struct rt_waiter_node, entry)
- rb_erase(&waiter->pi_tree_entry, &task->pi_waiters);
- RB_CLEAR_NODE(&waiter->pi_tree_entry);
+static __always_inline bool __pi_waiter_less(struct rb_node *a, const struct rb_node *b)
+{
+ return rt_waiter_node_less(__node_2_rt_node(a), __node_2_rt_node(b));
}
-/*
- * Calculate task priority from the waiter tree priority
- *
- * Return task->normal_prio when the waiter tree is empty or when
- * the waiter is not allowed to do priority boosting
- */
-int rt_mutex_getprio(struct task_struct *task)
+static __always_inline void
+rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
{
- if (likely(!task_has_pi_waiters(task)))
- return task->normal_prio;
+ lockdep_assert_held(&task->pi_lock);
- return min(task_top_pi_waiter(task)->prio,
- task->normal_prio);
+ rb_add_cached(&waiter->pi_tree.entry, &task->pi_waiters, __pi_waiter_less);
}
-struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
+static __always_inline void
+rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
{
- if (likely(!task_has_pi_waiters(task)))
- return NULL;
+ lockdep_assert_held(&task->pi_lock);
+
+ if (RB_EMPTY_NODE(&waiter->pi_tree.entry))
+ return;
- return task_top_pi_waiter(task)->task;
+ rb_erase_cached(&waiter->pi_tree.entry, &task->pi_waiters);
+ RB_CLEAR_NODE(&waiter->pi_tree.entry);
}
-/*
- * Called by sched_setscheduler() to get the priority which will be
- * effective after the change.
- */
-int rt_mutex_get_effective_prio(struct task_struct *task, int newprio)
+static __always_inline void rt_mutex_adjust_prio(struct rt_mutex_base *lock,
+ struct task_struct *p)
{
- if (!task_has_pi_waiters(task))
- return newprio;
+ struct task_struct *pi_task = NULL;
+
+ lockdep_assert_held(&lock->wait_lock);
+ lockdep_assert(rt_mutex_owner(lock) == p);
+ lockdep_assert_held(&p->pi_lock);
+
+ if (task_has_pi_waiters(p))
+ pi_task = task_top_pi_waiter(p)->task;
- if (task_top_pi_waiter(task)->task->prio <= newprio)
- return task_top_pi_waiter(task)->task->prio;
- return newprio;
+ rt_mutex_setprio(p, pi_task);
}
-/*
- * Adjust the priority of a task, after its pi_waiters got modified.
- *
- * This can be both boosting and unboosting. task->pi_lock must be held.
- */
-static void __rt_mutex_adjust_prio(struct task_struct *task)
+/* RT mutex specific wake_q wrappers */
+static __always_inline void rt_mutex_wake_q_add_task(struct rt_wake_q_head *wqh,
+ struct task_struct *task,
+ unsigned int wake_state)
{
- int prio = rt_mutex_getprio(task);
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && wake_state == TASK_RTLOCK_WAIT) {
+ if (IS_ENABLED(CONFIG_PROVE_LOCKING))
+ WARN_ON_ONCE(wqh->rtlock_task);
+ get_task_struct(task);
+ wqh->rtlock_task = task;
+ } else {
+ wake_q_add(&wqh->head, task);
+ }
+}
- if (task->prio != prio || dl_prio(prio))
- rt_mutex_setprio(task, prio);
+static __always_inline void rt_mutex_wake_q_add(struct rt_wake_q_head *wqh,
+ struct rt_mutex_waiter *w)
+{
+ rt_mutex_wake_q_add_task(wqh, w->task, w->wake_state);
}
-/*
- * Adjust task priority (undo boosting). Called from the exit path of
- * rt_mutex_slowunlock() and rt_mutex_slowlock().
- *
- * (Note: We do this outside of the protection of lock->wait_lock to
- * allow the lock to be taken while or before we readjust the priority
- * of task. We do not use the spin_xx_mutex() variants here as we are
- * outside of the debug path.)
- */
-static void rt_mutex_adjust_prio(struct task_struct *task)
+static __always_inline void rt_mutex_wake_up_q(struct rt_wake_q_head *wqh)
{
- unsigned long flags;
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && wqh->rtlock_task) {
+ wake_up_state(wqh->rtlock_task, TASK_RTLOCK_WAIT);
+ put_task_struct(wqh->rtlock_task);
+ wqh->rtlock_task = NULL;
+ }
+
+ if (!wake_q_empty(&wqh->head))
+ wake_up_q(&wqh->head);
- raw_spin_lock_irqsave(&task->pi_lock, flags);
- __rt_mutex_adjust_prio(task);
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ /* Pairs with preempt_disable() in mark_wakeup_next_waiter() */
+ preempt_enable();
}
/*
@@ -322,25 +588,16 @@ static void rt_mutex_adjust_prio(struct task_struct *task)
* deadlock detection is disabled independent of the detect argument
* and the config settings.
*/
-static bool rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter,
- enum rtmutex_chainwalk chwalk)
+static __always_inline bool
+rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter,
+ enum rtmutex_chainwalk chwalk)
{
- /*
- * This is just a wrapper function for the following call,
- * because debug_rt_mutex_detect_deadlock() smells like a magic
- * debug feature and I wanted to keep the cond function in the
- * main source file along with the comments instead of having
- * two of the same in the headers.
- */
- return debug_rt_mutex_detect_deadlock(waiter, chwalk);
+ if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES))
+ return waiter != NULL;
+ return chwalk == RT_MUTEX_FULL_CHAINWALK;
}
-/*
- * Max number of times we'll walk the boosting chain:
- */
-int max_lock_depth = 1024;
-
-static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
+static __always_inline struct rt_mutex_base *task_blocked_on_lock(struct task_struct *p)
{
return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL;
}
@@ -369,9 +626,14 @@ static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
* Chain walk basics and protection scope
*
* [R] refcount on task
- * [P] task->pi_lock held
+ * [Pn] task->pi_lock held
* [L] rtmutex->wait_lock held
*
+ * Normal locking order:
+ *
+ * rtmutex->wait_lock
+ * task->pi_lock
+ *
* Step Description Protected by
* function arguments:
* @task [R]
@@ -386,41 +648,45 @@ static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
* again:
* loop_sanity_check();
* retry:
- * [1] lock(task->pi_lock); [R] acquire [P]
- * [2] waiter = task->pi_blocked_on; [P]
- * [3] check_exit_conditions_1(); [P]
- * [4] lock = waiter->lock; [P]
- * [5] if (!try_lock(lock->wait_lock)) { [P] try to acquire [L]
- * unlock(task->pi_lock); release [P]
+ * [1] lock(task->pi_lock); [R] acquire [P1]
+ * [2] waiter = task->pi_blocked_on; [P1]
+ * [3] check_exit_conditions_1(); [P1]
+ * [4] lock = waiter->lock; [P1]
+ * [5] if (!try_lock(lock->wait_lock)) { [P1] try to acquire [L]
+ * unlock(task->pi_lock); release [P1]
* goto retry;
* }
- * [6] check_exit_conditions_2(); [P] + [L]
- * [7] requeue_lock_waiter(lock, waiter); [P] + [L]
- * [8] unlock(task->pi_lock); release [P]
+ * [6] check_exit_conditions_2(); [P1] + [L]
+ * [7] requeue_lock_waiter(lock, waiter); [P1] + [L]
+ * [8] unlock(task->pi_lock); release [P1]
* put_task_struct(task); release [R]
* [9] check_exit_conditions_3(); [L]
* [10] task = owner(lock); [L]
* get_task_struct(task); [L] acquire [R]
- * lock(task->pi_lock); [L] acquire [P]
- * [11] requeue_pi_waiter(tsk, waiters(lock));[P] + [L]
- * [12] check_exit_conditions_4(); [P] + [L]
- * [13] unlock(task->pi_lock); release [P]
+ * lock(task->pi_lock); [L] acquire [P2]
+ * [11] requeue_pi_waiter(tsk, waiters(lock));[P2] + [L]
+ * [12] check_exit_conditions_4(); [P2] + [L]
+ * [13] unlock(task->pi_lock); release [P2]
* unlock(lock->wait_lock); release [L]
* goto again;
+ *
+ * Where P1 is the blocking task and P2 is the lock owner; going up one step
+ * the owner becomes the next blocked task etc..
+ *
+*
*/
-static int rt_mutex_adjust_prio_chain(struct task_struct *task,
- enum rtmutex_chainwalk chwalk,
- struct rt_mutex *orig_lock,
- struct rt_mutex *next_lock,
- struct rt_mutex_waiter *orig_waiter,
- struct task_struct *top_task)
+static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
+ enum rtmutex_chainwalk chwalk,
+ struct rt_mutex_base *orig_lock,
+ struct rt_mutex_base *next_lock,
+ struct rt_mutex_waiter *orig_waiter,
+ struct task_struct *top_task)
{
struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter;
struct rt_mutex_waiter *prerequeue_top_waiter;
int ret = 0, depth = 0;
- struct rt_mutex *lock;
+ struct rt_mutex_base *lock;
bool detect_deadlock;
- unsigned long flags;
bool requeue = true;
detect_deadlock = rt_mutex_cond_detect_deadlock(orig_waiter, chwalk);
@@ -463,7 +729,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
/*
* [1] Task cannot go away as we did a get_task() before !
*/
- raw_spin_lock_irqsave(&task->pi_lock, flags);
+ raw_spin_lock_irq(&task->pi_lock);
/*
* [2] Get the waiter on which @task is blocked on.
@@ -502,6 +768,31 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
goto out_unlock_pi;
/*
+ * There could be 'spurious' loops in the lock graph due to ww_mutex,
+ * consider:
+ *
+ * P1: A, ww_A, ww_B
+ * P2: ww_B, ww_A
+ * P3: A
+ *
+ * P3 should not return -EDEADLK because it gets trapped in the cycle
+ * created by P1 and P2 (which will resolve -- and runs into
+ * max_lock_depth above). Therefore disable detect_deadlock such that
+ * the below termination condition can trigger once all relevant tasks
+ * are boosted.
+ *
+ * Even when we start with ww_mutex we can disable deadlock detection,
+ * since we would supress a ww_mutex induced deadlock at [6] anyway.
+ * Supressing it here however is not sufficient since we might still
+ * hit [6] due to adjustment driven iteration.
+ *
+ * NOTE: if someone were to create a deadlock between 2 ww_classes we'd
+ * utterly fail to report it; lockdep should.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && waiter->ww_ctx && detect_deadlock)
+ detect_deadlock = false;
+
+ /*
* Drop out, when the task has no waiters. Note,
* top_waiter can be NULL, when we are in the deboosting
* mode!
@@ -530,7 +821,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
* enabled we continue, but stop the requeueing in the chain
* walk.
*/
- if (waiter->prio == task->prio) {
+ if (rt_waiter_node_equal(&waiter->tree, task_to_waiter_node(task))) {
if (!detect_deadlock)
goto out_unlock_pi;
else
@@ -538,16 +829,21 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
}
/*
- * [4] Get the next lock
+ * [4] Get the next lock; per holding task->pi_lock we can't unblock
+ * and guarantee @lock's existence.
*/
lock = waiter->lock;
/*
* [5] We need to trylock here as we are holding task->pi_lock,
* which is the reverse lock order versus the other rtmutex
* operations.
+ *
+ * Per the above, holding task->pi_lock guarantees lock exists, so
+ * inverting this lock order is infeasible from a life-time
+ * perspective.
*/
if (!raw_spin_trylock(&lock->wait_lock)) {
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ raw_spin_unlock_irq(&task->pi_lock);
cpu_relax();
goto retry;
}
@@ -562,9 +858,21 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
* walk, we detected a deadlock.
*/
if (lock == orig_lock || rt_mutex_owner(lock) == top_task) {
- debug_rt_mutex_deadlock(chwalk, orig_waiter, lock);
- raw_spin_unlock(&lock->wait_lock);
ret = -EDEADLK;
+
+ /*
+ * When the deadlock is due to ww_mutex; also see above. Don't
+ * report the deadlock and instead let the ww_mutex wound/die
+ * logic pick which of the contending threads gets -EDEADLK.
+ *
+ * NOTE: assumes the cycle only contains a single ww_class; any
+ * other configuration and we fail to report; also, see
+ * lockdep.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && orig_waiter && orig_waiter->ww_ctx)
+ ret = 0;
+
+ raw_spin_unlock(&lock->wait_lock);
goto out_unlock_pi;
}
@@ -578,7 +886,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
/*
* No requeue[7] here. Just release @task [8]
*/
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ raw_spin_unlock(&task->pi_lock);
put_task_struct(task);
/*
@@ -586,14 +894,13 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
* If there is no owner of the lock, end of chain.
*/
if (!rt_mutex_owner(lock)) {
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
return 0;
}
/* [10] Grab the next task, i.e. owner of @lock */
- task = rt_mutex_owner(lock);
- get_task_struct(task);
- raw_spin_lock_irqsave(&task->pi_lock, flags);
+ task = get_task_struct(rt_mutex_owner(lock));
+ raw_spin_lock(&task->pi_lock);
/*
* No requeue [11] here. We just do deadlock detection.
@@ -608,8 +915,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
top_waiter = rt_mutex_top_waiter(lock);
/* [13] Drop locks */
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock(&task->pi_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
/* If owner is not blocked, end of chain. */
if (!next_lock)
@@ -624,13 +931,32 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
*/
prerequeue_top_waiter = rt_mutex_top_waiter(lock);
- /* [7] Requeue the waiter in the lock waiter list. */
+ /* [7] Requeue the waiter in the lock waiter tree. */
rt_mutex_dequeue(lock, waiter);
- waiter->prio = task->prio;
+
+ /*
+ * Update the waiter prio fields now that we're dequeued.
+ *
+ * These values can have changed through either:
+ *
+ * sys_sched_set_scheduler() / sys_sched_setattr()
+ *
+ * or
+ *
+ * DL CBS enforcement advancing the effective deadline.
+ */
+ waiter_update_prio(waiter, task);
+
rt_mutex_enqueue(lock, waiter);
- /* [8] Release the task */
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ /*
+ * [8] Release the (blocking) task in preparation for
+ * taking the owner task in [10].
+ *
+ * Since we hold lock->waiter_lock, task cannot unblock, even if we
+ * release task->pi_lock.
+ */
+ raw_spin_unlock(&task->pi_lock);
put_task_struct(task);
/*
@@ -646,34 +972,40 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
* then we need to wake the new top waiter up to try
* to get the lock.
*/
- if (prerequeue_top_waiter != rt_mutex_top_waiter(lock))
- wake_up_process(rt_mutex_top_waiter(lock)->task);
- raw_spin_unlock(&lock->wait_lock);
+ top_waiter = rt_mutex_top_waiter(lock);
+ if (prerequeue_top_waiter != top_waiter)
+ wake_up_state(top_waiter->task, top_waiter->wake_state);
+ raw_spin_unlock_irq(&lock->wait_lock);
return 0;
}
- /* [10] Grab the next task, i.e. the owner of @lock */
- task = rt_mutex_owner(lock);
- get_task_struct(task);
- raw_spin_lock_irqsave(&task->pi_lock, flags);
+ /*
+ * [10] Grab the next task, i.e. the owner of @lock
+ *
+ * Per holding lock->wait_lock and checking for !owner above, there
+ * must be an owner and it cannot go away.
+ */
+ task = get_task_struct(rt_mutex_owner(lock));
+ raw_spin_lock(&task->pi_lock);
/* [11] requeue the pi waiters if necessary */
if (waiter == rt_mutex_top_waiter(lock)) {
/*
* The waiter became the new top (highest priority)
* waiter on the lock. Replace the previous top waiter
- * in the owner tasks pi waiters list with this waiter
+ * in the owner tasks pi waiters tree with this waiter
* and adjust the priority of the owner.
*/
rt_mutex_dequeue_pi(task, prerequeue_top_waiter);
+ waiter_clone_prio(waiter, task);
rt_mutex_enqueue_pi(task, waiter);
- __rt_mutex_adjust_prio(task);
+ rt_mutex_adjust_prio(lock, task);
} else if (prerequeue_top_waiter == waiter) {
/*
* The waiter was the top waiter on the lock, but is
- * no longer the top prority waiter. Replace waiter in
- * the owner tasks pi waiters list with the new top
+ * no longer the top priority waiter. Replace waiter in
+ * the owner tasks pi waiters tree with the new top
* (highest priority) waiter and adjust the priority
* of the owner.
* The new top waiter is stored in @waiter so that
@@ -682,8 +1014,9 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
*/
rt_mutex_dequeue_pi(task, waiter);
waiter = rt_mutex_top_waiter(lock);
+ waiter_clone_prio(waiter, task);
rt_mutex_enqueue_pi(task, waiter);
- __rt_mutex_adjust_prio(task);
+ rt_mutex_adjust_prio(lock, task);
} else {
/*
* Nothing changed. No need to do any priority
@@ -709,8 +1042,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
top_waiter = rt_mutex_top_waiter(lock);
/* [13] Drop the locks */
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock(&task->pi_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
/*
* Make the actual exit decisions [12], based on the stored
@@ -733,7 +1066,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
goto again;
out_unlock_pi:
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ raw_spin_unlock_irq(&task->pi_lock);
out_put_task:
put_task_struct(task);
@@ -743,17 +1076,18 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
/*
* Try to take an rt-mutex
*
- * Must be called with lock->wait_lock held.
+ * Must be called with lock->wait_lock held and interrupts disabled
*
* @lock: The lock to be acquired.
* @task: The task which wants to acquire the lock
- * @waiter: The waiter that is queued to the lock's wait list if the
+ * @waiter: The waiter that is queued to the lock's wait tree if the
* callsite called task_blocked_on_lock(), otherwise NULL
*/
-static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
- struct rt_mutex_waiter *waiter)
+static int __sched
+try_to_take_rt_mutex(struct rt_mutex_base *lock, struct task_struct *task,
+ struct rt_mutex_waiter *waiter)
{
- unsigned long flags;
+ lockdep_assert_held(&lock->wait_lock);
/*
* Before testing whether we can acquire @lock, we set the
@@ -782,23 +1116,25 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
/*
* If @waiter != NULL, @task has already enqueued the waiter
- * into @lock waiter list. If @waiter == NULL then this is a
+ * into @lock waiter tree. If @waiter == NULL then this is a
* trylock attempt.
*/
if (waiter) {
- /*
- * If waiter is not the highest priority waiter of
- * @lock, give up.
- */
- if (waiter != rt_mutex_top_waiter(lock))
- return 0;
+ struct rt_mutex_waiter *top_waiter = rt_mutex_top_waiter(lock);
/*
- * We can acquire the lock. Remove the waiter from the
- * lock waiters list.
+ * If waiter is the highest priority waiter of @lock,
+ * or allowed to steal it, take it over.
*/
- rt_mutex_dequeue(lock, waiter);
-
+ if (waiter == top_waiter || rt_mutex_steal(waiter, top_waiter)) {
+ /*
+ * We can acquire the lock. Remove the waiter from the
+ * lock waiters tree.
+ */
+ rt_mutex_dequeue(lock, waiter);
+ } else {
+ return 0;
+ }
} else {
/*
* If the lock has waiters already we check whether @task is
@@ -809,12 +1145,9 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
* not need to be dequeued.
*/
if (rt_mutex_has_waiters(lock)) {
- /*
- * If @task->prio is greater than or equal to
- * the top waiter priority (kernel view),
- * @task lost.
- */
- if (task->prio >= rt_mutex_top_waiter(lock)->prio)
+ /* Check whether the trylock can steal it. */
+ if (!rt_mutex_steal(task_to_waiter(task),
+ rt_mutex_top_waiter(lock)))
return 0;
/*
@@ -827,7 +1160,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
* No waiters. Take the lock without the
* pi_lock dance.@task->pi_blocked_on is NULL
* and we have no waiters to enqueue in @task
- * pi waiters list.
+ * pi waiters tree.
*/
goto takeit;
}
@@ -839,29 +1172,24 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
* case, but conditionals are more expensive than a redundant
* store.
*/
- raw_spin_lock_irqsave(&task->pi_lock, flags);
+ raw_spin_lock(&task->pi_lock);
task->pi_blocked_on = NULL;
/*
* Finish the lock acquisition. @task is the new owner. If
* other waiters exist we have to insert the highest priority
- * waiter into @task->pi_waiters list.
+ * waiter into @task->pi_waiters tree.
*/
if (rt_mutex_has_waiters(lock))
rt_mutex_enqueue_pi(task, rt_mutex_top_waiter(lock));
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ raw_spin_unlock(&task->pi_lock);
takeit:
- /* We got the lock. */
- debug_rt_mutex_lock(lock);
-
/*
* This either preserves the RT_MUTEX_HAS_WAITERS bit if there
* are still waiters or clears it.
*/
rt_mutex_set_owner(lock, task);
- rt_mutex_deadlock_account_lock(lock, task);
-
return 1;
}
@@ -870,18 +1198,21 @@ takeit:
*
* Prepare waiter and propagate pi chain
*
- * This must be called with lock->wait_lock held.
+ * This must be called with lock->wait_lock held and interrupts disabled
*/
-static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
- struct rt_mutex_waiter *waiter,
- struct task_struct *task,
- enum rtmutex_chainwalk chwalk)
+static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task,
+ struct ww_acquire_ctx *ww_ctx,
+ enum rtmutex_chainwalk chwalk,
+ struct wake_q_head *wake_q)
{
struct task_struct *owner = rt_mutex_owner(lock);
struct rt_mutex_waiter *top_waiter = waiter;
- struct rt_mutex *next_lock;
+ struct rt_mutex_base *next_lock;
int chain_walk = 0, res;
- unsigned long flags;
+
+ lockdep_assert_held(&lock->wait_lock);
/*
* Early deadlock detection. We really don't want the task to
@@ -891,15 +1222,18 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
* the other will detect the deadlock and return -EDEADLOCK,
* which is wrong, as the other waiter is not in a deadlock
* situation.
+ *
+ * Except for ww_mutex, in that case the chain walk must already deal
+ * with spurious cycles, see the comments at [3] and [6].
*/
- if (owner == task)
+ if (owner == task && !(build_ww_mutex() && ww_ctx))
return -EDEADLK;
- raw_spin_lock_irqsave(&task->pi_lock, flags);
- __rt_mutex_adjust_prio(task);
+ raw_spin_lock(&task->pi_lock);
waiter->task = task;
waiter->lock = lock;
- waiter->prio = task->prio;
+ waiter_update_prio(waiter, task);
+ waiter_clone_prio(waiter, task);
/* Get the top priority waiter on the lock */
if (rt_mutex_has_waiters(lock))
@@ -908,17 +1242,32 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
task->pi_blocked_on = waiter;
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ raw_spin_unlock(&task->pi_lock);
+
+ if (build_ww_mutex() && ww_ctx) {
+ struct rt_mutex *rtm;
+
+ /* Check whether the waiter should back out immediately */
+ rtm = container_of(lock, struct rt_mutex, rtmutex);
+ res = __ww_mutex_add_waiter(waiter, rtm, ww_ctx, wake_q);
+ if (res) {
+ raw_spin_lock(&task->pi_lock);
+ rt_mutex_dequeue(lock, waiter);
+ task->pi_blocked_on = NULL;
+ raw_spin_unlock(&task->pi_lock);
+ return res;
+ }
+ }
if (!owner)
return 0;
- raw_spin_lock_irqsave(&owner->pi_lock, flags);
+ raw_spin_lock(&owner->pi_lock);
if (waiter == rt_mutex_top_waiter(lock)) {
rt_mutex_dequeue_pi(owner, top_waiter);
rt_mutex_enqueue_pi(owner, waiter);
- __rt_mutex_adjust_prio(owner);
+ rt_mutex_adjust_prio(lock, owner);
if (owner->pi_blocked_on)
chain_walk = 1;
} else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
@@ -928,7 +1277,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
/* Store the lock on which owner is blocked or NULL */
next_lock = task_blocked_on_lock(owner);
- raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
+ raw_spin_unlock(&owner->pi_lock);
/*
* Even if full deadlock detection is on, if the owner is not
* blocked itself, we can avoid finding this out in the chain
@@ -944,40 +1293,42 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
*/
get_task_struct(owner);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq_wake(&lock->wait_lock, wake_q);
res = rt_mutex_adjust_prio_chain(owner, chwalk, lock,
next_lock, waiter, task);
- raw_spin_lock(&lock->wait_lock);
+ raw_spin_lock_irq(&lock->wait_lock);
return res;
}
/*
- * Wake up the next waiter on the lock.
- *
- * Remove the top waiter from the current tasks pi waiter list and
- * wake it up.
+ * Remove the top waiter from the current tasks pi waiter tree and
+ * queue it up.
*
- * Called with lock->wait_lock held.
+ * Called with lock->wait_lock held and interrupts disabled.
*/
-static void wakeup_next_waiter(struct rt_mutex *lock)
+static void __sched mark_wakeup_next_waiter(struct rt_wake_q_head *wqh,
+ struct rt_mutex_base *lock)
{
struct rt_mutex_waiter *waiter;
- unsigned long flags;
- raw_spin_lock_irqsave(&current->pi_lock, flags);
+ lockdep_assert_held(&lock->wait_lock);
+
+ raw_spin_lock(&current->pi_lock);
waiter = rt_mutex_top_waiter(lock);
/*
- * Remove it from current->pi_waiters. We do not adjust a
- * possible priority boost right now. We execute wakeup in the
- * boosted mode and go back to normal after releasing
- * lock->wait_lock.
+ * Remove it from current->pi_waiters and deboost.
+ *
+ * We must in fact deboost here in order to ensure we call
+ * rt_mutex_setprio() to update p->pi_top_task before the
+ * task unblocks.
*/
rt_mutex_dequeue_pi(current, waiter);
+ rt_mutex_adjust_prio(lock, current);
/*
* As we are waking up the top waiter, and the waiter stays
@@ -989,34 +1340,215 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
*/
lock->owner = (void *) RT_MUTEX_HAS_WAITERS;
- raw_spin_unlock_irqrestore(&current->pi_lock, flags);
+ /*
+ * We deboosted before waking the top waiter task such that we don't
+ * run two tasks with the 'same' priority (and ensure the
+ * p->pi_top_task pointer points to a blocked task). This however can
+ * lead to priority inversion if we would get preempted after the
+ * deboost but before waking our donor task, hence the preempt_disable()
+ * before unlock.
+ *
+ * Pairs with preempt_enable() in rt_mutex_wake_up_q();
+ */
+ preempt_disable();
+ rt_mutex_wake_q_add(wqh, waiter);
+ raw_spin_unlock(&current->pi_lock);
+}
+
+static int __sched __rt_mutex_slowtrylock(struct rt_mutex_base *lock)
+{
+ int ret = try_to_take_rt_mutex(lock, current, NULL);
+
+ /*
+ * try_to_take_rt_mutex() sets the lock waiters bit
+ * unconditionally. Clean this up.
+ */
+ fixup_rt_mutex_waiters(lock, true);
+
+ return ret;
+}
+
+/*
+ * Slow path try-lock function:
+ */
+static int __sched rt_mutex_slowtrylock(struct rt_mutex_base *lock)
+{
+ unsigned long flags;
+ int ret;
+
+ /*
+ * If the lock already has an owner we fail to get the lock.
+ * This can be done without taking the @lock->wait_lock as
+ * it is only being read, and this is a trylock anyway.
+ */
+ if (rt_mutex_owner(lock))
+ return 0;
+
+ /*
+ * The mutex has currently no owner. Lock the wait lock and try to
+ * acquire the lock. We use irqsave here to support early boot calls.
+ */
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
+
+ ret = __rt_mutex_slowtrylock(lock);
+
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+
+ return ret;
+}
+
+static __always_inline int __rt_mutex_trylock(struct rt_mutex_base *lock)
+{
+ if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
+ return 1;
+
+ return rt_mutex_slowtrylock(lock);
+}
+
+/*
+ * Slow path to release a rt-mutex.
+ */
+static void __sched rt_mutex_slowunlock(struct rt_mutex_base *lock)
+{
+ DEFINE_RT_WAKE_Q(wqh);
+ unsigned long flags;
+
+ /* irqsave required to support early boot calls */
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
+
+ debug_rt_mutex_unlock(lock);
+
+ /*
+ * We must be careful here if the fast path is enabled. If we
+ * have no waiters queued we cannot set owner to NULL here
+ * because of:
+ *
+ * foo->lock->owner = NULL;
+ * rtmutex_lock(foo->lock); <- fast path
+ * free = atomic_dec_and_test(foo->refcnt);
+ * rtmutex_unlock(foo->lock); <- fast path
+ * if (free)
+ * kfree(foo);
+ * raw_spin_unlock(foo->lock->wait_lock);
+ *
+ * So for the fastpath enabled kernel:
+ *
+ * Nothing can set the waiters bit as long as we hold
+ * lock->wait_lock. So we do the following sequence:
+ *
+ * owner = rt_mutex_owner(lock);
+ * clear_rt_mutex_waiters(lock);
+ * raw_spin_unlock(&lock->wait_lock);
+ * if (cmpxchg(&lock->owner, owner, 0) == owner)
+ * return;
+ * goto retry;
+ *
+ * The fastpath disabled variant is simple as all access to
+ * lock->owner is serialized by lock->wait_lock:
+ *
+ * lock->owner = NULL;
+ * raw_spin_unlock(&lock->wait_lock);
+ */
+ while (!rt_mutex_has_waiters(lock)) {
+ /* Drops lock->wait_lock ! */
+ if (unlock_rt_mutex_safe(lock, flags) == true)
+ return;
+ /* Relock the rtmutex and try again */
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
+ }
/*
- * It's safe to dereference waiter as it cannot go away as
- * long as we hold lock->wait_lock. The waiter task needs to
- * acquire it in order to dequeue the waiter.
+ * The wakeup next waiter path does not suffer from the above
+ * race. See the comments there.
+ *
+ * Queue the next waiter for wakeup once we release the wait_lock.
*/
- wake_up_process(waiter->task);
+ mark_wakeup_next_waiter(&wqh, lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+
+ rt_mutex_wake_up_q(&wqh);
+}
+
+static __always_inline void __rt_mutex_unlock(struct rt_mutex_base *lock)
+{
+ if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
+ return;
+
+ rt_mutex_slowunlock(lock);
+}
+
+#ifdef CONFIG_SMP
+static bool rtmutex_spin_on_owner(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *owner)
+{
+ bool res = true;
+
+ rcu_read_lock();
+ for (;;) {
+ /* If owner changed, trylock again. */
+ if (owner != rt_mutex_owner(lock))
+ break;
+ /*
+ * Ensure that @owner is dereferenced after checking that
+ * the lock owner still matches @owner. If that fails,
+ * @owner might point to freed memory. If it still matches,
+ * the rcu_read_lock() ensures the memory stays valid.
+ */
+ barrier();
+ /*
+ * Stop spinning when:
+ * - the lock owner has been scheduled out
+ * - current is not longer the top waiter
+ * - current is requested to reschedule (redundant
+ * for CONFIG_PREEMPT_RCU=y)
+ * - the VCPU on which owner runs is preempted
+ */
+ if (!owner_on_cpu(owner) || need_resched() ||
+ !rt_mutex_waiter_is_top_waiter(lock, waiter)) {
+ res = false;
+ break;
+ }
+ cpu_relax();
+ }
+ rcu_read_unlock();
+ return res;
}
+#else
+static bool rtmutex_spin_on_owner(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *owner)
+{
+ return false;
+}
+#endif
+
+#ifdef RT_MUTEX_BUILD_MUTEX
+/*
+ * Functions required for:
+ * - rtmutex, futex on all kernels
+ * - mutex and rwsem substitutions on RT kernels
+ */
/*
* Remove a waiter from a lock and give up
*
- * Must be called with lock->wait_lock held and
+ * Must be called with lock->wait_lock held and interrupts disabled. It must
* have just failed to try_to_take_rt_mutex().
*/
-static void remove_waiter(struct rt_mutex *lock,
- struct rt_mutex_waiter *waiter)
+static void __sched remove_waiter(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter)
{
bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
struct task_struct *owner = rt_mutex_owner(lock);
- struct rt_mutex *next_lock;
- unsigned long flags;
+ struct rt_mutex_base *next_lock;
- raw_spin_lock_irqsave(&current->pi_lock, flags);
+ lockdep_assert_held(&lock->wait_lock);
+
+ raw_spin_lock(&current->pi_lock);
rt_mutex_dequeue(lock, waiter);
current->pi_blocked_on = NULL;
- raw_spin_unlock_irqrestore(&current->pi_lock, flags);
+ raw_spin_unlock(&current->pi_lock);
/*
* Only update priority if the waiter was the highest priority
@@ -1025,19 +1557,19 @@ static void remove_waiter(struct rt_mutex *lock,
if (!owner || !is_top_waiter)
return;
- raw_spin_lock_irqsave(&owner->pi_lock, flags);
+ raw_spin_lock(&owner->pi_lock);
rt_mutex_dequeue_pi(owner, waiter);
if (rt_mutex_has_waiters(lock))
rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock));
- __rt_mutex_adjust_prio(owner);
+ rt_mutex_adjust_prio(lock, owner);
/* Store the lock on which owner is blocked or NULL */
next_lock = task_blocked_on_lock(owner);
- raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
+ raw_spin_unlock(&owner->pi_lock);
/*
* Don't walk the chain, if the owner task is not blocked
@@ -1049,86 +1581,73 @@ static void remove_waiter(struct rt_mutex *lock,
/* gets dropped in rt_mutex_adjust_prio_chain()! */
get_task_struct(owner);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
rt_mutex_adjust_prio_chain(owner, RT_MUTEX_MIN_CHAINWALK, lock,
next_lock, NULL, current);
- raw_spin_lock(&lock->wait_lock);
-}
-
-/*
- * Recheck the pi chain, in case we got a priority setting
- *
- * Called from sched_setscheduler
- */
-void rt_mutex_adjust_pi(struct task_struct *task)
-{
- struct rt_mutex_waiter *waiter;
- struct rt_mutex *next_lock;
- unsigned long flags;
-
- raw_spin_lock_irqsave(&task->pi_lock, flags);
-
- waiter = task->pi_blocked_on;
- if (!waiter || (waiter->prio == task->prio &&
- !dl_prio(task->prio))) {
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
- return;
- }
- next_lock = waiter->lock;
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
-
- /* gets dropped in rt_mutex_adjust_prio_chain()! */
- get_task_struct(task);
-
- rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL,
- next_lock, NULL, task);
+ raw_spin_lock_irq(&lock->wait_lock);
}
/**
- * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
+ * rt_mutex_slowlock_block() - Perform the wait-wake-try-to-take loop
* @lock: the rt_mutex to take
+ * @ww_ctx: WW mutex context pointer
* @state: the state the task should block in (TASK_INTERRUPTIBLE
- * or TASK_UNINTERRUPTIBLE)
+ * or TASK_UNINTERRUPTIBLE)
* @timeout: the pre-initialized and started timer, or NULL for none
* @waiter: the pre-initialized rt_mutex_waiter
+ * @wake_q: wake_q of tasks to wake when we drop the lock->wait_lock
*
- * lock->wait_lock must be held by the caller.
+ * Must be called with lock->wait_lock held and interrupts disabled
*/
-static int __sched
-__rt_mutex_slowlock(struct rt_mutex *lock, int state,
- struct hrtimer_sleeper *timeout,
- struct rt_mutex_waiter *waiter)
+static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock,
+ struct ww_acquire_ctx *ww_ctx,
+ unsigned int state,
+ struct hrtimer_sleeper *timeout,
+ struct rt_mutex_waiter *waiter,
+ struct wake_q_head *wake_q)
+ __releases(&lock->wait_lock) __acquires(&lock->wait_lock)
{
+ struct rt_mutex *rtm = container_of(lock, struct rt_mutex, rtmutex);
+ struct task_struct *owner;
int ret = 0;
+ lockevent_inc(rtmutex_slow_block);
for (;;) {
/* Try to acquire the lock: */
- if (try_to_take_rt_mutex(lock, current, waiter))
+ if (try_to_take_rt_mutex(lock, current, waiter)) {
+ lockevent_inc(rtmutex_slow_acq3);
break;
+ }
- /*
- * TASK_INTERRUPTIBLE checks for signals and
- * timeout. Ignored otherwise.
- */
- if (unlikely(state == TASK_INTERRUPTIBLE)) {
- /* Signal pending? */
- if (signal_pending(current))
- ret = -EINTR;
- if (timeout && !timeout->task)
- ret = -ETIMEDOUT;
+ if (timeout && !timeout->task) {
+ ret = -ETIMEDOUT;
+ break;
+ }
+ if (signal_pending_state(state, current)) {
+ ret = -EINTR;
+ break;
+ }
+
+ if (build_ww_mutex() && ww_ctx) {
+ ret = __ww_mutex_check_kill(rtm, waiter, ww_ctx);
if (ret)
break;
}
- raw_spin_unlock(&lock->wait_lock);
-
- debug_rt_mutex_print_deadlock(waiter);
+ if (waiter == rt_mutex_top_waiter(lock))
+ owner = rt_mutex_owner(lock);
+ else
+ owner = NULL;
+ raw_spin_unlock_irq_wake(&lock->wait_lock, wake_q);
- schedule_rt_mutex(lock);
+ if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner)) {
+ lockevent_inc(rtmutex_slow_sleep);
+ rt_mutex_schedule();
+ }
- raw_spin_lock(&lock->wait_lock);
+ raw_spin_lock_irq(&lock->wait_lock);
set_current_state(state);
}
@@ -1136,8 +1655,9 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
return ret;
}
-static void rt_mutex_handle_deadlock(int res, int detect_deadlock,
- struct rt_mutex_waiter *w)
+static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock,
+ struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *w)
{
/*
* If the result is not -EDEADLOCK or the caller requested
@@ -1146,503 +1666,234 @@ static void rt_mutex_handle_deadlock(int res, int detect_deadlock,
if (res != -EDEADLOCK || detect_deadlock)
return;
- /*
- * Yell lowdly and stop the task right here.
- */
- rt_mutex_print_deadlock(w);
+ if (build_ww_mutex() && w->ww_ctx)
+ return;
+
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ WARN(1, "rtmutex deadlock detected\n");
+
while (1) {
set_current_state(TASK_INTERRUPTIBLE);
- schedule();
+ rt_mutex_schedule();
}
}
-/*
- * Slow path lock function:
+/**
+ * __rt_mutex_slowlock - Locking slowpath invoked with lock::wait_lock held
+ * @lock: The rtmutex to block lock
+ * @ww_ctx: WW mutex context pointer
+ * @state: The task state for sleeping
+ * @chwalk: Indicator whether full or partial chainwalk is requested
+ * @waiter: Initializer waiter for blocking
+ * @wake_q: The wake_q to wake tasks after we release the wait_lock
*/
-static int __sched
-rt_mutex_slowlock(struct rt_mutex *lock, int state,
- struct hrtimer_sleeper *timeout,
- enum rtmutex_chainwalk chwalk)
+static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock,
+ struct ww_acquire_ctx *ww_ctx,
+ unsigned int state,
+ enum rtmutex_chainwalk chwalk,
+ struct rt_mutex_waiter *waiter,
+ struct wake_q_head *wake_q)
{
- struct rt_mutex_waiter waiter;
- int ret = 0;
-
- debug_rt_mutex_init_waiter(&waiter);
- RB_CLEAR_NODE(&waiter.pi_tree_entry);
- RB_CLEAR_NODE(&waiter.tree_entry);
+ struct rt_mutex *rtm = container_of(lock, struct rt_mutex, rtmutex);
+ struct ww_mutex *ww = ww_container_of(rtm);
+ int ret;
- raw_spin_lock(&lock->wait_lock);
+ lockdep_assert_held(&lock->wait_lock);
+ lockevent_inc(rtmutex_slowlock);
/* Try to acquire the lock again: */
if (try_to_take_rt_mutex(lock, current, NULL)) {
- raw_spin_unlock(&lock->wait_lock);
+ if (build_ww_mutex() && ww_ctx) {
+ __ww_mutex_check_waiters(rtm, ww_ctx, wake_q);
+ ww_mutex_lock_acquired(ww, ww_ctx);
+ }
+ lockevent_inc(rtmutex_slow_acq1);
return 0;
}
set_current_state(state);
- /* Setup the timer, when timeout != NULL */
- if (unlikely(timeout)) {
- hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
- if (!hrtimer_active(&timeout->timer))
- timeout->task = NULL;
- }
-
- ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk);
+ trace_contention_begin(lock, LCB_F_RT);
+ ret = task_blocks_on_rt_mutex(lock, waiter, current, ww_ctx, chwalk, wake_q);
if (likely(!ret))
- /* sleep on the mutex */
- ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
-
- if (unlikely(ret)) {
+ ret = rt_mutex_slowlock_block(lock, ww_ctx, state, NULL, waiter, wake_q);
+
+ if (likely(!ret)) {
+ /* acquired the lock */
+ if (build_ww_mutex() && ww_ctx) {
+ if (!ww_ctx->is_wait_die)
+ __ww_mutex_check_waiters(rtm, ww_ctx, wake_q);
+ ww_mutex_lock_acquired(ww, ww_ctx);
+ }
+ lockevent_inc(rtmutex_slow_acq2);
+ } else {
__set_current_state(TASK_RUNNING);
- if (rt_mutex_has_waiters(lock))
- remove_waiter(lock, &waiter);
- rt_mutex_handle_deadlock(ret, chwalk, &waiter);
+ remove_waiter(lock, waiter);
+ rt_mutex_handle_deadlock(ret, chwalk, lock, waiter);
+ lockevent_inc(rtmutex_deadlock);
}
/*
* try_to_take_rt_mutex() sets the waiter bit
* unconditionally. We might have to fix that up.
*/
- fixup_rt_mutex_waiters(lock);
-
- raw_spin_unlock(&lock->wait_lock);
+ fixup_rt_mutex_waiters(lock, true);
- /* Remove pending timer: */
- if (unlikely(timeout))
- hrtimer_cancel(&timeout->timer);
-
- debug_rt_mutex_free_waiter(&waiter);
+ trace_contention_end(lock, ret);
return ret;
}
-/*
- * Slow path try-lock function:
- */
-static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
+static inline int __rt_mutex_slowlock_locked(struct rt_mutex_base *lock,
+ struct ww_acquire_ctx *ww_ctx,
+ unsigned int state,
+ struct wake_q_head *wake_q)
{
+ struct rt_mutex_waiter waiter;
int ret;
- /*
- * If the lock already has an owner we fail to get the lock.
- * This can be done without taking the @lock->wait_lock as
- * it is only being read, and this is a trylock anyway.
- */
- if (rt_mutex_owner(lock))
- return 0;
-
- /*
- * The mutex has currently no owner. Lock the wait lock and
- * try to acquire the lock.
- */
- raw_spin_lock(&lock->wait_lock);
+ rt_mutex_init_waiter(&waiter);
+ waiter.ww_ctx = ww_ctx;
- ret = try_to_take_rt_mutex(lock, current, NULL);
-
- /*
- * try_to_take_rt_mutex() sets the lock waiters bit
- * unconditionally. Clean this up.
- */
- fixup_rt_mutex_waiters(lock);
-
- raw_spin_unlock(&lock->wait_lock);
+ ret = __rt_mutex_slowlock(lock, ww_ctx, state, RT_MUTEX_MIN_CHAINWALK,
+ &waiter, wake_q);
+ debug_rt_mutex_free_waiter(&waiter);
+ lockevent_cond_inc(rtmutex_slow_wake, !wake_q_empty(wake_q));
return ret;
}
/*
- * Slow path to release a rt-mutex:
+ * rt_mutex_slowlock - Locking slowpath invoked when fast path fails
+ * @lock: The rtmutex to block lock
+ * @ww_ctx: WW mutex context pointer
+ * @state: The task state for sleeping
*/
-static void __sched
-rt_mutex_slowunlock(struct rt_mutex *lock)
+static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock,
+ struct ww_acquire_ctx *ww_ctx,
+ unsigned int state)
{
- raw_spin_lock(&lock->wait_lock);
-
- debug_rt_mutex_unlock(lock);
-
- rt_mutex_deadlock_account_unlock(current);
+ DEFINE_WAKE_Q(wake_q);
+ unsigned long flags;
+ int ret;
/*
- * We must be careful here if the fast path is enabled. If we
- * have no waiters queued we cannot set owner to NULL here
- * because of:
- *
- * foo->lock->owner = NULL;
- * rtmutex_lock(foo->lock); <- fast path
- * free = atomic_dec_and_test(foo->refcnt);
- * rtmutex_unlock(foo->lock); <- fast path
- * if (free)
- * kfree(foo);
- * raw_spin_unlock(foo->lock->wait_lock);
- *
- * So for the fastpath enabled kernel:
- *
- * Nothing can set the waiters bit as long as we hold
- * lock->wait_lock. So we do the following sequence:
- *
- * owner = rt_mutex_owner(lock);
- * clear_rt_mutex_waiters(lock);
- * raw_spin_unlock(&lock->wait_lock);
- * if (cmpxchg(&lock->owner, owner, 0) == owner)
- * return;
- * goto retry;
- *
- * The fastpath disabled variant is simple as all access to
- * lock->owner is serialized by lock->wait_lock:
- *
- * lock->owner = NULL;
- * raw_spin_unlock(&lock->wait_lock);
+ * Do all pre-schedule work here, before we queue a waiter and invoke
+ * PI -- any such work that trips on rtlock (PREEMPT_RT spinlock) would
+ * otherwise recurse back into task_blocks_on_rt_mutex() through
+ * rtlock_slowlock() and will then enqueue a second waiter for this
+ * same task and things get really confusing real fast.
*/
- while (!rt_mutex_has_waiters(lock)) {
- /* Drops lock->wait_lock ! */
- if (unlock_rt_mutex_safe(lock) == true)
- return;
- /* Relock the rtmutex and try again */
- raw_spin_lock(&lock->wait_lock);
- }
+ rt_mutex_pre_schedule();
/*
- * The wakeup next waiter path does not suffer from the above
- * race. See the comments there.
+ * Technically we could use raw_spin_[un]lock_irq() here, but this can
+ * be called in early boot if the cmpxchg() fast path is disabled
+ * (debug, no architecture support). In this case we will acquire the
+ * rtmutex with lock->wait_lock held. But we cannot unconditionally
+ * enable interrupts in that early boot case. So we need to use the
+ * irqsave/restore variants.
*/
- wakeup_next_waiter(lock);
-
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
+ ret = __rt_mutex_slowlock_locked(lock, ww_ctx, state, &wake_q);
+ raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q);
+ rt_mutex_post_schedule();
- /* Undo pi boosting if necessary: */
- rt_mutex_adjust_prio(current);
+ return ret;
}
-/*
- * debug aware fast / slowpath lock,trylock,unlock
- *
- * The atomic acquire/release ops are compiled away, when either the
- * architecture does not support cmpxchg or when debugging is enabled.
- */
-static inline int
-rt_mutex_fastlock(struct rt_mutex *lock, int state,
- int (*slowfn)(struct rt_mutex *lock, int state,
- struct hrtimer_sleeper *timeout,
- enum rtmutex_chainwalk chwalk))
+static __always_inline int __rt_mutex_lock(struct rt_mutex_base *lock,
+ unsigned int state)
{
- if (likely(rt_mutex_cmpxchg(lock, NULL, current))) {
- rt_mutex_deadlock_account_lock(lock, current);
- return 0;
- } else
- return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
-}
+ lockdep_assert(!current->pi_blocked_on);
-static inline int
-rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
- struct hrtimer_sleeper *timeout,
- enum rtmutex_chainwalk chwalk,
- int (*slowfn)(struct rt_mutex *lock, int state,
- struct hrtimer_sleeper *timeout,
- enum rtmutex_chainwalk chwalk))
-{
- if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
- likely(rt_mutex_cmpxchg(lock, NULL, current))) {
- rt_mutex_deadlock_account_lock(lock, current);
+ if (likely(rt_mutex_try_acquire(lock)))
return 0;
- } else
- return slowfn(lock, state, timeout, chwalk);
-}
-
-static inline int
-rt_mutex_fasttrylock(struct rt_mutex *lock,
- int (*slowfn)(struct rt_mutex *lock))
-{
- if (likely(rt_mutex_cmpxchg(lock, NULL, current))) {
- rt_mutex_deadlock_account_lock(lock, current);
- return 1;
- }
- return slowfn(lock);
-}
-
-static inline void
-rt_mutex_fastunlock(struct rt_mutex *lock,
- void (*slowfn)(struct rt_mutex *lock))
-{
- if (likely(rt_mutex_cmpxchg(lock, current, NULL)))
- rt_mutex_deadlock_account_unlock(current);
- else
- slowfn(lock);
-}
-
-/**
- * rt_mutex_lock - lock a rt_mutex
- *
- * @lock: the rt_mutex to be locked
- */
-void __sched rt_mutex_lock(struct rt_mutex *lock)
-{
- might_sleep();
- rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock);
+ return rt_mutex_slowlock(lock, NULL, state);
}
-EXPORT_SYMBOL_GPL(rt_mutex_lock);
-
-/**
- * rt_mutex_lock_interruptible - lock a rt_mutex interruptible
- *
- * @lock: the rt_mutex to be locked
- *
- * Returns:
- * 0 on success
- * -EINTR when interrupted by a signal
- */
-int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
-{
- might_sleep();
-
- return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock);
-}
-EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
+#endif /* RT_MUTEX_BUILD_MUTEX */
+#ifdef RT_MUTEX_BUILD_SPINLOCKS
/*
- * Futex variant with full deadlock detection.
+ * Functions required for spin/rw_lock substitution on RT kernels
*/
-int rt_mutex_timed_futex_lock(struct rt_mutex *lock,
- struct hrtimer_sleeper *timeout)
-{
- might_sleep();
-
- return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
- RT_MUTEX_FULL_CHAINWALK,
- rt_mutex_slowlock);
-}
/**
- * rt_mutex_timed_lock - lock a rt_mutex interruptible
- * the timeout structure is provided
- * by the caller
- *
- * @lock: the rt_mutex to be locked
- * @timeout: timeout structure or NULL (no timeout)
- *
- * Returns:
- * 0 on success
- * -EINTR when interrupted by a signal
- * -ETIMEDOUT when the timeout expired
+ * rtlock_slowlock_locked - Slow path lock acquisition for RT locks
+ * @lock: The underlying RT mutex
+ * @wake_q: The wake_q to wake tasks after we release the wait_lock
*/
-int
-rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout)
+static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock,
+ struct wake_q_head *wake_q)
+ __releases(&lock->wait_lock) __acquires(&lock->wait_lock)
{
- might_sleep();
-
- return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
- RT_MUTEX_MIN_CHAINWALK,
- rt_mutex_slowlock);
-}
-EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
-
-/**
- * rt_mutex_trylock - try to lock a rt_mutex
- *
- * @lock: the rt_mutex to be locked
- *
- * Returns 1 on success and 0 on contention
- */
-int __sched rt_mutex_trylock(struct rt_mutex *lock)
-{
- return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
-}
-EXPORT_SYMBOL_GPL(rt_mutex_trylock);
-
-/**
- * rt_mutex_unlock - unlock a rt_mutex
- *
- * @lock: the rt_mutex to be unlocked
- */
-void __sched rt_mutex_unlock(struct rt_mutex *lock)
-{
- rt_mutex_fastunlock(lock, rt_mutex_slowunlock);
-}
-EXPORT_SYMBOL_GPL(rt_mutex_unlock);
-
-/**
- * rt_mutex_destroy - mark a mutex unusable
- * @lock: the mutex to be destroyed
- *
- * This function marks the mutex uninitialized, and any subsequent
- * use of the mutex is forbidden. The mutex must not be locked when
- * this function is called.
- */
-void rt_mutex_destroy(struct rt_mutex *lock)
-{
- WARN_ON(rt_mutex_is_locked(lock));
-#ifdef CONFIG_DEBUG_RT_MUTEXES
- lock->magic = NULL;
-#endif
-}
+ struct rt_mutex_waiter waiter;
+ struct task_struct *owner;
-EXPORT_SYMBOL_GPL(rt_mutex_destroy);
+ lockdep_assert_held(&lock->wait_lock);
+ lockevent_inc(rtlock_slowlock);
-/**
- * __rt_mutex_init - initialize the rt lock
- *
- * @lock: the rt lock to be initialized
- *
- * Initialize the rt lock to unlocked state.
- *
- * Initializing of a locked rt lock is not allowed
- */
-void __rt_mutex_init(struct rt_mutex *lock, const char *name)
-{
- lock->owner = NULL;
- raw_spin_lock_init(&lock->wait_lock);
- lock->waiters = RB_ROOT;
- lock->waiters_leftmost = NULL;
+ if (try_to_take_rt_mutex(lock, current, NULL)) {
+ lockevent_inc(rtlock_slow_acq1);
+ return;
+ }
- debug_rt_mutex_init(lock, name);
-}
-EXPORT_SYMBOL_GPL(__rt_mutex_init);
+ rt_mutex_init_rtlock_waiter(&waiter);
-/**
- * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a
- * proxy owner
- *
- * @lock: the rt_mutex to be locked
- * @proxy_owner:the task to set as owner
- *
- * No locking. Caller has to do serializing itself
- * Special API call for PI-futex support
- */
-void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
- struct task_struct *proxy_owner)
-{
- __rt_mutex_init(lock, NULL);
- debug_rt_mutex_proxy_lock(lock, proxy_owner);
- rt_mutex_set_owner(lock, proxy_owner);
- rt_mutex_deadlock_account_lock(lock, proxy_owner);
-}
+ /* Save current state and set state to TASK_RTLOCK_WAIT */
+ current_save_and_set_rtlock_wait_state();
-/**
- * rt_mutex_proxy_unlock - release a lock on behalf of owner
- *
- * @lock: the rt_mutex to be locked
- *
- * No locking. Caller has to do serializing itself
- * Special API call for PI-futex support
- */
-void rt_mutex_proxy_unlock(struct rt_mutex *lock,
- struct task_struct *proxy_owner)
-{
- debug_rt_mutex_proxy_unlock(lock);
- rt_mutex_set_owner(lock, NULL);
- rt_mutex_deadlock_account_unlock(proxy_owner);
-}
+ trace_contention_begin(lock, LCB_F_RT);
-/**
- * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
- * @lock: the rt_mutex to take
- * @waiter: the pre-initialized rt_mutex_waiter
- * @task: the task to prepare
- *
- * Returns:
- * 0 - task blocked on lock
- * 1 - acquired the lock for task, caller should wake it up
- * <0 - error
- *
- * Special API call for FUTEX_REQUEUE_PI support.
- */
-int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
- struct rt_mutex_waiter *waiter,
- struct task_struct *task)
-{
- int ret;
+ task_blocks_on_rt_mutex(lock, &waiter, current, NULL, RT_MUTEX_MIN_CHAINWALK, wake_q);
- raw_spin_lock(&lock->wait_lock);
+ for (;;) {
+ /* Try to acquire the lock again */
+ if (try_to_take_rt_mutex(lock, current, &waiter)) {
+ lockevent_inc(rtlock_slow_acq2);
+ break;
+ }
- if (try_to_take_rt_mutex(lock, task, NULL)) {
- raw_spin_unlock(&lock->wait_lock);
- return 1;
- }
+ if (&waiter == rt_mutex_top_waiter(lock))
+ owner = rt_mutex_owner(lock);
+ else
+ owner = NULL;
+ raw_spin_unlock_irq_wake(&lock->wait_lock, wake_q);
- /* We enforce deadlock detection for futexes */
- ret = task_blocks_on_rt_mutex(lock, waiter, task,
- RT_MUTEX_FULL_CHAINWALK);
+ if (!owner || !rtmutex_spin_on_owner(lock, &waiter, owner)) {
+ lockevent_inc(rtlock_slow_sleep);
+ schedule_rtlock();
+ }
- if (ret && !rt_mutex_owner(lock)) {
- /*
- * Reset the return value. We might have
- * returned with -EDEADLK and the owner
- * released the lock while we were walking the
- * pi chain. Let the waiter sort it out.
- */
- ret = 0;
+ raw_spin_lock_irq(&lock->wait_lock);
+ set_current_state(TASK_RTLOCK_WAIT);
}
- if (unlikely(ret))
- remove_waiter(lock, waiter);
+ /* Restore the task state */
+ current_restore_rtlock_saved_state();
- raw_spin_unlock(&lock->wait_lock);
-
- debug_rt_mutex_print_deadlock(waiter);
+ /*
+ * try_to_take_rt_mutex() sets the waiter bit unconditionally.
+ * We might have to fix that up:
+ */
+ fixup_rt_mutex_waiters(lock, true);
+ debug_rt_mutex_free_waiter(&waiter);
- return ret;
+ trace_contention_end(lock, 0);
+ lockevent_cond_inc(rtlock_slow_wake, !wake_q_empty(wake_q));
}
-/**
- * rt_mutex_next_owner - return the next owner of the lock
- *
- * @lock: the rt lock query
- *
- * Returns the next owner of the lock or NULL
- *
- * Caller has to serialize against other accessors to the lock
- * itself.
- *
- * Special API call for PI-futex support
- */
-struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock)
+static __always_inline void __sched rtlock_slowlock(struct rt_mutex_base *lock)
{
- if (!rt_mutex_has_waiters(lock))
- return NULL;
+ unsigned long flags;
+ DEFINE_WAKE_Q(wake_q);
- return rt_mutex_top_waiter(lock)->task;
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
+ rtlock_slowlock_locked(lock, &wake_q);
+ raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q);
}
-/**
- * rt_mutex_finish_proxy_lock() - Complete lock acquisition
- * @lock: the rt_mutex we were woken on
- * @to: the timeout, null if none. hrtimer should already have
- * been started.
- * @waiter: the pre-initialized rt_mutex_waiter
- *
- * Complete the lock acquisition started our behalf by another thread.
- *
- * Returns:
- * 0 - success
- * <0 - error, one of -EINTR, -ETIMEDOUT
- *
- * Special API call for PI-futex requeue support
- */
-int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
- struct hrtimer_sleeper *to,
- struct rt_mutex_waiter *waiter)
-{
- int ret;
-
- raw_spin_lock(&lock->wait_lock);
-
- set_current_state(TASK_INTERRUPTIBLE);
-
- /* sleep on the mutex */
- ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
-
- if (unlikely(ret))
- remove_waiter(lock, waiter);
-
- /*
- * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
- * have to fix that up.
- */
- fixup_rt_mutex_waiters(lock);
-
- raw_spin_unlock(&lock->wait_lock);
-
- return ret;
-}
+#endif /* RT_MUTEX_BUILD_SPINLOCKS */
diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h
deleted file mode 100644
index c4060584c407..000000000000
--- a/kernel/locking/rtmutex.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * RT-Mutexes: blocking mutual exclusion locks with PI support
- *
- * started by Ingo Molnar and Thomas Gleixner:
- *
- * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
- * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
- *
- * This file contains macros used solely by rtmutex.c.
- * Non-debug version.
- */
-
-#define rt_mutex_deadlock_check(l) (0)
-#define rt_mutex_deadlock_account_lock(m, t) do { } while (0)
-#define rt_mutex_deadlock_account_unlock(l) do { } while (0)
-#define debug_rt_mutex_init_waiter(w) do { } while (0)
-#define debug_rt_mutex_free_waiter(w) do { } while (0)
-#define debug_rt_mutex_lock(l) do { } while (0)
-#define debug_rt_mutex_proxy_lock(l,p) do { } while (0)
-#define debug_rt_mutex_proxy_unlock(l) do { } while (0)
-#define debug_rt_mutex_unlock(l) do { } while (0)
-#define debug_rt_mutex_init(m, n) do { } while (0)
-#define debug_rt_mutex_deadlock(d, a ,l) do { } while (0)
-#define debug_rt_mutex_print_deadlock(w) do { } while (0)
-#define debug_rt_mutex_reset_waiter(w) do { } while (0)
-
-static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w)
-{
- WARN(1, "rtmutex deadlock detected\n");
-}
-
-static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *w,
- enum rtmutex_chainwalk walk)
-{
- return walk == RT_MUTEX_FULL_CHAINWALK;
-}
diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c
new file mode 100644
index 000000000000..59dbd29cb219
--- /dev/null
+++ b/kernel/locking/rtmutex_api.c
@@ -0,0 +1,656 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * rtmutex API
+ */
+#include <linux/spinlock.h>
+#include <linux/export.h>
+
+#define RT_MUTEX_BUILD_MUTEX
+#include "rtmutex.c"
+
+/*
+ * Max number of times we'll walk the boosting chain:
+ */
+int max_lock_depth = 1024;
+
+static const struct ctl_table rtmutex_sysctl_table[] = {
+ {
+ .procname = "max_lock_depth",
+ .data = &max_lock_depth,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+};
+
+static int __init init_rtmutex_sysctl(void)
+{
+ register_sysctl_init("kernel", rtmutex_sysctl_table);
+ return 0;
+}
+
+subsys_initcall(init_rtmutex_sysctl);
+
+/*
+ * Debug aware fast / slowpath lock,trylock,unlock
+ *
+ * The atomic acquire/release ops are compiled away, when either the
+ * architecture does not support cmpxchg or when debugging is enabled.
+ */
+static __always_inline int __rt_mutex_lock_common(struct rt_mutex *lock,
+ unsigned int state,
+ struct lockdep_map *nest_lock,
+ unsigned int subclass)
+{
+ int ret;
+
+ might_sleep();
+ mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, _RET_IP_);
+ ret = __rt_mutex_lock(&lock->rtmutex, state);
+ if (ret)
+ mutex_release(&lock->dep_map, _RET_IP_);
+ return ret;
+}
+
+void rt_mutex_base_init(struct rt_mutex_base *rtb)
+{
+ __rt_mutex_base_init(rtb);
+}
+EXPORT_SYMBOL(rt_mutex_base_init);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+/**
+ * rt_mutex_lock_nested - lock a rt_mutex
+ *
+ * @lock: the rt_mutex to be locked
+ * @subclass: the lockdep subclass
+ */
+void __sched rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass)
+{
+ __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, NULL, subclass);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_lock_nested);
+
+void __sched _rt_mutex_lock_nest_lock(struct rt_mutex *lock, struct lockdep_map *nest_lock)
+{
+ __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, nest_lock, 0);
+}
+EXPORT_SYMBOL_GPL(_rt_mutex_lock_nest_lock);
+
+#else /* !CONFIG_DEBUG_LOCK_ALLOC */
+
+/**
+ * rt_mutex_lock - lock a rt_mutex
+ *
+ * @lock: the rt_mutex to be locked
+ */
+void __sched rt_mutex_lock(struct rt_mutex *lock)
+{
+ __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, NULL, 0);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_lock);
+#endif
+
+/**
+ * rt_mutex_lock_interruptible - lock a rt_mutex interruptible
+ *
+ * @lock: the rt_mutex to be locked
+ *
+ * Returns:
+ * 0 on success
+ * -EINTR when interrupted by a signal
+ */
+int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
+{
+ return __rt_mutex_lock_common(lock, TASK_INTERRUPTIBLE, NULL, 0);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
+
+/**
+ * rt_mutex_lock_killable - lock a rt_mutex killable
+ *
+ * @lock: the rt_mutex to be locked
+ *
+ * Returns:
+ * 0 on success
+ * -EINTR when interrupted by a signal
+ */
+int __sched rt_mutex_lock_killable(struct rt_mutex *lock)
+{
+ return __rt_mutex_lock_common(lock, TASK_KILLABLE, NULL, 0);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_lock_killable);
+
+/**
+ * rt_mutex_trylock - try to lock a rt_mutex
+ *
+ * @lock: the rt_mutex to be locked
+ *
+ * This function can only be called in thread context. It's safe to call it
+ * from atomic regions, but not from hard or soft interrupt context.
+ *
+ * Returns:
+ * 1 on success
+ * 0 on contention
+ */
+int __sched rt_mutex_trylock(struct rt_mutex *lock)
+{
+ int ret;
+
+ if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES) && WARN_ON_ONCE(!in_task()))
+ return 0;
+
+ ret = __rt_mutex_trylock(&lock->rtmutex);
+ if (ret)
+ mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rt_mutex_trylock);
+
+/**
+ * rt_mutex_unlock - unlock a rt_mutex
+ *
+ * @lock: the rt_mutex to be unlocked
+ */
+void __sched rt_mutex_unlock(struct rt_mutex *lock)
+{
+ mutex_release(&lock->dep_map, _RET_IP_);
+ __rt_mutex_unlock(&lock->rtmutex);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_unlock);
+
+/*
+ * Futex variants, must not use fastpath.
+ */
+int __sched rt_mutex_futex_trylock(struct rt_mutex_base *lock)
+{
+ return rt_mutex_slowtrylock(lock);
+}
+
+int __sched __rt_mutex_futex_trylock(struct rt_mutex_base *lock)
+{
+ return __rt_mutex_slowtrylock(lock);
+}
+
+/**
+ * __rt_mutex_futex_unlock - Futex variant, that since futex variants
+ * do not use the fast-path, can be simple and will not need to retry.
+ *
+ * @lock: The rt_mutex to be unlocked
+ * @wqh: The wake queue head from which to get the next lock waiter
+ */
+bool __sched __rt_mutex_futex_unlock(struct rt_mutex_base *lock,
+ struct rt_wake_q_head *wqh)
+{
+ lockdep_assert_held(&lock->wait_lock);
+
+ debug_rt_mutex_unlock(lock);
+
+ if (!rt_mutex_has_waiters(lock)) {
+ lock->owner = NULL;
+ return false; /* done */
+ }
+
+ /*
+ * mark_wakeup_next_waiter() deboosts and retains preemption
+ * disabled when dropping the wait_lock, to avoid inversion prior
+ * to the wakeup. preempt_disable() therein pairs with the
+ * preempt_enable() in rt_mutex_postunlock().
+ */
+ mark_wakeup_next_waiter(wqh, lock);
+
+ return true; /* call postunlock() */
+}
+
+void __sched rt_mutex_futex_unlock(struct rt_mutex_base *lock)
+{
+ DEFINE_RT_WAKE_Q(wqh);
+ unsigned long flags;
+ bool postunlock;
+
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
+ postunlock = __rt_mutex_futex_unlock(lock, &wqh);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+
+ if (postunlock)
+ rt_mutex_postunlock(&wqh);
+}
+
+/**
+ * __rt_mutex_init - initialize the rt_mutex
+ *
+ * @lock: The rt_mutex to be initialized
+ * @name: The lock name used for debugging
+ * @key: The lock class key used for debugging
+ *
+ * Initialize the rt_mutex to unlocked state.
+ *
+ * Initializing of a locked rt_mutex is not allowed
+ */
+void __sched __rt_mutex_init(struct rt_mutex *lock, const char *name,
+ struct lock_class_key *key)
+{
+ debug_check_no_locks_freed((void *)lock, sizeof(*lock));
+ __rt_mutex_base_init(&lock->rtmutex);
+ lockdep_init_map_wait(&lock->dep_map, name, key, 0, LD_WAIT_SLEEP);
+}
+EXPORT_SYMBOL_GPL(__rt_mutex_init);
+
+/**
+ * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a
+ * proxy owner
+ *
+ * @lock: the rt_mutex to be locked
+ * @proxy_owner:the task to set as owner
+ *
+ * No locking. Caller has to do serializing itself
+ *
+ * Special API call for PI-futex support. This initializes the rtmutex and
+ * assigns it to @proxy_owner. Concurrent operations on the rtmutex are not
+ * possible at this point because the pi_state which contains the rtmutex
+ * is not yet visible to other tasks.
+ */
+void __sched rt_mutex_init_proxy_locked(struct rt_mutex_base *lock,
+ struct task_struct *proxy_owner)
+{
+ static struct lock_class_key pi_futex_key;
+
+ __rt_mutex_base_init(lock);
+ /*
+ * On PREEMPT_RT the futex hashbucket spinlock becomes 'sleeping'
+ * and rtmutex based. That causes a lockdep false positive, because
+ * some of the futex functions invoke spin_unlock(&hb->lock) with
+ * the wait_lock of the rtmutex associated to the pi_futex held.
+ * spin_unlock() in turn takes wait_lock of the rtmutex on which
+ * the spinlock is based, which makes lockdep notice a lock
+ * recursion. Give the futex/rtmutex wait_lock a separate key.
+ */
+ lockdep_set_class(&lock->wait_lock, &pi_futex_key);
+ rt_mutex_set_owner(lock, proxy_owner);
+}
+
+/**
+ * rt_mutex_proxy_unlock - release a lock on behalf of owner
+ *
+ * @lock: the rt_mutex to be locked
+ *
+ * No locking. Caller has to do serializing itself
+ *
+ * Special API call for PI-futex support. This just cleans up the rtmutex
+ * (debugging) state. Concurrent operations on this rt_mutex are not
+ * possible because it belongs to the pi_state which is about to be freed
+ * and it is not longer visible to other tasks.
+ */
+void __sched rt_mutex_proxy_unlock(struct rt_mutex_base *lock)
+{
+ debug_rt_mutex_proxy_unlock(lock);
+ rt_mutex_clear_owner(lock);
+}
+
+/**
+ * __rt_mutex_start_proxy_lock() - Start lock acquisition for another task
+ * @lock: the rt_mutex to take
+ * @waiter: the pre-initialized rt_mutex_waiter
+ * @task: the task to prepare
+ * @wake_q: the wake_q to wake tasks after we release the wait_lock
+ *
+ * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock
+ * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that.
+ *
+ * NOTE: does _NOT_ remove the @waiter on failure; must either call
+ * rt_mutex_wait_proxy_lock() or rt_mutex_cleanup_proxy_lock() after this.
+ *
+ * Returns:
+ * 0 - task blocked on lock
+ * 1 - acquired the lock for task, caller should wake it up
+ * <0 - error
+ *
+ * Special API call for PI-futex support.
+ */
+int __sched __rt_mutex_start_proxy_lock(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task,
+ struct wake_q_head *wake_q)
+{
+ int ret;
+
+ lockdep_assert_held(&lock->wait_lock);
+
+ if (try_to_take_rt_mutex(lock, task, NULL))
+ return 1;
+
+ /* We enforce deadlock detection for futexes */
+ ret = task_blocks_on_rt_mutex(lock, waiter, task, NULL,
+ RT_MUTEX_FULL_CHAINWALK, wake_q);
+
+ if (ret && !rt_mutex_owner(lock)) {
+ /*
+ * Reset the return value. We might have
+ * returned with -EDEADLK and the owner
+ * released the lock while we were walking the
+ * pi chain. Let the waiter sort it out.
+ */
+ ret = 0;
+ }
+
+ return ret;
+}
+
+/**
+ * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
+ * @lock: the rt_mutex to take
+ * @waiter: the pre-initialized rt_mutex_waiter
+ * @task: the task to prepare
+ *
+ * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock
+ * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that.
+ *
+ * NOTE: unlike __rt_mutex_start_proxy_lock this _DOES_ remove the @waiter
+ * on failure.
+ *
+ * Returns:
+ * 0 - task blocked on lock
+ * 1 - acquired the lock for task, caller should wake it up
+ * <0 - error
+ *
+ * Special API call for PI-futex support.
+ */
+int __sched rt_mutex_start_proxy_lock(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task)
+{
+ int ret;
+ DEFINE_WAKE_Q(wake_q);
+
+ raw_spin_lock_irq(&lock->wait_lock);
+ ret = __rt_mutex_start_proxy_lock(lock, waiter, task, &wake_q);
+ if (unlikely(ret))
+ remove_waiter(lock, waiter);
+ preempt_disable();
+ raw_spin_unlock_irq(&lock->wait_lock);
+ wake_up_q(&wake_q);
+ preempt_enable();
+
+ return ret;
+}
+
+/**
+ * rt_mutex_wait_proxy_lock() - Wait for lock acquisition
+ * @lock: the rt_mutex we were woken on
+ * @to: the timeout, null if none. hrtimer should already have
+ * been started.
+ * @waiter: the pre-initialized rt_mutex_waiter
+ *
+ * Wait for the lock acquisition started on our behalf by
+ * rt_mutex_start_proxy_lock(). Upon failure, the caller must call
+ * rt_mutex_cleanup_proxy_lock().
+ *
+ * Returns:
+ * 0 - success
+ * <0 - error, one of -EINTR, -ETIMEDOUT
+ *
+ * Special API call for PI-futex support
+ */
+int __sched rt_mutex_wait_proxy_lock(struct rt_mutex_base *lock,
+ struct hrtimer_sleeper *to,
+ struct rt_mutex_waiter *waiter)
+{
+ int ret;
+
+ raw_spin_lock_irq(&lock->wait_lock);
+ /* sleep on the mutex */
+ set_current_state(TASK_INTERRUPTIBLE);
+ ret = rt_mutex_slowlock_block(lock, NULL, TASK_INTERRUPTIBLE, to, waiter, NULL);
+ /*
+ * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
+ * have to fix that up.
+ */
+ fixup_rt_mutex_waiters(lock, true);
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ return ret;
+}
+
+/**
+ * rt_mutex_cleanup_proxy_lock() - Cleanup failed lock acquisition
+ * @lock: the rt_mutex we were woken on
+ * @waiter: the pre-initialized rt_mutex_waiter
+ *
+ * Attempt to clean up after a failed __rt_mutex_start_proxy_lock() or
+ * rt_mutex_wait_proxy_lock().
+ *
+ * Unless we acquired the lock; we're still enqueued on the wait-list and can
+ * in fact still be granted ownership until we're removed. Therefore we can
+ * find we are in fact the owner and must disregard the
+ * rt_mutex_wait_proxy_lock() failure.
+ *
+ * Returns:
+ * true - did the cleanup, we done.
+ * false - we acquired the lock after rt_mutex_wait_proxy_lock() returned,
+ * caller should disregards its return value.
+ *
+ * Special API call for PI-futex support
+ */
+bool __sched rt_mutex_cleanup_proxy_lock(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter)
+{
+ bool cleanup = false;
+
+ raw_spin_lock_irq(&lock->wait_lock);
+ /*
+ * Do an unconditional try-lock, this deals with the lock stealing
+ * state where __rt_mutex_futex_unlock() -> mark_wakeup_next_waiter()
+ * sets a NULL owner.
+ *
+ * We're not interested in the return value, because the subsequent
+ * test on rt_mutex_owner() will infer that. If the trylock succeeded,
+ * we will own the lock and it will have removed the waiter. If we
+ * failed the trylock, we're still not owner and we need to remove
+ * ourselves.
+ */
+ try_to_take_rt_mutex(lock, current, waiter);
+ /*
+ * Unless we're the owner; we're still enqueued on the wait_list.
+ * So check if we became owner, if not, take us off the wait_list.
+ */
+ if (rt_mutex_owner(lock) != current) {
+ remove_waiter(lock, waiter);
+ cleanup = true;
+ }
+ /*
+ * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
+ * have to fix that up.
+ */
+ fixup_rt_mutex_waiters(lock, false);
+
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ return cleanup;
+}
+
+/*
+ * Recheck the pi chain, in case we got a priority setting
+ *
+ * Called from sched_setscheduler
+ */
+void __sched rt_mutex_adjust_pi(struct task_struct *task)
+{
+ struct rt_mutex_waiter *waiter;
+ struct rt_mutex_base *next_lock;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&task->pi_lock, flags);
+
+ waiter = task->pi_blocked_on;
+ if (!waiter || rt_waiter_node_equal(&waiter->tree, task_to_waiter_node(task))) {
+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ return;
+ }
+ next_lock = waiter->lock;
+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+
+ /* gets dropped in rt_mutex_adjust_prio_chain()! */
+ get_task_struct(task);
+
+ rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL,
+ next_lock, NULL, task);
+}
+
+/*
+ * Performs the wakeup of the top-waiter and re-enables preemption.
+ */
+void __sched rt_mutex_postunlock(struct rt_wake_q_head *wqh)
+{
+ rt_mutex_wake_up_q(wqh);
+}
+
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+void rt_mutex_debug_task_free(struct task_struct *task)
+{
+ DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters.rb_root));
+ DEBUG_LOCKS_WARN_ON(task->pi_blocked_on);
+}
+#endif
+
+#ifdef CONFIG_PREEMPT_RT
+/* Mutexes */
+static void __mutex_rt_init_generic(struct mutex *mutex)
+{
+ rt_mutex_base_init(&mutex->rtmutex);
+ debug_check_no_locks_freed((void *)mutex, sizeof(*mutex));
+}
+
+static __always_inline int __mutex_lock_common(struct mutex *lock,
+ unsigned int state,
+ unsigned int subclass,
+ struct lockdep_map *nest_lock,
+ unsigned long ip)
+{
+ int ret;
+
+ might_sleep();
+ mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
+ ret = __rt_mutex_lock(&lock->rtmutex, state);
+ if (ret)
+ mutex_release(&lock->dep_map, ip);
+ else
+ lock_acquired(&lock->dep_map, ip);
+ return ret;
+}
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void mutex_rt_init_lockdep(struct mutex *mutex, const char *name, struct lock_class_key *key)
+{
+ __mutex_rt_init_generic(mutex);
+ lockdep_init_map_wait(&mutex->dep_map, name, key, 0, LD_WAIT_SLEEP);
+}
+EXPORT_SYMBOL(mutex_rt_init_lockdep);
+
+void __sched mutex_lock_nested(struct mutex *lock, unsigned int subclass)
+{
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_);
+}
+EXPORT_SYMBOL_GPL(mutex_lock_nested);
+
+void __sched _mutex_lock_nest_lock(struct mutex *lock,
+ struct lockdep_map *nest_lock)
+{
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, nest_lock, _RET_IP_);
+}
+EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock);
+
+int __sched mutex_lock_interruptible_nested(struct mutex *lock,
+ unsigned int subclass)
+{
+ return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass, NULL, _RET_IP_);
+}
+EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
+
+int __sched _mutex_lock_killable(struct mutex *lock, unsigned int subclass,
+ struct lockdep_map *nest_lock)
+{
+ return __mutex_lock_common(lock, TASK_KILLABLE, subclass, nest_lock, _RET_IP_);
+}
+EXPORT_SYMBOL_GPL(_mutex_lock_killable);
+
+void __sched mutex_lock_io_nested(struct mutex *lock, unsigned int subclass)
+{
+ int token;
+
+ might_sleep();
+
+ token = io_schedule_prepare();
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_);
+ io_schedule_finish(token);
+}
+EXPORT_SYMBOL_GPL(mutex_lock_io_nested);
+
+int __sched _mutex_trylock_nest_lock(struct mutex *lock,
+ struct lockdep_map *nest_lock)
+{
+ int ret;
+
+ if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES) && WARN_ON_ONCE(!in_task()))
+ return 0;
+
+ ret = __rt_mutex_trylock(&lock->rtmutex);
+ if (ret)
+ mutex_acquire_nest(&lock->dep_map, 0, 1, nest_lock, _RET_IP_);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(_mutex_trylock_nest_lock);
+#else /* CONFIG_DEBUG_LOCK_ALLOC */
+
+void mutex_rt_init_generic(struct mutex *mutex)
+{
+ __mutex_rt_init_generic(mutex);
+}
+EXPORT_SYMBOL(mutex_rt_init_generic);
+
+void __sched mutex_lock(struct mutex *lock)
+{
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_);
+}
+EXPORT_SYMBOL(mutex_lock);
+
+int __sched mutex_lock_interruptible(struct mutex *lock)
+{
+ return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_);
+}
+EXPORT_SYMBOL(mutex_lock_interruptible);
+
+int __sched mutex_lock_killable(struct mutex *lock)
+{
+ return __mutex_lock_common(lock, TASK_KILLABLE, 0, NULL, _RET_IP_);
+}
+EXPORT_SYMBOL(mutex_lock_killable);
+
+void __sched mutex_lock_io(struct mutex *lock)
+{
+ int token = io_schedule_prepare();
+
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_);
+ io_schedule_finish(token);
+}
+EXPORT_SYMBOL(mutex_lock_io);
+
+int __sched mutex_trylock(struct mutex *lock)
+{
+ if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES) && WARN_ON_ONCE(!in_task()))
+ return 0;
+
+ return __rt_mutex_trylock(&lock->rtmutex);
+}
+EXPORT_SYMBOL(mutex_trylock);
+#endif /* !CONFIG_DEBUG_LOCK_ALLOC */
+
+void __sched mutex_unlock(struct mutex *lock)
+{
+ mutex_release(&lock->dep_map, _RET_IP_);
+ __rt_mutex_unlock(&lock->rtmutex);
+}
+EXPORT_SYMBOL(mutex_unlock);
+
+#endif /* CONFIG_PREEMPT_RT */
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 855212501407..cf6ddd1b23a2 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* RT Mutexes: blocking mutual exclusion locks with PI support
*
@@ -12,93 +13,144 @@
#ifndef __KERNEL_RTMUTEX_COMMON_H
#define __KERNEL_RTMUTEX_COMMON_H
+#include <linux/debug_locks.h>
#include <linux/rtmutex.h>
+#include <linux/sched/wake_q.h>
+
/*
- * The rtmutex in kernel tester is independent of rtmutex debugging. We
- * call schedule_rt_mutex_test() instead of schedule() for the tasks which
- * belong to the tester. That way we can delay the wakeup path of those
- * threads to provoke lock stealing and testing of complex boosting scenarios.
+ * This is a helper for the struct rt_mutex_waiter below. A waiter goes in two
+ * separate trees and they need their own copy of the sort keys because of
+ * different locking requirements.
+ *
+ * @entry: rbtree node to enqueue into the waiters tree
+ * @prio: Priority of the waiter
+ * @deadline: Deadline of the waiter if applicable
+ *
+ * See rt_waiter_node_less() and waiter_*_prio().
*/
-#ifdef CONFIG_RT_MUTEX_TESTER
-
-extern void schedule_rt_mutex_test(struct rt_mutex *lock);
-
-#define schedule_rt_mutex(_lock) \
- do { \
- if (!(current->flags & PF_MUTEX_TESTER)) \
- schedule(); \
- else \
- schedule_rt_mutex_test(_lock); \
- } while (0)
-
-#else
-# define schedule_rt_mutex(_lock) schedule()
-#endif
+struct rt_waiter_node {
+ struct rb_node entry;
+ int prio;
+ u64 deadline;
+};
/*
* This is the control structure for tasks blocked on a rt_mutex,
* which is allocated on the kernel stack on of the blocked task.
*
- * @tree_entry: pi node to enqueue into the mutex waiters tree
- * @pi_tree_entry: pi node to enqueue into the mutex owner waiters tree
+ * @tree: node to enqueue into the mutex waiters tree
+ * @pi_tree: node to enqueue into the mutex owner waiters tree
* @task: task reference to the blocked task
+ * @lock: Pointer to the rt_mutex on which the waiter blocks
+ * @wake_state: Wakeup state to use (TASK_NORMAL or TASK_RTLOCK_WAIT)
+ * @ww_ctx: WW context pointer
+ *
+ * @tree is ordered by @lock->wait_lock
+ * @pi_tree is ordered by rt_mutex_owner(@lock)->pi_lock
*/
struct rt_mutex_waiter {
- struct rb_node tree_entry;
- struct rb_node pi_tree_entry;
+ struct rt_waiter_node tree;
+ struct rt_waiter_node pi_tree;
struct task_struct *task;
- struct rt_mutex *lock;
-#ifdef CONFIG_DEBUG_RT_MUTEXES
- unsigned long ip;
- struct pid *deadlock_task_pid;
- struct rt_mutex *deadlock_lock;
-#endif
- int prio;
+ struct rt_mutex_base *lock;
+ unsigned int wake_state;
+ struct ww_acquire_ctx *ww_ctx;
+};
+
+/**
+ * struct rt_wake_q_head - Wrapper around regular wake_q_head to support
+ * "sleeping" spinlocks on RT
+ * @head: The regular wake_q_head for sleeping lock variants
+ * @rtlock_task: Task pointer for RT lock (spin/rwlock) wakeups
+ */
+struct rt_wake_q_head {
+ struct wake_q_head head;
+ struct task_struct *rtlock_task;
};
+#define DEFINE_RT_WAKE_Q(name) \
+ struct rt_wake_q_head name = { \
+ .head = WAKE_Q_HEAD_INITIALIZER(name.head), \
+ .rtlock_task = NULL, \
+ }
+
/*
- * Various helpers to access the waiters-tree:
+ * PI-futex support (proxy locking functions, etc.):
*/
-static inline int rt_mutex_has_waiters(struct rt_mutex *lock)
+extern void rt_mutex_init_proxy_locked(struct rt_mutex_base *lock,
+ struct task_struct *proxy_owner);
+extern void rt_mutex_proxy_unlock(struct rt_mutex_base *lock);
+extern int __rt_mutex_start_proxy_lock(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task,
+ struct wake_q_head *);
+extern int rt_mutex_start_proxy_lock(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task);
+extern int rt_mutex_wait_proxy_lock(struct rt_mutex_base *lock,
+ struct hrtimer_sleeper *to,
+ struct rt_mutex_waiter *waiter);
+extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter);
+
+extern int rt_mutex_futex_trylock(struct rt_mutex_base *l);
+extern int __rt_mutex_futex_trylock(struct rt_mutex_base *l);
+
+extern void rt_mutex_futex_unlock(struct rt_mutex_base *lock);
+extern bool __rt_mutex_futex_unlock(struct rt_mutex_base *lock,
+ struct rt_wake_q_head *wqh);
+
+extern void rt_mutex_postunlock(struct rt_wake_q_head *wqh);
+
+/*
+ * Must be guarded because this header is included from rcu/tree_plugin.h
+ * unconditionally.
+ */
+#ifdef CONFIG_RT_MUTEXES
+static inline int rt_mutex_has_waiters(struct rt_mutex_base *lock)
{
- return !RB_EMPTY_ROOT(&lock->waiters);
+ return !RB_EMPTY_ROOT(&lock->waiters.rb_root);
}
-static inline struct rt_mutex_waiter *
-rt_mutex_top_waiter(struct rt_mutex *lock)
+/*
+ * Lockless speculative check whether @waiter is still the top waiter on
+ * @lock. This is solely comparing pointers and not derefencing the
+ * leftmost entry which might be about to vanish.
+ */
+static inline bool rt_mutex_waiter_is_top_waiter(struct rt_mutex_base *lock,
+ struct rt_mutex_waiter *waiter)
+{
+ struct rb_node *leftmost = rb_first_cached(&lock->waiters);
+
+ return rb_entry(leftmost, struct rt_mutex_waiter, tree.entry) == waiter;
+}
+
+static inline struct rt_mutex_waiter *rt_mutex_top_waiter(struct rt_mutex_base *lock)
{
- struct rt_mutex_waiter *w;
+ struct rb_node *leftmost = rb_first_cached(&lock->waiters);
+ struct rt_mutex_waiter *w = NULL;
- w = rb_entry(lock->waiters_leftmost, struct rt_mutex_waiter,
- tree_entry);
- BUG_ON(w->lock != lock);
+ lockdep_assert_held(&lock->wait_lock);
+ if (leftmost) {
+ w = rb_entry(leftmost, struct rt_mutex_waiter, tree.entry);
+ BUG_ON(w->lock != lock);
+ }
return w;
}
static inline int task_has_pi_waiters(struct task_struct *p)
{
- return !RB_EMPTY_ROOT(&p->pi_waiters);
+ return !RB_EMPTY_ROOT(&p->pi_waiters.rb_root);
}
-static inline struct rt_mutex_waiter *
-task_top_pi_waiter(struct task_struct *p)
+static inline struct rt_mutex_waiter *task_top_pi_waiter(struct task_struct *p)
{
- return rb_entry(p->pi_waiters_leftmost, struct rt_mutex_waiter,
- pi_tree_entry);
-}
-
-/*
- * lock->owner state tracking:
- */
-#define RT_MUTEX_HAS_WAITERS 1UL
-#define RT_MUTEX_OWNER_MASKALL 1UL
+ lockdep_assert_held(&p->pi_lock);
-static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
-{
- return (struct task_struct *)
- ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL);
+ return rb_entry(p->pi_waiters.rb_leftmost, struct rt_mutex_waiter,
+ pi_tree.entry);
}
/*
@@ -116,26 +168,59 @@ enum rtmutex_chainwalk {
RT_MUTEX_FULL_CHAINWALK,
};
-/*
- * PI-futex support (proxy locking functions, etc.):
- */
-extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
-extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
- struct task_struct *proxy_owner);
-extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
- struct task_struct *proxy_owner);
-extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
- struct rt_mutex_waiter *waiter,
- struct task_struct *task);
-extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
- struct hrtimer_sleeper *to,
- struct rt_mutex_waiter *waiter);
-extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to);
-
-#ifdef CONFIG_DEBUG_RT_MUTEXES
-# include "rtmutex-debug.h"
-#else
-# include "rtmutex.h"
-#endif
+static inline void __rt_mutex_base_init(struct rt_mutex_base *lock)
+{
+ raw_spin_lock_init(&lock->wait_lock);
+ lock->waiters = RB_ROOT_CACHED;
+ lock->owner = NULL;
+}
+
+/* Debug functions */
+static inline void debug_rt_mutex_unlock(struct rt_mutex_base *lock)
+{
+ if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES))
+ DEBUG_LOCKS_WARN_ON(rt_mutex_owner(lock) != current);
+}
+
+static inline void debug_rt_mutex_proxy_unlock(struct rt_mutex_base *lock)
+{
+ if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES))
+ DEBUG_LOCKS_WARN_ON(!rt_mutex_owner(lock));
+}
+
+static inline void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
+{
+ if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES))
+ memset(waiter, 0x11, sizeof(*waiter));
+}
+
+static inline void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
+{
+ if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES))
+ memset(waiter, 0x22, sizeof(*waiter));
+}
+
+static inline void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
+{
+ debug_rt_mutex_init_waiter(waiter);
+ RB_CLEAR_NODE(&waiter->pi_tree.entry);
+ RB_CLEAR_NODE(&waiter->tree.entry);
+ waiter->wake_state = TASK_NORMAL;
+ waiter->task = NULL;
+}
+
+static inline void rt_mutex_init_rtlock_waiter(struct rt_mutex_waiter *waiter)
+{
+ rt_mutex_init_waiter(waiter);
+ waiter->wake_state = TASK_RTLOCK_WAIT;
+}
+
+#else /* CONFIG_RT_MUTEXES */
+/* Used in rcu/tree_plugin.h */
+static inline struct task_struct *rt_mutex_owner(struct rt_mutex_base *lock)
+{
+ return NULL;
+}
+#endif /* !CONFIG_RT_MUTEXES */
#endif
diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c
new file mode 100644
index 000000000000..9f4322c07486
--- /dev/null
+++ b/kernel/locking/rwbase_rt.c
@@ -0,0 +1,303 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/*
+ * RT-specific reader/writer semaphores and reader/writer locks
+ *
+ * down_write/write_lock()
+ * 1) Lock rtmutex
+ * 2) Remove the reader BIAS to force readers into the slow path
+ * 3) Wait until all readers have left the critical section
+ * 4) Mark it write locked
+ *
+ * up_write/write_unlock()
+ * 1) Remove the write locked marker
+ * 2) Set the reader BIAS, so readers can use the fast path again
+ * 3) Unlock rtmutex, to release blocked readers
+ *
+ * down_read/read_lock()
+ * 1) Try fast path acquisition (reader BIAS is set)
+ * 2) Take tmutex::wait_lock, which protects the writelocked flag
+ * 3) If !writelocked, acquire it for read
+ * 4) If writelocked, block on tmutex
+ * 5) unlock rtmutex, goto 1)
+ *
+ * up_read/read_unlock()
+ * 1) Try fast path release (reader count != 1)
+ * 2) Wake the writer waiting in down_write()/write_lock() #3
+ *
+ * down_read/read_lock()#3 has the consequence, that rw semaphores and rw
+ * locks on RT are not writer fair, but writers, which should be avoided in
+ * RT tasks (think mmap_sem), are subject to the rtmutex priority/DL
+ * inheritance mechanism.
+ *
+ * It's possible to make the rw primitives writer fair by keeping a list of
+ * active readers. A blocked writer would force all newly incoming readers
+ * to block on the rtmutex, but the rtmutex would have to be proxy locked
+ * for one reader after the other. We can't use multi-reader inheritance
+ * because there is no way to support that with SCHED_DEADLINE.
+ * Implementing the one by one reader boosting/handover mechanism is a
+ * major surgery for a very dubious value.
+ *
+ * The risk of writer starvation is there, but the pathological use cases
+ * which trigger it are not necessarily the typical RT workloads.
+ *
+ * Fast-path orderings:
+ * The lock/unlock of readers can run in fast paths: lock and unlock are only
+ * atomic ops, and there is no inner lock to provide ACQUIRE and RELEASE
+ * semantics of rwbase_rt. Atomic ops should thus provide _acquire()
+ * and _release() (or stronger).
+ *
+ * Common code shared between RT rw_semaphore and rwlock
+ */
+
+static __always_inline int rwbase_read_trylock(struct rwbase_rt *rwb)
+{
+ int r;
+
+ /*
+ * Increment reader count, if sem->readers < 0, i.e. READER_BIAS is
+ * set.
+ */
+ for (r = atomic_read(&rwb->readers); r < 0;) {
+ if (likely(atomic_try_cmpxchg_acquire(&rwb->readers, &r, r + 1)))
+ return 1;
+ }
+ return 0;
+}
+
+static int __sched __rwbase_read_lock(struct rwbase_rt *rwb,
+ unsigned int state)
+{
+ struct rt_mutex_base *rtm = &rwb->rtmutex;
+ DEFINE_WAKE_Q(wake_q);
+ int ret;
+
+ rwbase_pre_schedule();
+ raw_spin_lock_irq(&rtm->wait_lock);
+
+ /*
+ * Call into the slow lock path with the rtmutex->wait_lock
+ * held, so this can't result in the following race:
+ *
+ * Reader1 Reader2 Writer
+ * down_read()
+ * down_write()
+ * rtmutex_lock(m)
+ * wait()
+ * down_read()
+ * unlock(m->wait_lock)
+ * up_read()
+ * wake(Writer)
+ * lock(m->wait_lock)
+ * sem->writelocked=true
+ * unlock(m->wait_lock)
+ *
+ * up_write()
+ * sem->writelocked=false
+ * rtmutex_unlock(m)
+ * down_read()
+ * down_write()
+ * rtmutex_lock(m)
+ * wait()
+ * rtmutex_lock(m)
+ *
+ * That would put Reader1 behind the writer waiting on
+ * Reader2 to call up_read(), which might be unbound.
+ */
+
+ trace_contention_begin(rwb, LCB_F_RT | LCB_F_READ);
+
+ /*
+ * For rwlocks this returns 0 unconditionally, so the below
+ * !ret conditionals are optimized out.
+ */
+ ret = rwbase_rtmutex_slowlock_locked(rtm, state, &wake_q);
+
+ /*
+ * On success the rtmutex is held, so there can't be a writer
+ * active. Increment the reader count and immediately drop the
+ * rtmutex again.
+ *
+ * rtmutex->wait_lock has to be unlocked in any case of course.
+ */
+ if (!ret)
+ atomic_inc(&rwb->readers);
+
+ preempt_disable();
+ raw_spin_unlock_irq(&rtm->wait_lock);
+ wake_up_q(&wake_q);
+ preempt_enable();
+
+ if (!ret)
+ rwbase_rtmutex_unlock(rtm);
+
+ trace_contention_end(rwb, ret);
+ rwbase_post_schedule();
+ return ret;
+}
+
+static __always_inline int rwbase_read_lock(struct rwbase_rt *rwb,
+ unsigned int state)
+{
+ lockdep_assert(!current->pi_blocked_on);
+
+ if (rwbase_read_trylock(rwb))
+ return 0;
+
+ return __rwbase_read_lock(rwb, state);
+}
+
+static void __sched __rwbase_read_unlock(struct rwbase_rt *rwb,
+ unsigned int state)
+{
+ struct rt_mutex_base *rtm = &rwb->rtmutex;
+ struct task_struct *owner;
+ DEFINE_RT_WAKE_Q(wqh);
+
+ raw_spin_lock_irq(&rtm->wait_lock);
+ /*
+ * Wake the writer, i.e. the rtmutex owner. It might release the
+ * rtmutex concurrently in the fast path (due to a signal), but to
+ * clean up rwb->readers it needs to acquire rtm->wait_lock. The
+ * worst case which can happen is a spurious wakeup.
+ */
+ owner = rt_mutex_owner(rtm);
+ if (owner)
+ rt_mutex_wake_q_add_task(&wqh, owner, state);
+
+ /* Pairs with the preempt_enable in rt_mutex_wake_up_q() */
+ preempt_disable();
+ raw_spin_unlock_irq(&rtm->wait_lock);
+ rt_mutex_wake_up_q(&wqh);
+}
+
+static __always_inline void rwbase_read_unlock(struct rwbase_rt *rwb,
+ unsigned int state)
+{
+ /*
+ * rwb->readers can only hit 0 when a writer is waiting for the
+ * active readers to leave the critical section.
+ *
+ * dec_and_test() is fully ordered, provides RELEASE.
+ */
+ if (unlikely(atomic_dec_and_test(&rwb->readers)))
+ __rwbase_read_unlock(rwb, state);
+}
+
+static inline void __rwbase_write_unlock(struct rwbase_rt *rwb, int bias,
+ unsigned long flags)
+{
+ struct rt_mutex_base *rtm = &rwb->rtmutex;
+
+ /*
+ * _release() is needed in case that reader is in fast path, pairing
+ * with atomic_try_cmpxchg_acquire() in rwbase_read_trylock().
+ */
+ (void)atomic_add_return_release(READER_BIAS - bias, &rwb->readers);
+ raw_spin_unlock_irqrestore(&rtm->wait_lock, flags);
+ rwbase_rtmutex_unlock(rtm);
+}
+
+static inline void rwbase_write_unlock(struct rwbase_rt *rwb)
+{
+ struct rt_mutex_base *rtm = &rwb->rtmutex;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&rtm->wait_lock, flags);
+ __rwbase_write_unlock(rwb, WRITER_BIAS, flags);
+}
+
+static inline void rwbase_write_downgrade(struct rwbase_rt *rwb)
+{
+ struct rt_mutex_base *rtm = &rwb->rtmutex;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&rtm->wait_lock, flags);
+ /* Release it and account current as reader */
+ __rwbase_write_unlock(rwb, WRITER_BIAS - 1, flags);
+}
+
+static inline bool __rwbase_write_trylock(struct rwbase_rt *rwb)
+{
+ /* Can do without CAS because we're serialized by wait_lock. */
+ lockdep_assert_held(&rwb->rtmutex.wait_lock);
+
+ /*
+ * _acquire is needed in case the reader is in the fast path, pairing
+ * with rwbase_read_unlock(), provides ACQUIRE.
+ */
+ if (!atomic_read_acquire(&rwb->readers)) {
+ atomic_set(&rwb->readers, WRITER_BIAS);
+ return 1;
+ }
+
+ return 0;
+}
+
+static int __sched rwbase_write_lock(struct rwbase_rt *rwb,
+ unsigned int state)
+{
+ struct rt_mutex_base *rtm = &rwb->rtmutex;
+ unsigned long flags;
+
+ /* Take the rtmutex as a first step */
+ if (rwbase_rtmutex_lock_state(rtm, state))
+ return -EINTR;
+
+ /* Force readers into slow path */
+ atomic_sub(READER_BIAS, &rwb->readers);
+
+ rwbase_pre_schedule();
+
+ raw_spin_lock_irqsave(&rtm->wait_lock, flags);
+ if (__rwbase_write_trylock(rwb))
+ goto out_unlock;
+
+ rwbase_set_and_save_current_state(state);
+ trace_contention_begin(rwb, LCB_F_RT | LCB_F_WRITE);
+ for (;;) {
+ /* Optimized out for rwlocks */
+ if (rwbase_signal_pending_state(state, current)) {
+ rwbase_restore_current_state();
+ __rwbase_write_unlock(rwb, 0, flags);
+ rwbase_post_schedule();
+ trace_contention_end(rwb, -EINTR);
+ return -EINTR;
+ }
+
+ if (__rwbase_write_trylock(rwb))
+ break;
+
+ raw_spin_unlock_irqrestore(&rtm->wait_lock, flags);
+ rwbase_schedule();
+ raw_spin_lock_irqsave(&rtm->wait_lock, flags);
+
+ set_current_state(state);
+ }
+ rwbase_restore_current_state();
+ trace_contention_end(rwb, 0);
+
+out_unlock:
+ raw_spin_unlock_irqrestore(&rtm->wait_lock, flags);
+ rwbase_post_schedule();
+ return 0;
+}
+
+static inline int rwbase_write_trylock(struct rwbase_rt *rwb)
+{
+ struct rt_mutex_base *rtm = &rwb->rtmutex;
+ unsigned long flags;
+
+ if (!rwbase_rtmutex_trylock(rtm))
+ return 0;
+
+ atomic_sub(READER_BIAS, &rwb->readers);
+
+ raw_spin_lock_irqsave(&rtm->wait_lock, flags);
+ if (__rwbase_write_trylock(rwb)) {
+ raw_spin_unlock_irqrestore(&rtm->wait_lock, flags);
+ return 1;
+ }
+ __rwbase_write_unlock(rwb, 0, flags);
+ return 0;
+}
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
deleted file mode 100644
index 3a5048572065..000000000000
--- a/kernel/locking/rwsem-spinlock.c
+++ /dev/null
@@ -1,303 +0,0 @@
-/* rwsem-spinlock.c: R/W semaphores: contention handling functions for
- * generic spinlock implementation
- *
- * Copyright (c) 2001 David Howells (dhowells@redhat.com).
- * - Derived partially from idea by Andrea Arcangeli <andrea@suse.de>
- * - Derived also from comments by Linus
- */
-#include <linux/rwsem.h>
-#include <linux/sched.h>
-#include <linux/export.h>
-
-enum rwsem_waiter_type {
- RWSEM_WAITING_FOR_WRITE,
- RWSEM_WAITING_FOR_READ
-};
-
-struct rwsem_waiter {
- struct list_head list;
- struct task_struct *task;
- enum rwsem_waiter_type type;
-};
-
-int rwsem_is_locked(struct rw_semaphore *sem)
-{
- int ret = 1;
- unsigned long flags;
-
- if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) {
- ret = (sem->count != 0);
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
- }
- return ret;
-}
-EXPORT_SYMBOL(rwsem_is_locked);
-
-/*
- * initialise the semaphore
- */
-void __init_rwsem(struct rw_semaphore *sem, const char *name,
- struct lock_class_key *key)
-{
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- /*
- * Make sure we are not reinitializing a held semaphore:
- */
- debug_check_no_locks_freed((void *)sem, sizeof(*sem));
- lockdep_init_map(&sem->dep_map, name, key, 0);
-#endif
- sem->count = 0;
- raw_spin_lock_init(&sem->wait_lock);
- INIT_LIST_HEAD(&sem->wait_list);
-}
-EXPORT_SYMBOL(__init_rwsem);
-
-/*
- * handle the lock release when processes blocked on it that can now run
- * - if we come here, then:
- * - the 'active count' _reached_ zero
- * - the 'waiting count' is non-zero
- * - the spinlock must be held by the caller
- * - woken process blocks are discarded from the list after having task zeroed
- * - writers are only woken if wakewrite is non-zero
- */
-static inline struct rw_semaphore *
-__rwsem_do_wake(struct rw_semaphore *sem, int wakewrite)
-{
- struct rwsem_waiter *waiter;
- struct task_struct *tsk;
- int woken;
-
- waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
-
- if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
- if (wakewrite)
- /* Wake up a writer. Note that we do not grant it the
- * lock - it will have to acquire it when it runs. */
- wake_up_process(waiter->task);
- goto out;
- }
-
- /* grant an infinite number of read locks to the front of the queue */
- woken = 0;
- do {
- struct list_head *next = waiter->list.next;
-
- list_del(&waiter->list);
- tsk = waiter->task;
- /*
- * Make sure we do not wakeup the next reader before
- * setting the nil condition to grant the next reader;
- * otherwise we could miss the wakeup on the other
- * side and end up sleeping again. See the pairing
- * in rwsem_down_read_failed().
- */
- smp_mb();
- waiter->task = NULL;
- wake_up_process(tsk);
- put_task_struct(tsk);
- woken++;
- if (next == &sem->wait_list)
- break;
- waiter = list_entry(next, struct rwsem_waiter, list);
- } while (waiter->type != RWSEM_WAITING_FOR_WRITE);
-
- sem->count += woken;
-
- out:
- return sem;
-}
-
-/*
- * wake a single writer
- */
-static inline struct rw_semaphore *
-__rwsem_wake_one_writer(struct rw_semaphore *sem)
-{
- struct rwsem_waiter *waiter;
-
- waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
- wake_up_process(waiter->task);
-
- return sem;
-}
-
-/*
- * get a read lock on the semaphore
- */
-void __sched __down_read(struct rw_semaphore *sem)
-{
- struct rwsem_waiter waiter;
- struct task_struct *tsk;
- unsigned long flags;
-
- raw_spin_lock_irqsave(&sem->wait_lock, flags);
-
- if (sem->count >= 0 && list_empty(&sem->wait_list)) {
- /* granted */
- sem->count++;
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
- goto out;
- }
-
- tsk = current;
- set_task_state(tsk, TASK_UNINTERRUPTIBLE);
-
- /* set up my own style of waitqueue */
- waiter.task = tsk;
- waiter.type = RWSEM_WAITING_FOR_READ;
- get_task_struct(tsk);
-
- list_add_tail(&waiter.list, &sem->wait_list);
-
- /* we don't need to touch the semaphore struct anymore */
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
-
- /* wait to be given the lock */
- for (;;) {
- if (!waiter.task)
- break;
- schedule();
- set_task_state(tsk, TASK_UNINTERRUPTIBLE);
- }
-
- __set_task_state(tsk, TASK_RUNNING);
- out:
- ;
-}
-
-/*
- * trylock for reading -- returns 1 if successful, 0 if contention
- */
-int __down_read_trylock(struct rw_semaphore *sem)
-{
- unsigned long flags;
- int ret = 0;
-
-
- raw_spin_lock_irqsave(&sem->wait_lock, flags);
-
- if (sem->count >= 0 && list_empty(&sem->wait_list)) {
- /* granted */
- sem->count++;
- ret = 1;
- }
-
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
-
- return ret;
-}
-
-/*
- * get a write lock on the semaphore
- */
-void __sched __down_write_nested(struct rw_semaphore *sem, int subclass)
-{
- struct rwsem_waiter waiter;
- struct task_struct *tsk;
- unsigned long flags;
-
- raw_spin_lock_irqsave(&sem->wait_lock, flags);
-
- /* set up my own style of waitqueue */
- tsk = current;
- waiter.task = tsk;
- waiter.type = RWSEM_WAITING_FOR_WRITE;
- list_add_tail(&waiter.list, &sem->wait_list);
-
- /* wait for someone to release the lock */
- for (;;) {
- /*
- * That is the key to support write lock stealing: allows the
- * task already on CPU to get the lock soon rather than put
- * itself into sleep and waiting for system woke it or someone
- * else in the head of the wait list up.
- */
- if (sem->count == 0)
- break;
- set_task_state(tsk, TASK_UNINTERRUPTIBLE);
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
- schedule();
- raw_spin_lock_irqsave(&sem->wait_lock, flags);
- }
- /* got the lock */
- sem->count = -1;
- list_del(&waiter.list);
-
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
-}
-
-void __sched __down_write(struct rw_semaphore *sem)
-{
- __down_write_nested(sem, 0);
-}
-
-/*
- * trylock for writing -- returns 1 if successful, 0 if contention
- */
-int __down_write_trylock(struct rw_semaphore *sem)
-{
- unsigned long flags;
- int ret = 0;
-
- raw_spin_lock_irqsave(&sem->wait_lock, flags);
-
- if (sem->count == 0) {
- /* got the lock */
- sem->count = -1;
- ret = 1;
- }
-
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
-
- return ret;
-}
-
-/*
- * release a read lock on the semaphore
- */
-void __up_read(struct rw_semaphore *sem)
-{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&sem->wait_lock, flags);
-
- if (--sem->count == 0 && !list_empty(&sem->wait_list))
- sem = __rwsem_wake_one_writer(sem);
-
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
-}
-
-/*
- * release a write lock on the semaphore
- */
-void __up_write(struct rw_semaphore *sem)
-{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&sem->wait_lock, flags);
-
- sem->count = 0;
- if (!list_empty(&sem->wait_list))
- sem = __rwsem_do_wake(sem, 1);
-
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
-}
-
-/*
- * downgrade a write lock into a read lock
- * - just wake up any readers at the front of the queue
- */
-void __downgrade_write(struct rw_semaphore *sem)
-{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&sem->wait_lock, flags);
-
- sem->count = 1;
- if (!list_empty(&sem->wait_list))
- sem = __rwsem_do_wake(sem, 0);
-
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
-}
-
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
deleted file mode 100644
index 3417d0172a5d..000000000000
--- a/kernel/locking/rwsem-xadd.c
+++ /dev/null
@@ -1,531 +0,0 @@
-/* rwsem.c: R/W semaphores: contention handling functions
- *
- * Written by David Howells (dhowells@redhat.com).
- * Derived from arch/i386/kernel/semaphore.c
- *
- * Writer lock-stealing by Alex Shi <alex.shi@intel.com>
- * and Michel Lespinasse <walken@google.com>
- *
- * Optimistic spinning by Tim Chen <tim.c.chen@intel.com>
- * and Davidlohr Bueso <davidlohr@hp.com>. Based on mutexes.
- */
-#include <linux/rwsem.h>
-#include <linux/sched.h>
-#include <linux/init.h>
-#include <linux/export.h>
-#include <linux/sched/rt.h>
-#include <linux/osq_lock.h>
-
-#include "rwsem.h"
-
-/*
- * Guide to the rw_semaphore's count field for common values.
- * (32-bit case illustrated, similar for 64-bit)
- *
- * 0x0000000X (1) X readers active or attempting lock, no writer waiting
- * X = #active_readers + #readers attempting to lock
- * (X*ACTIVE_BIAS)
- *
- * 0x00000000 rwsem is unlocked, and no one is waiting for the lock or
- * attempting to read lock or write lock.
- *
- * 0xffff000X (1) X readers active or attempting lock, with waiters for lock
- * X = #active readers + # readers attempting lock
- * (X*ACTIVE_BIAS + WAITING_BIAS)
- * (2) 1 writer attempting lock, no waiters for lock
- * X-1 = #active readers + #readers attempting lock
- * ((X-1)*ACTIVE_BIAS + ACTIVE_WRITE_BIAS)
- * (3) 1 writer active, no waiters for lock
- * X-1 = #active readers + #readers attempting lock
- * ((X-1)*ACTIVE_BIAS + ACTIVE_WRITE_BIAS)
- *
- * 0xffff0001 (1) 1 reader active or attempting lock, waiters for lock
- * (WAITING_BIAS + ACTIVE_BIAS)
- * (2) 1 writer active or attempting lock, no waiters for lock
- * (ACTIVE_WRITE_BIAS)
- *
- * 0xffff0000 (1) There are writers or readers queued but none active
- * or in the process of attempting lock.
- * (WAITING_BIAS)
- * Note: writer can attempt to steal lock for this count by adding
- * ACTIVE_WRITE_BIAS in cmpxchg and checking the old count
- *
- * 0xfffe0001 (1) 1 writer active, or attempting lock. Waiters on queue.
- * (ACTIVE_WRITE_BIAS + WAITING_BIAS)
- *
- * Note: Readers attempt to lock by adding ACTIVE_BIAS in down_read and checking
- * the count becomes more than 0 for successful lock acquisition,
- * i.e. the case where there are only readers or nobody has lock.
- * (1st and 2nd case above).
- *
- * Writers attempt to lock by adding ACTIVE_WRITE_BIAS in down_write and
- * checking the count becomes ACTIVE_WRITE_BIAS for successful lock
- * acquisition (i.e. nobody else has lock or attempts lock). If
- * unsuccessful, in rwsem_down_write_failed, we'll check to see if there
- * are only waiters but none active (5th case above), and attempt to
- * steal the lock.
- *
- */
-
-/*
- * Initialize an rwsem:
- */
-void __init_rwsem(struct rw_semaphore *sem, const char *name,
- struct lock_class_key *key)
-{
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- /*
- * Make sure we are not reinitializing a held semaphore:
- */
- debug_check_no_locks_freed((void *)sem, sizeof(*sem));
- lockdep_init_map(&sem->dep_map, name, key, 0);
-#endif
- sem->count = RWSEM_UNLOCKED_VALUE;
- raw_spin_lock_init(&sem->wait_lock);
- INIT_LIST_HEAD(&sem->wait_list);
-#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
- sem->owner = NULL;
- osq_lock_init(&sem->osq);
-#endif
-}
-
-EXPORT_SYMBOL(__init_rwsem);
-
-enum rwsem_waiter_type {
- RWSEM_WAITING_FOR_WRITE,
- RWSEM_WAITING_FOR_READ
-};
-
-struct rwsem_waiter {
- struct list_head list;
- struct task_struct *task;
- enum rwsem_waiter_type type;
-};
-
-enum rwsem_wake_type {
- RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */
- RWSEM_WAKE_READERS, /* Wake readers only */
- RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */
-};
-
-/*
- * handle the lock release when processes blocked on it that can now run
- * - if we come here from up_xxxx(), then:
- * - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed)
- * - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so)
- * - there must be someone on the queue
- * - the spinlock must be held by the caller
- * - woken process blocks are discarded from the list after having task zeroed
- * - writers are only woken if downgrading is false
- */
-static struct rw_semaphore *
-__rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
-{
- struct rwsem_waiter *waiter;
- struct task_struct *tsk;
- struct list_head *next;
- long oldcount, woken, loop, adjustment;
-
- waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
- if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
- if (wake_type == RWSEM_WAKE_ANY)
- /* Wake writer at the front of the queue, but do not
- * grant it the lock yet as we want other writers
- * to be able to steal it. Readers, on the other hand,
- * will block as they will notice the queued writer.
- */
- wake_up_process(waiter->task);
- goto out;
- }
-
- /* Writers might steal the lock before we grant it to the next reader.
- * We prefer to do the first reader grant before counting readers
- * so we can bail out early if a writer stole the lock.
- */
- adjustment = 0;
- if (wake_type != RWSEM_WAKE_READ_OWNED) {
- adjustment = RWSEM_ACTIVE_READ_BIAS;
- try_reader_grant:
- oldcount = rwsem_atomic_update(adjustment, sem) - adjustment;
- if (unlikely(oldcount < RWSEM_WAITING_BIAS)) {
- /* A writer stole the lock. Undo our reader grant. */
- if (rwsem_atomic_update(-adjustment, sem) &
- RWSEM_ACTIVE_MASK)
- goto out;
- /* Last active locker left. Retry waking readers. */
- goto try_reader_grant;
- }
- }
-
- /* Grant an infinite number of read locks to the readers at the front
- * of the queue. Note we increment the 'active part' of the count by
- * the number of readers before waking any processes up.
- */
- woken = 0;
- do {
- woken++;
-
- if (waiter->list.next == &sem->wait_list)
- break;
-
- waiter = list_entry(waiter->list.next,
- struct rwsem_waiter, list);
-
- } while (waiter->type != RWSEM_WAITING_FOR_WRITE);
-
- adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment;
- if (waiter->type != RWSEM_WAITING_FOR_WRITE)
- /* hit end of list above */
- adjustment -= RWSEM_WAITING_BIAS;
-
- if (adjustment)
- rwsem_atomic_add(adjustment, sem);
-
- next = sem->wait_list.next;
- loop = woken;
- do {
- waiter = list_entry(next, struct rwsem_waiter, list);
- next = waiter->list.next;
- tsk = waiter->task;
- /*
- * Make sure we do not wakeup the next reader before
- * setting the nil condition to grant the next reader;
- * otherwise we could miss the wakeup on the other
- * side and end up sleeping again. See the pairing
- * in rwsem_down_read_failed().
- */
- smp_mb();
- waiter->task = NULL;
- wake_up_process(tsk);
- put_task_struct(tsk);
- } while (--loop);
-
- sem->wait_list.next = next;
- next->prev = &sem->wait_list;
-
- out:
- return sem;
-}
-
-/*
- * Wait for the read lock to be granted
- */
-__visible
-struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
-{
- long count, adjustment = -RWSEM_ACTIVE_READ_BIAS;
- struct rwsem_waiter waiter;
- struct task_struct *tsk = current;
-
- /* set up my own style of waitqueue */
- waiter.task = tsk;
- waiter.type = RWSEM_WAITING_FOR_READ;
- get_task_struct(tsk);
-
- raw_spin_lock_irq(&sem->wait_lock);
- if (list_empty(&sem->wait_list))
- adjustment += RWSEM_WAITING_BIAS;
- list_add_tail(&waiter.list, &sem->wait_list);
-
- /* we're now waiting on the lock, but no longer actively locking */
- count = rwsem_atomic_update(adjustment, sem);
-
- /* If there are no active locks, wake the front queued process(es).
- *
- * If there are no writers and we are first in the queue,
- * wake our own waiter to join the existing active readers !
- */
- if (count == RWSEM_WAITING_BIAS ||
- (count > RWSEM_WAITING_BIAS &&
- adjustment != -RWSEM_ACTIVE_READ_BIAS))
- sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY);
-
- raw_spin_unlock_irq(&sem->wait_lock);
-
- /* wait to be given the lock */
- while (true) {
- set_task_state(tsk, TASK_UNINTERRUPTIBLE);
- if (!waiter.task)
- break;
- schedule();
- }
-
- __set_task_state(tsk, TASK_RUNNING);
- return sem;
-}
-EXPORT_SYMBOL(rwsem_down_read_failed);
-
-static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
-{
- /*
- * Try acquiring the write lock. Check count first in order
- * to reduce unnecessary expensive cmpxchg() operations.
- */
- if (count == RWSEM_WAITING_BIAS &&
- cmpxchg(&sem->count, RWSEM_WAITING_BIAS,
- RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) {
- if (!list_is_singular(&sem->wait_list))
- rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
- rwsem_set_owner(sem);
- return true;
- }
-
- return false;
-}
-
-#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
-/*
- * Try to acquire write lock before the writer has been put on wait queue.
- */
-static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
-{
- long old, count = READ_ONCE(sem->count);
-
- while (true) {
- if (!(count == 0 || count == RWSEM_WAITING_BIAS))
- return false;
-
- old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS);
- if (old == count) {
- rwsem_set_owner(sem);
- return true;
- }
-
- count = old;
- }
-}
-
-static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
-{
- struct task_struct *owner;
- bool ret = true;
-
- if (need_resched())
- return false;
-
- rcu_read_lock();
- owner = READ_ONCE(sem->owner);
- if (!owner) {
- long count = READ_ONCE(sem->count);
- /*
- * If sem->owner is not set, yet we have just recently entered the
- * slowpath with the lock being active, then there is a possibility
- * reader(s) may have the lock. To be safe, bail spinning in these
- * situations.
- */
- if (count & RWSEM_ACTIVE_MASK)
- ret = false;
- goto done;
- }
-
- ret = owner->on_cpu;
-done:
- rcu_read_unlock();
- return ret;
-}
-
-static noinline
-bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner)
-{
- long count;
-
- rcu_read_lock();
- while (sem->owner == owner) {
- /*
- * Ensure we emit the owner->on_cpu, dereference _after_
- * checking sem->owner still matches owner, if that fails,
- * owner might point to free()d memory, if it still matches,
- * the rcu_read_lock() ensures the memory stays valid.
- */
- barrier();
-
- /* abort spinning when need_resched or owner is not running */
- if (!owner->on_cpu || need_resched()) {
- rcu_read_unlock();
- return false;
- }
-
- cpu_relax_lowlatency();
- }
- rcu_read_unlock();
-
- if (READ_ONCE(sem->owner))
- return true; /* new owner, continue spinning */
-
- /*
- * When the owner is not set, the lock could be free or
- * held by readers. Check the counter to verify the
- * state.
- */
- count = READ_ONCE(sem->count);
- return (count == 0 || count == RWSEM_WAITING_BIAS);
-}
-
-static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
-{
- struct task_struct *owner;
- bool taken = false;
-
- preempt_disable();
-
- /* sem->wait_lock should not be held when doing optimistic spinning */
- if (!rwsem_can_spin_on_owner(sem))
- goto done;
-
- if (!osq_lock(&sem->osq))
- goto done;
-
- while (true) {
- owner = READ_ONCE(sem->owner);
- if (owner && !rwsem_spin_on_owner(sem, owner))
- break;
-
- /* wait_lock will be acquired if write_lock is obtained */
- if (rwsem_try_write_lock_unqueued(sem)) {
- taken = true;
- break;
- }
-
- /*
- * When there's no owner, we might have preempted between the
- * owner acquiring the lock and setting the owner field. If
- * we're an RT task that will live-lock because we won't let
- * the owner complete.
- */
- if (!owner && (need_resched() || rt_task(current)))
- break;
-
- /*
- * The cpu_relax() call is a compiler barrier which forces
- * everything in this loop to be re-loaded. We don't need
- * memory barriers as we'll eventually observe the right
- * values at the cost of a few extra spins.
- */
- cpu_relax_lowlatency();
- }
- osq_unlock(&sem->osq);
-done:
- preempt_enable();
- return taken;
-}
-
-#else
-static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
-{
- return false;
-}
-#endif
-
-/*
- * Wait until we successfully acquire the write lock
- */
-__visible
-struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
-{
- long count;
- bool waiting = true; /* any queued threads before us */
- struct rwsem_waiter waiter;
-
- /* undo write bias from down_write operation, stop active locking */
- count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem);
-
- /* do optimistic spinning and steal lock if possible */
- if (rwsem_optimistic_spin(sem))
- return sem;
-
- /*
- * Optimistic spinning failed, proceed to the slowpath
- * and block until we can acquire the sem.
- */
- waiter.task = current;
- waiter.type = RWSEM_WAITING_FOR_WRITE;
-
- raw_spin_lock_irq(&sem->wait_lock);
-
- /* account for this before adding a new element to the list */
- if (list_empty(&sem->wait_list))
- waiting = false;
-
- list_add_tail(&waiter.list, &sem->wait_list);
-
- /* we're now waiting on the lock, but no longer actively locking */
- if (waiting) {
- count = READ_ONCE(sem->count);
-
- /*
- * If there were already threads queued before us and there are
- * no active writers, the lock must be read owned; so we try to
- * wake any read locks that were queued ahead of us.
- */
- if (count > RWSEM_WAITING_BIAS)
- sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS);
-
- } else
- count = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
-
- /* wait until we successfully acquire the lock */
- set_current_state(TASK_UNINTERRUPTIBLE);
- while (true) {
- if (rwsem_try_write_lock(count, sem))
- break;
- raw_spin_unlock_irq(&sem->wait_lock);
-
- /* Block until there are no active lockers. */
- do {
- schedule();
- set_current_state(TASK_UNINTERRUPTIBLE);
- } while ((count = sem->count) & RWSEM_ACTIVE_MASK);
-
- raw_spin_lock_irq(&sem->wait_lock);
- }
- __set_current_state(TASK_RUNNING);
-
- list_del(&waiter.list);
- raw_spin_unlock_irq(&sem->wait_lock);
-
- return sem;
-}
-EXPORT_SYMBOL(rwsem_down_write_failed);
-
-/*
- * handle waking up a waiter on the semaphore
- * - up_read/up_write has decremented the active part of count if we come here
- */
-__visible
-struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
-{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&sem->wait_lock, flags);
-
- /* do nothing if list empty */
- if (!list_empty(&sem->wait_list))
- sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY);
-
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
-
- return sem;
-}
-EXPORT_SYMBOL(rwsem_wake);
-
-/*
- * downgrade a write lock into a read lock
- * - caller incremented waiting part of count and discovered it still negative
- * - just wake up any readers at the front of the queue
- */
-__visible
-struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
-{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&sem->wait_lock, flags);
-
- /* do nothing if list empty */
- if (!list_empty(&sem->wait_list))
- sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED);
-
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
-
- return sem;
-}
-EXPORT_SYMBOL(rwsem_downgrade_wake);
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 205be0ce34de..24df4d98f7d2 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -1,17 +1,1532 @@
+// SPDX-License-Identifier: GPL-2.0
/* kernel/rwsem.c: R/W semaphores, public implementation
*
* Written by David Howells (dhowells@redhat.com).
* Derived from asm-i386/semaphore.h
+ *
+ * Writer lock-stealing by Alex Shi <alex.shi@intel.com>
+ * and Michel Lespinasse <walken@google.com>
+ *
+ * Optimistic spinning by Tim Chen <tim.c.chen@intel.com>
+ * and Davidlohr Bueso <davidlohr@hp.com>. Based on mutexes.
+ *
+ * Rwsem count bit fields re-definition and rwsem rearchitecture by
+ * Waiman Long <longman@redhat.com> and
+ * Peter Zijlstra <peterz@infradead.org>.
*/
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
+#include <linux/sched/rt.h>
+#include <linux/sched/task.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/wake_q.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/clock.h>
#include <linux/export.h>
#include <linux/rwsem.h>
#include <linux/atomic.h>
+#include <linux/hung_task.h>
+#include <trace/events/lock.h>
+
+#ifndef CONFIG_PREEMPT_RT
+#include "lock_events.h"
+
+/*
+ * The least significant 2 bits of the owner value has the following
+ * meanings when set.
+ * - Bit 0: RWSEM_READER_OWNED - rwsem may be owned by readers (just a hint)
+ * - Bit 1: RWSEM_NONSPINNABLE - Cannot spin on a reader-owned lock
+ *
+ * When the rwsem is reader-owned and a spinning writer has timed out,
+ * the nonspinnable bit will be set to disable optimistic spinning.
+
+ * When a writer acquires a rwsem, it puts its task_struct pointer
+ * into the owner field. It is cleared after an unlock.
+ *
+ * When a reader acquires a rwsem, it will also puts its task_struct
+ * pointer into the owner field with the RWSEM_READER_OWNED bit set.
+ * On unlock, the owner field will largely be left untouched. So
+ * for a free or reader-owned rwsem, the owner value may contain
+ * information about the last reader that acquires the rwsem.
+ *
+ * That information may be helpful in debugging cases where the system
+ * seems to hang on a reader owned rwsem especially if only one reader
+ * is involved. Ideally we would like to track all the readers that own
+ * a rwsem, but the overhead is simply too big.
+ *
+ * A fast path reader optimistic lock stealing is supported when the rwsem
+ * is previously owned by a writer and the following conditions are met:
+ * - rwsem is not currently writer owned
+ * - the handoff isn't set.
+ */
+#define RWSEM_READER_OWNED (1UL << 0)
+#define RWSEM_NONSPINNABLE (1UL << 1)
+#define RWSEM_OWNER_FLAGS_MASK (RWSEM_READER_OWNED | RWSEM_NONSPINNABLE)
+
+#ifdef CONFIG_DEBUG_RWSEMS
+# define DEBUG_RWSEMS_WARN_ON(c, sem) do { \
+ if (!debug_locks_silent && \
+ WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, magic = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\
+ #c, atomic_long_read(&(sem)->count), \
+ (unsigned long) sem->magic, \
+ atomic_long_read(&(sem)->owner), (long)current, \
+ list_empty(&(sem)->wait_list) ? "" : "not ")) \
+ debug_locks_off(); \
+ } while (0)
+#else
+# define DEBUG_RWSEMS_WARN_ON(c, sem)
+#endif
+
+/*
+ * On 64-bit architectures, the bit definitions of the count are:
+ *
+ * Bit 0 - writer locked bit
+ * Bit 1 - waiters present bit
+ * Bit 2 - lock handoff bit
+ * Bits 3-7 - reserved
+ * Bits 8-62 - 55-bit reader count
+ * Bit 63 - read fail bit
+ *
+ * On 32-bit architectures, the bit definitions of the count are:
+ *
+ * Bit 0 - writer locked bit
+ * Bit 1 - waiters present bit
+ * Bit 2 - lock handoff bit
+ * Bits 3-7 - reserved
+ * Bits 8-30 - 23-bit reader count
+ * Bit 31 - read fail bit
+ *
+ * It is not likely that the most significant bit (read fail bit) will ever
+ * be set. This guard bit is still checked anyway in the down_read() fastpath
+ * just in case we need to use up more of the reader bits for other purpose
+ * in the future.
+ *
+ * atomic_long_fetch_add() is used to obtain reader lock, whereas
+ * atomic_long_cmpxchg() will be used to obtain writer lock.
+ *
+ * There are three places where the lock handoff bit may be set or cleared.
+ * 1) rwsem_mark_wake() for readers -- set, clear
+ * 2) rwsem_try_write_lock() for writers -- set, clear
+ * 3) rwsem_del_waiter() -- clear
+ *
+ * For all the above cases, wait_lock will be held. A writer must also
+ * be the first one in the wait_list to be eligible for setting the handoff
+ * bit. So concurrent setting/clearing of handoff bit is not possible.
+ */
+#define RWSEM_WRITER_LOCKED (1UL << 0)
+#define RWSEM_FLAG_WAITERS (1UL << 1)
+#define RWSEM_FLAG_HANDOFF (1UL << 2)
+#define RWSEM_FLAG_READFAIL (1UL << (BITS_PER_LONG - 1))
+
+#define RWSEM_READER_SHIFT 8
+#define RWSEM_READER_BIAS (1UL << RWSEM_READER_SHIFT)
+#define RWSEM_READER_MASK (~(RWSEM_READER_BIAS - 1))
+#define RWSEM_WRITER_MASK RWSEM_WRITER_LOCKED
+#define RWSEM_LOCK_MASK (RWSEM_WRITER_MASK|RWSEM_READER_MASK)
+#define RWSEM_READ_FAILED_MASK (RWSEM_WRITER_MASK|RWSEM_FLAG_WAITERS|\
+ RWSEM_FLAG_HANDOFF|RWSEM_FLAG_READFAIL)
+
+/*
+ * All writes to owner are protected by WRITE_ONCE() to make sure that
+ * store tearing can't happen as optimistic spinners may read and use
+ * the owner value concurrently without lock. Read from owner, however,
+ * may not need READ_ONCE() as long as the pointer value is only used
+ * for comparison and isn't being dereferenced.
+ *
+ * Both rwsem_{set,clear}_owner() functions should be in the same
+ * preempt disable section as the atomic op that changes sem->count.
+ */
+static inline void rwsem_set_owner(struct rw_semaphore *sem)
+{
+ lockdep_assert_preemption_disabled();
+ atomic_long_set(&sem->owner, (long)current);
+}
+
+static inline void rwsem_clear_owner(struct rw_semaphore *sem)
+{
+ lockdep_assert_preemption_disabled();
+ atomic_long_set(&sem->owner, 0);
+}
+
+/*
+ * Test the flags in the owner field.
+ */
+static inline bool rwsem_test_oflags(struct rw_semaphore *sem, long flags)
+{
+ return atomic_long_read(&sem->owner) & flags;
+}
+
+/*
+ * The task_struct pointer of the last owning reader will be left in
+ * the owner field.
+ *
+ * Note that the owner value just indicates the task has owned the rwsem
+ * previously, it may not be the real owner or one of the real owners
+ * anymore when that field is examined, so take it with a grain of salt.
+ *
+ * The reader non-spinnable bit is preserved.
+ */
+static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem,
+ struct task_struct *owner)
+{
+ unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED |
+ (atomic_long_read(&sem->owner) & RWSEM_NONSPINNABLE);
+
+ atomic_long_set(&sem->owner, val);
+}
+
+static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
+{
+ __rwsem_set_reader_owned(sem, current);
+}
+
+#if defined(CONFIG_DEBUG_RWSEMS) || defined(CONFIG_DETECT_HUNG_TASK_BLOCKER)
+/*
+ * Return just the real task structure pointer of the owner
+ */
+struct task_struct *rwsem_owner(struct rw_semaphore *sem)
+{
+ return (struct task_struct *)
+ (atomic_long_read(&sem->owner) & ~RWSEM_OWNER_FLAGS_MASK);
+}
+
+/*
+ * Return true if the rwsem is owned by a reader.
+ */
+bool is_rwsem_reader_owned(struct rw_semaphore *sem)
+{
+ /*
+ * Check the count to see if it is write-locked.
+ */
+ long count = atomic_long_read(&sem->count);
+
+ if (count & RWSEM_WRITER_MASK)
+ return false;
+ return rwsem_test_oflags(sem, RWSEM_READER_OWNED);
+}
+
+/*
+ * With CONFIG_DEBUG_RWSEMS or CONFIG_DETECT_HUNG_TASK_BLOCKER configured,
+ * it will make sure that the owner field of a reader-owned rwsem either
+ * points to a real reader-owner(s) or gets cleared. The only exception is
+ * when the unlock is done by up_read_non_owner().
+ */
+static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
+{
+ unsigned long val = atomic_long_read(&sem->owner);
+
+ while ((val & ~RWSEM_OWNER_FLAGS_MASK) == (unsigned long)current) {
+ if (atomic_long_try_cmpxchg(&sem->owner, &val,
+ val & RWSEM_OWNER_FLAGS_MASK))
+ return;
+ }
+}
+#else
+static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
+{
+}
+#endif
+
+/*
+ * Set the RWSEM_NONSPINNABLE bits if the RWSEM_READER_OWNED flag
+ * remains set. Otherwise, the operation will be aborted.
+ */
+static inline void rwsem_set_nonspinnable(struct rw_semaphore *sem)
+{
+ unsigned long owner = atomic_long_read(&sem->owner);
+
+ do {
+ if (!(owner & RWSEM_READER_OWNED))
+ break;
+ if (owner & RWSEM_NONSPINNABLE)
+ break;
+ } while (!atomic_long_try_cmpxchg(&sem->owner, &owner,
+ owner | RWSEM_NONSPINNABLE));
+}
+
+static inline bool rwsem_read_trylock(struct rw_semaphore *sem, long *cntp)
+{
+ *cntp = atomic_long_add_return_acquire(RWSEM_READER_BIAS, &sem->count);
+
+ if (WARN_ON_ONCE(*cntp < 0))
+ rwsem_set_nonspinnable(sem);
+
+ if (!(*cntp & RWSEM_READ_FAILED_MASK)) {
+ rwsem_set_reader_owned(sem);
+ return true;
+ }
+
+ return false;
+}
+
+static inline bool rwsem_write_trylock(struct rw_semaphore *sem)
+{
+ long tmp = RWSEM_UNLOCKED_VALUE;
+
+ if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, RWSEM_WRITER_LOCKED)) {
+ rwsem_set_owner(sem);
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Return the real task structure pointer of the owner and the embedded
+ * flags in the owner. pflags must be non-NULL.
+ */
+static inline struct task_struct *
+rwsem_owner_flags(struct rw_semaphore *sem, unsigned long *pflags)
+{
+ unsigned long owner = atomic_long_read(&sem->owner);
+
+ *pflags = owner & RWSEM_OWNER_FLAGS_MASK;
+ return (struct task_struct *)(owner & ~RWSEM_OWNER_FLAGS_MASK);
+}
-#include "rwsem.h"
+/*
+ * Guide to the rw_semaphore's count field.
+ *
+ * When the RWSEM_WRITER_LOCKED bit in count is set, the lock is owned
+ * by a writer.
+ *
+ * The lock is owned by readers when
+ * (1) the RWSEM_WRITER_LOCKED isn't set in count,
+ * (2) some of the reader bits are set in count, and
+ * (3) the owner field has RWSEM_READ_OWNED bit set.
+ *
+ * Having some reader bits set is not enough to guarantee a readers owned
+ * lock as the readers may be in the process of backing out from the count
+ * and a writer has just released the lock. So another writer may steal
+ * the lock immediately after that.
+ */
+
+/*
+ * Initialize an rwsem:
+ */
+void __init_rwsem(struct rw_semaphore *sem, const char *name,
+ struct lock_class_key *key)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ /*
+ * Make sure we are not reinitializing a held semaphore:
+ */
+ debug_check_no_locks_freed((void *)sem, sizeof(*sem));
+ lockdep_init_map_wait(&sem->dep_map, name, key, 0, LD_WAIT_SLEEP);
+#endif
+#ifdef CONFIG_DEBUG_RWSEMS
+ sem->magic = sem;
+#endif
+ atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE);
+ raw_spin_lock_init(&sem->wait_lock);
+ INIT_LIST_HEAD(&sem->wait_list);
+ atomic_long_set(&sem->owner, 0L);
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
+ osq_lock_init(&sem->osq);
+#endif
+}
+EXPORT_SYMBOL(__init_rwsem);
+
+enum rwsem_waiter_type {
+ RWSEM_WAITING_FOR_WRITE,
+ RWSEM_WAITING_FOR_READ
+};
+
+struct rwsem_waiter {
+ struct list_head list;
+ struct task_struct *task;
+ enum rwsem_waiter_type type;
+ unsigned long timeout;
+ bool handoff_set;
+};
+#define rwsem_first_waiter(sem) \
+ list_first_entry(&sem->wait_list, struct rwsem_waiter, list)
+
+enum rwsem_wake_type {
+ RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */
+ RWSEM_WAKE_READERS, /* Wake readers only */
+ RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */
+};
+
+/*
+ * The typical HZ value is either 250 or 1000. So set the minimum waiting
+ * time to at least 4ms or 1 jiffy (if it is higher than 4ms) in the wait
+ * queue before initiating the handoff protocol.
+ */
+#define RWSEM_WAIT_TIMEOUT DIV_ROUND_UP(HZ, 250)
+
+/*
+ * Magic number to batch-wakeup waiting readers, even when writers are
+ * also present in the queue. This both limits the amount of work the
+ * waking thread must do and also prevents any potential counter overflow,
+ * however unlikely.
+ */
+#define MAX_READERS_WAKEUP 0x100
+
+static inline void
+rwsem_add_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
+{
+ lockdep_assert_held(&sem->wait_lock);
+ list_add_tail(&waiter->list, &sem->wait_list);
+ /* caller will set RWSEM_FLAG_WAITERS */
+}
+
+/*
+ * Remove a waiter from the wait_list and clear flags.
+ *
+ * Both rwsem_mark_wake() and rwsem_try_write_lock() contain a full 'copy' of
+ * this function. Modify with care.
+ *
+ * Return: true if wait_list isn't empty and false otherwise
+ */
+static inline bool
+rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
+{
+ lockdep_assert_held(&sem->wait_lock);
+ list_del(&waiter->list);
+ if (likely(!list_empty(&sem->wait_list)))
+ return true;
+
+ atomic_long_andnot(RWSEM_FLAG_HANDOFF | RWSEM_FLAG_WAITERS, &sem->count);
+ return false;
+}
+
+/*
+ * handle the lock release when processes blocked on it that can now run
+ * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must
+ * have been set.
+ * - there must be someone on the queue
+ * - the wait_lock must be held by the caller
+ * - tasks are marked for wakeup, the caller must later invoke wake_up_q()
+ * to actually wakeup the blocked task(s) and drop the reference count,
+ * preferably when the wait_lock is released
+ * - woken process blocks are discarded from the list after having task zeroed
+ * - writers are only marked woken if downgrading is false
+ *
+ * Implies rwsem_del_waiter() for all woken readers.
+ */
+static void rwsem_mark_wake(struct rw_semaphore *sem,
+ enum rwsem_wake_type wake_type,
+ struct wake_q_head *wake_q)
+{
+ struct rwsem_waiter *waiter, *tmp;
+ long oldcount, woken = 0, adjustment = 0;
+ struct list_head wlist;
+
+ lockdep_assert_held(&sem->wait_lock);
+
+ /*
+ * Take a peek at the queue head waiter such that we can determine
+ * the wakeup(s) to perform.
+ */
+ waiter = rwsem_first_waiter(sem);
+
+ if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
+ if (wake_type == RWSEM_WAKE_ANY) {
+ /*
+ * Mark writer at the front of the queue for wakeup.
+ * Until the task is actually later awoken later by
+ * the caller, other writers are able to steal it.
+ * Readers, on the other hand, will block as they
+ * will notice the queued writer.
+ */
+ wake_q_add(wake_q, waiter->task);
+ lockevent_inc(rwsem_wake_writer);
+ }
+
+ return;
+ }
+
+ /*
+ * No reader wakeup if there are too many of them already.
+ */
+ if (unlikely(atomic_long_read(&sem->count) < 0))
+ return;
+
+ /*
+ * Writers might steal the lock before we grant it to the next reader.
+ * We prefer to do the first reader grant before counting readers
+ * so we can bail out early if a writer stole the lock.
+ */
+ if (wake_type != RWSEM_WAKE_READ_OWNED) {
+ struct task_struct *owner;
+
+ adjustment = RWSEM_READER_BIAS;
+ oldcount = atomic_long_fetch_add(adjustment, &sem->count);
+ if (unlikely(oldcount & RWSEM_WRITER_MASK)) {
+ /*
+ * When we've been waiting "too" long (for writers
+ * to give up the lock), request a HANDOFF to
+ * force the issue.
+ */
+ if (time_after(jiffies, waiter->timeout)) {
+ if (!(oldcount & RWSEM_FLAG_HANDOFF)) {
+ adjustment -= RWSEM_FLAG_HANDOFF;
+ lockevent_inc(rwsem_rlock_handoff);
+ }
+ waiter->handoff_set = true;
+ }
+
+ atomic_long_add(-adjustment, &sem->count);
+ return;
+ }
+ /*
+ * Set it to reader-owned to give spinners an early
+ * indication that readers now have the lock.
+ * The reader nonspinnable bit seen at slowpath entry of
+ * the reader is copied over.
+ */
+ owner = waiter->task;
+ __rwsem_set_reader_owned(sem, owner);
+ }
+
+ /*
+ * Grant up to MAX_READERS_WAKEUP read locks to all the readers in the
+ * queue. We know that the woken will be at least 1 as we accounted
+ * for above. Note we increment the 'active part' of the count by the
+ * number of readers before waking any processes up.
+ *
+ * This is an adaptation of the phase-fair R/W locks where at the
+ * reader phase (first waiter is a reader), all readers are eligible
+ * to acquire the lock at the same time irrespective of their order
+ * in the queue. The writers acquire the lock according to their
+ * order in the queue.
+ *
+ * We have to do wakeup in 2 passes to prevent the possibility that
+ * the reader count may be decremented before it is incremented. It
+ * is because the to-be-woken waiter may not have slept yet. So it
+ * may see waiter->task got cleared, finish its critical section and
+ * do an unlock before the reader count increment.
+ *
+ * 1) Collect the read-waiters in a separate list, count them and
+ * fully increment the reader count in rwsem.
+ * 2) For each waiters in the new list, clear waiter->task and
+ * put them into wake_q to be woken up later.
+ */
+ INIT_LIST_HEAD(&wlist);
+ list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) {
+ if (waiter->type == RWSEM_WAITING_FOR_WRITE)
+ continue;
+
+ woken++;
+ list_move_tail(&waiter->list, &wlist);
+
+ /*
+ * Limit # of readers that can be woken up per wakeup call.
+ */
+ if (unlikely(woken >= MAX_READERS_WAKEUP))
+ break;
+ }
+
+ adjustment = woken * RWSEM_READER_BIAS - adjustment;
+ lockevent_cond_inc(rwsem_wake_reader, woken);
+
+ oldcount = atomic_long_read(&sem->count);
+ if (list_empty(&sem->wait_list)) {
+ /*
+ * Combined with list_move_tail() above, this implies
+ * rwsem_del_waiter().
+ */
+ adjustment -= RWSEM_FLAG_WAITERS;
+ if (oldcount & RWSEM_FLAG_HANDOFF)
+ adjustment -= RWSEM_FLAG_HANDOFF;
+ } else if (woken) {
+ /*
+ * When we've woken a reader, we no longer need to force
+ * writers to give up the lock and we can clear HANDOFF.
+ */
+ if (oldcount & RWSEM_FLAG_HANDOFF)
+ adjustment -= RWSEM_FLAG_HANDOFF;
+ }
+
+ if (adjustment)
+ atomic_long_add(adjustment, &sem->count);
+
+ /* 2nd pass */
+ list_for_each_entry_safe(waiter, tmp, &wlist, list) {
+ struct task_struct *tsk;
+
+ tsk = waiter->task;
+ get_task_struct(tsk);
+
+ /*
+ * Ensure calling get_task_struct() before setting the reader
+ * waiter to nil such that rwsem_down_read_slowpath() cannot
+ * race with do_exit() by always holding a reference count
+ * to the task to wakeup.
+ */
+ smp_store_release(&waiter->task, NULL);
+ /*
+ * Ensure issuing the wakeup (either by us or someone else)
+ * after setting the reader waiter to nil.
+ */
+ wake_q_add_safe(wake_q, tsk);
+ }
+}
+
+/*
+ * Remove a waiter and try to wake up other waiters in the wait queue
+ * This function is called from the out_nolock path of both the reader and
+ * writer slowpaths with wait_lock held. It releases the wait_lock and
+ * optionally wake up waiters before it returns.
+ */
+static inline void
+rwsem_del_wake_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter,
+ struct wake_q_head *wake_q)
+ __releases(&sem->wait_lock)
+{
+ bool first = rwsem_first_waiter(sem) == waiter;
+
+ wake_q_init(wake_q);
+
+ /*
+ * If the wait_list isn't empty and the waiter to be deleted is
+ * the first waiter, we wake up the remaining waiters as they may
+ * be eligible to acquire or spin on the lock.
+ */
+ if (rwsem_del_waiter(sem, waiter) && first)
+ rwsem_mark_wake(sem, RWSEM_WAKE_ANY, wake_q);
+ raw_spin_unlock_irq(&sem->wait_lock);
+ if (!wake_q_empty(wake_q))
+ wake_up_q(wake_q);
+}
+
+/*
+ * This function must be called with the sem->wait_lock held to prevent
+ * race conditions between checking the rwsem wait list and setting the
+ * sem->count accordingly.
+ *
+ * Implies rwsem_del_waiter() on success.
+ */
+static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
+ struct rwsem_waiter *waiter)
+{
+ struct rwsem_waiter *first = rwsem_first_waiter(sem);
+ long count, new;
+
+ lockdep_assert_held(&sem->wait_lock);
+
+ count = atomic_long_read(&sem->count);
+ do {
+ bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF);
+
+ if (has_handoff) {
+ /*
+ * Honor handoff bit and yield only when the first
+ * waiter is the one that set it. Otherwisee, we
+ * still try to acquire the rwsem.
+ */
+ if (first->handoff_set && (waiter != first))
+ return false;
+ }
+
+ new = count;
+
+ if (count & RWSEM_LOCK_MASK) {
+ /*
+ * A waiter (first or not) can set the handoff bit
+ * if it is an RT task or wait in the wait queue
+ * for too long.
+ */
+ if (has_handoff || (!rt_or_dl_task(waiter->task) &&
+ !time_after(jiffies, waiter->timeout)))
+ return false;
+
+ new |= RWSEM_FLAG_HANDOFF;
+ } else {
+ new |= RWSEM_WRITER_LOCKED;
+ new &= ~RWSEM_FLAG_HANDOFF;
+
+ if (list_is_singular(&sem->wait_list))
+ new &= ~RWSEM_FLAG_WAITERS;
+ }
+ } while (!atomic_long_try_cmpxchg_acquire(&sem->count, &count, new));
+
+ /*
+ * We have either acquired the lock with handoff bit cleared or set
+ * the handoff bit. Only the first waiter can have its handoff_set
+ * set here to enable optimistic spinning in slowpath loop.
+ */
+ if (new & RWSEM_FLAG_HANDOFF) {
+ first->handoff_set = true;
+ lockevent_inc(rwsem_wlock_handoff);
+ return false;
+ }
+
+ /*
+ * Have rwsem_try_write_lock() fully imply rwsem_del_waiter() on
+ * success.
+ */
+ list_del(&waiter->list);
+ rwsem_set_owner(sem);
+ return true;
+}
+
+/*
+ * The rwsem_spin_on_owner() function returns the following 4 values
+ * depending on the lock owner state.
+ * OWNER_NULL : owner is currently NULL
+ * OWNER_WRITER: when owner changes and is a writer
+ * OWNER_READER: when owner changes and the new owner may be a reader.
+ * OWNER_NONSPINNABLE:
+ * when optimistic spinning has to stop because either the
+ * owner stops running, is unknown, or its timeslice has
+ * been used up.
+ */
+enum owner_state {
+ OWNER_NULL = 1 << 0,
+ OWNER_WRITER = 1 << 1,
+ OWNER_READER = 1 << 2,
+ OWNER_NONSPINNABLE = 1 << 3,
+};
+
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
+/*
+ * Try to acquire write lock before the writer has been put on wait queue.
+ */
+static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
+{
+ long count = atomic_long_read(&sem->count);
+
+ while (!(count & (RWSEM_LOCK_MASK|RWSEM_FLAG_HANDOFF))) {
+ if (atomic_long_try_cmpxchg_acquire(&sem->count, &count,
+ count | RWSEM_WRITER_LOCKED)) {
+ rwsem_set_owner(sem);
+ lockevent_inc(rwsem_opt_lock);
+ return true;
+ }
+ }
+ return false;
+}
+
+static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
+{
+ struct task_struct *owner;
+ unsigned long flags;
+ bool ret = true;
+
+ if (need_resched()) {
+ lockevent_inc(rwsem_opt_fail);
+ return false;
+ }
+
+ /*
+ * Disable preemption is equal to the RCU read-side crital section,
+ * thus the task_strcut structure won't go away.
+ */
+ owner = rwsem_owner_flags(sem, &flags);
+ /*
+ * Don't check the read-owner as the entry may be stale.
+ */
+ if ((flags & RWSEM_NONSPINNABLE) ||
+ (owner && !(flags & RWSEM_READER_OWNED) && !owner_on_cpu(owner)))
+ ret = false;
+
+ lockevent_cond_inc(rwsem_opt_fail, !ret);
+ return ret;
+}
+
+static inline enum owner_state
+rwsem_owner_state(struct task_struct *owner, unsigned long flags)
+{
+ if (flags & RWSEM_NONSPINNABLE)
+ return OWNER_NONSPINNABLE;
+
+ if (flags & RWSEM_READER_OWNED)
+ return OWNER_READER;
+
+ return owner ? OWNER_WRITER : OWNER_NULL;
+}
+
+static noinline enum owner_state
+rwsem_spin_on_owner(struct rw_semaphore *sem)
+{
+ struct task_struct *new, *owner;
+ unsigned long flags, new_flags;
+ enum owner_state state;
+
+ lockdep_assert_preemption_disabled();
+
+ owner = rwsem_owner_flags(sem, &flags);
+ state = rwsem_owner_state(owner, flags);
+ if (state != OWNER_WRITER)
+ return state;
+
+ for (;;) {
+ /*
+ * When a waiting writer set the handoff flag, it may spin
+ * on the owner as well. Once that writer acquires the lock,
+ * we can spin on it. So we don't need to quit even when the
+ * handoff bit is set.
+ */
+ new = rwsem_owner_flags(sem, &new_flags);
+ if ((new != owner) || (new_flags != flags)) {
+ state = rwsem_owner_state(new, new_flags);
+ break;
+ }
+
+ /*
+ * Ensure we emit the owner->on_cpu, dereference _after_
+ * checking sem->owner still matches owner, if that fails,
+ * owner might point to free()d memory, if it still matches,
+ * our spinning context already disabled preemption which is
+ * equal to RCU read-side crital section ensures the memory
+ * stays valid.
+ */
+ barrier();
+
+ if (need_resched() || !owner_on_cpu(owner)) {
+ state = OWNER_NONSPINNABLE;
+ break;
+ }
+
+ cpu_relax();
+ }
+
+ return state;
+}
+
+/*
+ * Calculate reader-owned rwsem spinning threshold for writer
+ *
+ * The more readers own the rwsem, the longer it will take for them to
+ * wind down and free the rwsem. So the empirical formula used to
+ * determine the actual spinning time limit here is:
+ *
+ * Spinning threshold = (10 + nr_readers/2)us
+ *
+ * The limit is capped to a maximum of 25us (30 readers). This is just
+ * a heuristic and is subjected to change in the future.
+ */
+static inline u64 rwsem_rspin_threshold(struct rw_semaphore *sem)
+{
+ long count = atomic_long_read(&sem->count);
+ int readers = count >> RWSEM_READER_SHIFT;
+ u64 delta;
+
+ if (readers > 30)
+ readers = 30;
+ delta = (20 + readers) * NSEC_PER_USEC / 2;
+
+ return sched_clock() + delta;
+}
+
+static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
+{
+ bool taken = false;
+ int prev_owner_state = OWNER_NULL;
+ int loop = 0;
+ u64 rspin_threshold = 0;
+
+ /* sem->wait_lock should not be held when doing optimistic spinning */
+ if (!osq_lock(&sem->osq))
+ goto done;
+
+ /*
+ * Optimistically spin on the owner field and attempt to acquire the
+ * lock whenever the owner changes. Spinning will be stopped when:
+ * 1) the owning writer isn't running; or
+ * 2) readers own the lock and spinning time has exceeded limit.
+ */
+ for (;;) {
+ enum owner_state owner_state;
+
+ owner_state = rwsem_spin_on_owner(sem);
+ if (owner_state == OWNER_NONSPINNABLE)
+ break;
+
+ /*
+ * Try to acquire the lock
+ */
+ taken = rwsem_try_write_lock_unqueued(sem);
+
+ if (taken)
+ break;
+
+ /*
+ * Time-based reader-owned rwsem optimistic spinning
+ */
+ if (owner_state == OWNER_READER) {
+ /*
+ * Re-initialize rspin_threshold every time when
+ * the owner state changes from non-reader to reader.
+ * This allows a writer to steal the lock in between
+ * 2 reader phases and have the threshold reset at
+ * the beginning of the 2nd reader phase.
+ */
+ if (prev_owner_state != OWNER_READER) {
+ if (rwsem_test_oflags(sem, RWSEM_NONSPINNABLE))
+ break;
+ rspin_threshold = rwsem_rspin_threshold(sem);
+ loop = 0;
+ }
+
+ /*
+ * Check time threshold once every 16 iterations to
+ * avoid calling sched_clock() too frequently so
+ * as to reduce the average latency between the times
+ * when the lock becomes free and when the spinner
+ * is ready to do a trylock.
+ */
+ else if (!(++loop & 0xf) && (sched_clock() > rspin_threshold)) {
+ rwsem_set_nonspinnable(sem);
+ lockevent_inc(rwsem_opt_nospin);
+ break;
+ }
+ }
+
+ /*
+ * An RT task cannot do optimistic spinning if it cannot
+ * be sure the lock holder is running or live-lock may
+ * happen if the current task and the lock holder happen
+ * to run in the same CPU. However, aborting optimistic
+ * spinning while a NULL owner is detected may miss some
+ * opportunity where spinning can continue without causing
+ * problem.
+ *
+ * There are 2 possible cases where an RT task may be able
+ * to continue spinning.
+ *
+ * 1) The lock owner is in the process of releasing the
+ * lock, sem->owner is cleared but the lock has not
+ * been released yet.
+ * 2) The lock was free and owner cleared, but another
+ * task just comes in and acquire the lock before
+ * we try to get it. The new owner may be a spinnable
+ * writer.
+ *
+ * To take advantage of two scenarios listed above, the RT
+ * task is made to retry one more time to see if it can
+ * acquire the lock or continue spinning on the new owning
+ * writer. Of course, if the time lag is long enough or the
+ * new owner is not a writer or spinnable, the RT task will
+ * quit spinning.
+ *
+ * If the owner is a writer, the need_resched() check is
+ * done inside rwsem_spin_on_owner(). If the owner is not
+ * a writer, need_resched() check needs to be done here.
+ */
+ if (owner_state != OWNER_WRITER) {
+ if (need_resched())
+ break;
+ if (rt_or_dl_task(current) &&
+ (prev_owner_state != OWNER_WRITER))
+ break;
+ }
+ prev_owner_state = owner_state;
+
+ /*
+ * The cpu_relax() call is a compiler barrier which forces
+ * everything in this loop to be re-loaded. We don't need
+ * memory barriers as we'll eventually observe the right
+ * values at the cost of a few extra spins.
+ */
+ cpu_relax();
+ }
+ osq_unlock(&sem->osq);
+done:
+ lockevent_cond_inc(rwsem_opt_fail, !taken);
+ return taken;
+}
+
+/*
+ * Clear the owner's RWSEM_NONSPINNABLE bit if it is set. This should
+ * only be called when the reader count reaches 0.
+ */
+static inline void clear_nonspinnable(struct rw_semaphore *sem)
+{
+ if (unlikely(rwsem_test_oflags(sem, RWSEM_NONSPINNABLE)))
+ atomic_long_andnot(RWSEM_NONSPINNABLE, &sem->owner);
+}
+
+#else
+static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
+{
+ return false;
+}
+
+static inline bool rwsem_optimistic_spin(struct rw_semaphore *sem)
+{
+ return false;
+}
+
+static inline void clear_nonspinnable(struct rw_semaphore *sem) { }
+
+static inline enum owner_state
+rwsem_spin_on_owner(struct rw_semaphore *sem)
+{
+ return OWNER_NONSPINNABLE;
+}
+#endif
+
+/*
+ * Prepare to wake up waiter(s) in the wait queue by putting them into the
+ * given wake_q if the rwsem lock owner isn't a writer. If rwsem is likely
+ * reader-owned, wake up read lock waiters in queue front or wake up any
+ * front waiter otherwise.
+
+ * This is being called from both reader and writer slow paths.
+ */
+static inline void rwsem_cond_wake_waiter(struct rw_semaphore *sem, long count,
+ struct wake_q_head *wake_q)
+{
+ enum rwsem_wake_type wake_type;
+
+ if (count & RWSEM_WRITER_MASK)
+ return;
+
+ if (count & RWSEM_READER_MASK) {
+ wake_type = RWSEM_WAKE_READERS;
+ } else {
+ wake_type = RWSEM_WAKE_ANY;
+ clear_nonspinnable(sem);
+ }
+ rwsem_mark_wake(sem, wake_type, wake_q);
+}
+
+/*
+ * Wait for the read lock to be granted
+ */
+static struct rw_semaphore __sched *
+rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int state)
+{
+ long adjustment = -RWSEM_READER_BIAS;
+ long rcnt = (count >> RWSEM_READER_SHIFT);
+ struct rwsem_waiter waiter;
+ DEFINE_WAKE_Q(wake_q);
+
+ /*
+ * To prevent a constant stream of readers from starving a sleeping
+ * writer, don't attempt optimistic lock stealing if the lock is
+ * very likely owned by readers.
+ */
+ if ((atomic_long_read(&sem->owner) & RWSEM_READER_OWNED) &&
+ (rcnt > 1) && !(count & RWSEM_WRITER_LOCKED))
+ goto queue;
+
+ /*
+ * Reader optimistic lock stealing.
+ */
+ if (!(count & (RWSEM_WRITER_LOCKED | RWSEM_FLAG_HANDOFF))) {
+ rwsem_set_reader_owned(sem);
+ lockevent_inc(rwsem_rlock_steal);
+
+ /*
+ * Wake up other readers in the wait queue if it is
+ * the first reader.
+ */
+ if ((rcnt == 1) && (count & RWSEM_FLAG_WAITERS)) {
+ raw_spin_lock_irq(&sem->wait_lock);
+ if (!list_empty(&sem->wait_list))
+ rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED,
+ &wake_q);
+ raw_spin_unlock_irq(&sem->wait_lock);
+ wake_up_q(&wake_q);
+ }
+ return sem;
+ }
+
+queue:
+ waiter.task = current;
+ waiter.type = RWSEM_WAITING_FOR_READ;
+ waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
+ waiter.handoff_set = false;
+
+ raw_spin_lock_irq(&sem->wait_lock);
+ if (list_empty(&sem->wait_list)) {
+ /*
+ * In case the wait queue is empty and the lock isn't owned
+ * by a writer, this reader can exit the slowpath and return
+ * immediately as its RWSEM_READER_BIAS has already been set
+ * in the count.
+ */
+ if (!(atomic_long_read(&sem->count) & RWSEM_WRITER_MASK)) {
+ /* Provide lock ACQUIRE */
+ smp_acquire__after_ctrl_dep();
+ raw_spin_unlock_irq(&sem->wait_lock);
+ rwsem_set_reader_owned(sem);
+ lockevent_inc(rwsem_rlock_fast);
+ return sem;
+ }
+ adjustment += RWSEM_FLAG_WAITERS;
+ }
+ rwsem_add_waiter(sem, &waiter);
+
+ /* we're now waiting on the lock, but no longer actively locking */
+ count = atomic_long_add_return(adjustment, &sem->count);
+
+ rwsem_cond_wake_waiter(sem, count, &wake_q);
+ raw_spin_unlock_irq(&sem->wait_lock);
+
+ if (!wake_q_empty(&wake_q))
+ wake_up_q(&wake_q);
+
+ trace_contention_begin(sem, LCB_F_READ);
+ set_current_state(state);
+
+ if (state == TASK_UNINTERRUPTIBLE)
+ hung_task_set_blocker(sem, BLOCKER_TYPE_RWSEM_READER);
+
+ /* wait to be given the lock */
+ for (;;) {
+ if (!smp_load_acquire(&waiter.task)) {
+ /* Matches rwsem_mark_wake()'s smp_store_release(). */
+ break;
+ }
+ if (signal_pending_state(state, current)) {
+ raw_spin_lock_irq(&sem->wait_lock);
+ if (waiter.task)
+ goto out_nolock;
+ raw_spin_unlock_irq(&sem->wait_lock);
+ /* Ordered by sem->wait_lock against rwsem_mark_wake(). */
+ break;
+ }
+ schedule_preempt_disabled();
+ lockevent_inc(rwsem_sleep_reader);
+ set_current_state(state);
+ }
+
+ if (state == TASK_UNINTERRUPTIBLE)
+ hung_task_clear_blocker();
+
+ __set_current_state(TASK_RUNNING);
+ lockevent_inc(rwsem_rlock);
+ trace_contention_end(sem, 0);
+ return sem;
+
+out_nolock:
+ rwsem_del_wake_waiter(sem, &waiter, &wake_q);
+ __set_current_state(TASK_RUNNING);
+ lockevent_inc(rwsem_rlock_fail);
+ trace_contention_end(sem, -EINTR);
+ return ERR_PTR(-EINTR);
+}
+
+/*
+ * Wait until we successfully acquire the write lock
+ */
+static struct rw_semaphore __sched *
+rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
+{
+ struct rwsem_waiter waiter;
+ DEFINE_WAKE_Q(wake_q);
+
+ /* do optimistic spinning and steal lock if possible */
+ if (rwsem_can_spin_on_owner(sem) && rwsem_optimistic_spin(sem)) {
+ /* rwsem_optimistic_spin() implies ACQUIRE on success */
+ return sem;
+ }
+
+ /*
+ * Optimistic spinning failed, proceed to the slowpath
+ * and block until we can acquire the sem.
+ */
+ waiter.task = current;
+ waiter.type = RWSEM_WAITING_FOR_WRITE;
+ waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
+ waiter.handoff_set = false;
+
+ raw_spin_lock_irq(&sem->wait_lock);
+ rwsem_add_waiter(sem, &waiter);
+
+ /* we're now waiting on the lock */
+ if (rwsem_first_waiter(sem) != &waiter) {
+ rwsem_cond_wake_waiter(sem, atomic_long_read(&sem->count),
+ &wake_q);
+ if (!wake_q_empty(&wake_q)) {
+ /*
+ * We want to minimize wait_lock hold time especially
+ * when a large number of readers are to be woken up.
+ */
+ raw_spin_unlock_irq(&sem->wait_lock);
+ wake_up_q(&wake_q);
+ raw_spin_lock_irq(&sem->wait_lock);
+ }
+ } else {
+ atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count);
+ }
+
+ /* wait until we successfully acquire the lock */
+ set_current_state(state);
+ trace_contention_begin(sem, LCB_F_WRITE);
+
+ if (state == TASK_UNINTERRUPTIBLE)
+ hung_task_set_blocker(sem, BLOCKER_TYPE_RWSEM_WRITER);
+
+ for (;;) {
+ if (rwsem_try_write_lock(sem, &waiter)) {
+ /* rwsem_try_write_lock() implies ACQUIRE on success */
+ break;
+ }
+
+ raw_spin_unlock_irq(&sem->wait_lock);
+
+ if (signal_pending_state(state, current))
+ goto out_nolock;
+
+ /*
+ * After setting the handoff bit and failing to acquire
+ * the lock, attempt to spin on owner to accelerate lock
+ * transfer. If the previous owner is a on-cpu writer and it
+ * has just released the lock, OWNER_NULL will be returned.
+ * In this case, we attempt to acquire the lock again
+ * without sleeping.
+ */
+ if (waiter.handoff_set) {
+ enum owner_state owner_state;
+
+ owner_state = rwsem_spin_on_owner(sem);
+ if (owner_state == OWNER_NULL)
+ goto trylock_again;
+ }
+
+ schedule_preempt_disabled();
+ lockevent_inc(rwsem_sleep_writer);
+ set_current_state(state);
+trylock_again:
+ raw_spin_lock_irq(&sem->wait_lock);
+ }
+
+ if (state == TASK_UNINTERRUPTIBLE)
+ hung_task_clear_blocker();
+
+ __set_current_state(TASK_RUNNING);
+ raw_spin_unlock_irq(&sem->wait_lock);
+ lockevent_inc(rwsem_wlock);
+ trace_contention_end(sem, 0);
+ return sem;
+
+out_nolock:
+ __set_current_state(TASK_RUNNING);
+ raw_spin_lock_irq(&sem->wait_lock);
+ rwsem_del_wake_waiter(sem, &waiter, &wake_q);
+ lockevent_inc(rwsem_wlock_fail);
+ trace_contention_end(sem, -EINTR);
+ return ERR_PTR(-EINTR);
+}
+
+/*
+ * handle waking up a waiter on the semaphore
+ * - up_read/up_write has decremented the active part of count if we come here
+ */
+static struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
+{
+ unsigned long flags;
+ DEFINE_WAKE_Q(wake_q);
+
+ raw_spin_lock_irqsave(&sem->wait_lock, flags);
+
+ if (!list_empty(&sem->wait_list))
+ rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
+
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+ wake_up_q(&wake_q);
+
+ return sem;
+}
+
+/*
+ * downgrade a write lock into a read lock
+ * - caller incremented waiting part of count and discovered it still negative
+ * - just wake up any readers at the front of the queue
+ */
+static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
+{
+ unsigned long flags;
+ DEFINE_WAKE_Q(wake_q);
+
+ raw_spin_lock_irqsave(&sem->wait_lock, flags);
+
+ if (!list_empty(&sem->wait_list))
+ rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q);
+
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+ wake_up_q(&wake_q);
+
+ return sem;
+}
+
+/*
+ * lock for reading
+ */
+static __always_inline int __down_read_common(struct rw_semaphore *sem, int state)
+{
+ int ret = 0;
+ long count;
+
+ preempt_disable();
+ if (!rwsem_read_trylock(sem, &count)) {
+ if (IS_ERR(rwsem_down_read_slowpath(sem, count, state))) {
+ ret = -EINTR;
+ goto out;
+ }
+ DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
+ }
+out:
+ preempt_enable();
+ return ret;
+}
+
+static __always_inline void __down_read(struct rw_semaphore *sem)
+{
+ __down_read_common(sem, TASK_UNINTERRUPTIBLE);
+}
+
+static __always_inline int __down_read_interruptible(struct rw_semaphore *sem)
+{
+ return __down_read_common(sem, TASK_INTERRUPTIBLE);
+}
+
+static __always_inline int __down_read_killable(struct rw_semaphore *sem)
+{
+ return __down_read_common(sem, TASK_KILLABLE);
+}
+
+static inline int __down_read_trylock(struct rw_semaphore *sem)
+{
+ int ret = 0;
+ long tmp;
+
+ DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
+
+ preempt_disable();
+ tmp = atomic_long_read(&sem->count);
+ while (!(tmp & RWSEM_READ_FAILED_MASK)) {
+ if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
+ tmp + RWSEM_READER_BIAS)) {
+ rwsem_set_reader_owned(sem);
+ ret = 1;
+ break;
+ }
+ }
+ preempt_enable();
+ return ret;
+}
+
+/*
+ * lock for writing
+ */
+static __always_inline int __down_write_common(struct rw_semaphore *sem, int state)
+{
+ int ret = 0;
+
+ preempt_disable();
+ if (unlikely(!rwsem_write_trylock(sem))) {
+ if (IS_ERR(rwsem_down_write_slowpath(sem, state)))
+ ret = -EINTR;
+ }
+ preempt_enable();
+ return ret;
+}
+
+static __always_inline void __down_write(struct rw_semaphore *sem)
+{
+ __down_write_common(sem, TASK_UNINTERRUPTIBLE);
+}
+
+static __always_inline int __down_write_killable(struct rw_semaphore *sem)
+{
+ return __down_write_common(sem, TASK_KILLABLE);
+}
+
+static inline int __down_write_trylock(struct rw_semaphore *sem)
+{
+ int ret;
+
+ preempt_disable();
+ DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
+ ret = rwsem_write_trylock(sem);
+ preempt_enable();
+
+ return ret;
+}
+
+/*
+ * unlock after reading
+ */
+static inline void __up_read(struct rw_semaphore *sem)
+{
+ long tmp;
+
+ DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
+ DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
+
+ preempt_disable();
+ rwsem_clear_reader_owned(sem);
+ tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count);
+ DEBUG_RWSEMS_WARN_ON(tmp < 0, sem);
+ if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) ==
+ RWSEM_FLAG_WAITERS)) {
+ clear_nonspinnable(sem);
+ rwsem_wake(sem);
+ }
+ preempt_enable();
+}
+
+/*
+ * unlock after writing
+ */
+static inline void __up_write(struct rw_semaphore *sem)
+{
+ long tmp;
+
+ DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
+ /*
+ * sem->owner may differ from current if the ownership is transferred
+ * to an anonymous writer by setting the RWSEM_NONSPINNABLE bits.
+ */
+ DEBUG_RWSEMS_WARN_ON((rwsem_owner(sem) != current) &&
+ !rwsem_test_oflags(sem, RWSEM_NONSPINNABLE), sem);
+
+ preempt_disable();
+ rwsem_clear_owner(sem);
+ tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count);
+ if (unlikely(tmp & RWSEM_FLAG_WAITERS))
+ rwsem_wake(sem);
+ preempt_enable();
+}
+
+/*
+ * downgrade write lock to read lock
+ */
+static inline void __downgrade_write(struct rw_semaphore *sem)
+{
+ long tmp;
+
+ /*
+ * When downgrading from exclusive to shared ownership,
+ * anything inside the write-locked region cannot leak
+ * into the read side. In contrast, anything in the
+ * read-locked region is ok to be re-ordered into the
+ * write side. As such, rely on RELEASE semantics.
+ */
+ DEBUG_RWSEMS_WARN_ON(rwsem_owner(sem) != current, sem);
+ preempt_disable();
+ tmp = atomic_long_fetch_add_release(
+ -RWSEM_WRITER_LOCKED+RWSEM_READER_BIAS, &sem->count);
+ rwsem_set_reader_owned(sem);
+ if (tmp & RWSEM_FLAG_WAITERS)
+ rwsem_downgrade_wake(sem);
+ preempt_enable();
+}
+
+#else /* !CONFIG_PREEMPT_RT */
+
+#define RT_MUTEX_BUILD_MUTEX
+#include "rtmutex.c"
+
+#define rwbase_set_and_save_current_state(state) \
+ set_current_state(state)
+
+#define rwbase_restore_current_state() \
+ __set_current_state(TASK_RUNNING)
+
+#define rwbase_rtmutex_lock_state(rtm, state) \
+ __rt_mutex_lock(rtm, state)
+
+#define rwbase_rtmutex_slowlock_locked(rtm, state, wq) \
+ __rt_mutex_slowlock_locked(rtm, NULL, state, wq)
+
+#define rwbase_rtmutex_unlock(rtm) \
+ __rt_mutex_unlock(rtm)
+
+#define rwbase_rtmutex_trylock(rtm) \
+ __rt_mutex_trylock(rtm)
+
+#define rwbase_signal_pending_state(state, current) \
+ signal_pending_state(state, current)
+
+#define rwbase_pre_schedule() \
+ rt_mutex_pre_schedule()
+
+#define rwbase_schedule() \
+ rt_mutex_schedule()
+
+#define rwbase_post_schedule() \
+ rt_mutex_post_schedule()
+
+#include "rwbase_rt.c"
+
+void __init_rwsem(struct rw_semaphore *sem, const char *name,
+ struct lock_class_key *key)
+{
+ init_rwbase_rt(&(sem)->rwbase);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ debug_check_no_locks_freed((void *)sem, sizeof(*sem));
+ lockdep_init_map_wait(&sem->dep_map, name, key, 0, LD_WAIT_SLEEP);
+#endif
+}
+EXPORT_SYMBOL(__init_rwsem);
+
+static inline void __down_read(struct rw_semaphore *sem)
+{
+ rwbase_read_lock(&sem->rwbase, TASK_UNINTERRUPTIBLE);
+}
+
+static inline int __down_read_interruptible(struct rw_semaphore *sem)
+{
+ return rwbase_read_lock(&sem->rwbase, TASK_INTERRUPTIBLE);
+}
+
+static inline int __down_read_killable(struct rw_semaphore *sem)
+{
+ return rwbase_read_lock(&sem->rwbase, TASK_KILLABLE);
+}
+
+static inline int __down_read_trylock(struct rw_semaphore *sem)
+{
+ return rwbase_read_trylock(&sem->rwbase);
+}
+
+static inline void __up_read(struct rw_semaphore *sem)
+{
+ rwbase_read_unlock(&sem->rwbase, TASK_NORMAL);
+}
+
+static inline void __sched __down_write(struct rw_semaphore *sem)
+{
+ rwbase_write_lock(&sem->rwbase, TASK_UNINTERRUPTIBLE);
+}
+
+static inline int __sched __down_write_killable(struct rw_semaphore *sem)
+{
+ return rwbase_write_lock(&sem->rwbase, TASK_KILLABLE);
+}
+
+static inline int __down_write_trylock(struct rw_semaphore *sem)
+{
+ return rwbase_write_trylock(&sem->rwbase);
+}
+
+static inline void __up_write(struct rw_semaphore *sem)
+{
+ rwbase_write_unlock(&sem->rwbase);
+}
+
+static inline void __downgrade_write(struct rw_semaphore *sem)
+{
+ rwbase_write_downgrade(&sem->rwbase);
+}
+
+/* Debug stubs for the common API */
+#define DEBUG_RWSEMS_WARN_ON(c, sem)
+
+static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem,
+ struct task_struct *owner)
+{
+}
+
+static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem)
+{
+ int count = atomic_read(&sem->rwbase.readers);
+
+ return count < 0 && count != READER_BIAS;
+}
+
+#endif /* CONFIG_PREEMPT_RT */
/*
* lock for reading
@@ -23,9 +1538,36 @@ void __sched down_read(struct rw_semaphore *sem)
LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
}
-
EXPORT_SYMBOL(down_read);
+int __sched down_read_interruptible(struct rw_semaphore *sem)
+{
+ might_sleep();
+ rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
+
+ if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_interruptible)) {
+ rwsem_release(&sem->dep_map, _RET_IP_);
+ return -EINTR;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(down_read_interruptible);
+
+int __sched down_read_killable(struct rw_semaphore *sem)
+{
+ might_sleep();
+ rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
+
+ if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_killable)) {
+ rwsem_release(&sem->dep_map, _RET_IP_);
+ return -EINTR;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(down_read_killable);
+
/*
* trylock for reading -- returns 1 if successful, 0 if contention
*/
@@ -37,7 +1579,6 @@ int down_read_trylock(struct rw_semaphore *sem)
rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_);
return ret;
}
-
EXPORT_SYMBOL(down_read_trylock);
/*
@@ -47,28 +1588,40 @@ void __sched down_write(struct rw_semaphore *sem)
{
might_sleep();
rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
-
LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
- rwsem_set_owner(sem);
}
-
EXPORT_SYMBOL(down_write);
/*
+ * lock for writing
+ */
+int __sched down_write_killable(struct rw_semaphore *sem)
+{
+ might_sleep();
+ rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
+
+ if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock,
+ __down_write_killable)) {
+ rwsem_release(&sem->dep_map, _RET_IP_);
+ return -EINTR;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(down_write_killable);
+
+/*
* trylock for writing -- returns 1 if successful, 0 if contention
*/
int down_write_trylock(struct rw_semaphore *sem)
{
int ret = __down_write_trylock(sem);
- if (ret == 1) {
+ if (ret == 1)
rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_);
- rwsem_set_owner(sem);
- }
return ret;
}
-
EXPORT_SYMBOL(down_write_trylock);
/*
@@ -76,11 +1629,9 @@ EXPORT_SYMBOL(down_write_trylock);
*/
void up_read(struct rw_semaphore *sem)
{
- rwsem_release(&sem->dep_map, 1, _RET_IP_);
-
+ rwsem_release(&sem->dep_map, _RET_IP_);
__up_read(sem);
}
-
EXPORT_SYMBOL(up_read);
/*
@@ -88,12 +1639,9 @@ EXPORT_SYMBOL(up_read);
*/
void up_write(struct rw_semaphore *sem)
{
- rwsem_release(&sem->dep_map, 1, _RET_IP_);
-
- rwsem_clear_owner(sem);
+ rwsem_release(&sem->dep_map, _RET_IP_);
__up_write(sem);
}
-
EXPORT_SYMBOL(up_write);
/*
@@ -101,14 +1649,9 @@ EXPORT_SYMBOL(up_write);
*/
void downgrade_write(struct rw_semaphore *sem)
{
- /*
- * lockdep: a downgraded write will live on as a write
- * dependency.
- */
- rwsem_clear_owner(sem);
+ lock_downgrade(&sem->dep_map, _RET_IP_);
__downgrade_write(sem);
}
-
EXPORT_SYMBOL(downgrade_write);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -117,50 +1660,74 @@ void down_read_nested(struct rw_semaphore *sem, int subclass)
{
might_sleep();
rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);
-
LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
}
-
EXPORT_SYMBOL(down_read_nested);
+int down_read_killable_nested(struct rw_semaphore *sem, int subclass)
+{
+ might_sleep();
+ rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);
+
+ if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_killable)) {
+ rwsem_release(&sem->dep_map, _RET_IP_);
+ return -EINTR;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(down_read_killable_nested);
+
void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)
{
might_sleep();
rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_);
-
LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
- rwsem_set_owner(sem);
}
-
EXPORT_SYMBOL(_down_write_nest_lock);
void down_read_non_owner(struct rw_semaphore *sem)
{
might_sleep();
-
__down_read(sem);
+ /*
+ * The owner value for a reader-owned lock is mostly for debugging
+ * purpose only and is not critical to the correct functioning of
+ * rwsem. So it is perfectly fine to set it in a preempt-enabled
+ * context here.
+ */
+ __rwsem_set_reader_owned(sem, NULL);
}
-
EXPORT_SYMBOL(down_read_non_owner);
void down_write_nested(struct rw_semaphore *sem, int subclass)
{
might_sleep();
rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
-
LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
- rwsem_set_owner(sem);
}
-
EXPORT_SYMBOL(down_write_nested);
+int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass)
+{
+ might_sleep();
+ rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
+
+ if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock,
+ __down_write_killable)) {
+ rwsem_release(&sem->dep_map, _RET_IP_);
+ return -EINTR;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(down_write_killable_nested);
+
void up_read_non_owner(struct rw_semaphore *sem)
{
+ DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
__up_read(sem);
}
-
EXPORT_SYMBOL(up_read_non_owner);
#endif
-
-
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
deleted file mode 100644
index 870ed9a5b426..000000000000
--- a/kernel/locking/rwsem.h
+++ /dev/null
@@ -1,20 +0,0 @@
-#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
-static inline void rwsem_set_owner(struct rw_semaphore *sem)
-{
- sem->owner = current;
-}
-
-static inline void rwsem_clear_owner(struct rw_semaphore *sem)
-{
- sem->owner = NULL;
-}
-
-#else
-static inline void rwsem_set_owner(struct rw_semaphore *sem)
-{
-}
-
-static inline void rwsem_clear_owner(struct rw_semaphore *sem)
-{
-}
-#endif
diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c
index b8120abe594b..3ef032e22f7e 100644
--- a/kernel/locking/semaphore.c
+++ b/kernel/locking/semaphore.c
@@ -1,9 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2008 Intel Corporation
* Author: Matthew Wilcox <willy@linux.intel.com>
*
- * Distributed under the terms of the GNU GPL, version 2
- *
* This file implements counting semaphores.
* A counting semaphore may be acquired 'n' times before sleeping.
* See mutex.c for single-acquisition sleeping locks which enforce
@@ -29,15 +28,54 @@
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/sched.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/wake_q.h>
#include <linux/semaphore.h>
#include <linux/spinlock.h>
#include <linux/ftrace.h>
+#include <trace/events/lock.h>
+#include <linux/hung_task.h>
static noinline void __down(struct semaphore *sem);
static noinline int __down_interruptible(struct semaphore *sem);
static noinline int __down_killable(struct semaphore *sem);
static noinline int __down_timeout(struct semaphore *sem, long timeout);
-static noinline void __up(struct semaphore *sem);
+static noinline void __up(struct semaphore *sem, struct wake_q_head *wake_q);
+
+#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
+static inline void hung_task_sem_set_holder(struct semaphore *sem)
+{
+ WRITE_ONCE((sem)->last_holder, (unsigned long)current);
+}
+
+static inline void hung_task_sem_clear_if_holder(struct semaphore *sem)
+{
+ if (READ_ONCE((sem)->last_holder) == (unsigned long)current)
+ WRITE_ONCE((sem)->last_holder, 0UL);
+}
+
+unsigned long sem_last_holder(struct semaphore *sem)
+{
+ return READ_ONCE(sem->last_holder);
+}
+#else
+static inline void hung_task_sem_set_holder(struct semaphore *sem)
+{
+}
+static inline void hung_task_sem_clear_if_holder(struct semaphore *sem)
+{
+}
+unsigned long sem_last_holder(struct semaphore *sem)
+{
+ return 0UL;
+}
+#endif
+
+static inline void __sem_acquire(struct semaphore *sem)
+{
+ sem->count--;
+ hung_task_sem_set_holder(sem);
+}
/**
* down - acquire the semaphore
@@ -50,13 +88,14 @@ static noinline void __up(struct semaphore *sem);
* Use of this function is deprecated, please use down_interruptible() or
* down_killable() instead.
*/
-void down(struct semaphore *sem)
+void __sched down(struct semaphore *sem)
{
unsigned long flags;
+ might_sleep();
raw_spin_lock_irqsave(&sem->lock, flags);
if (likely(sem->count > 0))
- sem->count--;
+ __sem_acquire(sem);
else
__down(sem);
raw_spin_unlock_irqrestore(&sem->lock, flags);
@@ -72,14 +111,15 @@ EXPORT_SYMBOL(down);
* If the sleep is interrupted by a signal, this function will return -EINTR.
* If the semaphore is successfully acquired, this function returns 0.
*/
-int down_interruptible(struct semaphore *sem)
+int __sched down_interruptible(struct semaphore *sem)
{
unsigned long flags;
int result = 0;
+ might_sleep();
raw_spin_lock_irqsave(&sem->lock, flags);
if (likely(sem->count > 0))
- sem->count--;
+ __sem_acquire(sem);
else
result = __down_interruptible(sem);
raw_spin_unlock_irqrestore(&sem->lock, flags);
@@ -98,14 +138,15 @@ EXPORT_SYMBOL(down_interruptible);
* -EINTR. If the semaphore is successfully acquired, this function returns
* 0.
*/
-int down_killable(struct semaphore *sem)
+int __sched down_killable(struct semaphore *sem)
{
unsigned long flags;
int result = 0;
+ might_sleep();
raw_spin_lock_irqsave(&sem->lock, flags);
if (likely(sem->count > 0))
- sem->count--;
+ __sem_acquire(sem);
else
result = __down_killable(sem);
raw_spin_unlock_irqrestore(&sem->lock, flags);
@@ -119,7 +160,7 @@ EXPORT_SYMBOL(down_killable);
* @sem: the semaphore to be acquired
*
* Try to acquire the semaphore atomically. Returns 0 if the semaphore has
- * been acquired successfully or 1 if it it cannot be acquired.
+ * been acquired successfully or 1 if it cannot be acquired.
*
* NOTE: This return value is inverted from both spin_trylock and
* mutex_trylock! Be careful about this when converting code.
@@ -127,7 +168,7 @@ EXPORT_SYMBOL(down_killable);
* Unlike mutex_trylock, this function can be used from interrupt context,
* and the semaphore can be released by any task or interrupt.
*/
-int down_trylock(struct semaphore *sem)
+int __sched down_trylock(struct semaphore *sem)
{
unsigned long flags;
int count;
@@ -135,7 +176,7 @@ int down_trylock(struct semaphore *sem)
raw_spin_lock_irqsave(&sem->lock, flags);
count = sem->count - 1;
if (likely(count >= 0))
- sem->count = count;
+ __sem_acquire(sem);
raw_spin_unlock_irqrestore(&sem->lock, flags);
return (count < 0);
@@ -152,14 +193,15 @@ EXPORT_SYMBOL(down_trylock);
* If the semaphore is not released within the specified number of jiffies,
* this function returns -ETIME. It returns 0 if the semaphore was acquired.
*/
-int down_timeout(struct semaphore *sem, long timeout)
+int __sched down_timeout(struct semaphore *sem, long timeout)
{
unsigned long flags;
int result = 0;
+ might_sleep();
raw_spin_lock_irqsave(&sem->lock, flags);
if (likely(sem->count > 0))
- sem->count--;
+ __sem_acquire(sem);
else
result = __down_timeout(sem, timeout);
raw_spin_unlock_irqrestore(&sem->lock, flags);
@@ -175,16 +217,22 @@ EXPORT_SYMBOL(down_timeout);
* Release the semaphore. Unlike mutexes, up() may be called from any
* context and even by tasks which have never called down().
*/
-void up(struct semaphore *sem)
+void __sched up(struct semaphore *sem)
{
unsigned long flags;
+ DEFINE_WAKE_Q(wake_q);
raw_spin_lock_irqsave(&sem->lock, flags);
+
+ hung_task_sem_clear_if_holder(sem);
+
if (likely(list_empty(&sem->wait_list)))
sem->count++;
else
- __up(sem);
+ __up(sem, &wake_q);
raw_spin_unlock_irqrestore(&sem->lock, flags);
+ if (!wake_q_empty(&wake_q))
+ wake_up_q(&wake_q);
}
EXPORT_SYMBOL(up);
@@ -201,27 +249,28 @@ struct semaphore_waiter {
* constant, and thus optimised away by the compiler. Likewise the
* 'timeout' parameter for the cases without timeouts.
*/
-static inline int __sched __down_common(struct semaphore *sem, long state,
+static inline int __sched ___down_common(struct semaphore *sem, long state,
long timeout)
{
- struct task_struct *task = current;
struct semaphore_waiter waiter;
list_add_tail(&waiter.list, &sem->wait_list);
- waiter.task = task;
+ waiter.task = current;
waiter.up = false;
for (;;) {
- if (signal_pending_state(state, task))
+ if (signal_pending_state(state, current))
goto interrupted;
if (unlikely(timeout <= 0))
goto timed_out;
- __set_task_state(task, state);
+ __set_current_state(state);
raw_spin_unlock_irq(&sem->lock);
timeout = schedule_timeout(timeout);
raw_spin_lock_irq(&sem->lock);
- if (waiter.up)
+ if (waiter.up) {
+ hung_task_sem_set_holder(sem);
return 0;
+ }
}
timed_out:
@@ -233,6 +282,22 @@ static inline int __sched __down_common(struct semaphore *sem, long state,
return -EINTR;
}
+static inline int __sched __down_common(struct semaphore *sem, long state,
+ long timeout)
+{
+ int ret;
+
+ hung_task_set_blocker(sem, BLOCKER_TYPE_SEM);
+
+ trace_contention_begin(sem, 0);
+ ret = ___down_common(sem, state, timeout);
+ trace_contention_end(sem, ret);
+
+ hung_task_clear_blocker();
+
+ return ret;
+}
+
static noinline void __sched __down(struct semaphore *sem)
{
__down_common(sem, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
@@ -253,11 +318,12 @@ static noinline int __sched __down_timeout(struct semaphore *sem, long timeout)
return __down_common(sem, TASK_UNINTERRUPTIBLE, timeout);
}
-static noinline void __sched __up(struct semaphore *sem)
+static noinline void __sched __up(struct semaphore *sem,
+ struct wake_q_head *wake_q)
{
struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list,
struct semaphore_waiter, list);
list_del(&waiter->list);
waiter->up = true;
- wake_up_process(waiter->task);
+ wake_q_add(wake_q, waiter->task);
}
diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
index db3ccb1dd614..7685defd7c52 100644
--- a/kernel/locking/spinlock.c
+++ b/kernel/locking/spinlock.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (2004) Linus Torvalds
*
@@ -21,6 +22,13 @@
#include <linux/debug_locks.h>
#include <linux/export.h>
+#ifdef CONFIG_MMIOWB
+#ifndef arch_mmiowb_state
+DEFINE_PER_CPU(struct mmiowb_state, __mmiowb_state);
+EXPORT_PER_CPU_SYMBOL(__mmiowb_state);
+#endif
+#endif
+
/*
* If lockdep is enabled then we use the non-preemption spin-ops
* even on CONFIG_PREEMPT, because lockdep assumes that interrupts are
@@ -29,11 +37,10 @@
#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC)
/*
* The __lock_function inlines are taken from
- * include/linux/spinlock_api_smp.h
+ * spinlock : include/linux/spinlock_api_smp.h
+ * rwlock : include/linux/rwlock_api_smp.h
*/
#else
-#define raw_read_can_lock(l) read_can_lock(l)
-#define raw_write_can_lock(l) write_can_lock(l)
/*
* Some architectures can relax in favour of the CPU owning the lock.
@@ -51,14 +58,14 @@
/*
* We build the __lock_function inlines here. They are too large for
* inlining all over the place, but here is only one user per function
- * which embedds them into the calling _lock_function below.
+ * which embeds them into the calling _lock_function below.
*
* This could be a long-held lock. We both prepare to spin for a long
- * time (making _this_ CPU preemptable if possible), and we also signal
+ * time (making _this_ CPU preemptible if possible), and we also signal
* towards that other CPU that it should break the lock ASAP.
*/
#define BUILD_LOCK_OPS(op, locktype) \
-void __lockfunc __raw_##op##_lock(locktype##_t *lock) \
+static void __lockfunc __raw_##op##_lock(locktype##_t *lock) \
{ \
for (;;) { \
preempt_disable(); \
@@ -66,15 +73,11 @@ void __lockfunc __raw_##op##_lock(locktype##_t *lock) \
break; \
preempt_enable(); \
\
- if (!(lock)->break_lock) \
- (lock)->break_lock = 1; \
- while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\
- arch_##op##_relax(&lock->raw_lock); \
+ arch_##op##_relax(&lock->raw_lock); \
} \
- (lock)->break_lock = 0; \
} \
\
-unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \
+static unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \
{ \
unsigned long flags; \
\
@@ -86,21 +89,18 @@ unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \
local_irq_restore(flags); \
preempt_enable(); \
\
- if (!(lock)->break_lock) \
- (lock)->break_lock = 1; \
- while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\
- arch_##op##_relax(&lock->raw_lock); \
+ arch_##op##_relax(&lock->raw_lock); \
} \
- (lock)->break_lock = 0; \
+ \
return flags; \
} \
\
-void __lockfunc __raw_##op##_lock_irq(locktype##_t *lock) \
+static void __lockfunc __raw_##op##_lock_irq(locktype##_t *lock) \
{ \
_raw_##op##_lock_irqsave(lock); \
} \
\
-void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \
+static void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \
{ \
unsigned long flags; \
\
@@ -124,13 +124,16 @@ void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \
* __[spin|read|write]_lock_bh()
*/
BUILD_LOCK_OPS(spin, raw_spinlock);
+
+#ifndef CONFIG_PREEMPT_RT
BUILD_LOCK_OPS(read, rwlock);
BUILD_LOCK_OPS(write, rwlock);
+#endif
#endif
#ifndef CONFIG_INLINE_SPIN_TRYLOCK
-int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock)
+noinline int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock)
{
return __raw_spin_trylock(lock);
}
@@ -138,7 +141,7 @@ EXPORT_SYMBOL(_raw_spin_trylock);
#endif
#ifndef CONFIG_INLINE_SPIN_TRYLOCK_BH
-int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock)
+noinline int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock)
{
return __raw_spin_trylock_bh(lock);
}
@@ -146,7 +149,7 @@ EXPORT_SYMBOL(_raw_spin_trylock_bh);
#endif
#ifndef CONFIG_INLINE_SPIN_LOCK
-void __lockfunc _raw_spin_lock(raw_spinlock_t *lock)
+noinline void __lockfunc _raw_spin_lock(raw_spinlock_t *lock)
{
__raw_spin_lock(lock);
}
@@ -154,7 +157,7 @@ EXPORT_SYMBOL(_raw_spin_lock);
#endif
#ifndef CONFIG_INLINE_SPIN_LOCK_IRQSAVE
-unsigned long __lockfunc _raw_spin_lock_irqsave(raw_spinlock_t *lock)
+noinline unsigned long __lockfunc _raw_spin_lock_irqsave(raw_spinlock_t *lock)
{
return __raw_spin_lock_irqsave(lock);
}
@@ -162,7 +165,7 @@ EXPORT_SYMBOL(_raw_spin_lock_irqsave);
#endif
#ifndef CONFIG_INLINE_SPIN_LOCK_IRQ
-void __lockfunc _raw_spin_lock_irq(raw_spinlock_t *lock)
+noinline void __lockfunc _raw_spin_lock_irq(raw_spinlock_t *lock)
{
__raw_spin_lock_irq(lock);
}
@@ -170,7 +173,7 @@ EXPORT_SYMBOL(_raw_spin_lock_irq);
#endif
#ifndef CONFIG_INLINE_SPIN_LOCK_BH
-void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock)
+noinline void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock)
{
__raw_spin_lock_bh(lock);
}
@@ -178,7 +181,7 @@ EXPORT_SYMBOL(_raw_spin_lock_bh);
#endif
#ifdef CONFIG_UNINLINE_SPIN_UNLOCK
-void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock)
+noinline void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock)
{
__raw_spin_unlock(lock);
}
@@ -186,7 +189,7 @@ EXPORT_SYMBOL(_raw_spin_unlock);
#endif
#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQRESTORE
-void __lockfunc _raw_spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags)
+noinline void __lockfunc _raw_spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags)
{
__raw_spin_unlock_irqrestore(lock, flags);
}
@@ -194,7 +197,7 @@ EXPORT_SYMBOL(_raw_spin_unlock_irqrestore);
#endif
#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQ
-void __lockfunc _raw_spin_unlock_irq(raw_spinlock_t *lock)
+noinline void __lockfunc _raw_spin_unlock_irq(raw_spinlock_t *lock)
{
__raw_spin_unlock_irq(lock);
}
@@ -202,15 +205,17 @@ EXPORT_SYMBOL(_raw_spin_unlock_irq);
#endif
#ifndef CONFIG_INLINE_SPIN_UNLOCK_BH
-void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock)
+noinline void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock)
{
__raw_spin_unlock_bh(lock);
}
EXPORT_SYMBOL(_raw_spin_unlock_bh);
#endif
+#ifndef CONFIG_PREEMPT_RT
+
#ifndef CONFIG_INLINE_READ_TRYLOCK
-int __lockfunc _raw_read_trylock(rwlock_t *lock)
+noinline int __lockfunc _raw_read_trylock(rwlock_t *lock)
{
return __raw_read_trylock(lock);
}
@@ -218,7 +223,7 @@ EXPORT_SYMBOL(_raw_read_trylock);
#endif
#ifndef CONFIG_INLINE_READ_LOCK
-void __lockfunc _raw_read_lock(rwlock_t *lock)
+noinline void __lockfunc _raw_read_lock(rwlock_t *lock)
{
__raw_read_lock(lock);
}
@@ -226,7 +231,7 @@ EXPORT_SYMBOL(_raw_read_lock);
#endif
#ifndef CONFIG_INLINE_READ_LOCK_IRQSAVE
-unsigned long __lockfunc _raw_read_lock_irqsave(rwlock_t *lock)
+noinline unsigned long __lockfunc _raw_read_lock_irqsave(rwlock_t *lock)
{
return __raw_read_lock_irqsave(lock);
}
@@ -234,7 +239,7 @@ EXPORT_SYMBOL(_raw_read_lock_irqsave);
#endif
#ifndef CONFIG_INLINE_READ_LOCK_IRQ
-void __lockfunc _raw_read_lock_irq(rwlock_t *lock)
+noinline void __lockfunc _raw_read_lock_irq(rwlock_t *lock)
{
__raw_read_lock_irq(lock);
}
@@ -242,7 +247,7 @@ EXPORT_SYMBOL(_raw_read_lock_irq);
#endif
#ifndef CONFIG_INLINE_READ_LOCK_BH
-void __lockfunc _raw_read_lock_bh(rwlock_t *lock)
+noinline void __lockfunc _raw_read_lock_bh(rwlock_t *lock)
{
__raw_read_lock_bh(lock);
}
@@ -250,7 +255,7 @@ EXPORT_SYMBOL(_raw_read_lock_bh);
#endif
#ifndef CONFIG_INLINE_READ_UNLOCK
-void __lockfunc _raw_read_unlock(rwlock_t *lock)
+noinline void __lockfunc _raw_read_unlock(rwlock_t *lock)
{
__raw_read_unlock(lock);
}
@@ -258,7 +263,7 @@ EXPORT_SYMBOL(_raw_read_unlock);
#endif
#ifndef CONFIG_INLINE_READ_UNLOCK_IRQRESTORE
-void __lockfunc _raw_read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
+noinline void __lockfunc _raw_read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
{
__raw_read_unlock_irqrestore(lock, flags);
}
@@ -266,7 +271,7 @@ EXPORT_SYMBOL(_raw_read_unlock_irqrestore);
#endif
#ifndef CONFIG_INLINE_READ_UNLOCK_IRQ
-void __lockfunc _raw_read_unlock_irq(rwlock_t *lock)
+noinline void __lockfunc _raw_read_unlock_irq(rwlock_t *lock)
{
__raw_read_unlock_irq(lock);
}
@@ -274,7 +279,7 @@ EXPORT_SYMBOL(_raw_read_unlock_irq);
#endif
#ifndef CONFIG_INLINE_READ_UNLOCK_BH
-void __lockfunc _raw_read_unlock_bh(rwlock_t *lock)
+noinline void __lockfunc _raw_read_unlock_bh(rwlock_t *lock)
{
__raw_read_unlock_bh(lock);
}
@@ -282,7 +287,7 @@ EXPORT_SYMBOL(_raw_read_unlock_bh);
#endif
#ifndef CONFIG_INLINE_WRITE_TRYLOCK
-int __lockfunc _raw_write_trylock(rwlock_t *lock)
+noinline int __lockfunc _raw_write_trylock(rwlock_t *lock)
{
return __raw_write_trylock(lock);
}
@@ -290,15 +295,25 @@ EXPORT_SYMBOL(_raw_write_trylock);
#endif
#ifndef CONFIG_INLINE_WRITE_LOCK
-void __lockfunc _raw_write_lock(rwlock_t *lock)
+noinline void __lockfunc _raw_write_lock(rwlock_t *lock)
{
__raw_write_lock(lock);
}
EXPORT_SYMBOL(_raw_write_lock);
+
+#ifndef CONFIG_DEBUG_LOCK_ALLOC
+#define __raw_write_lock_nested(lock, subclass) __raw_write_lock(((void)(subclass), (lock)))
+#endif
+
+void __lockfunc _raw_write_lock_nested(rwlock_t *lock, int subclass)
+{
+ __raw_write_lock_nested(lock, subclass);
+}
+EXPORT_SYMBOL(_raw_write_lock_nested);
#endif
#ifndef CONFIG_INLINE_WRITE_LOCK_IRQSAVE
-unsigned long __lockfunc _raw_write_lock_irqsave(rwlock_t *lock)
+noinline unsigned long __lockfunc _raw_write_lock_irqsave(rwlock_t *lock)
{
return __raw_write_lock_irqsave(lock);
}
@@ -306,7 +321,7 @@ EXPORT_SYMBOL(_raw_write_lock_irqsave);
#endif
#ifndef CONFIG_INLINE_WRITE_LOCK_IRQ
-void __lockfunc _raw_write_lock_irq(rwlock_t *lock)
+noinline void __lockfunc _raw_write_lock_irq(rwlock_t *lock)
{
__raw_write_lock_irq(lock);
}
@@ -314,7 +329,7 @@ EXPORT_SYMBOL(_raw_write_lock_irq);
#endif
#ifndef CONFIG_INLINE_WRITE_LOCK_BH
-void __lockfunc _raw_write_lock_bh(rwlock_t *lock)
+noinline void __lockfunc _raw_write_lock_bh(rwlock_t *lock)
{
__raw_write_lock_bh(lock);
}
@@ -322,7 +337,7 @@ EXPORT_SYMBOL(_raw_write_lock_bh);
#endif
#ifndef CONFIG_INLINE_WRITE_UNLOCK
-void __lockfunc _raw_write_unlock(rwlock_t *lock)
+noinline void __lockfunc _raw_write_unlock(rwlock_t *lock)
{
__raw_write_unlock(lock);
}
@@ -330,7 +345,7 @@ EXPORT_SYMBOL(_raw_write_unlock);
#endif
#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQRESTORE
-void __lockfunc _raw_write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
+noinline void __lockfunc _raw_write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
{
__raw_write_unlock_irqrestore(lock, flags);
}
@@ -338,7 +353,7 @@ EXPORT_SYMBOL(_raw_write_unlock_irqrestore);
#endif
#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQ
-void __lockfunc _raw_write_unlock_irq(rwlock_t *lock)
+noinline void __lockfunc _raw_write_unlock_irq(rwlock_t *lock)
{
__raw_write_unlock_irq(lock);
}
@@ -346,13 +361,15 @@ EXPORT_SYMBOL(_raw_write_unlock_irq);
#endif
#ifndef CONFIG_INLINE_WRITE_UNLOCK_BH
-void __lockfunc _raw_write_unlock_bh(rwlock_t *lock)
+noinline void __lockfunc _raw_write_unlock_bh(rwlock_t *lock)
{
__raw_write_unlock_bh(lock);
}
EXPORT_SYMBOL(_raw_write_unlock_bh);
#endif
+#endif /* !CONFIG_PREEMPT_RT */
+
#ifdef CONFIG_DEBUG_LOCK_ALLOC
void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
@@ -363,14 +380,6 @@ void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
}
EXPORT_SYMBOL(_raw_spin_lock_nested);
-void __lockfunc _raw_spin_lock_bh_nested(raw_spinlock_t *lock, int subclass)
-{
- __local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET);
- spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
- LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
-}
-EXPORT_SYMBOL(_raw_spin_lock_bh_nested);
-
unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock,
int subclass)
{
@@ -379,8 +388,7 @@ unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock,
local_irq_save(flags);
preempt_disable();
spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
- LOCK_CONTENDED_FLAGS(lock, do_raw_spin_trylock, do_raw_spin_lock,
- do_raw_spin_lock_flags, &flags);
+ LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
return flags;
}
EXPORT_SYMBOL(_raw_spin_lock_irqsave_nested);
@@ -405,3 +413,11 @@ notrace int in_lock_functions(unsigned long addr)
&& addr < (unsigned long)__lock_text_end;
}
EXPORT_SYMBOL(in_lock_functions);
+
+#if defined(CONFIG_PROVE_LOCKING) && defined(CONFIG_PREEMPT_RT)
+void notrace lockdep_assert_in_softirq_func(void)
+{
+ lockdep_assert_in_softirq();
+}
+EXPORT_SYMBOL(lockdep_assert_in_softirq_func);
+#endif
diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c
index 0374a596cffa..2338b3adfb55 100644
--- a/kernel/locking/spinlock_debug.c
+++ b/kernel/locking/spinlock_debug.c
@@ -12,16 +12,17 @@
#include <linux/debug_locks.h>
#include <linux/delay.h>
#include <linux/export.h>
+#include <linux/pid.h>
void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name,
- struct lock_class_key *key)
+ struct lock_class_key *key, short inner)
{
#ifdef CONFIG_DEBUG_LOCK_ALLOC
/*
* Make sure we are not reinitializing a held lock:
*/
debug_check_no_locks_freed((void *)lock, sizeof(*lock));
- lockdep_init_map(&lock->dep_map, name, key, 0);
+ lockdep_init_map_wait(&lock->dep_map, name, key, 0, inner);
#endif
lock->raw_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
lock->magic = SPINLOCK_MAGIC;
@@ -31,6 +32,7 @@ void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name,
EXPORT_SYMBOL(__raw_spin_lock_init);
+#ifndef CONFIG_PREEMPT_RT
void __rwlock_init(rwlock_t *lock, const char *name,
struct lock_class_key *key)
{
@@ -39,7 +41,7 @@ void __rwlock_init(rwlock_t *lock, const char *name,
* Make sure we are not reinitializing a held lock:
*/
debug_check_no_locks_freed((void *)lock, sizeof(*lock));
- lockdep_init_map(&lock->dep_map, name, key, 0);
+ lockdep_init_map_wait(&lock->dep_map, name, key, 0, LD_WAIT_CONFIG);
#endif
lock->raw_lock = (arch_rwlock_t) __ARCH_RW_LOCK_UNLOCKED;
lock->magic = RWLOCK_MAGIC;
@@ -48,22 +50,23 @@ void __rwlock_init(rwlock_t *lock, const char *name,
}
EXPORT_SYMBOL(__rwlock_init);
+#endif
static void spin_dump(raw_spinlock_t *lock, const char *msg)
{
- struct task_struct *owner = NULL;
+ struct task_struct *owner = READ_ONCE(lock->owner);
- if (lock->owner && lock->owner != SPINLOCK_OWNER_INIT)
- owner = lock->owner;
+ if (owner == SPINLOCK_OWNER_INIT)
+ owner = NULL;
printk(KERN_EMERG "BUG: spinlock %s on CPU#%d, %s/%d\n",
msg, raw_smp_processor_id(),
current->comm, task_pid_nr(current));
printk(KERN_EMERG " lock: %pS, .magic: %08x, .owner: %s/%d, "
".owner_cpu: %d\n",
- lock, lock->magic,
+ lock, READ_ONCE(lock->magic),
owner ? owner->comm : "<none>",
owner ? task_pid_nr(owner) : -1,
- lock->owner_cpu);
+ READ_ONCE(lock->owner_cpu));
dump_stack();
}
@@ -80,16 +83,16 @@ static void spin_bug(raw_spinlock_t *lock, const char *msg)
static inline void
debug_spin_lock_before(raw_spinlock_t *lock)
{
- SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic");
- SPIN_BUG_ON(lock->owner == current, lock, "recursion");
- SPIN_BUG_ON(lock->owner_cpu == raw_smp_processor_id(),
+ SPIN_BUG_ON(READ_ONCE(lock->magic) != SPINLOCK_MAGIC, lock, "bad magic");
+ SPIN_BUG_ON(READ_ONCE(lock->owner) == current, lock, "recursion");
+ SPIN_BUG_ON(READ_ONCE(lock->owner_cpu) == raw_smp_processor_id(),
lock, "cpu recursion");
}
static inline void debug_spin_lock_after(raw_spinlock_t *lock)
{
- lock->owner_cpu = raw_smp_processor_id();
- lock->owner = current;
+ WRITE_ONCE(lock->owner_cpu, raw_smp_processor_id());
+ WRITE_ONCE(lock->owner, current);
}
static inline void debug_spin_unlock(raw_spinlock_t *lock)
@@ -99,42 +102,19 @@ static inline void debug_spin_unlock(raw_spinlock_t *lock)
SPIN_BUG_ON(lock->owner != current, lock, "wrong owner");
SPIN_BUG_ON(lock->owner_cpu != raw_smp_processor_id(),
lock, "wrong CPU");
- lock->owner = SPINLOCK_OWNER_INIT;
- lock->owner_cpu = -1;
-}
-
-static void __spin_lock_debug(raw_spinlock_t *lock)
-{
- u64 i;
- u64 loops = loops_per_jiffy * HZ;
-
- for (i = 0; i < loops; i++) {
- if (arch_spin_trylock(&lock->raw_lock))
- return;
- __delay(1);
- }
- /* lockup suspected: */
- spin_dump(lock, "lockup suspected");
-#ifdef CONFIG_SMP
- trigger_all_cpu_backtrace();
-#endif
-
- /*
- * The trylock above was causing a livelock. Give the lower level arch
- * specific lock code a chance to acquire the lock. We have already
- * printed a warning/backtrace at this point. The non-debug arch
- * specific code might actually succeed in acquiring the lock. If it is
- * not successful, the end-result is the same - there is no forward
- * progress.
- */
- arch_spin_lock(&lock->raw_lock);
+ WRITE_ONCE(lock->owner, SPINLOCK_OWNER_INIT);
+ WRITE_ONCE(lock->owner_cpu, -1);
}
+/*
+ * We are now relying on the NMI watchdog to detect lockup instead of doing
+ * the detection here with an unfair lock which can cause problem of its own.
+ */
void do_raw_spin_lock(raw_spinlock_t *lock)
{
debug_spin_lock_before(lock);
- if (unlikely(!arch_spin_trylock(&lock->raw_lock)))
- __spin_lock_debug(lock);
+ arch_spin_lock(&lock->raw_lock);
+ mmiowb_spin_lock();
debug_spin_lock_after(lock);
}
@@ -142,8 +122,10 @@ int do_raw_spin_trylock(raw_spinlock_t *lock)
{
int ret = arch_spin_trylock(&lock->raw_lock);
- if (ret)
+ if (ret) {
+ mmiowb_spin_lock();
debug_spin_lock_after(lock);
+ }
#ifndef CONFIG_SMP
/*
* Must not happen on UP:
@@ -155,10 +137,12 @@ int do_raw_spin_trylock(raw_spinlock_t *lock)
void do_raw_spin_unlock(raw_spinlock_t *lock)
{
+ mmiowb_spin_unlock();
debug_spin_unlock(lock);
arch_spin_unlock(&lock->raw_lock);
}
+#ifndef CONFIG_PREEMPT_RT
static void rwlock_bug(rwlock_t *lock, const char *msg)
{
if (!debug_locks_off())
@@ -172,32 +156,6 @@ static void rwlock_bug(rwlock_t *lock, const char *msg)
#define RWLOCK_BUG_ON(cond, lock, msg) if (unlikely(cond)) rwlock_bug(lock, msg)
-#if 0 /* __write_lock_debug() can lock up - maybe this can too? */
-static void __read_lock_debug(rwlock_t *lock)
-{
- u64 i;
- u64 loops = loops_per_jiffy * HZ;
- int print_once = 1;
-
- for (;;) {
- for (i = 0; i < loops; i++) {
- if (arch_read_trylock(&lock->raw_lock))
- return;
- __delay(1);
- }
- /* lockup suspected: */
- if (print_once) {
- print_once = 0;
- printk(KERN_EMERG "BUG: read-lock lockup on CPU#%d, "
- "%s/%d, %p\n",
- raw_smp_processor_id(), current->comm,
- current->pid, lock);
- dump_stack();
- }
- }
-}
-#endif
-
void do_raw_read_lock(rwlock_t *lock)
{
RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic");
@@ -226,15 +184,15 @@ void do_raw_read_unlock(rwlock_t *lock)
static inline void debug_write_lock_before(rwlock_t *lock)
{
RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic");
- RWLOCK_BUG_ON(lock->owner == current, lock, "recursion");
- RWLOCK_BUG_ON(lock->owner_cpu == raw_smp_processor_id(),
+ RWLOCK_BUG_ON(READ_ONCE(lock->owner) == current, lock, "recursion");
+ RWLOCK_BUG_ON(READ_ONCE(lock->owner_cpu) == raw_smp_processor_id(),
lock, "cpu recursion");
}
static inline void debug_write_lock_after(rwlock_t *lock)
{
- lock->owner_cpu = raw_smp_processor_id();
- lock->owner = current;
+ WRITE_ONCE(lock->owner_cpu, raw_smp_processor_id());
+ WRITE_ONCE(lock->owner, current);
}
static inline void debug_write_unlock(rwlock_t *lock)
@@ -243,35 +201,9 @@ static inline void debug_write_unlock(rwlock_t *lock)
RWLOCK_BUG_ON(lock->owner != current, lock, "wrong owner");
RWLOCK_BUG_ON(lock->owner_cpu != raw_smp_processor_id(),
lock, "wrong CPU");
- lock->owner = SPINLOCK_OWNER_INIT;
- lock->owner_cpu = -1;
-}
-
-#if 0 /* This can cause lockups */
-static void __write_lock_debug(rwlock_t *lock)
-{
- u64 i;
- u64 loops = loops_per_jiffy * HZ;
- int print_once = 1;
-
- for (;;) {
- for (i = 0; i < loops; i++) {
- if (arch_write_trylock(&lock->raw_lock))
- return;
- __delay(1);
- }
- /* lockup suspected: */
- if (print_once) {
- print_once = 0;
- printk(KERN_EMERG "BUG: write-lock lockup on CPU#%d, "
- "%s/%d, %p\n",
- raw_smp_processor_id(), current->comm,
- current->pid, lock);
- dump_stack();
- }
- }
+ WRITE_ONCE(lock->owner, SPINLOCK_OWNER_INIT);
+ WRITE_ONCE(lock->owner_cpu, -1);
}
-#endif
void do_raw_write_lock(rwlock_t *lock)
{
@@ -300,3 +232,5 @@ void do_raw_write_unlock(rwlock_t *lock)
debug_write_unlock(lock);
arch_write_unlock(&lock->raw_lock);
}
+
+#endif /* !CONFIG_PREEMPT_RT */
diff --git a/kernel/locking/spinlock_rt.c b/kernel/locking/spinlock_rt.c
new file mode 100644
index 000000000000..db1e11b45de6
--- /dev/null
+++ b/kernel/locking/spinlock_rt.c
@@ -0,0 +1,287 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * PREEMPT_RT substitution for spin/rw_locks
+ *
+ * spinlocks and rwlocks on RT are based on rtmutexes, with a few twists to
+ * resemble the non RT semantics:
+ *
+ * - Contrary to plain rtmutexes, spinlocks and rwlocks are state
+ * preserving. The task state is saved before blocking on the underlying
+ * rtmutex, and restored when the lock has been acquired. Regular wakeups
+ * during that time are redirected to the saved state so no wake up is
+ * missed.
+ *
+ * - Non RT spin/rwlocks disable preemption and eventually interrupts.
+ * Disabling preemption has the side effect of disabling migration and
+ * preventing RCU grace periods.
+ *
+ * The RT substitutions explicitly disable migration and take
+ * rcu_read_lock() across the lock held section.
+ */
+#include <linux/spinlock.h>
+#include <linux/export.h>
+
+#define RT_MUTEX_BUILD_SPINLOCKS
+#include "rtmutex.c"
+
+/*
+ * __might_resched() skips the state check as rtlocks are state
+ * preserving. Take RCU nesting into account as spin/read/write_lock() can
+ * legitimately nest into an RCU read side critical section.
+ */
+#define RTLOCK_RESCHED_OFFSETS \
+ (rcu_preempt_depth() << MIGHT_RESCHED_RCU_SHIFT)
+
+#define rtlock_might_resched() \
+ __might_resched(__FILE__, __LINE__, RTLOCK_RESCHED_OFFSETS)
+
+static __always_inline void rtlock_lock(struct rt_mutex_base *rtm)
+{
+ lockdep_assert(!current->pi_blocked_on);
+
+ if (unlikely(!rt_mutex_cmpxchg_acquire(rtm, NULL, current)))
+ rtlock_slowlock(rtm);
+}
+
+static __always_inline void __rt_spin_lock(spinlock_t *lock)
+{
+ rtlock_might_resched();
+ rtlock_lock(&lock->lock);
+ rcu_read_lock();
+ migrate_disable();
+}
+
+void __sched rt_spin_lock(spinlock_t *lock) __acquires(RCU)
+{
+ spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
+ __rt_spin_lock(lock);
+}
+EXPORT_SYMBOL(rt_spin_lock);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void __sched rt_spin_lock_nested(spinlock_t *lock, int subclass)
+{
+ spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
+ __rt_spin_lock(lock);
+}
+EXPORT_SYMBOL(rt_spin_lock_nested);
+
+void __sched rt_spin_lock_nest_lock(spinlock_t *lock,
+ struct lockdep_map *nest_lock)
+{
+ spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_);
+ __rt_spin_lock(lock);
+}
+EXPORT_SYMBOL(rt_spin_lock_nest_lock);
+#endif
+
+void __sched rt_spin_unlock(spinlock_t *lock) __releases(RCU)
+{
+ spin_release(&lock->dep_map, _RET_IP_);
+ migrate_enable();
+ rcu_read_unlock();
+
+ if (unlikely(!rt_mutex_cmpxchg_release(&lock->lock, current, NULL)))
+ rt_mutex_slowunlock(&lock->lock);
+}
+EXPORT_SYMBOL(rt_spin_unlock);
+
+/*
+ * Wait for the lock to get unlocked: instead of polling for an unlock
+ * (like raw spinlocks do), lock and unlock, to force the kernel to
+ * schedule if there's contention:
+ */
+void __sched rt_spin_lock_unlock(spinlock_t *lock)
+{
+ spin_lock(lock);
+ spin_unlock(lock);
+}
+EXPORT_SYMBOL(rt_spin_lock_unlock);
+
+static __always_inline int __rt_spin_trylock(spinlock_t *lock)
+{
+ int ret = 1;
+
+ if (unlikely(!rt_mutex_cmpxchg_acquire(&lock->lock, NULL, current)))
+ ret = rt_mutex_slowtrylock(&lock->lock);
+
+ if (ret) {
+ spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+ rcu_read_lock();
+ migrate_disable();
+ }
+ return ret;
+}
+
+int __sched rt_spin_trylock(spinlock_t *lock)
+{
+ return __rt_spin_trylock(lock);
+}
+EXPORT_SYMBOL(rt_spin_trylock);
+
+int __sched rt_spin_trylock_bh(spinlock_t *lock)
+{
+ int ret;
+
+ local_bh_disable();
+ ret = __rt_spin_trylock(lock);
+ if (!ret)
+ local_bh_enable();
+ return ret;
+}
+EXPORT_SYMBOL(rt_spin_trylock_bh);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void __rt_spin_lock_init(spinlock_t *lock, const char *name,
+ struct lock_class_key *key, bool percpu)
+{
+ u8 type = percpu ? LD_LOCK_PERCPU : LD_LOCK_NORMAL;
+
+ debug_check_no_locks_freed((void *)lock, sizeof(*lock));
+ lockdep_init_map_type(&lock->dep_map, name, key, 0, LD_WAIT_CONFIG,
+ LD_WAIT_INV, type);
+}
+EXPORT_SYMBOL(__rt_spin_lock_init);
+#endif
+
+/*
+ * RT-specific reader/writer locks
+ */
+#define rwbase_set_and_save_current_state(state) \
+ current_save_and_set_rtlock_wait_state()
+
+#define rwbase_restore_current_state() \
+ current_restore_rtlock_saved_state()
+
+static __always_inline int
+rwbase_rtmutex_lock_state(struct rt_mutex_base *rtm, unsigned int state)
+{
+ if (unlikely(!rt_mutex_cmpxchg_acquire(rtm, NULL, current)))
+ rtlock_slowlock(rtm);
+ return 0;
+}
+
+static __always_inline int
+rwbase_rtmutex_slowlock_locked(struct rt_mutex_base *rtm, unsigned int state,
+ struct wake_q_head *wake_q)
+{
+ rtlock_slowlock_locked(rtm, wake_q);
+ return 0;
+}
+
+static __always_inline void rwbase_rtmutex_unlock(struct rt_mutex_base *rtm)
+{
+ if (likely(rt_mutex_cmpxchg_acquire(rtm, current, NULL)))
+ return;
+
+ rt_mutex_slowunlock(rtm);
+}
+
+static __always_inline int rwbase_rtmutex_trylock(struct rt_mutex_base *rtm)
+{
+ if (likely(rt_mutex_cmpxchg_acquire(rtm, NULL, current)))
+ return 1;
+
+ return rt_mutex_slowtrylock(rtm);
+}
+
+#define rwbase_signal_pending_state(state, current) (0)
+
+#define rwbase_pre_schedule()
+
+#define rwbase_schedule() \
+ schedule_rtlock()
+
+#define rwbase_post_schedule()
+
+#include "rwbase_rt.c"
+/*
+ * The common functions which get wrapped into the rwlock API.
+ */
+int __sched rt_read_trylock(rwlock_t *rwlock)
+{
+ int ret;
+
+ ret = rwbase_read_trylock(&rwlock->rwbase);
+ if (ret) {
+ rwlock_acquire_read(&rwlock->dep_map, 0, 1, _RET_IP_);
+ rcu_read_lock();
+ migrate_disable();
+ }
+ return ret;
+}
+EXPORT_SYMBOL(rt_read_trylock);
+
+int __sched rt_write_trylock(rwlock_t *rwlock)
+{
+ int ret;
+
+ ret = rwbase_write_trylock(&rwlock->rwbase);
+ if (ret) {
+ rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
+ rcu_read_lock();
+ migrate_disable();
+ }
+ return ret;
+}
+EXPORT_SYMBOL(rt_write_trylock);
+
+void __sched rt_read_lock(rwlock_t *rwlock) __acquires(RCU)
+{
+ rtlock_might_resched();
+ rwlock_acquire_read(&rwlock->dep_map, 0, 0, _RET_IP_);
+ rwbase_read_lock(&rwlock->rwbase, TASK_RTLOCK_WAIT);
+ rcu_read_lock();
+ migrate_disable();
+}
+EXPORT_SYMBOL(rt_read_lock);
+
+void __sched rt_write_lock(rwlock_t *rwlock) __acquires(RCU)
+{
+ rtlock_might_resched();
+ rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
+ rwbase_write_lock(&rwlock->rwbase, TASK_RTLOCK_WAIT);
+ rcu_read_lock();
+ migrate_disable();
+}
+EXPORT_SYMBOL(rt_write_lock);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void __sched rt_write_lock_nested(rwlock_t *rwlock, int subclass) __acquires(RCU)
+{
+ rtlock_might_resched();
+ rwlock_acquire(&rwlock->dep_map, subclass, 0, _RET_IP_);
+ rwbase_write_lock(&rwlock->rwbase, TASK_RTLOCK_WAIT);
+ rcu_read_lock();
+ migrate_disable();
+}
+EXPORT_SYMBOL(rt_write_lock_nested);
+#endif
+
+void __sched rt_read_unlock(rwlock_t *rwlock) __releases(RCU)
+{
+ rwlock_release(&rwlock->dep_map, _RET_IP_);
+ migrate_enable();
+ rcu_read_unlock();
+ rwbase_read_unlock(&rwlock->rwbase, TASK_RTLOCK_WAIT);
+}
+EXPORT_SYMBOL(rt_read_unlock);
+
+void __sched rt_write_unlock(rwlock_t *rwlock) __releases(RCU)
+{
+ rwlock_release(&rwlock->dep_map, _RET_IP_);
+ rcu_read_unlock();
+ migrate_enable();
+ rwbase_write_unlock(&rwlock->rwbase);
+}
+EXPORT_SYMBOL(rt_write_unlock);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void __rt_rwlock_init(rwlock_t *rwlock, const char *name,
+ struct lock_class_key *key)
+{
+ debug_check_no_locks_freed((void *)rwlock, sizeof(*rwlock));
+ lockdep_init_map_wait(&rwlock->dep_map, name, key, 0, LD_WAIT_CONFIG);
+}
+EXPORT_SYMBOL(__rt_rwlock_init);
+#endif
diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c
new file mode 100644
index 000000000000..bcb1b9fea588
--- /dev/null
+++ b/kernel/locking/test-ww_mutex.c
@@ -0,0 +1,699 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Module-based API test facility for ww_mutexes
+ */
+
+#include <linux/kernel.h>
+
+#include <linux/completion.h>
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/prandom.h>
+#include <linux/slab.h>
+#include <linux/ww_mutex.h>
+
+static DEFINE_WD_CLASS(ww_class);
+struct workqueue_struct *wq;
+
+#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
+#define ww_acquire_init_noinject(a, b) do { \
+ ww_acquire_init((a), (b)); \
+ (a)->deadlock_inject_countdown = ~0U; \
+ } while (0)
+#else
+#define ww_acquire_init_noinject(a, b) ww_acquire_init((a), (b))
+#endif
+
+struct test_mutex {
+ struct work_struct work;
+ struct ww_mutex mutex;
+ struct completion ready, go, done;
+ unsigned int flags;
+};
+
+#define TEST_MTX_SPIN BIT(0)
+#define TEST_MTX_TRY BIT(1)
+#define TEST_MTX_CTX BIT(2)
+#define __TEST_MTX_LAST BIT(3)
+
+static void test_mutex_work(struct work_struct *work)
+{
+ struct test_mutex *mtx = container_of(work, typeof(*mtx), work);
+
+ complete(&mtx->ready);
+ wait_for_completion(&mtx->go);
+
+ if (mtx->flags & TEST_MTX_TRY) {
+ while (!ww_mutex_trylock(&mtx->mutex, NULL))
+ cond_resched();
+ } else {
+ ww_mutex_lock(&mtx->mutex, NULL);
+ }
+ complete(&mtx->done);
+ ww_mutex_unlock(&mtx->mutex);
+}
+
+static int __test_mutex(unsigned int flags)
+{
+#define TIMEOUT (HZ / 16)
+ struct test_mutex mtx;
+ struct ww_acquire_ctx ctx;
+ int ret;
+
+ ww_mutex_init(&mtx.mutex, &ww_class);
+ if (flags & TEST_MTX_CTX)
+ ww_acquire_init(&ctx, &ww_class);
+
+ INIT_WORK_ONSTACK(&mtx.work, test_mutex_work);
+ init_completion(&mtx.ready);
+ init_completion(&mtx.go);
+ init_completion(&mtx.done);
+ mtx.flags = flags;
+
+ schedule_work(&mtx.work);
+
+ wait_for_completion(&mtx.ready);
+ ww_mutex_lock(&mtx.mutex, (flags & TEST_MTX_CTX) ? &ctx : NULL);
+ complete(&mtx.go);
+ if (flags & TEST_MTX_SPIN) {
+ unsigned long timeout = jiffies + TIMEOUT;
+
+ ret = 0;
+ do {
+ if (completion_done(&mtx.done)) {
+ ret = -EINVAL;
+ break;
+ }
+ cond_resched();
+ } while (time_before(jiffies, timeout));
+ } else {
+ ret = wait_for_completion_timeout(&mtx.done, TIMEOUT);
+ }
+ ww_mutex_unlock(&mtx.mutex);
+ if (flags & TEST_MTX_CTX)
+ ww_acquire_fini(&ctx);
+
+ if (ret) {
+ pr_err("%s(flags=%x): mutual exclusion failure\n",
+ __func__, flags);
+ ret = -EINVAL;
+ }
+
+ flush_work(&mtx.work);
+ destroy_work_on_stack(&mtx.work);
+ return ret;
+#undef TIMEOUT
+}
+
+static int test_mutex(void)
+{
+ int ret;
+ int i;
+
+ for (i = 0; i < __TEST_MTX_LAST; i++) {
+ ret = __test_mutex(i);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int test_aa(bool trylock)
+{
+ struct ww_mutex mutex;
+ struct ww_acquire_ctx ctx;
+ int ret;
+ const char *from = trylock ? "trylock" : "lock";
+
+ ww_mutex_init(&mutex, &ww_class);
+ ww_acquire_init(&ctx, &ww_class);
+
+ if (!trylock) {
+ ret = ww_mutex_lock(&mutex, &ctx);
+ if (ret) {
+ pr_err("%s: initial lock failed!\n", __func__);
+ goto out;
+ }
+ } else {
+ ret = !ww_mutex_trylock(&mutex, &ctx);
+ if (ret) {
+ pr_err("%s: initial trylock failed!\n", __func__);
+ goto out;
+ }
+ }
+
+ if (ww_mutex_trylock(&mutex, NULL)) {
+ pr_err("%s: trylocked itself without context from %s!\n", __func__, from);
+ ww_mutex_unlock(&mutex);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (ww_mutex_trylock(&mutex, &ctx)) {
+ pr_err("%s: trylocked itself with context from %s!\n", __func__, from);
+ ww_mutex_unlock(&mutex);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = ww_mutex_lock(&mutex, &ctx);
+ if (ret != -EALREADY) {
+ pr_err("%s: missed deadlock for recursing, ret=%d from %s\n",
+ __func__, ret, from);
+ if (!ret)
+ ww_mutex_unlock(&mutex);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ww_mutex_unlock(&mutex);
+ ret = 0;
+out:
+ ww_acquire_fini(&ctx);
+ return ret;
+}
+
+struct test_abba {
+ struct work_struct work;
+ struct ww_mutex a_mutex;
+ struct ww_mutex b_mutex;
+ struct completion a_ready;
+ struct completion b_ready;
+ bool resolve, trylock;
+ int result;
+};
+
+static void test_abba_work(struct work_struct *work)
+{
+ struct test_abba *abba = container_of(work, typeof(*abba), work);
+ struct ww_acquire_ctx ctx;
+ int err;
+
+ ww_acquire_init_noinject(&ctx, &ww_class);
+ if (!abba->trylock)
+ ww_mutex_lock(&abba->b_mutex, &ctx);
+ else
+ WARN_ON(!ww_mutex_trylock(&abba->b_mutex, &ctx));
+
+ WARN_ON(READ_ONCE(abba->b_mutex.ctx) != &ctx);
+
+ complete(&abba->b_ready);
+ wait_for_completion(&abba->a_ready);
+
+ err = ww_mutex_lock(&abba->a_mutex, &ctx);
+ if (abba->resolve && err == -EDEADLK) {
+ ww_mutex_unlock(&abba->b_mutex);
+ ww_mutex_lock_slow(&abba->a_mutex, &ctx);
+ err = ww_mutex_lock(&abba->b_mutex, &ctx);
+ }
+
+ if (!err)
+ ww_mutex_unlock(&abba->a_mutex);
+ ww_mutex_unlock(&abba->b_mutex);
+ ww_acquire_fini(&ctx);
+
+ abba->result = err;
+}
+
+static int test_abba(bool trylock, bool resolve)
+{
+ struct test_abba abba;
+ struct ww_acquire_ctx ctx;
+ int err, ret;
+
+ ww_mutex_init(&abba.a_mutex, &ww_class);
+ ww_mutex_init(&abba.b_mutex, &ww_class);
+ INIT_WORK_ONSTACK(&abba.work, test_abba_work);
+ init_completion(&abba.a_ready);
+ init_completion(&abba.b_ready);
+ abba.trylock = trylock;
+ abba.resolve = resolve;
+
+ schedule_work(&abba.work);
+
+ ww_acquire_init_noinject(&ctx, &ww_class);
+ if (!trylock)
+ ww_mutex_lock(&abba.a_mutex, &ctx);
+ else
+ WARN_ON(!ww_mutex_trylock(&abba.a_mutex, &ctx));
+
+ WARN_ON(READ_ONCE(abba.a_mutex.ctx) != &ctx);
+
+ complete(&abba.a_ready);
+ wait_for_completion(&abba.b_ready);
+
+ err = ww_mutex_lock(&abba.b_mutex, &ctx);
+ if (resolve && err == -EDEADLK) {
+ ww_mutex_unlock(&abba.a_mutex);
+ ww_mutex_lock_slow(&abba.b_mutex, &ctx);
+ err = ww_mutex_lock(&abba.a_mutex, &ctx);
+ }
+
+ if (!err)
+ ww_mutex_unlock(&abba.b_mutex);
+ ww_mutex_unlock(&abba.a_mutex);
+ ww_acquire_fini(&ctx);
+
+ flush_work(&abba.work);
+ destroy_work_on_stack(&abba.work);
+
+ ret = 0;
+ if (resolve) {
+ if (err || abba.result) {
+ pr_err("%s: failed to resolve ABBA deadlock, A err=%d, B err=%d\n",
+ __func__, err, abba.result);
+ ret = -EINVAL;
+ }
+ } else {
+ if (err != -EDEADLK && abba.result != -EDEADLK) {
+ pr_err("%s: missed ABBA deadlock, A err=%d, B err=%d\n",
+ __func__, err, abba.result);
+ ret = -EINVAL;
+ }
+ }
+ return ret;
+}
+
+struct test_cycle {
+ struct work_struct work;
+ struct ww_mutex a_mutex;
+ struct ww_mutex *b_mutex;
+ struct completion *a_signal;
+ struct completion b_signal;
+ int result;
+};
+
+static void test_cycle_work(struct work_struct *work)
+{
+ struct test_cycle *cycle = container_of(work, typeof(*cycle), work);
+ struct ww_acquire_ctx ctx;
+ int err, erra = 0;
+
+ ww_acquire_init_noinject(&ctx, &ww_class);
+ ww_mutex_lock(&cycle->a_mutex, &ctx);
+
+ complete(cycle->a_signal);
+ wait_for_completion(&cycle->b_signal);
+
+ err = ww_mutex_lock(cycle->b_mutex, &ctx);
+ if (err == -EDEADLK) {
+ err = 0;
+ ww_mutex_unlock(&cycle->a_mutex);
+ ww_mutex_lock_slow(cycle->b_mutex, &ctx);
+ erra = ww_mutex_lock(&cycle->a_mutex, &ctx);
+ }
+
+ if (!err)
+ ww_mutex_unlock(cycle->b_mutex);
+ if (!erra)
+ ww_mutex_unlock(&cycle->a_mutex);
+ ww_acquire_fini(&ctx);
+
+ cycle->result = err ?: erra;
+}
+
+static int __test_cycle(unsigned int nthreads)
+{
+ struct test_cycle *cycles;
+ unsigned int n, last = nthreads - 1;
+ int ret;
+
+ cycles = kmalloc_array(nthreads, sizeof(*cycles), GFP_KERNEL);
+ if (!cycles)
+ return -ENOMEM;
+
+ for (n = 0; n < nthreads; n++) {
+ struct test_cycle *cycle = &cycles[n];
+
+ ww_mutex_init(&cycle->a_mutex, &ww_class);
+ if (n == last)
+ cycle->b_mutex = &cycles[0].a_mutex;
+ else
+ cycle->b_mutex = &cycles[n + 1].a_mutex;
+
+ if (n == 0)
+ cycle->a_signal = &cycles[last].b_signal;
+ else
+ cycle->a_signal = &cycles[n - 1].b_signal;
+ init_completion(&cycle->b_signal);
+
+ INIT_WORK(&cycle->work, test_cycle_work);
+ cycle->result = 0;
+ }
+
+ for (n = 0; n < nthreads; n++)
+ queue_work(wq, &cycles[n].work);
+
+ flush_workqueue(wq);
+
+ ret = 0;
+ for (n = 0; n < nthreads; n++) {
+ struct test_cycle *cycle = &cycles[n];
+
+ if (!cycle->result)
+ continue;
+
+ pr_err("cyclic deadlock not resolved, ret[%d/%d] = %d\n",
+ n, nthreads, cycle->result);
+ ret = -EINVAL;
+ break;
+ }
+
+ for (n = 0; n < nthreads; n++)
+ ww_mutex_destroy(&cycles[n].a_mutex);
+ kfree(cycles);
+ return ret;
+}
+
+static int test_cycle(unsigned int ncpus)
+{
+ unsigned int n;
+ int ret;
+
+ for (n = 2; n <= ncpus + 1; n++) {
+ ret = __test_cycle(n);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+struct stress {
+ struct work_struct work;
+ struct ww_mutex *locks;
+ unsigned long timeout;
+ int nlocks;
+};
+
+struct rnd_state rng;
+DEFINE_SPINLOCK(rng_lock);
+
+static inline u32 prandom_u32_below(u32 ceil)
+{
+ u32 ret;
+
+ spin_lock(&rng_lock);
+ ret = prandom_u32_state(&rng) % ceil;
+ spin_unlock(&rng_lock);
+ return ret;
+}
+
+static int *get_random_order(int count)
+{
+ int *order;
+ int n, r;
+
+ order = kmalloc_array(count, sizeof(*order), GFP_KERNEL);
+ if (!order)
+ return order;
+
+ for (n = 0; n < count; n++)
+ order[n] = n;
+
+ for (n = count - 1; n > 1; n--) {
+ r = prandom_u32_below(n + 1);
+ if (r != n)
+ swap(order[n], order[r]);
+ }
+
+ return order;
+}
+
+static void dummy_load(struct stress *stress)
+{
+ usleep_range(1000, 2000);
+}
+
+static void stress_inorder_work(struct work_struct *work)
+{
+ struct stress *stress = container_of(work, typeof(*stress), work);
+ const int nlocks = stress->nlocks;
+ struct ww_mutex *locks = stress->locks;
+ struct ww_acquire_ctx ctx;
+ int *order;
+
+ order = get_random_order(nlocks);
+ if (!order)
+ return;
+
+ do {
+ int contended = -1;
+ int n, err;
+
+ ww_acquire_init(&ctx, &ww_class);
+retry:
+ err = 0;
+ for (n = 0; n < nlocks; n++) {
+ if (n == contended)
+ continue;
+
+ err = ww_mutex_lock(&locks[order[n]], &ctx);
+ if (err < 0)
+ break;
+ }
+ if (!err)
+ dummy_load(stress);
+
+ if (contended > n)
+ ww_mutex_unlock(&locks[order[contended]]);
+ contended = n;
+ while (n--)
+ ww_mutex_unlock(&locks[order[n]]);
+
+ if (err == -EDEADLK) {
+ if (!time_after(jiffies, stress->timeout)) {
+ ww_mutex_lock_slow(&locks[order[contended]], &ctx);
+ goto retry;
+ }
+ }
+
+ ww_acquire_fini(&ctx);
+ if (err) {
+ pr_err_once("stress (%s) failed with %d\n",
+ __func__, err);
+ break;
+ }
+ } while (!time_after(jiffies, stress->timeout));
+
+ kfree(order);
+}
+
+struct reorder_lock {
+ struct list_head link;
+ struct ww_mutex *lock;
+};
+
+static void stress_reorder_work(struct work_struct *work)
+{
+ struct stress *stress = container_of(work, typeof(*stress), work);
+ LIST_HEAD(locks);
+ struct ww_acquire_ctx ctx;
+ struct reorder_lock *ll, *ln;
+ int *order;
+ int n, err;
+
+ order = get_random_order(stress->nlocks);
+ if (!order)
+ return;
+
+ for (n = 0; n < stress->nlocks; n++) {
+ ll = kmalloc(sizeof(*ll), GFP_KERNEL);
+ if (!ll)
+ goto out;
+
+ ll->lock = &stress->locks[order[n]];
+ list_add(&ll->link, &locks);
+ }
+ kfree(order);
+ order = NULL;
+
+ do {
+ ww_acquire_init(&ctx, &ww_class);
+
+ list_for_each_entry(ll, &locks, link) {
+ err = ww_mutex_lock(ll->lock, &ctx);
+ if (!err)
+ continue;
+
+ ln = ll;
+ list_for_each_entry_continue_reverse(ln, &locks, link)
+ ww_mutex_unlock(ln->lock);
+
+ if (err != -EDEADLK) {
+ pr_err_once("stress (%s) failed with %d\n",
+ __func__, err);
+ break;
+ }
+
+ ww_mutex_lock_slow(ll->lock, &ctx);
+ list_move(&ll->link, &locks); /* restarts iteration */
+ }
+
+ dummy_load(stress);
+ list_for_each_entry(ll, &locks, link)
+ ww_mutex_unlock(ll->lock);
+
+ ww_acquire_fini(&ctx);
+ } while (!time_after(jiffies, stress->timeout));
+
+out:
+ list_for_each_entry_safe(ll, ln, &locks, link)
+ kfree(ll);
+ kfree(order);
+}
+
+static void stress_one_work(struct work_struct *work)
+{
+ struct stress *stress = container_of(work, typeof(*stress), work);
+ const int nlocks = stress->nlocks;
+ struct ww_mutex *lock = stress->locks + get_random_u32_below(nlocks);
+ int err;
+
+ do {
+ err = ww_mutex_lock(lock, NULL);
+ if (!err) {
+ dummy_load(stress);
+ ww_mutex_unlock(lock);
+ } else {
+ pr_err_once("stress (%s) failed with %d\n",
+ __func__, err);
+ break;
+ }
+ } while (!time_after(jiffies, stress->timeout));
+}
+
+#define STRESS_INORDER BIT(0)
+#define STRESS_REORDER BIT(1)
+#define STRESS_ONE BIT(2)
+#define STRESS_ALL (STRESS_INORDER | STRESS_REORDER | STRESS_ONE)
+
+static int stress(int nlocks, int nthreads, unsigned int flags)
+{
+ struct ww_mutex *locks;
+ struct stress *stress_array;
+ int n, count;
+
+ locks = kmalloc_array(nlocks, sizeof(*locks), GFP_KERNEL);
+ if (!locks)
+ return -ENOMEM;
+
+ stress_array = kmalloc_array(nthreads, sizeof(*stress_array),
+ GFP_KERNEL);
+ if (!stress_array) {
+ kfree(locks);
+ return -ENOMEM;
+ }
+
+ for (n = 0; n < nlocks; n++)
+ ww_mutex_init(&locks[n], &ww_class);
+
+ count = 0;
+ for (n = 0; nthreads; n++) {
+ struct stress *stress;
+ void (*fn)(struct work_struct *work);
+
+ fn = NULL;
+ switch (n & 3) {
+ case 0:
+ if (flags & STRESS_INORDER)
+ fn = stress_inorder_work;
+ break;
+ case 1:
+ if (flags & STRESS_REORDER)
+ fn = stress_reorder_work;
+ break;
+ case 2:
+ if (flags & STRESS_ONE)
+ fn = stress_one_work;
+ break;
+ }
+
+ if (!fn)
+ continue;
+
+ stress = &stress_array[count++];
+
+ INIT_WORK(&stress->work, fn);
+ stress->locks = locks;
+ stress->nlocks = nlocks;
+ stress->timeout = jiffies + 2*HZ;
+
+ queue_work(wq, &stress->work);
+ nthreads--;
+ }
+
+ flush_workqueue(wq);
+
+ for (n = 0; n < nlocks; n++)
+ ww_mutex_destroy(&locks[n]);
+ kfree(stress_array);
+ kfree(locks);
+
+ return 0;
+}
+
+static int __init test_ww_mutex_init(void)
+{
+ int ncpus = num_online_cpus();
+ int ret, i;
+
+ printk(KERN_INFO "Beginning ww mutex selftests\n");
+
+ prandom_seed_state(&rng, get_random_u64());
+
+ wq = alloc_workqueue("test-ww_mutex", WQ_UNBOUND, 0);
+ if (!wq)
+ return -ENOMEM;
+
+ ret = test_mutex();
+ if (ret)
+ return ret;
+
+ ret = test_aa(false);
+ if (ret)
+ return ret;
+
+ ret = test_aa(true);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < 4; i++) {
+ ret = test_abba(i & 1, i & 2);
+ if (ret)
+ return ret;
+ }
+
+ ret = test_cycle(ncpus);
+ if (ret)
+ return ret;
+
+ ret = stress(16, 2*ncpus, STRESS_INORDER);
+ if (ret)
+ return ret;
+
+ ret = stress(16, 2*ncpus, STRESS_REORDER);
+ if (ret)
+ return ret;
+
+ ret = stress(2046, hweight32(STRESS_ALL)*ncpus, STRESS_ALL);
+ if (ret)
+ return ret;
+
+ printk(KERN_INFO "All ww mutex selftests passed\n");
+ return 0;
+}
+
+static void __exit test_ww_mutex_exit(void)
+{
+ destroy_workqueue(wq);
+}
+
+module_init(test_ww_mutex_init);
+module_exit(test_ww_mutex_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Intel Corporation");
+MODULE_DESCRIPTION("API test facility for ww_mutexes");
diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h
new file mode 100644
index 000000000000..31a785afee6c
--- /dev/null
+++ b/kernel/locking/ww_mutex.h
@@ -0,0 +1,594 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef WW_RT
+
+#define MUTEX mutex
+#define MUTEX_WAITER mutex_waiter
+
+static inline struct mutex_waiter *
+__ww_waiter_first(struct mutex *lock)
+{
+ struct mutex_waiter *w;
+
+ w = list_first_entry(&lock->wait_list, struct mutex_waiter, list);
+ if (list_entry_is_head(w, &lock->wait_list, list))
+ return NULL;
+
+ return w;
+}
+
+static inline struct mutex_waiter *
+__ww_waiter_next(struct mutex *lock, struct mutex_waiter *w)
+{
+ w = list_next_entry(w, list);
+ if (list_entry_is_head(w, &lock->wait_list, list))
+ return NULL;
+
+ return w;
+}
+
+static inline struct mutex_waiter *
+__ww_waiter_prev(struct mutex *lock, struct mutex_waiter *w)
+{
+ w = list_prev_entry(w, list);
+ if (list_entry_is_head(w, &lock->wait_list, list))
+ return NULL;
+
+ return w;
+}
+
+static inline struct mutex_waiter *
+__ww_waiter_last(struct mutex *lock)
+{
+ struct mutex_waiter *w;
+
+ w = list_last_entry(&lock->wait_list, struct mutex_waiter, list);
+ if (list_entry_is_head(w, &lock->wait_list, list))
+ return NULL;
+
+ return w;
+}
+
+static inline void
+__ww_waiter_add(struct mutex *lock, struct mutex_waiter *waiter, struct mutex_waiter *pos)
+{
+ struct list_head *p = &lock->wait_list;
+ if (pos)
+ p = &pos->list;
+ __mutex_add_waiter(lock, waiter, p);
+}
+
+static inline struct task_struct *
+__ww_mutex_owner(struct mutex *lock)
+{
+ return __mutex_owner(lock);
+}
+
+static inline bool
+__ww_mutex_has_waiters(struct mutex *lock)
+{
+ return atomic_long_read(&lock->owner) & MUTEX_FLAG_WAITERS;
+}
+
+static inline void lock_wait_lock(struct mutex *lock, unsigned long *flags)
+{
+ raw_spin_lock_irqsave(&lock->wait_lock, *flags);
+}
+
+static inline void unlock_wait_lock(struct mutex *lock, unsigned long *flags)
+{
+ raw_spin_unlock_irqrestore(&lock->wait_lock, *flags);
+}
+
+static inline void lockdep_assert_wait_lock_held(struct mutex *lock)
+{
+ lockdep_assert_held(&lock->wait_lock);
+}
+
+#else /* WW_RT */
+
+#define MUTEX rt_mutex
+#define MUTEX_WAITER rt_mutex_waiter
+
+static inline struct rt_mutex_waiter *
+__ww_waiter_first(struct rt_mutex *lock)
+{
+ struct rb_node *n = rb_first(&lock->rtmutex.waiters.rb_root);
+ if (!n)
+ return NULL;
+ return rb_entry(n, struct rt_mutex_waiter, tree.entry);
+}
+
+static inline struct rt_mutex_waiter *
+__ww_waiter_next(struct rt_mutex *lock, struct rt_mutex_waiter *w)
+{
+ struct rb_node *n = rb_next(&w->tree.entry);
+ if (!n)
+ return NULL;
+ return rb_entry(n, struct rt_mutex_waiter, tree.entry);
+}
+
+static inline struct rt_mutex_waiter *
+__ww_waiter_prev(struct rt_mutex *lock, struct rt_mutex_waiter *w)
+{
+ struct rb_node *n = rb_prev(&w->tree.entry);
+ if (!n)
+ return NULL;
+ return rb_entry(n, struct rt_mutex_waiter, tree.entry);
+}
+
+static inline struct rt_mutex_waiter *
+__ww_waiter_last(struct rt_mutex *lock)
+{
+ struct rb_node *n = rb_last(&lock->rtmutex.waiters.rb_root);
+ if (!n)
+ return NULL;
+ return rb_entry(n, struct rt_mutex_waiter, tree.entry);
+}
+
+static inline void
+__ww_waiter_add(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, struct rt_mutex_waiter *pos)
+{
+ /* RT unconditionally adds the waiter first and then removes it on error */
+}
+
+static inline struct task_struct *
+__ww_mutex_owner(struct rt_mutex *lock)
+{
+ return rt_mutex_owner(&lock->rtmutex);
+}
+
+static inline bool
+__ww_mutex_has_waiters(struct rt_mutex *lock)
+{
+ return rt_mutex_has_waiters(&lock->rtmutex);
+}
+
+static inline void lock_wait_lock(struct rt_mutex *lock, unsigned long *flags)
+{
+ raw_spin_lock_irqsave(&lock->rtmutex.wait_lock, *flags);
+}
+
+static inline void unlock_wait_lock(struct rt_mutex *lock, unsigned long *flags)
+{
+ raw_spin_unlock_irqrestore(&lock->rtmutex.wait_lock, *flags);
+}
+
+static inline void lockdep_assert_wait_lock_held(struct rt_mutex *lock)
+{
+ lockdep_assert_held(&lock->rtmutex.wait_lock);
+}
+
+#endif /* WW_RT */
+
+/*
+ * Wait-Die:
+ * The newer transactions are killed when:
+ * It (the new transaction) makes a request for a lock being held
+ * by an older transaction.
+ *
+ * Wound-Wait:
+ * The newer transactions are wounded when:
+ * An older transaction makes a request for a lock being held by
+ * the newer transaction.
+ */
+
+/*
+ * Associate the ww_mutex @ww with the context @ww_ctx under which we acquired
+ * it.
+ */
+static __always_inline void
+ww_mutex_lock_acquired(struct ww_mutex *ww, struct ww_acquire_ctx *ww_ctx)
+{
+#ifdef DEBUG_WW_MUTEXES
+ /*
+ * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
+ * but released with a normal mutex_unlock in this call.
+ *
+ * This should never happen, always use ww_mutex_unlock.
+ */
+ DEBUG_LOCKS_WARN_ON(ww->ctx);
+
+ /*
+ * Not quite done after calling ww_acquire_done() ?
+ */
+ DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
+
+ if (ww_ctx->contending_lock) {
+ /*
+ * After -EDEADLK you tried to
+ * acquire a different ww_mutex? Bad!
+ */
+ DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
+
+ /*
+ * You called ww_mutex_lock after receiving -EDEADLK,
+ * but 'forgot' to unlock everything else first?
+ */
+ DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
+ ww_ctx->contending_lock = NULL;
+ }
+
+ /*
+ * Naughty, using a different class will lead to undefined behavior!
+ */
+ DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
+#endif
+ ww_ctx->acquired++;
+ ww->ctx = ww_ctx;
+}
+
+/*
+ * Determine if @a is 'less' than @b. IOW, either @a is a lower priority task
+ * or, when of equal priority, a younger transaction than @b.
+ *
+ * Depending on the algorithm, @a will either need to wait for @b, or die.
+ */
+static inline bool
+__ww_ctx_less(struct ww_acquire_ctx *a, struct ww_acquire_ctx *b)
+{
+/*
+ * Can only do the RT prio for WW_RT, because task->prio isn't stable due to PI,
+ * so the wait_list ordering will go wobbly. rt_mutex re-queues the waiter and
+ * isn't affected by this.
+ */
+#ifdef WW_RT
+ /* kernel prio; less is more */
+ int a_prio = a->task->prio;
+ int b_prio = b->task->prio;
+
+ if (rt_or_dl_prio(a_prio) || rt_or_dl_prio(b_prio)) {
+
+ if (a_prio > b_prio)
+ return true;
+
+ if (a_prio < b_prio)
+ return false;
+
+ /* equal static prio */
+
+ if (dl_prio(a_prio)) {
+ if (dl_time_before(b->task->dl.deadline,
+ a->task->dl.deadline))
+ return true;
+
+ if (dl_time_before(a->task->dl.deadline,
+ b->task->dl.deadline))
+ return false;
+ }
+
+ /* equal prio */
+ }
+#endif
+
+ /* FIFO order tie break -- bigger is younger */
+ return (signed long)(a->stamp - b->stamp) > 0;
+}
+
+/*
+ * Wait-Die; wake a lesser waiter context (when locks held) such that it can
+ * die.
+ *
+ * Among waiters with context, only the first one can have other locks acquired
+ * already (ctx->acquired > 0), because __ww_mutex_add_waiter() and
+ * __ww_mutex_check_kill() wake any but the earliest context.
+ */
+static bool
+__ww_mutex_die(struct MUTEX *lock, struct MUTEX_WAITER *waiter,
+ struct ww_acquire_ctx *ww_ctx, struct wake_q_head *wake_q)
+{
+ if (!ww_ctx->is_wait_die)
+ return false;
+
+ if (waiter->ww_ctx->acquired > 0 && __ww_ctx_less(waiter->ww_ctx, ww_ctx)) {
+#ifndef WW_RT
+ debug_mutex_wake_waiter(lock, waiter);
+#endif
+ /*
+ * When waking up the task to die, be sure to clear the
+ * blocked_on pointer. Otherwise we can see circular
+ * blocked_on relationships that can't resolve.
+ */
+ __clear_task_blocked_on(waiter->task, lock);
+ wake_q_add(wake_q, waiter->task);
+ }
+
+ return true;
+}
+
+/*
+ * Wound-Wait; wound a lesser @hold_ctx if it holds the lock.
+ *
+ * Wound the lock holder if there are waiters with more important transactions
+ * than the lock holders. Even if multiple waiters may wound the lock holder,
+ * it's sufficient that only one does.
+ */
+static bool __ww_mutex_wound(struct MUTEX *lock,
+ struct ww_acquire_ctx *ww_ctx,
+ struct ww_acquire_ctx *hold_ctx,
+ struct wake_q_head *wake_q)
+{
+ struct task_struct *owner = __ww_mutex_owner(lock);
+
+ lockdep_assert_wait_lock_held(lock);
+
+ /*
+ * Possible through __ww_mutex_add_waiter() when we race with
+ * ww_mutex_set_context_fastpath(). In that case we'll get here again
+ * through __ww_mutex_check_waiters().
+ */
+ if (!hold_ctx)
+ return false;
+
+ /*
+ * Can have !owner because of __mutex_unlock_slowpath(), but if owner,
+ * it cannot go away because we'll have FLAG_WAITERS set and hold
+ * wait_lock.
+ */
+ if (!owner)
+ return false;
+
+ if (ww_ctx->acquired > 0 && __ww_ctx_less(hold_ctx, ww_ctx)) {
+ hold_ctx->wounded = 1;
+
+ /*
+ * wake_up_process() paired with set_current_state()
+ * inserts sufficient barriers to make sure @owner either sees
+ * it's wounded in __ww_mutex_check_kill() or has a
+ * wakeup pending to re-read the wounded state.
+ */
+ if (owner != current) {
+ /*
+ * When waking up the task to wound, be sure to clear the
+ * blocked_on pointer. Otherwise we can see circular
+ * blocked_on relationships that can't resolve.
+ *
+ * NOTE: We pass NULL here instead of lock, because we
+ * are waking the mutex owner, who may be currently
+ * blocked on a different mutex.
+ */
+ __clear_task_blocked_on(owner, NULL);
+ wake_q_add(wake_q, owner);
+ }
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * We just acquired @lock under @ww_ctx, if there are more important contexts
+ * waiting behind us on the wait-list, check if they need to die, or wound us.
+ *
+ * See __ww_mutex_add_waiter() for the list-order construction; basically the
+ * list is ordered by stamp, smallest (oldest) first.
+ *
+ * This relies on never mixing wait-die/wound-wait on the same wait-list;
+ * which is currently ensured by that being a ww_class property.
+ *
+ * The current task must not be on the wait list.
+ */
+static void
+__ww_mutex_check_waiters(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx,
+ struct wake_q_head *wake_q)
+{
+ struct MUTEX_WAITER *cur;
+
+ lockdep_assert_wait_lock_held(lock);
+
+ for (cur = __ww_waiter_first(lock); cur;
+ cur = __ww_waiter_next(lock, cur)) {
+
+ if (!cur->ww_ctx)
+ continue;
+
+ if (__ww_mutex_die(lock, cur, ww_ctx, wake_q) ||
+ __ww_mutex_wound(lock, cur->ww_ctx, ww_ctx, wake_q))
+ break;
+ }
+}
+
+/*
+ * After acquiring lock with fastpath, where we do not hold wait_lock, set ctx
+ * and wake up any waiters so they can recheck.
+ */
+static __always_inline void
+ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+ DEFINE_WAKE_Q(wake_q);
+ unsigned long flags;
+
+ ww_mutex_lock_acquired(lock, ctx);
+
+ /*
+ * The lock->ctx update should be visible on all cores before
+ * the WAITERS check is done, otherwise contended waiters might be
+ * missed. The contended waiters will either see ww_ctx == NULL
+ * and keep spinning, or it will acquire wait_lock, add itself
+ * to waiter list and sleep.
+ */
+ smp_mb(); /* See comments above and below. */
+
+ /*
+ * [W] ww->ctx = ctx [W] MUTEX_FLAG_WAITERS
+ * MB MB
+ * [R] MUTEX_FLAG_WAITERS [R] ww->ctx
+ *
+ * The memory barrier above pairs with the memory barrier in
+ * __ww_mutex_add_waiter() and makes sure we either observe ww->ctx
+ * and/or !empty list.
+ */
+ if (likely(!__ww_mutex_has_waiters(&lock->base)))
+ return;
+
+ /*
+ * Uh oh, we raced in fastpath, check if any of the waiters need to
+ * die or wound us.
+ */
+ lock_wait_lock(&lock->base, &flags);
+ __ww_mutex_check_waiters(&lock->base, ctx, &wake_q);
+ preempt_disable();
+ unlock_wait_lock(&lock->base, &flags);
+ wake_up_q(&wake_q);
+ preempt_enable();
+}
+
+static __always_inline int
+__ww_mutex_kill(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx)
+{
+ if (ww_ctx->acquired > 0) {
+#ifdef DEBUG_WW_MUTEXES
+ struct ww_mutex *ww;
+
+ ww = container_of(lock, struct ww_mutex, base);
+ DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock);
+ ww_ctx->contending_lock = ww;
+#endif
+ return -EDEADLK;
+ }
+
+ return 0;
+}
+
+/*
+ * Check the wound condition for the current lock acquire.
+ *
+ * Wound-Wait: If we're wounded, kill ourself.
+ *
+ * Wait-Die: If we're trying to acquire a lock already held by an older
+ * context, kill ourselves.
+ *
+ * Since __ww_mutex_add_waiter() orders the wait-list on stamp, we only have to
+ * look at waiters before us in the wait-list.
+ */
+static inline int
+__ww_mutex_check_kill(struct MUTEX *lock, struct MUTEX_WAITER *waiter,
+ struct ww_acquire_ctx *ctx)
+{
+ struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
+ struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx);
+ struct MUTEX_WAITER *cur;
+
+ if (ctx->acquired == 0)
+ return 0;
+
+ if (!ctx->is_wait_die) {
+ if (ctx->wounded)
+ return __ww_mutex_kill(lock, ctx);
+
+ return 0;
+ }
+
+ if (hold_ctx && __ww_ctx_less(ctx, hold_ctx))
+ return __ww_mutex_kill(lock, ctx);
+
+ /*
+ * If there is a waiter in front of us that has a context, then its
+ * stamp is earlier than ours and we must kill ourself.
+ */
+ for (cur = __ww_waiter_prev(lock, waiter); cur;
+ cur = __ww_waiter_prev(lock, cur)) {
+
+ if (!cur->ww_ctx)
+ continue;
+
+ return __ww_mutex_kill(lock, ctx);
+ }
+
+ return 0;
+}
+
+/*
+ * Add @waiter to the wait-list, keep the wait-list ordered by stamp, smallest
+ * first. Such that older contexts are preferred to acquire the lock over
+ * younger contexts.
+ *
+ * Waiters without context are interspersed in FIFO order.
+ *
+ * Furthermore, for Wait-Die kill ourself immediately when possible (there are
+ * older contexts already waiting) to avoid unnecessary waiting and for
+ * Wound-Wait ensure we wound the owning context when it is younger.
+ */
+static inline int
+__ww_mutex_add_waiter(struct MUTEX_WAITER *waiter,
+ struct MUTEX *lock,
+ struct ww_acquire_ctx *ww_ctx,
+ struct wake_q_head *wake_q)
+{
+ struct MUTEX_WAITER *cur, *pos = NULL;
+ bool is_wait_die;
+
+ if (!ww_ctx) {
+ __ww_waiter_add(lock, waiter, NULL);
+ return 0;
+ }
+
+ is_wait_die = ww_ctx->is_wait_die;
+
+ /*
+ * Add the waiter before the first waiter with a higher stamp.
+ * Waiters without a context are skipped to avoid starving
+ * them. Wait-Die waiters may die here. Wound-Wait waiters
+ * never die here, but they are sorted in stamp order and
+ * may wound the lock holder.
+ */
+ for (cur = __ww_waiter_last(lock); cur;
+ cur = __ww_waiter_prev(lock, cur)) {
+
+ if (!cur->ww_ctx)
+ continue;
+
+ if (__ww_ctx_less(ww_ctx, cur->ww_ctx)) {
+ /*
+ * Wait-Die: if we find an older context waiting, there
+ * is no point in queueing behind it, as we'd have to
+ * die the moment it would acquire the lock.
+ */
+ if (is_wait_die) {
+ int ret = __ww_mutex_kill(lock, ww_ctx);
+
+ if (ret)
+ return ret;
+ }
+
+ break;
+ }
+
+ pos = cur;
+
+ /* Wait-Die: ensure younger waiters die. */
+ __ww_mutex_die(lock, cur, ww_ctx, wake_q);
+ }
+
+ __ww_waiter_add(lock, waiter, pos);
+
+ /*
+ * Wound-Wait: if we're blocking on a mutex owned by a younger context,
+ * wound that such that we might proceed.
+ */
+ if (!is_wait_die) {
+ struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
+
+ /*
+ * See ww_mutex_set_context_fastpath(). Orders setting
+ * MUTEX_FLAG_WAITERS vs the ww->ctx load,
+ * such that either we or the fastpath will wound @ww->ctx.
+ */
+ smp_mb();
+ __ww_mutex_wound(lock, ww_ctx, ww->ctx, wake_q);
+ }
+
+ return 0;
+}
+
+static inline void __ww_mutex_unlock(struct ww_mutex *lock)
+{
+ if (lock->ctx) {
+#ifdef DEBUG_WW_MUTEXES
+ DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired);
+#endif
+ if (lock->ctx->acquired > 0)
+ lock->ctx->acquired--;
+ lock->ctx = NULL;
+ }
+}
diff --git a/kernel/locking/ww_rt_mutex.c b/kernel/locking/ww_rt_mutex.c
new file mode 100644
index 000000000000..c7196de838ed
--- /dev/null
+++ b/kernel/locking/ww_rt_mutex.c
@@ -0,0 +1,101 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * rtmutex API
+ */
+#include <linux/spinlock.h>
+#include <linux/export.h>
+
+#define RT_MUTEX_BUILD_MUTEX
+#define WW_RT
+#include "rtmutex.c"
+
+int ww_mutex_trylock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx)
+{
+ struct rt_mutex *rtm = &lock->base;
+
+ if (!ww_ctx)
+ return rt_mutex_trylock(rtm);
+
+ /*
+ * Reset the wounded flag after a kill. No other process can
+ * race and wound us here, since they can't have a valid owner
+ * pointer if we don't have any locks held.
+ */
+ if (ww_ctx->acquired == 0)
+ ww_ctx->wounded = 0;
+
+ if (__rt_mutex_trylock(&rtm->rtmutex)) {
+ ww_mutex_set_context_fastpath(lock, ww_ctx);
+ mutex_acquire_nest(&rtm->dep_map, 0, 1, &ww_ctx->dep_map, _RET_IP_);
+ return 1;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(ww_mutex_trylock);
+
+static int __sched
+__ww_rt_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx,
+ unsigned int state, unsigned long ip)
+{
+ struct lockdep_map __maybe_unused *nest_lock = NULL;
+ struct rt_mutex *rtm = &lock->base;
+ int ret;
+
+ might_sleep();
+
+ if (ww_ctx) {
+ if (unlikely(ww_ctx == READ_ONCE(lock->ctx)))
+ return -EALREADY;
+
+ /*
+ * Reset the wounded flag after a kill. No other process can
+ * race and wound us here, since they can't have a valid owner
+ * pointer if we don't have any locks held.
+ */
+ if (ww_ctx->acquired == 0)
+ ww_ctx->wounded = 0;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ nest_lock = &ww_ctx->dep_map;
+#endif
+ }
+ mutex_acquire_nest(&rtm->dep_map, 0, 0, nest_lock, ip);
+
+ if (likely(rt_mutex_try_acquire(&rtm->rtmutex))) {
+ if (ww_ctx)
+ ww_mutex_set_context_fastpath(lock, ww_ctx);
+ return 0;
+ }
+
+ ret = rt_mutex_slowlock(&rtm->rtmutex, ww_ctx, state);
+
+ if (ret)
+ mutex_release(&rtm->dep_map, ip);
+ return ret;
+}
+
+int __sched
+ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+ return __ww_rt_mutex_lock(lock, ctx, TASK_UNINTERRUPTIBLE, _RET_IP_);
+}
+EXPORT_SYMBOL(ww_mutex_lock);
+
+int __sched
+ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+ return __ww_rt_mutex_lock(lock, ctx, TASK_INTERRUPTIBLE, _RET_IP_);
+}
+EXPORT_SYMBOL(ww_mutex_lock_interruptible);
+
+void __sched ww_mutex_unlock(struct ww_mutex *lock)
+{
+ struct rt_mutex *rtm = &lock->base;
+
+ __ww_mutex_unlock(lock);
+
+ mutex_release(&rtm->dep_map, _RET_IP_);
+ __rt_mutex_unlock(&rtm->rtmutex);
+}
+EXPORT_SYMBOL(ww_mutex_unlock);
diff --git a/kernel/module-internal.h b/kernel/module-internal.h
deleted file mode 100644
index 915e123a430f..000000000000
--- a/kernel/module-internal.h
+++ /dev/null
@@ -1,12 +0,0 @@
-/* Module internals
- *
- * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
- */
-
-extern int mod_verify_sig(const void *mod, unsigned long *_modlen);
diff --git a/kernel/module.c b/kernel/module.c
deleted file mode 100644
index cfc9e843a924..000000000000
--- a/kernel/module.c
+++ /dev/null
@@ -1,3917 +0,0 @@
-/*
- Copyright (C) 2002 Richard Henderson
- Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM.
-
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-*/
-#include <linux/export.h>
-#include <linux/moduleloader.h>
-#include <linux/ftrace_event.h>
-#include <linux/init.h>
-#include <linux/kallsyms.h>
-#include <linux/file.h>
-#include <linux/fs.h>
-#include <linux/sysfs.h>
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-#include <linux/elf.h>
-#include <linux/proc_fs.h>
-#include <linux/security.h>
-#include <linux/seq_file.h>
-#include <linux/syscalls.h>
-#include <linux/fcntl.h>
-#include <linux/rcupdate.h>
-#include <linux/capability.h>
-#include <linux/cpu.h>
-#include <linux/moduleparam.h>
-#include <linux/errno.h>
-#include <linux/err.h>
-#include <linux/vermagic.h>
-#include <linux/notifier.h>
-#include <linux/sched.h>
-#include <linux/device.h>
-#include <linux/string.h>
-#include <linux/mutex.h>
-#include <linux/rculist.h>
-#include <asm/uaccess.h>
-#include <asm/cacheflush.h>
-#include <asm/mmu_context.h>
-#include <linux/license.h>
-#include <asm/sections.h>
-#include <linux/tracepoint.h>
-#include <linux/ftrace.h>
-#include <linux/async.h>
-#include <linux/percpu.h>
-#include <linux/kmemleak.h>
-#include <linux/jump_label.h>
-#include <linux/pfn.h>
-#include <linux/bsearch.h>
-#include <uapi/linux/module.h>
-#include "module-internal.h"
-
-#define CREATE_TRACE_POINTS
-#include <trace/events/module.h>
-
-#ifndef ARCH_SHF_SMALL
-#define ARCH_SHF_SMALL 0
-#endif
-
-/*
- * Modules' sections will be aligned on page boundaries
- * to ensure complete separation of code and data, but
- * only when CONFIG_DEBUG_SET_MODULE_RONX=y
- */
-#ifdef CONFIG_DEBUG_SET_MODULE_RONX
-# define debug_align(X) ALIGN(X, PAGE_SIZE)
-#else
-# define debug_align(X) (X)
-#endif
-
-/*
- * Given BASE and SIZE this macro calculates the number of pages the
- * memory regions occupies
- */
-#define MOD_NUMBER_OF_PAGES(BASE, SIZE) (((SIZE) > 0) ? \
- (PFN_DOWN((unsigned long)(BASE) + (SIZE) - 1) - \
- PFN_DOWN((unsigned long)BASE) + 1) \
- : (0UL))
-
-/* If this is set, the section belongs in the init part of the module */
-#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1))
-
-/*
- * Mutex protects:
- * 1) List of modules (also safely readable with preempt_disable),
- * 2) module_use links,
- * 3) module_addr_min/module_addr_max.
- * (delete and add uses RCU list operations). */
-DEFINE_MUTEX(module_mutex);
-EXPORT_SYMBOL_GPL(module_mutex);
-static LIST_HEAD(modules);
-#ifdef CONFIG_KGDB_KDB
-struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */
-#endif /* CONFIG_KGDB_KDB */
-
-#ifdef CONFIG_MODULE_SIG
-#ifdef CONFIG_MODULE_SIG_FORCE
-static bool sig_enforce = true;
-#else
-static bool sig_enforce = false;
-
-static int param_set_bool_enable_only(const char *val,
- const struct kernel_param *kp)
-{
- int err;
- bool test;
- struct kernel_param dummy_kp = *kp;
-
- dummy_kp.arg = &test;
-
- err = param_set_bool(val, &dummy_kp);
- if (err)
- return err;
-
- /* Don't let them unset it once it's set! */
- if (!test && sig_enforce)
- return -EROFS;
-
- if (test)
- sig_enforce = true;
- return 0;
-}
-
-static const struct kernel_param_ops param_ops_bool_enable_only = {
- .flags = KERNEL_PARAM_OPS_FL_NOARG,
- .set = param_set_bool_enable_only,
- .get = param_get_bool,
-};
-#define param_check_bool_enable_only param_check_bool
-
-module_param(sig_enforce, bool_enable_only, 0644);
-#endif /* !CONFIG_MODULE_SIG_FORCE */
-#endif /* CONFIG_MODULE_SIG */
-
-/* Block module loading/unloading? */
-int modules_disabled = 0;
-core_param(nomodule, modules_disabled, bint, 0);
-
-/* Waiting for a module to finish initializing? */
-static DECLARE_WAIT_QUEUE_HEAD(module_wq);
-
-static BLOCKING_NOTIFIER_HEAD(module_notify_list);
-
-/* Bounds of module allocation, for speeding __module_address.
- * Protected by module_mutex. */
-static unsigned long module_addr_min = -1UL, module_addr_max = 0;
-
-int register_module_notifier(struct notifier_block *nb)
-{
- return blocking_notifier_chain_register(&module_notify_list, nb);
-}
-EXPORT_SYMBOL(register_module_notifier);
-
-int unregister_module_notifier(struct notifier_block *nb)
-{
- return blocking_notifier_chain_unregister(&module_notify_list, nb);
-}
-EXPORT_SYMBOL(unregister_module_notifier);
-
-struct load_info {
- Elf_Ehdr *hdr;
- unsigned long len;
- Elf_Shdr *sechdrs;
- char *secstrings, *strtab;
- unsigned long symoffs, stroffs;
- struct _ddebug *debug;
- unsigned int num_debug;
- bool sig_ok;
- struct {
- unsigned int sym, str, mod, vers, info, pcpu;
- } index;
-};
-
-/* We require a truly strong try_module_get(): 0 means failure due to
- ongoing or failed initialization etc. */
-static inline int strong_try_module_get(struct module *mod)
-{
- BUG_ON(mod && mod->state == MODULE_STATE_UNFORMED);
- if (mod && mod->state == MODULE_STATE_COMING)
- return -EBUSY;
- if (try_module_get(mod))
- return 0;
- else
- return -ENOENT;
-}
-
-static inline void add_taint_module(struct module *mod, unsigned flag,
- enum lockdep_ok lockdep_ok)
-{
- add_taint(flag, lockdep_ok);
- mod->taints |= (1U << flag);
-}
-
-/*
- * A thread that wants to hold a reference to a module only while it
- * is running can call this to safely exit. nfsd and lockd use this.
- */
-void __module_put_and_exit(struct module *mod, long code)
-{
- module_put(mod);
- do_exit(code);
-}
-EXPORT_SYMBOL(__module_put_and_exit);
-
-/* Find a module section: 0 means not found. */
-static unsigned int find_sec(const struct load_info *info, const char *name)
-{
- unsigned int i;
-
- for (i = 1; i < info->hdr->e_shnum; i++) {
- Elf_Shdr *shdr = &info->sechdrs[i];
- /* Alloc bit cleared means "ignore it." */
- if ((shdr->sh_flags & SHF_ALLOC)
- && strcmp(info->secstrings + shdr->sh_name, name) == 0)
- return i;
- }
- return 0;
-}
-
-/* Find a module section, or NULL. */
-static void *section_addr(const struct load_info *info, const char *name)
-{
- /* Section 0 has sh_addr 0. */
- return (void *)info->sechdrs[find_sec(info, name)].sh_addr;
-}
-
-/* Find a module section, or NULL. Fill in number of "objects" in section. */
-static void *section_objs(const struct load_info *info,
- const char *name,
- size_t object_size,
- unsigned int *num)
-{
- unsigned int sec = find_sec(info, name);
-
- /* Section 0 has sh_addr 0 and sh_size 0. */
- *num = info->sechdrs[sec].sh_size / object_size;
- return (void *)info->sechdrs[sec].sh_addr;
-}
-
-/* Provided by the linker */
-extern const struct kernel_symbol __start___ksymtab[];
-extern const struct kernel_symbol __stop___ksymtab[];
-extern const struct kernel_symbol __start___ksymtab_gpl[];
-extern const struct kernel_symbol __stop___ksymtab_gpl[];
-extern const struct kernel_symbol __start___ksymtab_gpl_future[];
-extern const struct kernel_symbol __stop___ksymtab_gpl_future[];
-extern const unsigned long __start___kcrctab[];
-extern const unsigned long __start___kcrctab_gpl[];
-extern const unsigned long __start___kcrctab_gpl_future[];
-#ifdef CONFIG_UNUSED_SYMBOLS
-extern const struct kernel_symbol __start___ksymtab_unused[];
-extern const struct kernel_symbol __stop___ksymtab_unused[];
-extern const struct kernel_symbol __start___ksymtab_unused_gpl[];
-extern const struct kernel_symbol __stop___ksymtab_unused_gpl[];
-extern const unsigned long __start___kcrctab_unused[];
-extern const unsigned long __start___kcrctab_unused_gpl[];
-#endif
-
-#ifndef CONFIG_MODVERSIONS
-#define symversion(base, idx) NULL
-#else
-#define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL)
-#endif
-
-static bool each_symbol_in_section(const struct symsearch *arr,
- unsigned int arrsize,
- struct module *owner,
- bool (*fn)(const struct symsearch *syms,
- struct module *owner,
- void *data),
- void *data)
-{
- unsigned int j;
-
- for (j = 0; j < arrsize; j++) {
- if (fn(&arr[j], owner, data))
- return true;
- }
-
- return false;
-}
-
-/* Returns true as soon as fn returns true, otherwise false. */
-bool each_symbol_section(bool (*fn)(const struct symsearch *arr,
- struct module *owner,
- void *data),
- void *data)
-{
- struct module *mod;
- static const struct symsearch arr[] = {
- { __start___ksymtab, __stop___ksymtab, __start___kcrctab,
- NOT_GPL_ONLY, false },
- { __start___ksymtab_gpl, __stop___ksymtab_gpl,
- __start___kcrctab_gpl,
- GPL_ONLY, false },
- { __start___ksymtab_gpl_future, __stop___ksymtab_gpl_future,
- __start___kcrctab_gpl_future,
- WILL_BE_GPL_ONLY, false },
-#ifdef CONFIG_UNUSED_SYMBOLS
- { __start___ksymtab_unused, __stop___ksymtab_unused,
- __start___kcrctab_unused,
- NOT_GPL_ONLY, true },
- { __start___ksymtab_unused_gpl, __stop___ksymtab_unused_gpl,
- __start___kcrctab_unused_gpl,
- GPL_ONLY, true },
-#endif
- };
-
- if (each_symbol_in_section(arr, ARRAY_SIZE(arr), NULL, fn, data))
- return true;
-
- list_for_each_entry_rcu(mod, &modules, list) {
- struct symsearch arr[] = {
- { mod->syms, mod->syms + mod->num_syms, mod->crcs,
- NOT_GPL_ONLY, false },
- { mod->gpl_syms, mod->gpl_syms + mod->num_gpl_syms,
- mod->gpl_crcs,
- GPL_ONLY, false },
- { mod->gpl_future_syms,
- mod->gpl_future_syms + mod->num_gpl_future_syms,
- mod->gpl_future_crcs,
- WILL_BE_GPL_ONLY, false },
-#ifdef CONFIG_UNUSED_SYMBOLS
- { mod->unused_syms,
- mod->unused_syms + mod->num_unused_syms,
- mod->unused_crcs,
- NOT_GPL_ONLY, true },
- { mod->unused_gpl_syms,
- mod->unused_gpl_syms + mod->num_unused_gpl_syms,
- mod->unused_gpl_crcs,
- GPL_ONLY, true },
-#endif
- };
-
- if (mod->state == MODULE_STATE_UNFORMED)
- continue;
-
- if (each_symbol_in_section(arr, ARRAY_SIZE(arr), mod, fn, data))
- return true;
- }
- return false;
-}
-EXPORT_SYMBOL_GPL(each_symbol_section);
-
-struct find_symbol_arg {
- /* Input */
- const char *name;
- bool gplok;
- bool warn;
-
- /* Output */
- struct module *owner;
- const unsigned long *crc;
- const struct kernel_symbol *sym;
-};
-
-static bool check_symbol(const struct symsearch *syms,
- struct module *owner,
- unsigned int symnum, void *data)
-{
- struct find_symbol_arg *fsa = data;
-
- if (!fsa->gplok) {
- if (syms->licence == GPL_ONLY)
- return false;
- if (syms->licence == WILL_BE_GPL_ONLY && fsa->warn) {
- pr_warn("Symbol %s is being used by a non-GPL module, "
- "which will not be allowed in the future\n",
- fsa->name);
- }
- }
-
-#ifdef CONFIG_UNUSED_SYMBOLS
- if (syms->unused && fsa->warn) {
- pr_warn("Symbol %s is marked as UNUSED, however this module is "
- "using it.\n", fsa->name);
- pr_warn("This symbol will go away in the future.\n");
- pr_warn("Please evaluate if this is the right api to use and "
- "if it really is, submit a report to the linux kernel "
- "mailing list together with submitting your code for "
- "inclusion.\n");
- }
-#endif
-
- fsa->owner = owner;
- fsa->crc = symversion(syms->crcs, symnum);
- fsa->sym = &syms->start[symnum];
- return true;
-}
-
-static int cmp_name(const void *va, const void *vb)
-{
- const char *a;
- const struct kernel_symbol *b;
- a = va; b = vb;
- return strcmp(a, b->name);
-}
-
-static bool find_symbol_in_section(const struct symsearch *syms,
- struct module *owner,
- void *data)
-{
- struct find_symbol_arg *fsa = data;
- struct kernel_symbol *sym;
-
- sym = bsearch(fsa->name, syms->start, syms->stop - syms->start,
- sizeof(struct kernel_symbol), cmp_name);
-
- if (sym != NULL && check_symbol(syms, owner, sym - syms->start, data))
- return true;
-
- return false;
-}
-
-/* Find a symbol and return it, along with, (optional) crc and
- * (optional) module which owns it. Needs preempt disabled or module_mutex. */
-const struct kernel_symbol *find_symbol(const char *name,
- struct module **owner,
- const unsigned long **crc,
- bool gplok,
- bool warn)
-{
- struct find_symbol_arg fsa;
-
- fsa.name = name;
- fsa.gplok = gplok;
- fsa.warn = warn;
-
- if (each_symbol_section(find_symbol_in_section, &fsa)) {
- if (owner)
- *owner = fsa.owner;
- if (crc)
- *crc = fsa.crc;
- return fsa.sym;
- }
-
- pr_debug("Failed to find symbol %s\n", name);
- return NULL;
-}
-EXPORT_SYMBOL_GPL(find_symbol);
-
-/* Search for module by name: must hold module_mutex. */
-static struct module *find_module_all(const char *name, size_t len,
- bool even_unformed)
-{
- struct module *mod;
-
- list_for_each_entry(mod, &modules, list) {
- if (!even_unformed && mod->state == MODULE_STATE_UNFORMED)
- continue;
- if (strlen(mod->name) == len && !memcmp(mod->name, name, len))
- return mod;
- }
- return NULL;
-}
-
-struct module *find_module(const char *name)
-{
- return find_module_all(name, strlen(name), false);
-}
-EXPORT_SYMBOL_GPL(find_module);
-
-#ifdef CONFIG_SMP
-
-static inline void __percpu *mod_percpu(struct module *mod)
-{
- return mod->percpu;
-}
-
-static int percpu_modalloc(struct module *mod, struct load_info *info)
-{
- Elf_Shdr *pcpusec = &info->sechdrs[info->index.pcpu];
- unsigned long align = pcpusec->sh_addralign;
-
- if (!pcpusec->sh_size)
- return 0;
-
- if (align > PAGE_SIZE) {
- pr_warn("%s: per-cpu alignment %li > %li\n",
- mod->name, align, PAGE_SIZE);
- align = PAGE_SIZE;
- }
-
- mod->percpu = __alloc_reserved_percpu(pcpusec->sh_size, align);
- if (!mod->percpu) {
- pr_warn("%s: Could not allocate %lu bytes percpu data\n",
- mod->name, (unsigned long)pcpusec->sh_size);
- return -ENOMEM;
- }
- mod->percpu_size = pcpusec->sh_size;
- return 0;
-}
-
-static void percpu_modfree(struct module *mod)
-{
- free_percpu(mod->percpu);
-}
-
-static unsigned int find_pcpusec(struct load_info *info)
-{
- return find_sec(info, ".data..percpu");
-}
-
-static void percpu_modcopy(struct module *mod,
- const void *from, unsigned long size)
-{
- int cpu;
-
- for_each_possible_cpu(cpu)
- memcpy(per_cpu_ptr(mod->percpu, cpu), from, size);
-}
-
-/**
- * is_module_percpu_address - test whether address is from module static percpu
- * @addr: address to test
- *
- * Test whether @addr belongs to module static percpu area.
- *
- * RETURNS:
- * %true if @addr is from module static percpu area
- */
-bool is_module_percpu_address(unsigned long addr)
-{
- struct module *mod;
- unsigned int cpu;
-
- preempt_disable();
-
- list_for_each_entry_rcu(mod, &modules, list) {
- if (mod->state == MODULE_STATE_UNFORMED)
- continue;
- if (!mod->percpu_size)
- continue;
- for_each_possible_cpu(cpu) {
- void *start = per_cpu_ptr(mod->percpu, cpu);
-
- if ((void *)addr >= start &&
- (void *)addr < start + mod->percpu_size) {
- preempt_enable();
- return true;
- }
- }
- }
-
- preempt_enable();
- return false;
-}
-
-#else /* ... !CONFIG_SMP */
-
-static inline void __percpu *mod_percpu(struct module *mod)
-{
- return NULL;
-}
-static int percpu_modalloc(struct module *mod, struct load_info *info)
-{
- /* UP modules shouldn't have this section: ENOMEM isn't quite right */
- if (info->sechdrs[info->index.pcpu].sh_size != 0)
- return -ENOMEM;
- return 0;
-}
-static inline void percpu_modfree(struct module *mod)
-{
-}
-static unsigned int find_pcpusec(struct load_info *info)
-{
- return 0;
-}
-static inline void percpu_modcopy(struct module *mod,
- const void *from, unsigned long size)
-{
- /* pcpusec should be 0, and size of that section should be 0. */
- BUG_ON(size != 0);
-}
-bool is_module_percpu_address(unsigned long addr)
-{
- return false;
-}
-
-#endif /* CONFIG_SMP */
-
-#define MODINFO_ATTR(field) \
-static void setup_modinfo_##field(struct module *mod, const char *s) \
-{ \
- mod->field = kstrdup(s, GFP_KERNEL); \
-} \
-static ssize_t show_modinfo_##field(struct module_attribute *mattr, \
- struct module_kobject *mk, char *buffer) \
-{ \
- return scnprintf(buffer, PAGE_SIZE, "%s\n", mk->mod->field); \
-} \
-static int modinfo_##field##_exists(struct module *mod) \
-{ \
- return mod->field != NULL; \
-} \
-static void free_modinfo_##field(struct module *mod) \
-{ \
- kfree(mod->field); \
- mod->field = NULL; \
-} \
-static struct module_attribute modinfo_##field = { \
- .attr = { .name = __stringify(field), .mode = 0444 }, \
- .show = show_modinfo_##field, \
- .setup = setup_modinfo_##field, \
- .test = modinfo_##field##_exists, \
- .free = free_modinfo_##field, \
-};
-
-MODINFO_ATTR(version);
-MODINFO_ATTR(srcversion);
-
-static char last_unloaded_module[MODULE_NAME_LEN+1];
-
-#ifdef CONFIG_MODULE_UNLOAD
-
-EXPORT_TRACEPOINT_SYMBOL(module_get);
-
-/* MODULE_REF_BASE is the base reference count by kmodule loader. */
-#define MODULE_REF_BASE 1
-
-/* Init the unload section of the module. */
-static int module_unload_init(struct module *mod)
-{
- /*
- * Initialize reference counter to MODULE_REF_BASE.
- * refcnt == 0 means module is going.
- */
- atomic_set(&mod->refcnt, MODULE_REF_BASE);
-
- INIT_LIST_HEAD(&mod->source_list);
- INIT_LIST_HEAD(&mod->target_list);
-
- /* Hold reference count during initialization. */
- atomic_inc(&mod->refcnt);
-
- return 0;
-}
-
-/* Does a already use b? */
-static int already_uses(struct module *a, struct module *b)
-{
- struct module_use *use;
-
- list_for_each_entry(use, &b->source_list, source_list) {
- if (use->source == a) {
- pr_debug("%s uses %s!\n", a->name, b->name);
- return 1;
- }
- }
- pr_debug("%s does not use %s!\n", a->name, b->name);
- return 0;
-}
-
-/*
- * Module a uses b
- * - we add 'a' as a "source", 'b' as a "target" of module use
- * - the module_use is added to the list of 'b' sources (so
- * 'b' can walk the list to see who sourced them), and of 'a'
- * targets (so 'a' can see what modules it targets).
- */
-static int add_module_usage(struct module *a, struct module *b)
-{
- struct module_use *use;
-
- pr_debug("Allocating new usage for %s.\n", a->name);
- use = kmalloc(sizeof(*use), GFP_ATOMIC);
- if (!use) {
- pr_warn("%s: out of memory loading\n", a->name);
- return -ENOMEM;
- }
-
- use->source = a;
- use->target = b;
- list_add(&use->source_list, &b->source_list);
- list_add(&use->target_list, &a->target_list);
- return 0;
-}
-
-/* Module a uses b: caller needs module_mutex() */
-int ref_module(struct module *a, struct module *b)
-{
- int err;
-
- if (b == NULL || already_uses(a, b))
- return 0;
-
- /* If module isn't available, we fail. */
- err = strong_try_module_get(b);
- if (err)
- return err;
-
- err = add_module_usage(a, b);
- if (err) {
- module_put(b);
- return err;
- }
- return 0;
-}
-EXPORT_SYMBOL_GPL(ref_module);
-
-/* Clear the unload stuff of the module. */
-static void module_unload_free(struct module *mod)
-{
- struct module_use *use, *tmp;
-
- mutex_lock(&module_mutex);
- list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) {
- struct module *i = use->target;
- pr_debug("%s unusing %s\n", mod->name, i->name);
- module_put(i);
- list_del(&use->source_list);
- list_del(&use->target_list);
- kfree(use);
- }
- mutex_unlock(&module_mutex);
-}
-
-#ifdef CONFIG_MODULE_FORCE_UNLOAD
-static inline int try_force_unload(unsigned int flags)
-{
- int ret = (flags & O_TRUNC);
- if (ret)
- add_taint(TAINT_FORCED_RMMOD, LOCKDEP_NOW_UNRELIABLE);
- return ret;
-}
-#else
-static inline int try_force_unload(unsigned int flags)
-{
- return 0;
-}
-#endif /* CONFIG_MODULE_FORCE_UNLOAD */
-
-/* Try to release refcount of module, 0 means success. */
-static int try_release_module_ref(struct module *mod)
-{
- int ret;
-
- /* Try to decrement refcnt which we set at loading */
- ret = atomic_sub_return(MODULE_REF_BASE, &mod->refcnt);
- BUG_ON(ret < 0);
- if (ret)
- /* Someone can put this right now, recover with checking */
- ret = atomic_add_unless(&mod->refcnt, MODULE_REF_BASE, 0);
-
- return ret;
-}
-
-static int try_stop_module(struct module *mod, int flags, int *forced)
-{
- /* If it's not unused, quit unless we're forcing. */
- if (try_release_module_ref(mod) != 0) {
- *forced = try_force_unload(flags);
- if (!(*forced))
- return -EWOULDBLOCK;
- }
-
- /* Mark it as dying. */
- mod->state = MODULE_STATE_GOING;
-
- return 0;
-}
-
-/**
- * module_refcount - return the refcount or -1 if unloading
- *
- * @mod: the module we're checking
- *
- * Returns:
- * -1 if the module is in the process of unloading
- * otherwise the number of references in the kernel to the module
- */
-int module_refcount(struct module *mod)
-{
- return atomic_read(&mod->refcnt) - MODULE_REF_BASE;
-}
-EXPORT_SYMBOL(module_refcount);
-
-/* This exists whether we can unload or not */
-static void free_module(struct module *mod);
-
-SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
- unsigned int, flags)
-{
- struct module *mod;
- char name[MODULE_NAME_LEN];
- int ret, forced = 0;
-
- if (!capable(CAP_SYS_MODULE) || modules_disabled)
- return -EPERM;
-
- if (strncpy_from_user(name, name_user, MODULE_NAME_LEN-1) < 0)
- return -EFAULT;
- name[MODULE_NAME_LEN-1] = '\0';
-
- if (mutex_lock_interruptible(&module_mutex) != 0)
- return -EINTR;
-
- mod = find_module(name);
- if (!mod) {
- ret = -ENOENT;
- goto out;
- }
-
- if (!list_empty(&mod->source_list)) {
- /* Other modules depend on us: get rid of them first. */
- ret = -EWOULDBLOCK;
- goto out;
- }
-
- /* Doing init or already dying? */
- if (mod->state != MODULE_STATE_LIVE) {
- /* FIXME: if (force), slam module count damn the torpedoes */
- pr_debug("%s already dying\n", mod->name);
- ret = -EBUSY;
- goto out;
- }
-
- /* If it has an init func, it must have an exit func to unload */
- if (mod->init && !mod->exit) {
- forced = try_force_unload(flags);
- if (!forced) {
- /* This module can't be removed */
- ret = -EBUSY;
- goto out;
- }
- }
-
- /* Stop the machine so refcounts can't move and disable module. */
- ret = try_stop_module(mod, flags, &forced);
- if (ret != 0)
- goto out;
-
- mutex_unlock(&module_mutex);
- /* Final destruction now no one is using it. */
- if (mod->exit != NULL)
- mod->exit();
- blocking_notifier_call_chain(&module_notify_list,
- MODULE_STATE_GOING, mod);
- async_synchronize_full();
-
- /* Store the name of the last unloaded module for diagnostic purposes */
- strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
-
- free_module(mod);
- return 0;
-out:
- mutex_unlock(&module_mutex);
- return ret;
-}
-
-static inline void print_unload_info(struct seq_file *m, struct module *mod)
-{
- struct module_use *use;
- int printed_something = 0;
-
- seq_printf(m, " %i ", module_refcount(mod));
-
- /*
- * Always include a trailing , so userspace can differentiate
- * between this and the old multi-field proc format.
- */
- list_for_each_entry(use, &mod->source_list, source_list) {
- printed_something = 1;
- seq_printf(m, "%s,", use->source->name);
- }
-
- if (mod->init != NULL && mod->exit == NULL) {
- printed_something = 1;
- seq_puts(m, "[permanent],");
- }
-
- if (!printed_something)
- seq_puts(m, "-");
-}
-
-void __symbol_put(const char *symbol)
-{
- struct module *owner;
-
- preempt_disable();
- if (!find_symbol(symbol, &owner, NULL, true, false))
- BUG();
- module_put(owner);
- preempt_enable();
-}
-EXPORT_SYMBOL(__symbol_put);
-
-/* Note this assumes addr is a function, which it currently always is. */
-void symbol_put_addr(void *addr)
-{
- struct module *modaddr;
- unsigned long a = (unsigned long)dereference_function_descriptor(addr);
-
- if (core_kernel_text(a))
- return;
-
- /* module_text_address is safe here: we're supposed to have reference
- * to module from symbol_get, so it can't go away. */
- modaddr = __module_text_address(a);
- BUG_ON(!modaddr);
- module_put(modaddr);
-}
-EXPORT_SYMBOL_GPL(symbol_put_addr);
-
-static ssize_t show_refcnt(struct module_attribute *mattr,
- struct module_kobject *mk, char *buffer)
-{
- return sprintf(buffer, "%i\n", module_refcount(mk->mod));
-}
-
-static struct module_attribute modinfo_refcnt =
- __ATTR(refcnt, 0444, show_refcnt, NULL);
-
-void __module_get(struct module *module)
-{
- if (module) {
- preempt_disable();
- atomic_inc(&module->refcnt);
- trace_module_get(module, _RET_IP_);
- preempt_enable();
- }
-}
-EXPORT_SYMBOL(__module_get);
-
-bool try_module_get(struct module *module)
-{
- bool ret = true;
-
- if (module) {
- preempt_disable();
- /* Note: here, we can fail to get a reference */
- if (likely(module_is_live(module) &&
- atomic_inc_not_zero(&module->refcnt) != 0))
- trace_module_get(module, _RET_IP_);
- else
- ret = false;
-
- preempt_enable();
- }
- return ret;
-}
-EXPORT_SYMBOL(try_module_get);
-
-void module_put(struct module *module)
-{
- int ret;
-
- if (module) {
- preempt_disable();
- ret = atomic_dec_if_positive(&module->refcnt);
- WARN_ON(ret < 0); /* Failed to put refcount */
- trace_module_put(module, _RET_IP_);
- preempt_enable();
- }
-}
-EXPORT_SYMBOL(module_put);
-
-#else /* !CONFIG_MODULE_UNLOAD */
-static inline void print_unload_info(struct seq_file *m, struct module *mod)
-{
- /* We don't know the usage count, or what modules are using. */
- seq_puts(m, " - -");
-}
-
-static inline void module_unload_free(struct module *mod)
-{
-}
-
-int ref_module(struct module *a, struct module *b)
-{
- return strong_try_module_get(b);
-}
-EXPORT_SYMBOL_GPL(ref_module);
-
-static inline int module_unload_init(struct module *mod)
-{
- return 0;
-}
-#endif /* CONFIG_MODULE_UNLOAD */
-
-static size_t module_flags_taint(struct module *mod, char *buf)
-{
- size_t l = 0;
-
- if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE))
- buf[l++] = 'P';
- if (mod->taints & (1 << TAINT_OOT_MODULE))
- buf[l++] = 'O';
- if (mod->taints & (1 << TAINT_FORCED_MODULE))
- buf[l++] = 'F';
- if (mod->taints & (1 << TAINT_CRAP))
- buf[l++] = 'C';
- if (mod->taints & (1 << TAINT_UNSIGNED_MODULE))
- buf[l++] = 'E';
- /*
- * TAINT_FORCED_RMMOD: could be added.
- * TAINT_CPU_OUT_OF_SPEC, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
- * apply to modules.
- */
- return l;
-}
-
-static ssize_t show_initstate(struct module_attribute *mattr,
- struct module_kobject *mk, char *buffer)
-{
- const char *state = "unknown";
-
- switch (mk->mod->state) {
- case MODULE_STATE_LIVE:
- state = "live";
- break;
- case MODULE_STATE_COMING:
- state = "coming";
- break;
- case MODULE_STATE_GOING:
- state = "going";
- break;
- default:
- BUG();
- }
- return sprintf(buffer, "%s\n", state);
-}
-
-static struct module_attribute modinfo_initstate =
- __ATTR(initstate, 0444, show_initstate, NULL);
-
-static ssize_t store_uevent(struct module_attribute *mattr,
- struct module_kobject *mk,
- const char *buffer, size_t count)
-{
- enum kobject_action action;
-
- if (kobject_action_type(buffer, count, &action) == 0)
- kobject_uevent(&mk->kobj, action);
- return count;
-}
-
-struct module_attribute module_uevent =
- __ATTR(uevent, 0200, NULL, store_uevent);
-
-static ssize_t show_coresize(struct module_attribute *mattr,
- struct module_kobject *mk, char *buffer)
-{
- return sprintf(buffer, "%u\n", mk->mod->core_size);
-}
-
-static struct module_attribute modinfo_coresize =
- __ATTR(coresize, 0444, show_coresize, NULL);
-
-static ssize_t show_initsize(struct module_attribute *mattr,
- struct module_kobject *mk, char *buffer)
-{
- return sprintf(buffer, "%u\n", mk->mod->init_size);
-}
-
-static struct module_attribute modinfo_initsize =
- __ATTR(initsize, 0444, show_initsize, NULL);
-
-static ssize_t show_taint(struct module_attribute *mattr,
- struct module_kobject *mk, char *buffer)
-{
- size_t l;
-
- l = module_flags_taint(mk->mod, buffer);
- buffer[l++] = '\n';
- return l;
-}
-
-static struct module_attribute modinfo_taint =
- __ATTR(taint, 0444, show_taint, NULL);
-
-static struct module_attribute *modinfo_attrs[] = {
- &module_uevent,
- &modinfo_version,
- &modinfo_srcversion,
- &modinfo_initstate,
- &modinfo_coresize,
- &modinfo_initsize,
- &modinfo_taint,
-#ifdef CONFIG_MODULE_UNLOAD
- &modinfo_refcnt,
-#endif
- NULL,
-};
-
-static const char vermagic[] = VERMAGIC_STRING;
-
-static int try_to_force_load(struct module *mod, const char *reason)
-{
-#ifdef CONFIG_MODULE_FORCE_LOAD
- if (!test_taint(TAINT_FORCED_MODULE))
- pr_warn("%s: %s: kernel tainted.\n", mod->name, reason);
- add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_NOW_UNRELIABLE);
- return 0;
-#else
- return -ENOEXEC;
-#endif
-}
-
-#ifdef CONFIG_MODVERSIONS
-/* If the arch applies (non-zero) relocations to kernel kcrctab, unapply it. */
-static unsigned long maybe_relocated(unsigned long crc,
- const struct module *crc_owner)
-{
-#ifdef ARCH_RELOCATES_KCRCTAB
- if (crc_owner == NULL)
- return crc - (unsigned long)reloc_start;
-#endif
- return crc;
-}
-
-static int check_version(Elf_Shdr *sechdrs,
- unsigned int versindex,
- const char *symname,
- struct module *mod,
- const unsigned long *crc,
- const struct module *crc_owner)
-{
- unsigned int i, num_versions;
- struct modversion_info *versions;
-
- /* Exporting module didn't supply crcs? OK, we're already tainted. */
- if (!crc)
- return 1;
-
- /* No versions at all? modprobe --force does this. */
- if (versindex == 0)
- return try_to_force_load(mod, symname) == 0;
-
- versions = (void *) sechdrs[versindex].sh_addr;
- num_versions = sechdrs[versindex].sh_size
- / sizeof(struct modversion_info);
-
- for (i = 0; i < num_versions; i++) {
- if (strcmp(versions[i].name, symname) != 0)
- continue;
-
- if (versions[i].crc == maybe_relocated(*crc, crc_owner))
- return 1;
- pr_debug("Found checksum %lX vs module %lX\n",
- maybe_relocated(*crc, crc_owner), versions[i].crc);
- goto bad_version;
- }
-
- pr_warn("%s: no symbol version for %s\n", mod->name, symname);
- return 0;
-
-bad_version:
- pr_warn("%s: disagrees about version of symbol %s\n",
- mod->name, symname);
- return 0;
-}
-
-static inline int check_modstruct_version(Elf_Shdr *sechdrs,
- unsigned int versindex,
- struct module *mod)
-{
- const unsigned long *crc;
-
- /* Since this should be found in kernel (which can't be removed),
- * no locking is necessary. */
- if (!find_symbol(VMLINUX_SYMBOL_STR(module_layout), NULL,
- &crc, true, false))
- BUG();
- return check_version(sechdrs, versindex,
- VMLINUX_SYMBOL_STR(module_layout), mod, crc,
- NULL);
-}
-
-/* First part is kernel version, which we ignore if module has crcs. */
-static inline int same_magic(const char *amagic, const char *bmagic,
- bool has_crcs)
-{
- if (has_crcs) {
- amagic += strcspn(amagic, " ");
- bmagic += strcspn(bmagic, " ");
- }
- return strcmp(amagic, bmagic) == 0;
-}
-#else
-static inline int check_version(Elf_Shdr *sechdrs,
- unsigned int versindex,
- const char *symname,
- struct module *mod,
- const unsigned long *crc,
- const struct module *crc_owner)
-{
- return 1;
-}
-
-static inline int check_modstruct_version(Elf_Shdr *sechdrs,
- unsigned int versindex,
- struct module *mod)
-{
- return 1;
-}
-
-static inline int same_magic(const char *amagic, const char *bmagic,
- bool has_crcs)
-{
- return strcmp(amagic, bmagic) == 0;
-}
-#endif /* CONFIG_MODVERSIONS */
-
-/* Resolve a symbol for this module. I.e. if we find one, record usage. */
-static const struct kernel_symbol *resolve_symbol(struct module *mod,
- const struct load_info *info,
- const char *name,
- char ownername[])
-{
- struct module *owner;
- const struct kernel_symbol *sym;
- const unsigned long *crc;
- int err;
-
- /*
- * The module_mutex should not be a heavily contended lock;
- * if we get the occasional sleep here, we'll go an extra iteration
- * in the wait_event_interruptible(), which is harmless.
- */
- sched_annotate_sleep();
- mutex_lock(&module_mutex);
- sym = find_symbol(name, &owner, &crc,
- !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true);
- if (!sym)
- goto unlock;
-
- if (!check_version(info->sechdrs, info->index.vers, name, mod, crc,
- owner)) {
- sym = ERR_PTR(-EINVAL);
- goto getname;
- }
-
- err = ref_module(mod, owner);
- if (err) {
- sym = ERR_PTR(err);
- goto getname;
- }
-
-getname:
- /* We must make copy under the lock if we failed to get ref. */
- strncpy(ownername, module_name(owner), MODULE_NAME_LEN);
-unlock:
- mutex_unlock(&module_mutex);
- return sym;
-}
-
-static const struct kernel_symbol *
-resolve_symbol_wait(struct module *mod,
- const struct load_info *info,
- const char *name)
-{
- const struct kernel_symbol *ksym;
- char owner[MODULE_NAME_LEN];
-
- if (wait_event_interruptible_timeout(module_wq,
- !IS_ERR(ksym = resolve_symbol(mod, info, name, owner))
- || PTR_ERR(ksym) != -EBUSY,
- 30 * HZ) <= 0) {
- pr_warn("%s: gave up waiting for init of module %s.\n",
- mod->name, owner);
- }
- return ksym;
-}
-
-/*
- * /sys/module/foo/sections stuff
- * J. Corbet <corbet@lwn.net>
- */
-#ifdef CONFIG_SYSFS
-
-#ifdef CONFIG_KALLSYMS
-static inline bool sect_empty(const Elf_Shdr *sect)
-{
- return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0;
-}
-
-struct module_sect_attr {
- struct module_attribute mattr;
- char *name;
- unsigned long address;
-};
-
-struct module_sect_attrs {
- struct attribute_group grp;
- unsigned int nsections;
- struct module_sect_attr attrs[0];
-};
-
-static ssize_t module_sect_show(struct module_attribute *mattr,
- struct module_kobject *mk, char *buf)
-{
- struct module_sect_attr *sattr =
- container_of(mattr, struct module_sect_attr, mattr);
- return sprintf(buf, "0x%pK\n", (void *)sattr->address);
-}
-
-static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
-{
- unsigned int section;
-
- for (section = 0; section < sect_attrs->nsections; section++)
- kfree(sect_attrs->attrs[section].name);
- kfree(sect_attrs);
-}
-
-static void add_sect_attrs(struct module *mod, const struct load_info *info)
-{
- unsigned int nloaded = 0, i, size[2];
- struct module_sect_attrs *sect_attrs;
- struct module_sect_attr *sattr;
- struct attribute **gattr;
-
- /* Count loaded sections and allocate structures */
- for (i = 0; i < info->hdr->e_shnum; i++)
- if (!sect_empty(&info->sechdrs[i]))
- nloaded++;
- size[0] = ALIGN(sizeof(*sect_attrs)
- + nloaded * sizeof(sect_attrs->attrs[0]),
- sizeof(sect_attrs->grp.attrs[0]));
- size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.attrs[0]);
- sect_attrs = kzalloc(size[0] + size[1], GFP_KERNEL);
- if (sect_attrs == NULL)
- return;
-
- /* Setup section attributes. */
- sect_attrs->grp.name = "sections";
- sect_attrs->grp.attrs = (void *)sect_attrs + size[0];
-
- sect_attrs->nsections = 0;
- sattr = &sect_attrs->attrs[0];
- gattr = &sect_attrs->grp.attrs[0];
- for (i = 0; i < info->hdr->e_shnum; i++) {
- Elf_Shdr *sec = &info->sechdrs[i];
- if (sect_empty(sec))
- continue;
- sattr->address = sec->sh_addr;
- sattr->name = kstrdup(info->secstrings + sec->sh_name,
- GFP_KERNEL);
- if (sattr->name == NULL)
- goto out;
- sect_attrs->nsections++;
- sysfs_attr_init(&sattr->mattr.attr);
- sattr->mattr.show = module_sect_show;
- sattr->mattr.store = NULL;
- sattr->mattr.attr.name = sattr->name;
- sattr->mattr.attr.mode = S_IRUGO;
- *(gattr++) = &(sattr++)->mattr.attr;
- }
- *gattr = NULL;
-
- if (sysfs_create_group(&mod->mkobj.kobj, &sect_attrs->grp))
- goto out;
-
- mod->sect_attrs = sect_attrs;
- return;
- out:
- free_sect_attrs(sect_attrs);
-}
-
-static void remove_sect_attrs(struct module *mod)
-{
- if (mod->sect_attrs) {
- sysfs_remove_group(&mod->mkobj.kobj,
- &mod->sect_attrs->grp);
- /* We are positive that no one is using any sect attrs
- * at this point. Deallocate immediately. */
- free_sect_attrs(mod->sect_attrs);
- mod->sect_attrs = NULL;
- }
-}
-
-/*
- * /sys/module/foo/notes/.section.name gives contents of SHT_NOTE sections.
- */
-
-struct module_notes_attrs {
- struct kobject *dir;
- unsigned int notes;
- struct bin_attribute attrs[0];
-};
-
-static ssize_t module_notes_read(struct file *filp, struct kobject *kobj,
- struct bin_attribute *bin_attr,
- char *buf, loff_t pos, size_t count)
-{
- /*
- * The caller checked the pos and count against our size.
- */
- memcpy(buf, bin_attr->private + pos, count);
- return count;
-}
-
-static void free_notes_attrs(struct module_notes_attrs *notes_attrs,
- unsigned int i)
-{
- if (notes_attrs->dir) {
- while (i-- > 0)
- sysfs_remove_bin_file(notes_attrs->dir,
- &notes_attrs->attrs[i]);
- kobject_put(notes_attrs->dir);
- }
- kfree(notes_attrs);
-}
-
-static void add_notes_attrs(struct module *mod, const struct load_info *info)
-{
- unsigned int notes, loaded, i;
- struct module_notes_attrs *notes_attrs;
- struct bin_attribute *nattr;
-
- /* failed to create section attributes, so can't create notes */
- if (!mod->sect_attrs)
- return;
-
- /* Count notes sections and allocate structures. */
- notes = 0;
- for (i = 0; i < info->hdr->e_shnum; i++)
- if (!sect_empty(&info->sechdrs[i]) &&
- (info->sechdrs[i].sh_type == SHT_NOTE))
- ++notes;
-
- if (notes == 0)
- return;
-
- notes_attrs = kzalloc(sizeof(*notes_attrs)
- + notes * sizeof(notes_attrs->attrs[0]),
- GFP_KERNEL);
- if (notes_attrs == NULL)
- return;
-
- notes_attrs->notes = notes;
- nattr = &notes_attrs->attrs[0];
- for (loaded = i = 0; i < info->hdr->e_shnum; ++i) {
- if (sect_empty(&info->sechdrs[i]))
- continue;
- if (info->sechdrs[i].sh_type == SHT_NOTE) {
- sysfs_bin_attr_init(nattr);
- nattr->attr.name = mod->sect_attrs->attrs[loaded].name;
- nattr->attr.mode = S_IRUGO;
- nattr->size = info->sechdrs[i].sh_size;
- nattr->private = (void *) info->sechdrs[i].sh_addr;
- nattr->read = module_notes_read;
- ++nattr;
- }
- ++loaded;
- }
-
- notes_attrs->dir = kobject_create_and_add("notes", &mod->mkobj.kobj);
- if (!notes_attrs->dir)
- goto out;
-
- for (i = 0; i < notes; ++i)
- if (sysfs_create_bin_file(notes_attrs->dir,
- &notes_attrs->attrs[i]))
- goto out;
-
- mod->notes_attrs = notes_attrs;
- return;
-
- out:
- free_notes_attrs(notes_attrs, i);
-}
-
-static void remove_notes_attrs(struct module *mod)
-{
- if (mod->notes_attrs)
- free_notes_attrs(mod->notes_attrs, mod->notes_attrs->notes);
-}
-
-#else
-
-static inline void add_sect_attrs(struct module *mod,
- const struct load_info *info)
-{
-}
-
-static inline void remove_sect_attrs(struct module *mod)
-{
-}
-
-static inline void add_notes_attrs(struct module *mod,
- const struct load_info *info)
-{
-}
-
-static inline void remove_notes_attrs(struct module *mod)
-{
-}
-#endif /* CONFIG_KALLSYMS */
-
-static void add_usage_links(struct module *mod)
-{
-#ifdef CONFIG_MODULE_UNLOAD
- struct module_use *use;
- int nowarn;
-
- mutex_lock(&module_mutex);
- list_for_each_entry(use, &mod->target_list, target_list) {
- nowarn = sysfs_create_link(use->target->holders_dir,
- &mod->mkobj.kobj, mod->name);
- }
- mutex_unlock(&module_mutex);
-#endif
-}
-
-static void del_usage_links(struct module *mod)
-{
-#ifdef CONFIG_MODULE_UNLOAD
- struct module_use *use;
-
- mutex_lock(&module_mutex);
- list_for_each_entry(use, &mod->target_list, target_list)
- sysfs_remove_link(use->target->holders_dir, mod->name);
- mutex_unlock(&module_mutex);
-#endif
-}
-
-static int module_add_modinfo_attrs(struct module *mod)
-{
- struct module_attribute *attr;
- struct module_attribute *temp_attr;
- int error = 0;
- int i;
-
- mod->modinfo_attrs = kzalloc((sizeof(struct module_attribute) *
- (ARRAY_SIZE(modinfo_attrs) + 1)),
- GFP_KERNEL);
- if (!mod->modinfo_attrs)
- return -ENOMEM;
-
- temp_attr = mod->modinfo_attrs;
- for (i = 0; (attr = modinfo_attrs[i]) && !error; i++) {
- if (!attr->test ||
- (attr->test && attr->test(mod))) {
- memcpy(temp_attr, attr, sizeof(*temp_attr));
- sysfs_attr_init(&temp_attr->attr);
- error = sysfs_create_file(&mod->mkobj.kobj,
- &temp_attr->attr);
- ++temp_attr;
- }
- }
- return error;
-}
-
-static void module_remove_modinfo_attrs(struct module *mod)
-{
- struct module_attribute *attr;
- int i;
-
- for (i = 0; (attr = &mod->modinfo_attrs[i]); i++) {
- /* pick a field to test for end of list */
- if (!attr->attr.name)
- break;
- sysfs_remove_file(&mod->mkobj.kobj, &attr->attr);
- if (attr->free)
- attr->free(mod);
- }
- kfree(mod->modinfo_attrs);
-}
-
-static void mod_kobject_put(struct module *mod)
-{
- DECLARE_COMPLETION_ONSTACK(c);
- mod->mkobj.kobj_completion = &c;
- kobject_put(&mod->mkobj.kobj);
- wait_for_completion(&c);
-}
-
-static int mod_sysfs_init(struct module *mod)
-{
- int err;
- struct kobject *kobj;
-
- if (!module_sysfs_initialized) {
- pr_err("%s: module sysfs not initialized\n", mod->name);
- err = -EINVAL;
- goto out;
- }
-
- kobj = kset_find_obj(module_kset, mod->name);
- if (kobj) {
- pr_err("%s: module is already loaded\n", mod->name);
- kobject_put(kobj);
- err = -EINVAL;
- goto out;
- }
-
- mod->mkobj.mod = mod;
-
- memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj));
- mod->mkobj.kobj.kset = module_kset;
- err = kobject_init_and_add(&mod->mkobj.kobj, &module_ktype, NULL,
- "%s", mod->name);
- if (err)
- mod_kobject_put(mod);
-
- /* delay uevent until full sysfs population */
-out:
- return err;
-}
-
-static int mod_sysfs_setup(struct module *mod,
- const struct load_info *info,
- struct kernel_param *kparam,
- unsigned int num_params)
-{
- int err;
-
- err = mod_sysfs_init(mod);
- if (err)
- goto out;
-
- mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj);
- if (!mod->holders_dir) {
- err = -ENOMEM;
- goto out_unreg;
- }
-
- err = module_param_sysfs_setup(mod, kparam, num_params);
- if (err)
- goto out_unreg_holders;
-
- err = module_add_modinfo_attrs(mod);
- if (err)
- goto out_unreg_param;
-
- add_usage_links(mod);
- add_sect_attrs(mod, info);
- add_notes_attrs(mod, info);
-
- kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD);
- return 0;
-
-out_unreg_param:
- module_param_sysfs_remove(mod);
-out_unreg_holders:
- kobject_put(mod->holders_dir);
-out_unreg:
- mod_kobject_put(mod);
-out:
- return err;
-}
-
-static void mod_sysfs_fini(struct module *mod)
-{
- remove_notes_attrs(mod);
- remove_sect_attrs(mod);
- mod_kobject_put(mod);
-}
-
-#else /* !CONFIG_SYSFS */
-
-static int mod_sysfs_setup(struct module *mod,
- const struct load_info *info,
- struct kernel_param *kparam,
- unsigned int num_params)
-{
- return 0;
-}
-
-static void mod_sysfs_fini(struct module *mod)
-{
-}
-
-static void module_remove_modinfo_attrs(struct module *mod)
-{
-}
-
-static void del_usage_links(struct module *mod)
-{
-}
-
-#endif /* CONFIG_SYSFS */
-
-static void mod_sysfs_teardown(struct module *mod)
-{
- del_usage_links(mod);
- module_remove_modinfo_attrs(mod);
- module_param_sysfs_remove(mod);
- kobject_put(mod->mkobj.drivers_dir);
- kobject_put(mod->holders_dir);
- mod_sysfs_fini(mod);
-}
-
-#ifdef CONFIG_DEBUG_SET_MODULE_RONX
-/*
- * LKM RO/NX protection: protect module's text/ro-data
- * from modification and any data from execution.
- */
-void set_page_attributes(void *start, void *end, int (*set)(unsigned long start, int num_pages))
-{
- unsigned long begin_pfn = PFN_DOWN((unsigned long)start);
- unsigned long end_pfn = PFN_DOWN((unsigned long)end);
-
- if (end_pfn > begin_pfn)
- set(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn);
-}
-
-static void set_section_ro_nx(void *base,
- unsigned long text_size,
- unsigned long ro_size,
- unsigned long total_size)
-{
- /* begin and end PFNs of the current subsection */
- unsigned long begin_pfn;
- unsigned long end_pfn;
-
- /*
- * Set RO for module text and RO-data:
- * - Always protect first page.
- * - Do not protect last partial page.
- */
- if (ro_size > 0)
- set_page_attributes(base, base + ro_size, set_memory_ro);
-
- /*
- * Set NX permissions for module data:
- * - Do not protect first partial page.
- * - Always protect last page.
- */
- if (total_size > text_size) {
- begin_pfn = PFN_UP((unsigned long)base + text_size);
- end_pfn = PFN_UP((unsigned long)base + total_size);
- if (end_pfn > begin_pfn)
- set_memory_nx(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn);
- }
-}
-
-static void unset_module_core_ro_nx(struct module *mod)
-{
- set_page_attributes(mod->module_core + mod->core_text_size,
- mod->module_core + mod->core_size,
- set_memory_x);
- set_page_attributes(mod->module_core,
- mod->module_core + mod->core_ro_size,
- set_memory_rw);
-}
-
-static void unset_module_init_ro_nx(struct module *mod)
-{
- set_page_attributes(mod->module_init + mod->init_text_size,
- mod->module_init + mod->init_size,
- set_memory_x);
- set_page_attributes(mod->module_init,
- mod->module_init + mod->init_ro_size,
- set_memory_rw);
-}
-
-/* Iterate through all modules and set each module's text as RW */
-void set_all_modules_text_rw(void)
-{
- struct module *mod;
-
- mutex_lock(&module_mutex);
- list_for_each_entry_rcu(mod, &modules, list) {
- if (mod->state == MODULE_STATE_UNFORMED)
- continue;
- if ((mod->module_core) && (mod->core_text_size)) {
- set_page_attributes(mod->module_core,
- mod->module_core + mod->core_text_size,
- set_memory_rw);
- }
- if ((mod->module_init) && (mod->init_text_size)) {
- set_page_attributes(mod->module_init,
- mod->module_init + mod->init_text_size,
- set_memory_rw);
- }
- }
- mutex_unlock(&module_mutex);
-}
-
-/* Iterate through all modules and set each module's text as RO */
-void set_all_modules_text_ro(void)
-{
- struct module *mod;
-
- mutex_lock(&module_mutex);
- list_for_each_entry_rcu(mod, &modules, list) {
- if (mod->state == MODULE_STATE_UNFORMED)
- continue;
- if ((mod->module_core) && (mod->core_text_size)) {
- set_page_attributes(mod->module_core,
- mod->module_core + mod->core_text_size,
- set_memory_ro);
- }
- if ((mod->module_init) && (mod->init_text_size)) {
- set_page_attributes(mod->module_init,
- mod->module_init + mod->init_text_size,
- set_memory_ro);
- }
- }
- mutex_unlock(&module_mutex);
-}
-#else
-static inline void set_section_ro_nx(void *base, unsigned long text_size, unsigned long ro_size, unsigned long total_size) { }
-static void unset_module_core_ro_nx(struct module *mod) { }
-static void unset_module_init_ro_nx(struct module *mod) { }
-#endif
-
-void __weak module_memfree(void *module_region)
-{
- vfree(module_region);
-}
-
-void __weak module_arch_cleanup(struct module *mod)
-{
-}
-
-void __weak module_arch_freeing_init(struct module *mod)
-{
-}
-
-/* Free a module, remove from lists, etc. */
-static void free_module(struct module *mod)
-{
- trace_module_free(mod);
-
- mod_sysfs_teardown(mod);
-
- /* We leave it in list to prevent duplicate loads, but make sure
- * that noone uses it while it's being deconstructed. */
- mutex_lock(&module_mutex);
- mod->state = MODULE_STATE_UNFORMED;
- mutex_unlock(&module_mutex);
-
- /* Remove dynamic debug info */
- ddebug_remove_module(mod->name);
-
- /* Arch-specific cleanup. */
- module_arch_cleanup(mod);
-
- /* Module unload stuff */
- module_unload_free(mod);
-
- /* Free any allocated parameters. */
- destroy_params(mod->kp, mod->num_kp);
-
- /* Now we can delete it from the lists */
- mutex_lock(&module_mutex);
- /* Unlink carefully: kallsyms could be walking list. */
- list_del_rcu(&mod->list);
- /* Remove this module from bug list, this uses list_del_rcu */
- module_bug_cleanup(mod);
- /* Wait for RCU synchronizing before releasing mod->list and buglist. */
- synchronize_rcu();
- mutex_unlock(&module_mutex);
-
- /* This may be NULL, but that's OK */
- unset_module_init_ro_nx(mod);
- module_arch_freeing_init(mod);
- module_memfree(mod->module_init);
- kfree(mod->args);
- percpu_modfree(mod);
-
- /* Free lock-classes; relies on the preceding sync_rcu(). */
- lockdep_free_key_range(mod->module_core, mod->core_size);
-
- /* Finally, free the core (containing the module structure) */
- unset_module_core_ro_nx(mod);
- module_memfree(mod->module_core);
-
-#ifdef CONFIG_MPU
- update_protections(current->mm);
-#endif
-}
-
-void *__symbol_get(const char *symbol)
-{
- struct module *owner;
- const struct kernel_symbol *sym;
-
- preempt_disable();
- sym = find_symbol(symbol, &owner, NULL, true, true);
- if (sym && strong_try_module_get(owner))
- sym = NULL;
- preempt_enable();
-
- return sym ? (void *)sym->value : NULL;
-}
-EXPORT_SYMBOL_GPL(__symbol_get);
-
-/*
- * Ensure that an exported symbol [global namespace] does not already exist
- * in the kernel or in some other module's exported symbol table.
- *
- * You must hold the module_mutex.
- */
-static int verify_export_symbols(struct module *mod)
-{
- unsigned int i;
- struct module *owner;
- const struct kernel_symbol *s;
- struct {
- const struct kernel_symbol *sym;
- unsigned int num;
- } arr[] = {
- { mod->syms, mod->num_syms },
- { mod->gpl_syms, mod->num_gpl_syms },
- { mod->gpl_future_syms, mod->num_gpl_future_syms },
-#ifdef CONFIG_UNUSED_SYMBOLS
- { mod->unused_syms, mod->num_unused_syms },
- { mod->unused_gpl_syms, mod->num_unused_gpl_syms },
-#endif
- };
-
- for (i = 0; i < ARRAY_SIZE(arr); i++) {
- for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) {
- if (find_symbol(s->name, &owner, NULL, true, false)) {
- pr_err("%s: exports duplicate symbol %s"
- " (owned by %s)\n",
- mod->name, s->name, module_name(owner));
- return -ENOEXEC;
- }
- }
- }
- return 0;
-}
-
-/* Change all symbols so that st_value encodes the pointer directly. */
-static int simplify_symbols(struct module *mod, const struct load_info *info)
-{
- Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
- Elf_Sym *sym = (void *)symsec->sh_addr;
- unsigned long secbase;
- unsigned int i;
- int ret = 0;
- const struct kernel_symbol *ksym;
-
- for (i = 1; i < symsec->sh_size / sizeof(Elf_Sym); i++) {
- const char *name = info->strtab + sym[i].st_name;
-
- switch (sym[i].st_shndx) {
- case SHN_COMMON:
- /* Ignore common symbols */
- if (!strncmp(name, "__gnu_lto", 9))
- break;
-
- /* We compiled with -fno-common. These are not
- supposed to happen. */
- pr_debug("Common symbol: %s\n", name);
- pr_warn("%s: please compile with -fno-common\n",
- mod->name);
- ret = -ENOEXEC;
- break;
-
- case SHN_ABS:
- /* Don't need to do anything */
- pr_debug("Absolute symbol: 0x%08lx\n",
- (long)sym[i].st_value);
- break;
-
- case SHN_UNDEF:
- ksym = resolve_symbol_wait(mod, info, name);
- /* Ok if resolved. */
- if (ksym && !IS_ERR(ksym)) {
- sym[i].st_value = ksym->value;
- break;
- }
-
- /* Ok if weak. */
- if (!ksym && ELF_ST_BIND(sym[i].st_info) == STB_WEAK)
- break;
-
- pr_warn("%s: Unknown symbol %s (err %li)\n",
- mod->name, name, PTR_ERR(ksym));
- ret = PTR_ERR(ksym) ?: -ENOENT;
- break;
-
- default:
- /* Divert to percpu allocation if a percpu var. */
- if (sym[i].st_shndx == info->index.pcpu)
- secbase = (unsigned long)mod_percpu(mod);
- else
- secbase = info->sechdrs[sym[i].st_shndx].sh_addr;
- sym[i].st_value += secbase;
- break;
- }
- }
-
- return ret;
-}
-
-static int apply_relocations(struct module *mod, const struct load_info *info)
-{
- unsigned int i;
- int err = 0;
-
- /* Now do relocations. */
- for (i = 1; i < info->hdr->e_shnum; i++) {
- unsigned int infosec = info->sechdrs[i].sh_info;
-
- /* Not a valid relocation section? */
- if (infosec >= info->hdr->e_shnum)
- continue;
-
- /* Don't bother with non-allocated sections */
- if (!(info->sechdrs[infosec].sh_flags & SHF_ALLOC))
- continue;
-
- if (info->sechdrs[i].sh_type == SHT_REL)
- err = apply_relocate(info->sechdrs, info->strtab,
- info->index.sym, i, mod);
- else if (info->sechdrs[i].sh_type == SHT_RELA)
- err = apply_relocate_add(info->sechdrs, info->strtab,
- info->index.sym, i, mod);
- if (err < 0)
- break;
- }
- return err;
-}
-
-/* Additional bytes needed by arch in front of individual sections */
-unsigned int __weak arch_mod_section_prepend(struct module *mod,
- unsigned int section)
-{
- /* default implementation just returns zero */
- return 0;
-}
-
-/* Update size with this section: return offset. */
-static long get_offset(struct module *mod, unsigned int *size,
- Elf_Shdr *sechdr, unsigned int section)
-{
- long ret;
-
- *size += arch_mod_section_prepend(mod, section);
- ret = ALIGN(*size, sechdr->sh_addralign ?: 1);
- *size = ret + sechdr->sh_size;
- return ret;
-}
-
-/* Lay out the SHF_ALLOC sections in a way not dissimilar to how ld
- might -- code, read-only data, read-write data, small data. Tally
- sizes, and place the offsets into sh_entsize fields: high bit means it
- belongs in init. */
-static void layout_sections(struct module *mod, struct load_info *info)
-{
- static unsigned long const masks[][2] = {
- /* NOTE: all executable code must be the first section
- * in this array; otherwise modify the text_size
- * finder in the two loops below */
- { SHF_EXECINSTR | SHF_ALLOC, ARCH_SHF_SMALL },
- { SHF_ALLOC, SHF_WRITE | ARCH_SHF_SMALL },
- { SHF_WRITE | SHF_ALLOC, ARCH_SHF_SMALL },
- { ARCH_SHF_SMALL | SHF_ALLOC, 0 }
- };
- unsigned int m, i;
-
- for (i = 0; i < info->hdr->e_shnum; i++)
- info->sechdrs[i].sh_entsize = ~0UL;
-
- pr_debug("Core section allocation order:\n");
- for (m = 0; m < ARRAY_SIZE(masks); ++m) {
- for (i = 0; i < info->hdr->e_shnum; ++i) {
- Elf_Shdr *s = &info->sechdrs[i];
- const char *sname = info->secstrings + s->sh_name;
-
- if ((s->sh_flags & masks[m][0]) != masks[m][0]
- || (s->sh_flags & masks[m][1])
- || s->sh_entsize != ~0UL
- || strstarts(sname, ".init"))
- continue;
- s->sh_entsize = get_offset(mod, &mod->core_size, s, i);
- pr_debug("\t%s\n", sname);
- }
- switch (m) {
- case 0: /* executable */
- mod->core_size = debug_align(mod->core_size);
- mod->core_text_size = mod->core_size;
- break;
- case 1: /* RO: text and ro-data */
- mod->core_size = debug_align(mod->core_size);
- mod->core_ro_size = mod->core_size;
- break;
- case 3: /* whole core */
- mod->core_size = debug_align(mod->core_size);
- break;
- }
- }
-
- pr_debug("Init section allocation order:\n");
- for (m = 0; m < ARRAY_SIZE(masks); ++m) {
- for (i = 0; i < info->hdr->e_shnum; ++i) {
- Elf_Shdr *s = &info->sechdrs[i];
- const char *sname = info->secstrings + s->sh_name;
-
- if ((s->sh_flags & masks[m][0]) != masks[m][0]
- || (s->sh_flags & masks[m][1])
- || s->sh_entsize != ~0UL
- || !strstarts(sname, ".init"))
- continue;
- s->sh_entsize = (get_offset(mod, &mod->init_size, s, i)
- | INIT_OFFSET_MASK);
- pr_debug("\t%s\n", sname);
- }
- switch (m) {
- case 0: /* executable */
- mod->init_size = debug_align(mod->init_size);
- mod->init_text_size = mod->init_size;
- break;
- case 1: /* RO: text and ro-data */
- mod->init_size = debug_align(mod->init_size);
- mod->init_ro_size = mod->init_size;
- break;
- case 3: /* whole init */
- mod->init_size = debug_align(mod->init_size);
- break;
- }
- }
-}
-
-static void set_license(struct module *mod, const char *license)
-{
- if (!license)
- license = "unspecified";
-
- if (!license_is_gpl_compatible(license)) {
- if (!test_taint(TAINT_PROPRIETARY_MODULE))
- pr_warn("%s: module license '%s' taints kernel.\n",
- mod->name, license);
- add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
- LOCKDEP_NOW_UNRELIABLE);
- }
-}
-
-/* Parse tag=value strings from .modinfo section */
-static char *next_string(char *string, unsigned long *secsize)
-{
- /* Skip non-zero chars */
- while (string[0]) {
- string++;
- if ((*secsize)-- <= 1)
- return NULL;
- }
-
- /* Skip any zero padding. */
- while (!string[0]) {
- string++;
- if ((*secsize)-- <= 1)
- return NULL;
- }
- return string;
-}
-
-static char *get_modinfo(struct load_info *info, const char *tag)
-{
- char *p;
- unsigned int taglen = strlen(tag);
- Elf_Shdr *infosec = &info->sechdrs[info->index.info];
- unsigned long size = infosec->sh_size;
-
- for (p = (char *)infosec->sh_addr; p; p = next_string(p, &size)) {
- if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=')
- return p + taglen + 1;
- }
- return NULL;
-}
-
-static void setup_modinfo(struct module *mod, struct load_info *info)
-{
- struct module_attribute *attr;
- int i;
-
- for (i = 0; (attr = modinfo_attrs[i]); i++) {
- if (attr->setup)
- attr->setup(mod, get_modinfo(info, attr->attr.name));
- }
-}
-
-static void free_modinfo(struct module *mod)
-{
- struct module_attribute *attr;
- int i;
-
- for (i = 0; (attr = modinfo_attrs[i]); i++) {
- if (attr->free)
- attr->free(mod);
- }
-}
-
-#ifdef CONFIG_KALLSYMS
-
-/* lookup symbol in given range of kernel_symbols */
-static const struct kernel_symbol *lookup_symbol(const char *name,
- const struct kernel_symbol *start,
- const struct kernel_symbol *stop)
-{
- return bsearch(name, start, stop - start,
- sizeof(struct kernel_symbol), cmp_name);
-}
-
-static int is_exported(const char *name, unsigned long value,
- const struct module *mod)
-{
- const struct kernel_symbol *ks;
- if (!mod)
- ks = lookup_symbol(name, __start___ksymtab, __stop___ksymtab);
- else
- ks = lookup_symbol(name, mod->syms, mod->syms + mod->num_syms);
- return ks != NULL && ks->value == value;
-}
-
-/* As per nm */
-static char elf_type(const Elf_Sym *sym, const struct load_info *info)
-{
- const Elf_Shdr *sechdrs = info->sechdrs;
-
- if (ELF_ST_BIND(sym->st_info) == STB_WEAK) {
- if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT)
- return 'v';
- else
- return 'w';
- }
- if (sym->st_shndx == SHN_UNDEF)
- return 'U';
- if (sym->st_shndx == SHN_ABS)
- return 'a';
- if (sym->st_shndx >= SHN_LORESERVE)
- return '?';
- if (sechdrs[sym->st_shndx].sh_flags & SHF_EXECINSTR)
- return 't';
- if (sechdrs[sym->st_shndx].sh_flags & SHF_ALLOC
- && sechdrs[sym->st_shndx].sh_type != SHT_NOBITS) {
- if (!(sechdrs[sym->st_shndx].sh_flags & SHF_WRITE))
- return 'r';
- else if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL)
- return 'g';
- else
- return 'd';
- }
- if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) {
- if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL)
- return 's';
- else
- return 'b';
- }
- if (strstarts(info->secstrings + sechdrs[sym->st_shndx].sh_name,
- ".debug")) {
- return 'n';
- }
- return '?';
-}
-
-static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
- unsigned int shnum)
-{
- const Elf_Shdr *sec;
-
- if (src->st_shndx == SHN_UNDEF
- || src->st_shndx >= shnum
- || !src->st_name)
- return false;
-
- sec = sechdrs + src->st_shndx;
- if (!(sec->sh_flags & SHF_ALLOC)
-#ifndef CONFIG_KALLSYMS_ALL
- || !(sec->sh_flags & SHF_EXECINSTR)
-#endif
- || (sec->sh_entsize & INIT_OFFSET_MASK))
- return false;
-
- return true;
-}
-
-/*
- * We only allocate and copy the strings needed by the parts of symtab
- * we keep. This is simple, but has the effect of making multiple
- * copies of duplicates. We could be more sophisticated, see
- * linux-kernel thread starting with
- * <73defb5e4bca04a6431392cc341112b1@localhost>.
- */
-static void layout_symtab(struct module *mod, struct load_info *info)
-{
- Elf_Shdr *symsect = info->sechdrs + info->index.sym;
- Elf_Shdr *strsect = info->sechdrs + info->index.str;
- const Elf_Sym *src;
- unsigned int i, nsrc, ndst, strtab_size = 0;
-
- /* Put symbol section at end of init part of module. */
- symsect->sh_flags |= SHF_ALLOC;
- symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect,
- info->index.sym) | INIT_OFFSET_MASK;
- pr_debug("\t%s\n", info->secstrings + symsect->sh_name);
-
- src = (void *)info->hdr + symsect->sh_offset;
- nsrc = symsect->sh_size / sizeof(*src);
-
- /* Compute total space required for the core symbols' strtab. */
- for (ndst = i = 0; i < nsrc; i++) {
- if (i == 0 ||
- is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) {
- strtab_size += strlen(&info->strtab[src[i].st_name])+1;
- ndst++;
- }
- }
-
- /* Append room for core symbols at end of core part. */
- info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1);
- info->stroffs = mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym);
- mod->core_size += strtab_size;
- mod->core_size = debug_align(mod->core_size);
-
- /* Put string table section at end of init part of module. */
- strsect->sh_flags |= SHF_ALLOC;
- strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect,
- info->index.str) | INIT_OFFSET_MASK;
- mod->init_size = debug_align(mod->init_size);
- pr_debug("\t%s\n", info->secstrings + strsect->sh_name);
-}
-
-static void add_kallsyms(struct module *mod, const struct load_info *info)
-{
- unsigned int i, ndst;
- const Elf_Sym *src;
- Elf_Sym *dst;
- char *s;
- Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
-
- mod->symtab = (void *)symsec->sh_addr;
- mod->num_symtab = symsec->sh_size / sizeof(Elf_Sym);
- /* Make sure we get permanent strtab: don't use info->strtab. */
- mod->strtab = (void *)info->sechdrs[info->index.str].sh_addr;
-
- /* Set types up while we still have access to sections. */
- for (i = 0; i < mod->num_symtab; i++)
- mod->symtab[i].st_info = elf_type(&mod->symtab[i], info);
-
- mod->core_symtab = dst = mod->module_core + info->symoffs;
- mod->core_strtab = s = mod->module_core + info->stroffs;
- src = mod->symtab;
- for (ndst = i = 0; i < mod->num_symtab; i++) {
- if (i == 0 ||
- is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) {
- dst[ndst] = src[i];
- dst[ndst++].st_name = s - mod->core_strtab;
- s += strlcpy(s, &mod->strtab[src[i].st_name],
- KSYM_NAME_LEN) + 1;
- }
- }
- mod->core_num_syms = ndst;
-}
-#else
-static inline void layout_symtab(struct module *mod, struct load_info *info)
-{
-}
-
-static void add_kallsyms(struct module *mod, const struct load_info *info)
-{
-}
-#endif /* CONFIG_KALLSYMS */
-
-static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num)
-{
- if (!debug)
- return;
-#ifdef CONFIG_DYNAMIC_DEBUG
- if (ddebug_add_module(debug, num, debug->modname))
- pr_err("dynamic debug error adding module: %s\n",
- debug->modname);
-#endif
-}
-
-static void dynamic_debug_remove(struct _ddebug *debug)
-{
- if (debug)
- ddebug_remove_module(debug->modname);
-}
-
-void * __weak module_alloc(unsigned long size)
-{
- return vmalloc_exec(size);
-}
-
-static void *module_alloc_update_bounds(unsigned long size)
-{
- void *ret = module_alloc(size);
-
- if (ret) {
- mutex_lock(&module_mutex);
- /* Update module bounds. */
- if ((unsigned long)ret < module_addr_min)
- module_addr_min = (unsigned long)ret;
- if ((unsigned long)ret + size > module_addr_max)
- module_addr_max = (unsigned long)ret + size;
- mutex_unlock(&module_mutex);
- }
- return ret;
-}
-
-#ifdef CONFIG_DEBUG_KMEMLEAK
-static void kmemleak_load_module(const struct module *mod,
- const struct load_info *info)
-{
- unsigned int i;
-
- /* only scan the sections containing data */
- kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL);
-
- for (i = 1; i < info->hdr->e_shnum; i++) {
- /* Scan all writable sections that's not executable */
- if (!(info->sechdrs[i].sh_flags & SHF_ALLOC) ||
- !(info->sechdrs[i].sh_flags & SHF_WRITE) ||
- (info->sechdrs[i].sh_flags & SHF_EXECINSTR))
- continue;
-
- kmemleak_scan_area((void *)info->sechdrs[i].sh_addr,
- info->sechdrs[i].sh_size, GFP_KERNEL);
- }
-}
-#else
-static inline void kmemleak_load_module(const struct module *mod,
- const struct load_info *info)
-{
-}
-#endif
-
-#ifdef CONFIG_MODULE_SIG
-static int module_sig_check(struct load_info *info)
-{
- int err = -ENOKEY;
- const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1;
- const void *mod = info->hdr;
-
- if (info->len > markerlen &&
- memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) {
- /* We truncate the module to discard the signature */
- info->len -= markerlen;
- err = mod_verify_sig(mod, &info->len);
- }
-
- if (!err) {
- info->sig_ok = true;
- return 0;
- }
-
- /* Not having a signature is only an error if we're strict. */
- if (err == -ENOKEY && !sig_enforce)
- err = 0;
-
- return err;
-}
-#else /* !CONFIG_MODULE_SIG */
-static int module_sig_check(struct load_info *info)
-{
- return 0;
-}
-#endif /* !CONFIG_MODULE_SIG */
-
-/* Sanity checks against invalid binaries, wrong arch, weird elf version. */
-static int elf_header_check(struct load_info *info)
-{
- if (info->len < sizeof(*(info->hdr)))
- return -ENOEXEC;
-
- if (memcmp(info->hdr->e_ident, ELFMAG, SELFMAG) != 0
- || info->hdr->e_type != ET_REL
- || !elf_check_arch(info->hdr)
- || info->hdr->e_shentsize != sizeof(Elf_Shdr))
- return -ENOEXEC;
-
- if (info->hdr->e_shoff >= info->len
- || (info->hdr->e_shnum * sizeof(Elf_Shdr) >
- info->len - info->hdr->e_shoff))
- return -ENOEXEC;
-
- return 0;
-}
-
-#define COPY_CHUNK_SIZE (16*PAGE_SIZE)
-
-static int copy_chunked_from_user(void *dst, const void __user *usrc, unsigned long len)
-{
- do {
- unsigned long n = min(len, COPY_CHUNK_SIZE);
-
- if (copy_from_user(dst, usrc, n) != 0)
- return -EFAULT;
- cond_resched();
- dst += n;
- usrc += n;
- len -= n;
- } while (len);
- return 0;
-}
-
-/* Sets info->hdr and info->len. */
-static int copy_module_from_user(const void __user *umod, unsigned long len,
- struct load_info *info)
-{
- int err;
-
- info->len = len;
- if (info->len < sizeof(*(info->hdr)))
- return -ENOEXEC;
-
- err = security_kernel_module_from_file(NULL);
- if (err)
- return err;
-
- /* Suck in entire file: we'll want most of it. */
- info->hdr = __vmalloc(info->len,
- GFP_KERNEL | __GFP_HIGHMEM | __GFP_NOWARN, PAGE_KERNEL);
- if (!info->hdr)
- return -ENOMEM;
-
- if (copy_chunked_from_user(info->hdr, umod, info->len) != 0) {
- vfree(info->hdr);
- return -EFAULT;
- }
-
- return 0;
-}
-
-/* Sets info->hdr and info->len. */
-static int copy_module_from_fd(int fd, struct load_info *info)
-{
- struct fd f = fdget(fd);
- int err;
- struct kstat stat;
- loff_t pos;
- ssize_t bytes = 0;
-
- if (!f.file)
- return -ENOEXEC;
-
- err = security_kernel_module_from_file(f.file);
- if (err)
- goto out;
-
- err = vfs_getattr(&f.file->f_path, &stat);
- if (err)
- goto out;
-
- if (stat.size > INT_MAX) {
- err = -EFBIG;
- goto out;
- }
-
- /* Don't hand 0 to vmalloc, it whines. */
- if (stat.size == 0) {
- err = -EINVAL;
- goto out;
- }
-
- info->hdr = vmalloc(stat.size);
- if (!info->hdr) {
- err = -ENOMEM;
- goto out;
- }
-
- pos = 0;
- while (pos < stat.size) {
- bytes = kernel_read(f.file, pos, (char *)(info->hdr) + pos,
- stat.size - pos);
- if (bytes < 0) {
- vfree(info->hdr);
- err = bytes;
- goto out;
- }
- if (bytes == 0)
- break;
- pos += bytes;
- }
- info->len = pos;
-
-out:
- fdput(f);
- return err;
-}
-
-static void free_copy(struct load_info *info)
-{
- vfree(info->hdr);
-}
-
-static int rewrite_section_headers(struct load_info *info, int flags)
-{
- unsigned int i;
-
- /* This should always be true, but let's be sure. */
- info->sechdrs[0].sh_addr = 0;
-
- for (i = 1; i < info->hdr->e_shnum; i++) {
- Elf_Shdr *shdr = &info->sechdrs[i];
- if (shdr->sh_type != SHT_NOBITS
- && info->len < shdr->sh_offset + shdr->sh_size) {
- pr_err("Module len %lu truncated\n", info->len);
- return -ENOEXEC;
- }
-
- /* Mark all sections sh_addr with their address in the
- temporary image. */
- shdr->sh_addr = (size_t)info->hdr + shdr->sh_offset;
-
-#ifndef CONFIG_MODULE_UNLOAD
- /* Don't load .exit sections */
- if (strstarts(info->secstrings+shdr->sh_name, ".exit"))
- shdr->sh_flags &= ~(unsigned long)SHF_ALLOC;
-#endif
- }
-
- /* Track but don't keep modinfo and version sections. */
- if (flags & MODULE_INIT_IGNORE_MODVERSIONS)
- info->index.vers = 0; /* Pretend no __versions section! */
- else
- info->index.vers = find_sec(info, "__versions");
- info->index.info = find_sec(info, ".modinfo");
- info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC;
- info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC;
- return 0;
-}
-
-/*
- * Set up our basic convenience variables (pointers to section headers,
- * search for module section index etc), and do some basic section
- * verification.
- *
- * Return the temporary module pointer (we'll replace it with the final
- * one when we move the module sections around).
- */
-static struct module *setup_load_info(struct load_info *info, int flags)
-{
- unsigned int i;
- int err;
- struct module *mod;
-
- /* Set up the convenience variables */
- info->sechdrs = (void *)info->hdr + info->hdr->e_shoff;
- info->secstrings = (void *)info->hdr
- + info->sechdrs[info->hdr->e_shstrndx].sh_offset;
-
- err = rewrite_section_headers(info, flags);
- if (err)
- return ERR_PTR(err);
-
- /* Find internal symbols and strings. */
- for (i = 1; i < info->hdr->e_shnum; i++) {
- if (info->sechdrs[i].sh_type == SHT_SYMTAB) {
- info->index.sym = i;
- info->index.str = info->sechdrs[i].sh_link;
- info->strtab = (char *)info->hdr
- + info->sechdrs[info->index.str].sh_offset;
- break;
- }
- }
-
- info->index.mod = find_sec(info, ".gnu.linkonce.this_module");
- if (!info->index.mod) {
- pr_warn("No module found in object\n");
- return ERR_PTR(-ENOEXEC);
- }
- /* This is temporary: point mod into copy of data. */
- mod = (void *)info->sechdrs[info->index.mod].sh_addr;
-
- if (info->index.sym == 0) {
- pr_warn("%s: module has no symbols (stripped?)\n", mod->name);
- return ERR_PTR(-ENOEXEC);
- }
-
- info->index.pcpu = find_pcpusec(info);
-
- /* Check module struct version now, before we try to use module. */
- if (!check_modstruct_version(info->sechdrs, info->index.vers, mod))
- return ERR_PTR(-ENOEXEC);
-
- return mod;
-}
-
-static int check_modinfo(struct module *mod, struct load_info *info, int flags)
-{
- const char *modmagic = get_modinfo(info, "vermagic");
- int err;
-
- if (flags & MODULE_INIT_IGNORE_VERMAGIC)
- modmagic = NULL;
-
- /* This is allowed: modprobe --force will invalidate it. */
- if (!modmagic) {
- err = try_to_force_load(mod, "bad vermagic");
- if (err)
- return err;
- } else if (!same_magic(modmagic, vermagic, info->index.vers)) {
- pr_err("%s: version magic '%s' should be '%s'\n",
- mod->name, modmagic, vermagic);
- return -ENOEXEC;
- }
-
- if (!get_modinfo(info, "intree"))
- add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK);
-
- if (get_modinfo(info, "staging")) {
- add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK);
- pr_warn("%s: module is from the staging directory, the quality "
- "is unknown, you have been warned.\n", mod->name);
- }
-
- /* Set up license info based on the info section */
- set_license(mod, get_modinfo(info, "license"));
-
- return 0;
-}
-
-static int find_module_sections(struct module *mod, struct load_info *info)
-{
- mod->kp = section_objs(info, "__param",
- sizeof(*mod->kp), &mod->num_kp);
- mod->syms = section_objs(info, "__ksymtab",
- sizeof(*mod->syms), &mod->num_syms);
- mod->crcs = section_addr(info, "__kcrctab");
- mod->gpl_syms = section_objs(info, "__ksymtab_gpl",
- sizeof(*mod->gpl_syms),
- &mod->num_gpl_syms);
- mod->gpl_crcs = section_addr(info, "__kcrctab_gpl");
- mod->gpl_future_syms = section_objs(info,
- "__ksymtab_gpl_future",
- sizeof(*mod->gpl_future_syms),
- &mod->num_gpl_future_syms);
- mod->gpl_future_crcs = section_addr(info, "__kcrctab_gpl_future");
-
-#ifdef CONFIG_UNUSED_SYMBOLS
- mod->unused_syms = section_objs(info, "__ksymtab_unused",
- sizeof(*mod->unused_syms),
- &mod->num_unused_syms);
- mod->unused_crcs = section_addr(info, "__kcrctab_unused");
- mod->unused_gpl_syms = section_objs(info, "__ksymtab_unused_gpl",
- sizeof(*mod->unused_gpl_syms),
- &mod->num_unused_gpl_syms);
- mod->unused_gpl_crcs = section_addr(info, "__kcrctab_unused_gpl");
-#endif
-#ifdef CONFIG_CONSTRUCTORS
- mod->ctors = section_objs(info, ".ctors",
- sizeof(*mod->ctors), &mod->num_ctors);
- if (!mod->ctors)
- mod->ctors = section_objs(info, ".init_array",
- sizeof(*mod->ctors), &mod->num_ctors);
- else if (find_sec(info, ".init_array")) {
- /*
- * This shouldn't happen with same compiler and binutils
- * building all parts of the module.
- */
- pr_warn("%s: has both .ctors and .init_array.\n",
- mod->name);
- return -EINVAL;
- }
-#endif
-
-#ifdef CONFIG_TRACEPOINTS
- mod->tracepoints_ptrs = section_objs(info, "__tracepoints_ptrs",
- sizeof(*mod->tracepoints_ptrs),
- &mod->num_tracepoints);
-#endif
-#ifdef HAVE_JUMP_LABEL
- mod->jump_entries = section_objs(info, "__jump_table",
- sizeof(*mod->jump_entries),
- &mod->num_jump_entries);
-#endif
-#ifdef CONFIG_EVENT_TRACING
- mod->trace_events = section_objs(info, "_ftrace_events",
- sizeof(*mod->trace_events),
- &mod->num_trace_events);
- mod->trace_enums = section_objs(info, "_ftrace_enum_map",
- sizeof(*mod->trace_enums),
- &mod->num_trace_enums);
-#endif
-#ifdef CONFIG_TRACING
- mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt",
- sizeof(*mod->trace_bprintk_fmt_start),
- &mod->num_trace_bprintk_fmt);
-#endif
-#ifdef CONFIG_FTRACE_MCOUNT_RECORD
- /* sechdrs[0].sh_size is always zero */
- mod->ftrace_callsites = section_objs(info, "__mcount_loc",
- sizeof(*mod->ftrace_callsites),
- &mod->num_ftrace_callsites);
-#endif
-
- mod->extable = section_objs(info, "__ex_table",
- sizeof(*mod->extable), &mod->num_exentries);
-
- if (section_addr(info, "__obsparm"))
- pr_warn("%s: Ignoring obsolete parameters\n", mod->name);
-
- info->debug = section_objs(info, "__verbose",
- sizeof(*info->debug), &info->num_debug);
-
- return 0;
-}
-
-static int move_module(struct module *mod, struct load_info *info)
-{
- int i;
- void *ptr;
-
- /* Do the allocs. */
- ptr = module_alloc_update_bounds(mod->core_size);
- /*
- * The pointer to this block is stored in the module structure
- * which is inside the block. Just mark it as not being a
- * leak.
- */
- kmemleak_not_leak(ptr);
- if (!ptr)
- return -ENOMEM;
-
- memset(ptr, 0, mod->core_size);
- mod->module_core = ptr;
-
- if (mod->init_size) {
- ptr = module_alloc_update_bounds(mod->init_size);
- /*
- * The pointer to this block is stored in the module structure
- * which is inside the block. This block doesn't need to be
- * scanned as it contains data and code that will be freed
- * after the module is initialized.
- */
- kmemleak_ignore(ptr);
- if (!ptr) {
- module_memfree(mod->module_core);
- return -ENOMEM;
- }
- memset(ptr, 0, mod->init_size);
- mod->module_init = ptr;
- } else
- mod->module_init = NULL;
-
- /* Transfer each section which specifies SHF_ALLOC */
- pr_debug("final section addresses:\n");
- for (i = 0; i < info->hdr->e_shnum; i++) {
- void *dest;
- Elf_Shdr *shdr = &info->sechdrs[i];
-
- if (!(shdr->sh_flags & SHF_ALLOC))
- continue;
-
- if (shdr->sh_entsize & INIT_OFFSET_MASK)
- dest = mod->module_init
- + (shdr->sh_entsize & ~INIT_OFFSET_MASK);
- else
- dest = mod->module_core + shdr->sh_entsize;
-
- if (shdr->sh_type != SHT_NOBITS)
- memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size);
- /* Update sh_addr to point to copy in image. */
- shdr->sh_addr = (unsigned long)dest;
- pr_debug("\t0x%lx %s\n",
- (long)shdr->sh_addr, info->secstrings + shdr->sh_name);
- }
-
- return 0;
-}
-
-static int check_module_license_and_versions(struct module *mod)
-{
- /*
- * ndiswrapper is under GPL by itself, but loads proprietary modules.
- * Don't use add_taint_module(), as it would prevent ndiswrapper from
- * using GPL-only symbols it needs.
- */
- if (strcmp(mod->name, "ndiswrapper") == 0)
- add_taint(TAINT_PROPRIETARY_MODULE, LOCKDEP_NOW_UNRELIABLE);
-
- /* driverloader was caught wrongly pretending to be under GPL */
- if (strcmp(mod->name, "driverloader") == 0)
- add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
- LOCKDEP_NOW_UNRELIABLE);
-
- /* lve claims to be GPL but upstream won't provide source */
- if (strcmp(mod->name, "lve") == 0)
- add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
- LOCKDEP_NOW_UNRELIABLE);
-
-#ifdef CONFIG_MODVERSIONS
- if ((mod->num_syms && !mod->crcs)
- || (mod->num_gpl_syms && !mod->gpl_crcs)
- || (mod->num_gpl_future_syms && !mod->gpl_future_crcs)
-#ifdef CONFIG_UNUSED_SYMBOLS
- || (mod->num_unused_syms && !mod->unused_crcs)
- || (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs)
-#endif
- ) {
- return try_to_force_load(mod,
- "no versions for exported symbols");
- }
-#endif
- return 0;
-}
-
-static void flush_module_icache(const struct module *mod)
-{
- mm_segment_t old_fs;
-
- /* flush the icache in correct context */
- old_fs = get_fs();
- set_fs(KERNEL_DS);
-
- /*
- * Flush the instruction cache, since we've played with text.
- * Do it before processing of module parameters, so the module
- * can provide parameter accessor functions of its own.
- */
- if (mod->module_init)
- flush_icache_range((unsigned long)mod->module_init,
- (unsigned long)mod->module_init
- + mod->init_size);
- flush_icache_range((unsigned long)mod->module_core,
- (unsigned long)mod->module_core + mod->core_size);
-
- set_fs(old_fs);
-}
-
-int __weak module_frob_arch_sections(Elf_Ehdr *hdr,
- Elf_Shdr *sechdrs,
- char *secstrings,
- struct module *mod)
-{
- return 0;
-}
-
-static struct module *layout_and_allocate(struct load_info *info, int flags)
-{
- /* Module within temporary copy. */
- struct module *mod;
- int err;
-
- mod = setup_load_info(info, flags);
- if (IS_ERR(mod))
- return mod;
-
- err = check_modinfo(mod, info, flags);
- if (err)
- return ERR_PTR(err);
-
- /* Allow arches to frob section contents and sizes. */
- err = module_frob_arch_sections(info->hdr, info->sechdrs,
- info->secstrings, mod);
- if (err < 0)
- return ERR_PTR(err);
-
- /* We will do a special allocation for per-cpu sections later. */
- info->sechdrs[info->index.pcpu].sh_flags &= ~(unsigned long)SHF_ALLOC;
-
- /* Determine total sizes, and put offsets in sh_entsize. For now
- this is done generically; there doesn't appear to be any
- special cases for the architectures. */
- layout_sections(mod, info);
- layout_symtab(mod, info);
-
- /* Allocate and move to the final place */
- err = move_module(mod, info);
- if (err)
- return ERR_PTR(err);
-
- /* Module has been copied to its final place now: return it. */
- mod = (void *)info->sechdrs[info->index.mod].sh_addr;
- kmemleak_load_module(mod, info);
- return mod;
-}
-
-/* mod is no longer valid after this! */
-static void module_deallocate(struct module *mod, struct load_info *info)
-{
- percpu_modfree(mod);
- module_arch_freeing_init(mod);
- module_memfree(mod->module_init);
- module_memfree(mod->module_core);
-}
-
-int __weak module_finalize(const Elf_Ehdr *hdr,
- const Elf_Shdr *sechdrs,
- struct module *me)
-{
- return 0;
-}
-
-static int post_relocation(struct module *mod, const struct load_info *info)
-{
- /* Sort exception table now relocations are done. */
- sort_extable(mod->extable, mod->extable + mod->num_exentries);
-
- /* Copy relocated percpu area over. */
- percpu_modcopy(mod, (void *)info->sechdrs[info->index.pcpu].sh_addr,
- info->sechdrs[info->index.pcpu].sh_size);
-
- /* Setup kallsyms-specific fields. */
- add_kallsyms(mod, info);
-
- /* Arch-specific module finalizing. */
- return module_finalize(info->hdr, info->sechdrs, mod);
-}
-
-/* Is this module of this name done loading? No locks held. */
-static bool finished_loading(const char *name)
-{
- struct module *mod;
- bool ret;
-
- /*
- * The module_mutex should not be a heavily contended lock;
- * if we get the occasional sleep here, we'll go an extra iteration
- * in the wait_event_interruptible(), which is harmless.
- */
- sched_annotate_sleep();
- mutex_lock(&module_mutex);
- mod = find_module_all(name, strlen(name), true);
- ret = !mod || mod->state == MODULE_STATE_LIVE
- || mod->state == MODULE_STATE_GOING;
- mutex_unlock(&module_mutex);
-
- return ret;
-}
-
-/* Call module constructors. */
-static void do_mod_ctors(struct module *mod)
-{
-#ifdef CONFIG_CONSTRUCTORS
- unsigned long i;
-
- for (i = 0; i < mod->num_ctors; i++)
- mod->ctors[i]();
-#endif
-}
-
-/* For freeing module_init on success, in case kallsyms traversing */
-struct mod_initfree {
- struct rcu_head rcu;
- void *module_init;
-};
-
-static void do_free_init(struct rcu_head *head)
-{
- struct mod_initfree *m = container_of(head, struct mod_initfree, rcu);
- module_memfree(m->module_init);
- kfree(m);
-}
-
-/*
- * This is where the real work happens.
- *
- * Keep it uninlined to provide a reliable breakpoint target, e.g. for the gdb
- * helper command 'lx-symbols'.
- */
-static noinline int do_init_module(struct module *mod)
-{
- int ret = 0;
- struct mod_initfree *freeinit;
-
- freeinit = kmalloc(sizeof(*freeinit), GFP_KERNEL);
- if (!freeinit) {
- ret = -ENOMEM;
- goto fail;
- }
- freeinit->module_init = mod->module_init;
-
- /*
- * We want to find out whether @mod uses async during init. Clear
- * PF_USED_ASYNC. async_schedule*() will set it.
- */
- current->flags &= ~PF_USED_ASYNC;
-
- do_mod_ctors(mod);
- /* Start the module */
- if (mod->init != NULL)
- ret = do_one_initcall(mod->init);
- if (ret < 0) {
- goto fail_free_freeinit;
- }
- if (ret > 0) {
- pr_warn("%s: '%s'->init suspiciously returned %d, it should "
- "follow 0/-E convention\n"
- "%s: loading module anyway...\n",
- __func__, mod->name, ret, __func__);
- dump_stack();
- }
-
- /* Now it's a first class citizen! */
- mod->state = MODULE_STATE_LIVE;
- blocking_notifier_call_chain(&module_notify_list,
- MODULE_STATE_LIVE, mod);
-
- /*
- * We need to finish all async code before the module init sequence
- * is done. This has potential to deadlock. For example, a newly
- * detected block device can trigger request_module() of the
- * default iosched from async probing task. Once userland helper
- * reaches here, async_synchronize_full() will wait on the async
- * task waiting on request_module() and deadlock.
- *
- * This deadlock is avoided by perfomring async_synchronize_full()
- * iff module init queued any async jobs. This isn't a full
- * solution as it will deadlock the same if module loading from
- * async jobs nests more than once; however, due to the various
- * constraints, this hack seems to be the best option for now.
- * Please refer to the following thread for details.
- *
- * http://thread.gmane.org/gmane.linux.kernel/1420814
- */
- if (current->flags & PF_USED_ASYNC)
- async_synchronize_full();
-
- mutex_lock(&module_mutex);
- /* Drop initial reference. */
- module_put(mod);
- trim_init_extable(mod);
-#ifdef CONFIG_KALLSYMS
- mod->num_symtab = mod->core_num_syms;
- mod->symtab = mod->core_symtab;
- mod->strtab = mod->core_strtab;
-#endif
- unset_module_init_ro_nx(mod);
- module_arch_freeing_init(mod);
- mod->module_init = NULL;
- mod->init_size = 0;
- mod->init_ro_size = 0;
- mod->init_text_size = 0;
- /*
- * We want to free module_init, but be aware that kallsyms may be
- * walking this with preempt disabled. In all the failure paths,
- * we call synchronize_rcu/synchronize_sched, but we don't want
- * to slow down the success path, so use actual RCU here.
- */
- call_rcu(&freeinit->rcu, do_free_init);
- mutex_unlock(&module_mutex);
- wake_up_all(&module_wq);
-
- return 0;
-
-fail_free_freeinit:
- kfree(freeinit);
-fail:
- /* Try to protect us from buggy refcounters. */
- mod->state = MODULE_STATE_GOING;
- synchronize_sched();
- module_put(mod);
- blocking_notifier_call_chain(&module_notify_list,
- MODULE_STATE_GOING, mod);
- free_module(mod);
- wake_up_all(&module_wq);
- return ret;
-}
-
-static int may_init_module(void)
-{
- if (!capable(CAP_SYS_MODULE) || modules_disabled)
- return -EPERM;
-
- return 0;
-}
-
-/*
- * We try to place it in the list now to make sure it's unique before
- * we dedicate too many resources. In particular, temporary percpu
- * memory exhaustion.
- */
-static int add_unformed_module(struct module *mod)
-{
- int err;
- struct module *old;
-
- mod->state = MODULE_STATE_UNFORMED;
-
-again:
- mutex_lock(&module_mutex);
- old = find_module_all(mod->name, strlen(mod->name), true);
- if (old != NULL) {
- if (old->state == MODULE_STATE_COMING
- || old->state == MODULE_STATE_UNFORMED) {
- /* Wait in case it fails to load. */
- mutex_unlock(&module_mutex);
- err = wait_event_interruptible(module_wq,
- finished_loading(mod->name));
- if (err)
- goto out_unlocked;
- goto again;
- }
- err = -EEXIST;
- goto out;
- }
- list_add_rcu(&mod->list, &modules);
- err = 0;
-
-out:
- mutex_unlock(&module_mutex);
-out_unlocked:
- return err;
-}
-
-static int complete_formation(struct module *mod, struct load_info *info)
-{
- int err;
-
- mutex_lock(&module_mutex);
-
- /* Find duplicate symbols (must be called under lock). */
- err = verify_export_symbols(mod);
- if (err < 0)
- goto out;
-
- /* This relies on module_mutex for list integrity. */
- module_bug_finalize(info->hdr, info->sechdrs, mod);
-
- /* Set RO and NX regions for core */
- set_section_ro_nx(mod->module_core,
- mod->core_text_size,
- mod->core_ro_size,
- mod->core_size);
-
- /* Set RO and NX regions for init */
- set_section_ro_nx(mod->module_init,
- mod->init_text_size,
- mod->init_ro_size,
- mod->init_size);
-
- /* Mark state as coming so strong_try_module_get() ignores us,
- * but kallsyms etc. can see us. */
- mod->state = MODULE_STATE_COMING;
- mutex_unlock(&module_mutex);
-
- blocking_notifier_call_chain(&module_notify_list,
- MODULE_STATE_COMING, mod);
- return 0;
-
-out:
- mutex_unlock(&module_mutex);
- return err;
-}
-
-static int unknown_module_param_cb(char *param, char *val, const char *modname)
-{
- /* Check for magic 'dyndbg' arg */
- int ret = ddebug_dyndbg_module_param_cb(param, val, modname);
- if (ret != 0)
- pr_warn("%s: unknown parameter '%s' ignored\n", modname, param);
- return 0;
-}
-
-/* Allocate and load the module: note that size of section 0 is always
- zero, and we rely on this for optional sections. */
-static int load_module(struct load_info *info, const char __user *uargs,
- int flags)
-{
- struct module *mod;
- long err;
- char *after_dashes;
-
- err = module_sig_check(info);
- if (err)
- goto free_copy;
-
- err = elf_header_check(info);
- if (err)
- goto free_copy;
-
- /* Figure out module layout, and allocate all the memory. */
- mod = layout_and_allocate(info, flags);
- if (IS_ERR(mod)) {
- err = PTR_ERR(mod);
- goto free_copy;
- }
-
- /* Reserve our place in the list. */
- err = add_unformed_module(mod);
- if (err)
- goto free_module;
-
-#ifdef CONFIG_MODULE_SIG
- mod->sig_ok = info->sig_ok;
- if (!mod->sig_ok) {
- pr_notice_once("%s: module verification failed: signature "
- "and/or required key missing - tainting "
- "kernel\n", mod->name);
- add_taint_module(mod, TAINT_UNSIGNED_MODULE, LOCKDEP_STILL_OK);
- }
-#endif
-
- /* To avoid stressing percpu allocator, do this once we're unique. */
- err = percpu_modalloc(mod, info);
- if (err)
- goto unlink_mod;
-
- /* Now module is in final location, initialize linked lists, etc. */
- err = module_unload_init(mod);
- if (err)
- goto unlink_mod;
-
- /* Now we've got everything in the final locations, we can
- * find optional sections. */
- err = find_module_sections(mod, info);
- if (err)
- goto free_unload;
-
- err = check_module_license_and_versions(mod);
- if (err)
- goto free_unload;
-
- /* Set up MODINFO_ATTR fields */
- setup_modinfo(mod, info);
-
- /* Fix up syms, so that st_value is a pointer to location. */
- err = simplify_symbols(mod, info);
- if (err < 0)
- goto free_modinfo;
-
- err = apply_relocations(mod, info);
- if (err < 0)
- goto free_modinfo;
-
- err = post_relocation(mod, info);
- if (err < 0)
- goto free_modinfo;
-
- flush_module_icache(mod);
-
- /* Now copy in args */
- mod->args = strndup_user(uargs, ~0UL >> 1);
- if (IS_ERR(mod->args)) {
- err = PTR_ERR(mod->args);
- goto free_arch_cleanup;
- }
-
- dynamic_debug_setup(info->debug, info->num_debug);
-
- /* Ftrace init must be called in the MODULE_STATE_UNFORMED state */
- ftrace_module_init(mod);
-
- /* Finally it's fully formed, ready to start executing. */
- err = complete_formation(mod, info);
- if (err)
- goto ddebug_cleanup;
-
- /* Module is ready to execute: parsing args may do that. */
- after_dashes = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
- -32768, 32767, unknown_module_param_cb);
- if (IS_ERR(after_dashes)) {
- err = PTR_ERR(after_dashes);
- goto bug_cleanup;
- } else if (after_dashes) {
- pr_warn("%s: parameters '%s' after `--' ignored\n",
- mod->name, after_dashes);
- }
-
- /* Link in to syfs. */
- err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp);
- if (err < 0)
- goto bug_cleanup;
-
- /* Get rid of temporary copy. */
- free_copy(info);
-
- /* Done! */
- trace_module_load(mod);
-
- return do_init_module(mod);
-
- bug_cleanup:
- /* module_bug_cleanup needs module_mutex protection */
- mutex_lock(&module_mutex);
- module_bug_cleanup(mod);
- mutex_unlock(&module_mutex);
-
- blocking_notifier_call_chain(&module_notify_list,
- MODULE_STATE_GOING, mod);
-
- /* we can't deallocate the module until we clear memory protection */
- unset_module_init_ro_nx(mod);
- unset_module_core_ro_nx(mod);
-
- ddebug_cleanup:
- dynamic_debug_remove(info->debug);
- synchronize_sched();
- kfree(mod->args);
- free_arch_cleanup:
- module_arch_cleanup(mod);
- free_modinfo:
- free_modinfo(mod);
- free_unload:
- module_unload_free(mod);
- unlink_mod:
- mutex_lock(&module_mutex);
- /* Unlink carefully: kallsyms could be walking list. */
- list_del_rcu(&mod->list);
- wake_up_all(&module_wq);
- /* Wait for RCU synchronizing before releasing mod->list. */
- synchronize_rcu();
- mutex_unlock(&module_mutex);
- free_module:
- /* Free lock-classes; relies on the preceding sync_rcu() */
- lockdep_free_key_range(mod->module_core, mod->core_size);
-
- module_deallocate(mod, info);
- free_copy:
- free_copy(info);
- return err;
-}
-
-SYSCALL_DEFINE3(init_module, void __user *, umod,
- unsigned long, len, const char __user *, uargs)
-{
- int err;
- struct load_info info = { };
-
- err = may_init_module();
- if (err)
- return err;
-
- pr_debug("init_module: umod=%p, len=%lu, uargs=%p\n",
- umod, len, uargs);
-
- err = copy_module_from_user(umod, len, &info);
- if (err)
- return err;
-
- return load_module(&info, uargs, 0);
-}
-
-SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags)
-{
- int err;
- struct load_info info = { };
-
- err = may_init_module();
- if (err)
- return err;
-
- pr_debug("finit_module: fd=%d, uargs=%p, flags=%i\n", fd, uargs, flags);
-
- if (flags & ~(MODULE_INIT_IGNORE_MODVERSIONS
- |MODULE_INIT_IGNORE_VERMAGIC))
- return -EINVAL;
-
- err = copy_module_from_fd(fd, &info);
- if (err)
- return err;
-
- return load_module(&info, uargs, flags);
-}
-
-static inline int within(unsigned long addr, void *start, unsigned long size)
-{
- return ((void *)addr >= start && (void *)addr < start + size);
-}
-
-#ifdef CONFIG_KALLSYMS
-/*
- * This ignores the intensely annoying "mapping symbols" found
- * in ARM ELF files: $a, $t and $d.
- */
-static inline int is_arm_mapping_symbol(const char *str)
-{
- if (str[0] == '.' && str[1] == 'L')
- return true;
- return str[0] == '$' && strchr("axtd", str[1])
- && (str[2] == '\0' || str[2] == '.');
-}
-
-static const char *get_ksymbol(struct module *mod,
- unsigned long addr,
- unsigned long *size,
- unsigned long *offset)
-{
- unsigned int i, best = 0;
- unsigned long nextval;
-
- /* At worse, next value is at end of module */
- if (within_module_init(addr, mod))
- nextval = (unsigned long)mod->module_init+mod->init_text_size;
- else
- nextval = (unsigned long)mod->module_core+mod->core_text_size;
-
- /* Scan for closest preceding symbol, and next symbol. (ELF
- starts real symbols at 1). */
- for (i = 1; i < mod->num_symtab; i++) {
- if (mod->symtab[i].st_shndx == SHN_UNDEF)
- continue;
-
- /* We ignore unnamed symbols: they're uninformative
- * and inserted at a whim. */
- if (mod->symtab[i].st_value <= addr
- && mod->symtab[i].st_value > mod->symtab[best].st_value
- && *(mod->strtab + mod->symtab[i].st_name) != '\0'
- && !is_arm_mapping_symbol(mod->strtab + mod->symtab[i].st_name))
- best = i;
- if (mod->symtab[i].st_value > addr
- && mod->symtab[i].st_value < nextval
- && *(mod->strtab + mod->symtab[i].st_name) != '\0'
- && !is_arm_mapping_symbol(mod->strtab + mod->symtab[i].st_name))
- nextval = mod->symtab[i].st_value;
- }
-
- if (!best)
- return NULL;
-
- if (size)
- *size = nextval - mod->symtab[best].st_value;
- if (offset)
- *offset = addr - mod->symtab[best].st_value;
- return mod->strtab + mod->symtab[best].st_name;
-}
-
-/* For kallsyms to ask for address resolution. NULL means not found. Careful
- * not to lock to avoid deadlock on oopses, simply disable preemption. */
-const char *module_address_lookup(unsigned long addr,
- unsigned long *size,
- unsigned long *offset,
- char **modname,
- char *namebuf)
-{
- struct module *mod;
- const char *ret = NULL;
-
- preempt_disable();
- list_for_each_entry_rcu(mod, &modules, list) {
- if (mod->state == MODULE_STATE_UNFORMED)
- continue;
- if (within_module(addr, mod)) {
- if (modname)
- *modname = mod->name;
- ret = get_ksymbol(mod, addr, size, offset);
- break;
- }
- }
- /* Make a copy in here where it's safe */
- if (ret) {
- strncpy(namebuf, ret, KSYM_NAME_LEN - 1);
- ret = namebuf;
- }
- preempt_enable();
- return ret;
-}
-
-int lookup_module_symbol_name(unsigned long addr, char *symname)
-{
- struct module *mod;
-
- preempt_disable();
- list_for_each_entry_rcu(mod, &modules, list) {
- if (mod->state == MODULE_STATE_UNFORMED)
- continue;
- if (within_module(addr, mod)) {
- const char *sym;
-
- sym = get_ksymbol(mod, addr, NULL, NULL);
- if (!sym)
- goto out;
- strlcpy(symname, sym, KSYM_NAME_LEN);
- preempt_enable();
- return 0;
- }
- }
-out:
- preempt_enable();
- return -ERANGE;
-}
-
-int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size,
- unsigned long *offset, char *modname, char *name)
-{
- struct module *mod;
-
- preempt_disable();
- list_for_each_entry_rcu(mod, &modules, list) {
- if (mod->state == MODULE_STATE_UNFORMED)
- continue;
- if (within_module(addr, mod)) {
- const char *sym;
-
- sym = get_ksymbol(mod, addr, size, offset);
- if (!sym)
- goto out;
- if (modname)
- strlcpy(modname, mod->name, MODULE_NAME_LEN);
- if (name)
- strlcpy(name, sym, KSYM_NAME_LEN);
- preempt_enable();
- return 0;
- }
- }
-out:
- preempt_enable();
- return -ERANGE;
-}
-
-int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
- char *name, char *module_name, int *exported)
-{
- struct module *mod;
-
- preempt_disable();
- list_for_each_entry_rcu(mod, &modules, list) {
- if (mod->state == MODULE_STATE_UNFORMED)
- continue;
- if (symnum < mod->num_symtab) {
- *value = mod->symtab[symnum].st_value;
- *type = mod->symtab[symnum].st_info;
- strlcpy(name, mod->strtab + mod->symtab[symnum].st_name,
- KSYM_NAME_LEN);
- strlcpy(module_name, mod->name, MODULE_NAME_LEN);
- *exported = is_exported(name, *value, mod);
- preempt_enable();
- return 0;
- }
- symnum -= mod->num_symtab;
- }
- preempt_enable();
- return -ERANGE;
-}
-
-static unsigned long mod_find_symname(struct module *mod, const char *name)
-{
- unsigned int i;
-
- for (i = 0; i < mod->num_symtab; i++)
- if (strcmp(name, mod->strtab+mod->symtab[i].st_name) == 0 &&
- mod->symtab[i].st_info != 'U')
- return mod->symtab[i].st_value;
- return 0;
-}
-
-/* Look for this name: can be of form module:name. */
-unsigned long module_kallsyms_lookup_name(const char *name)
-{
- struct module *mod;
- char *colon;
- unsigned long ret = 0;
-
- /* Don't lock: we're in enough trouble already. */
- preempt_disable();
- if ((colon = strchr(name, ':')) != NULL) {
- if ((mod = find_module_all(name, colon - name, false)) != NULL)
- ret = mod_find_symname(mod, colon+1);
- } else {
- list_for_each_entry_rcu(mod, &modules, list) {
- if (mod->state == MODULE_STATE_UNFORMED)
- continue;
- if ((ret = mod_find_symname(mod, name)) != 0)
- break;
- }
- }
- preempt_enable();
- return ret;
-}
-
-int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
- struct module *, unsigned long),
- void *data)
-{
- struct module *mod;
- unsigned int i;
- int ret;
-
- list_for_each_entry(mod, &modules, list) {
- if (mod->state == MODULE_STATE_UNFORMED)
- continue;
- for (i = 0; i < mod->num_symtab; i++) {
- ret = fn(data, mod->strtab + mod->symtab[i].st_name,
- mod, mod->symtab[i].st_value);
- if (ret != 0)
- return ret;
- }
- }
- return 0;
-}
-#endif /* CONFIG_KALLSYMS */
-
-static char *module_flags(struct module *mod, char *buf)
-{
- int bx = 0;
-
- BUG_ON(mod->state == MODULE_STATE_UNFORMED);
- if (mod->taints ||
- mod->state == MODULE_STATE_GOING ||
- mod->state == MODULE_STATE_COMING) {
- buf[bx++] = '(';
- bx += module_flags_taint(mod, buf + bx);
- /* Show a - for module-is-being-unloaded */
- if (mod->state == MODULE_STATE_GOING)
- buf[bx++] = '-';
- /* Show a + for module-is-being-loaded */
- if (mod->state == MODULE_STATE_COMING)
- buf[bx++] = '+';
- buf[bx++] = ')';
- }
- buf[bx] = '\0';
-
- return buf;
-}
-
-#ifdef CONFIG_PROC_FS
-/* Called by the /proc file system to return a list of modules. */
-static void *m_start(struct seq_file *m, loff_t *pos)
-{
- mutex_lock(&module_mutex);
- return seq_list_start(&modules, *pos);
-}
-
-static void *m_next(struct seq_file *m, void *p, loff_t *pos)
-{
- return seq_list_next(p, &modules, pos);
-}
-
-static void m_stop(struct seq_file *m, void *p)
-{
- mutex_unlock(&module_mutex);
-}
-
-static int m_show(struct seq_file *m, void *p)
-{
- struct module *mod = list_entry(p, struct module, list);
- char buf[8];
-
- /* We always ignore unformed modules. */
- if (mod->state == MODULE_STATE_UNFORMED)
- return 0;
-
- seq_printf(m, "%s %u",
- mod->name, mod->init_size + mod->core_size);
- print_unload_info(m, mod);
-
- /* Informative for users. */
- seq_printf(m, " %s",
- mod->state == MODULE_STATE_GOING ? "Unloading" :
- mod->state == MODULE_STATE_COMING ? "Loading" :
- "Live");
- /* Used by oprofile and other similar tools. */
- seq_printf(m, " 0x%pK", mod->module_core);
-
- /* Taints info */
- if (mod->taints)
- seq_printf(m, " %s", module_flags(mod, buf));
-
- seq_puts(m, "\n");
- return 0;
-}
-
-/* Format: modulename size refcount deps address
-
- Where refcount is a number or -, and deps is a comma-separated list
- of depends or -.
-*/
-static const struct seq_operations modules_op = {
- .start = m_start,
- .next = m_next,
- .stop = m_stop,
- .show = m_show
-};
-
-static int modules_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &modules_op);
-}
-
-static const struct file_operations proc_modules_operations = {
- .open = modules_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
-static int __init proc_modules_init(void)
-{
- proc_create("modules", 0, NULL, &proc_modules_operations);
- return 0;
-}
-module_init(proc_modules_init);
-#endif
-
-/* Given an address, look for it in the module exception tables. */
-const struct exception_table_entry *search_module_extables(unsigned long addr)
-{
- const struct exception_table_entry *e = NULL;
- struct module *mod;
-
- preempt_disable();
- list_for_each_entry_rcu(mod, &modules, list) {
- if (mod->state == MODULE_STATE_UNFORMED)
- continue;
- if (mod->num_exentries == 0)
- continue;
-
- e = search_extable(mod->extable,
- mod->extable + mod->num_exentries - 1,
- addr);
- if (e)
- break;
- }
- preempt_enable();
-
- /* Now, if we found one, we are running inside it now, hence
- we cannot unload the module, hence no refcnt needed. */
- return e;
-}
-
-/*
- * is_module_address - is this address inside a module?
- * @addr: the address to check.
- *
- * See is_module_text_address() if you simply want to see if the address
- * is code (not data).
- */
-bool is_module_address(unsigned long addr)
-{
- bool ret;
-
- preempt_disable();
- ret = __module_address(addr) != NULL;
- preempt_enable();
-
- return ret;
-}
-
-/*
- * __module_address - get the module which contains an address.
- * @addr: the address.
- *
- * Must be called with preempt disabled or module mutex held so that
- * module doesn't get freed during this.
- */
-struct module *__module_address(unsigned long addr)
-{
- struct module *mod;
-
- if (addr < module_addr_min || addr > module_addr_max)
- return NULL;
-
- list_for_each_entry_rcu(mod, &modules, list) {
- if (mod->state == MODULE_STATE_UNFORMED)
- continue;
- if (within_module(addr, mod))
- return mod;
- }
- return NULL;
-}
-EXPORT_SYMBOL_GPL(__module_address);
-
-/*
- * is_module_text_address - is this address inside module code?
- * @addr: the address to check.
- *
- * See is_module_address() if you simply want to see if the address is
- * anywhere in a module. See kernel_text_address() for testing if an
- * address corresponds to kernel or module code.
- */
-bool is_module_text_address(unsigned long addr)
-{
- bool ret;
-
- preempt_disable();
- ret = __module_text_address(addr) != NULL;
- preempt_enable();
-
- return ret;
-}
-
-/*
- * __module_text_address - get the module whose code contains an address.
- * @addr: the address.
- *
- * Must be called with preempt disabled or module mutex held so that
- * module doesn't get freed during this.
- */
-struct module *__module_text_address(unsigned long addr)
-{
- struct module *mod = __module_address(addr);
- if (mod) {
- /* Make sure it's within the text section. */
- if (!within(addr, mod->module_init, mod->init_text_size)
- && !within(addr, mod->module_core, mod->core_text_size))
- mod = NULL;
- }
- return mod;
-}
-EXPORT_SYMBOL_GPL(__module_text_address);
-
-/* Don't grab lock, we're oopsing. */
-void print_modules(void)
-{
- struct module *mod;
- char buf[8];
-
- printk(KERN_DEFAULT "Modules linked in:");
- /* Most callers should already have preempt disabled, but make sure */
- preempt_disable();
- list_for_each_entry_rcu(mod, &modules, list) {
- if (mod->state == MODULE_STATE_UNFORMED)
- continue;
- pr_cont(" %s%s", mod->name, module_flags(mod, buf));
- }
- preempt_enable();
- if (last_unloaded_module[0])
- pr_cont(" [last unloaded: %s]", last_unloaded_module);
- pr_cont("\n");
-}
-
-#ifdef CONFIG_MODVERSIONS
-/* Generate the signature for all relevant module structures here.
- * If these change, we don't want to try to parse the module. */
-void module_layout(struct module *mod,
- struct modversion_info *ver,
- struct kernel_param *kp,
- struct kernel_symbol *ks,
- struct tracepoint * const *tp)
-{
-}
-EXPORT_SYMBOL(module_layout);
-#endif
diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig
new file mode 100644
index 000000000000..2a1beebf1d37
--- /dev/null
+++ b/kernel/module/Kconfig
@@ -0,0 +1,465 @@
+# SPDX-License-Identifier: GPL-2.0-only
+menuconfig MODULES
+ bool "Enable loadable module support"
+ modules
+ select EXECMEM
+ help
+ Kernel modules are small pieces of compiled code which can
+ be inserted in the running kernel, rather than being
+ permanently built into the kernel. You use the "modprobe"
+ tool to add (and sometimes remove) them. If you say Y here,
+ many parts of the kernel can be built as modules (by
+ answering M instead of Y where indicated): this is most
+ useful for infrequently used options which are not required
+ for booting. For more information, see the man pages for
+ modprobe, lsmod, modinfo, insmod and rmmod.
+
+ If you say Y here, you will need to run "make
+ modules_install" to put the modules under /lib/modules/
+ where modprobe can find them (you may need to be root to do
+ this).
+
+ If unsure, say Y.
+
+if MODULES
+
+config MODULE_DEBUGFS
+ bool
+
+config MODULE_DEBUG
+ bool "Module debugging"
+ depends on DEBUG_FS
+ help
+ Allows you to enable / disable features which can help you debug
+ modules. You don't need these options on production systems.
+
+if MODULE_DEBUG
+
+config MODULE_STATS
+ bool "Module statistics"
+ depends on DEBUG_FS
+ select MODULE_DEBUGFS
+ help
+ This option allows you to maintain a record of module statistics.
+ For example, size of all modules, average size, text size, a list
+ of failed modules and the size for each of those. For failed
+ modules we keep track of modules which failed due to either the
+ existing module taking too long to load or that module was already
+ loaded.
+
+ You should enable this if you are debugging production loads
+ and want to see if userspace or the kernel is doing stupid things
+ with loading modules when it shouldn't or if you want to help
+ optimize userspace / kernel space module autoloading schemes.
+ You might want to do this because failed modules tend to use
+ up significant amount of memory, and so you'd be doing everyone a
+ favor in avoiding these failures proactively.
+
+ This functionality is also useful for those experimenting with
+ module .text ELF section optimization.
+
+ If unsure, say N.
+
+config MODULE_DEBUG_AUTOLOAD_DUPS
+ bool "Debug duplicate modules with auto-loading"
+ help
+ Module autoloading allows in-kernel code to request modules through
+ the *request_module*() API calls. This in turn just calls userspace
+ modprobe. Although modprobe checks to see if a module is already
+ loaded before trying to load a module there is a small time window in
+ which multiple duplicate requests can end up in userspace and multiple
+ modprobe calls race calling finit_module() around the same time for
+ duplicate modules. The finit_module() system call can consume in the
+ worst case more than twice the respective module size in virtual
+ memory for each duplicate module requests. Although duplicate module
+ requests are non-fatal virtual memory is a limited resource and each
+ duplicate module request ends up just unnecessarily straining virtual
+ memory.
+
+ This debugging facility will create pr_warn() splats for duplicate
+ module requests to help identify if module auto-loading may be the
+ culprit to your early boot virtual memory pressure. Since virtual
+ memory abuse caused by duplicate module requests could render a
+ system unusable this functionality will also converge races in
+ requests for the same module to a single request. You can boot with
+ the module.enable_dups_trace=1 kernel parameter to use WARN_ON()
+ instead of the pr_warn().
+
+ If the first module request used request_module_nowait() we cannot
+ use that as the anchor to wait for duplicate module requests, since
+ users of request_module() do want a proper return value. If a call
+ for the same module happened earlier with request_module() though,
+ then a duplicate request_module_nowait() would be detected. The
+ non-wait request_module() call is synchronous and waits until modprobe
+ completes. Subsequent auto-loading requests for the same module do
+ not trigger a new finit_module() calls and do not strain virtual
+ memory, and so as soon as modprobe successfully completes we remove
+ tracking for duplicates for that module.
+
+ Enable this functionality to try to debug virtual memory abuse during
+ boot on systems which are failing to boot or if you suspect you may be
+ straining virtual memory during boot, and you want to identify if the
+ abuse was due to module auto-loading. These issues are currently only
+ known to occur on systems with many CPUs (over 400) and is likely the
+ result of udev issuing duplicate module requests for each CPU, and so
+ module auto-loading is not the culprit. There may very well still be
+ many duplicate module auto-loading requests which could be optimized
+ for and this debugging facility can be used to help identify them.
+
+ Only enable this for debugging system functionality, never have it
+ enabled on real systems.
+
+config MODULE_DEBUG_AUTOLOAD_DUPS_TRACE
+ bool "Force full stack trace when duplicates are found"
+ depends on MODULE_DEBUG_AUTOLOAD_DUPS
+ help
+ Enabling this will force a full stack trace for duplicate module
+ auto-loading requests using WARN_ON() instead of pr_warn(). You
+ should keep this disabled at all times unless you are a developer
+ and are doing a manual inspection and want to debug exactly why
+ these duplicates occur.
+
+endif # MODULE_DEBUG
+
+config MODULE_FORCE_LOAD
+ bool "Forced module loading"
+ default n
+ help
+ Allow loading of modules without version information (ie. modprobe
+ --force). Forced module loading sets the 'F' (forced) taint flag and
+ is usually a really bad idea.
+
+config MODULE_UNLOAD
+ bool "Module unloading"
+ help
+ Without this option you will not be able to unload any
+ modules (note that some modules may not be unloadable
+ anyway), which makes your kernel smaller, faster
+ and simpler. If unsure, say Y.
+
+config MODULE_FORCE_UNLOAD
+ bool "Forced module unloading"
+ depends on MODULE_UNLOAD
+ help
+ This option allows you to force a module to unload, even if the
+ kernel believes it is unsafe: the kernel will remove the module
+ without waiting for anyone to stop using it (using the -f option to
+ rmmod). This is mainly for kernel developers and desperate users.
+ If unsure, say N.
+
+config MODULE_UNLOAD_TAINT_TRACKING
+ bool "Tainted module unload tracking"
+ depends on MODULE_UNLOAD
+ select MODULE_DEBUGFS
+ help
+ This option allows you to maintain a record of each unloaded
+ module that tainted the kernel. In addition to displaying a
+ list of linked (or loaded) modules e.g. on detection of a bad
+ page (see bad_page()), the aforementioned details are also
+ shown. If unsure, say N.
+
+config MODVERSIONS
+ bool "Module versioning support"
+ depends on !COMPILE_TEST
+ help
+ Usually, you have to use modules compiled with your kernel.
+ Saying Y here makes it sometimes possible to use modules
+ compiled for different kernels, by adding enough information
+ to the modules to (hopefully) spot any changes which would
+ make them incompatible with the kernel you are running. If
+ unsure, say N.
+
+choice
+ prompt "Module versioning implementation"
+ depends on MODVERSIONS
+ help
+ Select the tool used to calculate symbol versions for modules.
+
+ If unsure, select GENKSYMS.
+
+config GENKSYMS
+ bool "genksyms (from source code)"
+ help
+ Calculate symbol versions from pre-processed source code using
+ genksyms.
+
+ If unsure, say Y.
+
+config GENDWARFKSYMS
+ bool "gendwarfksyms (from debugging information)"
+ depends on DEBUG_INFO
+ # Requires full debugging information, split DWARF not supported.
+ depends on !DEBUG_INFO_REDUCED && !DEBUG_INFO_SPLIT
+ # Requires ELF object files.
+ depends on !LTO
+ # To avoid conflicts with the discarded __gendwarfksyms_ptr symbols on
+ # X86, requires pahole before commit 47dcb534e253 ("btf_encoder: Stop
+ # indexing symbols for VARs") or after commit 9810758003ce ("btf_encoder:
+ # Verify 0 address DWARF variables are in ELF section").
+ depends on !X86 || !DEBUG_INFO_BTF || PAHOLE_VERSION < 128 || PAHOLE_VERSION > 129
+ help
+ Calculate symbol versions from DWARF debugging information using
+ gendwarfksyms. Requires DEBUG_INFO to be enabled.
+
+ If unsure, say N.
+endchoice
+
+config ASM_MODVERSIONS
+ bool
+ default HAVE_ASM_MODVERSIONS && MODVERSIONS
+ help
+ This enables module versioning for exported symbols also from
+ assembly. This can be enabled only when the target architecture
+ supports it.
+
+config EXTENDED_MODVERSIONS
+ bool "Extended Module Versioning Support"
+ depends on MODVERSIONS
+ help
+ This enables extended MODVERSIONs support, allowing long symbol
+ names to be versioned.
+
+ The most likely reason you would enable this is to enable Rust
+ support. If unsure, say N.
+
+config BASIC_MODVERSIONS
+ bool "Basic Module Versioning Support"
+ depends on MODVERSIONS
+ default y
+ help
+ This enables basic MODVERSIONS support, allowing older tools or
+ kernels to potentially load modules.
+
+ Disabling this may cause older `modprobe` or `kmod` to be unable
+ to read MODVERSIONS information from built modules. With this
+ disabled, older kernels may treat this module as unversioned.
+
+ This is enabled by default when MODVERSIONS are enabled.
+ If unsure, say Y.
+
+config MODULE_SRCVERSION_ALL
+ bool "Source checksum for all modules"
+ help
+ Modules which contain a MODULE_VERSION get an extra "srcversion"
+ field inserted into their modinfo section, which contains a
+ sum of the source files which made it. This helps maintainers
+ see exactly which source was used to build a module (since
+ others sometimes change the module source without updating
+ the version). With this option, such a "srcversion" field
+ will be created for all modules. If unsure, say N.
+
+config MODULE_SIG
+ bool "Module signature verification"
+ select MODULE_SIG_FORMAT
+ help
+ Check modules for valid signatures upon load: the signature
+ is simply appended to the module. For more information see
+ <file:Documentation/admin-guide/module-signing.rst>.
+
+ Note that this option adds the OpenSSL development packages as a
+ kernel build dependency so that the signing tool can use its crypto
+ library.
+
+ You should enable this option if you wish to use either
+ CONFIG_SECURITY_LOCKDOWN_LSM or lockdown functionality imposed via
+ another LSM - otherwise unsigned modules will be loadable regardless
+ of the lockdown policy.
+
+ !!!WARNING!!! If you enable this option, you MUST make sure that the
+ module DOES NOT get stripped after being signed. This includes the
+ debuginfo strip done by some packagers (such as rpmbuild) and
+ inclusion into an initramfs that wants the module size reduced.
+
+config MODULE_SIG_FORCE
+ bool "Require modules to be validly signed"
+ depends on MODULE_SIG
+ help
+ Reject unsigned modules or signed modules for which we don't have a
+ key. Without this, such modules will simply taint the kernel.
+
+config MODULE_SIG_ALL
+ bool "Automatically sign all modules"
+ default y
+ depends on MODULE_SIG || IMA_APPRAISE_MODSIG
+ help
+ Sign all modules during make modules_install. Without this option,
+ modules must be signed manually, using the scripts/sign-file tool.
+
+comment "Do not forget to sign required modules with scripts/sign-file"
+ depends on MODULE_SIG_FORCE && !MODULE_SIG_ALL
+
+choice
+ prompt "Hash algorithm to sign modules"
+ depends on MODULE_SIG || IMA_APPRAISE_MODSIG
+ default MODULE_SIG_SHA512
+ help
+ This determines which sort of hashing algorithm will be used during
+ signature generation. This algorithm _must_ be built into the kernel
+ directly so that signature verification can take place. It is not
+ possible to load a signed module containing the algorithm to check
+ the signature on that module.
+
+config MODULE_SIG_SHA1
+ bool "SHA-1"
+ select CRYPTO_SHA1
+
+config MODULE_SIG_SHA256
+ bool "SHA-256"
+ select CRYPTO_SHA256
+
+config MODULE_SIG_SHA384
+ bool "SHA-384"
+ select CRYPTO_SHA512
+
+config MODULE_SIG_SHA512
+ bool "SHA-512"
+ select CRYPTO_SHA512
+
+config MODULE_SIG_SHA3_256
+ bool "SHA3-256"
+ select CRYPTO_SHA3
+
+config MODULE_SIG_SHA3_384
+ bool "SHA3-384"
+ select CRYPTO_SHA3
+
+config MODULE_SIG_SHA3_512
+ bool "SHA3-512"
+ select CRYPTO_SHA3
+
+endchoice
+
+config MODULE_SIG_HASH
+ string
+ depends on MODULE_SIG || IMA_APPRAISE_MODSIG
+ default "sha1" if MODULE_SIG_SHA1
+ default "sha256" if MODULE_SIG_SHA256
+ default "sha384" if MODULE_SIG_SHA384
+ default "sha512" if MODULE_SIG_SHA512
+ default "sha3-256" if MODULE_SIG_SHA3_256
+ default "sha3-384" if MODULE_SIG_SHA3_384
+ default "sha3-512" if MODULE_SIG_SHA3_512
+
+config MODULE_COMPRESS
+ bool "Module compression"
+ help
+ Enable module compression to reduce on-disk size of module binaries.
+ This is fully compatible with signed modules.
+
+ The tool used to work with modules needs to support the selected
+ compression type. kmod MAY support gzip, xz and zstd. Other tools
+ might have a limited selection of the supported types.
+
+ Note that for modules inside an initrd or initramfs, it's more
+ efficient to compress the whole ramdisk instead.
+
+ If unsure, say N.
+
+choice
+ prompt "Module compression type"
+ depends on MODULE_COMPRESS
+ help
+ Choose the supported algorithm for module compression.
+
+config MODULE_COMPRESS_GZIP
+ bool "GZIP"
+ help
+ Support modules compressed with GZIP. The installed modules are
+ suffixed with .ko.gz.
+
+config MODULE_COMPRESS_XZ
+ bool "XZ"
+ help
+ Support modules compressed with XZ. The installed modules are
+ suffixed with .ko.xz.
+
+config MODULE_COMPRESS_ZSTD
+ bool "ZSTD"
+ help
+ Support modules compressed with ZSTD. The installed modules are
+ suffixed with .ko.zst.
+
+endchoice
+
+config MODULE_COMPRESS_ALL
+ bool "Automatically compress all modules"
+ default y
+ depends on MODULE_COMPRESS
+ help
+ Compress all modules during 'make modules_install'.
+
+ Your build system needs to provide the appropriate compression tool
+ for the selected compression type. External modules will also be
+ compressed in the same way during the installation.
+
+config MODULE_DECOMPRESS
+ bool "Support in-kernel module decompression"
+ depends on MODULE_COMPRESS
+ select ZLIB_INFLATE if MODULE_COMPRESS_GZIP
+ select XZ_DEC if MODULE_COMPRESS_XZ
+ select ZSTD_DECOMPRESS if MODULE_COMPRESS_ZSTD
+ help
+ Support for decompressing kernel modules by the kernel itself
+ instead of relying on userspace to perform this task. Useful when
+ load pinning security policy is enabled.
+
+ If unsure, say N.
+
+config MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS
+ bool "Allow loading of modules with missing namespace imports"
+ help
+ Symbols exported with EXPORT_SYMBOL_NS*() are considered exported in
+ a namespace. A module that makes use of a symbol exported with such a
+ namespace is required to import the namespace via MODULE_IMPORT_NS("").
+ There is no technical reason to enforce correct namespace imports,
+ but it creates consistency between symbols defining namespaces and
+ users importing namespaces they make use of. This option relaxes this
+ requirement and lifts the enforcement when loading a module.
+
+ If unsure, say N.
+
+config MODPROBE_PATH
+ string "Path to modprobe binary"
+ default "/sbin/modprobe"
+ help
+ When kernel code requests a module, it does so by calling
+ the "modprobe" userspace utility. This option allows you to
+ set the path where that binary is found. This can be changed
+ at runtime via the sysctl file
+ /proc/sys/kernel/modprobe. Setting this to the empty string
+ removes the kernel's ability to request modules (but
+ userspace can still load modules explicitly).
+
+config TRIM_UNUSED_KSYMS
+ bool "Trim unused exported kernel symbols"
+ help
+ The kernel and some modules make many symbols available for
+ other modules to use via EXPORT_SYMBOL() and variants. Depending
+ on the set of modules being selected in your kernel configuration,
+ many of those exported symbols might never be used.
+
+ This option allows for unused exported symbols to be dropped from
+ the build. In turn, this provides the compiler more opportunities
+ (especially when using LTO) for optimizing the code and reducing
+ binary size. This might have some security advantages as well.
+
+ If unsure, or if you need to build out-of-tree modules, say N.
+
+config UNUSED_KSYMS_WHITELIST
+ string "Whitelist of symbols to keep in ksymtab"
+ depends on TRIM_UNUSED_KSYMS
+ help
+ By default, all unused exported symbols will be un-exported from the
+ build when TRIM_UNUSED_KSYMS is selected.
+
+ UNUSED_KSYMS_WHITELIST allows to whitelist symbols that must be kept
+ exported at all times, even in absence of in-tree users. The value to
+ set here is the path to a text file containing the list of symbols,
+ one per line. The path can be absolute, or relative to the kernel
+ source or obj tree.
+
+config MODULES_TREE_LOOKUP
+ def_bool y
+ depends on PERF_EVENTS || TRACING || CFI
+
+endif # MODULES
diff --git a/kernel/module/Makefile b/kernel/module/Makefile
new file mode 100644
index 000000000000..50ffcc413b54
--- /dev/null
+++ b/kernel/module/Makefile
@@ -0,0 +1,25 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Makefile for linux kernel module support
+#
+
+# These are called from save_stack_trace() on slub debug path,
+# and produce insane amounts of uninteresting coverage.
+KCOV_INSTRUMENT_main.o := n
+
+obj-y += main.o
+obj-y += strict_rwx.o
+obj-y += kmod.o
+obj-$(CONFIG_MODULE_DEBUG_AUTOLOAD_DUPS) += dups.o
+obj-$(CONFIG_MODULE_DECOMPRESS) += decompress.o
+obj-$(CONFIG_MODULE_SIG) += signing.o
+obj-$(CONFIG_LIVEPATCH) += livepatch.o
+obj-$(CONFIG_MODULES_TREE_LOOKUP) += tree_lookup.o
+obj-$(CONFIG_DEBUG_KMEMLEAK) += debug_kmemleak.o
+obj-$(CONFIG_KALLSYMS) += kallsyms.o
+obj-$(CONFIG_PROC_FS) += procfs.o
+obj-$(CONFIG_SYSFS) += sysfs.o
+obj-$(CONFIG_KGDB_KDB) += kdb.o
+obj-$(CONFIG_MODVERSIONS) += version.o
+obj-$(CONFIG_MODULE_UNLOAD_TAINT_TRACKING) += tracking.o
+obj-$(CONFIG_MODULE_STATS) += stats.o
diff --git a/kernel/module/debug_kmemleak.c b/kernel/module/debug_kmemleak.c
new file mode 100644
index 000000000000..df873dad049d
--- /dev/null
+++ b/kernel/module/debug_kmemleak.c
@@ -0,0 +1,21 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Module kmemleak support
+ *
+ * Copyright (C) 2009 Catalin Marinas
+ */
+
+#include <linux/module.h>
+#include <linux/kmemleak.h>
+#include "internal.h"
+
+void kmemleak_load_module(const struct module *mod,
+ const struct load_info *info)
+{
+ /* only scan writable, non-executable sections */
+ for_each_mod_mem_type(type) {
+ if (type != MOD_DATA && type != MOD_INIT_DATA &&
+ !mod->mem[type].is_rox)
+ kmemleak_no_scan(mod->mem[type].base);
+ }
+}
diff --git a/kernel/module/decompress.c b/kernel/module/decompress.c
new file mode 100644
index 000000000000..474e68f0f063
--- /dev/null
+++ b/kernel/module/decompress.c
@@ -0,0 +1,368 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2021 Google LLC.
+ */
+
+#include <linux/init.h>
+#include <linux/highmem.h>
+#include <linux/kobject.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/sysfs.h>
+#include <linux/vmalloc.h>
+
+#include "internal.h"
+
+static int module_extend_max_pages(struct load_info *info, unsigned int extent)
+{
+ struct page **new_pages;
+
+ new_pages = kvmalloc_array(info->max_pages + extent,
+ sizeof(info->pages), GFP_KERNEL);
+ if (!new_pages)
+ return -ENOMEM;
+
+ memcpy(new_pages, info->pages, info->max_pages * sizeof(info->pages));
+ kvfree(info->pages);
+ info->pages = new_pages;
+ info->max_pages += extent;
+
+ return 0;
+}
+
+static struct page *module_get_next_page(struct load_info *info)
+{
+ struct page *page;
+ int error;
+
+ if (info->max_pages == info->used_pages) {
+ error = module_extend_max_pages(info, info->used_pages);
+ if (error)
+ return ERR_PTR(error);
+ }
+
+ page = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
+ if (!page)
+ return ERR_PTR(-ENOMEM);
+
+ info->pages[info->used_pages++] = page;
+ return page;
+}
+
+#if defined(CONFIG_MODULE_COMPRESS_GZIP)
+#include <linux/zlib.h>
+#define MODULE_COMPRESSION gzip
+#define MODULE_DECOMPRESS_FN module_gzip_decompress
+
+/*
+ * Calculate length of the header which consists of signature, header
+ * flags, time stamp and operating system ID (10 bytes total), plus
+ * an optional filename.
+ */
+static size_t module_gzip_header_len(const u8 *buf, size_t size)
+{
+ const u8 signature[] = { 0x1f, 0x8b, 0x08 };
+ size_t len = 10;
+
+ if (size < len || memcmp(buf, signature, sizeof(signature)))
+ return 0;
+
+ if (buf[3] & 0x08) {
+ do {
+ /*
+ * If we can't find the end of the file name we must
+ * be dealing with a corrupted file.
+ */
+ if (len == size)
+ return 0;
+ } while (buf[len++] != '\0');
+ }
+
+ return len;
+}
+
+static ssize_t module_gzip_decompress(struct load_info *info,
+ const void *buf, size_t size)
+{
+ struct z_stream_s s = { 0 };
+ size_t new_size = 0;
+ size_t gzip_hdr_len;
+ ssize_t retval;
+ int rc;
+
+ gzip_hdr_len = module_gzip_header_len(buf, size);
+ if (!gzip_hdr_len) {
+ pr_err("not a gzip compressed module\n");
+ return -EINVAL;
+ }
+
+ s.next_in = buf + gzip_hdr_len;
+ s.avail_in = size - gzip_hdr_len;
+
+ s.workspace = kvmalloc(zlib_inflate_workspacesize(), GFP_KERNEL);
+ if (!s.workspace)
+ return -ENOMEM;
+
+ rc = zlib_inflateInit2(&s, -MAX_WBITS);
+ if (rc != Z_OK) {
+ pr_err("failed to initialize decompressor: %d\n", rc);
+ retval = -EINVAL;
+ goto out;
+ }
+
+ do {
+ struct page *page = module_get_next_page(info);
+
+ if (IS_ERR(page)) {
+ retval = PTR_ERR(page);
+ goto out_inflate_end;
+ }
+
+ s.next_out = kmap_local_page(page);
+ s.avail_out = PAGE_SIZE;
+ rc = zlib_inflate(&s, 0);
+ kunmap_local(s.next_out);
+
+ new_size += PAGE_SIZE - s.avail_out;
+ } while (rc == Z_OK);
+
+ if (rc != Z_STREAM_END) {
+ pr_err("decompression failed with status %d\n", rc);
+ retval = -EINVAL;
+ goto out_inflate_end;
+ }
+
+ retval = new_size;
+
+out_inflate_end:
+ zlib_inflateEnd(&s);
+out:
+ kvfree(s.workspace);
+ return retval;
+}
+#elif defined(CONFIG_MODULE_COMPRESS_XZ)
+#include <linux/xz.h>
+#define MODULE_COMPRESSION xz
+#define MODULE_DECOMPRESS_FN module_xz_decompress
+
+static ssize_t module_xz_decompress(struct load_info *info,
+ const void *buf, size_t size)
+{
+ static const u8 signature[] = { 0xfd, '7', 'z', 'X', 'Z', 0 };
+ struct xz_dec *xz_dec;
+ struct xz_buf xz_buf;
+ enum xz_ret xz_ret;
+ size_t new_size = 0;
+ ssize_t retval;
+
+ if (size < sizeof(signature) ||
+ memcmp(buf, signature, sizeof(signature))) {
+ pr_err("not an xz compressed module\n");
+ return -EINVAL;
+ }
+
+ xz_dec = xz_dec_init(XZ_DYNALLOC, (u32)-1);
+ if (!xz_dec)
+ return -ENOMEM;
+
+ xz_buf.in_size = size;
+ xz_buf.in = buf;
+ xz_buf.in_pos = 0;
+
+ do {
+ struct page *page = module_get_next_page(info);
+
+ if (IS_ERR(page)) {
+ retval = PTR_ERR(page);
+ goto out;
+ }
+
+ xz_buf.out = kmap_local_page(page);
+ xz_buf.out_pos = 0;
+ xz_buf.out_size = PAGE_SIZE;
+ xz_ret = xz_dec_run(xz_dec, &xz_buf);
+ kunmap_local(xz_buf.out);
+
+ new_size += xz_buf.out_pos;
+ } while (xz_buf.out_pos == PAGE_SIZE && xz_ret == XZ_OK);
+
+ if (xz_ret != XZ_STREAM_END) {
+ pr_err("decompression failed with status %d\n", xz_ret);
+ retval = -EINVAL;
+ goto out;
+ }
+
+ retval = new_size;
+
+ out:
+ xz_dec_end(xz_dec);
+ return retval;
+}
+#elif defined(CONFIG_MODULE_COMPRESS_ZSTD)
+#include <linux/zstd.h>
+#define MODULE_COMPRESSION zstd
+#define MODULE_DECOMPRESS_FN module_zstd_decompress
+
+static ssize_t module_zstd_decompress(struct load_info *info,
+ const void *buf, size_t size)
+{
+ static const u8 signature[] = { 0x28, 0xb5, 0x2f, 0xfd };
+ ZSTD_outBuffer zstd_dec;
+ ZSTD_inBuffer zstd_buf;
+ zstd_frame_header header;
+ size_t wksp_size;
+ void *wksp = NULL;
+ ZSTD_DStream *dstream;
+ size_t ret;
+ size_t new_size = 0;
+ int retval;
+
+ if (size < sizeof(signature) ||
+ memcmp(buf, signature, sizeof(signature))) {
+ pr_err("not a zstd compressed module\n");
+ return -EINVAL;
+ }
+
+ zstd_buf.src = buf;
+ zstd_buf.pos = 0;
+ zstd_buf.size = size;
+
+ ret = zstd_get_frame_header(&header, zstd_buf.src, zstd_buf.size);
+ if (ret != 0) {
+ pr_err("ZSTD-compressed data has an incomplete frame header\n");
+ retval = -EINVAL;
+ goto out;
+ }
+ if (header.windowSize > (1 << ZSTD_WINDOWLOG_MAX)) {
+ pr_err("ZSTD-compressed data has too large a window size\n");
+ retval = -EINVAL;
+ goto out;
+ }
+
+ wksp_size = zstd_dstream_workspace_bound(header.windowSize);
+ wksp = kvmalloc(wksp_size, GFP_KERNEL);
+ if (!wksp) {
+ retval = -ENOMEM;
+ goto out;
+ }
+
+ dstream = zstd_init_dstream(header.windowSize, wksp, wksp_size);
+ if (!dstream) {
+ pr_err("Can't initialize ZSTD stream\n");
+ retval = -ENOMEM;
+ goto out;
+ }
+
+ do {
+ struct page *page = module_get_next_page(info);
+
+ if (IS_ERR(page)) {
+ retval = PTR_ERR(page);
+ goto out;
+ }
+
+ zstd_dec.dst = kmap_local_page(page);
+ zstd_dec.pos = 0;
+ zstd_dec.size = PAGE_SIZE;
+
+ ret = zstd_decompress_stream(dstream, &zstd_dec, &zstd_buf);
+ kunmap_local(zstd_dec.dst);
+ retval = zstd_get_error_code(ret);
+ if (retval)
+ break;
+
+ new_size += zstd_dec.pos;
+ } while (zstd_dec.pos == PAGE_SIZE && ret != 0);
+
+ if (retval) {
+ pr_err("ZSTD-decompression failed with status %d\n", retval);
+ retval = -EINVAL;
+ goto out;
+ }
+
+ retval = new_size;
+
+ out:
+ kvfree(wksp);
+ return retval;
+}
+#else
+#error "Unexpected configuration for CONFIG_MODULE_DECOMPRESS"
+#endif
+
+int module_decompress(struct load_info *info, const void *buf, size_t size)
+{
+ unsigned int n_pages;
+ ssize_t data_size;
+ int error;
+
+#if defined(CONFIG_MODULE_STATS)
+ info->compressed_len = size;
+#endif
+
+ /*
+ * Start with number of pages twice as big as needed for
+ * compressed data.
+ */
+ n_pages = DIV_ROUND_UP(size, PAGE_SIZE) * 2;
+ error = module_extend_max_pages(info, n_pages);
+
+ data_size = MODULE_DECOMPRESS_FN(info, buf, size);
+ if (data_size < 0) {
+ error = data_size;
+ goto err;
+ }
+
+ info->hdr = vmap(info->pages, info->used_pages, VM_MAP, PAGE_KERNEL);
+ if (!info->hdr) {
+ error = -ENOMEM;
+ goto err;
+ }
+
+ info->len = data_size;
+ return 0;
+
+err:
+ module_decompress_cleanup(info);
+ return error;
+}
+
+void module_decompress_cleanup(struct load_info *info)
+{
+ int i;
+
+ if (info->hdr)
+ vunmap(info->hdr);
+
+ for (i = 0; i < info->used_pages; i++)
+ __free_page(info->pages[i]);
+
+ kvfree(info->pages);
+
+ info->pages = NULL;
+ info->max_pages = info->used_pages = 0;
+}
+
+#ifdef CONFIG_SYSFS
+static ssize_t compression_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, __stringify(MODULE_COMPRESSION) "\n");
+}
+
+static struct kobj_attribute module_compression_attr = __ATTR_RO(compression);
+
+static int __init module_decompress_sysfs_init(void)
+{
+ int error;
+
+ error = sysfs_create_file(&module_kset->kobj,
+ &module_compression_attr.attr);
+ if (error)
+ pr_warn("Failed to create 'compression' attribute");
+
+ return 0;
+}
+late_initcall(module_decompress_sysfs_init);
+#endif
diff --git a/kernel/module/dups.c b/kernel/module/dups.c
new file mode 100644
index 000000000000..bd2149fbe117
--- /dev/null
+++ b/kernel/module/dups.c
@@ -0,0 +1,247 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * kmod dups - the kernel module autoloader duplicate suppressor
+ *
+ * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org>
+ */
+
+#define pr_fmt(fmt) "module: " fmt
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/sched/task.h>
+#include <linux/binfmts.h>
+#include <linux/syscalls.h>
+#include <linux/unistd.h>
+#include <linux/kmod.h>
+#include <linux/slab.h>
+#include <linux/completion.h>
+#include <linux/cred.h>
+#include <linux/file.h>
+#include <linux/workqueue.h>
+#include <linux/security.h>
+#include <linux/mount.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/resource.h>
+#include <linux/notifier.h>
+#include <linux/suspend.h>
+#include <linux/rwsem.h>
+#include <linux/ptrace.h>
+#include <linux/async.h>
+#include <linux/uaccess.h>
+
+#include "internal.h"
+
+#undef MODULE_PARAM_PREFIX
+#define MODULE_PARAM_PREFIX "module."
+static bool enable_dups_trace = IS_ENABLED(CONFIG_MODULE_DEBUG_AUTOLOAD_DUPS_TRACE);
+module_param(enable_dups_trace, bool_enable_only, 0644);
+
+/*
+ * Protects dup_kmod_reqs list, adds / removals with RCU.
+ */
+static DEFINE_MUTEX(kmod_dup_mutex);
+static LIST_HEAD(dup_kmod_reqs);
+
+struct kmod_dup_req {
+ struct list_head list;
+ char name[MODULE_NAME_LEN];
+ struct completion first_req_done;
+ struct work_struct complete_work;
+ struct delayed_work delete_work;
+ int dup_ret;
+};
+
+static struct kmod_dup_req *kmod_dup_request_lookup(char *module_name)
+{
+ struct kmod_dup_req *kmod_req;
+
+ list_for_each_entry_rcu(kmod_req, &dup_kmod_reqs, list,
+ lockdep_is_held(&kmod_dup_mutex)) {
+ if (strlen(kmod_req->name) == strlen(module_name) &&
+ !memcmp(kmod_req->name, module_name, strlen(module_name))) {
+ return kmod_req;
+ }
+ }
+
+ return NULL;
+}
+
+static void kmod_dup_request_delete(struct work_struct *work)
+{
+ struct kmod_dup_req *kmod_req;
+ kmod_req = container_of(to_delayed_work(work), struct kmod_dup_req, delete_work);
+
+ /*
+ * The typical situation is a module successully loaded. In that
+ * situation the module will be present already in userspace. If
+ * new requests come in after that, userspace will already know the
+ * module is loaded so will just return 0 right away. There is still
+ * a small chance right after we delete this entry new request_module()
+ * calls may happen after that, they can happen. These heuristics
+ * are to protect finit_module() abuse for auto-loading, if modules
+ * are still tryign to auto-load even if a module is already loaded,
+ * that's on them, and those inneficiencies should not be fixed by
+ * kmod. The inneficies there are a call to modprobe and modprobe
+ * just returning 0.
+ */
+ mutex_lock(&kmod_dup_mutex);
+ list_del_rcu(&kmod_req->list);
+ synchronize_rcu();
+ mutex_unlock(&kmod_dup_mutex);
+ kfree(kmod_req);
+}
+
+static void kmod_dup_request_complete(struct work_struct *work)
+{
+ struct kmod_dup_req *kmod_req;
+
+ kmod_req = container_of(work, struct kmod_dup_req, complete_work);
+
+ /*
+ * This will ensure that the kernel will let all the waiters get
+ * informed its time to check the return value. It's time to
+ * go home.
+ */
+ complete_all(&kmod_req->first_req_done);
+
+ /*
+ * Now that we have allowed prior request_module() calls to go on
+ * with life, let's schedule deleting this entry. We don't have
+ * to do it right away, but we *eventually* want to do it so to not
+ * let this linger forever as this is just a boot optimization for
+ * possible abuses of vmalloc() incurred by finit_module() thrashing.
+ */
+ queue_delayed_work(system_wq, &kmod_req->delete_work, 60 * HZ);
+}
+
+bool kmod_dup_request_exists_wait(char *module_name, bool wait, int *dup_ret)
+{
+ struct kmod_dup_req *kmod_req, *new_kmod_req;
+ int ret;
+
+ /*
+ * Pre-allocate the entry in case we have to use it later
+ * to avoid contention with the mutex.
+ */
+ new_kmod_req = kzalloc(sizeof(*new_kmod_req), GFP_KERNEL);
+ if (!new_kmod_req)
+ return false;
+
+ memcpy(new_kmod_req->name, module_name, strlen(module_name));
+ INIT_WORK(&new_kmod_req->complete_work, kmod_dup_request_complete);
+ INIT_DELAYED_WORK(&new_kmod_req->delete_work, kmod_dup_request_delete);
+ init_completion(&new_kmod_req->first_req_done);
+
+ mutex_lock(&kmod_dup_mutex);
+
+ kmod_req = kmod_dup_request_lookup(module_name);
+ if (!kmod_req) {
+ /*
+ * If the first request that came through for a module
+ * was with request_module_nowait() we cannot wait for it
+ * and share its return value with other users which may
+ * have used request_module() and need a proper return value
+ * so just skip using them as an anchor.
+ *
+ * If a prior request to this one came through with
+ * request_module() though, then a request_module_nowait()
+ * would benefit from duplicate detection.
+ */
+ if (!wait) {
+ kfree(new_kmod_req);
+ pr_debug("New request_module_nowait() for %s -- cannot track duplicates for this request\n", module_name);
+ mutex_unlock(&kmod_dup_mutex);
+ return false;
+ }
+
+ /*
+ * There was no duplicate, just add the request so we can
+ * keep tab on duplicates later.
+ */
+ pr_debug("New request_module() for %s\n", module_name);
+ list_add_rcu(&new_kmod_req->list, &dup_kmod_reqs);
+ mutex_unlock(&kmod_dup_mutex);
+ return false;
+ }
+ mutex_unlock(&kmod_dup_mutex);
+
+ /* We are dealing with a duplicate request now */
+ kfree(new_kmod_req);
+
+ /*
+ * To fix these try to use try_then_request_module() instead as that
+ * will check if the component you are looking for is present or not.
+ * You could also just queue a single request to load the module once,
+ * instead of having each and everything you need try to request for
+ * the module.
+ *
+ * Duplicate request_module() calls can cause quite a bit of wasted
+ * vmalloc() space when racing with userspace.
+ */
+ if (enable_dups_trace)
+ WARN(1, "module-autoload: duplicate request for module %s\n", module_name);
+ else
+ pr_warn("module-autoload: duplicate request for module %s\n", module_name);
+
+ if (!wait) {
+ /*
+ * If request_module_nowait() was used then the user just
+ * wanted to issue the request and if another module request
+ * was already its way with the same name we don't care for
+ * the return value either. Let duplicate request_module_nowait()
+ * calls bail out right away.
+ */
+ *dup_ret = 0;
+ return true;
+ }
+
+ /*
+ * If a duplicate request_module() was used they *may* care for
+ * the return value, so we have no other option but to wait for
+ * the first caller to complete. If the first caller used
+ * the request_module_nowait() call, subsquent callers will
+ * deal with the comprmise of getting a successful call with this
+ * optimization enabled ...
+ */
+ ret = wait_for_completion_state(&kmod_req->first_req_done,
+ TASK_KILLABLE);
+ if (ret) {
+ *dup_ret = ret;
+ return true;
+ }
+
+ /* Now the duplicate request has the same exact return value as the first request */
+ *dup_ret = kmod_req->dup_ret;
+
+ return true;
+}
+
+void kmod_dup_request_announce(char *module_name, int ret)
+{
+ struct kmod_dup_req *kmod_req;
+
+ mutex_lock(&kmod_dup_mutex);
+
+ kmod_req = kmod_dup_request_lookup(module_name);
+ if (!kmod_req)
+ goto out;
+
+ kmod_req->dup_ret = ret;
+
+ /*
+ * If we complete() here we may allow duplicate threads
+ * to continue before the first one that submitted the
+ * request. We're in no rush also, given that each and
+ * every bounce back to userspace is slow we avoid that
+ * with a slight delay here. So queueue up the completion
+ * and let duplicates suffer, just wait a tad bit longer.
+ * There is no rush. But we also don't want to hold the
+ * caller up forever or introduce any boot delays.
+ */
+ queue_work(system_wq, &kmod_req->complete_work);
+
+out:
+ mutex_unlock(&kmod_dup_mutex);
+}
diff --git a/kernel/module/internal.h b/kernel/module/internal.h
new file mode 100644
index 000000000000..618202578b42
--- /dev/null
+++ b/kernel/module/internal.h
@@ -0,0 +1,425 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* Module internals
+ *
+ * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org>
+ */
+
+#include <linux/elf.h>
+#include <linux/compiler.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/mm.h>
+
+#ifndef ARCH_SHF_SMALL
+#define ARCH_SHF_SMALL 0
+#endif
+
+/*
+ * Use highest 4 bits of sh_entsize to store the mod_mem_type of this
+ * section. This leaves 28 bits for offset on 32-bit systems, which is
+ * about 256 MiB (WARN_ON_ONCE if we exceed that).
+ */
+
+#define SH_ENTSIZE_TYPE_BITS 4
+#define SH_ENTSIZE_TYPE_SHIFT (BITS_PER_LONG - SH_ENTSIZE_TYPE_BITS)
+#define SH_ENTSIZE_TYPE_MASK ((1UL << SH_ENTSIZE_TYPE_BITS) - 1)
+#define SH_ENTSIZE_OFFSET_MASK ((1UL << (BITS_PER_LONG - SH_ENTSIZE_TYPE_BITS)) - 1)
+
+/* Maximum number of characters written by module_flags() */
+#define MODULE_FLAGS_BUF_SIZE (TAINT_FLAGS_COUNT + 4)
+
+struct kernel_symbol {
+#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
+ int value_offset;
+ int name_offset;
+ int namespace_offset;
+#else
+ unsigned long value;
+ const char *name;
+ const char *namespace;
+#endif
+};
+
+extern struct mutex module_mutex;
+extern struct list_head modules;
+
+extern const struct module_attribute *const modinfo_attrs[];
+extern const size_t modinfo_attrs_count;
+
+/* Provided by the linker */
+extern const struct kernel_symbol __start___ksymtab[];
+extern const struct kernel_symbol __stop___ksymtab[];
+extern const struct kernel_symbol __start___ksymtab_gpl[];
+extern const struct kernel_symbol __stop___ksymtab_gpl[];
+extern const u32 __start___kcrctab[];
+extern const u32 __start___kcrctab_gpl[];
+
+#define KMOD_PATH_LEN 256
+extern char modprobe_path[];
+
+struct load_info {
+ const char *name;
+ /* pointer to module in temporary copy, freed at end of load_module() */
+ struct module *mod;
+ Elf_Ehdr *hdr;
+ unsigned long len;
+ Elf_Shdr *sechdrs;
+ char *secstrings, *strtab;
+ unsigned long symoffs, stroffs, init_typeoffs, core_typeoffs;
+ bool sig_ok;
+#ifdef CONFIG_KALLSYMS
+ unsigned long mod_kallsyms_init_off;
+#endif
+#ifdef CONFIG_MODULE_DECOMPRESS
+#ifdef CONFIG_MODULE_STATS
+ unsigned long compressed_len;
+#endif
+ struct page **pages;
+ unsigned int max_pages;
+ unsigned int used_pages;
+#endif
+ struct {
+ unsigned int sym;
+ unsigned int str;
+ unsigned int mod;
+ unsigned int vers;
+ unsigned int info;
+ unsigned int pcpu;
+ unsigned int vers_ext_crc;
+ unsigned int vers_ext_name;
+ } index;
+};
+
+enum mod_license {
+ NOT_GPL_ONLY,
+ GPL_ONLY,
+};
+
+struct find_symbol_arg {
+ /* Input */
+ const char *name;
+ bool gplok;
+ bool warn;
+
+ /* Output */
+ struct module *owner;
+ const u32 *crc;
+ const struct kernel_symbol *sym;
+ enum mod_license license;
+};
+
+/* modules using other modules */
+struct module_use {
+ struct list_head source_list;
+ struct list_head target_list;
+ struct module *source, *target;
+};
+
+int mod_verify_sig(const void *mod, struct load_info *info);
+int try_to_force_load(struct module *mod, const char *reason);
+bool find_symbol(struct find_symbol_arg *fsa);
+struct module *find_module_all(const char *name, size_t len, bool even_unformed);
+int cmp_name(const void *name, const void *sym);
+long module_get_offset_and_type(struct module *mod, enum mod_mem_type type,
+ Elf_Shdr *sechdr, unsigned int section);
+char *module_flags(struct module *mod, char *buf, bool show_state);
+size_t module_flags_taint(unsigned long taints, char *buf);
+
+char *module_next_tag_pair(char *string, unsigned long *secsize);
+
+#define for_each_modinfo_entry(entry, info, name) \
+ for (entry = get_modinfo(info, name); entry; entry = get_next_modinfo(info, name, entry))
+
+static inline unsigned long kernel_symbol_value(const struct kernel_symbol *sym)
+{
+#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
+ return (unsigned long)offset_to_ptr(&sym->value_offset);
+#else
+ return sym->value;
+#endif
+}
+
+#ifdef CONFIG_LIVEPATCH
+int copy_module_elf(struct module *mod, struct load_info *info);
+void free_module_elf(struct module *mod);
+#else /* !CONFIG_LIVEPATCH */
+static inline int copy_module_elf(struct module *mod, struct load_info *info)
+{
+ return 0;
+}
+
+static inline void free_module_elf(struct module *mod) { }
+#endif /* CONFIG_LIVEPATCH */
+
+static inline bool set_livepatch_module(struct module *mod)
+{
+#ifdef CONFIG_LIVEPATCH
+ mod->klp = true;
+ return true;
+#else
+ return false;
+#endif
+}
+
+/**
+ * enum fail_dup_mod_reason - state at which a duplicate module was detected
+ *
+ * @FAIL_DUP_MOD_BECOMING: the module is read properly, passes all checks but
+ * we've determined that another module with the same name is already loaded
+ * or being processed on our &modules list. This happens on early_mod_check()
+ * right before layout_and_allocate(). The kernel would have already
+ * vmalloc()'d space for the entire module through finit_module(). If
+ * decompression was used two vmap() spaces were used. These failures can
+ * happen when userspace has not seen the module present on the kernel and
+ * tries to load the module multiple times at same time.
+ * @FAIL_DUP_MOD_LOAD: the module has been read properly, passes all validation
+ * checks and the kernel determines that the module was unique and because
+ * of this allocated yet another private kernel copy of the module space in
+ * layout_and_allocate() but after this determined in add_unformed_module()
+ * that another module with the same name is already loaded or being processed.
+ * These failures should be mitigated as much as possible and are indicative
+ * of really fast races in loading modules. Without module decompression
+ * they waste twice as much vmap space. With module decompression three
+ * times the module's size vmap space is wasted.
+ */
+enum fail_dup_mod_reason {
+ FAIL_DUP_MOD_BECOMING = 0,
+ FAIL_DUP_MOD_LOAD,
+};
+
+#ifdef CONFIG_MODULE_DEBUGFS
+extern struct dentry *mod_debugfs_root;
+#endif
+
+#ifdef CONFIG_MODULE_STATS
+
+#define mod_stat_add_long(count, var) atomic_long_add(count, var)
+#define mod_stat_inc(name) atomic_inc(name)
+
+extern atomic_long_t total_mod_size;
+extern atomic_long_t total_text_size;
+extern atomic_long_t invalid_kread_bytes;
+extern atomic_long_t invalid_decompress_bytes;
+
+extern atomic_t modcount;
+extern atomic_t failed_kreads;
+extern atomic_t failed_decompress;
+struct mod_fail_load {
+ struct list_head list;
+ char name[MODULE_NAME_LEN];
+ atomic_long_t count;
+ unsigned long dup_fail_mask;
+};
+
+int try_add_failed_module(const char *name, enum fail_dup_mod_reason reason);
+void mod_stat_bump_invalid(struct load_info *info, int flags);
+void mod_stat_bump_becoming(struct load_info *info, int flags);
+
+#else
+
+#define mod_stat_add_long(name, var)
+#define mod_stat_inc(name)
+
+static inline int try_add_failed_module(const char *name,
+ enum fail_dup_mod_reason reason)
+{
+ return 0;
+}
+
+static inline void mod_stat_bump_invalid(struct load_info *info, int flags)
+{
+}
+
+static inline void mod_stat_bump_becoming(struct load_info *info, int flags)
+{
+}
+
+#endif /* CONFIG_MODULE_STATS */
+
+#ifdef CONFIG_MODULE_DEBUG_AUTOLOAD_DUPS
+bool kmod_dup_request_exists_wait(char *module_name, bool wait, int *dup_ret);
+void kmod_dup_request_announce(char *module_name, int ret);
+#else
+static inline bool kmod_dup_request_exists_wait(char *module_name, bool wait, int *dup_ret)
+{
+ return false;
+}
+
+static inline void kmod_dup_request_announce(char *module_name, int ret)
+{
+}
+#endif
+
+#ifdef CONFIG_MODULE_UNLOAD_TAINT_TRACKING
+struct mod_unload_taint {
+ struct list_head list;
+ char name[MODULE_NAME_LEN];
+ unsigned long taints;
+ u64 count;
+};
+
+int try_add_tainted_module(struct module *mod);
+void print_unloaded_tainted_modules(void);
+#else /* !CONFIG_MODULE_UNLOAD_TAINT_TRACKING */
+static inline int try_add_tainted_module(struct module *mod)
+{
+ return 0;
+}
+
+static inline void print_unloaded_tainted_modules(void)
+{
+}
+#endif /* CONFIG_MODULE_UNLOAD_TAINT_TRACKING */
+
+#ifdef CONFIG_MODULE_DECOMPRESS
+int module_decompress(struct load_info *info, const void *buf, size_t size);
+void module_decompress_cleanup(struct load_info *info);
+#else
+static inline int module_decompress(struct load_info *info,
+ const void *buf, size_t size)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void module_decompress_cleanup(struct load_info *info)
+{
+}
+#endif
+
+struct mod_tree_root {
+#ifdef CONFIG_MODULES_TREE_LOOKUP
+ struct latch_tree_root root;
+#endif
+ unsigned long addr_min;
+ unsigned long addr_max;
+#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
+ unsigned long data_addr_min;
+ unsigned long data_addr_max;
+#endif
+};
+
+extern struct mod_tree_root mod_tree;
+
+#ifdef CONFIG_MODULES_TREE_LOOKUP
+void mod_tree_insert(struct module *mod);
+void mod_tree_remove_init(struct module *mod);
+void mod_tree_remove(struct module *mod);
+struct module *mod_find(unsigned long addr, struct mod_tree_root *tree);
+#else /* !CONFIG_MODULES_TREE_LOOKUP */
+
+static inline void mod_tree_insert(struct module *mod) { }
+static inline void mod_tree_remove_init(struct module *mod) { }
+static inline void mod_tree_remove(struct module *mod) { }
+static inline struct module *mod_find(unsigned long addr, struct mod_tree_root *tree)
+{
+ struct module *mod;
+
+ list_for_each_entry_rcu(mod, &modules, list,
+ lockdep_is_held(&module_mutex)) {
+ if (within_module(addr, mod))
+ return mod;
+ }
+
+ return NULL;
+}
+#endif /* CONFIG_MODULES_TREE_LOOKUP */
+
+int module_enable_rodata_ro(const struct module *mod);
+int module_enable_rodata_ro_after_init(const struct module *mod);
+int module_enable_data_nx(const struct module *mod);
+int module_enable_text_rox(const struct module *mod);
+int module_enforce_rwx_sections(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs,
+ const char *secstrings,
+ const struct module *mod);
+void module_mark_ro_after_init(const Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
+ const char *secstrings);
+
+#ifdef CONFIG_MODULE_SIG
+int module_sig_check(struct load_info *info, int flags);
+#else /* !CONFIG_MODULE_SIG */
+static inline int module_sig_check(struct load_info *info, int flags)
+{
+ return 0;
+}
+#endif /* !CONFIG_MODULE_SIG */
+
+#ifdef CONFIG_DEBUG_KMEMLEAK
+void kmemleak_load_module(const struct module *mod, const struct load_info *info);
+#else /* !CONFIG_DEBUG_KMEMLEAK */
+static inline void kmemleak_load_module(const struct module *mod,
+ const struct load_info *info) { }
+#endif /* CONFIG_DEBUG_KMEMLEAK */
+
+#ifdef CONFIG_KALLSYMS
+void init_build_id(struct module *mod, const struct load_info *info);
+void layout_symtab(struct module *mod, struct load_info *info);
+void add_kallsyms(struct module *mod, const struct load_info *info);
+
+static inline bool sect_empty(const Elf_Shdr *sect)
+{
+ return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0;
+}
+#else /* !CONFIG_KALLSYMS */
+static inline void init_build_id(struct module *mod, const struct load_info *info) { }
+static inline void layout_symtab(struct module *mod, struct load_info *info) { }
+static inline void add_kallsyms(struct module *mod, const struct load_info *info) { }
+#endif /* CONFIG_KALLSYMS */
+
+#ifdef CONFIG_SYSFS
+int mod_sysfs_setup(struct module *mod, const struct load_info *info,
+ struct kernel_param *kparam, unsigned int num_params);
+void mod_sysfs_teardown(struct module *mod);
+void init_param_lock(struct module *mod);
+#else /* !CONFIG_SYSFS */
+static inline int mod_sysfs_setup(struct module *mod,
+ const struct load_info *info,
+ struct kernel_param *kparam,
+ unsigned int num_params)
+{
+ return 0;
+}
+
+static inline void mod_sysfs_teardown(struct module *mod) { }
+static inline void init_param_lock(struct module *mod) { }
+#endif /* CONFIG_SYSFS */
+
+#ifdef CONFIG_MODVERSIONS
+int check_version(const struct load_info *info,
+ const char *symname, struct module *mod, const u32 *crc);
+void module_layout(struct module *mod, struct modversion_info *ver, struct kernel_param *kp,
+ struct kernel_symbol *ks, struct tracepoint * const *tp);
+int check_modstruct_version(const struct load_info *info, struct module *mod);
+int same_magic(const char *amagic, const char *bmagic, bool has_crcs);
+struct modversion_info_ext {
+ size_t remaining;
+ const u32 *crc;
+ const char *name;
+};
+void modversion_ext_start(const struct load_info *info, struct modversion_info_ext *ver);
+void modversion_ext_advance(struct modversion_info_ext *ver);
+#define for_each_modversion_info_ext(ver, info) \
+ for (modversion_ext_start(info, &ver); ver.remaining > 0; modversion_ext_advance(&ver))
+#else /* !CONFIG_MODVERSIONS */
+static inline int check_version(const struct load_info *info,
+ const char *symname,
+ struct module *mod,
+ const u32 *crc)
+{
+ return 1;
+}
+
+static inline int check_modstruct_version(const struct load_info *info,
+ struct module *mod)
+{
+ return 1;
+}
+
+static inline int same_magic(const char *amagic, const char *bmagic, bool has_crcs)
+{
+ return strcmp(amagic, bmagic) == 0;
+}
+#endif /* CONFIG_MODVERSIONS */
diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c
new file mode 100644
index 000000000000..00a60796327c
--- /dev/null
+++ b/kernel/module/kallsyms.c
@@ -0,0 +1,501 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Module kallsyms support
+ *
+ * Copyright (C) 2010 Rusty Russell
+ */
+
+#include <linux/module.h>
+#include <linux/module_symbol.h>
+#include <linux/kallsyms.h>
+#include <linux/buildid.h>
+#include <linux/bsearch.h>
+#include "internal.h"
+
+/* Lookup exported symbol in given range of kernel_symbols */
+static const struct kernel_symbol *lookup_exported_symbol(const char *name,
+ const struct kernel_symbol *start,
+ const struct kernel_symbol *stop)
+{
+ return bsearch(name, start, stop - start,
+ sizeof(struct kernel_symbol), cmp_name);
+}
+
+static int is_exported(const char *name, unsigned long value,
+ const struct module *mod)
+{
+ const struct kernel_symbol *ks;
+
+ if (!mod)
+ ks = lookup_exported_symbol(name, __start___ksymtab, __stop___ksymtab);
+ else
+ ks = lookup_exported_symbol(name, mod->syms, mod->syms + mod->num_syms);
+
+ return ks && kernel_symbol_value(ks) == value;
+}
+
+/* As per nm */
+static char elf_type(const Elf_Sym *sym, const struct load_info *info)
+{
+ const Elf_Shdr *sechdrs = info->sechdrs;
+
+ if (ELF_ST_BIND(sym->st_info) == STB_WEAK) {
+ if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT)
+ return 'v';
+ else
+ return 'w';
+ }
+ if (sym->st_shndx == SHN_UNDEF)
+ return 'U';
+ if (sym->st_shndx == SHN_ABS || sym->st_shndx == info->index.pcpu)
+ return 'a';
+ if (sym->st_shndx >= SHN_LORESERVE)
+ return '?';
+ if (sechdrs[sym->st_shndx].sh_flags & SHF_EXECINSTR)
+ return 't';
+ if (sechdrs[sym->st_shndx].sh_flags & SHF_ALLOC &&
+ sechdrs[sym->st_shndx].sh_type != SHT_NOBITS) {
+ if (!(sechdrs[sym->st_shndx].sh_flags & SHF_WRITE))
+ return 'r';
+ else if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL)
+ return 'g';
+ else
+ return 'd';
+ }
+ if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) {
+ if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL)
+ return 's';
+ else
+ return 'b';
+ }
+ if (strstarts(info->secstrings + sechdrs[sym->st_shndx].sh_name,
+ ".debug")) {
+ return 'n';
+ }
+ return '?';
+}
+
+static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
+ unsigned int shnum, unsigned int pcpundx)
+{
+ const Elf_Shdr *sec;
+ enum mod_mem_type type;
+
+ if (src->st_shndx == SHN_UNDEF ||
+ src->st_shndx >= shnum ||
+ !src->st_name)
+ return false;
+
+#ifdef CONFIG_KALLSYMS_ALL
+ if (src->st_shndx == pcpundx)
+ return true;
+#endif
+
+ sec = sechdrs + src->st_shndx;
+ type = sec->sh_entsize >> SH_ENTSIZE_TYPE_SHIFT;
+ if (!(sec->sh_flags & SHF_ALLOC)
+#ifndef CONFIG_KALLSYMS_ALL
+ || !(sec->sh_flags & SHF_EXECINSTR)
+#endif
+ || mod_mem_type_is_init(type))
+ return false;
+
+ return true;
+}
+
+/*
+ * We only allocate and copy the strings needed by the parts of symtab
+ * we keep. This is simple, but has the effect of making multiple
+ * copies of duplicates. We could be more sophisticated, see
+ * linux-kernel thread starting with
+ * <73defb5e4bca04a6431392cc341112b1@localhost>.
+ */
+void layout_symtab(struct module *mod, struct load_info *info)
+{
+ Elf_Shdr *symsect = info->sechdrs + info->index.sym;
+ Elf_Shdr *strsect = info->sechdrs + info->index.str;
+ const Elf_Sym *src;
+ unsigned int i, nsrc, ndst, strtab_size = 0;
+ struct module_memory *mod_mem_data = &mod->mem[MOD_DATA];
+ struct module_memory *mod_mem_init_data = &mod->mem[MOD_INIT_DATA];
+
+ /* Put symbol section at end of init part of module. */
+ symsect->sh_flags |= SHF_ALLOC;
+ symsect->sh_entsize = module_get_offset_and_type(mod, MOD_INIT_DATA,
+ symsect, info->index.sym);
+ pr_debug("\t%s\n", info->secstrings + symsect->sh_name);
+
+ src = (void *)info->hdr + symsect->sh_offset;
+ nsrc = symsect->sh_size / sizeof(*src);
+
+ /* Compute total space required for the core symbols' strtab. */
+ for (ndst = i = 0; i < nsrc; i++) {
+ if (i == 0 || is_livepatch_module(mod) ||
+ is_core_symbol(src + i, info->sechdrs, info->hdr->e_shnum,
+ info->index.pcpu)) {
+ strtab_size += strlen(&info->strtab[src[i].st_name]) + 1;
+ ndst++;
+ }
+ }
+
+ /* Append room for core symbols at end of core part. */
+ info->symoffs = ALIGN(mod_mem_data->size, symsect->sh_addralign ?: 1);
+ info->stroffs = mod_mem_data->size = info->symoffs + ndst * sizeof(Elf_Sym);
+ mod_mem_data->size += strtab_size;
+ /* Note add_kallsyms() computes strtab_size as core_typeoffs - stroffs */
+ info->core_typeoffs = mod_mem_data->size;
+ mod_mem_data->size += ndst * sizeof(char);
+
+ /* Put string table section at end of init part of module. */
+ strsect->sh_flags |= SHF_ALLOC;
+ strsect->sh_entsize = module_get_offset_and_type(mod, MOD_INIT_DATA,
+ strsect, info->index.str);
+ pr_debug("\t%s\n", info->secstrings + strsect->sh_name);
+
+ /* We'll tack temporary mod_kallsyms on the end. */
+ mod_mem_init_data->size = ALIGN(mod_mem_init_data->size,
+ __alignof__(struct mod_kallsyms));
+ info->mod_kallsyms_init_off = mod_mem_init_data->size;
+
+ mod_mem_init_data->size += sizeof(struct mod_kallsyms);
+ info->init_typeoffs = mod_mem_init_data->size;
+ mod_mem_init_data->size += nsrc * sizeof(char);
+}
+
+/*
+ * We use the full symtab and strtab which layout_symtab arranged to
+ * be appended to the init section. Later we switch to the cut-down
+ * core-only ones.
+ */
+void add_kallsyms(struct module *mod, const struct load_info *info)
+{
+ unsigned int i, ndst;
+ const Elf_Sym *src;
+ Elf_Sym *dst;
+ char *s;
+ Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
+ unsigned long strtab_size;
+ void *data_base = mod->mem[MOD_DATA].base;
+ void *init_data_base = mod->mem[MOD_INIT_DATA].base;
+ struct mod_kallsyms *kallsyms;
+
+ kallsyms = init_data_base + info->mod_kallsyms_init_off;
+
+ kallsyms->symtab = (void *)symsec->sh_addr;
+ kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym);
+ /* Make sure we get permanent strtab: don't use info->strtab. */
+ kallsyms->strtab = (void *)info->sechdrs[info->index.str].sh_addr;
+ kallsyms->typetab = init_data_base + info->init_typeoffs;
+
+ /*
+ * Now populate the cut down core kallsyms for after init
+ * and set types up while we still have access to sections.
+ */
+ mod->core_kallsyms.symtab = dst = data_base + info->symoffs;
+ mod->core_kallsyms.strtab = s = data_base + info->stroffs;
+ mod->core_kallsyms.typetab = data_base + info->core_typeoffs;
+ strtab_size = info->core_typeoffs - info->stroffs;
+ src = kallsyms->symtab;
+ for (ndst = i = 0; i < kallsyms->num_symtab; i++) {
+ kallsyms->typetab[i] = elf_type(src + i, info);
+ if (i == 0 || is_livepatch_module(mod) ||
+ is_core_symbol(src + i, info->sechdrs, info->hdr->e_shnum,
+ info->index.pcpu)) {
+ ssize_t ret;
+
+ mod->core_kallsyms.typetab[ndst] =
+ kallsyms->typetab[i];
+ dst[ndst] = src[i];
+ dst[ndst++].st_name = s - mod->core_kallsyms.strtab;
+ ret = strscpy(s, &kallsyms->strtab[src[i].st_name],
+ strtab_size);
+ if (ret < 0)
+ break;
+ s += ret + 1;
+ strtab_size -= ret + 1;
+ }
+ }
+
+ /* Set up to point into init section. */
+ rcu_assign_pointer(mod->kallsyms, kallsyms);
+ mod->core_kallsyms.num_symtab = ndst;
+}
+
+#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID)
+void init_build_id(struct module *mod, const struct load_info *info)
+{
+ const Elf_Shdr *sechdr;
+ unsigned int i;
+
+ for (i = 0; i < info->hdr->e_shnum; i++) {
+ sechdr = &info->sechdrs[i];
+ if (!sect_empty(sechdr) && sechdr->sh_type == SHT_NOTE &&
+ !build_id_parse_buf((void *)sechdr->sh_addr, mod->build_id,
+ sechdr->sh_size))
+ break;
+ }
+}
+#else
+void init_build_id(struct module *mod, const struct load_info *info)
+{
+}
+#endif
+
+static const char *kallsyms_symbol_name(struct mod_kallsyms *kallsyms, unsigned int symnum)
+{
+ return kallsyms->strtab + kallsyms->symtab[symnum].st_name;
+}
+
+/*
+ * Given a module and address, find the corresponding symbol and return its name
+ * while providing its size and offset if needed.
+ */
+static const char *find_kallsyms_symbol(struct module *mod,
+ unsigned long addr,
+ unsigned long *size,
+ unsigned long *offset)
+{
+ unsigned int i, best = 0;
+ unsigned long nextval, bestval;
+ struct mod_kallsyms *kallsyms = rcu_dereference(mod->kallsyms);
+ struct module_memory *mod_mem;
+
+ /* At worse, next value is at end of module */
+ if (within_module_init(addr, mod))
+ mod_mem = &mod->mem[MOD_INIT_TEXT];
+ else
+ mod_mem = &mod->mem[MOD_TEXT];
+
+ nextval = (unsigned long)mod_mem->base + mod_mem->size;
+
+ bestval = kallsyms_symbol_value(&kallsyms->symtab[best]);
+
+ /*
+ * Scan for closest preceding symbol, and next symbol. (ELF
+ * starts real symbols at 1).
+ */
+ for (i = 1; i < kallsyms->num_symtab; i++) {
+ const Elf_Sym *sym = &kallsyms->symtab[i];
+ unsigned long thisval = kallsyms_symbol_value(sym);
+
+ if (sym->st_shndx == SHN_UNDEF)
+ continue;
+
+ /*
+ * We ignore unnamed symbols: they're uninformative
+ * and inserted at a whim.
+ */
+ if (*kallsyms_symbol_name(kallsyms, i) == '\0' ||
+ is_mapping_symbol(kallsyms_symbol_name(kallsyms, i)))
+ continue;
+
+ if (thisval <= addr && thisval > bestval) {
+ best = i;
+ bestval = thisval;
+ }
+ if (thisval > addr && thisval < nextval)
+ nextval = thisval;
+ }
+
+ if (!best)
+ return NULL;
+
+ if (size)
+ *size = nextval - bestval;
+ if (offset)
+ *offset = addr - bestval;
+
+ return kallsyms_symbol_name(kallsyms, best);
+}
+
+void * __weak dereference_module_function_descriptor(struct module *mod,
+ void *ptr)
+{
+ return ptr;
+}
+
+/*
+ * For kallsyms to ask for address resolution. NULL means not found. Careful
+ * not to lock to avoid deadlock on oopses, RCU is enough.
+ */
+int module_address_lookup(unsigned long addr,
+ unsigned long *size,
+ unsigned long *offset,
+ char **modname,
+ const unsigned char **modbuildid,
+ char *namebuf)
+{
+ const char *sym;
+ int ret = 0;
+ struct module *mod;
+
+ guard(rcu)();
+ mod = __module_address(addr);
+ if (mod) {
+ if (modname)
+ *modname = mod->name;
+ if (modbuildid) {
+#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID)
+ *modbuildid = mod->build_id;
+#else
+ *modbuildid = NULL;
+#endif
+ }
+
+ sym = find_kallsyms_symbol(mod, addr, size, offset);
+
+ if (sym)
+ ret = strscpy(namebuf, sym, KSYM_NAME_LEN);
+ }
+ return ret;
+}
+
+int lookup_module_symbol_name(unsigned long addr, char *symname)
+{
+ struct module *mod;
+
+ guard(rcu)();
+ list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+ if (within_module(addr, mod)) {
+ const char *sym;
+
+ sym = find_kallsyms_symbol(mod, addr, NULL, NULL);
+ if (!sym)
+ goto out;
+
+ strscpy(symname, sym, KSYM_NAME_LEN);
+ return 0;
+ }
+ }
+out:
+ return -ERANGE;
+}
+
+int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
+ char *name, char *module_name, int *exported)
+{
+ struct module *mod;
+
+ guard(rcu)();
+ list_for_each_entry_rcu(mod, &modules, list) {
+ struct mod_kallsyms *kallsyms;
+
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+ kallsyms = rcu_dereference(mod->kallsyms);
+ if (symnum < kallsyms->num_symtab) {
+ const Elf_Sym *sym = &kallsyms->symtab[symnum];
+
+ *value = kallsyms_symbol_value(sym);
+ *type = kallsyms->typetab[symnum];
+ strscpy(name, kallsyms_symbol_name(kallsyms, symnum), KSYM_NAME_LEN);
+ strscpy(module_name, mod->name, MODULE_NAME_LEN);
+ *exported = is_exported(name, *value, mod);
+ return 0;
+ }
+ symnum -= kallsyms->num_symtab;
+ }
+ return -ERANGE;
+}
+
+/* Given a module and name of symbol, find and return the symbol's value */
+static unsigned long __find_kallsyms_symbol_value(struct module *mod, const char *name)
+{
+ unsigned int i;
+ struct mod_kallsyms *kallsyms = rcu_dereference(mod->kallsyms);
+
+ for (i = 0; i < kallsyms->num_symtab; i++) {
+ const Elf_Sym *sym = &kallsyms->symtab[i];
+
+ if (strcmp(name, kallsyms_symbol_name(kallsyms, i)) == 0 &&
+ sym->st_shndx != SHN_UNDEF)
+ return kallsyms_symbol_value(sym);
+ }
+ return 0;
+}
+
+static unsigned long __module_kallsyms_lookup_name(const char *name)
+{
+ struct module *mod;
+ char *colon;
+
+ colon = strnchr(name, MODULE_NAME_LEN, ':');
+ if (colon) {
+ mod = find_module_all(name, colon - name, false);
+ if (mod)
+ return __find_kallsyms_symbol_value(mod, colon + 1);
+ return 0;
+ }
+
+ list_for_each_entry_rcu(mod, &modules, list) {
+ unsigned long ret;
+
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+ ret = __find_kallsyms_symbol_value(mod, name);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+/* Look for this name: can be of form module:name. */
+unsigned long module_kallsyms_lookup_name(const char *name)
+{
+ /* Don't lock: we're in enough trouble already. */
+ guard(rcu)();
+ return __module_kallsyms_lookup_name(name);
+}
+
+unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name)
+{
+ guard(rcu)();
+ return __find_kallsyms_symbol_value(mod, name);
+}
+
+int module_kallsyms_on_each_symbol(const char *modname,
+ int (*fn)(void *, const char *, unsigned long),
+ void *data)
+{
+ struct module *mod;
+ unsigned int i;
+ int ret = 0;
+
+ mutex_lock(&module_mutex);
+ list_for_each_entry(mod, &modules, list) {
+ struct mod_kallsyms *kallsyms;
+
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+
+ if (modname && strcmp(modname, mod->name))
+ continue;
+
+ kallsyms = rcu_dereference_check(mod->kallsyms,
+ lockdep_is_held(&module_mutex));
+
+ for (i = 0; i < kallsyms->num_symtab; i++) {
+ const Elf_Sym *sym = &kallsyms->symtab[i];
+
+ if (sym->st_shndx == SHN_UNDEF)
+ continue;
+
+ ret = fn(data, kallsyms_symbol_name(kallsyms, i),
+ kallsyms_symbol_value(sym));
+ if (ret != 0)
+ goto out;
+ }
+
+ /*
+ * The given module is found, the subsequent modules do not
+ * need to be compared.
+ */
+ if (modname)
+ break;
+ }
+out:
+ mutex_unlock(&module_mutex);
+ return ret;
+}
diff --git a/kernel/module/kdb.c b/kernel/module/kdb.c
new file mode 100644
index 000000000000..995c32d3698f
--- /dev/null
+++ b/kernel/module/kdb.c
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Module kdb support
+ *
+ * Copyright (C) 2010 Jason Wessel
+ */
+
+#include <linux/module.h>
+#include <linux/kdb.h>
+#include "internal.h"
+
+/*
+ * kdb_lsmod - This function implements the 'lsmod' command. Lists
+ * currently loaded kernel modules.
+ * Mostly taken from userland lsmod.
+ */
+int kdb_lsmod(int argc, const char **argv)
+{
+ struct module *mod;
+
+ if (argc != 0)
+ return KDB_ARGCOUNT;
+
+ kdb_printf("Module Size modstruct Used by\n");
+ list_for_each_entry(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+
+ kdb_printf("%-20s%8u", mod->name, mod->mem[MOD_TEXT].size);
+ kdb_printf("/%8u", mod->mem[MOD_RODATA].size);
+ kdb_printf("/%8u", mod->mem[MOD_RO_AFTER_INIT].size);
+ kdb_printf("/%8u", mod->mem[MOD_DATA].size);
+
+ kdb_printf(" 0x%px ", (void *)mod);
+#ifdef CONFIG_MODULE_UNLOAD
+ kdb_printf("%4d ", module_refcount(mod));
+#endif
+ if (mod->state == MODULE_STATE_GOING)
+ kdb_printf(" (Unloading)");
+ else if (mod->state == MODULE_STATE_COMING)
+ kdb_printf(" (Loading)");
+ else
+ kdb_printf(" (Live)");
+ kdb_printf(" 0x%px", mod->mem[MOD_TEXT].base);
+ kdb_printf("/0x%px", mod->mem[MOD_RODATA].base);
+ kdb_printf("/0x%px", mod->mem[MOD_RO_AFTER_INIT].base);
+ kdb_printf("/0x%px", mod->mem[MOD_DATA].base);
+
+#ifdef CONFIG_MODULE_UNLOAD
+ {
+ struct module_use *use;
+
+ kdb_printf(" [ ");
+ list_for_each_entry(use, &mod->source_list,
+ source_list)
+ kdb_printf("%s ", use->target->name);
+ kdb_printf("]\n");
+ }
+#endif
+ }
+
+ return 0;
+}
diff --git a/kernel/module/kmod.c b/kernel/module/kmod.c
new file mode 100644
index 000000000000..25f253812512
--- /dev/null
+++ b/kernel/module/kmod.c
@@ -0,0 +1,179 @@
+/*
+ * kmod - the kernel module loader
+ *
+ * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org>
+ */
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/sched/task.h>
+#include <linux/binfmts.h>
+#include <linux/syscalls.h>
+#include <linux/unistd.h>
+#include <linux/kmod.h>
+#include <linux/slab.h>
+#include <linux/completion.h>
+#include <linux/cred.h>
+#include <linux/file.h>
+#include <linux/workqueue.h>
+#include <linux/security.h>
+#include <linux/mount.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/resource.h>
+#include <linux/notifier.h>
+#include <linux/suspend.h>
+#include <linux/rwsem.h>
+#include <linux/ptrace.h>
+#include <linux/async.h>
+#include <linux/uaccess.h>
+
+#include <trace/events/module.h>
+#include "internal.h"
+
+/*
+ * Assuming:
+ *
+ * threads = div64_u64((u64) totalram_pages * (u64) PAGE_SIZE,
+ * (u64) THREAD_SIZE * 8UL);
+ *
+ * If you need less than 50 threads would mean we're dealing with systems
+ * smaller than 3200 pages. This assumes you are capable of having ~13M memory,
+ * and this would only be an upper limit, after which the OOM killer would take
+ * effect. Systems like these are very unlikely if modules are enabled.
+ */
+#define MAX_KMOD_CONCURRENT 50
+static DEFINE_SEMAPHORE(kmod_concurrent_max, MAX_KMOD_CONCURRENT);
+
+/*
+ * This is a restriction on having *all* MAX_KMOD_CONCURRENT threads
+ * running at the same time without returning. When this happens we
+ * believe you've somehow ended up with a recursive module dependency
+ * creating a loop.
+ *
+ * We have no option but to fail.
+ *
+ * Userspace should proactively try to detect and prevent these.
+ */
+#define MAX_KMOD_ALL_BUSY_TIMEOUT 5
+
+/*
+ modprobe_path is set via /proc/sys.
+*/
+char modprobe_path[KMOD_PATH_LEN] = CONFIG_MODPROBE_PATH;
+
+static void free_modprobe_argv(struct subprocess_info *info)
+{
+ kfree(info->argv[3]); /* check call_modprobe() */
+ kfree(info->argv);
+}
+
+static int call_modprobe(char *orig_module_name, int wait)
+{
+ struct subprocess_info *info;
+ static char *envp[] = {
+ "HOME=/",
+ "TERM=linux",
+ "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
+ NULL
+ };
+ char *module_name;
+ int ret;
+
+ char **argv = kmalloc(sizeof(char *[5]), GFP_KERNEL);
+ if (!argv)
+ goto out;
+
+ module_name = kstrdup(orig_module_name, GFP_KERNEL);
+ if (!module_name)
+ goto free_argv;
+
+ argv[0] = modprobe_path;
+ argv[1] = "-q";
+ argv[2] = "--";
+ argv[3] = module_name; /* check free_modprobe_argv() */
+ argv[4] = NULL;
+
+ info = call_usermodehelper_setup(modprobe_path, argv, envp, GFP_KERNEL,
+ NULL, free_modprobe_argv, NULL);
+ if (!info)
+ goto free_module_name;
+
+ ret = call_usermodehelper_exec(info, wait | UMH_KILLABLE);
+ kmod_dup_request_announce(orig_module_name, ret);
+ return ret;
+
+free_module_name:
+ kfree(module_name);
+free_argv:
+ kfree(argv);
+out:
+ kmod_dup_request_announce(orig_module_name, -ENOMEM);
+ return -ENOMEM;
+}
+
+/**
+ * __request_module - try to load a kernel module
+ * @wait: wait (or not) for the operation to complete
+ * @fmt: printf style format string for the name of the module
+ * @...: arguments as specified in the format string
+ *
+ * Load a module using the user mode module loader. The function returns
+ * zero on success or a negative errno code or positive exit code from
+ * "modprobe" on failure. Note that a successful module load does not mean
+ * the module did not then unload and exit on an error of its own. Callers
+ * must check that the service they requested is now available not blindly
+ * invoke it.
+ *
+ * If module auto-loading support is disabled then this function
+ * simply returns -ENOENT.
+ */
+int __request_module(bool wait, const char *fmt, ...)
+{
+ va_list args;
+ char module_name[MODULE_NAME_LEN];
+ int ret, dup_ret;
+
+ /*
+ * We don't allow synchronous module loading from async. Module
+ * init may invoke async_synchronize_full() which will end up
+ * waiting for this task which already is waiting for the module
+ * loading to complete, leading to a deadlock.
+ */
+ WARN_ON_ONCE(wait && current_is_async());
+
+ if (!modprobe_path[0])
+ return -ENOENT;
+
+ va_start(args, fmt);
+ ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
+ va_end(args);
+ if (ret >= MODULE_NAME_LEN)
+ return -ENAMETOOLONG;
+
+ ret = security_kernel_module_request(module_name);
+ if (ret)
+ return ret;
+
+ ret = down_timeout(&kmod_concurrent_max, MAX_KMOD_ALL_BUSY_TIMEOUT * HZ);
+ if (ret) {
+ pr_warn_ratelimited("request_module: modprobe %s cannot be processed, kmod busy with %d threads for more than %d seconds now",
+ module_name, MAX_KMOD_CONCURRENT, MAX_KMOD_ALL_BUSY_TIMEOUT);
+ return ret;
+ }
+
+ trace_module_request(module_name, wait, _RET_IP_);
+
+ if (kmod_dup_request_exists_wait(module_name, wait, &dup_ret)) {
+ ret = dup_ret;
+ goto out;
+ }
+
+ ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC);
+
+out:
+ up(&kmod_concurrent_max);
+
+ return ret;
+}
+EXPORT_SYMBOL(__request_module);
diff --git a/kernel/module/livepatch.c b/kernel/module/livepatch.c
new file mode 100644
index 000000000000..a89f01e1d6b7
--- /dev/null
+++ b/kernel/module/livepatch.c
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Module livepatch support
+ *
+ * Copyright (C) 2016 Jessica Yu <jeyu@redhat.com>
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include "internal.h"
+
+/*
+ * Persist ELF information about a module. Copy the ELF header,
+ * section header table, section string table, and symtab section
+ * index from info to mod->klp_info.
+ */
+int copy_module_elf(struct module *mod, struct load_info *info)
+{
+ unsigned int size, symndx;
+ int ret;
+
+ size = sizeof(*mod->klp_info);
+ mod->klp_info = kmalloc(size, GFP_KERNEL);
+ if (!mod->klp_info)
+ return -ENOMEM;
+
+ /* ELF header */
+ size = sizeof(mod->klp_info->hdr);
+ memcpy(&mod->klp_info->hdr, info->hdr, size);
+
+ /* ELF section header table */
+ size = sizeof(*info->sechdrs) * info->hdr->e_shnum;
+ mod->klp_info->sechdrs = kmemdup(info->sechdrs, size, GFP_KERNEL);
+ if (!mod->klp_info->sechdrs) {
+ ret = -ENOMEM;
+ goto free_info;
+ }
+
+ /* ELF section name string table */
+ size = info->sechdrs[info->hdr->e_shstrndx].sh_size;
+ mod->klp_info->secstrings = kmemdup(info->secstrings, size, GFP_KERNEL);
+ if (!mod->klp_info->secstrings) {
+ ret = -ENOMEM;
+ goto free_sechdrs;
+ }
+
+ /* ELF symbol section index */
+ symndx = info->index.sym;
+ mod->klp_info->symndx = symndx;
+
+ /*
+ * For livepatch modules, core_kallsyms.symtab is a complete
+ * copy of the original symbol table. Adjust sh_addr to point
+ * to core_kallsyms.symtab since the copy of the symtab in module
+ * init memory is freed at the end of do_init_module().
+ */
+ mod->klp_info->sechdrs[symndx].sh_addr = (unsigned long)mod->core_kallsyms.symtab;
+
+ return 0;
+
+free_sechdrs:
+ kfree(mod->klp_info->sechdrs);
+free_info:
+ kfree(mod->klp_info);
+ return ret;
+}
+
+void free_module_elf(struct module *mod)
+{
+ kfree(mod->klp_info->sechdrs);
+ kfree(mod->klp_info->secstrings);
+ kfree(mod->klp_info);
+}
diff --git a/kernel/module/main.c b/kernel/module/main.c
new file mode 100644
index 000000000000..710ee30b3bea
--- /dev/null
+++ b/kernel/module/main.c
@@ -0,0 +1,3923 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2002 Richard Henderson
+ * Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM.
+ * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org>
+ */
+
+#define INCLUDE_VERMAGIC
+
+#include <linux/export.h>
+#include <linux/extable.h>
+#include <linux/moduleloader.h>
+#include <linux/module_signature.h>
+#include <linux/trace_events.h>
+#include <linux/init.h>
+#include <linux/kallsyms.h>
+#include <linux/buildid.h>
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/kernel_read_file.h>
+#include <linux/kstrtox.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/elf.h>
+#include <linux/seq_file.h>
+#include <linux/syscalls.h>
+#include <linux/fcntl.h>
+#include <linux/rcupdate.h>
+#include <linux/capability.h>
+#include <linux/cpu.h>
+#include <linux/moduleparam.h>
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/vermagic.h>
+#include <linux/notifier.h>
+#include <linux/sched.h>
+#include <linux/device.h>
+#include <linux/string.h>
+#include <linux/mutex.h>
+#include <linux/rculist.h>
+#include <linux/uaccess.h>
+#include <asm/cacheflush.h>
+#include <linux/set_memory.h>
+#include <asm/mmu_context.h>
+#include <linux/license.h>
+#include <asm/sections.h>
+#include <linux/tracepoint.h>
+#include <linux/ftrace.h>
+#include <linux/livepatch.h>
+#include <linux/async.h>
+#include <linux/percpu.h>
+#include <linux/kmemleak.h>
+#include <linux/jump_label.h>
+#include <linux/pfn.h>
+#include <linux/bsearch.h>
+#include <linux/dynamic_debug.h>
+#include <linux/audit.h>
+#include <linux/cfi.h>
+#include <linux/codetag.h>
+#include <linux/debugfs.h>
+#include <linux/execmem.h>
+#include <uapi/linux/module.h>
+#include "internal.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/module.h>
+
+/*
+ * Mutex protects:
+ * 1) List of modules (also safely readable within RCU read section),
+ * 2) module_use links,
+ * 3) mod_tree.addr_min/mod_tree.addr_max.
+ * (delete and add uses RCU list operations).
+ */
+DEFINE_MUTEX(module_mutex);
+LIST_HEAD(modules);
+
+/* Work queue for freeing init sections in success case */
+static void do_free_init(struct work_struct *w);
+static DECLARE_WORK(init_free_wq, do_free_init);
+static LLIST_HEAD(init_free_list);
+
+struct mod_tree_root mod_tree __cacheline_aligned = {
+ .addr_min = -1UL,
+};
+
+struct symsearch {
+ const struct kernel_symbol *start, *stop;
+ const u32 *crcs;
+ enum mod_license license;
+};
+
+/*
+ * Bounds of module memory, for speeding up __module_address.
+ * Protected by module_mutex.
+ */
+static void __mod_update_bounds(enum mod_mem_type type __maybe_unused, void *base,
+ unsigned int size, struct mod_tree_root *tree)
+{
+ unsigned long min = (unsigned long)base;
+ unsigned long max = min + size;
+
+#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
+ if (mod_mem_type_is_core_data(type)) {
+ if (min < tree->data_addr_min)
+ tree->data_addr_min = min;
+ if (max > tree->data_addr_max)
+ tree->data_addr_max = max;
+ return;
+ }
+#endif
+ if (min < tree->addr_min)
+ tree->addr_min = min;
+ if (max > tree->addr_max)
+ tree->addr_max = max;
+}
+
+static void mod_update_bounds(struct module *mod)
+{
+ for_each_mod_mem_type(type) {
+ struct module_memory *mod_mem = &mod->mem[type];
+
+ if (mod_mem->size)
+ __mod_update_bounds(type, mod_mem->base, mod_mem->size, &mod_tree);
+ }
+}
+
+/* Block module loading/unloading? */
+static int modules_disabled;
+core_param(nomodule, modules_disabled, bint, 0);
+
+static const struct ctl_table module_sysctl_table[] = {
+ {
+ .procname = "modprobe",
+ .data = &modprobe_path,
+ .maxlen = KMOD_PATH_LEN,
+ .mode = 0644,
+ .proc_handler = proc_dostring,
+ },
+ {
+ .procname = "modules_disabled",
+ .data = &modules_disabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ /* only handle a transition from default "0" to "1" */
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = SYSCTL_ONE,
+ },
+};
+
+static int __init init_module_sysctl(void)
+{
+ register_sysctl_init("kernel", module_sysctl_table);
+ return 0;
+}
+
+subsys_initcall(init_module_sysctl);
+
+/* Waiting for a module to finish initializing? */
+static DECLARE_WAIT_QUEUE_HEAD(module_wq);
+
+static BLOCKING_NOTIFIER_HEAD(module_notify_list);
+
+int register_module_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_register(&module_notify_list, nb);
+}
+EXPORT_SYMBOL(register_module_notifier);
+
+int unregister_module_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_unregister(&module_notify_list, nb);
+}
+EXPORT_SYMBOL(unregister_module_notifier);
+
+/*
+ * We require a truly strong try_module_get(): 0 means success.
+ * Otherwise an error is returned due to ongoing or failed
+ * initialization etc.
+ */
+static inline int strong_try_module_get(struct module *mod)
+{
+ BUG_ON(mod && mod->state == MODULE_STATE_UNFORMED);
+ if (mod && mod->state == MODULE_STATE_COMING)
+ return -EBUSY;
+ if (try_module_get(mod))
+ return 0;
+ else
+ return -ENOENT;
+}
+
+static inline void add_taint_module(struct module *mod, unsigned flag,
+ enum lockdep_ok lockdep_ok)
+{
+ add_taint(flag, lockdep_ok);
+ set_bit(flag, &mod->taints);
+}
+
+/*
+ * Like strncmp(), except s/-/_/g as per scripts/Makefile.lib:name-fix-token rule.
+ */
+static int mod_strncmp(const char *str_a, const char *str_b, size_t n)
+{
+ for (int i = 0; i < n; i++) {
+ char a = str_a[i];
+ char b = str_b[i];
+ int d;
+
+ if (a == '-') a = '_';
+ if (b == '-') b = '_';
+
+ d = a - b;
+ if (d)
+ return d;
+
+ if (!a)
+ break;
+ }
+
+ return 0;
+}
+
+/*
+ * A thread that wants to hold a reference to a module only while it
+ * is running can call this to safely exit.
+ */
+void __noreturn __module_put_and_kthread_exit(struct module *mod, long code)
+{
+ module_put(mod);
+ kthread_exit(code);
+}
+EXPORT_SYMBOL(__module_put_and_kthread_exit);
+
+/* Find a module section: 0 means not found. */
+static unsigned int find_sec(const struct load_info *info, const char *name)
+{
+ unsigned int i;
+
+ for (i = 1; i < info->hdr->e_shnum; i++) {
+ Elf_Shdr *shdr = &info->sechdrs[i];
+ /* Alloc bit cleared means "ignore it." */
+ if ((shdr->sh_flags & SHF_ALLOC)
+ && strcmp(info->secstrings + shdr->sh_name, name) == 0)
+ return i;
+ }
+ return 0;
+}
+
+/**
+ * find_any_unique_sec() - Find a unique section index by name
+ * @info: Load info for the module to scan
+ * @name: Name of the section we're looking for
+ *
+ * Locates a unique section by name. Ignores SHF_ALLOC.
+ *
+ * Return: Section index if found uniquely, zero if absent, negative count
+ * of total instances if multiple were found.
+ */
+static int find_any_unique_sec(const struct load_info *info, const char *name)
+{
+ unsigned int idx;
+ unsigned int count = 0;
+ int i;
+
+ for (i = 1; i < info->hdr->e_shnum; i++) {
+ if (strcmp(info->secstrings + info->sechdrs[i].sh_name,
+ name) == 0) {
+ count++;
+ idx = i;
+ }
+ }
+ if (count == 1) {
+ return idx;
+ } else if (count == 0) {
+ return 0;
+ } else {
+ return -count;
+ }
+}
+
+/* Find a module section, or NULL. */
+static void *section_addr(const struct load_info *info, const char *name)
+{
+ /* Section 0 has sh_addr 0. */
+ return (void *)info->sechdrs[find_sec(info, name)].sh_addr;
+}
+
+/* Find a module section, or NULL. Fill in number of "objects" in section. */
+static void *section_objs(const struct load_info *info,
+ const char *name,
+ size_t object_size,
+ unsigned int *num)
+{
+ unsigned int sec = find_sec(info, name);
+
+ /* Section 0 has sh_addr 0 and sh_size 0. */
+ *num = info->sechdrs[sec].sh_size / object_size;
+ return (void *)info->sechdrs[sec].sh_addr;
+}
+
+/* Find a module section: 0 means not found. Ignores SHF_ALLOC flag. */
+static unsigned int find_any_sec(const struct load_info *info, const char *name)
+{
+ unsigned int i;
+
+ for (i = 1; i < info->hdr->e_shnum; i++) {
+ Elf_Shdr *shdr = &info->sechdrs[i];
+ if (strcmp(info->secstrings + shdr->sh_name, name) == 0)
+ return i;
+ }
+ return 0;
+}
+
+/*
+ * Find a module section, or NULL. Fill in number of "objects" in section.
+ * Ignores SHF_ALLOC flag.
+ */
+static __maybe_unused void *any_section_objs(const struct load_info *info,
+ const char *name,
+ size_t object_size,
+ unsigned int *num)
+{
+ unsigned int sec = find_any_sec(info, name);
+
+ /* Section 0 has sh_addr 0 and sh_size 0. */
+ *num = info->sechdrs[sec].sh_size / object_size;
+ return (void *)info->sechdrs[sec].sh_addr;
+}
+
+#ifndef CONFIG_MODVERSIONS
+#define symversion(base, idx) NULL
+#else
+#define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL)
+#endif
+
+static const char *kernel_symbol_name(const struct kernel_symbol *sym)
+{
+#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
+ return offset_to_ptr(&sym->name_offset);
+#else
+ return sym->name;
+#endif
+}
+
+static const char *kernel_symbol_namespace(const struct kernel_symbol *sym)
+{
+#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
+ if (!sym->namespace_offset)
+ return NULL;
+ return offset_to_ptr(&sym->namespace_offset);
+#else
+ return sym->namespace;
+#endif
+}
+
+int cmp_name(const void *name, const void *sym)
+{
+ return strcmp(name, kernel_symbol_name(sym));
+}
+
+static bool find_exported_symbol_in_section(const struct symsearch *syms,
+ struct module *owner,
+ struct find_symbol_arg *fsa)
+{
+ struct kernel_symbol *sym;
+
+ if (!fsa->gplok && syms->license == GPL_ONLY)
+ return false;
+
+ sym = bsearch(fsa->name, syms->start, syms->stop - syms->start,
+ sizeof(struct kernel_symbol), cmp_name);
+ if (!sym)
+ return false;
+
+ fsa->owner = owner;
+ fsa->crc = symversion(syms->crcs, sym - syms->start);
+ fsa->sym = sym;
+ fsa->license = syms->license;
+
+ return true;
+}
+
+/*
+ * Find an exported symbol and return it, along with, (optional) crc and
+ * (optional) module which owns it. Needs RCU or module_mutex.
+ */
+bool find_symbol(struct find_symbol_arg *fsa)
+{
+ static const struct symsearch arr[] = {
+ { __start___ksymtab, __stop___ksymtab, __start___kcrctab,
+ NOT_GPL_ONLY },
+ { __start___ksymtab_gpl, __stop___ksymtab_gpl,
+ __start___kcrctab_gpl,
+ GPL_ONLY },
+ };
+ struct module *mod;
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(arr); i++)
+ if (find_exported_symbol_in_section(&arr[i], NULL, fsa))
+ return true;
+
+ list_for_each_entry_rcu(mod, &modules, list,
+ lockdep_is_held(&module_mutex)) {
+ struct symsearch arr[] = {
+ { mod->syms, mod->syms + mod->num_syms, mod->crcs,
+ NOT_GPL_ONLY },
+ { mod->gpl_syms, mod->gpl_syms + mod->num_gpl_syms,
+ mod->gpl_crcs,
+ GPL_ONLY },
+ };
+
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+
+ for (i = 0; i < ARRAY_SIZE(arr); i++)
+ if (find_exported_symbol_in_section(&arr[i], mod, fsa))
+ return true;
+ }
+
+ pr_debug("Failed to find symbol %s\n", fsa->name);
+ return false;
+}
+
+/*
+ * Search for module by name: must hold module_mutex (or RCU for read-only
+ * access).
+ */
+struct module *find_module_all(const char *name, size_t len,
+ bool even_unformed)
+{
+ struct module *mod;
+
+ list_for_each_entry_rcu(mod, &modules, list,
+ lockdep_is_held(&module_mutex)) {
+ if (!even_unformed && mod->state == MODULE_STATE_UNFORMED)
+ continue;
+ if (strlen(mod->name) == len && !memcmp(mod->name, name, len))
+ return mod;
+ }
+ return NULL;
+}
+
+struct module *find_module(const char *name)
+{
+ return find_module_all(name, strlen(name), false);
+}
+
+#ifdef CONFIG_SMP
+
+static inline void __percpu *mod_percpu(struct module *mod)
+{
+ return mod->percpu;
+}
+
+static int percpu_modalloc(struct module *mod, struct load_info *info)
+{
+ Elf_Shdr *pcpusec = &info->sechdrs[info->index.pcpu];
+ unsigned long align = pcpusec->sh_addralign;
+
+ if (!pcpusec->sh_size)
+ return 0;
+
+ if (align > PAGE_SIZE) {
+ pr_warn("%s: per-cpu alignment %li > %li\n",
+ mod->name, align, PAGE_SIZE);
+ align = PAGE_SIZE;
+ }
+
+ mod->percpu = __alloc_reserved_percpu(pcpusec->sh_size, align);
+ if (!mod->percpu) {
+ pr_warn("%s: Could not allocate %lu bytes percpu data\n",
+ mod->name, (unsigned long)pcpusec->sh_size);
+ return -ENOMEM;
+ }
+ mod->percpu_size = pcpusec->sh_size;
+ return 0;
+}
+
+static void percpu_modfree(struct module *mod)
+{
+ free_percpu(mod->percpu);
+}
+
+static unsigned int find_pcpusec(struct load_info *info)
+{
+ return find_sec(info, ".data..percpu");
+}
+
+static void percpu_modcopy(struct module *mod,
+ const void *from, unsigned long size)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ memcpy(per_cpu_ptr(mod->percpu, cpu), from, size);
+}
+
+bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
+{
+ struct module *mod;
+ unsigned int cpu;
+
+ guard(rcu)();
+ list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+ if (!mod->percpu_size)
+ continue;
+ for_each_possible_cpu(cpu) {
+ void *start = per_cpu_ptr(mod->percpu, cpu);
+ void *va = (void *)addr;
+
+ if (va >= start && va < start + mod->percpu_size) {
+ if (can_addr) {
+ *can_addr = (unsigned long) (va - start);
+ *can_addr += (unsigned long)
+ per_cpu_ptr(mod->percpu,
+ get_boot_cpu_id());
+ }
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+/**
+ * is_module_percpu_address() - test whether address is from module static percpu
+ * @addr: address to test
+ *
+ * Test whether @addr belongs to module static percpu area.
+ *
+ * Return: %true if @addr is from module static percpu area
+ */
+bool is_module_percpu_address(unsigned long addr)
+{
+ return __is_module_percpu_address(addr, NULL);
+}
+
+#else /* ... !CONFIG_SMP */
+
+static inline void __percpu *mod_percpu(struct module *mod)
+{
+ return NULL;
+}
+static int percpu_modalloc(struct module *mod, struct load_info *info)
+{
+ /* UP modules shouldn't have this section: ENOMEM isn't quite right */
+ if (info->sechdrs[info->index.pcpu].sh_size != 0)
+ return -ENOMEM;
+ return 0;
+}
+static inline void percpu_modfree(struct module *mod)
+{
+}
+static unsigned int find_pcpusec(struct load_info *info)
+{
+ return 0;
+}
+static inline void percpu_modcopy(struct module *mod,
+ const void *from, unsigned long size)
+{
+ /* pcpusec should be 0, and size of that section should be 0. */
+ BUG_ON(size != 0);
+}
+bool is_module_percpu_address(unsigned long addr)
+{
+ return false;
+}
+
+bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
+{
+ return false;
+}
+
+#endif /* CONFIG_SMP */
+
+#define MODINFO_ATTR(field) \
+static void setup_modinfo_##field(struct module *mod, const char *s) \
+{ \
+ mod->field = kstrdup(s, GFP_KERNEL); \
+} \
+static ssize_t show_modinfo_##field(const struct module_attribute *mattr, \
+ struct module_kobject *mk, char *buffer) \
+{ \
+ return scnprintf(buffer, PAGE_SIZE, "%s\n", mk->mod->field); \
+} \
+static int modinfo_##field##_exists(struct module *mod) \
+{ \
+ return mod->field != NULL; \
+} \
+static void free_modinfo_##field(struct module *mod) \
+{ \
+ kfree(mod->field); \
+ mod->field = NULL; \
+} \
+static const struct module_attribute modinfo_##field = { \
+ .attr = { .name = __stringify(field), .mode = 0444 }, \
+ .show = show_modinfo_##field, \
+ .setup = setup_modinfo_##field, \
+ .test = modinfo_##field##_exists, \
+ .free = free_modinfo_##field, \
+};
+
+MODINFO_ATTR(version);
+MODINFO_ATTR(srcversion);
+
+static struct {
+ char name[MODULE_NAME_LEN];
+ char taints[MODULE_FLAGS_BUF_SIZE];
+} last_unloaded_module;
+
+#ifdef CONFIG_MODULE_UNLOAD
+
+EXPORT_TRACEPOINT_SYMBOL(module_get);
+
+/* MODULE_REF_BASE is the base reference count by kmodule loader. */
+#define MODULE_REF_BASE 1
+
+/* Init the unload section of the module. */
+static int module_unload_init(struct module *mod)
+{
+ /*
+ * Initialize reference counter to MODULE_REF_BASE.
+ * refcnt == 0 means module is going.
+ */
+ atomic_set(&mod->refcnt, MODULE_REF_BASE);
+
+ INIT_LIST_HEAD(&mod->source_list);
+ INIT_LIST_HEAD(&mod->target_list);
+
+ /* Hold reference count during initialization. */
+ atomic_inc(&mod->refcnt);
+
+ return 0;
+}
+
+/* Does a already use b? */
+static int already_uses(struct module *a, struct module *b)
+{
+ struct module_use *use;
+
+ list_for_each_entry(use, &b->source_list, source_list) {
+ if (use->source == a)
+ return 1;
+ }
+ pr_debug("%s does not use %s!\n", a->name, b->name);
+ return 0;
+}
+
+/*
+ * Module a uses b
+ * - we add 'a' as a "source", 'b' as a "target" of module use
+ * - the module_use is added to the list of 'b' sources (so
+ * 'b' can walk the list to see who sourced them), and of 'a'
+ * targets (so 'a' can see what modules it targets).
+ */
+static int add_module_usage(struct module *a, struct module *b)
+{
+ struct module_use *use;
+
+ pr_debug("Allocating new usage for %s.\n", a->name);
+ use = kmalloc(sizeof(*use), GFP_ATOMIC);
+ if (!use)
+ return -ENOMEM;
+
+ use->source = a;
+ use->target = b;
+ list_add(&use->source_list, &b->source_list);
+ list_add(&use->target_list, &a->target_list);
+ return 0;
+}
+
+/* Module a uses b: caller needs module_mutex() */
+static int ref_module(struct module *a, struct module *b)
+{
+ int err;
+
+ if (b == NULL || already_uses(a, b))
+ return 0;
+
+ /* If module isn't available, we fail. */
+ err = strong_try_module_get(b);
+ if (err)
+ return err;
+
+ err = add_module_usage(a, b);
+ if (err) {
+ module_put(b);
+ return err;
+ }
+ return 0;
+}
+
+/* Clear the unload stuff of the module. */
+static void module_unload_free(struct module *mod)
+{
+ struct module_use *use, *tmp;
+
+ mutex_lock(&module_mutex);
+ list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) {
+ struct module *i = use->target;
+ pr_debug("%s unusing %s\n", mod->name, i->name);
+ module_put(i);
+ list_del(&use->source_list);
+ list_del(&use->target_list);
+ kfree(use);
+ }
+ mutex_unlock(&module_mutex);
+}
+
+#ifdef CONFIG_MODULE_FORCE_UNLOAD
+static inline int try_force_unload(unsigned int flags)
+{
+ int ret = (flags & O_TRUNC);
+ if (ret)
+ add_taint(TAINT_FORCED_RMMOD, LOCKDEP_NOW_UNRELIABLE);
+ return ret;
+}
+#else
+static inline int try_force_unload(unsigned int flags)
+{
+ return 0;
+}
+#endif /* CONFIG_MODULE_FORCE_UNLOAD */
+
+/* Try to release refcount of module, 0 means success. */
+static int try_release_module_ref(struct module *mod)
+{
+ int ret;
+
+ /* Try to decrement refcnt which we set at loading */
+ ret = atomic_sub_return(MODULE_REF_BASE, &mod->refcnt);
+ BUG_ON(ret < 0);
+ if (ret)
+ /* Someone can put this right now, recover with checking */
+ ret = atomic_add_unless(&mod->refcnt, MODULE_REF_BASE, 0);
+
+ return ret;
+}
+
+static int try_stop_module(struct module *mod, int flags, int *forced)
+{
+ /* If it's not unused, quit unless we're forcing. */
+ if (try_release_module_ref(mod) != 0) {
+ *forced = try_force_unload(flags);
+ if (!(*forced))
+ return -EWOULDBLOCK;
+ }
+
+ /* Mark it as dying. */
+ mod->state = MODULE_STATE_GOING;
+
+ return 0;
+}
+
+/**
+ * module_refcount() - return the refcount or -1 if unloading
+ * @mod: the module we're checking
+ *
+ * Return:
+ * -1 if the module is in the process of unloading
+ * otherwise the number of references in the kernel to the module
+ */
+int module_refcount(struct module *mod)
+{
+ return atomic_read(&mod->refcnt) - MODULE_REF_BASE;
+}
+EXPORT_SYMBOL(module_refcount);
+
+/* This exists whether we can unload or not */
+static void free_module(struct module *mod);
+
+SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
+ unsigned int, flags)
+{
+ struct module *mod;
+ char name[MODULE_NAME_LEN];
+ char buf[MODULE_FLAGS_BUF_SIZE];
+ int ret, len, forced = 0;
+
+ if (!capable(CAP_SYS_MODULE) || modules_disabled)
+ return -EPERM;
+
+ len = strncpy_from_user(name, name_user, MODULE_NAME_LEN);
+ if (len == 0 || len == MODULE_NAME_LEN)
+ return -ENOENT;
+ if (len < 0)
+ return len;
+
+ audit_log_kern_module(name);
+
+ if (mutex_lock_interruptible(&module_mutex) != 0)
+ return -EINTR;
+
+ mod = find_module(name);
+ if (!mod) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ if (!list_empty(&mod->source_list)) {
+ /* Other modules depend on us: get rid of them first. */
+ ret = -EWOULDBLOCK;
+ goto out;
+ }
+
+ /* Doing init or already dying? */
+ if (mod->state != MODULE_STATE_LIVE) {
+ /* FIXME: if (force), slam module count damn the torpedoes */
+ pr_debug("%s already dying\n", mod->name);
+ ret = -EBUSY;
+ goto out;
+ }
+
+ /* If it has an init func, it must have an exit func to unload */
+ if (mod->init && !mod->exit) {
+ forced = try_force_unload(flags);
+ if (!forced) {
+ /* This module can't be removed */
+ ret = -EBUSY;
+ goto out;
+ }
+ }
+
+ ret = try_stop_module(mod, flags, &forced);
+ if (ret != 0)
+ goto out;
+
+ mutex_unlock(&module_mutex);
+ /* Final destruction now no one is using it. */
+ if (mod->exit != NULL)
+ mod->exit();
+ blocking_notifier_call_chain(&module_notify_list,
+ MODULE_STATE_GOING, mod);
+ klp_module_going(mod);
+ ftrace_release_mod(mod);
+
+ async_synchronize_full();
+
+ /* Store the name and taints of the last unloaded module for diagnostic purposes */
+ strscpy(last_unloaded_module.name, mod->name);
+ strscpy(last_unloaded_module.taints, module_flags(mod, buf, false));
+
+ free_module(mod);
+ /* someone could wait for the module in add_unformed_module() */
+ wake_up_all(&module_wq);
+ return 0;
+out:
+ mutex_unlock(&module_mutex);
+ return ret;
+}
+
+void __symbol_put(const char *symbol)
+{
+ struct find_symbol_arg fsa = {
+ .name = symbol,
+ .gplok = true,
+ };
+
+ guard(rcu)();
+ BUG_ON(!find_symbol(&fsa));
+ module_put(fsa.owner);
+}
+EXPORT_SYMBOL(__symbol_put);
+
+/* Note this assumes addr is a function, which it currently always is. */
+void symbol_put_addr(void *addr)
+{
+ struct module *modaddr;
+ unsigned long a = (unsigned long)dereference_function_descriptor(addr);
+
+ if (core_kernel_text(a))
+ return;
+
+ /*
+ * Even though we hold a reference on the module; we still need to
+ * RCU read section in order to safely traverse the data structure.
+ */
+ guard(rcu)();
+ modaddr = __module_text_address(a);
+ BUG_ON(!modaddr);
+ module_put(modaddr);
+}
+EXPORT_SYMBOL_GPL(symbol_put_addr);
+
+static ssize_t show_refcnt(const struct module_attribute *mattr,
+ struct module_kobject *mk, char *buffer)
+{
+ return sprintf(buffer, "%i\n", module_refcount(mk->mod));
+}
+
+static const struct module_attribute modinfo_refcnt =
+ __ATTR(refcnt, 0444, show_refcnt, NULL);
+
+void __module_get(struct module *module)
+{
+ if (module) {
+ atomic_inc(&module->refcnt);
+ trace_module_get(module, _RET_IP_);
+ }
+}
+EXPORT_SYMBOL(__module_get);
+
+bool try_module_get(struct module *module)
+{
+ bool ret = true;
+
+ if (module) {
+ /* Note: here, we can fail to get a reference */
+ if (likely(module_is_live(module) &&
+ atomic_inc_not_zero(&module->refcnt) != 0))
+ trace_module_get(module, _RET_IP_);
+ else
+ ret = false;
+ }
+ return ret;
+}
+EXPORT_SYMBOL(try_module_get);
+
+void module_put(struct module *module)
+{
+ int ret;
+
+ if (module) {
+ ret = atomic_dec_if_positive(&module->refcnt);
+ WARN_ON(ret < 0); /* Failed to put refcount */
+ trace_module_put(module, _RET_IP_);
+ }
+}
+EXPORT_SYMBOL(module_put);
+
+#else /* !CONFIG_MODULE_UNLOAD */
+static inline void module_unload_free(struct module *mod)
+{
+}
+
+static int ref_module(struct module *a, struct module *b)
+{
+ return strong_try_module_get(b);
+}
+
+static inline int module_unload_init(struct module *mod)
+{
+ return 0;
+}
+#endif /* CONFIG_MODULE_UNLOAD */
+
+size_t module_flags_taint(unsigned long taints, char *buf)
+{
+ size_t l = 0;
+ int i;
+
+ for (i = 0; i < TAINT_FLAGS_COUNT; i++) {
+ if (test_bit(i, &taints))
+ buf[l++] = taint_flags[i].c_true;
+ }
+
+ return l;
+}
+
+static ssize_t show_initstate(const struct module_attribute *mattr,
+ struct module_kobject *mk, char *buffer)
+{
+ const char *state = "unknown";
+
+ switch (mk->mod->state) {
+ case MODULE_STATE_LIVE:
+ state = "live";
+ break;
+ case MODULE_STATE_COMING:
+ state = "coming";
+ break;
+ case MODULE_STATE_GOING:
+ state = "going";
+ break;
+ default:
+ BUG();
+ }
+ return sprintf(buffer, "%s\n", state);
+}
+
+static const struct module_attribute modinfo_initstate =
+ __ATTR(initstate, 0444, show_initstate, NULL);
+
+static ssize_t store_uevent(const struct module_attribute *mattr,
+ struct module_kobject *mk,
+ const char *buffer, size_t count)
+{
+ int rc;
+
+ rc = kobject_synth_uevent(&mk->kobj, buffer, count);
+ return rc ? rc : count;
+}
+
+const struct module_attribute module_uevent =
+ __ATTR(uevent, 0200, NULL, store_uevent);
+
+static ssize_t show_coresize(const struct module_attribute *mattr,
+ struct module_kobject *mk, char *buffer)
+{
+ unsigned int size = mk->mod->mem[MOD_TEXT].size;
+
+ if (!IS_ENABLED(CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC)) {
+ for_class_mod_mem_type(type, core_data)
+ size += mk->mod->mem[type].size;
+ }
+ return sprintf(buffer, "%u\n", size);
+}
+
+static const struct module_attribute modinfo_coresize =
+ __ATTR(coresize, 0444, show_coresize, NULL);
+
+#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
+static ssize_t show_datasize(const struct module_attribute *mattr,
+ struct module_kobject *mk, char *buffer)
+{
+ unsigned int size = 0;
+
+ for_class_mod_mem_type(type, core_data)
+ size += mk->mod->mem[type].size;
+ return sprintf(buffer, "%u\n", size);
+}
+
+static const struct module_attribute modinfo_datasize =
+ __ATTR(datasize, 0444, show_datasize, NULL);
+#endif
+
+static ssize_t show_initsize(const struct module_attribute *mattr,
+ struct module_kobject *mk, char *buffer)
+{
+ unsigned int size = 0;
+
+ for_class_mod_mem_type(type, init)
+ size += mk->mod->mem[type].size;
+ return sprintf(buffer, "%u\n", size);
+}
+
+static const struct module_attribute modinfo_initsize =
+ __ATTR(initsize, 0444, show_initsize, NULL);
+
+static ssize_t show_taint(const struct module_attribute *mattr,
+ struct module_kobject *mk, char *buffer)
+{
+ size_t l;
+
+ l = module_flags_taint(mk->mod->taints, buffer);
+ buffer[l++] = '\n';
+ return l;
+}
+
+static const struct module_attribute modinfo_taint =
+ __ATTR(taint, 0444, show_taint, NULL);
+
+const struct module_attribute *const modinfo_attrs[] = {
+ &module_uevent,
+ &modinfo_version,
+ &modinfo_srcversion,
+ &modinfo_initstate,
+ &modinfo_coresize,
+#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
+ &modinfo_datasize,
+#endif
+ &modinfo_initsize,
+ &modinfo_taint,
+#ifdef CONFIG_MODULE_UNLOAD
+ &modinfo_refcnt,
+#endif
+ NULL,
+};
+
+const size_t modinfo_attrs_count = ARRAY_SIZE(modinfo_attrs);
+
+static const char vermagic[] = VERMAGIC_STRING;
+
+int try_to_force_load(struct module *mod, const char *reason)
+{
+#ifdef CONFIG_MODULE_FORCE_LOAD
+ if (!test_taint(TAINT_FORCED_MODULE))
+ pr_warn("%s: %s: kernel tainted.\n", mod->name, reason);
+ add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_NOW_UNRELIABLE);
+ return 0;
+#else
+ return -ENOEXEC;
+#endif
+}
+
+/* Parse tag=value strings from .modinfo section */
+char *module_next_tag_pair(char *string, unsigned long *secsize)
+{
+ /* Skip non-zero chars */
+ while (string[0]) {
+ string++;
+ if ((*secsize)-- <= 1)
+ return NULL;
+ }
+
+ /* Skip any zero padding. */
+ while (!string[0]) {
+ string++;
+ if ((*secsize)-- <= 1)
+ return NULL;
+ }
+ return string;
+}
+
+static char *get_next_modinfo(const struct load_info *info, const char *tag,
+ char *prev)
+{
+ char *p;
+ unsigned int taglen = strlen(tag);
+ Elf_Shdr *infosec = &info->sechdrs[info->index.info];
+ unsigned long size = infosec->sh_size;
+
+ /*
+ * get_modinfo() calls made before rewrite_section_headers()
+ * must use sh_offset, as sh_addr isn't set!
+ */
+ char *modinfo = (char *)info->hdr + infosec->sh_offset;
+
+ if (prev) {
+ size -= prev - modinfo;
+ modinfo = module_next_tag_pair(prev, &size);
+ }
+
+ for (p = modinfo; p; p = module_next_tag_pair(p, &size)) {
+ if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=')
+ return p + taglen + 1;
+ }
+ return NULL;
+}
+
+static char *get_modinfo(const struct load_info *info, const char *tag)
+{
+ return get_next_modinfo(info, tag, NULL);
+}
+
+/**
+ * verify_module_namespace() - does @modname have access to this symbol's @namespace
+ * @namespace: export symbol namespace
+ * @modname: module name
+ *
+ * If @namespace is prefixed with "module:" to indicate it is a module namespace
+ * then test if @modname matches any of the comma separated patterns.
+ *
+ * The patterns only support tail-glob.
+ */
+static bool verify_module_namespace(const char *namespace, const char *modname)
+{
+ size_t len, modlen = strlen(modname);
+ const char *prefix = "module:";
+ const char *sep;
+ bool glob;
+
+ if (!strstarts(namespace, prefix))
+ return false;
+
+ for (namespace += strlen(prefix); *namespace; namespace = sep) {
+ sep = strchrnul(namespace, ',');
+ len = sep - namespace;
+
+ glob = false;
+ if (sep[-1] == '*') {
+ len--;
+ glob = true;
+ }
+
+ if (*sep)
+ sep++;
+
+ if (mod_strncmp(namespace, modname, len) == 0 && (glob || len == modlen))
+ return true;
+ }
+
+ return false;
+}
+
+static int verify_namespace_is_imported(const struct load_info *info,
+ const struct kernel_symbol *sym,
+ struct module *mod)
+{
+ const char *namespace;
+ char *imported_namespace;
+
+ namespace = kernel_symbol_namespace(sym);
+ if (namespace && namespace[0]) {
+
+ if (verify_module_namespace(namespace, mod->name))
+ return 0;
+
+ for_each_modinfo_entry(imported_namespace, info, "import_ns") {
+ if (strcmp(namespace, imported_namespace) == 0)
+ return 0;
+ }
+#ifdef CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS
+ pr_warn(
+#else
+ pr_err(
+#endif
+ "%s: module uses symbol (%s) from namespace %s, but does not import it.\n",
+ mod->name, kernel_symbol_name(sym), namespace);
+#ifndef CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS
+ return -EINVAL;
+#endif
+ }
+ return 0;
+}
+
+static bool inherit_taint(struct module *mod, struct module *owner, const char *name)
+{
+ if (!owner || !test_bit(TAINT_PROPRIETARY_MODULE, &owner->taints))
+ return true;
+
+ if (mod->using_gplonly_symbols) {
+ pr_err("%s: module using GPL-only symbols uses symbols %s from proprietary module %s.\n",
+ mod->name, name, owner->name);
+ return false;
+ }
+
+ if (!test_bit(TAINT_PROPRIETARY_MODULE, &mod->taints)) {
+ pr_warn("%s: module uses symbols %s from proprietary module %s, inheriting taint.\n",
+ mod->name, name, owner->name);
+ set_bit(TAINT_PROPRIETARY_MODULE, &mod->taints);
+ }
+ return true;
+}
+
+/* Resolve a symbol for this module. I.e. if we find one, record usage. */
+static const struct kernel_symbol *resolve_symbol(struct module *mod,
+ const struct load_info *info,
+ const char *name,
+ char ownername[])
+{
+ struct find_symbol_arg fsa = {
+ .name = name,
+ .gplok = !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)),
+ .warn = true,
+ };
+ int err;
+
+ /*
+ * The module_mutex should not be a heavily contended lock;
+ * if we get the occasional sleep here, we'll go an extra iteration
+ * in the wait_event_interruptible(), which is harmless.
+ */
+ sched_annotate_sleep();
+ mutex_lock(&module_mutex);
+ if (!find_symbol(&fsa))
+ goto unlock;
+
+ if (fsa.license == GPL_ONLY)
+ mod->using_gplonly_symbols = true;
+
+ if (!inherit_taint(mod, fsa.owner, name)) {
+ fsa.sym = NULL;
+ goto getname;
+ }
+
+ if (!check_version(info, name, mod, fsa.crc)) {
+ fsa.sym = ERR_PTR(-EINVAL);
+ goto getname;
+ }
+
+ err = verify_namespace_is_imported(info, fsa.sym, mod);
+ if (err) {
+ fsa.sym = ERR_PTR(err);
+ goto getname;
+ }
+
+ err = ref_module(mod, fsa.owner);
+ if (err) {
+ fsa.sym = ERR_PTR(err);
+ goto getname;
+ }
+
+getname:
+ /* We must make copy under the lock if we failed to get ref. */
+ strscpy(ownername, module_name(fsa.owner), MODULE_NAME_LEN);
+unlock:
+ mutex_unlock(&module_mutex);
+ return fsa.sym;
+}
+
+static const struct kernel_symbol *
+resolve_symbol_wait(struct module *mod,
+ const struct load_info *info,
+ const char *name)
+{
+ const struct kernel_symbol *ksym;
+ char owner[MODULE_NAME_LEN];
+
+ if (wait_event_interruptible_timeout(module_wq,
+ !IS_ERR(ksym = resolve_symbol(mod, info, name, owner))
+ || PTR_ERR(ksym) != -EBUSY,
+ 30 * HZ) <= 0) {
+ pr_warn("%s: gave up waiting for init of module %s.\n",
+ mod->name, owner);
+ }
+ return ksym;
+}
+
+void __weak module_arch_cleanup(struct module *mod)
+{
+}
+
+void __weak module_arch_freeing_init(struct module *mod)
+{
+}
+
+static int module_memory_alloc(struct module *mod, enum mod_mem_type type)
+{
+ unsigned int size = PAGE_ALIGN(mod->mem[type].size);
+ enum execmem_type execmem_type;
+ void *ptr;
+
+ mod->mem[type].size = size;
+
+ if (mod_mem_type_is_data(type))
+ execmem_type = EXECMEM_MODULE_DATA;
+ else
+ execmem_type = EXECMEM_MODULE_TEXT;
+
+ ptr = execmem_alloc_rw(execmem_type, size);
+ if (!ptr)
+ return -ENOMEM;
+
+ mod->mem[type].is_rox = execmem_is_rox(execmem_type);
+
+ /*
+ * The pointer to these blocks of memory are stored on the module
+ * structure and we keep that around so long as the module is
+ * around. We only free that memory when we unload the module.
+ * Just mark them as not being a leak then. The .init* ELF
+ * sections *do* get freed after boot so we *could* treat them
+ * slightly differently with kmemleak_ignore() and only grey
+ * them out as they work as typical memory allocations which
+ * *do* eventually get freed, but let's just keep things simple
+ * and avoid *any* false positives.
+ */
+ if (!mod->mem[type].is_rox)
+ kmemleak_not_leak(ptr);
+
+ memset(ptr, 0, size);
+ mod->mem[type].base = ptr;
+
+ return 0;
+}
+
+static void module_memory_restore_rox(struct module *mod)
+{
+ for_class_mod_mem_type(type, text) {
+ struct module_memory *mem = &mod->mem[type];
+
+ if (mem->is_rox)
+ execmem_restore_rox(mem->base, mem->size);
+ }
+}
+
+static void module_memory_free(struct module *mod, enum mod_mem_type type)
+{
+ struct module_memory *mem = &mod->mem[type];
+
+ execmem_free(mem->base);
+}
+
+static void free_mod_mem(struct module *mod)
+{
+ for_each_mod_mem_type(type) {
+ struct module_memory *mod_mem = &mod->mem[type];
+
+ if (type == MOD_DATA)
+ continue;
+
+ /* Free lock-classes; relies on the preceding sync_rcu(). */
+ lockdep_free_key_range(mod_mem->base, mod_mem->size);
+ if (mod_mem->size)
+ module_memory_free(mod, type);
+ }
+
+ /* MOD_DATA hosts mod, so free it at last */
+ lockdep_free_key_range(mod->mem[MOD_DATA].base, mod->mem[MOD_DATA].size);
+ module_memory_free(mod, MOD_DATA);
+}
+
+/* Free a module, remove from lists, etc. */
+static void free_module(struct module *mod)
+{
+ trace_module_free(mod);
+
+ codetag_unload_module(mod);
+
+ mod_sysfs_teardown(mod);
+
+ /*
+ * We leave it in list to prevent duplicate loads, but make sure
+ * that noone uses it while it's being deconstructed.
+ */
+ mutex_lock(&module_mutex);
+ mod->state = MODULE_STATE_UNFORMED;
+ mutex_unlock(&module_mutex);
+
+ /* Arch-specific cleanup. */
+ module_arch_cleanup(mod);
+
+ /* Module unload stuff */
+ module_unload_free(mod);
+
+ /* Free any allocated parameters. */
+ destroy_params(mod->kp, mod->num_kp);
+
+ if (is_livepatch_module(mod))
+ free_module_elf(mod);
+
+ /* Now we can delete it from the lists */
+ mutex_lock(&module_mutex);
+ /* Unlink carefully: kallsyms could be walking list. */
+ list_del_rcu(&mod->list);
+ mod_tree_remove(mod);
+ /* Remove this module from bug list, this uses list_del_rcu */
+ module_bug_cleanup(mod);
+ /* Wait for RCU synchronizing before releasing mod->list and buglist. */
+ synchronize_rcu();
+ if (try_add_tainted_module(mod))
+ pr_err("%s: adding tainted module to the unloaded tainted modules list failed.\n",
+ mod->name);
+ mutex_unlock(&module_mutex);
+
+ /* This may be empty, but that's OK */
+ module_arch_freeing_init(mod);
+ kfree(mod->args);
+ percpu_modfree(mod);
+
+ free_mod_mem(mod);
+}
+
+void *__symbol_get(const char *symbol)
+{
+ struct find_symbol_arg fsa = {
+ .name = symbol,
+ .gplok = true,
+ .warn = true,
+ };
+
+ scoped_guard(rcu) {
+ if (!find_symbol(&fsa))
+ return NULL;
+ if (fsa.license != GPL_ONLY) {
+ pr_warn("failing symbol_get of non-GPLONLY symbol %s.\n",
+ symbol);
+ return NULL;
+ }
+ if (strong_try_module_get(fsa.owner))
+ return NULL;
+ }
+ return (void *)kernel_symbol_value(fsa.sym);
+}
+EXPORT_SYMBOL_GPL(__symbol_get);
+
+/*
+ * Ensure that an exported symbol [global namespace] does not already exist
+ * in the kernel or in some other module's exported symbol table.
+ *
+ * You must hold the module_mutex.
+ */
+static int verify_exported_symbols(struct module *mod)
+{
+ unsigned int i;
+ const struct kernel_symbol *s;
+ struct {
+ const struct kernel_symbol *sym;
+ unsigned int num;
+ } arr[] = {
+ { mod->syms, mod->num_syms },
+ { mod->gpl_syms, mod->num_gpl_syms },
+ };
+
+ for (i = 0; i < ARRAY_SIZE(arr); i++) {
+ for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) {
+ struct find_symbol_arg fsa = {
+ .name = kernel_symbol_name(s),
+ .gplok = true,
+ };
+ if (find_symbol(&fsa)) {
+ pr_err("%s: exports duplicate symbol %s"
+ " (owned by %s)\n",
+ mod->name, kernel_symbol_name(s),
+ module_name(fsa.owner));
+ return -ENOEXEC;
+ }
+ }
+ }
+ return 0;
+}
+
+static bool ignore_undef_symbol(Elf_Half emachine, const char *name)
+{
+ /*
+ * On x86, PIC code and Clang non-PIC code may have call foo@PLT. GNU as
+ * before 2.37 produces an unreferenced _GLOBAL_OFFSET_TABLE_ on x86-64.
+ * i386 has a similar problem but may not deserve a fix.
+ *
+ * If we ever have to ignore many symbols, consider refactoring the code to
+ * only warn if referenced by a relocation.
+ */
+ if (emachine == EM_386 || emachine == EM_X86_64)
+ return !strcmp(name, "_GLOBAL_OFFSET_TABLE_");
+ return false;
+}
+
+/* Change all symbols so that st_value encodes the pointer directly. */
+static int simplify_symbols(struct module *mod, const struct load_info *info)
+{
+ Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
+ Elf_Sym *sym = (void *)symsec->sh_addr;
+ unsigned long secbase;
+ unsigned int i;
+ int ret = 0;
+ const struct kernel_symbol *ksym;
+
+ for (i = 1; i < symsec->sh_size / sizeof(Elf_Sym); i++) {
+ const char *name = info->strtab + sym[i].st_name;
+
+ switch (sym[i].st_shndx) {
+ case SHN_COMMON:
+ /* Ignore common symbols */
+ if (!strncmp(name, "__gnu_lto", 9))
+ break;
+
+ /*
+ * We compiled with -fno-common. These are not
+ * supposed to happen.
+ */
+ pr_debug("Common symbol: %s\n", name);
+ pr_warn("%s: please compile with -fno-common\n",
+ mod->name);
+ ret = -ENOEXEC;
+ break;
+
+ case SHN_ABS:
+ /* Don't need to do anything */
+ pr_debug("Absolute symbol: 0x%08lx %s\n",
+ (long)sym[i].st_value, name);
+ break;
+
+ case SHN_LIVEPATCH:
+ /* Livepatch symbols are resolved by livepatch */
+ break;
+
+ case SHN_UNDEF:
+ ksym = resolve_symbol_wait(mod, info, name);
+ /* Ok if resolved. */
+ if (ksym && !IS_ERR(ksym)) {
+ sym[i].st_value = kernel_symbol_value(ksym);
+ break;
+ }
+
+ /* Ok if weak or ignored. */
+ if (!ksym &&
+ (ELF_ST_BIND(sym[i].st_info) == STB_WEAK ||
+ ignore_undef_symbol(info->hdr->e_machine, name)))
+ break;
+
+ ret = PTR_ERR(ksym) ?: -ENOENT;
+ pr_warn("%s: Unknown symbol %s (err %d)\n",
+ mod->name, name, ret);
+ break;
+
+ default:
+ /* Divert to percpu allocation if a percpu var. */
+ if (sym[i].st_shndx == info->index.pcpu)
+ secbase = (unsigned long)mod_percpu(mod);
+ else
+ secbase = info->sechdrs[sym[i].st_shndx].sh_addr;
+ sym[i].st_value += secbase;
+ break;
+ }
+ }
+
+ return ret;
+}
+
+static int apply_relocations(struct module *mod, const struct load_info *info)
+{
+ unsigned int i;
+ int err = 0;
+
+ /* Now do relocations. */
+ for (i = 1; i < info->hdr->e_shnum; i++) {
+ unsigned int infosec = info->sechdrs[i].sh_info;
+
+ /* Not a valid relocation section? */
+ if (infosec >= info->hdr->e_shnum)
+ continue;
+
+ /*
+ * Don't bother with non-allocated sections.
+ * An exception is the percpu section, which has separate allocations
+ * for individual CPUs. We relocate the percpu section in the initial
+ * ELF template and subsequently copy it to the per-CPU destinations.
+ */
+ if (!(info->sechdrs[infosec].sh_flags & SHF_ALLOC) &&
+ (!infosec || infosec != info->index.pcpu))
+ continue;
+
+ if (info->sechdrs[i].sh_flags & SHF_RELA_LIVEPATCH)
+ err = klp_apply_section_relocs(mod, info->sechdrs,
+ info->secstrings,
+ info->strtab,
+ info->index.sym, i,
+ NULL);
+ else if (info->sechdrs[i].sh_type == SHT_REL)
+ err = apply_relocate(info->sechdrs, info->strtab,
+ info->index.sym, i, mod);
+ else if (info->sechdrs[i].sh_type == SHT_RELA)
+ err = apply_relocate_add(info->sechdrs, info->strtab,
+ info->index.sym, i, mod);
+ if (err < 0)
+ break;
+ }
+ return err;
+}
+
+/* Additional bytes needed by arch in front of individual sections */
+unsigned int __weak arch_mod_section_prepend(struct module *mod,
+ unsigned int section)
+{
+ /* default implementation just returns zero */
+ return 0;
+}
+
+long module_get_offset_and_type(struct module *mod, enum mod_mem_type type,
+ Elf_Shdr *sechdr, unsigned int section)
+{
+ long offset;
+ long mask = ((unsigned long)(type) & SH_ENTSIZE_TYPE_MASK) << SH_ENTSIZE_TYPE_SHIFT;
+
+ mod->mem[type].size += arch_mod_section_prepend(mod, section);
+ offset = ALIGN(mod->mem[type].size, sechdr->sh_addralign ?: 1);
+ mod->mem[type].size = offset + sechdr->sh_size;
+
+ WARN_ON_ONCE(offset & mask);
+ return offset | mask;
+}
+
+bool module_init_layout_section(const char *sname)
+{
+#ifndef CONFIG_MODULE_UNLOAD
+ if (module_exit_section(sname))
+ return true;
+#endif
+ return module_init_section(sname);
+}
+
+static void __layout_sections(struct module *mod, struct load_info *info, bool is_init)
+{
+ unsigned int m, i;
+
+ /*
+ * { Mask of required section header flags,
+ * Mask of excluded section header flags }
+ */
+ static const unsigned long masks[][2] = {
+ { SHF_EXECINSTR | SHF_ALLOC, ARCH_SHF_SMALL },
+ { SHF_ALLOC, SHF_WRITE | ARCH_SHF_SMALL },
+ { SHF_RO_AFTER_INIT | SHF_ALLOC, ARCH_SHF_SMALL },
+ { SHF_WRITE | SHF_ALLOC, ARCH_SHF_SMALL },
+ { ARCH_SHF_SMALL | SHF_ALLOC, 0 }
+ };
+ static const int core_m_to_mem_type[] = {
+ MOD_TEXT,
+ MOD_RODATA,
+ MOD_RO_AFTER_INIT,
+ MOD_DATA,
+ MOD_DATA,
+ };
+ static const int init_m_to_mem_type[] = {
+ MOD_INIT_TEXT,
+ MOD_INIT_RODATA,
+ MOD_INVALID,
+ MOD_INIT_DATA,
+ MOD_INIT_DATA,
+ };
+
+ for (m = 0; m < ARRAY_SIZE(masks); ++m) {
+ enum mod_mem_type type = is_init ? init_m_to_mem_type[m] : core_m_to_mem_type[m];
+
+ for (i = 0; i < info->hdr->e_shnum; ++i) {
+ Elf_Shdr *s = &info->sechdrs[i];
+ const char *sname = info->secstrings + s->sh_name;
+
+ if ((s->sh_flags & masks[m][0]) != masks[m][0]
+ || (s->sh_flags & masks[m][1])
+ || s->sh_entsize != ~0UL
+ || is_init != module_init_layout_section(sname))
+ continue;
+
+ if (WARN_ON_ONCE(type == MOD_INVALID))
+ continue;
+
+ /*
+ * Do not allocate codetag memory as we load it into
+ * preallocated contiguous memory.
+ */
+ if (codetag_needs_module_section(mod, sname, s->sh_size)) {
+ /*
+ * s->sh_entsize won't be used but populate the
+ * type field to avoid confusion.
+ */
+ s->sh_entsize = ((unsigned long)(type) & SH_ENTSIZE_TYPE_MASK)
+ << SH_ENTSIZE_TYPE_SHIFT;
+ continue;
+ }
+
+ s->sh_entsize = module_get_offset_and_type(mod, type, s, i);
+ pr_debug("\t%s\n", sname);
+ }
+ }
+}
+
+/*
+ * Lay out the SHF_ALLOC sections in a way not dissimilar to how ld
+ * might -- code, read-only data, read-write data, small data. Tally
+ * sizes, and place the offsets into sh_entsize fields: high bit means it
+ * belongs in init.
+ */
+static void layout_sections(struct module *mod, struct load_info *info)
+{
+ unsigned int i;
+
+ for (i = 0; i < info->hdr->e_shnum; i++)
+ info->sechdrs[i].sh_entsize = ~0UL;
+
+ pr_debug("Core section allocation order for %s:\n", mod->name);
+ __layout_sections(mod, info, false);
+
+ pr_debug("Init section allocation order for %s:\n", mod->name);
+ __layout_sections(mod, info, true);
+}
+
+static void module_license_taint_check(struct module *mod, const char *license)
+{
+ if (!license)
+ license = "unspecified";
+
+ if (!license_is_gpl_compatible(license)) {
+ if (!test_taint(TAINT_PROPRIETARY_MODULE))
+ pr_warn("%s: module license '%s' taints kernel.\n",
+ mod->name, license);
+ add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
+ LOCKDEP_NOW_UNRELIABLE);
+ }
+}
+
+static int setup_modinfo(struct module *mod, struct load_info *info)
+{
+ const struct module_attribute *attr;
+ char *imported_namespace;
+ int i;
+
+ for (i = 0; (attr = modinfo_attrs[i]); i++) {
+ if (attr->setup)
+ attr->setup(mod, get_modinfo(info, attr->attr.name));
+ }
+
+ for_each_modinfo_entry(imported_namespace, info, "import_ns") {
+ /*
+ * 'module:' prefixed namespaces are implicit, disallow
+ * explicit imports.
+ */
+ if (strstarts(imported_namespace, "module:")) {
+ pr_err("%s: module tries to import module namespace: %s\n",
+ mod->name, imported_namespace);
+ return -EPERM;
+ }
+ }
+
+ return 0;
+}
+
+static void free_modinfo(struct module *mod)
+{
+ const struct module_attribute *attr;
+ int i;
+
+ for (i = 0; (attr = modinfo_attrs[i]); i++) {
+ if (attr->free)
+ attr->free(mod);
+ }
+}
+
+bool __weak module_init_section(const char *name)
+{
+ return strstarts(name, ".init");
+}
+
+bool __weak module_exit_section(const char *name)
+{
+ return strstarts(name, ".exit");
+}
+
+static int validate_section_offset(const struct load_info *info, Elf_Shdr *shdr)
+{
+#if defined(CONFIG_64BIT)
+ unsigned long long secend;
+#else
+ unsigned long secend;
+#endif
+
+ /*
+ * Check for both overflow and offset/size being
+ * too large.
+ */
+ secend = shdr->sh_offset + shdr->sh_size;
+ if (secend < shdr->sh_offset || secend > info->len)
+ return -ENOEXEC;
+
+ return 0;
+}
+
+/**
+ * elf_validity_ehdr() - Checks an ELF header for module validity
+ * @info: Load info containing the ELF header to check
+ *
+ * Checks whether an ELF header could belong to a valid module. Checks:
+ *
+ * * ELF header is within the data the user provided
+ * * ELF magic is present
+ * * It is relocatable (not final linked, not core file, etc.)
+ * * The header's machine type matches what the architecture expects.
+ * * Optional arch-specific hook for other properties
+ * - module_elf_check_arch() is currently only used by PPC to check
+ * ELF ABI version, but may be used by others in the future.
+ *
+ * Return: %0 if valid, %-ENOEXEC on failure.
+ */
+static int elf_validity_ehdr(const struct load_info *info)
+{
+ if (info->len < sizeof(*(info->hdr))) {
+ pr_err("Invalid ELF header len %lu\n", info->len);
+ return -ENOEXEC;
+ }
+ if (memcmp(info->hdr->e_ident, ELFMAG, SELFMAG) != 0) {
+ pr_err("Invalid ELF header magic: != %s\n", ELFMAG);
+ return -ENOEXEC;
+ }
+ if (info->hdr->e_type != ET_REL) {
+ pr_err("Invalid ELF header type: %u != %u\n",
+ info->hdr->e_type, ET_REL);
+ return -ENOEXEC;
+ }
+ if (!elf_check_arch(info->hdr)) {
+ pr_err("Invalid architecture in ELF header: %u\n",
+ info->hdr->e_machine);
+ return -ENOEXEC;
+ }
+ if (!module_elf_check_arch(info->hdr)) {
+ pr_err("Invalid module architecture in ELF header: %u\n",
+ info->hdr->e_machine);
+ return -ENOEXEC;
+ }
+ return 0;
+}
+
+/**
+ * elf_validity_cache_sechdrs() - Cache section headers if valid
+ * @info: Load info to compute section headers from
+ *
+ * Checks:
+ *
+ * * ELF header is valid (see elf_validity_ehdr())
+ * * Section headers are the size we expect
+ * * Section array fits in the user provided data
+ * * Section index 0 is NULL
+ * * Section contents are inbounds
+ *
+ * Then updates @info with a &load_info->sechdrs pointer if valid.
+ *
+ * Return: %0 if valid, negative error code if validation failed.
+ */
+static int elf_validity_cache_sechdrs(struct load_info *info)
+{
+ Elf_Shdr *sechdrs;
+ Elf_Shdr *shdr;
+ int i;
+ int err;
+
+ err = elf_validity_ehdr(info);
+ if (err < 0)
+ return err;
+
+ if (info->hdr->e_shentsize != sizeof(Elf_Shdr)) {
+ pr_err("Invalid ELF section header size\n");
+ return -ENOEXEC;
+ }
+
+ /*
+ * e_shnum is 16 bits, and sizeof(Elf_Shdr) is
+ * known and small. So e_shnum * sizeof(Elf_Shdr)
+ * will not overflow unsigned long on any platform.
+ */
+ if (info->hdr->e_shoff >= info->len
+ || (info->hdr->e_shnum * sizeof(Elf_Shdr) >
+ info->len - info->hdr->e_shoff)) {
+ pr_err("Invalid ELF section header overflow\n");
+ return -ENOEXEC;
+ }
+
+ sechdrs = (void *)info->hdr + info->hdr->e_shoff;
+
+ /*
+ * The code assumes that section 0 has a length of zero and
+ * an addr of zero, so check for it.
+ */
+ if (sechdrs[0].sh_type != SHT_NULL
+ || sechdrs[0].sh_size != 0
+ || sechdrs[0].sh_addr != 0) {
+ pr_err("ELF Spec violation: section 0 type(%d)!=SH_NULL or non-zero len or addr\n",
+ sechdrs[0].sh_type);
+ return -ENOEXEC;
+ }
+
+ /* Validate contents are inbounds */
+ for (i = 1; i < info->hdr->e_shnum; i++) {
+ shdr = &sechdrs[i];
+ switch (shdr->sh_type) {
+ case SHT_NULL:
+ case SHT_NOBITS:
+ /* No contents, offset/size don't mean anything */
+ continue;
+ default:
+ err = validate_section_offset(info, shdr);
+ if (err < 0) {
+ pr_err("Invalid ELF section in module (section %u type %u)\n",
+ i, shdr->sh_type);
+ return err;
+ }
+ }
+ }
+
+ info->sechdrs = sechdrs;
+
+ return 0;
+}
+
+/**
+ * elf_validity_cache_secstrings() - Caches section names if valid
+ * @info: Load info to cache section names from. Must have valid sechdrs.
+ *
+ * Specifically checks:
+ *
+ * * Section name table index is inbounds of section headers
+ * * Section name table is not empty
+ * * Section name table is NUL terminated
+ * * All section name offsets are inbounds of the section
+ *
+ * Then updates @info with a &load_info->secstrings pointer if valid.
+ *
+ * Return: %0 if valid, negative error code if validation failed.
+ */
+static int elf_validity_cache_secstrings(struct load_info *info)
+{
+ Elf_Shdr *strhdr, *shdr;
+ char *secstrings;
+ int i;
+
+ /*
+ * Verify if the section name table index is valid.
+ */
+ if (info->hdr->e_shstrndx == SHN_UNDEF
+ || info->hdr->e_shstrndx >= info->hdr->e_shnum) {
+ pr_err("Invalid ELF section name index: %d || e_shstrndx (%d) >= e_shnum (%d)\n",
+ info->hdr->e_shstrndx, info->hdr->e_shstrndx,
+ info->hdr->e_shnum);
+ return -ENOEXEC;
+ }
+
+ strhdr = &info->sechdrs[info->hdr->e_shstrndx];
+
+ /*
+ * The section name table must be NUL-terminated, as required
+ * by the spec. This makes strcmp and pr_* calls that access
+ * strings in the section safe.
+ */
+ secstrings = (void *)info->hdr + strhdr->sh_offset;
+ if (strhdr->sh_size == 0) {
+ pr_err("empty section name table\n");
+ return -ENOEXEC;
+ }
+ if (secstrings[strhdr->sh_size - 1] != '\0') {
+ pr_err("ELF Spec violation: section name table isn't null terminated\n");
+ return -ENOEXEC;
+ }
+
+ for (i = 0; i < info->hdr->e_shnum; i++) {
+ shdr = &info->sechdrs[i];
+ /* SHT_NULL means sh_name has an undefined value */
+ if (shdr->sh_type == SHT_NULL)
+ continue;
+ if (shdr->sh_name >= strhdr->sh_size) {
+ pr_err("Invalid ELF section name in module (section %u type %u)\n",
+ i, shdr->sh_type);
+ return -ENOEXEC;
+ }
+ }
+
+ info->secstrings = secstrings;
+ return 0;
+}
+
+/**
+ * elf_validity_cache_index_info() - Validate and cache modinfo section
+ * @info: Load info to populate the modinfo index on.
+ * Must have &load_info->sechdrs and &load_info->secstrings populated
+ *
+ * Checks that if there is a .modinfo section, it is unique.
+ * Then, it caches its index in &load_info->index.info.
+ * Finally, it tries to populate the name to improve error messages.
+ *
+ * Return: %0 if valid, %-ENOEXEC if multiple modinfo sections were found.
+ */
+static int elf_validity_cache_index_info(struct load_info *info)
+{
+ int info_idx;
+
+ info_idx = find_any_unique_sec(info, ".modinfo");
+
+ if (info_idx == 0)
+ /* Early return, no .modinfo */
+ return 0;
+
+ if (info_idx < 0) {
+ pr_err("Only one .modinfo section must exist.\n");
+ return -ENOEXEC;
+ }
+
+ info->index.info = info_idx;
+ /* Try to find a name early so we can log errors with a module name */
+ info->name = get_modinfo(info, "name");
+
+ return 0;
+}
+
+/**
+ * elf_validity_cache_index_mod() - Validates and caches this_module section
+ * @info: Load info to cache this_module on.
+ * Must have &load_info->sechdrs and &load_info->secstrings populated
+ *
+ * The ".gnu.linkonce.this_module" ELF section is special. It is what modpost
+ * uses to refer to __this_module and let's use rely on THIS_MODULE to point
+ * to &__this_module properly. The kernel's modpost declares it on each
+ * modules's *.mod.c file. If the struct module of the kernel changes a full
+ * kernel rebuild is required.
+ *
+ * We have a few expectations for this special section, this function
+ * validates all this for us:
+ *
+ * * The section has contents
+ * * The section is unique
+ * * We expect the kernel to always have to allocate it: SHF_ALLOC
+ * * The section size must match the kernel's run time's struct module
+ * size
+ *
+ * If all checks pass, the index will be cached in &load_info->index.mod
+ *
+ * Return: %0 on validation success, %-ENOEXEC on failure
+ */
+static int elf_validity_cache_index_mod(struct load_info *info)
+{
+ Elf_Shdr *shdr;
+ int mod_idx;
+
+ mod_idx = find_any_unique_sec(info, ".gnu.linkonce.this_module");
+ if (mod_idx <= 0) {
+ pr_err("module %s: Exactly one .gnu.linkonce.this_module section must exist.\n",
+ info->name ?: "(missing .modinfo section or name field)");
+ return -ENOEXEC;
+ }
+
+ shdr = &info->sechdrs[mod_idx];
+
+ if (shdr->sh_type == SHT_NOBITS) {
+ pr_err("module %s: .gnu.linkonce.this_module section must have a size set\n",
+ info->name ?: "(missing .modinfo section or name field)");
+ return -ENOEXEC;
+ }
+
+ if (!(shdr->sh_flags & SHF_ALLOC)) {
+ pr_err("module %s: .gnu.linkonce.this_module must occupy memory during process execution\n",
+ info->name ?: "(missing .modinfo section or name field)");
+ return -ENOEXEC;
+ }
+
+ if (shdr->sh_size != sizeof(struct module)) {
+ pr_err("module %s: .gnu.linkonce.this_module section size must match the kernel's built struct module size at run time\n",
+ info->name ?: "(missing .modinfo section or name field)");
+ return -ENOEXEC;
+ }
+
+ info->index.mod = mod_idx;
+
+ return 0;
+}
+
+/**
+ * elf_validity_cache_index_sym() - Validate and cache symtab index
+ * @info: Load info to cache symtab index in.
+ * Must have &load_info->sechdrs and &load_info->secstrings populated.
+ *
+ * Checks that there is exactly one symbol table, then caches its index in
+ * &load_info->index.sym.
+ *
+ * Return: %0 if valid, %-ENOEXEC on failure.
+ */
+static int elf_validity_cache_index_sym(struct load_info *info)
+{
+ unsigned int sym_idx;
+ unsigned int num_sym_secs = 0;
+ int i;
+
+ for (i = 1; i < info->hdr->e_shnum; i++) {
+ if (info->sechdrs[i].sh_type == SHT_SYMTAB) {
+ num_sym_secs++;
+ sym_idx = i;
+ }
+ }
+
+ if (num_sym_secs != 1) {
+ pr_warn("%s: module has no symbols (stripped?)\n",
+ info->name ?: "(missing .modinfo section or name field)");
+ return -ENOEXEC;
+ }
+
+ info->index.sym = sym_idx;
+
+ return 0;
+}
+
+/**
+ * elf_validity_cache_index_str() - Validate and cache strtab index
+ * @info: Load info to cache strtab index in.
+ * Must have &load_info->sechdrs and &load_info->secstrings populated.
+ * Must have &load_info->index.sym populated.
+ *
+ * Looks at the symbol table's associated string table, makes sure it is
+ * in-bounds, and caches it.
+ *
+ * Return: %0 if valid, %-ENOEXEC on failure.
+ */
+static int elf_validity_cache_index_str(struct load_info *info)
+{
+ unsigned int str_idx = info->sechdrs[info->index.sym].sh_link;
+
+ if (str_idx == SHN_UNDEF || str_idx >= info->hdr->e_shnum) {
+ pr_err("Invalid ELF sh_link!=SHN_UNDEF(%d) or (sh_link(%d) >= hdr->e_shnum(%d)\n",
+ str_idx, str_idx, info->hdr->e_shnum);
+ return -ENOEXEC;
+ }
+
+ info->index.str = str_idx;
+ return 0;
+}
+
+/**
+ * elf_validity_cache_index_versions() - Validate and cache version indices
+ * @info: Load info to cache version indices in.
+ * Must have &load_info->sechdrs and &load_info->secstrings populated.
+ * @flags: Load flags, relevant to suppress version loading, see
+ * uapi/linux/module.h
+ *
+ * If we're ignoring modversions based on @flags, zero all version indices
+ * and return validity. Othewrise check:
+ *
+ * * If "__version_ext_crcs" is present, "__version_ext_names" is present
+ * * There is a name present for every crc
+ *
+ * Then populate:
+ *
+ * * &load_info->index.vers
+ * * &load_info->index.vers_ext_crc
+ * * &load_info->index.vers_ext_names
+ *
+ * if present.
+ *
+ * Return: %0 if valid, %-ENOEXEC on failure.
+ */
+static int elf_validity_cache_index_versions(struct load_info *info, int flags)
+{
+ unsigned int vers_ext_crc;
+ unsigned int vers_ext_name;
+ size_t crc_count;
+ size_t remaining_len;
+ size_t name_size;
+ char *name;
+
+ /* If modversions were suppressed, pretend we didn't find any */
+ if (flags & MODULE_INIT_IGNORE_MODVERSIONS) {
+ info->index.vers = 0;
+ info->index.vers_ext_crc = 0;
+ info->index.vers_ext_name = 0;
+ return 0;
+ }
+
+ vers_ext_crc = find_sec(info, "__version_ext_crcs");
+ vers_ext_name = find_sec(info, "__version_ext_names");
+
+ /* If we have one field, we must have the other */
+ if (!!vers_ext_crc != !!vers_ext_name) {
+ pr_err("extended version crc+name presence does not match");
+ return -ENOEXEC;
+ }
+
+ /*
+ * If we have extended version information, we should have the same
+ * number of entries in every section.
+ */
+ if (vers_ext_crc) {
+ crc_count = info->sechdrs[vers_ext_crc].sh_size / sizeof(u32);
+ name = (void *)info->hdr +
+ info->sechdrs[vers_ext_name].sh_offset;
+ remaining_len = info->sechdrs[vers_ext_name].sh_size;
+
+ while (crc_count--) {
+ name_size = strnlen(name, remaining_len) + 1;
+ if (name_size > remaining_len) {
+ pr_err("more extended version crcs than names");
+ return -ENOEXEC;
+ }
+ remaining_len -= name_size;
+ name += name_size;
+ }
+ }
+
+ info->index.vers = find_sec(info, "__versions");
+ info->index.vers_ext_crc = vers_ext_crc;
+ info->index.vers_ext_name = vers_ext_name;
+ return 0;
+}
+
+/**
+ * elf_validity_cache_index() - Resolve, validate, cache section indices
+ * @info: Load info to read from and update.
+ * &load_info->sechdrs and &load_info->secstrings must be populated.
+ * @flags: Load flags, relevant to suppress version loading, see
+ * uapi/linux/module.h
+ *
+ * Populates &load_info->index, validating as it goes.
+ * See child functions for per-field validation:
+ *
+ * * elf_validity_cache_index_info()
+ * * elf_validity_cache_index_mod()
+ * * elf_validity_cache_index_sym()
+ * * elf_validity_cache_index_str()
+ * * elf_validity_cache_index_versions()
+ *
+ * If CONFIG_SMP is enabled, load the percpu section by name with no
+ * validation.
+ *
+ * Return: 0 on success, negative error code if an index failed validation.
+ */
+static int elf_validity_cache_index(struct load_info *info, int flags)
+{
+ int err;
+
+ err = elf_validity_cache_index_info(info);
+ if (err < 0)
+ return err;
+ err = elf_validity_cache_index_mod(info);
+ if (err < 0)
+ return err;
+ err = elf_validity_cache_index_sym(info);
+ if (err < 0)
+ return err;
+ err = elf_validity_cache_index_str(info);
+ if (err < 0)
+ return err;
+ err = elf_validity_cache_index_versions(info, flags);
+ if (err < 0)
+ return err;
+
+ info->index.pcpu = find_pcpusec(info);
+
+ return 0;
+}
+
+/**
+ * elf_validity_cache_strtab() - Validate and cache symbol string table
+ * @info: Load info to read from and update.
+ * Must have &load_info->sechdrs and &load_info->secstrings populated.
+ * Must have &load_info->index populated.
+ *
+ * Checks:
+ *
+ * * The string table is not empty.
+ * * The string table starts and ends with NUL (required by ELF spec).
+ * * Every &Elf_Sym->st_name offset in the symbol table is inbounds of the
+ * string table.
+ *
+ * And caches the pointer as &load_info->strtab in @info.
+ *
+ * Return: 0 on success, negative error code if a check failed.
+ */
+static int elf_validity_cache_strtab(struct load_info *info)
+{
+ Elf_Shdr *str_shdr = &info->sechdrs[info->index.str];
+ Elf_Shdr *sym_shdr = &info->sechdrs[info->index.sym];
+ char *strtab = (char *)info->hdr + str_shdr->sh_offset;
+ Elf_Sym *syms = (void *)info->hdr + sym_shdr->sh_offset;
+ int i;
+
+ if (str_shdr->sh_size == 0) {
+ pr_err("empty symbol string table\n");
+ return -ENOEXEC;
+ }
+ if (strtab[0] != '\0') {
+ pr_err("symbol string table missing leading NUL\n");
+ return -ENOEXEC;
+ }
+ if (strtab[str_shdr->sh_size - 1] != '\0') {
+ pr_err("symbol string table isn't NUL terminated\n");
+ return -ENOEXEC;
+ }
+
+ /*
+ * Now that we know strtab is correctly structured, check symbol
+ * starts are inbounds before they're used later.
+ */
+ for (i = 0; i < sym_shdr->sh_size / sizeof(*syms); i++) {
+ if (syms[i].st_name >= str_shdr->sh_size) {
+ pr_err("symbol name out of bounds in string table");
+ return -ENOEXEC;
+ }
+ }
+
+ info->strtab = strtab;
+ return 0;
+}
+
+/*
+ * Check userspace passed ELF module against our expectations, and cache
+ * useful variables for further processing as we go.
+ *
+ * This does basic validity checks against section offsets and sizes, the
+ * section name string table, and the indices used for it (sh_name).
+ *
+ * As a last step, since we're already checking the ELF sections we cache
+ * useful variables which will be used later for our convenience:
+ *
+ * o pointers to section headers
+ * o cache the modinfo symbol section
+ * o cache the string symbol section
+ * o cache the module section
+ *
+ * As a last step we set info->mod to the temporary copy of the module in
+ * info->hdr. The final one will be allocated in move_module(). Any
+ * modifications we make to our copy of the module will be carried over
+ * to the final minted module.
+ */
+static int elf_validity_cache_copy(struct load_info *info, int flags)
+{
+ int err;
+
+ err = elf_validity_cache_sechdrs(info);
+ if (err < 0)
+ return err;
+ err = elf_validity_cache_secstrings(info);
+ if (err < 0)
+ return err;
+ err = elf_validity_cache_index(info, flags);
+ if (err < 0)
+ return err;
+ err = elf_validity_cache_strtab(info);
+ if (err < 0)
+ return err;
+
+ /* This is temporary: point mod into copy of data. */
+ info->mod = (void *)info->hdr + info->sechdrs[info->index.mod].sh_offset;
+
+ /*
+ * If we didn't load the .modinfo 'name' field earlier, fall back to
+ * on-disk struct mod 'name' field.
+ */
+ if (!info->name)
+ info->name = info->mod->name;
+
+ return 0;
+}
+
+#define COPY_CHUNK_SIZE (16*PAGE_SIZE)
+
+static int copy_chunked_from_user(void *dst, const void __user *usrc, unsigned long len)
+{
+ do {
+ unsigned long n = min(len, COPY_CHUNK_SIZE);
+
+ if (copy_from_user(dst, usrc, n) != 0)
+ return -EFAULT;
+ cond_resched();
+ dst += n;
+ usrc += n;
+ len -= n;
+ } while (len);
+ return 0;
+}
+
+static int check_modinfo_livepatch(struct module *mod, struct load_info *info)
+{
+ if (!get_modinfo(info, "livepatch"))
+ /* Nothing more to do */
+ return 0;
+
+ if (set_livepatch_module(mod))
+ return 0;
+
+ pr_err("%s: module is marked as livepatch module, but livepatch support is disabled",
+ mod->name);
+ return -ENOEXEC;
+}
+
+static void check_modinfo_retpoline(struct module *mod, struct load_info *info)
+{
+ if (retpoline_module_ok(get_modinfo(info, "retpoline")))
+ return;
+
+ pr_warn("%s: loading module not compiled with retpoline compiler.\n",
+ mod->name);
+}
+
+/* Sets info->hdr and info->len. */
+static int copy_module_from_user(const void __user *umod, unsigned long len,
+ struct load_info *info)
+{
+ int err;
+
+ info->len = len;
+ if (info->len < sizeof(*(info->hdr)))
+ return -ENOEXEC;
+
+ err = security_kernel_load_data(LOADING_MODULE, true);
+ if (err)
+ return err;
+
+ /* Suck in entire file: we'll want most of it. */
+ info->hdr = __vmalloc(info->len, GFP_KERNEL | __GFP_NOWARN);
+ if (!info->hdr)
+ return -ENOMEM;
+
+ if (copy_chunked_from_user(info->hdr, umod, info->len) != 0) {
+ err = -EFAULT;
+ goto out;
+ }
+
+ err = security_kernel_post_load_data((char *)info->hdr, info->len,
+ LOADING_MODULE, "init_module");
+out:
+ if (err)
+ vfree(info->hdr);
+
+ return err;
+}
+
+static void free_copy(struct load_info *info, int flags)
+{
+ if (flags & MODULE_INIT_COMPRESSED_FILE)
+ module_decompress_cleanup(info);
+ else
+ vfree(info->hdr);
+}
+
+static int rewrite_section_headers(struct load_info *info, int flags)
+{
+ unsigned int i;
+
+ /* This should always be true, but let's be sure. */
+ info->sechdrs[0].sh_addr = 0;
+
+ for (i = 1; i < info->hdr->e_shnum; i++) {
+ Elf_Shdr *shdr = &info->sechdrs[i];
+
+ /*
+ * Mark all sections sh_addr with their address in the
+ * temporary image.
+ */
+ shdr->sh_addr = (size_t)info->hdr + shdr->sh_offset;
+
+ }
+
+ /* Track but don't keep modinfo and version sections. */
+ info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC;
+ info->sechdrs[info->index.vers_ext_crc].sh_flags &=
+ ~(unsigned long)SHF_ALLOC;
+ info->sechdrs[info->index.vers_ext_name].sh_flags &=
+ ~(unsigned long)SHF_ALLOC;
+ info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC;
+
+ return 0;
+}
+
+static const char *const module_license_offenders[] = {
+ /* driverloader was caught wrongly pretending to be under GPL */
+ "driverloader",
+
+ /* lve claims to be GPL but upstream won't provide source */
+ "lve",
+};
+
+/*
+ * These calls taint the kernel depending certain module circumstances */
+static void module_augment_kernel_taints(struct module *mod, struct load_info *info)
+{
+ int prev_taint = test_taint(TAINT_PROPRIETARY_MODULE);
+ size_t i;
+
+ if (!get_modinfo(info, "intree")) {
+ if (!test_taint(TAINT_OOT_MODULE))
+ pr_warn("%s: loading out-of-tree module taints kernel.\n",
+ mod->name);
+ add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK);
+ }
+
+ check_modinfo_retpoline(mod, info);
+
+ if (get_modinfo(info, "staging")) {
+ add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK);
+ pr_warn("%s: module is from the staging directory, the quality "
+ "is unknown, you have been warned.\n", mod->name);
+ }
+
+ if (is_livepatch_module(mod)) {
+ add_taint_module(mod, TAINT_LIVEPATCH, LOCKDEP_STILL_OK);
+ pr_notice_once("%s: tainting kernel with TAINT_LIVEPATCH\n",
+ mod->name);
+ }
+
+ module_license_taint_check(mod, get_modinfo(info, "license"));
+
+ if (get_modinfo(info, "test")) {
+ if (!test_taint(TAINT_TEST))
+ pr_warn("%s: loading test module taints kernel.\n",
+ mod->name);
+ add_taint_module(mod, TAINT_TEST, LOCKDEP_STILL_OK);
+ }
+#ifdef CONFIG_MODULE_SIG
+ mod->sig_ok = info->sig_ok;
+ if (!mod->sig_ok) {
+ pr_notice_once("%s: module verification failed: signature "
+ "and/or required key missing - tainting "
+ "kernel\n", mod->name);
+ add_taint_module(mod, TAINT_UNSIGNED_MODULE, LOCKDEP_STILL_OK);
+ }
+#endif
+
+ /*
+ * ndiswrapper is under GPL by itself, but loads proprietary modules.
+ * Don't use add_taint_module(), as it would prevent ndiswrapper from
+ * using GPL-only symbols it needs.
+ */
+ if (strcmp(mod->name, "ndiswrapper") == 0)
+ add_taint(TAINT_PROPRIETARY_MODULE, LOCKDEP_NOW_UNRELIABLE);
+
+ for (i = 0; i < ARRAY_SIZE(module_license_offenders); ++i) {
+ if (strcmp(mod->name, module_license_offenders[i]) == 0)
+ add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
+ LOCKDEP_NOW_UNRELIABLE);
+ }
+
+ if (!prev_taint && test_taint(TAINT_PROPRIETARY_MODULE))
+ pr_warn("%s: module license taints kernel.\n", mod->name);
+
+}
+
+static int check_modinfo(struct module *mod, struct load_info *info, int flags)
+{
+ const char *modmagic = get_modinfo(info, "vermagic");
+ int err;
+
+ if (flags & MODULE_INIT_IGNORE_VERMAGIC)
+ modmagic = NULL;
+
+ /* This is allowed: modprobe --force will invalidate it. */
+ if (!modmagic) {
+ err = try_to_force_load(mod, "bad vermagic");
+ if (err)
+ return err;
+ } else if (!same_magic(modmagic, vermagic, info->index.vers)) {
+ pr_err("%s: version magic '%s' should be '%s'\n",
+ info->name, modmagic, vermagic);
+ return -ENOEXEC;
+ }
+
+ err = check_modinfo_livepatch(mod, info);
+ if (err)
+ return err;
+
+ return 0;
+}
+
+static int find_module_sections(struct module *mod, struct load_info *info)
+{
+ mod->kp = section_objs(info, "__param",
+ sizeof(*mod->kp), &mod->num_kp);
+ mod->syms = section_objs(info, "__ksymtab",
+ sizeof(*mod->syms), &mod->num_syms);
+ mod->crcs = section_addr(info, "__kcrctab");
+ mod->gpl_syms = section_objs(info, "__ksymtab_gpl",
+ sizeof(*mod->gpl_syms),
+ &mod->num_gpl_syms);
+ mod->gpl_crcs = section_addr(info, "__kcrctab_gpl");
+
+#ifdef CONFIG_CONSTRUCTORS
+ mod->ctors = section_objs(info, ".ctors",
+ sizeof(*mod->ctors), &mod->num_ctors);
+ if (!mod->ctors)
+ mod->ctors = section_objs(info, ".init_array",
+ sizeof(*mod->ctors), &mod->num_ctors);
+ else if (find_sec(info, ".init_array")) {
+ /*
+ * This shouldn't happen with same compiler and binutils
+ * building all parts of the module.
+ */
+ pr_warn("%s: has both .ctors and .init_array.\n",
+ mod->name);
+ return -EINVAL;
+ }
+#endif
+
+ mod->noinstr_text_start = section_objs(info, ".noinstr.text", 1,
+ &mod->noinstr_text_size);
+
+#ifdef CONFIG_TRACEPOINTS
+ mod->tracepoints_ptrs = section_objs(info, "__tracepoints_ptrs",
+ sizeof(*mod->tracepoints_ptrs),
+ &mod->num_tracepoints);
+#endif
+#ifdef CONFIG_TREE_SRCU
+ mod->srcu_struct_ptrs = section_objs(info, "___srcu_struct_ptrs",
+ sizeof(*mod->srcu_struct_ptrs),
+ &mod->num_srcu_structs);
+#endif
+#ifdef CONFIG_BPF_EVENTS
+ mod->bpf_raw_events = section_objs(info, "__bpf_raw_tp_map",
+ sizeof(*mod->bpf_raw_events),
+ &mod->num_bpf_raw_events);
+#endif
+#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
+ mod->btf_data = any_section_objs(info, ".BTF", 1, &mod->btf_data_size);
+ mod->btf_base_data = any_section_objs(info, ".BTF.base", 1,
+ &mod->btf_base_data_size);
+#endif
+#ifdef CONFIG_JUMP_LABEL
+ mod->jump_entries = section_objs(info, "__jump_table",
+ sizeof(*mod->jump_entries),
+ &mod->num_jump_entries);
+#endif
+#ifdef CONFIG_EVENT_TRACING
+ mod->trace_events = section_objs(info, "_ftrace_events",
+ sizeof(*mod->trace_events),
+ &mod->num_trace_events);
+ mod->trace_evals = section_objs(info, "_ftrace_eval_map",
+ sizeof(*mod->trace_evals),
+ &mod->num_trace_evals);
+#endif
+#ifdef CONFIG_TRACING
+ mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt",
+ sizeof(*mod->trace_bprintk_fmt_start),
+ &mod->num_trace_bprintk_fmt);
+#endif
+#ifdef CONFIG_DYNAMIC_FTRACE
+ /* sechdrs[0].sh_size is always zero */
+ mod->ftrace_callsites = section_objs(info, FTRACE_CALLSITE_SECTION,
+ sizeof(*mod->ftrace_callsites),
+ &mod->num_ftrace_callsites);
+#endif
+#ifdef CONFIG_FUNCTION_ERROR_INJECTION
+ mod->ei_funcs = section_objs(info, "_error_injection_whitelist",
+ sizeof(*mod->ei_funcs),
+ &mod->num_ei_funcs);
+#endif
+#ifdef CONFIG_KPROBES
+ mod->kprobes_text_start = section_objs(info, ".kprobes.text", 1,
+ &mod->kprobes_text_size);
+ mod->kprobe_blacklist = section_objs(info, "_kprobe_blacklist",
+ sizeof(unsigned long),
+ &mod->num_kprobe_blacklist);
+#endif
+#ifdef CONFIG_PRINTK_INDEX
+ mod->printk_index_start = section_objs(info, ".printk_index",
+ sizeof(*mod->printk_index_start),
+ &mod->printk_index_size);
+#endif
+#ifdef CONFIG_HAVE_STATIC_CALL_INLINE
+ mod->static_call_sites = section_objs(info, ".static_call_sites",
+ sizeof(*mod->static_call_sites),
+ &mod->num_static_call_sites);
+#endif
+#if IS_ENABLED(CONFIG_KUNIT)
+ mod->kunit_suites = section_objs(info, ".kunit_test_suites",
+ sizeof(*mod->kunit_suites),
+ &mod->num_kunit_suites);
+ mod->kunit_init_suites = section_objs(info, ".kunit_init_test_suites",
+ sizeof(*mod->kunit_init_suites),
+ &mod->num_kunit_init_suites);
+#endif
+
+ mod->extable = section_objs(info, "__ex_table",
+ sizeof(*mod->extable), &mod->num_exentries);
+
+ if (section_addr(info, "__obsparm"))
+ pr_warn("%s: Ignoring obsolete parameters\n", mod->name);
+
+#ifdef CONFIG_DYNAMIC_DEBUG_CORE
+ mod->dyndbg_info.descs = section_objs(info, "__dyndbg",
+ sizeof(*mod->dyndbg_info.descs),
+ &mod->dyndbg_info.num_descs);
+ mod->dyndbg_info.classes = section_objs(info, "__dyndbg_classes",
+ sizeof(*mod->dyndbg_info.classes),
+ &mod->dyndbg_info.num_classes);
+#endif
+
+ return 0;
+}
+
+static int move_module(struct module *mod, struct load_info *info)
+{
+ int i, ret;
+ enum mod_mem_type t = MOD_MEM_NUM_TYPES;
+ bool codetag_section_found = false;
+
+ for_each_mod_mem_type(type) {
+ if (!mod->mem[type].size) {
+ mod->mem[type].base = NULL;
+ continue;
+ }
+
+ ret = module_memory_alloc(mod, type);
+ if (ret) {
+ t = type;
+ goto out_err;
+ }
+ }
+
+ /* Transfer each section which specifies SHF_ALLOC */
+ pr_debug("Final section addresses for %s:\n", mod->name);
+ for (i = 0; i < info->hdr->e_shnum; i++) {
+ void *dest;
+ Elf_Shdr *shdr = &info->sechdrs[i];
+ const char *sname;
+
+ if (!(shdr->sh_flags & SHF_ALLOC))
+ continue;
+
+ sname = info->secstrings + shdr->sh_name;
+ /*
+ * Load codetag sections separately as they might still be used
+ * after module unload.
+ */
+ if (codetag_needs_module_section(mod, sname, shdr->sh_size)) {
+ dest = codetag_alloc_module_section(mod, sname, shdr->sh_size,
+ arch_mod_section_prepend(mod, i), shdr->sh_addralign);
+ if (WARN_ON(!dest)) {
+ ret = -EINVAL;
+ goto out_err;
+ }
+ if (IS_ERR(dest)) {
+ ret = PTR_ERR(dest);
+ goto out_err;
+ }
+ codetag_section_found = true;
+ } else {
+ enum mod_mem_type type = shdr->sh_entsize >> SH_ENTSIZE_TYPE_SHIFT;
+ unsigned long offset = shdr->sh_entsize & SH_ENTSIZE_OFFSET_MASK;
+
+ dest = mod->mem[type].base + offset;
+ }
+
+ if (shdr->sh_type != SHT_NOBITS) {
+ /*
+ * Our ELF checker already validated this, but let's
+ * be pedantic and make the goal clearer. We actually
+ * end up copying over all modifications made to the
+ * userspace copy of the entire struct module.
+ */
+ if (i == info->index.mod &&
+ (WARN_ON_ONCE(shdr->sh_size != sizeof(struct module)))) {
+ ret = -ENOEXEC;
+ goto out_err;
+ }
+ memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size);
+ }
+ /*
+ * Update the userspace copy's ELF section address to point to
+ * our newly allocated memory as a pure convenience so that
+ * users of info can keep taking advantage and using the newly
+ * minted official memory area.
+ */
+ shdr->sh_addr = (unsigned long)dest;
+ pr_debug("\t0x%lx 0x%.8lx %s\n", (long)shdr->sh_addr,
+ (long)shdr->sh_size, info->secstrings + shdr->sh_name);
+ }
+
+ return 0;
+out_err:
+ module_memory_restore_rox(mod);
+ while (t--)
+ module_memory_free(mod, t);
+ if (codetag_section_found)
+ codetag_free_module_sections(mod);
+
+ return ret;
+}
+
+static int check_export_symbol_versions(struct module *mod)
+{
+#ifdef CONFIG_MODVERSIONS
+ if ((mod->num_syms && !mod->crcs) ||
+ (mod->num_gpl_syms && !mod->gpl_crcs)) {
+ return try_to_force_load(mod,
+ "no versions for exported symbols");
+ }
+#endif
+ return 0;
+}
+
+static void flush_module_icache(const struct module *mod)
+{
+ /*
+ * Flush the instruction cache, since we've played with text.
+ * Do it before processing of module parameters, so the module
+ * can provide parameter accessor functions of its own.
+ */
+ for_each_mod_mem_type(type) {
+ const struct module_memory *mod_mem = &mod->mem[type];
+
+ if (mod_mem->size) {
+ flush_icache_range((unsigned long)mod_mem->base,
+ (unsigned long)mod_mem->base + mod_mem->size);
+ }
+ }
+}
+
+bool __weak module_elf_check_arch(Elf_Ehdr *hdr)
+{
+ return true;
+}
+
+int __weak module_frob_arch_sections(Elf_Ehdr *hdr,
+ Elf_Shdr *sechdrs,
+ char *secstrings,
+ struct module *mod)
+{
+ return 0;
+}
+
+/* module_blacklist is a comma-separated list of module names */
+static char *module_blacklist;
+static bool blacklisted(const char *module_name)
+{
+ const char *p;
+ size_t len;
+
+ if (!module_blacklist)
+ return false;
+
+ for (p = module_blacklist; *p; p += len) {
+ len = strcspn(p, ",");
+ if (strlen(module_name) == len && !memcmp(module_name, p, len))
+ return true;
+ if (p[len] == ',')
+ len++;
+ }
+ return false;
+}
+core_param(module_blacklist, module_blacklist, charp, 0400);
+
+static struct module *layout_and_allocate(struct load_info *info, int flags)
+{
+ struct module *mod;
+ int err;
+
+ /* Allow arches to frob section contents and sizes. */
+ err = module_frob_arch_sections(info->hdr, info->sechdrs,
+ info->secstrings, info->mod);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ err = module_enforce_rwx_sections(info->hdr, info->sechdrs,
+ info->secstrings, info->mod);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ /* We will do a special allocation for per-cpu sections later. */
+ info->sechdrs[info->index.pcpu].sh_flags &= ~(unsigned long)SHF_ALLOC;
+
+ /*
+ * Mark relevant sections as SHF_RO_AFTER_INIT so layout_sections() can
+ * put them in the right place.
+ * Note: ro_after_init sections also have SHF_{WRITE,ALLOC} set.
+ */
+ module_mark_ro_after_init(info->hdr, info->sechdrs, info->secstrings);
+
+ /*
+ * Determine total sizes, and put offsets in sh_entsize. For now
+ * this is done generically; there doesn't appear to be any
+ * special cases for the architectures.
+ */
+ layout_sections(info->mod, info);
+ layout_symtab(info->mod, info);
+
+ /* Allocate and move to the final place */
+ err = move_module(info->mod, info);
+ if (err)
+ return ERR_PTR(err);
+
+ /* Module has been copied to its final place now: return it. */
+ mod = (void *)info->sechdrs[info->index.mod].sh_addr;
+ kmemleak_load_module(mod, info);
+ codetag_module_replaced(info->mod, mod);
+
+ return mod;
+}
+
+/* mod is no longer valid after this! */
+static void module_deallocate(struct module *mod, struct load_info *info)
+{
+ percpu_modfree(mod);
+ module_arch_freeing_init(mod);
+ codetag_free_module_sections(mod);
+
+ free_mod_mem(mod);
+}
+
+int __weak module_finalize(const Elf_Ehdr *hdr,
+ const Elf_Shdr *sechdrs,
+ struct module *me)
+{
+ return 0;
+}
+
+static int post_relocation(struct module *mod, const struct load_info *info)
+{
+ /* Sort exception table now relocations are done. */
+ sort_extable(mod->extable, mod->extable + mod->num_exentries);
+
+ /* Copy relocated percpu area over. */
+ percpu_modcopy(mod, (void *)info->sechdrs[info->index.pcpu].sh_addr,
+ info->sechdrs[info->index.pcpu].sh_size);
+
+ /* Setup kallsyms-specific fields. */
+ add_kallsyms(mod, info);
+
+ /* Arch-specific module finalizing. */
+ return module_finalize(info->hdr, info->sechdrs, mod);
+}
+
+/* Call module constructors. */
+static void do_mod_ctors(struct module *mod)
+{
+#ifdef CONFIG_CONSTRUCTORS
+ unsigned long i;
+
+ for (i = 0; i < mod->num_ctors; i++)
+ mod->ctors[i]();
+#endif
+}
+
+/* For freeing module_init on success, in case kallsyms traversing */
+struct mod_initfree {
+ struct llist_node node;
+ void *init_text;
+ void *init_data;
+ void *init_rodata;
+};
+
+static void do_free_init(struct work_struct *w)
+{
+ struct llist_node *pos, *n, *list;
+ struct mod_initfree *initfree;
+
+ list = llist_del_all(&init_free_list);
+
+ synchronize_rcu();
+
+ llist_for_each_safe(pos, n, list) {
+ initfree = container_of(pos, struct mod_initfree, node);
+ execmem_free(initfree->init_text);
+ execmem_free(initfree->init_data);
+ execmem_free(initfree->init_rodata);
+ kfree(initfree);
+ }
+}
+
+void flush_module_init_free_work(void)
+{
+ flush_work(&init_free_wq);
+}
+
+#undef MODULE_PARAM_PREFIX
+#define MODULE_PARAM_PREFIX "module."
+/* Default value for module->async_probe_requested */
+static bool async_probe;
+module_param(async_probe, bool, 0644);
+
+/*
+ * This is where the real work happens.
+ *
+ * Keep it uninlined to provide a reliable breakpoint target, e.g. for the gdb
+ * helper command 'lx-symbols'.
+ */
+static noinline int do_init_module(struct module *mod)
+{
+ int ret = 0;
+ struct mod_initfree *freeinit;
+#if defined(CONFIG_MODULE_STATS)
+ unsigned int text_size = 0, total_size = 0;
+
+ for_each_mod_mem_type(type) {
+ const struct module_memory *mod_mem = &mod->mem[type];
+ if (mod_mem->size) {
+ total_size += mod_mem->size;
+ if (type == MOD_TEXT || type == MOD_INIT_TEXT)
+ text_size += mod_mem->size;
+ }
+ }
+#endif
+
+ freeinit = kmalloc(sizeof(*freeinit), GFP_KERNEL);
+ if (!freeinit) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+ freeinit->init_text = mod->mem[MOD_INIT_TEXT].base;
+ freeinit->init_data = mod->mem[MOD_INIT_DATA].base;
+ freeinit->init_rodata = mod->mem[MOD_INIT_RODATA].base;
+
+ do_mod_ctors(mod);
+ /* Start the module */
+ if (mod->init != NULL)
+ ret = do_one_initcall(mod->init);
+ if (ret < 0) {
+ goto fail_free_freeinit;
+ }
+ if (ret > 0) {
+ pr_warn("%s: '%s'->init suspiciously returned %d, it should "
+ "follow 0/-E convention\n"
+ "%s: loading module anyway...\n",
+ __func__, mod->name, ret, __func__);
+ dump_stack();
+ }
+
+ /* Now it's a first class citizen! */
+ mod->state = MODULE_STATE_LIVE;
+ blocking_notifier_call_chain(&module_notify_list,
+ MODULE_STATE_LIVE, mod);
+
+ /* Delay uevent until module has finished its init routine */
+ kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD);
+
+ /*
+ * We need to finish all async code before the module init sequence
+ * is done. This has potential to deadlock if synchronous module
+ * loading is requested from async (which is not allowed!).
+ *
+ * See commit 0fdff3ec6d87 ("async, kmod: warn on synchronous
+ * request_module() from async workers") for more details.
+ */
+ if (!mod->async_probe_requested)
+ async_synchronize_full();
+
+ ftrace_free_mem(mod, mod->mem[MOD_INIT_TEXT].base,
+ mod->mem[MOD_INIT_TEXT].base + mod->mem[MOD_INIT_TEXT].size);
+ mutex_lock(&module_mutex);
+ /* Drop initial reference. */
+ module_put(mod);
+ trim_init_extable(mod);
+#ifdef CONFIG_KALLSYMS
+ /* Switch to core kallsyms now init is done: kallsyms may be walking! */
+ rcu_assign_pointer(mod->kallsyms, &mod->core_kallsyms);
+#endif
+ ret = module_enable_rodata_ro_after_init(mod);
+ if (ret)
+ pr_warn("%s: module_enable_rodata_ro_after_init() returned %d, "
+ "ro_after_init data might still be writable\n",
+ mod->name, ret);
+
+ mod_tree_remove_init(mod);
+ module_arch_freeing_init(mod);
+ for_class_mod_mem_type(type, init) {
+ mod->mem[type].base = NULL;
+ mod->mem[type].size = 0;
+ }
+
+#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
+ /* .BTF is not SHF_ALLOC and will get removed, so sanitize pointers */
+ mod->btf_data = NULL;
+ mod->btf_base_data = NULL;
+#endif
+ /*
+ * We want to free module_init, but be aware that kallsyms may be
+ * walking this within an RCU read section. In all the failure paths, we
+ * call synchronize_rcu(), but we don't want to slow down the success
+ * path. execmem_free() cannot be called in an interrupt, so do the
+ * work and call synchronize_rcu() in a work queue.
+ *
+ * Note that execmem_alloc() on most architectures creates W+X page
+ * mappings which won't be cleaned up until do_free_init() runs. Any
+ * code such as mark_rodata_ro() which depends on those mappings to
+ * be cleaned up needs to sync with the queued work by invoking
+ * flush_module_init_free_work().
+ */
+ if (llist_add(&freeinit->node, &init_free_list))
+ schedule_work(&init_free_wq);
+
+ mutex_unlock(&module_mutex);
+ wake_up_all(&module_wq);
+
+ mod_stat_add_long(text_size, &total_text_size);
+ mod_stat_add_long(total_size, &total_mod_size);
+
+ mod_stat_inc(&modcount);
+
+ return 0;
+
+fail_free_freeinit:
+ kfree(freeinit);
+fail:
+ /* Try to protect us from buggy refcounters. */
+ mod->state = MODULE_STATE_GOING;
+ synchronize_rcu();
+ module_put(mod);
+ blocking_notifier_call_chain(&module_notify_list,
+ MODULE_STATE_GOING, mod);
+ klp_module_going(mod);
+ ftrace_release_mod(mod);
+ free_module(mod);
+ wake_up_all(&module_wq);
+
+ return ret;
+}
+
+static int may_init_module(void)
+{
+ if (!capable(CAP_SYS_MODULE) || modules_disabled)
+ return -EPERM;
+
+ return 0;
+}
+
+/* Is this module of this name done loading? No locks held. */
+static bool finished_loading(const char *name)
+{
+ struct module *mod;
+ bool ret;
+
+ /*
+ * The module_mutex should not be a heavily contended lock;
+ * if we get the occasional sleep here, we'll go an extra iteration
+ * in the wait_event_interruptible(), which is harmless.
+ */
+ sched_annotate_sleep();
+ mutex_lock(&module_mutex);
+ mod = find_module_all(name, strlen(name), true);
+ ret = !mod || mod->state == MODULE_STATE_LIVE
+ || mod->state == MODULE_STATE_GOING;
+ mutex_unlock(&module_mutex);
+
+ return ret;
+}
+
+/* Must be called with module_mutex held */
+static int module_patient_check_exists(const char *name,
+ enum fail_dup_mod_reason reason)
+{
+ struct module *old;
+ int err = 0;
+
+ old = find_module_all(name, strlen(name), true);
+ if (old == NULL)
+ return 0;
+
+ if (old->state == MODULE_STATE_COMING ||
+ old->state == MODULE_STATE_UNFORMED) {
+ /* Wait in case it fails to load. */
+ mutex_unlock(&module_mutex);
+ err = wait_event_interruptible(module_wq,
+ finished_loading(name));
+ mutex_lock(&module_mutex);
+ if (err)
+ return err;
+
+ /* The module might have gone in the meantime. */
+ old = find_module_all(name, strlen(name), true);
+ }
+
+ if (try_add_failed_module(name, reason))
+ pr_warn("Could not add fail-tracking for module: %s\n", name);
+
+ /*
+ * We are here only when the same module was being loaded. Do
+ * not try to load it again right now. It prevents long delays
+ * caused by serialized module load failures. It might happen
+ * when more devices of the same type trigger load of
+ * a particular module.
+ */
+ if (old && old->state == MODULE_STATE_LIVE)
+ return -EEXIST;
+ return -EBUSY;
+}
+
+/*
+ * We try to place it in the list now to make sure it's unique before
+ * we dedicate too many resources. In particular, temporary percpu
+ * memory exhaustion.
+ */
+static int add_unformed_module(struct module *mod)
+{
+ int err;
+
+ mod->state = MODULE_STATE_UNFORMED;
+
+ mutex_lock(&module_mutex);
+ err = module_patient_check_exists(mod->name, FAIL_DUP_MOD_LOAD);
+ if (err)
+ goto out;
+
+ mod_update_bounds(mod);
+ list_add_rcu(&mod->list, &modules);
+ mod_tree_insert(mod);
+ err = 0;
+
+out:
+ mutex_unlock(&module_mutex);
+ return err;
+}
+
+static int complete_formation(struct module *mod, struct load_info *info)
+{
+ int err;
+
+ mutex_lock(&module_mutex);
+
+ /* Find duplicate symbols (must be called under lock). */
+ err = verify_exported_symbols(mod);
+ if (err < 0)
+ goto out;
+
+ /* These rely on module_mutex for list integrity. */
+ module_bug_finalize(info->hdr, info->sechdrs, mod);
+ module_cfi_finalize(info->hdr, info->sechdrs, mod);
+
+ err = module_enable_rodata_ro(mod);
+ if (err)
+ goto out_strict_rwx;
+ err = module_enable_data_nx(mod);
+ if (err)
+ goto out_strict_rwx;
+ err = module_enable_text_rox(mod);
+ if (err)
+ goto out_strict_rwx;
+
+ /*
+ * Mark state as coming so strong_try_module_get() ignores us,
+ * but kallsyms etc. can see us.
+ */
+ mod->state = MODULE_STATE_COMING;
+ mutex_unlock(&module_mutex);
+
+ return 0;
+
+out_strict_rwx:
+ module_bug_cleanup(mod);
+out:
+ mutex_unlock(&module_mutex);
+ return err;
+}
+
+static int prepare_coming_module(struct module *mod)
+{
+ int err;
+
+ ftrace_module_enable(mod);
+ err = klp_module_coming(mod);
+ if (err)
+ return err;
+
+ err = blocking_notifier_call_chain_robust(&module_notify_list,
+ MODULE_STATE_COMING, MODULE_STATE_GOING, mod);
+ err = notifier_to_errno(err);
+ if (err)
+ klp_module_going(mod);
+
+ return err;
+}
+
+static int unknown_module_param_cb(char *param, char *val, const char *modname,
+ void *arg)
+{
+ struct module *mod = arg;
+ int ret;
+
+ if (strcmp(param, "async_probe") == 0) {
+ if (kstrtobool(val, &mod->async_probe_requested))
+ mod->async_probe_requested = true;
+ return 0;
+ }
+
+ /* Check for magic 'dyndbg' arg */
+ ret = ddebug_dyndbg_module_param_cb(param, val, modname);
+ if (ret != 0)
+ pr_warn("%s: unknown parameter '%s' ignored\n", modname, param);
+ return 0;
+}
+
+/* Module within temporary copy, this doesn't do any allocation */
+static int early_mod_check(struct load_info *info, int flags)
+{
+ int err;
+
+ /*
+ * Now that we know we have the correct module name, check
+ * if it's blacklisted.
+ */
+ if (blacklisted(info->name)) {
+ pr_err("Module %s is blacklisted\n", info->name);
+ return -EPERM;
+ }
+
+ err = rewrite_section_headers(info, flags);
+ if (err)
+ return err;
+
+ /* Check module struct version now, before we try to use module. */
+ if (!check_modstruct_version(info, info->mod))
+ return -ENOEXEC;
+
+ err = check_modinfo(info->mod, info, flags);
+ if (err)
+ return err;
+
+ mutex_lock(&module_mutex);
+ err = module_patient_check_exists(info->mod->name, FAIL_DUP_MOD_BECOMING);
+ mutex_unlock(&module_mutex);
+
+ return err;
+}
+
+/*
+ * Allocate and load the module: note that size of section 0 is always
+ * zero, and we rely on this for optional sections.
+ */
+static int load_module(struct load_info *info, const char __user *uargs,
+ int flags)
+{
+ struct module *mod;
+ bool module_allocated = false;
+ long err = 0;
+ char *after_dashes;
+
+ /*
+ * Do the signature check (if any) first. All that
+ * the signature check needs is info->len, it does
+ * not need any of the section info. That can be
+ * set up later. This will minimize the chances
+ * of a corrupt module causing problems before
+ * we even get to the signature check.
+ *
+ * The check will also adjust info->len by stripping
+ * off the sig length at the end of the module, making
+ * checks against info->len more correct.
+ */
+ err = module_sig_check(info, flags);
+ if (err)
+ goto free_copy;
+
+ /*
+ * Do basic sanity checks against the ELF header and
+ * sections. Cache useful sections and set the
+ * info->mod to the userspace passed struct module.
+ */
+ err = elf_validity_cache_copy(info, flags);
+ if (err)
+ goto free_copy;
+
+ err = early_mod_check(info, flags);
+ if (err)
+ goto free_copy;
+
+ /* Figure out module layout, and allocate all the memory. */
+ mod = layout_and_allocate(info, flags);
+ if (IS_ERR(mod)) {
+ err = PTR_ERR(mod);
+ goto free_copy;
+ }
+
+ module_allocated = true;
+
+ audit_log_kern_module(info->name);
+
+ /* Reserve our place in the list. */
+ err = add_unformed_module(mod);
+ if (err)
+ goto free_module;
+
+ /*
+ * We are tainting your kernel if your module gets into
+ * the modules linked list somehow.
+ */
+ module_augment_kernel_taints(mod, info);
+
+ /* To avoid stressing percpu allocator, do this once we're unique. */
+ err = percpu_modalloc(mod, info);
+ if (err)
+ goto unlink_mod;
+
+ /* Now module is in final location, initialize linked lists, etc. */
+ err = module_unload_init(mod);
+ if (err)
+ goto unlink_mod;
+
+ init_param_lock(mod);
+
+ /*
+ * Now we've got everything in the final locations, we can
+ * find optional sections.
+ */
+ err = find_module_sections(mod, info);
+ if (err)
+ goto free_unload;
+
+ err = check_export_symbol_versions(mod);
+ if (err)
+ goto free_unload;
+
+ /* Set up MODINFO_ATTR fields */
+ err = setup_modinfo(mod, info);
+ if (err)
+ goto free_modinfo;
+
+ /* Fix up syms, so that st_value is a pointer to location. */
+ err = simplify_symbols(mod, info);
+ if (err < 0)
+ goto free_modinfo;
+
+ err = apply_relocations(mod, info);
+ if (err < 0)
+ goto free_modinfo;
+
+ err = post_relocation(mod, info);
+ if (err < 0)
+ goto free_modinfo;
+
+ flush_module_icache(mod);
+
+ /* Now copy in args */
+ mod->args = strndup_user(uargs, ~0UL >> 1);
+ if (IS_ERR(mod->args)) {
+ err = PTR_ERR(mod->args);
+ goto free_arch_cleanup;
+ }
+
+ init_build_id(mod, info);
+
+ /* Ftrace init must be called in the MODULE_STATE_UNFORMED state */
+ ftrace_module_init(mod);
+
+ /* Finally it's fully formed, ready to start executing. */
+ err = complete_formation(mod, info);
+ if (err)
+ goto ddebug_cleanup;
+
+ err = prepare_coming_module(mod);
+ if (err)
+ goto bug_cleanup;
+
+ mod->async_probe_requested = async_probe;
+
+ /* Module is ready to execute: parsing args may do that. */
+ after_dashes = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
+ -32768, 32767, mod,
+ unknown_module_param_cb);
+ if (IS_ERR(after_dashes)) {
+ err = PTR_ERR(after_dashes);
+ goto coming_cleanup;
+ } else if (after_dashes) {
+ pr_warn("%s: parameters '%s' after `--' ignored\n",
+ mod->name, after_dashes);
+ }
+
+ /* Link in to sysfs. */
+ err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp);
+ if (err < 0)
+ goto coming_cleanup;
+
+ if (is_livepatch_module(mod)) {
+ err = copy_module_elf(mod, info);
+ if (err < 0)
+ goto sysfs_cleanup;
+ }
+
+ if (codetag_load_module(mod))
+ goto sysfs_cleanup;
+
+ /* Get rid of temporary copy. */
+ free_copy(info, flags);
+
+ /* Done! */
+ trace_module_load(mod);
+
+ return do_init_module(mod);
+
+ sysfs_cleanup:
+ mod_sysfs_teardown(mod);
+ coming_cleanup:
+ mod->state = MODULE_STATE_GOING;
+ destroy_params(mod->kp, mod->num_kp);
+ blocking_notifier_call_chain(&module_notify_list,
+ MODULE_STATE_GOING, mod);
+ klp_module_going(mod);
+ bug_cleanup:
+ mod->state = MODULE_STATE_GOING;
+ /* module_bug_cleanup needs module_mutex protection */
+ mutex_lock(&module_mutex);
+ module_bug_cleanup(mod);
+ mutex_unlock(&module_mutex);
+
+ ddebug_cleanup:
+ ftrace_release_mod(mod);
+ synchronize_rcu();
+ kfree(mod->args);
+ free_arch_cleanup:
+ module_arch_cleanup(mod);
+ free_modinfo:
+ free_modinfo(mod);
+ free_unload:
+ module_unload_free(mod);
+ unlink_mod:
+ mutex_lock(&module_mutex);
+ /* Unlink carefully: kallsyms could be walking list. */
+ list_del_rcu(&mod->list);
+ mod_tree_remove(mod);
+ wake_up_all(&module_wq);
+ /* Wait for RCU-sched synchronizing before releasing mod->list. */
+ synchronize_rcu();
+ mutex_unlock(&module_mutex);
+ free_module:
+ mod_stat_bump_invalid(info, flags);
+ /* Free lock-classes; relies on the preceding sync_rcu() */
+ for_class_mod_mem_type(type, core_data) {
+ lockdep_free_key_range(mod->mem[type].base,
+ mod->mem[type].size);
+ }
+
+ module_memory_restore_rox(mod);
+ module_deallocate(mod, info);
+ free_copy:
+ /*
+ * The info->len is always set. We distinguish between
+ * failures once the proper module was allocated and
+ * before that.
+ */
+ if (!module_allocated) {
+ audit_log_kern_module(info->name ? info->name : "?");
+ mod_stat_bump_becoming(info, flags);
+ }
+ free_copy(info, flags);
+ return err;
+}
+
+SYSCALL_DEFINE3(init_module, void __user *, umod,
+ unsigned long, len, const char __user *, uargs)
+{
+ int err;
+ struct load_info info = { };
+
+ err = may_init_module();
+ if (err)
+ return err;
+
+ pr_debug("init_module: umod=%p, len=%lu, uargs=%p\n",
+ umod, len, uargs);
+
+ err = copy_module_from_user(umod, len, &info);
+ if (err) {
+ mod_stat_inc(&failed_kreads);
+ mod_stat_add_long(len, &invalid_kread_bytes);
+ return err;
+ }
+
+ return load_module(&info, uargs, 0);
+}
+
+struct idempotent {
+ const void *cookie;
+ struct hlist_node entry;
+ struct completion complete;
+ int ret;
+};
+
+#define IDEM_HASH_BITS 8
+static struct hlist_head idem_hash[1 << IDEM_HASH_BITS];
+static DEFINE_SPINLOCK(idem_lock);
+
+static bool idempotent(struct idempotent *u, const void *cookie)
+{
+ int hash = hash_ptr(cookie, IDEM_HASH_BITS);
+ struct hlist_head *head = idem_hash + hash;
+ struct idempotent *existing;
+ bool first;
+
+ u->ret = -EINTR;
+ u->cookie = cookie;
+ init_completion(&u->complete);
+
+ spin_lock(&idem_lock);
+ first = true;
+ hlist_for_each_entry(existing, head, entry) {
+ if (existing->cookie != cookie)
+ continue;
+ first = false;
+ break;
+ }
+ hlist_add_head(&u->entry, idem_hash + hash);
+ spin_unlock(&idem_lock);
+
+ return !first;
+}
+
+/*
+ * We were the first one with 'cookie' on the list, and we ended
+ * up completing the operation. We now need to walk the list,
+ * remove everybody - which includes ourselves - fill in the return
+ * value, and then complete the operation.
+ */
+static int idempotent_complete(struct idempotent *u, int ret)
+{
+ const void *cookie = u->cookie;
+ int hash = hash_ptr(cookie, IDEM_HASH_BITS);
+ struct hlist_head *head = idem_hash + hash;
+ struct hlist_node *next;
+ struct idempotent *pos;
+
+ spin_lock(&idem_lock);
+ hlist_for_each_entry_safe(pos, next, head, entry) {
+ if (pos->cookie != cookie)
+ continue;
+ hlist_del_init(&pos->entry);
+ pos->ret = ret;
+ complete(&pos->complete);
+ }
+ spin_unlock(&idem_lock);
+ return ret;
+}
+
+/*
+ * Wait for the idempotent worker.
+ *
+ * If we get interrupted, we need to remove ourselves from the
+ * the idempotent list, and the completion may still come in.
+ *
+ * The 'idem_lock' protects against the race, and 'idem.ret' was
+ * initialized to -EINTR and is thus always the right return
+ * value even if the idempotent work then completes between
+ * the wait_for_completion and the cleanup.
+ */
+static int idempotent_wait_for_completion(struct idempotent *u)
+{
+ if (wait_for_completion_interruptible(&u->complete)) {
+ spin_lock(&idem_lock);
+ if (!hlist_unhashed(&u->entry))
+ hlist_del(&u->entry);
+ spin_unlock(&idem_lock);
+ }
+ return u->ret;
+}
+
+static int init_module_from_file(struct file *f, const char __user * uargs, int flags)
+{
+ bool compressed = !!(flags & MODULE_INIT_COMPRESSED_FILE);
+ struct load_info info = { };
+ void *buf = NULL;
+ int len;
+ int err;
+
+ len = kernel_read_file(f, 0, &buf, INT_MAX, NULL,
+ compressed ? READING_MODULE_COMPRESSED :
+ READING_MODULE);
+ if (len < 0) {
+ mod_stat_inc(&failed_kreads);
+ return len;
+ }
+
+ if (compressed) {
+ err = module_decompress(&info, buf, len);
+ vfree(buf); /* compressed data is no longer needed */
+ if (err) {
+ mod_stat_inc(&failed_decompress);
+ mod_stat_add_long(len, &invalid_decompress_bytes);
+ return err;
+ }
+ err = security_kernel_post_read_file(f, (char *)info.hdr, info.len,
+ READING_MODULE);
+ if (err) {
+ mod_stat_inc(&failed_kreads);
+ free_copy(&info, flags);
+ return err;
+ }
+ } else {
+ info.hdr = buf;
+ info.len = len;
+ }
+
+ return load_module(&info, uargs, flags);
+}
+
+static int idempotent_init_module(struct file *f, const char __user * uargs, int flags)
+{
+ struct idempotent idem;
+
+ if (!(f->f_mode & FMODE_READ))
+ return -EBADF;
+
+ /* Are we the winners of the race and get to do this? */
+ if (!idempotent(&idem, file_inode(f))) {
+ int ret = init_module_from_file(f, uargs, flags);
+ return idempotent_complete(&idem, ret);
+ }
+
+ /*
+ * Somebody else won the race and is loading the module.
+ */
+ return idempotent_wait_for_completion(&idem);
+}
+
+SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags)
+{
+ int err = may_init_module();
+ if (err)
+ return err;
+
+ pr_debug("finit_module: fd=%d, uargs=%p, flags=%i\n", fd, uargs, flags);
+
+ if (flags & ~(MODULE_INIT_IGNORE_MODVERSIONS
+ |MODULE_INIT_IGNORE_VERMAGIC
+ |MODULE_INIT_COMPRESSED_FILE))
+ return -EINVAL;
+
+ CLASS(fd, f)(fd);
+ if (fd_empty(f))
+ return -EBADF;
+ return idempotent_init_module(fd_file(f), uargs, flags);
+}
+
+/* Keep in sync with MODULE_FLAGS_BUF_SIZE !!! */
+char *module_flags(struct module *mod, char *buf, bool show_state)
+{
+ int bx = 0;
+
+ BUG_ON(mod->state == MODULE_STATE_UNFORMED);
+ if (!mod->taints && !show_state)
+ goto out;
+ if (mod->taints ||
+ mod->state == MODULE_STATE_GOING ||
+ mod->state == MODULE_STATE_COMING) {
+ buf[bx++] = '(';
+ bx += module_flags_taint(mod->taints, buf + bx);
+ /* Show a - for module-is-being-unloaded */
+ if (mod->state == MODULE_STATE_GOING && show_state)
+ buf[bx++] = '-';
+ /* Show a + for module-is-being-loaded */
+ if (mod->state == MODULE_STATE_COMING && show_state)
+ buf[bx++] = '+';
+ buf[bx++] = ')';
+ }
+out:
+ buf[bx] = '\0';
+
+ return buf;
+}
+
+/* Given an address, look for it in the module exception tables. */
+const struct exception_table_entry *search_module_extables(unsigned long addr)
+{
+ struct module *mod;
+
+ guard(rcu)();
+ mod = __module_address(addr);
+ if (!mod)
+ return NULL;
+
+ if (!mod->num_exentries)
+ return NULL;
+ /*
+ * The address passed here belongs to a module that is currently
+ * invoked (we are running inside it). Therefore its module::refcnt
+ * needs already be >0 to ensure that it is not removed at this stage.
+ * All other user need to invoke this function within a RCU read
+ * section.
+ */
+ return search_extable(mod->extable, mod->num_exentries, addr);
+}
+
+/**
+ * is_module_address() - is this address inside a module?
+ * @addr: the address to check.
+ *
+ * See is_module_text_address() if you simply want to see if the address
+ * is code (not data).
+ */
+bool is_module_address(unsigned long addr)
+{
+ guard(rcu)();
+ return __module_address(addr) != NULL;
+}
+
+/**
+ * __module_address() - get the module which contains an address.
+ * @addr: the address.
+ *
+ * Must be called within RCU read section or module mutex held so that
+ * module doesn't get freed during this.
+ */
+struct module *__module_address(unsigned long addr)
+{
+ struct module *mod;
+
+ if (addr >= mod_tree.addr_min && addr <= mod_tree.addr_max)
+ goto lookup;
+
+#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
+ if (addr >= mod_tree.data_addr_min && addr <= mod_tree.data_addr_max)
+ goto lookup;
+#endif
+
+ return NULL;
+
+lookup:
+ mod = mod_find(addr, &mod_tree);
+ if (mod) {
+ BUG_ON(!within_module(addr, mod));
+ if (mod->state == MODULE_STATE_UNFORMED)
+ mod = NULL;
+ }
+ return mod;
+}
+
+/**
+ * is_module_text_address() - is this address inside module code?
+ * @addr: the address to check.
+ *
+ * See is_module_address() if you simply want to see if the address is
+ * anywhere in a module. See kernel_text_address() for testing if an
+ * address corresponds to kernel or module code.
+ */
+bool is_module_text_address(unsigned long addr)
+{
+ guard(rcu)();
+ return __module_text_address(addr) != NULL;
+}
+
+void module_for_each_mod(int(*func)(struct module *mod, void *data), void *data)
+{
+ struct module *mod;
+
+ guard(rcu)();
+ list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+ if (func(mod, data))
+ break;
+ }
+}
+
+/**
+ * __module_text_address() - get the module whose code contains an address.
+ * @addr: the address.
+ *
+ * Must be called within RCU read section or module mutex held so that
+ * module doesn't get freed during this.
+ */
+struct module *__module_text_address(unsigned long addr)
+{
+ struct module *mod = __module_address(addr);
+ if (mod) {
+ /* Make sure it's within the text section. */
+ if (!within_module_mem_type(addr, mod, MOD_TEXT) &&
+ !within_module_mem_type(addr, mod, MOD_INIT_TEXT))
+ mod = NULL;
+ }
+ return mod;
+}
+
+/* Don't grab lock, we're oopsing. */
+void print_modules(void)
+{
+ struct module *mod;
+ char buf[MODULE_FLAGS_BUF_SIZE];
+
+ printk(KERN_DEFAULT "Modules linked in:");
+ /* Most callers should already have preempt disabled, but make sure */
+ guard(rcu)();
+ list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+ pr_cont(" %s%s", mod->name, module_flags(mod, buf, true));
+ }
+
+ print_unloaded_tainted_modules();
+ if (last_unloaded_module.name[0])
+ pr_cont(" [last unloaded: %s%s]", last_unloaded_module.name,
+ last_unloaded_module.taints);
+ pr_cont("\n");
+}
+
+#ifdef CONFIG_MODULE_DEBUGFS
+struct dentry *mod_debugfs_root;
+
+static int module_debugfs_init(void)
+{
+ mod_debugfs_root = debugfs_create_dir("modules", NULL);
+ return 0;
+}
+module_init(module_debugfs_init);
+#endif
diff --git a/kernel/module/procfs.c b/kernel/module/procfs.c
new file mode 100644
index 000000000000..0a4841e88adb
--- /dev/null
+++ b/kernel/module/procfs.c
@@ -0,0 +1,152 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Module proc support
+ *
+ * Copyright (C) 2008 Alexey Dobriyan
+ */
+
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/mutex.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+#include "internal.h"
+
+#ifdef CONFIG_MODULE_UNLOAD
+static inline void print_unload_info(struct seq_file *m, struct module *mod)
+{
+ struct module_use *use;
+ int printed_something = 0;
+
+ seq_printf(m, " %i ", module_refcount(mod));
+
+ /*
+ * Always include a trailing , so userspace can differentiate
+ * between this and the old multi-field proc format.
+ */
+ list_for_each_entry(use, &mod->source_list, source_list) {
+ printed_something = 1;
+ seq_printf(m, "%s,", use->source->name);
+ }
+
+ if (mod->init && !mod->exit) {
+ printed_something = 1;
+ seq_puts(m, "[permanent],");
+ }
+
+ if (!printed_something)
+ seq_puts(m, "-");
+}
+#else /* !CONFIG_MODULE_UNLOAD */
+static inline void print_unload_info(struct seq_file *m, struct module *mod)
+{
+ /* We don't know the usage count, or what modules are using. */
+ seq_puts(m, " - -");
+}
+#endif /* CONFIG_MODULE_UNLOAD */
+
+/* Called by the /proc file system to return a list of modules. */
+static void *m_start(struct seq_file *m, loff_t *pos)
+{
+ mutex_lock(&module_mutex);
+ return seq_list_start(&modules, *pos);
+}
+
+static void *m_next(struct seq_file *m, void *p, loff_t *pos)
+{
+ return seq_list_next(p, &modules, pos);
+}
+
+static void m_stop(struct seq_file *m, void *p)
+{
+ mutex_unlock(&module_mutex);
+}
+
+static unsigned int module_total_size(struct module *mod)
+{
+ int size = 0;
+
+ for_each_mod_mem_type(type)
+ size += mod->mem[type].size;
+ return size;
+}
+
+static int m_show(struct seq_file *m, void *p)
+{
+ struct module *mod = list_entry(p, struct module, list);
+ char buf[MODULE_FLAGS_BUF_SIZE];
+ void *value;
+ unsigned int size;
+
+ /* We always ignore unformed modules. */
+ if (mod->state == MODULE_STATE_UNFORMED)
+ return 0;
+
+ size = module_total_size(mod);
+ seq_printf(m, "%s %u", mod->name, size);
+ print_unload_info(m, mod);
+
+ /* Informative for users. */
+ seq_printf(m, " %s",
+ mod->state == MODULE_STATE_GOING ? "Unloading" :
+ mod->state == MODULE_STATE_COMING ? "Loading" :
+ "Live");
+ /* Used by oprofile and other similar tools. */
+ value = m->private ? NULL : mod->mem[MOD_TEXT].base;
+ seq_printf(m, " 0x%px", value);
+
+ /* Taints info */
+ if (mod->taints)
+ seq_printf(m, " %s", module_flags(mod, buf, true));
+
+ seq_puts(m, "\n");
+ return 0;
+}
+
+/*
+ * Format: modulename size refcount deps address
+ *
+ * Where refcount is a number or -, and deps is a comma-separated list
+ * of depends or -.
+ */
+static const struct seq_operations modules_op = {
+ .start = m_start,
+ .next = m_next,
+ .stop = m_stop,
+ .show = m_show
+};
+
+/*
+ * This also sets the "private" pointer to non-NULL if the
+ * kernel pointers should be hidden (so you can just test
+ * "m->private" to see if you should keep the values private).
+ *
+ * We use the same logic as for /proc/kallsyms.
+ */
+static int modules_open(struct inode *inode, struct file *file)
+{
+ int err = seq_open(file, &modules_op);
+
+ if (!err) {
+ struct seq_file *m = file->private_data;
+
+ m->private = kallsyms_show_value(file->f_cred) ? NULL : (void *)8ul;
+ }
+
+ return err;
+}
+
+static const struct proc_ops modules_proc_ops = {
+ .proc_flags = PROC_ENTRY_PERMANENT,
+ .proc_open = modules_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = seq_release,
+};
+
+static int __init proc_modules_init(void)
+{
+ proc_create("modules", 0, NULL, &modules_proc_ops);
+ return 0;
+}
+module_init(proc_modules_init);
diff --git a/kernel/module/signing.c b/kernel/module/signing.c
new file mode 100644
index 000000000000..a2ff4242e623
--- /dev/null
+++ b/kernel/module/signing.c
@@ -0,0 +1,125 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Module signature checker
+ *
+ * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/module_signature.h>
+#include <linux/string.h>
+#include <linux/verification.h>
+#include <linux/security.h>
+#include <crypto/public_key.h>
+#include <uapi/linux/module.h>
+#include "internal.h"
+
+#undef MODULE_PARAM_PREFIX
+#define MODULE_PARAM_PREFIX "module."
+
+static bool sig_enforce = IS_ENABLED(CONFIG_MODULE_SIG_FORCE);
+module_param(sig_enforce, bool_enable_only, 0644);
+
+/*
+ * Export sig_enforce kernel cmdline parameter to allow other subsystems rely
+ * on that instead of directly to CONFIG_MODULE_SIG_FORCE config.
+ */
+bool is_module_sig_enforced(void)
+{
+ return sig_enforce;
+}
+EXPORT_SYMBOL(is_module_sig_enforced);
+
+void set_module_sig_enforced(void)
+{
+ sig_enforce = true;
+}
+
+/*
+ * Verify the signature on a module.
+ */
+int mod_verify_sig(const void *mod, struct load_info *info)
+{
+ struct module_signature ms;
+ size_t sig_len, modlen = info->len;
+ int ret;
+
+ pr_devel("==>%s(,%zu)\n", __func__, modlen);
+
+ if (modlen <= sizeof(ms))
+ return -EBADMSG;
+
+ memcpy(&ms, mod + (modlen - sizeof(ms)), sizeof(ms));
+
+ ret = mod_check_sig(&ms, modlen, "module");
+ if (ret)
+ return ret;
+
+ sig_len = be32_to_cpu(ms.sig_len);
+ modlen -= sig_len + sizeof(ms);
+ info->len = modlen;
+
+ return verify_pkcs7_signature(mod, modlen, mod + modlen, sig_len,
+ VERIFY_USE_SECONDARY_KEYRING,
+ VERIFYING_MODULE_SIGNATURE,
+ NULL, NULL);
+}
+
+int module_sig_check(struct load_info *info, int flags)
+{
+ int err = -ENODATA;
+ const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1;
+ const char *reason;
+ const void *mod = info->hdr;
+ bool mangled_module = flags & (MODULE_INIT_IGNORE_MODVERSIONS |
+ MODULE_INIT_IGNORE_VERMAGIC);
+ /*
+ * Do not allow mangled modules as a module with version information
+ * removed is no longer the module that was signed.
+ */
+ if (!mangled_module &&
+ info->len > markerlen &&
+ memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) {
+ /* We truncate the module to discard the signature */
+ info->len -= markerlen;
+ err = mod_verify_sig(mod, info);
+ if (!err) {
+ info->sig_ok = true;
+ return 0;
+ }
+ }
+
+ /*
+ * We don't permit modules to be loaded into the trusted kernels
+ * without a valid signature on them, but if we're not enforcing,
+ * certain errors are non-fatal.
+ */
+ switch (err) {
+ case -ENODATA:
+ reason = "unsigned module";
+ break;
+ case -ENOPKG:
+ reason = "module with unsupported crypto";
+ break;
+ case -ENOKEY:
+ reason = "module with unavailable key";
+ break;
+
+ default:
+ /*
+ * All other errors are fatal, including lack of memory,
+ * unparseable signatures, and signature check failures --
+ * even if signatures aren't required.
+ */
+ return err;
+ }
+
+ if (is_module_sig_enforced()) {
+ pr_notice("Loading of %s is rejected\n", reason);
+ return -EKEYREJECTED;
+ }
+
+ return security_locked_down(LOCKDOWN_MODULE_SIGNATURE);
+}
diff --git a/kernel/module/stats.c b/kernel/module/stats.c
new file mode 100644
index 000000000000..3ba0e98b3c91
--- /dev/null
+++ b/kernel/module/stats.c
@@ -0,0 +1,432 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Debugging module statistics.
+ *
+ * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org>
+ */
+
+#include <linux/module.h>
+#include <uapi/linux/module.h>
+#include <linux/string.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/debugfs.h>
+#include <linux/rculist.h>
+#include <linux/math.h>
+
+#include "internal.h"
+
+/**
+ * DOC: module debugging statistics overview
+ *
+ * Enabling CONFIG_MODULE_STATS enables module debugging statistics which
+ * are useful to monitor and root cause memory pressure issues with module
+ * loading. These statistics are useful to allow us to improve production
+ * workloads.
+ *
+ * The current module debugging statistics supported help keep track of module
+ * loading failures to enable improvements either for kernel module auto-loading
+ * usage (request_module()) or interactions with userspace. Statistics are
+ * provided to track all possible failures in the finit_module() path and memory
+ * wasted in this process space. Each of the failure counters are associated
+ * to a type of module loading failure which is known to incur a certain amount
+ * of memory allocation loss. In the worst case loading a module will fail after
+ * a 3 step memory allocation process:
+ *
+ * a) memory allocated with kernel_read_file_from_fd()
+ * b) module decompression processes the file read from
+ * kernel_read_file_from_fd(), and vmap() is used to map
+ * the decompressed module to a new local buffer which represents
+ * a copy of the decompressed module passed from userspace. The buffer
+ * from kernel_read_file_from_fd() is freed right away.
+ * c) layout_and_allocate() allocates space for the final resting
+ * place where we would keep the module if it were to be processed
+ * successfully.
+ *
+ * If a failure occurs after these three different allocations only one
+ * counter will be incremented with the summation of the allocated bytes freed
+ * incurred during this failure. Likewise, if module loading failed only after
+ * step b) a separate counter is used and incremented for the bytes freed and
+ * not used during both of those allocations.
+ *
+ * Virtual memory space can be limited, for example on x86 virtual memory size
+ * defaults to 128 MiB. We should strive to limit and avoid wasting virtual
+ * memory allocations when possible. These module debugging statistics help
+ * to evaluate how much memory is being wasted on bootup due to module loading
+ * failures.
+ *
+ * All counters are designed to be incremental. Atomic counters are used so to
+ * remain simple and avoid delays and deadlocks.
+ */
+
+/**
+ * DOC: dup_failed_modules - tracks duplicate failed modules
+ *
+ * Linked list of modules which failed to be loaded because an already existing
+ * module with the same name was already being processed or already loaded.
+ * The finit_module() system call incurs heavy virtual memory allocations. In
+ * the worst case an finit_module() system call can end up allocating virtual
+ * memory 3 times:
+ *
+ * 1) kernel_read_file_from_fd() call uses vmalloc()
+ * 2) optional module decompression uses vmap()
+ * 3) layout_and allocate() can use vzalloc() or an arch specific variation of
+ * vmalloc to deal with ELF sections requiring special permissions
+ *
+ * In practice on a typical boot today most finit_module() calls fail due to
+ * the module with the same name already being loaded or about to be processed.
+ * All virtual memory allocated to these failed modules will be freed with
+ * no functional use.
+ *
+ * To help with this the dup_failed_modules allows us to track modules which
+ * failed to load due to the fact that a module was already loaded or being
+ * processed. There are only two points at which we can fail such calls,
+ * we list them below along with the number of virtual memory allocation
+ * calls:
+ *
+ * a) FAIL_DUP_MOD_BECOMING: at the end of early_mod_check() before
+ * layout_and_allocate().
+ * - with module decompression: 2 virtual memory allocation calls
+ * - without module decompression: 1 virtual memory allocation calls
+ * b) FAIL_DUP_MOD_LOAD: after layout_and_allocate() on add_unformed_module()
+ * - with module decompression 3 virtual memory allocation calls
+ * - without module decompression 2 virtual memory allocation calls
+ *
+ * We should strive to get this list to be as small as possible. If this list
+ * is not empty it is a reflection of possible work or optimizations possible
+ * either in-kernel or in userspace.
+ */
+static LIST_HEAD(dup_failed_modules);
+
+/**
+ * DOC: module statistics debugfs counters
+ *
+ * The total amount of wasted virtual memory allocation space during module
+ * loading can be computed by adding the total from the summation:
+ *
+ * * @invalid_kread_bytes +
+ * @invalid_decompress_bytes +
+ * @invalid_becoming_bytes +
+ * @invalid_mod_bytes
+ *
+ * The following debugfs counters are available to inspect module loading
+ * failures:
+ *
+ * * total_mod_size: total bytes ever used by all modules we've dealt with on
+ * this system
+ * * total_text_size: total bytes of the .text and .init.text ELF section
+ * sizes we've dealt with on this system
+ * * invalid_kread_bytes: bytes allocated and then freed on failures which
+ * happen due to the initial kernel_read_file_from_fd(). kernel_read_file_from_fd()
+ * uses vmalloc(). These should typically not happen unless your system is
+ * under memory pressure.
+ * * invalid_decompress_bytes: number of bytes allocated and freed due to
+ * memory allocations in the module decompression path that use vmap().
+ * These typically should not happen unless your system is under memory
+ * pressure.
+ * * invalid_becoming_bytes: total number of bytes allocated and freed used
+ * to read the kernel module userspace wants us to read before we
+ * promote it to be processed to be added to our @modules linked list. These
+ * failures can happen if we had a check in between a successful kernel_read_file_from_fd()
+ * call and right before we allocate the our private memory for the module
+ * which would be kept if the module is successfully loaded. The most common
+ * reason for this failure is when userspace is racing to load a module
+ * which it does not yet see loaded. The first module to succeed in
+ * add_unformed_module() will add a module to our &modules list and
+ * subsequent loads of modules with the same name will error out at the
+ * end of early_mod_check(). The check for module_patient_check_exists()
+ * at the end of early_mod_check() prevents duplicate allocations
+ * on layout_and_allocate() for modules already being processed. These
+ * duplicate failed modules are non-fatal, however they typically are
+ * indicative of userspace not seeing a module in userspace loaded yet and
+ * unnecessarily trying to load a module before the kernel even has a chance
+ * to begin to process prior requests. Although duplicate failures can be
+ * non-fatal, we should try to reduce vmalloc() pressure proactively, so
+ * ideally after boot this will be close to as 0 as possible. If module
+ * decompression was used we also add to this counter the cost of the
+ * initial kernel_read_file_from_fd() of the compressed module. If module
+ * decompression was not used the value represents the total allocated and
+ * freed bytes in kernel_read_file_from_fd() calls for these type of
+ * failures. These failures can occur because:
+ *
+ * * module_sig_check() - module signature checks
+ * * elf_validity_cache_copy() - some ELF validation issue
+ * * early_mod_check():
+ *
+ * * blacklisting
+ * * failed to rewrite section headers
+ * * version magic
+ * * live patch requirements didn't check out
+ * * the module was detected as being already present
+ *
+ * * invalid_mod_bytes: these are the total number of bytes allocated and
+ * freed due to failures after we did all the sanity checks of the module
+ * which userspace passed to us and after our first check that the module
+ * is unique. A module can still fail to load if we detect the module is
+ * loaded after we allocate space for it with layout_and_allocate(), we do
+ * this check right before processing the module as live and run its
+ * initialization routines. Note that you have a failure of this type it
+ * also means the respective kernel_read_file_from_fd() memory space was
+ * also freed and not used, and so we increment this counter with twice
+ * the size of the module. Additionally if you used module decompression
+ * the size of the compressed module is also added to this counter.
+ *
+ * * modcount: how many modules we've loaded in our kernel life time
+ * * failed_kreads: how many modules failed due to failed kernel_read_file_from_fd()
+ * * failed_decompress: how many failed module decompression attempts we've had.
+ * These really should not happen unless your compression / decompression
+ * might be broken.
+ * * failed_becoming: how many modules failed after we kernel_read_file_from_fd()
+ * it and before we allocate memory for it with layout_and_allocate(). This
+ * counter is never incremented if you manage to validate the module and
+ * call layout_and_allocate() for it.
+ * * failed_load_modules: how many modules failed once we've allocated our
+ * private space for our module using layout_and_allocate(). These failures
+ * should hopefully mostly be dealt with already. Races in theory could
+ * still exist here, but it would just mean the kernel had started processing
+ * two threads concurrently up to early_mod_check() and one thread won.
+ * These failures are good signs the kernel or userspace is doing something
+ * seriously stupid or that could be improved. We should strive to fix these,
+ * but it is perhaps not easy to fix them. A recent example are the modules
+ * requests incurred for frequency modules, a separate module request was
+ * being issued for each CPU on a system.
+ */
+
+atomic_long_t total_mod_size;
+atomic_long_t total_text_size;
+atomic_long_t invalid_kread_bytes;
+atomic_long_t invalid_decompress_bytes;
+static atomic_long_t invalid_becoming_bytes;
+static atomic_long_t invalid_mod_bytes;
+atomic_t modcount;
+atomic_t failed_kreads;
+atomic_t failed_decompress;
+static atomic_t failed_becoming;
+static atomic_t failed_load_modules;
+
+static const char *mod_fail_to_str(struct mod_fail_load *mod_fail)
+{
+ if (test_bit(FAIL_DUP_MOD_BECOMING, &mod_fail->dup_fail_mask) &&
+ test_bit(FAIL_DUP_MOD_LOAD, &mod_fail->dup_fail_mask))
+ return "Becoming & Load";
+ if (test_bit(FAIL_DUP_MOD_BECOMING, &mod_fail->dup_fail_mask))
+ return "Becoming";
+ if (test_bit(FAIL_DUP_MOD_LOAD, &mod_fail->dup_fail_mask))
+ return "Load";
+ return "Bug-on-stats";
+}
+
+void mod_stat_bump_invalid(struct load_info *info, int flags)
+{
+ atomic_long_add(info->len * 2, &invalid_mod_bytes);
+ atomic_inc(&failed_load_modules);
+#if defined(CONFIG_MODULE_DECOMPRESS)
+ if (flags & MODULE_INIT_COMPRESSED_FILE)
+ atomic_long_add(info->compressed_len, &invalid_mod_bytes);
+#endif
+}
+
+void mod_stat_bump_becoming(struct load_info *info, int flags)
+{
+ atomic_inc(&failed_becoming);
+ atomic_long_add(info->len, &invalid_becoming_bytes);
+#if defined(CONFIG_MODULE_DECOMPRESS)
+ if (flags & MODULE_INIT_COMPRESSED_FILE)
+ atomic_long_add(info->compressed_len, &invalid_becoming_bytes);
+#endif
+}
+
+int try_add_failed_module(const char *name, enum fail_dup_mod_reason reason)
+{
+ struct mod_fail_load *mod_fail;
+
+ list_for_each_entry_rcu(mod_fail, &dup_failed_modules, list,
+ lockdep_is_held(&module_mutex)) {
+ if (!strcmp(mod_fail->name, name)) {
+ atomic_long_inc(&mod_fail->count);
+ __set_bit(reason, &mod_fail->dup_fail_mask);
+ goto out;
+ }
+ }
+
+ mod_fail = kzalloc(sizeof(*mod_fail), GFP_KERNEL);
+ if (!mod_fail)
+ return -ENOMEM;
+ memcpy(mod_fail->name, name, strlen(name));
+ __set_bit(reason, &mod_fail->dup_fail_mask);
+ atomic_long_inc(&mod_fail->count);
+ list_add_rcu(&mod_fail->list, &dup_failed_modules);
+out:
+ return 0;
+}
+
+/*
+ * At 64 bytes per module and assuming a 1024 bytes preamble we can fit the
+ * 112 module prints within 8k.
+ *
+ * 1024 + (64*112) = 8k
+ */
+#define MAX_PREAMBLE 1024
+#define MAX_FAILED_MOD_PRINT 112
+#define MAX_BYTES_PER_MOD 64
+static ssize_t read_file_mod_stats(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct mod_fail_load *mod_fail;
+ unsigned int len, size, count_failed = 0;
+ char *buf;
+ int ret;
+ u32 live_mod_count, fkreads, fdecompress, fbecoming, floads;
+ unsigned long total_size, text_size, ikread_bytes, ibecoming_bytes,
+ idecompress_bytes, imod_bytes, total_virtual_lost;
+
+ live_mod_count = atomic_read(&modcount);
+ fkreads = atomic_read(&failed_kreads);
+ fdecompress = atomic_read(&failed_decompress);
+ fbecoming = atomic_read(&failed_becoming);
+ floads = atomic_read(&failed_load_modules);
+
+ total_size = atomic_long_read(&total_mod_size);
+ text_size = atomic_long_read(&total_text_size);
+ ikread_bytes = atomic_long_read(&invalid_kread_bytes);
+ idecompress_bytes = atomic_long_read(&invalid_decompress_bytes);
+ ibecoming_bytes = atomic_long_read(&invalid_becoming_bytes);
+ imod_bytes = atomic_long_read(&invalid_mod_bytes);
+
+ total_virtual_lost = ikread_bytes + idecompress_bytes + ibecoming_bytes + imod_bytes;
+
+ size = MAX_PREAMBLE + min((unsigned int)(floads + fbecoming),
+ (unsigned int)MAX_FAILED_MOD_PRINT) * MAX_BYTES_PER_MOD;
+ buf = kzalloc(size, GFP_KERNEL);
+ if (buf == NULL)
+ return -ENOMEM;
+
+ /* The beginning of our debug preamble */
+ len = scnprintf(buf, size, "%25s\t%u\n", "Mods ever loaded", live_mod_count);
+
+ len += scnprintf(buf + len, size - len, "%25s\t%u\n", "Mods failed on kread", fkreads);
+
+ len += scnprintf(buf + len, size - len, "%25s\t%u\n", "Mods failed on decompress",
+ fdecompress);
+ len += scnprintf(buf + len, size - len, "%25s\t%u\n", "Mods failed on becoming", fbecoming);
+
+ len += scnprintf(buf + len, size - len, "%25s\t%u\n", "Mods failed on load", floads);
+
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Total module size", total_size);
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Total mod text size", text_size);
+
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Failed kread bytes", ikread_bytes);
+
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Failed decompress bytes",
+ idecompress_bytes);
+
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Failed becoming bytes", ibecoming_bytes);
+
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Failed kmod bytes", imod_bytes);
+
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Virtual mem wasted bytes", total_virtual_lost);
+
+ if (live_mod_count && total_size) {
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Average mod size",
+ DIV_ROUND_UP(total_size, live_mod_count));
+ }
+
+ if (live_mod_count && text_size) {
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Average mod text size",
+ DIV_ROUND_UP(text_size, live_mod_count));
+ }
+
+ /*
+ * We use WARN_ON_ONCE() for the counters to ensure we always have parity
+ * for keeping tabs on a type of failure with one type of byte counter.
+ * The counters for imod_bytes does not increase for fkreads failures
+ * for example, and so on.
+ */
+
+ WARN_ON_ONCE(ikread_bytes && !fkreads);
+ if (fkreads && ikread_bytes) {
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Avg fail kread bytes",
+ DIV_ROUND_UP(ikread_bytes, fkreads));
+ }
+
+ WARN_ON_ONCE(ibecoming_bytes && !fbecoming);
+ if (fbecoming && ibecoming_bytes) {
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Avg fail becoming bytes",
+ DIV_ROUND_UP(ibecoming_bytes, fbecoming));
+ }
+
+ WARN_ON_ONCE(idecompress_bytes && !fdecompress);
+ if (fdecompress && idecompress_bytes) {
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Avg fail decomp bytes",
+ DIV_ROUND_UP(idecompress_bytes, fdecompress));
+ }
+
+ WARN_ON_ONCE(imod_bytes && !floads);
+ if (floads && imod_bytes) {
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Average fail load bytes",
+ DIV_ROUND_UP(imod_bytes, floads));
+ }
+
+ /* End of our debug preamble header. */
+
+ /* Catch when we've gone beyond our expected preamble */
+ WARN_ON_ONCE(len >= MAX_PREAMBLE);
+
+ if (list_empty(&dup_failed_modules))
+ goto out;
+
+ len += scnprintf(buf + len, size - len, "Duplicate failed modules:\n");
+ len += scnprintf(buf + len, size - len, "%25s\t%15s\t%25s\n",
+ "Module-name", "How-many-times", "Reason");
+ mutex_lock(&module_mutex);
+
+
+ list_for_each_entry_rcu(mod_fail, &dup_failed_modules, list) {
+ if (WARN_ON_ONCE(++count_failed >= MAX_FAILED_MOD_PRINT))
+ goto out_unlock;
+ len += scnprintf(buf + len, size - len, "%25s\t%15lu\t%25s\n", mod_fail->name,
+ atomic_long_read(&mod_fail->count), mod_fail_to_str(mod_fail));
+ }
+out_unlock:
+ mutex_unlock(&module_mutex);
+out:
+ ret = simple_read_from_buffer(user_buf, count, ppos, buf, len);
+ kfree(buf);
+ return ret;
+}
+#undef MAX_PREAMBLE
+#undef MAX_FAILED_MOD_PRINT
+#undef MAX_BYTES_PER_MOD
+
+static const struct file_operations fops_mod_stats = {
+ .read = read_file_mod_stats,
+ .open = simple_open,
+ .owner = THIS_MODULE,
+ .llseek = default_llseek,
+};
+
+#define mod_debug_add_ulong(name) debugfs_create_ulong(#name, 0400, mod_debugfs_root, (unsigned long *) &name.counter)
+#define mod_debug_add_atomic(name) debugfs_create_atomic_t(#name, 0400, mod_debugfs_root, &name)
+static int __init module_stats_init(void)
+{
+ mod_debug_add_ulong(total_mod_size);
+ mod_debug_add_ulong(total_text_size);
+ mod_debug_add_ulong(invalid_kread_bytes);
+ mod_debug_add_ulong(invalid_decompress_bytes);
+ mod_debug_add_ulong(invalid_becoming_bytes);
+ mod_debug_add_ulong(invalid_mod_bytes);
+
+ mod_debug_add_atomic(modcount);
+ mod_debug_add_atomic(failed_kreads);
+ mod_debug_add_atomic(failed_decompress);
+ mod_debug_add_atomic(failed_becoming);
+ mod_debug_add_atomic(failed_load_modules);
+
+ debugfs_create_file("stats", 0400, mod_debugfs_root, mod_debugfs_root, &fops_mod_stats);
+
+ return 0;
+}
+#undef mod_debug_add_ulong
+#undef mod_debug_add_atomic
+module_init(module_stats_init);
diff --git a/kernel/module/strict_rwx.c b/kernel/module/strict_rwx.c
new file mode 100644
index 000000000000..8fd438529fbc
--- /dev/null
+++ b/kernel/module/strict_rwx.c
@@ -0,0 +1,151 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Module strict rwx
+ *
+ * Copyright (C) 2015 Rusty Russell
+ */
+
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/set_memory.h>
+#include <linux/execmem.h>
+#include "internal.h"
+
+static int module_set_memory(const struct module *mod, enum mod_mem_type type,
+ int (*set_memory)(unsigned long start, int num_pages))
+{
+ const struct module_memory *mod_mem = &mod->mem[type];
+
+ if (!mod_mem->base)
+ return 0;
+
+ set_vm_flush_reset_perms(mod_mem->base);
+ return set_memory((unsigned long)mod_mem->base, mod_mem->size >> PAGE_SHIFT);
+}
+
+/*
+ * Since some arches are moving towards PAGE_KERNEL module allocations instead
+ * of PAGE_KERNEL_EXEC, keep module_enable_x() independent of
+ * CONFIG_STRICT_MODULE_RWX because they are needed regardless of whether we
+ * are strict.
+ */
+int module_enable_text_rox(const struct module *mod)
+{
+ for_class_mod_mem_type(type, text) {
+ const struct module_memory *mem = &mod->mem[type];
+ int ret;
+
+ if (mem->is_rox)
+ ret = execmem_restore_rox(mem->base, mem->size);
+ else if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
+ ret = module_set_memory(mod, type, set_memory_rox);
+ else
+ ret = module_set_memory(mod, type, set_memory_x);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+int module_enable_rodata_ro(const struct module *mod)
+{
+ int ret;
+
+ if (!IS_ENABLED(CONFIG_STRICT_MODULE_RWX) || !rodata_enabled)
+ return 0;
+
+ ret = module_set_memory(mod, MOD_RODATA, set_memory_ro);
+ if (ret)
+ return ret;
+ ret = module_set_memory(mod, MOD_INIT_RODATA, set_memory_ro);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
+int module_enable_rodata_ro_after_init(const struct module *mod)
+{
+ if (!IS_ENABLED(CONFIG_STRICT_MODULE_RWX) || !rodata_enabled)
+ return 0;
+
+ return module_set_memory(mod, MOD_RO_AFTER_INIT, set_memory_ro);
+}
+
+int module_enable_data_nx(const struct module *mod)
+{
+ if (!IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
+ return 0;
+
+ for_class_mod_mem_type(type, data) {
+ int ret = module_set_memory(mod, type, set_memory_nx);
+
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+int module_enforce_rwx_sections(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs,
+ const char *secstrings,
+ const struct module *mod)
+{
+ const unsigned long shf_wx = SHF_WRITE | SHF_EXECINSTR;
+ int i;
+
+ if (!IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
+ return 0;
+
+ for (i = 0; i < hdr->e_shnum; i++) {
+ if ((sechdrs[i].sh_flags & shf_wx) == shf_wx) {
+ pr_err("%s: section %s (index %d) has invalid WRITE|EXEC flags\n",
+ mod->name, secstrings + sechdrs[i].sh_name, i);
+ return -ENOEXEC;
+ }
+ }
+
+ return 0;
+}
+
+static const char *const ro_after_init[] = {
+ /*
+ * Section .data..ro_after_init holds data explicitly annotated by
+ * __ro_after_init.
+ */
+ ".data..ro_after_init",
+
+ /*
+ * Section __jump_table holds data structures that are never modified,
+ * with the exception of entries that refer to code in the __init
+ * section, which are marked as such at module load time.
+ */
+ "__jump_table",
+
+#ifdef CONFIG_HAVE_STATIC_CALL_INLINE
+ /*
+ * Section .static_call_sites holds data structures that need to be
+ * sorted and processed at module load time but are never modified
+ * afterwards.
+ */
+ ".static_call_sites",
+#endif
+};
+
+void module_mark_ro_after_init(const Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
+ const char *secstrings)
+{
+ int i, j;
+
+ for (i = 1; i < hdr->e_shnum; i++) {
+ Elf_Shdr *shdr = &sechdrs[i];
+
+ for (j = 0; j < ARRAY_SIZE(ro_after_init); j++) {
+ if (strcmp(secstrings + shdr->sh_name,
+ ro_after_init[j]) == 0) {
+ shdr->sh_flags |= SHF_RO_AFTER_INIT;
+ break;
+ }
+ }
+ }
+}
diff --git a/kernel/module/sysfs.c b/kernel/module/sysfs.c
new file mode 100644
index 000000000000..c7622ff5226a
--- /dev/null
+++ b/kernel/module/sysfs.c
@@ -0,0 +1,440 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Module sysfs support
+ *
+ * Copyright (C) 2008 Rusty Russell
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/sysfs.h>
+#include <linux/slab.h>
+#include <linux/kallsyms.h>
+#include <linux/mutex.h>
+#include "internal.h"
+
+/*
+ * /sys/module/foo/sections stuff
+ * J. Corbet <corbet@lwn.net>
+ */
+#ifdef CONFIG_KALLSYMS
+struct module_sect_attrs {
+ struct attribute_group grp;
+ struct bin_attribute attrs[];
+};
+
+#define MODULE_SECT_READ_SIZE (3 /* "0x", "\n" */ + (BITS_PER_LONG / 4))
+static ssize_t module_sect_read(struct file *file, struct kobject *kobj,
+ const struct bin_attribute *battr,
+ char *buf, loff_t pos, size_t count)
+{
+ char bounce[MODULE_SECT_READ_SIZE + 1];
+ size_t wrote;
+
+ if (pos != 0)
+ return -EINVAL;
+
+ /*
+ * Since we're a binary read handler, we must account for the
+ * trailing NUL byte that sprintf will write: if "buf" is
+ * too small to hold the NUL, or the NUL is exactly the last
+ * byte, the read will look like it got truncated by one byte.
+ * Since there is no way to ask sprintf nicely to not write
+ * the NUL, we have to use a bounce buffer.
+ */
+ wrote = scnprintf(bounce, sizeof(bounce), "0x%px\n",
+ kallsyms_show_value(file->f_cred)
+ ? battr->private : NULL);
+ count = min(count, wrote);
+ memcpy(buf, bounce, count);
+
+ return count;
+}
+
+static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
+{
+ const struct bin_attribute *const *bin_attr;
+
+ for (bin_attr = sect_attrs->grp.bin_attrs; *bin_attr; bin_attr++)
+ kfree((*bin_attr)->attr.name);
+ kfree(sect_attrs->grp.bin_attrs);
+ kfree(sect_attrs);
+}
+
+static int add_sect_attrs(struct module *mod, const struct load_info *info)
+{
+ struct module_sect_attrs *sect_attrs;
+ const struct bin_attribute **gattr;
+ struct bin_attribute *sattr;
+ unsigned int nloaded = 0, i;
+ int ret;
+
+ /* Count loaded sections and allocate structures */
+ for (i = 0; i < info->hdr->e_shnum; i++)
+ if (!sect_empty(&info->sechdrs[i]))
+ nloaded++;
+ sect_attrs = kzalloc(struct_size(sect_attrs, attrs, nloaded), GFP_KERNEL);
+ if (!sect_attrs)
+ return -ENOMEM;
+
+ gattr = kcalloc(nloaded + 1, sizeof(*gattr), GFP_KERNEL);
+ if (!gattr) {
+ kfree(sect_attrs);
+ return -ENOMEM;
+ }
+
+ /* Setup section attributes. */
+ sect_attrs->grp.name = "sections";
+ sect_attrs->grp.bin_attrs = gattr;
+
+ sattr = &sect_attrs->attrs[0];
+ for (i = 0; i < info->hdr->e_shnum; i++) {
+ Elf_Shdr *sec = &info->sechdrs[i];
+
+ if (sect_empty(sec))
+ continue;
+ sysfs_bin_attr_init(sattr);
+ sattr->attr.name =
+ kstrdup(info->secstrings + sec->sh_name, GFP_KERNEL);
+ if (!sattr->attr.name) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ sattr->read = module_sect_read;
+ sattr->private = (void *)sec->sh_addr;
+ sattr->size = MODULE_SECT_READ_SIZE;
+ sattr->attr.mode = 0400;
+ *(gattr++) = sattr++;
+ }
+
+ ret = sysfs_create_group(&mod->mkobj.kobj, &sect_attrs->grp);
+ if (ret)
+ goto out;
+
+ mod->sect_attrs = sect_attrs;
+ return 0;
+out:
+ free_sect_attrs(sect_attrs);
+ return ret;
+}
+
+static void remove_sect_attrs(struct module *mod)
+{
+ if (mod->sect_attrs) {
+ sysfs_remove_group(&mod->mkobj.kobj,
+ &mod->sect_attrs->grp);
+ /*
+ * We are positive that no one is using any sect attrs
+ * at this point. Deallocate immediately.
+ */
+ free_sect_attrs(mod->sect_attrs);
+ mod->sect_attrs = NULL;
+ }
+}
+
+/*
+ * /sys/module/foo/notes/.section.name gives contents of SHT_NOTE sections.
+ */
+
+struct module_notes_attrs {
+ struct attribute_group grp;
+ struct bin_attribute attrs[];
+};
+
+static void free_notes_attrs(struct module_notes_attrs *notes_attrs)
+{
+ kfree(notes_attrs->grp.bin_attrs);
+ kfree(notes_attrs);
+}
+
+static int add_notes_attrs(struct module *mod, const struct load_info *info)
+{
+ unsigned int notes, loaded, i;
+ struct module_notes_attrs *notes_attrs;
+ const struct bin_attribute **gattr;
+ struct bin_attribute *nattr;
+ int ret;
+
+ /* Count notes sections and allocate structures. */
+ notes = 0;
+ for (i = 0; i < info->hdr->e_shnum; i++)
+ if (!sect_empty(&info->sechdrs[i]) &&
+ info->sechdrs[i].sh_type == SHT_NOTE)
+ ++notes;
+
+ if (notes == 0)
+ return 0;
+
+ notes_attrs = kzalloc(struct_size(notes_attrs, attrs, notes),
+ GFP_KERNEL);
+ if (!notes_attrs)
+ return -ENOMEM;
+
+ gattr = kcalloc(notes + 1, sizeof(*gattr), GFP_KERNEL);
+ if (!gattr) {
+ kfree(notes_attrs);
+ return -ENOMEM;
+ }
+
+ notes_attrs->grp.name = "notes";
+ notes_attrs->grp.bin_attrs = gattr;
+
+ nattr = &notes_attrs->attrs[0];
+ for (loaded = i = 0; i < info->hdr->e_shnum; ++i) {
+ if (sect_empty(&info->sechdrs[i]))
+ continue;
+ if (info->sechdrs[i].sh_type == SHT_NOTE) {
+ sysfs_bin_attr_init(nattr);
+ nattr->attr.name = mod->sect_attrs->attrs[loaded].attr.name;
+ nattr->attr.mode = 0444;
+ nattr->size = info->sechdrs[i].sh_size;
+ nattr->private = (void *)info->sechdrs[i].sh_addr;
+ nattr->read = sysfs_bin_attr_simple_read;
+ *(gattr++) = nattr++;
+ }
+ ++loaded;
+ }
+
+ ret = sysfs_create_group(&mod->mkobj.kobj, &notes_attrs->grp);
+ if (ret)
+ goto out;
+
+ mod->notes_attrs = notes_attrs;
+ return 0;
+
+out:
+ free_notes_attrs(notes_attrs);
+ return ret;
+}
+
+static void remove_notes_attrs(struct module *mod)
+{
+ if (mod->notes_attrs) {
+ sysfs_remove_group(&mod->mkobj.kobj,
+ &mod->notes_attrs->grp);
+ /*
+ * We are positive that no one is using any notes attrs
+ * at this point. Deallocate immediately.
+ */
+ free_notes_attrs(mod->notes_attrs);
+ mod->notes_attrs = NULL;
+ }
+}
+
+#else /* !CONFIG_KALLSYMS */
+static inline int add_sect_attrs(struct module *mod, const struct load_info *info)
+{
+ return 0;
+}
+static inline void remove_sect_attrs(struct module *mod) { }
+static inline int add_notes_attrs(struct module *mod, const struct load_info *info)
+{
+ return 0;
+}
+static inline void remove_notes_attrs(struct module *mod) { }
+#endif /* CONFIG_KALLSYMS */
+
+static void del_usage_links(struct module *mod)
+{
+#ifdef CONFIG_MODULE_UNLOAD
+ struct module_use *use;
+
+ mutex_lock(&module_mutex);
+ list_for_each_entry(use, &mod->target_list, target_list)
+ sysfs_remove_link(use->target->holders_dir, mod->name);
+ mutex_unlock(&module_mutex);
+#endif
+}
+
+static int add_usage_links(struct module *mod)
+{
+ int ret = 0;
+#ifdef CONFIG_MODULE_UNLOAD
+ struct module_use *use;
+
+ mutex_lock(&module_mutex);
+ list_for_each_entry(use, &mod->target_list, target_list) {
+ ret = sysfs_create_link(use->target->holders_dir,
+ &mod->mkobj.kobj, mod->name);
+ if (ret)
+ break;
+ }
+ mutex_unlock(&module_mutex);
+ if (ret)
+ del_usage_links(mod);
+#endif
+ return ret;
+}
+
+static void module_remove_modinfo_attrs(struct module *mod, int end)
+{
+ const struct module_attribute *attr;
+ int i;
+
+ for (i = 0; (attr = &mod->modinfo_attrs[i]); i++) {
+ if (end >= 0 && i > end)
+ break;
+ /* pick a field to test for end of list */
+ if (!attr->attr.name)
+ break;
+ sysfs_remove_file(&mod->mkobj.kobj, &attr->attr);
+ if (attr->free)
+ attr->free(mod);
+ }
+ kfree(mod->modinfo_attrs);
+}
+
+static int module_add_modinfo_attrs(struct module *mod)
+{
+ const struct module_attribute *attr;
+ struct module_attribute *temp_attr;
+ int error = 0;
+ int i;
+
+ mod->modinfo_attrs = kzalloc((sizeof(struct module_attribute) *
+ (modinfo_attrs_count + 1)),
+ GFP_KERNEL);
+ if (!mod->modinfo_attrs)
+ return -ENOMEM;
+
+ temp_attr = mod->modinfo_attrs;
+ for (i = 0; (attr = modinfo_attrs[i]); i++) {
+ if (!attr->test || attr->test(mod)) {
+ memcpy(temp_attr, attr, sizeof(*temp_attr));
+ sysfs_attr_init(&temp_attr->attr);
+ error = sysfs_create_file(&mod->mkobj.kobj,
+ &temp_attr->attr);
+ if (error)
+ goto error_out;
+ ++temp_attr;
+ }
+ }
+
+ return 0;
+
+error_out:
+ if (i > 0)
+ module_remove_modinfo_attrs(mod, --i);
+ else
+ kfree(mod->modinfo_attrs);
+ return error;
+}
+
+static void mod_kobject_put(struct module *mod)
+{
+ DECLARE_COMPLETION_ONSTACK(c);
+
+ mod->mkobj.kobj_completion = &c;
+ kobject_put(&mod->mkobj.kobj);
+ wait_for_completion(&c);
+}
+
+static int mod_sysfs_init(struct module *mod)
+{
+ int err;
+ struct kobject *kobj;
+
+ if (!module_kset) {
+ pr_err("%s: module sysfs not initialized\n", mod->name);
+ err = -EINVAL;
+ goto out;
+ }
+
+ kobj = kset_find_obj(module_kset, mod->name);
+ if (kobj) {
+ pr_err("%s: module is already loaded\n", mod->name);
+ kobject_put(kobj);
+ err = -EINVAL;
+ goto out;
+ }
+
+ mod->mkobj.mod = mod;
+
+ memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj));
+ mod->mkobj.kobj.kset = module_kset;
+ err = kobject_init_and_add(&mod->mkobj.kobj, &module_ktype, NULL,
+ "%s", mod->name);
+ if (err)
+ mod_kobject_put(mod);
+
+out:
+ return err;
+}
+
+int mod_sysfs_setup(struct module *mod,
+ const struct load_info *info,
+ struct kernel_param *kparam,
+ unsigned int num_params)
+{
+ int err;
+
+ err = mod_sysfs_init(mod);
+ if (err)
+ goto out;
+
+ mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj);
+ if (!mod->holders_dir) {
+ err = -ENOMEM;
+ goto out_unreg;
+ }
+
+ err = module_param_sysfs_setup(mod, kparam, num_params);
+ if (err)
+ goto out_unreg_holders;
+
+ err = module_add_modinfo_attrs(mod);
+ if (err)
+ goto out_unreg_param;
+
+ err = add_usage_links(mod);
+ if (err)
+ goto out_unreg_modinfo_attrs;
+
+ err = add_sect_attrs(mod, info);
+ if (err)
+ goto out_del_usage_links;
+
+ err = add_notes_attrs(mod, info);
+ if (err)
+ goto out_unreg_sect_attrs;
+
+ return 0;
+
+out_unreg_sect_attrs:
+ remove_sect_attrs(mod);
+out_del_usage_links:
+ del_usage_links(mod);
+out_unreg_modinfo_attrs:
+ module_remove_modinfo_attrs(mod, -1);
+out_unreg_param:
+ module_param_sysfs_remove(mod);
+out_unreg_holders:
+ kobject_put(mod->holders_dir);
+out_unreg:
+ mod_kobject_put(mod);
+out:
+ return err;
+}
+
+static void mod_sysfs_fini(struct module *mod)
+{
+ remove_notes_attrs(mod);
+ remove_sect_attrs(mod);
+ mod_kobject_put(mod);
+}
+
+void mod_sysfs_teardown(struct module *mod)
+{
+ del_usage_links(mod);
+ module_remove_modinfo_attrs(mod, -1);
+ module_param_sysfs_remove(mod);
+ kobject_put(mod->mkobj.drivers_dir);
+ kobject_put(mod->holders_dir);
+ mod_sysfs_fini(mod);
+}
+
+void init_param_lock(struct module *mod)
+{
+ mutex_init(&mod->param_lock);
+}
diff --git a/kernel/module/tracking.c b/kernel/module/tracking.c
new file mode 100644
index 000000000000..4fefec5b683c
--- /dev/null
+++ b/kernel/module/tracking.c
@@ -0,0 +1,127 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Module taint unload tracking support
+ *
+ * Copyright (C) 2022 Aaron Tomlin
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/debugfs.h>
+#include <linux/rculist.h>
+#include "internal.h"
+
+static LIST_HEAD(unloaded_tainted_modules);
+extern struct dentry *mod_debugfs_root;
+
+int try_add_tainted_module(struct module *mod)
+{
+ struct mod_unload_taint *mod_taint;
+
+ if (!mod->taints)
+ goto out;
+
+ list_for_each_entry_rcu(mod_taint, &unloaded_tainted_modules, list,
+ lockdep_is_held(&module_mutex)) {
+ if (!strcmp(mod_taint->name, mod->name) &&
+ mod_taint->taints & mod->taints) {
+ mod_taint->count++;
+ goto out;
+ }
+ }
+
+ mod_taint = kmalloc(sizeof(*mod_taint), GFP_KERNEL);
+ if (unlikely(!mod_taint))
+ return -ENOMEM;
+ strscpy(mod_taint->name, mod->name, MODULE_NAME_LEN);
+ mod_taint->taints = mod->taints;
+ list_add_rcu(&mod_taint->list, &unloaded_tainted_modules);
+ mod_taint->count = 1;
+out:
+ return 0;
+}
+
+void print_unloaded_tainted_modules(void)
+{
+ struct mod_unload_taint *mod_taint;
+ char buf[MODULE_FLAGS_BUF_SIZE];
+
+ if (!list_empty(&unloaded_tainted_modules)) {
+ printk(KERN_DEFAULT "Unloaded tainted modules:");
+ list_for_each_entry_rcu(mod_taint, &unloaded_tainted_modules,
+ list) {
+ size_t l;
+
+ l = module_flags_taint(mod_taint->taints, buf);
+ buf[l++] = '\0';
+ pr_cont(" %s(%s):%llu", mod_taint->name, buf,
+ mod_taint->count);
+ }
+ }
+}
+
+#ifdef CONFIG_DEBUG_FS
+static void *unloaded_tainted_modules_seq_start(struct seq_file *m, loff_t *pos)
+ __acquires(rcu)
+{
+ rcu_read_lock();
+ return seq_list_start_rcu(&unloaded_tainted_modules, *pos);
+}
+
+static void *unloaded_tainted_modules_seq_next(struct seq_file *m, void *p, loff_t *pos)
+{
+ return seq_list_next_rcu(p, &unloaded_tainted_modules, pos);
+}
+
+static void unloaded_tainted_modules_seq_stop(struct seq_file *m, void *p)
+ __releases(rcu)
+{
+ rcu_read_unlock();
+}
+
+static int unloaded_tainted_modules_seq_show(struct seq_file *m, void *p)
+{
+ struct mod_unload_taint *mod_taint;
+ char buf[MODULE_FLAGS_BUF_SIZE];
+ size_t l;
+
+ mod_taint = list_entry(p, struct mod_unload_taint, list);
+ l = module_flags_taint(mod_taint->taints, buf);
+ buf[l++] = '\0';
+
+ seq_printf(m, "%s (%s) %llu", mod_taint->name, buf, mod_taint->count);
+ seq_puts(m, "\n");
+
+ return 0;
+}
+
+static const struct seq_operations unloaded_tainted_modules_seq_ops = {
+ .start = unloaded_tainted_modules_seq_start,
+ .next = unloaded_tainted_modules_seq_next,
+ .stop = unloaded_tainted_modules_seq_stop,
+ .show = unloaded_tainted_modules_seq_show,
+};
+
+static int unloaded_tainted_modules_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &unloaded_tainted_modules_seq_ops);
+}
+
+static const struct file_operations unloaded_tainted_modules_fops = {
+ .open = unloaded_tainted_modules_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static int __init unloaded_tainted_modules_init(void)
+{
+ debugfs_create_file("unloaded_tainted", 0444, mod_debugfs_root, NULL,
+ &unloaded_tainted_modules_fops);
+ return 0;
+}
+module_init(unloaded_tainted_modules_init);
+#endif /* CONFIG_DEBUG_FS */
diff --git a/kernel/module/tree_lookup.c b/kernel/module/tree_lookup.c
new file mode 100644
index 000000000000..f8e8c126705c
--- /dev/null
+++ b/kernel/module/tree_lookup.c
@@ -0,0 +1,112 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Modules tree lookup
+ *
+ * Copyright (C) 2015 Peter Zijlstra
+ * Copyright (C) 2015 Rusty Russell
+ */
+
+#include <linux/module.h>
+#include <linux/rbtree_latch.h>
+#include "internal.h"
+
+/*
+ * Use a latched RB-tree for __module_address(); this allows us to use
+ * RCU lookups of the address from any context.
+ *
+ * This is conditional on PERF_EVENTS || TRACING || CFI because those can
+ * really hit __module_address() hard by doing a lot of stack unwinding;
+ * potentially from NMI context.
+ */
+
+static __always_inline unsigned long __mod_tree_val(struct latch_tree_node *n)
+{
+ struct module_memory *mod_mem = container_of(n, struct module_memory, mtn.node);
+
+ return (unsigned long)mod_mem->base;
+}
+
+static __always_inline unsigned long __mod_tree_size(struct latch_tree_node *n)
+{
+ struct module_memory *mod_mem = container_of(n, struct module_memory, mtn.node);
+
+ return (unsigned long)mod_mem->size;
+}
+
+static __always_inline bool
+mod_tree_less(struct latch_tree_node *a, struct latch_tree_node *b)
+{
+ return __mod_tree_val(a) < __mod_tree_val(b);
+}
+
+static __always_inline int
+mod_tree_comp(void *key, struct latch_tree_node *n)
+{
+ unsigned long val = (unsigned long)key;
+ unsigned long start, end;
+
+ start = __mod_tree_val(n);
+ if (val < start)
+ return -1;
+
+ end = start + __mod_tree_size(n);
+ if (val >= end)
+ return 1;
+
+ return 0;
+}
+
+static const struct latch_tree_ops mod_tree_ops = {
+ .less = mod_tree_less,
+ .comp = mod_tree_comp,
+};
+
+static noinline void __mod_tree_insert(struct mod_tree_node *node, struct mod_tree_root *tree)
+{
+ latch_tree_insert(&node->node, &tree->root, &mod_tree_ops);
+}
+
+static void __mod_tree_remove(struct mod_tree_node *node, struct mod_tree_root *tree)
+{
+ latch_tree_erase(&node->node, &tree->root, &mod_tree_ops);
+}
+
+/*
+ * These modifications: insert, remove_init and remove; are serialized by the
+ * module_mutex.
+ */
+void mod_tree_insert(struct module *mod)
+{
+ for_each_mod_mem_type(type) {
+ mod->mem[type].mtn.mod = mod;
+ if (mod->mem[type].size)
+ __mod_tree_insert(&mod->mem[type].mtn, &mod_tree);
+ }
+}
+
+void mod_tree_remove_init(struct module *mod)
+{
+ for_class_mod_mem_type(type, init) {
+ if (mod->mem[type].size)
+ __mod_tree_remove(&mod->mem[type].mtn, &mod_tree);
+ }
+}
+
+void mod_tree_remove(struct module *mod)
+{
+ for_each_mod_mem_type(type) {
+ if (mod->mem[type].size)
+ __mod_tree_remove(&mod->mem[type].mtn, &mod_tree);
+ }
+}
+
+struct module *mod_find(unsigned long addr, struct mod_tree_root *tree)
+{
+ struct latch_tree_node *ltn;
+
+ ltn = latch_tree_find((void *)addr, &tree->root, &mod_tree_ops);
+ if (!ltn)
+ return NULL;
+
+ return container_of(ltn, struct mod_tree_node, node)->mod;
+}
diff --git a/kernel/module/version.c b/kernel/module/version.c
new file mode 100644
index 000000000000..2beefeba82d9
--- /dev/null
+++ b/kernel/module/version.c
@@ -0,0 +1,146 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Module version support
+ *
+ * Copyright (C) 2008 Rusty Russell
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/printk.h>
+#include "internal.h"
+
+int check_version(const struct load_info *info,
+ const char *symname,
+ struct module *mod,
+ const u32 *crc)
+{
+ Elf_Shdr *sechdrs = info->sechdrs;
+ unsigned int versindex = info->index.vers;
+ unsigned int i, num_versions;
+ struct modversion_info *versions;
+ struct modversion_info_ext version_ext;
+
+ /* Exporting module didn't supply crcs? OK, we're already tainted. */
+ if (!crc)
+ return 1;
+
+ /* If we have extended version info, rely on it */
+ if (info->index.vers_ext_crc) {
+ for_each_modversion_info_ext(version_ext, info) {
+ if (strcmp(version_ext.name, symname) != 0)
+ continue;
+ if (*version_ext.crc == *crc)
+ return 1;
+ pr_debug("Found checksum %X vs module %X\n",
+ *crc, *version_ext.crc);
+ goto bad_version;
+ }
+ pr_warn_once("%s: no extended symbol version for %s\n",
+ info->name, symname);
+ return 1;
+ }
+
+ /* No versions at all? modprobe --force does this. */
+ if (versindex == 0)
+ return try_to_force_load(mod, symname) == 0;
+
+ versions = (void *)sechdrs[versindex].sh_addr;
+ num_versions = sechdrs[versindex].sh_size
+ / sizeof(struct modversion_info);
+
+ for (i = 0; i < num_versions; i++) {
+ u32 crcval;
+
+ if (strcmp(versions[i].name, symname) != 0)
+ continue;
+
+ crcval = *crc;
+ if (versions[i].crc == crcval)
+ return 1;
+ pr_debug("Found checksum %X vs module %lX\n",
+ crcval, versions[i].crc);
+ goto bad_version;
+ }
+
+ /* Broken toolchain. Warn once, then let it go.. */
+ pr_warn_once("%s: no symbol version for %s\n", info->name, symname);
+ return 1;
+
+bad_version:
+ pr_warn("%s: disagrees about version of symbol %s\n", info->name, symname);
+ return 0;
+}
+
+int check_modstruct_version(const struct load_info *info,
+ struct module *mod)
+{
+ struct find_symbol_arg fsa = {
+ .name = "module_layout",
+ .gplok = true,
+ };
+ bool have_symbol;
+
+ /*
+ * Since this should be found in kernel (which can't be removed), no
+ * locking is necessary. Regardless use a RCU read section to keep
+ * lockdep happy.
+ */
+ scoped_guard(rcu)
+ have_symbol = find_symbol(&fsa);
+ BUG_ON(!have_symbol);
+
+ return check_version(info, "module_layout", mod, fsa.crc);
+}
+
+/* First part is kernel version, which we ignore if module has crcs. */
+int same_magic(const char *amagic, const char *bmagic,
+ bool has_crcs)
+{
+ if (has_crcs) {
+ amagic += strcspn(amagic, " ");
+ bmagic += strcspn(bmagic, " ");
+ }
+ return strcmp(amagic, bmagic) == 0;
+}
+
+void modversion_ext_start(const struct load_info *info,
+ struct modversion_info_ext *start)
+{
+ unsigned int crc_idx = info->index.vers_ext_crc;
+ unsigned int name_idx = info->index.vers_ext_name;
+ Elf_Shdr *sechdrs = info->sechdrs;
+
+ /*
+ * Both of these fields are needed for this to be useful
+ * Any future fields should be initialized to NULL if absent.
+ */
+ if (crc_idx == 0 || name_idx == 0) {
+ start->remaining = 0;
+ return;
+ }
+
+ start->crc = (const u32 *)sechdrs[crc_idx].sh_addr;
+ start->name = (const char *)sechdrs[name_idx].sh_addr;
+ start->remaining = sechdrs[crc_idx].sh_size / sizeof(*start->crc);
+}
+
+void modversion_ext_advance(struct modversion_info_ext *vers)
+{
+ vers->remaining--;
+ vers->crc++;
+ vers->name += strlen(vers->name) + 1;
+}
+
+/*
+ * Generate the signature for all relevant module structures here.
+ * If these change, we don't want to try to parse the module.
+ */
+void module_layout(struct module *mod,
+ struct modversion_info *ver,
+ struct kernel_param *kp,
+ struct kernel_symbol *ks,
+ struct tracepoint * const *tp)
+{
+}
+EXPORT_SYMBOL(module_layout);
diff --git a/kernel/module_signature.c b/kernel/module_signature.c
new file mode 100644
index 000000000000..00132d12487c
--- /dev/null
+++ b/kernel/module_signature.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Module signature checker
+ *
+ * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/errno.h>
+#include <linux/printk.h>
+#include <linux/module_signature.h>
+#include <asm/byteorder.h>
+
+/**
+ * mod_check_sig - check that the given signature is sane
+ *
+ * @ms: Signature to check.
+ * @file_len: Size of the file to which @ms is appended.
+ * @name: What is being checked. Used for error messages.
+ */
+int mod_check_sig(const struct module_signature *ms, size_t file_len,
+ const char *name)
+{
+ if (be32_to_cpu(ms->sig_len) >= file_len - sizeof(*ms))
+ return -EBADMSG;
+
+ if (ms->id_type != PKEY_ID_PKCS7) {
+ pr_err("%s: not signed with expected PKCS#7 message\n",
+ name);
+ return -ENOPKG;
+ }
+
+ if (ms->algo != 0 ||
+ ms->hash != 0 ||
+ ms->signer_len != 0 ||
+ ms->key_id_len != 0 ||
+ ms->__pad[0] != 0 ||
+ ms->__pad[1] != 0 ||
+ ms->__pad[2] != 0) {
+ pr_err("%s: PKCS#7 signature info has unexpected non-zero params\n",
+ name);
+ return -EBADMSG;
+ }
+
+ return 0;
+}
diff --git a/kernel/module_signing.c b/kernel/module_signing.c
deleted file mode 100644
index be5b8fac4bd0..000000000000
--- a/kernel/module_signing.c
+++ /dev/null
@@ -1,250 +0,0 @@
-/* Module signature checker
- *
- * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
- */
-
-#include <linux/kernel.h>
-#include <linux/err.h>
-#include <crypto/public_key.h>
-#include <crypto/hash.h>
-#include <keys/asymmetric-type.h>
-#include <keys/system_keyring.h>
-#include "module-internal.h"
-
-/*
- * Module signature information block.
- *
- * The constituents of the signature section are, in order:
- *
- * - Signer's name
- * - Key identifier
- * - Signature data
- * - Information block
- */
-struct module_signature {
- u8 algo; /* Public-key crypto algorithm [enum pkey_algo] */
- u8 hash; /* Digest algorithm [enum hash_algo] */
- u8 id_type; /* Key identifier type [enum pkey_id_type] */
- u8 signer_len; /* Length of signer's name */
- u8 key_id_len; /* Length of key identifier */
- u8 __pad[3];
- __be32 sig_len; /* Length of signature data */
-};
-
-/*
- * Digest the module contents.
- */
-static struct public_key_signature *mod_make_digest(enum hash_algo hash,
- const void *mod,
- unsigned long modlen)
-{
- struct public_key_signature *pks;
- struct crypto_shash *tfm;
- struct shash_desc *desc;
- size_t digest_size, desc_size;
- int ret;
-
- pr_devel("==>%s()\n", __func__);
-
- /* Allocate the hashing algorithm we're going to need and find out how
- * big the hash operational data will be.
- */
- tfm = crypto_alloc_shash(hash_algo_name[hash], 0, 0);
- if (IS_ERR(tfm))
- return (PTR_ERR(tfm) == -ENOENT) ? ERR_PTR(-ENOPKG) : ERR_CAST(tfm);
-
- desc_size = crypto_shash_descsize(tfm) + sizeof(*desc);
- digest_size = crypto_shash_digestsize(tfm);
-
- /* We allocate the hash operational data storage on the end of our
- * context data and the digest output buffer on the end of that.
- */
- ret = -ENOMEM;
- pks = kzalloc(digest_size + sizeof(*pks) + desc_size, GFP_KERNEL);
- if (!pks)
- goto error_no_pks;
-
- pks->pkey_hash_algo = hash;
- pks->digest = (u8 *)pks + sizeof(*pks) + desc_size;
- pks->digest_size = digest_size;
-
- desc = (void *)pks + sizeof(*pks);
- desc->tfm = tfm;
- desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
-
- ret = crypto_shash_init(desc);
- if (ret < 0)
- goto error;
-
- ret = crypto_shash_finup(desc, mod, modlen, pks->digest);
- if (ret < 0)
- goto error;
-
- crypto_free_shash(tfm);
- pr_devel("<==%s() = ok\n", __func__);
- return pks;
-
-error:
- kfree(pks);
-error_no_pks:
- crypto_free_shash(tfm);
- pr_devel("<==%s() = %d\n", __func__, ret);
- return ERR_PTR(ret);
-}
-
-/*
- * Extract an MPI array from the signature data. This represents the actual
- * signature. Each raw MPI is prefaced by a BE 2-byte value indicating the
- * size of the MPI in bytes.
- *
- * RSA signatures only have one MPI, so currently we only read one.
- */
-static int mod_extract_mpi_array(struct public_key_signature *pks,
- const void *data, size_t len)
-{
- size_t nbytes;
- MPI mpi;
-
- if (len < 3)
- return -EBADMSG;
- nbytes = ((const u8 *)data)[0] << 8 | ((const u8 *)data)[1];
- data += 2;
- len -= 2;
- if (len != nbytes)
- return -EBADMSG;
-
- mpi = mpi_read_raw_data(data, nbytes);
- if (!mpi)
- return -ENOMEM;
- pks->mpi[0] = mpi;
- pks->nr_mpi = 1;
- return 0;
-}
-
-/*
- * Request an asymmetric key.
- */
-static struct key *request_asymmetric_key(const char *signer, size_t signer_len,
- const u8 *key_id, size_t key_id_len)
-{
- key_ref_t key;
- size_t i;
- char *id, *q;
-
- pr_devel("==>%s(,%zu,,%zu)\n", __func__, signer_len, key_id_len);
-
- /* Construct an identifier. */
- id = kmalloc(signer_len + 2 + key_id_len * 2 + 1, GFP_KERNEL);
- if (!id)
- return ERR_PTR(-ENOKEY);
-
- memcpy(id, signer, signer_len);
-
- q = id + signer_len;
- *q++ = ':';
- *q++ = ' ';
- for (i = 0; i < key_id_len; i++) {
- *q++ = hex_asc[*key_id >> 4];
- *q++ = hex_asc[*key_id++ & 0x0f];
- }
-
- *q = 0;
-
- pr_debug("Look up: \"%s\"\n", id);
-
- key = keyring_search(make_key_ref(system_trusted_keyring, 1),
- &key_type_asymmetric, id);
- if (IS_ERR(key))
- pr_warn("Request for unknown module key '%s' err %ld\n",
- id, PTR_ERR(key));
- kfree(id);
-
- if (IS_ERR(key)) {
- switch (PTR_ERR(key)) {
- /* Hide some search errors */
- case -EACCES:
- case -ENOTDIR:
- case -EAGAIN:
- return ERR_PTR(-ENOKEY);
- default:
- return ERR_CAST(key);
- }
- }
-
- pr_devel("<==%s() = 0 [%x]\n", __func__, key_serial(key_ref_to_ptr(key)));
- return key_ref_to_ptr(key);
-}
-
-/*
- * Verify the signature on a module.
- */
-int mod_verify_sig(const void *mod, unsigned long *_modlen)
-{
- struct public_key_signature *pks;
- struct module_signature ms;
- struct key *key;
- const void *sig;
- size_t modlen = *_modlen, sig_len;
- int ret;
-
- pr_devel("==>%s(,%zu)\n", __func__, modlen);
-
- if (modlen <= sizeof(ms))
- return -EBADMSG;
-
- memcpy(&ms, mod + (modlen - sizeof(ms)), sizeof(ms));
- modlen -= sizeof(ms);
-
- sig_len = be32_to_cpu(ms.sig_len);
- if (sig_len >= modlen)
- return -EBADMSG;
- modlen -= sig_len;
- if ((size_t)ms.signer_len + ms.key_id_len >= modlen)
- return -EBADMSG;
- modlen -= (size_t)ms.signer_len + ms.key_id_len;
-
- *_modlen = modlen;
- sig = mod + modlen;
-
- /* For the moment, only support RSA and X.509 identifiers */
- if (ms.algo != PKEY_ALGO_RSA ||
- ms.id_type != PKEY_ID_X509)
- return -ENOPKG;
-
- if (ms.hash >= PKEY_HASH__LAST ||
- !hash_algo_name[ms.hash])
- return -ENOPKG;
-
- key = request_asymmetric_key(sig, ms.signer_len,
- sig + ms.signer_len, ms.key_id_len);
- if (IS_ERR(key))
- return PTR_ERR(key);
-
- pks = mod_make_digest(ms.hash, mod, modlen);
- if (IS_ERR(pks)) {
- ret = PTR_ERR(pks);
- goto error_put_key;
- }
-
- ret = mod_extract_mpi_array(pks, sig + ms.signer_len + ms.key_id_len,
- sig_len);
- if (ret < 0)
- goto error_free_pks;
-
- ret = verify_signature(key, pks);
- pr_devel("verify_signature() = %d\n", ret);
-
-error_free_pks:
- mpi_free(pks->rsa.s);
- kfree(pks);
-error_put_key:
- key_put(key);
- pr_devel("<==%s() = %d\n", __func__, ret);
- return ret;
-}
diff --git a/kernel/notifier.c b/kernel/notifier.c
index ae9fc7cc360e..2f9fe7c30287 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -1,17 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kdebug.h>
#include <linux/kprobes.h>
#include <linux/export.h>
#include <linux/notifier.h>
#include <linux/rcupdate.h>
#include <linux/vmalloc.h>
-#include <linux/reboot.h>
-/*
- * Notifier list for kernel code which wants to be called
- * at shutdown. This is used to stop any idling DMA operations
- * and the like.
- */
-BLOCKING_NOTIFIER_HEAD(reboot_notifier_list);
+#define CREATE_TRACE_POINTS
+#include <trace/events/notifier.h>
/*
* Notifier chain core routines. The exported routines below
@@ -19,30 +15,24 @@ BLOCKING_NOTIFIER_HEAD(reboot_notifier_list);
*/
static int notifier_chain_register(struct notifier_block **nl,
- struct notifier_block *n)
-{
- while ((*nl) != NULL) {
- if (n->priority > (*nl)->priority)
- break;
- nl = &((*nl)->next);
- }
- n->next = *nl;
- rcu_assign_pointer(*nl, n);
- return 0;
-}
-
-static int notifier_chain_cond_register(struct notifier_block **nl,
- struct notifier_block *n)
+ struct notifier_block *n,
+ bool unique_priority)
{
while ((*nl) != NULL) {
- if ((*nl) == n)
- return 0;
+ if (unlikely((*nl) == n)) {
+ WARN(1, "notifier callback %ps already registered",
+ n->notifier_call);
+ return -EEXIST;
+ }
if (n->priority > (*nl)->priority)
break;
+ if (n->priority == (*nl)->priority && unique_priority)
+ return -EBUSY;
nl = &((*nl)->next);
}
n->next = *nl;
rcu_assign_pointer(*nl, n);
+ trace_notifier_register((void *)n->notifier_call);
return 0;
}
@@ -52,6 +42,7 @@ static int notifier_chain_unregister(struct notifier_block **nl,
while ((*nl) != NULL) {
if ((*nl) == n) {
rcu_assign_pointer(*nl, n->next);
+ trace_notifier_unregister((void *)n->notifier_call);
return 0;
}
nl = &((*nl)->next);
@@ -68,7 +59,7 @@ static int notifier_chain_unregister(struct notifier_block **nl,
* value of this parameter is -1.
* @nr_calls: Records the number of notifications sent. Don't care
* value of this field is NULL.
- * @returns: notifier_call_chain returns the value returned by the
+ * Return: notifier_call_chain returns the value returned by the
* last notifier function called.
*/
static int notifier_call_chain(struct notifier_block **nl,
@@ -90,12 +81,13 @@ static int notifier_call_chain(struct notifier_block **nl,
continue;
}
#endif
+ trace_notifier_run((void *)nb->notifier_call);
ret = nb->notifier_call(nb, val, v);
if (nr_calls)
(*nr_calls)++;
- if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK)
+ if (ret & NOTIFY_STOP_MASK)
break;
nb = next_nb;
nr_to_call--;
@@ -104,6 +96,34 @@ static int notifier_call_chain(struct notifier_block **nl,
}
NOKPROBE_SYMBOL(notifier_call_chain);
+/**
+ * notifier_call_chain_robust - Inform the registered notifiers about an event
+ * and rollback on error.
+ * @nl: Pointer to head of the blocking notifier chain
+ * @val_up: Value passed unmodified to the notifier function
+ * @val_down: Value passed unmodified to the notifier function when recovering
+ * from an error on @val_up
+ * @v: Pointer passed unmodified to the notifier function
+ *
+ * NOTE: It is important the @nl chain doesn't change between the two
+ * invocations of notifier_call_chain() such that we visit the
+ * exact same notifier callbacks; this rules out any RCU usage.
+ *
+ * Return: the return value of the @val_up call.
+ */
+static int notifier_call_chain_robust(struct notifier_block **nl,
+ unsigned long val_up, unsigned long val_down,
+ void *v)
+{
+ int ret, nr = 0;
+
+ ret = notifier_call_chain(nl, val_up, v, -1, &nr);
+ if (ret & NOTIFY_STOP_MASK)
+ notifier_call_chain(nl, val_down, v, nr-1, NULL);
+
+ return ret;
+}
+
/*
* Atomic notifier chain routines. Registration and unregistration
* use a spinlock, and call_chain is synchronized by RCU (no locks).
@@ -116,7 +136,7 @@ NOKPROBE_SYMBOL(notifier_call_chain);
*
* Adds a notifier to an atomic notifier chain.
*
- * Currently always returns zero.
+ * Returns 0 on success, %-EEXIST on error.
*/
int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
struct notifier_block *n)
@@ -125,13 +145,36 @@ int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
int ret;
spin_lock_irqsave(&nh->lock, flags);
- ret = notifier_chain_register(&nh->head, n);
+ ret = notifier_chain_register(&nh->head, n, false);
spin_unlock_irqrestore(&nh->lock, flags);
return ret;
}
EXPORT_SYMBOL_GPL(atomic_notifier_chain_register);
/**
+ * atomic_notifier_chain_register_unique_prio - Add notifier to an atomic notifier chain
+ * @nh: Pointer to head of the atomic notifier chain
+ * @n: New entry in notifier chain
+ *
+ * Adds a notifier to an atomic notifier chain if there is no other
+ * notifier registered using the same priority.
+ *
+ * Returns 0 on success, %-EEXIST or %-EBUSY on error.
+ */
+int atomic_notifier_chain_register_unique_prio(struct atomic_notifier_head *nh,
+ struct notifier_block *n)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&nh->lock, flags);
+ ret = notifier_chain_register(&nh->head, n, true);
+ spin_unlock_irqrestore(&nh->lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(atomic_notifier_chain_register_unique_prio);
+
+/**
* atomic_notifier_chain_unregister - Remove notifier from an atomic notifier chain
* @nh: Pointer to head of the atomic notifier chain
* @n: Entry to remove from notifier chain
@@ -155,12 +198,10 @@ int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh,
EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
/**
- * __atomic_notifier_call_chain - Call functions in an atomic notifier chain
+ * atomic_notifier_call_chain - Call functions in an atomic notifier chain
* @nh: Pointer to head of the atomic notifier chain
* @val: Value passed unmodified to notifier function
* @v: Pointer passed unmodified to notifier function
- * @nr_to_call: See the comment for notifier_call_chain.
- * @nr_calls: See the comment for notifier_call_chain.
*
* Calls each function in a notifier chain in turn. The functions
* run in an atomic context, so they must not block.
@@ -173,45 +214,41 @@ EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
* Otherwise the return value is the return value
* of the last notifier function called.
*/
-int __atomic_notifier_call_chain(struct atomic_notifier_head *nh,
- unsigned long val, void *v,
- int nr_to_call, int *nr_calls)
+int atomic_notifier_call_chain(struct atomic_notifier_head *nh,
+ unsigned long val, void *v)
{
int ret;
rcu_read_lock();
- ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
+ ret = notifier_call_chain(&nh->head, val, v, -1, NULL);
rcu_read_unlock();
+
return ret;
}
-EXPORT_SYMBOL_GPL(__atomic_notifier_call_chain);
-NOKPROBE_SYMBOL(__atomic_notifier_call_chain);
+EXPORT_SYMBOL_GPL(atomic_notifier_call_chain);
+NOKPROBE_SYMBOL(atomic_notifier_call_chain);
-int atomic_notifier_call_chain(struct atomic_notifier_head *nh,
- unsigned long val, void *v)
+/**
+ * atomic_notifier_call_chain_is_empty - Check whether notifier chain is empty
+ * @nh: Pointer to head of the atomic notifier chain
+ *
+ * Checks whether notifier chain is empty.
+ *
+ * Returns true is notifier chain is empty, false otherwise.
+ */
+bool atomic_notifier_call_chain_is_empty(struct atomic_notifier_head *nh)
{
- return __atomic_notifier_call_chain(nh, val, v, -1, NULL);
+ return !rcu_access_pointer(nh->head);
}
-EXPORT_SYMBOL_GPL(atomic_notifier_call_chain);
-NOKPROBE_SYMBOL(atomic_notifier_call_chain);
/*
* Blocking notifier chain routines. All access to the chain is
* synchronized by an rwsem.
*/
-/**
- * blocking_notifier_chain_register - Add notifier to a blocking notifier chain
- * @nh: Pointer to head of the blocking notifier chain
- * @n: New entry in notifier chain
- *
- * Adds a notifier to a blocking notifier chain.
- * Must be called in process context.
- *
- * Currently always returns zero.
- */
-int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
- struct notifier_block *n)
+static int __blocking_notifier_chain_register(struct blocking_notifier_head *nh,
+ struct notifier_block *n,
+ bool unique_priority)
{
int ret;
@@ -221,37 +258,47 @@ int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
* such times we must not call down_write().
*/
if (unlikely(system_state == SYSTEM_BOOTING))
- return notifier_chain_register(&nh->head, n);
+ return notifier_chain_register(&nh->head, n, unique_priority);
down_write(&nh->rwsem);
- ret = notifier_chain_register(&nh->head, n);
+ ret = notifier_chain_register(&nh->head, n, unique_priority);
up_write(&nh->rwsem);
return ret;
}
-EXPORT_SYMBOL_GPL(blocking_notifier_chain_register);
/**
- * blocking_notifier_chain_cond_register - Cond add notifier to a blocking notifier chain
+ * blocking_notifier_chain_register - Add notifier to a blocking notifier chain
* @nh: Pointer to head of the blocking notifier chain
* @n: New entry in notifier chain
*
- * Adds a notifier to a blocking notifier chain, only if not already
- * present in the chain.
+ * Adds a notifier to a blocking notifier chain.
* Must be called in process context.
*
- * Currently always returns zero.
+ * Returns 0 on success, %-EEXIST on error.
*/
-int blocking_notifier_chain_cond_register(struct blocking_notifier_head *nh,
+int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
struct notifier_block *n)
{
- int ret;
+ return __blocking_notifier_chain_register(nh, n, false);
+}
+EXPORT_SYMBOL_GPL(blocking_notifier_chain_register);
- down_write(&nh->rwsem);
- ret = notifier_chain_cond_register(&nh->head, n);
- up_write(&nh->rwsem);
- return ret;
+/**
+ * blocking_notifier_chain_register_unique_prio - Add notifier to a blocking notifier chain
+ * @nh: Pointer to head of the blocking notifier chain
+ * @n: New entry in notifier chain
+ *
+ * Adds a notifier to an blocking notifier chain if there is no other
+ * notifier registered using the same priority.
+ *
+ * Returns 0 on success, %-EEXIST or %-EBUSY on error.
+ */
+int blocking_notifier_chain_register_unique_prio(struct blocking_notifier_head *nh,
+ struct notifier_block *n)
+{
+ return __blocking_notifier_chain_register(nh, n, true);
}
-EXPORT_SYMBOL_GPL(blocking_notifier_chain_cond_register);
+EXPORT_SYMBOL_GPL(blocking_notifier_chain_register_unique_prio);
/**
* blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain
@@ -283,13 +330,30 @@ int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh,
}
EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister);
+int blocking_notifier_call_chain_robust(struct blocking_notifier_head *nh,
+ unsigned long val_up, unsigned long val_down, void *v)
+{
+ int ret = NOTIFY_DONE;
+
+ /*
+ * We check the head outside the lock, but if this access is
+ * racy then it does not matter what the result of the test
+ * is, we re-check the list after having taken the lock anyway:
+ */
+ if (rcu_access_pointer(nh->head)) {
+ down_read(&nh->rwsem);
+ ret = notifier_call_chain_robust(&nh->head, val_up, val_down, v);
+ up_read(&nh->rwsem);
+ }
+ return ret;
+}
+EXPORT_SYMBOL_GPL(blocking_notifier_call_chain_robust);
+
/**
- * __blocking_notifier_call_chain - Call functions in a blocking notifier chain
+ * blocking_notifier_call_chain - Call functions in a blocking notifier chain
* @nh: Pointer to head of the blocking notifier chain
* @val: Value passed unmodified to notifier function
* @v: Pointer passed unmodified to notifier function
- * @nr_to_call: See comment for notifier_call_chain.
- * @nr_calls: See comment for notifier_call_chain.
*
* Calls each function in a notifier chain in turn. The functions
* run in a process context, so they are allowed to block.
@@ -301,9 +365,8 @@ EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister);
* Otherwise the return value is the return value
* of the last notifier function called.
*/
-int __blocking_notifier_call_chain(struct blocking_notifier_head *nh,
- unsigned long val, void *v,
- int nr_to_call, int *nr_calls)
+int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
+ unsigned long val, void *v)
{
int ret = NOTIFY_DONE;
@@ -314,19 +377,11 @@ int __blocking_notifier_call_chain(struct blocking_notifier_head *nh,
*/
if (rcu_access_pointer(nh->head)) {
down_read(&nh->rwsem);
- ret = notifier_call_chain(&nh->head, val, v, nr_to_call,
- nr_calls);
+ ret = notifier_call_chain(&nh->head, val, v, -1, NULL);
up_read(&nh->rwsem);
}
return ret;
}
-EXPORT_SYMBOL_GPL(__blocking_notifier_call_chain);
-
-int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
- unsigned long val, void *v)
-{
- return __blocking_notifier_call_chain(nh, val, v, -1, NULL);
-}
EXPORT_SYMBOL_GPL(blocking_notifier_call_chain);
/*
@@ -342,12 +397,12 @@ EXPORT_SYMBOL_GPL(blocking_notifier_call_chain);
* Adds a notifier to a raw notifier chain.
* All locking must be provided by the caller.
*
- * Currently always returns zero.
+ * Returns 0 on success, %-EEXIST on error.
*/
int raw_notifier_chain_register(struct raw_notifier_head *nh,
struct notifier_block *n)
{
- return notifier_chain_register(&nh->head, n);
+ return notifier_chain_register(&nh->head, n, false);
}
EXPORT_SYMBOL_GPL(raw_notifier_chain_register);
@@ -368,13 +423,18 @@ int raw_notifier_chain_unregister(struct raw_notifier_head *nh,
}
EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister);
+int raw_notifier_call_chain_robust(struct raw_notifier_head *nh,
+ unsigned long val_up, unsigned long val_down, void *v)
+{
+ return notifier_call_chain_robust(&nh->head, val_up, val_down, v);
+}
+EXPORT_SYMBOL_GPL(raw_notifier_call_chain_robust);
+
/**
- * __raw_notifier_call_chain - Call functions in a raw notifier chain
+ * raw_notifier_call_chain - Call functions in a raw notifier chain
* @nh: Pointer to head of the raw notifier chain
* @val: Value passed unmodified to notifier function
* @v: Pointer passed unmodified to notifier function
- * @nr_to_call: See comment for notifier_call_chain.
- * @nr_calls: See comment for notifier_call_chain
*
* Calls each function in a notifier chain in turn. The functions
* run in an undefined context.
@@ -387,22 +447,13 @@ EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister);
* Otherwise the return value is the return value
* of the last notifier function called.
*/
-int __raw_notifier_call_chain(struct raw_notifier_head *nh,
- unsigned long val, void *v,
- int nr_to_call, int *nr_calls)
-{
- return notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
-}
-EXPORT_SYMBOL_GPL(__raw_notifier_call_chain);
-
int raw_notifier_call_chain(struct raw_notifier_head *nh,
unsigned long val, void *v)
{
- return __raw_notifier_call_chain(nh, val, v, -1, NULL);
+ return notifier_call_chain(&nh->head, val, v, -1, NULL);
}
EXPORT_SYMBOL_GPL(raw_notifier_call_chain);
-#ifdef CONFIG_SRCU
/*
* SRCU notifier chain routines. Registration and unregistration
* use a mutex, and call_chain is synchronized by SRCU (no locks).
@@ -416,7 +467,7 @@ EXPORT_SYMBOL_GPL(raw_notifier_call_chain);
* Adds a notifier to an SRCU notifier chain.
* Must be called in process context.
*
- * Currently always returns zero.
+ * Returns 0 on success, %-EEXIST on error.
*/
int srcu_notifier_chain_register(struct srcu_notifier_head *nh,
struct notifier_block *n)
@@ -429,10 +480,10 @@ int srcu_notifier_chain_register(struct srcu_notifier_head *nh,
* such times we must not call mutex_lock().
*/
if (unlikely(system_state == SYSTEM_BOOTING))
- return notifier_chain_register(&nh->head, n);
+ return notifier_chain_register(&nh->head, n, false);
mutex_lock(&nh->mutex);
- ret = notifier_chain_register(&nh->head, n);
+ ret = notifier_chain_register(&nh->head, n, false);
mutex_unlock(&nh->mutex);
return ret;
}
@@ -470,12 +521,10 @@ int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh,
EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister);
/**
- * __srcu_notifier_call_chain - Call functions in an SRCU notifier chain
+ * srcu_notifier_call_chain - Call functions in an SRCU notifier chain
* @nh: Pointer to head of the SRCU notifier chain
* @val: Value passed unmodified to notifier function
* @v: Pointer passed unmodified to notifier function
- * @nr_to_call: See comment for notifier_call_chain.
- * @nr_calls: See comment for notifier_call_chain
*
* Calls each function in a notifier chain in turn. The functions
* run in a process context, so they are allowed to block.
@@ -487,25 +536,17 @@ EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister);
* Otherwise the return value is the return value
* of the last notifier function called.
*/
-int __srcu_notifier_call_chain(struct srcu_notifier_head *nh,
- unsigned long val, void *v,
- int nr_to_call, int *nr_calls)
+int srcu_notifier_call_chain(struct srcu_notifier_head *nh,
+ unsigned long val, void *v)
{
int ret;
int idx;
idx = srcu_read_lock(&nh->srcu);
- ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
+ ret = notifier_call_chain(&nh->head, val, v, -1, NULL);
srcu_read_unlock(&nh->srcu, idx);
return ret;
}
-EXPORT_SYMBOL_GPL(__srcu_notifier_call_chain);
-
-int srcu_notifier_call_chain(struct srcu_notifier_head *nh,
- unsigned long val, void *v)
-{
- return __srcu_notifier_call_chain(nh, val, v, -1, NULL);
-}
EXPORT_SYMBOL_GPL(srcu_notifier_call_chain);
/**
@@ -529,8 +570,6 @@ void srcu_init_notifier_head(struct srcu_notifier_head *nh)
}
EXPORT_SYMBOL_GPL(srcu_init_notifier_head);
-#endif /* CONFIG_SRCU */
-
static ATOMIC_NOTIFIER_HEAD(die_chain);
int notrace notify_die(enum die_val val, const char *str,
@@ -544,13 +583,14 @@ int notrace notify_die(enum die_val val, const char *str,
.signr = sig,
};
+ RCU_LOCKDEP_WARN(!rcu_is_watching(),
+ "notify_die called but RCU thinks we're quiescent");
return atomic_notifier_call_chain(&die_chain, val, &args);
}
NOKPROBE_SYMBOL(notify_die);
int register_die_notifier(struct notifier_block *nb)
{
- vmalloc_sync_all();
return atomic_notifier_chain_register(&die_chain, nb);
}
EXPORT_SYMBOL_GPL(register_die_notifier);
diff --git a/kernel/nscommon.c b/kernel/nscommon.c
new file mode 100644
index 000000000000..bdc3c86231d3
--- /dev/null
+++ b/kernel/nscommon.c
@@ -0,0 +1,311 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */
+
+#include <linux/ns_common.h>
+#include <linux/nstree.h>
+#include <linux/proc_ns.h>
+#include <linux/user_namespace.h>
+#include <linux/vfsdebug.h>
+
+#ifdef CONFIG_DEBUG_VFS
+static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops)
+{
+ switch (ns->ns_type) {
+#ifdef CONFIG_CGROUPS
+ case CLONE_NEWCGROUP:
+ VFS_WARN_ON_ONCE(ops != &cgroupns_operations);
+ break;
+#endif
+#ifdef CONFIG_IPC_NS
+ case CLONE_NEWIPC:
+ VFS_WARN_ON_ONCE(ops != &ipcns_operations);
+ break;
+#endif
+ case CLONE_NEWNS:
+ VFS_WARN_ON_ONCE(ops != &mntns_operations);
+ break;
+#ifdef CONFIG_NET_NS
+ case CLONE_NEWNET:
+ VFS_WARN_ON_ONCE(ops != &netns_operations);
+ break;
+#endif
+#ifdef CONFIG_PID_NS
+ case CLONE_NEWPID:
+ VFS_WARN_ON_ONCE(ops != &pidns_operations);
+ break;
+#endif
+#ifdef CONFIG_TIME_NS
+ case CLONE_NEWTIME:
+ VFS_WARN_ON_ONCE(ops != &timens_operations);
+ break;
+#endif
+#ifdef CONFIG_USER_NS
+ case CLONE_NEWUSER:
+ VFS_WARN_ON_ONCE(ops != &userns_operations);
+ break;
+#endif
+#ifdef CONFIG_UTS_NS
+ case CLONE_NEWUTS:
+ VFS_WARN_ON_ONCE(ops != &utsns_operations);
+ break;
+#endif
+ }
+}
+#endif
+
+int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum)
+{
+ int ret = 0;
+
+ refcount_set(&ns->__ns_ref, 1);
+ ns->stashed = NULL;
+ ns->ops = ops;
+ ns->ns_id = 0;
+ ns->ns_type = ns_type;
+ ns_tree_node_init(&ns->ns_tree_node);
+ ns_tree_node_init(&ns->ns_unified_node);
+ ns_tree_node_init(&ns->ns_owner_node);
+ ns_tree_root_init(&ns->ns_owner_root);
+
+#ifdef CONFIG_DEBUG_VFS
+ ns_debug(ns, ops);
+#endif
+
+ if (inum)
+ ns->inum = inum;
+ else
+ ret = proc_alloc_inum(&ns->inum);
+ if (ret)
+ return ret;
+ /*
+ * Tree ref starts at 0. It's incremented when namespace enters
+ * active use (installed in nsproxy) and decremented when all
+ * active uses are gone. Initial namespaces are always active.
+ */
+ if (is_ns_init_inum(ns))
+ atomic_set(&ns->__ns_ref_active, 1);
+ else
+ atomic_set(&ns->__ns_ref_active, 0);
+ return 0;
+}
+
+void __ns_common_free(struct ns_common *ns)
+{
+ proc_free_inum(ns->inum);
+}
+
+struct ns_common *__must_check ns_owner(struct ns_common *ns)
+{
+ struct user_namespace *owner;
+
+ if (unlikely(!ns->ops))
+ return NULL;
+ VFS_WARN_ON_ONCE(!ns->ops->owner);
+ owner = ns->ops->owner(ns);
+ VFS_WARN_ON_ONCE(!owner && ns != to_ns_common(&init_user_ns));
+ if (!owner)
+ return NULL;
+ /* Skip init_user_ns as it's always active */
+ if (owner == &init_user_ns)
+ return NULL;
+ return to_ns_common(owner);
+}
+
+/*
+ * The active reference count works by having each namespace that gets
+ * created take a single active reference on its owning user namespace.
+ * That single reference is only released once the child namespace's
+ * active count itself goes down.
+ *
+ * A regular namespace tree might look as follow:
+ * Legend:
+ * + : adding active reference
+ * - : dropping active reference
+ * x : always active (initial namespace)
+ *
+ *
+ * net_ns pid_ns
+ * \ /
+ * + +
+ * user_ns1 (2)
+ * |
+ * ipc_ns | uts_ns
+ * \ | /
+ * + + +
+ * user_ns2 (3)
+ * |
+ * cgroup_ns | mnt_ns
+ * \ | /
+ * x x x
+ * init_user_ns (1)
+ *
+ * If both net_ns and pid_ns put their last active reference on
+ * themselves it will cascade to user_ns1 dropping its own active
+ * reference and dropping one active reference on user_ns2:
+ *
+ * net_ns pid_ns
+ * \ /
+ * - -
+ * user_ns1 (0)
+ * |
+ * ipc_ns | uts_ns
+ * \ | /
+ * + - +
+ * user_ns2 (2)
+ * |
+ * cgroup_ns | mnt_ns
+ * \ | /
+ * x x x
+ * init_user_ns (1)
+ *
+ * The iteration stops once we reach a namespace that still has active
+ * references.
+ */
+void __ns_ref_active_put(struct ns_common *ns)
+{
+ /* Initial namespaces are always active. */
+ if (is_ns_init_id(ns))
+ return;
+
+ if (!atomic_dec_and_test(&ns->__ns_ref_active)) {
+ VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0);
+ return;
+ }
+
+ VFS_WARN_ON_ONCE(is_ns_init_id(ns));
+ VFS_WARN_ON_ONCE(!__ns_ref_read(ns));
+
+ for (;;) {
+ ns = ns_owner(ns);
+ if (!ns)
+ return;
+ VFS_WARN_ON_ONCE(is_ns_init_id(ns));
+ if (!atomic_dec_and_test(&ns->__ns_ref_active)) {
+ VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0);
+ return;
+ }
+ }
+}
+
+/*
+ * The active reference count works by having each namespace that gets
+ * created take a single active reference on its owning user namespace.
+ * That single reference is only released once the child namespace's
+ * active count itself goes down. This makes it possible to efficiently
+ * resurrect a namespace tree:
+ *
+ * A regular namespace tree might look as follow:
+ * Legend:
+ * + : adding active reference
+ * - : dropping active reference
+ * x : always active (initial namespace)
+ *
+ *
+ * net_ns pid_ns
+ * \ /
+ * + +
+ * user_ns1 (2)
+ * |
+ * ipc_ns | uts_ns
+ * \ | /
+ * + + +
+ * user_ns2 (3)
+ * |
+ * cgroup_ns | mnt_ns
+ * \ | /
+ * x x x
+ * init_user_ns (1)
+ *
+ * If both net_ns and pid_ns put their last active reference on
+ * themselves it will cascade to user_ns1 dropping its own active
+ * reference and dropping one active reference on user_ns2:
+ *
+ * net_ns pid_ns
+ * \ /
+ * - -
+ * user_ns1 (0)
+ * |
+ * ipc_ns | uts_ns
+ * \ | /
+ * + - +
+ * user_ns2 (2)
+ * |
+ * cgroup_ns | mnt_ns
+ * \ | /
+ * x x x
+ * init_user_ns (1)
+ *
+ * Assume the whole tree is dead but all namespaces are still active:
+ *
+ * net_ns pid_ns
+ * \ /
+ * - -
+ * user_ns1 (0)
+ * |
+ * ipc_ns | uts_ns
+ * \ | /
+ * - - -
+ * user_ns2 (0)
+ * |
+ * cgroup_ns | mnt_ns
+ * \ | /
+ * x x x
+ * init_user_ns (1)
+ *
+ * Now assume the net_ns gets resurrected (.e.g., via the SIOCGSKNS ioctl()):
+ *
+ * net_ns pid_ns
+ * \ /
+ * + -
+ * user_ns1 (0)
+ * |
+ * ipc_ns | uts_ns
+ * \ | /
+ * - + -
+ * user_ns2 (0)
+ * |
+ * cgroup_ns | mnt_ns
+ * \ | /
+ * x x x
+ * init_user_ns (1)
+ *
+ * If net_ns had a zero reference count and we bumped it we also need to
+ * take another reference on its owning user namespace. Similarly, if
+ * pid_ns had a zero reference count it also needs to take another
+ * reference on its owning user namespace. So both net_ns and pid_ns
+ * will each have their own reference on the owning user namespace.
+ *
+ * If the owning user namespace user_ns1 had a zero reference count then
+ * it also needs to take another reference on its owning user namespace
+ * and so on.
+ */
+void __ns_ref_active_get(struct ns_common *ns)
+{
+ int prev;
+
+ /* Initial namespaces are always active. */
+ if (is_ns_init_id(ns))
+ return;
+
+ /* If we didn't resurrect the namespace we're done. */
+ prev = atomic_fetch_add(1, &ns->__ns_ref_active);
+ VFS_WARN_ON_ONCE(prev < 0);
+ if (likely(prev))
+ return;
+
+ /*
+ * We did resurrect it. Walk the ownership hierarchy upwards
+ * until we found an owning user namespace that is active.
+ */
+ for (;;) {
+ ns = ns_owner(ns);
+ if (!ns)
+ return;
+
+ VFS_WARN_ON_ONCE(is_ns_init_id(ns));
+ prev = atomic_fetch_add(1, &ns->__ns_ref_active);
+ VFS_WARN_ON_ONCE(prev < 0);
+ if (likely(prev))
+ return;
+ }
+}
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 49746c81ad8d..259c4b4f1eeb 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -1,13 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2006 IBM Corporation
*
* Author: Serge Hallyn <serue@us.ibm.com>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation, version 2 of the
- * License.
- *
* Jun 2006 - namespaces support
* OpenVZ, SWsoft Inc.
* Pavel Emelianov <xemul@openvz.org>
@@ -22,14 +18,20 @@
#include <linux/pid_namespace.h>
#include <net/net_namespace.h>
#include <linux/ipc_namespace.h>
+#include <linux/time_namespace.h>
+#include <linux/fs_struct.h>
+#include <linux/proc_fs.h>
#include <linux/proc_ns.h>
#include <linux/file.h>
#include <linux/syscalls.h>
+#include <linux/cgroup.h>
+#include <linux/perf_event.h>
+#include <linux/nstree.h>
static struct kmem_cache *nsproxy_cachep;
struct nsproxy init_nsproxy = {
- .count = ATOMIC_INIT(1),
+ .count = REFCOUNT_INIT(1),
.uts_ns = &init_uts_ns,
#if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC)
.ipc_ns = &init_ipc_ns,
@@ -39,6 +41,13 @@ struct nsproxy init_nsproxy = {
#ifdef CONFIG_NET
.net_ns = &init_net,
#endif
+#ifdef CONFIG_CGROUPS
+ .cgroup_ns = &init_cgroup_ns,
+#endif
+#ifdef CONFIG_TIME_NS
+ .time_ns = &init_time_ns,
+ .time_ns_for_children = &init_time_ns,
+#endif
};
static inline struct nsproxy *create_nsproxy(void)
@@ -47,16 +56,35 @@ static inline struct nsproxy *create_nsproxy(void)
nsproxy = kmem_cache_alloc(nsproxy_cachep, GFP_KERNEL);
if (nsproxy)
- atomic_set(&nsproxy->count, 1);
+ refcount_set(&nsproxy->count, 1);
return nsproxy;
}
+static inline void nsproxy_free(struct nsproxy *ns)
+{
+ put_mnt_ns(ns->mnt_ns);
+ put_uts_ns(ns->uts_ns);
+ put_ipc_ns(ns->ipc_ns);
+ put_pid_ns(ns->pid_ns_for_children);
+ put_time_ns(ns->time_ns);
+ put_time_ns(ns->time_ns_for_children);
+ put_cgroup_ns(ns->cgroup_ns);
+ put_net(ns->net_ns);
+ kmem_cache_free(nsproxy_cachep, ns);
+}
+
+void deactivate_nsproxy(struct nsproxy *ns)
+{
+ nsproxy_ns_active_put(ns);
+ nsproxy_free(ns);
+}
+
/*
* Create new nsproxy and all of its the associated namespaces.
* Return the newly created nsproxy. Do not attach this to the task,
* leave it to the caller to do proper locking and attach it to task.
*/
-static struct nsproxy *create_new_namespaces(unsigned long flags,
+static struct nsproxy *create_new_namespaces(u64 flags,
struct task_struct *tsk, struct user_namespace *user_ns,
struct fs_struct *new_fs)
{
@@ -92,26 +120,41 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
goto out_pid;
}
+ new_nsp->cgroup_ns = copy_cgroup_ns(flags, user_ns,
+ tsk->nsproxy->cgroup_ns);
+ if (IS_ERR(new_nsp->cgroup_ns)) {
+ err = PTR_ERR(new_nsp->cgroup_ns);
+ goto out_cgroup;
+ }
+
new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns);
if (IS_ERR(new_nsp->net_ns)) {
err = PTR_ERR(new_nsp->net_ns);
goto out_net;
}
+ new_nsp->time_ns_for_children = copy_time_ns(flags, user_ns,
+ tsk->nsproxy->time_ns_for_children);
+ if (IS_ERR(new_nsp->time_ns_for_children)) {
+ err = PTR_ERR(new_nsp->time_ns_for_children);
+ goto out_time;
+ }
+ new_nsp->time_ns = get_time_ns(tsk->nsproxy->time_ns);
+
return new_nsp;
+out_time:
+ put_net(new_nsp->net_ns);
out_net:
- if (new_nsp->pid_ns_for_children)
- put_pid_ns(new_nsp->pid_ns_for_children);
+ put_cgroup_ns(new_nsp->cgroup_ns);
+out_cgroup:
+ put_pid_ns(new_nsp->pid_ns_for_children);
out_pid:
- if (new_nsp->ipc_ns)
- put_ipc_ns(new_nsp->ipc_ns);
+ put_ipc_ns(new_nsp->ipc_ns);
out_ipc:
- if (new_nsp->uts_ns)
- put_uts_ns(new_nsp->uts_ns);
+ put_uts_ns(new_nsp->uts_ns);
out_uts:
- if (new_nsp->mnt_ns)
- put_mnt_ns(new_nsp->mnt_ns);
+ put_mnt_ns(new_nsp->mnt_ns);
out_ns:
kmem_cache_free(nsproxy_cachep, new_nsp);
return ERR_PTR(err);
@@ -121,19 +164,21 @@ out_ns:
* called from clone. This now handles copy for nsproxy and all
* namespaces therein.
*/
-int copy_namespaces(unsigned long flags, struct task_struct *tsk)
+int copy_namespaces(u64 flags, struct task_struct *tsk)
{
struct nsproxy *old_ns = tsk->nsproxy;
struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
struct nsproxy *new_ns;
if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
- CLONE_NEWPID | CLONE_NEWNET)))) {
- get_nsproxy(old_ns);
- return 0;
- }
-
- if (!ns_capable(user_ns, CAP_SYS_ADMIN))
+ CLONE_NEWPID | CLONE_NEWNET |
+ CLONE_NEWCGROUP | CLONE_NEWTIME)))) {
+ if ((flags & CLONE_VM) ||
+ likely(old_ns->time_ns_for_children == old_ns->time_ns)) {
+ get_nsproxy(old_ns);
+ return 0;
+ }
+ } else if (!ns_capable(user_ns, CAP_SYS_ADMIN))
return -EPERM;
/*
@@ -144,31 +189,21 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
* it along with CLONE_NEWIPC.
*/
if ((flags & (CLONE_NEWIPC | CLONE_SYSVSEM)) ==
- (CLONE_NEWIPC | CLONE_SYSVSEM))
+ (CLONE_NEWIPC | CLONE_SYSVSEM))
return -EINVAL;
new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs);
if (IS_ERR(new_ns))
return PTR_ERR(new_ns);
+ if ((flags & CLONE_VM) == 0)
+ timens_on_fork(new_ns, tsk);
+
+ nsproxy_ns_active_get(new_ns);
tsk->nsproxy = new_ns;
return 0;
}
-void free_nsproxy(struct nsproxy *ns)
-{
- if (ns->mnt_ns)
- put_mnt_ns(ns->mnt_ns);
- if (ns->uts_ns)
- put_uts_ns(ns->uts_ns);
- if (ns->ipc_ns)
- put_ipc_ns(ns->ipc_ns);
- if (ns->pid_ns_for_children)
- put_pid_ns(ns->pid_ns_for_children);
- put_net(ns->net_ns);
- kmem_cache_free(nsproxy_cachep, ns);
-}
-
/*
* Called from unshare. Unshare all the namespaces part of nsproxy.
* On success, returns the new nsproxy.
@@ -180,7 +215,8 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
int err = 0;
if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
- CLONE_NEWNET | CLONE_NEWPID)))
+ CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP |
+ CLONE_NEWTIME)))
return 0;
user_ns = new_cred ? new_cred->user_ns : current_user_ns();
@@ -204,56 +240,368 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)
might_sleep();
+ if (new)
+ nsproxy_ns_active_get(new);
+
task_lock(p);
ns = p->nsproxy;
p->nsproxy = new;
task_unlock(p);
- if (ns && atomic_dec_and_test(&ns->count))
- free_nsproxy(ns);
+ if (ns)
+ put_nsproxy(ns);
}
-void exit_task_namespaces(struct task_struct *p)
+void exit_nsproxy_namespaces(struct task_struct *p)
{
switch_task_namespaces(p, NULL);
}
-SYSCALL_DEFINE2(setns, int, fd, int, nstype)
+void switch_cred_namespaces(const struct cred *old, const struct cred *new)
+{
+ ns_ref_active_get(new->user_ns);
+ ns_ref_active_put(old->user_ns);
+}
+
+void get_cred_namespaces(struct task_struct *tsk)
+{
+ ns_ref_active_get(tsk->real_cred->user_ns);
+}
+
+void exit_cred_namespaces(struct task_struct *tsk)
+{
+ ns_ref_active_put(tsk->real_cred->user_ns);
+}
+
+int exec_task_namespaces(void)
{
struct task_struct *tsk = current;
- struct nsproxy *new_nsproxy;
- struct file *file;
- struct ns_common *ns;
- int err;
+ struct nsproxy *new;
- file = proc_ns_fget(fd);
- if (IS_ERR(file))
- return PTR_ERR(file);
+ if (tsk->nsproxy->time_ns_for_children == tsk->nsproxy->time_ns)
+ return 0;
- err = -EINVAL;
- ns = get_proc_ns(file_inode(file));
- if (nstype && (ns->ops->type != nstype))
- goto out;
+ new = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs);
+ if (IS_ERR(new))
+ return PTR_ERR(new);
+
+ timens_on_fork(new, tsk);
+ switch_task_namespaces(tsk, new);
+ return 0;
+}
+
+static int check_setns_flags(unsigned long flags)
+{
+ if (!flags || (flags & ~(CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
+ CLONE_NEWNET | CLONE_NEWTIME | CLONE_NEWUSER |
+ CLONE_NEWPID | CLONE_NEWCGROUP)))
+ return -EINVAL;
+
+#ifndef CONFIG_USER_NS
+ if (flags & CLONE_NEWUSER)
+ return -EINVAL;
+#endif
+#ifndef CONFIG_PID_NS
+ if (flags & CLONE_NEWPID)
+ return -EINVAL;
+#endif
+#ifndef CONFIG_UTS_NS
+ if (flags & CLONE_NEWUTS)
+ return -EINVAL;
+#endif
+#ifndef CONFIG_IPC_NS
+ if (flags & CLONE_NEWIPC)
+ return -EINVAL;
+#endif
+#ifndef CONFIG_CGROUPS
+ if (flags & CLONE_NEWCGROUP)
+ return -EINVAL;
+#endif
+#ifndef CONFIG_NET_NS
+ if (flags & CLONE_NEWNET)
+ return -EINVAL;
+#endif
+#ifndef CONFIG_TIME_NS
+ if (flags & CLONE_NEWTIME)
+ return -EINVAL;
+#endif
+
+ return 0;
+}
+
+static void put_nsset(struct nsset *nsset)
+{
+ unsigned flags = nsset->flags;
- new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs);
- if (IS_ERR(new_nsproxy)) {
- err = PTR_ERR(new_nsproxy);
+ if (flags & CLONE_NEWUSER)
+ put_cred(nsset_cred(nsset));
+ /*
+ * We only created a temporary copy if we attached to more than just
+ * the mount namespace.
+ */
+ if (nsset->fs && (flags & CLONE_NEWNS) && (flags & ~CLONE_NEWNS))
+ free_fs_struct(nsset->fs);
+ if (nsset->nsproxy)
+ nsproxy_free(nsset->nsproxy);
+}
+
+static int prepare_nsset(unsigned flags, struct nsset *nsset)
+{
+ struct task_struct *me = current;
+
+ nsset->nsproxy = create_new_namespaces(0, me, current_user_ns(), me->fs);
+ if (IS_ERR(nsset->nsproxy))
+ return PTR_ERR(nsset->nsproxy);
+
+ if (flags & CLONE_NEWUSER)
+ nsset->cred = prepare_creds();
+ else
+ nsset->cred = current_cred();
+ if (!nsset->cred)
goto out;
+
+ /* Only create a temporary copy of fs_struct if we really need to. */
+ if (flags == CLONE_NEWNS) {
+ nsset->fs = me->fs;
+ } else if (flags & CLONE_NEWNS) {
+ nsset->fs = copy_fs_struct(me->fs);
+ if (!nsset->fs)
+ goto out;
}
- err = ns->ops->install(new_nsproxy, ns);
- if (err) {
- free_nsproxy(new_nsproxy);
+ nsset->flags = flags;
+ return 0;
+
+out:
+ put_nsset(nsset);
+ return -ENOMEM;
+}
+
+static inline int validate_ns(struct nsset *nsset, struct ns_common *ns)
+{
+ return ns->ops->install(nsset, ns);
+}
+
+/*
+ * This is the inverse operation to unshare().
+ * Ordering is equivalent to the standard ordering used everywhere else
+ * during unshare and process creation. The switch to the new set of
+ * namespaces occurs at the point of no return after installation of
+ * all requested namespaces was successful in commit_nsset().
+ */
+static int validate_nsset(struct nsset *nsset, struct pid *pid)
+{
+ int ret = 0;
+ unsigned flags = nsset->flags;
+ struct user_namespace *user_ns = NULL;
+ struct pid_namespace *pid_ns = NULL;
+ struct nsproxy *nsp;
+ struct task_struct *tsk;
+
+ /* Take a "snapshot" of the target task's namespaces. */
+ rcu_read_lock();
+ tsk = pid_task(pid, PIDTYPE_PID);
+ if (!tsk) {
+ rcu_read_unlock();
+ return -ESRCH;
+ }
+
+ if (!ptrace_may_access(tsk, PTRACE_MODE_READ_REALCREDS)) {
+ rcu_read_unlock();
+ return -EPERM;
+ }
+
+ task_lock(tsk);
+ nsp = tsk->nsproxy;
+ if (nsp)
+ get_nsproxy(nsp);
+ task_unlock(tsk);
+ if (!nsp) {
+ rcu_read_unlock();
+ return -ESRCH;
+ }
+
+#ifdef CONFIG_PID_NS
+ if (flags & CLONE_NEWPID) {
+ pid_ns = task_active_pid_ns(tsk);
+ if (unlikely(!pid_ns)) {
+ rcu_read_unlock();
+ ret = -ESRCH;
+ goto out;
+ }
+ get_pid_ns(pid_ns);
+ }
+#endif
+
+#ifdef CONFIG_USER_NS
+ if (flags & CLONE_NEWUSER)
+ user_ns = get_user_ns(__task_cred(tsk)->user_ns);
+#endif
+ rcu_read_unlock();
+
+ /*
+ * Install requested namespaces. The caller will have
+ * verified earlier that the requested namespaces are
+ * supported on this kernel. We don't report errors here
+ * if a namespace is requested that isn't supported.
+ */
+#ifdef CONFIG_USER_NS
+ if (flags & CLONE_NEWUSER) {
+ ret = validate_ns(nsset, &user_ns->ns);
+ if (ret)
+ goto out;
+ }
+#endif
+
+ if (flags & CLONE_NEWNS) {
+ ret = validate_ns(nsset, from_mnt_ns(nsp->mnt_ns));
+ if (ret)
+ goto out;
+ }
+
+#ifdef CONFIG_UTS_NS
+ if (flags & CLONE_NEWUTS) {
+ ret = validate_ns(nsset, &nsp->uts_ns->ns);
+ if (ret)
+ goto out;
+ }
+#endif
+
+#ifdef CONFIG_IPC_NS
+ if (flags & CLONE_NEWIPC) {
+ ret = validate_ns(nsset, &nsp->ipc_ns->ns);
+ if (ret)
+ goto out;
+ }
+#endif
+
+#ifdef CONFIG_PID_NS
+ if (flags & CLONE_NEWPID) {
+ ret = validate_ns(nsset, &pid_ns->ns);
+ if (ret)
+ goto out;
+ }
+#endif
+
+#ifdef CONFIG_CGROUPS
+ if (flags & CLONE_NEWCGROUP) {
+ ret = validate_ns(nsset, &nsp->cgroup_ns->ns);
+ if (ret)
+ goto out;
+ }
+#endif
+
+#ifdef CONFIG_NET_NS
+ if (flags & CLONE_NEWNET) {
+ ret = validate_ns(nsset, &nsp->net_ns->ns);
+ if (ret)
+ goto out;
+ }
+#endif
+
+#ifdef CONFIG_TIME_NS
+ if (flags & CLONE_NEWTIME) {
+ ret = validate_ns(nsset, &nsp->time_ns->ns);
+ if (ret)
+ goto out;
+ }
+#endif
+
+out:
+ if (pid_ns)
+ put_pid_ns(pid_ns);
+ if (nsp)
+ put_nsproxy(nsp);
+ put_user_ns(user_ns);
+
+ return ret;
+}
+
+/*
+ * This is the point of no return. There are just a few namespaces
+ * that do some actual work here and it's sufficiently minimal that
+ * a separate ns_common operation seems unnecessary for now.
+ * Unshare is doing the same thing. If we'll end up needing to do
+ * more in a given namespace or a helper here is ultimately not
+ * exported anymore a simple commit handler for each namespace
+ * should be added to ns_common.
+ */
+static void commit_nsset(struct nsset *nsset)
+{
+ unsigned flags = nsset->flags;
+ struct task_struct *me = current;
+
+#ifdef CONFIG_USER_NS
+ if (flags & CLONE_NEWUSER) {
+ /* transfer ownership */
+ commit_creds(nsset_cred(nsset));
+ nsset->cred = NULL;
+ }
+#endif
+
+ /* We only need to commit if we have used a temporary fs_struct. */
+ if ((flags & CLONE_NEWNS) && (flags & ~CLONE_NEWNS)) {
+ set_fs_root(me->fs, &nsset->fs->root);
+ set_fs_pwd(me->fs, &nsset->fs->pwd);
+ }
+
+#ifdef CONFIG_IPC_NS
+ if (flags & CLONE_NEWIPC)
+ exit_sem(me);
+#endif
+
+#ifdef CONFIG_TIME_NS
+ if (flags & CLONE_NEWTIME)
+ timens_commit(me, nsset->nsproxy->time_ns);
+#endif
+
+ /* transfer ownership */
+ switch_task_namespaces(me, nsset->nsproxy);
+ nsset->nsproxy = NULL;
+}
+
+SYSCALL_DEFINE2(setns, int, fd, int, flags)
+{
+ CLASS(fd, f)(fd);
+ struct ns_common *ns = NULL;
+ struct nsset nsset = {};
+ int err = 0;
+
+ if (fd_empty(f))
+ return -EBADF;
+
+ if (proc_ns_file(fd_file(f))) {
+ ns = get_proc_ns(file_inode(fd_file(f)));
+ if (flags && (ns->ns_type != flags))
+ err = -EINVAL;
+ flags = ns->ns_type;
+ } else if (!IS_ERR(pidfd_pid(fd_file(f)))) {
+ err = check_setns_flags(flags);
+ } else {
+ err = -EINVAL;
+ }
+ if (err)
+ goto out;
+
+ err = prepare_nsset(flags, &nsset);
+ if (err)
goto out;
+
+ if (proc_ns_file(fd_file(f)))
+ err = validate_ns(&nsset, ns);
+ else
+ err = validate_nsset(&nsset, pidfd_pid(fd_file(f)));
+ if (!err) {
+ commit_nsset(&nsset);
+ perf_event_namespaces(current);
}
- switch_task_namespaces(tsk, new_nsproxy);
+ put_nsset(&nsset);
out:
- fput(file);
return err;
}
int __init nsproxy_cache_init(void)
{
- nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC);
+ nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC|SLAB_ACCOUNT);
return 0;
}
diff --git a/kernel/nstree.c b/kernel/nstree.c
new file mode 100644
index 000000000000..f36c59e6951d
--- /dev/null
+++ b/kernel/nstree.c
@@ -0,0 +1,813 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */
+
+#include <linux/nstree.h>
+#include <linux/proc_ns.h>
+#include <linux/rculist.h>
+#include <linux/vfsdebug.h>
+#include <linux/syscalls.h>
+#include <linux/user_namespace.h>
+
+static __cacheline_aligned_in_smp DEFINE_SEQLOCK(ns_tree_lock);
+
+DEFINE_LOCK_GUARD_0(ns_tree_writer,
+ write_seqlock(&ns_tree_lock),
+ write_sequnlock(&ns_tree_lock))
+
+DEFINE_LOCK_GUARD_0(ns_tree_locked_reader,
+ read_seqlock_excl(&ns_tree_lock),
+ read_sequnlock_excl(&ns_tree_lock))
+
+static struct ns_tree_root ns_unified_root = { /* protected by ns_tree_lock */
+ .ns_rb = RB_ROOT,
+ .ns_list_head = LIST_HEAD_INIT(ns_unified_root.ns_list_head),
+};
+
+struct ns_tree_root mnt_ns_tree = {
+ .ns_rb = RB_ROOT,
+ .ns_list_head = LIST_HEAD_INIT(mnt_ns_tree.ns_list_head),
+};
+
+struct ns_tree_root net_ns_tree = {
+ .ns_rb = RB_ROOT,
+ .ns_list_head = LIST_HEAD_INIT(net_ns_tree.ns_list_head),
+};
+EXPORT_SYMBOL_GPL(net_ns_tree);
+
+struct ns_tree_root uts_ns_tree = {
+ .ns_rb = RB_ROOT,
+ .ns_list_head = LIST_HEAD_INIT(uts_ns_tree.ns_list_head),
+};
+
+struct ns_tree_root user_ns_tree = {
+ .ns_rb = RB_ROOT,
+ .ns_list_head = LIST_HEAD_INIT(user_ns_tree.ns_list_head),
+};
+
+struct ns_tree_root ipc_ns_tree = {
+ .ns_rb = RB_ROOT,
+ .ns_list_head = LIST_HEAD_INIT(ipc_ns_tree.ns_list_head),
+};
+
+struct ns_tree_root pid_ns_tree = {
+ .ns_rb = RB_ROOT,
+ .ns_list_head = LIST_HEAD_INIT(pid_ns_tree.ns_list_head),
+};
+
+struct ns_tree_root cgroup_ns_tree = {
+ .ns_rb = RB_ROOT,
+ .ns_list_head = LIST_HEAD_INIT(cgroup_ns_tree.ns_list_head),
+};
+
+struct ns_tree_root time_ns_tree = {
+ .ns_rb = RB_ROOT,
+ .ns_list_head = LIST_HEAD_INIT(time_ns_tree.ns_list_head),
+};
+
+/**
+ * ns_tree_node_init - Initialize a namespace tree node
+ * @node: The node to initialize
+ *
+ * Initializes both the rbtree node and list entry.
+ */
+void ns_tree_node_init(struct ns_tree_node *node)
+{
+ RB_CLEAR_NODE(&node->ns_node);
+ INIT_LIST_HEAD(&node->ns_list_entry);
+}
+
+/**
+ * ns_tree_root_init - Initialize a namespace tree root
+ * @root: The root to initialize
+ *
+ * Initializes both the rbtree root and list head.
+ */
+void ns_tree_root_init(struct ns_tree_root *root)
+{
+ root->ns_rb = RB_ROOT;
+ INIT_LIST_HEAD(&root->ns_list_head);
+}
+
+/**
+ * ns_tree_node_empty - Check if a namespace tree node is empty
+ * @node: The node to check
+ *
+ * Returns true if the node is not in any tree.
+ */
+bool ns_tree_node_empty(const struct ns_tree_node *node)
+{
+ return RB_EMPTY_NODE(&node->ns_node);
+}
+
+/**
+ * ns_tree_node_add - Add a node to a namespace tree
+ * @node: The node to add
+ * @root: The tree root to add to
+ * @cmp: Comparison function for rbtree insertion
+ *
+ * Adds the node to both the rbtree and the list, maintaining sorted order.
+ * The list is maintained in the same order as the rbtree to enable efficient
+ * iteration.
+ *
+ * Returns: NULL if insertion succeeded, existing node if duplicate found
+ */
+struct rb_node *ns_tree_node_add(struct ns_tree_node *node,
+ struct ns_tree_root *root,
+ int (*cmp)(struct rb_node *, const struct rb_node *))
+{
+ struct rb_node *ret, *prev;
+
+ /* Add to rbtree */
+ ret = rb_find_add_rcu(&node->ns_node, &root->ns_rb, cmp);
+
+ /* Add to list in sorted order */
+ prev = rb_prev(&node->ns_node);
+ if (!prev) {
+ /* No previous node, add at head */
+ list_add_rcu(&node->ns_list_entry, &root->ns_list_head);
+ } else {
+ /* Add after previous node */
+ struct ns_tree_node *prev_node;
+ prev_node = rb_entry(prev, struct ns_tree_node, ns_node);
+ list_add_rcu(&node->ns_list_entry, &prev_node->ns_list_entry);
+ }
+
+ return ret;
+}
+
+/**
+ * ns_tree_node_del - Remove a node from a namespace tree
+ * @node: The node to remove
+ * @root: The tree root to remove from
+ *
+ * Removes the node from both the rbtree and the list atomically.
+ */
+void ns_tree_node_del(struct ns_tree_node *node, struct ns_tree_root *root)
+{
+ rb_erase(&node->ns_node, &root->ns_rb);
+ RB_CLEAR_NODE(&node->ns_node);
+ list_bidir_del_rcu(&node->ns_list_entry);
+}
+
+static inline struct ns_common *node_to_ns(const struct rb_node *node)
+{
+ if (!node)
+ return NULL;
+ return rb_entry(node, struct ns_common, ns_tree_node.ns_node);
+}
+
+static inline struct ns_common *node_to_ns_unified(const struct rb_node *node)
+{
+ if (!node)
+ return NULL;
+ return rb_entry(node, struct ns_common, ns_unified_node.ns_node);
+}
+
+static inline struct ns_common *node_to_ns_owner(const struct rb_node *node)
+{
+ if (!node)
+ return NULL;
+ return rb_entry(node, struct ns_common, ns_owner_node.ns_node);
+}
+
+static int ns_id_cmp(u64 id_a, u64 id_b)
+{
+ if (id_a < id_b)
+ return -1;
+ if (id_a > id_b)
+ return 1;
+ return 0;
+}
+
+static int ns_cmp(struct rb_node *a, const struct rb_node *b)
+{
+ return ns_id_cmp(node_to_ns(a)->ns_id, node_to_ns(b)->ns_id);
+}
+
+static int ns_cmp_unified(struct rb_node *a, const struct rb_node *b)
+{
+ return ns_id_cmp(node_to_ns_unified(a)->ns_id, node_to_ns_unified(b)->ns_id);
+}
+
+static int ns_cmp_owner(struct rb_node *a, const struct rb_node *b)
+{
+ return ns_id_cmp(node_to_ns_owner(a)->ns_id, node_to_ns_owner(b)->ns_id);
+}
+
+void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree_root *ns_tree)
+{
+ struct rb_node *node;
+ const struct proc_ns_operations *ops = ns->ops;
+
+ VFS_WARN_ON_ONCE(!ns->ns_id);
+
+ guard(ns_tree_writer)();
+
+ /* Add to per-type tree and list */
+ node = ns_tree_node_add(&ns->ns_tree_node, ns_tree, ns_cmp);
+
+ /* Add to unified tree and list */
+ ns_tree_node_add(&ns->ns_unified_node, &ns_unified_root, ns_cmp_unified);
+
+ /* Add to owner's tree if applicable */
+ if (ops) {
+ struct user_namespace *user_ns;
+
+ VFS_WARN_ON_ONCE(!ops->owner);
+ user_ns = ops->owner(ns);
+ if (user_ns) {
+ struct ns_common *owner = &user_ns->ns;
+ VFS_WARN_ON_ONCE(owner->ns_type != CLONE_NEWUSER);
+
+ /* Insert into owner's tree and list */
+ ns_tree_node_add(&ns->ns_owner_node, &owner->ns_owner_root, ns_cmp_owner);
+ } else {
+ /* Only the initial user namespace doesn't have an owner. */
+ VFS_WARN_ON_ONCE(ns != to_ns_common(&init_user_ns));
+ }
+ }
+
+ VFS_WARN_ON_ONCE(node);
+}
+
+void __ns_tree_remove(struct ns_common *ns, struct ns_tree_root *ns_tree)
+{
+ const struct proc_ns_operations *ops = ns->ops;
+ struct user_namespace *user_ns;
+
+ VFS_WARN_ON_ONCE(ns_tree_node_empty(&ns->ns_tree_node));
+ VFS_WARN_ON_ONCE(list_empty(&ns->ns_tree_node.ns_list_entry));
+
+ write_seqlock(&ns_tree_lock);
+
+ /* Remove from per-type tree and list */
+ ns_tree_node_del(&ns->ns_tree_node, ns_tree);
+
+ /* Remove from unified tree and list */
+ ns_tree_node_del(&ns->ns_unified_node, &ns_unified_root);
+
+ /* Remove from owner's tree if applicable */
+ if (ops) {
+ user_ns = ops->owner(ns);
+ if (user_ns) {
+ struct ns_common *owner = &user_ns->ns;
+ ns_tree_node_del(&ns->ns_owner_node, &owner->ns_owner_root);
+ }
+ }
+
+ write_sequnlock(&ns_tree_lock);
+}
+EXPORT_SYMBOL_GPL(__ns_tree_remove);
+
+static int ns_find(const void *key, const struct rb_node *node)
+{
+ const u64 ns_id = *(u64 *)key;
+ const struct ns_common *ns = node_to_ns(node);
+
+ if (ns_id < ns->ns_id)
+ return -1;
+ if (ns_id > ns->ns_id)
+ return 1;
+ return 0;
+}
+
+static int ns_find_unified(const void *key, const struct rb_node *node)
+{
+ const u64 ns_id = *(u64 *)key;
+ const struct ns_common *ns = node_to_ns_unified(node);
+
+ if (ns_id < ns->ns_id)
+ return -1;
+ if (ns_id > ns->ns_id)
+ return 1;
+ return 0;
+}
+
+static struct ns_tree_root *ns_tree_from_type(int ns_type)
+{
+ switch (ns_type) {
+ case CLONE_NEWCGROUP:
+ return &cgroup_ns_tree;
+ case CLONE_NEWIPC:
+ return &ipc_ns_tree;
+ case CLONE_NEWNS:
+ return &mnt_ns_tree;
+ case CLONE_NEWNET:
+ return &net_ns_tree;
+ case CLONE_NEWPID:
+ return &pid_ns_tree;
+ case CLONE_NEWUSER:
+ return &user_ns_tree;
+ case CLONE_NEWUTS:
+ return &uts_ns_tree;
+ case CLONE_NEWTIME:
+ return &time_ns_tree;
+ }
+
+ return NULL;
+}
+
+static struct ns_common *__ns_unified_tree_lookup_rcu(u64 ns_id)
+{
+ struct rb_node *node;
+ unsigned int seq;
+
+ do {
+ seq = read_seqbegin(&ns_tree_lock);
+ node = rb_find_rcu(&ns_id, &ns_unified_root.ns_rb, ns_find_unified);
+ if (node)
+ break;
+ } while (read_seqretry(&ns_tree_lock, seq));
+
+ return node_to_ns_unified(node);
+}
+
+static struct ns_common *__ns_tree_lookup_rcu(u64 ns_id, int ns_type)
+{
+ struct ns_tree_root *ns_tree;
+ struct rb_node *node;
+ unsigned int seq;
+
+ ns_tree = ns_tree_from_type(ns_type);
+ if (!ns_tree)
+ return NULL;
+
+ do {
+ seq = read_seqbegin(&ns_tree_lock);
+ node = rb_find_rcu(&ns_id, &ns_tree->ns_rb, ns_find);
+ if (node)
+ break;
+ } while (read_seqretry(&ns_tree_lock, seq));
+
+ return node_to_ns(node);
+}
+
+struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type)
+{
+ RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_lookup_rcu() usage");
+
+ if (ns_type)
+ return __ns_tree_lookup_rcu(ns_id, ns_type);
+
+ return __ns_unified_tree_lookup_rcu(ns_id);
+}
+
+/**
+ * __ns_tree_adjoined_rcu - find the next/previous namespace in the same
+ * tree
+ * @ns: namespace to start from
+ * @ns_tree: namespace tree to search in
+ * @previous: if true find the previous namespace, otherwise the next
+ *
+ * Find the next or previous namespace in the same tree as @ns. If
+ * there is no next/previous namespace, -ENOENT is returned.
+ */
+struct ns_common *__ns_tree_adjoined_rcu(struct ns_common *ns,
+ struct ns_tree_root *ns_tree, bool previous)
+{
+ struct list_head *list;
+
+ RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_adjoined_rcu() usage");
+
+ if (previous)
+ list = rcu_dereference(list_bidir_prev_rcu(&ns->ns_tree_node.ns_list_entry));
+ else
+ list = rcu_dereference(list_next_rcu(&ns->ns_tree_node.ns_list_entry));
+ if (list_is_head(list, &ns_tree->ns_list_head))
+ return ERR_PTR(-ENOENT);
+
+ return list_entry_rcu(list, struct ns_common, ns_tree_node.ns_list_entry);
+}
+
+/**
+ * __ns_tree_gen_id - generate a new namespace id
+ * @ns: namespace to generate id for
+ * @id: if non-zero, this is the initial namespace and this is a fixed id
+ *
+ * Generates a new namespace id and assigns it to the namespace. All
+ * namespaces types share the same id space and thus can be compared
+ * directly. IOW, when two ids of two namespace are equal, they are
+ * identical.
+ */
+u64 __ns_tree_gen_id(struct ns_common *ns, u64 id)
+{
+ static atomic64_t namespace_cookie = ATOMIC64_INIT(NS_LAST_INIT_ID + 1);
+
+ if (id)
+ ns->ns_id = id;
+ else
+ ns->ns_id = atomic64_inc_return(&namespace_cookie);
+ return ns->ns_id;
+}
+
+struct klistns {
+ u64 __user *uns_ids;
+ u32 nr_ns_ids;
+ u64 last_ns_id;
+ u64 user_ns_id;
+ u32 ns_type;
+ struct user_namespace *user_ns;
+ bool userns_capable;
+ struct ns_common *first_ns;
+};
+
+static void __free_klistns_free(const struct klistns *kls)
+{
+ if (kls->user_ns_id != LISTNS_CURRENT_USER)
+ put_user_ns(kls->user_ns);
+ if (kls->first_ns && kls->first_ns->ops)
+ kls->first_ns->ops->put(kls->first_ns);
+}
+
+#define NS_ALL (PID_NS | USER_NS | MNT_NS | UTS_NS | IPC_NS | NET_NS | CGROUP_NS | TIME_NS)
+
+static int copy_ns_id_req(const struct ns_id_req __user *req,
+ struct ns_id_req *kreq)
+{
+ int ret;
+ size_t usize;
+
+ BUILD_BUG_ON(sizeof(struct ns_id_req) != NS_ID_REQ_SIZE_VER0);
+
+ ret = get_user(usize, &req->size);
+ if (ret)
+ return -EFAULT;
+ if (unlikely(usize > PAGE_SIZE))
+ return -E2BIG;
+ if (unlikely(usize < NS_ID_REQ_SIZE_VER0))
+ return -EINVAL;
+ memset(kreq, 0, sizeof(*kreq));
+ ret = copy_struct_from_user(kreq, sizeof(*kreq), req, usize);
+ if (ret)
+ return ret;
+ if (kreq->spare != 0)
+ return -EINVAL;
+ if (kreq->ns_type & ~NS_ALL)
+ return -EOPNOTSUPP;
+ return 0;
+}
+
+static inline int prepare_klistns(struct klistns *kls, struct ns_id_req *kreq,
+ u64 __user *ns_ids, size_t nr_ns_ids)
+{
+ kls->last_ns_id = kreq->ns_id;
+ kls->user_ns_id = kreq->user_ns_id;
+ kls->nr_ns_ids = nr_ns_ids;
+ kls->ns_type = kreq->ns_type;
+ kls->uns_ids = ns_ids;
+ return 0;
+}
+
+/*
+ * Lookup a namespace owned by owner with id >= ns_id.
+ * Returns the namespace with the smallest id that is >= ns_id.
+ */
+static struct ns_common *lookup_ns_owner_at(u64 ns_id, struct ns_common *owner)
+{
+ struct ns_common *ret = NULL;
+ struct rb_node *node;
+
+ VFS_WARN_ON_ONCE(owner->ns_type != CLONE_NEWUSER);
+
+ guard(ns_tree_locked_reader)();
+
+ node = owner->ns_owner_root.ns_rb.rb_node;
+ while (node) {
+ struct ns_common *ns;
+
+ ns = node_to_ns_owner(node);
+ if (ns_id <= ns->ns_id) {
+ ret = ns;
+ if (ns_id == ns->ns_id)
+ break;
+ node = node->rb_left;
+ } else {
+ node = node->rb_right;
+ }
+ }
+
+ if (ret)
+ ret = ns_get_unless_inactive(ret);
+ return ret;
+}
+
+static struct ns_common *lookup_ns_id(u64 mnt_ns_id, int ns_type)
+{
+ struct ns_common *ns;
+
+ guard(rcu)();
+ ns = ns_tree_lookup_rcu(mnt_ns_id, ns_type);
+ if (!ns)
+ return NULL;
+
+ if (!ns_get_unless_inactive(ns))
+ return NULL;
+
+ return ns;
+}
+
+static inline bool __must_check ns_requested(const struct klistns *kls,
+ const struct ns_common *ns)
+{
+ return !kls->ns_type || (kls->ns_type & ns->ns_type);
+}
+
+static inline bool __must_check may_list_ns(const struct klistns *kls,
+ struct ns_common *ns)
+{
+ if (kls->user_ns) {
+ if (kls->userns_capable)
+ return true;
+ } else {
+ struct ns_common *owner;
+ struct user_namespace *user_ns;
+
+ owner = ns_owner(ns);
+ if (owner)
+ user_ns = to_user_ns(owner);
+ else
+ user_ns = &init_user_ns;
+ if (ns_capable_noaudit(user_ns, CAP_SYS_ADMIN))
+ return true;
+ }
+
+ if (is_current_namespace(ns))
+ return true;
+
+ if (ns->ns_type != CLONE_NEWUSER)
+ return false;
+
+ if (ns_capable_noaudit(to_user_ns(ns), CAP_SYS_ADMIN))
+ return true;
+
+ return false;
+}
+
+static inline void ns_put(struct ns_common *ns)
+{
+ if (ns && ns->ops)
+ ns->ops->put(ns);
+}
+
+DEFINE_FREE(ns_put, struct ns_common *, if (!IS_ERR_OR_NULL(_T)) ns_put(_T))
+
+static inline struct ns_common *__must_check legitimize_ns(const struct klistns *kls,
+ struct ns_common *candidate)
+{
+ struct ns_common *ns __free(ns_put) = NULL;
+
+ if (!ns_requested(kls, candidate))
+ return NULL;
+
+ ns = ns_get_unless_inactive(candidate);
+ if (!ns)
+ return NULL;
+
+ if (!may_list_ns(kls, ns))
+ return NULL;
+
+ return no_free_ptr(ns);
+}
+
+static ssize_t do_listns_userns(struct klistns *kls)
+{
+ u64 __user *ns_ids = kls->uns_ids;
+ size_t nr_ns_ids = kls->nr_ns_ids;
+ struct ns_common *ns = NULL, *first_ns = NULL, *prev = NULL;
+ const struct list_head *head;
+ ssize_t ret;
+
+ VFS_WARN_ON_ONCE(!kls->user_ns_id);
+
+ if (kls->user_ns_id == LISTNS_CURRENT_USER)
+ ns = to_ns_common(current_user_ns());
+ else if (kls->user_ns_id)
+ ns = lookup_ns_id(kls->user_ns_id, CLONE_NEWUSER);
+ if (!ns)
+ return -EINVAL;
+ kls->user_ns = to_user_ns(ns);
+
+ /*
+ * Use the rbtree to find the first namespace we care about and
+ * then use it's list entry to iterate from there.
+ */
+ if (kls->last_ns_id) {
+ kls->first_ns = lookup_ns_owner_at(kls->last_ns_id + 1, ns);
+ if (!kls->first_ns)
+ return -ENOENT;
+ first_ns = kls->first_ns;
+ }
+
+ ret = 0;
+ head = &to_ns_common(kls->user_ns)->ns_owner_root.ns_list_head;
+ kls->userns_capable = ns_capable_noaudit(kls->user_ns, CAP_SYS_ADMIN);
+
+ rcu_read_lock();
+
+ if (!first_ns)
+ first_ns = list_entry_rcu(head->next, typeof(*first_ns), ns_owner_node.ns_list_entry);
+
+ ns = first_ns;
+ list_for_each_entry_from_rcu(ns, head, ns_owner_node.ns_list_entry) {
+ struct ns_common *valid;
+
+ if (!nr_ns_ids)
+ break;
+
+ valid = legitimize_ns(kls, ns);
+ if (!valid)
+ continue;
+
+ rcu_read_unlock();
+
+ ns_put(prev);
+ prev = valid;
+
+ if (put_user(valid->ns_id, ns_ids + ret)) {
+ ns_put(prev);
+ return -EFAULT;
+ }
+
+ nr_ns_ids--;
+ ret++;
+
+ rcu_read_lock();
+ }
+
+ rcu_read_unlock();
+ ns_put(prev);
+ return ret;
+}
+
+/*
+ * Lookup a namespace with id >= ns_id in either the unified tree or a type-specific tree.
+ * Returns the namespace with the smallest id that is >= ns_id.
+ */
+static struct ns_common *lookup_ns_id_at(u64 ns_id, int ns_type)
+{
+ struct ns_common *ret = NULL;
+ struct ns_tree_root *ns_tree = NULL;
+ struct rb_node *node;
+
+ if (ns_type) {
+ ns_tree = ns_tree_from_type(ns_type);
+ if (!ns_tree)
+ return NULL;
+ }
+
+ guard(ns_tree_locked_reader)();
+
+ if (ns_tree)
+ node = ns_tree->ns_rb.rb_node;
+ else
+ node = ns_unified_root.ns_rb.rb_node;
+
+ while (node) {
+ struct ns_common *ns;
+
+ if (ns_type)
+ ns = node_to_ns(node);
+ else
+ ns = node_to_ns_unified(node);
+
+ if (ns_id <= ns->ns_id) {
+ if (ns_type)
+ ret = node_to_ns(node);
+ else
+ ret = node_to_ns_unified(node);
+ if (ns_id == ns->ns_id)
+ break;
+ node = node->rb_left;
+ } else {
+ node = node->rb_right;
+ }
+ }
+
+ if (ret)
+ ret = ns_get_unless_inactive(ret);
+ return ret;
+}
+
+static inline struct ns_common *first_ns_common(const struct list_head *head,
+ struct ns_tree_root *ns_tree)
+{
+ if (ns_tree)
+ return list_entry_rcu(head->next, struct ns_common, ns_tree_node.ns_list_entry);
+ return list_entry_rcu(head->next, struct ns_common, ns_unified_node.ns_list_entry);
+}
+
+static inline struct ns_common *next_ns_common(struct ns_common *ns,
+ struct ns_tree_root *ns_tree)
+{
+ if (ns_tree)
+ return list_entry_rcu(ns->ns_tree_node.ns_list_entry.next, struct ns_common, ns_tree_node.ns_list_entry);
+ return list_entry_rcu(ns->ns_unified_node.ns_list_entry.next, struct ns_common, ns_unified_node.ns_list_entry);
+}
+
+static inline bool ns_common_is_head(struct ns_common *ns,
+ const struct list_head *head,
+ struct ns_tree_root *ns_tree)
+{
+ if (ns_tree)
+ return &ns->ns_tree_node.ns_list_entry == head;
+ return &ns->ns_unified_node.ns_list_entry == head;
+}
+
+static ssize_t do_listns(struct klistns *kls)
+{
+ u64 __user *ns_ids = kls->uns_ids;
+ size_t nr_ns_ids = kls->nr_ns_ids;
+ struct ns_common *ns, *first_ns = NULL, *prev = NULL;
+ struct ns_tree_root *ns_tree = NULL;
+ const struct list_head *head;
+ u32 ns_type;
+ ssize_t ret;
+
+ if (hweight32(kls->ns_type) == 1)
+ ns_type = kls->ns_type;
+ else
+ ns_type = 0;
+
+ if (ns_type) {
+ ns_tree = ns_tree_from_type(ns_type);
+ if (!ns_tree)
+ return -EINVAL;
+ }
+
+ if (kls->last_ns_id) {
+ kls->first_ns = lookup_ns_id_at(kls->last_ns_id + 1, ns_type);
+ if (!kls->first_ns)
+ return -ENOENT;
+ first_ns = kls->first_ns;
+ }
+
+ ret = 0;
+ if (ns_tree)
+ head = &ns_tree->ns_list_head;
+ else
+ head = &ns_unified_root.ns_list_head;
+
+ rcu_read_lock();
+
+ if (!first_ns)
+ first_ns = first_ns_common(head, ns_tree);
+
+ for (ns = first_ns; !ns_common_is_head(ns, head, ns_tree) && nr_ns_ids;
+ ns = next_ns_common(ns, ns_tree)) {
+ struct ns_common *valid;
+
+ valid = legitimize_ns(kls, ns);
+ if (!valid)
+ continue;
+
+ rcu_read_unlock();
+
+ ns_put(prev);
+ prev = valid;
+
+ if (put_user(valid->ns_id, ns_ids + ret)) {
+ ns_put(prev);
+ return -EFAULT;
+ }
+
+ nr_ns_ids--;
+ ret++;
+
+ rcu_read_lock();
+ }
+
+ rcu_read_unlock();
+ ns_put(prev);
+ return ret;
+}
+
+SYSCALL_DEFINE4(listns, const struct ns_id_req __user *, req,
+ u64 __user *, ns_ids, size_t, nr_ns_ids, unsigned int, flags)
+{
+ struct klistns klns __free(klistns_free) = {};
+ const size_t maxcount = 1000000;
+ struct ns_id_req kreq;
+ ssize_t ret;
+
+ if (flags)
+ return -EINVAL;
+
+ if (unlikely(nr_ns_ids > maxcount))
+ return -EOVERFLOW;
+
+ if (!access_ok(ns_ids, nr_ns_ids * sizeof(*ns_ids)))
+ return -EFAULT;
+
+ ret = copy_ns_id_req(req, &kreq);
+ if (ret)
+ return ret;
+
+ ret = prepare_klistns(&klns, &kreq, ns_ids, nr_ns_ids);
+ if (ret)
+ return ret;
+
+ if (kreq.user_ns_id)
+ return do_listns_userns(&klns);
+
+ return do_listns(&klns);
+}
diff --git a/kernel/padata.c b/kernel/padata.c
index b38bea9c466a..aa66d91e20f9 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -1,25 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* padata.c - generic interface to process data streams in parallel
*
- * See Documentation/padata.txt for an api documentation.
+ * See Documentation/core-api/padata.rst for more information.
*
* Copyright (C) 2008, 2009 secunet Security Networks AG
* Copyright (C) 2008, 2009 Steffen Klassert <steffen.klassert@secunet.com>
*
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ * Copyright (c) 2020 Oracle and/or its affiliates.
+ * Author: Daniel Jordan <daniel.m.jordan@oracle.com>
*/
+#include <linux/completion.h>
#include <linux/export.h>
#include <linux/cpumask.h>
#include <linux/err.h>
@@ -31,117 +23,211 @@
#include <linux/sysfs.h>
#include <linux/rcupdate.h>
-#define MAX_OBJ_NUM 1000
+#define PADATA_WORK_ONSTACK 1 /* Work's memory is on stack */
-static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
-{
- int cpu, target_cpu;
+struct padata_work {
+ struct work_struct pw_work;
+ struct list_head pw_list; /* padata_free_works linkage */
+ void *pw_data;
+};
+
+static DEFINE_SPINLOCK(padata_works_lock);
+static struct padata_work *padata_works;
+static LIST_HEAD(padata_free_works);
+
+struct padata_mt_job_state {
+ spinlock_t lock;
+ struct completion completion;
+ struct padata_mt_job *job;
+ int nworks;
+ int nworks_fini;
+ unsigned long chunk_size;
+};
- target_cpu = cpumask_first(pd->cpumask.pcpu);
- for (cpu = 0; cpu < cpu_index; cpu++)
- target_cpu = cpumask_next(target_cpu, pd->cpumask.pcpu);
+static void padata_free_pd(struct parallel_data *pd);
+static void __init padata_mt_helper(struct work_struct *work);
- return target_cpu;
+static inline void padata_get_pd(struct parallel_data *pd)
+{
+ refcount_inc(&pd->refcnt);
+}
+
+static inline void padata_put_pd_cnt(struct parallel_data *pd, int cnt)
+{
+ if (refcount_sub_and_test(cnt, &pd->refcnt))
+ padata_free_pd(pd);
}
-static int padata_cpu_hash(struct parallel_data *pd)
+static inline void padata_put_pd(struct parallel_data *pd)
{
- unsigned int seq_nr;
- int cpu_index;
+ padata_put_pd_cnt(pd, 1);
+}
+static int padata_cpu_hash(struct parallel_data *pd, unsigned int seq_nr)
+{
/*
* Hash the sequence numbers to the cpus by taking
* seq_nr mod. number of cpus in use.
*/
+ int cpu_index = seq_nr % cpumask_weight(pd->cpumask.pcpu);
- seq_nr = atomic_inc_return(&pd->seq_nr);
- cpu_index = seq_nr % cpumask_weight(pd->cpumask.pcpu);
+ return cpumask_nth(cpu_index, pd->cpumask.pcpu);
+}
+
+static struct padata_work *padata_work_alloc(void)
+{
+ struct padata_work *pw;
- return padata_index_to_cpu(pd, cpu_index);
+ lockdep_assert_held(&padata_works_lock);
+
+ if (list_empty(&padata_free_works))
+ return NULL; /* No more work items allowed to be queued. */
+
+ pw = list_first_entry(&padata_free_works, struct padata_work, pw_list);
+ list_del(&pw->pw_list);
+ return pw;
}
-static void padata_parallel_worker(struct work_struct *parallel_work)
+/*
+ * This function is marked __ref because this function may be optimized in such
+ * a way that it directly refers to work_fn's address, which causes modpost to
+ * complain when work_fn is marked __init. This scenario was observed with clang
+ * LTO, where padata_work_init() was optimized to refer directly to
+ * padata_mt_helper() because the calls to padata_work_init() with other work_fn
+ * values were eliminated or inlined.
+ */
+static void __ref padata_work_init(struct padata_work *pw, work_func_t work_fn,
+ void *data, int flags)
{
- struct padata_parallel_queue *pqueue;
- struct parallel_data *pd;
- struct padata_instance *pinst;
- LIST_HEAD(local_list);
+ if (flags & PADATA_WORK_ONSTACK)
+ INIT_WORK_ONSTACK(&pw->pw_work, work_fn);
+ else
+ INIT_WORK(&pw->pw_work, work_fn);
+ pw->pw_data = data;
+}
- local_bh_disable();
- pqueue = container_of(parallel_work,
- struct padata_parallel_queue, work);
- pd = pqueue->pd;
- pinst = pd->pinst;
+static int __init padata_work_alloc_mt(int nworks, void *data,
+ struct list_head *head)
+{
+ int i;
- spin_lock(&pqueue->parallel.lock);
- list_replace_init(&pqueue->parallel.list, &local_list);
- spin_unlock(&pqueue->parallel.lock);
+ spin_lock_bh(&padata_works_lock);
+ /* Start at 1 because the current task participates in the job. */
+ for (i = 1; i < nworks; ++i) {
+ struct padata_work *pw = padata_work_alloc();
- while (!list_empty(&local_list)) {
- struct padata_priv *padata;
+ if (!pw)
+ break;
+ padata_work_init(pw, padata_mt_helper, data, 0);
+ list_add(&pw->pw_list, head);
+ }
+ spin_unlock_bh(&padata_works_lock);
- padata = list_entry(local_list.next,
- struct padata_priv, list);
+ return i;
+}
- list_del_init(&padata->list);
+static void padata_work_free(struct padata_work *pw)
+{
+ lockdep_assert_held(&padata_works_lock);
+ list_add(&pw->pw_list, &padata_free_works);
+}
- padata->parallel(padata);
+static void __init padata_works_free(struct list_head *works)
+{
+ struct padata_work *cur, *next;
+
+ if (list_empty(works))
+ return;
+
+ spin_lock_bh(&padata_works_lock);
+ list_for_each_entry_safe(cur, next, works, pw_list) {
+ list_del(&cur->pw_list);
+ padata_work_free(cur);
}
+ spin_unlock_bh(&padata_works_lock);
+}
+
+static void padata_parallel_worker(struct work_struct *parallel_work)
+{
+ struct padata_work *pw = container_of(parallel_work, struct padata_work,
+ pw_work);
+ struct padata_priv *padata = pw->pw_data;
+ local_bh_disable();
+ padata->parallel(padata);
+ spin_lock(&padata_works_lock);
+ padata_work_free(pw);
+ spin_unlock(&padata_works_lock);
local_bh_enable();
}
/**
* padata_do_parallel - padata parallelization function
*
- * @pinst: padata instance
+ * @ps: padatashell
* @padata: object to be parallelized
- * @cb_cpu: cpu the serialization callback function will run on,
- * must be in the serial cpumask of padata(i.e. cpumask.cbcpu).
+ * @cb_cpu: pointer to the CPU that the serialization callback function should
+ * run on. If it's not in the serial cpumask of @pinst
+ * (i.e. cpumask.cbcpu), this function selects a fallback CPU and if
+ * none found, returns -EINVAL.
*
* The parallelization callback function will run with BHs off.
* Note: Every object which is parallelized by padata_do_parallel
* must be seen by padata_do_serial.
+ *
+ * Return: 0 on success or else negative error code.
*/
-int padata_do_parallel(struct padata_instance *pinst,
- struct padata_priv *padata, int cb_cpu)
+int padata_do_parallel(struct padata_shell *ps,
+ struct padata_priv *padata, int *cb_cpu)
{
- int target_cpu, err;
- struct padata_parallel_queue *queue;
+ struct padata_instance *pinst = ps->pinst;
struct parallel_data *pd;
+ struct padata_work *pw;
+ int cpu_index, err;
rcu_read_lock_bh();
- pd = rcu_dereference_bh(pinst->pd);
+ pd = rcu_dereference_bh(ps->pd);
err = -EINVAL;
if (!(pinst->flags & PADATA_INIT) || pinst->flags & PADATA_INVALID)
goto out;
- if (!cpumask_test_cpu(cb_cpu, pd->cpumask.cbcpu))
- goto out;
+ if (!cpumask_test_cpu(*cb_cpu, pd->cpumask.cbcpu)) {
+ if (cpumask_empty(pd->cpumask.cbcpu))
+ goto out;
- err = -EBUSY;
- if ((pinst->flags & PADATA_RESET))
- goto out;
+ /* Select an alternate fallback CPU and notify the caller. */
+ cpu_index = *cb_cpu % cpumask_weight(pd->cpumask.cbcpu);
+ *cb_cpu = cpumask_nth(cpu_index, pd->cpumask.cbcpu);
+ }
- if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM)
+ err = -EBUSY;
+ if ((pinst->flags & PADATA_RESET))
goto out;
- err = 0;
- atomic_inc(&pd->refcnt);
+ padata_get_pd(pd);
padata->pd = pd;
- padata->cb_cpu = cb_cpu;
+ padata->cb_cpu = *cb_cpu;
- target_cpu = padata_cpu_hash(pd);
- queue = per_cpu_ptr(pd->pqueue, target_cpu);
+ spin_lock(&padata_works_lock);
+ padata->seq_nr = ++pd->seq_nr;
+ pw = padata_work_alloc();
+ spin_unlock(&padata_works_lock);
- spin_lock(&queue->parallel.lock);
- list_add_tail(&padata->list, &queue->parallel.list);
- spin_unlock(&queue->parallel.lock);
+ if (!pw) {
+ /* Maximum works limit exceeded, run in the current task. */
+ padata->parallel(padata);
+ }
+
+ rcu_read_unlock_bh();
- queue_work_on(target_cpu, pinst->wq, &queue->work);
+ if (pw) {
+ padata_work_init(pw, padata_parallel_worker, padata, 0);
+ queue_work(pinst->parallel_wq, &pw->pw_work);
+ }
+ return 0;
out:
rcu_read_unlock_bh();
@@ -150,143 +236,83 @@ out:
EXPORT_SYMBOL(padata_do_parallel);
/*
- * padata_get_next - Get the next object that needs serialization.
+ * padata_find_next - Find the next object that needs serialization.
*
- * Return values are:
- *
- * A pointer to the control struct of the next object that needs
- * serialization, if present in one of the percpu reorder queues.
- *
- * NULL, if all percpu reorder queues are empty.
- *
- * -EINPROGRESS, if the next object that needs serialization will
- * be parallel processed by another cpu and is not yet present in
- * the cpu's reorder queue.
- *
- * -ENODATA, if this cpu has to do the parallel processing for
- * the next object.
+ * Return:
+ * * A pointer to the control struct of the next object that needs
+ * serialization, if present in one of the percpu reorder queues.
+ * * NULL, if the next object that needs serialization will
+ * be parallel processed by another cpu and is not yet present in
+ * the cpu's reorder queue.
*/
-static struct padata_priv *padata_get_next(struct parallel_data *pd)
+static struct padata_priv *padata_find_next(struct parallel_data *pd, int cpu,
+ unsigned int processed)
{
- int cpu, num_cpus;
- unsigned int next_nr, next_index;
- struct padata_parallel_queue *next_queue;
struct padata_priv *padata;
struct padata_list *reorder;
- num_cpus = cpumask_weight(pd->cpumask.pcpu);
-
- /*
- * Calculate the percpu reorder queue and the sequence
- * number of the next object.
- */
- next_nr = pd->processed;
- next_index = next_nr % num_cpus;
- cpu = padata_index_to_cpu(pd, next_index);
- next_queue = per_cpu_ptr(pd->pqueue, cpu);
-
- padata = NULL;
-
- reorder = &next_queue->reorder;
-
- if (!list_empty(&reorder->list)) {
- padata = list_entry(reorder->list.next,
- struct padata_priv, list);
-
- spin_lock(&reorder->lock);
- list_del_init(&padata->list);
- atomic_dec(&pd->reorder_objects);
- spin_unlock(&reorder->lock);
+ reorder = per_cpu_ptr(pd->reorder_list, cpu);
- pd->processed++;
+ spin_lock(&reorder->lock);
+ if (list_empty(&reorder->list))
+ goto notfound;
- goto out;
- }
+ padata = list_entry(reorder->list.next, struct padata_priv, list);
- if (__this_cpu_read(pd->pqueue->cpu_index) == next_queue->cpu_index) {
- padata = ERR_PTR(-ENODATA);
- goto out;
- }
+ /*
+ * Checks the rare case where two or more parallel jobs have hashed to
+ * the same CPU and one of the later ones finishes first.
+ */
+ if (padata->seq_nr != processed)
+ goto notfound;
- padata = ERR_PTR(-EINPROGRESS);
-out:
+ list_del_init(&padata->list);
+ spin_unlock(&reorder->lock);
return padata;
+
+notfound:
+ pd->processed = processed;
+ pd->cpu = cpu;
+ spin_unlock(&reorder->lock);
+ return NULL;
}
-static void padata_reorder(struct parallel_data *pd)
+static void padata_reorder(struct padata_priv *padata)
{
- int cb_cpu;
- struct padata_priv *padata;
- struct padata_serial_queue *squeue;
- struct padata_instance *pinst = pd->pinst;
-
- /*
- * We need to ensure that only one cpu can work on dequeueing of
- * the reorder queue the time. Calculating in which percpu reorder
- * queue the next object will arrive takes some time. A spinlock
- * would be highly contended. Also it is not clear in which order
- * the objects arrive to the reorder queues. So a cpu could wait to
- * get the lock just to notice that there is nothing to do at the
- * moment. Therefore we use a trylock and let the holder of the lock
- * care for all the objects enqueued during the holdtime of the lock.
- */
- if (!spin_trylock_bh(&pd->lock))
- return;
+ struct parallel_data *pd = padata->pd;
+ struct padata_instance *pinst = pd->ps->pinst;
+ unsigned int processed;
+ int cpu;
- while (1) {
- padata = padata_get_next(pd);
+ processed = pd->processed;
+ cpu = pd->cpu;
- /*
- * All reorder queues are empty, or the next object that needs
- * serialization is parallel processed by another cpu and is
- * still on it's way to the cpu's reorder queue, nothing to
- * do for now.
- */
- if (!padata || PTR_ERR(padata) == -EINPROGRESS)
- break;
+ do {
+ struct padata_serial_queue *squeue;
+ int cb_cpu;
- /*
- * This cpu has to do the parallel processing of the next
- * object. It's waiting in the cpu's parallelization queue,
- * so exit immediately.
- */
- if (PTR_ERR(padata) == -ENODATA) {
- del_timer(&pd->timer);
- spin_unlock_bh(&pd->lock);
- return;
- }
+ processed++;
+ /* When sequence wraps around, reset to the first CPU. */
+ if (unlikely(processed == 0))
+ cpu = cpumask_first(pd->cpumask.pcpu);
+ else
+ cpu = cpumask_next_wrap(cpu, pd->cpumask.pcpu);
cb_cpu = padata->cb_cpu;
squeue = per_cpu_ptr(pd->squeue, cb_cpu);
spin_lock(&squeue->serial.lock);
list_add_tail(&padata->list, &squeue->serial.list);
- spin_unlock(&squeue->serial.lock);
+ queue_work_on(cb_cpu, pinst->serial_wq, &squeue->work);
- queue_work_on(cb_cpu, pinst->wq, &squeue->work);
- }
-
- spin_unlock_bh(&pd->lock);
-
- /*
- * The next object that needs serialization might have arrived to
- * the reorder queues in the meantime, we will be called again
- * from the timer function if no one else cares for it.
- */
- if (atomic_read(&pd->reorder_objects)
- && !(pinst->flags & PADATA_RESET))
- mod_timer(&pd->timer, jiffies + HZ);
- else
- del_timer(&pd->timer);
-
- return;
-}
-
-static void padata_reorder_timer(unsigned long arg)
-{
- struct parallel_data *pd = (struct parallel_data *)arg;
-
- padata_reorder(pd);
+ /*
+ * If the next object that needs serialization is parallel
+ * processed by another cpu and is still on it's way to the
+ * cpu's reorder queue, end the loop.
+ */
+ padata = padata_find_next(pd, cpu, processed);
+ spin_unlock(&squeue->serial.lock);
+ } while (padata);
}
static void padata_serial_worker(struct work_struct *serial_work)
@@ -294,6 +320,7 @@ static void padata_serial_worker(struct work_struct *serial_work)
struct padata_serial_queue *squeue;
struct parallel_data *pd;
LIST_HEAD(local_list);
+ int cnt;
local_bh_disable();
squeue = container_of(serial_work, struct padata_serial_queue, work);
@@ -303,6 +330,8 @@ static void padata_serial_worker(struct work_struct *serial_work)
list_replace_init(&squeue->serial.list, &local_list);
spin_unlock(&squeue->serial.lock);
+ cnt = 0;
+
while (!list_empty(&local_list)) {
struct padata_priv *padata;
@@ -312,9 +341,11 @@ static void padata_serial_worker(struct work_struct *serial_work)
list_del_init(&padata->list);
padata->serial(padata);
- atomic_dec(&pd->refcnt);
+ cnt++;
}
local_bh_enable();
+
+ padata_put_pd_cnt(pd, cnt);
}
/**
@@ -327,47 +358,152 @@ static void padata_serial_worker(struct work_struct *serial_work)
*/
void padata_do_serial(struct padata_priv *padata)
{
- int cpu;
- struct padata_parallel_queue *pqueue;
- struct parallel_data *pd;
+ struct parallel_data *pd = padata->pd;
+ int hashed_cpu = padata_cpu_hash(pd, padata->seq_nr);
+ struct padata_list *reorder = per_cpu_ptr(pd->reorder_list, hashed_cpu);
+ struct padata_priv *cur;
+ struct list_head *pos;
+ bool gotit = true;
+
+ spin_lock(&reorder->lock);
+ /* Sort in ascending order of sequence number. */
+ list_for_each_prev(pos, &reorder->list) {
+ cur = list_entry(pos, struct padata_priv, list);
+ /* Compare by difference to consider integer wrap around */
+ if ((signed int)(cur->seq_nr - padata->seq_nr) < 0)
+ break;
+ }
+ if (padata->seq_nr != pd->processed) {
+ gotit = false;
+ list_add(&padata->list, pos);
+ }
+ spin_unlock(&reorder->lock);
- pd = padata->pd;
+ if (gotit)
+ padata_reorder(padata);
+}
+EXPORT_SYMBOL(padata_do_serial);
- cpu = get_cpu();
- pqueue = per_cpu_ptr(pd->pqueue, cpu);
+static int padata_setup_cpumasks(struct padata_instance *pinst)
+{
+ struct workqueue_attrs *attrs;
+ int err;
- spin_lock(&pqueue->reorder.lock);
- atomic_inc(&pd->reorder_objects);
- list_add_tail(&padata->list, &pqueue->reorder.list);
- spin_unlock(&pqueue->reorder.lock);
+ attrs = alloc_workqueue_attrs();
+ if (!attrs)
+ return -ENOMEM;
- put_cpu();
+ /* Restrict parallel_wq workers to pd->cpumask.pcpu. */
+ cpumask_copy(attrs->cpumask, pinst->cpumask.pcpu);
+ err = apply_workqueue_attrs(pinst->parallel_wq, attrs);
+ free_workqueue_attrs(attrs);
- padata_reorder(pd);
+ return err;
}
-EXPORT_SYMBOL(padata_do_serial);
-static int padata_setup_cpumasks(struct parallel_data *pd,
- const struct cpumask *pcpumask,
- const struct cpumask *cbcpumask)
+static void __init padata_mt_helper(struct work_struct *w)
{
- if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL))
- return -ENOMEM;
+ struct padata_work *pw = container_of(w, struct padata_work, pw_work);
+ struct padata_mt_job_state *ps = pw->pw_data;
+ struct padata_mt_job *job = ps->job;
+ bool done;
- cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_online_mask);
- if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) {
- free_cpumask_var(pd->cpumask.cbcpu);
- return -ENOMEM;
+ spin_lock(&ps->lock);
+
+ while (job->size > 0) {
+ unsigned long start, size, end;
+
+ start = job->start;
+ /* So end is chunk size aligned if enough work remains. */
+ size = roundup(start + 1, ps->chunk_size) - start;
+ size = min(size, job->size);
+ end = start + size;
+
+ job->start = end;
+ job->size -= size;
+
+ spin_unlock(&ps->lock);
+ job->thread_fn(start, end, job->fn_arg);
+ spin_lock(&ps->lock);
}
- cpumask_and(pd->cpumask.cbcpu, cbcpumask, cpu_online_mask);
- return 0;
+ ++ps->nworks_fini;
+ done = (ps->nworks_fini == ps->nworks);
+ spin_unlock(&ps->lock);
+
+ if (done)
+ complete(&ps->completion);
}
-static void __padata_list_init(struct padata_list *pd_list)
+/**
+ * padata_do_multithreaded - run a multithreaded job
+ * @job: Description of the job.
+ *
+ * See the definition of struct padata_mt_job for more details.
+ */
+void __init padata_do_multithreaded(struct padata_mt_job *job)
{
- INIT_LIST_HEAD(&pd_list->list);
- spin_lock_init(&pd_list->lock);
+ /* In case threads finish at different times. */
+ static const unsigned long load_balance_factor = 4;
+ struct padata_work my_work, *pw;
+ struct padata_mt_job_state ps;
+ LIST_HEAD(works);
+ int nworks, nid;
+ static atomic_t last_used_nid __initdata;
+
+ if (job->size == 0)
+ return;
+
+ /* Ensure at least one thread when size < min_chunk. */
+ nworks = max(job->size / max(job->min_chunk, job->align), 1ul);
+ nworks = min(nworks, job->max_threads);
+
+ if (nworks == 1) {
+ /* Single thread, no coordination needed, cut to the chase. */
+ job->thread_fn(job->start, job->start + job->size, job->fn_arg);
+ return;
+ }
+
+ spin_lock_init(&ps.lock);
+ init_completion(&ps.completion);
+ ps.job = job;
+ ps.nworks = padata_work_alloc_mt(nworks, &ps, &works);
+ ps.nworks_fini = 0;
+
+ /*
+ * Chunk size is the amount of work a helper does per call to the
+ * thread function. Load balance large jobs between threads by
+ * increasing the number of chunks, guarantee at least the minimum
+ * chunk size from the caller, and honor the caller's alignment.
+ * Ensure chunk_size is at least 1 to prevent divide-by-0
+ * panic in padata_mt_helper().
+ */
+ ps.chunk_size = job->size / (ps.nworks * load_balance_factor);
+ ps.chunk_size = max(ps.chunk_size, job->min_chunk);
+ ps.chunk_size = max(ps.chunk_size, 1ul);
+ ps.chunk_size = roundup(ps.chunk_size, job->align);
+
+ list_for_each_entry(pw, &works, pw_list)
+ if (job->numa_aware) {
+ int old_node = atomic_read(&last_used_nid);
+
+ do {
+ nid = next_node_in(old_node, node_states[N_CPU]);
+ } while (!atomic_try_cmpxchg(&last_used_nid, &old_node, nid));
+ queue_work_node(nid, system_dfl_wq, &pw->pw_work);
+ } else {
+ queue_work(system_dfl_wq, &pw->pw_work);
+ }
+
+ /* Use the current thread, which saves starting a workqueue worker. */
+ padata_work_init(&my_work, padata_mt_helper, &ps, PADATA_WORK_ONSTACK);
+ padata_mt_helper(&my_work.pw_work);
+
+ /* Wait for all the helpers to finish. */
+ wait_for_completion(&ps.completion);
+
+ destroy_work_on_stack(&my_work.pw_work);
+ padata_works_free(&works);
}
/* Initialize all percpu queues used by serial workers */
@@ -379,67 +515,67 @@ static void padata_init_squeues(struct parallel_data *pd)
for_each_cpu(cpu, pd->cpumask.cbcpu) {
squeue = per_cpu_ptr(pd->squeue, cpu);
squeue->pd = pd;
- __padata_list_init(&squeue->serial);
+ INIT_LIST_HEAD(&squeue->serial.list);
+ spin_lock_init(&squeue->serial.lock);
INIT_WORK(&squeue->work, padata_serial_worker);
}
}
-/* Initialize all percpu queues used by parallel workers */
-static void padata_init_pqueues(struct parallel_data *pd)
+/* Initialize per-CPU reorder lists */
+static void padata_init_reorder_list(struct parallel_data *pd)
{
- int cpu_index, cpu;
- struct padata_parallel_queue *pqueue;
+ int cpu;
+ struct padata_list *list;
- cpu_index = 0;
for_each_cpu(cpu, pd->cpumask.pcpu) {
- pqueue = per_cpu_ptr(pd->pqueue, cpu);
- pqueue->pd = pd;
- pqueue->cpu_index = cpu_index;
- cpu_index++;
-
- __padata_list_init(&pqueue->reorder);
- __padata_list_init(&pqueue->parallel);
- INIT_WORK(&pqueue->work, padata_parallel_worker);
- atomic_set(&pqueue->num_obj, 0);
+ list = per_cpu_ptr(pd->reorder_list, cpu);
+ INIT_LIST_HEAD(&list->list);
+ spin_lock_init(&list->lock);
}
}
/* Allocate and initialize the internal cpumask dependend resources. */
-static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
- const struct cpumask *pcpumask,
- const struct cpumask *cbcpumask)
+static struct parallel_data *padata_alloc_pd(struct padata_shell *ps)
{
+ struct padata_instance *pinst = ps->pinst;
struct parallel_data *pd;
pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL);
if (!pd)
goto err;
- pd->pqueue = alloc_percpu(struct padata_parallel_queue);
- if (!pd->pqueue)
+ pd->reorder_list = alloc_percpu(struct padata_list);
+ if (!pd->reorder_list)
goto err_free_pd;
pd->squeue = alloc_percpu(struct padata_serial_queue);
if (!pd->squeue)
- goto err_free_pqueue;
- if (padata_setup_cpumasks(pd, pcpumask, cbcpumask) < 0)
+ goto err_free_reorder_list;
+
+ pd->ps = ps;
+
+ if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL))
goto err_free_squeue;
+ if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL))
+ goto err_free_pcpu;
+
+ cpumask_and(pd->cpumask.pcpu, pinst->cpumask.pcpu, cpu_online_mask);
+ cpumask_and(pd->cpumask.cbcpu, pinst->cpumask.cbcpu, cpu_online_mask);
- padata_init_pqueues(pd);
+ padata_init_reorder_list(pd);
padata_init_squeues(pd);
- setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd);
- atomic_set(&pd->seq_nr, -1);
- atomic_set(&pd->reorder_objects, 0);
- atomic_set(&pd->refcnt, 0);
- pd->pinst = pinst;
- spin_lock_init(&pd->lock);
+ pd->seq_nr = -1;
+ refcount_set(&pd->refcnt, 1);
+ pd->cpu = cpumask_first(pd->cpumask.pcpu);
return pd;
+err_free_pcpu:
+ free_cpumask_var(pd->cpumask.pcpu);
err_free_squeue:
free_percpu(pd->squeue);
-err_free_pqueue:
- free_percpu(pd->pqueue);
+err_free_reorder_list:
+ free_percpu(pd->reorder_list);
err_free_pd:
kfree(pd);
err:
@@ -450,36 +586,11 @@ static void padata_free_pd(struct parallel_data *pd)
{
free_cpumask_var(pd->cpumask.pcpu);
free_cpumask_var(pd->cpumask.cbcpu);
- free_percpu(pd->pqueue);
+ free_percpu(pd->reorder_list);
free_percpu(pd->squeue);
kfree(pd);
}
-/* Flush all objects out of the padata queues. */
-static void padata_flush_queues(struct parallel_data *pd)
-{
- int cpu;
- struct padata_parallel_queue *pqueue;
- struct padata_serial_queue *squeue;
-
- for_each_cpu(cpu, pd->cpumask.pcpu) {
- pqueue = per_cpu_ptr(pd->pqueue, cpu);
- flush_work(&pqueue->work);
- }
-
- del_timer_sync(&pd->timer);
-
- if (atomic_read(&pd->reorder_objects))
- padata_reorder(pd);
-
- for_each_cpu(cpu, pd->cpumask.cbcpu) {
- squeue = per_cpu_ptr(pd->squeue, cpu);
- flush_work(&squeue->work);
- }
-
- BUG_ON(atomic_read(&pd->refcnt) != 0);
-}
-
static void __padata_start(struct padata_instance *pinst)
{
pinst->flags |= PADATA_INIT;
@@ -493,72 +604,45 @@ static void __padata_stop(struct padata_instance *pinst)
pinst->flags &= ~PADATA_INIT;
synchronize_rcu();
-
- get_online_cpus();
- padata_flush_queues(pinst->pd);
- put_online_cpus();
}
/* Replace the internal control structure with a new one. */
-static void padata_replace(struct padata_instance *pinst,
- struct parallel_data *pd_new)
+static int padata_replace_one(struct padata_shell *ps)
{
- struct parallel_data *pd_old = pinst->pd;
- int notification_mask = 0;
+ struct parallel_data *pd_new;
- pinst->flags |= PADATA_RESET;
+ pd_new = padata_alloc_pd(ps);
+ if (!pd_new)
+ return -ENOMEM;
- rcu_assign_pointer(pinst->pd, pd_new);
+ ps->opd = rcu_dereference_protected(ps->pd, 1);
+ rcu_assign_pointer(ps->pd, pd_new);
- synchronize_rcu();
+ return 0;
+}
- if (!cpumask_equal(pd_old->cpumask.pcpu, pd_new->cpumask.pcpu))
- notification_mask |= PADATA_CPU_PARALLEL;
- if (!cpumask_equal(pd_old->cpumask.cbcpu, pd_new->cpumask.cbcpu))
- notification_mask |= PADATA_CPU_SERIAL;
+static int padata_replace(struct padata_instance *pinst)
+{
+ struct padata_shell *ps;
+ int err = 0;
- padata_flush_queues(pd_old);
- padata_free_pd(pd_old);
+ pinst->flags |= PADATA_RESET;
- if (notification_mask)
- blocking_notifier_call_chain(&pinst->cpumask_change_notifier,
- notification_mask,
- &pd_new->cpumask);
+ list_for_each_entry(ps, &pinst->pslist, list) {
+ err = padata_replace_one(ps);
+ if (err)
+ break;
+ }
- pinst->flags &= ~PADATA_RESET;
-}
+ synchronize_rcu();
-/**
- * padata_register_cpumask_notifier - Registers a notifier that will be called
- * if either pcpu or cbcpu or both cpumasks change.
- *
- * @pinst: A poineter to padata instance
- * @nblock: A pointer to notifier block.
- */
-int padata_register_cpumask_notifier(struct padata_instance *pinst,
- struct notifier_block *nblock)
-{
- return blocking_notifier_chain_register(&pinst->cpumask_change_notifier,
- nblock);
-}
-EXPORT_SYMBOL(padata_register_cpumask_notifier);
+ list_for_each_entry_continue_reverse(ps, &pinst->pslist, list)
+ padata_put_pd(ps->opd);
-/**
- * padata_unregister_cpumask_notifier - Unregisters cpumask notifier
- * registered earlier using padata_register_cpumask_notifier
- *
- * @pinst: A pointer to data instance.
- * @nlock: A pointer to notifier block.
- */
-int padata_unregister_cpumask_notifier(struct padata_instance *pinst,
- struct notifier_block *nblock)
-{
- return blocking_notifier_chain_unregister(
- &pinst->cpumask_change_notifier,
- nblock);
-}
-EXPORT_SYMBOL(padata_unregister_cpumask_notifier);
+ pinst->flags &= ~PADATA_RESET;
+ return err;
+}
/* If cpumask contains no active cpu, we mark the instance as invalid. */
static bool padata_validate_cpumask(struct padata_instance *pinst,
@@ -578,7 +662,7 @@ static int __padata_set_cpumasks(struct padata_instance *pinst,
cpumask_var_t cbcpumask)
{
int valid;
- struct parallel_data *pd;
+ int err;
valid = padata_validate_cpumask(pinst, pcpumask);
if (!valid) {
@@ -591,56 +675,26 @@ static int __padata_set_cpumasks(struct padata_instance *pinst,
__padata_stop(pinst);
out_replace:
- pd = padata_alloc_pd(pinst, pcpumask, cbcpumask);
- if (!pd)
- return -ENOMEM;
-
cpumask_copy(pinst->cpumask.pcpu, pcpumask);
cpumask_copy(pinst->cpumask.cbcpu, cbcpumask);
- padata_replace(pinst, pd);
+ err = padata_setup_cpumasks(pinst) ?: padata_replace(pinst);
if (valid)
__padata_start(pinst);
- return 0;
-}
-
-/**
- * padata_set_cpumasks - Set both parallel and serial cpumasks. The first
- * one is used by parallel workers and the second one
- * by the wokers doing serialization.
- *
- * @pinst: padata instance
- * @pcpumask: the cpumask to use for parallel workers
- * @cbcpumask: the cpumsak to use for serial workers
- */
-int padata_set_cpumasks(struct padata_instance *pinst, cpumask_var_t pcpumask,
- cpumask_var_t cbcpumask)
-{
- int err;
-
- mutex_lock(&pinst->lock);
- get_online_cpus();
-
- err = __padata_set_cpumasks(pinst, pcpumask, cbcpumask);
-
- put_online_cpus();
- mutex_unlock(&pinst->lock);
-
return err;
-
}
-EXPORT_SYMBOL(padata_set_cpumasks);
/**
- * padata_set_cpumask: Sets specified by @cpumask_type cpumask to the value
- * equivalent to @cpumask.
- *
+ * padata_set_cpumask - Sets specified by @cpumask_type cpumask to the value
+ * equivalent to @cpumask.
* @pinst: padata instance
* @cpumask_type: PADATA_CPU_SERIAL or PADATA_CPU_PARALLEL corresponding
* to parallel and serial cpumasks respectively.
* @cpumask: the cpumask to use
+ *
+ * Return: 0 on success or negative error code
*/
int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
cpumask_var_t cpumask)
@@ -648,8 +702,8 @@ int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
struct cpumask *serial_mask, *parallel_mask;
int err = -EINVAL;
+ cpus_read_lock();
mutex_lock(&pinst->lock);
- get_online_cpus();
switch (cpumask_type) {
case PADATA_CPU_PARALLEL:
@@ -667,223 +721,98 @@ int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
err = __padata_set_cpumasks(pinst, parallel_mask, serial_mask);
out:
- put_online_cpus();
mutex_unlock(&pinst->lock);
+ cpus_read_unlock();
return err;
}
EXPORT_SYMBOL(padata_set_cpumask);
+#ifdef CONFIG_HOTPLUG_CPU
+
static int __padata_add_cpu(struct padata_instance *pinst, int cpu)
{
- struct parallel_data *pd;
+ int err = 0;
if (cpumask_test_cpu(cpu, cpu_online_mask)) {
- pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu,
- pinst->cpumask.cbcpu);
- if (!pd)
- return -ENOMEM;
-
- padata_replace(pinst, pd);
+ err = padata_replace(pinst);
if (padata_validate_cpumask(pinst, pinst->cpumask.pcpu) &&
padata_validate_cpumask(pinst, pinst->cpumask.cbcpu))
__padata_start(pinst);
}
- return 0;
-}
-
- /**
- * padata_add_cpu - add a cpu to one or both(parallel and serial)
- * padata cpumasks.
- *
- * @pinst: padata instance
- * @cpu: cpu to add
- * @mask: bitmask of flags specifying to which cpumask @cpu shuld be added.
- * The @mask may be any combination of the following flags:
- * PADATA_CPU_SERIAL - serial cpumask
- * PADATA_CPU_PARALLEL - parallel cpumask
- */
-
-int padata_add_cpu(struct padata_instance *pinst, int cpu, int mask)
-{
- int err;
-
- if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL)))
- return -EINVAL;
-
- mutex_lock(&pinst->lock);
-
- get_online_cpus();
- if (mask & PADATA_CPU_SERIAL)
- cpumask_set_cpu(cpu, pinst->cpumask.cbcpu);
- if (mask & PADATA_CPU_PARALLEL)
- cpumask_set_cpu(cpu, pinst->cpumask.pcpu);
-
- err = __padata_add_cpu(pinst, cpu);
- put_online_cpus();
-
- mutex_unlock(&pinst->lock);
-
return err;
}
-EXPORT_SYMBOL(padata_add_cpu);
static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
{
- struct parallel_data *pd = NULL;
-
- if (cpumask_test_cpu(cpu, cpu_online_mask)) {
+ int err = 0;
+ if (!cpumask_test_cpu(cpu, cpu_online_mask)) {
if (!padata_validate_cpumask(pinst, pinst->cpumask.pcpu) ||
!padata_validate_cpumask(pinst, pinst->cpumask.cbcpu))
__padata_stop(pinst);
- pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu,
- pinst->cpumask.cbcpu);
- if (!pd)
- return -ENOMEM;
-
- padata_replace(pinst, pd);
-
- cpumask_clear_cpu(cpu, pd->cpumask.cbcpu);
- cpumask_clear_cpu(cpu, pd->cpumask.pcpu);
+ err = padata_replace(pinst);
}
- return 0;
+ return err;
}
- /**
- * padata_remove_cpu - remove a cpu from the one or both(serial and parallel)
- * padata cpumasks.
- *
- * @pinst: padata instance
- * @cpu: cpu to remove
- * @mask: bitmask specifying from which cpumask @cpu should be removed
- * The @mask may be any combination of the following flags:
- * PADATA_CPU_SERIAL - serial cpumask
- * PADATA_CPU_PARALLEL - parallel cpumask
- */
-int padata_remove_cpu(struct padata_instance *pinst, int cpu, int mask)
+static inline int pinst_has_cpu(struct padata_instance *pinst, int cpu)
{
- int err;
-
- if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL)))
- return -EINVAL;
-
- mutex_lock(&pinst->lock);
-
- get_online_cpus();
- if (mask & PADATA_CPU_SERIAL)
- cpumask_clear_cpu(cpu, pinst->cpumask.cbcpu);
- if (mask & PADATA_CPU_PARALLEL)
- cpumask_clear_cpu(cpu, pinst->cpumask.pcpu);
-
- err = __padata_remove_cpu(pinst, cpu);
- put_online_cpus();
-
- mutex_unlock(&pinst->lock);
-
- return err;
+ return cpumask_test_cpu(cpu, pinst->cpumask.pcpu) ||
+ cpumask_test_cpu(cpu, pinst->cpumask.cbcpu);
}
-EXPORT_SYMBOL(padata_remove_cpu);
-/**
- * padata_start - start the parallel processing
- *
- * @pinst: padata instance to start
- */
-int padata_start(struct padata_instance *pinst)
+static int padata_cpu_online(unsigned int cpu, struct hlist_node *node)
{
- int err = 0;
-
- mutex_lock(&pinst->lock);
-
- if (pinst->flags & PADATA_INVALID)
- err =-EINVAL;
-
- __padata_start(pinst);
-
- mutex_unlock(&pinst->lock);
+ struct padata_instance *pinst;
+ int ret;
- return err;
-}
-EXPORT_SYMBOL(padata_start);
+ pinst = hlist_entry_safe(node, struct padata_instance, cpu_online_node);
+ if (!pinst_has_cpu(pinst, cpu))
+ return 0;
-/**
- * padata_stop - stop the parallel processing
- *
- * @pinst: padata instance to stop
- */
-void padata_stop(struct padata_instance *pinst)
-{
mutex_lock(&pinst->lock);
- __padata_stop(pinst);
+ ret = __padata_add_cpu(pinst, cpu);
mutex_unlock(&pinst->lock);
+ return ret;
}
-EXPORT_SYMBOL(padata_stop);
-
-#ifdef CONFIG_HOTPLUG_CPU
-
-static inline int pinst_has_cpu(struct padata_instance *pinst, int cpu)
-{
- return cpumask_test_cpu(cpu, pinst->cpumask.pcpu) ||
- cpumask_test_cpu(cpu, pinst->cpumask.cbcpu);
-}
-
-static int padata_cpu_callback(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
+static int padata_cpu_dead(unsigned int cpu, struct hlist_node *node)
{
- int err;
struct padata_instance *pinst;
- int cpu = (unsigned long)hcpu;
-
- pinst = container_of(nfb, struct padata_instance, cpu_notifier);
-
- switch (action) {
- case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- case CPU_DOWN_FAILED:
- case CPU_DOWN_FAILED_FROZEN:
- if (!pinst_has_cpu(pinst, cpu))
- break;
- mutex_lock(&pinst->lock);
- err = __padata_add_cpu(pinst, cpu);
- mutex_unlock(&pinst->lock);
- if (err)
- return notifier_from_errno(err);
- break;
+ int ret;
- case CPU_DOWN_PREPARE:
- case CPU_DOWN_PREPARE_FROZEN:
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- if (!pinst_has_cpu(pinst, cpu))
- break;
- mutex_lock(&pinst->lock);
- err = __padata_remove_cpu(pinst, cpu);
- mutex_unlock(&pinst->lock);
- if (err)
- return notifier_from_errno(err);
- break;
- }
+ pinst = hlist_entry_safe(node, struct padata_instance, cpu_dead_node);
+ if (!pinst_has_cpu(pinst, cpu))
+ return 0;
- return NOTIFY_OK;
+ mutex_lock(&pinst->lock);
+ ret = __padata_remove_cpu(pinst, cpu);
+ mutex_unlock(&pinst->lock);
+ return ret;
}
+
+static enum cpuhp_state hp_online;
#endif
static void __padata_free(struct padata_instance *pinst)
{
#ifdef CONFIG_HOTPLUG_CPU
- unregister_hotcpu_notifier(&pinst->cpu_notifier);
+ cpuhp_state_remove_instance_nocalls(CPUHP_PADATA_DEAD,
+ &pinst->cpu_dead_node);
+ cpuhp_state_remove_instance_nocalls(hp_online, &pinst->cpu_online_node);
#endif
- padata_stop(pinst);
- padata_free_pd(pinst->pd);
+ WARN_ON(!list_empty(&pinst->pslist));
+
free_cpumask_var(pinst->cpumask.pcpu);
free_cpumask_var(pinst->cpumask.cbcpu);
+ destroy_workqueue(pinst->serial_wq);
+ destroy_workqueue(pinst->parallel_wq);
kfree(pinst);
}
@@ -970,6 +899,7 @@ static struct attribute *padata_default_attrs[] = {
&parallel_cpumask_attr.attr,
NULL,
};
+ATTRIBUTE_GROUPS(padata_default);
static ssize_t padata_sysfs_show(struct kobject *kobj,
struct attribute *attr, char *buf)
@@ -995,7 +925,7 @@ static ssize_t padata_sysfs_store(struct kobject *kobj, struct attribute *attr,
pinst = kobj2pinst(kobj);
pentry = attr2pentry(attr);
- if (pentry->show)
+ if (pentry->store)
ret = pentry->store(pinst, attr, buf, count);
return ret;
@@ -1006,88 +936,80 @@ static const struct sysfs_ops padata_sysfs_ops = {
.store = padata_sysfs_store,
};
-static struct kobj_type padata_attr_type = {
+static const struct kobj_type padata_attr_type = {
.sysfs_ops = &padata_sysfs_ops,
- .default_attrs = padata_default_attrs,
+ .default_groups = padata_default_groups,
.release = padata_sysfs_release,
};
/**
- * padata_alloc_possible - Allocate and initialize padata instance.
- * Use the cpu_possible_mask for serial and
- * parallel workers.
+ * padata_alloc - allocate and initialize a padata instance
+ * @name: used to identify the instance
*
- * @wq: workqueue to use for the allocated padata instance
+ * Return: new instance on success, NULL on error
*/
-struct padata_instance *padata_alloc_possible(struct workqueue_struct *wq)
-{
- return padata_alloc(wq, cpu_possible_mask, cpu_possible_mask);
-}
-EXPORT_SYMBOL(padata_alloc_possible);
-
-/**
- * padata_alloc - allocate and initialize a padata instance and specify
- * cpumasks for serial and parallel workers.
- *
- * @wq: workqueue to use for the allocated padata instance
- * @pcpumask: cpumask that will be used for padata parallelization
- * @cbcpumask: cpumask that will be used for padata serialization
- */
-struct padata_instance *padata_alloc(struct workqueue_struct *wq,
- const struct cpumask *pcpumask,
- const struct cpumask *cbcpumask)
+struct padata_instance *padata_alloc(const char *name)
{
struct padata_instance *pinst;
- struct parallel_data *pd = NULL;
pinst = kzalloc(sizeof(struct padata_instance), GFP_KERNEL);
if (!pinst)
goto err;
- get_online_cpus();
- if (!alloc_cpumask_var(&pinst->cpumask.pcpu, GFP_KERNEL))
+ pinst->parallel_wq = alloc_workqueue("%s_parallel", WQ_UNBOUND, 0,
+ name);
+ if (!pinst->parallel_wq)
goto err_free_inst;
+
+ cpus_read_lock();
+
+ pinst->serial_wq = alloc_workqueue("%s_serial",
+ WQ_MEM_RECLAIM | WQ_CPU_INTENSIVE | WQ_PERCPU,
+ 1, name);
+ if (!pinst->serial_wq)
+ goto err_put_cpus;
+
+ if (!alloc_cpumask_var(&pinst->cpumask.pcpu, GFP_KERNEL))
+ goto err_free_serial_wq;
if (!alloc_cpumask_var(&pinst->cpumask.cbcpu, GFP_KERNEL)) {
free_cpumask_var(pinst->cpumask.pcpu);
- goto err_free_inst;
+ goto err_free_serial_wq;
}
- if (!padata_validate_cpumask(pinst, pcpumask) ||
- !padata_validate_cpumask(pinst, cbcpumask))
- goto err_free_masks;
-
- pd = padata_alloc_pd(pinst, pcpumask, cbcpumask);
- if (!pd)
- goto err_free_masks;
-
- rcu_assign_pointer(pinst->pd, pd);
- pinst->wq = wq;
+ INIT_LIST_HEAD(&pinst->pslist);
- cpumask_copy(pinst->cpumask.pcpu, pcpumask);
- cpumask_copy(pinst->cpumask.cbcpu, cbcpumask);
+ cpumask_copy(pinst->cpumask.pcpu, cpu_possible_mask);
+ cpumask_copy(pinst->cpumask.cbcpu, cpu_possible_mask);
- pinst->flags = 0;
+ if (padata_setup_cpumasks(pinst))
+ goto err_free_masks;
- put_online_cpus();
+ __padata_start(pinst);
- BLOCKING_INIT_NOTIFIER_HEAD(&pinst->cpumask_change_notifier);
kobject_init(&pinst->kobj, &padata_attr_type);
mutex_init(&pinst->lock);
#ifdef CONFIG_HOTPLUG_CPU
- pinst->cpu_notifier.notifier_call = padata_cpu_callback;
- pinst->cpu_notifier.priority = 0;
- register_hotcpu_notifier(&pinst->cpu_notifier);
+ cpuhp_state_add_instance_nocalls_cpuslocked(hp_online,
+ &pinst->cpu_online_node);
+ cpuhp_state_add_instance_nocalls_cpuslocked(CPUHP_PADATA_DEAD,
+ &pinst->cpu_dead_node);
#endif
+ cpus_read_unlock();
+
return pinst;
err_free_masks:
free_cpumask_var(pinst->cpumask.pcpu);
free_cpumask_var(pinst->cpumask.cbcpu);
+err_free_serial_wq:
+ destroy_workqueue(pinst->serial_wq);
+err_put_cpus:
+ cpus_read_unlock();
+ destroy_workqueue(pinst->parallel_wq);
err_free_inst:
kfree(pinst);
- put_online_cpus();
err:
return NULL;
}
@@ -1096,10 +1018,110 @@ EXPORT_SYMBOL(padata_alloc);
/**
* padata_free - free a padata instance
*
- * @padata_inst: padata instance to free
+ * @pinst: padata instance to free
*/
void padata_free(struct padata_instance *pinst)
{
kobject_put(&pinst->kobj);
}
EXPORT_SYMBOL(padata_free);
+
+/**
+ * padata_alloc_shell - Allocate and initialize padata shell.
+ *
+ * @pinst: Parent padata_instance object.
+ *
+ * Return: new shell on success, NULL on error
+ */
+struct padata_shell *padata_alloc_shell(struct padata_instance *pinst)
+{
+ struct parallel_data *pd;
+ struct padata_shell *ps;
+
+ ps = kzalloc(sizeof(*ps), GFP_KERNEL);
+ if (!ps)
+ goto out;
+
+ ps->pinst = pinst;
+
+ cpus_read_lock();
+ pd = padata_alloc_pd(ps);
+ cpus_read_unlock();
+
+ if (!pd)
+ goto out_free_ps;
+
+ mutex_lock(&pinst->lock);
+ RCU_INIT_POINTER(ps->pd, pd);
+ list_add(&ps->list, &pinst->pslist);
+ mutex_unlock(&pinst->lock);
+
+ return ps;
+
+out_free_ps:
+ kfree(ps);
+out:
+ return NULL;
+}
+EXPORT_SYMBOL(padata_alloc_shell);
+
+/**
+ * padata_free_shell - free a padata shell
+ *
+ * @ps: padata shell to free
+ */
+void padata_free_shell(struct padata_shell *ps)
+{
+ struct parallel_data *pd;
+
+ if (!ps)
+ return;
+
+ mutex_lock(&ps->pinst->lock);
+ list_del(&ps->list);
+ pd = rcu_dereference_protected(ps->pd, 1);
+ padata_put_pd(pd);
+ mutex_unlock(&ps->pinst->lock);
+
+ kfree(ps);
+}
+EXPORT_SYMBOL(padata_free_shell);
+
+void __init padata_init(void)
+{
+ unsigned int i, possible_cpus;
+#ifdef CONFIG_HOTPLUG_CPU
+ int ret;
+
+ ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "padata:online",
+ padata_cpu_online, NULL);
+ if (ret < 0)
+ goto err;
+ hp_online = ret;
+
+ ret = cpuhp_setup_state_multi(CPUHP_PADATA_DEAD, "padata:dead",
+ NULL, padata_cpu_dead);
+ if (ret < 0)
+ goto remove_online_state;
+#endif
+
+ possible_cpus = num_possible_cpus();
+ padata_works = kmalloc_array(possible_cpus, sizeof(struct padata_work),
+ GFP_KERNEL);
+ if (!padata_works)
+ goto remove_dead_state;
+
+ for (i = 0; i < possible_cpus; ++i)
+ list_add(&padata_works[i].pw_list, &padata_free_works);
+
+ return;
+
+remove_dead_state:
+#ifdef CONFIG_HOTPLUG_CPU
+ cpuhp_remove_multi_state(CPUHP_PADATA_DEAD);
+remove_online_state:
+ cpuhp_remove_multi_state(hp_online);
+err:
+#endif
+ pr_warn("padata: initialization failed\n");
+}
diff --git a/kernel/panic.c b/kernel/panic.c
index 8136ad76e5fd..0d52210a9e2b 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/kernel/panic.c
*
@@ -9,39 +10,241 @@
* to indicate a major problem.
*/
#include <linux/debug_locks.h>
+#include <linux/sched/debug.h>
#include <linux/interrupt.h>
+#include <linux/kgdb.h>
#include <linux/kmsg_dump.h>
#include <linux/kallsyms.h>
#include <linux/notifier.h>
+#include <linux/vt_kern.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/ftrace.h>
#include <linux/reboot.h>
#include <linux/delay.h>
#include <linux/kexec.h>
+#include <linux/panic_notifier.h>
#include <linux/sched.h>
+#include <linux/string_helpers.h>
#include <linux/sysrq.h>
#include <linux/init.h>
#include <linux/nmi.h>
+#include <linux/console.h>
+#include <linux/bug.h>
+#include <linux/ratelimit.h>
+#include <linux/debugfs.h>
+#include <linux/sysfs.h>
+#include <linux/context_tracking.h>
+#include <linux/seq_buf.h>
+#include <linux/sys_info.h>
+#include <trace/events/error_report.h>
+#include <asm/sections.h>
#define PANIC_TIMER_STEP 100
#define PANIC_BLINK_SPD 18
-int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE;
-static unsigned long tainted_mask;
+#ifdef CONFIG_SMP
+/*
+ * Should we dump all CPUs backtraces in an oops event?
+ * Defaults to 0, can be changed via sysctl.
+ */
+static unsigned int __read_mostly sysctl_oops_all_cpu_backtrace;
+#else
+#define sysctl_oops_all_cpu_backtrace 0
+#endif /* CONFIG_SMP */
+
+int panic_on_oops = IS_ENABLED(CONFIG_PANIC_ON_OOPS);
+static unsigned long tainted_mask =
+ IS_ENABLED(CONFIG_RANDSTRUCT) ? (1 << TAINT_RANDSTRUCT) : 0;
static int pause_on_oops;
static int pause_on_oops_flag;
static DEFINE_SPINLOCK(pause_on_oops_lock);
-static bool crash_kexec_post_notifiers;
+bool crash_kexec_post_notifiers;
int panic_on_warn __read_mostly;
+unsigned long panic_on_taint;
+bool panic_on_taint_nousertaint = false;
+static unsigned int warn_limit __read_mostly;
+static bool panic_console_replay;
+
+bool panic_triggering_all_cpu_backtrace;
+static bool panic_this_cpu_backtrace_printed;
int panic_timeout = CONFIG_PANIC_TIMEOUT;
EXPORT_SYMBOL_GPL(panic_timeout);
+unsigned long panic_print;
+
ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
EXPORT_SYMBOL(panic_notifier_list);
+static void panic_print_deprecated(void)
+{
+ pr_info_once("Kernel: The 'panic_print' parameter is now deprecated. Please use 'panic_sys_info' and 'panic_console_replay' instead.\n");
+}
+
+#ifdef CONFIG_SYSCTL
+
+/*
+ * Taint values can only be increased
+ * This means we can safely use a temporary.
+ */
+static int proc_taint(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table t;
+ unsigned long tmptaint = get_taint();
+ int err;
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ t = *table;
+ t.data = &tmptaint;
+ err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
+ if (err < 0)
+ return err;
+
+ if (write) {
+ int i;
+
+ /*
+ * If we are relying on panic_on_taint not producing
+ * false positives due to userspace input, bail out
+ * before setting the requested taint flags.
+ */
+ if (panic_on_taint_nousertaint && (tmptaint & panic_on_taint))
+ return -EINVAL;
+
+ /*
+ * Poor man's atomic or. Not worth adding a primitive
+ * to everyone's atomic.h for this
+ */
+ for (i = 0; i < TAINT_FLAGS_COUNT; i++)
+ if ((1UL << i) & tmptaint)
+ add_taint(i, LOCKDEP_STILL_OK);
+ }
+
+ return err;
+}
+
+static int sysctl_panic_print_handler(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ panic_print_deprecated();
+ return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
+}
+
+static const struct ctl_table kern_panic_table[] = {
+#ifdef CONFIG_SMP
+ {
+ .procname = "oops_all_cpu_backtrace",
+ .data = &sysctl_oops_all_cpu_backtrace,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+#endif
+ {
+ .procname = "tainted",
+ .maxlen = sizeof(long),
+ .mode = 0644,
+ .proc_handler = proc_taint,
+ },
+ {
+ .procname = "panic",
+ .data = &panic_timeout,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "panic_on_oops",
+ .data = &panic_on_oops,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "panic_print",
+ .data = &panic_print,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = sysctl_panic_print_handler,
+ },
+ {
+ .procname = "panic_on_warn",
+ .data = &panic_on_warn,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
+ .procname = "warn_limit",
+ .data = &warn_limit,
+ .maxlen = sizeof(warn_limit),
+ .mode = 0644,
+ .proc_handler = proc_douintvec,
+ },
+#if (defined(CONFIG_X86_32) || defined(CONFIG_PARISC)) && \
+ defined(CONFIG_DEBUG_STACKOVERFLOW)
+ {
+ .procname = "panic_on_stackoverflow",
+ .data = &sysctl_panic_on_stackoverflow,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
+ {
+ .procname = "panic_sys_info",
+ .data = &panic_print,
+ .maxlen = sizeof(panic_print),
+ .mode = 0644,
+ .proc_handler = sysctl_sys_info_handler,
+ },
+};
+
+static __init int kernel_panic_sysctls_init(void)
+{
+ register_sysctl_init("kernel", kern_panic_table);
+ return 0;
+}
+late_initcall(kernel_panic_sysctls_init);
+#endif
+
+/* The format is "panic_sys_info=tasks,mem,locks,ftrace,..." */
+static int __init setup_panic_sys_info(char *buf)
+{
+ /* There is no risk of race in kernel boot phase */
+ panic_print = sys_info_parse_param(buf);
+ return 1;
+}
+__setup("panic_sys_info=", setup_panic_sys_info);
+
+static atomic_t warn_count = ATOMIC_INIT(0);
+
+#ifdef CONFIG_SYSFS
+static ssize_t warn_count_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *page)
+{
+ return sysfs_emit(page, "%d\n", atomic_read(&warn_count));
+}
+
+static struct kobj_attribute warn_count_attr = __ATTR_RO(warn_count);
+
+static __init int kernel_panic_sysfs_init(void)
+{
+ sysfs_add_file_to_group(kernel_kobj, &warn_count_attr.attr, NULL);
+ return 0;
+}
+late_initcall(kernel_panic_sysfs_init);
+#endif
+
static long no_blink(int state)
{
return 0;
@@ -54,35 +257,199 @@ EXPORT_SYMBOL(panic_blink);
/*
* Stop ourself in panic -- architecture code may override this
*/
-void __weak panic_smp_self_stop(void)
+void __weak __noreturn panic_smp_self_stop(void)
{
while (1)
cpu_relax();
}
-/**
- * panic - halt the system
- * @fmt: The text string to print
+/*
+ * Stop ourselves in NMI context if another CPU has already panicked. Arch code
+ * may override this to prepare for crash dumping, e.g. save regs info.
+ */
+void __weak __noreturn nmi_panic_self_stop(struct pt_regs *regs)
+{
+ panic_smp_self_stop();
+}
+
+/*
+ * Stop other CPUs in panic. Architecture dependent code may override this
+ * with more suitable version. For example, if the architecture supports
+ * crash dump, it should save registers of each stopped CPU and disable
+ * per-CPU features such as virtualization extensions.
+ */
+void __weak crash_smp_send_stop(void)
+{
+ static int cpus_stopped;
+
+ /*
+ * This function can be called twice in panic path, but obviously
+ * we execute this only once.
+ */
+ if (cpus_stopped)
+ return;
+
+ /*
+ * Note smp_send_stop is the usual smp shutdown function, which
+ * unfortunately means it may not be hardened to work in a panic
+ * situation.
+ */
+ smp_send_stop();
+ cpus_stopped = 1;
+}
+
+atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID);
+
+bool panic_try_start(void)
+{
+ int old_cpu, this_cpu;
+
+ /*
+ * Only one CPU is allowed to execute the crash_kexec() code as with
+ * panic(). Otherwise parallel calls of panic() and crash_kexec()
+ * may stop each other. To exclude them, we use panic_cpu here too.
+ */
+ old_cpu = PANIC_CPU_INVALID;
+ this_cpu = raw_smp_processor_id();
+
+ return atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu);
+}
+EXPORT_SYMBOL(panic_try_start);
+
+void panic_reset(void)
+{
+ atomic_set(&panic_cpu, PANIC_CPU_INVALID);
+}
+EXPORT_SYMBOL(panic_reset);
+
+bool panic_in_progress(void)
+{
+ return unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID);
+}
+EXPORT_SYMBOL(panic_in_progress);
+
+/* Return true if a panic is in progress on the current CPU. */
+bool panic_on_this_cpu(void)
+{
+ /*
+ * We can use raw_smp_processor_id() here because it is impossible for
+ * the task to be migrated to the panic_cpu, or away from it. If
+ * panic_cpu has already been set, and we're not currently executing on
+ * that CPU, then we never will be.
+ */
+ return unlikely(atomic_read(&panic_cpu) == raw_smp_processor_id());
+}
+EXPORT_SYMBOL(panic_on_this_cpu);
+
+/*
+ * Return true if a panic is in progress on a remote CPU.
*
- * Display a message, then perform cleanups.
+ * On true, the local CPU should immediately release any printing resources
+ * that may be needed by the panic CPU.
+ */
+bool panic_on_other_cpu(void)
+{
+ return (panic_in_progress() && !panic_on_this_cpu());
+}
+EXPORT_SYMBOL(panic_on_other_cpu);
+
+/*
+ * A variant of panic() called from NMI context. We return if we've already
+ * panicked on this CPU. If another CPU already panicked, loop in
+ * nmi_panic_self_stop() which can provide architecture dependent code such
+ * as saving register state for crash dump.
+ */
+void nmi_panic(struct pt_regs *regs, const char *msg)
+{
+ if (panic_try_start())
+ panic("%s", msg);
+ else if (panic_on_other_cpu())
+ nmi_panic_self_stop(regs);
+}
+EXPORT_SYMBOL(nmi_panic);
+
+void check_panic_on_warn(const char *origin)
+{
+ unsigned int limit;
+
+ if (panic_on_warn)
+ panic("%s: panic_on_warn set ...\n", origin);
+
+ limit = READ_ONCE(warn_limit);
+ if (atomic_inc_return(&warn_count) >= limit && limit)
+ panic("%s: system warned too often (kernel.warn_limit is %d)",
+ origin, limit);
+}
+
+static void panic_trigger_all_cpu_backtrace(void)
+{
+ /* Temporary allow non-panic CPUs to write their backtraces. */
+ panic_triggering_all_cpu_backtrace = true;
+
+ if (panic_this_cpu_backtrace_printed)
+ trigger_allbutcpu_cpu_backtrace(raw_smp_processor_id());
+ else
+ trigger_all_cpu_backtrace();
+
+ panic_triggering_all_cpu_backtrace = false;
+}
+
+/*
+ * Helper that triggers the NMI backtrace (if set in panic_print)
+ * and then performs the secondary CPUs shutdown - we cannot have
+ * the NMI backtrace after the CPUs are off!
+ */
+static void panic_other_cpus_shutdown(bool crash_kexec)
+{
+ if (panic_print & SYS_INFO_ALL_BT)
+ panic_trigger_all_cpu_backtrace();
+
+ /*
+ * Note that smp_send_stop() is the usual SMP shutdown function,
+ * which unfortunately may not be hardened to work in a panic
+ * situation. If we want to do crash dump after notifier calls
+ * and kmsg_dump, we will need architecture dependent extra
+ * bits in addition to stopping other CPUs, hence we rely on
+ * crash_smp_send_stop() for that.
+ */
+ if (!crash_kexec)
+ smp_send_stop();
+ else
+ crash_smp_send_stop();
+}
+
+/**
+ * vpanic - halt the system
+ * @fmt: The text string to print
+ * @args: Arguments for the format string
*
- * This function never returns.
+ * Display a message, then perform cleanups. This function never returns.
*/
-void panic(const char *fmt, ...)
+void vpanic(const char *fmt, va_list args)
{
- static DEFINE_SPINLOCK(panic_lock);
static char buf[1024];
- va_list args;
- long i, i_next = 0;
+ long i, i_next = 0, len;
int state = 0;
+ bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers;
+
+ if (panic_on_warn) {
+ /*
+ * This thread may hit another WARN() in the panic path.
+ * Resetting this prevents additional WARN() from panicking the
+ * system on this thread. Other threads are blocked by the
+ * panic_mutex in panic().
+ */
+ panic_on_warn = 0;
+ }
/*
* Disable local interrupts. This will prevent panic_smp_self_stop
* from deadlocking the first cpu that invokes the panic, since
* there is nothing to prevent an interrupt handler (that runs
- * after the panic_lock is acquired) from invoking panic again.
+ * after setting panic_cpu) from invoking panic() again.
*/
local_irq_disable();
+ preempt_disable_notrace();
/*
* It's possible to come here directly from a panic-assertion and
@@ -93,39 +460,57 @@ void panic(const char *fmt, ...)
* multiple parallel invocations of panic, all other CPUs either
* stop themself or will wait until they are stopped by the 1st CPU
* with smp_send_stop().
+ *
+ * cmpxchg success means this is the 1st CPU which comes here,
+ * so go ahead.
+ * `old_cpu == this_cpu' means we came from nmi_panic() which sets
+ * panic_cpu to this CPU. In this case, this is also the 1st CPU.
*/
- if (!spin_trylock(&panic_lock))
+ /* atomic_try_cmpxchg updates old_cpu on failure */
+ if (panic_try_start()) {
+ /* go ahead */
+ } else if (panic_on_other_cpu())
panic_smp_self_stop();
console_verbose();
bust_spinlocks(1);
- va_start(args, fmt);
- vsnprintf(buf, sizeof(buf), fmt, args);
- va_end(args);
+ len = vscnprintf(buf, sizeof(buf), fmt, args);
+
+ if (len && buf[len - 1] == '\n')
+ buf[len - 1] = '\0';
+
pr_emerg("Kernel panic - not syncing: %s\n", buf);
-#ifdef CONFIG_DEBUG_BUGVERBOSE
/*
* Avoid nested stack-dumping if a panic occurs during oops processing
*/
- if (!test_taint(TAINT_DIE) && oops_in_progress <= 1)
+ if (test_taint(TAINT_DIE) || oops_in_progress > 1) {
+ panic_this_cpu_backtrace_printed = true;
+ } else if (IS_ENABLED(CONFIG_DEBUG_BUGVERBOSE)) {
dump_stack();
-#endif
+ panic_this_cpu_backtrace_printed = true;
+ }
+
+ /*
+ * If kgdb is enabled, give it a chance to run before we stop all
+ * the other CPUs or else we won't be able to debug processes left
+ * running on them.
+ */
+ kgdb_panic(buf);
/*
* If we have crashed and we have a crash kernel loaded let it handle
* everything else.
* If we want to run this after calling panic_notifiers, pass
* the "crash_kexec_post_notifiers" option to the kernel.
+ *
+ * Bypass the panic_cpu check and call __crash_kexec directly.
*/
- if (!crash_kexec_post_notifiers)
- crash_kexec(NULL);
+ if (!_crash_kexec_post_notifiers)
+ __crash_kexec(NULL);
- /*
- * Note smp_send_stop is the usual smp shutdown function, which
- * unfortunately means it may not be hardened to work in a panic
- * situation.
- */
- smp_send_stop();
+ panic_other_cpus_shutdown(_crash_kexec_post_notifiers);
+
+ printk_legacy_allow_panic_sync();
/*
* Run any panic handlers, including those that might need to
@@ -133,7 +518,9 @@ void panic(const char *fmt, ...)
*/
atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
- kmsg_dump(KMSG_DUMP_PANIC);
+ sys_info(panic_print);
+
+ kmsg_dump_desc(KMSG_DUMP_PANIC, buf);
/*
* If you doubt kdump always works fine in any situation,
@@ -141,10 +528,28 @@ void panic(const char *fmt, ...)
* panic_notifiers and dumping kmsg before kdump.
* Note: since some panic_notifiers can make crashed kernel
* more unstable, it can increase risks of the kdump failure too.
+ *
+ * Bypass the panic_cpu check and call __crash_kexec directly.
*/
- crash_kexec(NULL);
+ if (_crash_kexec_post_notifiers)
+ __crash_kexec(NULL);
- bust_spinlocks(0);
+ console_unblank();
+
+ /*
+ * We may have ended up stopping the CPU holding the lock (in
+ * smp_send_stop()) while still having some valuable data in the console
+ * buffer. Try to acquire the lock then release it regardless of the
+ * result. The release will also print the buffers out. Locks debug
+ * should be disabled to avoid reporting bad unlock balance when
+ * panic() is not being callled from OOPS.
+ */
+ debug_locks_off();
+ console_flush_on_panic(CONSOLE_FLUSH_PENDING);
+
+ if ((panic_print & SYS_INFO_PANIC_CONSOLE_REPLAY) ||
+ panic_console_replay)
+ console_flush_on_panic(CONSOLE_REPLAY_ALL);
if (!panic_blink)
panic_blink = no_blink;
@@ -154,7 +559,7 @@ void panic(const char *fmt, ...)
* Delay timeout seconds before rebooting the machine.
* We can't use the "normal" timers since we just panicked.
*/
- pr_emerg("Rebooting in %d seconds..", panic_timeout);
+ pr_emerg("Rebooting in %d seconds..\n", panic_timeout);
for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP) {
touch_nmi_watchdog();
@@ -171,6 +576,8 @@ void panic(const char *fmt, ...)
* shutting down. But if there is a chance of
* rebooting the system it will be rebooted.
*/
+ if (panic_reboot_mode != REBOOT_UNDEFINED)
+ reboot_mode = panic_reboot_mode;
emergency_restart();
}
#ifdef __sparc__
@@ -178,18 +585,26 @@ void panic(const char *fmt, ...)
extern int stop_a_enabled;
/* Make sure the user can actually press Stop-A (L1-A) */
stop_a_enabled = 1;
- pr_emerg("Press Stop-A (L1-A) to return to the boot prom\n");
+ pr_emerg("Press Stop-A (L1-A) from sun keyboard or send break\n"
+ "twice on console to return to the boot prom\n");
}
#endif
#if defined(CONFIG_S390)
- {
- unsigned long caller;
-
- caller = (unsigned long)__builtin_return_address(0);
- disabled_wait(caller);
- }
+ disabled_wait();
#endif
- pr_emerg("---[ end Kernel panic - not syncing: %s\n", buf);
+ pr_emerg("---[ end Kernel panic - not syncing: %s ]---\n", buf);
+
+ /* Do not scroll important messages printed above */
+ suppress_printk = 1;
+
+ /*
+ * The final messages may not have been printed if in a context that
+ * defers printing (such as NMI) and irq_work is not available.
+ * Explicitly flush the kernel log buffer one last time.
+ */
+ console_flush_on_panic(CONSOLE_FLUSH_PENDING);
+ nbcon_atomic_flush_unsafe();
+
local_irq_enable();
for (i = 0; ; i += PANIC_TIMER_STEP) {
touch_softlockup_watchdog();
@@ -200,76 +615,118 @@ void panic(const char *fmt, ...)
mdelay(PANIC_TIMER_STEP);
}
}
+EXPORT_SYMBOL(vpanic);
+
+/* Identical to vpanic(), except it takes variadic arguments instead of va_list */
+void panic(const char *fmt, ...)
+{
+ va_list args;
+ va_start(args, fmt);
+ vpanic(fmt, args);
+ va_end(args);
+}
EXPORT_SYMBOL(panic);
+#define TAINT_FLAG(taint, _c_true, _c_false) \
+ [ TAINT_##taint ] = { \
+ .c_true = _c_true, .c_false = _c_false, \
+ .desc = #taint, \
+ }
-struct tnt {
- u8 bit;
- char true;
- char false;
+/*
+ * NOTE: if you modify the taint_flags or TAINT_FLAGS_COUNT,
+ * please also modify tools/debugging/kernel-chktaint and
+ * Documentation/admin-guide/tainted-kernels.rst, including its
+ * small shell script that prints the TAINT_FLAGS_COUNT bits of
+ * /proc/sys/kernel/tainted.
+ */
+const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = {
+ TAINT_FLAG(PROPRIETARY_MODULE, 'P', 'G'),
+ TAINT_FLAG(FORCED_MODULE, 'F', ' '),
+ TAINT_FLAG(CPU_OUT_OF_SPEC, 'S', ' '),
+ TAINT_FLAG(FORCED_RMMOD, 'R', ' '),
+ TAINT_FLAG(MACHINE_CHECK, 'M', ' '),
+ TAINT_FLAG(BAD_PAGE, 'B', ' '),
+ TAINT_FLAG(USER, 'U', ' '),
+ TAINT_FLAG(DIE, 'D', ' '),
+ TAINT_FLAG(OVERRIDDEN_ACPI_TABLE, 'A', ' '),
+ TAINT_FLAG(WARN, 'W', ' '),
+ TAINT_FLAG(CRAP, 'C', ' '),
+ TAINT_FLAG(FIRMWARE_WORKAROUND, 'I', ' '),
+ TAINT_FLAG(OOT_MODULE, 'O', ' '),
+ TAINT_FLAG(UNSIGNED_MODULE, 'E', ' '),
+ TAINT_FLAG(SOFTLOCKUP, 'L', ' '),
+ TAINT_FLAG(LIVEPATCH, 'K', ' '),
+ TAINT_FLAG(AUX, 'X', ' '),
+ TAINT_FLAG(RANDSTRUCT, 'T', ' '),
+ TAINT_FLAG(TEST, 'N', ' '),
+ TAINT_FLAG(FWCTL, 'J', ' '),
};
-static const struct tnt tnts[] = {
- { TAINT_PROPRIETARY_MODULE, 'P', 'G' },
- { TAINT_FORCED_MODULE, 'F', ' ' },
- { TAINT_CPU_OUT_OF_SPEC, 'S', ' ' },
- { TAINT_FORCED_RMMOD, 'R', ' ' },
- { TAINT_MACHINE_CHECK, 'M', ' ' },
- { TAINT_BAD_PAGE, 'B', ' ' },
- { TAINT_USER, 'U', ' ' },
- { TAINT_DIE, 'D', ' ' },
- { TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' },
- { TAINT_WARN, 'W', ' ' },
- { TAINT_CRAP, 'C', ' ' },
- { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' },
- { TAINT_OOT_MODULE, 'O', ' ' },
- { TAINT_UNSIGNED_MODULE, 'E', ' ' },
- { TAINT_SOFTLOCKUP, 'L', ' ' },
- { TAINT_LIVEPATCH, 'K', ' ' },
-};
+#undef TAINT_FLAG
+
+static void print_tainted_seq(struct seq_buf *s, bool verbose)
+{
+ const char *sep = "";
+ int i;
+
+ if (!tainted_mask) {
+ seq_buf_puts(s, "Not tainted");
+ return;
+ }
+
+ seq_buf_printf(s, "Tainted: ");
+ for (i = 0; i < TAINT_FLAGS_COUNT; i++) {
+ const struct taint_flag *t = &taint_flags[i];
+ bool is_set = test_bit(i, &tainted_mask);
+ char c = is_set ? t->c_true : t->c_false;
+
+ if (verbose) {
+ if (is_set) {
+ seq_buf_printf(s, "%s[%c]=%s", sep, c, t->desc);
+ sep = ", ";
+ }
+ } else {
+ seq_buf_putc(s, c);
+ }
+ }
+}
+
+static const char *_print_tainted(bool verbose)
+{
+ /* FIXME: what should the size be? */
+ static char buf[sizeof(taint_flags)];
+ struct seq_buf s;
+
+ BUILD_BUG_ON(ARRAY_SIZE(taint_flags) != TAINT_FLAGS_COUNT);
+
+ seq_buf_init(&s, buf, sizeof(buf));
+
+ print_tainted_seq(&s, verbose);
+
+ return seq_buf_str(&s);
+}
/**
- * print_tainted - return a string to represent the kernel taint state.
+ * print_tainted - return a string to represent the kernel taint state.
*
- * 'P' - Proprietary module has been loaded.
- * 'F' - Module has been forcibly loaded.
- * 'S' - SMP with CPUs not designed for SMP.
- * 'R' - User forced a module unload.
- * 'M' - System experienced a machine check exception.
- * 'B' - System has hit bad_page.
- * 'U' - Userspace-defined naughtiness.
- * 'D' - Kernel has oopsed before
- * 'A' - ACPI table overridden.
- * 'W' - Taint on warning.
- * 'C' - modules from drivers/staging are loaded.
- * 'I' - Working around severe firmware bug.
- * 'O' - Out-of-tree module has been loaded.
- * 'E' - Unsigned module has been loaded.
- * 'L' - A soft lockup has previously occurred.
- * 'K' - Kernel has been live patched.
+ * For individual taint flag meanings, see Documentation/admin-guide/sysctl/kernel.rst
*
- * The string is overwritten by the next call to print_tainted().
+ * The string is overwritten by the next call to print_tainted(),
+ * but is always NULL terminated.
*/
const char *print_tainted(void)
{
- static char buf[ARRAY_SIZE(tnts) + sizeof("Tainted: ")];
-
- if (tainted_mask) {
- char *s;
- int i;
-
- s = buf + sprintf(buf, "Tainted: ");
- for (i = 0; i < ARRAY_SIZE(tnts); i++) {
- const struct tnt *t = &tnts[i];
- *s++ = test_bit(t->bit, &tainted_mask) ?
- t->true : t->false;
- }
- *s = 0;
- } else
- snprintf(buf, sizeof(buf), "Not tainted");
+ return _print_tainted(false);
+}
- return buf;
+/**
+ * print_tainted_verbose - A more verbose version of print_tainted()
+ */
+const char *print_tainted_verbose(void)
+{
+ return _print_tainted(true);
}
int test_taint(unsigned flag)
@@ -297,6 +754,11 @@ void add_taint(unsigned flag, enum lockdep_ok lockdep_ok)
pr_warn("Disabling lock debugging due to kernel taint\n");
set_bit(flag, &tainted_mask);
+
+ if (tainted_mask & panic_on_taint) {
+ panic_on_taint = 0;
+ panic("panic_on_taint set ...");
+ }
}
EXPORT_SYMBOL(add_taint);
@@ -353,7 +815,7 @@ static void do_oops_enter_exit(void)
* Return true if the calling CPU is allowed to print oops-related info.
* This is a bit racy..
*/
-int oops_may_print(void)
+bool oops_may_print(void)
{
return pause_on_oops_flag == 0;
}
@@ -374,32 +836,19 @@ int oops_may_print(void)
*/
void oops_enter(void)
{
+ nbcon_cpu_emergency_enter();
tracing_off();
/* can't trust the integrity of the kernel anymore: */
debug_locks_off();
do_oops_enter_exit();
-}
-
-/*
- * 64-bit random ID for oopses:
- */
-static u64 oops_id;
-static int init_oops_id(void)
-{
- if (!oops_id)
- get_random_bytes(&oops_id, sizeof(oops_id));
- else
- oops_id++;
-
- return 0;
+ if (sysctl_oops_all_cpu_backtrace)
+ trigger_all_cpu_backtrace();
}
-late_initcall(init_oops_id);
-void print_oops_end_marker(void)
+static void print_oops_end_marker(void)
{
- init_oops_id();
- pr_warn("---[ end trace %016llx ]---\n", (unsigned long long)oops_id);
+ pr_warn("---[ end trace %016llx ]---\n", 0ULL);
}
/*
@@ -410,88 +859,142 @@ void oops_exit(void)
{
do_oops_enter_exit();
print_oops_end_marker();
+ nbcon_cpu_emergency_exit();
kmsg_dump(KMSG_DUMP_OOPS);
}
-#ifdef WANT_WARN_ON_SLOWPATH
-struct slowpath_args {
+struct warn_args {
const char *fmt;
va_list args;
};
-static void warn_slowpath_common(const char *file, int line, void *caller,
- unsigned taint, struct slowpath_args *args)
+void __warn(const char *file, int line, void *caller, unsigned taint,
+ struct pt_regs *regs, struct warn_args *args)
{
+ nbcon_cpu_emergency_enter();
+
disable_trace_on_warning();
- pr_warn("------------[ cut here ]------------\n");
- pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS()\n",
- raw_smp_processor_id(), current->pid, file, line, caller);
+ if (file) {
+ pr_warn("WARNING: %s:%d at %pS, CPU#%d: %s/%d\n",
+ file, line, caller,
+ raw_smp_processor_id(), current->comm, current->pid);
+ } else {
+ pr_warn("WARNING: at %pS, CPU#%d: %s/%d\n",
+ caller,
+ raw_smp_processor_id(), current->comm, current->pid);
+ }
+#pragma GCC diagnostic push
+#ifndef __clang__
+#pragma GCC diagnostic ignored "-Wsuggest-attribute=format"
+#endif
if (args)
vprintk(args->fmt, args->args);
-
- if (panic_on_warn) {
- /*
- * This thread may hit another WARN() in the panic path.
- * Resetting this prevents additional WARN() from panicking the
- * system on this thread. Other threads are blocked by the
- * panic_mutex in panic().
- */
- panic_on_warn = 0;
- panic("panic_on_warn set ...\n");
- }
+#pragma GCC diagnostic pop
print_modules();
- dump_stack();
+
+ if (regs)
+ show_regs(regs);
+
+ check_panic_on_warn("kernel");
+
+ if (!regs)
+ dump_stack();
+
+ print_irqtrace_events(current);
+
print_oops_end_marker();
+ trace_error_report_end(ERROR_DETECTOR_WARN, (unsigned long)caller);
+
/* Just a warning, don't kill lockdep. */
add_taint(taint, LOCKDEP_STILL_OK);
+
+ nbcon_cpu_emergency_exit();
}
-void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...)
+#ifdef CONFIG_BUG
+#ifndef __WARN_FLAGS
+void warn_slowpath_fmt(const char *file, int line, unsigned taint,
+ const char *fmt, ...)
{
- struct slowpath_args args;
+ bool rcu = warn_rcu_enter();
+ struct warn_args args;
+
+ pr_warn(CUT_HERE);
+
+ if (!fmt) {
+ __warn(file, line, __builtin_return_address(0), taint,
+ NULL, NULL);
+ warn_rcu_exit(rcu);
+ return;
+ }
args.fmt = fmt;
va_start(args.args, fmt);
- warn_slowpath_common(file, line, __builtin_return_address(0),
- TAINT_WARN, &args);
+ __warn(file, line, __builtin_return_address(0), taint, NULL, &args);
va_end(args.args);
+ warn_rcu_exit(rcu);
}
EXPORT_SYMBOL(warn_slowpath_fmt);
-
-void warn_slowpath_fmt_taint(const char *file, int line,
- unsigned taint, const char *fmt, ...)
+#else
+void __warn_printk(const char *fmt, ...)
{
- struct slowpath_args args;
+ bool rcu = warn_rcu_enter();
+ va_list args;
- args.fmt = fmt;
- va_start(args.args, fmt);
- warn_slowpath_common(file, line, __builtin_return_address(0),
- taint, &args);
- va_end(args.args);
+ pr_warn(CUT_HERE);
+
+ va_start(args, fmt);
+ vprintk(fmt, args);
+ va_end(args);
+ warn_rcu_exit(rcu);
}
-EXPORT_SYMBOL(warn_slowpath_fmt_taint);
+EXPORT_SYMBOL(__warn_printk);
+#endif
+
+/* Support resetting WARN*_ONCE state */
+
+static int clear_warn_once_set(void *data, u64 val)
+{
+ generic_bug_clear_once();
+ memset(__start_once, 0, __end_once - __start_once);
+ return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(clear_warn_once_fops, NULL, clear_warn_once_set,
+ "%lld\n");
-void warn_slowpath_null(const char *file, int line)
+static __init int register_warn_debugfs(void)
{
- warn_slowpath_common(file, line, __builtin_return_address(0),
- TAINT_WARN, NULL);
+ /* Don't care about failure */
+ debugfs_create_file_unsafe("clear_warn_once", 0200, NULL, NULL,
+ &clear_warn_once_fops);
+ return 0;
}
-EXPORT_SYMBOL(warn_slowpath_null);
+
+device_initcall(register_warn_debugfs);
#endif
-#ifdef CONFIG_CC_STACKPROTECTOR
+#ifdef CONFIG_STACKPROTECTOR
/*
* Called when gcc's -fstack-protector feature is used, and
* gcc detects corruption of the on-stack canary value
*/
-__visible void __stack_chk_fail(void)
+__visible noinstr void __stack_chk_fail(void)
{
- panic("stack-protector: Kernel stack is corrupted in: %p\n",
+ unsigned long flags;
+
+ instrumentation_begin();
+ flags = user_access_save();
+
+ panic("stack-protector: Kernel stack is corrupted in: %pB",
__builtin_return_address(0));
+
+ user_access_restore(flags);
+ instrumentation_end();
}
EXPORT_SYMBOL(__stack_chk_fail);
@@ -500,13 +1003,26 @@ EXPORT_SYMBOL(__stack_chk_fail);
core_param(panic, panic_timeout, int, 0644);
core_param(pause_on_oops, pause_on_oops, int, 0644);
core_param(panic_on_warn, panic_on_warn, int, 0644);
+core_param(crash_kexec_post_notifiers, crash_kexec_post_notifiers, bool, 0644);
+core_param(panic_console_replay, panic_console_replay, bool, 0644);
-static int __init setup_crash_kexec_post_notifiers(char *s)
+static int panic_print_set(const char *val, const struct kernel_param *kp)
{
- crash_kexec_post_notifiers = true;
- return 0;
+ panic_print_deprecated();
+ return param_set_ulong(val, kp);
+}
+
+static int panic_print_get(char *val, const struct kernel_param *kp)
+{
+ panic_print_deprecated();
+ return param_get_ulong(val, kp);
}
-early_param("crash_kexec_post_notifiers", setup_crash_kexec_post_notifiers);
+
+static const struct kernel_param_ops panic_print_ops = {
+ .set = panic_print_set,
+ .get = panic_print_get,
+};
+__core_param_cb(panic_print, &panic_print_ops, &panic_print, 0644);
static int __init oops_setup(char *s)
{
@@ -517,3 +1033,30 @@ static int __init oops_setup(char *s)
return 0;
}
early_param("oops", oops_setup);
+
+static int __init panic_on_taint_setup(char *s)
+{
+ char *taint_str;
+
+ if (!s)
+ return -EINVAL;
+
+ taint_str = strsep(&s, ",");
+ if (kstrtoul(taint_str, 16, &panic_on_taint))
+ return -EINVAL;
+
+ /* make sure panic_on_taint doesn't hold out-of-range TAINT flags */
+ panic_on_taint &= TAINT_FLAGS_MAX;
+
+ if (!panic_on_taint)
+ return -EINVAL;
+
+ if (s && !strcmp(s, "nousertaint"))
+ panic_on_taint_nousertaint = true;
+
+ pr_info("panic_on_taint: bitmask=0x%lx nousertaint_mode=%s\n",
+ panic_on_taint, str_enabled_disabled(panic_on_taint_nousertaint));
+
+ return 0;
+}
+early_param("panic_on_taint", panic_on_taint_setup);
diff --git a/kernel/params.c b/kernel/params.c
index a22d6a759b1a..b96cfd693c99 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -1,49 +1,62 @@
-/* Helpers for initial module or kernel cmdline parsing
- Copyright (C) 2001 Rusty Russell.
-
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-*/
-#include <linux/kernel.h>
-#include <linux/string.h>
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Helpers for initial module or kernel cmdline parsing
+ * Copyright (C) 2001 Rusty Russell.
+ */
+#include <linux/ctype.h>
+#include <linux/device.h>
+#include <linux/err.h>
#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/kstrtox.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
-#include <linux/device.h>
-#include <linux/err.h>
+#include <linux/overflow.h>
+#include <linux/security.h>
#include <linux/slab.h>
-#include <linux/ctype.h>
+#include <linux/string.h>
-/* Protects all parameters, and incidentally kmalloced_param list. */
+#ifdef CONFIG_SYSFS
+/* Protects all built-in parameters, modules use their own param_lock */
static DEFINE_MUTEX(param_lock);
+/* Use the module's mutex, or if built-in use the built-in mutex */
+#ifdef CONFIG_MODULES
+#define KPARAM_MUTEX(mod) ((mod) ? &(mod)->param_lock : &param_lock)
+#else
+#define KPARAM_MUTEX(mod) (&param_lock)
+#endif
+
+static inline void check_kparam_locked(struct module *mod)
+{
+ BUG_ON(!mutex_is_locked(KPARAM_MUTEX(mod)));
+}
+#else
+static inline void check_kparam_locked(struct module *mod)
+{
+}
+#endif /* !CONFIG_SYSFS */
+
/* This just allows us to keep track of which parameters are kmalloced. */
struct kmalloced_param {
struct list_head list;
char val[];
};
static LIST_HEAD(kmalloced_params);
+static DEFINE_SPINLOCK(kmalloced_params_lock);
static void *kmalloc_parameter(unsigned int size)
{
struct kmalloced_param *p;
- p = kmalloc(sizeof(*p) + size, GFP_KERNEL);
+ p = kmalloc(size_add(sizeof(*p), size), GFP_KERNEL);
if (!p)
return NULL;
+ spin_lock(&kmalloced_params_lock);
list_add(&p->list, &kmalloced_params);
+ spin_unlock(&kmalloced_params_lock);
+
return p->val;
}
@@ -52,6 +65,7 @@ static void maybe_kfree_parameter(void *param)
{
struct kmalloced_param *p;
+ spin_lock(&kmalloced_params_lock);
list_for_each_entry(p, &kmalloced_params, list) {
if (p->val == param) {
list_del(&p->list);
@@ -59,6 +73,7 @@ static void maybe_kfree_parameter(void *param)
break;
}
}
+ spin_unlock(&kmalloced_params_lock);
}
static char dash2underscore(char c)
@@ -84,13 +99,19 @@ bool parameq(const char *a, const char *b)
return parameqn(a, b, strlen(a)+1);
}
-static void param_check_unsafe(const struct kernel_param *kp)
+static bool param_check_unsafe(const struct kernel_param *kp)
{
+ if (kp->flags & KERNEL_PARAM_FL_HWPARAM &&
+ security_locked_down(LOCKDOWN_MODULE_PARAMETERS))
+ return false;
+
if (kp->flags & KERNEL_PARAM_FL_UNSAFE) {
- pr_warn("Setting dangerous option %s - tainting kernel\n",
- kp->name);
+ pr_notice("Setting dangerous option %s - tainting kernel\n",
+ kp->name);
add_taint(TAINT_USER, LOCKDEP_STILL_OK);
}
+
+ return true;
}
static int parse_one(char *param,
@@ -100,8 +121,7 @@ static int parse_one(char *param,
unsigned num_params,
s16 min_level,
s16 max_level,
- int (*handle_unknown)(char *param, char *val,
- const char *doing))
+ void *arg, parse_unknown_fn handle_unknown)
{
unsigned int i;
int err;
@@ -118,75 +138,25 @@ static int parse_one(char *param,
return -EINVAL;
pr_debug("handling %s with %p\n", param,
params[i].ops->set);
- mutex_lock(&param_lock);
- param_check_unsafe(&params[i]);
- err = params[i].ops->set(val, &params[i]);
- mutex_unlock(&param_lock);
+ kernel_param_lock(params[i].mod);
+ if (param_check_unsafe(&params[i]))
+ err = params[i].ops->set(val, &params[i]);
+ else
+ err = -EPERM;
+ kernel_param_unlock(params[i].mod);
return err;
}
}
if (handle_unknown) {
pr_debug("doing %s: %s='%s'\n", doing, param, val);
- return handle_unknown(param, val, doing);
+ return handle_unknown(param, val, doing, arg);
}
pr_debug("Unknown argument '%s'\n", param);
return -ENOENT;
}
-/* You can use " around spaces, but can't escape ". */
-/* Hyphens and underscores equivalent in parameter names. */
-static char *next_arg(char *args, char **param, char **val)
-{
- unsigned int i, equals = 0;
- int in_quote = 0, quoted = 0;
- char *next;
-
- if (*args == '"') {
- args++;
- in_quote = 1;
- quoted = 1;
- }
-
- for (i = 0; args[i]; i++) {
- if (isspace(args[i]) && !in_quote)
- break;
- if (equals == 0) {
- if (args[i] == '=')
- equals = i;
- }
- if (args[i] == '"')
- in_quote = !in_quote;
- }
-
- *param = args;
- if (!equals)
- *val = NULL;
- else {
- args[equals] = '\0';
- *val = args + equals + 1;
-
- /* Don't include quotes in value. */
- if (**val == '"') {
- (*val)++;
- if (args[i-1] == '"')
- args[i-1] = '\0';
- }
- }
- if (quoted && args[i-1] == '"')
- args[i-1] = '\0';
-
- if (args[i]) {
- args[i] = '\0';
- next = args + i + 1;
- } else
- next = args + i;
-
- /* Chew up trailing spaces. */
- return skip_spaces(next);
-}
-
/* Args looks like "foo=bar,bar2 baz=fuz wiz". */
char *parse_args(const char *doing,
char *args,
@@ -194,9 +164,9 @@ char *parse_args(const char *doing,
unsigned num,
s16 min_level,
s16 max_level,
- int (*unknown)(char *param, char *val, const char *doing))
+ void *arg, parse_unknown_fn unknown)
{
- char *param, *val;
+ char *param, *val, *err = NULL;
/* Chew leading spaces */
args = skip_spaces(args);
@@ -211,33 +181,34 @@ char *parse_args(const char *doing,
args = next_arg(args, &param, &val);
/* Stop at -- */
if (!val && strcmp(param, "--") == 0)
- return args;
+ return err ?: args;
irq_was_disabled = irqs_disabled();
ret = parse_one(param, val, doing, params, num,
- min_level, max_level, unknown);
+ min_level, max_level, arg, unknown);
if (irq_was_disabled && !irqs_disabled())
pr_warn("%s: option '%s' enabled irq's!\n",
doing, param);
switch (ret) {
+ case 0:
+ continue;
case -ENOENT:
pr_err("%s: Unknown parameter `%s'\n", doing, param);
- return ERR_PTR(ret);
+ break;
case -ENOSPC:
pr_err("%s: `%s' too large for parameter `%s'\n",
doing, val ?: "", param);
- return ERR_PTR(ret);
- case 0:
break;
default:
pr_err("%s: `%s' invalid for parameter `%s'\n",
doing, val ?: "", param);
- return ERR_PTR(ret);
+ break;
}
+
+ err = ERR_PTR(ret);
}
- /* All parsed OK. */
- return NULL;
+ return err;
}
/* Lazy bastard, eh? */
@@ -248,10 +219,10 @@ char *parse_args(const char *doing,
} \
int param_get_##name(char *buffer, const struct kernel_param *kp) \
{ \
- return scnprintf(buffer, PAGE_SIZE, format, \
+ return scnprintf(buffer, PAGE_SIZE, format "\n", \
*((type *)kp->arg)); \
} \
- struct kernel_param_ops param_ops_##name = { \
+ const struct kernel_param_ops param_ops_##name = { \
.set = param_set_##name, \
.get = param_get_##name, \
}; \
@@ -260,28 +231,52 @@ char *parse_args(const char *doing,
EXPORT_SYMBOL(param_ops_##name)
-STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", kstrtou8);
-STANDARD_PARAM_DEF(short, short, "%hi", kstrtos16);
-STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", kstrtou16);
-STANDARD_PARAM_DEF(int, int, "%i", kstrtoint);
-STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint);
-STANDARD_PARAM_DEF(long, long, "%li", kstrtol);
-STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul);
-STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull);
+STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", kstrtou8);
+STANDARD_PARAM_DEF(short, short, "%hi", kstrtos16);
+STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", kstrtou16);
+STANDARD_PARAM_DEF(int, int, "%i", kstrtoint);
+STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint);
+STANDARD_PARAM_DEF(long, long, "%li", kstrtol);
+STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul);
+STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull);
+STANDARD_PARAM_DEF(hexint, unsigned int, "%#08x", kstrtouint);
+
+int param_set_uint_minmax(const char *val, const struct kernel_param *kp,
+ unsigned int min, unsigned int max)
+{
+ unsigned int num;
+ int ret;
+
+ if (!val)
+ return -EINVAL;
+ ret = kstrtouint(val, 0, &num);
+ if (ret)
+ return ret;
+ if (num < min || num > max)
+ return -EINVAL;
+ *((unsigned int *)kp->arg) = num;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(param_set_uint_minmax);
int param_set_charp(const char *val, const struct kernel_param *kp)
{
- if (strlen(val) > 1024) {
+ size_t len, maxlen = 1024;
+
+ len = strnlen(val, maxlen + 1);
+ if (len == maxlen + 1) {
pr_err("%s: string parameter too long\n", kp->name);
return -ENOSPC;
}
maybe_kfree_parameter(*(char **)kp->arg);
- /* This is a hack. We can't kmalloc in early boot, and we
- * don't need to; this mangled commandline is preserved. */
+ /*
+ * This is a hack. We can't kmalloc() in early boot, and we
+ * don't need to; this mangled commandline is preserved.
+ */
if (slab_is_available()) {
- *(char **)kp->arg = kmalloc_parameter(strlen(val)+1);
+ *(char **)kp->arg = kmalloc_parameter(len + 1);
if (!*(char **)kp->arg)
return -ENOMEM;
strcpy(*(char **)kp->arg, val);
@@ -294,16 +289,17 @@ EXPORT_SYMBOL(param_set_charp);
int param_get_charp(char *buffer, const struct kernel_param *kp)
{
- return scnprintf(buffer, PAGE_SIZE, "%s", *((char **)kp->arg));
+ return scnprintf(buffer, PAGE_SIZE, "%s\n", *((char **)kp->arg));
}
EXPORT_SYMBOL(param_get_charp);
-static void param_free_charp(void *arg)
+void param_free_charp(void *arg)
{
maybe_kfree_parameter(*((char **)arg));
}
+EXPORT_SYMBOL(param_free_charp);
-struct kernel_param_ops param_ops_charp = {
+const struct kernel_param_ops param_ops_charp = {
.set = param_set_charp,
.get = param_get_charp,
.free = param_free_charp,
@@ -317,24 +313,55 @@ int param_set_bool(const char *val, const struct kernel_param *kp)
if (!val) val = "1";
/* One of =[yYnN01] */
- return strtobool(val, kp->arg);
+ return kstrtobool(val, kp->arg);
}
EXPORT_SYMBOL(param_set_bool);
int param_get_bool(char *buffer, const struct kernel_param *kp)
{
/* Y and N chosen as being relatively non-coder friendly */
- return sprintf(buffer, "%c", *(bool *)kp->arg ? 'Y' : 'N');
+ return sprintf(buffer, "%c\n", *(bool *)kp->arg ? 'Y' : 'N');
}
EXPORT_SYMBOL(param_get_bool);
-struct kernel_param_ops param_ops_bool = {
+const struct kernel_param_ops param_ops_bool = {
.flags = KERNEL_PARAM_OPS_FL_NOARG,
.set = param_set_bool,
.get = param_get_bool,
};
EXPORT_SYMBOL(param_ops_bool);
+int param_set_bool_enable_only(const char *val, const struct kernel_param *kp)
+{
+ int err;
+ bool new_value;
+ bool orig_value = *(bool *)kp->arg;
+ struct kernel_param dummy_kp = *kp;
+
+ dummy_kp.arg = &new_value;
+
+ err = param_set_bool(val, &dummy_kp);
+ if (err)
+ return err;
+
+ /* Don't let them unset it once it's set! */
+ if (!new_value && orig_value)
+ return -EROFS;
+
+ if (new_value)
+ err = param_set_bool(val, kp);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(param_set_bool_enable_only);
+
+const struct kernel_param_ops param_ops_bool_enable_only = {
+ .flags = KERNEL_PARAM_OPS_FL_NOARG,
+ .set = param_set_bool_enable_only,
+ .get = param_get_bool,
+};
+EXPORT_SYMBOL_GPL(param_ops_bool_enable_only);
+
/* This one must be bool. */
int param_set_invbool(const char *val, const struct kernel_param *kp)
{
@@ -352,11 +379,11 @@ EXPORT_SYMBOL(param_set_invbool);
int param_get_invbool(char *buffer, const struct kernel_param *kp)
{
- return sprintf(buffer, "%c", (*(bool *)kp->arg) ? 'N' : 'Y');
+ return sprintf(buffer, "%c\n", (*(bool *)kp->arg) ? 'N' : 'Y');
}
EXPORT_SYMBOL(param_get_invbool);
-struct kernel_param_ops param_ops_invbool = {
+const struct kernel_param_ops param_ops_invbool = {
.set = param_set_invbool,
.get = param_get_invbool,
};
@@ -364,12 +391,11 @@ EXPORT_SYMBOL(param_ops_invbool);
int param_set_bint(const char *val, const struct kernel_param *kp)
{
- struct kernel_param boolkp;
+ /* Match bool exactly, by re-using it. */
+ struct kernel_param boolkp = *kp;
bool v;
int ret;
- /* Match bool exactly, by re-using it. */
- boolkp = *kp;
boolkp.arg = &v;
ret = param_set_bool(val, &boolkp);
@@ -379,7 +405,7 @@ int param_set_bint(const char *val, const struct kernel_param *kp)
}
EXPORT_SYMBOL(param_set_bint);
-struct kernel_param_ops param_ops_bint = {
+const struct kernel_param_ops param_ops_bint = {
.flags = KERNEL_PARAM_OPS_FL_NOARG,
.set = param_set_bint,
.get = param_get_int,
@@ -387,7 +413,8 @@ struct kernel_param_ops param_ops_bint = {
EXPORT_SYMBOL(param_ops_bint);
/* We break the rule and mangle the string. */
-static int param_array(const char *name,
+static int param_array(struct module *mod,
+ const char *name,
const char *val,
unsigned int min, unsigned int max,
void *elem, int elemsize,
@@ -418,7 +445,7 @@ static int param_array(const char *name,
/* nul-terminate and parse */
save = val[len];
((char *)val)[len] = '\0';
- BUG_ON(!mutex_is_locked(&param_lock));
+ check_kparam_locked(mod);
ret = set(val, &kp);
if (ret != 0)
@@ -440,7 +467,7 @@ static int param_array_set(const char *val, const struct kernel_param *kp)
const struct kparam_array *arr = kp->arr;
unsigned int temp_num;
- return param_array(kp->name, val, 1, arr->max, arr->elem,
+ return param_array(kp->mod, kp->name, val, 1, arr->max, arr->elem,
arr->elemsize, arr->ops->set, kp->level,
arr->num ?: &temp_num);
}
@@ -449,14 +476,14 @@ static int param_array_get(char *buffer, const struct kernel_param *kp)
{
int i, off, ret;
const struct kparam_array *arr = kp->arr;
- struct kernel_param p;
+ struct kernel_param p = *kp;
- p = *kp;
for (i = off = 0; i < (arr->num ? *arr->num : arr->max); i++) {
+ /* Replace \n with comma */
if (i)
- buffer[off++] = ',';
+ buffer[off - 1] = ',';
p.arg = arr->elem + arr->elemsize * i;
- BUG_ON(!mutex_is_locked(&param_lock));
+ check_kparam_locked(p.mod);
ret = arr->ops->get(buffer + off, &p);
if (ret < 0)
return ret;
@@ -476,7 +503,7 @@ static void param_array_free(void *arg)
arr->ops->free(arr->elem + arr->elemsize * i);
}
-struct kernel_param_ops param_array_ops = {
+const struct kernel_param_ops param_array_ops = {
.set = param_array_set,
.get = param_array_get,
.free = param_array_free,
@@ -486,13 +513,14 @@ EXPORT_SYMBOL(param_array_ops);
int param_set_copystring(const char *val, const struct kernel_param *kp)
{
const struct kparam_string *kps = kp->str;
+ const size_t len = strnlen(val, kps->maxlen);
- if (strlen(val)+1 > kps->maxlen) {
+ if (len == kps->maxlen) {
pr_err("%s: string doesn't fit in %u chars.\n",
kp->name, kps->maxlen-1);
return -ENOSPC;
}
- strcpy(kps->string, val);
+ memcpy(kps->string, val, len + 1);
return 0;
}
EXPORT_SYMBOL(param_set_copystring);
@@ -500,18 +528,18 @@ EXPORT_SYMBOL(param_set_copystring);
int param_get_string(char *buffer, const struct kernel_param *kp)
{
const struct kparam_string *kps = kp->str;
- return strlcpy(buffer, kps->string, kps->maxlen);
+ return scnprintf(buffer, PAGE_SIZE, "%s\n", kps->string);
}
EXPORT_SYMBOL(param_get_string);
-struct kernel_param_ops param_ops_string = {
+const struct kernel_param_ops param_ops_string = {
.set = param_set_copystring,
.get = param_get_string,
};
EXPORT_SYMBOL(param_ops_string);
/* sysfs output in /sys/modules/XYZ/parameters/ */
-#define to_module_attr(n) container_of(n, struct module_attribute, attr)
+#define to_module_attr(n) container_of_const(n, struct module_attribute, attr)
#define to_module_kobject(n) container_of(n, struct module_kobject, kobj)
struct param_attribute
@@ -524,46 +552,44 @@ struct module_param_attrs
{
unsigned int num;
struct attribute_group grp;
- struct param_attribute attrs[0];
+ struct param_attribute attrs[] __counted_by(num);
};
#ifdef CONFIG_SYSFS
-#define to_param_attr(n) container_of(n, struct param_attribute, mattr)
+#define to_param_attr(n) container_of_const(n, struct param_attribute, mattr)
-static ssize_t param_attr_show(struct module_attribute *mattr,
+static ssize_t param_attr_show(const struct module_attribute *mattr,
struct module_kobject *mk, char *buf)
{
int count;
- struct param_attribute *attribute = to_param_attr(mattr);
+ const struct param_attribute *attribute = to_param_attr(mattr);
if (!attribute->param->ops->get)
return -EPERM;
- mutex_lock(&param_lock);
+ kernel_param_lock(mk->mod);
count = attribute->param->ops->get(buf, attribute->param);
- mutex_unlock(&param_lock);
- if (count > 0) {
- strcat(buf, "\n");
- ++count;
- }
+ kernel_param_unlock(mk->mod);
return count;
}
/* sysfs always hands a nul-terminated string in buf. We rely on that. */
-static ssize_t param_attr_store(struct module_attribute *mattr,
- struct module_kobject *km,
+static ssize_t param_attr_store(const struct module_attribute *mattr,
+ struct module_kobject *mk,
const char *buf, size_t len)
{
int err;
- struct param_attribute *attribute = to_param_attr(mattr);
+ const struct param_attribute *attribute = to_param_attr(mattr);
if (!attribute->param->ops->set)
return -EPERM;
- mutex_lock(&param_lock);
- param_check_unsafe(attribute->param);
- err = attribute->param->ops->set(buf, attribute->param);
- mutex_unlock(&param_lock);
+ kernel_param_lock(mk->mod);
+ if (param_check_unsafe(attribute->param))
+ err = attribute->param->ops->set(buf, attribute->param);
+ else
+ err = -EPERM;
+ kernel_param_unlock(mk->mod);
if (!err)
return len;
return err;
@@ -577,22 +603,23 @@ static ssize_t param_attr_store(struct module_attribute *mattr,
#endif
#ifdef CONFIG_SYSFS
-void __kernel_param_lock(void)
+void kernel_param_lock(struct module *mod)
{
- mutex_lock(&param_lock);
+ mutex_lock(KPARAM_MUTEX(mod));
}
-EXPORT_SYMBOL(__kernel_param_lock);
-void __kernel_param_unlock(void)
+void kernel_param_unlock(struct module *mod)
{
- mutex_unlock(&param_lock);
+ mutex_unlock(KPARAM_MUTEX(mod));
}
-EXPORT_SYMBOL(__kernel_param_unlock);
+
+EXPORT_SYMBOL(kernel_param_lock);
+EXPORT_SYMBOL(kernel_param_unlock);
/*
* add_sysfs_param - add a parameter to sysfs
* @mk: struct module_kobject
- * @kparam: the actual parameter definition to add to sysfs
+ * @kp: the actual parameter definition to add to sysfs
* @name: name of parameter
*
* Create a kobject if for a (per-module) parameter if mp NULL, and
@@ -625,35 +652,32 @@ static __modinit int add_sysfs_param(struct module_kobject *mk,
}
/* Enlarge allocations. */
- new_mp = krealloc(mk->mp,
- sizeof(*mk->mp) +
- sizeof(mk->mp->attrs[0]) * (mk->mp->num + 1),
+ new_mp = krealloc(mk->mp, struct_size(mk->mp, attrs, mk->mp->num + 1),
GFP_KERNEL);
if (!new_mp)
return -ENOMEM;
mk->mp = new_mp;
+ mk->mp->num++;
/* Extra pointer for NULL terminator */
- new_attrs = krealloc(mk->mp->grp.attrs,
- sizeof(mk->mp->grp.attrs[0]) * (mk->mp->num + 2),
- GFP_KERNEL);
+ new_attrs = krealloc_array(mk->mp->grp.attrs, mk->mp->num + 1,
+ sizeof(mk->mp->grp.attrs[0]), GFP_KERNEL);
if (!new_attrs)
return -ENOMEM;
mk->mp->grp.attrs = new_attrs;
/* Tack new one on the end. */
- memset(&mk->mp->attrs[mk->mp->num], 0, sizeof(mk->mp->attrs[0]));
- sysfs_attr_init(&mk->mp->attrs[mk->mp->num].mattr.attr);
- mk->mp->attrs[mk->mp->num].param = kp;
- mk->mp->attrs[mk->mp->num].mattr.show = param_attr_show;
+ memset(&mk->mp->attrs[mk->mp->num - 1], 0, sizeof(mk->mp->attrs[0]));
+ sysfs_attr_init(&mk->mp->attrs[mk->mp->num - 1].mattr.attr);
+ mk->mp->attrs[mk->mp->num - 1].param = kp;
+ mk->mp->attrs[mk->mp->num - 1].mattr.show = param_attr_show;
/* Do not allow runtime DAC changes to make param writable. */
if ((kp->perm & (S_IWUSR | S_IWGRP | S_IWOTH)) != 0)
- mk->mp->attrs[mk->mp->num].mattr.store = param_attr_store;
+ mk->mp->attrs[mk->mp->num - 1].mattr.store = param_attr_store;
else
- mk->mp->attrs[mk->mp->num].mattr.store = NULL;
- mk->mp->attrs[mk->mp->num].mattr.attr.name = (char *)name;
- mk->mp->attrs[mk->mp->num].mattr.attr.mode = kp->perm;
- mk->mp->num++;
+ mk->mp->attrs[mk->mp->num - 1].mattr.store = NULL;
+ mk->mp->attrs[mk->mp->num - 1].mattr.attr.name = (char *)name;
+ mk->mp->attrs[mk->mp->num - 1].mattr.attr.mode = kp->perm;
/* Fix up all the pointers, since krealloc can move us */
for (i = 0; i < mk->mp->num; i++)
@@ -719,8 +743,10 @@ void module_param_sysfs_remove(struct module *mod)
{
if (mod->mkobj.mp) {
sysfs_remove_group(&mod->mkobj.kobj, &mod->mkobj.mp->grp);
- /* We are positive that no one is using any param
- * attrs at this point. Deallocate immediately. */
+ /*
+ * We are positive that no one is using any param
+ * attrs at this point. Deallocate immediately.
+ */
free_module_param_attrs(&mod->mkobj);
}
}
@@ -735,38 +761,35 @@ void destroy_params(const struct kernel_param *params, unsigned num)
params[i].ops->free(params[i].arg);
}
-static struct module_kobject * __init locate_module_kobject(const char *name)
+struct module_kobject __modinit * lookup_or_create_module_kobject(const char *name)
{
struct module_kobject *mk;
struct kobject *kobj;
int err;
kobj = kset_find_obj(module_kset, name);
- if (kobj) {
- mk = to_module_kobject(kobj);
- } else {
- mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL);
- BUG_ON(!mk);
-
- mk->mod = THIS_MODULE;
- mk->kobj.kset = module_kset;
- err = kobject_init_and_add(&mk->kobj, &module_ktype, NULL,
- "%s", name);
-#ifdef CONFIG_MODULES
- if (!err)
- err = sysfs_create_file(&mk->kobj, &module_uevent.attr);
-#endif
- if (err) {
- kobject_put(&mk->kobj);
- pr_crit("Adding module '%s' to sysfs failed (%d), the system may be unstable.\n",
- name, err);
- return NULL;
- }
+ if (kobj)
+ return to_module_kobject(kobj);
+
+ mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL);
+ if (!mk)
+ return NULL;
- /* So that we hold reference in both cases. */
- kobject_get(&mk->kobj);
+ mk->mod = THIS_MODULE;
+ mk->kobj.kset = module_kset;
+ err = kobject_init_and_add(&mk->kobj, &module_ktype, NULL, "%s", name);
+ if (IS_ENABLED(CONFIG_MODULES) && !err)
+ err = sysfs_create_file(&mk->kobj, &module_uevent.attr);
+ if (err) {
+ kobject_put(&mk->kobj);
+ pr_crit("Adding module '%s' to sysfs failed (%d), the system may be unstable.\n",
+ name, err);
+ return NULL;
}
+ /* So that we hold reference in both cases. */
+ kobject_get(&mk->kobj);
+
return mk;
}
@@ -777,7 +800,7 @@ static void __init kernel_add_sysfs_param(const char *name,
struct module_kobject *mk;
int err;
- mk = locate_module_kobject(name);
+ mk = lookup_or_create_module_kobject(name);
if (!mk)
return;
@@ -819,40 +842,39 @@ static void __init param_sysfs_builtin(void)
dot = strchr(kp->name, '.');
if (!dot) {
/* This happens for core_param() */
- strcpy(modname, "kernel");
+ strscpy(modname, "kernel");
name_len = 0;
} else {
name_len = dot - kp->name + 1;
- strlcpy(modname, kp->name, name_len);
+ strscpy(modname, kp->name, name_len);
}
kernel_add_sysfs_param(modname, kp, name_len);
}
}
-ssize_t __modver_version_show(struct module_attribute *mattr,
+ssize_t __modver_version_show(const struct module_attribute *mattr,
struct module_kobject *mk, char *buf)
{
- struct module_version_attribute *vattr =
- container_of(mattr, struct module_version_attribute, mattr);
+ const struct module_version_attribute *vattr =
+ container_of_const(mattr, struct module_version_attribute, mattr);
return scnprintf(buf, PAGE_SIZE, "%s\n", vattr->version);
}
-extern const struct module_version_attribute *__start___modver[];
-extern const struct module_version_attribute *__stop___modver[];
+extern const struct module_version_attribute __start___modver[];
+extern const struct module_version_attribute __stop___modver[];
static void __init version_sysfs_builtin(void)
{
- const struct module_version_attribute **p;
+ const struct module_version_attribute *vattr;
struct module_kobject *mk;
int err;
- for (p = __start___modver; p < __stop___modver; p++) {
- const struct module_version_attribute *vattr = *p;
-
- mk = locate_module_kobject(vattr->module_name);
+ for (vattr = __start___modver; vattr < __stop___modver; vattr++) {
+ mk = lookup_or_create_module_kobject(vattr->module_name);
if (mk) {
err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr);
+ WARN_ON_ONCE(err);
kobject_uevent(&mk->kobj, KOBJ_ADD);
kobject_put(&mk->kobj);
}
@@ -865,7 +887,7 @@ static ssize_t module_attr_show(struct kobject *kobj,
struct attribute *attr,
char *buf)
{
- struct module_attribute *attribute;
+ const struct module_attribute *attribute;
struct module_kobject *mk;
int ret;
@@ -884,7 +906,7 @@ static ssize_t module_attr_store(struct kobject *kobj,
struct attribute *attr,
const char *buf, size_t len)
{
- struct module_attribute *attribute;
+ const struct module_attribute *attribute;
struct module_kobject *mk;
int ret;
@@ -904,9 +926,9 @@ static const struct sysfs_ops module_sysfs_ops = {
.store = module_attr_store,
};
-static int uevent_filter(struct kset *kset, struct kobject *kobj)
+static int uevent_filter(const struct kobject *kobj)
{
- struct kobj_type *ktype = get_ktype(kobj);
+ const struct kobj_type *ktype = get_ktype(kobj);
if (ktype == &module_ktype)
return 1;
@@ -918,21 +940,26 @@ static const struct kset_uevent_ops module_uevent_ops = {
};
struct kset *module_kset;
-int module_sysfs_initialized;
static void module_kobj_release(struct kobject *kobj)
{
struct module_kobject *mk = to_module_kobject(kobj);
- complete(mk->kobj_completion);
+
+ if (mk->kobj_completion)
+ complete(mk->kobj_completion);
}
-struct kobj_type module_ktype = {
+const struct kobj_type module_ktype = {
.release = module_kobj_release,
.sysfs_ops = &module_sysfs_ops,
};
/*
- * param_sysfs_init - wrapper for built-in params support
+ * param_sysfs_init - create "module" kset
+ *
+ * This must be done before the initramfs is unpacked and
+ * request_module() thus becomes possible, because otherwise the
+ * module load would fail in mod_sysfs_init.
*/
static int __init param_sysfs_init(void)
{
@@ -942,13 +969,25 @@ static int __init param_sysfs_init(void)
__FILE__, __LINE__);
return -ENOMEM;
}
- module_sysfs_initialized = 1;
+
+ return 0;
+}
+subsys_initcall(param_sysfs_init);
+
+/*
+ * param_sysfs_builtin_init - add sysfs version and parameter
+ * attributes for built-in modules
+ */
+static int __init param_sysfs_builtin_init(void)
+{
+ if (!module_kset)
+ return -ENOMEM;
version_sysfs_builtin();
param_sysfs_builtin();
return 0;
}
-subsys_initcall(param_sysfs_init);
+late_initcall(param_sysfs_builtin_init);
#endif /* CONFIG_SYSFS */
diff --git a/kernel/pid.c b/kernel/pid.c
index 4fd07d5b7baf..a31771bc89c1 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Generic pidhash and scalable, time-bounded PID allocator
*
@@ -31,35 +32,37 @@
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/rculist.h>
-#include <linux/bootmem.h>
-#include <linux/hash.h>
+#include <linux/memblock.h>
#include <linux/pid_namespace.h>
#include <linux/init_task.h>
#include <linux/syscalls.h>
#include <linux/proc_ns.h>
-#include <linux/proc_fs.h>
-
-#define pid_hashfn(nr, ns) \
- hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
-static struct hlist_head *pid_hash;
-static unsigned int pidhash_shift = 4;
-struct pid init_struct_pid = INIT_STRUCT_PID;
-
-int pid_max = PID_MAX_DEFAULT;
-
-#define RESERVED_PIDS 300
-
-int pid_max_min = RESERVED_PIDS + 1;
-int pid_max_max = PID_MAX_LIMIT;
-
-static inline int mk_pid(struct pid_namespace *pid_ns,
- struct pidmap *map, int off)
-{
- return (map - pid_ns->pidmap)*BITS_PER_PAGE + off;
-}
+#include <linux/refcount.h>
+#include <linux/anon_inodes.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/task.h>
+#include <linux/idr.h>
+#include <linux/pidfs.h>
+#include <linux/seqlock.h>
+#include <net/sock.h>
+#include <uapi/linux/pidfd.h>
+
+struct pid init_struct_pid = {
+ .count = REFCOUNT_INIT(1),
+ .tasks = {
+ { .first = NULL },
+ { .first = NULL },
+ { .first = NULL },
+ },
+ .level = 0,
+ .numbers = { {
+ .nr = 0,
+ .ns = &init_pid_ns,
+ }, }
+};
-#define find_next_offset(map, off) \
- find_next_zero_bit((map)->page, BITS_PER_PAGE, off)
+static int pid_max_min = RESERVED_PIDS + 1;
+static int pid_max_max = PID_MAX_LIMIT;
/*
* PID-map pages start out as NULL, they get allocated upon
@@ -68,171 +71,21 @@ static inline int mk_pid(struct pid_namespace *pid_ns,
* the scheme scales to up to 4 million PIDs, runtime.
*/
struct pid_namespace init_pid_ns = {
- .kref = {
- .refcount = ATOMIC_INIT(2),
- },
- .pidmap = {
- [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL }
- },
- .last_pid = 0,
- .nr_hashed = PIDNS_HASH_ADDING,
+ .ns = NS_COMMON_INIT(init_pid_ns),
+ .idr = IDR_INIT(init_pid_ns.idr),
+ .pid_allocated = PIDNS_ADDING,
.level = 0,
.child_reaper = &init_task,
.user_ns = &init_user_ns,
- .ns.inum = PROC_PID_INIT_INO,
-#ifdef CONFIG_PID_NS
- .ns.ops = &pidns_operations,
+ .pid_max = PID_MAX_DEFAULT,
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
+ .memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC,
#endif
};
EXPORT_SYMBOL_GPL(init_pid_ns);
-/*
- * Note: disable interrupts while the pidmap_lock is held as an
- * interrupt might come in and do read_lock(&tasklist_lock).
- *
- * If we don't disable interrupts there is a nasty deadlock between
- * detach_pid()->free_pid() and another cpu that does
- * spin_lock(&pidmap_lock) followed by an interrupt routine that does
- * read_lock(&tasklist_lock);
- *
- * After we clean up the tasklist_lock and know there are no
- * irq handlers that take it we can leave the interrupts enabled.
- * For now it is easier to be safe than to prove it can't happen.
- */
-
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
-
-static void free_pidmap(struct upid *upid)
-{
- int nr = upid->nr;
- struct pidmap *map = upid->ns->pidmap + nr / BITS_PER_PAGE;
- int offset = nr & BITS_PER_PAGE_MASK;
-
- clear_bit(offset, map->page);
- atomic_inc(&map->nr_free);
-}
-
-/*
- * If we started walking pids at 'base', is 'a' seen before 'b'?
- */
-static int pid_before(int base, int a, int b)
-{
- /*
- * This is the same as saying
- *
- * (a - base + MAXUINT) % MAXUINT < (b - base + MAXUINT) % MAXUINT
- * and that mapping orders 'a' and 'b' with respect to 'base'.
- */
- return (unsigned)(a - base) < (unsigned)(b - base);
-}
-
-/*
- * We might be racing with someone else trying to set pid_ns->last_pid
- * at the pid allocation time (there's also a sysctl for this, but racing
- * with this one is OK, see comment in kernel/pid_namespace.c about it).
- * We want the winner to have the "later" value, because if the
- * "earlier" value prevails, then a pid may get reused immediately.
- *
- * Since pids rollover, it is not sufficient to just pick the bigger
- * value. We have to consider where we started counting from.
- *
- * 'base' is the value of pid_ns->last_pid that we observed when
- * we started looking for a pid.
- *
- * 'pid' is the pid that we eventually found.
- */
-static void set_last_pid(struct pid_namespace *pid_ns, int base, int pid)
-{
- int prev;
- int last_write = base;
- do {
- prev = last_write;
- last_write = cmpxchg(&pid_ns->last_pid, prev, pid);
- } while ((prev != last_write) && (pid_before(base, last_write, pid)));
-}
-
-static int alloc_pidmap(struct pid_namespace *pid_ns)
-{
- int i, offset, max_scan, pid, last = pid_ns->last_pid;
- struct pidmap *map;
-
- pid = last + 1;
- if (pid >= pid_max)
- pid = RESERVED_PIDS;
- offset = pid & BITS_PER_PAGE_MASK;
- map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
- /*
- * If last_pid points into the middle of the map->page we
- * want to scan this bitmap block twice, the second time
- * we start with offset == 0 (or RESERVED_PIDS).
- */
- max_scan = DIV_ROUND_UP(pid_max, BITS_PER_PAGE) - !offset;
- for (i = 0; i <= max_scan; ++i) {
- if (unlikely(!map->page)) {
- void *page = kzalloc(PAGE_SIZE, GFP_KERNEL);
- /*
- * Free the page if someone raced with us
- * installing it:
- */
- spin_lock_irq(&pidmap_lock);
- if (!map->page) {
- map->page = page;
- page = NULL;
- }
- spin_unlock_irq(&pidmap_lock);
- kfree(page);
- if (unlikely(!map->page))
- return -ENOMEM;
- }
- if (likely(atomic_read(&map->nr_free))) {
- for ( ; ; ) {
- if (!test_and_set_bit(offset, map->page)) {
- atomic_dec(&map->nr_free);
- set_last_pid(pid_ns, last, pid);
- return pid;
- }
- offset = find_next_offset(map, offset);
- if (offset >= BITS_PER_PAGE)
- break;
- pid = mk_pid(pid_ns, map, offset);
- if (pid >= pid_max)
- break;
- }
- }
- if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
- ++map;
- offset = 0;
- } else {
- map = &pid_ns->pidmap[0];
- offset = RESERVED_PIDS;
- if (unlikely(last == offset))
- break;
- }
- pid = mk_pid(pid_ns, map, offset);
- }
- return -EAGAIN;
-}
-
-int next_pidmap(struct pid_namespace *pid_ns, unsigned int last)
-{
- int offset;
- struct pidmap *map, *end;
-
- if (last >= PID_MAX_LIMIT)
- return -1;
-
- offset = (last + 1) & BITS_PER_PAGE_MASK;
- map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE];
- end = &pid_ns->pidmap[PIDMAP_ENTRIES];
- for (; map < end; map++, offset = 0) {
- if (unlikely(!map->page))
- continue;
- offset = find_next_bit((map)->page, BITS_PER_PAGE, offset);
- if (offset < BITS_PER_PAGE)
- return mk_pid(pid_ns, map, offset);
- }
- return -1;
-}
+seqcount_spinlock_t pidmap_lock_seq = SEQCNT_SPINLOCK_ZERO(pidmap_lock_seq, &pidmap_lock);
void put_pid(struct pid *pid)
{
@@ -242,8 +95,8 @@ void put_pid(struct pid *pid)
return;
ns = pid->numbers[pid->level].ns;
- if ((atomic_read(&pid->count) == 1) ||
- atomic_dec_and_test(&pid->count)) {
+ if (refcount_dec_and_test(&pid->count)) {
+ pidfs_free_pid(pid);
kmem_cache_free(ns->pid_cachep, pid);
put_pid_ns(ns);
}
@@ -258,16 +111,19 @@ static void delayed_put_pid(struct rcu_head *rhp)
void free_pid(struct pid *pid)
{
- /* We can be called with write_lock_irq(&tasklist_lock) held */
int i;
- unsigned long flags;
+ struct pid_namespace *active_ns;
- spin_lock_irqsave(&pidmap_lock, flags);
+ lockdep_assert_not_held(&tasklist_lock);
+
+ active_ns = pid->numbers[pid->level].ns;
+ ns_ref_active_put(active_ns);
+
+ spin_lock(&pidmap_lock);
for (i = 0; i <= pid->level; i++) {
struct upid *upid = pid->numbers + i;
struct pid_namespace *ns = upid->ns;
- hlist_del_rcu(&upid->pid_chain);
- switch(--ns->nr_hashed) {
+ switch (--ns->pid_allocated) {
case 2:
case 1:
/* When all that is left in the pid namespace
@@ -276,25 +132,35 @@ void free_pid(struct pid *pid)
*/
wake_up_process(ns->child_reaper);
break;
- case PIDNS_HASH_ADDING:
+ case PIDNS_ADDING:
/* Handle a fork failure of the first process */
WARN_ON(ns->child_reaper);
- ns->nr_hashed = 0;
- /* fall through */
- case 0:
- schedule_work(&ns->proc_work);
+ ns->pid_allocated = 0;
break;
}
- }
- spin_unlock_irqrestore(&pidmap_lock, flags);
- for (i = 0; i <= pid->level; i++)
- free_pidmap(pid->numbers + i);
+ idr_remove(&ns->idr, upid->nr);
+ }
+ pidfs_remove_pid(pid);
+ spin_unlock(&pidmap_lock);
call_rcu(&pid->rcu, delayed_put_pid);
}
-struct pid *alloc_pid(struct pid_namespace *ns)
+void free_pids(struct pid **pids)
+{
+ int tmp;
+
+ /*
+ * This can batch pidmap_lock.
+ */
+ for (tmp = PIDTYPE_MAX; --tmp >= 0; )
+ if (pids[tmp])
+ free_pid(pids[tmp]);
+}
+
+struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
+ size_t set_tid_size)
{
struct pid *pid;
enum pid_type type;
@@ -303,16 +169,79 @@ struct pid *alloc_pid(struct pid_namespace *ns)
struct upid *upid;
int retval = -ENOMEM;
+ /*
+ * set_tid_size contains the size of the set_tid array. Starting at
+ * the most nested currently active PID namespace it tells alloc_pid()
+ * which PID to set for a process in that most nested PID namespace
+ * up to set_tid_size PID namespaces. It does not have to set the PID
+ * for a process in all nested PID namespaces but set_tid_size must
+ * never be greater than the current ns->level + 1.
+ */
+ if (set_tid_size > ns->level + 1)
+ return ERR_PTR(-EINVAL);
+
pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
if (!pid)
return ERR_PTR(retval);
tmp = ns;
pid->level = ns->level;
+
for (i = ns->level; i >= 0; i--) {
- nr = alloc_pidmap(tmp);
- if (IS_ERR_VALUE(nr)) {
- retval = nr;
+ int tid = 0;
+ int pid_max = READ_ONCE(tmp->pid_max);
+
+ if (set_tid_size) {
+ tid = set_tid[ns->level - i];
+
+ retval = -EINVAL;
+ if (tid < 1 || tid >= pid_max)
+ goto out_free;
+ /*
+ * Also fail if a PID != 1 is requested and
+ * no PID 1 exists.
+ */
+ if (tid != 1 && !tmp->child_reaper)
+ goto out_free;
+ retval = -EPERM;
+ if (!checkpoint_restore_ns_capable(tmp->user_ns))
+ goto out_free;
+ set_tid_size--;
+ }
+
+ idr_preload(GFP_KERNEL);
+ spin_lock(&pidmap_lock);
+
+ if (tid) {
+ nr = idr_alloc(&tmp->idr, NULL, tid,
+ tid + 1, GFP_ATOMIC);
+ /*
+ * If ENOSPC is returned it means that the PID is
+ * alreay in use. Return EEXIST in that case.
+ */
+ if (nr == -ENOSPC)
+ nr = -EEXIST;
+ } else {
+ int pid_min = 1;
+ /*
+ * init really needs pid 1, but after reaching the
+ * maximum wrap back to RESERVED_PIDS
+ */
+ if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
+ pid_min = RESERVED_PIDS;
+
+ /*
+ * Store a null pointer so find_pid_ns does not find
+ * a partially initialized PID (see below).
+ */
+ nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
+ pid_max, GFP_ATOMIC);
+ }
+ spin_unlock(&pidmap_lock);
+ idr_preload_end();
+
+ if (nr < 0) {
+ retval = (nr == -ENOSPC) ? -EAGAIN : nr;
goto out_free;
}
@@ -321,36 +250,59 @@ struct pid *alloc_pid(struct pid_namespace *ns)
tmp = tmp->parent;
}
- if (unlikely(is_child_reaper(pid))) {
- if (pid_ns_prepare_proc(ns))
- goto out_free;
- }
+ /*
+ * ENOMEM is not the most obvious choice especially for the case
+ * where the child subreaper has already exited and the pid
+ * namespace denies the creation of any new processes. But ENOMEM
+ * is what we have exposed to userspace for a long time and it is
+ * documented behavior for pid namespaces. So we can't easily
+ * change it even if there were an error code better suited.
+ */
+ retval = -ENOMEM;
get_pid_ns(ns);
- atomic_set(&pid->count, 1);
+ refcount_set(&pid->count, 1);
+ spin_lock_init(&pid->lock);
for (type = 0; type < PIDTYPE_MAX; ++type)
INIT_HLIST_HEAD(&pid->tasks[type]);
+ init_waitqueue_head(&pid->wait_pidfd);
+ INIT_HLIST_HEAD(&pid->inodes);
+
upid = pid->numbers + ns->level;
- spin_lock_irq(&pidmap_lock);
- if (!(ns->nr_hashed & PIDNS_HASH_ADDING))
+ idr_preload(GFP_KERNEL);
+ spin_lock(&pidmap_lock);
+ if (!(ns->pid_allocated & PIDNS_ADDING))
goto out_unlock;
+ pidfs_add_pid(pid);
for ( ; upid >= pid->numbers; --upid) {
- hlist_add_head_rcu(&upid->pid_chain,
- &pid_hash[pid_hashfn(upid->nr, upid->ns)]);
- upid->ns->nr_hashed++;
+ /* Make the PID visible to find_pid_ns. */
+ idr_replace(&upid->ns->idr, pid, upid->nr);
+ upid->ns->pid_allocated++;
}
- spin_unlock_irq(&pidmap_lock);
+ spin_unlock(&pidmap_lock);
+ idr_preload_end();
+ ns_ref_active_get(ns);
return pid;
out_unlock:
- spin_unlock_irq(&pidmap_lock);
+ spin_unlock(&pidmap_lock);
+ idr_preload_end();
put_pid_ns(ns);
out_free:
- while (++i <= ns->level)
- free_pidmap(pid->numbers + i);
+ spin_lock(&pidmap_lock);
+ while (++i <= ns->level) {
+ upid = pid->numbers + i;
+ idr_remove(&upid->ns->idr, upid->nr);
+ }
+
+ /* On failure to allocate the first pid, reset the state */
+ if (ns->pid_allocated == PIDNS_ADDING)
+ idr_set_cursor(&ns->idr, 0);
+
+ spin_unlock(&pidmap_lock);
kmem_cache_free(ns->pid_cachep, pid);
return ERR_PTR(retval);
@@ -358,22 +310,14 @@ out_free:
void disable_pid_allocation(struct pid_namespace *ns)
{
- spin_lock_irq(&pidmap_lock);
- ns->nr_hashed &= ~PIDNS_HASH_ADDING;
- spin_unlock_irq(&pidmap_lock);
+ spin_lock(&pidmap_lock);
+ ns->pid_allocated &= ~PIDNS_ADDING;
+ spin_unlock(&pidmap_lock);
}
struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
{
- struct upid *pnr;
-
- hlist_for_each_entry_rcu(pnr,
- &pid_hash[pid_hashfn(nr, ns)], pid_chain)
- if (pnr->nr == nr && pnr->ns == ns)
- return container_of(pnr, struct pid,
- numbers[ns->level]);
-
- return NULL;
+ return idr_find(&ns->idr, nr);
}
EXPORT_SYMBOL_GPL(find_pid_ns);
@@ -383,53 +327,88 @@ struct pid *find_vpid(int nr)
}
EXPORT_SYMBOL_GPL(find_vpid);
+static struct pid **task_pid_ptr(struct task_struct *task, enum pid_type type)
+{
+ return (type == PIDTYPE_PID) ?
+ &task->thread_pid :
+ &task->signal->pids[type];
+}
+
/*
* attach_pid() must be called with the tasklist_lock write-held.
*/
void attach_pid(struct task_struct *task, enum pid_type type)
{
- struct pid_link *link = &task->pids[type];
- hlist_add_head_rcu(&link->node, &link->pid->tasks[type]);
+ struct pid *pid;
+
+ lockdep_assert_held_write(&tasklist_lock);
+
+ pid = *task_pid_ptr(task, type);
+ hlist_add_head_rcu(&task->pid_links[type], &pid->tasks[type]);
}
-static void __change_pid(struct task_struct *task, enum pid_type type,
- struct pid *new)
+static void __change_pid(struct pid **pids, struct task_struct *task,
+ enum pid_type type, struct pid *new)
{
- struct pid_link *link;
- struct pid *pid;
+ struct pid **pid_ptr, *pid;
int tmp;
- link = &task->pids[type];
- pid = link->pid;
+ lockdep_assert_held_write(&tasklist_lock);
- hlist_del_rcu(&link->node);
- link->pid = new;
+ pid_ptr = task_pid_ptr(task, type);
+ pid = *pid_ptr;
+
+ hlist_del_rcu(&task->pid_links[type]);
+ *pid_ptr = new;
for (tmp = PIDTYPE_MAX; --tmp >= 0; )
- if (!hlist_empty(&pid->tasks[tmp]))
+ if (pid_has_task(pid, tmp))
return;
- free_pid(pid);
+ WARN_ON(pids[type]);
+ pids[type] = pid;
}
-void detach_pid(struct task_struct *task, enum pid_type type)
+void detach_pid(struct pid **pids, struct task_struct *task, enum pid_type type)
{
- __change_pid(task, type, NULL);
+ __change_pid(pids, task, type, NULL);
}
-void change_pid(struct task_struct *task, enum pid_type type,
+void change_pid(struct pid **pids, struct task_struct *task, enum pid_type type,
struct pid *pid)
{
- __change_pid(task, type, pid);
+ __change_pid(pids, task, type, pid);
attach_pid(task, type);
}
+void exchange_tids(struct task_struct *left, struct task_struct *right)
+{
+ struct pid *pid1 = left->thread_pid;
+ struct pid *pid2 = right->thread_pid;
+ struct hlist_head *head1 = &pid1->tasks[PIDTYPE_PID];
+ struct hlist_head *head2 = &pid2->tasks[PIDTYPE_PID];
+
+ lockdep_assert_held_write(&tasklist_lock);
+
+ /* Swap the single entry tid lists */
+ hlists_swap_heads_rcu(head1, head2);
+
+ /* Swap the per task_struct pid */
+ rcu_assign_pointer(left->thread_pid, pid2);
+ rcu_assign_pointer(right->thread_pid, pid1);
+
+ /* Swap the cached value */
+ WRITE_ONCE(left->pid, pid_nr(pid2));
+ WRITE_ONCE(right->pid, pid_nr(pid1));
+}
+
/* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */
void transfer_pid(struct task_struct *old, struct task_struct *new,
enum pid_type type)
{
- new->pids[type].pid = old->pids[type].pid;
- hlist_replace_rcu(&old->pids[type].node, &new->pids[type].node);
+ WARN_ON_ONCE(type == PIDTYPE_PID);
+ lockdep_assert_held_write(&tasklist_lock);
+ hlist_replace_rcu(&old->pid_links[type], &new->pid_links[type]);
}
struct task_struct *pid_task(struct pid *pid, enum pid_type type)
@@ -440,7 +419,7 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type)
first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]),
lockdep_tasklist_lock_is_held());
if (first)
- result = hlist_entry(first, struct task_struct, pids[(type)].node);
+ result = hlist_entry(first, struct task_struct, pid_links[(type)]);
}
return result;
}
@@ -451,9 +430,8 @@ EXPORT_SYMBOL(pid_task);
*/
struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
{
- rcu_lockdep_assert(rcu_read_lock_held(),
- "find_task_by_pid_ns() needs rcu_read_lock()"
- " protection");
+ RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
+ "find_task_by_pid_ns() needs rcu_read_lock() protection");
return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
}
@@ -462,13 +440,24 @@ struct task_struct *find_task_by_vpid(pid_t vnr)
return find_task_by_pid_ns(vnr, task_active_pid_ns(current));
}
+struct task_struct *find_get_task_by_vpid(pid_t nr)
+{
+ struct task_struct *task;
+
+ rcu_read_lock();
+ task = find_task_by_vpid(nr);
+ if (task)
+ get_task_struct(task);
+ rcu_read_unlock();
+
+ return task;
+}
+
struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
{
struct pid *pid;
rcu_read_lock();
- if (type != PIDTYPE_PID)
- task = task->group_leader;
- pid = get_pid(task->pids[type].pid);
+ pid = get_pid(rcu_dereference(*task_pid_ptr(task, type)));
rcu_read_unlock();
return pid;
}
@@ -503,7 +492,7 @@ pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
struct upid *upid;
pid_t nr = 0;
- if (pid && ns->level <= pid->level) {
+ if (pid && ns && ns->level <= pid->level) {
upid = &pid->numbers[ns->level];
if (upid->ns == ns)
nr = upid->nr;
@@ -526,23 +515,14 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
rcu_read_lock();
if (!ns)
ns = task_active_pid_ns(current);
- if (likely(pid_alive(task))) {
- if (type != PIDTYPE_PID)
- task = task->group_leader;
- nr = pid_nr_ns(task->pids[type].pid, ns);
- }
+ if (ns)
+ nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns);
rcu_read_unlock();
return nr;
}
EXPORT_SYMBOL(__task_pid_nr_ns);
-pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
-{
- return pid_nr_ns(task_tgid(tsk), ns);
-}
-EXPORT_SYMBOL(task_tgid_nr_ns);
-
struct pid_namespace *task_active_pid_ns(struct task_struct *tsk)
{
return ns_of_pid(task_pid(tsk));
@@ -556,54 +536,388 @@ EXPORT_SYMBOL_GPL(task_active_pid_ns);
*/
struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
{
+ return idr_get_next(&ns->idr, &nr);
+}
+EXPORT_SYMBOL_GPL(find_ge_pid);
+
+struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
+{
+ CLASS(fd, f)(fd);
struct pid *pid;
- do {
- pid = find_pid_ns(nr, ns);
- if (pid)
- break;
- nr = next_pidmap(ns, nr);
- } while (nr > 0);
+ if (fd_empty(f))
+ return ERR_PTR(-EBADF);
+ pid = pidfd_pid(fd_file(f));
+ if (!IS_ERR(pid)) {
+ get_pid(pid);
+ *flags = fd_file(f)->f_flags;
+ }
return pid;
}
-/*
- * The pid hash table is scaled according to the amount of memory in the
- * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or
- * more.
+/**
+ * pidfd_get_task() - Get the task associated with a pidfd
+ *
+ * @pidfd: pidfd for which to get the task
+ * @flags: flags associated with this pidfd
+ *
+ * Return the task associated with @pidfd. The function takes a reference on
+ * the returned task. The caller is responsible for releasing that reference.
+ *
+ * Return: On success, the task_struct associated with the pidfd.
+ * On error, a negative errno number will be returned.
*/
-void __init pidhash_init(void)
+struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags)
+{
+ unsigned int f_flags = 0;
+ struct pid *pid;
+ struct task_struct *task;
+ enum pid_type type;
+
+ switch (pidfd) {
+ case PIDFD_SELF_THREAD:
+ type = PIDTYPE_PID;
+ pid = get_task_pid(current, type);
+ break;
+ case PIDFD_SELF_THREAD_GROUP:
+ type = PIDTYPE_TGID;
+ pid = get_task_pid(current, type);
+ break;
+ default:
+ pid = pidfd_get_pid(pidfd, &f_flags);
+ if (IS_ERR(pid))
+ return ERR_CAST(pid);
+ type = PIDTYPE_TGID;
+ break;
+ }
+
+ task = get_pid_task(pid, type);
+ put_pid(pid);
+ if (!task)
+ return ERR_PTR(-ESRCH);
+
+ *flags = f_flags;
+ return task;
+}
+
+/**
+ * pidfd_create() - Create a new pid file descriptor.
+ *
+ * @pid: struct pid that the pidfd will reference
+ * @flags: flags to pass
+ *
+ * This creates a new pid file descriptor with the O_CLOEXEC flag set.
+ *
+ * Note, that this function can only be called after the fd table has
+ * been unshared to avoid leaking the pidfd to the new process.
+ *
+ * This symbol should not be explicitly exported to loadable modules.
+ *
+ * Return: On success, a cloexec pidfd is returned.
+ * On error, a negative errno number will be returned.
+ */
+static int pidfd_create(struct pid *pid, unsigned int flags)
+{
+ int pidfd;
+ struct file *pidfd_file;
+
+ pidfd = pidfd_prepare(pid, flags, &pidfd_file);
+ if (pidfd < 0)
+ return pidfd;
+
+ fd_install(pidfd, pidfd_file);
+ return pidfd;
+}
+
+/**
+ * sys_pidfd_open() - Open new pid file descriptor.
+ *
+ * @pid: pid for which to retrieve a pidfd
+ * @flags: flags to pass
+ *
+ * This creates a new pid file descriptor with the O_CLOEXEC flag set for
+ * the task identified by @pid. Without PIDFD_THREAD flag the target task
+ * must be a thread-group leader.
+ *
+ * Return: On success, a cloexec pidfd is returned.
+ * On error, a negative errno number will be returned.
+ */
+SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
+{
+ int fd;
+ struct pid *p;
+
+ if (flags & ~(PIDFD_NONBLOCK | PIDFD_THREAD))
+ return -EINVAL;
+
+ if (pid <= 0)
+ return -EINVAL;
+
+ p = find_get_pid(pid);
+ if (!p)
+ return -ESRCH;
+
+ fd = pidfd_create(p, flags);
+
+ put_pid(p);
+ return fd;
+}
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table_set *pid_table_root_lookup(struct ctl_table_root *root)
+{
+ return &task_active_pid_ns(current)->set;
+}
+
+static int set_is_seen(struct ctl_table_set *set)
+{
+ return &task_active_pid_ns(current)->set == set;
+}
+
+static int pid_table_root_permissions(struct ctl_table_header *head,
+ const struct ctl_table *table)
{
- unsigned int i, pidhash_size;
+ struct pid_namespace *pidns =
+ container_of(head->set, struct pid_namespace, set);
+ int mode = table->mode;
- pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
- HASH_EARLY | HASH_SMALL,
- &pidhash_shift, NULL,
- 0, 4096);
- pidhash_size = 1U << pidhash_shift;
+ if (ns_capable_noaudit(pidns->user_ns, CAP_SYS_ADMIN) ||
+ uid_eq(current_euid(), make_kuid(pidns->user_ns, 0)))
+ mode = (mode & S_IRWXU) >> 6;
+ else if (in_egroup_p(make_kgid(pidns->user_ns, 0)))
+ mode = (mode & S_IRWXG) >> 3;
+ else
+ mode = mode & S_IROTH;
+ return (mode << 6) | (mode << 3) | mode;
+}
+
+static void pid_table_root_set_ownership(struct ctl_table_header *head,
+ kuid_t *uid, kgid_t *gid)
+{
+ struct pid_namespace *pidns =
+ container_of(head->set, struct pid_namespace, set);
+ kuid_t ns_root_uid;
+ kgid_t ns_root_gid;
+
+ ns_root_uid = make_kuid(pidns->user_ns, 0);
+ if (uid_valid(ns_root_uid))
+ *uid = ns_root_uid;
- for (i = 0; i < pidhash_size; i++)
- INIT_HLIST_HEAD(&pid_hash[i]);
+ ns_root_gid = make_kgid(pidns->user_ns, 0);
+ if (gid_valid(ns_root_gid))
+ *gid = ns_root_gid;
}
-void __init pidmap_init(void)
+static struct ctl_table_root pid_table_root = {
+ .lookup = pid_table_root_lookup,
+ .permissions = pid_table_root_permissions,
+ .set_ownership = pid_table_root_set_ownership,
+};
+
+static int proc_do_cad_pid(const struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
{
- /* Veryify no one has done anything silly */
- BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING);
+ struct pid *new_pid;
+ pid_t tmp_pid;
+ int r;
+ struct ctl_table tmp_table = *table;
+
+ tmp_pid = pid_vnr(cad_pid);
+ tmp_table.data = &tmp_pid;
+
+ r = proc_dointvec(&tmp_table, write, buffer, lenp, ppos);
+ if (r || !write)
+ return r;
+
+ new_pid = find_get_pid(tmp_pid);
+ if (!new_pid)
+ return -ESRCH;
+
+ put_pid(xchg(&cad_pid, new_pid));
+ return 0;
+}
+
+static const struct ctl_table pid_table[] = {
+ {
+ .procname = "pid_max",
+ .data = &init_pid_ns.pid_max,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &pid_max_min,
+ .extra2 = &pid_max_max,
+ },
+#ifdef CONFIG_PROC_SYSCTL
+ {
+ .procname = "cad_pid",
+ .maxlen = sizeof(int),
+ .mode = 0600,
+ .proc_handler = proc_do_cad_pid,
+ },
+#endif
+};
+#endif
+
+int register_pidns_sysctls(struct pid_namespace *pidns)
+{
+#ifdef CONFIG_SYSCTL
+ struct ctl_table *tbl;
+
+ setup_sysctl_set(&pidns->set, &pid_table_root, set_is_seen);
+
+ tbl = kmemdup(pid_table, sizeof(pid_table), GFP_KERNEL);
+ if (!tbl)
+ return -ENOMEM;
+ tbl->data = &pidns->pid_max;
+ pidns->pid_max = min(pid_max_max, max_t(int, pidns->pid_max,
+ PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
+
+ pidns->sysctls = __register_sysctl_table(&pidns->set, "kernel", tbl,
+ ARRAY_SIZE(pid_table));
+ if (!pidns->sysctls) {
+ kfree(tbl);
+ retire_sysctl_set(&pidns->set);
+ return -ENOMEM;
+ }
+#endif
+ return 0;
+}
+
+void unregister_pidns_sysctls(struct pid_namespace *pidns)
+{
+#ifdef CONFIG_SYSCTL
+ const struct ctl_table *tbl;
+
+ tbl = pidns->sysctls->ctl_table_arg;
+ unregister_sysctl_table(pidns->sysctls);
+ retire_sysctl_set(&pidns->set);
+ kfree(tbl);
+#endif
+}
+
+void __init pid_idr_init(void)
+{
+ /* Verify no one has done anything silly: */
+ BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING);
/* bump default and minimum pid_max based on number of cpus */
- pid_max = min(pid_max_max, max_t(int, pid_max,
- PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
+ init_pid_ns.pid_max = min(pid_max_max, max_t(int, init_pid_ns.pid_max,
+ PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
pid_max_min = max_t(int, pid_max_min,
PIDS_PER_CPU_MIN * num_possible_cpus());
- pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);
+ pr_info("pid_max: default: %u minimum: %u\n", init_pid_ns.pid_max, pid_max_min);
+
+ idr_init(&init_pid_ns.idr);
+
+ init_pid_ns.pid_cachep = kmem_cache_create("pid",
+ struct_size_t(struct pid, numbers, 1),
+ __alignof__(struct pid),
+ SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT,
+ NULL);
+}
+
+static __init int pid_namespace_sysctl_init(void)
+{
+#ifdef CONFIG_SYSCTL
+ /* "kernel" directory will have already been initialized. */
+ BUG_ON(register_pidns_sysctls(&init_pid_ns));
+#endif
+ return 0;
+}
+subsys_initcall(pid_namespace_sysctl_init);
+
+static struct file *__pidfd_fget(struct task_struct *task, int fd)
+{
+ struct file *file;
+ int ret;
+
+ ret = down_read_killable(&task->signal->exec_update_lock);
+ if (ret)
+ return ERR_PTR(ret);
+
+ if (ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS))
+ file = fget_task(task, fd);
+ else
+ file = ERR_PTR(-EPERM);
+
+ up_read(&task->signal->exec_update_lock);
+
+ if (!file) {
+ /*
+ * It is possible that the target thread is exiting; it can be
+ * either:
+ * 1. before exit_signals(), which gives a real fd
+ * 2. before exit_files() takes the task_lock() gives a real fd
+ * 3. after exit_files() releases task_lock(), ->files is NULL;
+ * this has PF_EXITING, since it was set in exit_signals(),
+ * __pidfd_fget() returns EBADF.
+ * In case 3 we get EBADF, but that really means ESRCH, since
+ * the task is currently exiting and has freed its files
+ * struct, so we fix it up.
+ */
+ if (task->flags & PF_EXITING)
+ file = ERR_PTR(-ESRCH);
+ else
+ file = ERR_PTR(-EBADF);
+ }
+
+ return file;
+}
+
+static int pidfd_getfd(struct pid *pid, int fd)
+{
+ struct task_struct *task;
+ struct file *file;
+ int ret;
+
+ task = get_pid_task(pid, PIDTYPE_PID);
+ if (!task)
+ return -ESRCH;
+
+ file = __pidfd_fget(task, fd);
+ put_task_struct(task);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+
+ ret = receive_fd(file, NULL, O_CLOEXEC);
+ fput(file);
+
+ return ret;
+}
+
+/**
+ * sys_pidfd_getfd() - Get a file descriptor from another process
+ *
+ * @pidfd: the pidfd file descriptor of the process
+ * @fd: the file descriptor number to get
+ * @flags: flags on how to get the fd (reserved)
+ *
+ * This syscall gets a copy of a file descriptor from another process
+ * based on the pidfd, and file descriptor number. It requires that
+ * the calling process has the ability to ptrace the process represented
+ * by the pidfd. The process which is having its file descriptor copied
+ * is otherwise unaffected.
+ *
+ * Return: On success, a cloexec file descriptor is returned.
+ * On error, a negative errno number will be returned.
+ */
+SYSCALL_DEFINE3(pidfd_getfd, int, pidfd, int, fd,
+ unsigned int, flags)
+{
+ struct pid *pid;
+
+ /* flags is currently unused - make sure it's unset */
+ if (flags)
+ return -EINVAL;
+
+ CLASS(fd, f)(pidfd);
+ if (fd_empty(f))
+ return -EBADF;
- init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
- /* Reserve PID 0. We never call free_pidmap(0) */
- set_bit(0, init_pid_ns.pidmap[0].page);
- atomic_dec(&init_pid_ns.pidmap[0].nr_free);
+ pid = pidfd_pid(fd_file(f));
+ if (IS_ERR(pid))
+ return PTR_ERR(pid);
- init_pid_ns.pid_cachep = KMEM_CACHE(pid,
- SLAB_HWCACHE_ALIGN | SLAB_PANIC);
+ return pidfd_getfd(pid, fd);
}
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index a65ba137fd15..e48f5de41361 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Pid namespaces
*
@@ -12,145 +13,166 @@
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
#include <linux/syscalls.h>
+#include <linux/cred.h>
#include <linux/err.h>
#include <linux/acct.h>
#include <linux/slab.h>
#include <linux/proc_ns.h>
#include <linux/reboot.h>
#include <linux/export.h>
+#include <linux/sched/task.h>
+#include <linux/sched/signal.h>
+#include <linux/idr.h>
+#include <linux/nstree.h>
+#include <uapi/linux/wait.h>
+#include "pid_sysctl.h"
-struct pid_cache {
- int nr_ids;
- char name[16];
- struct kmem_cache *cachep;
- struct list_head list;
-};
-
-static LIST_HEAD(pid_caches_lh);
static DEFINE_MUTEX(pid_caches_mutex);
static struct kmem_cache *pid_ns_cachep;
+/* Write once array, filled from the beginning. */
+static struct kmem_cache *pid_cache[MAX_PID_NS_LEVEL];
/*
* creates the kmem cache to allocate pids from.
- * @nr_ids: the number of numerical ids this pid will have to carry
+ * @level: pid namespace level
*/
-static struct kmem_cache *create_pid_cachep(int nr_ids)
+static struct kmem_cache *create_pid_cachep(unsigned int level)
{
- struct pid_cache *pcache;
- struct kmem_cache *cachep;
-
+ /* Level 0 is init_pid_ns.pid_cachep */
+ struct kmem_cache **pkc = &pid_cache[level - 1];
+ struct kmem_cache *kc;
+ char name[4 + 10 + 1];
+ unsigned int len;
+
+ kc = READ_ONCE(*pkc);
+ if (kc)
+ return kc;
+
+ snprintf(name, sizeof(name), "pid_%u", level + 1);
+ len = struct_size_t(struct pid, numbers, level + 1);
mutex_lock(&pid_caches_mutex);
- list_for_each_entry(pcache, &pid_caches_lh, list)
- if (pcache->nr_ids == nr_ids)
- goto out;
-
- pcache = kmalloc(sizeof(struct pid_cache), GFP_KERNEL);
- if (pcache == NULL)
- goto err_alloc;
-
- snprintf(pcache->name, sizeof(pcache->name), "pid_%d", nr_ids);
- cachep = kmem_cache_create(pcache->name,
- sizeof(struct pid) + (nr_ids - 1) * sizeof(struct upid),
- 0, SLAB_HWCACHE_ALIGN, NULL);
- if (cachep == NULL)
- goto err_cachep;
-
- pcache->nr_ids = nr_ids;
- pcache->cachep = cachep;
- list_add(&pcache->list, &pid_caches_lh);
-out:
+ /* Name collision forces to do allocation under mutex. */
+ if (!*pkc)
+ *pkc = kmem_cache_create(name, len, 0,
+ SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
mutex_unlock(&pid_caches_mutex);
- return pcache->cachep;
+ /* current can fail, but someone else can succeed. */
+ return READ_ONCE(*pkc);
+}
-err_cachep:
- kfree(pcache);
-err_alloc:
- mutex_unlock(&pid_caches_mutex);
- return NULL;
+static struct ucounts *inc_pid_namespaces(struct user_namespace *ns)
+{
+ return inc_ucount(ns, current_euid(), UCOUNT_PID_NAMESPACES);
}
-static void proc_cleanup_work(struct work_struct *work)
+static void dec_pid_namespaces(struct ucounts *ucounts)
{
- struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_work);
- pid_ns_release_proc(ns);
+ dec_ucount(ucounts, UCOUNT_PID_NAMESPACES);
}
-/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */
-#define MAX_PID_NS_LEVEL 32
+static void destroy_pid_namespace_work(struct work_struct *work);
static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns,
struct pid_namespace *parent_pid_ns)
{
struct pid_namespace *ns;
unsigned int level = parent_pid_ns->level + 1;
- int i;
+ struct ucounts *ucounts;
int err;
- if (level > MAX_PID_NS_LEVEL) {
- err = -EINVAL;
+ err = -EINVAL;
+ if (!in_userns(parent_pid_ns->user_ns, user_ns))
+ goto out;
+
+ err = -ENOSPC;
+ if (level > MAX_PID_NS_LEVEL)
+ goto out;
+ ucounts = inc_pid_namespaces(user_ns);
+ if (!ucounts)
goto out;
- }
err = -ENOMEM;
ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
if (ns == NULL)
- goto out;
+ goto out_dec;
- ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
- if (!ns->pidmap[0].page)
- goto out_free;
+ idr_init(&ns->idr);
- ns->pid_cachep = create_pid_cachep(level + 1);
+ ns->pid_cachep = create_pid_cachep(level);
if (ns->pid_cachep == NULL)
- goto out_free_map;
+ goto out_free_idr;
+
+ err = ns_common_init(ns);
+ if (err)
+ goto out_free_idr;
- err = ns_alloc_inum(&ns->ns);
+ ns->pid_max = PID_MAX_LIMIT;
+ err = register_pidns_sysctls(ns);
if (err)
- goto out_free_map;
- ns->ns.ops = &pidns_operations;
+ goto out_free_inum;
- kref_init(&ns->kref);
ns->level = level;
ns->parent = get_pid_ns(parent_pid_ns);
ns->user_ns = get_user_ns(user_ns);
- ns->nr_hashed = PIDNS_HASH_ADDING;
- INIT_WORK(&ns->proc_work, proc_cleanup_work);
+ ns->ucounts = ucounts;
+ ns->pid_allocated = PIDNS_ADDING;
+ INIT_WORK(&ns->work, destroy_pid_namespace_work);
- set_bit(0, ns->pidmap[0].page);
- atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
-
- for (i = 1; i < PIDMAP_ENTRIES; i++)
- atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
+ ns->memfd_noexec_scope = pidns_memfd_noexec_scope(parent_pid_ns);
+#endif
+ ns_tree_add(ns);
return ns;
-out_free_map:
- kfree(ns->pidmap[0].page);
-out_free:
+out_free_inum:
+ ns_common_free(ns);
+out_free_idr:
+ idr_destroy(&ns->idr);
kmem_cache_free(pid_ns_cachep, ns);
+out_dec:
+ dec_pid_namespaces(ucounts);
out:
return ERR_PTR(err);
}
static void delayed_free_pidns(struct rcu_head *p)
{
- kmem_cache_free(pid_ns_cachep,
- container_of(p, struct pid_namespace, rcu));
+ struct pid_namespace *ns = container_of(p, struct pid_namespace, rcu);
+
+ dec_pid_namespaces(ns->ucounts);
+ put_user_ns(ns->user_ns);
+
+ kmem_cache_free(pid_ns_cachep, ns);
}
static void destroy_pid_namespace(struct pid_namespace *ns)
{
- int i;
+ ns_tree_remove(ns);
+ unregister_pidns_sysctls(ns);
- ns_free_inum(&ns->ns);
- for (i = 0; i < PIDMAP_ENTRIES; i++)
- kfree(ns->pidmap[i].page);
- put_user_ns(ns->user_ns);
+ ns_common_free(ns);
+
+ idr_destroy(&ns->idr);
call_rcu(&ns->rcu, delayed_free_pidns);
}
-struct pid_namespace *copy_pid_ns(unsigned long flags,
+static void destroy_pid_namespace_work(struct work_struct *work)
+{
+ struct pid_namespace *ns =
+ container_of(work, struct pid_namespace, work);
+
+ do {
+ struct pid_namespace *parent;
+
+ parent = ns->parent;
+ destroy_pid_namespace(ns);
+ ns = parent;
+ } while (ns != &init_pid_ns && ns_ref_put(ns));
+}
+
+struct pid_namespace *copy_pid_ns(u64 flags,
struct user_namespace *user_ns, struct pid_namespace *old_ns)
{
if (!(flags & CLONE_NEWPID))
@@ -160,24 +182,10 @@ struct pid_namespace *copy_pid_ns(unsigned long flags,
return create_pid_namespace(user_ns, old_ns);
}
-static void free_pid_ns(struct kref *kref)
-{
- struct pid_namespace *ns;
-
- ns = container_of(kref, struct pid_namespace, kref);
- destroy_pid_namespace(ns);
-}
-
void put_pid_ns(struct pid_namespace *ns)
{
- struct pid_namespace *parent;
-
- while (ns != &init_pid_ns) {
- parent = ns->parent;
- if (!kref_put(&ns->kref, free_pid_ns))
- break;
- ns = parent;
- }
+ if (ns && ns_ref_put(ns))
+ schedule_work(&ns->work);
}
EXPORT_SYMBOL_GPL(put_pid_ns);
@@ -187,6 +195,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
int rc;
struct task_struct *task, *me = current;
int init_pids = thread_group_leader(me) ? 1 : 2;
+ struct pid *pid;
/* Don't allow any more processes into the pid namespace */
disable_pid_allocation(pid_ns);
@@ -213,50 +222,54 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
* maintain a tasklist for each pid namespace.
*
*/
+ rcu_read_lock();
read_lock(&tasklist_lock);
- nr = next_pidmap(pid_ns, 1);
- while (nr > 0) {
- rcu_read_lock();
-
- task = pid_task(find_vpid(nr), PIDTYPE_PID);
+ nr = 2;
+ idr_for_each_entry_continue(&pid_ns->idr, pid, nr) {
+ task = pid_task(pid, PIDTYPE_PID);
if (task && !__fatal_signal_pending(task))
- send_sig_info(SIGKILL, SEND_SIG_FORCED, task);
-
- rcu_read_unlock();
-
- nr = next_pidmap(pid_ns, nr);
+ group_send_sig_info(SIGKILL, SEND_SIG_PRIV, task, PIDTYPE_MAX);
}
read_unlock(&tasklist_lock);
+ rcu_read_unlock();
/*
* Reap the EXIT_ZOMBIE children we had before we ignored SIGCHLD.
- * sys_wait4() will also block until our children traced from the
+ * kernel_wait4() will also block until our children traced from the
* parent namespace are detached and become EXIT_DEAD.
*/
do {
clear_thread_flag(TIF_SIGPENDING);
- rc = sys_wait4(-1, NULL, __WALL, NULL);
+ clear_thread_flag(TIF_NOTIFY_SIGNAL);
+ rc = kernel_wait4(-1, NULL, __WALL, NULL);
} while (rc != -ECHILD);
/*
- * sys_wait4() above can't reap the EXIT_DEAD children but we do not
- * really care, we could reparent them to the global init. We could
- * exit and reap ->child_reaper even if it is not the last thread in
- * this pid_ns, free_pid(nr_hashed == 0) calls proc_cleanup_work(),
- * pid_ns can not go away until proc_kill_sb() drops the reference.
+ * kernel_wait4() misses EXIT_DEAD children, and EXIT_ZOMBIE
+ * process whose parents processes are outside of the pid
+ * namespace. Such processes are created with setns()+fork().
+ *
+ * If those EXIT_ZOMBIE processes are not reaped by their
+ * parents before their parents exit, they will be reparented
+ * to pid_ns->child_reaper. Thus pidns->child_reaper needs to
+ * stay valid until they all go away.
*
- * But this ns can also have other tasks injected by setns()+fork().
- * Again, ignoring the user visible semantics we do not really need
- * to wait until they are all reaped, but they can be reparented to
- * us and thus we need to ensure that pid->child_reaper stays valid
- * until they all go away. See free_pid()->wake_up_process().
+ * The code relies on the pid_ns->child_reaper ignoring
+ * SIGCHILD to cause those EXIT_ZOMBIE processes to be
+ * autoreaped if reparented.
*
- * We rely on ignored SIGCHLD, an injected zombie must be autoreaped
- * if reparented.
+ * Semantically it is also desirable to wait for EXIT_ZOMBIE
+ * processes before allowing the child_reaper to be reaped, as
+ * that gives the invariant that when the init process of a
+ * pid namespace is reaped all of the processes in the pid
+ * namespace are gone.
+ *
+ * Once all of the other tasks are gone from the pid_namespace
+ * free_pid() will awaken this task.
*/
for (;;) {
- set_current_state(TASK_UNINTERRUPTIBLE);
- if (pid_ns->nr_hashed == init_pids)
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (pid_ns->pid_allocated == init_pids)
break;
schedule();
}
@@ -270,39 +283,37 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
}
#ifdef CONFIG_CHECKPOINT_RESTORE
-static int pid_ns_ctl_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+static int pid_ns_ctl_handler(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct pid_namespace *pid_ns = task_active_pid_ns(current);
struct ctl_table tmp = *table;
+ int ret, next;
- if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
+ if (write && !checkpoint_restore_ns_capable(pid_ns->user_ns))
return -EPERM;
- /*
- * Writing directly to ns' last_pid field is OK, since this field
- * is volatile in a living namespace anyway and a code writing to
- * it should synchronize its usage with external means.
- */
+ next = idr_get_cursor(&pid_ns->idr) - 1;
- tmp.data = &pid_ns->last_pid;
- return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+ tmp.data = &next;
+ tmp.extra2 = &pid_ns->pid_max;
+ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+ if (!ret && write)
+ idr_set_cursor(&pid_ns->idr, next + 1);
+
+ return ret;
}
-extern int pid_max;
-static int zero = 0;
-static struct ctl_table pid_ns_ctl_table[] = {
+static const struct ctl_table pid_ns_ctl_table[] = {
{
.procname = "ns_last_pid",
.maxlen = sizeof(int),
.mode = 0666, /* permissions are checked in the handler */
.proc_handler = pid_ns_ctl_handler,
- .extra1 = &zero,
- .extra2 = &pid_max,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = &init_pid_ns.pid_max,
},
- { }
};
-static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } };
#endif /* CONFIG_CHECKPOINT_RESTORE */
int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
@@ -325,7 +336,7 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
}
read_lock(&tasklist_lock);
- force_sig(SIGKILL, pid_ns->child_reaper);
+ send_sig(SIGKILL, pid_ns->child_reaper, 1);
read_unlock(&tasklist_lock);
do_exit(0);
@@ -334,11 +345,6 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
return 0;
}
-static inline struct pid_namespace *to_pid_ns(struct ns_common *ns)
-{
- return container_of(ns, struct pid_namespace, ns);
-}
-
static struct ns_common *pidns_get(struct task_struct *task)
{
struct pid_namespace *ns;
@@ -352,18 +358,54 @@ static struct ns_common *pidns_get(struct task_struct *task)
return ns ? &ns->ns : NULL;
}
+static struct ns_common *pidns_for_children_get(struct task_struct *task)
+{
+ struct pid_namespace *ns = NULL;
+
+ task_lock(task);
+ if (task->nsproxy) {
+ ns = task->nsproxy->pid_ns_for_children;
+ get_pid_ns(ns);
+ }
+ task_unlock(task);
+
+ if (ns) {
+ read_lock(&tasklist_lock);
+ if (!ns->child_reaper) {
+ put_pid_ns(ns);
+ ns = NULL;
+ }
+ read_unlock(&tasklist_lock);
+ }
+
+ return ns ? &ns->ns : NULL;
+}
+
static void pidns_put(struct ns_common *ns)
{
put_pid_ns(to_pid_ns(ns));
}
-static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns)
+bool pidns_is_ancestor(struct pid_namespace *child,
+ struct pid_namespace *ancestor)
{
+ struct pid_namespace *ns;
+
+ if (child->level < ancestor->level)
+ return false;
+ for (ns = child; ns->level > ancestor->level; ns = ns->parent)
+ ;
+ return ns == ancestor;
+}
+
+static int pidns_install(struct nsset *nsset, struct ns_common *ns)
+{
+ struct nsproxy *nsproxy = nsset->nsproxy;
struct pid_namespace *active = task_active_pid_ns(current);
- struct pid_namespace *ancestor, *new = to_pid_ns(ns);
+ struct pid_namespace *new = to_pid_ns(ns);
if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) ||
- !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+ !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN))
return -EPERM;
/*
@@ -374,13 +416,7 @@ static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns)
* this maintains the property that processes and their
* children can not escape their current pid namespace.
*/
- if (new->level < active->level)
- return -EINVAL;
-
- ancestor = new;
- while (ancestor->level > active->level)
- ancestor = ancestor->parent;
- if (ancestor != active)
+ if (!pidns_is_ancestor(new, active))
return -EINVAL;
put_pid_ns(nsproxy->pid_ns_for_children);
@@ -388,21 +424,58 @@ static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns)
return 0;
}
+static struct ns_common *pidns_get_parent(struct ns_common *ns)
+{
+ struct pid_namespace *active = task_active_pid_ns(current);
+ struct pid_namespace *pid_ns, *p;
+
+ /* See if the parent is in the current namespace */
+ pid_ns = p = to_pid_ns(ns)->parent;
+ for (;;) {
+ if (!p)
+ return ERR_PTR(-EPERM);
+ if (p == active)
+ break;
+ p = p->parent;
+ }
+
+ return &get_pid_ns(pid_ns)->ns;
+}
+
+static struct user_namespace *pidns_owner(struct ns_common *ns)
+{
+ return to_pid_ns(ns)->user_ns;
+}
+
const struct proc_ns_operations pidns_operations = {
.name = "pid",
- .type = CLONE_NEWPID,
.get = pidns_get,
.put = pidns_put,
.install = pidns_install,
+ .owner = pidns_owner,
+ .get_parent = pidns_get_parent,
+};
+
+const struct proc_ns_operations pidns_for_children_operations = {
+ .name = "pid_for_children",
+ .real_ns_name = "pid",
+ .get = pidns_for_children_get,
+ .put = pidns_put,
+ .install = pidns_install,
+ .owner = pidns_owner,
+ .get_parent = pidns_get_parent,
};
static __init int pid_namespaces_init(void)
{
- pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
+ pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC | SLAB_ACCOUNT);
#ifdef CONFIG_CHECKPOINT_RESTORE
- register_sysctl_paths(kern_path, pid_ns_ctl_table);
+ register_sysctl_init("kernel", pid_ns_ctl_table);
#endif
+
+ register_pid_ns_sysctl_table_vm();
+ ns_tree_add(&init_pid_ns);
return 0;
}
diff --git a/kernel/pid_sysctl.h b/kernel/pid_sysctl.h
new file mode 100644
index 000000000000..5d8f981de7c5
--- /dev/null
+++ b/kernel/pid_sysctl.h
@@ -0,0 +1,53 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef LINUX_PID_SYSCTL_H
+#define LINUX_PID_SYSCTL_H
+
+#include <linux/pid_namespace.h>
+
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
+static int pid_mfd_noexec_dointvec_minmax(const struct ctl_table *table,
+ int write, void *buf, size_t *lenp, loff_t *ppos)
+{
+ struct pid_namespace *ns = task_active_pid_ns(current);
+ struct ctl_table table_copy;
+ int err, scope, parent_scope;
+
+ if (write && !ns_capable(ns->user_ns, CAP_SYS_ADMIN))
+ return -EPERM;
+
+ table_copy = *table;
+
+ /* You cannot set a lower enforcement value than your parent. */
+ parent_scope = pidns_memfd_noexec_scope(ns->parent);
+ /* Equivalent to pidns_memfd_noexec_scope(ns). */
+ scope = max(READ_ONCE(ns->memfd_noexec_scope), parent_scope);
+
+ table_copy.data = &scope;
+ table_copy.extra1 = &parent_scope;
+
+ err = proc_dointvec_minmax(&table_copy, write, buf, lenp, ppos);
+ if (!err && write)
+ WRITE_ONCE(ns->memfd_noexec_scope, scope);
+ return err;
+}
+
+static const struct ctl_table pid_ns_ctl_table_vm[] = {
+ {
+ .procname = "memfd_noexec",
+ .data = &init_pid_ns.memfd_noexec_scope,
+ .maxlen = sizeof(init_pid_ns.memfd_noexec_scope),
+ .mode = 0644,
+ .proc_handler = pid_mfd_noexec_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_TWO,
+ },
+};
+static inline void register_pid_ns_sysctl_table_vm(void)
+{
+ register_sysctl("vm", pid_ns_ctl_table_vm);
+}
+#else
+static inline void register_pid_ns_sysctl_table_vm(void) {}
+#endif
+
+#endif /* LINUX_PID_SYSCTL_H */
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 7e01f78f0417..05337f437cca 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -1,8 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0-only
config SUSPEND
bool "Suspend to RAM and standby"
depends on ARCH_SUSPEND_POSSIBLE
default y
- ---help---
+ help
Allow the system to enter sleep states in which main memory is
powered and thus its contents are preserved, such as the
suspend-to-RAM state (e.g. the ACPI S3 state).
@@ -18,6 +19,19 @@ config SUSPEND_FREEZER
Turning OFF this setting is NOT recommended! If in doubt, say Y.
+config SUSPEND_SKIP_SYNC
+ bool "Skip kernel's sys_sync() on suspend to RAM/standby"
+ depends on SUSPEND
+ depends on EXPERT
+ help
+ Skip the kernel sys_sync() before freezing user processes.
+ Some systems prefer not to pay this cost on every invocation
+ of suspend, or they are content with invoking sync() from
+ user-space before invoking suspend. There's a run-time switch
+ at '/sys/power/sync_on_suspend' to configure this behaviour.
+ This setting changes the default for the run-tim switch. Say Y
+ to change the default to disable the kernel sys_sync().
+
config HIBERNATE_CALLBACKS
bool
@@ -25,10 +39,10 @@ config HIBERNATION
bool "Hibernation (aka 'suspend to disk')"
depends on SWAP && ARCH_HIBERNATION_POSSIBLE
select HIBERNATE_CALLBACKS
- select LZO_COMPRESS
- select LZO_DECOMPRESS
select CRC32
- ---help---
+ select CRYPTO
+ select CRYPTO_LZO
+ help
Enable the suspend to disk (STD) functionality, which is usually
called "hibernation" in user interfaces. STD checkpoints the
system and powers it off; and restores that checkpoint on reboot.
@@ -55,7 +69,7 @@ config HIBERNATION
need to run mkswap against the swap partition used for the suspend.
It also works with swap files to a limited extent (for details see
- <file:Documentation/power/swsusp-and-swap-files.txt>).
+ <file:Documentation/power/swsusp-and-swap-files.rst>).
Right now you may boot without resuming and resume later but in the
meantime you cannot use the swap partition(s)/file(s) involved in
@@ -64,31 +78,62 @@ config HIBERNATION
MOUNT any journaled filesystems mounted before the suspend or they
will get corrupted in a nasty way.
- For more information take a look at <file:Documentation/power/swsusp.txt>.
+ For more information take a look at <file:Documentation/power/swsusp.rst>.
-config ARCH_SAVE_PAGE_KEYS
- bool
+config HIBERNATION_SNAPSHOT_DEV
+ bool "Userspace snapshot device"
+ depends on HIBERNATION
+ default y
+ help
+ Device used by the uswsusp tools.
+
+ Say N if no snapshotting from userspace is needed, this also
+ reduces the attack surface of the kernel.
+
+ If in doubt, say Y.
+
+choice
+ prompt "Default compressor"
+ default HIBERNATION_COMP_LZO
+ depends on HIBERNATION
+
+config HIBERNATION_COMP_LZO
+ bool "lzo"
+ depends on CRYPTO_LZO
+
+config HIBERNATION_COMP_LZ4
+ bool "lz4"
+ depends on CRYPTO_LZ4
+
+endchoice
+
+config HIBERNATION_DEF_COMP
+ string
+ default "lzo" if HIBERNATION_COMP_LZO
+ default "lz4" if HIBERNATION_COMP_LZ4
+ help
+ Default compressor to be used for hibernation.
config PM_STD_PARTITION
string "Default resume partition"
depends on HIBERNATION
default ""
- ---help---
+ help
The default resume partition is the partition that the suspend-
- to-disk implementation will look for a suspended disk image.
+ to-disk implementation will look for a suspended disk image.
- The partition specified here will be different for almost every user.
+ The partition specified here will be different for almost every user.
It should be a valid swap partition (at least for now) that is turned
- on before suspending.
+ on before suspending.
The partition specified can be overridden by specifying:
- resume=/dev/<other device>
+ resume=/dev/<other device>
- which will set the resume partition to the device specified.
+ which will set the resume partition to the device specified.
Note there is currently not a way to specify which device to save the
- suspended image to. It will simply pick the first available swap
+ suspended image to. It will simply pick the first available swap
device.
config PM_SLEEP
@@ -103,19 +148,46 @@ config PM_SLEEP_SMP
depends on PM_SLEEP
select HOTPLUG_CPU
+config PM_SLEEP_SMP_NONZERO_CPU
+ def_bool y
+ depends on PM_SLEEP_SMP
+ depends on ARCH_SUSPEND_NONZERO_CPU
+ help
+ If an arch can suspend (for suspend, hibernate, kexec, etc) on a
+ non-zero numbered CPU, it may define ARCH_SUSPEND_NONZERO_CPU. This
+ will allow nohz_full mask to include CPU0.
+
config PM_AUTOSLEEP
bool "Opportunistic sleep"
depends on PM_SLEEP
- default n
- ---help---
+ help
Allow the kernel to trigger a system transition into a global sleep
state automatically whenever there are no active wakeup sources.
+config PM_USERSPACE_AUTOSLEEP
+ bool "Userspace opportunistic sleep"
+ depends on PM_SLEEP
+ help
+ Notify kernel of aggressive userspace autosleep power management policy.
+
+ This option changes the behavior of various sleep-sensitive code to deal
+ with frequent userspace-initiated transitions into a global sleep state.
+
+ Saying Y here, disables code paths that most users really should keep
+ enabled. In particular, only enable this if it is very common to be
+ asleep/awake for very short periods of time (<= 2 seconds).
+
+ Only platforms, such as Android, that implement opportunistic sleep from
+ a userspace power manager service should enable this option; and not
+ other machines. Therefore, you should say N here, unless you are
+ extremely certain that this is what you want. The option otherwise has
+ bad, undesirable effects, and should not be enabled just for fun.
+
+
config PM_WAKELOCKS
bool "User space wakeup sources interface"
depends on PM_SLEEP
- default n
- ---help---
+ help
Allow user space to create, activate and deactivate wakeup source
objects with the help of a sysfs-based interface.
@@ -130,9 +202,20 @@ config PM_WAKELOCKS_GC
depends on PM_WAKELOCKS
default y
+config PM_QOS_CPU_SYSTEM_WAKEUP
+ bool "User space interface for CPU system wakeup QoS"
+ depends on CPU_IDLE
+ help
+ Enable this to allow user space via the cpu_wakeup_latency file to
+ specify a CPU system wakeup latency limit.
+
+ This may be particularly useful for platforms supporting multiple low
+ power states for CPUs during system-wide suspend and s2idle in
+ particular.
+
config PM
bool "Device power management core functionality"
- ---help---
+ help
Enable functionality allowing I/O devices to be put into energy-saving
(low power) states, for example after a specified period of inactivity
(autosuspended), and woken up in response to a hardware-generated
@@ -146,7 +229,7 @@ config PM
config PM_DEBUG
bool "Power Management Debug Support"
depends on PM
- ---help---
+ help
This option enables various debugging support in the Power Management
code. This is helpful when debugging and reporting PM bugs, like
suspend support.
@@ -154,7 +237,7 @@ config PM_DEBUG
config PM_ADVANCED_DEBUG
bool "Extra PM attributes in sysfs for low-level debugging/testing"
depends on PM_DEBUG
- ---help---
+ help
Add extra sysfs attributes allowing one to access some Power Management
fields of device objects from user space. If you are not a kernel
developer interested in debugging/testing Power Management, say "no".
@@ -162,7 +245,7 @@ config PM_ADVANCED_DEBUG
config PM_TEST_SUSPEND
bool "Test suspend/resume and wakealarm during bootup"
depends on SUSPEND && PM_DEBUG && RTC_CLASS=y
- ---help---
+ help
This option will let you suspend your machine during bootup, and
make it wake up a few seconds later using an RTC wakeup alarm.
Enable this with a kernel parameter like "test_suspend=mem".
@@ -176,8 +259,8 @@ config PM_SLEEP_DEBUG
config DPM_WATCHDOG
bool "Device suspend/resume watchdog"
- depends on PM_DEBUG && PSTORE
- ---help---
+ depends on PM_DEBUG && PSTORE && EXPERT
+ help
Sets up a watchdog timer to capture drivers that are
locked up attempting to suspend/resume a device.
A detected lockup causes system panic with message
@@ -185,10 +268,29 @@ config DPM_WATCHDOG
boot session.
config DPM_WATCHDOG_TIMEOUT
- int "Watchdog timeout in seconds"
+ int "Watchdog timeout to panic in seconds"
range 1 120
- default 12
+ default 120
+ depends on DPM_WATCHDOG
+
+config DPM_WATCHDOG_WARNING_TIMEOUT
+ int "Watchdog timeout to warn in seconds"
+ range 1 DPM_WATCHDOG_TIMEOUT
+ default DPM_WATCHDOG_TIMEOUT
depends on DPM_WATCHDOG
+ help
+ If the DPM watchdog warning timeout and main timeout are
+ different then a non-fatal warning (with a stack trace of
+ the stuck suspend routine) will be printed when the warning
+ timeout expires. If the suspend routine gets un-stuck
+ before the main timeout expires then no other action is
+ taken. If the routine continues to be stuck and the main
+ timeout expires then an emergency-level message and stack
+ trace will be printed and the system will panic.
+
+ If the warning timeout is equal to the main timeout (the
+ default) then the warning will never happen and the system
+ will jump straight to panic when the main timeout expires.
config PM_TRACE
bool
@@ -210,7 +312,7 @@ config PM_TRACE_RTC
depends on PM_SLEEP_DEBUG
depends on X86
select PM_TRACE
- ---help---
+ help
This enables some cheesy code to save the last PM event point in the
RTC across reboots, so that you can debug a machine that just hangs
during suspend (or more commonly, during resume).
@@ -225,7 +327,7 @@ config PM_TRACE_RTC
config APM_EMULATION
tristate "Advanced Power Management Emulation"
- depends on PM && SYS_SUPPORTS_APM_EMULATION
+ depends on SYS_SUPPORTS_APM_EMULATION
help
APM is a BIOS specification for saving power using several different
techniques. This is mostly useful for battery powered laptops with
@@ -235,7 +337,7 @@ config APM_EMULATION
notification of APM "events" (e.g. battery status change).
In order to use APM, you will need supporting software. For location
- and more information, read <file:Documentation/power/apm-acpi.txt>
+ and more information, read <file:Documentation/power/apm-acpi.rst>
and the Battery Powered Linux mini-HOWTO, available from
<http://www.tldp.org/docs.html#howto>.
@@ -249,20 +351,6 @@ config APM_EMULATION
anything, try disabling/enabling this option (or disabling/enabling
APM in your BIOS).
-config PM_OPP
- bool
- select SRCU
- ---help---
- SOCs have a standard set of tuples consisting of frequency and
- voltage pairs that the device will support per voltage domain. This
- is called Operating Performance Point or OPP. The actual definitions
- of OPP varies over silicon within the same family of devices.
-
- OPP layer organizes the data internally using device pointers
- representing individual voltage domains and provides SOC
- implementations a ready to use framework to manage OPPs.
- For more information, read <file:Documentation/power/opp.txt>
-
config PM_CLK
def_bool y
depends on PM && HAVE_CLK
@@ -274,7 +362,6 @@ config PM_GENERIC_DOMAINS
config WQ_POWER_EFFICIENT_DEFAULT
bool "Enable workqueue power-efficient mode by default"
depends on PM
- default n
help
Per-cpu workqueues are generally preferred because they show
better performance thanks to cache locality; unfortunately,
@@ -301,3 +388,16 @@ config PM_GENERIC_DOMAINS_OF
config CPU_PM
bool
+
+config ENERGY_MODEL
+ bool "Energy Model for devices with DVFS (CPUs, GPUs, etc)"
+ depends on CPU_FREQ || PM_DEVFREQ
+ help
+ Several subsystems (thermal and/or the task scheduler for example)
+ can leverage information about the energy consumed by devices to
+ make smarter decisions. This config option enables the framework
+ from which subsystems can access the energy models.
+
+ The exact usage of the energy model is subsystem-dependent.
+
+ If in doubt, say N.
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 29472bff11ef..773e2789412b 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,5 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0
-ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG
+ifeq ($(CONFIG_DYNAMIC_DEBUG), y)
+CFLAGS_swap.o := -DDEBUG
+CFLAGS_snapshot.o := -DDEBUG
+CFLAGS_energy_model.o := -DDEBUG
+endif
+
+KASAN_SANITIZE_snapshot.o := n
obj-y += qos.o
obj-$(CONFIG_PM) += main.o
@@ -7,9 +14,13 @@ obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o
obj-$(CONFIG_FREEZER) += process.o
obj-$(CONFIG_SUSPEND) += suspend.o
obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o
-obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \
- block_io.o
+obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o
+obj-$(CONFIG_HIBERNATION_SNAPSHOT_DEV) += user.o
obj-$(CONFIG_PM_AUTOSLEEP) += autosleep.o
obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o
obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
+
+obj-$(CONFIG_ENERGY_MODEL) += em.o
+em-y := energy_model.o
+em-$(CONFIG_NET) += em_netlink_autogen.o em_netlink.o
diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c
index 9012ecf7b814..865df641b97c 100644
--- a/kernel/power/autosleep.c
+++ b/kernel/power/autosleep.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* kernel/power/autosleep.c
*
@@ -8,7 +9,6 @@
#include <linux/device.h>
#include <linux/mutex.h>
-#include <linux/pm_wakeup.h>
#include "power.h"
@@ -53,7 +53,7 @@ static void try_to_suspend(struct work_struct *work)
goto out;
/*
- * If the wakeup occured for an unknown reason, wait to prevent the
+ * If the wakeup occurred for an unknown reason, wait to prevent the
* system from trying to suspend and waking up in a tight loop.
*/
if (final_count == initial_count)
@@ -115,7 +115,7 @@ int pm_autosleep_set_state(suspend_state_t state)
int __init pm_autosleep_init(void)
{
- autosleep_ws = wakeup_source_register("autosleep");
+ autosleep_ws = wakeup_source_register(NULL, "autosleep");
if (!autosleep_ws)
return -ENOMEM;
diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c
deleted file mode 100644
index 9a58bc258810..000000000000
--- a/kernel/power/block_io.c
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * This file provides functions for block I/O operations on swap/file.
- *
- * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz>
- * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
- *
- * This file is released under the GPLv2.
- */
-
-#include <linux/bio.h>
-#include <linux/kernel.h>
-#include <linux/pagemap.h>
-#include <linux/swap.h>
-
-#include "power.h"
-
-/**
- * submit - submit BIO request.
- * @rw: READ or WRITE.
- * @off physical offset of page.
- * @page: page we're reading or writing.
- * @bio_chain: list of pending biod (for async reading)
- *
- * Straight from the textbook - allocate and initialize the bio.
- * If we're reading, make sure the page is marked as dirty.
- * Then submit it and, if @bio_chain == NULL, wait.
- */
-static int submit(int rw, struct block_device *bdev, sector_t sector,
- struct page *page, struct bio **bio_chain)
-{
- const int bio_rw = rw | REQ_SYNC;
- struct bio *bio;
-
- bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
- bio->bi_iter.bi_sector = sector;
- bio->bi_bdev = bdev;
- bio->bi_end_io = end_swap_bio_read;
-
- if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
- printk(KERN_ERR "PM: Adding page to bio failed at %llu\n",
- (unsigned long long)sector);
- bio_put(bio);
- return -EFAULT;
- }
-
- lock_page(page);
- bio_get(bio);
-
- if (bio_chain == NULL) {
- submit_bio(bio_rw, bio);
- wait_on_page_locked(page);
- if (rw == READ)
- bio_set_pages_dirty(bio);
- bio_put(bio);
- } else {
- if (rw == READ)
- get_page(page); /* These pages are freed later */
- bio->bi_private = *bio_chain;
- *bio_chain = bio;
- submit_bio(bio_rw, bio);
- }
- return 0;
-}
-
-int hib_bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
-{
- return submit(READ, hib_resume_bdev, page_off * (PAGE_SIZE >> 9),
- virt_to_page(addr), bio_chain);
-}
-
-int hib_bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
-{
- return submit(WRITE, hib_resume_bdev, page_off * (PAGE_SIZE >> 9),
- virt_to_page(addr), bio_chain);
-}
-
-int hib_wait_on_bio_chain(struct bio **bio_chain)
-{
- struct bio *bio;
- struct bio *next_bio;
- int ret = 0;
-
- if (bio_chain == NULL)
- return 0;
-
- bio = *bio_chain;
- if (bio == NULL)
- return 0;
- while (bio) {
- struct page *page;
-
- next_bio = bio->bi_private;
- page = bio->bi_io_vec[0].bv_page;
- wait_on_page_locked(page);
- if (!PageUptodate(page) || PageError(page))
- ret = -EIO;
- put_page(page);
- bio_put(bio);
- bio = next_bio;
- }
- *bio_chain = NULL;
- return ret;
-}
diff --git a/kernel/power/console.c b/kernel/power/console.c
index aba9c545a0e3..a906a0ac0f9b 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Functions for saving/restoring console.
*
@@ -15,6 +16,7 @@
#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1)
static int orig_fgconsole, orig_kmsg;
+static bool vt_switch_done;
static DEFINE_MUTEX(vt_switch_mutex);
@@ -42,9 +44,10 @@ static LIST_HEAD(pm_vt_switch_list);
* no_console_suspend argument has been passed on the command line, VT
* switches will occur.
*/
-void pm_vt_switch_required(struct device *dev, bool required)
+int pm_vt_switch_required(struct device *dev, bool required)
{
struct pm_vt_switch *entry, *tmp;
+ int ret = 0;
mutex_lock(&vt_switch_mutex);
list_for_each_entry(tmp, &pm_vt_switch_list, head) {
@@ -56,8 +59,10 @@ void pm_vt_switch_required(struct device *dev, bool required)
}
entry = kmalloc(sizeof(*entry), GFP_KERNEL);
- if (!entry)
+ if (!entry) {
+ ret = -ENOMEM;
goto out;
+ }
entry->required = required;
entry->dev = dev;
@@ -65,6 +70,7 @@ void pm_vt_switch_required(struct device *dev, bool required)
list_add(&entry->head, &pm_vt_switch_list);
out:
mutex_unlock(&vt_switch_mutex);
+ return ret;
}
EXPORT_SYMBOL(pm_vt_switch_required);
@@ -126,26 +132,30 @@ out:
return ret;
}
-int pm_prepare_console(void)
+void pm_prepare_console(void)
{
if (!pm_vt_switch())
- return 0;
+ return;
orig_fgconsole = vt_move_to_console(SUSPEND_CONSOLE, 1);
if (orig_fgconsole < 0)
- return 1;
+ return;
+
+ vt_switch_done = true;
orig_kmsg = vt_kmsg_redirect(SUSPEND_CONSOLE);
- return 0;
+ return;
}
void pm_restore_console(void)
{
- if (!pm_vt_switch())
+ if (!pm_vt_switch() && !vt_switch_done)
return;
if (orig_fgconsole >= 0) {
vt_move_to_console(orig_fgconsole, 0);
vt_kmsg_redirect(orig_kmsg);
}
+
+ vt_switch_done = false;
}
diff --git a/kernel/power/em_netlink.c b/kernel/power/em_netlink.c
new file mode 100644
index 000000000000..4b85da138a06
--- /dev/null
+++ b/kernel/power/em_netlink.c
@@ -0,0 +1,308 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *
+ * Generic netlink for energy model.
+ *
+ * Copyright (c) 2025 Valve Corporation.
+ * Author: Changwoo Min <changwoo@igalia.com>
+ */
+
+#define pr_fmt(fmt) "energy_model: " fmt
+
+#include <linux/energy_model.h>
+#include <net/sock.h>
+#include <net/genetlink.h>
+#include <uapi/linux/energy_model.h>
+
+#include "em_netlink.h"
+#include "em_netlink_autogen.h"
+
+#define EM_A_PD_CPUS_LEN 256
+
+/*************************** Command encoding ********************************/
+static int __em_nl_get_pd_size(struct em_perf_domain *pd, void *data)
+{
+ char cpus_buf[EM_A_PD_CPUS_LEN];
+ int *tot_msg_sz = data;
+ int msg_sz, cpus_sz;
+
+ cpus_sz = snprintf(cpus_buf, sizeof(cpus_buf), "%*pb",
+ cpumask_pr_args(to_cpumask(pd->cpus)));
+
+ msg_sz = nla_total_size(0) + /* EM_A_PDS_PD */
+ nla_total_size(sizeof(u32)) + /* EM_A_PD_PD_ID */
+ nla_total_size_64bit(sizeof(u64)) + /* EM_A_PD_FLAGS */
+ nla_total_size(cpus_sz); /* EM_A_PD_CPUS */
+
+ *tot_msg_sz += nlmsg_total_size(genlmsg_msg_size(msg_sz));
+ return 0;
+}
+
+static int __em_nl_get_pd(struct em_perf_domain *pd, void *data)
+{
+ char cpus_buf[EM_A_PD_CPUS_LEN];
+ struct sk_buff *msg = data;
+ struct nlattr *entry;
+
+ entry = nla_nest_start(msg, EM_A_PDS_PD);
+ if (!entry)
+ goto out_cancel_nest;
+
+ if (nla_put_u32(msg, EM_A_PD_PD_ID, pd->id))
+ goto out_cancel_nest;
+
+ if (nla_put_u64_64bit(msg, EM_A_PD_FLAGS, pd->flags, EM_A_PD_PAD))
+ goto out_cancel_nest;
+
+ snprintf(cpus_buf, sizeof(cpus_buf), "%*pb",
+ cpumask_pr_args(to_cpumask(pd->cpus)));
+ if (nla_put_string(msg, EM_A_PD_CPUS, cpus_buf))
+ goto out_cancel_nest;
+
+ nla_nest_end(msg, entry);
+
+ return 0;
+
+out_cancel_nest:
+ nla_nest_cancel(msg, entry);
+
+ return -EMSGSIZE;
+}
+
+int em_nl_get_pds_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct sk_buff *msg;
+ void *hdr;
+ int cmd = info->genlhdr->cmd;
+ int ret = -EMSGSIZE, msg_sz = 0;
+
+ for_each_em_perf_domain(__em_nl_get_pd_size, &msg_sz);
+
+ msg = genlmsg_new(msg_sz, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ hdr = genlmsg_put_reply(msg, info, &em_nl_family, 0, cmd);
+ if (!hdr)
+ goto out_free_msg;
+
+ ret = for_each_em_perf_domain(__em_nl_get_pd, msg);
+ if (ret)
+ goto out_cancel_msg;
+
+ genlmsg_end(msg, hdr);
+
+ return genlmsg_reply(msg, info);
+
+out_cancel_msg:
+ genlmsg_cancel(msg, hdr);
+out_free_msg:
+ nlmsg_free(msg);
+
+ return ret;
+}
+
+static struct em_perf_domain *__em_nl_get_pd_table_id(struct nlattr **attrs)
+{
+ struct em_perf_domain *pd;
+ int id;
+
+ if (!attrs[EM_A_PD_TABLE_PD_ID])
+ return NULL;
+
+ id = nla_get_u32(attrs[EM_A_PD_TABLE_PD_ID]);
+ pd = em_perf_domain_get_by_id(id);
+ return pd;
+}
+
+static int __em_nl_get_pd_table_size(const struct em_perf_domain *pd)
+{
+ int id_sz, ps_sz;
+
+ id_sz = nla_total_size(sizeof(u32)); /* EM_A_PD_TABLE_PD_ID */
+ ps_sz = nla_total_size(0) + /* EM_A_PD_TABLE_PS */
+ nla_total_size_64bit(sizeof(u64)) + /* EM_A_PS_PERFORMANCE */
+ nla_total_size_64bit(sizeof(u64)) + /* EM_A_PS_FREQUENCY */
+ nla_total_size_64bit(sizeof(u64)) + /* EM_A_PS_POWER */
+ nla_total_size_64bit(sizeof(u64)) + /* EM_A_PS_COST */
+ nla_total_size_64bit(sizeof(u64)); /* EM_A_PS_FLAGS */
+ ps_sz *= pd->nr_perf_states;
+
+ return nlmsg_total_size(genlmsg_msg_size(id_sz + ps_sz));
+}
+
+static int __em_nl_get_pd_table(struct sk_buff *msg, const struct em_perf_domain *pd)
+{
+ struct em_perf_state *table, *ps;
+ struct nlattr *entry;
+ int i;
+
+ if (nla_put_u32(msg, EM_A_PD_TABLE_PD_ID, pd->id))
+ goto out_err;
+
+ rcu_read_lock();
+ table = em_perf_state_from_pd((struct em_perf_domain *)pd);
+
+ for (i = 0; i < pd->nr_perf_states; i++) {
+ ps = &table[i];
+
+ entry = nla_nest_start(msg, EM_A_PD_TABLE_PS);
+ if (!entry)
+ goto out_unlock_ps;
+
+ if (nla_put_u64_64bit(msg, EM_A_PS_PERFORMANCE,
+ ps->performance, EM_A_PS_PAD))
+ goto out_cancel_ps_nest;
+ if (nla_put_u64_64bit(msg, EM_A_PS_FREQUENCY,
+ ps->frequency, EM_A_PS_PAD))
+ goto out_cancel_ps_nest;
+ if (nla_put_u64_64bit(msg, EM_A_PS_POWER,
+ ps->power, EM_A_PS_PAD))
+ goto out_cancel_ps_nest;
+ if (nla_put_u64_64bit(msg, EM_A_PS_COST,
+ ps->cost, EM_A_PS_PAD))
+ goto out_cancel_ps_nest;
+ if (nla_put_u64_64bit(msg, EM_A_PS_FLAGS,
+ ps->flags, EM_A_PS_PAD))
+ goto out_cancel_ps_nest;
+
+ nla_nest_end(msg, entry);
+ }
+ rcu_read_unlock();
+ return 0;
+
+out_cancel_ps_nest:
+ nla_nest_cancel(msg, entry);
+out_unlock_ps:
+ rcu_read_unlock();
+out_err:
+ return -EMSGSIZE;
+}
+
+int em_nl_get_pd_table_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ int cmd = info->genlhdr->cmd;
+ int msg_sz, ret = -EMSGSIZE;
+ struct em_perf_domain *pd;
+ struct sk_buff *msg;
+ void *hdr;
+
+ pd = __em_nl_get_pd_table_id(info->attrs);
+ if (!pd)
+ return -EINVAL;
+
+ msg_sz = __em_nl_get_pd_table_size(pd);
+
+ msg = genlmsg_new(msg_sz, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ hdr = genlmsg_put_reply(msg, info, &em_nl_family, 0, cmd);
+ if (!hdr)
+ goto out_free_msg;
+
+ ret = __em_nl_get_pd_table(msg, pd);
+ if (ret)
+ goto out_free_msg;
+
+ genlmsg_end(msg, hdr);
+ return genlmsg_reply(msg, info);
+
+out_free_msg:
+ nlmsg_free(msg);
+ return ret;
+}
+
+
+/**************************** Event encoding *********************************/
+static void __em_notify_pd_table(const struct em_perf_domain *pd, int ntf_type)
+{
+ struct sk_buff *msg;
+ int msg_sz, ret = -EMSGSIZE;
+ void *hdr;
+
+ if (!genl_has_listeners(&em_nl_family, &init_net, EM_NLGRP_EVENT))
+ return;
+
+ msg_sz = __em_nl_get_pd_table_size(pd);
+
+ msg = genlmsg_new(msg_sz, GFP_KERNEL);
+ if (!msg)
+ return;
+
+ hdr = genlmsg_put(msg, 0, 0, &em_nl_family, 0, ntf_type);
+ if (!hdr)
+ goto out_free_msg;
+
+ ret = __em_nl_get_pd_table(msg, pd);
+ if (ret)
+ goto out_free_msg;
+
+ genlmsg_end(msg, hdr);
+
+ genlmsg_multicast(&em_nl_family, msg, 0, EM_NLGRP_EVENT, GFP_KERNEL);
+
+ return;
+
+out_free_msg:
+ nlmsg_free(msg);
+ return;
+}
+
+void em_notify_pd_created(const struct em_perf_domain *pd)
+{
+ __em_notify_pd_table(pd, EM_CMD_PD_CREATED);
+}
+
+void em_notify_pd_updated(const struct em_perf_domain *pd)
+{
+ __em_notify_pd_table(pd, EM_CMD_PD_UPDATED);
+}
+
+static int __em_notify_pd_deleted_size(const struct em_perf_domain *pd)
+{
+ int id_sz = nla_total_size(sizeof(u32)); /* EM_A_PD_TABLE_PD_ID */
+
+ return nlmsg_total_size(genlmsg_msg_size(id_sz));
+}
+
+void em_notify_pd_deleted(const struct em_perf_domain *pd)
+{
+ struct sk_buff *msg;
+ void *hdr;
+ int msg_sz;
+
+ if (!genl_has_listeners(&em_nl_family, &init_net, EM_NLGRP_EVENT))
+ return;
+
+ msg_sz = __em_notify_pd_deleted_size(pd);
+
+ msg = genlmsg_new(msg_sz, GFP_KERNEL);
+ if (!msg)
+ return;
+
+ hdr = genlmsg_put(msg, 0, 0, &em_nl_family, 0, EM_CMD_PD_DELETED);
+ if (!hdr)
+ goto out_free_msg;
+
+ if (nla_put_u32(msg, EM_A_PD_TABLE_PD_ID, pd->id)) {
+ goto out_free_msg;
+ }
+
+ genlmsg_end(msg, hdr);
+
+ genlmsg_multicast(&em_nl_family, msg, 0, EM_NLGRP_EVENT, GFP_KERNEL);
+
+ return;
+
+out_free_msg:
+ nlmsg_free(msg);
+ return;
+}
+
+/**************************** Initialization *********************************/
+static int __init em_netlink_init(void)
+{
+ return genl_register_family(&em_nl_family);
+}
+postcore_initcall(em_netlink_init);
diff --git a/kernel/power/em_netlink.h b/kernel/power/em_netlink.h
new file mode 100644
index 000000000000..583d7f1c3939
--- /dev/null
+++ b/kernel/power/em_netlink.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ *
+ * Generic netlink for energy model.
+ *
+ * Copyright (c) 2025 Valve Corporation.
+ * Author: Changwoo Min <changwoo@igalia.com>
+ */
+#ifndef _EM_NETLINK_H
+#define _EM_NETLINK_H
+
+#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_NET)
+int for_each_em_perf_domain(int (*cb)(struct em_perf_domain*, void *),
+ void *data);
+struct em_perf_domain *em_perf_domain_get_by_id(int id);
+void em_notify_pd_created(const struct em_perf_domain *pd);
+void em_notify_pd_deleted(const struct em_perf_domain *pd);
+void em_notify_pd_updated(const struct em_perf_domain *pd);
+#else
+static inline
+int for_each_em_perf_domain(int (*cb)(struct em_perf_domain*, void *),
+ void *data)
+{
+ return -EINVAL;
+}
+static inline
+struct em_perf_domain *em_perf_domain_get_by_id(int id)
+{
+ return NULL;
+}
+
+static inline void em_notify_pd_created(const struct em_perf_domain *pd) {}
+
+static inline void em_notify_pd_deleted(const struct em_perf_domain *pd) {}
+
+static inline void em_notify_pd_updated(const struct em_perf_domain *pd) {}
+#endif
+
+#endif /* _EM_NETLINK_H */
diff --git a/kernel/power/em_netlink_autogen.c b/kernel/power/em_netlink_autogen.c
new file mode 100644
index 000000000000..a7a09ab1d1c2
--- /dev/null
+++ b/kernel/power/em_netlink_autogen.c
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)
+/* Do not edit directly, auto-generated from: */
+/* Documentation/netlink/specs/em.yaml */
+/* YNL-GEN kernel source */
+
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include "em_netlink_autogen.h"
+
+#include <uapi/linux/energy_model.h>
+
+/* EM_CMD_GET_PD_TABLE - do */
+static const struct nla_policy em_get_pd_table_nl_policy[EM_A_PD_TABLE_PD_ID + 1] = {
+ [EM_A_PD_TABLE_PD_ID] = { .type = NLA_U32, },
+};
+
+/* Ops table for em */
+static const struct genl_split_ops em_nl_ops[] = {
+ {
+ .cmd = EM_CMD_GET_PDS,
+ .doit = em_nl_get_pds_doit,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = EM_CMD_GET_PD_TABLE,
+ .doit = em_nl_get_pd_table_doit,
+ .policy = em_get_pd_table_nl_policy,
+ .maxattr = EM_A_PD_TABLE_PD_ID,
+ .flags = GENL_CMD_CAP_DO,
+ },
+};
+
+static const struct genl_multicast_group em_nl_mcgrps[] = {
+ [EM_NLGRP_EVENT] = { "event", },
+};
+
+struct genl_family em_nl_family __ro_after_init = {
+ .name = EM_FAMILY_NAME,
+ .version = EM_FAMILY_VERSION,
+ .netnsok = true,
+ .parallel_ops = true,
+ .module = THIS_MODULE,
+ .split_ops = em_nl_ops,
+ .n_split_ops = ARRAY_SIZE(em_nl_ops),
+ .mcgrps = em_nl_mcgrps,
+ .n_mcgrps = ARRAY_SIZE(em_nl_mcgrps),
+};
diff --git a/kernel/power/em_netlink_autogen.h b/kernel/power/em_netlink_autogen.h
new file mode 100644
index 000000000000..78ce609641f1
--- /dev/null
+++ b/kernel/power/em_netlink_autogen.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */
+/* Do not edit directly, auto-generated from: */
+/* Documentation/netlink/specs/em.yaml */
+/* YNL-GEN kernel header */
+
+#ifndef _LINUX_EM_GEN_H
+#define _LINUX_EM_GEN_H
+
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include <uapi/linux/energy_model.h>
+
+int em_nl_get_pds_doit(struct sk_buff *skb, struct genl_info *info);
+int em_nl_get_pd_table_doit(struct sk_buff *skb, struct genl_info *info);
+
+enum {
+ EM_NLGRP_EVENT,
+};
+
+extern struct genl_family em_nl_family;
+
+#endif /* _LINUX_EM_GEN_H */
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
new file mode 100644
index 000000000000..11af9f64aa82
--- /dev/null
+++ b/kernel/power/energy_model.c
@@ -0,0 +1,1046 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Energy Model of devices
+ *
+ * Copyright (c) 2018-2021, Arm ltd.
+ * Written by: Quentin Perret, Arm ltd.
+ * Improvements provided by: Lukasz Luba, Arm ltd.
+ */
+
+#define pr_fmt(fmt) "energy_model: " fmt
+
+#include <linux/cpu.h>
+#include <linux/cpufreq.h>
+#include <linux/cpumask.h>
+#include <linux/debugfs.h>
+#include <linux/energy_model.h>
+#include <linux/sched/topology.h>
+#include <linux/slab.h>
+
+#include "em_netlink.h"
+
+/*
+ * Mutex serializing the registrations of performance domains and letting
+ * callbacks defined by drivers sleep.
+ */
+static DEFINE_MUTEX(em_pd_mutex);
+
+/*
+ * Manage performance domains with IDs. One can iterate the performance domains
+ * through the list and pick one with their associated ID. The mutex serializes
+ * the list access. When holding em_pd_list_mutex, em_pd_mutex should not be
+ * taken to avoid potential deadlock.
+ */
+static DEFINE_IDA(em_pd_ida);
+static LIST_HEAD(em_pd_list);
+static DEFINE_MUTEX(em_pd_list_mutex);
+
+static void em_cpufreq_update_efficiencies(struct device *dev,
+ struct em_perf_state *table);
+static void em_check_capacity_update(void);
+static void em_update_workfn(struct work_struct *work);
+static DECLARE_DELAYED_WORK(em_update_work, em_update_workfn);
+
+static bool _is_cpu_device(struct device *dev)
+{
+ return (dev->bus == &cpu_subsys);
+}
+
+#ifdef CONFIG_DEBUG_FS
+static struct dentry *rootdir;
+
+struct em_dbg_info {
+ struct em_perf_domain *pd;
+ int ps_id;
+};
+
+#define DEFINE_EM_DBG_SHOW(name, fname) \
+static int em_debug_##fname##_show(struct seq_file *s, void *unused) \
+{ \
+ struct em_dbg_info *em_dbg = s->private; \
+ struct em_perf_state *table; \
+ unsigned long val; \
+ \
+ rcu_read_lock(); \
+ table = em_perf_state_from_pd(em_dbg->pd); \
+ val = table[em_dbg->ps_id].name; \
+ rcu_read_unlock(); \
+ \
+ seq_printf(s, "%lu\n", val); \
+ return 0; \
+} \
+DEFINE_SHOW_ATTRIBUTE(em_debug_##fname)
+
+DEFINE_EM_DBG_SHOW(frequency, frequency);
+DEFINE_EM_DBG_SHOW(power, power);
+DEFINE_EM_DBG_SHOW(cost, cost);
+DEFINE_EM_DBG_SHOW(performance, performance);
+DEFINE_EM_DBG_SHOW(flags, inefficiency);
+
+static void em_debug_create_ps(struct em_perf_domain *em_pd,
+ struct em_dbg_info *em_dbg, int i,
+ struct dentry *pd)
+{
+ struct em_perf_state *table;
+ unsigned long freq;
+ struct dentry *d;
+ char name[24];
+
+ em_dbg[i].pd = em_pd;
+ em_dbg[i].ps_id = i;
+
+ rcu_read_lock();
+ table = em_perf_state_from_pd(em_pd);
+ freq = table[i].frequency;
+ rcu_read_unlock();
+
+ snprintf(name, sizeof(name), "ps:%lu", freq);
+
+ /* Create per-ps directory */
+ d = debugfs_create_dir(name, pd);
+ debugfs_create_file("frequency", 0444, d, &em_dbg[i],
+ &em_debug_frequency_fops);
+ debugfs_create_file("power", 0444, d, &em_dbg[i],
+ &em_debug_power_fops);
+ debugfs_create_file("cost", 0444, d, &em_dbg[i],
+ &em_debug_cost_fops);
+ debugfs_create_file("performance", 0444, d, &em_dbg[i],
+ &em_debug_performance_fops);
+ debugfs_create_file("inefficient", 0444, d, &em_dbg[i],
+ &em_debug_inefficiency_fops);
+}
+
+static int em_debug_cpus_show(struct seq_file *s, void *unused)
+{
+ seq_printf(s, "%*pbl\n", cpumask_pr_args(to_cpumask(s->private)));
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(em_debug_cpus);
+
+static int em_debug_flags_show(struct seq_file *s, void *unused)
+{
+ struct em_perf_domain *pd = s->private;
+
+ seq_printf(s, "%#lx\n", pd->flags);
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(em_debug_flags);
+
+static int em_debug_id_show(struct seq_file *s, void *unused)
+{
+ struct em_perf_domain *pd = s->private;
+
+ seq_printf(s, "%d\n", pd->id);
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(em_debug_id);
+
+static void em_debug_create_pd(struct device *dev)
+{
+ struct em_dbg_info *em_dbg;
+ struct dentry *d;
+ int i;
+
+ /* Create the directory of the performance domain */
+ d = debugfs_create_dir(dev_name(dev), rootdir);
+
+ if (_is_cpu_device(dev))
+ debugfs_create_file("cpus", 0444, d, dev->em_pd->cpus,
+ &em_debug_cpus_fops);
+
+ debugfs_create_file("flags", 0444, d, dev->em_pd,
+ &em_debug_flags_fops);
+
+ debugfs_create_file("id", 0444, d, dev->em_pd, &em_debug_id_fops);
+
+ em_dbg = devm_kcalloc(dev, dev->em_pd->nr_perf_states,
+ sizeof(*em_dbg), GFP_KERNEL);
+ if (!em_dbg)
+ return;
+
+ /* Create a sub-directory for each performance state */
+ for (i = 0; i < dev->em_pd->nr_perf_states; i++)
+ em_debug_create_ps(dev->em_pd, em_dbg, i, d);
+
+}
+
+static void em_debug_remove_pd(struct device *dev)
+{
+ debugfs_lookup_and_remove(dev_name(dev), rootdir);
+}
+
+static int __init em_debug_init(void)
+{
+ /* Create /sys/kernel/debug/energy_model directory */
+ rootdir = debugfs_create_dir("energy_model", NULL);
+
+ return 0;
+}
+fs_initcall(em_debug_init);
+#else /* CONFIG_DEBUG_FS */
+static void em_debug_create_pd(struct device *dev) {}
+static void em_debug_remove_pd(struct device *dev) {}
+#endif
+
+static void em_release_table_kref(struct kref *kref)
+{
+ /* It was the last owner of this table so we can free */
+ kfree_rcu(container_of(kref, struct em_perf_table, kref), rcu);
+}
+
+/**
+ * em_table_free() - Handles safe free of the EM table when needed
+ * @table : EM table which is going to be freed
+ *
+ * No return values.
+ */
+void em_table_free(struct em_perf_table *table)
+{
+ kref_put(&table->kref, em_release_table_kref);
+}
+
+/**
+ * em_table_alloc() - Allocate a new EM table
+ * @pd : EM performance domain for which this must be done
+ *
+ * Allocate a new EM table and initialize its kref to indicate that it
+ * has a user.
+ * Returns allocated table or NULL.
+ */
+struct em_perf_table *em_table_alloc(struct em_perf_domain *pd)
+{
+ struct em_perf_table *table;
+ int table_size;
+
+ table_size = sizeof(struct em_perf_state) * pd->nr_perf_states;
+
+ table = kzalloc(sizeof(*table) + table_size, GFP_KERNEL);
+ if (!table)
+ return NULL;
+
+ kref_init(&table->kref);
+
+ return table;
+}
+
+static void em_init_performance(struct device *dev, struct em_perf_domain *pd,
+ struct em_perf_state *table, int nr_states)
+{
+ u64 fmax, max_cap;
+ int i, cpu;
+
+ /* This is needed only for CPUs and EAS skip other devices */
+ if (!_is_cpu_device(dev))
+ return;
+
+ cpu = cpumask_first(em_span_cpus(pd));
+
+ /*
+ * Calculate the performance value for each frequency with
+ * linear relationship. The final CPU capacity might not be ready at
+ * boot time, but the EM will be updated a bit later with correct one.
+ */
+ fmax = (u64) table[nr_states - 1].frequency;
+ max_cap = (u64) arch_scale_cpu_capacity(cpu);
+ for (i = 0; i < nr_states; i++)
+ table[i].performance = div64_u64(max_cap * table[i].frequency,
+ fmax);
+}
+
+static int em_compute_costs(struct device *dev, struct em_perf_state *table,
+ const struct em_data_callback *cb, int nr_states,
+ unsigned long flags)
+{
+ unsigned long prev_cost = ULONG_MAX;
+ int i, ret;
+
+ /* This is needed only for CPUs and EAS skip other devices */
+ if (!_is_cpu_device(dev))
+ return 0;
+
+ /* Compute the cost of each performance state. */
+ for (i = nr_states - 1; i >= 0; i--) {
+ unsigned long power_res, cost;
+
+ if ((flags & EM_PERF_DOMAIN_ARTIFICIAL) && cb->get_cost) {
+ ret = cb->get_cost(dev, table[i].frequency, &cost);
+ if (ret || !cost || cost > EM_MAX_POWER) {
+ dev_err(dev, "EM: invalid cost %lu %d\n",
+ cost, ret);
+ return -EINVAL;
+ }
+ } else {
+ /* increase resolution of 'cost' precision */
+ power_res = table[i].power * 10;
+ cost = power_res / table[i].performance;
+ }
+
+ table[i].cost = cost;
+
+ if (table[i].cost >= prev_cost) {
+ table[i].flags = EM_PERF_STATE_INEFFICIENT;
+ dev_dbg(dev, "EM: OPP:%lu is inefficient\n",
+ table[i].frequency);
+ } else {
+ prev_cost = table[i].cost;
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * em_dev_compute_costs() - Calculate cost values for new runtime EM table
+ * @dev : Device for which the EM table is to be updated
+ * @table : The new EM table that is going to get the costs calculated
+ * @nr_states : Number of performance states
+ *
+ * Calculate the em_perf_state::cost values for new runtime EM table. The
+ * values are used for EAS during task placement. It also calculates and sets
+ * the efficiency flag for each performance state. When the function finish
+ * successfully the EM table is ready to be updated and used by EAS.
+ *
+ * Return 0 on success or a proper error in case of failure.
+ */
+int em_dev_compute_costs(struct device *dev, struct em_perf_state *table,
+ int nr_states)
+{
+ return em_compute_costs(dev, table, NULL, nr_states, 0);
+}
+
+/**
+ * em_dev_update_perf_domain() - Update runtime EM table for a device
+ * @dev : Device for which the EM is to be updated
+ * @new_table : The new EM table that is going to be used from now
+ *
+ * Update EM runtime modifiable table for the @dev using the provided @table.
+ *
+ * This function uses a mutex to serialize writers, so it must not be called
+ * from a non-sleeping context.
+ *
+ * Return 0 on success or an error code on failure.
+ */
+int em_dev_update_perf_domain(struct device *dev,
+ struct em_perf_table *new_table)
+{
+ struct em_perf_table *old_table;
+ struct em_perf_domain *pd;
+
+ if (!dev)
+ return -EINVAL;
+
+ /* Serialize update/unregister or concurrent updates */
+ mutex_lock(&em_pd_mutex);
+
+ if (!dev->em_pd) {
+ mutex_unlock(&em_pd_mutex);
+ return -EINVAL;
+ }
+ pd = dev->em_pd;
+
+ kref_get(&new_table->kref);
+
+ old_table = rcu_dereference_protected(pd->em_table,
+ lockdep_is_held(&em_pd_mutex));
+ rcu_assign_pointer(pd->em_table, new_table);
+
+ em_cpufreq_update_efficiencies(dev, new_table->state);
+
+ em_table_free(old_table);
+
+ mutex_unlock(&em_pd_mutex);
+
+ em_notify_pd_updated(pd);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(em_dev_update_perf_domain);
+
+static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
+ struct em_perf_state *table,
+ const struct em_data_callback *cb,
+ unsigned long flags)
+{
+ unsigned long power, freq, prev_freq = 0;
+ int nr_states = pd->nr_perf_states;
+ int i, ret;
+
+ /* Build the list of performance states for this performance domain */
+ for (i = 0, freq = 0; i < nr_states; i++, freq++) {
+ /*
+ * active_power() is a driver callback which ceils 'freq' to
+ * lowest performance state of 'dev' above 'freq' and updates
+ * 'power' and 'freq' accordingly.
+ */
+ ret = cb->active_power(dev, &power, &freq);
+ if (ret) {
+ dev_err(dev, "EM: invalid perf. state: %d\n",
+ ret);
+ return -EINVAL;
+ }
+
+ /*
+ * We expect the driver callback to increase the frequency for
+ * higher performance states.
+ */
+ if (freq <= prev_freq) {
+ dev_err(dev, "EM: non-increasing freq: %lu\n",
+ freq);
+ return -EINVAL;
+ }
+
+ /*
+ * The power returned by active_state() is expected to be
+ * positive and be in range.
+ */
+ if (!power || power > EM_MAX_POWER) {
+ dev_err(dev, "EM: invalid power: %lu\n",
+ power);
+ return -EINVAL;
+ }
+
+ table[i].power = power;
+ table[i].frequency = prev_freq = freq;
+ }
+
+ em_init_performance(dev, pd, table, nr_states);
+
+ ret = em_compute_costs(dev, table, cb, nr_states, flags);
+ if (ret)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int em_create_pd(struct device *dev, int nr_states,
+ const struct em_data_callback *cb,
+ const cpumask_t *cpus,
+ unsigned long flags)
+{
+ struct em_perf_table *em_table;
+ struct em_perf_domain *pd;
+ struct device *cpu_dev;
+ int cpu, ret, num_cpus, id;
+
+ if (_is_cpu_device(dev)) {
+ num_cpus = cpumask_weight(cpus);
+
+ /* Prevent max possible energy calculation to not overflow */
+ if (num_cpus > EM_MAX_NUM_CPUS) {
+ dev_err(dev, "EM: too many CPUs, overflow possible\n");
+ return -EINVAL;
+ }
+
+ pd = kzalloc(sizeof(*pd) + cpumask_size(), GFP_KERNEL);
+ if (!pd)
+ return -ENOMEM;
+
+ cpumask_copy(em_span_cpus(pd), cpus);
+ } else {
+ pd = kzalloc(sizeof(*pd), GFP_KERNEL);
+ if (!pd)
+ return -ENOMEM;
+ }
+
+ pd->nr_perf_states = nr_states;
+
+ INIT_LIST_HEAD(&pd->node);
+
+ id = ida_alloc(&em_pd_ida, GFP_KERNEL);
+ if (id < 0)
+ return -ENOMEM;
+ pd->id = id;
+
+ em_table = em_table_alloc(pd);
+ if (!em_table)
+ goto free_pd;
+
+ ret = em_create_perf_table(dev, pd, em_table->state, cb, flags);
+ if (ret)
+ goto free_pd_table;
+
+ rcu_assign_pointer(pd->em_table, em_table);
+
+ if (_is_cpu_device(dev))
+ for_each_cpu(cpu, cpus) {
+ cpu_dev = get_cpu_device(cpu);
+ cpu_dev->em_pd = pd;
+ }
+
+ dev->em_pd = pd;
+
+ return 0;
+
+free_pd_table:
+ kfree(em_table);
+free_pd:
+ kfree(pd);
+ ida_free(&em_pd_ida, id);
+ return -EINVAL;
+}
+
+static void
+em_cpufreq_update_efficiencies(struct device *dev, struct em_perf_state *table)
+{
+ struct em_perf_domain *pd = dev->em_pd;
+ struct cpufreq_policy *policy;
+ int found = 0;
+ int i, cpu;
+
+ if (!_is_cpu_device(dev))
+ return;
+
+ /* Try to get a CPU which is active and in this PD */
+ cpu = cpumask_first_and(em_span_cpus(pd), cpu_active_mask);
+ if (cpu >= nr_cpu_ids) {
+ dev_warn(dev, "EM: No online CPU for CPUFreq policy\n");
+ return;
+ }
+
+ policy = cpufreq_cpu_get(cpu);
+ if (!policy) {
+ dev_warn(dev, "EM: Access to CPUFreq policy failed\n");
+ return;
+ }
+
+ for (i = 0; i < pd->nr_perf_states; i++) {
+ if (!(table[i].flags & EM_PERF_STATE_INEFFICIENT))
+ continue;
+
+ if (!cpufreq_table_set_inefficient(policy, table[i].frequency))
+ found++;
+ }
+
+ cpufreq_cpu_put(policy);
+
+ if (!found)
+ return;
+
+ /*
+ * Efficiencies have been installed in CPUFreq, inefficient frequencies
+ * will be skipped. The EM can do the same.
+ */
+ pd->flags |= EM_PERF_DOMAIN_SKIP_INEFFICIENCIES;
+}
+
+/**
+ * em_pd_get() - Return the performance domain for a device
+ * @dev : Device to find the performance domain for
+ *
+ * Returns the performance domain to which @dev belongs, or NULL if it doesn't
+ * exist.
+ */
+struct em_perf_domain *em_pd_get(struct device *dev)
+{
+ if (IS_ERR_OR_NULL(dev))
+ return NULL;
+
+ return dev->em_pd;
+}
+EXPORT_SYMBOL_GPL(em_pd_get);
+
+/**
+ * em_cpu_get() - Return the performance domain for a CPU
+ * @cpu : CPU to find the performance domain for
+ *
+ * Returns the performance domain to which @cpu belongs, or NULL if it doesn't
+ * exist.
+ */
+struct em_perf_domain *em_cpu_get(int cpu)
+{
+ struct device *cpu_dev;
+
+ cpu_dev = get_cpu_device(cpu);
+ if (!cpu_dev)
+ return NULL;
+
+ return em_pd_get(cpu_dev);
+}
+EXPORT_SYMBOL_GPL(em_cpu_get);
+
+/**
+ * em_dev_register_perf_domain() - Register the Energy Model (EM) for a device
+ * @dev : Device for which the EM is to register
+ * @nr_states : Number of performance states to register
+ * @cb : Callback functions providing the data of the Energy Model
+ * @cpus : Pointer to cpumask_t, which in case of a CPU device is
+ * obligatory. It can be taken from i.e. 'policy->cpus'. For other
+ * type of devices this should be set to NULL.
+ * @microwatts : Flag indicating that the power values are in micro-Watts or
+ * in some other scale. It must be set properly.
+ *
+ * Create Energy Model tables for a performance domain using the callbacks
+ * defined in cb.
+ *
+ * The @microwatts is important to set with correct value. Some kernel
+ * sub-systems might rely on this flag and check if all devices in the EM are
+ * using the same scale.
+ *
+ * If multiple clients register the same performance domain, all but the first
+ * registration will be ignored.
+ *
+ * Return 0 on success
+ */
+int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
+ const struct em_data_callback *cb,
+ const cpumask_t *cpus, bool microwatts)
+{
+ int ret = em_dev_register_pd_no_update(dev, nr_states, cb, cpus, microwatts);
+
+ if (_is_cpu_device(dev))
+ em_check_capacity_update();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(em_dev_register_perf_domain);
+
+/**
+ * em_dev_register_pd_no_update() - Register a perf domain for a device
+ * @dev : Device to register the PD for
+ * @nr_states : Number of performance states in the new PD
+ * @cb : Callback functions for populating the energy model
+ * @cpus : CPUs to include in the new PD (mandatory if @dev is a CPU device)
+ * @microwatts : Whether or not the power values in the EM will be in uW
+ *
+ * Like em_dev_register_perf_domain(), but does not trigger a CPU capacity
+ * update after registering the PD, even if @dev is a CPU device.
+ */
+int em_dev_register_pd_no_update(struct device *dev, unsigned int nr_states,
+ const struct em_data_callback *cb,
+ const cpumask_t *cpus, bool microwatts)
+{
+ struct em_perf_table *em_table;
+ unsigned long cap, prev_cap = 0;
+ unsigned long flags = 0;
+ int cpu, ret;
+
+ if (!dev || !nr_states || !cb)
+ return -EINVAL;
+
+ /*
+ * Use a mutex to serialize the registration of performance domains and
+ * let the driver-defined callback functions sleep.
+ */
+ mutex_lock(&em_pd_mutex);
+
+ if (dev->em_pd) {
+ ret = -EEXIST;
+ goto unlock;
+ }
+
+ if (_is_cpu_device(dev)) {
+ if (!cpus) {
+ dev_err(dev, "EM: invalid CPU mask\n");
+ ret = -EINVAL;
+ goto unlock;
+ }
+
+ for_each_cpu(cpu, cpus) {
+ if (em_cpu_get(cpu)) {
+ dev_err(dev, "EM: exists for CPU%d\n", cpu);
+ ret = -EEXIST;
+ goto unlock;
+ }
+ /*
+ * All CPUs of a domain must have the same
+ * micro-architecture since they all share the same
+ * table.
+ */
+ cap = arch_scale_cpu_capacity(cpu);
+ if (prev_cap && prev_cap != cap) {
+ dev_err(dev, "EM: CPUs of %*pbl must have the same capacity\n",
+ cpumask_pr_args(cpus));
+
+ ret = -EINVAL;
+ goto unlock;
+ }
+ prev_cap = cap;
+ }
+ }
+
+ if (microwatts)
+ flags |= EM_PERF_DOMAIN_MICROWATTS;
+ else if (cb->get_cost)
+ flags |= EM_PERF_DOMAIN_ARTIFICIAL;
+
+ /*
+ * EM only supports uW (exception is artificial EM).
+ * Therefore, check and force the drivers to provide
+ * power in uW.
+ */
+ if (!microwatts && !(flags & EM_PERF_DOMAIN_ARTIFICIAL)) {
+ dev_err(dev, "EM: only supports uW power values\n");
+ ret = -EINVAL;
+ goto unlock;
+ }
+
+ ret = em_create_pd(dev, nr_states, cb, cpus, flags);
+ if (ret)
+ goto unlock;
+
+ dev->em_pd->flags |= flags;
+ dev->em_pd->min_perf_state = 0;
+ dev->em_pd->max_perf_state = nr_states - 1;
+
+ em_table = rcu_dereference_protected(dev->em_pd->em_table,
+ lockdep_is_held(&em_pd_mutex));
+ em_cpufreq_update_efficiencies(dev, em_table->state);
+
+ em_debug_create_pd(dev);
+ dev_info(dev, "EM: created perf domain\n");
+
+unlock:
+ mutex_unlock(&em_pd_mutex);
+ if (ret)
+ return ret;
+
+ mutex_lock(&em_pd_list_mutex);
+ list_add_tail(&dev->em_pd->node, &em_pd_list);
+ mutex_unlock(&em_pd_list_mutex);
+
+ em_notify_pd_created(dev->em_pd);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(em_dev_register_pd_no_update);
+
+/**
+ * em_dev_unregister_perf_domain() - Unregister Energy Model (EM) for a device
+ * @dev : Device for which the EM is registered
+ *
+ * Unregister the EM for the specified @dev (but not a CPU device).
+ */
+void em_dev_unregister_perf_domain(struct device *dev)
+{
+ if (IS_ERR_OR_NULL(dev) || !dev->em_pd)
+ return;
+
+ if (_is_cpu_device(dev))
+ return;
+
+ mutex_lock(&em_pd_list_mutex);
+ list_del_init(&dev->em_pd->node);
+ mutex_unlock(&em_pd_list_mutex);
+
+ em_notify_pd_deleted(dev->em_pd);
+
+ /*
+ * The mutex separates all register/unregister requests and protects
+ * from potential clean-up/setup issues in the debugfs directories.
+ * The debugfs directory name is the same as device's name.
+ */
+ mutex_lock(&em_pd_mutex);
+ em_debug_remove_pd(dev);
+
+ em_table_free(rcu_dereference_protected(dev->em_pd->em_table,
+ lockdep_is_held(&em_pd_mutex)));
+
+ ida_free(&em_pd_ida, dev->em_pd->id);
+
+ kfree(dev->em_pd);
+ dev->em_pd = NULL;
+ mutex_unlock(&em_pd_mutex);
+}
+EXPORT_SYMBOL_GPL(em_dev_unregister_perf_domain);
+
+static struct em_perf_table *em_table_dup(struct em_perf_domain *pd)
+{
+ struct em_perf_table *em_table;
+ struct em_perf_state *ps, *new_ps;
+ int ps_size;
+
+ em_table = em_table_alloc(pd);
+ if (!em_table)
+ return NULL;
+
+ new_ps = em_table->state;
+
+ rcu_read_lock();
+ ps = em_perf_state_from_pd(pd);
+ /* Initialize data based on old table */
+ ps_size = sizeof(struct em_perf_state) * pd->nr_perf_states;
+ memcpy(new_ps, ps, ps_size);
+
+ rcu_read_unlock();
+
+ return em_table;
+}
+
+static int em_recalc_and_update(struct device *dev, struct em_perf_domain *pd,
+ struct em_perf_table *em_table)
+{
+ int ret;
+
+ if (!em_is_artificial(pd)) {
+ ret = em_compute_costs(dev, em_table->state, NULL,
+ pd->nr_perf_states, pd->flags);
+ if (ret)
+ goto free_em_table;
+ }
+
+ ret = em_dev_update_perf_domain(dev, em_table);
+ if (ret)
+ goto free_em_table;
+
+ /*
+ * This is one-time-update, so give up the ownership in this updater.
+ * The EM framework has incremented the usage counter and from now
+ * will keep the reference (then free the memory when needed).
+ */
+free_em_table:
+ em_table_free(em_table);
+ return ret;
+}
+
+/*
+ * Adjustment of CPU performance values after boot, when all CPUs capacites
+ * are correctly calculated.
+ */
+static void em_adjust_new_capacity(unsigned int cpu, struct device *dev,
+ struct em_perf_domain *pd)
+{
+ unsigned long cpu_capacity = arch_scale_cpu_capacity(cpu);
+ struct em_perf_table *em_table;
+ struct em_perf_state *table;
+ unsigned long em_max_perf;
+
+ rcu_read_lock();
+ table = em_perf_state_from_pd(pd);
+ em_max_perf = table[pd->nr_perf_states - 1].performance;
+ rcu_read_unlock();
+
+ if (em_max_perf == cpu_capacity)
+ return;
+
+ pr_debug("updating cpu%d cpu_cap=%lu old capacity=%lu\n", cpu,
+ cpu_capacity, em_max_perf);
+
+ em_table = em_table_dup(pd);
+ if (!em_table) {
+ dev_warn(dev, "EM: allocation failed\n");
+ return;
+ }
+
+ em_init_performance(dev, pd, em_table->state, pd->nr_perf_states);
+
+ em_recalc_and_update(dev, pd, em_table);
+}
+
+/**
+ * em_adjust_cpu_capacity() - Adjust the EM for a CPU after a capacity update.
+ * @cpu: Target CPU.
+ *
+ * Adjust the existing EM for @cpu after a capacity update under the assumption
+ * that the capacity has been updated in the same way for all of the CPUs in
+ * the same perf domain.
+ */
+void em_adjust_cpu_capacity(unsigned int cpu)
+{
+ struct device *dev = get_cpu_device(cpu);
+ struct em_perf_domain *pd;
+
+ pd = em_pd_get(dev);
+ if (pd)
+ em_adjust_new_capacity(cpu, dev, pd);
+}
+
+static void em_check_capacity_update(void)
+{
+ cpumask_var_t cpu_done_mask;
+ int cpu, failed_cpus = 0;
+
+ if (!zalloc_cpumask_var(&cpu_done_mask, GFP_KERNEL)) {
+ pr_warn("no free memory\n");
+ return;
+ }
+
+ /* Check if CPUs capacity has changed than update EM */
+ for_each_possible_cpu(cpu) {
+ struct cpufreq_policy *policy;
+ struct em_perf_domain *pd;
+ struct device *dev;
+
+ if (cpumask_test_cpu(cpu, cpu_done_mask))
+ continue;
+
+ policy = cpufreq_cpu_get(cpu);
+ if (!policy) {
+ failed_cpus++;
+ continue;
+ }
+ cpufreq_cpu_put(policy);
+
+ dev = get_cpu_device(cpu);
+ pd = em_pd_get(dev);
+ if (!pd || em_is_artificial(pd))
+ continue;
+
+ cpumask_or(cpu_done_mask, cpu_done_mask,
+ em_span_cpus(pd));
+
+ em_adjust_new_capacity(cpu, dev, pd);
+ }
+
+ if (failed_cpus)
+ schedule_delayed_work(&em_update_work, msecs_to_jiffies(1000));
+
+ free_cpumask_var(cpu_done_mask);
+}
+
+static void em_update_workfn(struct work_struct *work)
+{
+ em_check_capacity_update();
+}
+
+/**
+ * em_dev_update_chip_binning() - Update Energy Model after the new voltage
+ * information is present in the OPPs.
+ * @dev : Device for which the Energy Model has to be updated.
+ *
+ * This function allows to update easily the EM with new values available in
+ * the OPP framework and DT. It can be used after the chip has been properly
+ * verified by device drivers and the voltages adjusted for the 'chip binning'.
+ */
+int em_dev_update_chip_binning(struct device *dev)
+{
+ struct em_perf_table *em_table;
+ struct em_perf_domain *pd;
+ int i, ret;
+
+ if (IS_ERR_OR_NULL(dev))
+ return -EINVAL;
+
+ pd = em_pd_get(dev);
+ if (!pd) {
+ dev_warn(dev, "Couldn't find Energy Model\n");
+ return -EINVAL;
+ }
+
+ em_table = em_table_dup(pd);
+ if (!em_table) {
+ dev_warn(dev, "EM: allocation failed\n");
+ return -ENOMEM;
+ }
+
+ /* Update power values which might change due to new voltage in OPPs */
+ for (i = 0; i < pd->nr_perf_states; i++) {
+ unsigned long freq = em_table->state[i].frequency;
+ unsigned long power;
+
+ ret = dev_pm_opp_calc_power(dev, &power, &freq);
+ if (ret) {
+ em_table_free(em_table);
+ return ret;
+ }
+
+ em_table->state[i].power = power;
+ }
+
+ return em_recalc_and_update(dev, pd, em_table);
+}
+EXPORT_SYMBOL_GPL(em_dev_update_chip_binning);
+
+
+/**
+ * em_update_performance_limits() - Update Energy Model with performance
+ * limits information.
+ * @pd : Performance Domain with EM that has to be updated.
+ * @freq_min_khz : New minimum allowed frequency for this device.
+ * @freq_max_khz : New maximum allowed frequency for this device.
+ *
+ * This function allows to update the EM with information about available
+ * performance levels. It takes the minimum and maximum frequency in kHz
+ * and does internal translation to performance levels.
+ * Returns 0 on success or -EINVAL when failed.
+ */
+int em_update_performance_limits(struct em_perf_domain *pd,
+ unsigned long freq_min_khz, unsigned long freq_max_khz)
+{
+ struct em_perf_state *table;
+ int min_ps = -1;
+ int max_ps = -1;
+ int i;
+
+ if (!pd)
+ return -EINVAL;
+
+ rcu_read_lock();
+ table = em_perf_state_from_pd(pd);
+
+ for (i = 0; i < pd->nr_perf_states; i++) {
+ if (freq_min_khz == table[i].frequency)
+ min_ps = i;
+ if (freq_max_khz == table[i].frequency)
+ max_ps = i;
+ }
+ rcu_read_unlock();
+
+ /* Only update when both are found and sane */
+ if (min_ps < 0 || max_ps < 0 || max_ps < min_ps)
+ return -EINVAL;
+
+
+ /* Guard simultaneous updates and make them atomic */
+ mutex_lock(&em_pd_mutex);
+ pd->min_perf_state = min_ps;
+ pd->max_perf_state = max_ps;
+ mutex_unlock(&em_pd_mutex);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(em_update_performance_limits);
+
+static void rebuild_sd_workfn(struct work_struct *work)
+{
+ rebuild_sched_domains_energy();
+}
+
+void em_rebuild_sched_domains(void)
+{
+ static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn);
+
+ /*
+ * When called from the cpufreq_register_driver() path, the
+ * cpu_hotplug_lock is already held, so use a work item to
+ * avoid nested locking in rebuild_sched_domains().
+ */
+ schedule_work(&rebuild_sd_work);
+}
+
+#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_NET)
+int for_each_em_perf_domain(int (*cb)(struct em_perf_domain*, void *),
+ void *data)
+{
+ struct em_perf_domain *pd;
+
+ lockdep_assert_not_held(&em_pd_mutex);
+ guard(mutex)(&em_pd_list_mutex);
+
+ list_for_each_entry(pd, &em_pd_list, node) {
+ int ret;
+
+ ret = cb(pd, data);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+struct em_perf_domain *em_perf_domain_get_by_id(int id)
+{
+ struct em_perf_domain *pd;
+
+ lockdep_assert_not_held(&em_pd_mutex);
+ guard(mutex)(&em_pd_list_mutex);
+
+ list_for_each_entry(pd, &em_pd_list, node) {
+ if (pd->id == id)
+ return pd;
+ }
+
+ return NULL;
+}
+#endif
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 2329daae5255..af8d07bafe02 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kernel/power/hibernate.c - Hibernation (a.k.a suspend-to-disk) support.
*
@@ -6,13 +7,14 @@
* Copyright (c) 2004 Pavel Machek <pavel@ucw.cz>
* Copyright (c) 2009 Rafael J. Wysocki, Novell Inc.
* Copyright (C) 2012 Bojan Smojver <bojan@rexursive.com>
- *
- * This file is released under the GPLv2.
*/
+#define pr_fmt(fmt) "PM: hibernation: " fmt
+
+#include <crypto/acompress.h>
+#include <linux/blkdev.h>
#include <linux/export.h>
#include <linux/suspend.h>
-#include <linux/syscalls.h>
#include <linux/reboot.h>
#include <linux/string.h>
#include <linux/device.h>
@@ -21,14 +23,16 @@
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/pm.h>
+#include <linux/nmi.h>
#include <linux/console.h>
#include <linux/cpu.h>
#include <linux/freezer.h>
#include <linux/gfp.h>
#include <linux/syscore_ops.h>
#include <linux/ctype.h>
-#include <linux/genhd.h>
#include <linux/ktime.h>
+#include <linux/security.h>
+#include <linux/secretmem.h>
#include <trace/events/power.h>
#include "power.h"
@@ -44,6 +48,15 @@ dev_t swsusp_resume_device;
sector_t swsusp_resume_block;
__visible int in_suspend __nosavedata;
+static char hibernate_compressor[CRYPTO_MAX_ALG_NAME] = CONFIG_HIBERNATION_DEF_COMP;
+
+/*
+ * Compression/decompression algorithm to be used while saving/loading
+ * image to/from disk. This would later be used in 'kernel/power/swap.c'
+ * to allocate comp streams.
+ */
+char hib_comp_algo[CRYPTO_MAX_ALG_NAME];
+
enum {
HIBERNATION_INVALID,
HIBERNATION_PLATFORM,
@@ -52,6 +65,7 @@ enum {
#ifdef CONFIG_SUSPEND
HIBERNATION_SUSPEND,
#endif
+ HIBERNATION_TEST_RESUME,
/* keep last */
__HIBERNATION_AFTER_LAST
};
@@ -64,9 +78,39 @@ bool freezer_test_done;
static const struct platform_hibernation_ops *hibernation_ops;
+static atomic_t hibernate_atomic = ATOMIC_INIT(1);
+
+#ifdef CONFIG_SUSPEND
+/**
+ * pm_hibernation_mode_is_suspend - Check if hibernation has been set to suspend
+ */
+bool pm_hibernation_mode_is_suspend(void)
+{
+ return hibernation_mode == HIBERNATION_SUSPEND;
+}
+EXPORT_SYMBOL_GPL(pm_hibernation_mode_is_suspend);
+#endif
+
+bool hibernate_acquire(void)
+{
+ return atomic_add_unless(&hibernate_atomic, -1, 0);
+}
+
+void hibernate_release(void)
+{
+ atomic_inc(&hibernate_atomic);
+}
+
+bool hibernation_in_progress(void)
+{
+ return !atomic_read(&hibernate_atomic);
+}
+
bool hibernation_available(void)
{
- return (nohibernate == 0);
+ return nohibernate == 0 &&
+ !security_locked_down(LOCKDOWN_HIBERNATION) &&
+ !secretmem_active() && !cxl_mem_active();
}
/**
@@ -75,20 +119,24 @@ bool hibernation_available(void)
*/
void hibernation_set_ops(const struct platform_hibernation_ops *ops)
{
+ unsigned int sleep_flags;
+
if (ops && !(ops->begin && ops->end && ops->pre_snapshot
&& ops->prepare && ops->finish && ops->enter && ops->pre_restore
&& ops->restore_cleanup && ops->leave)) {
WARN_ON(1);
return;
}
- lock_system_sleep();
+
+ sleep_flags = lock_system_sleep();
+
hibernation_ops = ops;
if (ops)
hibernation_mode = HIBERNATION_PLATFORM;
else if (hibernation_mode == HIBERNATION_PLATFORM)
hibernation_mode = HIBERNATION_SHUTDOWN;
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
}
EXPORT_SYMBOL_GPL(hibernation_set_ops);
@@ -101,10 +149,15 @@ bool system_entering_hibernation(void)
EXPORT_SYMBOL(system_entering_hibernation);
#ifdef CONFIG_PM_DEBUG
+static unsigned int pm_test_delay = 5;
+module_param(pm_test_delay, uint, 0644);
+MODULE_PARM_DESC(pm_test_delay,
+ "Number of seconds to wait before resuming from hibernation test");
static void hibernation_debug_sleep(void)
{
- printk(KERN_INFO "hibernation debug: Waiting for 5 seconds.\n");
- mdelay(5000);
+ pr_info("hibernation debug: Waiting for %d second(s).\n",
+ pm_test_delay);
+ mdelay(pm_test_delay * 1000);
}
static int hibernation_test(int level)
@@ -126,7 +179,7 @@ static int hibernation_test(int level) { return 0; }
static int platform_begin(int platform_mode)
{
return (platform_mode && hibernation_ops) ?
- hibernation_ops->begin() : 0;
+ hibernation_ops->begin(PMSG_FREEZE) : 0;
}
/**
@@ -249,10 +302,14 @@ void swsusp_show_speed(ktime_t start, ktime_t stop,
centisecs = 1; /* avoid div-by-zero */
k = nr_pages * (PAGE_SIZE / 1024);
kps = (k * 100) / centisecs;
- printk(KERN_INFO "PM: %s %u kbytes in %u.%02u seconds (%u.%02u MB/s)\n",
- msg, k,
- centisecs / 100, centisecs % 100,
- kps / 1000, (kps % 1000) / 10);
+ pr_info("%s %u kbytes in %u.%02u seconds (%u.%02u MB/s)\n",
+ msg, k, centisecs / 100, centisecs % 100, kps / 1000,
+ (kps % 1000) / 10);
+}
+
+__weak int arch_resume_nosmt(void)
+{
+ return 0;
}
/**
@@ -270,8 +327,7 @@ static int create_image(int platform_mode)
error = dpm_suspend_end(PMSG_FREEZE);
if (error) {
- printk(KERN_ERR "PM: Some devices failed to power down, "
- "aborting hibernation\n");
+ pr_err("Some devices failed to power down, aborting\n");
return error;
}
@@ -279,16 +335,17 @@ static int create_image(int platform_mode)
if (error || hibernation_test(TEST_PLATFORM))
goto Platform_finish;
- error = disable_nonboot_cpus();
+ error = pm_sleep_disable_secondary_cpus();
if (error || hibernation_test(TEST_CPUS))
goto Enable_cpus;
local_irq_disable();
+ system_state = SYSTEM_SUSPEND;
+
error = syscore_suspend();
if (error) {
- printk(KERN_ERR "PM: Some system devices failed to power down, "
- "aborting hibernation\n");
+ pr_err("Some system devices failed to power down, aborting\n");
goto Enable_irqs;
}
@@ -299,14 +356,16 @@ static int create_image(int platform_mode)
save_processor_state();
trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, true);
error = swsusp_arch_suspend();
- trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, false);
- if (error)
- printk(KERN_ERR "PM: Error %d creating hibernation image\n",
- error);
/* Restore control flow magically appears here */
restore_processor_state();
- if (!in_suspend)
+ trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, false);
+ if (error)
+ pr_err("Error %d creating image\n", error);
+
+ if (!in_suspend) {
events_check_enabled = false;
+ clear_or_poison_free_pages();
+ }
platform_leave(platform_mode);
@@ -314,10 +373,15 @@ static int create_image(int platform_mode)
syscore_resume();
Enable_irqs:
+ system_state = SYSTEM_RUNNING;
local_irq_enable();
Enable_cpus:
- enable_nonboot_cpus();
+ pm_sleep_enable_secondary_cpus();
+
+ /* Allow architectures to do nosmt-specific post-resume dances */
+ if (!in_suspend)
+ error = arch_resume_nosmt();
Platform_finish:
platform_finish(platform_mode);
@@ -328,17 +392,35 @@ static int create_image(int platform_mode)
return error;
}
+static void shrink_shmem_memory(void)
+{
+ struct sysinfo info;
+ unsigned long nr_shmem_pages, nr_freed_pages;
+
+ si_meminfo(&info);
+ nr_shmem_pages = info.sharedram; /* current page count used for shmem */
+ /*
+ * The intent is to reclaim all shmem pages. Though shrink_all_memory() can
+ * only reclaim about half of them, it's enough for creating the hibernation
+ * image.
+ */
+ nr_freed_pages = shrink_all_memory(nr_shmem_pages);
+ pr_debug("requested to reclaim %lu shmem pages, actually freed %lu pages\n",
+ nr_shmem_pages, nr_freed_pages);
+}
+
/**
* hibernation_snapshot - Quiesce devices and create a hibernation image.
* @platform_mode: If set, use platform driver to prepare for the transition.
*
- * This routine must be called with pm_mutex held.
+ * This routine must be called with system_transition_mutex held.
*/
int hibernation_snapshot(int platform_mode)
{
pm_message_t msg;
int error;
+ pm_suspend_clear_flags();
error = platform_begin(platform_mode);
if (error)
goto Close;
@@ -368,7 +450,16 @@ int hibernation_snapshot(int platform_mode)
goto Thaw;
}
- suspend_console();
+ /*
+ * Device drivers may move lots of data to shmem in dpm_prepare(). The shmem
+ * pages will use lots of system memory, causing hibernation image creation
+ * fail due to insufficient free memory.
+ * This call is to force flush the shmem pages to swap disk and reclaim
+ * the system memory so that image creation can succeed.
+ */
+ shrink_shmem_memory();
+
+ console_suspend_all();
pm_restrict_gfp_mask();
error = dpm_suspend(PMSG_FREEZE);
@@ -394,7 +485,7 @@ int hibernation_snapshot(int platform_mode)
if (error || !in_suspend)
pm_restore_gfp_mask();
- resume_console();
+ console_resume_all();
dpm_complete(msg);
Close:
@@ -408,6 +499,11 @@ int hibernation_snapshot(int platform_mode)
goto Close;
}
+int __weak hibernate_resume_nonboot_cpu_disable(void)
+{
+ return suspend_disable_secondary_cpus();
+}
+
/**
* resume_target_kernel - Restore system state from a hibernation image.
* @platform_mode: Whether or not to use the platform driver.
@@ -423,8 +519,7 @@ static int resume_target_kernel(bool platform_mode)
error = dpm_suspend_end(PMSG_QUIESCE);
if (error) {
- printk(KERN_ERR "PM: Some devices failed to power down, "
- "aborting resume\n");
+ pr_err("Some devices failed to power down, aborting resume\n");
return error;
}
@@ -432,11 +527,14 @@ static int resume_target_kernel(bool platform_mode)
if (error)
goto Cleanup;
- error = disable_nonboot_cpus();
+ cpuidle_pause();
+
+ error = hibernate_resume_nonboot_cpu_disable();
if (error)
goto Enable_cpus;
local_irq_disable();
+ system_state = SYSTEM_SUSPEND;
error = syscore_suspend();
if (error)
@@ -470,10 +568,11 @@ static int resume_target_kernel(bool platform_mode)
syscore_resume();
Enable_irqs:
+ system_state = SYSTEM_RUNNING;
local_irq_enable();
Enable_cpus:
- enable_nonboot_cpus();
+ pm_sleep_enable_secondary_cpus();
Cleanup:
platform_restore_cleanup(platform_mode);
@@ -487,16 +586,16 @@ static int resume_target_kernel(bool platform_mode)
* hibernation_restore - Quiesce devices and restore from a hibernation image.
* @platform_mode: If set, use platform driver to prepare for the transition.
*
- * This routine must be called with pm_mutex held. If it is successful, control
- * reappears in the restored target kernel in hibernation_snapshot().
+ * This routine must be called with system_transition_mutex held. If it is
+ * successful, control reappears in the restored target kernel in
+ * hibernation_snapshot().
*/
int hibernation_restore(int platform_mode)
{
int error;
pm_prepare_console();
- suspend_console();
- pm_restrict_gfp_mask();
+ console_suspend_all();
error = dpm_suspend_start(PMSG_QUIESCE);
if (!error) {
error = resume_target_kernel(platform_mode);
@@ -508,8 +607,7 @@ int hibernation_restore(int platform_mode)
BUG_ON(!error);
}
dpm_resume_end(PMSG_RECOVER);
- pm_restore_gfp_mask();
- resume_console();
+ console_resume_all();
pm_restore_console();
return error;
}
@@ -529,12 +627,12 @@ int hibernation_platform_enter(void)
* hibernation_ops->finish() before saving the image, so we should let
* the firmware know that we're going to enter the sleep state after all
*/
- error = hibernation_ops->begin();
+ error = hibernation_ops->begin(PMSG_HIBERNATE);
if (error)
goto Close;
entering_platform_hibernation = true;
- suspend_console();
+ console_suspend_all();
error = dpm_suspend_start(PMSG_HIBERNATE);
if (error) {
if (hibernation_ops->recover)
@@ -550,12 +648,17 @@ int hibernation_platform_enter(void)
if (error)
goto Platform_finish;
- error = disable_nonboot_cpus();
+ error = pm_sleep_disable_secondary_cpus();
if (error)
- goto Platform_finish;
+ goto Enable_cpus;
local_irq_disable();
- syscore_suspend();
+ system_state = SYSTEM_SUSPEND;
+
+ error = syscore_suspend();
+ if (error)
+ goto Enable_irqs;
+
if (pm_wakeup_pending()) {
error = -EAGAIN;
goto Power_up;
@@ -567,8 +670,12 @@ int hibernation_platform_enter(void)
Power_up:
syscore_resume();
+ Enable_irqs:
+ system_state = SYSTEM_RUNNING;
local_irq_enable();
- enable_nonboot_cpus();
+
+ Enable_cpus:
+ pm_sleep_enable_secondary_cpus();
Platform_finish:
hibernation_ops->finish();
@@ -578,7 +685,7 @@ int hibernation_platform_enter(void)
Resume_devices:
entering_platform_hibernation = false;
dpm_resume_end(PMSG_RESTORE);
- resume_console();
+ console_resume_all();
Close:
hibernation_ops->end();
@@ -595,8 +702,17 @@ int hibernation_platform_enter(void)
*/
static void power_down(void)
{
-#ifdef CONFIG_SUSPEND
int error;
+
+#ifdef CONFIG_SUSPEND
+ if (hibernation_mode == HIBERNATION_SUSPEND) {
+ error = suspend_devices_and_enter(mem_sleep_current);
+ if (!error)
+ goto exit;
+
+ hibernation_mode = hibernation_ops ? HIBERNATION_PLATFORM :
+ HIBERNATION_SHUTDOWN;
+ }
#endif
switch (hibernation_mode) {
@@ -604,68 +720,111 @@ static void power_down(void)
kernel_restart(NULL);
break;
case HIBERNATION_PLATFORM:
- hibernation_platform_enter();
+ error = hibernation_platform_enter();
+ if (error == -EAGAIN || error == -EBUSY) {
+ events_check_enabled = false;
+ pr_info("Wakeup event detected during hibernation, rolling back.\n");
+ goto exit;
+ }
+ fallthrough;
case HIBERNATION_SHUTDOWN:
- if (pm_power_off)
+ if (kernel_can_power_off()) {
+ entering_platform_hibernation = true;
kernel_power_off();
- break;
-#ifdef CONFIG_SUSPEND
- case HIBERNATION_SUSPEND:
- error = suspend_devices_and_enter(PM_SUSPEND_MEM);
- if (error) {
- if (hibernation_ops)
- hibernation_mode = HIBERNATION_PLATFORM;
- else
- hibernation_mode = HIBERNATION_SHUTDOWN;
- power_down();
+ entering_platform_hibernation = false;
}
- /*
- * Restore swap signature.
- */
- error = swsusp_unmark();
- if (error)
- printk(KERN_ERR "PM: Swap will be unusable! "
- "Try swapon -a.\n");
- return;
-#endif
+ break;
}
kernel_halt();
/*
* Valid image is on the disk, if we continue we risk serious data
* corruption after resume.
*/
- printk(KERN_CRIT "PM: Please power down manually\n");
+ pr_crit("Power down manually\n");
while (1)
cpu_relax();
+
+exit:
+ /* Restore swap signature. */
+ error = swsusp_unmark();
+ if (error)
+ pr_err("Swap will be unusable! Try swapon -a.\n");
+}
+
+static int load_image_and_restore(void)
+{
+ int error;
+ unsigned int flags;
+
+ pm_pr_dbg("Loading hibernation image.\n");
+
+ lock_device_hotplug();
+ error = create_basic_memory_bitmaps();
+ if (error) {
+ swsusp_close();
+ goto Unlock;
+ }
+
+ error = swsusp_read(&flags);
+ swsusp_close();
+ if (!error)
+ error = hibernation_restore(flags & SF_PLATFORM_MODE);
+
+ pr_err("Failed to load image, recovering.\n");
+ swsusp_free();
+ free_basic_memory_bitmaps();
+ Unlock:
+ unlock_device_hotplug();
+
+ return error;
}
+#define COMPRESSION_ALGO_LZO "lzo"
+#define COMPRESSION_ALGO_LZ4 "lz4"
+
/**
* hibernate - Carry out system hibernation, including saving the image.
*/
int hibernate(void)
{
+ bool snapshot_test = false;
+ unsigned int sleep_flags;
int error;
if (!hibernation_available()) {
- pr_debug("PM: Hibernation not available.\n");
+ pm_pr_dbg("Hibernation not available.\n");
return -EPERM;
}
- lock_system_sleep();
+ /*
+ * Query for the compression algorithm support if compression is enabled.
+ */
+ if (!nocompress) {
+ strscpy(hib_comp_algo, hibernate_compressor);
+ if (!crypto_has_acomp(hib_comp_algo, 0, CRYPTO_ALG_ASYNC)) {
+ pr_err("%s compression is not available\n", hib_comp_algo);
+ return -EOPNOTSUPP;
+ }
+ }
+
+ sleep_flags = lock_system_sleep();
/* The snapshot device should not be opened while we're running */
- if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
+ if (!hibernate_acquire()) {
error = -EBUSY;
goto Unlock;
}
+ pr_info("hibernation entry\n");
pm_prepare_console();
- error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);
+ error = pm_notifier_call_chain_robust(PM_HIBERNATION_PREPARE, PM_POST_HIBERNATION);
if (error)
- goto Exit;
+ goto Restore;
+
+ error = pm_sleep_fs_sync();
+ if (error)
+ goto Notify;
- printk(KERN_INFO "PM: Syncing filesystems ... ");
- sys_sync();
- printk("done.\n");
+ filesystems_freeze(filesystem_freeze_enabled);
error = freeze_processes();
if (error)
@@ -686,186 +845,310 @@ int hibernate(void)
if (hibernation_mode == HIBERNATION_PLATFORM)
flags |= SF_PLATFORM_MODE;
- if (nocompress)
+ if (nocompress) {
flags |= SF_NOCOMPRESS_MODE;
- else
+ } else {
flags |= SF_CRC32_MODE;
- pr_debug("PM: writing image.\n");
+ /*
+ * By default, LZO compression is enabled. Use SF_COMPRESSION_ALG_LZ4
+ * to override this behaviour and use LZ4.
+ *
+ * Refer kernel/power/power.h for more details
+ */
+
+ if (!strcmp(hib_comp_algo, COMPRESSION_ALGO_LZ4))
+ flags |= SF_COMPRESSION_ALG_LZ4;
+ else
+ flags |= SF_COMPRESSION_ALG_LZO;
+ }
+
+ pm_pr_dbg("Writing hibernation image.\n");
error = swsusp_write(flags);
swsusp_free();
- if (!error)
- power_down();
+ if (!error) {
+ if (hibernation_mode == HIBERNATION_TEST_RESUME)
+ snapshot_test = true;
+ else
+ power_down();
+ }
in_suspend = 0;
pm_restore_gfp_mask();
} else {
- pr_debug("PM: Image restored successfully.\n");
+ pm_pr_dbg("Hibernation image restored successfully.\n");
}
Free_bitmaps:
free_basic_memory_bitmaps();
Thaw:
unlock_device_hotplug();
+ if (snapshot_test) {
+ pm_pr_dbg("Checking hibernation image\n");
+ error = swsusp_check(false);
+ if (!error)
+ error = load_image_and_restore();
+ }
thaw_processes();
/* Don't bother checking whether freezer_test_done is true */
freezer_test_done = false;
Exit:
+ filesystems_thaw();
+ Notify:
pm_notifier_call_chain(PM_POST_HIBERNATION);
+ Restore:
pm_restore_console();
- atomic_inc(&snapshot_device_available);
+ hibernate_release();
Unlock:
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
+ pr_info("hibernation exit\n");
+
return error;
}
-
/**
- * software_resume - Resume from a saved hibernation image.
- *
- * This routine is called as a late initcall, when all devices have been
- * discovered and initialized already.
+ * hibernate_quiet_exec - Execute a function with all devices frozen.
+ * @func: Function to execute.
+ * @data: Data pointer to pass to @func.
*
- * The image reading code is called to see if there is a hibernation image
- * available for reading. If that is the case, devices are quiesced and the
- * contents of memory is restored from the saved image.
- *
- * If this is successful, control reappears in the restored target kernel in
- * hibernation_snaphot() which returns to hibernate(). Otherwise, the routine
- * attempts to recover gracefully and make the kernel return to the normal mode
- * of operation.
+ * Return the @func return value or an error code if it cannot be executed.
*/
-static int software_resume(void)
+int hibernate_quiet_exec(int (*func)(void *data), void *data)
{
+ unsigned int sleep_flags;
int error;
- unsigned int flags;
- /*
- * If the user said "noresume".. bail out early.
- */
- if (noresume || !hibernation_available())
- return 0;
+ sleep_flags = lock_system_sleep();
- /*
- * name_to_dev_t() below takes a sysfs buffer mutex when sysfs
- * is configured into the kernel. Since the regular hibernate
- * trigger path is via sysfs which takes a buffer mutex before
- * calling hibernate functions (which take pm_mutex) this can
- * cause lockdep to complain about a possible ABBA deadlock
- * which cannot happen since we're in the boot code here and
- * sysfs can't be invoked yet. Therefore, we use a subclass
- * here to avoid lockdep complaining.
- */
- mutex_lock_nested(&pm_mutex, SINGLE_DEPTH_NESTING);
+ if (!hibernate_acquire()) {
+ error = -EBUSY;
+ goto unlock;
+ }
- if (swsusp_resume_device)
- goto Check_image;
+ pm_prepare_console();
- if (!strlen(resume_file)) {
- error = -ENOENT;
- goto Unlock;
- }
+ error = pm_notifier_call_chain_robust(PM_HIBERNATION_PREPARE, PM_POST_HIBERNATION);
+ if (error)
+ goto restore;
+
+ filesystems_freeze(filesystem_freeze_enabled);
+
+ error = freeze_processes();
+ if (error)
+ goto exit;
+
+ lock_device_hotplug();
+
+ pm_suspend_clear_flags();
- pr_debug("PM: Checking hibernation image partition %s\n", resume_file);
+ error = platform_begin(true);
+ if (error)
+ goto thaw;
+
+ error = freeze_kernel_threads();
+ if (error)
+ goto thaw;
+
+ error = dpm_prepare(PMSG_FREEZE);
+ if (error)
+ goto dpm_complete;
+
+ console_suspend_all();
+
+ error = dpm_suspend(PMSG_FREEZE);
+ if (error)
+ goto dpm_resume;
+
+ error = dpm_suspend_end(PMSG_FREEZE);
+ if (error)
+ goto dpm_resume;
+
+ error = platform_pre_snapshot(true);
+ if (error)
+ goto skip;
+
+ error = func(data);
+
+skip:
+ platform_finish(true);
+
+ dpm_resume_start(PMSG_THAW);
+
+dpm_resume:
+ dpm_resume(PMSG_THAW);
+
+ console_resume_all();
+
+dpm_complete:
+ dpm_complete(PMSG_THAW);
+
+ thaw_kernel_threads();
+
+thaw:
+ platform_end(true);
+
+ unlock_device_hotplug();
+
+ thaw_processes();
+
+exit:
+ filesystems_thaw();
+ pm_notifier_call_chain(PM_POST_HIBERNATION);
+
+restore:
+ pm_restore_console();
+
+ hibernate_release();
+
+unlock:
+ unlock_system_sleep(sleep_flags);
+
+ return error;
+}
+EXPORT_SYMBOL_GPL(hibernate_quiet_exec);
+
+static int __init find_resume_device(void)
+{
+ if (!strlen(resume_file))
+ return -ENOENT;
+
+ pm_pr_dbg("Checking hibernation image partition %s\n", resume_file);
if (resume_delay) {
- printk(KERN_INFO "Waiting %dsec before reading resume device...\n",
+ pr_info("Waiting %dsec before reading resume device ...\n",
resume_delay);
ssleep(resume_delay);
}
/* Check if the device is there */
- swsusp_resume_device = name_to_dev_t(resume_file);
+ if (!early_lookup_bdev(resume_file, &swsusp_resume_device))
+ return 0;
/*
- * name_to_dev_t is ineffective to verify parition if resume_file is in
- * integer format. (e.g. major:minor)
+ * Some device discovery might still be in progress; we need to wait for
+ * this to finish.
*/
- if (isdigit(resume_file[0]) && resume_wait) {
- int partno;
- while (!get_gendisk(swsusp_resume_device, &partno))
+ wait_for_device_probe();
+ if (resume_wait) {
+ while (early_lookup_bdev(resume_file, &swsusp_resume_device))
msleep(10);
+ async_synchronize_full();
}
- if (!swsusp_resume_device) {
- /*
- * Some device discovery might still be in progress; we need
- * to wait for this to finish.
- */
- wait_for_device_probe();
-
- if (resume_wait) {
- while ((swsusp_resume_device = name_to_dev_t(resume_file)) == 0)
- msleep(10);
- async_synchronize_full();
- }
+ return early_lookup_bdev(resume_file, &swsusp_resume_device);
+}
- swsusp_resume_device = name_to_dev_t(resume_file);
- if (!swsusp_resume_device) {
- error = -ENODEV;
- goto Unlock;
- }
- }
+static int software_resume(void)
+{
+ int error;
- Check_image:
- pr_debug("PM: Hibernation image partition %d:%d present\n",
+ pm_pr_dbg("Hibernation image partition %d:%d present\n",
MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device));
- pr_debug("PM: Looking for hibernation image.\n");
- error = swsusp_check();
+ pm_pr_dbg("Looking for hibernation image.\n");
+
+ mutex_lock(&system_transition_mutex);
+ error = swsusp_check(true);
if (error)
goto Unlock;
+ /*
+ * Check if the hibernation image is compressed. If so, query for
+ * the algorithm support.
+ */
+ if (!(swsusp_header_flags & SF_NOCOMPRESS_MODE)) {
+ if (swsusp_header_flags & SF_COMPRESSION_ALG_LZ4)
+ strscpy(hib_comp_algo, COMPRESSION_ALGO_LZ4);
+ else
+ strscpy(hib_comp_algo, COMPRESSION_ALGO_LZO);
+ if (!crypto_has_acomp(hib_comp_algo, 0, CRYPTO_ALG_ASYNC)) {
+ pr_err("%s compression is not available\n", hib_comp_algo);
+ error = -EOPNOTSUPP;
+ goto Unlock;
+ }
+ }
+
/* The snapshot device should not be opened while we're running */
- if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
+ if (!hibernate_acquire()) {
error = -EBUSY;
- swsusp_close(FMODE_READ);
+ swsusp_close();
goto Unlock;
}
+ pr_info("resume from hibernation\n");
pm_prepare_console();
- error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
+ error = pm_notifier_call_chain_robust(PM_RESTORE_PREPARE, PM_POST_RESTORE);
if (error)
- goto Close_Finish;
+ goto Restore;
- pr_debug("PM: Preparing processes for restore.\n");
+ filesystems_freeze(filesystem_freeze_enabled);
+
+ pm_pr_dbg("Preparing processes for hibernation restore.\n");
error = freeze_processes();
- if (error)
+ if (error) {
+ filesystems_thaw();
goto Close_Finish;
+ }
- pr_debug("PM: Loading hibernation image.\n");
-
- lock_device_hotplug();
- error = create_basic_memory_bitmaps();
- if (error)
- goto Thaw;
-
- error = swsusp_read(&flags);
- swsusp_close(FMODE_READ);
- if (!error)
- hibernation_restore(flags & SF_PLATFORM_MODE);
+ error = freeze_kernel_threads();
+ if (error) {
+ thaw_processes();
+ filesystems_thaw();
+ goto Close_Finish;
+ }
- printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n");
- swsusp_free();
- free_basic_memory_bitmaps();
- Thaw:
- unlock_device_hotplug();
+ error = load_image_and_restore();
thaw_processes();
+ filesystems_thaw();
Finish:
pm_notifier_call_chain(PM_POST_RESTORE);
+ Restore:
pm_restore_console();
- atomic_inc(&snapshot_device_available);
+ pr_info("resume failed (%d)\n", error);
+ hibernate_release();
/* For success case, the suspend path will release the lock */
Unlock:
- mutex_unlock(&pm_mutex);
- pr_debug("PM: Hibernation image not present or could not be loaded.\n");
+ mutex_unlock(&system_transition_mutex);
+ pm_pr_dbg("Hibernation image not present or could not be loaded.\n");
return error;
Close_Finish:
- swsusp_close(FMODE_READ);
+ swsusp_close();
goto Finish;
}
-late_initcall_sync(software_resume);
+/**
+ * software_resume_initcall - Resume from a saved hibernation image.
+ *
+ * This routine is called as a late initcall, when all devices have been
+ * discovered and initialized already.
+ *
+ * The image reading code is called to see if there is a hibernation image
+ * available for reading. If that is the case, devices are quiesced and the
+ * contents of memory is restored from the saved image.
+ *
+ * If this is successful, control reappears in the restored target kernel in
+ * hibernation_snapshot() which returns to hibernate(). Otherwise, the routine
+ * attempts to recover gracefully and make the kernel return to the normal mode
+ * of operation.
+ */
+static int __init software_resume_initcall(void)
+{
+ /*
+ * If the user said "noresume".. bail out early.
+ */
+ if (noresume || !hibernation_available())
+ return 0;
+
+ if (!swsusp_resume_device) {
+ int error = find_resume_device();
+
+ if (error)
+ return error;
+ }
+
+ return software_resume();
+}
+late_initcall_sync(software_resume_initcall);
static const char * const hibernation_modes[] = {
@@ -875,6 +1158,7 @@ static const char * const hibernation_modes[] = {
#ifdef CONFIG_SUSPEND
[HIBERNATION_SUSPEND] = "suspend",
#endif
+ [HIBERNATION_TEST_RESUME] = "test_resume",
};
/*
@@ -906,11 +1190,11 @@ static const char * const hibernation_modes[] = {
static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
+ ssize_t count = 0;
int i;
- char *start = buf;
if (!hibernation_available())
- return sprintf(buf, "[disabled]\n");
+ return sysfs_emit(buf, "[disabled]\n");
for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {
if (!hibernation_modes[i])
@@ -921,6 +1205,7 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
#ifdef CONFIG_SUSPEND
case HIBERNATION_SUSPEND:
#endif
+ case HIBERNATION_TEST_RESUME:
break;
case HIBERNATION_PLATFORM:
if (hibernation_ops)
@@ -929,22 +1214,27 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
continue;
}
if (i == hibernation_mode)
- buf += sprintf(buf, "[%s] ", hibernation_modes[i]);
+ count += sysfs_emit_at(buf, count, "[%s] ", hibernation_modes[i]);
else
- buf += sprintf(buf, "%s ", hibernation_modes[i]);
+ count += sysfs_emit_at(buf, count, "%s ", hibernation_modes[i]);
}
- buf += sprintf(buf, "\n");
- return buf-start;
+
+ /* Convert the last space to a newline if needed. */
+ if (count > 0)
+ buf[count - 1] = '\n';
+
+ return count;
}
static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t n)
{
+ int mode = HIBERNATION_INVALID;
+ unsigned int sleep_flags;
int error = 0;
- int i;
int len;
char *p;
- int mode = HIBERNATION_INVALID;
+ int i;
if (!hibernation_available())
return -EPERM;
@@ -952,7 +1242,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
p = memchr(buf, '\n', n);
len = p ? p - buf : n;
- lock_system_sleep();
+ sleep_flags = lock_system_sleep();
for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {
if (len == strlen(hibernation_modes[i])
&& !strncmp(buf, hibernation_modes[i], len)) {
@@ -967,6 +1257,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
#ifdef CONFIG_SUSPEND
case HIBERNATION_SUSPEND:
#endif
+ case HIBERNATION_TEST_RESUME:
hibernation_mode = mode;
break;
case HIBERNATION_PLATFORM:
@@ -979,9 +1270,9 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
error = -EINVAL;
if (!error)
- pr_debug("PM: Hibernation mode set to '%s'\n",
- hibernation_modes[mode]);
- unlock_system_sleep();
+ pm_pr_dbg("Hibernation mode set to '%s'\n",
+ hibernation_modes[mode]);
+ unlock_system_sleep(sleep_flags);
return error ? error : n;
}
@@ -990,16 +1281,21 @@ power_attr(disk);
static ssize_t resume_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
- return sprintf(buf,"%d:%d\n", MAJOR(swsusp_resume_device),
- MINOR(swsusp_resume_device));
+ return sysfs_emit(buf, "%d:%d\n", MAJOR(swsusp_resume_device),
+ MINOR(swsusp_resume_device));
}
static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t n)
{
- dev_t res;
+ unsigned int sleep_flags;
int len = n;
char *name;
+ dev_t dev;
+ int error;
+
+ if (!hibernation_available())
+ return n;
if (len && buf[len-1] == '\n')
len--;
@@ -1007,15 +1303,34 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr,
if (!name)
return -ENOMEM;
- res = name_to_dev_t(name);
+ error = lookup_bdev(name, &dev);
+ if (error) {
+ unsigned maj, min, offset;
+ char *p, dummy;
+
+ error = 0;
+ if (sscanf(name, "%u:%u%c", &maj, &min, &dummy) == 2 ||
+ sscanf(name, "%u:%u:%u:%c", &maj, &min, &offset,
+ &dummy) == 3) {
+ dev = MKDEV(maj, min);
+ if (maj != MAJOR(dev) || min != MINOR(dev))
+ error = -EINVAL;
+ } else {
+ dev = new_decode_dev(simple_strtoul(name, &p, 16));
+ if (*p)
+ error = -EINVAL;
+ }
+ }
kfree(name);
- if (!res)
- return -EINVAL;
+ if (error)
+ return error;
+
+ sleep_flags = lock_system_sleep();
+ swsusp_resume_device = dev;
+ unlock_system_sleep(sleep_flags);
- lock_system_sleep();
- swsusp_resume_device = res;
- unlock_system_sleep();
- printk(KERN_INFO "PM: Starting manual resume from disk\n");
+ pm_pr_dbg("Configured hibernation resume from disk to %u\n",
+ swsusp_resume_device);
noresume = 0;
software_resume();
return n;
@@ -1023,10 +1338,33 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr,
power_attr(resume);
+static ssize_t resume_offset_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%llu\n", (unsigned long long)swsusp_resume_block);
+}
+
+static ssize_t resume_offset_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf,
+ size_t n)
+{
+ unsigned long long offset;
+ int rc;
+
+ rc = kstrtoull(buf, 0, &offset);
+ if (rc)
+ return rc;
+ swsusp_resume_block = offset;
+
+ return n;
+}
+
+power_attr(resume_offset);
+
static ssize_t image_size_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
- return sprintf(buf, "%lu\n", image_size);
+ return sysfs_emit(buf, "%lu\n", image_size);
}
static ssize_t image_size_store(struct kobject *kobj, struct kobj_attribute *attr,
@@ -1047,7 +1385,7 @@ power_attr(image_size);
static ssize_t reserved_size_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%lu\n", reserved_size);
+ return sysfs_emit(buf, "%lu\n", reserved_size);
}
static ssize_t reserved_size_store(struct kobject *kobj,
@@ -1066,8 +1404,9 @@ static ssize_t reserved_size_store(struct kobject *kobj,
power_attr(reserved_size);
-static struct attribute * g[] = {
+static struct attribute *g[] = {
&disk_attr.attr,
+ &resume_offset_attr.attr,
&resume_attr.attr,
&image_size_attr.attr,
&reserved_size_attr.attr,
@@ -1075,7 +1414,7 @@ static struct attribute * g[] = {
};
-static struct attribute_group attr_group = {
+static const struct attribute_group attr_group = {
.attrs = g,
};
@@ -1093,7 +1432,7 @@ static int __init resume_setup(char *str)
if (noresume)
return 1;
- strncpy( resume_file, str, 255 );
+ strscpy(resume_file, str);
return 1;
}
@@ -1112,13 +1451,16 @@ static int __init resume_offset_setup(char *str)
static int __init hibernate_setup(char *str)
{
- if (!strncmp(str, "noresume", 8))
+ if (!strncmp(str, "noresume", 8)) {
noresume = 1;
- else if (!strncmp(str, "nocompress", 10))
+ } else if (!strncmp(str, "nocompress", 10)) {
nocompress = 1;
- else if (!strncmp(str, "no", 2)) {
+ } else if (!strncmp(str, "no", 2)) {
noresume = 1;
nohibernate = 1;
+ } else if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX)
+ && !strncmp(str, "protect_image", 13)) {
+ enable_restore_image_protection();
}
return 1;
}
@@ -1140,7 +1482,7 @@ static int __init resumedelay_setup(char *str)
int rc = kstrtouint(str, 0, &resume_delay);
if (rc)
- return rc;
+ pr_warn("resumedelay: bad option string '%s'\n", str);
return 1;
}
@@ -1151,11 +1493,56 @@ static int __init nohibernate_setup(char *str)
return 1;
}
-static int __init kaslr_nohibernate_setup(char *str)
+static const char * const comp_alg_enabled[] = {
+#if IS_ENABLED(CONFIG_CRYPTO_LZO)
+ COMPRESSION_ALGO_LZO,
+#endif
+#if IS_ENABLED(CONFIG_CRYPTO_LZ4)
+ COMPRESSION_ALGO_LZ4,
+#endif
+};
+
+static int hibernate_compressor_param_set(const char *compressor,
+ const struct kernel_param *kp)
{
- return nohibernate_setup(str);
+ int index, ret;
+
+ if (!mutex_trylock(&system_transition_mutex))
+ return -EBUSY;
+
+ index = sysfs_match_string(comp_alg_enabled, compressor);
+ if (index >= 0) {
+ ret = param_set_copystring(comp_alg_enabled[index], kp);
+ if (!ret)
+ strscpy(hib_comp_algo, comp_alg_enabled[index]);
+ } else {
+ ret = index;
+ }
+
+ mutex_unlock(&system_transition_mutex);
+
+ if (ret)
+ pr_debug("Cannot set specified compressor %s\n",
+ compressor);
+
+ return ret;
}
+static const struct kernel_param_ops hibernate_compressor_param_ops = {
+ .set = hibernate_compressor_param_set,
+ .get = param_get_string,
+};
+
+static struct kparam_string hibernate_compressor_param_string = {
+ .maxlen = sizeof(hibernate_compressor),
+ .string = hibernate_compressor,
+};
+
+module_param_cb(compressor, &hibernate_compressor_param_ops,
+ &hibernate_compressor_param_string, 0644);
+MODULE_PARM_DESC(compressor,
+ "Compression algorithm to be used with hibernation");
+
__setup("noresume", noresume_setup);
__setup("resume_offset=", resume_offset_setup);
__setup("resume=", resume_setup);
@@ -1163,4 +1550,3 @@ __setup("hibernate=", hibernate_setup);
__setup("resumewait", resumewait_setup);
__setup("resumedelay=", resumedelay_setup);
__setup("nohibernate", nohibernate_setup);
-__setup("kaslr", kaslr_nohibernate_setup);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 86e8157a450f..03b2c5495c77 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -1,26 +1,153 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kernel/power/main.c - PM subsystem core functionality.
*
* Copyright (c) 2003 Patrick Mochel
* Copyright (c) 2003 Open Source Development Lab
- *
- * This file is released under the GPLv2
- *
*/
+#include <linux/acpi.h>
#include <linux/export.h>
+#include <linux/init.h>
#include <linux/kobject.h>
#include <linux/string.h>
#include <linux/pm-trace.h>
#include <linux/workqueue.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
+#include <linux/suspend.h>
+#include <linux/syscalls.h>
+#include <linux/pm_runtime.h>
+#include <linux/atomic.h>
+#include <linux/wait.h>
#include "power.h"
-DEFINE_MUTEX(pm_mutex);
-
#ifdef CONFIG_PM_SLEEP
+/*
+ * The following functions are used by the suspend/hibernate code to temporarily
+ * change gfp_allowed_mask in order to avoid using I/O during memory allocations
+ * while devices are suspended. To avoid races with the suspend/hibernate code,
+ * they should always be called with system_transition_mutex held
+ * (gfp_allowed_mask also should only be modified with system_transition_mutex
+ * held, unless the suspend/hibernate code is guaranteed not to run in parallel
+ * with that modification).
+ */
+static unsigned int saved_gfp_count;
+static gfp_t saved_gfp_mask;
+
+void pm_restore_gfp_mask(void)
+{
+ WARN_ON(!mutex_is_locked(&system_transition_mutex));
+
+ if (WARN_ON(!saved_gfp_count) || --saved_gfp_count)
+ return;
+
+ gfp_allowed_mask = saved_gfp_mask;
+ saved_gfp_mask = 0;
+
+ pm_pr_dbg("GFP mask restored\n");
+}
+
+void pm_restrict_gfp_mask(void)
+{
+ WARN_ON(!mutex_is_locked(&system_transition_mutex));
+
+ if (saved_gfp_count++) {
+ WARN_ON((saved_gfp_mask & ~(__GFP_IO | __GFP_FS)) != gfp_allowed_mask);
+ return;
+ }
+
+ saved_gfp_mask = gfp_allowed_mask;
+ gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS);
+
+ pm_pr_dbg("GFP mask restricted\n");
+}
+
+unsigned int lock_system_sleep(void)
+{
+ unsigned int flags = current->flags;
+ current->flags |= PF_NOFREEZE;
+ mutex_lock(&system_transition_mutex);
+ return flags;
+}
+EXPORT_SYMBOL_GPL(lock_system_sleep);
+
+void unlock_system_sleep(unsigned int flags)
+{
+ if (!(flags & PF_NOFREEZE))
+ current->flags &= ~PF_NOFREEZE;
+ mutex_unlock(&system_transition_mutex);
+}
+EXPORT_SYMBOL_GPL(unlock_system_sleep);
+
+void ksys_sync_helper(void)
+{
+ ktime_t start;
+ long elapsed_msecs;
+
+ start = ktime_get();
+ ksys_sync();
+ elapsed_msecs = ktime_to_ms(ktime_sub(ktime_get(), start));
+ pr_info("Filesystems sync: %ld.%03ld seconds\n",
+ elapsed_msecs / MSEC_PER_SEC, elapsed_msecs % MSEC_PER_SEC);
+}
+EXPORT_SYMBOL_GPL(ksys_sync_helper);
+
+#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION)
+/* Wakeup events handling resolution while syncing file systems in jiffies */
+#define PM_FS_SYNC_WAKEUP_RESOLUTION 5
+
+static atomic_t pm_fs_sync_count = ATOMIC_INIT(0);
+static struct workqueue_struct *pm_fs_sync_wq;
+static DECLARE_WAIT_QUEUE_HEAD(pm_fs_sync_wait);
+
+static bool pm_fs_sync_completed(void)
+{
+ return atomic_read(&pm_fs_sync_count) == 0;
+}
+
+static void pm_fs_sync_work_fn(struct work_struct *work)
+{
+ ksys_sync_helper();
+
+ if (atomic_dec_and_test(&pm_fs_sync_count))
+ wake_up(&pm_fs_sync_wait);
+}
+static DECLARE_WORK(pm_fs_sync_work, pm_fs_sync_work_fn);
+
+/**
+ * pm_sleep_fs_sync() - Sync file systems in an interruptible way
+ *
+ * Return: 0 on successful file system sync, or -EBUSY if the file system sync
+ * was aborted.
+ */
+int pm_sleep_fs_sync(void)
+{
+ pm_wakeup_clear(0);
+
+ /*
+ * Take back-to-back sleeps into account by queuing a subsequent fs sync
+ * only if the previous fs sync is running or is not queued. Multiple fs
+ * syncs increase the likelihood of saving the latest files immediately
+ * before sleep.
+ */
+ if (!work_pending(&pm_fs_sync_work)) {
+ atomic_inc(&pm_fs_sync_count);
+ queue_work(pm_fs_sync_wq, &pm_fs_sync_work);
+ }
+
+ while (!pm_fs_sync_completed()) {
+ if (pm_wakeup_pending())
+ return -EBUSY;
+
+ wait_event_timeout(pm_fs_sync_wait, pm_fs_sync_completed(),
+ PM_FS_SYNC_WAKEUP_RESOLUTION);
+ }
+
+ return 0;
+}
+#endif /* CONFIG_SUSPEND || CONFIG_HIBERNATION */
/* Routines for PM-transition notifications */
@@ -38,20 +165,35 @@ int unregister_pm_notifier(struct notifier_block *nb)
}
EXPORT_SYMBOL_GPL(unregister_pm_notifier);
-int pm_notifier_call_chain(unsigned long val)
+int pm_notifier_call_chain_robust(unsigned long val_up, unsigned long val_down)
{
- int ret = blocking_notifier_call_chain(&pm_chain_head, val, NULL);
+ int ret;
+
+ ret = blocking_notifier_call_chain_robust(&pm_chain_head, val_up, val_down, NULL);
return notifier_to_errno(ret);
}
+int pm_notifier_call_chain(unsigned long val)
+{
+ return blocking_notifier_call_chain(&pm_chain_head, val, NULL);
+}
+
/* If set, devices may be suspended and resumed asynchronously. */
int pm_async_enabled = 1;
+static int __init pm_async_setup(char *str)
+{
+ if (!strcmp(str, "off"))
+ pm_async_enabled = 0;
+ return 1;
+}
+__setup("pm_async=", pm_async_setup);
+
static ssize_t pm_async_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
- return sprintf(buf, "%d\n", pm_async_enabled);
+ return sysfs_emit(buf, "%d\n", pm_async_enabled);
}
static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr,
@@ -71,7 +213,114 @@ static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr,
power_attr(pm_async);
-#ifdef CONFIG_PM_DEBUG
+#ifdef CONFIG_SUSPEND
+static ssize_t mem_sleep_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ ssize_t count = 0;
+ suspend_state_t i;
+
+ for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) {
+ if (i >= PM_SUSPEND_MEM && cxl_mem_active())
+ continue;
+ if (mem_sleep_states[i]) {
+ const char *label = mem_sleep_states[i];
+
+ if (mem_sleep_current == i)
+ count += sysfs_emit_at(buf, count, "[%s] ", label);
+ else
+ count += sysfs_emit_at(buf, count, "%s ", label);
+ }
+ }
+
+ /* Convert the last space to a newline if needed. */
+ if (count > 0)
+ buf[count - 1] = '\n';
+
+ return count;
+}
+
+static suspend_state_t decode_suspend_state(const char *buf, size_t n)
+{
+ suspend_state_t state;
+ char *p;
+ int len;
+
+ p = memchr(buf, '\n', n);
+ len = p ? p - buf : n;
+
+ for (state = PM_SUSPEND_MIN; state < PM_SUSPEND_MAX; state++) {
+ const char *label = mem_sleep_states[state];
+
+ if (label && len == strlen(label) && !strncmp(buf, label, len))
+ return state;
+ }
+
+ return PM_SUSPEND_ON;
+}
+
+static ssize_t mem_sleep_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ suspend_state_t state;
+ int error;
+
+ error = pm_autosleep_lock();
+ if (error)
+ return error;
+
+ if (pm_autosleep_state() > PM_SUSPEND_ON) {
+ error = -EBUSY;
+ goto out;
+ }
+
+ state = decode_suspend_state(buf, n);
+ if (state < PM_SUSPEND_MAX && state > PM_SUSPEND_ON)
+ mem_sleep_current = state;
+ else
+ error = -EINVAL;
+
+ out:
+ pm_autosleep_unlock();
+ return error ? error : n;
+}
+
+power_attr(mem_sleep);
+
+/*
+ * sync_on_suspend: Sync file systems before suspend.
+ *
+ * show() returns whether file systems sync before suspend is enabled.
+ * store() accepts 0 or 1. 0 disables file systems sync and 1 enables it.
+ */
+bool sync_on_suspend_enabled = !IS_ENABLED(CONFIG_SUSPEND_SKIP_SYNC);
+
+static ssize_t sync_on_suspend_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", sync_on_suspend_enabled);
+}
+
+static ssize_t sync_on_suspend_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ unsigned long val;
+
+ if (kstrtoul(buf, 10, &val))
+ return -EINVAL;
+
+ if (val > 1)
+ return -EINVAL;
+
+ sync_on_suspend_enabled = !!val;
+ return n;
+}
+
+power_attr(sync_on_suspend);
+#endif /* CONFIG_SUSPEND */
+
+#ifdef CONFIG_PM_SLEEP_DEBUG
int pm_test_level = TEST_NONE;
static const char * const pm_tests[__TEST_AFTER_LAST] = {
@@ -86,37 +335,38 @@ static const char * const pm_tests[__TEST_AFTER_LAST] = {
static ssize_t pm_test_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
- char *s = buf;
+ ssize_t count = 0;
int level;
for (level = TEST_FIRST; level <= TEST_MAX; level++)
if (pm_tests[level]) {
if (level == pm_test_level)
- s += sprintf(s, "[%s] ", pm_tests[level]);
+ count += sysfs_emit_at(buf, count, "[%s] ", pm_tests[level]);
else
- s += sprintf(s, "%s ", pm_tests[level]);
+ count += sysfs_emit_at(buf, count, "%s ", pm_tests[level]);
}
- if (s != buf)
- /* convert the last space to a newline */
- *(s-1) = '\n';
+ /* Convert the last space to a newline if needed. */
+ if (count > 0)
+ buf[count - 1] = '\n';
- return (s - buf);
+ return count;
}
static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t n)
{
+ unsigned int sleep_flags;
const char * const *s;
+ int error = -EINVAL;
int level;
char *p;
int len;
- int error = -EINVAL;
p = memchr(buf, '\n', n);
len = p ? p - buf : n;
- lock_system_sleep();
+ sleep_flags = lock_system_sleep();
level = TEST_FIRST;
for (s = &pm_tests[level]; level <= TEST_MAX; s++, level++)
@@ -126,38 +376,213 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
break;
}
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
return error ? error : n;
}
power_attr(pm_test);
-#endif /* CONFIG_PM_DEBUG */
+#endif /* CONFIG_PM_SLEEP_DEBUG */
-#ifdef CONFIG_DEBUG_FS
-static char *suspend_step_name(enum suspend_stat_step step)
-{
- switch (step) {
- case SUSPEND_FREEZE:
- return "freeze";
- case SUSPEND_PREPARE:
- return "prepare";
- case SUSPEND_SUSPEND:
- return "suspend";
- case SUSPEND_SUSPEND_NOIRQ:
- return "suspend_noirq";
- case SUSPEND_RESUME_NOIRQ:
- return "resume_noirq";
- case SUSPEND_RESUME:
- return "resume";
- default:
- return "";
+#define SUSPEND_NR_STEPS SUSPEND_RESUME
+#define REC_FAILED_NUM 2
+
+struct suspend_stats {
+ unsigned int step_failures[SUSPEND_NR_STEPS];
+ unsigned int success;
+ unsigned int fail;
+ int last_failed_dev;
+ char failed_devs[REC_FAILED_NUM][40];
+ int last_failed_errno;
+ int errno[REC_FAILED_NUM];
+ int last_failed_step;
+ u64 last_hw_sleep;
+ u64 total_hw_sleep;
+ u64 max_hw_sleep;
+ enum suspend_stat_step failed_steps[REC_FAILED_NUM];
+};
+
+static struct suspend_stats suspend_stats;
+static DEFINE_MUTEX(suspend_stats_lock);
+
+void dpm_save_failed_dev(const char *name)
+{
+ mutex_lock(&suspend_stats_lock);
+
+ strscpy(suspend_stats.failed_devs[suspend_stats.last_failed_dev],
+ name, sizeof(suspend_stats.failed_devs[0]));
+ suspend_stats.last_failed_dev++;
+ suspend_stats.last_failed_dev %= REC_FAILED_NUM;
+
+ mutex_unlock(&suspend_stats_lock);
+}
+
+void dpm_save_failed_step(enum suspend_stat_step step)
+{
+ suspend_stats.step_failures[step-1]++;
+ suspend_stats.failed_steps[suspend_stats.last_failed_step] = step;
+ suspend_stats.last_failed_step++;
+ suspend_stats.last_failed_step %= REC_FAILED_NUM;
+}
+
+void dpm_save_errno(int err)
+{
+ if (!err) {
+ suspend_stats.success++;
+ return;
}
+
+ suspend_stats.fail++;
+
+ suspend_stats.errno[suspend_stats.last_failed_errno] = err;
+ suspend_stats.last_failed_errno++;
+ suspend_stats.last_failed_errno %= REC_FAILED_NUM;
}
+void pm_report_hw_sleep_time(u64 t)
+{
+ suspend_stats.last_hw_sleep = t;
+ suspend_stats.total_hw_sleep += t;
+}
+EXPORT_SYMBOL_GPL(pm_report_hw_sleep_time);
+
+void pm_report_max_hw_sleep(u64 t)
+{
+ suspend_stats.max_hw_sleep = t;
+}
+EXPORT_SYMBOL_GPL(pm_report_max_hw_sleep);
+
+static const char * const suspend_step_names[] = {
+ [SUSPEND_WORKING] = "",
+ [SUSPEND_FREEZE] = "freeze",
+ [SUSPEND_PREPARE] = "prepare",
+ [SUSPEND_SUSPEND] = "suspend",
+ [SUSPEND_SUSPEND_LATE] = "suspend_late",
+ [SUSPEND_SUSPEND_NOIRQ] = "suspend_noirq",
+ [SUSPEND_RESUME_NOIRQ] = "resume_noirq",
+ [SUSPEND_RESUME_EARLY] = "resume_early",
+ [SUSPEND_RESUME] = "resume",
+};
+
+#define suspend_attr(_name, format_str) \
+static ssize_t _name##_show(struct kobject *kobj, \
+ struct kobj_attribute *attr, char *buf) \
+{ \
+ return sysfs_emit(buf, format_str, suspend_stats._name);\
+} \
+static struct kobj_attribute _name = __ATTR_RO(_name)
+
+suspend_attr(success, "%u\n");
+suspend_attr(fail, "%u\n");
+suspend_attr(last_hw_sleep, "%llu\n");
+suspend_attr(total_hw_sleep, "%llu\n");
+suspend_attr(max_hw_sleep, "%llu\n");
+
+#define suspend_step_attr(_name, step) \
+static ssize_t _name##_show(struct kobject *kobj, \
+ struct kobj_attribute *attr, char *buf) \
+{ \
+ return sysfs_emit(buf, "%u\n", \
+ suspend_stats.step_failures[step-1]); \
+} \
+static struct kobj_attribute _name = __ATTR_RO(_name)
+
+suspend_step_attr(failed_freeze, SUSPEND_FREEZE);
+suspend_step_attr(failed_prepare, SUSPEND_PREPARE);
+suspend_step_attr(failed_suspend, SUSPEND_SUSPEND);
+suspend_step_attr(failed_suspend_late, SUSPEND_SUSPEND_LATE);
+suspend_step_attr(failed_suspend_noirq, SUSPEND_SUSPEND_NOIRQ);
+suspend_step_attr(failed_resume, SUSPEND_RESUME);
+suspend_step_attr(failed_resume_early, SUSPEND_RESUME_EARLY);
+suspend_step_attr(failed_resume_noirq, SUSPEND_RESUME_NOIRQ);
+
+static ssize_t last_failed_dev_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ int index;
+ char *last_failed_dev = NULL;
+
+ index = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1;
+ index %= REC_FAILED_NUM;
+ last_failed_dev = suspend_stats.failed_devs[index];
+
+ return sysfs_emit(buf, "%s\n", last_failed_dev);
+}
+static struct kobj_attribute last_failed_dev = __ATTR_RO(last_failed_dev);
+
+static ssize_t last_failed_errno_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ int index;
+ int last_failed_errno;
+
+ index = suspend_stats.last_failed_errno + REC_FAILED_NUM - 1;
+ index %= REC_FAILED_NUM;
+ last_failed_errno = suspend_stats.errno[index];
+
+ return sysfs_emit(buf, "%d\n", last_failed_errno);
+}
+static struct kobj_attribute last_failed_errno = __ATTR_RO(last_failed_errno);
+
+static ssize_t last_failed_step_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ enum suspend_stat_step step;
+ int index;
+
+ index = suspend_stats.last_failed_step + REC_FAILED_NUM - 1;
+ index %= REC_FAILED_NUM;
+ step = suspend_stats.failed_steps[index];
+
+ return sysfs_emit(buf, "%s\n", suspend_step_names[step]);
+}
+static struct kobj_attribute last_failed_step = __ATTR_RO(last_failed_step);
+
+static struct attribute *suspend_attrs[] = {
+ &success.attr,
+ &fail.attr,
+ &failed_freeze.attr,
+ &failed_prepare.attr,
+ &failed_suspend.attr,
+ &failed_suspend_late.attr,
+ &failed_suspend_noirq.attr,
+ &failed_resume.attr,
+ &failed_resume_early.attr,
+ &failed_resume_noirq.attr,
+ &last_failed_dev.attr,
+ &last_failed_errno.attr,
+ &last_failed_step.attr,
+ &last_hw_sleep.attr,
+ &total_hw_sleep.attr,
+ &max_hw_sleep.attr,
+ NULL,
+};
+
+static umode_t suspend_attr_is_visible(struct kobject *kobj, struct attribute *attr, int idx)
+{
+ if (attr != &last_hw_sleep.attr &&
+ attr != &total_hw_sleep.attr &&
+ attr != &max_hw_sleep.attr)
+ return 0444;
+
+#ifdef CONFIG_ACPI
+ if (acpi_gbl_FADT.flags & ACPI_FADT_LOW_POWER_S0)
+ return 0444;
+#endif
+ return 0;
+}
+
+static const struct attribute_group suspend_attr_group = {
+ .name = "suspend_stats",
+ .attrs = suspend_attrs,
+ .is_visible = suspend_attr_is_visible,
+};
+
+#ifdef CONFIG_DEBUG_FS
static int suspend_stats_show(struct seq_file *s, void *unused)
{
int i, index, last_dev, last_errno, last_step;
+ enum suspend_stat_step step;
last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1;
last_dev %= REC_FAILED_NUM;
@@ -165,74 +590,55 @@ static int suspend_stats_show(struct seq_file *s, void *unused)
last_errno %= REC_FAILED_NUM;
last_step = suspend_stats.last_failed_step + REC_FAILED_NUM - 1;
last_step %= REC_FAILED_NUM;
- seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n"
- "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n",
- "success", suspend_stats.success,
- "fail", suspend_stats.fail,
- "failed_freeze", suspend_stats.failed_freeze,
- "failed_prepare", suspend_stats.failed_prepare,
- "failed_suspend", suspend_stats.failed_suspend,
- "failed_suspend_late",
- suspend_stats.failed_suspend_late,
- "failed_suspend_noirq",
- suspend_stats.failed_suspend_noirq,
- "failed_resume", suspend_stats.failed_resume,
- "failed_resume_early",
- suspend_stats.failed_resume_early,
- "failed_resume_noirq",
- suspend_stats.failed_resume_noirq);
+
+ seq_printf(s, "success: %u\nfail: %u\n",
+ suspend_stats.success, suspend_stats.fail);
+
+ for (step = SUSPEND_FREEZE; step <= SUSPEND_NR_STEPS; step++)
+ seq_printf(s, "failed_%s: %u\n", suspend_step_names[step],
+ suspend_stats.step_failures[step-1]);
+
seq_printf(s, "failures:\n last_failed_dev:\t%-s\n",
- suspend_stats.failed_devs[last_dev]);
+ suspend_stats.failed_devs[last_dev]);
for (i = 1; i < REC_FAILED_NUM; i++) {
index = last_dev + REC_FAILED_NUM - i;
index %= REC_FAILED_NUM;
- seq_printf(s, "\t\t\t%-s\n",
- suspend_stats.failed_devs[index]);
+ seq_printf(s, "\t\t\t%-s\n", suspend_stats.failed_devs[index]);
}
seq_printf(s, " last_failed_errno:\t%-d\n",
suspend_stats.errno[last_errno]);
for (i = 1; i < REC_FAILED_NUM; i++) {
index = last_errno + REC_FAILED_NUM - i;
index %= REC_FAILED_NUM;
- seq_printf(s, "\t\t\t%-d\n",
- suspend_stats.errno[index]);
+ seq_printf(s, "\t\t\t%-d\n", suspend_stats.errno[index]);
}
seq_printf(s, " last_failed_step:\t%-s\n",
- suspend_step_name(
- suspend_stats.failed_steps[last_step]));
+ suspend_step_names[suspend_stats.failed_steps[last_step]]);
for (i = 1; i < REC_FAILED_NUM; i++) {
index = last_step + REC_FAILED_NUM - i;
index %= REC_FAILED_NUM;
seq_printf(s, "\t\t\t%-s\n",
- suspend_step_name(
- suspend_stats.failed_steps[index]));
+ suspend_step_names[suspend_stats.failed_steps[index]]);
}
return 0;
}
-
-static int suspend_stats_open(struct inode *inode, struct file *file)
-{
- return single_open(file, suspend_stats_show, NULL);
-}
-
-static const struct file_operations suspend_stats_operations = {
- .open = suspend_stats_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(suspend_stats);
static int __init pm_debugfs_init(void)
{
debugfs_create_file("suspend_stats", S_IFREG | S_IRUGO,
- NULL, NULL, &suspend_stats_operations);
+ NULL, NULL, &suspend_stats_fops);
return 0;
}
late_initcall(pm_debugfs_init);
#endif /* CONFIG_DEBUG_FS */
+bool pm_sleep_transition_in_progress(void)
+{
+ return pm_suspend_in_progress() || hibernation_in_progress();
+}
#endif /* CONFIG_PM_SLEEP */
#ifdef CONFIG_PM_SLEEP_DEBUG
@@ -247,7 +653,7 @@ bool pm_print_times_enabled;
static ssize_t pm_print_times_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%d\n", pm_print_times_enabled);
+ return sysfs_emit(buf, "%d\n", pm_print_times_enabled);
}
static ssize_t pm_print_times_store(struct kobject *kobj,
@@ -270,20 +676,73 @@ power_attr(pm_print_times);
static inline void pm_print_times_init(void)
{
- pm_print_times_enabled = !!initcall_debug;
+ pm_print_times_enabled = initcall_debug;
+}
+
+static ssize_t pm_wakeup_irq_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ if (!pm_wakeup_irq())
+ return -ENODATA;
+
+ return sysfs_emit(buf, "%u\n", pm_wakeup_irq());
}
-#else /* !CONFIG_PP_SLEEP_DEBUG */
+
+power_attr_ro(pm_wakeup_irq);
+
+bool pm_debug_messages_on __read_mostly;
+
+bool pm_debug_messages_should_print(void)
+{
+ return pm_debug_messages_on && pm_sleep_transition_in_progress();
+}
+EXPORT_SYMBOL_GPL(pm_debug_messages_should_print);
+
+static ssize_t pm_debug_messages_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", pm_debug_messages_on);
+}
+
+static ssize_t pm_debug_messages_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ unsigned long val;
+
+ if (kstrtoul(buf, 10, &val))
+ return -EINVAL;
+
+ if (val > 1)
+ return -EINVAL;
+
+ pm_debug_messages_on = !!val;
+ return n;
+}
+
+power_attr(pm_debug_messages);
+
+static int __init pm_debug_messages_setup(char *str)
+{
+ pm_debug_messages_on = true;
+ return 1;
+}
+__setup("pm_debug_messages", pm_debug_messages_setup);
+
+#else /* !CONFIG_PM_SLEEP_DEBUG */
static inline void pm_print_times_init(void) {}
#endif /* CONFIG_PM_SLEEP_DEBUG */
struct kobject *power_kobj;
-/**
+/*
* state - control system sleep states.
*
* show() returns available sleep state labels, which may be "mem", "standby",
- * "freeze" and "disk" (hibernation). See Documentation/power/states.txt for a
- * description of what they mean.
+ * "freeze" and "disk" (hibernation).
+ * See Documentation/admin-guide/pm/sleep-states.rst for a description of
+ * what they mean.
*
* store() accepts one of those strings, translates it into the proper
* enumerated value, and initiates a suspend transition.
@@ -291,21 +750,23 @@ struct kobject *power_kobj;
static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
- char *s = buf;
+ ssize_t count = 0;
#ifdef CONFIG_SUSPEND
suspend_state_t i;
for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
if (pm_states[i])
- s += sprintf(s,"%s ", pm_states[i]);
+ count += sysfs_emit_at(buf, count, "%s ", pm_states[i]);
#endif
if (hibernation_available())
- s += sprintf(s, "disk ");
- if (s != buf)
- /* convert the last space to a newline */
- *(s-1) = '\n';
- return (s - buf);
+ count += sysfs_emit_at(buf, count, "disk ");
+
+ /* Convert the last space to a newline if needed. */
+ if (count > 0)
+ buf[count - 1] = '\n';
+
+ return count;
}
static suspend_state_t decode_state(const char *buf, size_t n)
@@ -320,7 +781,7 @@ static suspend_state_t decode_state(const char *buf, size_t n)
len = p ? p - buf : n;
/* Check hibernation first. */
- if (len == 4 && !strncmp(buf, "disk", len))
+ if (len == 4 && str_has_prefix(buf, "disk"))
return PM_SUSPEND_MAX;
#ifdef CONFIG_SUSPEND
@@ -351,12 +812,16 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
}
state = decode_state(buf, n);
- if (state < PM_SUSPEND_MAX)
+ if (state < PM_SUSPEND_MAX) {
+ if (state == PM_SUSPEND_MEM)
+ state = mem_sleep_current;
+
error = pm_suspend(state);
- else if (state == PM_SUSPEND_MAX)
+ } else if (state == PM_SUSPEND_MAX) {
error = hibernate();
- else
+ } else {
error = -EINVAL;
+ }
out:
pm_autosleep_unlock();
@@ -401,7 +866,7 @@ static ssize_t wakeup_count_show(struct kobject *kobj,
unsigned int val;
return pm_get_wakeup_count(&val, true) ?
- sprintf(buf, "%u\n", val) : -EINTR;
+ sysfs_emit(buf, "%u\n", val) : -EINTR;
}
static ssize_t wakeup_count_store(struct kobject *kobj,
@@ -443,17 +908,17 @@ static ssize_t autosleep_show(struct kobject *kobj,
suspend_state_t state = pm_autosleep_state();
if (state == PM_SUSPEND_ON)
- return sprintf(buf, "off\n");
+ return sysfs_emit(buf, "off\n");
#ifdef CONFIG_SUSPEND
if (state < PM_SUSPEND_MAX)
- return sprintf(buf, "%s\n", pm_states[state] ?
+ return sysfs_emit(buf, "%s\n", pm_states[state] ?
pm_states[state] : "error");
#endif
#ifdef CONFIG_HIBERNATION
- return sprintf(buf, "disk\n");
+ return sysfs_emit(buf, "disk\n");
#else
- return sprintf(buf, "error");
+ return sysfs_emit(buf, "error\n");
#endif
}
@@ -468,6 +933,9 @@ static ssize_t autosleep_store(struct kobject *kobj,
&& strcmp(buf, "off") && strcmp(buf, "off\n"))
return -EINVAL;
+ if (state == PM_SUSPEND_MEM)
+ state = mem_sleep_current;
+
error = pm_autosleep_set_state(state);
return error ? error : n;
}
@@ -519,7 +987,7 @@ int pm_trace_enabled;
static ssize_t pm_trace_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
- return sprintf(buf, "%d\n", pm_trace_enabled);
+ return sysfs_emit(buf, "%d\n", pm_trace_enabled);
}
static ssize_t
@@ -548,14 +1016,7 @@ static ssize_t pm_trace_dev_match_show(struct kobject *kobj,
return show_trace_dev_match(buf, PAGE_SIZE);
}
-static ssize_t
-pm_trace_dev_match_store(struct kobject *kobj, struct kobj_attribute *attr,
- const char *buf, size_t n)
-{
- return -EINVAL;
-}
-
-power_attr(pm_trace_dev_match);
+power_attr_ro(pm_trace_dev_match);
#endif /* CONFIG_PM_TRACE */
@@ -563,7 +1024,7 @@ power_attr(pm_trace_dev_match);
static ssize_t pm_freeze_timeout_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%u\n", freeze_timeout_msecs);
+ return sysfs_emit(buf, "%u\n", freeze_timeout_msecs);
}
static ssize_t pm_freeze_timeout_store(struct kobject *kobj,
@@ -583,6 +1044,34 @@ power_attr(pm_freeze_timeout);
#endif /* CONFIG_FREEZER*/
+#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION)
+bool filesystem_freeze_enabled = false;
+
+static ssize_t freeze_filesystems_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", filesystem_freeze_enabled);
+}
+
+static ssize_t freeze_filesystems_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ unsigned long val;
+
+ if (kstrtoul(buf, 10, &val))
+ return -EINVAL;
+
+ if (val > 1)
+ return -EINVAL;
+
+ filesystem_freeze_enabled = !!val;
+ return n;
+}
+
+power_attr(freeze_filesystems);
+#endif /* CONFIG_SUSPEND || CONFIG_HIBERNATION */
+
static struct attribute * g[] = {
&state_attr.attr,
#ifdef CONFIG_PM_TRACE
@@ -592,6 +1081,10 @@ static struct attribute * g[] = {
#ifdef CONFIG_PM_SLEEP
&pm_async_attr.attr,
&wakeup_count_attr.attr,
+#ifdef CONFIG_SUSPEND
+ &mem_sleep_attr.attr,
+ &sync_on_suspend_attr.attr,
+#endif
#ifdef CONFIG_PM_AUTOSLEEP
&autosleep_attr.attr,
#endif
@@ -599,44 +1092,66 @@ static struct attribute * g[] = {
&wake_lock_attr.attr,
&wake_unlock_attr.attr,
#endif
-#ifdef CONFIG_PM_DEBUG
- &pm_test_attr.attr,
-#endif
#ifdef CONFIG_PM_SLEEP_DEBUG
+ &pm_test_attr.attr,
&pm_print_times_attr.attr,
+ &pm_wakeup_irq_attr.attr,
+ &pm_debug_messages_attr.attr,
#endif
#endif
#ifdef CONFIG_FREEZER
&pm_freeze_timeout_attr.attr,
#endif
+#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION)
+ &freeze_filesystems_attr.attr,
+#endif
NULL,
};
-static struct attribute_group attr_group = {
+static const struct attribute_group attr_group = {
.attrs = g,
};
+static const struct attribute_group *attr_groups[] = {
+ &attr_group,
+#ifdef CONFIG_PM_SLEEP
+ &suspend_attr_group,
+#endif
+ NULL,
+};
+
struct workqueue_struct *pm_wq;
EXPORT_SYMBOL_GPL(pm_wq);
-static int __init pm_start_workqueue(void)
+static int __init pm_start_workqueues(void)
{
- pm_wq = alloc_workqueue("pm", WQ_FREEZABLE, 0);
+ pm_wq = alloc_workqueue("pm", WQ_FREEZABLE | WQ_UNBOUND, 0);
+ if (!pm_wq)
+ return -ENOMEM;
+
+#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION)
+ pm_fs_sync_wq = alloc_ordered_workqueue("pm_fs_sync", 0);
+ if (!pm_fs_sync_wq) {
+ destroy_workqueue(pm_wq);
+ return -ENOMEM;
+ }
+#endif
- return pm_wq ? 0 : -ENOMEM;
+ return 0;
}
static int __init pm_init(void)
{
- int error = pm_start_workqueue();
+ int error = pm_start_workqueues();
if (error)
return error;
hibernate_image_size_init();
hibernate_reserved_size_init();
+ pm_states_init();
power_kobj = kobject_create_and_add("power", NULL);
if (!power_kobj)
return -ENOMEM;
- error = sysfs_create_group(power_kobj, &attr_group);
+ error = sysfs_create_groups(power_kobj, attr_groups);
if (error)
return error;
pm_print_times_init();
diff --git a/kernel/power/power.h b/kernel/power/power.h
index ce9b8328a689..75b63843886e 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -1,8 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/suspend.h>
#include <linux/suspend_ioctls.h>
#include <linux/utsname.h>
#include <linux/freezer.h>
#include <linux/compiler.h>
+#include <linux/cpu.h>
+#include <linux/cpuidle.h>
+#include <linux/crypto.h>
struct swsusp_info {
struct new_utsname uts;
@@ -14,6 +18,11 @@ struct swsusp_info {
unsigned long size;
} __aligned(PAGE_SIZE);
+#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION)
+extern int pm_sleep_fs_sync(void);
+extern bool filesystem_freeze_enabled;
+#endif
+
#ifdef CONFIG_HIBERNATION
/* kernel/power/snapshot.c */
extern void __init hibernate_reserved_size_init(void);
@@ -23,15 +32,12 @@ extern void __init hibernate_image_size_init(void);
/* Maximum size of architecture specific data in a hibernation header */
#define MAX_ARCH_HEADER_SIZE (sizeof(struct new_utsname) + 4)
-extern int arch_hibernation_header_save(void *addr, unsigned int max_size);
-extern int arch_hibernation_header_restore(void *addr);
-
static inline int init_header_complete(struct swsusp_info *info)
{
return arch_hibernation_header_save(info, MAX_ARCH_HEADER_SIZE);
}
-static inline char *check_image_kernel(struct swsusp_info *info)
+static inline const char *check_image_kernel(struct swsusp_info *info)
{
return arch_hibernation_header_restore(info) ?
"architecture specific data" : NULL;
@@ -54,18 +60,31 @@ asmlinkage int swsusp_save(void);
/* kernel/power/hibernate.c */
extern bool freezer_test_done;
+extern char hib_comp_algo[CRYPTO_MAX_ALG_NAME];
+
+/* kernel/power/swap.c */
+extern unsigned int swsusp_header_flags;
extern int hibernation_snapshot(int platform_mode);
extern int hibernation_restore(int platform_mode);
extern int hibernation_platform_enter(void);
+#ifdef CONFIG_STRICT_KERNEL_RWX
+/* kernel/power/snapshot.c */
+extern void enable_restore_image_protection(void);
+#else
+static inline void enable_restore_image_protection(void) {}
+#endif /* CONFIG_STRICT_KERNEL_RWX */
+
+extern bool hibernation_in_progress(void);
+
#else /* !CONFIG_HIBERNATION */
static inline void hibernate_reserved_size_init(void) {}
static inline void hibernate_image_size_init(void) {}
-#endif /* !CONFIG_HIBERNATION */
-extern int pfn_is_nosave(unsigned long);
+static inline bool hibernation_in_progress(void) { return false; }
+#endif /* !CONFIG_HIBERNATION */
#define power_attr(_name) \
static struct kobj_attribute _name##_attr = { \
@@ -77,6 +96,15 @@ static struct kobj_attribute _name##_attr = { \
.store = _name##_store, \
}
+#define power_attr_ro(_name) \
+static struct kobj_attribute _name##_attr = { \
+ .attr = { \
+ .name = __stringify(_name), \
+ .mode = S_IRUGO, \
+ }, \
+ .show = _name##_show, \
+}
+
/* Preferred image size in bytes (default 500 MB) */
extern unsigned long image_size;
/* Size of memory reserved for drivers (default SPARE_PAGES x PAGE_SIZE) */
@@ -85,14 +113,13 @@ extern int in_suspend;
extern dev_t swsusp_resume_device;
extern sector_t swsusp_resume_block;
-extern asmlinkage int swsusp_arch_suspend(void);
-extern asmlinkage int swsusp_arch_resume(void);
-
extern int create_basic_memory_bitmaps(void);
extern void free_basic_memory_bitmaps(void);
extern int hibernate_preallocate_memory(void);
-/**
+extern void clear_or_poison_free_pages(void);
+
+/*
* Auxiliary structure used for reading the snapshot image data and
* metadata from and writing them to the list of page backup entries
* (PBEs) which is the main data structure of swsusp.
@@ -135,11 +162,11 @@ extern unsigned int snapshot_additional_pages(struct zone *zone);
extern unsigned long snapshot_get_image_size(void);
extern int snapshot_read_next(struct snapshot_handle *handle);
extern int snapshot_write_next(struct snapshot_handle *handle);
-extern void snapshot_write_finalize(struct snapshot_handle *handle);
+int snapshot_write_finalize(struct snapshot_handle *handle);
extern int snapshot_image_loaded(struct snapshot_handle *handle);
-/* If unset, the snapshot device cannot be open. */
-extern atomic_t snapshot_device_available;
+extern bool hibernate_acquire(void);
+extern void hibernate_release(void);
extern sector_t alloc_swapdev_block(int swap);
extern void free_all_swap_pages(int swap);
@@ -149,40 +176,51 @@ extern int swsusp_swap_in_use(void);
* Flags that can be passed from the hibernatig hernel to the "boot" kernel in
* the image header.
*/
+#define SF_COMPRESSION_ALG_LZO 0 /* dummy, details given below */
#define SF_PLATFORM_MODE 1
#define SF_NOCOMPRESS_MODE 2
#define SF_CRC32_MODE 4
+#define SF_HW_SIG 8
+
+/*
+ * Bit to indicate the compression algorithm to be used(for LZ4). The same
+ * could be checked while saving/loading image to/from disk to use the
+ * corresponding algorithms.
+ *
+ * By default, LZO compression is enabled if SF_CRC32_MODE is set. Use
+ * SF_COMPRESSION_ALG_LZ4 to override this behaviour and use LZ4.
+ *
+ * SF_CRC32_MODE, SF_COMPRESSION_ALG_LZO(dummy) -> Compression, LZO
+ * SF_CRC32_MODE, SF_COMPRESSION_ALG_LZ4 -> Compression, LZ4
+ */
+#define SF_COMPRESSION_ALG_LZ4 16
/* kernel/power/hibernate.c */
-extern int swsusp_check(void);
+int swsusp_check(bool exclusive);
extern void swsusp_free(void);
extern int swsusp_read(unsigned int *flags_p);
extern int swsusp_write(unsigned int flags);
-extern void swsusp_close(fmode_t);
+void swsusp_close(void);
#ifdef CONFIG_SUSPEND
extern int swsusp_unmark(void);
+#else
+static inline int swsusp_unmark(void) { return 0; }
#endif
-/* kernel/power/block_io.c */
-extern struct block_device *hib_resume_bdev;
-
-extern int hib_bio_read_page(pgoff_t page_off, void *addr,
- struct bio **bio_chain);
-extern int hib_bio_write_page(pgoff_t page_off, void *addr,
- struct bio **bio_chain);
-extern int hib_wait_on_bio_chain(struct bio **bio_chain);
-
-struct timeval;
+struct __kernel_old_timeval;
/* kernel/power/swsusp.c */
extern void swsusp_show_speed(ktime_t, ktime_t, unsigned int, char *);
#ifdef CONFIG_SUSPEND
/* kernel/power/suspend.c */
-extern const char *pm_labels[];
+extern const char * const pm_labels[];
extern const char *pm_states[];
+extern const char *mem_sleep_states[];
extern int suspend_devices_and_enter(suspend_state_t state);
#else /* !CONFIG_SUSPEND */
+#define mem_sleep_current PM_SUSPEND_ON
+
static inline int suspend_devices_and_enter(suspend_state_t state)
{
return -ENOSYS;
@@ -200,6 +238,7 @@ static inline void suspend_test_finish(const char *label) {}
#ifdef CONFIG_PM_SLEEP
/* kernel/power/main.c */
+extern int pm_notifier_call_chain_robust(unsigned long val_up, unsigned long val_down);
extern int pm_notifier_call_chain(unsigned long val);
#endif
@@ -228,7 +267,11 @@ enum {
#define TEST_FIRST TEST_NONE
#define TEST_MAX (__TEST_AFTER_LAST - 1)
+#ifdef CONFIG_PM_SLEEP_DEBUG
extern int pm_test_level;
+#else
+#define pm_test_level (TEST_NONE)
+#endif
#ifdef CONFIG_SUSPEND_FREEZER
static inline int suspend_freeze_processes(void)
@@ -295,3 +338,17 @@ extern int pm_wake_lock(const char *buf);
extern int pm_wake_unlock(const char *buf);
#endif /* !CONFIG_PM_WAKELOCKS */
+
+static inline int pm_sleep_disable_secondary_cpus(void)
+{
+ cpuidle_pause();
+ return suspend_disable_secondary_cpus();
+}
+
+static inline void pm_sleep_enable_secondary_cpus(void)
+{
+ suspend_enable_secondary_cpus();
+ cpuidle_resume();
+}
+
+void dpm_save_errno(int err);
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index 7ef6866b521d..1f306f158696 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -1,7 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* poweroff.c - sysrq handler to gracefully power down machine.
- *
- * This file is released under the GPL v2
*/
#include <linux/kernel.h>
@@ -24,13 +23,13 @@ static void do_poweroff(struct work_struct *dummy)
static DECLARE_WORK(poweroff_work, do_poweroff);
-static void handle_poweroff(int key)
+static void handle_poweroff(u8 key)
{
/* run sysrq poweroff on boot cpu */
schedule_work_on(cpumask_first(cpu_online_mask), &poweroff_work);
}
-static struct sysrq_key_op sysrq_poweroff_op = {
+static const struct sysrq_key_op sysrq_poweroff_op = {
.handler = handle_poweroff,
.help_msg = "poweroff(o)",
.action_msg = "Power Off",
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 564f786df470..dc0dfc349f22 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -1,42 +1,46 @@
+// SPDX-License-Identifier: GPL-2.0
/*
- * drivers/power/process.c - Functions for starting/stopping processes on
+ * drivers/power/process.c - Functions for starting/stopping processes on
* suspend transitions.
*
* Originally from swsusp.
*/
-
-#undef DEBUG
-
#include <linux/interrupt.h>
#include <linux/oom.h>
#include <linux/suspend.h>
#include <linux/module.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/task.h>
#include <linux/syscalls.h>
#include <linux/freezer.h>
#include <linux/delay.h>
#include <linux/workqueue.h>
#include <linux/kmod.h>
#include <trace/events/power.h>
+#include <linux/cpuset.h>
-/*
+/*
* Timeout for stopping processes
*/
unsigned int __read_mostly freeze_timeout_msecs = 20 * MSEC_PER_SEC;
static int try_to_freeze_tasks(bool user_only)
{
+ const char *what = user_only ? "user space processes" :
+ "remaining freezable tasks";
struct task_struct *g, *p;
unsigned long end_time;
unsigned int todo;
bool wq_busy = false;
- struct timeval start, end;
- u64 elapsed_msecs64;
+ ktime_t start, end, elapsed;
unsigned int elapsed_msecs;
bool wakeup = false;
int sleep_usecs = USEC_PER_MSEC;
- do_gettimeofday(&start);
+ pr_info("Freezing %s\n", what);
+
+ start = ktime_get_boottime();
end_time = jiffies + msecs_to_jiffies(freeze_timeout_msecs);
@@ -50,8 +54,7 @@ static int try_to_freeze_tasks(bool user_only)
if (p == current || !freeze_task(p))
continue;
- if (!freezer_should_skip(p))
- todo++;
+ todo++;
}
read_unlock(&tasklist_lock);
@@ -78,31 +81,31 @@ static int try_to_freeze_tasks(bool user_only)
sleep_usecs *= 2;
}
- do_gettimeofday(&end);
- elapsed_msecs64 = timeval_to_ns(&end) - timeval_to_ns(&start);
- do_div(elapsed_msecs64, NSEC_PER_MSEC);
- elapsed_msecs = elapsed_msecs64;
+ end = ktime_get_boottime();
+ elapsed = ktime_sub(end, start);
+ elapsed_msecs = ktime_to_ms(elapsed);
if (todo) {
- pr_cont("\n");
- pr_err("Freezing of tasks %s after %d.%03d seconds "
- "(%d tasks refusing to freeze, wq_busy=%d):\n",
+ pr_err("Freezing %s %s after %d.%03d seconds "
+ "(%d tasks refusing to freeze, wq_busy=%d):\n", what,
wakeup ? "aborted" : "failed",
elapsed_msecs / 1000, elapsed_msecs % 1000,
todo - wq_busy, wq_busy);
- if (!wakeup) {
+ if (wq_busy)
+ show_freezable_workqueues();
+
+ if (!wakeup || pm_debug_messages_on) {
read_lock(&tasklist_lock);
for_each_process_thread(g, p) {
- if (p != current && !freezer_should_skip(p)
- && freezing(p) && !frozen(p))
+ if (p != current && freezing(p) && !frozen(p))
sched_show_task(p);
}
read_unlock(&tasklist_lock);
}
} else {
- pr_cont("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000,
- elapsed_msecs % 1000);
+ pr_info("Freezing %s completed (elapsed %d.%03d seconds)\n",
+ what, elapsed_msecs / 1000, elapsed_msecs % 1000);
}
return todo ? -EBUSY : 0;
@@ -127,25 +130,23 @@ int freeze_processes(void)
current->flags |= PF_SUSPEND_TASK;
if (!pm_freezing)
- atomic_inc(&system_freezing_cnt);
+ static_branch_inc(&freezer_active);
- pm_wakeup_clear();
- pr_info("Freezing user space processes ... ");
+ pm_wakeup_clear(0);
pm_freezing = true;
error = try_to_freeze_tasks(true);
- if (!error) {
+ if (!error)
__usermodehelper_set_disable_depth(UMH_DISABLED);
- pr_cont("done.");
- }
- pr_cont("\n");
+
BUG_ON(in_atomic());
/*
- * Now that the whole userspace is frozen we need to disbale
+ * Now that the whole userspace is frozen we need to disable
* the OOM killer to disallow any further interference with
- * killable tasks.
+ * killable tasks. There is no guarantee oom victims will
+ * ever reach a point they go away we have to wait with a timeout.
*/
- if (!error && !oom_killer_disable())
+ if (!error && !oom_killer_disable(msecs_to_jiffies(freeze_timeout_msecs)))
error = -EBUSY;
if (error)
@@ -165,14 +166,9 @@ int freeze_kernel_threads(void)
{
int error;
- pr_info("Freezing remaining freezable tasks ... ");
-
pm_nosig_freezing = true;
error = try_to_freeze_tasks(false);
- if (!error)
- pr_cont("done.");
- pr_cont("\n");
BUG_ON(in_atomic());
if (error)
@@ -187,13 +183,13 @@ void thaw_processes(void)
trace_suspend_resume(TPS("thaw_processes"), 0, true);
if (pm_freezing)
- atomic_dec(&system_freezing_cnt);
+ static_branch_dec(&freezer_active);
pm_freezing = false;
pm_nosig_freezing = false;
oom_killer_enable();
- pr_info("Restarting tasks ... ");
+ pr_info("Restarting tasks: Starting\n");
__usermodehelper_set_disable_depth(UMH_FREEZING);
thaw_workqueues();
@@ -212,7 +208,7 @@ void thaw_processes(void)
usermodehelper_enable();
schedule();
- pr_cont("done.\n");
+ pr_info("Restarting tasks: Done\n");
trace_suspend_resume(TPS("thaw_processes"), 0, false);
}
@@ -221,17 +217,17 @@ void thaw_kernel_threads(void)
struct task_struct *g, *p;
pm_nosig_freezing = false;
- pr_info("Restarting kernel threads ... ");
+ pr_info("Restarting kernel threads ...\n");
thaw_workqueues();
read_lock(&tasklist_lock);
for_each_process_thread(g, p) {
- if (p->flags & (PF_KTHREAD | PF_WQ_WORKER))
+ if (p->flags & PF_KTHREAD)
__thaw_task(p);
}
read_unlock(&tasklist_lock);
schedule();
- pr_cont("done.\n");
+ pr_info("Done restarting kernel threads.\n");
}
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 97b0df71303e..f7d8064e9adc 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -1,30 +1,21 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
- * This module exposes the interface to kernel space for specifying
- * QoS dependencies. It provides infrastructure for registration of:
+ * Power Management Quality of Service (PM QoS) support base.
*
- * Dependents on a QoS value : register requests
- * Watchers of QoS value : get notified when target QoS value changes
+ * Copyright (C) 2020 Intel Corporation
*
- * This QoS design is best effort based. Dependents register their QoS needs.
- * Watchers register to keep track of the current QoS needs of the system.
+ * Authors:
+ * Mark Gross <mgross@linux.intel.com>
+ * Rafael J. Wysocki <rafael.j.wysocki@intel.com>
*
- * There are 3 basic classes of QoS parameter: latency, timeout, throughput
- * each have defined units:
- * latency: usec
- * timeout: usec <-- currently not used.
- * throughput: kbs (kilo byte / sec)
+ * Provided here is an interface for specifying PM QoS dependencies. It allows
+ * entities depending on QoS constraints to register their requests which are
+ * aggregated as appropriate to produce effective constraints (target values)
+ * that can be monitored by entities needing to respect them, either by polling
+ * or through a built-in notification mechanism.
*
- * There are lists of pm_qos_objects each one wrapping requests, notifiers
- *
- * User mode requests on a QOS parameter register themselves to the
- * subsystem by opening the device node /dev/... and writing there request to
- * the node. As long as the process holds a file handle open to the node the
- * client continues to be accounted for. Upon file release the usermode
- * request is removed and a new qos target is computed. This way when the
- * request that the application has is cleaned up when closes the file
- * pointer or exits the pm_qos_object will get an opportunity to clean up.
- *
- * Mark Gross <mgross@linux.intel.com>
+ * In addition to the basic functionality, more specific interfaces for managing
+ * global CPU latency QoS requests and frequency QoS requests are provided.
*/
/*#define DEBUG*/
@@ -53,104 +44,19 @@
* or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock
* held, taken with _irqsave. One lock to rule them all
*/
-struct pm_qos_object {
- struct pm_qos_constraints *constraints;
- struct miscdevice pm_qos_power_miscdev;
- char *name;
-};
-
static DEFINE_SPINLOCK(pm_qos_lock);
-static struct pm_qos_object null_pm_qos;
-
-static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier);
-static struct pm_qos_constraints cpu_dma_constraints = {
- .list = PLIST_HEAD_INIT(cpu_dma_constraints.list),
- .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
- .default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
- .no_constraint_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
- .type = PM_QOS_MIN,
- .notifiers = &cpu_dma_lat_notifier,
-};
-static struct pm_qos_object cpu_dma_pm_qos = {
- .constraints = &cpu_dma_constraints,
- .name = "cpu_dma_latency",
-};
-
-static BLOCKING_NOTIFIER_HEAD(network_lat_notifier);
-static struct pm_qos_constraints network_lat_constraints = {
- .list = PLIST_HEAD_INIT(network_lat_constraints.list),
- .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
- .default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
- .no_constraint_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
- .type = PM_QOS_MIN,
- .notifiers = &network_lat_notifier,
-};
-static struct pm_qos_object network_lat_pm_qos = {
- .constraints = &network_lat_constraints,
- .name = "network_latency",
-};
-
-
-static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier);
-static struct pm_qos_constraints network_tput_constraints = {
- .list = PLIST_HEAD_INIT(network_tput_constraints.list),
- .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
- .default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
- .no_constraint_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
- .type = PM_QOS_MAX,
- .notifiers = &network_throughput_notifier,
-};
-static struct pm_qos_object network_throughput_pm_qos = {
- .constraints = &network_tput_constraints,
- .name = "network_throughput",
-};
-
-
-static BLOCKING_NOTIFIER_HEAD(memory_bandwidth_notifier);
-static struct pm_qos_constraints memory_bw_constraints = {
- .list = PLIST_HEAD_INIT(memory_bw_constraints.list),
- .target_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE,
- .default_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE,
- .no_constraint_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE,
- .type = PM_QOS_SUM,
- .notifiers = &memory_bandwidth_notifier,
-};
-static struct pm_qos_object memory_bandwidth_pm_qos = {
- .constraints = &memory_bw_constraints,
- .name = "memory_bandwidth",
-};
-
-
-static struct pm_qos_object *pm_qos_array[] = {
- &null_pm_qos,
- &cpu_dma_pm_qos,
- &network_lat_pm_qos,
- &network_throughput_pm_qos,
- &memory_bandwidth_pm_qos,
-};
-
-static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
- size_t count, loff_t *f_pos);
-static ssize_t pm_qos_power_read(struct file *filp, char __user *buf,
- size_t count, loff_t *f_pos);
-static int pm_qos_power_open(struct inode *inode, struct file *filp);
-static int pm_qos_power_release(struct inode *inode, struct file *filp);
-
-static const struct file_operations pm_qos_power_fops = {
- .write = pm_qos_power_write,
- .read = pm_qos_power_read,
- .open = pm_qos_power_open,
- .release = pm_qos_power_release,
- .llseek = noop_llseek,
-};
-
-/* unlocked internal variant */
-static inline int pm_qos_get_value(struct pm_qos_constraints *c)
+/**
+ * pm_qos_read_value - Return the current effective constraint value.
+ * @c: List of PM QoS constraint requests.
+ */
+s32 pm_qos_read_value(struct pm_qos_constraints *c)
{
- struct plist_node *node;
- int total_value = 0;
+ return READ_ONCE(c->target_value);
+}
+static int pm_qos_get_value(struct pm_qos_constraints *c)
+{
if (plist_head_empty(&c->list))
return c->no_constraint_value;
@@ -161,123 +67,42 @@ static inline int pm_qos_get_value(struct pm_qos_constraints *c)
case PM_QOS_MAX:
return plist_last(&c->list)->prio;
- case PM_QOS_SUM:
- plist_for_each(node, &c->list)
- total_value += node->prio;
-
- return total_value;
-
default:
- /* runtime check for not using enum */
- BUG();
+ WARN(1, "Unknown PM QoS type in %s\n", __func__);
return PM_QOS_DEFAULT_VALUE;
}
}
-s32 pm_qos_read_value(struct pm_qos_constraints *c)
+static void pm_qos_set_value(struct pm_qos_constraints *c, s32 value)
{
- return c->target_value;
+ WRITE_ONCE(c->target_value, value);
}
-static inline void pm_qos_set_value(struct pm_qos_constraints *c, s32 value)
-{
- c->target_value = value;
-}
-
-static inline int pm_qos_get_value(struct pm_qos_constraints *c);
-static int pm_qos_dbg_show_requests(struct seq_file *s, void *unused)
-{
- struct pm_qos_object *qos = (struct pm_qos_object *)s->private;
- struct pm_qos_constraints *c;
- struct pm_qos_request *req;
- char *type;
- unsigned long flags;
- int tot_reqs = 0;
- int active_reqs = 0;
-
- if (IS_ERR_OR_NULL(qos)) {
- pr_err("%s: bad qos param!\n", __func__);
- return -EINVAL;
- }
- c = qos->constraints;
- if (IS_ERR_OR_NULL(c)) {
- pr_err("%s: Bad constraints on qos?\n", __func__);
- return -EINVAL;
- }
-
- /* Lock to ensure we have a snapshot */
- spin_lock_irqsave(&pm_qos_lock, flags);
- if (plist_head_empty(&c->list)) {
- seq_puts(s, "Empty!\n");
- goto out;
- }
-
- switch (c->type) {
- case PM_QOS_MIN:
- type = "Minimum";
- break;
- case PM_QOS_MAX:
- type = "Maximum";
- break;
- case PM_QOS_SUM:
- type = "Sum";
- break;
- default:
- type = "Unknown";
- }
-
- plist_for_each_entry(req, &c->list, node) {
- char *state = "Default";
-
- if ((req->node).prio != c->default_value) {
- active_reqs++;
- state = "Active";
- }
- tot_reqs++;
- seq_printf(s, "%d: %d: %s\n", tot_reqs,
- (req->node).prio, state);
- }
-
- seq_printf(s, "Type=%s, Value=%d, Requests: active=%d / total=%d\n",
- type, pm_qos_get_value(c), active_reqs, tot_reqs);
-
-out:
- spin_unlock_irqrestore(&pm_qos_lock, flags);
- return 0;
-}
-
-static int pm_qos_dbg_open(struct inode *inode, struct file *file)
-{
- return single_open(file, pm_qos_dbg_show_requests,
- inode->i_private);
-}
-
-static const struct file_operations pm_qos_debug_fops = {
- .open = pm_qos_dbg_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
/**
- * pm_qos_update_target - manages the constraints list and calls the notifiers
- * if needed
- * @c: constraints data struct
- * @node: request to add to the list, to update or to remove
- * @action: action to take on the constraints list
- * @value: value of the request to add or update
+ * pm_qos_update_target - Update a list of PM QoS constraint requests.
+ * @c: List of PM QoS requests.
+ * @node: Target list entry.
+ * @action: Action to carry out (add, update or remove).
+ * @value: New request value for the target list entry.
+ *
+ * Update the given list of PM QoS constraint requests, @c, by carrying an
+ * @action involving the @node list entry and @value on it.
*
- * This function returns 1 if the aggregated constraint value has changed, 0
- * otherwise.
+ * The recognized values of @action are PM_QOS_ADD_REQ (store @value in @node
+ * and add it to the list), PM_QOS_UPDATE_REQ (remove @node from the list, store
+ * @value in it and add it to the list again), and PM_QOS_REMOVE_REQ (remove
+ * @node from the list, ignore @value).
+ *
+ * Return: 1 if the aggregate constraint value has changed, 0 otherwise.
*/
int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node,
enum pm_qos_req_action action, int value)
{
- unsigned long flags;
int prev_value, curr_value, new_value;
- int ret;
+ unsigned long flags;
spin_lock_irqsave(&pm_qos_lock, flags);
+
prev_value = pm_qos_get_value(c);
if (value == PM_QOS_DEFAULT_VALUE)
new_value = c->default_value;
@@ -290,11 +115,11 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node,
break;
case PM_QOS_UPDATE_REQ:
/*
- * to change the list, we atomically remove, reinit
- * with new value and add, then see if the extremal
- * changed
+ * To change the list, atomically remove, reinit with new value
+ * and add, then see if the aggregate has changed.
*/
plist_del(node, &c->list);
+ fallthrough;
case PM_QOS_ADD_REQ:
plist_node_init(node, new_value);
plist_add(node, &c->list);
@@ -310,16 +135,14 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node,
spin_unlock_irqrestore(&pm_qos_lock, flags);
trace_pm_qos_update_target(action, prev_value, curr_value);
- if (prev_value != curr_value) {
- ret = 1;
- if (c->notifiers)
- blocking_notifier_call_chain(c->notifiers,
- (unsigned long)curr_value,
- NULL);
- } else {
- ret = 0;
- }
- return ret;
+
+ if (prev_value == curr_value)
+ return 0;
+
+ if (c->notifiers)
+ blocking_notifier_call_chain(c->notifiers, curr_value, NULL);
+
+ return 1;
}
/**
@@ -341,14 +164,12 @@ static void pm_qos_flags_remove_req(struct pm_qos_flags *pqf,
/**
* pm_qos_update_flags - Update a set of PM QoS flags.
- * @pqf: Set of flags to update.
+ * @pqf: Set of PM QoS flags to update.
* @req: Request to add to the set, to modify, or to remove from the set.
* @action: Action to take on the set.
* @val: Value of the request to add or modify.
*
- * Update the given set of PM QoS flags and call notifiers if the aggregate
- * value has changed. Returns 1 if the aggregate constraint value has changed,
- * 0 otherwise.
+ * Return: 1 if the aggregate constraint value has changed, 0 otherwise.
*/
bool pm_qos_update_flags(struct pm_qos_flags *pqf,
struct pm_qos_flags_request *req,
@@ -367,6 +188,7 @@ bool pm_qos_update_flags(struct pm_qos_flags *pqf,
break;
case PM_QOS_UPDATE_REQ:
pm_qos_flags_remove_req(pqf, req);
+ fallthrough;
case PM_QOS_ADD_REQ:
req->flags = val;
INIT_LIST_HEAD(&req->node);
@@ -383,290 +205,280 @@ bool pm_qos_update_flags(struct pm_qos_flags *pqf,
spin_unlock_irqrestore(&pm_qos_lock, irqflags);
trace_pm_qos_update_flags(action, prev_value, curr_value);
+
return prev_value != curr_value;
}
-/**
- * pm_qos_request - returns current system wide qos expectation
- * @pm_qos_class: identification of which qos value is requested
- *
- * This function returns the current target value.
- */
-int pm_qos_request(int pm_qos_class)
-{
- return pm_qos_read_value(pm_qos_array[pm_qos_class]->constraints);
-}
-EXPORT_SYMBOL_GPL(pm_qos_request);
+#ifdef CONFIG_CPU_IDLE
+/* Definitions related to the CPU latency QoS. */
-int pm_qos_request_active(struct pm_qos_request *req)
-{
- return req->pm_qos_class != 0;
-}
-EXPORT_SYMBOL_GPL(pm_qos_request_active);
+static struct pm_qos_constraints cpu_latency_constraints = {
+ .list = PLIST_HEAD_INIT(cpu_latency_constraints.list),
+ .target_value = PM_QOS_CPU_LATENCY_DEFAULT_VALUE,
+ .default_value = PM_QOS_CPU_LATENCY_DEFAULT_VALUE,
+ .no_constraint_value = PM_QOS_CPU_LATENCY_DEFAULT_VALUE,
+ .type = PM_QOS_MIN,
+};
-static void __pm_qos_update_request(struct pm_qos_request *req,
- s32 new_value)
+static inline bool cpu_latency_qos_value_invalid(s32 value)
{
- trace_pm_qos_update_request(req->pm_qos_class, new_value);
-
- if (new_value != req->node.prio)
- pm_qos_update_target(
- pm_qos_array[req->pm_qos_class]->constraints,
- &req->node, PM_QOS_UPDATE_REQ, new_value);
+ return value < 0 && value != PM_QOS_DEFAULT_VALUE;
}
/**
- * pm_qos_work_fn - the timeout handler of pm_qos_update_request_timeout
- * @work: work struct for the delayed work (timeout)
- *
- * This cancels the timeout request by falling back to the default at timeout.
+ * cpu_latency_qos_limit - Return current system-wide CPU latency QoS limit.
*/
-static void pm_qos_work_fn(struct work_struct *work)
+s32 cpu_latency_qos_limit(void)
{
- struct pm_qos_request *req = container_of(to_delayed_work(work),
- struct pm_qos_request,
- work);
-
- __pm_qos_update_request(req, PM_QOS_DEFAULT_VALUE);
+ return pm_qos_read_value(&cpu_latency_constraints);
}
/**
- * pm_qos_add_request - inserts new qos request into the list
- * @req: pointer to a preallocated handle
- * @pm_qos_class: identifies which list of qos request to use
- * @value: defines the qos request
+ * cpu_latency_qos_request_active - Check the given PM QoS request.
+ * @req: PM QoS request to check.
*
- * This function inserts a new entry in the pm_qos_class list of requested qos
- * performance characteristics. It recomputes the aggregate QoS expectations
- * for the pm_qos_class of parameters and initializes the pm_qos_request
- * handle. Caller needs to save this handle for later use in updates and
- * removal.
+ * Return: 'true' if @req has been added to the CPU latency QoS list, 'false'
+ * otherwise.
*/
-
-void pm_qos_add_request(struct pm_qos_request *req,
- int pm_qos_class, s32 value)
+bool cpu_latency_qos_request_active(struct pm_qos_request *req)
{
- if (!req) /*guard against callers passing in null */
- return;
+ return req->qos == &cpu_latency_constraints;
+}
+EXPORT_SYMBOL_GPL(cpu_latency_qos_request_active);
- if (pm_qos_request_active(req)) {
- WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n");
- return;
- }
- req->pm_qos_class = pm_qos_class;
- INIT_DELAYED_WORK(&req->work, pm_qos_work_fn);
- trace_pm_qos_add_request(pm_qos_class, value);
- pm_qos_update_target(pm_qos_array[pm_qos_class]->constraints,
- &req->node, PM_QOS_ADD_REQ, value);
+static void cpu_latency_qos_apply(struct pm_qos_request *req,
+ enum pm_qos_req_action action, s32 value)
+{
+ int ret = pm_qos_update_target(req->qos, &req->node, action, value);
+ if (ret > 0)
+ wake_up_all_idle_cpus();
}
-EXPORT_SYMBOL_GPL(pm_qos_add_request);
/**
- * pm_qos_update_request - modifies an existing qos request
- * @req : handle to list element holding a pm_qos request to use
- * @value: defines the qos request
+ * cpu_latency_qos_add_request - Add new CPU latency QoS request.
+ * @req: Pointer to a preallocated handle.
+ * @value: Requested constraint value.
*
- * Updates an existing qos request for the pm_qos_class of parameters along
- * with updating the target pm_qos_class value.
+ * Use @value to initialize the request handle pointed to by @req, insert it as
+ * a new entry to the CPU latency QoS list and recompute the effective QoS
+ * constraint for that list.
*
- * Attempts are made to make this code callable on hot code paths.
+ * Callers need to save the handle for later use in updates and removal of the
+ * QoS request represented by it.
*/
-void pm_qos_update_request(struct pm_qos_request *req,
- s32 new_value)
+void cpu_latency_qos_add_request(struct pm_qos_request *req, s32 value)
{
- if (!req) /*guard against callers passing in null */
+ if (!req || cpu_latency_qos_value_invalid(value))
return;
- if (!pm_qos_request_active(req)) {
- WARN(1, KERN_ERR "pm_qos_update_request() called for unknown object\n");
+ if (cpu_latency_qos_request_active(req)) {
+ WARN(1, KERN_ERR "%s called for already added request\n", __func__);
return;
}
- cancel_delayed_work_sync(&req->work);
- __pm_qos_update_request(req, new_value);
+ trace_pm_qos_add_request(value);
+
+ req->qos = &cpu_latency_constraints;
+ cpu_latency_qos_apply(req, PM_QOS_ADD_REQ, value);
}
-EXPORT_SYMBOL_GPL(pm_qos_update_request);
+EXPORT_SYMBOL_GPL(cpu_latency_qos_add_request);
/**
- * pm_qos_update_request_timeout - modifies an existing qos request temporarily.
- * @req : handle to list element holding a pm_qos request to use
- * @new_value: defines the temporal qos request
- * @timeout_us: the effective duration of this qos request in usecs.
+ * cpu_latency_qos_update_request - Modify existing CPU latency QoS request.
+ * @req : QoS request to update.
+ * @new_value: New requested constraint value.
*
- * After timeout_us, this qos request is cancelled automatically.
+ * Use @new_value to update the QoS request represented by @req in the CPU
+ * latency QoS list along with updating the effective constraint value for that
+ * list.
*/
-void pm_qos_update_request_timeout(struct pm_qos_request *req, s32 new_value,
- unsigned long timeout_us)
+void cpu_latency_qos_update_request(struct pm_qos_request *req, s32 new_value)
{
- if (!req)
+ if (!req || cpu_latency_qos_value_invalid(new_value))
return;
- if (WARN(!pm_qos_request_active(req),
- "%s called for unknown object.", __func__))
+
+ if (!cpu_latency_qos_request_active(req)) {
+ WARN(1, KERN_ERR "%s called for unknown object\n", __func__);
return;
+ }
- cancel_delayed_work_sync(&req->work);
+ trace_pm_qos_update_request(new_value);
- trace_pm_qos_update_request_timeout(req->pm_qos_class,
- new_value, timeout_us);
- if (new_value != req->node.prio)
- pm_qos_update_target(
- pm_qos_array[req->pm_qos_class]->constraints,
- &req->node, PM_QOS_UPDATE_REQ, new_value);
+ if (new_value == req->node.prio)
+ return;
- schedule_delayed_work(&req->work, usecs_to_jiffies(timeout_us));
+ cpu_latency_qos_apply(req, PM_QOS_UPDATE_REQ, new_value);
}
+EXPORT_SYMBOL_GPL(cpu_latency_qos_update_request);
/**
- * pm_qos_remove_request - modifies an existing qos request
- * @req: handle to request list element
+ * cpu_latency_qos_remove_request - Remove existing CPU latency QoS request.
+ * @req: QoS request to remove.
*
- * Will remove pm qos request from the list of constraints and
- * recompute the current target value for the pm_qos_class. Call this
- * on slow code paths.
+ * Remove the CPU latency QoS request represented by @req from the CPU latency
+ * QoS list along with updating the effective constraint value for that list.
*/
-void pm_qos_remove_request(struct pm_qos_request *req)
+void cpu_latency_qos_remove_request(struct pm_qos_request *req)
{
- if (!req) /*guard against callers passing in null */
+ if (!req)
return;
- /* silent return to keep pcm code cleaner */
- if (!pm_qos_request_active(req)) {
- WARN(1, KERN_ERR "pm_qos_remove_request() called for unknown object\n");
+ if (!cpu_latency_qos_request_active(req)) {
+ WARN(1, KERN_ERR "%s called for unknown object\n", __func__);
return;
}
- cancel_delayed_work_sync(&req->work);
+ trace_pm_qos_remove_request(PM_QOS_DEFAULT_VALUE);
- trace_pm_qos_remove_request(req->pm_qos_class, PM_QOS_DEFAULT_VALUE);
- pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints,
- &req->node, PM_QOS_REMOVE_REQ,
- PM_QOS_DEFAULT_VALUE);
+ cpu_latency_qos_apply(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE);
memset(req, 0, sizeof(*req));
}
-EXPORT_SYMBOL_GPL(pm_qos_remove_request);
+EXPORT_SYMBOL_GPL(cpu_latency_qos_remove_request);
-/**
- * pm_qos_add_notifier - sets notification entry for changes to target value
- * @pm_qos_class: identifies which qos target changes should be notified.
- * @notifier: notifier block managed by caller.
- *
- * will register the notifier into a notification chain that gets called
- * upon changes to the pm_qos_class target value.
- */
-int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier)
+/* User space interface to the CPU latency QoS via misc device. */
+
+static int cpu_latency_qos_open(struct inode *inode, struct file *filp)
{
- int retval;
+ struct pm_qos_request *req;
- retval = blocking_notifier_chain_register(
- pm_qos_array[pm_qos_class]->constraints->notifiers,
- notifier);
+ req = kzalloc(sizeof(*req), GFP_KERNEL);
+ if (!req)
+ return -ENOMEM;
+
+ cpu_latency_qos_add_request(req, PM_QOS_DEFAULT_VALUE);
+ filp->private_data = req;
- return retval;
+ return 0;
}
-EXPORT_SYMBOL_GPL(pm_qos_add_notifier);
-/**
- * pm_qos_remove_notifier - deletes notification entry from chain.
- * @pm_qos_class: identifies which qos target changes are notified.
- * @notifier: notifier block to be removed.
- *
- * will remove the notifier from the notification chain that gets called
- * upon changes to the pm_qos_class target value.
- */
-int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier)
+static int cpu_latency_qos_release(struct inode *inode, struct file *filp)
{
- int retval;
+ struct pm_qos_request *req = filp->private_data;
- retval = blocking_notifier_chain_unregister(
- pm_qos_array[pm_qos_class]->constraints->notifiers,
- notifier);
+ filp->private_data = NULL;
- return retval;
+ cpu_latency_qos_remove_request(req);
+ kfree(req);
+
+ return 0;
}
-EXPORT_SYMBOL_GPL(pm_qos_remove_notifier);
-/* User space interface to PM QoS classes via misc devices */
-static int register_pm_qos_misc(struct pm_qos_object *qos, struct dentry *d)
+static ssize_t cpu_latency_qos_read(struct file *filp, char __user *buf,
+ size_t count, loff_t *f_pos)
{
- qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR;
- qos->pm_qos_power_miscdev.name = qos->name;
- qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops;
+ struct pm_qos_request *req = filp->private_data;
+ unsigned long flags;
+ s32 value;
- if (d) {
- (void)debugfs_create_file(qos->name, S_IRUGO, d,
- (void *)qos, &pm_qos_debug_fops);
- }
+ if (!req || !cpu_latency_qos_request_active(req))
+ return -EINVAL;
- return misc_register(&qos->pm_qos_power_miscdev);
+ spin_lock_irqsave(&pm_qos_lock, flags);
+ value = pm_qos_get_value(&cpu_latency_constraints);
+ spin_unlock_irqrestore(&pm_qos_lock, flags);
+
+ return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32));
}
-static int find_pm_qos_object_by_minor(int minor)
+static ssize_t cpu_latency_qos_write(struct file *filp, const char __user *buf,
+ size_t count, loff_t *f_pos)
{
- int pm_qos_class;
+ s32 value;
+
+ if (count == sizeof(s32)) {
+ if (copy_from_user(&value, buf, sizeof(s32)))
+ return -EFAULT;
+ } else {
+ int ret;
- for (pm_qos_class = PM_QOS_CPU_DMA_LATENCY;
- pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) {
- if (minor ==
- pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor)
- return pm_qos_class;
+ ret = kstrtos32_from_user(buf, count, 16, &value);
+ if (ret)
+ return ret;
}
- return -1;
+
+ cpu_latency_qos_update_request(filp->private_data, value);
+
+ return count;
}
-static int pm_qos_power_open(struct inode *inode, struct file *filp)
-{
- long pm_qos_class;
+static const struct file_operations cpu_latency_qos_fops = {
+ .write = cpu_latency_qos_write,
+ .read = cpu_latency_qos_read,
+ .open = cpu_latency_qos_open,
+ .release = cpu_latency_qos_release,
+ .llseek = noop_llseek,
+};
- pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
- if (pm_qos_class >= PM_QOS_CPU_DMA_LATENCY) {
- struct pm_qos_request *req = kzalloc(sizeof(*req), GFP_KERNEL);
- if (!req)
- return -ENOMEM;
+static struct miscdevice cpu_latency_qos_miscdev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "cpu_dma_latency",
+ .fops = &cpu_latency_qos_fops,
+};
- pm_qos_add_request(req, pm_qos_class, PM_QOS_DEFAULT_VALUE);
- filp->private_data = req;
+#ifdef CONFIG_PM_QOS_CPU_SYSTEM_WAKEUP
+/* The CPU system wakeup latency QoS. */
+static struct pm_qos_constraints cpu_wakeup_latency_constraints = {
+ .list = PLIST_HEAD_INIT(cpu_wakeup_latency_constraints.list),
+ .target_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT,
+ .default_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT,
+ .no_constraint_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT,
+ .type = PM_QOS_MIN,
+};
- return 0;
- }
- return -EPERM;
+/**
+ * cpu_wakeup_latency_qos_limit - Current CPU system wakeup latency QoS limit.
+ *
+ * Returns the current CPU system wakeup latency QoS limit that may have been
+ * requested by user space.
+ */
+s32 cpu_wakeup_latency_qos_limit(void)
+{
+ return pm_qos_read_value(&cpu_wakeup_latency_constraints);
}
-static int pm_qos_power_release(struct inode *inode, struct file *filp)
+static int cpu_wakeup_latency_qos_open(struct inode *inode, struct file *filp)
{
struct pm_qos_request *req;
- req = filp->private_data;
- pm_qos_remove_request(req);
- kfree(req);
+ req = kzalloc(sizeof(*req), GFP_KERNEL);
+ if (!req)
+ return -ENOMEM;
+
+ req->qos = &cpu_wakeup_latency_constraints;
+ pm_qos_update_target(req->qos, &req->node, PM_QOS_ADD_REQ,
+ PM_QOS_RESUME_LATENCY_NO_CONSTRAINT);
+ filp->private_data = req;
return 0;
}
-
-static ssize_t pm_qos_power_read(struct file *filp, char __user *buf,
- size_t count, loff_t *f_pos)
+static int cpu_wakeup_latency_qos_release(struct inode *inode,
+ struct file *filp)
{
- s32 value;
- unsigned long flags;
struct pm_qos_request *req = filp->private_data;
- if (!req)
- return -EINVAL;
- if (!pm_qos_request_active(req))
- return -EINVAL;
+ filp->private_data = NULL;
+ pm_qos_update_target(req->qos, &req->node, PM_QOS_REMOVE_REQ,
+ PM_QOS_RESUME_LATENCY_NO_CONSTRAINT);
+ kfree(req);
- spin_lock_irqsave(&pm_qos_lock, flags);
- value = pm_qos_get_value(pm_qos_array[req->pm_qos_class]->constraints);
- spin_unlock_irqrestore(&pm_qos_lock, flags);
+ return 0;
+}
+
+static ssize_t cpu_wakeup_latency_qos_read(struct file *filp, char __user *buf,
+ size_t count, loff_t *f_pos)
+{
+ s32 value = pm_qos_read_value(&cpu_wakeup_latency_constraints);
return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32));
}
-static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
- size_t count, loff_t *f_pos)
+static ssize_t cpu_wakeup_latency_qos_write(struct file *filp,
+ const char __user *buf,
+ size_t count, loff_t *f_pos)
{
+ struct pm_qos_request *req = filp->private_data;
s32 value;
- struct pm_qos_request *req;
if (count == sizeof(s32)) {
if (copy_from_user(&value, buf, sizeof(s32)))
@@ -679,35 +491,299 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
return ret;
}
- req = filp->private_data;
- pm_qos_update_request(req, value);
+ if (value < 0)
+ return -EINVAL;
+
+ pm_qos_update_target(req->qos, &req->node, PM_QOS_UPDATE_REQ, value);
return count;
}
+static const struct file_operations cpu_wakeup_latency_qos_fops = {
+ .open = cpu_wakeup_latency_qos_open,
+ .release = cpu_wakeup_latency_qos_release,
+ .read = cpu_wakeup_latency_qos_read,
+ .write = cpu_wakeup_latency_qos_write,
+ .llseek = noop_llseek,
+};
+
+static struct miscdevice cpu_wakeup_latency_qos_miscdev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "cpu_wakeup_latency",
+ .fops = &cpu_wakeup_latency_qos_fops,
+};
+#endif /* CONFIG_PM_QOS_CPU_SYSTEM_WAKEUP */
-static int __init pm_qos_power_init(void)
+static int __init cpu_latency_qos_init(void)
{
- int ret = 0;
- int i;
- struct dentry *d;
+ int ret;
- BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES);
+ ret = misc_register(&cpu_latency_qos_miscdev);
+ if (ret < 0)
+ pr_err("%s: %s setup failed\n", __func__,
+ cpu_latency_qos_miscdev.name);
- d = debugfs_create_dir("pm_qos", NULL);
- if (IS_ERR_OR_NULL(d))
- d = NULL;
+#ifdef CONFIG_PM_QOS_CPU_SYSTEM_WAKEUP
+ ret = misc_register(&cpu_wakeup_latency_qos_miscdev);
+ if (ret < 0)
+ pr_err("%s: %s setup failed\n", __func__,
+ cpu_wakeup_latency_qos_miscdev.name);
+#endif
- for (i = PM_QOS_CPU_DMA_LATENCY; i < PM_QOS_NUM_CLASSES; i++) {
- ret = register_pm_qos_misc(pm_qos_array[i], d);
- if (ret < 0) {
- printk(KERN_ERR "pm_qos_param: %s setup failed\n",
- pm_qos_array[i]->name);
- return ret;
- }
+ return ret;
+}
+late_initcall(cpu_latency_qos_init);
+#endif /* CONFIG_CPU_IDLE */
+
+/* Definitions related to the frequency QoS below. */
+
+static inline bool freq_qos_value_invalid(s32 value)
+{
+ return value < 0 && value != PM_QOS_DEFAULT_VALUE;
+}
+
+/**
+ * freq_constraints_init - Initialize frequency QoS constraints.
+ * @qos: Frequency QoS constraints to initialize.
+ */
+void freq_constraints_init(struct freq_constraints *qos)
+{
+ struct pm_qos_constraints *c;
+
+ c = &qos->min_freq;
+ plist_head_init(&c->list);
+ c->target_value = FREQ_QOS_MIN_DEFAULT_VALUE;
+ c->default_value = FREQ_QOS_MIN_DEFAULT_VALUE;
+ c->no_constraint_value = FREQ_QOS_MIN_DEFAULT_VALUE;
+ c->type = PM_QOS_MAX;
+ c->notifiers = &qos->min_freq_notifiers;
+ BLOCKING_INIT_NOTIFIER_HEAD(c->notifiers);
+
+ c = &qos->max_freq;
+ plist_head_init(&c->list);
+ c->target_value = FREQ_QOS_MAX_DEFAULT_VALUE;
+ c->default_value = FREQ_QOS_MAX_DEFAULT_VALUE;
+ c->no_constraint_value = FREQ_QOS_MAX_DEFAULT_VALUE;
+ c->type = PM_QOS_MIN;
+ c->notifiers = &qos->max_freq_notifiers;
+ BLOCKING_INIT_NOTIFIER_HEAD(c->notifiers);
+}
+
+/**
+ * freq_qos_read_value - Get frequency QoS constraint for a given list.
+ * @qos: Constraints to evaluate.
+ * @type: QoS request type.
+ */
+s32 freq_qos_read_value(struct freq_constraints *qos,
+ enum freq_qos_req_type type)
+{
+ s32 ret;
+
+ switch (type) {
+ case FREQ_QOS_MIN:
+ ret = IS_ERR_OR_NULL(qos) ?
+ FREQ_QOS_MIN_DEFAULT_VALUE :
+ pm_qos_read_value(&qos->min_freq);
+ break;
+ case FREQ_QOS_MAX:
+ ret = IS_ERR_OR_NULL(qos) ?
+ FREQ_QOS_MAX_DEFAULT_VALUE :
+ pm_qos_read_value(&qos->max_freq);
+ break;
+ default:
+ WARN_ON(1);
+ ret = 0;
+ }
+
+ return ret;
+}
+
+/**
+ * freq_qos_apply - Add/modify/remove frequency QoS request.
+ * @req: Constraint request to apply.
+ * @action: Action to perform (add/update/remove).
+ * @value: Value to assign to the QoS request.
+ *
+ * This is only meant to be called from inside pm_qos, not drivers.
+ */
+int freq_qos_apply(struct freq_qos_request *req,
+ enum pm_qos_req_action action, s32 value)
+{
+ int ret;
+
+ switch(req->type) {
+ case FREQ_QOS_MIN:
+ ret = pm_qos_update_target(&req->qos->min_freq, &req->pnode,
+ action, value);
+ break;
+ case FREQ_QOS_MAX:
+ ret = pm_qos_update_target(&req->qos->max_freq, &req->pnode,
+ action, value);
+ break;
+ default:
+ ret = -EINVAL;
}
return ret;
}
-late_initcall(pm_qos_power_init);
+/**
+ * freq_qos_add_request - Insert new frequency QoS request into a given list.
+ * @qos: Constraints to update.
+ * @req: Preallocated request object.
+ * @type: Request type.
+ * @value: Request value.
+ *
+ * Insert a new entry into the @qos list of requests, recompute the effective
+ * QoS constraint value for that list and initialize the @req object. The
+ * caller needs to save that object for later use in updates and removal.
+ *
+ * Return 1 if the effective constraint value has changed, 0 if the effective
+ * constraint value has not changed, or a negative error code on failures.
+ */
+int freq_qos_add_request(struct freq_constraints *qos,
+ struct freq_qos_request *req,
+ enum freq_qos_req_type type, s32 value)
+{
+ int ret;
+
+ if (IS_ERR_OR_NULL(qos) || !req || freq_qos_value_invalid(value))
+ return -EINVAL;
+
+ if (WARN(freq_qos_request_active(req),
+ "%s() called for active request\n", __func__))
+ return -EINVAL;
+
+ req->qos = qos;
+ req->type = type;
+ ret = freq_qos_apply(req, PM_QOS_ADD_REQ, value);
+ if (ret < 0) {
+ req->qos = NULL;
+ req->type = 0;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(freq_qos_add_request);
+
+/**
+ * freq_qos_update_request - Modify existing frequency QoS request.
+ * @req: Request to modify.
+ * @new_value: New request value.
+ *
+ * Update an existing frequency QoS request along with the effective constraint
+ * value for the list of requests it belongs to.
+ *
+ * Return 1 if the effective constraint value has changed, 0 if the effective
+ * constraint value has not changed, or a negative error code on failures.
+ */
+int freq_qos_update_request(struct freq_qos_request *req, s32 new_value)
+{
+ if (!req || freq_qos_value_invalid(new_value))
+ return -EINVAL;
+
+ if (WARN(!freq_qos_request_active(req),
+ "%s() called for unknown object\n", __func__))
+ return -EINVAL;
+
+ if (req->pnode.prio == new_value)
+ return 0;
+
+ return freq_qos_apply(req, PM_QOS_UPDATE_REQ, new_value);
+}
+EXPORT_SYMBOL_GPL(freq_qos_update_request);
+
+/**
+ * freq_qos_remove_request - Remove frequency QoS request from its list.
+ * @req: Request to remove.
+ *
+ * Remove the given frequency QoS request from the list of constraints it
+ * belongs to and recompute the effective constraint value for that list.
+ *
+ * Return 1 if the effective constraint value has changed, 0 if the effective
+ * constraint value has not changed, or a negative error code on failures.
+ */
+int freq_qos_remove_request(struct freq_qos_request *req)
+{
+ int ret;
+
+ if (!req)
+ return -EINVAL;
+
+ if (WARN(!freq_qos_request_active(req),
+ "%s() called for unknown object\n", __func__))
+ return -EINVAL;
+
+ ret = freq_qos_apply(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE);
+ req->qos = NULL;
+ req->type = 0;
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(freq_qos_remove_request);
+
+/**
+ * freq_qos_add_notifier - Add frequency QoS change notifier.
+ * @qos: List of requests to add the notifier to.
+ * @type: Request type.
+ * @notifier: Notifier block to add.
+ */
+int freq_qos_add_notifier(struct freq_constraints *qos,
+ enum freq_qos_req_type type,
+ struct notifier_block *notifier)
+{
+ int ret;
+
+ if (IS_ERR_OR_NULL(qos) || !notifier)
+ return -EINVAL;
+
+ switch (type) {
+ case FREQ_QOS_MIN:
+ ret = blocking_notifier_chain_register(qos->min_freq.notifiers,
+ notifier);
+ break;
+ case FREQ_QOS_MAX:
+ ret = blocking_notifier_chain_register(qos->max_freq.notifiers,
+ notifier);
+ break;
+ default:
+ WARN_ON(1);
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(freq_qos_add_notifier);
+
+/**
+ * freq_qos_remove_notifier - Remove frequency QoS change notifier.
+ * @qos: List of requests to remove the notifier from.
+ * @type: Request type.
+ * @notifier: Notifier block to remove.
+ */
+int freq_qos_remove_notifier(struct freq_constraints *qos,
+ enum freq_qos_req_type type,
+ struct notifier_block *notifier)
+{
+ int ret;
+
+ if (IS_ERR_OR_NULL(qos) || !notifier)
+ return -EINVAL;
+
+ switch (type) {
+ case FREQ_QOS_MIN:
+ ret = blocking_notifier_chain_unregister(qos->min_freq.notifiers,
+ notifier);
+ break;
+ case FREQ_QOS_MAX:
+ ret = blocking_notifier_chain_unregister(qos->max_freq.notifiers,
+ notifier);
+ break;
+ default:
+ WARN_ON(1);
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(freq_qos_remove_notifier);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 5235dd4e1e2f..0a946932d5c1 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/kernel/power/snapshot.c
*
@@ -5,11 +6,10 @@
*
* Copyright (C) 1998-2005 Pavel Machek <pavel@ucw.cz>
* Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
- *
- * This file is released under the GPLv2.
- *
*/
+#define pr_fmt(fmt) "PM: hibernation: " fmt
+
#include <linux/version.h>
#include <linux/module.h>
#include <linux/mm.h>
@@ -21,7 +21,8 @@
#include <linux/pm.h>
#include <linux/device.h>
#include <linux/init.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
+#include <linux/nmi.h>
#include <linux/syscalls.h>
#include <linux/console.h>
#include <linux/highmem.h>
@@ -29,15 +30,88 @@
#include <linux/slab.h>
#include <linux/compiler.h>
#include <linux/ktime.h>
+#include <linux/set_memory.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/mmu_context.h>
-#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/io.h>
#include "power.h"
+#if defined(CONFIG_STRICT_KERNEL_RWX) && defined(CONFIG_ARCH_HAS_SET_MEMORY)
+static bool hibernate_restore_protection;
+static bool hibernate_restore_protection_active;
+
+void enable_restore_image_protection(void)
+{
+ hibernate_restore_protection = true;
+}
+
+static inline void hibernate_restore_protection_begin(void)
+{
+ hibernate_restore_protection_active = hibernate_restore_protection;
+}
+
+static inline void hibernate_restore_protection_end(void)
+{
+ hibernate_restore_protection_active = false;
+}
+
+static inline int __must_check hibernate_restore_protect_page(void *page_address)
+{
+ if (hibernate_restore_protection_active)
+ return set_memory_ro((unsigned long)page_address, 1);
+ return 0;
+}
+
+static inline int hibernate_restore_unprotect_page(void *page_address)
+{
+ if (hibernate_restore_protection_active)
+ return set_memory_rw((unsigned long)page_address, 1);
+ return 0;
+}
+#else
+static inline void hibernate_restore_protection_begin(void) {}
+static inline void hibernate_restore_protection_end(void) {}
+static inline int __must_check hibernate_restore_protect_page(void *page_address) {return 0; }
+static inline int hibernate_restore_unprotect_page(void *page_address) {return 0; }
+#endif /* CONFIG_STRICT_KERNEL_RWX && CONFIG_ARCH_HAS_SET_MEMORY */
+
+
+/*
+ * The calls to set_direct_map_*() should not fail because remapping a page
+ * here means that we only update protection bits in an existing PTE.
+ * It is still worth to have a warning here if something changes and this
+ * will no longer be the case.
+ */
+static inline void hibernate_map_page(struct page *page)
+{
+ if (IS_ENABLED(CONFIG_ARCH_HAS_SET_DIRECT_MAP)) {
+ int ret = set_direct_map_default_noflush(page);
+
+ if (ret)
+ pr_warn_once("Failed to remap page\n");
+ } else {
+ debug_pagealloc_map_pages(page, 1);
+ }
+}
+
+static inline void hibernate_unmap_page(struct page *page)
+{
+ if (IS_ENABLED(CONFIG_ARCH_HAS_SET_DIRECT_MAP)) {
+ unsigned long addr = (unsigned long)page_address(page);
+ int ret = set_direct_map_invalid_noflush(page);
+
+ if (ret)
+ pr_warn_once("Failed to remap page\n");
+
+ flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
+ } else {
+ debug_pagealloc_unmap_pages(page, 1);
+ }
+}
+
static int swsusp_page_is_free(struct page *);
static void swsusp_set_page_forbidden(struct page *);
static void swsusp_unset_page_forbidden(struct page *);
@@ -64,28 +138,35 @@ unsigned long image_size;
void __init hibernate_image_size_init(void)
{
- image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE;
+ image_size = ((totalram_pages() * 2) / 5) * PAGE_SIZE;
}
-/* List of PBEs needed for restoring the pages that were allocated before
+/*
+ * List of PBEs needed for restoring the pages that were allocated before
* the suspend and included in the suspend image, but have also been
* allocated by the "resume" kernel, so their contents cannot be written
* directly to their "original" page frames.
*/
struct pbe *restore_pblist;
-/* Pointer to an auxiliary buffer (1 page) */
-static void *buffer;
+/* struct linked_page is used to build chains of pages */
-/**
- * @safe_needed - on resume, for storing the PBE list and the image,
- * we can only use memory pages that do not conflict with the pages
- * used before suspend. The unsafe pages have PageNosaveFree set
- * and we count them using unsafe_pages.
- *
- * Each allocated image page is marked as PageNosave and PageNosaveFree
- * so that swsusp_free() can release it.
+#define LINKED_PAGE_DATA_SIZE (PAGE_SIZE - sizeof(void *))
+
+struct linked_page {
+ struct linked_page *next;
+ char data[LINKED_PAGE_DATA_SIZE];
+} __packed;
+
+/*
+ * List of "safe" pages (ie. pages that were not used by the image kernel
+ * before hibernation) that may be used as temporary storage for image kernel
+ * memory contents.
*/
+static struct linked_page *safe_pages_list;
+
+/* Pointer to an auxiliary buffer (1 page) */
+static void *buffer;
#define PG_ANY 0
#define PG_SAFE 1
@@ -94,6 +175,19 @@ static void *buffer;
static unsigned int allocated_unsafe_pages;
+/**
+ * get_image_page - Allocate a page for a hibernation image.
+ * @gfp_mask: GFP mask for the allocation.
+ * @safe_needed: Get pages that were not used before hibernation (restore only)
+ *
+ * During image restoration, for storing the PBE list and the image data, we can
+ * only use memory pages that do not conflict with the pages used before
+ * hibernation. The "unsafe" pages have PageNosaveFree set and we count them
+ * using allocated_unsafe_pages.
+ *
+ * Each allocated image page is marked as PageNosave and PageNosaveFree so that
+ * swsusp_free() can release it.
+ */
static void *get_image_page(gfp_t gfp_mask, int safe_needed)
{
void *res;
@@ -113,9 +207,21 @@ static void *get_image_page(gfp_t gfp_mask, int safe_needed)
return res;
}
+static void *__get_safe_page(gfp_t gfp_mask)
+{
+ if (safe_pages_list) {
+ void *ret = safe_pages_list;
+
+ safe_pages_list = safe_pages_list->next;
+ memset(ret, 0, PAGE_SIZE);
+ return ret;
+ }
+ return get_image_page(gfp_mask, PG_SAFE);
+}
+
unsigned long get_safe_page(gfp_t gfp_mask)
{
- return (unsigned long)get_image_page(gfp_mask, PG_SAFE);
+ return (unsigned long)__get_safe_page(gfp_mask);
}
static struct page *alloc_image_page(gfp_t gfp_mask)
@@ -130,11 +236,22 @@ static struct page *alloc_image_page(gfp_t gfp_mask)
return page;
}
+static void recycle_safe_page(void *page_address)
+{
+ struct linked_page *lp = page_address;
+
+ lp->next = safe_pages_list;
+ safe_pages_list = lp;
+}
+
/**
- * free_image_page - free page represented by @addr, allocated with
- * get_image_page (page flags set by it must be cleared)
+ * free_image_page - Free a page allocated for hibernation image.
+ * @addr: Address of the page to free.
+ * @clear_nosave_free: If set, clear the PageNosaveFree bit for the page.
+ *
+ * The page to free should have been allocated by get_image_page() (page flags
+ * set by it are affected).
*/
-
static inline void free_image_page(void *addr, int clear_nosave_free)
{
struct page *page;
@@ -150,17 +267,8 @@ static inline void free_image_page(void *addr, int clear_nosave_free)
__free_page(page);
}
-/* struct linked_page is used to build chains of pages */
-
-#define LINKED_PAGE_DATA_SIZE (PAGE_SIZE - sizeof(void *))
-
-struct linked_page {
- struct linked_page *next;
- char data[LINKED_PAGE_DATA_SIZE];
-} __packed;
-
-static inline void
-free_list_of_pages(struct linked_page *list, int clear_page_nosave)
+static inline void free_list_of_pages(struct linked_page *list,
+ int clear_page_nosave)
{
while (list) {
struct linked_page *lp = list->next;
@@ -170,30 +278,28 @@ free_list_of_pages(struct linked_page *list, int clear_page_nosave)
}
}
-/**
- * struct chain_allocator is used for allocating small objects out of
- * a linked list of pages called 'the chain'.
- *
- * The chain grows each time when there is no room for a new object in
- * the current page. The allocated objects cannot be freed individually.
- * It is only possible to free them all at once, by freeing the entire
- * chain.
- *
- * NOTE: The chain allocator may be inefficient if the allocated objects
- * are not much smaller than PAGE_SIZE.
- */
-
+/*
+ * struct chain_allocator is used for allocating small objects out of
+ * a linked list of pages called 'the chain'.
+ *
+ * The chain grows each time when there is no room for a new object in
+ * the current page. The allocated objects cannot be freed individually.
+ * It is only possible to free them all at once, by freeing the entire
+ * chain.
+ *
+ * NOTE: The chain allocator may be inefficient if the allocated objects
+ * are not much smaller than PAGE_SIZE.
+ */
struct chain_allocator {
struct linked_page *chain; /* the chain */
unsigned int used_space; /* total size of objects allocated out
- * of the current page
- */
+ of the current page */
gfp_t gfp_mask; /* mask for allocating pages */
int safe_needed; /* if set, only "safe" pages are allocated */
};
-static void
-chain_init(struct chain_allocator *ca, gfp_t gfp_mask, int safe_needed)
+static void chain_init(struct chain_allocator *ca, gfp_t gfp_mask,
+ int safe_needed)
{
ca->chain = NULL;
ca->used_space = LINKED_PAGE_DATA_SIZE;
@@ -208,7 +314,8 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) {
struct linked_page *lp;
- lp = get_image_page(ca->gfp_mask, ca->safe_needed);
+ lp = ca->safe_needed ? __get_safe_page(ca->gfp_mask) :
+ get_image_page(ca->gfp_mask, PG_ANY);
if (!lp)
return NULL;
@@ -221,45 +328,45 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
return ret;
}
-/**
- * Data types related to memory bitmaps.
+/*
+ * Data types related to memory bitmaps.
*
- * Memory bitmap is a structure consiting of many linked lists of
- * objects. The main list's elements are of type struct zone_bitmap
- * and each of them corresonds to one zone. For each zone bitmap
- * object there is a list of objects of type struct bm_block that
- * represent each blocks of bitmap in which information is stored.
+ * Memory bitmap is a structure consisting of many linked lists of
+ * objects. The main list's elements are of type struct zone_bitmap
+ * and each of them corresponds to one zone. For each zone bitmap
+ * object there is a list of objects of type struct bm_block that
+ * represent each blocks of bitmap in which information is stored.
*
- * struct memory_bitmap contains a pointer to the main list of zone
- * bitmap objects, a struct bm_position used for browsing the bitmap,
- * and a pointer to the list of pages used for allocating all of the
- * zone bitmap objects and bitmap block objects.
+ * struct memory_bitmap contains a pointer to the main list of zone
+ * bitmap objects, a struct bm_position used for browsing the bitmap,
+ * and a pointer to the list of pages used for allocating all of the
+ * zone bitmap objects and bitmap block objects.
*
- * NOTE: It has to be possible to lay out the bitmap in memory
- * using only allocations of order 0. Additionally, the bitmap is
- * designed to work with arbitrary number of zones (this is over the
- * top for now, but let's avoid making unnecessary assumptions ;-).
+ * NOTE: It has to be possible to lay out the bitmap in memory
+ * using only allocations of order 0. Additionally, the bitmap is
+ * designed to work with arbitrary number of zones (this is over the
+ * top for now, but let's avoid making unnecessary assumptions ;-).
*
- * struct zone_bitmap contains a pointer to a list of bitmap block
- * objects and a pointer to the bitmap block object that has been
- * most recently used for setting bits. Additionally, it contains the
- * pfns that correspond to the start and end of the represented zone.
+ * struct zone_bitmap contains a pointer to a list of bitmap block
+ * objects and a pointer to the bitmap block object that has been
+ * most recently used for setting bits. Additionally, it contains the
+ * PFNs that correspond to the start and end of the represented zone.
*
- * struct bm_block contains a pointer to the memory page in which
- * information is stored (in the form of a block of bitmap)
- * It also contains the pfns that correspond to the start and end of
- * the represented memory area.
+ * struct bm_block contains a pointer to the memory page in which
+ * information is stored (in the form of a block of bitmap)
+ * It also contains the pfns that correspond to the start and end of
+ * the represented memory area.
*
- * The memory bitmap is organized as a radix tree to guarantee fast random
- * access to the bits. There is one radix tree for each zone (as returned
- * from create_mem_extents).
+ * The memory bitmap is organized as a radix tree to guarantee fast random
+ * access to the bits. There is one radix tree for each zone (as returned
+ * from create_mem_extents).
*
- * One radix tree is represented by one struct mem_zone_bm_rtree. There are
- * two linked lists for the nodes of the tree, one for the inner nodes and
- * one for the leave nodes. The linked leave nodes are used for fast linear
- * access of the memory bitmap.
+ * One radix tree is represented by one struct mem_zone_bm_rtree. There are
+ * two linked lists for the nodes of the tree, one for the inner nodes and
+ * one for the leaf nodes. The linked leaf nodes are used for fast linear
+ * access of the memory bitmap.
*
- * The struct rtree_node represents one node of the radix tree.
+ * The struct rtree_node represents one node of the radix tree.
*/
#define BM_END_OF_MAP (~0UL)
@@ -293,21 +400,21 @@ struct mem_zone_bm_rtree {
unsigned int blocks; /* Number of Bitmap Blocks */
};
-/* strcut bm_position is used for browsing memory bitmaps */
+/* struct bm_position is used for browsing memory bitmaps */
struct bm_position {
struct mem_zone_bm_rtree *zone;
struct rtree_node *node;
unsigned long node_pfn;
+ unsigned long cur_pfn;
int node_bit;
};
struct memory_bitmap {
struct list_head zones;
struct linked_page *p_list; /* list of pages used to store zone
- * bitmap objects and bitmap block
- * objects
- */
+ bitmap objects and bitmap block
+ objects */
struct bm_position cur; /* most recently used bit position */
};
@@ -321,12 +428,16 @@ struct memory_bitmap {
#endif
#define BM_RTREE_LEVEL_MASK ((1UL << BM_RTREE_LEVEL_SHIFT) - 1)
-/*
- * alloc_rtree_node - Allocate a new node and add it to the radix tree.
+/**
+ * alloc_rtree_node - Allocate a new node and add it to the radix tree.
+ * @gfp_mask: GFP mask for the allocation.
+ * @safe_needed: Get pages not used before hibernation (restore only)
+ * @ca: Pointer to a linked list of pages ("a chain") to allocate from
+ * @list: Radix Tree node to add.
*
- * This function is used to allocate inner nodes as well as the
- * leave nodes of the radix tree. It also adds the node to the
- * corresponding linked list passed in by the *list parameter.
+ * This function is used to allocate inner nodes as well as the
+ * leave nodes of the radix tree. It also adds the node to the
+ * corresponding linked list passed in by the *list parameter.
*/
static struct rtree_node *alloc_rtree_node(gfp_t gfp_mask, int safe_needed,
struct chain_allocator *ca,
@@ -347,12 +458,12 @@ static struct rtree_node *alloc_rtree_node(gfp_t gfp_mask, int safe_needed,
return node;
}
-/*
- * add_rtree_block - Add a new leave node to the radix tree
+/**
+ * add_rtree_block - Add a new leave node to the radix tree.
*
- * The leave nodes need to be allocated in order to keep the leaves
- * linked list in order. This is guaranteed by the zone->blocks
- * counter.
+ * The leave nodes need to be allocated in order to keep the leaves
+ * linked list in order. This is guaranteed by the zone->blocks
+ * counter.
*/
static int add_rtree_block(struct mem_zone_bm_rtree *zone, gfp_t gfp_mask,
int safe_needed, struct chain_allocator *ca)
@@ -417,17 +528,18 @@ static int add_rtree_block(struct mem_zone_bm_rtree *zone, gfp_t gfp_mask,
static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone,
int clear_nosave_free);
-/*
- * create_zone_bm_rtree - create a radix tree for one zone
+/**
+ * create_zone_bm_rtree - Create a radix tree for one zone.
*
- * Allocated the mem_zone_bm_rtree structure and initializes it.
- * This function also allocated and builds the radix tree for the
- * zone.
+ * Allocated the mem_zone_bm_rtree structure and initializes it.
+ * This function also allocated and builds the radix tree for the
+ * zone.
*/
-static struct mem_zone_bm_rtree *
-create_zone_bm_rtree(gfp_t gfp_mask, int safe_needed,
- struct chain_allocator *ca,
- unsigned long start, unsigned long end)
+static struct mem_zone_bm_rtree *create_zone_bm_rtree(gfp_t gfp_mask,
+ int safe_needed,
+ struct chain_allocator *ca,
+ unsigned long start,
+ unsigned long end)
{
struct mem_zone_bm_rtree *zone;
unsigned int i, nr_blocks;
@@ -454,12 +566,12 @@ create_zone_bm_rtree(gfp_t gfp_mask, int safe_needed,
return zone;
}
-/*
- * free_zone_bm_rtree - Free the memory of the radix tree
+/**
+ * free_zone_bm_rtree - Free the memory of the radix tree.
*
- * Free all node pages of the radix tree. The mem_zone_bm_rtree
- * structure itself is not freed here nor are the rtree_node
- * structs.
+ * Free all node pages of the radix tree. The mem_zone_bm_rtree
+ * structure itself is not freed here nor are the rtree_node
+ * structs.
*/
static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone,
int clear_nosave_free)
@@ -480,6 +592,7 @@ static void memory_bm_position_reset(struct memory_bitmap *bm)
bm->cur.node = list_entry(bm->cur.zone->leaves.next,
struct rtree_node, list);
bm->cur.node_pfn = 0;
+ bm->cur.cur_pfn = BM_END_OF_MAP;
bm->cur.node_bit = 0;
}
@@ -492,8 +605,8 @@ struct mem_extent {
};
/**
- * free_mem_extents - free a list of memory extents
- * @list - list of extents to empty
+ * free_mem_extents - Free a list of memory extents.
+ * @list: List of extents to free.
*/
static void free_mem_extents(struct list_head *list)
{
@@ -506,10 +619,11 @@ static void free_mem_extents(struct list_head *list)
}
/**
- * create_mem_extents - create a list of memory extents representing
- * contiguous ranges of PFNs
- * @list - list to put the extents into
- * @gfp_mask - mask to use for memory allocations
+ * create_mem_extents - Create a list of memory extents.
+ * @list: List to put the extents into.
+ * @gfp_mask: Mask to use for memory allocations.
+ *
+ * The extents represent contiguous ranges of PFNs.
*/
static int create_mem_extents(struct list_head *list, gfp_t gfp_mask)
{
@@ -565,10 +679,10 @@ static int create_mem_extents(struct list_head *list, gfp_t gfp_mask)
}
/**
- * memory_bm_create - allocate memory for a memory bitmap
- */
-static int
-memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
+ * memory_bm_create - Allocate memory for a memory bitmap.
+ */
+static int memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask,
+ int safe_needed)
{
struct chain_allocator ca;
struct list_head mem_extents;
@@ -607,8 +721,9 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
}
/**
- * memory_bm_free - free memory occupied by the memory bitmap @bm
- */
+ * memory_bm_free - Free memory occupied by the memory bitmap.
+ * @bm: Memory bitmap.
+ */
static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
{
struct mem_zone_bm_rtree *zone;
@@ -622,14 +737,13 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
}
/**
- * memory_bm_find_bit - Find the bit for pfn in the memory
- * bitmap
+ * memory_bm_find_bit - Find the bit for a given PFN in a memory bitmap.
+ *
+ * Find the bit in memory bitmap @bm that corresponds to the given PFN.
+ * The cur.zone, cur.block and cur.node_pfn members of @bm are updated.
*
- * Find the bit in the bitmap @bm that corresponds to given pfn.
- * The cur.zone, cur.block and cur.node_pfn member of @bm are
- * updated.
- * It walks the radix tree to find the page which contains the bit for
- * pfn and returns the bit position in **addr and *bit_nr.
+ * Walk the radix tree to find the page containing the bit that represents @pfn
+ * and return the position of the bit in @addr and @bit_nr.
*/
static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
void **addr, unsigned int *bit_nr)
@@ -658,12 +772,18 @@ static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
zone_found:
/*
- * We have a zone. Now walk the radix tree to find the leave
- * node for our pfn.
+ * We have found the zone. Now walk the radix tree to find the leaf node
+ * for our PFN.
*/
+ /*
+ * If the zone we wish to scan is the current zone and the
+ * pfn falls into the current node then we do not need to walk
+ * the tree.
+ */
node = bm->cur.node;
- if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn)
+ if (zone == bm->cur.zone &&
+ ((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn)
goto node_found;
node = zone->rtree;
@@ -683,6 +803,7 @@ node_found:
bm->cur.zone = zone;
bm->cur.node = node;
bm->cur.node_pfn = (pfn - zone->start_pfn) & ~BM_BLOCK_MASK;
+ bm->cur.cur_pfn = pfn;
/* Set return values */
*addr = node->data;
@@ -734,6 +855,11 @@ static void memory_bm_clear_current(struct memory_bitmap *bm)
clear_bit(bit, bm->cur.node->data);
}
+static unsigned long memory_bm_get_current(struct memory_bitmap *bm)
+{
+ return bm->cur.cur_pfn;
+}
+
static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn)
{
void *addr;
@@ -754,20 +880,20 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn)
}
/*
- * rtree_next_node - Jumps to the next leave node
+ * rtree_next_node - Jump to the next leaf node.
*
- * Sets the position to the beginning of the next node in the
- * memory bitmap. This is either the next node in the current
- * zone's radix tree or the first node in the radix tree of the
- * next zone.
+ * Set the position to the beginning of the next node in the
+ * memory bitmap. This is either the next node in the current
+ * zone's radix tree or the first node in the radix tree of the
+ * next zone.
*
- * Returns true if there is a next node, false otherwise.
+ * Return true if there is a next node, false otherwise.
*/
static bool rtree_next_node(struct memory_bitmap *bm)
{
- bm->cur.node = list_entry(bm->cur.node->list.next,
- struct rtree_node, list);
- if (&bm->cur.node->list != &bm->cur.zone->leaves) {
+ if (!list_is_last(&bm->cur.node->list, &bm->cur.zone->leaves)) {
+ bm->cur.node = list_entry(bm->cur.node->list.next,
+ struct rtree_node, list);
bm->cur.node_pfn += BM_BITS_PER_BLOCK;
bm->cur.node_bit = 0;
touch_softlockup_watchdog();
@@ -775,9 +901,9 @@ static bool rtree_next_node(struct memory_bitmap *bm)
}
/* No more nodes, goto next zone */
- bm->cur.zone = list_entry(bm->cur.zone->list.next,
+ if (!list_is_last(&bm->cur.zone->list, &bm->zones)) {
+ bm->cur.zone = list_entry(bm->cur.zone->list.next,
struct mem_zone_bm_rtree, list);
- if (&bm->cur.zone->list != &bm->zones) {
bm->cur.node = list_entry(bm->cur.zone->leaves.next,
struct rtree_node, list);
bm->cur.node_pfn = 0;
@@ -790,14 +916,15 @@ static bool rtree_next_node(struct memory_bitmap *bm)
}
/**
- * memory_bm_rtree_next_pfn - Find the next set bit in the bitmap @bm
+ * memory_bm_next_pfn - Find the next set bit in a memory bitmap.
+ * @bm: Memory bitmap.
*
- * Starting from the last returned position this function searches
- * for the next set bit in the memory bitmap and returns its
- * number. If no more bit is set BM_END_OF_MAP is returned.
+ * Starting from the last returned position this function searches for the next
+ * set bit in @bm and returns the PFN represented by it. If no more bits are
+ * set, BM_END_OF_MAP is returned.
*
- * It is required to run memory_bm_position_reset() before the
- * first call to this function.
+ * It is required to run memory_bm_position_reset() before the first call to
+ * this function for the given memory bitmap.
*/
static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
{
@@ -812,18 +939,19 @@ static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
if (bit < bits) {
pfn = bm->cur.zone->start_pfn + bm->cur.node_pfn + bit;
bm->cur.node_bit = bit + 1;
+ bm->cur.cur_pfn = pfn;
return pfn;
}
} while (rtree_next_node(bm));
+ bm->cur.cur_pfn = BM_END_OF_MAP;
return BM_END_OF_MAP;
}
-/**
- * This structure represents a range of page frames the contents of which
- * should not be saved during the suspend.
+/*
+ * This structure represents a range of page frames the contents of which
+ * should not be saved during hibernation.
*/
-
struct nosave_region {
struct list_head list;
unsigned long start_pfn;
@@ -832,15 +960,41 @@ struct nosave_region {
static LIST_HEAD(nosave_regions);
+static void recycle_zone_bm_rtree(struct mem_zone_bm_rtree *zone)
+{
+ struct rtree_node *node;
+
+ list_for_each_entry(node, &zone->nodes, list)
+ recycle_safe_page(node->data);
+
+ list_for_each_entry(node, &zone->leaves, list)
+ recycle_safe_page(node->data);
+}
+
+static void memory_bm_recycle(struct memory_bitmap *bm)
+{
+ struct mem_zone_bm_rtree *zone;
+ struct linked_page *p_list;
+
+ list_for_each_entry(zone, &bm->zones, list)
+ recycle_zone_bm_rtree(zone);
+
+ p_list = bm->p_list;
+ while (p_list) {
+ struct linked_page *lp = p_list;
+
+ p_list = lp->next;
+ recycle_safe_page(lp);
+ }
+}
+
/**
- * register_nosave_region - register a range of page frames the contents
- * of which should not be saved during the suspend (to be used in the early
- * initialization code)
+ * register_nosave_region - Register a region of unsaveable memory.
+ *
+ * Register a range of page frames the contents of which should not be saved
+ * during hibernation (to be used in the early initialization code).
*/
-
-void __init
-__register_nosave_region(unsigned long start_pfn, unsigned long end_pfn,
- int use_kmalloc)
+void __init register_nosave_region(unsigned long start_pfn, unsigned long end_pfn)
{
struct nosave_region *region;
@@ -856,18 +1010,14 @@ __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn,
goto Report;
}
}
- if (use_kmalloc) {
- /* during init, this shouldn't fail */
- region = kmalloc(sizeof(struct nosave_region), GFP_KERNEL);
- BUG_ON(!region);
- } else
- /* This allocation cannot fail */
- region = memblock_virt_alloc(sizeof(struct nosave_region), 0);
+ /* This allocation cannot fail */
+ region = memblock_alloc_or_panic(sizeof(struct nosave_region),
+ SMP_CACHE_BYTES);
region->start_pfn = start_pfn;
region->end_pfn = end_pfn;
list_add_tail(&region->list, &nosave_regions);
Report:
- printk(KERN_INFO "PM: Registered nosave memory: [mem %#010llx-%#010llx]\n",
+ pr_info("Registered nosave memory: [mem %#010llx-%#010llx]\n",
(unsigned long long) start_pfn << PAGE_SHIFT,
((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
}
@@ -923,10 +1073,12 @@ static void swsusp_unset_page_forbidden(struct page *page)
}
/**
- * mark_nosave_pages - set bits corresponding to the page frames the
- * contents of which should not be saved in a given bitmap.
+ * mark_nosave_pages - Mark pages that should not be saved.
+ * @bm: Memory bitmap.
+ *
+ * Set the bits in @bm that correspond to the page frames the contents of which
+ * should not be saved.
*/
-
static void mark_nosave_pages(struct memory_bitmap *bm)
{
struct nosave_region *region;
@@ -937,36 +1089,35 @@ static void mark_nosave_pages(struct memory_bitmap *bm)
list_for_each_entry(region, &nosave_regions, list) {
unsigned long pfn;
- pr_debug("PM: Marking nosave pages: [mem %#010llx-%#010llx]\n",
+ pr_debug("Marking nosave pages: [mem %#010llx-%#010llx]\n",
(unsigned long long) region->start_pfn << PAGE_SHIFT,
((unsigned long long) region->end_pfn << PAGE_SHIFT)
- 1);
- for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++)
- if (pfn_valid(pfn)) {
- /*
- * It is safe to ignore the result of
- * mem_bm_set_bit_check() here, since we won't
- * touch the PFNs for which the error is
- * returned anyway.
- */
- mem_bm_set_bit_check(bm, pfn);
- }
+ for_each_valid_pfn(pfn, region->start_pfn, region->end_pfn) {
+ /*
+ * It is safe to ignore the result of
+ * mem_bm_set_bit_check() here, since we won't
+ * touch the PFNs for which the error is
+ * returned anyway.
+ */
+ mem_bm_set_bit_check(bm, pfn);
+ }
}
}
/**
- * create_basic_memory_bitmaps - create bitmaps needed for marking page
- * frames that should not be saved and free page frames. The pointers
- * forbidden_pages_map and free_pages_map are only modified if everything
- * goes well, because we don't want the bits to be used before both bitmaps
- * are set up.
+ * create_basic_memory_bitmaps - Create bitmaps to hold basic page information.
+ *
+ * Create bitmaps needed for marking page frames that should not be saved and
+ * free page frames. The forbidden_pages_map and free_pages_map pointers are
+ * only modified if everything goes well, because we don't want the bits to be
+ * touched before both bitmaps are set up.
*/
-
int create_basic_memory_bitmaps(void)
{
struct memory_bitmap *bm1, *bm2;
- int error = 0;
+ int error;
if (forbidden_pages_map && free_pages_map)
return 0;
@@ -993,26 +1144,26 @@ int create_basic_memory_bitmaps(void)
free_pages_map = bm2;
mark_nosave_pages(forbidden_pages_map);
- pr_debug("PM: Basic memory bitmaps created\n");
+ pr_debug("Basic memory bitmaps created\n");
return 0;
Free_second_object:
kfree(bm2);
Free_first_bitmap:
- memory_bm_free(bm1, PG_UNSAFE_CLEAR);
+ memory_bm_free(bm1, PG_UNSAFE_CLEAR);
Free_first_object:
kfree(bm1);
return -ENOMEM;
}
/**
- * free_basic_memory_bitmaps - free memory bitmaps allocated by
- * create_basic_memory_bitmaps(). The auxiliary pointers are necessary
- * so that the bitmaps themselves are not referred to while they are being
- * freed.
+ * free_basic_memory_bitmaps - Free memory bitmaps holding basic information.
+ *
+ * Free memory bitmaps allocated by create_basic_memory_bitmaps(). The
+ * auxiliary pointers are necessary so that the bitmaps themselves are not
+ * referred to while they are being freed.
*/
-
void free_basic_memory_bitmaps(void)
{
struct memory_bitmap *bm1, *bm2;
@@ -1029,15 +1180,47 @@ void free_basic_memory_bitmaps(void)
memory_bm_free(bm2, PG_UNSAFE_CLEAR);
kfree(bm2);
- pr_debug("PM: Basic memory bitmaps freed\n");
+ pr_debug("Basic memory bitmaps freed\n");
+}
+
+static void clear_or_poison_free_page(struct page *page)
+{
+ if (page_poisoning_enabled_static())
+ __kernel_poison_pages(page, 1);
+ else if (want_init_on_free())
+ clear_highpage(page);
+}
+
+void clear_or_poison_free_pages(void)
+{
+ struct memory_bitmap *bm = free_pages_map;
+ unsigned long pfn;
+
+ if (WARN_ON(!(free_pages_map)))
+ return;
+
+ if (page_poisoning_enabled() || want_init_on_free()) {
+ memory_bm_position_reset(bm);
+ pfn = memory_bm_next_pfn(bm);
+ while (pfn != BM_END_OF_MAP) {
+ if (pfn_valid(pfn))
+ clear_or_poison_free_page(pfn_to_page(pfn));
+
+ pfn = memory_bm_next_pfn(bm);
+ }
+ memory_bm_position_reset(bm);
+ pr_info("free pages cleared after restore\n");
+ }
}
/**
- * snapshot_additional_pages - estimate the number of additional pages
- * be needed for setting up the suspend image data structures for given
- * zone (usually the returned value is greater than the exact number)
+ * snapshot_additional_pages - Estimate the number of extra pages needed.
+ * @zone: Memory zone to carry out the computation for.
+ *
+ * Estimate the number of additional pages needed for setting up a hibernation
+ * image data structures for @zone (usually, the returned value is greater than
+ * the exact number).
*/
-
unsigned int snapshot_additional_pages(struct zone *zone)
{
unsigned int rtree, nodes;
@@ -1053,12 +1236,63 @@ unsigned int snapshot_additional_pages(struct zone *zone)
return 2 * rtree;
}
+/*
+ * Touch the watchdog for every WD_PAGE_COUNT pages.
+ */
+#define WD_PAGE_COUNT (128*1024)
+
+static void mark_free_pages(struct zone *zone)
+{
+ unsigned long pfn, max_zone_pfn, page_count = WD_PAGE_COUNT;
+ unsigned long flags;
+ unsigned int order, t;
+ struct page *page;
+
+ if (zone_is_empty(zone))
+ return;
+
+ spin_lock_irqsave(&zone->lock, flags);
+
+ max_zone_pfn = zone_end_pfn(zone);
+ for_each_valid_pfn(pfn, zone->zone_start_pfn, max_zone_pfn) {
+ page = pfn_to_page(pfn);
+
+ if (!--page_count) {
+ touch_nmi_watchdog();
+ page_count = WD_PAGE_COUNT;
+ }
+
+ if (page_zone(page) != zone)
+ continue;
+
+ if (!swsusp_page_is_forbidden(page))
+ swsusp_unset_page_free(page);
+ }
+
+ for_each_migratetype_order(order, t) {
+ list_for_each_entry(page,
+ &zone->free_area[order].free_list[t], buddy_list) {
+ unsigned long i;
+
+ pfn = page_to_pfn(page);
+ for (i = 0; i < (1UL << order); i++) {
+ if (!--page_count) {
+ touch_nmi_watchdog();
+ page_count = WD_PAGE_COUNT;
+ }
+ swsusp_set_page_free(pfn_to_page(pfn + i));
+ }
+ }
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
+}
+
#ifdef CONFIG_HIGHMEM
/**
- * count_free_highmem_pages - compute the total number of free highmem
- * pages, system-wide.
+ * count_free_highmem_pages - Compute the total number of free highmem pages.
+ *
+ * The returned number is system-wide.
*/
-
static unsigned int count_free_highmem_pages(void)
{
struct zone *zone;
@@ -1072,11 +1306,12 @@ static unsigned int count_free_highmem_pages(void)
}
/**
- * saveable_highmem_page - Determine whether a highmem page should be
- * included in the suspend image.
+ * saveable_highmem_page - Check if a highmem page is saveable.
*
- * We should save the page if it isn't Nosave or NosaveFree, or Reserved,
- * and it isn't a part of a free chunk of pages.
+ * Determine whether a highmem page should be included in a hibernation image.
+ *
+ * We should save the page if it isn't Nosave or NosaveFree, or Reserved,
+ * and it isn't part of a free chunk of pages.
*/
static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
{
@@ -1085,14 +1320,16 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
if (!pfn_valid(pfn))
return NULL;
- page = pfn_to_page(pfn);
- if (page_zone(page) != zone)
+ page = pfn_to_online_page(pfn);
+ if (!page || page_zone(page) != zone)
return NULL;
BUG_ON(!PageHighMem(page));
- if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page) ||
- PageReserved(page))
+ if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page))
+ return NULL;
+
+ if (PageReserved(page) || PageOffline(page))
return NULL;
if (page_is_guard(page))
@@ -1102,10 +1339,8 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
}
/**
- * count_highmem_pages - compute the total number of saveable highmem
- * pages.
+ * count_highmem_pages - Compute the total number of saveable highmem pages.
*/
-
static unsigned int count_highmem_pages(void)
{
struct zone *zone;
@@ -1125,20 +1360,17 @@ static unsigned int count_highmem_pages(void)
}
return n;
}
-#else
-static inline void *saveable_highmem_page(struct zone *z, unsigned long p)
-{
- return NULL;
-}
#endif /* CONFIG_HIGHMEM */
/**
- * saveable_page - Determine whether a non-highmem page should be included
- * in the suspend image.
+ * saveable_page - Check if the given page is saveable.
+ *
+ * Determine whether a non-highmem page should be included in a hibernation
+ * image.
*
- * We should save the page if it isn't Nosave, and is not in the range
- * of pages statically defined as 'unsaveable', and it isn't a part of
- * a free chunk of pages.
+ * We should save the page if it isn't Nosave, and is not in the range
+ * of pages statically defined as 'unsaveable', and it isn't part of
+ * a free chunk of pages.
*/
static struct page *saveable_page(struct zone *zone, unsigned long pfn)
{
@@ -1147,8 +1379,8 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn)
if (!pfn_valid(pfn))
return NULL;
- page = pfn_to_page(pfn);
- if (page_zone(page) != zone)
+ page = pfn_to_online_page(pfn);
+ if (!page || page_zone(page) != zone)
return NULL;
BUG_ON(PageHighMem(page));
@@ -1156,6 +1388,9 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn)
if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page))
return NULL;
+ if (PageOffline(page))
+ return NULL;
+
if (PageReserved(page)
&& (!kernel_page_present(page) || pfn_is_nosave(pfn)))
return NULL;
@@ -1167,10 +1402,8 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn)
}
/**
- * count_data_pages - compute the total number of saveable non-highmem
- * pages.
+ * count_data_pages - Compute the total number of saveable non-highmem pages.
*/
-
static unsigned int count_data_pages(void)
{
struct zone *zone;
@@ -1190,86 +1423,106 @@ static unsigned int count_data_pages(void)
return n;
}
-/* This is needed, because copy_page and memcpy are not usable for copying
- * task structs.
+/*
+ * This is needed, because copy_page and memcpy are not usable for copying
+ * task structs. Returns true if the page was filled with only zeros,
+ * otherwise false.
*/
-static inline void do_copy_page(long *dst, long *src)
+static inline bool do_copy_page(long *dst, long *src)
{
+ long z = 0;
int n;
- for (n = PAGE_SIZE / sizeof(long); n; n--)
+ for (n = PAGE_SIZE / sizeof(long); n; n--) {
+ z |= *src;
*dst++ = *src++;
+ }
+ return !z;
}
-
/**
- * safe_copy_page - check if the page we are going to copy is marked as
- * present in the kernel page tables (this always is the case if
- * CONFIG_DEBUG_PAGEALLOC is not set and in that case
- * kernel_page_present() always returns 'true').
+ * safe_copy_page - Copy a page in a safe way.
+ *
+ * Check if the page we are going to copy is marked as present in the kernel
+ * page tables. This always is the case if CONFIG_DEBUG_PAGEALLOC or
+ * CONFIG_ARCH_HAS_SET_DIRECT_MAP is not set. In that case kernel_page_present()
+ * always returns 'true'. Returns true if the page was entirely composed of
+ * zeros, otherwise it will return false.
*/
-static void safe_copy_page(void *dst, struct page *s_page)
+static bool safe_copy_page(void *dst, struct page *s_page)
{
+ bool zeros_only;
+
if (kernel_page_present(s_page)) {
- do_copy_page(dst, page_address(s_page));
+ zeros_only = do_copy_page(dst, page_address(s_page));
} else {
- kernel_map_pages(s_page, 1, 1);
- do_copy_page(dst, page_address(s_page));
- kernel_map_pages(s_page, 1, 0);
+ hibernate_map_page(s_page);
+ zeros_only = do_copy_page(dst, page_address(s_page));
+ hibernate_unmap_page(s_page);
}
+ return zeros_only;
}
-
#ifdef CONFIG_HIGHMEM
-static inline struct page *
-page_is_saveable(struct zone *zone, unsigned long pfn)
+static inline struct page *page_is_saveable(struct zone *zone, unsigned long pfn)
{
return is_highmem(zone) ?
saveable_highmem_page(zone, pfn) : saveable_page(zone, pfn);
}
-static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
+static bool copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
{
struct page *s_page, *d_page;
void *src, *dst;
+ bool zeros_only;
s_page = pfn_to_page(src_pfn);
d_page = pfn_to_page(dst_pfn);
if (PageHighMem(s_page)) {
- src = kmap_atomic(s_page);
- dst = kmap_atomic(d_page);
- do_copy_page(dst, src);
- kunmap_atomic(dst);
- kunmap_atomic(src);
+ src = kmap_local_page(s_page);
+ dst = kmap_local_page(d_page);
+ zeros_only = do_copy_page(dst, src);
+ kunmap_local(dst);
+ kunmap_local(src);
} else {
if (PageHighMem(d_page)) {
- /* Page pointed to by src may contain some kernel
+ /*
+ * The page pointed to by src may contain some kernel
* data modified by kmap_atomic()
*/
- safe_copy_page(buffer, s_page);
- dst = kmap_atomic(d_page);
+ zeros_only = safe_copy_page(buffer, s_page);
+ dst = kmap_local_page(d_page);
copy_page(dst, buffer);
- kunmap_atomic(dst);
+ kunmap_local(dst);
} else {
- safe_copy_page(page_address(d_page), s_page);
+ zeros_only = safe_copy_page(page_address(d_page), s_page);
}
}
+ return zeros_only;
}
#else
#define page_is_saveable(zone, pfn) saveable_page(zone, pfn)
-static inline void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
+static inline int copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
{
- safe_copy_page(page_address(pfn_to_page(dst_pfn)),
+ return safe_copy_page(page_address(pfn_to_page(dst_pfn)),
pfn_to_page(src_pfn));
}
#endif /* CONFIG_HIGHMEM */
-static void
-copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
+/*
+ * Copy data pages will copy all pages into pages pulled from the copy_bm.
+ * If a page was entirely filled with zeros it will be marked in the zero_bm.
+ *
+ * Returns the number of pages copied.
+ */
+static unsigned long copy_data_pages(struct memory_bitmap *copy_bm,
+ struct memory_bitmap *orig_bm,
+ struct memory_bitmap *zero_bm)
{
+ unsigned long copied_pages = 0;
struct zone *zone;
- unsigned long pfn;
+ unsigned long pfn, copy_pfn;
for_each_populated_zone(zone) {
unsigned long max_zone_pfn;
@@ -1282,23 +1535,34 @@ copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
}
memory_bm_position_reset(orig_bm);
memory_bm_position_reset(copy_bm);
- for(;;) {
+ copy_pfn = memory_bm_next_pfn(copy_bm);
+ for (;;) {
pfn = memory_bm_next_pfn(orig_bm);
if (unlikely(pfn == BM_END_OF_MAP))
break;
- copy_data_page(memory_bm_next_pfn(copy_bm), pfn);
+ if (copy_data_page(copy_pfn, pfn)) {
+ memory_bm_set_bit(zero_bm, pfn);
+ /* Use this copy_pfn for a page that is not full of zeros */
+ continue;
+ }
+ copied_pages++;
+ copy_pfn = memory_bm_next_pfn(copy_bm);
}
+ return copied_pages;
}
/* Total number of image pages */
static unsigned int nr_copy_pages;
/* Number of pages needed for saving the original pfns of the image pages */
static unsigned int nr_meta_pages;
+/* Number of zero pages */
+static unsigned int nr_zero_pages;
+
/*
* Numbers of normal and highmem page frames allocated for hibernation image
* before suspending devices.
*/
-unsigned int alloc_normal, alloc_highmem;
+static unsigned int alloc_normal, alloc_highmem;
/*
* Memory bitmap used for marking saveable pages (during hibernation) or
* hibernation image pages (during restore)
@@ -1314,13 +1578,15 @@ static struct memory_bitmap orig_bm;
*/
static struct memory_bitmap copy_bm;
+/* Memory bitmap which tracks which saveable pages were zero filled. */
+static struct memory_bitmap zero_bm;
+
/**
- * swsusp_free - free pages allocated for the suspend.
+ * swsusp_free - Free pages allocated for hibernation image.
*
- * Suspend pages are alocated before the atomic copy is made, so we
- * need to release them after the resume.
+ * Image pages are allocated before snapshot creation, so they need to be
+ * released after resume.
*/
-
void swsusp_free(void)
{
unsigned long fb_pfn, fr_pfn;
@@ -1351,6 +1617,7 @@ loop:
memory_bm_clear_current(forbidden_pages_map);
memory_bm_clear_current(free_pages_map);
+ hibernate_restore_unprotect_page(page_address(page));
__free_page(page);
goto loop;
}
@@ -1358,10 +1625,12 @@ loop:
out:
nr_copy_pages = 0;
nr_meta_pages = 0;
+ nr_zero_pages = 0;
restore_pblist = NULL;
buffer = NULL;
alloc_normal = 0;
alloc_highmem = 0;
+ hibernate_restore_protection_end();
}
/* Helper functions used for the shrinking of memory. */
@@ -1369,7 +1638,7 @@ out:
#define GFP_IMAGE (GFP_KERNEL | __GFP_NOWARN)
/**
- * preallocate_image_pages - Allocate a number of pages for hibernation image
+ * preallocate_image_pages - Allocate a number of pages for hibernation image.
* @nr_pages: Number of page frames to allocate.
* @mask: GFP flags to use for the allocation.
*
@@ -1419,18 +1688,16 @@ static unsigned long preallocate_image_highmem(unsigned long nr_pages)
}
/**
- * __fraction - Compute (an approximation of) x * (multiplier / base)
+ * __fraction - Compute (an approximation of) x * (multiplier / base).
*/
static unsigned long __fraction(u64 x, u64 multiplier, u64 base)
{
- x *= multiplier;
- do_div(x, base);
- return (unsigned long)x;
+ return div64_u64(x * multiplier, base);
}
static unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
- unsigned long highmem,
- unsigned long total)
+ unsigned long highmem,
+ unsigned long total)
{
unsigned long alloc = __fraction(nr_pages, highmem, total);
@@ -1443,15 +1710,15 @@ static inline unsigned long preallocate_image_highmem(unsigned long nr_pages)
}
static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
- unsigned long highmem,
- unsigned long total)
+ unsigned long highmem,
+ unsigned long total)
{
return 0;
}
#endif /* CONFIG_HIGHMEM */
/**
- * free_unnecessary_pages - Release preallocated pages not needed for the image
+ * free_unnecessary_pages - Release preallocated pages not needed for the image.
*/
static unsigned long free_unnecessary_pages(void)
{
@@ -1505,7 +1772,7 @@ static unsigned long free_unnecessary_pages(void)
}
/**
- * minimum_image_size - Estimate the minimum acceptable size of an image
+ * minimum_image_size - Estimate the minimum acceptable size of an image.
* @saveable: Number of saveable pages in the system.
*
* We want to avoid attempting to free too much memory too hard, so estimate the
@@ -1517,37 +1784,35 @@ static unsigned long free_unnecessary_pages(void)
* [number of saveable pages] - [number of pages that can be freed in theory]
*
* where the second term is the sum of (1) reclaimable slab pages, (2) active
- * and (3) inactive anonymous pages, (4) active and (5) inactive file pages,
- * minus mapped file pages.
+ * and (3) inactive anonymous pages, (4) active and (5) inactive file pages.
*/
static unsigned long minimum_image_size(unsigned long saveable)
{
unsigned long size;
- size = global_page_state(NR_SLAB_RECLAIMABLE)
- + global_page_state(NR_ACTIVE_ANON)
- + global_page_state(NR_INACTIVE_ANON)
- + global_page_state(NR_ACTIVE_FILE)
- + global_page_state(NR_INACTIVE_FILE)
- - global_page_state(NR_FILE_MAPPED);
+ size = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B)
+ + global_node_page_state(NR_ACTIVE_ANON)
+ + global_node_page_state(NR_INACTIVE_ANON)
+ + global_node_page_state(NR_ACTIVE_FILE)
+ + global_node_page_state(NR_INACTIVE_FILE);
return saveable <= size ? 0 : saveable - size;
}
/**
- * hibernate_preallocate_memory - Preallocate memory for hibernation image
+ * hibernate_preallocate_memory - Preallocate memory for hibernation image.
*
* To create a hibernation image it is necessary to make a copy of every page
* frame in use. We also need a number of page frames to be free during
* hibernation for allocations made while saving the image and for device
* drivers, in case they need to allocate memory from their hibernation
* callbacks (these two numbers are given by PAGES_FOR_IO (which is a rough
- * estimate) and reserverd_size divided by PAGE_SIZE (which is tunable through
+ * estimate) and reserved_size divided by PAGE_SIZE (which is tunable through
* /sys/power/reserved_size, respectively). To make this happen, we compute the
* total number of available page frames and allocate at least
*
- * ([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2
- * + 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE)
+ * ([page frames total] - PAGES_FOR_IO - [metadata pages]) / 2
+ * - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE)
*
* of them, which corresponds to the maximum size of a hibernation image.
*
@@ -1564,19 +1829,30 @@ int hibernate_preallocate_memory(void)
ktime_t start, stop;
int error;
- printk(KERN_INFO "PM: Preallocating image memory... ");
+ pr_info("Preallocating image memory\n");
start = ktime_get();
error = memory_bm_create(&orig_bm, GFP_IMAGE, PG_ANY);
- if (error)
+ if (error) {
+ pr_err("Cannot allocate original bitmap\n");
goto err_out;
+ }
error = memory_bm_create(&copy_bm, GFP_IMAGE, PG_ANY);
- if (error)
+ if (error) {
+ pr_err("Cannot allocate copy bitmap\n");
goto err_out;
+ }
+
+ error = memory_bm_create(&zero_bm, GFP_IMAGE, PG_ANY);
+ if (error) {
+ pr_err("Cannot allocate zero bitmap\n");
+ goto err_out;
+ }
alloc_normal = 0;
alloc_highmem = 0;
+ nr_zero_pages = 0;
/* Count the number of saveable data pages. */
save_highmem = count_highmem_pages();
@@ -1601,9 +1877,6 @@ int hibernate_preallocate_memory(void)
count += highmem;
count -= totalreserve_pages;
- /* Add number of pages required for page keys (s390 only). */
- size += page_key_additional_pages(saveable);
-
/* Compute the maximum number of saveable pages to leave in memory. */
max_size = (count - (size + PAGES_FOR_IO)) / 2
- 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE);
@@ -1663,8 +1936,11 @@ int hibernate_preallocate_memory(void)
alloc -= pages;
pages += pages_highmem;
pages_highmem = preallocate_image_highmem(alloc);
- if (pages_highmem < alloc)
+ if (pages_highmem < alloc) {
+ pr_err("Image allocation is %lu pages short\n",
+ alloc - pages_highmem);
goto err_out;
+ }
pages += pages_highmem;
/*
* size is the desired number of saveable pages to leave in
@@ -1695,23 +1971,23 @@ int hibernate_preallocate_memory(void)
out:
stop = ktime_get();
- printk(KERN_CONT "done (allocated %lu pages)\n", pages);
+ pr_info("Allocated %lu pages for snapshot\n", pages);
swsusp_show_speed(start, stop, pages, "Allocated");
return 0;
err_out:
- printk(KERN_CONT "\n");
swsusp_free();
return -ENOMEM;
}
#ifdef CONFIG_HIGHMEM
/**
- * count_pages_for_highmem - compute the number of non-highmem pages
- * that will be necessary for creating copies of highmem pages.
- */
-
+ * count_pages_for_highmem - Count non-highmem pages needed for copying highmem.
+ *
+ * Compute the number of non-highmem pages that will be necessary for creating
+ * copies of highmem pages.
+ */
static unsigned int count_pages_for_highmem(unsigned int nr_highmem)
{
unsigned int free_highmem = count_free_highmem_pages() + alloc_highmem;
@@ -1724,15 +2000,12 @@ static unsigned int count_pages_for_highmem(unsigned int nr_highmem)
return nr_highmem;
}
#else
-static unsigned int
-count_pages_for_highmem(unsigned int nr_highmem) { return 0; }
+static unsigned int count_pages_for_highmem(unsigned int nr_highmem) { return 0; }
#endif /* CONFIG_HIGHMEM */
/**
- * enough_free_mem - Make sure we have enough free memory for the
- * snapshot image.
+ * enough_free_mem - Check if there is enough free memory for the image.
*/
-
static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
{
struct zone *zone;
@@ -1743,32 +2016,33 @@ static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
free += zone_page_state(zone, NR_FREE_PAGES);
nr_pages += count_pages_for_highmem(nr_highmem);
- pr_debug("PM: Normal pages needed: %u + %u, available pages: %u\n",
- nr_pages, PAGES_FOR_IO, free);
+ pr_debug("Normal pages needed: %u + %u, available pages: %u\n",
+ nr_pages, PAGES_FOR_IO, free);
return free > nr_pages + PAGES_FOR_IO;
}
#ifdef CONFIG_HIGHMEM
/**
- * get_highmem_buffer - if there are some highmem pages in the suspend
- * image, we may need the buffer to copy them and/or load their data.
+ * get_highmem_buffer - Allocate a buffer for highmem pages.
+ *
+ * If there are some highmem pages in the hibernation image, we may need a
+ * buffer to copy them and/or load their data.
*/
-
static inline int get_highmem_buffer(int safe_needed)
{
- buffer = get_image_page(GFP_ATOMIC | __GFP_COLD, safe_needed);
+ buffer = get_image_page(GFP_ATOMIC, safe_needed);
return buffer ? 0 : -ENOMEM;
}
/**
- * alloc_highmem_image_pages - allocate some highmem pages for the image.
- * Try to allocate as many pages as needed, but if the number of free
- * highmem pages is lesser than that, allocate them all.
+ * alloc_highmem_pages - Allocate some highmem pages for the image.
+ *
+ * Try to allocate as many pages as needed, but if the number of free highmem
+ * pages is less than that, allocate them all.
*/
-
-static inline unsigned int
-alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
+static inline unsigned int alloc_highmem_pages(struct memory_bitmap *bm,
+ unsigned int nr_highmem)
{
unsigned int to_alloc = count_free_highmem_pages();
@@ -1779,7 +2053,7 @@ alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
while (to_alloc-- > 0) {
struct page *page;
- page = alloc_image_page(__GFP_HIGHMEM);
+ page = alloc_image_page(__GFP_HIGHMEM|__GFP_KSWAPD_RECLAIM);
memory_bm_set_bit(bm, page_to_pfn(page));
}
return nr_highmem;
@@ -1787,25 +2061,23 @@ alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
#else
static inline int get_highmem_buffer(int safe_needed) { return 0; }
-static inline unsigned int
-alloc_highmem_pages(struct memory_bitmap *bm, unsigned int n) { return 0; }
+static inline unsigned int alloc_highmem_pages(struct memory_bitmap *bm,
+ unsigned int n) { return 0; }
#endif /* CONFIG_HIGHMEM */
/**
- * swsusp_alloc - allocate memory for the suspend image
+ * swsusp_alloc - Allocate memory for hibernation image.
*
- * We first try to allocate as many highmem pages as there are
- * saveable highmem pages in the system. If that fails, we allocate
- * non-highmem pages for the copies of the remaining highmem ones.
+ * We first try to allocate as many highmem pages as there are
+ * saveable highmem pages in the system. If that fails, we allocate
+ * non-highmem pages for the copies of the remaining highmem ones.
*
- * In this approach it is likely that the copies of highmem pages will
- * also be located in the high memory, because of the way in which
- * copy_data_pages() works.
+ * In this approach it is likely that the copies of highmem pages will
+ * also be located in the high memory, because of the way in which
+ * copy_data_pages() works.
*/
-
-static int
-swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
- unsigned int nr_pages, unsigned int nr_highmem)
+static int swsusp_alloc(struct memory_bitmap *copy_bm,
+ unsigned int nr_pages, unsigned int nr_highmem)
{
if (nr_highmem > 0) {
if (get_highmem_buffer(PG_ANY))
@@ -1820,7 +2092,7 @@ swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
while (nr_pages-- > 0) {
struct page *page;
- page = alloc_image_page(GFP_ATOMIC | __GFP_COLD);
+ page = alloc_image_page(GFP_ATOMIC);
if (!page)
goto err_out;
memory_bm_set_bit(copy_bm, page_to_pfn(page));
@@ -1838,41 +2110,40 @@ asmlinkage __visible int swsusp_save(void)
{
unsigned int nr_pages, nr_highmem;
- printk(KERN_INFO "PM: Creating hibernation image:\n");
+ pm_deferred_pr_dbg("Creating image\n");
drain_local_pages(NULL);
nr_pages = count_data_pages();
nr_highmem = count_highmem_pages();
- printk(KERN_INFO "PM: Need to copy %u pages\n", nr_pages + nr_highmem);
+ pm_deferred_pr_dbg("Need to copy %u pages\n", nr_pages + nr_highmem);
if (!enough_free_mem(nr_pages, nr_highmem)) {
- printk(KERN_ERR "PM: Not enough free memory\n");
+ pm_deferred_pr_dbg("Not enough free memory for image creation\n");
return -ENOMEM;
}
- if (swsusp_alloc(&orig_bm, &copy_bm, nr_pages, nr_highmem)) {
- printk(KERN_ERR "PM: Memory allocation failed\n");
+ if (swsusp_alloc(&copy_bm, nr_pages, nr_highmem))
return -ENOMEM;
- }
- /* During allocating of suspend pagedir, new cold pages may appear.
+ /*
+ * During allocating of suspend pagedir, new cold pages may appear.
* Kill them.
*/
drain_local_pages(NULL);
- copy_data_pages(&copy_bm, &orig_bm);
+ nr_copy_pages = copy_data_pages(&copy_bm, &orig_bm, &zero_bm);
/*
* End of critical section. From now on, we can write to memory,
* but we should not touch disk. This specially means we must _not_
* touch swap space! Except we must write out our image of course.
*/
-
nr_pages += nr_highmem;
- nr_copy_pages = nr_pages;
+ /* We don't actually copy the zero pages */
+ nr_zero_pages = nr_pages - nr_copy_pages;
nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE);
- printk(KERN_INFO "PM: Hibernation image created (%d pages copied)\n",
- nr_pages);
+ pm_deferred_pr_dbg("Image created (%d pages copied, %d zero pages)\n",
+ nr_copy_pages, nr_zero_pages);
return 0;
}
@@ -1885,17 +2156,17 @@ static int init_header_complete(struct swsusp_info *info)
return 0;
}
-static char *check_image_kernel(struct swsusp_info *info)
+static const char *check_image_kernel(struct swsusp_info *info)
{
if (info->version_code != LINUX_VERSION_CODE)
return "kernel version";
- if (strcmp(info->uts.sysname,init_utsname()->sysname))
+ if (strcmp(info->uts.sysname, init_utsname()->sysname))
return "system type";
- if (strcmp(info->uts.release,init_utsname()->release))
+ if (strcmp(info->uts.release, init_utsname()->release))
return "kernel release";
- if (strcmp(info->uts.version,init_utsname()->version))
+ if (strcmp(info->uts.version, init_utsname()->version))
return "version";
- if (strcmp(info->uts.machine,init_utsname()->machine))
+ if (strcmp(info->uts.machine, init_utsname()->machine))
return "machine";
return NULL;
}
@@ -1917,13 +2188,22 @@ static int init_header(struct swsusp_info *info)
return init_header_complete(info);
}
+#define ENCODED_PFN_ZERO_FLAG ((unsigned long)1 << (BITS_PER_LONG - 1))
+#define ENCODED_PFN_MASK (~ENCODED_PFN_ZERO_FLAG)
+
/**
- * pack_pfns - pfns corresponding to the set bits found in the bitmap @bm
- * are stored in the array @buf[] (1 page at a time)
+ * pack_pfns - Prepare PFNs for saving.
+ * @bm: Memory bitmap.
+ * @buf: Memory buffer to store the PFNs in.
+ * @zero_bm: Memory bitmap containing PFNs of zero pages.
+ *
+ * PFNs corresponding to set bits in @bm are stored in the area of memory
+ * pointed to by @buf (1 page at a time). Pages which were filled with only
+ * zeros will have the highest bit set in the packed format to distinguish
+ * them from PFNs which will be contained in the image file.
*/
-
-static inline void
-pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
+static inline void pack_pfns(unsigned long *buf, struct memory_bitmap *bm,
+ struct memory_bitmap *zero_bm)
{
int j;
@@ -1931,28 +2211,27 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
buf[j] = memory_bm_next_pfn(bm);
if (unlikely(buf[j] == BM_END_OF_MAP))
break;
- /* Save page key for data page (s390 only). */
- page_key_read(buf + j);
+ if (memory_bm_test_bit(zero_bm, buf[j]))
+ buf[j] |= ENCODED_PFN_ZERO_FLAG;
}
}
/**
- * snapshot_read_next - used for reading the system memory snapshot.
+ * snapshot_read_next - Get the address to read the next image page from.
+ * @handle: Snapshot handle to be used for the reading.
*
- * On the first call to it @handle should point to a zeroed
- * snapshot_handle structure. The structure gets updated and a pointer
- * to it should be passed to this function every next time.
+ * On the first call, @handle should point to a zeroed snapshot_handle
+ * structure. The structure gets populated then and a pointer to it should be
+ * passed to this function every next time.
*
- * On success the function returns a positive number. Then, the caller
- * is allowed to read up to the returned number of bytes from the memory
- * location computed by the data_of() macro.
+ * On success, the function returns a positive number. Then, the caller
+ * is allowed to read up to the returned number of bytes from the memory
+ * location computed by the data_of() macro.
*
- * The function returns 0 to indicate the end of data stream condition,
- * and a negative number is returned on error. In such cases the
- * structure pointed to by @handle is not updated and should not be used
- * any more.
+ * The function returns 0 to indicate the end of the data stream condition,
+ * and negative numbers are returned on errors. If that happens, the structure
+ * pointed to by @handle is not updated and should not be used any more.
*/
-
int snapshot_read_next(struct snapshot_handle *handle)
{
if (handle->cur > nr_meta_pages + nr_copy_pages)
@@ -1975,21 +2254,22 @@ int snapshot_read_next(struct snapshot_handle *handle)
memory_bm_position_reset(&copy_bm);
} else if (handle->cur <= nr_meta_pages) {
clear_page(buffer);
- pack_pfns(buffer, &orig_bm);
+ pack_pfns(buffer, &orig_bm, &zero_bm);
} else {
struct page *page;
page = pfn_to_page(memory_bm_next_pfn(&copy_bm));
if (PageHighMem(page)) {
- /* Highmem pages are copied to the buffer,
+ /*
+ * Highmem pages are copied to the buffer,
* because we can't return with a kmapped
* highmem page (we may not be called again).
*/
void *kaddr;
- kaddr = kmap_atomic(page);
+ kaddr = kmap_local_page(page);
copy_page(buffer, kaddr);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
handle->buffer = buffer;
} else {
handle->buffer = page_address(page);
@@ -1999,75 +2279,61 @@ int snapshot_read_next(struct snapshot_handle *handle)
return PAGE_SIZE;
}
-/**
- * mark_unsafe_pages - mark the pages that cannot be used for storing
- * the image during resume, because they conflict with the pages that
- * had been used before suspend
- */
-
-static int mark_unsafe_pages(struct memory_bitmap *bm)
+static void duplicate_memory_bitmap(struct memory_bitmap *dst,
+ struct memory_bitmap *src)
{
- struct zone *zone;
- unsigned long pfn, max_zone_pfn;
+ unsigned long pfn;
- /* Clear page flags */
- for_each_populated_zone(zone) {
- max_zone_pfn = zone_end_pfn(zone);
- for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
- if (pfn_valid(pfn))
- swsusp_unset_page_free(pfn_to_page(pfn));
+ memory_bm_position_reset(src);
+ pfn = memory_bm_next_pfn(src);
+ while (pfn != BM_END_OF_MAP) {
+ memory_bm_set_bit(dst, pfn);
+ pfn = memory_bm_next_pfn(src);
}
-
- /* Mark pages that correspond to the "original" pfns as "unsafe" */
- memory_bm_position_reset(bm);
- do {
- pfn = memory_bm_next_pfn(bm);
- if (likely(pfn != BM_END_OF_MAP)) {
- if (likely(pfn_valid(pfn)))
- swsusp_set_page_free(pfn_to_page(pfn));
- else
- return -EFAULT;
- }
- } while (pfn != BM_END_OF_MAP);
-
- allocated_unsafe_pages = 0;
-
- return 0;
}
-static void
-duplicate_memory_bitmap(struct memory_bitmap *dst, struct memory_bitmap *src)
+/**
+ * mark_unsafe_pages - Mark pages that were used before hibernation.
+ *
+ * Mark the pages that cannot be used for storing the image during restoration,
+ * because they conflict with the pages that had been used before hibernation.
+ */
+static void mark_unsafe_pages(struct memory_bitmap *bm)
{
unsigned long pfn;
- memory_bm_position_reset(src);
- pfn = memory_bm_next_pfn(src);
+ /* Clear the "free"/"unsafe" bit for all PFNs */
+ memory_bm_position_reset(free_pages_map);
+ pfn = memory_bm_next_pfn(free_pages_map);
while (pfn != BM_END_OF_MAP) {
- memory_bm_set_bit(dst, pfn);
- pfn = memory_bm_next_pfn(src);
+ memory_bm_clear_current(free_pages_map);
+ pfn = memory_bm_next_pfn(free_pages_map);
}
+
+ /* Mark pages that correspond to the "original" PFNs as "unsafe" */
+ duplicate_memory_bitmap(free_pages_map, bm);
+
+ allocated_unsafe_pages = 0;
}
static int check_header(struct swsusp_info *info)
{
- char *reason;
+ const char *reason;
reason = check_image_kernel(info);
if (!reason && info->num_physpages != get_num_physpages())
reason = "memory size";
if (reason) {
- printk(KERN_ERR "PM: Image mismatch: %s\n", reason);
+ pr_err("Image mismatch: %s\n", reason);
return -EPERM;
}
return 0;
}
/**
- * load header - check the image header and copy data from it
+ * load_header - Check the image header and copy the data from it.
*/
-
-static int
-load_header(struct swsusp_info *info)
+static int load_header(struct swsusp_info *info)
{
int error;
@@ -2081,36 +2347,48 @@ load_header(struct swsusp_info *info)
}
/**
- * unpack_orig_pfns - for each element of @buf[] (1 page at a time) set
- * the corresponding bit in the memory bitmap @bm
+ * unpack_orig_pfns - Set bits corresponding to given PFNs in a memory bitmap.
+ * @bm: Memory bitmap.
+ * @buf: Area of memory containing the PFNs.
+ * @zero_bm: Memory bitmap with the zero PFNs marked.
+ *
+ * For each element of the array pointed to by @buf (1 page at a time), set the
+ * corresponding bit in @bm. If the page was originally populated with only
+ * zeros then a corresponding bit will also be set in @zero_bm.
*/
-static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
+static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm,
+ struct memory_bitmap *zero_bm)
{
+ unsigned long decoded_pfn;
+ bool zero;
int j;
for (j = 0; j < PAGE_SIZE / sizeof(long); j++) {
if (unlikely(buf[j] == BM_END_OF_MAP))
break;
- /* Extract and buffer page key for data page (s390 only). */
- page_key_memorize(buf + j);
-
- if (memory_bm_pfn_present(bm, buf[j]))
- memory_bm_set_bit(bm, buf[j]);
- else
+ zero = !!(buf[j] & ENCODED_PFN_ZERO_FLAG);
+ decoded_pfn = buf[j] & ENCODED_PFN_MASK;
+ if (pfn_valid(decoded_pfn) && memory_bm_pfn_present(bm, decoded_pfn)) {
+ memory_bm_set_bit(bm, decoded_pfn);
+ if (zero) {
+ memory_bm_set_bit(zero_bm, decoded_pfn);
+ nr_zero_pages++;
+ }
+ } else {
+ if (!pfn_valid(decoded_pfn))
+ pr_err(FW_BUG "Memory map mismatch at 0x%llx after hibernation\n",
+ (unsigned long long)PFN_PHYS(decoded_pfn));
return -EFAULT;
+ }
}
return 0;
}
-/* List of "safe" pages that may be used to store data loaded from the suspend
- * image
- */
-static struct linked_page *safe_pages_list;
-
#ifdef CONFIG_HIGHMEM
-/* struct highmem_pbe is used for creating the list of highmem pages that
+/*
+ * struct highmem_pbe is used for creating the list of highmem pages that
* should be restored atomically during the resume from disk, because the page
* frames they have occupied before the suspend are in use.
*/
@@ -2120,7 +2398,8 @@ struct highmem_pbe {
struct highmem_pbe *next;
};
-/* List of highmem PBEs needed for restoring the highmem pages that were
+/*
+ * List of highmem PBEs needed for restoring the highmem pages that were
* allocated before the suspend and included in the suspend image, but have
* also been allocated by the "resume" kernel, so their contents cannot be
* written directly to their "original" page frames.
@@ -2128,11 +2407,11 @@ struct highmem_pbe {
static struct highmem_pbe *highmem_pblist;
/**
- * count_highmem_image_pages - compute the number of highmem pages in the
- * suspend image. The bits in the memory bitmap @bm that correspond to the
- * image pages are assumed to be set.
+ * count_highmem_image_pages - Compute the number of highmem pages in the image.
+ * @bm: Memory bitmap.
+ *
+ * The bits in @bm that correspond to image pages are assumed to be set.
*/
-
static unsigned int count_highmem_image_pages(struct memory_bitmap *bm)
{
unsigned long pfn;
@@ -2149,24 +2428,25 @@ static unsigned int count_highmem_image_pages(struct memory_bitmap *bm)
return cnt;
}
-/**
- * prepare_highmem_image - try to allocate as many highmem pages as
- * there are highmem image pages (@nr_highmem_p points to the variable
- * containing the number of highmem image pages). The pages that are
- * "safe" (ie. will not be overwritten when the suspend image is
- * restored) have the corresponding bits set in @bm (it must be
- * unitialized).
- *
- * NOTE: This function should not be called if there are no highmem
- * image pages.
- */
-
static unsigned int safe_highmem_pages;
static struct memory_bitmap *safe_highmem_bm;
-static int
-prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
+/**
+ * prepare_highmem_image - Allocate memory for loading highmem data from image.
+ * @bm: Pointer to an uninitialized memory bitmap structure.
+ * @nr_highmem_p: Pointer to the number of highmem image pages.
+ *
+ * Try to allocate as many highmem pages as there are highmem image pages
+ * (@nr_highmem_p points to the variable containing the number of highmem image
+ * pages). The pages that are "safe" (ie. will not be overwritten when the
+ * hibernation image is restored entirely) have the corresponding bits set in
+ * @bm (it must be uninitialized).
+ *
+ * NOTE: This function should not be called if there are no highmem image pages.
+ */
+static int prepare_highmem_image(struct memory_bitmap *bm,
+ unsigned int *nr_highmem_p)
{
unsigned int to_alloc;
@@ -2201,39 +2481,42 @@ prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
return 0;
}
+static struct page *last_highmem_page;
+
/**
- * get_highmem_page_buffer - for given highmem image page find the buffer
- * that suspend_write_next() should set for its caller to write to.
+ * get_highmem_page_buffer - Prepare a buffer to store a highmem image page.
+ *
+ * For a given highmem image page get a buffer that suspend_write_next() should
+ * return to its caller to write to.
*
- * If the page is to be saved to its "original" page frame or a copy of
- * the page is to be made in the highmem, @buffer is returned. Otherwise,
- * the copy of the page is to be made in normal memory, so the address of
- * the copy is returned.
+ * If the page is to be saved to its "original" page frame or a copy of
+ * the page is to be made in the highmem, @buffer is returned. Otherwise,
+ * the copy of the page is to be made in normal memory, so the address of
+ * the copy is returned.
*
- * If @buffer is returned, the caller of suspend_write_next() will write
- * the page's contents to @buffer, so they will have to be copied to the
- * right location on the next call to suspend_write_next() and it is done
- * with the help of copy_last_highmem_page(). For this purpose, if
- * @buffer is returned, @last_highmem page is set to the page to which
- * the data will have to be copied from @buffer.
+ * If @buffer is returned, the caller of suspend_write_next() will write
+ * the page's contents to @buffer, so they will have to be copied to the
+ * right location on the next call to suspend_write_next() and it is done
+ * with the help of copy_last_highmem_page(). For this purpose, if
+ * @buffer is returned, @last_highmem_page is set to the page to which
+ * the data will have to be copied from @buffer.
*/
-
-static struct page *last_highmem_page;
-
-static void *
-get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
+static void *get_highmem_page_buffer(struct page *page,
+ struct chain_allocator *ca)
{
struct highmem_pbe *pbe;
void *kaddr;
if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page)) {
- /* We have allocated the "original" page frame and we can
+ /*
+ * We have allocated the "original" page frame and we can
* use it directly to store the loaded page.
*/
last_highmem_page = page;
return buffer;
}
- /* The "original" page frame has not been allocated and we have to
+ /*
+ * The "original" page frame has not been allocated and we have to
* use a "safe" page frame to store the loaded page.
*/
pbe = chain_alloc(ca, sizeof(struct highmem_pbe));
@@ -2253,8 +2536,9 @@ get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
pbe->copy_page = tmp;
} else {
/* Copy of the page will be stored in normal memory */
- kaddr = safe_pages_list;
- safe_pages_list = safe_pages_list->next;
+ kaddr = __get_safe_page(ca->gfp_mask);
+ if (!kaddr)
+ return ERR_PTR(-ENOMEM);
pbe->copy_page = virt_to_page(kaddr);
}
pbe->next = highmem_pblist;
@@ -2263,19 +2547,20 @@ get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
}
/**
- * copy_last_highmem_page - copy the contents of a highmem image from
- * @buffer, where the caller of snapshot_write_next() has place them,
- * to the right location represented by @last_highmem_page .
+ * copy_last_highmem_page - Copy most the most recent highmem image page.
+ *
+ * Copy the contents of a highmem image from @buffer, where the caller of
+ * snapshot_write_next() has stored them, to the right location represented by
+ * @last_highmem_page .
*/
-
static void copy_last_highmem_page(void)
{
if (last_highmem_page) {
void *dst;
- dst = kmap_atomic(last_highmem_page);
+ dst = kmap_local_page(last_highmem_page);
copy_page(dst, buffer);
- kunmap_atomic(dst);
+ kunmap_local(dst);
last_highmem_page = NULL;
}
}
@@ -2294,17 +2579,13 @@ static inline void free_highmem_data(void)
free_image_page(buffer, PG_UNSAFE_CLEAR);
}
#else
-static unsigned int
-count_highmem_image_pages(struct memory_bitmap *bm) { return 0; }
+static unsigned int count_highmem_image_pages(struct memory_bitmap *bm) { return 0; }
-static inline int
-prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
-{
- return 0;
-}
+static inline int prepare_highmem_image(struct memory_bitmap *bm,
+ unsigned int *nr_highmem_p) { return 0; }
-static inline void *
-get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
+static inline void *get_highmem_page_buffer(struct page *page,
+ struct chain_allocator *ca)
{
return ERR_PTR(-EINVAL);
}
@@ -2314,27 +2595,33 @@ static inline int last_highmem_page_copied(void) { return 1; }
static inline void free_highmem_data(void) {}
#endif /* CONFIG_HIGHMEM */
+#define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe))
+
/**
- * prepare_image - use the memory bitmap @bm to mark the pages that will
- * be overwritten in the process of restoring the system memory state
- * from the suspend image ("unsafe" pages) and allocate memory for the
- * image.
+ * prepare_image - Make room for loading hibernation image.
+ * @new_bm: Uninitialized memory bitmap structure.
+ * @bm: Memory bitmap with unsafe pages marked.
+ * @zero_bm: Memory bitmap containing the zero pages.
+ *
+ * Use @bm to mark the pages that will be overwritten in the process of
+ * restoring the system memory state from the suspend image ("unsafe" pages)
+ * and allocate memory for the image.
*
- * The idea is to allocate a new memory bitmap first and then allocate
- * as many pages as needed for the image data, but not to assign these
- * pages to specific tasks initially. Instead, we just mark them as
- * allocated and create a lists of "safe" pages that will be used
- * later. On systems with high memory a list of "safe" highmem pages is
- * also created.
+ * The idea is to allocate a new memory bitmap first and then allocate
+ * as many pages as needed for image data, but without specifying what those
+ * pages will be used for just yet. Instead, we mark them all as allocated and
+ * create a lists of "safe" pages to be used later. On systems with high
+ * memory a list of "safe" highmem pages is created too.
+ *
+ * Because it was not known which pages were unsafe when @zero_bm was created,
+ * make a copy of it and recreate it within safe pages.
*/
-
-#define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe))
-
-static int
-prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
+static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm,
+ struct memory_bitmap *zero_bm)
{
unsigned int nr_pages, nr_highmem;
- struct linked_page *sp_list, *lp;
+ struct memory_bitmap tmp;
+ struct linked_page *lp;
int error;
/* If there is no highmem, the buffer will not be necessary */
@@ -2342,9 +2629,7 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
buffer = NULL;
nr_highmem = count_highmem_image_pages(bm);
- error = mark_unsafe_pages(bm);
- if (error)
- goto Free;
+ mark_unsafe_pages(bm);
error = memory_bm_create(new_bm, GFP_ATOMIC, PG_SAFE);
if (error)
@@ -2352,20 +2637,39 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
duplicate_memory_bitmap(new_bm, bm);
memory_bm_free(bm, PG_UNSAFE_KEEP);
+
+ /* Make a copy of zero_bm so it can be created in safe pages */
+ error = memory_bm_create(&tmp, GFP_ATOMIC, PG_SAFE);
+ if (error)
+ goto Free;
+
+ duplicate_memory_bitmap(&tmp, zero_bm);
+ memory_bm_free(zero_bm, PG_UNSAFE_KEEP);
+
+ /* Recreate zero_bm in safe pages */
+ error = memory_bm_create(zero_bm, GFP_ATOMIC, PG_SAFE);
+ if (error)
+ goto Free;
+
+ duplicate_memory_bitmap(zero_bm, &tmp);
+ memory_bm_free(&tmp, PG_UNSAFE_CLEAR);
+ /* At this point zero_bm is in safe pages and it can be used for restoring. */
+
if (nr_highmem > 0) {
error = prepare_highmem_image(bm, &nr_highmem);
if (error)
goto Free;
}
- /* Reserve some safe pages for potential later use.
+ /*
+ * Reserve some safe pages for potential later use.
*
* NOTE: This way we make sure there will be enough safe pages for the
* chain_alloc() in get_buffer(). It is a bit wasteful, but
* nr_copy_pages cannot be greater than 50% of the memory anyway.
+ *
+ * nr_copy_pages cannot be less than allocated_unsafe_pages too.
*/
- sp_list = NULL;
- /* nr_copy_pages cannot be lesser than allocated_unsafe_pages */
- nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages;
+ nr_pages = (nr_zero_pages + nr_copy_pages) - nr_highmem - allocated_unsafe_pages;
nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE);
while (nr_pages > 0) {
lp = get_image_page(GFP_ATOMIC, PG_SAFE);
@@ -2373,13 +2677,12 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
error = -ENOMEM;
goto Free;
}
- lp->next = sp_list;
- sp_list = lp;
+ lp->next = safe_pages_list;
+ safe_pages_list = lp;
nr_pages--;
}
/* Preallocate memory for the image */
- safe_pages_list = NULL;
- nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages;
+ nr_pages = (nr_zero_pages + nr_copy_pages) - nr_highmem - allocated_unsafe_pages;
while (nr_pages > 0) {
lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC);
if (!lp) {
@@ -2396,12 +2699,6 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
swsusp_set_page_free(virt_to_page(lp));
nr_pages--;
}
- /* Free the reserved safe pages so that chain_alloc() can use them */
- while (sp_list) {
- lp = sp_list->next;
- free_image_page(sp_list, PG_UNSAFE_CLEAR);
- sp_list = lp;
- }
return 0;
Free:
@@ -2410,10 +2707,11 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
}
/**
- * get_buffer - compute the address that snapshot_write_next() should
- * set for its caller to write to.
+ * get_buffer - Get the address to store the next image data page.
+ *
+ * Get the address that snapshot_write_next() should return to its caller to
+ * write to.
*/
-
static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
{
struct pbe *pbe;
@@ -2428,12 +2726,14 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
return get_highmem_page_buffer(page, ca);
if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page))
- /* We have allocated the "original" page frame and we can
+ /*
+ * We have allocated the "original" page frame and we can
* use it directly to store the loaded page.
*/
return page_address(page);
- /* The "original" page frame has not been allocated and we have to
+ /*
+ * The "original" page frame has not been allocated and we have to
* use a "safe" page frame to store the loaded page.
*/
pbe = chain_alloc(ca, sizeof(struct pbe));
@@ -2442,41 +2742,40 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
return ERR_PTR(-ENOMEM);
}
pbe->orig_address = page_address(page);
- pbe->address = safe_pages_list;
- safe_pages_list = safe_pages_list->next;
+ pbe->address = __get_safe_page(ca->gfp_mask);
+ if (!pbe->address)
+ return ERR_PTR(-ENOMEM);
pbe->next = restore_pblist;
restore_pblist = pbe;
return pbe->address;
}
/**
- * snapshot_write_next - used for writing the system memory snapshot.
+ * snapshot_write_next - Get the address to store the next image page.
+ * @handle: Snapshot handle structure to guide the writing.
*
- * On the first call to it @handle should point to a zeroed
- * snapshot_handle structure. The structure gets updated and a pointer
- * to it should be passed to this function every next time.
+ * On the first call, @handle should point to a zeroed snapshot_handle
+ * structure. The structure gets populated then and a pointer to it should be
+ * passed to this function every next time.
*
- * On success the function returns a positive number. Then, the caller
- * is allowed to write up to the returned number of bytes to the memory
- * location computed by the data_of() macro.
+ * On success, the function returns a positive number. Then, the caller
+ * is allowed to write up to the returned number of bytes to the memory
+ * location computed by the data_of() macro.
*
- * The function returns 0 to indicate the "end of file" condition,
- * and a negative number is returned on error. In such cases the
- * structure pointed to by @handle is not updated and should not be used
- * any more.
+ * The function returns 0 to indicate the "end of file" condition. Negative
+ * numbers are returned on errors, in which cases the structure pointed to by
+ * @handle is not updated and should not be used any more.
*/
-
int snapshot_write_next(struct snapshot_handle *handle)
{
static struct chain_allocator ca;
- int error = 0;
+ int error;
+next:
/* Check if we have already loaded the entire image */
- if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages)
+ if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages + nr_zero_pages)
return 0;
- handle->sync_read = 1;
-
if (!handle->cur) {
if (!buffer)
/* This makes the buffer be freed by swsusp_free() */
@@ -2491,100 +2790,113 @@ int snapshot_write_next(struct snapshot_handle *handle)
if (error)
return error;
+ safe_pages_list = NULL;
+
error = memory_bm_create(&copy_bm, GFP_ATOMIC, PG_ANY);
if (error)
return error;
- /* Allocate buffer for page keys. */
- error = page_key_alloc(nr_copy_pages);
+ error = memory_bm_create(&zero_bm, GFP_ATOMIC, PG_ANY);
if (error)
return error;
+ nr_zero_pages = 0;
+
+ hibernate_restore_protection_begin();
} else if (handle->cur <= nr_meta_pages + 1) {
- error = unpack_orig_pfns(buffer, &copy_bm);
+ error = unpack_orig_pfns(buffer, &copy_bm, &zero_bm);
if (error)
return error;
if (handle->cur == nr_meta_pages + 1) {
- error = prepare_image(&orig_bm, &copy_bm);
+ error = prepare_image(&orig_bm, &copy_bm, &zero_bm);
if (error)
return error;
chain_init(&ca, GFP_ATOMIC, PG_SAFE);
memory_bm_position_reset(&orig_bm);
+ memory_bm_position_reset(&zero_bm);
restore_pblist = NULL;
handle->buffer = get_buffer(&orig_bm, &ca);
- handle->sync_read = 0;
if (IS_ERR(handle->buffer))
return PTR_ERR(handle->buffer);
}
} else {
copy_last_highmem_page();
- /* Restore page key for data page (s390 only). */
- page_key_write(handle->buffer);
+ error = hibernate_restore_protect_page(handle->buffer);
+ if (error)
+ return error;
handle->buffer = get_buffer(&orig_bm, &ca);
if (IS_ERR(handle->buffer))
return PTR_ERR(handle->buffer);
- if (handle->buffer != buffer)
- handle->sync_read = 0;
}
+ handle->sync_read = (handle->buffer == buffer);
handle->cur++;
+
+ /* Zero pages were not included in the image, memset it and move on. */
+ if (handle->cur > nr_meta_pages + 1 &&
+ memory_bm_test_bit(&zero_bm, memory_bm_get_current(&orig_bm))) {
+ memset(handle->buffer, 0, PAGE_SIZE);
+ goto next;
+ }
+
return PAGE_SIZE;
}
/**
- * snapshot_write_finalize - must be called after the last call to
- * snapshot_write_next() in case the last page in the image happens
- * to be a highmem page and its contents should be stored in the
- * highmem. Additionally, it releases the memory that will not be
- * used any more.
+ * snapshot_write_finalize - Complete the loading of a hibernation image.
+ *
+ * Must be called after the last call to snapshot_write_next() in case the last
+ * page in the image happens to be a highmem page and its contents should be
+ * stored in highmem. Additionally, it recycles bitmap memory that's not
+ * necessary any more.
*/
-
-void snapshot_write_finalize(struct snapshot_handle *handle)
+int snapshot_write_finalize(struct snapshot_handle *handle)
{
+ int error;
+
copy_last_highmem_page();
- /* Restore page key for data page (s390 only). */
- page_key_write(handle->buffer);
- page_key_free();
- /* Free only if we have loaded the image entirely */
- if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) {
- memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR);
+ error = hibernate_restore_protect_page(handle->buffer);
+ /* Do that only if we have loaded the image entirely */
+ if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages + nr_zero_pages) {
+ memory_bm_recycle(&orig_bm);
free_highmem_data();
}
+ return error;
}
int snapshot_image_loaded(struct snapshot_handle *handle)
{
return !(!nr_copy_pages || !last_highmem_page_copied() ||
- handle->cur <= nr_meta_pages + nr_copy_pages);
+ handle->cur <= nr_meta_pages + nr_copy_pages + nr_zero_pages);
}
#ifdef CONFIG_HIGHMEM
/* Assumes that @buf is ready and points to a "safe" page */
-static inline void
-swap_two_pages_data(struct page *p1, struct page *p2, void *buf)
+static inline void swap_two_pages_data(struct page *p1, struct page *p2,
+ void *buf)
{
void *kaddr1, *kaddr2;
- kaddr1 = kmap_atomic(p1);
- kaddr2 = kmap_atomic(p2);
+ kaddr1 = kmap_local_page(p1);
+ kaddr2 = kmap_local_page(p2);
copy_page(buf, kaddr1);
copy_page(kaddr1, kaddr2);
copy_page(kaddr2, buf);
- kunmap_atomic(kaddr2);
- kunmap_atomic(kaddr1);
+ kunmap_local(kaddr2);
+ kunmap_local(kaddr1);
}
/**
- * restore_highmem - for each highmem page that was allocated before
- * the suspend and included in the suspend image, and also has been
- * allocated by the "resume" kernel swap its current (ie. "before
- * resume") contents with the previous (ie. "before suspend") one.
+ * restore_highmem - Put highmem image pages into their original locations.
*
- * If the resume eventually fails, we can call this function once
- * again and restore the "before resume" highmem state.
+ * For each highmem page that was in use before hibernation and is included in
+ * the image, and also has been allocated by the "restore" kernel, swap its
+ * current contents with the previous (ie. "before hibernation") ones.
+ *
+ * If the restore eventually fails, we can call this function once again and
+ * restore the highmem state as seen by the restore kernel.
*/
-
int restore_highmem(void)
{
struct highmem_pbe *pbe = highmem_pblist;
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 8d7a1ef72758..2da4482bb6eb 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -1,13 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kernel/power/suspend.c - Suspend to RAM and standby functionality.
*
* Copyright (c) 2003 Patrick Mochel
* Copyright (c) 2003 Open Source Development Lab
* Copyright (c) 2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
- *
- * This file is released under the GPLv2.
*/
+#define pr_fmt(fmt) "PM: " fmt
+
#include <linux/string.h>
#include <linux/delay.h>
#include <linux/errno.h>
@@ -15,7 +16,6 @@
#include <linux/console.h>
#include <linux/cpu.h>
#include <linux/cpuidle.h>
-#include <linux/syscalls.h>
#include <linux/gfp.h>
#include <linux/io.h>
#include <linux/kernel.h>
@@ -25,104 +25,193 @@
#include <linux/export.h>
#include <linux/suspend.h>
#include <linux/syscore_ops.h>
+#include <linux/swait.h>
#include <linux/ftrace.h>
#include <trace/events/power.h>
#include <linux/compiler.h>
#include <linux/moduleparam.h>
+#include <linux/fs.h>
#include "power.h"
-const char *pm_labels[] = { "mem", "standby", "freeze", NULL };
+const char * const pm_labels[] = {
+ [PM_SUSPEND_TO_IDLE] = "freeze",
+ [PM_SUSPEND_STANDBY] = "standby",
+ [PM_SUSPEND_MEM] = "mem",
+};
const char *pm_states[PM_SUSPEND_MAX];
+static const char * const mem_sleep_labels[] = {
+ [PM_SUSPEND_TO_IDLE] = "s2idle",
+ [PM_SUSPEND_STANDBY] = "shallow",
+ [PM_SUSPEND_MEM] = "deep",
+};
+const char *mem_sleep_states[PM_SUSPEND_MAX];
+
+suspend_state_t mem_sleep_current = PM_SUSPEND_TO_IDLE;
+suspend_state_t mem_sleep_default = PM_SUSPEND_MAX;
+suspend_state_t pm_suspend_target_state;
+EXPORT_SYMBOL_GPL(pm_suspend_target_state);
+
+unsigned int pm_suspend_global_flags;
+EXPORT_SYMBOL_GPL(pm_suspend_global_flags);
static const struct platform_suspend_ops *suspend_ops;
-static const struct platform_freeze_ops *freeze_ops;
-static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head);
+static const struct platform_s2idle_ops *s2idle_ops;
+static DECLARE_SWAIT_QUEUE_HEAD(s2idle_wait_head);
-enum freeze_state __read_mostly suspend_freeze_state;
-static DEFINE_SPINLOCK(suspend_freeze_lock);
+enum s2idle_states __read_mostly s2idle_state;
+static DEFINE_RAW_SPINLOCK(s2idle_lock);
+
+/**
+ * pm_suspend_default_s2idle - Check if suspend-to-idle is the default suspend.
+ *
+ * Return 'true' if suspend-to-idle has been selected as the default system
+ * suspend method.
+ */
+bool pm_suspend_default_s2idle(void)
+{
+ return mem_sleep_current == PM_SUSPEND_TO_IDLE;
+}
+EXPORT_SYMBOL_GPL(pm_suspend_default_s2idle);
-void freeze_set_ops(const struct platform_freeze_ops *ops)
+void s2idle_set_ops(const struct platform_s2idle_ops *ops)
{
- lock_system_sleep();
- freeze_ops = ops;
- unlock_system_sleep();
+ unsigned int sleep_flags;
+
+ sleep_flags = lock_system_sleep();
+ s2idle_ops = ops;
+ unlock_system_sleep(sleep_flags);
}
-static void freeze_begin(void)
+static void s2idle_begin(void)
{
- suspend_freeze_state = FREEZE_STATE_NONE;
+ s2idle_state = S2IDLE_STATE_NONE;
}
-static void freeze_enter(void)
+static void s2idle_enter(void)
{
- spin_lock_irq(&suspend_freeze_lock);
+ trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_TO_IDLE, true);
+
+ /*
+ * The correctness of the code below depends on the number of online
+ * CPUs being stable, but CPUs cannot be taken offline or put online
+ * while it is running.
+ *
+ * The s2idle_lock must be acquired before the pending wakeup check to
+ * prevent pm_system_wakeup() from running as a whole between that check
+ * and the subsequent s2idle_state update in which case a wakeup event
+ * would get lost.
+ */
+ raw_spin_lock_irq(&s2idle_lock);
if (pm_wakeup_pending())
goto out;
- suspend_freeze_state = FREEZE_STATE_ENTER;
- spin_unlock_irq(&suspend_freeze_lock);
-
- get_online_cpus();
- cpuidle_resume();
+ s2idle_state = S2IDLE_STATE_ENTER;
+ raw_spin_unlock_irq(&s2idle_lock);
/* Push all the CPUs into the idle loop. */
wake_up_all_idle_cpus();
- pr_debug("PM: suspend-to-idle\n");
/* Make the current CPU wait so it can enter the idle loop too. */
- wait_event(suspend_freeze_wait_head,
- suspend_freeze_state == FREEZE_STATE_WAKE);
- pr_debug("PM: resume from suspend-to-idle\n");
+ swait_event_exclusive(s2idle_wait_head,
+ s2idle_state == S2IDLE_STATE_WAKE);
- cpuidle_pause();
- put_online_cpus();
+ /*
+ * Kick all CPUs to ensure that they resume their timers and restore
+ * consistent system state.
+ */
+ wake_up_all_idle_cpus();
- spin_lock_irq(&suspend_freeze_lock);
+ raw_spin_lock_irq(&s2idle_lock);
out:
- suspend_freeze_state = FREEZE_STATE_NONE;
- spin_unlock_irq(&suspend_freeze_lock);
+ s2idle_state = S2IDLE_STATE_NONE;
+ raw_spin_unlock_irq(&s2idle_lock);
+
+ trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_TO_IDLE, false);
+}
+
+static void s2idle_loop(void)
+{
+ pm_pr_dbg("suspend-to-idle\n");
+
+ /*
+ * Suspend-to-idle equals:
+ * frozen processes + suspended devices + idle processors.
+ * Thus s2idle_enter() should be called right after all devices have
+ * been suspended.
+ *
+ * Wakeups during the noirq suspend of devices may be spurious, so try
+ * to avoid them upfront.
+ */
+ for (;;) {
+ if (s2idle_ops && s2idle_ops->wake) {
+ if (s2idle_ops->wake())
+ break;
+ } else if (pm_wakeup_pending()) {
+ break;
+ }
+
+ if (s2idle_ops && s2idle_ops->check)
+ s2idle_ops->check();
+
+ s2idle_enter();
+ }
+
+ pm_pr_dbg("resume from suspend-to-idle\n");
}
-void freeze_wake(void)
+void s2idle_wake(void)
{
unsigned long flags;
- spin_lock_irqsave(&suspend_freeze_lock, flags);
- if (suspend_freeze_state > FREEZE_STATE_NONE) {
- suspend_freeze_state = FREEZE_STATE_WAKE;
- wake_up(&suspend_freeze_wait_head);
+ raw_spin_lock_irqsave(&s2idle_lock, flags);
+ if (s2idle_state > S2IDLE_STATE_NONE) {
+ s2idle_state = S2IDLE_STATE_WAKE;
+ swake_up_one(&s2idle_wait_head);
}
- spin_unlock_irqrestore(&suspend_freeze_lock, flags);
+ raw_spin_unlock_irqrestore(&s2idle_lock, flags);
}
-EXPORT_SYMBOL_GPL(freeze_wake);
+EXPORT_SYMBOL_GPL(s2idle_wake);
static bool valid_state(suspend_state_t state)
{
/*
- * PM_SUSPEND_STANDBY and PM_SUSPEND_MEM states need low level
- * support and need to be valid to the low level
- * implementation, no valid callback implies that none are valid.
+ * The PM_SUSPEND_STANDBY and PM_SUSPEND_MEM states require low-level
+ * support and need to be valid to the low-level implementation.
+ *
+ * No ->valid() or ->enter() callback implies that none are valid.
*/
- return suspend_ops && suspend_ops->valid && suspend_ops->valid(state);
+ return suspend_ops && suspend_ops->valid && suspend_ops->valid(state) &&
+ suspend_ops->enter;
}
-/*
- * If this is set, the "mem" label always corresponds to the deepest sleep state
- * available, the "standby" label corresponds to the second deepest sleep state
- * available (if any), and the "freeze" label corresponds to the remaining
- * available sleep state (if there is one).
- */
-static bool relative_states;
+void __init pm_states_init(void)
+{
+ /* "mem" and "freeze" are always present in /sys/power/state. */
+ pm_states[PM_SUSPEND_MEM] = pm_labels[PM_SUSPEND_MEM];
+ pm_states[PM_SUSPEND_TO_IDLE] = pm_labels[PM_SUSPEND_TO_IDLE];
+ /*
+ * Suspend-to-idle should be supported even without any suspend_ops,
+ * initialize mem_sleep_states[] accordingly here.
+ */
+ mem_sleep_states[PM_SUSPEND_TO_IDLE] = mem_sleep_labels[PM_SUSPEND_TO_IDLE];
+}
-static int __init sleep_states_setup(char *str)
+static int __init mem_sleep_default_setup(char *str)
{
- relative_states = !strncmp(str, "1", 1);
- pm_states[PM_SUSPEND_FREEZE] = pm_labels[relative_states ? 0 : 2];
+ suspend_state_t state;
+
+ for (state = PM_SUSPEND_TO_IDLE; state <= PM_SUSPEND_MEM; state++)
+ if (mem_sleep_labels[state] &&
+ !strcmp(str, mem_sleep_labels[state])) {
+ mem_sleep_default = state;
+ mem_sleep_current = state;
+ break;
+ }
+
return 1;
}
-
-__setup("relative_sleep_states=", sleep_states_setup);
+__setup("mem_sleep_default=", mem_sleep_default_setup);
/**
* suspend_set_ops - Set the global suspend method table.
@@ -130,28 +219,31 @@ __setup("relative_sleep_states=", sleep_states_setup);
*/
void suspend_set_ops(const struct platform_suspend_ops *ops)
{
- suspend_state_t i;
- int j = 0;
+ unsigned int sleep_flags;
- lock_system_sleep();
+ sleep_flags = lock_system_sleep();
suspend_ops = ops;
- for (i = PM_SUSPEND_MEM; i >= PM_SUSPEND_STANDBY; i--)
- if (valid_state(i)) {
- pm_states[i] = pm_labels[j++];
- } else if (!relative_states) {
- pm_states[i] = NULL;
- j++;
- }
- pm_states[PM_SUSPEND_FREEZE] = pm_labels[j];
+ if (valid_state(PM_SUSPEND_STANDBY)) {
+ mem_sleep_states[PM_SUSPEND_STANDBY] = mem_sleep_labels[PM_SUSPEND_STANDBY];
+ pm_states[PM_SUSPEND_STANDBY] = pm_labels[PM_SUSPEND_STANDBY];
+ if (mem_sleep_default == PM_SUSPEND_STANDBY)
+ mem_sleep_current = PM_SUSPEND_STANDBY;
+ }
+ if (valid_state(PM_SUSPEND_MEM)) {
+ mem_sleep_states[PM_SUSPEND_MEM] = mem_sleep_labels[PM_SUSPEND_MEM];
+ if (mem_sleep_default >= PM_SUSPEND_MEM)
+ mem_sleep_current = PM_SUSPEND_MEM;
+ }
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
}
EXPORT_SYMBOL_GPL(suspend_set_ops);
/**
* suspend_valid_only_mem - Generic memory-only valid callback.
+ * @state: Target system sleep state.
*
* Platform drivers that implement mem suspend only and only need to check for
* that in their .valid() callback can use this instead of rolling their own
@@ -165,50 +257,58 @@ EXPORT_SYMBOL_GPL(suspend_valid_only_mem);
static bool sleep_state_supported(suspend_state_t state)
{
- return state == PM_SUSPEND_FREEZE || (suspend_ops && suspend_ops->enter);
+ return state == PM_SUSPEND_TO_IDLE ||
+ (valid_state(state) && !cxl_mem_active());
}
static int platform_suspend_prepare(suspend_state_t state)
{
- return state != PM_SUSPEND_FREEZE && suspend_ops->prepare ?
+ return state != PM_SUSPEND_TO_IDLE && suspend_ops->prepare ?
suspend_ops->prepare() : 0;
}
static int platform_suspend_prepare_late(suspend_state_t state)
{
- return state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->prepare ?
- freeze_ops->prepare() : 0;
+ return state == PM_SUSPEND_TO_IDLE && s2idle_ops && s2idle_ops->prepare ?
+ s2idle_ops->prepare() : 0;
}
static int platform_suspend_prepare_noirq(suspend_state_t state)
{
- return state != PM_SUSPEND_FREEZE && suspend_ops->prepare_late ?
- suspend_ops->prepare_late() : 0;
+ if (state == PM_SUSPEND_TO_IDLE)
+ return s2idle_ops && s2idle_ops->prepare_late ?
+ s2idle_ops->prepare_late() : 0;
+
+ return suspend_ops->prepare_late ? suspend_ops->prepare_late() : 0;
}
static void platform_resume_noirq(suspend_state_t state)
{
- if (state != PM_SUSPEND_FREEZE && suspend_ops->wake)
+ if (state == PM_SUSPEND_TO_IDLE) {
+ if (s2idle_ops && s2idle_ops->restore_early)
+ s2idle_ops->restore_early();
+ } else if (suspend_ops->wake) {
suspend_ops->wake();
+ }
}
static void platform_resume_early(suspend_state_t state)
{
- if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->restore)
- freeze_ops->restore();
+ if (state == PM_SUSPEND_TO_IDLE && s2idle_ops && s2idle_ops->restore)
+ s2idle_ops->restore();
}
static void platform_resume_finish(suspend_state_t state)
{
- if (state != PM_SUSPEND_FREEZE && suspend_ops->finish)
+ if (state != PM_SUSPEND_TO_IDLE && suspend_ops->finish)
suspend_ops->finish();
}
static int platform_suspend_begin(suspend_state_t state)
{
- if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin)
- return freeze_ops->begin();
- else if (suspend_ops->begin)
+ if (state == PM_SUSPEND_TO_IDLE && s2idle_ops && s2idle_ops->begin)
+ return s2idle_ops->begin();
+ else if (suspend_ops && suspend_ops->begin)
return suspend_ops->begin(state);
else
return 0;
@@ -216,21 +316,21 @@ static int platform_suspend_begin(suspend_state_t state)
static void platform_resume_end(suspend_state_t state)
{
- if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end)
- freeze_ops->end();
- else if (suspend_ops->end)
+ if (state == PM_SUSPEND_TO_IDLE && s2idle_ops && s2idle_ops->end)
+ s2idle_ops->end();
+ else if (suspend_ops && suspend_ops->end)
suspend_ops->end();
}
static void platform_recover(suspend_state_t state)
{
- if (state != PM_SUSPEND_FREEZE && suspend_ops->recover)
+ if (state != PM_SUSPEND_TO_IDLE && suspend_ops->recover)
suspend_ops->recover();
}
static bool platform_suspend_again(suspend_state_t state)
{
- return state != PM_SUSPEND_FREEZE && suspend_ops->suspend_again ?
+ return state != PM_SUSPEND_TO_IDLE && suspend_ops->suspend_again ?
suspend_ops->suspend_again() : false;
}
@@ -244,10 +344,14 @@ MODULE_PARM_DESC(pm_test_delay,
static int suspend_test(int level)
{
#ifdef CONFIG_PM_DEBUG
+ int i;
+
if (pm_test_level == level) {
- printk(KERN_INFO "suspend debug: Waiting for %d second(s).\n",
+ pr_info("suspend debug: Waiting for %d second(s).\n",
pm_test_delay);
- mdelay(pm_test_delay * 1000);
+ for (i = 0; i < pm_test_delay && !pm_wakeup_pending(); i++)
+ msleep(1000);
+
return 1;
}
#endif /* !CONFIG_PM_DEBUG */
@@ -256,6 +360,7 @@ static int suspend_test(int level)
/**
* suspend_prepare - Prepare for entering system sleep state.
+ * @state: Target system sleep state.
*
* Common code run for every system sleep state that can be entered (except for
* hibernation). Run suspend notifiers, allocate the "suspend" console and
@@ -270,20 +375,21 @@ static int suspend_prepare(suspend_state_t state)
pm_prepare_console();
- error = pm_notifier_call_chain(PM_SUSPEND_PREPARE);
+ error = pm_notifier_call_chain_robust(PM_SUSPEND_PREPARE, PM_POST_SUSPEND);
if (error)
- goto Finish;
+ goto Restore;
+ filesystems_freeze(filesystem_freeze_enabled);
trace_suspend_resume(TPS("freeze_processes"), 0, true);
error = suspend_freeze_processes();
trace_suspend_resume(TPS("freeze_processes"), 0, false);
if (!error)
return 0;
- suspend_stats.failed_freeze++;
dpm_save_failed_step(SUSPEND_FREEZE);
- Finish:
+ filesystems_thaw();
pm_notifier_call_chain(PM_POST_SUSPEND);
+ Restore:
pm_restore_console();
return error;
}
@@ -317,7 +423,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
error = dpm_suspend_late(PMSG_SUSPEND);
if (error) {
- printk(KERN_ERR "PM: late suspend of devices failed\n");
+ pr_err("late suspend of devices failed\n");
goto Platform_finish;
}
error = platform_suspend_prepare_late(state);
@@ -326,7 +432,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
error = dpm_suspend_noirq(PMSG_SUSPEND);
if (error) {
- printk(KERN_ERR "PM: noirq suspend of devices failed\n");
+ pr_err("noirq suspend of devices failed\n");
goto Platform_early_resume;
}
error = platform_suspend_prepare_noirq(state);
@@ -336,26 +442,20 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
if (suspend_test(TEST_PLATFORM))
goto Platform_wake;
- /*
- * PM_SUSPEND_FREEZE equals
- * frozen processes + suspended devices + idle processors.
- * Thus we should invoke freeze_enter() soon after
- * all the devices are suspended.
- */
- if (state == PM_SUSPEND_FREEZE) {
- trace_suspend_resume(TPS("machine_suspend"), state, true);
- freeze_enter();
- trace_suspend_resume(TPS("machine_suspend"), state, false);
+ if (state == PM_SUSPEND_TO_IDLE) {
+ s2idle_loop();
goto Platform_wake;
}
- error = disable_nonboot_cpus();
+ error = pm_sleep_disable_secondary_cpus();
if (error || suspend_test(TEST_CPUS))
goto Enable_cpus;
arch_suspend_disable_irqs();
BUG_ON(!irqs_disabled());
+ system_state = SYSTEM_SUSPEND;
+
error = syscore_suspend();
if (!error) {
*wakeup = pm_wakeup_pending();
@@ -365,16 +465,19 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
error = suspend_ops->enter(state);
trace_suspend_resume(TPS("machine_suspend"),
state, false);
- events_check_enabled = false;
+ } else if (*wakeup) {
+ error = -EBUSY;
}
syscore_resume();
}
+ system_state = SYSTEM_RUNNING;
+
arch_suspend_enable_irqs();
BUG_ON(irqs_disabled());
Enable_cpus:
- enable_nonboot_cpus();
+ pm_sleep_enable_secondary_cpus();
Platform_wake:
platform_resume_noirq(state);
@@ -403,15 +506,20 @@ int suspend_devices_and_enter(suspend_state_t state)
if (!sleep_state_supported(state))
return -ENOSYS;
+ pm_suspend_target_state = state;
+
+ if (state == PM_SUSPEND_TO_IDLE)
+ pm_set_suspend_no_platform();
+
error = platform_suspend_begin(state);
if (error)
goto Close;
- suspend_console();
+ console_suspend_all();
suspend_test_start();
error = dpm_suspend_start(PMSG_SUSPEND);
if (error) {
- pr_err("PM: Some devices failed to suspend, or early wake event detected\n");
+ pr_err("Some devices failed to suspend, or early wake event detected\n");
goto Recover_platform;
}
suspend_test_finish("suspend devices");
@@ -426,12 +534,13 @@ int suspend_devices_and_enter(suspend_state_t state)
suspend_test_start();
dpm_resume_end(PMSG_RESUME);
suspend_test_finish("resume devices");
- trace_suspend_resume(TPS("resume_console"), state, true);
- resume_console();
- trace_suspend_resume(TPS("resume_console"), state, false);
+ trace_suspend_resume(TPS("console_resume_all"), state, true);
+ console_resume_all();
+ trace_suspend_resume(TPS("console_resume_all"), state, false);
Close:
platform_resume_end(state);
+ pm_suspend_target_state = PM_SUSPEND_ON;
return error;
Recover_platform:
@@ -448,6 +557,7 @@ int suspend_devices_and_enter(suspend_state_t state)
static void suspend_finish(void)
{
suspend_thaw_processes();
+ filesystems_thaw();
pm_notifier_call_chain(PM_POST_SUSPEND);
pm_restore_console();
}
@@ -465,30 +575,34 @@ static int enter_state(suspend_state_t state)
int error;
trace_suspend_resume(TPS("suspend_enter"), state, true);
- if (state == PM_SUSPEND_FREEZE) {
+ if (state == PM_SUSPEND_TO_IDLE) {
#ifdef CONFIG_PM_DEBUG
if (pm_test_level != TEST_NONE && pm_test_level <= TEST_CPUS) {
- pr_warning("PM: Unsupported test mode for freeze state,"
- "please choose none/freezer/devices/platform.\n");
+ pr_warn("Unsupported test mode for suspend to idle, please choose none/freezer/devices/platform.\n");
return -EAGAIN;
}
#endif
} else if (!valid_state(state)) {
return -EINVAL;
}
- if (!mutex_trylock(&pm_mutex))
+ if (!mutex_trylock(&system_transition_mutex))
return -EBUSY;
- if (state == PM_SUSPEND_FREEZE)
- freeze_begin();
+ if (state == PM_SUSPEND_TO_IDLE)
+ s2idle_begin();
- trace_suspend_resume(TPS("sync_filesystems"), 0, true);
- printk(KERN_INFO "PM: Syncing filesystems ... ");
- sys_sync();
- printk("done.\n");
- trace_suspend_resume(TPS("sync_filesystems"), 0, false);
+ if (sync_on_suspend_enabled) {
+ trace_suspend_resume(TPS("sync_filesystems"), 0, true);
- pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
+ error = pm_sleep_fs_sync();
+ if (error)
+ goto Unlock;
+
+ trace_suspend_resume(TPS("sync_filesystems"), 0, false);
+ }
+
+ pm_pr_dbg("Preparing system for sleep (%s)\n", mem_sleep_labels[state]);
+ pm_suspend_clear_flags();
error = suspend_prepare(state);
if (error)
goto Unlock;
@@ -497,16 +611,15 @@ static int enter_state(suspend_state_t state)
goto Finish;
trace_suspend_resume(TPS("suspend_enter"), state, false);
- pr_debug("PM: Entering %s sleep\n", pm_states[state]);
- pm_restrict_gfp_mask();
+ pm_pr_dbg("Suspending system (%s)\n", mem_sleep_labels[state]);
error = suspend_devices_and_enter(state);
- pm_restore_gfp_mask();
Finish:
- pr_debug("PM: Finishing wakeup.\n");
+ events_check_enabled = false;
+ pm_pr_dbg("Finishing wakeup.\n");
suspend_finish();
Unlock:
- mutex_unlock(&pm_mutex);
+ mutex_unlock(&system_transition_mutex);
return error;
}
@@ -524,13 +637,10 @@ int pm_suspend(suspend_state_t state)
if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX)
return -EINVAL;
+ pr_info("suspend entry (%s)\n", mem_sleep_labels[state]);
error = enter_state(state);
- if (error) {
- suspend_stats.fail++;
- dpm_save_failed_errno(error);
- } else {
- suspend_stats.success++;
- }
+ dpm_save_errno(error);
+ pr_info("suspend exit\n");
return error;
}
EXPORT_SYMBOL(pm_suspend);
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index 084452e34a12..d4856ec61570 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -1,9 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kernel/power/suspend_test.c - Suspend to RAM and standby test facility.
*
* Copyright (c) 2009 Pavel Machek <pavel@ucw.cz>
- *
- * This file is released under the GPLv2.
*/
#include <linux/init.h>
@@ -71,7 +70,7 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
static char info_test[] __initdata =
KERN_INFO "PM: test RTC wakeup from '%s' suspend\n";
- unsigned long now;
+ time64_t now;
struct rtc_wkalrm alm;
int status;
@@ -82,10 +81,10 @@ repeat:
printk(err_readtime, dev_name(&rtc->dev), status);
return;
}
- rtc_tm_to_time(&alm.time, &now);
+ now = rtc_tm_to_time64(&alm.time);
memset(&alm, 0, sizeof alm);
- rtc_time_to_tm(now + TEST_SUSPEND_SECONDS, &alm.time);
+ rtc_time64_to_tm(now + TEST_SUSPEND_SECONDS, &alm.time);
alm.enabled = true;
status = rtc_set_alarm(rtc, &alm);
@@ -104,9 +103,9 @@ repeat:
printk(info_test, pm_states[state]);
status = pm_suspend(state);
if (status < 0)
- state = PM_SUSPEND_FREEZE;
+ state = PM_SUSPEND_TO_IDLE;
}
- if (state == PM_SUSPEND_FREEZE) {
+ if (state == PM_SUSPEND_TO_IDLE) {
printk(info_test, pm_states[state]);
status = pm_suspend(state);
}
@@ -130,7 +129,7 @@ static int __init has_wakealarm(struct device *dev, const void *data)
{
struct rtc_device *candidate = to_rtc_device(dev);
- if (!candidate->ops->set_alarm)
+ if (!test_bit(RTC_FEATURE_ALARM, candidate->features))
return 0;
if (!device_may_wakeup(candidate->dev.parent))
return 0;
@@ -158,22 +157,22 @@ static int __init setup_test_suspend(char *value)
value++;
suspend_type = strsep(&value, ",");
if (!suspend_type)
- return 0;
+ return 1;
repeat = strsep(&value, ",");
if (repeat) {
if (kstrtou32(repeat, 0, &test_repeat_count_max))
- return 0;
+ return 1;
}
- for (i = 0; pm_labels[i]; i++)
+ for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
if (!strcmp(pm_labels[i], suspend_type)) {
test_state_label = pm_labels[i];
- return 0;
+ return 1;
}
printk(warn_bad_state, suspend_type);
- return 0;
+ return 1;
}
__setup("test_suspend", setup_test_suspend);
@@ -202,9 +201,11 @@ static int __init test_suspend(void)
}
/* RTCs have initialized by now too ... can we use one? */
- dev = class_find_device(rtc_class, NULL, NULL, has_wakealarm);
- if (dev)
+ dev = class_find_device(&rtc_class, NULL, NULL, has_wakealarm);
+ if (dev) {
rtc = rtc_class_open(dev_name(dev));
+ put_device(dev);
+ }
if (!rtc) {
printk(warn_no_rtc);
return 0;
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 570aff817543..33a186373bef 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/kernel/power/swap.c
*
@@ -7,16 +8,15 @@
* Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz>
* Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
* Copyright (C) 2010-2012 Bojan Smojver <bojan@rexursive.com>
- *
- * This file is released under the GPLv2.
- *
*/
+#define pr_fmt(fmt) "PM: " fmt
+
+#include <crypto/acompress.h>
#include <linux/module.h>
#include <linux/file.h>
#include <linux/delay.h>
#include <linux/bitops.h>
-#include <linux/genhd.h>
#include <linux/device.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
@@ -24,7 +24,6 @@
#include <linux/swapops.h>
#include <linux/pm.h>
#include <linux/slab.h>
-#include <linux/lzo.h>
#include <linux/vmalloc.h>
#include <linux/cpumask.h>
#include <linux/atomic.h>
@@ -36,20 +35,29 @@
#define HIBERNATE_SIG "S1SUSPEND"
+u32 swsusp_hardware_signature;
+
+/*
+ * When reading an {un,}compressed image, we may restore pages in place,
+ * in which case some architectures need these pages cleaning before they
+ * can be executed. We don't know which pages these may be, so clean the lot.
+ */
+static bool clean_pages_on_read;
+static bool clean_pages_on_decompress;
+
/*
- * The swap map is a data structure used for keeping track of each page
- * written to a swap partition. It consists of many swap_map_page
- * structures that contain each an array of MAP_PAGE_ENTRIES swap entries.
- * These structures are stored on the swap and linked together with the
- * help of the .next_swap member.
+ * The swap map is a data structure used for keeping track of each page
+ * written to a swap partition. It consists of many swap_map_page structures
+ * that contain each an array of MAP_PAGE_ENTRIES swap entries. These
+ * structures are stored on the swap and linked together with the help of the
+ * .next_swap member.
*
- * The swap map is created during suspend. The swap map pages are
- * allocated and populated one at a time, so we only need one memory
- * page to set up the entire structure.
+ * The swap map is created during suspend. The swap map pages are allocated and
+ * populated one at a time, so we only need one memory page to set up the entire
+ * structure.
*
- * During resume we pick up all swap_map_page structures into a list.
+ * During resume we pick up all swap_map_page structures into a list.
*/
-
#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1)
/*
@@ -79,11 +87,9 @@ struct swap_map_page_list {
struct swap_map_page_list *next;
};
-/**
- * The swap_map_handle structure is used for handling swap in
- * a file-alike way
+/*
+ * The swap_map_handle structure is used for handling swap in a file-alike way.
*/
-
struct swap_map_handle {
struct swap_map_page *cur;
struct swap_map_page_list *maps;
@@ -96,7 +102,8 @@ struct swap_map_handle {
struct swsusp_header {
char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int) -
- sizeof(u32)];
+ sizeof(u32) - sizeof(u32)];
+ u32 hw_sig;
u32 crc32;
sector_t image;
unsigned int flags; /* Flags to pass to the "boot" kernel */
@@ -106,11 +113,10 @@ struct swsusp_header {
static struct swsusp_header *swsusp_header;
-/**
- * The following functions are used for tracing the allocated
- * swap pages, so that they can be freed in case of an error.
+/*
+ * The following functions are used for tracing the allocated swap pages, so
+ * that they can be freed in case of an error.
*/
-
struct swsusp_extent {
struct rb_node node;
unsigned long start;
@@ -160,15 +166,14 @@ static int swsusp_extents_insert(unsigned long swap_offset)
return 0;
}
-/**
- * alloc_swapdev_block - allocate a swap page and register that it has
- * been allocated, so that it can be freed in case of an error.
- */
-
sector_t alloc_swapdev_block(int swap)
{
unsigned long offset;
+ /*
+ * Allocate a swap page and register that it has been allocated, so that
+ * it can be freed in case of an error.
+ */
offset = swp_offset(get_swap_page_of_type(swap));
if (offset) {
if (swsusp_extents_insert(offset))
@@ -179,24 +184,21 @@ sector_t alloc_swapdev_block(int swap)
return 0;
}
-/**
- * free_all_swap_pages - free swap pages allocated for saving image data.
- * It also frees the extents used to register which swap entries had been
- * allocated.
- */
-
void free_all_swap_pages(int swap)
{
struct rb_node *node;
+ /*
+ * Free swap pages allocated for saving image data. It also frees the
+ * extents used to register which swap entries had been allocated.
+ */
while ((node = swsusp_extents.rb_node)) {
struct swsusp_extent *ext;
- unsigned long offset;
- ext = container_of(node, struct swsusp_extent, node);
+ ext = rb_entry(node, struct swsusp_extent, node);
rb_erase(node, &swsusp_extents);
- for (offset = ext->start; offset <= ext->end; offset++)
- swap_free(swp_entry(swap, offset));
+ swap_free_nr(swp_entry(swap, ext->start),
+ ext->end - ext->start + 1);
kfree(ext);
}
@@ -212,7 +214,84 @@ int swsusp_swap_in_use(void)
*/
static unsigned short root_swap = 0xffff;
-struct block_device *hib_resume_bdev;
+static struct file *hib_resume_bdev_file;
+
+struct hib_bio_batch {
+ atomic_t count;
+ wait_queue_head_t wait;
+ blk_status_t error;
+ struct blk_plug plug;
+};
+
+static void hib_init_batch(struct hib_bio_batch *hb)
+{
+ atomic_set(&hb->count, 0);
+ init_waitqueue_head(&hb->wait);
+ hb->error = BLK_STS_OK;
+ blk_start_plug(&hb->plug);
+}
+
+static void hib_finish_batch(struct hib_bio_batch *hb)
+{
+ blk_finish_plug(&hb->plug);
+}
+
+static void hib_end_io(struct bio *bio)
+{
+ struct hib_bio_batch *hb = bio->bi_private;
+ struct page *page = bio_first_page_all(bio);
+
+ if (bio->bi_status) {
+ pr_alert("Read-error on swap-device (%u:%u:%Lu)\n",
+ MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
+ (unsigned long long)bio->bi_iter.bi_sector);
+ }
+
+ if (bio_data_dir(bio) == WRITE)
+ put_page(page);
+ else if (clean_pages_on_read)
+ flush_icache_range((unsigned long)page_address(page),
+ (unsigned long)page_address(page) + PAGE_SIZE);
+
+ if (bio->bi_status && !hb->error)
+ hb->error = bio->bi_status;
+ if (atomic_dec_and_test(&hb->count))
+ wake_up(&hb->wait);
+
+ bio_put(bio);
+}
+
+static int hib_submit_io_sync(blk_opf_t opf, pgoff_t page_off, void *addr)
+{
+ return bdev_rw_virt(file_bdev(hib_resume_bdev_file),
+ page_off * (PAGE_SIZE >> 9), addr, PAGE_SIZE, opf);
+}
+
+static int hib_submit_io_async(blk_opf_t opf, pgoff_t page_off, void *addr,
+ struct hib_bio_batch *hb)
+{
+ struct bio *bio;
+
+ bio = bio_alloc(file_bdev(hib_resume_bdev_file), 1, opf,
+ GFP_NOIO | __GFP_HIGH);
+ bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9);
+ bio_add_virt_nofail(bio, addr, PAGE_SIZE);
+ bio->bi_end_io = hib_end_io;
+ bio->bi_private = hb;
+ atomic_inc(&hb->count);
+ submit_bio(bio);
+ return 0;
+}
+
+static int hib_wait_io(struct hib_bio_batch *hb)
+{
+ /*
+ * We are relying on the behavior of blk_plug that a thread with
+ * a plug will flush the plug list before sleeping.
+ */
+ wait_event(hb->wait, atomic_read(&hb->count) == 0);
+ return blk_status_to_errno(hb->error);
+}
/*
* Saving part
@@ -222,90 +301,85 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
{
int error;
- hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL);
+ hib_submit_io_sync(REQ_OP_READ, swsusp_resume_block, swsusp_header);
if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) ||
!memcmp("SWAPSPACE2",swsusp_header->sig, 10)) {
memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10);
memcpy(swsusp_header->sig, HIBERNATE_SIG, 10);
swsusp_header->image = handle->first_sector;
+ if (swsusp_hardware_signature) {
+ swsusp_header->hw_sig = swsusp_hardware_signature;
+ flags |= SF_HW_SIG;
+ }
swsusp_header->flags = flags;
if (flags & SF_CRC32_MODE)
swsusp_header->crc32 = handle->crc32;
- error = hib_bio_write_page(swsusp_resume_block,
- swsusp_header, NULL);
+ error = hib_submit_io_sync(REQ_OP_WRITE | REQ_SYNC,
+ swsusp_resume_block, swsusp_header);
} else {
- printk(KERN_ERR "PM: Swap header not found!\n");
+ pr_err("Swap header not found!\n");
error = -ENODEV;
}
return error;
}
-/**
- * swsusp_swap_check - check if the resume device is a swap device
- * and get its index (if so)
- *
- * This is called before saving image
+/*
+ * Hold the swsusp_header flag. This is used in software_resume() in
+ * 'kernel/power/hibernate' to check if the image is compressed and query
+ * for the compression algorithm support(if so).
*/
+unsigned int swsusp_header_flags;
+
static int swsusp_swap_check(void)
{
int res;
- res = swap_type_of(swsusp_resume_device, swsusp_resume_block,
- &hib_resume_bdev);
+ /*
+ * Check if the resume device is a swap device and get its index (if so).
+ * This is called before saving the image.
+ */
+ if (swsusp_resume_device)
+ res = swap_type_of(swsusp_resume_device, swsusp_resume_block);
+ else
+ res = find_first_swap(&swsusp_resume_device);
if (res < 0)
return res;
-
root_swap = res;
- res = blkdev_get(hib_resume_bdev, FMODE_WRITE, NULL);
- if (res)
- return res;
- res = set_blocksize(hib_resume_bdev, PAGE_SIZE);
- if (res < 0)
- blkdev_put(hib_resume_bdev, FMODE_WRITE);
+ hib_resume_bdev_file = bdev_file_open_by_dev(swsusp_resume_device,
+ BLK_OPEN_WRITE, NULL, NULL);
+ if (IS_ERR(hib_resume_bdev_file))
+ return PTR_ERR(hib_resume_bdev_file);
- return res;
+ return 0;
}
-/**
- * write_page - Write one page to given swap location.
- * @buf: Address we're writing.
- * @offset: Offset of the swap page we're writing to.
- * @bio_chain: Link the next write BIO here
- */
-
-static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
+static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb)
{
+ gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;
void *src;
int ret;
if (!offset)
return -ENOSPC;
- if (bio_chain) {
- src = (void *)__get_free_page(__GFP_WAIT | __GFP_NOWARN |
- __GFP_NORETRY);
- if (src) {
- copy_page(src, buf);
- } else {
- ret = hib_wait_on_bio_chain(bio_chain); /* Free pages */
- if (ret)
- return ret;
- src = (void *)__get_free_page(__GFP_WAIT |
- __GFP_NOWARN |
- __GFP_NORETRY);
- if (src) {
- copy_page(src, buf);
- } else {
- WARN_ON_ONCE(1);
- bio_chain = NULL; /* Go synchronous */
- src = buf;
- }
- }
- } else {
- src = buf;
+ if (!hb)
+ goto sync_io;
+
+ src = (void *)__get_free_page(gfp);
+ if (!src) {
+ ret = hib_wait_io(hb); /* Free pages */
+ if (ret)
+ return ret;
+ src = (void *)__get_free_page(gfp);
+ if (WARN_ON_ONCE(!src))
+ goto sync_io;
}
- return hib_bio_write_page(offset, src, bio_chain);
+
+ copy_page(src, buf);
+ return hib_submit_io_async(REQ_OP_WRITE | REQ_SYNC, offset, src, hb);
+sync_io:
+ return hib_submit_io_sync(REQ_OP_WRITE | REQ_SYNC, offset, buf);
}
static void release_swap_writer(struct swap_map_handle *handle)
@@ -322,8 +396,7 @@ static int get_swap_writer(struct swap_map_handle *handle)
ret = swsusp_swap_check();
if (ret) {
if (ret != -ENOSPC)
- printk(KERN_ERR "PM: Cannot find swap device, try "
- "swapon -a.\n");
+ pr_err("Cannot find swap device, try swapon -a\n");
return ret;
}
handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL);
@@ -343,20 +416,20 @@ static int get_swap_writer(struct swap_map_handle *handle)
err_rel:
release_swap_writer(handle);
err_close:
- swsusp_close(FMODE_WRITE);
+ swsusp_close();
return ret;
}
static int swap_write_page(struct swap_map_handle *handle, void *buf,
- struct bio **bio_chain)
+ struct hib_bio_batch *hb)
{
- int error = 0;
+ int error;
sector_t offset;
if (!handle->cur)
return -EINVAL;
offset = alloc_swapdev_block(root_swap);
- error = write_page(buf, offset, bio_chain);
+ error = write_page(buf, offset, hb);
if (error)
return error;
handle->cur->entries[handle->k++] = offset;
@@ -365,15 +438,15 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
if (!offset)
return -ENOSPC;
handle->cur->next_swap = offset;
- error = write_page(handle->cur, handle->cur_swap, bio_chain);
+ error = write_page(handle->cur, handle->cur_swap, hb);
if (error)
goto out;
clear_page(handle->cur);
handle->cur_swap = offset;
handle->k = 0;
- if (bio_chain && low_free_pages() <= handle->reqd_free_pages) {
- error = hib_wait_on_bio_chain(bio_chain);
+ if (hb && low_free_pages() <= handle->reqd_free_pages) {
+ error = hib_wait_io(hb);
if (error)
goto out;
/*
@@ -399,43 +472,45 @@ static int swap_writer_finish(struct swap_map_handle *handle,
unsigned int flags, int error)
{
if (!error) {
- flush_swap_writer(handle);
- printk(KERN_INFO "PM: S");
+ pr_info("S");
error = mark_swapfiles(handle, flags);
- printk("|\n");
+ pr_cont("|\n");
+ flush_swap_writer(handle);
}
if (error)
free_all_swap_pages(root_swap);
release_swap_writer(handle);
- swsusp_close(FMODE_WRITE);
+ swsusp_close();
return error;
}
+/*
+ * Bytes we need for compressed data in worst case. We assume(limitation)
+ * this is the worst of all the compression algorithms.
+ */
+#define bytes_worst_compress(x) ((x) + ((x) / 16) + 64 + 3 + 2)
+
/* We need to remember how much compressed data we need to read. */
-#define LZO_HEADER sizeof(size_t)
+#define CMP_HEADER sizeof(size_t)
/* Number of pages/bytes we'll compress at one time. */
-#define LZO_UNC_PAGES 32
-#define LZO_UNC_SIZE (LZO_UNC_PAGES * PAGE_SIZE)
+#define UNC_PAGES 32
+#define UNC_SIZE (UNC_PAGES * PAGE_SIZE)
-/* Number of pages/bytes we need for compressed data (worst case). */
-#define LZO_CMP_PAGES DIV_ROUND_UP(lzo1x_worst_compress(LZO_UNC_SIZE) + \
- LZO_HEADER, PAGE_SIZE)
-#define LZO_CMP_SIZE (LZO_CMP_PAGES * PAGE_SIZE)
+/* Number of pages we need for compressed data (worst case). */
+#define CMP_PAGES DIV_ROUND_UP(bytes_worst_compress(UNC_SIZE) + \
+ CMP_HEADER, PAGE_SIZE)
+#define CMP_SIZE (CMP_PAGES * PAGE_SIZE)
-/* Maximum number of threads for compression/decompression. */
-#define LZO_THREADS 3
+/* Default number of threads for compression/decompression. */
+#define CMP_THREADS 3
+static unsigned int hibernate_compression_threads = CMP_THREADS;
/* Minimum/maximum number of pages for read buffering. */
-#define LZO_MIN_RD_PAGES 1024
-#define LZO_MAX_RD_PAGES 8192
-
-
-/**
- * save_image - save the suspend image data
- */
+#define CMP_MIN_RD_PAGES 1024
+#define CMP_MAX_RD_PAGES 8192
static int save_image(struct swap_map_handle *handle,
struct snapshot_handle *snapshot,
@@ -445,41 +520,43 @@ static int save_image(struct swap_map_handle *handle,
int ret;
int nr_pages;
int err2;
- struct bio *bio;
+ struct hib_bio_batch hb;
ktime_t start;
ktime_t stop;
- printk(KERN_INFO "PM: Saving image data pages (%u pages)...\n",
+ hib_init_batch(&hb);
+
+ pr_info("Saving image data pages (%u pages)...\n",
nr_to_write);
m = nr_to_write / 10;
if (!m)
m = 1;
nr_pages = 0;
- bio = NULL;
start = ktime_get();
while (1) {
ret = snapshot_read_next(snapshot);
if (ret <= 0)
break;
- ret = swap_write_page(handle, data_of(*snapshot), &bio);
+ ret = swap_write_page(handle, data_of(*snapshot), &hb);
if (ret)
break;
if (!(nr_pages % m))
- printk(KERN_INFO "PM: Image saving progress: %3d%%\n",
- nr_pages / m * 10);
+ pr_info("Image saving progress: %3d%%\n",
+ nr_pages / m * 10);
nr_pages++;
}
- err2 = hib_wait_on_bio_chain(&bio);
+ err2 = hib_wait_io(&hb);
+ hib_finish_batch(&hb);
stop = ktime_get();
if (!ret)
ret = err2;
if (!ret)
- printk(KERN_INFO "PM: Image saving done.\n");
+ pr_info("Image saving done\n");
swsusp_show_speed(start, stop, nr_to_write, "Wrote");
return ret;
}
-/**
+/*
* Structure used for CRC32.
*/
struct crc_data {
@@ -490,24 +567,59 @@ struct crc_data {
wait_queue_head_t go; /* start crc update */
wait_queue_head_t done; /* crc update done */
u32 *crc32; /* points to handle's crc32 */
- size_t *unc_len[LZO_THREADS]; /* uncompressed lengths */
- unsigned char *unc[LZO_THREADS]; /* uncompressed data */
+ size_t **unc_len; /* uncompressed lengths */
+ unsigned char **unc; /* uncompressed data */
};
-/**
- * CRC32 update function that runs in its own thread.
- */
+static struct crc_data *alloc_crc_data(int nr_threads)
+{
+ struct crc_data *crc;
+
+ crc = kzalloc(sizeof(*crc), GFP_KERNEL);
+ if (!crc)
+ return NULL;
+
+ crc->unc = kcalloc(nr_threads, sizeof(*crc->unc), GFP_KERNEL);
+ if (!crc->unc)
+ goto err_free_crc;
+
+ crc->unc_len = kcalloc(nr_threads, sizeof(*crc->unc_len), GFP_KERNEL);
+ if (!crc->unc_len)
+ goto err_free_unc;
+
+ return crc;
+
+err_free_unc:
+ kfree(crc->unc);
+err_free_crc:
+ kfree(crc);
+ return NULL;
+}
+
+static void free_crc_data(struct crc_data *crc)
+{
+ if (!crc)
+ return;
+
+ if (crc->thr)
+ kthread_stop(crc->thr);
+
+ kfree(crc->unc_len);
+ kfree(crc->unc);
+ kfree(crc);
+}
+
static int crc32_threadfn(void *data)
{
struct crc_data *d = data;
unsigned i;
while (1) {
- wait_event(d->go, atomic_read(&d->ready) ||
+ wait_event(d->go, atomic_read_acquire(&d->ready) ||
kthread_should_stop());
if (kthread_should_stop()) {
d->thr = NULL;
- atomic_set(&d->stop, 1);
+ atomic_set_release(&d->stop, 1);
wake_up(&d->done);
break;
}
@@ -516,16 +628,19 @@ static int crc32_threadfn(void *data)
for (i = 0; i < d->run_threads; i++)
*d->crc32 = crc32_le(*d->crc32,
d->unc[i], *d->unc_len[i]);
- atomic_set(&d->stop, 1);
+ atomic_set_release(&d->stop, 1);
wake_up(&d->done);
}
return 0;
}
-/**
- * Structure used for LZO data compression.
+
+/*
+ * Structure used for data compression.
*/
struct cmp_data {
struct task_struct *thr; /* thread */
+ struct crypto_acomp *cc; /* crypto compressor */
+ struct acomp_req *cr; /* crypto request */
atomic_t ready; /* ready to start flag */
atomic_t stop; /* ready to stop flag */
int ret; /* return code */
@@ -533,92 +648,92 @@ struct cmp_data {
wait_queue_head_t done; /* compression done */
size_t unc_len; /* uncompressed length */
size_t cmp_len; /* compressed length */
- unsigned char unc[LZO_UNC_SIZE]; /* uncompressed buffer */
- unsigned char cmp[LZO_CMP_SIZE]; /* compressed buffer */
- unsigned char wrk[LZO1X_1_MEM_COMPRESS]; /* compression workspace */
+ unsigned char unc[UNC_SIZE]; /* uncompressed buffer */
+ unsigned char cmp[CMP_SIZE]; /* compressed buffer */
};
-/**
- * Compression function that runs in its own thread.
- */
-static int lzo_compress_threadfn(void *data)
+/* Indicates the image size after compression */
+static atomic64_t compressed_size = ATOMIC_INIT(0);
+
+static int compress_threadfn(void *data)
{
struct cmp_data *d = data;
while (1) {
- wait_event(d->go, atomic_read(&d->ready) ||
+ wait_event(d->go, atomic_read_acquire(&d->ready) ||
kthread_should_stop());
if (kthread_should_stop()) {
d->thr = NULL;
d->ret = -1;
- atomic_set(&d->stop, 1);
+ atomic_set_release(&d->stop, 1);
wake_up(&d->done);
break;
}
atomic_set(&d->ready, 0);
- d->ret = lzo1x_1_compress(d->unc, d->unc_len,
- d->cmp + LZO_HEADER, &d->cmp_len,
- d->wrk);
- atomic_set(&d->stop, 1);
+ acomp_request_set_callback(d->cr, CRYPTO_TFM_REQ_MAY_SLEEP,
+ NULL, NULL);
+ acomp_request_set_src_nondma(d->cr, d->unc, d->unc_len);
+ acomp_request_set_dst_nondma(d->cr, d->cmp + CMP_HEADER,
+ CMP_SIZE - CMP_HEADER);
+ d->ret = crypto_acomp_compress(d->cr);
+ d->cmp_len = d->cr->dlen;
+
+ atomic64_add(d->cmp_len, &compressed_size);
+ atomic_set_release(&d->stop, 1);
wake_up(&d->done);
}
return 0;
}
-/**
- * save_image_lzo - Save the suspend image data compressed with LZO.
- * @handle: Swap map handle to use for saving the image.
- * @snapshot: Image to read data from.
- * @nr_to_write: Number of pages to save.
- */
-static int save_image_lzo(struct swap_map_handle *handle,
- struct snapshot_handle *snapshot,
- unsigned int nr_to_write)
+static int save_compressed_image(struct swap_map_handle *handle,
+ struct snapshot_handle *snapshot,
+ unsigned int nr_to_write)
{
unsigned int m;
int ret = 0;
int nr_pages;
int err2;
- struct bio *bio;
+ struct hib_bio_batch hb;
ktime_t start;
ktime_t stop;
size_t off;
- unsigned thr, run_threads, nr_threads;
+ unsigned int thr, run_threads, nr_threads;
unsigned char *page = NULL;
struct cmp_data *data = NULL;
struct crc_data *crc = NULL;
+ hib_init_batch(&hb);
+
+ atomic64_set(&compressed_size, 0);
+
/*
* We'll limit the number of threads for compression to limit memory
* footprint.
*/
nr_threads = num_online_cpus() - 1;
- nr_threads = clamp_val(nr_threads, 1, LZO_THREADS);
+ nr_threads = clamp_val(nr_threads, 1, hibernate_compression_threads);
- page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
+ page = (void *)__get_free_page(GFP_NOIO | __GFP_HIGH);
if (!page) {
- printk(KERN_ERR "PM: Failed to allocate LZO page\n");
+ pr_err("Failed to allocate %s page\n", hib_comp_algo);
ret = -ENOMEM;
goto out_clean;
}
- data = vmalloc(sizeof(*data) * nr_threads);
+ data = vcalloc(nr_threads, sizeof(*data));
if (!data) {
- printk(KERN_ERR "PM: Failed to allocate LZO data\n");
+ pr_err("Failed to allocate %s data\n", hib_comp_algo);
ret = -ENOMEM;
goto out_clean;
}
- for (thr = 0; thr < nr_threads; thr++)
- memset(&data[thr], 0, offsetof(struct cmp_data, go));
- crc = kmalloc(sizeof(*crc), GFP_KERNEL);
+ crc = alloc_crc_data(nr_threads);
if (!crc) {
- printk(KERN_ERR "PM: Failed to allocate crc\n");
+ pr_err("Failed to allocate crc\n");
ret = -ENOMEM;
goto out_clean;
}
- memset(crc, 0, offsetof(struct crc_data, go));
/*
* Start the compression threads.
@@ -627,13 +742,26 @@ static int save_image_lzo(struct swap_map_handle *handle,
init_waitqueue_head(&data[thr].go);
init_waitqueue_head(&data[thr].done);
- data[thr].thr = kthread_run(lzo_compress_threadfn,
+ data[thr].cc = crypto_alloc_acomp(hib_comp_algo, 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR_OR_NULL(data[thr].cc)) {
+ pr_err("Could not allocate comp stream %ld\n", PTR_ERR(data[thr].cc));
+ ret = -EFAULT;
+ goto out_clean;
+ }
+
+ data[thr].cr = acomp_request_alloc(data[thr].cc);
+ if (!data[thr].cr) {
+ pr_err("Could not allocate comp request\n");
+ ret = -ENOMEM;
+ goto out_clean;
+ }
+
+ data[thr].thr = kthread_run(compress_threadfn,
&data[thr],
"image_compress/%u", thr);
if (IS_ERR(data[thr].thr)) {
data[thr].thr = NULL;
- printk(KERN_ERR
- "PM: Cannot start compression threads\n");
+ pr_err("Cannot start compression threads\n");
ret = -ENOMEM;
goto out_clean;
}
@@ -655,7 +783,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32");
if (IS_ERR(crc->thr)) {
crc->thr = NULL;
- printk(KERN_ERR "PM: Cannot start CRC32 thread\n");
+ pr_err("Cannot start CRC32 thread\n");
ret = -ENOMEM;
goto out_clean;
}
@@ -666,19 +794,17 @@ static int save_image_lzo(struct swap_map_handle *handle,
*/
handle->reqd_free_pages = reqd_free_pages();
- printk(KERN_INFO
- "PM: Using %u thread(s) for compression.\n"
- "PM: Compressing and saving image data (%u pages)...\n",
- nr_threads, nr_to_write);
+ pr_info("Using %u thread(s) for %s compression\n", nr_threads, hib_comp_algo);
+ pr_info("Compressing and saving image data (%u pages)...\n",
+ nr_to_write);
m = nr_to_write / 10;
if (!m)
m = 1;
nr_pages = 0;
- bio = NULL;
start = ktime_get();
for (;;) {
for (thr = 0; thr < nr_threads; thr++) {
- for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) {
+ for (off = 0; off < UNC_SIZE; off += PAGE_SIZE) {
ret = snapshot_read_next(snapshot);
if (ret < 0)
goto out_finish;
@@ -690,10 +816,8 @@ static int save_image_lzo(struct swap_map_handle *handle,
data_of(*snapshot), PAGE_SIZE);
if (!(nr_pages % m))
- printk(KERN_INFO
- "PM: Image saving progress: "
- "%3d%%\n",
- nr_pages / m * 10);
+ pr_info("Image saving progress: %3d%%\n",
+ nr_pages / m * 10);
nr_pages++;
}
if (!off)
@@ -701,7 +825,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
data[thr].unc_len = off;
- atomic_set(&data[thr].ready, 1);
+ atomic_set_release(&data[thr].ready, 1);
wake_up(&data[thr].go);
}
@@ -709,26 +833,25 @@ static int save_image_lzo(struct swap_map_handle *handle,
break;
crc->run_threads = thr;
- atomic_set(&crc->ready, 1);
+ atomic_set_release(&crc->ready, 1);
wake_up(&crc->go);
for (run_threads = thr, thr = 0; thr < run_threads; thr++) {
wait_event(data[thr].done,
- atomic_read(&data[thr].stop));
+ atomic_read_acquire(&data[thr].stop));
atomic_set(&data[thr].stop, 0);
ret = data[thr].ret;
if (ret < 0) {
- printk(KERN_ERR "PM: LZO compression failed\n");
+ pr_err("%s compression failed\n", hib_comp_algo);
goto out_finish;
}
if (unlikely(!data[thr].cmp_len ||
data[thr].cmp_len >
- lzo1x_worst_compress(data[thr].unc_len))) {
- printk(KERN_ERR
- "PM: Invalid LZO compressed length\n");
+ bytes_worst_compress(data[thr].unc_len))) {
+ pr_err("Invalid %s compressed length\n", hib_comp_algo);
ret = -1;
goto out_finish;
}
@@ -744,73 +867,74 @@ static int save_image_lzo(struct swap_map_handle *handle,
* read it.
*/
for (off = 0;
- off < LZO_HEADER + data[thr].cmp_len;
+ off < CMP_HEADER + data[thr].cmp_len;
off += PAGE_SIZE) {
memcpy(page, data[thr].cmp + off, PAGE_SIZE);
- ret = swap_write_page(handle, page, &bio);
+ ret = swap_write_page(handle, page, &hb);
if (ret)
goto out_finish;
}
}
- wait_event(crc->done, atomic_read(&crc->stop));
+ wait_event(crc->done, atomic_read_acquire(&crc->stop));
atomic_set(&crc->stop, 0);
}
out_finish:
- err2 = hib_wait_on_bio_chain(&bio);
+ err2 = hib_wait_io(&hb);
stop = ktime_get();
if (!ret)
ret = err2;
- if (!ret)
- printk(KERN_INFO "PM: Image saving done.\n");
- swsusp_show_speed(start, stop, nr_to_write, "Wrote");
-out_clean:
- if (crc) {
- if (crc->thr)
- kthread_stop(crc->thr);
- kfree(crc);
+ if (!ret) {
+ swsusp_show_speed(start, stop, nr_to_write, "Wrote");
+ pr_info("Image size after compression: %lld kbytes\n",
+ (atomic64_read(&compressed_size) / 1024));
+ pr_info("Image saving done\n");
+ } else {
+ pr_err("Image saving failed: %d\n", ret);
}
+
+out_clean:
+ hib_finish_batch(&hb);
+ free_crc_data(crc);
if (data) {
- for (thr = 0; thr < nr_threads; thr++)
+ for (thr = 0; thr < nr_threads; thr++) {
if (data[thr].thr)
kthread_stop(data[thr].thr);
+ acomp_request_free(data[thr].cr);
+ crypto_free_acomp(data[thr].cc);
+ }
vfree(data);
}
- if (page) free_page((unsigned long)page);
+ if (page)
+ free_page((unsigned long)page);
return ret;
}
-/**
- * enough_swap - Make sure we have enough swap to save the image.
- *
- * Returns TRUE or FALSE after checking the total amount of swap
- * space avaiable from the resume partition.
- */
-
-static int enough_swap(unsigned int nr_pages, unsigned int flags)
+static int enough_swap(unsigned int nr_pages)
{
unsigned int free_swap = count_swap_pages(root_swap, 1);
unsigned int required;
- pr_debug("PM: Free swap pages: %u\n", free_swap);
+ pr_debug("Free swap pages: %u\n", free_swap);
required = PAGES_FOR_IO + nr_pages;
return free_swap > required;
}
/**
- * swsusp_write - Write entire image and metadata.
- * @flags: flags to pass to the "boot" kernel in the image header
+ * swsusp_write - Write entire image and metadata.
+ * @flags: flags to pass to the "boot" kernel in the image header
+ *
+ * It is important _NOT_ to umount filesystems at this point. We want them
+ * synced (in case something goes wrong) but we DO not want to mark filesystem
+ * clean: it is not. (And it does not matter, if we resume correctly, we'll mark
+ * system clean, anyway.)
*
- * It is important _NOT_ to umount filesystems at this point. We want
- * them synced (in case something goes wrong) but we DO not want to mark
- * filesystem clean: it is not. (And it does not matter, if we resume
- * correctly, we'll mark system clean, anyway.)
+ * Return: 0 on success, negative error code on failure.
*/
-
int swsusp_write(unsigned int flags)
{
struct swap_map_handle handle;
@@ -822,19 +946,19 @@ int swsusp_write(unsigned int flags)
pages = snapshot_get_image_size();
error = get_swap_writer(&handle);
if (error) {
- printk(KERN_ERR "PM: Cannot get swap writer\n");
+ pr_err("Cannot get swap writer\n");
return error;
}
if (flags & SF_NOCOMPRESS_MODE) {
- if (!enough_swap(pages, flags)) {
- printk(KERN_ERR "PM: Not enough free swap\n");
+ if (!enough_swap(pages)) {
+ pr_err("Not enough free swap\n");
error = -ENOSPC;
goto out_finish;
}
}
memset(&snapshot, 0, sizeof(struct snapshot_handle));
error = snapshot_read_next(&snapshot);
- if (error < PAGE_SIZE) {
+ if (error < (int)PAGE_SIZE) {
if (error >= 0)
error = -EFAULT;
@@ -845,16 +969,16 @@ int swsusp_write(unsigned int flags)
if (!error) {
error = (flags & SF_NOCOMPRESS_MODE) ?
save_image(&handle, &snapshot, pages - 1) :
- save_image_lzo(&handle, &snapshot, pages - 1);
+ save_compressed_image(&handle, &snapshot, pages - 1);
}
out_finish:
error = swap_writer_finish(&handle, flags, error);
return error;
}
-/**
- * The following functions allow us to read data using a swap map
- * in a file-alike way
+/*
+ * The following functions allow us to read data using a swap map in a file-like
+ * way.
*/
static void release_swap_reader(struct swap_map_handle *handle)
@@ -887,12 +1011,11 @@ static int get_swap_reader(struct swap_map_handle *handle,
last = handle->maps = NULL;
offset = swsusp_header->image;
while (offset) {
- tmp = kmalloc(sizeof(*handle->maps), GFP_KERNEL);
+ tmp = kzalloc(sizeof(*handle->maps), GFP_KERNEL);
if (!tmp) {
release_swap_reader(handle);
return -ENOMEM;
}
- memset(tmp, 0, sizeof(*tmp));
if (!handle->maps)
handle->maps = tmp;
if (last)
@@ -900,13 +1023,13 @@ static int get_swap_reader(struct swap_map_handle *handle,
last = tmp;
tmp->map = (struct swap_map_page *)
- __get_free_page(__GFP_WAIT | __GFP_HIGH);
+ __get_free_page(GFP_NOIO | __GFP_HIGH);
if (!tmp->map) {
release_swap_reader(handle);
return -ENOMEM;
}
- error = hib_bio_read_page(offset, tmp->map, NULL);
+ error = hib_submit_io_sync(REQ_OP_READ, offset, tmp->map);
if (error) {
release_swap_reader(handle);
return error;
@@ -919,7 +1042,7 @@ static int get_swap_reader(struct swap_map_handle *handle,
}
static int swap_read_page(struct swap_map_handle *handle, void *buf,
- struct bio **bio_chain)
+ struct hib_bio_batch *hb)
{
sector_t offset;
int error;
@@ -930,7 +1053,10 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf,
offset = handle->cur->entries[handle->k];
if (!offset)
return -EFAULT;
- error = hib_bio_read_page(offset, buf, bio_chain);
+ if (hb)
+ error = hib_submit_io_async(REQ_OP_READ, offset, buf, hb);
+ else
+ error = hib_submit_io_sync(REQ_OP_READ, offset, buf);
if (error)
return error;
if (++handle->k >= MAP_PAGE_ENTRIES) {
@@ -954,12 +1080,6 @@ static int swap_reader_finish(struct swap_map_handle *handle)
return 0;
}
-/**
- * load_image - load the image using the swap map handle
- * @handle and the snapshot handle @snapshot
- * (assume there are @nr_pages pages to load)
- */
-
static int load_image(struct swap_map_handle *handle,
struct snapshot_handle *snapshot,
unsigned int nr_to_read)
@@ -968,53 +1088,57 @@ static int load_image(struct swap_map_handle *handle,
int ret = 0;
ktime_t start;
ktime_t stop;
- struct bio *bio;
+ struct hib_bio_batch hb;
int err2;
unsigned nr_pages;
- printk(KERN_INFO "PM: Loading image data pages (%u pages)...\n",
- nr_to_read);
+ hib_init_batch(&hb);
+
+ clean_pages_on_read = true;
+ pr_info("Loading image data pages (%u pages)...\n", nr_to_read);
m = nr_to_read / 10;
if (!m)
m = 1;
nr_pages = 0;
- bio = NULL;
start = ktime_get();
for ( ; ; ) {
ret = snapshot_write_next(snapshot);
if (ret <= 0)
break;
- ret = swap_read_page(handle, data_of(*snapshot), &bio);
+ ret = swap_read_page(handle, data_of(*snapshot), &hb);
if (ret)
break;
if (snapshot->sync_read)
- ret = hib_wait_on_bio_chain(&bio);
+ ret = hib_wait_io(&hb);
if (ret)
break;
if (!(nr_pages % m))
- printk(KERN_INFO "PM: Image loading progress: %3d%%\n",
- nr_pages / m * 10);
+ pr_info("Image loading progress: %3d%%\n",
+ nr_pages / m * 10);
nr_pages++;
}
- err2 = hib_wait_on_bio_chain(&bio);
+ err2 = hib_wait_io(&hb);
+ hib_finish_batch(&hb);
stop = ktime_get();
if (!ret)
ret = err2;
if (!ret) {
- printk(KERN_INFO "PM: Image loading done.\n");
- snapshot_write_finalize(snapshot);
- if (!snapshot_image_loaded(snapshot))
+ pr_info("Image loading done\n");
+ ret = snapshot_write_finalize(snapshot);
+ if (!ret && !snapshot_image_loaded(snapshot))
ret = -ENODATA;
}
swsusp_show_speed(start, stop, nr_to_read, "Read");
return ret;
}
-/**
- * Structure used for LZO data decompression.
+/*
+ * Structure used for data decompression.
*/
struct dec_data {
struct task_struct *thr; /* thread */
+ struct crypto_acomp *cc; /* crypto compressor */
+ struct acomp_req *cr; /* crypto request */
atomic_t ready; /* ready to start flag */
atomic_t stop; /* ready to stop flag */
int ret; /* return code */
@@ -1022,52 +1146,52 @@ struct dec_data {
wait_queue_head_t done; /* decompression done */
size_t unc_len; /* uncompressed length */
size_t cmp_len; /* compressed length */
- unsigned char unc[LZO_UNC_SIZE]; /* uncompressed buffer */
- unsigned char cmp[LZO_CMP_SIZE]; /* compressed buffer */
+ unsigned char unc[UNC_SIZE]; /* uncompressed buffer */
+ unsigned char cmp[CMP_SIZE]; /* compressed buffer */
};
-/**
- * Deompression function that runs in its own thread.
- */
-static int lzo_decompress_threadfn(void *data)
+static int decompress_threadfn(void *data)
{
struct dec_data *d = data;
while (1) {
- wait_event(d->go, atomic_read(&d->ready) ||
+ wait_event(d->go, atomic_read_acquire(&d->ready) ||
kthread_should_stop());
if (kthread_should_stop()) {
d->thr = NULL;
d->ret = -1;
- atomic_set(&d->stop, 1);
+ atomic_set_release(&d->stop, 1);
wake_up(&d->done);
break;
}
atomic_set(&d->ready, 0);
- d->unc_len = LZO_UNC_SIZE;
- d->ret = lzo1x_decompress_safe(d->cmp + LZO_HEADER, d->cmp_len,
- d->unc, &d->unc_len);
- atomic_set(&d->stop, 1);
+ acomp_request_set_callback(d->cr, CRYPTO_TFM_REQ_MAY_SLEEP,
+ NULL, NULL);
+ acomp_request_set_src_nondma(d->cr, d->cmp + CMP_HEADER,
+ d->cmp_len);
+ acomp_request_set_dst_nondma(d->cr, d->unc, UNC_SIZE);
+ d->ret = crypto_acomp_decompress(d->cr);
+ d->unc_len = d->cr->dlen;
+
+ if (clean_pages_on_decompress)
+ flush_icache_range((unsigned long)d->unc,
+ (unsigned long)d->unc + d->unc_len);
+
+ atomic_set_release(&d->stop, 1);
wake_up(&d->done);
}
return 0;
}
-/**
- * load_image_lzo - Load compressed image data and decompress them with LZO.
- * @handle: Swap map handle to use for loading data.
- * @snapshot: Image to copy uncompressed data into.
- * @nr_to_read: Number of pages to load.
- */
-static int load_image_lzo(struct swap_map_handle *handle,
- struct snapshot_handle *snapshot,
- unsigned int nr_to_read)
+static int load_compressed_image(struct swap_map_handle *handle,
+ struct snapshot_handle *snapshot,
+ unsigned int nr_to_read)
{
unsigned int m;
int ret = 0;
int eof = 0;
- struct bio *bio;
+ struct hib_bio_batch hb;
ktime_t start;
ktime_t stop;
unsigned nr_pages;
@@ -1080,36 +1204,37 @@ static int load_image_lzo(struct swap_map_handle *handle,
struct dec_data *data = NULL;
struct crc_data *crc = NULL;
+ hib_init_batch(&hb);
+
/*
* We'll limit the number of threads for decompression to limit memory
* footprint.
*/
nr_threads = num_online_cpus() - 1;
- nr_threads = clamp_val(nr_threads, 1, LZO_THREADS);
+ nr_threads = clamp_val(nr_threads, 1, hibernate_compression_threads);
- page = vmalloc(sizeof(*page) * LZO_MAX_RD_PAGES);
+ page = vmalloc_array(CMP_MAX_RD_PAGES, sizeof(*page));
if (!page) {
- printk(KERN_ERR "PM: Failed to allocate LZO page\n");
+ pr_err("Failed to allocate %s page\n", hib_comp_algo);
ret = -ENOMEM;
goto out_clean;
}
- data = vmalloc(sizeof(*data) * nr_threads);
+ data = vcalloc(nr_threads, sizeof(*data));
if (!data) {
- printk(KERN_ERR "PM: Failed to allocate LZO data\n");
+ pr_err("Failed to allocate %s data\n", hib_comp_algo);
ret = -ENOMEM;
goto out_clean;
}
- for (thr = 0; thr < nr_threads; thr++)
- memset(&data[thr], 0, offsetof(struct dec_data, go));
- crc = kmalloc(sizeof(*crc), GFP_KERNEL);
+ crc = alloc_crc_data(nr_threads);
if (!crc) {
- printk(KERN_ERR "PM: Failed to allocate crc\n");
+ pr_err("Failed to allocate crc\n");
ret = -ENOMEM;
goto out_clean;
}
- memset(crc, 0, offsetof(struct crc_data, go));
+
+ clean_pages_on_decompress = true;
/*
* Start the decompression threads.
@@ -1118,13 +1243,26 @@ static int load_image_lzo(struct swap_map_handle *handle,
init_waitqueue_head(&data[thr].go);
init_waitqueue_head(&data[thr].done);
- data[thr].thr = kthread_run(lzo_decompress_threadfn,
+ data[thr].cc = crypto_alloc_acomp(hib_comp_algo, 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR_OR_NULL(data[thr].cc)) {
+ pr_err("Could not allocate comp stream %ld\n", PTR_ERR(data[thr].cc));
+ ret = -EFAULT;
+ goto out_clean;
+ }
+
+ data[thr].cr = acomp_request_alloc(data[thr].cc);
+ if (!data[thr].cr) {
+ pr_err("Could not allocate comp request\n");
+ ret = -ENOMEM;
+ goto out_clean;
+ }
+
+ data[thr].thr = kthread_run(decompress_threadfn,
&data[thr],
"image_decompress/%u", thr);
if (IS_ERR(data[thr].thr)) {
data[thr].thr = NULL;
- printk(KERN_ERR
- "PM: Cannot start decompression threads\n");
+ pr_err("Cannot start decompression threads\n");
ret = -ENOMEM;
goto out_clean;
}
@@ -1146,7 +1284,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32");
if (IS_ERR(crc->thr)) {
crc->thr = NULL;
- printk(KERN_ERR "PM: Cannot start CRC32 thread\n");
+ pr_err("Cannot start CRC32 thread\n");
ret = -ENOMEM;
goto out_clean;
}
@@ -1160,19 +1298,18 @@ static int load_image_lzo(struct swap_map_handle *handle,
*/
if (low_free_pages() > snapshot_get_image_size())
read_pages = (low_free_pages() - snapshot_get_image_size()) / 2;
- read_pages = clamp_val(read_pages, LZO_MIN_RD_PAGES, LZO_MAX_RD_PAGES);
+ read_pages = clamp_val(read_pages, CMP_MIN_RD_PAGES, CMP_MAX_RD_PAGES);
for (i = 0; i < read_pages; i++) {
- page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ?
- __GFP_WAIT | __GFP_HIGH :
- __GFP_WAIT | __GFP_NOWARN |
- __GFP_NORETRY);
+ page[i] = (void *)__get_free_page(i < CMP_PAGES ?
+ GFP_NOIO | __GFP_HIGH :
+ GFP_NOIO | __GFP_NOWARN |
+ __GFP_NORETRY);
if (!page[i]) {
- if (i < LZO_CMP_PAGES) {
+ if (i < CMP_PAGES) {
ring_size = i;
- printk(KERN_ERR
- "PM: Failed to allocate LZO pages\n");
+ pr_err("Failed to allocate %s pages\n", hib_comp_algo);
ret = -ENOMEM;
goto out_clean;
} else {
@@ -1182,15 +1319,13 @@ static int load_image_lzo(struct swap_map_handle *handle,
}
want = ring_size = i;
- printk(KERN_INFO
- "PM: Using %u thread(s) for decompression.\n"
- "PM: Loading and decompressing image data (%u pages)...\n",
- nr_threads, nr_to_read);
+ pr_info("Using %u thread(s) for %s decompression\n", nr_threads, hib_comp_algo);
+ pr_info("Loading and decompressing image data (%u pages)...\n",
+ nr_to_read);
m = nr_to_read / 10;
if (!m)
m = 1;
nr_pages = 0;
- bio = NULL;
start = ktime_get();
ret = snapshot_write_next(snapshot);
@@ -1199,7 +1334,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
for(;;) {
for (i = 0; !eof && i < want; i++) {
- ret = swap_read_page(handle, page[ring], &bio);
+ ret = swap_read_page(handle, page[ring], &hb);
if (ret) {
/*
* On real read error, finish. On end of data,
@@ -1226,7 +1361,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
if (!asked)
break;
- ret = hib_wait_on_bio_chain(&bio);
+ ret = hib_wait_io(&hb);
if (ret)
goto out_finish;
have += asked;
@@ -1236,7 +1371,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
}
if (crc->run_threads) {
- wait_event(crc->done, atomic_read(&crc->stop));
+ wait_event(crc->done, atomic_read_acquire(&crc->stop));
atomic_set(&crc->stop, 0);
crc->run_threads = 0;
}
@@ -1245,14 +1380,13 @@ static int load_image_lzo(struct swap_map_handle *handle,
data[thr].cmp_len = *(size_t *)page[pg];
if (unlikely(!data[thr].cmp_len ||
data[thr].cmp_len >
- lzo1x_worst_compress(LZO_UNC_SIZE))) {
- printk(KERN_ERR
- "PM: Invalid LZO compressed length\n");
+ bytes_worst_compress(UNC_SIZE))) {
+ pr_err("Invalid %s compressed length\n", hib_comp_algo);
ret = -1;
goto out_finish;
}
- need = DIV_ROUND_UP(data[thr].cmp_len + LZO_HEADER,
+ need = DIV_ROUND_UP(data[thr].cmp_len + CMP_HEADER,
PAGE_SIZE);
if (need > have) {
if (eof > 1) {
@@ -1263,7 +1397,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
}
for (off = 0;
- off < LZO_HEADER + data[thr].cmp_len;
+ off < CMP_HEADER + data[thr].cmp_len;
off += PAGE_SIZE) {
memcpy(data[thr].cmp + off,
page[pg], PAGE_SIZE);
@@ -1273,15 +1407,15 @@ static int load_image_lzo(struct swap_map_handle *handle,
pg = 0;
}
- atomic_set(&data[thr].ready, 1);
+ atomic_set_release(&data[thr].ready, 1);
wake_up(&data[thr].go);
}
/*
* Wait for more data while we are decompressing.
*/
- if (have < LZO_CMP_PAGES && asked) {
- ret = hib_wait_on_bio_chain(&bio);
+ if (have < CMP_PAGES && asked) {
+ ret = hib_wait_io(&hb);
if (ret)
goto out_finish;
have += asked;
@@ -1292,22 +1426,20 @@ static int load_image_lzo(struct swap_map_handle *handle,
for (run_threads = thr, thr = 0; thr < run_threads; thr++) {
wait_event(data[thr].done,
- atomic_read(&data[thr].stop));
+ atomic_read_acquire(&data[thr].stop));
atomic_set(&data[thr].stop, 0);
ret = data[thr].ret;
if (ret < 0) {
- printk(KERN_ERR
- "PM: LZO decompression failed\n");
+ pr_err("%s decompression failed\n", hib_comp_algo);
goto out_finish;
}
if (unlikely(!data[thr].unc_len ||
- data[thr].unc_len > LZO_UNC_SIZE ||
- data[thr].unc_len & (PAGE_SIZE - 1))) {
- printk(KERN_ERR
- "PM: Invalid LZO uncompressed length\n");
+ data[thr].unc_len > UNC_SIZE ||
+ data[thr].unc_len & (PAGE_SIZE - 1))) {
+ pr_err("Invalid %s uncompressed length\n", hib_comp_algo);
ret = -1;
goto out_finish;
}
@@ -1318,16 +1450,14 @@ static int load_image_lzo(struct swap_map_handle *handle,
data[thr].unc + off, PAGE_SIZE);
if (!(nr_pages % m))
- printk(KERN_INFO
- "PM: Image loading progress: "
- "%3d%%\n",
- nr_pages / m * 10);
+ pr_info("Image loading progress: %3d%%\n",
+ nr_pages / m * 10);
nr_pages++;
ret = snapshot_write_next(snapshot);
if (ret <= 0) {
crc->run_threads = thr + 1;
- atomic_set(&crc->ready, 1);
+ atomic_set_release(&crc->ready, 1);
wake_up(&crc->go);
goto out_finish;
}
@@ -1335,26 +1465,25 @@ static int load_image_lzo(struct swap_map_handle *handle,
}
crc->run_threads = thr;
- atomic_set(&crc->ready, 1);
+ atomic_set_release(&crc->ready, 1);
wake_up(&crc->go);
}
out_finish:
if (crc->run_threads) {
- wait_event(crc->done, atomic_read(&crc->stop));
+ wait_event(crc->done, atomic_read_acquire(&crc->stop));
atomic_set(&crc->stop, 0);
}
stop = ktime_get();
if (!ret) {
- printk(KERN_INFO "PM: Image loading done.\n");
- snapshot_write_finalize(snapshot);
- if (!snapshot_image_loaded(snapshot))
+ pr_info("Image loading done\n");
+ ret = snapshot_write_finalize(snapshot);
+ if (!ret && !snapshot_image_loaded(snapshot))
ret = -ENODATA;
if (!ret) {
if (swsusp_header->flags & SF_CRC32_MODE) {
if(handle->crc32 != swsusp_header->crc32) {
- printk(KERN_ERR
- "PM: Invalid image CRC32!\n");
+ pr_err("Invalid image CRC32!\n");
ret = -ENODATA;
}
}
@@ -1362,17 +1491,17 @@ out_finish:
}
swsusp_show_speed(start, stop, nr_to_read, "Read");
out_clean:
+ hib_finish_batch(&hb);
for (i = 0; i < ring_size; i++)
free_page((unsigned long)page[i]);
- if (crc) {
- if (crc->thr)
- kthread_stop(crc->thr);
- kfree(crc);
- }
+ free_crc_data(crc);
if (data) {
- for (thr = 0; thr < nr_threads; thr++)
+ for (thr = 0; thr < nr_threads; thr++) {
if (data[thr].thr)
kthread_stop(data[thr].thr);
+ acomp_request_free(data[thr].cr);
+ crypto_free_acomp(data[thr].cc);
+ }
vfree(data);
}
vfree(page);
@@ -1384,8 +1513,9 @@ out_clean:
* swsusp_read - read the hibernation image.
* @flags_p: flags passed by the "frozen" kernel in the image header should
* be written into this memory location
+ *
+ * Return: 0 on success, negative error code on failure.
*/
-
int swsusp_read(unsigned int *flags_p)
{
int error;
@@ -1395,7 +1525,7 @@ int swsusp_read(unsigned int *flags_p)
memset(&snapshot, 0, sizeof(struct snapshot_handle));
error = snapshot_write_next(&snapshot);
- if (error < PAGE_SIZE)
+ if (error < (int)PAGE_SIZE)
return error < 0 ? error : -EFAULT;
header = (struct swsusp_info *)data_of(snapshot);
error = get_swap_reader(&handle, flags_p);
@@ -1406,89 +1536,102 @@ int swsusp_read(unsigned int *flags_p)
if (!error) {
error = (*flags_p & SF_NOCOMPRESS_MODE) ?
load_image(&handle, &snapshot, header->pages - 1) :
- load_image_lzo(&handle, &snapshot, header->pages - 1);
+ load_compressed_image(&handle, &snapshot, header->pages - 1);
}
swap_reader_finish(&handle);
end:
if (!error)
- pr_debug("PM: Image successfully loaded\n");
+ pr_debug("Image successfully loaded\n");
else
- pr_debug("PM: Error %d resuming\n", error);
+ pr_debug("Error %d resuming\n", error);
return error;
}
+static void *swsusp_holder;
+
/**
- * swsusp_check - Check for swsusp signature in the resume device
+ * swsusp_check - Open the resume device and check for the swsusp signature.
+ * @exclusive: Open the resume device exclusively.
+ *
+ * Return: 0 if a valid image is found, negative error code otherwise.
*/
-
-int swsusp_check(void)
+int swsusp_check(bool exclusive)
{
+ void *holder = exclusive ? &swsusp_holder : NULL;
int error;
- hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device,
- FMODE_READ, NULL);
- if (!IS_ERR(hib_resume_bdev)) {
- set_blocksize(hib_resume_bdev, PAGE_SIZE);
+ hib_resume_bdev_file = bdev_file_open_by_dev(swsusp_resume_device,
+ BLK_OPEN_READ, holder, NULL);
+ if (!IS_ERR(hib_resume_bdev_file)) {
clear_page(swsusp_header);
- error = hib_bio_read_page(swsusp_resume_block,
- swsusp_header, NULL);
+ error = hib_submit_io_sync(REQ_OP_READ, swsusp_resume_block,
+ swsusp_header);
if (error)
goto put;
if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) {
memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10);
+ swsusp_header_flags = swsusp_header->flags;
/* Reset swap signature now */
- error = hib_bio_write_page(swsusp_resume_block,
- swsusp_header, NULL);
+ error = hib_submit_io_sync(REQ_OP_WRITE | REQ_SYNC,
+ swsusp_resume_block,
+ swsusp_header);
} else {
error = -EINVAL;
}
+ if (!error && swsusp_header->flags & SF_HW_SIG &&
+ swsusp_header->hw_sig != swsusp_hardware_signature) {
+ pr_info("Suspend image hardware signature mismatch (%08x now %08x); aborting resume.\n",
+ swsusp_header->hw_sig, swsusp_hardware_signature);
+ error = -EINVAL;
+ }
put:
if (error)
- blkdev_put(hib_resume_bdev, FMODE_READ);
+ bdev_fput(hib_resume_bdev_file);
else
- pr_debug("PM: Image signature found, resuming\n");
+ pr_debug("Image signature found, resuming\n");
} else {
- error = PTR_ERR(hib_resume_bdev);
+ error = PTR_ERR(hib_resume_bdev_file);
}
if (error)
- pr_debug("PM: Image not found (code %d)\n", error);
+ pr_debug("Image not found (code %d)\n", error);
return error;
}
/**
- * swsusp_close - close swap device.
+ * swsusp_close - close resume device.
*/
-
-void swsusp_close(fmode_t mode)
+void swsusp_close(void)
{
- if (IS_ERR(hib_resume_bdev)) {
- pr_debug("PM: Image device not initialised\n");
+ if (IS_ERR(hib_resume_bdev_file)) {
+ pr_debug("Image device not initialised\n");
return;
}
- blkdev_put(hib_resume_bdev, mode);
+ fput(hib_resume_bdev_file);
}
/**
- * swsusp_unmark - Unmark swsusp signature in the resume device
+ * swsusp_unmark - Unmark swsusp signature in the resume device
+ *
+ * Return: 0 on success, negative error code on failure.
*/
-
#ifdef CONFIG_SUSPEND
int swsusp_unmark(void)
{
int error;
- hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL);
+ hib_submit_io_sync(REQ_OP_READ, swsusp_resume_block, swsusp_header);
if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) {
memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10);
- error = hib_bio_write_page(swsusp_resume_block,
- swsusp_header, NULL);
+ error = hib_submit_io_sync(REQ_OP_WRITE | REQ_SYNC,
+ swsusp_resume_block,
+ swsusp_header);
} else {
- printk(KERN_ERR "PM: Cannot find swsusp signature!\n");
+ pr_err("Cannot find swsusp signature!\n");
error = -ENODEV;
}
@@ -1501,8 +1644,46 @@ int swsusp_unmark(void)
}
#endif
-static int swsusp_header_init(void)
+static ssize_t hibernate_compression_threads_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
{
+ return sysfs_emit(buf, "%d\n", hibernate_compression_threads);
+}
+
+static ssize_t hibernate_compression_threads_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ unsigned long val;
+
+ if (kstrtoul(buf, 0, &val))
+ return -EINVAL;
+
+ if (val < 1)
+ return -EINVAL;
+
+ hibernate_compression_threads = val;
+ return n;
+}
+power_attr(hibernate_compression_threads);
+
+static struct attribute *g[] = {
+ &hibernate_compression_threads_attr.attr,
+ NULL,
+};
+
+static const struct attribute_group attr_group = {
+ .attrs = g,
+};
+
+static int __init swsusp_header_init(void)
+{
+ int error;
+
+ error = sysfs_create_group(power_kobj, &attr_group);
+ if (error)
+ return -ENOMEM;
+
swsusp_header = (struct swsusp_header*) __get_free_page(GFP_KERNEL);
if (!swsusp_header)
panic("Could not allocate memory for swsusp_header\n");
@@ -1510,3 +1691,19 @@ static int swsusp_header_init(void)
}
core_initcall(swsusp_header_init);
+
+static int __init hibernate_compression_threads_setup(char *str)
+{
+ int rc = kstrtouint(str, 0, &hibernate_compression_threads);
+
+ if (rc)
+ return rc;
+
+ if (hibernate_compression_threads < 1)
+ hibernate_compression_threads = CMP_THREADS;
+
+ return 1;
+
+}
+
+__setup("hibernate_compression_threads=", hibernate_compression_threads_setup);
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 526e8911460a..4401cfe26e5c 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -1,16 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/kernel/power/user.c
*
* This file provides the user space interface for software suspend/resume.
*
* Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
- *
- * This file is released under the GPLv2.
- *
*/
#include <linux/suspend.h>
-#include <linux/syscalls.h>
#include <linux/reboot.h>
#include <linux/string.h>
#include <linux/device.h>
@@ -25,12 +22,11 @@
#include <linux/cpu.h>
#include <linux/freezer.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "power.h"
-
-#define SNAPSHOT_MINOR 231
+static bool need_wait;
static struct snapshot_data {
struct snapshot_handle handle;
@@ -40,27 +36,32 @@ static struct snapshot_data {
bool ready;
bool platform_support;
bool free_bitmaps;
+ dev_t dev;
} snapshot_state;
-atomic_t snapshot_device_available = ATOMIC_INIT(1);
+int is_hibernate_resume_dev(dev_t dev)
+{
+ return hibernation_available() && snapshot_state.dev == dev;
+}
static int snapshot_open(struct inode *inode, struct file *filp)
{
struct snapshot_data *data;
+ unsigned int sleep_flags;
int error;
if (!hibernation_available())
return -EPERM;
- lock_system_sleep();
+ sleep_flags = lock_system_sleep();
- if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
+ if (!hibernate_acquire()) {
error = -EBUSY;
goto Unlock;
}
if ((filp->f_flags & O_ACCMODE) == O_RDWR) {
- atomic_inc(&snapshot_device_available);
+ hibernate_release();
error = -ENOSYS;
goto Unlock;
}
@@ -70,39 +71,35 @@ static int snapshot_open(struct inode *inode, struct file *filp)
memset(&data->handle, 0, sizeof(struct snapshot_handle));
if ((filp->f_flags & O_ACCMODE) == O_RDONLY) {
/* Hibernating. The image device should be accessible. */
- data->swap = swsusp_resume_device ?
- swap_type_of(swsusp_resume_device, 0, NULL) : -1;
+ data->swap = swap_type_of(swsusp_resume_device, 0);
data->mode = O_RDONLY;
data->free_bitmaps = false;
- error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);
- if (error)
- pm_notifier_call_chain(PM_POST_HIBERNATION);
+ error = pm_notifier_call_chain_robust(PM_HIBERNATION_PREPARE, PM_POST_HIBERNATION);
} else {
/*
* Resuming. We may need to wait for the image device to
* appear.
*/
- wait_for_device_probe();
+ need_wait = true;
data->swap = -1;
data->mode = O_WRONLY;
- error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
+ error = pm_notifier_call_chain_robust(PM_RESTORE_PREPARE, PM_POST_RESTORE);
if (!error) {
error = create_basic_memory_bitmaps();
data->free_bitmaps = !error;
}
- if (error)
- pm_notifier_call_chain(PM_POST_RESTORE);
}
if (error)
- atomic_inc(&snapshot_device_available);
+ hibernate_release();
data->frozen = false;
data->ready = false;
data->platform_support = false;
+ data->dev = 0;
Unlock:
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
return error;
}
@@ -110,11 +107,13 @@ static int snapshot_open(struct inode *inode, struct file *filp)
static int snapshot_release(struct inode *inode, struct file *filp)
{
struct snapshot_data *data;
+ unsigned int sleep_flags;
- lock_system_sleep();
+ sleep_flags = lock_system_sleep();
swsusp_free();
data = filp->private_data;
+ data->dev = 0;
free_all_swap_pages(data->swap);
if (data->frozen) {
pm_restore_gfp_mask();
@@ -125,9 +124,9 @@ static int snapshot_release(struct inode *inode, struct file *filp)
}
pm_notifier_call_chain(data->mode == O_RDONLY ?
PM_POST_HIBERNATION : PM_POST_RESTORE);
- atomic_inc(&snapshot_device_available);
+ hibernate_release();
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
return 0;
}
@@ -135,11 +134,12 @@ static int snapshot_release(struct inode *inode, struct file *filp)
static ssize_t snapshot_read(struct file *filp, char __user *buf,
size_t count, loff_t *offp)
{
+ loff_t pg_offp = *offp & ~PAGE_MASK;
struct snapshot_data *data;
+ unsigned int sleep_flags;
ssize_t res;
- loff_t pg_offp = *offp & ~PAGE_MASK;
- lock_system_sleep();
+ sleep_flags = lock_system_sleep();
data = filp->private_data;
if (!data->ready) {
@@ -160,7 +160,7 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf,
*offp += res;
Unlock:
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
return res;
}
@@ -168,11 +168,17 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf,
static ssize_t snapshot_write(struct file *filp, const char __user *buf,
size_t count, loff_t *offp)
{
+ loff_t pg_offp = *offp & ~PAGE_MASK;
struct snapshot_data *data;
+ unsigned long sleep_flags;
ssize_t res;
- loff_t pg_offp = *offp & ~PAGE_MASK;
- lock_system_sleep();
+ if (need_wait) {
+ wait_for_device_probe();
+ need_wait = false;
+ }
+
+ sleep_flags = lock_system_sleep();
data = filp->private_data;
@@ -181,7 +187,12 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
if (res <= 0)
goto unlock;
} else {
- res = PAGE_SIZE - pg_offp;
+ res = PAGE_SIZE;
+ }
+
+ if (!data_of(data->handle)) {
+ res = -EINVAL;
+ goto unlock;
}
res = simple_write_to_buffer(data_of(data->handle), res, &pg_offp,
@@ -189,11 +200,52 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
if (res > 0)
*offp += res;
unlock:
- unlock_system_sleep();
+ unlock_system_sleep(sleep_flags);
return res;
}
+struct compat_resume_swap_area {
+ compat_loff_t offset;
+ u32 dev;
+} __packed;
+
+static int snapshot_set_swap_area(struct snapshot_data *data,
+ void __user *argp)
+{
+ sector_t offset;
+ dev_t swdev;
+
+ if (swsusp_swap_in_use())
+ return -EPERM;
+
+ if (in_compat_syscall()) {
+ struct compat_resume_swap_area swap_area;
+
+ if (copy_from_user(&swap_area, argp, sizeof(swap_area)))
+ return -EFAULT;
+ swdev = new_decode_dev(swap_area.dev);
+ offset = swap_area.offset;
+ } else {
+ struct resume_swap_area swap_area;
+
+ if (copy_from_user(&swap_area, argp, sizeof(swap_area)))
+ return -EFAULT;
+ swdev = new_decode_dev(swap_area.dev);
+ offset = swap_area.offset;
+ }
+
+ /*
+ * User space encodes device types as two-byte values,
+ * so we need to recode them
+ */
+ data->swap = swap_type_of(swdev, offset);
+ if (data->swap < 0)
+ return swdev ? -ENODEV : -EINVAL;
+ data->dev = swdev;
+ return 0;
+}
+
static long snapshot_ioctl(struct file *filp, unsigned int cmd,
unsigned long arg)
{
@@ -202,6 +254,11 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
loff_t size;
sector_t offset;
+ if (need_wait) {
+ wait_for_device_probe();
+ need_wait = false;
+ }
+
if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC)
return -ENOTTY;
if (_IOC_NR(cmd) > SNAPSHOT_IOC_MAXNR)
@@ -209,7 +266,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!mutex_trylock(&pm_mutex))
+ if (!mutex_trylock(&system_transition_mutex))
return -EBUSY;
lock_device_hotplug();
@@ -221,9 +278,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
if (data->frozen)
break;
- printk("Syncing filesystems ... ");
- sys_sync();
- printk("done.\n");
+ error = pm_sleep_fs_sync();
+ if (error)
+ break;
error = freeze_processes();
if (error)
@@ -262,7 +319,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
break;
case SNAPSHOT_ATOMIC_RESTORE:
- snapshot_write_finalize(&data->handle);
+ error = snapshot_write_finalize(&data->handle);
+ if (error)
+ break;
if (data->mode != O_WRONLY || !data->frozen ||
!snapshot_image_loaded(&data->handle)) {
error = -EPERM;
@@ -351,34 +410,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
break;
case SNAPSHOT_SET_SWAP_AREA:
- if (swsusp_swap_in_use()) {
- error = -EPERM;
- } else {
- struct resume_swap_area swap_area;
- dev_t swdev;
-
- error = copy_from_user(&swap_area, (void __user *)arg,
- sizeof(struct resume_swap_area));
- if (error) {
- error = -EFAULT;
- break;
- }
-
- /*
- * User space encodes device types as two-byte values,
- * so we need to recode them
- */
- swdev = new_decode_dev(swap_area.dev);
- if (swdev) {
- offset = swap_area.offset;
- data->swap = swap_type_of(swdev, offset, NULL);
- if (data->swap < 0)
- error = -ENODEV;
- } else {
- data->swap = -1;
- error = -EINVAL;
- }
- }
+ error = snapshot_set_swap_area(data, (void __user *)arg);
break;
default:
@@ -387,18 +419,12 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
}
unlock_device_hotplug();
- mutex_unlock(&pm_mutex);
+ mutex_unlock(&system_transition_mutex);
return error;
}
#ifdef CONFIG_COMPAT
-
-struct compat_resume_swap_area {
- compat_loff_t offset;
- u32 dev;
-} __packed;
-
static long
snapshot_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
@@ -407,49 +433,15 @@ snapshot_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
switch (cmd) {
case SNAPSHOT_GET_IMAGE_SIZE:
case SNAPSHOT_AVAIL_SWAP_SIZE:
- case SNAPSHOT_ALLOC_SWAP_PAGE: {
- compat_loff_t __user *uoffset = compat_ptr(arg);
- loff_t offset;
- mm_segment_t old_fs;
- int err;
-
- old_fs = get_fs();
- set_fs(KERNEL_DS);
- err = snapshot_ioctl(file, cmd, (unsigned long) &offset);
- set_fs(old_fs);
- if (!err && put_user(offset, uoffset))
- err = -EFAULT;
- return err;
- }
-
+ case SNAPSHOT_ALLOC_SWAP_PAGE:
case SNAPSHOT_CREATE_IMAGE:
+ case SNAPSHOT_SET_SWAP_AREA:
return snapshot_ioctl(file, cmd,
(unsigned long) compat_ptr(arg));
-
- case SNAPSHOT_SET_SWAP_AREA: {
- struct compat_resume_swap_area __user *u_swap_area =
- compat_ptr(arg);
- struct resume_swap_area swap_area;
- mm_segment_t old_fs;
- int err;
-
- err = get_user(swap_area.offset, &u_swap_area->offset);
- err |= get_user(swap_area.dev, &u_swap_area->dev);
- if (err)
- return -EFAULT;
- old_fs = get_fs();
- set_fs(KERNEL_DS);
- err = snapshot_ioctl(file, SNAPSHOT_SET_SWAP_AREA,
- (unsigned long) &swap_area);
- set_fs(old_fs);
- return err;
- }
-
default:
return snapshot_ioctl(file, cmd, arg);
}
}
-
#endif /* CONFIG_COMPAT */
static const struct file_operations snapshot_fops = {
@@ -457,7 +449,6 @@ static const struct file_operations snapshot_fops = {
.release = snapshot_release,
.read = snapshot_read,
.write = snapshot_write,
- .llseek = no_llseek,
.unlocked_ioctl = snapshot_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = snapshot_compat_ioctl,
diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c
index 019069c84ff6..4e941999a53b 100644
--- a/kernel/power/wakelock.c
+++ b/kernel/power/wakelock.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* kernel/power/wakelock.c
*
@@ -17,6 +18,7 @@
#include <linux/list.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
+#include <linux/workqueue.h>
#include "power.h"
@@ -25,7 +27,7 @@ static DEFINE_MUTEX(wakelocks_lock);
struct wakelock {
char *name;
struct rb_node node;
- struct wakeup_source ws;
+ struct wakeup_source *ws;
#ifdef CONFIG_PM_WAKELOCKS_GC
struct list_head lru;
#endif
@@ -37,23 +39,23 @@ ssize_t pm_show_wakelocks(char *buf, bool show_active)
{
struct rb_node *node;
struct wakelock *wl;
- char *str = buf;
- char *end = buf + PAGE_SIZE;
+ int len = 0;
mutex_lock(&wakelocks_lock);
for (node = rb_first(&wakelocks_tree); node; node = rb_next(node)) {
wl = rb_entry(node, struct wakelock, node);
- if (wl->ws.active == show_active)
- str += scnprintf(str, end - str, "%s ", wl->name);
+ if (wl->ws->active == show_active)
+ len += sysfs_emit_at(buf, len, "%s ", wl->name);
}
- if (str > buf)
- str--;
- str += scnprintf(str, end - str, "\n");
+ if (len > 0)
+ --len;
+
+ len += sysfs_emit_at(buf, len, "\n");
mutex_unlock(&wakelocks_lock);
- return (str - buf);
+ return len;
}
#if CONFIG_PM_WAKELOCKS_LIMIT > 0
@@ -83,7 +85,9 @@ static inline void decrement_wakelocks_number(void) {}
#define WL_GC_COUNT_MAX 100
#define WL_GC_TIME_SEC 300
+static void __wakelocks_gc(struct work_struct *work);
static LIST_HEAD(wakelocks_lru_list);
+static DECLARE_WORK(wakelock_work, __wakelocks_gc);
static unsigned int wakelocks_gc_count;
static inline void wakelocks_lru_add(struct wakelock *wl)
@@ -96,29 +100,28 @@ static inline void wakelocks_lru_most_recent(struct wakelock *wl)
list_move(&wl->lru, &wakelocks_lru_list);
}
-static void wakelocks_gc(void)
+static void __wakelocks_gc(struct work_struct *work)
{
struct wakelock *wl, *aux;
ktime_t now;
- if (++wakelocks_gc_count <= WL_GC_COUNT_MAX)
- return;
+ mutex_lock(&wakelocks_lock);
now = ktime_get();
list_for_each_entry_safe_reverse(wl, aux, &wakelocks_lru_list, lru) {
u64 idle_time_ns;
bool active;
- spin_lock_irq(&wl->ws.lock);
- idle_time_ns = ktime_to_ns(ktime_sub(now, wl->ws.last_time));
- active = wl->ws.active;
- spin_unlock_irq(&wl->ws.lock);
+ spin_lock_irq(&wl->ws->lock);
+ idle_time_ns = ktime_to_ns(ktime_sub(now, wl->ws->last_time));
+ active = wl->ws->active;
+ spin_unlock_irq(&wl->ws->lock);
if (idle_time_ns < ((u64)WL_GC_TIME_SEC * NSEC_PER_SEC))
break;
if (!active) {
- wakeup_source_remove(&wl->ws);
+ wakeup_source_unregister(wl->ws);
rb_erase(&wl->node, &wakelocks_tree);
list_del(&wl->lru);
kfree(wl->name);
@@ -127,6 +130,16 @@ static void wakelocks_gc(void)
}
}
wakelocks_gc_count = 0;
+
+ mutex_unlock(&wakelocks_lock);
+}
+
+static void wakelocks_gc(void)
+{
+ if (++wakelocks_gc_count <= WL_GC_COUNT_MAX)
+ return;
+
+ schedule_work(&wakelock_work);
}
#else /* !CONFIG_PM_WAKELOCKS_GC */
static inline void wakelocks_lru_add(struct wakelock *wl) {}
@@ -174,8 +187,15 @@ static struct wakelock *wakelock_lookup_add(const char *name, size_t len,
kfree(wl);
return ERR_PTR(-ENOMEM);
}
- wl->ws.name = wl->name;
- wakeup_source_add(&wl->ws);
+
+ wl->ws = wakeup_source_register(NULL, wl->name);
+ if (!wl->ws) {
+ kfree(wl->name);
+ kfree(wl);
+ return ERR_PTR(-ENOMEM);
+ }
+ wl->ws->last_time = ktime_get();
+
rb_link_node(&wl->node, parent, node);
rb_insert_color(&wl->node, &wakelocks_tree);
wakelocks_lru_add(wl);
@@ -219,9 +239,9 @@ int pm_wake_lock(const char *buf)
u64 timeout_ms = timeout_ns + NSEC_PER_MSEC - 1;
do_div(timeout_ms, NSEC_PER_MSEC);
- __pm_wakeup_event(&wl->ws, timeout_ms);
+ __pm_wakeup_event(wl->ws, timeout_ms);
} else {
- __pm_stay_awake(&wl->ws);
+ __pm_stay_awake(wl->ws);
}
wakelocks_lru_most_recent(wl);
@@ -257,7 +277,7 @@ int pm_wake_unlock(const char *buf)
ret = PTR_ERR(wl);
goto out;
}
- __pm_relax(&wl->ws);
+ __pm_relax(wl->ws);
wakelocks_lru_most_recent(wl);
wakelocks_gc();
diff --git a/kernel/printk/.kunitconfig b/kernel/printk/.kunitconfig
new file mode 100644
index 000000000000..f31458fd1a92
--- /dev/null
+++ b/kernel/printk/.kunitconfig
@@ -0,0 +1,3 @@
+CONFIG_KUNIT=y
+CONFIG_PRINTK=y
+CONFIG_PRINTK_RINGBUFFER_KUNIT_TEST=y
diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile
index 85405bdcf2b3..f8004ac3983d 100644
--- a/kernel/printk/Makefile
+++ b/kernel/printk/Makefile
@@ -1,2 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0-only
obj-y = printk.o
+obj-$(CONFIG_PRINTK) += printk_safe.o nbcon.o
obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o
+obj-$(CONFIG_PRINTK_INDEX) += index.o
+
+obj-$(CONFIG_PRINTK) += printk_support.o
+printk_support-y := printk_ringbuffer.o
+printk_support-$(CONFIG_SYSCTL) += sysctl.o
+
+obj-$(CONFIG_PRINTK_RINGBUFFER_KUNIT_TEST) += printk_ringbuffer_kunit_test.o
diff --git a/kernel/printk/braille.c b/kernel/printk/braille.c
index 276762f3a460..17a9591e54ff 100644
--- a/kernel/printk/braille.c
+++ b/kernel/printk/braille.c
@@ -1,28 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kernel.h>
#include <linux/console.h>
+#include <linux/errno.h>
#include <linux/string.h>
#include "console_cmdline.h"
#include "braille.h"
-char *_braille_console_setup(char **str, char **brl_options)
+int _braille_console_setup(char **str, char **brl_options)
{
- if (!memcmp(*str, "brl,", 4)) {
+ size_t len;
+
+ len = str_has_prefix(*str, "brl,");
+ if (len) {
*brl_options = "";
- *str += 4;
- } else if (!memcmp(str, "brl=", 4)) {
- *brl_options = *str + 4;
+ *str += len;
+ return 0;
+ }
+
+ len = str_has_prefix(*str, "brl=");
+ if (len) {
+ *brl_options = *str + len;
*str = strchr(*brl_options, ',');
- if (!*str)
+ if (!*str) {
pr_err("need port name after brl=\n");
- else
- *((*str)++) = 0;
- } else
- return NULL;
+ return -EINVAL;
+ }
+ *((*str)++) = 0;
+ }
- return *str;
+ return 0;
}
int
diff --git a/kernel/printk/braille.h b/kernel/printk/braille.h
index 769d771145c8..123154f86304 100644
--- a/kernel/printk/braille.h
+++ b/kernel/printk/braille.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _PRINTK_BRAILLE_H
#define _PRINTK_BRAILLE_H
@@ -9,7 +10,14 @@ braille_set_options(struct console_cmdline *c, char *brl_options)
c->brl_options = brl_options;
}
-char *
+/*
+ * Setup console according to braille options.
+ * Return -EINVAL on syntax error, 0 on success (or no braille option was
+ * actually given).
+ * Modifies str to point to the serial options
+ * Sets brl_options to the parsed braille options.
+ */
+int
_braille_console_setup(char **str, char **brl_options);
int
@@ -25,10 +33,10 @@ braille_set_options(struct console_cmdline *c, char *brl_options)
{
}
-static inline char *
+static inline int
_braille_console_setup(char **str, char **brl_options)
{
- return NULL;
+ return 0;
}
static inline int
diff --git a/kernel/printk/console_cmdline.h b/kernel/printk/console_cmdline.h
index 2ca4a8b5fe57..0ab573b6d4dc 100644
--- a/kernel/printk/console_cmdline.h
+++ b/kernel/printk/console_cmdline.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _CONSOLE_CMDLINE_H
#define _CONSOLE_CMDLINE_H
@@ -5,6 +6,8 @@ struct console_cmdline
{
char name[16]; /* Name of the driver */
int index; /* Minor dev. to use */
+ char devname[32]; /* DEVNAME:0.0 style device name */
+ bool user_specified; /* Specified by command line vs. platform */
char *options; /* Options for the driver */
#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
char *brl_options; /* Options for braille driver */
diff --git a/kernel/printk/index.c b/kernel/printk/index.c
new file mode 100644
index 000000000000..a6b27526baaf
--- /dev/null
+++ b/kernel/printk/index.c
@@ -0,0 +1,194 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Userspace indexing of printk formats
+ */
+
+#include <linux/debugfs.h>
+#include <linux/module.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/string_helpers.h>
+
+#include "internal.h"
+
+extern struct pi_entry *__start_printk_index[];
+extern struct pi_entry *__stop_printk_index[];
+
+/* The base dir for module formats, typically debugfs/printk/index/ */
+static struct dentry *dfs_index;
+
+static struct pi_entry *pi_get_entry(const struct module *mod, loff_t pos)
+{
+ struct pi_entry **entries;
+ unsigned int nr_entries;
+
+#ifdef CONFIG_MODULES
+ if (mod) {
+ entries = mod->printk_index_start;
+ nr_entries = mod->printk_index_size;
+ } else
+#endif
+ {
+ /* vmlinux, comes from linker symbols */
+ entries = __start_printk_index;
+ nr_entries = __stop_printk_index - __start_printk_index;
+ }
+
+ if (pos >= nr_entries)
+ return NULL;
+
+ return entries[pos];
+}
+
+static void *pi_next(struct seq_file *s, void *v, loff_t *pos)
+{
+ const struct module *mod = s->file->f_inode->i_private;
+ struct pi_entry *entry = pi_get_entry(mod, *pos);
+
+ (*pos)++;
+
+ return entry;
+}
+
+static void *pi_start(struct seq_file *s, loff_t *pos)
+{
+ /*
+ * Make show() print the header line. Do not update *pos because
+ * pi_next() still has to return the entry at index 0 later.
+ */
+ if (*pos == 0)
+ return SEQ_START_TOKEN;
+
+ return pi_next(s, NULL, pos);
+}
+
+/*
+ * We need both ESCAPE_ANY and explicit characters from ESCAPE_SPECIAL in @only
+ * because otherwise ESCAPE_NAP will cause double quotes and backslashes to be
+ * ignored for quoting.
+ */
+#define seq_escape_printf_format(s, src) \
+ seq_escape_str(s, src, ESCAPE_ANY | ESCAPE_NAP | ESCAPE_APPEND, "\"\\")
+
+static int pi_show(struct seq_file *s, void *v)
+{
+ const struct pi_entry *entry = v;
+ int level = LOGLEVEL_DEFAULT;
+ enum printk_info_flags flags = 0;
+ u16 prefix_len = 0;
+
+ if (v == SEQ_START_TOKEN) {
+ seq_puts(s, "# <level/flags> filename:line function \"format\"\n");
+ return 0;
+ }
+
+ if (!entry->fmt)
+ return 0;
+
+ if (entry->level)
+ printk_parse_prefix(entry->level, &level, &flags);
+ else
+ prefix_len = printk_parse_prefix(entry->fmt, &level, &flags);
+
+
+ if (flags & LOG_CONT) {
+ /*
+ * LOGLEVEL_DEFAULT here means "use the same level as the
+ * message we're continuing from", not the default message
+ * loglevel, so don't display it as such.
+ */
+ if (level == LOGLEVEL_DEFAULT)
+ seq_puts(s, "<c>");
+ else
+ seq_printf(s, "<%d,c>", level);
+ } else
+ seq_printf(s, "<%d>", level);
+
+ seq_printf(s, " %s:%d %s \"", entry->file, entry->line, entry->func);
+ if (entry->subsys_fmt_prefix)
+ seq_escape_printf_format(s, entry->subsys_fmt_prefix);
+ seq_escape_printf_format(s, entry->fmt + prefix_len);
+ seq_puts(s, "\"\n");
+
+ return 0;
+}
+
+static void pi_stop(struct seq_file *p, void *v) { }
+
+static const struct seq_operations dfs_index_sops = {
+ .start = pi_start,
+ .next = pi_next,
+ .show = pi_show,
+ .stop = pi_stop,
+};
+
+DEFINE_SEQ_ATTRIBUTE(dfs_index);
+
+#ifdef CONFIG_MODULES
+static const char *pi_get_module_name(struct module *mod)
+{
+ return mod ? mod->name : "vmlinux";
+}
+#else
+static const char *pi_get_module_name(struct module *mod)
+{
+ return "vmlinux";
+}
+#endif
+
+static void pi_create_file(struct module *mod)
+{
+ debugfs_create_file(pi_get_module_name(mod), 0444, dfs_index,
+ mod, &dfs_index_fops);
+}
+
+#ifdef CONFIG_MODULES
+static void pi_remove_file(struct module *mod)
+{
+ debugfs_lookup_and_remove(pi_get_module_name(mod), dfs_index);
+}
+
+static int pi_module_notify(struct notifier_block *nb, unsigned long op,
+ void *data)
+{
+ struct module *mod = data;
+
+ switch (op) {
+ case MODULE_STATE_COMING:
+ pi_create_file(mod);
+ break;
+ case MODULE_STATE_GOING:
+ pi_remove_file(mod);
+ break;
+ default: /* we don't care about other module states */
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block module_printk_fmts_nb = {
+ .notifier_call = pi_module_notify,
+};
+
+static void __init pi_setup_module_notifier(void)
+{
+ register_module_notifier(&module_printk_fmts_nb);
+}
+#else
+static inline void __init pi_setup_module_notifier(void) { }
+#endif
+
+static int __init pi_init(void)
+{
+ struct dentry *dfs_root = debugfs_create_dir("printk", NULL);
+
+ dfs_index = debugfs_create_dir("index", dfs_root);
+ pi_setup_module_notifier();
+ pi_create_file(NULL);
+
+ return 0;
+}
+
+/* debugfs comes up on core and must be initialised first */
+postcore_initcall(pi_init);
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
new file mode 100644
index 000000000000..5f5f626f4279
--- /dev/null
+++ b/kernel/printk/internal.h
@@ -0,0 +1,304 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * internal.h - printk internal definitions
+ */
+#include <linux/console.h>
+#include <linux/types.h>
+
+#if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL)
+struct ctl_table;
+void __init printk_sysctl_init(void);
+int devkmsg_sysctl_set_loglvl(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos);
+#else
+#define printk_sysctl_init() do { } while (0)
+#endif
+
+#define con_printk(lvl, con, fmt, ...) \
+ printk(lvl pr_fmt("%s%sconsole [%s%d] " fmt), \
+ (con->flags & CON_NBCON) ? "" : "legacy ", \
+ (con->flags & CON_BOOT) ? "boot" : "", \
+ con->name, con->index, ##__VA_ARGS__)
+
+/*
+ * Identify if legacy printing is forced in a dedicated kthread. If
+ * true, all printing via console lock occurs within a dedicated
+ * legacy printer thread. The only exception is on panic, after the
+ * nbcon consoles have had their chance to print the panic messages
+ * first.
+ */
+#ifdef CONFIG_PREEMPT_RT
+# define force_legacy_kthread() (true)
+#else
+# define force_legacy_kthread() (false)
+#endif
+
+#ifdef CONFIG_PRINTK
+
+#ifdef CONFIG_PRINTK_CALLER
+#define PRINTK_PREFIX_MAX 48
+#else
+#define PRINTK_PREFIX_MAX 32
+#endif
+
+/*
+ * the maximum size of a formatted record (i.e. with prefix added
+ * per line and dropped messages or in extended message format)
+ */
+#define PRINTK_MESSAGE_MAX 2048
+
+/* the maximum size allowed to be reserved for a record */
+#define PRINTKRB_RECORD_MAX 1024
+
+/* Flags for a single printk record. */
+enum printk_info_flags {
+ /* always show on console, ignore console_loglevel */
+ LOG_FORCE_CON = 1,
+ LOG_NEWLINE = 2, /* text ended with a newline */
+ LOG_CONT = 8, /* text is a fragment of a continuation line */
+};
+
+struct printk_ringbuffer;
+struct dev_printk_info;
+
+extern struct printk_ringbuffer *prb;
+extern bool printk_kthreads_running;
+extern bool printk_kthreads_ready;
+extern bool debug_non_panic_cpus;
+
+__printf(4, 0)
+int vprintk_store(int facility, int level,
+ const struct dev_printk_info *dev_info,
+ const char *fmt, va_list args);
+
+__printf(1, 0) int vprintk_default(const char *fmt, va_list args);
+
+void __printk_safe_enter(void);
+void __printk_safe_exit(void);
+
+bool printk_percpu_data_ready(void);
+
+#define printk_safe_enter_irqsave(flags) \
+ do { \
+ local_irq_save(flags); \
+ __printk_safe_enter(); \
+ } while (0)
+
+#define printk_safe_exit_irqrestore(flags) \
+ do { \
+ __printk_safe_exit(); \
+ local_irq_restore(flags); \
+ } while (0)
+
+void defer_console_output(void);
+bool is_printk_legacy_deferred(void);
+bool is_printk_force_console(void);
+
+u16 printk_parse_prefix(const char *text, int *level,
+ enum printk_info_flags *flags);
+void console_lock_spinning_enable(void);
+int console_lock_spinning_disable_and_check(int cookie);
+
+u64 nbcon_seq_read(struct console *con);
+void nbcon_seq_force(struct console *con, u64 seq);
+bool nbcon_alloc(struct console *con);
+void nbcon_free(struct console *con);
+enum nbcon_prio nbcon_get_default_prio(void);
+void nbcon_atomic_flush_pending(void);
+bool nbcon_legacy_emit_next_record(struct console *con, bool *handover,
+ int cookie, bool use_atomic);
+bool nbcon_kthread_create(struct console *con);
+void nbcon_kthread_stop(struct console *con);
+void nbcon_kthreads_wake(void);
+
+/**
+ * nbcon_kthread_wake - Wake up a console printing thread
+ * @con: Console to operate on
+ */
+static inline void nbcon_kthread_wake(struct console *con)
+{
+ /*
+ * Guarantee any new records can be seen by tasks preparing to wait
+ * before this context checks if the rcuwait is empty.
+ *
+ * The full memory barrier in rcuwait_wake_up() pairs with the full
+ * memory barrier within set_current_state() of
+ * ___rcuwait_wait_event(), which is called after prepare_to_rcuwait()
+ * adds the waiter but before it has checked the wait condition.
+ *
+ * This pairs with nbcon_kthread_func:A.
+ */
+ rcuwait_wake_up(&con->rcuwait); /* LMM(nbcon_kthread_wake:A) */
+}
+
+#else
+
+#define PRINTK_PREFIX_MAX 0
+#define PRINTK_MESSAGE_MAX 0
+#define PRINTKRB_RECORD_MAX 0
+
+#define printk_kthreads_running (false)
+#define printk_kthreads_ready (false)
+
+/*
+ * In !PRINTK builds we still export console_sem
+ * semaphore and some of console functions (console_unlock()/etc.), so
+ * printk-safe must preserve the existing local IRQ guarantees.
+ */
+#define printk_safe_enter_irqsave(flags) local_irq_save(flags)
+#define printk_safe_exit_irqrestore(flags) local_irq_restore(flags)
+
+static inline bool printk_percpu_data_ready(void) { return false; }
+static inline void defer_console_output(void) { }
+static inline bool is_printk_legacy_deferred(void) { return false; }
+static inline u64 nbcon_seq_read(struct console *con) { return 0; }
+static inline void nbcon_seq_force(struct console *con, u64 seq) { }
+static inline bool nbcon_alloc(struct console *con) { return false; }
+static inline void nbcon_free(struct console *con) { }
+static inline enum nbcon_prio nbcon_get_default_prio(void) { return NBCON_PRIO_NONE; }
+static inline void nbcon_atomic_flush_pending(void) { }
+static inline bool nbcon_legacy_emit_next_record(struct console *con, bool *handover,
+ int cookie, bool use_atomic) { return false; }
+static inline void nbcon_kthread_wake(struct console *con) { }
+static inline void nbcon_kthreads_wake(void) { }
+
+#endif /* CONFIG_PRINTK */
+
+extern bool have_boot_console;
+extern bool have_nbcon_console;
+extern bool have_legacy_console;
+extern bool legacy_allow_panic_sync;
+
+/**
+ * struct console_flush_type - Define available console flush methods
+ * @nbcon_atomic: Flush directly using nbcon_atomic() callback
+ * @nbcon_offload: Offload flush to printer thread
+ * @legacy_direct: Call the legacy loop in this context
+ * @legacy_offload: Offload the legacy loop into IRQ or legacy thread
+ *
+ * Note that the legacy loop also flushes the nbcon consoles.
+ */
+struct console_flush_type {
+ bool nbcon_atomic;
+ bool nbcon_offload;
+ bool legacy_direct;
+ bool legacy_offload;
+};
+
+extern bool console_irqwork_blocked;
+
+/*
+ * Identify which console flushing methods should be used in the context of
+ * the caller.
+ */
+static inline void printk_get_console_flush_type(struct console_flush_type *ft)
+{
+ memset(ft, 0, sizeof(*ft));
+
+ switch (nbcon_get_default_prio()) {
+ case NBCON_PRIO_NORMAL:
+ if (have_nbcon_console && !have_boot_console) {
+ if (printk_kthreads_running && !console_irqwork_blocked)
+ ft->nbcon_offload = true;
+ else
+ ft->nbcon_atomic = true;
+ }
+
+ /* Legacy consoles are flushed directly when possible. */
+ if (have_legacy_console || have_boot_console) {
+ if (!is_printk_legacy_deferred())
+ ft->legacy_direct = true;
+ else if (!console_irqwork_blocked)
+ ft->legacy_offload = true;
+ }
+ break;
+
+ case NBCON_PRIO_EMERGENCY:
+ if (have_nbcon_console && !have_boot_console)
+ ft->nbcon_atomic = true;
+
+ /* Legacy consoles are flushed directly when possible. */
+ if (have_legacy_console || have_boot_console) {
+ if (!is_printk_legacy_deferred())
+ ft->legacy_direct = true;
+ else if (!console_irqwork_blocked)
+ ft->legacy_offload = true;
+ }
+ break;
+
+ case NBCON_PRIO_PANIC:
+ /*
+ * In panic, the nbcon consoles will directly print. But
+ * only allowed if there are no boot consoles.
+ */
+ if (have_nbcon_console && !have_boot_console)
+ ft->nbcon_atomic = true;
+
+ if (have_legacy_console || have_boot_console) {
+ /*
+ * This is the same decision as NBCON_PRIO_NORMAL
+ * except that offloading never occurs in panic.
+ *
+ * Note that console_flush_on_panic() will flush
+ * legacy consoles anyway, even if unsafe.
+ */
+ if (!is_printk_legacy_deferred())
+ ft->legacy_direct = true;
+
+ /*
+ * In panic, if nbcon atomic printing occurs,
+ * the legacy consoles must remain silent until
+ * explicitly allowed.
+ */
+ if (ft->nbcon_atomic && !legacy_allow_panic_sync)
+ ft->legacy_direct = false;
+ }
+ break;
+
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+}
+
+extern struct printk_buffers printk_shared_pbufs;
+
+/**
+ * struct printk_buffers - Buffers to read/format/output printk messages.
+ * @outbuf: After formatting, contains text to output.
+ * @scratchbuf: Used as temporary ringbuffer reading and string-print space.
+ */
+struct printk_buffers {
+ char outbuf[PRINTK_MESSAGE_MAX];
+ char scratchbuf[PRINTKRB_RECORD_MAX];
+};
+
+/**
+ * struct printk_message - Container for a prepared printk message.
+ * @pbufs: printk buffers used to prepare the message.
+ * @outbuf_len: The length of prepared text in @pbufs->outbuf to output. This
+ * does not count the terminator. A value of 0 means there is
+ * nothing to output and this record should be skipped.
+ * @seq: The sequence number of the record used for @pbufs->outbuf.
+ * @dropped: The number of dropped records from reading @seq.
+ */
+struct printk_message {
+ struct printk_buffers *pbufs;
+ unsigned int outbuf_len;
+ u64 seq;
+ unsigned long dropped;
+};
+
+bool printk_get_next_message(struct printk_message *pmsg, u64 seq,
+ bool is_extended, bool may_supress);
+
+#ifdef CONFIG_PRINTK
+void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped);
+void console_prepend_replay(struct printk_message *pmsg);
+#endif
+
+#ifdef CONFIG_SMP
+bool is_printk_cpu_sync_owner(void);
+#else
+static inline bool is_printk_cpu_sync_owner(void) { return false; }
+#endif
diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c
new file mode 100644
index 000000000000..3fa403f9831f
--- /dev/null
+++ b/kernel/printk/nbcon.c
@@ -0,0 +1,1985 @@
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (C) 2022 Linutronix GmbH, John Ogness
+// Copyright (C) 2022 Intel, Thomas Gleixner
+
+#include <linux/atomic.h>
+#include <linux/bug.h>
+#include <linux/console.h>
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/export.h>
+#include <linux/init.h>
+#include <linux/irqflags.h>
+#include <linux/kdb.h>
+#include <linux/kthread.h>
+#include <linux/minmax.h>
+#include <linux/panic.h>
+#include <linux/percpu.h>
+#include <linux/preempt.h>
+#include <linux/slab.h>
+#include <linux/smp.h>
+#include <linux/stddef.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include "internal.h"
+#include "printk_ringbuffer.h"
+/*
+ * Printk console printing implementation for consoles which does not depend
+ * on the legacy style console_lock mechanism.
+ *
+ * The state of the console is maintained in the "nbcon_state" atomic
+ * variable.
+ *
+ * The console is locked when:
+ *
+ * - The 'prio' field contains the priority of the context that owns the
+ * console. Only higher priority contexts are allowed to take over the
+ * lock. A value of 0 (NBCON_PRIO_NONE) means the console is not locked.
+ *
+ * - The 'cpu' field denotes on which CPU the console is locked. It is used
+ * to prevent busy waiting on the same CPU. Also it informs the lock owner
+ * that it has lost the lock in a more complex scenario when the lock was
+ * taken over by a higher priority context, released, and taken on another
+ * CPU with the same priority as the interrupted owner.
+ *
+ * The acquire mechanism uses a few more fields:
+ *
+ * - The 'req_prio' field is used by the handover approach to make the
+ * current owner aware that there is a context with a higher priority
+ * waiting for the friendly handover.
+ *
+ * - The 'unsafe' field allows to take over the console in a safe way in the
+ * middle of emitting a message. The field is set only when accessing some
+ * shared resources or when the console device is manipulated. It can be
+ * cleared, for example, after emitting one character when the console
+ * device is in a consistent state.
+ *
+ * - The 'unsafe_takeover' field is set when a hostile takeover took the
+ * console in an unsafe state. The console will stay in the unsafe state
+ * until re-initialized.
+ *
+ * The acquire mechanism uses three approaches:
+ *
+ * 1) Direct acquire when the console is not owned or is owned by a lower
+ * priority context and is in a safe state.
+ *
+ * 2) Friendly handover mechanism uses a request/grant handshake. It is used
+ * when the current owner has lower priority and the console is in an
+ * unsafe state.
+ *
+ * The requesting context:
+ *
+ * a) Sets its priority into the 'req_prio' field.
+ *
+ * b) Waits (with a timeout) for the owning context to unlock the
+ * console.
+ *
+ * c) Takes the lock and clears the 'req_prio' field.
+ *
+ * The owning context:
+ *
+ * a) Observes the 'req_prio' field set on exit from the unsafe
+ * console state.
+ *
+ * b) Gives up console ownership by clearing the 'prio' field.
+ *
+ * 3) Unsafe hostile takeover allows to take over the lock even when the
+ * console is an unsafe state. It is used only in panic() by the final
+ * attempt to flush consoles in a try and hope mode.
+ *
+ * Note that separate record buffers are used in panic(). As a result,
+ * the messages can be read and formatted without any risk even after
+ * using the hostile takeover in unsafe state.
+ *
+ * The release function simply clears the 'prio' field.
+ *
+ * All operations on @console::nbcon_state are atomic cmpxchg based to
+ * handle concurrency.
+ *
+ * The acquire/release functions implement only minimal policies:
+ *
+ * - Preference for higher priority contexts.
+ * - Protection of the panic CPU.
+ *
+ * All other policy decisions must be made at the call sites:
+ *
+ * - What is marked as an unsafe section.
+ * - Whether to spin-wait if there is already an owner and the console is
+ * in an unsafe state.
+ * - Whether to attempt an unsafe hostile takeover.
+ *
+ * The design allows to implement the well known:
+ *
+ * acquire()
+ * output_one_printk_record()
+ * release()
+ *
+ * The output of one printk record might be interrupted with a higher priority
+ * context. The new owner is supposed to reprint the entire interrupted record
+ * from scratch.
+ */
+
+/* Counter of active nbcon emergency contexts. */
+static atomic_t nbcon_cpu_emergency_cnt = ATOMIC_INIT(0);
+
+/**
+ * nbcon_state_set - Helper function to set the console state
+ * @con: Console to update
+ * @new: The new state to write
+ *
+ * Only to be used when the console is not yet or no longer visible in the
+ * system. Otherwise use nbcon_state_try_cmpxchg().
+ */
+static inline void nbcon_state_set(struct console *con, struct nbcon_state *new)
+{
+ atomic_set(&ACCESS_PRIVATE(con, nbcon_state), new->atom);
+}
+
+/**
+ * nbcon_state_read - Helper function to read the console state
+ * @con: Console to read
+ * @state: The state to store the result
+ */
+static inline void nbcon_state_read(struct console *con, struct nbcon_state *state)
+{
+ state->atom = atomic_read(&ACCESS_PRIVATE(con, nbcon_state));
+}
+
+/**
+ * nbcon_state_try_cmpxchg() - Helper function for atomic_try_cmpxchg() on console state
+ * @con: Console to update
+ * @cur: Old/expected state
+ * @new: New state
+ *
+ * Return: True on success. False on fail and @cur is updated.
+ */
+static inline bool nbcon_state_try_cmpxchg(struct console *con, struct nbcon_state *cur,
+ struct nbcon_state *new)
+{
+ return atomic_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_state), &cur->atom, new->atom);
+}
+
+/**
+ * nbcon_seq_read - Read the current console sequence
+ * @con: Console to read the sequence of
+ *
+ * Return: Sequence number of the next record to print on @con.
+ */
+u64 nbcon_seq_read(struct console *con)
+{
+ unsigned long nbcon_seq = atomic_long_read(&ACCESS_PRIVATE(con, nbcon_seq));
+
+ return __ulseq_to_u64seq(prb, nbcon_seq);
+}
+
+/**
+ * nbcon_seq_force - Force console sequence to a specific value
+ * @con: Console to work on
+ * @seq: Sequence number value to set
+ *
+ * Only to be used during init (before registration) or in extreme situations
+ * (such as panic with CONSOLE_REPLAY_ALL).
+ */
+void nbcon_seq_force(struct console *con, u64 seq)
+{
+ /*
+ * If the specified record no longer exists, the oldest available record
+ * is chosen. This is especially important on 32bit systems because only
+ * the lower 32 bits of the sequence number are stored. The upper 32 bits
+ * are derived from the sequence numbers available in the ringbuffer.
+ */
+ u64 valid_seq = max_t(u64, seq, prb_first_valid_seq(prb));
+
+ atomic_long_set(&ACCESS_PRIVATE(con, nbcon_seq), __u64seq_to_ulseq(valid_seq));
+}
+
+/**
+ * nbcon_seq_try_update - Try to update the console sequence number
+ * @ctxt: Pointer to an acquire context that contains
+ * all information about the acquire mode
+ * @new_seq: The new sequence number to set
+ *
+ * @ctxt->seq is updated to the new value of @con::nbcon_seq (expanded to
+ * the 64bit value). This could be a different value than @new_seq if
+ * nbcon_seq_force() was used or the current context no longer owns the
+ * console. In the later case, it will stop printing anyway.
+ */
+static void nbcon_seq_try_update(struct nbcon_context *ctxt, u64 new_seq)
+{
+ unsigned long nbcon_seq = __u64seq_to_ulseq(ctxt->seq);
+ struct console *con = ctxt->console;
+
+ if (atomic_long_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_seq), &nbcon_seq,
+ __u64seq_to_ulseq(new_seq))) {
+ ctxt->seq = new_seq;
+ } else {
+ ctxt->seq = nbcon_seq_read(con);
+ }
+}
+
+/**
+ * nbcon_context_try_acquire_direct - Try to acquire directly
+ * @ctxt: The context of the caller
+ * @cur: The current console state
+ * @is_reacquire: This acquire is a reacquire
+ *
+ * Acquire the console when it is released. Also acquire the console when
+ * the current owner has a lower priority and the console is in a safe state.
+ *
+ * Return: 0 on success. Otherwise, an error code on failure. Also @cur
+ * is updated to the latest state when failed to modify it.
+ *
+ * Errors:
+ *
+ * -EPERM: A panic is in progress and this is neither the panic
+ * CPU nor is this a reacquire. Or the current owner or
+ * waiter has the same or higher priority. No acquire
+ * method can be successful in these cases.
+ *
+ * -EBUSY: The current owner has a lower priority but the console
+ * in an unsafe state. The caller should try using
+ * the handover acquire method.
+ */
+static int nbcon_context_try_acquire_direct(struct nbcon_context *ctxt,
+ struct nbcon_state *cur, bool is_reacquire)
+{
+ unsigned int cpu = smp_processor_id();
+ struct console *con = ctxt->console;
+ struct nbcon_state new;
+
+ do {
+ /*
+ * Panic does not imply that the console is owned. However,
+ * since all non-panic CPUs are stopped during panic(), it
+ * is safer to have them avoid gaining console ownership.
+ *
+ * One exception is when kdb has locked for printing on this CPU.
+ *
+ * Second exception is a reacquire (and an unsafe takeover
+ * has not previously occurred) then it is allowed to attempt
+ * a direct acquire in panic. This gives console drivers an
+ * opportunity to perform any necessary cleanup if they were
+ * interrupted by the panic CPU while printing.
+ */
+ if (panic_on_other_cpu() &&
+ !kdb_printf_on_this_cpu() &&
+ (!is_reacquire || cur->unsafe_takeover)) {
+ return -EPERM;
+ }
+
+ if (ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio)
+ return -EPERM;
+
+ if (cur->unsafe)
+ return -EBUSY;
+
+ /*
+ * The console should never be safe for a direct acquire
+ * if an unsafe hostile takeover has ever happened.
+ */
+ WARN_ON_ONCE(cur->unsafe_takeover);
+
+ new.atom = cur->atom;
+ new.prio = ctxt->prio;
+ new.req_prio = NBCON_PRIO_NONE;
+ new.unsafe = cur->unsafe_takeover;
+ new.cpu = cpu;
+
+ } while (!nbcon_state_try_cmpxchg(con, cur, &new));
+
+ return 0;
+}
+
+static bool nbcon_waiter_matches(struct nbcon_state *cur, int expected_prio)
+{
+ /*
+ * The request context is well defined by the @req_prio because:
+ *
+ * - Only a context with a priority higher than the owner can become
+ * a waiter.
+ * - Only a context with a priority higher than the waiter can
+ * directly take over the request.
+ * - There are only three priorities.
+ * - Only one CPU is allowed to request PANIC priority.
+ * - Lower priorities are ignored during panic() until reboot.
+ *
+ * As a result, the following scenario is *not* possible:
+ *
+ * 1. This context is currently a waiter.
+ * 2. Another context with a higher priority than this context
+ * directly takes ownership.
+ * 3. The higher priority context releases the ownership.
+ * 4. Another lower priority context takes the ownership.
+ * 5. Another context with the same priority as this context
+ * creates a request and starts waiting.
+ *
+ * Event #1 implies this context is EMERGENCY.
+ * Event #2 implies the new context is PANIC.
+ * Event #3 occurs when panic() has flushed the console.
+ * Event #4 occurs when a non-panic CPU reacquires.
+ * Event #5 is not possible due to the panic_on_other_cpu() check
+ * in nbcon_context_try_acquire_handover().
+ */
+
+ return (cur->req_prio == expected_prio);
+}
+
+/**
+ * nbcon_context_try_acquire_requested - Try to acquire after having
+ * requested a handover
+ * @ctxt: The context of the caller
+ * @cur: The current console state
+ *
+ * This is a helper function for nbcon_context_try_acquire_handover().
+ * It is called when the console is in an unsafe state. The current
+ * owner will release the console on exit from the unsafe region.
+ *
+ * Return: 0 on success and @cur is updated to the new console state.
+ * Otherwise an error code on failure.
+ *
+ * Errors:
+ *
+ * -EPERM: A panic is in progress and this is not the panic CPU
+ * or this context is no longer the waiter.
+ *
+ * -EBUSY: The console is still locked. The caller should
+ * continue waiting.
+ *
+ * Note: The caller must still remove the request when an error has occurred
+ * except when this context is no longer the waiter.
+ */
+static int nbcon_context_try_acquire_requested(struct nbcon_context *ctxt,
+ struct nbcon_state *cur)
+{
+ unsigned int cpu = smp_processor_id();
+ struct console *con = ctxt->console;
+ struct nbcon_state new;
+
+ /* Note that the caller must still remove the request! */
+ if (panic_on_other_cpu())
+ return -EPERM;
+
+ /*
+ * Note that the waiter will also change if there was an unsafe
+ * hostile takeover.
+ */
+ if (!nbcon_waiter_matches(cur, ctxt->prio))
+ return -EPERM;
+
+ /* If still locked, caller should continue waiting. */
+ if (cur->prio != NBCON_PRIO_NONE)
+ return -EBUSY;
+
+ /*
+ * The previous owner should have never released ownership
+ * in an unsafe region.
+ */
+ WARN_ON_ONCE(cur->unsafe);
+
+ new.atom = cur->atom;
+ new.prio = ctxt->prio;
+ new.req_prio = NBCON_PRIO_NONE;
+ new.unsafe = cur->unsafe_takeover;
+ new.cpu = cpu;
+
+ if (!nbcon_state_try_cmpxchg(con, cur, &new)) {
+ /*
+ * The acquire could fail only when it has been taken
+ * over by a higher priority context.
+ */
+ WARN_ON_ONCE(nbcon_waiter_matches(cur, ctxt->prio));
+ return -EPERM;
+ }
+
+ /* Handover success. This context now owns the console. */
+ return 0;
+}
+
+/**
+ * nbcon_context_try_acquire_handover - Try to acquire via handover
+ * @ctxt: The context of the caller
+ * @cur: The current console state
+ *
+ * The function must be called only when the context has higher priority
+ * than the current owner and the console is in an unsafe state.
+ * It is the case when nbcon_context_try_acquire_direct() returns -EBUSY.
+ *
+ * The function sets "req_prio" field to make the current owner aware of
+ * the request. Then it waits until the current owner releases the console,
+ * or an even higher context takes over the request, or timeout expires.
+ *
+ * The current owner checks the "req_prio" field on exit from the unsafe
+ * region and releases the console. It does not touch the "req_prio" field
+ * so that the console stays reserved for the waiter.
+ *
+ * Return: 0 on success. Otherwise, an error code on failure. Also @cur
+ * is updated to the latest state when failed to modify it.
+ *
+ * Errors:
+ *
+ * -EPERM: A panic is in progress and this is not the panic CPU.
+ * Or a higher priority context has taken over the
+ * console or the handover request.
+ *
+ * -EBUSY: The current owner is on the same CPU so that the hand
+ * shake could not work. Or the current owner is not
+ * willing to wait (zero timeout). Or the console does
+ * not enter the safe state before timeout passed. The
+ * caller might still use the unsafe hostile takeover
+ * when allowed.
+ *
+ * -EAGAIN: @cur has changed when creating the handover request.
+ * The caller should retry with direct acquire.
+ */
+static int nbcon_context_try_acquire_handover(struct nbcon_context *ctxt,
+ struct nbcon_state *cur)
+{
+ unsigned int cpu = smp_processor_id();
+ struct console *con = ctxt->console;
+ struct nbcon_state new;
+ int timeout;
+ int request_err = -EBUSY;
+
+ /*
+ * Check that the handover is called when the direct acquire failed
+ * with -EBUSY.
+ */
+ WARN_ON_ONCE(ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio);
+ WARN_ON_ONCE(!cur->unsafe);
+
+ /*
+ * Panic does not imply that the console is owned. However, it
+ * is critical that non-panic CPUs during panic are unable to
+ * wait for a handover in order to satisfy the assumptions of
+ * nbcon_waiter_matches(). In particular, the assumption that
+ * lower priorities are ignored during panic.
+ */
+ if (panic_on_other_cpu())
+ return -EPERM;
+
+ /* Handover is not possible on the same CPU. */
+ if (cur->cpu == cpu)
+ return -EBUSY;
+
+ /*
+ * Console stays unsafe after an unsafe takeover until re-initialized.
+ * Waiting is not going to help in this case.
+ */
+ if (cur->unsafe_takeover)
+ return -EBUSY;
+
+ /* Is the caller willing to wait? */
+ if (ctxt->spinwait_max_us == 0)
+ return -EBUSY;
+
+ /*
+ * Setup a request for the handover. The caller should try to acquire
+ * the console directly when the current state has been modified.
+ */
+ new.atom = cur->atom;
+ new.req_prio = ctxt->prio;
+ if (!nbcon_state_try_cmpxchg(con, cur, &new))
+ return -EAGAIN;
+
+ cur->atom = new.atom;
+
+ /* Wait until there is no owner and then acquire the console. */
+ for (timeout = ctxt->spinwait_max_us; timeout >= 0; timeout--) {
+ /* On successful acquire, this request is cleared. */
+ request_err = nbcon_context_try_acquire_requested(ctxt, cur);
+ if (!request_err)
+ return 0;
+
+ /*
+ * If the acquire should be aborted, it must be ensured
+ * that the request is removed before returning to caller.
+ */
+ if (request_err == -EPERM)
+ break;
+
+ udelay(1);
+
+ /* Re-read the state because some time has passed. */
+ nbcon_state_read(con, cur);
+ }
+
+ /* Timed out or aborted. Carefully remove handover request. */
+ do {
+ /*
+ * No need to remove request if there is a new waiter. This
+ * can only happen if a higher priority context has taken over
+ * the console or the handover request.
+ */
+ if (!nbcon_waiter_matches(cur, ctxt->prio))
+ return -EPERM;
+
+ /* Unset request for handover. */
+ new.atom = cur->atom;
+ new.req_prio = NBCON_PRIO_NONE;
+ if (nbcon_state_try_cmpxchg(con, cur, &new)) {
+ /*
+ * Request successfully unset. Report failure of
+ * acquiring via handover.
+ */
+ cur->atom = new.atom;
+ return request_err;
+ }
+
+ /*
+ * Unable to remove request. Try to acquire in case
+ * the owner has released the lock.
+ */
+ } while (nbcon_context_try_acquire_requested(ctxt, cur));
+
+ /* Lucky timing. The acquire succeeded while removing the request. */
+ return 0;
+}
+
+/**
+ * nbcon_context_try_acquire_hostile - Acquire via unsafe hostile takeover
+ * @ctxt: The context of the caller
+ * @cur: The current console state
+ *
+ * Acquire the console even in the unsafe state.
+ *
+ * It can be permitted by setting the 'allow_unsafe_takeover' field only
+ * by the final attempt to flush messages in panic().
+ *
+ * Return: 0 on success. -EPERM when not allowed by the context.
+ */
+static int nbcon_context_try_acquire_hostile(struct nbcon_context *ctxt,
+ struct nbcon_state *cur)
+{
+ unsigned int cpu = smp_processor_id();
+ struct console *con = ctxt->console;
+ struct nbcon_state new;
+
+ if (!ctxt->allow_unsafe_takeover)
+ return -EPERM;
+
+ /* Ensure caller is allowed to perform unsafe hostile takeovers. */
+ if (WARN_ON_ONCE(ctxt->prio != NBCON_PRIO_PANIC))
+ return -EPERM;
+
+ /*
+ * Check that try_acquire_direct() and try_acquire_handover() returned
+ * -EBUSY in the right situation.
+ */
+ WARN_ON_ONCE(ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio);
+ WARN_ON_ONCE(cur->unsafe != true);
+
+ do {
+ new.atom = cur->atom;
+ new.cpu = cpu;
+ new.prio = ctxt->prio;
+ new.unsafe |= cur->unsafe_takeover;
+ new.unsafe_takeover |= cur->unsafe;
+
+ } while (!nbcon_state_try_cmpxchg(con, cur, &new));
+
+ return 0;
+}
+
+static struct printk_buffers panic_nbcon_pbufs;
+
+/**
+ * nbcon_context_try_acquire - Try to acquire nbcon console
+ * @ctxt: The context of the caller
+ * @is_reacquire: This acquire is a reacquire
+ *
+ * Context: Under @ctxt->con->device_lock() or local_irq_save().
+ * Return: True if the console was acquired. False otherwise.
+ *
+ * If the caller allowed an unsafe hostile takeover, on success the
+ * caller should check the current console state to see if it is
+ * in an unsafe state. Otherwise, on success the caller may assume
+ * the console is not in an unsafe state.
+ */
+static bool nbcon_context_try_acquire(struct nbcon_context *ctxt, bool is_reacquire)
+{
+ struct console *con = ctxt->console;
+ struct nbcon_state cur;
+ int err;
+
+ nbcon_state_read(con, &cur);
+try_again:
+ err = nbcon_context_try_acquire_direct(ctxt, &cur, is_reacquire);
+ if (err != -EBUSY)
+ goto out;
+
+ err = nbcon_context_try_acquire_handover(ctxt, &cur);
+ if (err == -EAGAIN)
+ goto try_again;
+ if (err != -EBUSY)
+ goto out;
+
+ err = nbcon_context_try_acquire_hostile(ctxt, &cur);
+out:
+ if (err)
+ return false;
+
+ /* Acquire succeeded. */
+
+ /* Assign the appropriate buffer for this context. */
+ if (panic_on_this_cpu())
+ ctxt->pbufs = &panic_nbcon_pbufs;
+ else
+ ctxt->pbufs = con->pbufs;
+
+ /* Set the record sequence for this context to print. */
+ ctxt->seq = nbcon_seq_read(ctxt->console);
+
+ return true;
+}
+
+static bool nbcon_owner_matches(struct nbcon_state *cur, int expected_cpu,
+ int expected_prio)
+{
+ /*
+ * A similar function, nbcon_waiter_matches(), only deals with
+ * EMERGENCY and PANIC priorities. However, this function must also
+ * deal with the NORMAL priority, which requires additional checks
+ * and constraints.
+ *
+ * For the case where preemption and interrupts are disabled, it is
+ * enough to also verify that the owning CPU has not changed.
+ *
+ * For the case where preemption or interrupts are enabled, an
+ * external synchronization method *must* be used. In particular,
+ * the driver-specific locking mechanism used in device_lock()
+ * (including disabling migration) should be used. It prevents
+ * scenarios such as:
+ *
+ * 1. [Task A] owns a context with NBCON_PRIO_NORMAL on [CPU X] and
+ * is scheduled out.
+ * 2. Another context takes over the lock with NBCON_PRIO_EMERGENCY
+ * and releases it.
+ * 3. [Task B] acquires a context with NBCON_PRIO_NORMAL on [CPU X]
+ * and is scheduled out.
+ * 4. [Task A] gets running on [CPU X] and sees that the console is
+ * still owned by a task on [CPU X] with NBON_PRIO_NORMAL. Thus
+ * [Task A] thinks it is the owner when it is not.
+ */
+
+ if (cur->prio != expected_prio)
+ return false;
+
+ if (cur->cpu != expected_cpu)
+ return false;
+
+ return true;
+}
+
+/**
+ * nbcon_context_release - Release the console
+ * @ctxt: The nbcon context from nbcon_context_try_acquire()
+ */
+static void nbcon_context_release(struct nbcon_context *ctxt)
+{
+ unsigned int cpu = smp_processor_id();
+ struct console *con = ctxt->console;
+ struct nbcon_state cur;
+ struct nbcon_state new;
+
+ nbcon_state_read(con, &cur);
+
+ do {
+ if (!nbcon_owner_matches(&cur, cpu, ctxt->prio))
+ break;
+
+ new.atom = cur.atom;
+ new.prio = NBCON_PRIO_NONE;
+
+ /*
+ * If @unsafe_takeover is set, it is kept set so that
+ * the state remains permanently unsafe.
+ */
+ new.unsafe |= cur.unsafe_takeover;
+
+ } while (!nbcon_state_try_cmpxchg(con, &cur, &new));
+
+ ctxt->pbufs = NULL;
+}
+
+/**
+ * nbcon_context_can_proceed - Check whether ownership can proceed
+ * @ctxt: The nbcon context from nbcon_context_try_acquire()
+ * @cur: The current console state
+ *
+ * Return: True if this context still owns the console. False if
+ * ownership was handed over or taken.
+ *
+ * Must be invoked when entering the unsafe state to make sure that it still
+ * owns the lock. Also must be invoked when exiting the unsafe context
+ * to eventually free the lock for a higher priority context which asked
+ * for the friendly handover.
+ *
+ * It can be called inside an unsafe section when the console is just
+ * temporary in safe state instead of exiting and entering the unsafe
+ * state.
+ *
+ * Also it can be called in the safe context before doing an expensive
+ * safe operation. It does not make sense to do the operation when
+ * a higher priority context took the lock.
+ *
+ * When this function returns false then the calling context no longer owns
+ * the console and is no longer allowed to go forward. In this case it must
+ * back out immediately and carefully. The buffer content is also no longer
+ * trusted since it no longer belongs to the calling context.
+ */
+static bool nbcon_context_can_proceed(struct nbcon_context *ctxt, struct nbcon_state *cur)
+{
+ unsigned int cpu = smp_processor_id();
+
+ /* Make sure this context still owns the console. */
+ if (!nbcon_owner_matches(cur, cpu, ctxt->prio))
+ return false;
+
+ /* The console owner can proceed if there is no waiter. */
+ if (cur->req_prio == NBCON_PRIO_NONE)
+ return true;
+
+ /*
+ * A console owner within an unsafe region is always allowed to
+ * proceed, even if there are waiters. It can perform a handover
+ * when exiting the unsafe region. Otherwise the waiter will
+ * need to perform an unsafe hostile takeover.
+ */
+ if (cur->unsafe)
+ return true;
+
+ /* Waiters always have higher priorities than owners. */
+ WARN_ON_ONCE(cur->req_prio <= cur->prio);
+
+ /*
+ * Having a safe point for take over and eventually a few
+ * duplicated characters or a full line is way better than a
+ * hostile takeover. Post processing can take care of the garbage.
+ * Release and hand over.
+ */
+ nbcon_context_release(ctxt);
+
+ /*
+ * It is not clear whether the waiter really took over ownership. The
+ * outermost callsite must make the final decision whether console
+ * ownership is needed for it to proceed. If yes, it must reacquire
+ * ownership (possibly hostile) before carefully proceeding.
+ *
+ * The calling context no longer owns the console so go back all the
+ * way instead of trying to implement reacquire heuristics in tons of
+ * places.
+ */
+ return false;
+}
+
+/**
+ * nbcon_can_proceed - Check whether ownership can proceed
+ * @wctxt: The write context that was handed to the write function
+ *
+ * Return: True if this context still owns the console. False if
+ * ownership was handed over or taken.
+ *
+ * It is used in nbcon_enter_unsafe() to make sure that it still owns the
+ * lock. Also it is used in nbcon_exit_unsafe() to eventually free the lock
+ * for a higher priority context which asked for the friendly handover.
+ *
+ * It can be called inside an unsafe section when the console is just
+ * temporary in safe state instead of exiting and entering the unsafe state.
+ *
+ * Also it can be called in the safe context before doing an expensive safe
+ * operation. It does not make sense to do the operation when a higher
+ * priority context took the lock.
+ *
+ * When this function returns false then the calling context no longer owns
+ * the console and is no longer allowed to go forward. In this case it must
+ * back out immediately and carefully. The buffer content is also no longer
+ * trusted since it no longer belongs to the calling context.
+ */
+bool nbcon_can_proceed(struct nbcon_write_context *wctxt)
+{
+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+ struct console *con = ctxt->console;
+ struct nbcon_state cur;
+
+ nbcon_state_read(con, &cur);
+
+ return nbcon_context_can_proceed(ctxt, &cur);
+}
+EXPORT_SYMBOL_GPL(nbcon_can_proceed);
+
+#define nbcon_context_enter_unsafe(c) __nbcon_context_update_unsafe(c, true)
+#define nbcon_context_exit_unsafe(c) __nbcon_context_update_unsafe(c, false)
+
+/**
+ * __nbcon_context_update_unsafe - Update the unsafe bit in @con->nbcon_state
+ * @ctxt: The nbcon context from nbcon_context_try_acquire()
+ * @unsafe: The new value for the unsafe bit
+ *
+ * Return: True if the unsafe state was updated and this context still
+ * owns the console. Otherwise false if ownership was handed
+ * over or taken.
+ *
+ * This function allows console owners to modify the unsafe status of the
+ * console.
+ *
+ * When this function returns false then the calling context no longer owns
+ * the console and is no longer allowed to go forward. In this case it must
+ * back out immediately and carefully. The buffer content is also no longer
+ * trusted since it no longer belongs to the calling context.
+ *
+ * Internal helper to avoid duplicated code.
+ */
+static bool __nbcon_context_update_unsafe(struct nbcon_context *ctxt, bool unsafe)
+{
+ struct console *con = ctxt->console;
+ struct nbcon_state cur;
+ struct nbcon_state new;
+
+ nbcon_state_read(con, &cur);
+
+ do {
+ /*
+ * The unsafe bit must not be cleared if an
+ * unsafe hostile takeover has occurred.
+ */
+ if (!unsafe && cur.unsafe_takeover)
+ goto out;
+
+ if (!nbcon_context_can_proceed(ctxt, &cur))
+ return false;
+
+ new.atom = cur.atom;
+ new.unsafe = unsafe;
+ } while (!nbcon_state_try_cmpxchg(con, &cur, &new));
+
+ cur.atom = new.atom;
+out:
+ return nbcon_context_can_proceed(ctxt, &cur);
+}
+
+void nbcon_write_context_set_buf(struct nbcon_write_context *wctxt,
+ char *buf, unsigned int len)
+{
+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+ struct console *con = ctxt->console;
+ struct nbcon_state cur;
+
+ wctxt->outbuf = buf;
+ wctxt->len = len;
+ nbcon_state_read(con, &cur);
+ wctxt->unsafe_takeover = cur.unsafe_takeover;
+}
+
+/**
+ * nbcon_enter_unsafe - Enter an unsafe region in the driver
+ * @wctxt: The write context that was handed to the write function
+ *
+ * Return: True if this context still owns the console. False if
+ * ownership was handed over or taken.
+ *
+ * When this function returns false then the calling context no longer owns
+ * the console and is no longer allowed to go forward. In this case it must
+ * back out immediately and carefully. The buffer content is also no longer
+ * trusted since it no longer belongs to the calling context.
+ */
+bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt)
+{
+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+ bool is_owner;
+
+ is_owner = nbcon_context_enter_unsafe(ctxt);
+ if (!is_owner)
+ nbcon_write_context_set_buf(wctxt, NULL, 0);
+ return is_owner;
+}
+EXPORT_SYMBOL_GPL(nbcon_enter_unsafe);
+
+/**
+ * nbcon_exit_unsafe - Exit an unsafe region in the driver
+ * @wctxt: The write context that was handed to the write function
+ *
+ * Return: True if this context still owns the console. False if
+ * ownership was handed over or taken.
+ *
+ * When this function returns false then the calling context no longer owns
+ * the console and is no longer allowed to go forward. In this case it must
+ * back out immediately and carefully. The buffer content is also no longer
+ * trusted since it no longer belongs to the calling context.
+ */
+bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt)
+{
+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+ bool ret;
+
+ ret = nbcon_context_exit_unsafe(ctxt);
+ if (!ret)
+ nbcon_write_context_set_buf(wctxt, NULL, 0);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nbcon_exit_unsafe);
+
+/**
+ * nbcon_reacquire_nobuf - Reacquire a console after losing ownership
+ * while printing
+ * @wctxt: The write context that was handed to the write callback
+ *
+ * Since ownership can be lost at any time due to handover or takeover, a
+ * printing context _must_ be prepared to back out immediately and
+ * carefully. However, there are scenarios where the printing context must
+ * reacquire ownership in order to finalize or revert hardware changes.
+ *
+ * This function allows a printing context to reacquire ownership using the
+ * same priority as its previous ownership.
+ *
+ * Note that after a successful reacquire the printing context will have no
+ * output buffer because that has been lost. This function cannot be used to
+ * resume printing.
+ */
+void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt)
+{
+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+
+ while (!nbcon_context_try_acquire(ctxt, true))
+ cpu_relax();
+
+ nbcon_write_context_set_buf(wctxt, NULL, 0);
+}
+EXPORT_SYMBOL_GPL(nbcon_reacquire_nobuf);
+
+/**
+ * nbcon_emit_next_record - Emit a record in the acquired context
+ * @wctxt: The write context that will be handed to the write function
+ * @use_atomic: True if the write_atomic() callback is to be used
+ *
+ * Return: True if this context still owns the console. False if
+ * ownership was handed over or taken.
+ *
+ * When this function returns false then the calling context no longer owns
+ * the console and is no longer allowed to go forward. In this case it must
+ * back out immediately and carefully. The buffer content is also no longer
+ * trusted since it no longer belongs to the calling context. If the caller
+ * wants to do more it must reacquire the console first.
+ *
+ * When true is returned, @wctxt->ctxt.backlog indicates whether there are
+ * still records pending in the ringbuffer,
+ */
+static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt, bool use_atomic)
+{
+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+ struct console *con = ctxt->console;
+ bool is_extended = console_srcu_read_flags(con) & CON_EXTENDED;
+ struct printk_message pmsg = {
+ .pbufs = ctxt->pbufs,
+ };
+ unsigned long con_dropped;
+ struct nbcon_state cur;
+ unsigned long dropped;
+ unsigned long ulseq;
+
+ /*
+ * This function should never be called for consoles that have not
+ * implemented the necessary callback for writing: i.e. legacy
+ * consoles and, when atomic, nbcon consoles with no write_atomic().
+ * Handle it as if ownership was lost and try to continue.
+ *
+ * Note that for nbcon consoles the write_thread() callback is
+ * mandatory and was already checked in nbcon_alloc().
+ */
+ if (WARN_ON_ONCE((use_atomic && !con->write_atomic) ||
+ !(console_srcu_read_flags(con) & CON_NBCON))) {
+ nbcon_context_release(ctxt);
+ return false;
+ }
+
+ /*
+ * The printk buffers are filled within an unsafe section. This
+ * prevents NBCON_PRIO_NORMAL and NBCON_PRIO_EMERGENCY from
+ * clobbering each other.
+ */
+
+ if (!nbcon_context_enter_unsafe(ctxt))
+ return false;
+
+ ctxt->backlog = printk_get_next_message(&pmsg, ctxt->seq, is_extended, true);
+ if (!ctxt->backlog)
+ return nbcon_context_exit_unsafe(ctxt);
+
+ /*
+ * @con->dropped is not protected in case of an unsafe hostile
+ * takeover. In that situation the update can be racy so
+ * annotate it accordingly.
+ */
+ con_dropped = data_race(READ_ONCE(con->dropped));
+
+ dropped = con_dropped + pmsg.dropped;
+ if (dropped && !is_extended)
+ console_prepend_dropped(&pmsg, dropped);
+
+ /*
+ * If the previous owner was assigned the same record, this context
+ * has taken over ownership and is replaying the record. Prepend a
+ * message to let the user know the record is replayed.
+ */
+ ulseq = atomic_long_read(&ACCESS_PRIVATE(con, nbcon_prev_seq));
+ if (__ulseq_to_u64seq(prb, ulseq) == pmsg.seq) {
+ console_prepend_replay(&pmsg);
+ } else {
+ /*
+ * Ensure this context is still the owner before trying to
+ * update @nbcon_prev_seq. Otherwise the value in @ulseq may
+ * not be from the previous owner and instead be some later
+ * value from the context that took over ownership.
+ */
+ nbcon_state_read(con, &cur);
+ if (!nbcon_context_can_proceed(ctxt, &cur))
+ return false;
+
+ atomic_long_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_prev_seq), &ulseq,
+ __u64seq_to_ulseq(pmsg.seq));
+ }
+
+ if (!nbcon_context_exit_unsafe(ctxt))
+ return false;
+
+ /* For skipped records just update seq/dropped in @con. */
+ if (pmsg.outbuf_len == 0)
+ goto update_con;
+
+ /* Initialize the write context for driver callbacks. */
+ nbcon_write_context_set_buf(wctxt, &pmsg.pbufs->outbuf[0], pmsg.outbuf_len);
+
+ if (use_atomic)
+ con->write_atomic(con, wctxt);
+ else
+ con->write_thread(con, wctxt);
+
+ if (!wctxt->outbuf) {
+ /*
+ * Ownership was lost and reacquired by the driver. Handle it
+ * as if ownership was lost.
+ */
+ nbcon_context_release(ctxt);
+ return false;
+ }
+
+ /*
+ * Ownership may have been lost but _not_ reacquired by the driver.
+ * This case is detected and handled when entering unsafe to update
+ * dropped/seq values.
+ */
+
+ /*
+ * Since any dropped message was successfully output, reset the
+ * dropped count for the console.
+ */
+ dropped = 0;
+update_con:
+ /*
+ * The dropped count and the sequence number are updated within an
+ * unsafe section. This limits update races to the panic context and
+ * allows the panic context to win.
+ */
+
+ if (!nbcon_context_enter_unsafe(ctxt))
+ return false;
+
+ if (dropped != con_dropped) {
+ /* Counterpart to the READ_ONCE() above. */
+ WRITE_ONCE(con->dropped, dropped);
+ }
+
+ nbcon_seq_try_update(ctxt, pmsg.seq + 1);
+
+ return nbcon_context_exit_unsafe(ctxt);
+}
+
+/*
+ * nbcon_emit_one - Print one record for an nbcon console using the
+ * specified callback
+ * @wctxt: An initialized write context struct to use for this context
+ * @use_atomic: True if the write_atomic() callback is to be used
+ *
+ * Return: True, when a record has been printed and there are still
+ * pending records. The caller might want to continue flushing.
+ *
+ * False, when there is no pending record, or when the console
+ * context cannot be acquired, or the ownership has been lost.
+ * The caller should give up. Either the job is done, cannot be
+ * done, or will be handled by the owning context.
+ *
+ * This is an internal helper to handle the locking of the console before
+ * calling nbcon_emit_next_record().
+ */
+static bool nbcon_emit_one(struct nbcon_write_context *wctxt, bool use_atomic)
+{
+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+ struct console *con = ctxt->console;
+ unsigned long flags;
+ bool ret = false;
+
+ if (!use_atomic) {
+ con->device_lock(con, &flags);
+
+ /*
+ * Ensure this stays on the CPU to make handover and
+ * takeover possible.
+ */
+ cant_migrate();
+ }
+
+ if (!nbcon_context_try_acquire(ctxt, false))
+ goto out;
+
+ /*
+ * nbcon_emit_next_record() returns false when the console was
+ * handed over or taken over. In both cases the context is no
+ * longer valid.
+ *
+ * The higher priority printing context takes over responsibility
+ * to print the pending records.
+ */
+ if (!nbcon_emit_next_record(wctxt, use_atomic))
+ goto out;
+
+ nbcon_context_release(ctxt);
+
+ ret = ctxt->backlog;
+out:
+ if (!use_atomic)
+ con->device_unlock(con, flags);
+ return ret;
+}
+
+/**
+ * nbcon_kthread_should_wakeup - Check whether a printer thread should wakeup
+ * @con: Console to operate on
+ * @ctxt: The nbcon context from nbcon_context_try_acquire()
+ *
+ * Return: True if the thread should shutdown or if the console is
+ * allowed to print and a record is available. False otherwise.
+ *
+ * After the thread wakes up, it must first check if it should shutdown before
+ * attempting any printing.
+ */
+static bool nbcon_kthread_should_wakeup(struct console *con, struct nbcon_context *ctxt)
+{
+ bool ret = false;
+ short flags;
+ int cookie;
+
+ if (kthread_should_stop())
+ return true;
+
+ /*
+ * Block the kthread when the system is in an emergency or panic mode.
+ * It increases the chance that these contexts would be able to show
+ * the messages directly. And it reduces the risk of interrupted writes
+ * where the context with a higher priority takes over the nbcon console
+ * ownership in the middle of a message.
+ */
+ if (unlikely(atomic_read(&nbcon_cpu_emergency_cnt)) ||
+ unlikely(panic_in_progress()))
+ return false;
+
+ cookie = console_srcu_read_lock();
+
+ flags = console_srcu_read_flags(con);
+ if (console_is_usable(con, flags, false)) {
+ /* Bring the sequence in @ctxt up to date */
+ ctxt->seq = nbcon_seq_read(con);
+
+ ret = prb_read_valid(prb, ctxt->seq, NULL);
+ }
+
+ console_srcu_read_unlock(cookie);
+ return ret;
+}
+
+/**
+ * nbcon_kthread_func - The printer thread function
+ * @__console: Console to operate on
+ *
+ * Return: 0
+ */
+static int nbcon_kthread_func(void *__console)
+{
+ struct console *con = __console;
+ struct nbcon_write_context wctxt = {
+ .ctxt.console = con,
+ .ctxt.prio = NBCON_PRIO_NORMAL,
+ };
+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt);
+ short con_flags;
+ bool backlog;
+ int cookie;
+
+wait_for_event:
+ /*
+ * Guarantee this task is visible on the rcuwait before
+ * checking the wake condition.
+ *
+ * The full memory barrier within set_current_state() of
+ * ___rcuwait_wait_event() pairs with the full memory
+ * barrier within rcuwait_has_sleeper().
+ *
+ * This pairs with rcuwait_has_sleeper:A and nbcon_kthread_wake:A.
+ */
+ rcuwait_wait_event(&con->rcuwait,
+ nbcon_kthread_should_wakeup(con, ctxt),
+ TASK_INTERRUPTIBLE); /* LMM(nbcon_kthread_func:A) */
+
+ do {
+ if (kthread_should_stop())
+ return 0;
+
+ /*
+ * Block the kthread when the system is in an emergency or panic
+ * mode. See nbcon_kthread_should_wakeup() for more details.
+ */
+ if (unlikely(atomic_read(&nbcon_cpu_emergency_cnt)) ||
+ unlikely(panic_in_progress()))
+ goto wait_for_event;
+
+ backlog = false;
+
+ /*
+ * Keep the srcu read lock around the entire operation so that
+ * synchronize_srcu() can guarantee that the kthread stopped
+ * or suspended printing.
+ */
+ cookie = console_srcu_read_lock();
+
+ con_flags = console_srcu_read_flags(con);
+
+ if (console_is_usable(con, con_flags, false))
+ backlog = nbcon_emit_one(&wctxt, false);
+
+ console_srcu_read_unlock(cookie);
+
+ cond_resched();
+
+ } while (backlog);
+
+ goto wait_for_event;
+}
+
+/**
+ * nbcon_irq_work - irq work to wake console printer thread
+ * @irq_work: The irq work to operate on
+ */
+static void nbcon_irq_work(struct irq_work *irq_work)
+{
+ struct console *con = container_of(irq_work, struct console, irq_work);
+
+ nbcon_kthread_wake(con);
+}
+
+static inline bool rcuwait_has_sleeper(struct rcuwait *w)
+{
+ /*
+ * Guarantee any new records can be seen by tasks preparing to wait
+ * before this context checks if the rcuwait is empty.
+ *
+ * This full memory barrier pairs with the full memory barrier within
+ * set_current_state() of ___rcuwait_wait_event(), which is called
+ * after prepare_to_rcuwait() adds the waiter but before it has
+ * checked the wait condition.
+ *
+ * This pairs with nbcon_kthread_func:A.
+ */
+ smp_mb(); /* LMM(rcuwait_has_sleeper:A) */
+ return rcuwait_active(w);
+}
+
+/**
+ * nbcon_kthreads_wake - Wake up printing threads using irq_work
+ */
+void nbcon_kthreads_wake(void)
+{
+ struct console *con;
+ int cookie;
+
+ if (!printk_kthreads_running)
+ return;
+
+ /*
+ * It is not allowed to call this function when console irq_work
+ * is blocked.
+ */
+ if (WARN_ON_ONCE(console_irqwork_blocked))
+ return;
+
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(con) {
+ if (!(console_srcu_read_flags(con) & CON_NBCON))
+ continue;
+
+ /*
+ * Only schedule irq_work if the printing thread is
+ * actively waiting. If not waiting, the thread will
+ * notice by itself that it has work to do.
+ */
+ if (rcuwait_has_sleeper(&con->rcuwait))
+ irq_work_queue(&con->irq_work);
+ }
+ console_srcu_read_unlock(cookie);
+}
+
+/*
+ * nbcon_kthread_stop - Stop a console printer thread
+ * @con: Console to operate on
+ */
+void nbcon_kthread_stop(struct console *con)
+{
+ lockdep_assert_console_list_lock_held();
+
+ if (!con->kthread)
+ return;
+
+ kthread_stop(con->kthread);
+ con->kthread = NULL;
+}
+
+/**
+ * nbcon_kthread_create - Create a console printer thread
+ * @con: Console to operate on
+ *
+ * Return: True if the kthread was started or already exists.
+ * Otherwise false and @con must not be registered.
+ *
+ * This function is called when it will be expected that nbcon consoles are
+ * flushed using the kthread. The messages printed with NBCON_PRIO_NORMAL
+ * will be no longer flushed by the legacy loop. This is why failure must
+ * be fatal for console registration.
+ *
+ * If @con was already registered and this function fails, @con must be
+ * unregistered before the global state variable @printk_kthreads_running
+ * can be set.
+ */
+bool nbcon_kthread_create(struct console *con)
+{
+ struct task_struct *kt;
+
+ lockdep_assert_console_list_lock_held();
+
+ if (con->kthread)
+ return true;
+
+ kt = kthread_run(nbcon_kthread_func, con, "pr/%s%d", con->name, con->index);
+ if (WARN_ON(IS_ERR(kt))) {
+ con_printk(KERN_ERR, con, "failed to start printing thread\n");
+ return false;
+ }
+
+ con->kthread = kt;
+
+ /*
+ * It is important that console printing threads are scheduled
+ * shortly after a printk call and with generous runtime budgets.
+ */
+ sched_set_normal(con->kthread, -20);
+
+ return true;
+}
+
+/* Track the nbcon emergency nesting per CPU. */
+static DEFINE_PER_CPU(unsigned int, nbcon_pcpu_emergency_nesting);
+static unsigned int early_nbcon_pcpu_emergency_nesting __initdata;
+
+/**
+ * nbcon_get_cpu_emergency_nesting - Get the per CPU emergency nesting pointer
+ *
+ * Context: For reading, any context. For writing, any context which could
+ * not be migrated to another CPU.
+ * Return: Either a pointer to the per CPU emergency nesting counter of
+ * the current CPU or to the init data during early boot.
+ *
+ * The function is safe for reading per-CPU variables in any context because
+ * preemption is disabled if the current CPU is in the emergency state. See
+ * also nbcon_cpu_emergency_enter().
+ */
+static __ref unsigned int *nbcon_get_cpu_emergency_nesting(void)
+{
+ /*
+ * The value of __printk_percpu_data_ready gets set in normal
+ * context and before SMP initialization. As a result it could
+ * never change while inside an nbcon emergency section.
+ */
+ if (!printk_percpu_data_ready())
+ return &early_nbcon_pcpu_emergency_nesting;
+
+ return raw_cpu_ptr(&nbcon_pcpu_emergency_nesting);
+}
+
+/**
+ * nbcon_get_default_prio - The appropriate nbcon priority to use for nbcon
+ * printing on the current CPU
+ *
+ * Context: Any context.
+ * Return: The nbcon_prio to use for acquiring an nbcon console in this
+ * context for printing.
+ *
+ * The function is safe for reading per-CPU data in any context because
+ * preemption is disabled if the current CPU is in the emergency or panic
+ * state.
+ */
+enum nbcon_prio nbcon_get_default_prio(void)
+{
+ unsigned int *cpu_emergency_nesting;
+
+ if (panic_on_this_cpu())
+ return NBCON_PRIO_PANIC;
+
+ cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting();
+ if (*cpu_emergency_nesting)
+ return NBCON_PRIO_EMERGENCY;
+
+ return NBCON_PRIO_NORMAL;
+}
+
+/*
+ * Track if it is allowed to perform unsafe hostile takeovers of console
+ * ownership. When true, console drivers might perform unsafe actions while
+ * printing. It is externally available via nbcon_allow_unsafe_takeover().
+ */
+static bool panic_nbcon_allow_unsafe_takeover;
+
+/**
+ * nbcon_allow_unsafe_takeover - Check if unsafe console takeovers are allowed
+ *
+ * Return: True, when it is permitted to perform unsafe console printing
+ *
+ * This is also used by console_is_usable() to determine if it is allowed to
+ * call write_atomic() callbacks flagged as unsafe (CON_NBCON_ATOMIC_UNSAFE).
+ */
+bool nbcon_allow_unsafe_takeover(void)
+{
+ return panic_on_this_cpu() && panic_nbcon_allow_unsafe_takeover;
+}
+
+/**
+ * nbcon_legacy_emit_next_record - Print one record for an nbcon console
+ * in legacy contexts
+ * @con: The console to print on
+ * @handover: Will be set to true if a printk waiter has taken over the
+ * console_lock, in which case the caller is no longer holding
+ * both the console_lock and the SRCU read lock. Otherwise it
+ * is set to false.
+ * @cookie: The cookie from the SRCU read lock.
+ * @use_atomic: Set true when called in an atomic or unknown context.
+ * It affects which nbcon callback will be used: write_atomic()
+ * or write_thread().
+ *
+ * When false, the write_thread() callback is used and would be
+ * called in a preemtible context unless disabled by the
+ * device_lock. The legacy handover is not allowed in this mode.
+ *
+ * Context: Any context except NMI.
+ * Return: True, when a record has been printed and there are still
+ * pending records. The caller might want to continue flushing.
+ *
+ * False, when there is no pending record, or when the console
+ * context cannot be acquired, or the ownership has been lost.
+ * The caller should give up. Either the job is done, cannot be
+ * done, or will be handled by the owning context.
+ *
+ * This function is meant to be called by console_flush_all() to print records
+ * on nbcon consoles from legacy context (printing via console unlocking).
+ * Essentially it is the nbcon version of console_emit_next_record().
+ */
+bool nbcon_legacy_emit_next_record(struct console *con, bool *handover,
+ int cookie, bool use_atomic)
+{
+ struct nbcon_write_context wctxt = { };
+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt);
+ unsigned long flags;
+ bool progress;
+
+ ctxt->console = con;
+ ctxt->prio = nbcon_get_default_prio();
+
+ if (use_atomic) {
+ /*
+ * In an atomic or unknown context, use the same procedure as
+ * in console_emit_next_record(). It allows to handover.
+ */
+ printk_safe_enter_irqsave(flags);
+ console_lock_spinning_enable();
+ stop_critical_timings();
+ }
+
+ progress = nbcon_emit_one(&wctxt, use_atomic);
+
+ if (use_atomic) {
+ start_critical_timings();
+ *handover = console_lock_spinning_disable_and_check(cookie);
+ printk_safe_exit_irqrestore(flags);
+ } else {
+ /* Non-atomic does not perform legacy spinning handovers. */
+ *handover = false;
+ }
+
+ return progress;
+}
+
+/**
+ * __nbcon_atomic_flush_pending_con - Flush specified nbcon console using its
+ * write_atomic() callback
+ * @con: The nbcon console to flush
+ * @stop_seq: Flush up until this record
+ *
+ * Return: 0 if @con was flushed up to @stop_seq Otherwise, error code on
+ * failure.
+ *
+ * Errors:
+ *
+ * -EPERM: Unable to acquire console ownership.
+ *
+ * -EAGAIN: Another context took over ownership while printing.
+ *
+ * -ENOENT: A record before @stop_seq is not available.
+ *
+ * If flushing up to @stop_seq was not successful, it only makes sense for the
+ * caller to try again when -EAGAIN was returned. When -EPERM is returned,
+ * this context is not allowed to acquire the console. When -ENOENT is
+ * returned, it cannot be expected that the unfinalized record will become
+ * available.
+ */
+static int __nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq)
+{
+ struct nbcon_write_context wctxt = { };
+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt);
+ int err = 0;
+
+ ctxt->console = con;
+ ctxt->spinwait_max_us = 2000;
+ ctxt->prio = nbcon_get_default_prio();
+ ctxt->allow_unsafe_takeover = nbcon_allow_unsafe_takeover();
+
+ while (nbcon_seq_read(con) < stop_seq) {
+ if (!nbcon_context_try_acquire(ctxt, false))
+ return -EPERM;
+
+ /*
+ * nbcon_emit_next_record() returns false when the console was
+ * handed over or taken over. In both cases the context is no
+ * longer valid.
+ */
+ if (!nbcon_emit_next_record(&wctxt, true))
+ return -EAGAIN;
+
+ nbcon_context_release(ctxt);
+
+ if (!ctxt->backlog) {
+ /* Are there reserved but not yet finalized records? */
+ if (nbcon_seq_read(con) < stop_seq)
+ err = -ENOENT;
+ break;
+ }
+ }
+
+ return err;
+}
+
+/**
+ * nbcon_atomic_flush_pending_con - Flush specified nbcon console using its
+ * write_atomic() callback
+ * @con: The nbcon console to flush
+ * @stop_seq: Flush up until this record
+ *
+ * This will stop flushing before @stop_seq if another context has ownership.
+ * That context is then responsible for the flushing. Likewise, if new records
+ * are added while this context was flushing and there is no other context
+ * to handle the printing, this context must also flush those records.
+ */
+static void nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq)
+{
+ struct console_flush_type ft;
+ unsigned long flags;
+ int err;
+
+again:
+ /*
+ * Atomic flushing does not use console driver synchronization (i.e.
+ * it does not hold the port lock for uart consoles). Therefore IRQs
+ * must be disabled to avoid being interrupted and then calling into
+ * a driver that will deadlock trying to acquire console ownership.
+ */
+ local_irq_save(flags);
+
+ err = __nbcon_atomic_flush_pending_con(con, stop_seq);
+
+ local_irq_restore(flags);
+
+ /*
+ * If there was a new owner (-EPERM, -EAGAIN), that context is
+ * responsible for completing.
+ *
+ * Do not wait for records not yet finalized (-ENOENT) to avoid a
+ * possible deadlock. They will either get flushed by the writer or
+ * eventually skipped on panic CPU.
+ */
+ if (err)
+ return;
+
+ /*
+ * If flushing was successful but more records are available, this
+ * context must flush those remaining records if the printer thread
+ * is not available do it.
+ */
+ printk_get_console_flush_type(&ft);
+ if (!ft.nbcon_offload &&
+ prb_read_valid(prb, nbcon_seq_read(con), NULL)) {
+ stop_seq = prb_next_reserve_seq(prb);
+ goto again;
+ }
+}
+
+/**
+ * __nbcon_atomic_flush_pending - Flush all nbcon consoles using their
+ * write_atomic() callback
+ * @stop_seq: Flush up until this record
+ */
+static void __nbcon_atomic_flush_pending(u64 stop_seq)
+{
+ struct console *con;
+ int cookie;
+
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(con) {
+ short flags = console_srcu_read_flags(con);
+
+ if (!(flags & CON_NBCON))
+ continue;
+
+ if (!console_is_usable(con, flags, true))
+ continue;
+
+ if (nbcon_seq_read(con) >= stop_seq)
+ continue;
+
+ nbcon_atomic_flush_pending_con(con, stop_seq);
+ }
+ console_srcu_read_unlock(cookie);
+}
+
+/**
+ * nbcon_atomic_flush_pending - Flush all nbcon consoles using their
+ * write_atomic() callback
+ *
+ * Flush the backlog up through the currently newest record. Any new
+ * records added while flushing will not be flushed if there is another
+ * context available to handle the flushing. This is to avoid one CPU
+ * printing unbounded because other CPUs continue to add records.
+ */
+void nbcon_atomic_flush_pending(void)
+{
+ __nbcon_atomic_flush_pending(prb_next_reserve_seq(prb));
+}
+
+/**
+ * nbcon_atomic_flush_unsafe - Flush all nbcon consoles using their
+ * write_atomic() callback and allowing unsafe hostile takeovers
+ *
+ * Flush the backlog up through the currently newest record. Unsafe hostile
+ * takeovers will be performed, if necessary.
+ */
+void nbcon_atomic_flush_unsafe(void)
+{
+ panic_nbcon_allow_unsafe_takeover = true;
+ __nbcon_atomic_flush_pending(prb_next_reserve_seq(prb));
+ panic_nbcon_allow_unsafe_takeover = false;
+}
+
+/**
+ * nbcon_cpu_emergency_enter - Enter an emergency section where printk()
+ * messages for that CPU are flushed directly
+ *
+ * Context: Any context. Disables preemption.
+ *
+ * When within an emergency section, printk() calls will attempt to flush any
+ * pending messages in the ringbuffer.
+ */
+void nbcon_cpu_emergency_enter(void)
+{
+ unsigned int *cpu_emergency_nesting;
+
+ preempt_disable();
+
+ atomic_inc(&nbcon_cpu_emergency_cnt);
+
+ cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting();
+ (*cpu_emergency_nesting)++;
+}
+
+/**
+ * nbcon_cpu_emergency_exit - Exit an emergency section
+ *
+ * Context: Within an emergency section. Enables preemption.
+ */
+void nbcon_cpu_emergency_exit(void)
+{
+ unsigned int *cpu_emergency_nesting;
+
+ cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting();
+ if (!WARN_ON_ONCE(*cpu_emergency_nesting == 0))
+ (*cpu_emergency_nesting)--;
+
+ /*
+ * Wake up kthreads because there might be some pending messages
+ * added by other CPUs with normal priority since the last flush
+ * in the emergency context.
+ */
+ if (!WARN_ON_ONCE(atomic_read(&nbcon_cpu_emergency_cnt) == 0)) {
+ if (atomic_dec_return(&nbcon_cpu_emergency_cnt) == 0) {
+ struct console_flush_type ft;
+
+ printk_get_console_flush_type(&ft);
+ if (ft.nbcon_offload)
+ nbcon_kthreads_wake();
+ }
+ }
+
+ preempt_enable();
+}
+
+/**
+ * nbcon_alloc - Allocate and init the nbcon console specific data
+ * @con: Console to initialize
+ *
+ * Return: True if the console was fully allocated and initialized.
+ * Otherwise @con must not be registered.
+ *
+ * When allocation and init was successful, the console must be properly
+ * freed using nbcon_free() once it is no longer needed.
+ */
+bool nbcon_alloc(struct console *con)
+{
+ struct nbcon_state state = { };
+
+ /* Synchronize the kthread start. */
+ lockdep_assert_console_list_lock_held();
+
+ /* The write_thread() callback is mandatory. */
+ if (WARN_ON(!con->write_thread))
+ return false;
+
+ rcuwait_init(&con->rcuwait);
+ init_irq_work(&con->irq_work, nbcon_irq_work);
+ atomic_long_set(&ACCESS_PRIVATE(con, nbcon_prev_seq), -1UL);
+ nbcon_state_set(con, &state);
+
+ /*
+ * Initialize @nbcon_seq to the highest possible sequence number so
+ * that practically speaking it will have nothing to print until a
+ * desired initial sequence number has been set via nbcon_seq_force().
+ */
+ atomic_long_set(&ACCESS_PRIVATE(con, nbcon_seq), ULSEQ_MAX(prb));
+
+ if (con->flags & CON_BOOT) {
+ /*
+ * Boot console printing is synchronized with legacy console
+ * printing, so boot consoles can share the same global printk
+ * buffers.
+ */
+ con->pbufs = &printk_shared_pbufs;
+ } else {
+ con->pbufs = kmalloc(sizeof(*con->pbufs), GFP_KERNEL);
+ if (!con->pbufs) {
+ con_printk(KERN_ERR, con, "failed to allocate printing buffer\n");
+ return false;
+ }
+
+ if (printk_kthreads_ready && !have_boot_console) {
+ if (!nbcon_kthread_create(con)) {
+ kfree(con->pbufs);
+ con->pbufs = NULL;
+ return false;
+ }
+
+ /* Might be the first kthread. */
+ printk_kthreads_running = true;
+ }
+ }
+
+ return true;
+}
+
+/**
+ * nbcon_free - Free and cleanup the nbcon console specific data
+ * @con: Console to free/cleanup nbcon data
+ *
+ * Important: @have_nbcon_console must be updated before calling
+ * this function. In particular, it can be set only when there
+ * is still another nbcon console registered.
+ */
+void nbcon_free(struct console *con)
+{
+ struct nbcon_state state = { };
+
+ /* Synchronize the kthread stop. */
+ lockdep_assert_console_list_lock_held();
+
+ if (printk_kthreads_running) {
+ nbcon_kthread_stop(con);
+
+ /* Might be the last nbcon console.
+ *
+ * Do not rely on printk_kthreads_check_locked(). It is not
+ * called in some code paths, see nbcon_free() callers.
+ */
+ if (!have_nbcon_console)
+ printk_kthreads_running = false;
+ }
+
+ nbcon_state_set(con, &state);
+
+ /* Boot consoles share global printk buffers. */
+ if (!(con->flags & CON_BOOT))
+ kfree(con->pbufs);
+
+ con->pbufs = NULL;
+}
+
+/**
+ * nbcon_device_try_acquire - Try to acquire nbcon console and enter unsafe
+ * section
+ * @con: The nbcon console to acquire
+ *
+ * Context: Under the locking mechanism implemented in
+ * @con->device_lock() including disabling migration.
+ * Return: True if the console was acquired. False otherwise.
+ *
+ * Console drivers will usually use their own internal synchronization
+ * mechasism to synchronize between console printing and non-printing
+ * activities (such as setting baud rates). However, nbcon console drivers
+ * supporting atomic consoles may also want to mark unsafe sections when
+ * performing non-printing activities in order to synchronize against their
+ * atomic_write() callback.
+ *
+ * This function acquires the nbcon console using priority NBCON_PRIO_NORMAL
+ * and marks it unsafe for handover/takeover.
+ */
+bool nbcon_device_try_acquire(struct console *con)
+{
+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(con, nbcon_device_ctxt);
+
+ cant_migrate();
+
+ memset(ctxt, 0, sizeof(*ctxt));
+ ctxt->console = con;
+ ctxt->prio = NBCON_PRIO_NORMAL;
+
+ if (!nbcon_context_try_acquire(ctxt, false))
+ return false;
+
+ if (!nbcon_context_enter_unsafe(ctxt))
+ return false;
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(nbcon_device_try_acquire);
+
+/**
+ * nbcon_device_release - Exit unsafe section and release the nbcon console
+ * @con: The nbcon console acquired in nbcon_device_try_acquire()
+ */
+void nbcon_device_release(struct console *con)
+{
+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(con, nbcon_device_ctxt);
+ struct console_flush_type ft;
+ int cookie;
+
+ if (!nbcon_context_exit_unsafe(ctxt))
+ return;
+
+ nbcon_context_release(ctxt);
+
+ /*
+ * This context must flush any new records added while the console
+ * was locked if the printer thread is not available to do it. The
+ * console_srcu_read_lock must be taken to ensure the console is
+ * usable throughout flushing.
+ */
+ cookie = console_srcu_read_lock();
+ printk_get_console_flush_type(&ft);
+ if (console_is_usable(con, console_srcu_read_flags(con), true) &&
+ !ft.nbcon_offload &&
+ prb_read_valid(prb, nbcon_seq_read(con), NULL)) {
+ /*
+ * If nbcon_atomic flushing is not available, fallback to
+ * using the legacy loop.
+ */
+ if (ft.nbcon_atomic) {
+ __nbcon_atomic_flush_pending_con(con, prb_next_reserve_seq(prb));
+ } else if (ft.legacy_direct) {
+ if (console_trylock())
+ console_unlock();
+ } else if (ft.legacy_offload) {
+ defer_console_output();
+ }
+ }
+ console_srcu_read_unlock(cookie);
+}
+EXPORT_SYMBOL_GPL(nbcon_device_release);
+
+/**
+ * nbcon_kdb_try_acquire - Try to acquire nbcon console and enter unsafe
+ * section
+ * @con: The nbcon console to acquire
+ * @wctxt: The nbcon write context to be used on success
+ *
+ * Context: Under console_srcu_read_lock() for emitting a single kdb message
+ * using the given con->write_atomic() callback. Can be called
+ * only when the console is usable at the moment.
+ *
+ * Return: True if the console was acquired. False otherwise.
+ *
+ * kdb emits messages on consoles registered for printk() without
+ * storing them into the ring buffer. It has to acquire the console
+ * ownerhip so that it could call con->write_atomic() callback a safe way.
+ *
+ * This function acquires the nbcon console using priority NBCON_PRIO_EMERGENCY
+ * and marks it unsafe for handover/takeover.
+ */
+bool nbcon_kdb_try_acquire(struct console *con,
+ struct nbcon_write_context *wctxt)
+{
+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+
+ memset(ctxt, 0, sizeof(*ctxt));
+ ctxt->console = con;
+ ctxt->prio = NBCON_PRIO_EMERGENCY;
+
+ if (!nbcon_context_try_acquire(ctxt, false))
+ return false;
+
+ if (!nbcon_context_enter_unsafe(ctxt))
+ return false;
+
+ return true;
+}
+
+/**
+ * nbcon_kdb_release - Exit unsafe section and release the nbcon console
+ *
+ * @wctxt: The nbcon write context initialized by a successful
+ * nbcon_kdb_try_acquire()
+ */
+void nbcon_kdb_release(struct nbcon_write_context *wctxt)
+{
+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+
+ if (!nbcon_context_exit_unsafe(ctxt))
+ return;
+
+ nbcon_context_release(ctxt);
+
+ /*
+ * Flush any new printk() messages added when the console was blocked.
+ * Only the console used by the given write context was blocked.
+ * The console was locked only when the write_atomic() callback
+ * was usable.
+ */
+ __nbcon_atomic_flush_pending_con(ctxt->console, prb_next_reserve_seq(prb));
+}
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index c099b082cd02..1d765ad242b8 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/kernel/printk.c
*
@@ -16,6 +17,8 @@
* 01Mar01 Andrew Morton
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/tty.h>
@@ -26,34 +29,38 @@
#include <linux/nmi.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
-#include <linux/interrupt.h> /* For in_interrupt() */
#include <linux/delay.h>
#include <linux/smp.h>
#include <linux/security.h>
-#include <linux/bootmem.h>
#include <linux/memblock.h>
#include <linux/syscalls.h>
-#include <linux/kexec.h>
-#include <linux/kdb.h>
+#include <linux/syscore_ops.h>
+#include <linux/vmcore_info.h>
#include <linux/ratelimit.h>
#include <linux/kmsg_dump.h>
#include <linux/syslog.h>
#include <linux/cpu.h>
-#include <linux/notifier.h>
#include <linux/rculist.h>
#include <linux/poll.h>
#include <linux/irq_work.h>
-#include <linux/utsname.h>
#include <linux/ctype.h>
#include <linux/uio.h>
+#include <linux/sched/clock.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/task_stack.h>
+#include <linux/panic.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
+#include <asm/sections.h>
+#include <trace/events/initcall.h>
#define CREATE_TRACE_POINTS
#include <trace/events/printk.h>
+#include "printk_ringbuffer.h"
#include "console_cmdline.h"
#include "braille.h"
+#include "internal.h"
int console_printk[4] = {
CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */
@@ -61,6 +68,12 @@ int console_printk[4] = {
CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */
CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */
};
+EXPORT_SYMBOL_GPL(console_printk);
+
+atomic_t ignore_console_lock_warning __read_mostly = ATOMIC_INIT(0);
+EXPORT_SYMBOL(ignore_console_lock_warning);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(console);
/*
* Low level drivers may need that to know if they can schedule in
@@ -70,20 +83,227 @@ int oops_in_progress;
EXPORT_SYMBOL(oops_in_progress);
/*
- * console_sem protects the console_drivers list, and also
- * provides serialisation for access to the entire console
- * driver system.
+ * console_mutex protects console_list updates and console->flags updates.
+ * The flags are synchronized only for consoles that are registered, i.e.
+ * accessible via the console list.
+ */
+static DEFINE_MUTEX(console_mutex);
+
+/*
+ * console_sem protects updates to console->seq
+ * and also provides serialization for console printing.
+ */
+static DEFINE_SEMAPHORE(console_sem, 1);
+HLIST_HEAD(console_list);
+EXPORT_SYMBOL_GPL(console_list);
+DEFINE_STATIC_SRCU(console_srcu);
+
+/*
+ * System may need to suppress printk message under certain
+ * circumstances, like after kernel panic happens.
*/
-static DEFINE_SEMAPHORE(console_sem);
-struct console *console_drivers;
-EXPORT_SYMBOL_GPL(console_drivers);
+int __read_mostly suppress_printk;
#ifdef CONFIG_LOCKDEP
static struct lockdep_map console_lock_dep_map = {
.name = "console_lock"
};
+
+void lockdep_assert_console_list_lock_held(void)
+{
+ lockdep_assert_held(&console_mutex);
+}
+EXPORT_SYMBOL(lockdep_assert_console_list_lock_held);
#endif
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+bool console_srcu_read_lock_is_held(void)
+{
+ return srcu_read_lock_held(&console_srcu);
+}
+EXPORT_SYMBOL(console_srcu_read_lock_is_held);
+#endif
+
+enum devkmsg_log_bits {
+ __DEVKMSG_LOG_BIT_ON = 0,
+ __DEVKMSG_LOG_BIT_OFF,
+ __DEVKMSG_LOG_BIT_LOCK,
+};
+
+enum devkmsg_log_masks {
+ DEVKMSG_LOG_MASK_ON = BIT(__DEVKMSG_LOG_BIT_ON),
+ DEVKMSG_LOG_MASK_OFF = BIT(__DEVKMSG_LOG_BIT_OFF),
+ DEVKMSG_LOG_MASK_LOCK = BIT(__DEVKMSG_LOG_BIT_LOCK),
+};
+
+/* Keep both the 'on' and 'off' bits clear, i.e. ratelimit by default: */
+#define DEVKMSG_LOG_MASK_DEFAULT 0
+
+static unsigned int __read_mostly devkmsg_log = DEVKMSG_LOG_MASK_DEFAULT;
+
+static int __control_devkmsg(char *str)
+{
+ size_t len;
+
+ if (!str)
+ return -EINVAL;
+
+ len = str_has_prefix(str, "on");
+ if (len) {
+ devkmsg_log = DEVKMSG_LOG_MASK_ON;
+ return len;
+ }
+
+ len = str_has_prefix(str, "off");
+ if (len) {
+ devkmsg_log = DEVKMSG_LOG_MASK_OFF;
+ return len;
+ }
+
+ len = str_has_prefix(str, "ratelimit");
+ if (len) {
+ devkmsg_log = DEVKMSG_LOG_MASK_DEFAULT;
+ return len;
+ }
+
+ return -EINVAL;
+}
+
+static int __init control_devkmsg(char *str)
+{
+ if (__control_devkmsg(str) < 0) {
+ pr_warn("printk.devkmsg: bad option string '%s'\n", str);
+ return 1;
+ }
+
+ /*
+ * Set sysctl string accordingly:
+ */
+ if (devkmsg_log == DEVKMSG_LOG_MASK_ON)
+ strscpy(devkmsg_log_str, "on");
+ else if (devkmsg_log == DEVKMSG_LOG_MASK_OFF)
+ strscpy(devkmsg_log_str, "off");
+ /* else "ratelimit" which is set by default. */
+
+ /*
+ * Sysctl cannot change it anymore. The kernel command line setting of
+ * this parameter is to force the setting to be permanent throughout the
+ * runtime of the system. This is a precation measure against userspace
+ * trying to be a smarta** and attempting to change it up on us.
+ */
+ devkmsg_log |= DEVKMSG_LOG_MASK_LOCK;
+
+ return 1;
+}
+__setup("printk.devkmsg=", control_devkmsg);
+
+char devkmsg_log_str[DEVKMSG_STR_MAX_SIZE] = "ratelimit";
+#if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL)
+int devkmsg_sysctl_set_loglvl(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ char old_str[DEVKMSG_STR_MAX_SIZE];
+ unsigned int old;
+ int err;
+
+ if (write) {
+ if (devkmsg_log & DEVKMSG_LOG_MASK_LOCK)
+ return -EINVAL;
+
+ old = devkmsg_log;
+ strscpy(old_str, devkmsg_log_str);
+ }
+
+ err = proc_dostring(table, write, buffer, lenp, ppos);
+ if (err)
+ return err;
+
+ if (write) {
+ err = __control_devkmsg(devkmsg_log_str);
+
+ /*
+ * Do not accept an unknown string OR a known string with
+ * trailing crap...
+ */
+ if (err < 0 || (err + 1 != *lenp)) {
+
+ /* ... and restore old setting. */
+ devkmsg_log = old;
+ strscpy(devkmsg_log_str, old_str);
+
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+#endif /* CONFIG_PRINTK && CONFIG_SYSCTL */
+
+/**
+ * console_list_lock - Lock the console list
+ *
+ * For console list or console->flags updates
+ */
+void console_list_lock(void)
+{
+ /*
+ * In unregister_console() and console_force_preferred_locked(),
+ * synchronize_srcu() is called with the console_list_lock held.
+ * Therefore it is not allowed that the console_list_lock is taken
+ * with the srcu_lock held.
+ *
+ * Detecting if this context is really in the read-side critical
+ * section is only possible if the appropriate debug options are
+ * enabled.
+ */
+ WARN_ON_ONCE(debug_lockdep_rcu_enabled() &&
+ srcu_read_lock_held(&console_srcu));
+
+ mutex_lock(&console_mutex);
+}
+EXPORT_SYMBOL(console_list_lock);
+
+/**
+ * console_list_unlock - Unlock the console list
+ *
+ * Counterpart to console_list_lock()
+ */
+void console_list_unlock(void)
+{
+ mutex_unlock(&console_mutex);
+}
+EXPORT_SYMBOL(console_list_unlock);
+
+/**
+ * console_srcu_read_lock - Register a new reader for the
+ * SRCU-protected console list
+ *
+ * Use for_each_console_srcu() to iterate the console list
+ *
+ * Context: Any context.
+ * Return: A cookie to pass to console_srcu_read_unlock().
+ */
+int console_srcu_read_lock(void)
+ __acquires(&console_srcu)
+{
+ return srcu_read_lock_nmisafe(&console_srcu);
+}
+EXPORT_SYMBOL(console_srcu_read_lock);
+
+/**
+ * console_srcu_read_unlock - Unregister an old reader from
+ * the SRCU-protected console list
+ * @cookie: cookie returned from console_srcu_read_lock()
+ *
+ * Counterpart to console_srcu_read_lock()
+ */
+void console_srcu_read_unlock(int cookie)
+ __releases(&console_srcu)
+{
+ srcu_read_unlock_nmisafe(&console_srcu, cookie);
+}
+EXPORT_SYMBOL(console_srcu_read_unlock);
+
/*
* Helper macros to handle lockdep when locking/unlocking console_sem. We use
* macros instead of functions so that _RET_IP_ contains useful information.
@@ -95,17 +315,36 @@ static struct lockdep_map console_lock_dep_map = {
static int __down_trylock_console_sem(unsigned long ip)
{
- if (down_trylock(&console_sem))
+ int lock_failed;
+ unsigned long flags;
+
+ /*
+ * Here and in __up_console_sem() we need to be in safe mode,
+ * because spindump/WARN/etc from under console ->lock will
+ * deadlock in printk()->down_trylock_console_sem() otherwise.
+ */
+ printk_safe_enter_irqsave(flags);
+ lock_failed = down_trylock(&console_sem);
+ printk_safe_exit_irqrestore(flags);
+
+ if (lock_failed)
return 1;
mutex_acquire(&console_lock_dep_map, 0, 1, ip);
return 0;
}
#define down_trylock_console_sem() __down_trylock_console_sem(_RET_IP_)
-#define up_console_sem() do { \
- mutex_release(&console_lock_dep_map, 1, _RET_IP_);\
- up(&console_sem);\
-} while (0)
+static void __up_console_sem(unsigned long ip)
+{
+ unsigned long flags;
+
+ mutex_release(&console_lock_dep_map, ip);
+
+ printk_safe_enter_irqsave(flags);
+ up(&console_sem);
+ printk_safe_exit_irqrestore(flags);
+}
+#define up_console_sem() __up_console_sem(_RET_IP_)
/*
* This is used for debugging the mess that is the VT code by
@@ -113,14 +352,9 @@ static int __down_trylock_console_sem(unsigned long ip)
* definitely not the perfect debug tool (we don't know if _WE_
* hold it and are racing, but it helps tracking those weird code
* paths in the console code where we end up in places I want
- * locked without the console sempahore held).
- */
-static int console_locked, console_suspended;
-
-/*
- * If exclusive_console is non-NULL then only this console is to be printed to.
+ * locked without the console semaphore held).
*/
-static struct console *exclusive_console;
+static int console_locked;
/*
* Array of consoles built from command line options (console=)
@@ -130,7 +364,6 @@ static struct console *exclusive_console;
static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES];
-static int selected_console = -1;
static int preferred_console = -1;
int console_set_on_cmdline;
EXPORT_SYMBOL(console_set_on_cmdline);
@@ -138,31 +371,30 @@ EXPORT_SYMBOL(console_set_on_cmdline);
/* Flag: console code may call schedule() */
static int console_may_schedule;
+enum con_msg_format_flags {
+ MSG_FORMAT_DEFAULT = 0,
+ MSG_FORMAT_SYSLOG = (1 << 0),
+};
+
+static int console_msg_format = MSG_FORMAT_DEFAULT;
+
/*
- * The printk log buffer consists of a chain of concatenated variable
- * length records. Every record starts with a record header, containing
- * the overall length of the record.
+ * The printk log buffer consists of a sequenced collection of records, each
+ * containing variable length message text. Every record also contains its
+ * own meta-data (@info).
*
- * The heads to the first and last entry in the buffer, as well as the
- * sequence numbers of these entries are maintained when messages are
- * stored.
+ * Every record meta-data carries the timestamp in microseconds, as well as
+ * the standard userspace syslog level and syslog facility. The usual kernel
+ * messages use LOG_KERN; userspace-injected messages always carry a matching
+ * syslog facility, by default LOG_USER. The origin of every message can be
+ * reliably determined that way.
*
- * If the heads indicate available messages, the length in the header
- * tells the start next message. A length == 0 for the next message
- * indicates a wrap-around to the beginning of the buffer.
+ * The human readable log message of a record is available in @text, the
+ * length of the message text in @text_len. The stored message is not
+ * terminated.
*
- * Every record carries the monotonic timestamp in microseconds, as well as
- * the standard userspace syslog level and syslog facility. The usual
- * kernel messages use LOG_KERN; userspace-injected messages always carry
- * a matching syslog facility, by default LOG_USER. The origin of every
- * message can be reliably determined that way.
- *
- * The human readable log message directly follows the message header. The
- * length of the message text is stored in the header, the stored message
- * is not terminated.
- *
- * Optionally, a message can carry a dictionary of properties (key/value pairs),
- * to provide userspace with a machine-readable message context.
+ * Optionally, a record can carry a dictionary of properties (key/value
+ * pairs), to provide userspace with a machine-readable message context.
*
* Examples for well-defined, commonly used property names are:
* DEVICE=b12:8 device identifier
@@ -172,210 +404,171 @@ static int console_may_schedule;
* +sound:card0 subsystem:devname
* SUBSYSTEM=pci driver-core subsystem name
*
- * Valid characters in property names are [a-zA-Z0-9.-_]. The plain text value
- * follows directly after a '=' character. Every property is terminated by
- * a '\0' character. The last property is not terminated.
- *
- * Example of a message structure:
- * 0000 ff 8f 00 00 00 00 00 00 monotonic time in nsec
- * 0008 34 00 record is 52 bytes long
- * 000a 0b 00 text is 11 bytes long
- * 000c 1f 00 dictionary is 23 bytes long
- * 000e 03 00 LOG_KERN (facility) LOG_ERR (level)
- * 0010 69 74 27 73 20 61 20 6c "it's a l"
- * 69 6e 65 "ine"
- * 001b 44 45 56 49 43 "DEVIC"
- * 45 3d 62 38 3a 32 00 44 "E=b8:2\0D"
- * 52 49 56 45 52 3d 62 75 "RIVER=bu"
- * 67 "g"
- * 0032 00 00 00 padding to next message header
- *
- * The 'struct printk_log' buffer header must never be directly exported to
+ * Valid characters in property names are [a-zA-Z0-9.-_]. Property names
+ * and values are terminated by a '\0' character.
+ *
+ * Example of record values:
+ * record.text_buf = "it's a line" (unterminated)
+ * record.info.seq = 56
+ * record.info.ts_nsec = 36863
+ * record.info.text_len = 11
+ * record.info.facility = 0 (LOG_KERN)
+ * record.info.flags = 0
+ * record.info.level = 3 (LOG_ERR)
+ * record.info.caller_id = 299 (task 299)
+ * record.info.dev_info.subsystem = "pci" (terminated)
+ * record.info.dev_info.device = "+pci:0000:00:01.0" (terminated)
+ *
+ * The 'struct printk_info' buffer must never be directly exported to
* userspace, it is a kernel-private implementation detail that might
* need to be changed in the future, when the requirements change.
*
* /dev/kmsg exports the structured data in the following line format:
- * "level,sequnum,timestamp;<message text>\n"
+ * "<level>,<sequnum>,<timestamp>,<contflag>[,additional_values, ... ];<message text>\n"
+ *
+ * Users of the export format should ignore possible additional values
+ * separated by ',', and find the message after the ';' character.
*
* The optional key/value pairs are attached as continuation lines starting
* with a space character and terminated by a newline. All possible
* non-prinatable characters are escaped in the "\xff" notation.
- *
- * Users of the export format should ignore possible additional values
- * separated by ',', and find the message after the ';' character.
*/
-enum log_flags {
- LOG_NOCONS = 1, /* already flushed, do not print to console */
- LOG_NEWLINE = 2, /* text ended with a newline */
- LOG_PREFIX = 4, /* text started with a prefix */
- LOG_CONT = 8, /* text is a fragment of a continuation line */
-};
+/* syslog_lock protects syslog_* variables and write access to clear_seq. */
+static DEFINE_MUTEX(syslog_lock);
-struct printk_log {
- u64 ts_nsec; /* timestamp in nanoseconds */
- u16 len; /* length of entire record */
- u16 text_len; /* length of text buffer */
- u16 dict_len; /* length of dictionary buffer */
- u8 facility; /* syslog facility */
- u8 flags:5; /* internal record flags */
- u8 level:3; /* syslog level */
-};
+/*
+ * Specifies if a legacy console is registered. If legacy consoles are
+ * present, it is necessary to perform the console lock/unlock dance
+ * whenever console flushing should occur.
+ */
+bool have_legacy_console;
/*
- * The logbuf_lock protects kmsg buffer, indices, counters. This can be taken
- * within the scheduler's rq lock. It must be released before calling
- * console_unlock() or anything else that might wake up a process.
+ * Specifies if an nbcon console is registered. If nbcon consoles are present,
+ * synchronous printing of legacy consoles will not occur during panic until
+ * the backtrace has been stored to the ringbuffer.
*/
-static DEFINE_RAW_SPINLOCK(logbuf_lock);
+bool have_nbcon_console;
+
+/*
+ * Specifies if a boot console is registered. If boot consoles are present,
+ * nbcon consoles cannot print simultaneously and must be synchronized by
+ * the console lock. This is because boot consoles and nbcon consoles may
+ * have mapped the same hardware.
+ */
+bool have_boot_console;
+
+/* See printk_legacy_allow_panic_sync() for details. */
+bool legacy_allow_panic_sync;
+
+/* Avoid using irq_work when suspending. */
+bool console_irqwork_blocked;
#ifdef CONFIG_PRINTK
DECLARE_WAIT_QUEUE_HEAD(log_wait);
+static DECLARE_WAIT_QUEUE_HEAD(legacy_wait);
+/* All 3 protected by @syslog_lock. */
/* the next printk record to read by syslog(READ) or /proc/kmsg */
static u64 syslog_seq;
-static u32 syslog_idx;
-static enum log_flags syslog_prev;
static size_t syslog_partial;
+static bool syslog_time;
-/* index and sequence number of the first record stored in the buffer */
-static u64 log_first_seq;
-static u32 log_first_idx;
-
-/* index and sequence number of the next record to store in the buffer */
-static u64 log_next_seq;
-static u32 log_next_idx;
+/* True when _all_ printer threads are available for printing. */
+bool printk_kthreads_running;
-/* the next printk record to write to the console */
-static u64 console_seq;
-static u32 console_idx;
-static enum log_flags console_prev;
+struct latched_seq {
+ seqcount_latch_t latch;
+ u64 val[2];
+};
-/* the next printk record to read after the last 'clear' command */
-static u64 clear_seq;
-static u32 clear_idx;
+/*
+ * The next printk record to read after the last 'clear' command. There are
+ * two copies (updated with seqcount_latch) so that reads can locklessly
+ * access a valid value. Writers are synchronized by @syslog_lock.
+ */
+static struct latched_seq clear_seq = {
+ .latch = SEQCNT_LATCH_ZERO(clear_seq.latch),
+ .val[0] = 0,
+ .val[1] = 0,
+};
-#define PREFIX_MAX 32
-#define LOG_LINE_MAX (1024 - PREFIX_MAX)
+#define LOG_LEVEL(v) ((v) & 0x07)
+#define LOG_FACILITY(v) ((v) >> 3 & 0xff)
/* record buffer */
-#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
-#define LOG_ALIGN 4
-#else
-#define LOG_ALIGN __alignof__(struct printk_log)
-#endif
+#define LOG_ALIGN __alignof__(unsigned long)
#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
+#define LOG_BUF_LEN_MAX ((u32)1 << 31)
static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
static char *log_buf = __log_buf;
static u32 log_buf_len = __LOG_BUF_LEN;
-/* Return log buffer address */
-char *log_buf_addr_get(void)
-{
- return log_buf;
-}
+/*
+ * Define the average message size. This only affects the number of
+ * descriptors that will be available. Underestimating is better than
+ * overestimating (too many available descriptors is better than not enough).
+ */
+#define PRB_AVGBITS 5 /* 32 character average length */
-/* Return log buffer size */
-u32 log_buf_len_get(void)
-{
- return log_buf_len;
-}
+#if CONFIG_LOG_BUF_SHIFT <= PRB_AVGBITS
+#error CONFIG_LOG_BUF_SHIFT value too small.
+#endif
+_DEFINE_PRINTKRB(printk_rb_static, CONFIG_LOG_BUF_SHIFT - PRB_AVGBITS,
+ PRB_AVGBITS, &__log_buf[0]);
-/* human readable text of the record */
-static char *log_text(const struct printk_log *msg)
-{
- return (char *)msg + sizeof(struct printk_log);
-}
+static struct printk_ringbuffer printk_rb_dynamic;
-/* optional key/value pair dictionary attached to the record */
-static char *log_dict(const struct printk_log *msg)
-{
- return (char *)msg + sizeof(struct printk_log) + msg->text_len;
-}
+struct printk_ringbuffer *prb = &printk_rb_static;
-/* get record by index; idx must point to valid msg */
-static struct printk_log *log_from_idx(u32 idx)
-{
- struct printk_log *msg = (struct printk_log *)(log_buf + idx);
+/*
+ * We cannot access per-CPU data (e.g. per-CPU flush irq_work) before
+ * per_cpu_areas are initialised. This variable is set to true when
+ * it's safe to access per-CPU data.
+ */
+static bool __printk_percpu_data_ready __ro_after_init;
- /*
- * A length == 0 record is the end of buffer marker. Wrap around and
- * read the message at the start of the buffer.
- */
- if (!msg->len)
- return (struct printk_log *)log_buf;
- return msg;
+bool printk_percpu_data_ready(void)
+{
+ return __printk_percpu_data_ready;
}
-/* get next record; idx must point to valid msg */
-static u32 log_next(u32 idx)
+/* Must be called under syslog_lock. */
+static void latched_seq_write(struct latched_seq *ls, u64 val)
{
- struct printk_log *msg = (struct printk_log *)(log_buf + idx);
-
- /* length == 0 indicates the end of the buffer; wrap */
- /*
- * A length == 0 record is the end of buffer marker. Wrap around and
- * read the message at the start of the buffer as *this* one, and
- * return the one after that.
- */
- if (!msg->len) {
- msg = (struct printk_log *)log_buf;
- return msg->len;
- }
- return idx + msg->len;
+ write_seqcount_latch_begin(&ls->latch);
+ ls->val[0] = val;
+ write_seqcount_latch(&ls->latch);
+ ls->val[1] = val;
+ write_seqcount_latch_end(&ls->latch);
}
-/*
- * Check whether there is enough free space for the given message.
- *
- * The same values of first_idx and next_idx mean that the buffer
- * is either empty or full.
- *
- * If the buffer is empty, we must respect the position of the indexes.
- * They cannot be reset to the beginning of the buffer.
- */
-static int logbuf_has_space(u32 msg_size, bool empty)
+/* Can be called from any context. */
+static u64 latched_seq_read_nolock(struct latched_seq *ls)
{
- u32 free;
+ unsigned int seq;
+ unsigned int idx;
+ u64 val;
- if (log_next_idx > log_first_idx || empty)
- free = max(log_buf_len - log_next_idx, log_first_idx);
- else
- free = log_first_idx - log_next_idx;
+ do {
+ seq = read_seqcount_latch(&ls->latch);
+ idx = seq & 0x1;
+ val = ls->val[idx];
+ } while (read_seqcount_latch_retry(&ls->latch, seq));
- /*
- * We need space also for an empty header that signalizes wrapping
- * of the buffer.
- */
- return free >= msg_size + sizeof(struct printk_log);
+ return val;
}
-static int log_make_free_space(u32 msg_size)
+/* Return log buffer address */
+char *log_buf_addr_get(void)
{
- while (log_first_seq < log_next_seq) {
- if (logbuf_has_space(msg_size, false))
- return 0;
- /* drop old messages until we have enough contiguous space */
- log_first_idx = log_next(log_first_idx);
- log_first_seq++;
- }
-
- /* sequence numbers are equal, so the log buffer is empty */
- if (logbuf_has_space(msg_size, true))
- return 0;
-
- return -ENOMEM;
+ return log_buf;
}
-/* compute the message size including the padding bytes */
-static u32 msg_used_size(u16 text_len, u16 dict_len, u32 *pad_len)
+/* Return log buffer size */
+u32 log_buf_len_get(void)
{
- u32 size;
-
- size = sizeof(struct printk_log) + text_len + dict_len;
- *pad_len = (-size) & (LOG_ALIGN - 1);
- size += *pad_len;
-
- return size;
+ return log_buf_len;
}
/*
@@ -386,81 +579,23 @@ static u32 msg_used_size(u16 text_len, u16 dict_len, u32 *pad_len)
#define MAX_LOG_TAKE_PART 4
static const char trunc_msg[] = "<truncated>";
-static u32 truncate_msg(u16 *text_len, u16 *trunc_msg_len,
- u16 *dict_len, u32 *pad_len)
+static void truncate_msg(u16 *text_len, u16 *trunc_msg_len)
{
/*
* The message should not take the whole buffer. Otherwise, it might
* get removed too soon.
*/
u32 max_text_len = log_buf_len / MAX_LOG_TAKE_PART;
+
if (*text_len > max_text_len)
*text_len = max_text_len;
- /* enable the warning message */
- *trunc_msg_len = strlen(trunc_msg);
- /* disable the "dict" completely */
- *dict_len = 0;
- /* compute the size again, count also the warning message */
- return msg_used_size(*text_len + *trunc_msg_len, 0, pad_len);
-}
-
-/* insert record into the buffer, discard old ones, update heads */
-static int log_store(int facility, int level,
- enum log_flags flags, u64 ts_nsec,
- const char *dict, u16 dict_len,
- const char *text, u16 text_len)
-{
- struct printk_log *msg;
- u32 size, pad_len;
- u16 trunc_msg_len = 0;
-
- /* number of '\0' padding bytes to next message */
- size = msg_used_size(text_len, dict_len, &pad_len);
-
- if (log_make_free_space(size)) {
- /* truncate the message if it is too long for empty buffer */
- size = truncate_msg(&text_len, &trunc_msg_len,
- &dict_len, &pad_len);
- /* survive when the log buffer is too small for trunc_msg */
- if (log_make_free_space(size))
- return 0;
- }
-
- if (log_next_idx + size + sizeof(struct printk_log) > log_buf_len) {
- /*
- * This message + an additional empty header does not fit
- * at the end of the buffer. Add an empty header with len == 0
- * to signify a wrap around.
- */
- memset(log_buf + log_next_idx, 0, sizeof(struct printk_log));
- log_next_idx = 0;
- }
- /* fill message */
- msg = (struct printk_log *)(log_buf + log_next_idx);
- memcpy(log_text(msg), text, text_len);
- msg->text_len = text_len;
- if (trunc_msg_len) {
- memcpy(log_text(msg) + text_len, trunc_msg, trunc_msg_len);
- msg->text_len += trunc_msg_len;
- }
- memcpy(log_dict(msg), dict, dict_len);
- msg->dict_len = dict_len;
- msg->facility = facility;
- msg->level = level & 7;
- msg->flags = flags & 0x1f;
- if (ts_nsec > 0)
- msg->ts_nsec = ts_nsec;
+ /* enable the warning message (if there is room) */
+ *trunc_msg_len = strlen(trunc_msg);
+ if (*text_len >= *trunc_msg_len)
+ *text_len -= *trunc_msg_len;
else
- msg->ts_nsec = local_clock();
- memset(log_dict(msg) + dict_len, 0, pad_len);
- msg->len = size;
-
- /* insert message */
- log_next_idx += msg->len;
- log_next_seq++;
-
- return msg->text_len;
+ *trunc_msg_len = 0;
}
int dmesg_restrict = IS_ENABLED(CONFIG_SECURITY_DMESG_RESTRICT);
@@ -477,61 +612,157 @@ static int syslog_action_restricted(int type)
type != SYSLOG_ACTION_SIZE_BUFFER;
}
-int check_syslog_permissions(int type, bool from_file)
+static int check_syslog_permissions(int type, int source)
{
/*
* If this is from /proc/kmsg and we've already opened it, then we've
* already done the capabilities checks at open time.
*/
- if (from_file && type != SYSLOG_ACTION_OPEN)
- return 0;
+ if (source == SYSLOG_FROM_PROC && type != SYSLOG_ACTION_OPEN)
+ goto ok;
if (syslog_action_restricted(type)) {
if (capable(CAP_SYSLOG))
- return 0;
- /*
- * For historical reasons, accept CAP_SYS_ADMIN too, with
- * a warning.
- */
- if (capable(CAP_SYS_ADMIN)) {
- pr_warn_once("%s (%d): Attempt to access syslog with "
- "CAP_SYS_ADMIN but no CAP_SYSLOG "
- "(deprecated).\n",
- current->comm, task_pid_nr(current));
- return 0;
- }
+ goto ok;
return -EPERM;
}
+ok:
return security_syslog(type);
}
+static void append_char(char **pp, char *e, char c)
+{
+ if (*pp < e)
+ *(*pp)++ = c;
+}
+
+static ssize_t info_print_ext_header(char *buf, size_t size,
+ struct printk_info *info)
+{
+ u64 ts_usec = info->ts_nsec;
+ char caller[20];
+#ifdef CONFIG_PRINTK_CALLER
+ u32 id = info->caller_id;
+
+ snprintf(caller, sizeof(caller), ",caller=%c%u",
+ id & 0x80000000 ? 'C' : 'T', id & ~0x80000000);
+#else
+ caller[0] = '\0';
+#endif
+
+ do_div(ts_usec, 1000);
+
+ return scnprintf(buf, size, "%u,%llu,%llu,%c%s;",
+ (info->facility << 3) | info->level, info->seq,
+ ts_usec, info->flags & LOG_CONT ? 'c' : '-', caller);
+}
+
+static ssize_t msg_add_ext_text(char *buf, size_t size,
+ const char *text, size_t text_len,
+ unsigned char endc)
+{
+ char *p = buf, *e = buf + size;
+ size_t i;
+
+ /* escape non-printable characters */
+ for (i = 0; i < text_len; i++) {
+ unsigned char c = text[i];
+
+ if (c < ' ' || c >= 127 || c == '\\')
+ p += scnprintf(p, e - p, "\\x%02x", c);
+ else
+ append_char(&p, e, c);
+ }
+ append_char(&p, e, endc);
+
+ return p - buf;
+}
+
+static ssize_t msg_add_dict_text(char *buf, size_t size,
+ const char *key, const char *val)
+{
+ size_t val_len = strlen(val);
+ ssize_t len;
+
+ if (!val_len)
+ return 0;
+
+ len = msg_add_ext_text(buf, size, "", 0, ' '); /* dict prefix */
+ len += msg_add_ext_text(buf + len, size - len, key, strlen(key), '=');
+ len += msg_add_ext_text(buf + len, size - len, val, val_len, '\n');
+
+ return len;
+}
+
+static ssize_t msg_print_ext_body(char *buf, size_t size,
+ char *text, size_t text_len,
+ struct dev_printk_info *dev_info)
+{
+ ssize_t len;
+
+ len = msg_add_ext_text(buf, size, text, text_len, '\n');
+
+ if (!dev_info)
+ goto out;
+
+ len += msg_add_dict_text(buf + len, size - len, "SUBSYSTEM",
+ dev_info->subsystem);
+ len += msg_add_dict_text(buf + len, size - len, "DEVICE",
+ dev_info->device);
+out:
+ return len;
+}
/* /dev/kmsg - userspace message inject/listen interface */
struct devkmsg_user {
- u64 seq;
- u32 idx;
- enum log_flags prev;
+ atomic64_t seq;
+ struct ratelimit_state rs;
struct mutex lock;
- char buf[8192];
+ struct printk_buffers pbufs;
};
+static __printf(3, 4) __cold
+int devkmsg_emit(int facility, int level, const char *fmt, ...)
+{
+ va_list args;
+ int r;
+
+ va_start(args, fmt);
+ r = vprintk_emit(facility, level, NULL, fmt, args);
+ va_end(args);
+
+ return r;
+}
+
static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from)
{
char *buf, *line;
- int i;
int level = default_message_loglevel;
int facility = 1; /* LOG_USER */
+ struct file *file = iocb->ki_filp;
+ struct devkmsg_user *user = file->private_data;
size_t len = iov_iter_count(from);
ssize_t ret = len;
- if (len > LOG_LINE_MAX)
+ if (len > PRINTKRB_RECORD_MAX)
return -EINVAL;
+
+ /* Ignore when user logging is disabled. */
+ if (devkmsg_log & DEVKMSG_LOG_MASK_OFF)
+ return len;
+
+ /* Ratelimit when not explicitly enabled. */
+ if (!(devkmsg_log & DEVKMSG_LOG_MASK_ON)) {
+ if (!___ratelimit(&user->rs, current->comm))
+ return ret;
+ }
+
buf = kmalloc(len+1, GFP_KERNEL);
if (buf == NULL)
return -ENOMEM;
buf[len] = '\0';
- if (copy_from_iter(buf, len, from) != len) {
+ if (!copy_from_iter_full(buf, len, from)) {
kfree(buf);
return -EFAULT;
}
@@ -548,19 +779,19 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from)
line = buf;
if (line[0] == '<') {
char *endp = NULL;
+ unsigned int u;
- i = simple_strtoul(line+1, &endp, 10);
+ u = simple_strtoul(line + 1, &endp, 10);
if (endp && endp[0] == '>') {
- level = i & 7;
- if (i >> 3)
- facility = i >> 3;
+ level = LOG_LEVEL(u);
+ if (LOG_FACILITY(u) != 0)
+ facility = LOG_FACILITY(u);
endp++;
- len -= endp - line;
line = endp;
}
}
- printk_emit(facility, level, NULL, 0, "%s", line);
+ devkmsg_emit(facility, level, "%s", line);
kfree(buf);
return ret;
}
@@ -569,140 +800,83 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
struct devkmsg_user *user = file->private_data;
- struct printk_log *msg;
- u64 ts_usec;
- size_t i;
- char cont = '-';
- size_t len;
+ char *outbuf = &user->pbufs.outbuf[0];
+ struct printk_message pmsg = {
+ .pbufs = &user->pbufs,
+ };
ssize_t ret;
- if (!user)
- return -EBADF;
-
ret = mutex_lock_interruptible(&user->lock);
if (ret)
return ret;
- raw_spin_lock_irq(&logbuf_lock);
- while (user->seq == log_next_seq) {
+
+ if (!printk_get_next_message(&pmsg, atomic64_read(&user->seq), true, false)) {
if (file->f_flags & O_NONBLOCK) {
ret = -EAGAIN;
- raw_spin_unlock_irq(&logbuf_lock);
goto out;
}
- raw_spin_unlock_irq(&logbuf_lock);
+ /*
+ * Guarantee this task is visible on the waitqueue before
+ * checking the wake condition.
+ *
+ * The full memory barrier within set_current_state() of
+ * prepare_to_wait_event() pairs with the full memory barrier
+ * within wq_has_sleeper().
+ *
+ * This pairs with __wake_up_klogd:A.
+ */
ret = wait_event_interruptible(log_wait,
- user->seq != log_next_seq);
+ printk_get_next_message(&pmsg, atomic64_read(&user->seq), true,
+ false)); /* LMM(devkmsg_read:A) */
if (ret)
goto out;
- raw_spin_lock_irq(&logbuf_lock);
}
- if (user->seq < log_first_seq) {
+ if (pmsg.dropped) {
/* our last seen message is gone, return error and reset */
- user->idx = log_first_idx;
- user->seq = log_first_seq;
+ atomic64_set(&user->seq, pmsg.seq);
ret = -EPIPE;
- raw_spin_unlock_irq(&logbuf_lock);
goto out;
}
- msg = log_from_idx(user->idx);
- ts_usec = msg->ts_nsec;
- do_div(ts_usec, 1000);
+ atomic64_set(&user->seq, pmsg.seq + 1);
- /*
- * If we couldn't merge continuation line fragments during the print,
- * export the stored flags to allow an optional external merge of the
- * records. Merging the records isn't always neccessarily correct, like
- * when we hit a race during printing. In most cases though, it produces
- * better readable output. 'c' in the record flags mark the first
- * fragment of a line, '+' the following.
- */
- if (msg->flags & LOG_CONT && !(user->prev & LOG_CONT))
- cont = 'c';
- else if ((msg->flags & LOG_CONT) ||
- ((user->prev & LOG_CONT) && !(msg->flags & LOG_PREFIX)))
- cont = '+';
-
- len = sprintf(user->buf, "%u,%llu,%llu,%c;",
- (msg->facility << 3) | msg->level,
- user->seq, ts_usec, cont);
- user->prev = msg->flags;
-
- /* escape non-printable characters */
- for (i = 0; i < msg->text_len; i++) {
- unsigned char c = log_text(msg)[i];
-
- if (c < ' ' || c >= 127 || c == '\\')
- len += sprintf(user->buf + len, "\\x%02x", c);
- else
- user->buf[len++] = c;
- }
- user->buf[len++] = '\n';
-
- if (msg->dict_len) {
- bool line = true;
-
- for (i = 0; i < msg->dict_len; i++) {
- unsigned char c = log_dict(msg)[i];
-
- if (line) {
- user->buf[len++] = ' ';
- line = false;
- }
-
- if (c == '\0') {
- user->buf[len++] = '\n';
- line = true;
- continue;
- }
-
- if (c < ' ' || c >= 127 || c == '\\') {
- len += sprintf(user->buf + len, "\\x%02x", c);
- continue;
- }
-
- user->buf[len++] = c;
- }
- user->buf[len++] = '\n';
- }
-
- user->idx = log_next(user->idx);
- user->seq++;
- raw_spin_unlock_irq(&logbuf_lock);
-
- if (len > count) {
+ if (pmsg.outbuf_len > count) {
ret = -EINVAL;
goto out;
}
- if (copy_to_user(buf, user->buf, len)) {
+ if (copy_to_user(buf, outbuf, pmsg.outbuf_len)) {
ret = -EFAULT;
goto out;
}
- ret = len;
+ ret = pmsg.outbuf_len;
out:
mutex_unlock(&user->lock);
return ret;
}
+/*
+ * Be careful when modifying this function!!!
+ *
+ * Only few operations are supported because the device works only with the
+ * entire variable length messages (records). Non-standard values are
+ * returned in the other cases and has been this way for quite some time.
+ * User space applications might depend on this behavior.
+ */
static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
{
struct devkmsg_user *user = file->private_data;
loff_t ret = 0;
- if (!user)
- return -EBADF;
if (offset)
return -ESPIPE;
- raw_spin_lock_irq(&logbuf_lock);
switch (whence) {
case SEEK_SET:
/* the first record */
- user->idx = log_first_idx;
- user->seq = log_first_seq;
+ atomic64_set(&user->seq, prb_first_valid_seq(prb));
break;
case SEEK_DATA:
/*
@@ -710,40 +884,33 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
* like issued by 'dmesg -c'. Reading /dev/kmsg itself
* changes no global state, and does not clear anything.
*/
- user->idx = clear_idx;
- user->seq = clear_seq;
+ atomic64_set(&user->seq, latched_seq_read_nolock(&clear_seq));
break;
case SEEK_END:
/* after the last record */
- user->idx = log_next_idx;
- user->seq = log_next_seq;
+ atomic64_set(&user->seq, prb_next_seq(prb));
break;
default:
ret = -EINVAL;
}
- raw_spin_unlock_irq(&logbuf_lock);
return ret;
}
-static unsigned int devkmsg_poll(struct file *file, poll_table *wait)
+static __poll_t devkmsg_poll(struct file *file, poll_table *wait)
{
struct devkmsg_user *user = file->private_data;
- int ret = 0;
-
- if (!user)
- return POLLERR|POLLNVAL;
+ struct printk_info info;
+ __poll_t ret = 0;
poll_wait(file, &log_wait, wait);
- raw_spin_lock_irq(&logbuf_lock);
- if (user->seq < log_next_seq) {
+ if (prb_read_valid_info(prb, atomic64_read(&user->seq), &info, NULL)) {
/* return error when data has vanished underneath us */
- if (user->seq < log_first_seq)
- ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI;
+ if (info.seq != atomic64_read(&user->seq))
+ ret = EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI;
else
- ret = POLLIN|POLLRDNORM;
+ ret = EPOLLIN|EPOLLRDNORM;
}
- raw_spin_unlock_irq(&logbuf_lock);
return ret;
}
@@ -753,25 +920,27 @@ static int devkmsg_open(struct inode *inode, struct file *file)
struct devkmsg_user *user;
int err;
- /* write-only does not need any file context */
- if ((file->f_flags & O_ACCMODE) == O_WRONLY)
- return 0;
+ if (devkmsg_log & DEVKMSG_LOG_MASK_OFF)
+ return -EPERM;
- err = check_syslog_permissions(SYSLOG_ACTION_READ_ALL,
- SYSLOG_FROM_READER);
- if (err)
- return err;
+ /* write-only does not need any file context */
+ if ((file->f_flags & O_ACCMODE) != O_WRONLY) {
+ err = check_syslog_permissions(SYSLOG_ACTION_READ_ALL,
+ SYSLOG_FROM_READER);
+ if (err)
+ return err;
+ }
- user = kmalloc(sizeof(struct devkmsg_user), GFP_KERNEL);
+ user = kvmalloc(sizeof(struct devkmsg_user), GFP_KERNEL);
if (!user)
return -ENOMEM;
+ ratelimit_default_init(&user->rs);
+ ratelimit_set_flags(&user->rs, RATELIMIT_MSG_ON_RELEASE);
+
mutex_init(&user->lock);
- raw_spin_lock_irq(&logbuf_lock);
- user->idx = log_first_idx;
- user->seq = log_first_seq;
- raw_spin_unlock_irq(&logbuf_lock);
+ atomic64_set(&user->seq, prb_first_valid_seq(prb));
file->private_data = user;
return 0;
@@ -781,11 +950,10 @@ static int devkmsg_release(struct inode *inode, struct file *file)
{
struct devkmsg_user *user = file->private_data;
- if (!user)
- return 0;
+ ratelimit_state_exit(&user->rs);
mutex_destroy(&user->lock);
- kfree(user);
+ kvfree(user);
return 0;
}
@@ -798,7 +966,7 @@ const struct file_operations kmsg_fops = {
.release = devkmsg_release,
};
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_VMCORE_INFO
/*
* This appends the listed symbols to /proc/vmcore
*
@@ -807,21 +975,63 @@ const struct file_operations kmsg_fops = {
* symbols are specifically used so that utilities can access and extract the
* dmesg log from a vmcore file after a crash.
*/
-void log_buf_kexec_setup(void)
+void log_buf_vmcoreinfo_setup(void)
{
- VMCOREINFO_SYMBOL(log_buf);
- VMCOREINFO_SYMBOL(log_buf_len);
- VMCOREINFO_SYMBOL(log_first_idx);
- VMCOREINFO_SYMBOL(log_next_idx);
+ struct dev_printk_info *dev_info = NULL;
+
+ VMCOREINFO_SYMBOL(prb);
+ VMCOREINFO_SYMBOL(printk_rb_static);
+ VMCOREINFO_SYMBOL(clear_seq);
+
/*
- * Export struct printk_log size and field offsets. User space tools can
+ * Export struct size and field offsets. User space tools can
* parse it and detect any changes to structure down the line.
*/
- VMCOREINFO_STRUCT_SIZE(printk_log);
- VMCOREINFO_OFFSET(printk_log, ts_nsec);
- VMCOREINFO_OFFSET(printk_log, len);
- VMCOREINFO_OFFSET(printk_log, text_len);
- VMCOREINFO_OFFSET(printk_log, dict_len);
+
+ VMCOREINFO_STRUCT_SIZE(printk_ringbuffer);
+ VMCOREINFO_OFFSET(printk_ringbuffer, desc_ring);
+ VMCOREINFO_OFFSET(printk_ringbuffer, text_data_ring);
+ VMCOREINFO_OFFSET(printk_ringbuffer, fail);
+
+ VMCOREINFO_STRUCT_SIZE(prb_desc_ring);
+ VMCOREINFO_OFFSET(prb_desc_ring, count_bits);
+ VMCOREINFO_OFFSET(prb_desc_ring, descs);
+ VMCOREINFO_OFFSET(prb_desc_ring, infos);
+ VMCOREINFO_OFFSET(prb_desc_ring, head_id);
+ VMCOREINFO_OFFSET(prb_desc_ring, tail_id);
+
+ VMCOREINFO_STRUCT_SIZE(prb_desc);
+ VMCOREINFO_OFFSET(prb_desc, state_var);
+ VMCOREINFO_OFFSET(prb_desc, text_blk_lpos);
+
+ VMCOREINFO_STRUCT_SIZE(prb_data_blk_lpos);
+ VMCOREINFO_OFFSET(prb_data_blk_lpos, begin);
+ VMCOREINFO_OFFSET(prb_data_blk_lpos, next);
+
+ VMCOREINFO_STRUCT_SIZE(printk_info);
+ VMCOREINFO_OFFSET(printk_info, seq);
+ VMCOREINFO_OFFSET(printk_info, ts_nsec);
+ VMCOREINFO_OFFSET(printk_info, text_len);
+ VMCOREINFO_OFFSET(printk_info, caller_id);
+ VMCOREINFO_OFFSET(printk_info, dev_info);
+
+ VMCOREINFO_STRUCT_SIZE(dev_printk_info);
+ VMCOREINFO_OFFSET(dev_printk_info, subsystem);
+ VMCOREINFO_LENGTH(printk_info_subsystem, sizeof(dev_info->subsystem));
+ VMCOREINFO_OFFSET(dev_printk_info, device);
+ VMCOREINFO_LENGTH(printk_info_device, sizeof(dev_info->device));
+
+ VMCOREINFO_STRUCT_SIZE(prb_data_ring);
+ VMCOREINFO_OFFSET(prb_data_ring, size_bits);
+ VMCOREINFO_OFFSET(prb_data_ring, data);
+ VMCOREINFO_OFFSET(prb_data_ring, head_lpos);
+ VMCOREINFO_OFFSET(prb_data_ring, tail_lpos);
+
+ VMCOREINFO_SIZE(atomic_long_t);
+ VMCOREINFO_TYPE_OFFSET(atomic_long_t, counter);
+
+ VMCOREINFO_STRUCT_SIZE(latched_seq);
+ VMCOREINFO_OFFSET(latched_seq, val);
}
#endif
@@ -829,18 +1039,28 @@ void log_buf_kexec_setup(void)
static unsigned long __initdata new_log_buf_len;
/* we practice scaling the ring buffer by powers of 2 */
-static void __init log_buf_len_update(unsigned size)
+static void __init log_buf_len_update(u64 size)
{
+ if (size > (u64)LOG_BUF_LEN_MAX) {
+ size = (u64)LOG_BUF_LEN_MAX;
+ pr_err("log_buf over 2G is not supported.\n");
+ }
+
if (size)
size = roundup_pow_of_two(size);
if (size > log_buf_len)
- new_log_buf_len = size;
+ new_log_buf_len = (unsigned long)size;
}
/* save requested log_buf_len since it's too early to process it */
static int __init log_buf_len_setup(char *str)
{
- unsigned size = memparse(str, &str);
+ u64 size;
+
+ if (!str)
+ return -EINVAL;
+
+ size = memparse(str, &str);
log_buf_len_update(size);
@@ -881,11 +1101,71 @@ static void __init log_buf_add_cpu(void)
static inline void log_buf_add_cpu(void) {}
#endif /* CONFIG_SMP */
+static void __init set_percpu_data_ready(void)
+{
+ __printk_percpu_data_ready = true;
+}
+
+static unsigned int __init add_to_rb(struct printk_ringbuffer *rb,
+ struct printk_record *r)
+{
+ struct prb_reserved_entry e;
+ struct printk_record dest_r;
+
+ prb_rec_init_wr(&dest_r, r->info->text_len);
+
+ if (!prb_reserve(&e, rb, &dest_r))
+ return 0;
+
+ memcpy(&dest_r.text_buf[0], &r->text_buf[0], r->info->text_len);
+ dest_r.info->text_len = r->info->text_len;
+ dest_r.info->facility = r->info->facility;
+ dest_r.info->level = r->info->level;
+ dest_r.info->flags = r->info->flags;
+ dest_r.info->ts_nsec = r->info->ts_nsec;
+ dest_r.info->caller_id = r->info->caller_id;
+ memcpy(&dest_r.info->dev_info, &r->info->dev_info, sizeof(dest_r.info->dev_info));
+
+ prb_final_commit(&e);
+
+ return prb_record_text_space(&e);
+}
+
+static char setup_text_buf[PRINTKRB_RECORD_MAX] __initdata;
+
+static void print_log_buf_usage_stats(void)
+{
+ unsigned int descs_count = log_buf_len >> PRB_AVGBITS;
+ size_t meta_data_size;
+
+ meta_data_size = descs_count * (sizeof(struct prb_desc) + sizeof(struct printk_info));
+
+ pr_info("log buffer data + meta data: %u + %zu = %zu bytes\n",
+ log_buf_len, meta_data_size, log_buf_len + meta_data_size);
+}
+
void __init setup_log_buf(int early)
{
+ struct printk_info *new_infos;
+ unsigned int new_descs_count;
+ struct prb_desc *new_descs;
+ struct printk_info info;
+ struct printk_record r;
+ unsigned int text_size;
+ size_t new_descs_size;
+ size_t new_infos_size;
unsigned long flags;
char *new_log_buf;
- int free;
+ unsigned int free;
+ u64 seq;
+
+ /*
+ * Some archs call setup_log_buf() multiple times - first is very
+ * early, e.g. from setup_arch(), and second - when percpu_areas
+ * are initialised.
+ */
+ if (!early)
+ set_percpu_data_ready();
if (log_buf != __log_buf)
return;
@@ -893,34 +1173,98 @@ void __init setup_log_buf(int early)
if (!early && !new_log_buf_len)
log_buf_add_cpu();
- if (!new_log_buf_len)
+ if (!new_log_buf_len) {
+ /* Show the memory stats only once. */
+ if (!early)
+ goto out;
+
return;
+ }
- if (early) {
- new_log_buf =
- memblock_virt_alloc(new_log_buf_len, LOG_ALIGN);
- } else {
- new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len,
- LOG_ALIGN);
+ new_descs_count = new_log_buf_len >> PRB_AVGBITS;
+ if (new_descs_count == 0) {
+ pr_err("new_log_buf_len: %lu too small\n", new_log_buf_len);
+ goto out;
}
+ new_log_buf = memblock_alloc(new_log_buf_len, LOG_ALIGN);
if (unlikely(!new_log_buf)) {
- pr_err("log_buf_len: %ld bytes not available\n",
- new_log_buf_len);
- return;
+ pr_err("log_buf_len: %lu text bytes not available\n",
+ new_log_buf_len);
+ goto out;
}
- raw_spin_lock_irqsave(&logbuf_lock, flags);
+ new_descs_size = new_descs_count * sizeof(struct prb_desc);
+ new_descs = memblock_alloc(new_descs_size, LOG_ALIGN);
+ if (unlikely(!new_descs)) {
+ pr_err("log_buf_len: %zu desc bytes not available\n",
+ new_descs_size);
+ goto err_free_log_buf;
+ }
+
+ new_infos_size = new_descs_count * sizeof(struct printk_info);
+ new_infos = memblock_alloc(new_infos_size, LOG_ALIGN);
+ if (unlikely(!new_infos)) {
+ pr_err("log_buf_len: %zu info bytes not available\n",
+ new_infos_size);
+ goto err_free_descs;
+ }
+
+ prb_rec_init_rd(&r, &info, &setup_text_buf[0], sizeof(setup_text_buf));
+
+ prb_init(&printk_rb_dynamic,
+ new_log_buf, ilog2(new_log_buf_len),
+ new_descs, ilog2(new_descs_count),
+ new_infos);
+
+ local_irq_save(flags);
+
log_buf_len = new_log_buf_len;
log_buf = new_log_buf;
new_log_buf_len = 0;
- free = __LOG_BUF_LEN - log_next_idx;
- memcpy(log_buf, __log_buf, __LOG_BUF_LEN);
- raw_spin_unlock_irqrestore(&logbuf_lock, flags);
- pr_info("log_buf_len: %d bytes\n", log_buf_len);
- pr_info("early log buf free: %d(%d%%)\n",
+ free = __LOG_BUF_LEN;
+ prb_for_each_record(0, &printk_rb_static, seq, &r) {
+ text_size = add_to_rb(&printk_rb_dynamic, &r);
+ if (text_size > free)
+ free = 0;
+ else
+ free -= text_size;
+ }
+
+ prb = &printk_rb_dynamic;
+
+ local_irq_restore(flags);
+
+ /*
+ * Copy any remaining messages that might have appeared from
+ * NMI context after copying but before switching to the
+ * dynamic buffer.
+ */
+ prb_for_each_record(seq, &printk_rb_static, seq, &r) {
+ text_size = add_to_rb(&printk_rb_dynamic, &r);
+ if (text_size > free)
+ free = 0;
+ else
+ free -= text_size;
+ }
+
+ if (seq != prb_next_seq(&printk_rb_static)) {
+ pr_err("dropped %llu messages\n",
+ prb_next_seq(&printk_rb_static) - seq);
+ }
+
+ print_log_buf_usage_stats();
+ pr_info("early log buf free: %u(%u%%)\n",
free, (free * 100) / __LOG_BUF_LEN);
+ return;
+
+err_free_descs:
+ memblock_free(new_descs, new_descs_size);
+err_free_log_buf:
+ memblock_free(new_log_buf, new_log_buf_len);
+out:
+ print_log_buf_usage_stats();
}
static bool __read_mostly ignore_loglevel;
@@ -938,6 +1282,11 @@ module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(ignore_loglevel,
"ignore loglevel setting (prints all kernel messages to the console)");
+static bool suppress_message_printing(int level)
+{
+ return (level >= console_loglevel && !ignore_loglevel);
+}
+
#ifdef CONFIG_BOOT_PRINTK_DELAY
static int boot_delay; /* msecs delay after each printk during bootup */
@@ -965,11 +1314,11 @@ static void boot_delay_msec(int level)
{
unsigned long long k;
unsigned long timeout;
+ bool suppress = !is_printk_force_console() &&
+ suppress_message_printing(level);
- if ((boot_delay == 0 || system_state != SYSTEM_BOOTING)
- || (level >= console_loglevel && !ignore_loglevel)) {
+ if ((boot_delay == 0 || system_state >= SYSTEM_RUNNING) || suppress)
return;
- }
k = (unsigned long long)loops_per_msec * boot_delay;
@@ -996,140 +1345,305 @@ static inline void boot_delay_msec(int level)
static bool printk_time = IS_ENABLED(CONFIG_PRINTK_TIME);
module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
-static size_t print_time(u64 ts, char *buf)
+static size_t print_syslog(unsigned int level, char *buf)
{
- unsigned long rem_nsec;
+ return sprintf(buf, "<%u>", level);
+}
- if (!printk_time)
- return 0;
+static size_t print_time(u64 ts, char *buf)
+{
+ unsigned long rem_nsec = do_div(ts, 1000000000);
- rem_nsec = do_div(ts, 1000000000);
+ return sprintf(buf, "[%5lu.%06lu]",
+ (unsigned long)ts, rem_nsec / 1000);
+}
- if (!buf)
- return snprintf(NULL, 0, "[%5lu.000000] ", (unsigned long)ts);
+#ifdef CONFIG_PRINTK_CALLER
+static size_t print_caller(u32 id, char *buf)
+{
+ char caller[12];
- return sprintf(buf, "[%5lu.%06lu] ",
- (unsigned long)ts, rem_nsec / 1000);
+ snprintf(caller, sizeof(caller), "%c%u",
+ id & 0x80000000 ? 'C' : 'T', id & ~0x80000000);
+ return sprintf(buf, "[%6s]", caller);
}
+#else
+#define print_caller(id, buf) 0
+#endif
-static size_t print_prefix(const struct printk_log *msg, bool syslog, char *buf)
+static size_t info_print_prefix(const struct printk_info *info, bool syslog,
+ bool time, char *buf)
{
size_t len = 0;
- unsigned int prefix = (msg->facility << 3) | msg->level;
- if (syslog) {
- if (buf) {
- len += sprintf(buf, "<%u>", prefix);
- } else {
- len += 3;
- if (prefix > 999)
- len += 3;
- else if (prefix > 99)
- len += 2;
- else if (prefix > 9)
- len++;
- }
+ if (syslog)
+ len = print_syslog((info->facility << 3) | info->level, buf);
+
+ if (time)
+ len += print_time(info->ts_nsec, buf + len);
+
+ len += print_caller(info->caller_id, buf + len);
+
+ if (IS_ENABLED(CONFIG_PRINTK_CALLER) || time) {
+ buf[len++] = ' ';
+ buf[len] = '\0';
}
- len += print_time(msg->ts_nsec, buf ? buf + len : NULL);
return len;
}
-static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev,
- bool syslog, char *buf, size_t size)
+/*
+ * Prepare the record for printing. The text is shifted within the given
+ * buffer to avoid a need for another one. The following operations are
+ * done:
+ *
+ * - Add prefix for each line.
+ * - Drop truncated lines that no longer fit into the buffer.
+ * - Add the trailing newline that has been removed in vprintk_store().
+ * - Add a string terminator.
+ *
+ * Since the produced string is always terminated, the maximum possible
+ * return value is @r->text_buf_size - 1;
+ *
+ * Return: The length of the updated/prepared text, including the added
+ * prefixes and the newline. The terminator is not counted. The dropped
+ * line(s) are not counted.
+ */
+static size_t record_print_text(struct printk_record *r, bool syslog,
+ bool time)
{
- const char *text = log_text(msg);
- size_t text_size = msg->text_len;
- bool prefix = true;
- bool newline = true;
+ size_t text_len = r->info->text_len;
+ size_t buf_size = r->text_buf_size;
+ char *text = r->text_buf;
+ char prefix[PRINTK_PREFIX_MAX];
+ bool truncated = false;
+ size_t prefix_len;
+ size_t line_len;
size_t len = 0;
+ char *next;
- if ((prev & LOG_CONT) && !(msg->flags & LOG_PREFIX))
- prefix = false;
-
- if (msg->flags & LOG_CONT) {
- if ((prev & LOG_CONT) && !(prev & LOG_NEWLINE))
- prefix = false;
-
- if (!(msg->flags & LOG_NEWLINE))
- newline = false;
- }
+ /*
+ * If the message was truncated because the buffer was not large
+ * enough, treat the available text as if it were the full text.
+ */
+ if (text_len > buf_size)
+ text_len = buf_size;
- do {
- const char *next = memchr(text, '\n', text_size);
- size_t text_len;
+ prefix_len = info_print_prefix(r->info, syslog, time, prefix);
+ /*
+ * @text_len: bytes of unprocessed text
+ * @line_len: bytes of current line _without_ newline
+ * @text: pointer to beginning of current line
+ * @len: number of bytes prepared in r->text_buf
+ */
+ for (;;) {
+ next = memchr(text, '\n', text_len);
if (next) {
- text_len = next - text;
- next++;
- text_size -= next - text;
+ line_len = next - text;
} else {
- text_len = text_size;
+ /* Drop truncated line(s). */
+ if (truncated)
+ break;
+ line_len = text_len;
}
- if (buf) {
- if (print_prefix(msg, syslog, NULL) +
- text_len + 1 >= size - len)
+ /*
+ * Truncate the text if there is not enough space to add the
+ * prefix and a trailing newline and a terminator.
+ */
+ if (len + prefix_len + text_len + 1 + 1 > buf_size) {
+ /* Drop even the current line if no space. */
+ if (len + prefix_len + line_len + 1 + 1 > buf_size)
break;
- if (prefix)
- len += print_prefix(msg, syslog, buf + len);
- memcpy(buf + len, text, text_len);
- len += text_len;
- if (next || newline)
- buf[len++] = '\n';
- } else {
- /* SYSLOG_ACTION_* buffer size only calculation */
- if (prefix)
- len += print_prefix(msg, syslog, NULL);
- len += text_len;
- if (next || newline)
- len++;
+ text_len = buf_size - len - prefix_len - 1 - 1;
+ truncated = true;
}
- prefix = true;
- text = next;
- } while (text);
+ memmove(text + prefix_len, text, text_len);
+ memcpy(text, prefix, prefix_len);
+
+ /*
+ * Increment the prepared length to include the text and
+ * prefix that were just moved+copied. Also increment for the
+ * newline at the end of this line. If this is the last line,
+ * there is no newline, but it will be added immediately below.
+ */
+ len += prefix_len + line_len + 1;
+ if (text_len == line_len) {
+ /*
+ * This is the last line. Add the trailing newline
+ * removed in vprintk_store().
+ */
+ text[prefix_len + line_len] = '\n';
+ break;
+ }
+
+ /*
+ * Advance beyond the added prefix and the related line with
+ * its newline.
+ */
+ text += prefix_len + line_len + 1;
+
+ /*
+ * The remaining text has only decreased by the line with its
+ * newline.
+ *
+ * Note that @text_len can become zero. It happens when @text
+ * ended with a newline (either due to truncation or the
+ * original string ending with "\n\n"). The loop is correctly
+ * repeated and (if not truncated) an empty line with a prefix
+ * will be prepared.
+ */
+ text_len -= line_len + 1;
+ }
+
+ /*
+ * If a buffer was provided, it will be terminated. Space for the
+ * string terminator is guaranteed to be available. The terminator is
+ * not counted in the return value.
+ */
+ if (buf_size > 0)
+ r->text_buf[len] = 0;
return len;
}
+static size_t get_record_print_text_size(struct printk_info *info,
+ unsigned int line_count,
+ bool syslog, bool time)
+{
+ char prefix[PRINTK_PREFIX_MAX];
+ size_t prefix_len;
+
+ prefix_len = info_print_prefix(info, syslog, time, prefix);
+
+ /*
+ * Each line will be preceded with a prefix. The intermediate
+ * newlines are already within the text, but a final trailing
+ * newline will be added.
+ */
+ return ((prefix_len * line_count) + info->text_len + 1);
+}
+
+/*
+ * Beginning with @start_seq, find the first record where it and all following
+ * records up to (but not including) @max_seq fit into @size.
+ *
+ * @max_seq is simply an upper bound and does not need to exist. If the caller
+ * does not require an upper bound, -1 can be used for @max_seq.
+ */
+static u64 find_first_fitting_seq(u64 start_seq, u64 max_seq, size_t size,
+ bool syslog, bool time)
+{
+ struct printk_info info;
+ unsigned int line_count;
+ size_t len = 0;
+ u64 seq;
+
+ /* Determine the size of the records up to @max_seq. */
+ prb_for_each_info(start_seq, prb, seq, &info, &line_count) {
+ if (info.seq >= max_seq)
+ break;
+ len += get_record_print_text_size(&info, line_count, syslog, time);
+ }
+
+ /*
+ * Adjust the upper bound for the next loop to avoid subtracting
+ * lengths that were never added.
+ */
+ if (seq < max_seq)
+ max_seq = seq;
+
+ /*
+ * Move first record forward until length fits into the buffer. Ignore
+ * newest messages that were not counted in the above cycle. Messages
+ * might appear and get lost in the meantime. This is a best effort
+ * that prevents an infinite loop that could occur with a retry.
+ */
+ prb_for_each_info(start_seq, prb, seq, &info, &line_count) {
+ if (len <= size || info.seq >= max_seq)
+ break;
+ len -= get_record_print_text_size(&info, line_count, syslog, time);
+ }
+
+ return seq;
+}
+
+/* The caller is responsible for making sure @size is greater than 0. */
static int syslog_print(char __user *buf, int size)
{
+ struct printk_info info;
+ struct printk_record r;
char *text;
- struct printk_log *msg;
int len = 0;
+ u64 seq;
- text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
+ text = kmalloc(PRINTK_MESSAGE_MAX, GFP_KERNEL);
if (!text)
return -ENOMEM;
- while (size > 0) {
+ prb_rec_init_rd(&r, &info, text, PRINTK_MESSAGE_MAX);
+
+ mutex_lock(&syslog_lock);
+
+ /*
+ * Wait for the @syslog_seq record to be available. @syslog_seq may
+ * change while waiting.
+ */
+ do {
+ seq = syslog_seq;
+
+ mutex_unlock(&syslog_lock);
+ /*
+ * Guarantee this task is visible on the waitqueue before
+ * checking the wake condition.
+ *
+ * The full memory barrier within set_current_state() of
+ * prepare_to_wait_event() pairs with the full memory barrier
+ * within wq_has_sleeper().
+ *
+ * This pairs with __wake_up_klogd:A.
+ */
+ len = wait_event_interruptible(log_wait,
+ prb_read_valid(prb, seq, NULL)); /* LMM(syslog_print:A) */
+ mutex_lock(&syslog_lock);
+
+ if (len)
+ goto out;
+ } while (syslog_seq != seq);
+
+ /*
+ * Copy records that fit into the buffer. The above cycle makes sure
+ * that the first record is always available.
+ */
+ do {
size_t n;
size_t skip;
+ int err;
- raw_spin_lock_irq(&logbuf_lock);
- if (syslog_seq < log_first_seq) {
- /* messages are gone, move to first one */
- syslog_seq = log_first_seq;
- syslog_idx = log_first_idx;
- syslog_prev = 0;
- syslog_partial = 0;
- }
- if (syslog_seq == log_next_seq) {
- raw_spin_unlock_irq(&logbuf_lock);
+ if (!prb_read_valid(prb, syslog_seq, &r))
break;
+
+ if (r.info->seq != syslog_seq) {
+ /* message is gone, move to next valid one */
+ syslog_seq = r.info->seq;
+ syslog_partial = 0;
}
+ /*
+ * To keep reading/counting partial line consistent,
+ * use printk_time value as of the beginning of a line.
+ */
+ if (!syslog_partial)
+ syslog_time = printk_time;
+
skip = syslog_partial;
- msg = log_from_idx(syslog_idx);
- n = msg_print_text(msg, syslog_prev, true, text,
- LOG_LINE_MAX + PREFIX_MAX);
+ n = record_print_text(&r, true, syslog_time);
if (n - syslog_partial <= size) {
/* message fits into buffer, move forward */
- syslog_idx = log_next(syslog_idx);
- syslog_seq++;
- syslog_prev = msg->flags;
+ syslog_seq = r.info->seq + 1;
n -= syslog_partial;
syslog_partial = 0;
} else if (!len){
@@ -1138,12 +1652,15 @@ static int syslog_print(char __user *buf, int size)
syslog_partial += n;
} else
n = 0;
- raw_spin_unlock_irq(&logbuf_lock);
if (!n)
break;
- if (copy_to_user(buf, text + skip, n)) {
+ mutex_unlock(&syslog_lock);
+ err = copy_to_user(buf, text + skip, n);
+ mutex_lock(&syslog_lock);
+
+ if (err) {
if (!len)
len = -EFAULT;
break;
@@ -1152,118 +1669,80 @@ static int syslog_print(char __user *buf, int size)
len += n;
size -= n;
buf += n;
- }
-
+ } while (size);
+out:
+ mutex_unlock(&syslog_lock);
kfree(text);
return len;
}
static int syslog_print_all(char __user *buf, int size, bool clear)
{
+ struct printk_info info;
+ struct printk_record r;
char *text;
int len = 0;
+ u64 seq;
+ bool time;
- text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
+ text = kmalloc(PRINTK_MESSAGE_MAX, GFP_KERNEL);
if (!text)
return -ENOMEM;
- raw_spin_lock_irq(&logbuf_lock);
- if (buf) {
- u64 next_seq;
- u64 seq;
- u32 idx;
- enum log_flags prev;
+ time = printk_time;
+ /*
+ * Find first record that fits, including all following records,
+ * into the user-provided buffer for this dump.
+ */
+ seq = find_first_fitting_seq(latched_seq_read_nolock(&clear_seq), -1,
+ size, true, time);
- if (clear_seq < log_first_seq) {
- /* messages are gone, move to first available one */
- clear_seq = log_first_seq;
- clear_idx = log_first_idx;
- }
+ prb_rec_init_rd(&r, &info, text, PRINTK_MESSAGE_MAX);
- /*
- * Find first record that fits, including all following records,
- * into the user-provided buffer for this dump.
- */
- seq = clear_seq;
- idx = clear_idx;
- prev = 0;
- while (seq < log_next_seq) {
- struct printk_log *msg = log_from_idx(idx);
-
- len += msg_print_text(msg, prev, true, NULL, 0);
- prev = msg->flags;
- idx = log_next(idx);
- seq++;
- }
-
- /* move first record forward until length fits into the buffer */
- seq = clear_seq;
- idx = clear_idx;
- prev = 0;
- while (len > size && seq < log_next_seq) {
- struct printk_log *msg = log_from_idx(idx);
-
- len -= msg_print_text(msg, prev, true, NULL, 0);
- prev = msg->flags;
- idx = log_next(idx);
- seq++;
- }
+ prb_for_each_record(seq, prb, seq, &r) {
+ int textlen;
- /* last message fitting into this dump */
- next_seq = log_next_seq;
+ textlen = record_print_text(&r, true, time);
- len = 0;
- while (len >= 0 && seq < next_seq) {
- struct printk_log *msg = log_from_idx(idx);
- int textlen;
+ if (len + textlen > size) {
+ seq--;
+ break;
+ }
- textlen = msg_print_text(msg, prev, true, text,
- LOG_LINE_MAX + PREFIX_MAX);
- if (textlen < 0) {
- len = textlen;
- break;
- }
- idx = log_next(idx);
- seq++;
- prev = msg->flags;
+ if (copy_to_user(buf + len, text, textlen))
+ len = -EFAULT;
+ else
+ len += textlen;
- raw_spin_unlock_irq(&logbuf_lock);
- if (copy_to_user(buf + len, text, textlen))
- len = -EFAULT;
- else
- len += textlen;
- raw_spin_lock_irq(&logbuf_lock);
-
- if (seq < log_first_seq) {
- /* messages are gone, move to next one */
- seq = log_first_seq;
- idx = log_first_idx;
- prev = 0;
- }
- }
+ if (len < 0)
+ break;
}
if (clear) {
- clear_seq = log_next_seq;
- clear_idx = log_next_idx;
+ mutex_lock(&syslog_lock);
+ latched_seq_write(&clear_seq, seq);
+ mutex_unlock(&syslog_lock);
}
- raw_spin_unlock_irq(&logbuf_lock);
kfree(text);
return len;
}
-int do_syslog(int type, char __user *buf, int len, bool from_file)
+static void syslog_clear(void)
{
+ mutex_lock(&syslog_lock);
+ latched_seq_write(&clear_seq, prb_next_seq(prb));
+ mutex_unlock(&syslog_lock);
+}
+
+int do_syslog(int type, char __user *buf, int len, int source)
+{
+ struct printk_info info;
bool clear = false;
static int saved_console_loglevel = LOGLEVEL_DEFAULT;
int error;
- error = check_syslog_permissions(type, from_file);
- if (error)
- goto out;
-
- error = security_syslog(type);
+ error = check_syslog_permissions(type, source);
if (error)
return error;
@@ -1273,43 +1752,31 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
case SYSLOG_ACTION_OPEN: /* Open log */
break;
case SYSLOG_ACTION_READ: /* Read from log */
- error = -EINVAL;
if (!buf || len < 0)
- goto out;
- error = 0;
+ return -EINVAL;
if (!len)
- goto out;
- if (!access_ok(VERIFY_WRITE, buf, len)) {
- error = -EFAULT;
- goto out;
- }
- error = wait_event_interruptible(log_wait,
- syslog_seq != log_next_seq);
- if (error)
- goto out;
+ return 0;
+ if (!access_ok(buf, len))
+ return -EFAULT;
error = syslog_print(buf, len);
break;
/* Read/clear last kernel messages */
case SYSLOG_ACTION_READ_CLEAR:
clear = true;
- /* FALL THRU */
+ fallthrough;
/* Read last kernel messages */
case SYSLOG_ACTION_READ_ALL:
- error = -EINVAL;
if (!buf || len < 0)
- goto out;
- error = 0;
+ return -EINVAL;
if (!len)
- goto out;
- if (!access_ok(VERIFY_WRITE, buf, len)) {
- error = -EFAULT;
- goto out;
- }
+ return 0;
+ if (!access_ok(buf, len))
+ return -EFAULT;
error = syslog_print_all(buf, len, clear);
break;
/* Clear ring buffer */
case SYSLOG_ACTION_CLEAR:
- syslog_print_all(NULL, 0, true);
+ syslog_clear();
break;
/* Disable logging to console */
case SYSLOG_ACTION_CONSOLE_OFF:
@@ -1326,50 +1793,48 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
break;
/* Set level of messages printed to console */
case SYSLOG_ACTION_CONSOLE_LEVEL:
- error = -EINVAL;
if (len < 1 || len > 8)
- goto out;
+ return -EINVAL;
if (len < minimum_console_loglevel)
len = minimum_console_loglevel;
console_loglevel = len;
/* Implicitly re-enable logging to console */
saved_console_loglevel = LOGLEVEL_DEFAULT;
- error = 0;
break;
/* Number of chars in the log buffer */
case SYSLOG_ACTION_SIZE_UNREAD:
- raw_spin_lock_irq(&logbuf_lock);
- if (syslog_seq < log_first_seq) {
+ mutex_lock(&syslog_lock);
+ if (!prb_read_valid_info(prb, syslog_seq, &info, NULL)) {
+ /* No unread messages. */
+ mutex_unlock(&syslog_lock);
+ return 0;
+ }
+ if (info.seq != syslog_seq) {
/* messages are gone, move to first one */
- syslog_seq = log_first_seq;
- syslog_idx = log_first_idx;
- syslog_prev = 0;
+ syslog_seq = info.seq;
syslog_partial = 0;
}
- if (from_file) {
+ if (source == SYSLOG_FROM_PROC) {
/*
* Short-cut for poll(/"proc/kmsg") which simply checks
* for pending data, not the size; return the count of
* records, not the length.
*/
- error = log_next_seq - syslog_seq;
+ error = prb_next_seq(prb) - syslog_seq;
} else {
- u64 seq = syslog_seq;
- u32 idx = syslog_idx;
- enum log_flags prev = syslog_prev;
-
- error = 0;
- while (seq < log_next_seq) {
- struct printk_log *msg = log_from_idx(idx);
-
- error += msg_print_text(msg, prev, true, NULL, 0);
- idx = log_next(idx);
- seq++;
- prev = msg->flags;
+ bool time = syslog_partial ? syslog_time : printk_time;
+ unsigned int line_count;
+ u64 seq;
+
+ prb_for_each_info(syslog_seq, prb, seq, &info,
+ &line_count) {
+ error += get_record_print_text_size(&info, line_count,
+ true, time);
+ time = printk_time;
}
error -= syslog_partial;
}
- raw_spin_unlock_irq(&logbuf_lock);
+ mutex_unlock(&syslog_lock);
break;
/* Size of the log buffer */
case SYSLOG_ACTION_SIZE_BUFFER:
@@ -1379,7 +1844,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
error = -EINVAL;
break;
}
-out:
+
return error;
}
@@ -1389,113 +1854,273 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
}
/*
- * Call the console drivers, asking them to write out
- * log_buf[start] to log_buf[end - 1].
- * The console_lock must be held.
+ * Special console_lock variants that help to reduce the risk of soft-lockups.
+ * They allow to pass console_lock to another printk() call using a busy wait.
*/
-static void call_console_drivers(int level, const char *text, size_t len)
-{
- struct console *con;
- trace_console(text, len);
-
- if (level >= console_loglevel && !ignore_loglevel)
- return;
- if (!console_drivers)
- return;
+#ifdef CONFIG_LOCKDEP
+static struct lockdep_map console_owner_dep_map = {
+ .name = "console_owner"
+};
+#endif
- for_each_console(con) {
- if (exclusive_console && con != exclusive_console)
- continue;
- if (!(con->flags & CON_ENABLED))
- continue;
- if (!con->write)
- continue;
- if (!cpu_online(smp_processor_id()) &&
- !(con->flags & CON_ANYTIME))
- continue;
- con->write(con, text, len);
- }
-}
+static DEFINE_RAW_SPINLOCK(console_owner_lock);
+static struct task_struct *console_owner;
+static bool console_waiter;
-/*
- * Zap console related locks when oopsing.
- * To leave time for slow consoles to print a full oops,
- * only zap at most once every 30 seconds.
+/**
+ * console_lock_spinning_enable - mark beginning of code where another
+ * thread might safely busy wait
+ *
+ * This basically converts console_lock into a spinlock. This marks
+ * the section where the console_lock owner can not sleep, because
+ * there may be a waiter spinning (like a spinlock). Also it must be
+ * ready to hand over the lock at the end of the section.
*/
-static void zap_locks(void)
+void console_lock_spinning_enable(void)
{
- static unsigned long oops_timestamp;
-
- if (time_after_eq(jiffies, oops_timestamp) &&
- !time_after(jiffies, oops_timestamp + 30 * HZ))
- return;
+ /*
+ * Do not use spinning in panic(). The panic CPU wants to keep the lock.
+ * Non-panic CPUs abandon the flush anyway.
+ *
+ * Just keep the lockdep annotation. The panic-CPU should avoid
+ * taking console_owner_lock because it might cause a deadlock.
+ * This looks like the easiest way how to prevent false lockdep
+ * reports without handling races a lockless way.
+ */
+ if (panic_in_progress())
+ goto lockdep;
- oops_timestamp = jiffies;
+ raw_spin_lock(&console_owner_lock);
+ console_owner = current;
+ raw_spin_unlock(&console_owner_lock);
- debug_locks_off();
- /* If a crash is occurring, make sure we can't deadlock */
- raw_spin_lock_init(&logbuf_lock);
- /* And make sure that we print immediately */
- sema_init(&console_sem, 1);
+lockdep:
+ /* The waiter may spin on us after setting console_owner */
+ spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_);
}
-/*
- * Check if we have any console that is capable of printing while cpu is
- * booting or shutting down. Requires console_sem.
+/**
+ * console_lock_spinning_disable_and_check - mark end of code where another
+ * thread was able to busy wait and check if there is a waiter
+ * @cookie: cookie returned from console_srcu_read_lock()
+ *
+ * This is called at the end of the section where spinning is allowed.
+ * It has two functions. First, it is a signal that it is no longer
+ * safe to start busy waiting for the lock. Second, it checks if
+ * there is a busy waiter and passes the lock rights to her.
+ *
+ * Important: Callers lose both the console_lock and the SRCU read lock if
+ * there was a busy waiter. They must not touch items synchronized by
+ * console_lock or SRCU read lock in this case.
+ *
+ * Return: 1 if the lock rights were passed, 0 otherwise.
*/
-static int have_callable_console(void)
+int console_lock_spinning_disable_and_check(int cookie)
{
- struct console *con;
+ int waiter;
- for_each_console(con)
- if (con->flags & CON_ANYTIME)
- return 1;
+ /*
+ * Ignore spinning waiters during panic() because they might get stopped
+ * or blocked at any time,
+ *
+ * It is safe because nobody is allowed to start spinning during panic
+ * in the first place. If there has been a waiter then non panic CPUs
+ * might stay spinning. They would get stopped anyway. The panic context
+ * will never start spinning and an interrupted spin on panic CPU will
+ * never continue.
+ */
+ if (panic_in_progress()) {
+ /* Keep lockdep happy. */
+ spin_release(&console_owner_dep_map, _THIS_IP_);
+ return 0;
+ }
- return 0;
+ raw_spin_lock(&console_owner_lock);
+ waiter = READ_ONCE(console_waiter);
+ console_owner = NULL;
+ raw_spin_unlock(&console_owner_lock);
+
+ if (!waiter) {
+ spin_release(&console_owner_dep_map, _THIS_IP_);
+ return 0;
+ }
+
+ /* The waiter is now free to continue */
+ WRITE_ONCE(console_waiter, false);
+
+ spin_release(&console_owner_dep_map, _THIS_IP_);
+
+ /*
+ * Preserve lockdep lock ordering. Release the SRCU read lock before
+ * releasing the console_lock.
+ */
+ console_srcu_read_unlock(cookie);
+
+ /*
+ * Hand off console_lock to waiter. The waiter will perform
+ * the up(). After this, the waiter is the console_lock owner.
+ */
+ mutex_release(&console_lock_dep_map, _THIS_IP_);
+ return 1;
}
-/*
- * Can we actually use the console at this time on this cpu?
+/**
+ * console_trylock_spinning - try to get console_lock by busy waiting
+ *
+ * This allows to busy wait for the console_lock when the current
+ * owner is running in specially marked sections. It means that
+ * the current owner is running and cannot reschedule until it
+ * is ready to lose the lock.
*
- * Console drivers may assume that per-cpu resources have been allocated. So
- * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't
- * call them until this CPU is officially up.
+ * Return: 1 if we got the lock, 0 othrewise
*/
-static inline int can_use_console(unsigned int cpu)
+static int console_trylock_spinning(void)
{
- return cpu_online(cpu) || have_callable_console();
-}
+ struct task_struct *owner = NULL;
+ bool waiter;
+ bool spin = false;
+ unsigned long flags;
-/*
- * Try to get console ownership to actually show the kernel
- * messages from a 'printk'. Return true (and with the
- * console_lock held, and 'console_locked' set) if it
- * is successful, false otherwise.
- */
-static int console_trylock_for_printk(void)
-{
- unsigned int cpu = smp_processor_id();
+ if (console_trylock())
+ return 1;
- if (!console_trylock())
+ /*
+ * It's unsafe to spin once a panic has begun. If we are the
+ * panic CPU, we may have already halted the owner of the
+ * console_sem. If we are not the panic CPU, then we should
+ * avoid taking console_sem, so the panic CPU has a better
+ * chance of cleanly acquiring it later.
+ */
+ if (panic_in_progress())
return 0;
+
+ printk_safe_enter_irqsave(flags);
+
+ raw_spin_lock(&console_owner_lock);
+ owner = READ_ONCE(console_owner);
+ waiter = READ_ONCE(console_waiter);
+ if (!waiter && owner && owner != current) {
+ WRITE_ONCE(console_waiter, true);
+ spin = true;
+ }
+ raw_spin_unlock(&console_owner_lock);
+
/*
- * If we can't use the console, we need to release the console
- * semaphore by hand to avoid flushing the buffer. We need to hold the
- * console semaphore in order to do this test safely.
+ * If there is an active printk() writing to the
+ * consoles, instead of having it write our data too,
+ * see if we can offload that load from the active
+ * printer, and do some printing ourselves.
+ * Go into a spin only if there isn't already a waiter
+ * spinning, and there is an active printer, and
+ * that active printer isn't us (recursive printk?).
*/
- if (!can_use_console(cpu)) {
- console_locked = 0;
- up_console_sem();
+ if (!spin) {
+ printk_safe_exit_irqrestore(flags);
return 0;
}
+
+ /* We spin waiting for the owner to release us */
+ spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_);
+ /* Owner will clear console_waiter on hand off */
+ while (READ_ONCE(console_waiter))
+ cpu_relax();
+ spin_release(&console_owner_dep_map, _THIS_IP_);
+
+ printk_safe_exit_irqrestore(flags);
+ /*
+ * The owner passed the console lock to us.
+ * Since we did not spin on console lock, annotate
+ * this as a trylock. Otherwise lockdep will
+ * complain.
+ */
+ mutex_acquire(&console_lock_dep_map, 0, 1, _THIS_IP_);
+
+ /*
+ * Update @console_may_schedule for trylock because the previous
+ * owner may have been schedulable.
+ */
+ console_may_schedule = 0;
+
return 1;
}
+/*
+ * Recursion is tracked separately on each CPU. If NMIs are supported, an
+ * additional NMI context per CPU is also separately tracked. Until per-CPU
+ * is available, a separate "early tracking" is performed.
+ */
+static DEFINE_PER_CPU(u8, printk_count);
+static u8 printk_count_early;
+#ifdef CONFIG_HAVE_NMI
+static DEFINE_PER_CPU(u8, printk_count_nmi);
+static u8 printk_count_nmi_early;
+#endif
+
+/*
+ * Recursion is limited to keep the output sane. printk() should not require
+ * more than 1 level of recursion (allowing, for example, printk() to trigger
+ * a WARN), but a higher value is used in case some printk-internal errors
+ * exist, such as the ringbuffer validation checks failing.
+ */
+#define PRINTK_MAX_RECURSION 3
+
+/*
+ * Return a pointer to the dedicated counter for the CPU+context of the
+ * caller.
+ */
+static u8 *__printk_recursion_counter(void)
+{
+#ifdef CONFIG_HAVE_NMI
+ if (in_nmi()) {
+ if (printk_percpu_data_ready())
+ return this_cpu_ptr(&printk_count_nmi);
+ return &printk_count_nmi_early;
+ }
+#endif
+ if (printk_percpu_data_ready())
+ return this_cpu_ptr(&printk_count);
+ return &printk_count_early;
+}
+
+/*
+ * Enter recursion tracking. Interrupts are disabled to simplify tracking.
+ * The caller must check the boolean return value to see if the recursion is
+ * allowed. On failure, interrupts are not disabled.
+ *
+ * @recursion_ptr must be a variable of type (u8 *) and is the same variable
+ * that is passed to printk_exit_irqrestore().
+ */
+#define printk_enter_irqsave(recursion_ptr, flags) \
+({ \
+ bool success = true; \
+ \
+ typecheck(u8 *, recursion_ptr); \
+ local_irq_save(flags); \
+ (recursion_ptr) = __printk_recursion_counter(); \
+ if (*(recursion_ptr) > PRINTK_MAX_RECURSION) { \
+ local_irq_restore(flags); \
+ success = false; \
+ } else { \
+ (*(recursion_ptr))++; \
+ } \
+ success; \
+})
+
+/* Exit recursion tracking, restoring interrupts. */
+#define printk_exit_irqrestore(recursion_ptr, flags) \
+ do { \
+ typecheck(u8 *, recursion_ptr); \
+ (*(recursion_ptr))--; \
+ local_irq_restore(flags); \
+ } while (0)
+
int printk_delay_msec __read_mostly;
-static inline void printk_delay(void)
+static inline void printk_delay(int level)
{
+ boot_delay_msec(level);
+
if (unlikely(printk_delay_msec)) {
int m = printk_delay_msec;
@@ -1506,402 +2131,343 @@ static inline void printk_delay(void)
}
}
-/*
- * Continuation lines are buffered, and not committed to the record buffer
- * until the line is complete, or a race forces it. The line fragments
- * though, are printed immediately to the consoles to ensure everything has
- * reached the console in case of a kernel crash.
- */
-static struct cont {
- char buf[LOG_LINE_MAX];
- size_t len; /* length == 0 means unused buffer */
- size_t cons; /* bytes written to console */
- struct task_struct *owner; /* task of first print*/
- u64 ts_nsec; /* time of first print */
- u8 level; /* log level of first message */
- u8 facility; /* log facility of first message */
- enum log_flags flags; /* prefix, newline flags */
- bool flushed:1; /* buffer sealed and committed */
-} cont;
-
-static void cont_flush(enum log_flags flags)
-{
- if (cont.flushed)
- return;
- if (cont.len == 0)
- return;
-
- if (cont.cons) {
- /*
- * If a fragment of this line was directly flushed to the
- * console; wait for the console to pick up the rest of the
- * line. LOG_NOCONS suppresses a duplicated output.
- */
- log_store(cont.facility, cont.level, flags | LOG_NOCONS,
- cont.ts_nsec, NULL, 0, cont.buf, cont.len);
- cont.flags = flags;
- cont.flushed = true;
- } else {
- /*
- * If no fragment of this line ever reached the console,
- * just submit it to the store and free the buffer.
- */
- log_store(cont.facility, cont.level, flags, 0,
- NULL, 0, cont.buf, cont.len);
- cont.len = 0;
- }
+static inline u32 printk_caller_id(void)
+{
+ return in_task() ? task_pid_nr(current) :
+ 0x80000000 + smp_processor_id();
}
-static bool cont_add(int facility, int level, const char *text, size_t len)
+/**
+ * printk_parse_prefix - Parse level and control flags.
+ *
+ * @text: The terminated text message.
+ * @level: A pointer to the current level value, will be updated.
+ * @flags: A pointer to the current printk_info flags, will be updated.
+ *
+ * @level may be NULL if the caller is not interested in the parsed value.
+ * Otherwise the variable pointed to by @level must be set to
+ * LOGLEVEL_DEFAULT in order to be updated with the parsed value.
+ *
+ * @flags may be NULL if the caller is not interested in the parsed value.
+ * Otherwise the variable pointed to by @flags will be OR'd with the parsed
+ * value.
+ *
+ * Return: The length of the parsed level and control flags.
+ */
+u16 printk_parse_prefix(const char *text, int *level,
+ enum printk_info_flags *flags)
{
- if (cont.len && cont.flushed)
- return false;
+ u16 prefix_len = 0;
+ int kern_level;
- if (cont.len + len > sizeof(cont.buf)) {
- /* the line gets too long, split it up in separate records */
- cont_flush(LOG_CONT);
- return false;
- }
-
- if (!cont.len) {
- cont.facility = facility;
- cont.level = level;
- cont.owner = current;
- cont.ts_nsec = local_clock();
- cont.flags = 0;
- cont.cons = 0;
- cont.flushed = false;
- }
+ while (*text) {
+ kern_level = printk_get_level(text);
+ if (!kern_level)
+ break;
- memcpy(cont.buf + cont.len, text, len);
- cont.len += len;
+ switch (kern_level) {
+ case '0' ... '7':
+ if (level && *level == LOGLEVEL_DEFAULT)
+ *level = kern_level - '0';
+ break;
+ case 'c': /* KERN_CONT */
+ if (flags)
+ *flags |= LOG_CONT;
+ }
- if (cont.len > (sizeof(cont.buf) * 80) / 100)
- cont_flush(LOG_CONT);
+ prefix_len += 2;
+ text += 2;
+ }
- return true;
+ return prefix_len;
}
-static size_t cont_print_text(char *text, size_t size)
+__printf(5, 0)
+static u16 printk_sprint(char *text, u16 size, int facility,
+ enum printk_info_flags *flags, const char *fmt,
+ va_list args)
{
- size_t textlen = 0;
- size_t len;
+ u16 text_len;
- if (cont.cons == 0 && (console_prev & LOG_NEWLINE)) {
- textlen += print_time(cont.ts_nsec, text);
- size -= textlen;
- }
+ text_len = vscnprintf(text, size, fmt, args);
- len = cont.len - cont.cons;
- if (len > 0) {
- if (len+1 > size)
- len = size-1;
- memcpy(text + textlen, cont.buf + cont.cons, len);
- textlen += len;
- cont.cons = cont.len;
+ /* Mark and strip a trailing newline. */
+ if (text_len && text[text_len - 1] == '\n') {
+ text_len--;
+ *flags |= LOG_NEWLINE;
}
- if (cont.flushed) {
- if (cont.flags & LOG_NEWLINE)
- text[textlen++] = '\n';
- /* got everything, release buffer */
- cont.len = 0;
+ /* Strip log level and control flags. */
+ if (facility == 0) {
+ u16 prefix_len;
+
+ prefix_len = printk_parse_prefix(text, NULL, NULL);
+ if (prefix_len) {
+ text_len -= prefix_len;
+ memmove(text, text + prefix_len, text_len);
+ }
}
- return textlen;
+
+ trace_console(text, text_len);
+
+ return text_len;
}
-asmlinkage int vprintk_emit(int facility, int level,
- const char *dict, size_t dictlen,
- const char *fmt, va_list args)
+__printf(4, 0)
+int vprintk_store(int facility, int level,
+ const struct dev_printk_info *dev_info,
+ const char *fmt, va_list args)
{
- static int recursion_bug;
- static char textbuf[LOG_LINE_MAX];
- char *text = textbuf;
- size_t text_len = 0;
- enum log_flags lflags = 0;
- unsigned long flags;
- int this_cpu;
- int printed_len = 0;
- bool in_sched = false;
- /* cpu currently holding logbuf_lock in this function */
- static unsigned int logbuf_cpu = UINT_MAX;
+ struct prb_reserved_entry e;
+ enum printk_info_flags flags = 0;
+ struct printk_record r;
+ unsigned long irqflags;
+ u16 trunc_msg_len = 0;
+ char prefix_buf[8];
+ u8 *recursion_ptr;
+ u16 reserve_size;
+ va_list args2;
+ u32 caller_id;
+ u16 text_len;
+ int ret = 0;
+ u64 ts_nsec;
- if (level == LOGLEVEL_SCHED) {
- level = LOGLEVEL_DEFAULT;
- in_sched = true;
- }
+ if (!printk_enter_irqsave(recursion_ptr, irqflags))
+ return 0;
- boot_delay_msec(level);
- printk_delay();
+ /*
+ * Since the duration of printk() can vary depending on the message
+ * and state of the ringbuffer, grab the timestamp now so that it is
+ * close to the call of printk(). This provides a more deterministic
+ * timestamp with respect to the caller.
+ */
+ ts_nsec = local_clock();
- /* This stops the holder of console_sem just where we want him */
- local_irq_save(flags);
- this_cpu = smp_processor_id();
+ caller_id = printk_caller_id();
/*
- * Ouch, printk recursed into itself!
+ * The sprintf needs to come first since the syslog prefix might be
+ * passed in as a parameter. An extra byte must be reserved so that
+ * later the vscnprintf() into the reserved buffer has room for the
+ * terminating '\0', which is not counted by vsnprintf().
*/
- if (unlikely(logbuf_cpu == this_cpu)) {
- /*
- * If a crash is occurring during printk() on this CPU,
- * then try to get the crash message out but make sure
- * we can't deadlock. Otherwise just return to avoid the
- * recursion and return - but flag the recursion so that
- * it can be printed at the next appropriate moment:
- */
- if (!oops_in_progress && !lockdep_recursing(current)) {
- recursion_bug = 1;
- local_irq_restore(flags);
- return 0;
- }
- zap_locks();
- }
+ va_copy(args2, args);
+ reserve_size = vsnprintf(&prefix_buf[0], sizeof(prefix_buf), fmt, args2) + 1;
+ va_end(args2);
- lockdep_off();
- raw_spin_lock(&logbuf_lock);
- logbuf_cpu = this_cpu;
+ if (reserve_size > PRINTKRB_RECORD_MAX)
+ reserve_size = PRINTKRB_RECORD_MAX;
- if (unlikely(recursion_bug)) {
- static const char recursion_msg[] =
- "BUG: recent printk recursion!";
+ /* Extract log level or control flags. */
+ if (facility == 0)
+ printk_parse_prefix(&prefix_buf[0], &level, &flags);
+
+ if (level == LOGLEVEL_DEFAULT)
+ level = default_message_loglevel;
- recursion_bug = 0;
- /* emit KERN_CRIT message */
- printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0,
- NULL, 0, recursion_msg,
- strlen(recursion_msg));
+ if (dev_info)
+ flags |= LOG_NEWLINE;
+
+ if (is_printk_force_console())
+ flags |= LOG_FORCE_CON;
+
+ if (flags & LOG_CONT) {
+ prb_rec_init_wr(&r, reserve_size);
+ if (prb_reserve_in_last(&e, prb, &r, caller_id, PRINTKRB_RECORD_MAX)) {
+ text_len = printk_sprint(&r.text_buf[r.info->text_len], reserve_size,
+ facility, &flags, fmt, args);
+ r.info->text_len += text_len;
+
+ if (flags & LOG_FORCE_CON)
+ r.info->flags |= LOG_FORCE_CON;
+
+ if (flags & LOG_NEWLINE) {
+ r.info->flags |= LOG_NEWLINE;
+ prb_final_commit(&e);
+ } else {
+ prb_commit(&e);
+ }
+
+ ret = text_len;
+ goto out;
+ }
}
/*
- * The printf needs to come first; we need the syslog
- * prefix which might be passed-in as a parameter.
+ * Explicitly initialize the record before every prb_reserve() call.
+ * prb_reserve_in_last() and prb_reserve() purposely invalidate the
+ * structure when they fail.
*/
- text_len = vscnprintf(text, sizeof(textbuf), fmt, args);
+ prb_rec_init_wr(&r, reserve_size);
+ if (!prb_reserve(&e, prb, &r)) {
+ /* truncate the message if it is too long for empty buffer */
+ truncate_msg(&reserve_size, &trunc_msg_len);
- /* mark and strip a trailing newline */
- if (text_len && text[text_len-1] == '\n') {
- text_len--;
- lflags |= LOG_NEWLINE;
+ prb_rec_init_wr(&r, reserve_size + trunc_msg_len);
+ if (!prb_reserve(&e, prb, &r))
+ goto out;
}
- /* strip kernel syslog prefix and extract log level or control flags */
- if (facility == 0) {
- int kern_level = printk_get_level(text);
-
- if (kern_level) {
- const char *end_of_header = printk_skip_level(text);
- switch (kern_level) {
- case '0' ... '7':
- if (level == LOGLEVEL_DEFAULT)
- level = kern_level - '0';
- /* fallthrough */
- case 'd': /* KERN_DEFAULT */
- lflags |= LOG_PREFIX;
- }
- /*
- * No need to check length here because vscnprintf
- * put '\0' at the end of the string. Only valid and
- * newly printed level is detected.
- */
- text_len -= end_of_header - text;
- text = (char *)end_of_header;
- }
+ /* fill message */
+ text_len = printk_sprint(&r.text_buf[0], reserve_size, facility, &flags, fmt, args);
+ if (trunc_msg_len)
+ memcpy(&r.text_buf[text_len], trunc_msg, trunc_msg_len);
+ r.info->text_len = text_len + trunc_msg_len;
+ r.info->facility = facility;
+ r.info->level = level & 7;
+ r.info->flags = flags & 0x1f;
+ r.info->ts_nsec = ts_nsec;
+ r.info->caller_id = caller_id;
+ if (dev_info)
+ memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info));
+
+ /* A message without a trailing newline can be continued. */
+ if (!(flags & LOG_NEWLINE))
+ prb_commit(&e);
+ else
+ prb_final_commit(&e);
+
+ ret = text_len + trunc_msg_len;
+out:
+ printk_exit_irqrestore(recursion_ptr, irqflags);
+ return ret;
+}
+
+/*
+ * This acts as a one-way switch to allow legacy consoles to print from
+ * the printk() caller context on a panic CPU. It also attempts to flush
+ * the legacy consoles in this context.
+ */
+void printk_legacy_allow_panic_sync(void)
+{
+ struct console_flush_type ft;
+
+ legacy_allow_panic_sync = true;
+
+ printk_get_console_flush_type(&ft);
+ if (ft.legacy_direct) {
+ if (console_trylock())
+ console_unlock();
}
+}
- if (level == LOGLEVEL_DEFAULT)
- level = default_message_loglevel;
+bool __read_mostly debug_non_panic_cpus;
- if (dict)
- lflags |= LOG_PREFIX|LOG_NEWLINE;
+#ifdef CONFIG_PRINTK_CALLER
+static int __init debug_non_panic_cpus_setup(char *str)
+{
+ debug_non_panic_cpus = true;
+ pr_info("allow messages from non-panic CPUs in panic()\n");
- if (!(lflags & LOG_NEWLINE)) {
- /*
- * Flush the conflicting buffer. An earlier newline was missing,
- * or another task also prints continuation lines.
- */
- if (cont.len && (lflags & LOG_PREFIX || cont.owner != current))
- cont_flush(LOG_NEWLINE);
+ return 0;
+}
+early_param("debug_non_panic_cpus", debug_non_panic_cpus_setup);
+module_param(debug_non_panic_cpus, bool, 0644);
+MODULE_PARM_DESC(debug_non_panic_cpus,
+ "allow messages from non-panic CPUs in panic()");
+#endif
- /* buffer line if possible, otherwise store it right away */
- if (cont_add(facility, level, text, text_len))
- printed_len += text_len;
- else
- printed_len += log_store(facility, level,
- lflags | LOG_CONT, 0,
- dict, dictlen, text, text_len);
- } else {
- bool stored = false;
+asmlinkage int vprintk_emit(int facility, int level,
+ const struct dev_printk_info *dev_info,
+ const char *fmt, va_list args)
+{
+ struct console_flush_type ft;
+ int printed_len;
- /*
- * If an earlier newline was missing and it was the same task,
- * either merge it with the current buffer and flush, or if
- * there was a race with interrupts (prefix == true) then just
- * flush it out and store this line separately.
- * If the preceding printk was from a different task and missed
- * a newline, flush and append the newline.
- */
- if (cont.len) {
- if (cont.owner == current && !(lflags & LOG_PREFIX))
- stored = cont_add(facility, level, text,
- text_len);
- cont_flush(LOG_NEWLINE);
- }
+ /* Suppress unimportant messages after panic happens */
+ if (unlikely(suppress_printk))
+ return 0;
- if (stored)
- printed_len += text_len;
- else
- printed_len += log_store(facility, level, lflags, 0,
- dict, dictlen, text, text_len);
- }
+ /*
+ * The messages on the panic CPU are the most important. If
+ * non-panic CPUs are generating any messages, they will be
+ * silently dropped.
+ */
+ if (panic_on_other_cpu() &&
+ !debug_non_panic_cpus &&
+ !panic_triggering_all_cpu_backtrace)
+ return 0;
- logbuf_cpu = UINT_MAX;
- raw_spin_unlock(&logbuf_lock);
- lockdep_on();
- local_irq_restore(flags);
+ printk_get_console_flush_type(&ft);
/* If called from the scheduler, we can not call up(). */
- if (!in_sched) {
- lockdep_off();
+ if (level == LOGLEVEL_SCHED) {
+ level = LOGLEVEL_DEFAULT;
+ ft.legacy_offload |= ft.legacy_direct && !console_irqwork_blocked;
+ ft.legacy_direct = false;
+ }
+
+ printk_delay(level);
+
+ printed_len = vprintk_store(facility, level, dev_info, fmt, args);
+
+ if (ft.nbcon_atomic)
+ nbcon_atomic_flush_pending();
+
+ if (ft.nbcon_offload)
+ nbcon_kthreads_wake();
+
+ if (ft.legacy_direct) {
/*
- * Disable preemption to avoid being preempted while holding
- * console_sem which would prevent anyone from printing to
- * console
+ * The caller may be holding system-critical or
+ * timing-sensitive locks. Disable preemption during
+ * printing of all remaining records to all consoles so that
+ * this context can return as soon as possible. Hopefully
+ * another printk() caller will take over the printing.
*/
preempt_disable();
-
/*
* Try to acquire and then immediately release the console
- * semaphore. The release will print out buffers and wake up
- * /dev/kmsg and syslog() users.
+ * semaphore. The release will print out buffers. With the
+ * spinning variant, this context tries to take over the
+ * printing from another printing context.
*/
- if (console_trylock_for_printk())
+ if (console_trylock_spinning())
console_unlock();
preempt_enable();
- lockdep_on();
}
+ if (ft.legacy_offload)
+ defer_console_output();
+ else if (!console_irqwork_blocked)
+ wake_up_klogd();
+
return printed_len;
}
EXPORT_SYMBOL(vprintk_emit);
-asmlinkage int vprintk(const char *fmt, va_list args)
-{
- return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
-}
-EXPORT_SYMBOL(vprintk);
-
-asmlinkage int printk_emit(int facility, int level,
- const char *dict, size_t dictlen,
- const char *fmt, ...)
-{
- va_list args;
- int r;
-
- va_start(args, fmt);
- r = vprintk_emit(facility, level, dict, dictlen, fmt, args);
- va_end(args);
-
- return r;
-}
-EXPORT_SYMBOL(printk_emit);
-
int vprintk_default(const char *fmt, va_list args)
{
- int r;
-
-#ifdef CONFIG_KGDB_KDB
- if (unlikely(kdb_trap_printk)) {
- r = vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args);
- return r;
- }
-#endif
- r = vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
-
- return r;
+ return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, fmt, args);
}
EXPORT_SYMBOL_GPL(vprintk_default);
-/*
- * This allows printk to be diverted to another function per cpu.
- * This is useful for calling printk functions from within NMI
- * without worrying about race conditions that can lock up the
- * box.
- */
-DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
-
-/**
- * printk - print a kernel message
- * @fmt: format string
- *
- * This is printk(). It can be called from any context. We want it to work.
- *
- * We try to grab the console_lock. If we succeed, it's easy - we log the
- * output and call the console drivers. If we fail to get the semaphore, we
- * place the output into the log buffer and return. The current holder of
- * the console_sem will notice the new output in console_unlock(); and will
- * send it to the consoles before releasing the lock.
- *
- * One effect of this deferred printing is that code which calls printk() and
- * then changes console_loglevel may break. This is because console_loglevel
- * is inspected when the actual printing occurs.
- *
- * See also:
- * printf(3)
- *
- * See the vsnprintf() documentation for format string extensions over C99.
- */
-asmlinkage __visible int printk(const char *fmt, ...)
+asmlinkage __visible int _printk(const char *fmt, ...)
{
- printk_func_t vprintk_func;
va_list args;
int r;
va_start(args, fmt);
-
- /*
- * If a caller overrides the per_cpu printk_func, then it needs
- * to disable preemption when calling printk(). Otherwise
- * the printk_func should be set to the default. No need to
- * disable preemption here.
- */
- vprintk_func = this_cpu_read(printk_func);
- r = vprintk_func(fmt, args);
-
+ r = vprintk(fmt, args);
va_end(args);
return r;
}
-EXPORT_SYMBOL(printk);
+EXPORT_SYMBOL(_printk);
+
+static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress);
#else /* CONFIG_PRINTK */
-#define LOG_LINE_MAX 0
-#define PREFIX_MAX 0
+#define printk_time false
+
+#define prb_read_valid(rb, seq, r) false
+#define prb_first_valid_seq(rb) 0
+#define prb_next_seq(rb) 0
static u64 syslog_seq;
-static u32 syslog_idx;
-static u64 console_seq;
-static u32 console_idx;
-static enum log_flags syslog_prev;
-static u64 log_first_seq;
-static u32 log_first_idx;
-static u64 log_next_seq;
-static enum log_flags console_prev;
-static struct cont {
- size_t len;
- size_t cons;
- u8 level;
- bool flushed:1;
-} cont;
-static struct printk_log *log_from_idx(u32 idx) { return NULL; }
-static u32 log_next(u32 idx) { return 0; }
-static void call_console_drivers(int level, const char *text, size_t len) {}
-static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev,
- bool syslog, char *buf, size_t size) { return 0; }
-static size_t cont_print_text(char *text, size_t size) { return 0; }
-
-/* Still needs to be defined for users */
-DEFINE_PER_CPU(printk_func_t, printk_func);
+
+static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { return true; }
#endif /* CONFIG_PRINTK */
@@ -1925,76 +2491,146 @@ asmlinkage __visible void early_printk(const char *fmt, ...)
}
#endif
-static int __add_preferred_console(char *name, int idx, char *options,
- char *brl_options)
+static void set_user_specified(struct console_cmdline *c, bool user_specified)
+{
+ if (!user_specified)
+ return;
+
+ /*
+ * @c console was defined by the user on the command line.
+ * Do not clear when added twice also by SPCR or the device tree.
+ */
+ c->user_specified = true;
+ /* At least one console defined by the user on the command line. */
+ console_set_on_cmdline = 1;
+}
+
+static int __add_preferred_console(const char *name, const short idx,
+ const char *devname, char *options,
+ char *brl_options, bool user_specified)
{
struct console_cmdline *c;
int i;
+ if (!name && !devname)
+ return -EINVAL;
+
+ /*
+ * We use a signed short index for struct console for device drivers to
+ * indicate a not yet assigned index or port. However, a negative index
+ * value is not valid when the console name and index are defined on
+ * the command line.
+ */
+ if (name && idx < 0)
+ return -EINVAL;
+
/*
* See if this tty is not yet registered, and
* if we have a slot free.
*/
for (i = 0, c = console_cmdline;
- i < MAX_CMDLINECONSOLES && c->name[0];
+ i < MAX_CMDLINECONSOLES && (c->name[0] || c->devname[0]);
i++, c++) {
- if (strcmp(c->name, name) == 0 && c->index == idx) {
+ if ((name && strcmp(c->name, name) == 0 && c->index == idx) ||
+ (devname && strcmp(c->devname, devname) == 0)) {
if (!brl_options)
- selected_console = i;
+ preferred_console = i;
+ set_user_specified(c, user_specified);
return 0;
}
}
if (i == MAX_CMDLINECONSOLES)
return -E2BIG;
if (!brl_options)
- selected_console = i;
- strlcpy(c->name, name, sizeof(c->name));
+ preferred_console = i;
+ if (name)
+ strscpy(c->name, name);
+ if (devname)
+ strscpy(c->devname, devname);
c->options = options;
+ set_user_specified(c, user_specified);
braille_set_options(c, brl_options);
c->index = idx;
return 0;
}
+
+static int __init console_msg_format_setup(char *str)
+{
+ if (!strcmp(str, "syslog"))
+ console_msg_format = MSG_FORMAT_SYSLOG;
+ if (!strcmp(str, "default"))
+ console_msg_format = MSG_FORMAT_DEFAULT;
+ return 1;
+}
+__setup("console_msg_format=", console_msg_format_setup);
+
/*
* Set up a console. Called via do_early_param() in init/main.c
* for each "console=" parameter in the boot command line.
*/
static int __init console_setup(char *str)
{
- char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for "ttyS" */
- char *s, *options, *brl_options = NULL;
+ static_assert(sizeof(console_cmdline[0].devname) >= sizeof(console_cmdline[0].name) + 4);
+ char buf[sizeof(console_cmdline[0].devname)];
+ char *brl_options = NULL;
+ char *ttyname = NULL;
+ char *devname = NULL;
+ char *options;
+ char *s;
int idx;
+ /*
+ * console="" or console=null have been suggested as a way to
+ * disable console output. Use ttynull that has been created
+ * for exactly this purpose.
+ */
+ if (str[0] == 0 || strcmp(str, "null") == 0) {
+ __add_preferred_console("ttynull", 0, NULL, NULL, NULL, true);
+ return 1;
+ }
+
if (_braille_console_setup(&str, &brl_options))
return 1;
+ /* For a DEVNAME:0.0 style console the character device is unknown early */
+ if (strchr(str, ':'))
+ devname = buf;
+ else
+ ttyname = buf;
+
/*
* Decode str into name, index, options.
*/
- if (str[0] >= '0' && str[0] <= '9') {
- strcpy(buf, "ttyS");
- strncpy(buf + 4, str, sizeof(buf) - 5);
- } else {
- strncpy(buf, str, sizeof(buf) - 1);
- }
- buf[sizeof(buf) - 1] = 0;
+ if (ttyname && isdigit(str[0]))
+ scnprintf(buf, sizeof(buf), "ttyS%s", str);
+ else
+ strscpy(buf, str);
+
options = strchr(str, ',');
if (options)
*(options++) = 0;
+
#ifdef __sparc__
if (!strcmp(str, "ttya"))
- strcpy(buf, "ttyS0");
+ strscpy(buf, "ttyS0");
if (!strcmp(str, "ttyb"))
- strcpy(buf, "ttyS1");
+ strscpy(buf, "ttyS1");
#endif
+
for (s = buf; *s; s++)
- if (isdigit(*s) || *s == ',')
+ if ((ttyname && isdigit(*s)) || *s == ',')
break;
- idx = simple_strtoul(s, NULL, 10);
+
+ /* @idx will get defined when devname matches. */
+ if (devname)
+ idx = -1;
+ else
+ idx = simple_strtoul(s, NULL, 10);
+
*s = 0;
- __add_preferred_console(buf, idx, options, brl_options);
- console_set_on_cmdline = 1;
+ __add_preferred_console(ttyname, idx, devname, options, brl_options, true);
return 1;
}
__setup("console=", console_setup);
@@ -2012,11 +2648,56 @@ __setup("console=", console_setup);
* commonly to provide a default console (ie from PROM variables) when
* the user has not supplied one.
*/
-int add_preferred_console(char *name, int idx, char *options)
+int add_preferred_console(const char *name, const short idx, char *options)
{
- return __add_preferred_console(name, idx, options, NULL);
+ return __add_preferred_console(name, idx, NULL, options, NULL, false);
}
+/**
+ * match_devname_and_update_preferred_console - Update a preferred console
+ * when matching devname is found.
+ * @devname: DEVNAME:0.0 style device name
+ * @name: Name of the corresponding console driver, e.g. "ttyS"
+ * @idx: Console index, e.g. port number.
+ *
+ * The function checks whether a device with the given @devname is
+ * preferred via the console=DEVNAME:0.0 command line option.
+ * It fills the missing console driver name and console index
+ * so that a later register_console() call could find (match)
+ * and enable this device.
+ *
+ * It might be used when a driver subsystem initializes particular
+ * devices with already known DEVNAME:0.0 style names. And it
+ * could predict which console driver name and index this device
+ * would later get associated with.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int match_devname_and_update_preferred_console(const char *devname,
+ const char *name,
+ const short idx)
+{
+ struct console_cmdline *c = console_cmdline;
+ int i;
+
+ if (!devname || !strlen(devname) || !name || !strlen(name) || idx < 0)
+ return -EINVAL;
+
+ for (i = 0; i < MAX_CMDLINECONSOLES && (c->name[0] || c->devname[0]);
+ i++, c++) {
+ if (!strcmp(devname, c->devname)) {
+ pr_info("associate the preferred console \"%s\" with \"%s%d\"\n",
+ devname, name, idx);
+ strscpy(c->name, name);
+ c->index = idx;
+ return 0;
+ }
+ }
+
+ return -ENOENT;
+}
+EXPORT_SYMBOL_GPL(match_devname_and_update_preferred_console);
+
bool console_suspend_enabled = true;
EXPORT_SYMBOL(console_suspend_enabled);
@@ -2031,60 +2712,123 @@ module_param_named(console_suspend, console_suspend_enabled,
MODULE_PARM_DESC(console_suspend, "suspend console during suspend"
" and hibernate operations");
+static bool printk_console_no_auto_verbose;
+
+void console_verbose(void)
+{
+ if (console_loglevel && !printk_console_no_auto_verbose)
+ console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH;
+}
+EXPORT_SYMBOL_GPL(console_verbose);
+
+module_param_named(console_no_auto_verbose, printk_console_no_auto_verbose, bool, 0644);
+MODULE_PARM_DESC(console_no_auto_verbose, "Disable console loglevel raise to highest on oops/panic/etc");
+
/**
- * suspend_console - suspend the console subsystem
+ * console_suspend_all - suspend the console subsystem
*
* This disables printk() while we go into suspend states
*/
-void suspend_console(void)
+void console_suspend_all(void)
{
+ struct console *con;
+
+ if (console_suspend_enabled)
+ pr_info("Suspending console(s) (use no_console_suspend to debug)\n");
+
+ /*
+ * Flush any console backlog and then avoid queueing irq_work until
+ * console_resume_all(). Until then deferred printing is no longer
+ * triggered, NBCON consoles transition to atomic flushing, and
+ * any klogd waiters are not triggered.
+ */
+ pr_flush(1000, true);
+ console_irqwork_blocked = true;
+
if (!console_suspend_enabled)
return;
- printk("Suspending console(s) (use no_console_suspend to debug)\n");
- console_lock();
- console_suspended = 1;
- up_console_sem();
+
+ console_list_lock();
+ for_each_console(con)
+ console_srcu_write_flags(con, con->flags | CON_SUSPENDED);
+ console_list_unlock();
+
+ /*
+ * Ensure that all SRCU list walks have completed. All printing
+ * contexts must be able to see that they are suspended so that it
+ * is guaranteed that all printing has stopped when this function
+ * completes.
+ */
+ synchronize_srcu(&console_srcu);
}
-void resume_console(void)
+void console_resume_all(void)
{
- if (!console_suspend_enabled)
- return;
- down_console_sem();
- console_suspended = 0;
- console_unlock();
+ struct console_flush_type ft;
+ struct console *con;
+
+ /*
+ * Allow queueing irq_work. After restoring console state, deferred
+ * printing and any klogd waiters need to be triggered in case there
+ * is now a console backlog.
+ */
+ console_irqwork_blocked = false;
+
+ if (console_suspend_enabled) {
+ console_list_lock();
+ for_each_console(con)
+ console_srcu_write_flags(con, con->flags & ~CON_SUSPENDED);
+ console_list_unlock();
+
+ /*
+ * Ensure that all SRCU list walks have completed. All printing
+ * contexts must be able to see they are no longer suspended so
+ * that they are guaranteed to wake up and resume printing.
+ */
+ synchronize_srcu(&console_srcu);
+ }
+
+ printk_get_console_flush_type(&ft);
+ if (ft.nbcon_offload)
+ nbcon_kthreads_wake();
+ if (ft.legacy_offload)
+ defer_console_output();
+ else
+ wake_up_klogd();
+
+ pr_flush(1000, true);
}
/**
* console_cpu_notify - print deferred console messages after CPU hotplug
- * @self: notifier struct
- * @action: CPU hotplug event
- * @hcpu: unused
+ * @cpu: unused
*
* If printk() is called from a CPU that is not online yet, the messages
- * will be spooled but will not show up on the console. This function is
- * called when a new CPU comes online (or fails to come up), and ensures
- * that any such output gets printed.
- */
-static int console_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
-{
- switch (action) {
- case CPU_ONLINE:
- case CPU_DEAD:
- case CPU_DOWN_FAILED:
- case CPU_UP_CANCELED:
- console_lock();
- console_unlock();
+ * will be printed on the console only if there are CON_ANYTIME consoles.
+ * This function is called when a new CPU comes online (or fails to come
+ * up) or goes offline.
+ */
+static int console_cpu_notify(unsigned int cpu)
+{
+ struct console_flush_type ft;
+
+ if (!cpuhp_tasks_frozen) {
+ printk_get_console_flush_type(&ft);
+ if (ft.nbcon_atomic)
+ nbcon_atomic_flush_pending();
+ if (ft.legacy_direct) {
+ if (console_trylock())
+ console_unlock();
+ }
}
- return NOTIFY_OK;
+ return 0;
}
/**
- * console_lock - lock the console system for exclusive use.
+ * console_lock - block the console subsystem from printing
*
- * Acquires a lock which guarantees that the caller has
- * exclusive access to the console system and the console_drivers list.
+ * Acquires a lock which guarantees that no consoles will
+ * be in or enter their write() callback.
*
* Can sleep, returns nothing.
*/
@@ -2092,30 +2836,31 @@ void console_lock(void)
{
might_sleep();
+ /* On panic, the console_lock must be left to the panic cpu. */
+ while (panic_on_other_cpu())
+ msleep(1000);
+
down_console_sem();
- if (console_suspended)
- return;
console_locked = 1;
console_may_schedule = 1;
}
EXPORT_SYMBOL(console_lock);
/**
- * console_trylock - try to lock the console system for exclusive use.
+ * console_trylock - try to block the console subsystem from printing
*
- * Try to acquire a lock which guarantees that the caller has exclusive
- * access to the console system and the console_drivers list.
+ * Try to acquire a lock which guarantees that no consoles will
+ * be in or enter their write() callback.
*
* returns 1 on success, and 0 on failure to acquire the lock.
*/
int console_trylock(void)
{
- if (down_trylock_console_sem())
+ /* On panic, the console_lock must be left to the panic cpu. */
+ if (panic_on_other_cpu())
return 0;
- if (console_suspended) {
- up_console_sem();
+ if (down_trylock_console_sem())
return 0;
- }
console_locked = 1;
console_may_schedule = 0;
return 1;
@@ -2126,150 +2871,494 @@ int is_console_locked(void)
{
return console_locked;
}
+EXPORT_SYMBOL(is_console_locked);
-static void console_cont_flush(char *text, size_t size)
+static void __console_unlock(void)
{
- unsigned long flags;
+ console_locked = 0;
+ up_console_sem();
+}
+
+#ifdef CONFIG_PRINTK
+
+/*
+ * Prepend the message in @pmsg->pbufs->outbuf. This is achieved by shifting
+ * the existing message over and inserting the scratchbuf message.
+ *
+ * @pmsg is the original printk message.
+ * @fmt is the printf format of the message which will prepend the existing one.
+ *
+ * If there is not enough space in @pmsg->pbufs->outbuf, the existing
+ * message text will be sufficiently truncated.
+ *
+ * If @pmsg->pbufs->outbuf is modified, @pmsg->outbuf_len is updated.
+ */
+__printf(2, 3)
+static void console_prepend_message(struct printk_message *pmsg, const char *fmt, ...)
+{
+ struct printk_buffers *pbufs = pmsg->pbufs;
+ const size_t scratchbuf_sz = sizeof(pbufs->scratchbuf);
+ const size_t outbuf_sz = sizeof(pbufs->outbuf);
+ char *scratchbuf = &pbufs->scratchbuf[0];
+ char *outbuf = &pbufs->outbuf[0];
+ va_list args;
size_t len;
- raw_spin_lock_irqsave(&logbuf_lock, flags);
+ va_start(args, fmt);
+ len = vscnprintf(scratchbuf, scratchbuf_sz, fmt, args);
+ va_end(args);
- if (!cont.len)
- goto out;
+ /*
+ * Make sure outbuf is sufficiently large before prepending.
+ * Keep at least the prefix when the message must be truncated.
+ * It is a rather theoretical problem when someone tries to
+ * use a minimalist buffer.
+ */
+ if (WARN_ON_ONCE(len + PRINTK_PREFIX_MAX >= outbuf_sz))
+ return;
+
+ if (pmsg->outbuf_len + len >= outbuf_sz) {
+ /* Truncate the message, but keep it terminated. */
+ pmsg->outbuf_len = outbuf_sz - (len + 1);
+ outbuf[pmsg->outbuf_len] = 0;
+ }
+
+ memmove(outbuf + len, outbuf, pmsg->outbuf_len + 1);
+ memcpy(outbuf, scratchbuf, len);
+ pmsg->outbuf_len += len;
+}
+
+/*
+ * Prepend the message in @pmsg->pbufs->outbuf with a "dropped message".
+ * @pmsg->outbuf_len is updated appropriately.
+ *
+ * @pmsg is the printk message to prepend.
+ *
+ * @dropped is the dropped count to report in the dropped message.
+ */
+void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped)
+{
+ console_prepend_message(pmsg, "** %lu printk messages dropped **\n", dropped);
+}
+
+/*
+ * Prepend the message in @pmsg->pbufs->outbuf with a "replay message".
+ * @pmsg->outbuf_len is updated appropriately.
+ *
+ * @pmsg is the printk message to prepend.
+ */
+void console_prepend_replay(struct printk_message *pmsg)
+{
+ console_prepend_message(pmsg, "** replaying previous printk message **\n");
+}
+
+/*
+ * Read and format the specified record (or a later record if the specified
+ * record is not available).
+ *
+ * @pmsg will contain the formatted result. @pmsg->pbufs must point to a
+ * struct printk_buffers.
+ *
+ * @seq is the record to read and format. If it is not available, the next
+ * valid record is read.
+ *
+ * @is_extended specifies if the message should be formatted for extended
+ * console output.
+ *
+ * @may_supress specifies if records may be skipped based on loglevel.
+ *
+ * Returns false if no record is available. Otherwise true and all fields
+ * of @pmsg are valid. (See the documentation of struct printk_message
+ * for information about the @pmsg fields.)
+ */
+bool printk_get_next_message(struct printk_message *pmsg, u64 seq,
+ bool is_extended, bool may_suppress)
+{
+ struct printk_buffers *pbufs = pmsg->pbufs;
+ const size_t scratchbuf_sz = sizeof(pbufs->scratchbuf);
+ const size_t outbuf_sz = sizeof(pbufs->outbuf);
+ char *scratchbuf = &pbufs->scratchbuf[0];
+ char *outbuf = &pbufs->outbuf[0];
+ struct printk_info info;
+ struct printk_record r;
+ size_t len = 0;
+ bool force_con;
/*
- * We still queue earlier records, likely because the console was
- * busy. The earlier ones need to be printed before this one, we
- * did not flush any fragment so far, so just let it queue up.
+ * Formatting extended messages requires a separate buffer, so use the
+ * scratch buffer to read in the ringbuffer text.
+ *
+ * Formatting normal messages is done in-place, so read the ringbuffer
+ * text directly into the output buffer.
*/
- if (console_seq < log_next_seq && !cont.cons)
+ if (is_extended)
+ prb_rec_init_rd(&r, &info, scratchbuf, scratchbuf_sz);
+ else
+ prb_rec_init_rd(&r, &info, outbuf, outbuf_sz);
+
+ if (!prb_read_valid(prb, seq, &r))
+ return false;
+
+ pmsg->seq = r.info->seq;
+ pmsg->dropped = r.info->seq - seq;
+ force_con = r.info->flags & LOG_FORCE_CON;
+
+ /*
+ * Skip records that are not forced to be printed on consoles and that
+ * has level above the console loglevel.
+ */
+ if (!force_con && may_suppress && suppress_message_printing(r.info->level))
goto out;
- len = cont_print_text(text, size);
- raw_spin_unlock(&logbuf_lock);
- stop_critical_timings();
- call_console_drivers(cont.level, text, len);
- start_critical_timings();
- local_irq_restore(flags);
- return;
+ if (is_extended) {
+ len = info_print_ext_header(outbuf, outbuf_sz, r.info);
+ len += msg_print_ext_body(outbuf + len, outbuf_sz - len,
+ &r.text_buf[0], r.info->text_len, &r.info->dev_info);
+ } else {
+ len = record_print_text(&r, console_msg_format & MSG_FORMAT_SYSLOG, printk_time);
+ }
out:
- raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+ pmsg->outbuf_len = len;
+ return true;
}
-/**
- * console_unlock - unlock the console system
+/*
+ * The legacy console always acquires a spinlock_t from its printing
+ * callback. This violates lock nesting if the caller acquired an always
+ * spinning lock (raw_spinlock_t) while invoking printk(). This is not a
+ * problem on PREEMPT_RT because legacy consoles print always from a
+ * dedicated thread and never from within printk(). Therefore we tell
+ * lockdep that a sleeping spin lock (spinlock_t) is valid here.
+ */
+#ifdef CONFIG_PREEMPT_RT
+static inline void printk_legacy_allow_spinlock_enter(void) { }
+static inline void printk_legacy_allow_spinlock_exit(void) { }
+#else
+static DEFINE_WAIT_OVERRIDE_MAP(printk_legacy_map, LD_WAIT_CONFIG);
+
+static inline void printk_legacy_allow_spinlock_enter(void)
+{
+ lock_map_acquire_try(&printk_legacy_map);
+}
+
+static inline void printk_legacy_allow_spinlock_exit(void)
+{
+ lock_map_release(&printk_legacy_map);
+}
+#endif /* CONFIG_PREEMPT_RT */
+
+/*
+ * Used as the printk buffers for non-panic, serialized console printing.
+ * This is for legacy (!CON_NBCON) as well as all boot (CON_BOOT) consoles.
+ * Its usage requires the console_lock held.
+ */
+struct printk_buffers printk_shared_pbufs;
+
+/*
+ * Print one record for the given console. The record printed is whatever
+ * record is the next available record for the given console.
*
- * Releases the console_lock which the caller holds on the console system
- * and the console driver list.
+ * @handover will be set to true if a printk waiter has taken over the
+ * console_lock, in which case the caller is no longer holding both the
+ * console_lock and the SRCU read lock. Otherwise it is set to false.
*
- * While the console_lock was held, console output may have been buffered
- * by printk(). If this is the case, console_unlock(); emits
- * the output prior to releasing the lock.
+ * @cookie is the cookie from the SRCU read lock.
*
- * If there is output waiting, we wake /dev/kmsg and syslog() users.
+ * Returns false if the given console has no next record to print, otherwise
+ * true.
*
- * console_unlock(); may be called from any context.
+ * Requires the console_lock and the SRCU read lock.
*/
-void console_unlock(void)
+static bool console_emit_next_record(struct console *con, bool *handover, int cookie)
{
- static char text[LOG_LINE_MAX + PREFIX_MAX];
- static u64 seen_seq;
+ bool is_extended = console_srcu_read_flags(con) & CON_EXTENDED;
+ char *outbuf = &printk_shared_pbufs.outbuf[0];
+ struct printk_message pmsg = {
+ .pbufs = &printk_shared_pbufs,
+ };
unsigned long flags;
- bool wake_klogd = false;
- bool retry;
- if (console_suspended) {
- up_console_sem();
- return;
+ *handover = false;
+
+ if (!printk_get_next_message(&pmsg, con->seq, is_extended, true))
+ return false;
+
+ con->dropped += pmsg.dropped;
+
+ /* Skip messages of formatted length 0. */
+ if (pmsg.outbuf_len == 0) {
+ con->seq = pmsg.seq + 1;
+ goto skip;
}
- console_may_schedule = 0;
+ if (con->dropped && !is_extended) {
+ console_prepend_dropped(&pmsg, con->dropped);
+ con->dropped = 0;
+ }
- /* flush buffered message fragment immediately to console */
- console_cont_flush(text, sizeof(text));
-again:
- for (;;) {
- struct printk_log *msg;
- size_t len;
- int level;
-
- raw_spin_lock_irqsave(&logbuf_lock, flags);
- if (seen_seq != log_next_seq) {
- wake_klogd = true;
- seen_seq = log_next_seq;
- }
+ /* Write everything out to the hardware. */
- if (console_seq < log_first_seq) {
- len = sprintf(text, "** %u printk messages dropped ** ",
- (unsigned)(log_first_seq - console_seq));
+ if (force_legacy_kthread() && !panic_in_progress()) {
+ /*
+ * With forced threading this function is in a task context
+ * (either legacy kthread or get_init_console_seq()). There
+ * is no need for concern about printk reentrance, handovers,
+ * or lockdep complaints.
+ */
- /* messages are gone, move to first one */
- console_seq = log_first_seq;
- console_idx = log_first_idx;
- console_prev = 0;
- } else {
- len = 0;
- }
+ con->write(con, outbuf, pmsg.outbuf_len);
+ con->seq = pmsg.seq + 1;
+ } else {
+ /*
+ * While actively printing out messages, if another printk()
+ * were to occur on another CPU, it may wait for this one to
+ * finish. This task can not be preempted if there is a
+ * waiter waiting to take over.
+ *
+ * Interrupts are disabled because the hand over to a waiter
+ * must not be interrupted until the hand over is completed
+ * (@console_waiter is cleared).
+ */
+ printk_safe_enter_irqsave(flags);
+ console_lock_spinning_enable();
+
+ /* Do not trace print latency. */
+ stop_critical_timings();
+
+ printk_legacy_allow_spinlock_enter();
+ con->write(con, outbuf, pmsg.outbuf_len);
+ printk_legacy_allow_spinlock_exit();
+
+ start_critical_timings();
+
+ con->seq = pmsg.seq + 1;
+
+ *handover = console_lock_spinning_disable_and_check(cookie);
+ printk_safe_exit_irqrestore(flags);
+ }
skip:
- if (console_seq == log_next_seq)
- break;
+ return true;
+}
- msg = log_from_idx(console_idx);
- if (msg->flags & LOG_NOCONS) {
- /*
- * Skip record we have buffered and already printed
- * directly to the console when we received it.
- */
- console_idx = log_next(console_idx);
- console_seq++;
- /*
- * We will get here again when we register a new
- * CON_PRINTBUFFER console. Clear the flag so we
- * will properly dump everything later.
- */
- msg->flags &= ~LOG_NOCONS;
- console_prev = msg->flags;
- goto skip;
+#else
+
+static bool console_emit_next_record(struct console *con, bool *handover, int cookie)
+{
+ *handover = false;
+ return false;
+}
+
+static inline void printk_kthreads_check_locked(void) { }
+
+#endif /* CONFIG_PRINTK */
+
+
+/*
+ * Print out one record for each console.
+ *
+ * @do_cond_resched is set by the caller. It can be true only in schedulable
+ * context.
+ *
+ * @next_seq is set to the sequence number after the last available record.
+ * The value is valid only when all usable consoles were flushed. It is
+ * when the function returns true (can do the job) and @try_again parameter
+ * is set to false, see below.
+ *
+ * @handover will be set to true if a printk waiter has taken over the
+ * console_lock, in which case the caller is no longer holding the
+ * console_lock. Otherwise it is set to false.
+ *
+ * @try_again will be set to true when it still makes sense to call this
+ * function again. The function could do the job, see the return value.
+ * And some consoles still make progress.
+ *
+ * Returns true when the function could do the job. Some consoles are usable,
+ * and there was no takeover and no panic_on_other_cpu().
+ *
+ * Requires the console_lock.
+ */
+static bool console_flush_one_record(bool do_cond_resched, u64 *next_seq, bool *handover,
+ bool *try_again)
+{
+ struct console_flush_type ft;
+ bool any_usable = false;
+ struct console *con;
+ int cookie;
+
+ *try_again = false;
+
+ printk_get_console_flush_type(&ft);
+
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(con) {
+ short flags = console_srcu_read_flags(con);
+ u64 printk_seq;
+ bool progress;
+
+ /*
+ * console_flush_one_record() is only responsible for
+ * nbcon consoles when the nbcon consoles cannot print via
+ * their atomic or threaded flushing.
+ */
+ if ((flags & CON_NBCON) && (ft.nbcon_atomic || ft.nbcon_offload))
+ continue;
+
+ if (!console_is_usable(con, flags, !do_cond_resched))
+ continue;
+ any_usable = true;
+
+ if (flags & CON_NBCON) {
+ progress = nbcon_legacy_emit_next_record(con, handover, cookie,
+ !do_cond_resched);
+ printk_seq = nbcon_seq_read(con);
+ } else {
+ progress = console_emit_next_record(con, handover, cookie);
+ printk_seq = con->seq;
}
- level = msg->level;
- len += msg_print_text(msg, console_prev, false,
- text + len, sizeof(text) - len);
- console_idx = log_next(console_idx);
- console_seq++;
- console_prev = msg->flags;
- raw_spin_unlock(&logbuf_lock);
+ /*
+ * If a handover has occurred, the SRCU read lock
+ * is already released.
+ */
+ if (*handover)
+ goto fail;
- stop_critical_timings(); /* don't trace print latency */
- call_console_drivers(level, text, len);
- start_critical_timings();
- local_irq_restore(flags);
+ /* Track the next of the highest seq flushed. */
+ if (printk_seq > *next_seq)
+ *next_seq = printk_seq;
+
+ if (!progress)
+ continue;
+
+ /*
+ * An usable console made a progress. There might still be
+ * pending messages.
+ */
+ *try_again = true;
+
+ /* Allow panic_cpu to take over the consoles safely. */
+ if (panic_on_other_cpu())
+ goto fail_srcu;
+
+ if (do_cond_resched)
+ cond_resched();
}
- console_locked = 0;
+ console_srcu_read_unlock(cookie);
- /* Release the exclusive_console once it is used */
- if (unlikely(exclusive_console))
- exclusive_console = NULL;
+ return any_usable;
- raw_spin_unlock(&logbuf_lock);
+fail_srcu:
+ console_srcu_read_unlock(cookie);
+fail:
+ *try_again = false;
+ return false;
+}
- up_console_sem();
+/*
+ * Print out all remaining records to all consoles.
+ *
+ * @do_cond_resched is set by the caller. It can be true only in schedulable
+ * context.
+ *
+ * @next_seq is set to the sequence number after the last available record.
+ * The value is valid only when this function returns true. It means that all
+ * usable consoles are completely flushed.
+ *
+ * @handover will be set to true if a printk waiter has taken over the
+ * console_lock, in which case the caller is no longer holding the
+ * console_lock. Otherwise it is set to false.
+ *
+ * Returns true when there was at least one usable console and all messages
+ * were flushed to all usable consoles. A returned false informs the caller
+ * that everything was not flushed (either there were no usable consoles or
+ * another context has taken over printing or it is a panic situation and this
+ * is not the panic CPU). Regardless the reason, the caller should assume it
+ * is not useful to immediately try again.
+ *
+ * Requires the console_lock.
+ */
+static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handover)
+{
+ bool try_again;
+ bool ret;
+
+ *next_seq = 0;
+ *handover = false;
+
+ do {
+ ret = console_flush_one_record(do_cond_resched, next_seq,
+ handover, &try_again);
+ } while (try_again);
+
+ return ret;
+}
+
+static void __console_flush_and_unlock(void)
+{
+ bool do_cond_resched;
+ bool handover;
+ bool flushed;
+ u64 next_seq;
/*
- * Someone could have filled up the buffer again, so re-check if there's
- * something to flush. In case we cannot trylock the console_sem again,
- * there's a new owner and the console_unlock() from them will do the
- * flush, no worries.
+ * Console drivers are called with interrupts disabled, so
+ * @console_may_schedule should be cleared before; however, we may
+ * end up dumping a lot of lines, for example, if called from
+ * console registration path, and should invoke cond_resched()
+ * between lines if allowable. Not doing so can cause a very long
+ * scheduling stall on a slow console leading to RCU stall and
+ * softlockup warnings which exacerbate the issue with more
+ * messages practically incapacitating the system. Therefore, create
+ * a local to use for the printing loop.
*/
- raw_spin_lock(&logbuf_lock);
- retry = console_seq != log_next_seq;
- raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+ do_cond_resched = console_may_schedule;
- if (retry && console_trylock())
- goto again;
+ do {
+ console_may_schedule = 0;
- if (wake_klogd)
- wake_up_klogd();
+ flushed = console_flush_all(do_cond_resched, &next_seq, &handover);
+ if (!handover)
+ __console_unlock();
+
+ /*
+ * Abort if there was a failure to flush all messages to all
+ * usable consoles. Either it is not possible to flush (in
+ * which case it would be an infinite loop of retrying) or
+ * another context has taken over printing.
+ */
+ if (!flushed)
+ break;
+
+ /*
+ * Some context may have added new records after
+ * console_flush_all() but before unlocking the console.
+ * Re-check if there is a new record to flush. If the trylock
+ * fails, another context is already handling the printing.
+ */
+ } while (prb_read_valid(prb, next_seq, NULL) && console_trylock());
+}
+
+/**
+ * console_unlock - unblock the legacy console subsystem from printing
+ *
+ * Releases the console_lock which the caller holds to block printing of
+ * the legacy console subsystem.
+ *
+ * While the console_lock was held, console output may have been buffered
+ * by printk(). If this is the case, console_unlock() emits the output on
+ * legacy consoles prior to releasing the lock.
+ *
+ * console_unlock(); may be called from any context.
+ */
+void console_unlock(void)
+{
+ struct console_flush_type ft;
+
+ printk_get_console_flush_type(&ft);
+ if (ft.legacy_direct)
+ __console_flush_and_unlock();
+ else
+ __console_unlock();
}
EXPORT_SYMBOL(console_unlock);
@@ -2291,13 +3380,48 @@ EXPORT_SYMBOL(console_conditional_schedule);
void console_unblank(void)
{
+ bool found_unblank = false;
struct console *c;
+ int cookie;
/*
- * console_unblank can no longer be called in interrupt context unless
- * oops_in_progress is set to 1..
+ * First check if there are any consoles implementing the unblank()
+ * callback. If not, there is no reason to continue and take the
+ * console lock, which in particular can be dangerous if
+ * @oops_in_progress is set.
+ */
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(c) {
+ if (!console_is_usable(c, console_srcu_read_flags(c), true))
+ continue;
+
+ if (c->unblank) {
+ found_unblank = true;
+ break;
+ }
+ }
+ console_srcu_read_unlock(cookie);
+ if (!found_unblank)
+ return;
+
+ /*
+ * Stop console printing because the unblank() callback may
+ * assume the console is not within its write() callback.
+ *
+ * If @oops_in_progress is set, this may be an atomic context.
+ * In that case, attempt a trylock as best-effort.
*/
if (oops_in_progress) {
+ /* Semaphores are not NMI-safe. */
+ if (in_nmi())
+ return;
+
+ /*
+ * Attempting to trylock the console lock can deadlock
+ * if another CPU was stopped while modifying the
+ * semaphore. "Hope and pray" that this is not the
+ * current situation.
+ */
if (down_trylock_console_sem() != 0)
return;
} else
@@ -2305,10 +3429,96 @@ void console_unblank(void)
console_locked = 1;
console_may_schedule = 0;
- for_each_console(c)
- if ((c->flags & CON_ENABLED) && c->unblank)
+
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(c) {
+ if (!console_is_usable(c, console_srcu_read_flags(c), true))
+ continue;
+
+ if (c->unblank)
c->unblank();
+ }
+ console_srcu_read_unlock(cookie);
+
console_unlock();
+
+ if (!oops_in_progress)
+ pr_flush(1000, true);
+}
+
+/*
+ * Rewind all consoles to the oldest available record.
+ *
+ * IMPORTANT: The function is safe only when called under
+ * console_lock(). It is not enforced because
+ * it is used as a best effort in panic().
+ */
+static void __console_rewind_all(void)
+{
+ struct console *c;
+ short flags;
+ int cookie;
+ u64 seq;
+
+ seq = prb_first_valid_seq(prb);
+
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(c) {
+ flags = console_srcu_read_flags(c);
+
+ if (flags & CON_NBCON) {
+ nbcon_seq_force(c, seq);
+ } else {
+ /*
+ * This assignment is safe only when called under
+ * console_lock(). On panic, legacy consoles are
+ * only best effort.
+ */
+ c->seq = seq;
+ }
+ }
+ console_srcu_read_unlock(cookie);
+}
+
+/**
+ * console_flush_on_panic - flush console content on panic
+ * @mode: flush all messages in buffer or just the pending ones
+ *
+ * Immediately output all pending messages no matter what.
+ */
+void console_flush_on_panic(enum con_flush_mode mode)
+{
+ struct console_flush_type ft;
+ bool handover;
+ u64 next_seq;
+
+ /*
+ * Ignore the console lock and flush out the messages. Attempting a
+ * trylock would not be useful because:
+ *
+ * - if it is contended, it must be ignored anyway
+ * - console_lock() and console_trylock() block and fail
+ * respectively in panic for non-panic CPUs
+ * - semaphores are not NMI-safe
+ */
+
+ /*
+ * If another context is holding the console lock,
+ * @console_may_schedule might be set. Clear it so that
+ * this context does not call cond_resched() while flushing.
+ */
+ console_may_schedule = 0;
+
+ if (mode == CONSOLE_REPLAY_ALL)
+ __console_rewind_all();
+
+ printk_get_console_flush_type(&ft);
+ if (ft.nbcon_atomic)
+ nbcon_atomic_flush_pending();
+
+ /* Flush legacy consoles once allowed, even when dangerous. */
+ if (legacy_allow_panic_sync)
+ console_flush_all(false, &next_seq, &handover);
}
/*
@@ -2318,39 +3528,297 @@ struct tty_driver *console_device(int *index)
{
struct console *c;
struct tty_driver *driver = NULL;
+ int cookie;
+ /*
+ * Take console_lock to serialize device() callback with
+ * other console operations. For example, fg_console is
+ * modified under console_lock when switching vt.
+ */
console_lock();
- for_each_console(c) {
+
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(c) {
if (!c->device)
continue;
driver = c->device(c, index);
if (driver)
break;
}
+ console_srcu_read_unlock(cookie);
+
console_unlock();
return driver;
}
/*
* Prevent further output on the passed console device so that (for example)
- * serial drivers can disable console output before suspending a port, and can
+ * serial drivers can suspend console output before suspending a port, and can
* re-enable output afterwards.
*/
-void console_stop(struct console *console)
+void console_suspend(struct console *console)
{
- console_lock();
- console->flags &= ~CON_ENABLED;
- console_unlock();
+ __pr_flush(console, 1000, true);
+ console_list_lock();
+ console_srcu_write_flags(console, console->flags & ~CON_ENABLED);
+ console_list_unlock();
+
+ /*
+ * Ensure that all SRCU list walks have completed. All contexts must
+ * be able to see that this console is disabled so that (for example)
+ * the caller can suspend the port without risk of another context
+ * using the port.
+ */
+ synchronize_srcu(&console_srcu);
}
-EXPORT_SYMBOL(console_stop);
+EXPORT_SYMBOL(console_suspend);
-void console_start(struct console *console)
+void console_resume(struct console *console)
{
- console_lock();
- console->flags |= CON_ENABLED;
- console_unlock();
+ struct console_flush_type ft;
+ bool is_nbcon;
+
+ console_list_lock();
+ console_srcu_write_flags(console, console->flags | CON_ENABLED);
+ is_nbcon = console->flags & CON_NBCON;
+ console_list_unlock();
+
+ /*
+ * Ensure that all SRCU list walks have completed. The related
+ * printing context must be able to see it is enabled so that
+ * it is guaranteed to wake up and resume printing.
+ */
+ synchronize_srcu(&console_srcu);
+
+ printk_get_console_flush_type(&ft);
+ if (is_nbcon && ft.nbcon_offload)
+ nbcon_kthread_wake(console);
+ else if (ft.legacy_offload)
+ defer_console_output();
+
+ __pr_flush(console, 1000, true);
}
-EXPORT_SYMBOL(console_start);
+EXPORT_SYMBOL(console_resume);
+
+#ifdef CONFIG_PRINTK
+static int unregister_console_locked(struct console *console);
+
+/* True when system boot is far enough to create printer threads. */
+bool printk_kthreads_ready __ro_after_init;
+
+static struct task_struct *printk_legacy_kthread;
+
+static bool legacy_kthread_should_wakeup(void)
+{
+ struct console_flush_type ft;
+ struct console *con;
+ bool ret = false;
+ int cookie;
+
+ if (kthread_should_stop())
+ return true;
+
+ printk_get_console_flush_type(&ft);
+
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(con) {
+ short flags = console_srcu_read_flags(con);
+ u64 printk_seq;
+
+ /*
+ * The legacy printer thread is only responsible for nbcon
+ * consoles when the nbcon consoles cannot print via their
+ * atomic or threaded flushing.
+ */
+ if ((flags & CON_NBCON) && (ft.nbcon_atomic || ft.nbcon_offload))
+ continue;
+
+ if (!console_is_usable(con, flags, false))
+ continue;
+
+ if (flags & CON_NBCON) {
+ printk_seq = nbcon_seq_read(con);
+ } else {
+ /*
+ * It is safe to read @seq because only this
+ * thread context updates @seq.
+ */
+ printk_seq = con->seq;
+ }
+
+ if (prb_read_valid(prb, printk_seq, NULL)) {
+ ret = true;
+ break;
+ }
+ }
+ console_srcu_read_unlock(cookie);
+
+ return ret;
+}
+
+static int legacy_kthread_func(void *unused)
+{
+ bool try_again;
+
+wait_for_event:
+ wait_event_interruptible(legacy_wait, legacy_kthread_should_wakeup());
+
+ do {
+ bool handover = false;
+ u64 next_seq = 0;
+
+ if (kthread_should_stop())
+ return 0;
+
+ console_lock();
+ console_flush_one_record(true, &next_seq, &handover, &try_again);
+ if (!handover)
+ __console_unlock();
+
+ } while (try_again);
+
+ goto wait_for_event;
+}
+
+static bool legacy_kthread_create(void)
+{
+ struct task_struct *kt;
+
+ lockdep_assert_console_list_lock_held();
+
+ kt = kthread_run(legacy_kthread_func, NULL, "pr/legacy");
+ if (WARN_ON(IS_ERR(kt))) {
+ pr_err("failed to start legacy printing thread\n");
+ return false;
+ }
+
+ printk_legacy_kthread = kt;
+
+ /*
+ * It is important that console printing threads are scheduled
+ * shortly after a printk call and with generous runtime budgets.
+ */
+ sched_set_normal(printk_legacy_kthread, -20);
+
+ return true;
+}
+
+/**
+ * printk_kthreads_shutdown - shutdown all threaded printers
+ * @data: syscore context
+ *
+ * On system shutdown all threaded printers are stopped. This allows printk
+ * to transition back to atomic printing, thus providing a robust mechanism
+ * for the final shutdown/reboot messages to be output.
+ */
+static void printk_kthreads_shutdown(void *data)
+{
+ struct console *con;
+
+ console_list_lock();
+ if (printk_kthreads_running) {
+ printk_kthreads_running = false;
+
+ for_each_console(con) {
+ if (con->flags & CON_NBCON)
+ nbcon_kthread_stop(con);
+ }
+
+ /*
+ * The threads may have been stopped while printing a
+ * backlog. Flush any records left over.
+ */
+ nbcon_atomic_flush_pending();
+ }
+ console_list_unlock();
+}
+
+static const struct syscore_ops printk_syscore_ops = {
+ .shutdown = printk_kthreads_shutdown,
+};
+
+static struct syscore printk_syscore = {
+ .ops = &printk_syscore_ops,
+};
+
+/*
+ * If appropriate, start nbcon kthreads and set @printk_kthreads_running.
+ * If any kthreads fail to start, those consoles are unregistered.
+ *
+ * Must be called under console_list_lock().
+ */
+static void printk_kthreads_check_locked(void)
+{
+ struct hlist_node *tmp;
+ struct console *con;
+
+ lockdep_assert_console_list_lock_held();
+
+ if (!printk_kthreads_ready)
+ return;
+
+ /* Start or stop the legacy kthread when needed. */
+ if (have_legacy_console || have_boot_console) {
+ if (!printk_legacy_kthread &&
+ force_legacy_kthread() &&
+ !legacy_kthread_create()) {
+ /*
+ * All legacy consoles must be unregistered. If there
+ * are any nbcon consoles, they will set up their own
+ * kthread.
+ */
+ hlist_for_each_entry_safe(con, tmp, &console_list, node) {
+ if (con->flags & CON_NBCON)
+ continue;
+
+ unregister_console_locked(con);
+ }
+ }
+ } else if (printk_legacy_kthread) {
+ kthread_stop(printk_legacy_kthread);
+ printk_legacy_kthread = NULL;
+ }
+
+ /*
+ * Printer threads cannot be started as long as any boot console is
+ * registered because there is no way to synchronize the hardware
+ * registers between boot console code and regular console code.
+ * It can only be known that there will be no new boot consoles when
+ * an nbcon console is registered.
+ */
+ if (have_boot_console || !have_nbcon_console) {
+ /* Clear flag in case all nbcon consoles unregistered. */
+ printk_kthreads_running = false;
+ return;
+ }
+
+ if (printk_kthreads_running)
+ return;
+
+ hlist_for_each_entry_safe(con, tmp, &console_list, node) {
+ if (!(con->flags & CON_NBCON))
+ continue;
+
+ if (!nbcon_kthread_create(con))
+ unregister_console_locked(con);
+ }
+
+ printk_kthreads_running = true;
+}
+
+static int __init printk_set_kthreads_ready(void)
+{
+ register_syscore(&printk_syscore);
+
+ console_list_lock();
+ printk_kthreads_ready = true;
+ printk_kthreads_check_locked();
+ console_list_unlock();
+
+ return 0;
+}
+early_initcall(printk_set_kthreads_ready);
+#endif /* CONFIG_PRINTK */
static int __read_mostly keep_bootcon;
@@ -2364,6 +3832,174 @@ static int __init keep_bootcon_setup(char *str)
early_param("keep_bootcon", keep_bootcon_setup);
+static int console_call_setup(struct console *newcon, char *options)
+{
+ int err;
+
+ if (!newcon->setup)
+ return 0;
+
+ /* Synchronize with possible boot console. */
+ console_lock();
+ err = newcon->setup(newcon, options);
+ console_unlock();
+
+ return err;
+}
+
+/*
+ * This is called by register_console() to try to match
+ * the newly registered console with any of the ones selected
+ * by either the command line or add_preferred_console() and
+ * setup/enable it.
+ *
+ * Care need to be taken with consoles that are statically
+ * enabled such as netconsole
+ */
+static int try_enable_preferred_console(struct console *newcon,
+ bool user_specified)
+{
+ struct console_cmdline *c;
+ int i, err;
+
+ for (i = 0, c = console_cmdline;
+ i < MAX_CMDLINECONSOLES && (c->name[0] || c->devname[0]);
+ i++, c++) {
+ /* Console not yet initialized? */
+ if (!c->name[0])
+ continue;
+ if (c->user_specified != user_specified)
+ continue;
+ if (!newcon->match ||
+ newcon->match(newcon, c->name, c->index, c->options) != 0) {
+ /* default matching */
+ BUILD_BUG_ON(sizeof(c->name) != sizeof(newcon->name));
+ if (strcmp(c->name, newcon->name) != 0)
+ continue;
+ if (newcon->index >= 0 &&
+ newcon->index != c->index)
+ continue;
+ if (newcon->index < 0)
+ newcon->index = c->index;
+
+ if (_braille_register_console(newcon, c))
+ return 0;
+
+ err = console_call_setup(newcon, c->options);
+ if (err)
+ return err;
+ }
+ newcon->flags |= CON_ENABLED;
+ if (i == preferred_console)
+ newcon->flags |= CON_CONSDEV;
+ return 0;
+ }
+
+ /*
+ * Some consoles, such as pstore and netconsole, can be enabled even
+ * without matching. Accept the pre-enabled consoles only when match()
+ * and setup() had a chance to be called.
+ */
+ if (newcon->flags & CON_ENABLED && c->user_specified == user_specified)
+ return 0;
+
+ return -ENOENT;
+}
+
+/* Try to enable the console unconditionally */
+static void try_enable_default_console(struct console *newcon)
+{
+ if (newcon->index < 0)
+ newcon->index = 0;
+
+ if (console_call_setup(newcon, NULL) != 0)
+ return;
+
+ newcon->flags |= CON_ENABLED;
+
+ if (newcon->device)
+ newcon->flags |= CON_CONSDEV;
+}
+
+/* Return the starting sequence number for a newly registered console. */
+static u64 get_init_console_seq(struct console *newcon, bool bootcon_registered)
+{
+ struct console *con;
+ bool handover;
+ u64 init_seq;
+
+ if (newcon->flags & (CON_PRINTBUFFER | CON_BOOT)) {
+ /* Get a consistent copy of @syslog_seq. */
+ mutex_lock(&syslog_lock);
+ init_seq = syslog_seq;
+ mutex_unlock(&syslog_lock);
+ } else {
+ /* Begin with next message added to ringbuffer. */
+ init_seq = prb_next_seq(prb);
+
+ /*
+ * If any enabled boot consoles are due to be unregistered
+ * shortly, some may not be caught up and may be the same
+ * device as @newcon. Since it is not known which boot console
+ * is the same device, flush all consoles and, if necessary,
+ * start with the message of the enabled boot console that is
+ * the furthest behind.
+ */
+ if (bootcon_registered && !keep_bootcon) {
+ /*
+ * Hold the console_lock to stop console printing and
+ * guarantee safe access to console->seq.
+ */
+ console_lock();
+
+ /*
+ * Flush all consoles and set the console to start at
+ * the next unprinted sequence number.
+ */
+ if (!console_flush_all(true, &init_seq, &handover)) {
+ /*
+ * Flushing failed. Just choose the lowest
+ * sequence of the enabled boot consoles.
+ */
+
+ /*
+ * If there was a handover, this context no
+ * longer holds the console_lock.
+ */
+ if (handover)
+ console_lock();
+
+ init_seq = prb_next_seq(prb);
+ for_each_console(con) {
+ u64 seq;
+
+ if (!(con->flags & CON_BOOT) ||
+ !(con->flags & CON_ENABLED)) {
+ continue;
+ }
+
+ if (con->flags & CON_NBCON)
+ seq = nbcon_seq_read(con);
+ else
+ seq = con->seq;
+
+ if (seq < init_seq)
+ init_seq = seq;
+ }
+ }
+
+ console_unlock();
+ }
+ }
+
+ return init_seq;
+}
+
+#define console_first() \
+ hlist_entry(console_list.first, struct console, node)
+
+static int unregister_console_locked(struct console *console);
+
/*
* The console driver calls this routine during kernel initialization
* to register the console printing procedure with printk() and to
@@ -2385,136 +4021,143 @@ early_param("keep_bootcon", keep_bootcon_setup);
*/
void register_console(struct console *newcon)
{
- int i;
+ bool use_device_lock = (newcon->flags & CON_NBCON) && newcon->write_atomic;
+ bool bootcon_registered = false;
+ bool realcon_registered = false;
+ struct console *con;
unsigned long flags;
- struct console *bcon = NULL;
- struct console_cmdline *c;
+ u64 init_seq;
+ int err;
- if (console_drivers)
- for_each_console(bcon)
- if (WARN(bcon == newcon,
- "console '%s%d' already registered\n",
- bcon->name, bcon->index))
- return;
+ console_list_lock();
- /*
- * before we register a new CON_BOOT console, make sure we don't
- * already have a valid console
- */
- if (console_drivers && newcon->flags & CON_BOOT) {
- /* find the last or real console */
- for_each_console(bcon) {
- if (!(bcon->flags & CON_BOOT)) {
- pr_info("Too late to register bootconsole %s%d\n",
- newcon->name, newcon->index);
- return;
- }
+ for_each_console(con) {
+ if (WARN(con == newcon, "console '%s%d' already registered\n",
+ con->name, con->index)) {
+ goto unlock;
}
+
+ if (con->flags & CON_BOOT)
+ bootcon_registered = true;
+ else
+ realcon_registered = true;
}
- if (console_drivers && console_drivers->flags & CON_BOOT)
- bcon = console_drivers;
+ /* Do not register boot consoles when there already is a real one. */
+ if ((newcon->flags & CON_BOOT) && realcon_registered) {
+ pr_info("Too late to register bootconsole %s%d\n",
+ newcon->name, newcon->index);
+ goto unlock;
+ }
- if (preferred_console < 0 || bcon || !console_drivers)
- preferred_console = selected_console;
+ if (newcon->flags & CON_NBCON) {
+ /*
+ * Ensure the nbcon console buffers can be allocated
+ * before modifying any global data.
+ */
+ if (!nbcon_alloc(newcon))
+ goto unlock;
+ }
/*
- * See if we want to use this console driver. If we
- * didn't select a console we take the first one
- * that registers here.
+ * See if we want to enable this console driver by default.
+ *
+ * Nope when a console is preferred by the command line, device
+ * tree, or SPCR.
+ *
+ * The first real console with tty binding (driver) wins. More
+ * consoles might get enabled before the right one is found.
+ *
+ * Note that a console with tty binding will have CON_CONSDEV
+ * flag set and will be first in the list.
*/
if (preferred_console < 0) {
- if (newcon->index < 0)
- newcon->index = 0;
- if (newcon->setup == NULL ||
- newcon->setup(newcon, NULL) == 0) {
- newcon->flags |= CON_ENABLED;
- if (newcon->device) {
- newcon->flags |= CON_CONSDEV;
- preferred_console = 0;
- }
+ if (hlist_empty(&console_list) || !console_first()->device ||
+ console_first()->flags & CON_BOOT) {
+ try_enable_default_console(newcon);
}
}
- /*
- * See if this console matches one we selected on
- * the command line.
- */
- for (i = 0, c = console_cmdline;
- i < MAX_CMDLINECONSOLES && c->name[0];
- i++, c++) {
- if (!newcon->match ||
- newcon->match(newcon, c->name, c->index, c->options) != 0) {
- /* default matching */
- BUILD_BUG_ON(sizeof(c->name) != sizeof(newcon->name));
- if (strcmp(c->name, newcon->name) != 0)
- continue;
- if (newcon->index >= 0 &&
- newcon->index != c->index)
- continue;
- if (newcon->index < 0)
- newcon->index = c->index;
+ /* See if this console matches one we selected on the command line */
+ err = try_enable_preferred_console(newcon, true);
- if (_braille_register_console(newcon, c))
- return;
+ /* If not, try to match against the platform default(s) */
+ if (err == -ENOENT)
+ err = try_enable_preferred_console(newcon, false);
- if (newcon->setup &&
- newcon->setup(newcon, c->options) != 0)
- break;
- }
-
- newcon->flags |= CON_ENABLED;
- if (i == selected_console) {
- newcon->flags |= CON_CONSDEV;
- preferred_console = selected_console;
- }
- break;
+ /* printk() messages are not printed to the Braille console. */
+ if (err || newcon->flags & CON_BRL) {
+ if (newcon->flags & CON_NBCON)
+ nbcon_free(newcon);
+ goto unlock;
}
- if (!(newcon->flags & CON_ENABLED))
- return;
-
/*
* If we have a bootconsole, and are switching to a real console,
* don't print everything out again, since when the boot console, and
* the real console are the same physical device, it's annoying to
* see the beginning boot messages twice
*/
- if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV))
+ if (bootcon_registered &&
+ ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) {
newcon->flags &= ~CON_PRINTBUFFER;
+ }
+
+ newcon->dropped = 0;
+ init_seq = get_init_console_seq(newcon, bootcon_registered);
+
+ if (newcon->flags & CON_NBCON) {
+ have_nbcon_console = true;
+ nbcon_seq_force(newcon, init_seq);
+ } else {
+ have_legacy_console = true;
+ newcon->seq = init_seq;
+ }
+
+ if (newcon->flags & CON_BOOT)
+ have_boot_console = true;
/*
- * Put this console in the list - keep the
- * preferred driver at the head of the list.
+ * If another context is actively using the hardware of this new
+ * console, it will not be aware of the nbcon synchronization. This
+ * is a risk that two contexts could access the hardware
+ * simultaneously if this new console is used for atomic printing
+ * and the other context is still using the hardware.
+ *
+ * Use the driver synchronization to ensure that the hardware is not
+ * in use while this new console transitions to being registered.
*/
- console_lock();
- if ((newcon->flags & CON_CONSDEV) || console_drivers == NULL) {
- newcon->next = console_drivers;
- console_drivers = newcon;
- if (newcon->next)
- newcon->next->flags &= ~CON_CONSDEV;
+ if (use_device_lock)
+ newcon->device_lock(newcon, &flags);
+
+ /*
+ * Put this console in the list - keep the
+ * preferred driver at the head of the list.
+ */
+ if (hlist_empty(&console_list)) {
+ /* Ensure CON_CONSDEV is always set for the head. */
+ newcon->flags |= CON_CONSDEV;
+ hlist_add_head_rcu(&newcon->node, &console_list);
+
+ } else if (newcon->flags & CON_CONSDEV) {
+ /* Only the new head can have CON_CONSDEV set. */
+ console_srcu_write_flags(console_first(), console_first()->flags & ~CON_CONSDEV);
+ hlist_add_head_rcu(&newcon->node, &console_list);
+
} else {
- newcon->next = console_drivers->next;
- console_drivers->next = newcon;
+ hlist_add_behind_rcu(&newcon->node, console_list.first);
}
- if (newcon->flags & CON_PRINTBUFFER) {
- /*
- * console_unlock(); will print out the buffered messages
- * for us.
- */
- raw_spin_lock_irqsave(&logbuf_lock, flags);
- console_seq = syslog_seq;
- console_idx = syslog_idx;
- console_prev = syslog_prev;
- raw_spin_unlock_irqrestore(&logbuf_lock, flags);
- /*
- * We're about to replay the log buffer. Only do this to the
- * just-registered console to avoid excessive message spam to
- * the already-registered consoles.
- */
- exclusive_console = newcon;
- }
- console_unlock();
+
+ /*
+ * No need to synchronize SRCU here! The caller does not rely
+ * on all contexts being able to see the new console before
+ * register_console() completes.
+ */
+
+ /* This new console is now registered. */
+ if (use_device_lock)
+ newcon->device_unlock(newcon, flags);
+
console_sysfs_notify();
/*
@@ -2524,80 +4167,387 @@ void register_console(struct console *newcon)
* users know there might be something in the kernel's log buffer that
* went to the bootconsole (that they do not see on the real console)
*/
- pr_info("%sconsole [%s%d] enabled\n",
- (newcon->flags & CON_BOOT) ? "boot" : "" ,
- newcon->name, newcon->index);
- if (bcon &&
+ con_printk(KERN_INFO, newcon, "enabled\n");
+ if (bootcon_registered &&
((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) &&
!keep_bootcon) {
- /* We need to iterate through all boot consoles, to make
- * sure we print everything out, before we unregister them.
- */
- for_each_console(bcon)
- if (bcon->flags & CON_BOOT)
- unregister_console(bcon);
+ struct hlist_node *tmp;
+
+ hlist_for_each_entry_safe(con, tmp, &console_list, node) {
+ if (con->flags & CON_BOOT)
+ unregister_console_locked(con);
+ }
}
+
+ /* Changed console list, may require printer threads to start/stop. */
+ printk_kthreads_check_locked();
+unlock:
+ console_list_unlock();
}
EXPORT_SYMBOL(register_console);
-int unregister_console(struct console *console)
+/* Must be called under console_list_lock(). */
+static int unregister_console_locked(struct console *console)
{
- struct console *a, *b;
+ bool use_device_lock = (console->flags & CON_NBCON) && console->write_atomic;
+ bool found_legacy_con = false;
+ bool found_nbcon_con = false;
+ bool found_boot_con = false;
+ unsigned long flags;
+ struct console *c;
int res;
- pr_info("%sconsole [%s%d] disabled\n",
- (console->flags & CON_BOOT) ? "boot" : "" ,
- console->name, console->index);
+ lockdep_assert_console_list_lock_held();
+
+ con_printk(KERN_INFO, console, "disabled\n");
res = _braille_unregister_console(console);
- if (res)
+ if (res < 0)
return res;
+ if (res > 0)
+ return 0;
- res = 1;
- console_lock();
- if (console_drivers == console) {
- console_drivers=console->next;
- res = 0;
- } else if (console_drivers) {
- for (a=console_drivers->next, b=console_drivers ;
- a; b=a, a=b->next) {
- if (a == console) {
- b->next = a->next;
- res = 0;
- break;
- }
- }
- }
+ if (!console_is_registered_locked(console))
+ res = -ENODEV;
+ else if (console_is_usable(console, console->flags, true))
+ __pr_flush(console, 1000, true);
+
+ /* Disable it unconditionally */
+ console_srcu_write_flags(console, console->flags & ~CON_ENABLED);
+
+ if (res < 0)
+ return res;
/*
+ * Use the driver synchronization to ensure that the hardware is not
+ * in use while this console transitions to being unregistered.
+ */
+ if (use_device_lock)
+ console->device_lock(console, &flags);
+
+ hlist_del_init_rcu(&console->node);
+
+ if (use_device_lock)
+ console->device_unlock(console, flags);
+
+ /*
+ * <HISTORICAL>
* If this isn't the last console and it has CON_CONSDEV set, we
* need to set it on the next preferred console.
+ * </HISTORICAL>
+ *
+ * The above makes no sense as there is no guarantee that the next
+ * console has any device attached. Oh well....
*/
- if (console_drivers != NULL && console->flags & CON_CONSDEV)
- console_drivers->flags |= CON_CONSDEV;
+ if (!hlist_empty(&console_list) && console->flags & CON_CONSDEV)
+ console_srcu_write_flags(console_first(), console_first()->flags | CON_CONSDEV);
+
+ /*
+ * Ensure that all SRCU list walks have completed. All contexts
+ * must not be able to see this console in the list so that any
+ * exit/cleanup routines can be performed safely.
+ */
+ synchronize_srcu(&console_srcu);
+
+ /*
+ * With this console gone, the global flags tracking registered
+ * console types may have changed. Update them.
+ */
+ for_each_console(c) {
+ if (c->flags & CON_BOOT)
+ found_boot_con = true;
+
+ if (c->flags & CON_NBCON)
+ found_nbcon_con = true;
+ else
+ found_legacy_con = true;
+ }
+ if (!found_boot_con)
+ have_boot_console = found_boot_con;
+ if (!found_legacy_con)
+ have_legacy_console = found_legacy_con;
+ if (!found_nbcon_con)
+ have_nbcon_console = found_nbcon_con;
+
+ /* @have_nbcon_console must be updated before calling nbcon_free(). */
+ if (console->flags & CON_NBCON)
+ nbcon_free(console);
- console->flags &= ~CON_ENABLED;
- console_unlock();
console_sysfs_notify();
+
+ if (console->exit)
+ res = console->exit(console);
+
+ /* Changed console list, may require printer threads to start/stop. */
+ printk_kthreads_check_locked();
+
+ return res;
+}
+
+int unregister_console(struct console *console)
+{
+ int res;
+
+ console_list_lock();
+ res = unregister_console_locked(console);
+ console_list_unlock();
return res;
}
EXPORT_SYMBOL(unregister_console);
+/**
+ * console_force_preferred_locked - force a registered console preferred
+ * @con: The registered console to force preferred.
+ *
+ * Must be called under console_list_lock().
+ */
+void console_force_preferred_locked(struct console *con)
+{
+ struct console *cur_pref_con;
+
+ if (!console_is_registered_locked(con))
+ return;
+
+ cur_pref_con = console_first();
+
+ /* Already preferred? */
+ if (cur_pref_con == con)
+ return;
+
+ /*
+ * Delete, but do not re-initialize the entry. This allows the console
+ * to continue to appear registered (via any hlist_unhashed_lockless()
+ * checks), even though it was briefly removed from the console list.
+ */
+ hlist_del_rcu(&con->node);
+
+ /*
+ * Ensure that all SRCU list walks have completed so that the console
+ * can be added to the beginning of the console list and its forward
+ * list pointer can be re-initialized.
+ */
+ synchronize_srcu(&console_srcu);
+
+ con->flags |= CON_CONSDEV;
+ WARN_ON(!con->device);
+
+ /* Only the new head can have CON_CONSDEV set. */
+ console_srcu_write_flags(cur_pref_con, cur_pref_con->flags & ~CON_CONSDEV);
+ hlist_add_head_rcu(&con->node, &console_list);
+}
+EXPORT_SYMBOL(console_force_preferred_locked);
+
+/*
+ * Initialize the console device. This is called *early*, so
+ * we can't necessarily depend on lots of kernel help here.
+ * Just do some early initializations, and do the complex setup
+ * later.
+ */
+void __init console_init(void)
+{
+ int ret;
+ initcall_t call;
+ initcall_entry_t *ce;
+
+#ifdef CONFIG_NULL_TTY_DEFAULT_CONSOLE
+ if (!console_set_on_cmdline)
+ add_preferred_console("ttynull", 0, NULL);
+#endif
+
+ /* Setup the default TTY line discipline. */
+ n_tty_init();
+
+ /*
+ * set up the console device so that later boot sequences can
+ * inform about problems etc..
+ */
+ ce = __con_initcall_start;
+ trace_initcall_level("console");
+ while (ce < __con_initcall_end) {
+ call = initcall_from_entry(ce);
+ trace_initcall_start(call);
+ ret = call();
+ trace_initcall_finish(call, ret);
+ ce++;
+ }
+}
+
+/*
+ * Some boot consoles access data that is in the init section and which will
+ * be discarded after the initcalls have been run. To make sure that no code
+ * will access this data, unregister the boot consoles in a late initcall.
+ *
+ * If for some reason, such as deferred probe or the driver being a loadable
+ * module, the real console hasn't registered yet at this point, there will
+ * be a brief interval in which no messages are logged to the console, which
+ * makes it difficult to diagnose problems that occur during this time.
+ *
+ * To mitigate this problem somewhat, only unregister consoles whose memory
+ * intersects with the init section. Note that all other boot consoles will
+ * get unregistered when the real preferred console is registered.
+ */
static int __init printk_late_init(void)
{
+ struct hlist_node *tmp;
struct console *con;
+ int ret;
- for_each_console(con) {
- if (!keep_bootcon && con->flags & CON_BOOT) {
- unregister_console(con);
+ console_list_lock();
+ hlist_for_each_entry_safe(con, tmp, &console_list, node) {
+ if (!(con->flags & CON_BOOT))
+ continue;
+
+ /* Check addresses that might be used for enabled consoles. */
+ if (init_section_intersects(con, sizeof(*con)) ||
+ init_section_contains(con->write, 0) ||
+ init_section_contains(con->read, 0) ||
+ init_section_contains(con->device, 0) ||
+ init_section_contains(con->unblank, 0) ||
+ init_section_contains(con->data, 0)) {
+ /*
+ * Please, consider moving the reported consoles out
+ * of the init section.
+ */
+ pr_warn("bootconsole [%s%d] uses init memory and must be disabled even before the real one is ready\n",
+ con->name, con->index);
+ unregister_console_locked(con);
}
}
- hotcpu_notifier(console_cpu_notify, 0);
+ console_list_unlock();
+
+ ret = cpuhp_setup_state_nocalls(CPUHP_PRINTK_DEAD, "printk:dead", NULL,
+ console_cpu_notify);
+ WARN_ON(ret < 0);
+ ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "printk:online",
+ console_cpu_notify, NULL);
+ WARN_ON(ret < 0);
+ printk_sysctl_init();
return 0;
}
late_initcall(printk_late_init);
#if defined CONFIG_PRINTK
+/* If @con is specified, only wait for that console. Otherwise wait for all. */
+static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress)
+{
+ unsigned long timeout_jiffies = msecs_to_jiffies(timeout_ms);
+ unsigned long remaining_jiffies = timeout_jiffies;
+ struct console_flush_type ft;
+ struct console *c;
+ u64 last_diff = 0;
+ u64 printk_seq;
+ short flags;
+ int cookie;
+ u64 diff;
+ u64 seq;
+
+ /* Sorry, pr_flush() will not work this early. */
+ if (system_state < SYSTEM_SCHEDULING)
+ return false;
+
+ might_sleep();
+
+ seq = prb_next_reserve_seq(prb);
+
+ /* Flush the consoles so that records up to @seq are printed. */
+ printk_get_console_flush_type(&ft);
+ if (ft.nbcon_atomic)
+ nbcon_atomic_flush_pending();
+ if (ft.legacy_direct) {
+ console_lock();
+ console_unlock();
+ }
+
+ for (;;) {
+ unsigned long begin_jiffies;
+ unsigned long slept_jiffies;
+
+ diff = 0;
+
+ /*
+ * Hold the console_lock to guarantee safe access to
+ * console->seq. Releasing console_lock flushes more
+ * records in case @seq is still not printed on all
+ * usable consoles.
+ *
+ * Holding the console_lock is not necessary if there
+ * are no legacy or boot consoles. However, such a
+ * console could register at any time. Always hold the
+ * console_lock as a precaution rather than
+ * synchronizing against register_console().
+ */
+ console_lock();
+
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(c) {
+ if (con && con != c)
+ continue;
+
+ flags = console_srcu_read_flags(c);
+
+ /*
+ * If consoles are not usable, it cannot be expected
+ * that they make forward progress, so only increment
+ * @diff for usable consoles.
+ */
+ if (!console_is_usable(c, flags, true) &&
+ !console_is_usable(c, flags, false)) {
+ continue;
+ }
+
+ if (flags & CON_NBCON) {
+ printk_seq = nbcon_seq_read(c);
+ } else {
+ printk_seq = c->seq;
+ }
+
+ if (printk_seq < seq)
+ diff += seq - printk_seq;
+ }
+ console_srcu_read_unlock(cookie);
+
+ if (diff != last_diff && reset_on_progress)
+ remaining_jiffies = timeout_jiffies;
+
+ console_unlock();
+
+ /* Note: @diff is 0 if there are no usable consoles. */
+ if (diff == 0 || remaining_jiffies == 0)
+ break;
+
+ /* msleep(1) might sleep much longer. Check time by jiffies. */
+ begin_jiffies = jiffies;
+ msleep(1);
+ slept_jiffies = jiffies - begin_jiffies;
+
+ remaining_jiffies -= min(slept_jiffies, remaining_jiffies);
+
+ last_diff = diff;
+ }
+
+ return (diff == 0);
+}
+
+/**
+ * pr_flush() - Wait for printing threads to catch up.
+ *
+ * @timeout_ms: The maximum time (in ms) to wait.
+ * @reset_on_progress: Reset the timeout if forward progress is seen.
+ *
+ * A value of 0 for @timeout_ms means no waiting will occur. A value of -1
+ * represents infinite waiting.
+ *
+ * If @reset_on_progress is true, the timeout will be reset whenever any
+ * printer has been seen to make some forward progress.
+ *
+ * Context: Process context. May sleep while acquiring console lock.
+ * Return: true if all usable printers are caught up.
+ */
+bool pr_flush(int timeout_ms, bool reset_on_progress)
+{
+ return __pr_flush(NULL, timeout_ms, reset_on_progress);
+}
+
/*
* Delayed printk version, for scheduler-internal messages:
*/
@@ -2608,47 +4558,133 @@ static DEFINE_PER_CPU(int, printk_pending);
static void wake_up_klogd_work_func(struct irq_work *irq_work)
{
- int pending = __this_cpu_xchg(printk_pending, 0);
+ int pending = this_cpu_xchg(printk_pending, 0);
if (pending & PRINTK_PENDING_OUTPUT) {
- /* If trylock fails, someone else is doing the printing */
- if (console_trylock())
- console_unlock();
+ if (force_legacy_kthread()) {
+ if (printk_legacy_kthread)
+ wake_up_interruptible(&legacy_wait);
+ } else {
+ if (console_trylock())
+ console_unlock();
+ }
}
if (pending & PRINTK_PENDING_WAKEUP)
wake_up_interruptible(&log_wait);
}
-static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = {
- .func = wake_up_klogd_work_func,
- .flags = IRQ_WORK_LAZY,
-};
+static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) =
+ IRQ_WORK_INIT_LAZY(wake_up_klogd_work_func);
-void wake_up_klogd(void)
+static void __wake_up_klogd(int val)
{
+ if (!printk_percpu_data_ready())
+ return;
+
+ /*
+ * It is not allowed to call this function when console irq_work
+ * is blocked.
+ */
+ if (WARN_ON_ONCE(console_irqwork_blocked))
+ return;
+
preempt_disable();
- if (waitqueue_active(&log_wait)) {
- this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
+ /*
+ * Guarantee any new records can be seen by tasks preparing to wait
+ * before this context checks if the wait queue is empty.
+ *
+ * The full memory barrier within wq_has_sleeper() pairs with the full
+ * memory barrier within set_current_state() of
+ * prepare_to_wait_event(), which is called after ___wait_event() adds
+ * the waiter but before it has checked the wait condition.
+ *
+ * This pairs with devkmsg_read:A and syslog_print:A.
+ */
+ if (wq_has_sleeper(&log_wait) || /* LMM(__wake_up_klogd:A) */
+ (val & PRINTK_PENDING_OUTPUT)) {
+ this_cpu_or(printk_pending, val);
irq_work_queue(this_cpu_ptr(&wake_up_klogd_work));
}
preempt_enable();
}
-int printk_deferred(const char *fmt, ...)
+/**
+ * wake_up_klogd - Wake kernel logging daemon
+ *
+ * Use this function when new records have been added to the ringbuffer
+ * and the console printing of those records has already occurred or is
+ * known to be handled by some other context. This function will only
+ * wake the logging daemon.
+ *
+ * Context: Any context.
+ */
+void wake_up_klogd(void)
+{
+ __wake_up_klogd(PRINTK_PENDING_WAKEUP);
+}
+
+/**
+ * defer_console_output - Wake kernel logging daemon and trigger
+ * console printing in a deferred context
+ *
+ * Use this function when new records have been added to the ringbuffer,
+ * this context is responsible for console printing those records, but
+ * the current context is not allowed to perform the console printing.
+ * Trigger an irq_work context to perform the console printing. This
+ * function also wakes the logging daemon.
+ *
+ * Context: Any context.
+ */
+void defer_console_output(void)
+{
+ /*
+ * New messages may have been added directly to the ringbuffer
+ * using vprintk_store(), so wake any waiters as well.
+ */
+ __wake_up_klogd(PRINTK_PENDING_WAKEUP | PRINTK_PENDING_OUTPUT);
+}
+
+/**
+ * printk_trigger_flush - Attempt to flush printk buffer to consoles.
+ *
+ * If possible, flush the printk buffer to all consoles in the caller's
+ * context. If offloading is available, trigger deferred printing.
+ *
+ * This is best effort. Depending on the system state, console states,
+ * and caller context, no actual flushing may result from this call.
+ */
+void printk_trigger_flush(void)
+{
+ struct console_flush_type ft;
+
+ printk_get_console_flush_type(&ft);
+ if (ft.nbcon_atomic)
+ nbcon_atomic_flush_pending();
+ if (ft.nbcon_offload)
+ nbcon_kthreads_wake();
+ if (ft.legacy_direct) {
+ if (console_trylock())
+ console_unlock();
+ }
+ if (ft.legacy_offload)
+ defer_console_output();
+}
+
+int vprintk_deferred(const char *fmt, va_list args)
+{
+ return vprintk_emit(0, LOGLEVEL_SCHED, NULL, fmt, args);
+}
+
+int _printk_deferred(const char *fmt, ...)
{
va_list args;
int r;
- preempt_disable();
va_start(args, fmt);
- r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, 0, fmt, args);
+ r = vprintk_deferred(fmt, args);
va_end(args);
- __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT);
- irq_work_queue(this_cpu_ptr(&wake_up_klogd_work));
- preempt_enable();
-
return r;
}
@@ -2749,49 +4785,64 @@ EXPORT_SYMBOL_GPL(kmsg_dump_unregister);
static bool always_kmsg_dump;
module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR);
+const char *kmsg_dump_reason_str(enum kmsg_dump_reason reason)
+{
+ switch (reason) {
+ case KMSG_DUMP_PANIC:
+ return "Panic";
+ case KMSG_DUMP_OOPS:
+ return "Oops";
+ case KMSG_DUMP_EMERG:
+ return "Emergency";
+ case KMSG_DUMP_SHUTDOWN:
+ return "Shutdown";
+ default:
+ return "Unknown";
+ }
+}
+EXPORT_SYMBOL_GPL(kmsg_dump_reason_str);
+
/**
- * kmsg_dump - dump kernel log to kernel message dumpers.
+ * kmsg_dump_desc - dump kernel log to kernel message dumpers.
* @reason: the reason (oops, panic etc) for dumping
+ * @desc: a short string to describe what caused the panic or oops. Can be NULL
+ * if no additional description is available.
*
* Call each of the registered dumper's dump() callback, which can
* retrieve the kmsg records with kmsg_dump_get_line() or
* kmsg_dump_get_buffer().
*/
-void kmsg_dump(enum kmsg_dump_reason reason)
+void kmsg_dump_desc(enum kmsg_dump_reason reason, const char *desc)
{
struct kmsg_dumper *dumper;
- unsigned long flags;
-
- if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump)
- return;
+ struct kmsg_dump_detail detail = {
+ .reason = reason,
+ .description = desc};
rcu_read_lock();
list_for_each_entry_rcu(dumper, &dump_list, list) {
- if (dumper->max_reason && reason > dumper->max_reason)
- continue;
+ enum kmsg_dump_reason max_reason = dumper->max_reason;
- /* initialize iterator with data about the stored records */
- dumper->active = true;
-
- raw_spin_lock_irqsave(&logbuf_lock, flags);
- dumper->cur_seq = clear_seq;
- dumper->cur_idx = clear_idx;
- dumper->next_seq = log_next_seq;
- dumper->next_idx = log_next_idx;
- raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+ /*
+ * If client has not provided a specific max_reason, default
+ * to KMSG_DUMP_OOPS, unless always_kmsg_dump was set.
+ */
+ if (max_reason == KMSG_DUMP_UNDEF) {
+ max_reason = always_kmsg_dump ? KMSG_DUMP_MAX :
+ KMSG_DUMP_OOPS;
+ }
+ if (reason > max_reason)
+ continue;
/* invoke dumper which will iterate over records */
- dumper->dump(dumper, reason);
-
- /* reset iterator */
- dumper->active = false;
+ dumper->dump(dumper, &detail);
}
rcu_read_unlock();
}
/**
- * kmsg_dump_get_line_nolock - retrieve one kmsg log line (unlocked version)
- * @dumper: registered kmsg dumper
+ * kmsg_dump_get_line - retrieve one kmsg log line
+ * @iter: kmsg dump iterator
* @syslog: include the "<4>" prefixes
* @line: buffer to copy the line to
* @size: maximum size of the buffer
@@ -2805,82 +4856,56 @@ void kmsg_dump(enum kmsg_dump_reason reason)
*
* A return value of FALSE indicates that there are no more records to
* read.
- *
- * The function is similar to kmsg_dump_get_line(), but grabs no locks.
*/
-bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog,
- char *line, size_t size, size_t *len)
+bool kmsg_dump_get_line(struct kmsg_dump_iter *iter, bool syslog,
+ char *line, size_t size, size_t *len)
{
- struct printk_log *msg;
+ u64 min_seq = latched_seq_read_nolock(&clear_seq);
+ struct printk_info info;
+ unsigned int line_count;
+ struct printk_record r;
size_t l = 0;
bool ret = false;
- if (!dumper->active)
- goto out;
+ if (iter->cur_seq < min_seq)
+ iter->cur_seq = min_seq;
- if (dumper->cur_seq < log_first_seq) {
- /* messages are gone, move to first available one */
- dumper->cur_seq = log_first_seq;
- dumper->cur_idx = log_first_idx;
- }
+ prb_rec_init_rd(&r, &info, line, size);
- /* last entry */
- if (dumper->cur_seq >= log_next_seq)
- goto out;
+ /* Read text or count text lines? */
+ if (line) {
+ if (!prb_read_valid(prb, iter->cur_seq, &r))
+ goto out;
+ l = record_print_text(&r, syslog, printk_time);
+ } else {
+ if (!prb_read_valid_info(prb, iter->cur_seq,
+ &info, &line_count)) {
+ goto out;
+ }
+ l = get_record_print_text_size(&info, line_count, syslog,
+ printk_time);
- msg = log_from_idx(dumper->cur_idx);
- l = msg_print_text(msg, 0, syslog, line, size);
+ }
- dumper->cur_idx = log_next(dumper->cur_idx);
- dumper->cur_seq++;
+ iter->cur_seq = r.info->seq + 1;
ret = true;
out:
if (len)
*len = l;
return ret;
}
-
-/**
- * kmsg_dump_get_line - retrieve one kmsg log line
- * @dumper: registered kmsg dumper
- * @syslog: include the "<4>" prefixes
- * @line: buffer to copy the line to
- * @size: maximum size of the buffer
- * @len: length of line placed into buffer
- *
- * Start at the beginning of the kmsg buffer, with the oldest kmsg
- * record, and copy one record into the provided buffer.
- *
- * Consecutive calls will return the next available record moving
- * towards the end of the buffer with the youngest messages.
- *
- * A return value of FALSE indicates that there are no more records to
- * read.
- */
-bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,
- char *line, size_t size, size_t *len)
-{
- unsigned long flags;
- bool ret;
-
- raw_spin_lock_irqsave(&logbuf_lock, flags);
- ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len);
- raw_spin_unlock_irqrestore(&logbuf_lock, flags);
-
- return ret;
-}
EXPORT_SYMBOL_GPL(kmsg_dump_get_line);
/**
* kmsg_dump_get_buffer - copy kmsg log lines
- * @dumper: registered kmsg dumper
+ * @iter: kmsg dump iterator
* @syslog: include the "<4>" prefixes
* @buf: buffer to copy the line to
* @size: maximum size of the buffer
- * @len: length of line placed into buffer
+ * @len_out: length of line placed into buffer
*
* Start at the end of the kmsg buffer and fill the provided buffer
- * with as many of the the *youngest* kmsg records that fit into it.
+ * with as many of the *youngest* kmsg records that fit into it.
* If the buffer is large enough, all available kmsg records will be
* copied with a single call.
*
@@ -2890,179 +4915,232 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_line);
* A return value of FALSE indicates that there are no more records to
* read.
*/
-bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
- char *buf, size_t size, size_t *len)
+bool kmsg_dump_get_buffer(struct kmsg_dump_iter *iter, bool syslog,
+ char *buf, size_t size, size_t *len_out)
{
- unsigned long flags;
+ u64 min_seq = latched_seq_read_nolock(&clear_seq);
+ struct printk_info info;
+ struct printk_record r;
u64 seq;
- u32 idx;
u64 next_seq;
- u32 next_idx;
- enum log_flags prev;
- size_t l = 0;
+ size_t len = 0;
bool ret = false;
+ bool time = printk_time;
- if (!dumper->active)
+ if (!buf || !size)
goto out;
- raw_spin_lock_irqsave(&logbuf_lock, flags);
- if (dumper->cur_seq < log_first_seq) {
- /* messages are gone, move to first available one */
- dumper->cur_seq = log_first_seq;
- dumper->cur_idx = log_first_idx;
+ if (iter->cur_seq < min_seq)
+ iter->cur_seq = min_seq;
+
+ if (prb_read_valid_info(prb, iter->cur_seq, &info, NULL)) {
+ if (info.seq != iter->cur_seq) {
+ /* messages are gone, move to first available one */
+ iter->cur_seq = info.seq;
+ }
}
/* last entry */
- if (dumper->cur_seq >= dumper->next_seq) {
- raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+ if (iter->cur_seq >= iter->next_seq)
goto out;
- }
- /* calculate length of entire buffer */
- seq = dumper->cur_seq;
- idx = dumper->cur_idx;
- prev = 0;
- while (seq < dumper->next_seq) {
- struct printk_log *msg = log_from_idx(idx);
-
- l += msg_print_text(msg, prev, true, NULL, 0);
- idx = log_next(idx);
- seq++;
- prev = msg->flags;
- }
+ /*
+ * Find first record that fits, including all following records,
+ * into the user-provided buffer for this dump. Pass in size-1
+ * because this function (by way of record_print_text()) will
+ * not write more than size-1 bytes of text into @buf.
+ */
+ seq = find_first_fitting_seq(iter->cur_seq, iter->next_seq,
+ size - 1, syslog, time);
- /* move first record forward until length fits into the buffer */
- seq = dumper->cur_seq;
- idx = dumper->cur_idx;
- prev = 0;
- while (l > size && seq < dumper->next_seq) {
- struct printk_log *msg = log_from_idx(idx);
+ /*
+ * Next kmsg_dump_get_buffer() invocation will dump block of
+ * older records stored right before this one.
+ */
+ next_seq = seq;
- l -= msg_print_text(msg, prev, true, NULL, 0);
- idx = log_next(idx);
- seq++;
- prev = msg->flags;
- }
+ prb_rec_init_rd(&r, &info, buf, size);
- /* last message in next interation */
- next_seq = seq;
- next_idx = idx;
+ prb_for_each_record(seq, prb, seq, &r) {
+ if (r.info->seq >= iter->next_seq)
+ break;
- l = 0;
- while (seq < dumper->next_seq) {
- struct printk_log *msg = log_from_idx(idx);
+ len += record_print_text(&r, syslog, time);
- l += msg_print_text(msg, prev, syslog, buf + l, size - l);
- idx = log_next(idx);
- seq++;
- prev = msg->flags;
+ /* Adjust record to store to remaining buffer space. */
+ prb_rec_init_rd(&r, &info, buf + len, size - len);
}
- dumper->next_seq = next_seq;
- dumper->next_idx = next_idx;
+ iter->next_seq = next_seq;
ret = true;
- raw_spin_unlock_irqrestore(&logbuf_lock, flags);
out:
- if (len)
- *len = l;
+ if (len_out)
+ *len_out = len;
return ret;
}
EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer);
/**
- * kmsg_dump_rewind_nolock - reset the interator (unlocked version)
- * @dumper: registered kmsg dumper
+ * kmsg_dump_rewind - reset the iterator
+ * @iter: kmsg dump iterator
*
* Reset the dumper's iterator so that kmsg_dump_get_line() and
* kmsg_dump_get_buffer() can be called again and used multiple
* times within the same dumper.dump() callback.
- *
- * The function is similar to kmsg_dump_rewind(), but grabs no locks.
*/
-void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper)
+void kmsg_dump_rewind(struct kmsg_dump_iter *iter)
{
- dumper->cur_seq = clear_seq;
- dumper->cur_idx = clear_idx;
- dumper->next_seq = log_next_seq;
- dumper->next_idx = log_next_idx;
+ iter->cur_seq = latched_seq_read_nolock(&clear_seq);
+ iter->next_seq = prb_next_seq(prb);
}
+EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
/**
- * kmsg_dump_rewind - reset the interator
- * @dumper: registered kmsg dumper
+ * console_try_replay_all - try to replay kernel log on consoles
*
- * Reset the dumper's iterator so that kmsg_dump_get_line() and
- * kmsg_dump_get_buffer() can be called again and used multiple
- * times within the same dumper.dump() callback.
+ * Try to obtain lock on console subsystem and replay all
+ * available records in printk buffer on the consoles.
+ * Does nothing if lock is not obtained.
+ *
+ * Context: Any, except for NMI.
*/
-void kmsg_dump_rewind(struct kmsg_dumper *dumper)
+void console_try_replay_all(void)
{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&logbuf_lock, flags);
- kmsg_dump_rewind_nolock(dumper);
- raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+ struct console_flush_type ft;
+
+ printk_get_console_flush_type(&ft);
+ if (console_trylock()) {
+ __console_rewind_all();
+ if (ft.nbcon_atomic)
+ nbcon_atomic_flush_pending();
+ if (ft.nbcon_offload)
+ nbcon_kthreads_wake();
+ if (ft.legacy_offload)
+ defer_console_output();
+ /* Consoles are flushed as part of console_unlock(). */
+ console_unlock();
+ }
}
-EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
+#endif
-static char dump_stack_arch_desc_str[128];
+#ifdef CONFIG_SMP
+static atomic_t printk_cpu_sync_owner = ATOMIC_INIT(-1);
+static atomic_t printk_cpu_sync_nested = ATOMIC_INIT(0);
+
+bool is_printk_cpu_sync_owner(void)
+{
+ return (atomic_read(&printk_cpu_sync_owner) == raw_smp_processor_id());
+}
/**
- * dump_stack_set_arch_desc - set arch-specific str to show with task dumps
- * @fmt: printf-style format string
- * @...: arguments for the format string
+ * __printk_cpu_sync_wait() - Busy wait until the printk cpu-reentrant
+ * spinning lock is not owned by any CPU.
*
- * The configured string will be printed right after utsname during task
- * dumps. Usually used to add arch-specific system identifiers. If an
- * arch wants to make use of such an ID string, it should initialize this
- * as soon as possible during boot.
+ * Context: Any context.
*/
-void __init dump_stack_set_arch_desc(const char *fmt, ...)
+void __printk_cpu_sync_wait(void)
{
- va_list args;
-
- va_start(args, fmt);
- vsnprintf(dump_stack_arch_desc_str, sizeof(dump_stack_arch_desc_str),
- fmt, args);
- va_end(args);
+ do {
+ cpu_relax();
+ } while (atomic_read(&printk_cpu_sync_owner) != -1);
}
+EXPORT_SYMBOL(__printk_cpu_sync_wait);
/**
- * dump_stack_print_info - print generic debug info for dump_stack()
- * @log_lvl: log level
+ * __printk_cpu_sync_try_get() - Try to acquire the printk cpu-reentrant
+ * spinning lock.
+ *
+ * If no processor has the lock, the calling processor takes the lock and
+ * becomes the owner. If the calling processor is already the owner of the
+ * lock, this function succeeds immediately.
*
- * Arch-specific dump_stack() implementations can use this function to
- * print out the same debug information as the generic dump_stack().
+ * Context: Any context. Expects interrupts to be disabled.
+ * Return: 1 on success, otherwise 0.
*/
-void dump_stack_print_info(const char *log_lvl)
+int __printk_cpu_sync_try_get(void)
{
- printk("%sCPU: %d PID: %d Comm: %.20s %s %s %.*s\n",
- log_lvl, raw_smp_processor_id(), current->pid, current->comm,
- print_tainted(), init_utsname()->release,
- (int)strcspn(init_utsname()->version, " "),
- init_utsname()->version);
+ int cpu;
+ int old;
- if (dump_stack_arch_desc_str[0] != '\0')
- printk("%sHardware name: %s\n",
- log_lvl, dump_stack_arch_desc_str);
+ cpu = smp_processor_id();
- print_worker_info(log_lvl, current);
+ /*
+ * Guarantee loads and stores from this CPU when it is the lock owner
+ * are _not_ visible to the previous lock owner. This pairs with
+ * __printk_cpu_sync_put:B.
+ *
+ * Memory barrier involvement:
+ *
+ * If __printk_cpu_sync_try_get:A reads from __printk_cpu_sync_put:B,
+ * then __printk_cpu_sync_put:A can never read from
+ * __printk_cpu_sync_try_get:B.
+ *
+ * Relies on:
+ *
+ * RELEASE from __printk_cpu_sync_put:A to __printk_cpu_sync_put:B
+ * of the previous CPU
+ * matching
+ * ACQUIRE from __printk_cpu_sync_try_get:A to
+ * __printk_cpu_sync_try_get:B of this CPU
+ */
+ old = atomic_cmpxchg_acquire(&printk_cpu_sync_owner, -1,
+ cpu); /* LMM(__printk_cpu_sync_try_get:A) */
+ if (old == -1) {
+ /*
+ * This CPU is now the owner and begins loading/storing
+ * data: LMM(__printk_cpu_sync_try_get:B)
+ */
+ return 1;
+
+ } else if (old == cpu) {
+ /* This CPU is already the owner. */
+ atomic_inc(&printk_cpu_sync_nested);
+ return 1;
+ }
+
+ return 0;
}
+EXPORT_SYMBOL(__printk_cpu_sync_try_get);
/**
- * show_regs_print_info - print generic debug info for show_regs()
- * @log_lvl: log level
+ * __printk_cpu_sync_put() - Release the printk cpu-reentrant spinning lock.
+ *
+ * The calling processor must be the owner of the lock.
*
- * show_regs() implementations can use this function to print out generic
- * debug information.
+ * Context: Any context. Expects interrupts to be disabled.
*/
-void show_regs_print_info(const char *log_lvl)
+void __printk_cpu_sync_put(void)
{
- dump_stack_print_info(log_lvl);
+ if (atomic_read(&printk_cpu_sync_nested)) {
+ atomic_dec(&printk_cpu_sync_nested);
+ return;
+ }
- printk("%stask: %p ti: %p task.ti: %p\n",
- log_lvl, current, current_thread_info(),
- task_thread_info(current));
-}
+ /*
+ * This CPU is finished loading/storing data:
+ * LMM(__printk_cpu_sync_put:A)
+ */
-#endif
+ /*
+ * Guarantee loads and stores from this CPU when it was the
+ * lock owner are visible to the next lock owner. This pairs
+ * with __printk_cpu_sync_try_get:A.
+ *
+ * Memory barrier involvement:
+ *
+ * If __printk_cpu_sync_try_get:A reads from __printk_cpu_sync_put:B,
+ * then __printk_cpu_sync_try_get:B reads from __printk_cpu_sync_put:A.
+ *
+ * Relies on:
+ *
+ * RELEASE from __printk_cpu_sync_put:A to __printk_cpu_sync_put:B
+ * of this CPU
+ * matching
+ * ACQUIRE from __printk_cpu_sync_try_get:A to
+ * __printk_cpu_sync_try_get:B of the next CPU
+ */
+ atomic_set_release(&printk_cpu_sync_owner,
+ -1); /* LMM(__printk_cpu_sync_put:B) */
+}
+EXPORT_SYMBOL(__printk_cpu_sync_put);
+#endif /* CONFIG_SMP */
diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c
new file mode 100644
index 000000000000..56c8e3d031f4
--- /dev/null
+++ b/kernel/printk/printk_ringbuffer.c
@@ -0,0 +1,2415 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <kunit/visibility.h>
+#include <linux/kernel.h>
+#include <linux/irqflags.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/bug.h>
+#include "printk_ringbuffer.h"
+#include "internal.h"
+
+/**
+ * DOC: printk_ringbuffer overview
+ *
+ * Data Structure
+ * --------------
+ * The printk_ringbuffer is made up of 3 internal ringbuffers:
+ *
+ * desc_ring
+ * A ring of descriptors and their meta data (such as sequence number,
+ * timestamp, loglevel, etc.) as well as internal state information about
+ * the record and logical positions specifying where in the other
+ * ringbuffer the text strings are located.
+ *
+ * text_data_ring
+ * A ring of data blocks. A data block consists of an unsigned long
+ * integer (ID) that maps to a desc_ring index followed by the text
+ * string of the record.
+ *
+ * The internal state information of a descriptor is the key element to allow
+ * readers and writers to locklessly synchronize access to the data.
+ *
+ * Implementation
+ * --------------
+ *
+ * Descriptor Ring
+ * ~~~~~~~~~~~~~~~
+ * The descriptor ring is an array of descriptors. A descriptor contains
+ * essential meta data to track the data of a printk record using
+ * blk_lpos structs pointing to associated text data blocks (see
+ * "Data Rings" below). Each descriptor is assigned an ID that maps
+ * directly to index values of the descriptor array and has a state. The ID
+ * and the state are bitwise combined into a single descriptor field named
+ * @state_var, allowing ID and state to be synchronously and atomically
+ * updated.
+ *
+ * Descriptors have four states:
+ *
+ * reserved
+ * A writer is modifying the record.
+ *
+ * committed
+ * The record and all its data are written. A writer can reopen the
+ * descriptor (transitioning it back to reserved), but in the committed
+ * state the data is consistent.
+ *
+ * finalized
+ * The record and all its data are complete and available for reading. A
+ * writer cannot reopen the descriptor.
+ *
+ * reusable
+ * The record exists, but its text and/or meta data may no longer be
+ * available.
+ *
+ * Querying the @state_var of a record requires providing the ID of the
+ * descriptor to query. This can yield a possible fifth (pseudo) state:
+ *
+ * miss
+ * The descriptor being queried has an unexpected ID.
+ *
+ * The descriptor ring has a @tail_id that contains the ID of the oldest
+ * descriptor and @head_id that contains the ID of the newest descriptor.
+ *
+ * When a new descriptor should be created (and the ring is full), the tail
+ * descriptor is invalidated by first transitioning to the reusable state and
+ * then invalidating all tail data blocks up to and including the data blocks
+ * associated with the tail descriptor (for the text ring). Then
+ * @tail_id is advanced, followed by advancing @head_id. And finally the
+ * @state_var of the new descriptor is initialized to the new ID and reserved
+ * state.
+ *
+ * The @tail_id can only be advanced if the new @tail_id would be in the
+ * committed or reusable queried state. This makes it possible that a valid
+ * sequence number of the tail is always available.
+ *
+ * Descriptor Finalization
+ * ~~~~~~~~~~~~~~~~~~~~~~~
+ * When a writer calls the commit function prb_commit(), record data is
+ * fully stored and is consistent within the ringbuffer. However, a writer can
+ * reopen that record, claiming exclusive access (as with prb_reserve()), and
+ * modify that record. When finished, the writer must again commit the record.
+ *
+ * In order for a record to be made available to readers (and also become
+ * recyclable for writers), it must be finalized. A finalized record cannot be
+ * reopened and can never become "unfinalized". Record finalization can occur
+ * in three different scenarios:
+ *
+ * 1) A writer can simultaneously commit and finalize its record by calling
+ * prb_final_commit() instead of prb_commit().
+ *
+ * 2) When a new record is reserved and the previous record has been
+ * committed via prb_commit(), that previous record is automatically
+ * finalized.
+ *
+ * 3) When a record is committed via prb_commit() and a newer record
+ * already exists, the record being committed is automatically finalized.
+ *
+ * Data Ring
+ * ~~~~~~~~~
+ * The text data ring is a byte array composed of data blocks. Data blocks are
+ * referenced by blk_lpos structs that point to the logical position of the
+ * beginning of a data block and the beginning of the next adjacent data
+ * block. Logical positions are mapped directly to index values of the byte
+ * array ringbuffer.
+ *
+ * Each data block consists of an ID followed by the writer data. The ID is
+ * the identifier of a descriptor that is associated with the data block. A
+ * given data block is considered valid if all of the following conditions
+ * are met:
+ *
+ * 1) The descriptor associated with the data block is in the committed
+ * or finalized queried state.
+ *
+ * 2) The blk_lpos struct within the descriptor associated with the data
+ * block references back to the same data block.
+ *
+ * 3) The data block is within the head/tail logical position range.
+ *
+ * If the writer data of a data block would extend beyond the end of the
+ * byte array, only the ID of the data block is stored at the logical
+ * position and the full data block (ID and writer data) is stored at the
+ * beginning of the byte array. The referencing blk_lpos will point to the
+ * ID before the wrap and the next data block will be at the logical
+ * position adjacent the full data block after the wrap.
+ *
+ * Data rings have a @tail_lpos that points to the beginning of the oldest
+ * data block and a @head_lpos that points to the logical position of the
+ * next (not yet existing) data block.
+ *
+ * When a new data block should be created (and the ring is full), tail data
+ * blocks will first be invalidated by putting their associated descriptors
+ * into the reusable state and then pushing the @tail_lpos forward beyond
+ * them. Then the @head_lpos is pushed forward and is associated with a new
+ * descriptor. If a data block is not valid, the @tail_lpos cannot be
+ * advanced beyond it.
+ *
+ * Info Array
+ * ~~~~~~~~~~
+ * The general meta data of printk records are stored in printk_info structs,
+ * stored in an array with the same number of elements as the descriptor ring.
+ * Each info corresponds to the descriptor of the same index in the
+ * descriptor ring. Info validity is confirmed by evaluating the corresponding
+ * descriptor before and after loading the info.
+ *
+ * Usage
+ * -----
+ * Here are some simple examples demonstrating writers and readers. For the
+ * examples a global ringbuffer (test_rb) is available (which is not the
+ * actual ringbuffer used by printk)::
+ *
+ * DEFINE_PRINTKRB(test_rb, 15, 5);
+ *
+ * This ringbuffer allows up to 32768 records (2 ^ 15) and has a size of
+ * 1 MiB (2 ^ (15 + 5)) for text data.
+ *
+ * Sample writer code::
+ *
+ * const char *textstr = "message text";
+ * struct prb_reserved_entry e;
+ * struct printk_record r;
+ *
+ * // specify how much to allocate
+ * prb_rec_init_wr(&r, strlen(textstr) + 1);
+ *
+ * if (prb_reserve(&e, &test_rb, &r)) {
+ * snprintf(r.text_buf, r.text_buf_size, "%s", textstr);
+ *
+ * r.info->text_len = strlen(textstr);
+ * r.info->ts_nsec = local_clock();
+ * r.info->caller_id = printk_caller_id();
+ *
+ * // commit and finalize the record
+ * prb_final_commit(&e);
+ * }
+ *
+ * Note that additional writer functions are available to extend a record
+ * after it has been committed but not yet finalized. This can be done as
+ * long as no new records have been reserved and the caller is the same.
+ *
+ * Sample writer code (record extending)::
+ *
+ * // alternate rest of previous example
+ *
+ * r.info->text_len = strlen(textstr);
+ * r.info->ts_nsec = local_clock();
+ * r.info->caller_id = printk_caller_id();
+ *
+ * // commit the record (but do not finalize yet)
+ * prb_commit(&e);
+ * }
+ *
+ * ...
+ *
+ * // specify additional 5 bytes text space to extend
+ * prb_rec_init_wr(&r, 5);
+ *
+ * // try to extend, but only if it does not exceed 32 bytes
+ * if (prb_reserve_in_last(&e, &test_rb, &r, printk_caller_id(), 32)) {
+ * snprintf(&r.text_buf[r.info->text_len],
+ * r.text_buf_size - r.info->text_len, "hello");
+ *
+ * r.info->text_len += 5;
+ *
+ * // commit and finalize the record
+ * prb_final_commit(&e);
+ * }
+ *
+ * Sample reader code::
+ *
+ * struct printk_info info;
+ * struct printk_record r;
+ * char text_buf[32];
+ * u64 seq;
+ *
+ * prb_rec_init_rd(&r, &info, &text_buf[0], sizeof(text_buf));
+ *
+ * prb_for_each_record(0, &test_rb, &seq, &r) {
+ * if (info.seq != seq)
+ * pr_warn("lost %llu records\n", info.seq - seq);
+ *
+ * if (info.text_len > r.text_buf_size) {
+ * pr_warn("record %llu text truncated\n", info.seq);
+ * text_buf[r.text_buf_size - 1] = 0;
+ * }
+ *
+ * pr_info("%llu: %llu: %s\n", info.seq, info.ts_nsec,
+ * &text_buf[0]);
+ * }
+ *
+ * Note that additional less convenient reader functions are available to
+ * allow complex record access.
+ *
+ * ABA Issues
+ * ~~~~~~~~~~
+ * To help avoid ABA issues, descriptors are referenced by IDs (array index
+ * values combined with tagged bits counting array wraps) and data blocks are
+ * referenced by logical positions (array index values combined with tagged
+ * bits counting array wraps). However, on 32-bit systems the number of
+ * tagged bits is relatively small such that an ABA incident is (at least
+ * theoretically) possible. For example, if 4 million maximally sized (1KiB)
+ * printk messages were to occur in NMI context on a 32-bit system, the
+ * interrupted context would not be able to recognize that the 32-bit integer
+ * completely wrapped and thus represents a different data block than the one
+ * the interrupted context expects.
+ *
+ * To help combat this possibility, additional state checking is performed
+ * (such as using cmpxchg() even though set() would suffice). These extra
+ * checks are commented as such and will hopefully catch any ABA issue that
+ * a 32-bit system might experience.
+ *
+ * Memory Barriers
+ * ~~~~~~~~~~~~~~~
+ * Multiple memory barriers are used. To simplify proving correctness and
+ * generating litmus tests, lines of code related to memory barriers
+ * (loads, stores, and the associated memory barriers) are labeled::
+ *
+ * LMM(function:letter)
+ *
+ * Comments reference the labels using only the "function:letter" part.
+ *
+ * The memory barrier pairs and their ordering are:
+ *
+ * desc_reserve:D / desc_reserve:B
+ * push descriptor tail (id), then push descriptor head (id)
+ *
+ * desc_reserve:D / data_push_tail:B
+ * push data tail (lpos), then set new descriptor reserved (state)
+ *
+ * desc_reserve:D / desc_push_tail:C
+ * push descriptor tail (id), then set new descriptor reserved (state)
+ *
+ * desc_reserve:D / prb_first_seq:C
+ * push descriptor tail (id), then set new descriptor reserved (state)
+ *
+ * desc_reserve:F / desc_read:D
+ * set new descriptor id and reserved (state), then allow writer changes
+ *
+ * data_alloc:A (or data_realloc:A) / desc_read:D
+ * set old descriptor reusable (state), then modify new data block area
+ *
+ * data_alloc:A (or data_realloc:A) / data_push_tail:B
+ * push data tail (lpos), then modify new data block area
+ *
+ * _prb_commit:B / desc_read:B
+ * store writer changes, then set new descriptor committed (state)
+ *
+ * desc_reopen_last:A / _prb_commit:B
+ * set descriptor reserved (state), then read descriptor data
+ *
+ * _prb_commit:B / desc_reserve:D
+ * set new descriptor committed (state), then check descriptor head (id)
+ *
+ * data_push_tail:D / data_push_tail:A
+ * set descriptor reusable (state), then push data tail (lpos)
+ *
+ * desc_push_tail:B / desc_reserve:D
+ * set descriptor reusable (state), then push descriptor tail (id)
+ *
+ * desc_update_last_finalized:A / desc_last_finalized_seq:A
+ * store finalized record, then set new highest finalized sequence number
+ */
+
+#define DATA_SIZE(data_ring) _DATA_SIZE((data_ring)->size_bits)
+#define DATA_SIZE_MASK(data_ring) (DATA_SIZE(data_ring) - 1)
+
+#define DESCS_COUNT(desc_ring) _DESCS_COUNT((desc_ring)->count_bits)
+#define DESCS_COUNT_MASK(desc_ring) (DESCS_COUNT(desc_ring) - 1)
+
+/* Determine the data array index from a logical position. */
+#define DATA_INDEX(data_ring, lpos) ((lpos) & DATA_SIZE_MASK(data_ring))
+
+/* Determine the desc array index from an ID or sequence number. */
+#define DESC_INDEX(desc_ring, n) ((n) & DESCS_COUNT_MASK(desc_ring))
+
+/* Determine how many times the data array has wrapped. */
+#define DATA_WRAPS(data_ring, lpos) ((lpos) >> (data_ring)->size_bits)
+
+/* Determine if a logical position refers to a data-less block. */
+#define LPOS_DATALESS(lpos) ((lpos) & 1UL)
+#define BLK_DATALESS(blk) (LPOS_DATALESS((blk)->begin) && \
+ LPOS_DATALESS((blk)->next))
+
+/* Get the logical position at index 0 of the current wrap. */
+#define DATA_THIS_WRAP_START_LPOS(data_ring, lpos) \
+((lpos) & ~DATA_SIZE_MASK(data_ring))
+
+/* Get the ID for the same index of the previous wrap as the given ID. */
+#define DESC_ID_PREV_WRAP(desc_ring, id) \
+DESC_ID((id) - DESCS_COUNT(desc_ring))
+
+/*
+ * A data block: mapped directly to the beginning of the data block area
+ * specified as a logical position within the data ring.
+ *
+ * @id: the ID of the associated descriptor
+ * @data: the writer data
+ *
+ * Note that the size of a data block is only known by its associated
+ * descriptor.
+ */
+struct prb_data_block {
+ unsigned long id;
+ char data[];
+};
+
+/*
+ * Return the descriptor associated with @n. @n can be either a
+ * descriptor ID or a sequence number.
+ */
+static struct prb_desc *to_desc(struct prb_desc_ring *desc_ring, u64 n)
+{
+ return &desc_ring->descs[DESC_INDEX(desc_ring, n)];
+}
+
+/*
+ * Return the printk_info associated with @n. @n can be either a
+ * descriptor ID or a sequence number.
+ */
+static struct printk_info *to_info(struct prb_desc_ring *desc_ring, u64 n)
+{
+ return &desc_ring->infos[DESC_INDEX(desc_ring, n)];
+}
+
+static struct prb_data_block *to_block(struct prb_data_ring *data_ring,
+ unsigned long begin_lpos)
+{
+ return (void *)&data_ring->data[DATA_INDEX(data_ring, begin_lpos)];
+}
+
+/*
+ * Increase the data size to account for data block meta data plus any
+ * padding so that the adjacent data block is aligned on the ID size.
+ */
+static unsigned int to_blk_size(unsigned int size)
+{
+ struct prb_data_block *db = NULL;
+
+ size += sizeof(*db);
+ size = ALIGN(size, sizeof(db->id));
+ return size;
+}
+
+/*
+ * Sanity checker for reserve size. The ringbuffer code assumes that a data
+ * block does not exceed the maximum possible size that could fit within the
+ * ringbuffer. This function provides that basic size check so that the
+ * assumption is safe. In particular, it guarantees that data_push_tail() will
+ * never attempt to push the tail beyond the head.
+ */
+static bool data_check_size(struct prb_data_ring *data_ring, unsigned int size)
+{
+ /* Data-less blocks take no space. */
+ if (size == 0)
+ return true;
+
+ /*
+ * If data blocks were allowed to be larger than half the data ring
+ * size, a wrapping data block could require more space than the full
+ * ringbuffer.
+ */
+ return to_blk_size(size) <= DATA_SIZE(data_ring) / 2;
+}
+
+/*
+ * Compare the current and requested logical position and decide
+ * whether more space is needed.
+ *
+ * Return false when @lpos_current is already at or beyond @lpos_target.
+ *
+ * Also return false when the difference between the positions is bigger
+ * than the size of the data buffer. It might happen only when the caller
+ * raced with another CPU(s) which already made and used the space.
+ */
+static bool need_more_space(struct prb_data_ring *data_ring,
+ unsigned long lpos_current,
+ unsigned long lpos_target)
+{
+ return lpos_target - lpos_current - 1 < DATA_SIZE(data_ring);
+}
+
+/* Query the state of a descriptor. */
+static enum desc_state get_desc_state(unsigned long id,
+ unsigned long state_val)
+{
+ if (id != DESC_ID(state_val))
+ return desc_miss;
+
+ return DESC_STATE(state_val);
+}
+
+/*
+ * Get a copy of a specified descriptor and return its queried state. If the
+ * descriptor is in an inconsistent state (miss or reserved), the caller can
+ * only expect the descriptor's @state_var field to be valid.
+ *
+ * The sequence number and caller_id can be optionally retrieved. Like all
+ * non-state_var data, they are only valid if the descriptor is in a
+ * consistent state.
+ */
+static enum desc_state desc_read(struct prb_desc_ring *desc_ring,
+ unsigned long id, struct prb_desc *desc_out,
+ u64 *seq_out, u32 *caller_id_out)
+{
+ struct printk_info *info = to_info(desc_ring, id);
+ struct prb_desc *desc = to_desc(desc_ring, id);
+ atomic_long_t *state_var = &desc->state_var;
+ enum desc_state d_state;
+ unsigned long state_val;
+
+ /* Check the descriptor state. */
+ state_val = atomic_long_read(state_var); /* LMM(desc_read:A) */
+ d_state = get_desc_state(id, state_val);
+ if (d_state == desc_miss || d_state == desc_reserved) {
+ /*
+ * The descriptor is in an inconsistent state. Set at least
+ * @state_var so that the caller can see the details of
+ * the inconsistent state.
+ */
+ goto out;
+ }
+
+ /*
+ * Guarantee the state is loaded before copying the descriptor
+ * content. This avoids copying obsolete descriptor content that might
+ * not apply to the descriptor state. This pairs with _prb_commit:B.
+ *
+ * Memory barrier involvement:
+ *
+ * If desc_read:A reads from _prb_commit:B, then desc_read:C reads
+ * from _prb_commit:A.
+ *
+ * Relies on:
+ *
+ * WMB from _prb_commit:A to _prb_commit:B
+ * matching
+ * RMB from desc_read:A to desc_read:C
+ */
+ smp_rmb(); /* LMM(desc_read:B) */
+
+ /*
+ * Copy the descriptor data. The data is not valid until the
+ * state has been re-checked. A memcpy() for all of @desc
+ * cannot be used because of the atomic_t @state_var field.
+ */
+ if (desc_out) {
+ memcpy(&desc_out->text_blk_lpos, &desc->text_blk_lpos,
+ sizeof(desc_out->text_blk_lpos)); /* LMM(desc_read:C) */
+ }
+ if (seq_out)
+ *seq_out = info->seq; /* also part of desc_read:C */
+ if (caller_id_out)
+ *caller_id_out = info->caller_id; /* also part of desc_read:C */
+
+ /*
+ * 1. Guarantee the descriptor content is loaded before re-checking
+ * the state. This avoids reading an obsolete descriptor state
+ * that may not apply to the copied content. This pairs with
+ * desc_reserve:F.
+ *
+ * Memory barrier involvement:
+ *
+ * If desc_read:C reads from desc_reserve:G, then desc_read:E
+ * reads from desc_reserve:F.
+ *
+ * Relies on:
+ *
+ * WMB from desc_reserve:F to desc_reserve:G
+ * matching
+ * RMB from desc_read:C to desc_read:E
+ *
+ * 2. Guarantee the record data is loaded before re-checking the
+ * state. This avoids reading an obsolete descriptor state that may
+ * not apply to the copied data. This pairs with data_alloc:A and
+ * data_realloc:A.
+ *
+ * Memory barrier involvement:
+ *
+ * If copy_data:A reads from data_alloc:B, then desc_read:E
+ * reads from desc_make_reusable:A.
+ *
+ * Relies on:
+ *
+ * MB from desc_make_reusable:A to data_alloc:B
+ * matching
+ * RMB from desc_read:C to desc_read:E
+ *
+ * Note: desc_make_reusable:A and data_alloc:B can be different
+ * CPUs. However, the data_alloc:B CPU (which performs the
+ * full memory barrier) must have previously seen
+ * desc_make_reusable:A.
+ */
+ smp_rmb(); /* LMM(desc_read:D) */
+
+ /*
+ * The data has been copied. Return the current descriptor state,
+ * which may have changed since the load above.
+ */
+ state_val = atomic_long_read(state_var); /* LMM(desc_read:E) */
+ d_state = get_desc_state(id, state_val);
+out:
+ if (desc_out)
+ atomic_long_set(&desc_out->state_var, state_val);
+ return d_state;
+}
+
+/*
+ * Take a specified descriptor out of the finalized state by attempting
+ * the transition from finalized to reusable. Either this context or some
+ * other context will have been successful.
+ */
+static void desc_make_reusable(struct prb_desc_ring *desc_ring,
+ unsigned long id)
+{
+ unsigned long val_finalized = DESC_SV(id, desc_finalized);
+ unsigned long val_reusable = DESC_SV(id, desc_reusable);
+ struct prb_desc *desc = to_desc(desc_ring, id);
+ atomic_long_t *state_var = &desc->state_var;
+
+ atomic_long_cmpxchg_relaxed(state_var, val_finalized,
+ val_reusable); /* LMM(desc_make_reusable:A) */
+}
+
+/*
+ * Given the text data ring, put the associated descriptor of each
+ * data block from @lpos_begin until @lpos_end into the reusable state.
+ *
+ * If there is any problem making the associated descriptor reusable, either
+ * the descriptor has not yet been finalized or another writer context has
+ * already pushed the tail lpos past the problematic data block. Regardless,
+ * on error the caller can re-load the tail lpos to determine the situation.
+ */
+static bool data_make_reusable(struct printk_ringbuffer *rb,
+ unsigned long lpos_begin,
+ unsigned long lpos_end,
+ unsigned long *lpos_out)
+{
+
+ struct prb_data_ring *data_ring = &rb->text_data_ring;
+ struct prb_desc_ring *desc_ring = &rb->desc_ring;
+ struct prb_data_block *blk;
+ enum desc_state d_state;
+ struct prb_desc desc;
+ struct prb_data_blk_lpos *blk_lpos = &desc.text_blk_lpos;
+ unsigned long id;
+
+ /* Loop until @lpos_begin has advanced to or beyond @lpos_end. */
+ while (need_more_space(data_ring, lpos_begin, lpos_end)) {
+ blk = to_block(data_ring, lpos_begin);
+
+ /*
+ * Load the block ID from the data block. This is a data race
+ * against a writer that may have newly reserved this data
+ * area. If the loaded value matches a valid descriptor ID,
+ * the blk_lpos of that descriptor will be checked to make
+ * sure it points back to this data block. If the check fails,
+ * the data area has been recycled by another writer.
+ */
+ id = blk->id; /* LMM(data_make_reusable:A) */
+
+ d_state = desc_read(desc_ring, id, &desc,
+ NULL, NULL); /* LMM(data_make_reusable:B) */
+
+ switch (d_state) {
+ case desc_miss:
+ case desc_reserved:
+ case desc_committed:
+ return false;
+ case desc_finalized:
+ /*
+ * This data block is invalid if the descriptor
+ * does not point back to it.
+ */
+ if (blk_lpos->begin != lpos_begin)
+ return false;
+ desc_make_reusable(desc_ring, id);
+ break;
+ case desc_reusable:
+ /*
+ * This data block is invalid if the descriptor
+ * does not point back to it.
+ */
+ if (blk_lpos->begin != lpos_begin)
+ return false;
+ break;
+ }
+
+ /* Advance @lpos_begin to the next data block. */
+ lpos_begin = blk_lpos->next;
+ }
+
+ *lpos_out = lpos_begin;
+ return true;
+}
+
+/*
+ * Advance the data ring tail to at least @lpos. This function puts
+ * descriptors into the reusable state if the tail is pushed beyond
+ * their associated data block.
+ */
+static bool data_push_tail(struct printk_ringbuffer *rb, unsigned long lpos)
+{
+ struct prb_data_ring *data_ring = &rb->text_data_ring;
+ unsigned long tail_lpos_new;
+ unsigned long tail_lpos;
+ unsigned long next_lpos;
+
+ /* If @lpos is from a data-less block, there is nothing to do. */
+ if (LPOS_DATALESS(lpos))
+ return true;
+
+ /*
+ * Any descriptor states that have transitioned to reusable due to the
+ * data tail being pushed to this loaded value will be visible to this
+ * CPU. This pairs with data_push_tail:D.
+ *
+ * Memory barrier involvement:
+ *
+ * If data_push_tail:A reads from data_push_tail:D, then this CPU can
+ * see desc_make_reusable:A.
+ *
+ * Relies on:
+ *
+ * MB from desc_make_reusable:A to data_push_tail:D
+ * matches
+ * READFROM from data_push_tail:D to data_push_tail:A
+ * thus
+ * READFROM from desc_make_reusable:A to this CPU
+ */
+ tail_lpos = atomic_long_read(&data_ring->tail_lpos); /* LMM(data_push_tail:A) */
+
+ /*
+ * Loop until the tail lpos is at or beyond @lpos. This condition
+ * may already be satisfied, resulting in no full memory barrier
+ * from data_push_tail:D being performed. However, since this CPU
+ * sees the new tail lpos, any descriptor states that transitioned to
+ * the reusable state must already be visible.
+ */
+ while (need_more_space(data_ring, tail_lpos, lpos)) {
+ /*
+ * Make all descriptors reusable that are associated with
+ * data blocks before @lpos.
+ */
+ if (!data_make_reusable(rb, tail_lpos, lpos, &next_lpos)) {
+ /*
+ * 1. Guarantee the block ID loaded in
+ * data_make_reusable() is performed before
+ * reloading the tail lpos. The failed
+ * data_make_reusable() may be due to a newly
+ * recycled data area causing the tail lpos to
+ * have been previously pushed. This pairs with
+ * data_alloc:A and data_realloc:A.
+ *
+ * Memory barrier involvement:
+ *
+ * If data_make_reusable:A reads from data_alloc:B,
+ * then data_push_tail:C reads from
+ * data_push_tail:D.
+ *
+ * Relies on:
+ *
+ * MB from data_push_tail:D to data_alloc:B
+ * matching
+ * RMB from data_make_reusable:A to
+ * data_push_tail:C
+ *
+ * Note: data_push_tail:D and data_alloc:B can be
+ * different CPUs. However, the data_alloc:B
+ * CPU (which performs the full memory
+ * barrier) must have previously seen
+ * data_push_tail:D.
+ *
+ * 2. Guarantee the descriptor state loaded in
+ * data_make_reusable() is performed before
+ * reloading the tail lpos. The failed
+ * data_make_reusable() may be due to a newly
+ * recycled descriptor causing the tail lpos to
+ * have been previously pushed. This pairs with
+ * desc_reserve:D.
+ *
+ * Memory barrier involvement:
+ *
+ * If data_make_reusable:B reads from
+ * desc_reserve:F, then data_push_tail:C reads
+ * from data_push_tail:D.
+ *
+ * Relies on:
+ *
+ * MB from data_push_tail:D to desc_reserve:F
+ * matching
+ * RMB from data_make_reusable:B to
+ * data_push_tail:C
+ *
+ * Note: data_push_tail:D and desc_reserve:F can
+ * be different CPUs. However, the
+ * desc_reserve:F CPU (which performs the
+ * full memory barrier) must have previously
+ * seen data_push_tail:D.
+ */
+ smp_rmb(); /* LMM(data_push_tail:B) */
+
+ tail_lpos_new = atomic_long_read(&data_ring->tail_lpos
+ ); /* LMM(data_push_tail:C) */
+ if (tail_lpos_new == tail_lpos)
+ return false;
+
+ /* Another CPU pushed the tail. Try again. */
+ tail_lpos = tail_lpos_new;
+ continue;
+ }
+
+ /*
+ * Guarantee any descriptor states that have transitioned to
+ * reusable are stored before pushing the tail lpos. A full
+ * memory barrier is needed since other CPUs may have made
+ * the descriptor states reusable. This pairs with
+ * data_push_tail:A.
+ */
+ if (atomic_long_try_cmpxchg(&data_ring->tail_lpos, &tail_lpos,
+ next_lpos)) { /* LMM(data_push_tail:D) */
+ break;
+ }
+ }
+
+ return true;
+}
+
+/*
+ * Advance the desc ring tail. This function advances the tail by one
+ * descriptor, thus invalidating the oldest descriptor. Before advancing
+ * the tail, the tail descriptor is made reusable and all data blocks up to
+ * and including the descriptor's data block are invalidated (i.e. the data
+ * ring tail is pushed past the data block of the descriptor being made
+ * reusable).
+ */
+static bool desc_push_tail(struct printk_ringbuffer *rb,
+ unsigned long tail_id)
+{
+ struct prb_desc_ring *desc_ring = &rb->desc_ring;
+ enum desc_state d_state;
+ struct prb_desc desc;
+
+ d_state = desc_read(desc_ring, tail_id, &desc, NULL, NULL);
+
+ switch (d_state) {
+ case desc_miss:
+ /*
+ * If the ID is exactly 1 wrap behind the expected, it is
+ * in the process of being reserved by another writer and
+ * must be considered reserved.
+ */
+ if (DESC_ID(atomic_long_read(&desc.state_var)) ==
+ DESC_ID_PREV_WRAP(desc_ring, tail_id)) {
+ return false;
+ }
+
+ /*
+ * The ID has changed. Another writer must have pushed the
+ * tail and recycled the descriptor already. Success is
+ * returned because the caller is only interested in the
+ * specified tail being pushed, which it was.
+ */
+ return true;
+ case desc_reserved:
+ case desc_committed:
+ return false;
+ case desc_finalized:
+ desc_make_reusable(desc_ring, tail_id);
+ break;
+ case desc_reusable:
+ break;
+ }
+
+ /*
+ * Data blocks must be invalidated before their associated
+ * descriptor can be made available for recycling. Invalidating
+ * them later is not possible because there is no way to trust
+ * data blocks once their associated descriptor is gone.
+ */
+
+ if (!data_push_tail(rb, desc.text_blk_lpos.next))
+ return false;
+
+ /*
+ * Check the next descriptor after @tail_id before pushing the tail
+ * to it because the tail must always be in a finalized or reusable
+ * state. The implementation of prb_first_seq() relies on this.
+ *
+ * A successful read implies that the next descriptor is less than or
+ * equal to @head_id so there is no risk of pushing the tail past the
+ * head.
+ */
+ d_state = desc_read(desc_ring, DESC_ID(tail_id + 1), &desc,
+ NULL, NULL); /* LMM(desc_push_tail:A) */
+
+ if (d_state == desc_finalized || d_state == desc_reusable) {
+ /*
+ * Guarantee any descriptor states that have transitioned to
+ * reusable are stored before pushing the tail ID. This allows
+ * verifying the recycled descriptor state. A full memory
+ * barrier is needed since other CPUs may have made the
+ * descriptor states reusable. This pairs with desc_reserve:D.
+ */
+ atomic_long_cmpxchg(&desc_ring->tail_id, tail_id,
+ DESC_ID(tail_id + 1)); /* LMM(desc_push_tail:B) */
+ } else {
+ /*
+ * Guarantee the last state load from desc_read() is before
+ * reloading @tail_id in order to see a new tail ID in the
+ * case that the descriptor has been recycled. This pairs
+ * with desc_reserve:D.
+ *
+ * Memory barrier involvement:
+ *
+ * If desc_push_tail:A reads from desc_reserve:F, then
+ * desc_push_tail:D reads from desc_push_tail:B.
+ *
+ * Relies on:
+ *
+ * MB from desc_push_tail:B to desc_reserve:F
+ * matching
+ * RMB from desc_push_tail:A to desc_push_tail:D
+ *
+ * Note: desc_push_tail:B and desc_reserve:F can be different
+ * CPUs. However, the desc_reserve:F CPU (which performs
+ * the full memory barrier) must have previously seen
+ * desc_push_tail:B.
+ */
+ smp_rmb(); /* LMM(desc_push_tail:C) */
+
+ /*
+ * Re-check the tail ID. The descriptor following @tail_id is
+ * not in an allowed tail state. But if the tail has since
+ * been moved by another CPU, then it does not matter.
+ */
+ if (atomic_long_read(&desc_ring->tail_id) == tail_id) /* LMM(desc_push_tail:D) */
+ return false;
+ }
+
+ return true;
+}
+
+/* Reserve a new descriptor, invalidating the oldest if necessary. */
+static bool desc_reserve(struct printk_ringbuffer *rb, unsigned long *id_out)
+{
+ struct prb_desc_ring *desc_ring = &rb->desc_ring;
+ unsigned long prev_state_val;
+ unsigned long id_prev_wrap;
+ struct prb_desc *desc;
+ unsigned long head_id;
+ unsigned long id;
+
+ head_id = atomic_long_read(&desc_ring->head_id); /* LMM(desc_reserve:A) */
+
+ do {
+ id = DESC_ID(head_id + 1);
+ id_prev_wrap = DESC_ID_PREV_WRAP(desc_ring, id);
+
+ /*
+ * Guarantee the head ID is read before reading the tail ID.
+ * Since the tail ID is updated before the head ID, this
+ * guarantees that @id_prev_wrap is never ahead of the tail
+ * ID. This pairs with desc_reserve:D.
+ *
+ * Memory barrier involvement:
+ *
+ * If desc_reserve:A reads from desc_reserve:D, then
+ * desc_reserve:C reads from desc_push_tail:B.
+ *
+ * Relies on:
+ *
+ * MB from desc_push_tail:B to desc_reserve:D
+ * matching
+ * RMB from desc_reserve:A to desc_reserve:C
+ *
+ * Note: desc_push_tail:B and desc_reserve:D can be different
+ * CPUs. However, the desc_reserve:D CPU (which performs
+ * the full memory barrier) must have previously seen
+ * desc_push_tail:B.
+ */
+ smp_rmb(); /* LMM(desc_reserve:B) */
+
+ if (id_prev_wrap == atomic_long_read(&desc_ring->tail_id
+ )) { /* LMM(desc_reserve:C) */
+ /*
+ * Make space for the new descriptor by
+ * advancing the tail.
+ */
+ if (!desc_push_tail(rb, id_prev_wrap))
+ return false;
+ }
+
+ /*
+ * 1. Guarantee the tail ID is read before validating the
+ * recycled descriptor state. A read memory barrier is
+ * sufficient for this. This pairs with desc_push_tail:B.
+ *
+ * Memory barrier involvement:
+ *
+ * If desc_reserve:C reads from desc_push_tail:B, then
+ * desc_reserve:E reads from desc_make_reusable:A.
+ *
+ * Relies on:
+ *
+ * MB from desc_make_reusable:A to desc_push_tail:B
+ * matching
+ * RMB from desc_reserve:C to desc_reserve:E
+ *
+ * Note: desc_make_reusable:A and desc_push_tail:B can be
+ * different CPUs. However, the desc_push_tail:B CPU
+ * (which performs the full memory barrier) must have
+ * previously seen desc_make_reusable:A.
+ *
+ * 2. Guarantee the tail ID is stored before storing the head
+ * ID. This pairs with desc_reserve:B.
+ *
+ * 3. Guarantee any data ring tail changes are stored before
+ * recycling the descriptor. Data ring tail changes can
+ * happen via desc_push_tail()->data_push_tail(). A full
+ * memory barrier is needed since another CPU may have
+ * pushed the data ring tails. This pairs with
+ * data_push_tail:B.
+ *
+ * 4. Guarantee a new tail ID is stored before recycling the
+ * descriptor. A full memory barrier is needed since
+ * another CPU may have pushed the tail ID. This pairs
+ * with desc_push_tail:C and this also pairs with
+ * prb_first_seq:C.
+ *
+ * 5. Guarantee the head ID is stored before trying to
+ * finalize the previous descriptor. This pairs with
+ * _prb_commit:B.
+ */
+ } while (!atomic_long_try_cmpxchg(&desc_ring->head_id, &head_id,
+ id)); /* LMM(desc_reserve:D) */
+
+ desc = to_desc(desc_ring, id);
+
+ /*
+ * If the descriptor has been recycled, verify the old state val.
+ * See "ABA Issues" about why this verification is performed.
+ */
+ prev_state_val = atomic_long_read(&desc->state_var); /* LMM(desc_reserve:E) */
+ if (prev_state_val &&
+ get_desc_state(id_prev_wrap, prev_state_val) != desc_reusable) {
+ WARN_ON_ONCE(1);
+ return false;
+ }
+
+ /*
+ * Assign the descriptor a new ID and set its state to reserved.
+ * See "ABA Issues" about why cmpxchg() instead of set() is used.
+ *
+ * Guarantee the new descriptor ID and state is stored before making
+ * any other changes. A write memory barrier is sufficient for this.
+ * This pairs with desc_read:D.
+ */
+ if (!atomic_long_try_cmpxchg(&desc->state_var, &prev_state_val,
+ DESC_SV(id, desc_reserved))) { /* LMM(desc_reserve:F) */
+ WARN_ON_ONCE(1);
+ return false;
+ }
+
+ /* Now data in @desc can be modified: LMM(desc_reserve:G) */
+
+ *id_out = id;
+ return true;
+}
+
+static bool is_blk_wrapped(struct prb_data_ring *data_ring,
+ unsigned long begin_lpos, unsigned long next_lpos)
+{
+ /*
+ * Subtract one from next_lpos since it's not actually part of this data
+ * block. This allows perfectly fitting records to not wrap.
+ */
+ return DATA_WRAPS(data_ring, begin_lpos) !=
+ DATA_WRAPS(data_ring, next_lpos - 1);
+}
+
+/* Determine the end of a data block. */
+static unsigned long get_next_lpos(struct prb_data_ring *data_ring,
+ unsigned long lpos, unsigned int size)
+{
+ unsigned long begin_lpos;
+ unsigned long next_lpos;
+
+ begin_lpos = lpos;
+ next_lpos = lpos + size;
+
+ /* First check if the data block does not wrap. */
+ if (!is_blk_wrapped(data_ring, begin_lpos, next_lpos))
+ return next_lpos;
+
+ /* Wrapping data blocks store their data at the beginning. */
+ return (DATA_THIS_WRAP_START_LPOS(data_ring, next_lpos) + size);
+}
+
+/*
+ * Allocate a new data block, invalidating the oldest data block(s)
+ * if necessary. This function also associates the data block with
+ * a specified descriptor.
+ */
+static char *data_alloc(struct printk_ringbuffer *rb, unsigned int size,
+ struct prb_data_blk_lpos *blk_lpos, unsigned long id)
+{
+ struct prb_data_ring *data_ring = &rb->text_data_ring;
+ struct prb_data_block *blk;
+ unsigned long begin_lpos;
+ unsigned long next_lpos;
+
+ if (size == 0) {
+ /*
+ * Data blocks are not created for empty lines. Instead, the
+ * reader will recognize these special lpos values and handle
+ * it appropriately.
+ */
+ blk_lpos->begin = EMPTY_LINE_LPOS;
+ blk_lpos->next = EMPTY_LINE_LPOS;
+ return NULL;
+ }
+
+ size = to_blk_size(size);
+
+ begin_lpos = atomic_long_read(&data_ring->head_lpos);
+
+ do {
+ next_lpos = get_next_lpos(data_ring, begin_lpos, size);
+
+ /*
+ * data_check_size() prevents data block allocation that could
+ * cause illegal ringbuffer states. But double check that the
+ * used space will not be bigger than the ring buffer. Wrapped
+ * messages need to reserve more space, see get_next_lpos().
+ *
+ * Specify a data-less block when the check or the allocation
+ * fails.
+ */
+ if (WARN_ON_ONCE(next_lpos - begin_lpos > DATA_SIZE(data_ring)) ||
+ !data_push_tail(rb, next_lpos - DATA_SIZE(data_ring))) {
+ blk_lpos->begin = FAILED_LPOS;
+ blk_lpos->next = FAILED_LPOS;
+ return NULL;
+ }
+
+ /*
+ * 1. Guarantee any descriptor states that have transitioned
+ * to reusable are stored before modifying the newly
+ * allocated data area. A full memory barrier is needed
+ * since other CPUs may have made the descriptor states
+ * reusable. See data_push_tail:A about why the reusable
+ * states are visible. This pairs with desc_read:D.
+ *
+ * 2. Guarantee any updated tail lpos is stored before
+ * modifying the newly allocated data area. Another CPU may
+ * be in data_make_reusable() and is reading a block ID
+ * from this area. data_make_reusable() can handle reading
+ * a garbage block ID value, but then it must be able to
+ * load a new tail lpos. A full memory barrier is needed
+ * since other CPUs may have updated the tail lpos. This
+ * pairs with data_push_tail:B.
+ */
+ } while (!atomic_long_try_cmpxchg(&data_ring->head_lpos, &begin_lpos,
+ next_lpos)); /* LMM(data_alloc:A) */
+
+ blk = to_block(data_ring, begin_lpos);
+ blk->id = id; /* LMM(data_alloc:B) */
+
+ if (is_blk_wrapped(data_ring, begin_lpos, next_lpos)) {
+ /* Wrapping data blocks store their data at the beginning. */
+ blk = to_block(data_ring, 0);
+
+ /*
+ * Store the ID on the wrapped block for consistency.
+ * The printk_ringbuffer does not actually use it.
+ */
+ blk->id = id;
+ }
+
+ blk_lpos->begin = begin_lpos;
+ blk_lpos->next = next_lpos;
+
+ return &blk->data[0];
+}
+
+/*
+ * Try to resize an existing data block associated with the descriptor
+ * specified by @id. If the resized data block should become wrapped, it
+ * copies the old data to the new data block. If @size yields a data block
+ * with the same or less size, the data block is left as is.
+ *
+ * Fail if this is not the last allocated data block or if there is not
+ * enough space or it is not possible make enough space.
+ *
+ * Return a pointer to the beginning of the entire data buffer or NULL on
+ * failure.
+ */
+static char *data_realloc(struct printk_ringbuffer *rb, unsigned int size,
+ struct prb_data_blk_lpos *blk_lpos, unsigned long id)
+{
+ struct prb_data_ring *data_ring = &rb->text_data_ring;
+ struct prb_data_block *blk;
+ unsigned long head_lpos;
+ unsigned long next_lpos;
+ bool wrapped;
+
+ /* Reallocation only works if @blk_lpos is the newest data block. */
+ head_lpos = atomic_long_read(&data_ring->head_lpos);
+ if (head_lpos != blk_lpos->next)
+ return NULL;
+
+ /* Keep track if @blk_lpos was a wrapping data block. */
+ wrapped = is_blk_wrapped(data_ring, blk_lpos->begin, blk_lpos->next);
+
+ size = to_blk_size(size);
+
+ next_lpos = get_next_lpos(data_ring, blk_lpos->begin, size);
+
+ /*
+ * Use the current data block when the size does not increase, i.e.
+ * when @head_lpos is already able to accommodate the new @next_lpos.
+ *
+ * Note that need_more_space() could never return false here because
+ * the difference between the positions was bigger than the data
+ * buffer size. The data block is reopened and can't get reused.
+ */
+ if (!need_more_space(data_ring, head_lpos, next_lpos)) {
+ if (wrapped)
+ blk = to_block(data_ring, 0);
+ else
+ blk = to_block(data_ring, blk_lpos->begin);
+ return &blk->data[0];
+ }
+
+ /*
+ * data_check_size() prevents data block reallocation that could
+ * cause illegal ringbuffer states. But double check that the
+ * new used space will not be bigger than the ring buffer. Wrapped
+ * messages need to reserve more space, see get_next_lpos().
+ *
+ * Specify failure when the check or the allocation fails.
+ */
+ if (WARN_ON_ONCE(next_lpos - blk_lpos->begin > DATA_SIZE(data_ring)) ||
+ !data_push_tail(rb, next_lpos - DATA_SIZE(data_ring))) {
+ return NULL;
+ }
+
+ /* The memory barrier involvement is the same as data_alloc:A. */
+ if (!atomic_long_try_cmpxchg(&data_ring->head_lpos, &head_lpos,
+ next_lpos)) { /* LMM(data_realloc:A) */
+ return NULL;
+ }
+
+ blk = to_block(data_ring, blk_lpos->begin);
+
+ if (is_blk_wrapped(data_ring, blk_lpos->begin, next_lpos)) {
+ struct prb_data_block *old_blk = blk;
+
+ /* Wrapping data blocks store their data at the beginning. */
+ blk = to_block(data_ring, 0);
+
+ /*
+ * Store the ID on the wrapped block for consistency.
+ * The printk_ringbuffer does not actually use it.
+ */
+ blk->id = id;
+
+ if (!wrapped) {
+ /*
+ * Since the allocated space is now in the newly
+ * created wrapping data block, copy the content
+ * from the old data block.
+ */
+ memcpy(&blk->data[0], &old_blk->data[0],
+ (blk_lpos->next - blk_lpos->begin) - sizeof(blk->id));
+ }
+ }
+
+ blk_lpos->next = next_lpos;
+
+ return &blk->data[0];
+}
+
+/* Return the number of bytes used by a data block. */
+static unsigned int space_used(struct prb_data_ring *data_ring,
+ struct prb_data_blk_lpos *blk_lpos)
+{
+ /* Data-less blocks take no space. */
+ if (BLK_DATALESS(blk_lpos))
+ return 0;
+
+ if (!is_blk_wrapped(data_ring, blk_lpos->begin, blk_lpos->next)) {
+ /* Data block does not wrap. */
+ return (DATA_INDEX(data_ring, blk_lpos->next) -
+ DATA_INDEX(data_ring, blk_lpos->begin));
+ }
+
+ /*
+ * For wrapping data blocks, the trailing (wasted) space is
+ * also counted.
+ */
+ return (DATA_INDEX(data_ring, blk_lpos->next) +
+ DATA_SIZE(data_ring) - DATA_INDEX(data_ring, blk_lpos->begin));
+}
+
+/*
+ * Given @blk_lpos, return a pointer to the writer data from the data block
+ * and calculate the size of the data part. A NULL pointer is returned if
+ * @blk_lpos specifies values that could never be legal.
+ *
+ * This function (used by readers) performs strict validation on the lpos
+ * values to possibly detect bugs in the writer code. A WARN_ON_ONCE() is
+ * triggered if an internal error is detected.
+ */
+static const char *get_data(struct prb_data_ring *data_ring,
+ struct prb_data_blk_lpos *blk_lpos,
+ unsigned int *data_size)
+{
+ struct prb_data_block *db;
+
+ /* Data-less data block description. */
+ if (BLK_DATALESS(blk_lpos)) {
+ /*
+ * Records that are just empty lines are also valid, even
+ * though they do not have a data block. For such records
+ * explicitly return empty string data to signify success.
+ */
+ if (blk_lpos->begin == EMPTY_LINE_LPOS &&
+ blk_lpos->next == EMPTY_LINE_LPOS) {
+ *data_size = 0;
+ return "";
+ }
+
+ /* Data lost, invalid, or otherwise unavailable. */
+ return NULL;
+ }
+
+ /* Regular data block: @begin and @next in the same wrap. */
+ if (!is_blk_wrapped(data_ring, blk_lpos->begin, blk_lpos->next)) {
+ db = to_block(data_ring, blk_lpos->begin);
+ *data_size = blk_lpos->next - blk_lpos->begin;
+
+ /* Wrapping data block: @begin is one wrap behind @next. */
+ } else if (!is_blk_wrapped(data_ring,
+ blk_lpos->begin + DATA_SIZE(data_ring),
+ blk_lpos->next)) {
+ db = to_block(data_ring, 0);
+ *data_size = DATA_INDEX(data_ring, blk_lpos->next);
+
+ /* Illegal block description. */
+ } else {
+ WARN_ON_ONCE(1);
+ return NULL;
+ }
+
+ /* Sanity check. Data-less blocks were handled earlier. */
+ if (WARN_ON_ONCE(!data_check_size(data_ring, *data_size) || !*data_size))
+ return NULL;
+
+ /* A valid data block will always be aligned to the ID size. */
+ if (WARN_ON_ONCE(blk_lpos->begin != ALIGN(blk_lpos->begin, sizeof(db->id))) ||
+ WARN_ON_ONCE(blk_lpos->next != ALIGN(blk_lpos->next, sizeof(db->id)))) {
+ return NULL;
+ }
+
+ /* A valid data block will always have at least an ID. */
+ if (WARN_ON_ONCE(*data_size < sizeof(db->id)))
+ return NULL;
+
+ /* Subtract block ID space from size to reflect data size. */
+ *data_size -= sizeof(db->id);
+
+ return &db->data[0];
+}
+
+/*
+ * Attempt to transition the newest descriptor from committed back to reserved
+ * so that the record can be modified by a writer again. This is only possible
+ * if the descriptor is not yet finalized and the provided @caller_id matches.
+ */
+static struct prb_desc *desc_reopen_last(struct prb_desc_ring *desc_ring,
+ u32 caller_id, unsigned long *id_out)
+{
+ unsigned long prev_state_val;
+ enum desc_state d_state;
+ struct prb_desc desc;
+ struct prb_desc *d;
+ unsigned long id;
+ u32 cid;
+
+ id = atomic_long_read(&desc_ring->head_id);
+
+ /*
+ * To reduce unnecessarily reopening, first check if the descriptor
+ * state and caller ID are correct.
+ */
+ d_state = desc_read(desc_ring, id, &desc, NULL, &cid);
+ if (d_state != desc_committed || cid != caller_id)
+ return NULL;
+
+ d = to_desc(desc_ring, id);
+
+ prev_state_val = DESC_SV(id, desc_committed);
+
+ /*
+ * Guarantee the reserved state is stored before reading any
+ * record data. A full memory barrier is needed because @state_var
+ * modification is followed by reading. This pairs with _prb_commit:B.
+ *
+ * Memory barrier involvement:
+ *
+ * If desc_reopen_last:A reads from _prb_commit:B, then
+ * prb_reserve_in_last:A reads from _prb_commit:A.
+ *
+ * Relies on:
+ *
+ * WMB from _prb_commit:A to _prb_commit:B
+ * matching
+ * MB If desc_reopen_last:A to prb_reserve_in_last:A
+ */
+ if (!atomic_long_try_cmpxchg(&d->state_var, &prev_state_val,
+ DESC_SV(id, desc_reserved))) { /* LMM(desc_reopen_last:A) */
+ return NULL;
+ }
+
+ *id_out = id;
+ return d;
+}
+
+/**
+ * prb_reserve_in_last() - Re-reserve and extend the space in the ringbuffer
+ * used by the newest record.
+ *
+ * @e: The entry structure to setup.
+ * @rb: The ringbuffer to re-reserve and extend data in.
+ * @r: The record structure to allocate buffers for.
+ * @caller_id: The caller ID of the caller (reserving writer).
+ * @max_size: Fail if the extended size would be greater than this.
+ *
+ * This is the public function available to writers to re-reserve and extend
+ * data.
+ *
+ * The writer specifies the text size to extend (not the new total size) by
+ * setting the @text_buf_size field of @r. To ensure proper initialization
+ * of @r, prb_rec_init_wr() should be used.
+ *
+ * This function will fail if @caller_id does not match the caller ID of the
+ * newest record. In that case the caller must reserve new data using
+ * prb_reserve().
+ *
+ * Context: Any context. Disables local interrupts on success.
+ * Return: true if text data could be extended, otherwise false.
+ *
+ * On success:
+ *
+ * - @r->text_buf points to the beginning of the entire text buffer.
+ *
+ * - @r->text_buf_size is set to the new total size of the buffer.
+ *
+ * - @r->info is not touched so that @r->info->text_len could be used
+ * to append the text.
+ *
+ * - prb_record_text_space() can be used on @e to query the new
+ * actually used space.
+ *
+ * Important: All @r->info fields will already be set with the current values
+ * for the record. I.e. @r->info->text_len will be less than
+ * @text_buf_size. Writers can use @r->info->text_len to know
+ * where concatenation begins and writers should update
+ * @r->info->text_len after concatenating.
+ */
+bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer *rb,
+ struct printk_record *r, u32 caller_id, unsigned int max_size)
+{
+ struct prb_desc_ring *desc_ring = &rb->desc_ring;
+ struct printk_info *info;
+ unsigned int data_size;
+ struct prb_desc *d;
+ unsigned long id;
+
+ local_irq_save(e->irqflags);
+
+ /* Transition the newest descriptor back to the reserved state. */
+ d = desc_reopen_last(desc_ring, caller_id, &id);
+ if (!d) {
+ local_irq_restore(e->irqflags);
+ goto fail_reopen;
+ }
+
+ /* Now the writer has exclusive access: LMM(prb_reserve_in_last:A) */
+
+ info = to_info(desc_ring, id);
+
+ /*
+ * Set the @e fields here so that prb_commit() can be used if
+ * anything fails from now on.
+ */
+ e->rb = rb;
+ e->id = id;
+
+ /*
+ * desc_reopen_last() checked the caller_id, but there was no
+ * exclusive access at that point. The descriptor may have
+ * changed since then.
+ */
+ if (caller_id != info->caller_id)
+ goto fail;
+
+ if (BLK_DATALESS(&d->text_blk_lpos)) {
+ if (WARN_ON_ONCE(info->text_len != 0)) {
+ pr_warn_once("wrong text_len value (%hu, expecting 0)\n",
+ info->text_len);
+ info->text_len = 0;
+ }
+
+ if (!data_check_size(&rb->text_data_ring, r->text_buf_size))
+ goto fail;
+
+ if (r->text_buf_size > max_size)
+ goto fail;
+
+ r->text_buf = data_alloc(rb, r->text_buf_size,
+ &d->text_blk_lpos, id);
+ } else {
+ if (!get_data(&rb->text_data_ring, &d->text_blk_lpos, &data_size))
+ goto fail;
+
+ /*
+ * Increase the buffer size to include the original size. If
+ * the meta data (@text_len) is not sane, use the full data
+ * block size.
+ */
+ if (WARN_ON_ONCE(info->text_len > data_size)) {
+ pr_warn_once("wrong text_len value (%hu, expecting <=%u)\n",
+ info->text_len, data_size);
+ info->text_len = data_size;
+ }
+ r->text_buf_size += info->text_len;
+
+ if (!data_check_size(&rb->text_data_ring, r->text_buf_size))
+ goto fail;
+
+ if (r->text_buf_size > max_size)
+ goto fail;
+
+ r->text_buf = data_realloc(rb, r->text_buf_size,
+ &d->text_blk_lpos, id);
+ }
+ if (r->text_buf_size && !r->text_buf)
+ goto fail;
+
+ r->info = info;
+
+ e->text_space = space_used(&rb->text_data_ring, &d->text_blk_lpos);
+
+ return true;
+fail:
+ prb_commit(e);
+ /* prb_commit() re-enabled interrupts. */
+fail_reopen:
+ /* Make it clear to the caller that the re-reserve failed. */
+ memset(r, 0, sizeof(*r));
+ return false;
+}
+
+/*
+ * @last_finalized_seq value guarantees that all records up to and including
+ * this sequence number are finalized and can be read. The only exception are
+ * too old records which have already been overwritten.
+ *
+ * It is also guaranteed that @last_finalized_seq only increases.
+ *
+ * Be aware that finalized records following non-finalized records are not
+ * reported because they are not yet available to the reader. For example,
+ * a new record stored via printk() will not be available to a printer if
+ * it follows a record that has not been finalized yet. However, once that
+ * non-finalized record becomes finalized, @last_finalized_seq will be
+ * appropriately updated and the full set of finalized records will be
+ * available to the printer. And since each printk() caller will either
+ * directly print or trigger deferred printing of all available unprinted
+ * records, all printk() messages will get printed.
+ */
+static u64 desc_last_finalized_seq(struct printk_ringbuffer *rb)
+{
+ struct prb_desc_ring *desc_ring = &rb->desc_ring;
+ unsigned long ulseq;
+
+ /*
+ * Guarantee the sequence number is loaded before loading the
+ * associated record in order to guarantee that the record can be
+ * seen by this CPU. This pairs with desc_update_last_finalized:A.
+ */
+ ulseq = atomic_long_read_acquire(&desc_ring->last_finalized_seq
+ ); /* LMM(desc_last_finalized_seq:A) */
+
+ return __ulseq_to_u64seq(rb, ulseq);
+}
+
+static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq,
+ struct printk_record *r, unsigned int *line_count);
+
+/*
+ * Check if there are records directly following @last_finalized_seq that are
+ * finalized. If so, update @last_finalized_seq to the latest of these
+ * records. It is not allowed to skip over records that are not yet finalized.
+ */
+static void desc_update_last_finalized(struct printk_ringbuffer *rb)
+{
+ struct prb_desc_ring *desc_ring = &rb->desc_ring;
+ u64 old_seq = desc_last_finalized_seq(rb);
+ unsigned long oldval;
+ unsigned long newval;
+ u64 finalized_seq;
+ u64 try_seq;
+
+try_again:
+ finalized_seq = old_seq;
+ try_seq = finalized_seq + 1;
+
+ /* Try to find later finalized records. */
+ while (_prb_read_valid(rb, &try_seq, NULL, NULL)) {
+ finalized_seq = try_seq;
+ try_seq++;
+ }
+
+ /* No update needed if no later finalized record was found. */
+ if (finalized_seq == old_seq)
+ return;
+
+ oldval = __u64seq_to_ulseq(old_seq);
+ newval = __u64seq_to_ulseq(finalized_seq);
+
+ /*
+ * Set the sequence number of a later finalized record that has been
+ * seen.
+ *
+ * Guarantee the record data is visible to other CPUs before storing
+ * its sequence number. This pairs with desc_last_finalized_seq:A.
+ *
+ * Memory barrier involvement:
+ *
+ * If desc_last_finalized_seq:A reads from
+ * desc_update_last_finalized:A, then desc_read:A reads from
+ * _prb_commit:B.
+ *
+ * Relies on:
+ *
+ * RELEASE from _prb_commit:B to desc_update_last_finalized:A
+ * matching
+ * ACQUIRE from desc_last_finalized_seq:A to desc_read:A
+ *
+ * Note: _prb_commit:B and desc_update_last_finalized:A can be
+ * different CPUs. However, the desc_update_last_finalized:A
+ * CPU (which performs the release) must have previously seen
+ * _prb_commit:B.
+ */
+ if (!atomic_long_try_cmpxchg_release(&desc_ring->last_finalized_seq,
+ &oldval, newval)) { /* LMM(desc_update_last_finalized:A) */
+ old_seq = __ulseq_to_u64seq(rb, oldval);
+ goto try_again;
+ }
+}
+
+/*
+ * Attempt to finalize a specified descriptor. If this fails, the descriptor
+ * is either already final or it will finalize itself when the writer commits.
+ */
+static void desc_make_final(struct printk_ringbuffer *rb, unsigned long id)
+{
+ struct prb_desc_ring *desc_ring = &rb->desc_ring;
+ unsigned long prev_state_val = DESC_SV(id, desc_committed);
+ struct prb_desc *d = to_desc(desc_ring, id);
+
+ if (atomic_long_try_cmpxchg_relaxed(&d->state_var, &prev_state_val,
+ DESC_SV(id, desc_finalized))) { /* LMM(desc_make_final:A) */
+ desc_update_last_finalized(rb);
+ }
+}
+
+/**
+ * prb_reserve() - Reserve space in the ringbuffer.
+ *
+ * @e: The entry structure to setup.
+ * @rb: The ringbuffer to reserve data in.
+ * @r: The record structure to allocate buffers for.
+ *
+ * This is the public function available to writers to reserve data.
+ *
+ * The writer specifies the text size to reserve by setting the
+ * @text_buf_size field of @r. To ensure proper initialization of @r,
+ * prb_rec_init_wr() should be used.
+ *
+ * Context: Any context. Disables local interrupts on success.
+ * Return: true if at least text data could be allocated, otherwise false.
+ *
+ * On success, the fields @info and @text_buf of @r will be set by this
+ * function and should be filled in by the writer before committing. Also
+ * on success, prb_record_text_space() can be used on @e to query the actual
+ * space used for the text data block.
+ *
+ * Important: @info->text_len needs to be set correctly by the writer in
+ * order for data to be readable and/or extended. Its value
+ * is initialized to 0.
+ */
+bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb,
+ struct printk_record *r)
+{
+ struct prb_desc_ring *desc_ring = &rb->desc_ring;
+ struct printk_info *info;
+ struct prb_desc *d;
+ unsigned long id;
+ u64 seq;
+
+ if (!data_check_size(&rb->text_data_ring, r->text_buf_size))
+ goto fail;
+
+ /*
+ * Descriptors in the reserved state act as blockers to all further
+ * reservations once the desc_ring has fully wrapped. Disable
+ * interrupts during the reserve/commit window in order to minimize
+ * the likelihood of this happening.
+ */
+ local_irq_save(e->irqflags);
+
+ if (!desc_reserve(rb, &id)) {
+ /* Descriptor reservation failures are tracked. */
+ atomic_long_inc(&rb->fail);
+ local_irq_restore(e->irqflags);
+ goto fail;
+ }
+
+ d = to_desc(desc_ring, id);
+ info = to_info(desc_ring, id);
+
+ /*
+ * All @info fields (except @seq) are cleared and must be filled in
+ * by the writer. Save @seq before clearing because it is used to
+ * determine the new sequence number.
+ */
+ seq = info->seq;
+ memset(info, 0, sizeof(*info));
+
+ /*
+ * Set the @e fields here so that prb_commit() can be used if
+ * text data allocation fails.
+ */
+ e->rb = rb;
+ e->id = id;
+
+ /*
+ * Initialize the sequence number if it has "never been set".
+ * Otherwise just increment it by a full wrap.
+ *
+ * @seq is considered "never been set" if it has a value of 0,
+ * _except_ for @infos[0], which was specially setup by the ringbuffer
+ * initializer and therefore is always considered as set.
+ *
+ * See the "Bootstrap" comment block in printk_ringbuffer.h for
+ * details about how the initializer bootstraps the descriptors.
+ */
+ if (seq == 0 && DESC_INDEX(desc_ring, id) != 0)
+ info->seq = DESC_INDEX(desc_ring, id);
+ else
+ info->seq = seq + DESCS_COUNT(desc_ring);
+
+ /*
+ * New data is about to be reserved. Once that happens, previous
+ * descriptors are no longer able to be extended. Finalize the
+ * previous descriptor now so that it can be made available to
+ * readers. (For seq==0 there is no previous descriptor.)
+ */
+ if (info->seq > 0)
+ desc_make_final(rb, DESC_ID(id - 1));
+
+ r->text_buf = data_alloc(rb, r->text_buf_size, &d->text_blk_lpos, id);
+ /* If text data allocation fails, a data-less record is committed. */
+ if (r->text_buf_size && !r->text_buf) {
+ prb_commit(e);
+ /* prb_commit() re-enabled interrupts. */
+ goto fail;
+ }
+
+ r->info = info;
+
+ /* Record full text space used by record. */
+ e->text_space = space_used(&rb->text_data_ring, &d->text_blk_lpos);
+
+ return true;
+fail:
+ /* Make it clear to the caller that the reserve failed. */
+ memset(r, 0, sizeof(*r));
+ return false;
+}
+EXPORT_SYMBOL_IF_KUNIT(prb_reserve);
+
+/* Commit the data (possibly finalizing it) and restore interrupts. */
+static void _prb_commit(struct prb_reserved_entry *e, unsigned long state_val)
+{
+ struct prb_desc_ring *desc_ring = &e->rb->desc_ring;
+ struct prb_desc *d = to_desc(desc_ring, e->id);
+ unsigned long prev_state_val = DESC_SV(e->id, desc_reserved);
+
+ /* Now the writer has finished all writing: LMM(_prb_commit:A) */
+
+ /*
+ * Set the descriptor as committed. See "ABA Issues" about why
+ * cmpxchg() instead of set() is used.
+ *
+ * 1 Guarantee all record data is stored before the descriptor state
+ * is stored as committed. A write memory barrier is sufficient
+ * for this. This pairs with desc_read:B and desc_reopen_last:A.
+ *
+ * 2. Guarantee the descriptor state is stored as committed before
+ * re-checking the head ID in order to possibly finalize this
+ * descriptor. This pairs with desc_reserve:D.
+ *
+ * Memory barrier involvement:
+ *
+ * If prb_commit:A reads from desc_reserve:D, then
+ * desc_make_final:A reads from _prb_commit:B.
+ *
+ * Relies on:
+ *
+ * MB _prb_commit:B to prb_commit:A
+ * matching
+ * MB desc_reserve:D to desc_make_final:A
+ */
+ if (!atomic_long_try_cmpxchg(&d->state_var, &prev_state_val,
+ DESC_SV(e->id, state_val))) { /* LMM(_prb_commit:B) */
+ WARN_ON_ONCE(1);
+ }
+
+ /* Restore interrupts, the reserve/commit window is finished. */
+ local_irq_restore(e->irqflags);
+}
+
+/**
+ * prb_commit() - Commit (previously reserved) data to the ringbuffer.
+ *
+ * @e: The entry containing the reserved data information.
+ *
+ * This is the public function available to writers to commit data.
+ *
+ * Note that the data is not yet available to readers until it is finalized.
+ * Finalizing happens automatically when space for the next record is
+ * reserved.
+ *
+ * See prb_final_commit() for a version of this function that finalizes
+ * immediately.
+ *
+ * Context: Any context. Enables local interrupts.
+ */
+void prb_commit(struct prb_reserved_entry *e)
+{
+ struct prb_desc_ring *desc_ring = &e->rb->desc_ring;
+ unsigned long head_id;
+
+ _prb_commit(e, desc_committed);
+
+ /*
+ * If this descriptor is no longer the head (i.e. a new record has
+ * been allocated), extending the data for this record is no longer
+ * allowed and therefore it must be finalized.
+ */
+ head_id = atomic_long_read(&desc_ring->head_id); /* LMM(prb_commit:A) */
+ if (head_id != e->id)
+ desc_make_final(e->rb, e->id);
+}
+EXPORT_SYMBOL_IF_KUNIT(prb_commit);
+
+/**
+ * prb_final_commit() - Commit and finalize (previously reserved) data to
+ * the ringbuffer.
+ *
+ * @e: The entry containing the reserved data information.
+ *
+ * This is the public function available to writers to commit+finalize data.
+ *
+ * By finalizing, the data is made immediately available to readers.
+ *
+ * This function should only be used if there are no intentions of extending
+ * this data using prb_reserve_in_last().
+ *
+ * Context: Any context. Enables local interrupts.
+ */
+void prb_final_commit(struct prb_reserved_entry *e)
+{
+ _prb_commit(e, desc_finalized);
+
+ desc_update_last_finalized(e->rb);
+}
+
+/*
+ * Count the number of lines in provided text. All text has at least 1 line
+ * (even if @text_size is 0). Each '\n' processed is counted as an additional
+ * line.
+ */
+static unsigned int count_lines(const char *text, unsigned int text_size)
+{
+ unsigned int next_size = text_size;
+ unsigned int line_count = 1;
+ const char *next = text;
+
+ while (next_size) {
+ next = memchr(next, '\n', next_size);
+ if (!next)
+ break;
+ line_count++;
+ next++;
+ next_size = text_size - (next - text);
+ }
+
+ return line_count;
+}
+
+/*
+ * Given @blk_lpos, copy an expected @len of data into the provided buffer.
+ * If @line_count is provided, count the number of lines in the data.
+ *
+ * This function (used by readers) performs strict validation on the data
+ * size to possibly detect bugs in the writer code. A WARN_ON_ONCE() is
+ * triggered if an internal error is detected.
+ */
+static bool copy_data(struct prb_data_ring *data_ring,
+ struct prb_data_blk_lpos *blk_lpos, u16 len, char *buf,
+ unsigned int buf_size, unsigned int *line_count)
+{
+ unsigned int data_size;
+ const char *data;
+
+ /* Caller might not want any data. */
+ if ((!buf || !buf_size) && !line_count)
+ return true;
+
+ data = get_data(data_ring, blk_lpos, &data_size);
+ if (!data)
+ return false;
+
+ /*
+ * Actual cannot be less than expected. It can be more than expected
+ * because of the trailing alignment padding.
+ *
+ * Note that invalid @len values can occur because the caller loads
+ * the value during an allowed data race.
+ */
+ if (data_size < (unsigned int)len)
+ return false;
+
+ /* Caller interested in the line count? */
+ if (line_count)
+ *line_count = count_lines(data, len);
+
+ /* Caller interested in the data content? */
+ if (!buf || !buf_size)
+ return true;
+
+ data_size = min_t(unsigned int, buf_size, len);
+
+ memcpy(&buf[0], data, data_size); /* LMM(copy_data:A) */
+ return true;
+}
+
+/*
+ * This is an extended version of desc_read(). It gets a copy of a specified
+ * descriptor. However, it also verifies that the record is finalized and has
+ * the sequence number @seq. On success, 0 is returned.
+ *
+ * Error return values:
+ * -EINVAL: A finalized record with sequence number @seq does not exist.
+ * -ENOENT: A finalized record with sequence number @seq exists, but its data
+ * is not available. This is a valid record, so readers should
+ * continue with the next record.
+ */
+static int desc_read_finalized_seq(struct prb_desc_ring *desc_ring,
+ unsigned long id, u64 seq,
+ struct prb_desc *desc_out)
+{
+ struct prb_data_blk_lpos *blk_lpos = &desc_out->text_blk_lpos;
+ enum desc_state d_state;
+ u64 s;
+
+ d_state = desc_read(desc_ring, id, desc_out, &s, NULL);
+
+ /*
+ * An unexpected @id (desc_miss) or @seq mismatch means the record
+ * does not exist. A descriptor in the reserved or committed state
+ * means the record does not yet exist for the reader.
+ */
+ if (d_state == desc_miss ||
+ d_state == desc_reserved ||
+ d_state == desc_committed ||
+ s != seq) {
+ return -EINVAL;
+ }
+
+ /*
+ * A descriptor in the reusable state may no longer have its data
+ * available; report it as existing but with lost data. Or the record
+ * may actually be a record with lost data.
+ */
+ if (d_state == desc_reusable ||
+ (blk_lpos->begin == FAILED_LPOS && blk_lpos->next == FAILED_LPOS)) {
+ return -ENOENT;
+ }
+
+ return 0;
+}
+
+/*
+ * Copy the ringbuffer data from the record with @seq to the provided
+ * @r buffer. On success, 0 is returned.
+ *
+ * See desc_read_finalized_seq() for error return values.
+ */
+static int prb_read(struct printk_ringbuffer *rb, u64 seq,
+ struct printk_record *r, unsigned int *line_count)
+{
+ struct prb_desc_ring *desc_ring = &rb->desc_ring;
+ struct printk_info *info = to_info(desc_ring, seq);
+ struct prb_desc *rdesc = to_desc(desc_ring, seq);
+ atomic_long_t *state_var = &rdesc->state_var;
+ struct prb_desc desc;
+ unsigned long id;
+ int err;
+
+ /* Extract the ID, used to specify the descriptor to read. */
+ id = DESC_ID(atomic_long_read(state_var));
+
+ /* Get a local copy of the correct descriptor (if available). */
+ err = desc_read_finalized_seq(desc_ring, id, seq, &desc);
+
+ /*
+ * If @r is NULL, the caller is only interested in the availability
+ * of the record.
+ */
+ if (err || !r)
+ return err;
+
+ /* If requested, copy meta data. */
+ if (r->info)
+ memcpy(r->info, info, sizeof(*(r->info)));
+
+ /* Copy text data. If it fails, this is a data-less record. */
+ if (!copy_data(&rb->text_data_ring, &desc.text_blk_lpos, info->text_len,
+ r->text_buf, r->text_buf_size, line_count)) {
+ return -ENOENT;
+ }
+
+ /* Ensure the record is still finalized and has the same @seq. */
+ return desc_read_finalized_seq(desc_ring, id, seq, &desc);
+}
+
+/* Get the sequence number of the tail descriptor. */
+u64 prb_first_seq(struct printk_ringbuffer *rb)
+{
+ struct prb_desc_ring *desc_ring = &rb->desc_ring;
+ enum desc_state d_state;
+ struct prb_desc desc;
+ unsigned long id;
+ u64 seq;
+
+ for (;;) {
+ id = atomic_long_read(&rb->desc_ring.tail_id); /* LMM(prb_first_seq:A) */
+
+ d_state = desc_read(desc_ring, id, &desc, &seq, NULL); /* LMM(prb_first_seq:B) */
+
+ /*
+ * This loop will not be infinite because the tail is
+ * _always_ in the finalized or reusable state.
+ */
+ if (d_state == desc_finalized || d_state == desc_reusable)
+ break;
+
+ /*
+ * Guarantee the last state load from desc_read() is before
+ * reloading @tail_id in order to see a new tail in the case
+ * that the descriptor has been recycled. This pairs with
+ * desc_reserve:D.
+ *
+ * Memory barrier involvement:
+ *
+ * If prb_first_seq:B reads from desc_reserve:F, then
+ * prb_first_seq:A reads from desc_push_tail:B.
+ *
+ * Relies on:
+ *
+ * MB from desc_push_tail:B to desc_reserve:F
+ * matching
+ * RMB prb_first_seq:B to prb_first_seq:A
+ */
+ smp_rmb(); /* LMM(prb_first_seq:C) */
+ }
+
+ return seq;
+}
+
+/**
+ * prb_next_reserve_seq() - Get the sequence number after the most recently
+ * reserved record.
+ *
+ * @rb: The ringbuffer to get the sequence number from.
+ *
+ * This is the public function available to readers to see what sequence
+ * number will be assigned to the next reserved record.
+ *
+ * Note that depending on the situation, this value can be equal to or
+ * higher than the sequence number returned by prb_next_seq().
+ *
+ * Context: Any context.
+ * Return: The sequence number that will be assigned to the next record
+ * reserved.
+ */
+u64 prb_next_reserve_seq(struct printk_ringbuffer *rb)
+{
+ struct prb_desc_ring *desc_ring = &rb->desc_ring;
+ unsigned long last_finalized_id;
+ atomic_long_t *state_var;
+ u64 last_finalized_seq;
+ unsigned long head_id;
+ struct prb_desc desc;
+ unsigned long diff;
+ struct prb_desc *d;
+ int err;
+
+ /*
+ * It may not be possible to read a sequence number for @head_id.
+ * So the ID of @last_finailzed_seq is used to calculate what the
+ * sequence number of @head_id will be.
+ */
+
+try_again:
+ last_finalized_seq = desc_last_finalized_seq(rb);
+
+ /*
+ * @head_id is loaded after @last_finalized_seq to ensure that
+ * it points to the record with @last_finalized_seq or newer.
+ *
+ * Memory barrier involvement:
+ *
+ * If desc_last_finalized_seq:A reads from
+ * desc_update_last_finalized:A, then
+ * prb_next_reserve_seq:A reads from desc_reserve:D.
+ *
+ * Relies on:
+ *
+ * RELEASE from desc_reserve:D to desc_update_last_finalized:A
+ * matching
+ * ACQUIRE from desc_last_finalized_seq:A to prb_next_reserve_seq:A
+ *
+ * Note: desc_reserve:D and desc_update_last_finalized:A can be
+ * different CPUs. However, the desc_update_last_finalized:A CPU
+ * (which performs the release) must have previously seen
+ * desc_read:C, which implies desc_reserve:D can be seen.
+ */
+ head_id = atomic_long_read(&desc_ring->head_id); /* LMM(prb_next_reserve_seq:A) */
+
+ d = to_desc(desc_ring, last_finalized_seq);
+ state_var = &d->state_var;
+
+ /* Extract the ID, used to specify the descriptor to read. */
+ last_finalized_id = DESC_ID(atomic_long_read(state_var));
+
+ /* Ensure @last_finalized_id is correct. */
+ err = desc_read_finalized_seq(desc_ring, last_finalized_id, last_finalized_seq, &desc);
+
+ if (err == -EINVAL) {
+ if (last_finalized_seq == 0) {
+ /*
+ * No record has been finalized or even reserved yet.
+ *
+ * The @head_id is initialized such that the first
+ * increment will yield the first record (seq=0).
+ * Handle it separately to avoid a negative @diff
+ * below.
+ */
+ if (head_id == DESC0_ID(desc_ring->count_bits))
+ return 0;
+
+ /*
+ * One or more descriptors are already reserved. Use
+ * the descriptor ID of the first one (@seq=0) for
+ * the @diff below.
+ */
+ last_finalized_id = DESC0_ID(desc_ring->count_bits) + 1;
+ } else {
+ /* Record must have been overwritten. Try again. */
+ goto try_again;
+ }
+ }
+
+ /* Diff of known descriptor IDs to compute related sequence numbers. */
+ diff = head_id - last_finalized_id;
+
+ /*
+ * @head_id points to the most recently reserved record, but this
+ * function returns the sequence number that will be assigned to the
+ * next (not yet reserved) record. Thus +1 is needed.
+ */
+ return (last_finalized_seq + diff + 1);
+}
+
+/*
+ * Non-blocking read of a record.
+ *
+ * On success @seq is updated to the record that was read and (if provided)
+ * @r and @line_count will contain the read/calculated data.
+ *
+ * On failure @seq is updated to a record that is not yet available to the
+ * reader, but it will be the next record available to the reader.
+ *
+ * Note: When the current CPU is in panic, this function will skip over any
+ * non-existent/non-finalized records in order to allow the panic CPU
+ * to print any and all records that have been finalized.
+ */
+static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq,
+ struct printk_record *r, unsigned int *line_count)
+{
+ u64 tail_seq;
+ int err;
+
+ while ((err = prb_read(rb, *seq, r, line_count))) {
+ tail_seq = prb_first_seq(rb);
+
+ if (*seq < tail_seq) {
+ /*
+ * Behind the tail. Catch up and try again. This
+ * can happen for -ENOENT and -EINVAL cases.
+ */
+ *seq = tail_seq;
+
+ } else if (err == -ENOENT) {
+ /* Record exists, but the data was lost. Skip. */
+ (*seq)++;
+
+ } else {
+ /*
+ * Non-existent/non-finalized record. Must stop.
+ *
+ * For panic situations it cannot be expected that
+ * non-finalized records will become finalized. But
+ * there may be other finalized records beyond that
+ * need to be printed for a panic situation. If this
+ * is the panic CPU, skip this
+ * non-existent/non-finalized record unless non-panic
+ * CPUs are still running and their debugging is
+ * explicitly enabled.
+ *
+ * Note that new messages printed on panic CPU are
+ * finalized when we are here. The only exception
+ * might be the last message without trailing newline.
+ * But it would have the sequence number returned
+ * by "prb_next_reserve_seq() - 1".
+ */
+ if (panic_on_this_cpu() &&
+ (!debug_non_panic_cpus || legacy_allow_panic_sync) &&
+ ((*seq + 1) < prb_next_reserve_seq(rb))) {
+ (*seq)++;
+ } else {
+ return false;
+ }
+ }
+ }
+
+ return true;
+}
+
+/**
+ * prb_read_valid() - Non-blocking read of a requested record or (if gone)
+ * the next available record.
+ *
+ * @rb: The ringbuffer to read from.
+ * @seq: The sequence number of the record to read.
+ * @r: A record data buffer to store the read record to.
+ *
+ * This is the public function available to readers to read a record.
+ *
+ * The reader provides the @info and @text_buf buffers of @r to be
+ * filled in. Any of the buffer pointers can be set to NULL if the reader
+ * is not interested in that data. To ensure proper initialization of @r,
+ * prb_rec_init_rd() should be used.
+ *
+ * Context: Any context.
+ * Return: true if a record was read, otherwise false.
+ *
+ * On success, the reader must check r->info.seq to see which record was
+ * actually read. This allows the reader to detect dropped records.
+ *
+ * Failure means @seq refers to a record not yet available to the reader.
+ */
+bool prb_read_valid(struct printk_ringbuffer *rb, u64 seq,
+ struct printk_record *r)
+{
+ return _prb_read_valid(rb, &seq, r, NULL);
+}
+EXPORT_SYMBOL_IF_KUNIT(prb_read_valid);
+
+/**
+ * prb_read_valid_info() - Non-blocking read of meta data for a requested
+ * record or (if gone) the next available record.
+ *
+ * @rb: The ringbuffer to read from.
+ * @seq: The sequence number of the record to read.
+ * @info: A buffer to store the read record meta data to.
+ * @line_count: A buffer to store the number of lines in the record text.
+ *
+ * This is the public function available to readers to read only the
+ * meta data of a record.
+ *
+ * The reader provides the @info, @line_count buffers to be filled in.
+ * Either of the buffer pointers can be set to NULL if the reader is not
+ * interested in that data.
+ *
+ * Context: Any context.
+ * Return: true if a record's meta data was read, otherwise false.
+ *
+ * On success, the reader must check info->seq to see which record meta data
+ * was actually read. This allows the reader to detect dropped records.
+ *
+ * Failure means @seq refers to a record not yet available to the reader.
+ */
+bool prb_read_valid_info(struct printk_ringbuffer *rb, u64 seq,
+ struct printk_info *info, unsigned int *line_count)
+{
+ struct printk_record r;
+
+ prb_rec_init_rd(&r, info, NULL, 0);
+
+ return _prb_read_valid(rb, &seq, &r, line_count);
+}
+
+/**
+ * prb_first_valid_seq() - Get the sequence number of the oldest available
+ * record.
+ *
+ * @rb: The ringbuffer to get the sequence number from.
+ *
+ * This is the public function available to readers to see what the
+ * first/oldest valid sequence number is.
+ *
+ * This provides readers a starting point to begin iterating the ringbuffer.
+ *
+ * Context: Any context.
+ * Return: The sequence number of the first/oldest record or, if the
+ * ringbuffer is empty, 0 is returned.
+ */
+u64 prb_first_valid_seq(struct printk_ringbuffer *rb)
+{
+ u64 seq = 0;
+
+ if (!_prb_read_valid(rb, &seq, NULL, NULL))
+ return 0;
+
+ return seq;
+}
+
+/**
+ * prb_next_seq() - Get the sequence number after the last available record.
+ *
+ * @rb: The ringbuffer to get the sequence number from.
+ *
+ * This is the public function available to readers to see what the next
+ * newest sequence number available to readers will be.
+ *
+ * This provides readers a sequence number to jump to if all currently
+ * available records should be skipped. It is guaranteed that all records
+ * previous to the returned value have been finalized and are (or were)
+ * available to the reader.
+ *
+ * Context: Any context.
+ * Return: The sequence number of the next newest (not yet available) record
+ * for readers.
+ */
+u64 prb_next_seq(struct printk_ringbuffer *rb)
+{
+ u64 seq;
+
+ seq = desc_last_finalized_seq(rb);
+
+ /*
+ * Begin searching after the last finalized record.
+ *
+ * On 0, the search must begin at 0 because of hack#2
+ * of the bootstrapping phase it is not known if a
+ * record at index 0 exists.
+ */
+ if (seq != 0)
+ seq++;
+
+ /*
+ * The information about the last finalized @seq might be inaccurate.
+ * Search forward to find the current one.
+ */
+ while (_prb_read_valid(rb, &seq, NULL, NULL))
+ seq++;
+
+ return seq;
+}
+
+/**
+ * prb_init() - Initialize a ringbuffer to use provided external buffers.
+ *
+ * @rb: The ringbuffer to initialize.
+ * @text_buf: The data buffer for text data.
+ * @textbits: The size of @text_buf as a power-of-2 value.
+ * @descs: The descriptor buffer for ringbuffer records.
+ * @descbits: The count of @descs items as a power-of-2 value.
+ * @infos: The printk_info buffer for ringbuffer records.
+ *
+ * This is the public function available to writers to setup a ringbuffer
+ * during runtime using provided buffers.
+ *
+ * This must match the initialization of DEFINE_PRINTKRB().
+ *
+ * Context: Any context.
+ */
+void prb_init(struct printk_ringbuffer *rb,
+ char *text_buf, unsigned int textbits,
+ struct prb_desc *descs, unsigned int descbits,
+ struct printk_info *infos)
+{
+ memset(descs, 0, _DESCS_COUNT(descbits) * sizeof(descs[0]));
+ memset(infos, 0, _DESCS_COUNT(descbits) * sizeof(infos[0]));
+
+ rb->desc_ring.count_bits = descbits;
+ rb->desc_ring.descs = descs;
+ rb->desc_ring.infos = infos;
+ atomic_long_set(&rb->desc_ring.head_id, DESC0_ID(descbits));
+ atomic_long_set(&rb->desc_ring.tail_id, DESC0_ID(descbits));
+ atomic_long_set(&rb->desc_ring.last_finalized_seq, 0);
+
+ rb->text_data_ring.size_bits = textbits;
+ rb->text_data_ring.data = text_buf;
+ atomic_long_set(&rb->text_data_ring.head_lpos, BLK0_LPOS(textbits));
+ atomic_long_set(&rb->text_data_ring.tail_lpos, BLK0_LPOS(textbits));
+
+ atomic_long_set(&rb->fail, 0);
+
+ atomic_long_set(&(descs[_DESCS_COUNT(descbits) - 1].state_var), DESC0_SV(descbits));
+ descs[_DESCS_COUNT(descbits) - 1].text_blk_lpos.begin = FAILED_LPOS;
+ descs[_DESCS_COUNT(descbits) - 1].text_blk_lpos.next = FAILED_LPOS;
+
+ infos[0].seq = -(u64)_DESCS_COUNT(descbits);
+ infos[_DESCS_COUNT(descbits) - 1].seq = 0;
+}
+EXPORT_SYMBOL_IF_KUNIT(prb_init);
+
+/**
+ * prb_record_text_space() - Query the full actual used ringbuffer space for
+ * the text data of a reserved entry.
+ *
+ * @e: The successfully reserved entry to query.
+ *
+ * This is the public function available to writers to see how much actual
+ * space is used in the ringbuffer to store the text data of the specified
+ * entry.
+ *
+ * This function is only valid if @e has been successfully reserved using
+ * prb_reserve().
+ *
+ * Context: Any context.
+ * Return: The size in bytes used by the text data of the associated record.
+ */
+unsigned int prb_record_text_space(struct prb_reserved_entry *e)
+{
+ return e->text_space;
+}
diff --git a/kernel/printk/printk_ringbuffer.h b/kernel/printk/printk_ringbuffer.h
new file mode 100644
index 000000000000..4ef81349d9fb
--- /dev/null
+++ b/kernel/printk/printk_ringbuffer.h
@@ -0,0 +1,437 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _KERNEL_PRINTK_RINGBUFFER_H
+#define _KERNEL_PRINTK_RINGBUFFER_H
+
+#include <linux/atomic.h>
+#include <linux/bits.h>
+#include <linux/dev_printk.h>
+#include <linux/stddef.h>
+#include <linux/types.h>
+
+/*
+ * Meta information about each stored message.
+ *
+ * All fields are set by the printk code except for @seq, which is
+ * set by the ringbuffer code.
+ */
+struct printk_info {
+ u64 seq; /* sequence number */
+ u64 ts_nsec; /* timestamp in nanoseconds */
+ u16 text_len; /* length of text message */
+ u8 facility; /* syslog facility */
+ u8 flags:5; /* internal record flags */
+ u8 level:3; /* syslog level */
+ u32 caller_id; /* thread id or processor id */
+
+ struct dev_printk_info dev_info;
+};
+
+/*
+ * A structure providing the buffers, used by writers and readers.
+ *
+ * Writers:
+ * Using prb_rec_init_wr(), a writer sets @text_buf_size before calling
+ * prb_reserve(). On success, prb_reserve() sets @info and @text_buf to
+ * buffers reserved for that writer.
+ *
+ * Readers:
+ * Using prb_rec_init_rd(), a reader sets all fields before calling
+ * prb_read_valid(). Note that the reader provides the @info and @text_buf,
+ * buffers. On success, the struct pointed to by @info will be filled and
+ * the char array pointed to by @text_buf will be filled with text data.
+ */
+struct printk_record {
+ struct printk_info *info;
+ char *text_buf;
+ unsigned int text_buf_size;
+};
+
+/* Specifies the logical position and span of a data block. */
+struct prb_data_blk_lpos {
+ unsigned long begin;
+ unsigned long next;
+};
+
+/*
+ * A descriptor: the complete meta-data for a record.
+ *
+ * @state_var: A bitwise combination of descriptor ID and descriptor state.
+ */
+struct prb_desc {
+ atomic_long_t state_var;
+ struct prb_data_blk_lpos text_blk_lpos;
+};
+
+/* A ringbuffer of "ID + data" elements. */
+struct prb_data_ring {
+ unsigned int size_bits;
+ char *data;
+ atomic_long_t head_lpos;
+ atomic_long_t tail_lpos;
+};
+
+/* A ringbuffer of "struct prb_desc" elements. */
+struct prb_desc_ring {
+ unsigned int count_bits;
+ struct prb_desc *descs;
+ struct printk_info *infos;
+ atomic_long_t head_id;
+ atomic_long_t tail_id;
+ atomic_long_t last_finalized_seq;
+};
+
+/*
+ * The high level structure representing the printk ringbuffer.
+ *
+ * @fail: Count of failed prb_reserve() calls where not even a data-less
+ * record was created.
+ */
+struct printk_ringbuffer {
+ struct prb_desc_ring desc_ring;
+ struct prb_data_ring text_data_ring;
+ atomic_long_t fail;
+};
+
+/*
+ * Used by writers as a reserve/commit handle.
+ *
+ * @rb: Ringbuffer where the entry is reserved.
+ * @irqflags: Saved irq flags to restore on entry commit.
+ * @id: ID of the reserved descriptor.
+ * @text_space: Total occupied buffer space in the text data ring, including
+ * ID, alignment padding, and wrapping data blocks.
+ *
+ * This structure is an opaque handle for writers. Its contents are only
+ * to be used by the ringbuffer implementation.
+ */
+struct prb_reserved_entry {
+ struct printk_ringbuffer *rb;
+ unsigned long irqflags;
+ unsigned long id;
+ unsigned int text_space;
+};
+
+/* The possible responses of a descriptor state-query. */
+enum desc_state {
+ desc_miss = -1, /* ID mismatch (pseudo state) */
+ desc_reserved = 0x0, /* reserved, in use by writer */
+ desc_committed = 0x1, /* committed by writer, could get reopened */
+ desc_finalized = 0x2, /* committed, no further modification allowed */
+ desc_reusable = 0x3, /* free, not yet used by any writer */
+};
+
+#define _DATA_SIZE(sz_bits) (1UL << (sz_bits))
+#define _DESCS_COUNT(ct_bits) (1U << (ct_bits))
+#define DESC_SV_BITS BITS_PER_LONG
+#define DESC_FLAGS_SHIFT (DESC_SV_BITS - 2)
+#define DESC_FLAGS_MASK (3UL << DESC_FLAGS_SHIFT)
+#define DESC_STATE(sv) (3UL & (sv >> DESC_FLAGS_SHIFT))
+#define DESC_SV(id, state) (((unsigned long)state << DESC_FLAGS_SHIFT) | id)
+#define DESC_ID_MASK (~DESC_FLAGS_MASK)
+#define DESC_ID(sv) ((sv) & DESC_ID_MASK)
+
+/*
+ * Special data block logical position values (for fields of
+ * @prb_desc.text_blk_lpos).
+ *
+ * - Bit0 is used to identify if the record has no data block. (Implemented in
+ * the LPOS_DATALESS() macro.)
+ *
+ * - Bit1 specifies the reason for not having a data block.
+ *
+ * These special values could never be real lpos values because of the
+ * meta data and alignment padding of data blocks. (See to_blk_size() for
+ * details.)
+ */
+#define FAILED_LPOS 0x1
+#define EMPTY_LINE_LPOS 0x3
+
+#define FAILED_BLK_LPOS \
+{ \
+ .begin = FAILED_LPOS, \
+ .next = FAILED_LPOS, \
+}
+
+/*
+ * Descriptor Bootstrap
+ *
+ * The descriptor array is minimally initialized to allow immediate usage
+ * by readers and writers. The requirements that the descriptor array
+ * initialization must satisfy:
+ *
+ * Req1
+ * The tail must point to an existing (committed or reusable) descriptor.
+ * This is required by the implementation of prb_first_seq().
+ *
+ * Req2
+ * Readers must see that the ringbuffer is initially empty.
+ *
+ * Req3
+ * The first record reserved by a writer is assigned sequence number 0.
+ *
+ * To satisfy Req1, the tail initially points to a descriptor that is
+ * minimally initialized (having no data block, i.e. data-less with the
+ * data block's lpos @begin and @next values set to FAILED_LPOS).
+ *
+ * To satisfy Req2, the initial tail descriptor is initialized to the
+ * reusable state. Readers recognize reusable descriptors as existing
+ * records, but skip over them.
+ *
+ * To satisfy Req3, the last descriptor in the array is used as the initial
+ * head (and tail) descriptor. This allows the first record reserved by a
+ * writer (head + 1) to be the first descriptor in the array. (Only the first
+ * descriptor in the array could have a valid sequence number of 0.)
+ *
+ * The first time a descriptor is reserved, it is assigned a sequence number
+ * with the value of the array index. A "first time reserved" descriptor can
+ * be recognized because it has a sequence number of 0 but does not have an
+ * index of 0. (Only the first descriptor in the array could have a valid
+ * sequence number of 0.) After the first reservation, all future reservations
+ * (recycling) simply involve incrementing the sequence number by the array
+ * count.
+ *
+ * Hack #1
+ * Only the first descriptor in the array is allowed to have the sequence
+ * number 0. In this case it is not possible to recognize if it is being
+ * reserved the first time (set to index value) or has been reserved
+ * previously (increment by the array count). This is handled by _always_
+ * incrementing the sequence number by the array count when reserving the
+ * first descriptor in the array. In order to satisfy Req3, the sequence
+ * number of the first descriptor in the array is initialized to minus
+ * the array count. Then, upon the first reservation, it is incremented
+ * to 0, thus satisfying Req3.
+ *
+ * Hack #2
+ * prb_first_seq() can be called at any time by readers to retrieve the
+ * sequence number of the tail descriptor. However, due to Req2 and Req3,
+ * initially there are no records to report the sequence number of
+ * (sequence numbers are u64 and there is nothing less than 0). To handle
+ * this, the sequence number of the initial tail descriptor is initialized
+ * to 0. Technically this is incorrect, because there is no record with
+ * sequence number 0 (yet) and the tail descriptor is not the first
+ * descriptor in the array. But it allows prb_read_valid() to correctly
+ * report the existence of a record for _any_ given sequence number at all
+ * times. Bootstrapping is complete when the tail is pushed the first
+ * time, thus finally pointing to the first descriptor reserved by a
+ * writer, which has the assigned sequence number 0.
+ */
+
+/*
+ * Initiating Logical Value Overflows
+ *
+ * Both logical position (lpos) and ID values can be mapped to array indexes
+ * but may experience overflows during the lifetime of the system. To ensure
+ * that printk_ringbuffer can handle the overflows for these types, initial
+ * values are chosen that map to the correct initial array indexes, but will
+ * result in overflows soon.
+ *
+ * BLK0_LPOS
+ * The initial @head_lpos and @tail_lpos for data rings. It is at index
+ * 0 and the lpos value is such that it will overflow on the first wrap.
+ *
+ * DESC0_ID
+ * The initial @head_id and @tail_id for the desc ring. It is at the last
+ * index of the descriptor array (see Req3 above) and the ID value is such
+ * that it will overflow on the second wrap.
+ */
+#define BLK0_LPOS(sz_bits) (-(_DATA_SIZE(sz_bits)))
+#define DESC0_ID(ct_bits) DESC_ID(-(_DESCS_COUNT(ct_bits) + 1))
+#define DESC0_SV(ct_bits) DESC_SV(DESC0_ID(ct_bits), desc_reusable)
+
+/*
+ * Define a ringbuffer with an external text data buffer. The same as
+ * DEFINE_PRINTKRB() but requires specifying an external buffer for the
+ * text data.
+ *
+ * Note: The specified external buffer must be of the size:
+ * 2 ^ (descbits + avgtextbits)
+ */
+#define _DEFINE_PRINTKRB(name, descbits, avgtextbits, text_buf) \
+static struct prb_desc _##name##_descs[_DESCS_COUNT(descbits)] = { \
+ /* the initial head and tail */ \
+ [_DESCS_COUNT(descbits) - 1] = { \
+ /* reusable */ \
+ .state_var = ATOMIC_INIT(DESC0_SV(descbits)), \
+ /* no associated data block */ \
+ .text_blk_lpos = FAILED_BLK_LPOS, \
+ }, \
+}; \
+static struct printk_info _##name##_infos[_DESCS_COUNT(descbits)] = { \
+ /* this will be the first record reserved by a writer */ \
+ [0] = { \
+ /* will be incremented to 0 on the first reservation */ \
+ .seq = -(u64)_DESCS_COUNT(descbits), \
+ }, \
+ /* the initial head and tail */ \
+ [_DESCS_COUNT(descbits) - 1] = { \
+ /* reports the first seq value during the bootstrap phase */ \
+ .seq = 0, \
+ }, \
+}; \
+static struct printk_ringbuffer name = { \
+ .desc_ring = { \
+ .count_bits = descbits, \
+ .descs = &_##name##_descs[0], \
+ .infos = &_##name##_infos[0], \
+ .head_id = ATOMIC_INIT(DESC0_ID(descbits)), \
+ .tail_id = ATOMIC_INIT(DESC0_ID(descbits)), \
+ .last_finalized_seq = ATOMIC_INIT(0), \
+ }, \
+ .text_data_ring = { \
+ .size_bits = (avgtextbits) + (descbits), \
+ .data = text_buf, \
+ .head_lpos = ATOMIC_LONG_INIT(BLK0_LPOS((avgtextbits) + (descbits))), \
+ .tail_lpos = ATOMIC_LONG_INIT(BLK0_LPOS((avgtextbits) + (descbits))), \
+ }, \
+ .fail = ATOMIC_LONG_INIT(0), \
+}
+
+/**
+ * DEFINE_PRINTKRB() - Define a ringbuffer.
+ *
+ * @name: The name of the ringbuffer variable.
+ * @descbits: The number of descriptors as a power-of-2 value.
+ * @avgtextbits: The average text data size per record as a power-of-2 value.
+ *
+ * This is a macro for defining a ringbuffer and all internal structures
+ * such that it is ready for immediate use. See _DEFINE_PRINTKRB() for a
+ * variant where the text data buffer can be specified externally.
+ */
+#define DEFINE_PRINTKRB(name, descbits, avgtextbits) \
+static char _##name##_text[1U << ((avgtextbits) + (descbits))] \
+ __aligned(__alignof__(unsigned long)); \
+_DEFINE_PRINTKRB(name, descbits, avgtextbits, &_##name##_text[0])
+
+/* Writer Interface */
+
+/**
+ * prb_rec_init_wr() - Initialize a buffer for writing records.
+ *
+ * @r: The record to initialize.
+ * @text_buf_size: The needed text buffer size.
+ */
+static inline void prb_rec_init_wr(struct printk_record *r,
+ unsigned int text_buf_size)
+{
+ r->info = NULL;
+ r->text_buf = NULL;
+ r->text_buf_size = text_buf_size;
+}
+
+bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb,
+ struct printk_record *r);
+bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer *rb,
+ struct printk_record *r, u32 caller_id, unsigned int max_size);
+void prb_commit(struct prb_reserved_entry *e);
+void prb_final_commit(struct prb_reserved_entry *e);
+
+void prb_init(struct printk_ringbuffer *rb,
+ char *text_buf, unsigned int text_buf_size,
+ struct prb_desc *descs, unsigned int descs_count_bits,
+ struct printk_info *infos);
+unsigned int prb_record_text_space(struct prb_reserved_entry *e);
+
+/* Reader Interface */
+
+/**
+ * prb_rec_init_rd() - Initialize a buffer for reading records.
+ *
+ * @r: The record to initialize.
+ * @info: A buffer to store record meta-data.
+ * @text_buf: A buffer to store text data.
+ * @text_buf_size: The size of @text_buf.
+ *
+ * Initialize all the fields that a reader is interested in. All arguments
+ * (except @r) are optional. Only record data for arguments that are
+ * non-NULL or non-zero will be read.
+ */
+static inline void prb_rec_init_rd(struct printk_record *r,
+ struct printk_info *info,
+ char *text_buf, unsigned int text_buf_size)
+{
+ r->info = info;
+ r->text_buf = text_buf;
+ r->text_buf_size = text_buf_size;
+}
+
+/**
+ * prb_for_each_record() - Iterate over the records of a ringbuffer.
+ *
+ * @from: The sequence number to begin with.
+ * @rb: The ringbuffer to iterate over.
+ * @s: A u64 to store the sequence number on each iteration.
+ * @r: A printk_record to store the record on each iteration.
+ *
+ * This is a macro for conveniently iterating over a ringbuffer.
+ * Note that @s may not be the sequence number of the record on each
+ * iteration. For the sequence number, @r->info->seq should be checked.
+ *
+ * Context: Any context.
+ */
+#define prb_for_each_record(from, rb, s, r) \
+for ((s) = from; prb_read_valid(rb, s, r); (s) = (r)->info->seq + 1)
+
+/**
+ * prb_for_each_info() - Iterate over the meta data of a ringbuffer.
+ *
+ * @from: The sequence number to begin with.
+ * @rb: The ringbuffer to iterate over.
+ * @s: A u64 to store the sequence number on each iteration.
+ * @i: A printk_info to store the record meta data on each iteration.
+ * @lc: An unsigned int to store the text line count of each record.
+ *
+ * This is a macro for conveniently iterating over a ringbuffer.
+ * Note that @s may not be the sequence number of the record on each
+ * iteration. For the sequence number, @r->info->seq should be checked.
+ *
+ * Context: Any context.
+ */
+#define prb_for_each_info(from, rb, s, i, lc) \
+for ((s) = from; prb_read_valid_info(rb, s, i, lc); (s) = (i)->seq + 1)
+
+bool prb_read_valid(struct printk_ringbuffer *rb, u64 seq,
+ struct printk_record *r);
+bool prb_read_valid_info(struct printk_ringbuffer *rb, u64 seq,
+ struct printk_info *info, unsigned int *line_count);
+
+u64 prb_first_seq(struct printk_ringbuffer *rb);
+u64 prb_first_valid_seq(struct printk_ringbuffer *rb);
+u64 prb_next_seq(struct printk_ringbuffer *rb);
+u64 prb_next_reserve_seq(struct printk_ringbuffer *rb);
+
+#ifdef CONFIG_64BIT
+
+#define __u64seq_to_ulseq(u64seq) (u64seq)
+#define __ulseq_to_u64seq(rb, ulseq) (ulseq)
+#define ULSEQ_MAX(rb) (-1)
+
+#else /* CONFIG_64BIT */
+
+#define __u64seq_to_ulseq(u64seq) ((u32)u64seq)
+#define ULSEQ_MAX(rb) __u64seq_to_ulseq(prb_first_seq(rb) + 0x80000000UL)
+
+static inline u64 __ulseq_to_u64seq(struct printk_ringbuffer *rb, u32 ulseq)
+{
+ u64 rb_first_seq = prb_first_seq(rb);
+ u64 seq;
+
+ /*
+ * The provided sequence is only the lower 32 bits of the ringbuffer
+ * sequence. It needs to be expanded to 64bit. Get the first sequence
+ * number from the ringbuffer and fold it.
+ *
+ * Having a 32bit representation in the console is sufficient.
+ * If a console ever gets more than 2^31 records behind
+ * the ringbuffer then this is the least of the problems.
+ *
+ * Also the access to the ring buffer is always safe.
+ */
+ seq = rb_first_seq - (s32)((u32)rb_first_seq - ulseq);
+
+ return seq;
+}
+
+#endif /* CONFIG_64BIT */
+
+#endif /* _KERNEL_PRINTK_RINGBUFFER_H */
diff --git a/kernel/printk/printk_ringbuffer_kunit_test.c b/kernel/printk/printk_ringbuffer_kunit_test.c
new file mode 100644
index 000000000000..2282348e869a
--- /dev/null
+++ b/kernel/printk/printk_ringbuffer_kunit_test.c
@@ -0,0 +1,327 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/cpuhplock.h>
+#include <linux/cpumask.h>
+#include <linux/init.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <linux/timer.h>
+#include <linux/wait.h>
+
+#include <kunit/resource.h>
+#include <kunit/test.h>
+
+#include "printk_ringbuffer.h"
+
+/*
+ * This KUnit tests the data integrity of the lockless printk_ringbuffer.
+ * From multiple CPUs it writes messages of varying length and content while
+ * a reader validates the correctness of the messages.
+ *
+ * IMPORTANT: The more CPUs you can use for this KUnit, the better!
+ *
+ * The test works by starting "num_online_cpus() - 1" writer threads, each
+ * pinned to their own CPU. Each writer thread loops, writing data of varying
+ * length into a printk_ringbuffer as fast as possible. The data content is
+ * an embedded data struct followed by string content repeating the byte:
+ *
+ * 'A' + CPUID
+ *
+ * The reader is running on the remaining online CPU, or if there is only one
+ * CPU on the same as the writer.
+ * It ensures that the embedded struct content is consistent with the string
+ * and that the string * is terminated and is composed of the same repeating
+ * byte as its first byte.
+ *
+ * Because the threads are running in such tight loops, they will call
+ * cond_resched() from time to time so the system stays functional.
+ *
+ * If the reader encounters an error, the test is aborted and some
+ * information about the error is reported.
+ * The runtime of the test can be configured with the runtime_ms module parameter.
+ *
+ * Note that the test is performed on a separate printk_ringbuffer instance
+ * and not the instance used by printk().
+ */
+
+static unsigned long runtime_ms = 10 * MSEC_PER_SEC;
+module_param(runtime_ms, ulong, 0400);
+
+/* test data structure */
+struct prbtest_rbdata {
+ unsigned int size;
+ char text[] __counted_by(size);
+};
+
+#define MAX_RBDATA_TEXT_SIZE 0x80
+#define MAX_PRB_RECORD_SIZE (sizeof(struct prbtest_rbdata) + MAX_RBDATA_TEXT_SIZE)
+
+struct prbtest_data {
+ struct kunit *test;
+ struct printk_ringbuffer *ringbuffer;
+ /* used by writers to signal reader of new records */
+ wait_queue_head_t new_record_wait;
+};
+
+struct prbtest_thread_data {
+ unsigned long num;
+ struct prbtest_data *test_data;
+};
+
+static void prbtest_fail_record(struct kunit *test, const struct prbtest_rbdata *dat, u64 seq)
+{
+ unsigned int len;
+
+ len = dat->size - 1;
+
+ KUNIT_FAIL(test, "BAD RECORD: seq=%llu size=%u text=%.*s\n",
+ seq, dat->size,
+ len < MAX_RBDATA_TEXT_SIZE ? len : -1,
+ len < MAX_RBDATA_TEXT_SIZE ? dat->text : "<invalid>");
+}
+
+static bool prbtest_check_data(const struct prbtest_rbdata *dat)
+{
+ unsigned int len;
+
+ /* Sane size? At least one character + trailing '\0' */
+ if (dat->size < 2 || dat->size > MAX_RBDATA_TEXT_SIZE)
+ return false;
+
+ len = dat->size - 1;
+ if (dat->text[len] != '\0')
+ return false;
+
+ /* String repeats with the same character? */
+ while (len--) {
+ if (dat->text[len] != dat->text[0])
+ return false;
+ }
+
+ return true;
+}
+
+static int prbtest_writer(void *data)
+{
+ struct prbtest_thread_data *tr = data;
+ char text_id = 'A' + tr->num;
+ struct prb_reserved_entry e;
+ struct prbtest_rbdata *dat;
+ u32 record_size, text_size;
+ unsigned long count = 0;
+ struct printk_record r;
+
+ kunit_info(tr->test_data->test, "start thread %03lu (writer)\n", tr->num);
+
+ for (;;) {
+ /* ensure at least 1 character + trailing '\0' */
+ text_size = get_random_u32_inclusive(2, MAX_RBDATA_TEXT_SIZE);
+ if (WARN_ON_ONCE(text_size < 2))
+ text_size = 2;
+ if (WARN_ON_ONCE(text_size > MAX_RBDATA_TEXT_SIZE))
+ text_size = MAX_RBDATA_TEXT_SIZE;
+
+ record_size = sizeof(struct prbtest_rbdata) + text_size;
+ WARN_ON_ONCE(record_size > MAX_PRB_RECORD_SIZE);
+
+ /* specify the text sizes for reservation */
+ prb_rec_init_wr(&r, record_size);
+
+ /*
+ * Reservation can fail if:
+ *
+ * - No free descriptor is available.
+ * - The buffer is full, and the oldest record is reserved
+ * but not yet committed.
+ *
+ * It actually happens in this test because all CPUs are trying
+ * to write an unbounded number of messages in a tight loop.
+ * These failures are intentionally ignored because this test
+ * focuses on races, ringbuffer consistency, and pushing system
+ * usability limits.
+ */
+ if (prb_reserve(&e, tr->test_data->ringbuffer, &r)) {
+ r.info->text_len = record_size;
+
+ dat = (struct prbtest_rbdata *)r.text_buf;
+ dat->size = text_size;
+ memset(dat->text, text_id, text_size - 1);
+ dat->text[text_size - 1] = '\0';
+
+ prb_commit(&e);
+
+ wake_up_interruptible(&tr->test_data->new_record_wait);
+ }
+
+ if ((count++ & 0x3fff) == 0)
+ cond_resched();
+
+ if (kthread_should_stop())
+ break;
+ }
+
+ kunit_info(tr->test_data->test, "end thread %03lu: wrote=%lu\n", tr->num, count);
+
+ return 0;
+}
+
+struct prbtest_wakeup_timer {
+ struct timer_list timer;
+ struct task_struct *task;
+};
+
+static void prbtest_wakeup_callback(struct timer_list *timer)
+{
+ struct prbtest_wakeup_timer *wakeup = timer_container_of(wakeup, timer, timer);
+
+ set_tsk_thread_flag(wakeup->task, TIF_NOTIFY_SIGNAL);
+ wake_up_process(wakeup->task);
+}
+
+static int prbtest_reader(struct prbtest_data *test_data, unsigned long timeout_ms)
+{
+ struct prbtest_wakeup_timer wakeup;
+ char text_buf[MAX_PRB_RECORD_SIZE];
+ unsigned long count = 0;
+ struct printk_info info;
+ struct printk_record r;
+ u64 seq = 0;
+
+ wakeup.task = current;
+ timer_setup_on_stack(&wakeup.timer, prbtest_wakeup_callback, 0);
+ mod_timer(&wakeup.timer, jiffies + msecs_to_jiffies(timeout_ms));
+
+ prb_rec_init_rd(&r, &info, text_buf, sizeof(text_buf));
+
+ kunit_info(test_data->test, "start reader\n");
+
+ while (!wait_event_interruptible(test_data->new_record_wait,
+ prb_read_valid(test_data->ringbuffer, seq, &r))) {
+ /* check/track the sequence */
+ if (info.seq < seq)
+ KUNIT_FAIL(test_data->test, "BAD SEQ READ: request=%llu read=%llu\n",
+ seq, info.seq);
+
+ if (!prbtest_check_data((struct prbtest_rbdata *)r.text_buf))
+ prbtest_fail_record(test_data->test,
+ (struct prbtest_rbdata *)r.text_buf, info.seq);
+
+ if ((count++ & 0x3fff) == 0)
+ cond_resched();
+
+ seq = info.seq + 1;
+ }
+
+ timer_delete_sync(&wakeup.timer);
+ timer_destroy_on_stack(&wakeup.timer);
+
+ kunit_info(test_data->test, "end reader: read=%lu seq=%llu\n", count, info.seq);
+
+ return 0;
+}
+
+KUNIT_DEFINE_ACTION_WRAPPER(prbtest_cpumask_cleanup, free_cpumask_var, struct cpumask *);
+KUNIT_DEFINE_ACTION_WRAPPER(prbtest_kthread_cleanup, kthread_stop, struct task_struct *);
+
+static void prbtest_add_cpumask_cleanup(struct kunit *test, cpumask_var_t mask)
+{
+ int err;
+
+ err = kunit_add_action_or_reset(test, prbtest_cpumask_cleanup, mask);
+ KUNIT_ASSERT_EQ(test, err, 0);
+}
+
+static void prbtest_add_kthread_cleanup(struct kunit *test, struct task_struct *kthread)
+{
+ int err;
+
+ err = kunit_add_action_or_reset(test, prbtest_kthread_cleanup, kthread);
+ KUNIT_ASSERT_EQ(test, err, 0);
+}
+
+static inline void prbtest_prb_reinit(struct printk_ringbuffer *rb)
+{
+ prb_init(rb, rb->text_data_ring.data, rb->text_data_ring.size_bits, rb->desc_ring.descs,
+ rb->desc_ring.count_bits, rb->desc_ring.infos);
+}
+
+static void test_readerwriter(struct kunit *test)
+{
+ /* Equivalent to CONFIG_LOG_BUF_SHIFT=13 */
+ DEFINE_PRINTKRB(test_rb, 8, 5);
+
+ struct prbtest_thread_data *thread_data;
+ struct prbtest_data *test_data;
+ struct task_struct *thread;
+ cpumask_var_t test_cpus;
+ int cpu, reader_cpu;
+
+ KUNIT_ASSERT_TRUE(test, alloc_cpumask_var(&test_cpus, GFP_KERNEL));
+ prbtest_add_cpumask_cleanup(test, test_cpus);
+
+ cpus_read_lock();
+ /*
+ * Failure of KUNIT_ASSERT() kills the current task
+ * so it can not be called while the CPU hotplug lock is held.
+ * Instead use a snapshot of the online CPUs.
+ * If they change during test execution it is unfortunate but not a grave error.
+ */
+ cpumask_copy(test_cpus, cpu_online_mask);
+ cpus_read_unlock();
+
+ /* One CPU is for the reader, all others are writers */
+ reader_cpu = cpumask_first(test_cpus);
+ if (cpumask_weight(test_cpus) == 1)
+ kunit_warn(test, "more than one CPU is recommended");
+ else
+ cpumask_clear_cpu(reader_cpu, test_cpus);
+
+ /* KUnit test can get restarted more times. */
+ prbtest_prb_reinit(&test_rb);
+
+ test_data = kunit_kmalloc(test, sizeof(*test_data), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, test_data);
+ test_data->test = test;
+ test_data->ringbuffer = &test_rb;
+ init_waitqueue_head(&test_data->new_record_wait);
+
+ kunit_info(test, "running for %lu ms\n", runtime_ms);
+
+ for_each_cpu(cpu, test_cpus) {
+ thread_data = kunit_kmalloc(test, sizeof(*thread_data), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, thread_data);
+ thread_data->test_data = test_data;
+ thread_data->num = cpu;
+
+ thread = kthread_run_on_cpu(prbtest_writer, thread_data, cpu,
+ "prbtest writer %u");
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, thread);
+ prbtest_add_kthread_cleanup(test, thread);
+ }
+
+ kunit_info(test, "starting test\n");
+
+ set_cpus_allowed_ptr(current, cpumask_of(reader_cpu));
+ prbtest_reader(test_data, runtime_ms);
+
+ kunit_info(test, "completed test\n");
+}
+
+static struct kunit_case prb_test_cases[] = {
+ KUNIT_CASE_SLOW(test_readerwriter),
+ {}
+};
+
+static struct kunit_suite prb_test_suite = {
+ .name = "printk-ringbuffer",
+ .test_cases = prb_test_cases,
+};
+kunit_test_suite(prb_test_suite);
+
+MODULE_IMPORT_NS("EXPORTED_FOR_KUNIT_TESTING");
+MODULE_AUTHOR("John Ogness <john.ogness@linutronix.de>");
+MODULE_DESCRIPTION("printk_ringbuffer KUnit test");
+MODULE_LICENSE("GPL");
diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
new file mode 100644
index 000000000000..32a28f563b13
--- /dev/null
+++ b/kernel/printk/printk_safe.c
@@ -0,0 +1,84 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * printk_safe.c - Safe printk for printk-deadlock-prone contexts
+ */
+
+#include <linux/preempt.h>
+#include <linux/kdb.h>
+#include <linux/smp.h>
+#include <linux/cpumask.h>
+#include <linux/printk.h>
+#include <linux/kprobes.h>
+
+#include "internal.h"
+
+/* Context where printk messages are never suppressed */
+static atomic_t force_con;
+
+void printk_force_console_enter(void)
+{
+ atomic_inc(&force_con);
+}
+
+void printk_force_console_exit(void)
+{
+ atomic_dec(&force_con);
+}
+
+bool is_printk_force_console(void)
+{
+ return atomic_read(&force_con);
+}
+
+static DEFINE_PER_CPU(int, printk_context);
+
+/* Can be preempted by NMI. */
+void __printk_safe_enter(void)
+{
+ this_cpu_inc(printk_context);
+}
+
+/* Can be preempted by NMI. */
+void __printk_safe_exit(void)
+{
+ this_cpu_dec(printk_context);
+}
+
+void __printk_deferred_enter(void)
+{
+ cant_migrate();
+ __printk_safe_enter();
+}
+
+void __printk_deferred_exit(void)
+{
+ cant_migrate();
+ __printk_safe_exit();
+}
+
+bool is_printk_legacy_deferred(void)
+{
+ /*
+ * The per-CPU variable @printk_context can be read safely in any
+ * context. CPU migration is always disabled when set.
+ *
+ * A context holding the printk_cpu_sync must not spin waiting for
+ * another CPU. For legacy printing, it could be the console_lock
+ * or the port lock.
+ */
+ return (force_legacy_kthread() ||
+ this_cpu_read(printk_context) ||
+ in_nmi() ||
+ is_printk_cpu_sync_owner());
+}
+
+asmlinkage int vprintk(const char *fmt, va_list args)
+{
+#ifdef CONFIG_KGDB_KDB
+ /* Allow to pass printk() to kdb but avoid a recursion. */
+ if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0))
+ return vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args);
+#endif
+ return vprintk_default(fmt, args);
+}
+EXPORT_SYMBOL(vprintk);
diff --git a/kernel/printk/sysctl.c b/kernel/printk/sysctl.c
new file mode 100644
index 000000000000..da77f3f5c1fe
--- /dev/null
+++ b/kernel/printk/sysctl.c
@@ -0,0 +1,84 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * sysctl.c: General linux system control interface
+ */
+
+#include <linux/sysctl.h>
+#include <linux/printk.h>
+#include <linux/capability.h>
+#include <linux/ratelimit.h>
+#include "internal.h"
+
+static const int ten_thousand = 10000;
+
+static int proc_dointvec_minmax_sysadmin(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+}
+
+static const struct ctl_table printk_sysctls[] = {
+ {
+ .procname = "printk",
+ .data = &console_loglevel,
+ .maxlen = 4*sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "printk_ratelimit",
+ .data = &printk_ratelimit_state.interval,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "printk_ratelimit_burst",
+ .data = &printk_ratelimit_state.burst,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "printk_delay",
+ .data = &printk_delay_msec,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = (void *)&ten_thousand,
+ },
+ {
+ .procname = "printk_devkmsg",
+ .data = devkmsg_log_str,
+ .maxlen = DEVKMSG_STR_MAX_SIZE,
+ .mode = 0644,
+ .proc_handler = devkmsg_sysctl_set_loglvl,
+ },
+ {
+ .procname = "dmesg_restrict",
+ .data = &dmesg_restrict,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax_sysadmin,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
+ .procname = "kptr_restrict",
+ .data = &kptr_restrict,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax_sysadmin,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_TWO,
+ },
+};
+
+void __init printk_sysctl_init(void)
+{
+ register_sysctl_init("kernel", printk_sysctls);
+}
diff --git a/kernel/profile.c b/kernel/profile.c
index a7bcd28d6e9f..1fcf1adcf4eb 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/kernel/profile.c
* Simple profiling. Manages a direct-mapped profile hit count buffer,
@@ -16,7 +17,7 @@
#include <linux/export.h>
#include <linux/profile.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/notifier.h>
#include <linux/mm.h>
#include <linux/cpumask.h>
@@ -25,6 +26,8 @@
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
+#include <linux/sched/stat.h>
+
#include <asm/sections.h>
#include <asm/irq_regs.h>
#include <asm/ptrace.h>
@@ -38,59 +41,41 @@ struct profile_hit {
#define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ)
static atomic_t *prof_buffer;
-static unsigned long prof_len, prof_shift;
+static unsigned long prof_len;
+static unsigned short int prof_shift;
int prof_on __read_mostly;
EXPORT_SYMBOL_GPL(prof_on);
-static cpumask_var_t prof_cpu_mask;
-#ifdef CONFIG_SMP
-static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
-static DEFINE_PER_CPU(int, cpu_profile_flip);
-static DEFINE_MUTEX(profile_flip_mutex);
-#endif /* CONFIG_SMP */
-
int profile_setup(char *str)
{
static const char schedstr[] = "schedule";
- static const char sleepstr[] = "sleep";
static const char kvmstr[] = "kvm";
+ const char *select = NULL;
int par;
- if (!strncmp(str, sleepstr, strlen(sleepstr))) {
-#ifdef CONFIG_SCHEDSTATS
- prof_on = SLEEP_PROFILING;
- if (str[strlen(sleepstr)] == ',')
- str += strlen(sleepstr) + 1;
- if (get_option(&str, &par))
- prof_shift = par;
- pr_info("kernel sleep profiling enabled (shift: %ld)\n",
- prof_shift);
-#else
- pr_warn("kernel sleep profiling requires CONFIG_SCHEDSTATS\n");
-#endif /* CONFIG_SCHEDSTATS */
- } else if (!strncmp(str, schedstr, strlen(schedstr))) {
+ if (!strncmp(str, schedstr, strlen(schedstr))) {
prof_on = SCHED_PROFILING;
- if (str[strlen(schedstr)] == ',')
- str += strlen(schedstr) + 1;
- if (get_option(&str, &par))
- prof_shift = par;
- pr_info("kernel schedule profiling enabled (shift: %ld)\n",
- prof_shift);
+ select = schedstr;
} else if (!strncmp(str, kvmstr, strlen(kvmstr))) {
prof_on = KVM_PROFILING;
- if (str[strlen(kvmstr)] == ',')
- str += strlen(kvmstr) + 1;
- if (get_option(&str, &par))
- prof_shift = par;
- pr_info("kernel KVM profiling enabled (shift: %ld)\n",
- prof_shift);
+ select = kvmstr;
} else if (get_option(&str, &par)) {
- prof_shift = par;
+ prof_shift = clamp(par, 0, BITS_PER_LONG - 1);
prof_on = CPU_PROFILING;
- pr_info("kernel profiling enabled (shift: %ld)\n",
+ pr_info("kernel profiling enabled (shift: %u)\n",
prof_shift);
}
+
+ if (select) {
+ if (str[strlen(select)] == ',')
+ str += strlen(select) + 1;
+ if (get_option(&str, &par))
+ prof_shift = clamp(par, 0, BITS_PER_LONG - 1);
+ pr_info("kernel %s profiling enabled (shift: %u)\n",
+ select, prof_shift);
+ }
+
return 1;
}
__setup("profile=", profile_setup);
@@ -104,12 +89,14 @@ int __ref profile_init(void)
/* only text is profiled */
prof_len = (_etext - _stext) >> prof_shift;
- buffer_bytes = prof_len*sizeof(atomic_t);
- if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL))
- return -ENOMEM;
+ if (!prof_len) {
+ pr_warn("profiling shift: %u too large\n", prof_shift);
+ prof_on = 0;
+ return -EINVAL;
+ }
- cpumask_copy(prof_cpu_mask, cpu_possible_mask);
+ buffer_bytes = prof_len*sizeof(atomic_t);
prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL|__GFP_NOWARN);
if (prof_buffer)
@@ -124,279 +111,16 @@ int __ref profile_init(void)
if (prof_buffer)
return 0;
- free_cpumask_var(prof_cpu_mask);
return -ENOMEM;
}
-/* Profile event notifications */
-
-static BLOCKING_NOTIFIER_HEAD(task_exit_notifier);
-static ATOMIC_NOTIFIER_HEAD(task_free_notifier);
-static BLOCKING_NOTIFIER_HEAD(munmap_notifier);
-
-void profile_task_exit(struct task_struct *task)
-{
- blocking_notifier_call_chain(&task_exit_notifier, 0, task);
-}
-
-int profile_handoff_task(struct task_struct *task)
-{
- int ret;
- ret = atomic_notifier_call_chain(&task_free_notifier, 0, task);
- return (ret == NOTIFY_OK) ? 1 : 0;
-}
-
-void profile_munmap(unsigned long addr)
-{
- blocking_notifier_call_chain(&munmap_notifier, 0, (void *)addr);
-}
-
-int task_handoff_register(struct notifier_block *n)
-{
- return atomic_notifier_chain_register(&task_free_notifier, n);
-}
-EXPORT_SYMBOL_GPL(task_handoff_register);
-
-int task_handoff_unregister(struct notifier_block *n)
-{
- return atomic_notifier_chain_unregister(&task_free_notifier, n);
-}
-EXPORT_SYMBOL_GPL(task_handoff_unregister);
-
-int profile_event_register(enum profile_type type, struct notifier_block *n)
-{
- int err = -EINVAL;
-
- switch (type) {
- case PROFILE_TASK_EXIT:
- err = blocking_notifier_chain_register(
- &task_exit_notifier, n);
- break;
- case PROFILE_MUNMAP:
- err = blocking_notifier_chain_register(
- &munmap_notifier, n);
- break;
- }
-
- return err;
-}
-EXPORT_SYMBOL_GPL(profile_event_register);
-
-int profile_event_unregister(enum profile_type type, struct notifier_block *n)
-{
- int err = -EINVAL;
-
- switch (type) {
- case PROFILE_TASK_EXIT:
- err = blocking_notifier_chain_unregister(
- &task_exit_notifier, n);
- break;
- case PROFILE_MUNMAP:
- err = blocking_notifier_chain_unregister(
- &munmap_notifier, n);
- break;
- }
-
- return err;
-}
-EXPORT_SYMBOL_GPL(profile_event_unregister);
-
-#ifdef CONFIG_SMP
-/*
- * Each cpu has a pair of open-addressed hashtables for pending
- * profile hits. read_profile() IPI's all cpus to request them
- * to flip buffers and flushes their contents to prof_buffer itself.
- * Flip requests are serialized by the profile_flip_mutex. The sole
- * use of having a second hashtable is for avoiding cacheline
- * contention that would otherwise happen during flushes of pending
- * profile hits required for the accuracy of reported profile hits
- * and so resurrect the interrupt livelock issue.
- *
- * The open-addressed hashtables are indexed by profile buffer slot
- * and hold the number of pending hits to that profile buffer slot on
- * a cpu in an entry. When the hashtable overflows, all pending hits
- * are accounted to their corresponding profile buffer slots with
- * atomic_add() and the hashtable emptied. As numerous pending hits
- * may be accounted to a profile buffer slot in a hashtable entry,
- * this amortizes a number of atomic profile buffer increments likely
- * to be far larger than the number of entries in the hashtable,
- * particularly given that the number of distinct profile buffer
- * positions to which hits are accounted during short intervals (e.g.
- * several seconds) is usually very small. Exclusion from buffer
- * flipping is provided by interrupt disablement (note that for
- * SCHED_PROFILING or SLEEP_PROFILING profile_hit() may be called from
- * process context).
- * The hash function is meant to be lightweight as opposed to strong,
- * and was vaguely inspired by ppc64 firmware-supported inverted
- * pagetable hash functions, but uses a full hashtable full of finite
- * collision chains, not just pairs of them.
- *
- * -- nyc
- */
-static void __profile_flip_buffers(void *unused)
-{
- int cpu = smp_processor_id();
-
- per_cpu(cpu_profile_flip, cpu) = !per_cpu(cpu_profile_flip, cpu);
-}
-
-static void profile_flip_buffers(void)
-{
- int i, j, cpu;
-
- mutex_lock(&profile_flip_mutex);
- j = per_cpu(cpu_profile_flip, get_cpu());
- put_cpu();
- on_each_cpu(__profile_flip_buffers, NULL, 1);
- for_each_online_cpu(cpu) {
- struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[j];
- for (i = 0; i < NR_PROFILE_HIT; ++i) {
- if (!hits[i].hits) {
- if (hits[i].pc)
- hits[i].pc = 0;
- continue;
- }
- atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
- hits[i].hits = hits[i].pc = 0;
- }
- }
- mutex_unlock(&profile_flip_mutex);
-}
-
-static void profile_discard_flip_buffers(void)
-{
- int i, cpu;
-
- mutex_lock(&profile_flip_mutex);
- i = per_cpu(cpu_profile_flip, get_cpu());
- put_cpu();
- on_each_cpu(__profile_flip_buffers, NULL, 1);
- for_each_online_cpu(cpu) {
- struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i];
- memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit));
- }
- mutex_unlock(&profile_flip_mutex);
-}
-
-static void do_profile_hits(int type, void *__pc, unsigned int nr_hits)
-{
- unsigned long primary, secondary, flags, pc = (unsigned long)__pc;
- int i, j, cpu;
- struct profile_hit *hits;
-
- pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1);
- i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
- secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
- cpu = get_cpu();
- hits = per_cpu(cpu_profile_hits, cpu)[per_cpu(cpu_profile_flip, cpu)];
- if (!hits) {
- put_cpu();
- return;
- }
- /*
- * We buffer the global profiler buffer into a per-CPU
- * queue and thus reduce the number of global (and possibly
- * NUMA-alien) accesses. The write-queue is self-coalescing:
- */
- local_irq_save(flags);
- do {
- for (j = 0; j < PROFILE_GRPSZ; ++j) {
- if (hits[i + j].pc == pc) {
- hits[i + j].hits += nr_hits;
- goto out;
- } else if (!hits[i + j].hits) {
- hits[i + j].pc = pc;
- hits[i + j].hits = nr_hits;
- goto out;
- }
- }
- i = (i + secondary) & (NR_PROFILE_HIT - 1);
- } while (i != primary);
-
- /*
- * Add the current hit(s) and flush the write-queue out
- * to the global buffer:
- */
- atomic_add(nr_hits, &prof_buffer[pc]);
- for (i = 0; i < NR_PROFILE_HIT; ++i) {
- atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
- hits[i].pc = hits[i].hits = 0;
- }
-out:
- local_irq_restore(flags);
- put_cpu();
-}
-
-static int profile_cpu_callback(struct notifier_block *info,
- unsigned long action, void *__cpu)
-{
- int node, cpu = (unsigned long)__cpu;
- struct page *page;
-
- switch (action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
- node = cpu_to_mem(cpu);
- per_cpu(cpu_profile_flip, cpu) = 0;
- if (!per_cpu(cpu_profile_hits, cpu)[1]) {
- page = alloc_pages_exact_node(node,
- GFP_KERNEL | __GFP_ZERO,
- 0);
- if (!page)
- return notifier_from_errno(-ENOMEM);
- per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
- }
- if (!per_cpu(cpu_profile_hits, cpu)[0]) {
- page = alloc_pages_exact_node(node,
- GFP_KERNEL | __GFP_ZERO,
- 0);
- if (!page)
- goto out_free;
- per_cpu(cpu_profile_hits, cpu)[0] = page_address(page);
- }
- break;
-out_free:
- page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
- per_cpu(cpu_profile_hits, cpu)[1] = NULL;
- __free_page(page);
- return notifier_from_errno(-ENOMEM);
- case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- if (prof_cpu_mask != NULL)
- cpumask_set_cpu(cpu, prof_cpu_mask);
- break;
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- if (prof_cpu_mask != NULL)
- cpumask_clear_cpu(cpu, prof_cpu_mask);
- if (per_cpu(cpu_profile_hits, cpu)[0]) {
- page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]);
- per_cpu(cpu_profile_hits, cpu)[0] = NULL;
- __free_page(page);
- }
- if (per_cpu(cpu_profile_hits, cpu)[1]) {
- page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
- per_cpu(cpu_profile_hits, cpu)[1] = NULL;
- __free_page(page);
- }
- break;
- }
- return NOTIFY_OK;
-}
-#else /* !CONFIG_SMP */
-#define profile_flip_buffers() do { } while (0)
-#define profile_discard_flip_buffers() do { } while (0)
-#define profile_cpu_callback NULL
-
static void do_profile_hits(int type, void *__pc, unsigned int nr_hits)
{
unsigned long pc;
pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift;
- atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]);
+ if (pc < prof_len)
+ atomic_add(nr_hits, &prof_buffer[pc]);
}
-#endif /* !CONFIG_SMP */
void profile_hits(int type, void *__pc, unsigned int nr_hits)
{
@@ -410,58 +134,15 @@ void profile_tick(int type)
{
struct pt_regs *regs = get_irq_regs();
- if (!user_mode(regs) && prof_cpu_mask != NULL &&
- cpumask_test_cpu(smp_processor_id(), prof_cpu_mask))
+ /* This is the old kernel-only legacy profiling */
+ if (!user_mode(regs))
profile_hit(type, (void *)profile_pc(regs));
}
#ifdef CONFIG_PROC_FS
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
-#include <asm/uaccess.h>
-
-static int prof_cpu_mask_proc_show(struct seq_file *m, void *v)
-{
- seq_printf(m, "%*pb\n", cpumask_pr_args(prof_cpu_mask));
- return 0;
-}
-
-static int prof_cpu_mask_proc_open(struct inode *inode, struct file *file)
-{
- return single_open(file, prof_cpu_mask_proc_show, NULL);
-}
-
-static ssize_t prof_cpu_mask_proc_write(struct file *file,
- const char __user *buffer, size_t count, loff_t *pos)
-{
- cpumask_var_t new_value;
- int err;
-
- if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
- return -ENOMEM;
-
- err = cpumask_parse_user(buffer, count, new_value);
- if (!err) {
- cpumask_copy(prof_cpu_mask, new_value);
- err = count;
- }
- free_cpumask_var(new_value);
- return err;
-}
-
-static const struct file_operations prof_cpu_mask_proc_fops = {
- .open = prof_cpu_mask_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
- .write = prof_cpu_mask_proc_write,
-};
-
-void create_prof_cpu_mask(void)
-{
- /* create /proc/irq/prof_cpu_mask */
- proc_create("irq/prof_cpu_mask", 0600, NULL, &prof_cpu_mask_proc_fops);
-}
+#include <linux/uaccess.h>
/*
* This function accesses profiling information. The returned data is
@@ -475,9 +156,8 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos)
unsigned long p = *ppos;
ssize_t read;
char *pnt;
- unsigned int sample_step = 1 << prof_shift;
+ unsigned long sample_step = 1UL << prof_shift;
- profile_flip_buffers();
if (p >= (prof_len+1)*sizeof(unsigned int))
return 0;
if (count > (prof_len+1)*sizeof(unsigned int) - p)
@@ -497,6 +177,12 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos)
return read;
}
+/* default is to not implement this call */
+int __weak setup_profiling_timer(unsigned mult)
+{
+ return -EINVAL;
+}
+
/*
* Writing to /proc/profile resets the counters
*
@@ -507,8 +193,6 @@ static ssize_t write_profile(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
#ifdef CONFIG_SMP
- extern int setup_profiling_timer(unsigned int multiplier);
-
if (count == sizeof(int)) {
unsigned int multiplier;
@@ -519,94 +203,27 @@ static ssize_t write_profile(struct file *file, const char __user *buf,
return -EINVAL;
}
#endif
- profile_discard_flip_buffers();
memset(prof_buffer, 0, prof_len * sizeof(atomic_t));
return count;
}
-static const struct file_operations proc_profile_operations = {
- .read = read_profile,
- .write = write_profile,
- .llseek = default_llseek,
+static const struct proc_ops profile_proc_ops = {
+ .proc_read = read_profile,
+ .proc_write = write_profile,
+ .proc_lseek = default_llseek,
};
-#ifdef CONFIG_SMP
-static void profile_nop(void *unused)
-{
-}
-
-static int create_hash_tables(void)
-{
- int cpu;
-
- for_each_online_cpu(cpu) {
- int node = cpu_to_mem(cpu);
- struct page *page;
-
- page = alloc_pages_exact_node(node,
- GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
- 0);
- if (!page)
- goto out_cleanup;
- per_cpu(cpu_profile_hits, cpu)[1]
- = (struct profile_hit *)page_address(page);
- page = alloc_pages_exact_node(node,
- GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
- 0);
- if (!page)
- goto out_cleanup;
- per_cpu(cpu_profile_hits, cpu)[0]
- = (struct profile_hit *)page_address(page);
- }
- return 0;
-out_cleanup:
- prof_on = 0;
- smp_mb();
- on_each_cpu(profile_nop, NULL, 1);
- for_each_online_cpu(cpu) {
- struct page *page;
-
- if (per_cpu(cpu_profile_hits, cpu)[0]) {
- page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]);
- per_cpu(cpu_profile_hits, cpu)[0] = NULL;
- __free_page(page);
- }
- if (per_cpu(cpu_profile_hits, cpu)[1]) {
- page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
- per_cpu(cpu_profile_hits, cpu)[1] = NULL;
- __free_page(page);
- }
- }
- return -1;
-}
-#else
-#define create_hash_tables() ({ 0; })
-#endif
-
-int __ref create_proc_profile(void) /* false positive from hotcpu_notifier */
+int __ref create_proc_profile(void)
{
struct proc_dir_entry *entry;
int err = 0;
if (!prof_on)
return 0;
-
- cpu_notifier_register_begin();
-
- if (create_hash_tables()) {
- err = -ENOMEM;
- goto out;
- }
-
entry = proc_create("profile", S_IWUSR | S_IRUGO,
- NULL, &proc_profile_operations);
- if (!entry)
- goto out;
- proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t));
- __hotcpu_notifier(profile_cpu_callback, 0);
-
-out:
- cpu_notifier_register_done();
+ NULL, &profile_proc_ops);
+ if (entry)
+ proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t));
return err;
}
subsys_initcall(create_proc_profile);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index c8e0e050a36a..392ec2f75f01 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/kernel/ptrace.c
*
@@ -10,6 +11,9 @@
#include <linux/capability.h>
#include <linux/export.h>
#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/coredump.h>
+#include <linux/sched/task.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/highmem.h>
@@ -26,7 +30,50 @@
#include <linux/hw_breakpoint.h>
#include <linux/cn_proc.h>
#include <linux/compat.h>
+#include <linux/sched/signal.h>
+#include <linux/minmax.h>
+#include <linux/syscall_user_dispatch.h>
+#include <asm/syscall.h> /* for syscall_get_* */
+
+/*
+ * Access another process' address space via ptrace.
+ * Source/target buffer must be kernel space,
+ * Do not walk the page table directly, use get_user_pages
+ */
+int ptrace_access_vm(struct task_struct *tsk, unsigned long addr,
+ void *buf, int len, unsigned int gup_flags)
+{
+ struct mm_struct *mm;
+ int ret;
+
+ mm = get_task_mm(tsk);
+ if (!mm)
+ return 0;
+
+ if (!tsk->ptrace ||
+ (current != tsk->parent) ||
+ ((get_dumpable(mm) != SUID_DUMP_USER) &&
+ !ptracer_capable(tsk, mm->user_ns))) {
+ mmput(mm);
+ return 0;
+ }
+
+ ret = access_remote_vm(mm, addr, buf, len, gup_flags);
+ mmput(mm);
+
+ return ret;
+}
+
+
+void __ptrace_link(struct task_struct *child, struct task_struct *new_parent,
+ const struct cred *ptracer_cred)
+{
+ BUG_ON(!list_empty(&child->ptrace_entry));
+ list_add(&child->ptrace_entry, &new_parent->ptraced);
+ child->parent = new_parent;
+ child->ptracer_cred = get_cred(ptracer_cred);
+}
/*
* ptrace a task: make the debugger its new parent and
@@ -34,11 +81,9 @@
*
* Must be called with the tasklist lock write-held.
*/
-void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
+static void ptrace_link(struct task_struct *child, struct task_struct *new_parent)
{
- BUG_ON(!list_empty(&child->ptrace_entry));
- list_add(&child->ptrace_entry, &new_parent->ptraced);
- child->parent = new_parent;
+ __ptrace_link(child, new_parent, current_cred());
}
/**
@@ -71,14 +116,22 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
*/
void __ptrace_unlink(struct task_struct *child)
{
+ const struct cred *old_cred;
BUG_ON(!child->ptrace);
- child->ptrace = 0;
+ clear_task_syscall_work(child, SYSCALL_TRACE);
+#if defined(CONFIG_GENERIC_ENTRY) || defined(TIF_SYSCALL_EMU)
+ clear_task_syscall_work(child, SYSCALL_EMU);
+#endif
+
child->parent = child->real_parent;
list_del_init(&child->ptrace_entry);
+ old_cred = child->ptracer_cred;
+ child->ptracer_cred = NULL;
+ put_cred(old_cred);
spin_lock(&child->sighand->siglock);
-
+ child->ptrace = 0;
/*
* Clear all pending traps and TRAPPING. TRAPPING should be
* cleared regardless of JOBCTL_STOP_PENDING. Do it explicitly.
@@ -92,20 +145,9 @@ void __ptrace_unlink(struct task_struct *child)
*/
if (!(child->flags & PF_EXITING) &&
(child->signal->flags & SIGNAL_STOP_STOPPED ||
- child->signal->group_stop_count)) {
+ child->signal->group_stop_count))
child->jobctl |= JOBCTL_STOP_PENDING;
- /*
- * This is only possible if this thread was cloned by the
- * traced task running in the stopped group, set the signal
- * for the future reports.
- * FIXME: we should change ptrace_init_task() to handle this
- * case.
- */
- if (!(child->jobctl & JOBCTL_STOP_SIGMASK))
- child->jobctl |= SIGSTOP;
- }
-
/*
* If transition to TASK_STOPPED is pending or in TASK_TRACED, kick
* @child in the butt. Note that @resume should be used iff @child
@@ -118,7 +160,27 @@ void __ptrace_unlink(struct task_struct *child)
spin_unlock(&child->sighand->siglock);
}
-/* Ensure that nothing can wake it up, even SIGKILL */
+static bool looks_like_a_spurious_pid(struct task_struct *task)
+{
+ if (task->exit_code != ((PTRACE_EVENT_EXEC << 8) | SIGTRAP))
+ return false;
+
+ if (task_pid_vnr(task) == task->ptrace_message)
+ return false;
+ /*
+ * The tracee changed its pid but the PTRACE_EVENT_EXEC event
+ * was not wait()'ed, most probably debugger targets the old
+ * leader which was destroyed in de_thread().
+ */
+ return true;
+}
+
+/*
+ * Ensure that nothing can wake it up, even SIGKILL
+ *
+ * A task is switched to this state while a ptrace operation is in progress;
+ * such that the ptrace operation is uninterruptible.
+ */
static bool ptrace_freeze_traced(struct task_struct *task)
{
bool ret = false;
@@ -128,8 +190,9 @@ static bool ptrace_freeze_traced(struct task_struct *task)
return ret;
spin_lock_irq(&task->sighand->siglock);
- if (task_is_traced(task) && !__fatal_signal_pending(task)) {
- task->state = __TASK_TRACED;
+ if (task_is_traced(task) && !looks_like_a_spurious_pid(task) &&
+ !__fatal_signal_pending(task)) {
+ task->jobctl |= JOBCTL_PTRACE_FROZEN;
ret = true;
}
spin_unlock_irq(&task->sighand->siglock);
@@ -139,17 +202,21 @@ static bool ptrace_freeze_traced(struct task_struct *task)
static void ptrace_unfreeze_traced(struct task_struct *task)
{
- if (task->state != __TASK_TRACED)
- return;
-
- WARN_ON(!task->ptrace || task->parent != current);
+ unsigned long flags;
- spin_lock_irq(&task->sighand->siglock);
- if (__fatal_signal_pending(task))
- wake_up_state(task, __TASK_TRACED);
- else
- task->state = TASK_TRACED;
- spin_unlock_irq(&task->sighand->siglock);
+ /*
+ * The child may be awake and may have cleared
+ * JOBCTL_PTRACE_FROZEN (see ptrace_resume). The child will
+ * not set JOBCTL_PTRACE_FROZEN or enter __TASK_TRACED anew.
+ */
+ if (lock_task_sighand(task, &flags)) {
+ task->jobctl &= ~JOBCTL_PTRACE_FROZEN;
+ if (__fatal_signal_pending(task)) {
+ task->jobctl &= ~JOBCTL_TRACED;
+ wake_up_state(task, __TASK_TRACED);
+ }
+ unlock_task_sighand(task, &flags);
+ }
}
/**
@@ -182,7 +249,6 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
*/
read_lock(&tasklist_lock);
if (child->ptrace && child->parent == current) {
- WARN_ON(child->state == __TASK_TRACED);
/*
* child->sighand can't be NULL, release_task()
* does ptrace_unlink() before __exit_signal().
@@ -192,33 +258,32 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
}
read_unlock(&tasklist_lock);
- if (!ret && !ignore_state) {
- if (!wait_task_inactive(child, __TASK_TRACED)) {
- /*
- * This can only happen if may_ptrace_stop() fails and
- * ptrace_stop() changes ->state back to TASK_RUNNING,
- * so we should not worry about leaking __TASK_TRACED.
- */
- WARN_ON(child->state == __TASK_TRACED);
- ret = -ESRCH;
- }
- }
+ if (!ret && !ignore_state &&
+ WARN_ON_ONCE(!wait_task_inactive(child, __TASK_TRACED|TASK_FROZEN)))
+ ret = -ESRCH;
return ret;
}
-static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode)
+static bool ptrace_has_cap(struct user_namespace *ns, unsigned int mode)
{
if (mode & PTRACE_MODE_NOAUDIT)
- return has_ns_capability_noaudit(current, ns, CAP_SYS_PTRACE);
- else
- return has_ns_capability(current, ns, CAP_SYS_PTRACE);
+ return ns_capable_noaudit(ns, CAP_SYS_PTRACE);
+ return ns_capable(ns, CAP_SYS_PTRACE);
}
/* Returns 0 on success, -errno on denial. */
static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
{
const struct cred *cred = current_cred(), *tcred;
+ struct mm_struct *mm;
+ kuid_t caller_uid;
+ kgid_t caller_gid;
+
+ if (!(mode & PTRACE_MODE_FSCREDS) == !(mode & PTRACE_MODE_REALCREDS)) {
+ WARN(1, "denying ptrace access check without PTRACE_MODE_*CREDS\n");
+ return -EPERM;
+ }
/* May we inspect the given task?
* This check is used both for attaching with ptrace
@@ -228,18 +293,33 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
* because setting up the necessary parent/child relationship
* or halting the specified task is impossible.
*/
- int dumpable = 0;
+
/* Don't let security modules deny introspection */
if (same_thread_group(task, current))
return 0;
rcu_read_lock();
+ if (mode & PTRACE_MODE_FSCREDS) {
+ caller_uid = cred->fsuid;
+ caller_gid = cred->fsgid;
+ } else {
+ /*
+ * Using the euid would make more sense here, but something
+ * in userland might rely on the old behavior, and this
+ * shouldn't be a security problem since
+ * PTRACE_MODE_REALCREDS implies that the caller explicitly
+ * used a syscall that requests access to another process
+ * (and not a filesystem syscall to procfs).
+ */
+ caller_uid = cred->uid;
+ caller_gid = cred->gid;
+ }
tcred = __task_cred(task);
- if (uid_eq(cred->uid, tcred->euid) &&
- uid_eq(cred->uid, tcred->suid) &&
- uid_eq(cred->uid, tcred->uid) &&
- gid_eq(cred->gid, tcred->egid) &&
- gid_eq(cred->gid, tcred->sgid) &&
- gid_eq(cred->gid, tcred->gid))
+ if (uid_eq(caller_uid, tcred->euid) &&
+ uid_eq(caller_uid, tcred->suid) &&
+ uid_eq(caller_uid, tcred->uid) &&
+ gid_eq(caller_gid, tcred->egid) &&
+ gid_eq(caller_gid, tcred->sgid) &&
+ gid_eq(caller_gid, tcred->gid))
goto ok;
if (ptrace_has_cap(tcred->user_ns, mode))
goto ok;
@@ -247,16 +327,21 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
return -EPERM;
ok:
rcu_read_unlock();
+ /*
+ * If a task drops privileges and becomes nondumpable (through a syscall
+ * like setresuid()) while we are trying to access it, we must ensure
+ * that the dumpability is read after the credentials; otherwise,
+ * we may be able to attach to a task that we shouldn't be able to
+ * attach to (as if the task had dropped privileges without becoming
+ * nondumpable).
+ * Pairs with a write barrier in commit_creds().
+ */
smp_rmb();
- if (task->mm)
- dumpable = get_dumpable(task->mm);
- rcu_read_lock();
- if (dumpable != SUID_DUMP_USER &&
- !ptrace_has_cap(__task_cred(task)->user_ns, mode)) {
- rcu_read_unlock();
- return -EPERM;
- }
- rcu_read_unlock();
+ mm = task->mm;
+ if (mm &&
+ ((get_dumpable(mm) != SUID_DUMP_USER) &&
+ !ptrace_has_cap(mm->user_ns, mode)))
+ return -EPERM;
return security_ptrace_access_check(task, mode);
}
@@ -270,6 +355,57 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
return !err;
}
+static int check_ptrace_options(unsigned long data)
+{
+ if (data & ~(unsigned long)PTRACE_O_MASK)
+ return -EINVAL;
+
+ if (unlikely(data & PTRACE_O_SUSPEND_SECCOMP)) {
+ if (!IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) ||
+ !IS_ENABLED(CONFIG_SECCOMP))
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (seccomp_mode(&current->seccomp) != SECCOMP_MODE_DISABLED ||
+ current->ptrace & PT_SUSPEND_SECCOMP)
+ return -EPERM;
+ }
+ return 0;
+}
+
+static inline void ptrace_set_stopped(struct task_struct *task, bool seize)
+{
+ guard(spinlock)(&task->sighand->siglock);
+
+ /* SEIZE doesn't trap tracee on attach */
+ if (!seize)
+ send_signal_locked(SIGSTOP, SEND_SIG_PRIV, task, PIDTYPE_PID);
+ /*
+ * If the task is already STOPPED, set JOBCTL_TRAP_STOP and
+ * TRAPPING, and kick it so that it transits to TRACED. TRAPPING
+ * will be cleared if the child completes the transition or any
+ * event which clears the group stop states happens. We'll wait
+ * for the transition to complete before returning from this
+ * function.
+ *
+ * This hides STOPPED -> RUNNING -> TRACED transition from the
+ * attaching thread but a different thread in the same group can
+ * still observe the transient RUNNING state. IOW, if another
+ * thread's WNOHANG wait(2) on the stopped tracee races against
+ * ATTACH, the wait(2) may fail due to the transient RUNNING.
+ *
+ * The following task_is_stopped() test is safe as both transitions
+ * in and out of STOPPED are protected by siglock.
+ */
+ if (task_is_stopped(task) &&
+ task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING)) {
+ task->jobctl &= ~JOBCTL_STOPPED;
+ signal_wake_up_state(task, __TASK_STOPPED);
+ }
+}
+
static int ptrace_attach(struct task_struct *task, long request,
unsigned long addr,
unsigned long flags)
@@ -277,12 +413,20 @@ static int ptrace_attach(struct task_struct *task, long request,
bool seize = (request == PTRACE_SEIZE);
int retval;
- retval = -EIO;
if (seize) {
if (addr != 0)
- goto out;
+ return -EIO;
+ /*
+ * This duplicates the check in check_ptrace_options() because
+ * ptrace_attach() and ptrace_setoptions() have historically
+ * used different error codes for unknown ptrace options.
+ */
if (flags & ~(unsigned long)PTRACE_O_MASK)
- goto out;
+ return -EIO;
+
+ retval = check_ptrace_options(flags);
+ if (retval)
+ return retval;
flags = PT_PTRACED | PT_SEIZED | (flags << PT_OPT_FLAG_SHIFT);
} else {
flags = PT_PTRACED;
@@ -290,86 +434,48 @@ static int ptrace_attach(struct task_struct *task, long request,
audit_ptrace(task);
- retval = -EPERM;
if (unlikely(task->flags & PF_KTHREAD))
- goto out;
+ return -EPERM;
if (same_thread_group(task, current))
- goto out;
+ return -EPERM;
/*
* Protect exec's credential calculations against our interference;
* SUID, SGID and LSM creds get determined differently
* under ptrace.
*/
- retval = -ERESTARTNOINTR;
- if (mutex_lock_interruptible(&task->signal->cred_guard_mutex))
- goto out;
+ scoped_cond_guard (mutex_intr, return -ERESTARTNOINTR,
+ &task->signal->cred_guard_mutex) {
- task_lock(task);
- retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH);
- task_unlock(task);
- if (retval)
- goto unlock_creds;
-
- write_lock_irq(&tasklist_lock);
- retval = -EPERM;
- if (unlikely(task->exit_state))
- goto unlock_tasklist;
- if (task->ptrace)
- goto unlock_tasklist;
-
- if (seize)
- flags |= PT_SEIZED;
- rcu_read_lock();
- if (ns_capable(__task_cred(task)->user_ns, CAP_SYS_PTRACE))
- flags |= PT_PTRACE_CAP;
- rcu_read_unlock();
- task->ptrace = flags;
+ scoped_guard (task_lock, task) {
+ retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS);
+ if (retval)
+ return retval;
+ }
- __ptrace_link(task, current);
+ scoped_guard (write_lock_irq, &tasklist_lock) {
+ if (unlikely(task->exit_state))
+ return -EPERM;
+ if (task->ptrace)
+ return -EPERM;
- /* SEIZE doesn't trap tracee on attach */
- if (!seize)
- send_sig_info(SIGSTOP, SEND_SIG_FORCED, task);
-
- spin_lock(&task->sighand->siglock);
+ task->ptrace = flags;
+ ptrace_link(task, current);
+ ptrace_set_stopped(task, seize);
+ }
+ }
/*
- * If the task is already STOPPED, set JOBCTL_TRAP_STOP and
- * TRAPPING, and kick it so that it transits to TRACED. TRAPPING
- * will be cleared if the child completes the transition or any
- * event which clears the group stop states happens. We'll wait
- * for the transition to complete before returning from this
- * function.
- *
- * This hides STOPPED -> RUNNING -> TRACED transition from the
- * attaching thread but a different thread in the same group can
- * still observe the transient RUNNING state. IOW, if another
- * thread's WNOHANG wait(2) on the stopped tracee races against
- * ATTACH, the wait(2) may fail due to the transient RUNNING.
- *
- * The following task_is_stopped() test is safe as both transitions
- * in and out of STOPPED are protected by siglock.
+ * We do not bother to change retval or clear JOBCTL_TRAPPING
+ * if wait_on_bit() was interrupted by SIGKILL. The tracer will
+ * not return to user-mode, it will exit and clear this bit in
+ * __ptrace_unlink() if it wasn't already cleared by the tracee;
+ * and until then nobody can ptrace this task.
*/
- if (task_is_stopped(task) &&
- task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING))
- signal_wake_up_state(task, __TASK_STOPPED);
-
- spin_unlock(&task->sighand->siglock);
+ wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, TASK_KILLABLE);
+ proc_ptrace_connector(task, PTRACE_ATTACH);
- retval = 0;
-unlock_tasklist:
- write_unlock_irq(&tasklist_lock);
-unlock_creds:
- mutex_unlock(&task->signal->cred_guard_mutex);
-out:
- if (!retval) {
- wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT,
- TASK_UNINTERRUPTIBLE);
- proc_ptrace_connector(task, PTRACE_ATTACH);
- }
-
- return retval;
+ return 0;
}
/**
@@ -393,7 +499,7 @@ static int ptrace_traceme(void)
*/
if (!ret && !(current->real_parent->flags & PF_EXITING)) {
current->ptrace = PT_PTRACED;
- __ptrace_link(current, current->real_parent);
+ ptrace_link(current, current->real_parent);
}
}
write_unlock_irq(&tasklist_lock);
@@ -461,7 +567,6 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
/* Architecture-specific hardware disable .. */
ptrace_disable(child);
- clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
write_lock_irq(&tasklist_lock);
/*
@@ -492,7 +597,7 @@ void exit_ptrace(struct task_struct *tracer, struct list_head *dead)
list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) {
if (unlikely(p->ptrace & PT_EXITKILL))
- send_sig_info(SIGKILL, SEND_SIG_FORCED, p);
+ send_sig_info(SIGKILL, SEND_SIG_PRIV, p);
if (__ptrace_detach(tracer, p))
list_add(&p->ptrace_entry, dead);
@@ -508,7 +613,8 @@ int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst
int this_len, retval;
this_len = (len > sizeof(buf)) ? sizeof(buf) : len;
- retval = access_process_vm(tsk, src, buf, this_len, 0);
+ retval = ptrace_access_vm(tsk, src, buf, this_len, FOLL_FORCE);
+
if (!retval) {
if (copied)
break;
@@ -535,7 +641,8 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds
this_len = (len > sizeof(buf)) ? sizeof(buf) : len;
if (copy_from_user(buf, src, this_len))
return -EFAULT;
- retval = access_process_vm(tsk, dst, buf, this_len, 1);
+ retval = ptrace_access_vm(tsk, dst, buf, this_len,
+ FOLL_FORCE | FOLL_WRITE);
if (!retval) {
if (copied)
break;
@@ -552,9 +659,11 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds
static int ptrace_setoptions(struct task_struct *child, unsigned long data)
{
unsigned flags;
+ int ret;
- if (data & ~(unsigned long)PTRACE_O_MASK)
- return -EINVAL;
+ ret = check_ptrace_options(data);
+ if (ret)
+ return ret;
/* Avoid intermediate state when all opts are cleared */
flags = child->ptrace;
@@ -565,7 +674,7 @@ static int ptrace_setoptions(struct task_struct *child, unsigned long data)
return 0;
}
-static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info)
+static int ptrace_getsiginfo(struct task_struct *child, kernel_siginfo_t *info)
{
unsigned long flags;
int error = -ESRCH;
@@ -573,7 +682,7 @@ static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info)
if (lock_task_sighand(child, &flags)) {
error = -EINVAL;
if (likely(child->last_siginfo != NULL)) {
- *info = *child->last_siginfo;
+ copy_siginfo(info, child->last_siginfo);
error = 0;
}
unlock_task_sighand(child, &flags);
@@ -581,7 +690,7 @@ static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info)
return error;
}
-static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info)
+static int ptrace_setsiginfo(struct task_struct *child, const kernel_siginfo_t *info)
{
unsigned long flags;
int error = -ESRCH;
@@ -589,7 +698,7 @@ static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info)
if (lock_task_sighand(child, &flags)) {
error = -EINVAL;
if (likely(child->last_siginfo != NULL)) {
- *child->last_siginfo = *info;
+ copy_siginfo(child->last_siginfo, info);
error = 0;
}
unlock_task_sighand(child, &flags);
@@ -617,33 +726,38 @@ static int ptrace_peek_siginfo(struct task_struct *child,
if (arg.nr < 0)
return -EINVAL;
+ /* Ensure arg.off fits in an unsigned long */
+ if (arg.off > ULONG_MAX)
+ return 0;
+
if (arg.flags & PTRACE_PEEKSIGINFO_SHARED)
pending = &child->signal->shared_pending;
else
pending = &child->pending;
for (i = 0; i < arg.nr; ) {
- siginfo_t info;
- s32 off = arg.off + i;
+ kernel_siginfo_t info;
+ unsigned long off = arg.off + i;
+ bool found = false;
spin_lock_irq(&child->sighand->siglock);
list_for_each_entry(q, &pending->list, list) {
if (!off--) {
+ found = true;
copy_siginfo(&info, &q->info);
break;
}
}
spin_unlock_irq(&child->sighand->siglock);
- if (off >= 0) /* beyond the end of the list */
+ if (!found) /* beyond the end of the list */
break;
#ifdef CONFIG_COMPAT
- if (unlikely(is_compat_task())) {
+ if (unlikely(in_compat_syscall())) {
compat_siginfo_t __user *uinfo = compat_ptr(data);
- if (copy_siginfo_to_user32(uinfo, &info) ||
- __put_user(info.si_code, &uinfo->si_code)) {
+ if (copy_siginfo_to_user32(uinfo, &info)) {
ret = -EFAULT;
break;
}
@@ -653,8 +767,7 @@ static int ptrace_peek_siginfo(struct task_struct *child,
{
siginfo_t __user *uinfo = (siginfo_t __user *) data;
- if (copy_siginfo_to_user(uinfo, &info) ||
- __put_user(info.si_code, &uinfo->si_code)) {
+ if (copy_siginfo_to_user(uinfo, &info)) {
ret = -EFAULT;
break;
}
@@ -675,12 +788,26 @@ static int ptrace_peek_siginfo(struct task_struct *child,
return ret;
}
-#ifdef PTRACE_SINGLESTEP
-#define is_singlestep(request) ((request) == PTRACE_SINGLESTEP)
-#else
-#define is_singlestep(request) 0
+#ifdef CONFIG_RSEQ
+static long ptrace_get_rseq_configuration(struct task_struct *task,
+ unsigned long size, void __user *data)
+{
+ struct ptrace_rseq_configuration conf = {
+ .rseq_abi_pointer = (u64)(uintptr_t)task->rseq.usrptr,
+ .rseq_abi_size = task->rseq.len,
+ .signature = task->rseq.sig,
+ .flags = 0,
+ };
+
+ size = min_t(unsigned long, size, sizeof(conf));
+ if (copy_to_user(data, &conf, size))
+ return -EFAULT;
+ return sizeof(conf);
+}
#endif
+#define is_singlestep(request) ((request) == PTRACE_SINGLESTEP)
+
#ifdef PTRACE_SINGLEBLOCK
#define is_singleblock(request) ((request) == PTRACE_SINGLEBLOCK)
#else
@@ -696,21 +823,19 @@ static int ptrace_peek_siginfo(struct task_struct *child,
static int ptrace_resume(struct task_struct *child, long request,
unsigned long data)
{
- bool need_siglock;
-
if (!valid_signal(data))
return -EIO;
if (request == PTRACE_SYSCALL)
- set_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
+ set_task_syscall_work(child, SYSCALL_TRACE);
else
- clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
+ clear_task_syscall_work(child, SYSCALL_TRACE);
-#ifdef TIF_SYSCALL_EMU
+#if defined(CONFIG_GENERIC_ENTRY) || defined(TIF_SYSCALL_EMU)
if (request == PTRACE_SYSEMU || request == PTRACE_SYSEMU_SINGLESTEP)
- set_tsk_thread_flag(child, TIF_SYSCALL_EMU);
+ set_task_syscall_work(child, SYSCALL_EMU);
else
- clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
+ clear_task_syscall_work(child, SYSCALL_EMU);
#endif
if (is_singleblock(request)) {
@@ -733,18 +858,12 @@ static int ptrace_resume(struct task_struct *child, long request,
* Note that we need siglock even if ->exit_code == data and/or this
* status was not reported yet, the new status must not be cleared by
* wait_task_stopped() after resume.
- *
- * If data == 0 we do not care if wait_task_stopped() reports the old
- * status and clears the code too; this can't race with the tracee, it
- * takes siglock after resume.
*/
- need_siglock = data && !thread_group_empty(current);
- if (need_siglock)
- spin_lock_irq(&child->sighand->siglock);
+ spin_lock_irq(&child->sighand->siglock);
child->exit_code = data;
+ child->jobctl &= ~JOBCTL_TRACED;
wake_up_state(child, __TASK_TRACED);
- if (need_siglock)
- spin_unlock_irq(&child->sighand->siglock);
+ spin_unlock_irq(&child->sighand->siglock);
return 0;
}
@@ -794,14 +913,232 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type,
* to ensure no machine forgets it.
*/
EXPORT_SYMBOL_GPL(task_user_regset_view);
-#endif
+
+static unsigned long
+ptrace_get_syscall_info_entry(struct task_struct *child, struct pt_regs *regs,
+ struct ptrace_syscall_info *info)
+{
+ unsigned long args[ARRAY_SIZE(info->entry.args)];
+ int i;
+
+ info->entry.nr = syscall_get_nr(child, regs);
+ syscall_get_arguments(child, regs, args);
+ for (i = 0; i < ARRAY_SIZE(args); i++)
+ info->entry.args[i] = args[i];
+
+ /* args is the last field in struct ptrace_syscall_info.entry */
+ return offsetofend(struct ptrace_syscall_info, entry.args);
+}
+
+static unsigned long
+ptrace_get_syscall_info_seccomp(struct task_struct *child, struct pt_regs *regs,
+ struct ptrace_syscall_info *info)
+{
+ /*
+ * As struct ptrace_syscall_info.entry is currently a subset
+ * of struct ptrace_syscall_info.seccomp, it makes sense to
+ * initialize that subset using ptrace_get_syscall_info_entry().
+ * This can be reconsidered in the future if these structures
+ * diverge significantly enough.
+ */
+ ptrace_get_syscall_info_entry(child, regs, info);
+ info->seccomp.ret_data = child->ptrace_message;
+
+ /*
+ * ret_data is the last non-reserved field
+ * in struct ptrace_syscall_info.seccomp
+ */
+ return offsetofend(struct ptrace_syscall_info, seccomp.ret_data);
+}
+
+static unsigned long
+ptrace_get_syscall_info_exit(struct task_struct *child, struct pt_regs *regs,
+ struct ptrace_syscall_info *info)
+{
+ info->exit.rval = syscall_get_error(child, regs);
+ info->exit.is_error = !!info->exit.rval;
+ if (!info->exit.is_error)
+ info->exit.rval = syscall_get_return_value(child, regs);
+
+ /* is_error is the last field in struct ptrace_syscall_info.exit */
+ return offsetofend(struct ptrace_syscall_info, exit.is_error);
+}
+
+static int
+ptrace_get_syscall_info_op(struct task_struct *child)
+{
+ /*
+ * This does not need lock_task_sighand() to access
+ * child->last_siginfo because ptrace_freeze_traced()
+ * called earlier by ptrace_check_attach() ensures that
+ * the tracee cannot go away and clear its last_siginfo.
+ */
+ switch (child->last_siginfo ? child->last_siginfo->si_code : 0) {
+ case SIGTRAP | 0x80:
+ switch (child->ptrace_message) {
+ case PTRACE_EVENTMSG_SYSCALL_ENTRY:
+ return PTRACE_SYSCALL_INFO_ENTRY;
+ case PTRACE_EVENTMSG_SYSCALL_EXIT:
+ return PTRACE_SYSCALL_INFO_EXIT;
+ default:
+ return PTRACE_SYSCALL_INFO_NONE;
+ }
+ case SIGTRAP | (PTRACE_EVENT_SECCOMP << 8):
+ return PTRACE_SYSCALL_INFO_SECCOMP;
+ default:
+ return PTRACE_SYSCALL_INFO_NONE;
+ }
+}
+
+static int
+ptrace_get_syscall_info(struct task_struct *child, unsigned long user_size,
+ void __user *datavp)
+{
+ struct pt_regs *regs = task_pt_regs(child);
+ struct ptrace_syscall_info info = {
+ .op = ptrace_get_syscall_info_op(child),
+ .arch = syscall_get_arch(child),
+ .instruction_pointer = instruction_pointer(regs),
+ .stack_pointer = user_stack_pointer(regs),
+ };
+ unsigned long actual_size = offsetof(struct ptrace_syscall_info, entry);
+ unsigned long write_size;
+
+ switch (info.op) {
+ case PTRACE_SYSCALL_INFO_ENTRY:
+ actual_size = ptrace_get_syscall_info_entry(child, regs, &info);
+ break;
+ case PTRACE_SYSCALL_INFO_EXIT:
+ actual_size = ptrace_get_syscall_info_exit(child, regs, &info);
+ break;
+ case PTRACE_SYSCALL_INFO_SECCOMP:
+ actual_size = ptrace_get_syscall_info_seccomp(child, regs, &info);
+ break;
+ }
+
+ write_size = min(actual_size, user_size);
+ return copy_to_user(datavp, &info, write_size) ? -EFAULT : actual_size;
+}
+
+static int
+ptrace_set_syscall_info_entry(struct task_struct *child, struct pt_regs *regs,
+ struct ptrace_syscall_info *info)
+{
+ unsigned long args[ARRAY_SIZE(info->entry.args)];
+ int nr = info->entry.nr;
+ int i;
+
+ /*
+ * Check that the syscall number specified in info->entry.nr
+ * is either a value of type "int" or a sign-extended value
+ * of type "int".
+ */
+ if (nr != info->entry.nr)
+ return -ERANGE;
+
+ for (i = 0; i < ARRAY_SIZE(args); i++) {
+ args[i] = info->entry.args[i];
+ /*
+ * Check that the syscall argument specified in
+ * info->entry.args[i] is either a value of type
+ * "unsigned long" or a sign-extended value of type "long".
+ */
+ if (args[i] != info->entry.args[i])
+ return -ERANGE;
+ }
+
+ syscall_set_nr(child, regs, nr);
+ /*
+ * If the syscall number is set to -1, setting syscall arguments is not
+ * just pointless, it would also clobber the syscall return value on
+ * those architectures that share the same register both for the first
+ * argument of syscall and its return value.
+ */
+ if (nr != -1)
+ syscall_set_arguments(child, regs, args);
+
+ return 0;
+}
+
+static int
+ptrace_set_syscall_info_seccomp(struct task_struct *child, struct pt_regs *regs,
+ struct ptrace_syscall_info *info)
+{
+ /*
+ * info->entry is currently a subset of info->seccomp,
+ * info->seccomp.ret_data is currently ignored.
+ */
+ return ptrace_set_syscall_info_entry(child, regs, info);
+}
+
+static int
+ptrace_set_syscall_info_exit(struct task_struct *child, struct pt_regs *regs,
+ struct ptrace_syscall_info *info)
+{
+ long rval = info->exit.rval;
+
+ /*
+ * Check that the return value specified in info->exit.rval
+ * is either a value of type "long" or a sign-extended value
+ * of type "long".
+ */
+ if (rval != info->exit.rval)
+ return -ERANGE;
+
+ if (info->exit.is_error)
+ syscall_set_return_value(child, regs, rval, 0);
+ else
+ syscall_set_return_value(child, regs, 0, rval);
+
+ return 0;
+}
+
+static int
+ptrace_set_syscall_info(struct task_struct *child, unsigned long user_size,
+ const void __user *datavp)
+{
+ struct pt_regs *regs = task_pt_regs(child);
+ struct ptrace_syscall_info info;
+
+ if (user_size < sizeof(info))
+ return -EINVAL;
+
+ /*
+ * The compatibility is tracked by info.op and info.flags: if user-space
+ * does not instruct us to use unknown extra bits from future versions
+ * of ptrace_syscall_info, we are not going to read them either.
+ */
+ if (copy_from_user(&info, datavp, sizeof(info)))
+ return -EFAULT;
+
+ /* Reserved for future use. */
+ if (info.flags || info.reserved)
+ return -EINVAL;
+
+ /* Changing the type of the system call stop is not supported yet. */
+ if (ptrace_get_syscall_info_op(child) != info.op)
+ return -EINVAL;
+
+ switch (info.op) {
+ case PTRACE_SYSCALL_INFO_ENTRY:
+ return ptrace_set_syscall_info_entry(child, regs, &info);
+ case PTRACE_SYSCALL_INFO_EXIT:
+ return ptrace_set_syscall_info_exit(child, regs, &info);
+ case PTRACE_SYSCALL_INFO_SECCOMP:
+ return ptrace_set_syscall_info_seccomp(child, regs, &info);
+ default:
+ /* Other types of system call stops are not supported yet. */
+ return -EINVAL;
+ }
+}
+#endif /* CONFIG_HAVE_ARCH_TRACEHOOK */
int ptrace_request(struct task_struct *child, long request,
unsigned long addr, unsigned long data)
{
bool seized = child->ptrace & PT_SEIZED;
int ret = -EIO;
- siginfo_t siginfo, *si;
+ kernel_siginfo_t siginfo, *si;
void __user *datavp = (void __user *) data;
unsigned long __user *datalp = datavp;
unsigned long flags;
@@ -835,24 +1172,31 @@ int ptrace_request(struct task_struct *child, long request,
break;
case PTRACE_SETSIGINFO:
- if (copy_from_user(&siginfo, datavp, sizeof siginfo))
- ret = -EFAULT;
- else
+ ret = copy_siginfo_from_user(&siginfo, datavp);
+ if (!ret)
ret = ptrace_setsiginfo(child, &siginfo);
break;
- case PTRACE_GETSIGMASK:
+ case PTRACE_GETSIGMASK: {
+ sigset_t *mask;
+
if (addr != sizeof(sigset_t)) {
ret = -EINVAL;
break;
}
- if (copy_to_user(datavp, &child->blocked, sizeof(sigset_t)))
+ if (test_tsk_restore_sigmask(child))
+ mask = &child->saved_sigmask;
+ else
+ mask = &child->blocked;
+
+ if (copy_to_user(datavp, mask, sizeof(sigset_t)))
ret = -EFAULT;
else
ret = 0;
break;
+ }
case PTRACE_SETSIGMASK: {
sigset_t new_set;
@@ -878,6 +1222,8 @@ int ptrace_request(struct task_struct *child, long request,
child->blocked = new_set;
spin_unlock_irq(&child->sighand->siglock);
+ clear_tsk_restore_sigmask(child);
+
ret = 0;
break;
}
@@ -965,9 +1311,7 @@ int ptrace_request(struct task_struct *child, long request,
}
#endif
-#ifdef PTRACE_SINGLESTEP
case PTRACE_SINGLESTEP:
-#endif
#ifdef PTRACE_SINGLEBLOCK
case PTRACE_SINGLEBLOCK:
#endif
@@ -980,9 +1324,8 @@ int ptrace_request(struct task_struct *child, long request,
return ptrace_resume(child, request, data);
case PTRACE_KILL:
- if (child->exit_state) /* already dead */
- return 0;
- return ptrace_resume(child, request, SIGKILL);
+ send_sig_info(SIGKILL, SEND_SIG_NOINFO, child);
+ return 0;
#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
case PTRACE_GETREGSET:
@@ -990,7 +1333,7 @@ int ptrace_request(struct task_struct *child, long request,
struct iovec kiov;
struct iovec __user *uiov = datavp;
- if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov)))
+ if (!access_ok(uiov, sizeof(*uiov)))
return -EFAULT;
if (__get_user(kiov.iov_base, &uiov->iov_base) ||
@@ -1002,32 +1345,44 @@ int ptrace_request(struct task_struct *child, long request,
ret = __put_user(kiov.iov_len, &uiov->iov_len);
break;
}
+
+ case PTRACE_GET_SYSCALL_INFO:
+ ret = ptrace_get_syscall_info(child, addr, datavp);
+ break;
+
+ case PTRACE_SET_SYSCALL_INFO:
+ ret = ptrace_set_syscall_info(child, addr, datavp);
+ break;
#endif
- default:
+
+ case PTRACE_SECCOMP_GET_FILTER:
+ ret = seccomp_get_filter(child, addr, datavp);
break;
- }
- return ret;
-}
+ case PTRACE_SECCOMP_GET_METADATA:
+ ret = seccomp_get_metadata(child, addr, datavp);
+ break;
-static struct task_struct *ptrace_get_task_struct(pid_t pid)
-{
- struct task_struct *child;
+#ifdef CONFIG_RSEQ
+ case PTRACE_GET_RSEQ_CONFIGURATION:
+ ret = ptrace_get_rseq_configuration(child, addr, datavp);
+ break;
+#endif
- rcu_read_lock();
- child = find_task_by_vpid(pid);
- if (child)
- get_task_struct(child);
- rcu_read_unlock();
+ case PTRACE_SET_SYSCALL_USER_DISPATCH_CONFIG:
+ ret = syscall_user_dispatch_set_config(child, addr, datavp);
+ break;
- if (!child)
- return ERR_PTR(-ESRCH);
- return child;
-}
+ case PTRACE_GET_SYSCALL_USER_DISPATCH_CONFIG:
+ ret = syscall_user_dispatch_get_config(child, addr, datavp);
+ break;
-#ifndef arch_ptrace_attach
-#define arch_ptrace_attach(child) do { } while (0)
-#endif
+ default:
+ break;
+ }
+
+ return ret;
+}
SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
unsigned long, data)
@@ -1037,25 +1392,17 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
if (request == PTRACE_TRACEME) {
ret = ptrace_traceme();
- if (!ret)
- arch_ptrace_attach(current);
goto out;
}
- child = ptrace_get_task_struct(pid);
- if (IS_ERR(child)) {
- ret = PTR_ERR(child);
+ child = find_get_task_by_vpid(pid);
+ if (!child) {
+ ret = -ESRCH;
goto out;
}
if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
ret = ptrace_attach(child, request, addr, data);
- /*
- * Some architectures need to do book-keeping after
- * a ptrace attach.
- */
- if (!ret)
- arch_ptrace_attach(child);
goto out_put_task_struct;
}
@@ -1080,7 +1427,7 @@ int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr,
unsigned long tmp;
int copied;
- copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), 0);
+ copied = ptrace_access_vm(tsk, addr, &tmp, sizeof(tmp), FOLL_FORCE);
if (copied != sizeof(tmp))
return -EIO;
return put_user(tmp, (unsigned long __user *)data);
@@ -1091,7 +1438,8 @@ int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr,
{
int copied;
- copied = access_process_vm(tsk, addr, &data, sizeof(data), 1);
+ copied = ptrace_access_vm(tsk, addr, &data, sizeof(data),
+ FOLL_FORCE | FOLL_WRITE);
return (copied == sizeof(data)) ? 0 : -EIO;
}
@@ -1102,13 +1450,14 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
{
compat_ulong_t __user *datap = compat_ptr(data);
compat_ulong_t word;
- siginfo_t siginfo;
+ kernel_siginfo_t siginfo;
int ret;
switch (request) {
case PTRACE_PEEKTEXT:
case PTRACE_PEEKDATA:
- ret = access_process_vm(child, addr, &word, sizeof(word), 0);
+ ret = ptrace_access_vm(child, addr, &word, sizeof(word),
+ FOLL_FORCE);
if (ret != sizeof(word))
ret = -EIO;
else
@@ -1117,7 +1466,8 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
case PTRACE_POKETEXT:
case PTRACE_POKEDATA:
- ret = access_process_vm(child, addr, &data, sizeof(data), 1);
+ ret = ptrace_access_vm(child, addr, &data, sizeof(data),
+ FOLL_FORCE | FOLL_WRITE);
ret = (ret != sizeof(data) ? -EIO : 0);
break;
@@ -1134,11 +1484,9 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
break;
case PTRACE_SETSIGINFO:
- memset(&siginfo, 0, sizeof siginfo);
- if (copy_siginfo_from_user32(
- &siginfo, (struct compat_siginfo __user *) datap))
- ret = -EFAULT;
- else
+ ret = copy_siginfo_from_user32(
+ &siginfo, (struct compat_siginfo __user *) datap);
+ if (!ret)
ret = ptrace_setsiginfo(child, &siginfo);
break;
#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
@@ -1151,7 +1499,7 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
compat_uptr_t ptr;
compat_size_t len;
- if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov)))
+ if (!access_ok(uiov, sizeof(*uiov)))
return -EFAULT;
if (__get_user(ptr, &uiov->iov_base) ||
@@ -1186,20 +1534,14 @@ COMPAT_SYSCALL_DEFINE4(ptrace, compat_long_t, request, compat_long_t, pid,
goto out;
}
- child = ptrace_get_task_struct(pid);
- if (IS_ERR(child)) {
- ret = PTR_ERR(child);
+ child = find_get_task_by_vpid(pid);
+ if (!child) {
+ ret = -ESRCH;
goto out;
}
if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
ret = ptrace_attach(child, request, addr, data);
- /*
- * Some architectures need to do book-keeping after
- * a ptrace attach.
- */
- if (!ret)
- arch_ptrace_attach(child);
goto out_put_task_struct;
}
diff --git a/kernel/range.c b/kernel/range.c
index 82cfc285b046..56435f96da73 100644
--- a/kernel/range.c
+++ b/kernel/range.c
@@ -1,8 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Range add and subtract
*/
-#include <linux/kernel.h>
#include <linux/init.h>
+#include <linux/minmax.h>
+#include <linux/printk.h>
#include <linux/sort.h>
#include <linux/string.h>
#include <linux/range.h>
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
new file mode 100644
index 000000000000..4d9b21f69eaa
--- /dev/null
+++ b/kernel/rcu/Kconfig
@@ -0,0 +1,379 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# RCU-related configuration options
+#
+
+menu "RCU Subsystem"
+
+config TREE_RCU
+ bool
+ default y if SMP
+ # Dynticks-idle tracking
+ select CONTEXT_TRACKING_IDLE
+ help
+ This option selects the RCU implementation that is
+ designed for very large SMP system with hundreds or
+ thousands of CPUs. It also scales down nicely to
+ smaller systems.
+
+config PREEMPT_RCU
+ bool
+ default y if (PREEMPT || PREEMPT_RT || PREEMPT_DYNAMIC)
+ select TREE_RCU
+ help
+ This option selects the RCU implementation that is
+ designed for very large SMP systems with hundreds or
+ thousands of CPUs, but for which real-time response
+ is also required. It also scales down nicely to
+ smaller systems.
+
+ Select this option if you are unsure.
+
+config TINY_RCU
+ bool
+ default y if !PREEMPT_RCU && !SMP
+ help
+ This option selects the RCU implementation that is
+ designed for UP systems from which real-time response
+ is not required. This option greatly reduces the
+ memory footprint of RCU.
+
+config RCU_EXPERT
+ bool "Make expert-level adjustments to RCU configuration"
+ default n
+ help
+ This option needs to be enabled if you wish to make
+ expert-level adjustments to RCU configuration. By default,
+ no such adjustments can be made, which has the often-beneficial
+ side-effect of preventing "make oldconfig" from asking you all
+ sorts of detailed questions about how you would like numerous
+ obscure RCU options to be set up.
+
+ Say Y if you need to make expert-level adjustments to RCU.
+
+ Say N if you are unsure.
+
+config TINY_SRCU
+ bool
+ default y if TINY_RCU
+ help
+ This option selects the single-CPU non-preemptible version of SRCU.
+
+config TREE_SRCU
+ bool
+ default y if !TINY_RCU
+ help
+ This option selects the full-fledged version of SRCU.
+
+config FORCE_NEED_SRCU_NMI_SAFE
+ bool "Force selection of NEED_SRCU_NMI_SAFE"
+ depends on !TINY_SRCU
+ depends on RCU_EXPERT
+ depends on ARCH_HAS_NMI_SAFE_THIS_CPU_OPS
+ select NEED_SRCU_NMI_SAFE
+ default n
+ help
+ This option forces selection of the NEED_SRCU_NMI_SAFE
+ Kconfig option, allowing testing of srcu_read_lock_nmisafe()
+ and srcu_read_unlock_nmisafe() on architectures (like x86)
+ that select the ARCH_HAS_NMI_SAFE_THIS_CPU_OPS Kconfig option.
+
+config NEED_SRCU_NMI_SAFE
+ def_bool HAVE_NMI && !ARCH_HAS_NMI_SAFE_THIS_CPU_OPS && !TINY_SRCU
+
+config TASKS_RCU_GENERIC
+ def_bool TASKS_RCU || TASKS_RUDE_RCU || TASKS_TRACE_RCU
+ help
+ This option enables generic infrastructure code supporting
+ task-based RCU implementations. Not for manual selection.
+
+config FORCE_TASKS_RCU
+ bool "Force selection of TASKS_RCU"
+ depends on RCU_EXPERT
+ select TASKS_RCU
+ default n
+ help
+ This option force-enables a task-based RCU implementation
+ that uses only voluntary context switch (not preemption!),
+ idle, and user-mode execution as quiescent states. Not for
+ manual selection in most cases.
+
+config NEED_TASKS_RCU
+ bool
+ default n
+
+config TASKS_RCU
+ bool
+ default NEED_TASKS_RCU && PREEMPTION
+ select IRQ_WORK
+
+config FORCE_TASKS_RUDE_RCU
+ bool "Force selection of Tasks Rude RCU"
+ depends on RCU_EXPERT
+ select TASKS_RUDE_RCU
+ default n
+ help
+ This option force-enables a task-based RCU implementation
+ that uses only context switch (including preemption) and
+ user-mode execution as quiescent states. It forces IPIs and
+ context switches on all online CPUs, including idle ones,
+ so use with caution. Not for manual selection in most cases.
+
+config TASKS_RUDE_RCU
+ bool
+ default n
+ select IRQ_WORK
+
+config FORCE_TASKS_TRACE_RCU
+ bool "Force selection of Tasks Trace RCU"
+ depends on RCU_EXPERT
+ select TASKS_TRACE_RCU
+ default n
+ help
+ This option enables a task-based RCU implementation that uses
+ explicit rcu_read_lock_trace() read-side markers, and allows
+ these readers to appear in the idle loop as well as on the
+ CPU hotplug code paths. It can force IPIs on online CPUs,
+ including idle ones, so use with caution. Not for manual
+ selection in most cases.
+
+config TASKS_TRACE_RCU
+ bool
+ default n
+ select IRQ_WORK
+
+config RCU_STALL_COMMON
+ def_bool TREE_RCU
+ help
+ This option enables RCU CPU stall code that is common between
+ the TINY and TREE variants of RCU. The purpose is to allow
+ the tiny variants to disable RCU CPU stall warnings, while
+ making these warnings mandatory for the tree variants.
+
+config RCU_NEED_SEGCBLIST
+ def_bool ( TREE_RCU || TREE_SRCU || TASKS_RCU_GENERIC )
+
+config RCU_FANOUT
+ int "Tree-based hierarchical RCU fanout value"
+ range 2 64 if 64BIT
+ range 2 32 if !64BIT
+ depends on TREE_RCU && RCU_EXPERT
+ default 64 if 64BIT
+ default 32 if !64BIT
+ help
+ This option controls the fanout of hierarchical implementations
+ of RCU, allowing RCU to work efficiently on machines with
+ large numbers of CPUs. This value must be at least the fourth
+ root of NR_CPUS, which allows NR_CPUS to be insanely large.
+ The default value of RCU_FANOUT should be used for production
+ systems, but if you are stress-testing the RCU implementation
+ itself, small RCU_FANOUT values allow you to test large-system
+ code paths on small(er) systems.
+
+ Select a specific number if testing RCU itself.
+ Take the default if unsure.
+
+config RCU_FANOUT_LEAF
+ int "Tree-based hierarchical RCU leaf-level fanout value"
+ range 2 64 if 64BIT && !RCU_STRICT_GRACE_PERIOD
+ range 2 32 if !64BIT && !RCU_STRICT_GRACE_PERIOD
+ range 2 3 if RCU_STRICT_GRACE_PERIOD
+ depends on TREE_RCU && RCU_EXPERT
+ default 16 if !RCU_STRICT_GRACE_PERIOD
+ default 2 if RCU_STRICT_GRACE_PERIOD
+ help
+ This option controls the leaf-level fanout of hierarchical
+ implementations of RCU, and allows trading off cache misses
+ against lock contention. Systems that synchronize their
+ scheduling-clock interrupts for energy-efficiency reasons will
+ want the default because the smaller leaf-level fanout keeps
+ lock contention levels acceptably low. Very large systems
+ (hundreds or thousands of CPUs) will instead want to set this
+ value to the maximum value possible in order to reduce the
+ number of cache misses incurred during RCU's grace-period
+ initialization. These systems tend to run CPU-bound, and thus
+ are not helped by synchronized interrupts, and thus tend to
+ skew them, which reduces lock contention enough that large
+ leaf-level fanouts work well. That said, setting leaf-level
+ fanout to a large number will likely cause problematic
+ lock contention on the leaf-level rcu_node structures unless
+ you boot with the skew_tick kernel parameter.
+
+ Select a specific number if testing RCU itself.
+
+ Select the maximum permissible value for large systems, but
+ please understand that you may also need to set the skew_tick
+ kernel boot parameter to avoid contention on the rcu_node
+ structure's locks.
+
+ Take the default if unsure.
+
+config RCU_BOOST
+ bool "Enable RCU priority boosting"
+ depends on (RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT) || PREEMPT_RT
+ default y if PREEMPT_RT
+ help
+ This option boosts the priority of preempted RCU readers that
+ block the current preemptible RCU grace period for too long.
+ This option also prevents heavy loads from blocking RCU
+ callback invocation.
+
+ Say Y here if you are working with real-time apps or heavy loads
+ Say N here if you are unsure.
+
+config RCU_BOOST_DELAY
+ int "Milliseconds to delay boosting after RCU grace-period start"
+ range 0 3000
+ depends on RCU_BOOST
+ default 500
+ help
+ This option specifies the time to wait after the beginning of
+ a given grace period before priority-boosting preempted RCU
+ readers blocking that grace period. Note that any RCU reader
+ blocking an expedited RCU grace period is boosted immediately.
+
+ Accept the default if unsure.
+
+config RCU_EXP_KTHREAD
+ bool "Perform RCU expedited work in a real-time kthread"
+ depends on RCU_BOOST && RCU_EXPERT
+ default !PREEMPT_RT && NR_CPUS <= 32
+ help
+ Use this option to further reduce the latencies of expedited
+ grace periods at the expense of being more disruptive.
+
+ This option is disabled by default on PREEMPT_RT=y kernels which
+ disable expedited grace periods after boot by unconditionally
+ setting rcupdate.rcu_normal_after_boot=1.
+
+ Accept the default if unsure.
+
+config RCU_NOCB_CPU
+ bool "Offload RCU callback processing from boot-selected CPUs"
+ depends on TREE_RCU
+ depends on RCU_EXPERT || NO_HZ_FULL
+ default n
+ help
+ Use this option to reduce OS jitter for aggressive HPC or
+ real-time workloads. It can also be used to offload RCU
+ callback invocation to energy-efficient CPUs in battery-powered
+ asymmetric multiprocessors. The price of this reduced jitter
+ is that the overhead of call_rcu() increases and that some
+ workloads will incur significant increases in context-switch
+ rates.
+
+ This option offloads callback invocation from the set of
+ CPUs specified at boot time by the rcu_nocbs parameter.
+ For each such CPU, a kthread ("rcuox/N") will be created to
+ invoke callbacks, where the "N" is the CPU being offloaded,
+ and where the "x" is "p" for RCU-preempt (PREEMPTION kernels)
+ and "s" for RCU-sched (!PREEMPTION kernels). This option
+ also creates another kthread for each sqrt(nr_cpu_ids) CPUs
+ ("rcuog/N", where N is the first CPU in that group to come
+ online), which handles grace periods for its group. Nothing
+ prevents these kthreads from running on the specified CPUs,
+ but (1) the kthreads may be preempted between each callback,
+ and (2) affinity or cgroups can be used to force the kthreads
+ to run on whatever set of CPUs is desired.
+
+ The sqrt(nr_cpu_ids) grouping may be overridden using the
+ rcutree.rcu_nocb_gp_stride kernel boot parameter. This can
+ be especially helpful for smaller numbers of CPUs, where
+ sqrt(nr_cpu_ids) can be a bit of a blunt instrument.
+
+ Say Y here if you need reduced OS jitter, despite added overhead.
+ Say N here if you are unsure.
+
+config RCU_NOCB_CPU_DEFAULT_ALL
+ bool "Offload RCU callback processing from all CPUs by default"
+ depends on RCU_NOCB_CPU
+ default n
+ help
+ Use this option to offload callback processing from all CPUs
+ by default, in the absence of the rcu_nocbs or nohz_full boot
+ parameter. This also avoids the need to use any boot parameters
+ to achieve the effect of offloading all CPUs on boot.
+
+ Say Y here if you want offload all CPUs by default on boot.
+ Say N here if you are unsure.
+
+config RCU_NOCB_CPU_CB_BOOST
+ bool "Offload RCU callback from real-time kthread"
+ depends on RCU_NOCB_CPU && RCU_BOOST
+ default y if PREEMPT_RT
+ help
+ Use this option to invoke offloaded callbacks as SCHED_FIFO
+ to avoid starvation by heavy SCHED_OTHER background load.
+ Of course, running as SCHED_FIFO during callback floods will
+ cause the rcuo[ps] kthreads to monopolize the CPU for hundreds
+ of milliseconds or more. Therefore, when enabling this option,
+ it is your responsibility to ensure that latency-sensitive
+ tasks either run with higher priority or run on some other CPU.
+
+ Say Y here if you want to set RT priority for offloading kthreads.
+ Say N here if you are building a !PREEMPT_RT kernel and are unsure.
+
+config TASKS_TRACE_RCU_READ_MB
+ bool "Tasks Trace RCU readers use memory barriers in user and idle"
+ depends on RCU_EXPERT && TASKS_TRACE_RCU
+ default PREEMPT_RT || NR_CPUS < 8
+ help
+ Use this option to further reduce the number of IPIs sent
+ to CPUs executing in userspace or idle during tasks trace
+ RCU grace periods. Given that a reasonable setting of
+ the rcupdate.rcu_task_ipi_delay kernel boot parameter
+ eliminates such IPIs for many workloads, proper setting
+ of this Kconfig option is important mostly for aggressive
+ real-time installations and for battery-powered devices,
+ hence the default chosen above.
+
+ Say Y here if you hate IPIs.
+ Say N here if you hate read-side memory barriers.
+ Take the default if you are unsure.
+
+config RCU_LAZY
+ bool "RCU callback lazy invocation functionality"
+ depends on RCU_NOCB_CPU
+ default n
+ help
+ To save power, batch RCU callbacks and delay starting the
+ corresponding grace period for multiple seconds. The grace
+ period will be started after this delay, in case of memory
+ pressure, or if the corresponding CPU's callback list grows
+ too large.
+
+ These delays happen only on rcu_nocbs CPUs, that is, CPUs
+ whose callbacks have been offloaded.
+
+ Use the rcutree.enable_rcu_lazy=0 kernel-boot parameter to
+ globally disable these delays.
+
+config RCU_LAZY_DEFAULT_OFF
+ bool "Turn RCU lazy invocation off by default"
+ depends on RCU_LAZY
+ default n
+ help
+ Build the kernel with CONFIG_RCU_LAZY=y, but cause the kernel
+ to boot with these energy-efficiency delays disabled. Use the
+ rcutree.enable_rcu_lazy=0 kernel-boot parameter to override
+ the this option at boot time, thus re-enabling these delays.
+
+config RCU_DOUBLE_CHECK_CB_TIME
+ bool "RCU callback-batch backup time check"
+ depends on RCU_EXPERT
+ default n
+ help
+ Use this option to provide more precise enforcement of the
+ rcutree.rcu_resched_ns module parameter in situations where
+ a single RCU callback might run for hundreds of microseconds,
+ thus defeating the 32-callback batching used to amortize the
+ cost of the fine-grained but expensive local_clock() function.
+
+ This option rounds rcutree.rcu_resched_ns up to the next
+ jiffy, and overrides the 32-callback batching if this limit
+ is exceeded.
+
+ Say Y here if you need tighter callback-limit enforcement.
+ Say N here if you are unsure.
+
+endmenu # "RCU Subsystem"
diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug
new file mode 100644
index 000000000000..625d75392647
--- /dev/null
+++ b/kernel/rcu/Kconfig.debug
@@ -0,0 +1,231 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# RCU-related debugging configuration options
+#
+
+menu "RCU Debugging"
+
+config PROVE_RCU
+ def_bool PROVE_LOCKING
+
+config PROVE_RCU_LIST
+ bool "RCU list lockdep debugging"
+ depends on PROVE_RCU && RCU_EXPERT
+ default n
+ help
+ Enable RCU lockdep checking for list usages. By default it is
+ turned off since there are several list RCU users that still
+ need to be converted to pass a lockdep expression. To prevent
+ false-positive splats, we keep it default disabled but once all
+ users are converted, we can remove this config option.
+
+config TORTURE_TEST
+ tristate
+ default n
+
+config RCU_SCALE_TEST
+ tristate "performance tests for RCU"
+ depends on DEBUG_KERNEL
+ select TORTURE_TEST
+ default n
+ help
+ This option provides a kernel module that runs performance
+ tests on the RCU infrastructure. The kernel module may be built
+ after the fact on the running kernel to be tested, if desired.
+
+ Say Y here if you want RCU performance tests to be built into
+ the kernel.
+ Say M if you want the RCU performance tests to build as a module.
+ Say N if you are unsure.
+
+config RCU_TORTURE_TEST
+ tristate "torture tests for RCU"
+ depends on DEBUG_KERNEL
+ select TORTURE_TEST
+ default n
+ help
+ This option provides a kernel module that runs torture tests
+ on the RCU infrastructure. The kernel module may be built
+ after the fact on the running kernel to be tested, if desired.
+
+ Say Y here if you want RCU torture tests to be built into
+ the kernel.
+ Say M if you want the RCU torture tests to build as a module.
+ Say N if you are unsure.
+
+config RCU_TORTURE_TEST_CHK_RDR_STATE
+ bool "Check rcutorture reader state"
+ depends on RCU_TORTURE_TEST
+ default n
+ help
+ This option causes rcutorture to check the desired rcutorture
+ reader state for each segment against the actual context.
+ Note that PREEMPT_COUNT must be enabled if the preempt-disabled
+ and bh-disabled checks are to take effect, and that PREEMPT_RCU
+ must be enabled for the RCU-nesting checks to take effect.
+ These checks add overhead, and this Kconfig options is therefore
+ disabled by default.
+
+ Say Y here if you want rcutorture reader contexts checked.
+ Say N if you are unsure.
+
+config RCU_TORTURE_TEST_LOG_CPU
+ bool "Log CPU for rcutorture failures"
+ depends on RCU_TORTURE_TEST
+ default n
+ help
+ This option causes rcutorture to decorate each entry of its
+ log of failure/close-call rcutorture reader segments with the
+ number of the CPU that the reader was running on at the time.
+ This information can be useful, but it does incur additional
+ overhead, overhead that can make both failures and close calls
+ less probable.
+
+ Say Y here if you want CPU IDs logged.
+ Say N if you are unsure.
+
+config RCU_TORTURE_TEST_LOG_GP
+ bool "Log grace-period numbers for rcutorture failures"
+ depends on RCU_TORTURE_TEST
+ default n
+ help
+ This option causes rcutorture to decorate each entry of its
+ log of failure/close-call rcutorture reader segments with the
+ corresponding grace-period sequence numbers. This information
+ can be useful, but it does incur additional overhead, overhead
+ that can make both failures and close calls less probable.
+
+ Say Y here if you want grace-period sequence numbers logged.
+ Say N if you are unsure.
+
+config RCU_REF_SCALE_TEST
+ tristate "Scalability tests for read-side synchronization (RCU and others)"
+ depends on DEBUG_KERNEL
+ select TORTURE_TEST
+ default n
+ help
+ This option provides a kernel module that runs performance tests
+ useful comparing RCU with various read-side synchronization mechanisms.
+ The kernel module may be built after the fact on the running kernel to be
+ tested, if desired.
+
+ Say Y here if you want these performance tests built into the kernel.
+ Say M if you want to build it as a module instead.
+ Say N if you are unsure.
+
+config RCU_CPU_STALL_TIMEOUT
+ int "RCU CPU stall timeout in seconds"
+ depends on RCU_STALL_COMMON
+ range 3 300
+ default 21
+ help
+ If a given RCU grace period extends more than the specified
+ number of seconds, a CPU stall warning is printed. If the
+ RCU grace period persists, additional CPU stall warnings are
+ printed at more widely spaced intervals.
+
+config RCU_EXP_CPU_STALL_TIMEOUT
+ int "Expedited RCU CPU stall timeout in milliseconds"
+ depends on RCU_STALL_COMMON
+ range 0 300000
+ default 0
+ help
+ If a given expedited RCU grace period extends more than the
+ specified number of milliseconds, a CPU stall warning is printed.
+ If the RCU grace period persists, additional CPU stall warnings
+ are printed at more widely spaced intervals. A value of zero
+ says to use the RCU_CPU_STALL_TIMEOUT value converted from
+ seconds to milliseconds.
+
+config RCU_CPU_STALL_CPUTIME
+ bool "Provide additional RCU stall debug information"
+ depends on RCU_STALL_COMMON
+ default n
+ help
+ Collect statistics during the sampling period, such as the number of
+ (hard interrupts, soft interrupts, task switches) and the cputime of
+ (hard interrupts, soft interrupts, kernel tasks) are added to the
+ RCU stall report. For multiple continuous RCU stalls, all sampling
+ periods begin at half of the first RCU stall timeout.
+ The boot option rcupdate.rcu_cpu_stall_cputime has the same function
+ as this one, but will override this if it exists.
+
+config RCU_CPU_STALL_NOTIFIER
+ bool "Provide RCU CPU-stall notifiers"
+ depends on RCU_STALL_COMMON
+ depends on DEBUG_KERNEL
+ depends on RCU_EXPERT
+ default n
+ help
+ WARNING: You almost certainly do not want this!!!
+
+ Enable RCU CPU-stall notifiers, which are invoked just before
+ printing the RCU CPU stall warning. As such, bugs in notifier
+ callbacks can prevent stall warnings from being printed.
+ And the whole reason that a stall warning is being printed is
+ that something is hung up somewhere. Therefore, the notifier
+ callbacks must be written extremely carefully, preferably
+ containing only lockless code. After all, it is quite possible
+ that the whole reason that the RCU CPU stall is happening in
+ the first place is that someone forgot to release whatever lock
+ that you are thinking of acquiring. In which case, having your
+ notifier callback acquire that lock will hang, preventing the
+ RCU CPU stall warning from appearing.
+
+ Say Y here if you want RCU CPU stall notifiers (you don't want them)
+ Say N if you are unsure.
+
+config RCU_TRACE
+ bool "Enable tracing for RCU"
+ depends on DEBUG_KERNEL
+ default y if TREE_RCU
+ select TRACE_CLOCK
+ help
+ This option enables additional tracepoints for ftrace-style
+ event tracing.
+
+ Say Y here if you want to enable RCU tracing
+ Say N if you are unsure.
+
+config RCU_EQS_DEBUG
+ bool "Provide debugging asserts for adding NO_HZ support to an arch"
+ depends on DEBUG_KERNEL
+ help
+ This option provides consistency checks in RCU's handling of
+ NO_HZ. These checks have proven quite helpful in detecting
+ bugs in arch-specific NO_HZ code.
+
+ Say N here if you need ultimate kernel/user switch latencies
+ Say Y if you are unsure
+
+config RCU_STRICT_GRACE_PERIOD
+ bool "Provide debug RCU implementation with short grace periods"
+ depends on DEBUG_KERNEL && RCU_EXPERT && NR_CPUS <= 4 && !TINY_RCU
+ default n
+ select PREEMPT_COUNT if PREEMPT=n
+ help
+ Select this option to build an RCU variant that is strict about
+ grace periods, making them as short as it can. This limits
+ scalability, destroys real-time response, degrades battery
+ lifetime and kills performance. Don't try this on large
+ machines, as in systems with more than about 10 or 20 CPUs.
+ But in conjunction with tools like KASAN, it can be helpful
+ when looking for certain types of RCU usage bugs, for example,
+ too-short RCU read-side critical sections.
+
+
+config RCU_DYNTICKS_TORTURE
+ bool "Minimize RCU dynticks counter size"
+ depends on RCU_EXPERT && !COMPILE_TEST
+ default n
+ help
+ This option sets the width of the dynticks counter to its
+ minimum usable value. This minimum width greatly increases
+ the probability of flushing out bugs involving counter wrap,
+ but it also increases the probability of extending grace period
+ durations. This Kconfig option should therefore be avoided in
+ production due to the consequent increased probability of OOMs.
+
+ This has no value for production and is only for testing.
+
+endmenu # "RCU Debugging"
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index 50a808424b06..0cfb009a99b9 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -1,7 +1,18 @@
-obj-y += update.o
-obj-$(CONFIG_SRCU) += srcu.o
+# SPDX-License-Identifier: GPL-2.0
+# Any varying coverage in these files is non-deterministic
+# and is generally not a function of system call inputs.
+KCOV_INSTRUMENT := n
+
+ifeq ($(CONFIG_KCSAN),y)
+KBUILD_CFLAGS += -g -fno-omit-frame-pointer
+endif
+
+obj-y += update.o sync.o
+obj-$(CONFIG_TREE_SRCU) += srcutree.o
+obj-$(CONFIG_TINY_SRCU) += srcutiny.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
+obj-$(CONFIG_RCU_SCALE_TEST) += rcuscale.o
+obj-$(CONFIG_RCU_REF_SCALE_TEST) += refscale.o
obj-$(CONFIG_TREE_RCU) += tree.o
-obj-$(CONFIG_PREEMPT_RCU) += tree.o
-obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o
obj-$(CONFIG_TINY_RCU) += tiny.o
+obj-$(CONFIG_RCU_NEED_SEGCBLIST) += rcu_segcblist.o
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 80adef7d4c3d..9cf01832a6c3 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -1,72 +1,233 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
/*
* Read-Copy Update definitions shared among RCU implementations.
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
* Copyright IBM Corporation, 2011
*
- * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+ * Author: Paul E. McKenney <paulmck@linux.ibm.com>
*/
#ifndef __LINUX_RCU_H
#define __LINUX_RCU_H
+#include <linux/slab.h>
#include <trace/events/rcu.h>
-#ifdef CONFIG_RCU_TRACE
-#define RCU_TRACE(stmt) stmt
-#else /* #ifdef CONFIG_RCU_TRACE */
-#define RCU_TRACE(stmt)
-#endif /* #else #ifdef CONFIG_RCU_TRACE */
-
-/*
- * Process-level increment to ->dynticks_nesting field. This allows for
- * architectures that use half-interrupts and half-exceptions from
- * process context.
- *
- * DYNTICK_TASK_NEST_MASK defines a field of width DYNTICK_TASK_NEST_WIDTH
- * that counts the number of process-based reasons why RCU cannot
- * consider the corresponding CPU to be idle, and DYNTICK_TASK_NEST_VALUE
- * is the value used to increment or decrement this field.
- *
- * The rest of the bits could in principle be used to count interrupts,
- * but this would mean that a negative-one value in the interrupt
- * field could incorrectly zero out the DYNTICK_TASK_NEST_MASK field.
- * We therefore provide a two-bit guard field defined by DYNTICK_TASK_MASK
- * that is set to DYNTICK_TASK_FLAG upon initial exit from idle.
- * The DYNTICK_TASK_EXIT_IDLE value is thus the combined value used upon
- * initial exit from idle.
- */
-#define DYNTICK_TASK_NEST_WIDTH 7
-#define DYNTICK_TASK_NEST_VALUE ((LLONG_MAX >> DYNTICK_TASK_NEST_WIDTH) + 1)
-#define DYNTICK_TASK_NEST_MASK (LLONG_MAX - DYNTICK_TASK_NEST_VALUE + 1)
-#define DYNTICK_TASK_FLAG ((DYNTICK_TASK_NEST_VALUE / 8) * 2)
-#define DYNTICK_TASK_MASK ((DYNTICK_TASK_NEST_VALUE / 8) * 3)
-#define DYNTICK_TASK_EXIT_IDLE (DYNTICK_TASK_NEST_VALUE + \
- DYNTICK_TASK_FLAG)
+
+/*
+ * Grace-period counter management.
+ *
+ * The two least significant bits contain the control flags.
+ * The most significant bits contain the grace-period sequence counter.
+ *
+ * When both control flags are zero, no grace period is in progress.
+ * When either bit is non-zero, a grace period has started and is in
+ * progress. When the grace period completes, the control flags are reset
+ * to 0 and the grace-period sequence counter is incremented.
+ *
+ * However some specific RCU usages make use of custom values.
+ *
+ * SRCU special control values:
+ *
+ * SRCU_SNP_INIT_SEQ : Invalid/init value set when SRCU node
+ * is initialized.
+ *
+ * SRCU_STATE_IDLE : No SRCU gp is in progress
+ *
+ * SRCU_STATE_SCAN1 : State set by rcu_seq_start(). Indicates
+ * we are scanning the readers on the slot
+ * defined as inactive (there might well
+ * be pending readers that will use that
+ * index, but their number is bounded).
+ *
+ * SRCU_STATE_SCAN2 : State set manually via rcu_seq_set_state()
+ * Indicates we are flipping the readers
+ * index and then scanning the readers on the
+ * slot newly designated as inactive (again,
+ * the number of pending readers that will use
+ * this inactive index is bounded).
+ *
+ * RCU polled GP special control value:
+ *
+ * RCU_GET_STATE_COMPLETED : State value indicating an already-completed
+ * polled GP has completed. This value covers
+ * both the state and the counter of the
+ * grace-period sequence number.
+ */
+
+/* Low-order bit definition for polled grace-period APIs. */
+#define RCU_GET_STATE_COMPLETED 0x1
+
+/* A complete grace period count */
+#define RCU_SEQ_GP (RCU_SEQ_STATE_MASK + 1)
+
+extern int sysctl_sched_rt_runtime;
+
+/*
+ * Return the counter portion of a sequence number previously returned
+ * by rcu_seq_snap() or rcu_seq_current().
+ */
+static inline unsigned long rcu_seq_ctr(unsigned long s)
+{
+ return s >> RCU_SEQ_CTR_SHIFT;
+}
+
+/*
+ * Return the state portion of a sequence number previously returned
+ * by rcu_seq_snap() or rcu_seq_current().
+ */
+static inline int rcu_seq_state(unsigned long s)
+{
+ return s & RCU_SEQ_STATE_MASK;
+}
+
+/*
+ * Set the state portion of the pointed-to sequence number.
+ * The caller is responsible for preventing conflicting updates.
+ */
+static inline void rcu_seq_set_state(unsigned long *sp, int newstate)
+{
+ WARN_ON_ONCE(newstate & ~RCU_SEQ_STATE_MASK);
+ WRITE_ONCE(*sp, (*sp & ~RCU_SEQ_STATE_MASK) + newstate);
+}
+
+/* Adjust sequence number for start of update-side operation. */
+static inline void rcu_seq_start(unsigned long *sp)
+{
+ WRITE_ONCE(*sp, *sp + 1);
+ smp_mb(); /* Ensure update-side operation after counter increment. */
+ WARN_ON_ONCE(rcu_seq_state(*sp) != 1);
+}
+
+/* Compute the end-of-grace-period value for the specified sequence number. */
+static inline unsigned long rcu_seq_endval(unsigned long *sp)
+{
+ return (*sp | RCU_SEQ_STATE_MASK) + 1;
+}
+
+/* Adjust sequence number for end of update-side operation. */
+static inline void rcu_seq_end(unsigned long *sp)
+{
+ smp_mb(); /* Ensure update-side operation before counter increment. */
+ WARN_ON_ONCE(!rcu_seq_state(*sp));
+ WRITE_ONCE(*sp, rcu_seq_endval(sp));
+}
+
+/*
+ * rcu_seq_snap - Take a snapshot of the update side's sequence number.
+ *
+ * This function returns the earliest value of the grace-period sequence number
+ * that will indicate that a full grace period has elapsed since the current
+ * time. Once the grace-period sequence number has reached this value, it will
+ * be safe to invoke all callbacks that have been registered prior to the
+ * current time. This value is the current grace-period number plus two to the
+ * power of the number of low-order bits reserved for state, then rounded up to
+ * the next value in which the state bits are all zero.
+ */
+static inline unsigned long rcu_seq_snap(unsigned long *sp)
+{
+ unsigned long s;
+
+ s = (READ_ONCE(*sp) + 2 * RCU_SEQ_STATE_MASK + 1) & ~RCU_SEQ_STATE_MASK;
+ smp_mb(); /* Above access must not bleed into critical section. */
+ return s;
+}
+
+/* Return the current value the update side's sequence number, no ordering. */
+static inline unsigned long rcu_seq_current(unsigned long *sp)
+{
+ return READ_ONCE(*sp);
+}
+
+/*
+ * Given a snapshot from rcu_seq_snap(), determine whether or not the
+ * corresponding update-side operation has started.
+ */
+static inline bool rcu_seq_started(unsigned long *sp, unsigned long s)
+{
+ return ULONG_CMP_LT((s - 1) & ~RCU_SEQ_STATE_MASK, READ_ONCE(*sp));
+}
+
+/*
+ * Given a snapshot from rcu_seq_snap(), determine whether or not a
+ * full update-side operation has occurred.
+ */
+static inline bool rcu_seq_done(unsigned long *sp, unsigned long s)
+{
+ return ULONG_CMP_GE(READ_ONCE(*sp), s);
+}
+
+/*
+ * Given a snapshot from rcu_seq_snap(), determine whether or not a
+ * full update-side operation has occurred, but do not allow the
+ * (ULONG_MAX / 2) safety-factor/guard-band.
+ *
+ * The token returned by get_state_synchronize_rcu_full() is based on
+ * rcu_state.gp_seq but it is tested in poll_state_synchronize_rcu_full()
+ * against the root rnp->gp_seq. Since rcu_seq_start() is first called
+ * on rcu_state.gp_seq and only later reflected on the root rnp->gp_seq,
+ * it is possible that rcu_seq_snap(rcu_state.gp_seq) returns 2 full grace
+ * periods ahead of the root rnp->gp_seq. To prevent false-positives with the
+ * full polling API that a wrap around instantly completed the GP, when nothing
+ * like that happened, adjust for the 2 GPs in the ULONG_CMP_LT().
+ */
+static inline bool rcu_seq_done_exact(unsigned long *sp, unsigned long s)
+{
+ unsigned long cur_s = READ_ONCE(*sp);
+
+ return ULONG_CMP_GE(cur_s, s) || ULONG_CMP_LT(cur_s, s - (2 * RCU_SEQ_GP));
+}
+
+/*
+ * Has a grace period completed since the time the old gp_seq was collected?
+ */
+static inline bool rcu_seq_completed_gp(unsigned long old, unsigned long new)
+{
+ return ULONG_CMP_LT(old, new & ~RCU_SEQ_STATE_MASK);
+}
+
+/*
+ * Has a grace period started since the time the old gp_seq was collected?
+ */
+static inline bool rcu_seq_new_gp(unsigned long old, unsigned long new)
+{
+ return ULONG_CMP_LT((old + RCU_SEQ_STATE_MASK) & ~RCU_SEQ_STATE_MASK,
+ new);
+}
+
+/*
+ * Roughly how many full grace periods have elapsed between the collection
+ * of the two specified grace periods?
+ */
+static inline unsigned long rcu_seq_diff(unsigned long new, unsigned long old)
+{
+ unsigned long rnd_diff;
+
+ if (old == new)
+ return 0;
+ /*
+ * Compute the number of grace periods (still shifted up), plus
+ * one if either of new and old is not an exact grace period.
+ */
+ rnd_diff = (new & ~RCU_SEQ_STATE_MASK) -
+ ((old + RCU_SEQ_STATE_MASK) & ~RCU_SEQ_STATE_MASK) +
+ ((new & RCU_SEQ_STATE_MASK) || (old & RCU_SEQ_STATE_MASK));
+ if (ULONG_CMP_GE(RCU_SEQ_STATE_MASK, rnd_diff))
+ return 1; /* Definitely no grace period has elapsed. */
+ return ((rnd_diff - RCU_SEQ_STATE_MASK - 1) >> RCU_SEQ_CTR_SHIFT) + 2;
+}
/*
* debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally
- * by call_rcu() and rcu callback execution, and are therefore not part of the
- * RCU API. Leaving in rcupdate.h because they are used by all RCU flavors.
+ * by call_rcu() and rcu callback execution, and are therefore not part
+ * of the RCU API. These are in rcupdate.h because they are used by all
+ * RCU implementations.
*/
#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
# define STATE_RCU_HEAD_READY 0
# define STATE_RCU_HEAD_QUEUED 1
-extern struct debug_obj_descr rcuhead_debug_descr;
+extern const struct debug_obj_descr rcuhead_debug_descr;
static inline int debug_rcu_head_queue(struct rcu_head *head)
{
@@ -97,35 +258,62 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head)
}
#endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
-void kfree(const void *);
+static inline void debug_rcu_head_callback(struct rcu_head *rhp)
+{
+ if (unlikely(!rhp->func))
+ kmem_dump_obj(rhp);
+}
-/*
- * Reclaim the specified callback, either by invoking it (non-lazy case)
- * or freeing it directly (lazy case). Return true if lazy, false otherwise.
- */
-static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
+static inline bool rcu_barrier_cb_is_done(struct rcu_head *rhp)
{
- unsigned long offset = (unsigned long)head->func;
+ return rhp->next == rhp;
+}
- rcu_lock_acquire(&rcu_callback_map);
- if (__is_kfree_rcu_offset(offset)) {
- RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset));
- kfree((void *)head - offset);
- rcu_lock_release(&rcu_callback_map);
- return true;
- } else {
- RCU_TRACE(trace_rcu_invoke_callback(rn, head));
- head->func(head);
- rcu_lock_release(&rcu_callback_map);
- return false;
- }
+extern int rcu_cpu_stall_suppress_at_boot;
+
+static inline bool rcu_stall_is_suppressed_at_boot(void)
+{
+ return rcu_cpu_stall_suppress_at_boot && !rcu_inkernel_boot_has_ended();
}
+extern int rcu_cpu_stall_notifiers;
+
#ifdef CONFIG_RCU_STALL_COMMON
+extern int rcu_cpu_stall_ftrace_dump;
extern int rcu_cpu_stall_suppress;
+extern int rcu_cpu_stall_timeout;
+extern int rcu_exp_cpu_stall_timeout;
+extern int rcu_cpu_stall_cputime;
+extern bool rcu_exp_stall_task_details __read_mostly;
int rcu_jiffies_till_stall_check(void);
+int rcu_exp_jiffies_till_stall_check(void);
+
+static inline bool rcu_stall_is_suppressed(void)
+{
+ return rcu_stall_is_suppressed_at_boot() || rcu_cpu_stall_suppress;
+}
+#define rcu_ftrace_dump_stall_suppress() \
+do { \
+ if (!rcu_cpu_stall_suppress) \
+ rcu_cpu_stall_suppress = 3; \
+} while (0)
+
+#define rcu_ftrace_dump_stall_unsuppress() \
+do { \
+ if (rcu_cpu_stall_suppress == 3) \
+ rcu_cpu_stall_suppress = 0; \
+} while (0)
+
+#else /* #endif #ifdef CONFIG_RCU_STALL_COMMON */
+
+static inline bool rcu_stall_is_suppressed(void)
+{
+ return rcu_stall_is_suppressed_at_boot();
+}
+#define rcu_ftrace_dump_stall_suppress()
+#define rcu_ftrace_dump_stall_unsuppress()
#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
/*
@@ -135,7 +323,24 @@ int rcu_jiffies_till_stall_check(void);
*/
#define TPS(x) tracepoint_string(x)
+/*
+ * Dump the ftrace buffer, but only one time per callsite per boot.
+ */
+#define rcu_ftrace_dump(oops_dump_mode) \
+do { \
+ static atomic_t ___rfd_beenhere = ATOMIC_INIT(0); \
+ \
+ if (!atomic_read(&___rfd_beenhere) && \
+ !atomic_xchg(&___rfd_beenhere, 1)) { \
+ tracing_off(); \
+ rcu_ftrace_dump_stall_suppress(); \
+ ftrace_dump(oops_dump_mode); \
+ rcu_ftrace_dump_stall_unsuppress(); \
+ } \
+} while (0)
+
void rcu_early_boot_tests(void);
+void rcu_test_sync_prims(void);
/*
* This function really isn't for public consumption, but RCU is special in
@@ -143,4 +348,347 @@ void rcu_early_boot_tests(void);
*/
extern void resched_cpu(int cpu);
+#if !defined(CONFIG_TINY_RCU)
+
+#include <linux/rcu_node_tree.h>
+
+extern int rcu_num_lvls;
+extern int num_rcu_lvl[];
+extern int rcu_num_nodes;
+static bool rcu_fanout_exact;
+static int rcu_fanout_leaf;
+
+/*
+ * Compute the per-level fanout, either using the exact fanout specified
+ * or balancing the tree, depending on the rcu_fanout_exact boot parameter.
+ */
+static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt)
+{
+ int i;
+
+ for (i = 0; i < RCU_NUM_LVLS; i++)
+ levelspread[i] = INT_MIN;
+ if (rcu_fanout_exact) {
+ levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf;
+ for (i = rcu_num_lvls - 2; i >= 0; i--)
+ levelspread[i] = RCU_FANOUT;
+ } else {
+ int ccur;
+ int cprv;
+
+ cprv = nr_cpu_ids;
+ for (i = rcu_num_lvls - 1; i >= 0; i--) {
+ ccur = levelcnt[i];
+ levelspread[i] = (cprv + ccur - 1) / ccur;
+ cprv = ccur;
+ }
+ }
+}
+
+extern void rcu_init_geometry(void);
+
+/* Returns a pointer to the first leaf rcu_node structure. */
+#define rcu_first_leaf_node() (rcu_state.level[rcu_num_lvls - 1])
+
+/* Is this rcu_node a leaf? */
+#define rcu_is_leaf_node(rnp) ((rnp)->level == rcu_num_lvls - 1)
+
+/* Is this rcu_node the last leaf? */
+#define rcu_is_last_leaf_node(rnp) ((rnp) == &rcu_state.node[rcu_num_nodes - 1])
+
+/*
+ * Do a full breadth-first scan of the {s,}rcu_node structures for the
+ * specified state structure (for SRCU) or the only rcu_state structure
+ * (for RCU).
+ */
+#define _rcu_for_each_node_breadth_first(sp, rnp) \
+ for ((rnp) = &(sp)->node[0]; \
+ (rnp) < &(sp)->node[rcu_num_nodes]; (rnp)++)
+#define rcu_for_each_node_breadth_first(rnp) \
+ _rcu_for_each_node_breadth_first(&rcu_state, rnp)
+#define srcu_for_each_node_breadth_first(ssp, rnp) \
+ _rcu_for_each_node_breadth_first(ssp->srcu_sup, rnp)
+
+/*
+ * Scan the leaves of the rcu_node hierarchy for the rcu_state structure.
+ * Note that if there is a singleton rcu_node tree with but one rcu_node
+ * structure, this loop -will- visit the rcu_node structure. It is still
+ * a leaf node, even if it is also the root node.
+ */
+#define rcu_for_each_leaf_node(rnp) \
+ for ((rnp) = rcu_first_leaf_node(); \
+ (rnp) < &rcu_state.node[rcu_num_nodes]; (rnp)++)
+
+/*
+ * Iterate over all possible CPUs in a leaf RCU node.
+ */
+#define for_each_leaf_node_possible_cpu(rnp, cpu) \
+ for (WARN_ON_ONCE(!rcu_is_leaf_node(rnp)), \
+ (cpu) = cpumask_next((rnp)->grplo - 1, cpu_possible_mask); \
+ (cpu) <= rnp->grphi; \
+ (cpu) = cpumask_next((cpu), cpu_possible_mask))
+
+/*
+ * Iterate over all CPUs in a leaf RCU node's specified mask.
+ */
+#define rcu_find_next_bit(rnp, cpu, mask) \
+ ((rnp)->grplo + find_next_bit(&(mask), BITS_PER_LONG, (cpu)))
+#define for_each_leaf_node_cpu_mask(rnp, cpu, mask) \
+ for (WARN_ON_ONCE(!rcu_is_leaf_node(rnp)), \
+ (cpu) = rcu_find_next_bit((rnp), 0, (mask)); \
+ (cpu) <= rnp->grphi; \
+ (cpu) = rcu_find_next_bit((rnp), (cpu) + 1 - (rnp->grplo), (mask)))
+
+#endif /* !defined(CONFIG_TINY_RCU) */
+
+#if !defined(CONFIG_TINY_RCU) || defined(CONFIG_TASKS_RCU_GENERIC)
+
+/*
+ * Wrappers for the rcu_node::lock acquire and release.
+ *
+ * Because the rcu_nodes form a tree, the tree traversal locking will observe
+ * different lock values, this in turn means that an UNLOCK of one level
+ * followed by a LOCK of another level does not imply a full memory barrier;
+ * and most importantly transitivity is lost.
+ *
+ * In order to restore full ordering between tree levels, augment the regular
+ * lock acquire functions with smp_mb__after_unlock_lock().
+ *
+ * As ->lock of struct rcu_node is a __private field, therefore one should use
+ * these wrappers rather than directly call raw_spin_{lock,unlock}* on ->lock.
+ */
+#define raw_spin_lock_rcu_node(p) \
+do { \
+ raw_spin_lock(&ACCESS_PRIVATE(p, lock)); \
+ smp_mb__after_unlock_lock(); \
+} while (0)
+
+#define raw_spin_unlock_rcu_node(p) \
+do { \
+ lockdep_assert_irqs_disabled(); \
+ raw_spin_unlock(&ACCESS_PRIVATE(p, lock)); \
+} while (0)
+
+#define raw_spin_lock_irq_rcu_node(p) \
+do { \
+ raw_spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \
+ smp_mb__after_unlock_lock(); \
+} while (0)
+
+#define raw_spin_unlock_irq_rcu_node(p) \
+do { \
+ lockdep_assert_irqs_disabled(); \
+ raw_spin_unlock_irq(&ACCESS_PRIVATE(p, lock)); \
+} while (0)
+
+#define raw_spin_lock_irqsave_rcu_node(p, flags) \
+do { \
+ raw_spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \
+ smp_mb__after_unlock_lock(); \
+} while (0)
+
+#define raw_spin_unlock_irqrestore_rcu_node(p, flags) \
+do { \
+ lockdep_assert_irqs_disabled(); \
+ raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags); \
+} while (0)
+
+#define raw_spin_trylock_rcu_node(p) \
+({ \
+ bool ___locked = raw_spin_trylock(&ACCESS_PRIVATE(p, lock)); \
+ \
+ if (___locked) \
+ smp_mb__after_unlock_lock(); \
+ ___locked; \
+})
+
+#define raw_lockdep_assert_held_rcu_node(p) \
+ lockdep_assert_held(&ACCESS_PRIVATE(p, lock))
+
+#endif // #if !defined(CONFIG_TINY_RCU) || defined(CONFIG_TASKS_RCU_GENERIC)
+
+#ifdef CONFIG_TINY_RCU
+/* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */
+static inline bool rcu_gp_is_normal(void) { return true; }
+static inline bool rcu_gp_is_expedited(void) { return false; }
+static inline bool rcu_async_should_hurry(void) { return false; }
+static inline void rcu_expedite_gp(void) { }
+static inline void rcu_unexpedite_gp(void) { }
+static inline void rcu_async_hurry(void) { }
+static inline void rcu_async_relax(void) { }
+static inline bool rcu_cpu_online(int cpu) { return true; }
+#else /* #ifdef CONFIG_TINY_RCU */
+bool rcu_gp_is_normal(void); /* Internal RCU use. */
+bool rcu_gp_is_expedited(void); /* Internal RCU use. */
+bool rcu_async_should_hurry(void); /* Internal RCU use. */
+void rcu_expedite_gp(void);
+void rcu_unexpedite_gp(void);
+void rcu_async_hurry(void);
+void rcu_async_relax(void);
+void rcupdate_announce_bootup_oddness(void);
+bool rcu_cpu_online(int cpu);
+#ifdef CONFIG_TASKS_RCU_GENERIC
+void show_rcu_tasks_gp_kthreads(void);
+#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */
+static inline void show_rcu_tasks_gp_kthreads(void) {}
+#endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */
+#endif /* #else #ifdef CONFIG_TINY_RCU */
+
+#ifdef CONFIG_TASKS_RCU
+struct task_struct *get_rcu_tasks_gp_kthread(void);
+void rcu_tasks_get_gp_data(int *flags, unsigned long *gp_seq);
+#endif // # ifdef CONFIG_TASKS_RCU
+
+#ifdef CONFIG_TASKS_RUDE_RCU
+struct task_struct *get_rcu_tasks_rude_gp_kthread(void);
+void rcu_tasks_rude_get_gp_data(int *flags, unsigned long *gp_seq);
+#endif // # ifdef CONFIG_TASKS_RUDE_RCU
+
+#ifdef CONFIG_TASKS_TRACE_RCU
+void rcu_tasks_trace_get_gp_data(int *flags, unsigned long *gp_seq);
+#endif
+
+#ifdef CONFIG_TASKS_RCU_GENERIC
+void tasks_cblist_init_generic(void);
+#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */
+static inline void tasks_cblist_init_generic(void) { }
+#endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */
+
+#define RCU_SCHEDULER_INACTIVE 0
+#define RCU_SCHEDULER_INIT 1
+#define RCU_SCHEDULER_RUNNING 2
+
+enum rcutorture_type {
+ RCU_FLAVOR,
+ RCU_TASKS_FLAVOR,
+ RCU_TASKS_RUDE_FLAVOR,
+ RCU_TASKS_TRACING_FLAVOR,
+ RCU_TRIVIAL_FLAVOR,
+ SRCU_FLAVOR,
+ INVALID_RCU_FLAVOR
+};
+
+#if defined(CONFIG_RCU_LAZY)
+unsigned long rcu_get_jiffies_lazy_flush(void);
+void rcu_set_jiffies_lazy_flush(unsigned long j);
+#else
+static inline unsigned long rcu_get_jiffies_lazy_flush(void) { return 0; }
+static inline void rcu_set_jiffies_lazy_flush(unsigned long j) { }
+#endif
+
+#if defined(CONFIG_TREE_RCU)
+void rcutorture_get_gp_data(int *flags, unsigned long *gp_seq);
+void do_trace_rcu_torture_read(const char *rcutorturename,
+ struct rcu_head *rhp,
+ unsigned long secs,
+ unsigned long c_old,
+ unsigned long c);
+void rcu_gp_set_torture_wait(int duration);
+void rcu_set_gpwrap_lag(unsigned long lag);
+int rcu_get_gpwrap_count(int cpu);
+#else
+static inline void rcutorture_get_gp_data(int *flags, unsigned long *gp_seq)
+{
+ *flags = 0;
+ *gp_seq = 0;
+}
+#ifdef CONFIG_RCU_TRACE
+void do_trace_rcu_torture_read(const char *rcutorturename,
+ struct rcu_head *rhp,
+ unsigned long secs,
+ unsigned long c_old,
+ unsigned long c);
+#else
+#define do_trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c) \
+ do { } while (0)
+#endif
+static inline void rcu_gp_set_torture_wait(int duration) { }
+static inline void rcu_set_gpwrap_lag(unsigned long lag) { }
+static inline int rcu_get_gpwrap_count(int cpu) { return 0; }
+#endif
+unsigned long long rcutorture_gather_gp_seqs(void);
+void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp, size_t len);
+
+#ifdef CONFIG_TINY_SRCU
+
+static inline void srcutorture_get_gp_data(struct srcu_struct *sp, int *flags,
+ unsigned long *gp_seq)
+{
+ *flags = 0;
+ *gp_seq = sp->srcu_idx;
+}
+
+#elif defined(CONFIG_TREE_SRCU)
+
+void srcutorture_get_gp_data(struct srcu_struct *sp, int *flags,
+ unsigned long *gp_seq);
+
+#endif
+
+#ifdef CONFIG_TINY_RCU
+static inline bool rcu_watching_zero_in_eqs(int cpu, int *vp) { return false; }
+static inline unsigned long rcu_get_gp_seq(void) { return 0; }
+static inline unsigned long rcu_exp_batches_completed(void) { return 0; }
+static inline void rcu_force_quiescent_state(void) { }
+static inline bool rcu_check_boost_fail(unsigned long gp_state, int *cpup) { return true; }
+static inline void show_rcu_gp_kthreads(void) { }
+static inline int rcu_get_gp_kthreads_prio(void) { return 0; }
+static inline void rcu_fwd_progress_check(unsigned long j) { }
+static inline void rcu_gp_slow_register(atomic_t *rgssp) { }
+static inline void rcu_gp_slow_unregister(atomic_t *rgssp) { }
+#else /* #ifdef CONFIG_TINY_RCU */
+bool rcu_watching_zero_in_eqs(int cpu, int *vp);
+unsigned long rcu_get_gp_seq(void);
+unsigned long rcu_exp_batches_completed(void);
+bool rcu_check_boost_fail(unsigned long gp_state, int *cpup);
+void show_rcu_gp_kthreads(void);
+int rcu_get_gp_kthreads_prio(void);
+void rcu_fwd_progress_check(unsigned long j);
+void rcu_force_quiescent_state(void);
+extern struct workqueue_struct *rcu_gp_wq;
+extern struct kthread_worker *rcu_exp_gp_kworker;
+void rcu_gp_slow_register(atomic_t *rgssp);
+void rcu_gp_slow_unregister(atomic_t *rgssp);
+#endif /* #else #ifdef CONFIG_TINY_RCU */
+
+#ifdef CONFIG_TINY_SRCU
+static inline unsigned long srcu_batches_completed(struct srcu_struct *sp) { return 0; }
+#else // #ifdef CONFIG_TINY_SRCU
+unsigned long srcu_batches_completed(struct srcu_struct *sp);
+#endif // #else // #ifdef CONFIG_TINY_SRCU
+
+#ifdef CONFIG_RCU_NOCB_CPU
+void rcu_bind_current_to_nocb(void);
+#else
+static inline void rcu_bind_current_to_nocb(void) { }
+#endif
+
+#if !defined(CONFIG_TINY_RCU) && defined(CONFIG_TASKS_RCU)
+void show_rcu_tasks_classic_gp_kthread(void);
+#else
+static inline void show_rcu_tasks_classic_gp_kthread(void) {}
+#endif
+#if !defined(CONFIG_TINY_RCU) && defined(CONFIG_TASKS_RUDE_RCU)
+void show_rcu_tasks_rude_gp_kthread(void);
+#else
+static inline void show_rcu_tasks_rude_gp_kthread(void) {}
+#endif
+#if !defined(CONFIG_TINY_RCU) && defined(CONFIG_TASKS_TRACE_RCU)
+void show_rcu_tasks_trace_gp_kthread(void);
+#else
+static inline void show_rcu_tasks_trace_gp_kthread(void) {}
+#endif
+
+#ifdef CONFIG_TINY_RCU
+static inline bool rcu_cpu_beenfullyonline(int cpu) { return true; }
+#else
+bool rcu_cpu_beenfullyonline(int cpu);
+#endif
+
+#if defined(CONFIG_RCU_STALL_COMMON) && defined(CONFIG_RCU_CPU_STALL_NOTIFIER)
+int rcu_stall_notifier_call_chain(unsigned long val, void *v);
+#else // #if defined(CONFIG_RCU_STALL_COMMON) && defined(CONFIG_RCU_CPU_STALL_NOTIFIER)
+static inline int rcu_stall_notifier_call_chain(unsigned long val, void *v) { return NOTIFY_DONE; }
+#endif // #else // #if defined(CONFIG_RCU_STALL_COMMON) && defined(CONFIG_RCU_CPU_STALL_NOTIFIER)
+
#endif /* __LINUX_RCU_H */
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
new file mode 100644
index 000000000000..298a2c573f02
--- /dev/null
+++ b/kernel/rcu/rcu_segcblist.c
@@ -0,0 +1,622 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * RCU segmented callback lists, function definitions
+ *
+ * Copyright IBM Corporation, 2017
+ *
+ * Authors: Paul E. McKenney <paulmck@linux.ibm.com>
+ */
+
+#include <linux/cpu.h>
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+
+#include "rcu_segcblist.h"
+
+/* Initialize simple callback list. */
+void rcu_cblist_init(struct rcu_cblist *rclp)
+{
+ rclp->head = NULL;
+ rclp->tail = &rclp->head;
+ rclp->len = 0;
+}
+
+/*
+ * Enqueue an rcu_head structure onto the specified callback list.
+ */
+void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp)
+{
+ *rclp->tail = rhp;
+ rclp->tail = &rhp->next;
+ WRITE_ONCE(rclp->len, rclp->len + 1);
+}
+
+/*
+ * Flush the second rcu_cblist structure onto the first one, obliterating
+ * any contents of the first. If rhp is non-NULL, enqueue it as the sole
+ * element of the second rcu_cblist structure, but ensuring that the second
+ * rcu_cblist structure, if initially non-empty, always appears non-empty
+ * throughout the process. If rdp is NULL, the second rcu_cblist structure
+ * is instead initialized to empty.
+ */
+void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp,
+ struct rcu_cblist *srclp,
+ struct rcu_head *rhp)
+{
+ drclp->head = srclp->head;
+ if (drclp->head)
+ drclp->tail = srclp->tail;
+ else
+ drclp->tail = &drclp->head;
+ drclp->len = srclp->len;
+ if (!rhp) {
+ rcu_cblist_init(srclp);
+ } else {
+ rhp->next = NULL;
+ srclp->head = rhp;
+ srclp->tail = &rhp->next;
+ WRITE_ONCE(srclp->len, 1);
+ }
+}
+
+/*
+ * Dequeue the oldest rcu_head structure from the specified callback
+ * list.
+ */
+struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp)
+{
+ struct rcu_head *rhp;
+
+ rhp = rclp->head;
+ if (!rhp)
+ return NULL;
+ rclp->len--;
+ rclp->head = rhp->next;
+ if (!rclp->head)
+ rclp->tail = &rclp->head;
+ return rhp;
+}
+
+/* Set the length of an rcu_segcblist structure. */
+static void rcu_segcblist_set_len(struct rcu_segcblist *rsclp, long v)
+{
+#ifdef CONFIG_RCU_NOCB_CPU
+ atomic_long_set(&rsclp->len, v);
+#else
+ WRITE_ONCE(rsclp->len, v);
+#endif
+}
+
+/* Get the length of a segment of the rcu_segcblist structure. */
+long rcu_segcblist_get_seglen(struct rcu_segcblist *rsclp, int seg)
+{
+ return READ_ONCE(rsclp->seglen[seg]);
+}
+
+/* Return number of callbacks in segmented callback list by summing seglen. */
+long rcu_segcblist_n_segment_cbs(struct rcu_segcblist *rsclp)
+{
+ long len = 0;
+ int i;
+
+ for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++)
+ len += rcu_segcblist_get_seglen(rsclp, i);
+
+ return len;
+}
+
+/* Set the length of a segment of the rcu_segcblist structure. */
+static void rcu_segcblist_set_seglen(struct rcu_segcblist *rsclp, int seg, long v)
+{
+ WRITE_ONCE(rsclp->seglen[seg], v);
+}
+
+/* Increase the numeric length of a segment by a specified amount. */
+static void rcu_segcblist_add_seglen(struct rcu_segcblist *rsclp, int seg, long v)
+{
+ WRITE_ONCE(rsclp->seglen[seg], rsclp->seglen[seg] + v);
+}
+
+/* Move from's segment length to to's segment. */
+static void rcu_segcblist_move_seglen(struct rcu_segcblist *rsclp, int from, int to)
+{
+ long len;
+
+ if (from == to)
+ return;
+
+ len = rcu_segcblist_get_seglen(rsclp, from);
+ if (!len)
+ return;
+
+ rcu_segcblist_add_seglen(rsclp, to, len);
+ rcu_segcblist_set_seglen(rsclp, from, 0);
+}
+
+/* Increment segment's length. */
+static void rcu_segcblist_inc_seglen(struct rcu_segcblist *rsclp, int seg)
+{
+ rcu_segcblist_add_seglen(rsclp, seg, 1);
+}
+
+/*
+ * Increase the numeric length of an rcu_segcblist structure by the
+ * specified amount, which can be negative. This can cause the ->len
+ * field to disagree with the actual number of callbacks on the structure.
+ * This increase is fully ordered with respect to the callers accesses
+ * both before and after.
+ *
+ * So why on earth is a memory barrier required both before and after
+ * the update to the ->len field???
+ *
+ * The reason is that rcu_barrier() locklessly samples each CPU's ->len
+ * field, and if a given CPU's field is zero, avoids IPIing that CPU.
+ * This can of course race with both queuing and invoking of callbacks.
+ * Failing to correctly handle either of these races could result in
+ * rcu_barrier() failing to IPI a CPU that actually had callbacks queued
+ * which rcu_barrier() was obligated to wait on. And if rcu_barrier()
+ * failed to wait on such a callback, unloading certain kernel modules
+ * would result in calls to functions whose code was no longer present in
+ * the kernel, for but one example.
+ *
+ * Therefore, ->len transitions from 1->0 and 0->1 have to be carefully
+ * ordered with respect with both list modifications and the rcu_barrier().
+ *
+ * The queuing case is CASE 1 and the invoking case is CASE 2.
+ *
+ * CASE 1: Suppose that CPU 0 has no callbacks queued, but invokes
+ * call_rcu() just as CPU 1 invokes rcu_barrier(). CPU 0's ->len field
+ * will transition from 0->1, which is one of the transitions that must
+ * be handled carefully. Without the full memory barriers after the ->len
+ * update and at the beginning of rcu_barrier(), the following could happen:
+ *
+ * CPU 0 CPU 1
+ *
+ * call_rcu().
+ * rcu_barrier() sees ->len as 0.
+ * set ->len = 1.
+ * rcu_barrier() does nothing.
+ * module is unloaded.
+ * callback invokes unloaded function!
+ *
+ * With the full barriers, any case where rcu_barrier() sees ->len as 0 will
+ * have unambiguously preceded the return from the racing call_rcu(), which
+ * means that this call_rcu() invocation is OK to not wait on. After all,
+ * you are supposed to make sure that any problematic call_rcu() invocations
+ * happen before the rcu_barrier().
+ *
+ *
+ * CASE 2: Suppose that CPU 0 is invoking its last callback just as
+ * CPU 1 invokes rcu_barrier(). CPU 0's ->len field will transition from
+ * 1->0, which is one of the transitions that must be handled carefully.
+ * Without the full memory barriers before the ->len update and at the
+ * end of rcu_barrier(), the following could happen:
+ *
+ * CPU 0 CPU 1
+ *
+ * start invoking last callback
+ * set ->len = 0 (reordered)
+ * rcu_barrier() sees ->len as 0
+ * rcu_barrier() does nothing.
+ * module is unloaded
+ * callback executing after unloaded!
+ *
+ * With the full barriers, any case where rcu_barrier() sees ->len as 0
+ * will be fully ordered after the completion of the callback function,
+ * so that the module unloading operation is completely safe.
+ *
+ */
+void rcu_segcblist_add_len(struct rcu_segcblist *rsclp, long v)
+{
+#ifdef CONFIG_RCU_NOCB_CPU
+ smp_mb__before_atomic(); // Read header comment above.
+ atomic_long_add(v, &rsclp->len);
+ smp_mb__after_atomic(); // Read header comment above.
+#else
+ smp_mb(); // Read header comment above.
+ WRITE_ONCE(rsclp->len, rsclp->len + v);
+ smp_mb(); // Read header comment above.
+#endif
+}
+
+/*
+ * Increase the numeric length of an rcu_segcblist structure by one.
+ * This can cause the ->len field to disagree with the actual number of
+ * callbacks on the structure. This increase is fully ordered with respect
+ * to the callers accesses both before and after.
+ */
+void rcu_segcblist_inc_len(struct rcu_segcblist *rsclp)
+{
+ rcu_segcblist_add_len(rsclp, 1);
+}
+
+/*
+ * Initialize an rcu_segcblist structure.
+ */
+void rcu_segcblist_init(struct rcu_segcblist *rsclp)
+{
+ int i;
+
+ BUILD_BUG_ON(RCU_NEXT_TAIL + 1 != ARRAY_SIZE(rsclp->gp_seq));
+ BUILD_BUG_ON(ARRAY_SIZE(rsclp->tails) != ARRAY_SIZE(rsclp->gp_seq));
+ rsclp->head = NULL;
+ for (i = 0; i < RCU_CBLIST_NSEGS; i++) {
+ rsclp->tails[i] = &rsclp->head;
+ rcu_segcblist_set_seglen(rsclp, i, 0);
+ }
+ rcu_segcblist_set_len(rsclp, 0);
+ rcu_segcblist_set_flags(rsclp, SEGCBLIST_ENABLED);
+}
+
+/*
+ * Disable the specified rcu_segcblist structure, so that callbacks can
+ * no longer be posted to it. This structure must be empty.
+ */
+void rcu_segcblist_disable(struct rcu_segcblist *rsclp)
+{
+ WARN_ON_ONCE(!rcu_segcblist_empty(rsclp));
+ WARN_ON_ONCE(rcu_segcblist_n_cbs(rsclp));
+ rcu_segcblist_clear_flags(rsclp, SEGCBLIST_ENABLED);
+}
+
+/*
+ * Does the specified rcu_segcblist structure contain callbacks that
+ * are ready to be invoked?
+ */
+bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp)
+{
+ return rcu_segcblist_is_enabled(rsclp) &&
+ &rsclp->head != READ_ONCE(rsclp->tails[RCU_DONE_TAIL]);
+}
+
+/*
+ * Does the specified rcu_segcblist structure contain callbacks that
+ * are still pending, that is, not yet ready to be invoked?
+ */
+bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp)
+{
+ return rcu_segcblist_is_enabled(rsclp) &&
+ !rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL);
+}
+
+/*
+ * Return a pointer to the first callback in the specified rcu_segcblist
+ * structure. This is useful for diagnostics.
+ */
+struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp)
+{
+ if (rcu_segcblist_is_enabled(rsclp))
+ return rsclp->head;
+ return NULL;
+}
+
+/*
+ * Return a pointer to the first pending callback in the specified
+ * rcu_segcblist structure. This is useful just after posting a given
+ * callback -- if that callback is the first pending callback, then
+ * you cannot rely on someone else having already started up the required
+ * grace period.
+ */
+struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp)
+{
+ if (rcu_segcblist_is_enabled(rsclp))
+ return *rsclp->tails[RCU_DONE_TAIL];
+ return NULL;
+}
+
+/*
+ * Return false if there are no CBs awaiting grace periods, otherwise,
+ * return true and store the nearest waited-upon grace period into *lp.
+ */
+bool rcu_segcblist_nextgp(struct rcu_segcblist *rsclp, unsigned long *lp)
+{
+ if (!rcu_segcblist_pend_cbs(rsclp))
+ return false;
+ *lp = rsclp->gp_seq[RCU_WAIT_TAIL];
+ return true;
+}
+
+/*
+ * Enqueue the specified callback onto the specified rcu_segcblist
+ * structure, updating accounting as needed. Note that the ->len
+ * field may be accessed locklessly, hence the WRITE_ONCE().
+ * The ->len field is used by rcu_barrier() and friends to determine
+ * if it must post a callback on this structure, and it is OK
+ * for rcu_barrier() to sometimes post callbacks needlessly, but
+ * absolutely not OK for it to ever miss posting a callback.
+ */
+void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp,
+ struct rcu_head *rhp)
+{
+ rcu_segcblist_inc_len(rsclp);
+ rcu_segcblist_inc_seglen(rsclp, RCU_NEXT_TAIL);
+ rhp->next = NULL;
+ WRITE_ONCE(*rsclp->tails[RCU_NEXT_TAIL], rhp);
+ WRITE_ONCE(rsclp->tails[RCU_NEXT_TAIL], &rhp->next);
+}
+
+/*
+ * Entrain the specified callback onto the specified rcu_segcblist at
+ * the end of the last non-empty segment. If the entire rcu_segcblist
+ * is empty, make no change, but return false.
+ *
+ * This is intended for use by rcu_barrier()-like primitives, -not-
+ * for normal grace-period use. IMPORTANT: The callback you enqueue
+ * will wait for all prior callbacks, NOT necessarily for a grace
+ * period. You have been warned.
+ */
+bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
+ struct rcu_head *rhp)
+{
+ int i;
+
+ if (rcu_segcblist_n_cbs(rsclp) == 0)
+ return false;
+ rcu_segcblist_inc_len(rsclp);
+ smp_mb(); /* Ensure counts are updated before callback is entrained. */
+ rhp->next = NULL;
+ for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--)
+ if (!rcu_segcblist_segempty(rsclp, i))
+ break;
+ rcu_segcblist_inc_seglen(rsclp, i);
+ WRITE_ONCE(*rsclp->tails[i], rhp);
+ for (; i <= RCU_NEXT_TAIL; i++)
+ WRITE_ONCE(rsclp->tails[i], &rhp->next);
+ return true;
+}
+
+/*
+ * Extract only those callbacks ready to be invoked from the specified
+ * rcu_segcblist structure and place them in the specified rcu_cblist
+ * structure.
+ */
+void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp,
+ struct rcu_cblist *rclp)
+{
+ int i;
+
+ if (!rcu_segcblist_ready_cbs(rsclp))
+ return; /* Nothing to do. */
+ rclp->len = rcu_segcblist_get_seglen(rsclp, RCU_DONE_TAIL);
+ *rclp->tail = rsclp->head;
+ WRITE_ONCE(rsclp->head, *rsclp->tails[RCU_DONE_TAIL]);
+ WRITE_ONCE(*rsclp->tails[RCU_DONE_TAIL], NULL);
+ rclp->tail = rsclp->tails[RCU_DONE_TAIL];
+ for (i = RCU_CBLIST_NSEGS - 1; i >= RCU_DONE_TAIL; i--)
+ if (rsclp->tails[i] == rsclp->tails[RCU_DONE_TAIL])
+ WRITE_ONCE(rsclp->tails[i], &rsclp->head);
+ rcu_segcblist_set_seglen(rsclp, RCU_DONE_TAIL, 0);
+}
+
+/*
+ * Extract only those callbacks still pending (not yet ready to be
+ * invoked) from the specified rcu_segcblist structure and place them in
+ * the specified rcu_cblist structure. Note that this loses information
+ * about any callbacks that might have been partway done waiting for
+ * their grace period. Too bad! They will have to start over.
+ */
+void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp,
+ struct rcu_cblist *rclp)
+{
+ int i;
+
+ if (!rcu_segcblist_pend_cbs(rsclp))
+ return; /* Nothing to do. */
+ rclp->len = 0;
+ *rclp->tail = *rsclp->tails[RCU_DONE_TAIL];
+ rclp->tail = rsclp->tails[RCU_NEXT_TAIL];
+ WRITE_ONCE(*rsclp->tails[RCU_DONE_TAIL], NULL);
+ for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++) {
+ rclp->len += rcu_segcblist_get_seglen(rsclp, i);
+ WRITE_ONCE(rsclp->tails[i], rsclp->tails[RCU_DONE_TAIL]);
+ rcu_segcblist_set_seglen(rsclp, i, 0);
+ }
+}
+
+/*
+ * Insert counts from the specified rcu_cblist structure in the
+ * specified rcu_segcblist structure.
+ */
+void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp,
+ struct rcu_cblist *rclp)
+{
+ rcu_segcblist_add_len(rsclp, rclp->len);
+}
+
+/*
+ * Move callbacks from the specified rcu_cblist to the beginning of the
+ * done-callbacks segment of the specified rcu_segcblist.
+ */
+void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp,
+ struct rcu_cblist *rclp)
+{
+ int i;
+
+ if (!rclp->head)
+ return; /* No callbacks to move. */
+ rcu_segcblist_add_seglen(rsclp, RCU_DONE_TAIL, rclp->len);
+ *rclp->tail = rsclp->head;
+ WRITE_ONCE(rsclp->head, rclp->head);
+ for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++)
+ if (&rsclp->head == rsclp->tails[i])
+ WRITE_ONCE(rsclp->tails[i], rclp->tail);
+ else
+ break;
+ rclp->head = NULL;
+ rclp->tail = &rclp->head;
+}
+
+/*
+ * Move callbacks from the specified rcu_cblist to the end of the
+ * new-callbacks segment of the specified rcu_segcblist.
+ */
+void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp,
+ struct rcu_cblist *rclp)
+{
+ if (!rclp->head)
+ return; /* Nothing to do. */
+
+ rcu_segcblist_add_seglen(rsclp, RCU_NEXT_TAIL, rclp->len);
+ WRITE_ONCE(*rsclp->tails[RCU_NEXT_TAIL], rclp->head);
+ WRITE_ONCE(rsclp->tails[RCU_NEXT_TAIL], rclp->tail);
+}
+
+/*
+ * Advance the callbacks in the specified rcu_segcblist structure based
+ * on the current value passed in for the grace-period counter.
+ */
+void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)
+{
+ int i, j;
+
+ WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp));
+ if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL))
+ return;
+
+ /*
+ * Find all callbacks whose ->gp_seq numbers indicate that they
+ * are ready to invoke, and put them into the RCU_DONE_TAIL segment.
+ */
+ for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) {
+ if (ULONG_CMP_LT(seq, rsclp->gp_seq[i]))
+ break;
+ WRITE_ONCE(rsclp->tails[RCU_DONE_TAIL], rsclp->tails[i]);
+ rcu_segcblist_move_seglen(rsclp, i, RCU_DONE_TAIL);
+ }
+
+ /* If no callbacks moved, nothing more need be done. */
+ if (i == RCU_WAIT_TAIL)
+ return;
+
+ /* Clean up tail pointers that might have been misordered above. */
+ for (j = RCU_WAIT_TAIL; j < i; j++)
+ WRITE_ONCE(rsclp->tails[j], rsclp->tails[RCU_DONE_TAIL]);
+
+ /*
+ * Callbacks moved, so there might be an empty RCU_WAIT_TAIL
+ * and a non-empty RCU_NEXT_READY_TAIL. If so, copy the
+ * RCU_NEXT_READY_TAIL segment to fill the RCU_WAIT_TAIL gap
+ * created by the now-ready-to-invoke segments.
+ */
+ for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) {
+ if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL])
+ break; /* No more callbacks. */
+ WRITE_ONCE(rsclp->tails[j], rsclp->tails[i]);
+ rcu_segcblist_move_seglen(rsclp, i, j);
+ rsclp->gp_seq[j] = rsclp->gp_seq[i];
+ }
+}
+
+/*
+ * "Accelerate" callbacks based on more-accurate grace-period information.
+ * The reason for this is that RCU does not synchronize the beginnings and
+ * ends of grace periods, and that callbacks are posted locally. This in
+ * turn means that the callbacks must be labelled conservatively early
+ * on, as getting exact information would degrade both performance and
+ * scalability. When more accurate grace-period information becomes
+ * available, previously posted callbacks can be "accelerated", marking
+ * them to complete at the end of the earlier grace period.
+ *
+ * This function operates on an rcu_segcblist structure, and also the
+ * grace-period sequence number seq at which new callbacks would become
+ * ready to invoke. Returns true if there are callbacks that won't be
+ * ready to invoke until seq, false otherwise.
+ */
+bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq)
+{
+ int i, j;
+
+ WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp));
+ if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL))
+ return false;
+
+ /*
+ * Find the segment preceding the oldest segment of callbacks
+ * whose ->gp_seq[] completion is at or after that passed in via
+ * "seq", skipping any empty segments. This oldest segment, along
+ * with any later segments, can be merged in with any newly arrived
+ * callbacks in the RCU_NEXT_TAIL segment, and assigned "seq"
+ * as their ->gp_seq[] grace-period completion sequence number.
+ */
+ for (i = RCU_NEXT_READY_TAIL; i > RCU_DONE_TAIL; i--)
+ if (!rcu_segcblist_segempty(rsclp, i) &&
+ ULONG_CMP_LT(rsclp->gp_seq[i], seq))
+ break;
+
+ /*
+ * If all the segments contain callbacks that correspond to
+ * earlier grace-period sequence numbers than "seq", leave.
+ * Assuming that the rcu_segcblist structure has enough
+ * segments in its arrays, this can only happen if some of
+ * the non-done segments contain callbacks that really are
+ * ready to invoke. This situation will get straightened
+ * out by the next call to rcu_segcblist_advance().
+ *
+ * Also advance to the oldest segment of callbacks whose
+ * ->gp_seq[] completion is at or after that passed in via "seq",
+ * skipping any empty segments.
+ *
+ * Note that segment "i" (and any lower-numbered segments
+ * containing older callbacks) will be unaffected, and their
+ * grace-period numbers remain unchanged. For example, if i ==
+ * WAIT_TAIL, then neither WAIT_TAIL nor DONE_TAIL will be touched.
+ * Instead, the CBs in NEXT_TAIL will be merged with those in
+ * NEXT_READY_TAIL and the grace-period number of NEXT_READY_TAIL
+ * would be updated. NEXT_TAIL would then be empty.
+ */
+ if (rcu_segcblist_restempty(rsclp, i) || ++i >= RCU_NEXT_TAIL)
+ return false;
+
+ /* Accounting: everything below i is about to get merged into i. */
+ for (j = i + 1; j <= RCU_NEXT_TAIL; j++)
+ rcu_segcblist_move_seglen(rsclp, j, i);
+
+ /*
+ * Merge all later callbacks, including newly arrived callbacks,
+ * into the segment located by the for-loop above. Assign "seq"
+ * as the ->gp_seq[] value in order to correctly handle the case
+ * where there were no pending callbacks in the rcu_segcblist
+ * structure other than in the RCU_NEXT_TAIL segment.
+ */
+ for (; i < RCU_NEXT_TAIL; i++) {
+ WRITE_ONCE(rsclp->tails[i], rsclp->tails[RCU_NEXT_TAIL]);
+ rsclp->gp_seq[i] = seq;
+ }
+ return true;
+}
+
+/*
+ * Merge the source rcu_segcblist structure into the destination
+ * rcu_segcblist structure, then initialize the source. Any pending
+ * callbacks from the source get to start over. It is best to
+ * advance and accelerate both the destination and the source
+ * before merging.
+ */
+void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp,
+ struct rcu_segcblist *src_rsclp)
+{
+ struct rcu_cblist donecbs;
+ struct rcu_cblist pendcbs;
+
+ lockdep_assert_cpus_held();
+
+ rcu_cblist_init(&donecbs);
+ rcu_cblist_init(&pendcbs);
+
+ rcu_segcblist_extract_done_cbs(src_rsclp, &donecbs);
+ rcu_segcblist_extract_pend_cbs(src_rsclp, &pendcbs);
+
+ /*
+ * No need smp_mb() before setting length to 0, because CPU hotplug
+ * lock excludes rcu_barrier.
+ */
+ rcu_segcblist_set_len(src_rsclp, 0);
+
+ rcu_segcblist_insert_count(dst_rsclp, &donecbs);
+ rcu_segcblist_insert_count(dst_rsclp, &pendcbs);
+ rcu_segcblist_insert_done_cbs(dst_rsclp, &donecbs);
+ rcu_segcblist_insert_pend_cbs(dst_rsclp, &pendcbs);
+
+ rcu_segcblist_init(src_rsclp);
+}
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
new file mode 100644
index 000000000000..fadc08ad4b7b
--- /dev/null
+++ b/kernel/rcu/rcu_segcblist.h
@@ -0,0 +1,145 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * RCU segmented callback lists, internal-to-rcu header file
+ *
+ * Copyright IBM Corporation, 2017
+ *
+ * Authors: Paul E. McKenney <paulmck@linux.ibm.com>
+ */
+
+#include <linux/rcu_segcblist.h>
+
+/* Return number of callbacks in the specified callback list. */
+static inline long rcu_cblist_n_cbs(struct rcu_cblist *rclp)
+{
+ return READ_ONCE(rclp->len);
+}
+
+long rcu_segcblist_get_seglen(struct rcu_segcblist *rsclp, int seg);
+
+/* Return number of callbacks in segmented callback list by summing seglen. */
+long rcu_segcblist_n_segment_cbs(struct rcu_segcblist *rsclp);
+
+void rcu_cblist_init(struct rcu_cblist *rclp);
+void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp);
+void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp,
+ struct rcu_cblist *srclp,
+ struct rcu_head *rhp);
+struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp);
+
+/*
+ * Is the specified rcu_segcblist structure empty?
+ *
+ * But careful! The fact that the ->head field is NULL does not
+ * necessarily imply that there are no callbacks associated with
+ * this structure. When callbacks are being invoked, they are
+ * removed as a group. If callback invocation must be preempted,
+ * the remaining callbacks will be added back to the list. Either
+ * way, the counts are updated later.
+ *
+ * So it is often the case that rcu_segcblist_n_cbs() should be used
+ * instead.
+ */
+static inline bool rcu_segcblist_empty(struct rcu_segcblist *rsclp)
+{
+ return !READ_ONCE(rsclp->head);
+}
+
+/* Return number of callbacks in segmented callback list. */
+static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp)
+{
+#ifdef CONFIG_RCU_NOCB_CPU
+ return atomic_long_read(&rsclp->len);
+#else
+ return READ_ONCE(rsclp->len);
+#endif
+}
+
+static inline void rcu_segcblist_set_flags(struct rcu_segcblist *rsclp,
+ int flags)
+{
+ WRITE_ONCE(rsclp->flags, rsclp->flags | flags);
+}
+
+static inline void rcu_segcblist_clear_flags(struct rcu_segcblist *rsclp,
+ int flags)
+{
+ WRITE_ONCE(rsclp->flags, rsclp->flags & ~flags);
+}
+
+static inline bool rcu_segcblist_test_flags(struct rcu_segcblist *rsclp,
+ int flags)
+{
+ return READ_ONCE(rsclp->flags) & flags;
+}
+
+/*
+ * Is the specified rcu_segcblist enabled, for example, not corresponding
+ * to an offline CPU?
+ */
+static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp)
+{
+ return rcu_segcblist_test_flags(rsclp, SEGCBLIST_ENABLED);
+}
+
+/*
+ * Is the specified rcu_segcblist NOCB offloaded (or in the middle of the
+ * [de]offloading process)?
+ */
+static inline bool rcu_segcblist_is_offloaded(struct rcu_segcblist *rsclp)
+{
+ if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
+ rcu_segcblist_test_flags(rsclp, SEGCBLIST_OFFLOADED))
+ return true;
+
+ return false;
+}
+
+/*
+ * Are all segments following the specified segment of the specified
+ * rcu_segcblist structure empty of callbacks? (The specified
+ * segment might well contain callbacks.)
+ */
+static inline bool rcu_segcblist_restempty(struct rcu_segcblist *rsclp, int seg)
+{
+ return !READ_ONCE(*READ_ONCE(rsclp->tails[seg]));
+}
+
+/*
+ * Is the specified segment of the specified rcu_segcblist structure
+ * empty of callbacks?
+ */
+static inline bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg)
+{
+ if (seg == RCU_DONE_TAIL)
+ return &rsclp->head == rsclp->tails[RCU_DONE_TAIL];
+ return rsclp->tails[seg - 1] == rsclp->tails[seg];
+}
+
+void rcu_segcblist_inc_len(struct rcu_segcblist *rsclp);
+void rcu_segcblist_add_len(struct rcu_segcblist *rsclp, long v);
+void rcu_segcblist_init(struct rcu_segcblist *rsclp);
+void rcu_segcblist_disable(struct rcu_segcblist *rsclp);
+bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp);
+bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp);
+struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp);
+struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp);
+bool rcu_segcblist_nextgp(struct rcu_segcblist *rsclp, unsigned long *lp);
+void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp,
+ struct rcu_head *rhp);
+bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
+ struct rcu_head *rhp);
+void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp,
+ struct rcu_cblist *rclp);
+void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp,
+ struct rcu_cblist *rclp);
+void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp,
+ struct rcu_cblist *rclp);
+void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp,
+ struct rcu_cblist *rclp);
+void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp,
+ struct rcu_cblist *rclp);
+void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq);
+bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq);
+void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp,
+ struct rcu_segcblist *src_rsclp);
diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c
new file mode 100644
index 000000000000..7484d8ad5767
--- /dev/null
+++ b/kernel/rcu/rcuscale.c
@@ -0,0 +1,1219 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Read-Copy Update module-based scalability-test facility
+ *
+ * Copyright (C) IBM Corporation, 2015
+ *
+ * Authors: Paul E. McKenney <paulmck@linux.ibm.com>
+ */
+
+#define pr_fmt(fmt) fmt
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/err.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/rcupdate.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <uapi/linux/sched/types.h>
+#include <linux/atomic.h>
+#include <linux/bitops.h>
+#include <linux/completion.h>
+#include <linux/moduleparam.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/reboot.h>
+#include <linux/freezer.h>
+#include <linux/cpu.h>
+#include <linux/delay.h>
+#include <linux/stat.h>
+#include <linux/srcu.h>
+#include <linux/slab.h>
+#include <asm/byteorder.h>
+#include <linux/torture.h>
+#include <linux/vmalloc.h>
+#include <linux/rcupdate_trace.h>
+#include <linux/sched/debug.h>
+
+#include "rcu.h"
+
+MODULE_DESCRIPTION("Read-Copy Update module-based scalability-test facility");
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>");
+
+#define SCALE_FLAG "-scale:"
+#define SCALEOUT_STRING(s) \
+ pr_alert("%s" SCALE_FLAG " %s\n", scale_type, s)
+#define VERBOSE_SCALEOUT_STRING(s) \
+ do { if (verbose) pr_alert("%s" SCALE_FLAG " %s\n", scale_type, s); } while (0)
+#define SCALEOUT_ERRSTRING(s) \
+ pr_alert("%s" SCALE_FLAG "!!! %s\n", scale_type, s)
+
+/*
+ * The intended use cases for the nreaders and nwriters module parameters
+ * are as follows:
+ *
+ * 1. Specify only the nr_cpus kernel boot parameter. This will
+ * set both nreaders and nwriters to the value specified by
+ * nr_cpus for a mixed reader/writer test.
+ *
+ * 2. Specify the nr_cpus kernel boot parameter, but set
+ * rcuscale.nreaders to zero. This will set nwriters to the
+ * value specified by nr_cpus for an update-only test.
+ *
+ * 3. Specify the nr_cpus kernel boot parameter, but set
+ * rcuscale.nwriters to zero. This will set nreaders to the
+ * value specified by nr_cpus for a read-only test.
+ *
+ * Various other use cases may of course be specified.
+ *
+ * Note that this test's readers are intended only as a test load for
+ * the writers. The reader scalability statistics will be overly
+ * pessimistic due to the per-critical-section interrupt disabling,
+ * test-end checks, and the pair of calls through pointers.
+ */
+
+#ifdef MODULE
+# define RCUSCALE_SHUTDOWN 0
+#else
+# define RCUSCALE_SHUTDOWN 1
+#endif
+
+torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives");
+torture_param(int, gp_async_max, 1000, "Max # outstanding waits per writer");
+torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");
+torture_param(int, holdoff, 10, "Holdoff time before test start (s)");
+torture_param(int, minruntime, 0, "Minimum run time (s)");
+torture_param(int, nreaders, -1, "Number of RCU reader threads");
+torture_param(int, nwriters, -1, "Number of RCU updater threads");
+torture_param(bool, shutdown, RCUSCALE_SHUTDOWN,
+ "Shutdown at end of scalability tests.");
+torture_param(int, verbose, 1, "Enable verbose debugging printk()s");
+torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable");
+torture_param(int, writer_holdoff_jiffies, 0, "Holdoff (jiffies) between GPs, zero to disable");
+torture_param(int, kfree_rcu_test, 0, "Do we run a kfree_rcu() scale test?");
+torture_param(int, kfree_mult, 1, "Multiple of kfree_obj size to allocate.");
+torture_param(int, kfree_by_call_rcu, 0, "Use call_rcu() to emulate kfree_rcu()?");
+
+static char *scale_type = "rcu";
+module_param(scale_type, charp, 0444);
+MODULE_PARM_DESC(scale_type, "Type of RCU to scalability-test (rcu, srcu, ...)");
+
+// Structure definitions for custom fixed-per-task allocator.
+struct writer_mblock {
+ struct rcu_head wmb_rh;
+ struct llist_node wmb_node;
+ struct writer_freelist *wmb_wfl;
+};
+
+struct writer_freelist {
+ struct llist_head ws_lhg;
+ atomic_t ws_inflight;
+ struct llist_head ____cacheline_internodealigned_in_smp ws_lhp;
+ struct writer_mblock *ws_mblocks;
+};
+
+static int nrealreaders;
+static int nrealwriters;
+static struct task_struct **writer_tasks;
+static struct task_struct **reader_tasks;
+static struct task_struct *shutdown_task;
+
+static u64 **writer_durations;
+static bool *writer_done;
+static struct writer_freelist *writer_freelists;
+static int *writer_n_durations;
+static atomic_t n_rcu_scale_reader_started;
+static atomic_t n_rcu_scale_writer_started;
+static atomic_t n_rcu_scale_writer_finished;
+static wait_queue_head_t shutdown_wq;
+static u64 t_rcu_scale_writer_started;
+static u64 t_rcu_scale_writer_finished;
+static unsigned long b_rcu_gp_test_started;
+static unsigned long b_rcu_gp_test_finished;
+
+#define MAX_MEAS 10000
+#define MIN_MEAS 100
+
+/*
+ * Operations vector for selecting different types of tests.
+ */
+
+struct rcu_scale_ops {
+ int ptype;
+ void (*init)(void);
+ void (*cleanup)(void);
+ int (*readlock)(void);
+ void (*readunlock)(int idx);
+ unsigned long (*get_gp_seq)(void);
+ unsigned long (*gp_diff)(unsigned long new, unsigned long old);
+ unsigned long (*exp_completed)(void);
+ void (*async)(struct rcu_head *head, rcu_callback_t func);
+ void (*gp_barrier)(void);
+ void (*sync)(void);
+ void (*exp_sync)(void);
+ struct task_struct *(*rso_gp_kthread)(void);
+ void (*stats)(void);
+ const char *name;
+};
+
+static struct rcu_scale_ops *cur_ops;
+
+/*
+ * Definitions for rcu scalability testing.
+ */
+
+static int rcu_scale_read_lock(void) __acquires(RCU)
+{
+ rcu_read_lock();
+ return 0;
+}
+
+static void rcu_scale_read_unlock(int idx) __releases(RCU)
+{
+ rcu_read_unlock();
+}
+
+static unsigned long __maybe_unused rcu_no_completed(void)
+{
+ return 0;
+}
+
+static void rcu_sync_scale_init(void)
+{
+}
+
+static struct rcu_scale_ops rcu_ops = {
+ .ptype = RCU_FLAVOR,
+ .init = rcu_sync_scale_init,
+ .readlock = rcu_scale_read_lock,
+ .readunlock = rcu_scale_read_unlock,
+ .get_gp_seq = rcu_get_gp_seq,
+ .gp_diff = rcu_seq_diff,
+ .exp_completed = rcu_exp_batches_completed,
+ .async = call_rcu_hurry,
+ .gp_barrier = rcu_barrier,
+ .sync = synchronize_rcu,
+ .exp_sync = synchronize_rcu_expedited,
+ .name = "rcu"
+};
+
+/*
+ * Definitions for srcu scalability testing.
+ */
+
+DEFINE_STATIC_SRCU(srcu_ctl_scale);
+static struct srcu_struct *srcu_ctlp = &srcu_ctl_scale;
+
+static int srcu_scale_read_lock(void) __acquires(srcu_ctlp)
+{
+ return srcu_read_lock(srcu_ctlp);
+}
+
+static void srcu_scale_read_unlock(int idx) __releases(srcu_ctlp)
+{
+ srcu_read_unlock(srcu_ctlp, idx);
+}
+
+static unsigned long srcu_scale_completed(void)
+{
+ return srcu_batches_completed(srcu_ctlp);
+}
+
+static void srcu_call_rcu(struct rcu_head *head, rcu_callback_t func)
+{
+ call_srcu(srcu_ctlp, head, func);
+}
+
+static void srcu_rcu_barrier(void)
+{
+ srcu_barrier(srcu_ctlp);
+}
+
+static void srcu_scale_synchronize(void)
+{
+ synchronize_srcu(srcu_ctlp);
+}
+
+static void srcu_scale_stats(void)
+{
+ srcu_torture_stats_print(srcu_ctlp, scale_type, SCALE_FLAG);
+}
+
+static void srcu_scale_synchronize_expedited(void)
+{
+ synchronize_srcu_expedited(srcu_ctlp);
+}
+
+static struct rcu_scale_ops srcu_ops = {
+ .ptype = SRCU_FLAVOR,
+ .init = rcu_sync_scale_init,
+ .readlock = srcu_scale_read_lock,
+ .readunlock = srcu_scale_read_unlock,
+ .get_gp_seq = srcu_scale_completed,
+ .gp_diff = rcu_seq_diff,
+ .exp_completed = srcu_scale_completed,
+ .async = srcu_call_rcu,
+ .gp_barrier = srcu_rcu_barrier,
+ .sync = srcu_scale_synchronize,
+ .exp_sync = srcu_scale_synchronize_expedited,
+ .stats = srcu_scale_stats,
+ .name = "srcu"
+};
+
+static struct srcu_struct srcud;
+
+static void srcu_sync_scale_init(void)
+{
+ srcu_ctlp = &srcud;
+ init_srcu_struct(srcu_ctlp);
+}
+
+static void srcu_sync_scale_cleanup(void)
+{
+ cleanup_srcu_struct(srcu_ctlp);
+}
+
+static struct rcu_scale_ops srcud_ops = {
+ .ptype = SRCU_FLAVOR,
+ .init = srcu_sync_scale_init,
+ .cleanup = srcu_sync_scale_cleanup,
+ .readlock = srcu_scale_read_lock,
+ .readunlock = srcu_scale_read_unlock,
+ .get_gp_seq = srcu_scale_completed,
+ .gp_diff = rcu_seq_diff,
+ .exp_completed = srcu_scale_completed,
+ .async = srcu_call_rcu,
+ .gp_barrier = srcu_rcu_barrier,
+ .sync = srcu_scale_synchronize,
+ .exp_sync = srcu_scale_synchronize_expedited,
+ .stats = srcu_scale_stats,
+ .name = "srcud"
+};
+
+#ifdef CONFIG_TASKS_RCU
+
+/*
+ * Definitions for RCU-tasks scalability testing.
+ */
+
+static int tasks_scale_read_lock(void)
+{
+ return 0;
+}
+
+static void tasks_scale_read_unlock(int idx)
+{
+}
+
+static void rcu_tasks_scale_stats(void)
+{
+ rcu_tasks_torture_stats_print(scale_type, SCALE_FLAG);
+}
+
+static struct rcu_scale_ops tasks_ops = {
+ .ptype = RCU_TASKS_FLAVOR,
+ .init = rcu_sync_scale_init,
+ .readlock = tasks_scale_read_lock,
+ .readunlock = tasks_scale_read_unlock,
+ .get_gp_seq = rcu_no_completed,
+ .gp_diff = rcu_seq_diff,
+ .async = call_rcu_tasks,
+ .gp_barrier = rcu_barrier_tasks,
+ .sync = synchronize_rcu_tasks,
+ .exp_sync = synchronize_rcu_tasks,
+ .rso_gp_kthread = get_rcu_tasks_gp_kthread,
+ .stats = IS_ENABLED(CONFIG_TINY_RCU) ? NULL : rcu_tasks_scale_stats,
+ .name = "tasks"
+};
+
+#define TASKS_OPS &tasks_ops,
+
+#else // #ifdef CONFIG_TASKS_RCU
+
+#define TASKS_OPS
+
+#endif // #else // #ifdef CONFIG_TASKS_RCU
+
+#ifdef CONFIG_TASKS_RUDE_RCU
+
+/*
+ * Definitions for RCU-tasks-rude scalability testing.
+ */
+
+static int tasks_rude_scale_read_lock(void)
+{
+ return 0;
+}
+
+static void tasks_rude_scale_read_unlock(int idx)
+{
+}
+
+static void rcu_tasks_rude_scale_stats(void)
+{
+ rcu_tasks_rude_torture_stats_print(scale_type, SCALE_FLAG);
+}
+
+static struct rcu_scale_ops tasks_rude_ops = {
+ .ptype = RCU_TASKS_RUDE_FLAVOR,
+ .init = rcu_sync_scale_init,
+ .readlock = tasks_rude_scale_read_lock,
+ .readunlock = tasks_rude_scale_read_unlock,
+ .get_gp_seq = rcu_no_completed,
+ .gp_diff = rcu_seq_diff,
+ .sync = synchronize_rcu_tasks_rude,
+ .exp_sync = synchronize_rcu_tasks_rude,
+ .rso_gp_kthread = get_rcu_tasks_rude_gp_kthread,
+ .stats = IS_ENABLED(CONFIG_TINY_RCU) ? NULL : rcu_tasks_rude_scale_stats,
+ .name = "tasks-rude"
+};
+
+#define TASKS_RUDE_OPS &tasks_rude_ops,
+
+#else // #ifdef CONFIG_TASKS_RUDE_RCU
+
+#define TASKS_RUDE_OPS
+
+#endif // #else // #ifdef CONFIG_TASKS_RUDE_RCU
+
+#ifdef CONFIG_TASKS_TRACE_RCU
+
+/*
+ * Definitions for RCU-tasks-trace scalability testing.
+ */
+
+static int tasks_trace_scale_read_lock(void)
+{
+ rcu_read_lock_trace();
+ return 0;
+}
+
+static void tasks_trace_scale_read_unlock(int idx)
+{
+ rcu_read_unlock_trace();
+}
+
+static void rcu_tasks_trace_scale_stats(void)
+{
+ rcu_tasks_trace_torture_stats_print(scale_type, SCALE_FLAG);
+}
+
+static struct rcu_scale_ops tasks_tracing_ops = {
+ .ptype = RCU_TASKS_FLAVOR,
+ .init = rcu_sync_scale_init,
+ .readlock = tasks_trace_scale_read_lock,
+ .readunlock = tasks_trace_scale_read_unlock,
+ .get_gp_seq = rcu_no_completed,
+ .gp_diff = rcu_seq_diff,
+ .async = call_rcu_tasks_trace,
+ .gp_barrier = rcu_barrier_tasks_trace,
+ .sync = synchronize_rcu_tasks_trace,
+ .exp_sync = synchronize_rcu_tasks_trace,
+ .rso_gp_kthread = get_rcu_tasks_trace_gp_kthread,
+ .stats = IS_ENABLED(CONFIG_TINY_RCU) ? NULL : rcu_tasks_trace_scale_stats,
+ .name = "tasks-tracing"
+};
+
+#define TASKS_TRACING_OPS &tasks_tracing_ops,
+
+#else // #ifdef CONFIG_TASKS_TRACE_RCU
+
+#define TASKS_TRACING_OPS
+
+#endif // #else // #ifdef CONFIG_TASKS_TRACE_RCU
+
+static unsigned long rcuscale_seq_diff(unsigned long new, unsigned long old)
+{
+ if (!cur_ops->gp_diff)
+ return new - old;
+ return cur_ops->gp_diff(new, old);
+}
+
+/*
+ * If scalability tests complete, wait for shutdown to commence.
+ */
+static void rcu_scale_wait_shutdown(void)
+{
+ cond_resched_tasks_rcu_qs();
+ if (atomic_read(&n_rcu_scale_writer_finished) < nrealwriters)
+ return;
+ while (!torture_must_stop())
+ schedule_timeout_uninterruptible(1);
+}
+
+/*
+ * RCU scalability reader kthread. Repeatedly does empty RCU read-side
+ * critical section, minimizing update-side interference. However, the
+ * point of this test is not to evaluate reader scalability, but instead
+ * to serve as a test load for update-side scalability testing.
+ */
+static int
+rcu_scale_reader(void *arg)
+{
+ unsigned long flags;
+ int idx;
+ long me = (long)arg;
+
+ VERBOSE_SCALEOUT_STRING("rcu_scale_reader task started");
+ set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
+ set_user_nice(current, MAX_NICE);
+ atomic_inc(&n_rcu_scale_reader_started);
+
+ do {
+ local_irq_save(flags);
+ idx = cur_ops->readlock();
+ cur_ops->readunlock(idx);
+ local_irq_restore(flags);
+ rcu_scale_wait_shutdown();
+ } while (!torture_must_stop());
+ torture_kthread_stopping("rcu_scale_reader");
+ return 0;
+}
+
+/*
+ * Allocate a writer_mblock structure for the specified rcu_scale_writer
+ * task.
+ */
+static struct writer_mblock *rcu_scale_alloc(long me)
+{
+ struct llist_node *llnp;
+ struct writer_freelist *wflp;
+ struct writer_mblock *wmbp;
+
+ if (WARN_ON_ONCE(!writer_freelists))
+ return NULL;
+ wflp = &writer_freelists[me];
+ if (llist_empty(&wflp->ws_lhp)) {
+ // ->ws_lhp is private to its rcu_scale_writer task.
+ wmbp = container_of(llist_del_all(&wflp->ws_lhg), struct writer_mblock, wmb_node);
+ wflp->ws_lhp.first = &wmbp->wmb_node;
+ }
+ llnp = llist_del_first(&wflp->ws_lhp);
+ if (!llnp)
+ return NULL;
+ return container_of(llnp, struct writer_mblock, wmb_node);
+}
+
+/*
+ * Free a writer_mblock structure to its rcu_scale_writer task.
+ */
+static void rcu_scale_free(struct writer_mblock *wmbp)
+{
+ struct writer_freelist *wflp;
+
+ if (!wmbp)
+ return;
+ wflp = wmbp->wmb_wfl;
+ llist_add(&wmbp->wmb_node, &wflp->ws_lhg);
+}
+
+/*
+ * Callback function for asynchronous grace periods from rcu_scale_writer().
+ */
+static void rcu_scale_async_cb(struct rcu_head *rhp)
+{
+ struct writer_mblock *wmbp = container_of(rhp, struct writer_mblock, wmb_rh);
+ struct writer_freelist *wflp = wmbp->wmb_wfl;
+
+ atomic_dec(&wflp->ws_inflight);
+ rcu_scale_free(wmbp);
+}
+
+/*
+ * RCU scale writer kthread. Repeatedly does a grace period.
+ */
+static int
+rcu_scale_writer(void *arg)
+{
+ int i = 0;
+ int i_max;
+ unsigned long jdone;
+ long me = (long)arg;
+ bool selfreport = false;
+ bool started = false, done = false, alldone = false;
+ u64 t;
+ DEFINE_TORTURE_RANDOM(tr);
+ u64 *wdp;
+ u64 *wdpp = writer_durations[me];
+ struct writer_freelist *wflp = &writer_freelists[me];
+ struct writer_mblock *wmbp = NULL;
+
+ VERBOSE_SCALEOUT_STRING("rcu_scale_writer task started");
+ WARN_ON(!wdpp);
+ set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
+ current->flags |= PF_NO_SETAFFINITY;
+ sched_set_fifo_low(current);
+
+ if (holdoff)
+ schedule_timeout_idle(holdoff * HZ);
+
+ /*
+ * Wait until rcu_end_inkernel_boot() is called for normal GP tests
+ * so that RCU is not always expedited for normal GP tests.
+ * The system_state test is approximate, but works well in practice.
+ */
+ while (!gp_exp && system_state != SYSTEM_RUNNING)
+ schedule_timeout_uninterruptible(1);
+
+ t = ktime_get_mono_fast_ns();
+ if (atomic_inc_return(&n_rcu_scale_writer_started) >= nrealwriters) {
+ t_rcu_scale_writer_started = t;
+ if (gp_exp) {
+ b_rcu_gp_test_started =
+ cur_ops->exp_completed() / 2;
+ } else {
+ b_rcu_gp_test_started = cur_ops->get_gp_seq();
+ }
+ }
+
+ jdone = jiffies + minruntime * HZ;
+ do {
+ bool gp_succeeded = false;
+
+ if (writer_holdoff)
+ udelay(writer_holdoff);
+ if (writer_holdoff_jiffies)
+ schedule_timeout_idle(torture_random(&tr) % writer_holdoff_jiffies + 1);
+ wdp = &wdpp[i];
+ *wdp = ktime_get_mono_fast_ns();
+ if (gp_async && !WARN_ON_ONCE(!cur_ops->async)) {
+ if (!wmbp)
+ wmbp = rcu_scale_alloc(me);
+ if (wmbp && atomic_read(&wflp->ws_inflight) < gp_async_max) {
+ atomic_inc(&wflp->ws_inflight);
+ cur_ops->async(&wmbp->wmb_rh, rcu_scale_async_cb);
+ wmbp = NULL;
+ gp_succeeded = true;
+ } else if (!kthread_should_stop()) {
+ cur_ops->gp_barrier();
+ } else {
+ rcu_scale_free(wmbp); /* Because we are stopping. */
+ wmbp = NULL;
+ }
+ } else if (gp_exp) {
+ cur_ops->exp_sync();
+ gp_succeeded = true;
+ } else {
+ cur_ops->sync();
+ gp_succeeded = true;
+ }
+ t = ktime_get_mono_fast_ns();
+ *wdp = t - *wdp;
+ i_max = i;
+ if (!started &&
+ atomic_read(&n_rcu_scale_writer_started) >= nrealwriters)
+ started = true;
+ if (!done && i >= MIN_MEAS && time_after(jiffies, jdone)) {
+ done = true;
+ WRITE_ONCE(writer_done[me], true);
+ sched_set_normal(current, 0);
+ pr_alert("%s%s rcu_scale_writer %ld has %d measurements\n",
+ scale_type, SCALE_FLAG, me, MIN_MEAS);
+ if (atomic_inc_return(&n_rcu_scale_writer_finished) >=
+ nrealwriters) {
+ schedule_timeout_interruptible(10);
+ rcu_ftrace_dump(DUMP_ALL);
+ SCALEOUT_STRING("Test complete");
+ t_rcu_scale_writer_finished = t;
+ if (gp_exp) {
+ b_rcu_gp_test_finished =
+ cur_ops->exp_completed() / 2;
+ } else {
+ b_rcu_gp_test_finished =
+ cur_ops->get_gp_seq();
+ }
+ if (shutdown) {
+ smp_mb(); /* Assign before wake. */
+ wake_up(&shutdown_wq);
+ }
+ }
+ }
+ if (done && !alldone &&
+ atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters)
+ alldone = true;
+ if (done && !alldone && time_after(jiffies, jdone + HZ * 60)) {
+ static atomic_t dumped;
+ int i;
+
+ if (!atomic_xchg(&dumped, 1)) {
+ for (i = 0; i < nrealwriters; i++) {
+ if (writer_done[i])
+ continue;
+ pr_info("%s: Task %ld flags writer %d:\n", __func__, me, i);
+ sched_show_task(writer_tasks[i]);
+ }
+ if (cur_ops->stats)
+ cur_ops->stats();
+ }
+ }
+ if (!selfreport && time_after(jiffies, jdone + HZ * (70 + me))) {
+ pr_info("%s: Writer %ld self-report: started %d done %d/%d->%d i %d jdone %lu.\n",
+ __func__, me, started, done, writer_done[me], atomic_read(&n_rcu_scale_writer_finished), i, jiffies - jdone);
+ selfreport = true;
+ }
+ if (gp_succeeded && started && !alldone && i < MAX_MEAS - 1)
+ i++;
+ rcu_scale_wait_shutdown();
+ } while (!torture_must_stop());
+ if (gp_async && cur_ops->async) {
+ rcu_scale_free(wmbp);
+ cur_ops->gp_barrier();
+ }
+ writer_n_durations[me] = i_max + 1;
+ torture_kthread_stopping("rcu_scale_writer");
+ return 0;
+}
+
+static void
+rcu_scale_print_module_parms(struct rcu_scale_ops *cur_ops, const char *tag)
+{
+ pr_alert("%s" SCALE_FLAG
+ "--- %s: gp_async=%d gp_async_max=%d gp_exp=%d holdoff=%d minruntime=%d nreaders=%d nwriters=%d writer_holdoff=%d writer_holdoff_jiffies=%d verbose=%d shutdown=%d\n",
+ scale_type, tag, gp_async, gp_async_max, gp_exp, holdoff, minruntime, nrealreaders, nrealwriters, writer_holdoff, writer_holdoff_jiffies, verbose, shutdown);
+}
+
+/*
+ * Return the number if non-negative. If -1, the number of CPUs.
+ * If less than -1, that much less than the number of CPUs, but
+ * at least one.
+ */
+static int compute_real(int n)
+{
+ int nr;
+
+ if (n >= 0) {
+ nr = n;
+ } else {
+ nr = num_online_cpus() + 1 + n;
+ if (nr <= 0)
+ nr = 1;
+ }
+ return nr;
+}
+
+/*
+ * kfree_rcu() scalability tests: Start a kfree_rcu() loop on all CPUs for number
+ * of iterations and measure total time and number of GP for all iterations to complete.
+ */
+
+torture_param(int, kfree_nthreads, -1, "Number of threads running loops of kfree_rcu().");
+torture_param(int, kfree_alloc_num, 8000, "Number of allocations and frees done in an iteration.");
+torture_param(int, kfree_loops, 10, "Number of loops doing kfree_alloc_num allocations and frees.");
+torture_param(bool, kfree_rcu_test_double, false, "Do we run a kfree_rcu() double-argument scale test?");
+torture_param(bool, kfree_rcu_test_single, false, "Do we run a kfree_rcu() single-argument scale test?");
+
+static struct task_struct **kfree_reader_tasks;
+static int kfree_nrealthreads;
+static atomic_t n_kfree_scale_thread_started;
+static atomic_t n_kfree_scale_thread_ended;
+static struct task_struct *kthread_tp;
+static u64 kthread_stime;
+
+struct kfree_obj {
+ char kfree_obj[8];
+ struct rcu_head rh;
+};
+
+/* Used if doing RCU-kfree'ing via call_rcu(). */
+static void kfree_call_rcu(struct rcu_head *rh)
+{
+ struct kfree_obj *obj = container_of(rh, struct kfree_obj, rh);
+
+ kfree(obj);
+}
+
+static int
+kfree_scale_thread(void *arg)
+{
+ int i, loop = 0;
+ long me = (long)arg;
+ struct kfree_obj *alloc_ptr;
+ u64 start_time, end_time;
+ long long mem_begin, mem_during = 0;
+ bool kfree_rcu_test_both;
+ DEFINE_TORTURE_RANDOM(tr);
+
+ VERBOSE_SCALEOUT_STRING("kfree_scale_thread task started");
+ set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
+ set_user_nice(current, MAX_NICE);
+ kfree_rcu_test_both = (kfree_rcu_test_single == kfree_rcu_test_double);
+
+ start_time = ktime_get_mono_fast_ns();
+
+ if (atomic_inc_return(&n_kfree_scale_thread_started) >= kfree_nrealthreads) {
+ if (gp_exp)
+ b_rcu_gp_test_started = cur_ops->exp_completed() / 2;
+ else
+ b_rcu_gp_test_started = cur_ops->get_gp_seq();
+ }
+
+ do {
+ if (!mem_during) {
+ mem_during = mem_begin = si_mem_available();
+ } else if (loop % (kfree_loops / 4) == 0) {
+ mem_during = (mem_during + si_mem_available()) / 2;
+ }
+
+ for (i = 0; i < kfree_alloc_num; i++) {
+ alloc_ptr = kcalloc(kfree_mult, sizeof(struct kfree_obj), GFP_KERNEL);
+ if (!alloc_ptr)
+ return -ENOMEM;
+
+ if (kfree_by_call_rcu) {
+ call_rcu(&(alloc_ptr->rh), kfree_call_rcu);
+ continue;
+ }
+
+ // By default kfree_rcu_test_single and kfree_rcu_test_double are
+ // initialized to false. If both have the same value (false or true)
+ // both are randomly tested, otherwise only the one with value true
+ // is tested.
+ if ((kfree_rcu_test_single && !kfree_rcu_test_double) ||
+ (kfree_rcu_test_both && torture_random(&tr) & 0x800))
+ kfree_rcu_mightsleep(alloc_ptr);
+ else
+ kfree_rcu(alloc_ptr, rh);
+ }
+
+ cond_resched();
+ } while (!torture_must_stop() && ++loop < kfree_loops);
+
+ if (atomic_inc_return(&n_kfree_scale_thread_ended) >= kfree_nrealthreads) {
+ end_time = ktime_get_mono_fast_ns();
+
+ if (gp_exp)
+ b_rcu_gp_test_finished = cur_ops->exp_completed() / 2;
+ else
+ b_rcu_gp_test_finished = cur_ops->get_gp_seq();
+
+ pr_alert("Total time taken by all kfree'ers: %llu ns, loops: %d, batches: %ld, memory footprint: %lldMB\n",
+ (unsigned long long)(end_time - start_time), kfree_loops,
+ rcuscale_seq_diff(b_rcu_gp_test_finished, b_rcu_gp_test_started),
+ PAGES_TO_MB(mem_begin - mem_during));
+
+ if (shutdown) {
+ smp_mb(); /* Assign before wake. */
+ wake_up(&shutdown_wq);
+ }
+ }
+
+ torture_kthread_stopping("kfree_scale_thread");
+ return 0;
+}
+
+static void
+kfree_scale_cleanup(void)
+{
+ int i;
+
+ if (torture_cleanup_begin())
+ return;
+
+ if (kfree_reader_tasks) {
+ for (i = 0; i < kfree_nrealthreads; i++)
+ torture_stop_kthread(kfree_scale_thread,
+ kfree_reader_tasks[i]);
+ kfree(kfree_reader_tasks);
+ kfree_reader_tasks = NULL;
+ }
+
+ torture_cleanup_end();
+}
+
+/*
+ * shutdown kthread. Just waits to be awakened, then shuts down system.
+ */
+static int
+kfree_scale_shutdown(void *arg)
+{
+ wait_event_idle(shutdown_wq,
+ atomic_read(&n_kfree_scale_thread_ended) >= kfree_nrealthreads);
+
+ smp_mb(); /* Wake before output. */
+
+ kfree_scale_cleanup();
+ kernel_power_off();
+ return -EINVAL;
+}
+
+// Used if doing RCU-kfree'ing via call_rcu().
+static unsigned long jiffies_at_lazy_cb;
+static struct rcu_head lazy_test1_rh;
+static int rcu_lazy_test1_cb_called;
+static void call_rcu_lazy_test1(struct rcu_head *rh)
+{
+ jiffies_at_lazy_cb = jiffies;
+ WRITE_ONCE(rcu_lazy_test1_cb_called, 1);
+}
+
+static int __init
+kfree_scale_init(void)
+{
+ int firsterr = 0;
+ long i;
+ unsigned long jif_start;
+ unsigned long orig_jif;
+
+ pr_alert("%s" SCALE_FLAG
+ "--- kfree_rcu_test: kfree_mult=%d kfree_by_call_rcu=%d kfree_nthreads=%d kfree_alloc_num=%d kfree_loops=%d kfree_rcu_test_double=%d kfree_rcu_test_single=%d\n",
+ scale_type, kfree_mult, kfree_by_call_rcu, kfree_nthreads, kfree_alloc_num, kfree_loops, kfree_rcu_test_double, kfree_rcu_test_single);
+
+ // Also, do a quick self-test to ensure laziness is as much as
+ // expected.
+ if (kfree_by_call_rcu && !IS_ENABLED(CONFIG_RCU_LAZY)) {
+ pr_alert("CONFIG_RCU_LAZY is disabled, falling back to kfree_rcu() for delayed RCU kfree'ing\n");
+ kfree_by_call_rcu = 0;
+ }
+
+ if (kfree_by_call_rcu) {
+ /* do a test to check the timeout. */
+ orig_jif = rcu_get_jiffies_lazy_flush();
+
+ rcu_set_jiffies_lazy_flush(2 * HZ);
+ rcu_barrier();
+
+ jif_start = jiffies;
+ jiffies_at_lazy_cb = 0;
+ call_rcu(&lazy_test1_rh, call_rcu_lazy_test1);
+
+ smp_cond_load_relaxed(&rcu_lazy_test1_cb_called, VAL == 1);
+
+ rcu_set_jiffies_lazy_flush(orig_jif);
+
+ if (WARN_ON_ONCE(jiffies_at_lazy_cb - jif_start < 2 * HZ)) {
+ pr_alert("ERROR: call_rcu() CBs are not being lazy as expected!\n");
+ firsterr = -1;
+ goto unwind;
+ }
+
+ if (WARN_ON_ONCE(jiffies_at_lazy_cb - jif_start > 3 * HZ)) {
+ pr_alert("ERROR: call_rcu() CBs are being too lazy!\n");
+ firsterr = -1;
+ goto unwind;
+ }
+ }
+
+ kfree_nrealthreads = compute_real(kfree_nthreads);
+ /* Start up the kthreads. */
+ if (shutdown) {
+ init_waitqueue_head(&shutdown_wq);
+ firsterr = torture_create_kthread(kfree_scale_shutdown, NULL,
+ shutdown_task);
+ if (torture_init_error(firsterr))
+ goto unwind;
+ schedule_timeout_uninterruptible(1);
+ }
+
+ pr_alert("kfree object size=%zu, kfree_by_call_rcu=%d\n",
+ kfree_mult * sizeof(struct kfree_obj),
+ kfree_by_call_rcu);
+
+ kfree_reader_tasks = kcalloc(kfree_nrealthreads, sizeof(kfree_reader_tasks[0]),
+ GFP_KERNEL);
+ if (kfree_reader_tasks == NULL) {
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
+
+ for (i = 0; i < kfree_nrealthreads; i++) {
+ firsterr = torture_create_kthread(kfree_scale_thread, (void *)i,
+ kfree_reader_tasks[i]);
+ if (torture_init_error(firsterr))
+ goto unwind;
+ }
+
+ while (atomic_read(&n_kfree_scale_thread_started) < kfree_nrealthreads)
+ schedule_timeout_uninterruptible(1);
+
+ torture_init_end();
+ return 0;
+
+unwind:
+ torture_init_end();
+ kfree_scale_cleanup();
+ return firsterr;
+}
+
+static void
+rcu_scale_cleanup(void)
+{
+ int i;
+ int j;
+ int ngps = 0;
+ u64 *wdp;
+ u64 *wdpp;
+
+ /*
+ * Would like warning at start, but everything is expedited
+ * during the mid-boot phase, so have to wait till the end.
+ */
+ if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp)
+ SCALEOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!");
+ if (rcu_gp_is_normal() && gp_exp)
+ SCALEOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!");
+ if (gp_exp && gp_async)
+ SCALEOUT_ERRSTRING("No expedited async GPs, so went with async!");
+
+ // If built-in, just report all of the GP kthread's CPU time.
+ if (IS_BUILTIN(CONFIG_RCU_SCALE_TEST) && !kthread_tp && cur_ops->rso_gp_kthread)
+ kthread_tp = cur_ops->rso_gp_kthread();
+ if (kthread_tp) {
+ u32 ns;
+ u64 us;
+
+ kthread_stime = kthread_tp->stime - kthread_stime;
+ us = div_u64_rem(kthread_stime, 1000, &ns);
+ pr_info("rcu_scale: Grace-period kthread CPU time: %llu.%03u us\n", us, ns);
+ show_rcu_gp_kthreads();
+ }
+ if (kfree_rcu_test) {
+ kfree_scale_cleanup();
+ return;
+ }
+
+ if (torture_cleanup_begin())
+ return;
+ if (!cur_ops) {
+ torture_cleanup_end();
+ return;
+ }
+
+ if (reader_tasks) {
+ for (i = 0; i < nrealreaders; i++)
+ torture_stop_kthread(rcu_scale_reader,
+ reader_tasks[i]);
+ kfree(reader_tasks);
+ reader_tasks = NULL;
+ }
+
+ if (writer_tasks) {
+ for (i = 0; i < nrealwriters; i++) {
+ torture_stop_kthread(rcu_scale_writer,
+ writer_tasks[i]);
+ if (!writer_n_durations)
+ continue;
+ j = writer_n_durations[i];
+ pr_alert("%s%s writer %d gps: %d\n",
+ scale_type, SCALE_FLAG, i, j);
+ ngps += j;
+ }
+ pr_alert("%s%s start: %llu end: %llu duration: %llu gps: %d batches: %ld\n",
+ scale_type, SCALE_FLAG,
+ t_rcu_scale_writer_started, t_rcu_scale_writer_finished,
+ t_rcu_scale_writer_finished -
+ t_rcu_scale_writer_started,
+ ngps,
+ rcuscale_seq_diff(b_rcu_gp_test_finished,
+ b_rcu_gp_test_started));
+ for (i = 0; i < nrealwriters; i++) {
+ if (!writer_durations)
+ break;
+ if (!writer_n_durations)
+ continue;
+ wdpp = writer_durations[i];
+ if (!wdpp)
+ continue;
+ for (j = 0; j < writer_n_durations[i]; j++) {
+ wdp = &wdpp[j];
+ pr_alert("%s%s %4d writer-duration: %5d %llu\n",
+ scale_type, SCALE_FLAG,
+ i, j, *wdp);
+ if (j % 100 == 0)
+ schedule_timeout_uninterruptible(1);
+ }
+ kfree(writer_durations[i]);
+ if (writer_freelists) {
+ int ctr = 0;
+ struct llist_node *llnp;
+ struct writer_freelist *wflp = &writer_freelists[i];
+
+ if (wflp->ws_mblocks) {
+ llist_for_each(llnp, wflp->ws_lhg.first)
+ ctr++;
+ llist_for_each(llnp, wflp->ws_lhp.first)
+ ctr++;
+ WARN_ONCE(ctr != gp_async_max,
+ "%s: ctr = %d gp_async_max = %d\n",
+ __func__, ctr, gp_async_max);
+ kfree(wflp->ws_mblocks);
+ }
+ }
+ }
+ kfree(writer_tasks);
+ writer_tasks = NULL;
+ kfree(writer_durations);
+ writer_durations = NULL;
+ kfree(writer_n_durations);
+ writer_n_durations = NULL;
+ kfree(writer_done);
+ writer_done = NULL;
+ kfree(writer_freelists);
+ writer_freelists = NULL;
+ }
+
+ /* Do torture-type-specific cleanup operations. */
+ if (cur_ops->cleanup != NULL)
+ cur_ops->cleanup();
+
+ torture_cleanup_end();
+}
+
+/*
+ * RCU scalability shutdown kthread. Just waits to be awakened, then shuts
+ * down system.
+ */
+static int
+rcu_scale_shutdown(void *arg)
+{
+ wait_event_idle(shutdown_wq, atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters);
+ smp_mb(); /* Wake before output. */
+ rcu_scale_cleanup();
+ kernel_power_off();
+ return -EINVAL;
+}
+
+static int __init
+rcu_scale_init(void)
+{
+ int firsterr = 0;
+ long i;
+ long j;
+ static struct rcu_scale_ops *scale_ops[] = {
+ &rcu_ops, &srcu_ops, &srcud_ops, TASKS_OPS TASKS_RUDE_OPS TASKS_TRACING_OPS
+ };
+
+ if (!torture_init_begin(scale_type, verbose))
+ return -EBUSY;
+
+ /* Process args and announce that the scalability'er is on the job. */
+ for (i = 0; i < ARRAY_SIZE(scale_ops); i++) {
+ cur_ops = scale_ops[i];
+ if (strcmp(scale_type, cur_ops->name) == 0)
+ break;
+ }
+ if (i == ARRAY_SIZE(scale_ops)) {
+ pr_alert("rcu-scale: invalid scale type: \"%s\"\n", scale_type);
+ pr_alert("rcu-scale types:");
+ for (i = 0; i < ARRAY_SIZE(scale_ops); i++)
+ pr_cont(" %s", scale_ops[i]->name);
+ pr_cont("\n");
+ firsterr = -EINVAL;
+ cur_ops = NULL;
+ goto unwind;
+ }
+ if (cur_ops->init)
+ cur_ops->init();
+
+ if (cur_ops->rso_gp_kthread) {
+ kthread_tp = cur_ops->rso_gp_kthread();
+ if (kthread_tp)
+ kthread_stime = kthread_tp->stime;
+ }
+ if (kfree_rcu_test)
+ return kfree_scale_init();
+
+ nrealwriters = compute_real(nwriters);
+ nrealreaders = compute_real(nreaders);
+ atomic_set(&n_rcu_scale_reader_started, 0);
+ atomic_set(&n_rcu_scale_writer_started, 0);
+ atomic_set(&n_rcu_scale_writer_finished, 0);
+ rcu_scale_print_module_parms(cur_ops, "Start of test");
+
+ /* Start up the kthreads. */
+
+ if (shutdown) {
+ init_waitqueue_head(&shutdown_wq);
+ firsterr = torture_create_kthread(rcu_scale_shutdown, NULL,
+ shutdown_task);
+ if (torture_init_error(firsterr))
+ goto unwind;
+ schedule_timeout_uninterruptible(1);
+ }
+ reader_tasks = kcalloc(nrealreaders, sizeof(reader_tasks[0]),
+ GFP_KERNEL);
+ if (reader_tasks == NULL) {
+ SCALEOUT_ERRSTRING("out of memory");
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
+ for (i = 0; i < nrealreaders; i++) {
+ firsterr = torture_create_kthread(rcu_scale_reader, (void *)i,
+ reader_tasks[i]);
+ if (torture_init_error(firsterr))
+ goto unwind;
+ }
+ while (atomic_read(&n_rcu_scale_reader_started) < nrealreaders)
+ schedule_timeout_uninterruptible(1);
+ writer_tasks = kcalloc(nrealwriters, sizeof(writer_tasks[0]), GFP_KERNEL);
+ writer_durations = kcalloc(nrealwriters, sizeof(*writer_durations), GFP_KERNEL);
+ writer_n_durations = kcalloc(nrealwriters, sizeof(*writer_n_durations), GFP_KERNEL);
+ writer_done = kcalloc(nrealwriters, sizeof(writer_done[0]), GFP_KERNEL);
+ if (gp_async) {
+ if (gp_async_max <= 0) {
+ pr_warn("%s: gp_async_max = %d must be greater than zero.\n",
+ __func__, gp_async_max);
+ WARN_ON_ONCE(IS_BUILTIN(CONFIG_RCU_TORTURE_TEST));
+ firsterr = -EINVAL;
+ goto unwind;
+ }
+ writer_freelists = kcalloc(nrealwriters, sizeof(writer_freelists[0]), GFP_KERNEL);
+ }
+ if (!writer_tasks || !writer_durations || !writer_n_durations || !writer_done ||
+ (gp_async && !writer_freelists)) {
+ SCALEOUT_ERRSTRING("out of memory");
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
+ for (i = 0; i < nrealwriters; i++) {
+ writer_durations[i] =
+ kcalloc(MAX_MEAS, sizeof(*writer_durations[i]),
+ GFP_KERNEL);
+ if (!writer_durations[i]) {
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
+ if (writer_freelists) {
+ struct writer_freelist *wflp = &writer_freelists[i];
+
+ init_llist_head(&wflp->ws_lhg);
+ init_llist_head(&wflp->ws_lhp);
+ wflp->ws_mblocks = kcalloc(gp_async_max, sizeof(wflp->ws_mblocks[0]),
+ GFP_KERNEL);
+ if (!wflp->ws_mblocks) {
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
+ for (j = 0; j < gp_async_max; j++) {
+ struct writer_mblock *wmbp = &wflp->ws_mblocks[j];
+
+ wmbp->wmb_wfl = wflp;
+ llist_add(&wmbp->wmb_node, &wflp->ws_lhp);
+ }
+ }
+ firsterr = torture_create_kthread(rcu_scale_writer, (void *)i,
+ writer_tasks[i]);
+ if (torture_init_error(firsterr))
+ goto unwind;
+ }
+ torture_init_end();
+ return 0;
+
+unwind:
+ torture_init_end();
+ rcu_scale_cleanup();
+ if (shutdown) {
+ WARN_ON(!IS_MODULE(CONFIG_RCU_SCALE_TEST));
+ kernel_power_off();
+ }
+ return firsterr;
+}
+
+module_init(rcu_scale_init);
+module_exit(rcu_scale_cleanup);
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 8dbe27611ec3..07e51974b06b 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1,27 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* Read-Copy Update module-based torture test facility
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
* Copyright (C) IBM Corporation, 2005, 2006
*
- * Authors: Paul E. McKenney <paulmck@us.ibm.com>
+ * Authors: Paul E. McKenney <paulmck@linux.ibm.com>
* Josh Triplett <josh@joshtriplett.org>
*
- * See also: Documentation/RCU/torture.txt
+ * See also: Documentation/RCU/torture.rst
*/
+
+#define pr_fmt(fmt) fmt
+
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/init.h>
@@ -30,9 +20,11 @@
#include <linux/err.h>
#include <linux/spinlock.h>
#include <linux/smp.h>
-#include <linux/rcupdate.h>
+#include <linux/rcupdate_wait.h>
+#include <linux/rcu_notifier.h>
#include <linux/interrupt.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
+#include <uapi/linux/sched/types.h>
#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/completion.h>
@@ -50,79 +42,153 @@
#include <asm/byteorder.h>
#include <linux/torture.h>
#include <linux/vmalloc.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/sysctl.h>
+#include <linux/oom.h>
+#include <linux/tick.h>
+#include <linux/rcupdate_trace.h>
+#include <linux/nmi.h>
+
+#include "rcu.h"
+MODULE_DESCRIPTION("Read-Copy Update module-based torture test facility");
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@joshtriplett.org>");
-
-
-torture_param(int, cbflood_inter_holdoff, HZ,
- "Holdoff between floods (jiffies)");
-torture_param(int, cbflood_intra_holdoff, 1,
- "Holdoff between bursts (jiffies)");
-torture_param(int, cbflood_n_burst, 3, "# bursts in flood, zero to disable");
-torture_param(int, cbflood_n_per_burst, 20000,
- "# callbacks per burst in flood");
-torture_param(int, fqs_duration, 0,
- "Duration of fqs bursts (us), 0 to disable");
+MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com> and Josh Triplett <josh@joshtriplett.org>");
+
+// Bits for ->extendables field, extendables param, and related definitions.
+#define RCUTORTURE_RDR_SHIFT_1 8 // Put SRCU index in upper bits.
+#define RCUTORTURE_RDR_MASK_1 (0xff << RCUTORTURE_RDR_SHIFT_1)
+#define RCUTORTURE_RDR_SHIFT_2 16 // Put SRCU index in upper bits.
+#define RCUTORTURE_RDR_MASK_2 (0xff << RCUTORTURE_RDR_SHIFT_2)
+#define RCUTORTURE_RDR_BH 0x01 // Extend readers by disabling bh.
+#define RCUTORTURE_RDR_IRQ 0x02 // ... disabling interrupts.
+#define RCUTORTURE_RDR_PREEMPT 0x04 // ... disabling preemption.
+#define RCUTORTURE_RDR_RBH 0x08 // ... rcu_read_lock_bh().
+#define RCUTORTURE_RDR_SCHED 0x10 // ... rcu_read_lock_sched().
+#define RCUTORTURE_RDR_RCU_1 0x20 // ... entering another RCU reader.
+#define RCUTORTURE_RDR_RCU_2 0x40 // ... entering another RCU reader.
+#define RCUTORTURE_RDR_UPDOWN 0x80 // ... up-read from task, down-read from timer.
+ // Note: Manual start, automatic end.
+#define RCUTORTURE_RDR_NBITS 8 // Number of bits defined above.
+#define RCUTORTURE_MAX_EXTEND \
+ (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_IRQ | RCUTORTURE_RDR_PREEMPT | \
+ RCUTORTURE_RDR_RBH | RCUTORTURE_RDR_SCHED) // Intentionally omit RCUTORTURE_RDR_UPDOWN.
+#define RCUTORTURE_RDR_ALLBITS \
+ (RCUTORTURE_MAX_EXTEND | RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2 | \
+ RCUTORTURE_RDR_MASK_1 | RCUTORTURE_RDR_MASK_2)
+#define RCUTORTURE_RDR_MAX_LOOPS 0x7 /* Maximum reader extensions. */
+ /* Must be power of two minus one. */
+#define RCUTORTURE_RDR_MAX_SEGS (RCUTORTURE_RDR_MAX_LOOPS + 3)
+
+torture_param(int, extendables, RCUTORTURE_MAX_EXTEND,
+ "Extend readers by disabling bh (1), irqs (2), or preempt (4)");
+torture_param(int, fqs_duration, 0, "Duration of fqs bursts (us), 0 to disable");
torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)");
torture_param(int, fqs_stutter, 3, "Wait time between fqs bursts (s)");
+torture_param(int, fwd_progress, 1, "Number of grace-period forward progress tasks (0 to disable)");
+torture_param(int, fwd_progress_div, 4, "Fraction of CPU stall to wait");
+torture_param(int, fwd_progress_holdoff, 60, "Time between forward-progress tests (s)");
+torture_param(bool, fwd_progress_need_resched, 1, "Hide cond_resched() behind need_resched()");
torture_param(bool, gp_cond, false, "Use conditional/async GP wait primitives");
+torture_param(bool, gp_cond_exp, false, "Use conditional/async expedited GP wait primitives");
+torture_param(bool, gp_cond_full, false, "Use conditional/async full-state GP wait primitives");
+torture_param(bool, gp_cond_exp_full, false,
+ "Use conditional/async full-stateexpedited GP wait primitives");
+torture_param(int, gp_cond_wi, 16 * USEC_PER_SEC / HZ,
+ "Wait interval for normal conditional grace periods, us (default 16 jiffies)");
+torture_param(int, gp_cond_wi_exp, 128,
+ "Wait interval for expedited conditional grace periods, us (default 128 us)");
torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");
-torture_param(bool, gp_normal, false,
- "Use normal (non-expedited) GP wait primitives");
+torture_param(bool, gp_normal, false, "Use normal (non-expedited) GP wait primitives");
+torture_param(bool, gp_poll, false, "Use polling GP wait primitives");
+torture_param(bool, gp_poll_exp, false, "Use polling expedited GP wait primitives");
+torture_param(bool, gp_poll_full, false, "Use polling full-state GP wait primitives");
+torture_param(bool, gp_poll_exp_full, false, "Use polling full-state expedited GP wait primitives");
+torture_param(int, gp_poll_wi, 16 * USEC_PER_SEC / HZ,
+ "Wait interval for normal polled grace periods, us (default 16 jiffies)");
+torture_param(int, gp_poll_wi_exp, 128,
+ "Wait interval for expedited polled grace periods, us (default 128 us)");
torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives");
torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers");
-torture_param(int, n_barrier_cbs, 0,
- "# of callbacks/kthreads for barrier testing");
+torture_param(int, leakpointer, 0, "Leak pointer dereferences from readers");
+torture_param(int, n_barrier_cbs, 0, "# of callbacks/kthreads for barrier testing");
+torture_param(int, n_up_down, 32, "# of concurrent up/down hrtimer-based RCU readers");
torture_param(int, nfakewriters, 4, "Number of RCU fake writer threads");
torture_param(int, nreaders, -1, "Number of RCU reader threads");
-torture_param(int, object_debug, 0,
- "Enable debug-object double call_rcu() testing");
+torture_param(int, object_debug, 0, "Enable debug-object double call_rcu() testing");
torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)");
-torture_param(int, onoff_interval, 0,
- "Time between CPU hotplugs (s), 0=disable");
+torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (jiffies), 0=disable");
+torture_param(bool, gpwrap_lag, true, "Enable grace-period wrap lag testing");
+torture_param(int, gpwrap_lag_gps, 8, "Value to set for set_gpwrap_lag during an active testing period.");
+torture_param(int, gpwrap_lag_cycle_mins, 30, "Total cycle duration for gpwrap lag testing (in minutes)");
+torture_param(int, gpwrap_lag_active_mins, 5, "Duration for which gpwrap lag is active within each cycle (in minutes)");
+torture_param(int, nocbs_nthreads, 0, "Number of NOCB toggle threads, 0 to disable");
+torture_param(int, nocbs_toggle, 1000, "Time between toggling nocb state (ms)");
+torture_param(int, preempt_duration, 0, "Preemption duration (ms), zero to disable");
+torture_param(int, preempt_interval, MSEC_PER_SEC, "Interval between preemptions (ms)");
+torture_param(int, read_exit_delay, 13, "Delay between read-then-exit episodes (s)");
+torture_param(int, read_exit_burst, 16, "# of read-then-exit bursts per episode, zero to disable");
+torture_param(int, reader_flavor, SRCU_READ_FLAVOR_NORMAL, "Reader flavors to use, one per bit.");
torture_param(int, shuffle_interval, 3, "Number of seconds between shuffles");
torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable.");
torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable.");
-torture_param(int, stall_cpu_holdoff, 10,
- "Time to wait before starting stall (s).");
-torture_param(int, stat_interval, 60,
- "Number of seconds between stats printk()s");
+torture_param(int, stall_cpu_holdoff, 10, "Time to wait before starting stall (s).");
+torture_param(bool, stall_no_softlockup, false, "Avoid softlockup warning during cpu stall.");
+torture_param(int, stall_cpu_irqsoff, 0, "Disable interrupts while stalling.");
+torture_param(int, stall_cpu_block, 0, "Sleep while stalling.");
+torture_param(int, stall_cpu_repeat, 0, "Number of additional stalls after the first one.");
+torture_param(int, stall_gp_kthread, 0, "Grace-period kthread stall duration (s).");
+torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s");
torture_param(int, stutter, 5, "Number of seconds to run/halt test");
torture_param(int, test_boost, 1, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
-torture_param(int, test_boost_duration, 4,
- "Duration of each boost test, seconds.");
-torture_param(int, test_boost_interval, 7,
- "Interval between boost tests, seconds.");
-torture_param(bool, test_no_idle_hz, true,
- "Test support for tickless idle CPUs");
-torture_param(bool, verbose, true,
- "Enable verbose debugging printk()s");
+torture_param(int, test_boost_duration, 4, "Duration of each boost test, seconds.");
+torture_param(int, test_boost_holdoff, 0, "Holdoff time from rcutorture start, seconds.");
+torture_param(int, test_boost_interval, 7, "Interval between boost tests, seconds.");
+torture_param(int, test_nmis, 0, "End-test NMI tests, 0 to disable.");
+torture_param(bool, test_no_idle_hz, true, "Test support for tickless idle CPUs");
+torture_param(int, test_srcu_lockdep, 0, "Test specified SRCU deadlock scenario.");
+torture_param(int, verbose, 1, "Enable verbose debugging printk()s");
static char *torture_type = "rcu";
module_param(torture_type, charp, 0444);
-MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)");
+MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, srcu, ...)");
+static int nrealnocbers;
static int nrealreaders;
-static int ncbflooders;
+static int nrealfakewriters;
static struct task_struct *writer_task;
static struct task_struct **fakewriter_tasks;
static struct task_struct **reader_tasks;
+static struct task_struct *updown_task;
+static struct task_struct **nocb_tasks;
static struct task_struct *stats_task;
-static struct task_struct **cbflood_task;
static struct task_struct *fqs_task;
static struct task_struct *boost_tasks[NR_CPUS];
static struct task_struct *stall_task;
+static struct task_struct **fwd_prog_tasks;
static struct task_struct **barrier_cbs_tasks;
static struct task_struct *barrier_task;
+static struct task_struct *read_exit_task;
+static struct task_struct *preempt_task;
#define RCU_TORTURE_PIPE_LEN 10
+// Mailbox-like structure to check RCU global memory ordering.
+struct rcu_torture_reader_check {
+ unsigned long rtc_myloops;
+ int rtc_chkrdr;
+ unsigned long rtc_chkloops;
+ int rtc_ready;
+ struct rcu_torture_reader_check *rtc_assigner;
+} ____cacheline_internodealigned_in_smp;
+
+// Update-side data structure used to check RCU readers.
struct rcu_torture {
struct rcu_head rtort_rcu;
int rtort_pipe_count;
struct list_head rtort_free;
int rtort_mbtest;
+ struct rcu_torture_reader_check *rtort_chkp;
};
static LIST_HEAD(rcu_torture_freelist);
@@ -130,26 +196,30 @@ static struct rcu_torture __rcu *rcu_torture_current;
static unsigned long rcu_torture_current_version;
static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
static DEFINE_SPINLOCK(rcu_torture_lock);
-static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1],
- rcu_torture_count) = { 0 };
-static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1],
- rcu_torture_batch) = { 0 };
+static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count);
+static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch);
static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1];
+static struct rcu_torture_reader_check *rcu_torture_reader_mbchk;
static atomic_t n_rcu_torture_alloc;
static atomic_t n_rcu_torture_alloc_fail;
static atomic_t n_rcu_torture_free;
static atomic_t n_rcu_torture_mberror;
+static atomic_t n_rcu_torture_mbchk_fail;
+static atomic_t n_rcu_torture_mbchk_tries;
static atomic_t n_rcu_torture_error;
static long n_rcu_torture_barrier_error;
static long n_rcu_torture_boost_ktrerror;
-static long n_rcu_torture_boost_rterror;
static long n_rcu_torture_boost_failure;
static long n_rcu_torture_boosts;
-static long n_rcu_torture_timers;
+static atomic_long_t n_rcu_torture_timers;
static long n_barrier_attempts;
-static long n_barrier_successes;
-static atomic_long_t n_cbfloods;
+static long n_barrier_successes; /* did rcu_barrier test succeed? */
+static unsigned long n_read_exits;
static struct list_head rcu_torture_removed;
+static unsigned long shutdown_jiffies;
+static unsigned long start_gp_seq;
+static atomic_long_t n_nocb_offload;
+static atomic_long_t n_nocb_deoffload;
static int rcu_torture_writer_state;
#define RTWS_FIXED_DELAY 0
@@ -158,31 +228,84 @@ static int rcu_torture_writer_state;
#define RTWS_DEF_FREE 3
#define RTWS_EXP_SYNC 4
#define RTWS_COND_GET 5
-#define RTWS_COND_SYNC 6
-#define RTWS_SYNC 7
-#define RTWS_STUTTER 8
-#define RTWS_STOPPING 9
+#define RTWS_COND_GET_FULL 6
+#define RTWS_COND_GET_EXP 7
+#define RTWS_COND_GET_EXP_FULL 8
+#define RTWS_COND_SYNC 9
+#define RTWS_COND_SYNC_FULL 10
+#define RTWS_COND_SYNC_EXP 11
+#define RTWS_COND_SYNC_EXP_FULL 12
+#define RTWS_POLL_GET 13
+#define RTWS_POLL_GET_FULL 14
+#define RTWS_POLL_GET_EXP 15
+#define RTWS_POLL_GET_EXP_FULL 16
+#define RTWS_POLL_WAIT 17
+#define RTWS_POLL_WAIT_FULL 18
+#define RTWS_POLL_WAIT_EXP 19
+#define RTWS_POLL_WAIT_EXP_FULL 20
+#define RTWS_SYNC 21
+#define RTWS_STUTTER 22
+#define RTWS_STOPPING 23
+static const char * const rcu_torture_writer_state_names[] = {
+ "RTWS_FIXED_DELAY",
+ "RTWS_DELAY",
+ "RTWS_REPLACE",
+ "RTWS_DEF_FREE",
+ "RTWS_EXP_SYNC",
+ "RTWS_COND_GET",
+ "RTWS_COND_GET_FULL",
+ "RTWS_COND_GET_EXP",
+ "RTWS_COND_GET_EXP_FULL",
+ "RTWS_COND_SYNC",
+ "RTWS_COND_SYNC_FULL",
+ "RTWS_COND_SYNC_EXP",
+ "RTWS_COND_SYNC_EXP_FULL",
+ "RTWS_POLL_GET",
+ "RTWS_POLL_GET_FULL",
+ "RTWS_POLL_GET_EXP",
+ "RTWS_POLL_GET_EXP_FULL",
+ "RTWS_POLL_WAIT",
+ "RTWS_POLL_WAIT_FULL",
+ "RTWS_POLL_WAIT_EXP",
+ "RTWS_POLL_WAIT_EXP_FULL",
+ "RTWS_SYNC",
+ "RTWS_STUTTER",
+ "RTWS_STOPPING",
+};
-#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE)
-#define RCUTORTURE_RUNNABLE_INIT 1
-#else
-#define RCUTORTURE_RUNNABLE_INIT 0
-#endif
-static int torture_runnable = RCUTORTURE_RUNNABLE_INIT;
-module_param(torture_runnable, int, 0444);
-MODULE_PARM_DESC(torture_runnable, "Start rcutorture at boot");
+/* Record reader segment types and duration for first failing read. */
+struct rt_read_seg {
+ int rt_readstate;
+ unsigned long rt_delay_jiffies;
+ unsigned long rt_delay_ms;
+ unsigned long rt_delay_us;
+ bool rt_preempted;
+ int rt_cpu;
+ int rt_end_cpu;
+ unsigned long long rt_gp_seq;
+ unsigned long long rt_gp_seq_end;
+ u64 rt_ts;
+};
+static int err_segs_recorded;
+static struct rt_read_seg err_segs[RCUTORTURE_RDR_MAX_SEGS];
+static int rt_read_nsegs;
+static int rt_read_preempted;
-#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU)
-#define rcu_can_boost() 1
-#else /* #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
-#define rcu_can_boost() 0
-#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
+static const char *rcu_torture_writer_state_getname(void)
+{
+ unsigned int i = READ_ONCE(rcu_torture_writer_state);
+
+ if (i >= ARRAY_SIZE(rcu_torture_writer_state_names))
+ return "???";
+ return rcu_torture_writer_state_names[i];
+}
#ifdef CONFIG_RCU_TRACE
static u64 notrace rcu_trace_clock_local(void)
{
u64 ts = trace_clock_local();
- unsigned long __maybe_unused ts_rem = do_div(ts, NSEC_PER_USEC);
+
+ (void)do_div(ts, NSEC_PER_USEC);
return ts;
}
#else /* #ifdef CONFIG_RCU_TRACE */
@@ -192,6 +315,15 @@ static u64 notrace rcu_trace_clock_local(void)
}
#endif /* #else #ifdef CONFIG_RCU_TRACE */
+/*
+ * Stop aggressive CPU-hog tests a bit before the end of the test in order
+ * to avoid interfering with test shutdown.
+ */
+static bool shutdown_time_arrived(void)
+{
+ return shutdown_secs && time_after(jiffies, shutdown_jiffies - 30 * HZ);
+}
+
static unsigned long boost_starttime; /* jiffies of next boost test start. */
static DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
/* and boost task create/destroy. */
@@ -201,6 +333,8 @@ static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */
static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */
static DECLARE_WAIT_QUEUE_HEAD(barrier_wq);
+static atomic_t rcu_fwd_cb_nodelay; /* Short rcu_torture_delay() delays. */
+
/*
* Allocate an element from the rcu_tortures pool.
*/
@@ -241,22 +375,66 @@ rcu_torture_free(struct rcu_torture *p)
struct rcu_torture_ops {
int ttype;
void (*init)(void);
+ void (*cleanup)(void);
int (*readlock)(void);
- void (*read_delay)(struct torture_random_state *rrsp);
+ void (*read_delay)(struct torture_random_state *rrsp,
+ struct rt_read_seg *rtrsp);
void (*readunlock)(int idx);
- unsigned long (*started)(void);
- unsigned long (*completed)(void);
+ int (*readlock_held)(void); // lockdep.
+ int (*readlock_nesting)(void); // actual nesting, if available, -1 if not.
+ int (*down_read)(void);
+ void (*up_read)(int idx);
+ unsigned long (*get_gp_seq)(void);
+ unsigned long (*gp_diff)(unsigned long new, unsigned long old);
void (*deferred_free)(struct rcu_torture *p);
void (*sync)(void);
void (*exp_sync)(void);
- unsigned long (*get_state)(void);
+ void (*exp_current)(void);
+ unsigned long (*get_gp_state_exp)(void);
+ unsigned long (*start_gp_poll_exp)(void);
+ void (*start_gp_poll_exp_full)(struct rcu_gp_oldstate *rgosp);
+ bool (*poll_gp_state_exp)(unsigned long oldstate);
+ void (*cond_sync_exp)(unsigned long oldstate);
+ void (*cond_sync_exp_full)(struct rcu_gp_oldstate *rgosp);
+ unsigned long (*get_comp_state)(void);
+ void (*get_comp_state_full)(struct rcu_gp_oldstate *rgosp);
+ bool (*same_gp_state)(unsigned long oldstate1, unsigned long oldstate2);
+ bool (*same_gp_state_full)(struct rcu_gp_oldstate *rgosp1, struct rcu_gp_oldstate *rgosp2);
+ unsigned long (*get_gp_state)(void);
+ void (*get_gp_state_full)(struct rcu_gp_oldstate *rgosp);
+ unsigned long (*start_gp_poll)(void);
+ void (*start_gp_poll_full)(struct rcu_gp_oldstate *rgosp);
+ bool (*poll_gp_state)(unsigned long oldstate);
+ bool (*poll_gp_state_full)(struct rcu_gp_oldstate *rgosp);
+ bool (*poll_need_2gp)(bool poll, bool poll_full);
void (*cond_sync)(unsigned long oldstate);
- void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
+ void (*cond_sync_full)(struct rcu_gp_oldstate *rgosp);
+ int poll_active;
+ int poll_active_full;
+ call_rcu_func_t call;
void (*cb_barrier)(void);
void (*fqs)(void);
void (*stats)(void);
+ void (*gp_kthread_dbg)(void);
+ bool (*check_boost_failed)(unsigned long gp_state, int *cpup);
+ int (*stall_dur)(void);
+ void (*get_gp_data)(int *flags, unsigned long *gp_seq);
+ void (*gp_slow_register)(atomic_t *rgssp);
+ void (*gp_slow_unregister)(atomic_t *rgssp);
+ bool (*reader_blocked)(void);
+ unsigned long long (*gather_gp_seqs)(void);
+ void (*format_gp_seqs)(unsigned long long seqs, char *cp, size_t len);
+ void (*set_gpwrap_lag)(unsigned long lag);
+ int (*get_gpwrap_count)(int cpu);
+ long cbflood_max;
int irq_capable;
int can_boost;
+ int extendables;
+ int slow_gps;
+ int no_pi_lock;
+ int debug_objects;
+ int start_poll_irqsoff;
+ int have_up_down;
const char *name;
};
@@ -266,37 +444,65 @@ static struct rcu_torture_ops *cur_ops;
* Definitions for rcu torture testing.
*/
-static int rcu_torture_read_lock(void) __acquires(RCU)
+static int torture_readlock_not_held(void)
+{
+ return rcu_read_lock_bh_held() || rcu_read_lock_sched_held();
+}
+
+static int rcu_torture_read_lock(void)
{
rcu_read_lock();
return 0;
}
-static void rcu_read_delay(struct torture_random_state *rrsp)
+static void
+rcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp)
{
+ unsigned long started;
+ unsigned long completed;
const unsigned long shortdelay_us = 200;
- const unsigned long longdelay_ms = 50;
+ unsigned long longdelay_ms = 300;
+ unsigned long long ts;
/* We want a short delay sometimes to make a reader delay the grace
* period, and we want a long delay occasionally to trigger
* force_quiescent_state. */
- if (!(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms)))
+ if (!atomic_read(&rcu_fwd_cb_nodelay) &&
+ !(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) {
+ started = cur_ops->get_gp_seq();
+ ts = rcu_trace_clock_local();
+ if ((preempt_count() & HARDIRQ_MASK) || softirq_count())
+ longdelay_ms = 5; /* Avoid triggering BH limits. */
mdelay(longdelay_ms);
- if (!(torture_random(rrsp) % (nrealreaders * 2 * shortdelay_us)))
+ rtrsp->rt_delay_ms = longdelay_ms;
+ completed = cur_ops->get_gp_seq();
+ do_trace_rcu_torture_read(cur_ops->name, NULL, ts,
+ started, completed);
+ }
+ if (!(torture_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) {
udelay(shortdelay_us);
-#ifdef CONFIG_PREEMPT
+ rtrsp->rt_delay_us = shortdelay_us;
+ }
if (!preempt_count() &&
- !(torture_random(rrsp) % (nrealreaders * 20000)))
- preempt_schedule(); /* No QS if preempt_disable() in effect */
-#endif
+ !(torture_random(rrsp) % (nrealreaders * 500)))
+ torture_preempt_schedule(); /* QS only if preemptible. */
}
-static void rcu_torture_read_unlock(int idx) __releases(RCU)
+static void rcu_torture_read_unlock(int idx)
{
rcu_read_unlock();
}
+static int rcu_torture_readlock_nesting(void)
+{
+ if (IS_ENABLED(CONFIG_PREEMPT_RCU))
+ return rcu_preempt_depth();
+ if (IS_ENABLED(CONFIG_PREEMPT_COUNT))
+ return (preempt_count() & PREEMPT_MASK);
+ return -1;
+}
+
/*
* Update callback in the pipe. This should be invoked after a grace period.
*/
@@ -304,12 +510,19 @@ static bool
rcu_torture_pipe_update_one(struct rcu_torture *rp)
{
int i;
+ struct rcu_torture_reader_check *rtrcp = READ_ONCE(rp->rtort_chkp);
+ if (rtrcp) {
+ WRITE_ONCE(rp->rtort_chkp, NULL);
+ smp_store_release(&rtrcp->rtc_ready, 1); // Pair with smp_load_acquire().
+ }
i = rp->rtort_pipe_count;
if (i > RCU_TORTURE_PIPE_LEN)
i = RCU_TORTURE_PIPE_LEN;
atomic_inc(&rcu_torture_wcount[i]);
- if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
+ WRITE_ONCE(rp->rtort_pipe_count, i + 1);
+ ASSERT_EXCLUSIVE_WRITER(rp->rtort_pipe_count);
+ if (i + 1 >= RCU_TORTURE_PIPE_LEN) {
rp->rtort_mbtest = 0;
return true;
}
@@ -359,7 +572,7 @@ static unsigned long rcu_no_completed(void)
static void rcu_torture_deferred_free(struct rcu_torture *p)
{
- call_rcu(&p->rtort_rcu, rcu_torture_cb);
+ call_rcu_hurry(&p->rtort_rcu, rcu_torture_cb);
}
static void rcu_sync_torture_init(void)
@@ -367,65 +580,67 @@ static void rcu_sync_torture_init(void)
INIT_LIST_HEAD(&rcu_torture_removed);
}
-static struct rcu_torture_ops rcu_ops = {
- .ttype = RCU_FLAVOR,
- .init = rcu_sync_torture_init,
- .readlock = rcu_torture_read_lock,
- .read_delay = rcu_read_delay,
- .readunlock = rcu_torture_read_unlock,
- .started = rcu_batches_started,
- .completed = rcu_batches_completed,
- .deferred_free = rcu_torture_deferred_free,
- .sync = synchronize_rcu,
- .exp_sync = synchronize_rcu_expedited,
- .get_state = get_state_synchronize_rcu,
- .cond_sync = cond_synchronize_rcu,
- .call = call_rcu,
- .cb_barrier = rcu_barrier,
- .fqs = rcu_force_quiescent_state,
- .stats = NULL,
- .irq_capable = 1,
- .can_boost = rcu_can_boost(),
- .name = "rcu"
-};
-
-/*
- * Definitions for rcu_bh torture testing.
- */
-
-static int rcu_bh_torture_read_lock(void) __acquires(RCU_BH)
-{
- rcu_read_lock_bh();
- return 0;
-}
-
-static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH)
+static bool rcu_poll_need_2gp(bool poll, bool poll_full)
{
- rcu_read_unlock_bh();
+ return poll;
}
-static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
-{
- call_rcu_bh(&p->rtort_rcu, rcu_torture_cb);
-}
-
-static struct rcu_torture_ops rcu_bh_ops = {
- .ttype = RCU_BH_FLAVOR,
- .init = rcu_sync_torture_init,
- .readlock = rcu_bh_torture_read_lock,
- .read_delay = rcu_read_delay, /* just reuse rcu's version. */
- .readunlock = rcu_bh_torture_read_unlock,
- .started = rcu_batches_started_bh,
- .completed = rcu_batches_completed_bh,
- .deferred_free = rcu_bh_torture_deferred_free,
- .sync = synchronize_rcu_bh,
- .exp_sync = synchronize_rcu_bh_expedited,
- .call = call_rcu_bh,
- .cb_barrier = rcu_barrier_bh,
- .fqs = rcu_bh_force_quiescent_state,
- .stats = NULL,
- .irq_capable = 1,
- .name = "rcu_bh"
+static struct rcu_torture_ops rcu_ops = {
+ .ttype = RCU_FLAVOR,
+ .init = rcu_sync_torture_init,
+ .readlock = rcu_torture_read_lock,
+ .read_delay = rcu_read_delay,
+ .readunlock = rcu_torture_read_unlock,
+ .readlock_held = torture_readlock_not_held,
+ .readlock_nesting = rcu_torture_readlock_nesting,
+ .get_gp_seq = rcu_get_gp_seq,
+ .gp_diff = rcu_seq_diff,
+ .deferred_free = rcu_torture_deferred_free,
+ .sync = synchronize_rcu,
+ .exp_sync = synchronize_rcu_expedited,
+ .same_gp_state = same_state_synchronize_rcu,
+ .same_gp_state_full = same_state_synchronize_rcu_full,
+ .get_comp_state = get_completed_synchronize_rcu,
+ .get_comp_state_full = get_completed_synchronize_rcu_full,
+ .get_gp_state = get_state_synchronize_rcu,
+ .get_gp_state_full = get_state_synchronize_rcu_full,
+ .start_gp_poll = start_poll_synchronize_rcu,
+ .start_gp_poll_full = start_poll_synchronize_rcu_full,
+ .poll_gp_state = poll_state_synchronize_rcu,
+ .poll_gp_state_full = poll_state_synchronize_rcu_full,
+ .poll_need_2gp = rcu_poll_need_2gp,
+ .cond_sync = cond_synchronize_rcu,
+ .cond_sync_full = cond_synchronize_rcu_full,
+ .poll_active = NUM_ACTIVE_RCU_POLL_OLDSTATE,
+ .poll_active_full = NUM_ACTIVE_RCU_POLL_FULL_OLDSTATE,
+ .get_gp_state_exp = get_state_synchronize_rcu,
+ .start_gp_poll_exp = start_poll_synchronize_rcu_expedited,
+ .start_gp_poll_exp_full = start_poll_synchronize_rcu_expedited_full,
+ .poll_gp_state_exp = poll_state_synchronize_rcu,
+ .cond_sync_exp = cond_synchronize_rcu_expedited,
+ .cond_sync_exp_full = cond_synchronize_rcu_expedited_full,
+ .call = call_rcu_hurry,
+ .cb_barrier = rcu_barrier,
+ .fqs = rcu_force_quiescent_state,
+ .gp_kthread_dbg = show_rcu_gp_kthreads,
+ .check_boost_failed = rcu_check_boost_fail,
+ .stall_dur = rcu_jiffies_till_stall_check,
+ .get_gp_data = rcutorture_get_gp_data,
+ .gp_slow_register = rcu_gp_slow_register,
+ .gp_slow_unregister = rcu_gp_slow_unregister,
+ .reader_blocked = IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_CPU)
+ ? has_rcu_reader_blocked
+ : NULL,
+ .gather_gp_seqs = rcutorture_gather_gp_seqs,
+ .format_gp_seqs = rcutorture_format_gp_seqs,
+ .set_gpwrap_lag = rcu_set_gpwrap_lag,
+ .get_gpwrap_count = rcu_get_gpwrap_count,
+ .irq_capable = 1,
+ .can_boost = IS_ENABLED(CONFIG_RCU_BOOST),
+ .extendables = RCUTORTURE_MAX_EXTEND,
+ .debug_objects = 1,
+ .start_poll_irqsoff = 1,
+ .name = "rcu"
};
/*
@@ -447,7 +662,7 @@ static void synchronize_rcu_busted(void)
}
static void
-call_rcu_busted(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+call_rcu_busted(struct rcu_head *head, rcu_callback_t func)
{
/* This is a deliberate bug for testing purposes only! */
func(head);
@@ -459,17 +674,17 @@ static struct rcu_torture_ops rcu_busted_ops = {
.readlock = rcu_torture_read_lock,
.read_delay = rcu_read_delay, /* just reuse rcu's version. */
.readunlock = rcu_torture_read_unlock,
- .started = rcu_no_completed,
- .completed = rcu_no_completed,
+ .readlock_held = torture_readlock_not_held,
+ .get_gp_seq = rcu_no_completed,
.deferred_free = rcu_busted_torture_deferred_free,
.sync = synchronize_rcu_busted,
.exp_sync = synchronize_rcu_busted,
.call = call_rcu_busted,
- .cb_barrier = NULL,
- .fqs = NULL,
- .stats = NULL,
+ .gather_gp_seqs = rcutorture_gather_gp_seqs,
+ .format_gp_seqs = rcutorture_format_gp_seqs,
.irq_capable = 1,
- .name = "rcu_busted"
+ .extendables = RCUTORTURE_MAX_EXTEND,
+ .name = "busted"
};
/*
@@ -477,13 +692,69 @@ static struct rcu_torture_ops rcu_busted_ops = {
*/
DEFINE_STATIC_SRCU(srcu_ctl);
+DEFINE_STATIC_SRCU_FAST(srcu_ctlf);
+DEFINE_STATIC_SRCU_FAST_UPDOWN(srcu_ctlfud);
+static struct srcu_struct srcu_ctld;
+static struct srcu_struct *srcu_ctlp = &srcu_ctl;
+static struct rcu_torture_ops srcud_ops;
-static int srcu_torture_read_lock(void) __acquires(&srcu_ctl)
+static void srcu_torture_init(void)
{
- return srcu_read_lock(&srcu_ctl);
+ rcu_sync_torture_init();
+ if (!reader_flavor || (reader_flavor & SRCU_READ_FLAVOR_NORMAL))
+ VERBOSE_TOROUT_STRING("srcu_torture_init normal SRCU");
+ if (reader_flavor & SRCU_READ_FLAVOR_NMI)
+ VERBOSE_TOROUT_STRING("srcu_torture_init NMI-safe SRCU");
+ if (reader_flavor & SRCU_READ_FLAVOR_FAST) {
+ srcu_ctlp = &srcu_ctlf;
+ VERBOSE_TOROUT_STRING("srcu_torture_init fast SRCU");
+ }
+ if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN) {
+ srcu_ctlp = &srcu_ctlfud;
+ VERBOSE_TOROUT_STRING("srcu_torture_init fast-up/down SRCU");
+ }
+}
+
+static void srcu_get_gp_data(int *flags, unsigned long *gp_seq)
+{
+ srcutorture_get_gp_data(srcu_ctlp, flags, gp_seq);
}
-static void srcu_read_delay(struct torture_random_state *rrsp)
+static int srcu_torture_read_lock(void)
+{
+ int idx;
+ struct srcu_ctr __percpu *scp;
+ int ret = 0;
+
+ WARN_ON_ONCE(reader_flavor & ~SRCU_READ_FLAVOR_ALL);
+
+ if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || !(reader_flavor & SRCU_READ_FLAVOR_ALL)) {
+ idx = srcu_read_lock(srcu_ctlp);
+ WARN_ON_ONCE(idx & ~0x1);
+ ret += idx;
+ }
+ if (reader_flavor & SRCU_READ_FLAVOR_NMI) {
+ idx = srcu_read_lock_nmisafe(srcu_ctlp);
+ WARN_ON_ONCE(idx & ~0x1);
+ ret += idx << 1;
+ }
+ if (reader_flavor & SRCU_READ_FLAVOR_FAST) {
+ scp = srcu_read_lock_fast(srcu_ctlp);
+ idx = __srcu_ptr_to_ctr(srcu_ctlp, scp);
+ WARN_ON_ONCE(idx & ~0x1);
+ ret += idx << 2;
+ }
+ if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN) {
+ scp = srcu_read_lock_fast_updown(srcu_ctlp);
+ idx = __srcu_ptr_to_ctr(srcu_ctlp, scp);
+ WARN_ON_ONCE(idx & ~0x1);
+ ret += idx << 3;
+ }
+ return ret;
+}
+
+static void
+srcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp)
{
long delay;
const long uspertick = 1000000 / HZ;
@@ -493,119 +764,301 @@ static void srcu_read_delay(struct torture_random_state *rrsp)
delay = torture_random(rrsp) %
(nrealreaders * 2 * longdelay * uspertick);
- if (!delay)
+ if (!delay && in_task()) {
schedule_timeout_interruptible(longdelay);
- else
- rcu_read_delay(rrsp);
+ rtrsp->rt_delay_jiffies = longdelay;
+ } else {
+ rcu_read_delay(rrsp, rtrsp);
+ }
+}
+
+static void srcu_torture_read_unlock(int idx)
+{
+ WARN_ON_ONCE((reader_flavor && (idx & ~reader_flavor)) || (!reader_flavor && (idx & ~0x1)));
+ if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN)
+ srcu_read_unlock_fast_updown(srcu_ctlp,
+ __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x8) >> 3));
+ if (reader_flavor & SRCU_READ_FLAVOR_FAST)
+ srcu_read_unlock_fast(srcu_ctlp, __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x4) >> 2));
+ if (reader_flavor & SRCU_READ_FLAVOR_NMI)
+ srcu_read_unlock_nmisafe(srcu_ctlp, (idx & 0x2) >> 1);
+ if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || !(reader_flavor & SRCU_READ_FLAVOR_ALL))
+ srcu_read_unlock(srcu_ctlp, idx & 0x1);
}
-static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl)
+static int torture_srcu_read_lock_held(void)
{
- srcu_read_unlock(&srcu_ctl, idx);
+ return srcu_read_lock_held(srcu_ctlp);
+}
+
+static bool srcu_torture_have_up_down(void)
+{
+ int rf = reader_flavor;
+
+ if (!rf)
+ rf = SRCU_READ_FLAVOR_NORMAL;
+ return !!(cur_ops->have_up_down & rf);
+}
+
+static int srcu_torture_down_read(void)
+{
+ int idx;
+ struct srcu_ctr __percpu *scp;
+
+ WARN_ON_ONCE(reader_flavor & ~SRCU_READ_FLAVOR_ALL);
+ WARN_ON_ONCE(reader_flavor & (reader_flavor - 1));
+
+ if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || !(reader_flavor & SRCU_READ_FLAVOR_ALL)) {
+ idx = srcu_down_read(srcu_ctlp);
+ WARN_ON_ONCE(idx & ~0x1);
+ return idx;
+ }
+ if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN) {
+ scp = srcu_down_read_fast(srcu_ctlp);
+ idx = __srcu_ptr_to_ctr(srcu_ctlp, scp);
+ WARN_ON_ONCE(idx & ~0x1);
+ return idx << 3;
+ }
+ WARN_ON_ONCE(1);
+ return 0;
+}
+
+static void srcu_torture_up_read(int idx)
+{
+ WARN_ON_ONCE((reader_flavor && (idx & ~reader_flavor)) || (!reader_flavor && (idx & ~0x1)));
+ if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN)
+ srcu_up_read_fast(srcu_ctlp, __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x8) >> 3));
+ else if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) ||
+ !(reader_flavor & SRCU_READ_FLAVOR_ALL))
+ srcu_up_read(srcu_ctlp, idx & 0x1);
+ else
+ WARN_ON_ONCE(1);
}
static unsigned long srcu_torture_completed(void)
{
- return srcu_batches_completed(&srcu_ctl);
+ return srcu_batches_completed(srcu_ctlp);
}
static void srcu_torture_deferred_free(struct rcu_torture *rp)
{
- call_srcu(&srcu_ctl, &rp->rtort_rcu, rcu_torture_cb);
+ call_srcu(srcu_ctlp, &rp->rtort_rcu, rcu_torture_cb);
}
static void srcu_torture_synchronize(void)
{
- synchronize_srcu(&srcu_ctl);
+ synchronize_srcu(srcu_ctlp);
+}
+
+static unsigned long srcu_torture_get_gp_state(void)
+{
+ return get_state_synchronize_srcu(srcu_ctlp);
+}
+
+static unsigned long srcu_torture_start_gp_poll(void)
+{
+ return start_poll_synchronize_srcu(srcu_ctlp);
+}
+
+static bool srcu_torture_poll_gp_state(unsigned long oldstate)
+{
+ return poll_state_synchronize_srcu(srcu_ctlp, oldstate);
}
static void srcu_torture_call(struct rcu_head *head,
- void (*func)(struct rcu_head *head))
+ rcu_callback_t func)
{
- call_srcu(&srcu_ctl, head, func);
+ call_srcu(srcu_ctlp, head, func);
}
static void srcu_torture_barrier(void)
{
- srcu_barrier(&srcu_ctl);
+ srcu_barrier(srcu_ctlp);
}
static void srcu_torture_stats(void)
{
- int cpu;
- int idx = srcu_ctl.completed & 0x1;
-
- pr_alert("%s%s per-CPU(idx=%d):",
- torture_type, TORTURE_FLAG, idx);
- for_each_possible_cpu(cpu) {
- long c0, c1;
-
- c0 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx];
- c1 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx];
- pr_cont(" %d(%ld,%ld)", cpu, c0, c1);
- }
- pr_cont("\n");
+ srcu_torture_stats_print(srcu_ctlp, torture_type, TORTURE_FLAG);
}
static void srcu_torture_synchronize_expedited(void)
{
- synchronize_srcu_expedited(&srcu_ctl);
+ synchronize_srcu_expedited(srcu_ctlp);
+}
+
+static void srcu_torture_expedite_current(void)
+{
+ srcu_expedite_current(srcu_ctlp);
}
static struct rcu_torture_ops srcu_ops = {
.ttype = SRCU_FLAVOR,
- .init = rcu_sync_torture_init,
+ .init = srcu_torture_init,
.readlock = srcu_torture_read_lock,
.read_delay = srcu_read_delay,
.readunlock = srcu_torture_read_unlock,
- .started = NULL,
- .completed = srcu_torture_completed,
+ .down_read = srcu_torture_down_read,
+ .up_read = srcu_torture_up_read,
+ .readlock_held = torture_srcu_read_lock_held,
+ .get_gp_seq = srcu_torture_completed,
+ .gp_diff = rcu_seq_diff,
.deferred_free = srcu_torture_deferred_free,
.sync = srcu_torture_synchronize,
.exp_sync = srcu_torture_synchronize_expedited,
+ .exp_current = srcu_torture_expedite_current,
+ .same_gp_state = same_state_synchronize_srcu,
+ .get_comp_state = get_completed_synchronize_srcu,
+ .get_gp_state = srcu_torture_get_gp_state,
+ .start_gp_poll = srcu_torture_start_gp_poll,
+ .poll_gp_state = srcu_torture_poll_gp_state,
+ .poll_active = NUM_ACTIVE_SRCU_POLL_OLDSTATE,
.call = srcu_torture_call,
.cb_barrier = srcu_torture_barrier,
.stats = srcu_torture_stats,
+ .get_gp_data = srcu_get_gp_data,
+ .cbflood_max = 50000,
+ .irq_capable = 1,
+ .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU),
+ .debug_objects = 1,
+ .have_up_down = IS_ENABLED(CONFIG_TINY_SRCU)
+ ? 0 : SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_FAST_UPDOWN,
.name = "srcu"
};
+static void srcud_torture_init(void)
+{
+ rcu_sync_torture_init();
+ if (!reader_flavor || (reader_flavor & SRCU_READ_FLAVOR_NORMAL)) {
+ WARN_ON(init_srcu_struct(&srcu_ctld));
+ VERBOSE_TOROUT_STRING("srcud_torture_init normal SRCU");
+ } else if (reader_flavor & SRCU_READ_FLAVOR_NMI) {
+ WARN_ON(init_srcu_struct(&srcu_ctld));
+ VERBOSE_TOROUT_STRING("srcud_torture_init NMI-safe SRCU");
+ } else if (reader_flavor & SRCU_READ_FLAVOR_FAST) {
+ WARN_ON(init_srcu_struct_fast(&srcu_ctld));
+ VERBOSE_TOROUT_STRING("srcud_torture_init fast SRCU");
+ } else if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN) {
+ WARN_ON(init_srcu_struct_fast_updown(&srcu_ctld));
+ VERBOSE_TOROUT_STRING("srcud_torture_init fast-up/down SRCU");
+ } else {
+ WARN_ON(init_srcu_struct(&srcu_ctld));
+ }
+ srcu_ctlp = &srcu_ctld;
+}
+
+static void srcu_torture_cleanup(void)
+{
+ cleanup_srcu_struct(&srcu_ctld);
+ srcu_ctlp = &srcu_ctl; /* In case of a later rcutorture run. */
+}
+
+/* As above, but dynamically allocated. */
+static struct rcu_torture_ops srcud_ops = {
+ .ttype = SRCU_FLAVOR,
+ .init = srcud_torture_init,
+ .cleanup = srcu_torture_cleanup,
+ .readlock = srcu_torture_read_lock,
+ .read_delay = srcu_read_delay,
+ .readunlock = srcu_torture_read_unlock,
+ .readlock_held = torture_srcu_read_lock_held,
+ .down_read = srcu_torture_down_read,
+ .up_read = srcu_torture_up_read,
+ .get_gp_seq = srcu_torture_completed,
+ .gp_diff = rcu_seq_diff,
+ .deferred_free = srcu_torture_deferred_free,
+ .sync = srcu_torture_synchronize,
+ .exp_sync = srcu_torture_synchronize_expedited,
+ .exp_current = srcu_torture_expedite_current,
+ .same_gp_state = same_state_synchronize_srcu,
+ .get_comp_state = get_completed_synchronize_srcu,
+ .get_gp_state = srcu_torture_get_gp_state,
+ .start_gp_poll = srcu_torture_start_gp_poll,
+ .poll_gp_state = srcu_torture_poll_gp_state,
+ .poll_active = NUM_ACTIVE_SRCU_POLL_OLDSTATE,
+ .call = srcu_torture_call,
+ .cb_barrier = srcu_torture_barrier,
+ .stats = srcu_torture_stats,
+ .get_gp_data = srcu_get_gp_data,
+ .cbflood_max = 50000,
+ .irq_capable = 1,
+ .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU),
+ .debug_objects = 1,
+ .have_up_down = IS_ENABLED(CONFIG_TINY_SRCU)
+ ? 0 : SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_FAST_UPDOWN,
+ .name = "srcud"
+};
+
+/* As above, but broken due to inappropriate reader extension. */
+static struct rcu_torture_ops busted_srcud_ops = {
+ .ttype = SRCU_FLAVOR,
+ .init = srcu_torture_init,
+ .cleanup = srcu_torture_cleanup,
+ .readlock = srcu_torture_read_lock,
+ .read_delay = rcu_read_delay,
+ .readunlock = srcu_torture_read_unlock,
+ .readlock_held = torture_srcu_read_lock_held,
+ .get_gp_seq = srcu_torture_completed,
+ .deferred_free = srcu_torture_deferred_free,
+ .sync = srcu_torture_synchronize,
+ .exp_sync = srcu_torture_synchronize_expedited,
+ .call = srcu_torture_call,
+ .cb_barrier = srcu_torture_barrier,
+ .stats = srcu_torture_stats,
+ .irq_capable = 1,
+ .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU),
+ .extendables = RCUTORTURE_MAX_EXTEND,
+ .name = "busted_srcud"
+};
+
/*
- * Definitions for sched torture testing.
+ * Definitions for trivial CONFIG_PREEMPT=n-only torture testing.
+ * This implementation does not work well with CPU hotplug nor
+ * with rcutorture's shuffling.
*/
-static int sched_torture_read_lock(void)
+static void synchronize_rcu_trivial(void)
{
- preempt_disable();
- return 0;
+ int cpu;
+
+ for_each_online_cpu(cpu) {
+ torture_sched_setaffinity(current->pid, cpumask_of(cpu), true);
+ WARN_ON_ONCE(raw_smp_processor_id() != cpu);
+ }
}
-static void sched_torture_read_unlock(int idx)
+static void rcu_sync_torture_init_trivial(void)
{
- preempt_enable();
+ rcu_sync_torture_init();
+ // if (onoff_interval || shuffle_interval) {
+ if (WARN_ONCE(onoff_interval || shuffle_interval, "%s: Non-zero onoff_interval (%d) or shuffle_interval (%d) breaks trivial RCU, resetting to zero", __func__, onoff_interval, shuffle_interval)) {
+ onoff_interval = 0;
+ shuffle_interval = 0;
+ }
}
-static void rcu_sched_torture_deferred_free(struct rcu_torture *p)
+static int rcu_torture_read_lock_trivial(void)
{
- call_rcu_sched(&p->rtort_rcu, rcu_torture_cb);
+ preempt_disable();
+ return 0;
}
-static struct rcu_torture_ops sched_ops = {
- .ttype = RCU_SCHED_FLAVOR,
- .init = rcu_sync_torture_init,
- .readlock = sched_torture_read_lock,
+static void rcu_torture_read_unlock_trivial(int idx)
+{
+ preempt_enable();
+}
+
+static struct rcu_torture_ops trivial_ops = {
+ .ttype = RCU_TRIVIAL_FLAVOR,
+ .init = rcu_sync_torture_init_trivial,
+ .readlock = rcu_torture_read_lock_trivial,
.read_delay = rcu_read_delay, /* just reuse rcu's version. */
- .readunlock = sched_torture_read_unlock,
- .started = rcu_batches_started_sched,
- .completed = rcu_batches_completed_sched,
- .deferred_free = rcu_sched_torture_deferred_free,
- .sync = synchronize_sched,
- .exp_sync = synchronize_sched_expedited,
- .call = call_rcu_sched,
- .cb_barrier = rcu_barrier_sched,
- .fqs = rcu_sched_force_quiescent_state,
- .stats = NULL,
+ .readunlock = rcu_torture_read_unlock_trivial,
+ .readlock_held = torture_readlock_not_held,
+ .get_gp_seq = rcu_no_completed,
+ .sync = synchronize_rcu_trivial,
+ .exp_sync = synchronize_rcu_trivial,
.irq_capable = 1,
- .name = "sched"
+ .name = "trivial"
};
#ifdef CONFIG_TASKS_RCU
@@ -628,105 +1081,281 @@ static void rcu_tasks_torture_deferred_free(struct rcu_torture *p)
call_rcu_tasks(&p->rtort_rcu, rcu_torture_cb);
}
+static void synchronize_rcu_mult_test(void)
+{
+ synchronize_rcu_mult(call_rcu_tasks, call_rcu_hurry);
+}
+
static struct rcu_torture_ops tasks_ops = {
.ttype = RCU_TASKS_FLAVOR,
.init = rcu_sync_torture_init,
.readlock = tasks_torture_read_lock,
.read_delay = rcu_read_delay, /* just reuse rcu's version. */
.readunlock = tasks_torture_read_unlock,
- .started = rcu_no_completed,
- .completed = rcu_no_completed,
+ .get_gp_seq = rcu_no_completed,
.deferred_free = rcu_tasks_torture_deferred_free,
.sync = synchronize_rcu_tasks,
- .exp_sync = synchronize_rcu_tasks,
+ .exp_sync = synchronize_rcu_mult_test,
.call = call_rcu_tasks,
.cb_barrier = rcu_barrier_tasks,
- .fqs = NULL,
- .stats = NULL,
+ .gp_kthread_dbg = show_rcu_tasks_classic_gp_kthread,
+ .get_gp_data = rcu_tasks_get_gp_data,
.irq_capable = 1,
+ .slow_gps = 1,
.name = "tasks"
};
-#define RCUTORTURE_TASKS_OPS &tasks_ops,
+#define TASKS_OPS &tasks_ops,
+
+#else // #ifdef CONFIG_TASKS_RCU
+
+#define TASKS_OPS
-#else /* #ifdef CONFIG_TASKS_RCU */
+#endif // #else #ifdef CONFIG_TASKS_RCU
-#define RCUTORTURE_TASKS_OPS
-#endif /* #else #ifdef CONFIG_TASKS_RCU */
+#ifdef CONFIG_TASKS_RUDE_RCU
/*
- * RCU torture priority-boost testing. Runs one real-time thread per
- * CPU for moderate bursts, repeatedly registering RCU callbacks and
- * spinning waiting for them to be invoked. If a given callback takes
- * too long to be invoked, we assume that priority inversion has occurred.
+ * Definitions for rude RCU-tasks torture testing.
*/
-struct rcu_boost_inflight {
- struct rcu_head rcu;
- int inflight;
+static struct rcu_torture_ops tasks_rude_ops = {
+ .ttype = RCU_TASKS_RUDE_FLAVOR,
+ .init = rcu_sync_torture_init,
+ .readlock = rcu_torture_read_lock_trivial,
+ .read_delay = rcu_read_delay, /* just reuse rcu's version. */
+ .readunlock = rcu_torture_read_unlock_trivial,
+ .get_gp_seq = rcu_no_completed,
+ .sync = synchronize_rcu_tasks_rude,
+ .exp_sync = synchronize_rcu_tasks_rude,
+ .gp_kthread_dbg = show_rcu_tasks_rude_gp_kthread,
+ .get_gp_data = rcu_tasks_rude_get_gp_data,
+ .cbflood_max = 50000,
+ .irq_capable = 1,
+ .name = "tasks-rude"
};
-static void rcu_torture_boost_cb(struct rcu_head *head)
+#define TASKS_RUDE_OPS &tasks_rude_ops,
+
+#else // #ifdef CONFIG_TASKS_RUDE_RCU
+
+#define TASKS_RUDE_OPS
+
+#endif // #else #ifdef CONFIG_TASKS_RUDE_RCU
+
+
+#ifdef CONFIG_TASKS_TRACE_RCU
+
+/*
+ * Definitions for tracing RCU-tasks torture testing.
+ */
+
+static int tasks_tracing_torture_read_lock(void)
{
- struct rcu_boost_inflight *rbip =
- container_of(head, struct rcu_boost_inflight, rcu);
+ rcu_read_lock_trace();
+ return 0;
+}
+
+static void tasks_tracing_torture_read_unlock(int idx)
+{
+ rcu_read_unlock_trace();
+}
+
+static void rcu_tasks_tracing_torture_deferred_free(struct rcu_torture *p)
+{
+ call_rcu_tasks_trace(&p->rtort_rcu, rcu_torture_cb);
+}
- smp_mb(); /* Ensure RCU-core accesses precede clearing ->inflight */
- rbip->inflight = 0;
+static struct rcu_torture_ops tasks_tracing_ops = {
+ .ttype = RCU_TASKS_TRACING_FLAVOR,
+ .init = rcu_sync_torture_init,
+ .readlock = tasks_tracing_torture_read_lock,
+ .read_delay = srcu_read_delay, /* just reuse srcu's version. */
+ .readunlock = tasks_tracing_torture_read_unlock,
+ .readlock_held = rcu_read_lock_trace_held,
+ .get_gp_seq = rcu_no_completed,
+ .deferred_free = rcu_tasks_tracing_torture_deferred_free,
+ .sync = synchronize_rcu_tasks_trace,
+ .exp_sync = synchronize_rcu_tasks_trace,
+ .call = call_rcu_tasks_trace,
+ .cb_barrier = rcu_barrier_tasks_trace,
+ .gp_kthread_dbg = show_rcu_tasks_trace_gp_kthread,
+ .get_gp_data = rcu_tasks_trace_get_gp_data,
+ .cbflood_max = 50000,
+ .irq_capable = 1,
+ .slow_gps = 1,
+ .name = "tasks-tracing"
+};
+
+#define TASKS_TRACING_OPS &tasks_tracing_ops,
+
+#else // #ifdef CONFIG_TASKS_TRACE_RCU
+
+#define TASKS_TRACING_OPS
+
+#endif // #else #ifdef CONFIG_TASKS_TRACE_RCU
+
+
+static unsigned long rcutorture_seq_diff(unsigned long new, unsigned long old)
+{
+ if (!cur_ops->gp_diff)
+ return new - old;
+ return cur_ops->gp_diff(new, old);
+}
+
+/*
+ * RCU torture priority-boost testing. Runs one real-time thread per
+ * CPU for moderate bursts, repeatedly starting grace periods and waiting
+ * for them to complete. If a given grace period takes too long, we assume
+ * that priority inversion has occurred.
+ */
+
+static int old_rt_runtime = -1;
+
+static void rcu_torture_disable_rt_throttle(void)
+{
+ /*
+ * Disable RT throttling so that rcutorture's boost threads don't get
+ * throttled. Only possible if rcutorture is built-in otherwise the
+ * user should manually do this by setting the sched_rt_period_us and
+ * sched_rt_runtime sysctls.
+ */
+ if (!IS_BUILTIN(CONFIG_RCU_TORTURE_TEST) || old_rt_runtime != -1)
+ return;
+
+ old_rt_runtime = sysctl_sched_rt_runtime;
+ sysctl_sched_rt_runtime = -1;
+}
+
+static void rcu_torture_enable_rt_throttle(void)
+{
+ if (!IS_BUILTIN(CONFIG_RCU_TORTURE_TEST) || old_rt_runtime == -1)
+ return;
+
+ sysctl_sched_rt_runtime = old_rt_runtime;
+ old_rt_runtime = -1;
+}
+
+static bool rcu_torture_boost_failed(unsigned long gp_state, unsigned long *start)
+{
+ int cpu;
+ static int dbg_done;
+ unsigned long end = jiffies;
+ bool gp_done;
+ unsigned long j;
+ static unsigned long last_persist;
+ unsigned long lp;
+ unsigned long mininterval = test_boost_duration * HZ - HZ / 2;
+
+ if (end - *start > mininterval) {
+ // Recheck after checking time to avoid false positives.
+ smp_mb(); // Time check before grace-period check.
+ if (cur_ops->poll_gp_state(gp_state))
+ return false; // passed, though perhaps just barely
+ if (cur_ops->check_boost_failed && !cur_ops->check_boost_failed(gp_state, &cpu)) {
+ // At most one persisted message per boost test.
+ j = jiffies;
+ lp = READ_ONCE(last_persist);
+ if (time_after(j, lp + mininterval) &&
+ cmpxchg(&last_persist, lp, j) == lp) {
+ if (cpu < 0)
+ pr_info("Boost inversion persisted: QS from all CPUs\n");
+ else
+ pr_info("Boost inversion persisted: No QS from CPU %d\n", cpu);
+ }
+ return false; // passed on a technicality
+ }
+ VERBOSE_TOROUT_STRING("rcu_torture_boost boosting failed");
+ n_rcu_torture_boost_failure++;
+ if (!xchg(&dbg_done, 1) && cur_ops->gp_kthread_dbg) {
+ pr_info("Boost inversion thread ->rt_priority %u gp_state %lu jiffies %lu\n",
+ current->rt_priority, gp_state, end - *start);
+ cur_ops->gp_kthread_dbg();
+ // Recheck after print to flag grace period ending during splat.
+ gp_done = cur_ops->poll_gp_state(gp_state);
+ pr_info("Boost inversion: GP %lu %s.\n", gp_state,
+ gp_done ? "ended already" : "still pending");
+
+ }
+
+ return true; // failed
+ } else if (cur_ops->check_boost_failed && !cur_ops->check_boost_failed(gp_state, NULL)) {
+ *start = jiffies;
+ }
+
+ return false; // passed
}
static int rcu_torture_boost(void *arg)
{
- unsigned long call_rcu_time;
unsigned long endtime;
+ unsigned long gp_state;
+ unsigned long gp_state_time;
unsigned long oldstarttime;
- struct rcu_boost_inflight rbi = { .inflight = 0 };
- struct sched_param sp;
+ unsigned long booststarttime = get_torture_init_jiffies() + test_boost_holdoff * HZ;
- VERBOSE_TOROUT_STRING("rcu_torture_boost started");
+ if (test_boost_holdoff <= 0 || time_after(jiffies, booststarttime)) {
+ VERBOSE_TOROUT_STRING("rcu_torture_boost started");
+ } else {
+ VERBOSE_TOROUT_STRING("rcu_torture_boost started holdoff period");
+ while (time_before(jiffies, booststarttime)) {
+ schedule_timeout_idle(HZ);
+ if (kthread_should_stop())
+ goto cleanup;
+ }
+ VERBOSE_TOROUT_STRING("rcu_torture_boost finished holdoff period");
+ }
/* Set real-time priority. */
- sp.sched_priority = 1;
- if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) {
- VERBOSE_TOROUT_STRING("rcu_torture_boost RT prio failed!");
- n_rcu_torture_boost_rterror++;
- }
+ sched_set_fifo_low(current);
- init_rcu_head_on_stack(&rbi.rcu);
/* Each pass through the following loop does one boost-test cycle. */
do {
+ bool failed = false; // Test failed already in this test interval
+ bool gp_initiated = false;
+
+ if (kthread_should_stop())
+ goto checkwait;
+
/* Wait for the next test interval. */
- oldstarttime = boost_starttime;
- while (ULONG_CMP_LT(jiffies, oldstarttime)) {
+ oldstarttime = READ_ONCE(boost_starttime);
+ while (time_before(jiffies, oldstarttime)) {
schedule_timeout_interruptible(oldstarttime - jiffies);
- stutter_wait("rcu_torture_boost");
+ if (stutter_wait("rcu_torture_boost"))
+ sched_set_fifo_low(current);
if (torture_must_stop())
goto checkwait;
}
- /* Do one boost-test interval. */
+ // Do one boost-test interval.
endtime = oldstarttime + test_boost_duration * HZ;
- call_rcu_time = jiffies;
- while (ULONG_CMP_LT(jiffies, endtime)) {
- /* If we don't have a callback in flight, post one. */
- if (!rbi.inflight) {
- smp_mb(); /* RCU core before ->inflight = 1. */
- rbi.inflight = 1;
- call_rcu(&rbi.rcu, rcu_torture_boost_cb);
- if (jiffies - call_rcu_time >
- test_boost_duration * HZ - HZ / 2) {
- VERBOSE_TOROUT_STRING("rcu_torture_boost boosting failed");
- n_rcu_torture_boost_failure++;
- }
- call_rcu_time = jiffies;
+ while (time_before(jiffies, endtime)) {
+ // Has current GP gone too long?
+ if (gp_initiated && !failed && !cur_ops->poll_gp_state(gp_state))
+ failed = rcu_torture_boost_failed(gp_state, &gp_state_time);
+ // If we don't have a grace period in flight, start one.
+ if (!gp_initiated || cur_ops->poll_gp_state(gp_state)) {
+ gp_state = cur_ops->start_gp_poll();
+ gp_initiated = true;
+ gp_state_time = jiffies;
+ }
+ if (stutter_wait("rcu_torture_boost")) {
+ sched_set_fifo_low(current);
+ // If the grace period already ended,
+ // we don't know when that happened, so
+ // start over.
+ if (cur_ops->poll_gp_state(gp_state))
+ gp_initiated = false;
}
- cond_resched_rcu_qs();
- stutter_wait("rcu_torture_boost");
if (torture_must_stop())
goto checkwait;
}
+ // In case the grace period extended beyond the end of the loop.
+ if (gp_initiated && !failed && !cur_ops->poll_gp_state(gp_state))
+ rcu_torture_boost_failed(gp_state, &gp_state_time);
+
/*
* Set the start time of the next test interval.
* Yes, this is vulnerable to long delays, but such
@@ -734,86 +1363,34 @@ static int rcu_torture_boost(void *arg)
* interval. Besides, we are running at RT priority,
* so delays should be relatively rare.
*/
- while (oldstarttime == boost_starttime &&
- !kthread_should_stop()) {
+ while (oldstarttime == READ_ONCE(boost_starttime) && !kthread_should_stop()) {
if (mutex_trylock(&boost_mutex)) {
- boost_starttime = jiffies +
- test_boost_interval * HZ;
- n_rcu_torture_boosts++;
+ if (oldstarttime == boost_starttime) {
+ WRITE_ONCE(boost_starttime,
+ jiffies + test_boost_interval * HZ);
+ n_rcu_torture_boosts++;
+ }
mutex_unlock(&boost_mutex);
break;
}
- schedule_timeout_uninterruptible(1);
+ schedule_timeout_uninterruptible(HZ / 20);
}
/* Go do the stutter. */
-checkwait: stutter_wait("rcu_torture_boost");
+checkwait: if (stutter_wait("rcu_torture_boost"))
+ sched_set_fifo_low(current);
} while (!torture_must_stop());
+cleanup:
/* Clean up and exit. */
- while (!kthread_should_stop() || rbi.inflight) {
+ while (!kthread_should_stop()) {
torture_shutdown_absorb("rcu_torture_boost");
- schedule_timeout_uninterruptible(1);
+ schedule_timeout_uninterruptible(HZ / 20);
}
- smp_mb(); /* order accesses to ->inflight before stack-frame death. */
- destroy_rcu_head_on_stack(&rbi.rcu);
torture_kthread_stopping("rcu_torture_boost");
return 0;
}
-static void rcu_torture_cbflood_cb(struct rcu_head *rhp)
-{
-}
-
-/*
- * RCU torture callback-flood kthread. Repeatedly induces bursts of calls
- * to call_rcu() or analogous, increasing the probability of occurrence
- * of callback-overflow corner cases.
- */
-static int
-rcu_torture_cbflood(void *arg)
-{
- int err = 1;
- int i;
- int j;
- struct rcu_head *rhp;
-
- if (cbflood_n_per_burst > 0 &&
- cbflood_inter_holdoff > 0 &&
- cbflood_intra_holdoff > 0 &&
- cur_ops->call &&
- cur_ops->cb_barrier) {
- rhp = vmalloc(sizeof(*rhp) *
- cbflood_n_burst * cbflood_n_per_burst);
- err = !rhp;
- }
- if (err) {
- VERBOSE_TOROUT_STRING("rcu_torture_cbflood disabled: Bad args or OOM");
- while (!torture_must_stop())
- schedule_timeout_interruptible(HZ);
- return 0;
- }
- VERBOSE_TOROUT_STRING("rcu_torture_cbflood task started");
- do {
- schedule_timeout_interruptible(cbflood_inter_holdoff);
- atomic_long_inc(&n_cbfloods);
- WARN_ON(signal_pending(current));
- for (i = 0; i < cbflood_n_burst; i++) {
- for (j = 0; j < cbflood_n_per_burst; j++) {
- cur_ops->call(&rhp[i * cbflood_n_per_burst + j],
- rcu_torture_cbflood_cb);
- }
- schedule_timeout_interruptible(cbflood_intra_holdoff);
- WARN_ON(signal_pending(current));
- }
- cur_ops->cb_barrier();
- stutter_wait("rcu_torture_cbflood");
- } while (!torture_must_stop());
- vfree(rhp);
- torture_kthread_stopping("rcu_torture_cbflood");
- return 0;
-}
-
/*
* RCU torture force-quiescent-state kthread. Repeatedly induces
* bursts of calls to force_quiescent_state(), increasing the probability
@@ -824,13 +1401,14 @@ rcu_torture_fqs(void *arg)
{
unsigned long fqs_resume_time;
int fqs_burst_remaining;
+ int oldnice = task_nice(current);
VERBOSE_TOROUT_STRING("rcu_torture_fqs task started");
do {
fqs_resume_time = jiffies + fqs_stutter * HZ;
- while (ULONG_CMP_LT(jiffies, fqs_resume_time) &&
+ while (time_before(jiffies, fqs_resume_time) &&
!kthread_should_stop()) {
- schedule_timeout_interruptible(1);
+ schedule_timeout_interruptible(HZ / 20);
}
fqs_burst_remaining = fqs_duration;
while (fqs_burst_remaining > 0 &&
@@ -839,12 +1417,159 @@ rcu_torture_fqs(void *arg)
udelay(fqs_holdoff);
fqs_burst_remaining -= fqs_holdoff;
}
- stutter_wait("rcu_torture_fqs");
+ if (stutter_wait("rcu_torture_fqs"))
+ sched_set_normal(current, oldnice);
} while (!torture_must_stop());
torture_kthread_stopping("rcu_torture_fqs");
return 0;
}
+// Used by writers to randomly choose from the available grace-period primitives.
+static int synctype[ARRAY_SIZE(rcu_torture_writer_state_names)] = { };
+static int nsynctypes;
+
+/*
+ * Determine which grace-period primitives are available.
+ */
+static void rcu_torture_write_types(void)
+{
+ bool gp_cond1 = gp_cond, gp_cond_exp1 = gp_cond_exp, gp_cond_full1 = gp_cond_full;
+ bool gp_cond_exp_full1 = gp_cond_exp_full, gp_exp1 = gp_exp, gp_poll_exp1 = gp_poll_exp;
+ bool gp_poll_exp_full1 = gp_poll_exp_full, gp_normal1 = gp_normal, gp_poll1 = gp_poll;
+ bool gp_poll_full1 = gp_poll_full, gp_sync1 = gp_sync;
+
+ /* Initialize synctype[] array. If none set, take default. */
+ if (!gp_cond1 &&
+ !gp_cond_exp1 &&
+ !gp_cond_full1 &&
+ !gp_cond_exp_full1 &&
+ !gp_exp1 &&
+ !gp_poll_exp1 &&
+ !gp_poll_exp_full1 &&
+ !gp_normal1 &&
+ !gp_poll1 &&
+ !gp_poll_full1 &&
+ !gp_sync1) {
+ gp_cond1 = true;
+ gp_cond_exp1 = true;
+ gp_cond_full1 = true;
+ gp_cond_exp_full1 = true;
+ gp_exp1 = true;
+ gp_poll_exp1 = true;
+ gp_poll_exp_full1 = true;
+ gp_normal1 = true;
+ gp_poll1 = true;
+ gp_poll_full1 = true;
+ gp_sync1 = true;
+ }
+ if (gp_cond1 && cur_ops->get_gp_state && cur_ops->cond_sync) {
+ synctype[nsynctypes++] = RTWS_COND_GET;
+ pr_info("%s: Testing conditional GPs.\n", __func__);
+ } else if (gp_cond && (!cur_ops->get_gp_state || !cur_ops->cond_sync)) {
+ pr_alert("%s: gp_cond without primitives.\n", __func__);
+ }
+ if (gp_cond_exp1 && cur_ops->get_gp_state_exp && cur_ops->cond_sync_exp) {
+ synctype[nsynctypes++] = RTWS_COND_GET_EXP;
+ pr_info("%s: Testing conditional expedited GPs.\n", __func__);
+ } else if (gp_cond_exp && (!cur_ops->get_gp_state_exp || !cur_ops->cond_sync_exp)) {
+ pr_alert("%s: gp_cond_exp without primitives.\n", __func__);
+ }
+ if (gp_cond_full1 && cur_ops->get_gp_state && cur_ops->cond_sync_full) {
+ synctype[nsynctypes++] = RTWS_COND_GET_FULL;
+ pr_info("%s: Testing conditional full-state GPs.\n", __func__);
+ } else if (gp_cond_full && (!cur_ops->get_gp_state || !cur_ops->cond_sync_full)) {
+ pr_alert("%s: gp_cond_full without primitives.\n", __func__);
+ }
+ if (gp_cond_exp_full1 && cur_ops->get_gp_state_exp && cur_ops->cond_sync_exp_full) {
+ synctype[nsynctypes++] = RTWS_COND_GET_EXP_FULL;
+ pr_info("%s: Testing conditional full-state expedited GPs.\n", __func__);
+ } else if (gp_cond_exp_full &&
+ (!cur_ops->get_gp_state_exp || !cur_ops->cond_sync_exp_full)) {
+ pr_alert("%s: gp_cond_exp_full without primitives.\n", __func__);
+ }
+ if (gp_exp1 && cur_ops->exp_sync) {
+ synctype[nsynctypes++] = RTWS_EXP_SYNC;
+ pr_info("%s: Testing expedited GPs.\n", __func__);
+ } else if (gp_exp && !cur_ops->exp_sync) {
+ pr_alert("%s: gp_exp without primitives.\n", __func__);
+ }
+ if (gp_normal1 && cur_ops->deferred_free) {
+ synctype[nsynctypes++] = RTWS_DEF_FREE;
+ pr_info("%s: Testing asynchronous GPs.\n", __func__);
+ } else if (gp_normal && !cur_ops->deferred_free) {
+ pr_alert("%s: gp_normal without primitives.\n", __func__);
+ }
+ if (gp_poll1 && cur_ops->get_comp_state && cur_ops->same_gp_state &&
+ cur_ops->start_gp_poll && cur_ops->poll_gp_state) {
+ synctype[nsynctypes++] = RTWS_POLL_GET;
+ pr_info("%s: Testing polling GPs.\n", __func__);
+ } else if (gp_poll && (!cur_ops->start_gp_poll || !cur_ops->poll_gp_state)) {
+ pr_alert("%s: gp_poll without primitives.\n", __func__);
+ }
+ if (gp_poll_full1 && cur_ops->get_comp_state_full && cur_ops->same_gp_state_full
+ && cur_ops->start_gp_poll_full && cur_ops->poll_gp_state_full) {
+ synctype[nsynctypes++] = RTWS_POLL_GET_FULL;
+ pr_info("%s: Testing polling full-state GPs.\n", __func__);
+ } else if (gp_poll_full && (!cur_ops->start_gp_poll_full || !cur_ops->poll_gp_state_full)) {
+ pr_alert("%s: gp_poll_full without primitives.\n", __func__);
+ }
+ if (gp_poll_exp1 && cur_ops->start_gp_poll_exp && cur_ops->poll_gp_state_exp) {
+ synctype[nsynctypes++] = RTWS_POLL_GET_EXP;
+ pr_info("%s: Testing polling expedited GPs.\n", __func__);
+ } else if (gp_poll_exp && (!cur_ops->start_gp_poll_exp || !cur_ops->poll_gp_state_exp)) {
+ pr_alert("%s: gp_poll_exp without primitives.\n", __func__);
+ }
+ if (gp_poll_exp_full1 && cur_ops->start_gp_poll_exp_full && cur_ops->poll_gp_state_full) {
+ synctype[nsynctypes++] = RTWS_POLL_GET_EXP_FULL;
+ pr_info("%s: Testing polling full-state expedited GPs.\n", __func__);
+ } else if (gp_poll_exp_full &&
+ (!cur_ops->start_gp_poll_exp_full || !cur_ops->poll_gp_state_full)) {
+ pr_alert("%s: gp_poll_exp_full without primitives.\n", __func__);
+ }
+ if (gp_sync1 && cur_ops->sync) {
+ synctype[nsynctypes++] = RTWS_SYNC;
+ pr_info("%s: Testing normal GPs.\n", __func__);
+ } else if (gp_sync && !cur_ops->sync) {
+ pr_alert("%s: gp_sync without primitives.\n", __func__);
+ }
+ pr_alert("%s: Testing %d update types.\n", __func__, nsynctypes);
+ pr_info("%s: gp_cond_wi %d gp_cond_wi_exp %d gp_poll_wi %d gp_poll_wi_exp %d\n", __func__, gp_cond_wi, gp_cond_wi_exp, gp_poll_wi, gp_poll_wi_exp);
+}
+
+/*
+ * Do the specified rcu_torture_writer() synchronous grace period,
+ * while also testing out the polled APIs. Note well that the single-CPU
+ * grace-period optimizations must be accounted for.
+ */
+static void do_rtws_sync(struct torture_random_state *trsp, void (*sync)(void))
+{
+ unsigned long cookie;
+ struct rcu_gp_oldstate cookie_full;
+ bool dopoll;
+ bool dopoll_full;
+ unsigned long r = torture_random(trsp);
+
+ dopoll = cur_ops->get_gp_state && cur_ops->poll_gp_state && !(r & 0x300);
+ dopoll_full = cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full && !(r & 0xc00);
+ if (dopoll || dopoll_full)
+ cpus_read_lock();
+ if (dopoll)
+ cookie = cur_ops->get_gp_state();
+ if (dopoll_full)
+ cur_ops->get_gp_state_full(&cookie_full);
+ if (cur_ops->poll_need_2gp && cur_ops->poll_need_2gp(dopoll, dopoll_full))
+ sync();
+ sync();
+ WARN_ONCE(dopoll && !cur_ops->poll_gp_state(cookie),
+ "%s: Cookie check 3 failed %pS() online %*pbl.",
+ __func__, sync, cpumask_pr_args(cpu_online_mask));
+ WARN_ONCE(dopoll_full && !cur_ops->poll_gp_state_full(&cookie_full),
+ "%s: Cookie check 4 failed %pS() online %*pbl",
+ __func__, sync, cpumask_pr_args(cpu_online_mask));
+ if (dopoll || dopoll_full)
+ cpus_read_unlock();
+}
+
/*
* RCU torture writer kthread. Repeatedly substitutes a new structure
* for that pointed to by rcu_torture_current, freeing the old structure
@@ -853,48 +1578,40 @@ rcu_torture_fqs(void *arg)
static int
rcu_torture_writer(void *arg)
{
- bool can_expedite = !rcu_gp_is_expedited();
+ bool booting_still = false;
+ bool can_expedite = !rcu_gp_is_expedited() && !rcu_gp_is_normal();
+ unsigned long cookie;
+ struct rcu_gp_oldstate cookie_full;
int expediting = 0;
unsigned long gp_snap;
- bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal;
- bool gp_sync1 = gp_sync;
+ unsigned long gp_snap1;
+ struct rcu_gp_oldstate gp_snap_full;
+ struct rcu_gp_oldstate gp_snap1_full;
int i;
+ int idx;
+ unsigned long j;
+ int oldnice = task_nice(current);
+ struct rcu_gp_oldstate *rgo = NULL;
+ int rgo_size = 0;
struct rcu_torture *rp;
struct rcu_torture *old_rp;
static DEFINE_TORTURE_RANDOM(rand);
- int synctype[] = { RTWS_DEF_FREE, RTWS_EXP_SYNC,
- RTWS_COND_GET, RTWS_SYNC };
- int nsynctypes = 0;
-
+ unsigned long stallsdone = jiffies;
+ bool stutter_waited;
+ unsigned long *ulo = NULL;
+ int ulo_size = 0;
+
+ // If a new stall test is added, this must be adjusted.
+ if (stall_cpu_holdoff + stall_gp_kthread + stall_cpu)
+ stallsdone += (stall_cpu_holdoff + stall_gp_kthread + stall_cpu + 60) *
+ HZ * (stall_cpu_repeat + 1);
VERBOSE_TOROUT_STRING("rcu_torture_writer task started");
- pr_alert("%s" TORTURE_FLAG
- " Grace periods expedited from boot/sysfs for %s,\n",
- torture_type, cur_ops->name);
- pr_alert("%s" TORTURE_FLAG
- " Testing of dynamic grace-period expediting diabled.\n",
- torture_type);
-
- /* Initialize synctype[] array. If none set, take default. */
- if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync1)
- gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true;
- if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync)
- synctype[nsynctypes++] = RTWS_COND_GET;
- else if (gp_cond && (!cur_ops->get_state || !cur_ops->cond_sync))
- pr_alert("rcu_torture_writer: gp_cond without primitives.\n");
- if (gp_exp1 && cur_ops->exp_sync)
- synctype[nsynctypes++] = RTWS_EXP_SYNC;
- else if (gp_exp && !cur_ops->exp_sync)
- pr_alert("rcu_torture_writer: gp_exp without primitives.\n");
- if (gp_normal1 && cur_ops->deferred_free)
- synctype[nsynctypes++] = RTWS_DEF_FREE;
- else if (gp_normal && !cur_ops->deferred_free)
- pr_alert("rcu_torture_writer: gp_normal without primitives.\n");
- if (gp_sync1 && cur_ops->sync)
- synctype[nsynctypes++] = RTWS_SYNC;
- else if (gp_sync && !cur_ops->sync)
- pr_alert("rcu_torture_writer: gp_sync without primitives.\n");
+ if (!can_expedite)
+ pr_alert("%s" TORTURE_FLAG
+ " GP expediting controlled from boot/sysfs for %s.\n",
+ torture_type, cur_ops->name);
if (WARN_ONCE(nsynctypes == 0,
- "rcu_torture_writer: No update-side primitives.\n")) {
+ "%s: No update-side primitives.\n", __func__)) {
/*
* No updates primitives, so don't try updating.
* The resulting test won't be testing much, hence the
@@ -902,15 +1619,37 @@ rcu_torture_writer(void *arg)
*/
rcu_torture_writer_state = RTWS_STOPPING;
torture_kthread_stopping("rcu_torture_writer");
+ return 0;
+ }
+ if (cur_ops->poll_active > 0) {
+ ulo = kcalloc(cur_ops->poll_active, sizeof(*ulo), GFP_KERNEL);
+ if (!WARN_ON(!ulo))
+ ulo_size = cur_ops->poll_active;
+ }
+ if (cur_ops->poll_active_full > 0) {
+ rgo = kcalloc(cur_ops->poll_active_full, sizeof(*rgo), GFP_KERNEL);
+ if (!WARN_ON(!rgo))
+ rgo_size = cur_ops->poll_active_full;
}
+ // If the system is still booting, let it finish.
+ j = jiffies;
+ while (!torture_must_stop() && !rcu_inkernel_boot_has_ended()) {
+ booting_still = true;
+ schedule_timeout_interruptible(HZ);
+ }
+ if (booting_still)
+ pr_alert("%s" TORTURE_FLAG " Waited %lu jiffies for boot to complete.\n",
+ torture_type, jiffies - j);
+
do {
rcu_torture_writer_state = RTWS_FIXED_DELAY;
- schedule_timeout_uninterruptible(1);
+ torture_hrtimeout_us(500, 1000, &rand);
rp = rcu_torture_alloc();
if (rp == NULL)
continue;
rp->rtort_pipe_count = 0;
+ ASSERT_EXCLUSIVE_WRITER(rp->rtort_pipe_count);
rcu_torture_writer_state = RTWS_DELAY;
udelay(torture_random(&rand) & 0x3ff);
rcu_torture_writer_state = RTWS_REPLACE;
@@ -924,7 +1663,41 @@ rcu_torture_writer(void *arg)
if (i > RCU_TORTURE_PIPE_LEN)
i = RCU_TORTURE_PIPE_LEN;
atomic_inc(&rcu_torture_wcount[i]);
- old_rp->rtort_pipe_count++;
+ WRITE_ONCE(old_rp->rtort_pipe_count,
+ old_rp->rtort_pipe_count + 1);
+ ASSERT_EXCLUSIVE_WRITER(old_rp->rtort_pipe_count);
+
+ // Make sure readers block polled grace periods.
+ if (cur_ops->get_gp_state && cur_ops->poll_gp_state) {
+ idx = cur_ops->readlock();
+ cookie = cur_ops->get_gp_state();
+ WARN_ONCE(cur_ops->poll_gp_state(cookie),
+ "%s: Cookie check 1 failed %s(%d) %lu->%lu\n",
+ __func__,
+ rcu_torture_writer_state_getname(),
+ rcu_torture_writer_state,
+ cookie, cur_ops->get_gp_state());
+ if (cur_ops->get_comp_state) {
+ cookie = cur_ops->get_comp_state();
+ WARN_ON_ONCE(!cur_ops->poll_gp_state(cookie));
+ }
+ cur_ops->readunlock(idx);
+ }
+ if (cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full) {
+ idx = cur_ops->readlock();
+ cur_ops->get_gp_state_full(&cookie_full);
+ WARN_ONCE(cur_ops->poll_gp_state_full(&cookie_full),
+ "%s: Cookie check 5 failed %s(%d) online %*pbl\n",
+ __func__,
+ rcu_torture_writer_state_getname(),
+ rcu_torture_writer_state,
+ cpumask_pr_args(cpu_online_mask));
+ if (cur_ops->get_comp_state_full) {
+ cur_ops->get_comp_state_full(&cookie_full);
+ WARN_ON_ONCE(!cur_ops->poll_gp_state_full(&cookie_full));
+ }
+ cur_ops->readunlock(idx);
+ }
switch (synctype[torture_random(&rand) % nsynctypes]) {
case RTWS_DEF_FREE:
rcu_torture_writer_state = RTWS_DEF_FREE;
@@ -932,23 +1705,111 @@ rcu_torture_writer(void *arg)
break;
case RTWS_EXP_SYNC:
rcu_torture_writer_state = RTWS_EXP_SYNC;
- cur_ops->exp_sync();
+ do_rtws_sync(&rand, cur_ops->exp_sync);
rcu_torture_pipe_update(old_rp);
break;
case RTWS_COND_GET:
rcu_torture_writer_state = RTWS_COND_GET;
- gp_snap = cur_ops->get_state();
- i = torture_random(&rand) % 16;
- if (i != 0)
- schedule_timeout_interruptible(i);
- udelay(torture_random(&rand) % 1000);
+ gp_snap = cur_ops->get_gp_state();
+ torture_hrtimeout_us(torture_random(&rand) % gp_cond_wi,
+ 1000, &rand);
rcu_torture_writer_state = RTWS_COND_SYNC;
cur_ops->cond_sync(gp_snap);
rcu_torture_pipe_update(old_rp);
break;
+ case RTWS_COND_GET_EXP:
+ rcu_torture_writer_state = RTWS_COND_GET_EXP;
+ gp_snap = cur_ops->get_gp_state_exp();
+ torture_hrtimeout_us(torture_random(&rand) % gp_cond_wi_exp,
+ 1000, &rand);
+ rcu_torture_writer_state = RTWS_COND_SYNC_EXP;
+ cur_ops->cond_sync_exp(gp_snap);
+ rcu_torture_pipe_update(old_rp);
+ break;
+ case RTWS_COND_GET_FULL:
+ rcu_torture_writer_state = RTWS_COND_GET_FULL;
+ cur_ops->get_gp_state_full(&gp_snap_full);
+ torture_hrtimeout_us(torture_random(&rand) % gp_cond_wi,
+ 1000, &rand);
+ rcu_torture_writer_state = RTWS_COND_SYNC_FULL;
+ cur_ops->cond_sync_full(&gp_snap_full);
+ rcu_torture_pipe_update(old_rp);
+ break;
+ case RTWS_COND_GET_EXP_FULL:
+ rcu_torture_writer_state = RTWS_COND_GET_EXP_FULL;
+ cur_ops->get_gp_state_full(&gp_snap_full);
+ torture_hrtimeout_us(torture_random(&rand) % gp_cond_wi_exp,
+ 1000, &rand);
+ rcu_torture_writer_state = RTWS_COND_SYNC_EXP_FULL;
+ cur_ops->cond_sync_exp_full(&gp_snap_full);
+ rcu_torture_pipe_update(old_rp);
+ break;
+ case RTWS_POLL_GET:
+ rcu_torture_writer_state = RTWS_POLL_GET;
+ for (i = 0; i < ulo_size; i++)
+ ulo[i] = cur_ops->get_comp_state();
+ gp_snap = cur_ops->start_gp_poll();
+ rcu_torture_writer_state = RTWS_POLL_WAIT;
+ if (cur_ops->exp_current && !torture_random(&rand) % 0xff)
+ cur_ops->exp_current();
+ while (!cur_ops->poll_gp_state(gp_snap)) {
+ gp_snap1 = cur_ops->get_gp_state();
+ for (i = 0; i < ulo_size; i++)
+ if (cur_ops->poll_gp_state(ulo[i]) ||
+ cur_ops->same_gp_state(ulo[i], gp_snap1)) {
+ ulo[i] = gp_snap1;
+ break;
+ }
+ WARN_ON_ONCE(ulo_size > 0 && i >= ulo_size);
+ torture_hrtimeout_us(torture_random(&rand) % gp_poll_wi,
+ 1000, &rand);
+ }
+ rcu_torture_pipe_update(old_rp);
+ break;
+ case RTWS_POLL_GET_FULL:
+ rcu_torture_writer_state = RTWS_POLL_GET_FULL;
+ for (i = 0; i < rgo_size; i++)
+ cur_ops->get_comp_state_full(&rgo[i]);
+ cur_ops->start_gp_poll_full(&gp_snap_full);
+ rcu_torture_writer_state = RTWS_POLL_WAIT_FULL;
+ if (cur_ops->exp_current && !torture_random(&rand) % 0xff)
+ cur_ops->exp_current();
+ while (!cur_ops->poll_gp_state_full(&gp_snap_full)) {
+ cur_ops->get_gp_state_full(&gp_snap1_full);
+ for (i = 0; i < rgo_size; i++)
+ if (cur_ops->poll_gp_state_full(&rgo[i]) ||
+ cur_ops->same_gp_state_full(&rgo[i],
+ &gp_snap1_full)) {
+ rgo[i] = gp_snap1_full;
+ break;
+ }
+ WARN_ON_ONCE(rgo_size > 0 && i >= rgo_size);
+ torture_hrtimeout_us(torture_random(&rand) % gp_poll_wi,
+ 1000, &rand);
+ }
+ rcu_torture_pipe_update(old_rp);
+ break;
+ case RTWS_POLL_GET_EXP:
+ rcu_torture_writer_state = RTWS_POLL_GET_EXP;
+ gp_snap = cur_ops->start_gp_poll_exp();
+ rcu_torture_writer_state = RTWS_POLL_WAIT_EXP;
+ while (!cur_ops->poll_gp_state_exp(gp_snap))
+ torture_hrtimeout_us(torture_random(&rand) % gp_poll_wi_exp,
+ 1000, &rand);
+ rcu_torture_pipe_update(old_rp);
+ break;
+ case RTWS_POLL_GET_EXP_FULL:
+ rcu_torture_writer_state = RTWS_POLL_GET_EXP_FULL;
+ cur_ops->start_gp_poll_exp_full(&gp_snap_full);
+ rcu_torture_writer_state = RTWS_POLL_WAIT_EXP_FULL;
+ while (!cur_ops->poll_gp_state_full(&gp_snap_full))
+ torture_hrtimeout_us(torture_random(&rand) % gp_poll_wi_exp,
+ 1000, &rand);
+ rcu_torture_pipe_update(old_rp);
+ break;
case RTWS_SYNC:
rcu_torture_writer_state = RTWS_SYNC;
- cur_ops->sync();
+ do_rtws_sync(&rand, cur_ops->sync);
rcu_torture_pipe_update(old_rp);
break;
default:
@@ -956,7 +1817,8 @@ rcu_torture_writer(void *arg)
break;
}
}
- rcutorture_record_progress(++rcu_torture_current_version);
+ WRITE_ONCE(rcu_torture_current_version,
+ rcu_torture_current_version + 1);
/* Cycle through nesting levels of rcu_expedite_gp() calls. */
if (can_expedite &&
!(torture_random(&rand) & 0xff & (!!expediting - 1))) {
@@ -967,16 +1829,43 @@ rcu_torture_writer(void *arg)
rcu_unexpedite_gp();
if (++expediting > 3)
expediting = -expediting;
+ } else if (!can_expedite) { /* Disabled during boot, recheck. */
+ can_expedite = !rcu_gp_is_expedited() &&
+ !rcu_gp_is_normal();
}
rcu_torture_writer_state = RTWS_STUTTER;
- stutter_wait("rcu_torture_writer");
+ stutter_waited = stutter_wait("rcu_torture_writer");
+ if (stutter_waited &&
+ !atomic_read(&rcu_fwd_cb_nodelay) &&
+ !cur_ops->slow_gps &&
+ !torture_must_stop() &&
+ time_after(jiffies, stallsdone))
+ for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++)
+ if (list_empty(&rcu_tortures[i].rtort_free) &&
+ rcu_access_pointer(rcu_torture_current) != &rcu_tortures[i]) {
+ tracing_off();
+ if (cur_ops->gp_kthread_dbg)
+ cur_ops->gp_kthread_dbg();
+ WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count);
+ rcu_ftrace_dump(DUMP_ALL);
+ break;
+ }
+ if (stutter_waited)
+ sched_set_normal(current, oldnice);
} while (!torture_must_stop());
+ rcu_torture_current = NULL; // Let stats task know that we are done.
/* Reset expediting back to unexpedited. */
if (expediting > 0)
expediting = -expediting;
while (can_expedite && expediting++ < 0)
rcu_unexpedite_gp();
WARN_ON_ONCE(can_expedite && rcu_gp_is_expedited());
+ if (!can_expedite)
+ pr_alert("%s" TORTURE_FLAG
+ " Dynamic grace-period expediting was disabled.\n",
+ torture_type);
+ kfree(ulo);
+ kfree(rgo);
rcu_torture_writer_state = RTWS_STOPPING;
torture_kthread_stopping("rcu_torture_writer");
return 0;
@@ -989,26 +1878,99 @@ rcu_torture_writer(void *arg)
static int
rcu_torture_fakewriter(void *arg)
{
+ unsigned long gp_snap;
+ struct rcu_gp_oldstate gp_snap_full;
DEFINE_TORTURE_RANDOM(rand);
VERBOSE_TOROUT_STRING("rcu_torture_fakewriter task started");
set_user_nice(current, MAX_NICE);
+ if (WARN_ONCE(nsynctypes == 0,
+ "%s: No update-side primitives.\n", __func__)) {
+ /*
+ * No updates primitives, so don't try updating.
+ * The resulting test won't be testing much, hence the
+ * above WARN_ONCE().
+ */
+ torture_kthread_stopping("rcu_torture_fakewriter");
+ return 0;
+ }
+
do {
- schedule_timeout_uninterruptible(1 + torture_random(&rand)%10);
- udelay(torture_random(&rand) & 0x3ff);
+ torture_hrtimeout_jiffies(torture_random(&rand) % 10, &rand);
if (cur_ops->cb_barrier != NULL &&
- torture_random(&rand) % (nfakewriters * 8) == 0) {
+ torture_random(&rand) % (nrealfakewriters * 8) == 0) {
cur_ops->cb_barrier();
- } else if (gp_normal == gp_exp) {
- if (torture_random(&rand) & 0x80)
- cur_ops->sync();
- else
- cur_ops->exp_sync();
- } else if (gp_normal) {
- cur_ops->sync();
} else {
- cur_ops->exp_sync();
+ switch (synctype[torture_random(&rand) % nsynctypes]) {
+ case RTWS_DEF_FREE:
+ break;
+ case RTWS_EXP_SYNC:
+ cur_ops->exp_sync();
+ break;
+ case RTWS_COND_GET:
+ gp_snap = cur_ops->get_gp_state();
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
+ cur_ops->cond_sync(gp_snap);
+ break;
+ case RTWS_COND_GET_EXP:
+ gp_snap = cur_ops->get_gp_state_exp();
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
+ cur_ops->cond_sync_exp(gp_snap);
+ break;
+ case RTWS_COND_GET_FULL:
+ cur_ops->get_gp_state_full(&gp_snap_full);
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
+ cur_ops->cond_sync_full(&gp_snap_full);
+ break;
+ case RTWS_COND_GET_EXP_FULL:
+ cur_ops->get_gp_state_full(&gp_snap_full);
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
+ cur_ops->cond_sync_exp_full(&gp_snap_full);
+ break;
+ case RTWS_POLL_GET:
+ if (cur_ops->start_poll_irqsoff)
+ local_irq_disable();
+ gp_snap = cur_ops->start_gp_poll();
+ if (cur_ops->start_poll_irqsoff)
+ local_irq_enable();
+ while (!cur_ops->poll_gp_state(gp_snap)) {
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16,
+ &rand);
+ }
+ break;
+ case RTWS_POLL_GET_FULL:
+ if (cur_ops->start_poll_irqsoff)
+ local_irq_disable();
+ cur_ops->start_gp_poll_full(&gp_snap_full);
+ if (cur_ops->start_poll_irqsoff)
+ local_irq_enable();
+ while (!cur_ops->poll_gp_state_full(&gp_snap_full)) {
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16,
+ &rand);
+ }
+ break;
+ case RTWS_POLL_GET_EXP:
+ gp_snap = cur_ops->start_gp_poll_exp();
+ while (!cur_ops->poll_gp_state_exp(gp_snap)) {
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16,
+ &rand);
+ }
+ break;
+ case RTWS_POLL_GET_EXP_FULL:
+ cur_ops->start_gp_poll_exp_full(&gp_snap_full);
+ while (!cur_ops->poll_gp_state_full(&gp_snap_full)) {
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16,
+ &rand);
+ }
+ break;
+ case RTWS_SYNC:
+ cur_ops->sync();
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
}
stutter_wait("rcu_torture_fakewriter");
} while (!torture_must_stop());
@@ -1017,78 +1979,492 @@ rcu_torture_fakewriter(void *arg)
return 0;
}
-static void rcutorture_trace_dump(void)
+static void rcu_torture_timer_cb(struct rcu_head *rhp)
+{
+ kfree(rhp);
+}
+
+// Set up and carry out testing of RCU's global memory ordering
+static void rcu_torture_reader_do_mbchk(long myid, struct rcu_torture *rtp,
+ struct torture_random_state *trsp)
+{
+ unsigned long loops;
+ int noc = torture_num_online_cpus();
+ int rdrchked;
+ int rdrchker;
+ struct rcu_torture_reader_check *rtrcp; // Me.
+ struct rcu_torture_reader_check *rtrcp_assigner; // Assigned us to do checking.
+ struct rcu_torture_reader_check *rtrcp_chked; // Reader being checked.
+ struct rcu_torture_reader_check *rtrcp_chker; // Reader doing checking when not me.
+
+ if (myid < 0)
+ return; // Don't try this from timer handlers.
+
+ // Increment my counter.
+ rtrcp = &rcu_torture_reader_mbchk[myid];
+ WRITE_ONCE(rtrcp->rtc_myloops, rtrcp->rtc_myloops + 1);
+
+ // Attempt to assign someone else some checking work.
+ rdrchked = torture_random(trsp) % nrealreaders;
+ rtrcp_chked = &rcu_torture_reader_mbchk[rdrchked];
+ rdrchker = torture_random(trsp) % nrealreaders;
+ rtrcp_chker = &rcu_torture_reader_mbchk[rdrchker];
+ if (rdrchked != myid && rdrchked != rdrchker && noc >= rdrchked && noc >= rdrchker &&
+ smp_load_acquire(&rtrcp->rtc_chkrdr) < 0 && // Pairs with smp_store_release below.
+ !READ_ONCE(rtp->rtort_chkp) &&
+ !smp_load_acquire(&rtrcp_chker->rtc_assigner)) { // Pairs with smp_store_release below.
+ rtrcp->rtc_chkloops = READ_ONCE(rtrcp_chked->rtc_myloops);
+ WARN_ON_ONCE(rtrcp->rtc_chkrdr >= 0);
+ rtrcp->rtc_chkrdr = rdrchked;
+ WARN_ON_ONCE(rtrcp->rtc_ready); // This gets set after the grace period ends.
+ if (cmpxchg_relaxed(&rtrcp_chker->rtc_assigner, NULL, rtrcp) ||
+ cmpxchg_relaxed(&rtp->rtort_chkp, NULL, rtrcp))
+ (void)cmpxchg_relaxed(&rtrcp_chker->rtc_assigner, rtrcp, NULL); // Back out.
+ }
+
+ // If assigned some completed work, do it!
+ rtrcp_assigner = READ_ONCE(rtrcp->rtc_assigner);
+ if (!rtrcp_assigner || !smp_load_acquire(&rtrcp_assigner->rtc_ready))
+ return; // No work or work not yet ready.
+ rdrchked = rtrcp_assigner->rtc_chkrdr;
+ if (WARN_ON_ONCE(rdrchked < 0))
+ return;
+ rtrcp_chked = &rcu_torture_reader_mbchk[rdrchked];
+ loops = READ_ONCE(rtrcp_chked->rtc_myloops);
+ atomic_inc(&n_rcu_torture_mbchk_tries);
+ if (ULONG_CMP_LT(loops, rtrcp_assigner->rtc_chkloops))
+ atomic_inc(&n_rcu_torture_mbchk_fail);
+ rtrcp_assigner->rtc_chkloops = loops + ULONG_MAX / 2;
+ rtrcp_assigner->rtc_ready = 0;
+ smp_store_release(&rtrcp->rtc_assigner, NULL); // Someone else can assign us work.
+ smp_store_release(&rtrcp_assigner->rtc_chkrdr, -1); // Assigner can again assign.
+}
+
+// Verify the specified RCUTORTURE_RDR* state.
+#define ROEC_ARGS "%s %s: Current %#x To add %#x To remove %#x preempt_count() %#x\n", __func__, s, curstate, new, old, preempt_count()
+static void rcutorture_one_extend_check(char *s, int curstate, int new, int old)
{
- static atomic_t beenhere = ATOMIC_INIT(0);
+ int mask;
- if (atomic_read(&beenhere))
+ if (!IS_ENABLED(CONFIG_RCU_TORTURE_TEST_CHK_RDR_STATE) || in_nmi())
return;
- if (atomic_xchg(&beenhere, 1) != 0)
+
+ WARN_ONCE(!(curstate & RCUTORTURE_RDR_IRQ) && irqs_disabled() && !in_hardirq(), ROEC_ARGS);
+ WARN_ONCE((curstate & RCUTORTURE_RDR_IRQ) && !irqs_disabled(), ROEC_ARGS);
+
+ // If CONFIG_PREEMPT_COUNT=n, further checks are unreliable.
+ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
+ return;
+
+ WARN_ONCE((curstate & (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH)) &&
+ !softirq_count(), ROEC_ARGS);
+ WARN_ONCE((curstate & (RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED)) &&
+ !(preempt_count() & PREEMPT_MASK), ROEC_ARGS);
+ WARN_ONCE(cur_ops->readlock_nesting &&
+ (curstate & (RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2)) &&
+ cur_ops->readlock_nesting() == 0, ROEC_ARGS);
+
+ // Interrupt handlers have all sorts of stuff disabled, so ignore
+ // unintended disabling.
+ if (in_serving_softirq() || in_hardirq())
return;
- ftrace_dump(DUMP_ALL);
+
+ WARN_ONCE(cur_ops->extendables &&
+ !(curstate & (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH)) &&
+ softirq_count(), ROEC_ARGS);
+
+ /*
+ * non-preemptible RCU in a preemptible kernel uses preempt_disable()
+ * as rcu_read_lock().
+ */
+ mask = RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED;
+ if (!IS_ENABLED(CONFIG_PREEMPT_RCU))
+ mask |= RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2;
+
+ WARN_ONCE(cur_ops->extendables && !(curstate & mask) &&
+ (preempt_count() & PREEMPT_MASK), ROEC_ARGS);
+
+ /*
+ * non-preemptible RCU in a preemptible kernel uses "preempt_count() &
+ * PREEMPT_MASK" as ->readlock_nesting().
+ */
+ mask = RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2;
+ if (!IS_ENABLED(CONFIG_PREEMPT_RCU))
+ mask |= RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED;
+
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && softirq_count())
+ mask |= RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH;
+
+ WARN_ONCE(cur_ops->readlock_nesting && !(curstate & mask) &&
+ cur_ops->readlock_nesting() > 0, ROEC_ARGS);
}
/*
- * RCU torture reader from timer handler. Dereferences rcu_torture_current,
- * incrementing the corresponding element of the pipeline array. The
- * counter in the element should never be greater than 1, otherwise, the
- * RCU implementation is broken.
+ * Do one extension of an RCU read-side critical section using the
+ * current reader state in readstate (set to zero for initial entry
+ * to extended critical section), set the new state as specified by
+ * newstate (set to zero for final exit from extended critical section),
+ * and random-number-generator state in trsp. If this is neither the
+ * beginning or end of the critical section and if there was actually a
+ * change, do a ->read_delay().
*/
-static void rcu_torture_timer(unsigned long unused)
+static void rcutorture_one_extend(int *readstate, int newstate, struct torture_random_state *trsp,
+ struct rt_read_seg *rtrsp)
{
- int idx;
+ bool first;
+ unsigned long flags;
+ int idxnew1 = -1;
+ int idxnew2 = -1;
+ int idxold1 = *readstate;
+ int idxold2 = idxold1;
+ int statesnew = ~*readstate & newstate;
+ int statesold = *readstate & ~newstate;
+
+ first = idxold1 == 0;
+ WARN_ON_ONCE(idxold2 < 0);
+ WARN_ON_ONCE(idxold2 & ~(RCUTORTURE_RDR_ALLBITS | RCUTORTURE_RDR_UPDOWN));
+ rcutorture_one_extend_check("before change", idxold1, statesnew, statesold);
+ rtrsp->rt_readstate = newstate;
+
+ /* First, put new protection in place to avoid critical-section gap. */
+ if (statesnew & RCUTORTURE_RDR_BH)
+ local_bh_disable();
+ if (statesnew & RCUTORTURE_RDR_RBH)
+ rcu_read_lock_bh();
+ if (statesnew & RCUTORTURE_RDR_IRQ)
+ local_irq_disable();
+ if (statesnew & RCUTORTURE_RDR_PREEMPT)
+ preempt_disable();
+ if (statesnew & RCUTORTURE_RDR_SCHED)
+ rcu_read_lock_sched();
+ if (statesnew & RCUTORTURE_RDR_RCU_1)
+ idxnew1 = (cur_ops->readlock() << RCUTORTURE_RDR_SHIFT_1) & RCUTORTURE_RDR_MASK_1;
+ if (statesnew & RCUTORTURE_RDR_RCU_2)
+ idxnew2 = (cur_ops->readlock() << RCUTORTURE_RDR_SHIFT_2) & RCUTORTURE_RDR_MASK_2;
+
+ // Complain unless both the old and the new protection is in place.
+ rcutorture_one_extend_check("during change", idxold1 | statesnew, statesnew, statesold);
+
+ // Sample CPU under both sets of protections to reduce confusion.
+ if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_CPU)) {
+ int cpu = raw_smp_processor_id();
+ rtrsp->rt_cpu = cpu;
+ if (!first) {
+ rtrsp[-1].rt_end_cpu = cpu;
+ if (cur_ops->reader_blocked)
+ rtrsp[-1].rt_preempted = cur_ops->reader_blocked();
+ }
+ }
+ // Sample grace-period sequence number, as good a place as any.
+ if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_GP) && cur_ops->gather_gp_seqs) {
+ rtrsp->rt_gp_seq = cur_ops->gather_gp_seqs();
+ rtrsp->rt_ts = ktime_get_mono_fast_ns();
+ if (!first)
+ rtrsp[-1].rt_gp_seq_end = rtrsp->rt_gp_seq;
+ }
+
+ /*
+ * Next, remove old protection, in decreasing order of strength
+ * to avoid unlock paths that aren't safe in the stronger
+ * context. Namely: BH can not be enabled with disabled interrupts.
+ * Additionally PREEMPT_RT requires that BH is enabled in preemptible
+ * context.
+ */
+ if (statesold & RCUTORTURE_RDR_IRQ)
+ local_irq_enable();
+ if (statesold & RCUTORTURE_RDR_PREEMPT)
+ preempt_enable();
+ if (statesold & RCUTORTURE_RDR_SCHED)
+ rcu_read_unlock_sched();
+ if (statesold & RCUTORTURE_RDR_BH)
+ local_bh_enable();
+ if (statesold & RCUTORTURE_RDR_RBH)
+ rcu_read_unlock_bh();
+ if (statesold & RCUTORTURE_RDR_RCU_2) {
+ cur_ops->readunlock((idxold2 & RCUTORTURE_RDR_MASK_2) >> RCUTORTURE_RDR_SHIFT_2);
+ WARN_ON_ONCE(idxnew2 != -1);
+ idxold2 = 0;
+ }
+ if (statesold & RCUTORTURE_RDR_RCU_1) {
+ bool lockit;
+
+ lockit = !cur_ops->no_pi_lock && !statesnew && !(torture_random(trsp) & 0xffff);
+ if (lockit)
+ raw_spin_lock_irqsave(&current->pi_lock, flags);
+ cur_ops->readunlock((idxold1 & RCUTORTURE_RDR_MASK_1) >> RCUTORTURE_RDR_SHIFT_1);
+ WARN_ON_ONCE(idxnew1 != -1);
+ idxold1 = 0;
+ if (lockit)
+ raw_spin_unlock_irqrestore(&current->pi_lock, flags);
+ }
+ if (statesold & RCUTORTURE_RDR_UPDOWN) {
+ cur_ops->up_read((idxold1 & RCUTORTURE_RDR_MASK_1) >> RCUTORTURE_RDR_SHIFT_1);
+ WARN_ON_ONCE(idxnew1 != -1);
+ idxold1 = 0;
+ }
+
+ /* Delay if neither beginning nor end and there was a change. */
+ if ((statesnew || statesold) && *readstate && newstate)
+ cur_ops->read_delay(trsp, rtrsp);
+
+ /* Update the reader state. */
+ if (idxnew1 == -1)
+ idxnew1 = idxold1 & RCUTORTURE_RDR_MASK_1;
+ WARN_ON_ONCE(idxnew1 < 0);
+ if (idxnew2 == -1)
+ idxnew2 = idxold2 & RCUTORTURE_RDR_MASK_2;
+ WARN_ON_ONCE(idxnew2 < 0);
+ *readstate = idxnew1 | idxnew2 | newstate;
+ WARN_ON_ONCE(*readstate < 0);
+ if (WARN_ON_ONCE(*readstate & ~RCUTORTURE_RDR_ALLBITS))
+ pr_info("Unexpected readstate value of %#x\n", *readstate);
+ rcutorture_one_extend_check("after change", *readstate, statesnew, statesold);
+}
+
+/* Return the biggest extendables mask given current RCU and boot parameters. */
+static int rcutorture_extend_mask_max(void)
+{
+ int mask;
+
+ WARN_ON_ONCE(extendables & ~RCUTORTURE_MAX_EXTEND);
+ mask = extendables & RCUTORTURE_MAX_EXTEND & cur_ops->extendables;
+ mask = mask | RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2;
+ return mask;
+}
+
+/* Return a random protection state mask, but with at least one bit set. */
+static int
+rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp)
+{
+ int mask = rcutorture_extend_mask_max();
+ unsigned long randmask1 = torture_random(trsp);
+ unsigned long randmask2 = randmask1 >> 3;
+ unsigned long preempts = RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED;
+ unsigned long preempts_irq = preempts | RCUTORTURE_RDR_IRQ;
+ unsigned long bhs = RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH;
+
+ WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT_1); // Can't have reader idx bits.
+ /* Mostly only one bit (need preemption!), sometimes lots of bits. */
+ if (!(randmask1 & 0x7))
+ mask = mask & randmask2;
+ else
+ mask = mask & (1 << (randmask2 % RCUTORTURE_RDR_NBITS));
+
+ // Can't have nested RCU reader without outer RCU reader.
+ if (!(mask & RCUTORTURE_RDR_RCU_1) && (mask & RCUTORTURE_RDR_RCU_2)) {
+ if (oldmask & RCUTORTURE_RDR_RCU_1)
+ mask &= ~RCUTORTURE_RDR_RCU_2;
+ else
+ mask |= RCUTORTURE_RDR_RCU_1;
+ }
+
+ /*
+ * Can't enable bh w/irq disabled.
+ */
+ if (mask & RCUTORTURE_RDR_IRQ)
+ mask |= oldmask & bhs;
+
+ /*
+ * Ideally these sequences would be detected in debug builds
+ * (regardless of RT), but until then don't stop testing
+ * them on non-RT.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ /* Can't modify BH in atomic context */
+ if (oldmask & preempts_irq)
+ mask &= ~bhs;
+ if ((oldmask | mask) & preempts_irq)
+ mask |= oldmask & bhs;
+ }
+
+ return mask ?: RCUTORTURE_RDR_RCU_1;
+}
+
+/*
+ * Do a randomly selected number of extensions of an existing RCU read-side
+ * critical section.
+ */
+static struct rt_read_seg *
+rcutorture_loop_extend(int *readstate, struct torture_random_state *trsp, struct rt_read_seg *rtrsp)
+{
+ int i;
+ int j;
+ int mask = rcutorture_extend_mask_max();
+
+ WARN_ON_ONCE(!*readstate); /* -Existing- RCU read-side critsect! */
+ if (!((mask - 1) & mask))
+ return rtrsp; /* Current RCU reader not extendable. */
+ /* Bias towards larger numbers of loops. */
+ i = torture_random(trsp);
+ i = ((i | (i >> 3)) & RCUTORTURE_RDR_MAX_LOOPS) + 1;
+ for (j = 0; j < i; j++) {
+ mask = rcutorture_extend_mask(*readstate, trsp);
+ WARN_ON_ONCE(mask & RCUTORTURE_RDR_UPDOWN);
+ rcutorture_one_extend(readstate, mask, trsp, &rtrsp[j]);
+ }
+ return &rtrsp[j];
+}
+
+struct rcu_torture_one_read_state {
+ bool checkpolling;
+ unsigned long cookie;
+ struct rcu_gp_oldstate cookie_full;
unsigned long started;
- unsigned long completed;
- static DEFINE_TORTURE_RANDOM(rand);
- static DEFINE_SPINLOCK(rand_lock);
struct rcu_torture *p;
- int pipe_count;
+ int readstate;
+ struct rt_read_seg rtseg[RCUTORTURE_RDR_MAX_SEGS];
+ struct rt_read_seg *rtrsp;
unsigned long long ts;
+};
- idx = cur_ops->readlock();
- if (cur_ops->started)
- started = cur_ops->started();
- else
- started = cur_ops->completed();
- ts = rcu_trace_clock_local();
- p = rcu_dereference_check(rcu_torture_current,
- rcu_read_lock_bh_held() ||
- rcu_read_lock_sched_held() ||
- srcu_read_lock_held(&srcu_ctl));
- if (p == NULL) {
- /* Leave because rcu_torture_writer is not yet underway */
- cur_ops->readunlock(idx);
- return;
+static void init_rcu_torture_one_read_state(struct rcu_torture_one_read_state *rtorsp,
+ struct torture_random_state *trsp)
+{
+ memset(rtorsp, 0, sizeof(*rtorsp));
+ rtorsp->checkpolling = !(torture_random(trsp) & 0xfff);
+ rtorsp->rtrsp = &rtorsp->rtseg[0];
+}
+
+/*
+ * Set up the first segment of a series of overlapping read-side
+ * critical sections. The caller must have actually initiated the
+ * outermost read-side critical section.
+ */
+static bool rcu_torture_one_read_start(struct rcu_torture_one_read_state *rtorsp,
+ struct torture_random_state *trsp, long myid)
+{
+ if (rtorsp->checkpolling) {
+ if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
+ rtorsp->cookie = cur_ops->get_gp_state();
+ if (cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full)
+ cur_ops->get_gp_state_full(&rtorsp->cookie_full);
+ }
+ rtorsp->started = cur_ops->get_gp_seq();
+ rtorsp->ts = rcu_trace_clock_local();
+ rtorsp->p = rcu_dereference_check(rcu_torture_current,
+ !cur_ops->readlock_held || cur_ops->readlock_held() ||
+ (rtorsp->readstate & RCUTORTURE_RDR_UPDOWN));
+ if (rtorsp->p == NULL) {
+ /* Wait for rcu_torture_writer to get underway */
+ rcutorture_one_extend(&rtorsp->readstate, 0, trsp, rtorsp->rtrsp);
+ return false;
}
- if (p->rtort_mbtest == 0)
+ if (rtorsp->p->rtort_mbtest == 0)
atomic_inc(&n_rcu_torture_mberror);
- spin_lock(&rand_lock);
- cur_ops->read_delay(&rand);
- n_rcu_torture_timers++;
- spin_unlock(&rand_lock);
+ rcu_torture_reader_do_mbchk(myid, rtorsp->p, trsp);
+ return true;
+}
+
+/*
+ * Complete the last segment of a series of overlapping read-side
+ * critical sections and check for errors.
+ */
+static void rcu_torture_one_read_end(struct rcu_torture_one_read_state *rtorsp,
+ struct torture_random_state *trsp)
+{
+ int i;
+ unsigned long completed;
+ int pipe_count;
+ bool preempted = false;
+ struct rt_read_seg *rtrsp1;
+
preempt_disable();
- pipe_count = p->rtort_pipe_count;
+ pipe_count = READ_ONCE(rtorsp->p->rtort_pipe_count);
if (pipe_count > RCU_TORTURE_PIPE_LEN) {
- /* Should not happen, but... */
+ // Should not happen in a correct RCU implementation,
+ // happens quite often for torture_type=busted.
pipe_count = RCU_TORTURE_PIPE_LEN;
}
- completed = cur_ops->completed();
+ completed = cur_ops->get_gp_seq();
if (pipe_count > 1) {
- do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts,
- started, completed);
- rcutorture_trace_dump();
+ do_trace_rcu_torture_read(cur_ops->name, &rtorsp->p->rtort_rcu,
+ rtorsp->ts, rtorsp->started, completed);
+ rcu_ftrace_dump(DUMP_ALL);
}
__this_cpu_inc(rcu_torture_count[pipe_count]);
- completed = completed - started;
- if (cur_ops->started)
- completed++;
+ completed = rcutorture_seq_diff(completed, rtorsp->started);
if (completed > RCU_TORTURE_PIPE_LEN) {
/* Should not happen, but... */
completed = RCU_TORTURE_PIPE_LEN;
}
__this_cpu_inc(rcu_torture_batch[completed]);
preempt_enable();
- cur_ops->readunlock(idx);
+ if (rtorsp->checkpolling) {
+ if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
+ WARN_ONCE(cur_ops->poll_gp_state(rtorsp->cookie),
+ "%s: Cookie check 2 failed %s(%d) %lu->%lu\n",
+ __func__,
+ rcu_torture_writer_state_getname(),
+ rcu_torture_writer_state,
+ rtorsp->cookie, cur_ops->get_gp_state());
+ if (cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full)
+ WARN_ONCE(cur_ops->poll_gp_state_full(&rtorsp->cookie_full),
+ "%s: Cookie check 6 failed %s(%d) online %*pbl\n",
+ __func__,
+ rcu_torture_writer_state_getname(),
+ rcu_torture_writer_state,
+ cpumask_pr_args(cpu_online_mask));
+ }
+ if (cur_ops->reader_blocked)
+ preempted = cur_ops->reader_blocked();
+ rcutorture_one_extend(&rtorsp->readstate, 0, trsp, rtorsp->rtrsp);
+ WARN_ON_ONCE(rtorsp->readstate);
+ // This next splat is expected behavior if leakpointer, especially
+ // for CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels.
+ WARN_ON_ONCE(leakpointer && READ_ONCE(rtorsp->p->rtort_pipe_count) > 1);
+
+ /* If error or close call, record the sequence of reader protections. */
+ if ((pipe_count > 1 || completed > 1) && !xchg(&err_segs_recorded, 1)) {
+ i = 0;
+ for (rtrsp1 = &rtorsp->rtseg[0]; rtrsp1 < rtorsp->rtrsp; rtrsp1++)
+ err_segs[i++] = *rtrsp1;
+ rt_read_nsegs = i;
+ rt_read_preempted = preempted;
+ }
+}
+
+/*
+ * Do one read-side critical section, returning false if there was
+ * no data to read. Can be invoked both from process context and
+ * from a timer handler.
+ */
+static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid)
+{
+ int newstate;
+ struct rcu_torture_one_read_state rtors;
+
+ WARN_ON_ONCE(!rcu_is_watching());
+ init_rcu_torture_one_read_state(&rtors, trsp);
+ newstate = rcutorture_extend_mask(rtors.readstate, trsp);
+ WARN_ON_ONCE(newstate & RCUTORTURE_RDR_UPDOWN);
+ rcutorture_one_extend(&rtors.readstate, newstate, trsp, rtors.rtrsp++);
+ if (!rcu_torture_one_read_start(&rtors, trsp, myid))
+ return false;
+ rtors.rtrsp = rcutorture_loop_extend(&rtors.readstate, trsp, rtors.rtrsp);
+ rcu_torture_one_read_end(&rtors, trsp);
+ return true;
+}
+
+static DEFINE_TORTURE_RANDOM_PERCPU(rcu_torture_timer_rand);
+
+/*
+ * RCU torture reader from timer handler. Dereferences rcu_torture_current,
+ * incrementing the corresponding element of the pipeline array. The
+ * counter in the element should never be greater than 1, otherwise, the
+ * RCU implementation is broken.
+ */
+static void rcu_torture_timer(struct timer_list *unused)
+{
+ atomic_long_inc(&n_rcu_torture_timers);
+ (void)rcu_torture_one_read(this_cpu_ptr(&rcu_torture_timer_rand), -1);
+
+ /* Test call_rcu() invocation from interrupt handler. */
+ if (cur_ops->call) {
+ struct rcu_head *rhp = kmalloc(sizeof(*rhp), GFP_NOWAIT);
+
+ if (rhp)
+ cur_ops->call(rhp, rcu_torture_timer_cb);
+ }
}
/*
@@ -1100,78 +2476,235 @@ static void rcu_torture_timer(unsigned long unused)
static int
rcu_torture_reader(void *arg)
{
- unsigned long started;
- unsigned long completed;
- int idx;
+ unsigned long lastsleep = jiffies;
+ long myid = (long)arg;
+ int mynumonline = myid;
DEFINE_TORTURE_RANDOM(rand);
- struct rcu_torture *p;
- int pipe_count;
struct timer_list t;
- unsigned long long ts;
VERBOSE_TOROUT_STRING("rcu_torture_reader task started");
set_user_nice(current, MAX_NICE);
if (irqreader && cur_ops->irq_capable)
- setup_timer_on_stack(&t, rcu_torture_timer, 0);
-
+ timer_setup_on_stack(&t, rcu_torture_timer, 0);
+ tick_dep_set_task(current, TICK_DEP_BIT_RCU); // CPU bound, so need tick.
do {
if (irqreader && cur_ops->irq_capable) {
if (!timer_pending(&t))
mod_timer(&t, jiffies + 1);
}
- idx = cur_ops->readlock();
- if (cur_ops->started)
- started = cur_ops->started();
- else
- started = cur_ops->completed();
- ts = rcu_trace_clock_local();
- p = rcu_dereference_check(rcu_torture_current,
- rcu_read_lock_bh_held() ||
- rcu_read_lock_sched_held() ||
- srcu_read_lock_held(&srcu_ctl));
- if (p == NULL) {
- /* Wait for rcu_torture_writer to get underway */
- cur_ops->readunlock(idx);
+ if (!rcu_torture_one_read(&rand, myid) && !torture_must_stop())
schedule_timeout_interruptible(HZ);
- continue;
+ if (time_after(jiffies, lastsleep) && !torture_must_stop()) {
+ torture_hrtimeout_us(500, 1000, &rand);
+ lastsleep = jiffies + 10;
}
- if (p->rtort_mbtest == 0)
- atomic_inc(&n_rcu_torture_mberror);
- cur_ops->read_delay(&rand);
- preempt_disable();
- pipe_count = p->rtort_pipe_count;
- if (pipe_count > RCU_TORTURE_PIPE_LEN) {
- /* Should not happen, but... */
- pipe_count = RCU_TORTURE_PIPE_LEN;
- }
- completed = cur_ops->completed();
- if (pipe_count > 1) {
- do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu,
- ts, started, completed);
- rcutorture_trace_dump();
- }
- __this_cpu_inc(rcu_torture_count[pipe_count]);
- completed = completed - started;
- if (cur_ops->started)
- completed++;
- if (completed > RCU_TORTURE_PIPE_LEN) {
- /* Should not happen, but... */
- completed = RCU_TORTURE_PIPE_LEN;
- }
- __this_cpu_inc(rcu_torture_batch[completed]);
- preempt_enable();
- cur_ops->readunlock(idx);
- cond_resched_rcu_qs();
+ while (!torture_must_stop() &&
+ (torture_num_online_cpus() < mynumonline || !rcu_inkernel_boot_has_ended()))
+ schedule_timeout_interruptible(HZ / 5);
stutter_wait("rcu_torture_reader");
} while (!torture_must_stop());
if (irqreader && cur_ops->irq_capable) {
- del_timer_sync(&t);
- destroy_timer_on_stack(&t);
+ timer_delete_sync(&t);
+ timer_destroy_on_stack(&t);
}
+ tick_dep_clear_task(current, TICK_DEP_BIT_RCU);
torture_kthread_stopping("rcu_torture_reader");
return 0;
}
+struct rcu_torture_one_read_state_updown {
+ struct hrtimer rtorsu_hrt;
+ bool rtorsu_inuse;
+ ktime_t rtorsu_kt;
+ int rtorsu_cpu;
+ unsigned long rtorsu_j;
+ unsigned long rtorsu_ndowns;
+ unsigned long rtorsu_nups;
+ unsigned long rtorsu_nmigrates;
+ struct torture_random_state rtorsu_trs;
+ struct rcu_torture_one_read_state rtorsu_rtors;
+};
+
+static struct rcu_torture_one_read_state_updown *updownreaders;
+static DEFINE_TORTURE_RANDOM(rcu_torture_updown_rand);
+static int rcu_torture_updown(void *arg);
+
+static enum hrtimer_restart rcu_torture_updown_hrt(struct hrtimer *hrtp)
+{
+ int cpu = raw_smp_processor_id();
+ struct rcu_torture_one_read_state_updown *rtorsup;
+
+ rtorsup = container_of(hrtp, struct rcu_torture_one_read_state_updown, rtorsu_hrt);
+ rcu_torture_one_read_end(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs);
+ WARN_ONCE(rtorsup->rtorsu_nups >= rtorsup->rtorsu_ndowns, "%s: Up without matching down #%zu.\n", __func__, rtorsup - updownreaders);
+ WRITE_ONCE(rtorsup->rtorsu_nups, rtorsup->rtorsu_nups + 1);
+ WRITE_ONCE(rtorsup->rtorsu_nmigrates,
+ rtorsup->rtorsu_nmigrates + (cpu != rtorsup->rtorsu_cpu));
+ smp_store_release(&rtorsup->rtorsu_inuse, false);
+ return HRTIMER_NORESTART;
+}
+
+static int rcu_torture_updown_init(void)
+{
+ int i;
+ struct torture_random_state *rand = &rcu_torture_updown_rand;
+ int ret;
+
+ if (n_up_down < 0)
+ return 0;
+ if (!srcu_torture_have_up_down()) {
+ VERBOSE_TOROUT_STRING("rcu_torture_updown_init: Disabling up/down reader tests due to lack of primitives");
+ return 0;
+ }
+ updownreaders = kcalloc(n_up_down, sizeof(*updownreaders), GFP_KERNEL);
+ if (!updownreaders) {
+ VERBOSE_TOROUT_STRING("rcu_torture_updown_init: Out of memory, disabling up/down reader tests");
+ return -ENOMEM;
+ }
+ for (i = 0; i < n_up_down; i++) {
+ init_rcu_torture_one_read_state(&updownreaders[i].rtorsu_rtors, rand);
+ hrtimer_setup(&updownreaders[i].rtorsu_hrt, rcu_torture_updown_hrt, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL | HRTIMER_MODE_HARD);
+ torture_random_init(&updownreaders[i].rtorsu_trs);
+ init_rcu_torture_one_read_state(&updownreaders[i].rtorsu_rtors,
+ &updownreaders[i].rtorsu_trs);
+ }
+ ret = torture_create_kthread(rcu_torture_updown, rand, updown_task);
+ if (ret) {
+ kfree(updownreaders);
+ updownreaders = NULL;
+ }
+ return ret;
+}
+
+static void rcu_torture_updown_cleanup(void)
+{
+ struct rcu_torture_one_read_state_updown *rtorsup;
+
+ for (rtorsup = updownreaders; rtorsup < &updownreaders[n_up_down]; rtorsup++) {
+ if (!smp_load_acquire(&rtorsup->rtorsu_inuse))
+ continue;
+ if (hrtimer_cancel(&rtorsup->rtorsu_hrt) || WARN_ON_ONCE(rtorsup->rtorsu_inuse)) {
+ rcu_torture_one_read_end(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs);
+ WARN_ONCE(rtorsup->rtorsu_nups >= rtorsup->rtorsu_ndowns, "%s: Up without matching down #%zu.\n", __func__, rtorsup - updownreaders);
+ WRITE_ONCE(rtorsup->rtorsu_nups, rtorsup->rtorsu_nups + 1);
+ smp_store_release(&rtorsup->rtorsu_inuse, false);
+ }
+
+ }
+ kfree(updownreaders);
+ updownreaders = NULL;
+}
+
+// Do one reader for rcu_torture_updown().
+static void rcu_torture_updown_one(struct rcu_torture_one_read_state_updown *rtorsup)
+{
+ int idx;
+ int rawidx;
+ ktime_t t;
+
+ init_rcu_torture_one_read_state(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs);
+ rawidx = cur_ops->down_read();
+ WRITE_ONCE(rtorsup->rtorsu_ndowns, rtorsup->rtorsu_ndowns + 1);
+ idx = (rawidx << RCUTORTURE_RDR_SHIFT_1) & RCUTORTURE_RDR_MASK_1;
+ rtorsup->rtorsu_rtors.readstate = idx | RCUTORTURE_RDR_UPDOWN;
+ rtorsup->rtorsu_rtors.rtrsp++;
+ rtorsup->rtorsu_cpu = raw_smp_processor_id();
+ if (!rcu_torture_one_read_start(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs, -1)) {
+ WARN_ONCE(rtorsup->rtorsu_nups >= rtorsup->rtorsu_ndowns, "%s: Up without matching down #%zu.\n", __func__, rtorsup - updownreaders);
+ WRITE_ONCE(rtorsup->rtorsu_nups, rtorsup->rtorsu_nups + 1);
+ schedule_timeout_idle(HZ);
+ return;
+ }
+ smp_store_release(&rtorsup->rtorsu_inuse, true);
+ t = torture_random(&rtorsup->rtorsu_trs) & 0xfffff; // One per million.
+ if (t < 10 * 1000)
+ t = 200 * 1000 * 1000;
+ hrtimer_start(&rtorsup->rtorsu_hrt, t, HRTIMER_MODE_REL | HRTIMER_MODE_HARD);
+ smp_mb(); // Sample jiffies after posting hrtimer.
+ rtorsup->rtorsu_j = jiffies; // Not used by hrtimer handler.
+ rtorsup->rtorsu_kt = t;
+}
+
+/*
+ * RCU torture up/down reader kthread, starting RCU readers in kthread
+ * context and ending them in hrtimer handlers. Otherwise similar to
+ * rcu_torture_reader().
+ */
+static int
+rcu_torture_updown(void *arg)
+{
+ unsigned long j;
+ struct rcu_torture_one_read_state_updown *rtorsup;
+
+ VERBOSE_TOROUT_STRING("rcu_torture_updown task started");
+ do {
+ for (rtorsup = updownreaders; rtorsup < &updownreaders[n_up_down]; rtorsup++) {
+ if (torture_must_stop())
+ break;
+ j = smp_load_acquire(&jiffies); // Time before ->rtorsu_inuse.
+ if (smp_load_acquire(&rtorsup->rtorsu_inuse)) {
+ WARN_ONCE(time_after(j, rtorsup->rtorsu_j + 1 + HZ * 10),
+ "hrtimer queued at jiffies %lu for %lld ns took %lu jiffies\n", rtorsup->rtorsu_j, rtorsup->rtorsu_kt, j - rtorsup->rtorsu_j);
+ continue;
+ }
+ rcu_torture_updown_one(rtorsup);
+ }
+ torture_hrtimeout_ms(1, 1000, &rcu_torture_updown_rand);
+ stutter_wait("rcu_torture_updown");
+ } while (!torture_must_stop());
+ rcu_torture_updown_cleanup();
+ torture_kthread_stopping("rcu_torture_updown");
+ return 0;
+}
+
+/*
+ * Randomly Toggle CPUs' callback-offload state. This uses hrtimers to
+ * increase race probabilities and fuzzes the interval between toggling.
+ */
+static int rcu_nocb_toggle(void *arg)
+{
+ int cpu;
+ int maxcpu = -1;
+ int oldnice = task_nice(current);
+ long r;
+ DEFINE_TORTURE_RANDOM(rand);
+ ktime_t toggle_delay;
+ unsigned long toggle_fuzz;
+ ktime_t toggle_interval = ms_to_ktime(nocbs_toggle);
+
+ VERBOSE_TOROUT_STRING("rcu_nocb_toggle task started");
+ while (!rcu_inkernel_boot_has_ended())
+ schedule_timeout_interruptible(HZ / 10);
+ for_each_possible_cpu(cpu)
+ maxcpu = cpu;
+ WARN_ON(maxcpu < 0);
+ if (toggle_interval > ULONG_MAX)
+ toggle_fuzz = ULONG_MAX >> 3;
+ else
+ toggle_fuzz = toggle_interval >> 3;
+ if (toggle_fuzz <= 0)
+ toggle_fuzz = NSEC_PER_USEC;
+ do {
+ r = torture_random(&rand);
+ cpu = (r >> 1) % (maxcpu + 1);
+ if (r & 0x1) {
+ rcu_nocb_cpu_offload(cpu);
+ atomic_long_inc(&n_nocb_offload);
+ } else {
+ rcu_nocb_cpu_deoffload(cpu);
+ atomic_long_inc(&n_nocb_deoffload);
+ }
+ toggle_delay = torture_random(&rand) % toggle_fuzz + toggle_interval;
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule_hrtimeout(&toggle_delay, HRTIMER_MODE_REL);
+ if (stutter_wait("rcu_nocb_toggle"))
+ sched_set_normal(current, oldnice);
+ } while (!torture_must_stop());
+ torture_kthread_stopping("rcu_nocb_toggle");
+ return 0;
+}
+
/*
* Print torture statistics. Caller must ensure that there is only
* one call to this function at a given time!!! This is normally
@@ -1187,52 +2720,81 @@ rcu_torture_stats_print(void)
int i;
long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
+ long n_gpwraps = 0;
+ unsigned long ndowns = 0;
+ unsigned long nunexpired = 0;
+ unsigned long nmigrates = 0;
+ unsigned long nups = 0;
+ struct rcu_torture *rtcp;
static unsigned long rtcv_snap = ULONG_MAX;
+ static bool splatted;
+ struct task_struct *wtp;
for_each_possible_cpu(cpu) {
for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
- pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i];
- batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i];
+ pipesummary[i] += READ_ONCE(per_cpu(rcu_torture_count, cpu)[i]);
+ batchsummary[i] += READ_ONCE(per_cpu(rcu_torture_batch, cpu)[i]);
}
+ if (cur_ops->get_gpwrap_count)
+ n_gpwraps += cur_ops->get_gpwrap_count(cpu);
}
- for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) {
+ if (updownreaders) {
+ for (i = 0; i < n_up_down; i++) {
+ ndowns += READ_ONCE(updownreaders[i].rtorsu_ndowns);
+ nups += READ_ONCE(updownreaders[i].rtorsu_nups);
+ nunexpired += READ_ONCE(updownreaders[i].rtorsu_inuse);
+ nmigrates += READ_ONCE(updownreaders[i].rtorsu_nmigrates);
+ }
+ }
+ for (i = RCU_TORTURE_PIPE_LEN; i >= 0; i--) {
if (pipesummary[i] != 0)
break;
- }
+ } // The value of variable "i" is used later, so don't clobber it!
pr_alert("%s%s ", torture_type, TORTURE_FLAG);
- pr_cont("rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ",
- rcu_torture_current,
+ rtcp = rcu_access_pointer(rcu_torture_current);
+ pr_cont("rtc: %p %s: %lu tfle: %d rta: %d rtaf: %d rtf: %d ",
+ rtcp,
+ rtcp && !rcu_stall_is_suppressed_at_boot() ? "ver" : "VER",
rcu_torture_current_version,
list_empty(&rcu_torture_freelist),
atomic_read(&n_rcu_torture_alloc),
atomic_read(&n_rcu_torture_alloc_fail),
atomic_read(&n_rcu_torture_free));
- pr_cont("rtmbe: %d rtbke: %ld rtbre: %ld ",
+ pr_cont("rtmbe: %d rtmbkf: %d/%d rtbe: %ld rtbke: %ld ",
atomic_read(&n_rcu_torture_mberror),
- n_rcu_torture_boost_ktrerror,
- n_rcu_torture_boost_rterror);
+ atomic_read(&n_rcu_torture_mbchk_fail), atomic_read(&n_rcu_torture_mbchk_tries),
+ n_rcu_torture_barrier_error,
+ n_rcu_torture_boost_ktrerror);
pr_cont("rtbf: %ld rtb: %ld nt: %ld ",
n_rcu_torture_boost_failure,
n_rcu_torture_boosts,
- n_rcu_torture_timers);
+ atomic_long_read(&n_rcu_torture_timers));
+ if (updownreaders)
+ pr_cont("ndowns: %lu nups: %lu nhrt: %lu nmigrates: %lu ", ndowns, nups, nunexpired, nmigrates);
torture_onoff_stats();
pr_cont("barrier: %ld/%ld:%ld ",
- n_barrier_successes,
- n_barrier_attempts,
- n_rcu_torture_barrier_error);
- pr_cont("cbflood: %ld\n", atomic_long_read(&n_cbfloods));
+ data_race(n_barrier_successes),
+ data_race(n_barrier_attempts),
+ data_race(n_rcu_torture_barrier_error));
+ pr_cont("read-exits: %ld ", data_race(n_read_exits)); // Statistic.
+ pr_cont("nocb-toggles: %ld:%ld ",
+ atomic_long_read(&n_nocb_offload), atomic_long_read(&n_nocb_deoffload));
+ pr_cont("gpwraps: %ld\n", n_gpwraps);
pr_alert("%s%s ", torture_type, TORTURE_FLAG);
- if (atomic_read(&n_rcu_torture_mberror) != 0 ||
- n_rcu_torture_barrier_error != 0 ||
- n_rcu_torture_boost_ktrerror != 0 ||
- n_rcu_torture_boost_rterror != 0 ||
- n_rcu_torture_boost_failure != 0 ||
- i > 1) {
+ if (atomic_read(&n_rcu_torture_mberror) ||
+ atomic_read(&n_rcu_torture_mbchk_fail) ||
+ n_rcu_torture_barrier_error || n_rcu_torture_boost_ktrerror ||
+ n_rcu_torture_boost_failure || i > 1) {
pr_cont("%s", "!!! ");
atomic_inc(&n_rcu_torture_error);
- WARN_ON_ONCE(1);
+ WARN_ON_ONCE(atomic_read(&n_rcu_torture_mberror));
+ WARN_ON_ONCE(atomic_read(&n_rcu_torture_mbchk_fail));
+ WARN_ON_ONCE(n_rcu_torture_barrier_error); // rcu_barrier()
+ WARN_ON_ONCE(n_rcu_torture_boost_ktrerror); // no boost kthread
+ WARN_ON_ONCE(n_rcu_torture_boost_failure); // boost failed (TIMER_SOFTIRQ RT prio?)
+ WARN_ON_ONCE(i > 1); // Too-short grace period
}
pr_cont("Reader Pipe: ");
for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
@@ -1255,18 +2817,27 @@ rcu_torture_stats_print(void)
if (cur_ops->stats)
cur_ops->stats();
if (rtcv_snap == rcu_torture_current_version &&
- rcu_torture_current != NULL) {
- int __maybe_unused flags;
- unsigned long __maybe_unused gpnum;
- unsigned long __maybe_unused completed;
-
- rcutorture_get_gp_data(cur_ops->ttype,
- &flags, &gpnum, &completed);
- pr_alert("??? Writer stall state %d g%lu c%lu f%#x\n",
- rcu_torture_writer_state,
- gpnum, completed, flags);
- show_rcu_gp_kthreads();
- rcutorture_trace_dump();
+ rcu_access_pointer(rcu_torture_current) &&
+ !rcu_stall_is_suppressed() &&
+ rcu_inkernel_boot_has_ended()) {
+ int __maybe_unused flags = 0;
+ unsigned long __maybe_unused gp_seq = 0;
+
+ if (cur_ops->get_gp_data)
+ cur_ops->get_gp_data(&flags, &gp_seq);
+ wtp = READ_ONCE(writer_task);
+ pr_alert("??? Writer stall state %s(%d) g%lu f%#x ->state %#x cpu %d\n",
+ rcu_torture_writer_state_getname(),
+ rcu_torture_writer_state, gp_seq, flags,
+ wtp == NULL ? ~0U : wtp->__state,
+ wtp == NULL ? -1 : (int)task_cpu(wtp));
+ if (!splatted && wtp) {
+ sched_show_task(wtp);
+ splatted = true;
+ }
+ if (cur_ops->gp_kthread_dbg)
+ cur_ops->gp_kthread_dbg();
+ rcu_ftrace_dump(DUMP_ALL);
}
rtcv_snap = rcu_torture_current_version;
}
@@ -1288,7 +2859,57 @@ rcu_torture_stats(void *arg)
return 0;
}
-static inline void
+/* Test mem_dump_obj() and friends. */
+static void rcu_torture_mem_dump_obj(void)
+{
+ struct rcu_head *rhp;
+ struct kmem_cache *kcp;
+ static int z;
+
+ kcp = kmem_cache_create("rcuscale", 136, 8, SLAB_STORE_USER, NULL);
+ if (WARN_ON_ONCE(!kcp))
+ return;
+ rhp = kmem_cache_alloc(kcp, GFP_KERNEL);
+ if (WARN_ON_ONCE(!rhp)) {
+ kmem_cache_destroy(kcp);
+ return;
+ }
+ pr_alert("mem_dump_obj() slab test: rcu_torture_stats = %px, &rhp = %px, rhp = %px, &z = %px\n", stats_task, &rhp, rhp, &z);
+ pr_alert("mem_dump_obj(ZERO_SIZE_PTR):");
+ mem_dump_obj(ZERO_SIZE_PTR);
+ pr_alert("mem_dump_obj(NULL):");
+ mem_dump_obj(NULL);
+ pr_alert("mem_dump_obj(%px):", &rhp);
+ mem_dump_obj(&rhp);
+ pr_alert("mem_dump_obj(%px):", rhp);
+ mem_dump_obj(rhp);
+ pr_alert("mem_dump_obj(%px):", &rhp->func);
+ mem_dump_obj(&rhp->func);
+ pr_alert("mem_dump_obj(%px):", &z);
+ mem_dump_obj(&z);
+ kmem_cache_free(kcp, rhp);
+ kmem_cache_destroy(kcp);
+ rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
+ if (WARN_ON_ONCE(!rhp))
+ return;
+ pr_alert("mem_dump_obj() kmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp);
+ pr_alert("mem_dump_obj(kmalloc %px):", rhp);
+ mem_dump_obj(rhp);
+ pr_alert("mem_dump_obj(kmalloc %px):", &rhp->func);
+ mem_dump_obj(&rhp->func);
+ kfree(rhp);
+ rhp = vmalloc(4096);
+ if (WARN_ON_ONCE(!rhp))
+ return;
+ pr_alert("mem_dump_obj() vmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp);
+ pr_alert("mem_dump_obj(vmalloc %px):", rhp);
+ mem_dump_obj(rhp);
+ pr_alert("mem_dump_obj(vmalloc %px):", &rhp->func);
+ mem_dump_obj(&rhp->func);
+ vfree(rhp);
+}
+
+static void
rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
{
pr_alert("%s" TORTURE_FLAG
@@ -1297,48 +2918,83 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
"shuffle_interval=%d stutter=%d irqreader=%d "
"fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
"test_boost=%d/%d test_boost_interval=%d "
- "test_boost_duration=%d shutdown_secs=%d "
- "stall_cpu=%d stall_cpu_holdoff=%d "
+ "test_boost_duration=%d test_boost_holdoff=%d shutdown_secs=%d "
+ "stall_cpu=%d stall_cpu_holdoff=%d stall_cpu_irqsoff=%d "
+ "stall_cpu_block=%d stall_cpu_repeat=%d "
"n_barrier_cbs=%d "
- "onoff_interval=%d onoff_holdoff=%d\n",
- torture_type, tag, nrealreaders, nfakewriters,
+ "onoff_interval=%d onoff_holdoff=%d "
+ "read_exit_delay=%d read_exit_burst=%d "
+ "reader_flavor=%x "
+ "nocbs_nthreads=%d nocbs_toggle=%d "
+ "test_nmis=%d "
+ "preempt_duration=%d preempt_interval=%d n_up_down=%d\n",
+ torture_type, tag, nrealreaders, nrealfakewriters,
stat_interval, verbose, test_no_idle_hz, shuffle_interval,
stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
test_boost, cur_ops->can_boost,
- test_boost_interval, test_boost_duration, shutdown_secs,
- stall_cpu, stall_cpu_holdoff,
+ test_boost_interval, test_boost_duration, test_boost_holdoff, shutdown_secs,
+ stall_cpu, stall_cpu_holdoff, stall_cpu_irqsoff,
+ stall_cpu_block, stall_cpu_repeat,
n_barrier_cbs,
- onoff_interval, onoff_holdoff);
+ onoff_interval, onoff_holdoff,
+ read_exit_delay, read_exit_burst,
+ reader_flavor,
+ nocbs_nthreads, nocbs_toggle,
+ test_nmis,
+ preempt_duration, preempt_interval, n_up_down);
}
-static void rcutorture_booster_cleanup(int cpu)
+static int rcutorture_booster_cleanup(unsigned int cpu)
{
struct task_struct *t;
if (boost_tasks[cpu] == NULL)
- return;
+ return 0;
mutex_lock(&boost_mutex);
t = boost_tasks[cpu];
boost_tasks[cpu] = NULL;
+ rcu_torture_enable_rt_throttle();
mutex_unlock(&boost_mutex);
/* This must be outside of the mutex, otherwise deadlock! */
torture_stop_kthread(rcu_torture_boost, t);
+ return 0;
}
-static int rcutorture_booster_init(int cpu)
+static int rcutorture_booster_init(unsigned int cpu)
{
int retval;
if (boost_tasks[cpu] != NULL)
return 0; /* Already created, nothing more to do. */
+ // Testing RCU priority boosting requires rcutorture do
+ // some serious abuse. Counter this by running ksoftirqd
+ // at higher priority.
+ if (IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)) {
+ struct sched_param sp;
+ struct task_struct *t;
+
+ t = per_cpu(ksoftirqd, cpu);
+ WARN_ON_ONCE(!t);
+ sp.sched_priority = 2;
+ sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+#ifdef CONFIG_IRQ_FORCED_THREADING
+ if (force_irqthreads()) {
+ t = per_cpu(ktimerd, cpu);
+ WARN_ON_ONCE(!t);
+ sp.sched_priority = 2;
+ sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+ }
+#endif
+ }
+
/* Don't allow time recalculation while creating a new task. */
mutex_lock(&boost_mutex);
+ rcu_torture_disable_rt_throttle();
VERBOSE_TOROUT_STRING("Creating rcu_torture_boost task");
- boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL,
- cpu_to_node(cpu),
- "rcu_torture_boost");
+ boost_tasks[cpu] = kthread_run_on_cpu(rcu_torture_boost, NULL,
+ cpu, "rcu_torture_boost_%u");
if (IS_ERR(boost_tasks[cpu])) {
retval = PTR_ERR(boost_tasks[cpu]);
VERBOSE_TOROUT_STRING("rcu_torture_boost task create failed");
@@ -1347,37 +3003,107 @@ static int rcutorture_booster_init(int cpu)
mutex_unlock(&boost_mutex);
return retval;
}
- kthread_bind(boost_tasks[cpu], cpu);
- wake_up_process(boost_tasks[cpu]);
mutex_unlock(&boost_mutex);
return 0;
}
+static int rcu_torture_stall_nf(struct notifier_block *nb, unsigned long v, void *ptr)
+{
+ pr_info("%s: v=%lu, duration=%lu.\n", __func__, v, (unsigned long)ptr);
+ return NOTIFY_OK;
+}
+
+static struct notifier_block rcu_torture_stall_block = {
+ .notifier_call = rcu_torture_stall_nf,
+};
+
/*
* CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then
- * induces a CPU stall for the time specified by stall_cpu.
+ * induces a CPU stall for the time specified by stall_cpu. If a new
+ * stall test is added, stallsdone in rcu_torture_writer() must be adjusted.
*/
-static int rcu_torture_stall(void *args)
+static void rcu_torture_stall_one(int rep, int irqsoff)
{
+ int idx;
unsigned long stop_at;
- VERBOSE_TOROUT_STRING("rcu_torture_stall task started");
if (stall_cpu_holdoff > 0) {
VERBOSE_TOROUT_STRING("rcu_torture_stall begin holdoff");
schedule_timeout_interruptible(stall_cpu_holdoff * HZ);
VERBOSE_TOROUT_STRING("rcu_torture_stall end holdoff");
}
- if (!kthread_should_stop()) {
- stop_at = get_seconds() + stall_cpu;
+ if (!kthread_should_stop() && stall_gp_kthread > 0) {
+ VERBOSE_TOROUT_STRING("rcu_torture_stall begin GP stall");
+ rcu_gp_set_torture_wait(stall_gp_kthread * HZ);
+ for (idx = 0; idx < stall_gp_kthread + 2; idx++) {
+ if (kthread_should_stop())
+ break;
+ schedule_timeout_uninterruptible(HZ);
+ }
+ }
+ if (!kthread_should_stop() && stall_cpu > 0) {
+ VERBOSE_TOROUT_STRING("rcu_torture_stall begin CPU stall");
+ stop_at = ktime_get_seconds() + stall_cpu;
/* RCU CPU stall is expected behavior in following code. */
- pr_alert("rcu_torture_stall start.\n");
- rcu_read_lock();
- preempt_disable();
- while (ULONG_CMP_LT(get_seconds(), stop_at))
- continue; /* Induce RCU CPU stall warning. */
- preempt_enable();
- rcu_read_unlock();
- pr_alert("rcu_torture_stall end.\n");
+ idx = cur_ops->readlock();
+ if (irqsoff)
+ local_irq_disable();
+ else if (!stall_cpu_block)
+ preempt_disable();
+ pr_alert("%s start stall episode %d on CPU %d.\n",
+ __func__, rep + 1, raw_smp_processor_id());
+ while (ULONG_CMP_LT((unsigned long)ktime_get_seconds(), stop_at) &&
+ !kthread_should_stop())
+ if (stall_cpu_block) {
+#ifdef CONFIG_PREEMPTION
+ preempt_schedule();
+#else
+ schedule_timeout_uninterruptible(HZ);
+#endif
+ } else if (stall_no_softlockup) {
+ touch_softlockup_watchdog();
+ }
+ if (irqsoff)
+ local_irq_enable();
+ else if (!stall_cpu_block)
+ preempt_enable();
+ cur_ops->readunlock(idx);
+ }
+}
+
+/*
+ * CPU-stall kthread. Invokes rcu_torture_stall_one() once, and then as many
+ * additional times as specified by the stall_cpu_repeat module parameter.
+ * Note that stall_cpu_irqsoff is ignored on the second and subsequent
+ * stall.
+ */
+static int rcu_torture_stall(void *args)
+{
+ int i;
+ int repeat = stall_cpu_repeat;
+ int ret;
+
+ VERBOSE_TOROUT_STRING("rcu_torture_stall task started");
+ if (repeat < 0) {
+ repeat = 0;
+ WARN_ON_ONCE(IS_BUILTIN(CONFIG_RCU_TORTURE_TEST));
+ }
+ if (rcu_cpu_stall_notifiers) {
+ ret = rcu_stall_chain_notifier_register(&rcu_torture_stall_block);
+ if (ret)
+ pr_info("%s: rcu_stall_chain_notifier_register() returned %d, %sexpected.\n",
+ __func__, ret, !IS_ENABLED(CONFIG_RCU_STALL_COMMON) ? "un" : "");
+ }
+ for (i = 0; i <= repeat; i++) {
+ if (kthread_should_stop())
+ break;
+ rcu_torture_stall_one(i, i == 0 ? stall_cpu_irqsoff : 0);
+ }
+ pr_alert("%s end.\n", __func__);
+ if (rcu_cpu_stall_notifiers && !ret) {
+ ret = rcu_stall_chain_notifier_unregister(&rcu_torture_stall_block);
+ if (ret)
+ pr_info("%s: rcu_stall_chain_notifier_unregister() returned %d.\n", __func__, ret);
}
torture_shutdown_absorb("rcu_torture_stall");
while (!kthread_should_stop())
@@ -1388,22 +3114,544 @@ static int rcu_torture_stall(void *args)
/* Spawn CPU-stall kthread, if stall_cpu specified. */
static int __init rcu_torture_stall_init(void)
{
- if (stall_cpu <= 0)
+ if (stall_cpu <= 0 && stall_gp_kthread <= 0)
return 0;
return torture_create_kthread(rcu_torture_stall, NULL, stall_task);
}
+/* State structure for forward-progress self-propagating RCU callback. */
+struct fwd_cb_state {
+ struct rcu_head rh;
+ int stop;
+};
+
+/*
+ * Forward-progress self-propagating RCU callback function. Because
+ * callbacks run from softirq, this function is an implicit RCU read-side
+ * critical section.
+ */
+static void rcu_torture_fwd_prog_cb(struct rcu_head *rhp)
+{
+ struct fwd_cb_state *fcsp = container_of(rhp, struct fwd_cb_state, rh);
+
+ if (READ_ONCE(fcsp->stop)) {
+ WRITE_ONCE(fcsp->stop, 2);
+ return;
+ }
+ cur_ops->call(&fcsp->rh, rcu_torture_fwd_prog_cb);
+}
+
+/* State for continuous-flood RCU callbacks. */
+struct rcu_fwd_cb {
+ struct rcu_head rh;
+ struct rcu_fwd_cb *rfc_next;
+ struct rcu_fwd *rfc_rfp;
+ int rfc_gps;
+};
+
+#define MAX_FWD_CB_JIFFIES (8 * HZ) /* Maximum CB test duration. */
+#define MIN_FWD_CB_LAUNDERS 3 /* This many CB invocations to count. */
+#define MIN_FWD_CBS_LAUNDERED 100 /* Number of counted CBs. */
+#define FWD_CBS_HIST_DIV 10 /* Histogram buckets/second. */
+#define N_LAUNDERS_HIST (2 * MAX_FWD_CB_JIFFIES / (HZ / FWD_CBS_HIST_DIV))
+
+struct rcu_launder_hist {
+ long n_launders;
+ unsigned long launder_gp_seq;
+};
+
+struct rcu_fwd {
+ spinlock_t rcu_fwd_lock;
+ struct rcu_fwd_cb *rcu_fwd_cb_head;
+ struct rcu_fwd_cb **rcu_fwd_cb_tail;
+ long n_launders_cb;
+ unsigned long rcu_fwd_startat;
+ struct rcu_launder_hist n_launders_hist[N_LAUNDERS_HIST];
+ unsigned long rcu_launder_gp_seq_start;
+ int rcu_fwd_id;
+};
+
+static DEFINE_MUTEX(rcu_fwd_mutex);
+static struct rcu_fwd *rcu_fwds;
+static unsigned long rcu_fwd_seq;
+static atomic_long_t rcu_fwd_max_cbs;
+static bool rcu_fwd_emergency_stop;
+
+static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp)
+{
+ unsigned long gps;
+ unsigned long gps_old;
+ int i;
+ int j;
+
+ for (i = ARRAY_SIZE(rfp->n_launders_hist) - 1; i > 0; i--)
+ if (rfp->n_launders_hist[i].n_launders > 0)
+ break;
+ pr_alert("%s: Callback-invocation histogram %d (duration %lu jiffies):",
+ __func__, rfp->rcu_fwd_id, jiffies - rfp->rcu_fwd_startat);
+ gps_old = rfp->rcu_launder_gp_seq_start;
+ for (j = 0; j <= i; j++) {
+ gps = rfp->n_launders_hist[j].launder_gp_seq;
+ pr_cont(" %ds/%d: %ld:%ld",
+ j + 1, FWD_CBS_HIST_DIV,
+ rfp->n_launders_hist[j].n_launders,
+ rcutorture_seq_diff(gps, gps_old));
+ gps_old = gps;
+ }
+ pr_cont("\n");
+}
+
+/* Callback function for continuous-flood RCU callbacks. */
+static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp)
+{
+ unsigned long flags;
+ int i;
+ struct rcu_fwd_cb *rfcp = container_of(rhp, struct rcu_fwd_cb, rh);
+ struct rcu_fwd_cb **rfcpp;
+ struct rcu_fwd *rfp = rfcp->rfc_rfp;
+
+ rfcp->rfc_next = NULL;
+ rfcp->rfc_gps++;
+ spin_lock_irqsave(&rfp->rcu_fwd_lock, flags);
+ rfcpp = rfp->rcu_fwd_cb_tail;
+ rfp->rcu_fwd_cb_tail = &rfcp->rfc_next;
+ smp_store_release(rfcpp, rfcp);
+ WRITE_ONCE(rfp->n_launders_cb, rfp->n_launders_cb + 1);
+ i = ((jiffies - rfp->rcu_fwd_startat) / (HZ / FWD_CBS_HIST_DIV));
+ if (i >= ARRAY_SIZE(rfp->n_launders_hist))
+ i = ARRAY_SIZE(rfp->n_launders_hist) - 1;
+ rfp->n_launders_hist[i].n_launders++;
+ rfp->n_launders_hist[i].launder_gp_seq = cur_ops->get_gp_seq();
+ spin_unlock_irqrestore(&rfp->rcu_fwd_lock, flags);
+}
+
+// Give the scheduler a chance, even on nohz_full CPUs.
+static void rcu_torture_fwd_prog_cond_resched(unsigned long iter)
+{
+ if (IS_ENABLED(CONFIG_PREEMPTION) && IS_ENABLED(CONFIG_NO_HZ_FULL)) {
+ // Real call_rcu() floods hit userspace, so emulate that.
+ if (need_resched() || (iter & 0xfff))
+ schedule();
+ return;
+ }
+ // No userspace emulation: CB invocation throttles call_rcu()
+ cond_resched();
+}
+
+/*
+ * Free all callbacks on the rcu_fwd_cb_head list, either because the
+ * test is over or because we hit an OOM event.
+ */
+static unsigned long rcu_torture_fwd_prog_cbfree(struct rcu_fwd *rfp)
+{
+ unsigned long flags;
+ unsigned long freed = 0;
+ struct rcu_fwd_cb *rfcp;
+
+ for (;;) {
+ spin_lock_irqsave(&rfp->rcu_fwd_lock, flags);
+ rfcp = rfp->rcu_fwd_cb_head;
+ if (!rfcp) {
+ spin_unlock_irqrestore(&rfp->rcu_fwd_lock, flags);
+ break;
+ }
+ rfp->rcu_fwd_cb_head = rfcp->rfc_next;
+ if (!rfp->rcu_fwd_cb_head)
+ rfp->rcu_fwd_cb_tail = &rfp->rcu_fwd_cb_head;
+ spin_unlock_irqrestore(&rfp->rcu_fwd_lock, flags);
+ kfree(rfcp);
+ freed++;
+ rcu_torture_fwd_prog_cond_resched(freed);
+ if (tick_nohz_full_enabled()) {
+ local_irq_save(flags);
+ rcu_momentary_eqs();
+ local_irq_restore(flags);
+ }
+ }
+ return freed;
+}
+
+/* Carry out need_resched()/cond_resched() forward-progress testing. */
+static void rcu_torture_fwd_prog_nr(struct rcu_fwd *rfp,
+ int *tested, int *tested_tries)
+{
+ unsigned long cver;
+ unsigned long dur;
+ struct fwd_cb_state fcs;
+ unsigned long gps;
+ int idx;
+ int sd;
+ int sd4;
+ bool selfpropcb = false;
+ unsigned long stopat;
+ static DEFINE_TORTURE_RANDOM(trs);
+
+ pr_alert("%s: Starting forward-progress test %d\n", __func__, rfp->rcu_fwd_id);
+ if (!cur_ops->sync)
+ return; // Cannot do need_resched() forward progress testing without ->sync.
+ if (cur_ops->call && cur_ops->cb_barrier) {
+ init_rcu_head_on_stack(&fcs.rh);
+ selfpropcb = true;
+ }
+
+ /* Tight loop containing cond_resched(). */
+ atomic_inc(&rcu_fwd_cb_nodelay);
+ cur_ops->sync(); /* Later readers see above write. */
+ if (selfpropcb) {
+ WRITE_ONCE(fcs.stop, 0);
+ cur_ops->call(&fcs.rh, rcu_torture_fwd_prog_cb);
+ }
+ cver = READ_ONCE(rcu_torture_current_version);
+ gps = cur_ops->get_gp_seq();
+ sd = cur_ops->stall_dur() + 1;
+ sd4 = (sd + fwd_progress_div - 1) / fwd_progress_div;
+ dur = sd4 + torture_random(&trs) % (sd - sd4);
+ WRITE_ONCE(rfp->rcu_fwd_startat, jiffies);
+ stopat = rfp->rcu_fwd_startat + dur;
+ while (time_before(jiffies, stopat) &&
+ !shutdown_time_arrived() &&
+ !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) {
+ idx = cur_ops->readlock();
+ udelay(10);
+ cur_ops->readunlock(idx);
+ if (!fwd_progress_need_resched || need_resched())
+ cond_resched();
+ }
+ (*tested_tries)++;
+ if (!time_before(jiffies, stopat) &&
+ !shutdown_time_arrived() &&
+ !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) {
+ (*tested)++;
+ cver = READ_ONCE(rcu_torture_current_version) - cver;
+ gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps);
+ WARN_ON(!cver && gps < 2);
+ pr_alert("%s: %d Duration %ld cver %ld gps %ld\n", __func__,
+ rfp->rcu_fwd_id, dur, cver, gps);
+ }
+ if (selfpropcb) {
+ WRITE_ONCE(fcs.stop, 1);
+ cur_ops->sync(); /* Wait for running CB to complete. */
+ pr_alert("%s: Waiting for CBs: %pS() %d\n", __func__, cur_ops->cb_barrier, rfp->rcu_fwd_id);
+ cur_ops->cb_barrier(); /* Wait for queued callbacks. */
+ }
+
+ if (selfpropcb) {
+ WARN_ON(READ_ONCE(fcs.stop) != 2);
+ destroy_rcu_head_on_stack(&fcs.rh);
+ }
+ schedule_timeout_uninterruptible(HZ / 10); /* Let kthreads recover. */
+ atomic_dec(&rcu_fwd_cb_nodelay);
+}
+
+/* Carry out call_rcu() forward-progress testing. */
+static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp)
+{
+ unsigned long cver;
+ unsigned long flags;
+ unsigned long gps;
+ int i;
+ long n_launders;
+ long n_launders_cb_snap;
+ long n_launders_sa;
+ long n_max_cbs;
+ long n_max_gps;
+ struct rcu_fwd_cb *rfcp;
+ struct rcu_fwd_cb *rfcpn;
+ unsigned long stopat;
+ unsigned long stoppedat;
+
+ pr_alert("%s: Starting forward-progress test %d\n", __func__, rfp->rcu_fwd_id);
+ if (READ_ONCE(rcu_fwd_emergency_stop))
+ return; /* Get out of the way quickly, no GP wait! */
+ if (!cur_ops->call)
+ return; /* Can't do call_rcu() fwd prog without ->call. */
+
+ /* Loop continuously posting RCU callbacks. */
+ atomic_inc(&rcu_fwd_cb_nodelay);
+ cur_ops->sync(); /* Later readers see above write. */
+ WRITE_ONCE(rfp->rcu_fwd_startat, jiffies);
+ stopat = rfp->rcu_fwd_startat + MAX_FWD_CB_JIFFIES;
+ n_launders = 0;
+ rfp->n_launders_cb = 0; // Hoist initialization for multi-kthread
+ n_launders_sa = 0;
+ n_max_cbs = 0;
+ n_max_gps = 0;
+ for (i = 0; i < ARRAY_SIZE(rfp->n_launders_hist); i++)
+ rfp->n_launders_hist[i].n_launders = 0;
+ cver = READ_ONCE(rcu_torture_current_version);
+ gps = cur_ops->get_gp_seq();
+ rfp->rcu_launder_gp_seq_start = gps;
+ tick_dep_set_task(current, TICK_DEP_BIT_RCU); // CPU bound, so need tick.
+ while (time_before(jiffies, stopat) &&
+ !shutdown_time_arrived() &&
+ !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) {
+ rfcp = READ_ONCE(rfp->rcu_fwd_cb_head);
+ rfcpn = NULL;
+ if (rfcp)
+ rfcpn = READ_ONCE(rfcp->rfc_next);
+ if (rfcpn) {
+ if (rfcp->rfc_gps >= MIN_FWD_CB_LAUNDERS &&
+ ++n_max_gps >= MIN_FWD_CBS_LAUNDERED)
+ break;
+ rfp->rcu_fwd_cb_head = rfcpn;
+ n_launders++;
+ n_launders_sa++;
+ } else if (!cur_ops->cbflood_max || cur_ops->cbflood_max > n_max_cbs) {
+ rfcp = kmalloc(sizeof(*rfcp), GFP_KERNEL);
+ if (WARN_ON_ONCE(!rfcp)) {
+ schedule_timeout_interruptible(1);
+ continue;
+ }
+ n_max_cbs++;
+ n_launders_sa = 0;
+ rfcp->rfc_gps = 0;
+ rfcp->rfc_rfp = rfp;
+ } else {
+ rfcp = NULL;
+ }
+ if (rfcp)
+ cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr);
+ rcu_torture_fwd_prog_cond_resched(n_launders + n_max_cbs);
+ if (tick_nohz_full_enabled()) {
+ local_irq_save(flags);
+ rcu_momentary_eqs();
+ local_irq_restore(flags);
+ }
+ }
+ stoppedat = jiffies;
+ n_launders_cb_snap = READ_ONCE(rfp->n_launders_cb);
+ cver = READ_ONCE(rcu_torture_current_version) - cver;
+ gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps);
+ pr_alert("%s: Waiting for CBs: %pS() %d\n", __func__, cur_ops->cb_barrier, rfp->rcu_fwd_id);
+ cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */
+ (void)rcu_torture_fwd_prog_cbfree(rfp);
+
+ if (!torture_must_stop() && !READ_ONCE(rcu_fwd_emergency_stop) &&
+ !shutdown_time_arrived()) {
+ if (WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED) && cur_ops->gp_kthread_dbg)
+ cur_ops->gp_kthread_dbg();
+ pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld #online %u\n",
+ __func__,
+ stoppedat - rfp->rcu_fwd_startat, jiffies - stoppedat,
+ n_launders + n_max_cbs - n_launders_cb_snap,
+ n_launders, n_launders_sa,
+ n_max_gps, n_max_cbs, cver, gps, num_online_cpus());
+ atomic_long_add(n_max_cbs, &rcu_fwd_max_cbs);
+ mutex_lock(&rcu_fwd_mutex); // Serialize histograms.
+ rcu_torture_fwd_cb_hist(rfp);
+ mutex_unlock(&rcu_fwd_mutex);
+ }
+ schedule_timeout_uninterruptible(HZ); /* Let CBs drain. */
+ tick_dep_clear_task(current, TICK_DEP_BIT_RCU);
+ atomic_dec(&rcu_fwd_cb_nodelay);
+}
+
+
+/*
+ * OOM notifier, but this only prints diagnostic information for the
+ * current forward-progress test.
+ */
+static int rcutorture_oom_notify(struct notifier_block *self,
+ unsigned long notused, void *nfreed)
+{
+ int i;
+ long ncbs;
+ struct rcu_fwd *rfp;
+
+ mutex_lock(&rcu_fwd_mutex);
+ rfp = rcu_fwds;
+ if (!rfp) {
+ mutex_unlock(&rcu_fwd_mutex);
+ return NOTIFY_OK;
+ }
+ WARN(1, "%s invoked upon OOM during forward-progress testing.\n",
+ __func__);
+ for (i = 0; i < fwd_progress; i++) {
+ rcu_torture_fwd_cb_hist(&rfp[i]);
+ rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rfp[i].rcu_fwd_startat)) / 2);
+ }
+ WRITE_ONCE(rcu_fwd_emergency_stop, true);
+ smp_mb(); /* Emergency stop before free and wait to avoid hangs. */
+ ncbs = 0;
+ for (i = 0; i < fwd_progress; i++)
+ ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]);
+ pr_info("%s: Freed %lu RCU callbacks.\n", __func__, ncbs);
+ cur_ops->cb_barrier();
+ ncbs = 0;
+ for (i = 0; i < fwd_progress; i++)
+ ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]);
+ pr_info("%s: Freed %lu RCU callbacks.\n", __func__, ncbs);
+ cur_ops->cb_barrier();
+ ncbs = 0;
+ for (i = 0; i < fwd_progress; i++)
+ ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]);
+ pr_info("%s: Freed %lu RCU callbacks.\n", __func__, ncbs);
+ smp_mb(); /* Frees before return to avoid redoing OOM. */
+ (*(unsigned long *)nfreed)++; /* Forward progress CBs freed! */
+ pr_info("%s returning after OOM processing.\n", __func__);
+ mutex_unlock(&rcu_fwd_mutex);
+ return NOTIFY_OK;
+}
+
+static struct notifier_block rcutorture_oom_nb = {
+ .notifier_call = rcutorture_oom_notify
+};
+
+/* Carry out grace-period forward-progress testing. */
+static int rcu_torture_fwd_prog(void *args)
+{
+ bool firsttime = true;
+ long max_cbs;
+ int oldnice = task_nice(current);
+ unsigned long oldseq = READ_ONCE(rcu_fwd_seq);
+ struct rcu_fwd *rfp = args;
+ int tested = 0;
+ int tested_tries = 0;
+
+ VERBOSE_TOROUT_STRING("rcu_torture_fwd_progress task started");
+ while (!rcu_inkernel_boot_has_ended())
+ schedule_timeout_interruptible(HZ / 10);
+ rcu_bind_current_to_nocb();
+ if (!IS_ENABLED(CONFIG_SMP) || !IS_ENABLED(CONFIG_RCU_BOOST))
+ set_user_nice(current, MAX_NICE);
+ do {
+ if (!rfp->rcu_fwd_id) {
+ schedule_timeout_interruptible(fwd_progress_holdoff * HZ);
+ WRITE_ONCE(rcu_fwd_emergency_stop, false);
+ if (!firsttime) {
+ max_cbs = atomic_long_xchg(&rcu_fwd_max_cbs, 0);
+ pr_alert("%s n_max_cbs: %ld\n", __func__, max_cbs);
+ }
+ firsttime = false;
+ WRITE_ONCE(rcu_fwd_seq, rcu_fwd_seq + 1);
+ } else {
+ while (READ_ONCE(rcu_fwd_seq) == oldseq && !torture_must_stop())
+ schedule_timeout_interruptible(HZ / 20);
+ oldseq = READ_ONCE(rcu_fwd_seq);
+ }
+ pr_alert("%s: Starting forward-progress test %d\n", __func__, rfp->rcu_fwd_id);
+ if (rcu_inkernel_boot_has_ended() && torture_num_online_cpus() > rfp->rcu_fwd_id)
+ rcu_torture_fwd_prog_cr(rfp);
+ if ((cur_ops->stall_dur && cur_ops->stall_dur() > 0) &&
+ (!IS_ENABLED(CONFIG_TINY_RCU) ||
+ (rcu_inkernel_boot_has_ended() &&
+ torture_num_online_cpus() > rfp->rcu_fwd_id)))
+ rcu_torture_fwd_prog_nr(rfp, &tested, &tested_tries);
+
+ /* Avoid slow periods, better to test when busy. */
+ if (stutter_wait("rcu_torture_fwd_prog"))
+ sched_set_normal(current, oldnice);
+ } while (!torture_must_stop());
+ /* Short runs might not contain a valid forward-progress attempt. */
+ if (!rfp->rcu_fwd_id) {
+ WARN_ON(!tested && tested_tries >= 5);
+ pr_alert("%s: tested %d tested_tries %d\n", __func__, tested, tested_tries);
+ }
+ torture_kthread_stopping("rcu_torture_fwd_prog");
+ return 0;
+}
+
+/* If forward-progress checking is requested and feasible, spawn the thread. */
+static int __init rcu_torture_fwd_prog_init(void)
+{
+ int i;
+ int ret = 0;
+ struct rcu_fwd *rfp;
+
+ if (!fwd_progress)
+ return 0; /* Not requested, so don't do it. */
+ if (fwd_progress >= nr_cpu_ids) {
+ VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Limiting fwd_progress to # CPUs.\n");
+ fwd_progress = nr_cpu_ids;
+ } else if (fwd_progress < 0) {
+ fwd_progress = nr_cpu_ids;
+ }
+ if ((!cur_ops->sync && !cur_ops->call) ||
+ (!cur_ops->cbflood_max && (!cur_ops->stall_dur || cur_ops->stall_dur() <= 0)) ||
+ cur_ops == &rcu_busted_ops) {
+ VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Disabled, unsupported by RCU flavor under test");
+ fwd_progress = 0;
+ return 0;
+ }
+ if (stall_cpu > 0 || (preempt_duration > 0 && IS_ENABLED(CONFIG_RCU_NOCB_CPU))) {
+ VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Disabled, conflicts with CPU-stall and/or preemption testing");
+ fwd_progress = 0;
+ if (IS_MODULE(CONFIG_RCU_TORTURE_TEST))
+ return -EINVAL; /* In module, can fail back to user. */
+ WARN_ON(1); /* Make sure rcutorture scripting notices conflict. */
+ return 0;
+ }
+ if (fwd_progress_holdoff <= 0)
+ fwd_progress_holdoff = 1;
+ if (fwd_progress_div <= 0)
+ fwd_progress_div = 4;
+ rfp = kcalloc(fwd_progress, sizeof(*rfp), GFP_KERNEL);
+ fwd_prog_tasks = kcalloc(fwd_progress, sizeof(*fwd_prog_tasks), GFP_KERNEL);
+ if (!rfp || !fwd_prog_tasks) {
+ kfree(rfp);
+ kfree(fwd_prog_tasks);
+ fwd_prog_tasks = NULL;
+ fwd_progress = 0;
+ return -ENOMEM;
+ }
+ for (i = 0; i < fwd_progress; i++) {
+ spin_lock_init(&rfp[i].rcu_fwd_lock);
+ rfp[i].rcu_fwd_cb_tail = &rfp[i].rcu_fwd_cb_head;
+ rfp[i].rcu_fwd_id = i;
+ }
+ mutex_lock(&rcu_fwd_mutex);
+ rcu_fwds = rfp;
+ mutex_unlock(&rcu_fwd_mutex);
+ register_oom_notifier(&rcutorture_oom_nb);
+ for (i = 0; i < fwd_progress; i++) {
+ ret = torture_create_kthread(rcu_torture_fwd_prog, &rcu_fwds[i], fwd_prog_tasks[i]);
+ if (ret) {
+ fwd_progress = i;
+ return ret;
+ }
+ }
+ return 0;
+}
+
+static void rcu_torture_fwd_prog_cleanup(void)
+{
+ int i;
+ struct rcu_fwd *rfp;
+
+ if (!rcu_fwds || !fwd_prog_tasks)
+ return;
+ for (i = 0; i < fwd_progress; i++)
+ torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_tasks[i]);
+ unregister_oom_notifier(&rcutorture_oom_nb);
+ mutex_lock(&rcu_fwd_mutex);
+ rfp = rcu_fwds;
+ rcu_fwds = NULL;
+ mutex_unlock(&rcu_fwd_mutex);
+ kfree(rfp);
+ kfree(fwd_prog_tasks);
+ fwd_prog_tasks = NULL;
+}
+
/* Callback function for RCU barrier testing. */
static void rcu_torture_barrier_cbf(struct rcu_head *rcu)
{
atomic_inc(&barrier_cbs_invoked);
}
+/* IPI handler to get callback posted on desired CPU, if online. */
+static int rcu_torture_barrier1cb(void *rcu_void)
+{
+ struct rcu_head *rhp = rcu_void;
+
+ cur_ops->call(rhp, rcu_torture_barrier_cbf);
+ return 0;
+}
+
/* kthread function to register callbacks used to test RCU barriers. */
static int rcu_torture_barrier_cbs(void *arg)
{
long myid = (long)arg;
- bool lastphase = 0;
+ bool lastphase = false;
bool newphase;
struct rcu_head rcu;
@@ -1413,13 +3661,18 @@ static int rcu_torture_barrier_cbs(void *arg)
do {
wait_event(barrier_cbs_wq[myid],
(newphase =
- ACCESS_ONCE(barrier_phase)) != lastphase ||
+ smp_load_acquire(&barrier_phase)) != lastphase ||
torture_must_stop());
lastphase = newphase;
- smp_mb(); /* ensure barrier_phase load before ->call(). */
if (torture_must_stop())
break;
- cur_ops->call(&rcu, rcu_torture_barrier_cbf);
+ /*
+ * The above smp_load_acquire() ensures barrier_phase load
+ * is ordered before the following ->call().
+ */
+ if (smp_call_on_cpu(myid, rcu_torture_barrier1cb, &rcu, 1))
+ cur_ops->call(&rcu, rcu_torture_barrier_cbf);
+
if (atomic_dec_and_test(&barrier_cbs_count))
wake_up(&barrier_wq);
} while (!torture_must_stop());
@@ -1439,8 +3692,8 @@ static int rcu_torture_barrier(void *arg)
do {
atomic_set(&barrier_cbs_invoked, 0);
atomic_set(&barrier_cbs_count, n_barrier_cbs);
- smp_mb(); /* Ensure barrier_phase after prior assignments. */
- barrier_phase = !barrier_phase;
+ /* Ensure barrier_phase ordered after prior assignments. */
+ smp_store_release(&barrier_phase, !barrier_phase);
for (i = 0; i < n_barrier_cbs; i++)
wake_up(&barrier_cbs_wq[i]);
wait_event(barrier_wq,
@@ -1455,9 +3708,24 @@ static int rcu_torture_barrier(void *arg)
pr_err("barrier_cbs_invoked = %d, n_barrier_cbs = %d\n",
atomic_read(&barrier_cbs_invoked),
n_barrier_cbs);
- WARN_ON_ONCE(1);
+ WARN_ON(1);
+ // Wait manually for the remaining callbacks
+ i = 0;
+ do {
+ if (WARN_ON(i++ > HZ))
+ i = INT_MIN;
+ schedule_timeout_interruptible(1);
+ cur_ops->cb_barrier();
+ } while (atomic_read(&barrier_cbs_invoked) !=
+ n_barrier_cbs &&
+ !torture_must_stop());
+ smp_mb(); // Can't trust ordering if broken.
+ if (!torture_must_stop())
+ pr_err("Recovered: barrier_cbs_invoked = %d\n",
+ atomic_read(&barrier_cbs_invoked));
+ } else {
+ n_barrier_successes++;
}
- n_barrier_successes++;
schedule_timeout_interruptible(HZ / 10);
} while (!torture_must_stop());
torture_kthread_stopping("rcu_torture_barrier");
@@ -1470,7 +3738,7 @@ static int rcu_torture_barrier_init(void)
int i;
int ret;
- if (n_barrier_cbs == 0)
+ if (n_barrier_cbs <= 0)
return 0;
if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) {
pr_alert("%s" TORTURE_FLAG
@@ -1484,11 +3752,10 @@ static int rcu_torture_barrier_init(void)
atomic_set(&barrier_cbs_count, 0);
atomic_set(&barrier_cbs_invoked, 0);
barrier_cbs_tasks =
- kzalloc(n_barrier_cbs * sizeof(barrier_cbs_tasks[0]),
+ kcalloc(n_barrier_cbs, sizeof(barrier_cbs_tasks[0]),
GFP_KERNEL);
barrier_cbs_wq =
- kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]),
- GFP_KERNEL);
+ kcalloc(n_barrier_cbs, sizeof(barrier_cbs_wq[0]), GFP_KERNEL);
if (barrier_cbs_tasks == NULL || !barrier_cbs_wq)
return -ENOMEM;
for (i = 0; i < n_barrier_cbs; i++) {
@@ -1521,80 +3788,386 @@ static void rcu_torture_barrier_cleanup(void)
}
}
-static int rcutorture_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
+static bool rcu_torture_can_boost(void)
{
- long cpu = (long)hcpu;
+ static int boost_warn_once;
+ int prio;
- switch (action) {
- case CPU_ONLINE:
- case CPU_DOWN_FAILED:
- (void)rcutorture_booster_init(cpu);
- break;
- case CPU_DOWN_PREPARE:
- rcutorture_booster_cleanup(cpu);
- break;
- default:
- break;
+ if (!(test_boost == 1 && cur_ops->can_boost) && test_boost != 2)
+ return false;
+ if (!cur_ops->start_gp_poll || !cur_ops->poll_gp_state)
+ return false;
+
+ prio = rcu_get_gp_kthreads_prio();
+ if (!prio)
+ return false;
+
+ if (prio < 2) {
+ if (boost_warn_once == 1)
+ return false;
+
+ pr_alert("%s: WARN: RCU kthread priority too low to test boosting. Skipping RCU boost test. Try passing rcutree.kthread_prio > 1 on the kernel command line.\n", KBUILD_MODNAME);
+ boost_warn_once = 1;
+ return false;
}
- return NOTIFY_OK;
+
+ return true;
}
-static struct notifier_block rcutorture_cpu_nb = {
- .notifier_call = rcutorture_cpu_notify,
-};
+static bool read_exit_child_stop;
+static bool read_exit_child_stopped;
+static wait_queue_head_t read_exit_wq;
+
+// Child kthread which just does an rcutorture reader and exits.
+static int rcu_torture_read_exit_child(void *trsp_in)
+{
+ struct torture_random_state *trsp = trsp_in;
+ set_user_nice(current, MAX_NICE);
+ // Minimize time between reading and exiting.
+ while (!kthread_should_stop())
+ schedule_timeout_uninterruptible(HZ / 20);
+ (void)rcu_torture_one_read(trsp, -1);
+ return 0;
+}
+
+// Parent kthread which creates and destroys read-exit child kthreads.
+static int rcu_torture_read_exit(void *unused)
+{
+ bool errexit = false;
+ int i;
+ struct task_struct *tsp;
+ DEFINE_TORTURE_RANDOM(trs);
+
+ // Allocate and initialize.
+ set_user_nice(current, MAX_NICE);
+ VERBOSE_TOROUT_STRING("rcu_torture_read_exit: Start of test");
+
+ // Each pass through this loop does one read-exit episode.
+ do {
+ VERBOSE_TOROUT_STRING("rcu_torture_read_exit: Start of episode");
+ for (i = 0; i < read_exit_burst; i++) {
+ if (READ_ONCE(read_exit_child_stop))
+ break;
+ stutter_wait("rcu_torture_read_exit");
+ // Spawn child.
+ tsp = kthread_run(rcu_torture_read_exit_child,
+ &trs, "%s", "rcu_torture_read_exit_child");
+ if (IS_ERR(tsp)) {
+ TOROUT_ERRSTRING("out of memory");
+ errexit = true;
+ break;
+ }
+ cond_resched();
+ kthread_stop(tsp);
+ n_read_exits++;
+ }
+ VERBOSE_TOROUT_STRING("rcu_torture_read_exit: End of episode");
+ rcu_barrier(); // Wait for task_struct free, avoid OOM.
+ i = 0;
+ for (; !errexit && !READ_ONCE(read_exit_child_stop) && i < read_exit_delay; i++)
+ schedule_timeout_uninterruptible(HZ);
+ } while (!errexit && !READ_ONCE(read_exit_child_stop));
+
+ // Clean up and exit.
+ smp_store_release(&read_exit_child_stopped, true); // After reaping.
+ smp_mb(); // Store before wakeup.
+ wake_up(&read_exit_wq);
+ while (!torture_must_stop())
+ schedule_timeout_uninterruptible(HZ / 20);
+ torture_kthread_stopping("rcu_torture_read_exit");
+ return 0;
+}
+
+static int rcu_torture_read_exit_init(void)
+{
+ if (read_exit_burst <= 0)
+ return 0;
+ init_waitqueue_head(&read_exit_wq);
+ read_exit_child_stop = false;
+ read_exit_child_stopped = false;
+ return torture_create_kthread(rcu_torture_read_exit, NULL,
+ read_exit_task);
+}
+
+static void rcu_torture_read_exit_cleanup(void)
+{
+ if (!read_exit_task)
+ return;
+ WRITE_ONCE(read_exit_child_stop, true);
+ smp_mb(); // Above write before wait.
+ wait_event(read_exit_wq, smp_load_acquire(&read_exit_child_stopped));
+ torture_stop_kthread(rcutorture_read_exit, read_exit_task);
+}
+
+static void rcutorture_test_nmis(int n)
+{
+#if IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)
+ int cpu;
+ int dumpcpu;
+ int i;
+
+ for (i = 0; i < n; i++) {
+ preempt_disable();
+ cpu = smp_processor_id();
+ dumpcpu = cpu + 1;
+ if (dumpcpu >= nr_cpu_ids)
+ dumpcpu = 0;
+ pr_alert("%s: CPU %d invoking dump_cpu_task(%d)\n", __func__, cpu, dumpcpu);
+ dump_cpu_task(dumpcpu);
+ preempt_enable();
+ schedule_timeout_uninterruptible(15 * HZ);
+ }
+#else // #if IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)
+ WARN_ONCE(n, "Non-zero rcutorture.test_nmis=%d permitted only when rcutorture is built in.\n", test_nmis);
+#endif // #else // #if IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)
+}
+
+// Randomly preempt online CPUs.
+static int rcu_torture_preempt(void *unused)
+{
+ int cpu = -1;
+ DEFINE_TORTURE_RANDOM(rand);
+
+ schedule_timeout_idle(stall_cpu_holdoff);
+ do {
+ // Wait for preempt_interval ms with up to 100us fuzz.
+ torture_hrtimeout_ms(preempt_interval, 100, &rand);
+ // Select online CPU.
+ cpu = cpumask_next(cpu, cpu_online_mask);
+ if (cpu >= nr_cpu_ids)
+ cpu = cpumask_next(-1, cpu_online_mask);
+ WARN_ON_ONCE(cpu >= nr_cpu_ids);
+ // Move to that CPU, if can't do so, retry later.
+ if (torture_sched_setaffinity(current->pid, cpumask_of(cpu), false))
+ continue;
+ // Preempt at high-ish priority, then reset to normal.
+ sched_set_fifo(current);
+ torture_sched_setaffinity(current->pid, cpu_present_mask, true);
+ mdelay(preempt_duration);
+ sched_set_normal(current, 0);
+ stutter_wait("rcu_torture_preempt");
+ } while (!torture_must_stop());
+ torture_kthread_stopping("rcu_torture_preempt");
+ return 0;
+}
+
+static enum cpuhp_state rcutor_hp;
+
+static struct hrtimer gpwrap_lag_timer;
+static bool gpwrap_lag_active;
+
+/* Timer handler for toggling RCU grace-period sequence overflow test lag value */
+static enum hrtimer_restart rcu_gpwrap_lag_timer(struct hrtimer *timer)
+{
+ ktime_t next_delay;
+
+ if (gpwrap_lag_active) {
+ pr_alert("rcu-torture: Disabling gpwrap lag (value=0)\n");
+ cur_ops->set_gpwrap_lag(0);
+ gpwrap_lag_active = false;
+ next_delay = ktime_set((gpwrap_lag_cycle_mins - gpwrap_lag_active_mins) * 60, 0);
+ } else {
+ pr_alert("rcu-torture: Enabling gpwrap lag (value=%d)\n", gpwrap_lag_gps);
+ cur_ops->set_gpwrap_lag(gpwrap_lag_gps);
+ gpwrap_lag_active = true;
+ next_delay = ktime_set(gpwrap_lag_active_mins * 60, 0);
+ }
+
+ if (torture_must_stop_irq())
+ return HRTIMER_NORESTART;
+
+ hrtimer_forward_now(timer, next_delay);
+ return HRTIMER_RESTART;
+}
+
+static int rcu_gpwrap_lag_init(void)
+{
+ if (!gpwrap_lag)
+ return 0;
+
+ if (gpwrap_lag_cycle_mins <= 0 || gpwrap_lag_active_mins <= 0) {
+ pr_alert("rcu-torture: lag timing parameters must be positive\n");
+ return -EINVAL;
+ }
+
+ hrtimer_setup(&gpwrap_lag_timer, rcu_gpwrap_lag_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ gpwrap_lag_active = false;
+ hrtimer_start(&gpwrap_lag_timer,
+ ktime_set((gpwrap_lag_cycle_mins - gpwrap_lag_active_mins) * 60, 0), HRTIMER_MODE_REL);
+
+ return 0;
+}
+
+static void rcu_gpwrap_lag_cleanup(void)
+{
+ hrtimer_cancel(&gpwrap_lag_timer);
+ cur_ops->set_gpwrap_lag(0);
+ gpwrap_lag_active = false;
+}
static void
rcu_torture_cleanup(void)
{
+ int firsttime;
+ int flags = 0;
+ unsigned long gp_seq = 0;
int i;
+ int j;
- rcutorture_record_test_transition();
if (torture_cleanup_begin()) {
- if (cur_ops->cb_barrier != NULL)
+ if (cur_ops->cb_barrier != NULL) {
+ pr_info("%s: Invoking %pS().\n", __func__, cur_ops->cb_barrier);
cur_ops->cb_barrier();
+ }
+ if (cur_ops->gp_slow_unregister)
+ cur_ops->gp_slow_unregister(NULL);
+ return;
+ }
+ if (!cur_ops) {
+ torture_cleanup_end();
return;
}
+ rcutorture_test_nmis(test_nmis);
+
+ if (cur_ops->gp_kthread_dbg)
+ cur_ops->gp_kthread_dbg();
+ torture_stop_kthread(rcu_torture_preempt, preempt_task);
+ rcu_torture_read_exit_cleanup();
rcu_torture_barrier_cleanup();
+ rcu_torture_fwd_prog_cleanup();
torture_stop_kthread(rcu_torture_stall, stall_task);
torture_stop_kthread(rcu_torture_writer, writer_task);
+ if (nocb_tasks) {
+ for (i = 0; i < nrealnocbers; i++)
+ torture_stop_kthread(rcu_nocb_toggle, nocb_tasks[i]);
+ kfree(nocb_tasks);
+ nocb_tasks = NULL;
+ }
+
+ if (updown_task) {
+ torture_stop_kthread(rcu_torture_updown, updown_task);
+ updown_task = NULL;
+ }
if (reader_tasks) {
for (i = 0; i < nrealreaders; i++)
torture_stop_kthread(rcu_torture_reader,
reader_tasks[i]);
kfree(reader_tasks);
+ reader_tasks = NULL;
}
- rcu_torture_current = NULL;
+ kfree(rcu_torture_reader_mbchk);
+ rcu_torture_reader_mbchk = NULL;
if (fakewriter_tasks) {
- for (i = 0; i < nfakewriters; i++) {
+ for (i = 0; i < nrealfakewriters; i++)
torture_stop_kthread(rcu_torture_fakewriter,
fakewriter_tasks[i]);
- }
kfree(fakewriter_tasks);
fakewriter_tasks = NULL;
}
+ if (cur_ops->get_gp_data)
+ cur_ops->get_gp_data(&flags, &gp_seq);
+ pr_alert("%s: End-test grace-period state: g%ld f%#x total-gps=%ld\n",
+ cur_ops->name, (long)gp_seq, flags,
+ rcutorture_seq_diff(gp_seq, start_gp_seq));
torture_stop_kthread(rcu_torture_stats, stats_task);
torture_stop_kthread(rcu_torture_fqs, fqs_task);
- for (i = 0; i < ncbflooders; i++)
- torture_stop_kthread(rcu_torture_cbflood, cbflood_task[i]);
- if ((test_boost == 1 && cur_ops->can_boost) ||
- test_boost == 2) {
- unregister_cpu_notifier(&rcutorture_cpu_nb);
- for_each_possible_cpu(i)
- rcutorture_booster_cleanup(i);
- }
-
- /* Wait for all RCU callbacks to fire. */
+ if (rcu_torture_can_boost() && rcutor_hp >= 0)
+ cpuhp_remove_state(rcutor_hp);
- if (cur_ops->cb_barrier != NULL)
+ /*
+ * Wait for all RCU callbacks to fire, then do torture-type-specific
+ * cleanup operations.
+ */
+ if (cur_ops->cb_barrier != NULL) {
+ pr_info("%s: Invoking %pS().\n", __func__, cur_ops->cb_barrier);
cur_ops->cb_barrier();
+ }
+ if (cur_ops->cleanup != NULL)
+ cur_ops->cleanup();
+
+ rcu_torture_mem_dump_obj();
rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
+ if (err_segs_recorded) {
+ pr_alert("Failure/close-call rcutorture reader segments:\n");
+ if (rt_read_nsegs == 0)
+ pr_alert("\t: No segments recorded!!!\n");
+ firsttime = 1;
+ for (i = 0; i < rt_read_nsegs; i++) {
+ if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_GP))
+ pr_alert("\t%lluus ", div64_u64(err_segs[i].rt_ts, 1000ULL));
+ else
+ pr_alert("\t");
+ pr_cont("%d: %#4x", i, err_segs[i].rt_readstate);
+ if (err_segs[i].rt_delay_jiffies != 0) {
+ pr_cont("%s%ldjiffies", firsttime ? "" : "+",
+ err_segs[i].rt_delay_jiffies);
+ firsttime = 0;
+ }
+ if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_CPU)) {
+ pr_cont(" CPU %2d", err_segs[i].rt_cpu);
+ if (err_segs[i].rt_cpu != err_segs[i].rt_end_cpu)
+ pr_cont("->%-2d", err_segs[i].rt_end_cpu);
+ else
+ pr_cont(" ...");
+ }
+ if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_GP) &&
+ cur_ops->gather_gp_seqs && cur_ops->format_gp_seqs) {
+ char buf1[20+1];
+ char buf2[20+1];
+ char sepchar = '-';
+
+ cur_ops->format_gp_seqs(err_segs[i].rt_gp_seq,
+ buf1, ARRAY_SIZE(buf1));
+ cur_ops->format_gp_seqs(err_segs[i].rt_gp_seq_end,
+ buf2, ARRAY_SIZE(buf2));
+ if (err_segs[i].rt_gp_seq == err_segs[i].rt_gp_seq_end) {
+ if (buf2[0]) {
+ for (j = 0; buf2[j]; j++)
+ buf2[j] = '.';
+ if (j)
+ buf2[j - 1] = ' ';
+ }
+ sepchar = ' ';
+ }
+ pr_cont(" %s%c%s", buf1, sepchar, buf2);
+ }
+ if (err_segs[i].rt_delay_ms != 0) {
+ pr_cont(" %s%ldms", firsttime ? "" : "+",
+ err_segs[i].rt_delay_ms);
+ firsttime = 0;
+ }
+ if (err_segs[i].rt_delay_us != 0) {
+ pr_cont(" %s%ldus", firsttime ? "" : "+",
+ err_segs[i].rt_delay_us);
+ firsttime = 0;
+ }
+ pr_cont("%s", err_segs[i].rt_preempted ? " preempted" : "");
+ if (err_segs[i].rt_readstate & RCUTORTURE_RDR_BH)
+ pr_cont(" BH");
+ if (err_segs[i].rt_readstate & RCUTORTURE_RDR_IRQ)
+ pr_cont(" IRQ");
+ if (err_segs[i].rt_readstate & RCUTORTURE_RDR_PREEMPT)
+ pr_cont(" PREEMPT");
+ if (err_segs[i].rt_readstate & RCUTORTURE_RDR_RBH)
+ pr_cont(" RBH");
+ if (err_segs[i].rt_readstate & RCUTORTURE_RDR_SCHED)
+ pr_cont(" SCHED");
+ if (err_segs[i].rt_readstate & RCUTORTURE_RDR_RCU_1)
+ pr_cont(" RCU_1");
+ if (err_segs[i].rt_readstate & RCUTORTURE_RDR_RCU_2)
+ pr_cont(" RCU_2");
+ pr_cont("\n");
+
+ }
+ if (rt_read_preempted)
+ pr_alert("\tReader was preempted.\n");
+ }
if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error)
rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE");
else if (torture_onoff_failures())
@@ -1603,9 +4176,13 @@ rcu_torture_cleanup(void)
else
rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS");
torture_cleanup_end();
+ if (cur_ops->gp_slow_unregister)
+ cur_ops->gp_slow_unregister(NULL);
+
+ if (gpwrap_lag && cur_ops->set_gpwrap_lag)
+ rcu_gpwrap_lag_cleanup();
}
-#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
static void rcu_torture_leak_cb(struct rcu_head *rhp)
{
}
@@ -1621,9 +4198,8 @@ static void rcu_torture_err_cb(struct rcu_head *rhp)
* next grace period. Unlikely, but can happen. If it
* does happen, the debug-objects subsystem won't have splatted.
*/
- pr_alert("rcutorture: duplicated callback was invoked.\n");
+ pr_alert("%s: duplicated callback was invoked.\n", KBUILD_MODNAME);
}
-#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
/*
* Verify that double-free causes debug-objects to complain, but only
@@ -1632,47 +4208,250 @@ static void rcu_torture_err_cb(struct rcu_head *rhp)
*/
static void rcu_test_debug_objects(void)
{
-#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
struct rcu_head rh1;
struct rcu_head rh2;
+ int idx;
+
+ if (!IS_ENABLED(CONFIG_DEBUG_OBJECTS_RCU_HEAD)) {
+ pr_alert("%s: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_%s()\n",
+ KBUILD_MODNAME, cur_ops->name);
+ return;
+ }
+
+ if (WARN_ON_ONCE(cur_ops->debug_objects &&
+ (!cur_ops->call || !cur_ops->cb_barrier)))
+ return;
+
+ struct rcu_head *rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
init_rcu_head_on_stack(&rh1);
init_rcu_head_on_stack(&rh2);
- pr_alert("rcutorture: WARN: Duplicate call_rcu() test starting.\n");
+ pr_alert("%s: WARN: Duplicate call_%s() test starting.\n", KBUILD_MODNAME, cur_ops->name);
/* Try to queue the rh2 pair of callbacks for the same grace period. */
- preempt_disable(); /* Prevent preemption from interrupting test. */
- rcu_read_lock(); /* Make it impossible to finish a grace period. */
- call_rcu(&rh1, rcu_torture_leak_cb); /* Start grace period. */
- local_irq_disable(); /* Make it harder to start a new grace period. */
- call_rcu(&rh2, rcu_torture_leak_cb);
- call_rcu(&rh2, rcu_torture_err_cb); /* Duplicate callback. */
- local_irq_enable();
- rcu_read_unlock();
- preempt_enable();
+ idx = cur_ops->readlock(); /* Make it impossible to finish a grace period. */
+ cur_ops->call(&rh1, rcu_torture_leak_cb); /* Start grace period. */
+ cur_ops->call(&rh2, rcu_torture_leak_cb);
+ cur_ops->call(&rh2, rcu_torture_err_cb); /* Duplicate callback. */
+ if (rhp) {
+ cur_ops->call(rhp, rcu_torture_leak_cb);
+ cur_ops->call(rhp, rcu_torture_err_cb); /* Another duplicate callback. */
+ }
+ cur_ops->readunlock(idx);
/* Wait for them all to get done so we can safely return. */
- rcu_barrier();
- pr_alert("rcutorture: WARN: Duplicate call_rcu() test complete.\n");
+ cur_ops->cb_barrier();
+ pr_alert("%s: WARN: Duplicate call_%s() test complete.\n", KBUILD_MODNAME, cur_ops->name);
destroy_rcu_head_on_stack(&rh1);
destroy_rcu_head_on_stack(&rh2);
-#else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
- pr_alert("rcutorture: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n");
-#endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
+ kfree(rhp);
+}
+
+static void rcutorture_sync(void)
+{
+ static unsigned long n;
+
+ if (cur_ops->sync && !(++n & 0xfff))
+ cur_ops->sync();
+}
+
+static DEFINE_MUTEX(mut0);
+static DEFINE_MUTEX(mut1);
+static DEFINE_MUTEX(mut2);
+static DEFINE_MUTEX(mut3);
+static DEFINE_MUTEX(mut4);
+static DEFINE_MUTEX(mut5);
+static DEFINE_MUTEX(mut6);
+static DEFINE_MUTEX(mut7);
+static DEFINE_MUTEX(mut8);
+static DEFINE_MUTEX(mut9);
+
+static DECLARE_RWSEM(rwsem0);
+static DECLARE_RWSEM(rwsem1);
+static DECLARE_RWSEM(rwsem2);
+static DECLARE_RWSEM(rwsem3);
+static DECLARE_RWSEM(rwsem4);
+static DECLARE_RWSEM(rwsem5);
+static DECLARE_RWSEM(rwsem6);
+static DECLARE_RWSEM(rwsem7);
+static DECLARE_RWSEM(rwsem8);
+static DECLARE_RWSEM(rwsem9);
+
+DEFINE_STATIC_SRCU(srcu0);
+DEFINE_STATIC_SRCU(srcu1);
+DEFINE_STATIC_SRCU(srcu2);
+DEFINE_STATIC_SRCU(srcu3);
+DEFINE_STATIC_SRCU(srcu4);
+DEFINE_STATIC_SRCU(srcu5);
+DEFINE_STATIC_SRCU(srcu6);
+DEFINE_STATIC_SRCU(srcu7);
+DEFINE_STATIC_SRCU(srcu8);
+DEFINE_STATIC_SRCU(srcu9);
+
+static int srcu_lockdep_next(const char *f, const char *fl, const char *fs, const char *fu, int i,
+ int cyclelen, int deadlock)
+{
+ int j = i + 1;
+
+ if (j >= cyclelen)
+ j = deadlock ? 0 : -1;
+ if (j >= 0)
+ pr_info("%s: %s(%d), %s(%d), %s(%d)\n", f, fl, i, fs, j, fu, i);
+ else
+ pr_info("%s: %s(%d), %s(%d)\n", f, fl, i, fu, i);
+ return j;
+}
+
+// Test lockdep on SRCU-based deadlock scenarios.
+static void rcu_torture_init_srcu_lockdep(void)
+{
+ int cyclelen;
+ int deadlock;
+ bool err = false;
+ int i;
+ int j;
+ int idx;
+ struct mutex *muts[] = { &mut0, &mut1, &mut2, &mut3, &mut4,
+ &mut5, &mut6, &mut7, &mut8, &mut9 };
+ struct rw_semaphore *rwsems[] = { &rwsem0, &rwsem1, &rwsem2, &rwsem3, &rwsem4,
+ &rwsem5, &rwsem6, &rwsem7, &rwsem8, &rwsem9 };
+ struct srcu_struct *srcus[] = { &srcu0, &srcu1, &srcu2, &srcu3, &srcu4,
+ &srcu5, &srcu6, &srcu7, &srcu8, &srcu9 };
+ int testtype;
+
+ if (!test_srcu_lockdep)
+ return;
+
+ deadlock = test_srcu_lockdep / 1000;
+ testtype = (test_srcu_lockdep / 10) % 100;
+ cyclelen = test_srcu_lockdep % 10;
+ WARN_ON_ONCE(ARRAY_SIZE(muts) != ARRAY_SIZE(srcus));
+ if (WARN_ONCE(deadlock != !!deadlock,
+ "%s: test_srcu_lockdep=%d and deadlock digit %d must be zero or one.\n",
+ __func__, test_srcu_lockdep, deadlock))
+ err = true;
+ if (WARN_ONCE(cyclelen <= 0,
+ "%s: test_srcu_lockdep=%d and cycle-length digit %d must be greater than zero.\n",
+ __func__, test_srcu_lockdep, cyclelen))
+ err = true;
+ if (err)
+ goto err_out;
+
+ if (testtype == 0) {
+ pr_info("%s: test_srcu_lockdep = %05d: SRCU %d-way %sdeadlock.\n",
+ __func__, test_srcu_lockdep, cyclelen, deadlock ? "" : "non-");
+ if (deadlock && cyclelen == 1)
+ pr_info("%s: Expect hang.\n", __func__);
+ for (i = 0; i < cyclelen; i++) {
+ j = srcu_lockdep_next(__func__, "srcu_read_lock", "synchronize_srcu",
+ "srcu_read_unlock", i, cyclelen, deadlock);
+ idx = srcu_read_lock(srcus[i]);
+ if (j >= 0)
+ synchronize_srcu(srcus[j]);
+ srcu_read_unlock(srcus[i], idx);
+ }
+ return;
+ }
+
+ if (testtype == 1) {
+ pr_info("%s: test_srcu_lockdep = %05d: SRCU/mutex %d-way %sdeadlock.\n",
+ __func__, test_srcu_lockdep, cyclelen, deadlock ? "" : "non-");
+ for (i = 0; i < cyclelen; i++) {
+ pr_info("%s: srcu_read_lock(%d), mutex_lock(%d), mutex_unlock(%d), srcu_read_unlock(%d)\n",
+ __func__, i, i, i, i);
+ idx = srcu_read_lock(srcus[i]);
+ mutex_lock(muts[i]);
+ mutex_unlock(muts[i]);
+ srcu_read_unlock(srcus[i], idx);
+
+ j = srcu_lockdep_next(__func__, "mutex_lock", "synchronize_srcu",
+ "mutex_unlock", i, cyclelen, deadlock);
+ mutex_lock(muts[i]);
+ if (j >= 0)
+ synchronize_srcu(srcus[j]);
+ mutex_unlock(muts[i]);
+ }
+ return;
+ }
+
+ if (testtype == 2) {
+ pr_info("%s: test_srcu_lockdep = %05d: SRCU/rwsem %d-way %sdeadlock.\n",
+ __func__, test_srcu_lockdep, cyclelen, deadlock ? "" : "non-");
+ for (i = 0; i < cyclelen; i++) {
+ pr_info("%s: srcu_read_lock(%d), down_read(%d), up_read(%d), srcu_read_unlock(%d)\n",
+ __func__, i, i, i, i);
+ idx = srcu_read_lock(srcus[i]);
+ down_read(rwsems[i]);
+ up_read(rwsems[i]);
+ srcu_read_unlock(srcus[i], idx);
+
+ j = srcu_lockdep_next(__func__, "down_write", "synchronize_srcu",
+ "up_write", i, cyclelen, deadlock);
+ down_write(rwsems[i]);
+ if (j >= 0)
+ synchronize_srcu(srcus[j]);
+ up_write(rwsems[i]);
+ }
+ return;
+ }
+
+#ifdef CONFIG_TASKS_TRACE_RCU
+ if (testtype == 3) {
+ pr_info("%s: test_srcu_lockdep = %05d: SRCU and Tasks Trace RCU %d-way %sdeadlock.\n",
+ __func__, test_srcu_lockdep, cyclelen, deadlock ? "" : "non-");
+ if (deadlock && cyclelen == 1)
+ pr_info("%s: Expect hang.\n", __func__);
+ for (i = 0; i < cyclelen; i++) {
+ char *fl = i == 0 ? "rcu_read_lock_trace" : "srcu_read_lock";
+ char *fs = i == cyclelen - 1 ? "synchronize_rcu_tasks_trace"
+ : "synchronize_srcu";
+ char *fu = i == 0 ? "rcu_read_unlock_trace" : "srcu_read_unlock";
+
+ j = srcu_lockdep_next(__func__, fl, fs, fu, i, cyclelen, deadlock);
+ if (i == 0)
+ rcu_read_lock_trace();
+ else
+ idx = srcu_read_lock(srcus[i]);
+ if (j >= 0) {
+ if (i == cyclelen - 1)
+ synchronize_rcu_tasks_trace();
+ else
+ synchronize_srcu(srcus[j]);
+ }
+ if (i == 0)
+ rcu_read_unlock_trace();
+ else
+ srcu_read_unlock(srcus[i], idx);
+ }
+ return;
+ }
+#endif // #ifdef CONFIG_TASKS_TRACE_RCU
+
+err_out:
+ pr_info("%s: test_srcu_lockdep = %05d does nothing.\n", __func__, test_srcu_lockdep);
+ pr_info("%s: test_srcu_lockdep = DNNL.\n", __func__);
+ pr_info("%s: D: Deadlock if nonzero.\n", __func__);
+ pr_info("%s: NN: Test number, 0=SRCU, 1=SRCU/mutex, 2=SRCU/rwsem, 3=SRCU/Tasks Trace RCU.\n", __func__);
+ pr_info("%s: L: Cycle length.\n", __func__);
+ if (!IS_ENABLED(CONFIG_TASKS_TRACE_RCU))
+ pr_info("%s: NN=3 disallowed because kernel is built with CONFIG_TASKS_TRACE_RCU=n\n", __func__);
}
static int __init
rcu_torture_init(void)
{
- int i;
+ long i;
int cpu;
int firsterr = 0;
+ int flags = 0;
+ unsigned long gp_seq = 0;
static struct rcu_torture_ops *torture_ops[] = {
- &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &sched_ops,
- RCUTORTURE_TASKS_OPS
+ &rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, &busted_srcud_ops,
+ TASKS_OPS TASKS_RUDE_OPS TASKS_TRACING_OPS
+ &trivial_ops,
};
- if (!torture_init_begin(torture_type, verbose, &torture_runnable))
+ if (!torture_init_begin(torture_type, verbose))
return -EBUSY;
/* Process args and tell the world that the torturer is on the job. */
@@ -1686,26 +4465,48 @@ rcu_torture_init(void)
torture_type);
pr_alert("rcu-torture types:");
for (i = 0; i < ARRAY_SIZE(torture_ops); i++)
- pr_alert(" %s", torture_ops[i]->name);
- pr_alert("\n");
- torture_init_end();
- return -EINVAL;
+ pr_cont(" %s", torture_ops[i]->name);
+ pr_cont("\n");
+ firsterr = -EINVAL;
+ cur_ops = NULL;
+ goto unwind;
}
if (cur_ops->fqs == NULL && fqs_duration != 0) {
pr_alert("rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n");
fqs_duration = 0;
}
+ if (nocbs_nthreads != 0 && (cur_ops != &rcu_ops ||
+ !IS_ENABLED(CONFIG_RCU_NOCB_CPU))) {
+ pr_alert("rcu-torture types: %s and CONFIG_RCU_NOCB_CPU=%d, nocb toggle disabled.\n",
+ cur_ops->name, IS_ENABLED(CONFIG_RCU_NOCB_CPU));
+ nocbs_nthreads = 0;
+ }
if (cur_ops->init)
- cur_ops->init(); /* no "goto unwind" prior to this point!!! */
+ cur_ops->init();
+
+ rcu_torture_init_srcu_lockdep();
+
+ if (nfakewriters >= 0) {
+ nrealfakewriters = nfakewriters;
+ } else {
+ nrealfakewriters = num_online_cpus() - 2 - nfakewriters;
+ if (nrealfakewriters <= 0)
+ nrealfakewriters = 1;
+ }
if (nreaders >= 0) {
nrealreaders = nreaders;
} else {
- nrealreaders = num_online_cpus() - 1;
+ nrealreaders = num_online_cpus() - 2 - nreaders;
if (nrealreaders <= 0)
nrealreaders = 1;
}
rcu_torture_print_module_parms(cur_ops, "Start of test");
+ if (cur_ops->get_gp_data)
+ cur_ops->get_gp_data(&flags, &gp_seq);
+ start_gp_seq = gp_seq;
+ pr_alert("%s: Start-test grace-period state: g%ld f%#x\n",
+ cur_ops->name, (long)gp_seq, flags);
/* Set up the freelist. */
@@ -1724,10 +4525,11 @@ rcu_torture_init(void)
atomic_set(&n_rcu_torture_alloc_fail, 0);
atomic_set(&n_rcu_torture_free, 0);
atomic_set(&n_rcu_torture_mberror, 0);
+ atomic_set(&n_rcu_torture_mbchk_fail, 0);
+ atomic_set(&n_rcu_torture_mbchk_tries, 0);
atomic_set(&n_rcu_torture_error, 0);
n_rcu_torture_barrier_error = 0;
n_rcu_torture_boost_ktrerror = 0;
- n_rcu_torture_boost_rterror = 0;
n_rcu_torture_boost_failure = 0;
n_rcu_torture_boosts = 0;
for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
@@ -1738,122 +4540,167 @@ rcu_torture_init(void)
per_cpu(rcu_torture_batch, cpu)[i] = 0;
}
}
+ err_segs_recorded = 0;
+ rt_read_nsegs = 0;
/* Start up the kthreads. */
- firsterr = torture_create_kthread(rcu_torture_writer, NULL,
- writer_task);
- if (firsterr)
- goto unwind;
- fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]),
- GFP_KERNEL);
- if (fakewriter_tasks == NULL) {
- VERBOSE_TOROUT_ERRSTRING("out of memory");
- firsterr = -ENOMEM;
- goto unwind;
+ rcu_torture_write_types();
+ if (nrealfakewriters > 0) {
+ fakewriter_tasks = kcalloc(nrealfakewriters,
+ sizeof(fakewriter_tasks[0]),
+ GFP_KERNEL);
+ if (fakewriter_tasks == NULL) {
+ TOROUT_ERRSTRING("out of memory");
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
}
- for (i = 0; i < nfakewriters; i++) {
+ for (i = 0; i < nrealfakewriters; i++) {
firsterr = torture_create_kthread(rcu_torture_fakewriter,
NULL, fakewriter_tasks[i]);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
- reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]),
+ reader_tasks = kcalloc(nrealreaders, sizeof(reader_tasks[0]),
GFP_KERNEL);
- if (reader_tasks == NULL) {
- VERBOSE_TOROUT_ERRSTRING("out of memory");
+ rcu_torture_reader_mbchk = kcalloc(nrealreaders, sizeof(*rcu_torture_reader_mbchk),
+ GFP_KERNEL);
+ if (!reader_tasks || !rcu_torture_reader_mbchk) {
+ TOROUT_ERRSTRING("out of memory");
firsterr = -ENOMEM;
goto unwind;
}
for (i = 0; i < nrealreaders; i++) {
- firsterr = torture_create_kthread(rcu_torture_reader, NULL,
+ rcu_torture_reader_mbchk[i].rtc_chkrdr = -1;
+ firsterr = torture_create_kthread(rcu_torture_reader, (void *)i,
reader_tasks[i]);
- if (firsterr)
+ if (torture_init_error(firsterr))
+ goto unwind;
+ }
+
+ firsterr = torture_create_kthread(rcu_torture_writer, NULL,
+ writer_task);
+ if (torture_init_error(firsterr))
+ goto unwind;
+
+ firsterr = rcu_torture_updown_init();
+ if (torture_init_error(firsterr))
+ goto unwind;
+ nrealnocbers = nocbs_nthreads;
+ if (WARN_ON(nrealnocbers < 0))
+ nrealnocbers = 1;
+ if (WARN_ON(nocbs_toggle < 0))
+ nocbs_toggle = HZ;
+ if (nrealnocbers > 0) {
+ nocb_tasks = kcalloc(nrealnocbers, sizeof(nocb_tasks[0]), GFP_KERNEL);
+ if (nocb_tasks == NULL) {
+ TOROUT_ERRSTRING("out of memory");
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
+ } else {
+ nocb_tasks = NULL;
+ }
+ for (i = 0; i < nrealnocbers; i++) {
+ firsterr = torture_create_kthread(rcu_nocb_toggle, NULL, nocb_tasks[i]);
+ if (torture_init_error(firsterr))
goto unwind;
}
if (stat_interval > 0) {
firsterr = torture_create_kthread(rcu_torture_stats, NULL,
stats_task);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
- if (test_no_idle_hz) {
+ if (test_no_idle_hz && shuffle_interval > 0) {
firsterr = torture_shuffle_init(shuffle_interval * HZ);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
if (stutter < 0)
stutter = 0;
if (stutter) {
- firsterr = torture_stutter_init(stutter * HZ);
- if (firsterr)
+ int t;
+
+ t = cur_ops->stall_dur ? cur_ops->stall_dur() : stutter * HZ;
+ firsterr = torture_stutter_init(stutter * HZ, t);
+ if (torture_init_error(firsterr))
goto unwind;
}
if (fqs_duration < 0)
fqs_duration = 0;
- if (fqs_duration) {
+ if (fqs_holdoff < 0)
+ fqs_holdoff = 0;
+ if (fqs_duration && fqs_holdoff) {
/* Create the fqs thread */
firsterr = torture_create_kthread(rcu_torture_fqs, NULL,
fqs_task);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
if (test_boost_interval < 1)
test_boost_interval = 1;
if (test_boost_duration < 2)
test_boost_duration = 2;
- if ((test_boost == 1 && cur_ops->can_boost) ||
- test_boost == 2) {
+ if (rcu_torture_can_boost()) {
boost_starttime = jiffies + test_boost_interval * HZ;
- register_cpu_notifier(&rcutorture_cpu_nb);
- for_each_possible_cpu(i) {
- if (cpu_is_offline(i))
- continue; /* Heuristic: CPU can go offline. */
- firsterr = rcutorture_booster_init(i);
- if (firsterr)
- goto unwind;
- }
+
+ firsterr = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "RCU_TORTURE",
+ rcutorture_booster_init,
+ rcutorture_booster_cleanup);
+ rcutor_hp = firsterr;
+ if (torture_init_error(firsterr))
+ goto unwind;
}
+ shutdown_jiffies = jiffies + shutdown_secs * HZ;
firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
- firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval * HZ);
- if (firsterr)
+ firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval,
+ rcutorture_sync);
+ if (torture_init_error(firsterr))
goto unwind;
firsterr = rcu_torture_stall_init();
- if (firsterr)
+ if (torture_init_error(firsterr))
+ goto unwind;
+ firsterr = rcu_torture_fwd_prog_init();
+ if (torture_init_error(firsterr))
goto unwind;
firsterr = rcu_torture_barrier_init();
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
+ firsterr = rcu_torture_read_exit_init();
+ if (torture_init_error(firsterr))
+ goto unwind;
+ if (preempt_duration > 0) {
+ firsterr = torture_create_kthread(rcu_torture_preempt, NULL, preempt_task);
+ if (torture_init_error(firsterr))
+ goto unwind;
+ }
if (object_debug)
rcu_test_debug_objects();
- if (cbflood_n_burst > 0) {
- /* Create the cbflood threads */
- ncbflooders = (num_online_cpus() + 3) / 4;
- cbflood_task = kcalloc(ncbflooders, sizeof(*cbflood_task),
- GFP_KERNEL);
- if (!cbflood_task) {
- VERBOSE_TOROUT_ERRSTRING("out of memory");
- firsterr = -ENOMEM;
+
+ if (cur_ops->gp_slow_register && !WARN_ON_ONCE(!cur_ops->gp_slow_unregister))
+ cur_ops->gp_slow_register(&rcu_fwd_cb_nodelay);
+
+ if (gpwrap_lag && cur_ops->set_gpwrap_lag) {
+ firsterr = rcu_gpwrap_lag_init();
+ if (torture_init_error(firsterr))
goto unwind;
- }
- for (i = 0; i < ncbflooders; i++) {
- firsterr = torture_create_kthread(rcu_torture_cbflood,
- NULL,
- cbflood_task[i]);
- if (firsterr)
- goto unwind;
- }
}
- rcutorture_record_test_transition();
+
torture_init_end();
return 0;
unwind:
torture_init_end();
rcu_torture_cleanup();
+ if (shutdown_secs) {
+ WARN_ON(!IS_MODULE(CONFIG_RCU_TORTURE_TEST));
+ kernel_power_off();
+ }
return firsterr;
}
diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c
new file mode 100644
index 000000000000..07a313782dfd
--- /dev/null
+++ b/kernel/rcu/refscale.c
@@ -0,0 +1,1616 @@
+// SPDX-License-Identifier: GPL-2.0+
+//
+// Scalability test comparing RCU vs other mechanisms
+// for acquiring references on objects.
+//
+// Copyright (C) Google, 2020.
+//
+// Author: Joel Fernandes <joel@joelfernandes.org>
+
+#define pr_fmt(fmt) fmt
+
+#include <linux/atomic.h>
+#include <linux/bitops.h>
+#include <linux/completion.h>
+#include <linux/cpu.h>
+#include <linux/delay.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/kthread.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/notifier.h>
+#include <linux/percpu.h>
+#include <linux/rcupdate.h>
+#include <linux/rcupdate_trace.h>
+#include <linux/reboot.h>
+#include <linux/sched.h>
+#include <linux/seq_buf.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/stat.h>
+#include <linux/srcu.h>
+#include <linux/slab.h>
+#include <linux/torture.h>
+#include <linux/types.h>
+#include <linux/sched/clock.h>
+
+#include "rcu.h"
+
+#define SCALE_FLAG "-ref-scale: "
+
+#define SCALEOUT(s, x...) \
+ pr_alert("%s" SCALE_FLAG s, scale_type, ## x)
+
+#define VERBOSE_SCALEOUT(s, x...) \
+ do { \
+ if (verbose) \
+ pr_alert("%s" SCALE_FLAG s "\n", scale_type, ## x); \
+ } while (0)
+
+static atomic_t verbose_batch_ctr;
+
+#define VERBOSE_SCALEOUT_BATCH(s, x...) \
+do { \
+ if (verbose && \
+ (verbose_batched <= 0 || \
+ !(atomic_inc_return(&verbose_batch_ctr) % verbose_batched))) { \
+ schedule_timeout_uninterruptible(1); \
+ pr_alert("%s" SCALE_FLAG s "\n", scale_type, ## x); \
+ } \
+} while (0)
+
+#define SCALEOUT_ERRSTRING(s, x...) pr_alert("%s" SCALE_FLAG "!!! " s "\n", scale_type, ## x)
+
+MODULE_DESCRIPTION("Scalability test for object reference mechanisms");
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Joel Fernandes (Google) <joel@joelfernandes.org>");
+
+static char *scale_type = "rcu";
+module_param(scale_type, charp, 0444);
+MODULE_PARM_DESC(scale_type, "Type of test (rcu, srcu, refcnt, rwsem, rwlock.");
+
+torture_param(int, verbose, 0, "Enable verbose debugging printk()s");
+torture_param(int, verbose_batched, 0, "Batch verbose debugging printk()s");
+
+// Number of seconds to extend warm-up and cool-down for multiple guest OSes
+torture_param(long, guest_os_delay, 0,
+ "Number of seconds to extend warm-up/cool-down for multiple guest OSes.");
+// Wait until there are multiple CPUs before starting test.
+torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_SCALE_TEST) ? 10 : 0,
+ "Holdoff time before test start (s)");
+// Number of typesafe_lookup structures, that is, the degree of concurrency.
+torture_param(long, lookup_instances, 0, "Number of typesafe_lookup structures.");
+// Number of loops per experiment, all readers execute operations concurrently.
+torture_param(int, loops, 10000, "Number of loops per experiment.");
+// Number of readers, with -1 defaulting to about 75% of the CPUs.
+torture_param(int, nreaders, -1, "Number of readers, -1 for 75% of CPUs.");
+// Number of runs.
+torture_param(int, nruns, 30, "Number of experiments to run.");
+// Reader delay in nanoseconds, 0 for no delay.
+torture_param(int, readdelay, 0, "Read-side delay in nanoseconds.");
+
+#ifdef MODULE
+# define REFSCALE_SHUTDOWN 0
+#else
+# define REFSCALE_SHUTDOWN 1
+#endif
+
+torture_param(bool, shutdown, REFSCALE_SHUTDOWN,
+ "Shutdown at end of scalability tests.");
+
+struct reader_task {
+ struct task_struct *task;
+ int start_reader;
+ wait_queue_head_t wq;
+ u64 last_duration_ns;
+};
+
+static struct task_struct *shutdown_task;
+static wait_queue_head_t shutdown_wq;
+
+static struct task_struct *main_task;
+static wait_queue_head_t main_wq;
+static int shutdown_start;
+
+static struct reader_task *reader_tasks;
+
+// Number of readers that are part of the current experiment.
+static atomic_t nreaders_exp;
+
+// Use to wait for all threads to start.
+static atomic_t n_init;
+static atomic_t n_started;
+static atomic_t n_warmedup;
+static atomic_t n_cooleddown;
+
+// Track which experiment is currently running.
+static int exp_idx;
+
+// Operations vector for selecting different types of tests.
+struct ref_scale_ops {
+ bool (*init)(void);
+ void (*cleanup)(void);
+ void (*readsection)(const int nloops);
+ void (*delaysection)(const int nloops, const int udl, const int ndl);
+ bool enable_irqs;
+ const char *name;
+};
+
+static const struct ref_scale_ops *cur_ops;
+
+static void un_delay(const int udl, const int ndl)
+{
+ if (udl)
+ udelay(udl);
+ if (ndl)
+ ndelay(ndl);
+}
+
+static void ref_rcu_read_section(const int nloops)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ rcu_read_lock();
+ rcu_read_unlock();
+ }
+}
+
+static void ref_rcu_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ rcu_read_lock();
+ un_delay(udl, ndl);
+ rcu_read_unlock();
+ }
+}
+
+static bool rcu_sync_scale_init(void)
+{
+ return true;
+}
+
+static const struct ref_scale_ops rcu_ops = {
+ .init = rcu_sync_scale_init,
+ .readsection = ref_rcu_read_section,
+ .delaysection = ref_rcu_delay_section,
+ .name = "rcu"
+};
+
+// Definitions for SRCU ref scale testing.
+DEFINE_STATIC_SRCU(srcu_refctl_scale);
+DEFINE_STATIC_SRCU_FAST(srcu_fast_refctl_scale);
+DEFINE_STATIC_SRCU_FAST_UPDOWN(srcu_fast_updown_refctl_scale);
+static struct srcu_struct *srcu_ctlp = &srcu_refctl_scale;
+
+static void srcu_ref_scale_read_section(const int nloops)
+{
+ int i;
+ int idx;
+
+ for (i = nloops; i >= 0; i--) {
+ idx = srcu_read_lock(srcu_ctlp);
+ srcu_read_unlock(srcu_ctlp, idx);
+ }
+}
+
+static void srcu_ref_scale_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+ int idx;
+
+ for (i = nloops; i >= 0; i--) {
+ idx = srcu_read_lock(srcu_ctlp);
+ un_delay(udl, ndl);
+ srcu_read_unlock(srcu_ctlp, idx);
+ }
+}
+
+static const struct ref_scale_ops srcu_ops = {
+ .init = rcu_sync_scale_init,
+ .readsection = srcu_ref_scale_read_section,
+ .delaysection = srcu_ref_scale_delay_section,
+ .name = "srcu"
+};
+
+static bool srcu_fast_sync_scale_init(void)
+{
+ srcu_ctlp = &srcu_fast_refctl_scale;
+ return true;
+}
+
+static void srcu_fast_ref_scale_read_section(const int nloops)
+{
+ int i;
+ struct srcu_ctr __percpu *scp;
+
+ for (i = nloops; i >= 0; i--) {
+ scp = srcu_read_lock_fast(srcu_ctlp);
+ srcu_read_unlock_fast(srcu_ctlp, scp);
+ }
+}
+
+static void srcu_fast_ref_scale_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+ struct srcu_ctr __percpu *scp;
+
+ for (i = nloops; i >= 0; i--) {
+ scp = srcu_read_lock_fast(srcu_ctlp);
+ un_delay(udl, ndl);
+ srcu_read_unlock_fast(srcu_ctlp, scp);
+ }
+}
+
+static const struct ref_scale_ops srcu_fast_ops = {
+ .init = srcu_fast_sync_scale_init,
+ .readsection = srcu_fast_ref_scale_read_section,
+ .delaysection = srcu_fast_ref_scale_delay_section,
+ .name = "srcu-fast"
+};
+
+static bool srcu_fast_updown_sync_scale_init(void)
+{
+ srcu_ctlp = &srcu_fast_updown_refctl_scale;
+ return true;
+}
+
+static void srcu_fast_updown_ref_scale_read_section(const int nloops)
+{
+ int i;
+ struct srcu_ctr __percpu *scp;
+
+ for (i = nloops; i >= 0; i--) {
+ scp = srcu_read_lock_fast_updown(srcu_ctlp);
+ srcu_read_unlock_fast_updown(srcu_ctlp, scp);
+ }
+}
+
+static void srcu_fast_updown_ref_scale_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+ struct srcu_ctr __percpu *scp;
+
+ for (i = nloops; i >= 0; i--) {
+ scp = srcu_read_lock_fast_updown(srcu_ctlp);
+ un_delay(udl, ndl);
+ srcu_read_unlock_fast_updown(srcu_ctlp, scp);
+ }
+}
+
+static const struct ref_scale_ops srcu_fast_updown_ops = {
+ .init = srcu_fast_updown_sync_scale_init,
+ .readsection = srcu_fast_updown_ref_scale_read_section,
+ .delaysection = srcu_fast_updown_ref_scale_delay_section,
+ .name = "srcu-fast-updown"
+};
+
+#ifdef CONFIG_TASKS_RCU
+
+// Definitions for RCU Tasks ref scale testing: Empty read markers.
+// These definitions also work for RCU Rude readers.
+static void rcu_tasks_ref_scale_read_section(const int nloops)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--)
+ continue;
+}
+
+static void rcu_tasks_ref_scale_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--)
+ un_delay(udl, ndl);
+}
+
+static const struct ref_scale_ops rcu_tasks_ops = {
+ .init = rcu_sync_scale_init,
+ .readsection = rcu_tasks_ref_scale_read_section,
+ .delaysection = rcu_tasks_ref_scale_delay_section,
+ .name = "rcu-tasks"
+};
+
+#define RCU_TASKS_OPS &rcu_tasks_ops,
+
+#else // #ifdef CONFIG_TASKS_RCU
+
+#define RCU_TASKS_OPS
+
+#endif // #else // #ifdef CONFIG_TASKS_RCU
+
+#ifdef CONFIG_TASKS_TRACE_RCU
+
+// Definitions for RCU Tasks Trace ref scale testing.
+static void rcu_trace_ref_scale_read_section(const int nloops)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ rcu_read_lock_trace();
+ rcu_read_unlock_trace();
+ }
+}
+
+static void rcu_trace_ref_scale_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ rcu_read_lock_trace();
+ un_delay(udl, ndl);
+ rcu_read_unlock_trace();
+ }
+}
+
+static const struct ref_scale_ops rcu_trace_ops = {
+ .init = rcu_sync_scale_init,
+ .readsection = rcu_trace_ref_scale_read_section,
+ .delaysection = rcu_trace_ref_scale_delay_section,
+ .name = "rcu-trace"
+};
+
+#define RCU_TRACE_OPS &rcu_trace_ops,
+
+#else // #ifdef CONFIG_TASKS_TRACE_RCU
+
+#define RCU_TRACE_OPS
+
+#endif // #else // #ifdef CONFIG_TASKS_TRACE_RCU
+
+// Definitions for reference count
+static atomic_t refcnt;
+
+// Definitions acquire-release.
+static DEFINE_PER_CPU(unsigned long, test_acqrel);
+
+static void ref_refcnt_section(const int nloops)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ atomic_inc(&refcnt);
+ atomic_dec(&refcnt);
+ }
+}
+
+static void ref_refcnt_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ atomic_inc(&refcnt);
+ un_delay(udl, ndl);
+ atomic_dec(&refcnt);
+ }
+}
+
+static const struct ref_scale_ops refcnt_ops = {
+ .init = rcu_sync_scale_init,
+ .readsection = ref_refcnt_section,
+ .delaysection = ref_refcnt_delay_section,
+ .name = "refcnt"
+};
+
+static void ref_percpuinc_section(const int nloops)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ this_cpu_inc(test_acqrel);
+ this_cpu_dec(test_acqrel);
+ }
+}
+
+static void ref_percpuinc_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ this_cpu_inc(test_acqrel);
+ un_delay(udl, ndl);
+ this_cpu_dec(test_acqrel);
+ }
+}
+
+static const struct ref_scale_ops percpuinc_ops = {
+ .init = rcu_sync_scale_init,
+ .readsection = ref_percpuinc_section,
+ .delaysection = ref_percpuinc_delay_section,
+ .name = "percpuinc"
+};
+
+// Note that this can lose counts in preemptible kernels.
+static void ref_incpercpu_section(const int nloops)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ unsigned long *tap = this_cpu_ptr(&test_acqrel);
+
+ WRITE_ONCE(*tap, READ_ONCE(*tap) + 1);
+ WRITE_ONCE(*tap, READ_ONCE(*tap) - 1);
+ }
+}
+
+static void ref_incpercpu_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ unsigned long *tap = this_cpu_ptr(&test_acqrel);
+
+ WRITE_ONCE(*tap, READ_ONCE(*tap) + 1);
+ un_delay(udl, ndl);
+ WRITE_ONCE(*tap, READ_ONCE(*tap) - 1);
+ }
+}
+
+static const struct ref_scale_ops incpercpu_ops = {
+ .init = rcu_sync_scale_init,
+ .readsection = ref_incpercpu_section,
+ .delaysection = ref_incpercpu_delay_section,
+ .name = "incpercpu"
+};
+
+static void ref_incpercpupreempt_section(const int nloops)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ unsigned long *tap;
+
+ preempt_disable();
+ tap = this_cpu_ptr(&test_acqrel);
+ WRITE_ONCE(*tap, READ_ONCE(*tap) + 1);
+ WRITE_ONCE(*tap, READ_ONCE(*tap) - 1);
+ preempt_enable();
+ }
+}
+
+static void ref_incpercpupreempt_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ unsigned long *tap;
+
+ preempt_disable();
+ tap = this_cpu_ptr(&test_acqrel);
+ WRITE_ONCE(*tap, READ_ONCE(*tap) + 1);
+ un_delay(udl, ndl);
+ WRITE_ONCE(*tap, READ_ONCE(*tap) - 1);
+ preempt_enable();
+ }
+}
+
+static const struct ref_scale_ops incpercpupreempt_ops = {
+ .init = rcu_sync_scale_init,
+ .readsection = ref_incpercpupreempt_section,
+ .delaysection = ref_incpercpupreempt_delay_section,
+ .name = "incpercpupreempt"
+};
+
+static void ref_incpercpubh_section(const int nloops)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ unsigned long *tap;
+
+ local_bh_disable();
+ tap = this_cpu_ptr(&test_acqrel);
+ WRITE_ONCE(*tap, READ_ONCE(*tap) + 1);
+ WRITE_ONCE(*tap, READ_ONCE(*tap) - 1);
+ local_bh_enable();
+ }
+}
+
+static void ref_incpercpubh_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ unsigned long *tap;
+
+ local_bh_disable();
+ tap = this_cpu_ptr(&test_acqrel);
+ WRITE_ONCE(*tap, READ_ONCE(*tap) + 1);
+ un_delay(udl, ndl);
+ WRITE_ONCE(*tap, READ_ONCE(*tap) - 1);
+ local_bh_enable();
+ }
+}
+
+static const struct ref_scale_ops incpercpubh_ops = {
+ .init = rcu_sync_scale_init,
+ .readsection = ref_incpercpubh_section,
+ .delaysection = ref_incpercpubh_delay_section,
+ .enable_irqs = true,
+ .name = "incpercpubh"
+};
+
+static void ref_incpercpuirqsave_section(const int nloops)
+{
+ int i;
+ unsigned long flags;
+
+ for (i = nloops; i >= 0; i--) {
+ unsigned long *tap;
+
+ local_irq_save(flags);
+ tap = this_cpu_ptr(&test_acqrel);
+ WRITE_ONCE(*tap, READ_ONCE(*tap) + 1);
+ WRITE_ONCE(*tap, READ_ONCE(*tap) - 1);
+ local_irq_restore(flags);
+ }
+}
+
+static void ref_incpercpuirqsave_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+ unsigned long flags;
+
+ for (i = nloops; i >= 0; i--) {
+ unsigned long *tap;
+
+ local_irq_save(flags);
+ tap = this_cpu_ptr(&test_acqrel);
+ WRITE_ONCE(*tap, READ_ONCE(*tap) + 1);
+ un_delay(udl, ndl);
+ WRITE_ONCE(*tap, READ_ONCE(*tap) - 1);
+ local_irq_restore(flags);
+ }
+}
+
+static const struct ref_scale_ops incpercpuirqsave_ops = {
+ .init = rcu_sync_scale_init,
+ .readsection = ref_incpercpuirqsave_section,
+ .delaysection = ref_incpercpuirqsave_delay_section,
+ .name = "incpercpuirqsave"
+};
+
+// Definitions for rwlock
+static rwlock_t test_rwlock;
+
+static bool ref_rwlock_init(void)
+{
+ rwlock_init(&test_rwlock);
+ return true;
+}
+
+static void ref_rwlock_section(const int nloops)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ read_lock(&test_rwlock);
+ read_unlock(&test_rwlock);
+ }
+}
+
+static void ref_rwlock_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ read_lock(&test_rwlock);
+ un_delay(udl, ndl);
+ read_unlock(&test_rwlock);
+ }
+}
+
+static const struct ref_scale_ops rwlock_ops = {
+ .init = ref_rwlock_init,
+ .readsection = ref_rwlock_section,
+ .delaysection = ref_rwlock_delay_section,
+ .name = "rwlock"
+};
+
+// Definitions for rwsem
+static struct rw_semaphore test_rwsem;
+
+static bool ref_rwsem_init(void)
+{
+ init_rwsem(&test_rwsem);
+ return true;
+}
+
+static void ref_rwsem_section(const int nloops)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ down_read(&test_rwsem);
+ up_read(&test_rwsem);
+ }
+}
+
+static void ref_rwsem_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+
+ for (i = nloops; i >= 0; i--) {
+ down_read(&test_rwsem);
+ un_delay(udl, ndl);
+ up_read(&test_rwsem);
+ }
+}
+
+static const struct ref_scale_ops rwsem_ops = {
+ .init = ref_rwsem_init,
+ .readsection = ref_rwsem_section,
+ .delaysection = ref_rwsem_delay_section,
+ .name = "rwsem"
+};
+
+// Definitions for global spinlock
+static DEFINE_RAW_SPINLOCK(test_lock);
+
+static void ref_lock_section(const int nloops)
+{
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ raw_spin_lock(&test_lock);
+ raw_spin_unlock(&test_lock);
+ }
+ preempt_enable();
+}
+
+static void ref_lock_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ raw_spin_lock(&test_lock);
+ un_delay(udl, ndl);
+ raw_spin_unlock(&test_lock);
+ }
+ preempt_enable();
+}
+
+static const struct ref_scale_ops lock_ops = {
+ .readsection = ref_lock_section,
+ .delaysection = ref_lock_delay_section,
+ .name = "lock"
+};
+
+// Definitions for global irq-save spinlock
+
+static void ref_lock_irq_section(const int nloops)
+{
+ unsigned long flags;
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ raw_spin_lock_irqsave(&test_lock, flags);
+ raw_spin_unlock_irqrestore(&test_lock, flags);
+ }
+ preempt_enable();
+}
+
+static void ref_lock_irq_delay_section(const int nloops, const int udl, const int ndl)
+{
+ unsigned long flags;
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ raw_spin_lock_irqsave(&test_lock, flags);
+ un_delay(udl, ndl);
+ raw_spin_unlock_irqrestore(&test_lock, flags);
+ }
+ preempt_enable();
+}
+
+static const struct ref_scale_ops lock_irq_ops = {
+ .readsection = ref_lock_irq_section,
+ .delaysection = ref_lock_irq_delay_section,
+ .name = "lock-irq"
+};
+
+static void ref_acqrel_section(const int nloops)
+{
+ unsigned long x;
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ x = smp_load_acquire(this_cpu_ptr(&test_acqrel));
+ smp_store_release(this_cpu_ptr(&test_acqrel), x + 1);
+ }
+ preempt_enable();
+}
+
+static void ref_acqrel_delay_section(const int nloops, const int udl, const int ndl)
+{
+ unsigned long x;
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ x = smp_load_acquire(this_cpu_ptr(&test_acqrel));
+ un_delay(udl, ndl);
+ smp_store_release(this_cpu_ptr(&test_acqrel), x + 1);
+ }
+ preempt_enable();
+}
+
+static const struct ref_scale_ops acqrel_ops = {
+ .readsection = ref_acqrel_section,
+ .delaysection = ref_acqrel_delay_section,
+ .name = "acqrel"
+};
+
+static volatile u64 stopopts;
+
+static void ref_sched_clock_section(const int nloops)
+{
+ u64 x = 0;
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--)
+ x += sched_clock();
+ preempt_enable();
+ stopopts = x;
+}
+
+static void ref_sched_clock_delay_section(const int nloops, const int udl, const int ndl)
+{
+ u64 x = 0;
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ x += sched_clock();
+ un_delay(udl, ndl);
+ }
+ preempt_enable();
+ stopopts = x;
+}
+
+static const struct ref_scale_ops sched_clock_ops = {
+ .readsection = ref_sched_clock_section,
+ .delaysection = ref_sched_clock_delay_section,
+ .name = "sched-clock"
+};
+
+
+static void ref_clock_section(const int nloops)
+{
+ u64 x = 0;
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--)
+ x += ktime_get_real_fast_ns();
+ preempt_enable();
+ stopopts = x;
+}
+
+static void ref_clock_delay_section(const int nloops, const int udl, const int ndl)
+{
+ u64 x = 0;
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ x += ktime_get_real_fast_ns();
+ un_delay(udl, ndl);
+ }
+ preempt_enable();
+ stopopts = x;
+}
+
+static const struct ref_scale_ops clock_ops = {
+ .readsection = ref_clock_section,
+ .delaysection = ref_clock_delay_section,
+ .name = "clock"
+};
+
+static void ref_jiffies_section(const int nloops)
+{
+ u64 x = 0;
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--)
+ x += jiffies;
+ preempt_enable();
+ stopopts = x;
+}
+
+static void ref_jiffies_delay_section(const int nloops, const int udl, const int ndl)
+{
+ u64 x = 0;
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ x += jiffies;
+ un_delay(udl, ndl);
+ }
+ preempt_enable();
+ stopopts = x;
+}
+
+static const struct ref_scale_ops jiffies_ops = {
+ .readsection = ref_jiffies_section,
+ .delaysection = ref_jiffies_delay_section,
+ .name = "jiffies"
+};
+
+static void ref_preempt_section(const int nloops)
+{
+ int i;
+
+ migrate_disable();
+ for (i = nloops; i >= 0; i--) {
+ preempt_disable();
+ preempt_enable();
+ }
+ migrate_enable();
+}
+
+static void ref_preempt_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+
+ migrate_disable();
+ for (i = nloops; i >= 0; i--) {
+ preempt_disable();
+ un_delay(udl, ndl);
+ preempt_enable();
+ }
+ migrate_enable();
+}
+
+static const struct ref_scale_ops preempt_ops = {
+ .readsection = ref_preempt_section,
+ .delaysection = ref_preempt_delay_section,
+ .name = "preempt"
+};
+
+static void ref_bh_section(const int nloops)
+{
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ local_bh_disable();
+ local_bh_enable();
+ }
+ preempt_enable();
+}
+
+static void ref_bh_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ local_bh_disable();
+ un_delay(udl, ndl);
+ local_bh_enable();
+ }
+ preempt_enable();
+}
+
+static const struct ref_scale_ops bh_ops = {
+ .readsection = ref_bh_section,
+ .delaysection = ref_bh_delay_section,
+ .enable_irqs = true,
+ .name = "bh"
+};
+
+static void ref_irq_section(const int nloops)
+{
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ local_irq_disable();
+ local_irq_enable();
+ }
+ preempt_enable();
+}
+
+static void ref_irq_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ local_irq_disable();
+ un_delay(udl, ndl);
+ local_irq_enable();
+ }
+ preempt_enable();
+}
+
+static const struct ref_scale_ops irq_ops = {
+ .readsection = ref_irq_section,
+ .delaysection = ref_irq_delay_section,
+ .name = "irq"
+};
+
+static void ref_irqsave_section(const int nloops)
+{
+ unsigned long flags;
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ local_irq_save(flags);
+ local_irq_restore(flags);
+ }
+ preempt_enable();
+}
+
+static void ref_irqsave_delay_section(const int nloops, const int udl, const int ndl)
+{
+ unsigned long flags;
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ local_irq_save(flags);
+ un_delay(udl, ndl);
+ local_irq_restore(flags);
+ }
+ preempt_enable();
+}
+
+static const struct ref_scale_ops irqsave_ops = {
+ .readsection = ref_irqsave_section,
+ .delaysection = ref_irqsave_delay_section,
+ .name = "irqsave"
+};
+
+////////////////////////////////////////////////////////////////////////
+//
+// Methods leveraging SLAB_TYPESAFE_BY_RCU.
+//
+
+// Item to look up in a typesafe manner. Array of pointers to these.
+struct refscale_typesafe {
+ atomic_t rts_refctr; // Used by all flavors
+ spinlock_t rts_lock;
+ seqlock_t rts_seqlock;
+ unsigned int a;
+ unsigned int b;
+};
+
+static struct kmem_cache *typesafe_kmem_cachep;
+static struct refscale_typesafe **rtsarray;
+static long rtsarray_size;
+static DEFINE_TORTURE_RANDOM_PERCPU(refscale_rand);
+static bool (*rts_acquire)(struct refscale_typesafe *rtsp, unsigned int *start);
+static bool (*rts_release)(struct refscale_typesafe *rtsp, unsigned int start);
+
+// Conditionally acquire an explicit in-structure reference count.
+static bool typesafe_ref_acquire(struct refscale_typesafe *rtsp, unsigned int *start)
+{
+ return atomic_inc_not_zero(&rtsp->rts_refctr);
+}
+
+// Unconditionally release an explicit in-structure reference count.
+static bool typesafe_ref_release(struct refscale_typesafe *rtsp, unsigned int start)
+{
+ if (!atomic_dec_return(&rtsp->rts_refctr)) {
+ WRITE_ONCE(rtsp->a, rtsp->a + 1);
+ kmem_cache_free(typesafe_kmem_cachep, rtsp);
+ }
+ return true;
+}
+
+// Unconditionally acquire an explicit in-structure spinlock.
+static bool typesafe_lock_acquire(struct refscale_typesafe *rtsp, unsigned int *start)
+{
+ spin_lock(&rtsp->rts_lock);
+ return true;
+}
+
+// Unconditionally release an explicit in-structure spinlock.
+static bool typesafe_lock_release(struct refscale_typesafe *rtsp, unsigned int start)
+{
+ spin_unlock(&rtsp->rts_lock);
+ return true;
+}
+
+// Unconditionally acquire an explicit in-structure sequence lock.
+static bool typesafe_seqlock_acquire(struct refscale_typesafe *rtsp, unsigned int *start)
+{
+ *start = read_seqbegin(&rtsp->rts_seqlock);
+ return true;
+}
+
+// Conditionally release an explicit in-structure sequence lock. Return
+// true if this release was successful, that is, if no retry is required.
+static bool typesafe_seqlock_release(struct refscale_typesafe *rtsp, unsigned int start)
+{
+ return !read_seqretry(&rtsp->rts_seqlock, start);
+}
+
+// Do a read-side critical section with the specified delay in
+// microseconds and nanoseconds inserted so as to increase probability
+// of failure.
+static void typesafe_delay_section(const int nloops, const int udl, const int ndl)
+{
+ unsigned int a;
+ unsigned int b;
+ int i;
+ long idx;
+ struct refscale_typesafe *rtsp;
+ unsigned int start;
+
+ for (i = nloops; i >= 0; i--) {
+ preempt_disable();
+ idx = torture_random(this_cpu_ptr(&refscale_rand)) % rtsarray_size;
+ preempt_enable();
+retry:
+ rcu_read_lock();
+ rtsp = rcu_dereference(rtsarray[idx]);
+ a = READ_ONCE(rtsp->a);
+ if (!rts_acquire(rtsp, &start)) {
+ rcu_read_unlock();
+ goto retry;
+ }
+ if (a != READ_ONCE(rtsp->a)) {
+ (void)rts_release(rtsp, start);
+ rcu_read_unlock();
+ goto retry;
+ }
+ un_delay(udl, ndl);
+ b = READ_ONCE(rtsp->a);
+ // Remember, seqlock read-side release can fail.
+ if (!rts_release(rtsp, start)) {
+ rcu_read_unlock();
+ goto retry;
+ }
+ WARN_ONCE(a != b, "Re-read of ->a changed from %u to %u.\n", a, b);
+ b = rtsp->b;
+ rcu_read_unlock();
+ WARN_ON_ONCE(a * a != b);
+ }
+}
+
+// Because the acquisition and release methods are expensive, there
+// is no point in optimizing away the un_delay() function's two checks.
+// Thus simply define typesafe_read_section() as a simple wrapper around
+// typesafe_delay_section().
+static void typesafe_read_section(const int nloops)
+{
+ typesafe_delay_section(nloops, 0, 0);
+}
+
+// Allocate and initialize one refscale_typesafe structure.
+static struct refscale_typesafe *typesafe_alloc_one(void)
+{
+ struct refscale_typesafe *rtsp;
+
+ rtsp = kmem_cache_alloc(typesafe_kmem_cachep, GFP_KERNEL);
+ if (!rtsp)
+ return NULL;
+ atomic_set(&rtsp->rts_refctr, 1);
+ WRITE_ONCE(rtsp->a, rtsp->a + 1);
+ WRITE_ONCE(rtsp->b, rtsp->a * rtsp->a);
+ return rtsp;
+}
+
+// Slab-allocator constructor for refscale_typesafe structures created
+// out of a new slab of system memory.
+static void refscale_typesafe_ctor(void *rtsp_in)
+{
+ struct refscale_typesafe *rtsp = rtsp_in;
+
+ spin_lock_init(&rtsp->rts_lock);
+ seqlock_init(&rtsp->rts_seqlock);
+ preempt_disable();
+ rtsp->a = torture_random(this_cpu_ptr(&refscale_rand));
+ preempt_enable();
+}
+
+static const struct ref_scale_ops typesafe_ref_ops;
+static const struct ref_scale_ops typesafe_lock_ops;
+static const struct ref_scale_ops typesafe_seqlock_ops;
+
+// Initialize for a typesafe test.
+static bool typesafe_init(void)
+{
+ long idx;
+ long si = lookup_instances;
+
+ typesafe_kmem_cachep = kmem_cache_create("refscale_typesafe",
+ sizeof(struct refscale_typesafe), sizeof(void *),
+ SLAB_TYPESAFE_BY_RCU, refscale_typesafe_ctor);
+ if (!typesafe_kmem_cachep)
+ return false;
+ if (si < 0)
+ si = -si * nr_cpu_ids;
+ else if (si == 0)
+ si = nr_cpu_ids;
+ rtsarray_size = si;
+ rtsarray = kcalloc(si, sizeof(*rtsarray), GFP_KERNEL);
+ if (!rtsarray)
+ return false;
+ for (idx = 0; idx < rtsarray_size; idx++) {
+ rtsarray[idx] = typesafe_alloc_one();
+ if (!rtsarray[idx])
+ return false;
+ }
+ if (cur_ops == &typesafe_ref_ops) {
+ rts_acquire = typesafe_ref_acquire;
+ rts_release = typesafe_ref_release;
+ } else if (cur_ops == &typesafe_lock_ops) {
+ rts_acquire = typesafe_lock_acquire;
+ rts_release = typesafe_lock_release;
+ } else if (cur_ops == &typesafe_seqlock_ops) {
+ rts_acquire = typesafe_seqlock_acquire;
+ rts_release = typesafe_seqlock_release;
+ } else {
+ WARN_ON_ONCE(1);
+ return false;
+ }
+ return true;
+}
+
+// Clean up after a typesafe test.
+static void typesafe_cleanup(void)
+{
+ long idx;
+
+ if (rtsarray) {
+ for (idx = 0; idx < rtsarray_size; idx++)
+ kmem_cache_free(typesafe_kmem_cachep, rtsarray[idx]);
+ kfree(rtsarray);
+ rtsarray = NULL;
+ rtsarray_size = 0;
+ }
+ kmem_cache_destroy(typesafe_kmem_cachep);
+ typesafe_kmem_cachep = NULL;
+ rts_acquire = NULL;
+ rts_release = NULL;
+}
+
+// The typesafe_init() function distinguishes these structures by address.
+static const struct ref_scale_ops typesafe_ref_ops = {
+ .init = typesafe_init,
+ .cleanup = typesafe_cleanup,
+ .readsection = typesafe_read_section,
+ .delaysection = typesafe_delay_section,
+ .name = "typesafe_ref"
+};
+
+static const struct ref_scale_ops typesafe_lock_ops = {
+ .init = typesafe_init,
+ .cleanup = typesafe_cleanup,
+ .readsection = typesafe_read_section,
+ .delaysection = typesafe_delay_section,
+ .name = "typesafe_lock"
+};
+
+static const struct ref_scale_ops typesafe_seqlock_ops = {
+ .init = typesafe_init,
+ .cleanup = typesafe_cleanup,
+ .readsection = typesafe_read_section,
+ .delaysection = typesafe_delay_section,
+ .name = "typesafe_seqlock"
+};
+
+static void rcu_scale_one_reader(void)
+{
+ if (readdelay <= 0)
+ cur_ops->readsection(loops);
+ else
+ cur_ops->delaysection(loops, readdelay / 1000, readdelay % 1000);
+}
+
+// Warm up cache, or, if needed run a series of rcu_scale_one_reader()
+// to allow multiple rcuscale guest OSes to collect mutually valid data.
+static void rcu_scale_warm_cool(void)
+{
+ unsigned long jdone = jiffies + (guest_os_delay > 0 ? guest_os_delay * HZ : -1);
+
+ do {
+ rcu_scale_one_reader();
+ cond_resched();
+ } while (time_before(jiffies, jdone));
+}
+
+// Reader kthread. Repeatedly does empty RCU read-side
+// critical section, minimizing update-side interference.
+static int
+ref_scale_reader(void *arg)
+{
+ unsigned long flags;
+ long me = (long)arg;
+ struct reader_task *rt = &(reader_tasks[me]);
+ u64 start;
+ s64 duration;
+
+ VERBOSE_SCALEOUT_BATCH("ref_scale_reader %ld: task started", me);
+ WARN_ON_ONCE(set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)));
+ set_user_nice(current, MAX_NICE);
+ atomic_inc(&n_init);
+ if (holdoff)
+ schedule_timeout_interruptible(holdoff * HZ);
+repeat:
+ VERBOSE_SCALEOUT_BATCH("ref_scale_reader %ld: waiting to start next experiment on cpu %d", me, raw_smp_processor_id());
+
+ // Wait for signal that this reader can start.
+ wait_event(rt->wq, (atomic_read(&nreaders_exp) && smp_load_acquire(&rt->start_reader)) ||
+ torture_must_stop());
+
+ if (torture_must_stop())
+ goto end;
+
+ // Make sure that the CPU is affinitized appropriately during testing.
+ WARN_ON_ONCE(raw_smp_processor_id() != me % nr_cpu_ids);
+
+ WRITE_ONCE(rt->start_reader, 0);
+ if (!atomic_dec_return(&n_started))
+ while (atomic_read_acquire(&n_started))
+ cpu_relax();
+
+ VERBOSE_SCALEOUT_BATCH("ref_scale_reader %ld: experiment %d started", me, exp_idx);
+
+
+ // To reduce noise, do an initial cache-warming invocation, check
+ // in, and then keep warming until everyone has checked in.
+ rcu_scale_one_reader();
+ if (!atomic_dec_return(&n_warmedup))
+ while (atomic_read_acquire(&n_warmedup))
+ rcu_scale_one_reader();
+ // Also keep interrupts disabled when it is safe to do so, which
+ // it is not for local_bh_enable(). This also has the effect of
+ // preventing entries into slow path for rcu_read_unlock().
+ if (!cur_ops->enable_irqs)
+ local_irq_save(flags);
+ start = ktime_get_mono_fast_ns();
+
+ rcu_scale_one_reader();
+
+ duration = ktime_get_mono_fast_ns() - start;
+ if (!cur_ops->enable_irqs)
+ local_irq_restore(flags);
+
+ rt->last_duration_ns = WARN_ON_ONCE(duration < 0) ? 0 : duration;
+ // To reduce runtime-skew noise, do maintain-load invocations until
+ // everyone is done.
+ if (!atomic_dec_return(&n_cooleddown))
+ while (atomic_read_acquire(&n_cooleddown))
+ rcu_scale_one_reader();
+
+ if (atomic_dec_and_test(&nreaders_exp))
+ wake_up(&main_wq);
+
+ VERBOSE_SCALEOUT_BATCH("ref_scale_reader %ld: experiment %d ended, (readers remaining=%d)",
+ me, exp_idx, atomic_read(&nreaders_exp));
+
+ if (!torture_must_stop())
+ goto repeat;
+end:
+ torture_kthread_stopping("ref_scale_reader");
+ return 0;
+}
+
+static void reset_readers(void)
+{
+ int i;
+ struct reader_task *rt;
+
+ for (i = 0; i < nreaders; i++) {
+ rt = &(reader_tasks[i]);
+
+ rt->last_duration_ns = 0;
+ }
+}
+
+// Print the results of each reader and return the sum of all their durations.
+static u64 process_durations(int n)
+{
+ int i;
+ struct reader_task *rt;
+ struct seq_buf s;
+ char *buf;
+ u64 sum = 0;
+
+ buf = kmalloc(800 + 64, GFP_KERNEL);
+ if (!buf)
+ return 0;
+ seq_buf_init(&s, buf, 800 + 64);
+
+ seq_buf_printf(&s, "Experiment #%d (Format: <THREAD-NUM>:<Total loop time in ns>)",
+ exp_idx);
+
+ for (i = 0; i < n && !torture_must_stop(); i++) {
+ rt = &(reader_tasks[i]);
+
+ if (i % 5 == 0)
+ seq_buf_putc(&s, '\n');
+
+ if (seq_buf_used(&s) >= 800) {
+ pr_alert("%s", seq_buf_str(&s));
+ seq_buf_clear(&s);
+ }
+
+ seq_buf_printf(&s, "%d: %llu\t", i, rt->last_duration_ns);
+
+ sum += rt->last_duration_ns;
+ }
+ pr_alert("%s\n", seq_buf_str(&s));
+
+ kfree(buf);
+ return sum;
+}
+
+// The main_func is the main orchestrator, it performs a bunch of
+// experiments. For every experiment, it orders all the readers
+// involved to start and waits for them to finish the experiment. It
+// then reads their timestamps and starts the next experiment. Each
+// experiment progresses from 1 concurrent reader to N of them at which
+// point all the timestamps are printed.
+static int main_func(void *arg)
+{
+ int exp, r;
+ char buf1[64];
+ char *buf;
+ u64 *result_avg;
+
+ set_cpus_allowed_ptr(current, cpumask_of(nreaders % nr_cpu_ids));
+ set_user_nice(current, MAX_NICE);
+
+ VERBOSE_SCALEOUT("main_func task started");
+ result_avg = kcalloc(nruns, sizeof(*result_avg), GFP_KERNEL);
+ buf = kzalloc(800 + 64, GFP_KERNEL);
+ if (!result_avg || !buf) {
+ SCALEOUT_ERRSTRING("out of memory");
+ goto oom_exit;
+ }
+ if (holdoff)
+ schedule_timeout_interruptible(holdoff * HZ);
+
+ // Wait for all threads to start.
+ atomic_inc(&n_init);
+ while (atomic_read(&n_init) < nreaders + 1)
+ schedule_timeout_uninterruptible(1);
+
+ // Start exp readers up per experiment
+ rcu_scale_warm_cool();
+ for (exp = 0; exp < nruns && !torture_must_stop(); exp++) {
+ if (torture_must_stop())
+ goto end;
+
+ reset_readers();
+ atomic_set(&nreaders_exp, nreaders);
+ atomic_set(&n_started, nreaders);
+ atomic_set(&n_warmedup, nreaders);
+ atomic_set(&n_cooleddown, nreaders);
+
+ exp_idx = exp;
+
+ for (r = 0; r < nreaders; r++) {
+ smp_store_release(&reader_tasks[r].start_reader, 1);
+ wake_up(&reader_tasks[r].wq);
+ }
+
+ VERBOSE_SCALEOUT("main_func: experiment started, waiting for %d readers",
+ nreaders);
+
+ wait_event(main_wq,
+ !atomic_read(&nreaders_exp) || torture_must_stop());
+
+ VERBOSE_SCALEOUT("main_func: experiment ended");
+
+ if (torture_must_stop())
+ goto end;
+
+ result_avg[exp] = div_u64(1000 * process_durations(nreaders), nreaders * loops);
+ }
+ rcu_scale_warm_cool();
+
+ // Print the average of all experiments
+ SCALEOUT("END OF TEST. Calculating average duration per loop (nanoseconds)...\n");
+
+ pr_alert("Runs\tTime(ns)\n");
+ for (exp = 0; exp < nruns; exp++) {
+ u64 avg;
+ u32 rem;
+
+ avg = div_u64_rem(result_avg[exp], 1000, &rem);
+ sprintf(buf1, "%d\t%llu.%03u\n", exp + 1, avg, rem);
+ strcat(buf, buf1);
+ if (strlen(buf) >= 800) {
+ pr_alert("%s", buf);
+ buf[0] = 0;
+ }
+ }
+
+ pr_alert("%s", buf);
+
+oom_exit:
+ // This will shutdown everything including us.
+ if (shutdown) {
+ shutdown_start = 1;
+ wake_up(&shutdown_wq);
+ }
+
+ // Wait for torture to stop us
+ while (!torture_must_stop())
+ schedule_timeout_uninterruptible(1);
+
+end:
+ torture_kthread_stopping("main_func");
+ kfree(result_avg);
+ kfree(buf);
+ return 0;
+}
+
+static void
+ref_scale_print_module_parms(const struct ref_scale_ops *cur_ops, const char *tag)
+{
+ pr_alert("%s" SCALE_FLAG
+ "--- %s: verbose=%d verbose_batched=%d shutdown=%d holdoff=%d lookup_instances=%ld loops=%d nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag,
+ verbose, verbose_batched, shutdown, holdoff, lookup_instances, loops, nreaders, nruns, readdelay);
+}
+
+static void
+ref_scale_cleanup(void)
+{
+ int i;
+
+ if (torture_cleanup_begin())
+ return;
+
+ if (!cur_ops) {
+ torture_cleanup_end();
+ return;
+ }
+
+ if (reader_tasks) {
+ for (i = 0; i < nreaders; i++)
+ torture_stop_kthread("ref_scale_reader",
+ reader_tasks[i].task);
+ }
+ kfree(reader_tasks);
+ reader_tasks = NULL;
+
+ torture_stop_kthread("main_task", main_task);
+
+ // Do scale-type-specific cleanup operations.
+ if (cur_ops->cleanup != NULL)
+ cur_ops->cleanup();
+
+ torture_cleanup_end();
+}
+
+// Shutdown kthread. Just waits to be awakened, then shuts down system.
+static int
+ref_scale_shutdown(void *arg)
+{
+ wait_event_idle(shutdown_wq, shutdown_start);
+
+ smp_mb(); // Wake before output.
+ ref_scale_cleanup();
+ kernel_power_off();
+
+ return -EINVAL;
+}
+
+static int __init
+ref_scale_init(void)
+{
+ long i;
+ int firsterr = 0;
+ static const struct ref_scale_ops *scale_ops[] = {
+ &rcu_ops, &srcu_ops, &srcu_fast_ops, &srcu_fast_updown_ops,
+ RCU_TRACE_OPS RCU_TASKS_OPS
+ &refcnt_ops, &percpuinc_ops, &incpercpu_ops, &incpercpupreempt_ops,
+ &incpercpubh_ops, &incpercpuirqsave_ops,
+ &rwlock_ops, &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops,
+ &sched_clock_ops, &clock_ops, &jiffies_ops,
+ &preempt_ops, &bh_ops, &irq_ops, &irqsave_ops,
+ &typesafe_ref_ops, &typesafe_lock_ops, &typesafe_seqlock_ops,
+ };
+
+ if (!torture_init_begin(scale_type, verbose))
+ return -EBUSY;
+
+ for (i = 0; i < ARRAY_SIZE(scale_ops); i++) {
+ cur_ops = scale_ops[i];
+ if (strcmp(scale_type, cur_ops->name) == 0)
+ break;
+ }
+ if (i == ARRAY_SIZE(scale_ops)) {
+ pr_alert("rcu-scale: invalid scale type: \"%s\"\n", scale_type);
+ pr_alert("rcu-scale types:");
+ for (i = 0; i < ARRAY_SIZE(scale_ops); i++)
+ pr_cont(" %s", scale_ops[i]->name);
+ pr_cont("\n");
+ firsterr = -EINVAL;
+ cur_ops = NULL;
+ goto unwind;
+ }
+ if (cur_ops->init)
+ if (!cur_ops->init()) {
+ firsterr = -EUCLEAN;
+ goto unwind;
+ }
+
+ ref_scale_print_module_parms(cur_ops, "Start of test");
+
+ // Shutdown task
+ if (shutdown) {
+ init_waitqueue_head(&shutdown_wq);
+ firsterr = torture_create_kthread(ref_scale_shutdown, NULL,
+ shutdown_task);
+ if (torture_init_error(firsterr))
+ goto unwind;
+ schedule_timeout_uninterruptible(1);
+ }
+
+ // Reader tasks (default to ~75% of online CPUs).
+ if (nreaders < 0)
+ nreaders = (num_online_cpus() >> 1) + (num_online_cpus() >> 2);
+ if (WARN_ONCE(loops <= 0, "%s: loops = %d, adjusted to 1\n", __func__, loops))
+ loops = 1;
+ if (WARN_ONCE(nreaders <= 0, "%s: nreaders = %d, adjusted to 1\n", __func__, nreaders))
+ nreaders = 1;
+ if (WARN_ONCE(nruns <= 0, "%s: nruns = %d, adjusted to 1\n", __func__, nruns))
+ nruns = 1;
+ if (WARN_ONCE(loops > INT_MAX / nreaders,
+ "%s: nreaders * loops will overflow, adjusted loops to %d",
+ __func__, INT_MAX / nreaders))
+ loops = INT_MAX / nreaders;
+ reader_tasks = kcalloc(nreaders, sizeof(reader_tasks[0]),
+ GFP_KERNEL);
+ if (!reader_tasks) {
+ SCALEOUT_ERRSTRING("out of memory");
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
+
+ VERBOSE_SCALEOUT("Starting %d reader threads", nreaders);
+
+ for (i = 0; i < nreaders; i++) {
+ init_waitqueue_head(&reader_tasks[i].wq);
+ firsterr = torture_create_kthread(ref_scale_reader, (void *)i,
+ reader_tasks[i].task);
+ if (torture_init_error(firsterr))
+ goto unwind;
+ }
+
+ // Main Task
+ init_waitqueue_head(&main_wq);
+ firsterr = torture_create_kthread(main_func, NULL, main_task);
+ if (torture_init_error(firsterr))
+ goto unwind;
+
+ torture_init_end();
+ return 0;
+
+unwind:
+ torture_init_end();
+ ref_scale_cleanup();
+ if (shutdown) {
+ WARN_ON(!IS_MODULE(CONFIG_RCU_REF_SCALE_TEST));
+ kernel_power_off();
+ }
+ return firsterr;
+}
+
+module_init(ref_scale_init);
+module_exit(ref_scale_cleanup);
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
deleted file mode 100644
index cad76e76b4e7..000000000000
--- a/kernel/rcu/srcu.c
+++ /dev/null
@@ -1,676 +0,0 @@
-/*
- * Sleepable Read-Copy Update mechanism for mutual exclusion.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
- * Copyright (C) IBM Corporation, 2006
- * Copyright (C) Fujitsu, 2012
- *
- * Author: Paul McKenney <paulmck@us.ibm.com>
- * Lai Jiangshan <laijs@cn.fujitsu.com>
- *
- * For detailed explanation of Read-Copy Update mechanism see -
- * Documentation/RCU/ *.txt
- *
- */
-
-#include <linux/export.h>
-#include <linux/mutex.h>
-#include <linux/percpu.h>
-#include <linux/preempt.h>
-#include <linux/rcupdate.h>
-#include <linux/sched.h>
-#include <linux/smp.h>
-#include <linux/delay.h>
-#include <linux/srcu.h>
-
-#include "rcu.h"
-
-/*
- * Initialize an rcu_batch structure to empty.
- */
-static inline void rcu_batch_init(struct rcu_batch *b)
-{
- b->head = NULL;
- b->tail = &b->head;
-}
-
-/*
- * Enqueue a callback onto the tail of the specified rcu_batch structure.
- */
-static inline void rcu_batch_queue(struct rcu_batch *b, struct rcu_head *head)
-{
- *b->tail = head;
- b->tail = &head->next;
-}
-
-/*
- * Is the specified rcu_batch structure empty?
- */
-static inline bool rcu_batch_empty(struct rcu_batch *b)
-{
- return b->tail == &b->head;
-}
-
-/*
- * Remove the callback at the head of the specified rcu_batch structure
- * and return a pointer to it, or return NULL if the structure is empty.
- */
-static inline struct rcu_head *rcu_batch_dequeue(struct rcu_batch *b)
-{
- struct rcu_head *head;
-
- if (rcu_batch_empty(b))
- return NULL;
-
- head = b->head;
- b->head = head->next;
- if (b->tail == &head->next)
- rcu_batch_init(b);
-
- return head;
-}
-
-/*
- * Move all callbacks from the rcu_batch structure specified by "from" to
- * the structure specified by "to".
- */
-static inline void rcu_batch_move(struct rcu_batch *to, struct rcu_batch *from)
-{
- if (!rcu_batch_empty(from)) {
- *to->tail = from->head;
- to->tail = from->tail;
- rcu_batch_init(from);
- }
-}
-
-static int init_srcu_struct_fields(struct srcu_struct *sp)
-{
- sp->completed = 0;
- spin_lock_init(&sp->queue_lock);
- sp->running = false;
- rcu_batch_init(&sp->batch_queue);
- rcu_batch_init(&sp->batch_check0);
- rcu_batch_init(&sp->batch_check1);
- rcu_batch_init(&sp->batch_done);
- INIT_DELAYED_WORK(&sp->work, process_srcu);
- sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
- return sp->per_cpu_ref ? 0 : -ENOMEM;
-}
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-
-int __init_srcu_struct(struct srcu_struct *sp, const char *name,
- struct lock_class_key *key)
-{
- /* Don't re-initialize a lock while it is held. */
- debug_check_no_locks_freed((void *)sp, sizeof(*sp));
- lockdep_init_map(&sp->dep_map, name, key, 0);
- return init_srcu_struct_fields(sp);
-}
-EXPORT_SYMBOL_GPL(__init_srcu_struct);
-
-#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
-
-/**
- * init_srcu_struct - initialize a sleep-RCU structure
- * @sp: structure to initialize.
- *
- * Must invoke this on a given srcu_struct before passing that srcu_struct
- * to any other function. Each srcu_struct represents a separate domain
- * of SRCU protection.
- */
-int init_srcu_struct(struct srcu_struct *sp)
-{
- return init_srcu_struct_fields(sp);
-}
-EXPORT_SYMBOL_GPL(init_srcu_struct);
-
-#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
-
-/*
- * Returns approximate total of the readers' ->seq[] values for the
- * rank of per-CPU counters specified by idx.
- */
-static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx)
-{
- int cpu;
- unsigned long sum = 0;
- unsigned long t;
-
- for_each_possible_cpu(cpu) {
- t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]);
- sum += t;
- }
- return sum;
-}
-
-/*
- * Returns approximate number of readers active on the specified rank
- * of the per-CPU ->c[] counters.
- */
-static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx)
-{
- int cpu;
- unsigned long sum = 0;
- unsigned long t;
-
- for_each_possible_cpu(cpu) {
- t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]);
- sum += t;
- }
- return sum;
-}
-
-/*
- * Return true if the number of pre-existing readers is determined to
- * be stably zero. An example unstable zero can occur if the call
- * to srcu_readers_active_idx() misses an __srcu_read_lock() increment,
- * but due to task migration, sees the corresponding __srcu_read_unlock()
- * decrement. This can happen because srcu_readers_active_idx() takes
- * time to sum the array, and might in fact be interrupted or preempted
- * partway through the summation.
- */
-static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
-{
- unsigned long seq;
-
- seq = srcu_readers_seq_idx(sp, idx);
-
- /*
- * The following smp_mb() A pairs with the smp_mb() B located in
- * __srcu_read_lock(). This pairing ensures that if an
- * __srcu_read_lock() increments its counter after the summation
- * in srcu_readers_active_idx(), then the corresponding SRCU read-side
- * critical section will see any changes made prior to the start
- * of the current SRCU grace period.
- *
- * Also, if the above call to srcu_readers_seq_idx() saw the
- * increment of ->seq[], then the call to srcu_readers_active_idx()
- * must see the increment of ->c[].
- */
- smp_mb(); /* A */
-
- /*
- * Note that srcu_readers_active_idx() can incorrectly return
- * zero even though there is a pre-existing reader throughout.
- * To see this, suppose that task A is in a very long SRCU
- * read-side critical section that started on CPU 0, and that
- * no other reader exists, so that the sum of the counters
- * is equal to one. Then suppose that task B starts executing
- * srcu_readers_active_idx(), summing up to CPU 1, and then that
- * task C starts reading on CPU 0, so that its increment is not
- * summed, but finishes reading on CPU 2, so that its decrement
- * -is- summed. Then when task B completes its sum, it will
- * incorrectly get zero, despite the fact that task A has been
- * in its SRCU read-side critical section the whole time.
- *
- * We therefore do a validation step should srcu_readers_active_idx()
- * return zero.
- */
- if (srcu_readers_active_idx(sp, idx) != 0)
- return false;
-
- /*
- * The remainder of this function is the validation step.
- * The following smp_mb() D pairs with the smp_mb() C in
- * __srcu_read_unlock(). If the __srcu_read_unlock() was seen
- * by srcu_readers_active_idx() above, then any destructive
- * operation performed after the grace period will happen after
- * the corresponding SRCU read-side critical section.
- *
- * Note that there can be at most NR_CPUS worth of readers using
- * the old index, which is not enough to overflow even a 32-bit
- * integer. (Yes, this does mean that systems having more than
- * a billion or so CPUs need to be 64-bit systems.) Therefore,
- * the sum of the ->seq[] counters cannot possibly overflow.
- * Therefore, the only way that the return values of the two
- * calls to srcu_readers_seq_idx() can be equal is if there were
- * no increments of the corresponding rank of ->seq[] counts
- * in the interim. But the missed-increment scenario laid out
- * above includes an increment of the ->seq[] counter by
- * the corresponding __srcu_read_lock(). Therefore, if this
- * scenario occurs, the return values from the two calls to
- * srcu_readers_seq_idx() will differ, and thus the validation
- * step below suffices.
- */
- smp_mb(); /* D */
-
- return srcu_readers_seq_idx(sp, idx) == seq;
-}
-
-/**
- * srcu_readers_active - returns approximate number of readers.
- * @sp: which srcu_struct to count active readers (holding srcu_read_lock).
- *
- * Note that this is not an atomic primitive, and can therefore suffer
- * severe errors when invoked on an active srcu_struct. That said, it
- * can be useful as an error check at cleanup time.
- */
-static int srcu_readers_active(struct srcu_struct *sp)
-{
- int cpu;
- unsigned long sum = 0;
-
- for_each_possible_cpu(cpu) {
- sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]);
- sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]);
- }
- return sum;
-}
-
-/**
- * cleanup_srcu_struct - deconstruct a sleep-RCU structure
- * @sp: structure to clean up.
- *
- * Must invoke this after you are finished using a given srcu_struct that
- * was initialized via init_srcu_struct(), else you leak memory.
- */
-void cleanup_srcu_struct(struct srcu_struct *sp)
-{
- if (WARN_ON(srcu_readers_active(sp)))
- return; /* Leakage unless caller handles error. */
- free_percpu(sp->per_cpu_ref);
- sp->per_cpu_ref = NULL;
-}
-EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
-
-/*
- * Counts the new reader in the appropriate per-CPU element of the
- * srcu_struct. Must be called from process context.
- * Returns an index that must be passed to the matching srcu_read_unlock().
- */
-int __srcu_read_lock(struct srcu_struct *sp)
-{
- int idx;
-
- idx = ACCESS_ONCE(sp->completed) & 0x1;
- preempt_disable();
- __this_cpu_inc(sp->per_cpu_ref->c[idx]);
- smp_mb(); /* B */ /* Avoid leaking the critical section. */
- __this_cpu_inc(sp->per_cpu_ref->seq[idx]);
- preempt_enable();
- return idx;
-}
-EXPORT_SYMBOL_GPL(__srcu_read_lock);
-
-/*
- * Removes the count for the old reader from the appropriate per-CPU
- * element of the srcu_struct. Note that this may well be a different
- * CPU than that which was incremented by the corresponding srcu_read_lock().
- * Must be called from process context.
- */
-void __srcu_read_unlock(struct srcu_struct *sp, int idx)
-{
- smp_mb(); /* C */ /* Avoid leaking the critical section. */
- this_cpu_dec(sp->per_cpu_ref->c[idx]);
-}
-EXPORT_SYMBOL_GPL(__srcu_read_unlock);
-
-/*
- * We use an adaptive strategy for synchronize_srcu() and especially for
- * synchronize_srcu_expedited(). We spin for a fixed time period
- * (defined below) to allow SRCU readers to exit their read-side critical
- * sections. If there are still some readers after 10 microseconds,
- * we repeatedly block for 1-millisecond time periods. This approach
- * has done well in testing, so there is no need for a config parameter.
- */
-#define SRCU_RETRY_CHECK_DELAY 5
-#define SYNCHRONIZE_SRCU_TRYCOUNT 2
-#define SYNCHRONIZE_SRCU_EXP_TRYCOUNT 12
-
-/*
- * @@@ Wait until all pre-existing readers complete. Such readers
- * will have used the index specified by "idx".
- * the caller should ensures the ->completed is not changed while checking
- * and idx = (->completed & 1) ^ 1
- */
-static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount)
-{
- for (;;) {
- if (srcu_readers_active_idx_check(sp, idx))
- return true;
- if (--trycount <= 0)
- return false;
- udelay(SRCU_RETRY_CHECK_DELAY);
- }
-}
-
-/*
- * Increment the ->completed counter so that future SRCU readers will
- * use the other rank of the ->c[] and ->seq[] arrays. This allows
- * us to wait for pre-existing readers in a starvation-free manner.
- */
-static void srcu_flip(struct srcu_struct *sp)
-{
- sp->completed++;
-}
-
-/*
- * Enqueue an SRCU callback on the specified srcu_struct structure,
- * initiating grace-period processing if it is not already running.
- *
- * Note that all CPUs must agree that the grace period extended beyond
- * all pre-existing SRCU read-side critical section. On systems with
- * more than one CPU, this means that when "func()" is invoked, each CPU
- * is guaranteed to have executed a full memory barrier since the end of
- * its last corresponding SRCU read-side critical section whose beginning
- * preceded the call to call_rcu(). It also means that each CPU executing
- * an SRCU read-side critical section that continues beyond the start of
- * "func()" must have executed a memory barrier after the call_rcu()
- * but before the beginning of that SRCU read-side critical section.
- * Note that these guarantees include CPUs that are offline, idle, or
- * executing in user mode, as well as CPUs that are executing in the kernel.
- *
- * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
- * resulting SRCU callback function "func()", then both CPU A and CPU
- * B are guaranteed to execute a full memory barrier during the time
- * interval between the call to call_rcu() and the invocation of "func()".
- * This guarantee applies even if CPU A and CPU B are the same CPU (but
- * again only if the system has more than one CPU).
- *
- * Of course, these guarantees apply only for invocations of call_srcu(),
- * srcu_read_lock(), and srcu_read_unlock() that are all passed the same
- * srcu_struct structure.
- */
-void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
- void (*func)(struct rcu_head *head))
-{
- unsigned long flags;
-
- head->next = NULL;
- head->func = func;
- spin_lock_irqsave(&sp->queue_lock, flags);
- rcu_batch_queue(&sp->batch_queue, head);
- if (!sp->running) {
- sp->running = true;
- queue_delayed_work(system_power_efficient_wq, &sp->work, 0);
- }
- spin_unlock_irqrestore(&sp->queue_lock, flags);
-}
-EXPORT_SYMBOL_GPL(call_srcu);
-
-static void srcu_advance_batches(struct srcu_struct *sp, int trycount);
-static void srcu_reschedule(struct srcu_struct *sp);
-
-/*
- * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
- */
-static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
-{
- struct rcu_synchronize rcu;
- struct rcu_head *head = &rcu.head;
- bool done = false;
-
- rcu_lockdep_assert(!lock_is_held(&sp->dep_map) &&
- !lock_is_held(&rcu_bh_lock_map) &&
- !lock_is_held(&rcu_lock_map) &&
- !lock_is_held(&rcu_sched_lock_map),
- "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section");
-
- might_sleep();
- init_completion(&rcu.completion);
-
- head->next = NULL;
- head->func = wakeme_after_rcu;
- spin_lock_irq(&sp->queue_lock);
- if (!sp->running) {
- /* steal the processing owner */
- sp->running = true;
- rcu_batch_queue(&sp->batch_check0, head);
- spin_unlock_irq(&sp->queue_lock);
-
- srcu_advance_batches(sp, trycount);
- if (!rcu_batch_empty(&sp->batch_done)) {
- BUG_ON(sp->batch_done.head != head);
- rcu_batch_dequeue(&sp->batch_done);
- done = true;
- }
- /* give the processing owner to work_struct */
- srcu_reschedule(sp);
- } else {
- rcu_batch_queue(&sp->batch_queue, head);
- spin_unlock_irq(&sp->queue_lock);
- }
-
- if (!done)
- wait_for_completion(&rcu.completion);
-}
-
-/**
- * synchronize_srcu - wait for prior SRCU read-side critical-section completion
- * @sp: srcu_struct with which to synchronize.
- *
- * Wait for the count to drain to zero of both indexes. To avoid the
- * possible starvation of synchronize_srcu(), it waits for the count of
- * the index=((->completed & 1) ^ 1) to drain to zero at first,
- * and then flip the completed and wait for the count of the other index.
- *
- * Can block; must be called from process context.
- *
- * Note that it is illegal to call synchronize_srcu() from the corresponding
- * SRCU read-side critical section; doing so will result in deadlock.
- * However, it is perfectly legal to call synchronize_srcu() on one
- * srcu_struct from some other srcu_struct's read-side critical section,
- * as long as the resulting graph of srcu_structs is acyclic.
- *
- * There are memory-ordering constraints implied by synchronize_srcu().
- * On systems with more than one CPU, when synchronize_srcu() returns,
- * each CPU is guaranteed to have executed a full memory barrier since
- * the end of its last corresponding SRCU-sched read-side critical section
- * whose beginning preceded the call to synchronize_srcu(). In addition,
- * each CPU having an SRCU read-side critical section that extends beyond
- * the return from synchronize_srcu() is guaranteed to have executed a
- * full memory barrier after the beginning of synchronize_srcu() and before
- * the beginning of that SRCU read-side critical section. Note that these
- * guarantees include CPUs that are offline, idle, or executing in user mode,
- * as well as CPUs that are executing in the kernel.
- *
- * Furthermore, if CPU A invoked synchronize_srcu(), which returned
- * to its caller on CPU B, then both CPU A and CPU B are guaranteed
- * to have executed a full memory barrier during the execution of
- * synchronize_srcu(). This guarantee applies even if CPU A and CPU B
- * are the same CPU, but again only if the system has more than one CPU.
- *
- * Of course, these memory-ordering guarantees apply only when
- * synchronize_srcu(), srcu_read_lock(), and srcu_read_unlock() are
- * passed the same srcu_struct structure.
- */
-void synchronize_srcu(struct srcu_struct *sp)
-{
- __synchronize_srcu(sp, rcu_gp_is_expedited()
- ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT
- : SYNCHRONIZE_SRCU_TRYCOUNT);
-}
-EXPORT_SYMBOL_GPL(synchronize_srcu);
-
-/**
- * synchronize_srcu_expedited - Brute-force SRCU grace period
- * @sp: srcu_struct with which to synchronize.
- *
- * Wait for an SRCU grace period to elapse, but be more aggressive about
- * spinning rather than blocking when waiting.
- *
- * Note that synchronize_srcu_expedited() has the same deadlock and
- * memory-ordering properties as does synchronize_srcu().
- */
-void synchronize_srcu_expedited(struct srcu_struct *sp)
-{
- __synchronize_srcu(sp, SYNCHRONIZE_SRCU_EXP_TRYCOUNT);
-}
-EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
-
-/**
- * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete.
- * @sp: srcu_struct on which to wait for in-flight callbacks.
- */
-void srcu_barrier(struct srcu_struct *sp)
-{
- synchronize_srcu(sp);
-}
-EXPORT_SYMBOL_GPL(srcu_barrier);
-
-/**
- * srcu_batches_completed - return batches completed.
- * @sp: srcu_struct on which to report batch completion.
- *
- * Report the number of batches, correlated with, but not necessarily
- * precisely the same as, the number of grace periods that have elapsed.
- */
-unsigned long srcu_batches_completed(struct srcu_struct *sp)
-{
- return sp->completed;
-}
-EXPORT_SYMBOL_GPL(srcu_batches_completed);
-
-#define SRCU_CALLBACK_BATCH 10
-#define SRCU_INTERVAL 1
-
-/*
- * Move any new SRCU callbacks to the first stage of the SRCU grace
- * period pipeline.
- */
-static void srcu_collect_new(struct srcu_struct *sp)
-{
- if (!rcu_batch_empty(&sp->batch_queue)) {
- spin_lock_irq(&sp->queue_lock);
- rcu_batch_move(&sp->batch_check0, &sp->batch_queue);
- spin_unlock_irq(&sp->queue_lock);
- }
-}
-
-/*
- * Core SRCU state machine. Advance callbacks from ->batch_check0 to
- * ->batch_check1 and then to ->batch_done as readers drain.
- */
-static void srcu_advance_batches(struct srcu_struct *sp, int trycount)
-{
- int idx = 1 ^ (sp->completed & 1);
-
- /*
- * Because readers might be delayed for an extended period after
- * fetching ->completed for their index, at any point in time there
- * might well be readers using both idx=0 and idx=1. We therefore
- * need to wait for readers to clear from both index values before
- * invoking a callback.
- */
-
- if (rcu_batch_empty(&sp->batch_check0) &&
- rcu_batch_empty(&sp->batch_check1))
- return; /* no callbacks need to be advanced */
-
- if (!try_check_zero(sp, idx, trycount))
- return; /* failed to advance, will try after SRCU_INTERVAL */
-
- /*
- * The callbacks in ->batch_check1 have already done with their
- * first zero check and flip back when they were enqueued on
- * ->batch_check0 in a previous invocation of srcu_advance_batches().
- * (Presumably try_check_zero() returned false during that
- * invocation, leaving the callbacks stranded on ->batch_check1.)
- * They are therefore ready to invoke, so move them to ->batch_done.
- */
- rcu_batch_move(&sp->batch_done, &sp->batch_check1);
-
- if (rcu_batch_empty(&sp->batch_check0))
- return; /* no callbacks need to be advanced */
- srcu_flip(sp);
-
- /*
- * The callbacks in ->batch_check0 just finished their
- * first check zero and flip, so move them to ->batch_check1
- * for future checking on the other idx.
- */
- rcu_batch_move(&sp->batch_check1, &sp->batch_check0);
-
- /*
- * SRCU read-side critical sections are normally short, so check
- * at least twice in quick succession after a flip.
- */
- trycount = trycount < 2 ? 2 : trycount;
- if (!try_check_zero(sp, idx^1, trycount))
- return; /* failed to advance, will try after SRCU_INTERVAL */
-
- /*
- * The callbacks in ->batch_check1 have now waited for all
- * pre-existing readers using both idx values. They are therefore
- * ready to invoke, so move them to ->batch_done.
- */
- rcu_batch_move(&sp->batch_done, &sp->batch_check1);
-}
-
-/*
- * Invoke a limited number of SRCU callbacks that have passed through
- * their grace period. If there are more to do, SRCU will reschedule
- * the workqueue.
- */
-static void srcu_invoke_callbacks(struct srcu_struct *sp)
-{
- int i;
- struct rcu_head *head;
-
- for (i = 0; i < SRCU_CALLBACK_BATCH; i++) {
- head = rcu_batch_dequeue(&sp->batch_done);
- if (!head)
- break;
- local_bh_disable();
- head->func(head);
- local_bh_enable();
- }
-}
-
-/*
- * Finished one round of SRCU grace period. Start another if there are
- * more SRCU callbacks queued, otherwise put SRCU into not-running state.
- */
-static void srcu_reschedule(struct srcu_struct *sp)
-{
- bool pending = true;
-
- if (rcu_batch_empty(&sp->batch_done) &&
- rcu_batch_empty(&sp->batch_check1) &&
- rcu_batch_empty(&sp->batch_check0) &&
- rcu_batch_empty(&sp->batch_queue)) {
- spin_lock_irq(&sp->queue_lock);
- if (rcu_batch_empty(&sp->batch_done) &&
- rcu_batch_empty(&sp->batch_check1) &&
- rcu_batch_empty(&sp->batch_check0) &&
- rcu_batch_empty(&sp->batch_queue)) {
- sp->running = false;
- pending = false;
- }
- spin_unlock_irq(&sp->queue_lock);
- }
-
- if (pending)
- queue_delayed_work(system_power_efficient_wq,
- &sp->work, SRCU_INTERVAL);
-}
-
-/*
- * This is the work-queue function that handles SRCU grace periods.
- */
-void process_srcu(struct work_struct *work)
-{
- struct srcu_struct *sp;
-
- sp = container_of(work, struct srcu_struct, work.work);
-
- srcu_collect_new(sp);
- srcu_advance_batches(sp, 1);
- srcu_invoke_callbacks(sp);
- srcu_reschedule(sp);
-}
-EXPORT_SYMBOL_GPL(process_srcu);
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
new file mode 100644
index 000000000000..3450c3751ef7
--- /dev/null
+++ b/kernel/rcu/srcutiny.c
@@ -0,0 +1,316 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Sleepable Read-Copy Update mechanism for mutual exclusion,
+ * tiny version for non-preemptible single-CPU use.
+ *
+ * Copyright (C) IBM Corporation, 2017
+ *
+ * Author: Paul McKenney <paulmck@linux.ibm.com>
+ */
+
+#include <linux/export.h>
+#include <linux/mutex.h>
+#include <linux/preempt.h>
+#include <linux/rcupdate_wait.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/srcu.h>
+
+#include <linux/rcu_node_tree.h>
+#include "rcu_segcblist.h"
+#include "rcu.h"
+
+#ifndef CONFIG_TREE_RCU
+int rcu_scheduler_active __read_mostly;
+#else // #ifndef CONFIG_TREE_RCU
+extern int rcu_scheduler_active;
+#endif // #else // #ifndef CONFIG_TREE_RCU
+static LIST_HEAD(srcu_boot_list);
+static bool srcu_init_done;
+
+static int init_srcu_struct_fields(struct srcu_struct *ssp)
+{
+ ssp->srcu_lock_nesting[0] = 0;
+ ssp->srcu_lock_nesting[1] = 0;
+ init_swait_queue_head(&ssp->srcu_wq);
+ ssp->srcu_cb_head = NULL;
+ ssp->srcu_cb_tail = &ssp->srcu_cb_head;
+ ssp->srcu_gp_running = false;
+ ssp->srcu_gp_waiting = false;
+ ssp->srcu_idx = 0;
+ ssp->srcu_idx_max = 0;
+ INIT_WORK(&ssp->srcu_work, srcu_drive_gp);
+ INIT_LIST_HEAD(&ssp->srcu_work.entry);
+ return 0;
+}
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+
+int __init_srcu_struct(struct srcu_struct *ssp, const char *name,
+ struct lock_class_key *key)
+{
+ /* Don't re-initialize a lock while it is held. */
+ debug_check_no_locks_freed((void *)ssp, sizeof(*ssp));
+ lockdep_init_map(&ssp->dep_map, name, key, 0);
+ return init_srcu_struct_fields(ssp);
+}
+EXPORT_SYMBOL_GPL(__init_srcu_struct);
+
+#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+
+/*
+ * init_srcu_struct - initialize a sleep-RCU structure
+ * @ssp: structure to initialize.
+ *
+ * Must invoke this on a given srcu_struct before passing that srcu_struct
+ * to any other function. Each srcu_struct represents a separate domain
+ * of SRCU protection.
+ */
+int init_srcu_struct(struct srcu_struct *ssp)
+{
+ return init_srcu_struct_fields(ssp);
+}
+EXPORT_SYMBOL_GPL(init_srcu_struct);
+
+#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+
+/*
+ * cleanup_srcu_struct - deconstruct a sleep-RCU structure
+ * @ssp: structure to clean up.
+ *
+ * Must invoke this after you are finished using a given srcu_struct that
+ * was initialized via init_srcu_struct(), else you leak memory.
+ */
+void cleanup_srcu_struct(struct srcu_struct *ssp)
+{
+ WARN_ON(ssp->srcu_lock_nesting[0] || ssp->srcu_lock_nesting[1]);
+ flush_work(&ssp->srcu_work);
+ WARN_ON(ssp->srcu_gp_running);
+ WARN_ON(ssp->srcu_gp_waiting);
+ WARN_ON(ssp->srcu_cb_head);
+ WARN_ON(&ssp->srcu_cb_head != ssp->srcu_cb_tail);
+ WARN_ON(ssp->srcu_idx != ssp->srcu_idx_max);
+ WARN_ON(ssp->srcu_idx & 0x1);
+}
+EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
+
+/*
+ * Removes the count for the old reader from the appropriate element of
+ * the srcu_struct.
+ */
+void __srcu_read_unlock(struct srcu_struct *ssp, int idx)
+{
+ int newval;
+
+ preempt_disable(); // Needed for PREEMPT_LAZY
+ newval = READ_ONCE(ssp->srcu_lock_nesting[idx]) - 1;
+ WRITE_ONCE(ssp->srcu_lock_nesting[idx], newval);
+ preempt_enable();
+ if (!newval && READ_ONCE(ssp->srcu_gp_waiting) && in_task() && !irqs_disabled())
+ swake_up_one(&ssp->srcu_wq);
+}
+EXPORT_SYMBOL_GPL(__srcu_read_unlock);
+
+/*
+ * Workqueue handler to drive one grace period and invoke any callbacks
+ * that become ready as a result. Single-CPU operation and preemption
+ * disabling mean that we get away with murder on synchronization. ;-)
+ */
+void srcu_drive_gp(struct work_struct *wp)
+{
+ int idx;
+ struct rcu_head *lh;
+ struct rcu_head *rhp;
+ struct srcu_struct *ssp;
+
+ ssp = container_of(wp, struct srcu_struct, srcu_work);
+ preempt_disable(); // Needed for PREEMPT_LAZY
+ if (ssp->srcu_gp_running || ULONG_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max))) {
+ preempt_enable();
+ return; /* Already running or nothing to do. */
+ }
+
+ /* Remove recently arrived callbacks and wait for readers. */
+ WRITE_ONCE(ssp->srcu_gp_running, true);
+ local_irq_disable();
+ lh = ssp->srcu_cb_head;
+ ssp->srcu_cb_head = NULL;
+ ssp->srcu_cb_tail = &ssp->srcu_cb_head;
+ local_irq_enable();
+ idx = (ssp->srcu_idx & 0x2) / 2;
+ WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1);
+ WRITE_ONCE(ssp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */
+ preempt_enable();
+ do {
+ // Deadlock issues prevent __srcu_read_unlock() from
+ // doing an unconditional wakeup, so polling is required.
+ swait_event_timeout_exclusive(ssp->srcu_wq,
+ !READ_ONCE(ssp->srcu_lock_nesting[idx]), HZ / 10);
+ } while (READ_ONCE(ssp->srcu_lock_nesting[idx]));
+ preempt_disable(); // Needed for PREEMPT_LAZY
+ WRITE_ONCE(ssp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */
+ WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1);
+ preempt_enable();
+
+ /* Invoke the callbacks we removed above. */
+ while (lh) {
+ rhp = lh;
+ lh = lh->next;
+ debug_rcu_head_callback(rhp);
+ local_bh_disable();
+ rhp->func(rhp);
+ local_bh_enable();
+ }
+
+ /*
+ * Enable rescheduling, and if there are more callbacks,
+ * reschedule ourselves. This can race with a call_srcu()
+ * at interrupt level, but the ->srcu_gp_running checks will
+ * straighten that out.
+ */
+ preempt_disable(); // Needed for PREEMPT_LAZY
+ WRITE_ONCE(ssp->srcu_gp_running, false);
+ idx = ULONG_CMP_LT(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max));
+ preempt_enable();
+ if (idx)
+ schedule_work(&ssp->srcu_work);
+}
+EXPORT_SYMBOL_GPL(srcu_drive_gp);
+
+static void srcu_gp_start_if_needed(struct srcu_struct *ssp)
+{
+ unsigned long cookie;
+
+ lockdep_assert_preemption_disabled(); // Needed for PREEMPT_LAZY
+ cookie = get_state_synchronize_srcu(ssp);
+ if (ULONG_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie)) {
+ return;
+ }
+ WRITE_ONCE(ssp->srcu_idx_max, cookie);
+ if (!READ_ONCE(ssp->srcu_gp_running)) {
+ if (likely(srcu_init_done))
+ schedule_work(&ssp->srcu_work);
+ else if (list_empty(&ssp->srcu_work.entry))
+ list_add(&ssp->srcu_work.entry, &srcu_boot_list);
+ }
+}
+
+/*
+ * Enqueue an SRCU callback on the specified srcu_struct structure,
+ * initiating grace-period processing if it is not already running.
+ */
+void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
+ rcu_callback_t func)
+{
+ unsigned long flags;
+
+ rhp->func = func;
+ rhp->next = NULL;
+ preempt_disable(); // Needed for PREEMPT_LAZY
+ local_irq_save(flags);
+ *ssp->srcu_cb_tail = rhp;
+ ssp->srcu_cb_tail = &rhp->next;
+ local_irq_restore(flags);
+ srcu_gp_start_if_needed(ssp);
+ preempt_enable();
+}
+EXPORT_SYMBOL_GPL(call_srcu);
+
+/*
+ * synchronize_srcu - wait for prior SRCU read-side critical-section completion
+ */
+void synchronize_srcu(struct srcu_struct *ssp)
+{
+ struct rcu_synchronize rs;
+
+ srcu_lock_sync(&ssp->dep_map);
+
+ RCU_LOCKDEP_WARN(lockdep_is_held(ssp) ||
+ lock_is_held(&rcu_bh_lock_map) ||
+ lock_is_held(&rcu_lock_map) ||
+ lock_is_held(&rcu_sched_lock_map),
+ "Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section");
+
+ if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
+ return;
+
+ might_sleep();
+ init_rcu_head_on_stack(&rs.head);
+ init_completion(&rs.completion);
+ call_srcu(ssp, &rs.head, wakeme_after_rcu);
+ wait_for_completion(&rs.completion);
+ destroy_rcu_head_on_stack(&rs.head);
+}
+EXPORT_SYMBOL_GPL(synchronize_srcu);
+
+/*
+ * get_state_synchronize_srcu - Provide an end-of-grace-period cookie
+ */
+unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp)
+{
+ unsigned long ret;
+
+ barrier();
+ ret = (READ_ONCE(ssp->srcu_idx) + 3) & ~0x1;
+ barrier();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(get_state_synchronize_srcu);
+
+/*
+ * start_poll_synchronize_srcu - Provide cookie and start grace period
+ *
+ * The difference between this and get_state_synchronize_srcu() is that
+ * this function ensures that the poll_state_synchronize_srcu() will
+ * eventually return the value true.
+ */
+unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp)
+{
+ unsigned long ret;
+
+ preempt_disable(); // Needed for PREEMPT_LAZY
+ ret = get_state_synchronize_srcu(ssp);
+ srcu_gp_start_if_needed(ssp);
+ preempt_enable();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu);
+
+/*
+ * poll_state_synchronize_srcu - Has cookie's grace period ended?
+ */
+bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie)
+{
+ unsigned long cur_s = READ_ONCE(ssp->srcu_idx);
+
+ barrier();
+ return cookie == SRCU_GET_STATE_COMPLETED ||
+ ULONG_CMP_GE(cur_s, cookie) || ULONG_CMP_LT(cur_s, cookie - 3);
+}
+EXPORT_SYMBOL_GPL(poll_state_synchronize_srcu);
+
+#ifndef CONFIG_TREE_RCU
+/* Lockdep diagnostics. */
+void __init rcu_scheduler_starting(void)
+{
+ rcu_scheduler_active = RCU_SCHEDULER_RUNNING;
+}
+#endif // #ifndef CONFIG_TREE_RCU
+
+/*
+ * Queue work for srcu_struct structures with early boot callbacks.
+ * The work won't actually execute until the workqueue initialization
+ * phase that takes place after the scheduler starts.
+ */
+void __init srcu_init(void)
+{
+ struct srcu_struct *ssp;
+
+ srcu_init_done = true;
+ while (!list_empty(&srcu_boot_list)) {
+ ssp = list_first_entry(&srcu_boot_list,
+ struct srcu_struct, srcu_work.entry);
+ list_del_init(&ssp->srcu_work.entry);
+ schedule_work(&ssp->srcu_work);
+ }
+}
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
new file mode 100644
index 000000000000..ea3f128de06f
--- /dev/null
+++ b/kernel/rcu/srcutree.c
@@ -0,0 +1,2208 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Sleepable Read-Copy Update mechanism for mutual exclusion.
+ *
+ * Copyright (C) IBM Corporation, 2006
+ * Copyright (C) Fujitsu, 2012
+ *
+ * Authors: Paul McKenney <paulmck@linux.ibm.com>
+ * Lai Jiangshan <laijs@cn.fujitsu.com>
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * Documentation/RCU/ *.txt
+ *
+ */
+
+#define pr_fmt(fmt) "rcu: " fmt
+
+#include <linux/export.h>
+#include <linux/mutex.h>
+#include <linux/percpu.h>
+#include <linux/preempt.h>
+#include <linux/rcupdate_wait.h>
+#include <linux/sched.h>
+#include <linux/smp.h>
+#include <linux/delay.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/srcu.h>
+
+#include "rcu.h"
+#include "rcu_segcblist.h"
+
+/* Holdoff in nanoseconds for auto-expediting. */
+#define DEFAULT_SRCU_EXP_HOLDOFF (25 * 1000)
+static ulong exp_holdoff = DEFAULT_SRCU_EXP_HOLDOFF;
+module_param(exp_holdoff, ulong, 0444);
+
+/* Overflow-check frequency. N bits roughly says every 2**N grace periods. */
+static ulong counter_wrap_check = (ULONG_MAX >> 2);
+module_param(counter_wrap_check, ulong, 0444);
+
+/*
+ * Control conversion to SRCU_SIZE_BIG:
+ * 0: Don't convert at all.
+ * 1: Convert at init_srcu_struct() time.
+ * 2: Convert when rcutorture invokes srcu_torture_stats_print().
+ * 3: Decide at boot time based on system shape (default).
+ * 0x1x: Convert when excessive contention encountered.
+ */
+#define SRCU_SIZING_NONE 0
+#define SRCU_SIZING_INIT 1
+#define SRCU_SIZING_TORTURE 2
+#define SRCU_SIZING_AUTO 3
+#define SRCU_SIZING_CONTEND 0x10
+#define SRCU_SIZING_IS(x) ((convert_to_big & ~SRCU_SIZING_CONTEND) == x)
+#define SRCU_SIZING_IS_NONE() (SRCU_SIZING_IS(SRCU_SIZING_NONE))
+#define SRCU_SIZING_IS_INIT() (SRCU_SIZING_IS(SRCU_SIZING_INIT))
+#define SRCU_SIZING_IS_TORTURE() (SRCU_SIZING_IS(SRCU_SIZING_TORTURE))
+#define SRCU_SIZING_IS_CONTEND() (convert_to_big & SRCU_SIZING_CONTEND)
+static int convert_to_big = SRCU_SIZING_AUTO;
+module_param(convert_to_big, int, 0444);
+
+/* Number of CPUs to trigger init_srcu_struct()-time transition to big. */
+static int big_cpu_lim __read_mostly = 128;
+module_param(big_cpu_lim, int, 0444);
+
+/* Contention events per jiffy to initiate transition to big. */
+static int small_contention_lim __read_mostly = 100;
+module_param(small_contention_lim, int, 0444);
+
+/* Early-boot callback-management, so early that no lock is required! */
+static LIST_HEAD(srcu_boot_list);
+static bool __read_mostly srcu_init_done;
+
+static void srcu_invoke_callbacks(struct work_struct *work);
+static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay);
+static void process_srcu(struct work_struct *work);
+static void srcu_delay_timer(struct timer_list *t);
+
+/* Wrappers for lock acquisition and release, see raw_spin_lock_rcu_node(). */
+#define spin_lock_rcu_node(p) \
+do { \
+ spin_lock(&ACCESS_PRIVATE(p, lock)); \
+ smp_mb__after_unlock_lock(); \
+} while (0)
+
+#define spin_unlock_rcu_node(p) spin_unlock(&ACCESS_PRIVATE(p, lock))
+
+#define spin_lock_irq_rcu_node(p) \
+do { \
+ spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \
+ smp_mb__after_unlock_lock(); \
+} while (0)
+
+#define spin_unlock_irq_rcu_node(p) \
+ spin_unlock_irq(&ACCESS_PRIVATE(p, lock))
+
+#define spin_lock_irqsave_rcu_node(p, flags) \
+do { \
+ spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \
+ smp_mb__after_unlock_lock(); \
+} while (0)
+
+#define spin_trylock_irqsave_rcu_node(p, flags) \
+({ \
+ bool ___locked = spin_trylock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \
+ \
+ if (___locked) \
+ smp_mb__after_unlock_lock(); \
+ ___locked; \
+})
+
+#define spin_unlock_irqrestore_rcu_node(p, flags) \
+ spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \
+
+/*
+ * Initialize SRCU per-CPU data. Note that statically allocated
+ * srcu_struct structures might already have srcu_read_lock() and
+ * srcu_read_unlock() running against them. So if the is_static
+ * parameter is set, don't initialize ->srcu_ctrs[].srcu_locks and
+ * ->srcu_ctrs[].srcu_unlocks.
+ */
+static void init_srcu_struct_data(struct srcu_struct *ssp)
+{
+ int cpu;
+ struct srcu_data *sdp;
+
+ /*
+ * Initialize the per-CPU srcu_data array, which feeds into the
+ * leaves of the srcu_node tree.
+ */
+ for_each_possible_cpu(cpu) {
+ sdp = per_cpu_ptr(ssp->sda, cpu);
+ spin_lock_init(&ACCESS_PRIVATE(sdp, lock));
+ rcu_segcblist_init(&sdp->srcu_cblist);
+ sdp->srcu_cblist_invoking = false;
+ sdp->srcu_gp_seq_needed = ssp->srcu_sup->srcu_gp_seq;
+ sdp->srcu_gp_seq_needed_exp = ssp->srcu_sup->srcu_gp_seq;
+ sdp->srcu_barrier_head.next = &sdp->srcu_barrier_head;
+ sdp->mynode = NULL;
+ sdp->cpu = cpu;
+ INIT_WORK(&sdp->work, srcu_invoke_callbacks);
+ timer_setup(&sdp->delay_work, srcu_delay_timer, 0);
+ sdp->ssp = ssp;
+ }
+}
+
+/* Invalid seq state, used during snp node initialization */
+#define SRCU_SNP_INIT_SEQ 0x2
+
+/*
+ * Check whether sequence number corresponding to snp node,
+ * is invalid.
+ */
+static inline bool srcu_invl_snp_seq(unsigned long s)
+{
+ return s == SRCU_SNP_INIT_SEQ;
+}
+
+/*
+ * Allocated and initialize SRCU combining tree. Returns @true if
+ * allocation succeeded and @false otherwise.
+ */
+static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags)
+{
+ int cpu;
+ int i;
+ int level = 0;
+ int levelspread[RCU_NUM_LVLS];
+ struct srcu_data *sdp;
+ struct srcu_node *snp;
+ struct srcu_node *snp_first;
+
+ /* Initialize geometry if it has not already been initialized. */
+ rcu_init_geometry();
+ ssp->srcu_sup->node = kcalloc(rcu_num_nodes, sizeof(*ssp->srcu_sup->node), gfp_flags);
+ if (!ssp->srcu_sup->node)
+ return false;
+
+ /* Work out the overall tree geometry. */
+ ssp->srcu_sup->level[0] = &ssp->srcu_sup->node[0];
+ for (i = 1; i < rcu_num_lvls; i++)
+ ssp->srcu_sup->level[i] = ssp->srcu_sup->level[i - 1] + num_rcu_lvl[i - 1];
+ rcu_init_levelspread(levelspread, num_rcu_lvl);
+
+ /* Each pass through this loop initializes one srcu_node structure. */
+ srcu_for_each_node_breadth_first(ssp, snp) {
+ spin_lock_init(&ACCESS_PRIVATE(snp, lock));
+ BUILD_BUG_ON(ARRAY_SIZE(snp->srcu_have_cbs) !=
+ ARRAY_SIZE(snp->srcu_data_have_cbs));
+ for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) {
+ snp->srcu_have_cbs[i] = SRCU_SNP_INIT_SEQ;
+ snp->srcu_data_have_cbs[i] = 0;
+ }
+ snp->srcu_gp_seq_needed_exp = SRCU_SNP_INIT_SEQ;
+ snp->grplo = -1;
+ snp->grphi = -1;
+ if (snp == &ssp->srcu_sup->node[0]) {
+ /* Root node, special case. */
+ snp->srcu_parent = NULL;
+ continue;
+ }
+
+ /* Non-root node. */
+ if (snp == ssp->srcu_sup->level[level + 1])
+ level++;
+ snp->srcu_parent = ssp->srcu_sup->level[level - 1] +
+ (snp - ssp->srcu_sup->level[level]) /
+ levelspread[level - 1];
+ }
+
+ /*
+ * Initialize the per-CPU srcu_data array, which feeds into the
+ * leaves of the srcu_node tree.
+ */
+ level = rcu_num_lvls - 1;
+ snp_first = ssp->srcu_sup->level[level];
+ for_each_possible_cpu(cpu) {
+ sdp = per_cpu_ptr(ssp->sda, cpu);
+ sdp->mynode = &snp_first[cpu / levelspread[level]];
+ for (snp = sdp->mynode; snp != NULL; snp = snp->srcu_parent) {
+ if (snp->grplo < 0)
+ snp->grplo = cpu;
+ snp->grphi = cpu;
+ }
+ sdp->grpmask = 1UL << (cpu - sdp->mynode->grplo);
+ }
+ smp_store_release(&ssp->srcu_sup->srcu_size_state, SRCU_SIZE_WAIT_BARRIER);
+ return true;
+}
+
+/*
+ * Initialize non-compile-time initialized fields, including the
+ * associated srcu_node and srcu_data structures. The is_static parameter
+ * tells us that ->sda has already been wired up to srcu_data.
+ */
+static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
+{
+ if (!is_static)
+ ssp->srcu_sup = kzalloc(sizeof(*ssp->srcu_sup), GFP_KERNEL);
+ if (!ssp->srcu_sup)
+ return -ENOMEM;
+ if (!is_static)
+ spin_lock_init(&ACCESS_PRIVATE(ssp->srcu_sup, lock));
+ ssp->srcu_sup->srcu_size_state = SRCU_SIZE_SMALL;
+ ssp->srcu_sup->node = NULL;
+ mutex_init(&ssp->srcu_sup->srcu_cb_mutex);
+ mutex_init(&ssp->srcu_sup->srcu_gp_mutex);
+ ssp->srcu_sup->srcu_gp_seq = SRCU_GP_SEQ_INITIAL_VAL;
+ ssp->srcu_sup->srcu_barrier_seq = 0;
+ mutex_init(&ssp->srcu_sup->srcu_barrier_mutex);
+ atomic_set(&ssp->srcu_sup->srcu_barrier_cpu_cnt, 0);
+ INIT_DELAYED_WORK(&ssp->srcu_sup->work, process_srcu);
+ ssp->srcu_sup->sda_is_static = is_static;
+ if (!is_static) {
+ ssp->sda = alloc_percpu(struct srcu_data);
+ ssp->srcu_ctrp = &ssp->sda->srcu_ctrs[0];
+ }
+ if (!ssp->sda)
+ goto err_free_sup;
+ init_srcu_struct_data(ssp);
+ ssp->srcu_sup->srcu_gp_seq_needed_exp = SRCU_GP_SEQ_INITIAL_VAL;
+ ssp->srcu_sup->srcu_last_gp_end = ktime_get_mono_fast_ns();
+ if (READ_ONCE(ssp->srcu_sup->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) {
+ if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC))
+ goto err_free_sda;
+ WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_BIG);
+ }
+ ssp->srcu_sup->srcu_ssp = ssp;
+ smp_store_release(&ssp->srcu_sup->srcu_gp_seq_needed,
+ SRCU_GP_SEQ_INITIAL_VAL); /* Init done. */
+ return 0;
+
+err_free_sda:
+ if (!is_static) {
+ free_percpu(ssp->sda);
+ ssp->sda = NULL;
+ }
+err_free_sup:
+ if (!is_static) {
+ kfree(ssp->srcu_sup);
+ ssp->srcu_sup = NULL;
+ }
+ return -ENOMEM;
+}
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+
+static int
+__init_srcu_struct_common(struct srcu_struct *ssp, const char *name, struct lock_class_key *key)
+{
+ /* Don't re-initialize a lock while it is held. */
+ debug_check_no_locks_freed((void *)ssp, sizeof(*ssp));
+ lockdep_init_map(&ssp->dep_map, name, key, 0);
+ return init_srcu_struct_fields(ssp, false);
+}
+
+int __init_srcu_struct(struct srcu_struct *ssp, const char *name, struct lock_class_key *key)
+{
+ ssp->srcu_reader_flavor = 0;
+ return __init_srcu_struct_common(ssp, name, key);
+}
+EXPORT_SYMBOL_GPL(__init_srcu_struct);
+
+int __init_srcu_struct_fast(struct srcu_struct *ssp, const char *name, struct lock_class_key *key)
+{
+ ssp->srcu_reader_flavor = SRCU_READ_FLAVOR_FAST;
+ return __init_srcu_struct_common(ssp, name, key);
+}
+EXPORT_SYMBOL_GPL(__init_srcu_struct_fast);
+
+int __init_srcu_struct_fast_updown(struct srcu_struct *ssp, const char *name,
+ struct lock_class_key *key)
+{
+ ssp->srcu_reader_flavor = SRCU_READ_FLAVOR_FAST_UPDOWN;
+ return __init_srcu_struct_common(ssp, name, key);
+}
+EXPORT_SYMBOL_GPL(__init_srcu_struct_fast_updown);
+
+#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+
+/**
+ * init_srcu_struct - initialize a sleep-RCU structure
+ * @ssp: structure to initialize.
+ *
+ * Use this in place of DEFINE_SRCU() and DEFINE_STATIC_SRCU()
+ * for non-static srcu_struct structures that are to be passed to
+ * srcu_read_lock(), srcu_read_lock_nmisafe(), and friends. It is necessary
+ * to invoke this on a given srcu_struct before passing that srcu_struct
+ * to any other function. Each srcu_struct represents a separate domain
+ * of SRCU protection.
+ */
+int init_srcu_struct(struct srcu_struct *ssp)
+{
+ ssp->srcu_reader_flavor = 0;
+ return init_srcu_struct_fields(ssp, false);
+}
+EXPORT_SYMBOL_GPL(init_srcu_struct);
+
+/**
+ * init_srcu_struct_fast - initialize a fast-reader sleep-RCU structure
+ * @ssp: structure to initialize.
+ *
+ * Use this in place of DEFINE_SRCU_FAST() and DEFINE_STATIC_SRCU_FAST()
+ * for non-static srcu_struct structures that are to be passed to
+ * srcu_read_lock_fast() and friends. It is necessary to invoke this on a
+ * given srcu_struct before passing that srcu_struct to any other function.
+ * Each srcu_struct represents a separate domain of SRCU protection.
+ */
+int init_srcu_struct_fast(struct srcu_struct *ssp)
+{
+ ssp->srcu_reader_flavor = SRCU_READ_FLAVOR_FAST;
+ return init_srcu_struct_fields(ssp, false);
+}
+EXPORT_SYMBOL_GPL(init_srcu_struct_fast);
+
+/**
+ * init_srcu_struct_fast_updown - initialize a fast-reader up/down sleep-RCU structure
+ * @ssp: structure to initialize.
+ *
+ * Use this function in place of DEFINE_SRCU_FAST_UPDOWN() and
+ * DEFINE_STATIC_SRCU_FAST_UPDOWN() for non-static srcu_struct
+ * structures that are to be passed to srcu_read_lock_fast_updown(),
+ * srcu_down_read_fast(), and friends. It is necessary to invoke this on a
+ * given srcu_struct before passing that srcu_struct to any other function.
+ * Each srcu_struct represents a separate domain of SRCU protection.
+ */
+int init_srcu_struct_fast_updown(struct srcu_struct *ssp)
+{
+ ssp->srcu_reader_flavor = SRCU_READ_FLAVOR_FAST_UPDOWN;
+ return init_srcu_struct_fields(ssp, false);
+}
+EXPORT_SYMBOL_GPL(init_srcu_struct_fast_updown);
+
+#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+
+/*
+ * Initiate a transition to SRCU_SIZE_BIG with lock held.
+ */
+static void __srcu_transition_to_big(struct srcu_struct *ssp)
+{
+ lockdep_assert_held(&ACCESS_PRIVATE(ssp->srcu_sup, lock));
+ smp_store_release(&ssp->srcu_sup->srcu_size_state, SRCU_SIZE_ALLOC);
+}
+
+/*
+ * Initiate an idempotent transition to SRCU_SIZE_BIG.
+ */
+static void srcu_transition_to_big(struct srcu_struct *ssp)
+{
+ unsigned long flags;
+
+ /* Double-checked locking on ->srcu_size-state. */
+ if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) != SRCU_SIZE_SMALL)
+ return;
+ spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags);
+ if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) != SRCU_SIZE_SMALL) {
+ spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
+ return;
+ }
+ __srcu_transition_to_big(ssp);
+ spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
+}
+
+/*
+ * Check to see if the just-encountered contention event justifies
+ * a transition to SRCU_SIZE_BIG.
+ */
+static void spin_lock_irqsave_check_contention(struct srcu_struct *ssp)
+{
+ unsigned long j;
+
+ if (!SRCU_SIZING_IS_CONTEND() || ssp->srcu_sup->srcu_size_state)
+ return;
+ j = jiffies;
+ if (ssp->srcu_sup->srcu_size_jiffies != j) {
+ ssp->srcu_sup->srcu_size_jiffies = j;
+ ssp->srcu_sup->srcu_n_lock_retries = 0;
+ }
+ if (++ssp->srcu_sup->srcu_n_lock_retries <= small_contention_lim)
+ return;
+ __srcu_transition_to_big(ssp);
+}
+
+/*
+ * Acquire the specified srcu_data structure's ->lock, but check for
+ * excessive contention, which results in initiation of a transition
+ * to SRCU_SIZE_BIG. But only if the srcutree.convert_to_big module
+ * parameter permits this.
+ */
+static void spin_lock_irqsave_sdp_contention(struct srcu_data *sdp, unsigned long *flags)
+{
+ struct srcu_struct *ssp = sdp->ssp;
+
+ if (spin_trylock_irqsave_rcu_node(sdp, *flags))
+ return;
+ spin_lock_irqsave_rcu_node(ssp->srcu_sup, *flags);
+ spin_lock_irqsave_check_contention(ssp);
+ spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, *flags);
+ spin_lock_irqsave_rcu_node(sdp, *flags);
+}
+
+/*
+ * Acquire the specified srcu_struct structure's ->lock, but check for
+ * excessive contention, which results in initiation of a transition
+ * to SRCU_SIZE_BIG. But only if the srcutree.convert_to_big module
+ * parameter permits this.
+ */
+static void spin_lock_irqsave_ssp_contention(struct srcu_struct *ssp, unsigned long *flags)
+{
+ if (spin_trylock_irqsave_rcu_node(ssp->srcu_sup, *flags))
+ return;
+ spin_lock_irqsave_rcu_node(ssp->srcu_sup, *flags);
+ spin_lock_irqsave_check_contention(ssp);
+}
+
+/*
+ * First-use initialization of statically allocated srcu_struct
+ * structure. Wiring up the combining tree is more than can be
+ * done with compile-time initialization, so this check is added
+ * to each update-side SRCU primitive. Use ssp->lock, which -is-
+ * compile-time initialized, to resolve races involving multiple
+ * CPUs trying to garner first-use privileges.
+ */
+static void check_init_srcu_struct(struct srcu_struct *ssp)
+{
+ unsigned long flags;
+
+ /* The smp_load_acquire() pairs with the smp_store_release(). */
+ if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_sup->srcu_gp_seq_needed))) /*^^^*/
+ return; /* Already initialized. */
+ spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags);
+ if (!rcu_seq_state(ssp->srcu_sup->srcu_gp_seq_needed)) {
+ spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
+ return;
+ }
+ init_srcu_struct_fields(ssp, true);
+ spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
+}
+
+/*
+ * Is the current or any upcoming grace period to be expedited?
+ */
+static bool srcu_gp_is_expedited(struct srcu_struct *ssp)
+{
+ struct srcu_usage *sup = ssp->srcu_sup;
+
+ return ULONG_CMP_LT(READ_ONCE(sup->srcu_gp_seq), READ_ONCE(sup->srcu_gp_seq_needed_exp));
+}
+
+/*
+ * Computes approximate total of the readers' ->srcu_ctrs[].srcu_locks
+ * values for the rank of per-CPU counters specified by idx, and returns
+ * true if the caller did the proper barrier (gp), and if the count of
+ * the locks matches that of the unlocks passed in.
+ */
+static bool srcu_readers_lock_idx(struct srcu_struct *ssp, int idx, bool gp, unsigned long unlocks)
+{
+ int cpu;
+ unsigned long mask = 0;
+ unsigned long sum = 0;
+
+ for_each_possible_cpu(cpu) {
+ struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu);
+
+ sum += atomic_long_read(&sdp->srcu_ctrs[idx].srcu_locks);
+ if (IS_ENABLED(CONFIG_PROVE_RCU))
+ mask = mask | READ_ONCE(sdp->srcu_reader_flavor);
+ }
+ WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask - 1)),
+ "Mixed reader flavors for srcu_struct at %ps.\n", ssp);
+ if (mask & SRCU_READ_FLAVOR_SLOWGP && !gp)
+ return false;
+ return sum == unlocks;
+}
+
+/*
+ * Returns approximate total of the readers' ->srcu_ctrs[].srcu_unlocks
+ * values for the rank of per-CPU counters specified by idx.
+ */
+static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx, unsigned long *rdm)
+{
+ int cpu;
+ unsigned long mask = ssp->srcu_reader_flavor;
+ unsigned long sum = 0;
+
+ for_each_possible_cpu(cpu) {
+ struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu);
+
+ sum += atomic_long_read(&sdp->srcu_ctrs[idx].srcu_unlocks);
+ mask = mask | READ_ONCE(sdp->srcu_reader_flavor);
+ }
+ WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask - 1)),
+ "Mixed reader flavors for srcu_struct at %ps.\n", ssp);
+ *rdm = mask;
+ return sum;
+}
+
+/*
+ * Return true if the number of pre-existing readers is determined to
+ * be zero.
+ */
+static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx)
+{
+ bool did_gp;
+ unsigned long rdm;
+ unsigned long unlocks;
+
+ unlocks = srcu_readers_unlock_idx(ssp, idx, &rdm);
+ did_gp = !!(rdm & SRCU_READ_FLAVOR_SLOWGP);
+
+ /*
+ * Make sure that a lock is always counted if the corresponding
+ * unlock is counted. Needs to be a smp_mb() as the read side may
+ * contain a read from a variable that is written to before the
+ * synchronize_srcu() in the write side. In this case smp_mb()s
+ * A and B (or X and Y) act like the store buffering pattern.
+ *
+ * This smp_mb() also pairs with smp_mb() C (or, in the case of X,
+ * Z) to prevent accesses after the synchronize_srcu() from being
+ * executed before the grace period ends.
+ */
+ if (!did_gp)
+ smp_mb(); /* A */
+ else if (srcu_gp_is_expedited(ssp))
+ synchronize_rcu_expedited(); /* X */
+ else
+ synchronize_rcu(); /* X */
+
+ /*
+ * If the locks are the same as the unlocks, then there must have
+ * been no readers on this index at some point in this function.
+ * But there might be more readers, as a task might have read
+ * the current ->srcu_ctrp but not yet have incremented its CPU's
+ * ->srcu_ctrs[idx].srcu_locks counter. In fact, it is possible
+ * that most of the tasks have been preempted between fetching
+ * ->srcu_ctrp and incrementing ->srcu_ctrs[idx].srcu_locks. And
+ * there could be almost (ULONG_MAX / sizeof(struct task_struct))
+ * tasks in a system whose address space was fully populated
+ * with memory. Call this quantity Nt.
+ *
+ * So suppose that the updater is preempted at this
+ * point in the code for a long time. That now-preempted
+ * updater has already flipped ->srcu_ctrp (possibly during
+ * the preceding grace period), done an smp_mb() (again,
+ * possibly during the preceding grace period), and summed up
+ * the ->srcu_ctrs[idx].srcu_unlocks counters. How many times
+ * can a given one of the aforementioned Nt tasks increment the
+ * old ->srcu_ctrp value's ->srcu_ctrs[idx].srcu_locks counter,
+ * in the absence of nesting?
+ *
+ * It can clearly do so once, given that it has already fetched
+ * the old value of ->srcu_ctrp and is just about to use that
+ * value to index its increment of ->srcu_ctrs[idx].srcu_locks.
+ * But as soon as it leaves that SRCU read-side critical section,
+ * it will increment ->srcu_ctrs[idx].srcu_unlocks, which must
+ * follow the updater's above read from that same value. Thus,
+ as soon the reading task does an smp_mb() and a later fetch from
+ * ->srcu_ctrp, that task will be guaranteed to get the new index.
+ * Except that the increment of ->srcu_ctrs[idx].srcu_unlocks
+ * in __srcu_read_unlock() is after the smp_mb(), and the fetch
+ * from ->srcu_ctrp in __srcu_read_lock() is before the smp_mb().
+ * Thus, that task might not see the new value of ->srcu_ctrp until
+ * the -second- __srcu_read_lock(), which in turn means that this
+ * task might well increment ->srcu_ctrs[idx].srcu_locks for the
+ * old value of ->srcu_ctrp twice, not just once.
+ *
+ * However, it is important to note that a given smp_mb() takes
+ * effect not just for the task executing it, but also for any
+ * later task running on that same CPU.
+ *
+ * That is, there can be almost Nt + Nc further increments
+ * of ->srcu_ctrs[idx].srcu_locks for the old index, where Nc
+ * is the number of CPUs. But this is OK because the size of
+ * the task_struct structure limits the value of Nt and current
+ * systems limit Nc to a few thousand.
+ *
+ * OK, but what about nesting? This does impose a limit on
+ * nesting of half of the size of the task_struct structure
+ * (measured in bytes), which should be sufficient. A late 2022
+ * TREE01 rcutorture run reported this size to be no less than
+ * 9408 bytes, allowing up to 4704 levels of nesting, which is
+ * comfortably beyond excessive. Especially on 64-bit systems,
+ * which are unlikely to be configured with an address space fully
+ * populated with memory, at least not anytime soon.
+ */
+ return srcu_readers_lock_idx(ssp, idx, did_gp, unlocks);
+}
+
+/**
+ * srcu_readers_active - returns true if there are readers. and false
+ * otherwise
+ * @ssp: which srcu_struct to count active readers (holding srcu_read_lock).
+ *
+ * Note that this is not an atomic primitive, and can therefore suffer
+ * severe errors when invoked on an active srcu_struct. That said, it
+ * can be useful as an error check at cleanup time.
+ */
+static bool srcu_readers_active(struct srcu_struct *ssp)
+{
+ int cpu;
+ unsigned long sum = 0;
+
+ for_each_possible_cpu(cpu) {
+ struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu);
+
+ sum += atomic_long_read(&sdp->srcu_ctrs[0].srcu_locks);
+ sum += atomic_long_read(&sdp->srcu_ctrs[1].srcu_locks);
+ sum -= atomic_long_read(&sdp->srcu_ctrs[0].srcu_unlocks);
+ sum -= atomic_long_read(&sdp->srcu_ctrs[1].srcu_unlocks);
+ }
+ return sum;
+}
+
+/*
+ * We use an adaptive strategy for synchronize_srcu() and especially for
+ * synchronize_srcu_expedited(). We spin for a fixed time period
+ * (defined below, boot time configurable) to allow SRCU readers to exit
+ * their read-side critical sections. If there are still some readers
+ * after one jiffy, we repeatedly block for one jiffy time periods.
+ * The blocking time is increased as the grace-period age increases,
+ * with max blocking time capped at 10 jiffies.
+ */
+#define SRCU_DEFAULT_RETRY_CHECK_DELAY 5
+
+static ulong srcu_retry_check_delay = SRCU_DEFAULT_RETRY_CHECK_DELAY;
+module_param(srcu_retry_check_delay, ulong, 0444);
+
+#define SRCU_INTERVAL 1 // Base delay if no expedited GPs pending.
+#define SRCU_MAX_INTERVAL 10 // Maximum incremental delay from slow readers.
+
+#define SRCU_DEFAULT_MAX_NODELAY_PHASE_LO 3UL // Lowmark on default per-GP-phase
+ // no-delay instances.
+#define SRCU_DEFAULT_MAX_NODELAY_PHASE_HI 1000UL // Highmark on default per-GP-phase
+ // no-delay instances.
+
+#define SRCU_UL_CLAMP_LO(val, low) ((val) > (low) ? (val) : (low))
+#define SRCU_UL_CLAMP_HI(val, high) ((val) < (high) ? (val) : (high))
+#define SRCU_UL_CLAMP(val, low, high) SRCU_UL_CLAMP_HI(SRCU_UL_CLAMP_LO((val), (low)), (high))
+// per-GP-phase no-delay instances adjusted to allow non-sleeping poll upto
+// one jiffies time duration. Mult by 2 is done to factor in the srcu_get_delay()
+// called from process_srcu().
+#define SRCU_DEFAULT_MAX_NODELAY_PHASE_ADJUSTED \
+ (2UL * USEC_PER_SEC / HZ / SRCU_DEFAULT_RETRY_CHECK_DELAY)
+
+// Maximum per-GP-phase consecutive no-delay instances.
+#define SRCU_DEFAULT_MAX_NODELAY_PHASE \
+ SRCU_UL_CLAMP(SRCU_DEFAULT_MAX_NODELAY_PHASE_ADJUSTED, \
+ SRCU_DEFAULT_MAX_NODELAY_PHASE_LO, \
+ SRCU_DEFAULT_MAX_NODELAY_PHASE_HI)
+
+static ulong srcu_max_nodelay_phase = SRCU_DEFAULT_MAX_NODELAY_PHASE;
+module_param(srcu_max_nodelay_phase, ulong, 0444);
+
+// Maximum consecutive no-delay instances.
+#define SRCU_DEFAULT_MAX_NODELAY (SRCU_DEFAULT_MAX_NODELAY_PHASE > 100 ? \
+ SRCU_DEFAULT_MAX_NODELAY_PHASE : 100)
+
+static ulong srcu_max_nodelay = SRCU_DEFAULT_MAX_NODELAY;
+module_param(srcu_max_nodelay, ulong, 0444);
+
+/*
+ * Return grace-period delay, zero if there are expedited grace
+ * periods pending, SRCU_INTERVAL otherwise.
+ */
+static unsigned long srcu_get_delay(struct srcu_struct *ssp)
+{
+ unsigned long gpstart;
+ unsigned long j;
+ unsigned long jbase = SRCU_INTERVAL;
+ struct srcu_usage *sup = ssp->srcu_sup;
+
+ lockdep_assert_held(&ACCESS_PRIVATE(ssp->srcu_sup, lock));
+ if (srcu_gp_is_expedited(ssp))
+ jbase = 0;
+ if (rcu_seq_state(READ_ONCE(sup->srcu_gp_seq))) {
+ j = jiffies - 1;
+ gpstart = READ_ONCE(sup->srcu_gp_start);
+ if (time_after(j, gpstart))
+ jbase += j - gpstart;
+ if (!jbase) {
+ ASSERT_EXCLUSIVE_WRITER(sup->srcu_n_exp_nodelay);
+ WRITE_ONCE(sup->srcu_n_exp_nodelay, READ_ONCE(sup->srcu_n_exp_nodelay) + 1);
+ if (READ_ONCE(sup->srcu_n_exp_nodelay) > srcu_max_nodelay_phase)
+ jbase = 1;
+ }
+ }
+ return jbase > SRCU_MAX_INTERVAL ? SRCU_MAX_INTERVAL : jbase;
+}
+
+/**
+ * cleanup_srcu_struct - deconstruct a sleep-RCU structure
+ * @ssp: structure to clean up.
+ *
+ * Must invoke this after you are finished using a given srcu_struct that
+ * was initialized via init_srcu_struct(), else you leak memory.
+ */
+void cleanup_srcu_struct(struct srcu_struct *ssp)
+{
+ int cpu;
+ unsigned long delay;
+ struct srcu_usage *sup = ssp->srcu_sup;
+
+ spin_lock_irq_rcu_node(ssp->srcu_sup);
+ delay = srcu_get_delay(ssp);
+ spin_unlock_irq_rcu_node(ssp->srcu_sup);
+ if (WARN_ON(!delay))
+ return; /* Just leak it! */
+ if (WARN_ON(srcu_readers_active(ssp)))
+ return; /* Just leak it! */
+ flush_delayed_work(&sup->work);
+ for_each_possible_cpu(cpu) {
+ struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu);
+
+ timer_delete_sync(&sdp->delay_work);
+ flush_work(&sdp->work);
+ if (WARN_ON(rcu_segcblist_n_cbs(&sdp->srcu_cblist)))
+ return; /* Forgot srcu_barrier(), so just leak it! */
+ }
+ if (WARN_ON(rcu_seq_state(READ_ONCE(sup->srcu_gp_seq)) != SRCU_STATE_IDLE) ||
+ WARN_ON(rcu_seq_current(&sup->srcu_gp_seq) != sup->srcu_gp_seq_needed) ||
+ WARN_ON(srcu_readers_active(ssp))) {
+ pr_info("%s: Active srcu_struct %p read state: %d gp state: %lu/%lu\n",
+ __func__, ssp, rcu_seq_state(READ_ONCE(sup->srcu_gp_seq)),
+ rcu_seq_current(&sup->srcu_gp_seq), sup->srcu_gp_seq_needed);
+ return; // Caller forgot to stop doing call_srcu()?
+ // Or caller invoked start_poll_synchronize_srcu()
+ // and then cleanup_srcu_struct() before that grace
+ // period ended?
+ }
+ kfree(sup->node);
+ sup->node = NULL;
+ sup->srcu_size_state = SRCU_SIZE_SMALL;
+ if (!sup->sda_is_static) {
+ free_percpu(ssp->sda);
+ ssp->sda = NULL;
+ kfree(sup);
+ ssp->srcu_sup = NULL;
+ }
+}
+EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
+
+/*
+ * Check for consistent reader flavor.
+ */
+void __srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor)
+{
+ int old_read_flavor;
+ struct srcu_data *sdp;
+
+ /* NMI-unsafe use in NMI is a bad sign, as is multi-bit read_flavor values. */
+ WARN_ON_ONCE((read_flavor != SRCU_READ_FLAVOR_NMI) && in_nmi());
+ WARN_ON_ONCE(read_flavor & (read_flavor - 1));
+
+ sdp = raw_cpu_ptr(ssp->sda);
+ old_read_flavor = READ_ONCE(sdp->srcu_reader_flavor);
+ WARN_ON_ONCE(ssp->srcu_reader_flavor && read_flavor != ssp->srcu_reader_flavor);
+ WARN_ON_ONCE(old_read_flavor && ssp->srcu_reader_flavor &&
+ old_read_flavor != ssp->srcu_reader_flavor);
+ WARN_ON_ONCE(read_flavor == SRCU_READ_FLAVOR_FAST && !ssp->srcu_reader_flavor);
+ if (!old_read_flavor) {
+ old_read_flavor = cmpxchg(&sdp->srcu_reader_flavor, 0, read_flavor);
+ if (!old_read_flavor)
+ return;
+ }
+ WARN_ONCE(old_read_flavor != read_flavor, "CPU %d old state %d new state %d\n", sdp->cpu, old_read_flavor, read_flavor);
+}
+EXPORT_SYMBOL_GPL(__srcu_check_read_flavor);
+
+/*
+ * Counts the new reader in the appropriate per-CPU element of the
+ * srcu_struct.
+ * Returns a guaranteed non-negative index that must be passed to the
+ * matching __srcu_read_unlock().
+ */
+int __srcu_read_lock(struct srcu_struct *ssp)
+{
+ struct srcu_ctr __percpu *scp = READ_ONCE(ssp->srcu_ctrp);
+
+ this_cpu_inc(scp->srcu_locks.counter);
+ smp_mb(); /* B */ /* Avoid leaking the critical section. */
+ return __srcu_ptr_to_ctr(ssp, scp);
+}
+EXPORT_SYMBOL_GPL(__srcu_read_lock);
+
+/*
+ * Removes the count for the old reader from the appropriate per-CPU
+ * element of the srcu_struct. Note that this may well be a different
+ * CPU than that which was incremented by the corresponding srcu_read_lock().
+ */
+void __srcu_read_unlock(struct srcu_struct *ssp, int idx)
+{
+ smp_mb(); /* C */ /* Avoid leaking the critical section. */
+ this_cpu_inc(__srcu_ctr_to_ptr(ssp, idx)->srcu_unlocks.counter);
+}
+EXPORT_SYMBOL_GPL(__srcu_read_unlock);
+
+#ifdef CONFIG_NEED_SRCU_NMI_SAFE
+
+/*
+ * Counts the new reader in the appropriate per-CPU element of the
+ * srcu_struct, but in an NMI-safe manner using RMW atomics.
+ * Returns an index that must be passed to the matching srcu_read_unlock().
+ */
+int __srcu_read_lock_nmisafe(struct srcu_struct *ssp)
+{
+ struct srcu_ctr __percpu *scpp = READ_ONCE(ssp->srcu_ctrp);
+ struct srcu_ctr *scp = raw_cpu_ptr(scpp);
+
+ atomic_long_inc(&scp->srcu_locks);
+ smp_mb__after_atomic(); /* B */ /* Avoid leaking the critical section. */
+ return __srcu_ptr_to_ctr(ssp, scpp);
+}
+EXPORT_SYMBOL_GPL(__srcu_read_lock_nmisafe);
+
+/*
+ * Removes the count for the old reader from the appropriate per-CPU
+ * element of the srcu_struct. Note that this may well be a different
+ * CPU than that which was incremented by the corresponding srcu_read_lock().
+ */
+void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx)
+{
+ smp_mb__before_atomic(); /* C */ /* Avoid leaking the critical section. */
+ atomic_long_inc(&raw_cpu_ptr(__srcu_ctr_to_ptr(ssp, idx))->srcu_unlocks);
+}
+EXPORT_SYMBOL_GPL(__srcu_read_unlock_nmisafe);
+
+#endif // CONFIG_NEED_SRCU_NMI_SAFE
+
+/*
+ * Start an SRCU grace period.
+ */
+static void srcu_gp_start(struct srcu_struct *ssp)
+{
+ int state;
+
+ lockdep_assert_held(&ACCESS_PRIVATE(ssp->srcu_sup, lock));
+ WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed));
+ WRITE_ONCE(ssp->srcu_sup->srcu_gp_start, jiffies);
+ WRITE_ONCE(ssp->srcu_sup->srcu_n_exp_nodelay, 0);
+ smp_mb(); /* Order prior store to ->srcu_gp_seq_needed vs. GP start. */
+ rcu_seq_start(&ssp->srcu_sup->srcu_gp_seq);
+ state = rcu_seq_state(ssp->srcu_sup->srcu_gp_seq);
+ WARN_ON_ONCE(state != SRCU_STATE_SCAN1);
+}
+
+
+static void srcu_delay_timer(struct timer_list *t)
+{
+ struct srcu_data *sdp = container_of(t, struct srcu_data, delay_work);
+
+ queue_work_on(sdp->cpu, rcu_gp_wq, &sdp->work);
+}
+
+static void srcu_queue_delayed_work_on(struct srcu_data *sdp,
+ unsigned long delay)
+{
+ if (!delay) {
+ queue_work_on(sdp->cpu, rcu_gp_wq, &sdp->work);
+ return;
+ }
+
+ timer_reduce(&sdp->delay_work, jiffies + delay);
+}
+
+/*
+ * Schedule callback invocation for the specified srcu_data structure,
+ * if possible, on the corresponding CPU.
+ */
+static void srcu_schedule_cbs_sdp(struct srcu_data *sdp, unsigned long delay)
+{
+ srcu_queue_delayed_work_on(sdp, delay);
+}
+
+/*
+ * Schedule callback invocation for all srcu_data structures associated
+ * with the specified srcu_node structure that have callbacks for the
+ * just-completed grace period, the one corresponding to idx. If possible,
+ * schedule this invocation on the corresponding CPUs.
+ */
+static void srcu_schedule_cbs_snp(struct srcu_struct *ssp, struct srcu_node *snp,
+ unsigned long mask, unsigned long delay)
+{
+ int cpu;
+
+ for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) {
+ if (!(mask & (1UL << (cpu - snp->grplo))))
+ continue;
+ srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, cpu), delay);
+ }
+}
+
+/*
+ * Note the end of an SRCU grace period. Initiates callback invocation
+ * and starts a new grace period if needed.
+ *
+ * The ->srcu_cb_mutex acquisition does not protect any data, but
+ * instead prevents more than one grace period from starting while we
+ * are initiating callback invocation. This allows the ->srcu_have_cbs[]
+ * array to have a finite number of elements.
+ */
+static void srcu_gp_end(struct srcu_struct *ssp)
+{
+ unsigned long cbdelay = 1;
+ bool cbs;
+ bool last_lvl;
+ int cpu;
+ unsigned long gpseq;
+ int idx;
+ unsigned long mask;
+ struct srcu_data *sdp;
+ unsigned long sgsne;
+ struct srcu_node *snp;
+ int ss_state;
+ struct srcu_usage *sup = ssp->srcu_sup;
+
+ /* Prevent more than one additional grace period. */
+ mutex_lock(&sup->srcu_cb_mutex);
+
+ /* End the current grace period. */
+ spin_lock_irq_rcu_node(sup);
+ idx = rcu_seq_state(sup->srcu_gp_seq);
+ WARN_ON_ONCE(idx != SRCU_STATE_SCAN2);
+ if (srcu_gp_is_expedited(ssp))
+ cbdelay = 0;
+
+ WRITE_ONCE(sup->srcu_last_gp_end, ktime_get_mono_fast_ns());
+ rcu_seq_end(&sup->srcu_gp_seq);
+ gpseq = rcu_seq_current(&sup->srcu_gp_seq);
+ if (ULONG_CMP_LT(sup->srcu_gp_seq_needed_exp, gpseq))
+ WRITE_ONCE(sup->srcu_gp_seq_needed_exp, gpseq);
+ spin_unlock_irq_rcu_node(sup);
+ mutex_unlock(&sup->srcu_gp_mutex);
+ /* A new grace period can start at this point. But only one. */
+
+ /* Initiate callback invocation as needed. */
+ ss_state = smp_load_acquire(&sup->srcu_size_state);
+ if (ss_state < SRCU_SIZE_WAIT_BARRIER) {
+ srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, get_boot_cpu_id()),
+ cbdelay);
+ } else {
+ idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
+ srcu_for_each_node_breadth_first(ssp, snp) {
+ spin_lock_irq_rcu_node(snp);
+ cbs = false;
+ last_lvl = snp >= sup->level[rcu_num_lvls - 1];
+ if (last_lvl)
+ cbs = ss_state < SRCU_SIZE_BIG || snp->srcu_have_cbs[idx] == gpseq;
+ snp->srcu_have_cbs[idx] = gpseq;
+ rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1);
+ sgsne = snp->srcu_gp_seq_needed_exp;
+ if (srcu_invl_snp_seq(sgsne) || ULONG_CMP_LT(sgsne, gpseq))
+ WRITE_ONCE(snp->srcu_gp_seq_needed_exp, gpseq);
+ if (ss_state < SRCU_SIZE_BIG)
+ mask = ~0;
+ else
+ mask = snp->srcu_data_have_cbs[idx];
+ snp->srcu_data_have_cbs[idx] = 0;
+ spin_unlock_irq_rcu_node(snp);
+ if (cbs)
+ srcu_schedule_cbs_snp(ssp, snp, mask, cbdelay);
+ }
+ }
+
+ /* Occasionally prevent srcu_data counter wrap. */
+ if (!(gpseq & counter_wrap_check))
+ for_each_possible_cpu(cpu) {
+ sdp = per_cpu_ptr(ssp->sda, cpu);
+ spin_lock_irq_rcu_node(sdp);
+ if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed + 100))
+ sdp->srcu_gp_seq_needed = gpseq;
+ if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed_exp + 100))
+ sdp->srcu_gp_seq_needed_exp = gpseq;
+ spin_unlock_irq_rcu_node(sdp);
+ }
+
+ /* Callback initiation done, allow grace periods after next. */
+ mutex_unlock(&sup->srcu_cb_mutex);
+
+ /* Start a new grace period if needed. */
+ spin_lock_irq_rcu_node(sup);
+ gpseq = rcu_seq_current(&sup->srcu_gp_seq);
+ if (!rcu_seq_state(gpseq) &&
+ ULONG_CMP_LT(gpseq, sup->srcu_gp_seq_needed)) {
+ srcu_gp_start(ssp);
+ spin_unlock_irq_rcu_node(sup);
+ srcu_reschedule(ssp, 0);
+ } else {
+ spin_unlock_irq_rcu_node(sup);
+ }
+
+ /* Transition to big if needed. */
+ if (ss_state != SRCU_SIZE_SMALL && ss_state != SRCU_SIZE_BIG) {
+ if (ss_state == SRCU_SIZE_ALLOC)
+ init_srcu_struct_nodes(ssp, GFP_KERNEL);
+ else
+ smp_store_release(&sup->srcu_size_state, ss_state + 1);
+ }
+}
+
+/*
+ * Funnel-locking scheme to scalably mediate many concurrent expedited
+ * grace-period requests. This function is invoked for the first known
+ * expedited request for a grace period that has already been requested,
+ * but without expediting. To start a completely new grace period,
+ * whether expedited or not, use srcu_funnel_gp_start() instead.
+ */
+static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp,
+ unsigned long s)
+{
+ unsigned long flags;
+ unsigned long sgsne;
+
+ if (snp)
+ for (; snp != NULL; snp = snp->srcu_parent) {
+ sgsne = READ_ONCE(snp->srcu_gp_seq_needed_exp);
+ if (WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_sup->srcu_gp_seq, s)) ||
+ (!srcu_invl_snp_seq(sgsne) && ULONG_CMP_GE(sgsne, s)))
+ return;
+ spin_lock_irqsave_rcu_node(snp, flags);
+ sgsne = snp->srcu_gp_seq_needed_exp;
+ if (!srcu_invl_snp_seq(sgsne) && ULONG_CMP_GE(sgsne, s)) {
+ spin_unlock_irqrestore_rcu_node(snp, flags);
+ return;
+ }
+ WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
+ spin_unlock_irqrestore_rcu_node(snp, flags);
+ }
+ spin_lock_irqsave_ssp_contention(ssp, &flags);
+ if (ULONG_CMP_LT(ssp->srcu_sup->srcu_gp_seq_needed_exp, s))
+ WRITE_ONCE(ssp->srcu_sup->srcu_gp_seq_needed_exp, s);
+ spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
+}
+
+/*
+ * Funnel-locking scheme to scalably mediate many concurrent grace-period
+ * requests. The winner has to do the work of actually starting grace
+ * period s. Losers must either ensure that their desired grace-period
+ * number is recorded on at least their leaf srcu_node structure, or they
+ * must take steps to invoke their own callbacks.
+ *
+ * Note that this function also does the work of srcu_funnel_exp_start(),
+ * in some cases by directly invoking it.
+ *
+ * The srcu read lock should be hold around this function. And s is a seq snap
+ * after holding that lock.
+ */
+static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
+ unsigned long s, bool do_norm)
+{
+ unsigned long flags;
+ int idx = rcu_seq_ctr(s) % ARRAY_SIZE(sdp->mynode->srcu_have_cbs);
+ unsigned long sgsne;
+ struct srcu_node *snp;
+ struct srcu_node *snp_leaf;
+ unsigned long snp_seq;
+ struct srcu_usage *sup = ssp->srcu_sup;
+
+ /* Ensure that snp node tree is fully initialized before traversing it */
+ if (smp_load_acquire(&sup->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
+ snp_leaf = NULL;
+ else
+ snp_leaf = sdp->mynode;
+
+ if (snp_leaf)
+ /* Each pass through the loop does one level of the srcu_node tree. */
+ for (snp = snp_leaf; snp != NULL; snp = snp->srcu_parent) {
+ if (WARN_ON_ONCE(rcu_seq_done(&sup->srcu_gp_seq, s)) && snp != snp_leaf)
+ return; /* GP already done and CBs recorded. */
+ spin_lock_irqsave_rcu_node(snp, flags);
+ snp_seq = snp->srcu_have_cbs[idx];
+ if (!srcu_invl_snp_seq(snp_seq) && ULONG_CMP_GE(snp_seq, s)) {
+ if (snp == snp_leaf && snp_seq == s)
+ snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
+ spin_unlock_irqrestore_rcu_node(snp, flags);
+ if (snp == snp_leaf && snp_seq != s) {
+ srcu_schedule_cbs_sdp(sdp, do_norm ? SRCU_INTERVAL : 0);
+ return;
+ }
+ if (!do_norm)
+ srcu_funnel_exp_start(ssp, snp, s);
+ return;
+ }
+ snp->srcu_have_cbs[idx] = s;
+ if (snp == snp_leaf)
+ snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
+ sgsne = snp->srcu_gp_seq_needed_exp;
+ if (!do_norm && (srcu_invl_snp_seq(sgsne) || ULONG_CMP_LT(sgsne, s)))
+ WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
+ spin_unlock_irqrestore_rcu_node(snp, flags);
+ }
+
+ /* Top of tree, must ensure the grace period will be started. */
+ spin_lock_irqsave_ssp_contention(ssp, &flags);
+ if (ULONG_CMP_LT(sup->srcu_gp_seq_needed, s)) {
+ /*
+ * Record need for grace period s. Pair with load
+ * acquire setting up for initialization.
+ */
+ smp_store_release(&sup->srcu_gp_seq_needed, s); /*^^^*/
+ }
+ if (!do_norm && ULONG_CMP_LT(sup->srcu_gp_seq_needed_exp, s))
+ WRITE_ONCE(sup->srcu_gp_seq_needed_exp, s);
+
+ /* If grace period not already in progress, start it. */
+ if (!WARN_ON_ONCE(rcu_seq_done(&sup->srcu_gp_seq, s)) &&
+ rcu_seq_state(sup->srcu_gp_seq) == SRCU_STATE_IDLE) {
+ srcu_gp_start(ssp);
+
+ // And how can that list_add() in the "else" clause
+ // possibly be safe for concurrent execution? Well,
+ // it isn't. And it does not have to be. After all, it
+ // can only be executed during early boot when there is only
+ // the one boot CPU running with interrupts still disabled.
+ if (likely(srcu_init_done))
+ queue_delayed_work(rcu_gp_wq, &sup->work,
+ !!srcu_get_delay(ssp));
+ else if (list_empty(&sup->work.work.entry))
+ list_add(&sup->work.work.entry, &srcu_boot_list);
+ }
+ spin_unlock_irqrestore_rcu_node(sup, flags);
+}
+
+/*
+ * Wait until all readers counted by array index idx complete, but
+ * loop an additional time if there is an expedited grace period pending.
+ * The caller must ensure that ->srcu_ctrp is not changed while checking.
+ */
+static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount)
+{
+ unsigned long curdelay;
+
+ spin_lock_irq_rcu_node(ssp->srcu_sup);
+ curdelay = !srcu_get_delay(ssp);
+ spin_unlock_irq_rcu_node(ssp->srcu_sup);
+
+ for (;;) {
+ if (srcu_readers_active_idx_check(ssp, idx))
+ return true;
+ if ((--trycount + curdelay) <= 0)
+ return false;
+ udelay(srcu_retry_check_delay);
+ }
+}
+
+/*
+ * Increment the ->srcu_ctrp counter so that future SRCU readers will
+ * use the other rank of the ->srcu_(un)lock_count[] arrays. This allows
+ * us to wait for pre-existing readers in a starvation-free manner.
+ */
+static void srcu_flip(struct srcu_struct *ssp)
+{
+ /*
+ * Because the flip of ->srcu_ctrp is executed only if the
+ * preceding call to srcu_readers_active_idx_check() found that
+ * the ->srcu_ctrs[].srcu_unlocks and ->srcu_ctrs[].srcu_locks sums
+ * matched and because that summing uses atomic_long_read(),
+ * there is ordering due to a control dependency between that
+ * summing and the WRITE_ONCE() in this call to srcu_flip().
+ * This ordering ensures that if this updater saw a given reader's
+ * increment from __srcu_read_lock(), that reader was using a value
+ * of ->srcu_ctrp from before the previous call to srcu_flip(),
+ * which should be quite rare. This ordering thus helps forward
+ * progress because the grace period could otherwise be delayed
+ * by additional calls to __srcu_read_lock() using that old (soon
+ * to be new) value of ->srcu_ctrp.
+ *
+ * This sum-equality check and ordering also ensures that if
+ * a given call to __srcu_read_lock() uses the new value of
+ * ->srcu_ctrp, this updater's earlier scans cannot have seen
+ * that reader's increments, which is all to the good, because
+ * this grace period need not wait on that reader. After all,
+ * if those earlier scans had seen that reader, there would have
+ * been a sum mismatch and this code would not be reached.
+ *
+ * This means that the following smp_mb() is redundant, but
+ * it stays until either (1) Compilers learn about this sort of
+ * control dependency or (2) Some production workload running on
+ * a production system is unduly delayed by this slowpath smp_mb().
+ * Except for _lite() readers, where it is inoperative, which
+ * means that it is a good thing that it is redundant.
+ */
+ smp_mb(); /* E */ /* Pairs with B and C. */
+
+ WRITE_ONCE(ssp->srcu_ctrp,
+ &ssp->sda->srcu_ctrs[!(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0])]);
+
+ /*
+ * Ensure that if the updater misses an __srcu_read_unlock()
+ * increment, that task's __srcu_read_lock() following its next
+ * __srcu_read_lock() or __srcu_read_unlock() will see the above
+ * counter update. Note that both this memory barrier and the
+ * one in srcu_readers_active_idx_check() provide the guarantee
+ * for __srcu_read_lock().
+ *
+ * Note that this is a performance optimization, in which we spend
+ * an otherwise unnecessary smp_mb() in order to reduce the number
+ * of full per-CPU-variable scans in srcu_readers_lock_idx() and
+ * srcu_readers_unlock_idx(). But this performance optimization
+ * is not so optimal for SRCU-fast, where we would be spending
+ * not smp_mb(), but rather synchronize_rcu(). At the same time,
+ * the overhead of the smp_mb() is in the noise, so there is no
+ * point in omitting it in the SRCU-fast case. So the same code
+ * is executed either way.
+ */
+ smp_mb(); /* D */ /* Pairs with C. */
+}
+
+/*
+ * If SRCU is likely idle, in other words, the next SRCU grace period
+ * should be expedited, return true, otherwise return false. Except that
+ * in the presence of _lite() readers, always return false.
+ *
+ * Note that it is OK for several current from-idle requests for a new
+ * grace period from idle to specify expediting because they will all end
+ * up requesting the same grace period anyhow. So no loss.
+ *
+ * Note also that if any CPU (including the current one) is still invoking
+ * callbacks, this function will nevertheless say "idle". This is not
+ * ideal, but the overhead of checking all CPUs' callback lists is even
+ * less ideal, especially on large systems. Furthermore, the wakeup
+ * can happen before the callback is fully removed, so we have no choice
+ * but to accept this type of error.
+ *
+ * This function is also subject to counter-wrap errors, but let's face
+ * it, if this function was preempted for enough time for the counters
+ * to wrap, it really doesn't matter whether or not we expedite the grace
+ * period. The extra overhead of a needlessly expedited grace period is
+ * negligible when amortized over that time period, and the extra latency
+ * of a needlessly non-expedited grace period is similarly negligible.
+ */
+static bool srcu_should_expedite(struct srcu_struct *ssp)
+{
+ unsigned long curseq;
+ unsigned long flags;
+ struct srcu_data *sdp;
+ unsigned long t;
+ unsigned long tlast;
+
+ check_init_srcu_struct(ssp);
+ /* If _lite() readers, don't do unsolicited expediting. */
+ if (this_cpu_read(ssp->sda->srcu_reader_flavor) & SRCU_READ_FLAVOR_SLOWGP)
+ return false;
+ /* If the local srcu_data structure has callbacks, not idle. */
+ sdp = raw_cpu_ptr(ssp->sda);
+ spin_lock_irqsave_rcu_node(sdp, flags);
+ if (rcu_segcblist_pend_cbs(&sdp->srcu_cblist)) {
+ spin_unlock_irqrestore_rcu_node(sdp, flags);
+ return false; /* Callbacks already present, so not idle. */
+ }
+ spin_unlock_irqrestore_rcu_node(sdp, flags);
+
+ /*
+ * No local callbacks, so probabilistically probe global state.
+ * Exact information would require acquiring locks, which would
+ * kill scalability, hence the probabilistic nature of the probe.
+ */
+
+ /* First, see if enough time has passed since the last GP. */
+ t = ktime_get_mono_fast_ns();
+ tlast = READ_ONCE(ssp->srcu_sup->srcu_last_gp_end);
+ if (exp_holdoff == 0 ||
+ time_in_range_open(t, tlast, tlast + exp_holdoff))
+ return false; /* Too soon after last GP. */
+
+ /* Next, check for probable idleness. */
+ curseq = rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq);
+ smp_mb(); /* Order ->srcu_gp_seq with ->srcu_gp_seq_needed. */
+ if (ULONG_CMP_LT(curseq, READ_ONCE(ssp->srcu_sup->srcu_gp_seq_needed)))
+ return false; /* Grace period in progress, so not idle. */
+ smp_mb(); /* Order ->srcu_gp_seq with prior access. */
+ if (curseq != rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq))
+ return false; /* GP # changed, so not idle. */
+ return true; /* With reasonable probability, idle! */
+}
+
+/*
+ * SRCU callback function to leak a callback.
+ */
+static void srcu_leak_callback(struct rcu_head *rhp)
+{
+}
+
+/*
+ * Start an SRCU grace period, and also queue the callback if non-NULL.
+ */
+static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
+ struct rcu_head *rhp, bool do_norm)
+{
+ unsigned long flags;
+ int idx;
+ bool needexp = false;
+ bool needgp = false;
+ unsigned long s;
+ struct srcu_data *sdp;
+ struct srcu_node *sdp_mynode;
+ int ss_state;
+
+ check_init_srcu_struct(ssp);
+ /*
+ * While starting a new grace period, make sure we are in an
+ * SRCU read-side critical section so that the grace-period
+ * sequence number cannot wrap around in the meantime.
+ */
+ idx = __srcu_read_lock_nmisafe(ssp);
+ ss_state = smp_load_acquire(&ssp->srcu_sup->srcu_size_state);
+ if (ss_state < SRCU_SIZE_WAIT_CALL)
+ sdp = per_cpu_ptr(ssp->sda, get_boot_cpu_id());
+ else
+ sdp = raw_cpu_ptr(ssp->sda);
+ spin_lock_irqsave_sdp_contention(sdp, &flags);
+ if (rhp)
+ rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp);
+ /*
+ * It's crucial to capture the snapshot 's' for acceleration before
+ * reading the current gp_seq that is used for advancing. This is
+ * essential because if the acceleration snapshot is taken after a
+ * failed advancement attempt, there's a risk that a grace period may
+ * conclude and a new one may start in the interim. If the snapshot is
+ * captured after this sequence of events, the acceleration snapshot 's'
+ * could be excessively advanced, leading to acceleration failure.
+ * In such a scenario, an 'acceleration leak' can occur, where new
+ * callbacks become indefinitely stuck in the RCU_NEXT_TAIL segment.
+ * Also note that encountering advancing failures is a normal
+ * occurrence when the grace period for RCU_WAIT_TAIL is in progress.
+ *
+ * To see this, consider the following events which occur if
+ * rcu_seq_snap() were to be called after advance:
+ *
+ * 1) The RCU_WAIT_TAIL segment has callbacks (gp_num = X + 4) and the
+ * RCU_NEXT_READY_TAIL also has callbacks (gp_num = X + 8).
+ *
+ * 2) The grace period for RCU_WAIT_TAIL is seen as started but not
+ * completed so rcu_seq_current() returns X + SRCU_STATE_SCAN1.
+ *
+ * 3) This value is passed to rcu_segcblist_advance() which can't move
+ * any segment forward and fails.
+ *
+ * 4) srcu_gp_start_if_needed() still proceeds with callback acceleration.
+ * But then the call to rcu_seq_snap() observes the grace period for the
+ * RCU_WAIT_TAIL segment as completed and the subsequent one for the
+ * RCU_NEXT_READY_TAIL segment as started (ie: X + 4 + SRCU_STATE_SCAN1)
+ * so it returns a snapshot of the next grace period, which is X + 12.
+ *
+ * 5) The value of X + 12 is passed to rcu_segcblist_accelerate() but the
+ * freshly enqueued callback in RCU_NEXT_TAIL can't move to
+ * RCU_NEXT_READY_TAIL which already has callbacks for a previous grace
+ * period (gp_num = X + 8). So acceleration fails.
+ */
+ s = rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq);
+ if (rhp) {
+ rcu_segcblist_advance(&sdp->srcu_cblist,
+ rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq));
+ /*
+ * Acceleration can never fail because the base current gp_seq
+ * used for acceleration is <= the value of gp_seq used for
+ * advancing. This means that RCU_NEXT_TAIL segment will
+ * always be able to be emptied by the acceleration into the
+ * RCU_NEXT_READY_TAIL or RCU_WAIT_TAIL segments.
+ */
+ WARN_ON_ONCE(!rcu_segcblist_accelerate(&sdp->srcu_cblist, s));
+ }
+ if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) {
+ sdp->srcu_gp_seq_needed = s;
+ needgp = true;
+ }
+ if (!do_norm && ULONG_CMP_LT(sdp->srcu_gp_seq_needed_exp, s)) {
+ sdp->srcu_gp_seq_needed_exp = s;
+ needexp = true;
+ }
+ spin_unlock_irqrestore_rcu_node(sdp, flags);
+
+ /* Ensure that snp node tree is fully initialized before traversing it */
+ if (ss_state < SRCU_SIZE_WAIT_BARRIER)
+ sdp_mynode = NULL;
+ else
+ sdp_mynode = sdp->mynode;
+
+ if (needgp)
+ srcu_funnel_gp_start(ssp, sdp, s, do_norm);
+ else if (needexp)
+ srcu_funnel_exp_start(ssp, sdp_mynode, s);
+ __srcu_read_unlock_nmisafe(ssp, idx);
+ return s;
+}
+
+/*
+ * Enqueue an SRCU callback on the srcu_data structure associated with
+ * the current CPU and the specified srcu_struct structure, initiating
+ * grace-period processing if it is not already running.
+ *
+ * Note that all CPUs must agree that the grace period extended beyond
+ * all pre-existing SRCU read-side critical section. On systems with
+ * more than one CPU, this means that when "func()" is invoked, each CPU
+ * is guaranteed to have executed a full memory barrier since the end of
+ * its last corresponding SRCU read-side critical section whose beginning
+ * preceded the call to call_srcu(). It also means that each CPU executing
+ * an SRCU read-side critical section that continues beyond the start of
+ * "func()" must have executed a memory barrier after the call_srcu()
+ * but before the beginning of that SRCU read-side critical section.
+ * Note that these guarantees include CPUs that are offline, idle, or
+ * executing in user mode, as well as CPUs that are executing in the kernel.
+ *
+ * Furthermore, if CPU A invoked call_srcu() and CPU B invoked the
+ * resulting SRCU callback function "func()", then both CPU A and CPU
+ * B are guaranteed to execute a full memory barrier during the time
+ * interval between the call to call_srcu() and the invocation of "func()".
+ * This guarantee applies even if CPU A and CPU B are the same CPU (but
+ * again only if the system has more than one CPU).
+ *
+ * Of course, these guarantees apply only for invocations of call_srcu(),
+ * srcu_read_lock(), and srcu_read_unlock() that are all passed the same
+ * srcu_struct structure.
+ */
+static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
+ rcu_callback_t func, bool do_norm)
+{
+ if (debug_rcu_head_queue(rhp)) {
+ /* Probable double call_srcu(), so leak the callback. */
+ WRITE_ONCE(rhp->func, srcu_leak_callback);
+ WARN_ONCE(1, "call_srcu(): Leaked duplicate callback\n");
+ return;
+ }
+ rhp->func = func;
+ (void)srcu_gp_start_if_needed(ssp, rhp, do_norm);
+}
+
+/**
+ * call_srcu() - Queue a callback for invocation after an SRCU grace period
+ * @ssp: srcu_struct in queue the callback
+ * @rhp: structure to be used for queueing the SRCU callback.
+ * @func: function to be invoked after the SRCU grace period
+ *
+ * The callback function will be invoked some time after a full SRCU
+ * grace period elapses, in other words after all pre-existing SRCU
+ * read-side critical sections have completed. However, the callback
+ * function might well execute concurrently with other SRCU read-side
+ * critical sections that started after call_srcu() was invoked. SRCU
+ * read-side critical sections are delimited by srcu_read_lock() and
+ * srcu_read_unlock(), and may be nested.
+ *
+ * The callback will be invoked from process context, but with bh
+ * disabled. The callback function must therefore be fast and must
+ * not block.
+ *
+ * See the description of call_rcu() for more detailed information on
+ * memory ordering guarantees.
+ */
+void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
+ rcu_callback_t func)
+{
+ __call_srcu(ssp, rhp, func, true);
+}
+EXPORT_SYMBOL_GPL(call_srcu);
+
+/*
+ * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
+ */
+static void __synchronize_srcu(struct srcu_struct *ssp, bool do_norm)
+{
+ struct rcu_synchronize rcu;
+
+ srcu_lock_sync(&ssp->dep_map);
+
+ RCU_LOCKDEP_WARN(lockdep_is_held(ssp) ||
+ lock_is_held(&rcu_bh_lock_map) ||
+ lock_is_held(&rcu_lock_map) ||
+ lock_is_held(&rcu_sched_lock_map),
+ "Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section");
+
+ if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
+ return;
+ might_sleep();
+ check_init_srcu_struct(ssp);
+ init_completion(&rcu.completion);
+ init_rcu_head_on_stack(&rcu.head);
+ __call_srcu(ssp, &rcu.head, wakeme_after_rcu, do_norm);
+ wait_for_completion(&rcu.completion);
+ destroy_rcu_head_on_stack(&rcu.head);
+
+ /*
+ * Make sure that later code is ordered after the SRCU grace
+ * period. This pairs with the spin_lock_irq_rcu_node()
+ * in srcu_invoke_callbacks(). Unlike Tree RCU, this is needed
+ * because the current CPU might have been totally uninvolved with
+ * (and thus unordered against) that grace period.
+ */
+ smp_mb();
+}
+
+/**
+ * synchronize_srcu_expedited - Brute-force SRCU grace period
+ * @ssp: srcu_struct with which to synchronize.
+ *
+ * Wait for an SRCU grace period to elapse, but be more aggressive about
+ * spinning rather than blocking when waiting.
+ *
+ * Note that synchronize_srcu_expedited() has the same deadlock and
+ * memory-ordering properties as does synchronize_srcu().
+ */
+void synchronize_srcu_expedited(struct srcu_struct *ssp)
+{
+ __synchronize_srcu(ssp, rcu_gp_is_normal());
+}
+EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
+
+/**
+ * synchronize_srcu - wait for prior SRCU read-side critical-section completion
+ * @ssp: srcu_struct with which to synchronize.
+ *
+ * Wait for the count to drain to zero of both indexes. To avoid the
+ * possible starvation of synchronize_srcu(), it waits for the count of
+ * the index=!(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0]) to drain to zero
+ * at first, and then flip the ->srcu_ctrp and wait for the count of the
+ * other index.
+ *
+ * Can block; must be called from process context.
+ *
+ * Note that it is illegal to call synchronize_srcu() from the corresponding
+ * SRCU read-side critical section; doing so will result in deadlock.
+ * However, it is perfectly legal to call synchronize_srcu() on one
+ * srcu_struct from some other srcu_struct's read-side critical section,
+ * as long as the resulting graph of srcu_structs is acyclic.
+ *
+ * There are memory-ordering constraints implied by synchronize_srcu().
+ * On systems with more than one CPU, when synchronize_srcu() returns,
+ * each CPU is guaranteed to have executed a full memory barrier since
+ * the end of its last corresponding SRCU read-side critical section
+ * whose beginning preceded the call to synchronize_srcu(). In addition,
+ * each CPU having an SRCU read-side critical section that extends beyond
+ * the return from synchronize_srcu() is guaranteed to have executed a
+ * full memory barrier after the beginning of synchronize_srcu() and before
+ * the beginning of that SRCU read-side critical section. Note that these
+ * guarantees include CPUs that are offline, idle, or executing in user mode,
+ * as well as CPUs that are executing in the kernel.
+ *
+ * Furthermore, if CPU A invoked synchronize_srcu(), which returned
+ * to its caller on CPU B, then both CPU A and CPU B are guaranteed
+ * to have executed a full memory barrier during the execution of
+ * synchronize_srcu(). This guarantee applies even if CPU A and CPU B
+ * are the same CPU, but again only if the system has more than one CPU.
+ *
+ * Of course, these memory-ordering guarantees apply only when
+ * synchronize_srcu(), srcu_read_lock(), and srcu_read_unlock() are
+ * passed the same srcu_struct structure.
+ *
+ * Implementation of these memory-ordering guarantees is similar to
+ * that of synchronize_rcu().
+ *
+ * If SRCU is likely idle as determined by srcu_should_expedite(),
+ * expedite the first request. This semantic was provided by Classic SRCU,
+ * and is relied upon by its users, so TREE SRCU must also provide it.
+ * Note that detecting idleness is heuristic and subject to both false
+ * positives and negatives.
+ */
+void synchronize_srcu(struct srcu_struct *ssp)
+{
+ if (srcu_should_expedite(ssp) || rcu_gp_is_expedited())
+ synchronize_srcu_expedited(ssp);
+ else
+ __synchronize_srcu(ssp, true);
+}
+EXPORT_SYMBOL_GPL(synchronize_srcu);
+
+/**
+ * get_state_synchronize_srcu - Provide an end-of-grace-period cookie
+ * @ssp: srcu_struct to provide cookie for.
+ *
+ * This function returns a cookie that can be passed to
+ * poll_state_synchronize_srcu(), which will return true if a full grace
+ * period has elapsed in the meantime. It is the caller's responsibility
+ * to make sure that grace period happens, for example, by invoking
+ * call_srcu() after return from get_state_synchronize_srcu().
+ */
+unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp)
+{
+ // Any prior manipulation of SRCU-protected data must happen
+ // before the load from ->srcu_gp_seq.
+ smp_mb();
+ return rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq);
+}
+EXPORT_SYMBOL_GPL(get_state_synchronize_srcu);
+
+/**
+ * start_poll_synchronize_srcu - Provide cookie and start grace period
+ * @ssp: srcu_struct to provide cookie for.
+ *
+ * This function returns a cookie that can be passed to
+ * poll_state_synchronize_srcu(), which will return true if a full grace
+ * period has elapsed in the meantime. Unlike get_state_synchronize_srcu(),
+ * this function also ensures that any needed SRCU grace period will be
+ * started. This convenience does come at a cost in terms of CPU overhead.
+ */
+unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp)
+{
+ return srcu_gp_start_if_needed(ssp, NULL, true);
+}
+EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu);
+
+/**
+ * poll_state_synchronize_srcu - Has cookie's grace period ended?
+ * @ssp: srcu_struct to provide cookie for.
+ * @cookie: Return value from get_state_synchronize_srcu() or start_poll_synchronize_srcu().
+ *
+ * This function takes the cookie that was returned from either
+ * get_state_synchronize_srcu() or start_poll_synchronize_srcu(), and
+ * returns @true if an SRCU grace period elapsed since the time that the
+ * cookie was created.
+ *
+ * Because cookies are finite in size, wrapping/overflow is possible.
+ * This is more pronounced on 32-bit systems where cookies are 32 bits,
+ * where in theory wrapping could happen in about 14 hours assuming
+ * 25-microsecond expedited SRCU grace periods. However, a more likely
+ * overflow lower bound is on the order of 24 days in the case of
+ * one-millisecond SRCU grace periods. Of course, wrapping in a 64-bit
+ * system requires geologic timespans, as in more than seven million years
+ * even for expedited SRCU grace periods.
+ *
+ * Wrapping/overflow is much more of an issue for CONFIG_SMP=n systems
+ * that also have CONFIG_PREEMPTION=n, which selects Tiny SRCU. This uses
+ * a 16-bit cookie, which rcutorture routinely wraps in a matter of a
+ * few minutes. If this proves to be a problem, this counter will be
+ * expanded to the same size as for Tree SRCU.
+ */
+bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie)
+{
+ if (cookie != SRCU_GET_STATE_COMPLETED &&
+ !rcu_seq_done_exact(&ssp->srcu_sup->srcu_gp_seq, cookie))
+ return false;
+ // Ensure that the end of the SRCU grace period happens before
+ // any subsequent code that the caller might execute.
+ smp_mb(); // ^^^
+ return true;
+}
+EXPORT_SYMBOL_GPL(poll_state_synchronize_srcu);
+
+/*
+ * Callback function for srcu_barrier() use.
+ */
+static void srcu_barrier_cb(struct rcu_head *rhp)
+{
+ struct srcu_data *sdp;
+ struct srcu_struct *ssp;
+
+ rhp->next = rhp; // Mark the callback as having been invoked.
+ sdp = container_of(rhp, struct srcu_data, srcu_barrier_head);
+ ssp = sdp->ssp;
+ if (atomic_dec_and_test(&ssp->srcu_sup->srcu_barrier_cpu_cnt))
+ complete(&ssp->srcu_sup->srcu_barrier_completion);
+}
+
+/*
+ * Enqueue an srcu_barrier() callback on the specified srcu_data
+ * structure's ->cblist. but only if that ->cblist already has at least one
+ * callback enqueued. Note that if a CPU already has callbacks enqueue,
+ * it must have already registered the need for a future grace period,
+ * so all we need do is enqueue a callback that will use the same grace
+ * period as the last callback already in the queue.
+ */
+static void srcu_barrier_one_cpu(struct srcu_struct *ssp, struct srcu_data *sdp)
+{
+ spin_lock_irq_rcu_node(sdp);
+ atomic_inc(&ssp->srcu_sup->srcu_barrier_cpu_cnt);
+ sdp->srcu_barrier_head.func = srcu_barrier_cb;
+ debug_rcu_head_queue(&sdp->srcu_barrier_head);
+ if (!rcu_segcblist_entrain(&sdp->srcu_cblist,
+ &sdp->srcu_barrier_head)) {
+ debug_rcu_head_unqueue(&sdp->srcu_barrier_head);
+ atomic_dec(&ssp->srcu_sup->srcu_barrier_cpu_cnt);
+ }
+ spin_unlock_irq_rcu_node(sdp);
+}
+
+/**
+ * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete.
+ * @ssp: srcu_struct on which to wait for in-flight callbacks.
+ */
+void srcu_barrier(struct srcu_struct *ssp)
+{
+ int cpu;
+ int idx;
+ unsigned long s = rcu_seq_snap(&ssp->srcu_sup->srcu_barrier_seq);
+
+ check_init_srcu_struct(ssp);
+ mutex_lock(&ssp->srcu_sup->srcu_barrier_mutex);
+ if (rcu_seq_done(&ssp->srcu_sup->srcu_barrier_seq, s)) {
+ smp_mb(); /* Force ordering following return. */
+ mutex_unlock(&ssp->srcu_sup->srcu_barrier_mutex);
+ return; /* Someone else did our work for us. */
+ }
+ rcu_seq_start(&ssp->srcu_sup->srcu_barrier_seq);
+ init_completion(&ssp->srcu_sup->srcu_barrier_completion);
+
+ /* Initial count prevents reaching zero until all CBs are posted. */
+ atomic_set(&ssp->srcu_sup->srcu_barrier_cpu_cnt, 1);
+
+ idx = __srcu_read_lock_nmisafe(ssp);
+ if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
+ srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, get_boot_cpu_id()));
+ else
+ for_each_possible_cpu(cpu)
+ srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, cpu));
+ __srcu_read_unlock_nmisafe(ssp, idx);
+
+ /* Remove the initial count, at which point reaching zero can happen. */
+ if (atomic_dec_and_test(&ssp->srcu_sup->srcu_barrier_cpu_cnt))
+ complete(&ssp->srcu_sup->srcu_barrier_completion);
+ wait_for_completion(&ssp->srcu_sup->srcu_barrier_completion);
+
+ rcu_seq_end(&ssp->srcu_sup->srcu_barrier_seq);
+ mutex_unlock(&ssp->srcu_sup->srcu_barrier_mutex);
+}
+EXPORT_SYMBOL_GPL(srcu_barrier);
+
+/* Callback for srcu_expedite_current() usage. */
+static void srcu_expedite_current_cb(struct rcu_head *rhp)
+{
+ unsigned long flags;
+ bool needcb = false;
+ struct srcu_data *sdp = container_of(rhp, struct srcu_data, srcu_ec_head);
+
+ spin_lock_irqsave_sdp_contention(sdp, &flags);
+ if (sdp->srcu_ec_state == SRCU_EC_IDLE) {
+ WARN_ON_ONCE(1);
+ } else if (sdp->srcu_ec_state == SRCU_EC_PENDING) {
+ sdp->srcu_ec_state = SRCU_EC_IDLE;
+ } else {
+ WARN_ON_ONCE(sdp->srcu_ec_state != SRCU_EC_REPOST);
+ sdp->srcu_ec_state = SRCU_EC_PENDING;
+ needcb = true;
+ }
+ spin_unlock_irqrestore_rcu_node(sdp, flags);
+ // If needed, requeue ourselves as an expedited SRCU callback.
+ if (needcb)
+ __call_srcu(sdp->ssp, &sdp->srcu_ec_head, srcu_expedite_current_cb, false);
+}
+
+/**
+ * srcu_expedite_current - Expedite the current SRCU grace period
+ * @ssp: srcu_struct to expedite.
+ *
+ * Cause the current SRCU grace period to become expedited. The grace
+ * period following the current one might also be expedited. If there is
+ * no current grace period, one might be created. If the current grace
+ * period is currently sleeping, that sleep will complete before expediting
+ * will take effect.
+ */
+void srcu_expedite_current(struct srcu_struct *ssp)
+{
+ unsigned long flags;
+ bool needcb = false;
+ struct srcu_data *sdp;
+
+ migrate_disable();
+ sdp = this_cpu_ptr(ssp->sda);
+ spin_lock_irqsave_sdp_contention(sdp, &flags);
+ if (sdp->srcu_ec_state == SRCU_EC_IDLE) {
+ sdp->srcu_ec_state = SRCU_EC_PENDING;
+ needcb = true;
+ } else if (sdp->srcu_ec_state == SRCU_EC_PENDING) {
+ sdp->srcu_ec_state = SRCU_EC_REPOST;
+ } else {
+ WARN_ON_ONCE(sdp->srcu_ec_state != SRCU_EC_REPOST);
+ }
+ spin_unlock_irqrestore_rcu_node(sdp, flags);
+ // If needed, queue an expedited SRCU callback.
+ if (needcb)
+ __call_srcu(ssp, &sdp->srcu_ec_head, srcu_expedite_current_cb, false);
+ migrate_enable();
+}
+EXPORT_SYMBOL_GPL(srcu_expedite_current);
+
+/**
+ * srcu_batches_completed - return batches completed.
+ * @ssp: srcu_struct on which to report batch completion.
+ *
+ * Report the number of batches, correlated with, but not necessarily
+ * precisely the same as, the number of grace periods that have elapsed.
+ */
+unsigned long srcu_batches_completed(struct srcu_struct *ssp)
+{
+ return READ_ONCE(ssp->srcu_sup->srcu_gp_seq);
+}
+EXPORT_SYMBOL_GPL(srcu_batches_completed);
+
+/*
+ * Core SRCU state machine. Push state bits of ->srcu_gp_seq
+ * to SRCU_STATE_SCAN2, and invoke srcu_gp_end() when scan has
+ * completed in that state.
+ */
+static void srcu_advance_state(struct srcu_struct *ssp)
+{
+ int idx;
+
+ mutex_lock(&ssp->srcu_sup->srcu_gp_mutex);
+
+ /*
+ * Because readers might be delayed for an extended period after
+ * fetching ->srcu_ctrp for their index, at any point in time there
+ * might well be readers using both idx=0 and idx=1. We therefore
+ * need to wait for readers to clear from both index values before
+ * invoking a callback.
+ *
+ * The load-acquire ensures that we see the accesses performed
+ * by the prior grace period.
+ */
+ idx = rcu_seq_state(smp_load_acquire(&ssp->srcu_sup->srcu_gp_seq)); /* ^^^ */
+ if (idx == SRCU_STATE_IDLE) {
+ spin_lock_irq_rcu_node(ssp->srcu_sup);
+ if (ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed)) {
+ WARN_ON_ONCE(rcu_seq_state(ssp->srcu_sup->srcu_gp_seq));
+ spin_unlock_irq_rcu_node(ssp->srcu_sup);
+ mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex);
+ return;
+ }
+ idx = rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq));
+ if (idx == SRCU_STATE_IDLE)
+ srcu_gp_start(ssp);
+ spin_unlock_irq_rcu_node(ssp->srcu_sup);
+ if (idx != SRCU_STATE_IDLE) {
+ mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex);
+ return; /* Someone else started the grace period. */
+ }
+ }
+
+ if (rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)) == SRCU_STATE_SCAN1) {
+ idx = !(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0]);
+ if (!try_check_zero(ssp, idx, 1)) {
+ mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex);
+ return; /* readers present, retry later. */
+ }
+ srcu_flip(ssp);
+ spin_lock_irq_rcu_node(ssp->srcu_sup);
+ rcu_seq_set_state(&ssp->srcu_sup->srcu_gp_seq, SRCU_STATE_SCAN2);
+ ssp->srcu_sup->srcu_n_exp_nodelay = 0;
+ spin_unlock_irq_rcu_node(ssp->srcu_sup);
+ }
+
+ if (rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)) == SRCU_STATE_SCAN2) {
+
+ /*
+ * SRCU read-side critical sections are normally short,
+ * so check at least twice in quick succession after a flip.
+ */
+ idx = !(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0]);
+ if (!try_check_zero(ssp, idx, 2)) {
+ mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex);
+ return; /* readers present, retry later. */
+ }
+ ssp->srcu_sup->srcu_n_exp_nodelay = 0;
+ srcu_gp_end(ssp); /* Releases ->srcu_gp_mutex. */
+ }
+}
+
+/*
+ * Invoke a limited number of SRCU callbacks that have passed through
+ * their grace period. If there are more to do, SRCU will reschedule
+ * the workqueue. Note that needed memory barriers have been executed
+ * in this task's context by srcu_readers_active_idx_check().
+ */
+static void srcu_invoke_callbacks(struct work_struct *work)
+{
+ long len;
+ bool more;
+ struct rcu_cblist ready_cbs;
+ struct rcu_head *rhp;
+ struct srcu_data *sdp;
+ struct srcu_struct *ssp;
+
+ sdp = container_of(work, struct srcu_data, work);
+
+ ssp = sdp->ssp;
+ rcu_cblist_init(&ready_cbs);
+ spin_lock_irq_rcu_node(sdp);
+ WARN_ON_ONCE(!rcu_segcblist_segempty(&sdp->srcu_cblist, RCU_NEXT_TAIL));
+ rcu_segcblist_advance(&sdp->srcu_cblist,
+ rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq));
+ /*
+ * Although this function is theoretically re-entrant, concurrent
+ * callbacks invocation is disallowed to avoid executing an SRCU barrier
+ * too early.
+ */
+ if (sdp->srcu_cblist_invoking ||
+ !rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) {
+ spin_unlock_irq_rcu_node(sdp);
+ return; /* Someone else on the job or nothing to do. */
+ }
+
+ /* We are on the job! Extract and invoke ready callbacks. */
+ sdp->srcu_cblist_invoking = true;
+ rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs);
+ len = ready_cbs.len;
+ spin_unlock_irq_rcu_node(sdp);
+ rhp = rcu_cblist_dequeue(&ready_cbs);
+ for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) {
+ debug_rcu_head_unqueue(rhp);
+ debug_rcu_head_callback(rhp);
+ local_bh_disable();
+ rhp->func(rhp);
+ local_bh_enable();
+ }
+ WARN_ON_ONCE(ready_cbs.len);
+
+ /*
+ * Update counts, accelerate new callbacks, and if needed,
+ * schedule another round of callback invocation.
+ */
+ spin_lock_irq_rcu_node(sdp);
+ rcu_segcblist_add_len(&sdp->srcu_cblist, -len);
+ sdp->srcu_cblist_invoking = false;
+ more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist);
+ spin_unlock_irq_rcu_node(sdp);
+ /* An SRCU barrier or callbacks from previous nesting work pending */
+ if (more)
+ srcu_schedule_cbs_sdp(sdp, 0);
+}
+
+/*
+ * Finished one round of SRCU grace period. Start another if there are
+ * more SRCU callbacks queued, otherwise put SRCU into not-running state.
+ */
+static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay)
+{
+ bool pushgp = true;
+
+ spin_lock_irq_rcu_node(ssp->srcu_sup);
+ if (ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed)) {
+ if (!WARN_ON_ONCE(rcu_seq_state(ssp->srcu_sup->srcu_gp_seq))) {
+ /* All requests fulfilled, time to go idle. */
+ pushgp = false;
+ }
+ } else if (!rcu_seq_state(ssp->srcu_sup->srcu_gp_seq)) {
+ /* Outstanding request and no GP. Start one. */
+ srcu_gp_start(ssp);
+ }
+ spin_unlock_irq_rcu_node(ssp->srcu_sup);
+
+ if (pushgp)
+ queue_delayed_work(rcu_gp_wq, &ssp->srcu_sup->work, delay);
+}
+
+/*
+ * This is the work-queue function that handles SRCU grace periods.
+ */
+static void process_srcu(struct work_struct *work)
+{
+ unsigned long curdelay;
+ unsigned long j;
+ struct srcu_struct *ssp;
+ struct srcu_usage *sup;
+
+ sup = container_of(work, struct srcu_usage, work.work);
+ ssp = sup->srcu_ssp;
+
+ srcu_advance_state(ssp);
+ spin_lock_irq_rcu_node(ssp->srcu_sup);
+ curdelay = srcu_get_delay(ssp);
+ spin_unlock_irq_rcu_node(ssp->srcu_sup);
+ if (curdelay) {
+ WRITE_ONCE(sup->reschedule_count, 0);
+ } else {
+ j = jiffies;
+ if (READ_ONCE(sup->reschedule_jiffies) == j) {
+ ASSERT_EXCLUSIVE_WRITER(sup->reschedule_count);
+ WRITE_ONCE(sup->reschedule_count, READ_ONCE(sup->reschedule_count) + 1);
+ if (READ_ONCE(sup->reschedule_count) > srcu_max_nodelay)
+ curdelay = 1;
+ } else {
+ WRITE_ONCE(sup->reschedule_count, 1);
+ WRITE_ONCE(sup->reschedule_jiffies, j);
+ }
+ }
+ srcu_reschedule(ssp, curdelay);
+}
+
+void srcutorture_get_gp_data(struct srcu_struct *ssp, int *flags,
+ unsigned long *gp_seq)
+{
+ *flags = 0;
+ *gp_seq = rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq);
+}
+EXPORT_SYMBOL_GPL(srcutorture_get_gp_data);
+
+static const char * const srcu_size_state_name[] = {
+ "SRCU_SIZE_SMALL",
+ "SRCU_SIZE_ALLOC",
+ "SRCU_SIZE_WAIT_BARRIER",
+ "SRCU_SIZE_WAIT_CALL",
+ "SRCU_SIZE_WAIT_CBS1",
+ "SRCU_SIZE_WAIT_CBS2",
+ "SRCU_SIZE_WAIT_CBS3",
+ "SRCU_SIZE_WAIT_CBS4",
+ "SRCU_SIZE_BIG",
+ "SRCU_SIZE_???",
+};
+
+void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf)
+{
+ int cpu;
+ int idx;
+ unsigned long s0 = 0, s1 = 0;
+ int ss_state = READ_ONCE(ssp->srcu_sup->srcu_size_state);
+ int ss_state_idx = ss_state;
+
+ idx = ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0];
+ if (ss_state < 0 || ss_state >= ARRAY_SIZE(srcu_size_state_name))
+ ss_state_idx = ARRAY_SIZE(srcu_size_state_name) - 1;
+ pr_alert("%s%s Tree SRCU g%ld state %d (%s)",
+ tt, tf, rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq), ss_state,
+ srcu_size_state_name[ss_state_idx]);
+ if (!ssp->sda) {
+ // Called after cleanup_srcu_struct(), perhaps.
+ pr_cont(" No per-CPU srcu_data structures (->sda == NULL).\n");
+ } else {
+ pr_cont(" per-CPU(idx=%d):", idx);
+ for_each_possible_cpu(cpu) {
+ unsigned long l0, l1;
+ unsigned long u0, u1;
+ long c0, c1;
+ struct srcu_data *sdp;
+
+ sdp = per_cpu_ptr(ssp->sda, cpu);
+ u0 = data_race(atomic_long_read(&sdp->srcu_ctrs[!idx].srcu_unlocks));
+ u1 = data_race(atomic_long_read(&sdp->srcu_ctrs[idx].srcu_unlocks));
+
+ /*
+ * Make sure that a lock is always counted if the corresponding
+ * unlock is counted.
+ */
+ smp_rmb();
+
+ l0 = data_race(atomic_long_read(&sdp->srcu_ctrs[!idx].srcu_locks));
+ l1 = data_race(atomic_long_read(&sdp->srcu_ctrs[idx].srcu_locks));
+
+ c0 = l0 - u0;
+ c1 = l1 - u1;
+ pr_cont(" %d(%ld,%ld %c)",
+ cpu, c0, c1,
+ "C."[rcu_segcblist_empty(&sdp->srcu_cblist)]);
+ s0 += c0;
+ s1 += c1;
+ }
+ pr_cont(" T(%ld,%ld)\n", s0, s1);
+ }
+ if (SRCU_SIZING_IS_TORTURE())
+ srcu_transition_to_big(ssp);
+}
+EXPORT_SYMBOL_GPL(srcu_torture_stats_print);
+
+static int __init srcu_bootup_announce(void)
+{
+ pr_info("Hierarchical SRCU implementation.\n");
+ if (exp_holdoff != DEFAULT_SRCU_EXP_HOLDOFF)
+ pr_info("\tNon-default auto-expedite holdoff of %lu ns.\n", exp_holdoff);
+ if (srcu_retry_check_delay != SRCU_DEFAULT_RETRY_CHECK_DELAY)
+ pr_info("\tNon-default retry check delay of %lu us.\n", srcu_retry_check_delay);
+ if (srcu_max_nodelay != SRCU_DEFAULT_MAX_NODELAY)
+ pr_info("\tNon-default max no-delay of %lu.\n", srcu_max_nodelay);
+ pr_info("\tMax phase no-delay instances is %lu.\n", srcu_max_nodelay_phase);
+ return 0;
+}
+early_initcall(srcu_bootup_announce);
+
+void __init srcu_init(void)
+{
+ struct srcu_usage *sup;
+
+ /* Decide on srcu_struct-size strategy. */
+ if (SRCU_SIZING_IS(SRCU_SIZING_AUTO)) {
+ if (nr_cpu_ids >= big_cpu_lim) {
+ convert_to_big = SRCU_SIZING_INIT; // Don't bother waiting for contention.
+ pr_info("%s: Setting srcu_struct sizes to big.\n", __func__);
+ } else {
+ convert_to_big = SRCU_SIZING_NONE | SRCU_SIZING_CONTEND;
+ pr_info("%s: Setting srcu_struct sizes based on contention.\n", __func__);
+ }
+ }
+
+ /*
+ * Once that is set, call_srcu() can follow the normal path and
+ * queue delayed work. This must follow RCU workqueues creation
+ * and timers initialization.
+ */
+ srcu_init_done = true;
+ while (!list_empty(&srcu_boot_list)) {
+ sup = list_first_entry(&srcu_boot_list, struct srcu_usage,
+ work.work.entry);
+ list_del_init(&sup->work.work.entry);
+ if (SRCU_SIZING_IS(SRCU_SIZING_INIT) &&
+ sup->srcu_size_state == SRCU_SIZE_SMALL)
+ sup->srcu_size_state = SRCU_SIZE_ALLOC;
+ queue_work(rcu_gp_wq, &sup->work.work);
+ }
+}
+
+#ifdef CONFIG_MODULES
+
+/* Initialize any global-scope srcu_struct structures used by this module. */
+static int srcu_module_coming(struct module *mod)
+{
+ int i;
+ struct srcu_struct *ssp;
+ struct srcu_struct **sspp = mod->srcu_struct_ptrs;
+
+ for (i = 0; i < mod->num_srcu_structs; i++) {
+ ssp = *(sspp++);
+ ssp->sda = alloc_percpu(struct srcu_data);
+ if (WARN_ON_ONCE(!ssp->sda))
+ return -ENOMEM;
+ ssp->srcu_ctrp = &ssp->sda->srcu_ctrs[0];
+ }
+ return 0;
+}
+
+/* Clean up any global-scope srcu_struct structures used by this module. */
+static void srcu_module_going(struct module *mod)
+{
+ int i;
+ struct srcu_struct *ssp;
+ struct srcu_struct **sspp = mod->srcu_struct_ptrs;
+
+ for (i = 0; i < mod->num_srcu_structs; i++) {
+ ssp = *(sspp++);
+ if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_sup->srcu_gp_seq_needed)) &&
+ !WARN_ON_ONCE(!ssp->srcu_sup->sda_is_static))
+ cleanup_srcu_struct(ssp);
+ if (!WARN_ON(srcu_readers_active(ssp)))
+ free_percpu(ssp->sda);
+ }
+}
+
+/* Handle one module, either coming or going. */
+static int srcu_module_notify(struct notifier_block *self,
+ unsigned long val, void *data)
+{
+ struct module *mod = data;
+ int ret = 0;
+
+ switch (val) {
+ case MODULE_STATE_COMING:
+ ret = srcu_module_coming(mod);
+ break;
+ case MODULE_STATE_GOING:
+ srcu_module_going(mod);
+ break;
+ default:
+ break;
+ }
+ return ret;
+}
+
+static struct notifier_block srcu_module_nb = {
+ .notifier_call = srcu_module_notify,
+ .priority = 0,
+};
+
+static __init int init_srcu_module_notifier(void)
+{
+ int ret;
+
+ ret = register_module_notifier(&srcu_module_nb);
+ if (ret)
+ pr_warn("Failed to register srcu module notifier\n");
+ return ret;
+}
+late_initcall(init_srcu_module_notifier);
+
+#endif /* #ifdef CONFIG_MODULES */
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
new file mode 100644
index 000000000000..da60a9947c00
--- /dev/null
+++ b/kernel/rcu/sync.c
@@ -0,0 +1,190 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * RCU-based infrastructure for lightweight reader-writer locking
+ *
+ * Copyright (c) 2015, Red Hat, Inc.
+ *
+ * Author: Oleg Nesterov <oleg@redhat.com>
+ */
+
+#include <linux/rcu_sync.h>
+#include <linux/sched.h>
+
+enum { GP_IDLE = 0, GP_ENTER, GP_PASSED, GP_EXIT, GP_REPLAY };
+
+#define rss_lock gp_wait.lock
+
+/**
+ * rcu_sync_init() - Initialize an rcu_sync structure
+ * @rsp: Pointer to rcu_sync structure to be initialized
+ */
+void rcu_sync_init(struct rcu_sync *rsp)
+{
+ memset(rsp, 0, sizeof(*rsp));
+ init_waitqueue_head(&rsp->gp_wait);
+}
+
+static void rcu_sync_func(struct rcu_head *rhp);
+
+static void rcu_sync_call(struct rcu_sync *rsp)
+{
+ call_rcu_hurry(&rsp->cb_head, rcu_sync_func);
+}
+
+/**
+ * rcu_sync_func() - Callback function managing reader access to fastpath
+ * @rhp: Pointer to rcu_head in rcu_sync structure to use for synchronization
+ *
+ * This function is passed to call_rcu() function by rcu_sync_enter() and
+ * rcu_sync_exit(), so that it is invoked after a grace period following the
+ * that invocation of enter/exit.
+ *
+ * If it is called by rcu_sync_enter() it signals that all the readers were
+ * switched onto slow path.
+ *
+ * If it is called by rcu_sync_exit() it takes action based on events that
+ * have taken place in the meantime, so that closely spaced rcu_sync_enter()
+ * and rcu_sync_exit() pairs need not wait for a grace period.
+ *
+ * If another rcu_sync_enter() is invoked before the grace period
+ * ended, reset state to allow the next rcu_sync_exit() to let the
+ * readers back onto their fastpaths (after a grace period). If both
+ * another rcu_sync_enter() and its matching rcu_sync_exit() are invoked
+ * before the grace period ended, re-invoke call_rcu() on behalf of that
+ * rcu_sync_exit(). Otherwise, set all state back to idle so that readers
+ * can again use their fastpaths.
+ */
+static void rcu_sync_func(struct rcu_head *rhp)
+{
+ struct rcu_sync *rsp = container_of(rhp, struct rcu_sync, cb_head);
+ unsigned long flags;
+
+ WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_IDLE);
+ WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_PASSED);
+
+ spin_lock_irqsave(&rsp->rss_lock, flags);
+ if (rsp->gp_count) {
+ /*
+ * We're at least a GP after the GP_IDLE->GP_ENTER transition.
+ */
+ WRITE_ONCE(rsp->gp_state, GP_PASSED);
+ wake_up_locked(&rsp->gp_wait);
+ } else if (rsp->gp_state == GP_REPLAY) {
+ /*
+ * A new rcu_sync_exit() has happened; requeue the callback to
+ * catch a later GP.
+ */
+ WRITE_ONCE(rsp->gp_state, GP_EXIT);
+ rcu_sync_call(rsp);
+ } else {
+ /*
+ * We're at least a GP after the last rcu_sync_exit(); everybody
+ * will now have observed the write side critical section.
+ * Let 'em rip!
+ */
+ WRITE_ONCE(rsp->gp_state, GP_IDLE);
+ }
+ spin_unlock_irqrestore(&rsp->rss_lock, flags);
+}
+
+/**
+ * rcu_sync_enter() - Force readers onto slowpath
+ * @rsp: Pointer to rcu_sync structure to use for synchronization
+ *
+ * This function is used by updaters who need readers to make use of
+ * a slowpath during the update. After this function returns, all
+ * subsequent calls to rcu_sync_is_idle() will return false, which
+ * tells readers to stay off their fastpaths. A later call to
+ * rcu_sync_exit() re-enables reader fastpaths.
+ *
+ * When called in isolation, rcu_sync_enter() must wait for a grace
+ * period, however, closely spaced calls to rcu_sync_enter() can
+ * optimize away the grace-period wait via a state machine implemented
+ * by rcu_sync_enter(), rcu_sync_exit(), and rcu_sync_func().
+ */
+void rcu_sync_enter(struct rcu_sync *rsp)
+{
+ int gp_state;
+
+ spin_lock_irq(&rsp->rss_lock);
+ gp_state = rsp->gp_state;
+ if (gp_state == GP_IDLE) {
+ WRITE_ONCE(rsp->gp_state, GP_ENTER);
+ WARN_ON_ONCE(rsp->gp_count);
+ /*
+ * Note that we could simply do rcu_sync_call(rsp) here and
+ * avoid the "if (gp_state == GP_IDLE)" block below.
+ *
+ * However, synchronize_rcu() can be faster if rcu_expedited
+ * or rcu_blocking_is_gp() is true.
+ *
+ * Another reason is that we can't wait for rcu callback if
+ * we are called at early boot time but this shouldn't happen.
+ */
+ }
+ rsp->gp_count++;
+ spin_unlock_irq(&rsp->rss_lock);
+
+ if (gp_state == GP_IDLE) {
+ /*
+ * See the comment above, this simply does the "synchronous"
+ * call_rcu(rcu_sync_func) which does GP_ENTER -> GP_PASSED.
+ */
+ synchronize_rcu();
+ rcu_sync_func(&rsp->cb_head);
+ /* Not really needed, wait_event() would see GP_PASSED. */
+ return;
+ }
+
+ wait_event(rsp->gp_wait, READ_ONCE(rsp->gp_state) >= GP_PASSED);
+}
+
+/**
+ * rcu_sync_exit() - Allow readers back onto fast path after grace period
+ * @rsp: Pointer to rcu_sync structure to use for synchronization
+ *
+ * This function is used by updaters who have completed, and can therefore
+ * now allow readers to make use of their fastpaths after a grace period
+ * has elapsed. After this grace period has completed, all subsequent
+ * calls to rcu_sync_is_idle() will return true, which tells readers that
+ * they can once again use their fastpaths.
+ */
+void rcu_sync_exit(struct rcu_sync *rsp)
+{
+ WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_IDLE);
+
+ spin_lock_irq(&rsp->rss_lock);
+ WARN_ON_ONCE(rsp->gp_count == 0);
+ if (!--rsp->gp_count) {
+ if (rsp->gp_state == GP_PASSED) {
+ WRITE_ONCE(rsp->gp_state, GP_EXIT);
+ rcu_sync_call(rsp);
+ } else if (rsp->gp_state == GP_EXIT) {
+ WRITE_ONCE(rsp->gp_state, GP_REPLAY);
+ }
+ }
+ spin_unlock_irq(&rsp->rss_lock);
+}
+
+/**
+ * rcu_sync_dtor() - Clean up an rcu_sync structure
+ * @rsp: Pointer to rcu_sync structure to be cleaned up
+ */
+void rcu_sync_dtor(struct rcu_sync *rsp)
+{
+ int gp_state;
+
+ WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_PASSED);
+
+ spin_lock_irq(&rsp->rss_lock);
+ WARN_ON_ONCE(rsp->gp_count);
+ if (rsp->gp_state == GP_REPLAY)
+ WRITE_ONCE(rsp->gp_state, GP_EXIT);
+ gp_state = rsp->gp_state;
+ spin_unlock_irq(&rsp->rss_lock);
+
+ if (gp_state != GP_IDLE) {
+ rcu_barrier();
+ WARN_ON_ONCE(rsp->gp_state != GP_IDLE);
+ }
+}
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
new file mode 100644
index 000000000000..2dc044fd126e
--- /dev/null
+++ b/kernel/rcu/tasks.h
@@ -0,0 +1,2283 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Task-based RCU implementations.
+ *
+ * Copyright (C) 2020 Paul E. McKenney
+ */
+
+#ifdef CONFIG_TASKS_RCU_GENERIC
+#include "rcu_segcblist.h"
+
+////////////////////////////////////////////////////////////////////////
+//
+// Generic data structures.
+
+struct rcu_tasks;
+typedef void (*rcu_tasks_gp_func_t)(struct rcu_tasks *rtp);
+typedef void (*pregp_func_t)(struct list_head *hop);
+typedef void (*pertask_func_t)(struct task_struct *t, struct list_head *hop);
+typedef void (*postscan_func_t)(struct list_head *hop);
+typedef void (*holdouts_func_t)(struct list_head *hop, bool ndrpt, bool *frptp);
+typedef void (*postgp_func_t)(struct rcu_tasks *rtp);
+
+/**
+ * struct rcu_tasks_percpu - Per-CPU component of definition for a Tasks-RCU-like mechanism.
+ * @cblist: Callback list.
+ * @lock: Lock protecting per-CPU callback list.
+ * @rtp_jiffies: Jiffies counter value for statistics.
+ * @lazy_timer: Timer to unlazify callbacks.
+ * @urgent_gp: Number of additional non-lazy grace periods.
+ * @rtp_n_lock_retries: Rough lock-contention statistic.
+ * @rtp_work: Work queue for invoking callbacks.
+ * @rtp_irq_work: IRQ work queue for deferred wakeups.
+ * @barrier_q_head: RCU callback for barrier operation.
+ * @rtp_blkd_tasks: List of tasks blocked as readers.
+ * @rtp_exit_list: List of tasks in the latter portion of do_exit().
+ * @cpu: CPU number corresponding to this entry.
+ * @index: Index of this CPU in rtpcp_array of the rcu_tasks structure.
+ * @rtpp: Pointer to the rcu_tasks structure.
+ */
+struct rcu_tasks_percpu {
+ struct rcu_segcblist cblist;
+ raw_spinlock_t __private lock;
+ unsigned long rtp_jiffies;
+ unsigned long rtp_n_lock_retries;
+ struct timer_list lazy_timer;
+ unsigned int urgent_gp;
+ struct work_struct rtp_work;
+ struct irq_work rtp_irq_work;
+ struct rcu_head barrier_q_head;
+ struct list_head rtp_blkd_tasks;
+ struct list_head rtp_exit_list;
+ int cpu;
+ int index;
+ struct rcu_tasks *rtpp;
+};
+
+/**
+ * struct rcu_tasks - Definition for a Tasks-RCU-like mechanism.
+ * @cbs_wait: RCU wait allowing a new callback to get kthread's attention.
+ * @cbs_gbl_lock: Lock protecting callback list.
+ * @tasks_gp_mutex: Mutex protecting grace period, needed during mid-boot dead zone.
+ * @gp_func: This flavor's grace-period-wait function.
+ * @gp_state: Grace period's most recent state transition (debugging).
+ * @gp_sleep: Per-grace-period sleep to prevent CPU-bound looping.
+ * @init_fract: Initial backoff sleep interval.
+ * @gp_jiffies: Time of last @gp_state transition.
+ * @gp_start: Most recent grace-period start in jiffies.
+ * @tasks_gp_seq: Number of grace periods completed since boot in upper bits.
+ * @n_ipis: Number of IPIs sent to encourage grace periods to end.
+ * @n_ipis_fails: Number of IPI-send failures.
+ * @kthread_ptr: This flavor's grace-period/callback-invocation kthread.
+ * @lazy_jiffies: Number of jiffies to allow callbacks to be lazy.
+ * @pregp_func: This flavor's pre-grace-period function (optional).
+ * @pertask_func: This flavor's per-task scan function (optional).
+ * @postscan_func: This flavor's post-task scan function (optional).
+ * @holdouts_func: This flavor's holdout-list scan function (optional).
+ * @postgp_func: This flavor's post-grace-period function (optional).
+ * @call_func: This flavor's call_rcu()-equivalent function.
+ * @wait_state: Task state for synchronous grace-period waits (default TASK_UNINTERRUPTIBLE).
+ * @rtpcpu: This flavor's rcu_tasks_percpu structure.
+ * @rtpcp_array: Array of pointers to rcu_tasks_percpu structure of CPUs in cpu_possible_mask.
+ * @percpu_enqueue_shift: Shift down CPU ID this much when enqueuing callbacks.
+ * @percpu_enqueue_lim: Number of per-CPU callback queues in use for enqueuing.
+ * @percpu_dequeue_lim: Number of per-CPU callback queues in use for dequeuing.
+ * @percpu_dequeue_gpseq: RCU grace-period number to propagate enqueue limit to dequeuers.
+ * @barrier_q_mutex: Serialize barrier operations.
+ * @barrier_q_count: Number of queues being waited on.
+ * @barrier_q_completion: Barrier wait/wakeup mechanism.
+ * @barrier_q_seq: Sequence number for barrier operations.
+ * @barrier_q_start: Most recent barrier start in jiffies.
+ * @name: This flavor's textual name.
+ * @kname: This flavor's kthread name.
+ */
+struct rcu_tasks {
+ struct rcuwait cbs_wait;
+ raw_spinlock_t cbs_gbl_lock;
+ struct mutex tasks_gp_mutex;
+ int gp_state;
+ int gp_sleep;
+ int init_fract;
+ unsigned long gp_jiffies;
+ unsigned long gp_start;
+ unsigned long tasks_gp_seq;
+ unsigned long n_ipis;
+ unsigned long n_ipis_fails;
+ struct task_struct *kthread_ptr;
+ unsigned long lazy_jiffies;
+ rcu_tasks_gp_func_t gp_func;
+ pregp_func_t pregp_func;
+ pertask_func_t pertask_func;
+ postscan_func_t postscan_func;
+ holdouts_func_t holdouts_func;
+ postgp_func_t postgp_func;
+ call_rcu_func_t call_func;
+ unsigned int wait_state;
+ struct rcu_tasks_percpu __percpu *rtpcpu;
+ struct rcu_tasks_percpu **rtpcp_array;
+ int percpu_enqueue_shift;
+ int percpu_enqueue_lim;
+ int percpu_dequeue_lim;
+ unsigned long percpu_dequeue_gpseq;
+ struct mutex barrier_q_mutex;
+ atomic_t barrier_q_count;
+ struct completion barrier_q_completion;
+ unsigned long barrier_q_seq;
+ unsigned long barrier_q_start;
+ char *name;
+ char *kname;
+};
+
+static void call_rcu_tasks_iw_wakeup(struct irq_work *iwp);
+
+#define DEFINE_RCU_TASKS(rt_name, gp, call, n) \
+static DEFINE_PER_CPU(struct rcu_tasks_percpu, rt_name ## __percpu) = { \
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name ## __percpu.cbs_pcpu_lock), \
+ .rtp_irq_work = IRQ_WORK_INIT_HARD(call_rcu_tasks_iw_wakeup), \
+}; \
+static struct rcu_tasks rt_name = \
+{ \
+ .cbs_wait = __RCUWAIT_INITIALIZER(rt_name.wait), \
+ .cbs_gbl_lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name.cbs_gbl_lock), \
+ .tasks_gp_mutex = __MUTEX_INITIALIZER(rt_name.tasks_gp_mutex), \
+ .gp_func = gp, \
+ .call_func = call, \
+ .wait_state = TASK_UNINTERRUPTIBLE, \
+ .rtpcpu = &rt_name ## __percpu, \
+ .lazy_jiffies = DIV_ROUND_UP(HZ, 4), \
+ .name = n, \
+ .percpu_enqueue_shift = order_base_2(CONFIG_NR_CPUS), \
+ .percpu_enqueue_lim = 1, \
+ .percpu_dequeue_lim = 1, \
+ .barrier_q_mutex = __MUTEX_INITIALIZER(rt_name.barrier_q_mutex), \
+ .barrier_q_seq = (0UL - 50UL) << RCU_SEQ_CTR_SHIFT, \
+ .kname = #rt_name, \
+}
+
+#ifdef CONFIG_TASKS_RCU
+
+/* Report delay of scan exiting tasklist in rcu_tasks_postscan(). */
+static void tasks_rcu_exit_srcu_stall(struct timer_list *unused);
+static DEFINE_TIMER(tasks_rcu_exit_srcu_stall_timer, tasks_rcu_exit_srcu_stall);
+#endif
+
+/* Avoid IPIing CPUs early in the grace period. */
+#define RCU_TASK_IPI_DELAY (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) ? HZ / 2 : 0)
+static int rcu_task_ipi_delay __read_mostly = RCU_TASK_IPI_DELAY;
+module_param(rcu_task_ipi_delay, int, 0644);
+
+/* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */
+#define RCU_TASK_BOOT_STALL_TIMEOUT (HZ * 30)
+#define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10)
+static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT;
+module_param(rcu_task_stall_timeout, int, 0644);
+#define RCU_TASK_STALL_INFO (HZ * 10)
+static int rcu_task_stall_info __read_mostly = RCU_TASK_STALL_INFO;
+module_param(rcu_task_stall_info, int, 0644);
+static int rcu_task_stall_info_mult __read_mostly = 3;
+module_param(rcu_task_stall_info_mult, int, 0444);
+
+static int rcu_task_enqueue_lim __read_mostly = -1;
+module_param(rcu_task_enqueue_lim, int, 0444);
+
+static bool rcu_task_cb_adjust;
+static int rcu_task_contend_lim __read_mostly = 100;
+module_param(rcu_task_contend_lim, int, 0444);
+static int rcu_task_collapse_lim __read_mostly = 10;
+module_param(rcu_task_collapse_lim, int, 0444);
+static int rcu_task_lazy_lim __read_mostly = 32;
+module_param(rcu_task_lazy_lim, int, 0444);
+
+static int rcu_task_cpu_ids;
+
+/* RCU tasks grace-period state for debugging. */
+#define RTGS_INIT 0
+#define RTGS_WAIT_WAIT_CBS 1
+#define RTGS_WAIT_GP 2
+#define RTGS_PRE_WAIT_GP 3
+#define RTGS_SCAN_TASKLIST 4
+#define RTGS_POST_SCAN_TASKLIST 5
+#define RTGS_WAIT_SCAN_HOLDOUTS 6
+#define RTGS_SCAN_HOLDOUTS 7
+#define RTGS_POST_GP 8
+#define RTGS_WAIT_READERS 9
+#define RTGS_INVOKE_CBS 10
+#define RTGS_WAIT_CBS 11
+#ifndef CONFIG_TINY_RCU
+static const char * const rcu_tasks_gp_state_names[] = {
+ "RTGS_INIT",
+ "RTGS_WAIT_WAIT_CBS",
+ "RTGS_WAIT_GP",
+ "RTGS_PRE_WAIT_GP",
+ "RTGS_SCAN_TASKLIST",
+ "RTGS_POST_SCAN_TASKLIST",
+ "RTGS_WAIT_SCAN_HOLDOUTS",
+ "RTGS_SCAN_HOLDOUTS",
+ "RTGS_POST_GP",
+ "RTGS_WAIT_READERS",
+ "RTGS_INVOKE_CBS",
+ "RTGS_WAIT_CBS",
+};
+#endif /* #ifndef CONFIG_TINY_RCU */
+
+////////////////////////////////////////////////////////////////////////
+//
+// Generic code.
+
+static void rcu_tasks_invoke_cbs_wq(struct work_struct *wp);
+
+/* Record grace-period phase and time. */
+static void set_tasks_gp_state(struct rcu_tasks *rtp, int newstate)
+{
+ rtp->gp_state = newstate;
+ rtp->gp_jiffies = jiffies;
+}
+
+#ifndef CONFIG_TINY_RCU
+/* Return state name. */
+static const char *tasks_gp_state_getname(struct rcu_tasks *rtp)
+{
+ int i = data_race(rtp->gp_state); // Let KCSAN detect update races
+ int j = READ_ONCE(i); // Prevent the compiler from reading twice
+
+ if (j >= ARRAY_SIZE(rcu_tasks_gp_state_names))
+ return "???";
+ return rcu_tasks_gp_state_names[j];
+}
+#endif /* #ifndef CONFIG_TINY_RCU */
+
+// Initialize per-CPU callback lists for the specified flavor of
+// Tasks RCU. Do not enqueue callbacks before this function is invoked.
+static void cblist_init_generic(struct rcu_tasks *rtp)
+{
+ int cpu;
+ int lim;
+ int shift;
+ int maxcpu;
+ int index = 0;
+
+ if (rcu_task_enqueue_lim < 0) {
+ rcu_task_enqueue_lim = 1;
+ rcu_task_cb_adjust = true;
+ } else if (rcu_task_enqueue_lim == 0) {
+ rcu_task_enqueue_lim = 1;
+ }
+ lim = rcu_task_enqueue_lim;
+
+ rtp->rtpcp_array = kcalloc(num_possible_cpus(), sizeof(struct rcu_tasks_percpu *), GFP_KERNEL);
+ BUG_ON(!rtp->rtpcp_array);
+
+ for_each_possible_cpu(cpu) {
+ struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
+
+ WARN_ON_ONCE(!rtpcp);
+ if (cpu)
+ raw_spin_lock_init(&ACCESS_PRIVATE(rtpcp, lock));
+ if (rcu_segcblist_empty(&rtpcp->cblist))
+ rcu_segcblist_init(&rtpcp->cblist);
+ INIT_WORK(&rtpcp->rtp_work, rcu_tasks_invoke_cbs_wq);
+ rtpcp->cpu = cpu;
+ rtpcp->rtpp = rtp;
+ rtpcp->index = index;
+ rtp->rtpcp_array[index] = rtpcp;
+ index++;
+ if (!rtpcp->rtp_blkd_tasks.next)
+ INIT_LIST_HEAD(&rtpcp->rtp_blkd_tasks);
+ if (!rtpcp->rtp_exit_list.next)
+ INIT_LIST_HEAD(&rtpcp->rtp_exit_list);
+ rtpcp->barrier_q_head.next = &rtpcp->barrier_q_head;
+ maxcpu = cpu;
+ }
+
+ rcu_task_cpu_ids = maxcpu + 1;
+ if (lim > rcu_task_cpu_ids)
+ lim = rcu_task_cpu_ids;
+ shift = ilog2(rcu_task_cpu_ids / lim);
+ if (((rcu_task_cpu_ids - 1) >> shift) >= lim)
+ shift++;
+ WRITE_ONCE(rtp->percpu_enqueue_shift, shift);
+ WRITE_ONCE(rtp->percpu_dequeue_lim, lim);
+ smp_store_release(&rtp->percpu_enqueue_lim, lim);
+
+ pr_info("%s: Setting shift to %d and lim to %d rcu_task_cb_adjust=%d rcu_task_cpu_ids=%d.\n",
+ rtp->name, data_race(rtp->percpu_enqueue_shift), data_race(rtp->percpu_enqueue_lim),
+ rcu_task_cb_adjust, rcu_task_cpu_ids);
+}
+
+// Compute wakeup time for lazy callback timer.
+static unsigned long rcu_tasks_lazy_time(struct rcu_tasks *rtp)
+{
+ return jiffies + rtp->lazy_jiffies;
+}
+
+// Timer handler that unlazifies lazy callbacks.
+static void call_rcu_tasks_generic_timer(struct timer_list *tlp)
+{
+ unsigned long flags;
+ bool needwake = false;
+ struct rcu_tasks *rtp;
+ struct rcu_tasks_percpu *rtpcp = timer_container_of(rtpcp, tlp,
+ lazy_timer);
+
+ rtp = rtpcp->rtpp;
+ raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+ if (!rcu_segcblist_empty(&rtpcp->cblist) && rtp->lazy_jiffies) {
+ if (!rtpcp->urgent_gp)
+ rtpcp->urgent_gp = 1;
+ needwake = true;
+ mod_timer(&rtpcp->lazy_timer, rcu_tasks_lazy_time(rtp));
+ }
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+ if (needwake)
+ rcuwait_wake_up(&rtp->cbs_wait);
+}
+
+// IRQ-work handler that does deferred wakeup for call_rcu_tasks_generic().
+static void call_rcu_tasks_iw_wakeup(struct irq_work *iwp)
+{
+ struct rcu_tasks *rtp;
+ struct rcu_tasks_percpu *rtpcp = container_of(iwp, struct rcu_tasks_percpu, rtp_irq_work);
+
+ rtp = rtpcp->rtpp;
+ rcuwait_wake_up(&rtp->cbs_wait);
+}
+
+// Enqueue a callback for the specified flavor of Tasks RCU.
+static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func,
+ struct rcu_tasks *rtp)
+{
+ int chosen_cpu;
+ unsigned long flags;
+ bool havekthread = smp_load_acquire(&rtp->kthread_ptr);
+ int ideal_cpu;
+ unsigned long j;
+ bool needadjust = false;
+ bool needwake;
+ struct rcu_tasks_percpu *rtpcp;
+
+ rhp->next = NULL;
+ rhp->func = func;
+ local_irq_save(flags);
+ rcu_read_lock();
+ ideal_cpu = smp_processor_id() >> READ_ONCE(rtp->percpu_enqueue_shift);
+ chosen_cpu = cpumask_next(ideal_cpu - 1, cpu_possible_mask);
+ WARN_ON_ONCE(chosen_cpu >= rcu_task_cpu_ids);
+ rtpcp = per_cpu_ptr(rtp->rtpcpu, chosen_cpu);
+ if (!raw_spin_trylock_rcu_node(rtpcp)) { // irqs already disabled.
+ raw_spin_lock_rcu_node(rtpcp); // irqs already disabled.
+ j = jiffies;
+ if (rtpcp->rtp_jiffies != j) {
+ rtpcp->rtp_jiffies = j;
+ rtpcp->rtp_n_lock_retries = 0;
+ }
+ if (rcu_task_cb_adjust && ++rtpcp->rtp_n_lock_retries > rcu_task_contend_lim &&
+ READ_ONCE(rtp->percpu_enqueue_lim) != rcu_task_cpu_ids)
+ needadjust = true; // Defer adjustment to avoid deadlock.
+ }
+ // Queuing callbacks before initialization not yet supported.
+ if (WARN_ON_ONCE(!rcu_segcblist_is_enabled(&rtpcp->cblist)))
+ rcu_segcblist_init(&rtpcp->cblist);
+ needwake = (func == wakeme_after_rcu) ||
+ (rcu_segcblist_n_cbs(&rtpcp->cblist) == rcu_task_lazy_lim);
+ if (havekthread && !needwake && !timer_pending(&rtpcp->lazy_timer)) {
+ if (rtp->lazy_jiffies)
+ mod_timer(&rtpcp->lazy_timer, rcu_tasks_lazy_time(rtp));
+ else
+ needwake = rcu_segcblist_empty(&rtpcp->cblist);
+ }
+ if (needwake)
+ rtpcp->urgent_gp = 3;
+ rcu_segcblist_enqueue(&rtpcp->cblist, rhp);
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+ if (unlikely(needadjust)) {
+ raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags);
+ if (rtp->percpu_enqueue_lim != rcu_task_cpu_ids) {
+ WRITE_ONCE(rtp->percpu_enqueue_shift, 0);
+ WRITE_ONCE(rtp->percpu_dequeue_lim, rcu_task_cpu_ids);
+ smp_store_release(&rtp->percpu_enqueue_lim, rcu_task_cpu_ids);
+ pr_info("Switching %s to per-CPU callback queuing.\n", rtp->name);
+ }
+ raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags);
+ }
+ rcu_read_unlock();
+ /* We can't create the thread unless interrupts are enabled. */
+ if (needwake && READ_ONCE(rtp->kthread_ptr))
+ irq_work_queue(&rtpcp->rtp_irq_work);
+}
+
+// RCU callback function for rcu_barrier_tasks_generic().
+static void rcu_barrier_tasks_generic_cb(struct rcu_head *rhp)
+{
+ struct rcu_tasks *rtp;
+ struct rcu_tasks_percpu *rtpcp;
+
+ rhp->next = rhp; // Mark the callback as having been invoked.
+ rtpcp = container_of(rhp, struct rcu_tasks_percpu, barrier_q_head);
+ rtp = rtpcp->rtpp;
+ if (atomic_dec_and_test(&rtp->barrier_q_count))
+ complete(&rtp->barrier_q_completion);
+}
+
+// Wait for all in-flight callbacks for the specified RCU Tasks flavor.
+// Operates in a manner similar to rcu_barrier().
+static void __maybe_unused rcu_barrier_tasks_generic(struct rcu_tasks *rtp)
+{
+ int cpu;
+ unsigned long flags;
+ struct rcu_tasks_percpu *rtpcp;
+ unsigned long s = rcu_seq_snap(&rtp->barrier_q_seq);
+
+ mutex_lock(&rtp->barrier_q_mutex);
+ if (rcu_seq_done(&rtp->barrier_q_seq, s)) {
+ smp_mb();
+ mutex_unlock(&rtp->barrier_q_mutex);
+ return;
+ }
+ rtp->barrier_q_start = jiffies;
+ rcu_seq_start(&rtp->barrier_q_seq);
+ init_completion(&rtp->barrier_q_completion);
+ atomic_set(&rtp->barrier_q_count, 2);
+ for_each_possible_cpu(cpu) {
+ if (cpu >= smp_load_acquire(&rtp->percpu_dequeue_lim))
+ break;
+ rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
+ rtpcp->barrier_q_head.func = rcu_barrier_tasks_generic_cb;
+ raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+ if (rcu_segcblist_entrain(&rtpcp->cblist, &rtpcp->barrier_q_head))
+ atomic_inc(&rtp->barrier_q_count);
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+ }
+ if (atomic_sub_and_test(2, &rtp->barrier_q_count))
+ complete(&rtp->barrier_q_completion);
+ wait_for_completion(&rtp->barrier_q_completion);
+ rcu_seq_end(&rtp->barrier_q_seq);
+ mutex_unlock(&rtp->barrier_q_mutex);
+}
+
+// Advance callbacks and indicate whether either a grace period or
+// callback invocation is needed.
+static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp)
+{
+ int cpu;
+ int dequeue_limit;
+ unsigned long flags;
+ bool gpdone = poll_state_synchronize_rcu(rtp->percpu_dequeue_gpseq);
+ long n;
+ long ncbs = 0;
+ long ncbsnz = 0;
+ int needgpcb = 0;
+
+ dequeue_limit = smp_load_acquire(&rtp->percpu_dequeue_lim);
+ for (cpu = 0; cpu < dequeue_limit; cpu++) {
+ if (!cpu_possible(cpu))
+ continue;
+ struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
+
+ /* Advance and accelerate any new callbacks. */
+ if (!rcu_segcblist_n_cbs(&rtpcp->cblist))
+ continue;
+ raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+ // Should we shrink down to a single callback queue?
+ n = rcu_segcblist_n_cbs(&rtpcp->cblist);
+ if (n) {
+ ncbs += n;
+ if (cpu > 0)
+ ncbsnz += n;
+ }
+ rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq));
+ (void)rcu_segcblist_accelerate(&rtpcp->cblist, rcu_seq_snap(&rtp->tasks_gp_seq));
+ if (rtpcp->urgent_gp > 0 && rcu_segcblist_pend_cbs(&rtpcp->cblist)) {
+ if (rtp->lazy_jiffies)
+ rtpcp->urgent_gp--;
+ needgpcb |= 0x3;
+ } else if (rcu_segcblist_empty(&rtpcp->cblist)) {
+ rtpcp->urgent_gp = 0;
+ }
+ if (rcu_segcblist_ready_cbs(&rtpcp->cblist))
+ needgpcb |= 0x1;
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+ }
+
+ // Shrink down to a single callback queue if appropriate.
+ // This is done in two stages: (1) If there are no more than
+ // rcu_task_collapse_lim callbacks on CPU 0 and none on any other
+ // CPU, limit enqueueing to CPU 0. (2) After an RCU grace period,
+ // if there has not been an increase in callbacks, limit dequeuing
+ // to CPU 0. Note the matching RCU read-side critical section in
+ // call_rcu_tasks_generic().
+ if (rcu_task_cb_adjust && ncbs <= rcu_task_collapse_lim) {
+ raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags);
+ if (rtp->percpu_enqueue_lim > 1) {
+ WRITE_ONCE(rtp->percpu_enqueue_shift, order_base_2(rcu_task_cpu_ids));
+ smp_store_release(&rtp->percpu_enqueue_lim, 1);
+ rtp->percpu_dequeue_gpseq = get_state_synchronize_rcu();
+ gpdone = false;
+ pr_info("Starting switch %s to CPU-0 callback queuing.\n", rtp->name);
+ }
+ raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags);
+ }
+ if (rcu_task_cb_adjust && !ncbsnz && gpdone) {
+ raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags);
+ if (rtp->percpu_enqueue_lim < rtp->percpu_dequeue_lim) {
+ WRITE_ONCE(rtp->percpu_dequeue_lim, 1);
+ pr_info("Completing switch %s to CPU-0 callback queuing.\n", rtp->name);
+ }
+ if (rtp->percpu_dequeue_lim == 1) {
+ for (cpu = rtp->percpu_dequeue_lim; cpu < rcu_task_cpu_ids; cpu++) {
+ if (!cpu_possible(cpu))
+ continue;
+ struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
+
+ WARN_ON_ONCE(rcu_segcblist_n_cbs(&rtpcp->cblist));
+ }
+ }
+ raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags);
+ }
+
+ return needgpcb;
+}
+
+// Advance callbacks and invoke any that are ready.
+static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu *rtpcp)
+{
+ int cpuwq;
+ unsigned long flags;
+ int len;
+ int index;
+ struct rcu_head *rhp;
+ struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl);
+ struct rcu_tasks_percpu *rtpcp_next;
+
+ index = rtpcp->index * 2 + 1;
+ if (index < num_possible_cpus()) {
+ rtpcp_next = rtp->rtpcp_array[index];
+ if (rtpcp_next->cpu < smp_load_acquire(&rtp->percpu_dequeue_lim)) {
+ cpuwq = rcu_cpu_beenfullyonline(rtpcp_next->cpu) ? rtpcp_next->cpu : WORK_CPU_UNBOUND;
+ queue_work_on(cpuwq, system_percpu_wq, &rtpcp_next->rtp_work);
+ index++;
+ if (index < num_possible_cpus()) {
+ rtpcp_next = rtp->rtpcp_array[index];
+ if (rtpcp_next->cpu < smp_load_acquire(&rtp->percpu_dequeue_lim)) {
+ cpuwq = rcu_cpu_beenfullyonline(rtpcp_next->cpu) ? rtpcp_next->cpu : WORK_CPU_UNBOUND;
+ queue_work_on(cpuwq, system_percpu_wq, &rtpcp_next->rtp_work);
+ }
+ }
+ }
+ }
+
+ if (rcu_segcblist_empty(&rtpcp->cblist))
+ return;
+ raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+ rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq));
+ rcu_segcblist_extract_done_cbs(&rtpcp->cblist, &rcl);
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+ len = rcl.len;
+ for (rhp = rcu_cblist_dequeue(&rcl); rhp; rhp = rcu_cblist_dequeue(&rcl)) {
+ debug_rcu_head_callback(rhp);
+ local_bh_disable();
+ rhp->func(rhp);
+ local_bh_enable();
+ cond_resched();
+ }
+ raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+ rcu_segcblist_add_len(&rtpcp->cblist, -len);
+ (void)rcu_segcblist_accelerate(&rtpcp->cblist, rcu_seq_snap(&rtp->tasks_gp_seq));
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+}
+
+// Workqueue flood to advance callbacks and invoke any that are ready.
+static void rcu_tasks_invoke_cbs_wq(struct work_struct *wp)
+{
+ struct rcu_tasks *rtp;
+ struct rcu_tasks_percpu *rtpcp = container_of(wp, struct rcu_tasks_percpu, rtp_work);
+
+ rtp = rtpcp->rtpp;
+ rcu_tasks_invoke_cbs(rtp, rtpcp);
+}
+
+// Wait for one grace period.
+static void rcu_tasks_one_gp(struct rcu_tasks *rtp, bool midboot)
+{
+ int needgpcb;
+
+ mutex_lock(&rtp->tasks_gp_mutex);
+
+ // If there were none, wait a bit and start over.
+ if (unlikely(midboot)) {
+ needgpcb = 0x2;
+ } else {
+ mutex_unlock(&rtp->tasks_gp_mutex);
+ set_tasks_gp_state(rtp, RTGS_WAIT_CBS);
+ rcuwait_wait_event(&rtp->cbs_wait,
+ (needgpcb = rcu_tasks_need_gpcb(rtp)),
+ TASK_IDLE);
+ mutex_lock(&rtp->tasks_gp_mutex);
+ }
+
+ if (needgpcb & 0x2) {
+ // Wait for one grace period.
+ set_tasks_gp_state(rtp, RTGS_WAIT_GP);
+ rtp->gp_start = jiffies;
+ rcu_seq_start(&rtp->tasks_gp_seq);
+ rtp->gp_func(rtp);
+ rcu_seq_end(&rtp->tasks_gp_seq);
+ }
+
+ // Invoke callbacks.
+ set_tasks_gp_state(rtp, RTGS_INVOKE_CBS);
+ rcu_tasks_invoke_cbs(rtp, per_cpu_ptr(rtp->rtpcpu, 0));
+ mutex_unlock(&rtp->tasks_gp_mutex);
+}
+
+// RCU-tasks kthread that detects grace periods and invokes callbacks.
+static int __noreturn rcu_tasks_kthread(void *arg)
+{
+ int cpu;
+ struct rcu_tasks *rtp = arg;
+
+ for_each_possible_cpu(cpu) {
+ struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
+
+ timer_setup(&rtpcp->lazy_timer, call_rcu_tasks_generic_timer, 0);
+ rtpcp->urgent_gp = 1;
+ }
+
+ /* Run on housekeeping CPUs by default. Sysadm can move if desired. */
+ housekeeping_affine(current, HK_TYPE_RCU);
+ smp_store_release(&rtp->kthread_ptr, current); // Let GPs start!
+
+ /*
+ * Each pass through the following loop makes one check for
+ * newly arrived callbacks, and, if there are some, waits for
+ * one RCU-tasks grace period and then invokes the callbacks.
+ * This loop is terminated by the system going down. ;-)
+ */
+ for (;;) {
+ // Wait for one grace period and invoke any callbacks
+ // that are ready.
+ rcu_tasks_one_gp(rtp, false);
+
+ // Paranoid sleep to keep this from entering a tight loop.
+ schedule_timeout_idle(rtp->gp_sleep);
+ }
+}
+
+// Wait for a grace period for the specified flavor of Tasks RCU.
+static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp)
+{
+ /* Complain if the scheduler has not started. */
+ if (WARN_ONCE(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE,
+ "synchronize_%s() called too soon", rtp->name))
+ return;
+
+ // If the grace-period kthread is running, use it.
+ if (READ_ONCE(rtp->kthread_ptr)) {
+ wait_rcu_gp_state(rtp->wait_state, rtp->call_func);
+ return;
+ }
+ rcu_tasks_one_gp(rtp, true);
+}
+
+/* Spawn RCU-tasks grace-period kthread. */
+static void __init rcu_spawn_tasks_kthread_generic(struct rcu_tasks *rtp)
+{
+ struct task_struct *t;
+
+ t = kthread_run(rcu_tasks_kthread, rtp, "%s_kthread", rtp->kname);
+ if (WARN_ONCE(IS_ERR(t), "%s: Could not start %s grace-period kthread, OOM is now expected behavior\n", __func__, rtp->name))
+ return;
+ smp_mb(); /* Ensure others see full kthread. */
+}
+
+#ifndef CONFIG_TINY_RCU
+
+/*
+ * Print any non-default Tasks RCU settings.
+ */
+static void __init rcu_tasks_bootup_oddness(void)
+{
+#if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU)
+ int rtsimc;
+
+ if (rcu_task_stall_timeout != RCU_TASK_STALL_TIMEOUT)
+ pr_info("\tTasks-RCU CPU stall warnings timeout set to %d (rcu_task_stall_timeout).\n", rcu_task_stall_timeout);
+ rtsimc = clamp(rcu_task_stall_info_mult, 1, 10);
+ if (rtsimc != rcu_task_stall_info_mult) {
+ pr_info("\tTasks-RCU CPU stall info multiplier clamped to %d (rcu_task_stall_info_mult).\n", rtsimc);
+ rcu_task_stall_info_mult = rtsimc;
+ }
+#endif /* #ifdef CONFIG_TASKS_RCU */
+#ifdef CONFIG_TASKS_RCU
+ pr_info("\tTrampoline variant of Tasks RCU enabled.\n");
+#endif /* #ifdef CONFIG_TASKS_RCU */
+#ifdef CONFIG_TASKS_RUDE_RCU
+ pr_info("\tRude variant of Tasks RCU enabled.\n");
+#endif /* #ifdef CONFIG_TASKS_RUDE_RCU */
+#ifdef CONFIG_TASKS_TRACE_RCU
+ pr_info("\tTracing variant of Tasks RCU enabled.\n");
+#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
+}
+
+
+/* Dump out rcutorture-relevant state common to all RCU-tasks flavors. */
+static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s)
+{
+ int cpu;
+ bool havecbs = false;
+ bool haveurgent = false;
+ bool haveurgentcbs = false;
+
+ for_each_possible_cpu(cpu) {
+ struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
+
+ if (!data_race(rcu_segcblist_empty(&rtpcp->cblist)))
+ havecbs = true;
+ if (data_race(rtpcp->urgent_gp))
+ haveurgent = true;
+ if (!data_race(rcu_segcblist_empty(&rtpcp->cblist)) && data_race(rtpcp->urgent_gp))
+ haveurgentcbs = true;
+ if (havecbs && haveurgent && haveurgentcbs)
+ break;
+ }
+ pr_info("%s: %s(%d) since %lu g:%lu i:%lu/%lu %c%c%c%c l:%lu %s\n",
+ rtp->kname,
+ tasks_gp_state_getname(rtp), data_race(rtp->gp_state),
+ jiffies - data_race(rtp->gp_jiffies),
+ data_race(rcu_seq_current(&rtp->tasks_gp_seq)),
+ data_race(rtp->n_ipis_fails), data_race(rtp->n_ipis),
+ ".k"[!!data_race(rtp->kthread_ptr)],
+ ".C"[havecbs],
+ ".u"[haveurgent],
+ ".U"[haveurgentcbs],
+ rtp->lazy_jiffies,
+ s);
+}
+
+/* Dump out more rcutorture-relevant state common to all RCU-tasks flavors. */
+static void rcu_tasks_torture_stats_print_generic(struct rcu_tasks *rtp, char *tt,
+ char *tf, char *tst)
+{
+ cpumask_var_t cm;
+ int cpu;
+ bool gotcb = false;
+ unsigned long j = jiffies;
+
+ pr_alert("%s%s Tasks%s RCU g%ld gp_start %lu gp_jiffies %lu gp_state %d (%s).\n",
+ tt, tf, tst, data_race(rtp->tasks_gp_seq),
+ j - data_race(rtp->gp_start), j - data_race(rtp->gp_jiffies),
+ data_race(rtp->gp_state), tasks_gp_state_getname(rtp));
+ pr_alert("\tEnqueue shift %d limit %d Dequeue limit %d gpseq %lu.\n",
+ data_race(rtp->percpu_enqueue_shift),
+ data_race(rtp->percpu_enqueue_lim),
+ data_race(rtp->percpu_dequeue_lim),
+ data_race(rtp->percpu_dequeue_gpseq));
+ (void)zalloc_cpumask_var(&cm, GFP_KERNEL);
+ pr_alert("\tCallback counts:");
+ for_each_possible_cpu(cpu) {
+ long n;
+ struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
+
+ if (cpumask_available(cm) && !rcu_barrier_cb_is_done(&rtpcp->barrier_q_head))
+ cpumask_set_cpu(cpu, cm);
+ n = rcu_segcblist_n_cbs(&rtpcp->cblist);
+ if (!n)
+ continue;
+ pr_cont(" %d:%ld", cpu, n);
+ gotcb = true;
+ }
+ if (gotcb)
+ pr_cont(".\n");
+ else
+ pr_cont(" (none).\n");
+ pr_alert("\tBarrier seq %lu start %lu count %d holdout CPUs ",
+ data_race(rtp->barrier_q_seq), j - data_race(rtp->barrier_q_start),
+ atomic_read(&rtp->barrier_q_count));
+ if (cpumask_available(cm) && !cpumask_empty(cm))
+ pr_cont(" %*pbl.\n", cpumask_pr_args(cm));
+ else
+ pr_cont("(none).\n");
+ free_cpumask_var(cm);
+}
+
+#endif // #ifndef CONFIG_TINY_RCU
+
+static void exit_tasks_rcu_finish_trace(struct task_struct *t);
+
+#if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU)
+
+////////////////////////////////////////////////////////////////////////
+//
+// Shared code between task-list-scanning variants of Tasks RCU.
+
+/* Wait for one RCU-tasks grace period. */
+static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
+{
+ struct task_struct *g;
+ int fract;
+ LIST_HEAD(holdouts);
+ unsigned long j;
+ unsigned long lastinfo;
+ unsigned long lastreport;
+ bool reported = false;
+ int rtsi;
+ struct task_struct *t;
+
+ set_tasks_gp_state(rtp, RTGS_PRE_WAIT_GP);
+ rtp->pregp_func(&holdouts);
+
+ /*
+ * There were callbacks, so we need to wait for an RCU-tasks
+ * grace period. Start off by scanning the task list for tasks
+ * that are not already voluntarily blocked. Mark these tasks
+ * and make a list of them in holdouts.
+ */
+ set_tasks_gp_state(rtp, RTGS_SCAN_TASKLIST);
+ if (rtp->pertask_func) {
+ rcu_read_lock();
+ for_each_process_thread(g, t)
+ rtp->pertask_func(t, &holdouts);
+ rcu_read_unlock();
+ }
+
+ set_tasks_gp_state(rtp, RTGS_POST_SCAN_TASKLIST);
+ rtp->postscan_func(&holdouts);
+
+ /*
+ * Each pass through the following loop scans the list of holdout
+ * tasks, removing any that are no longer holdouts. When the list
+ * is empty, we are done.
+ */
+ lastreport = jiffies;
+ lastinfo = lastreport;
+ rtsi = READ_ONCE(rcu_task_stall_info);
+
+ // Start off with initial wait and slowly back off to 1 HZ wait.
+ fract = rtp->init_fract;
+
+ while (!list_empty(&holdouts)) {
+ ktime_t exp;
+ bool firstreport;
+ bool needreport;
+ int rtst;
+
+ // Slowly back off waiting for holdouts
+ set_tasks_gp_state(rtp, RTGS_WAIT_SCAN_HOLDOUTS);
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ schedule_timeout_idle(fract);
+ } else {
+ exp = jiffies_to_nsecs(fract);
+ __set_current_state(TASK_IDLE);
+ schedule_hrtimeout_range(&exp, jiffies_to_nsecs(HZ / 2), HRTIMER_MODE_REL_HARD);
+ }
+
+ if (fract < HZ)
+ fract++;
+
+ rtst = READ_ONCE(rcu_task_stall_timeout);
+ needreport = rtst > 0 && time_after(jiffies, lastreport + rtst);
+ if (needreport) {
+ lastreport = jiffies;
+ reported = true;
+ }
+ firstreport = true;
+ WARN_ON(signal_pending(current));
+ set_tasks_gp_state(rtp, RTGS_SCAN_HOLDOUTS);
+ rtp->holdouts_func(&holdouts, needreport, &firstreport);
+
+ // Print pre-stall informational messages if needed.
+ j = jiffies;
+ if (rtsi > 0 && !reported && time_after(j, lastinfo + rtsi)) {
+ lastinfo = j;
+ rtsi = rtsi * rcu_task_stall_info_mult;
+ pr_info("%s: %s grace period number %lu (since boot) is %lu jiffies old.\n",
+ __func__, rtp->kname, rtp->tasks_gp_seq, j - rtp->gp_start);
+ }
+ }
+
+ set_tasks_gp_state(rtp, RTGS_POST_GP);
+ rtp->postgp_func(rtp);
+}
+
+#endif /* #if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU) */
+
+#ifdef CONFIG_TASKS_RCU
+
+////////////////////////////////////////////////////////////////////////
+//
+// Simple variant of RCU whose quiescent states are voluntary context
+// switch, cond_resched_tasks_rcu_qs(), user-space execution, and idle.
+// As such, grace periods can take one good long time. There are no
+// read-side primitives similar to rcu_read_lock() and rcu_read_unlock()
+// because this implementation is intended to get the system into a safe
+// state for some of the manipulations involved in tracing and the like.
+// Finally, this implementation does not support high call_rcu_tasks()
+// rates from multiple CPUs. If this is required, per-CPU callback lists
+// will be needed.
+//
+// The implementation uses rcu_tasks_wait_gp(), which relies on function
+// pointers in the rcu_tasks structure. The rcu_spawn_tasks_kthread()
+// function sets these function pointers up so that rcu_tasks_wait_gp()
+// invokes these functions in this order:
+//
+// rcu_tasks_pregp_step():
+// Invokes synchronize_rcu() in order to wait for all in-flight
+// t->on_rq and t->nvcsw transitions to complete. This works because
+// all such transitions are carried out with interrupts disabled.
+// rcu_tasks_pertask(), invoked on every non-idle task:
+// For every runnable non-idle task other than the current one, use
+// get_task_struct() to pin down that task, snapshot that task's
+// number of voluntary context switches, and add that task to the
+// holdout list.
+// rcu_tasks_postscan():
+// Gather per-CPU lists of tasks in do_exit() to ensure that all
+// tasks that were in the process of exiting (and which thus might
+// not know to synchronize with this RCU Tasks grace period) have
+// completed exiting. The synchronize_rcu() in rcu_tasks_postgp()
+// will take care of any tasks stuck in the non-preemptible region
+// of do_exit() following its call to exit_tasks_rcu_finish().
+// check_all_holdout_tasks(), repeatedly until holdout list is empty:
+// Scans the holdout list, attempting to identify a quiescent state
+// for each task on the list. If there is a quiescent state, the
+// corresponding task is removed from the holdout list.
+// rcu_tasks_postgp():
+// Invokes synchronize_rcu() in order to ensure that all prior
+// t->on_rq and t->nvcsw transitions are seen by all CPUs and tasks
+// to have happened before the end of this RCU Tasks grace period.
+// Again, this works because all such transitions are carried out
+// with interrupts disabled.
+//
+// For each exiting task, the exit_tasks_rcu_start() and
+// exit_tasks_rcu_finish() functions add and remove, respectively, the
+// current task to a per-CPU list of tasks that rcu_tasks_postscan() must
+// wait on. This is necessary because rcu_tasks_postscan() must wait on
+// tasks that have already been removed from the global list of tasks.
+//
+// Pre-grace-period update-side code is ordered before the grace
+// via the raw_spin_lock.*rcu_node(). Pre-grace-period read-side code
+// is ordered before the grace period via synchronize_rcu() call in
+// rcu_tasks_pregp_step() and by the scheduler's locks and interrupt
+// disabling.
+
+/* Pre-grace-period preparation. */
+static void rcu_tasks_pregp_step(struct list_head *hop)
+{
+ /*
+ * Wait for all pre-existing t->on_rq and t->nvcsw transitions
+ * to complete. Invoking synchronize_rcu() suffices because all
+ * these transitions occur with interrupts disabled. Without this
+ * synchronize_rcu(), a read-side critical section that started
+ * before the grace period might be incorrectly seen as having
+ * started after the grace period.
+ *
+ * This synchronize_rcu() also dispenses with the need for a
+ * memory barrier on the first store to t->rcu_tasks_holdout,
+ * as it forces the store to happen after the beginning of the
+ * grace period.
+ */
+ synchronize_rcu();
+}
+
+/* Check for quiescent states since the pregp's synchronize_rcu() */
+static bool rcu_tasks_is_holdout(struct task_struct *t)
+{
+ int cpu;
+
+ /* Has the task been seen voluntarily sleeping? */
+ if (!READ_ONCE(t->on_rq))
+ return false;
+
+ /*
+ * t->on_rq && !t->se.sched_delayed *could* be considered sleeping but
+ * since it is a spurious state (it will transition into the
+ * traditional blocked state or get woken up without outside
+ * dependencies), not considering it such should only affect timing.
+ *
+ * Be conservative for now and not include it.
+ */
+
+ /*
+ * Idle tasks (or idle injection) within the idle loop are RCU-tasks
+ * quiescent states. But CPU boot code performed by the idle task
+ * isn't a quiescent state.
+ */
+ if (is_idle_task(t))
+ return false;
+
+ cpu = task_cpu(t);
+
+ /* Idle tasks on offline CPUs are RCU-tasks quiescent states. */
+ if (t == idle_task(cpu) && !rcu_cpu_online(cpu))
+ return false;
+
+ return true;
+}
+
+/* Per-task initial processing. */
+static void rcu_tasks_pertask(struct task_struct *t, struct list_head *hop)
+{
+ if (t != current && rcu_tasks_is_holdout(t)) {
+ get_task_struct(t);
+ t->rcu_tasks_nvcsw = READ_ONCE(t->nvcsw);
+ WRITE_ONCE(t->rcu_tasks_holdout, true);
+ list_add(&t->rcu_tasks_holdout_list, hop);
+ }
+}
+
+void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func);
+DEFINE_RCU_TASKS(rcu_tasks, rcu_tasks_wait_gp, call_rcu_tasks, "RCU Tasks");
+
+/* Processing between scanning taskslist and draining the holdout list. */
+static void rcu_tasks_postscan(struct list_head *hop)
+{
+ int cpu;
+ int rtsi = READ_ONCE(rcu_task_stall_info);
+
+ if (!IS_ENABLED(CONFIG_TINY_RCU)) {
+ tasks_rcu_exit_srcu_stall_timer.expires = jiffies + rtsi;
+ add_timer(&tasks_rcu_exit_srcu_stall_timer);
+ }
+
+ /*
+ * Exiting tasks may escape the tasklist scan. Those are vulnerable
+ * until their final schedule() with TASK_DEAD state. To cope with
+ * this, divide the fragile exit path part in two intersecting
+ * read side critical sections:
+ *
+ * 1) A task_struct list addition before calling exit_notify(),
+ * which may remove the task from the tasklist, with the
+ * removal after the final preempt_disable() call in do_exit().
+ *
+ * 2) An _RCU_ read side starting with the final preempt_disable()
+ * call in do_exit() and ending with the final call to schedule()
+ * with TASK_DEAD state.
+ *
+ * This handles the part 1). And postgp will handle part 2) with a
+ * call to synchronize_rcu().
+ */
+
+ for_each_possible_cpu(cpu) {
+ unsigned long j = jiffies + 1;
+ struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rcu_tasks.rtpcpu, cpu);
+ struct task_struct *t;
+ struct task_struct *t1;
+ struct list_head tmp;
+
+ raw_spin_lock_irq_rcu_node(rtpcp);
+ list_for_each_entry_safe(t, t1, &rtpcp->rtp_exit_list, rcu_tasks_exit_list) {
+ if (list_empty(&t->rcu_tasks_holdout_list))
+ rcu_tasks_pertask(t, hop);
+
+ // RT kernels need frequent pauses, otherwise
+ // pause at least once per pair of jiffies.
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT) && time_before(jiffies, j))
+ continue;
+
+ // Keep our place in the list while pausing.
+ // Nothing else traverses this list, so adding a
+ // bare list_head is OK.
+ list_add(&tmp, &t->rcu_tasks_exit_list);
+ raw_spin_unlock_irq_rcu_node(rtpcp);
+ cond_resched(); // For CONFIG_PREEMPT=n kernels
+ raw_spin_lock_irq_rcu_node(rtpcp);
+ t1 = list_entry(tmp.next, struct task_struct, rcu_tasks_exit_list);
+ list_del(&tmp);
+ j = jiffies + 1;
+ }
+ raw_spin_unlock_irq_rcu_node(rtpcp);
+ }
+
+ if (!IS_ENABLED(CONFIG_TINY_RCU))
+ timer_delete_sync(&tasks_rcu_exit_srcu_stall_timer);
+}
+
+/* See if tasks are still holding out, complain if so. */
+static void check_holdout_task(struct task_struct *t,
+ bool needreport, bool *firstreport)
+{
+ int cpu;
+
+ if (!READ_ONCE(t->rcu_tasks_holdout) ||
+ t->rcu_tasks_nvcsw != READ_ONCE(t->nvcsw) ||
+ !rcu_tasks_is_holdout(t) ||
+ (IS_ENABLED(CONFIG_NO_HZ_FULL) &&
+ !is_idle_task(t) && READ_ONCE(t->rcu_tasks_idle_cpu) >= 0)) {
+ WRITE_ONCE(t->rcu_tasks_holdout, false);
+ list_del_init(&t->rcu_tasks_holdout_list);
+ put_task_struct(t);
+ return;
+ }
+ rcu_request_urgent_qs_task(t);
+ if (!needreport)
+ return;
+ if (*firstreport) {
+ pr_err("INFO: rcu_tasks detected stalls on tasks:\n");
+ *firstreport = false;
+ }
+ cpu = task_cpu(t);
+ pr_alert("%p: %c%c nvcsw: %lu/%lu holdout: %d idle_cpu: %d/%d\n",
+ t, ".I"[is_idle_task(t)],
+ "N."[cpu < 0 || !tick_nohz_full_cpu(cpu)],
+ t->rcu_tasks_nvcsw, t->nvcsw, t->rcu_tasks_holdout,
+ data_race(t->rcu_tasks_idle_cpu), cpu);
+ sched_show_task(t);
+}
+
+/* Scan the holdout lists for tasks no longer holding out. */
+static void check_all_holdout_tasks(struct list_head *hop,
+ bool needreport, bool *firstreport)
+{
+ struct task_struct *t, *t1;
+
+ list_for_each_entry_safe(t, t1, hop, rcu_tasks_holdout_list) {
+ check_holdout_task(t, needreport, firstreport);
+ cond_resched();
+ }
+}
+
+/* Finish off the Tasks-RCU grace period. */
+static void rcu_tasks_postgp(struct rcu_tasks *rtp)
+{
+ /*
+ * Because ->on_rq and ->nvcsw are not guaranteed to have a full
+ * memory barriers prior to them in the schedule() path, memory
+ * reordering on other CPUs could cause their RCU-tasks read-side
+ * critical sections to extend past the end of the grace period.
+ * However, because these ->nvcsw updates are carried out with
+ * interrupts disabled, we can use synchronize_rcu() to force the
+ * needed ordering on all such CPUs.
+ *
+ * This synchronize_rcu() also confines all ->rcu_tasks_holdout
+ * accesses to be within the grace period, avoiding the need for
+ * memory barriers for ->rcu_tasks_holdout accesses.
+ *
+ * In addition, this synchronize_rcu() waits for exiting tasks
+ * to complete their final preempt_disable() region of execution,
+ * enforcing the whole region before tasklist removal until
+ * the final schedule() with TASK_DEAD state to be an RCU TASKS
+ * read side critical section.
+ */
+ synchronize_rcu();
+}
+
+static void tasks_rcu_exit_srcu_stall(struct timer_list *unused)
+{
+#ifndef CONFIG_TINY_RCU
+ int rtsi;
+
+ rtsi = READ_ONCE(rcu_task_stall_info);
+ pr_info("%s: %s grace period number %lu (since boot) gp_state: %s is %lu jiffies old.\n",
+ __func__, rcu_tasks.kname, rcu_tasks.tasks_gp_seq,
+ tasks_gp_state_getname(&rcu_tasks), jiffies - rcu_tasks.gp_jiffies);
+ pr_info("Please check any exiting tasks stuck between calls to exit_tasks_rcu_start() and exit_tasks_rcu_finish()\n");
+ tasks_rcu_exit_srcu_stall_timer.expires = jiffies + rtsi;
+ add_timer(&tasks_rcu_exit_srcu_stall_timer);
+#endif // #ifndef CONFIG_TINY_RCU
+}
+
+/**
+ * call_rcu_tasks() - Queue an RCU for invocation task-based grace period
+ * @rhp: structure to be used for queueing the RCU updates.
+ * @func: actual callback function to be invoked after the grace period
+ *
+ * The callback function will be invoked some time after a full grace
+ * period elapses, in other words after all currently executing RCU
+ * read-side critical sections have completed. call_rcu_tasks() assumes
+ * that the read-side critical sections end at a voluntary context
+ * switch (not a preemption!), cond_resched_tasks_rcu_qs(), entry into idle,
+ * or transition to usermode execution. As such, there are no read-side
+ * primitives analogous to rcu_read_lock() and rcu_read_unlock() because
+ * this primitive is intended to determine that all tasks have passed
+ * through a safe state, not so much for data-structure synchronization.
+ *
+ * See the description of call_rcu() for more detailed information on
+ * memory ordering guarantees.
+ */
+void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func)
+{
+ call_rcu_tasks_generic(rhp, func, &rcu_tasks);
+}
+EXPORT_SYMBOL_GPL(call_rcu_tasks);
+
+/**
+ * synchronize_rcu_tasks - wait until an rcu-tasks grace period has elapsed.
+ *
+ * Control will return to the caller some time after a full rcu-tasks
+ * grace period has elapsed, in other words after all currently
+ * executing rcu-tasks read-side critical sections have elapsed. These
+ * read-side critical sections are delimited by calls to schedule(),
+ * cond_resched_tasks_rcu_qs(), idle execution, userspace execution, calls
+ * to synchronize_rcu_tasks(), and (in theory, anyway) cond_resched().
+ *
+ * This is a very specialized primitive, intended only for a few uses in
+ * tracing and other situations requiring manipulation of function
+ * preambles and profiling hooks. The synchronize_rcu_tasks() function
+ * is not (yet) intended for heavy use from multiple CPUs.
+ *
+ * See the description of synchronize_rcu() for more detailed information
+ * on memory ordering guarantees.
+ */
+void synchronize_rcu_tasks(void)
+{
+ synchronize_rcu_tasks_generic(&rcu_tasks);
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu_tasks);
+
+/**
+ * rcu_barrier_tasks - Wait for in-flight call_rcu_tasks() callbacks.
+ *
+ * Although the current implementation is guaranteed to wait, it is not
+ * obligated to, for example, if there are no pending callbacks.
+ */
+void rcu_barrier_tasks(void)
+{
+ rcu_barrier_tasks_generic(&rcu_tasks);
+}
+EXPORT_SYMBOL_GPL(rcu_barrier_tasks);
+
+static int rcu_tasks_lazy_ms = -1;
+module_param(rcu_tasks_lazy_ms, int, 0444);
+
+static int __init rcu_spawn_tasks_kthread(void)
+{
+ rcu_tasks.gp_sleep = HZ / 10;
+ rcu_tasks.init_fract = HZ / 10;
+ if (rcu_tasks_lazy_ms >= 0)
+ rcu_tasks.lazy_jiffies = msecs_to_jiffies(rcu_tasks_lazy_ms);
+ rcu_tasks.pregp_func = rcu_tasks_pregp_step;
+ rcu_tasks.pertask_func = rcu_tasks_pertask;
+ rcu_tasks.postscan_func = rcu_tasks_postscan;
+ rcu_tasks.holdouts_func = check_all_holdout_tasks;
+ rcu_tasks.postgp_func = rcu_tasks_postgp;
+ rcu_tasks.wait_state = TASK_IDLE;
+ rcu_spawn_tasks_kthread_generic(&rcu_tasks);
+ return 0;
+}
+
+#if !defined(CONFIG_TINY_RCU)
+void show_rcu_tasks_classic_gp_kthread(void)
+{
+ show_rcu_tasks_generic_gp_kthread(&rcu_tasks, "");
+}
+EXPORT_SYMBOL_GPL(show_rcu_tasks_classic_gp_kthread);
+
+void rcu_tasks_torture_stats_print(char *tt, char *tf)
+{
+ rcu_tasks_torture_stats_print_generic(&rcu_tasks, tt, tf, "");
+}
+EXPORT_SYMBOL_GPL(rcu_tasks_torture_stats_print);
+#endif // !defined(CONFIG_TINY_RCU)
+
+struct task_struct *get_rcu_tasks_gp_kthread(void)
+{
+ return rcu_tasks.kthread_ptr;
+}
+EXPORT_SYMBOL_GPL(get_rcu_tasks_gp_kthread);
+
+void rcu_tasks_get_gp_data(int *flags, unsigned long *gp_seq)
+{
+ *flags = 0;
+ *gp_seq = rcu_seq_current(&rcu_tasks.tasks_gp_seq);
+}
+EXPORT_SYMBOL_GPL(rcu_tasks_get_gp_data);
+
+/*
+ * Protect against tasklist scan blind spot while the task is exiting and
+ * may be removed from the tasklist. Do this by adding the task to yet
+ * another list.
+ *
+ * Note that the task will remove itself from this list, so there is no
+ * need for get_task_struct(), except in the case where rcu_tasks_pertask()
+ * adds it to the holdout list, in which case rcu_tasks_pertask() supplies
+ * the needed get_task_struct().
+ */
+void exit_tasks_rcu_start(void)
+{
+ unsigned long flags;
+ struct rcu_tasks_percpu *rtpcp;
+ struct task_struct *t = current;
+
+ WARN_ON_ONCE(!list_empty(&t->rcu_tasks_exit_list));
+ preempt_disable();
+ rtpcp = this_cpu_ptr(rcu_tasks.rtpcpu);
+ t->rcu_tasks_exit_cpu = smp_processor_id();
+ raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+ WARN_ON_ONCE(!rtpcp->rtp_exit_list.next);
+ list_add(&t->rcu_tasks_exit_list, &rtpcp->rtp_exit_list);
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+ preempt_enable();
+}
+
+/*
+ * Remove the task from the "yet another list" because do_exit() is now
+ * non-preemptible, allowing synchronize_rcu() to wait beyond this point.
+ */
+void exit_tasks_rcu_finish(void)
+{
+ unsigned long flags;
+ struct rcu_tasks_percpu *rtpcp;
+ struct task_struct *t = current;
+
+ WARN_ON_ONCE(list_empty(&t->rcu_tasks_exit_list));
+ rtpcp = per_cpu_ptr(rcu_tasks.rtpcpu, t->rcu_tasks_exit_cpu);
+ raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+ list_del_init(&t->rcu_tasks_exit_list);
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+
+ exit_tasks_rcu_finish_trace(t);
+}
+
+#else /* #ifdef CONFIG_TASKS_RCU */
+void exit_tasks_rcu_start(void) { }
+void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); }
+#endif /* #else #ifdef CONFIG_TASKS_RCU */
+
+#ifdef CONFIG_TASKS_RUDE_RCU
+
+////////////////////////////////////////////////////////////////////////
+//
+// "Rude" variant of Tasks RCU, inspired by Steve Rostedt's
+// trick of passing an empty function to schedule_on_each_cpu().
+// This approach provides batching of concurrent calls to the synchronous
+// synchronize_rcu_tasks_rude() API. This invokes schedule_on_each_cpu()
+// in order to send IPIs far and wide and induces otherwise unnecessary
+// context switches on all online CPUs, whether idle or not.
+//
+// Callback handling is provided by the rcu_tasks_kthread() function.
+//
+// Ordering is provided by the scheduler's context-switch code.
+
+// Empty function to allow workqueues to force a context switch.
+static void rcu_tasks_be_rude(struct work_struct *work)
+{
+}
+
+// Wait for one rude RCU-tasks grace period.
+static void rcu_tasks_rude_wait_gp(struct rcu_tasks *rtp)
+{
+ rtp->n_ipis += cpumask_weight(cpu_online_mask);
+ schedule_on_each_cpu(rcu_tasks_be_rude);
+}
+
+static void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func);
+DEFINE_RCU_TASKS(rcu_tasks_rude, rcu_tasks_rude_wait_gp, call_rcu_tasks_rude,
+ "RCU Tasks Rude");
+
+/*
+ * call_rcu_tasks_rude() - Queue a callback rude task-based grace period
+ * @rhp: structure to be used for queueing the RCU updates.
+ * @func: actual callback function to be invoked after the grace period
+ *
+ * The callback function will be invoked some time after a full grace
+ * period elapses, in other words after all currently executing RCU
+ * read-side critical sections have completed. call_rcu_tasks_rude()
+ * assumes that the read-side critical sections end at context switch,
+ * cond_resched_tasks_rcu_qs(), or transition to usermode execution (as
+ * usermode execution is schedulable). As such, there are no read-side
+ * primitives analogous to rcu_read_lock() and rcu_read_unlock() because
+ * this primitive is intended to determine that all tasks have passed
+ * through a safe state, not so much for data-structure synchronization.
+ *
+ * See the description of call_rcu() for more detailed information on
+ * memory ordering guarantees.
+ *
+ * This is no longer exported, and is instead reserved for use by
+ * synchronize_rcu_tasks_rude().
+ */
+static void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func)
+{
+ call_rcu_tasks_generic(rhp, func, &rcu_tasks_rude);
+}
+
+/**
+ * synchronize_rcu_tasks_rude - wait for a rude rcu-tasks grace period
+ *
+ * Control will return to the caller some time after a rude rcu-tasks
+ * grace period has elapsed, in other words after all currently
+ * executing rcu-tasks read-side critical sections have elapsed. These
+ * read-side critical sections are delimited by calls to schedule(),
+ * cond_resched_tasks_rcu_qs(), userspace execution (which is a schedulable
+ * context), and (in theory, anyway) cond_resched().
+ *
+ * This is a very specialized primitive, intended only for a few uses in
+ * tracing and other situations requiring manipulation of function preambles
+ * and profiling hooks. The synchronize_rcu_tasks_rude() function is not
+ * (yet) intended for heavy use from multiple CPUs.
+ *
+ * See the description of synchronize_rcu() for more detailed information
+ * on memory ordering guarantees.
+ */
+void synchronize_rcu_tasks_rude(void)
+{
+ if (!IS_ENABLED(CONFIG_ARCH_WANTS_NO_INSTR) || IS_ENABLED(CONFIG_FORCE_TASKS_RUDE_RCU))
+ synchronize_rcu_tasks_generic(&rcu_tasks_rude);
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu_tasks_rude);
+
+static int __init rcu_spawn_tasks_rude_kthread(void)
+{
+ rcu_tasks_rude.gp_sleep = HZ / 10;
+ rcu_spawn_tasks_kthread_generic(&rcu_tasks_rude);
+ return 0;
+}
+
+#if !defined(CONFIG_TINY_RCU)
+void show_rcu_tasks_rude_gp_kthread(void)
+{
+ show_rcu_tasks_generic_gp_kthread(&rcu_tasks_rude, "");
+}
+EXPORT_SYMBOL_GPL(show_rcu_tasks_rude_gp_kthread);
+
+void rcu_tasks_rude_torture_stats_print(char *tt, char *tf)
+{
+ rcu_tasks_torture_stats_print_generic(&rcu_tasks_rude, tt, tf, "");
+}
+EXPORT_SYMBOL_GPL(rcu_tasks_rude_torture_stats_print);
+#endif // !defined(CONFIG_TINY_RCU)
+
+struct task_struct *get_rcu_tasks_rude_gp_kthread(void)
+{
+ return rcu_tasks_rude.kthread_ptr;
+}
+EXPORT_SYMBOL_GPL(get_rcu_tasks_rude_gp_kthread);
+
+void rcu_tasks_rude_get_gp_data(int *flags, unsigned long *gp_seq)
+{
+ *flags = 0;
+ *gp_seq = rcu_seq_current(&rcu_tasks_rude.tasks_gp_seq);
+}
+EXPORT_SYMBOL_GPL(rcu_tasks_rude_get_gp_data);
+
+#endif /* #ifdef CONFIG_TASKS_RUDE_RCU */
+
+////////////////////////////////////////////////////////////////////////
+//
+// Tracing variant of Tasks RCU. This variant is designed to be used
+// to protect tracing hooks, including those of BPF. This variant
+// therefore:
+//
+// 1. Has explicit read-side markers to allow finite grace periods
+// in the face of in-kernel loops for PREEMPT=n builds.
+//
+// 2. Protects code in the idle loop, exception entry/exit, and
+// CPU-hotplug code paths, similar to the capabilities of SRCU.
+//
+// 3. Avoids expensive read-side instructions, having overhead similar
+// to that of Preemptible RCU.
+//
+// There are of course downsides. For example, the grace-period code
+// can send IPIs to CPUs, even when those CPUs are in the idle loop or
+// in nohz_full userspace. If needed, these downsides can be at least
+// partially remedied.
+//
+// Perhaps most important, this variant of RCU does not affect the vanilla
+// flavors, rcu_preempt and rcu_sched. The fact that RCU Tasks Trace
+// readers can operate from idle, offline, and exception entry/exit in no
+// way allows rcu_preempt and rcu_sched readers to also do so.
+//
+// The implementation uses rcu_tasks_wait_gp(), which relies on function
+// pointers in the rcu_tasks structure. The rcu_spawn_tasks_trace_kthread()
+// function sets these function pointers up so that rcu_tasks_wait_gp()
+// invokes these functions in this order:
+//
+// rcu_tasks_trace_pregp_step():
+// Disables CPU hotplug, adds all currently executing tasks to the
+// holdout list, then checks the state of all tasks that blocked
+// or were preempted within their current RCU Tasks Trace read-side
+// critical section, adding them to the holdout list if appropriate.
+// Finally, this function re-enables CPU hotplug.
+// The ->pertask_func() pointer is NULL, so there is no per-task processing.
+// rcu_tasks_trace_postscan():
+// Invokes synchronize_rcu() to wait for late-stage exiting tasks
+// to finish exiting.
+// check_all_holdout_tasks_trace(), repeatedly until holdout list is empty:
+// Scans the holdout list, attempting to identify a quiescent state
+// for each task on the list. If there is a quiescent state, the
+// corresponding task is removed from the holdout list. Once this
+// list is empty, the grace period has completed.
+// rcu_tasks_trace_postgp():
+// Provides the needed full memory barrier and does debug checks.
+//
+// The exit_tasks_rcu_finish_trace() synchronizes with exiting tasks.
+//
+// Pre-grace-period update-side code is ordered before the grace period
+// via the ->cbs_lock and barriers in rcu_tasks_kthread(). Pre-grace-period
+// read-side code is ordered before the grace period by atomic operations
+// on .b.need_qs flag of each task involved in this process, or by scheduler
+// context-switch ordering (for locked-down non-running readers).
+
+// The lockdep state must be outside of #ifdef to be useful.
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+static struct lock_class_key rcu_lock_trace_key;
+struct lockdep_map rcu_trace_lock_map =
+ STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_trace", &rcu_lock_trace_key);
+EXPORT_SYMBOL_GPL(rcu_trace_lock_map);
+#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+
+#ifdef CONFIG_TASKS_TRACE_RCU
+
+// Record outstanding IPIs to each CPU. No point in sending two...
+static DEFINE_PER_CPU(bool, trc_ipi_to_cpu);
+
+// The number of detections of task quiescent state relying on
+// heavyweight readers executing explicit memory barriers.
+static unsigned long n_heavy_reader_attempts;
+static unsigned long n_heavy_reader_updates;
+static unsigned long n_heavy_reader_ofl_updates;
+static unsigned long n_trc_holdouts;
+
+void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func);
+DEFINE_RCU_TASKS(rcu_tasks_trace, rcu_tasks_wait_gp, call_rcu_tasks_trace,
+ "RCU Tasks Trace");
+
+/* Load from ->trc_reader_special.b.need_qs with proper ordering. */
+static u8 rcu_ld_need_qs(struct task_struct *t)
+{
+ smp_mb(); // Enforce full grace-period ordering.
+ return smp_load_acquire(&t->trc_reader_special.b.need_qs);
+}
+
+/* Store to ->trc_reader_special.b.need_qs with proper ordering. */
+static void rcu_st_need_qs(struct task_struct *t, u8 v)
+{
+ smp_store_release(&t->trc_reader_special.b.need_qs, v);
+ smp_mb(); // Enforce full grace-period ordering.
+}
+
+/*
+ * Do a cmpxchg() on ->trc_reader_special.b.need_qs, allowing for
+ * the four-byte operand-size restriction of some platforms.
+ *
+ * Returns the old value, which is often ignored.
+ */
+u8 rcu_trc_cmpxchg_need_qs(struct task_struct *t, u8 old, u8 new)
+{
+ return cmpxchg(&t->trc_reader_special.b.need_qs, old, new);
+}
+EXPORT_SYMBOL_GPL(rcu_trc_cmpxchg_need_qs);
+
+/*
+ * If we are the last reader, signal the grace-period kthread.
+ * Also remove from the per-CPU list of blocked tasks.
+ */
+void rcu_read_unlock_trace_special(struct task_struct *t)
+{
+ unsigned long flags;
+ struct rcu_tasks_percpu *rtpcp;
+ union rcu_special trs;
+
+ // Open-coded full-word version of rcu_ld_need_qs().
+ smp_mb(); // Enforce full grace-period ordering.
+ trs = smp_load_acquire(&t->trc_reader_special);
+
+ if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && t->trc_reader_special.b.need_mb)
+ smp_mb(); // Pairs with update-side barriers.
+ // Update .need_qs before ->trc_reader_nesting for irq/NMI handlers.
+ if (trs.b.need_qs == (TRC_NEED_QS_CHECKED | TRC_NEED_QS)) {
+ u8 result = rcu_trc_cmpxchg_need_qs(t, TRC_NEED_QS_CHECKED | TRC_NEED_QS,
+ TRC_NEED_QS_CHECKED);
+
+ WARN_ONCE(result != trs.b.need_qs, "%s: result = %d", __func__, result);
+ }
+ if (trs.b.blocked) {
+ rtpcp = per_cpu_ptr(rcu_tasks_trace.rtpcpu, t->trc_blkd_cpu);
+ raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+ list_del_init(&t->trc_blkd_node);
+ WRITE_ONCE(t->trc_reader_special.b.blocked, false);
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+ }
+ WRITE_ONCE(t->trc_reader_nesting, 0);
+}
+EXPORT_SYMBOL_GPL(rcu_read_unlock_trace_special);
+
+/* Add a newly blocked reader task to its CPU's list. */
+void rcu_tasks_trace_qs_blkd(struct task_struct *t)
+{
+ unsigned long flags;
+ struct rcu_tasks_percpu *rtpcp;
+
+ local_irq_save(flags);
+ rtpcp = this_cpu_ptr(rcu_tasks_trace.rtpcpu);
+ raw_spin_lock_rcu_node(rtpcp); // irqs already disabled
+ t->trc_blkd_cpu = smp_processor_id();
+ if (!rtpcp->rtp_blkd_tasks.next)
+ INIT_LIST_HEAD(&rtpcp->rtp_blkd_tasks);
+ list_add(&t->trc_blkd_node, &rtpcp->rtp_blkd_tasks);
+ WRITE_ONCE(t->trc_reader_special.b.blocked, true);
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+}
+EXPORT_SYMBOL_GPL(rcu_tasks_trace_qs_blkd);
+
+/* Add a task to the holdout list, if it is not already on the list. */
+static void trc_add_holdout(struct task_struct *t, struct list_head *bhp)
+{
+ if (list_empty(&t->trc_holdout_list)) {
+ get_task_struct(t);
+ list_add(&t->trc_holdout_list, bhp);
+ n_trc_holdouts++;
+ }
+}
+
+/* Remove a task from the holdout list, if it is in fact present. */
+static void trc_del_holdout(struct task_struct *t)
+{
+ if (!list_empty(&t->trc_holdout_list)) {
+ list_del_init(&t->trc_holdout_list);
+ put_task_struct(t);
+ n_trc_holdouts--;
+ }
+}
+
+/* IPI handler to check task state. */
+static void trc_read_check_handler(void *t_in)
+{
+ int nesting;
+ struct task_struct *t = current;
+ struct task_struct *texp = t_in;
+
+ // If the task is no longer running on this CPU, leave.
+ if (unlikely(texp != t))
+ goto reset_ipi; // Already on holdout list, so will check later.
+
+ // If the task is not in a read-side critical section, and
+ // if this is the last reader, awaken the grace-period kthread.
+ nesting = READ_ONCE(t->trc_reader_nesting);
+ if (likely(!nesting)) {
+ rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED);
+ goto reset_ipi;
+ }
+ // If we are racing with an rcu_read_unlock_trace(), try again later.
+ if (unlikely(nesting < 0))
+ goto reset_ipi;
+
+ // Get here if the task is in a read-side critical section.
+ // Set its state so that it will update state for the grace-period
+ // kthread upon exit from that critical section.
+ rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS | TRC_NEED_QS_CHECKED);
+
+reset_ipi:
+ // Allow future IPIs to be sent on CPU and for task.
+ // Also order this IPI handler against any later manipulations of
+ // the intended task.
+ smp_store_release(per_cpu_ptr(&trc_ipi_to_cpu, smp_processor_id()), false); // ^^^
+ smp_store_release(&texp->trc_ipi_to_cpu, -1); // ^^^
+}
+
+/* Callback function for scheduler to check locked-down task. */
+static int trc_inspect_reader(struct task_struct *t, void *bhp_in)
+{
+ struct list_head *bhp = bhp_in;
+ int cpu = task_cpu(t);
+ int nesting;
+ bool ofl = cpu_is_offline(cpu);
+
+ if (task_curr(t) && !ofl) {
+ // If no chance of heavyweight readers, do it the hard way.
+ if (!IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
+ return -EINVAL;
+
+ // If heavyweight readers are enabled on the remote task,
+ // we can inspect its state despite its currently running.
+ // However, we cannot safely change its state.
+ n_heavy_reader_attempts++;
+ // Check for "running" idle tasks on offline CPUs.
+ if (!rcu_watching_zero_in_eqs(cpu, &t->trc_reader_nesting))
+ return -EINVAL; // No quiescent state, do it the hard way.
+ n_heavy_reader_updates++;
+ nesting = 0;
+ } else {
+ // The task is not running, so C-language access is safe.
+ nesting = t->trc_reader_nesting;
+ WARN_ON_ONCE(ofl && task_curr(t) && (t != idle_task(task_cpu(t))));
+ if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && ofl)
+ n_heavy_reader_ofl_updates++;
+ }
+
+ // If not exiting a read-side critical section, mark as checked
+ // so that the grace-period kthread will remove it from the
+ // holdout list.
+ if (!nesting) {
+ rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED);
+ return 0; // In QS, so done.
+ }
+ if (nesting < 0)
+ return -EINVAL; // Reader transitioning, try again later.
+
+ // The task is in a read-side critical section, so set up its
+ // state so that it will update state upon exit from that critical
+ // section.
+ if (!rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS | TRC_NEED_QS_CHECKED))
+ trc_add_holdout(t, bhp);
+ return 0;
+}
+
+/* Attempt to extract the state for the specified task. */
+static void trc_wait_for_one_reader(struct task_struct *t,
+ struct list_head *bhp)
+{
+ int cpu;
+
+ // If a previous IPI is still in flight, let it complete.
+ if (smp_load_acquire(&t->trc_ipi_to_cpu) != -1) // Order IPI
+ return;
+
+ // The current task had better be in a quiescent state.
+ if (t == current) {
+ rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED);
+ WARN_ON_ONCE(READ_ONCE(t->trc_reader_nesting));
+ return;
+ }
+
+ // Attempt to nail down the task for inspection.
+ get_task_struct(t);
+ if (!task_call_func(t, trc_inspect_reader, bhp)) {
+ put_task_struct(t);
+ return;
+ }
+ put_task_struct(t);
+
+ // If this task is not yet on the holdout list, then we are in
+ // an RCU read-side critical section. Otherwise, the invocation of
+ // trc_add_holdout() that added it to the list did the necessary
+ // get_task_struct(). Either way, the task cannot be freed out
+ // from under this code.
+
+ // If currently running, send an IPI, either way, add to list.
+ trc_add_holdout(t, bhp);
+ if (task_curr(t) &&
+ time_after(jiffies + 1, rcu_tasks_trace.gp_start + rcu_task_ipi_delay)) {
+ // The task is currently running, so try IPIing it.
+ cpu = task_cpu(t);
+
+ // If there is already an IPI outstanding, let it happen.
+ if (per_cpu(trc_ipi_to_cpu, cpu) || t->trc_ipi_to_cpu >= 0)
+ return;
+
+ per_cpu(trc_ipi_to_cpu, cpu) = true;
+ t->trc_ipi_to_cpu = cpu;
+ rcu_tasks_trace.n_ipis++;
+ if (smp_call_function_single(cpu, trc_read_check_handler, t, 0)) {
+ // Just in case there is some other reason for
+ // failure than the target CPU being offline.
+ WARN_ONCE(1, "%s(): smp_call_function_single() failed for CPU: %d\n",
+ __func__, cpu);
+ rcu_tasks_trace.n_ipis_fails++;
+ per_cpu(trc_ipi_to_cpu, cpu) = false;
+ t->trc_ipi_to_cpu = -1;
+ }
+ }
+}
+
+/*
+ * Initialize for first-round processing for the specified task.
+ * Return false if task is NULL or already taken care of, true otherwise.
+ */
+static bool rcu_tasks_trace_pertask_prep(struct task_struct *t, bool notself)
+{
+ // During early boot when there is only the one boot CPU, there
+ // is no idle task for the other CPUs. Also, the grace-period
+ // kthread is always in a quiescent state. In addition, just return
+ // if this task is already on the list.
+ if (unlikely(t == NULL) || (t == current && notself) || !list_empty(&t->trc_holdout_list))
+ return false;
+
+ rcu_st_need_qs(t, 0);
+ t->trc_ipi_to_cpu = -1;
+ return true;
+}
+
+/* Do first-round processing for the specified task. */
+static void rcu_tasks_trace_pertask(struct task_struct *t, struct list_head *hop)
+{
+ if (rcu_tasks_trace_pertask_prep(t, true))
+ trc_wait_for_one_reader(t, hop);
+}
+
+/* Initialize for a new RCU-tasks-trace grace period. */
+static void rcu_tasks_trace_pregp_step(struct list_head *hop)
+{
+ LIST_HEAD(blkd_tasks);
+ int cpu;
+ unsigned long flags;
+ struct rcu_tasks_percpu *rtpcp;
+ struct task_struct *t;
+
+ // There shouldn't be any old IPIs, but...
+ for_each_possible_cpu(cpu)
+ WARN_ON_ONCE(per_cpu(trc_ipi_to_cpu, cpu));
+
+ // Disable CPU hotplug across the CPU scan for the benefit of
+ // any IPIs that might be needed. This also waits for all readers
+ // in CPU-hotplug code paths.
+ cpus_read_lock();
+
+ // These rcu_tasks_trace_pertask_prep() calls are serialized to
+ // allow safe access to the hop list.
+ for_each_online_cpu(cpu) {
+ rcu_read_lock();
+ // Note that cpu_curr_snapshot() picks up the target
+ // CPU's current task while its runqueue is locked with
+ // an smp_mb__after_spinlock(). This ensures that either
+ // the grace-period kthread will see that task's read-side
+ // critical section or the task will see the updater's pre-GP
+ // accesses. The trailing smp_mb() in cpu_curr_snapshot()
+ // does not currently play a role other than simplify
+ // that function's ordering semantics. If these simplified
+ // ordering semantics continue to be redundant, that smp_mb()
+ // might be removed.
+ t = cpu_curr_snapshot(cpu);
+ if (rcu_tasks_trace_pertask_prep(t, true))
+ trc_add_holdout(t, hop);
+ rcu_read_unlock();
+ cond_resched_tasks_rcu_qs();
+ }
+
+ // Only after all running tasks have been accounted for is it
+ // safe to take care of the tasks that have blocked within their
+ // current RCU tasks trace read-side critical section.
+ for_each_possible_cpu(cpu) {
+ rtpcp = per_cpu_ptr(rcu_tasks_trace.rtpcpu, cpu);
+ raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+ list_splice_init(&rtpcp->rtp_blkd_tasks, &blkd_tasks);
+ while (!list_empty(&blkd_tasks)) {
+ rcu_read_lock();
+ t = list_first_entry(&blkd_tasks, struct task_struct, trc_blkd_node);
+ list_del_init(&t->trc_blkd_node);
+ list_add(&t->trc_blkd_node, &rtpcp->rtp_blkd_tasks);
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+ rcu_tasks_trace_pertask(t, hop);
+ rcu_read_unlock();
+ raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+ }
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+ cond_resched_tasks_rcu_qs();
+ }
+
+ // Re-enable CPU hotplug now that the holdout list is populated.
+ cpus_read_unlock();
+}
+
+/*
+ * Do intermediate processing between task and holdout scans.
+ */
+static void rcu_tasks_trace_postscan(struct list_head *hop)
+{
+ // Wait for late-stage exiting tasks to finish exiting.
+ // These might have passed the call to exit_tasks_rcu_finish().
+
+ // If you remove the following line, update rcu_trace_implies_rcu_gp()!!!
+ synchronize_rcu();
+ // Any tasks that exit after this point will set
+ // TRC_NEED_QS_CHECKED in ->trc_reader_special.b.need_qs.
+}
+
+/* Communicate task state back to the RCU tasks trace stall warning request. */
+struct trc_stall_chk_rdr {
+ int nesting;
+ int ipi_to_cpu;
+ u8 needqs;
+};
+
+static int trc_check_slow_task(struct task_struct *t, void *arg)
+{
+ struct trc_stall_chk_rdr *trc_rdrp = arg;
+
+ if (task_curr(t) && cpu_online(task_cpu(t)))
+ return false; // It is running, so decline to inspect it.
+ trc_rdrp->nesting = READ_ONCE(t->trc_reader_nesting);
+ trc_rdrp->ipi_to_cpu = READ_ONCE(t->trc_ipi_to_cpu);
+ trc_rdrp->needqs = rcu_ld_need_qs(t);
+ return true;
+}
+
+/* Show the state of a task stalling the current RCU tasks trace GP. */
+static void show_stalled_task_trace(struct task_struct *t, bool *firstreport)
+{
+ int cpu;
+ struct trc_stall_chk_rdr trc_rdr;
+ bool is_idle_tsk = is_idle_task(t);
+
+ if (*firstreport) {
+ pr_err("INFO: rcu_tasks_trace detected stalls on tasks:\n");
+ *firstreport = false;
+ }
+ cpu = task_cpu(t);
+ if (!task_call_func(t, trc_check_slow_task, &trc_rdr))
+ pr_alert("P%d: %c%c\n",
+ t->pid,
+ ".I"[t->trc_ipi_to_cpu >= 0],
+ ".i"[is_idle_tsk]);
+ else
+ pr_alert("P%d: %c%c%c%c nesting: %d%c%c cpu: %d%s\n",
+ t->pid,
+ ".I"[trc_rdr.ipi_to_cpu >= 0],
+ ".i"[is_idle_tsk],
+ ".N"[cpu >= 0 && tick_nohz_full_cpu(cpu)],
+ ".B"[!!data_race(t->trc_reader_special.b.blocked)],
+ trc_rdr.nesting,
+ " !CN"[trc_rdr.needqs & 0x3],
+ " ?"[trc_rdr.needqs > 0x3],
+ cpu, cpu_online(cpu) ? "" : "(offline)");
+ sched_show_task(t);
+}
+
+/* List stalled IPIs for RCU tasks trace. */
+static void show_stalled_ipi_trace(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ if (per_cpu(trc_ipi_to_cpu, cpu))
+ pr_alert("\tIPI outstanding to CPU %d\n", cpu);
+}
+
+/* Do one scan of the holdout list. */
+static void check_all_holdout_tasks_trace(struct list_head *hop,
+ bool needreport, bool *firstreport)
+{
+ struct task_struct *g, *t;
+
+ // Disable CPU hotplug across the holdout list scan for IPIs.
+ cpus_read_lock();
+
+ list_for_each_entry_safe(t, g, hop, trc_holdout_list) {
+ // If safe and needed, try to check the current task.
+ if (READ_ONCE(t->trc_ipi_to_cpu) == -1 &&
+ !(rcu_ld_need_qs(t) & TRC_NEED_QS_CHECKED))
+ trc_wait_for_one_reader(t, hop);
+
+ // If check succeeded, remove this task from the list.
+ if (smp_load_acquire(&t->trc_ipi_to_cpu) == -1 &&
+ rcu_ld_need_qs(t) == TRC_NEED_QS_CHECKED)
+ trc_del_holdout(t);
+ else if (needreport)
+ show_stalled_task_trace(t, firstreport);
+ cond_resched_tasks_rcu_qs();
+ }
+
+ // Re-enable CPU hotplug now that the holdout list scan has completed.
+ cpus_read_unlock();
+
+ if (needreport) {
+ if (*firstreport)
+ pr_err("INFO: rcu_tasks_trace detected stalls? (Late IPI?)\n");
+ show_stalled_ipi_trace();
+ }
+}
+
+static void rcu_tasks_trace_empty_fn(void *unused)
+{
+}
+
+/* Wait for grace period to complete and provide ordering. */
+static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp)
+{
+ int cpu;
+
+ // Wait for any lingering IPI handlers to complete. Note that
+ // if a CPU has gone offline or transitioned to userspace in the
+ // meantime, all IPI handlers should have been drained beforehand.
+ // Yes, this assumes that CPUs process IPIs in order. If that ever
+ // changes, there will need to be a recheck and/or timed wait.
+ for_each_online_cpu(cpu)
+ if (WARN_ON_ONCE(smp_load_acquire(per_cpu_ptr(&trc_ipi_to_cpu, cpu))))
+ smp_call_function_single(cpu, rcu_tasks_trace_empty_fn, NULL, 1);
+
+ smp_mb(); // Caller's code must be ordered after wakeup.
+ // Pairs with pretty much every ordering primitive.
+}
+
+/* Report any needed quiescent state for this exiting task. */
+static void exit_tasks_rcu_finish_trace(struct task_struct *t)
+{
+ union rcu_special trs = READ_ONCE(t->trc_reader_special);
+
+ rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED);
+ WARN_ON_ONCE(READ_ONCE(t->trc_reader_nesting));
+ if (WARN_ON_ONCE(rcu_ld_need_qs(t) & TRC_NEED_QS || trs.b.blocked))
+ rcu_read_unlock_trace_special(t);
+ else
+ WRITE_ONCE(t->trc_reader_nesting, 0);
+}
+
+/**
+ * call_rcu_tasks_trace() - Queue a callback trace task-based grace period
+ * @rhp: structure to be used for queueing the RCU updates.
+ * @func: actual callback function to be invoked after the grace period
+ *
+ * The callback function will be invoked some time after a trace rcu-tasks
+ * grace period elapses, in other words after all currently executing
+ * trace rcu-tasks read-side critical sections have completed. These
+ * read-side critical sections are delimited by calls to rcu_read_lock_trace()
+ * and rcu_read_unlock_trace().
+ *
+ * See the description of call_rcu() for more detailed information on
+ * memory ordering guarantees.
+ */
+void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func)
+{
+ call_rcu_tasks_generic(rhp, func, &rcu_tasks_trace);
+}
+EXPORT_SYMBOL_GPL(call_rcu_tasks_trace);
+
+/**
+ * synchronize_rcu_tasks_trace - wait for a trace rcu-tasks grace period
+ *
+ * Control will return to the caller some time after a trace rcu-tasks
+ * grace period has elapsed, in other words after all currently executing
+ * trace rcu-tasks read-side critical sections have elapsed. These read-side
+ * critical sections are delimited by calls to rcu_read_lock_trace()
+ * and rcu_read_unlock_trace().
+ *
+ * This is a very specialized primitive, intended only for a few uses in
+ * tracing and other situations requiring manipulation of function preambles
+ * and profiling hooks. The synchronize_rcu_tasks_trace() function is not
+ * (yet) intended for heavy use from multiple CPUs.
+ *
+ * See the description of synchronize_rcu() for more detailed information
+ * on memory ordering guarantees.
+ */
+void synchronize_rcu_tasks_trace(void)
+{
+ RCU_LOCKDEP_WARN(lock_is_held(&rcu_trace_lock_map), "Illegal synchronize_rcu_tasks_trace() in RCU Tasks Trace read-side critical section");
+ synchronize_rcu_tasks_generic(&rcu_tasks_trace);
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu_tasks_trace);
+
+/**
+ * rcu_barrier_tasks_trace - Wait for in-flight call_rcu_tasks_trace() callbacks.
+ *
+ * Although the current implementation is guaranteed to wait, it is not
+ * obligated to, for example, if there are no pending callbacks.
+ */
+void rcu_barrier_tasks_trace(void)
+{
+ rcu_barrier_tasks_generic(&rcu_tasks_trace);
+}
+EXPORT_SYMBOL_GPL(rcu_barrier_tasks_trace);
+
+int rcu_tasks_trace_lazy_ms = -1;
+module_param(rcu_tasks_trace_lazy_ms, int, 0444);
+
+static int __init rcu_spawn_tasks_trace_kthread(void)
+{
+ if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) {
+ rcu_tasks_trace.gp_sleep = HZ / 10;
+ rcu_tasks_trace.init_fract = HZ / 10;
+ } else {
+ rcu_tasks_trace.gp_sleep = HZ / 200;
+ if (rcu_tasks_trace.gp_sleep <= 0)
+ rcu_tasks_trace.gp_sleep = 1;
+ rcu_tasks_trace.init_fract = HZ / 200;
+ if (rcu_tasks_trace.init_fract <= 0)
+ rcu_tasks_trace.init_fract = 1;
+ }
+ if (rcu_tasks_trace_lazy_ms >= 0)
+ rcu_tasks_trace.lazy_jiffies = msecs_to_jiffies(rcu_tasks_trace_lazy_ms);
+ rcu_tasks_trace.pregp_func = rcu_tasks_trace_pregp_step;
+ rcu_tasks_trace.postscan_func = rcu_tasks_trace_postscan;
+ rcu_tasks_trace.holdouts_func = check_all_holdout_tasks_trace;
+ rcu_tasks_trace.postgp_func = rcu_tasks_trace_postgp;
+ rcu_spawn_tasks_kthread_generic(&rcu_tasks_trace);
+ return 0;
+}
+
+#if !defined(CONFIG_TINY_RCU)
+void show_rcu_tasks_trace_gp_kthread(void)
+{
+ char buf[64];
+
+ snprintf(buf, sizeof(buf), "N%lu h:%lu/%lu/%lu",
+ data_race(n_trc_holdouts),
+ data_race(n_heavy_reader_ofl_updates),
+ data_race(n_heavy_reader_updates),
+ data_race(n_heavy_reader_attempts));
+ show_rcu_tasks_generic_gp_kthread(&rcu_tasks_trace, buf);
+}
+EXPORT_SYMBOL_GPL(show_rcu_tasks_trace_gp_kthread);
+
+void rcu_tasks_trace_torture_stats_print(char *tt, char *tf)
+{
+ rcu_tasks_torture_stats_print_generic(&rcu_tasks_trace, tt, tf, "");
+}
+EXPORT_SYMBOL_GPL(rcu_tasks_trace_torture_stats_print);
+#endif // !defined(CONFIG_TINY_RCU)
+
+struct task_struct *get_rcu_tasks_trace_gp_kthread(void)
+{
+ return rcu_tasks_trace.kthread_ptr;
+}
+EXPORT_SYMBOL_GPL(get_rcu_tasks_trace_gp_kthread);
+
+void rcu_tasks_trace_get_gp_data(int *flags, unsigned long *gp_seq)
+{
+ *flags = 0;
+ *gp_seq = rcu_seq_current(&rcu_tasks_trace.tasks_gp_seq);
+}
+EXPORT_SYMBOL_GPL(rcu_tasks_trace_get_gp_data);
+
+#else /* #ifdef CONFIG_TASKS_TRACE_RCU */
+static void exit_tasks_rcu_finish_trace(struct task_struct *t) { }
+#endif /* #else #ifdef CONFIG_TASKS_TRACE_RCU */
+
+#ifndef CONFIG_TINY_RCU
+void show_rcu_tasks_gp_kthreads(void)
+{
+ show_rcu_tasks_classic_gp_kthread();
+ show_rcu_tasks_rude_gp_kthread();
+ show_rcu_tasks_trace_gp_kthread();
+}
+#endif /* #ifndef CONFIG_TINY_RCU */
+
+#ifdef CONFIG_PROVE_RCU
+struct rcu_tasks_test_desc {
+ struct rcu_head rh;
+ const char *name;
+ bool notrun;
+ unsigned long runstart;
+};
+
+static struct rcu_tasks_test_desc tests[] = {
+ {
+ .name = "call_rcu_tasks()",
+ /* If not defined, the test is skipped. */
+ .notrun = IS_ENABLED(CONFIG_TASKS_RCU),
+ },
+ {
+ .name = "call_rcu_tasks_trace()",
+ /* If not defined, the test is skipped. */
+ .notrun = IS_ENABLED(CONFIG_TASKS_TRACE_RCU)
+ }
+};
+
+#if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU)
+static void test_rcu_tasks_callback(struct rcu_head *rhp)
+{
+ struct rcu_tasks_test_desc *rttd =
+ container_of(rhp, struct rcu_tasks_test_desc, rh);
+
+ pr_info("Callback from %s invoked.\n", rttd->name);
+
+ rttd->notrun = false;
+}
+#endif // #if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU)
+
+static void rcu_tasks_initiate_self_tests(void)
+{
+#ifdef CONFIG_TASKS_RCU
+ pr_info("Running RCU Tasks wait API self tests\n");
+ tests[0].runstart = jiffies;
+ synchronize_rcu_tasks();
+ call_rcu_tasks(&tests[0].rh, test_rcu_tasks_callback);
+#endif
+
+#ifdef CONFIG_TASKS_RUDE_RCU
+ pr_info("Running RCU Tasks Rude wait API self tests\n");
+ synchronize_rcu_tasks_rude();
+#endif
+
+#ifdef CONFIG_TASKS_TRACE_RCU
+ pr_info("Running RCU Tasks Trace wait API self tests\n");
+ tests[1].runstart = jiffies;
+ synchronize_rcu_tasks_trace();
+ call_rcu_tasks_trace(&tests[1].rh, test_rcu_tasks_callback);
+#endif
+}
+
+/*
+ * Return: 0 - test passed
+ * 1 - test failed, but have not timed out yet
+ * -1 - test failed and timed out
+ */
+static int rcu_tasks_verify_self_tests(void)
+{
+ int ret = 0;
+ int i;
+ unsigned long bst = rcu_task_stall_timeout;
+
+ if (bst <= 0 || bst > RCU_TASK_BOOT_STALL_TIMEOUT)
+ bst = RCU_TASK_BOOT_STALL_TIMEOUT;
+ for (i = 0; i < ARRAY_SIZE(tests); i++) {
+ while (tests[i].notrun) { // still hanging.
+ if (time_after(jiffies, tests[i].runstart + bst)) {
+ pr_err("%s has failed boot-time tests.\n", tests[i].name);
+ ret = -1;
+ break;
+ }
+ ret = 1;
+ break;
+ }
+ }
+ WARN_ON(ret < 0);
+
+ return ret;
+}
+
+/*
+ * Repeat the rcu_tasks_verify_self_tests() call once every second until the
+ * test passes or has timed out.
+ */
+static struct delayed_work rcu_tasks_verify_work;
+static void rcu_tasks_verify_work_fn(struct work_struct *work __maybe_unused)
+{
+ int ret = rcu_tasks_verify_self_tests();
+
+ if (ret <= 0)
+ return;
+
+ /* Test fails but not timed out yet, reschedule another check */
+ schedule_delayed_work(&rcu_tasks_verify_work, HZ);
+}
+
+static int rcu_tasks_verify_schedule_work(void)
+{
+ INIT_DELAYED_WORK(&rcu_tasks_verify_work, rcu_tasks_verify_work_fn);
+ rcu_tasks_verify_work_fn(NULL);
+ return 0;
+}
+late_initcall(rcu_tasks_verify_schedule_work);
+#else /* #ifdef CONFIG_PROVE_RCU */
+static void rcu_tasks_initiate_self_tests(void) { }
+#endif /* #else #ifdef CONFIG_PROVE_RCU */
+
+void __init tasks_cblist_init_generic(void)
+{
+ lockdep_assert_irqs_disabled();
+ WARN_ON(num_online_cpus() > 1);
+
+#ifdef CONFIG_TASKS_RCU
+ cblist_init_generic(&rcu_tasks);
+#endif
+
+#ifdef CONFIG_TASKS_RUDE_RCU
+ cblist_init_generic(&rcu_tasks_rude);
+#endif
+
+#ifdef CONFIG_TASKS_TRACE_RCU
+ cblist_init_generic(&rcu_tasks_trace);
+#endif
+}
+
+static int __init rcu_init_tasks_generic(void)
+{
+#ifdef CONFIG_TASKS_RCU
+ rcu_spawn_tasks_kthread();
+#endif
+
+#ifdef CONFIG_TASKS_RUDE_RCU
+ rcu_spawn_tasks_rude_kthread();
+#endif
+
+#ifdef CONFIG_TASKS_TRACE_RCU
+ rcu_spawn_tasks_trace_kthread();
+#endif
+
+ // Run the self-tests.
+ rcu_tasks_initiate_self_tests();
+
+ return 0;
+}
+core_initcall(rcu_init_tasks_generic);
+
+#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */
+static inline void rcu_tasks_bootup_oddness(void) {}
+#endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 069742d61c68..585cade21010 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -1,23 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition.
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
* Copyright IBM Corporation, 2008
*
- * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+ * Author: Paul E. McKenney <paulmck@linux.ibm.com>
*
* For detailed explanation of Read-Copy Update mechanism see -
* Documentation/RCU
@@ -25,7 +12,7 @@
#include <linux/completion.h>
#include <linux/interrupt.h>
#include <linux/notifier.h>
-#include <linux/rcupdate.h>
+#include <linux/rcupdate_wait.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/mutex.h>
@@ -35,108 +22,43 @@
#include <linux/time.h>
#include <linux/cpu.h>
#include <linux/prefetch.h>
-#include <linux/ftrace_event.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
#include "rcu.h"
-/* Forward declarations for tiny_plugin.h. */
-struct rcu_ctrlblk;
-static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp);
-static void rcu_process_callbacks(struct softirq_action *unused);
-static void __call_rcu(struct rcu_head *head,
- void (*func)(struct rcu_head *rcu),
- struct rcu_ctrlblk *rcp);
-
-#include "tiny_plugin.h"
-
-/*
- * Enter idle, which is an extended quiescent state if we have fully
- * entered that mode.
- */
-void rcu_idle_enter(void)
-{
-}
-EXPORT_SYMBOL_GPL(rcu_idle_enter);
-
-/*
- * Exit an interrupt handler towards idle.
- */
-void rcu_irq_exit(void)
-{
-}
-EXPORT_SYMBOL_GPL(rcu_irq_exit);
+/* Global control variables for rcupdate callback mechanism. */
+struct rcu_ctrlblk {
+ struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */
+ struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
+ struct rcu_head **curtail; /* ->next pointer of last CB. */
+ unsigned long gp_seq; /* Grace-period counter. */
+};
-/*
- * Exit idle, so that we are no longer in an extended quiescent state.
- */
-void rcu_idle_exit(void)
-{
-}
-EXPORT_SYMBOL_GPL(rcu_idle_exit);
+/* Definition for rcupdate control block. */
+static struct rcu_ctrlblk rcu_ctrlblk = {
+ .donetail = &rcu_ctrlblk.rcucblist,
+ .curtail = &rcu_ctrlblk.rcucblist,
+ .gp_seq = 0 - 300UL,
+};
-/*
- * Enter an interrupt handler, moving away from idle.
- */
-void rcu_irq_enter(void)
+void rcu_barrier(void)
{
+ wait_rcu_gp(call_rcu_hurry);
}
-EXPORT_SYMBOL_GPL(rcu_irq_enter);
-
-#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE)
-
-/*
- * Test whether RCU thinks that the current CPU is idle.
- */
-bool notrace __rcu_is_watching(void)
-{
- return true;
-}
-EXPORT_SYMBOL(__rcu_is_watching);
-
-#endif /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */
-
-/*
- * Helper function for rcu_sched_qs() and rcu_bh_qs().
- * Also irqs are disabled to avoid confusion due to interrupt handlers
- * invoking call_rcu().
- */
-static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
-{
- RCU_TRACE(reset_cpu_stall_ticks(rcp));
- if (rcp->donetail != rcp->curtail) {
- rcp->donetail = rcp->curtail;
- return 1;
- }
-
- return 0;
-}
-
-/*
- * Record an rcu quiescent state. And an rcu_bh quiescent state while we
- * are at it, given that any rcu quiescent state is also an rcu_bh
- * quiescent state. Use "+" instead of "||" to defeat short circuiting.
- */
-void rcu_sched_qs(void)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
- rcu_qsctr_help(&rcu_bh_ctrlblk))
- raise_softirq(RCU_SOFTIRQ);
- local_irq_restore(flags);
-}
+EXPORT_SYMBOL(rcu_barrier);
-/*
- * Record an rcu_bh quiescent state.
- */
-void rcu_bh_qs(void)
+/* Record an rcu quiescent state. */
+void rcu_qs(void)
{
unsigned long flags;
local_irq_save(flags);
- if (rcu_qsctr_help(&rcu_bh_ctrlblk))
- raise_softirq(RCU_SOFTIRQ);
+ if (rcu_ctrlblk.donetail != rcu_ctrlblk.curtail) {
+ rcu_ctrlblk.donetail = rcu_ctrlblk.curtail;
+ raise_softirq_irqoff(RCU_SOFTIRQ);
+ }
+ WRITE_ONCE(rcu_ctrlblk.gp_seq, rcu_ctrlblk.gp_seq + 2);
local_irq_restore(flags);
}
@@ -146,138 +68,185 @@ void rcu_bh_qs(void)
* be called from hardirq context. It is normally called from the
* scheduling-clock interrupt.
*/
-void rcu_check_callbacks(int user)
+void rcu_sched_clock_irq(int user)
{
- RCU_TRACE(check_cpu_stalls());
if (user)
- rcu_sched_qs();
- else if (!in_softirq())
- rcu_bh_qs();
- if (user)
- rcu_note_voluntary_context_switch(current);
+ rcu_qs();
+ else if (rcu_ctrlblk.donetail != rcu_ctrlblk.curtail)
+ set_need_resched_current();
}
/*
- * Invoke the RCU callbacks on the specified rcu_ctrlkblk structure
- * whose grace period has elapsed.
+ * Reclaim the specified callback, either by invoking it for non-kfree cases or
+ * freeing it directly (for kfree). Return true if kfreeing, false otherwise.
*/
-static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
+static inline bool rcu_reclaim_tiny(struct rcu_head *head)
+{
+ rcu_callback_t f;
+
+ rcu_lock_acquire(&rcu_callback_map);
+
+ trace_rcu_invoke_callback("", head);
+ f = head->func;
+ debug_rcu_head_callback(head);
+ WRITE_ONCE(head->func, (rcu_callback_t)0L);
+ f(head);
+ rcu_lock_release(&rcu_callback_map);
+ return false;
+}
+
+/* Invoke the RCU callbacks whose grace period has elapsed. */
+static __latent_entropy void rcu_process_callbacks(void)
{
- const char *rn = NULL;
struct rcu_head *next, *list;
unsigned long flags;
- RCU_TRACE(int cb_count = 0);
/* Move the ready-to-invoke callbacks to a local list. */
local_irq_save(flags);
- RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1));
- list = rcp->rcucblist;
- rcp->rcucblist = *rcp->donetail;
- *rcp->donetail = NULL;
- if (rcp->curtail == rcp->donetail)
- rcp->curtail = &rcp->rcucblist;
- rcp->donetail = &rcp->rcucblist;
+ if (rcu_ctrlblk.donetail == &rcu_ctrlblk.rcucblist) {
+ /* No callbacks ready, so just leave. */
+ local_irq_restore(flags);
+ return;
+ }
+ list = rcu_ctrlblk.rcucblist;
+ rcu_ctrlblk.rcucblist = *rcu_ctrlblk.donetail;
+ *rcu_ctrlblk.donetail = NULL;
+ if (rcu_ctrlblk.curtail == rcu_ctrlblk.donetail)
+ rcu_ctrlblk.curtail = &rcu_ctrlblk.rcucblist;
+ rcu_ctrlblk.donetail = &rcu_ctrlblk.rcucblist;
local_irq_restore(flags);
/* Invoke the callbacks on the local list. */
- RCU_TRACE(rn = rcp->name);
while (list) {
next = list->next;
prefetch(next);
debug_rcu_head_unqueue(list);
- local_bh_disable();
- __rcu_reclaim(rn, list);
- local_bh_enable();
+ rcu_reclaim_tiny(list);
list = next;
- RCU_TRACE(cb_count++);
}
- RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
- RCU_TRACE(trace_rcu_batch_end(rcp->name,
- cb_count, 0, need_resched(),
- is_idle_task(current),
- false));
-}
-
-static void rcu_process_callbacks(struct softirq_action *unused)
-{
- __rcu_process_callbacks(&rcu_sched_ctrlblk);
- __rcu_process_callbacks(&rcu_bh_ctrlblk);
}
/*
* Wait for a grace period to elapse. But it is illegal to invoke
- * synchronize_sched() from within an RCU read-side critical section.
- * Therefore, any legal call to synchronize_sched() is a quiescent
- * state, and so on a UP system, synchronize_sched() need do nothing.
- * Ditto for synchronize_rcu_bh(). (But Lai Jiangshan points out the
- * benefits of doing might_sleep() to reduce latency.)
+ * synchronize_rcu() from within an RCU read-side critical section.
+ * Therefore, any legal call to synchronize_rcu() is a quiescent state,
+ * and so on a UP system, synchronize_rcu() need do nothing, other than
+ * let the polled APIs know that another grace period elapsed.
*
- * Cool, huh? (Due to Josh Triplett.)
+ * (But Lai Jiangshan points out the benefits of doing might_sleep()
+ * to reduce latency.)
*
- * But we want to make this a static inline later. The cond_resched()
- * currently makes this problematic.
+ * Cool, huh? (Due to Josh Triplett.)
*/
-void synchronize_sched(void)
+void synchronize_rcu(void)
{
- rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
- !lock_is_held(&rcu_lock_map) &&
- !lock_is_held(&rcu_sched_lock_map),
- "Illegal synchronize_sched() in RCU read-side critical section");
- cond_resched();
+ RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
+ lock_is_held(&rcu_lock_map) ||
+ lock_is_held(&rcu_sched_lock_map),
+ "Illegal synchronize_rcu() in RCU read-side critical section");
+ preempt_disable();
+ WRITE_ONCE(rcu_ctrlblk.gp_seq, rcu_ctrlblk.gp_seq + 2);
+ preempt_enable();
}
-EXPORT_SYMBOL_GPL(synchronize_sched);
+EXPORT_SYMBOL_GPL(synchronize_rcu);
/*
- * Helper function for call_rcu() and call_rcu_bh().
+ * Post an RCU callback to be invoked after the end of an RCU grace
+ * period. But since we have but one CPU, that would be after any
+ * quiescent state.
*/
-static void __call_rcu(struct rcu_head *head,
- void (*func)(struct rcu_head *rcu),
- struct rcu_ctrlblk *rcp)
+void call_rcu(struct rcu_head *head, rcu_callback_t func)
{
+ static atomic_t doublefrees;
unsigned long flags;
- debug_rcu_head_queue(head);
+ if (debug_rcu_head_queue(head)) {
+ if (atomic_inc_return(&doublefrees) < 4) {
+ pr_err("%s(): Double-freed CB %p->%pS()!!! ", __func__, head, head->func);
+ mem_dump_obj(head);
+ }
+ return;
+ }
+
head->func = func;
head->next = NULL;
local_irq_save(flags);
- *rcp->curtail = head;
- rcp->curtail = &head->next;
- RCU_TRACE(rcp->qlen++);
+ *rcu_ctrlblk.curtail = head;
+ rcu_ctrlblk.curtail = &head->next;
local_irq_restore(flags);
if (unlikely(is_idle_task(current))) {
- /* force scheduling for rcu_sched_qs() */
+ /* force scheduling for rcu_qs() */
resched_cpu(0);
}
}
+EXPORT_SYMBOL_GPL(call_rcu);
/*
- * Post an RCU callback to be invoked after the end of an RCU-sched grace
- * period. But since we have but one CPU, that would be after any
- * quiescent state.
+ * Store a grace-period-counter "cookie". For more information,
+ * see the Tree RCU header comment.
*/
-void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+void get_completed_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
{
- __call_rcu(head, func, &rcu_sched_ctrlblk);
+ rgosp->rgos_norm = RCU_GET_STATE_COMPLETED;
}
-EXPORT_SYMBOL_GPL(call_rcu_sched);
+EXPORT_SYMBOL_GPL(get_completed_synchronize_rcu_full);
/*
- * Post an RCU bottom-half callback to be invoked after any subsequent
- * quiescent state.
+ * Return a grace-period-counter "cookie". For more information,
+ * see the Tree RCU header comment.
*/
-void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+unsigned long get_state_synchronize_rcu(void)
{
- __call_rcu(head, func, &rcu_bh_ctrlblk);
+ return READ_ONCE(rcu_ctrlblk.gp_seq);
}
-EXPORT_SYMBOL_GPL(call_rcu_bh);
+EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
+
+/*
+ * Return a grace-period-counter "cookie" and ensure that a future grace
+ * period completes. For more information, see the Tree RCU header comment.
+ */
+unsigned long start_poll_synchronize_rcu(void)
+{
+ unsigned long gp_seq = get_state_synchronize_rcu();
+
+ if (unlikely(is_idle_task(current))) {
+ /* force scheduling for rcu_qs() */
+ resched_cpu(0);
+ }
+ return gp_seq;
+}
+EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
+
+/*
+ * Return true if the grace period corresponding to oldstate has completed
+ * and false otherwise. For more information, see the Tree RCU header
+ * comment.
+ */
+bool poll_state_synchronize_rcu(unsigned long oldstate)
+{
+ return oldstate == RCU_GET_STATE_COMPLETED || READ_ONCE(rcu_ctrlblk.gp_seq) != oldstate;
+}
+EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu);
+
+#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST)
+unsigned long long rcutorture_gather_gp_seqs(void)
+{
+ return READ_ONCE(rcu_ctrlblk.gp_seq) & 0xffffULL;
+}
+EXPORT_SYMBOL_GPL(rcutorture_gather_gp_seqs);
+
+void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp, size_t len)
+{
+ snprintf(cp, len, "g%04llx", seqs & 0xffffULL);
+}
+EXPORT_SYMBOL_GPL(rcutorture_format_gp_seqs);
+#endif
void __init rcu_init(void)
{
open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
- RCU_TRACE(reset_cpu_stall_ticks(&rcu_sched_ctrlblk));
- RCU_TRACE(reset_cpu_stall_ticks(&rcu_bh_ctrlblk));
-
rcu_early_boot_tests();
+ tasks_cblist_init_generic();
}
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
deleted file mode 100644
index f94e209a10d6..000000000000
--- a/kernel/rcu/tiny_plugin.h
+++ /dev/null
@@ -1,173 +0,0 @@
-/*
- * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition
- * Internal non-public definitions that provide either classic
- * or preemptible semantics.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
- * Copyright (c) 2010 Linaro
- *
- * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
- */
-
-#include <linux/kthread.h>
-#include <linux/module.h>
-#include <linux/debugfs.h>
-#include <linux/seq_file.h>
-
-/* Global control variables for rcupdate callback mechanism. */
-struct rcu_ctrlblk {
- struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */
- struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
- struct rcu_head **curtail; /* ->next pointer of last CB. */
- RCU_TRACE(long qlen); /* Number of pending CBs. */
- RCU_TRACE(unsigned long gp_start); /* Start time for stalls. */
- RCU_TRACE(unsigned long ticks_this_gp); /* Statistic for stalls. */
- RCU_TRACE(unsigned long jiffies_stall); /* Jiffies at next stall. */
- RCU_TRACE(const char *name); /* Name of RCU type. */
-};
-
-/* Definition for rcupdate control block. */
-static struct rcu_ctrlblk rcu_sched_ctrlblk = {
- .donetail = &rcu_sched_ctrlblk.rcucblist,
- .curtail = &rcu_sched_ctrlblk.rcucblist,
- RCU_TRACE(.name = "rcu_sched")
-};
-
-static struct rcu_ctrlblk rcu_bh_ctrlblk = {
- .donetail = &rcu_bh_ctrlblk.rcucblist,
- .curtail = &rcu_bh_ctrlblk.rcucblist,
- RCU_TRACE(.name = "rcu_bh")
-};
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-#include <linux/kernel_stat.h>
-
-int rcu_scheduler_active __read_mostly;
-EXPORT_SYMBOL_GPL(rcu_scheduler_active);
-
-/*
- * During boot, we forgive RCU lockdep issues. After this function is
- * invoked, we start taking RCU lockdep issues seriously.
- */
-void __init rcu_scheduler_starting(void)
-{
- WARN_ON(nr_context_switches() > 0);
- rcu_scheduler_active = 1;
-}
-
-#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
-
-#ifdef CONFIG_RCU_TRACE
-
-static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- rcp->qlen -= n;
- local_irq_restore(flags);
-}
-
-/*
- * Dump statistics for TINY_RCU, such as they are.
- */
-static int show_tiny_stats(struct seq_file *m, void *unused)
-{
- seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen);
- seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen);
- return 0;
-}
-
-static int show_tiny_stats_open(struct inode *inode, struct file *file)
-{
- return single_open(file, show_tiny_stats, NULL);
-}
-
-static const struct file_operations show_tiny_stats_fops = {
- .owner = THIS_MODULE,
- .open = show_tiny_stats_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
-static struct dentry *rcudir;
-
-static int __init rcutiny_trace_init(void)
-{
- struct dentry *retval;
-
- rcudir = debugfs_create_dir("rcu", NULL);
- if (!rcudir)
- goto free_out;
- retval = debugfs_create_file("rcudata", 0444, rcudir,
- NULL, &show_tiny_stats_fops);
- if (!retval)
- goto free_out;
- return 0;
-free_out:
- debugfs_remove_recursive(rcudir);
- return 1;
-}
-
-static void __exit rcutiny_trace_cleanup(void)
-{
- debugfs_remove_recursive(rcudir);
-}
-
-module_init(rcutiny_trace_init);
-module_exit(rcutiny_trace_cleanup);
-
-MODULE_AUTHOR("Paul E. McKenney");
-MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation");
-MODULE_LICENSE("GPL");
-
-static void check_cpu_stall(struct rcu_ctrlblk *rcp)
-{
- unsigned long j;
- unsigned long js;
-
- if (rcu_cpu_stall_suppress)
- return;
- rcp->ticks_this_gp++;
- j = jiffies;
- js = ACCESS_ONCE(rcp->jiffies_stall);
- if (rcp->rcucblist && ULONG_CMP_GE(j, js)) {
- pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n",
- rcp->name, rcp->ticks_this_gp, DYNTICK_TASK_EXIT_IDLE,
- jiffies - rcp->gp_start, rcp->qlen);
- dump_stack();
- ACCESS_ONCE(rcp->jiffies_stall) = jiffies +
- 3 * rcu_jiffies_till_stall_check() + 3;
- } else if (ULONG_CMP_GE(j, js)) {
- ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check();
- }
-}
-
-static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp)
-{
- rcp->ticks_this_gp = 0;
- rcp->gp_start = jiffies;
- ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check();
-}
-
-static void check_cpu_stalls(void)
-{
- RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk));
- RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk));
-}
-
-#endif /* #ifdef CONFIG_RCU_TRACE */
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 8cf7304b2867..293bbd9ac3f4 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1,47 +1,40 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
- * Read-Copy Update mechanism for mutual exclusion
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
+ * Read-Copy Update mechanism for mutual exclusion (tree-based version)
*
* Copyright IBM Corporation, 2008
*
* Authors: Dipankar Sarma <dipankar@in.ibm.com>
* Manfred Spraul <manfred@colorfullife.com>
- * Paul E. McKenney <paulmck@linux.vnet.ibm.com> Hierarchical version
+ * Paul E. McKenney <paulmck@linux.ibm.com>
*
- * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
+ * Based on the original work by Paul McKenney <paulmck@linux.ibm.com>
* and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
*
* For detailed explanation of Read-Copy Update mechanism see -
* Documentation/RCU
*/
+
+#define pr_fmt(fmt) "rcu: " fmt
+
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/smp.h>
-#include <linux/rcupdate.h>
+#include <linux/rcupdate_wait.h>
#include <linux/interrupt.h>
#include <linux/sched.h>
+#include <linux/sched/debug.h>
#include <linux/nmi.h>
#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/export.h>
#include <linux/completion.h>
+#include <linux/kmemleak.h>
#include <linux/moduleparam.h>
-#include <linux/module.h>
+#include <linux/panic.h>
+#include <linux/panic_notifier.h>
#include <linux/percpu.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
@@ -50,90 +43,101 @@
#include <linux/kernel_stat.h>
#include <linux/wait.h>
#include <linux/kthread.h>
+#include <uapi/linux/sched/types.h>
#include <linux/prefetch.h>
#include <linux/delay.h>
-#include <linux/stop_machine.h>
#include <linux/random.h>
-#include <linux/ftrace_event.h>
+#include <linux/trace_events.h>
#include <linux/suspend.h>
+#include <linux/ftrace.h>
+#include <linux/tick.h>
+#include <linux/sysrq.h>
+#include <linux/kprobes.h>
+#include <linux/gfp.h>
+#include <linux/oom.h>
+#include <linux/smpboot.h>
+#include <linux/jiffies.h>
+#include <linux/slab.h>
+#include <linux/sched/isolation.h>
+#include <linux/sched/clock.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/kasan.h>
+#include <linux/context_tracking.h>
+#include "../time/tick-internal.h"
#include "tree.h"
#include "rcu.h"
-MODULE_ALIAS("rcutree");
#ifdef MODULE_PARAM_PREFIX
#undef MODULE_PARAM_PREFIX
#endif
#define MODULE_PARAM_PREFIX "rcutree."
/* Data structures. */
+static void rcu_sr_normal_gp_cleanup_work(struct work_struct *);
-static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
-static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = {
+ .gpwrap = true,
+};
-/*
- * In order to export the rcu_state name to the tracing tools, it
- * needs to be added in the __tracepoint_string section.
- * This requires defining a separate variable tp_<sname>_varname
- * that points to the string being used, and this will allow
- * the tracing userspace tools to be able to decipher the string
- * address to the matching string.
- */
-#ifdef CONFIG_TRACING
-# define DEFINE_RCU_TPS(sname) \
-static char sname##_varname[] = #sname; \
-static const char *tp_##sname##_varname __used __tracepoint_string = sname##_varname;
-# define RCU_STATE_NAME(sname) sname##_varname
-#else
-# define DEFINE_RCU_TPS(sname)
-# define RCU_STATE_NAME(sname) __stringify(sname)
+int rcu_get_gpwrap_count(int cpu)
+{
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+
+ return READ_ONCE(rdp->gpwrap_count);
+}
+EXPORT_SYMBOL_GPL(rcu_get_gpwrap_count);
+
+static struct rcu_state rcu_state = {
+ .level = { &rcu_state.node[0] },
+ .gp_state = RCU_GP_IDLE,
+ .gp_seq = (0UL - 300UL) << RCU_SEQ_CTR_SHIFT,
+ .barrier_mutex = __MUTEX_INITIALIZER(rcu_state.barrier_mutex),
+ .barrier_lock = __RAW_SPIN_LOCK_UNLOCKED(rcu_state.barrier_lock),
+ .name = RCU_NAME,
+ .abbr = RCU_ABBR,
+ .exp_mutex = __MUTEX_INITIALIZER(rcu_state.exp_mutex),
+ .exp_wake_mutex = __MUTEX_INITIALIZER(rcu_state.exp_wake_mutex),
+ .ofl_lock = __ARCH_SPIN_LOCK_UNLOCKED,
+ .srs_cleanup_work = __WORK_INITIALIZER(rcu_state.srs_cleanup_work,
+ rcu_sr_normal_gp_cleanup_work),
+ .srs_cleanups_pending = ATOMIC_INIT(0),
+#ifdef CONFIG_RCU_NOCB_CPU
+ .nocb_mutex = __MUTEX_INITIALIZER(rcu_state.nocb_mutex),
#endif
+};
-#define RCU_STATE_INITIALIZER(sname, sabbr, cr) \
-DEFINE_RCU_TPS(sname) \
-DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, sname##_data); \
-struct rcu_state sname##_state = { \
- .level = { &sname##_state.node[0] }, \
- .rda = &sname##_data, \
- .call = cr, \
- .fqs_state = RCU_GP_IDLE, \
- .gpnum = 0UL - 300UL, \
- .completed = 0UL - 300UL, \
- .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \
- .orphan_nxttail = &sname##_state.orphan_nxtlist, \
- .orphan_donetail = &sname##_state.orphan_donelist, \
- .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
- .name = RCU_STATE_NAME(sname), \
- .abbr = sabbr, \
-}
-
-RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
-RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh);
-
-static struct rcu_state *rcu_state_p;
-LIST_HEAD(rcu_struct_flavors);
-
-/* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */
-static int rcu_fanout_leaf = CONFIG_RCU_FANOUT_LEAF;
+/* Dump rcu_node combining tree at boot to verify correct setup. */
+static bool dump_tree;
+module_param(dump_tree, bool, 0444);
+/* By default, use RCU_SOFTIRQ instead of rcuc kthreads. */
+static bool use_softirq = !IS_ENABLED(CONFIG_PREEMPT_RT);
+#ifndef CONFIG_PREEMPT_RT
+module_param(use_softirq, bool, 0444);
+#endif
+/* Control rcu_node-tree auto-balancing at boot time. */
+static bool rcu_fanout_exact;
+module_param(rcu_fanout_exact, bool, 0444);
+/* Increase (but not decrease) the RCU_FANOUT_LEAF at boot time. */
+static int rcu_fanout_leaf = RCU_FANOUT_LEAF;
module_param(rcu_fanout_leaf, int, 0444);
int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
-static int num_rcu_lvl[] = { /* Number of rcu_nodes at specified level. */
- NUM_RCU_LVL_0,
- NUM_RCU_LVL_1,
- NUM_RCU_LVL_2,
- NUM_RCU_LVL_3,
- NUM_RCU_LVL_4,
-};
+/* Number of rcu_nodes at specified level. */
+int num_rcu_lvl[] = NUM_RCU_LVL_INIT;
int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
/*
- * The rcu_scheduler_active variable transitions from zero to one just
- * before the first task is spawned. So when this variable is zero, RCU
- * can assume that there is but one task, allowing RCU to (for example)
- * optimize synchronize_sched() to a simple barrier(). When this variable
- * is one, RCU must actually do all the hard work required to detect real
- * grace periods. This variable is also used to suppress boot-time false
- * positives from lockdep-RCU error checking.
+ * The rcu_scheduler_active variable is initialized to the value
+ * RCU_SCHEDULER_INACTIVE and transitions RCU_SCHEDULER_INIT just before the
+ * first task is spawned. So when this variable is RCU_SCHEDULER_INACTIVE,
+ * RCU can assume that there is but one task, allowing RCU to (for example)
+ * optimize synchronize_rcu() to a simple barrier(). When this variable
+ * is RCU_SCHEDULER_INIT, RCU must actually do all the hard work required
+ * to detect real grace periods. This variable is also used to suppress
+ * boot-time false positives from lockdep-RCU error checking. Finally, it
+ * transitions from RCU_SCHEDULER_INIT to RCU_SCHEDULER_RUNNING after RCU
+ * is fully initialized, including all of its kthreads having been spawned.
*/
int rcu_scheduler_active __read_mostly;
EXPORT_SYMBOL_GPL(rcu_scheduler_active);
@@ -152,97 +156,192 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active);
*/
static int rcu_scheduler_fully_active __read_mostly;
-static void rcu_init_new_rnp(struct rcu_node *rnp_leaf);
-static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf);
-static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
+static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp,
+ unsigned long gps, unsigned long flags);
static void invoke_rcu_core(void);
-static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
+static void rcu_report_exp_rdp(struct rcu_data *rdp);
+static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp);
+static bool rcu_rdp_is_offloaded(struct rcu_data *rdp);
+static bool rcu_rdp_cpu_online(struct rcu_data *rdp);
+static bool rcu_init_invoked(void);
+static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf);
+static void rcu_init_new_rnp(struct rcu_node *rnp_leaf);
-/* rcuc/rcub kthread realtime priority */
-static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO;
-module_param(kthread_prio, int, 0644);
+/*
+ * rcuc/rcub/rcuop kthread realtime priority. The "rcuop"
+ * real-time priority(enabling/disabling) is controlled by
+ * the extra CONFIG_RCU_NOCB_CPU_CB_BOOST configuration.
+ */
+static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0;
+module_param(kthread_prio, int, 0444);
/* Delay in jiffies for grace-period initialization delays, debug only. */
-#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT
-static int gp_init_delay = CONFIG_RCU_TORTURE_TEST_SLOW_INIT_DELAY;
-module_param(gp_init_delay, int, 0644);
-#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT */
-static const int gp_init_delay;
-#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT */
-#define PER_RCU_NODE_PERIOD 10 /* Number of grace periods between delays. */
+
+static int gp_preinit_delay;
+module_param(gp_preinit_delay, int, 0444);
+static int gp_init_delay;
+module_param(gp_init_delay, int, 0444);
+static int gp_cleanup_delay;
+module_param(gp_cleanup_delay, int, 0444);
+static int nohz_full_patience_delay;
+module_param(nohz_full_patience_delay, int, 0444);
+static int nohz_full_patience_delay_jiffies;
+
+// Add delay to rcu_read_unlock() for strict grace periods.
+static int rcu_unlock_delay;
+#ifdef CONFIG_RCU_STRICT_GRACE_PERIOD
+module_param(rcu_unlock_delay, int, 0444);
+#endif
+
+/* Retrieve RCU kthreads priority for rcutorture */
+int rcu_get_gp_kthreads_prio(void)
+{
+ return kthread_prio;
+}
+EXPORT_SYMBOL_GPL(rcu_get_gp_kthreads_prio);
/*
- * Track the rcutorture test sequence number and the update version
- * number within a given test. The rcutorture_testseq is incremented
- * on every rcutorture module load and unload, so has an odd value
- * when a test is running. The rcutorture_vernum is set to zero
- * when rcutorture starts and is incremented on each rcutorture update.
- * These variables enable correlating rcutorture output with the
- * RCU tracing information.
+ * Number of grace periods between delays, normalized by the duration of
+ * the delay. The longer the delay, the more the grace periods between
+ * each delay. The reason for this normalization is that it means that,
+ * for non-zero delays, the overall slowdown of grace periods is constant
+ * regardless of the duration of the delay. This arrangement balances
+ * the need for long delays to increase some race probabilities with the
+ * need for fast grace periods to increase other race probabilities.
*/
-unsigned long rcutorture_testseq;
-unsigned long rcutorture_vernum;
+#define PER_RCU_NODE_PERIOD 3 /* Number of grace periods between delays for debugging. */
/*
- * Compute the mask of online CPUs for the specified rcu_node structure.
- * This will not be stable unless the rcu_node structure's ->lock is
- * held, but the bit corresponding to the current CPU will be stable
- * in most contexts.
+ * Return true if an RCU grace period is in progress. The READ_ONCE()s
+ * permit this function to be invoked without holding the root rcu_node
+ * structure's ->lock, but of course results can be subject to change.
*/
-unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp)
+static int rcu_gp_in_progress(void)
{
- return ACCESS_ONCE(rnp->qsmaskinitnext);
+ return rcu_seq_state(rcu_seq_current(&rcu_state.gp_seq));
}
/*
- * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
- * permit this function to be invoked without holding the root rcu_node
- * structure's ->lock, but of course results can be subject to change.
+ * Return the number of callbacks queued on the specified CPU.
+ * Handles both the nocbs and normal cases.
+ */
+static long rcu_get_n_cbs_cpu(int cpu)
+{
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+
+ if (rcu_segcblist_is_enabled(&rdp->cblist))
+ return rcu_segcblist_n_cbs(&rdp->cblist);
+ return 0;
+}
+
+/**
+ * rcu_softirq_qs - Provide a set of RCU quiescent states in softirq processing
+ *
+ * Mark a quiescent state for RCU, Tasks RCU, and Tasks Trace RCU.
+ * This is a special-purpose function to be used in the softirq
+ * infrastructure and perhaps the occasional long-running softirq
+ * handler.
+ *
+ * Note that from RCU's viewpoint, a call to rcu_softirq_qs() is
+ * equivalent to momentarily completely enabling preemption. For
+ * example, given this code::
+ *
+ * local_bh_disable();
+ * do_something();
+ * rcu_softirq_qs(); // A
+ * do_something_else();
+ * local_bh_enable(); // B
+ *
+ * A call to synchronize_rcu() that began concurrently with the
+ * call to do_something() would be guaranteed to wait only until
+ * execution reached statement A. Without that rcu_softirq_qs(),
+ * that same synchronize_rcu() would instead be guaranteed to wait
+ * until execution reached statement B.
*/
-static int rcu_gp_in_progress(struct rcu_state *rsp)
+void rcu_softirq_qs(void)
{
- return ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum);
+ RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
+ lock_is_held(&rcu_lock_map) ||
+ lock_is_held(&rcu_sched_lock_map),
+ "Illegal rcu_softirq_qs() in RCU read-side critical section");
+ rcu_qs();
+ rcu_preempt_deferred_qs(current);
+ rcu_tasks_qs(current, false);
}
/*
- * Note a quiescent state. Because we do not need to know
- * how many quiescent states passed, just if there was at least
- * one since the start of the grace period, this just sets a flag.
- * The caller must have disabled preemption.
+ * Reset the current CPU's RCU_WATCHING counter to indicate that the
+ * newly onlined CPU is no longer in an extended quiescent state.
+ * This will either leave the counter unchanged, or increment it
+ * to the next non-quiescent value.
+ *
+ * The non-atomic test/increment sequence works because the upper bits
+ * of the ->state variable are manipulated only by the corresponding CPU,
+ * or when the corresponding CPU is offline.
*/
-void rcu_sched_qs(void)
+static void rcu_watching_online(void)
{
- if (!__this_cpu_read(rcu_sched_data.passed_quiesce)) {
- trace_rcu_grace_period(TPS("rcu_sched"),
- __this_cpu_read(rcu_sched_data.gpnum),
- TPS("cpuqs"));
- __this_cpu_write(rcu_sched_data.passed_quiesce, 1);
- }
+ if (ct_rcu_watching() & CT_RCU_WATCHING)
+ return;
+ ct_state_inc(CT_RCU_WATCHING);
+}
+
+/*
+ * Return true if the snapshot returned from ct_rcu_watching()
+ * indicates that RCU is in an extended quiescent state.
+ */
+static bool rcu_watching_snap_in_eqs(int snap)
+{
+ return !(snap & CT_RCU_WATCHING);
}
-void rcu_bh_qs(void)
+/**
+ * rcu_watching_snap_stopped_since() - Has RCU stopped watching a given CPU
+ * since the specified @snap?
+ *
+ * @rdp: The rcu_data corresponding to the CPU for which to check EQS.
+ * @snap: rcu_watching snapshot taken when the CPU wasn't in an EQS.
+ *
+ * Returns true if the CPU corresponding to @rdp has spent some time in an
+ * extended quiescent state since @snap. Note that this doesn't check if it
+ * /still/ is in an EQS, just that it went through one since @snap.
+ *
+ * This is meant to be used in a loop waiting for a CPU to go through an EQS.
+ */
+static bool rcu_watching_snap_stopped_since(struct rcu_data *rdp, int snap)
{
- if (!__this_cpu_read(rcu_bh_data.passed_quiesce)) {
- trace_rcu_grace_period(TPS("rcu_bh"),
- __this_cpu_read(rcu_bh_data.gpnum),
- TPS("cpuqs"));
- __this_cpu_write(rcu_bh_data.passed_quiesce, 1);
- }
+ /*
+ * The first failing snapshot is already ordered against the accesses
+ * performed by the remote CPU after it exits idle.
+ *
+ * The second snapshot therefore only needs to order against accesses
+ * performed by the remote CPU prior to entering idle and therefore can
+ * rely solely on acquire semantics.
+ */
+ if (WARN_ON_ONCE(rcu_watching_snap_in_eqs(snap)))
+ return true;
+
+ return snap != ct_rcu_watching_cpu_acquire(rdp->cpu);
}
-static DEFINE_PER_CPU(int, rcu_sched_qs_mask);
+/*
+ * Return true if the referenced integer is zero while the specified
+ * CPU remains within a single extended quiescent state.
+ */
+bool rcu_watching_zero_in_eqs(int cpu, int *vp)
+{
+ int snap;
-static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
- .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
- .dynticks = ATOMIC_INIT(1),
-#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
- .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE,
- .dynticks_idle = ATOMIC_INIT(1),
-#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
-};
+ // If not quiescent, force back to earlier extended quiescent state.
+ snap = ct_rcu_watching_cpu(cpu) & ~CT_RCU_WATCHING;
+ smp_rmb(); // Order CT state and *vp reads.
+ if (READ_ONCE(*vp))
+ return false; // Non-zero, so report failure;
+ smp_rmb(); // Order *vp read and CT state re-read.
-DEFINE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr);
-EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr);
+ // If still in the same extended quiescent state, we are good!
+ return snap == ct_rcu_watching_cpu(cpu);
+}
/*
* Let the RCU core know that this CPU has gone through the scheduler,
@@ -251,1534 +350,1553 @@ EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr);
* memory barriers to let the RCU core know about it, regardless of what
* this CPU might (or might not) do in the near future.
*
- * We inform the RCU core by emulating a zero-duration dyntick-idle
- * period, which we in turn do by incrementing the ->dynticks counter
- * by two.
+ * We inform the RCU core by emulating a zero-duration dyntick-idle period.
+ *
+ * The caller must have disabled interrupts and must not be idle.
*/
-static void rcu_momentary_dyntick_idle(void)
+notrace void rcu_momentary_eqs(void)
{
- unsigned long flags;
- struct rcu_data *rdp;
- struct rcu_dynticks *rdtp;
- int resched_mask;
- struct rcu_state *rsp;
+ int seq;
- local_irq_save(flags);
+ raw_cpu_write(rcu_data.rcu_need_heavy_qs, false);
+ seq = ct_state_inc(2 * CT_RCU_WATCHING);
+ /* It is illegal to call this from idle state. */
+ WARN_ON_ONCE(!(seq & CT_RCU_WATCHING));
+ rcu_preempt_deferred_qs(current);
+}
+EXPORT_SYMBOL_GPL(rcu_momentary_eqs);
+
+/**
+ * rcu_is_cpu_rrupt_from_idle - see if 'interrupted' from idle
+ *
+ * If the current CPU is idle and running at a first-level (not nested)
+ * interrupt, or directly, from idle, return true.
+ *
+ * The caller must have at least disabled IRQs.
+ */
+static int rcu_is_cpu_rrupt_from_idle(void)
+{
+ long nmi_nesting = ct_nmi_nesting();
/*
- * Yes, we can lose flag-setting operations. This is OK, because
- * the flag will be set again after some delay.
+ * Usually called from the tick; but also used from smp_function_call()
+ * for expedited grace periods. This latter can result in running from
+ * the idle task, instead of an actual IPI.
*/
- resched_mask = raw_cpu_read(rcu_sched_qs_mask);
- raw_cpu_write(rcu_sched_qs_mask, 0);
+ lockdep_assert_irqs_disabled();
- /* Find the flavor that needs a quiescent state. */
- for_each_rcu_flavor(rsp) {
- rdp = raw_cpu_ptr(rsp->rda);
- if (!(resched_mask & rsp->flavor_mask))
- continue;
- smp_mb(); /* rcu_sched_qs_mask before cond_resched_completed. */
- if (ACCESS_ONCE(rdp->mynode->completed) !=
- ACCESS_ONCE(rdp->cond_resched_completed))
- continue;
+ /* Check for counter underflows */
+ RCU_LOCKDEP_WARN(ct_nesting() < 0,
+ "RCU nesting counter underflow!");
- /*
- * Pretend to be momentarily idle for the quiescent state.
- * This allows the grace-period kthread to record the
- * quiescent state, with no need for this CPU to do anything
- * further.
- */
- rdtp = this_cpu_ptr(&rcu_dynticks);
- smp_mb__before_atomic(); /* Earlier stuff before QS. */
- atomic_add(2, &rdtp->dynticks); /* QS. */
- smp_mb__after_atomic(); /* Later stuff after QS. */
- break;
+ /* Non-idle interrupt or nested idle interrupt */
+ if (nmi_nesting > 1)
+ return false;
+
+ /*
+ * Non nested idle interrupt (interrupting section where RCU
+ * wasn't watching).
+ */
+ if (nmi_nesting == 1)
+ return true;
+
+ /* Not in an interrupt */
+ if (!nmi_nesting) {
+ RCU_LOCKDEP_WARN(!in_task() || !is_idle_task(current),
+ "RCU nmi_nesting counter not in idle task!");
+ return !rcu_is_watching_curr_cpu();
}
- local_irq_restore(flags);
-}
-/*
- * Note a context switch. This is a quiescent state for RCU-sched,
- * and requires special handling for preemptible RCU.
- * The caller must have disabled preemption.
- */
-void rcu_note_context_switch(void)
-{
- trace_rcu_utilization(TPS("Start context switch"));
- rcu_sched_qs();
- rcu_preempt_note_context_switch();
- if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
- rcu_momentary_dyntick_idle();
- trace_rcu_utilization(TPS("End context switch"));
-}
-EXPORT_SYMBOL_GPL(rcu_note_context_switch);
+ RCU_LOCKDEP_WARN(1, "RCU nmi_nesting counter underflow/zero!");
-/*
- * Register a quiescent state for all RCU flavors. If there is an
- * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight
- * dyntick-idle quiescent state visible to other CPUs (but only for those
- * RCU flavors in desperate need of a quiescent state, which will normally
- * be none of them). Either way, do a lightweight quiescent state for
- * all RCU flavors.
- */
-void rcu_all_qs(void)
-{
- if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
- rcu_momentary_dyntick_idle();
- this_cpu_inc(rcu_qs_ctr);
+ return false;
}
-EXPORT_SYMBOL_GPL(rcu_all_qs);
-static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */
-static long qhimark = 10000; /* If this many pending, ignore blimit. */
-static long qlowmark = 100; /* Once only this many pending, use blimit. */
+#define DEFAULT_RCU_BLIMIT (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ? 1000 : 10)
+ // Maximum callbacks per rcu_do_batch ...
+#define DEFAULT_MAX_RCU_BLIMIT 10000 // ... even during callback flood.
+static long blimit = DEFAULT_RCU_BLIMIT;
+#define DEFAULT_RCU_QHIMARK 10000 // If this many pending, ignore blimit.
+static long qhimark = DEFAULT_RCU_QHIMARK;
+#define DEFAULT_RCU_QLOMARK 100 // Once only this many pending, use blimit.
+static long qlowmark = DEFAULT_RCU_QLOMARK;
+#define DEFAULT_RCU_QOVLD_MULT 2
+#define DEFAULT_RCU_QOVLD (DEFAULT_RCU_QOVLD_MULT * DEFAULT_RCU_QHIMARK)
+static long qovld = DEFAULT_RCU_QOVLD; // If this many pending, hammer QS.
+static long qovld_calc = -1; // No pre-initialization lock acquisitions!
module_param(blimit, long, 0444);
module_param(qhimark, long, 0444);
module_param(qlowmark, long, 0444);
+module_param(qovld, long, 0444);
-static ulong jiffies_till_first_fqs = ULONG_MAX;
+static ulong jiffies_till_first_fqs = IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ? 0 : ULONG_MAX;
static ulong jiffies_till_next_fqs = ULONG_MAX;
+static bool rcu_kick_kthreads;
+static int rcu_divisor = 7;
+module_param(rcu_divisor, int, 0644);
-module_param(jiffies_till_first_fqs, ulong, 0644);
-module_param(jiffies_till_next_fqs, ulong, 0644);
+/* Force an exit from rcu_do_batch() after 3 milliseconds. */
+static long rcu_resched_ns = 3 * NSEC_PER_MSEC;
+module_param(rcu_resched_ns, long, 0644);
/*
* How long the grace period must be before we start recruiting
* quiescent-state help from rcu_note_context_switch().
*/
-static ulong jiffies_till_sched_qs = HZ / 20;
-module_param(jiffies_till_sched_qs, ulong, 0644);
-
-static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
- struct rcu_data *rdp);
-static void force_qs_rnp(struct rcu_state *rsp,
- int (*f)(struct rcu_data *rsp, bool *isidle,
- unsigned long *maxj),
- bool *isidle, unsigned long *maxj);
-static void force_quiescent_state(struct rcu_state *rsp);
-static int rcu_pending(void);
+static ulong jiffies_till_sched_qs = ULONG_MAX;
+module_param(jiffies_till_sched_qs, ulong, 0444);
+static ulong jiffies_to_sched_qs; /* See adjust_jiffies_till_sched_qs(). */
+module_param(jiffies_to_sched_qs, ulong, 0444); /* Display only! */
/*
- * Return the number of RCU batches started thus far for debug & stats.
+ * Make sure that we give the grace-period kthread time to detect any
+ * idle CPUs before taking active measures to force quiescent states.
+ * However, don't go below 100 milliseconds, adjusted upwards for really
+ * large systems.
*/
-unsigned long rcu_batches_started(void)
+static void adjust_jiffies_till_sched_qs(void)
{
- return rcu_state_p->gpnum;
-}
-EXPORT_SYMBOL_GPL(rcu_batches_started);
+ unsigned long j;
-/*
- * Return the number of RCU-sched batches started thus far for debug & stats.
- */
-unsigned long rcu_batches_started_sched(void)
-{
- return rcu_sched_state.gpnum;
+ /* If jiffies_till_sched_qs was specified, respect the request. */
+ if (jiffies_till_sched_qs != ULONG_MAX) {
+ WRITE_ONCE(jiffies_to_sched_qs, jiffies_till_sched_qs);
+ return;
+ }
+ /* Otherwise, set to third fqs scan, but bound below on large system. */
+ j = READ_ONCE(jiffies_till_first_fqs) +
+ 2 * READ_ONCE(jiffies_till_next_fqs);
+ if (j < HZ / 10 + nr_cpu_ids / RCU_JIFFIES_FQS_DIV)
+ j = HZ / 10 + nr_cpu_ids / RCU_JIFFIES_FQS_DIV;
+ pr_info("RCU calculated value of scheduler-enlistment delay is %ld jiffies.\n", j);
+ WRITE_ONCE(jiffies_to_sched_qs, j);
}
-EXPORT_SYMBOL_GPL(rcu_batches_started_sched);
-/*
- * Return the number of RCU BH batches started thus far for debug & stats.
- */
-unsigned long rcu_batches_started_bh(void)
+static int param_set_first_fqs_jiffies(const char *val, const struct kernel_param *kp)
{
- return rcu_bh_state.gpnum;
-}
-EXPORT_SYMBOL_GPL(rcu_batches_started_bh);
+ ulong j;
+ int ret = kstrtoul(val, 0, &j);
-/*
- * Return the number of RCU batches completed thus far for debug & stats.
- */
-unsigned long rcu_batches_completed(void)
-{
- return rcu_state_p->completed;
+ if (!ret) {
+ WRITE_ONCE(*(ulong *)kp->arg, (j > HZ) ? HZ : j);
+ adjust_jiffies_till_sched_qs();
+ }
+ return ret;
}
-EXPORT_SYMBOL_GPL(rcu_batches_completed);
-/*
- * Return the number of RCU-sched batches completed thus far for debug & stats.
- */
-unsigned long rcu_batches_completed_sched(void)
+static int param_set_next_fqs_jiffies(const char *val, const struct kernel_param *kp)
{
- return rcu_sched_state.completed;
-}
-EXPORT_SYMBOL_GPL(rcu_batches_completed_sched);
+ ulong j;
+ int ret = kstrtoul(val, 0, &j);
-/*
- * Return the number of RCU BH batches completed thus far for debug & stats.
- */
-unsigned long rcu_batches_completed_bh(void)
-{
- return rcu_bh_state.completed;
+ if (!ret) {
+ WRITE_ONCE(*(ulong *)kp->arg, (j > HZ) ? HZ : (j ?: 1));
+ adjust_jiffies_till_sched_qs();
+ }
+ return ret;
}
-EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
-/*
- * Force a quiescent state.
- */
-void rcu_force_quiescent_state(void)
-{
- force_quiescent_state(rcu_state_p);
-}
-EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
+static const struct kernel_param_ops first_fqs_jiffies_ops = {
+ .set = param_set_first_fqs_jiffies,
+ .get = param_get_ulong,
+};
-/*
- * Force a quiescent state for RCU BH.
- */
-void rcu_bh_force_quiescent_state(void)
-{
- force_quiescent_state(&rcu_bh_state);
-}
-EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
+static const struct kernel_param_ops next_fqs_jiffies_ops = {
+ .set = param_set_next_fqs_jiffies,
+ .get = param_get_ulong,
+};
+
+module_param_cb(jiffies_till_first_fqs, &first_fqs_jiffies_ops, &jiffies_till_first_fqs, 0644);
+module_param_cb(jiffies_till_next_fqs, &next_fqs_jiffies_ops, &jiffies_till_next_fqs, 0644);
+module_param(rcu_kick_kthreads, bool, 0644);
+
+static void force_qs_rnp(int (*f)(struct rcu_data *rdp));
+static int rcu_pending(int user);
/*
- * Force a quiescent state for RCU-sched.
+ * Return the number of RCU GPs completed thus far for debug & stats.
*/
-void rcu_sched_force_quiescent_state(void)
+unsigned long rcu_get_gp_seq(void)
{
- force_quiescent_state(&rcu_sched_state);
+ return READ_ONCE(rcu_state.gp_seq);
}
-EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state);
+EXPORT_SYMBOL_GPL(rcu_get_gp_seq);
/*
- * Show the state of the grace-period kthreads.
+ * Return the number of RCU expedited batches completed thus far for
+ * debug & stats. Odd numbers mean that a batch is in progress, even
+ * numbers mean idle. The value returned will thus be roughly double
+ * the cumulative batches since boot.
*/
-void show_rcu_gp_kthreads(void)
+unsigned long rcu_exp_batches_completed(void)
{
- struct rcu_state *rsp;
-
- for_each_rcu_flavor(rsp) {
- pr_info("%s: wait state: %d ->state: %#lx\n",
- rsp->name, rsp->gp_state, rsp->gp_kthread->state);
- /* sched_show_task(rsp->gp_kthread); */
- }
+ return rcu_state.expedited_sequence;
}
-EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads);
+EXPORT_SYMBOL_GPL(rcu_exp_batches_completed);
/*
- * Record the number of times rcutorture tests have been initiated and
- * terminated. This information allows the debugfs tracing stats to be
- * correlated to the rcutorture messages, even when the rcutorture module
- * is being repeatedly loaded and unloaded. In other words, we cannot
- * store this state in rcutorture itself.
+ * Return the root node of the rcu_state structure.
*/
-void rcutorture_record_test_transition(void)
+static struct rcu_node *rcu_get_root(void)
{
- rcutorture_testseq++;
- rcutorture_vernum = 0;
+ return &rcu_state.node[0];
}
-EXPORT_SYMBOL_GPL(rcutorture_record_test_transition);
/*
* Send along grace-period-related data for rcutorture diagnostics.
*/
-void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
- unsigned long *gpnum, unsigned long *completed)
+void rcutorture_get_gp_data(int *flags, unsigned long *gp_seq)
{
- struct rcu_state *rsp = NULL;
-
- switch (test_type) {
- case RCU_FLAVOR:
- rsp = rcu_state_p;
- break;
- case RCU_BH_FLAVOR:
- rsp = &rcu_bh_state;
- break;
- case RCU_SCHED_FLAVOR:
- rsp = &rcu_sched_state;
- break;
- default:
- break;
- }
- if (rsp != NULL) {
- *flags = ACCESS_ONCE(rsp->gp_flags);
- *gpnum = ACCESS_ONCE(rsp->gpnum);
- *completed = ACCESS_ONCE(rsp->completed);
- return;
- }
- *flags = 0;
- *gpnum = 0;
- *completed = 0;
+ *flags = READ_ONCE(rcu_state.gp_flags);
+ *gp_seq = rcu_seq_current(&rcu_state.gp_seq);
}
EXPORT_SYMBOL_GPL(rcutorture_get_gp_data);
-/*
- * Record the number of writer passes through the current rcutorture test.
- * This is also used to correlate debugfs tracing stats with the rcutorture
- * messages.
- */
-void rcutorture_record_progress(unsigned long vernum)
+/* Gather grace-period sequence numbers for rcutorture diagnostics. */
+unsigned long long rcutorture_gather_gp_seqs(void)
{
- rcutorture_vernum++;
+ return ((READ_ONCE(rcu_state.gp_seq) & 0xffffULL) << 40) |
+ ((READ_ONCE(rcu_state.expedited_sequence) & 0xffffffULL) << 16) |
+ (READ_ONCE(rcu_state.gp_seq_polled) & 0xffffULL);
}
-EXPORT_SYMBOL_GPL(rcutorture_record_progress);
+EXPORT_SYMBOL_GPL(rcutorture_gather_gp_seqs);
-/*
- * Does the CPU have callbacks ready to be invoked?
- */
-static int
-cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
+/* Format grace-period sequence numbers for rcutorture diagnostics. */
+void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp, size_t len)
{
- return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL] &&
- rdp->nxttail[RCU_DONE_TAIL] != NULL;
-}
+ unsigned int egp = (seqs >> 16) & 0xffffffULL;
+ unsigned int ggp = (seqs >> 40) & 0xffffULL;
+ unsigned int pgp = seqs & 0xffffULL;
-/*
- * Return the root node of the specified rcu_state structure.
- */
-static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
-{
- return &rsp->node[0];
+ snprintf(cp, len, "g%04x:e%06x:p%04x", ggp, egp, pgp);
}
+EXPORT_SYMBOL_GPL(rcutorture_format_gp_seqs);
+#if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_VIRT_XFER_TO_GUEST_WORK))
/*
- * Is there any need for future grace periods?
- * Interrupts must be disabled. If the caller does not hold the root
- * rnp_node structure's ->lock, the results are advisory only.
+ * An empty function that will trigger a reschedule on
+ * IRQ tail once IRQs get re-enabled on userspace/guest resume.
*/
-static int rcu_future_needs_gp(struct rcu_state *rsp)
+static void late_wakeup_func(struct irq_work *work)
{
- struct rcu_node *rnp = rcu_get_root(rsp);
- int idx = (ACCESS_ONCE(rnp->completed) + 1) & 0x1;
- int *fp = &rnp->need_future_gp[idx];
-
- return ACCESS_ONCE(*fp);
}
-/*
- * Does the current CPU require a not-yet-started grace period?
- * The caller must have disabled interrupts to prevent races with
- * normal callback registry.
- */
-static int
-cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
-{
- int i;
-
- if (rcu_gp_in_progress(rsp))
- return 0; /* No, a grace period is already in progress. */
- if (rcu_future_needs_gp(rsp))
- return 1; /* Yes, a no-CBs CPU needs one. */
- if (!rdp->nxttail[RCU_NEXT_TAIL])
- return 0; /* No, this is a no-CBs (or offline) CPU. */
- if (*rdp->nxttail[RCU_NEXT_READY_TAIL])
- return 1; /* Yes, this CPU has newly registered callbacks. */
- for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++)
- if (rdp->nxttail[i - 1] != rdp->nxttail[i] &&
- ULONG_CMP_LT(ACCESS_ONCE(rsp->completed),
- rdp->nxtcompleted[i]))
- return 1; /* Yes, CBs for future grace period. */
- return 0; /* No grace period needed. */
-}
+static DEFINE_PER_CPU(struct irq_work, late_wakeup_work) =
+ IRQ_WORK_INIT(late_wakeup_func);
/*
- * rcu_eqs_enter_common - current CPU is moving towards extended quiescent state
+ * If either:
+ *
+ * 1) the task is about to enter in guest mode and $ARCH doesn't support KVM generic work
+ * 2) the task is about to enter in user mode and $ARCH doesn't support generic entry.
*
- * If the new value of the ->dynticks_nesting counter now is zero,
- * we really have entered idle, and must do the appropriate accounting.
- * The caller must have disabled interrupts.
+ * In these cases the late RCU wake ups aren't supported in the resched loops and our
+ * last resort is to fire a local irq_work that will trigger a reschedule once IRQs
+ * get re-enabled again.
*/
-static void rcu_eqs_enter_common(long long oldval, bool user)
+noinstr void rcu_irq_work_resched(void)
{
- struct rcu_state *rsp;
- struct rcu_data *rdp;
- struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
-
- trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting);
- if (!user && !is_idle_task(current)) {
- struct task_struct *idle __maybe_unused =
- idle_task(smp_processor_id());
-
- trace_rcu_dyntick(TPS("Error on entry: not idle task"), oldval, 0);
- ftrace_dump(DUMP_ORIG);
- WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
- current->pid, current->comm,
- idle->pid, idle->comm); /* must be idle task! */
- }
- for_each_rcu_flavor(rsp) {
- rdp = this_cpu_ptr(rsp->rda);
- do_nocb_deferred_wakeup(rdp);
- }
- rcu_prepare_for_idle();
- /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
- smp_mb__before_atomic(); /* See above. */
- atomic_inc(&rdtp->dynticks);
- smp_mb__after_atomic(); /* Force ordering with next sojourn. */
- WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
- rcu_dynticks_task_enter();
+ struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
- /*
- * It is illegal to enter an extended quiescent state while
- * in an RCU read-side critical section.
- */
- rcu_lockdep_assert(!lock_is_held(&rcu_lock_map),
- "Illegal idle entry in RCU read-side critical section.");
- rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map),
- "Illegal idle entry in RCU-bh read-side critical section.");
- rcu_lockdep_assert(!lock_is_held(&rcu_sched_lock_map),
- "Illegal idle entry in RCU-sched read-side critical section.");
-}
+ if (IS_ENABLED(CONFIG_GENERIC_ENTRY) && !(current->flags & PF_VCPU))
+ return;
-/*
- * Enter an RCU extended quiescent state, which can be either the
- * idle loop or adaptive-tickless usermode execution.
- */
-static void rcu_eqs_enter(bool user)
-{
- long long oldval;
- struct rcu_dynticks *rdtp;
+ if (IS_ENABLED(CONFIG_VIRT_XFER_TO_GUEST_WORK) && (current->flags & PF_VCPU))
+ return;
- rdtp = this_cpu_ptr(&rcu_dynticks);
- oldval = rdtp->dynticks_nesting;
- WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0);
- if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) {
- rdtp->dynticks_nesting = 0;
- rcu_eqs_enter_common(oldval, user);
- } else {
- rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE;
+ instrumentation_begin();
+ if (do_nocb_deferred_wakeup(rdp) && need_resched()) {
+ irq_work_queue(this_cpu_ptr(&late_wakeup_work));
}
+ instrumentation_end();
}
+#endif /* #if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_VIRT_XFER_TO_GUEST_WORK)) */
+#ifdef CONFIG_PROVE_RCU
/**
- * rcu_idle_enter - inform RCU that current CPU is entering idle
- *
- * Enter idle mode, in other words, -leave- the mode in which RCU
- * read-side critical sections can occur. (Though RCU read-side
- * critical sections can occur in irq handlers in idle, a possibility
- * handled by irq_enter() and irq_exit().)
- *
- * We crowbar the ->dynticks_nesting field to zero to allow for
- * the possibility of usermode upcalls having messed up our count
- * of interrupt nesting level during the prior busy period.
- */
-void rcu_idle_enter(void)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- rcu_eqs_enter(false);
- rcu_sysidle_enter(0);
- local_irq_restore(flags);
-}
-EXPORT_SYMBOL_GPL(rcu_idle_enter);
-
-#ifdef CONFIG_RCU_USER_QS
-/**
- * rcu_user_enter - inform RCU that we are resuming userspace.
- *
- * Enter RCU idle mode right before resuming userspace. No use of RCU
- * is permitted between this call and rcu_user_exit(). This way the
- * CPU doesn't need to maintain the tick for RCU maintenance purposes
- * when the CPU runs in userspace.
+ * rcu_irq_exit_check_preempt - Validate that scheduling is possible
*/
-void rcu_user_enter(void)
+void rcu_irq_exit_check_preempt(void)
{
- rcu_eqs_enter(1);
+ lockdep_assert_irqs_disabled();
+
+ RCU_LOCKDEP_WARN(ct_nesting() <= 0,
+ "RCU nesting counter underflow/zero!");
+ RCU_LOCKDEP_WARN(ct_nmi_nesting() !=
+ CT_NESTING_IRQ_NONIDLE,
+ "Bad RCU nmi_nesting counter\n");
+ RCU_LOCKDEP_WARN(!rcu_is_watching_curr_cpu(),
+ "RCU in extended quiescent state!");
}
-#endif /* CONFIG_RCU_USER_QS */
+#endif /* #ifdef CONFIG_PROVE_RCU */
+#ifdef CONFIG_NO_HZ_FULL
/**
- * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle
- *
- * Exit from an interrupt handler, which might possibly result in entering
- * idle mode, in other words, leaving the mode in which read-side critical
- * sections can occur.
+ * __rcu_irq_enter_check_tick - Enable scheduler tick on CPU if RCU needs it.
*
- * This code assumes that the idle loop never does anything that might
- * result in unbalanced calls to irq_enter() and irq_exit(). If your
- * architecture violates this assumption, RCU will give you what you
- * deserve, good and hard. But very infrequently and irreproducibly.
+ * The scheduler tick is not normally enabled when CPUs enter the kernel
+ * from nohz_full userspace execution. After all, nohz_full userspace
+ * execution is an RCU quiescent state and the time executing in the kernel
+ * is quite short. Except of course when it isn't. And it is not hard to
+ * cause a large system to spend tens of seconds or even minutes looping
+ * in the kernel, which can cause a number of problems, include RCU CPU
+ * stall warnings.
*
- * Use things like work queues to work around this limitation.
+ * Therefore, if a nohz_full CPU fails to report a quiescent state
+ * in a timely manner, the RCU grace-period kthread sets that CPU's
+ * ->rcu_urgent_qs flag with the expectation that the next interrupt or
+ * exception will invoke this function, which will turn on the scheduler
+ * tick, which will enable RCU to detect that CPU's quiescent states,
+ * for example, due to cond_resched() calls in CONFIG_PREEMPT=n kernels.
+ * The tick will be disabled once a quiescent state is reported for
+ * this CPU.
*
- * You have been warned.
+ * Of course, in carefully tuned systems, there might never be an
+ * interrupt or exception. In that case, the RCU grace-period kthread
+ * will eventually cause one to happen. However, in less carefully
+ * controlled environments, this function allows RCU to get what it
+ * needs without creating otherwise useless interruptions.
*/
-void rcu_irq_exit(void)
+void __rcu_irq_enter_check_tick(void)
{
- unsigned long flags;
- long long oldval;
- struct rcu_dynticks *rdtp;
+ struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
- local_irq_save(flags);
- rdtp = this_cpu_ptr(&rcu_dynticks);
- oldval = rdtp->dynticks_nesting;
- rdtp->dynticks_nesting--;
- WARN_ON_ONCE(rdtp->dynticks_nesting < 0);
- if (rdtp->dynticks_nesting)
- trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting);
- else
- rcu_eqs_enter_common(oldval, true);
- rcu_sysidle_enter(1);
- local_irq_restore(flags);
+ // If we're here from NMI there's nothing to do.
+ if (in_nmi())
+ return;
+
+ RCU_LOCKDEP_WARN(!rcu_is_watching_curr_cpu(),
+ "Illegal rcu_irq_enter_check_tick() from extended quiescent state");
+
+ if (!tick_nohz_full_cpu(rdp->cpu) ||
+ !READ_ONCE(rdp->rcu_urgent_qs) ||
+ READ_ONCE(rdp->rcu_forced_tick)) {
+ // RCU doesn't need nohz_full help from this CPU, or it is
+ // already getting that help.
+ return;
+ }
+
+ // We get here only when not in an extended quiescent state and
+ // from interrupts (as opposed to NMIs). Therefore, (1) RCU is
+ // already watching and (2) The fact that we are in an interrupt
+ // handler and that the rcu_node lock is an irq-disabled lock
+ // prevents self-deadlock. So we can safely recheck under the lock.
+ // Note that the nohz_full state currently cannot change.
+ raw_spin_lock_rcu_node(rdp->mynode);
+ if (READ_ONCE(rdp->rcu_urgent_qs) && !rdp->rcu_forced_tick) {
+ // A nohz_full CPU is in the kernel and RCU needs a
+ // quiescent state. Turn on the tick!
+ WRITE_ONCE(rdp->rcu_forced_tick, true);
+ tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU);
+ }
+ raw_spin_unlock_rcu_node(rdp->mynode);
}
+NOKPROBE_SYMBOL(__rcu_irq_enter_check_tick);
+#endif /* CONFIG_NO_HZ_FULL */
/*
- * rcu_eqs_exit_common - current CPU moving away from extended quiescent state
+ * Check to see if any future non-offloaded RCU-related work will need
+ * to be done by the current CPU, even if none need be done immediately,
+ * returning 1 if so. This function is part of the RCU implementation;
+ * it is -not- an exported member of the RCU API. This is used by
+ * the idle-entry code to figure out whether it is safe to disable the
+ * scheduler-clock interrupt.
*
- * If the new value of the ->dynticks_nesting counter was previously zero,
- * we really have exited idle, and must do the appropriate accounting.
- * The caller must have disabled interrupts.
+ * Just check whether or not this CPU has non-offloaded RCU callbacks
+ * queued.
*/
-static void rcu_eqs_exit_common(long long oldval, int user)
-{
- struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
-
- rcu_dynticks_task_exit();
- smp_mb__before_atomic(); /* Force ordering w/previous sojourn. */
- atomic_inc(&rdtp->dynticks);
- /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
- smp_mb__after_atomic(); /* See above. */
- WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
- rcu_cleanup_after_idle();
- trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting);
- if (!user && !is_idle_task(current)) {
- struct task_struct *idle __maybe_unused =
- idle_task(smp_processor_id());
-
- trace_rcu_dyntick(TPS("Error on exit: not idle task"),
- oldval, rdtp->dynticks_nesting);
- ftrace_dump(DUMP_ORIG);
- WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
- current->pid, current->comm,
- idle->pid, idle->comm); /* must be idle task! */
- }
+int rcu_needs_cpu(void)
+{
+ return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist) &&
+ !rcu_rdp_is_offloaded(this_cpu_ptr(&rcu_data));
}
/*
- * Exit an RCU extended quiescent state, which can be either the
- * idle loop or adaptive-tickless usermode execution.
+ * If any sort of urgency was applied to the current CPU (for example,
+ * the scheduler-clock interrupt was enabled on a nohz_full CPU) in order
+ * to get to a quiescent state, disable it.
*/
-static void rcu_eqs_exit(bool user)
+static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp)
{
- struct rcu_dynticks *rdtp;
- long long oldval;
-
- rdtp = this_cpu_ptr(&rcu_dynticks);
- oldval = rdtp->dynticks_nesting;
- WARN_ON_ONCE(oldval < 0);
- if (oldval & DYNTICK_TASK_NEST_MASK) {
- rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
- } else {
- rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
- rcu_eqs_exit_common(oldval, user);
+ raw_lockdep_assert_held_rcu_node(rdp->mynode);
+ WRITE_ONCE(rdp->rcu_urgent_qs, false);
+ WRITE_ONCE(rdp->rcu_need_heavy_qs, false);
+ if (tick_nohz_full_cpu(rdp->cpu) && rdp->rcu_forced_tick) {
+ tick_dep_clear_cpu(rdp->cpu, TICK_DEP_BIT_RCU);
+ WRITE_ONCE(rdp->rcu_forced_tick, false);
}
}
/**
- * rcu_idle_exit - inform RCU that current CPU is leaving idle
+ * rcu_is_watching - RCU read-side critical sections permitted on current CPU?
*
- * Exit idle mode, in other words, -enter- the mode in which RCU
- * read-side critical sections can occur.
+ * Return @true if RCU is watching the running CPU and @false otherwise.
+ * An @true return means that this CPU can safely enter RCU read-side
+ * critical sections.
*
- * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NEST to
- * allow for the possibility of usermode upcalls messing up our count
- * of interrupt nesting level during the busy period that is just
- * now starting.
+ * Although calls to rcu_is_watching() from most parts of the kernel
+ * will return @true, there are important exceptions. For example, if the
+ * current CPU is deep within its idle loop, in kernel entry/exit code,
+ * or offline, rcu_is_watching() will return @false.
+ *
+ * Make notrace because it can be called by the internal functions of
+ * ftrace, and making this notrace removes unnecessary recursion calls.
*/
-void rcu_idle_exit(void)
+notrace bool rcu_is_watching(void)
{
- unsigned long flags;
+ bool ret;
- local_irq_save(flags);
- rcu_eqs_exit(false);
- rcu_sysidle_exit(0);
- local_irq_restore(flags);
+ preempt_disable_notrace();
+ ret = rcu_is_watching_curr_cpu();
+ preempt_enable_notrace();
+ return ret;
}
-EXPORT_SYMBOL_GPL(rcu_idle_exit);
+EXPORT_SYMBOL_GPL(rcu_is_watching);
-#ifdef CONFIG_RCU_USER_QS
-/**
- * rcu_user_exit - inform RCU that we are exiting userspace.
- *
- * Exit RCU idle mode while entering the kernel because it can
- * run a RCU read side critical section anytime.
+/*
+ * If a holdout task is actually running, request an urgent quiescent
+ * state from its CPU. This is unsynchronized, so migrations can cause
+ * the request to go to the wrong CPU. Which is OK, all that will happen
+ * is that the CPU's next context switch will be a bit slower and next
+ * time around this task will generate another request.
*/
-void rcu_user_exit(void)
+void rcu_request_urgent_qs_task(struct task_struct *t)
{
- rcu_eqs_exit(1);
+ int cpu;
+
+ barrier();
+ cpu = task_cpu(t);
+ if (!task_curr(t))
+ return; /* This task is not running on that CPU. */
+ smp_store_release(per_cpu_ptr(&rcu_data.rcu_urgent_qs, cpu), true);
}
-#endif /* CONFIG_RCU_USER_QS */
+
+static unsigned long seq_gpwrap_lag = ULONG_MAX / 4;
/**
- * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle
- *
- * Enter an interrupt handler, which might possibly result in exiting
- * idle mode, in other words, entering the mode in which read-side critical
- * sections can occur.
- *
- * Note that the Linux kernel is fully capable of entering an interrupt
- * handler that it never exits, for example when doing upcalls to
- * user mode! This code assumes that the idle loop never does upcalls to
- * user mode. If your architecture does do upcalls from the idle loop (or
- * does anything else that results in unbalanced calls to the irq_enter()
- * and irq_exit() functions), RCU will give you what you deserve, good
- * and hard. But very infrequently and irreproducibly.
- *
- * Use things like work queues to work around this limitation.
- *
- * You have been warned.
+ * rcu_set_gpwrap_lag - Set RCU GP sequence overflow lag value.
+ * @lag_gps: Set overflow lag to this many grace period worth of counters
+ * which is used by rcutorture to quickly force a gpwrap situation.
+ * @lag_gps = 0 means we reset it back to the boot-time value.
*/
-void rcu_irq_enter(void)
+void rcu_set_gpwrap_lag(unsigned long lag_gps)
{
- unsigned long flags;
- struct rcu_dynticks *rdtp;
- long long oldval;
+ unsigned long lag_seq_count;
- local_irq_save(flags);
- rdtp = this_cpu_ptr(&rcu_dynticks);
- oldval = rdtp->dynticks_nesting;
- rdtp->dynticks_nesting++;
- WARN_ON_ONCE(rdtp->dynticks_nesting == 0);
- if (oldval)
- trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting);
- else
- rcu_eqs_exit_common(oldval, true);
- rcu_sysidle_exit(1);
- local_irq_restore(flags);
+ lag_seq_count = (lag_gps == 0)
+ ? ULONG_MAX / 4
+ : lag_gps << RCU_SEQ_CTR_SHIFT;
+ WRITE_ONCE(seq_gpwrap_lag, lag_seq_count);
}
+EXPORT_SYMBOL_GPL(rcu_set_gpwrap_lag);
-/**
- * rcu_nmi_enter - inform RCU of entry to NMI context
- *
- * If the CPU was idle from RCU's viewpoint, update rdtp->dynticks and
- * rdtp->dynticks_nmi_nesting to let the RCU grace-period handling know
- * that the CPU is active. This implementation permits nested NMIs, as
- * long as the nesting level does not overflow an int. (You will probably
- * run out of stack space first.)
+/*
+ * When trying to report a quiescent state on behalf of some other CPU,
+ * it is our responsibility to check for and handle potential overflow
+ * of the rcu_node ->gp_seq counter with respect to the rcu_data counters.
+ * After all, the CPU might be in deep idle state, and thus executing no
+ * code whatsoever.
*/
-void rcu_nmi_enter(void)
+static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp)
{
- struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
- int incby = 2;
-
- /* Complain about underflow. */
- WARN_ON_ONCE(rdtp->dynticks_nmi_nesting < 0);
+ raw_lockdep_assert_held_rcu_node(rnp);
+ if (ULONG_CMP_LT(rcu_seq_current(&rdp->gp_seq) + seq_gpwrap_lag,
+ rnp->gp_seq)) {
+ WRITE_ONCE(rdp->gpwrap, true);
+ WRITE_ONCE(rdp->gpwrap_count, READ_ONCE(rdp->gpwrap_count) + 1);
+ }
+ if (ULONG_CMP_LT(rdp->rcu_iw_gp_seq + ULONG_MAX / 4, rnp->gp_seq))
+ rdp->rcu_iw_gp_seq = rnp->gp_seq + ULONG_MAX / 4;
+}
+/*
+ * Snapshot the specified CPU's RCU_WATCHING counter so that we can later
+ * credit them with an implicit quiescent state. Return 1 if this CPU
+ * is in dynticks idle mode, which is an extended quiescent state.
+ */
+static int rcu_watching_snap_save(struct rcu_data *rdp)
+{
/*
- * If idle from RCU viewpoint, atomically increment ->dynticks
- * to mark non-idle and increment ->dynticks_nmi_nesting by one.
- * Otherwise, increment ->dynticks_nmi_nesting by two. This means
- * if ->dynticks_nmi_nesting is equal to one, we are guaranteed
- * to be in the outermost NMI handler that interrupted an RCU-idle
- * period (observation due to Andy Lutomirski).
+ * Full ordering between remote CPU's post idle accesses and updater's
+ * accesses prior to current GP (and also the started GP sequence number)
+ * is enforced by rcu_seq_start() implicit barrier and even further by
+ * smp_mb__after_unlock_lock() barriers chained all the way throughout the
+ * rnp locking tree since rcu_gp_init() and up to the current leaf rnp
+ * locking.
+ *
+ * Ordering between remote CPU's pre idle accesses and post grace period
+ * updater's accesses is enforced by the below acquire semantic.
*/
- if (!(atomic_read(&rdtp->dynticks) & 0x1)) {
- smp_mb__before_atomic(); /* Force delay from prior write. */
- atomic_inc(&rdtp->dynticks);
- /* atomic_inc() before later RCU read-side crit sects */
- smp_mb__after_atomic(); /* See above. */
- WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
- incby = 1;
+ rdp->watching_snap = ct_rcu_watching_cpu_acquire(rdp->cpu);
+ if (rcu_watching_snap_in_eqs(rdp->watching_snap)) {
+ trace_rcu_fqs(rcu_state.name, rdp->gp_seq, rdp->cpu, TPS("dti"));
+ rcu_gpnum_ovf(rdp->mynode, rdp);
+ return 1;
}
- rdtp->dynticks_nmi_nesting += incby;
- barrier();
+ return 0;
}
-/**
- * rcu_nmi_exit - inform RCU of exit from NMI context
+#ifndef arch_irq_stat_cpu
+#define arch_irq_stat_cpu(cpu) 0
+#endif
+
+/*
+ * Returns positive if the specified CPU has passed through a quiescent state
+ * by virtue of being in or having passed through an dynticks idle state since
+ * the last call to rcu_watching_snap_save() for this same CPU, or by
+ * virtue of having been offline.
*
- * If we are returning from the outermost NMI handler that interrupted an
- * RCU-idle period, update rdtp->dynticks and rdtp->dynticks_nmi_nesting
- * to let the RCU grace-period handling know that the CPU is back to
- * being RCU-idle.
+ * Returns negative if the specified CPU needs a force resched.
+ *
+ * Returns zero otherwise.
*/
-void rcu_nmi_exit(void)
+static int rcu_watching_snap_recheck(struct rcu_data *rdp)
{
- struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+ unsigned long jtsq;
+ int ret = 0;
+ struct rcu_node *rnp = rdp->mynode;
/*
- * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks.
- * (We are exiting an NMI handler, so RCU better be paying attention
- * to us!)
+ * If the CPU passed through or entered a dynticks idle phase with
+ * no active irq/NMI handlers, then we can safely pretend that the CPU
+ * already acknowledged the request to pass through a quiescent
+ * state. Either way, that CPU cannot possibly be in an RCU
+ * read-side critical section that started before the beginning
+ * of the current RCU grace period.
*/
- WARN_ON_ONCE(rdtp->dynticks_nmi_nesting <= 0);
- WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
+ if (rcu_watching_snap_stopped_since(rdp, rdp->watching_snap)) {
+ trace_rcu_fqs(rcu_state.name, rdp->gp_seq, rdp->cpu, TPS("dti"));
+ rcu_gpnum_ovf(rnp, rdp);
+ return 1;
+ }
/*
- * If the nesting level is not 1, the CPU wasn't RCU-idle, so
- * leave it in non-RCU-idle state.
+ * Complain if a CPU that is considered to be offline from RCU's
+ * perspective has not yet reported a quiescent state. After all,
+ * the offline CPU should have reported a quiescent state during
+ * the CPU-offline process, or, failing that, by rcu_gp_init()
+ * if it ran concurrently with either the CPU going offline or the
+ * last task on a leaf rcu_node structure exiting its RCU read-side
+ * critical section while all CPUs corresponding to that structure
+ * are offline. This added warning detects bugs in any of these
+ * code paths.
+ *
+ * The rcu_node structure's ->lock is held here, which excludes
+ * the relevant portions the CPU-hotplug code, the grace-period
+ * initialization code, and the rcu_read_unlock() code paths.
+ *
+ * For more detail, please refer to the "Hotplug CPU" section
+ * of RCU's Requirements documentation.
*/
- if (rdtp->dynticks_nmi_nesting != 1) {
- rdtp->dynticks_nmi_nesting -= 2;
- return;
+ if (WARN_ON_ONCE(!rcu_rdp_cpu_online(rdp))) {
+ struct rcu_node *rnp1;
+
+ pr_info("%s: grp: %d-%d level: %d ->gp_seq %ld ->completedqs %ld\n",
+ __func__, rnp->grplo, rnp->grphi, rnp->level,
+ (long)rnp->gp_seq, (long)rnp->completedqs);
+ for (rnp1 = rnp; rnp1; rnp1 = rnp1->parent)
+ pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx ->rcu_gp_init_mask %#lx\n",
+ __func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext, rnp1->rcu_gp_init_mask);
+ pr_info("%s %d: %c online: %ld(%d) offline: %ld(%d)\n",
+ __func__, rdp->cpu, ".o"[rcu_rdp_cpu_online(rdp)],
+ (long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_state,
+ (long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_state);
+ return 1; /* Break things loose after complaining. */
}
- /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
- rdtp->dynticks_nmi_nesting = 0;
- /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
- smp_mb__before_atomic(); /* See above. */
- atomic_inc(&rdtp->dynticks);
- smp_mb__after_atomic(); /* Force delay to next write. */
- WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
-}
+ /*
+ * A CPU running for an extended time within the kernel can
+ * delay RCU grace periods: (1) At age jiffies_to_sched_qs,
+ * set .rcu_urgent_qs, (2) At age 2*jiffies_to_sched_qs, set
+ * both .rcu_need_heavy_qs and .rcu_urgent_qs. Note that the
+ * unsynchronized assignments to the per-CPU rcu_need_heavy_qs
+ * variable are safe because the assignments are repeated if this
+ * CPU failed to pass through a quiescent state. This code
+ * also checks .jiffies_resched in case jiffies_to_sched_qs
+ * is set way high.
+ */
+ jtsq = READ_ONCE(jiffies_to_sched_qs);
+ if (!READ_ONCE(rdp->rcu_need_heavy_qs) &&
+ (time_after(jiffies, rcu_state.gp_start + jtsq * 2) ||
+ time_after(jiffies, rcu_state.jiffies_resched) ||
+ rcu_state.cbovld)) {
+ WRITE_ONCE(rdp->rcu_need_heavy_qs, true);
+ /* Store rcu_need_heavy_qs before rcu_urgent_qs. */
+ smp_store_release(&rdp->rcu_urgent_qs, true);
+ } else if (time_after(jiffies, rcu_state.gp_start + jtsq)) {
+ WRITE_ONCE(rdp->rcu_urgent_qs, true);
+ }
-/**
- * __rcu_is_watching - are RCU read-side critical sections safe?
- *
- * Return true if RCU is watching the running CPU, which means that
- * this CPU can safely enter RCU read-side critical sections. Unlike
- * rcu_is_watching(), the caller of __rcu_is_watching() must have at
- * least disabled preemption.
- */
-bool notrace __rcu_is_watching(void)
-{
- return atomic_read(this_cpu_ptr(&rcu_dynticks.dynticks)) & 0x1;
-}
+ /*
+ * NO_HZ_FULL CPUs can run in-kernel without rcu_sched_clock_irq!
+ * The above code handles this, but only for straight cond_resched().
+ * And some in-kernel loops check need_resched() before calling
+ * cond_resched(), which defeats the above code for CPUs that are
+ * running in-kernel with scheduling-clock interrupts disabled.
+ * So hit them over the head with the resched_cpu() hammer!
+ */
+ if (tick_nohz_full_cpu(rdp->cpu) &&
+ (time_after(jiffies, READ_ONCE(rdp->last_fqs_resched) + jtsq * 3) ||
+ rcu_state.cbovld)) {
+ WRITE_ONCE(rdp->rcu_urgent_qs, true);
+ WRITE_ONCE(rdp->last_fqs_resched, jiffies);
+ ret = -1;
+ }
-/**
- * rcu_is_watching - see if RCU thinks that the current CPU is idle
- *
- * If the current CPU is in its idle loop and is neither in an interrupt
- * or NMI handler, return true.
- */
-bool notrace rcu_is_watching(void)
-{
- bool ret;
+ /*
+ * If more than halfway to RCU CPU stall-warning time, invoke
+ * resched_cpu() more frequently to try to loosen things up a bit.
+ * Also check to see if the CPU is getting hammered with interrupts,
+ * but only once per grace period, just to keep the IPIs down to
+ * a dull roar.
+ */
+ if (time_after(jiffies, rcu_state.jiffies_resched)) {
+ if (time_after(jiffies,
+ READ_ONCE(rdp->last_fqs_resched) + jtsq)) {
+ WRITE_ONCE(rdp->last_fqs_resched, jiffies);
+ ret = -1;
+ }
+ if (IS_ENABLED(CONFIG_IRQ_WORK) &&
+ !rdp->rcu_iw_pending && rdp->rcu_iw_gp_seq != rnp->gp_seq &&
+ (rnp->ffmask & rdp->grpmask)) {
+ rdp->rcu_iw_pending = true;
+ rdp->rcu_iw_gp_seq = rnp->gp_seq;
+ irq_work_queue_on(&rdp->rcu_iw, rdp->cpu);
+ }
+
+ if (rcu_cpu_stall_cputime && rdp->snap_record.gp_seq != rdp->gp_seq) {
+ int cpu = rdp->cpu;
+ struct rcu_snap_record *rsrp;
+ struct kernel_cpustat *kcsp;
+
+ kcsp = &kcpustat_cpu(cpu);
+
+ rsrp = &rdp->snap_record;
+ rsrp->cputime_irq = kcpustat_field(kcsp, CPUTIME_IRQ, cpu);
+ rsrp->cputime_softirq = kcpustat_field(kcsp, CPUTIME_SOFTIRQ, cpu);
+ rsrp->cputime_system = kcpustat_field(kcsp, CPUTIME_SYSTEM, cpu);
+ rsrp->nr_hardirqs = kstat_cpu_irqs_sum(cpu) + arch_irq_stat_cpu(cpu);
+ rsrp->nr_softirqs = kstat_cpu_softirqs_sum(cpu);
+ rsrp->nr_csw = nr_context_switches_cpu(cpu);
+ rsrp->jiffies = jiffies;
+ rsrp->gp_seq = rdp->gp_seq;
+ }
+ }
- preempt_disable();
- ret = __rcu_is_watching();
- preempt_enable();
return ret;
}
-EXPORT_SYMBOL_GPL(rcu_is_watching);
-#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
+/* Trace-event wrapper function for trace_rcu_future_grace_period. */
+static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp,
+ unsigned long gp_seq_req, const char *s)
+{
+ trace_rcu_future_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq),
+ gp_seq_req, rnp->level,
+ rnp->grplo, rnp->grphi, s);
+}
/*
- * Is the current CPU online? Disable preemption to avoid false positives
- * that could otherwise happen due to the current CPU number being sampled,
- * this task being preempted, its old CPU being taken offline, resuming
- * on some other CPU, then determining that its old CPU is now offline.
- * It is OK to use RCU on an offline processor during initial boot, hence
- * the check for rcu_scheduler_fully_active. Note also that it is OK
- * for a CPU coming online to use RCU for one jiffy prior to marking itself
- * online in the cpu_online_mask. Similarly, it is OK for a CPU going
- * offline to continue to use RCU for one jiffy after marking itself
- * offline in the cpu_online_mask. This leniency is necessary given the
- * non-atomic nature of the online and offline processing, for example,
- * the fact that a CPU enters the scheduler after completing the CPU_DYING
- * notifiers.
+ * rcu_start_this_gp - Request the start of a particular grace period
+ * @rnp_start: The leaf node of the CPU from which to start.
+ * @rdp: The rcu_data corresponding to the CPU from which to start.
+ * @gp_seq_req: The gp_seq of the grace period to start.
+ *
+ * Start the specified grace period, as needed to handle newly arrived
+ * callbacks. The required future grace periods are recorded in each
+ * rcu_node structure's ->gp_seq_needed field. Returns true if there
+ * is reason to awaken the grace-period kthread.
*
- * This is also why RCU internally marks CPUs online during the
- * CPU_UP_PREPARE phase and offline during the CPU_DEAD phase.
+ * The caller must hold the specified rcu_node structure's ->lock, which
+ * is why the caller is responsible for waking the grace-period kthread.
*
- * Disable checking if in an NMI handler because we cannot safely report
- * errors from NMI handlers anyway.
+ * Returns true if the GP thread needs to be awakened else false.
*/
-bool rcu_lockdep_current_cpu_online(void)
+static bool rcu_start_this_gp(struct rcu_node *rnp_start, struct rcu_data *rdp,
+ unsigned long gp_seq_req)
{
- struct rcu_data *rdp;
+ bool ret = false;
struct rcu_node *rnp;
- bool ret;
- if (in_nmi())
- return true;
- preempt_disable();
- rdp = this_cpu_ptr(&rcu_sched_data);
- rnp = rdp->mynode;
- ret = (rdp->grpmask & rcu_rnp_online_cpus(rnp)) ||
- !rcu_scheduler_fully_active;
- preempt_enable();
+ /*
+ * Use funnel locking to either acquire the root rcu_node
+ * structure's lock or bail out if the need for this grace period
+ * has already been recorded -- or if that grace period has in
+ * fact already started. If there is already a grace period in
+ * progress in a non-leaf node, no recording is needed because the
+ * end of the grace period will scan the leaf rcu_node structures.
+ * Note that rnp_start->lock must not be released.
+ */
+ raw_lockdep_assert_held_rcu_node(rnp_start);
+ trace_rcu_this_gp(rnp_start, rdp, gp_seq_req, TPS("Startleaf"));
+ for (rnp = rnp_start; 1; rnp = rnp->parent) {
+ if (rnp != rnp_start)
+ raw_spin_lock_rcu_node(rnp);
+ if (ULONG_CMP_GE(rnp->gp_seq_needed, gp_seq_req) ||
+ rcu_seq_started(&rnp->gp_seq, gp_seq_req) ||
+ (rnp != rnp_start &&
+ rcu_seq_state(rcu_seq_current(&rnp->gp_seq)))) {
+ trace_rcu_this_gp(rnp, rdp, gp_seq_req,
+ TPS("Prestarted"));
+ goto unlock_out;
+ }
+ WRITE_ONCE(rnp->gp_seq_needed, gp_seq_req);
+ if (rcu_seq_state(rcu_seq_current(&rnp->gp_seq))) {
+ /*
+ * We just marked the leaf or internal node, and a
+ * grace period is in progress, which means that
+ * rcu_gp_cleanup() will see the marking. Bail to
+ * reduce contention.
+ */
+ trace_rcu_this_gp(rnp_start, rdp, gp_seq_req,
+ TPS("Startedleaf"));
+ goto unlock_out;
+ }
+ if (rnp != rnp_start && rnp->parent != NULL)
+ raw_spin_unlock_rcu_node(rnp);
+ if (!rnp->parent)
+ break; /* At root, and perhaps also leaf. */
+ }
+
+ /* If GP already in progress, just leave, otherwise start one. */
+ if (rcu_gp_in_progress()) {
+ trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("Startedleafroot"));
+ goto unlock_out;
+ }
+ trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("Startedroot"));
+ WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags | RCU_GP_FLAG_INIT);
+ WRITE_ONCE(rcu_state.gp_req_activity, jiffies);
+ if (!READ_ONCE(rcu_state.gp_kthread)) {
+ trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("NoGPkthread"));
+ goto unlock_out;
+ }
+ trace_rcu_grace_period(rcu_state.name, data_race(rcu_state.gp_seq), TPS("newreq"));
+ ret = true; /* Caller must wake GP kthread. */
+unlock_out:
+ /* Push furthest requested GP to leaf node and rcu_data structure. */
+ if (ULONG_CMP_LT(gp_seq_req, rnp->gp_seq_needed)) {
+ WRITE_ONCE(rnp_start->gp_seq_needed, rnp->gp_seq_needed);
+ WRITE_ONCE(rdp->gp_seq_needed, rnp->gp_seq_needed);
+ }
+ if (rnp != rnp_start)
+ raw_spin_unlock_rcu_node(rnp);
return ret;
}
-EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
-
-#endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */
-/**
- * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle
- *
- * If the current CPU is idle or running at a first-level (not nested)
- * interrupt from idle, return true. The caller must have at least
- * disabled preemption.
+/*
+ * Clean up any old requests for the just-ended grace period. Also return
+ * whether any additional grace periods have been requested.
*/
-static int rcu_is_cpu_rrupt_from_idle(void)
+static bool rcu_future_gp_cleanup(struct rcu_node *rnp)
{
- return __this_cpu_read(rcu_dynticks.dynticks_nesting) <= 1;
+ bool needmore;
+ struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
+
+ needmore = ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed);
+ if (!needmore)
+ rnp->gp_seq_needed = rnp->gp_seq; /* Avoid counter wrap. */
+ trace_rcu_this_gp(rnp, rdp, rnp->gp_seq,
+ needmore ? TPS("CleanupMore") : TPS("Cleanup"));
+ return needmore;
}
/*
- * Snapshot the specified CPU's dynticks counter so that we can later
- * credit them with an implicit quiescent state. Return 1 if this CPU
- * is in dynticks idle mode, which is an extended quiescent state.
+ * Awaken the grace-period kthread. Don't do a self-awaken (unless in an
+ * interrupt or softirq handler, in which case we just might immediately
+ * sleep upon return, resulting in a grace-period hang), and don't bother
+ * awakening when there is nothing for the grace-period kthread to do
+ * (as in several CPUs raced to awaken, we lost), and finally don't try
+ * to awaken a kthread that has not yet been created. If all those checks
+ * are passed, track some debug information and awaken.
+ *
+ * So why do the self-wakeup when in an interrupt or softirq handler
+ * in the grace-period kthread's context? Because the kthread might have
+ * been interrupted just as it was going to sleep, and just after the final
+ * pre-sleep check of the awaken condition. In this case, a wakeup really
+ * is required, and is therefore supplied.
*/
-static int dyntick_save_progress_counter(struct rcu_data *rdp,
- bool *isidle, unsigned long *maxj)
+static void rcu_gp_kthread_wake(void)
{
- rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
- rcu_sysidle_check_cpu(rdp, isidle, maxj);
- if ((rdp->dynticks_snap & 0x1) == 0) {
- trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
- return 1;
- } else {
- if (ULONG_CMP_LT(ACCESS_ONCE(rdp->gpnum) + ULONG_MAX / 4,
- rdp->mynode->gpnum))
- ACCESS_ONCE(rdp->gpwrap) = true;
- return 0;
- }
+ struct task_struct *t = READ_ONCE(rcu_state.gp_kthread);
+
+ if ((current == t && !in_hardirq() && !in_serving_softirq()) ||
+ !READ_ONCE(rcu_state.gp_flags) || !t)
+ return;
+ WRITE_ONCE(rcu_state.gp_wake_time, jiffies);
+ WRITE_ONCE(rcu_state.gp_wake_seq, READ_ONCE(rcu_state.gp_seq));
+ swake_up_one(&rcu_state.gp_wq);
}
/*
- * Return true if the specified CPU has passed through a quiescent
- * state by virtue of being in or having passed through an dynticks
- * idle state since the last call to dyntick_save_progress_counter()
- * for this same CPU, or by virtue of having been offline.
+ * If there is room, assign a ->gp_seq number to any callbacks on this
+ * CPU that have not already been assigned. Also accelerate any callbacks
+ * that were previously assigned a ->gp_seq number that has since proven
+ * to be too conservative, which can happen if callbacks get assigned a
+ * ->gp_seq number while RCU is idle, but with reference to a non-root
+ * rcu_node structure. This function is idempotent, so it does not hurt
+ * to call it repeatedly. Returns an flag saying that we should awaken
+ * the RCU grace-period kthread.
+ *
+ * The caller must hold rnp->lock with interrupts disabled.
*/
-static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
- bool *isidle, unsigned long *maxj)
+static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
{
- unsigned int curr;
- int *rcrmp;
- unsigned int snap;
+ unsigned long gp_seq_req;
+ bool ret = false;
- curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks);
- snap = (unsigned int)rdp->dynticks_snap;
+ rcu_lockdep_assert_cblist_protected(rdp);
+ raw_lockdep_assert_held_rcu_node(rnp);
- /*
- * If the CPU passed through or entered a dynticks idle phase with
- * no active irq/NMI handlers, then we can safely pretend that the CPU
- * already acknowledged the request to pass through a quiescent
- * state. Either way, that CPU cannot possibly be in an RCU
- * read-side critical section that started before the beginning
- * of the current RCU grace period.
- */
- if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) {
- trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
- rdp->dynticks_fqs++;
- return 1;
- }
+ /* If no pending (not yet ready to invoke) callbacks, nothing to do. */
+ if (!rcu_segcblist_pend_cbs(&rdp->cblist))
+ return false;
- /*
- * Check for the CPU being offline, but only if the grace period
- * is old enough. We don't need to worry about the CPU changing
- * state: If we see it offline even once, it has been through a
- * quiescent state.
- *
- * The reason for insisting that the grace period be at least
- * one jiffy old is that CPUs that are not quite online and that
- * have just gone offline can still execute RCU read-side critical
- * sections.
- */
- if (ULONG_CMP_GE(rdp->rsp->gp_start + 2, jiffies))
- return 0; /* Grace period is not old enough. */
- barrier();
- if (cpu_is_offline(rdp->cpu)) {
- trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("ofl"));
- rdp->offline_fqs++;
- return 1;
- }
+ trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbPreAcc"));
/*
- * A CPU running for an extended time within the kernel can
- * delay RCU grace periods. When the CPU is in NO_HZ_FULL mode,
- * even context-switching back and forth between a pair of
- * in-kernel CPU-bound tasks cannot advance grace periods.
- * So if the grace period is old enough, make the CPU pay attention.
- * Note that the unsynchronized assignments to the per-CPU
- * rcu_sched_qs_mask variable are safe. Yes, setting of
- * bits can be lost, but they will be set again on the next
- * force-quiescent-state pass. So lost bit sets do not result
- * in incorrect behavior, merely in a grace period lasting
- * a few jiffies longer than it might otherwise. Because
- * there are at most four threads involved, and because the
- * updates are only once every few jiffies, the probability of
- * lossage (and thus of slight grace-period extension) is
- * quite low.
- *
- * Note that if the jiffies_till_sched_qs boot/sysfs parameter
- * is set too high, we override with half of the RCU CPU stall
- * warning delay.
+ * Callbacks are often registered with incomplete grace-period
+ * information. Something about the fact that getting exact
+ * information requires acquiring a global lock... RCU therefore
+ * makes a conservative estimate of the grace period number at which
+ * a given callback will become ready to invoke. The following
+ * code checks this estimate and improves it when possible, thus
+ * accelerating callback invocation to an earlier grace-period
+ * number.
*/
- rcrmp = &per_cpu(rcu_sched_qs_mask, rdp->cpu);
- if (ULONG_CMP_GE(jiffies,
- rdp->rsp->gp_start + jiffies_till_sched_qs) ||
- ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
- if (!(ACCESS_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) {
- ACCESS_ONCE(rdp->cond_resched_completed) =
- ACCESS_ONCE(rdp->mynode->completed);
- smp_mb(); /* ->cond_resched_completed before *rcrmp. */
- ACCESS_ONCE(*rcrmp) =
- ACCESS_ONCE(*rcrmp) + rdp->rsp->flavor_mask;
- resched_cpu(rdp->cpu); /* Force CPU into scheduler. */
- rdp->rsp->jiffies_resched += 5; /* Enable beating. */
- } else if (ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
- /* Time to beat on that CPU again! */
- resched_cpu(rdp->cpu); /* Force CPU into scheduler. */
- rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */
- }
- }
+ gp_seq_req = rcu_seq_snap(&rcu_state.gp_seq);
+ if (rcu_segcblist_accelerate(&rdp->cblist, gp_seq_req))
+ ret = rcu_start_this_gp(rnp, rdp, gp_seq_req);
- return 0;
+ /* Trace depending on how much we were able to accelerate. */
+ if (rcu_segcblist_restempty(&rdp->cblist, RCU_WAIT_TAIL))
+ trace_rcu_grace_period(rcu_state.name, gp_seq_req, TPS("AccWaitCB"));
+ else
+ trace_rcu_grace_period(rcu_state.name, gp_seq_req, TPS("AccReadyCB"));
+
+ trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbPostAcc"));
+
+ return ret;
}
-static void record_gp_stall_check_time(struct rcu_state *rsp)
+/*
+ * Similar to rcu_accelerate_cbs(), but does not require that the leaf
+ * rcu_node structure's ->lock be held. It consults the cached value
+ * of ->gp_seq_needed in the rcu_data structure, and if that indicates
+ * that a new grace-period request be made, invokes rcu_accelerate_cbs()
+ * while holding the leaf rcu_node structure's ->lock.
+ */
+static void rcu_accelerate_cbs_unlocked(struct rcu_node *rnp,
+ struct rcu_data *rdp)
{
- unsigned long j = jiffies;
- unsigned long j1;
+ unsigned long c;
+ bool needwake;
- rsp->gp_start = j;
- smp_wmb(); /* Record start time before stall time. */
- j1 = rcu_jiffies_till_stall_check();
- ACCESS_ONCE(rsp->jiffies_stall) = j + j1;
- rsp->jiffies_resched = j + j1 / 2;
- rsp->n_force_qs_gpstart = ACCESS_ONCE(rsp->n_force_qs);
+ rcu_lockdep_assert_cblist_protected(rdp);
+ c = rcu_seq_snap(&rcu_state.gp_seq);
+ if (!READ_ONCE(rdp->gpwrap) && ULONG_CMP_GE(rdp->gp_seq_needed, c)) {
+ /* Old request still live, so mark recent callbacks. */
+ (void)rcu_segcblist_accelerate(&rdp->cblist, c);
+ return;
+ }
+ raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
+ needwake = rcu_accelerate_cbs(rnp, rdp);
+ raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
+ if (needwake)
+ rcu_gp_kthread_wake();
}
/*
- * Complain about starvation of grace-period kthread.
+ * Move any callbacks whose grace period has completed to the
+ * RCU_DONE_TAIL sublist, then compact the remaining sublists and
+ * assign ->gp_seq numbers to any callbacks in the RCU_NEXT_TAIL
+ * sublist. This function is idempotent, so it does not hurt to
+ * invoke it repeatedly. As long as it is not invoked -too- often...
+ * Returns true if the RCU grace-period kthread needs to be awakened.
+ *
+ * The caller must hold rnp->lock with interrupts disabled.
*/
-static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp)
+static bool rcu_advance_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
{
- unsigned long gpa;
- unsigned long j;
+ rcu_lockdep_assert_cblist_protected(rdp);
+ raw_lockdep_assert_held_rcu_node(rnp);
+
+ /* If no pending (not yet ready to invoke) callbacks, nothing to do. */
+ if (!rcu_segcblist_pend_cbs(&rdp->cblist))
+ return false;
+
+ /*
+ * Find all callbacks whose ->gp_seq numbers indicate that they
+ * are ready to invoke, and put them into the RCU_DONE_TAIL sublist.
+ */
+ rcu_segcblist_advance(&rdp->cblist, rnp->gp_seq);
- j = jiffies;
- gpa = ACCESS_ONCE(rsp->gp_activity);
- if (j - gpa > 2 * HZ)
- pr_err("%s kthread starved for %ld jiffies!\n",
- rsp->name, j - gpa);
+ /* Classify any remaining callbacks. */
+ return rcu_accelerate_cbs(rnp, rdp);
}
/*
- * Dump stacks of all tasks running on stalled CPUs.
+ * Move and classify callbacks, but only if doing so won't require
+ * that the RCU grace-period kthread be awakened.
*/
-static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
+static void __maybe_unused rcu_advance_cbs_nowake(struct rcu_node *rnp,
+ struct rcu_data *rdp)
{
- int cpu;
- unsigned long flags;
- struct rcu_node *rnp;
+ rcu_lockdep_assert_cblist_protected(rdp);
+ if (!rcu_seq_state(rcu_seq_current(&rnp->gp_seq)) || !raw_spin_trylock_rcu_node(rnp))
+ return;
+ // The grace period cannot end while we hold the rcu_node lock.
+ if (rcu_seq_state(rcu_seq_current(&rnp->gp_seq)))
+ WARN_ON_ONCE(rcu_advance_cbs(rnp, rdp));
+ raw_spin_unlock_rcu_node(rnp);
+}
- rcu_for_each_leaf_node(rsp, rnp) {
- raw_spin_lock_irqsave(&rnp->lock, flags);
- if (rnp->qsmask != 0) {
- for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
- if (rnp->qsmask & (1UL << cpu))
- dump_cpu_task(rnp->grplo + cpu);
- }
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+/*
+ * In CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels, attempt to generate a
+ * quiescent state. This is intended to be invoked when the CPU notices
+ * a new grace period.
+ */
+static void rcu_strict_gp_check_qs(void)
+{
+ if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) {
+ rcu_read_lock();
+ rcu_read_unlock();
}
}
-static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
+/*
+ * Update CPU-local rcu_data state to record the beginnings and ends of
+ * grace periods. The caller must hold the ->lock of the leaf rcu_node
+ * structure corresponding to the current CPU, and must have irqs disabled.
+ * Returns true if the grace-period kthread needs to be awakened.
+ */
+static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp)
{
- int cpu;
- long delta;
- unsigned long flags;
- unsigned long gpa;
- unsigned long j;
- int ndetected = 0;
- struct rcu_node *rnp = rcu_get_root(rsp);
- long totqlen = 0;
+ bool ret = false;
+ bool need_qs;
+ const bool offloaded = rcu_rdp_is_offloaded(rdp);
- /* Only let one CPU complain about others per time interval. */
+ raw_lockdep_assert_held_rcu_node(rnp);
- raw_spin_lock_irqsave(&rnp->lock, flags);
- delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall);
- if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) {
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- return;
- }
- ACCESS_ONCE(rsp->jiffies_stall) = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ if (rdp->gp_seq == rnp->gp_seq)
+ return false; /* Nothing to do. */
- /*
- * OK, time to rat on our buddy...
- * See Documentation/RCU/stallwarn.txt for info on how to debug
- * RCU CPU stall warnings.
- */
- pr_err("INFO: %s detected stalls on CPUs/tasks:",
- rsp->name);
- print_cpu_stall_info_begin();
- rcu_for_each_leaf_node(rsp, rnp) {
- raw_spin_lock_irqsave(&rnp->lock, flags);
- ndetected += rcu_print_task_stall(rnp);
- if (rnp->qsmask != 0) {
- for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
- if (rnp->qsmask & (1UL << cpu)) {
- print_cpu_stall_info(rsp,
- rnp->grplo + cpu);
- ndetected++;
- }
- }
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- }
-
- print_cpu_stall_info_end();
- for_each_possible_cpu(cpu)
- totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
- pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n",
- smp_processor_id(), (long)(jiffies - rsp->gp_start),
- (long)rsp->gpnum, (long)rsp->completed, totqlen);
- if (ndetected) {
- rcu_dump_cpu_stacks(rsp);
+ /* Handle the ends of any preceding grace periods first. */
+ if (rcu_seq_completed_gp(rdp->gp_seq, rnp->gp_seq) ||
+ unlikely(rdp->gpwrap)) {
+ if (!offloaded)
+ ret = rcu_advance_cbs(rnp, rdp); /* Advance CBs. */
+ rdp->core_needs_qs = false;
+ trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuend"));
} else {
- if (ACCESS_ONCE(rsp->gpnum) != gpnum ||
- ACCESS_ONCE(rsp->completed) == gpnum) {
- pr_err("INFO: Stall ended before state dump start\n");
- } else {
- j = jiffies;
- gpa = ACCESS_ONCE(rsp->gp_activity);
- pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n",
- rsp->name, j - gpa, j, gpa,
- jiffies_till_next_fqs,
- rcu_get_root(rsp)->qsmask);
- /* In this case, the current CPU might be at fault. */
- sched_show_task(current);
- }
+ if (!offloaded)
+ ret = rcu_accelerate_cbs(rnp, rdp); /* Recent CBs. */
+ if (rdp->core_needs_qs)
+ rdp->core_needs_qs = !!(rnp->qsmask & rdp->grpmask);
}
- /* Complain about tasks blocking the grace period. */
- rcu_print_detail_task_stall(rsp);
-
- rcu_check_gp_kthread_starvation(rsp);
-
- force_quiescent_state(rsp); /* Kick them all. */
+ /* Now handle the beginnings of any new-to-this-CPU grace periods. */
+ if (rcu_seq_new_gp(rdp->gp_seq, rnp->gp_seq) ||
+ unlikely(rdp->gpwrap)) {
+ /*
+ * If the current grace period is waiting for this CPU,
+ * set up to detect a quiescent state, otherwise don't
+ * go looking for one.
+ */
+ trace_rcu_grace_period(rcu_state.name, rnp->gp_seq, TPS("cpustart"));
+ need_qs = !!(rnp->qsmask & rdp->grpmask);
+ rdp->cpu_no_qs.b.norm = need_qs;
+ rdp->core_needs_qs = need_qs;
+ zero_cpu_stall_ticks(rdp);
+ }
+ rdp->gp_seq = rnp->gp_seq; /* Remember new grace-period state. */
+ if (ULONG_CMP_LT(rdp->gp_seq_needed, rnp->gp_seq_needed) || rdp->gpwrap)
+ WRITE_ONCE(rdp->gp_seq_needed, rnp->gp_seq_needed);
+ if (IS_ENABLED(CONFIG_PROVE_RCU) && rdp->gpwrap)
+ WRITE_ONCE(rdp->last_sched_clock, jiffies);
+ WRITE_ONCE(rdp->gpwrap, false);
+ rcu_gpnum_ovf(rnp, rdp);
+ return ret;
}
-static void print_cpu_stall(struct rcu_state *rsp)
+static void note_gp_changes(struct rcu_data *rdp)
{
- int cpu;
unsigned long flags;
- struct rcu_node *rnp = rcu_get_root(rsp);
- long totqlen = 0;
-
- /*
- * OK, time to rat on ourselves...
- * See Documentation/RCU/stallwarn.txt for info on how to debug
- * RCU CPU stall warnings.
- */
- pr_err("INFO: %s self-detected stall on CPU", rsp->name);
- print_cpu_stall_info_begin();
- print_cpu_stall_info(rsp, smp_processor_id());
- print_cpu_stall_info_end();
- for_each_possible_cpu(cpu)
- totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
- pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n",
- jiffies - rsp->gp_start,
- (long)rsp->gpnum, (long)rsp->completed, totqlen);
+ bool needwake;
+ struct rcu_node *rnp;
- rcu_check_gp_kthread_starvation(rsp);
+ local_irq_save(flags);
+ rnp = rdp->mynode;
+ if ((rdp->gp_seq == rcu_seq_current(&rnp->gp_seq) &&
+ !unlikely(READ_ONCE(rdp->gpwrap))) || /* w/out lock. */
+ !raw_spin_trylock_rcu_node(rnp)) { /* irqs already off, so later. */
+ local_irq_restore(flags);
+ return;
+ }
+ needwake = __note_gp_changes(rnp, rdp);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ rcu_strict_gp_check_qs();
+ if (needwake)
+ rcu_gp_kthread_wake();
+}
- rcu_dump_cpu_stacks(rsp);
+static atomic_t *rcu_gp_slow_suppress;
- raw_spin_lock_irqsave(&rnp->lock, flags);
- if (ULONG_CMP_GE(jiffies, ACCESS_ONCE(rsp->jiffies_stall)))
- ACCESS_ONCE(rsp->jiffies_stall) = jiffies +
- 3 * rcu_jiffies_till_stall_check() + 3;
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+/* Register a counter to suppress debugging grace-period delays. */
+void rcu_gp_slow_register(atomic_t *rgssp)
+{
+ WARN_ON_ONCE(rcu_gp_slow_suppress);
- /*
- * Attempt to revive the RCU machinery by forcing a context switch.
- *
- * A context switch would normally allow the RCU state machine to make
- * progress and it could be we're stuck in kernel space without context
- * switches for an entirely unreasonable amount of time.
- */
- resched_cpu(smp_processor_id());
+ WRITE_ONCE(rcu_gp_slow_suppress, rgssp);
}
+EXPORT_SYMBOL_GPL(rcu_gp_slow_register);
-static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
+/* Unregister a counter, with NULL for not caring which. */
+void rcu_gp_slow_unregister(atomic_t *rgssp)
{
- unsigned long completed;
- unsigned long gpnum;
- unsigned long gps;
- unsigned long j;
- unsigned long js;
- struct rcu_node *rnp;
+ WARN_ON_ONCE(rgssp && rgssp != rcu_gp_slow_suppress && rcu_gp_slow_suppress != NULL);
- if (rcu_cpu_stall_suppress || !rcu_gp_in_progress(rsp))
- return;
- j = jiffies;
+ WRITE_ONCE(rcu_gp_slow_suppress, NULL);
+}
+EXPORT_SYMBOL_GPL(rcu_gp_slow_unregister);
- /*
- * Lots of memory barriers to reject false positives.
- *
- * The idea is to pick up rsp->gpnum, then rsp->jiffies_stall,
- * then rsp->gp_start, and finally rsp->completed. These values
- * are updated in the opposite order with memory barriers (or
- * equivalent) during grace-period initialization and cleanup.
- * Now, a false positive can occur if we get an new value of
- * rsp->gp_start and a old value of rsp->jiffies_stall. But given
- * the memory barriers, the only way that this can happen is if one
- * grace period ends and another starts between these two fetches.
- * Detect this by comparing rsp->completed with the previous fetch
- * from rsp->gpnum.
- *
- * Given this check, comparisons of jiffies, rsp->jiffies_stall,
- * and rsp->gp_start suffice to forestall false positives.
- */
- gpnum = ACCESS_ONCE(rsp->gpnum);
- smp_rmb(); /* Pick up ->gpnum first... */
- js = ACCESS_ONCE(rsp->jiffies_stall);
- smp_rmb(); /* ...then ->jiffies_stall before the rest... */
- gps = ACCESS_ONCE(rsp->gp_start);
- smp_rmb(); /* ...and finally ->gp_start before ->completed. */
- completed = ACCESS_ONCE(rsp->completed);
- if (ULONG_CMP_GE(completed, gpnum) ||
- ULONG_CMP_LT(j, js) ||
- ULONG_CMP_GE(gps, js))
- return; /* No stall or GP completed since entering function. */
- rnp = rdp->mynode;
- if (rcu_gp_in_progress(rsp) &&
- (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask)) {
+static bool rcu_gp_slow_is_suppressed(void)
+{
+ atomic_t *rgssp = READ_ONCE(rcu_gp_slow_suppress);
- /* We haven't checked in, so go dump stack. */
- print_cpu_stall(rsp);
+ return rgssp && atomic_read(rgssp);
+}
+
+static void rcu_gp_slow(int delay)
+{
+ if (!rcu_gp_slow_is_suppressed() && delay > 0 &&
+ !(rcu_seq_ctr(rcu_state.gp_seq) % (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay)))
+ schedule_timeout_idle(delay);
+}
- } else if (rcu_gp_in_progress(rsp) &&
- ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) {
+static unsigned long sleep_duration;
- /* They had a few time units to dump stack, so complain. */
- print_other_cpu_stall(rsp, gpnum);
- }
+/* Allow rcutorture to stall the grace-period kthread. */
+void rcu_gp_set_torture_wait(int duration)
+{
+ if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST) && duration > 0)
+ WRITE_ONCE(sleep_duration, duration);
}
+EXPORT_SYMBOL_GPL(rcu_gp_set_torture_wait);
-/**
- * rcu_cpu_stall_reset - prevent further stall warnings in current grace period
- *
- * Set the stall-warning timeout way off into the future, thus preventing
- * any RCU CPU stall-warning messages from appearing in the current set of
- * RCU grace periods.
- *
- * The caller must disable hard irqs.
- */
-void rcu_cpu_stall_reset(void)
+/* Actually implement the aforementioned wait. */
+static void rcu_gp_torture_wait(void)
{
- struct rcu_state *rsp;
+ unsigned long duration;
- for_each_rcu_flavor(rsp)
- ACCESS_ONCE(rsp->jiffies_stall) = jiffies + ULONG_MAX / 2;
+ if (!IS_ENABLED(CONFIG_RCU_TORTURE_TEST))
+ return;
+ duration = xchg(&sleep_duration, 0UL);
+ if (duration > 0) {
+ pr_alert("%s: Waiting %lu jiffies\n", __func__, duration);
+ schedule_timeout_idle(duration);
+ pr_alert("%s: Wait complete\n", __func__);
+ }
}
/*
- * Initialize the specified rcu_data structure's default callback list
- * to empty. The default callback list is the one that is not used by
- * no-callbacks CPUs.
+ * Handler for on_each_cpu() to invoke the target CPU's RCU core
+ * processing.
*/
-static void init_default_callback_list(struct rcu_data *rdp)
+static void rcu_strict_gp_boundary(void *unused)
{
- int i;
+ invoke_rcu_core();
+}
+
+// Make the polled API aware of the beginning of a grace period.
+static void rcu_poll_gp_seq_start(unsigned long *snap)
+{
+ struct rcu_node *rnp = rcu_get_root();
+
+ if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE)
+ raw_lockdep_assert_held_rcu_node(rnp);
+
+ // If RCU was idle, note beginning of GP.
+ if (!rcu_seq_state(rcu_state.gp_seq_polled))
+ rcu_seq_start(&rcu_state.gp_seq_polled);
- rdp->nxtlist = NULL;
- for (i = 0; i < RCU_NEXT_SIZE; i++)
- rdp->nxttail[i] = &rdp->nxtlist;
+ // Either way, record current state.
+ *snap = rcu_state.gp_seq_polled;
}
-/*
- * Initialize the specified rcu_data structure's callback list to empty.
- */
-static void init_callback_list(struct rcu_data *rdp)
+// Make the polled API aware of the end of a grace period.
+static void rcu_poll_gp_seq_end(unsigned long *snap)
{
- if (init_nocb_callback_list(rdp))
- return;
- init_default_callback_list(rdp);
+ struct rcu_node *rnp = rcu_get_root();
+
+ if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE)
+ raw_lockdep_assert_held_rcu_node(rnp);
+
+ // If the previously noted GP is still in effect, record the
+ // end of that GP. Either way, zero counter to avoid counter-wrap
+ // problems.
+ if (*snap && *snap == rcu_state.gp_seq_polled) {
+ rcu_seq_end(&rcu_state.gp_seq_polled);
+ rcu_state.gp_seq_polled_snap = 0;
+ rcu_state.gp_seq_polled_exp_snap = 0;
+ } else {
+ *snap = 0;
+ }
}
-/*
- * Determine the value that ->completed will have at the end of the
- * next subsequent grace period. This is used to tag callbacks so that
- * a CPU can invoke callbacks in a timely fashion even if that CPU has
- * been dyntick-idle for an extended period with callbacks under the
- * influence of RCU_FAST_NO_HZ.
- *
- * The caller must hold rnp->lock with interrupts disabled.
- */
-static unsigned long rcu_cbs_completed(struct rcu_state *rsp,
- struct rcu_node *rnp)
+// Make the polled API aware of the beginning of a grace period, but
+// where caller does not hold the root rcu_node structure's lock.
+static void rcu_poll_gp_seq_start_unlocked(unsigned long *snap)
{
- /*
- * If RCU is idle, we just wait for the next grace period.
- * But we can only be sure that RCU is idle if we are looking
- * at the root rcu_node structure -- otherwise, a new grace
- * period might have started, but just not yet gotten around
- * to initializing the current non-root rcu_node structure.
- */
- if (rcu_get_root(rsp) == rnp && rnp->gpnum == rnp->completed)
- return rnp->completed + 1;
+ unsigned long flags;
+ struct rcu_node *rnp = rcu_get_root();
- /*
- * Otherwise, wait for a possible partial grace period and
- * then the subsequent full grace period.
- */
- return rnp->completed + 2;
+ if (rcu_init_invoked()) {
+ if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE)
+ lockdep_assert_irqs_enabled();
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ }
+ rcu_poll_gp_seq_start(snap);
+ if (rcu_init_invoked())
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
-/*
- * Trace-event helper function for rcu_start_future_gp() and
- * rcu_nocb_wait_gp().
- */
-static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
- unsigned long c, const char *s)
+// Make the polled API aware of the end of a grace period, but where
+// caller does not hold the root rcu_node structure's lock.
+static void rcu_poll_gp_seq_end_unlocked(unsigned long *snap)
{
- trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum,
- rnp->completed, c, rnp->level,
- rnp->grplo, rnp->grphi, s);
+ unsigned long flags;
+ struct rcu_node *rnp = rcu_get_root();
+
+ if (rcu_init_invoked()) {
+ if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE)
+ lockdep_assert_irqs_enabled();
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ }
+ rcu_poll_gp_seq_end(snap);
+ if (rcu_init_invoked())
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
/*
- * Start some future grace period, as needed to handle newly arrived
- * callbacks. The required future grace periods are recorded in each
- * rcu_node structure's ->need_future_gp field. Returns true if there
- * is reason to awaken the grace-period kthread.
+ * There is a single llist, which is used for handling
+ * synchronize_rcu() users' enqueued rcu_synchronize nodes.
+ * Within this llist, there are two tail pointers:
+ *
+ * wait tail: Tracks the set of nodes, which need to
+ * wait for the current GP to complete.
+ * done tail: Tracks the set of nodes, for which grace
+ * period has elapsed. These nodes processing
+ * will be done as part of the cleanup work
+ * execution by a kworker.
+ *
+ * At every grace period init, a new wait node is added
+ * to the llist. This wait node is used as wait tail
+ * for this new grace period. Given that there are a fixed
+ * number of wait nodes, if all wait nodes are in use
+ * (which can happen when kworker callback processing
+ * is delayed) and additional grace period is requested.
+ * This means, a system is slow in processing callbacks.
+ *
+ * TODO: If a slow processing is detected, a first node
+ * in the llist should be used as a wait-tail for this
+ * grace period, therefore users which should wait due
+ * to a slow process are handled by _this_ grace period
+ * and not next.
+ *
+ * Below is an illustration of how the done and wait
+ * tail pointers move from one set of rcu_synchronize nodes
+ * to the other, as grace periods start and finish and
+ * nodes are processed by kworker.
+ *
+ *
+ * a. Initial llist callbacks list:
+ *
+ * +----------+ +--------+ +-------+
+ * | | | | | |
+ * | head |---------> | cb2 |--------->| cb1 |
+ * | | | | | |
+ * +----------+ +--------+ +-------+
+ *
+ *
+ *
+ * b. New GP1 Start:
+ *
+ * WAIT TAIL
+ * |
+ * |
+ * v
+ * +----------+ +--------+ +--------+ +-------+
+ * | | | | | | | |
+ * | head ------> wait |------> cb2 |------> | cb1 |
+ * | | | head1 | | | | |
+ * +----------+ +--------+ +--------+ +-------+
+ *
+ *
+ *
+ * c. GP completion:
+ *
+ * WAIT_TAIL == DONE_TAIL
+ *
+ * DONE TAIL
+ * |
+ * |
+ * v
+ * +----------+ +--------+ +--------+ +-------+
+ * | | | | | | | |
+ * | head ------> wait |------> cb2 |------> | cb1 |
+ * | | | head1 | | | | |
+ * +----------+ +--------+ +--------+ +-------+
+ *
+ *
+ *
+ * d. New callbacks and GP2 start:
+ *
+ * WAIT TAIL DONE TAIL
+ * | |
+ * | |
+ * v v
+ * +----------+ +------+ +------+ +------+ +-----+ +-----+ +-----+
+ * | | | | | | | | | | | | | |
+ * | head ------> wait |--->| cb4 |--->| cb3 |--->|wait |--->| cb2 |--->| cb1 |
+ * | | | head2| | | | | |head1| | | | |
+ * +----------+ +------+ +------+ +------+ +-----+ +-----+ +-----+
+ *
+ *
+ *
+ * e. GP2 completion:
+ *
+ * WAIT_TAIL == DONE_TAIL
+ * DONE TAIL
+ * |
+ * |
+ * v
+ * +----------+ +------+ +------+ +------+ +-----+ +-----+ +-----+
+ * | | | | | | | | | | | | | |
+ * | head ------> wait |--->| cb4 |--->| cb3 |--->|wait |--->| cb2 |--->| cb1 |
+ * | | | head2| | | | | |head1| | | | |
+ * +----------+ +------+ +------+ +------+ +-----+ +-----+ +-----+
+ *
+ *
+ * While the llist state transitions from d to e, a kworker
+ * can start executing rcu_sr_normal_gp_cleanup_work() and
+ * can observe either the old done tail (@c) or the new
+ * done tail (@e). So, done tail updates and reads need
+ * to use the rel-acq semantics. If the concurrent kworker
+ * observes the old done tail, the newly queued work
+ * execution will process the updated done tail. If the
+ * concurrent kworker observes the new done tail, then
+ * the newly queued work will skip processing the done
+ * tail, as workqueue semantics guarantees that the new
+ * work is executed only after the previous one completes.
+ *
+ * f. kworker callbacks processing complete:
+ *
+ *
+ * DONE TAIL
+ * |
+ * |
+ * v
+ * +----------+ +--------+
+ * | | | |
+ * | head ------> wait |
+ * | | | head2 |
+ * +----------+ +--------+
*
- * The caller must hold the specified rcu_node structure's ->lock.
*/
-static bool __maybe_unused
-rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
- unsigned long *c_out)
+static bool rcu_sr_is_wait_head(struct llist_node *node)
{
- unsigned long c;
+ return &(rcu_state.srs_wait_nodes)[0].node <= node &&
+ node <= &(rcu_state.srs_wait_nodes)[SR_NORMAL_GP_WAIT_HEAD_MAX - 1].node;
+}
+
+static struct llist_node *rcu_sr_get_wait_head(void)
+{
+ struct sr_wait_node *sr_wn;
int i;
- bool ret = false;
- struct rcu_node *rnp_root = rcu_get_root(rdp->rsp);
- /*
- * Pick up grace-period number for new callbacks. If this
- * grace period is already marked as needed, return to the caller.
- */
- c = rcu_cbs_completed(rdp->rsp, rnp);
- trace_rcu_future_gp(rnp, rdp, c, TPS("Startleaf"));
- if (rnp->need_future_gp[c & 0x1]) {
- trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartleaf"));
- goto out;
- }
+ for (i = 0; i < SR_NORMAL_GP_WAIT_HEAD_MAX; i++) {
+ sr_wn = &(rcu_state.srs_wait_nodes)[i];
- /*
- * If either this rcu_node structure or the root rcu_node structure
- * believe that a grace period is in progress, then we must wait
- * for the one following, which is in "c". Because our request
- * will be noticed at the end of the current grace period, we don't
- * need to explicitly start one. We only do the lockless check
- * of rnp_root's fields if the current rcu_node structure thinks
- * there is no grace period in flight, and because we hold rnp->lock,
- * the only possible change is when rnp_root's two fields are
- * equal, in which case rnp_root->gpnum might be concurrently
- * incremented. But that is OK, as it will just result in our
- * doing some extra useless work.
- */
- if (rnp->gpnum != rnp->completed ||
- ACCESS_ONCE(rnp_root->gpnum) != ACCESS_ONCE(rnp_root->completed)) {
- rnp->need_future_gp[c & 0x1]++;
- trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf"));
- goto out;
+ if (!atomic_cmpxchg_acquire(&sr_wn->inuse, 0, 1))
+ return &sr_wn->node;
}
- /*
- * There might be no grace period in progress. If we don't already
- * hold it, acquire the root rcu_node structure's lock in order to
- * start one (if needed).
- */
- if (rnp != rnp_root) {
- raw_spin_lock(&rnp_root->lock);
- smp_mb__after_unlock_lock();
- }
+ return NULL;
+}
- /*
- * Get a new grace-period number. If there really is no grace
- * period in progress, it will be smaller than the one we obtained
- * earlier. Adjust callbacks as needed. Note that even no-CBs
- * CPUs have a ->nxtcompleted[] array, so no no-CBs checks needed.
- */
- c = rcu_cbs_completed(rdp->rsp, rnp_root);
- for (i = RCU_DONE_TAIL; i < RCU_NEXT_TAIL; i++)
- if (ULONG_CMP_LT(c, rdp->nxtcompleted[i]))
- rdp->nxtcompleted[i] = c;
+static void rcu_sr_put_wait_head(struct llist_node *node)
+{
+ struct sr_wait_node *sr_wn = container_of(node, struct sr_wait_node, node);
- /*
- * If the needed for the required grace period is already
- * recorded, trace and leave.
- */
- if (rnp_root->need_future_gp[c & 0x1]) {
- trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartedroot"));
- goto unlock_out;
- }
+ atomic_set_release(&sr_wn->inuse, 0);
+}
- /* Record the need for the future grace period. */
- rnp_root->need_future_gp[c & 0x1]++;
+/* Enable rcu_normal_wake_from_gp automatically on small systems. */
+#define WAKE_FROM_GP_CPU_THRESHOLD 16
- /* If a grace period is not already in progress, start one. */
- if (rnp_root->gpnum != rnp_root->completed) {
- trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleafroot"));
- } else {
- trace_rcu_future_gp(rnp, rdp, c, TPS("Startedroot"));
- ret = rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp);
- }
-unlock_out:
- if (rnp != rnp_root)
- raw_spin_unlock(&rnp_root->lock);
-out:
- if (c_out != NULL)
- *c_out = c;
- return ret;
-}
+static int rcu_normal_wake_from_gp = -1;
+module_param(rcu_normal_wake_from_gp, int, 0644);
+static struct workqueue_struct *sync_wq;
-/*
- * Clean up any old requests for the just-ended grace period. Also return
- * whether any additional grace periods have been requested. Also invoke
- * rcu_nocb_gp_cleanup() in order to wake up any no-callbacks kthreads
- * waiting for this grace period to complete.
- */
-static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
+static void rcu_sr_normal_complete(struct llist_node *node)
{
- int c = rnp->completed;
- int needmore;
- struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
+ struct rcu_synchronize *rs = container_of(
+ (struct rcu_head *) node, struct rcu_synchronize, head);
- rcu_nocb_gp_cleanup(rsp, rnp);
- rnp->need_future_gp[c & 0x1] = 0;
- needmore = rnp->need_future_gp[(c + 1) & 0x1];
- trace_rcu_future_gp(rnp, rdp, c,
- needmore ? TPS("CleanupMore") : TPS("Cleanup"));
- return needmore;
-}
+ WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) &&
+ !poll_state_synchronize_rcu_full(&rs->oldstate),
+ "A full grace period is not passed yet!\n");
-/*
- * Awaken the grace-period kthread for the specified flavor of RCU.
- * Don't do a self-awaken, and don't bother awakening when there is
- * nothing for the grace-period kthread to do (as in several CPUs
- * raced to awaken, and we lost), and finally don't try to awaken
- * a kthread that has not yet been created.
- */
-static void rcu_gp_kthread_wake(struct rcu_state *rsp)
-{
- if (current == rsp->gp_kthread ||
- !ACCESS_ONCE(rsp->gp_flags) ||
- !rsp->gp_kthread)
- return;
- wake_up(&rsp->gp_wq);
+ /* Finally. */
+ complete(&rs->completion);
}
-/*
- * If there is room, assign a ->completed number to any callbacks on
- * this CPU that have not already been assigned. Also accelerate any
- * callbacks that were previously assigned a ->completed number that has
- * since proven to be too conservative, which can happen if callbacks get
- * assigned a ->completed number while RCU is idle, but with reference to
- * a non-root rcu_node structure. This function is idempotent, so it does
- * not hurt to call it repeatedly. Returns an flag saying that we should
- * awaken the RCU grace-period kthread.
- *
- * The caller must hold rnp->lock with interrupts disabled.
- */
-static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
- struct rcu_data *rdp)
+static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work)
{
- unsigned long c;
- int i;
- bool ret;
-
- /* If the CPU has no callbacks, nothing to do. */
- if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
- return false;
+ struct llist_node *done, *rcu, *next, *head;
/*
- * Starting from the sublist containing the callbacks most
- * recently assigned a ->completed number and working down, find the
- * first sublist that is not assignable to an upcoming grace period.
- * Such a sublist has something in it (first two tests) and has
- * a ->completed number assigned that will complete sooner than
- * the ->completed number for newly arrived callbacks (last test).
+ * This work execution can potentially execute
+ * while a new done tail is being updated by
+ * grace period kthread in rcu_sr_normal_gp_cleanup().
+ * So, read and updates of done tail need to
+ * follow acq-rel semantics.
*
- * The key point is that any later sublist can be assigned the
- * same ->completed number as the newly arrived callbacks, which
- * means that the callbacks in any of these later sublist can be
- * grouped into a single sublist, whether or not they have already
- * been assigned a ->completed number.
+ * Given that wq semantics guarantees that a single work
+ * cannot execute concurrently by multiple kworkers,
+ * the done tail list manipulations are protected here.
*/
- c = rcu_cbs_completed(rsp, rnp);
- for (i = RCU_NEXT_TAIL - 1; i > RCU_DONE_TAIL; i--)
- if (rdp->nxttail[i] != rdp->nxttail[i - 1] &&
- !ULONG_CMP_GE(rdp->nxtcompleted[i], c))
- break;
+ done = smp_load_acquire(&rcu_state.srs_done_tail);
+ if (WARN_ON_ONCE(!done))
+ return;
- /*
- * If there are no sublist for unassigned callbacks, leave.
- * At the same time, advance "i" one sublist, so that "i" will
- * index into the sublist where all the remaining callbacks should
- * be grouped into.
- */
- if (++i >= RCU_NEXT_TAIL)
- return false;
+ WARN_ON_ONCE(!rcu_sr_is_wait_head(done));
+ head = done->next;
+ done->next = NULL;
/*
- * Assign all subsequent callbacks' ->completed number to the next
- * full grace period and group them all in the sublist initially
- * indexed by "i".
+ * The dummy node, which is pointed to by the
+ * done tail which is acq-read above is not removed
+ * here. This allows lockless additions of new
+ * rcu_synchronize nodes in rcu_sr_normal_add_req(),
+ * while the cleanup work executes. The dummy
+ * nodes is removed, in next round of cleanup
+ * work execution.
*/
- for (; i <= RCU_NEXT_TAIL; i++) {
- rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL];
- rdp->nxtcompleted[i] = c;
+ llist_for_each_safe(rcu, next, head) {
+ if (!rcu_sr_is_wait_head(rcu)) {
+ rcu_sr_normal_complete(rcu);
+ continue;
+ }
+
+ rcu_sr_put_wait_head(rcu);
}
- /* Record any needed additional grace periods. */
- ret = rcu_start_future_gp(rnp, rdp, NULL);
- /* Trace depending on how much we were able to accelerate. */
- if (!*rdp->nxttail[RCU_WAIT_TAIL])
- trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB"));
- else
- trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB"));
- return ret;
+ /* Order list manipulations with atomic access. */
+ atomic_dec_return_release(&rcu_state.srs_cleanups_pending);
}
/*
- * Move any callbacks whose grace period has completed to the
- * RCU_DONE_TAIL sublist, then compact the remaining sublists and
- * assign ->completed numbers to any callbacks in the RCU_NEXT_TAIL
- * sublist. This function is idempotent, so it does not hurt to
- * invoke it repeatedly. As long as it is not invoked -too- often...
- * Returns true if the RCU grace-period kthread needs to be awakened.
- *
- * The caller must hold rnp->lock with interrupts disabled.
+ * Helper function for rcu_gp_cleanup().
*/
-static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
- struct rcu_data *rdp)
+static void rcu_sr_normal_gp_cleanup(void)
{
- int i, j;
+ struct llist_node *wait_tail, *next = NULL, *rcu = NULL;
+ int done = 0;
- /* If the CPU has no callbacks, nothing to do. */
- if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
- return false;
+ wait_tail = rcu_state.srs_wait_tail;
+ if (wait_tail == NULL)
+ return;
+
+ rcu_state.srs_wait_tail = NULL;
+ ASSERT_EXCLUSIVE_WRITER(rcu_state.srs_wait_tail);
+ WARN_ON_ONCE(!rcu_sr_is_wait_head(wait_tail));
/*
- * Find all callbacks whose ->completed numbers indicate that they
- * are ready to invoke, and put them into the RCU_DONE_TAIL sublist.
+ * Process (a) and (d) cases. See an illustration.
*/
- for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) {
- if (ULONG_CMP_LT(rnp->completed, rdp->nxtcompleted[i]))
+ llist_for_each_safe(rcu, next, wait_tail->next) {
+ if (rcu_sr_is_wait_head(rcu))
break;
- rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[i];
- }
- /* Clean up any sublist tail pointers that were misordered above. */
- for (j = RCU_WAIT_TAIL; j < i; j++)
- rdp->nxttail[j] = rdp->nxttail[RCU_DONE_TAIL];
- /* Copy down callbacks to fill in empty sublists. */
- for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) {
- if (rdp->nxttail[j] == rdp->nxttail[RCU_NEXT_TAIL])
+ rcu_sr_normal_complete(rcu);
+ // It can be last, update a next on this step.
+ wait_tail->next = next;
+
+ if (++done == SR_MAX_USERS_WAKE_FROM_GP)
break;
- rdp->nxttail[j] = rdp->nxttail[i];
- rdp->nxtcompleted[j] = rdp->nxtcompleted[i];
}
- /* Classify any remaining callbacks. */
- return rcu_accelerate_cbs(rsp, rnp, rdp);
+ /*
+ * Fast path, no more users to process except putting the second last
+ * wait head if no inflight-workers. If there are in-flight workers,
+ * they will remove the last wait head.
+ *
+ * Note that the ACQUIRE orders atomic access with list manipulation.
+ */
+ if (wait_tail->next && wait_tail->next->next == NULL &&
+ rcu_sr_is_wait_head(wait_tail->next) &&
+ !atomic_read_acquire(&rcu_state.srs_cleanups_pending)) {
+ rcu_sr_put_wait_head(wait_tail->next);
+ wait_tail->next = NULL;
+ }
+
+ /* Concurrent sr_normal_gp_cleanup work might observe this update. */
+ ASSERT_EXCLUSIVE_WRITER(rcu_state.srs_done_tail);
+ smp_store_release(&rcu_state.srs_done_tail, wait_tail);
+
+ /*
+ * We schedule a work in order to perform a final processing
+ * of outstanding users(if still left) and releasing wait-heads
+ * added by rcu_sr_normal_gp_init() call.
+ */
+ if (wait_tail->next) {
+ atomic_inc(&rcu_state.srs_cleanups_pending);
+ if (!queue_work(sync_wq, &rcu_state.srs_cleanup_work))
+ atomic_dec(&rcu_state.srs_cleanups_pending);
+ }
}
/*
- * Update CPU-local rcu_data state to record the beginnings and ends of
- * grace periods. The caller must hold the ->lock of the leaf rcu_node
- * structure corresponding to the current CPU, and must have irqs disabled.
- * Returns true if the grace-period kthread needs to be awakened.
+ * Helper function for rcu_gp_init().
*/
-static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
- struct rcu_data *rdp)
+static bool rcu_sr_normal_gp_init(void)
{
- bool ret;
-
- /* Handle the ends of any preceding grace periods first. */
- if (rdp->completed == rnp->completed &&
- !unlikely(ACCESS_ONCE(rdp->gpwrap))) {
-
- /* No grace period end, so just accelerate recent callbacks. */
- ret = rcu_accelerate_cbs(rsp, rnp, rdp);
-
- } else {
+ struct llist_node *first;
+ struct llist_node *wait_head;
+ bool start_new_poll = false;
+
+ first = READ_ONCE(rcu_state.srs_next.first);
+ if (!first || rcu_sr_is_wait_head(first))
+ return start_new_poll;
+
+ wait_head = rcu_sr_get_wait_head();
+ if (!wait_head) {
+ // Kick another GP to retry.
+ start_new_poll = true;
+ return start_new_poll;
+ }
- /* Advance callbacks. */
- ret = rcu_advance_cbs(rsp, rnp, rdp);
+ /* Inject a wait-dummy-node. */
+ llist_add(wait_head, &rcu_state.srs_next);
- /* Remember that we saw this grace-period completion. */
- rdp->completed = rnp->completed;
- trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend"));
- }
+ /*
+ * A waiting list of rcu_synchronize nodes should be empty on
+ * this step, since a GP-kthread, rcu_gp_init() -> gp_cleanup(),
+ * rolls it over. If not, it is a BUG, warn a user.
+ */
+ WARN_ON_ONCE(rcu_state.srs_wait_tail != NULL);
+ rcu_state.srs_wait_tail = wait_head;
+ ASSERT_EXCLUSIVE_WRITER(rcu_state.srs_wait_tail);
- if (rdp->gpnum != rnp->gpnum || unlikely(ACCESS_ONCE(rdp->gpwrap))) {
- /*
- * If the current grace period is waiting for this CPU,
- * set up to detect a quiescent state, otherwise don't
- * go looking for one.
- */
- rdp->gpnum = rnp->gpnum;
- trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart"));
- rdp->passed_quiesce = 0;
- rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
- rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask);
- zero_cpu_stall_ticks(rdp);
- ACCESS_ONCE(rdp->gpwrap) = false;
- }
- return ret;
+ return start_new_poll;
}
-static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
+static void rcu_sr_normal_add_req(struct rcu_synchronize *rs)
{
- unsigned long flags;
- bool needwake;
- struct rcu_node *rnp;
-
- local_irq_save(flags);
- rnp = rdp->mynode;
- if ((rdp->gpnum == ACCESS_ONCE(rnp->gpnum) &&
- rdp->completed == ACCESS_ONCE(rnp->completed) &&
- !unlikely(ACCESS_ONCE(rdp->gpwrap))) || /* w/out lock. */
- !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
- local_irq_restore(flags);
- return;
- }
- smp_mb__after_unlock_lock();
- needwake = __note_gp_changes(rsp, rnp, rdp);
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- if (needwake)
- rcu_gp_kthread_wake(rsp);
+ llist_add((struct llist_node *) &rs->head, &rcu_state.srs_next);
}
/*
- * Initialize a new grace period. Return 0 if no grace period required.
+ * Initialize a new grace period. Return false if no grace period required.
*/
-static int rcu_gp_init(struct rcu_state *rsp)
+static noinline_for_stack bool rcu_gp_init(void)
{
+ unsigned long flags;
unsigned long oldmask;
+ unsigned long mask;
struct rcu_data *rdp;
- struct rcu_node *rnp = rcu_get_root(rsp);
+ struct rcu_node *rnp = rcu_get_root();
+ bool start_new_poll;
+ unsigned long old_gp_seq;
- ACCESS_ONCE(rsp->gp_activity) = jiffies;
- raw_spin_lock_irq(&rnp->lock);
- smp_mb__after_unlock_lock();
- if (!ACCESS_ONCE(rsp->gp_flags)) {
+ WRITE_ONCE(rcu_state.gp_activity, jiffies);
+ raw_spin_lock_irq_rcu_node(rnp);
+ if (!rcu_state.gp_flags) {
/* Spurious wakeup, tell caller to go back to sleep. */
- raw_spin_unlock_irq(&rnp->lock);
- return 0;
+ raw_spin_unlock_irq_rcu_node(rnp);
+ return false;
}
- ACCESS_ONCE(rsp->gp_flags) = 0; /* Clear all flags: New grace period. */
+ WRITE_ONCE(rcu_state.gp_flags, 0); /* Clear all flags: New GP. */
- if (WARN_ON_ONCE(rcu_gp_in_progress(rsp))) {
+ if (WARN_ON_ONCE(rcu_gp_in_progress())) {
/*
* Grace period already in progress, don't start another.
* Not supposed to be able to happen.
*/
- raw_spin_unlock_irq(&rnp->lock);
- return 0;
+ raw_spin_unlock_irq_rcu_node(rnp);
+ return false;
}
/* Advance to a new grace period and initialize state. */
- record_gp_stall_check_time(rsp);
- /* Record GP times before starting GP, hence smp_store_release(). */
- smp_store_release(&rsp->gpnum, rsp->gpnum + 1);
- trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start"));
- raw_spin_unlock_irq(&rnp->lock);
+ record_gp_stall_check_time();
+ /*
+ * A new wait segment must be started before gp_seq advanced, so
+ * that previous gp waiters won't observe the new gp_seq.
+ */
+ start_new_poll = rcu_sr_normal_gp_init();
+ /* Record GP times before starting GP, hence rcu_seq_start(). */
+ old_gp_seq = rcu_state.gp_seq;
+ /*
+ * Critical ordering: rcu_seq_start() must happen BEFORE the CPU hotplug
+ * scan below. Otherwise we risk a race where a newly onlining CPU could
+ * be missed by the current grace period, potentially leading to
+ * use-after-free errors. For a detailed explanation of this race, see
+ * Documentation/RCU/Design/Requirements/Requirements.rst in the
+ * "Hotplug CPU" section.
+ *
+ * Also note that the root rnp's gp_seq is kept separate from, and lags,
+ * the rcu_state's gp_seq, for a reason. See the Quick-Quiz on
+ * Single-node systems for more details (in Data-Structures.rst).
+ */
+ rcu_seq_start(&rcu_state.gp_seq);
+ /* Ensure that rcu_seq_done_exact() guardband doesn't give false positives. */
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) &&
+ rcu_seq_done_exact(&old_gp_seq, rcu_seq_snap(&rcu_state.gp_seq)));
+
+ ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq);
+ trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start"));
+ rcu_poll_gp_seq_start(&rcu_state.gp_seq_polled_snap);
+ raw_spin_unlock_irq_rcu_node(rnp);
/*
- * Apply per-leaf buffered online and offline operations to the
- * rcu_node tree. Note that this new grace period need not wait
- * for subsequent online CPUs, and that quiescent-state forcing
- * will handle subsequent offline CPUs.
+ * The "start_new_poll" is set to true, only when this GP is not able
+ * to handle anything and there are outstanding users. It happens when
+ * the rcu_sr_normal_gp_init() function was not able to insert a dummy
+ * separator to the llist, because there were no left any dummy-nodes.
+ *
+ * Number of dummy-nodes is fixed, it could be that we are run out of
+ * them, if so we start a new pool request to repeat a try. It is rare
+ * and it means that a system is doing a slow processing of callbacks.
*/
- rcu_for_each_leaf_node(rsp, rnp) {
- raw_spin_lock_irq(&rnp->lock);
- smp_mb__after_unlock_lock();
+ if (start_new_poll)
+ (void) start_poll_synchronize_rcu();
+
+ /*
+ * Apply per-leaf buffered online and offline operations to
+ * the rcu_node tree. Note that this new grace period need not
+ * wait for subsequent online CPUs, and that RCU hooks in the CPU
+ * offlining path, when combined with checks in this function,
+ * will handle CPUs that are currently going offline or that will
+ * go offline later. Please also refer to "Hotplug CPU" section
+ * of RCU's Requirements documentation.
+ */
+ WRITE_ONCE(rcu_state.gp_state, RCU_GP_ONOFF);
+ /* Exclude CPU hotplug operations. */
+ rcu_for_each_leaf_node(rnp) {
+ local_irq_disable();
+ /*
+ * Serialize with CPU offline. See Requirements.rst > Hotplug CPU >
+ * Concurrent Quiescent State Reporting for Offline CPUs.
+ */
+ arch_spin_lock(&rcu_state.ofl_lock);
+ raw_spin_lock_rcu_node(rnp);
if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
!rnp->wait_blkd_tasks) {
/* Nothing to do on this leaf rcu_node structure. */
- raw_spin_unlock_irq(&rnp->lock);
+ raw_spin_unlock_rcu_node(rnp);
+ arch_spin_unlock(&rcu_state.ofl_lock);
+ local_irq_enable();
continue;
}
@@ -1788,12 +1906,14 @@ static int rcu_gp_init(struct rcu_state *rsp)
/* If zero-ness of ->qsmaskinit changed, propagate up tree. */
if (!oldmask != !rnp->qsmaskinit) {
- if (!oldmask) /* First online CPU for this rcu_node. */
- rcu_init_new_rnp(rnp);
- else if (rcu_preempt_has_tasks(rnp)) /* blocked tasks */
- rnp->wait_blkd_tasks = true;
- else /* Last offline CPU and can propagate. */
+ if (!oldmask) { /* First online CPU for rcu_node. */
+ if (!rnp->wait_blkd_tasks) /* Ever offline? */
+ rcu_init_new_rnp(rnp);
+ } else if (rcu_preempt_has_tasks(rnp)) {
+ rnp->wait_blkd_tasks = true; /* blocked tasks */
+ } else { /* Last offline CPU and can propagate. */
rcu_cleanup_dead_rnp(rnp);
+ }
}
/*
@@ -1802,113 +1922,236 @@ static int rcu_gp_init(struct rcu_state *rsp)
* still offline, propagate up the rcu_node tree and
* clear ->wait_blkd_tasks. Otherwise, if one of this
* rcu_node structure's CPUs has since come back online,
- * simply clear ->wait_blkd_tasks (but rcu_cleanup_dead_rnp()
- * checks for this, so just call it unconditionally).
+ * simply clear ->wait_blkd_tasks.
*/
if (rnp->wait_blkd_tasks &&
- (!rcu_preempt_has_tasks(rnp) ||
- rnp->qsmaskinit)) {
+ (!rcu_preempt_has_tasks(rnp) || rnp->qsmaskinit)) {
rnp->wait_blkd_tasks = false;
- rcu_cleanup_dead_rnp(rnp);
+ if (!rnp->qsmaskinit)
+ rcu_cleanup_dead_rnp(rnp);
}
- raw_spin_unlock_irq(&rnp->lock);
+ raw_spin_unlock_rcu_node(rnp);
+ arch_spin_unlock(&rcu_state.ofl_lock);
+ local_irq_enable();
}
+ rcu_gp_slow(gp_preinit_delay); /* Races with CPU hotplug. */
/*
* Set the quiescent-state-needed bits in all the rcu_node
- * structures for all currently online CPUs in breadth-first order,
- * starting from the root rcu_node structure, relying on the layout
- * of the tree within the rsp->node[] array. Note that other CPUs
- * will access only the leaves of the hierarchy, thus seeing that no
- * grace period is in progress, at least until the corresponding
- * leaf node has been initialized. In addition, we have excluded
- * CPU-hotplug operations.
+ * structures for all currently online CPUs in breadth-first
+ * order, starting from the root rcu_node structure, relying on the
+ * layout of the tree within the rcu_state.node[] array. Note that
+ * other CPUs will access only the leaves of the hierarchy, thus
+ * seeing that no grace period is in progress, at least until the
+ * corresponding leaf node has been initialized.
*
* The grace period cannot complete until the initialization
* process finishes, because this kthread handles both.
*/
- rcu_for_each_node_breadth_first(rsp, rnp) {
- raw_spin_lock_irq(&rnp->lock);
- smp_mb__after_unlock_lock();
- rdp = this_cpu_ptr(rsp->rda);
+ WRITE_ONCE(rcu_state.gp_state, RCU_GP_INIT);
+ rcu_for_each_node_breadth_first(rnp) {
+ rcu_gp_slow(gp_init_delay);
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ rdp = this_cpu_ptr(&rcu_data);
rcu_preempt_check_blocked_tasks(rnp);
rnp->qsmask = rnp->qsmaskinit;
- ACCESS_ONCE(rnp->gpnum) = rsp->gpnum;
- if (WARN_ON_ONCE(rnp->completed != rsp->completed))
- ACCESS_ONCE(rnp->completed) = rsp->completed;
+ WRITE_ONCE(rnp->gp_seq, rcu_state.gp_seq);
if (rnp == rdp->mynode)
- (void)__note_gp_changes(rsp, rnp, rdp);
+ (void)__note_gp_changes(rnp, rdp);
rcu_preempt_boost_start_gp(rnp);
- trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
+ trace_rcu_grace_period_init(rcu_state.name, rnp->gp_seq,
rnp->level, rnp->grplo,
rnp->grphi, rnp->qsmask);
- raw_spin_unlock_irq(&rnp->lock);
- cond_resched_rcu_qs();
- ACCESS_ONCE(rsp->gp_activity) = jiffies;
- if (gp_init_delay > 0 &&
- !(rsp->gpnum % (rcu_num_nodes * PER_RCU_NODE_PERIOD)))
- schedule_timeout_uninterruptible(gp_init_delay);
+ /*
+ * Quiescent states for tasks on any now-offline CPUs. Since we
+ * released the ofl and rnp lock before this loop, CPUs might
+ * have gone offline and we have to report QS on their behalf.
+ * See Requirements.rst > Hotplug CPU > Concurrent QS Reporting.
+ */
+ mask = rnp->qsmask & ~rnp->qsmaskinitnext;
+ rnp->rcu_gp_init_mask = mask;
+ if ((mask || rnp->wait_blkd_tasks) && rcu_is_leaf_node(rnp))
+ rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
+ else
+ raw_spin_unlock_irq_rcu_node(rnp);
+ cond_resched_tasks_rcu_qs();
+ WRITE_ONCE(rcu_state.gp_activity, jiffies);
}
- return 1;
+ // If strict, make all CPUs aware of new grace period.
+ if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD))
+ on_each_cpu(rcu_strict_gp_boundary, NULL, 0);
+
+ return true;
+}
+
+/*
+ * Helper function for swait_event_idle_exclusive() wakeup at force-quiescent-state
+ * time.
+ */
+static bool rcu_gp_fqs_check_wake(int *gfp)
+{
+ struct rcu_node *rnp = rcu_get_root();
+
+ // If under overload conditions, force an immediate FQS scan.
+ if (*gfp & RCU_GP_FLAG_OVLD)
+ return true;
+
+ // Someone like call_rcu() requested a force-quiescent-state scan.
+ *gfp = READ_ONCE(rcu_state.gp_flags);
+ if (*gfp & RCU_GP_FLAG_FQS)
+ return true;
+
+ // The current grace period has completed.
+ if (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp))
+ return true;
+
+ return false;
}
/*
* Do one round of quiescent-state forcing.
*/
-static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
+static void rcu_gp_fqs(bool first_time)
{
- int fqs_state = fqs_state_in;
- bool isidle = false;
- unsigned long maxj;
- struct rcu_node *rnp = rcu_get_root(rsp);
+ int nr_fqs = READ_ONCE(rcu_state.nr_fqs_jiffies_stall);
+ struct rcu_node *rnp = rcu_get_root();
+
+ WRITE_ONCE(rcu_state.gp_activity, jiffies);
+ WRITE_ONCE(rcu_state.n_force_qs, rcu_state.n_force_qs + 1);
+
+ WARN_ON_ONCE(nr_fqs > 3);
+ /* Only countdown nr_fqs for stall purposes if jiffies moves. */
+ if (nr_fqs) {
+ if (nr_fqs == 1) {
+ WRITE_ONCE(rcu_state.jiffies_stall,
+ jiffies + rcu_jiffies_till_stall_check());
+ }
+ WRITE_ONCE(rcu_state.nr_fqs_jiffies_stall, --nr_fqs);
+ }
- ACCESS_ONCE(rsp->gp_activity) = jiffies;
- rsp->n_force_qs++;
- if (fqs_state == RCU_SAVE_DYNTICK) {
+ if (first_time) {
/* Collect dyntick-idle snapshots. */
- if (is_sysidle_rcu_state(rsp)) {
- isidle = true;
- maxj = jiffies - ULONG_MAX / 4;
- }
- force_qs_rnp(rsp, dyntick_save_progress_counter,
- &isidle, &maxj);
- rcu_sysidle_report_gp(rsp, isidle, maxj);
- fqs_state = RCU_FORCE_QS;
+ force_qs_rnp(rcu_watching_snap_save);
} else {
/* Handle dyntick-idle and offline CPUs. */
- isidle = true;
- force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj);
+ force_qs_rnp(rcu_watching_snap_recheck);
}
/* Clear flag to prevent immediate re-entry. */
- if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
- raw_spin_lock_irq(&rnp->lock);
- smp_mb__after_unlock_lock();
- ACCESS_ONCE(rsp->gp_flags) =
- ACCESS_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS;
- raw_spin_unlock_irq(&rnp->lock);
+ if (READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_FQS) {
+ raw_spin_lock_irq_rcu_node(rnp);
+ WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags & ~RCU_GP_FLAG_FQS);
+ raw_spin_unlock_irq_rcu_node(rnp);
+ }
+}
+
+/*
+ * Loop doing repeated quiescent-state forcing until the grace period ends.
+ */
+static noinline_for_stack void rcu_gp_fqs_loop(void)
+{
+ bool first_gp_fqs = true;
+ int gf = 0;
+ unsigned long j;
+ int ret;
+ struct rcu_node *rnp = rcu_get_root();
+
+ j = READ_ONCE(jiffies_till_first_fqs);
+ if (rcu_state.cbovld)
+ gf = RCU_GP_FLAG_OVLD;
+ ret = 0;
+ for (;;) {
+ if (rcu_state.cbovld) {
+ j = (j + 2) / 3;
+ if (j <= 0)
+ j = 1;
+ }
+ if (!ret || time_before(jiffies + j, rcu_state.jiffies_force_qs)) {
+ WRITE_ONCE(rcu_state.jiffies_force_qs, jiffies + j);
+ /*
+ * jiffies_force_qs before RCU_GP_WAIT_FQS state
+ * update; required for stall checks.
+ */
+ smp_wmb();
+ WRITE_ONCE(rcu_state.jiffies_kick_kthreads,
+ jiffies + (j ? 3 * j : 2));
+ }
+ trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
+ TPS("fqswait"));
+ WRITE_ONCE(rcu_state.gp_state, RCU_GP_WAIT_FQS);
+ (void)swait_event_idle_timeout_exclusive(rcu_state.gp_wq,
+ rcu_gp_fqs_check_wake(&gf), j);
+ rcu_gp_torture_wait();
+ WRITE_ONCE(rcu_state.gp_state, RCU_GP_DOING_FQS);
+ /* Locking provides needed memory barriers. */
+ /*
+ * Exit the loop if the root rcu_node structure indicates that the grace period
+ * has ended, leave the loop. The rcu_preempt_blocked_readers_cgp(rnp) check
+ * is required only for single-node rcu_node trees because readers blocking
+ * the current grace period are queued only on leaf rcu_node structures.
+ * For multi-node trees, checking the root node's ->qsmask suffices, because a
+ * given root node's ->qsmask bit is cleared only when all CPUs and tasks from
+ * the corresponding leaf nodes have passed through their quiescent state.
+ */
+ if (!READ_ONCE(rnp->qsmask) &&
+ !rcu_preempt_blocked_readers_cgp(rnp))
+ break;
+ /* If time for quiescent-state forcing, do it. */
+ if (!time_after(rcu_state.jiffies_force_qs, jiffies) ||
+ (gf & (RCU_GP_FLAG_FQS | RCU_GP_FLAG_OVLD))) {
+ trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
+ TPS("fqsstart"));
+ rcu_gp_fqs(first_gp_fqs);
+ gf = 0;
+ if (first_gp_fqs) {
+ first_gp_fqs = false;
+ gf = rcu_state.cbovld ? RCU_GP_FLAG_OVLD : 0;
+ }
+ trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
+ TPS("fqsend"));
+ cond_resched_tasks_rcu_qs();
+ WRITE_ONCE(rcu_state.gp_activity, jiffies);
+ ret = 0; /* Force full wait till next FQS. */
+ j = READ_ONCE(jiffies_till_next_fqs);
+ } else {
+ /* Deal with stray signal. */
+ cond_resched_tasks_rcu_qs();
+ WRITE_ONCE(rcu_state.gp_activity, jiffies);
+ WARN_ON(signal_pending(current));
+ trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
+ TPS("fqswaitsig"));
+ ret = 1; /* Keep old FQS timing. */
+ j = jiffies;
+ if (time_after(jiffies, rcu_state.jiffies_force_qs))
+ j = 1;
+ else
+ j = rcu_state.jiffies_force_qs - j;
+ gf = 0;
+ }
}
- return fqs_state;
}
/*
* Clean up after the old grace period.
*/
-static void rcu_gp_cleanup(struct rcu_state *rsp)
+static noinline void rcu_gp_cleanup(void)
{
- unsigned long gp_duration;
+ int cpu;
bool needgp = false;
- int nocb = 0;
+ unsigned long gp_duration;
+ unsigned long new_gp_seq;
+ bool offloaded;
struct rcu_data *rdp;
- struct rcu_node *rnp = rcu_get_root(rsp);
+ struct rcu_node *rnp = rcu_get_root();
+ struct swait_queue_head *sq;
- ACCESS_ONCE(rsp->gp_activity) = jiffies;
- raw_spin_lock_irq(&rnp->lock);
- smp_mb__after_unlock_lock();
- gp_duration = jiffies - rsp->gp_start;
- if (gp_duration > rsp->gp_max)
- rsp->gp_max = gp_duration;
+ WRITE_ONCE(rcu_state.gp_activity, jiffies);
+ raw_spin_lock_irq_rcu_node(rnp);
+ rcu_state.gp_end = jiffies;
+ gp_duration = rcu_state.gp_end - rcu_state.gp_start;
+ if (gp_duration > rcu_state.gp_max)
+ rcu_state.gp_max = gp_duration;
/*
* We know the grace period is complete, but to everyone else
@@ -1918,227 +2161,153 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
* safe for us to drop the lock in order to mark the grace
* period as completed in all of the rcu_node structures.
*/
- raw_spin_unlock_irq(&rnp->lock);
+ rcu_poll_gp_seq_end(&rcu_state.gp_seq_polled_snap);
+ raw_spin_unlock_irq_rcu_node(rnp);
/*
- * Propagate new ->completed value to rcu_node structures so
- * that other CPUs don't have to wait until the start of the next
- * grace period to process their callbacks. This also avoids
- * some nasty RCU grace-period initialization races by forcing
- * the end of the current grace period to be completely recorded in
- * all of the rcu_node structures before the beginning of the next
- * grace period is recorded in any of the rcu_node structures.
+ * Propagate new ->gp_seq value to rcu_node structures so that
+ * other CPUs don't have to wait until the start of the next grace
+ * period to process their callbacks. This also avoids some nasty
+ * RCU grace-period initialization races by forcing the end of
+ * the current grace period to be completely recorded in all of
+ * the rcu_node structures before the beginning of the next grace
+ * period is recorded in any of the rcu_node structures.
*/
- rcu_for_each_node_breadth_first(rsp, rnp) {
- raw_spin_lock_irq(&rnp->lock);
- smp_mb__after_unlock_lock();
- WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
+ new_gp_seq = rcu_state.gp_seq;
+ rcu_seq_end(&new_gp_seq);
+ rcu_for_each_node_breadth_first(rnp) {
+ raw_spin_lock_irq_rcu_node(rnp);
+ if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)))
+ dump_blkd_tasks(rnp, 10);
WARN_ON_ONCE(rnp->qsmask);
- ACCESS_ONCE(rnp->completed) = rsp->gpnum;
- rdp = this_cpu_ptr(rsp->rda);
+ WRITE_ONCE(rnp->gp_seq, new_gp_seq);
+ if (!rnp->parent)
+ smp_mb(); // Order against failing poll_state_synchronize_rcu_full().
+ rdp = this_cpu_ptr(&rcu_data);
if (rnp == rdp->mynode)
- needgp = __note_gp_changes(rsp, rnp, rdp) || needgp;
+ needgp = __note_gp_changes(rnp, rdp) || needgp;
/* smp_mb() provided by prior unlock-lock pair. */
- nocb += rcu_future_gp_cleanup(rsp, rnp);
- raw_spin_unlock_irq(&rnp->lock);
- cond_resched_rcu_qs();
- ACCESS_ONCE(rsp->gp_activity) = jiffies;
+ needgp = rcu_future_gp_cleanup(rnp) || needgp;
+ // Reset overload indication for CPUs no longer overloaded
+ if (rcu_is_leaf_node(rnp))
+ for_each_leaf_node_cpu_mask(rnp, cpu, rnp->cbovldmask) {
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+ check_cb_ovld_locked(rdp, rnp);
+ }
+ sq = rcu_nocb_gp_get(rnp);
+ raw_spin_unlock_irq_rcu_node(rnp);
+ rcu_nocb_gp_cleanup(sq);
+ cond_resched_tasks_rcu_qs();
+ WRITE_ONCE(rcu_state.gp_activity, jiffies);
+ rcu_gp_slow(gp_cleanup_delay);
+ }
+ rnp = rcu_get_root();
+ raw_spin_lock_irq_rcu_node(rnp); /* GP before ->gp_seq update. */
+
+ /* Declare grace period done, trace first to use old GP number. */
+ trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("end"));
+ rcu_seq_end(&rcu_state.gp_seq);
+ ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq);
+ WRITE_ONCE(rcu_state.gp_state, RCU_GP_IDLE);
+ /* Check for GP requests since above loop. */
+ rdp = this_cpu_ptr(&rcu_data);
+ if (!needgp && ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed)) {
+ trace_rcu_this_gp(rnp, rdp, rnp->gp_seq_needed,
+ TPS("CleanupMore"));
+ needgp = true;
}
- rnp = rcu_get_root(rsp);
- raw_spin_lock_irq(&rnp->lock);
- smp_mb__after_unlock_lock(); /* Order GP before ->completed update. */
- rcu_nocb_gp_set(rnp, nocb);
-
- /* Declare grace period done. */
- ACCESS_ONCE(rsp->completed) = rsp->gpnum;
- trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end"));
- rsp->fqs_state = RCU_GP_IDLE;
- rdp = this_cpu_ptr(rsp->rda);
/* Advance CBs to reduce false positives below. */
- needgp = rcu_advance_cbs(rsp, rnp, rdp) || needgp;
- if (needgp || cpu_needs_another_gp(rsp, rdp)) {
- ACCESS_ONCE(rsp->gp_flags) = RCU_GP_FLAG_INIT;
- trace_rcu_grace_period(rsp->name,
- ACCESS_ONCE(rsp->gpnum),
- TPS("newreq"));
+ offloaded = rcu_rdp_is_offloaded(rdp);
+ if ((offloaded || !rcu_accelerate_cbs(rnp, rdp)) && needgp) {
+
+ // We get here if a grace period was needed (“needgp”)
+ // and the above call to rcu_accelerate_cbs() did not set
+ // the RCU_GP_FLAG_INIT bit in ->gp_state (which records
+ // the need for another grace period).  The purpose
+ // of the “offloaded” check is to avoid invoking
+ // rcu_accelerate_cbs() on an offloaded CPU because we do not
+ // hold the ->nocb_lock needed to safely access an offloaded
+ // ->cblist.  We do not want to acquire that lock because
+ // it can be heavily contended during callback floods.
+
+ WRITE_ONCE(rcu_state.gp_flags, RCU_GP_FLAG_INIT);
+ WRITE_ONCE(rcu_state.gp_req_activity, jiffies);
+ trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("newreq"));
+ } else {
+
+ // We get here either if there is no need for an
+ // additional grace period or if rcu_accelerate_cbs() has
+ // already set the RCU_GP_FLAG_INIT bit in ->gp_flags. 
+ // So all we need to do is to clear all of the other
+ // ->gp_flags bits.
+
+ WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags & RCU_GP_FLAG_INIT);
}
- raw_spin_unlock_irq(&rnp->lock);
+ raw_spin_unlock_irq_rcu_node(rnp);
+
+ // Make synchronize_rcu() users aware of the end of old grace period.
+ rcu_sr_normal_gp_cleanup();
+
+ // If strict, make all CPUs aware of the end of the old grace period.
+ if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD))
+ on_each_cpu(rcu_strict_gp_boundary, NULL, 0);
}
/*
* Body of kthread that handles grace periods.
*/
-static int __noreturn rcu_gp_kthread(void *arg)
+static int __noreturn rcu_gp_kthread(void *unused)
{
- int fqs_state;
- int gf;
- unsigned long j;
- int ret;
- struct rcu_state *rsp = arg;
- struct rcu_node *rnp = rcu_get_root(rsp);
-
rcu_bind_gp_kthread();
for (;;) {
/* Handle grace-period start. */
for (;;) {
- trace_rcu_grace_period(rsp->name,
- ACCESS_ONCE(rsp->gpnum),
+ trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
TPS("reqwait"));
- rsp->gp_state = RCU_GP_WAIT_GPS;
- wait_event_interruptible(rsp->gp_wq,
- ACCESS_ONCE(rsp->gp_flags) &
- RCU_GP_FLAG_INIT);
+ WRITE_ONCE(rcu_state.gp_state, RCU_GP_WAIT_GPS);
+ swait_event_idle_exclusive(rcu_state.gp_wq,
+ READ_ONCE(rcu_state.gp_flags) &
+ RCU_GP_FLAG_INIT);
+ rcu_gp_torture_wait();
+ WRITE_ONCE(rcu_state.gp_state, RCU_GP_DONE_GPS);
/* Locking provides needed memory barrier. */
- if (rcu_gp_init(rsp))
+ if (rcu_gp_init())
break;
- cond_resched_rcu_qs();
- ACCESS_ONCE(rsp->gp_activity) = jiffies;
+ cond_resched_tasks_rcu_qs();
+ WRITE_ONCE(rcu_state.gp_activity, jiffies);
WARN_ON(signal_pending(current));
- trace_rcu_grace_period(rsp->name,
- ACCESS_ONCE(rsp->gpnum),
+ trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
TPS("reqwaitsig"));
}
/* Handle quiescent-state forcing. */
- fqs_state = RCU_SAVE_DYNTICK;
- j = jiffies_till_first_fqs;
- if (j > HZ) {
- j = HZ;
- jiffies_till_first_fqs = HZ;
- }
- ret = 0;
- for (;;) {
- if (!ret)
- rsp->jiffies_force_qs = jiffies + j;
- trace_rcu_grace_period(rsp->name,
- ACCESS_ONCE(rsp->gpnum),
- TPS("fqswait"));
- rsp->gp_state = RCU_GP_WAIT_FQS;
- ret = wait_event_interruptible_timeout(rsp->gp_wq,
- ((gf = ACCESS_ONCE(rsp->gp_flags)) &
- RCU_GP_FLAG_FQS) ||
- (!ACCESS_ONCE(rnp->qsmask) &&
- !rcu_preempt_blocked_readers_cgp(rnp)),
- j);
- /* Locking provides needed memory barriers. */
- /* If grace period done, leave loop. */
- if (!ACCESS_ONCE(rnp->qsmask) &&
- !rcu_preempt_blocked_readers_cgp(rnp))
- break;
- /* If time for quiescent-state forcing, do it. */
- if (ULONG_CMP_GE(jiffies, rsp->jiffies_force_qs) ||
- (gf & RCU_GP_FLAG_FQS)) {
- trace_rcu_grace_period(rsp->name,
- ACCESS_ONCE(rsp->gpnum),
- TPS("fqsstart"));
- fqs_state = rcu_gp_fqs(rsp, fqs_state);
- trace_rcu_grace_period(rsp->name,
- ACCESS_ONCE(rsp->gpnum),
- TPS("fqsend"));
- cond_resched_rcu_qs();
- ACCESS_ONCE(rsp->gp_activity) = jiffies;
- } else {
- /* Deal with stray signal. */
- cond_resched_rcu_qs();
- ACCESS_ONCE(rsp->gp_activity) = jiffies;
- WARN_ON(signal_pending(current));
- trace_rcu_grace_period(rsp->name,
- ACCESS_ONCE(rsp->gpnum),
- TPS("fqswaitsig"));
- }
- j = jiffies_till_next_fqs;
- if (j > HZ) {
- j = HZ;
- jiffies_till_next_fqs = HZ;
- } else if (j < 1) {
- j = 1;
- jiffies_till_next_fqs = 1;
- }
- }
+ rcu_gp_fqs_loop();
/* Handle grace-period end. */
- rcu_gp_cleanup(rsp);
+ WRITE_ONCE(rcu_state.gp_state, RCU_GP_CLEANUP);
+ rcu_gp_cleanup();
+ WRITE_ONCE(rcu_state.gp_state, RCU_GP_CLEANED);
}
}
/*
- * Start a new RCU grace period if warranted, re-initializing the hierarchy
- * in preparation for detecting the next grace period. The caller must hold
- * the root node's ->lock and hard irqs must be disabled.
- *
- * Note that it is legal for a dying CPU (which is marked as offline) to
- * invoke this function. This can happen when the dying CPU reports its
- * quiescent state.
- *
- * Returns true if the grace-period kthread must be awakened.
+ * Report a full set of quiescent states to the rcu_state data structure.
+ * Invoke rcu_gp_kthread_wake() to awaken the grace-period kthread if
+ * another grace period is required. Whether we wake the grace-period
+ * kthread or it awakens itself for the next round of quiescent-state
+ * forcing, that kthread will clean up after the just-completed grace
+ * period. Note that the caller must hold rnp->lock, which is released
+ * before return.
*/
-static bool
-rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
- struct rcu_data *rdp)
+static void rcu_report_qs_rsp(unsigned long flags)
+ __releases(rcu_get_root()->lock)
{
- if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) {
- /*
- * Either we have not yet spawned the grace-period
- * task, this CPU does not need another grace period,
- * or a grace period is already in progress.
- * Either way, don't start a new grace period.
- */
- return false;
- }
- ACCESS_ONCE(rsp->gp_flags) = RCU_GP_FLAG_INIT;
- trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum),
- TPS("newreq"));
-
- /*
- * We can't do wakeups while holding the rnp->lock, as that
- * could cause possible deadlocks with the rq->lock. Defer
- * the wakeup to our caller.
- */
- return true;
-}
-
-/*
- * Similar to rcu_start_gp_advanced(), but also advance the calling CPU's
- * callbacks. Note that rcu_start_gp_advanced() cannot do this because it
- * is invoked indirectly from rcu_advance_cbs(), which would result in
- * endless recursion -- or would do so if it wasn't for the self-deadlock
- * that is encountered beforehand.
- *
- * Returns true if the grace-period kthread needs to be awakened.
- */
-static bool rcu_start_gp(struct rcu_state *rsp)
-{
- struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
- struct rcu_node *rnp = rcu_get_root(rsp);
- bool ret = false;
-
- /*
- * If there is no grace period in progress right now, any
- * callbacks we have up to this point will be satisfied by the
- * next grace period. Also, advancing the callbacks reduces the
- * probability of false positives from cpu_needs_another_gp()
- * resulting in pointless grace periods. So, advance callbacks
- * then start the grace period!
- */
- ret = rcu_advance_cbs(rsp, rnp, rdp) || ret;
- ret = rcu_start_gp_advanced(rsp, rnp, rdp) || ret;
- return ret;
-}
-
-/*
- * Report a full set of quiescent states to the specified rcu_state
- * data structure. This involves cleaning up after the prior grace
- * period and letting rcu_start_gp() start up the next grace period
- * if one is needed. Note that the caller must hold rnp->lock, which
- * is released before return.
- */
-static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
- __releases(rcu_get_root(rsp)->lock)
-{
- WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
- raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
- rcu_gp_kthread_wake(rsp);
+ raw_lockdep_assert_held_rcu_node(rcu_get_root());
+ WARN_ON_ONCE(!rcu_gp_in_progress());
+ WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags | RCU_GP_FLAG_FQS);
+ raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(), flags);
+ rcu_gp_kthread_wake();
}
/*
@@ -2148,40 +2317,48 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
* must be represented by the same rcu_node structure (which need not be a
* leaf rcu_node structure, though it often will be). The gps parameter
* is the grace-period snapshot, which means that the quiescent states
- * are valid only if rnp->gpnum is equal to gps. That structure's lock
+ * are valid only if rnp->gp_seq is equal to gps. That structure's lock
* must be held upon entry, and it is released before return.
+ *
+ * As a special case, if mask is zero, the bit-already-cleared check is
+ * disabled. This allows propagating quiescent state due to resumed tasks
+ * during grace-period initialization.
*/
-static void
-rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
- struct rcu_node *rnp, unsigned long gps, unsigned long flags)
+static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp,
+ unsigned long gps, unsigned long flags)
__releases(rnp->lock)
{
unsigned long oldmask = 0;
struct rcu_node *rnp_c;
+ raw_lockdep_assert_held_rcu_node(rnp);
+
/* Walk up the rcu_node hierarchy. */
for (;;) {
- if (!(rnp->qsmask & mask) || rnp->gpnum != gps) {
+ if ((!(rnp->qsmask & mask) && mask) || rnp->gp_seq != gps) {
/*
* Our bit has already been cleared, or the
* relevant grace period is already over, so done.
*/
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return;
}
WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */
- rnp->qsmask &= ~mask;
- trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum,
+ WARN_ON_ONCE(!rcu_is_leaf_node(rnp) &&
+ rcu_preempt_blocked_readers_cgp(rnp));
+ WRITE_ONCE(rnp->qsmask, rnp->qsmask & ~mask);
+ trace_rcu_quiescent_state_report(rcu_state.name, rnp->gp_seq,
mask, rnp->qsmask, rnp->level,
rnp->grplo, rnp->grphi,
!!rnp->gp_tasks);
if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
/* Other bits still set at this level, so done. */
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return;
}
+ rnp->completedqs = rnp->gp_seq;
mask = rnp->grpmask;
if (rnp->parent == NULL) {
@@ -2189,12 +2366,11 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
break;
}
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
rnp_c = rnp;
rnp = rnp->parent;
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock();
- oldmask = rnp_c->qsmask;
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ oldmask = READ_ONCE(rnp_c->qsmask);
}
/*
@@ -2202,72 +2378,66 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
* state for this grace period. Invoke rcu_report_qs_rsp()
* to clean up and start the next grace period if one is needed.
*/
- rcu_report_qs_rsp(rsp, flags); /* releases rnp->lock. */
+ rcu_report_qs_rsp(flags); /* releases rnp->lock. */
}
/*
* Record a quiescent state for all tasks that were previously queued
* on the specified rcu_node structure and that were blocking the current
- * RCU grace period. The caller must hold the specified rnp->lock with
+ * RCU grace period. The caller must hold the corresponding rnp->lock with
* irqs disabled, and this lock is released upon return, but irqs remain
* disabled.
*/
-static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp,
- struct rcu_node *rnp, unsigned long flags)
+static void __maybe_unused
+rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
__releases(rnp->lock)
{
unsigned long gps;
unsigned long mask;
struct rcu_node *rnp_p;
- if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p ||
- rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ raw_lockdep_assert_held_rcu_node(rnp);
+ if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT_RCU)) ||
+ WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)) ||
+ rnp->qsmask != 0) {
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return; /* Still need more quiescent states! */
}
+ rnp->completedqs = rnp->gp_seq;
rnp_p = rnp->parent;
if (rnp_p == NULL) {
/*
* Only one rcu_node structure in the tree, so don't
* try to report up to its nonexistent parent!
*/
- rcu_report_qs_rsp(rsp, flags);
+ rcu_report_qs_rsp(flags);
return;
}
- /* Report up the rest of the hierarchy, tracking current ->gpnum. */
- gps = rnp->gpnum;
+ /* Report up the rest of the hierarchy, tracking current ->gp_seq. */
+ gps = rnp->gp_seq;
mask = rnp->grpmask;
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
- raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */
- smp_mb__after_unlock_lock();
- rcu_report_qs_rnp(mask, rsp, rnp_p, gps, flags);
+ raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
+ raw_spin_lock_rcu_node(rnp_p); /* irqs already disabled. */
+ rcu_report_qs_rnp(mask, rnp_p, gps, flags);
}
/*
* Record a quiescent state for the specified CPU to that CPU's rcu_data
- * structure. This must be either called from the specified CPU, or
- * called when the specified CPU is known to be offline (and when it is
- * also known that no other CPU is concurrently trying to help the offline
- * CPU). The lastcomp argument is used to make sure we are still in the
- * grace period of interest. We don't want to end the current grace period
- * based on quiescent states detected in an earlier grace period!
+ * structure. This must be called from the specified CPU.
*/
static void
-rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
+rcu_report_qs_rdp(struct rcu_data *rdp)
{
unsigned long flags;
unsigned long mask;
- bool needwake;
struct rcu_node *rnp;
+ WARN_ON_ONCE(rdp->cpu != smp_processor_id());
rnp = rdp->mynode;
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock();
- if ((rdp->passed_quiesce == 0 &&
- rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) ||
- rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum ||
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ if (rdp->cpu_no_qs.b.norm || rdp->gp_seq != rnp->gp_seq ||
rdp->gpwrap) {
/*
@@ -2276,27 +2446,33 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
* We will instead need a new quiescent state that lies
* within the current grace period.
*/
- rdp->passed_quiesce = 0; /* need qs for new gp. */
- rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ rdp->cpu_no_qs.b.norm = true; /* need qs for new gp. */
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return;
}
mask = rdp->grpmask;
+ rdp->core_needs_qs = false;
if ((rnp->qsmask & mask) == 0) {
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
} else {
- rdp->qs_pending = 0;
-
/*
* This GP can't end until cpu checks in, so all of our
* callbacks can be processed during the next GP.
+ *
+ * NOCB kthreads have their own way to deal with that...
*/
- needwake = rcu_accelerate_cbs(rsp, rnp, rdp);
+ if (!rcu_rdp_is_offloaded(rdp)) {
+ /*
+ * The current GP has not yet ended, so it
+ * should not be possible for rcu_accelerate_cbs()
+ * to return true. So complain, but don't awaken.
+ */
+ WARN_ON_ONCE(rcu_accelerate_cbs(rnp, rdp));
+ }
- rcu_report_qs_rnp(mask, rsp, rnp, rnp->gpnum, flags);
+ rcu_disable_urgency_upon_qs(rdp);
+ rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
/* ^^^ Released rnp->lock */
- if (needwake)
- rcu_gp_kthread_wake(rsp);
}
}
@@ -2307,437 +2483,257 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
* quiescent state for this grace period, and record that fact if so.
*/
static void
-rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
+rcu_check_quiescent_state(struct rcu_data *rdp)
{
/* Check for grace-period ends and beginnings. */
- note_gp_changes(rsp, rdp);
+ note_gp_changes(rdp);
/*
* Does this CPU still need to do its part for current grace period?
* If no, return and let the other CPUs do their part as well.
*/
- if (!rdp->qs_pending)
+ if (!rdp->core_needs_qs)
return;
/*
* Was there a quiescent state since the beginning of the grace
* period? If no, then exit and wait for the next call.
*/
- if (!rdp->passed_quiesce &&
- rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr))
+ if (rdp->cpu_no_qs.b.norm)
return;
/*
* Tell RCU we are done (but rcu_report_qs_rdp() will be the
* judge of that).
*/
- rcu_report_qs_rdp(rdp->cpu, rsp, rdp);
+ rcu_report_qs_rdp(rdp);
}
-#ifdef CONFIG_HOTPLUG_CPU
-
-/*
- * Send the specified CPU's RCU callbacks to the orphanage. The
- * specified CPU must be offline, and the caller must hold the
- * ->orphan_lock.
- */
-static void
-rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
- struct rcu_node *rnp, struct rcu_data *rdp)
+/* Return true if callback-invocation time limit exceeded. */
+static bool rcu_do_batch_check_time(long count, long tlimit,
+ bool jlimit_check, unsigned long jlimit)
{
- /* No-CBs CPUs do not have orphanable callbacks. */
- if (rcu_is_nocb_cpu(rdp->cpu))
- return;
-
- /*
- * Orphan the callbacks. First adjust the counts. This is safe
- * because _rcu_barrier() excludes CPU-hotplug operations, so it
- * cannot be running now. Thus no memory barrier is required.
- */
- if (rdp->nxtlist != NULL) {
- rsp->qlen_lazy += rdp->qlen_lazy;
- rsp->qlen += rdp->qlen;
- rdp->n_cbs_orphaned += rdp->qlen;
- rdp->qlen_lazy = 0;
- ACCESS_ONCE(rdp->qlen) = 0;
- }
-
- /*
- * Next, move those callbacks still needing a grace period to
- * the orphanage, where some other CPU will pick them up.
- * Some of the callbacks might have gone partway through a grace
- * period, but that is too bad. They get to start over because we
- * cannot assume that grace periods are synchronized across CPUs.
- * We don't bother updating the ->nxttail[] array yet, instead
- * we just reset the whole thing later on.
- */
- if (*rdp->nxttail[RCU_DONE_TAIL] != NULL) {
- *rsp->orphan_nxttail = *rdp->nxttail[RCU_DONE_TAIL];
- rsp->orphan_nxttail = rdp->nxttail[RCU_NEXT_TAIL];
- *rdp->nxttail[RCU_DONE_TAIL] = NULL;
- }
-
- /*
- * Then move the ready-to-invoke callbacks to the orphanage,
- * where some other CPU will pick them up. These will not be
- * required to pass though another grace period: They are done.
- */
- if (rdp->nxtlist != NULL) {
- *rsp->orphan_donetail = rdp->nxtlist;
- rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL];
- }
-
- /*
- * Finally, initialize the rcu_data structure's list to empty and
- * disallow further callbacks on this CPU.
- */
- init_callback_list(rdp);
- rdp->nxttail[RCU_NEXT_TAIL] = NULL;
+ // Invoke local_clock() only once per 32 consecutive callbacks.
+ return unlikely(tlimit) &&
+ (!likely(count & 31) ||
+ (IS_ENABLED(CONFIG_RCU_DOUBLE_CHECK_CB_TIME) &&
+ jlimit_check && time_after(jiffies, jlimit))) &&
+ local_clock() >= tlimit;
}
/*
- * Adopt the RCU callbacks from the specified rcu_state structure's
- * orphanage. The caller must hold the ->orphan_lock.
- */
-static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
-{
- int i;
- struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
-
- /* No-CBs CPUs are handled specially. */
- if (rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags))
- return;
-
- /* Do the accounting first. */
- rdp->qlen_lazy += rsp->qlen_lazy;
- rdp->qlen += rsp->qlen;
- rdp->n_cbs_adopted += rsp->qlen;
- if (rsp->qlen_lazy != rsp->qlen)
- rcu_idle_count_callbacks_posted();
- rsp->qlen_lazy = 0;
- rsp->qlen = 0;
-
- /*
- * We do not need a memory barrier here because the only way we
- * can get here if there is an rcu_barrier() in flight is if
- * we are the task doing the rcu_barrier().
- */
-
- /* First adopt the ready-to-invoke callbacks. */
- if (rsp->orphan_donelist != NULL) {
- *rsp->orphan_donetail = *rdp->nxttail[RCU_DONE_TAIL];
- *rdp->nxttail[RCU_DONE_TAIL] = rsp->orphan_donelist;
- for (i = RCU_NEXT_SIZE - 1; i >= RCU_DONE_TAIL; i--)
- if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL])
- rdp->nxttail[i] = rsp->orphan_donetail;
- rsp->orphan_donelist = NULL;
- rsp->orphan_donetail = &rsp->orphan_donelist;
- }
-
- /* And then adopt the callbacks that still need a grace period. */
- if (rsp->orphan_nxtlist != NULL) {
- *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxtlist;
- rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxttail;
- rsp->orphan_nxtlist = NULL;
- rsp->orphan_nxttail = &rsp->orphan_nxtlist;
- }
-}
-
-/*
- * Trace the fact that this CPU is going offline.
- */
-static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
-{
- RCU_TRACE(unsigned long mask);
- RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda));
- RCU_TRACE(struct rcu_node *rnp = rdp->mynode);
-
- RCU_TRACE(mask = rdp->grpmask);
- trace_rcu_grace_period(rsp->name,
- rnp->gpnum + 1 - !!(rnp->qsmask & mask),
- TPS("cpuofl"));
-}
-
-/*
- * All CPUs for the specified rcu_node structure have gone offline,
- * and all tasks that were preempted within an RCU read-side critical
- * section while running on one of those CPUs have since exited their RCU
- * read-side critical section. Some other CPU is reporting this fact with
- * the specified rcu_node structure's ->lock held and interrupts disabled.
- * This function therefore goes up the tree of rcu_node structures,
- * clearing the corresponding bits in the ->qsmaskinit fields. Note that
- * the leaf rcu_node structure's ->qsmaskinit field has already been
- * updated
- *
- * This function does check that the specified rcu_node structure has
- * all CPUs offline and no blocked tasks, so it is OK to invoke it
- * prematurely. That said, invoking it after the fact will cost you
- * a needless lock acquisition. So once it has done its work, don't
- * invoke it again.
- */
-static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
-{
- long mask;
- struct rcu_node *rnp = rnp_leaf;
-
- if (rnp->qsmaskinit || rcu_preempt_has_tasks(rnp))
- return;
- for (;;) {
- mask = rnp->grpmask;
- rnp = rnp->parent;
- if (!rnp)
- break;
- raw_spin_lock(&rnp->lock); /* irqs already disabled. */
- smp_mb__after_unlock_lock(); /* GP memory ordering. */
- rnp->qsmaskinit &= ~mask;
- rnp->qsmask &= ~mask;
- if (rnp->qsmaskinit) {
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
- return;
- }
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
- }
-}
-
-/*
- * The CPU is exiting the idle loop into the arch_cpu_idle_dead()
- * function. We now remove it from the rcu_node tree's ->qsmaskinit
- * bit masks.
- */
-static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
-{
- unsigned long flags;
- unsigned long mask;
- struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
- struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
-
- /* Remove outgoing CPU from mask in the leaf rcu_node structure. */
- mask = rdp->grpmask;
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */
- rnp->qsmaskinitnext &= ~mask;
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
-}
-
-/*
- * The CPU has been completely removed, and some other CPU is reporting
- * this fact from process context. Do the remainder of the cleanup,
- * including orphaning the outgoing CPU's RCU callbacks, and also
- * adopting them. There can only be one CPU hotplug operation at a time,
- * so no other CPU can be attempting to update rcu_cpu_kthread_task.
- */
-static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
-{
- unsigned long flags;
- struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
- struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
-
- /* Adjust any no-longer-needed kthreads. */
- rcu_boost_kthread_setaffinity(rnp, -1);
-
- /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
- raw_spin_lock_irqsave(&rsp->orphan_lock, flags);
- rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp);
- rcu_adopt_orphan_cbs(rsp, flags);
- raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags);
-
- WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL,
- "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n",
- cpu, rdp->qlen, rdp->nxtlist);
-}
-
-#else /* #ifdef CONFIG_HOTPLUG_CPU */
-
-static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
-{
-}
-
-static void __maybe_unused rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
-{
-}
-
-static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
-{
-}
-
-static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
-{
-}
-
-#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
-
-/*
* Invoke any RCU callbacks that have made it to the end of their grace
- * period. Thottle as specified by rdp->blimit.
+ * period. Throttle as specified by rdp->blimit.
*/
-static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
+static void rcu_do_batch(struct rcu_data *rdp)
{
+ long bl;
+ long count = 0;
+ int div;
+ bool __maybe_unused empty;
unsigned long flags;
- struct rcu_head *next, *list, **tail;
- long bl, count, count_lazy;
- int i;
+ unsigned long jlimit;
+ bool jlimit_check = false;
+ long pending;
+ struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl);
+ struct rcu_head *rhp;
+ long tlimit = 0;
/* If no callbacks are ready, just return. */
- if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
- trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0);
- trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist),
+ if (!rcu_segcblist_ready_cbs(&rdp->cblist)) {
+ trace_rcu_batch_start(rcu_state.name,
+ rcu_segcblist_n_cbs(&rdp->cblist), 0);
+ trace_rcu_batch_end(rcu_state.name, 0,
+ !rcu_segcblist_empty(&rdp->cblist),
need_resched(), is_idle_task(current),
- rcu_is_callbacks_kthread());
+ rcu_is_callbacks_kthread(rdp));
return;
}
/*
- * Extract the list of ready callbacks, disabling to prevent
- * races with call_rcu() from interrupt handlers.
+ * Extract the list of ready callbacks, disabling IRQs to prevent
+ * races with call_rcu() from interrupt handlers. Leave the
+ * callback counts, as rcu_barrier() needs to be conservative.
+ *
+ * Callbacks execution is fully ordered against preceding grace period
+ * completion (materialized by rnp->gp_seq update) thanks to the
+ * smp_mb__after_unlock_lock() upon node locking required for callbacks
+ * advancing. In NOCB mode this ordering is then further relayed through
+ * the nocb locking that protects both callbacks advancing and extraction.
*/
- local_irq_save(flags);
+ rcu_nocb_lock_irqsave(rdp, flags);
WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
- bl = rdp->blimit;
- trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, bl);
- list = rdp->nxtlist;
- rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
- *rdp->nxttail[RCU_DONE_TAIL] = NULL;
- tail = rdp->nxttail[RCU_DONE_TAIL];
- for (i = RCU_NEXT_SIZE - 1; i >= 0; i--)
- if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL])
- rdp->nxttail[i] = &rdp->nxtlist;
- local_irq_restore(flags);
+ pending = rcu_segcblist_get_seglen(&rdp->cblist, RCU_DONE_TAIL);
+ div = READ_ONCE(rcu_divisor);
+ div = div < 0 ? 7 : div > sizeof(long) * 8 - 2 ? sizeof(long) * 8 - 2 : div;
+ bl = max(rdp->blimit, pending >> div);
+ if ((in_serving_softirq() || rdp->rcu_cpu_kthread_status == RCU_KTHREAD_RUNNING) &&
+ (IS_ENABLED(CONFIG_RCU_DOUBLE_CHECK_CB_TIME) || unlikely(bl > 100))) {
+ const long npj = NSEC_PER_SEC / HZ;
+ long rrn = READ_ONCE(rcu_resched_ns);
+
+ rrn = rrn < NSEC_PER_MSEC ? NSEC_PER_MSEC : rrn > NSEC_PER_SEC ? NSEC_PER_SEC : rrn;
+ tlimit = local_clock() + rrn;
+ jlimit = jiffies + (rrn + npj + 1) / npj;
+ jlimit_check = true;
+ }
+ trace_rcu_batch_start(rcu_state.name,
+ rcu_segcblist_n_cbs(&rdp->cblist), bl);
+ rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl);
+ if (rcu_rdp_is_offloaded(rdp))
+ rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist);
+
+ trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbDequeued"));
+ rcu_nocb_unlock_irqrestore(rdp, flags);
/* Invoke callbacks. */
- count = count_lazy = 0;
- while (list) {
- next = list->next;
- prefetch(next);
- debug_rcu_head_unqueue(list);
- if (__rcu_reclaim(rsp->name, list))
- count_lazy++;
- list = next;
- /* Stop only if limit reached and CPU has something to do. */
- if (++count >= bl &&
- (need_resched() ||
- (!is_idle_task(current) && !rcu_is_callbacks_kthread())))
- break;
- }
+ tick_dep_set_task(current, TICK_DEP_BIT_RCU);
+ rhp = rcu_cblist_dequeue(&rcl);
- local_irq_save(flags);
- trace_rcu_batch_end(rsp->name, count, !!list, need_resched(),
- is_idle_task(current),
- rcu_is_callbacks_kthread());
-
- /* Update count, and requeue any remaining callbacks. */
- if (list != NULL) {
- *tail = rdp->nxtlist;
- rdp->nxtlist = list;
- for (i = 0; i < RCU_NEXT_SIZE; i++)
- if (&rdp->nxtlist == rdp->nxttail[i])
- rdp->nxttail[i] = tail;
- else
+ for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) {
+ rcu_callback_t f;
+
+ count++;
+ debug_rcu_head_unqueue(rhp);
+
+ rcu_lock_acquire(&rcu_callback_map);
+ trace_rcu_invoke_callback(rcu_state.name, rhp);
+
+ f = rhp->func;
+ debug_rcu_head_callback(rhp);
+ WRITE_ONCE(rhp->func, (rcu_callback_t)0L);
+ f(rhp);
+
+ rcu_lock_release(&rcu_callback_map);
+
+ /*
+ * Stop only if limit reached and CPU has something to do.
+ */
+ if (in_serving_softirq()) {
+ if (count >= bl && (need_resched() || !is_idle_task(current)))
break;
+ /*
+ * Make sure we don't spend too much time here and deprive other
+ * softirq vectors of CPU cycles.
+ */
+ if (rcu_do_batch_check_time(count, tlimit, jlimit_check, jlimit))
+ break;
+ } else {
+ // In rcuc/rcuoc context, so no worries about
+ // depriving other softirq vectors of CPU cycles.
+ local_bh_enable();
+ lockdep_assert_irqs_enabled();
+ cond_resched_tasks_rcu_qs();
+ lockdep_assert_irqs_enabled();
+ local_bh_disable();
+ // But rcuc kthreads can delay quiescent-state
+ // reporting, so check time limits for them.
+ if (rdp->rcu_cpu_kthread_status == RCU_KTHREAD_RUNNING &&
+ rcu_do_batch_check_time(count, tlimit, jlimit_check, jlimit)) {
+ rdp->rcu_cpu_has_work = 1;
+ break;
+ }
+ }
}
- smp_mb(); /* List handling before counting for rcu_barrier(). */
- rdp->qlen_lazy -= count_lazy;
- ACCESS_ONCE(rdp->qlen) = rdp->qlen - count;
+
+ rcu_nocb_lock_irqsave(rdp, flags);
rdp->n_cbs_invoked += count;
+ trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(),
+ is_idle_task(current), rcu_is_callbacks_kthread(rdp));
+
+ /* Update counts and requeue any remaining callbacks. */
+ rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl);
+ rcu_segcblist_add_len(&rdp->cblist, -count);
/* Reinstate batch limit if we have worked down the excess. */
- if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark)
+ count = rcu_segcblist_n_cbs(&rdp->cblist);
+ if (rdp->blimit >= DEFAULT_MAX_RCU_BLIMIT && count <= qlowmark)
rdp->blimit = blimit;
/* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */
- if (rdp->qlen == 0 && rdp->qlen_last_fqs_check != 0) {
+ if (count == 0 && rdp->qlen_last_fqs_check != 0) {
rdp->qlen_last_fqs_check = 0;
- rdp->n_force_qs_snap = rsp->n_force_qs;
- } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark)
- rdp->qlen_last_fqs_check = rdp->qlen;
- WARN_ON_ONCE((rdp->nxtlist == NULL) != (rdp->qlen == 0));
+ rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs);
+ } else if (count < rdp->qlen_last_fqs_check - qhimark)
+ rdp->qlen_last_fqs_check = count;
- local_irq_restore(flags);
+ /*
+ * The following usually indicates a double call_rcu(). To track
+ * this down, try building with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y.
+ */
+ empty = rcu_segcblist_empty(&rdp->cblist);
+ WARN_ON_ONCE(count == 0 && !empty);
+ WARN_ON_ONCE(!IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
+ count != 0 && empty);
+ WARN_ON_ONCE(count == 0 && rcu_segcblist_n_segment_cbs(&rdp->cblist) != 0);
+ WARN_ON_ONCE(!empty && rcu_segcblist_n_segment_cbs(&rdp->cblist) == 0);
- /* Re-invoke RCU core processing if there are callbacks remaining. */
- if (cpu_has_callbacks_ready_to_invoke(rdp))
- invoke_rcu_core();
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+
+ tick_dep_clear_task(current, TICK_DEP_BIT_RCU);
}
/*
- * Check to see if this CPU is in a non-context-switch quiescent state
- * (user mode or idle loop for rcu, non-softirq execution for rcu_bh).
- * Also schedule RCU core processing.
- *
- * This function must be called from hardirq context. It is normally
- * invoked from the scheduling-clock interrupt. If rcu_pending returns
- * false, there is no point in invoking rcu_check_callbacks().
+ * This function is invoked from each scheduling-clock interrupt,
+ * and checks to see if this CPU is in a non-context-switch quiescent
+ * state, for example, user mode or idle loop. It also schedules RCU
+ * core processing. If the current grace period has gone on too long,
+ * it will ask the scheduler to manufacture a context switch for the sole
+ * purpose of providing the needed quiescent state.
*/
-void rcu_check_callbacks(int user)
+void rcu_sched_clock_irq(int user)
{
- trace_rcu_utilization(TPS("Start scheduler-tick"));
- increment_cpu_stall_ticks();
- if (user || rcu_is_cpu_rrupt_from_idle()) {
-
- /*
- * Get here if this CPU took its interrupt from user
- * mode or from the idle loop, and if this is not a
- * nested interrupt. In this case, the CPU is in
- * a quiescent state, so note it.
- *
- * No memory barrier is required here because both
- * rcu_sched_qs() and rcu_bh_qs() reference only CPU-local
- * variables that other CPUs neither access nor modify,
- * at least not while the corresponding CPU is online.
- */
-
- rcu_sched_qs();
- rcu_bh_qs();
-
- } else if (!in_softirq()) {
-
- /*
- * Get here if this CPU did not take its interrupt from
- * softirq, in other words, if it is not interrupting
- * a rcu_bh read-side critical section. This is an _bh
- * critical section, so note it.
- */
+ unsigned long j;
- rcu_bh_qs();
+ if (IS_ENABLED(CONFIG_PROVE_RCU)) {
+ j = jiffies;
+ WARN_ON_ONCE(time_before(j, __this_cpu_read(rcu_data.last_sched_clock)));
+ __this_cpu_write(rcu_data.last_sched_clock, j);
}
- rcu_preempt_check_callbacks();
- if (rcu_pending())
+ trace_rcu_utilization(TPS("Start scheduler-tick"));
+ lockdep_assert_irqs_disabled();
+ raw_cpu_inc(rcu_data.ticks_this_gp);
+ /* The load-acquire pairs with the store-release setting to true. */
+ if (smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) {
+ /* Idle and userspace execution already are quiescent states. */
+ if (!rcu_is_cpu_rrupt_from_idle() && !user)
+ set_need_resched_current();
+ __this_cpu_write(rcu_data.rcu_urgent_qs, false);
+ }
+ rcu_flavor_sched_clock_irq(user);
+ if (rcu_pending(user))
invoke_rcu_core();
- if (user)
+ if (user || rcu_is_cpu_rrupt_from_idle())
rcu_note_voluntary_context_switch(current);
+ lockdep_assert_irqs_disabled();
+
trace_rcu_utilization(TPS("End scheduler-tick"));
}
/*
- * Scan the leaf rcu_node structures, processing dyntick state for any that
- * have not yet encountered a quiescent state, using the function specified.
- * Also initiate boosting for any threads blocked on the root rcu_node.
- *
- * The caller must have suppressed start of new grace periods.
+ * Scan the leaf rcu_node structures. For each structure on which all
+ * CPUs have reported a quiescent state and on which there are tasks
+ * blocking the current grace period, initiate RCU priority boosting.
+ * Otherwise, invoke the specified function to check dyntick state for
+ * each CPU that has not yet reported a quiescent state.
*/
-static void force_qs_rnp(struct rcu_state *rsp,
- int (*f)(struct rcu_data *rsp, bool *isidle,
- unsigned long *maxj),
- bool *isidle, unsigned long *maxj)
+static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
{
- unsigned long bit;
int cpu;
unsigned long flags;
- unsigned long mask;
struct rcu_node *rnp;
- rcu_for_each_leaf_node(rsp, rnp) {
- cond_resched_rcu_qs();
- mask = 0;
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock();
- if (!rcu_gp_in_progress(rsp)) {
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- return;
- }
+ rcu_state.cbovld = rcu_state.cbovldnext;
+ rcu_state.cbovldnext = false;
+ rcu_for_each_leaf_node(rnp) {
+ unsigned long mask = 0;
+ unsigned long rsmask = 0;
+
+ cond_resched_tasks_rcu_qs();
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ rcu_state.cbovldnext |= !!rnp->cbovldmask;
if (rnp->qsmask == 0) {
- if (rcu_state_p == &rcu_sched_state ||
- rsp != rcu_state_p ||
- rcu_preempt_blocked_readers_cgp(rnp)) {
+ if (rcu_preempt_blocked_readers_cgp(rnp)) {
/*
* No point in scanning bits because they
* are all zero. But we might need to
@@ -2747,35 +2743,32 @@ static void force_qs_rnp(struct rcu_state *rsp,
/* rcu_initiate_boost() releases rnp->lock */
continue;
}
- if (rnp->parent &&
- (rnp->parent->qsmask & rnp->grpmask)) {
- /*
- * Race between grace-period
- * initialization and task exiting RCU
- * read-side critical section: Report.
- */
- rcu_report_unblock_qs_rnp(rsp, rnp, flags);
- /* rcu_report_unblock_qs_rnp() rlses ->lock */
- continue;
- }
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ continue;
}
- cpu = rnp->grplo;
- bit = 1;
- for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
- if ((rnp->qsmask & bit) != 0) {
- if ((rnp->qsmaskinit & bit) == 0)
- *isidle = false; /* Pending hotplug. */
- if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj))
- mask |= bit;
+ for_each_leaf_node_cpu_mask(rnp, cpu, rnp->qsmask) {
+ struct rcu_data *rdp;
+ int ret;
+
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+ ret = f(rdp);
+ if (ret > 0) {
+ mask |= rdp->grpmask;
+ rcu_disable_urgency_upon_qs(rdp);
}
+ if (ret < 0)
+ rsmask |= rdp->grpmask;
}
if (mask != 0) {
- /* Idle/offline CPUs, report (releases rnp->lock. */
- rcu_report_qs_rnp(mask, rsp, rnp, rnp->gpnum, flags);
+ /* Idle/offline CPUs, report (releases rnp->lock). */
+ rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
} else {
/* Nothing to do here, so just drop the lock. */
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
+
+ for_each_leaf_node_cpu_mask(rnp, cpu, rsmask)
+ resched_cpu(cpu);
}
}
@@ -2783,127 +2776,228 @@ static void force_qs_rnp(struct rcu_state *rsp,
* Force quiescent states on reluctant CPUs, and also detect which
* CPUs are in dyntick-idle mode.
*/
-static void force_quiescent_state(struct rcu_state *rsp)
+void rcu_force_quiescent_state(void)
{
unsigned long flags;
bool ret;
struct rcu_node *rnp;
struct rcu_node *rnp_old = NULL;
+ if (!rcu_gp_in_progress())
+ return;
/* Funnel through hierarchy to reduce memory contention. */
- rnp = __this_cpu_read(rsp->rda->mynode);
+ rnp = raw_cpu_read(rcu_data.mynode);
for (; rnp != NULL; rnp = rnp->parent) {
- ret = (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) ||
- !raw_spin_trylock(&rnp->fqslock);
+ ret = (READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_FQS) ||
+ !raw_spin_trylock(&rnp->fqslock);
if (rnp_old != NULL)
raw_spin_unlock(&rnp_old->fqslock);
- if (ret) {
- rsp->n_force_qs_lh++;
+ if (ret)
return;
- }
rnp_old = rnp;
}
- /* rnp_old == rcu_get_root(rsp), rnp == NULL. */
+ /* rnp_old == rcu_get_root(), rnp == NULL. */
/* Reached the root of the rcu_node tree, acquire lock. */
- raw_spin_lock_irqsave(&rnp_old->lock, flags);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irqsave_rcu_node(rnp_old, flags);
raw_spin_unlock(&rnp_old->fqslock);
- if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
- rsp->n_force_qs_lh++;
- raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
+ if (READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_FQS) {
+ raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags);
return; /* Someone beat us to it. */
}
- ACCESS_ONCE(rsp->gp_flags) =
- ACCESS_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS;
- raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
- rcu_gp_kthread_wake(rsp);
+ WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags | RCU_GP_FLAG_FQS);
+ raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags);
+ rcu_gp_kthread_wake();
+}
+EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
+
+// Workqueue handler for an RCU reader for kernels enforcing struct RCU
+// grace periods.
+static void strict_work_handler(struct work_struct *work)
+{
+ rcu_read_lock();
+ rcu_read_unlock();
}
-/*
- * This does the RCU core processing work for the specified rcu_state
- * and rcu_data structures. This may be called only from the CPU to
- * whom the rdp belongs.
- */
-static void
-__rcu_process_callbacks(struct rcu_state *rsp)
+/* Perform RCU core processing work for the current CPU. */
+static __latent_entropy void rcu_core(void)
{
- unsigned long flags;
- bool needwake;
- struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
+ struct rcu_data *rdp = raw_cpu_ptr(&rcu_data);
+ struct rcu_node *rnp = rdp->mynode;
- WARN_ON_ONCE(rdp->beenonline == 0);
+ if (cpu_is_offline(smp_processor_id()))
+ return;
+ trace_rcu_utilization(TPS("Start RCU core"));
+ WARN_ON_ONCE(!rdp->beenonline);
+
+ /* Report any deferred quiescent states if preemption enabled. */
+ if (IS_ENABLED(CONFIG_PREEMPT_COUNT) && (!(preempt_count() & PREEMPT_MASK))) {
+ rcu_preempt_deferred_qs(current);
+ } else if (rcu_preempt_need_deferred_qs(current)) {
+ guard(irqsave)();
+ set_need_resched_current();
+ }
/* Update RCU state based on any recent quiescent states. */
- rcu_check_quiescent_state(rsp, rdp);
-
- /* Does this CPU require a not-yet-started grace period? */
- local_irq_save(flags);
- if (cpu_needs_another_gp(rsp, rdp)) {
- raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */
- needwake = rcu_start_gp(rsp);
- raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
- if (needwake)
- rcu_gp_kthread_wake(rsp);
- } else {
- local_irq_restore(flags);
+ rcu_check_quiescent_state(rdp);
+
+ /* No grace period and unregistered callbacks? */
+ if (!rcu_gp_in_progress() &&
+ rcu_segcblist_is_enabled(&rdp->cblist) && !rcu_rdp_is_offloaded(rdp)) {
+ guard(irqsave)();
+ if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
+ rcu_accelerate_cbs_unlocked(rnp, rdp);
}
+ rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check());
+
/* If there are callbacks ready, invoke them. */
- if (cpu_has_callbacks_ready_to_invoke(rdp))
- invoke_rcu_callbacks(rsp, rdp);
+ if (!rcu_rdp_is_offloaded(rdp) && rcu_segcblist_ready_cbs(&rdp->cblist) &&
+ likely(READ_ONCE(rcu_scheduler_fully_active))) {
+ rcu_do_batch(rdp);
+ /* Re-invoke RCU core processing if there are callbacks remaining. */
+ if (rcu_segcblist_ready_cbs(&rdp->cblist))
+ invoke_rcu_core();
+ }
/* Do any needed deferred wakeups of rcuo kthreads. */
do_nocb_deferred_wakeup(rdp);
+ trace_rcu_utilization(TPS("End RCU core"));
+
+ // If strict GPs, schedule an RCU reader in a clean environment.
+ if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD))
+ queue_work_on(rdp->cpu, rcu_gp_wq, &rdp->strict_work);
+}
+
+static void rcu_core_si(void)
+{
+ rcu_core();
+}
+
+static void rcu_wake_cond(struct task_struct *t, int status)
+{
+ /*
+ * If the thread is yielding, only wake it when this
+ * is invoked from idle
+ */
+ if (t && (status != RCU_KTHREAD_YIELDING || is_idle_task(current)))
+ wake_up_process(t);
+}
+
+static void invoke_rcu_core_kthread(void)
+{
+ struct task_struct *t;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __this_cpu_write(rcu_data.rcu_cpu_has_work, 1);
+ t = __this_cpu_read(rcu_data.rcu_cpu_kthread_task);
+ if (t != NULL && t != current)
+ rcu_wake_cond(t, __this_cpu_read(rcu_data.rcu_cpu_kthread_status));
+ local_irq_restore(flags);
}
/*
- * Do RCU core processing for the current CPU.
+ * Wake up this CPU's rcuc kthread to do RCU core processing.
*/
-static void rcu_process_callbacks(struct softirq_action *unused)
+static void invoke_rcu_core(void)
{
- struct rcu_state *rsp;
-
- if (cpu_is_offline(smp_processor_id()))
+ if (!cpu_online(smp_processor_id()))
return;
- trace_rcu_utilization(TPS("Start RCU core"));
- for_each_rcu_flavor(rsp)
- __rcu_process_callbacks(rsp);
- trace_rcu_utilization(TPS("End RCU core"));
+ if (use_softirq)
+ raise_softirq(RCU_SOFTIRQ);
+ else
+ invoke_rcu_core_kthread();
+}
+
+static void rcu_cpu_kthread_park(unsigned int cpu)
+{
+ per_cpu(rcu_data.rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
+}
+
+static int rcu_cpu_kthread_should_run(unsigned int cpu)
+{
+ return __this_cpu_read(rcu_data.rcu_cpu_has_work);
}
/*
- * Schedule RCU callback invocation. If the specified type of RCU
- * does not support RCU priority boosting, just do a direct call,
- * otherwise wake up the per-CPU kernel kthread. Note that because we
- * are running on the current CPU with softirqs disabled, the
- * rcu_cpu_kthread_task cannot disappear out from under us.
+ * Per-CPU kernel thread that invokes RCU callbacks. This replaces
+ * the RCU softirq used in configurations of RCU that do not support RCU
+ * priority boosting.
*/
-static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
+static void rcu_cpu_kthread(unsigned int cpu)
{
- if (unlikely(!ACCESS_ONCE(rcu_scheduler_fully_active)))
- return;
- if (likely(!rsp->boost)) {
- rcu_do_batch(rsp, rdp);
- return;
+ unsigned int *statusp = this_cpu_ptr(&rcu_data.rcu_cpu_kthread_status);
+ char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work);
+ unsigned long *j = this_cpu_ptr(&rcu_data.rcuc_activity);
+ int spincnt;
+
+ trace_rcu_utilization(TPS("Start CPU kthread@rcu_run"));
+ for (spincnt = 0; spincnt < 10; spincnt++) {
+ WRITE_ONCE(*j, jiffies);
+ local_bh_disable();
+ *statusp = RCU_KTHREAD_RUNNING;
+ local_irq_disable();
+ work = *workp;
+ WRITE_ONCE(*workp, 0);
+ local_irq_enable();
+ if (work)
+ rcu_core();
+ local_bh_enable();
+ if (!READ_ONCE(*workp)) {
+ trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
+ *statusp = RCU_KTHREAD_WAITING;
+ return;
+ }
}
- invoke_rcu_callbacks_kthread();
+ *statusp = RCU_KTHREAD_YIELDING;
+ trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
+ schedule_timeout_idle(2);
+ trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
+ *statusp = RCU_KTHREAD_WAITING;
+ WRITE_ONCE(*j, jiffies);
}
-static void invoke_rcu_core(void)
+static struct smp_hotplug_thread rcu_cpu_thread_spec = {
+ .store = &rcu_data.rcu_cpu_kthread_task,
+ .thread_should_run = rcu_cpu_kthread_should_run,
+ .thread_fn = rcu_cpu_kthread,
+ .thread_comm = "rcuc/%u",
+ .setup = rcu_cpu_kthread_setup,
+ .park = rcu_cpu_kthread_park,
+};
+
+/*
+ * Spawn per-CPU RCU core processing kthreads.
+ */
+static int __init rcu_spawn_core_kthreads(void)
{
- if (cpu_online(smp_processor_id()))
- raise_softirq(RCU_SOFTIRQ);
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ per_cpu(rcu_data.rcu_cpu_has_work, cpu) = 0;
+ if (use_softirq)
+ return 0;
+ WARN_ONCE(smpboot_register_percpu_thread(&rcu_cpu_thread_spec),
+ "%s: Could not start rcuc kthread, OOM is now expected behavior\n", __func__);
+ return 0;
+}
+
+static void rcutree_enqueue(struct rcu_data *rdp, struct rcu_head *head, rcu_callback_t func)
+{
+ rcu_segcblist_enqueue(&rdp->cblist, head);
+ trace_rcu_callback(rcu_state.name, head,
+ rcu_segcblist_n_cbs(&rdp->cblist));
+ trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCBQueued"));
}
/*
* Handle any core-RCU processing required by a call_rcu() invocation.
*/
-static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
- struct rcu_head *head, unsigned long flags)
+static void call_rcu_core(struct rcu_data *rdp, struct rcu_head *head,
+ rcu_callback_t func, unsigned long flags)
{
- bool needwake;
-
+ rcutree_enqueue(rdp, head, func);
/*
* If called from an extended quiescent state, invoke the RCU
* core in order to force a re-evaluation of RCU's idleness.
@@ -2917,34 +3011,28 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
/*
* Force the grace period if too many callbacks or too long waiting.
- * Enforce hysteresis, and don't invoke force_quiescent_state()
+ * Enforce hysteresis, and don't invoke rcu_force_quiescent_state()
* if some other CPU has recently done so. Also, don't bother
- * invoking force_quiescent_state() if the newly enqueued callback
+ * invoking rcu_force_quiescent_state() if the newly enqueued callback
* is the only one waiting for a grace period to complete.
*/
- if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
+ if (unlikely(rcu_segcblist_n_cbs(&rdp->cblist) >
+ rdp->qlen_last_fqs_check + qhimark)) {
/* Are we ignoring a completed grace period? */
- note_gp_changes(rsp, rdp);
+ note_gp_changes(rdp);
/* Start a new grace period if one not already started. */
- if (!rcu_gp_in_progress(rsp)) {
- struct rcu_node *rnp_root = rcu_get_root(rsp);
-
- raw_spin_lock(&rnp_root->lock);
- smp_mb__after_unlock_lock();
- needwake = rcu_start_gp(rsp);
- raw_spin_unlock(&rnp_root->lock);
- if (needwake)
- rcu_gp_kthread_wake(rsp);
+ if (!rcu_gp_in_progress()) {
+ rcu_accelerate_cbs_unlocked(rdp->mynode, rdp);
} else {
/* Give the grace period a kick. */
- rdp->blimit = LONG_MAX;
- if (rsp->n_force_qs == rdp->n_force_qs_snap &&
- *rdp->nxttail[RCU_DONE_TAIL] != head)
- force_quiescent_state(rsp);
- rdp->n_force_qs_snap = rsp->n_force_qs;
- rdp->qlen_last_fqs_check = rdp->qlen;
+ rdp->blimit = DEFAULT_MAX_RCU_BLIMIT;
+ if (READ_ONCE(rcu_state.n_force_qs) == rdp->n_force_qs_snap &&
+ rcu_segcblist_first_pend_cb(&rdp->cblist) != head)
+ rcu_force_quiescent_state();
+ rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs);
+ rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist);
}
}
}
@@ -2957,672 +3045,800 @@ static void rcu_leak_callback(struct rcu_head *rhp)
}
/*
- * Helper function for call_rcu() and friends. The cpu argument will
- * normally be -1, indicating "currently running CPU". It may specify
- * a CPU only if that CPU is a no-CBs CPU. Currently, only _rcu_barrier()
- * is expected to specify a CPU.
+ * Check and if necessary update the leaf rcu_node structure's
+ * ->cbovldmask bit corresponding to the current CPU based on that CPU's
+ * number of queued RCU callbacks. The caller must hold the leaf rcu_node
+ * structure's ->lock.
*/
+static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp)
+{
+ raw_lockdep_assert_held_rcu_node(rnp);
+ if (qovld_calc <= 0)
+ return; // Early boot and wildcard value set.
+ if (rcu_segcblist_n_cbs(&rdp->cblist) >= qovld_calc)
+ WRITE_ONCE(rnp->cbovldmask, rnp->cbovldmask | rdp->grpmask);
+ else
+ WRITE_ONCE(rnp->cbovldmask, rnp->cbovldmask & ~rdp->grpmask);
+}
+
+/*
+ * Check and if necessary update the leaf rcu_node structure's
+ * ->cbovldmask bit corresponding to the current CPU based on that CPU's
+ * number of queued RCU callbacks. No locks need be held, but the
+ * caller must have disabled interrupts.
+ *
+ * Note that this function ignores the possibility that there are a lot
+ * of callbacks all of which have already seen the end of their respective
+ * grace periods. This omission is due to the need for no-CBs CPUs to
+ * be holding ->nocb_lock to do this check, which is too heavy for a
+ * common-case operation.
+ */
+static void check_cb_ovld(struct rcu_data *rdp)
+{
+ struct rcu_node *const rnp = rdp->mynode;
+
+ if (qovld_calc <= 0 ||
+ ((rcu_segcblist_n_cbs(&rdp->cblist) >= qovld_calc) ==
+ !!(READ_ONCE(rnp->cbovldmask) & rdp->grpmask)))
+ return; // Early boot wildcard value or already set correctly.
+ raw_spin_lock_rcu_node(rnp);
+ check_cb_ovld_locked(rdp, rnp);
+ raw_spin_unlock_rcu_node(rnp);
+}
+
static void
-__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
- struct rcu_state *rsp, int cpu, bool lazy)
+__call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in)
{
+ static atomic_t doublefrees;
unsigned long flags;
+ bool lazy;
struct rcu_data *rdp;
- WARN_ON_ONCE((unsigned long)head & 0x1); /* Misaligned rcu_head! */
+ /* Misaligned rcu_head! */
+ WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1));
+
+ /* Avoid NULL dereference if callback is NULL. */
+ if (WARN_ON_ONCE(!func))
+ return;
+
if (debug_rcu_head_queue(head)) {
- /* Probable double call_rcu(), so leak the callback. */
- ACCESS_ONCE(head->func) = rcu_leak_callback;
- WARN_ONCE(1, "__call_rcu(): Leaked duplicate callback\n");
+ /*
+ * Probable double call_rcu(), so leak the callback.
+ * Use rcu:rcu_callback trace event to find the previous
+ * time callback was passed to call_rcu().
+ */
+ if (atomic_inc_return(&doublefrees) < 4) {
+ pr_err("%s(): Double-freed CB %p->%pS()!!! ", __func__, head, head->func);
+ mem_dump_obj(head);
+ }
+ WRITE_ONCE(head->func, rcu_leak_callback);
return;
}
head->func = func;
head->next = NULL;
+ kasan_record_aux_stack(head);
- /*
- * Opportunistically note grace-period endings and beginnings.
- * Note that we might see a beginning right after we see an
- * end, but never vice versa, since this CPU has to pass through
- * a quiescent state betweentimes.
- */
local_irq_save(flags);
- rdp = this_cpu_ptr(rsp->rda);
+ rdp = this_cpu_ptr(&rcu_data);
+ RCU_LOCKDEP_WARN(!rcu_rdp_cpu_online(rdp), "Callback enqueued on offline CPU!");
+
+ lazy = lazy_in && !rcu_async_should_hurry();
/* Add the callback to our list. */
- if (unlikely(rdp->nxttail[RCU_NEXT_TAIL] == NULL) || cpu != -1) {
- int offline;
-
- if (cpu != -1)
- rdp = per_cpu_ptr(rsp->rda, cpu);
- if (likely(rdp->mynode)) {
- /* Post-boot, so this should be for a no-CBs CPU. */
- offline = !__call_rcu_nocb(rdp, head, lazy, flags);
- WARN_ON_ONCE(offline);
- /* Offline CPU, _call_rcu() illegal, leak callback. */
- local_irq_restore(flags);
- return;
- }
- /*
- * Very early boot, before rcu_init(). Initialize if needed
- * and then drop through to queue the callback.
- */
- BUG_ON(cpu != -1);
+ if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist))) {
+ // This can trigger due to call_rcu() from offline CPU:
+ WARN_ON_ONCE(rcu_scheduler_active != RCU_SCHEDULER_INACTIVE);
WARN_ON_ONCE(!rcu_is_watching());
- if (!likely(rdp->nxtlist))
- init_default_callback_list(rdp);
+ // Very early boot, before rcu_init(). Initialize if needed
+ // and then drop through to queue the callback.
+ if (rcu_segcblist_empty(&rdp->cblist))
+ rcu_segcblist_init(&rdp->cblist);
}
- ACCESS_ONCE(rdp->qlen) = rdp->qlen + 1;
- if (lazy)
- rdp->qlen_lazy++;
- else
- rcu_idle_count_callbacks_posted();
- smp_mb(); /* Count before adding callback for rcu_barrier(). */
- *rdp->nxttail[RCU_NEXT_TAIL] = head;
- rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
-
- if (__is_kfree_rcu_offset((unsigned long)func))
- trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func,
- rdp->qlen_lazy, rdp->qlen);
- else
- trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen);
- /* Go handle any RCU core processing required. */
- __call_rcu_core(rsp, rdp, head, flags);
+ check_cb_ovld(rdp);
+
+ if (unlikely(rcu_rdp_is_offloaded(rdp)))
+ call_rcu_nocb(rdp, head, func, flags, lazy);
+ else
+ call_rcu_core(rdp, head, func, flags);
local_irq_restore(flags);
}
-/*
- * Queue an RCU-sched callback for invocation after a grace period.
+#ifdef CONFIG_RCU_LAZY
+static bool enable_rcu_lazy __read_mostly = !IS_ENABLED(CONFIG_RCU_LAZY_DEFAULT_OFF);
+module_param(enable_rcu_lazy, bool, 0444);
+
+/**
+ * call_rcu_hurry() - Queue RCU callback for invocation after grace period, and
+ * flush all lazy callbacks (including the new one) to the main ->cblist while
+ * doing so.
+ *
+ * @head: structure to be used for queueing the RCU updates.
+ * @func: actual callback function to be invoked after the grace period
+ *
+ * The callback function will be invoked some time after a full grace
+ * period elapses, in other words after all pre-existing RCU read-side
+ * critical sections have completed.
+ *
+ * Use this API instead of call_rcu() if you don't want the callback to be
+ * delayed for very long periods of time, which can happen on systems without
+ * memory pressure and on systems which are lightly loaded or mostly idle.
+ * This function will cause callbacks to be invoked sooner than later at the
+ * expense of extra power. Other than that, this function is identical to, and
+ * reuses call_rcu()'s logic. Refer to call_rcu() for more details about memory
+ * ordering and other functionality.
*/
-void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func)
{
- __call_rcu(head, func, &rcu_sched_state, -1, 0);
+ __call_rcu_common(head, func, false);
}
-EXPORT_SYMBOL_GPL(call_rcu_sched);
+EXPORT_SYMBOL_GPL(call_rcu_hurry);
+#else
+#define enable_rcu_lazy false
+#endif
-/*
- * Queue an RCU callback for invocation after a quicker grace period.
+/**
+ * call_rcu() - Queue an RCU callback for invocation after a grace period.
+ * By default the callbacks are 'lazy' and are kept hidden from the main
+ * ->cblist to prevent starting of grace periods too soon.
+ * If you desire grace periods to start very soon, use call_rcu_hurry().
+ *
+ * @head: structure to be used for queueing the RCU updates.
+ * @func: actual callback function to be invoked after the grace period
+ *
+ * The callback function will be invoked some time after a full grace
+ * period elapses, in other words after all pre-existing RCU read-side
+ * critical sections have completed. However, the callback function
+ * might well execute concurrently with RCU read-side critical sections
+ * that started after call_rcu() was invoked.
+ *
+ * It is perfectly legal to repost an RCU callback, potentially with
+ * a different callback function, from within its callback function.
+ * The specified function will be invoked after another full grace period
+ * has elapsed. This use case is similar in form to the common practice
+ * of reposting a timer from within its own handler.
+ *
+ * RCU read-side critical sections are delimited by rcu_read_lock()
+ * and rcu_read_unlock(), and may be nested. In addition, but only in
+ * v5.0 and later, regions of code across which interrupts, preemption,
+ * or softirqs have been disabled also serve as RCU read-side critical
+ * sections. This includes hardware interrupt handlers, softirq handlers,
+ * and NMI handlers.
+ *
+ * Note that all CPUs must agree that the grace period extended beyond
+ * all pre-existing RCU read-side critical section. On systems with more
+ * than one CPU, this means that when "func()" is invoked, each CPU is
+ * guaranteed to have executed a full memory barrier since the end of its
+ * last RCU read-side critical section whose beginning preceded the call
+ * to call_rcu(). It also means that each CPU executing an RCU read-side
+ * critical section that continues beyond the start of "func()" must have
+ * executed a memory barrier after the call_rcu() but before the beginning
+ * of that RCU read-side critical section. Note that these guarantees
+ * include CPUs that are offline, idle, or executing in user mode, as
+ * well as CPUs that are executing in the kernel.
+ *
+ * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
+ * resulting RCU callback function "func()", then both CPU A and CPU B are
+ * guaranteed to execute a full memory barrier during the time interval
+ * between the call to call_rcu() and the invocation of "func()" -- even
+ * if CPU A and CPU B are the same CPU (but again only if the system has
+ * more than one CPU).
+ *
+ * Implementation of these memory-ordering guarantees is described here:
+ * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
+ *
+ * Specific to call_rcu() (as opposed to the other call_rcu*() functions),
+ * in kernels built with CONFIG_RCU_LAZY=y, call_rcu() might delay for many
+ * seconds before starting the grace period needed by the corresponding
+ * callback. This delay can significantly improve energy-efficiency
+ * on low-utilization battery-powered devices. To avoid this delay,
+ * in latency-sensitive kernel code, use call_rcu_hurry().
*/
-void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+void call_rcu(struct rcu_head *head, rcu_callback_t func)
{
- __call_rcu(head, func, &rcu_bh_state, -1, 0);
+ __call_rcu_common(head, func, enable_rcu_lazy);
}
-EXPORT_SYMBOL_GPL(call_rcu_bh);
+EXPORT_SYMBOL_GPL(call_rcu);
/*
- * Queue an RCU callback for lazy invocation after a grace period.
- * This will likely be later named something like "call_rcu_lazy()",
- * but this change will require some way of tagging the lazy RCU
- * callbacks in the list of pending callbacks. Until then, this
- * function may only be called from __kfree_rcu().
+ * During early boot, any blocking grace-period wait automatically
+ * implies a grace period.
+ *
+ * Later on, this could in theory be the case for kernels built with
+ * CONFIG_SMP=y && CONFIG_PREEMPTION=y running on a single CPU, but this
+ * is not a common case. Furthermore, this optimization would cause
+ * the rcu_gp_oldstate structure to expand by 50%, so this potential
+ * grace-period optimization is ignored once the scheduler is running.
*/
-void kfree_call_rcu(struct rcu_head *head,
- void (*func)(struct rcu_head *rcu))
+static int rcu_blocking_is_gp(void)
{
- __call_rcu(head, func, rcu_state_p, -1, 1);
+ if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE) {
+ might_sleep();
+ return false;
+ }
+ return true;
}
-EXPORT_SYMBOL_GPL(kfree_call_rcu);
/*
- * Because a context switch is a grace period for RCU-sched and RCU-bh,
- * any blocking grace-period wait automatically implies a grace period
- * if there is only one CPU online at any point time during execution
- * of either synchronize_sched() or synchronize_rcu_bh(). It is OK to
- * occasionally incorrectly indicate that there are multiple CPUs online
- * when there was in fact only one the whole time, as this just adds
- * some overhead: RCU still operates correctly.
+ * Helper function for the synchronize_rcu() API.
*/
-static inline int rcu_blocking_is_gp(void)
+static void synchronize_rcu_normal(void)
{
- int ret;
+ struct rcu_synchronize rs;
- might_sleep(); /* Check for RCU read-side critical section. */
- preempt_disable();
- ret = num_online_cpus() <= 1;
- preempt_enable();
- return ret;
+ trace_rcu_sr_normal(rcu_state.name, &rs.head, TPS("request"));
+
+ if (READ_ONCE(rcu_normal_wake_from_gp) < 1) {
+ wait_rcu_gp(call_rcu_hurry);
+ goto trace_complete_out;
+ }
+
+ init_rcu_head_on_stack(&rs.head);
+ init_completion(&rs.completion);
+
+ /*
+ * This code might be preempted, therefore take a GP
+ * snapshot before adding a request.
+ */
+ if (IS_ENABLED(CONFIG_PROVE_RCU))
+ get_state_synchronize_rcu_full(&rs.oldstate);
+
+ rcu_sr_normal_add_req(&rs);
+
+ /* Kick a GP and start waiting. */
+ (void) start_poll_synchronize_rcu();
+
+ /* Now we can wait. */
+ wait_for_completion(&rs.completion);
+ destroy_rcu_head_on_stack(&rs.head);
+
+trace_complete_out:
+ trace_rcu_sr_normal(rcu_state.name, &rs.head, TPS("complete"));
}
/**
- * synchronize_sched - wait until an rcu-sched grace period has elapsed.
+ * synchronize_rcu - wait until a grace period has elapsed.
*
- * Control will return to the caller some time after a full rcu-sched
- * grace period has elapsed, in other words after all currently executing
- * rcu-sched read-side critical sections have completed. These read-side
- * critical sections are delimited by rcu_read_lock_sched() and
- * rcu_read_unlock_sched(), and may be nested. Note that preempt_disable(),
- * local_irq_disable(), and so on may be used in place of
- * rcu_read_lock_sched().
+ * Control will return to the caller some time after a full grace
+ * period has elapsed, in other words after all currently executing RCU
+ * read-side critical sections have completed. Note, however, that
+ * upon return from synchronize_rcu(), the caller might well be executing
+ * concurrently with new RCU read-side critical sections that began while
+ * synchronize_rcu() was waiting.
*
- * This means that all preempt_disable code sequences, including NMI and
- * non-threaded hardware-interrupt handlers, in progress on entry will
- * have completed before this primitive returns. However, this does not
- * guarantee that softirq handlers will have completed, since in some
- * kernels, these handlers can run in process context, and can block.
+ * RCU read-side critical sections are delimited by rcu_read_lock()
+ * and rcu_read_unlock(), and may be nested. In addition, but only in
+ * v5.0 and later, regions of code across which interrupts, preemption,
+ * or softirqs have been disabled also serve as RCU read-side critical
+ * sections. This includes hardware interrupt handlers, softirq handlers,
+ * and NMI handlers.
*
* Note that this guarantee implies further memory-ordering guarantees.
- * On systems with more than one CPU, when synchronize_sched() returns,
- * each CPU is guaranteed to have executed a full memory barrier since the
- * end of its last RCU-sched read-side critical section whose beginning
- * preceded the call to synchronize_sched(). In addition, each CPU having
+ * On systems with more than one CPU, when synchronize_rcu() returns,
+ * each CPU is guaranteed to have executed a full memory barrier since
+ * the end of its last RCU read-side critical section whose beginning
+ * preceded the call to synchronize_rcu(). In addition, each CPU having
* an RCU read-side critical section that extends beyond the return from
- * synchronize_sched() is guaranteed to have executed a full memory barrier
- * after the beginning of synchronize_sched() and before the beginning of
+ * synchronize_rcu() is guaranteed to have executed a full memory barrier
+ * after the beginning of synchronize_rcu() and before the beginning of
* that RCU read-side critical section. Note that these guarantees include
* CPUs that are offline, idle, or executing in user mode, as well as CPUs
* that are executing in the kernel.
*
- * Furthermore, if CPU A invoked synchronize_sched(), which returned
+ * Furthermore, if CPU A invoked synchronize_rcu(), which returned
* to its caller on CPU B, then both CPU A and CPU B are guaranteed
* to have executed a full memory barrier during the execution of
- * synchronize_sched() -- even if CPU A and CPU B are the same CPU (but
+ * synchronize_rcu() -- even if CPU A and CPU B are the same CPU (but
* again only if the system has more than one CPU).
*
- * This primitive provides the guarantees made by the (now removed)
- * synchronize_kernel() API. In contrast, synchronize_rcu() only
- * guarantees that rcu_read_lock() sections will have completed.
- * In "classic RCU", these two guarantees happen to be one and
- * the same, but can differ in realtime RCU implementations.
+ * Implementation of these memory-ordering guarantees is described here:
+ * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
*/
-void synchronize_sched(void)
+void synchronize_rcu(void)
{
- rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
- !lock_is_held(&rcu_lock_map) &&
- !lock_is_held(&rcu_sched_lock_map),
- "Illegal synchronize_sched() in RCU-sched read-side critical section");
- if (rcu_blocking_is_gp())
+ unsigned long flags;
+ struct rcu_node *rnp;
+
+ RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
+ lock_is_held(&rcu_lock_map) ||
+ lock_is_held(&rcu_sched_lock_map),
+ "Illegal synchronize_rcu() in RCU read-side critical section");
+ if (!rcu_blocking_is_gp()) {
+ if (rcu_gp_is_expedited())
+ synchronize_rcu_expedited();
+ else
+ synchronize_rcu_normal();
return;
- if (rcu_gp_is_expedited())
- synchronize_sched_expedited();
- else
- wait_rcu_gp(call_rcu_sched);
+ }
+
+ // Context allows vacuous grace periods.
+ // Note well that this code runs with !PREEMPT && !SMP.
+ // In addition, all code that advances grace periods runs at
+ // process level. Therefore, this normal GP overlaps with other
+ // normal GPs only by being fully nested within them, which allows
+ // reuse of ->gp_seq_polled_snap.
+ rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_snap);
+ rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_snap);
+
+ // Update the normal grace-period counters to record
+ // this grace period, but only those used by the boot CPU.
+ // The rcu_scheduler_starting() will take care of the rest of
+ // these counters.
+ local_irq_save(flags);
+ WARN_ON_ONCE(num_online_cpus() > 1);
+ rcu_state.gp_seq += (1 << RCU_SEQ_CTR_SHIFT);
+ for (rnp = this_cpu_ptr(&rcu_data)->mynode; rnp; rnp = rnp->parent)
+ rnp->gp_seq_needed = rnp->gp_seq = rcu_state.gp_seq;
+ local_irq_restore(flags);
}
-EXPORT_SYMBOL_GPL(synchronize_sched);
+EXPORT_SYMBOL_GPL(synchronize_rcu);
/**
- * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
- *
- * Control will return to the caller some time after a full rcu_bh grace
- * period has elapsed, in other words after all currently executing rcu_bh
- * read-side critical sections have completed. RCU read-side critical
- * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(),
- * and may be nested.
+ * get_completed_synchronize_rcu_full - Return a full pre-completed polled state cookie
+ * @rgosp: Place to put state cookie
*
- * See the description of synchronize_sched() for more detailed information
- * on memory ordering guarantees.
+ * Stores into @rgosp a value that will always be treated by functions
+ * like poll_state_synchronize_rcu_full() as a cookie whose grace period
+ * has already completed.
*/
-void synchronize_rcu_bh(void)
+void get_completed_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
{
- rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
- !lock_is_held(&rcu_lock_map) &&
- !lock_is_held(&rcu_sched_lock_map),
- "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section");
- if (rcu_blocking_is_gp())
- return;
- if (rcu_gp_is_expedited())
- synchronize_rcu_bh_expedited();
- else
- wait_rcu_gp(call_rcu_bh);
+ rgosp->rgos_norm = RCU_GET_STATE_COMPLETED;
+ rgosp->rgos_exp = RCU_GET_STATE_COMPLETED;
}
-EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
+EXPORT_SYMBOL_GPL(get_completed_synchronize_rcu_full);
/**
* get_state_synchronize_rcu - Snapshot current RCU state
*
* Returns a cookie that is used by a later call to cond_synchronize_rcu()
- * to determine whether or not a full grace period has elapsed in the
- * meantime.
+ * or poll_state_synchronize_rcu() to determine whether or not a full
+ * grace period has elapsed in the meantime.
*/
unsigned long get_state_synchronize_rcu(void)
{
/*
* Any prior manipulation of RCU-protected data must happen
- * before the load from ->gpnum.
+ * before the load from ->gp_seq.
*/
smp_mb(); /* ^^^ */
-
- /*
- * Make sure this load happens before the purportedly
- * time-consuming work between get_state_synchronize_rcu()
- * and cond_synchronize_rcu().
- */
- return smp_load_acquire(&rcu_state_p->gpnum);
+ return rcu_seq_snap(&rcu_state.gp_seq_polled);
}
EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
/**
- * cond_synchronize_rcu - Conditionally wait for an RCU grace period
+ * get_state_synchronize_rcu_full - Snapshot RCU state, both normal and expedited
+ * @rgosp: location to place combined normal/expedited grace-period state
*
- * @oldstate: return value from earlier call to get_state_synchronize_rcu()
+ * Places the normal and expedited grace-period states in @rgosp. This
+ * state value can be passed to a later call to cond_synchronize_rcu_full()
+ * or poll_state_synchronize_rcu_full() to determine whether or not a
+ * grace period (whether normal or expedited) has elapsed in the meantime.
+ * The rcu_gp_oldstate structure takes up twice the memory of an unsigned
+ * long, but is guaranteed to see all grace periods. In contrast, the
+ * combined state occupies less memory, but can sometimes fail to take
+ * grace periods into account.
*
- * If a full RCU grace period has elapsed since the earlier call to
- * get_state_synchronize_rcu(), just return. Otherwise, invoke
- * synchronize_rcu() to wait for a full grace period.
- *
- * Yes, this function does not take counter wrap into account. But
- * counter wrap is harmless. If the counter wraps, we have waited for
- * more than 2 billion grace periods (and way more on a 64-bit system!),
- * so waiting for one additional grace period should be just fine.
+ * This does not guarantee that the needed grace period will actually
+ * start.
*/
-void cond_synchronize_rcu(unsigned long oldstate)
+void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
{
- unsigned long newstate;
-
/*
- * Ensure that this load happens before any RCU-destructive
- * actions the caller might carry out after we return.
+ * Any prior manipulation of RCU-protected data must happen
+ * before the loads from ->gp_seq and ->expedited_sequence.
*/
- newstate = smp_load_acquire(&rcu_state_p->completed);
- if (ULONG_CMP_GE(oldstate, newstate))
- synchronize_rcu();
+ smp_mb(); /* ^^^ */
+
+ // Yes, rcu_state.gp_seq, not rnp_root->gp_seq, the latter's use
+ // in poll_state_synchronize_rcu_full() notwithstanding. Use of
+ // the latter here would result in too-short grace periods due to
+ // interactions with newly onlined CPUs.
+ rgosp->rgos_norm = rcu_seq_snap(&rcu_state.gp_seq);
+ rgosp->rgos_exp = rcu_seq_snap(&rcu_state.expedited_sequence);
}
-EXPORT_SYMBOL_GPL(cond_synchronize_rcu);
+EXPORT_SYMBOL_GPL(get_state_synchronize_rcu_full);
-static int synchronize_sched_expedited_cpu_stop(void *data)
+/*
+ * Helper function for start_poll_synchronize_rcu() and
+ * start_poll_synchronize_rcu_full().
+ */
+static void start_poll_synchronize_rcu_common(void)
{
- /*
- * There must be a full memory barrier on each affected CPU
- * between the time that try_stop_cpus() is called and the
- * time that it returns.
- *
- * In the current initial implementation of cpu_stop, the
- * above condition is already met when the control reaches
- * this point and the following smp_mb() is not strictly
- * necessary. Do smp_mb() anyway for documentation and
- * robustness against future implementation changes.
- */
- smp_mb(); /* See above comment block. */
- return 0;
+ unsigned long flags;
+ bool needwake;
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+
+ local_irq_save(flags);
+ rdp = this_cpu_ptr(&rcu_data);
+ rnp = rdp->mynode;
+ raw_spin_lock_rcu_node(rnp); // irqs already disabled.
+ // Note it is possible for a grace period to have elapsed between
+ // the above call to get_state_synchronize_rcu() and the below call
+ // to rcu_seq_snap. This is OK, the worst that happens is that we
+ // get a grace period that no one needed. These accesses are ordered
+ // by smp_mb(), and we are accessing them in the opposite order
+ // from which they are updated at grace-period start, as required.
+ needwake = rcu_start_this_gp(rnp, rdp, rcu_seq_snap(&rcu_state.gp_seq));
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ if (needwake)
+ rcu_gp_kthread_wake();
}
/**
- * synchronize_sched_expedited - Brute-force RCU-sched grace period
- *
- * Wait for an RCU-sched grace period to elapse, but use a "big hammer"
- * approach to force the grace period to end quickly. This consumes
- * significant time on all CPUs and is unfriendly to real-time workloads,
- * so is thus not recommended for any sort of common-case code. In fact,
- * if you are using synchronize_sched_expedited() in a loop, please
- * restructure your code to batch your updates, and then use a single
- * synchronize_sched() instead.
+ * start_poll_synchronize_rcu - Snapshot and start RCU grace period
*
- * This implementation can be thought of as an application of ticket
- * locking to RCU, with sync_sched_expedited_started and
- * sync_sched_expedited_done taking on the roles of the halves
- * of the ticket-lock word. Each task atomically increments
- * sync_sched_expedited_started upon entry, snapshotting the old value,
- * then attempts to stop all the CPUs. If this succeeds, then each
- * CPU will have executed a context switch, resulting in an RCU-sched
- * grace period. We are then done, so we use atomic_cmpxchg() to
- * update sync_sched_expedited_done to match our snapshot -- but
- * only if someone else has not already advanced past our snapshot.
- *
- * On the other hand, if try_stop_cpus() fails, we check the value
- * of sync_sched_expedited_done. If it has advanced past our
- * initial snapshot, then someone else must have forced a grace period
- * some time after we took our snapshot. In this case, our work is
- * done for us, and we can simply return. Otherwise, we try again,
- * but keep our initial snapshot for purposes of checking for someone
- * doing our work for us.
- *
- * If we fail too many times in a row, we fall back to synchronize_sched().
+ * Returns a cookie that is used by a later call to cond_synchronize_rcu()
+ * or poll_state_synchronize_rcu() to determine whether or not a full
+ * grace period has elapsed in the meantime. If the needed grace period
+ * is not already slated to start, notifies RCU core of the need for that
+ * grace period.
*/
-void synchronize_sched_expedited(void)
+unsigned long start_poll_synchronize_rcu(void)
{
- cpumask_var_t cm;
- bool cma = false;
- int cpu;
- long firstsnap, s, snap;
- int trycount = 0;
- struct rcu_state *rsp = &rcu_sched_state;
-
- /*
- * If we are in danger of counter wrap, just do synchronize_sched().
- * By allowing sync_sched_expedited_started to advance no more than
- * ULONG_MAX/8 ahead of sync_sched_expedited_done, we are ensuring
- * that more than 3.5 billion CPUs would be required to force a
- * counter wrap on a 32-bit system. Quite a few more CPUs would of
- * course be required on a 64-bit system.
- */
- if (ULONG_CMP_GE((ulong)atomic_long_read(&rsp->expedited_start),
- (ulong)atomic_long_read(&rsp->expedited_done) +
- ULONG_MAX / 8)) {
- synchronize_sched();
- atomic_long_inc(&rsp->expedited_wrap);
- return;
- }
-
- /*
- * Take a ticket. Note that atomic_inc_return() implies a
- * full memory barrier.
- */
- snap = atomic_long_inc_return(&rsp->expedited_start);
- firstsnap = snap;
- if (!try_get_online_cpus()) {
- /* CPU hotplug operation in flight, fall back to normal GP. */
- wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_normal);
- return;
- }
- WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
-
- /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
- cma = zalloc_cpumask_var(&cm, GFP_KERNEL);
- if (cma) {
- cpumask_copy(cm, cpu_online_mask);
- cpumask_clear_cpu(raw_smp_processor_id(), cm);
- for_each_cpu(cpu, cm) {
- struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
-
- if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
- cpumask_clear_cpu(cpu, cm);
- }
- if (cpumask_weight(cm) == 0)
- goto all_cpus_idle;
- }
+ unsigned long gp_seq = get_state_synchronize_rcu();
- /*
- * Each pass through the following loop attempts to force a
- * context switch on each CPU.
- */
- while (try_stop_cpus(cma ? cm : cpu_online_mask,
- synchronize_sched_expedited_cpu_stop,
- NULL) == -EAGAIN) {
- put_online_cpus();
- atomic_long_inc(&rsp->expedited_tryfail);
-
- /* Check to see if someone else did our work for us. */
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_workdone1);
- free_cpumask_var(cm);
- return;
- }
+ start_poll_synchronize_rcu_common();
+ return gp_seq;
+}
+EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
- /* No joy, try again later. Or just synchronize_sched(). */
- if (trycount++ < 10) {
- udelay(trycount * num_online_cpus());
- } else {
- wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_normal);
- free_cpumask_var(cm);
- return;
- }
+/**
+ * start_poll_synchronize_rcu_full - Take a full snapshot and start RCU grace period
+ * @rgosp: value from get_state_synchronize_rcu_full() or start_poll_synchronize_rcu_full()
+ *
+ * Places the normal and expedited grace-period states in *@rgos. This
+ * state value can be passed to a later call to cond_synchronize_rcu_full()
+ * or poll_state_synchronize_rcu_full() to determine whether or not a
+ * grace period (whether normal or expedited) has elapsed in the meantime.
+ * If the needed grace period is not already slated to start, notifies
+ * RCU core of the need for that grace period.
+ */
+void start_poll_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+{
+ get_state_synchronize_rcu_full(rgosp);
- /* Recheck to see if someone else did our work for us. */
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_workdone2);
- free_cpumask_var(cm);
- return;
- }
+ start_poll_synchronize_rcu_common();
+}
+EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_full);
- /*
- * Refetching sync_sched_expedited_started allows later
- * callers to piggyback on our grace period. We retry
- * after they started, so our grace period works for them,
- * and they started after our first try, so their grace
- * period works for us.
- */
- if (!try_get_online_cpus()) {
- /* CPU hotplug operation in flight, use normal GP. */
- wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_normal);
- free_cpumask_var(cm);
- return;
- }
- snap = atomic_long_read(&rsp->expedited_start);
- smp_mb(); /* ensure read is before try_stop_cpus(). */
+/**
+ * poll_state_synchronize_rcu - Has the specified RCU grace period completed?
+ * @oldstate: value from get_state_synchronize_rcu() or start_poll_synchronize_rcu()
+ *
+ * If a full RCU grace period has elapsed since the earlier call from
+ * which @oldstate was obtained, return @true, otherwise return @false.
+ * If @false is returned, it is the caller's responsibility to invoke this
+ * function later on until it does return @true. Alternatively, the caller
+ * can explicitly wait for a grace period, for example, by passing @oldstate
+ * to either cond_synchronize_rcu() or cond_synchronize_rcu_expedited()
+ * on the one hand or by directly invoking either synchronize_rcu() or
+ * synchronize_rcu_expedited() on the other.
+ *
+ * Yes, this function does not take counter wrap into account.
+ * But counter wrap is harmless. If the counter wraps, we have waited for
+ * more than a billion grace periods (and way more on a 64-bit system!).
+ * Those needing to keep old state values for very long time periods
+ * (many hours even on 32-bit systems) should check them occasionally and
+ * either refresh them or set a flag indicating that the grace period has
+ * completed. Alternatively, they can use get_completed_synchronize_rcu()
+ * to get a guaranteed-completed grace-period state.
+ *
+ * In addition, because oldstate compresses the grace-period state for
+ * both normal and expedited grace periods into a single unsigned long,
+ * it can miss a grace period when synchronize_rcu() runs concurrently
+ * with synchronize_rcu_expedited(). If this is unacceptable, please
+ * instead use the _full() variant of these polling APIs.
+ *
+ * This function provides the same memory-ordering guarantees that
+ * would be provided by a synchronize_rcu() that was invoked at the call
+ * to the function that provided @oldstate, and that returned at the end
+ * of this function.
+ */
+bool poll_state_synchronize_rcu(unsigned long oldstate)
+{
+ if (oldstate == RCU_GET_STATE_COMPLETED ||
+ rcu_seq_done_exact(&rcu_state.gp_seq_polled, oldstate)) {
+ smp_mb(); /* Ensure GP ends before subsequent accesses. */
+ return true;
}
- atomic_long_inc(&rsp->expedited_stoppedcpus);
+ return false;
+}
+EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu);
-all_cpus_idle:
- free_cpumask_var(cm);
+/**
+ * poll_state_synchronize_rcu_full - Has the specified RCU grace period completed?
+ * @rgosp: value from get_state_synchronize_rcu_full() or start_poll_synchronize_rcu_full()
+ *
+ * If a full RCU grace period has elapsed since the earlier call from
+ * which *rgosp was obtained, return @true, otherwise return @false.
+ * If @false is returned, it is the caller's responsibility to invoke this
+ * function later on until it does return @true. Alternatively, the caller
+ * can explicitly wait for a grace period, for example, by passing @rgosp
+ * to cond_synchronize_rcu() or by directly invoking synchronize_rcu().
+ *
+ * Yes, this function does not take counter wrap into account.
+ * But counter wrap is harmless. If the counter wraps, we have waited
+ * for more than a billion grace periods (and way more on a 64-bit
+ * system!). Those needing to keep rcu_gp_oldstate values for very
+ * long time periods (many hours even on 32-bit systems) should check
+ * them occasionally and either refresh them or set a flag indicating
+ * that the grace period has completed. Alternatively, they can use
+ * get_completed_synchronize_rcu_full() to get a guaranteed-completed
+ * grace-period state.
+ *
+ * This function provides the same memory-ordering guarantees that would
+ * be provided by a synchronize_rcu() that was invoked at the call to
+ * the function that provided @rgosp, and that returned at the end of this
+ * function. And this guarantee requires that the root rcu_node structure's
+ * ->gp_seq field be checked instead of that of the rcu_state structure.
+ * The problem is that the just-ending grace-period's callbacks can be
+ * invoked between the time that the root rcu_node structure's ->gp_seq
+ * field is updated and the time that the rcu_state structure's ->gp_seq
+ * field is updated. Therefore, if a single synchronize_rcu() is to
+ * cause a subsequent poll_state_synchronize_rcu_full() to return @true,
+ * then the root rcu_node structure is the one that needs to be polled.
+ */
+bool poll_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+{
+ struct rcu_node *rnp = rcu_get_root();
+
+ smp_mb(); // Order against root rcu_node structure grace-period cleanup.
+ if (rgosp->rgos_norm == RCU_GET_STATE_COMPLETED ||
+ rcu_seq_done_exact(&rnp->gp_seq, rgosp->rgos_norm) ||
+ rgosp->rgos_exp == RCU_GET_STATE_COMPLETED ||
+ rcu_seq_done_exact(&rcu_state.expedited_sequence, rgosp->rgos_exp)) {
+ smp_mb(); /* Ensure GP ends before subsequent accesses. */
+ return true;
+ }
+ return false;
+}
+EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu_full);
- /*
- * Everyone up to our most recent fetch is covered by our grace
- * period. Update the counter, but only if our work is still
- * relevant -- which it won't be if someone who started later
- * than we did already did their update.
- */
- do {
- atomic_long_inc(&rsp->expedited_done_tries);
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)snap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_done_lost);
- break;
- }
- } while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s);
- atomic_long_inc(&rsp->expedited_done_exit);
+/**
+ * cond_synchronize_rcu - Conditionally wait for an RCU grace period
+ * @oldstate: value from get_state_synchronize_rcu(), start_poll_synchronize_rcu(), or start_poll_synchronize_rcu_expedited()
+ *
+ * If a full RCU grace period has elapsed since the earlier call to
+ * get_state_synchronize_rcu() or start_poll_synchronize_rcu(), just return.
+ * Otherwise, invoke synchronize_rcu() to wait for a full grace period.
+ *
+ * Yes, this function does not take counter wrap into account.
+ * But counter wrap is harmless. If the counter wraps, we have waited for
+ * more than 2 billion grace periods (and way more on a 64-bit system!),
+ * so waiting for a couple of additional grace periods should be just fine.
+ *
+ * This function provides the same memory-ordering guarantees that
+ * would be provided by a synchronize_rcu() that was invoked at the call
+ * to the function that provided @oldstate and that returned at the end
+ * of this function.
+ */
+void cond_synchronize_rcu(unsigned long oldstate)
+{
+ if (!poll_state_synchronize_rcu(oldstate))
+ synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(cond_synchronize_rcu);
- put_online_cpus();
+/**
+ * cond_synchronize_rcu_full - Conditionally wait for an RCU grace period
+ * @rgosp: value from get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(), or start_poll_synchronize_rcu_expedited_full()
+ *
+ * If a full RCU grace period has elapsed since the call to
+ * get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(),
+ * or start_poll_synchronize_rcu_expedited_full() from which @rgosp was
+ * obtained, just return. Otherwise, invoke synchronize_rcu() to wait
+ * for a full grace period.
+ *
+ * Yes, this function does not take counter wrap into account.
+ * But counter wrap is harmless. If the counter wraps, we have waited for
+ * more than 2 billion grace periods (and way more on a 64-bit system!),
+ * so waiting for a couple of additional grace periods should be just fine.
+ *
+ * This function provides the same memory-ordering guarantees that
+ * would be provided by a synchronize_rcu() that was invoked at the call
+ * to the function that provided @rgosp and that returned at the end of
+ * this function.
+ */
+void cond_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+{
+ if (!poll_state_synchronize_rcu_full(rgosp))
+ synchronize_rcu();
}
-EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
+EXPORT_SYMBOL_GPL(cond_synchronize_rcu_full);
/*
- * Check to see if there is any immediate RCU-related work to be done
- * by the current CPU, for the specified type of RCU, returning 1 if so.
- * The checks are in order of increasing expense: checks that can be
- * carried out against CPU-local state are performed first. However,
- * we must check for CPU stalls first, else we might not get a chance.
+ * Check to see if there is any immediate RCU-related work to be done by
+ * the current CPU, returning 1 if so and zero otherwise. The checks are
+ * in order of increasing expense: checks that can be carried out against
+ * CPU-local state are performed first. However, we must check for CPU
+ * stalls first, else we might not get a chance.
*/
-static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
+static int rcu_pending(int user)
{
+ bool gp_in_progress;
+ struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
struct rcu_node *rnp = rdp->mynode;
- rdp->n_rcu_pending++;
+ lockdep_assert_irqs_disabled();
/* Check for CPU stalls, if enabled. */
- check_cpu_stall(rsp, rdp);
+ check_cpu_stall(rdp);
- /* Is this CPU a NO_HZ_FULL CPU that should ignore RCU? */
- if (rcu_nohz_full_cpu(rsp))
+ /* Does this CPU need a deferred NOCB wakeup? */
+ if (rcu_nocb_need_deferred_wakeup(rdp, RCU_NOCB_WAKE))
+ return 1;
+
+ /* Is this a nohz_full CPU in userspace or idle? (Ignore RCU if so.) */
+ gp_in_progress = rcu_gp_in_progress();
+ if ((user || rcu_is_cpu_rrupt_from_idle() ||
+ (gp_in_progress &&
+ time_before(jiffies, READ_ONCE(rcu_state.gp_start) +
+ nohz_full_patience_delay_jiffies))) &&
+ rcu_nohz_full_cpu())
return 0;
/* Is the RCU core waiting for a quiescent state from this CPU? */
- if (rcu_scheduler_fully_active &&
- rdp->qs_pending && !rdp->passed_quiesce &&
- rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) {
- rdp->n_rp_qs_pending++;
- } else if (rdp->qs_pending &&
- (rdp->passed_quiesce ||
- rdp->rcu_qs_ctr_snap != __this_cpu_read(rcu_qs_ctr))) {
- rdp->n_rp_report_qs++;
+ if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm && gp_in_progress)
return 1;
- }
/* Does this CPU have callbacks ready to invoke? */
- if (cpu_has_callbacks_ready_to_invoke(rdp)) {
- rdp->n_rp_cb_ready++;
+ if (!rcu_rdp_is_offloaded(rdp) &&
+ rcu_segcblist_ready_cbs(&rdp->cblist))
return 1;
- }
/* Has RCU gone idle with this CPU needing another grace period? */
- if (cpu_needs_another_gp(rsp, rdp)) {
- rdp->n_rp_cpu_needs_gp++;
+ if (!gp_in_progress && rcu_segcblist_is_enabled(&rdp->cblist) &&
+ !rcu_rdp_is_offloaded(rdp) &&
+ !rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
return 1;
- }
- /* Has another RCU grace period completed? */
- if (ACCESS_ONCE(rnp->completed) != rdp->completed) { /* outside lock */
- rdp->n_rp_gp_completed++;
+ /* Have RCU grace period completed or started? */
+ if (rcu_seq_current(&rnp->gp_seq) != rdp->gp_seq ||
+ unlikely(READ_ONCE(rdp->gpwrap))) /* outside lock */
return 1;
- }
-
- /* Has a new RCU grace period started? */
- if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum ||
- unlikely(ACCESS_ONCE(rdp->gpwrap))) { /* outside lock */
- rdp->n_rp_gp_started++;
- return 1;
- }
-
- /* Does this CPU need a deferred NOCB wakeup? */
- if (rcu_nocb_need_deferred_wakeup(rdp)) {
- rdp->n_rp_nocb_defer_wakeup++;
- return 1;
- }
/* nothing to do */
- rdp->n_rp_need_nothing++;
return 0;
}
/*
- * Check to see if there is any immediate RCU-related work to be done
- * by the current CPU, returning 1 if so. This function is part of the
- * RCU implementation; it is -not- an exported member of the RCU API.
+ * Helper function for rcu_barrier() tracing. If tracing is disabled,
+ * the compiler is expected to optimize this away.
*/
-static int rcu_pending(void)
+static void rcu_barrier_trace(const char *s, int cpu, unsigned long done)
{
- struct rcu_state *rsp;
-
- for_each_rcu_flavor(rsp)
- if (__rcu_pending(rsp, this_cpu_ptr(rsp->rda)))
- return 1;
- return 0;
+ trace_rcu_barrier(rcu_state.name, s, cpu,
+ atomic_read(&rcu_state.barrier_cpu_count), done);
}
/*
- * Return true if the specified CPU has any callback. If all_lazy is
- * non-NULL, store an indication of whether all callbacks are lazy.
- * (If there are no callbacks, all of them are deemed to be lazy.)
+ * RCU callback function for rcu_barrier(). If we are last, wake
+ * up the task executing rcu_barrier().
+ *
+ * Note that the value of rcu_state.barrier_sequence must be captured
+ * before the atomic_dec_and_test(). Otherwise, if this CPU is not last,
+ * other CPUs might count the value down to zero before this CPU gets
+ * around to invoking rcu_barrier_trace(), which might result in bogus
+ * data from the next instance of rcu_barrier().
*/
-static int __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy)
+static void rcu_barrier_callback(struct rcu_head *rhp)
{
- bool al = true;
- bool hc = false;
- struct rcu_data *rdp;
- struct rcu_state *rsp;
+ unsigned long __maybe_unused s = rcu_state.barrier_sequence;
- for_each_rcu_flavor(rsp) {
- rdp = this_cpu_ptr(rsp->rda);
- if (!rdp->nxtlist)
- continue;
- hc = true;
- if (rdp->qlen != rdp->qlen_lazy || !all_lazy) {
- al = false;
- break;
- }
+ rhp->next = rhp; // Mark the callback as having been invoked.
+ if (atomic_dec_and_test(&rcu_state.barrier_cpu_count)) {
+ rcu_barrier_trace(TPS("LastCB"), -1, s);
+ complete(&rcu_state.barrier_completion);
+ } else {
+ rcu_barrier_trace(TPS("CB"), -1, s);
}
- if (all_lazy)
- *all_lazy = al;
- return hc;
-}
-
-/*
- * Helper function for _rcu_barrier() tracing. If tracing is disabled,
- * the compiler is expected to optimize this away.
- */
-static void _rcu_barrier_trace(struct rcu_state *rsp, const char *s,
- int cpu, unsigned long done)
-{
- trace_rcu_barrier(rsp->name, s, cpu,
- atomic_read(&rsp->barrier_cpu_count), done);
}
/*
- * RCU callback function for _rcu_barrier(). If we are last, wake
- * up the task executing _rcu_barrier().
+ * If needed, entrain an rcu_barrier() callback on rdp->cblist.
*/
-static void rcu_barrier_callback(struct rcu_head *rhp)
+static void rcu_barrier_entrain(struct rcu_data *rdp)
{
- struct rcu_data *rdp = container_of(rhp, struct rcu_data, barrier_head);
- struct rcu_state *rsp = rdp->rsp;
+ unsigned long gseq = READ_ONCE(rcu_state.barrier_sequence);
+ unsigned long lseq = READ_ONCE(rdp->barrier_seq_snap);
+ bool wake_nocb = false;
+ bool was_alldone = false;
- if (atomic_dec_and_test(&rsp->barrier_cpu_count)) {
- _rcu_barrier_trace(rsp, "LastCB", -1, rsp->n_barrier_done);
- complete(&rsp->barrier_completion);
+ lockdep_assert_held(&rcu_state.barrier_lock);
+ if (rcu_seq_state(lseq) || !rcu_seq_state(gseq) || rcu_seq_ctr(lseq) != rcu_seq_ctr(gseq))
+ return;
+ rcu_barrier_trace(TPS("IRQ"), -1, rcu_state.barrier_sequence);
+ rdp->barrier_head.func = rcu_barrier_callback;
+ debug_rcu_head_queue(&rdp->barrier_head);
+ rcu_nocb_lock(rdp);
+ /*
+ * Flush bypass and wakeup rcuog if we add callbacks to an empty regular
+ * queue. This way we don't wait for bypass timer that can reach seconds
+ * if it's fully lazy.
+ */
+ was_alldone = rcu_rdp_is_offloaded(rdp) && !rcu_segcblist_pend_cbs(&rdp->cblist);
+ WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false));
+ wake_nocb = was_alldone && rcu_segcblist_pend_cbs(&rdp->cblist);
+ if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head)) {
+ atomic_inc(&rcu_state.barrier_cpu_count);
} else {
- _rcu_barrier_trace(rsp, "CB", -1, rsp->n_barrier_done);
+ debug_rcu_head_unqueue(&rdp->barrier_head);
+ rcu_barrier_trace(TPS("IRQNQ"), -1, rcu_state.barrier_sequence);
}
+ rcu_nocb_unlock(rdp);
+ if (wake_nocb)
+ wake_nocb_gp(rdp, false);
+ smp_store_release(&rdp->barrier_seq_snap, gseq);
}
/*
* Called with preemption disabled, and from cross-cpu IRQ context.
*/
-static void rcu_barrier_func(void *type)
+static void rcu_barrier_handler(void *cpu_in)
{
- struct rcu_state *rsp = type;
- struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
-
- _rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done);
- atomic_inc(&rsp->barrier_cpu_count);
- rsp->call(&rdp->barrier_head, rcu_barrier_callback);
+ uintptr_t cpu = (uintptr_t)cpu_in;
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+
+ lockdep_assert_irqs_disabled();
+ WARN_ON_ONCE(cpu != rdp->cpu);
+ WARN_ON_ONCE(cpu != smp_processor_id());
+ raw_spin_lock(&rcu_state.barrier_lock);
+ rcu_barrier_entrain(rdp);
+ raw_spin_unlock(&rcu_state.barrier_lock);
}
-/*
- * Orchestrate the specified type of RCU barrier, waiting for all
- * RCU callbacks of the specified type to complete.
+/**
+ * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
+ *
+ * Note that this primitive does not necessarily wait for an RCU grace period
+ * to complete. For example, if there are no RCU callbacks queued anywhere
+ * in the system, then rcu_barrier() is within its rights to return
+ * immediately, without waiting for anything, much less an RCU grace period.
+ * In fact, rcu_barrier() will normally not result in any RCU grace periods
+ * beyond those that were already destined to be executed.
+ *
+ * In kernels built with CONFIG_RCU_LAZY=y, this function also hurries all
+ * pending lazy RCU callbacks.
*/
-static void _rcu_barrier(struct rcu_state *rsp)
+void rcu_barrier(void)
{
- int cpu;
+ uintptr_t cpu;
+ unsigned long flags;
+ unsigned long gseq;
struct rcu_data *rdp;
- unsigned long snap = ACCESS_ONCE(rsp->n_barrier_done);
- unsigned long snap_done;
+ unsigned long s = rcu_seq_snap(&rcu_state.barrier_sequence);
- _rcu_barrier_trace(rsp, "Begin", -1, snap);
+ rcu_barrier_trace(TPS("Begin"), -1, s);
/* Take mutex to serialize concurrent rcu_barrier() requests. */
- mutex_lock(&rsp->barrier_mutex);
-
- /*
- * Ensure that all prior references, including to ->n_barrier_done,
- * are ordered before the _rcu_barrier() machinery.
- */
- smp_mb(); /* See above block comment. */
-
- /*
- * Recheck ->n_barrier_done to see if others did our work for us.
- * This means checking ->n_barrier_done for an even-to-odd-to-even
- * transition. The "if" expression below therefore rounds the old
- * value up to the next even number and adds two before comparing.
- */
- snap_done = rsp->n_barrier_done;
- _rcu_barrier_trace(rsp, "Check", -1, snap_done);
+ mutex_lock(&rcu_state.barrier_mutex);
- /*
- * If the value in snap is odd, we needed to wait for the current
- * rcu_barrier() to complete, then wait for the next one, in other
- * words, we need the value of snap_done to be three larger than
- * the value of snap. On the other hand, if the value in snap is
- * even, we only had to wait for the next rcu_barrier() to complete,
- * in other words, we need the value of snap_done to be only two
- * greater than the value of snap. The "(snap + 3) & ~0x1" computes
- * this for us (thank you, Linus!).
- */
- if (ULONG_CMP_GE(snap_done, (snap + 3) & ~0x1)) {
- _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done);
+ /* Did someone else do our work for us? */
+ if (rcu_seq_done(&rcu_state.barrier_sequence, s)) {
+ rcu_barrier_trace(TPS("EarlyExit"), -1, rcu_state.barrier_sequence);
smp_mb(); /* caller's subsequent code after above check. */
- mutex_unlock(&rsp->barrier_mutex);
+ mutex_unlock(&rcu_state.barrier_mutex);
return;
}
- /*
- * Increment ->n_barrier_done to avoid duplicate work. Use
- * ACCESS_ONCE() to prevent the compiler from speculating
- * the increment to precede the early-exit check.
- */
- ACCESS_ONCE(rsp->n_barrier_done) = rsp->n_barrier_done + 1;
- WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1);
- _rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done);
- smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */
+ /* Mark the start of the barrier operation. */
+ raw_spin_lock_irqsave(&rcu_state.barrier_lock, flags);
+ rcu_seq_start(&rcu_state.barrier_sequence);
+ gseq = rcu_state.barrier_sequence;
+ rcu_barrier_trace(TPS("Inc1"), -1, rcu_state.barrier_sequence);
/*
- * Initialize the count to one rather than to zero in order to
- * avoid a too-soon return to zero in case of a short grace period
- * (or preemption of this task). Exclude CPU-hotplug operations
- * to ensure that no offline CPU has callbacks queued.
+ * Initialize the count to two rather than to zero in order
+ * to avoid a too-soon return to zero in case of an immediate
+ * invocation of the just-enqueued callback (or preemption of
+ * this task). Exclude CPU-hotplug operations to ensure that no
+ * offline non-offloaded CPU has callbacks queued.
*/
- init_completion(&rsp->barrier_completion);
- atomic_set(&rsp->barrier_cpu_count, 1);
- get_online_cpus();
+ init_completion(&rcu_state.barrier_completion);
+ atomic_set(&rcu_state.barrier_cpu_count, 2);
+ raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags);
/*
* Force each CPU with callbacks to register a new callback.
@@ -3630,90 +3846,279 @@ static void _rcu_barrier(struct rcu_state *rsp)
* corresponding CPU's preceding callbacks have been invoked.
*/
for_each_possible_cpu(cpu) {
- if (!cpu_online(cpu) && !rcu_is_nocb_cpu(cpu))
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+retry:
+ if (smp_load_acquire(&rdp->barrier_seq_snap) == gseq)
+ continue;
+ raw_spin_lock_irqsave(&rcu_state.barrier_lock, flags);
+ if (!rcu_segcblist_n_cbs(&rdp->cblist)) {
+ WRITE_ONCE(rdp->barrier_seq_snap, gseq);
+ raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags);
+ rcu_barrier_trace(TPS("NQ"), cpu, rcu_state.barrier_sequence);
continue;
- rdp = per_cpu_ptr(rsp->rda, cpu);
- if (rcu_is_nocb_cpu(cpu)) {
- if (!rcu_nocb_cpu_needs_barrier(rsp, cpu)) {
- _rcu_barrier_trace(rsp, "OfflineNoCB", cpu,
- rsp->n_barrier_done);
- } else {
- _rcu_barrier_trace(rsp, "OnlineNoCB", cpu,
- rsp->n_barrier_done);
- smp_mb__before_atomic();
- atomic_inc(&rsp->barrier_cpu_count);
- __call_rcu(&rdp->barrier_head,
- rcu_barrier_callback, rsp, cpu, 0);
- }
- } else if (ACCESS_ONCE(rdp->qlen)) {
- _rcu_barrier_trace(rsp, "OnlineQ", cpu,
- rsp->n_barrier_done);
- smp_call_function_single(cpu, rcu_barrier_func, rsp, 1);
- } else {
- _rcu_barrier_trace(rsp, "OnlineNQ", cpu,
- rsp->n_barrier_done);
}
+ if (!rcu_rdp_cpu_online(rdp)) {
+ rcu_barrier_entrain(rdp);
+ WARN_ON_ONCE(READ_ONCE(rdp->barrier_seq_snap) != gseq);
+ raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags);
+ rcu_barrier_trace(TPS("OfflineNoCBQ"), cpu, rcu_state.barrier_sequence);
+ continue;
+ }
+ raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags);
+ if (smp_call_function_single(cpu, rcu_barrier_handler, (void *)cpu, 1)) {
+ schedule_timeout_uninterruptible(1);
+ goto retry;
+ }
+ WARN_ON_ONCE(READ_ONCE(rdp->barrier_seq_snap) != gseq);
+ rcu_barrier_trace(TPS("OnlineQ"), cpu, rcu_state.barrier_sequence);
}
- put_online_cpus();
/*
* Now that we have an rcu_barrier_callback() callback on each
* CPU, and thus each counted, remove the initial count.
*/
- if (atomic_dec_and_test(&rsp->barrier_cpu_count))
- complete(&rsp->barrier_completion);
-
- /* Increment ->n_barrier_done to prevent duplicate work. */
- smp_mb(); /* Keep increment after above mechanism. */
- ACCESS_ONCE(rsp->n_barrier_done) = rsp->n_barrier_done + 1;
- WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0);
- _rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done);
- smp_mb(); /* Keep increment before caller's subsequent code. */
+ if (atomic_sub_and_test(2, &rcu_state.barrier_cpu_count))
+ complete(&rcu_state.barrier_completion);
/* Wait for all rcu_barrier_callback() callbacks to be invoked. */
- wait_for_completion(&rsp->barrier_completion);
+ wait_for_completion(&rcu_state.barrier_completion);
+
+ /* Mark the end of the barrier operation. */
+ rcu_barrier_trace(TPS("Inc2"), -1, rcu_state.barrier_sequence);
+ rcu_seq_end(&rcu_state.barrier_sequence);
+ gseq = rcu_state.barrier_sequence;
+ for_each_possible_cpu(cpu) {
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+
+ WRITE_ONCE(rdp->barrier_seq_snap, gseq);
+ }
/* Other rcu_barrier() invocations can now safely proceed. */
- mutex_unlock(&rsp->barrier_mutex);
+ mutex_unlock(&rcu_state.barrier_mutex);
}
+EXPORT_SYMBOL_GPL(rcu_barrier);
+
+static unsigned long rcu_barrier_last_throttle;
/**
- * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete.
+ * rcu_barrier_throttled - Do rcu_barrier(), but limit to one per second
+ *
+ * This can be thought of as guard rails around rcu_barrier() that
+ * permits unrestricted userspace use, at least assuming the hardware's
+ * try_cmpxchg() is robust. There will be at most one call per second to
+ * rcu_barrier() system-wide from use of this function, which means that
+ * callers might needlessly wait a second or three.
+ *
+ * This is intended for use by test suites to avoid OOM by flushing RCU
+ * callbacks from the previous test before starting the next. See the
+ * rcutree.do_rcu_barrier module parameter for more information.
+ *
+ * Why not simply make rcu_barrier() more scalable? That might be
+ * the eventual endpoint, but let's keep it simple for the time being.
+ * Note that the module parameter infrastructure serializes calls to a
+ * given .set() function, but should concurrent .set() invocation ever be
+ * possible, we are ready!
*/
-void rcu_barrier_bh(void)
+static void rcu_barrier_throttled(void)
{
- _rcu_barrier(&rcu_bh_state);
+ unsigned long j = jiffies;
+ unsigned long old = READ_ONCE(rcu_barrier_last_throttle);
+ unsigned long s = rcu_seq_snap(&rcu_state.barrier_sequence);
+
+ while (time_in_range(j, old, old + HZ / 16) ||
+ !try_cmpxchg(&rcu_barrier_last_throttle, &old, j)) {
+ schedule_timeout_idle(HZ / 16);
+ if (rcu_seq_done(&rcu_state.barrier_sequence, s)) {
+ smp_mb(); /* caller's subsequent code after above check. */
+ return;
+ }
+ j = jiffies;
+ old = READ_ONCE(rcu_barrier_last_throttle);
+ }
+ rcu_barrier();
}
-EXPORT_SYMBOL_GPL(rcu_barrier_bh);
-/**
- * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks.
+/*
+ * Invoke rcu_barrier_throttled() when a rcutree.do_rcu_barrier
+ * request arrives. We insist on a true value to allow for possible
+ * future expansion.
+ */
+static int param_set_do_rcu_barrier(const char *val, const struct kernel_param *kp)
+{
+ bool b;
+ int ret;
+
+ if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING)
+ return -EAGAIN;
+ ret = kstrtobool(val, &b);
+ if (!ret && b) {
+ atomic_inc((atomic_t *)kp->arg);
+ rcu_barrier_throttled();
+ atomic_dec((atomic_t *)kp->arg);
+ }
+ return ret;
+}
+
+/*
+ * Output the number of outstanding rcutree.do_rcu_barrier requests.
+ */
+static int param_get_do_rcu_barrier(char *buffer, const struct kernel_param *kp)
+{
+ return sprintf(buffer, "%d\n", atomic_read((atomic_t *)kp->arg));
+}
+
+static const struct kernel_param_ops do_rcu_barrier_ops = {
+ .set = param_set_do_rcu_barrier,
+ .get = param_get_do_rcu_barrier,
+};
+static atomic_t do_rcu_barrier;
+module_param_cb(do_rcu_barrier, &do_rcu_barrier_ops, &do_rcu_barrier, 0644);
+
+/*
+ * Compute the mask of online CPUs for the specified rcu_node structure.
+ * This will not be stable unless the rcu_node structure's ->lock is
+ * held, but the bit corresponding to the current CPU will be stable
+ * in most contexts.
+ */
+static unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp)
+{
+ return READ_ONCE(rnp->qsmaskinitnext);
+}
+
+/*
+ * Is the CPU corresponding to the specified rcu_data structure online
+ * from RCU's perspective? This perspective is given by that structure's
+ * ->qsmaskinitnext field rather than by the global cpu_online_mask.
+ */
+static bool rcu_rdp_cpu_online(struct rcu_data *rdp)
+{
+ return !!(rdp->grpmask & rcu_rnp_online_cpus(rdp->mynode));
+}
+
+bool rcu_cpu_online(int cpu)
+{
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+
+ return rcu_rdp_cpu_online(rdp);
+}
+
+#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
+
+/*
+ * Is the current CPU online as far as RCU is concerned?
+ *
+ * Disable preemption to avoid false positives that could otherwise
+ * happen due to the current CPU number being sampled, this task being
+ * preempted, its old CPU being taken offline, resuming on some other CPU,
+ * then determining that its old CPU is now offline.
+ *
+ * Disable checking if in an NMI handler because we cannot safely
+ * report errors from NMI handlers anyway. In addition, it is OK to use
+ * RCU on an offline processor during initial boot, hence the check for
+ * rcu_scheduler_fully_active.
+ */
+bool notrace rcu_lockdep_current_cpu_online(void)
+{
+ struct rcu_data *rdp;
+ bool ret = false;
+
+ if (in_nmi() || !rcu_scheduler_fully_active)
+ return true;
+ preempt_disable_notrace();
+ rdp = this_cpu_ptr(&rcu_data);
+ /*
+ * Strictly, we care here about the case where the current CPU is
+ * in rcutree_report_cpu_starting() and thus has an excuse for rdp->grpmask
+ * not being up to date. So arch_spin_is_locked() might have a
+ * false positive if it's held by some *other* CPU, but that's
+ * OK because that just means a false *negative* on the warning.
+ */
+ if (rcu_rdp_cpu_online(rdp) || arch_spin_is_locked(&rcu_state.ofl_lock))
+ ret = true;
+ preempt_enable_notrace();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
+
+#endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */
+
+// Has rcu_init() been invoked? This is used (for example) to determine
+// whether spinlocks may be acquired safely.
+static bool rcu_init_invoked(void)
+{
+ return !!READ_ONCE(rcu_state.n_online_cpus);
+}
+
+/*
+ * All CPUs for the specified rcu_node structure have gone offline,
+ * and all tasks that were preempted within an RCU read-side critical
+ * section while running on one of those CPUs have since exited their RCU
+ * read-side critical section. Some other CPU is reporting this fact with
+ * the specified rcu_node structure's ->lock held and interrupts disabled.
+ * This function therefore goes up the tree of rcu_node structures,
+ * clearing the corresponding bits in the ->qsmaskinit fields. Note that
+ * the leaf rcu_node structure's ->qsmaskinit field has already been
+ * updated.
+ *
+ * This function does check that the specified rcu_node structure has
+ * all CPUs offline and no blocked tasks, so it is OK to invoke it
+ * prematurely. That said, invoking it after the fact will cost you
+ * a needless lock acquisition. So once it has done its work, don't
+ * invoke it again.
*/
-void rcu_barrier_sched(void)
+static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
{
- _rcu_barrier(&rcu_sched_state);
+ long mask;
+ struct rcu_node *rnp = rnp_leaf;
+
+ raw_lockdep_assert_held_rcu_node(rnp_leaf);
+ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
+ WARN_ON_ONCE(rnp_leaf->qsmaskinit) ||
+ WARN_ON_ONCE(rcu_preempt_has_tasks(rnp_leaf)))
+ return;
+ for (;;) {
+ mask = rnp->grpmask;
+ rnp = rnp->parent;
+ if (!rnp)
+ break;
+ raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
+ rnp->qsmaskinit &= ~mask;
+ /* Between grace periods, so better already be zero! */
+ WARN_ON_ONCE(rnp->qsmask);
+ if (rnp->qsmaskinit) {
+ raw_spin_unlock_rcu_node(rnp);
+ /* irqs remain disabled. */
+ return;
+ }
+ raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
+ }
}
-EXPORT_SYMBOL_GPL(rcu_barrier_sched);
/*
* Propagate ->qsinitmask bits up the rcu_node tree to account for the
* first CPU in a given leaf rcu_node structure coming online. The caller
- * must hold the corresponding leaf rcu_node ->lock with interrrupts
+ * must hold the corresponding leaf rcu_node ->lock with interrupts
* disabled.
*/
static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
{
long mask;
+ long oldmask;
struct rcu_node *rnp = rnp_leaf;
+ raw_lockdep_assert_held_rcu_node(rnp_leaf);
+ WARN_ON_ONCE(rnp->wait_blkd_tasks);
for (;;) {
mask = rnp->grpmask;
rnp = rnp->parent;
if (rnp == NULL)
return;
- raw_spin_lock(&rnp->lock); /* Interrupts already disabled. */
+ raw_spin_lock_rcu_node(rnp); /* Interrupts already disabled. */
+ oldmask = rnp->qsmaskinit;
rnp->qsmaskinit |= mask;
- raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */
+ raw_spin_unlock_rcu_node(rnp); /* Interrupts remain disabled. */
+ if (oldmask)
+ return;
}
}
@@ -3721,51 +4126,126 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
* Do boot-time initialization of a CPU's per-CPU RCU data.
*/
static void __init
-rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
+rcu_boot_init_percpu_data(int cpu)
{
- unsigned long flags;
- struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
- struct rcu_node *rnp = rcu_get_root(rsp);
+ struct context_tracking *ct = this_cpu_ptr(&context_tracking);
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
/* Set up local state, ensuring consistent view of global state. */
- raw_spin_lock_irqsave(&rnp->lock, flags);
- rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
- rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
- WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
- WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
+ rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu);
+ INIT_WORK(&rdp->strict_work, strict_work_handler);
+ WARN_ON_ONCE(ct->nesting != 1);
+ WARN_ON_ONCE(rcu_watching_snap_in_eqs(ct_rcu_watching_cpu(cpu)));
+ rdp->barrier_seq_snap = rcu_state.barrier_sequence;
+ rdp->rcu_ofl_gp_seq = rcu_state.gp_seq;
+ rdp->rcu_ofl_gp_state = RCU_GP_CLEANED;
+ rdp->rcu_onl_gp_seq = rcu_state.gp_seq;
+ rdp->rcu_onl_gp_state = RCU_GP_CLEANED;
+ rdp->last_sched_clock = jiffies;
rdp->cpu = cpu;
- rdp->rsp = rsp;
rcu_boot_init_nocb_percpu_data(rdp);
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+}
+
+static void rcu_thread_affine_rnp(struct task_struct *t, struct rcu_node *rnp)
+{
+ cpumask_var_t affinity;
+ int cpu;
+
+ if (!zalloc_cpumask_var(&affinity, GFP_KERNEL))
+ return;
+
+ for_each_leaf_node_possible_cpu(rnp, cpu)
+ cpumask_set_cpu(cpu, affinity);
+
+ kthread_affine_preferred(t, affinity);
+
+ free_cpumask_var(affinity);
+}
+
+struct kthread_worker *rcu_exp_gp_kworker;
+
+static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp)
+{
+ struct kthread_worker *kworker;
+ const char *name = "rcu_exp_par_gp_kthread_worker/%d";
+ struct sched_param param = { .sched_priority = kthread_prio };
+ int rnp_index = rnp - rcu_get_root();
+
+ if (rnp->exp_kworker)
+ return;
+
+ kworker = kthread_create_worker(0, name, rnp_index);
+ if (IS_ERR_OR_NULL(kworker)) {
+ pr_err("Failed to create par gp kworker on %d/%d\n",
+ rnp->grplo, rnp->grphi);
+ return;
+ }
+ WRITE_ONCE(rnp->exp_kworker, kworker);
+
+ if (IS_ENABLED(CONFIG_RCU_EXP_KTHREAD))
+ sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, &param);
+
+ rcu_thread_affine_rnp(kworker->task, rnp);
+ wake_up_process(kworker->task);
+}
+
+static void __init rcu_start_exp_gp_kworker(void)
+{
+ const char *name = "rcu_exp_gp_kthread_worker";
+ struct sched_param param = { .sched_priority = kthread_prio };
+
+ rcu_exp_gp_kworker = kthread_run_worker(0, name);
+ if (IS_ERR_OR_NULL(rcu_exp_gp_kworker)) {
+ pr_err("Failed to create %s!\n", name);
+ rcu_exp_gp_kworker = NULL;
+ return;
+ }
+
+ if (IS_ENABLED(CONFIG_RCU_EXP_KTHREAD))
+ sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, &param);
+}
+
+static void rcu_spawn_rnp_kthreads(struct rcu_node *rnp)
+{
+ if (rcu_scheduler_fully_active) {
+ mutex_lock(&rnp->kthread_mutex);
+ rcu_spawn_one_boost_kthread(rnp);
+ rcu_spawn_exp_par_gp_kworker(rnp);
+ mutex_unlock(&rnp->kthread_mutex);
+ }
}
/*
- * Initialize a CPU's per-CPU RCU data. Note that only one online or
- * offline event can be happening at a given time. Note also that we
- * can accept some slop in the rsp->completed access due to the fact
- * that this CPU cannot possibly have any RCU callbacks in flight yet.
+ * Invoked early in the CPU-online process, when pretty much all services
+ * are available. The incoming CPU is not present.
+ *
+ * Initializes a CPU's per-CPU RCU data. Note that only one online or
+ * offline event can be happening at a given time. Note also that we can
+ * accept some slop in the rsp->gp_seq access due to the fact that this
+ * CPU cannot possibly have any non-offloaded RCU callbacks in flight yet.
+ * And any offloaded callbacks are being numbered elsewhere.
*/
-static void
-rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
+int rcutree_prepare_cpu(unsigned int cpu)
{
unsigned long flags;
- unsigned long mask;
- struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
- struct rcu_node *rnp = rcu_get_root(rsp);
+ struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu);
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ struct rcu_node *rnp = rcu_get_root();
/* Set up local state, ensuring consistent view of global state. */
- raw_spin_lock_irqsave(&rnp->lock, flags);
- rdp->beenonline = 1; /* We have now been online. */
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
rdp->qlen_last_fqs_check = 0;
- rdp->n_force_qs_snap = rsp->n_force_qs;
+ rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs);
rdp->blimit = blimit;
- if (!rdp->nxtlist)
- init_callback_list(rdp); /* Re-enable callbacks on this CPU. */
- rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
- rcu_sysidle_init_percpu_data(rdp->dynticks);
- atomic_set(&rdp->dynticks->dynticks,
- (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+ ct->nesting = 1; /* CPU not up, no tearing. */
+ raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
+
+ /*
+ * Only non-NOCB CPUs that didn't have early-boot callbacks need to be
+ * (re-)initialized.
+ */
+ if (!rcu_segcblist_is_enabled(&rdp->cblist))
+ rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */
/*
* Add CPU to leaf rcu_node pending-online bitmask. Any needed
@@ -3773,90 +4253,306 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
* of the next grace period.
*/
rnp = rdp->mynode;
+ raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
+ rdp->gp_seq = READ_ONCE(rnp->gp_seq);
+ rdp->gp_seq_needed = rdp->gp_seq;
+ rdp->cpu_no_qs.b.norm = true;
+ rdp->core_needs_qs = false;
+ rdp->rcu_iw_pending = false;
+ rdp->rcu_iw = IRQ_WORK_INIT_HARD(rcu_iw_handler);
+ rdp->rcu_iw_gp_seq = rdp->gp_seq - 1;
+ trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl"));
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+
+ rcu_preempt_deferred_qs_init(rdp);
+ rcu_spawn_rnp_kthreads(rnp);
+ rcu_spawn_cpu_nocb_kthread(cpu);
+ ASSERT_EXCLUSIVE_WRITER(rcu_state.n_online_cpus);
+ WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus + 1);
+
+ return 0;
+}
+
+/*
+ * Has the specified (known valid) CPU ever been fully online?
+ */
+bool rcu_cpu_beenfullyonline(int cpu)
+{
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+
+ return smp_load_acquire(&rdp->beenonline);
+}
+
+/*
+ * Near the end of the CPU-online process. Pretty much all services
+ * enabled, and the CPU is now very much alive.
+ */
+int rcutree_online_cpu(unsigned int cpu)
+{
+ unsigned long flags;
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+ rnp = rdp->mynode;
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ rnp->ffmask |= rdp->grpmask;
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
+ return 0; /* Too early in boot for scheduler work. */
+
+ // Stop-machine done, so allow nohz_full to disable tick.
+ tick_dep_clear(TICK_DEP_BIT_RCU);
+ return 0;
+}
+
+/*
+ * Mark the specified CPU as being online so that subsequent grace periods
+ * (both expedited and normal) will wait on it. Note that this means that
+ * incoming CPUs are not allowed to use RCU read-side critical sections
+ * until this function is called. Failing to observe this restriction
+ * will result in lockdep splats.
+ *
+ * Note that this function is special in that it is invoked directly
+ * from the incoming CPU rather than from the cpuhp_step mechanism.
+ * This is because this function must be invoked at a precise location.
+ * This incoming CPU must not have enabled interrupts yet.
+ *
+ * This mirrors the effects of rcutree_report_cpu_dead().
+ */
+void rcutree_report_cpu_starting(unsigned int cpu)
+{
+ unsigned long mask;
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+ bool newcpu;
+
+ lockdep_assert_irqs_disabled();
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+ if (rdp->cpu_started)
+ return;
+ rdp->cpu_started = true;
+
+ rnp = rdp->mynode;
mask = rdp->grpmask;
- raw_spin_lock(&rnp->lock); /* irqs already disabled. */
- smp_mb__after_unlock_lock();
- rnp->qsmaskinitnext |= mask;
- rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */
- rdp->completed = rnp->completed;
- rdp->passed_quiesce = false;
- rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
- rdp->qs_pending = false;
- trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ arch_spin_lock(&rcu_state.ofl_lock);
+ rcu_watching_online();
+ raw_spin_lock(&rcu_state.barrier_lock);
+ raw_spin_lock_rcu_node(rnp);
+ WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask);
+ raw_spin_unlock(&rcu_state.barrier_lock);
+ newcpu = !(rnp->expmaskinitnext & mask);
+ rnp->expmaskinitnext |= mask;
+ /* Allow lockless access for expedited grace periods. */
+ smp_store_release(&rcu_state.ncpus, rcu_state.ncpus + newcpu); /* ^^^ */
+ ASSERT_EXCLUSIVE_WRITER(rcu_state.ncpus);
+ rcu_gpnum_ovf(rnp, rdp); /* Offline-induced counter wrap? */
+ rdp->rcu_onl_gp_seq = READ_ONCE(rcu_state.gp_seq);
+ rdp->rcu_onl_gp_state = READ_ONCE(rcu_state.gp_state);
+
+ /* An incoming CPU should never be blocking a grace period. */
+ if (WARN_ON_ONCE(rnp->qsmask & mask)) { /* RCU waiting on incoming CPU? */
+ /* rcu_report_qs_rnp() *really* wants some flags to restore */
+ unsigned long flags;
+
+ local_irq_save(flags);
+ rcu_disable_urgency_upon_qs(rdp);
+ /* Report QS -after- changing ->qsmaskinitnext! */
+ rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
+ } else {
+ raw_spin_unlock_rcu_node(rnp);
+ }
+ arch_spin_unlock(&rcu_state.ofl_lock);
+ smp_store_release(&rdp->beenonline, true);
+ smp_mb(); /* Ensure RCU read-side usage follows above initialization. */
}
-static void rcu_prepare_cpu(int cpu)
+/*
+ * The outgoing function has no further need of RCU, so remove it from
+ * the rcu_node tree's ->qsmaskinitnext bit masks.
+ *
+ * Note that this function is special in that it is invoked directly
+ * from the outgoing CPU rather than from the cpuhp_step mechanism.
+ * This is because this function must be invoked at a precise location.
+ *
+ * This mirrors the effect of rcutree_report_cpu_starting().
+ */
+void rcutree_report_cpu_dead(void)
{
- struct rcu_state *rsp;
+ unsigned long flags;
+ unsigned long mask;
+ struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
+ struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
- for_each_rcu_flavor(rsp)
- rcu_init_percpu_data(cpu, rsp);
+ /*
+ * IRQS must be disabled from now on and until the CPU dies, or an interrupt
+ * may introduce a new READ-side while it is actually off the QS masks.
+ */
+ lockdep_assert_irqs_disabled();
+ /*
+ * CPUHP_AP_SMPCFD_DYING was the last call for rcu_exp_handler() execution.
+ * The requested QS must have been reported on the last context switch
+ * from stop machine to idle.
+ */
+ WARN_ON_ONCE(rdp->cpu_no_qs.b.exp);
+ // Do any dangling deferred wakeups.
+ do_nocb_deferred_wakeup(rdp);
+
+ rcu_preempt_deferred_qs(current);
+
+ /* Remove outgoing CPU from mask in the leaf rcu_node structure. */
+ mask = rdp->grpmask;
+
+ /*
+ * Hold the ofl_lock and rnp lock to avoid races between CPU going
+ * offline and doing a QS report (as below), versus rcu_gp_init().
+ * See Requirements.rst > Hotplug CPU > Concurrent QS Reporting section
+ * for more details.
+ */
+ arch_spin_lock(&rcu_state.ofl_lock);
+ raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
+ rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq);
+ rdp->rcu_ofl_gp_state = READ_ONCE(rcu_state.gp_state);
+ if (rnp->qsmask & mask) { /* RCU waiting on outgoing CPU? */
+ /* Report quiescent state -before- changing ->qsmaskinitnext! */
+ rcu_disable_urgency_upon_qs(rdp);
+ rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ }
+ /* Clear from ->qsmaskinitnext to mark offline. */
+ WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext & ~mask);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ arch_spin_unlock(&rcu_state.ofl_lock);
+ rdp->cpu_started = false;
}
+#ifdef CONFIG_HOTPLUG_CPU
/*
- * Handle CPU online/offline notification events.
+ * The outgoing CPU has just passed through the dying-idle state, and we
+ * are being invoked from the CPU that was IPIed to continue the offline
+ * operation. Migrate the outgoing CPU's callbacks to the current CPU.
*/
-int rcu_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
+void rcutree_migrate_callbacks(int cpu)
{
- long cpu = (long)hcpu;
- struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
- struct rcu_node *rnp = rdp->mynode;
- struct rcu_state *rsp;
+ unsigned long flags;
+ struct rcu_data *my_rdp;
+ struct rcu_node *my_rnp;
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ bool needwake;
- switch (action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
- rcu_prepare_cpu(cpu);
- rcu_prepare_kthreads(cpu);
- rcu_spawn_all_nocb_kthreads(cpu);
- break;
- case CPU_ONLINE:
- case CPU_DOWN_FAILED:
- rcu_boost_kthread_setaffinity(rnp, -1);
- break;
- case CPU_DOWN_PREPARE:
- rcu_boost_kthread_setaffinity(rnp, cpu);
- break;
- case CPU_DYING:
- case CPU_DYING_FROZEN:
- for_each_rcu_flavor(rsp)
- rcu_cleanup_dying_cpu(rsp);
- break;
- case CPU_DYING_IDLE:
- for_each_rcu_flavor(rsp) {
- rcu_cleanup_dying_idle_cpu(cpu, rsp);
- }
- break;
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- for_each_rcu_flavor(rsp) {
- rcu_cleanup_dead_cpu(cpu, rsp);
- do_nocb_deferred_wakeup(per_cpu_ptr(rsp->rda, cpu));
- }
- break;
- default:
- break;
+ if (rcu_rdp_is_offloaded(rdp))
+ return;
+
+ raw_spin_lock_irqsave(&rcu_state.barrier_lock, flags);
+ if (rcu_segcblist_empty(&rdp->cblist)) {
+ raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags);
+ return; /* No callbacks to migrate. */
}
- return NOTIFY_OK;
+
+ WARN_ON_ONCE(rcu_rdp_cpu_online(rdp));
+ rcu_barrier_entrain(rdp);
+ my_rdp = this_cpu_ptr(&rcu_data);
+ my_rnp = my_rdp->mynode;
+ rcu_nocb_lock(my_rdp); /* irqs already disabled. */
+ WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies, false));
+ raw_spin_lock_rcu_node(my_rnp); /* irqs already disabled. */
+ /* Leverage recent GPs and set GP for new callbacks. */
+ needwake = rcu_advance_cbs(my_rnp, rdp) ||
+ rcu_advance_cbs(my_rnp, my_rdp);
+ rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist);
+ raw_spin_unlock(&rcu_state.barrier_lock); /* irqs remain disabled. */
+ needwake = needwake || rcu_advance_cbs(my_rnp, my_rdp);
+ rcu_segcblist_disable(&rdp->cblist);
+ WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != !rcu_segcblist_n_cbs(&my_rdp->cblist));
+ check_cb_ovld_locked(my_rdp, my_rnp);
+ if (rcu_rdp_is_offloaded(my_rdp)) {
+ raw_spin_unlock_rcu_node(my_rnp); /* irqs remain disabled. */
+ __call_rcu_nocb_wake(my_rdp, true, flags);
+ } else {
+ rcu_nocb_unlock(my_rdp); /* irqs remain disabled. */
+ raw_spin_unlock_rcu_node(my_rnp); /* irqs remain disabled. */
+ }
+ local_irq_restore(flags);
+ if (needwake)
+ rcu_gp_kthread_wake();
+ lockdep_assert_irqs_enabled();
+ WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 ||
+ !rcu_segcblist_empty(&rdp->cblist),
+ "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n",
+ cpu, rcu_segcblist_n_cbs(&rdp->cblist),
+ rcu_segcblist_first_cb(&rdp->cblist));
+}
+
+/*
+ * The CPU has been completely removed, and some other CPU is reporting
+ * this fact from process context. Do the remainder of the cleanup.
+ * There can only be one CPU hotplug operation at a time, so no need for
+ * explicit locking.
+ */
+int rcutree_dead_cpu(unsigned int cpu)
+{
+ ASSERT_EXCLUSIVE_WRITER(rcu_state.n_online_cpus);
+ WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1);
+ // Stop-machine done, so allow nohz_full to disable tick.
+ tick_dep_clear(TICK_DEP_BIT_RCU);
+ return 0;
}
+/*
+ * Near the end of the offline process. Trace the fact that this CPU
+ * is going offline.
+ */
+int rcutree_dying_cpu(unsigned int cpu)
+{
+ bool blkd;
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ struct rcu_node *rnp = rdp->mynode;
+
+ blkd = !!(READ_ONCE(rnp->qsmask) & rdp->grpmask);
+ trace_rcu_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq),
+ blkd ? TPS("cpuofl-bgp") : TPS("cpuofl"));
+ return 0;
+}
+
+/*
+ * Near the beginning of the process. The CPU is still very much alive
+ * with pretty much all services enabled.
+ */
+int rcutree_offline_cpu(unsigned int cpu)
+{
+ unsigned long flags;
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+ rnp = rdp->mynode;
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ rnp->ffmask &= ~rdp->grpmask;
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+
+ // nohz_full CPUs need the tick for stop-machine to work quickly
+ tick_dep_set(TICK_DEP_BIT_RCU);
+ return 0;
+}
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
+
+/*
+ * On non-huge systems, use expedited RCU grace periods to make suspend
+ * and hibernation run faster.
+ */
static int rcu_pm_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
switch (action) {
case PM_HIBERNATION_PREPARE:
case PM_SUSPEND_PREPARE:
- if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */
- rcu_expedite_gp();
+ rcu_async_hurry();
+ rcu_expedite_gp();
break;
case PM_POST_HIBERNATION:
case PM_POST_SUSPEND:
- if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */
- rcu_unexpedite_gp();
+ rcu_unexpedite_gp();
+ rcu_async_relax();
break;
default:
break;
@@ -3865,140 +4561,121 @@ static int rcu_pm_notify(struct notifier_block *self,
}
/*
- * Spawn the kthreads that handle each RCU flavor's grace periods.
+ * Spawn the kthreads that handle RCU's grace periods.
*/
static int __init rcu_spawn_gp_kthread(void)
{
unsigned long flags;
- int kthread_prio_in = kthread_prio;
struct rcu_node *rnp;
- struct rcu_state *rsp;
struct sched_param sp;
struct task_struct *t;
-
- /* Force priority into range. */
- if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1)
- kthread_prio = 1;
- else if (kthread_prio < 0)
- kthread_prio = 0;
- else if (kthread_prio > 99)
- kthread_prio = 99;
- if (kthread_prio != kthread_prio_in)
- pr_alert("rcu_spawn_gp_kthread(): Limited prio to %d from %d\n",
- kthread_prio, kthread_prio_in);
+ struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
rcu_scheduler_fully_active = 1;
- for_each_rcu_flavor(rsp) {
- t = kthread_create(rcu_gp_kthread, rsp, "%s", rsp->name);
- BUG_ON(IS_ERR(t));
- rnp = rcu_get_root(rsp);
- raw_spin_lock_irqsave(&rnp->lock, flags);
- rsp->gp_kthread = t;
- if (kthread_prio) {
- sp.sched_priority = kthread_prio;
- sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
- }
- wake_up_process(t);
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ t = kthread_create(rcu_gp_kthread, NULL, "%s", rcu_state.name);
+ if (WARN_ONCE(IS_ERR(t), "%s: Could not start grace-period kthread, OOM is now expected behavior\n", __func__))
+ return 0;
+ if (kthread_prio) {
+ sp.sched_priority = kthread_prio;
+ sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
}
- rcu_spawn_nocb_kthreads();
- rcu_spawn_boost_kthreads();
+ rnp = rcu_get_root();
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ WRITE_ONCE(rcu_state.gp_activity, jiffies);
+ WRITE_ONCE(rcu_state.gp_req_activity, jiffies);
+ // Reset .gp_activity and .gp_req_activity before setting .gp_kthread.
+ smp_store_release(&rcu_state.gp_kthread, t); /* ^^^ */
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ wake_up_process(t);
+ /* This is a pre-SMP initcall, we expect a single CPU */
+ WARN_ON(num_online_cpus() > 1);
+ /*
+ * Those kthreads couldn't be created on rcu_init() -> rcutree_prepare_cpu()
+ * due to rcu_scheduler_fully_active.
+ */
+ rcu_spawn_cpu_nocb_kthread(smp_processor_id());
+ rcu_spawn_rnp_kthreads(rdp->mynode);
+ rcu_spawn_core_kthreads();
+ /* Create kthread worker for expedited GPs */
+ rcu_start_exp_gp_kworker();
return 0;
}
early_initcall(rcu_spawn_gp_kthread);
/*
- * This function is invoked towards the end of the scheduler's initialization
- * process. Before this is called, the idle task might contain
- * RCU read-side critical sections (during which time, this idle
- * task is booting the system). After this function is called, the
- * idle tasks are prohibited from containing RCU read-side critical
- * sections. This function also enables RCU lockdep checking.
+ * This function is invoked towards the end of the scheduler's
+ * initialization process. Before this is called, the idle task might
+ * contain synchronous grace-period primitives (during which time, this idle
+ * task is booting the system, and such primitives are no-ops). After this
+ * function is called, any synchronous grace-period primitives are run as
+ * expedited, with the requesting task driving the grace period forward.
+ * A later core_initcall() rcu_set_runtime_mode() will switch to full
+ * runtime RCU functionality.
*/
void rcu_scheduler_starting(void)
{
+ unsigned long flags;
+ struct rcu_node *rnp;
+
WARN_ON(num_online_cpus() != 1);
WARN_ON(nr_context_switches() > 0);
- rcu_scheduler_active = 1;
-}
+ rcu_test_sync_prims();
-/*
- * Compute the per-level fanout, either using the exact fanout specified
- * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT.
- */
-static void __init rcu_init_levelspread(struct rcu_state *rsp)
-{
- int i;
+ // Fix up the ->gp_seq counters.
+ local_irq_save(flags);
+ rcu_for_each_node_breadth_first(rnp)
+ rnp->gp_seq_needed = rnp->gp_seq = rcu_state.gp_seq;
+ local_irq_restore(flags);
- if (IS_ENABLED(CONFIG_RCU_FANOUT_EXACT)) {
- rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf;
- for (i = rcu_num_lvls - 2; i >= 0; i--)
- rsp->levelspread[i] = CONFIG_RCU_FANOUT;
- } else {
- int ccur;
- int cprv;
-
- cprv = nr_cpu_ids;
- for (i = rcu_num_lvls - 1; i >= 0; i--) {
- ccur = rsp->levelcnt[i];
- rsp->levelspread[i] = (cprv + ccur - 1) / ccur;
- cprv = ccur;
- }
- }
+ // Switch out of early boot mode.
+ rcu_scheduler_active = RCU_SCHEDULER_INIT;
+ rcu_test_sync_prims();
}
/*
- * Helper function for rcu_init() that initializes one rcu_state structure.
+ * Helper function for rcu_init() that initializes the rcu_state structure.
*/
-static void __init rcu_init_one(struct rcu_state *rsp,
- struct rcu_data __percpu *rda)
-{
- static const char * const buf[] = {
- "rcu_node_0",
- "rcu_node_1",
- "rcu_node_2",
- "rcu_node_3" }; /* Match MAX_RCU_LVLS */
- static const char * const fqs[] = {
- "rcu_node_fqs_0",
- "rcu_node_fqs_1",
- "rcu_node_fqs_2",
- "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */
- static u8 fl_mask = 0x1;
+static void __init rcu_init_one(void)
+{
+ static const char * const buf[] = RCU_NODE_NAME_INIT;
+ static const char * const fqs[] = RCU_FQS_NAME_INIT;
+ static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
+ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
+
+ int levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */
int cpustride = 1;
int i;
int j;
struct rcu_node *rnp;
- BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */
+ BUILD_BUG_ON(RCU_NUM_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */
- /* Silence gcc 4.8 warning about array index out of range. */
- if (rcu_num_lvls > RCU_NUM_LVLS)
- panic("rcu_init_one: rcu_num_lvls overflow");
+ /* Silence gcc 4.8 false positive about array index out of range. */
+ if (rcu_num_lvls <= 0 || rcu_num_lvls > RCU_NUM_LVLS)
+ panic("rcu_init_one: rcu_num_lvls out of range");
/* Initialize the level-tracking arrays. */
- for (i = 0; i < rcu_num_lvls; i++)
- rsp->levelcnt[i] = num_rcu_lvl[i];
for (i = 1; i < rcu_num_lvls; i++)
- rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
- rcu_init_levelspread(rsp);
- rsp->flavor_mask = fl_mask;
- fl_mask <<= 1;
+ rcu_state.level[i] =
+ rcu_state.level[i - 1] + num_rcu_lvl[i - 1];
+ rcu_init_levelspread(levelspread, num_rcu_lvl);
/* Initialize the elements themselves, starting from the leaves. */
for (i = rcu_num_lvls - 1; i >= 0; i--) {
- cpustride *= rsp->levelspread[i];
- rnp = rsp->level[i];
- for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
- raw_spin_lock_init(&rnp->lock);
- lockdep_set_class_and_name(&rnp->lock,
+ cpustride *= levelspread[i];
+ rnp = rcu_state.level[i];
+ for (j = 0; j < num_rcu_lvl[i]; j++, rnp++) {
+ raw_spin_lock_init(&ACCESS_PRIVATE(rnp, lock));
+ lockdep_set_class_and_name(&ACCESS_PRIVATE(rnp, lock),
&rcu_node_class[i], buf[i]);
raw_spin_lock_init(&rnp->fqslock);
lockdep_set_class_and_name(&rnp->fqslock,
&rcu_fqs_class[i], fqs[i]);
- rnp->gpnum = rsp->gpnum;
- rnp->completed = rsp->completed;
+ rnp->gp_seq = rcu_state.gp_seq;
+ rnp->gp_seq_needed = rcu_state.gp_seq;
+ rnp->completedqs = rcu_state.gp_seq;
rnp->qsmask = 0;
rnp->qsmaskinit = 0;
rnp->grplo = j * cpustride;
@@ -4010,26 +4687,59 @@ static void __init rcu_init_one(struct rcu_state *rsp,
rnp->grpmask = 0;
rnp->parent = NULL;
} else {
- rnp->grpnum = j % rsp->levelspread[i - 1];
- rnp->grpmask = 1UL << rnp->grpnum;
- rnp->parent = rsp->level[i - 1] +
- j / rsp->levelspread[i - 1];
+ rnp->grpnum = j % levelspread[i - 1];
+ rnp->grpmask = BIT(rnp->grpnum);
+ rnp->parent = rcu_state.level[i - 1] +
+ j / levelspread[i - 1];
}
rnp->level = i;
INIT_LIST_HEAD(&rnp->blkd_tasks);
rcu_init_one_nocb(rnp);
+ init_waitqueue_head(&rnp->exp_wq[0]);
+ init_waitqueue_head(&rnp->exp_wq[1]);
+ init_waitqueue_head(&rnp->exp_wq[2]);
+ init_waitqueue_head(&rnp->exp_wq[3]);
+ spin_lock_init(&rnp->exp_lock);
+ mutex_init(&rnp->kthread_mutex);
+ raw_spin_lock_init(&rnp->exp_poll_lock);
+ rnp->exp_seq_poll_rq = RCU_GET_STATE_COMPLETED;
+ INIT_WORK(&rnp->exp_poll_wq, sync_rcu_do_polled_gp);
}
}
- init_waitqueue_head(&rsp->gp_wq);
- rnp = rsp->level[rcu_num_lvls - 1];
+ init_swait_queue_head(&rcu_state.gp_wq);
+ init_swait_queue_head(&rcu_state.expedited_wq);
+ rnp = rcu_first_leaf_node();
for_each_possible_cpu(i) {
while (i > rnp->grphi)
rnp++;
- per_cpu_ptr(rsp->rda, i)->mynode = rnp;
- rcu_boot_init_percpu_data(i, rsp);
+ per_cpu_ptr(&rcu_data, i)->mynode = rnp;
+ per_cpu_ptr(&rcu_data, i)->barrier_head.next =
+ &per_cpu_ptr(&rcu_data, i)->barrier_head;
+ rcu_boot_init_percpu_data(i);
}
- list_add(&rsp->flavors, &rcu_struct_flavors);
+}
+
+/*
+ * Force priority from the kernel command-line into range.
+ */
+static void __init sanitize_kthread_prio(void)
+{
+ int kthread_prio_in = kthread_prio;
+
+ if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 2
+ && IS_BUILTIN(CONFIG_RCU_TORTURE_TEST))
+ kthread_prio = 2;
+ else if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1)
+ kthread_prio = 1;
+ else if (kthread_prio < 0)
+ kthread_prio = 0;
+ else if (kthread_prio > 99)
+ kthread_prio = 99;
+
+ if (kthread_prio != kthread_prio_in)
+ pr_alert("%s: Limited prio to %d from %d\n",
+ __func__, kthread_prio, kthread_prio_in);
}
/*
@@ -4037,13 +4747,25 @@ static void __init rcu_init_one(struct rcu_state *rsp,
* replace the definitions in tree.h because those are needed to size
* the ->node array in the rcu_state structure.
*/
-static void __init rcu_init_geometry(void)
+void rcu_init_geometry(void)
{
ulong d;
int i;
- int j;
- int n = nr_cpu_ids;
- int rcu_capacity[MAX_RCU_LVLS + 1];
+ static unsigned long old_nr_cpu_ids;
+ int rcu_capacity[RCU_NUM_LVLS];
+ static bool initialized;
+
+ if (initialized) {
+ /*
+ * Warn if setup_nr_cpu_ids() had not yet been invoked,
+ * unless nr_cpus_ids == NR_CPUS, in which case who cares?
+ */
+ WARN_ON_ONCE(old_nr_cpu_ids != nr_cpu_ids);
+ return;
+ }
+
+ old_nr_cpu_ids = nr_cpu_ids;
+ initialized = true;
/*
* Initialize any unspecified boot parameters.
@@ -4057,80 +4779,141 @@ static void __init rcu_init_geometry(void)
jiffies_till_first_fqs = d;
if (jiffies_till_next_fqs == ULONG_MAX)
jiffies_till_next_fqs = d;
+ adjust_jiffies_till_sched_qs();
/* If the compile-time values are accurate, just leave. */
- if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF &&
+ if (rcu_fanout_leaf == RCU_FANOUT_LEAF &&
nr_cpu_ids == NR_CPUS)
return;
- pr_info("RCU: Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%d\n",
+ pr_info("Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%u\n",
rcu_fanout_leaf, nr_cpu_ids);
/*
+ * The boot-time rcu_fanout_leaf parameter must be at least two
+ * and cannot exceed the number of bits in the rcu_node masks.
+ * Complain and fall back to the compile-time values if this
+ * limit is exceeded.
+ */
+ if (rcu_fanout_leaf < 2 || rcu_fanout_leaf > BITS_PER_LONG) {
+ rcu_fanout_leaf = RCU_FANOUT_LEAF;
+ WARN_ON(1);
+ return;
+ }
+
+ /*
* Compute number of nodes that can be handled an rcu_node tree
- * with the given number of levels. Setting rcu_capacity[0] makes
- * some of the arithmetic easier.
+ * with the given number of levels.
*/
- rcu_capacity[0] = 1;
- rcu_capacity[1] = rcu_fanout_leaf;
- for (i = 2; i <= MAX_RCU_LVLS; i++)
- rcu_capacity[i] = rcu_capacity[i - 1] * CONFIG_RCU_FANOUT;
+ rcu_capacity[0] = rcu_fanout_leaf;
+ for (i = 1; i < RCU_NUM_LVLS; i++)
+ rcu_capacity[i] = rcu_capacity[i - 1] * RCU_FANOUT;
/*
- * The boot-time rcu_fanout_leaf parameter is only permitted
- * to increase the leaf-level fanout, not decrease it. Of course,
- * the leaf-level fanout cannot exceed the number of bits in
- * the rcu_node masks. Finally, the tree must be able to accommodate
- * the configured number of CPUs. Complain and fall back to the
- * compile-time values if these limits are exceeded.
+ * The tree must be able to accommodate the configured number of CPUs.
+ * If this limit is exceeded, fall back to the compile-time values.
*/
- if (rcu_fanout_leaf < CONFIG_RCU_FANOUT_LEAF ||
- rcu_fanout_leaf > sizeof(unsigned long) * 8 ||
- n > rcu_capacity[MAX_RCU_LVLS]) {
+ if (nr_cpu_ids > rcu_capacity[RCU_NUM_LVLS - 1]) {
+ rcu_fanout_leaf = RCU_FANOUT_LEAF;
WARN_ON(1);
return;
}
+ /* Calculate the number of levels in the tree. */
+ for (i = 0; nr_cpu_ids > rcu_capacity[i]; i++) {
+ }
+ rcu_num_lvls = i + 1;
+
/* Calculate the number of rcu_nodes at each level of the tree. */
- for (i = 1; i <= MAX_RCU_LVLS; i++)
- if (n <= rcu_capacity[i]) {
- for (j = 0; j <= i; j++)
- num_rcu_lvl[j] =
- DIV_ROUND_UP(n, rcu_capacity[i - j]);
- rcu_num_lvls = i;
- for (j = i + 1; j <= MAX_RCU_LVLS; j++)
- num_rcu_lvl[j] = 0;
- break;
- }
+ for (i = 0; i < rcu_num_lvls; i++) {
+ int cap = rcu_capacity[(rcu_num_lvls - 1) - i];
+ num_rcu_lvl[i] = DIV_ROUND_UP(nr_cpu_ids, cap);
+ }
/* Calculate the total number of rcu_node structures. */
rcu_num_nodes = 0;
- for (i = 0; i <= MAX_RCU_LVLS; i++)
+ for (i = 0; i < rcu_num_lvls; i++)
rcu_num_nodes += num_rcu_lvl[i];
- rcu_num_nodes -= n;
}
+/*
+ * Dump out the structure of the rcu_node combining tree associated
+ * with the rcu_state structure.
+ */
+static void __init rcu_dump_rcu_node_tree(void)
+{
+ int level = 0;
+ struct rcu_node *rnp;
+
+ pr_info("rcu_node tree layout dump\n");
+ pr_info(" ");
+ rcu_for_each_node_breadth_first(rnp) {
+ if (rnp->level != level) {
+ pr_cont("\n");
+ pr_info(" ");
+ level = rnp->level;
+ }
+ pr_cont("%d:%d ^%d ", rnp->grplo, rnp->grphi, rnp->grpnum);
+ }
+ pr_cont("\n");
+}
+
+struct workqueue_struct *rcu_gp_wq;
+
void __init rcu_init(void)
{
- int cpu;
+ int cpu = smp_processor_id();
rcu_early_boot_tests();
rcu_bootup_announce();
+ sanitize_kthread_prio();
rcu_init_geometry();
- rcu_init_one(&rcu_bh_state, &rcu_bh_data);
- rcu_init_one(&rcu_sched_state, &rcu_sched_data);
- __rcu_init_preempt();
- open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
+ rcu_init_one();
+ if (dump_tree)
+ rcu_dump_rcu_node_tree();
+ if (use_softirq)
+ open_softirq(RCU_SOFTIRQ, rcu_core_si);
/*
* We don't need protection against CPU-hotplug here because
* this is called early in boot, before either interrupts
* or the scheduler are operational.
*/
- cpu_notifier(rcu_cpu_notify, 0);
pm_notifier(rcu_pm_notify, 0);
- for_each_online_cpu(cpu)
- rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
+ WARN_ON(num_online_cpus() > 1); // Only one CPU this early in boot.
+ rcutree_prepare_cpu(cpu);
+ rcutree_report_cpu_starting(cpu);
+ rcutree_online_cpu(cpu);
+
+ /* Create workqueue for Tree SRCU and for expedited GPs. */
+ rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM | WQ_PERCPU, 0);
+ WARN_ON(!rcu_gp_wq);
+
+ sync_wq = alloc_workqueue("sync_wq", WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
+ WARN_ON(!sync_wq);
+
+ /* Respect if explicitly disabled via a boot parameter. */
+ if (rcu_normal_wake_from_gp < 0) {
+ if (num_possible_cpus() <= WAKE_FROM_GP_CPU_THRESHOLD)
+ rcu_normal_wake_from_gp = 1;
+ }
+
+ /* Fill in default value for rcutree.qovld boot parameter. */
+ /* -After- the rcu_node ->lock fields are initialized! */
+ if (qovld < 0)
+ qovld_calc = DEFAULT_RCU_QOVLD_MULT * qhimark;
+ else
+ qovld_calc = qovld;
+
+ // Kick-start in case any polled grace periods started early.
+ (void)start_poll_synchronize_rcu_expedited();
+
+ rcu_test_sync_prims();
+
+ tasks_cblist_init_generic();
}
+#include "tree_stall.h"
+#include "tree_exp.h"
+#include "tree_nocb.h"
#include "tree_plugin.h"
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index a69d3dab2ec4..b8bbe7960cda 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -1,112 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
/*
* Read-Copy Update mechanism for mutual exclusion (tree-based version)
* Internal non-public definitions.
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
* Copyright IBM Corporation, 2008
*
* Author: Ingo Molnar <mingo@elte.hu>
- * Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+ * Paul E. McKenney <paulmck@linux.ibm.com>
*/
#include <linux/cache.h>
+#include <linux/kthread.h>
#include <linux/spinlock.h>
+#include <linux/rtmutex.h>
#include <linux/threads.h>
#include <linux/cpumask.h>
#include <linux/seqlock.h>
+#include <linux/swait.h>
+#include <linux/rcu_node_tree.h>
-/*
- * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and
- * CONFIG_RCU_FANOUT_LEAF.
- * In theory, it should be possible to add more levels straightforwardly.
- * In practice, this did work well going from three levels to four.
- * Of course, your mileage may vary.
- */
-#define MAX_RCU_LVLS 4
-#define RCU_FANOUT_1 (CONFIG_RCU_FANOUT_LEAF)
-#define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT)
-#define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT)
-#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT)
-
-#if NR_CPUS <= RCU_FANOUT_1
-# define RCU_NUM_LVLS 1
-# define NUM_RCU_LVL_0 1
-# define NUM_RCU_LVL_1 (NR_CPUS)
-# define NUM_RCU_LVL_2 0
-# define NUM_RCU_LVL_3 0
-# define NUM_RCU_LVL_4 0
-#elif NR_CPUS <= RCU_FANOUT_2
-# define RCU_NUM_LVLS 2
-# define NUM_RCU_LVL_0 1
-# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
-# define NUM_RCU_LVL_2 (NR_CPUS)
-# define NUM_RCU_LVL_3 0
-# define NUM_RCU_LVL_4 0
-#elif NR_CPUS <= RCU_FANOUT_3
-# define RCU_NUM_LVLS 3
-# define NUM_RCU_LVL_0 1
-# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
-# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
-# define NUM_RCU_LVL_3 (NR_CPUS)
-# define NUM_RCU_LVL_4 0
-#elif NR_CPUS <= RCU_FANOUT_4
-# define RCU_NUM_LVLS 4
-# define NUM_RCU_LVL_0 1
-# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3)
-# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
-# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
-# define NUM_RCU_LVL_4 (NR_CPUS)
-#else
-# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
-#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */
-
-#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4)
-#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
-
-extern int rcu_num_lvls;
-extern int rcu_num_nodes;
+#include "rcu_segcblist.h"
-/*
- * Dynticks per-CPU state.
- */
-struct rcu_dynticks {
- long long dynticks_nesting; /* Track irq/process nesting level. */
- /* Process level is worth LLONG_MAX/2. */
- int dynticks_nmi_nesting; /* Track NMI nesting level. */
- atomic_t dynticks; /* Even value for idle, else odd. */
-#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
- long long dynticks_idle_nesting;
- /* irq/process nesting level from idle. */
- atomic_t dynticks_idle; /* Even value for idle, else odd. */
- /* "Idle" excludes userspace execution. */
- unsigned long dynticks_idle_jiffies;
- /* End of last non-NMI non-idle period. */
-#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
-#ifdef CONFIG_RCU_FAST_NO_HZ
- bool all_lazy; /* Are all CPU's CBs lazy? */
- unsigned long nonlazy_posted;
- /* # times non-lazy CBs posted to CPU. */
- unsigned long nonlazy_posted_snap;
- /* idle-period nonlazy_posted snapshot. */
- unsigned long last_accelerate;
- /* Last jiffy CBs were accelerated. */
- unsigned long last_advance_all;
- /* Last jiffy CBs were all advanced. */
- int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */
-#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
+/* Communicate arguments to a kthread worker handler. */
+struct rcu_exp_work {
+ unsigned long rew_s;
+ struct kthread_work rew_work;
};
/* RCU's kthread states for tracing. */
@@ -121,35 +39,46 @@ struct rcu_dynticks {
* Definition for node within the RCU grace-period-detection hierarchy.
*/
struct rcu_node {
- raw_spinlock_t lock; /* Root rcu_node's lock protects some */
- /* rcu_state fields as well as following. */
- unsigned long gpnum; /* Current grace period for this node. */
- /* This will either be equal to or one */
- /* behind the root rcu_node's gpnum. */
- unsigned long completed; /* Last GP completed for this node. */
- /* This will either be equal to or one */
- /* behind the root rcu_node's gpnum. */
+ raw_spinlock_t __private lock; /* Root rcu_node's lock protects */
+ /* some rcu_state fields as well as */
+ /* following. */
+ unsigned long gp_seq; /* Track rsp->gp_seq. */
+ unsigned long gp_seq_needed; /* Track furthest future GP request. */
+ unsigned long completedqs; /* All QSes done for this node. */
unsigned long qsmask; /* CPUs or groups that need to switch in */
/* order for current grace period to proceed.*/
/* In leaf rcu_node, each bit corresponds to */
/* an rcu_data structure, otherwise, each */
/* bit corresponds to a child rcu_node */
/* structure. */
- unsigned long expmask; /* Groups that have ->blkd_tasks */
- /* elements that need to drain to allow the */
- /* current expedited grace period to */
- /* complete (only for PREEMPT_RCU). */
+ unsigned long rcu_gp_init_mask; /* Mask of offline CPUs at GP init. */
unsigned long qsmaskinit;
- /* Per-GP initial value for qsmask & expmask. */
+ /* Per-GP initial value for qsmask. */
/* Initialized from ->qsmaskinitnext at the */
/* beginning of each grace period. */
unsigned long qsmaskinitnext;
- /* Online CPUs for next grace period. */
+ unsigned long expmask; /* CPUs or groups that need to check in */
+ /* to allow the current expedited GP */
+ /* to complete. */
+ unsigned long expmaskinit;
+ /* Per-GP initial values for expmask. */
+ /* Initialized from ->expmaskinitnext at the */
+ /* beginning of each expedited GP. */
+ unsigned long expmaskinitnext;
+ /* Online CPUs for next expedited GP. */
+ /* Any CPU that has ever been online will */
+ /* have its bit set. */
+ struct kthread_worker *exp_kworker;
+ /* Workers performing per node expedited GP */
+ /* initialization. */
+ unsigned long cbovldmask;
+ /* CPUs experiencing callback overload. */
+ unsigned long ffmask; /* Fully functional CPUs. */
unsigned long grpmask; /* Mask to apply to parent qsmask. */
/* Only one bit will be set in this mask. */
- int grplo; /* lowest-numbered CPU or group here. */
- int grphi; /* highest-numbered CPU or group here. */
- u8 grpnum; /* CPU/group number for next level up. */
+ int grplo; /* lowest-numbered CPU here. */
+ int grphi; /* highest-numbered CPU here. */
+ u8 grpnum; /* group number for next level up. */
u8 level; /* root is at level 0. */
bool wait_blkd_tasks;/* Necessary to wait for blocked tasks to */
/* exit RCU read-side critical sections */
@@ -170,7 +99,6 @@ struct rcu_node {
/* if there is no such task. If there */
/* is no current expedited grace period, */
/* then there can cannot be any such task. */
-#ifdef CONFIG_RCU_BOOST
struct list_head *boost_tasks;
/* Pointer to first task that needs to be */
/* priority boosted, or NULL if no priority */
@@ -184,208 +112,196 @@ struct rcu_node {
/* side effect, not as a lock. */
unsigned long boost_time;
/* When to start boosting (jiffies). */
+ struct mutex kthread_mutex;
+ /* Exclusion for thread spawning and affinity */
+ /* manipulation. */
struct task_struct *boost_kthread_task;
/* kthread that takes care of priority */
/* boosting for this rcu_node structure. */
unsigned int boost_kthread_status;
/* State of boost_kthread_task for tracing. */
- unsigned long n_tasks_boosted;
- /* Total number of tasks boosted. */
- unsigned long n_exp_boosts;
- /* Number of tasks boosted for expedited GP. */
- unsigned long n_normal_boosts;
- /* Number of tasks boosted for normal GP. */
- unsigned long n_balk_blkd_tasks;
- /* Refused to boost: no blocked tasks. */
- unsigned long n_balk_exp_gp_tasks;
- /* Refused to boost: nothing blocking GP. */
- unsigned long n_balk_boost_tasks;
- /* Refused to boost: already boosting. */
- unsigned long n_balk_notblocked;
- /* Refused to boost: RCU RS CS still running. */
- unsigned long n_balk_notyet;
- /* Refused to boost: not yet time. */
- unsigned long n_balk_nos;
- /* Refused to boost: not sure why, though. */
- /* This can happen due to race conditions. */
-#endif /* #ifdef CONFIG_RCU_BOOST */
+ unsigned long n_boosts; /* Number of boosts for this rcu_node structure. */
#ifdef CONFIG_RCU_NOCB_CPU
- wait_queue_head_t nocb_gp_wq[2];
+ struct swait_queue_head nocb_gp_wq[2];
/* Place for rcu_nocb_kthread() to wait GP. */
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
- int need_future_gp[2];
- /* Counts of upcoming no-CB GP requests. */
raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp;
+
+ spinlock_t exp_lock ____cacheline_internodealigned_in_smp;
+ unsigned long exp_seq_rq;
+ wait_queue_head_t exp_wq[4];
+ struct rcu_exp_work rew;
+ bool exp_need_flush; /* Need to flush workitem? */
+ raw_spinlock_t exp_poll_lock;
+ /* Lock and data for polled expedited grace periods. */
+ unsigned long exp_seq_poll_rq;
+ struct work_struct exp_poll_wq;
} ____cacheline_internodealigned_in_smp;
/*
- * Do a full breadth-first scan of the rcu_node structures for the
- * specified rcu_state structure.
+ * Bitmasks in an rcu_node cover the interval [grplo, grphi] of CPU IDs, and
+ * are indexed relative to this interval rather than the global CPU ID space.
+ * This generates the bit for a CPU in node-local masks.
*/
-#define rcu_for_each_node_breadth_first(rsp, rnp) \
- for ((rnp) = &(rsp)->node[0]; \
- (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
+#define leaf_node_cpu_bit(rnp, cpu) (BIT((cpu) - (rnp)->grplo))
/*
- * Do a breadth-first scan of the non-leaf rcu_node structures for the
- * specified rcu_state structure. Note that if there is a singleton
- * rcu_node tree with but one rcu_node structure, this loop is a no-op.
+ * Union to allow "aggregate OR" operation on the need for a quiescent
+ * state by the normal and expedited grace periods.
*/
-#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \
- for ((rnp) = &(rsp)->node[0]; \
- (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++)
+union rcu_noqs {
+ struct {
+ u8 norm;
+ u8 exp;
+ } b; /* Bits. */
+ u16 s; /* Set of bits, aggregate OR here. */
+};
/*
- * Scan the leaves of the rcu_node hierarchy for the specified rcu_state
- * structure. Note that if there is a singleton rcu_node tree with but
- * one rcu_node structure, this loop -will- visit the rcu_node structure.
- * It is still a leaf node, even if it is also the root node.
+ * Record the snapshot of the core stats at half of the first RCU stall timeout.
+ * The member gp_seq is used to ensure that all members are updated only once
+ * during the sampling period. The snapshot is taken only if this gp_seq is not
+ * equal to rdp->gp_seq.
*/
-#define rcu_for_each_leaf_node(rsp, rnp) \
- for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \
- (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
+struct rcu_snap_record {
+ unsigned long gp_seq; /* Track rdp->gp_seq counter */
+ u64 cputime_irq; /* Accumulated cputime of hard irqs */
+ u64 cputime_softirq;/* Accumulated cputime of soft irqs */
+ u64 cputime_system; /* Accumulated cputime of kernel tasks */
+ u64 nr_hardirqs; /* Accumulated number of hard irqs */
+ unsigned int nr_softirqs; /* Accumulated number of soft irqs */
+ unsigned long long nr_csw; /* Accumulated number of task switches */
+ unsigned long jiffies; /* Track jiffies value */
+};
-/* Index values for nxttail array in struct rcu_data. */
-#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */
-#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */
-#define RCU_NEXT_READY_TAIL 2 /* Also RCU_NEXT head. */
-#define RCU_NEXT_TAIL 3
-#define RCU_NEXT_SIZE 4
+/*
+ * An IRQ work (deferred_qs_iw) is used by RCU to get the scheduler's attention.
+ * to report quiescent states at the soonest possible time.
+ * The request can be in one of the following states:
+ * - DEFER_QS_IDLE: An IRQ work is yet to be scheduled.
+ * - DEFER_QS_PENDING: An IRQ work was scheduled but either not yet run, or it
+ * ran and we still haven't reported a quiescent state.
+ */
+#define DEFER_QS_IDLE 0
+#define DEFER_QS_PENDING 1
/* Per-CPU data for read-copy update. */
struct rcu_data {
/* 1) quiescent-state and grace-period handling : */
- unsigned long completed; /* Track rsp->completed gp number */
- /* in order to detect GP end. */
- unsigned long gpnum; /* Highest gp number that this CPU */
- /* is aware of having started. */
- unsigned long rcu_qs_ctr_snap;/* Snapshot of rcu_qs_ctr to check */
- /* for rcu_all_qs() invocations. */
- bool passed_quiesce; /* User-mode/idle loop etc. */
- bool qs_pending; /* Core waits for quiesc state. */
+ unsigned long gp_seq; /* Track rsp->gp_seq counter. */
+ unsigned long gp_seq_needed; /* Track furthest future GP request. */
+ union rcu_noqs cpu_no_qs; /* No QSes yet for this CPU. */
+ bool core_needs_qs; /* Core waits for quiescent state. */
bool beenonline; /* CPU online at least once. */
- bool gpwrap; /* Possible gpnum/completed wrap. */
+ bool gpwrap; /* Possible ->gp_seq wrap. */
+ unsigned int gpwrap_count; /* Count of GP sequence wrap. */
+ bool cpu_started; /* RCU watching this onlining CPU. */
struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
unsigned long grpmask; /* Mask to apply to leaf qsmask. */
-#ifdef CONFIG_RCU_CPU_STALL_INFO
unsigned long ticks_this_gp; /* The number of scheduling-clock */
/* ticks this CPU has handled */
/* during and after the last grace */
/* period it is aware of. */
-#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
+ struct irq_work defer_qs_iw; /* Obtain later scheduler attention. */
+ int defer_qs_iw_pending; /* Scheduler attention pending? */
+ struct work_struct strict_work; /* Schedule readers for strict GPs. */
/* 2) batch handling */
- /*
- * If nxtlist is not NULL, it is partitioned as follows.
- * Any of the partitions might be empty, in which case the
- * pointer to that partition will be equal to the pointer for
- * the following partition. When the list is empty, all of
- * the nxttail elements point to the ->nxtlist pointer itself,
- * which in that case is NULL.
- *
- * [nxtlist, *nxttail[RCU_DONE_TAIL]):
- * Entries that batch # <= ->completed
- * The grace period for these entries has completed, and
- * the other grace-period-completed entries may be moved
- * here temporarily in rcu_process_callbacks().
- * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]):
- * Entries that batch # <= ->completed - 1: waiting for current GP
- * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]):
- * Entries known to have arrived before current GP ended
- * [*nxttail[RCU_NEXT_READY_TAIL], *nxttail[RCU_NEXT_TAIL]):
- * Entries that might have arrived after current GP ended
- * Note that the value of *nxttail[RCU_NEXT_TAIL] will
- * always be NULL, as this is the end of the list.
- */
- struct rcu_head *nxtlist;
- struct rcu_head **nxttail[RCU_NEXT_SIZE];
- unsigned long nxtcompleted[RCU_NEXT_SIZE];
- /* grace periods for sublists. */
- long qlen_lazy; /* # of lazy queued callbacks */
- long qlen; /* # of queued callbacks, incl lazy */
+ struct rcu_segcblist cblist; /* Segmented callback list, with */
+ /* different callbacks waiting for */
+ /* different grace periods. */
long qlen_last_fqs_check;
/* qlen at last check for QS forcing */
- unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
- unsigned long n_nocbs_invoked; /* count of no-CBs RCU cbs invoked. */
- unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */
- unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */
+ unsigned long n_cbs_invoked; /* # callbacks invoked since boot. */
unsigned long n_force_qs_snap;
/* did other CPU force QS recently? */
long blimit; /* Upper limit on a processed batch */
/* 3) dynticks interface. */
- struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */
- int dynticks_snap; /* Per-GP tracking for dynticks. */
-
- /* 4) reasons this CPU needed to be kicked by force_quiescent_state */
- unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */
- unsigned long offline_fqs; /* Kicked due to being offline. */
- unsigned long cond_resched_completed;
- /* Grace period that needs help */
- /* from cond_resched(). */
-
- /* 5) __rcu_pending() statistics. */
- unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */
- unsigned long n_rp_qs_pending;
- unsigned long n_rp_report_qs;
- unsigned long n_rp_cb_ready;
- unsigned long n_rp_cpu_needs_gp;
- unsigned long n_rp_gp_completed;
- unsigned long n_rp_gp_started;
- unsigned long n_rp_nocb_defer_wakeup;
- unsigned long n_rp_need_nothing;
-
- /* 6) _rcu_barrier() and OOM callbacks. */
+ int watching_snap; /* Per-GP tracking for dynticks. */
+ bool rcu_need_heavy_qs; /* GP old, so heavy quiescent state! */
+ bool rcu_urgent_qs; /* GP old need light quiescent state. */
+ bool rcu_forced_tick; /* Forced tick to provide QS. */
+ bool rcu_forced_tick_exp; /* ... provide QS to expedited GP. */
+
+ /* 4) rcu_barrier(), OOM callbacks, and expediting. */
+ unsigned long barrier_seq_snap; /* Snap of rcu_state.barrier_sequence. */
struct rcu_head barrier_head;
-#ifdef CONFIG_RCU_FAST_NO_HZ
- struct rcu_head oom_head;
-#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
+ int exp_watching_snap; /* Double-check need for IPI. */
- /* 7) Callback offloading. */
+ /* 5) Callback offloading. */
#ifdef CONFIG_RCU_NOCB_CPU
- struct rcu_head *nocb_head; /* CBs waiting for kthread. */
- struct rcu_head **nocb_tail;
- atomic_long_t nocb_q_count; /* # CBs waiting for nocb */
- atomic_long_t nocb_q_count_lazy; /* invocation (all stages). */
- struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */
- struct rcu_head **nocb_follower_tail;
- wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */
- struct task_struct *nocb_kthread;
+ struct swait_queue_head nocb_cb_wq; /* For nocb kthreads to sleep on. */
+ struct swait_queue_head nocb_state_wq; /* For offloading state changes */
+ struct task_struct *nocb_gp_kthread;
+ raw_spinlock_t nocb_lock; /* Guard following pair of fields. */
int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */
-
- /* The following fields are used by the leader, hence own cacheline. */
- struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp;
- /* CBs waiting for GP. */
- struct rcu_head **nocb_gp_tail;
- bool nocb_leader_sleep; /* Is the nocb leader thread asleep? */
- struct rcu_data *nocb_next_follower;
- /* Next follower in wakeup chain. */
-
- /* The following fields are used by the follower, hence new cachline. */
- struct rcu_data *nocb_leader ____cacheline_internodealigned_in_smp;
- /* Leader CPU takes GP-end wakeups. */
+ struct timer_list nocb_timer; /* Enforce finite deferral. */
+ unsigned long nocb_gp_adv_time; /* Last call_rcu() CB adv (jiffies). */
+ struct mutex nocb_gp_kthread_mutex; /* Exclusion for nocb gp kthread */
+ /* spawning */
+
+ /* The following fields are used by call_rcu, hence own cacheline. */
+ raw_spinlock_t nocb_bypass_lock ____cacheline_internodealigned_in_smp;
+ struct rcu_cblist nocb_bypass; /* Lock-contention-bypass CB list. */
+ unsigned long nocb_bypass_first; /* Time (jiffies) of first enqueue. */
+ unsigned long nocb_nobypass_last; /* Last ->cblist enqueue (jiffies). */
+ int nocb_nobypass_count; /* # ->cblist enqueues at ^^^ time. */
+
+ /* The following fields are used by GP kthread, hence own cacheline. */
+ raw_spinlock_t nocb_gp_lock ____cacheline_internodealigned_in_smp;
+ u8 nocb_gp_sleep; /* Is the nocb GP thread asleep? */
+ u8 nocb_gp_bypass; /* Found a bypass on last scan? */
+ u8 nocb_gp_gp; /* GP to wait for on last scan? */
+ unsigned long nocb_gp_seq; /* If so, ->gp_seq to wait for. */
+ unsigned long nocb_gp_loops; /* # passes through wait code. */
+ struct swait_queue_head nocb_gp_wq; /* For nocb kthreads to sleep on. */
+ bool nocb_cb_sleep; /* Is the nocb CB thread asleep? */
+ struct task_struct *nocb_cb_kthread;
+ struct list_head nocb_head_rdp; /*
+ * Head of rcu_data list in wakeup chain,
+ * if rdp_gp.
+ */
+ struct list_head nocb_entry_rdp; /* rcu_data node in wakeup chain. */
+ struct rcu_data *nocb_toggling_rdp; /* rdp queued for (de-)offloading */
+
+ /* The following fields are used by CB kthread, hence new cacheline. */
+ struct rcu_data *nocb_gp_rdp ____cacheline_internodealigned_in_smp;
+ /* GP rdp takes GP-end wakeups. */
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
- /* 8) RCU CPU stall data. */
-#ifdef CONFIG_RCU_CPU_STALL_INFO
- unsigned int softirq_snap; /* Snapshot of softirq activity. */
-#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
+ /* 6) RCU priority boosting. */
+ struct task_struct *rcu_cpu_kthread_task;
+ /* rcuc per-CPU kthread or NULL. */
+ unsigned int rcu_cpu_kthread_status;
+ char rcu_cpu_has_work;
+ unsigned long rcuc_activity;
+ /* 7) Diagnostic data, including RCU CPU stall warnings. */
+ unsigned int softirq_snap; /* Snapshot of softirq activity. */
+ /* ->rcu_iw* fields protected by leaf rcu_node ->lock. */
+ struct irq_work rcu_iw; /* Check for non-irq activity. */
+ bool rcu_iw_pending; /* Is ->rcu_iw pending? */
+ unsigned long rcu_iw_gp_seq; /* ->gp_seq associated with ->rcu_iw. */
+ unsigned long rcu_ofl_gp_seq; /* ->gp_seq at last offline. */
+ short rcu_ofl_gp_state; /* ->gp_state at last offline. */
+ unsigned long rcu_onl_gp_seq; /* ->gp_seq at last online. */
+ short rcu_onl_gp_state; /* ->gp_state at last online. */
+ unsigned long last_fqs_resched; /* Time of last rcu_resched(). */
+ unsigned long last_sched_clock; /* Jiffies of last rcu_sched_clock_irq(). */
+ struct rcu_snap_record snap_record; /* Snapshot of core stats at half of */
+ /* the first RCU stall timeout */
+
+ long lazy_len; /* Length of buffered lazy callbacks. */
int cpu;
- struct rcu_state *rsp;
};
-/* Values for fqs_state field in struct rcu_state. */
-#define RCU_GP_IDLE 0 /* No grace period in progress. */
-#define RCU_GP_INIT 1 /* Grace period being initialized. */
-#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */
-#define RCU_FORCE_QS 3 /* Need to force quiescent state. */
-#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
-
/* Values for nocb_defer_wakeup field in struct rcu_data. */
-#define RCU_NOGP_WAKE_NOT 0
-#define RCU_NOGP_WAKE 1
-#define RCU_NOGP_WAKE_FORCE 2
+#define RCU_NOCB_WAKE_NOT 0
+#define RCU_NOCB_WAKE_BYPASS 1
+#define RCU_NOCB_WAKE_LAZY 2
+#define RCU_NOCB_WAKE 3
+#define RCU_NOCB_WAKE_FORCE 4
#define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500))
/* For jiffies_till_first_fqs and */
@@ -411,6 +327,19 @@ do { \
} while (0)
/*
+ * A max threshold for synchronize_rcu() users which are
+ * awaken directly by the rcu_gp_kthread(). Left part is
+ * deferred to the main worker.
+ */
+#define SR_MAX_USERS_WAKE_FROM_GP 5
+#define SR_NORMAL_GP_WAIT_HEAD_MAX 5
+
+struct sr_wait_node {
+ atomic_t inuse;
+ struct llist_node node;
+};
+
+/*
* RCU global state, including node hierarchy. This hierarchy is
* represented in "heap" form in a dense array. The root (first level)
* of the hierarchy is in ->node[0] (referenced by ->level[0]), the second
@@ -422,199 +351,198 @@ do { \
*/
struct rcu_state {
struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */
- struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */
- u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */
- u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */
- u8 flavor_mask; /* bit in flavor mask. */
- struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */
- void (*call)(struct rcu_head *head, /* call_rcu() flavor. */
- void (*func)(struct rcu_head *head));
+ struct rcu_node *level[RCU_NUM_LVLS + 1];
+ /* Hierarchy levels (+1 to */
+ /* shut bogus gcc warning) */
+ int ncpus; /* # CPUs seen so far. */
+ int n_online_cpus; /* # CPUs online for RCU. */
/* The following fields are guarded by the root rcu_node's lock. */
- u8 fqs_state ____cacheline_internodealigned_in_smp;
- /* Force QS state. */
- u8 boost; /* Subject to priority boost. */
- unsigned long gpnum; /* Current gp number. */
- unsigned long completed; /* # of last completed gp. */
+ unsigned long gp_seq ____cacheline_internodealigned_in_smp;
+ /* Grace-period sequence #. */
+ unsigned long gp_max; /* Maximum GP duration in */
+ /* jiffies. */
struct task_struct *gp_kthread; /* Task for grace periods. */
- wait_queue_head_t gp_wq; /* Where GP task waits. */
+ struct swait_queue_head gp_wq; /* Where GP task waits. */
short gp_flags; /* Commands for GP task. */
short gp_state; /* GP kthread sleep state. */
+ unsigned long gp_wake_time; /* Last GP kthread wake. */
+ unsigned long gp_wake_seq; /* ->gp_seq at ^^^. */
+ unsigned long gp_seq_polled; /* GP seq for polled API. */
+ unsigned long gp_seq_polled_snap; /* ->gp_seq_polled at normal GP start. */
+ unsigned long gp_seq_polled_exp_snap; /* ->gp_seq_polled at expedited GP start. */
/* End of fields guarded by root rcu_node's lock. */
- raw_spinlock_t orphan_lock ____cacheline_internodealigned_in_smp;
- /* Protect following fields. */
- struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */
- /* need a grace period. */
- struct rcu_head **orphan_nxttail; /* Tail of above. */
- struct rcu_head *orphan_donelist; /* Orphaned callbacks that */
- /* are ready to invoke. */
- struct rcu_head **orphan_donetail; /* Tail of above. */
- long qlen_lazy; /* Number of lazy callbacks. */
- long qlen; /* Total number of callbacks. */
- /* End of fields guarded by orphan_lock. */
-
struct mutex barrier_mutex; /* Guards barrier fields. */
atomic_t barrier_cpu_count; /* # CPUs waiting on. */
struct completion barrier_completion; /* Wake at barrier end. */
- unsigned long n_barrier_done; /* ++ at start and end of */
- /* _rcu_barrier(). */
+ unsigned long barrier_sequence; /* ++ at start and end of */
+ /* rcu_barrier(). */
/* End of fields guarded by barrier_mutex. */
- atomic_long_t expedited_start; /* Starting ticket. */
- atomic_long_t expedited_done; /* Done ticket. */
- atomic_long_t expedited_wrap; /* # near-wrap incidents. */
- atomic_long_t expedited_tryfail; /* # acquisition failures. */
- atomic_long_t expedited_workdone1; /* # done by others #1. */
- atomic_long_t expedited_workdone2; /* # done by others #2. */
- atomic_long_t expedited_normal; /* # fallbacks to normal. */
- atomic_long_t expedited_stoppedcpus; /* # successful stop_cpus. */
- atomic_long_t expedited_done_tries; /* # tries to update _done. */
- atomic_long_t expedited_done_lost; /* # times beaten to _done. */
- atomic_long_t expedited_done_exit; /* # times exited _done loop. */
+ raw_spinlock_t barrier_lock; /* Protects ->barrier_seq_snap. */
+
+ struct mutex exp_mutex; /* Serialize expedited GP. */
+ struct mutex exp_wake_mutex; /* Serialize wakeup. */
+ unsigned long expedited_sequence; /* Take a ticket. */
+ atomic_t expedited_need_qs; /* # CPUs left to check in. */
+ struct swait_queue_head expedited_wq; /* Wait for check-ins. */
+ int ncpus_snap; /* # CPUs seen last time. */
+ u8 cbovld; /* Callback overload now? */
+ u8 cbovldnext; /* ^ ^ next time? */
unsigned long jiffies_force_qs; /* Time at which to invoke */
/* force_quiescent_state(). */
+ unsigned long jiffies_kick_kthreads; /* Time at which to kick */
+ /* kthreads, if configured. */
unsigned long n_force_qs; /* Number of calls to */
/* force_quiescent_state(). */
- unsigned long n_force_qs_lh; /* ~Number of calls leaving */
- /* due to lock unavailable. */
- unsigned long n_force_qs_ngp; /* Number of calls leaving */
- /* due to no GP active. */
unsigned long gp_start; /* Time at which GP started, */
/* but in jiffies. */
+ unsigned long gp_end; /* Time last GP ended, again */
+ /* in jiffies. */
unsigned long gp_activity; /* Time of last GP kthread */
/* activity in jiffies. */
+ unsigned long gp_req_activity; /* Time of last GP request */
+ /* in jiffies. */
unsigned long jiffies_stall; /* Time at which to check */
/* for CPU stalls. */
+ int nr_fqs_jiffies_stall; /* Number of fqs loops after
+ * which read jiffies and set
+ * jiffies_stall. Stall
+ * warnings disabled if !0. */
unsigned long jiffies_resched; /* Time at which to resched */
/* a reluctant CPU. */
unsigned long n_force_qs_gpstart; /* Snapshot of n_force_qs at */
/* GP start. */
- unsigned long gp_max; /* Maximum GP duration in */
- /* jiffies. */
const char *name; /* Name of structure. */
char abbr; /* Abbreviated name. */
- struct list_head flavors; /* List of RCU flavors. */
+
+ arch_spinlock_t ofl_lock ____cacheline_internodealigned_in_smp;
+ /* Synchronize offline with */
+ /* GP pre-initialization. */
+
+ /* synchronize_rcu() part. */
+ struct llist_head srs_next; /* request a GP users. */
+ struct llist_node *srs_wait_tail; /* wait for GP users. */
+ struct llist_node *srs_done_tail; /* ready for GP users. */
+ struct sr_wait_node srs_wait_nodes[SR_NORMAL_GP_WAIT_HEAD_MAX];
+ struct work_struct srs_cleanup_work;
+ atomic_t srs_cleanups_pending; /* srs inflight worker cleanups. */
+
+#ifdef CONFIG_RCU_NOCB_CPU
+ struct mutex nocb_mutex; /* Guards (de-)offloading */
+ int nocb_is_setup; /* nocb is setup from boot */
+#endif
};
/* Values for rcu_state structure's gp_flags field. */
#define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */
#define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */
+#define RCU_GP_FLAG_OVLD 0x4 /* Experiencing callback overload. */
-/* Values for rcu_state structure's gp_flags field. */
-#define RCU_GP_WAIT_INIT 0 /* Initial state. */
+/* Values for rcu_state structure's gp_state field. */
+#define RCU_GP_IDLE 0 /* Initial state and no GP in progress. */
#define RCU_GP_WAIT_GPS 1 /* Wait for grace-period start. */
-#define RCU_GP_WAIT_FQS 2 /* Wait for force-quiescent-state time. */
-
-extern struct list_head rcu_struct_flavors;
-
-/* Sequence through rcu_state structures for each RCU flavor. */
-#define for_each_rcu_flavor(rsp) \
- list_for_each_entry((rsp), &rcu_struct_flavors, flavors)
+#define RCU_GP_DONE_GPS 2 /* Wait done for grace-period start. */
+#define RCU_GP_ONOFF 3 /* Grace-period initialization hotplug. */
+#define RCU_GP_INIT 4 /* Grace-period initialization. */
+#define RCU_GP_WAIT_FQS 5 /* Wait for force-quiescent-state time. */
+#define RCU_GP_DOING_FQS 6 /* Wait done for force-quiescent-state time. */
+#define RCU_GP_CLEANUP 7 /* Grace-period cleanup started. */
+#define RCU_GP_CLEANED 8 /* Grace-period cleanup complete. */
/*
- * RCU implementation internal declarations:
+ * In order to export the rcu_state name to the tracing tools, it
+ * needs to be added in the __tracepoint_string section.
+ * This requires defining a separate variable tp_<sname>_varname
+ * that points to the string being used, and this will allow
+ * the tracing userspace tools to be able to decipher the string
+ * address to the matching string.
*/
-extern struct rcu_state rcu_sched_state;
-DECLARE_PER_CPU(struct rcu_data, rcu_sched_data);
-
-extern struct rcu_state rcu_bh_state;
-DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
-
#ifdef CONFIG_PREEMPT_RCU
-extern struct rcu_state rcu_preempt_state;
-DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
-#endif /* #ifdef CONFIG_PREEMPT_RCU */
-
-#ifdef CONFIG_RCU_BOOST
-DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
-DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu);
-DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
-DECLARE_PER_CPU(char, rcu_cpu_has_work);
-#endif /* #ifdef CONFIG_RCU_BOOST */
-
-#ifndef RCU_TREE_NONCORE
-
-/* Forward declarations for rcutree_plugin.h */
+#define RCU_ABBR 'p'
+#define RCU_NAME_RAW "rcu_preempt"
+#else /* #ifdef CONFIG_PREEMPT_RCU */
+#define RCU_ABBR 's'
+#define RCU_NAME_RAW "rcu_sched"
+#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
+#ifndef CONFIG_TRACING
+#define RCU_NAME RCU_NAME_RAW
+#else /* #ifdef CONFIG_TRACING */
+static char rcu_name[] = RCU_NAME_RAW;
+static const char *tp_rcu_varname __used __tracepoint_string = rcu_name;
+#define RCU_NAME rcu_name
+#endif /* #else #ifdef CONFIG_TRACING */
+
+/* Forward declarations for tree_plugin.h */
static void rcu_bootup_announce(void);
-static void rcu_preempt_note_context_switch(void);
+static void rcu_qs(void);
static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
#ifdef CONFIG_HOTPLUG_CPU
static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-static void rcu_print_detail_task_stall(struct rcu_state *rsp);
-static int rcu_print_task_stall(struct rcu_node *rnp);
+static int rcu_print_task_exp_stall(struct rcu_node *rnp);
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
-static void rcu_preempt_check_callbacks(void);
-void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
-static void __init __rcu_init_preempt(void);
+static void rcu_flavor_sched_clock_irq(int user);
+static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck);
+static void rcu_preempt_deferred_qs_init(struct rcu_data *rdp);
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
-static void invoke_rcu_callbacks_kthread(void);
-static bool rcu_is_callbacks_kthread(void);
-#ifdef CONFIG_RCU_BOOST
-static void rcu_preempt_do_callbacks(void);
-static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
- struct rcu_node *rnp);
-#endif /* #ifdef CONFIG_RCU_BOOST */
-static void __init rcu_spawn_boost_kthreads(void);
-static void rcu_prepare_kthreads(int cpu);
-static void rcu_cleanup_after_idle(void);
-static void rcu_prepare_for_idle(void);
-static void rcu_idle_count_callbacks_posted(void);
+static bool rcu_is_callbacks_kthread(struct rcu_data *rdp);
+static void rcu_cpu_kthread_setup(unsigned int cpu);
+static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp);
static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
-static void print_cpu_stall_info_begin(void);
-static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);
-static void print_cpu_stall_info_end(void);
+static bool rcu_preempt_need_deferred_qs(struct task_struct *t);
static void zero_cpu_stall_ticks(struct rcu_data *rdp);
-static void increment_cpu_stall_ticks(void);
-static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu);
-static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);
-static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp);
+static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp);
+static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
static void rcu_init_one_nocb(struct rcu_node *rnp);
-static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
- bool lazy, unsigned long flags);
-static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
- struct rcu_data *rdp,
- unsigned long flags);
-static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
-static void do_nocb_deferred_wakeup(struct rcu_data *rdp);
+static bool wake_nocb_gp(struct rcu_data *rdp, bool force);
+static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
+ unsigned long j, bool lazy);
+static void call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *head,
+ rcu_callback_t func, unsigned long flags, bool lazy);
+static void __maybe_unused __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
+ unsigned long flags);
+static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level);
+static bool do_nocb_deferred_wakeup(struct rcu_data *rdp);
static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
-static void rcu_spawn_all_nocb_kthreads(int cpu);
-static void __init rcu_spawn_nocb_kthreads(void);
+static void rcu_spawn_cpu_nocb_kthread(int cpu);
+static void show_rcu_nocb_state(struct rcu_data *rdp);
+static void rcu_nocb_lock(struct rcu_data *rdp);
+static void rcu_nocb_unlock(struct rcu_data *rdp);
+static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
+ unsigned long flags);
+static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp);
#ifdef CONFIG_RCU_NOCB_CPU
-static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp);
-#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
-static void __maybe_unused rcu_kick_nohz_cpu(int cpu);
-static bool init_nocb_callback_list(struct rcu_data *rdp);
-static void rcu_sysidle_enter(int irq);
-static void rcu_sysidle_exit(int irq);
-static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
- unsigned long *maxj);
-static bool is_sysidle_rcu_state(struct rcu_state *rsp);
-static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
- unsigned long maxj);
-static void rcu_bind_gp_kthread(void);
-static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp);
-static bool rcu_nohz_full_cpu(struct rcu_state *rsp);
-static void rcu_dynticks_task_enter(void);
-static void rcu_dynticks_task_exit(void);
+static void __init rcu_organize_nocb_kthreads(void);
-#endif /* #ifndef RCU_TREE_NONCORE */
-
-#ifdef CONFIG_RCU_TRACE
-/* Read out queue lengths for tracing. */
-static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
-{
-#ifdef CONFIG_RCU_NOCB_CPU
- *ql = atomic_long_read(&rdp->nocb_q_count);
- *qll = atomic_long_read(&rdp->nocb_q_count_lazy);
+/*
+ * Disable IRQs before checking offloaded state so that local
+ * locking is safe against concurrent de-offloading.
+ */
+#define rcu_nocb_lock_irqsave(rdp, flags) \
+do { \
+ local_irq_save(flags); \
+ if (rcu_segcblist_is_offloaded(&(rdp)->cblist)) \
+ raw_spin_lock(&(rdp)->nocb_lock); \
+} while (0)
#else /* #ifdef CONFIG_RCU_NOCB_CPU */
- *ql = 0;
- *qll = 0;
+#define rcu_nocb_lock_irqsave(rdp, flags) local_irq_save(flags)
#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
-}
-#endif /* #ifdef CONFIG_RCU_TRACE */
+
+static void rcu_bind_gp_kthread(void);
+static bool rcu_nohz_full_cpu(void);
+
+/* Forward declarations for tree_stall.h */
+static void record_gp_stall_check_time(void);
+static void rcu_iw_handler(struct irq_work *iwp);
+static void check_cpu_stall(struct rcu_data *rdp);
+static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp,
+ const unsigned long gpssdelay);
+
+/* Forward declarations for tree_exp.h. */
+static void sync_rcu_do_polled_gp(struct work_struct *wp);
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
new file mode 100644
index 000000000000..96c49c56fc14
--- /dev/null
+++ b/kernel/rcu/tree_exp.h
@@ -0,0 +1,1113 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * RCU expedited grace periods
+ *
+ * Copyright IBM Corporation, 2016
+ *
+ * Authors: Paul E. McKenney <paulmck@linux.ibm.com>
+ */
+
+#include <linux/console.h>
+#include <linux/lockdep.h>
+
+static void rcu_exp_handler(void *unused);
+static int rcu_print_task_exp_stall(struct rcu_node *rnp);
+static void rcu_exp_print_detail_task_stall_rnp(struct rcu_node *rnp);
+
+/*
+ * Record the start of an expedited grace period.
+ */
+static void rcu_exp_gp_seq_start(void)
+{
+ rcu_seq_start(&rcu_state.expedited_sequence);
+ rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_exp_snap);
+}
+
+/*
+ * Return the value that the expedited-grace-period counter will have
+ * at the end of the current grace period.
+ */
+static __maybe_unused unsigned long rcu_exp_gp_seq_endval(void)
+{
+ return rcu_seq_endval(&rcu_state.expedited_sequence);
+}
+
+/*
+ * Record the end of an expedited grace period.
+ */
+static void rcu_exp_gp_seq_end(void)
+{
+ rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_exp_snap);
+ rcu_seq_end(&rcu_state.expedited_sequence);
+ smp_mb(); /* Ensure that consecutive grace periods serialize. */
+}
+
+/*
+ * Take a snapshot of the expedited-grace-period counter, which is the
+ * earliest value that will indicate that a full grace period has
+ * elapsed since the current time.
+ */
+static unsigned long rcu_exp_gp_seq_snap(void)
+{
+ unsigned long s;
+
+ smp_mb(); /* Caller's modifications seen first by other CPUs. */
+ s = rcu_seq_snap(&rcu_state.expedited_sequence);
+ trace_rcu_exp_grace_period(rcu_state.name, s, TPS("snap"));
+ return s;
+}
+
+/*
+ * Given a counter snapshot from rcu_exp_gp_seq_snap(), return true
+ * if a full expedited grace period has elapsed since that snapshot
+ * was taken.
+ */
+static bool rcu_exp_gp_seq_done(unsigned long s)
+{
+ return rcu_seq_done(&rcu_state.expedited_sequence, s);
+}
+
+/*
+ * Reset the ->expmaskinit values in the rcu_node tree to reflect any
+ * recent CPU-online activity. Note that these masks are not cleared
+ * when CPUs go offline, so they reflect the union of all CPUs that have
+ * ever been online. This means that this function normally takes its
+ * no-work-to-do fastpath.
+ */
+static void sync_exp_reset_tree_hotplug(void)
+{
+ bool done;
+ unsigned long flags;
+ unsigned long mask;
+ unsigned long oldmask;
+ int ncpus = smp_load_acquire(&rcu_state.ncpus); /* Order vs. locking. */
+ struct rcu_node *rnp;
+ struct rcu_node *rnp_up;
+
+ /* If no new CPUs onlined since last time, nothing to do. */
+ if (likely(ncpus == rcu_state.ncpus_snap))
+ return;
+ rcu_state.ncpus_snap = ncpus;
+
+ /*
+ * Each pass through the following loop propagates newly onlined
+ * CPUs for the current rcu_node structure up the rcu_node tree.
+ */
+ rcu_for_each_leaf_node(rnp) {
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ if (rnp->expmaskinit == rnp->expmaskinitnext) {
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ continue; /* No new CPUs, nothing to do. */
+ }
+
+ /* Update this node's mask, track old value for propagation. */
+ oldmask = rnp->expmaskinit;
+ rnp->expmaskinit = rnp->expmaskinitnext;
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+
+ /* If was already nonzero, nothing to propagate. */
+ if (oldmask)
+ continue;
+
+ /* Propagate the new CPU up the tree. */
+ mask = rnp->grpmask;
+ rnp_up = rnp->parent;
+ done = false;
+ while (rnp_up) {
+ raw_spin_lock_irqsave_rcu_node(rnp_up, flags);
+ if (rnp_up->expmaskinit)
+ done = true;
+ rnp_up->expmaskinit |= mask;
+ raw_spin_unlock_irqrestore_rcu_node(rnp_up, flags);
+ if (done)
+ break;
+ mask = rnp_up->grpmask;
+ rnp_up = rnp_up->parent;
+ }
+ }
+}
+
+/*
+ * Reset the ->expmask values in the rcu_node tree in preparation for
+ * a new expedited grace period.
+ */
+static void __maybe_unused sync_exp_reset_tree(void)
+{
+ unsigned long flags;
+ struct rcu_node *rnp;
+
+ sync_exp_reset_tree_hotplug();
+ rcu_for_each_node_breadth_first(rnp) {
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ WARN_ON_ONCE(rnp->expmask);
+ WRITE_ONCE(rnp->expmask, rnp->expmaskinit);
+ /*
+ * Need to wait for any blocked tasks as well. Note that
+ * additional blocking tasks will also block the expedited GP
+ * until such time as the ->expmask bits are cleared.
+ */
+ if (rcu_is_leaf_node(rnp) && rcu_preempt_has_tasks(rnp))
+ WRITE_ONCE(rnp->exp_tasks, rnp->blkd_tasks.next);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ }
+}
+
+/*
+ * Return non-zero if there is no RCU expedited grace period in progress
+ * for the specified rcu_node structure, in other words, if all CPUs and
+ * tasks covered by the specified rcu_node structure have done their bit
+ * for the current expedited grace period.
+ */
+static bool sync_rcu_exp_done(struct rcu_node *rnp)
+{
+ raw_lockdep_assert_held_rcu_node(rnp);
+ return READ_ONCE(rnp->exp_tasks) == NULL &&
+ READ_ONCE(rnp->expmask) == 0;
+}
+
+/*
+ * Like sync_rcu_exp_done(), but where the caller does not hold the
+ * rcu_node's ->lock.
+ */
+static bool sync_rcu_exp_done_unlocked(struct rcu_node *rnp)
+{
+ unsigned long flags;
+ bool ret;
+
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ ret = sync_rcu_exp_done(rnp);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+
+ return ret;
+}
+
+/*
+ * Report the exit from RCU read-side critical section for the last task
+ * that queued itself during or before the current expedited preemptible-RCU
+ * grace period. This event is reported either to the rcu_node structure on
+ * which the task was queued or to one of that rcu_node structure's ancestors,
+ * recursively up the tree. (Calm down, calm down, we do the recursion
+ * iteratively!)
+ */
+static void __rcu_report_exp_rnp(struct rcu_node *rnp,
+ bool wake, unsigned long flags)
+ __releases(rnp->lock)
+{
+ unsigned long mask;
+
+ raw_lockdep_assert_held_rcu_node(rnp);
+ for (;;) {
+ if (!sync_rcu_exp_done(rnp)) {
+ if (!rnp->expmask)
+ rcu_initiate_boost(rnp, flags);
+ else
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ break;
+ }
+ if (rnp->parent == NULL) {
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ if (wake)
+ swake_up_one(&rcu_state.expedited_wq);
+
+ break;
+ }
+ mask = rnp->grpmask;
+ raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled */
+ rnp = rnp->parent;
+ raw_spin_lock_rcu_node(rnp); /* irqs already disabled */
+ WARN_ON_ONCE(!(rnp->expmask & mask));
+ WRITE_ONCE(rnp->expmask, rnp->expmask & ~mask);
+ }
+}
+
+/*
+ * Report expedited quiescent state for specified node. This is a
+ * lock-acquisition wrapper function for __rcu_report_exp_rnp().
+ */
+static void __maybe_unused rcu_report_exp_rnp(struct rcu_node *rnp, bool wake)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ __rcu_report_exp_rnp(rnp, wake, flags);
+}
+
+/*
+ * Report expedited quiescent state for multiple CPUs, all covered by the
+ * specified leaf rcu_node structure, which is acquired by the caller.
+ */
+static void rcu_report_exp_cpu_mult(struct rcu_node *rnp, unsigned long flags,
+ unsigned long mask_in, bool wake)
+ __releases(rnp->lock)
+{
+ int cpu;
+ unsigned long mask;
+ struct rcu_data *rdp;
+
+ raw_lockdep_assert_held_rcu_node(rnp);
+ if (!(rnp->expmask & mask_in)) {
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ return;
+ }
+ mask = mask_in & rnp->expmask;
+ WRITE_ONCE(rnp->expmask, rnp->expmask & ~mask);
+ for_each_leaf_node_cpu_mask(rnp, cpu, mask) {
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+ if (!IS_ENABLED(CONFIG_NO_HZ_FULL) || !rdp->rcu_forced_tick_exp)
+ continue;
+ rdp->rcu_forced_tick_exp = false;
+ tick_dep_clear_cpu(cpu, TICK_DEP_BIT_RCU_EXP);
+ }
+ __rcu_report_exp_rnp(rnp, wake, flags); /* Releases rnp->lock. */
+}
+
+/*
+ * Report expedited quiescent state for specified rcu_data (CPU).
+ */
+static void rcu_report_exp_rdp(struct rcu_data *rdp)
+{
+ unsigned long flags;
+ struct rcu_node *rnp = rdp->mynode;
+
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ WRITE_ONCE(rdp->cpu_no_qs.b.exp, false);
+ ASSERT_EXCLUSIVE_WRITER(rdp->cpu_no_qs.b.exp);
+ rcu_report_exp_cpu_mult(rnp, flags, rdp->grpmask, true);
+}
+
+/* Common code for work-done checking. */
+static bool sync_exp_work_done(unsigned long s)
+{
+ if (rcu_exp_gp_seq_done(s)) {
+ trace_rcu_exp_grace_period(rcu_state.name, s, TPS("done"));
+ /*
+ * Order GP completion with preceding accesses. Order also GP
+ * completion with post GP update side accesses. Pairs with
+ * rcu_seq_end().
+ */
+ smp_mb();
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Funnel-lock acquisition for expedited grace periods. Returns true
+ * if some other task completed an expedited grace period that this task
+ * can piggy-back on, and with no mutex held. Otherwise, returns false
+ * with the mutex held, indicating that the caller must actually do the
+ * expedited grace period.
+ */
+static bool exp_funnel_lock(unsigned long s)
+{
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, raw_smp_processor_id());
+ struct rcu_node *rnp = rdp->mynode;
+ struct rcu_node *rnp_root = rcu_get_root();
+
+ /* Low-contention fastpath. */
+ if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s) &&
+ (rnp == rnp_root ||
+ ULONG_CMP_LT(READ_ONCE(rnp_root->exp_seq_rq), s)) &&
+ mutex_trylock(&rcu_state.exp_mutex))
+ goto fastpath;
+
+ /*
+ * Each pass through the following loop works its way up
+ * the rcu_node tree, returning if others have done the work or
+ * otherwise falls through to acquire ->exp_mutex. The mapping
+ * from CPU to rcu_node structure can be inexact, as it is just
+ * promoting locality and is not strictly needed for correctness.
+ */
+ for (; rnp != NULL; rnp = rnp->parent) {
+ if (sync_exp_work_done(s))
+ return true;
+
+ /* Work not done, either wait here or go up. */
+ spin_lock(&rnp->exp_lock);
+ if (ULONG_CMP_GE(rnp->exp_seq_rq, s)) {
+
+ /* Someone else doing GP, so wait for them. */
+ spin_unlock(&rnp->exp_lock);
+ trace_rcu_exp_funnel_lock(rcu_state.name, rnp->level,
+ rnp->grplo, rnp->grphi,
+ TPS("wait"));
+ wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3],
+ sync_exp_work_done(s));
+ return true;
+ }
+ WRITE_ONCE(rnp->exp_seq_rq, s); /* Followers can wait on us. */
+ spin_unlock(&rnp->exp_lock);
+ trace_rcu_exp_funnel_lock(rcu_state.name, rnp->level,
+ rnp->grplo, rnp->grphi, TPS("nxtlvl"));
+ }
+ mutex_lock(&rcu_state.exp_mutex);
+fastpath:
+ if (sync_exp_work_done(s)) {
+ mutex_unlock(&rcu_state.exp_mutex);
+ return true;
+ }
+ rcu_exp_gp_seq_start();
+ trace_rcu_exp_grace_period(rcu_state.name, s, TPS("start"));
+ return false;
+}
+
+/*
+ * Select the CPUs within the specified rcu_node that the upcoming
+ * expedited grace period needs to wait for.
+ */
+static void __sync_rcu_exp_select_node_cpus(struct rcu_exp_work *rewp)
+{
+ int cpu;
+ unsigned long flags;
+ unsigned long mask_ofl_test;
+ unsigned long mask_ofl_ipi;
+ int ret;
+ struct rcu_node *rnp = container_of(rewp, struct rcu_node, rew);
+
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+
+ /* Each pass checks a CPU for identity, offline, and idle. */
+ mask_ofl_test = 0;
+ for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ unsigned long mask = rdp->grpmask;
+ int snap;
+
+ if (raw_smp_processor_id() == cpu ||
+ !(rnp->qsmaskinitnext & mask)) {
+ mask_ofl_test |= mask;
+ } else {
+ /*
+ * Full ordering between remote CPU's post idle accesses
+ * and updater's accesses prior to current GP (and also
+ * the started GP sequence number) is enforced by
+ * rcu_seq_start() implicit barrier, relayed by kworkers
+ * locking and even further by smp_mb__after_unlock_lock()
+ * barriers chained all the way throughout the rnp locking
+ * tree since sync_exp_reset_tree() and up to the current
+ * leaf rnp locking.
+ *
+ * Ordering between remote CPU's pre idle accesses and
+ * post grace period updater's accesses is enforced by the
+ * below acquire semantic.
+ */
+ snap = ct_rcu_watching_cpu_acquire(cpu);
+ if (rcu_watching_snap_in_eqs(snap))
+ mask_ofl_test |= mask;
+ else
+ rdp->exp_watching_snap = snap;
+ }
+ }
+ mask_ofl_ipi = rnp->expmask & ~mask_ofl_test;
+
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+
+ /* IPI the remaining CPUs for expedited quiescent state. */
+ for_each_leaf_node_cpu_mask(rnp, cpu, mask_ofl_ipi) {
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ unsigned long mask = rdp->grpmask;
+
+retry_ipi:
+ if (rcu_watching_snap_stopped_since(rdp, rdp->exp_watching_snap)) {
+ mask_ofl_test |= mask;
+ continue;
+ }
+ if (get_cpu() == cpu) {
+ mask_ofl_test |= mask;
+ put_cpu();
+ continue;
+ }
+ ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0);
+ put_cpu();
+ /* The CPU will report the QS in response to the IPI. */
+ if (!ret)
+ continue;
+
+ /* Failed, raced with CPU hotplug operation. */
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ if ((rnp->qsmaskinitnext & mask) &&
+ (rnp->expmask & mask)) {
+ /* Online, so delay for a bit and try again. */
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("selectofl"));
+ schedule_timeout_idle(1);
+ goto retry_ipi;
+ }
+ /* CPU really is offline, so we must report its QS. */
+ if (rnp->expmask & mask)
+ mask_ofl_test |= mask;
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ }
+ /* Report quiescent states for those that went offline. */
+ if (mask_ofl_test) {
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ rcu_report_exp_cpu_mult(rnp, flags, mask_ofl_test, false);
+ }
+}
+
+static void rcu_exp_sel_wait_wake(unsigned long s);
+
+static void sync_rcu_exp_select_node_cpus(struct kthread_work *wp)
+{
+ struct rcu_exp_work *rewp =
+ container_of(wp, struct rcu_exp_work, rew_work);
+
+ __sync_rcu_exp_select_node_cpus(rewp);
+}
+
+static inline bool rcu_exp_worker_started(void)
+{
+ return !!READ_ONCE(rcu_exp_gp_kworker);
+}
+
+static inline bool rcu_exp_par_worker_started(struct rcu_node *rnp)
+{
+ return !!READ_ONCE(rnp->exp_kworker);
+}
+
+static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp)
+{
+ kthread_init_work(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus);
+ /*
+ * Use rcu_exp_par_gp_kworker, because flushing a work item from
+ * another work item on the same kthread worker can result in
+ * deadlock.
+ */
+ kthread_queue_work(READ_ONCE(rnp->exp_kworker), &rnp->rew.rew_work);
+}
+
+static inline void sync_rcu_exp_select_cpus_flush_work(struct rcu_node *rnp)
+{
+ kthread_flush_work(&rnp->rew.rew_work);
+}
+
+/*
+ * Work-queue handler to drive an expedited grace period forward.
+ */
+static void wait_rcu_exp_gp(struct kthread_work *wp)
+{
+ struct rcu_exp_work *rewp;
+
+ rewp = container_of(wp, struct rcu_exp_work, rew_work);
+ rcu_exp_sel_wait_wake(rewp->rew_s);
+}
+
+static inline void synchronize_rcu_expedited_queue_work(struct rcu_exp_work *rew)
+{
+ kthread_init_work(&rew->rew_work, wait_rcu_exp_gp);
+ kthread_queue_work(rcu_exp_gp_kworker, &rew->rew_work);
+}
+
+/*
+ * Select the nodes that the upcoming expedited grace period needs
+ * to wait for.
+ */
+static void sync_rcu_exp_select_cpus(void)
+{
+ struct rcu_node *rnp;
+
+ trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("reset"));
+ sync_exp_reset_tree();
+ trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("select"));
+
+ /* Schedule work for each leaf rcu_node structure. */
+ rcu_for_each_leaf_node(rnp) {
+ rnp->exp_need_flush = false;
+ if (!READ_ONCE(rnp->expmask))
+ continue; /* Avoid early boot non-existent wq. */
+ if (!rcu_exp_par_worker_started(rnp) ||
+ rcu_scheduler_active != RCU_SCHEDULER_RUNNING ||
+ rcu_is_last_leaf_node(rnp)) {
+ /* No worker started yet or last leaf, do direct call. */
+ sync_rcu_exp_select_node_cpus(&rnp->rew.rew_work);
+ continue;
+ }
+ sync_rcu_exp_select_cpus_queue_work(rnp);
+ rnp->exp_need_flush = true;
+ }
+
+ /* Wait for jobs (if any) to complete. */
+ rcu_for_each_leaf_node(rnp)
+ if (rnp->exp_need_flush)
+ sync_rcu_exp_select_cpus_flush_work(rnp);
+}
+
+/*
+ * Wait for the expedited grace period to elapse, within time limit.
+ * If the time limit is exceeded without the grace period elapsing,
+ * return false, otherwise return true.
+ */
+static bool synchronize_rcu_expedited_wait_once(long tlimit)
+{
+ int t;
+ struct rcu_node *rnp_root = rcu_get_root();
+
+ t = swait_event_timeout_exclusive(rcu_state.expedited_wq,
+ sync_rcu_exp_done_unlocked(rnp_root),
+ tlimit);
+ // Workqueues should not be signaled.
+ if (t > 0 || sync_rcu_exp_done_unlocked(rnp_root))
+ return true;
+ WARN_ON(t < 0); /* workqueues should not be signaled. */
+ return false;
+}
+
+/*
+ * Print out an expedited RCU CPU stall warning message.
+ */
+static void synchronize_rcu_expedited_stall(unsigned long jiffies_start, unsigned long j)
+{
+ int cpu;
+ unsigned long mask;
+ int ndetected;
+ struct rcu_node *rnp;
+ struct rcu_node *rnp_root = rcu_get_root();
+
+ if (READ_ONCE(csd_lock_suppress_rcu_stall) && csd_lock_is_stuck()) {
+ pr_err("INFO: %s detected expedited stalls, but suppressed full report due to a stuck CSD-lock.\n", rcu_state.name);
+ return;
+ }
+ pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", rcu_state.name);
+ ndetected = 0;
+ rcu_for_each_leaf_node(rnp) {
+ ndetected += rcu_print_task_exp_stall(rnp);
+ for_each_leaf_node_possible_cpu(rnp, cpu) {
+ struct rcu_data *rdp;
+
+ mask = leaf_node_cpu_bit(rnp, cpu);
+ if (!(READ_ONCE(rnp->expmask) & mask))
+ continue;
+ ndetected++;
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+ pr_cont(" %d-%c%c%c%c", cpu,
+ "O."[!!cpu_online(cpu)],
+ "o."[!!(rdp->grpmask & rnp->expmaskinit)],
+ "N."[!!(rdp->grpmask & rnp->expmaskinitnext)],
+ "D."[!!data_race(rdp->cpu_no_qs.b.exp)]);
+ }
+ }
+ pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n",
+ j - jiffies_start, rcu_state.expedited_sequence, data_race(rnp_root->expmask),
+ ".T"[!!data_race(rnp_root->exp_tasks)]);
+ if (ndetected) {
+ pr_err("blocking rcu_node structures (internal RCU debug):");
+ rcu_for_each_node_breadth_first(rnp) {
+ if (rnp == rnp_root)
+ continue; /* printed unconditionally */
+ if (sync_rcu_exp_done_unlocked(rnp))
+ continue;
+ pr_cont(" l=%u:%d-%d:%#lx/%c",
+ rnp->level, rnp->grplo, rnp->grphi, data_race(rnp->expmask),
+ ".T"[!!data_race(rnp->exp_tasks)]);
+ }
+ pr_cont("\n");
+ }
+ rcu_for_each_leaf_node(rnp) {
+ for_each_leaf_node_possible_cpu(rnp, cpu) {
+ mask = leaf_node_cpu_bit(rnp, cpu);
+ if (!(READ_ONCE(rnp->expmask) & mask))
+ continue;
+ dump_cpu_task(cpu);
+ }
+ rcu_exp_print_detail_task_stall_rnp(rnp);
+ }
+}
+
+/*
+ * Wait for the expedited grace period to elapse, issuing any needed
+ * RCU CPU stall warnings along the way.
+ */
+static void synchronize_rcu_expedited_wait(void)
+{
+ int cpu;
+ unsigned long j;
+ unsigned long jiffies_stall;
+ unsigned long jiffies_start;
+ unsigned long mask;
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+ unsigned long flags;
+
+ trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("startwait"));
+ jiffies_stall = rcu_exp_jiffies_till_stall_check();
+ jiffies_start = jiffies;
+ if (tick_nohz_full_enabled() && rcu_inkernel_boot_has_ended()) {
+ if (synchronize_rcu_expedited_wait_once(1))
+ return;
+ rcu_for_each_leaf_node(rnp) {
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ mask = READ_ONCE(rnp->expmask);
+ for_each_leaf_node_cpu_mask(rnp, cpu, mask) {
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+ if (rdp->rcu_forced_tick_exp)
+ continue;
+ rdp->rcu_forced_tick_exp = true;
+ if (cpu_online(cpu))
+ tick_dep_set_cpu(cpu, TICK_DEP_BIT_RCU_EXP);
+ }
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ }
+ j = READ_ONCE(jiffies_till_first_fqs);
+ if (synchronize_rcu_expedited_wait_once(j + HZ))
+ return;
+ }
+
+ for (;;) {
+ unsigned long j;
+
+ if (synchronize_rcu_expedited_wait_once(jiffies_stall))
+ return;
+ if (rcu_stall_is_suppressed())
+ continue;
+
+ nbcon_cpu_emergency_enter();
+
+ j = jiffies;
+ rcu_stall_notifier_call_chain(RCU_STALL_NOTIFY_EXP, (void *)(j - jiffies_start));
+ trace_rcu_stall_warning(rcu_state.name, TPS("ExpeditedStall"));
+ synchronize_rcu_expedited_stall(jiffies_start, j);
+ jiffies_stall = 3 * rcu_exp_jiffies_till_stall_check() + 3;
+
+ nbcon_cpu_emergency_exit();
+
+ panic_on_rcu_stall();
+ }
+}
+
+/*
+ * Wait for the current expedited grace period to complete, and then
+ * wake up everyone who piggybacked on the just-completed expedited
+ * grace period. Also update all the ->exp_seq_rq counters as needed
+ * in order to avoid counter-wrap problems.
+ */
+static void rcu_exp_wait_wake(unsigned long s)
+{
+ struct rcu_node *rnp;
+
+ synchronize_rcu_expedited_wait();
+
+ // Switch over to wakeup mode, allowing the next GP to proceed.
+ // End the previous grace period only after acquiring the mutex
+ // to ensure that only one GP runs concurrently with wakeups.
+ mutex_lock(&rcu_state.exp_wake_mutex);
+ rcu_exp_gp_seq_end();
+ trace_rcu_exp_grace_period(rcu_state.name, s, TPS("end"));
+
+ rcu_for_each_node_breadth_first(rnp) {
+ if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) {
+ spin_lock(&rnp->exp_lock);
+ /* Recheck, avoid hang in case someone just arrived. */
+ if (ULONG_CMP_LT(rnp->exp_seq_rq, s))
+ WRITE_ONCE(rnp->exp_seq_rq, s);
+ spin_unlock(&rnp->exp_lock);
+ }
+ smp_mb(); /* All above changes before wakeup. */
+ wake_up_all(&rnp->exp_wq[rcu_seq_ctr(s) & 0x3]);
+ }
+ trace_rcu_exp_grace_period(rcu_state.name, s, TPS("endwake"));
+ mutex_unlock(&rcu_state.exp_wake_mutex);
+}
+
+/*
+ * Common code to drive an expedited grace period forward, used by
+ * workqueues and mid-boot-time tasks.
+ */
+static void rcu_exp_sel_wait_wake(unsigned long s)
+{
+ /* Initialize the rcu_node tree in preparation for the wait. */
+ sync_rcu_exp_select_cpus();
+
+ /* Wait and clean up, including waking everyone. */
+ rcu_exp_wait_wake(s);
+}
+
+/* Request an expedited quiescent state. */
+static void rcu_exp_need_qs(void)
+{
+ lockdep_assert_irqs_disabled();
+ ASSERT_EXCLUSIVE_WRITER_SCOPED(*this_cpu_ptr(&rcu_data.cpu_no_qs.b.exp));
+ __this_cpu_write(rcu_data.cpu_no_qs.b.exp, true);
+ /* Store .exp before .rcu_urgent_qs. */
+ smp_store_release(this_cpu_ptr(&rcu_data.rcu_urgent_qs), true);
+ set_need_resched_current();
+}
+
+#ifdef CONFIG_PREEMPT_RCU
+
+/*
+ * Remote handler for smp_call_function_single(). If there is an
+ * RCU read-side critical section in effect, request that the
+ * next rcu_read_unlock() record the quiescent state up the
+ * ->expmask fields in the rcu_node tree. Otherwise, immediately
+ * report the quiescent state.
+ */
+static void rcu_exp_handler(void *unused)
+{
+ int depth = rcu_preempt_depth();
+ unsigned long flags;
+ struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
+ struct rcu_node *rnp = rdp->mynode;
+ struct task_struct *t = current;
+
+ /*
+ * WARN if the CPU is unexpectedly already looking for a
+ * QS or has already reported one.
+ */
+ ASSERT_EXCLUSIVE_WRITER_SCOPED(rdp->cpu_no_qs.b.exp);
+ if (WARN_ON_ONCE(!(READ_ONCE(rnp->expmask) & rdp->grpmask) ||
+ READ_ONCE(rdp->cpu_no_qs.b.exp)))
+ return;
+
+ /*
+ * Second, the common case of not being in an RCU read-side
+ * critical section. If also enabled or idle, immediately
+ * report the quiescent state, otherwise defer.
+ */
+ if (!depth) {
+ if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) ||
+ rcu_is_cpu_rrupt_from_idle())
+ rcu_report_exp_rdp(rdp);
+ else
+ rcu_exp_need_qs();
+ return;
+ }
+
+ /*
+ * Third, the less-common case of being in an RCU read-side
+ * critical section. In this case we can count on a future
+ * rcu_read_unlock(). However, this rcu_read_unlock() might
+ * execute on some other CPU, but in that case there will be
+ * a future context switch. Either way, if the expedited
+ * grace period is still waiting on this CPU, set ->deferred_qs
+ * so that the eventual quiescent state will be reported.
+ * Note that there is a large group of race conditions that
+ * can have caused this quiescent state to already have been
+ * reported, so we really do need to check ->expmask.
+ */
+ if (depth > 0) {
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ if (rnp->expmask & rdp->grpmask) {
+ WRITE_ONCE(rdp->cpu_no_qs.b.exp, true);
+ t->rcu_read_unlock_special.b.exp_hint = true;
+ }
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ return;
+ }
+
+ // Fourth and finally, negative nesting depth should not happen.
+ WARN_ON_ONCE(1);
+}
+
+/*
+ * Scan the current list of tasks blocked within RCU read-side critical
+ * sections, printing out the tid of each that is blocking the current
+ * expedited grace period.
+ */
+static int rcu_print_task_exp_stall(struct rcu_node *rnp)
+{
+ unsigned long flags;
+ int ndetected = 0;
+ struct task_struct *t;
+
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ if (!rnp->exp_tasks) {
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ return 0;
+ }
+ t = list_entry(rnp->exp_tasks->prev,
+ struct task_struct, rcu_node_entry);
+ list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
+ pr_cont(" P%d", t->pid);
+ ndetected++;
+ }
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ return ndetected;
+}
+
+/*
+ * Scan the current list of tasks blocked within RCU read-side critical
+ * sections, dumping the stack of each that is blocking the current
+ * expedited grace period.
+ */
+static void rcu_exp_print_detail_task_stall_rnp(struct rcu_node *rnp)
+{
+ unsigned long flags;
+ struct task_struct *t;
+
+ if (!rcu_exp_stall_task_details)
+ return;
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ if (!READ_ONCE(rnp->exp_tasks)) {
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ return;
+ }
+ t = list_entry(rnp->exp_tasks->prev,
+ struct task_struct, rcu_node_entry);
+ list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
+ /*
+ * We could be printing a lot while holding a spinlock.
+ * Avoid triggering hard lockup.
+ */
+ touch_nmi_watchdog();
+ sched_show_task(t);
+ }
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+}
+
+#else /* #ifdef CONFIG_PREEMPT_RCU */
+
+/* Invoked on each online non-idle CPU for expedited quiescent state. */
+static void rcu_exp_handler(void *unused)
+{
+ struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
+ struct rcu_node *rnp = rdp->mynode;
+ bool preempt_bh_enabled = !(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK));
+
+ ASSERT_EXCLUSIVE_WRITER_SCOPED(rdp->cpu_no_qs.b.exp);
+ if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) ||
+ __this_cpu_read(rcu_data.cpu_no_qs.b.exp))
+ return;
+ if (rcu_is_cpu_rrupt_from_idle() ||
+ (IS_ENABLED(CONFIG_PREEMPT_COUNT) && preempt_bh_enabled)) {
+ rcu_report_exp_rdp(this_cpu_ptr(&rcu_data));
+ return;
+ }
+ rcu_exp_need_qs();
+}
+
+/*
+ * Because preemptible RCU does not exist, we never have to check for
+ * tasks blocked within RCU read-side critical sections that are
+ * blocking the current expedited grace period.
+ */
+static int rcu_print_task_exp_stall(struct rcu_node *rnp)
+{
+ return 0;
+}
+
+/*
+ * Because preemptible RCU does not exist, we never have to print out
+ * tasks blocked within RCU read-side critical sections that are blocking
+ * the current expedited grace period.
+ */
+static void rcu_exp_print_detail_task_stall_rnp(struct rcu_node *rnp)
+{
+}
+
+#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
+
+/**
+ * synchronize_rcu_expedited - Brute-force RCU grace period
+ *
+ * Wait for an RCU grace period, but expedite it. The basic idea is to
+ * IPI all non-idle non-nohz online CPUs. The IPI handler checks whether
+ * the CPU is in an RCU critical section, and if so, it sets a flag that
+ * causes the outermost rcu_read_unlock() to report the quiescent state
+ * for RCU-preempt or asks the scheduler for help for RCU-sched. On the
+ * other hand, if the CPU is not in an RCU read-side critical section,
+ * the IPI handler reports the quiescent state immediately.
+ *
+ * Although this is a great improvement over previous expedited
+ * implementations, it is still unfriendly to real-time workloads, so is
+ * thus not recommended for any sort of common-case code. In fact, if
+ * you are using synchronize_rcu_expedited() in a loop, please restructure
+ * your code to batch your updates, and then use a single synchronize_rcu()
+ * instead.
+ *
+ * This has the same semantics as (but is more brutal than) synchronize_rcu().
+ */
+void synchronize_rcu_expedited(void)
+{
+ unsigned long flags;
+ struct rcu_exp_work rew;
+ struct rcu_node *rnp;
+ unsigned long s;
+
+ RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
+ lock_is_held(&rcu_lock_map) ||
+ lock_is_held(&rcu_sched_lock_map),
+ "Illegal synchronize_rcu_expedited() in RCU read-side critical section");
+
+ /* Is the state is such that the call is a grace period? */
+ if (rcu_blocking_is_gp()) {
+ // Note well that this code runs with !PREEMPT && !SMP.
+ // In addition, all code that advances grace periods runs
+ // at process level. Therefore, this expedited GP overlaps
+ // with other expedited GPs only by being fully nested within
+ // them, which allows reuse of ->gp_seq_polled_exp_snap.
+ rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_exp_snap);
+ rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_exp_snap);
+
+ local_irq_save(flags);
+ WARN_ON_ONCE(num_online_cpus() > 1);
+ rcu_state.expedited_sequence += (1 << RCU_SEQ_CTR_SHIFT);
+ local_irq_restore(flags);
+ return; // Context allows vacuous grace periods.
+ }
+
+ /* If expedited grace periods are prohibited, fall back to normal. */
+ if (rcu_gp_is_normal()) {
+ synchronize_rcu_normal();
+ return;
+ }
+
+ /* Take a snapshot of the sequence number. */
+ s = rcu_exp_gp_seq_snap();
+ if (exp_funnel_lock(s))
+ return; /* Someone else did our work for us. */
+
+ /* Ensure that load happens before action based on it. */
+ if (unlikely((rcu_scheduler_active == RCU_SCHEDULER_INIT) || !rcu_exp_worker_started())) {
+ /* Direct call during scheduler init and early_initcalls(). */
+ rcu_exp_sel_wait_wake(s);
+ } else {
+ /* Marshall arguments & schedule the expedited grace period. */
+ rew.rew_s = s;
+ synchronize_rcu_expedited_queue_work(&rew);
+ }
+
+ /* Wait for expedited grace period to complete. */
+ rnp = rcu_get_root();
+ wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3],
+ sync_exp_work_done(s));
+
+ /* Let the next expedited grace period start. */
+ mutex_unlock(&rcu_state.exp_mutex);
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
+
+/*
+ * Ensure that start_poll_synchronize_rcu_expedited() has the expedited
+ * RCU grace periods that it needs.
+ */
+static void sync_rcu_do_polled_gp(struct work_struct *wp)
+{
+ unsigned long flags;
+ int i = 0;
+ struct rcu_node *rnp = container_of(wp, struct rcu_node, exp_poll_wq);
+ unsigned long s;
+
+ raw_spin_lock_irqsave(&rnp->exp_poll_lock, flags);
+ s = rnp->exp_seq_poll_rq;
+ rnp->exp_seq_poll_rq = RCU_GET_STATE_COMPLETED;
+ raw_spin_unlock_irqrestore(&rnp->exp_poll_lock, flags);
+ if (s == RCU_GET_STATE_COMPLETED)
+ return;
+ while (!poll_state_synchronize_rcu(s)) {
+ synchronize_rcu_expedited();
+ if (i == 10 || i == 20)
+ pr_info("%s: i = %d s = %lx gp_seq_polled = %lx\n", __func__, i, s, READ_ONCE(rcu_state.gp_seq_polled));
+ i++;
+ }
+ raw_spin_lock_irqsave(&rnp->exp_poll_lock, flags);
+ s = rnp->exp_seq_poll_rq;
+ if (poll_state_synchronize_rcu(s))
+ rnp->exp_seq_poll_rq = RCU_GET_STATE_COMPLETED;
+ raw_spin_unlock_irqrestore(&rnp->exp_poll_lock, flags);
+}
+
+/**
+ * start_poll_synchronize_rcu_expedited - Snapshot current RCU state and start expedited grace period
+ *
+ * Returns a cookie to pass to a call to cond_synchronize_rcu(),
+ * cond_synchronize_rcu_expedited(), or poll_state_synchronize_rcu(),
+ * allowing them to determine whether or not any sort of grace period has
+ * elapsed in the meantime. If the needed expedited grace period is not
+ * already slated to start, initiates that grace period.
+ */
+unsigned long start_poll_synchronize_rcu_expedited(void)
+{
+ unsigned long flags;
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+ unsigned long s;
+
+ s = get_state_synchronize_rcu();
+ rdp = per_cpu_ptr(&rcu_data, raw_smp_processor_id());
+ rnp = rdp->mynode;
+ if (rcu_init_invoked())
+ raw_spin_lock_irqsave(&rnp->exp_poll_lock, flags);
+ if (!poll_state_synchronize_rcu(s)) {
+ if (rcu_init_invoked()) {
+ rnp->exp_seq_poll_rq = s;
+ queue_work(rcu_gp_wq, &rnp->exp_poll_wq);
+ }
+ }
+ if (rcu_init_invoked())
+ raw_spin_unlock_irqrestore(&rnp->exp_poll_lock, flags);
+
+ return s;
+}
+EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_expedited);
+
+/**
+ * start_poll_synchronize_rcu_expedited_full - Take a full snapshot and start expedited grace period
+ * @rgosp: Place to put snapshot of grace-period state
+ *
+ * Places the normal and expedited grace-period states in rgosp. This
+ * state value can be passed to a later call to cond_synchronize_rcu_full()
+ * or poll_state_synchronize_rcu_full() to determine whether or not a
+ * grace period (whether normal or expedited) has elapsed in the meantime.
+ * If the needed expedited grace period is not already slated to start,
+ * initiates that grace period.
+ */
+void start_poll_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp)
+{
+ get_state_synchronize_rcu_full(rgosp);
+ (void)start_poll_synchronize_rcu_expedited();
+}
+EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_expedited_full);
+
+/**
+ * cond_synchronize_rcu_expedited - Conditionally wait for an expedited RCU grace period
+ *
+ * @oldstate: value from get_state_synchronize_rcu(), start_poll_synchronize_rcu(), or start_poll_synchronize_rcu_expedited()
+ *
+ * If any type of full RCU grace period has elapsed since the earlier
+ * call to get_state_synchronize_rcu(), start_poll_synchronize_rcu(),
+ * or start_poll_synchronize_rcu_expedited(), just return. Otherwise,
+ * invoke synchronize_rcu_expedited() to wait for a full grace period.
+ *
+ * Yes, this function does not take counter wrap into account.
+ * But counter wrap is harmless. If the counter wraps, we have waited for
+ * more than 2 billion grace periods (and way more on a 64-bit system!),
+ * so waiting for a couple of additional grace periods should be just fine.
+ *
+ * This function provides the same memory-ordering guarantees that
+ * would be provided by a synchronize_rcu() that was invoked at the call
+ * to the function that provided @oldstate and that returned at the end
+ * of this function.
+ */
+void cond_synchronize_rcu_expedited(unsigned long oldstate)
+{
+ if (!poll_state_synchronize_rcu(oldstate))
+ synchronize_rcu_expedited();
+}
+EXPORT_SYMBOL_GPL(cond_synchronize_rcu_expedited);
+
+/**
+ * cond_synchronize_rcu_expedited_full - Conditionally wait for an expedited RCU grace period
+ * @rgosp: value from get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(), or start_poll_synchronize_rcu_expedited_full()
+ *
+ * If a full RCU grace period has elapsed since the call to
+ * get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(),
+ * or start_poll_synchronize_rcu_expedited_full() from which @rgosp was
+ * obtained, just return. Otherwise, invoke synchronize_rcu_expedited()
+ * to wait for a full grace period.
+ *
+ * Yes, this function does not take counter wrap into account.
+ * But counter wrap is harmless. If the counter wraps, we have waited for
+ * more than 2 billion grace periods (and way more on a 64-bit system!),
+ * so waiting for a couple of additional grace periods should be just fine.
+ *
+ * This function provides the same memory-ordering guarantees that
+ * would be provided by a synchronize_rcu() that was invoked at the call
+ * to the function that provided @rgosp and that returned at the end of
+ * this function.
+ */
+void cond_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp)
+{
+ if (!poll_state_synchronize_rcu_full(rgosp))
+ synchronize_rcu_expedited();
+}
+EXPORT_SYMBOL_GPL(cond_synchronize_rcu_expedited_full);
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
new file mode 100644
index 000000000000..e6cd56603cad
--- /dev/null
+++ b/kernel/rcu/tree_nocb.h
@@ -0,0 +1,1705 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Read-Copy Update mechanism for mutual exclusion (tree-based version)
+ * Internal non-public definitions that provide either classic
+ * or preemptible semantics.
+ *
+ * Copyright Red Hat, 2009
+ * Copyright IBM Corporation, 2009
+ * Copyright SUSE, 2021
+ *
+ * Author: Ingo Molnar <mingo@elte.hu>
+ * Paul E. McKenney <paulmck@linux.ibm.com>
+ * Frederic Weisbecker <frederic@kernel.org>
+ */
+
+#ifdef CONFIG_RCU_NOCB_CPU
+static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
+static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */
+
+static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp)
+{
+ /* Race on early boot between thread creation and assignment */
+ if (!rdp->nocb_cb_kthread || !rdp->nocb_gp_kthread)
+ return true;
+
+ if (current == rdp->nocb_cb_kthread || current == rdp->nocb_gp_kthread)
+ if (in_task())
+ return true;
+ return false;
+}
+
+/*
+ * Offload callback processing from the boot-time-specified set of CPUs
+ * specified by rcu_nocb_mask. For the CPUs in the set, there are kthreads
+ * created that pull the callbacks from the corresponding CPU, wait for
+ * a grace period to elapse, and invoke the callbacks. These kthreads
+ * are organized into GP kthreads, which manage incoming callbacks, wait for
+ * grace periods, and awaken CB kthreads, and the CB kthreads, which only
+ * invoke callbacks. Each GP kthread invokes its own CBs. The no-CBs CPUs
+ * do a wake_up() on their GP kthread when they insert a callback into any
+ * empty list, unless the rcu_nocb_poll boot parameter has been specified,
+ * in which case each kthread actively polls its CPU. (Which isn't so great
+ * for energy efficiency, but which does reduce RCU's overhead on that CPU.)
+ *
+ * This is intended to be used in conjunction with Frederic Weisbecker's
+ * adaptive-idle work, which would seriously reduce OS jitter on CPUs
+ * running CPU-bound user-mode computations.
+ *
+ * Offloading of callbacks can also be used as an energy-efficiency
+ * measure because CPUs with no RCU callbacks queued are more aggressive
+ * about entering dyntick-idle mode.
+ */
+
+
+/*
+ * Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters.
+ * If the list is invalid, a warning is emitted and all CPUs are offloaded.
+ */
+static int __init rcu_nocb_setup(char *str)
+{
+ alloc_bootmem_cpumask_var(&rcu_nocb_mask);
+ if (*str == '=') {
+ if (cpulist_parse(++str, rcu_nocb_mask)) {
+ pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n");
+ cpumask_setall(rcu_nocb_mask);
+ }
+ }
+ rcu_state.nocb_is_setup = true;
+ return 1;
+}
+__setup("rcu_nocbs", rcu_nocb_setup);
+
+static int __init parse_rcu_nocb_poll(char *arg)
+{
+ rcu_nocb_poll = true;
+ return 1;
+}
+__setup("rcu_nocb_poll", parse_rcu_nocb_poll);
+
+/*
+ * Don't bother bypassing ->cblist if the call_rcu() rate is low.
+ * After all, the main point of bypassing is to avoid lock contention
+ * on ->nocb_lock, which only can happen at high call_rcu() rates.
+ */
+static int nocb_nobypass_lim_per_jiffy = 16 * 1000 / HZ;
+module_param(nocb_nobypass_lim_per_jiffy, int, 0);
+
+/*
+ * Acquire the specified rcu_data structure's ->nocb_bypass_lock. If the
+ * lock isn't immediately available, perform minimal sanity check.
+ */
+static void rcu_nocb_bypass_lock(struct rcu_data *rdp)
+ __acquires(&rdp->nocb_bypass_lock)
+{
+ lockdep_assert_irqs_disabled();
+ if (raw_spin_trylock(&rdp->nocb_bypass_lock))
+ return;
+ /*
+ * Contention expected only when local enqueue collide with
+ * remote flush from kthreads.
+ */
+ WARN_ON_ONCE(smp_processor_id() != rdp->cpu);
+ raw_spin_lock(&rdp->nocb_bypass_lock);
+}
+
+/*
+ * Conditionally acquire the specified rcu_data structure's
+ * ->nocb_bypass_lock.
+ */
+static bool rcu_nocb_bypass_trylock(struct rcu_data *rdp)
+{
+ lockdep_assert_irqs_disabled();
+ return raw_spin_trylock(&rdp->nocb_bypass_lock);
+}
+
+/*
+ * Release the specified rcu_data structure's ->nocb_bypass_lock.
+ */
+static void rcu_nocb_bypass_unlock(struct rcu_data *rdp)
+ __releases(&rdp->nocb_bypass_lock)
+{
+ lockdep_assert_irqs_disabled();
+ raw_spin_unlock(&rdp->nocb_bypass_lock);
+}
+
+/*
+ * Acquire the specified rcu_data structure's ->nocb_lock, but only
+ * if it corresponds to a no-CBs CPU.
+ */
+static void rcu_nocb_lock(struct rcu_data *rdp)
+{
+ lockdep_assert_irqs_disabled();
+ if (!rcu_rdp_is_offloaded(rdp))
+ return;
+ raw_spin_lock(&rdp->nocb_lock);
+}
+
+/*
+ * Release the specified rcu_data structure's ->nocb_lock, but only
+ * if it corresponds to a no-CBs CPU.
+ */
+static void rcu_nocb_unlock(struct rcu_data *rdp)
+{
+ if (rcu_rdp_is_offloaded(rdp)) {
+ lockdep_assert_irqs_disabled();
+ raw_spin_unlock(&rdp->nocb_lock);
+ }
+}
+
+/*
+ * Release the specified rcu_data structure's ->nocb_lock and restore
+ * interrupts, but only if it corresponds to a no-CBs CPU.
+ */
+static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
+ unsigned long flags)
+{
+ if (rcu_rdp_is_offloaded(rdp)) {
+ lockdep_assert_irqs_disabled();
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
+ } else {
+ local_irq_restore(flags);
+ }
+}
+
+/* Lockdep check that ->cblist may be safely accessed. */
+static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp)
+{
+ lockdep_assert_irqs_disabled();
+ if (rcu_rdp_is_offloaded(rdp))
+ lockdep_assert_held(&rdp->nocb_lock);
+}
+
+/*
+ * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
+ * grace period.
+ */
+static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
+{
+ swake_up_all(sq);
+}
+
+static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
+{
+ return &rnp->nocb_gp_wq[rcu_seq_ctr(rnp->gp_seq) & 0x1];
+}
+
+static void rcu_init_one_nocb(struct rcu_node *rnp)
+{
+ init_swait_queue_head(&rnp->nocb_gp_wq[0]);
+ init_swait_queue_head(&rnp->nocb_gp_wq[1]);
+}
+
+static bool __wake_nocb_gp(struct rcu_data *rdp_gp,
+ struct rcu_data *rdp,
+ bool force, unsigned long flags)
+ __releases(rdp_gp->nocb_gp_lock)
+{
+ bool needwake = false;
+
+ if (!READ_ONCE(rdp_gp->nocb_gp_kthread)) {
+ raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("AlreadyAwake"));
+ return false;
+ }
+
+ if (rdp_gp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) {
+ WRITE_ONCE(rdp_gp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
+ timer_delete(&rdp_gp->nocb_timer);
+ }
+
+ if (force || READ_ONCE(rdp_gp->nocb_gp_sleep)) {
+ WRITE_ONCE(rdp_gp->nocb_gp_sleep, false);
+ needwake = true;
+ }
+ raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
+ if (needwake) {
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake"));
+ swake_up_one(&rdp_gp->nocb_gp_wq);
+ }
+
+ return needwake;
+}
+
+/*
+ * Kick the GP kthread for this NOCB group.
+ */
+static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
+{
+ unsigned long flags;
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
+
+ raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
+ return __wake_nocb_gp(rdp_gp, rdp, force, flags);
+}
+
+#ifdef CONFIG_RCU_LAZY
+/*
+ * LAZY_FLUSH_JIFFIES decides the maximum amount of time that
+ * can elapse before lazy callbacks are flushed. Lazy callbacks
+ * could be flushed much earlier for a number of other reasons
+ * however, LAZY_FLUSH_JIFFIES will ensure no lazy callbacks are
+ * left unsubmitted to RCU after those many jiffies.
+ */
+#define LAZY_FLUSH_JIFFIES (10 * HZ)
+static unsigned long jiffies_lazy_flush = LAZY_FLUSH_JIFFIES;
+
+// To be called only from test code.
+void rcu_set_jiffies_lazy_flush(unsigned long jif)
+{
+ jiffies_lazy_flush = jif;
+}
+EXPORT_SYMBOL(rcu_set_jiffies_lazy_flush);
+
+unsigned long rcu_get_jiffies_lazy_flush(void)
+{
+ return jiffies_lazy_flush;
+}
+EXPORT_SYMBOL(rcu_get_jiffies_lazy_flush);
+#endif
+
+/*
+ * Arrange to wake the GP kthread for this NOCB group at some future
+ * time when it is safe to do so.
+ */
+static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
+ const char *reason)
+{
+ unsigned long flags;
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
+
+ raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
+
+ /*
+ * Bypass wakeup overrides previous deferments. In case of
+ * callback storms, no need to wake up too early.
+ */
+ if (waketype == RCU_NOCB_WAKE_LAZY &&
+ rdp_gp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) {
+ mod_timer(&rdp_gp->nocb_timer, jiffies + rcu_get_jiffies_lazy_flush());
+ WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
+ } else if (waketype == RCU_NOCB_WAKE_BYPASS) {
+ mod_timer(&rdp_gp->nocb_timer, jiffies + 2);
+ WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
+ } else {
+ if (rdp_gp->nocb_defer_wakeup < RCU_NOCB_WAKE)
+ mod_timer(&rdp_gp->nocb_timer, jiffies + 1);
+ if (rdp_gp->nocb_defer_wakeup < waketype)
+ WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
+ }
+
+ raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
+
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason);
+}
+
+/*
+ * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL.
+ * However, if there is a callback to be enqueued and if ->nocb_bypass
+ * proves to be initially empty, just return false because the no-CB GP
+ * kthread may need to be awakened in this case.
+ *
+ * Return true if there was something to be flushed and it succeeded, otherwise
+ * false.
+ *
+ * Note that this function always returns true if rhp is NULL.
+ */
+static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp_in,
+ unsigned long j, bool lazy)
+{
+ struct rcu_cblist rcl;
+ struct rcu_head *rhp = rhp_in;
+
+ WARN_ON_ONCE(!rcu_rdp_is_offloaded(rdp));
+ rcu_lockdep_assert_cblist_protected(rdp);
+ lockdep_assert_held(&rdp->nocb_bypass_lock);
+ if (rhp && !rcu_cblist_n_cbs(&rdp->nocb_bypass)) {
+ raw_spin_unlock(&rdp->nocb_bypass_lock);
+ return false;
+ }
+ /* Note: ->cblist.len already accounts for ->nocb_bypass contents. */
+ if (rhp)
+ rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
+
+ /*
+ * If the new CB requested was a lazy one, queue it onto the main
+ * ->cblist so that we can take advantage of the grace-period that will
+ * happen regardless. But queue it onto the bypass list first so that
+ * the lazy CB is ordered with the existing CBs in the bypass list.
+ */
+ if (lazy && rhp) {
+ rcu_cblist_enqueue(&rdp->nocb_bypass, rhp);
+ rhp = NULL;
+ }
+ rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, rhp);
+ WRITE_ONCE(rdp->lazy_len, 0);
+
+ rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rcl);
+ WRITE_ONCE(rdp->nocb_bypass_first, j);
+ rcu_nocb_bypass_unlock(rdp);
+ return true;
+}
+
+/*
+ * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL.
+ * However, if there is a callback to be enqueued and if ->nocb_bypass
+ * proves to be initially empty, just return false because the no-CB GP
+ * kthread may need to be awakened in this case.
+ *
+ * Note that this function always returns true if rhp is NULL.
+ */
+static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
+ unsigned long j, bool lazy)
+{
+ if (!rcu_rdp_is_offloaded(rdp))
+ return true;
+ rcu_lockdep_assert_cblist_protected(rdp);
+ rcu_nocb_bypass_lock(rdp);
+ return rcu_nocb_do_flush_bypass(rdp, rhp, j, lazy);
+}
+
+/*
+ * If the ->nocb_bypass_lock is immediately available, flush the
+ * ->nocb_bypass queue into ->cblist.
+ */
+static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j)
+{
+ rcu_lockdep_assert_cblist_protected(rdp);
+ if (!rcu_rdp_is_offloaded(rdp) ||
+ !rcu_nocb_bypass_trylock(rdp))
+ return;
+ WARN_ON_ONCE(!rcu_nocb_do_flush_bypass(rdp, NULL, j, false));
+}
+
+/*
+ * See whether it is appropriate to use the ->nocb_bypass list in order
+ * to control contention on ->nocb_lock. A limited number of direct
+ * enqueues are permitted into ->cblist per jiffy. If ->nocb_bypass
+ * is non-empty, further callbacks must be placed into ->nocb_bypass,
+ * otherwise rcu_barrier() breaks. Use rcu_nocb_flush_bypass() to switch
+ * back to direct use of ->cblist. However, ->nocb_bypass should not be
+ * used if ->cblist is empty, because otherwise callbacks can be stranded
+ * on ->nocb_bypass because we cannot count on the current CPU ever again
+ * invoking call_rcu(). The general rule is that if ->nocb_bypass is
+ * non-empty, the corresponding no-CBs grace-period kthread must not be
+ * in an indefinite sleep state.
+ *
+ * Finally, it is not permitted to use the bypass during early boot,
+ * as doing so would confuse the auto-initialization code. Besides
+ * which, there is no point in worrying about lock contention while
+ * there is only one CPU in operation.
+ */
+static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
+ bool *was_alldone, unsigned long flags,
+ bool lazy)
+{
+ unsigned long c;
+ unsigned long cur_gp_seq;
+ unsigned long j = jiffies;
+ long ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
+ bool bypass_is_lazy = (ncbs == READ_ONCE(rdp->lazy_len));
+
+ lockdep_assert_irqs_disabled();
+
+ // Pure softirq/rcuc based processing: no bypassing, no
+ // locking.
+ if (!rcu_rdp_is_offloaded(rdp)) {
+ *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
+ return false;
+ }
+
+ // Don't use ->nocb_bypass during early boot.
+ if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING) {
+ rcu_nocb_lock(rdp);
+ WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
+ *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
+ return false;
+ }
+
+ // If we have advanced to a new jiffy, reset counts to allow
+ // moving back from ->nocb_bypass to ->cblist.
+ if (j == rdp->nocb_nobypass_last) {
+ c = rdp->nocb_nobypass_count + 1;
+ } else {
+ WRITE_ONCE(rdp->nocb_nobypass_last, j);
+ c = rdp->nocb_nobypass_count - nocb_nobypass_lim_per_jiffy;
+ if (ULONG_CMP_LT(rdp->nocb_nobypass_count,
+ nocb_nobypass_lim_per_jiffy))
+ c = 0;
+ else if (c > nocb_nobypass_lim_per_jiffy)
+ c = nocb_nobypass_lim_per_jiffy;
+ }
+ WRITE_ONCE(rdp->nocb_nobypass_count, c);
+
+ // If there hasn't yet been all that many ->cblist enqueues
+ // this jiffy, tell the caller to enqueue onto ->cblist. But flush
+ // ->nocb_bypass first.
+ // Lazy CBs throttle this back and do immediate bypass queuing.
+ if (rdp->nocb_nobypass_count < nocb_nobypass_lim_per_jiffy && !lazy) {
+ rcu_nocb_lock(rdp);
+ *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
+ if (*was_alldone)
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("FirstQ"));
+
+ WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, j, false));
+ WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
+ return false; // Caller must enqueue the callback.
+ }
+
+ // If ->nocb_bypass has been used too long or is too full,
+ // flush ->nocb_bypass to ->cblist.
+ if ((ncbs && !bypass_is_lazy && j != READ_ONCE(rdp->nocb_bypass_first)) ||
+ (ncbs && bypass_is_lazy &&
+ (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + rcu_get_jiffies_lazy_flush()))) ||
+ ncbs >= qhimark) {
+ rcu_nocb_lock(rdp);
+ *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
+
+ if (!rcu_nocb_flush_bypass(rdp, rhp, j, lazy)) {
+ if (*was_alldone)
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("FirstQ"));
+ WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
+ return false; // Caller must enqueue the callback.
+ }
+ if (j != rdp->nocb_gp_adv_time &&
+ rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
+ rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) {
+ rcu_advance_cbs_nowake(rdp->mynode, rdp);
+ rdp->nocb_gp_adv_time = j;
+ }
+
+ // The flush succeeded and we moved CBs into the regular list.
+ // Don't wait for the wake up timer as it may be too far ahead.
+ // Wake up the GP thread now instead, if the cblist was empty.
+ __call_rcu_nocb_wake(rdp, *was_alldone, flags);
+
+ return true; // Callback already enqueued.
+ }
+
+ // We need to use the bypass.
+ rcu_nocb_bypass_lock(rdp);
+ ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
+ rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
+ rcu_cblist_enqueue(&rdp->nocb_bypass, rhp);
+
+ if (lazy)
+ WRITE_ONCE(rdp->lazy_len, rdp->lazy_len + 1);
+
+ if (!ncbs) {
+ WRITE_ONCE(rdp->nocb_bypass_first, j);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQ"));
+ }
+ rcu_nocb_bypass_unlock(rdp);
+
+ // A wake up of the grace period kthread or timer adjustment
+ // needs to be done only if:
+ // 1. Bypass list was fully empty before (this is the first
+ // bypass list entry), or:
+ // 2. Both of these conditions are met:
+ // a. The bypass list previously had only lazy CBs, and:
+ // b. The new CB is non-lazy.
+ if (!ncbs || (bypass_is_lazy && !lazy)) {
+ // No-CBs GP kthread might be indefinitely asleep, if so, wake.
+ rcu_nocb_lock(rdp); // Rare during call_rcu() flood.
+ if (!rcu_segcblist_pend_cbs(&rdp->cblist)) {
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("FirstBQwake"));
+ __call_rcu_nocb_wake(rdp, true, flags);
+ } else {
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("FirstBQnoWake"));
+ rcu_nocb_unlock(rdp);
+ }
+ }
+ return true; // Callback already enqueued.
+}
+
+/*
+ * Awaken the no-CBs grace-period kthread if needed, either due to it
+ * legitimately being asleep or due to overload conditions.
+ *
+ * If warranted, also wake up the kthread servicing this CPUs queues.
+ */
+static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
+ unsigned long flags)
+ __releases(rdp->nocb_lock)
+{
+ long bypass_len;
+ unsigned long cur_gp_seq;
+ unsigned long j;
+ long lazy_len;
+ long len;
+ struct task_struct *t;
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
+
+ // If we are being polled or there is no kthread, just leave.
+ t = READ_ONCE(rdp->nocb_gp_kthread);
+ if (rcu_nocb_poll || !t) {
+ rcu_nocb_unlock(rdp);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("WakeNotPoll"));
+ return;
+ }
+ // Need to actually to a wakeup.
+ len = rcu_segcblist_n_cbs(&rdp->cblist);
+ bypass_len = rcu_cblist_n_cbs(&rdp->nocb_bypass);
+ lazy_len = READ_ONCE(rdp->lazy_len);
+ if (was_alldone) {
+ rdp->qlen_last_fqs_check = len;
+ // Only lazy CBs in bypass list
+ if (lazy_len && bypass_len == lazy_len) {
+ rcu_nocb_unlock(rdp);
+ wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_LAZY,
+ TPS("WakeLazy"));
+ } else if (!irqs_disabled_flags(flags)) {
+ /* ... if queue was empty ... */
+ rcu_nocb_unlock(rdp);
+ wake_nocb_gp(rdp, false);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("WakeEmpty"));
+ } else {
+ rcu_nocb_unlock(rdp);
+ wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE,
+ TPS("WakeEmptyIsDeferred"));
+ }
+ } else if (len > rdp->qlen_last_fqs_check + qhimark) {
+ /* ... or if many callbacks queued. */
+ rdp->qlen_last_fqs_check = len;
+ j = jiffies;
+ if (j != rdp->nocb_gp_adv_time &&
+ rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
+ rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) {
+ rcu_advance_cbs_nowake(rdp->mynode, rdp);
+ rdp->nocb_gp_adv_time = j;
+ }
+ smp_mb(); /* Enqueue before timer_pending(). */
+ if ((rdp->nocb_cb_sleep ||
+ !rcu_segcblist_ready_cbs(&rdp->cblist)) &&
+ !timer_pending(&rdp_gp->nocb_timer)) {
+ rcu_nocb_unlock(rdp);
+ wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE,
+ TPS("WakeOvfIsDeferred"));
+ } else {
+ rcu_nocb_unlock(rdp);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
+ }
+ } else {
+ rcu_nocb_unlock(rdp);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
+ }
+}
+
+static void call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *head,
+ rcu_callback_t func, unsigned long flags, bool lazy)
+{
+ bool was_alldone;
+
+ if (!rcu_nocb_try_bypass(rdp, head, &was_alldone, flags, lazy)) {
+ /* Not enqueued on bypass but locked, do regular enqueue */
+ rcutree_enqueue(rdp, head, func);
+ __call_rcu_nocb_wake(rdp, was_alldone, flags); /* unlocks */
+ }
+}
+
+static void nocb_gp_toggle_rdp(struct rcu_data *rdp_gp, struct rcu_data *rdp)
+{
+ struct rcu_segcblist *cblist = &rdp->cblist;
+ unsigned long flags;
+
+ /*
+ * Locking orders future de-offloaded callbacks enqueue against previous
+ * handling of this rdp. Ie: Make sure rcuog is done with this rdp before
+ * deoffloaded callbacks can be enqueued.
+ */
+ raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+ if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) {
+ /*
+ * Offloading. Set our flag and notify the offload worker.
+ * We will handle this rdp until it ever gets de-offloaded.
+ */
+ list_add_tail(&rdp->nocb_entry_rdp, &rdp_gp->nocb_head_rdp);
+ rcu_segcblist_set_flags(cblist, SEGCBLIST_OFFLOADED);
+ } else {
+ /*
+ * De-offloading. Clear our flag and notify the de-offload worker.
+ * We will ignore this rdp until it ever gets re-offloaded.
+ */
+ list_del(&rdp->nocb_entry_rdp);
+ rcu_segcblist_clear_flags(cblist, SEGCBLIST_OFFLOADED);
+ }
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
+}
+
+static void nocb_gp_sleep(struct rcu_data *my_rdp, int cpu)
+{
+ trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep"));
+ swait_event_interruptible_exclusive(my_rdp->nocb_gp_wq,
+ !READ_ONCE(my_rdp->nocb_gp_sleep));
+ trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("EndSleep"));
+}
+
+/*
+ * No-CBs GP kthreads come here to wait for additional callbacks to show up
+ * or for grace periods to end.
+ */
+static void nocb_gp_wait(struct rcu_data *my_rdp)
+{
+ bool bypass = false;
+ int __maybe_unused cpu = my_rdp->cpu;
+ unsigned long cur_gp_seq;
+ unsigned long flags;
+ bool gotcbs = false;
+ unsigned long j = jiffies;
+ bool lazy = false;
+ bool needwait_gp = false; // This prevents actual uninitialized use.
+ bool needwake;
+ bool needwake_gp;
+ struct rcu_data *rdp, *rdp_toggling = NULL;
+ struct rcu_node *rnp;
+ unsigned long wait_gp_seq = 0; // Suppress "use uninitialized" warning.
+ bool wasempty = false;
+
+ /*
+ * Each pass through the following loop checks for CBs and for the
+ * nearest grace period (if any) to wait for next. The CB kthreads
+ * and the global grace-period kthread are awakened if needed.
+ */
+ WARN_ON_ONCE(my_rdp->nocb_gp_rdp != my_rdp);
+ /*
+ * An rcu_data structure is removed from the list after its
+ * CPU is de-offloaded and added to the list before that CPU is
+ * (re-)offloaded. If the following loop happens to be referencing
+ * that rcu_data structure during the time that the corresponding
+ * CPU is de-offloaded and then immediately re-offloaded, this
+ * loop's rdp pointer will be carried to the end of the list by
+ * the resulting pair of list operations. This can cause the loop
+ * to skip over some of the rcu_data structures that were supposed
+ * to have been scanned. Fortunately a new iteration through the
+ * entire loop is forced after a given CPU's rcu_data structure
+ * is added to the list, so the skipped-over rcu_data structures
+ * won't be ignored for long.
+ */
+ list_for_each_entry(rdp, &my_rdp->nocb_head_rdp, nocb_entry_rdp) {
+ long bypass_ncbs;
+ bool flush_bypass = false;
+ long lazy_ncbs;
+
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check"));
+ rcu_nocb_lock_irqsave(rdp, flags);
+ lockdep_assert_held(&rdp->nocb_lock);
+ bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
+ lazy_ncbs = READ_ONCE(rdp->lazy_len);
+
+ if (bypass_ncbs && (lazy_ncbs == bypass_ncbs) &&
+ (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + rcu_get_jiffies_lazy_flush()) ||
+ bypass_ncbs > 2 * qhimark)) {
+ flush_bypass = true;
+ } else if (bypass_ncbs && (lazy_ncbs != bypass_ncbs) &&
+ (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) ||
+ bypass_ncbs > 2 * qhimark)) {
+ flush_bypass = true;
+ } else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ continue; /* No callbacks here, try next. */
+ }
+
+ if (flush_bypass) {
+ // Bypass full or old, so flush it.
+ (void)rcu_nocb_try_flush_bypass(rdp, j);
+ bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
+ lazy_ncbs = READ_ONCE(rdp->lazy_len);
+ }
+
+ if (bypass_ncbs) {
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ bypass_ncbs == lazy_ncbs ? TPS("Lazy") : TPS("Bypass"));
+ if (bypass_ncbs == lazy_ncbs)
+ lazy = true;
+ else
+ bypass = true;
+ }
+ rnp = rdp->mynode;
+
+ // Advance callbacks if helpful and low contention.
+ needwake_gp = false;
+ if (!rcu_segcblist_restempty(&rdp->cblist,
+ RCU_NEXT_READY_TAIL) ||
+ (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
+ rcu_seq_done(&rnp->gp_seq, cur_gp_seq))) {
+ raw_spin_lock_rcu_node(rnp); /* irqs disabled. */
+ needwake_gp = rcu_advance_cbs(rnp, rdp);
+ wasempty = rcu_segcblist_restempty(&rdp->cblist,
+ RCU_NEXT_READY_TAIL);
+ raw_spin_unlock_rcu_node(rnp); /* irqs disabled. */
+ }
+ // Need to wait on some grace period?
+ WARN_ON_ONCE(wasempty &&
+ !rcu_segcblist_restempty(&rdp->cblist,
+ RCU_NEXT_READY_TAIL));
+ if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq)) {
+ if (!needwait_gp ||
+ ULONG_CMP_LT(cur_gp_seq, wait_gp_seq))
+ wait_gp_seq = cur_gp_seq;
+ needwait_gp = true;
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("NeedWaitGP"));
+ }
+ if (rcu_segcblist_ready_cbs(&rdp->cblist)) {
+ needwake = rdp->nocb_cb_sleep;
+ WRITE_ONCE(rdp->nocb_cb_sleep, false);
+ } else {
+ needwake = false;
+ }
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ if (needwake) {
+ swake_up_one(&rdp->nocb_cb_wq);
+ gotcbs = true;
+ }
+ if (needwake_gp)
+ rcu_gp_kthread_wake();
+ }
+
+ my_rdp->nocb_gp_bypass = bypass;
+ my_rdp->nocb_gp_gp = needwait_gp;
+ my_rdp->nocb_gp_seq = needwait_gp ? wait_gp_seq : 0;
+
+ // At least one child with non-empty ->nocb_bypass, so set
+ // timer in order to avoid stranding its callbacks.
+ if (!rcu_nocb_poll) {
+ // If bypass list only has lazy CBs. Add a deferred lazy wake up.
+ if (lazy && !bypass) {
+ wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_LAZY,
+ TPS("WakeLazyIsDeferred"));
+ // Otherwise add a deferred bypass wake up.
+ } else if (bypass) {
+ wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_BYPASS,
+ TPS("WakeBypassIsDeferred"));
+ }
+ }
+
+ if (rcu_nocb_poll) {
+ /* Polling, so trace if first poll in the series. */
+ if (gotcbs)
+ trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Poll"));
+ if (list_empty(&my_rdp->nocb_head_rdp)) {
+ raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
+ if (!my_rdp->nocb_toggling_rdp)
+ WRITE_ONCE(my_rdp->nocb_gp_sleep, true);
+ raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
+ /* Wait for any offloading rdp */
+ nocb_gp_sleep(my_rdp, cpu);
+ } else {
+ schedule_timeout_idle(1);
+ }
+ } else if (!needwait_gp) {
+ /* Wait for callbacks to appear. */
+ nocb_gp_sleep(my_rdp, cpu);
+ } else {
+ rnp = my_rdp->mynode;
+ trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("StartWait"));
+ swait_event_interruptible_exclusive(
+ rnp->nocb_gp_wq[rcu_seq_ctr(wait_gp_seq) & 0x1],
+ rcu_seq_done(&rnp->gp_seq, wait_gp_seq) ||
+ !READ_ONCE(my_rdp->nocb_gp_sleep));
+ trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("EndWait"));
+ }
+
+ if (!rcu_nocb_poll) {
+ raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
+ // (De-)queue an rdp to/from the group if its nocb state is changing
+ rdp_toggling = my_rdp->nocb_toggling_rdp;
+ if (rdp_toggling)
+ my_rdp->nocb_toggling_rdp = NULL;
+
+ if (my_rdp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) {
+ WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
+ timer_delete(&my_rdp->nocb_timer);
+ }
+ WRITE_ONCE(my_rdp->nocb_gp_sleep, true);
+ raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
+ } else {
+ rdp_toggling = READ_ONCE(my_rdp->nocb_toggling_rdp);
+ if (rdp_toggling) {
+ /*
+ * Paranoid locking to make sure nocb_toggling_rdp is well
+ * reset *before* we (re)set SEGCBLIST_KTHREAD_GP or we could
+ * race with another round of nocb toggling for this rdp.
+ * Nocb locking should prevent from that already but we stick
+ * to paranoia, especially in rare path.
+ */
+ raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
+ my_rdp->nocb_toggling_rdp = NULL;
+ raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
+ }
+ }
+
+ if (rdp_toggling) {
+ nocb_gp_toggle_rdp(my_rdp, rdp_toggling);
+ swake_up_one(&rdp_toggling->nocb_state_wq);
+ }
+
+ my_rdp->nocb_gp_seq = -1;
+ WARN_ON(signal_pending(current));
+}
+
+/*
+ * No-CBs grace-period-wait kthread. There is one of these per group
+ * of CPUs, but only once at least one CPU in that group has come online
+ * at least once since boot. This kthread checks for newly posted
+ * callbacks from any of the CPUs it is responsible for, waits for a
+ * grace period, then awakens all of the rcu_nocb_cb_kthread() instances
+ * that then have callback-invocation work to do.
+ */
+static int rcu_nocb_gp_kthread(void *arg)
+{
+ struct rcu_data *rdp = arg;
+
+ for (;;) {
+ WRITE_ONCE(rdp->nocb_gp_loops, rdp->nocb_gp_loops + 1);
+ nocb_gp_wait(rdp);
+ cond_resched_tasks_rcu_qs();
+ }
+ return 0;
+}
+
+static inline bool nocb_cb_wait_cond(struct rcu_data *rdp)
+{
+ return !READ_ONCE(rdp->nocb_cb_sleep) || kthread_should_park();
+}
+
+/*
+ * Invoke any ready callbacks from the corresponding no-CBs CPU,
+ * then, if there are no more, wait for more to appear.
+ */
+static void nocb_cb_wait(struct rcu_data *rdp)
+{
+ struct rcu_segcblist *cblist = &rdp->cblist;
+ unsigned long cur_gp_seq;
+ unsigned long flags;
+ bool needwake_gp = false;
+ struct rcu_node *rnp = rdp->mynode;
+
+ swait_event_interruptible_exclusive(rdp->nocb_cb_wq,
+ nocb_cb_wait_cond(rdp));
+ if (kthread_should_park()) {
+ /*
+ * kthread_park() must be preceded by an rcu_barrier().
+ * But yet another rcu_barrier() might have sneaked in between
+ * the barrier callback execution and the callbacks counter
+ * decrement.
+ */
+ if (rdp->nocb_cb_sleep) {
+ rcu_nocb_lock_irqsave(rdp, flags);
+ WARN_ON_ONCE(rcu_segcblist_n_cbs(&rdp->cblist));
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ kthread_parkme();
+ }
+ } else if (READ_ONCE(rdp->nocb_cb_sleep)) {
+ WARN_ON(signal_pending(current));
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty"));
+ }
+
+ WARN_ON_ONCE(!rcu_rdp_is_offloaded(rdp));
+
+ local_irq_save(flags);
+ rcu_momentary_eqs();
+ local_irq_restore(flags);
+ /*
+ * Disable BH to provide the expected environment. Also, when
+ * transitioning to/from NOCB mode, a self-requeuing callback might
+ * be invoked from softirq. A short grace period could cause both
+ * instances of this callback would execute concurrently.
+ */
+ local_bh_disable();
+ rcu_do_batch(rdp);
+ local_bh_enable();
+ lockdep_assert_irqs_enabled();
+ rcu_nocb_lock_irqsave(rdp, flags);
+ if (rcu_segcblist_nextgp(cblist, &cur_gp_seq) &&
+ rcu_seq_done(&rnp->gp_seq, cur_gp_seq) &&
+ raw_spin_trylock_rcu_node(rnp)) { /* irqs already disabled. */
+ needwake_gp = rcu_advance_cbs(rdp->mynode, rdp);
+ raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
+ }
+
+ if (!rcu_segcblist_ready_cbs(cblist)) {
+ WRITE_ONCE(rdp->nocb_cb_sleep, true);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep"));
+ } else {
+ WRITE_ONCE(rdp->nocb_cb_sleep, false);
+ }
+
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ if (needwake_gp)
+ rcu_gp_kthread_wake();
+}
+
+/*
+ * Per-rcu_data kthread, but only for no-CBs CPUs. Repeatedly invoke
+ * nocb_cb_wait() to do the dirty work.
+ */
+static int rcu_nocb_cb_kthread(void *arg)
+{
+ struct rcu_data *rdp = arg;
+
+ // Each pass through this loop does one callback batch, and,
+ // if there are no more ready callbacks, waits for them.
+ for (;;) {
+ nocb_cb_wait(rdp);
+ cond_resched_tasks_rcu_qs();
+ }
+ return 0;
+}
+
+/* Is a deferred wakeup of rcu_nocb_kthread() required? */
+static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level)
+{
+ return READ_ONCE(rdp->nocb_defer_wakeup) >= level;
+}
+
+/* Do a deferred wakeup of rcu_nocb_kthread(). */
+static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp_gp,
+ struct rcu_data *rdp, int level,
+ unsigned long flags)
+ __releases(rdp_gp->nocb_gp_lock)
+{
+ int ndw;
+ int ret;
+
+ if (!rcu_nocb_need_deferred_wakeup(rdp_gp, level)) {
+ raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
+ return false;
+ }
+
+ ndw = rdp_gp->nocb_defer_wakeup;
+ ret = __wake_nocb_gp(rdp_gp, rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake"));
+
+ return ret;
+}
+
+/* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */
+static void do_nocb_deferred_wakeup_timer(struct timer_list *t)
+{
+ unsigned long flags;
+ struct rcu_data *rdp = timer_container_of(rdp, t, nocb_timer);
+
+ WARN_ON_ONCE(rdp->nocb_gp_rdp != rdp);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Timer"));
+
+ raw_spin_lock_irqsave(&rdp->nocb_gp_lock, flags);
+ smp_mb__after_spinlock(); /* Timer expire before wakeup. */
+ do_nocb_deferred_wakeup_common(rdp, rdp, RCU_NOCB_WAKE_BYPASS, flags);
+}
+
+/*
+ * Do a deferred wakeup of rcu_nocb_kthread() from fastpath.
+ * This means we do an inexact common-case check. Note that if
+ * we miss, ->nocb_timer will eventually clean things up.
+ */
+static bool do_nocb_deferred_wakeup(struct rcu_data *rdp)
+{
+ unsigned long flags;
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
+
+ if (!rdp_gp || !rcu_nocb_need_deferred_wakeup(rdp_gp, RCU_NOCB_WAKE))
+ return false;
+
+ raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
+ return do_nocb_deferred_wakeup_common(rdp_gp, rdp, RCU_NOCB_WAKE, flags);
+}
+
+void rcu_nocb_flush_deferred_wakeup(void)
+{
+ do_nocb_deferred_wakeup(this_cpu_ptr(&rcu_data));
+}
+EXPORT_SYMBOL_GPL(rcu_nocb_flush_deferred_wakeup);
+
+static int rcu_nocb_queue_toggle_rdp(struct rcu_data *rdp)
+{
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
+ bool wake_gp = false;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
+ // Queue this rdp for add/del to/from the list to iterate on rcuog
+ WRITE_ONCE(rdp_gp->nocb_toggling_rdp, rdp);
+ if (rdp_gp->nocb_gp_sleep) {
+ rdp_gp->nocb_gp_sleep = false;
+ wake_gp = true;
+ }
+ raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
+
+ return wake_gp;
+}
+
+static bool rcu_nocb_rdp_deoffload_wait_cond(struct rcu_data *rdp)
+{
+ unsigned long flags;
+ bool ret;
+
+ /*
+ * Locking makes sure rcuog is done handling this rdp before deoffloaded
+ * enqueue can happen. Also it keeps the SEGCBLIST_OFFLOADED flag stable
+ * while the ->nocb_lock is held.
+ */
+ raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+ ret = !rcu_segcblist_test_flags(&rdp->cblist, SEGCBLIST_OFFLOADED);
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
+
+ return ret;
+}
+
+static int rcu_nocb_rdp_deoffload(struct rcu_data *rdp)
+{
+ unsigned long flags;
+ int wake_gp;
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
+
+ /* CPU must be offline, unless it's early boot */
+ WARN_ON_ONCE(cpu_online(rdp->cpu) && rdp->cpu != raw_smp_processor_id());
+
+ pr_info("De-offloading %d\n", rdp->cpu);
+
+ /* Flush all callbacks from segcblist and bypass */
+ rcu_barrier();
+
+ /*
+ * Make sure the rcuoc kthread isn't in the middle of a nocb locked
+ * sequence while offloading is deactivated, along with nocb locking.
+ */
+ if (rdp->nocb_cb_kthread)
+ kthread_park(rdp->nocb_cb_kthread);
+
+ rcu_nocb_lock_irqsave(rdp, flags);
+ WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
+ WARN_ON_ONCE(rcu_segcblist_n_cbs(&rdp->cblist));
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+
+ wake_gp = rcu_nocb_queue_toggle_rdp(rdp);
+
+ mutex_lock(&rdp_gp->nocb_gp_kthread_mutex);
+
+ if (rdp_gp->nocb_gp_kthread) {
+ if (wake_gp)
+ wake_up_process(rdp_gp->nocb_gp_kthread);
+
+ swait_event_exclusive(rdp->nocb_state_wq,
+ rcu_nocb_rdp_deoffload_wait_cond(rdp));
+ } else {
+ /*
+ * No kthread to clear the flags for us or remove the rdp from the nocb list
+ * to iterate. Do it here instead. Locking doesn't look stricly necessary
+ * but we stick to paranoia in this rare path.
+ */
+ raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+ rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_OFFLOADED);
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
+
+ list_del(&rdp->nocb_entry_rdp);
+ }
+
+ mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex);
+
+ return 0;
+}
+
+int rcu_nocb_cpu_deoffload(int cpu)
+{
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ int ret = 0;
+
+ cpus_read_lock();
+ mutex_lock(&rcu_state.nocb_mutex);
+ if (rcu_rdp_is_offloaded(rdp)) {
+ if (!cpu_online(cpu)) {
+ ret = rcu_nocb_rdp_deoffload(rdp);
+ if (!ret)
+ cpumask_clear_cpu(cpu, rcu_nocb_mask);
+ } else {
+ pr_info("NOCB: Cannot CB-deoffload online CPU %d\n", rdp->cpu);
+ ret = -EINVAL;
+ }
+ }
+ mutex_unlock(&rcu_state.nocb_mutex);
+ cpus_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rcu_nocb_cpu_deoffload);
+
+static bool rcu_nocb_rdp_offload_wait_cond(struct rcu_data *rdp)
+{
+ unsigned long flags;
+ bool ret;
+
+ raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+ ret = rcu_segcblist_test_flags(&rdp->cblist, SEGCBLIST_OFFLOADED);
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
+
+ return ret;
+}
+
+static int rcu_nocb_rdp_offload(struct rcu_data *rdp)
+{
+ int wake_gp;
+
+ WARN_ON_ONCE(cpu_online(rdp->cpu));
+ /*
+ * For now we only support re-offload, ie: the rdp must have been
+ * offloaded on boot first.
+ */
+ if (!rdp->nocb_gp_rdp)
+ return -EINVAL;
+
+ if (WARN_ON_ONCE(!rdp->nocb_gp_kthread))
+ return -EINVAL;
+
+ pr_info("Offloading %d\n", rdp->cpu);
+
+ WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
+ WARN_ON_ONCE(rcu_segcblist_n_cbs(&rdp->cblist));
+
+ wake_gp = rcu_nocb_queue_toggle_rdp(rdp);
+ if (wake_gp)
+ wake_up_process(rdp->nocb_gp_kthread);
+
+ swait_event_exclusive(rdp->nocb_state_wq,
+ rcu_nocb_rdp_offload_wait_cond(rdp));
+
+ kthread_unpark(rdp->nocb_cb_kthread);
+
+ return 0;
+}
+
+int rcu_nocb_cpu_offload(int cpu)
+{
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ int ret = 0;
+
+ cpus_read_lock();
+ mutex_lock(&rcu_state.nocb_mutex);
+ if (!rcu_rdp_is_offloaded(rdp)) {
+ if (!cpu_online(cpu)) {
+ ret = rcu_nocb_rdp_offload(rdp);
+ if (!ret)
+ cpumask_set_cpu(cpu, rcu_nocb_mask);
+ } else {
+ pr_info("NOCB: Cannot CB-offload online CPU %d\n", rdp->cpu);
+ ret = -EINVAL;
+ }
+ }
+ mutex_unlock(&rcu_state.nocb_mutex);
+ cpus_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rcu_nocb_cpu_offload);
+
+#ifdef CONFIG_RCU_LAZY
+static unsigned long
+lazy_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
+{
+ int cpu;
+ unsigned long count = 0;
+
+ if (WARN_ON_ONCE(!cpumask_available(rcu_nocb_mask)))
+ return 0;
+
+ /* Protect rcu_nocb_mask against concurrent (de-)offloading. */
+ if (!mutex_trylock(&rcu_state.nocb_mutex))
+ return 0;
+
+ /* Snapshot count of all CPUs */
+ for_each_cpu(cpu, rcu_nocb_mask) {
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+
+ count += READ_ONCE(rdp->lazy_len);
+ }
+
+ mutex_unlock(&rcu_state.nocb_mutex);
+
+ return count ? count : SHRINK_EMPTY;
+}
+
+static unsigned long
+lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
+{
+ int cpu;
+ unsigned long flags;
+ unsigned long count = 0;
+
+ if (WARN_ON_ONCE(!cpumask_available(rcu_nocb_mask)))
+ return 0;
+ /*
+ * Protect against concurrent (de-)offloading. Otherwise nocb locking
+ * may be ignored or imbalanced.
+ */
+ if (!mutex_trylock(&rcu_state.nocb_mutex)) {
+ /*
+ * But really don't insist if nocb_mutex is contended since we
+ * can't guarantee that it will never engage in a dependency
+ * chain involving memory allocation. The lock is seldom contended
+ * anyway.
+ */
+ return 0;
+ }
+
+ /* Snapshot count of all CPUs */
+ for_each_cpu(cpu, rcu_nocb_mask) {
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ int _count;
+
+ if (WARN_ON_ONCE(!rcu_rdp_is_offloaded(rdp)))
+ continue;
+
+ if (!READ_ONCE(rdp->lazy_len))
+ continue;
+
+ rcu_nocb_lock_irqsave(rdp, flags);
+ /*
+ * Recheck under the nocb lock. Since we are not holding the bypass
+ * lock we may still race with increments from the enqueuer but still
+ * we know for sure if there is at least one lazy callback.
+ */
+ _count = READ_ONCE(rdp->lazy_len);
+ if (!_count) {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ continue;
+ }
+ rcu_nocb_try_flush_bypass(rdp, jiffies);
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ wake_nocb_gp(rdp, false);
+ sc->nr_to_scan -= _count;
+ count += _count;
+ if (sc->nr_to_scan <= 0)
+ break;
+ }
+
+ mutex_unlock(&rcu_state.nocb_mutex);
+
+ return count ? count : SHRINK_STOP;
+}
+#endif // #ifdef CONFIG_RCU_LAZY
+
+void __init rcu_init_nohz(void)
+{
+ int cpu;
+ struct rcu_data *rdp;
+ const struct cpumask *cpumask = NULL;
+ struct shrinker * __maybe_unused lazy_rcu_shrinker;
+
+#if defined(CONFIG_NO_HZ_FULL)
+ if (tick_nohz_full_running && !cpumask_empty(tick_nohz_full_mask))
+ cpumask = tick_nohz_full_mask;
+#endif
+
+ if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_DEFAULT_ALL) &&
+ !rcu_state.nocb_is_setup && !cpumask)
+ cpumask = cpu_possible_mask;
+
+ if (cpumask) {
+ if (!cpumask_available(rcu_nocb_mask)) {
+ if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) {
+ pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n");
+ return;
+ }
+ }
+
+ cpumask_or(rcu_nocb_mask, rcu_nocb_mask, cpumask);
+ rcu_state.nocb_is_setup = true;
+ }
+
+ if (!rcu_state.nocb_is_setup)
+ return;
+
+#ifdef CONFIG_RCU_LAZY
+ lazy_rcu_shrinker = shrinker_alloc(0, "rcu-lazy");
+ if (!lazy_rcu_shrinker) {
+ pr_err("Failed to allocate lazy_rcu shrinker!\n");
+ } else {
+ lazy_rcu_shrinker->count_objects = lazy_rcu_shrink_count;
+ lazy_rcu_shrinker->scan_objects = lazy_rcu_shrink_scan;
+
+ shrinker_register(lazy_rcu_shrinker);
+ }
+#endif // #ifdef CONFIG_RCU_LAZY
+
+ if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
+ pr_info("\tNote: kernel parameter 'rcu_nocbs=', 'nohz_full', or 'isolcpus=' contains nonexistent CPUs.\n");
+ cpumask_and(rcu_nocb_mask, cpu_possible_mask,
+ rcu_nocb_mask);
+ }
+ if (cpumask_empty(rcu_nocb_mask))
+ pr_info("\tOffload RCU callbacks from CPUs: (none).\n");
+ else
+ pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n",
+ cpumask_pr_args(rcu_nocb_mask));
+ if (rcu_nocb_poll)
+ pr_info("\tPoll for callbacks from no-CBs CPUs.\n");
+
+ for_each_cpu(cpu, rcu_nocb_mask) {
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+ if (rcu_segcblist_empty(&rdp->cblist))
+ rcu_segcblist_init(&rdp->cblist);
+ rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_OFFLOADED);
+ }
+ rcu_organize_nocb_kthreads();
+}
+
+/* Initialize per-rcu_data variables for no-CBs CPUs. */
+static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
+{
+ init_swait_queue_head(&rdp->nocb_cb_wq);
+ init_swait_queue_head(&rdp->nocb_gp_wq);
+ init_swait_queue_head(&rdp->nocb_state_wq);
+ raw_spin_lock_init(&rdp->nocb_lock);
+ raw_spin_lock_init(&rdp->nocb_bypass_lock);
+ raw_spin_lock_init(&rdp->nocb_gp_lock);
+ timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0);
+ rcu_cblist_init(&rdp->nocb_bypass);
+ WRITE_ONCE(rdp->lazy_len, 0);
+ mutex_init(&rdp->nocb_gp_kthread_mutex);
+}
+
+/*
+ * If the specified CPU is a no-CBs CPU that does not already have its
+ * rcuo CB kthread, spawn it. Additionally, if the rcuo GP kthread
+ * for this CPU's group has not yet been created, spawn it as well.
+ */
+static void rcu_spawn_cpu_nocb_kthread(int cpu)
+{
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ struct rcu_data *rdp_gp;
+ struct task_struct *t;
+ struct sched_param sp;
+
+ if (!rcu_scheduler_fully_active || !rcu_state.nocb_is_setup)
+ return;
+
+ /* If there already is an rcuo kthread, then nothing to do. */
+ if (rdp->nocb_cb_kthread)
+ return;
+
+ /* If we didn't spawn the GP kthread first, reorganize! */
+ sp.sched_priority = kthread_prio;
+ rdp_gp = rdp->nocb_gp_rdp;
+ mutex_lock(&rdp_gp->nocb_gp_kthread_mutex);
+ if (!rdp_gp->nocb_gp_kthread) {
+ t = kthread_run(rcu_nocb_gp_kthread, rdp_gp,
+ "rcuog/%d", rdp_gp->cpu);
+ if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__)) {
+ mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex);
+ goto err;
+ }
+ WRITE_ONCE(rdp_gp->nocb_gp_kthread, t);
+ if (kthread_prio)
+ sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+ }
+ mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex);
+
+ /* Spawn the kthread for this CPU. */
+ t = kthread_create(rcu_nocb_cb_kthread, rdp,
+ "rcuo%c/%d", rcu_state.abbr, cpu);
+ if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__))
+ goto err;
+
+ if (rcu_rdp_is_offloaded(rdp))
+ wake_up_process(t);
+ else
+ kthread_park(t);
+
+ if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_CB_BOOST) && kthread_prio)
+ sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+
+ WRITE_ONCE(rdp->nocb_cb_kthread, t);
+ WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread);
+ return;
+
+err:
+ /*
+ * No need to protect against concurrent rcu_barrier()
+ * because the number of callbacks should be 0 for a non-boot CPU,
+ * therefore rcu_barrier() shouldn't even try to grab the nocb_lock.
+ * But hold nocb_mutex to avoid nocb_lock imbalance from shrinker.
+ */
+ WARN_ON_ONCE(system_state > SYSTEM_BOOTING && rcu_segcblist_n_cbs(&rdp->cblist));
+ mutex_lock(&rcu_state.nocb_mutex);
+ if (rcu_rdp_is_offloaded(rdp)) {
+ rcu_nocb_rdp_deoffload(rdp);
+ cpumask_clear_cpu(cpu, rcu_nocb_mask);
+ }
+ mutex_unlock(&rcu_state.nocb_mutex);
+}
+
+/* How many CB CPU IDs per GP kthread? Default of -1 for sqrt(nr_cpu_ids). */
+static int rcu_nocb_gp_stride = -1;
+module_param(rcu_nocb_gp_stride, int, 0444);
+
+/*
+ * Initialize GP-CB relationships for all no-CBs CPU.
+ */
+static void __init rcu_organize_nocb_kthreads(void)
+{
+ int cpu;
+ bool firsttime = true;
+ bool gotnocbs = false;
+ bool gotnocbscbs = true;
+ int ls = rcu_nocb_gp_stride;
+ int nl = 0; /* Next GP kthread. */
+ struct rcu_data *rdp;
+ struct rcu_data *rdp_gp = NULL; /* Suppress misguided gcc warn. */
+
+ if (!cpumask_available(rcu_nocb_mask))
+ return;
+ if (ls == -1) {
+ ls = nr_cpu_ids / int_sqrt(nr_cpu_ids);
+ rcu_nocb_gp_stride = ls;
+ }
+
+ /*
+ * Each pass through this loop sets up one rcu_data structure.
+ * Should the corresponding CPU come online in the future, then
+ * we will spawn the needed set of rcu_nocb_kthread() kthreads.
+ */
+ for_each_possible_cpu(cpu) {
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+ if (rdp->cpu >= nl) {
+ /* New GP kthread, set up for CBs & next GP. */
+ gotnocbs = true;
+ nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls;
+ rdp_gp = rdp;
+ INIT_LIST_HEAD(&rdp->nocb_head_rdp);
+ if (dump_tree) {
+ if (!firsttime)
+ pr_cont("%s\n", gotnocbscbs
+ ? "" : " (self only)");
+ gotnocbscbs = false;
+ firsttime = false;
+ pr_alert("%s: No-CB GP kthread CPU %d:",
+ __func__, cpu);
+ }
+ } else {
+ /* Another CB kthread, link to previous GP kthread. */
+ gotnocbscbs = true;
+ if (dump_tree)
+ pr_cont(" %d", cpu);
+ }
+ rdp->nocb_gp_rdp = rdp_gp;
+ if (cpumask_test_cpu(cpu, rcu_nocb_mask))
+ list_add_tail(&rdp->nocb_entry_rdp, &rdp_gp->nocb_head_rdp);
+ }
+ if (gotnocbs && dump_tree)
+ pr_cont("%s\n", gotnocbscbs ? "" : " (self only)");
+}
+
+/*
+ * Bind the current task to the offloaded CPUs. If there are no offloaded
+ * CPUs, leave the task unbound. Splat if the bind attempt fails.
+ */
+void rcu_bind_current_to_nocb(void)
+{
+ if (cpumask_available(rcu_nocb_mask) && !cpumask_empty(rcu_nocb_mask))
+ WARN_ON(sched_setaffinity(current->pid, rcu_nocb_mask));
+}
+EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb);
+
+// The ->on_cpu field is available only in CONFIG_SMP=y, so...
+#ifdef CONFIG_SMP
+static char *show_rcu_should_be_on_cpu(struct task_struct *tsp)
+{
+ return tsp && task_is_running(tsp) && !tsp->on_cpu ? "!" : "";
+}
+#else // #ifdef CONFIG_SMP
+static char *show_rcu_should_be_on_cpu(struct task_struct *tsp)
+{
+ return "";
+}
+#endif // #else #ifdef CONFIG_SMP
+
+/*
+ * Dump out nocb grace-period kthread state for the specified rcu_data
+ * structure.
+ */
+static void show_rcu_nocb_gp_state(struct rcu_data *rdp)
+{
+ struct rcu_node *rnp = rdp->mynode;
+
+ pr_info("nocb GP %d %c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu %c CPU %d%s\n",
+ rdp->cpu,
+ "kK"[!!rdp->nocb_gp_kthread],
+ "lL"[raw_spin_is_locked(&rdp->nocb_gp_lock)],
+ "dD"[!!rdp->nocb_defer_wakeup],
+ "tT"[timer_pending(&rdp->nocb_timer)],
+ "sS"[!!rdp->nocb_gp_sleep],
+ ".W"[swait_active(&rdp->nocb_gp_wq)],
+ ".W"[swait_active(&rnp->nocb_gp_wq[0])],
+ ".W"[swait_active(&rnp->nocb_gp_wq[1])],
+ ".B"[!!rdp->nocb_gp_bypass],
+ ".G"[!!rdp->nocb_gp_gp],
+ (long)rdp->nocb_gp_seq,
+ rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops),
+ rdp->nocb_gp_kthread ? task_state_to_char(rdp->nocb_gp_kthread) : '.',
+ rdp->nocb_gp_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1,
+ show_rcu_should_be_on_cpu(rdp->nocb_gp_kthread));
+}
+
+/* Dump out nocb kthread state for the specified rcu_data structure. */
+static void show_rcu_nocb_state(struct rcu_data *rdp)
+{
+ char bufd[22];
+ char bufw[45];
+ char bufr[45];
+ char bufn[22];
+ char bufb[22];
+ struct rcu_data *nocb_next_rdp;
+ struct rcu_segcblist *rsclp = &rdp->cblist;
+ bool waslocked;
+ bool wassleep;
+
+ if (rdp->nocb_gp_rdp == rdp)
+ show_rcu_nocb_gp_state(rdp);
+
+ if (!rcu_segcblist_is_offloaded(&rdp->cblist))
+ return;
+
+ nocb_next_rdp = list_next_or_null_rcu(&rdp->nocb_gp_rdp->nocb_head_rdp,
+ &rdp->nocb_entry_rdp,
+ typeof(*rdp),
+ nocb_entry_rdp);
+
+ sprintf(bufd, "%ld", rsclp->seglen[RCU_DONE_TAIL]);
+ sprintf(bufw, "%ld(%ld)", rsclp->seglen[RCU_WAIT_TAIL], rsclp->gp_seq[RCU_WAIT_TAIL]);
+ sprintf(bufr, "%ld(%ld)", rsclp->seglen[RCU_NEXT_READY_TAIL],
+ rsclp->gp_seq[RCU_NEXT_READY_TAIL]);
+ sprintf(bufn, "%ld", rsclp->seglen[RCU_NEXT_TAIL]);
+ sprintf(bufb, "%ld", rcu_cblist_n_cbs(&rdp->nocb_bypass));
+ pr_info(" CB %d^%d->%d %c%c%c%c%c F%ld L%ld C%d %c%s%c%s%c%s%c%s%c%s q%ld %c CPU %d%s\n",
+ rdp->cpu, rdp->nocb_gp_rdp->cpu,
+ nocb_next_rdp ? nocb_next_rdp->cpu : -1,
+ "kK"[!!rdp->nocb_cb_kthread],
+ "bB"[raw_spin_is_locked(&rdp->nocb_bypass_lock)],
+ "lL"[raw_spin_is_locked(&rdp->nocb_lock)],
+ "sS"[!!rdp->nocb_cb_sleep],
+ ".W"[swait_active(&rdp->nocb_cb_wq)],
+ jiffies - rdp->nocb_bypass_first,
+ jiffies - rdp->nocb_nobypass_last,
+ rdp->nocb_nobypass_count,
+ ".D"[rcu_segcblist_ready_cbs(rsclp)],
+ rcu_segcblist_segempty(rsclp, RCU_DONE_TAIL) ? "" : bufd,
+ ".W"[!rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL)],
+ rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL) ? "" : bufw,
+ ".R"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL)],
+ rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL) ? "" : bufr,
+ ".N"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_TAIL)],
+ rcu_segcblist_segempty(rsclp, RCU_NEXT_TAIL) ? "" : bufn,
+ ".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)],
+ !rcu_cblist_n_cbs(&rdp->nocb_bypass) ? "" : bufb,
+ rcu_segcblist_n_cbs(&rdp->cblist),
+ rdp->nocb_cb_kthread ? task_state_to_char(rdp->nocb_cb_kthread) : '.',
+ rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_cb_kthread) : -1,
+ show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread));
+
+ /* It is OK for GP kthreads to have GP state. */
+ if (rdp->nocb_gp_rdp == rdp)
+ return;
+
+ waslocked = raw_spin_is_locked(&rdp->nocb_gp_lock);
+ wassleep = swait_active(&rdp->nocb_gp_wq);
+ if (!rdp->nocb_gp_sleep && !waslocked && !wassleep)
+ return; /* Nothing untoward. */
+
+ pr_info(" nocb GP activity on CB-only CPU!!! %c%c%c %c\n",
+ "lL"[waslocked],
+ "dD"[!!rdp->nocb_defer_wakeup],
+ "sS"[!!rdp->nocb_gp_sleep],
+ ".W"[wassleep]);
+}
+
+#else /* #ifdef CONFIG_RCU_NOCB_CPU */
+
+/* No ->nocb_lock to acquire. */
+static void rcu_nocb_lock(struct rcu_data *rdp)
+{
+}
+
+/* No ->nocb_lock to release. */
+static void rcu_nocb_unlock(struct rcu_data *rdp)
+{
+}
+
+/* No ->nocb_lock to release. */
+static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
+ unsigned long flags)
+{
+ local_irq_restore(flags);
+}
+
+/* Lockdep check that ->cblist may be safely accessed. */
+static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp)
+{
+ lockdep_assert_irqs_disabled();
+}
+
+static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
+{
+}
+
+static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
+{
+ return NULL;
+}
+
+static void rcu_init_one_nocb(struct rcu_node *rnp)
+{
+}
+
+static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
+{
+ return false;
+}
+
+static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
+ unsigned long j, bool lazy)
+{
+ return true;
+}
+
+static void call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *head,
+ rcu_callback_t func, unsigned long flags, bool lazy)
+{
+ WARN_ON_ONCE(1); /* Should be dead code! */
+}
+
+static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
+ unsigned long flags)
+{
+ WARN_ON_ONCE(1); /* Should be dead code! */
+}
+
+static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
+{
+}
+
+static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level)
+{
+ return false;
+}
+
+static bool do_nocb_deferred_wakeup(struct rcu_data *rdp)
+{
+ return false;
+}
+
+static void rcu_spawn_cpu_nocb_kthread(int cpu)
+{
+}
+
+static void show_rcu_nocb_state(struct rcu_data *rdp)
+{
+}
+
+#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 8c0ec0f5a027..dbe2d02be824 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1,100 +1,120 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
/*
* Read-Copy Update mechanism for mutual exclusion (tree-based version)
* Internal non-public definitions that provide either classic
* or preemptible semantics.
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
* Copyright Red Hat, 2009
* Copyright IBM Corporation, 2009
*
* Author: Ingo Molnar <mingo@elte.hu>
- * Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+ * Paul E. McKenney <paulmck@linux.ibm.com>
*/
-#include <linux/delay.h>
-#include <linux/gfp.h>
-#include <linux/oom.h>
-#include <linux/smpboot.h>
-#include "../time/tick-internal.h"
-
-#ifdef CONFIG_RCU_BOOST
-
#include "../locking/rtmutex_common.h"
-/*
- * Control variables for per-CPU and per-rcu_node kthreads. These
- * handle all flavors of RCU.
- */
-static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
-DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
-DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
-DEFINE_PER_CPU(char, rcu_cpu_has_work);
-
-#endif /* #ifdef CONFIG_RCU_BOOST */
+static bool rcu_rdp_is_offloaded(struct rcu_data *rdp)
+{
+ /*
+ * In order to read the offloaded state of an rdp in a safe
+ * and stable way and prevent from its value to be changed
+ * under us, we must either hold the barrier mutex, the cpu
+ * hotplug lock (read or write) or the nocb lock. Local
+ * non-preemptible reads are also safe. NOCB kthreads and
+ * timers have their own means of synchronization against the
+ * offloaded state updaters.
+ */
+ RCU_NOCB_LOCKDEP_WARN(
+ !(lockdep_is_held(&rcu_state.barrier_mutex) ||
+ (IS_ENABLED(CONFIG_HOTPLUG_CPU) && lockdep_is_cpus_held()) ||
+ lockdep_is_held(&rdp->nocb_lock) ||
+ lockdep_is_held(&rcu_state.nocb_mutex) ||
+ ((!(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible()) || softirq_count()) &&
+ rdp == this_cpu_ptr(&rcu_data)) ||
+ rcu_current_is_nocb_kthread(rdp)),
+ "Unsafe read of RCU_NOCB offloaded state"
+ );
-#ifdef CONFIG_RCU_NOCB_CPU
-static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
-static bool have_rcu_nocb_mask; /* Was rcu_nocb_mask allocated? */
-static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */
-#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
+ return rcu_segcblist_is_offloaded(&rdp->cblist);
+}
/*
* Check the RCU kernel configuration parameters and print informative
- * messages about anything out of the ordinary. If you like #ifdef, you
- * will love this function.
+ * messages about anything out of the ordinary.
*/
static void __init rcu_bootup_announce_oddness(void)
{
if (IS_ENABLED(CONFIG_RCU_TRACE))
- pr_info("\tRCU debugfs-based tracing is enabled.\n");
- if ((IS_ENABLED(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) ||
- (!IS_ENABLED(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32))
- pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n",
- CONFIG_RCU_FANOUT);
- if (IS_ENABLED(CONFIG_RCU_FANOUT_EXACT))
+ pr_info("\tRCU event tracing is enabled.\n");
+ if ((IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 64) ||
+ (!IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 32))
+ pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d.\n",
+ RCU_FANOUT);
+ if (rcu_fanout_exact)
pr_info("\tHierarchical RCU autobalancing is disabled.\n");
- if (IS_ENABLED(CONFIG_RCU_FAST_NO_HZ))
- pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n");
if (IS_ENABLED(CONFIG_PROVE_RCU))
pr_info("\tRCU lockdep checking is enabled.\n");
- if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_RUNNABLE))
- pr_info("\tRCU torture testing starts during boot.\n");
- if (IS_ENABLED(CONFIG_RCU_CPU_STALL_INFO))
- pr_info("\tAdditional per-CPU info printed with stalls.\n");
- if (NUM_RCU_LVL_4 != 0)
- pr_info("\tFour-level hierarchy is enabled.\n");
- if (CONFIG_RCU_FANOUT_LEAF != 16)
+ if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD))
+ pr_info("\tRCU strict (and thus non-scalable) grace periods are enabled.\n");
+ if (RCU_NUM_LVLS >= 4)
+ pr_info("\tFour(or more)-level hierarchy is enabled.\n");
+ if (RCU_FANOUT_LEAF != 16)
pr_info("\tBuild-time adjustment of leaf fanout to %d.\n",
- CONFIG_RCU_FANOUT_LEAF);
- if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF)
- pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
+ RCU_FANOUT_LEAF);
+ if (rcu_fanout_leaf != RCU_FANOUT_LEAF)
+ pr_info("\tBoot-time adjustment of leaf fanout to %d.\n",
+ rcu_fanout_leaf);
if (nr_cpu_ids != NR_CPUS)
- pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
- if (IS_ENABLED(CONFIG_RCU_BOOST))
- pr_info("\tRCU kthread priority: %d.\n", kthread_prio);
+ pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%u.\n", NR_CPUS, nr_cpu_ids);
+#ifdef CONFIG_RCU_BOOST
+ pr_info("\tRCU priority boosting: priority %d delay %d ms.\n",
+ kthread_prio, CONFIG_RCU_BOOST_DELAY);
+#endif
+ if (blimit != DEFAULT_RCU_BLIMIT)
+ pr_info("\tBoot-time adjustment of callback invocation limit to %ld.\n", blimit);
+ if (qhimark != DEFAULT_RCU_QHIMARK)
+ pr_info("\tBoot-time adjustment of callback high-water mark to %ld.\n", qhimark);
+ if (qlowmark != DEFAULT_RCU_QLOMARK)
+ pr_info("\tBoot-time adjustment of callback low-water mark to %ld.\n", qlowmark);
+ if (qovld != DEFAULT_RCU_QOVLD)
+ pr_info("\tBoot-time adjustment of callback overload level to %ld.\n", qovld);
+ if (jiffies_till_first_fqs != ULONG_MAX)
+ pr_info("\tBoot-time adjustment of first FQS scan delay to %ld jiffies.\n", jiffies_till_first_fqs);
+ if (jiffies_till_next_fqs != ULONG_MAX)
+ pr_info("\tBoot-time adjustment of subsequent FQS scan delay to %ld jiffies.\n", jiffies_till_next_fqs);
+ if (jiffies_till_sched_qs != ULONG_MAX)
+ pr_info("\tBoot-time adjustment of scheduler-enlistment delay to %ld jiffies.\n", jiffies_till_sched_qs);
+ if (rcu_kick_kthreads)
+ pr_info("\tKick kthreads if too-long grace period.\n");
+ if (IS_ENABLED(CONFIG_DEBUG_OBJECTS_RCU_HEAD))
+ pr_info("\tRCU callback double-/use-after-free debug is enabled.\n");
+ if (gp_preinit_delay)
+ pr_info("\tRCU debug GP pre-init slowdown %d jiffies.\n", gp_preinit_delay);
+ if (gp_init_delay)
+ pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay);
+ if (gp_cleanup_delay)
+ pr_info("\tRCU debug GP cleanup slowdown %d jiffies.\n", gp_cleanup_delay);
+ if (nohz_full_patience_delay < 0) {
+ pr_info("\tRCU NOCB CPU patience negative (%d), resetting to zero.\n", nohz_full_patience_delay);
+ nohz_full_patience_delay = 0;
+ } else if (nohz_full_patience_delay > 5 * MSEC_PER_SEC) {
+ pr_info("\tRCU NOCB CPU patience too large (%d), resetting to %ld.\n", nohz_full_patience_delay, 5 * MSEC_PER_SEC);
+ nohz_full_patience_delay = 5 * MSEC_PER_SEC;
+ } else if (nohz_full_patience_delay) {
+ pr_info("\tRCU NOCB CPU patience set to %d milliseconds.\n", nohz_full_patience_delay);
+ }
+ nohz_full_patience_delay_jiffies = msecs_to_jiffies(nohz_full_patience_delay);
+ if (!use_softirq)
+ pr_info("\tRCU_SOFTIRQ processing moved to rcuc kthreads.\n");
+ if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG))
+ pr_info("\tRCU debug extended QS entry/exit.\n");
+ rcupdate_announce_bootup_oddness();
}
#ifdef CONFIG_PREEMPT_RCU
-RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
-static struct rcu_state *rcu_state_p = &rcu_preempt_state;
-
-static int rcu_preempted_readers_exp(struct rcu_node *rnp);
-static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
- bool wake);
+static void rcu_report_exp_rnp(struct rcu_node *rnp, bool wake);
+static void rcu_read_unlock_special(struct task_struct *t);
/*
* Tell them what RCU they are running.
@@ -105,24 +125,186 @@ static void __init rcu_bootup_announce(void)
rcu_bootup_announce_oddness();
}
+/* Flags for rcu_preempt_ctxt_queue() decision table. */
+#define RCU_GP_TASKS 0x8
+#define RCU_EXP_TASKS 0x4
+#define RCU_GP_BLKD 0x2
+#define RCU_EXP_BLKD 0x1
+
+/*
+ * Queues a task preempted within an RCU-preempt read-side critical
+ * section into the appropriate location within the ->blkd_tasks list,
+ * depending on the states of any ongoing normal and expedited grace
+ * periods. The ->gp_tasks pointer indicates which element the normal
+ * grace period is waiting on (NULL if none), and the ->exp_tasks pointer
+ * indicates which element the expedited grace period is waiting on (again,
+ * NULL if none). If a grace period is waiting on a given element in the
+ * ->blkd_tasks list, it also waits on all subsequent elements. Thus,
+ * adding a task to the tail of the list blocks any grace period that is
+ * already waiting on one of the elements. In contrast, adding a task
+ * to the head of the list won't block any grace period that is already
+ * waiting on one of the elements.
+ *
+ * This queuing is imprecise, and can sometimes make an ongoing grace
+ * period wait for a task that is not strictly speaking blocking it.
+ * Given the choice, we needlessly block a normal grace period rather than
+ * blocking an expedited grace period.
+ *
+ * Note that an endless sequence of expedited grace periods still cannot
+ * indefinitely postpone a normal grace period. Eventually, all of the
+ * fixed number of preempted tasks blocking the normal grace period that are
+ * not also blocking the expedited grace period will resume and complete
+ * their RCU read-side critical sections. At that point, the ->gp_tasks
+ * pointer will equal the ->exp_tasks pointer, at which point the end of
+ * the corresponding expedited grace period will also be the end of the
+ * normal grace period.
+ */
+static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
+ __releases(rnp->lock) /* But leaves rrupts disabled. */
+{
+ int blkd_state = (rnp->gp_tasks ? RCU_GP_TASKS : 0) +
+ (rnp->exp_tasks ? RCU_EXP_TASKS : 0) +
+ (rnp->qsmask & rdp->grpmask ? RCU_GP_BLKD : 0) +
+ (rnp->expmask & rdp->grpmask ? RCU_EXP_BLKD : 0);
+ struct task_struct *t = current;
+
+ raw_lockdep_assert_held_rcu_node(rnp);
+ WARN_ON_ONCE(rdp->mynode != rnp);
+ WARN_ON_ONCE(!rcu_is_leaf_node(rnp));
+ /* RCU better not be waiting on newly onlined CPUs! */
+ WARN_ON_ONCE(rnp->qsmaskinitnext & ~rnp->qsmaskinit & rnp->qsmask &
+ rdp->grpmask);
+
+ /*
+ * Decide where to queue the newly blocked task. In theory,
+ * this could be an if-statement. In practice, when I tried
+ * that, it was quite messy.
+ */
+ switch (blkd_state) {
+ case 0:
+ case RCU_EXP_TASKS:
+ case RCU_EXP_TASKS | RCU_GP_BLKD:
+ case RCU_GP_TASKS:
+ case RCU_GP_TASKS | RCU_EXP_TASKS:
+
+ /*
+ * Blocking neither GP, or first task blocking the normal
+ * GP but not blocking the already-waiting expedited GP.
+ * Queue at the head of the list to avoid unnecessarily
+ * blocking the already-waiting GPs.
+ */
+ list_add(&t->rcu_node_entry, &rnp->blkd_tasks);
+ break;
+
+ case RCU_EXP_BLKD:
+ case RCU_GP_BLKD:
+ case RCU_GP_BLKD | RCU_EXP_BLKD:
+ case RCU_GP_TASKS | RCU_EXP_BLKD:
+ case RCU_GP_TASKS | RCU_GP_BLKD | RCU_EXP_BLKD:
+ case RCU_GP_TASKS | RCU_EXP_TASKS | RCU_GP_BLKD | RCU_EXP_BLKD:
+
+ /*
+ * First task arriving that blocks either GP, or first task
+ * arriving that blocks the expedited GP (with the normal
+ * GP already waiting), or a task arriving that blocks
+ * both GPs with both GPs already waiting. Queue at the
+ * tail of the list to avoid any GP waiting on any of the
+ * already queued tasks that are not blocking it.
+ */
+ list_add_tail(&t->rcu_node_entry, &rnp->blkd_tasks);
+ break;
+
+ case RCU_EXP_TASKS | RCU_EXP_BLKD:
+ case RCU_EXP_TASKS | RCU_GP_BLKD | RCU_EXP_BLKD:
+ case RCU_GP_TASKS | RCU_EXP_TASKS | RCU_EXP_BLKD:
+
+ /*
+ * Second or subsequent task blocking the expedited GP.
+ * The task either does not block the normal GP, or is the
+ * first task blocking the normal GP. Queue just after
+ * the first task blocking the expedited GP.
+ */
+ list_add(&t->rcu_node_entry, rnp->exp_tasks);
+ break;
+
+ case RCU_GP_TASKS | RCU_GP_BLKD:
+ case RCU_GP_TASKS | RCU_EXP_TASKS | RCU_GP_BLKD:
+
+ /*
+ * Second or subsequent task blocking the normal GP.
+ * The task does not block the expedited GP. Queue just
+ * after the first task blocking the normal GP.
+ */
+ list_add(&t->rcu_node_entry, rnp->gp_tasks);
+ break;
+
+ default:
+
+ /* Yet another exercise in excessive paranoia. */
+ WARN_ON_ONCE(1);
+ break;
+ }
+
+ /*
+ * We have now queued the task. If it was the first one to
+ * block either grace period, update the ->gp_tasks and/or
+ * ->exp_tasks pointers, respectively, to reference the newly
+ * blocked tasks.
+ */
+ if (!rnp->gp_tasks && (blkd_state & RCU_GP_BLKD)) {
+ WRITE_ONCE(rnp->gp_tasks, &t->rcu_node_entry);
+ WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq);
+ }
+ if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD))
+ WRITE_ONCE(rnp->exp_tasks, &t->rcu_node_entry);
+ WARN_ON_ONCE(!(blkd_state & RCU_GP_BLKD) !=
+ !(rnp->qsmask & rdp->grpmask));
+ WARN_ON_ONCE(!(blkd_state & RCU_EXP_BLKD) !=
+ !(rnp->expmask & rdp->grpmask));
+ raw_spin_unlock_rcu_node(rnp); /* interrupts remain disabled. */
+
+ /*
+ * Report the quiescent state for the expedited GP. This expedited
+ * GP should not be able to end until we report, so there should be
+ * no need to check for a subsequent expedited GP. (Though we are
+ * still in a quiescent state in any case.)
+ *
+ * Interrupts are disabled, so ->cpu_no_qs.b.exp cannot change.
+ */
+ if (blkd_state & RCU_EXP_BLKD && rdp->cpu_no_qs.b.exp)
+ rcu_report_exp_rdp(rdp);
+ else
+ WARN_ON_ONCE(rdp->cpu_no_qs.b.exp);
+ ASSERT_EXCLUSIVE_WRITER_SCOPED(rdp->cpu_no_qs.b.exp);
+}
+
/*
- * Record a preemptible-RCU quiescent state for the specified CPU. Note
- * that this just means that the task currently running on the CPU is
- * not in a quiescent state. There might be any number of tasks blocked
- * while in an RCU read-side critical section.
+ * Record a preemptible-RCU quiescent state for the specified CPU.
+ * Note that this does not necessarily mean that the task currently running
+ * on the CPU is in a quiescent state: Instead, it means that the current
+ * grace period need not wait on any RCU read-side critical section that
+ * starts later on this CPU. It also means that if the current task is
+ * in an RCU read-side critical section, it has already added itself to
+ * some leaf rcu_node structure's ->blkd_tasks list. In addition to the
+ * current task, there might be any number of other tasks blocked while
+ * in an RCU read-side critical section.
+ *
+ * Unlike non-preemptible-RCU, quiescent state reports for expedited
+ * grace periods are handled separately via deferred quiescent states
+ * and context switch events.
*
- * As with the other rcu_*_qs() functions, callers to this function
- * must disable preemption.
+ * Callers to this function must disable preemption.
*/
-static void rcu_preempt_qs(void)
+static void rcu_qs(void)
{
- if (!__this_cpu_read(rcu_preempt_data.passed_quiesce)) {
+ RCU_LOCKDEP_WARN(preemptible(), "rcu_qs() invoked with preemption enabled!!!\n");
+ if (__this_cpu_read(rcu_data.cpu_no_qs.b.norm)) {
trace_rcu_grace_period(TPS("rcu_preempt"),
- __this_cpu_read(rcu_preempt_data.gpnum),
+ __this_cpu_read(rcu_data.gp_seq),
TPS("cpuqs"));
- __this_cpu_write(rcu_preempt_data.passed_quiesce, 1);
- barrier(); /* Coordinate with rcu_preempt_check_callbacks(). */
- current->rcu_read_unlock_special.b.need_qs = false;
+ __this_cpu_write(rcu_data.cpu_no_qs.b.norm, false);
+ barrier(); /* Coordinate with rcu_flavor_sched_clock_irq(). */
+ WRITE_ONCE(current->rcu_read_unlock_special.b.need_qs, false);
}
}
@@ -137,72 +319,41 @@ static void rcu_preempt_qs(void)
* predating the current grace period drain, in other words, until
* rnp->gp_tasks becomes NULL.
*
- * Caller must disable preemption.
+ * Caller must disable interrupts.
*/
-static void rcu_preempt_note_context_switch(void)
+void rcu_note_context_switch(bool preempt)
{
struct task_struct *t = current;
- unsigned long flags;
- struct rcu_data *rdp;
+ struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
struct rcu_node *rnp;
- if (t->rcu_read_lock_nesting > 0 &&
+ trace_rcu_utilization(TPS("Start context switch"));
+ lockdep_assert_irqs_disabled();
+ WARN_ONCE(!preempt && rcu_preempt_depth() > 0, "Voluntary context switch within RCU read-side critical section!");
+ if (rcu_preempt_depth() > 0 &&
!t->rcu_read_unlock_special.b.blocked) {
/* Possibly blocking in an RCU read-side critical section. */
- rdp = this_cpu_ptr(rcu_preempt_state.rda);
rnp = rdp->mynode;
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_rcu_node(rnp);
t->rcu_read_unlock_special.b.blocked = true;
t->rcu_blocked_node = rnp;
/*
- * If this CPU has already checked in, then this task
- * will hold up the next grace period rather than the
- * current grace period. Queue the task accordingly.
- * If the task is queued for the current grace period
- * (i.e., this CPU has not yet passed through a quiescent
- * state for the current grace period), then as long
- * as that task remains queued, the current grace period
- * cannot end. Note that there is some uncertainty as
- * to exactly when the current grace period started.
- * We take a conservative approach, which can result
- * in unnecessarily waiting on tasks that started very
- * slightly after the current grace period began. C'est
- * la vie!!!
- *
- * But first, note that the current CPU must still be
- * on line!
+ * Verify the CPU's sanity, trace the preemption, and
+ * then queue the task as required based on the states
+ * of any ongoing and expedited grace periods.
*/
- WARN_ON_ONCE((rdp->grpmask & rcu_rnp_online_cpus(rnp)) == 0);
+ WARN_ON_ONCE(!rcu_rdp_cpu_online(rdp));
WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
- if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) {
- list_add(&t->rcu_node_entry, rnp->gp_tasks->prev);
- rnp->gp_tasks = &t->rcu_node_entry;
-#ifdef CONFIG_RCU_BOOST
- if (rnp->boost_tasks != NULL)
- rnp->boost_tasks = rnp->gp_tasks;
-#endif /* #ifdef CONFIG_RCU_BOOST */
- } else {
- list_add(&t->rcu_node_entry, &rnp->blkd_tasks);
- if (rnp->qsmask & rdp->grpmask)
- rnp->gp_tasks = &t->rcu_node_entry;
- }
- trace_rcu_preempt_task(rdp->rsp->name,
+ trace_rcu_preempt_task(rcu_state.name,
t->pid,
(rnp->qsmask & rdp->grpmask)
- ? rnp->gpnum
- : rnp->gpnum + 1);
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- } else if (t->rcu_read_lock_nesting < 0 &&
- t->rcu_read_unlock_special.s) {
-
- /*
- * Complete exit from RCU read-side critical section on
- * behalf of preempted instance of __rcu_read_unlock().
- */
- rcu_read_unlock_special(t);
+ ? rnp->gp_seq
+ : rcu_seq_snap(&rnp->gp_seq));
+ rcu_preempt_ctxt_queue(rnp, rdp);
+ } else {
+ rcu_preempt_deferred_qs(t);
}
/*
@@ -214,8 +365,13 @@ static void rcu_preempt_note_context_switch(void)
* grace period, then the fact that the task has been enqueued
* means that we continue to block the current grace period.
*/
- rcu_preempt_qs();
+ rcu_qs();
+ if (rdp->cpu_no_qs.b.exp)
+ rcu_report_exp_rdp(rdp);
+ rcu_tasks_qs(current, preempt);
+ trace_rcu_utilization(TPS("End context switch"));
}
+EXPORT_SYMBOL_GPL(rcu_note_context_switch);
/*
* Check for preempted RCU readers blocking the current grace period
@@ -224,8 +380,70 @@ static void rcu_preempt_note_context_switch(void)
*/
static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
{
- return rnp->gp_tasks != NULL;
+ return READ_ONCE(rnp->gp_tasks) != NULL;
+}
+
+/* limit value for ->rcu_read_lock_nesting. */
+#define RCU_NEST_PMAX (INT_MAX / 2)
+
+static void rcu_preempt_read_enter(void)
+{
+ WRITE_ONCE(current->rcu_read_lock_nesting, READ_ONCE(current->rcu_read_lock_nesting) + 1);
+}
+
+static int rcu_preempt_read_exit(void)
+{
+ int ret = READ_ONCE(current->rcu_read_lock_nesting) - 1;
+
+ WRITE_ONCE(current->rcu_read_lock_nesting, ret);
+ return ret;
+}
+
+static void rcu_preempt_depth_set(int val)
+{
+ WRITE_ONCE(current->rcu_read_lock_nesting, val);
+}
+
+/*
+ * Preemptible RCU implementation for rcu_read_lock().
+ * Just increment ->rcu_read_lock_nesting, shared state will be updated
+ * if we block.
+ */
+void __rcu_read_lock(void)
+{
+ rcu_preempt_read_enter();
+ if (IS_ENABLED(CONFIG_PROVE_LOCKING))
+ WARN_ON_ONCE(rcu_preempt_depth() > RCU_NEST_PMAX);
+ if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) && rcu_state.gp_kthread)
+ WRITE_ONCE(current->rcu_read_unlock_special.b.need_qs, true);
+ barrier(); /* critical section after entry code. */
}
+EXPORT_SYMBOL_GPL(__rcu_read_lock);
+
+/*
+ * Preemptible RCU implementation for rcu_read_unlock().
+ * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
+ * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
+ * invoke rcu_read_unlock_special() to clean up after a context switch
+ * in an RCU read-side critical section and other special cases.
+ */
+void __rcu_read_unlock(void)
+{
+ struct task_struct *t = current;
+
+ barrier(); // critical section before exit code.
+ if (rcu_preempt_read_exit() == 0) {
+ barrier(); // critical-section exit before .s check.
+ if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s)))
+ rcu_read_unlock_special(t);
+ }
+ if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
+ int rrln = rcu_preempt_depth();
+
+ WARN_ON_ONCE(rrln < 0 || rrln > RCU_NEST_PMAX);
+ }
+}
+EXPORT_SYMBOL_GPL(__rcu_read_unlock);
/*
* Advance a ->blkd_tasks-list pointer to the next entry, instead
@@ -252,91 +470,88 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
}
/*
- * Handle special cases during rcu_read_unlock(), such as needing to
- * notify RCU core processing or task having blocked during the RCU
- * read-side critical section.
+ * Report deferred quiescent states. The deferral time can
+ * be quite short, for example, in the case of the call from
+ * rcu_read_unlock_special().
*/
-void rcu_read_unlock_special(struct task_struct *t)
+static notrace void
+rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
{
bool empty_exp;
bool empty_norm;
bool empty_exp_now;
- unsigned long flags;
struct list_head *np;
-#ifdef CONFIG_RCU_BOOST
bool drop_boost_mutex = false;
-#endif /* #ifdef CONFIG_RCU_BOOST */
+ struct rcu_data *rdp;
struct rcu_node *rnp;
union rcu_special special;
- /* NMI handlers cannot block and cannot safely manipulate state. */
- if (in_nmi())
- return;
-
- local_irq_save(flags);
+ rdp = this_cpu_ptr(&rcu_data);
+ if (rdp->defer_qs_iw_pending == DEFER_QS_PENDING)
+ rdp->defer_qs_iw_pending = DEFER_QS_IDLE;
/*
- * If RCU core is waiting for this CPU to exit critical section,
- * let it know that we have done so. Because irqs are disabled,
+ * If RCU core is waiting for this CPU to exit its critical section,
+ * report the fact that it has exited. Because irqs are disabled,
* t->rcu_read_unlock_special cannot change.
*/
special = t->rcu_read_unlock_special;
+ if (!special.s && !rdp->cpu_no_qs.b.exp) {
+ local_irq_restore(flags);
+ return;
+ }
+ t->rcu_read_unlock_special.s = 0;
if (special.b.need_qs) {
- rcu_preempt_qs();
- t->rcu_read_unlock_special.b.need_qs = false;
- if (!t->rcu_read_unlock_special.s) {
- local_irq_restore(flags);
- return;
+ if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) {
+ rdp->cpu_no_qs.b.norm = false;
+ rcu_report_qs_rdp(rdp);
+ udelay(rcu_unlock_delay);
+ } else {
+ rcu_qs();
}
}
- /* Hardware IRQ handlers cannot block, complain if they get here. */
- if (in_irq() || in_serving_softirq()) {
- lockdep_rcu_suspicious(__FILE__, __LINE__,
- "rcu_read_unlock() from irq or softirq with blocking in critical section!!!\n");
- pr_alert("->rcu_read_unlock_special: %#x (b: %d, nq: %d)\n",
- t->rcu_read_unlock_special.s,
- t->rcu_read_unlock_special.b.blocked,
- t->rcu_read_unlock_special.b.need_qs);
- local_irq_restore(flags);
- return;
- }
+ /*
+ * Respond to a request by an expedited grace period for a
+ * quiescent state from this CPU. Note that requests from
+ * tasks are handled when removing the task from the
+ * blocked-tasks list below.
+ */
+ if (rdp->cpu_no_qs.b.exp)
+ rcu_report_exp_rdp(rdp);
/* Clean up if blocked during RCU read-side critical section. */
if (special.b.blocked) {
- t->rcu_read_unlock_special.b.blocked = false;
/*
- * Remove this task from the list it blocked on. The
- * task can migrate while we acquire the lock, but at
- * most one time. So at most two passes through loop.
+ * Remove this task from the list it blocked on. The task
+ * now remains queued on the rcu_node corresponding to the
+ * CPU it first blocked on, so there is no longer any need
+ * to loop. Retain a WARN_ON_ONCE() out of sheer paranoia.
*/
- for (;;) {
- rnp = t->rcu_blocked_node;
- raw_spin_lock(&rnp->lock); /* irqs already disabled. */
- smp_mb__after_unlock_lock();
- if (rnp == t->rcu_blocked_node)
- break;
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
- }
+ rnp = t->rcu_blocked_node;
+ raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
+ WARN_ON_ONCE(rnp != t->rcu_blocked_node);
+ WARN_ON_ONCE(!rcu_is_leaf_node(rnp));
empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
- empty_exp = !rcu_preempted_readers_exp(rnp);
- smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
+ WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq &&
+ (!empty_norm || rnp->qsmask));
+ empty_exp = sync_rcu_exp_done(rnp);
np = rcu_next_node_entry(t, rnp);
list_del_init(&t->rcu_node_entry);
t->rcu_blocked_node = NULL;
trace_rcu_unlock_preempted_task(TPS("rcu_preempt"),
- rnp->gpnum, t->pid);
+ rnp->gp_seq, t->pid);
if (&t->rcu_node_entry == rnp->gp_tasks)
- rnp->gp_tasks = np;
+ WRITE_ONCE(rnp->gp_tasks, np);
if (&t->rcu_node_entry == rnp->exp_tasks)
- rnp->exp_tasks = np;
-#ifdef CONFIG_RCU_BOOST
- if (&t->rcu_node_entry == rnp->boost_tasks)
- rnp->boost_tasks = np;
- /* Snapshot ->boost_mtx ownership with rcu_node lock held. */
- drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t;
-#endif /* #ifdef CONFIG_RCU_BOOST */
+ WRITE_ONCE(rnp->exp_tasks, np);
+ if (IS_ENABLED(CONFIG_RCU_BOOST)) {
+ /* Snapshot ->boost_mtx ownership w/rnp->lock held. */
+ drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx.rtmutex) == t;
+ if (&t->rcu_node_entry == rnp->boost_tasks)
+ WRITE_ONCE(rnp->boost_tasks, np);
+ }
/*
* If this was the last task on the current list, and if
@@ -344,511 +559,451 @@ void rcu_read_unlock_special(struct task_struct *t)
* Note that rcu_report_unblock_qs_rnp() releases rnp->lock,
* so we must take a snapshot of the expedited state.
*/
- empty_exp_now = !rcu_preempted_readers_exp(rnp);
+ empty_exp_now = sync_rcu_exp_done(rnp);
if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) {
trace_rcu_quiescent_state_report(TPS("preempt_rcu"),
- rnp->gpnum,
+ rnp->gp_seq,
0, rnp->qsmask,
rnp->level,
rnp->grplo,
rnp->grphi,
!!rnp->gp_tasks);
- rcu_report_unblock_qs_rnp(&rcu_preempt_state,
- rnp, flags);
+ rcu_report_unblock_qs_rnp(rnp, flags);
} else {
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
-#ifdef CONFIG_RCU_BOOST
- /* Unboost if we were boosted. */
- if (drop_boost_mutex)
- rt_mutex_unlock(&rnp->boost_mtx);
-#endif /* #ifdef CONFIG_RCU_BOOST */
-
/*
* If this was the last task on the expedited lists,
* then we need to report up the rcu_node hierarchy.
*/
if (!empty_exp && empty_exp_now)
- rcu_report_exp_rnp(&rcu_preempt_state, rnp, true);
+ rcu_report_exp_rnp(rnp, true);
+
+ /* Unboost if we were boosted. */
+ if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex)
+ rt_mutex_futex_unlock(&rnp->boost_mtx.rtmutex);
} else {
local_irq_restore(flags);
}
}
/*
- * Dump detailed information for all tasks blocking the current RCU
- * grace period on the specified rcu_node structure.
+ * Is a deferred quiescent-state pending, and are we also not in
+ * an RCU read-side critical section? It is the caller's responsibility
+ * to ensure it is otherwise safe to report any deferred quiescent
+ * states. The reason for this is that it is safe to report a
+ * quiescent state during context switch even though preemption
+ * is disabled. This function cannot be expected to understand these
+ * nuances, so the caller must handle them.
+ */
+static notrace bool rcu_preempt_need_deferred_qs(struct task_struct *t)
+{
+ return (__this_cpu_read(rcu_data.cpu_no_qs.b.exp) ||
+ READ_ONCE(t->rcu_read_unlock_special.s)) &&
+ rcu_preempt_depth() == 0;
+}
+
+/*
+ * Report a deferred quiescent state if needed and safe to do so.
+ * As with rcu_preempt_need_deferred_qs(), "safe" involves only
+ * not being in an RCU read-side critical section. The caller must
+ * evaluate safety in terms of interrupt, softirq, and preemption
+ * disabling.
*/
-static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
+notrace void rcu_preempt_deferred_qs(struct task_struct *t)
{
unsigned long flags;
- struct task_struct *t;
- raw_spin_lock_irqsave(&rnp->lock, flags);
- if (!rcu_preempt_blocked_readers_cgp(rnp)) {
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ if (!rcu_preempt_need_deferred_qs(t))
return;
- }
- t = list_entry(rnp->gp_tasks,
- struct task_struct, rcu_node_entry);
- list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
- sched_show_task(t);
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ local_irq_save(flags);
+ rcu_preempt_deferred_qs_irqrestore(t, flags);
}
/*
- * Dump detailed information for all tasks blocking the current RCU
- * grace period.
+ * Minimal handler to give the scheduler a chance to re-evaluate.
*/
-static void rcu_print_detail_task_stall(struct rcu_state *rsp)
+static void rcu_preempt_deferred_qs_handler(struct irq_work *iwp)
{
- struct rcu_node *rnp = rcu_get_root(rsp);
-
- rcu_print_detail_task_stall_rnp(rnp);
- rcu_for_each_leaf_node(rsp, rnp)
- rcu_print_detail_task_stall_rnp(rnp);
-}
+ struct rcu_data *rdp;
-#ifdef CONFIG_RCU_CPU_STALL_INFO
+ lockdep_assert_irqs_disabled();
+ rdp = container_of(iwp, struct rcu_data, defer_qs_iw);
-static void rcu_print_task_stall_begin(struct rcu_node *rnp)
-{
- pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):",
- rnp->level, rnp->grplo, rnp->grphi);
+ /*
+ * If the IRQ work handler happens to run in the middle of RCU read-side
+ * critical section, it could be ineffective in getting the scheduler's
+ * attention to report a deferred quiescent state (the whole point of the
+ * IRQ work). For this reason, requeue the IRQ work.
+ *
+ * Basically, we want to avoid following situation:
+ * 1. rcu_read_unlock() queues IRQ work (state -> DEFER_QS_PENDING)
+ * 2. CPU enters new rcu_read_lock()
+ * 3. IRQ work runs but cannot report QS due to rcu_preempt_depth() > 0
+ * 4. rcu_read_unlock() does not re-queue work (state still PENDING)
+ * 5. Deferred QS reporting does not happen.
+ */
+ if (rcu_preempt_depth() > 0)
+ WRITE_ONCE(rdp->defer_qs_iw_pending, DEFER_QS_IDLE);
}
-static void rcu_print_task_stall_end(void)
+/*
+ * Check if expedited grace period processing during unlock is needed.
+ *
+ * This function determines whether expedited handling is required based on:
+ * 1. Task blocking an expedited grace period (based on a heuristic, could be
+ * false-positive, see below.)
+ * 2. CPU participating in an expedited grace period
+ * 3. Strict grace period mode requiring expedited handling
+ * 4. RCU priority deboosting needs when interrupts were disabled
+ *
+ * @t: The task being checked
+ * @rdp: The per-CPU RCU data
+ * @rnp: The RCU node for this CPU
+ * @irqs_were_disabled: Whether interrupts were disabled before rcu_read_unlock()
+ *
+ * Returns true if expedited processing of the rcu_read_unlock() is needed.
+ */
+static bool rcu_unlock_needs_exp_handling(struct task_struct *t,
+ struct rcu_data *rdp,
+ struct rcu_node *rnp,
+ bool irqs_were_disabled)
{
- pr_cont("\n");
-}
+ /*
+ * Check if this task is blocking an expedited grace period. If the
+ * task was preempted within an RCU read-side critical section and is
+ * on the expedited grace period blockers list (exp_tasks), we need
+ * expedited handling to unblock the expedited GP. This is not an exact
+ * check because 't' might not be on the exp_tasks list at all - its
+ * just a fast heuristic that can be false-positive sometimes.
+ */
+ if (t->rcu_blocked_node && READ_ONCE(t->rcu_blocked_node->exp_tasks))
+ return true;
-#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
+ /*
+ * Check if this CPU is participating in an expedited grace period.
+ * The expmask bitmap tracks which CPUs need to check in for the
+ * current expedited GP. If our CPU's bit is set, we need expedited
+ * handling to help complete the expedited GP.
+ */
+ if (rdp->grpmask & READ_ONCE(rnp->expmask))
+ return true;
-static void rcu_print_task_stall_begin(struct rcu_node *rnp)
-{
-}
+ /*
+ * In CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels, all grace periods
+ * are treated as short for testing purposes even if that means
+ * disturbing the system more. Check if either:
+ * - This CPU has not yet reported a quiescent state, or
+ * - This task was preempted within an RCU critical section
+ * In either case, require expedited handling for strict GP mode.
+ */
+ if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) &&
+ ((rdp->grpmask & READ_ONCE(rnp->qsmask)) || t->rcu_blocked_node))
+ return true;
-static void rcu_print_task_stall_end(void)
-{
-}
+ /*
+ * RCU priority boosting case: If a task is subject to RCU priority
+ * boosting and exits an RCU read-side critical section with interrupts
+ * disabled, we need expedited handling to ensure timely deboosting.
+ * Without this, a low-priority task could incorrectly run at high
+ * real-time priority for an extended period degrading real-time
+ * responsiveness. This applies to all CONFIG_RCU_BOOST=y kernels,
+ * not just to PREEMPT_RT.
+ */
+ if (IS_ENABLED(CONFIG_RCU_BOOST) && irqs_were_disabled && t->rcu_blocked_node)
+ return true;
-#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */
+ return false;
+}
/*
- * Scan the current list of tasks blocked within RCU read-side critical
- * sections, printing out the tid of each.
+ * Handle special cases during rcu_read_unlock(), such as needing to
+ * notify RCU core processing or task having blocked during the RCU
+ * read-side critical section.
*/
-static int rcu_print_task_stall(struct rcu_node *rnp)
+static void rcu_read_unlock_special(struct task_struct *t)
{
- struct task_struct *t;
- int ndetected = 0;
+ unsigned long flags;
+ bool irqs_were_disabled;
+ bool preempt_bh_were_disabled =
+ !!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK));
- if (!rcu_preempt_blocked_readers_cgp(rnp))
- return 0;
- rcu_print_task_stall_begin(rnp);
- t = list_entry(rnp->gp_tasks,
- struct task_struct, rcu_node_entry);
- list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
- pr_cont(" P%d", t->pid);
- ndetected++;
+ /* NMI handlers cannot block and cannot safely manipulate state. */
+ if (in_nmi())
+ return;
+
+ local_irq_save(flags);
+ irqs_were_disabled = irqs_disabled_flags(flags);
+ if (preempt_bh_were_disabled || irqs_were_disabled) {
+ bool needs_exp; // Expedited handling needed.
+ struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
+ struct rcu_node *rnp = rdp->mynode;
+
+ needs_exp = rcu_unlock_needs_exp_handling(t, rdp, rnp, irqs_were_disabled);
+
+ // Need to defer quiescent state until everything is enabled.
+ if (use_softirq && (in_hardirq() || (needs_exp && !irqs_were_disabled))) {
+ // Using softirq, safe to awaken, and either the
+ // wakeup is free or there is either an expedited
+ // GP in flight or a potential need to deboost.
+ raise_softirq_irqoff(RCU_SOFTIRQ);
+ } else {
+ // Enabling BH or preempt does reschedule, so...
+ // Also if no expediting and no possible deboosting,
+ // slow is OK. Plus nohz_full CPUs eventually get
+ // tick enabled.
+ set_need_resched_current();
+ if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled &&
+ needs_exp && rdp->defer_qs_iw_pending != DEFER_QS_PENDING &&
+ cpu_online(rdp->cpu)) {
+ // Get scheduler to re-evaluate and call hooks.
+ // If !IRQ_WORK, FQS scan will eventually IPI.
+ rdp->defer_qs_iw_pending = DEFER_QS_PENDING;
+ irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu);
+ }
+ }
+ local_irq_restore(flags);
+ return;
}
- rcu_print_task_stall_end();
- return ndetected;
+ rcu_preempt_deferred_qs_irqrestore(t, flags);
}
/*
* Check that the list of blocked tasks for the newly completed grace
* period is in fact empty. It is a serious bug to complete a grace
* period that still has RCU readers blocked! This function must be
- * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock
- * must be held by the caller.
+ * invoked -before- updating this rnp's ->gp_seq.
*
* Also, if there are blocked tasks on the list, they automatically
* block the newly created grace period, so set up ->gp_tasks accordingly.
*/
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
{
- WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
- if (rcu_preempt_has_tasks(rnp))
- rnp->gp_tasks = rnp->blkd_tasks.next;
+ struct task_struct *t;
+
+ RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n");
+ raw_lockdep_assert_held_rcu_node(rnp);
+ if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)))
+ dump_blkd_tasks(rnp, 10);
+ if (rcu_preempt_has_tasks(rnp) &&
+ (rnp->qsmaskinit || rnp->wait_blkd_tasks)) {
+ WRITE_ONCE(rnp->gp_tasks, rnp->blkd_tasks.next);
+ t = container_of(rnp->gp_tasks, struct task_struct,
+ rcu_node_entry);
+ trace_rcu_unlock_preempted_task(TPS("rcu_preempt-GPS"),
+ rnp->gp_seq, t->pid);
+ }
WARN_ON_ONCE(rnp->qsmask);
}
/*
- * Check for a quiescent state from the current CPU. When a task blocks,
- * the task is recorded in the corresponding CPU's rcu_node structure,
- * which is checked elsewhere.
- *
- * Caller must disable hard irqs.
+ * Check for a quiescent state from the current CPU, including voluntary
+ * context switches for Tasks RCU. When a task blocks, the task is
+ * recorded in the corresponding CPU's rcu_node structure, which is checked
+ * elsewhere, hence this function need only check for quiescent states
+ * related to the current CPU, not to those related to tasks.
*/
-static void rcu_preempt_check_callbacks(void)
+static void rcu_flavor_sched_clock_irq(int user)
{
struct task_struct *t = current;
- if (t->rcu_read_lock_nesting == 0) {
- rcu_preempt_qs();
+ lockdep_assert_irqs_disabled();
+ if (rcu_preempt_depth() > 0 ||
+ (preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK))) {
+ /* No QS, force context switch if deferred. */
+ if (rcu_preempt_need_deferred_qs(t))
+ set_need_resched_current();
+ } else if (rcu_preempt_need_deferred_qs(t)) {
+ rcu_preempt_deferred_qs(t); /* Report deferred QS. */
+ return;
+ } else if (!WARN_ON_ONCE(rcu_preempt_depth())) {
+ rcu_qs(); /* Report immediate QS. */
return;
}
- if (t->rcu_read_lock_nesting > 0 &&
- __this_cpu_read(rcu_preempt_data.qs_pending) &&
- !__this_cpu_read(rcu_preempt_data.passed_quiesce))
- t->rcu_read_unlock_special.b.need_qs = true;
-}
-
-#ifdef CONFIG_RCU_BOOST
-static void rcu_preempt_do_callbacks(void)
-{
- rcu_do_batch(&rcu_preempt_state, this_cpu_ptr(&rcu_preempt_data));
+ /* If GP is oldish, ask for help from rcu_read_unlock_special(). */
+ if (rcu_preempt_depth() > 0 &&
+ __this_cpu_read(rcu_data.core_needs_qs) &&
+ __this_cpu_read(rcu_data.cpu_no_qs.b.norm) &&
+ !t->rcu_read_unlock_special.b.need_qs &&
+ time_after(jiffies, rcu_state.gp_start + HZ))
+ t->rcu_read_unlock_special.b.need_qs = true;
}
-#endif /* #ifdef CONFIG_RCU_BOOST */
-
/*
- * Queue a preemptible-RCU callback for invocation after a grace period.
+ * Check for a task exiting while in a preemptible-RCU read-side
+ * critical section, clean up if so. No need to issue warnings, as
+ * debug_check_no_locks_held() already does this if lockdep is enabled.
+ * Besides, if this function does anything other than just immediately
+ * return, there was a bug of some sort. Spewing warnings from this
+ * function is like as not to simply obscure important prior warnings.
*/
-void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+void exit_rcu(void)
{
- __call_rcu(head, func, &rcu_preempt_state, -1, 0);
-}
-EXPORT_SYMBOL_GPL(call_rcu);
+ struct task_struct *t = current;
-/**
- * synchronize_rcu - wait until a grace period has elapsed.
- *
- * Control will return to the caller some time after a full grace
- * period has elapsed, in other words after all currently executing RCU
- * read-side critical sections have completed. Note, however, that
- * upon return from synchronize_rcu(), the caller might well be executing
- * concurrently with new RCU read-side critical sections that began while
- * synchronize_rcu() was waiting. RCU read-side critical sections are
- * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
- *
- * See the description of synchronize_sched() for more detailed information
- * on memory ordering guarantees.
- */
-void synchronize_rcu(void)
-{
- rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
- !lock_is_held(&rcu_lock_map) &&
- !lock_is_held(&rcu_sched_lock_map),
- "Illegal synchronize_rcu() in RCU read-side critical section");
- if (!rcu_scheduler_active)
+ if (unlikely(!list_empty(&current->rcu_node_entry))) {
+ rcu_preempt_depth_set(1);
+ barrier();
+ WRITE_ONCE(t->rcu_read_unlock_special.b.blocked, true);
+ } else if (unlikely(rcu_preempt_depth())) {
+ rcu_preempt_depth_set(1);
+ } else {
return;
- if (rcu_gp_is_expedited())
- synchronize_rcu_expedited();
- else
- wait_rcu_gp(call_rcu);
-}
-EXPORT_SYMBOL_GPL(synchronize_rcu);
-
-static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
-static unsigned long sync_rcu_preempt_exp_count;
-static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
-
-/*
- * Return non-zero if there are any tasks in RCU read-side critical
- * sections blocking the current preemptible-RCU expedited grace period.
- * If there is no preemptible-RCU expedited grace period currently in
- * progress, returns zero unconditionally.
- */
-static int rcu_preempted_readers_exp(struct rcu_node *rnp)
-{
- return rnp->exp_tasks != NULL;
-}
-
-/*
- * return non-zero if there is no RCU expedited grace period in progress
- * for the specified rcu_node structure, in other words, if all CPUs and
- * tasks covered by the specified rcu_node structure have done their bit
- * for the current expedited grace period. Works only for preemptible
- * RCU -- other RCU implementation use other means.
- *
- * Caller must hold sync_rcu_preempt_exp_mutex.
- */
-static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
-{
- return !rcu_preempted_readers_exp(rnp) &&
- ACCESS_ONCE(rnp->expmask) == 0;
-}
-
-/*
- * Report the exit from RCU read-side critical section for the last task
- * that queued itself during or before the current expedited preemptible-RCU
- * grace period. This event is reported either to the rcu_node structure on
- * which the task was queued or to one of that rcu_node structure's ancestors,
- * recursively up the tree. (Calm down, calm down, we do the recursion
- * iteratively!)
- *
- * Caller must hold sync_rcu_preempt_exp_mutex.
- */
-static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
- bool wake)
-{
- unsigned long flags;
- unsigned long mask;
-
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock();
- for (;;) {
- if (!sync_rcu_preempt_exp_done(rnp)) {
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- break;
- }
- if (rnp->parent == NULL) {
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- if (wake) {
- smp_mb(); /* EGP done before wake_up(). */
- wake_up(&sync_rcu_preempt_exp_wq);
- }
- break;
- }
- mask = rnp->grpmask;
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
- rnp = rnp->parent;
- raw_spin_lock(&rnp->lock); /* irqs already disabled */
- smp_mb__after_unlock_lock();
- rnp->expmask &= ~mask;
}
+ __rcu_read_unlock();
+ rcu_preempt_deferred_qs(current);
}
/*
- * Snapshot the tasks blocking the newly started preemptible-RCU expedited
- * grace period for the specified rcu_node structure, phase 1. If there
- * are such tasks, set the ->expmask bits up the rcu_node tree and also
- * set the ->expmask bits on the leaf rcu_node structures to tell phase 2
- * that work is needed here.
- *
- * Caller must hold sync_rcu_preempt_exp_mutex.
+ * Dump the blocked-tasks state, but limit the list dump to the
+ * specified number of elements.
*/
static void
-sync_rcu_preempt_exp_init1(struct rcu_state *rsp, struct rcu_node *rnp)
+dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
{
- unsigned long flags;
- unsigned long mask;
- struct rcu_node *rnp_up;
-
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock();
- WARN_ON_ONCE(rnp->expmask);
- WARN_ON_ONCE(rnp->exp_tasks);
- if (!rcu_preempt_has_tasks(rnp)) {
- /* No blocked tasks, nothing to do. */
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- return;
- }
- /* Call for Phase 2 and propagate ->expmask bits up the tree. */
- rnp->expmask = 1;
- rnp_up = rnp;
- while (rnp_up->parent) {
- mask = rnp_up->grpmask;
- rnp_up = rnp_up->parent;
- if (rnp_up->expmask & mask)
+ int cpu;
+ int i;
+ struct list_head *lhp;
+ struct rcu_data *rdp;
+ struct rcu_node *rnp1;
+
+ raw_lockdep_assert_held_rcu_node(rnp);
+ pr_info("%s: grp: %d-%d level: %d ->gp_seq %ld ->completedqs %ld\n",
+ __func__, rnp->grplo, rnp->grphi, rnp->level,
+ (long)READ_ONCE(rnp->gp_seq), (long)rnp->completedqs);
+ for (rnp1 = rnp; rnp1; rnp1 = rnp1->parent)
+ pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx\n",
+ __func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext);
+ pr_info("%s: ->gp_tasks %p ->boost_tasks %p ->exp_tasks %p\n",
+ __func__, READ_ONCE(rnp->gp_tasks), data_race(rnp->boost_tasks),
+ READ_ONCE(rnp->exp_tasks));
+ pr_info("%s: ->blkd_tasks", __func__);
+ i = 0;
+ list_for_each(lhp, &rnp->blkd_tasks) {
+ pr_cont(" %p", lhp);
+ if (++i >= ncheck)
break;
- raw_spin_lock(&rnp_up->lock); /* irqs already off */
- smp_mb__after_unlock_lock();
- rnp_up->expmask |= mask;
- raw_spin_unlock(&rnp_up->lock); /* irqs still off */
}
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ pr_cont("\n");
+ for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) {
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+ pr_info("\t%d: %c online: %ld(%d) offline: %ld(%d)\n",
+ cpu, ".o"[rcu_rdp_cpu_online(rdp)],
+ (long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_state,
+ (long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_state);
+ }
}
-/*
- * Snapshot the tasks blocking the newly started preemptible-RCU expedited
- * grace period for the specified rcu_node structure, phase 2. If the
- * leaf rcu_node structure has its ->expmask field set, check for tasks.
- * If there are some, clear ->expmask and set ->exp_tasks accordingly,
- * then initiate RCU priority boosting. Otherwise, clear ->expmask and
- * invoke rcu_report_exp_rnp() to clear out the upper-level ->expmask bits,
- * enabling rcu_read_unlock_special() to do the bit-clearing.
- *
- * Caller must hold sync_rcu_preempt_exp_mutex.
- */
-static void
-sync_rcu_preempt_exp_init2(struct rcu_state *rsp, struct rcu_node *rnp)
+static void rcu_preempt_deferred_qs_init(struct rcu_data *rdp)
{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock();
- if (!rnp->expmask) {
- /* Phase 1 didn't do anything, so Phase 2 doesn't either. */
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- return;
- }
-
- /* Phase 1 is over. */
- rnp->expmask = 0;
-
- /*
- * If there are still blocked tasks, set up ->exp_tasks so that
- * rcu_read_unlock_special() will wake us and then boost them.
- */
- if (rcu_preempt_has_tasks(rnp)) {
- rnp->exp_tasks = rnp->blkd_tasks.next;
- rcu_initiate_boost(rnp, flags); /* releases rnp->lock */
- return;
- }
-
- /* No longer any blocked tasks, so undo bit setting. */
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- rcu_report_exp_rnp(rsp, rnp, false);
+ rdp->defer_qs_iw = IRQ_WORK_INIT_HARD(rcu_preempt_deferred_qs_handler);
}
+#else /* #ifdef CONFIG_PREEMPT_RCU */
-/**
- * synchronize_rcu_expedited - Brute-force RCU grace period
- *
- * Wait for an RCU-preempt grace period, but expedite it. The basic
- * idea is to invoke synchronize_sched_expedited() to push all the tasks to
- * the ->blkd_tasks lists and wait for this list to drain. This consumes
- * significant time on all CPUs and is unfriendly to real-time workloads,
- * so is thus not recommended for any sort of common-case code.
- * In fact, if you are using synchronize_rcu_expedited() in a loop,
- * please restructure your code to batch your updates, and then Use a
- * single synchronize_rcu() instead.
+/*
+ * If strict grace periods are enabled, and if the calling
+ * __rcu_read_unlock() marks the beginning of a quiescent state, immediately
+ * report that quiescent state and, if requested, spin for a bit.
*/
-void synchronize_rcu_expedited(void)
+void rcu_read_unlock_strict(void)
{
- struct rcu_node *rnp;
- struct rcu_state *rsp = &rcu_preempt_state;
- unsigned long snap;
- int trycount = 0;
-
- smp_mb(); /* Caller's modifications seen first by other CPUs. */
- snap = ACCESS_ONCE(sync_rcu_preempt_exp_count) + 1;
- smp_mb(); /* Above access cannot bleed into critical section. */
+ struct rcu_data *rdp;
- /*
- * Block CPU-hotplug operations. This means that any CPU-hotplug
- * operation that finds an rcu_node structure with tasks in the
- * process of being boosted will know that all tasks blocking
- * this expedited grace period will already be in the process of
- * being boosted. This simplifies the process of moving tasks
- * from leaf to root rcu_node structures.
- */
- if (!try_get_online_cpus()) {
- /* CPU-hotplug operation in flight, fall back to normal GP. */
- wait_rcu_gp(call_rcu);
+ if (irqs_disabled() || in_atomic_preempt_off() || !rcu_state.gp_kthread)
return;
- }
/*
- * Acquire lock, falling back to synchronize_rcu() if too many
- * lock-acquisition failures. Of course, if someone does the
- * expedited grace period for us, just leave.
- */
- while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) {
- if (ULONG_CMP_LT(snap,
- ACCESS_ONCE(sync_rcu_preempt_exp_count))) {
- put_online_cpus();
- goto mb_ret; /* Others did our work for us. */
- }
- if (trycount++ < 10) {
- udelay(trycount * num_online_cpus());
- } else {
- put_online_cpus();
- wait_rcu_gp(call_rcu);
- return;
- }
- }
- if (ULONG_CMP_LT(snap, ACCESS_ONCE(sync_rcu_preempt_exp_count))) {
- put_online_cpus();
- goto unlock_mb_ret; /* Others did our work for us. */
- }
-
- /* force all RCU readers onto ->blkd_tasks lists. */
- synchronize_sched_expedited();
-
- /*
- * Snapshot current state of ->blkd_tasks lists into ->expmask.
- * Phase 1 sets bits and phase 2 permits rcu_read_unlock_special()
- * to start clearing them. Doing this in one phase leads to
- * strange races between setting and clearing bits, so just say "no"!
+ * rcu_report_qs_rdp() can only be invoked with a stable rdp and
+ * from the local CPU.
+ *
+ * The in_atomic_preempt_off() check ensures that we come here holding
+ * the last preempt_count (which will get dropped once we return to
+ * __rcu_read_unlock().
*/
- rcu_for_each_leaf_node(rsp, rnp)
- sync_rcu_preempt_exp_init1(rsp, rnp);
- rcu_for_each_leaf_node(rsp, rnp)
- sync_rcu_preempt_exp_init2(rsp, rnp);
-
- put_online_cpus();
-
- /* Wait for snapshotted ->blkd_tasks lists to drain. */
- rnp = rcu_get_root(rsp);
- wait_event(sync_rcu_preempt_exp_wq,
- sync_rcu_preempt_exp_done(rnp));
-
- /* Clean up and exit. */
- smp_mb(); /* ensure expedited GP seen before counter increment. */
- ACCESS_ONCE(sync_rcu_preempt_exp_count) =
- sync_rcu_preempt_exp_count + 1;
-unlock_mb_ret:
- mutex_unlock(&sync_rcu_preempt_exp_mutex);
-mb_ret:
- smp_mb(); /* ensure subsequent action seen after grace period. */
+ rdp = this_cpu_ptr(&rcu_data);
+ rdp->cpu_no_qs.b.norm = false;
+ rcu_report_qs_rdp(rdp);
+ udelay(rcu_unlock_delay);
}
-EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
-
-/**
- * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
- *
- * Note that this primitive does not necessarily wait for an RCU grace period
- * to complete. For example, if there are no RCU callbacks queued anywhere
- * in the system, then rcu_barrier() is within its rights to return
- * immediately, without waiting for anything, much less an RCU grace period.
- */
-void rcu_barrier(void)
-{
- _rcu_barrier(&rcu_preempt_state);
-}
-EXPORT_SYMBOL_GPL(rcu_barrier);
+EXPORT_SYMBOL_GPL(rcu_read_unlock_strict);
/*
- * Initialize preemptible RCU's state structures.
+ * Tell them what RCU they are running.
*/
-static void __init __rcu_init_preempt(void)
+static void __init rcu_bootup_announce(void)
{
- rcu_init_one(&rcu_preempt_state, &rcu_preempt_data);
+ pr_info("Hierarchical RCU implementation.\n");
+ rcu_bootup_announce_oddness();
}
/*
- * Check for a task exiting while in a preemptible-RCU read-side
- * critical section, clean up if so. No need to issue warnings,
- * as debug_check_no_locks_held() already does this if lockdep
- * is enabled.
+ * Note a quiescent state for PREEMPTION=n. Because we do not need to know
+ * how many quiescent states passed, just if there was at least one since
+ * the start of the grace period, this just sets a flag. The caller must
+ * have disabled preemption.
*/
-void exit_rcu(void)
+static void rcu_qs(void)
{
- struct task_struct *t = current;
-
- if (likely(list_empty(&current->rcu_node_entry)))
+ RCU_LOCKDEP_WARN(preemptible(), "rcu_qs() invoked with preemption enabled!!!");
+ if (!__this_cpu_read(rcu_data.cpu_no_qs.s))
return;
- t->rcu_read_lock_nesting = 1;
- barrier();
- t->rcu_read_unlock_special.b.blocked = true;
- __rcu_read_unlock();
+ trace_rcu_grace_period(TPS("rcu_sched"),
+ __this_cpu_read(rcu_data.gp_seq), TPS("cpuqs"));
+ __this_cpu_write(rcu_data.cpu_no_qs.b.norm, false);
+ if (__this_cpu_read(rcu_data.cpu_no_qs.b.exp))
+ rcu_report_exp_rdp(this_cpu_ptr(&rcu_data));
}
-#else /* #ifdef CONFIG_PREEMPT_RCU */
-
-static struct rcu_state *rcu_state_p = &rcu_sched_state;
-
/*
- * Tell them what RCU they are running.
+ * Register an urgently needed quiescent state. If there is an
+ * emergency, invoke rcu_momentary_eqs() to do a heavy-weight
+ * dyntick-idle quiescent state visible to other CPUs, which will in
+ * some cases serve for expedited as well as normal grace periods.
+ * Either way, register a lightweight quiescent state.
*/
-static void __init rcu_bootup_announce(void)
+void rcu_all_qs(void)
{
- pr_info("Hierarchical RCU implementation.\n");
- rcu_bootup_announce_oddness();
+ unsigned long flags;
+
+ if (!raw_cpu_read(rcu_data.rcu_urgent_qs))
+ return;
+ preempt_disable(); // For CONFIG_PREEMPT_COUNT=y kernels
+ /* Load rcu_urgent_qs before other flags. */
+ if (!smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) {
+ preempt_enable();
+ return;
+ }
+ this_cpu_write(rcu_data.rcu_urgent_qs, false);
+ if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs))) {
+ local_irq_save(flags);
+ rcu_momentary_eqs();
+ local_irq_restore(flags);
+ }
+ rcu_qs();
+ preempt_enable();
}
+EXPORT_SYMBOL_GPL(rcu_all_qs);
/*
- * Because preemptible RCU does not exist, we never have to check for
- * CPUs being in quiescent states.
+ * Note a PREEMPTION=n context switch. The caller must have disabled interrupts.
*/
-static void rcu_preempt_note_context_switch(void)
+void rcu_note_context_switch(bool preempt)
{
+ trace_rcu_utilization(TPS("Start context switch"));
+ rcu_qs();
+ /* Load rcu_urgent_qs before other flags. */
+ if (!smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs)))
+ goto out;
+ this_cpu_write(rcu_data.rcu_urgent_qs, false);
+ if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs)))
+ rcu_momentary_eqs();
+out:
+ rcu_tasks_qs(current, preempt);
+ trace_rcu_utilization(TPS("End context switch"));
}
+EXPORT_SYMBOL_GPL(rcu_note_context_switch);
/*
* Because preemptible RCU does not exist, there are never any preempted
@@ -868,20 +1023,27 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
}
/*
- * Because preemptible RCU does not exist, we never have to check for
- * tasks blocked within RCU read-side critical sections.
+ * Because there is no preemptible RCU, there can be no deferred quiescent
+ * states.
*/
-static void rcu_print_detail_task_stall(struct rcu_state *rsp)
+static notrace bool rcu_preempt_need_deferred_qs(struct task_struct *t)
{
+ return false;
}
-/*
- * Because preemptible RCU does not exist, we never have to check for
- * tasks blocked within RCU read-side critical sections.
- */
-static int rcu_print_task_stall(struct rcu_node *rnp)
+// Except that we do need to respond to a request by an expedited
+// grace period for a quiescent state from this CPU. Note that in
+// non-preemptible kernels, there can be no context switches within RCU
+// read-side critical sections, which in turn means that the leaf rcu_node
+// structure's blocked-tasks list is always empty. is therefore no need to
+// actually check it. Instead, a quiescent state from this CPU suffices,
+// and this function is only called from such a quiescent state.
+notrace void rcu_preempt_deferred_qs(struct task_struct *t)
{
- return 0;
+ struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
+
+ if (READ_ONCE(rdp->cpu_no_qs.b.exp))
+ rcu_report_exp_rdp(rdp);
}
/*
@@ -895,91 +1057,89 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
}
/*
- * Because preemptible RCU does not exist, it never has any callbacks
- * to check.
+ * Check to see if this CPU is in a non-context-switch quiescent state,
+ * namely user mode and idle loop.
*/
-static void rcu_preempt_check_callbacks(void)
+static void rcu_flavor_sched_clock_irq(int user)
{
-}
+ if (user || rcu_is_cpu_rrupt_from_idle() ||
+ (IS_ENABLED(CONFIG_PREEMPT_COUNT) &&
+ (preempt_count() == HARDIRQ_OFFSET))) {
-/*
- * Wait for an rcu-preempt grace period, but make it happen quickly.
- * But because preemptible RCU does not exist, map to rcu-sched.
- */
-void synchronize_rcu_expedited(void)
-{
- synchronize_sched_expedited();
+ /*
+ * Get here if this CPU took its interrupt from user
+ * mode, from the idle loop without this being a nested
+ * interrupt, or while not holding the task preempt count
+ * (with PREEMPT_COUNT=y). In this case, the CPU is in a
+ * quiescent state, so note it.
+ *
+ * No memory barrier is required here because rcu_qs()
+ * references only CPU-local variables that other CPUs
+ * neither access nor modify, at least not while the
+ * corresponding CPU is online.
+ */
+ rcu_qs();
+ }
}
-EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
/*
- * Because preemptible RCU does not exist, rcu_barrier() is just
- * another name for rcu_barrier_sched().
+ * Because preemptible RCU does not exist, tasks cannot possibly exit
+ * while in preemptible RCU read-side critical sections.
*/
-void rcu_barrier(void)
+void exit_rcu(void)
{
- rcu_barrier_sched();
}
-EXPORT_SYMBOL_GPL(rcu_barrier);
/*
- * Because preemptible RCU does not exist, it need not be initialized.
+ * Dump the guaranteed-empty blocked-tasks state. Trust but verify.
*/
-static void __init __rcu_init_preempt(void)
+static void
+dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
{
+ WARN_ON_ONCE(!list_empty(&rnp->blkd_tasks));
}
-/*
- * Because preemptible RCU does not exist, tasks cannot possibly exit
- * while in preemptible RCU read-side critical sections.
- */
-void exit_rcu(void)
-{
-}
+static void rcu_preempt_deferred_qs_init(struct rcu_data *rdp) { }
#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
+/*
+ * If boosting, set rcuc kthreads to realtime priority.
+ */
+static void rcu_cpu_kthread_setup(unsigned int cpu)
+{
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
#ifdef CONFIG_RCU_BOOST
+ struct sched_param sp;
-#include "../locking/rtmutex_common.h"
-
-#ifdef CONFIG_RCU_TRACE
+ sp.sched_priority = kthread_prio;
+ sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
+#endif /* #ifdef CONFIG_RCU_BOOST */
-static void rcu_initiate_boost_trace(struct rcu_node *rnp)
-{
- if (!rcu_preempt_has_tasks(rnp))
- rnp->n_balk_blkd_tasks++;
- else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL)
- rnp->n_balk_exp_gp_tasks++;
- else if (rnp->gp_tasks != NULL && rnp->boost_tasks != NULL)
- rnp->n_balk_boost_tasks++;
- else if (rnp->gp_tasks != NULL && rnp->qsmask != 0)
- rnp->n_balk_notblocked++;
- else if (rnp->gp_tasks != NULL &&
- ULONG_CMP_LT(jiffies, rnp->boost_time))
- rnp->n_balk_notyet++;
- else
- rnp->n_balk_nos++;
+ WRITE_ONCE(rdp->rcuc_activity, jiffies);
}
-#else /* #ifdef CONFIG_RCU_TRACE */
-
-static void rcu_initiate_boost_trace(struct rcu_node *rnp)
+static bool rcu_is_callbacks_nocb_kthread(struct rcu_data *rdp)
{
+#ifdef CONFIG_RCU_NOCB_CPU
+ return rdp->nocb_cb_kthread == current;
+#else
+ return false;
+#endif
}
-#endif /* #else #ifdef CONFIG_RCU_TRACE */
-
-static void rcu_wake_cond(struct task_struct *t, int status)
+/*
+ * Is the current CPU running the RCU-callbacks kthread?
+ * Caller must have preemption disabled.
+ */
+static bool rcu_is_callbacks_kthread(struct rcu_data *rdp)
{
- /*
- * If the thread is yielding, only wake it when this
- * is invoked from idle
- */
- if (status != RCU_KTHREAD_YIELDING || is_idle_task(current))
- wake_up_process(t);
+ return rdp->rcu_cpu_kthread_task == current ||
+ rcu_is_callbacks_nocb_kthread(rdp);
}
+#ifdef CONFIG_RCU_BOOST
+
/*
* Carry out RCU priority boosting on the task indicated by ->exp_tasks
* or ->boost_tasks, advancing the pointer to the next task in the
@@ -994,19 +1154,18 @@ static int rcu_boost(struct rcu_node *rnp)
struct task_struct *t;
struct list_head *tb;
- if (ACCESS_ONCE(rnp->exp_tasks) == NULL &&
- ACCESS_ONCE(rnp->boost_tasks) == NULL)
+ if (READ_ONCE(rnp->exp_tasks) == NULL &&
+ READ_ONCE(rnp->boost_tasks) == NULL)
return 0; /* Nothing left to boost. */
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
/*
* Recheck under the lock: all tasks in need of boosting
* might exit their RCU read-side critical sections on their own.
*/
if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) {
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return 0;
}
@@ -1016,14 +1175,10 @@ static int rcu_boost(struct rcu_node *rnp)
* expedited grace period must boost all blocked tasks, including
* those blocking the pre-existing normal grace period.
*/
- if (rnp->exp_tasks != NULL) {
+ if (rnp->exp_tasks != NULL)
tb = rnp->exp_tasks;
- rnp->n_exp_boosts++;
- } else {
+ else
tb = rnp->boost_tasks;
- rnp->n_normal_boosts++;
- }
- rnp->n_tasks_boosted++;
/*
* We boost task t by manufacturing an rt_mutex that appears to
@@ -1042,19 +1197,19 @@ static int rcu_boost(struct rcu_node *rnp)
* section.
*/
t = container_of(tb, struct task_struct, rcu_node_entry);
- rt_mutex_init_proxy_locked(&rnp->boost_mtx, t);
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ rt_mutex_init_proxy_locked(&rnp->boost_mtx.rtmutex, t);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
/* Lock only for side effect: boosts task t's priority. */
rt_mutex_lock(&rnp->boost_mtx);
rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */
+ rnp->n_boosts++;
- return ACCESS_ONCE(rnp->exp_tasks) != NULL ||
- ACCESS_ONCE(rnp->boost_tasks) != NULL;
+ return READ_ONCE(rnp->exp_tasks) != NULL ||
+ READ_ONCE(rnp->boost_tasks) != NULL;
}
/*
- * Priority-boosting kthread. One per leaf rcu_node and one for the
- * root rcu_node.
+ * Priority-boosting kthread, one per leaf rcu_node.
*/
static int rcu_boost_kthread(void *arg)
{
@@ -1064,20 +1219,21 @@ static int rcu_boost_kthread(void *arg)
trace_rcu_utilization(TPS("Start boost kthread@init"));
for (;;) {
- rnp->boost_kthread_status = RCU_KTHREAD_WAITING;
+ WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_WAITING);
trace_rcu_utilization(TPS("End boost kthread@rcu_wait"));
- rcu_wait(rnp->boost_tasks || rnp->exp_tasks);
+ rcu_wait(READ_ONCE(rnp->boost_tasks) ||
+ READ_ONCE(rnp->exp_tasks));
trace_rcu_utilization(TPS("Start boost kthread@rcu_wait"));
- rnp->boost_kthread_status = RCU_KTHREAD_RUNNING;
+ WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_RUNNING);
more2boost = rcu_boost(rnp);
if (more2boost)
spincnt++;
else
spincnt = 0;
if (spincnt > 10) {
- rnp->boost_kthread_status = RCU_KTHREAD_YIELDING;
+ WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_YIELDING);
trace_rcu_utilization(TPS("End boost kthread@rcu_yield"));
- schedule_timeout_interruptible(2);
+ schedule_timeout_idle(2);
trace_rcu_utilization(TPS("Start boost kthread@rcu_yield"));
spincnt = 0;
}
@@ -1100,56 +1256,28 @@ static int rcu_boost_kthread(void *arg)
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
__releases(rnp->lock)
{
- struct task_struct *t;
-
- if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) {
- rnp->n_balk_exp_gp_tasks++;
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ raw_lockdep_assert_held_rcu_node(rnp);
+ if (!rnp->boost_kthread_task ||
+ (!rcu_preempt_blocked_readers_cgp(rnp) && !rnp->exp_tasks)) {
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return;
}
if (rnp->exp_tasks != NULL ||
(rnp->gp_tasks != NULL &&
rnp->boost_tasks == NULL &&
rnp->qsmask == 0 &&
- ULONG_CMP_GE(jiffies, rnp->boost_time))) {
+ (!time_after(rnp->boost_time, jiffies) || rcu_state.cbovld ||
+ IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)))) {
if (rnp->exp_tasks == NULL)
- rnp->boost_tasks = rnp->gp_tasks;
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- t = rnp->boost_kthread_task;
- if (t)
- rcu_wake_cond(t, rnp->boost_kthread_status);
+ WRITE_ONCE(rnp->boost_tasks, rnp->gp_tasks);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ rcu_wake_cond(rnp->boost_kthread_task,
+ READ_ONCE(rnp->boost_kthread_status));
} else {
- rcu_initiate_boost_trace(rnp);
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
}
-/*
- * Wake up the per-CPU kthread to invoke RCU callbacks.
- */
-static void invoke_rcu_callbacks_kthread(void)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- __this_cpu_write(rcu_cpu_has_work, 1);
- if (__this_cpu_read(rcu_cpu_kthread_task) != NULL &&
- current != __this_cpu_read(rcu_cpu_kthread_task)) {
- rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task),
- __this_cpu_read(rcu_cpu_kthread_status));
- }
- local_irq_restore(flags);
-}
-
-/*
- * Is the current CPU running the RCU-callbacks kthread?
- * Caller must have preemption disabled.
- */
-static bool rcu_is_callbacks_kthread(void)
-{
- return __this_cpu_read(rcu_cpu_kthread_task) == current;
-}
-
#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
/*
@@ -1163,161 +1291,30 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
/*
* Create an RCU-boost kthread for the specified node if one does not
* already exist. We only create this kthread for preemptible RCU.
- * Returns zero if all is well, a negated errno otherwise.
*/
-static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
- struct rcu_node *rnp)
+static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
{
- int rnp_index = rnp - &rsp->node[0];
unsigned long flags;
+ int rnp_index = rnp - rcu_get_root();
struct sched_param sp;
struct task_struct *t;
- if (&rcu_preempt_state != rsp)
- return 0;
-
- if (!rcu_scheduler_fully_active || rcu_rnp_online_cpus(rnp) == 0)
- return 0;
+ if (rnp->boost_kthread_task)
+ return;
- rsp->boost = 1;
- if (rnp->boost_kthread_task != NULL)
- return 0;
t = kthread_create(rcu_boost_kthread, (void *)rnp,
"rcub/%d", rnp_index);
- if (IS_ERR(t))
- return PTR_ERR(t);
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock();
+ if (WARN_ON_ONCE(IS_ERR(t)))
+ return;
+
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
rnp->boost_kthread_task = t;
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+
sp.sched_priority = kthread_prio;
sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+ rcu_thread_affine_rnp(t, rnp);
wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
- return 0;
-}
-
-static void rcu_kthread_do_work(void)
-{
- rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data));
- rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data));
- rcu_preempt_do_callbacks();
-}
-
-static void rcu_cpu_kthread_setup(unsigned int cpu)
-{
- struct sched_param sp;
-
- sp.sched_priority = kthread_prio;
- sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
-}
-
-static void rcu_cpu_kthread_park(unsigned int cpu)
-{
- per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
-}
-
-static int rcu_cpu_kthread_should_run(unsigned int cpu)
-{
- return __this_cpu_read(rcu_cpu_has_work);
-}
-
-/*
- * Per-CPU kernel thread that invokes RCU callbacks. This replaces the
- * RCU softirq used in flavors and configurations of RCU that do not
- * support RCU priority boosting.
- */
-static void rcu_cpu_kthread(unsigned int cpu)
-{
- unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
- char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
- int spincnt;
-
- for (spincnt = 0; spincnt < 10; spincnt++) {
- trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
- local_bh_disable();
- *statusp = RCU_KTHREAD_RUNNING;
- this_cpu_inc(rcu_cpu_kthread_loops);
- local_irq_disable();
- work = *workp;
- *workp = 0;
- local_irq_enable();
- if (work)
- rcu_kthread_do_work();
- local_bh_enable();
- if (*workp == 0) {
- trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
- *statusp = RCU_KTHREAD_WAITING;
- return;
- }
- }
- *statusp = RCU_KTHREAD_YIELDING;
- trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
- schedule_timeout_interruptible(2);
- trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
- *statusp = RCU_KTHREAD_WAITING;
-}
-
-/*
- * Set the per-rcu_node kthread's affinity to cover all CPUs that are
- * served by the rcu_node in question. The CPU hotplug lock is still
- * held, so the value of rnp->qsmaskinit will be stable.
- *
- * We don't include outgoingcpu in the affinity set, use -1 if there is
- * no outgoing CPU. If there are no CPUs left in the affinity set,
- * this function allows the kthread to execute on any CPU.
- */
-static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
-{
- struct task_struct *t = rnp->boost_kthread_task;
- unsigned long mask = rcu_rnp_online_cpus(rnp);
- cpumask_var_t cm;
- int cpu;
-
- if (!t)
- return;
- if (!zalloc_cpumask_var(&cm, GFP_KERNEL))
- return;
- for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1)
- if ((mask & 0x1) && cpu != outgoingcpu)
- cpumask_set_cpu(cpu, cm);
- if (cpumask_weight(cm) == 0)
- cpumask_setall(cm);
- set_cpus_allowed_ptr(t, cm);
- free_cpumask_var(cm);
-}
-
-static struct smp_hotplug_thread rcu_cpu_thread_spec = {
- .store = &rcu_cpu_kthread_task,
- .thread_should_run = rcu_cpu_kthread_should_run,
- .thread_fn = rcu_cpu_kthread,
- .thread_comm = "rcuc/%u",
- .setup = rcu_cpu_kthread_setup,
- .park = rcu_cpu_kthread_park,
-};
-
-/*
- * Spawn boost kthreads -- called as soon as the scheduler is running.
- */
-static void __init rcu_spawn_boost_kthreads(void)
-{
- struct rcu_node *rnp;
- int cpu;
-
- for_each_possible_cpu(cpu)
- per_cpu(rcu_cpu_has_work, cpu) = 0;
- BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
- rcu_for_each_leaf_node(rcu_state_p, rnp)
- (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
-}
-
-static void rcu_prepare_kthreads(int cpu)
-{
- struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
- struct rcu_node *rnp = rdp->mynode;
-
- /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
- if (rcu_scheduler_fully_active)
- (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
}
#else /* #ifdef CONFIG_RCU_BOOST */
@@ -1325,1715 +1322,19 @@ static void rcu_prepare_kthreads(int cpu)
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
__releases(rnp->lock)
{
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
-}
-
-static void invoke_rcu_callbacks_kthread(void)
-{
- WARN_ON_ONCE(1);
-}
-
-static bool rcu_is_callbacks_kthread(void)
-{
- return false;
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
{
}
-static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
-{
-}
-
-static void __init rcu_spawn_boost_kthreads(void)
-{
-}
-
-static void rcu_prepare_kthreads(int cpu)
+static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
{
}
#endif /* #else #ifdef CONFIG_RCU_BOOST */
-#if !defined(CONFIG_RCU_FAST_NO_HZ)
-
-/*
- * Check to see if any future RCU-related work will need to be done
- * by the current CPU, even if none need be done immediately, returning
- * 1 if so. This function is part of the RCU implementation; it is -not-
- * an exported member of the RCU API.
- *
- * Because we not have RCU_FAST_NO_HZ, just check whether this CPU needs
- * any flavor of RCU.
- */
-#ifndef CONFIG_RCU_NOCB_CPU_ALL
-int rcu_needs_cpu(unsigned long *delta_jiffies)
-{
- *delta_jiffies = ULONG_MAX;
- return rcu_cpu_has_callbacks(NULL);
-}
-#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
-
-/*
- * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
- * after it.
- */
-static void rcu_cleanup_after_idle(void)
-{
-}
-
-/*
- * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n,
- * is nothing.
- */
-static void rcu_prepare_for_idle(void)
-{
-}
-
-/*
- * Don't bother keeping a running count of the number of RCU callbacks
- * posted because CONFIG_RCU_FAST_NO_HZ=n.
- */
-static void rcu_idle_count_callbacks_posted(void)
-{
-}
-
-#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
-
-/*
- * This code is invoked when a CPU goes idle, at which point we want
- * to have the CPU do everything required for RCU so that it can enter
- * the energy-efficient dyntick-idle mode. This is handled by a
- * state machine implemented by rcu_prepare_for_idle() below.
- *
- * The following three proprocessor symbols control this state machine:
- *
- * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted
- * to sleep in dyntick-idle mode with RCU callbacks pending. This
- * is sized to be roughly one RCU grace period. Those energy-efficiency
- * benchmarkers who might otherwise be tempted to set this to a large
- * number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your
- * system. And if you are -that- concerned about energy efficiency,
- * just power the system down and be done with it!
- * RCU_IDLE_LAZY_GP_DELAY gives the number of jiffies that a CPU is
- * permitted to sleep in dyntick-idle mode with only lazy RCU
- * callbacks pending. Setting this too high can OOM your system.
- *
- * The values below work well in practice. If future workloads require
- * adjustment, they can be converted into kernel config parameters, though
- * making the state machine smarter might be a better option.
- */
-#define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */
-#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */
-
-static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY;
-module_param(rcu_idle_gp_delay, int, 0644);
-static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY;
-module_param(rcu_idle_lazy_gp_delay, int, 0644);
-
-extern int tick_nohz_active;
-
-/*
- * Try to advance callbacks for all flavors of RCU on the current CPU, but
- * only if it has been awhile since the last time we did so. Afterwards,
- * if there are any callbacks ready for immediate invocation, return true.
- */
-static bool __maybe_unused rcu_try_advance_all_cbs(void)
-{
- bool cbs_ready = false;
- struct rcu_data *rdp;
- struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
- struct rcu_node *rnp;
- struct rcu_state *rsp;
-
- /* Exit early if we advanced recently. */
- if (jiffies == rdtp->last_advance_all)
- return false;
- rdtp->last_advance_all = jiffies;
-
- for_each_rcu_flavor(rsp) {
- rdp = this_cpu_ptr(rsp->rda);
- rnp = rdp->mynode;
-
- /*
- * Don't bother checking unless a grace period has
- * completed since we last checked and there are
- * callbacks not yet ready to invoke.
- */
- if ((rdp->completed != rnp->completed ||
- unlikely(ACCESS_ONCE(rdp->gpwrap))) &&
- rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL])
- note_gp_changes(rsp, rdp);
-
- if (cpu_has_callbacks_ready_to_invoke(rdp))
- cbs_ready = true;
- }
- return cbs_ready;
-}
-
-/*
- * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
- * to invoke. If the CPU has callbacks, try to advance them. Tell the
- * caller to set the timeout based on whether or not there are non-lazy
- * callbacks.
- *
- * The caller must have disabled interrupts.
- */
-#ifndef CONFIG_RCU_NOCB_CPU_ALL
-int rcu_needs_cpu(unsigned long *dj)
-{
- struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
-
- /* Snapshot to detect later posting of non-lazy callback. */
- rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
-
- /* If no callbacks, RCU doesn't need the CPU. */
- if (!rcu_cpu_has_callbacks(&rdtp->all_lazy)) {
- *dj = ULONG_MAX;
- return 0;
- }
-
- /* Attempt to advance callbacks. */
- if (rcu_try_advance_all_cbs()) {
- /* Some ready to invoke, so initiate later invocation. */
- invoke_rcu_core();
- return 1;
- }
- rdtp->last_accelerate = jiffies;
-
- /* Request timer delay depending on laziness, and round. */
- if (!rdtp->all_lazy) {
- *dj = round_up(rcu_idle_gp_delay + jiffies,
- rcu_idle_gp_delay) - jiffies;
- } else {
- *dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies;
- }
- return 0;
-}
-#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
-
-/*
- * Prepare a CPU for idle from an RCU perspective. The first major task
- * is to sense whether nohz mode has been enabled or disabled via sysfs.
- * The second major task is to check to see if a non-lazy callback has
- * arrived at a CPU that previously had only lazy callbacks. The third
- * major task is to accelerate (that is, assign grace-period numbers to)
- * any recently arrived callbacks.
- *
- * The caller must have disabled interrupts.
- */
-static void rcu_prepare_for_idle(void)
-{
-#ifndef CONFIG_RCU_NOCB_CPU_ALL
- bool needwake;
- struct rcu_data *rdp;
- struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
- struct rcu_node *rnp;
- struct rcu_state *rsp;
- int tne;
-
- /* Handle nohz enablement switches conservatively. */
- tne = ACCESS_ONCE(tick_nohz_active);
- if (tne != rdtp->tick_nohz_enabled_snap) {
- if (rcu_cpu_has_callbacks(NULL))
- invoke_rcu_core(); /* force nohz to see update. */
- rdtp->tick_nohz_enabled_snap = tne;
- return;
- }
- if (!tne)
- return;
-
- /* If this is a no-CBs CPU, no callbacks, just return. */
- if (rcu_is_nocb_cpu(smp_processor_id()))
- return;
-
- /*
- * If a non-lazy callback arrived at a CPU having only lazy
- * callbacks, invoke RCU core for the side-effect of recalculating
- * idle duration on re-entry to idle.
- */
- if (rdtp->all_lazy &&
- rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) {
- rdtp->all_lazy = false;
- rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
- invoke_rcu_core();
- return;
- }
-
- /*
- * If we have not yet accelerated this jiffy, accelerate all
- * callbacks on this CPU.
- */
- if (rdtp->last_accelerate == jiffies)
- return;
- rdtp->last_accelerate = jiffies;
- for_each_rcu_flavor(rsp) {
- rdp = this_cpu_ptr(rsp->rda);
- if (!*rdp->nxttail[RCU_DONE_TAIL])
- continue;
- rnp = rdp->mynode;
- raw_spin_lock(&rnp->lock); /* irqs already disabled. */
- smp_mb__after_unlock_lock();
- needwake = rcu_accelerate_cbs(rsp, rnp, rdp);
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
- if (needwake)
- rcu_gp_kthread_wake(rsp);
- }
-#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
-}
-
-/*
- * Clean up for exit from idle. Attempt to advance callbacks based on
- * any grace periods that elapsed while the CPU was idle, and if any
- * callbacks are now ready to invoke, initiate invocation.
- */
-static void rcu_cleanup_after_idle(void)
-{
-#ifndef CONFIG_RCU_NOCB_CPU_ALL
- if (rcu_is_nocb_cpu(smp_processor_id()))
- return;
- if (rcu_try_advance_all_cbs())
- invoke_rcu_core();
-#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
-}
-
-/*
- * Keep a running count of the number of non-lazy callbacks posted
- * on this CPU. This running counter (which is never decremented) allows
- * rcu_prepare_for_idle() to detect when something out of the idle loop
- * posts a callback, even if an equal number of callbacks are invoked.
- * Of course, callbacks should only be posted from within a trace event
- * designed to be called from idle or from within RCU_NONIDLE().
- */
-static void rcu_idle_count_callbacks_posted(void)
-{
- __this_cpu_add(rcu_dynticks.nonlazy_posted, 1);
-}
-
-/*
- * Data for flushing lazy RCU callbacks at OOM time.
- */
-static atomic_t oom_callback_count;
-static DECLARE_WAIT_QUEUE_HEAD(oom_callback_wq);
-
-/*
- * RCU OOM callback -- decrement the outstanding count and deliver the
- * wake-up if we are the last one.
- */
-static void rcu_oom_callback(struct rcu_head *rhp)
-{
- if (atomic_dec_and_test(&oom_callback_count))
- wake_up(&oom_callback_wq);
-}
-
-/*
- * Post an rcu_oom_notify callback on the current CPU if it has at
- * least one lazy callback. This will unnecessarily post callbacks
- * to CPUs that already have a non-lazy callback at the end of their
- * callback list, but this is an infrequent operation, so accept some
- * extra overhead to keep things simple.
- */
-static void rcu_oom_notify_cpu(void *unused)
-{
- struct rcu_state *rsp;
- struct rcu_data *rdp;
-
- for_each_rcu_flavor(rsp) {
- rdp = raw_cpu_ptr(rsp->rda);
- if (rdp->qlen_lazy != 0) {
- atomic_inc(&oom_callback_count);
- rsp->call(&rdp->oom_head, rcu_oom_callback);
- }
- }
-}
-
-/*
- * If low on memory, ensure that each CPU has a non-lazy callback.
- * This will wake up CPUs that have only lazy callbacks, in turn
- * ensuring that they free up the corresponding memory in a timely manner.
- * Because an uncertain amount of memory will be freed in some uncertain
- * timeframe, we do not claim to have freed anything.
- */
-static int rcu_oom_notify(struct notifier_block *self,
- unsigned long notused, void *nfreed)
-{
- int cpu;
-
- /* Wait for callbacks from earlier instance to complete. */
- wait_event(oom_callback_wq, atomic_read(&oom_callback_count) == 0);
- smp_mb(); /* Ensure callback reuse happens after callback invocation. */
-
- /*
- * Prevent premature wakeup: ensure that all increments happen
- * before there is a chance of the counter reaching zero.
- */
- atomic_set(&oom_callback_count, 1);
-
- get_online_cpus();
- for_each_online_cpu(cpu) {
- smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1);
- cond_resched_rcu_qs();
- }
- put_online_cpus();
-
- /* Unconditionally decrement: no need to wake ourselves up. */
- atomic_dec(&oom_callback_count);
-
- return NOTIFY_OK;
-}
-
-static struct notifier_block rcu_oom_nb = {
- .notifier_call = rcu_oom_notify
-};
-
-static int __init rcu_register_oom_notifier(void)
-{
- register_oom_notifier(&rcu_oom_nb);
- return 0;
-}
-early_initcall(rcu_register_oom_notifier);
-
-#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
-
-#ifdef CONFIG_RCU_CPU_STALL_INFO
-
-#ifdef CONFIG_RCU_FAST_NO_HZ
-
-static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
-{
- struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
- unsigned long nlpd = rdtp->nonlazy_posted - rdtp->nonlazy_posted_snap;
-
- sprintf(cp, "last_accelerate: %04lx/%04lx, nonlazy_posted: %ld, %c%c",
- rdtp->last_accelerate & 0xffff, jiffies & 0xffff,
- ulong2long(nlpd),
- rdtp->all_lazy ? 'L' : '.',
- rdtp->tick_nohz_enabled_snap ? '.' : 'D');
-}
-
-#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
-
-static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
-{
- *cp = '\0';
-}
-
-#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */
-
-/* Initiate the stall-info list. */
-static void print_cpu_stall_info_begin(void)
-{
- pr_cont("\n");
-}
-
-/*
- * Print out diagnostic information for the specified stalled CPU.
- *
- * If the specified CPU is aware of the current RCU grace period
- * (flavor specified by rsp), then print the number of scheduling
- * clock interrupts the CPU has taken during the time that it has
- * been aware. Otherwise, print the number of RCU grace periods
- * that this CPU is ignorant of, for example, "1" if the CPU was
- * aware of the previous grace period.
- *
- * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info.
- */
-static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
-{
- char fast_no_hz[72];
- struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
- struct rcu_dynticks *rdtp = rdp->dynticks;
- char *ticks_title;
- unsigned long ticks_value;
-
- if (rsp->gpnum == rdp->gpnum) {
- ticks_title = "ticks this GP";
- ticks_value = rdp->ticks_this_gp;
- } else {
- ticks_title = "GPs behind";
- ticks_value = rsp->gpnum - rdp->gpnum;
- }
- print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
- pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n",
- cpu, ticks_value, ticks_title,
- atomic_read(&rdtp->dynticks) & 0xfff,
- rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
- rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
- ACCESS_ONCE(rsp->n_force_qs) - rsp->n_force_qs_gpstart,
- fast_no_hz);
-}
-
-/* Terminate the stall-info list. */
-static void print_cpu_stall_info_end(void)
-{
- pr_err("\t");
-}
-
-/* Zero ->ticks_this_gp for all flavors of RCU. */
-static void zero_cpu_stall_ticks(struct rcu_data *rdp)
-{
- rdp->ticks_this_gp = 0;
- rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id());
-}
-
-/* Increment ->ticks_this_gp for all flavors of RCU. */
-static void increment_cpu_stall_ticks(void)
-{
- struct rcu_state *rsp;
-
- for_each_rcu_flavor(rsp)
- raw_cpu_inc(rsp->rda->ticks_this_gp);
-}
-
-#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
-
-static void print_cpu_stall_info_begin(void)
-{
- pr_cont(" {");
-}
-
-static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
-{
- pr_cont(" %d", cpu);
-}
-
-static void print_cpu_stall_info_end(void)
-{
- pr_cont("} ");
-}
-
-static void zero_cpu_stall_ticks(struct rcu_data *rdp)
-{
-}
-
-static void increment_cpu_stall_ticks(void)
-{
-}
-
-#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */
-
-#ifdef CONFIG_RCU_NOCB_CPU
-
-/*
- * Offload callback processing from the boot-time-specified set of CPUs
- * specified by rcu_nocb_mask. For each CPU in the set, there is a
- * kthread created that pulls the callbacks from the corresponding CPU,
- * waits for a grace period to elapse, and invokes the callbacks.
- * The no-CBs CPUs do a wake_up() on their kthread when they insert
- * a callback into any empty list, unless the rcu_nocb_poll boot parameter
- * has been specified, in which case each kthread actively polls its
- * CPU. (Which isn't so great for energy efficiency, but which does
- * reduce RCU's overhead on that CPU.)
- *
- * This is intended to be used in conjunction with Frederic Weisbecker's
- * adaptive-idle work, which would seriously reduce OS jitter on CPUs
- * running CPU-bound user-mode computations.
- *
- * Offloading of callback processing could also in theory be used as
- * an energy-efficiency measure because CPUs with no RCU callbacks
- * queued are more aggressive about entering dyntick-idle mode.
- */
-
-
-/* Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. */
-static int __init rcu_nocb_setup(char *str)
-{
- alloc_bootmem_cpumask_var(&rcu_nocb_mask);
- have_rcu_nocb_mask = true;
- cpulist_parse(str, rcu_nocb_mask);
- return 1;
-}
-__setup("rcu_nocbs=", rcu_nocb_setup);
-
-static int __init parse_rcu_nocb_poll(char *arg)
-{
- rcu_nocb_poll = 1;
- return 0;
-}
-early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
-
-/*
- * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
- * grace period.
- */
-static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
-{
- wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]);
-}
-
-/*
- * Set the root rcu_node structure's ->need_future_gp field
- * based on the sum of those of all rcu_node structures. This does
- * double-count the root rcu_node structure's requests, but this
- * is necessary to handle the possibility of a rcu_nocb_kthread()
- * having awakened during the time that the rcu_node structures
- * were being updated for the end of the previous grace period.
- */
-static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
-{
- rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq;
-}
-
-static void rcu_init_one_nocb(struct rcu_node *rnp)
-{
- init_waitqueue_head(&rnp->nocb_gp_wq[0]);
- init_waitqueue_head(&rnp->nocb_gp_wq[1]);
-}
-
-#ifndef CONFIG_RCU_NOCB_CPU_ALL
-/* Is the specified CPU a no-CBs CPU? */
-bool rcu_is_nocb_cpu(int cpu)
-{
- if (have_rcu_nocb_mask)
- return cpumask_test_cpu(cpu, rcu_nocb_mask);
- return false;
-}
-#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
-
-/*
- * Kick the leader kthread for this NOCB group.
- */
-static void wake_nocb_leader(struct rcu_data *rdp, bool force)
-{
- struct rcu_data *rdp_leader = rdp->nocb_leader;
-
- if (!ACCESS_ONCE(rdp_leader->nocb_kthread))
- return;
- if (ACCESS_ONCE(rdp_leader->nocb_leader_sleep) || force) {
- /* Prior smp_mb__after_atomic() orders against prior enqueue. */
- ACCESS_ONCE(rdp_leader->nocb_leader_sleep) = false;
- wake_up(&rdp_leader->nocb_wq);
- }
-}
-
-/*
- * Does the specified CPU need an RCU callback for the specified flavor
- * of rcu_barrier()?
- */
-static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
-{
- struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
- unsigned long ret;
-#ifdef CONFIG_PROVE_RCU
- struct rcu_head *rhp;
-#endif /* #ifdef CONFIG_PROVE_RCU */
-
- /*
- * Check count of all no-CBs callbacks awaiting invocation.
- * There needs to be a barrier before this function is called,
- * but associated with a prior determination that no more
- * callbacks would be posted. In the worst case, the first
- * barrier in _rcu_barrier() suffices (but the caller cannot
- * necessarily rely on this, not a substitute for the caller
- * getting the concurrency design right!). There must also be
- * a barrier between the following load an posting of a callback
- * (if a callback is in fact needed). This is associated with an
- * atomic_inc() in the caller.
- */
- ret = atomic_long_read(&rdp->nocb_q_count);
-
-#ifdef CONFIG_PROVE_RCU
- rhp = ACCESS_ONCE(rdp->nocb_head);
- if (!rhp)
- rhp = ACCESS_ONCE(rdp->nocb_gp_head);
- if (!rhp)
- rhp = ACCESS_ONCE(rdp->nocb_follower_head);
-
- /* Having no rcuo kthread but CBs after scheduler starts is bad! */
- if (!ACCESS_ONCE(rdp->nocb_kthread) && rhp &&
- rcu_scheduler_fully_active) {
- /* RCU callback enqueued before CPU first came online??? */
- pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n",
- cpu, rhp->func);
- WARN_ON_ONCE(1);
- }
-#endif /* #ifdef CONFIG_PROVE_RCU */
-
- return !!ret;
-}
-
-/*
- * Enqueue the specified string of rcu_head structures onto the specified
- * CPU's no-CBs lists. The CPU is specified by rdp, the head of the
- * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy
- * counts are supplied by rhcount and rhcount_lazy.
- *
- * If warranted, also wake up the kthread servicing this CPUs queues.
- */
-static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
- struct rcu_head *rhp,
- struct rcu_head **rhtp,
- int rhcount, int rhcount_lazy,
- unsigned long flags)
-{
- int len;
- struct rcu_head **old_rhpp;
- struct task_struct *t;
-
- /* Enqueue the callback on the nocb list and update counts. */
- atomic_long_add(rhcount, &rdp->nocb_q_count);
- /* rcu_barrier() relies on ->nocb_q_count add before xchg. */
- old_rhpp = xchg(&rdp->nocb_tail, rhtp);
- ACCESS_ONCE(*old_rhpp) = rhp;
- atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy);
- smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */
-
- /* If we are not being polled and there is a kthread, awaken it ... */
- t = ACCESS_ONCE(rdp->nocb_kthread);
- if (rcu_nocb_poll || !t) {
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
- TPS("WakeNotPoll"));
- return;
- }
- len = atomic_long_read(&rdp->nocb_q_count);
- if (old_rhpp == &rdp->nocb_head) {
- if (!irqs_disabled_flags(flags)) {
- /* ... if queue was empty ... */
- wake_nocb_leader(rdp, false);
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
- TPS("WakeEmpty"));
- } else {
- rdp->nocb_defer_wakeup = RCU_NOGP_WAKE;
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
- TPS("WakeEmptyIsDeferred"));
- }
- rdp->qlen_last_fqs_check = 0;
- } else if (len > rdp->qlen_last_fqs_check + qhimark) {
- /* ... or if many callbacks queued. */
- if (!irqs_disabled_flags(flags)) {
- wake_nocb_leader(rdp, true);
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
- TPS("WakeOvf"));
- } else {
- rdp->nocb_defer_wakeup = RCU_NOGP_WAKE_FORCE;
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
- TPS("WakeOvfIsDeferred"));
- }
- rdp->qlen_last_fqs_check = LONG_MAX / 2;
- } else {
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeNot"));
- }
- return;
-}
-
-/*
- * This is a helper for __call_rcu(), which invokes this when the normal
- * callback queue is inoperable. If this is not a no-CBs CPU, this
- * function returns failure back to __call_rcu(), which can complain
- * appropriately.
- *
- * Otherwise, this function queues the callback where the corresponding
- * "rcuo" kthread can find it.
- */
-static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
- bool lazy, unsigned long flags)
-{
-
- if (!rcu_is_nocb_cpu(rdp->cpu))
- return false;
- __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy, flags);
- if (__is_kfree_rcu_offset((unsigned long)rhp->func))
- trace_rcu_kfree_callback(rdp->rsp->name, rhp,
- (unsigned long)rhp->func,
- -atomic_long_read(&rdp->nocb_q_count_lazy),
- -atomic_long_read(&rdp->nocb_q_count));
- else
- trace_rcu_callback(rdp->rsp->name, rhp,
- -atomic_long_read(&rdp->nocb_q_count_lazy),
- -atomic_long_read(&rdp->nocb_q_count));
-
- /*
- * If called from an extended quiescent state with interrupts
- * disabled, invoke the RCU core in order to allow the idle-entry
- * deferred-wakeup check to function.
- */
- if (irqs_disabled_flags(flags) &&
- !rcu_is_watching() &&
- cpu_online(smp_processor_id()))
- invoke_rcu_core();
-
- return true;
-}
-
-/*
- * Adopt orphaned callbacks on a no-CBs CPU, or return 0 if this is
- * not a no-CBs CPU.
- */
-static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
- struct rcu_data *rdp,
- unsigned long flags)
-{
- long ql = rsp->qlen;
- long qll = rsp->qlen_lazy;
-
- /* If this is not a no-CBs CPU, tell the caller to do it the old way. */
- if (!rcu_is_nocb_cpu(smp_processor_id()))
- return false;
- rsp->qlen = 0;
- rsp->qlen_lazy = 0;
-
- /* First, enqueue the donelist, if any. This preserves CB ordering. */
- if (rsp->orphan_donelist != NULL) {
- __call_rcu_nocb_enqueue(rdp, rsp->orphan_donelist,
- rsp->orphan_donetail, ql, qll, flags);
- ql = qll = 0;
- rsp->orphan_donelist = NULL;
- rsp->orphan_donetail = &rsp->orphan_donelist;
- }
- if (rsp->orphan_nxtlist != NULL) {
- __call_rcu_nocb_enqueue(rdp, rsp->orphan_nxtlist,
- rsp->orphan_nxttail, ql, qll, flags);
- ql = qll = 0;
- rsp->orphan_nxtlist = NULL;
- rsp->orphan_nxttail = &rsp->orphan_nxtlist;
- }
- return true;
-}
-
-/*
- * If necessary, kick off a new grace period, and either way wait
- * for a subsequent grace period to complete.
- */
-static void rcu_nocb_wait_gp(struct rcu_data *rdp)
-{
- unsigned long c;
- bool d;
- unsigned long flags;
- bool needwake;
- struct rcu_node *rnp = rdp->mynode;
-
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock();
- needwake = rcu_start_future_gp(rnp, rdp, &c);
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- if (needwake)
- rcu_gp_kthread_wake(rdp->rsp);
-
- /*
- * Wait for the grace period. Do so interruptibly to avoid messing
- * up the load average.
- */
- trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait"));
- for (;;) {
- wait_event_interruptible(
- rnp->nocb_gp_wq[c & 0x1],
- (d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c)));
- if (likely(d))
- break;
- WARN_ON(signal_pending(current));
- trace_rcu_future_gp(rnp, rdp, c, TPS("ResumeWait"));
- }
- trace_rcu_future_gp(rnp, rdp, c, TPS("EndWait"));
- smp_mb(); /* Ensure that CB invocation happens after GP end. */
-}
-
-/*
- * Leaders come here to wait for additional callbacks to show up.
- * This function does not return until callbacks appear.
- */
-static void nocb_leader_wait(struct rcu_data *my_rdp)
-{
- bool firsttime = true;
- bool gotcbs;
- struct rcu_data *rdp;
- struct rcu_head **tail;
-
-wait_again:
-
- /* Wait for callbacks to appear. */
- if (!rcu_nocb_poll) {
- trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep");
- wait_event_interruptible(my_rdp->nocb_wq,
- !ACCESS_ONCE(my_rdp->nocb_leader_sleep));
- /* Memory barrier handled by smp_mb() calls below and repoll. */
- } else if (firsttime) {
- firsttime = false; /* Don't drown trace log with "Poll"! */
- trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Poll");
- }
-
- /*
- * Each pass through the following loop checks a follower for CBs.
- * We are our own first follower. Any CBs found are moved to
- * nocb_gp_head, where they await a grace period.
- */
- gotcbs = false;
- for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
- rdp->nocb_gp_head = ACCESS_ONCE(rdp->nocb_head);
- if (!rdp->nocb_gp_head)
- continue; /* No CBs here, try next follower. */
-
- /* Move callbacks to wait-for-GP list, which is empty. */
- ACCESS_ONCE(rdp->nocb_head) = NULL;
- rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head);
- gotcbs = true;
- }
-
- /*
- * If there were no callbacks, sleep a bit, rescan after a
- * memory barrier, and go retry.
- */
- if (unlikely(!gotcbs)) {
- if (!rcu_nocb_poll)
- trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu,
- "WokeEmpty");
- WARN_ON(signal_pending(current));
- schedule_timeout_interruptible(1);
-
- /* Rescan in case we were a victim of memory ordering. */
- my_rdp->nocb_leader_sleep = true;
- smp_mb(); /* Ensure _sleep true before scan. */
- for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower)
- if (ACCESS_ONCE(rdp->nocb_head)) {
- /* Found CB, so short-circuit next wait. */
- my_rdp->nocb_leader_sleep = false;
- break;
- }
- goto wait_again;
- }
-
- /* Wait for one grace period. */
- rcu_nocb_wait_gp(my_rdp);
-
- /*
- * We left ->nocb_leader_sleep unset to reduce cache thrashing.
- * We set it now, but recheck for new callbacks while
- * traversing our follower list.
- */
- my_rdp->nocb_leader_sleep = true;
- smp_mb(); /* Ensure _sleep true before scan of ->nocb_head. */
-
- /* Each pass through the following loop wakes a follower, if needed. */
- for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
- if (ACCESS_ONCE(rdp->nocb_head))
- my_rdp->nocb_leader_sleep = false;/* No need to sleep.*/
- if (!rdp->nocb_gp_head)
- continue; /* No CBs, so no need to wake follower. */
-
- /* Append callbacks to follower's "done" list. */
- tail = xchg(&rdp->nocb_follower_tail, rdp->nocb_gp_tail);
- *tail = rdp->nocb_gp_head;
- smp_mb__after_atomic(); /* Store *tail before wakeup. */
- if (rdp != my_rdp && tail == &rdp->nocb_follower_head) {
- /*
- * List was empty, wake up the follower.
- * Memory barriers supplied by atomic_long_add().
- */
- wake_up(&rdp->nocb_wq);
- }
- }
-
- /* If we (the leader) don't have CBs, go wait some more. */
- if (!my_rdp->nocb_follower_head)
- goto wait_again;
-}
-
-/*
- * Followers come here to wait for additional callbacks to show up.
- * This function does not return until callbacks appear.
- */
-static void nocb_follower_wait(struct rcu_data *rdp)
-{
- bool firsttime = true;
-
- for (;;) {
- if (!rcu_nocb_poll) {
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
- "FollowerSleep");
- wait_event_interruptible(rdp->nocb_wq,
- ACCESS_ONCE(rdp->nocb_follower_head));
- } else if (firsttime) {
- /* Don't drown trace log with "Poll"! */
- firsttime = false;
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "Poll");
- }
- if (smp_load_acquire(&rdp->nocb_follower_head)) {
- /* ^^^ Ensure CB invocation follows _head test. */
- return;
- }
- if (!rcu_nocb_poll)
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
- "WokeEmpty");
- WARN_ON(signal_pending(current));
- schedule_timeout_interruptible(1);
- }
-}
-
-/*
- * Per-rcu_data kthread, but only for no-CBs CPUs. Each kthread invokes
- * callbacks queued by the corresponding no-CBs CPU, however, there is
- * an optional leader-follower relationship so that the grace-period
- * kthreads don't have to do quite so many wakeups.
- */
-static int rcu_nocb_kthread(void *arg)
-{
- int c, cl;
- struct rcu_head *list;
- struct rcu_head *next;
- struct rcu_head **tail;
- struct rcu_data *rdp = arg;
-
- /* Each pass through this loop invokes one batch of callbacks */
- for (;;) {
- /* Wait for callbacks. */
- if (rdp->nocb_leader == rdp)
- nocb_leader_wait(rdp);
- else
- nocb_follower_wait(rdp);
-
- /* Pull the ready-to-invoke callbacks onto local list. */
- list = ACCESS_ONCE(rdp->nocb_follower_head);
- BUG_ON(!list);
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty");
- ACCESS_ONCE(rdp->nocb_follower_head) = NULL;
- tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head);
-
- /* Each pass through the following loop invokes a callback. */
- trace_rcu_batch_start(rdp->rsp->name,
- atomic_long_read(&rdp->nocb_q_count_lazy),
- atomic_long_read(&rdp->nocb_q_count), -1);
- c = cl = 0;
- while (list) {
- next = list->next;
- /* Wait for enqueuing to complete, if needed. */
- while (next == NULL && &list->next != tail) {
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
- TPS("WaitQueue"));
- schedule_timeout_interruptible(1);
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
- TPS("WokeQueue"));
- next = list->next;
- }
- debug_rcu_head_unqueue(list);
- local_bh_disable();
- if (__rcu_reclaim(rdp->rsp->name, list))
- cl++;
- c++;
- local_bh_enable();
- list = next;
- }
- trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1);
- smp_mb__before_atomic(); /* _add after CB invocation. */
- atomic_long_add(-c, &rdp->nocb_q_count);
- atomic_long_add(-cl, &rdp->nocb_q_count_lazy);
- rdp->n_nocbs_invoked += c;
- }
- return 0;
-}
-
-/* Is a deferred wakeup of rcu_nocb_kthread() required? */
-static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
-{
- return ACCESS_ONCE(rdp->nocb_defer_wakeup);
-}
-
-/* Do a deferred wakeup of rcu_nocb_kthread(). */
-static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
-{
- int ndw;
-
- if (!rcu_nocb_need_deferred_wakeup(rdp))
- return;
- ndw = ACCESS_ONCE(rdp->nocb_defer_wakeup);
- ACCESS_ONCE(rdp->nocb_defer_wakeup) = RCU_NOGP_WAKE_NOT;
- wake_nocb_leader(rdp, ndw == RCU_NOGP_WAKE_FORCE);
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake"));
-}
-
-void __init rcu_init_nohz(void)
-{
- int cpu;
- bool need_rcu_nocb_mask = true;
- struct rcu_state *rsp;
-
-#ifdef CONFIG_RCU_NOCB_CPU_NONE
- need_rcu_nocb_mask = false;
-#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */
-
-#if defined(CONFIG_NO_HZ_FULL)
- if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask))
- need_rcu_nocb_mask = true;
-#endif /* #if defined(CONFIG_NO_HZ_FULL) */
-
- if (!have_rcu_nocb_mask && need_rcu_nocb_mask) {
- if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) {
- pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n");
- return;
- }
- have_rcu_nocb_mask = true;
- }
- if (!have_rcu_nocb_mask)
- return;
-
-#ifdef CONFIG_RCU_NOCB_CPU_ZERO
- pr_info("\tOffload RCU callbacks from CPU 0\n");
- cpumask_set_cpu(0, rcu_nocb_mask);
-#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */
-#ifdef CONFIG_RCU_NOCB_CPU_ALL
- pr_info("\tOffload RCU callbacks from all CPUs\n");
- cpumask_copy(rcu_nocb_mask, cpu_possible_mask);
-#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */
-#if defined(CONFIG_NO_HZ_FULL)
- if (tick_nohz_full_running)
- cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask);
-#endif /* #if defined(CONFIG_NO_HZ_FULL) */
-
- if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
- pr_info("\tNote: kernel parameter 'rcu_nocbs=' contains nonexistent CPUs.\n");
- cpumask_and(rcu_nocb_mask, cpu_possible_mask,
- rcu_nocb_mask);
- }
- pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n",
- cpumask_pr_args(rcu_nocb_mask));
- if (rcu_nocb_poll)
- pr_info("\tPoll for callbacks from no-CBs CPUs.\n");
-
- for_each_rcu_flavor(rsp) {
- for_each_cpu(cpu, rcu_nocb_mask)
- init_nocb_callback_list(per_cpu_ptr(rsp->rda, cpu));
- rcu_organize_nocb_kthreads(rsp);
- }
-}
-
-/* Initialize per-rcu_data variables for no-CBs CPUs. */
-static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
-{
- rdp->nocb_tail = &rdp->nocb_head;
- init_waitqueue_head(&rdp->nocb_wq);
- rdp->nocb_follower_tail = &rdp->nocb_follower_head;
-}
-
-/*
- * If the specified CPU is a no-CBs CPU that does not already have its
- * rcuo kthread for the specified RCU flavor, spawn it. If the CPUs are
- * brought online out of order, this can require re-organizing the
- * leader-follower relationships.
- */
-static void rcu_spawn_one_nocb_kthread(struct rcu_state *rsp, int cpu)
-{
- struct rcu_data *rdp;
- struct rcu_data *rdp_last;
- struct rcu_data *rdp_old_leader;
- struct rcu_data *rdp_spawn = per_cpu_ptr(rsp->rda, cpu);
- struct task_struct *t;
-
- /*
- * If this isn't a no-CBs CPU or if it already has an rcuo kthread,
- * then nothing to do.
- */
- if (!rcu_is_nocb_cpu(cpu) || rdp_spawn->nocb_kthread)
- return;
-
- /* If we didn't spawn the leader first, reorganize! */
- rdp_old_leader = rdp_spawn->nocb_leader;
- if (rdp_old_leader != rdp_spawn && !rdp_old_leader->nocb_kthread) {
- rdp_last = NULL;
- rdp = rdp_old_leader;
- do {
- rdp->nocb_leader = rdp_spawn;
- if (rdp_last && rdp != rdp_spawn)
- rdp_last->nocb_next_follower = rdp;
- if (rdp == rdp_spawn) {
- rdp = rdp->nocb_next_follower;
- } else {
- rdp_last = rdp;
- rdp = rdp->nocb_next_follower;
- rdp_last->nocb_next_follower = NULL;
- }
- } while (rdp);
- rdp_spawn->nocb_next_follower = rdp_old_leader;
- }
-
- /* Spawn the kthread for this CPU and RCU flavor. */
- t = kthread_run(rcu_nocb_kthread, rdp_spawn,
- "rcuo%c/%d", rsp->abbr, cpu);
- BUG_ON(IS_ERR(t));
- ACCESS_ONCE(rdp_spawn->nocb_kthread) = t;
-}
-
-/*
- * If the specified CPU is a no-CBs CPU that does not already have its
- * rcuo kthreads, spawn them.
- */
-static void rcu_spawn_all_nocb_kthreads(int cpu)
-{
- struct rcu_state *rsp;
-
- if (rcu_scheduler_fully_active)
- for_each_rcu_flavor(rsp)
- rcu_spawn_one_nocb_kthread(rsp, cpu);
-}
-
-/*
- * Once the scheduler is running, spawn rcuo kthreads for all online
- * no-CBs CPUs. This assumes that the early_initcall()s happen before
- * non-boot CPUs come online -- if this changes, we will need to add
- * some mutual exclusion.
- */
-static void __init rcu_spawn_nocb_kthreads(void)
-{
- int cpu;
-
- for_each_online_cpu(cpu)
- rcu_spawn_all_nocb_kthreads(cpu);
-}
-
-/* How many follower CPU IDs per leader? Default of -1 for sqrt(nr_cpu_ids). */
-static int rcu_nocb_leader_stride = -1;
-module_param(rcu_nocb_leader_stride, int, 0444);
-
-/*
- * Initialize leader-follower relationships for all no-CBs CPU.
- */
-static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp)
-{
- int cpu;
- int ls = rcu_nocb_leader_stride;
- int nl = 0; /* Next leader. */
- struct rcu_data *rdp;
- struct rcu_data *rdp_leader = NULL; /* Suppress misguided gcc warn. */
- struct rcu_data *rdp_prev = NULL;
-
- if (!have_rcu_nocb_mask)
- return;
- if (ls == -1) {
- ls = int_sqrt(nr_cpu_ids);
- rcu_nocb_leader_stride = ls;
- }
-
- /*
- * Each pass through this loop sets up one rcu_data structure and
- * spawns one rcu_nocb_kthread().
- */
- for_each_cpu(cpu, rcu_nocb_mask) {
- rdp = per_cpu_ptr(rsp->rda, cpu);
- if (rdp->cpu >= nl) {
- /* New leader, set up for followers & next leader. */
- nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls;
- rdp->nocb_leader = rdp;
- rdp_leader = rdp;
- } else {
- /* Another follower, link to previous leader. */
- rdp->nocb_leader = rdp_leader;
- rdp_prev->nocb_next_follower = rdp;
- }
- rdp_prev = rdp;
- }
-}
-
-/* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */
-static bool init_nocb_callback_list(struct rcu_data *rdp)
-{
- if (!rcu_is_nocb_cpu(rdp->cpu))
- return false;
-
- /* If there are early-boot callbacks, move them to nocb lists. */
- if (rdp->nxtlist) {
- rdp->nocb_head = rdp->nxtlist;
- rdp->nocb_tail = rdp->nxttail[RCU_NEXT_TAIL];
- atomic_long_set(&rdp->nocb_q_count, rdp->qlen);
- atomic_long_set(&rdp->nocb_q_count_lazy, rdp->qlen_lazy);
- rdp->nxtlist = NULL;
- rdp->qlen = 0;
- rdp->qlen_lazy = 0;
- }
- rdp->nxttail[RCU_NEXT_TAIL] = NULL;
- return true;
-}
-
-#else /* #ifdef CONFIG_RCU_NOCB_CPU */
-
-static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
-{
- WARN_ON_ONCE(1); /* Should be dead code. */
- return false;
-}
-
-static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
-{
-}
-
-static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
-{
-}
-
-static void rcu_init_one_nocb(struct rcu_node *rnp)
-{
-}
-
-static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
- bool lazy, unsigned long flags)
-{
- return false;
-}
-
-static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
- struct rcu_data *rdp,
- unsigned long flags)
-{
- return false;
-}
-
-static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
-{
-}
-
-static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
-{
- return false;
-}
-
-static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
-{
-}
-
-static void rcu_spawn_all_nocb_kthreads(int cpu)
-{
-}
-
-static void __init rcu_spawn_nocb_kthreads(void)
-{
-}
-
-static bool init_nocb_callback_list(struct rcu_data *rdp)
-{
- return false;
-}
-
-#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
-
-/*
- * An adaptive-ticks CPU can potentially execute in kernel mode for an
- * arbitrarily long period of time with the scheduling-clock tick turned
- * off. RCU will be paying attention to this CPU because it is in the
- * kernel, but the CPU cannot be guaranteed to be executing the RCU state
- * machine because the scheduling-clock tick has been disabled. Therefore,
- * if an adaptive-ticks CPU is failing to respond to the current grace
- * period and has not be idle from an RCU perspective, kick it.
- */
-static void __maybe_unused rcu_kick_nohz_cpu(int cpu)
-{
-#ifdef CONFIG_NO_HZ_FULL
- if (tick_nohz_full_cpu(cpu))
- smp_send_reschedule(cpu);
-#endif /* #ifdef CONFIG_NO_HZ_FULL */
-}
-
-
-#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
-
-static int full_sysidle_state; /* Current system-idle state. */
-#define RCU_SYSIDLE_NOT 0 /* Some CPU is not idle. */
-#define RCU_SYSIDLE_SHORT 1 /* All CPUs idle for brief period. */
-#define RCU_SYSIDLE_LONG 2 /* All CPUs idle for long enough. */
-#define RCU_SYSIDLE_FULL 3 /* All CPUs idle, ready for sysidle. */
-#define RCU_SYSIDLE_FULL_NOTED 4 /* Actually entered sysidle state. */
-
-/*
- * Invoked to note exit from irq or task transition to idle. Note that
- * usermode execution does -not- count as idle here! After all, we want
- * to detect full-system idle states, not RCU quiescent states and grace
- * periods. The caller must have disabled interrupts.
- */
-static void rcu_sysidle_enter(int irq)
-{
- unsigned long j;
- struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
-
- /* If there are no nohz_full= CPUs, no need to track this. */
- if (!tick_nohz_full_enabled())
- return;
-
- /* Adjust nesting, check for fully idle. */
- if (irq) {
- rdtp->dynticks_idle_nesting--;
- WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0);
- if (rdtp->dynticks_idle_nesting != 0)
- return; /* Still not fully idle. */
- } else {
- if ((rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) ==
- DYNTICK_TASK_NEST_VALUE) {
- rdtp->dynticks_idle_nesting = 0;
- } else {
- rdtp->dynticks_idle_nesting -= DYNTICK_TASK_NEST_VALUE;
- WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0);
- return; /* Still not fully idle. */
- }
- }
-
- /* Record start of fully idle period. */
- j = jiffies;
- ACCESS_ONCE(rdtp->dynticks_idle_jiffies) = j;
- smp_mb__before_atomic();
- atomic_inc(&rdtp->dynticks_idle);
- smp_mb__after_atomic();
- WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1);
-}
-
-/*
- * Unconditionally force exit from full system-idle state. This is
- * invoked when a normal CPU exits idle, but must be called separately
- * for the timekeeping CPU (tick_do_timer_cpu). The reason for this
- * is that the timekeeping CPU is permitted to take scheduling-clock
- * interrupts while the system is in system-idle state, and of course
- * rcu_sysidle_exit() has no way of distinguishing a scheduling-clock
- * interrupt from any other type of interrupt.
- */
-void rcu_sysidle_force_exit(void)
-{
- int oldstate = ACCESS_ONCE(full_sysidle_state);
- int newoldstate;
-
- /*
- * Each pass through the following loop attempts to exit full
- * system-idle state. If contention proves to be a problem,
- * a trylock-based contention tree could be used here.
- */
- while (oldstate > RCU_SYSIDLE_SHORT) {
- newoldstate = cmpxchg(&full_sysidle_state,
- oldstate, RCU_SYSIDLE_NOT);
- if (oldstate == newoldstate &&
- oldstate == RCU_SYSIDLE_FULL_NOTED) {
- rcu_kick_nohz_cpu(tick_do_timer_cpu);
- return; /* We cleared it, done! */
- }
- oldstate = newoldstate;
- }
- smp_mb(); /* Order initial oldstate fetch vs. later non-idle work. */
-}
-
-/*
- * Invoked to note entry to irq or task transition from idle. Note that
- * usermode execution does -not- count as idle here! The caller must
- * have disabled interrupts.
- */
-static void rcu_sysidle_exit(int irq)
-{
- struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
-
- /* If there are no nohz_full= CPUs, no need to track this. */
- if (!tick_nohz_full_enabled())
- return;
-
- /* Adjust nesting, check for already non-idle. */
- if (irq) {
- rdtp->dynticks_idle_nesting++;
- WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0);
- if (rdtp->dynticks_idle_nesting != 1)
- return; /* Already non-idle. */
- } else {
- /*
- * Allow for irq misnesting. Yes, it really is possible
- * to enter an irq handler then never leave it, and maybe
- * also vice versa. Handle both possibilities.
- */
- if (rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) {
- rdtp->dynticks_idle_nesting += DYNTICK_TASK_NEST_VALUE;
- WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0);
- return; /* Already non-idle. */
- } else {
- rdtp->dynticks_idle_nesting = DYNTICK_TASK_EXIT_IDLE;
- }
- }
-
- /* Record end of idle period. */
- smp_mb__before_atomic();
- atomic_inc(&rdtp->dynticks_idle);
- smp_mb__after_atomic();
- WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1));
-
- /*
- * If we are the timekeeping CPU, we are permitted to be non-idle
- * during a system-idle state. This must be the case, because
- * the timekeeping CPU has to take scheduling-clock interrupts
- * during the time that the system is transitioning to full
- * system-idle state. This means that the timekeeping CPU must
- * invoke rcu_sysidle_force_exit() directly if it does anything
- * more than take a scheduling-clock interrupt.
- */
- if (smp_processor_id() == tick_do_timer_cpu)
- return;
-
- /* Update system-idle state: We are clearly no longer fully idle! */
- rcu_sysidle_force_exit();
-}
-
-/*
- * Check to see if the current CPU is idle. Note that usermode execution
- * does not count as idle. The caller must have disabled interrupts,
- * and must be running on tick_do_timer_cpu.
- */
-static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
- unsigned long *maxj)
-{
- int cur;
- unsigned long j;
- struct rcu_dynticks *rdtp = rdp->dynticks;
-
- /* If there are no nohz_full= CPUs, don't check system-wide idleness. */
- if (!tick_nohz_full_enabled())
- return;
-
- /*
- * If some other CPU has already reported non-idle, if this is
- * not the flavor of RCU that tracks sysidle state, or if this
- * is an offline or the timekeeping CPU, nothing to do.
- */
- if (!*isidle || rdp->rsp != rcu_state_p ||
- cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu)
- return;
- /* Verify affinity of current kthread. */
- WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu);
-
- /* Pick up current idle and NMI-nesting counter and check. */
- cur = atomic_read(&rdtp->dynticks_idle);
- if (cur & 0x1) {
- *isidle = false; /* We are not idle! */
- return;
- }
- smp_mb(); /* Read counters before timestamps. */
-
- /* Pick up timestamps. */
- j = ACCESS_ONCE(rdtp->dynticks_idle_jiffies);
- /* If this CPU entered idle more recently, update maxj timestamp. */
- if (ULONG_CMP_LT(*maxj, j))
- *maxj = j;
-}
-
-/*
- * Is this the flavor of RCU that is handling full-system idle?
- */
-static bool is_sysidle_rcu_state(struct rcu_state *rsp)
-{
- return rsp == rcu_state_p;
-}
-
-/*
- * Return a delay in jiffies based on the number of CPUs, rcu_node
- * leaf fanout, and jiffies tick rate. The idea is to allow larger
- * systems more time to transition to full-idle state in order to
- * avoid the cache thrashing that otherwise occur on the state variable.
- * Really small systems (less than a couple of tens of CPUs) should
- * instead use a single global atomically incremented counter, and later
- * versions of this will automatically reconfigure themselves accordingly.
- */
-static unsigned long rcu_sysidle_delay(void)
-{
- if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL)
- return 0;
- return DIV_ROUND_UP(nr_cpu_ids * HZ, rcu_fanout_leaf * 1000);
-}
-
-/*
- * Advance the full-system-idle state. This is invoked when all of
- * the non-timekeeping CPUs are idle.
- */
-static void rcu_sysidle(unsigned long j)
-{
- /* Check the current state. */
- switch (ACCESS_ONCE(full_sysidle_state)) {
- case RCU_SYSIDLE_NOT:
-
- /* First time all are idle, so note a short idle period. */
- ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_SHORT;
- break;
-
- case RCU_SYSIDLE_SHORT:
-
- /*
- * Idle for a bit, time to advance to next state?
- * cmpxchg failure means race with non-idle, let them win.
- */
- if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay()))
- (void)cmpxchg(&full_sysidle_state,
- RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG);
- break;
-
- case RCU_SYSIDLE_LONG:
-
- /*
- * Do an additional check pass before advancing to full.
- * cmpxchg failure means race with non-idle, let them win.
- */
- if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay()))
- (void)cmpxchg(&full_sysidle_state,
- RCU_SYSIDLE_LONG, RCU_SYSIDLE_FULL);
- break;
-
- default:
- break;
- }
-}
-
-/*
- * Found a non-idle non-timekeeping CPU, so kick the system-idle state
- * back to the beginning.
- */
-static void rcu_sysidle_cancel(void)
-{
- smp_mb();
- if (full_sysidle_state > RCU_SYSIDLE_SHORT)
- ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT;
-}
-
-/*
- * Update the sysidle state based on the results of a force-quiescent-state
- * scan of the CPUs' dyntick-idle state.
- */
-static void rcu_sysidle_report(struct rcu_state *rsp, int isidle,
- unsigned long maxj, bool gpkt)
-{
- if (rsp != rcu_state_p)
- return; /* Wrong flavor, ignore. */
- if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL)
- return; /* Running state machine from timekeeping CPU. */
- if (isidle)
- rcu_sysidle(maxj); /* More idle! */
- else
- rcu_sysidle_cancel(); /* Idle is over. */
-}
-
-/*
- * Wrapper for rcu_sysidle_report() when called from the grace-period
- * kthread's context.
- */
-static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
- unsigned long maxj)
-{
- /* If there are no nohz_full= CPUs, no need to track this. */
- if (!tick_nohz_full_enabled())
- return;
-
- rcu_sysidle_report(rsp, isidle, maxj, true);
-}
-
-/* Callback and function for forcing an RCU grace period. */
-struct rcu_sysidle_head {
- struct rcu_head rh;
- int inuse;
-};
-
-static void rcu_sysidle_cb(struct rcu_head *rhp)
-{
- struct rcu_sysidle_head *rshp;
-
- /*
- * The following memory barrier is needed to replace the
- * memory barriers that would normally be in the memory
- * allocator.
- */
- smp_mb(); /* grace period precedes setting inuse. */
-
- rshp = container_of(rhp, struct rcu_sysidle_head, rh);
- ACCESS_ONCE(rshp->inuse) = 0;
-}
-
-/*
- * Check to see if the system is fully idle, other than the timekeeping CPU.
- * The caller must have disabled interrupts. This is not intended to be
- * called unless tick_nohz_full_enabled().
- */
-bool rcu_sys_is_idle(void)
-{
- static struct rcu_sysidle_head rsh;
- int rss = ACCESS_ONCE(full_sysidle_state);
-
- if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu))
- return false;
-
- /* Handle small-system case by doing a full scan of CPUs. */
- if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) {
- int oldrss = rss - 1;
-
- /*
- * One pass to advance to each state up to _FULL.
- * Give up if any pass fails to advance the state.
- */
- while (rss < RCU_SYSIDLE_FULL && oldrss < rss) {
- int cpu;
- bool isidle = true;
- unsigned long maxj = jiffies - ULONG_MAX / 4;
- struct rcu_data *rdp;
-
- /* Scan all the CPUs looking for nonidle CPUs. */
- for_each_possible_cpu(cpu) {
- rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
- rcu_sysidle_check_cpu(rdp, &isidle, &maxj);
- if (!isidle)
- break;
- }
- rcu_sysidle_report(rcu_state_p, isidle, maxj, false);
- oldrss = rss;
- rss = ACCESS_ONCE(full_sysidle_state);
- }
- }
-
- /* If this is the first observation of an idle period, record it. */
- if (rss == RCU_SYSIDLE_FULL) {
- rss = cmpxchg(&full_sysidle_state,
- RCU_SYSIDLE_FULL, RCU_SYSIDLE_FULL_NOTED);
- return rss == RCU_SYSIDLE_FULL;
- }
-
- smp_mb(); /* ensure rss load happens before later caller actions. */
-
- /* If already fully idle, tell the caller (in case of races). */
- if (rss == RCU_SYSIDLE_FULL_NOTED)
- return true;
-
- /*
- * If we aren't there yet, and a grace period is not in flight,
- * initiate a grace period. Either way, tell the caller that
- * we are not there yet. We use an xchg() rather than an assignment
- * to make up for the memory barriers that would otherwise be
- * provided by the memory allocator.
- */
- if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL &&
- !rcu_gp_in_progress(rcu_state_p) &&
- !rsh.inuse && xchg(&rsh.inuse, 1) == 0)
- call_rcu(&rsh.rh, rcu_sysidle_cb);
- return false;
-}
-
-/*
- * Initialize dynticks sysidle state for CPUs coming online.
- */
-static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
-{
- rdtp->dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE;
-}
-
-#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
-
-static void rcu_sysidle_enter(int irq)
-{
-}
-
-static void rcu_sysidle_exit(int irq)
-{
-}
-
-static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
- unsigned long *maxj)
-{
-}
-
-static bool is_sysidle_rcu_state(struct rcu_state *rsp)
-{
- return false;
-}
-
-static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
- unsigned long maxj)
-{
-}
-
-static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
-{
-}
-
-#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
-
/*
* Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the
* grace-period kthread will do force_quiescent_state() processing?
@@ -3041,50 +1342,25 @@ static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
* CPU unless the grace period has extended for too long.
*
* This code relies on the fact that all NO_HZ_FULL CPUs are also
- * CONFIG_RCU_NOCB_CPU CPUs.
+ * RCU_NOCB_CPU CPUs.
*/
-static bool rcu_nohz_full_cpu(struct rcu_state *rsp)
+static bool rcu_nohz_full_cpu(void)
{
#ifdef CONFIG_NO_HZ_FULL
if (tick_nohz_full_cpu(smp_processor_id()) &&
- (!rcu_gp_in_progress(rsp) ||
- ULONG_CMP_LT(jiffies, ACCESS_ONCE(rsp->gp_start) + HZ)))
- return 1;
+ (!rcu_gp_in_progress() ||
+ time_before(jiffies, READ_ONCE(rcu_state.gp_start) + HZ)))
+ return true;
#endif /* #ifdef CONFIG_NO_HZ_FULL */
- return 0;
+ return false;
}
/*
- * Bind the grace-period kthread for the sysidle flavor of RCU to the
- * timekeeping CPU.
+ * Bind the RCU grace-period kthreads to the housekeeping CPU.
*/
static void rcu_bind_gp_kthread(void)
{
- int __maybe_unused cpu;
-
if (!tick_nohz_full_enabled())
return;
-#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
- cpu = tick_do_timer_cpu;
- if (cpu >= 0 && cpu < nr_cpu_ids)
- set_cpus_allowed_ptr(current, cpumask_of(cpu));
-#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
- housekeeping_affine(current);
-#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
-}
-
-/* Record the current task on dyntick-idle entry. */
-static void rcu_dynticks_task_enter(void)
-{
-#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
- ACCESS_ONCE(current->rcu_tasks_idle_cpu) = smp_processor_id();
-#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
-}
-
-/* Record no current task on dyntick-idle exit. */
-static void rcu_dynticks_task_exit(void)
-{
-#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
- ACCESS_ONCE(current->rcu_tasks_idle_cpu) = -1;
-#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
+ housekeeping_affine(current, HK_TYPE_RCU);
}
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
new file mode 100644
index 000000000000..b67532cb8770
--- /dev/null
+++ b/kernel/rcu/tree_stall.h
@@ -0,0 +1,1185 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * RCU CPU stall warnings for normal RCU grace periods
+ *
+ * Copyright IBM Corporation, 2019
+ *
+ * Author: Paul E. McKenney <paulmck@linux.ibm.com>
+ */
+
+#include <linux/console.h>
+#include <linux/kvm_para.h>
+#include <linux/rcu_notifier.h>
+#include <linux/smp.h>
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// Controlling CPU stall warnings, including delay calculation.
+
+/* panic() on RCU Stall sysctl. */
+static int sysctl_panic_on_rcu_stall __read_mostly;
+static int sysctl_max_rcu_stall_to_panic __read_mostly;
+
+static const struct ctl_table rcu_stall_sysctl_table[] = {
+ {
+ .procname = "panic_on_rcu_stall",
+ .data = &sysctl_panic_on_rcu_stall,
+ .maxlen = sizeof(sysctl_panic_on_rcu_stall),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
+ .procname = "max_rcu_stall_to_panic",
+ .data = &sysctl_max_rcu_stall_to_panic,
+ .maxlen = sizeof(sysctl_max_rcu_stall_to_panic),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = SYSCTL_INT_MAX,
+ },
+};
+
+static int __init init_rcu_stall_sysctl(void)
+{
+ register_sysctl_init("kernel", rcu_stall_sysctl_table);
+ return 0;
+}
+
+subsys_initcall(init_rcu_stall_sysctl);
+
+#ifdef CONFIG_SYSFS
+
+static unsigned int rcu_stall_count;
+
+static ssize_t rcu_stall_count_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *page)
+{
+ return sysfs_emit(page, "%u\n", rcu_stall_count);
+}
+
+static struct kobj_attribute rcu_stall_count_attr = __ATTR_RO(rcu_stall_count);
+
+static __init int kernel_rcu_stall_sysfs_init(void)
+{
+ sysfs_add_file_to_group(kernel_kobj, &rcu_stall_count_attr.attr, NULL);
+ return 0;
+}
+
+late_initcall(kernel_rcu_stall_sysfs_init);
+
+#endif // CONFIG_SYSFS
+
+#ifdef CONFIG_PROVE_RCU
+#define RCU_STALL_DELAY_DELTA (5 * HZ)
+#else
+#define RCU_STALL_DELAY_DELTA 0
+#endif
+#define RCU_STALL_MIGHT_DIV 8
+#define RCU_STALL_MIGHT_MIN (2 * HZ)
+
+int rcu_exp_jiffies_till_stall_check(void)
+{
+ int cpu_stall_timeout = READ_ONCE(rcu_exp_cpu_stall_timeout);
+ int exp_stall_delay_delta = 0;
+ int till_stall_check;
+
+ // Zero says to use rcu_cpu_stall_timeout, but in milliseconds.
+ if (!cpu_stall_timeout)
+ cpu_stall_timeout = jiffies_to_msecs(rcu_jiffies_till_stall_check());
+
+ // Limit check must be consistent with the Kconfig limits for
+ // CONFIG_RCU_EXP_CPU_STALL_TIMEOUT, so check the allowed range.
+ // The minimum clamped value is "2UL", because at least one full
+ // tick has to be guaranteed.
+ till_stall_check = clamp(msecs_to_jiffies(cpu_stall_timeout), 2UL, 300UL * HZ);
+
+ if (cpu_stall_timeout && jiffies_to_msecs(till_stall_check) != cpu_stall_timeout)
+ WRITE_ONCE(rcu_exp_cpu_stall_timeout, jiffies_to_msecs(till_stall_check));
+
+#ifdef CONFIG_PROVE_RCU
+ /* Add extra ~25% out of till_stall_check. */
+ exp_stall_delay_delta = ((till_stall_check * 25) / 100) + 1;
+#endif
+
+ return till_stall_check + exp_stall_delay_delta;
+}
+EXPORT_SYMBOL_GPL(rcu_exp_jiffies_till_stall_check);
+
+/* Limit-check stall timeouts specified at boottime and runtime. */
+int rcu_jiffies_till_stall_check(void)
+{
+ int till_stall_check = READ_ONCE(rcu_cpu_stall_timeout);
+
+ /*
+ * Limit check must be consistent with the Kconfig limits
+ * for CONFIG_RCU_CPU_STALL_TIMEOUT.
+ */
+ if (till_stall_check < 3) {
+ WRITE_ONCE(rcu_cpu_stall_timeout, 3);
+ till_stall_check = 3;
+ } else if (till_stall_check > 300) {
+ WRITE_ONCE(rcu_cpu_stall_timeout, 300);
+ till_stall_check = 300;
+ }
+ return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
+}
+EXPORT_SYMBOL_GPL(rcu_jiffies_till_stall_check);
+
+/* Don't do RCU CPU stall warnings during long sysrq printouts. */
+void rcu_sysrq_start(void)
+{
+ if (!rcu_cpu_stall_suppress)
+ rcu_cpu_stall_suppress = 2;
+}
+
+void rcu_sysrq_end(void)
+{
+ if (rcu_cpu_stall_suppress == 2)
+ rcu_cpu_stall_suppress = 0;
+}
+
+/* Don't print RCU CPU stall warnings during a kernel panic. */
+static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
+{
+ rcu_cpu_stall_suppress = 1;
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block rcu_panic_block = {
+ .notifier_call = rcu_panic,
+};
+
+static int __init check_cpu_stall_init(void)
+{
+ atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
+ return 0;
+}
+early_initcall(check_cpu_stall_init);
+
+/* If so specified via sysctl, panic, yielding cleaner stall-warning output. */
+static void panic_on_rcu_stall(void)
+{
+ static int cpu_stall;
+
+ /*
+ * Attempt to kick out the BPF scheduler if it's installed and defer
+ * the panic to give the system a chance to recover.
+ */
+ if (scx_rcu_cpu_stall())
+ return;
+
+ if (++cpu_stall < sysctl_max_rcu_stall_to_panic)
+ return;
+
+ if (sysctl_panic_on_rcu_stall)
+ panic("RCU Stall\n");
+}
+
+/**
+ * rcu_cpu_stall_reset - restart stall-warning timeout for current grace period
+ *
+ * To perform the reset request from the caller, disable stall detection until
+ * 3 fqs loops have passed. This is required to ensure a fresh jiffies is
+ * loaded. It should be safe to do from the fqs loop as enough timer
+ * interrupts and context switches should have passed.
+ *
+ * The caller must disable hard irqs.
+ */
+void rcu_cpu_stall_reset(void)
+{
+ WRITE_ONCE(rcu_state.nr_fqs_jiffies_stall, 3);
+ WRITE_ONCE(rcu_state.jiffies_stall, ULONG_MAX);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// Interaction with RCU grace periods
+
+/* Start of new grace period, so record stall time (and forcing times). */
+static void record_gp_stall_check_time(void)
+{
+ unsigned long j = jiffies;
+ unsigned long j1;
+
+ WRITE_ONCE(rcu_state.gp_start, j);
+ j1 = rcu_jiffies_till_stall_check();
+ smp_mb(); // ->gp_start before ->jiffies_stall and caller's ->gp_seq.
+ WRITE_ONCE(rcu_state.nr_fqs_jiffies_stall, 0);
+ WRITE_ONCE(rcu_state.jiffies_stall, j + j1);
+ rcu_state.jiffies_resched = j + j1 / 2;
+ rcu_state.n_force_qs_gpstart = READ_ONCE(rcu_state.n_force_qs);
+}
+
+/* Zero ->ticks_this_gp and snapshot the number of RCU softirq handlers. */
+static void zero_cpu_stall_ticks(struct rcu_data *rdp)
+{
+ rdp->ticks_this_gp = 0;
+ rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id());
+ WRITE_ONCE(rdp->last_fqs_resched, jiffies);
+}
+
+/*
+ * If too much time has passed in the current grace period, and if
+ * so configured, go kick the relevant kthreads.
+ */
+static void rcu_stall_kick_kthreads(void)
+{
+ unsigned long j;
+
+ if (!READ_ONCE(rcu_kick_kthreads))
+ return;
+ j = READ_ONCE(rcu_state.jiffies_kick_kthreads);
+ if (time_after(jiffies, j) && rcu_state.gp_kthread &&
+ (rcu_gp_in_progress() || READ_ONCE(rcu_state.gp_flags))) {
+ WARN_ONCE(1, "Kicking %s grace-period kthread\n",
+ rcu_state.name);
+ rcu_ftrace_dump(DUMP_ALL);
+ wake_up_process(rcu_state.gp_kthread);
+ WRITE_ONCE(rcu_state.jiffies_kick_kthreads, j + HZ);
+ }
+}
+
+/*
+ * Handler for the irq_work request posted about halfway into the RCU CPU
+ * stall timeout, and used to detect excessive irq disabling. Set state
+ * appropriately, but just complain if there is unexpected state on entry.
+ */
+static void rcu_iw_handler(struct irq_work *iwp)
+{
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+
+ rdp = container_of(iwp, struct rcu_data, rcu_iw);
+ rnp = rdp->mynode;
+ raw_spin_lock_rcu_node(rnp);
+ if (!WARN_ON_ONCE(!rdp->rcu_iw_pending)) {
+ rdp->rcu_iw_gp_seq = rnp->gp_seq;
+ rdp->rcu_iw_pending = false;
+ }
+ raw_spin_unlock_rcu_node(rnp);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// Printing RCU CPU stall warnings
+
+#ifdef CONFIG_PREEMPT_RCU
+
+/*
+ * Dump detailed information for all tasks blocking the current RCU
+ * grace period on the specified rcu_node structure.
+ */
+static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
+{
+ unsigned long flags;
+ struct task_struct *t;
+
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ if (!rcu_preempt_blocked_readers_cgp(rnp)) {
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ return;
+ }
+ t = list_entry(rnp->gp_tasks->prev,
+ struct task_struct, rcu_node_entry);
+ list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
+ /*
+ * We could be printing a lot while holding a spinlock.
+ * Avoid triggering hard lockup.
+ */
+ touch_nmi_watchdog();
+ sched_show_task(t);
+ }
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+}
+
+// Communicate task state back to the RCU CPU stall warning request.
+struct rcu_stall_chk_rdr {
+ int nesting;
+ union rcu_special rs;
+ bool on_blkd_list;
+};
+
+/*
+ * Report out the state of a not-running task that is stalling the
+ * current RCU grace period.
+ */
+static int check_slow_task(struct task_struct *t, void *arg)
+{
+ struct rcu_stall_chk_rdr *rscrp = arg;
+
+ if (task_curr(t))
+ return -EBUSY; // It is running, so decline to inspect it.
+ rscrp->nesting = t->rcu_read_lock_nesting;
+ rscrp->rs = t->rcu_read_unlock_special;
+ rscrp->on_blkd_list = !list_empty(&t->rcu_node_entry);
+ return 0;
+}
+
+/*
+ * Scan the current list of tasks blocked within RCU read-side critical
+ * sections, printing out the tid of each of the first few of them.
+ */
+static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags)
+ __releases(rnp->lock)
+{
+ int i = 0;
+ int ndetected = 0;
+ struct rcu_stall_chk_rdr rscr;
+ struct task_struct *t;
+ struct task_struct *ts[8];
+
+ lockdep_assert_irqs_disabled();
+ if (!rcu_preempt_blocked_readers_cgp(rnp)) {
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ return 0;
+ }
+ pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):",
+ rnp->level, rnp->grplo, rnp->grphi);
+ t = list_entry(rnp->gp_tasks->prev,
+ struct task_struct, rcu_node_entry);
+ list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
+ get_task_struct(t);
+ ts[i++] = t;
+ if (i >= ARRAY_SIZE(ts))
+ break;
+ }
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ while (i) {
+ t = ts[--i];
+ if (task_call_func(t, check_slow_task, &rscr))
+ pr_cont(" P%d", t->pid);
+ else
+ pr_cont(" P%d/%d:%c%c%c%c",
+ t->pid, rscr.nesting,
+ ".b"[rscr.rs.b.blocked],
+ ".q"[rscr.rs.b.need_qs],
+ ".e"[rscr.rs.b.exp_hint],
+ ".l"[rscr.on_blkd_list]);
+ lockdep_assert_irqs_disabled();
+ put_task_struct(t);
+ ndetected++;
+ }
+ pr_cont("\n");
+ return ndetected;
+}
+
+#else /* #ifdef CONFIG_PREEMPT_RCU */
+
+/*
+ * Because preemptible RCU does not exist, we never have to check for
+ * tasks blocked within RCU read-side critical sections.
+ */
+static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
+{
+}
+
+/*
+ * Because preemptible RCU does not exist, we never have to check for
+ * tasks blocked within RCU read-side critical sections.
+ */
+static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags)
+ __releases(rnp->lock)
+{
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ return 0;
+}
+#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
+
+/*
+ * Dump stacks of all tasks running on stalled CPUs. First try using
+ * NMIs, but fall back to manual remote stack tracing on architectures
+ * that don't support NMI-based stack dumps. The NMI-triggered stack
+ * traces are more accurate because they are printed by the target CPU.
+ */
+static void rcu_dump_cpu_stacks(unsigned long gp_seq)
+{
+ int cpu;
+ unsigned long flags;
+ struct rcu_node *rnp;
+
+ rcu_for_each_leaf_node(rnp) {
+ printk_deferred_enter();
+ for_each_leaf_node_possible_cpu(rnp, cpu) {
+ if (gp_seq != data_race(rcu_state.gp_seq)) {
+ printk_deferred_exit();
+ pr_err("INFO: Stall ended during stack backtracing.\n");
+ return;
+ }
+ if (!(data_race(rnp->qsmask) & leaf_node_cpu_bit(rnp, cpu)))
+ continue;
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) {
+ if (cpu_is_offline(cpu))
+ pr_err("Offline CPU %d blocking current GP.\n", cpu);
+ else
+ dump_cpu_task(cpu);
+ }
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ }
+ printk_deferred_exit();
+ }
+}
+
+static const char * const gp_state_names[] = {
+ [RCU_GP_IDLE] = "RCU_GP_IDLE",
+ [RCU_GP_WAIT_GPS] = "RCU_GP_WAIT_GPS",
+ [RCU_GP_DONE_GPS] = "RCU_GP_DONE_GPS",
+ [RCU_GP_ONOFF] = "RCU_GP_ONOFF",
+ [RCU_GP_INIT] = "RCU_GP_INIT",
+ [RCU_GP_WAIT_FQS] = "RCU_GP_WAIT_FQS",
+ [RCU_GP_DOING_FQS] = "RCU_GP_DOING_FQS",
+ [RCU_GP_CLEANUP] = "RCU_GP_CLEANUP",
+ [RCU_GP_CLEANED] = "RCU_GP_CLEANED",
+};
+
+/*
+ * Convert a ->gp_state value to a character string.
+ */
+static const char *gp_state_getname(short gs)
+{
+ if (gs < 0 || gs >= ARRAY_SIZE(gp_state_names))
+ return "???";
+ return gp_state_names[gs];
+}
+
+/* Is the RCU grace-period kthread being starved of CPU time? */
+static bool rcu_is_gp_kthread_starving(unsigned long *jp)
+{
+ unsigned long j = jiffies - READ_ONCE(rcu_state.gp_activity);
+
+ if (jp)
+ *jp = j;
+ return j > 2 * HZ;
+}
+
+static bool rcu_is_rcuc_kthread_starving(struct rcu_data *rdp, unsigned long *jp)
+{
+ int cpu;
+ struct task_struct *rcuc;
+ unsigned long j;
+
+ rcuc = rdp->rcu_cpu_kthread_task;
+ if (!rcuc)
+ return false;
+
+ cpu = task_cpu(rcuc);
+ if (cpu_is_offline(cpu) || idle_cpu(cpu))
+ return false;
+
+ j = jiffies - READ_ONCE(rdp->rcuc_activity);
+
+ if (jp)
+ *jp = j;
+ return j > 2 * HZ;
+}
+
+static void print_cpu_stat_info(int cpu)
+{
+ struct rcu_snap_record rsr, *rsrp;
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ struct kernel_cpustat *kcsp = &kcpustat_cpu(cpu);
+
+ if (!rcu_cpu_stall_cputime)
+ return;
+
+ rsrp = &rdp->snap_record;
+ if (rsrp->gp_seq != rdp->gp_seq)
+ return;
+
+ rsr.cputime_irq = kcpustat_field(kcsp, CPUTIME_IRQ, cpu);
+ rsr.cputime_softirq = kcpustat_field(kcsp, CPUTIME_SOFTIRQ, cpu);
+ rsr.cputime_system = kcpustat_field(kcsp, CPUTIME_SYSTEM, cpu);
+
+ pr_err("\t hardirqs softirqs csw/system\n");
+ pr_err("\t number: %8lld %10d %12lld\n",
+ kstat_cpu_irqs_sum(cpu) + arch_irq_stat_cpu(cpu) - rsrp->nr_hardirqs,
+ kstat_cpu_softirqs_sum(cpu) - rsrp->nr_softirqs,
+ nr_context_switches_cpu(cpu) - rsrp->nr_csw);
+ pr_err("\tcputime: %8lld %10lld %12lld ==> %d(ms)\n",
+ div_u64(rsr.cputime_irq - rsrp->cputime_irq, NSEC_PER_MSEC),
+ div_u64(rsr.cputime_softirq - rsrp->cputime_softirq, NSEC_PER_MSEC),
+ div_u64(rsr.cputime_system - rsrp->cputime_system, NSEC_PER_MSEC),
+ jiffies_to_msecs(jiffies - rsrp->jiffies));
+}
+
+/*
+ * Print out diagnostic information for the specified stalled CPU.
+ *
+ * If the specified CPU is aware of the current RCU grace period, then
+ * print the number of scheduling clock interrupts the CPU has taken
+ * during the time that it has been aware. Otherwise, print the number
+ * of RCU grace periods that this CPU is ignorant of, for example, "1"
+ * if the CPU was aware of the previous grace period.
+ *
+ * Also print out idle info.
+ */
+static void print_cpu_stall_info(int cpu)
+{
+ unsigned long delta;
+ bool falsepositive;
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ char *ticks_title;
+ unsigned long ticks_value;
+ bool rcuc_starved;
+ unsigned long j;
+ char buf[32];
+
+ /*
+ * We could be printing a lot while holding a spinlock. Avoid
+ * triggering hard lockup.
+ */
+ touch_nmi_watchdog();
+
+ ticks_value = rcu_seq_ctr(rcu_state.gp_seq - rdp->gp_seq);
+ if (ticks_value) {
+ ticks_title = "GPs behind";
+ } else {
+ ticks_title = "ticks this GP";
+ ticks_value = rdp->ticks_this_gp;
+ }
+ delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq);
+ falsepositive = rcu_is_gp_kthread_starving(NULL) &&
+ rcu_watching_snap_in_eqs(ct_rcu_watching_cpu(cpu));
+ rcuc_starved = rcu_is_rcuc_kthread_starving(rdp, &j);
+ if (rcuc_starved)
+ // Print signed value, as negative values indicate a probable bug.
+ snprintf(buf, sizeof(buf), " rcuc=%ld jiffies(starved)", j);
+ pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%04x/%ld/%#lx softirq=%u/%u fqs=%ld%s%s\n",
+ cpu,
+ "O."[!!cpu_online(cpu)],
+ "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)],
+ "N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)],
+ !IS_ENABLED(CONFIG_IRQ_WORK) ? '?' :
+ rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' :
+ "!."[!delta],
+ ticks_value, ticks_title,
+ ct_rcu_watching_cpu(cpu) & 0xffff,
+ ct_nesting_cpu(cpu), ct_nmi_nesting_cpu(cpu),
+ rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
+ data_race(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart,
+ rcuc_starved ? buf : "",
+ falsepositive ? " (false positive?)" : "");
+
+ print_cpu_stat_info(cpu);
+}
+
+/* Complain about starvation of grace-period kthread. */
+static void rcu_check_gp_kthread_starvation(void)
+{
+ int cpu;
+ struct task_struct *gpk = rcu_state.gp_kthread;
+ unsigned long j;
+
+ if (rcu_is_gp_kthread_starving(&j)) {
+ cpu = gpk ? task_cpu(gpk) : -1;
+ pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#x ->cpu=%d\n",
+ rcu_state.name, j,
+ (long)rcu_seq_current(&rcu_state.gp_seq),
+ data_race(READ_ONCE(rcu_state.gp_flags)),
+ gp_state_getname(rcu_state.gp_state),
+ data_race(READ_ONCE(rcu_state.gp_state)),
+ gpk ? data_race(READ_ONCE(gpk->__state)) : ~0, cpu);
+ if (gpk) {
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+
+ pr_err("\tUnless %s kthread gets sufficient CPU time, OOM is now expected behavior.\n", rcu_state.name);
+ pr_err("RCU grace-period kthread stack dump:\n");
+ sched_show_task(gpk);
+ if (cpu_is_offline(cpu)) {
+ pr_err("RCU GP kthread last ran on offline CPU %d.\n", cpu);
+ } else if (!(data_race(READ_ONCE(rdp->mynode->qsmask)) & rdp->grpmask)) {
+ pr_err("Stack dump where RCU GP kthread last ran:\n");
+ dump_cpu_task(cpu);
+ }
+ wake_up_process(gpk);
+ }
+ }
+}
+
+/* Complain about missing wakeups from expired fqs wait timer */
+static void rcu_check_gp_kthread_expired_fqs_timer(void)
+{
+ struct task_struct *gpk = rcu_state.gp_kthread;
+ short gp_state;
+ unsigned long jiffies_fqs;
+ int cpu;
+
+ /*
+ * Order reads of .gp_state and .jiffies_force_qs.
+ * Matching smp_wmb() is present in rcu_gp_fqs_loop().
+ */
+ gp_state = smp_load_acquire(&rcu_state.gp_state);
+ jiffies_fqs = READ_ONCE(rcu_state.jiffies_force_qs);
+
+ if (gp_state == RCU_GP_WAIT_FQS &&
+ time_after(jiffies, jiffies_fqs + RCU_STALL_MIGHT_MIN) &&
+ gpk && !READ_ONCE(gpk->on_rq)) {
+ cpu = task_cpu(gpk);
+ pr_err("%s kthread timer wakeup didn't happen for %ld jiffies! g%ld f%#x %s(%d) ->state=%#x\n",
+ rcu_state.name, (jiffies - jiffies_fqs),
+ (long)rcu_seq_current(&rcu_state.gp_seq),
+ data_race(READ_ONCE(rcu_state.gp_flags)), // Diagnostic read
+ gp_state_getname(RCU_GP_WAIT_FQS), RCU_GP_WAIT_FQS,
+ data_race(READ_ONCE(gpk->__state)));
+ pr_err("\tPossible timer handling issue on cpu=%d timer-softirq=%u\n",
+ cpu, kstat_softirqs_cpu(TIMER_SOFTIRQ, cpu));
+ }
+}
+
+static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
+{
+ int cpu;
+ unsigned long flags;
+ unsigned long gpa;
+ unsigned long j;
+ int ndetected = 0;
+ struct rcu_node *rnp;
+ long totqlen = 0;
+
+ lockdep_assert_irqs_disabled();
+
+ /* Kick and suppress, if so configured. */
+ rcu_stall_kick_kthreads();
+ if (rcu_stall_is_suppressed())
+ return;
+
+ nbcon_cpu_emergency_enter();
+
+ /*
+ * OK, time to rat on our buddy...
+ * See Documentation/RCU/stallwarn.rst for info on how to debug
+ * RCU CPU stall warnings.
+ */
+ trace_rcu_stall_warning(rcu_state.name, TPS("StallDetected"));
+ pr_err("INFO: %s detected stalls on CPUs/tasks:\n", rcu_state.name);
+ rcu_for_each_leaf_node(rnp) {
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ if (rnp->qsmask != 0) {
+ for_each_leaf_node_possible_cpu(rnp, cpu)
+ if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) {
+ print_cpu_stall_info(cpu);
+ ndetected++;
+ }
+ }
+ ndetected += rcu_print_task_stall(rnp, flags); // Releases rnp->lock.
+ lockdep_assert_irqs_disabled();
+ }
+
+ for_each_possible_cpu(cpu)
+ totqlen += rcu_get_n_cbs_cpu(cpu);
+ pr_err("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu ncpus=%d)\n",
+ smp_processor_id(), (long)(jiffies - gps),
+ (long)rcu_seq_current(&rcu_state.gp_seq), totqlen,
+ data_race(rcu_state.n_online_cpus)); // Diagnostic read
+ if (ndetected) {
+ rcu_dump_cpu_stacks(gp_seq);
+
+ /* Complain about tasks blocking the grace period. */
+ rcu_for_each_leaf_node(rnp)
+ rcu_print_detail_task_stall_rnp(rnp);
+ } else {
+ if (rcu_seq_current(&rcu_state.gp_seq) != gp_seq) {
+ pr_err("INFO: Stall ended before state dump start\n");
+ } else {
+ j = jiffies;
+ gpa = data_race(READ_ONCE(rcu_state.gp_activity));
+ pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n",
+ rcu_state.name, j - gpa, j, gpa,
+ data_race(READ_ONCE(jiffies_till_next_fqs)),
+ data_race(READ_ONCE(rcu_get_root()->qsmask)));
+ }
+ }
+ /* Rewrite if needed in case of slow consoles. */
+ if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall)))
+ WRITE_ONCE(rcu_state.jiffies_stall,
+ jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
+
+ rcu_check_gp_kthread_expired_fqs_timer();
+ rcu_check_gp_kthread_starvation();
+
+ nbcon_cpu_emergency_exit();
+
+ panic_on_rcu_stall();
+
+ rcu_force_quiescent_state(); /* Kick them all. */
+}
+
+static void print_cpu_stall(unsigned long gp_seq, unsigned long gps)
+{
+ int cpu;
+ unsigned long flags;
+ struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
+ struct rcu_node *rnp = rcu_get_root();
+ long totqlen = 0;
+
+ lockdep_assert_irqs_disabled();
+
+ /* Kick and suppress, if so configured. */
+ rcu_stall_kick_kthreads();
+ if (rcu_stall_is_suppressed())
+ return;
+
+ nbcon_cpu_emergency_enter();
+
+ /*
+ * OK, time to rat on ourselves...
+ * See Documentation/RCU/stallwarn.rst for info on how to debug
+ * RCU CPU stall warnings.
+ */
+ trace_rcu_stall_warning(rcu_state.name, TPS("SelfDetected"));
+ pr_err("INFO: %s self-detected stall on CPU\n", rcu_state.name);
+ raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags);
+ print_cpu_stall_info(smp_processor_id());
+ raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags);
+ for_each_possible_cpu(cpu)
+ totqlen += rcu_get_n_cbs_cpu(cpu);
+ pr_err("\t(t=%lu jiffies g=%ld q=%lu ncpus=%d)\n",
+ jiffies - gps,
+ (long)rcu_seq_current(&rcu_state.gp_seq), totqlen,
+ data_race(rcu_state.n_online_cpus)); // Diagnostic read
+
+ rcu_check_gp_kthread_expired_fqs_timer();
+ rcu_check_gp_kthread_starvation();
+
+ rcu_dump_cpu_stacks(gp_seq);
+
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ /* Rewrite if needed in case of slow consoles. */
+ if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall)))
+ WRITE_ONCE(rcu_state.jiffies_stall,
+ jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+
+ nbcon_cpu_emergency_exit();
+
+ panic_on_rcu_stall();
+
+ /*
+ * Attempt to revive the RCU machinery by forcing a context switch.
+ *
+ * A context switch would normally allow the RCU state machine to make
+ * progress and it could be we're stuck in kernel space without context
+ * switches for an entirely unreasonable amount of time.
+ */
+ set_need_resched_current();
+}
+
+static bool csd_lock_suppress_rcu_stall;
+module_param(csd_lock_suppress_rcu_stall, bool, 0644);
+
+static void check_cpu_stall(struct rcu_data *rdp)
+{
+ bool self_detected;
+ unsigned long gs1;
+ unsigned long gs2;
+ unsigned long gps;
+ unsigned long j;
+ unsigned long jn;
+ unsigned long js;
+ struct rcu_node *rnp;
+
+ lockdep_assert_irqs_disabled();
+ if ((rcu_stall_is_suppressed() && !READ_ONCE(rcu_kick_kthreads)) ||
+ !rcu_gp_in_progress())
+ return;
+ rcu_stall_kick_kthreads();
+
+ /*
+ * Check if it was requested (via rcu_cpu_stall_reset()) that the FQS
+ * loop has to set jiffies to ensure a non-stale jiffies value. This
+ * is required to have good jiffies value after coming out of long
+ * breaks of jiffies updates. Not doing so can cause false positives.
+ */
+ if (READ_ONCE(rcu_state.nr_fqs_jiffies_stall) > 0)
+ return;
+
+ j = jiffies;
+
+ /*
+ * Lots of memory barriers to reject false positives.
+ *
+ * The idea is to pick up rcu_state.gp_seq, then
+ * rcu_state.jiffies_stall, then rcu_state.gp_start, and finally
+ * another copy of rcu_state.gp_seq. These values are updated in
+ * the opposite order with memory barriers (or equivalent) during
+ * grace-period initialization and cleanup. Now, a false positive
+ * can occur if we get an new value of rcu_state.gp_start and a old
+ * value of rcu_state.jiffies_stall. But given the memory barriers,
+ * the only way that this can happen is if one grace period ends
+ * and another starts between these two fetches. This is detected
+ * by comparing the second fetch of rcu_state.gp_seq with the
+ * previous fetch from rcu_state.gp_seq.
+ *
+ * Given this check, comparisons of jiffies, rcu_state.jiffies_stall,
+ * and rcu_state.gp_start suffice to forestall false positives.
+ */
+ gs1 = READ_ONCE(rcu_state.gp_seq);
+ smp_rmb(); /* Pick up ->gp_seq first... */
+ js = READ_ONCE(rcu_state.jiffies_stall);
+ smp_rmb(); /* ...then ->jiffies_stall before the rest... */
+ gps = READ_ONCE(rcu_state.gp_start);
+ smp_rmb(); /* ...and finally ->gp_start before ->gp_seq again. */
+ gs2 = READ_ONCE(rcu_state.gp_seq);
+ if (gs1 != gs2 ||
+ ULONG_CMP_LT(j, js) ||
+ ULONG_CMP_GE(gps, js) ||
+ !rcu_seq_state(gs2))
+ return; /* No stall or GP completed since entering function. */
+ rnp = rdp->mynode;
+ jn = jiffies + ULONG_MAX / 2;
+ self_detected = READ_ONCE(rnp->qsmask) & rdp->grpmask;
+ if (rcu_gp_in_progress() &&
+ (self_detected || ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) &&
+ cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) {
+ /*
+ * If a virtual machine is stopped by the host it can look to
+ * the watchdog like an RCU stall. Check to see if the host
+ * stopped the vm.
+ */
+ if (kvm_check_and_clear_guest_paused())
+ return;
+
+#ifdef CONFIG_SYSFS
+ ++rcu_stall_count;
+#endif
+
+ rcu_stall_notifier_call_chain(RCU_STALL_NOTIFY_NORM, (void *)j - gps);
+ if (READ_ONCE(csd_lock_suppress_rcu_stall) && csd_lock_is_stuck()) {
+ pr_err("INFO: %s detected stall, but suppressed full report due to a stuck CSD-lock.\n", rcu_state.name);
+ } else if (self_detected) {
+ /* We haven't checked in, so go dump stack. */
+ print_cpu_stall(gs2, gps);
+ } else {
+ /* They had a few time units to dump stack, so complain. */
+ print_other_cpu_stall(gs2, gps);
+ }
+
+ if (READ_ONCE(rcu_cpu_stall_ftrace_dump))
+ rcu_ftrace_dump(DUMP_ALL);
+
+ if (READ_ONCE(rcu_state.jiffies_stall) == jn) {
+ jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
+ WRITE_ONCE(rcu_state.jiffies_stall, jn);
+ }
+ }
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// RCU forward-progress mechanisms, including for callback invocation.
+
+
+/*
+ * Check to see if a failure to end RCU priority inversion was due to
+ * a CPU not passing through a quiescent state. When this happens, there
+ * is nothing that RCU priority boosting can do to help, so we shouldn't
+ * count this as an RCU priority boosting failure. A return of true says
+ * RCU priority boosting is to blame, and false says otherwise. If false
+ * is returned, the first of the CPUs to blame is stored through cpup.
+ * If there was no CPU blocking the current grace period, but also nothing
+ * in need of being boosted, *cpup is set to -1. This can happen in case
+ * of vCPU preemption while the last CPU is reporting its quiscent state,
+ * for example.
+ *
+ * If cpup is NULL, then a lockless quick check is carried out, suitable
+ * for high-rate usage. On the other hand, if cpup is non-NULL, each
+ * rcu_node structure's ->lock is acquired, ruling out high-rate usage.
+ */
+bool rcu_check_boost_fail(unsigned long gp_state, int *cpup)
+{
+ bool atb = false;
+ int cpu;
+ unsigned long flags;
+ struct rcu_node *rnp;
+
+ rcu_for_each_leaf_node(rnp) {
+ if (!cpup) {
+ if (data_race(READ_ONCE(rnp->qsmask))) {
+ return false;
+ } else {
+ if (READ_ONCE(rnp->gp_tasks))
+ atb = true;
+ continue;
+ }
+ }
+ *cpup = -1;
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ if (rnp->gp_tasks)
+ atb = true;
+ if (!rnp->qsmask) {
+ // No CPUs without quiescent states for this rnp.
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ continue;
+ }
+ // Find the first holdout CPU.
+ for_each_leaf_node_possible_cpu(rnp, cpu) {
+ if (rnp->qsmask & (1UL << (cpu - rnp->grplo))) {
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ *cpup = cpu;
+ return false;
+ }
+ }
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ }
+ // Can't blame CPUs, so must blame RCU priority boosting.
+ return atb;
+}
+EXPORT_SYMBOL_GPL(rcu_check_boost_fail);
+
+/*
+ * Show the state of the grace-period kthreads.
+ */
+void show_rcu_gp_kthreads(void)
+{
+ unsigned long cbs = 0;
+ int cpu;
+ unsigned long j;
+ unsigned long ja;
+ unsigned long jr;
+ unsigned long js;
+ unsigned long jw;
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+ struct task_struct *t = READ_ONCE(rcu_state.gp_kthread);
+
+ j = jiffies;
+ ja = j - data_race(READ_ONCE(rcu_state.gp_activity));
+ jr = j - data_race(READ_ONCE(rcu_state.gp_req_activity));
+ js = j - data_race(READ_ONCE(rcu_state.gp_start));
+ jw = j - data_race(READ_ONCE(rcu_state.gp_wake_time));
+ pr_info("%s: wait state: %s(%d) ->state: %#x ->rt_priority %u delta ->gp_start %lu ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_max %lu ->gp_flags %#x\n",
+ rcu_state.name, gp_state_getname(rcu_state.gp_state),
+ data_race(READ_ONCE(rcu_state.gp_state)),
+ t ? data_race(READ_ONCE(t->__state)) : 0x1ffff, t ? t->rt_priority : 0xffU,
+ js, ja, jr, jw, (long)data_race(READ_ONCE(rcu_state.gp_wake_seq)),
+ (long)data_race(READ_ONCE(rcu_state.gp_seq)),
+ (long)data_race(READ_ONCE(rcu_get_root()->gp_seq_needed)),
+ data_race(READ_ONCE(rcu_state.gp_max)),
+ data_race(READ_ONCE(rcu_state.gp_flags)));
+ rcu_for_each_node_breadth_first(rnp) {
+ if (ULONG_CMP_GE(READ_ONCE(rcu_state.gp_seq), READ_ONCE(rnp->gp_seq_needed)) &&
+ !data_race(READ_ONCE(rnp->qsmask)) && !data_race(READ_ONCE(rnp->boost_tasks)) &&
+ !data_race(READ_ONCE(rnp->exp_tasks)) && !data_race(READ_ONCE(rnp->gp_tasks)))
+ continue;
+ pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld ->qsmask %#lx %c%c%c%c ->n_boosts %ld\n",
+ rnp->grplo, rnp->grphi,
+ (long)data_race(READ_ONCE(rnp->gp_seq)),
+ (long)data_race(READ_ONCE(rnp->gp_seq_needed)),
+ data_race(READ_ONCE(rnp->qsmask)),
+ ".b"[!!data_race(READ_ONCE(rnp->boost_kthread_task))],
+ ".B"[!!data_race(READ_ONCE(rnp->boost_tasks))],
+ ".E"[!!data_race(READ_ONCE(rnp->exp_tasks))],
+ ".G"[!!data_race(READ_ONCE(rnp->gp_tasks))],
+ data_race(READ_ONCE(rnp->n_boosts)));
+ if (!rcu_is_leaf_node(rnp))
+ continue;
+ for_each_leaf_node_possible_cpu(rnp, cpu) {
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+ if (READ_ONCE(rdp->gpwrap) ||
+ ULONG_CMP_GE(READ_ONCE(rcu_state.gp_seq),
+ READ_ONCE(rdp->gp_seq_needed)))
+ continue;
+ pr_info("\tcpu %d ->gp_seq_needed %ld\n",
+ cpu, (long)data_race(READ_ONCE(rdp->gp_seq_needed)));
+ }
+ }
+ for_each_possible_cpu(cpu) {
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+ cbs += data_race(READ_ONCE(rdp->n_cbs_invoked));
+ show_rcu_nocb_state(rdp);
+ }
+ pr_info("RCU callbacks invoked since boot: %lu\n", cbs);
+ show_rcu_tasks_gp_kthreads();
+}
+EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads);
+
+/*
+ * This function checks for grace-period requests that fail to motivate
+ * RCU to come out of its idle mode.
+ */
+static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp,
+ const unsigned long gpssdelay)
+{
+ unsigned long flags;
+ unsigned long j;
+ struct rcu_node *rnp_root = rcu_get_root();
+ static atomic_t warned = ATOMIC_INIT(0);
+
+ if (!IS_ENABLED(CONFIG_PROVE_RCU) || rcu_gp_in_progress() ||
+ ULONG_CMP_GE(READ_ONCE(rnp_root->gp_seq),
+ READ_ONCE(rnp_root->gp_seq_needed)) ||
+ !smp_load_acquire(&rcu_state.gp_kthread)) // Get stable kthread.
+ return;
+ j = jiffies; /* Expensive access, and in common case don't get here. */
+ if (time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) ||
+ time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) ||
+ atomic_read(&warned))
+ return;
+
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ j = jiffies;
+ if (rcu_gp_in_progress() ||
+ ULONG_CMP_GE(READ_ONCE(rnp_root->gp_seq),
+ READ_ONCE(rnp_root->gp_seq_needed)) ||
+ time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) ||
+ time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) ||
+ atomic_read(&warned)) {
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ return;
+ }
+ /* Hold onto the leaf lock to make others see warned==1. */
+
+ if (rnp_root != rnp)
+ raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */
+ j = jiffies;
+ if (rcu_gp_in_progress() ||
+ ULONG_CMP_GE(READ_ONCE(rnp_root->gp_seq),
+ READ_ONCE(rnp_root->gp_seq_needed)) ||
+ time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) ||
+ time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) ||
+ atomic_xchg(&warned, 1)) {
+ if (rnp_root != rnp)
+ /* irqs remain disabled. */
+ raw_spin_unlock_rcu_node(rnp_root);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ return;
+ }
+ WARN_ON(1);
+ if (rnp_root != rnp)
+ raw_spin_unlock_rcu_node(rnp_root);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ show_rcu_gp_kthreads();
+}
+
+/*
+ * Do a forward-progress check for rcutorture. This is normally invoked
+ * due to an OOM event. The argument "j" gives the time period during
+ * which rcutorture would like progress to have been made.
+ */
+void rcu_fwd_progress_check(unsigned long j)
+{
+ unsigned long cbs;
+ int cpu;
+ unsigned long max_cbs = 0;
+ int max_cpu = -1;
+ struct rcu_data *rdp;
+
+ if (rcu_gp_in_progress()) {
+ pr_info("%s: GP age %lu jiffies\n",
+ __func__, jiffies - data_race(READ_ONCE(rcu_state.gp_start)));
+ show_rcu_gp_kthreads();
+ } else {
+ pr_info("%s: Last GP end %lu jiffies ago\n",
+ __func__, jiffies - data_race(READ_ONCE(rcu_state.gp_end)));
+ preempt_disable();
+ rdp = this_cpu_ptr(&rcu_data);
+ rcu_check_gp_start_stall(rdp->mynode, rdp, j);
+ preempt_enable();
+ }
+ for_each_possible_cpu(cpu) {
+ cbs = rcu_get_n_cbs_cpu(cpu);
+ if (!cbs)
+ continue;
+ if (max_cpu < 0)
+ pr_info("%s: callbacks", __func__);
+ pr_cont(" %d: %lu", cpu, cbs);
+ if (cbs <= max_cbs)
+ continue;
+ max_cbs = cbs;
+ max_cpu = cpu;
+ }
+ if (max_cpu >= 0)
+ pr_cont("\n");
+}
+EXPORT_SYMBOL_GPL(rcu_fwd_progress_check);
+
+/* Commandeer a sysrq key to dump RCU's tree. */
+static bool sysrq_rcu;
+module_param(sysrq_rcu, bool, 0444);
+
+/* Dump grace-period-request information due to commandeered sysrq. */
+static void sysrq_show_rcu(u8 key)
+{
+ show_rcu_gp_kthreads();
+}
+
+static const struct sysrq_key_op sysrq_rcudump_op = {
+ .handler = sysrq_show_rcu,
+ .help_msg = "show-rcu(y)",
+ .action_msg = "Show RCU tree",
+ .enable_mask = SYSRQ_ENABLE_DUMP,
+};
+
+static int __init rcu_sysrq_init(void)
+{
+ if (sysrq_rcu)
+ return register_sysrq_key('y', &sysrq_rcudump_op);
+ return 0;
+}
+early_initcall(rcu_sysrq_init);
+
+#ifdef CONFIG_RCU_CPU_STALL_NOTIFIER
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// RCU CPU stall-warning notifiers
+
+static ATOMIC_NOTIFIER_HEAD(rcu_cpu_stall_notifier_list);
+
+/**
+ * rcu_stall_chain_notifier_register - Add an RCU CPU stall notifier
+ * @n: Entry to add.
+ *
+ * Adds an RCU CPU stall notifier to an atomic notifier chain.
+ * The @action passed to a notifier will be @RCU_STALL_NOTIFY_NORM or
+ * friends. The @data will be the duration of the stalled grace period,
+ * in jiffies, coerced to a void* pointer.
+ *
+ * Returns 0 on success, %-EEXIST on error.
+ */
+int rcu_stall_chain_notifier_register(struct notifier_block *n)
+{
+ int rcsn = rcu_cpu_stall_notifiers;
+
+ WARN(1, "Adding %pS() to RCU stall notifier list (%s).\n", n->notifier_call,
+ rcsn ? "possibly suppressing RCU CPU stall warnings" : "failed, so all is well");
+ if (rcsn)
+ return atomic_notifier_chain_register(&rcu_cpu_stall_notifier_list, n);
+ return -EEXIST;
+}
+EXPORT_SYMBOL_GPL(rcu_stall_chain_notifier_register);
+
+/**
+ * rcu_stall_chain_notifier_unregister - Remove an RCU CPU stall notifier
+ * @n: Entry to add.
+ *
+ * Removes an RCU CPU stall notifier from an atomic notifier chain.
+ *
+ * Returns zero on success, %-ENOENT on failure.
+ */
+int rcu_stall_chain_notifier_unregister(struct notifier_block *n)
+{
+ return atomic_notifier_chain_unregister(&rcu_cpu_stall_notifier_list, n);
+}
+EXPORT_SYMBOL_GPL(rcu_stall_chain_notifier_unregister);
+
+/*
+ * rcu_stall_notifier_call_chain - Call functions in an RCU CPU stall notifier chain
+ * @val: Value passed unmodified to notifier function
+ * @v: Pointer passed unmodified to notifier function
+ *
+ * Calls each function in the RCU CPU stall notifier chain in turn, which
+ * is an atomic call chain. See atomic_notifier_call_chain() for more
+ * information.
+ *
+ * This is for use within RCU, hence the omission of the extra asterisk
+ * to indicate a non-kerneldoc format header comment.
+ */
+int rcu_stall_notifier_call_chain(unsigned long val, void *v)
+{
+ return atomic_notifier_call_chain(&rcu_cpu_stall_notifier_list, val, v);
+}
+
+#endif // #ifdef CONFIG_RCU_CPU_STALL_NOTIFIER
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
deleted file mode 100644
index f92361efd0f5..000000000000
--- a/kernel/rcu/tree_trace.c
+++ /dev/null
@@ -1,505 +0,0 @@
-/*
- * Read-Copy Update tracing for classic implementation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
- * Copyright IBM Corporation, 2008
- *
- * Papers: http://www.rdrop.com/users/paulmck/RCU
- *
- * For detailed explanation of Read-Copy Update mechanism see -
- * Documentation/RCU
- *
- */
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/spinlock.h>
-#include <linux/smp.h>
-#include <linux/rcupdate.h>
-#include <linux/interrupt.h>
-#include <linux/sched.h>
-#include <linux/atomic.h>
-#include <linux/bitops.h>
-#include <linux/module.h>
-#include <linux/completion.h>
-#include <linux/moduleparam.h>
-#include <linux/percpu.h>
-#include <linux/notifier.h>
-#include <linux/cpu.h>
-#include <linux/mutex.h>
-#include <linux/debugfs.h>
-#include <linux/seq_file.h>
-
-#define RCU_TREE_NONCORE
-#include "tree.h"
-
-DECLARE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr);
-
-static int r_open(struct inode *inode, struct file *file,
- const struct seq_operations *op)
-{
- int ret = seq_open(file, op);
- if (!ret) {
- struct seq_file *m = (struct seq_file *)file->private_data;
- m->private = inode->i_private;
- }
- return ret;
-}
-
-static void *r_start(struct seq_file *m, loff_t *pos)
-{
- struct rcu_state *rsp = (struct rcu_state *)m->private;
- *pos = cpumask_next(*pos - 1, cpu_possible_mask);
- if ((*pos) < nr_cpu_ids)
- return per_cpu_ptr(rsp->rda, *pos);
- return NULL;
-}
-
-static void *r_next(struct seq_file *m, void *v, loff_t *pos)
-{
- (*pos)++;
- return r_start(m, pos);
-}
-
-static void r_stop(struct seq_file *m, void *v)
-{
-}
-
-static int show_rcubarrier(struct seq_file *m, void *v)
-{
- struct rcu_state *rsp = (struct rcu_state *)m->private;
- seq_printf(m, "bcc: %d nbd: %lu\n",
- atomic_read(&rsp->barrier_cpu_count),
- rsp->n_barrier_done);
- return 0;
-}
-
-static int rcubarrier_open(struct inode *inode, struct file *file)
-{
- return single_open(file, show_rcubarrier, inode->i_private);
-}
-
-static const struct file_operations rcubarrier_fops = {
- .owner = THIS_MODULE,
- .open = rcubarrier_open,
- .read = seq_read,
- .llseek = no_llseek,
- .release = single_release,
-};
-
-#ifdef CONFIG_RCU_BOOST
-
-static char convert_kthread_status(unsigned int kthread_status)
-{
- if (kthread_status > RCU_KTHREAD_MAX)
- return '?';
- return "SRWOY"[kthread_status];
-}
-
-#endif /* #ifdef CONFIG_RCU_BOOST */
-
-static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
-{
- long ql, qll;
-
- if (!rdp->beenonline)
- return;
- seq_printf(m, "%3d%cc=%ld g=%ld pq=%d/%d qp=%d",
- rdp->cpu,
- cpu_is_offline(rdp->cpu) ? '!' : ' ',
- ulong2long(rdp->completed), ulong2long(rdp->gpnum),
- rdp->passed_quiesce,
- rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu),
- rdp->qs_pending);
- seq_printf(m, " dt=%d/%llx/%d df=%lu",
- atomic_read(&rdp->dynticks->dynticks),
- rdp->dynticks->dynticks_nesting,
- rdp->dynticks->dynticks_nmi_nesting,
- rdp->dynticks_fqs);
- seq_printf(m, " of=%lu", rdp->offline_fqs);
- rcu_nocb_q_lengths(rdp, &ql, &qll);
- qll += rdp->qlen_lazy;
- ql += rdp->qlen;
- seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c",
- qll, ql,
- ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
- rdp->nxttail[RCU_NEXT_TAIL]],
- ".R"[rdp->nxttail[RCU_WAIT_TAIL] !=
- rdp->nxttail[RCU_NEXT_READY_TAIL]],
- ".W"[rdp->nxttail[RCU_DONE_TAIL] !=
- rdp->nxttail[RCU_WAIT_TAIL]],
- ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]);
-#ifdef CONFIG_RCU_BOOST
- seq_printf(m, " kt=%d/%c ktl=%x",
- per_cpu(rcu_cpu_has_work, rdp->cpu),
- convert_kthread_status(per_cpu(rcu_cpu_kthread_status,
- rdp->cpu)),
- per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff);
-#endif /* #ifdef CONFIG_RCU_BOOST */
- seq_printf(m, " b=%ld", rdp->blimit);
- seq_printf(m, " ci=%lu nci=%lu co=%lu ca=%lu\n",
- rdp->n_cbs_invoked, rdp->n_nocbs_invoked,
- rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
-}
-
-static int show_rcudata(struct seq_file *m, void *v)
-{
- print_one_rcu_data(m, (struct rcu_data *)v);
- return 0;
-}
-
-static const struct seq_operations rcudate_op = {
- .start = r_start,
- .next = r_next,
- .stop = r_stop,
- .show = show_rcudata,
-};
-
-static int rcudata_open(struct inode *inode, struct file *file)
-{
- return r_open(inode, file, &rcudate_op);
-}
-
-static const struct file_operations rcudata_fops = {
- .owner = THIS_MODULE,
- .open = rcudata_open,
- .read = seq_read,
- .llseek = no_llseek,
- .release = seq_release,
-};
-
-static int show_rcuexp(struct seq_file *m, void *v)
-{
- struct rcu_state *rsp = (struct rcu_state *)m->private;
-
- seq_printf(m, "s=%lu d=%lu w=%lu tf=%lu wd1=%lu wd2=%lu n=%lu sc=%lu dt=%lu dl=%lu dx=%lu\n",
- atomic_long_read(&rsp->expedited_start),
- atomic_long_read(&rsp->expedited_done),
- atomic_long_read(&rsp->expedited_wrap),
- atomic_long_read(&rsp->expedited_tryfail),
- atomic_long_read(&rsp->expedited_workdone1),
- atomic_long_read(&rsp->expedited_workdone2),
- atomic_long_read(&rsp->expedited_normal),
- atomic_long_read(&rsp->expedited_stoppedcpus),
- atomic_long_read(&rsp->expedited_done_tries),
- atomic_long_read(&rsp->expedited_done_lost),
- atomic_long_read(&rsp->expedited_done_exit));
- return 0;
-}
-
-static int rcuexp_open(struct inode *inode, struct file *file)
-{
- return single_open(file, show_rcuexp, inode->i_private);
-}
-
-static const struct file_operations rcuexp_fops = {
- .owner = THIS_MODULE,
- .open = rcuexp_open,
- .read = seq_read,
- .llseek = no_llseek,
- .release = single_release,
-};
-
-#ifdef CONFIG_RCU_BOOST
-
-static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp)
-{
- seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu ",
- rnp->grplo, rnp->grphi,
- "T."[list_empty(&rnp->blkd_tasks)],
- "N."[!rnp->gp_tasks],
- "E."[!rnp->exp_tasks],
- "B."[!rnp->boost_tasks],
- convert_kthread_status(rnp->boost_kthread_status),
- rnp->n_tasks_boosted, rnp->n_exp_boosts,
- rnp->n_normal_boosts);
- seq_printf(m, "j=%04x bt=%04x\n",
- (int)(jiffies & 0xffff),
- (int)(rnp->boost_time & 0xffff));
- seq_printf(m, " balk: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n",
- rnp->n_balk_blkd_tasks,
- rnp->n_balk_exp_gp_tasks,
- rnp->n_balk_boost_tasks,
- rnp->n_balk_notblocked,
- rnp->n_balk_notyet,
- rnp->n_balk_nos);
-}
-
-static int show_rcu_node_boost(struct seq_file *m, void *unused)
-{
- struct rcu_node *rnp;
-
- rcu_for_each_leaf_node(&rcu_preempt_state, rnp)
- print_one_rcu_node_boost(m, rnp);
- return 0;
-}
-
-static int rcu_node_boost_open(struct inode *inode, struct file *file)
-{
- return single_open(file, show_rcu_node_boost, NULL);
-}
-
-static const struct file_operations rcu_node_boost_fops = {
- .owner = THIS_MODULE,
- .open = rcu_node_boost_open,
- .read = seq_read,
- .llseek = no_llseek,
- .release = single_release,
-};
-
-#endif /* #ifdef CONFIG_RCU_BOOST */
-
-static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
-{
- unsigned long gpnum;
- int level = 0;
- struct rcu_node *rnp;
-
- gpnum = rsp->gpnum;
- seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x ",
- ulong2long(rsp->completed), ulong2long(gpnum),
- rsp->fqs_state,
- (long)(rsp->jiffies_force_qs - jiffies),
- (int)(jiffies & 0xffff));
- seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
- rsp->n_force_qs, rsp->n_force_qs_ngp,
- rsp->n_force_qs - rsp->n_force_qs_ngp,
- ACCESS_ONCE(rsp->n_force_qs_lh), rsp->qlen_lazy, rsp->qlen);
- for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) {
- if (rnp->level != level) {
- seq_puts(m, "\n");
- level = rnp->level;
- }
- seq_printf(m, "%lx/%lx->%lx %c%c>%c %d:%d ^%d ",
- rnp->qsmask, rnp->qsmaskinit, rnp->qsmaskinitnext,
- ".G"[rnp->gp_tasks != NULL],
- ".E"[rnp->exp_tasks != NULL],
- ".T"[!list_empty(&rnp->blkd_tasks)],
- rnp->grplo, rnp->grphi, rnp->grpnum);
- }
- seq_puts(m, "\n");
-}
-
-static int show_rcuhier(struct seq_file *m, void *v)
-{
- struct rcu_state *rsp = (struct rcu_state *)m->private;
- print_one_rcu_state(m, rsp);
- return 0;
-}
-
-static int rcuhier_open(struct inode *inode, struct file *file)
-{
- return single_open(file, show_rcuhier, inode->i_private);
-}
-
-static const struct file_operations rcuhier_fops = {
- .owner = THIS_MODULE,
- .open = rcuhier_open,
- .read = seq_read,
- .llseek = no_llseek,
- .release = single_release,
-};
-
-static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
-{
- unsigned long flags;
- unsigned long completed;
- unsigned long gpnum;
- unsigned long gpage;
- unsigned long gpmax;
- struct rcu_node *rnp = &rsp->node[0];
-
- raw_spin_lock_irqsave(&rnp->lock, flags);
- completed = ACCESS_ONCE(rsp->completed);
- gpnum = ACCESS_ONCE(rsp->gpnum);
- if (completed == gpnum)
- gpage = 0;
- else
- gpage = jiffies - rsp->gp_start;
- gpmax = rsp->gp_max;
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- seq_printf(m, "completed=%ld gpnum=%ld age=%ld max=%ld\n",
- ulong2long(completed), ulong2long(gpnum), gpage, gpmax);
-}
-
-static int show_rcugp(struct seq_file *m, void *v)
-{
- struct rcu_state *rsp = (struct rcu_state *)m->private;
- show_one_rcugp(m, rsp);
- return 0;
-}
-
-static int rcugp_open(struct inode *inode, struct file *file)
-{
- return single_open(file, show_rcugp, inode->i_private);
-}
-
-static const struct file_operations rcugp_fops = {
- .owner = THIS_MODULE,
- .open = rcugp_open,
- .read = seq_read,
- .llseek = no_llseek,
- .release = single_release,
-};
-
-static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
-{
- if (!rdp->beenonline)
- return;
- seq_printf(m, "%3d%cnp=%ld ",
- rdp->cpu,
- cpu_is_offline(rdp->cpu) ? '!' : ' ',
- rdp->n_rcu_pending);
- seq_printf(m, "qsp=%ld rpq=%ld cbr=%ld cng=%ld ",
- rdp->n_rp_qs_pending,
- rdp->n_rp_report_qs,
- rdp->n_rp_cb_ready,
- rdp->n_rp_cpu_needs_gp);
- seq_printf(m, "gpc=%ld gps=%ld nn=%ld ndw%ld\n",
- rdp->n_rp_gp_completed,
- rdp->n_rp_gp_started,
- rdp->n_rp_nocb_defer_wakeup,
- rdp->n_rp_need_nothing);
-}
-
-static int show_rcu_pending(struct seq_file *m, void *v)
-{
- print_one_rcu_pending(m, (struct rcu_data *)v);
- return 0;
-}
-
-static const struct seq_operations rcu_pending_op = {
- .start = r_start,
- .next = r_next,
- .stop = r_stop,
- .show = show_rcu_pending,
-};
-
-static int rcu_pending_open(struct inode *inode, struct file *file)
-{
- return r_open(inode, file, &rcu_pending_op);
-}
-
-static const struct file_operations rcu_pending_fops = {
- .owner = THIS_MODULE,
- .open = rcu_pending_open,
- .read = seq_read,
- .llseek = no_llseek,
- .release = seq_release,
-};
-
-static int show_rcutorture(struct seq_file *m, void *unused)
-{
- seq_printf(m, "rcutorture test sequence: %lu %s\n",
- rcutorture_testseq >> 1,
- (rcutorture_testseq & 0x1) ? "(test in progress)" : "");
- seq_printf(m, "rcutorture update version number: %lu\n",
- rcutorture_vernum);
- return 0;
-}
-
-static int rcutorture_open(struct inode *inode, struct file *file)
-{
- return single_open(file, show_rcutorture, NULL);
-}
-
-static const struct file_operations rcutorture_fops = {
- .owner = THIS_MODULE,
- .open = rcutorture_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
-static struct dentry *rcudir;
-
-static int __init rcutree_trace_init(void)
-{
- struct rcu_state *rsp;
- struct dentry *retval;
- struct dentry *rspdir;
-
- rcudir = debugfs_create_dir("rcu", NULL);
- if (!rcudir)
- goto free_out;
-
- for_each_rcu_flavor(rsp) {
- rspdir = debugfs_create_dir(rsp->name, rcudir);
- if (!rspdir)
- goto free_out;
-
- retval = debugfs_create_file("rcudata", 0444,
- rspdir, rsp, &rcudata_fops);
- if (!retval)
- goto free_out;
-
- retval = debugfs_create_file("rcuexp", 0444,
- rspdir, rsp, &rcuexp_fops);
- if (!retval)
- goto free_out;
-
- retval = debugfs_create_file("rcu_pending", 0444,
- rspdir, rsp, &rcu_pending_fops);
- if (!retval)
- goto free_out;
-
- retval = debugfs_create_file("rcubarrier", 0444,
- rspdir, rsp, &rcubarrier_fops);
- if (!retval)
- goto free_out;
-
-#ifdef CONFIG_RCU_BOOST
- if (rsp == &rcu_preempt_state) {
- retval = debugfs_create_file("rcuboost", 0444,
- rspdir, NULL, &rcu_node_boost_fops);
- if (!retval)
- goto free_out;
- }
-#endif
-
- retval = debugfs_create_file("rcugp", 0444,
- rspdir, rsp, &rcugp_fops);
- if (!retval)
- goto free_out;
-
- retval = debugfs_create_file("rcuhier", 0444,
- rspdir, rsp, &rcuhier_fops);
- if (!retval)
- goto free_out;
- }
-
- retval = debugfs_create_file("rcutorture", 0444, rcudir,
- NULL, &rcutorture_fops);
- if (!retval)
- goto free_out;
- return 0;
-free_out:
- debugfs_remove_recursive(rcudir);
- return 1;
-}
-
-static void __exit rcutree_trace_cleanup(void)
-{
- debugfs_remove_recursive(rcudir);
-}
-
-
-module_init(rcutree_trace_init);
-module_exit(rcutree_trace_cleanup);
-
-MODULE_AUTHOR("Paul E. McKenney");
-MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation");
-MODULE_LICENSE("GPL");
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 1f133350da01..dfeba9b35395 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -1,26 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* Read-Copy Update mechanism for mutual exclusion
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
* Copyright IBM Corporation, 2001
*
* Authors: Dipankar Sarma <dipankar@in.ibm.com>
* Manfred Spraul <manfred@colorfullife.com>
*
- * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
+ * Based on the original work by Paul McKenney <paulmck@linux.ibm.com>
* and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
* Papers:
* http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
@@ -36,7 +23,9 @@
#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/debug.h>
+#include <linux/torture.h>
#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/percpu.h>
@@ -46,33 +35,161 @@
#include <linux/export.h>
#include <linux/hardirq.h>
#include <linux/delay.h>
-#include <linux/module.h>
+#include <linux/moduleparam.h>
#include <linux/kthread.h>
#include <linux/tick.h>
+#include <linux/rcupdate_wait.h>
+#include <linux/sched/isolation.h>
+#include <linux/kprobes.h>
+#include <linux/slab.h>
+#include <linux/irq_work.h>
+#include <linux/rcupdate_trace.h>
#define CREATE_TRACE_POINTS
#include "rcu.h"
-MODULE_ALIAS("rcupdate");
#ifdef MODULE_PARAM_PREFIX
#undef MODULE_PARAM_PREFIX
#endif
#define MODULE_PARAM_PREFIX "rcupdate."
-module_param(rcu_expedited, int, 0);
+#ifndef CONFIG_TINY_RCU
+module_param(rcu_expedited, int, 0444);
+module_param(rcu_normal, int, 0444);
+static int rcu_normal_after_boot = IS_ENABLED(CONFIG_PREEMPT_RT);
+#if !defined(CONFIG_PREEMPT_RT) || defined(CONFIG_NO_HZ_FULL)
+module_param(rcu_normal_after_boot, int, 0444);
+#endif
+#endif /* #ifndef CONFIG_TINY_RCU */
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+/**
+ * rcu_read_lock_held_common() - might we be in RCU-sched read-side critical section?
+ * @ret: Best guess answer if lockdep cannot be relied on
+ *
+ * Returns true if lockdep must be ignored, in which case ``*ret`` contains
+ * the best guess described below. Otherwise returns false, in which
+ * case ``*ret`` tells the caller nothing and the caller should instead
+ * consult lockdep.
+ *
+ * If CONFIG_DEBUG_LOCK_ALLOC is selected, set ``*ret`` to nonzero iff in an
+ * RCU-sched read-side critical section. In absence of
+ * CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side
+ * critical section unless it can prove otherwise. Note that disabling
+ * of preemption (including disabling irqs) counts as an RCU-sched
+ * read-side critical section. This is useful for debug checks in functions
+ * that required that they be called within an RCU-sched read-side
+ * critical section.
+ *
+ * Check debug_lockdep_rcu_enabled() to prevent false positives during boot
+ * and while lockdep is disabled.
+ *
+ * Note that if the CPU is in the idle loop from an RCU point of view (ie:
+ * that we are in the section between ct_idle_enter() and ct_idle_exit())
+ * then rcu_read_lock_held() sets ``*ret`` to false even if the CPU did an
+ * rcu_read_lock(). The reason for this is that RCU ignores CPUs that are
+ * in such a section, considering these as in extended quiescent state,
+ * so such a CPU is effectively never in an RCU read-side critical section
+ * regardless of what RCU primitives it invokes. This state of affairs is
+ * required --- we need to keep an RCU-free window in idle where the CPU may
+ * possibly enter into low power mode. This way we can notice an extended
+ * quiescent state to other CPUs that started a grace period. Otherwise
+ * we would delay any grace period as long as we run in the idle task.
+ *
+ * Similarly, we avoid claiming an RCU read lock held if the current
+ * CPU is offline.
+ */
+static bool rcu_read_lock_held_common(bool *ret)
+{
+ if (!debug_lockdep_rcu_enabled()) {
+ *ret = true;
+ return true;
+ }
+ if (!rcu_is_watching()) {
+ *ret = false;
+ return true;
+ }
+ if (!rcu_lockdep_current_cpu_online()) {
+ *ret = false;
+ return true;
+ }
+ return false;
+}
+
+int notrace rcu_read_lock_sched_held(void)
+{
+ bool ret;
+
+ if (rcu_read_lock_held_common(&ret))
+ return ret;
+ return lock_is_held(&rcu_sched_lock_map) || !preemptible();
+}
+EXPORT_SYMBOL(rcu_read_lock_sched_held);
+#endif
#ifndef CONFIG_TINY_RCU
-static atomic_t rcu_expedited_nesting =
- ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 1 : 0);
+/*
+ * Should expedited grace-period primitives always fall back to their
+ * non-expedited counterparts? Intended for use within RCU. Note
+ * that if the user specifies both rcu_expedited and rcu_normal, then
+ * rcu_normal wins. (Except during the time period during boot from
+ * when the first task is spawned until the rcu_set_runtime_mode()
+ * core_initcall() is invoked, at which point everything is expedited.)
+ */
+bool rcu_gp_is_normal(void)
+{
+ return READ_ONCE(rcu_normal) &&
+ rcu_scheduler_active != RCU_SCHEDULER_INIT;
+}
+EXPORT_SYMBOL_GPL(rcu_gp_is_normal);
+
+static atomic_t rcu_async_hurry_nesting = ATOMIC_INIT(1);
+/*
+ * Should call_rcu() callbacks be processed with urgency or are
+ * they OK being executed with arbitrary delays?
+ */
+bool rcu_async_should_hurry(void)
+{
+ return !IS_ENABLED(CONFIG_RCU_LAZY) ||
+ atomic_read(&rcu_async_hurry_nesting);
+}
+EXPORT_SYMBOL_GPL(rcu_async_should_hurry);
+
+/**
+ * rcu_async_hurry - Make future async RCU callbacks not lazy.
+ *
+ * After a call to this function, future calls to call_rcu()
+ * will be processed in a timely fashion.
+ */
+void rcu_async_hurry(void)
+{
+ if (IS_ENABLED(CONFIG_RCU_LAZY))
+ atomic_inc(&rcu_async_hurry_nesting);
+}
+EXPORT_SYMBOL_GPL(rcu_async_hurry);
+
+/**
+ * rcu_async_relax - Make future async RCU callbacks lazy.
+ *
+ * After a call to this function, future calls to call_rcu()
+ * will be processed in a lazy fashion.
+ */
+void rcu_async_relax(void)
+{
+ if (IS_ENABLED(CONFIG_RCU_LAZY))
+ atomic_dec(&rcu_async_hurry_nesting);
+}
+EXPORT_SYMBOL_GPL(rcu_async_relax);
+static atomic_t rcu_expedited_nesting = ATOMIC_INIT(1);
/*
* Should normal grace-period primitives be expedited? Intended for
* use within RCU. Note that this function takes the rcu_expedited
- * sysfs/boot variable into account as well as the rcu_expedite_gp()
- * nesting. So looping on rcu_unexpedite_gp() until rcu_gp_is_expedited()
- * returns false is a -really- bad idea.
+ * sysfs/boot variable and rcu_scheduler_active into account as well
+ * as the rcu_expedite_gp() nesting. So looping on rcu_unexpedite_gp()
+ * until rcu_gp_is_expedited() returns false is a -really- bad idea.
*/
bool rcu_gp_is_expedited(void)
{
@@ -108,89 +225,99 @@ void rcu_unexpedite_gp(void)
}
EXPORT_SYMBOL_GPL(rcu_unexpedite_gp);
-#endif /* #ifndef CONFIG_TINY_RCU */
+static bool rcu_boot_ended __read_mostly;
/*
* Inform RCU of the end of the in-kernel boot sequence.
*/
void rcu_end_inkernel_boot(void)
{
- if (IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT))
- rcu_unexpedite_gp();
+ rcu_unexpedite_gp();
+ rcu_async_relax();
+ if (rcu_normal_after_boot)
+ WRITE_ONCE(rcu_normal, 1);
+ rcu_boot_ended = true;
}
-#ifdef CONFIG_PREEMPT_RCU
-
/*
- * Preemptible RCU implementation for rcu_read_lock().
- * Just increment ->rcu_read_lock_nesting, shared state will be updated
- * if we block.
+ * Let rcutorture know when it is OK to turn it up to eleven.
*/
-void __rcu_read_lock(void)
+bool rcu_inkernel_boot_has_ended(void)
{
- current->rcu_read_lock_nesting++;
- barrier(); /* critical section after entry code. */
+ return rcu_boot_ended;
}
-EXPORT_SYMBOL_GPL(__rcu_read_lock);
+EXPORT_SYMBOL_GPL(rcu_inkernel_boot_has_ended);
+
+#endif /* #ifndef CONFIG_TINY_RCU */
/*
- * Preemptible RCU implementation for rcu_read_unlock().
- * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
- * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
- * invoke rcu_read_unlock_special() to clean up after a context switch
- * in an RCU read-side critical section and other special cases.
+ * Test each non-SRCU synchronous grace-period wait API. This is
+ * useful just after a change in mode for these primitives, and
+ * during early boot.
*/
-void __rcu_read_unlock(void)
-{
- struct task_struct *t = current;
-
- if (t->rcu_read_lock_nesting != 1) {
- --t->rcu_read_lock_nesting;
- } else {
- barrier(); /* critical section before exit code. */
- t->rcu_read_lock_nesting = INT_MIN;
- barrier(); /* assign before ->rcu_read_unlock_special load */
- if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special.s)))
- rcu_read_unlock_special(t);
- barrier(); /* ->rcu_read_unlock_special load before assign */
- t->rcu_read_lock_nesting = 0;
- }
-#ifdef CONFIG_PROVE_LOCKING
- {
- int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting);
+void rcu_test_sync_prims(void)
+{
+ if (!IS_ENABLED(CONFIG_PROVE_RCU))
+ return;
+ pr_info("Running RCU synchronous self tests\n");
+ synchronize_rcu();
+ synchronize_rcu_expedited();
+}
- WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
- }
-#endif /* #ifdef CONFIG_PROVE_LOCKING */
+#if !defined(CONFIG_TINY_RCU)
+
+/*
+ * Switch to run-time mode once RCU has fully initialized.
+ */
+static int __init rcu_set_runtime_mode(void)
+{
+ rcu_test_sync_prims();
+ rcu_scheduler_active = RCU_SCHEDULER_RUNNING;
+ kfree_rcu_scheduler_running();
+ rcu_test_sync_prims();
+ return 0;
}
-EXPORT_SYMBOL_GPL(__rcu_read_unlock);
+core_initcall(rcu_set_runtime_mode);
-#endif /* #ifdef CONFIG_PREEMPT_RCU */
+#endif /* #if !defined(CONFIG_TINY_RCU) */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
static struct lock_class_key rcu_lock_key;
-struct lockdep_map rcu_lock_map =
- STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
+struct lockdep_map rcu_lock_map = {
+ .name = "rcu_read_lock",
+ .key = &rcu_lock_key,
+ .wait_type_outer = LD_WAIT_FREE,
+ .wait_type_inner = LD_WAIT_CONFIG, /* PREEMPT_RT implies PREEMPT_RCU */
+};
EXPORT_SYMBOL_GPL(rcu_lock_map);
static struct lock_class_key rcu_bh_lock_key;
-struct lockdep_map rcu_bh_lock_map =
- STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_bh", &rcu_bh_lock_key);
+struct lockdep_map rcu_bh_lock_map = {
+ .name = "rcu_read_lock_bh",
+ .key = &rcu_bh_lock_key,
+ .wait_type_outer = LD_WAIT_FREE,
+ .wait_type_inner = LD_WAIT_CONFIG, /* PREEMPT_RT makes BH preemptible. */
+};
EXPORT_SYMBOL_GPL(rcu_bh_lock_map);
static struct lock_class_key rcu_sched_lock_key;
-struct lockdep_map rcu_sched_lock_map =
- STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key);
+struct lockdep_map rcu_sched_lock_map = {
+ .name = "rcu_read_lock_sched",
+ .key = &rcu_sched_lock_key,
+ .wait_type_outer = LD_WAIT_FREE,
+ .wait_type_inner = LD_WAIT_SPIN,
+};
EXPORT_SYMBOL_GPL(rcu_sched_lock_map);
+// Tell lockdep when RCU callbacks are being invoked.
static struct lock_class_key rcu_callback_key;
struct lockdep_map rcu_callback_map =
STATIC_LOCKDEP_MAP_INIT("rcu_callback", &rcu_callback_key);
EXPORT_SYMBOL_GPL(rcu_callback_map);
-int notrace debug_lockdep_rcu_enabled(void)
+noinstr int notrace debug_lockdep_rcu_enabled(void)
{
- return rcu_scheduler_active && debug_locks &&
+ return rcu_scheduler_active != RCU_SCHEDULER_INACTIVE && READ_ONCE(debug_locks) &&
current->lockdep_recursion == 0;
}
EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
@@ -215,14 +342,12 @@ EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
* Note that rcu_read_lock() is disallowed if the CPU is either idle or
* offline from an RCU perspective, so check for those as well.
*/
-int rcu_read_lock_held(void)
+int notrace rcu_read_lock_held(void)
{
- if (!debug_lockdep_rcu_enabled())
- return 1;
- if (!rcu_is_watching())
- return 0;
- if (!rcu_lockdep_current_cpu_online())
- return 0;
+ bool ret;
+
+ if (rcu_read_lock_held_common(&ret))
+ return ret;
return lock_is_held(&rcu_lock_map);
}
EXPORT_SYMBOL_GPL(rcu_read_lock_held);
@@ -239,21 +364,33 @@ EXPORT_SYMBOL_GPL(rcu_read_lock_held);
*
* Check debug_lockdep_rcu_enabled() to prevent false positives during boot.
*
- * Note that rcu_read_lock() is disallowed if the CPU is either idle or
+ * Note that rcu_read_lock_bh() is disallowed if the CPU is either idle or
* offline from an RCU perspective, so check for those as well.
*/
-int rcu_read_lock_bh_held(void)
+int notrace rcu_read_lock_bh_held(void)
{
- if (!debug_lockdep_rcu_enabled())
- return 1;
- if (!rcu_is_watching())
- return 0;
- if (!rcu_lockdep_current_cpu_online())
- return 0;
+ bool ret;
+
+ if (rcu_read_lock_held_common(&ret))
+ return ret;
return in_softirq() || irqs_disabled();
}
EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
+int notrace rcu_read_lock_any_held(void)
+{
+ bool ret;
+
+ if (rcu_read_lock_held_common(&ret))
+ return ret;
+ if (lock_is_held(&rcu_lock_map) ||
+ lock_is_held(&rcu_bh_lock_map) ||
+ lock_is_held(&rcu_sched_lock_map))
+ return 1;
+ return !preemptible();
+}
+EXPORT_SYMBOL_GPL(rcu_read_lock_any_held);
+
#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
/**
@@ -269,55 +406,70 @@ void wakeme_after_rcu(struct rcu_head *head)
rcu = container_of(head, struct rcu_synchronize, head);
complete(&rcu->completion);
}
+EXPORT_SYMBOL_GPL(wakeme_after_rcu);
-void wait_rcu_gp(call_rcu_func_t crf)
+void __wait_rcu_gp(bool checktiny, unsigned int state, int n, call_rcu_func_t *crcu_array,
+ struct rcu_synchronize *rs_array)
{
- struct rcu_synchronize rcu;
+ int i;
+ int j;
+
+ /* Initialize and register callbacks for each crcu_array element. */
+ for (i = 0; i < n; i++) {
+ if (checktiny &&
+ (crcu_array[i] == call_rcu)) {
+ might_sleep();
+ continue;
+ }
+ for (j = 0; j < i; j++)
+ if (crcu_array[j] == crcu_array[i])
+ break;
+ if (j == i) {
+ init_rcu_head_on_stack(&rs_array[i].head);
+ init_completion(&rs_array[i].completion);
+ (crcu_array[i])(&rs_array[i].head, wakeme_after_rcu);
+ }
+ }
- init_rcu_head_on_stack(&rcu.head);
- init_completion(&rcu.completion);
- /* Will wake me after RCU finished. */
- crf(&rcu.head, wakeme_after_rcu);
- /* Wait for it. */
- wait_for_completion(&rcu.completion);
- destroy_rcu_head_on_stack(&rcu.head);
+ /* Wait for all callbacks to be invoked. */
+ for (i = 0; i < n; i++) {
+ if (checktiny &&
+ (crcu_array[i] == call_rcu))
+ continue;
+ for (j = 0; j < i; j++)
+ if (crcu_array[j] == crcu_array[i])
+ break;
+ if (j == i) {
+ wait_for_completion_state(&rs_array[i].completion, state);
+ destroy_rcu_head_on_stack(&rs_array[i].head);
+ }
+ }
+}
+EXPORT_SYMBOL_GPL(__wait_rcu_gp);
+
+void finish_rcuwait(struct rcuwait *w)
+{
+ rcu_assign_pointer(w->task, NULL);
+ __set_current_state(TASK_RUNNING);
}
-EXPORT_SYMBOL_GPL(wait_rcu_gp);
+EXPORT_SYMBOL_GPL(finish_rcuwait);
#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
void init_rcu_head(struct rcu_head *head)
{
debug_object_init(head, &rcuhead_debug_descr);
}
+EXPORT_SYMBOL_GPL(init_rcu_head);
void destroy_rcu_head(struct rcu_head *head)
{
debug_object_free(head, &rcuhead_debug_descr);
}
+EXPORT_SYMBOL_GPL(destroy_rcu_head);
-/*
- * fixup_activate is called when:
- * - an active object is activated
- * - an unknown object is activated (might be a statically initialized object)
- * Activation is performed internally by call_rcu().
- */
-static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state)
+static bool rcuhead_is_static_object(void *addr)
{
- struct rcu_head *head = addr;
-
- switch (state) {
-
- case ODEBUG_STATE_NOTAVAILABLE:
- /*
- * This is not really a fixup. We just make sure that it is
- * tracked in the object tracker.
- */
- debug_object_init(head, &rcuhead_debug_descr);
- debug_object_activate(head, &rcuhead_debug_descr);
- return 0;
- default:
- return 1;
- }
+ return true;
}
/**
@@ -353,14 +505,14 @@ void destroy_rcu_head_on_stack(struct rcu_head *head)
}
EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack);
-struct debug_obj_descr rcuhead_debug_descr = {
+const struct debug_obj_descr rcuhead_debug_descr = {
.name = "rcu_head",
- .fixup_activate = rcuhead_fixup_activate,
+ .is_static_object = rcuhead_is_static_object,
};
EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
-#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE)
+#if defined(CONFIG_TREE_RCU) || defined(CONFIG_RCU_TRACE)
void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp,
unsigned long secs,
unsigned long c_old, unsigned long c)
@@ -373,391 +525,67 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);
do { } while (0)
#endif
-#ifdef CONFIG_RCU_STALL_COMMON
+#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST) || IS_ENABLED(CONFIG_LOCK_TORTURE_TEST) || IS_MODULE(CONFIG_LOCK_TORTURE_TEST)
+/* Get rcutorture access to sched_setaffinity(). */
+long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask, bool dowarn)
+{
+ int ret;
-#ifdef CONFIG_PROVE_RCU
-#define RCU_STALL_DELAY_DELTA (5 * HZ)
-#else
-#define RCU_STALL_DELAY_DELTA 0
+ ret = sched_setaffinity(pid, in_mask);
+ WARN_ONCE(dowarn && ret, "%s: sched_setaffinity(%d) returned %d\n", __func__, pid, ret);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(torture_sched_setaffinity);
#endif
-int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
-static int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
+int rcu_cpu_stall_notifiers __read_mostly; // !0 = provide stall notifiers (rarely useful)
+EXPORT_SYMBOL_GPL(rcu_cpu_stall_notifiers);
+#ifdef CONFIG_RCU_STALL_COMMON
+int rcu_cpu_stall_ftrace_dump __read_mostly;
+module_param(rcu_cpu_stall_ftrace_dump, int, 0644);
+#ifdef CONFIG_RCU_CPU_STALL_NOTIFIER
+module_param(rcu_cpu_stall_notifiers, int, 0444);
+#endif // #ifdef CONFIG_RCU_CPU_STALL_NOTIFIER
+int rcu_cpu_stall_suppress __read_mostly; // !0 = suppress stall warnings.
+EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress);
module_param(rcu_cpu_stall_suppress, int, 0644);
+int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
module_param(rcu_cpu_stall_timeout, int, 0644);
-
-int rcu_jiffies_till_stall_check(void)
-{
- int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout);
-
- /*
- * Limit check must be consistent with the Kconfig limits
- * for CONFIG_RCU_CPU_STALL_TIMEOUT.
- */
- if (till_stall_check < 3) {
- ACCESS_ONCE(rcu_cpu_stall_timeout) = 3;
- till_stall_check = 3;
- } else if (till_stall_check > 300) {
- ACCESS_ONCE(rcu_cpu_stall_timeout) = 300;
- till_stall_check = 300;
- }
- return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
-}
-
-void rcu_sysrq_start(void)
-{
- if (!rcu_cpu_stall_suppress)
- rcu_cpu_stall_suppress = 2;
-}
-
-void rcu_sysrq_end(void)
-{
- if (rcu_cpu_stall_suppress == 2)
- rcu_cpu_stall_suppress = 0;
-}
-
-static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
-{
- rcu_cpu_stall_suppress = 1;
- return NOTIFY_DONE;
-}
-
-static struct notifier_block rcu_panic_block = {
- .notifier_call = rcu_panic,
-};
-
-static int __init check_cpu_stall_init(void)
-{
- atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
- return 0;
-}
-early_initcall(check_cpu_stall_init);
-
+int rcu_exp_cpu_stall_timeout __read_mostly = CONFIG_RCU_EXP_CPU_STALL_TIMEOUT;
+module_param(rcu_exp_cpu_stall_timeout, int, 0644);
+int rcu_cpu_stall_cputime __read_mostly = IS_ENABLED(CONFIG_RCU_CPU_STALL_CPUTIME);
+module_param(rcu_cpu_stall_cputime, int, 0644);
+bool rcu_exp_stall_task_details __read_mostly;
+module_param(rcu_exp_stall_task_details, bool, 0644);
#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
-#ifdef CONFIG_TASKS_RCU
-
-/*
- * Simple variant of RCU whose quiescent states are voluntary context switch,
- * user-space execution, and idle. As such, grace periods can take one good
- * long time. There are no read-side primitives similar to rcu_read_lock()
- * and rcu_read_unlock() because this implementation is intended to get
- * the system into a safe state for some of the manipulations involved in
- * tracing and the like. Finally, this implementation does not support
- * high call_rcu_tasks() rates from multiple CPUs. If this is required,
- * per-CPU callback lists will be needed.
- */
-
-/* Global list of callbacks and associated lock. */
-static struct rcu_head *rcu_tasks_cbs_head;
-static struct rcu_head **rcu_tasks_cbs_tail = &rcu_tasks_cbs_head;
-static DECLARE_WAIT_QUEUE_HEAD(rcu_tasks_cbs_wq);
-static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock);
-
-/* Track exiting tasks in order to allow them to be waited for. */
-DEFINE_SRCU(tasks_rcu_exit_srcu);
-
-/* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */
-static int rcu_task_stall_timeout __read_mostly = HZ * 60 * 10;
-module_param(rcu_task_stall_timeout, int, 0644);
-
-static void rcu_spawn_tasks_kthread(void);
-
-/*
- * Post an RCU-tasks callback. First call must be from process context
- * after the scheduler if fully operational.
- */
-void call_rcu_tasks(struct rcu_head *rhp, void (*func)(struct rcu_head *rhp))
-{
- unsigned long flags;
- bool needwake;
-
- rhp->next = NULL;
- rhp->func = func;
- raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags);
- needwake = !rcu_tasks_cbs_head;
- *rcu_tasks_cbs_tail = rhp;
- rcu_tasks_cbs_tail = &rhp->next;
- raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags);
- if (needwake) {
- rcu_spawn_tasks_kthread();
- wake_up(&rcu_tasks_cbs_wq);
- }
-}
-EXPORT_SYMBOL_GPL(call_rcu_tasks);
-
-/**
- * synchronize_rcu_tasks - wait until an rcu-tasks grace period has elapsed.
- *
- * Control will return to the caller some time after a full rcu-tasks
- * grace period has elapsed, in other words after all currently
- * executing rcu-tasks read-side critical sections have elapsed. These
- * read-side critical sections are delimited by calls to schedule(),
- * cond_resched_rcu_qs(), idle execution, userspace execution, calls
- * to synchronize_rcu_tasks(), and (in theory, anyway) cond_resched().
- *
- * This is a very specialized primitive, intended only for a few uses in
- * tracing and other situations requiring manipulation of function
- * preambles and profiling hooks. The synchronize_rcu_tasks() function
- * is not (yet) intended for heavy use from multiple CPUs.
- *
- * Note that this guarantee implies further memory-ordering guarantees.
- * On systems with more than one CPU, when synchronize_rcu_tasks() returns,
- * each CPU is guaranteed to have executed a full memory barrier since the
- * end of its last RCU-tasks read-side critical section whose beginning
- * preceded the call to synchronize_rcu_tasks(). In addition, each CPU
- * having an RCU-tasks read-side critical section that extends beyond
- * the return from synchronize_rcu_tasks() is guaranteed to have executed
- * a full memory barrier after the beginning of synchronize_rcu_tasks()
- * and before the beginning of that RCU-tasks read-side critical section.
- * Note that these guarantees include CPUs that are offline, idle, or
- * executing in user mode, as well as CPUs that are executing in the kernel.
- *
- * Furthermore, if CPU A invoked synchronize_rcu_tasks(), which returned
- * to its caller on CPU B, then both CPU A and CPU B are guaranteed
- * to have executed a full memory barrier during the execution of
- * synchronize_rcu_tasks() -- even if CPU A and CPU B are the same CPU
- * (but again only if the system has more than one CPU).
- */
-void synchronize_rcu_tasks(void)
-{
- /* Complain if the scheduler has not started. */
- rcu_lockdep_assert(!rcu_scheduler_active,
- "synchronize_rcu_tasks called too soon");
-
- /* Wait for the grace period. */
- wait_rcu_gp(call_rcu_tasks);
-}
-EXPORT_SYMBOL_GPL(synchronize_rcu_tasks);
+// Suppress boot-time RCU CPU stall warnings and rcutorture writer stall
+// warnings. Also used by rcutorture even if stall warnings are excluded.
+int rcu_cpu_stall_suppress_at_boot __read_mostly; // !0 = suppress boot stalls.
+EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress_at_boot);
+module_param(rcu_cpu_stall_suppress_at_boot, int, 0444);
/**
- * rcu_barrier_tasks - Wait for in-flight call_rcu_tasks() callbacks.
+ * get_completed_synchronize_rcu - Return a pre-completed polled state cookie
*
- * Although the current implementation is guaranteed to wait, it is not
- * obligated to, for example, if there are no pending callbacks.
+ * Returns a value that will always be treated by functions like
+ * poll_state_synchronize_rcu() as a cookie whose grace period has already
+ * completed.
*/
-void rcu_barrier_tasks(void)
+unsigned long get_completed_synchronize_rcu(void)
{
- /* There is only one callback queue, so this is easy. ;-) */
- synchronize_rcu_tasks();
+ return RCU_GET_STATE_COMPLETED;
}
-EXPORT_SYMBOL_GPL(rcu_barrier_tasks);
-
-/* See if tasks are still holding out, complain if so. */
-static void check_holdout_task(struct task_struct *t,
- bool needreport, bool *firstreport)
-{
- int cpu;
-
- if (!ACCESS_ONCE(t->rcu_tasks_holdout) ||
- t->rcu_tasks_nvcsw != ACCESS_ONCE(t->nvcsw) ||
- !ACCESS_ONCE(t->on_rq) ||
- (IS_ENABLED(CONFIG_NO_HZ_FULL) &&
- !is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) {
- ACCESS_ONCE(t->rcu_tasks_holdout) = false;
- list_del_init(&t->rcu_tasks_holdout_list);
- put_task_struct(t);
- return;
- }
- if (!needreport)
- return;
- if (*firstreport) {
- pr_err("INFO: rcu_tasks detected stalls on tasks:\n");
- *firstreport = false;
- }
- cpu = task_cpu(t);
- pr_alert("%p: %c%c nvcsw: %lu/%lu holdout: %d idle_cpu: %d/%d\n",
- t, ".I"[is_idle_task(t)],
- "N."[cpu < 0 || !tick_nohz_full_cpu(cpu)],
- t->rcu_tasks_nvcsw, t->nvcsw, t->rcu_tasks_holdout,
- t->rcu_tasks_idle_cpu, cpu);
- sched_show_task(t);
-}
-
-/* RCU-tasks kthread that detects grace periods and invokes callbacks. */
-static int __noreturn rcu_tasks_kthread(void *arg)
-{
- unsigned long flags;
- struct task_struct *g, *t;
- unsigned long lastreport;
- struct rcu_head *list;
- struct rcu_head *next;
- LIST_HEAD(rcu_tasks_holdouts);
-
- /* Run on housekeeping CPUs by default. Sysadm can move if desired. */
- housekeeping_affine(current);
-
- /*
- * Each pass through the following loop makes one check for
- * newly arrived callbacks, and, if there are some, waits for
- * one RCU-tasks grace period and then invokes the callbacks.
- * This loop is terminated by the system going down. ;-)
- */
- for (;;) {
-
- /* Pick up any new callbacks. */
- raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags);
- list = rcu_tasks_cbs_head;
- rcu_tasks_cbs_head = NULL;
- rcu_tasks_cbs_tail = &rcu_tasks_cbs_head;
- raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags);
-
- /* If there were none, wait a bit and start over. */
- if (!list) {
- wait_event_interruptible(rcu_tasks_cbs_wq,
- rcu_tasks_cbs_head);
- if (!rcu_tasks_cbs_head) {
- WARN_ON(signal_pending(current));
- schedule_timeout_interruptible(HZ/10);
- }
- continue;
- }
-
- /*
- * Wait for all pre-existing t->on_rq and t->nvcsw
- * transitions to complete. Invoking synchronize_sched()
- * suffices because all these transitions occur with
- * interrupts disabled. Without this synchronize_sched(),
- * a read-side critical section that started before the
- * grace period might be incorrectly seen as having started
- * after the grace period.
- *
- * This synchronize_sched() also dispenses with the
- * need for a memory barrier on the first store to
- * ->rcu_tasks_holdout, as it forces the store to happen
- * after the beginning of the grace period.
- */
- synchronize_sched();
-
- /*
- * There were callbacks, so we need to wait for an
- * RCU-tasks grace period. Start off by scanning
- * the task list for tasks that are not already
- * voluntarily blocked. Mark these tasks and make
- * a list of them in rcu_tasks_holdouts.
- */
- rcu_read_lock();
- for_each_process_thread(g, t) {
- if (t != current && ACCESS_ONCE(t->on_rq) &&
- !is_idle_task(t)) {
- get_task_struct(t);
- t->rcu_tasks_nvcsw = ACCESS_ONCE(t->nvcsw);
- ACCESS_ONCE(t->rcu_tasks_holdout) = true;
- list_add(&t->rcu_tasks_holdout_list,
- &rcu_tasks_holdouts);
- }
- }
- rcu_read_unlock();
-
- /*
- * Wait for tasks that are in the process of exiting.
- * This does only part of the job, ensuring that all
- * tasks that were previously exiting reach the point
- * where they have disabled preemption, allowing the
- * later synchronize_sched() to finish the job.
- */
- synchronize_srcu(&tasks_rcu_exit_srcu);
-
- /*
- * Each pass through the following loop scans the list
- * of holdout tasks, removing any that are no longer
- * holdouts. When the list is empty, we are done.
- */
- lastreport = jiffies;
- while (!list_empty(&rcu_tasks_holdouts)) {
- bool firstreport;
- bool needreport;
- int rtst;
- struct task_struct *t1;
-
- schedule_timeout_interruptible(HZ);
- rtst = ACCESS_ONCE(rcu_task_stall_timeout);
- needreport = rtst > 0 &&
- time_after(jiffies, lastreport + rtst);
- if (needreport)
- lastreport = jiffies;
- firstreport = true;
- WARN_ON(signal_pending(current));
- list_for_each_entry_safe(t, t1, &rcu_tasks_holdouts,
- rcu_tasks_holdout_list) {
- check_holdout_task(t, needreport, &firstreport);
- cond_resched();
- }
- }
-
- /*
- * Because ->on_rq and ->nvcsw are not guaranteed
- * to have a full memory barriers prior to them in the
- * schedule() path, memory reordering on other CPUs could
- * cause their RCU-tasks read-side critical sections to
- * extend past the end of the grace period. However,
- * because these ->nvcsw updates are carried out with
- * interrupts disabled, we can use synchronize_sched()
- * to force the needed ordering on all such CPUs.
- *
- * This synchronize_sched() also confines all
- * ->rcu_tasks_holdout accesses to be within the grace
- * period, avoiding the need for memory barriers for
- * ->rcu_tasks_holdout accesses.
- *
- * In addition, this synchronize_sched() waits for exiting
- * tasks to complete their final preempt_disable() region
- * of execution, cleaning up after the synchronize_srcu()
- * above.
- */
- synchronize_sched();
-
- /* Invoke the callbacks. */
- while (list) {
- next = list->next;
- local_bh_disable();
- list->func(list);
- local_bh_enable();
- list = next;
- cond_resched();
- }
- schedule_timeout_uninterruptible(HZ/10);
- }
-}
-
-/* Spawn rcu_tasks_kthread() at first call to call_rcu_tasks(). */
-static void rcu_spawn_tasks_kthread(void)
-{
- static DEFINE_MUTEX(rcu_tasks_kthread_mutex);
- static struct task_struct *rcu_tasks_kthread_ptr;
- struct task_struct *t;
-
- if (ACCESS_ONCE(rcu_tasks_kthread_ptr)) {
- smp_mb(); /* Ensure caller sees full kthread. */
- return;
- }
- mutex_lock(&rcu_tasks_kthread_mutex);
- if (rcu_tasks_kthread_ptr) {
- mutex_unlock(&rcu_tasks_kthread_mutex);
- return;
- }
- t = kthread_run(rcu_tasks_kthread, NULL, "rcu_tasks_kthread");
- BUG_ON(IS_ERR(t));
- smp_mb(); /* Ensure others see full kthread. */
- ACCESS_ONCE(rcu_tasks_kthread_ptr) = t;
- mutex_unlock(&rcu_tasks_kthread_mutex);
-}
-
-#endif /* #ifdef CONFIG_TASKS_RCU */
+EXPORT_SYMBOL_GPL(get_completed_synchronize_rcu);
#ifdef CONFIG_PROVE_RCU
/*
- * Early boot self test parameters, one for each flavor
+ * Early boot self test parameters.
*/
static bool rcu_self_test;
-static bool rcu_self_test_bh;
-static bool rcu_self_test_sched;
-
module_param(rcu_self_test, bool, 0444);
-module_param(rcu_self_test_bh, bool, 0444);
-module_param(rcu_self_test_sched, bool, 0444);
static int rcu_self_test_counter;
@@ -767,25 +595,28 @@ static void test_callback(struct rcu_head *r)
pr_info("RCU test callback executed %d\n", rcu_self_test_counter);
}
-static void early_boot_test_call_rcu(void)
-{
- static struct rcu_head head;
-
- call_rcu(&head, test_callback);
-}
-
-static void early_boot_test_call_rcu_bh(void)
-{
- static struct rcu_head head;
+DEFINE_STATIC_SRCU(early_srcu);
+static unsigned long early_srcu_cookie;
- call_rcu_bh(&head, test_callback);
-}
+struct early_boot_kfree_rcu {
+ struct rcu_head rh;
+};
-static void early_boot_test_call_rcu_sched(void)
+static void early_boot_test_call_rcu(void)
{
static struct rcu_head head;
+ int idx;
+ static struct rcu_head shead;
+ struct early_boot_kfree_rcu *rhp;
- call_rcu_sched(&head, test_callback);
+ idx = srcu_down_read(&early_srcu);
+ srcu_up_read(&early_srcu, idx);
+ call_rcu(&head, test_callback);
+ early_srcu_cookie = start_poll_synchronize_srcu(&early_srcu);
+ call_srcu(&early_srcu, &shead, test_callback);
+ rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
+ if (!WARN_ON_ONCE(!rhp))
+ kfree_rcu(rhp, rh);
}
void rcu_early_boot_tests(void)
@@ -794,10 +625,7 @@ void rcu_early_boot_tests(void)
if (rcu_self_test)
early_boot_test_call_rcu();
- if (rcu_self_test_bh)
- early_boot_test_call_rcu_bh();
- if (rcu_self_test_sched)
- early_boot_test_call_rcu_sched();
+ rcu_test_sync_prims();
}
static int rcu_verify_early_boot_tests(void)
@@ -808,16 +636,11 @@ static int rcu_verify_early_boot_tests(void)
if (rcu_self_test) {
early_boot_test_counter++;
rcu_barrier();
- }
- if (rcu_self_test_bh) {
- early_boot_test_counter++;
- rcu_barrier_bh();
- }
- if (rcu_self_test_sched) {
early_boot_test_counter++;
- rcu_barrier_sched();
+ srcu_barrier(&early_srcu);
+ WARN_ON_ONCE(!poll_state_synchronize_srcu(&early_srcu, early_srcu_cookie));
+ cleanup_srcu_struct(&early_srcu);
}
-
if (rcu_self_test_counter != early_boot_test_counter) {
WARN_ON(1);
ret = -1;
@@ -829,3 +652,27 @@ late_initcall(rcu_verify_early_boot_tests);
#else
void rcu_early_boot_tests(void) {}
#endif /* CONFIG_PROVE_RCU */
+
+#include "tasks.h"
+
+#ifndef CONFIG_TINY_RCU
+
+/*
+ * Print any significant non-default boot-time settings.
+ */
+void __init rcupdate_announce_bootup_oddness(void)
+{
+ if (rcu_normal)
+ pr_info("\tNo expedited grace period (rcu_normal).\n");
+ else if (rcu_normal_after_boot)
+ pr_info("\tNo expedited grace period (rcu_normal_after_boot).\n");
+ else if (rcu_expedited)
+ pr_info("\tAll grace periods are expedited (rcu_expedited).\n");
+ if (rcu_cpu_stall_suppress)
+ pr_info("\tRCU CPU stall warnings suppressed (rcu_cpu_stall_suppress).\n");
+ if (rcu_cpu_stall_timeout != CONFIG_RCU_CPU_STALL_TIMEOUT)
+ pr_info("\tRCU CPU stall warnings timeout set to %d (rcu_cpu_stall_timeout).\n", rcu_cpu_stall_timeout);
+ rcu_tasks_bootup_oddness();
+}
+
+#endif /* #ifndef CONFIG_TINY_RCU */
diff --git a/kernel/reboot.c b/kernel/reboot.c
index d20c85d9f8c0..ec087827c85c 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/kernel/reboot.c
*
@@ -6,6 +7,7 @@
#define pr_fmt(fmt) "reboot: " fmt
+#include <linux/atomic.h>
#include <linux/ctype.h>
#include <linux/export.h>
#include <linux/kexec.h>
@@ -21,16 +23,20 @@
* this indicates whether you can reboot with ctrl-alt-del: the default is yes
*/
-int C_A_D = 1;
+static int C_A_D = 1;
struct pid *cad_pid;
EXPORT_SYMBOL(cad_pid);
-#if defined(CONFIG_ARM) || defined(CONFIG_UNICORE32)
+#if defined(CONFIG_ARM)
#define DEFAULT_REBOOT_MODE = REBOOT_HARD
#else
#define DEFAULT_REBOOT_MODE
#endif
enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE;
+EXPORT_SYMBOL_GPL(reboot_mode);
+enum reboot_mode panic_reboot_mode = REBOOT_UNDEFINED;
+
+static enum hw_protection_action hw_protection_action = HWPROT_ACT_SHUTDOWN;
/*
* This variable is used privately to keep track of whether or not
@@ -44,11 +50,36 @@ int reboot_cpu;
enum reboot_type reboot_type = BOOT_ACPI;
int reboot_force;
+struct sys_off_handler {
+ struct notifier_block nb;
+ int (*sys_off_cb)(struct sys_off_data *data);
+ void *cb_data;
+ enum sys_off_mode mode;
+ bool blocking;
+ void *list;
+ struct device *dev;
+};
+
/*
- * If set, this is used for preparing the system to power off.
+ * This variable is used to indicate if a halt was initiated instead of a
+ * reboot when the reboot call was invoked with LINUX_REBOOT_CMD_POWER_OFF, but
+ * the system cannot be powered off. This allowes kernel_halt() to notify users
+ * of that.
*/
+static bool poweroff_fallback_to_halt;
-void (*pm_power_off_prepare)(void);
+/*
+ * Temporary stub that prevents linkage failure while we're in process
+ * of removing all uses of legacy pm_power_off() around the kernel.
+ */
+void __weak (*pm_power_off)(void);
+
+/*
+ * Notifier list for kernel code which wants to be called
+ * at shutdown. This is used to stop any idling DMA operations
+ * and the like.
+ */
+static BLOCKING_NOTIFIER_HEAD(reboot_notifier_list);
/**
* emergency_restart - reboot the system
@@ -61,6 +92,7 @@ void (*pm_power_off_prepare)(void);
void emergency_restart(void)
{
kmsg_dump(KMSG_DUMP_EMERG);
+ system_state = SYSTEM_RESTART;
machine_emergency_restart();
}
EXPORT_SYMBOL_GPL(emergency_restart);
@@ -104,6 +136,33 @@ int unregister_reboot_notifier(struct notifier_block *nb)
}
EXPORT_SYMBOL(unregister_reboot_notifier);
+static void devm_unregister_reboot_notifier(struct device *dev, void *res)
+{
+ WARN_ON(unregister_reboot_notifier(*(struct notifier_block **)res));
+}
+
+int devm_register_reboot_notifier(struct device *dev, struct notifier_block *nb)
+{
+ struct notifier_block **rcnb;
+ int ret;
+
+ rcnb = devres_alloc(devm_unregister_reboot_notifier,
+ sizeof(*rcnb), GFP_KERNEL);
+ if (!rcnb)
+ return -ENOMEM;
+
+ ret = register_reboot_notifier(nb);
+ if (!ret) {
+ *rcnb = nb;
+ devres_add(dev, rcnb);
+ } else {
+ devres_free(rcnb);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(devm_register_reboot_notifier);
+
/*
* Notifier list for kernel code which wants to be called
* to restart the system.
@@ -172,6 +231,9 @@ EXPORT_SYMBOL(unregister_restart_handler);
/**
* do_kernel_restart - Execute kernel restart handler call chain
*
+ * @cmd: pointer to buffer containing command to execute for restart
+ * or %NULL
+ *
* Calls functions registered with register_restart_handler.
*
* Expected to be called from machine_restart as last step of the restart
@@ -203,6 +265,17 @@ void migrate_to_reboot_cpu(void)
set_cpus_allowed_ptr(current, cpumask_of(cpu));
}
+/*
+ * Notifier list for kernel code which wants to be called
+ * to prepare system for restart.
+ */
+static BLOCKING_NOTIFIER_HEAD(restart_prep_handler_list);
+
+static void do_kernel_restart_prepare(void)
+{
+ blocking_notifier_call_chain(&restart_prep_handler_list, 0, NULL);
+}
+
/**
* kernel_restart - reboot the system
* @cmd: pointer to buffer containing command to execute for restart
@@ -214,13 +287,14 @@ void migrate_to_reboot_cpu(void)
void kernel_restart(char *cmd)
{
kernel_restart_prepare(cmd);
+ do_kernel_restart_prepare();
migrate_to_reboot_cpu();
syscore_shutdown();
if (!cmd)
pr_emerg("Restarting system\n");
else
pr_emerg("Restarting system with command '%s'\n", cmd);
- kmsg_dump(KMSG_DUMP_RESTART);
+ kmsg_dump(KMSG_DUMP_SHUTDOWN);
machine_restart(cmd);
}
EXPORT_SYMBOL_GPL(kernel_restart);
@@ -243,12 +317,386 @@ void kernel_halt(void)
kernel_shutdown_prepare(SYSTEM_HALT);
migrate_to_reboot_cpu();
syscore_shutdown();
- pr_emerg("System halted\n");
- kmsg_dump(KMSG_DUMP_HALT);
+ if (poweroff_fallback_to_halt)
+ pr_emerg("Power off not available: System halted instead\n");
+ else
+ pr_emerg("System halted\n");
+ kmsg_dump(KMSG_DUMP_SHUTDOWN);
machine_halt();
}
EXPORT_SYMBOL_GPL(kernel_halt);
+/*
+ * Notifier list for kernel code which wants to be called
+ * to prepare system for power off.
+ */
+static BLOCKING_NOTIFIER_HEAD(power_off_prep_handler_list);
+
+/*
+ * Notifier list for kernel code which wants to be called
+ * to power off system.
+ */
+static ATOMIC_NOTIFIER_HEAD(power_off_handler_list);
+
+static int sys_off_notify(struct notifier_block *nb,
+ unsigned long mode, void *cmd)
+{
+ struct sys_off_handler *handler;
+ struct sys_off_data data = {};
+
+ handler = container_of(nb, struct sys_off_handler, nb);
+ data.cb_data = handler->cb_data;
+ data.mode = mode;
+ data.cmd = cmd;
+ data.dev = handler->dev;
+
+ return handler->sys_off_cb(&data);
+}
+
+static struct sys_off_handler platform_sys_off_handler;
+
+static struct sys_off_handler *alloc_sys_off_handler(int priority)
+{
+ struct sys_off_handler *handler;
+ gfp_t flags;
+
+ /*
+ * Platforms like m68k can't allocate sys_off handler dynamically
+ * at the early boot time because memory allocator isn't available yet.
+ */
+ if (priority == SYS_OFF_PRIO_PLATFORM) {
+ handler = &platform_sys_off_handler;
+ if (handler->cb_data)
+ return ERR_PTR(-EBUSY);
+ } else {
+ if (system_state > SYSTEM_RUNNING)
+ flags = GFP_ATOMIC;
+ else
+ flags = GFP_KERNEL;
+
+ handler = kzalloc(sizeof(*handler), flags);
+ if (!handler)
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return handler;
+}
+
+static void free_sys_off_handler(struct sys_off_handler *handler)
+{
+ if (handler == &platform_sys_off_handler)
+ memset(handler, 0, sizeof(*handler));
+ else
+ kfree(handler);
+}
+
+/**
+ * register_sys_off_handler - Register sys-off handler
+ * @mode: Sys-off mode
+ * @priority: Handler priority
+ * @callback: Callback function
+ * @cb_data: Callback argument
+ *
+ * Registers system power-off or restart handler that will be invoked
+ * at the step corresponding to the given sys-off mode. Handler's callback
+ * should return NOTIFY_DONE to permit execution of the next handler in
+ * the call chain or NOTIFY_STOP to break the chain (in error case for
+ * example).
+ *
+ * Multiple handlers can be registered at the default priority level.
+ *
+ * Only one handler can be registered at the non-default priority level,
+ * otherwise ERR_PTR(-EBUSY) is returned.
+ *
+ * Returns a new instance of struct sys_off_handler on success, or
+ * an ERR_PTR()-encoded error code otherwise.
+ */
+struct sys_off_handler *
+register_sys_off_handler(enum sys_off_mode mode,
+ int priority,
+ int (*callback)(struct sys_off_data *data),
+ void *cb_data)
+{
+ struct sys_off_handler *handler;
+ int err;
+
+ handler = alloc_sys_off_handler(priority);
+ if (IS_ERR(handler))
+ return handler;
+
+ switch (mode) {
+ case SYS_OFF_MODE_POWER_OFF_PREPARE:
+ handler->list = &power_off_prep_handler_list;
+ handler->blocking = true;
+ break;
+
+ case SYS_OFF_MODE_POWER_OFF:
+ handler->list = &power_off_handler_list;
+ break;
+
+ case SYS_OFF_MODE_RESTART_PREPARE:
+ handler->list = &restart_prep_handler_list;
+ handler->blocking = true;
+ break;
+
+ case SYS_OFF_MODE_RESTART:
+ handler->list = &restart_handler_list;
+ break;
+
+ default:
+ free_sys_off_handler(handler);
+ return ERR_PTR(-EINVAL);
+ }
+
+ handler->nb.notifier_call = sys_off_notify;
+ handler->nb.priority = priority;
+ handler->sys_off_cb = callback;
+ handler->cb_data = cb_data;
+ handler->mode = mode;
+
+ if (handler->blocking) {
+ if (priority == SYS_OFF_PRIO_DEFAULT)
+ err = blocking_notifier_chain_register(handler->list,
+ &handler->nb);
+ else
+ err = blocking_notifier_chain_register_unique_prio(handler->list,
+ &handler->nb);
+ } else {
+ if (priority == SYS_OFF_PRIO_DEFAULT)
+ err = atomic_notifier_chain_register(handler->list,
+ &handler->nb);
+ else
+ err = atomic_notifier_chain_register_unique_prio(handler->list,
+ &handler->nb);
+ }
+
+ if (err) {
+ free_sys_off_handler(handler);
+ return ERR_PTR(err);
+ }
+
+ return handler;
+}
+EXPORT_SYMBOL_GPL(register_sys_off_handler);
+
+/**
+ * unregister_sys_off_handler - Unregister sys-off handler
+ * @handler: Sys-off handler
+ *
+ * Unregisters given sys-off handler.
+ */
+void unregister_sys_off_handler(struct sys_off_handler *handler)
+{
+ int err;
+
+ if (IS_ERR_OR_NULL(handler))
+ return;
+
+ if (handler->blocking)
+ err = blocking_notifier_chain_unregister(handler->list,
+ &handler->nb);
+ else
+ err = atomic_notifier_chain_unregister(handler->list,
+ &handler->nb);
+
+ /* sanity check, shall never happen */
+ WARN_ON(err);
+
+ free_sys_off_handler(handler);
+}
+EXPORT_SYMBOL_GPL(unregister_sys_off_handler);
+
+static void devm_unregister_sys_off_handler(void *data)
+{
+ struct sys_off_handler *handler = data;
+
+ unregister_sys_off_handler(handler);
+}
+
+/**
+ * devm_register_sys_off_handler - Register sys-off handler
+ * @dev: Device that registers handler
+ * @mode: Sys-off mode
+ * @priority: Handler priority
+ * @callback: Callback function
+ * @cb_data: Callback argument
+ *
+ * Registers resource-managed sys-off handler.
+ *
+ * Returns zero on success, or error code on failure.
+ */
+int devm_register_sys_off_handler(struct device *dev,
+ enum sys_off_mode mode,
+ int priority,
+ int (*callback)(struct sys_off_data *data),
+ void *cb_data)
+{
+ struct sys_off_handler *handler;
+
+ handler = register_sys_off_handler(mode, priority, callback, cb_data);
+ if (IS_ERR(handler))
+ return PTR_ERR(handler);
+ handler->dev = dev;
+
+ return devm_add_action_or_reset(dev, devm_unregister_sys_off_handler,
+ handler);
+}
+EXPORT_SYMBOL_GPL(devm_register_sys_off_handler);
+
+/**
+ * devm_register_power_off_handler - Register power-off handler
+ * @dev: Device that registers callback
+ * @callback: Callback function
+ * @cb_data: Callback's argument
+ *
+ * Registers resource-managed sys-off handler with a default priority
+ * and using power-off mode.
+ *
+ * Returns zero on success, or error code on failure.
+ */
+int devm_register_power_off_handler(struct device *dev,
+ int (*callback)(struct sys_off_data *data),
+ void *cb_data)
+{
+ return devm_register_sys_off_handler(dev,
+ SYS_OFF_MODE_POWER_OFF,
+ SYS_OFF_PRIO_DEFAULT,
+ callback, cb_data);
+}
+EXPORT_SYMBOL_GPL(devm_register_power_off_handler);
+
+/**
+ * devm_register_restart_handler - Register restart handler
+ * @dev: Device that registers callback
+ * @callback: Callback function
+ * @cb_data: Callback's argument
+ *
+ * Registers resource-managed sys-off handler with a default priority
+ * and using restart mode.
+ *
+ * Returns zero on success, or error code on failure.
+ */
+int devm_register_restart_handler(struct device *dev,
+ int (*callback)(struct sys_off_data *data),
+ void *cb_data)
+{
+ return devm_register_sys_off_handler(dev,
+ SYS_OFF_MODE_RESTART,
+ SYS_OFF_PRIO_DEFAULT,
+ callback, cb_data);
+}
+EXPORT_SYMBOL_GPL(devm_register_restart_handler);
+
+static struct sys_off_handler *platform_power_off_handler;
+
+static int platform_power_off_notify(struct sys_off_data *data)
+{
+ void (*platform_power_power_off_cb)(void) = data->cb_data;
+
+ platform_power_power_off_cb();
+
+ return NOTIFY_DONE;
+}
+
+/**
+ * register_platform_power_off - Register platform-level power-off callback
+ * @power_off: Power-off callback
+ *
+ * Registers power-off callback that will be called as last step
+ * of the power-off sequence. This callback is expected to be invoked
+ * for the last resort. Only one platform power-off callback is allowed
+ * to be registered at a time.
+ *
+ * Returns zero on success, or error code on failure.
+ */
+int register_platform_power_off(void (*power_off)(void))
+{
+ struct sys_off_handler *handler;
+
+ handler = register_sys_off_handler(SYS_OFF_MODE_POWER_OFF,
+ SYS_OFF_PRIO_PLATFORM,
+ platform_power_off_notify,
+ power_off);
+ if (IS_ERR(handler))
+ return PTR_ERR(handler);
+
+ platform_power_off_handler = handler;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(register_platform_power_off);
+
+/**
+ * unregister_platform_power_off - Unregister platform-level power-off callback
+ * @power_off: Power-off callback
+ *
+ * Unregisters previously registered platform power-off callback.
+ */
+void unregister_platform_power_off(void (*power_off)(void))
+{
+ if (platform_power_off_handler &&
+ platform_power_off_handler->cb_data == power_off) {
+ unregister_sys_off_handler(platform_power_off_handler);
+ platform_power_off_handler = NULL;
+ }
+}
+EXPORT_SYMBOL_GPL(unregister_platform_power_off);
+
+static int legacy_pm_power_off(struct sys_off_data *data)
+{
+ if (pm_power_off)
+ pm_power_off();
+
+ return NOTIFY_DONE;
+}
+
+static void do_kernel_power_off_prepare(void)
+{
+ blocking_notifier_call_chain(&power_off_prep_handler_list, 0, NULL);
+}
+
+/**
+ * do_kernel_power_off - Execute kernel power-off handler call chain
+ *
+ * Expected to be called as last step of the power-off sequence.
+ *
+ * Powers off the system immediately if a power-off handler function has
+ * been registered. Otherwise does nothing.
+ */
+void do_kernel_power_off(void)
+{
+ struct sys_off_handler *sys_off = NULL;
+
+ /*
+ * Register sys-off handlers for legacy PM callback. This allows
+ * legacy PM callbacks temporary co-exist with the new sys-off API.
+ *
+ * TODO: Remove legacy handlers once all legacy PM users will be
+ * switched to the sys-off based APIs.
+ */
+ if (pm_power_off)
+ sys_off = register_sys_off_handler(SYS_OFF_MODE_POWER_OFF,
+ SYS_OFF_PRIO_DEFAULT,
+ legacy_pm_power_off, NULL);
+
+ atomic_notifier_call_chain(&power_off_handler_list, 0, NULL);
+
+ unregister_sys_off_handler(sys_off);
+}
+
+/**
+ * kernel_can_power_off - check whether system can be powered off
+ *
+ * Returns true if power-off handler is registered and system can be
+ * powered off, false otherwise.
+ */
+bool kernel_can_power_off(void)
+{
+ return !atomic_notifier_call_chain_is_empty(&power_off_handler_list) ||
+ pm_power_off;
+}
+EXPORT_SYMBOL_GPL(kernel_can_power_off);
+
/**
* kernel_power_off - power_off the system
*
@@ -257,17 +705,17 @@ EXPORT_SYMBOL_GPL(kernel_halt);
void kernel_power_off(void)
{
kernel_shutdown_prepare(SYSTEM_POWER_OFF);
- if (pm_power_off_prepare)
- pm_power_off_prepare();
+ do_kernel_power_off_prepare();
migrate_to_reboot_cpu();
syscore_shutdown();
pr_emerg("Power down\n");
- kmsg_dump(KMSG_DUMP_POWEROFF);
+ pr_flush(1000, true);
+ kmsg_dump(KMSG_DUMP_SHUTDOWN);
machine_power_off();
}
EXPORT_SYMBOL_GPL(kernel_power_off);
-static DEFINE_MUTEX(reboot_mutex);
+DEFINE_MUTEX(system_transition_mutex);
/*
* Reboot system call: for obvious reasons only root may call it,
@@ -308,10 +756,12 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
/* Instead of trying to make the power_off code look like
* halt when pm_power_off is not set do it the easy way.
*/
- if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off)
+ if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !kernel_can_power_off()) {
+ poweroff_fallback_to_halt = true;
cmd = LINUX_REBOOT_CMD_HALT;
+ }
- mutex_lock(&reboot_mutex);
+ mutex_lock(&system_transition_mutex);
switch (cmd) {
case LINUX_REBOOT_CMD_RESTART:
kernel_restart(NULL);
@@ -328,7 +778,6 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
case LINUX_REBOOT_CMD_HALT:
kernel_halt();
do_exit(0);
- panic("cannot halt");
case LINUX_REBOOT_CMD_POWER_OFF:
kernel_power_off();
@@ -346,7 +795,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
kernel_restart(buffer);
break;
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
case LINUX_REBOOT_CMD_KEXEC:
ret = kernel_kexec();
break;
@@ -362,7 +811,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
ret = -EINVAL;
break;
}
- mutex_unlock(&reboot_mutex);
+ mutex_unlock(&system_transition_mutex);
return ret;
}
@@ -386,7 +835,8 @@ void ctrl_alt_del(void)
kill_cad_pid(SIGINT, 1);
}
-char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
+#define POWEROFF_CMD_PATH_LEN 256
+static char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
static const char reboot_cmd[] = "/sbin/reboot";
static int run_cmd(const char *cmd)
@@ -488,9 +938,167 @@ void orderly_reboot(void)
}
EXPORT_SYMBOL_GPL(orderly_reboot);
+static const char *hw_protection_action_str(enum hw_protection_action action)
+{
+ switch (action) {
+ case HWPROT_ACT_SHUTDOWN:
+ return "shutdown";
+ case HWPROT_ACT_REBOOT:
+ return "reboot";
+ default:
+ return "undefined";
+ }
+}
+
+static enum hw_protection_action hw_failure_emergency_action;
+
+/**
+ * hw_failure_emergency_action_func - emergency action work after a known delay
+ * @work: work_struct associated with the emergency action function
+ *
+ * This function is called in very critical situations to force
+ * a kernel poweroff or reboot after a configurable timeout value.
+ */
+static void hw_failure_emergency_action_func(struct work_struct *work)
+{
+ const char *action_str = hw_protection_action_str(hw_failure_emergency_action);
+
+ pr_emerg("Hardware protection timed-out. Trying forced %s\n",
+ action_str);
+
+ /*
+ * We have reached here after the emergency action waiting period has
+ * expired. This means orderly_poweroff/reboot has not been able to
+ * shut off the system for some reason.
+ *
+ * Try to shut off the system immediately if possible
+ */
+
+ if (hw_failure_emergency_action == HWPROT_ACT_REBOOT)
+ kernel_restart(NULL);
+ else
+ kernel_power_off();
+
+ /*
+ * Worst of the worst case trigger emergency restart
+ */
+ pr_emerg("Hardware protection %s failed. Trying emergency restart\n",
+ action_str);
+ emergency_restart();
+}
+
+static DECLARE_DELAYED_WORK(hw_failure_emergency_action_work,
+ hw_failure_emergency_action_func);
+
+/**
+ * hw_failure_emergency_schedule - Schedule an emergency system shutdown or reboot
+ *
+ * @action: The hardware protection action to be taken
+ * @action_delay_ms: Time in milliseconds to elapse before triggering action
+ *
+ * This may be called from any critical situation to trigger a system shutdown
+ * or reboot after a given period of time.
+ * If time is negative this is not scheduled.
+ */
+static void hw_failure_emergency_schedule(enum hw_protection_action action,
+ int action_delay_ms)
+{
+ if (action_delay_ms <= 0)
+ return;
+ hw_failure_emergency_action = action;
+ schedule_delayed_work(&hw_failure_emergency_action_work,
+ msecs_to_jiffies(action_delay_ms));
+}
+
+/**
+ * __hw_protection_trigger - Trigger an emergency system shutdown or reboot
+ *
+ * @reason: Reason of emergency shutdown or reboot to be printed.
+ * @ms_until_forced: Time to wait for orderly shutdown or reboot before
+ * triggering it. Negative value disables the forced
+ * shutdown or reboot.
+ * @action: The hardware protection action to be taken.
+ *
+ * Initiate an emergency system shutdown or reboot in order to protect
+ * hardware from further damage. Usage examples include a thermal protection.
+ * NOTE: The request is ignored if protection shutdown or reboot is already
+ * pending even if the previous request has given a large timeout for forced
+ * shutdown/reboot.
+ */
+void __hw_protection_trigger(const char *reason, int ms_until_forced,
+ enum hw_protection_action action)
+{
+ static atomic_t allow_proceed = ATOMIC_INIT(1);
+
+ if (action == HWPROT_ACT_DEFAULT)
+ action = hw_protection_action;
+
+ pr_emerg("HARDWARE PROTECTION %s (%s)\n",
+ hw_protection_action_str(action), reason);
+
+ /* Shutdown should be initiated only once. */
+ if (!atomic_dec_and_test(&allow_proceed))
+ return;
+
+ /*
+ * Queue a backup emergency shutdown in the event of
+ * orderly_poweroff failure
+ */
+ hw_failure_emergency_schedule(action, ms_until_forced);
+ if (action == HWPROT_ACT_REBOOT)
+ orderly_reboot();
+ else
+ orderly_poweroff(true);
+}
+EXPORT_SYMBOL_GPL(__hw_protection_trigger);
+
+static bool hw_protection_action_parse(const char *str,
+ enum hw_protection_action *action)
+{
+ if (sysfs_streq(str, "shutdown"))
+ *action = HWPROT_ACT_SHUTDOWN;
+ else if (sysfs_streq(str, "reboot"))
+ *action = HWPROT_ACT_REBOOT;
+ else
+ return false;
+
+ return true;
+}
+
+static int __init hw_protection_setup(char *str)
+{
+ hw_protection_action_parse(str, &hw_protection_action);
+ return 1;
+}
+__setup("hw_protection=", hw_protection_setup);
+
+#ifdef CONFIG_SYSFS
+static ssize_t hw_protection_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%s\n",
+ hw_protection_action_str(hw_protection_action));
+}
+static ssize_t hw_protection_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf,
+ size_t count)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!hw_protection_action_parse(buf, &hw_protection_action))
+ return -EINVAL;
+
+ return count;
+}
+static struct kobj_attribute hw_protection_attr = __ATTR_RW(hw_protection);
+#endif
+
static int __init reboot_setup(char *str)
{
for (;;) {
+ enum reboot_mode *mode;
+
/*
* Having anything passed on the command line via
* reboot= will cause us to disable DMI checking
@@ -498,38 +1106,49 @@ static int __init reboot_setup(char *str)
*/
reboot_default = 0;
+ if (!strncmp(str, "panic_", 6)) {
+ mode = &panic_reboot_mode;
+ str += 6;
+ } else {
+ mode = &reboot_mode;
+ }
+
switch (*str) {
case 'w':
- reboot_mode = REBOOT_WARM;
+ *mode = REBOOT_WARM;
break;
case 'c':
- reboot_mode = REBOOT_COLD;
+ *mode = REBOOT_COLD;
break;
case 'h':
- reboot_mode = REBOOT_HARD;
+ *mode = REBOOT_HARD;
break;
case 's':
- {
- int rc;
-
- if (isdigit(*(str+1))) {
- rc = kstrtoint(str+1, 0, &reboot_cpu);
- if (rc)
- return rc;
- } else if (str[1] == 'm' && str[2] == 'p' &&
- isdigit(*(str+3))) {
- rc = kstrtoint(str+3, 0, &reboot_cpu);
- if (rc)
- return rc;
+ /*
+ * reboot_cpu is s[mp]#### with #### being the processor
+ * to be used for rebooting. Skip 's' or 'smp' prefix.
+ */
+ str += str[1] == 'm' && str[2] == 'p' ? 3 : 1;
+
+ if (isdigit(str[0])) {
+ int cpu = simple_strtoul(str, NULL, 0);
+
+ if (cpu >= num_possible_cpus()) {
+ pr_err("Ignoring the CPU number in reboot= option. "
+ "CPU %d exceeds possible cpu number %d\n",
+ cpu, num_possible_cpus());
+ break;
+ }
+ reboot_cpu = cpu;
} else
- reboot_mode = REBOOT_SOFT;
+ *mode = REBOOT_SOFT;
break;
- }
+
case 'g':
- reboot_mode = REBOOT_GPIO;
+ *mode = REBOOT_GPIO;
break;
case 'b':
@@ -555,3 +1174,246 @@ static int __init reboot_setup(char *str)
return 1;
}
__setup("reboot=", reboot_setup);
+
+#ifdef CONFIG_SYSFS
+
+#define REBOOT_COLD_STR "cold"
+#define REBOOT_WARM_STR "warm"
+#define REBOOT_HARD_STR "hard"
+#define REBOOT_SOFT_STR "soft"
+#define REBOOT_GPIO_STR "gpio"
+#define REBOOT_UNDEFINED_STR "undefined"
+
+#define BOOT_TRIPLE_STR "triple"
+#define BOOT_KBD_STR "kbd"
+#define BOOT_BIOS_STR "bios"
+#define BOOT_ACPI_STR "acpi"
+#define BOOT_EFI_STR "efi"
+#define BOOT_PCI_STR "pci"
+
+static ssize_t mode_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+ const char *val;
+
+ switch (reboot_mode) {
+ case REBOOT_COLD:
+ val = REBOOT_COLD_STR;
+ break;
+ case REBOOT_WARM:
+ val = REBOOT_WARM_STR;
+ break;
+ case REBOOT_HARD:
+ val = REBOOT_HARD_STR;
+ break;
+ case REBOOT_SOFT:
+ val = REBOOT_SOFT_STR;
+ break;
+ case REBOOT_GPIO:
+ val = REBOOT_GPIO_STR;
+ break;
+ default:
+ val = REBOOT_UNDEFINED_STR;
+ }
+
+ return sysfs_emit(buf, "%s\n", val);
+}
+static ssize_t mode_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ if (!capable(CAP_SYS_BOOT))
+ return -EPERM;
+
+ if (!strncmp(buf, REBOOT_COLD_STR, strlen(REBOOT_COLD_STR)))
+ reboot_mode = REBOOT_COLD;
+ else if (!strncmp(buf, REBOOT_WARM_STR, strlen(REBOOT_WARM_STR)))
+ reboot_mode = REBOOT_WARM;
+ else if (!strncmp(buf, REBOOT_HARD_STR, strlen(REBOOT_HARD_STR)))
+ reboot_mode = REBOOT_HARD;
+ else if (!strncmp(buf, REBOOT_SOFT_STR, strlen(REBOOT_SOFT_STR)))
+ reboot_mode = REBOOT_SOFT;
+ else if (!strncmp(buf, REBOOT_GPIO_STR, strlen(REBOOT_GPIO_STR)))
+ reboot_mode = REBOOT_GPIO;
+ else
+ return -EINVAL;
+
+ reboot_default = 0;
+
+ return count;
+}
+static struct kobj_attribute reboot_mode_attr = __ATTR_RW(mode);
+
+#ifdef CONFIG_X86
+static ssize_t force_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", reboot_force);
+}
+static ssize_t force_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ bool res;
+
+ if (!capable(CAP_SYS_BOOT))
+ return -EPERM;
+
+ if (kstrtobool(buf, &res))
+ return -EINVAL;
+
+ reboot_default = 0;
+ reboot_force = res;
+
+ return count;
+}
+static struct kobj_attribute reboot_force_attr = __ATTR_RW(force);
+
+static ssize_t type_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+ const char *val;
+
+ switch (reboot_type) {
+ case BOOT_TRIPLE:
+ val = BOOT_TRIPLE_STR;
+ break;
+ case BOOT_KBD:
+ val = BOOT_KBD_STR;
+ break;
+ case BOOT_BIOS:
+ val = BOOT_BIOS_STR;
+ break;
+ case BOOT_ACPI:
+ val = BOOT_ACPI_STR;
+ break;
+ case BOOT_EFI:
+ val = BOOT_EFI_STR;
+ break;
+ case BOOT_CF9_FORCE:
+ val = BOOT_PCI_STR;
+ break;
+ default:
+ val = REBOOT_UNDEFINED_STR;
+ }
+
+ return sysfs_emit(buf, "%s\n", val);
+}
+static ssize_t type_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ if (!capable(CAP_SYS_BOOT))
+ return -EPERM;
+
+ if (!strncmp(buf, BOOT_TRIPLE_STR, strlen(BOOT_TRIPLE_STR)))
+ reboot_type = BOOT_TRIPLE;
+ else if (!strncmp(buf, BOOT_KBD_STR, strlen(BOOT_KBD_STR)))
+ reboot_type = BOOT_KBD;
+ else if (!strncmp(buf, BOOT_BIOS_STR, strlen(BOOT_BIOS_STR)))
+ reboot_type = BOOT_BIOS;
+ else if (!strncmp(buf, BOOT_ACPI_STR, strlen(BOOT_ACPI_STR)))
+ reboot_type = BOOT_ACPI;
+ else if (!strncmp(buf, BOOT_EFI_STR, strlen(BOOT_EFI_STR)))
+ reboot_type = BOOT_EFI;
+ else if (!strncmp(buf, BOOT_PCI_STR, strlen(BOOT_PCI_STR)))
+ reboot_type = BOOT_CF9_FORCE;
+ else
+ return -EINVAL;
+
+ reboot_default = 0;
+
+ return count;
+}
+static struct kobj_attribute reboot_type_attr = __ATTR_RW(type);
+#endif
+
+#ifdef CONFIG_SMP
+static ssize_t cpu_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", reboot_cpu);
+}
+static ssize_t cpu_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned int cpunum;
+ int rc;
+
+ if (!capable(CAP_SYS_BOOT))
+ return -EPERM;
+
+ rc = kstrtouint(buf, 0, &cpunum);
+
+ if (rc)
+ return rc;
+
+ if (cpunum >= num_possible_cpus())
+ return -ERANGE;
+
+ reboot_default = 0;
+ reboot_cpu = cpunum;
+
+ return count;
+}
+static struct kobj_attribute reboot_cpu_attr = __ATTR_RW(cpu);
+#endif
+
+static struct attribute *reboot_attrs[] = {
+ &hw_protection_attr.attr,
+ &reboot_mode_attr.attr,
+#ifdef CONFIG_X86
+ &reboot_force_attr.attr,
+ &reboot_type_attr.attr,
+#endif
+#ifdef CONFIG_SMP
+ &reboot_cpu_attr.attr,
+#endif
+ NULL,
+};
+
+#ifdef CONFIG_SYSCTL
+static const struct ctl_table kern_reboot_table[] = {
+ {
+ .procname = "poweroff_cmd",
+ .data = &poweroff_cmd,
+ .maxlen = POWEROFF_CMD_PATH_LEN,
+ .mode = 0644,
+ .proc_handler = proc_dostring,
+ },
+ {
+ .procname = "ctrl-alt-del",
+ .data = &C_A_D,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+};
+
+static void __init kernel_reboot_sysctls_init(void)
+{
+ register_sysctl_init("kernel", kern_reboot_table);
+}
+#else
+#define kernel_reboot_sysctls_init() do { } while (0)
+#endif /* CONFIG_SYSCTL */
+
+static const struct attribute_group reboot_attr_group = {
+ .attrs = reboot_attrs,
+};
+
+static int __init reboot_ksysfs_init(void)
+{
+ struct kobject *reboot_kobj;
+ int ret;
+
+ reboot_kobj = kobject_create_and_add("reboot", kernel_kobj);
+ if (!reboot_kobj)
+ return -ENOMEM;
+
+ ret = sysfs_create_group(reboot_kobj, &reboot_attr_group);
+ if (ret) {
+ kobject_put(reboot_kobj);
+ return ret;
+ }
+
+ kernel_reboot_sysctls_init();
+
+ return 0;
+}
+late_initcall(reboot_ksysfs_init);
+
+#endif
diff --git a/kernel/regset.c b/kernel/regset.c
new file mode 100644
index 000000000000..b2871fa68b2a
--- /dev/null
+++ b/kernel/regset.c
@@ -0,0 +1,76 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/export.h>
+#include <linux/slab.h>
+#include <linux/regset.h>
+
+static int __regset_get(struct task_struct *target,
+ const struct user_regset *regset,
+ unsigned int size,
+ void **data)
+{
+ void *p = *data, *to_free = NULL;
+ int res;
+
+ if (!regset->regset_get)
+ return -EOPNOTSUPP;
+ if (size > regset->n * regset->size)
+ size = regset->n * regset->size;
+ if (!p) {
+ to_free = p = kvzalloc(size, GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+ }
+ res = regset->regset_get(target, regset,
+ (struct membuf){.p = p, .left = size});
+ if (res < 0) {
+ kvfree(to_free);
+ return res;
+ }
+ *data = p;
+ return size - res;
+}
+
+int regset_get(struct task_struct *target,
+ const struct user_regset *regset,
+ unsigned int size,
+ void *data)
+{
+ return __regset_get(target, regset, size, &data);
+}
+EXPORT_SYMBOL(regset_get);
+
+int regset_get_alloc(struct task_struct *target,
+ const struct user_regset *regset,
+ unsigned int size,
+ void **data)
+{
+ *data = NULL;
+ return __regset_get(target, regset, size, data);
+}
+EXPORT_SYMBOL(regset_get_alloc);
+
+/**
+ * copy_regset_to_user - fetch a thread's user_regset data into user memory
+ * @target: thread to be examined
+ * @view: &struct user_regset_view describing user thread machine state
+ * @setno: index in @view->regsets
+ * @offset: offset into the regset data, in bytes
+ * @size: amount of data to copy, in bytes
+ * @data: user-mode pointer to copy into
+ */
+int copy_regset_to_user(struct task_struct *target,
+ const struct user_regset_view *view,
+ unsigned int setno,
+ unsigned int offset, unsigned int size,
+ void __user *data)
+{
+ const struct user_regset *regset = &view->regsets[setno];
+ void *buf;
+ int ret;
+
+ ret = regset_get_alloc(target, regset, size, &buf);
+ if (ret > 0)
+ ret = copy_to_user(data, buf, ret) ? -EFAULT : 0;
+ kvfree(buf);
+ return ret;
+}
diff --git a/kernel/relay.c b/kernel/relay.c
index e9dbaeb8fd65..e36f6b926f7f 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1,7 +1,7 @@
/*
* Public API and common code for kernel->userspace relay file support.
*
- * See Documentation/filesystems/relay.txt for an overview.
+ * See Documentation/filesystems/relay.rst for an overview.
*
* Copyright (C) 2002-2005 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp
* Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com)
@@ -28,21 +28,12 @@ static DEFINE_MUTEX(relay_channels_mutex);
static LIST_HEAD(relay_channels);
/*
- * close() vm_op implementation for relay file mapping.
- */
-static void relay_file_mmap_close(struct vm_area_struct *vma)
-{
- struct rchan_buf *buf = vma->vm_private_data;
- buf->chan->cb->buf_unmapped(buf, vma->vm_file);
-}
-
-/*
* fault() vm_op implementation for relay file mapping.
*/
-static int relay_buf_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static vm_fault_t relay_buf_fault(struct vm_fault *vmf)
{
struct page *page;
- struct rchan_buf *buf = vma->vm_private_data;
+ struct rchan_buf *buf = vmf->vma->vm_private_data;
pgoff_t pgoff = vmf->pgoff;
if (!buf)
@@ -62,7 +53,6 @@ static int relay_buf_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
*/
static const struct vm_operations_struct relay_file_mmap_ops = {
.fault = relay_buf_fault,
- .close = relay_file_mmap_close,
};
/*
@@ -70,10 +60,7 @@ static const struct vm_operations_struct relay_file_mmap_ops = {
*/
static struct page **relay_alloc_page_array(unsigned int n_pages)
{
- const size_t pa_size = n_pages * sizeof(struct page *);
- if (pa_size > PAGE_SIZE)
- return vzalloc(pa_size);
- return kzalloc(pa_size, GFP_KERNEL);
+ return kvcalloc(n_pages, sizeof(struct page *), GFP_KERNEL);
}
/*
@@ -81,25 +68,22 @@ static struct page **relay_alloc_page_array(unsigned int n_pages)
*/
static void relay_free_page_array(struct page **array)
{
- if (is_vmalloc_addr(array))
- vfree(array);
- else
- kfree(array);
+ kvfree(array);
}
/**
- * relay_mmap_buf: - mmap channel buffer to process address space
- * @buf: relay channel buffer
- * @vma: vm_area_struct describing memory to be mapped
+ * relay_mmap_prepare_buf: - mmap channel buffer to process address space
+ * @buf: the relay channel buffer
+ * @desc: describing what to map
*
* Returns 0 if ok, negative on error
*
- * Caller should already have grabbed mmap_sem.
+ * Caller should already have grabbed mmap_lock.
*/
-static int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma)
+static int relay_mmap_prepare_buf(struct rchan_buf *buf,
+ struct vm_area_desc *desc)
{
- unsigned long length = vma->vm_end - vma->vm_start;
- struct file *filp = vma->vm_file;
+ unsigned long length = vma_desc_size(desc);
if (!buf)
return -EBADF;
@@ -107,10 +91,9 @@ static int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma)
if (length != (unsigned long)buf->chan->alloc_size)
return -EINVAL;
- vma->vm_ops = &relay_file_mmap_ops;
- vma->vm_flags |= VM_DONTEXPAND;
- vma->vm_private_data = buf;
- buf->chan->cb->buf_mapped(buf, filp);
+ desc->vm_ops = &relay_file_mmap_ops;
+ desc->vm_flags |= VM_DONTEXPAND;
+ desc->private_data = buf;
return 0;
}
@@ -136,7 +119,7 @@ static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size)
return NULL;
for (i = 0; i < n_pages; i++) {
- buf->page_array[i] = alloc_page(GFP_KERNEL);
+ buf->page_array[i] = alloc_page(GFP_KERNEL | __GFP_ZERO);
if (unlikely(!buf->page_array[i]))
goto depopulate;
set_page_private(buf->page_array[i], (unsigned long)buf);
@@ -145,7 +128,6 @@ static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size)
if (!mem)
goto depopulate;
- memset(mem, 0, *size);
buf->page_count = n_pages;
return mem;
@@ -166,13 +148,14 @@ static struct rchan_buf *relay_create_buf(struct rchan *chan)
{
struct rchan_buf *buf;
- if (chan->n_subbufs > UINT_MAX / sizeof(size_t *))
+ if (chan->n_subbufs > KMALLOC_MAX_SIZE / sizeof(size_t))
return NULL;
buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL);
if (!buf)
return NULL;
- buf->padding = kmalloc(chan->n_subbufs * sizeof(size_t *), GFP_KERNEL);
+ buf->padding = kmalloc_array(chan->n_subbufs, sizeof(size_t),
+ GFP_KERNEL);
if (!buf->padding)
goto free_buf;
@@ -199,6 +182,7 @@ free_buf:
static void relay_destroy_channel(struct kref *kref)
{
struct rchan *chan = container_of(kref, struct rchan, kref);
+ free_percpu(chan->buf);
kfree(chan);
}
@@ -217,7 +201,7 @@ static void relay_destroy_buf(struct rchan_buf *buf)
__free_page(buf->page_array[i]);
relay_free_page_array(buf->page_array);
}
- chan->buf[buf->cpu] = NULL;
+ *per_cpu_ptr(chan->buf, buf->cpu) = NULL;
kfree(buf->padding);
kfree(buf);
kref_put(&chan->kref, relay_destroy_channel);
@@ -265,79 +249,32 @@ EXPORT_SYMBOL_GPL(relay_buf_full);
* High-level relay kernel API and associated functions.
*/
-/*
- * rchan_callback implementations defining default channel behavior. Used
- * in place of corresponding NULL values in client callback struct.
- */
-
-/*
- * subbuf_start() default callback. Does nothing.
- */
-static int subbuf_start_default_callback (struct rchan_buf *buf,
- void *subbuf,
- void *prev_subbuf,
- size_t prev_padding)
+static int relay_subbuf_start(struct rchan_buf *buf, void *subbuf,
+ void *prev_subbuf)
{
- if (relay_buf_full(buf))
- return 0;
+ int full = relay_buf_full(buf);
- return 1;
-}
+ if (full)
+ buf->stats.full_count++;
-/*
- * buf_mapped() default callback. Does nothing.
- */
-static void buf_mapped_default_callback(struct rchan_buf *buf,
- struct file *filp)
-{
-}
+ if (!buf->chan->cb->subbuf_start)
+ return !full;
-/*
- * buf_unmapped() default callback. Does nothing.
- */
-static void buf_unmapped_default_callback(struct rchan_buf *buf,
- struct file *filp)
-{
+ return buf->chan->cb->subbuf_start(buf, subbuf,
+ prev_subbuf);
}
-/*
- * create_buf_file_create() default callback. Does nothing.
- */
-static struct dentry *create_buf_file_default_callback(const char *filename,
- struct dentry *parent,
- umode_t mode,
- struct rchan_buf *buf,
- int *is_global)
-{
- return NULL;
-}
-
-/*
- * remove_buf_file() default callback. Does nothing.
- */
-static int remove_buf_file_default_callback(struct dentry *dentry)
-{
- return -EINVAL;
-}
-
-/* relay channel default callbacks */
-static struct rchan_callbacks default_channel_callbacks = {
- .subbuf_start = subbuf_start_default_callback,
- .buf_mapped = buf_mapped_default_callback,
- .buf_unmapped = buf_unmapped_default_callback,
- .create_buf_file = create_buf_file_default_callback,
- .remove_buf_file = remove_buf_file_default_callback,
-};
-
/**
* wakeup_readers - wake up readers waiting on a channel
- * @data: contains the channel buffer
+ * @work: contains the channel buffer
*
- * This is the timer function used to defer reader waking.
+ * This is the function used to defer reader waking
*/
-static void wakeup_readers(unsigned long data)
+static void wakeup_readers(struct irq_work *work)
{
- struct rchan_buf *buf = (struct rchan_buf *)data;
+ struct rchan_buf *buf;
+
+ buf = container_of(work, struct rchan_buf, wakeup_work);
wake_up_interruptible(&buf->read_wait);
}
@@ -355,9 +292,10 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init)
if (init) {
init_waitqueue_head(&buf->read_wait);
kref_init(&buf->kref);
- setup_timer(&buf->timer, wakeup_readers, (unsigned long)buf);
- } else
- del_timer_sync(&buf->timer);
+ init_irq_work(&buf->wakeup_work, wakeup_readers);
+ } else {
+ irq_work_sync(&buf->wakeup_work);
+ }
buf->subbufs_produced = 0;
buf->subbufs_consumed = 0;
@@ -365,11 +303,13 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init)
buf->finalized = 0;
buf->data = buf->start;
buf->offset = 0;
+ buf->stats.full_count = 0;
+ buf->stats.big_count = 0;
for (i = 0; i < buf->chan->n_subbufs; i++)
buf->padding[i] = 0;
- buf->chan->cb->subbuf_start(buf, buf->data, NULL, 0);
+ relay_subbuf_start(buf, buf->data, NULL);
}
/**
@@ -385,20 +325,21 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init)
*/
void relay_reset(struct rchan *chan)
{
+ struct rchan_buf *buf;
unsigned int i;
if (!chan)
return;
- if (chan->is_global && chan->buf[0]) {
- __relay_reset(chan->buf[0], 0);
+ if (chan->is_global && (buf = *per_cpu_ptr(chan->buf, 0))) {
+ __relay_reset(buf, 0);
return;
}
mutex_lock(&relay_channels_mutex);
for_each_possible_cpu(i)
- if (chan->buf[i])
- __relay_reset(chan->buf[i], 0);
+ if ((buf = *per_cpu_ptr(chan->buf, i)))
+ __relay_reset(buf, 0);
mutex_unlock(&relay_channels_mutex);
}
EXPORT_SYMBOL_GPL(relay_reset);
@@ -417,15 +358,16 @@ static struct dentry *relay_create_buf_file(struct rchan *chan,
struct dentry *dentry;
char *tmpname;
- tmpname = kzalloc(NAME_MAX + 1, GFP_KERNEL);
+ tmpname = kasprintf(GFP_KERNEL, "%s%d", chan->base_filename, cpu);
if (!tmpname)
return NULL;
- snprintf(tmpname, NAME_MAX, "%s%d", chan->base_filename, cpu);
/* Create file in fs */
dentry = chan->cb->create_buf_file(tmpname, chan->parent,
S_IRUSR, buf,
&chan->is_global);
+ if (IS_ERR(dentry))
+ dentry = NULL;
kfree(tmpname);
@@ -439,11 +381,11 @@ static struct dentry *relay_create_buf_file(struct rchan *chan,
*/
static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu)
{
- struct rchan_buf *buf = NULL;
+ struct rchan_buf *buf;
struct dentry *dentry;
if (chan->is_global)
- return chan->buf[0];
+ return *per_cpu_ptr(chan->buf, 0);
buf = relay_create_buf(chan);
if (!buf)
@@ -454,13 +396,20 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu)
if (!dentry)
goto free_buf;
relay_set_buf_dentry(buf, dentry);
+ } else {
+ /* Only retrieve global info, nothing more, nothing less */
+ dentry = chan->cb->create_buf_file(NULL, NULL,
+ S_IRUSR, buf,
+ &chan->is_global);
+ if (IS_ERR_OR_NULL(dentry))
+ goto free_buf;
}
buf->cpu = cpu;
__relay_reset(buf, 1);
if(chan->is_global) {
- chan->buf[0] = buf;
+ *per_cpu_ptr(chan->buf, 0) = buf;
buf->cpu = 0;
}
@@ -482,77 +431,35 @@ free_buf:
static void relay_close_buf(struct rchan_buf *buf)
{
buf->finalized = 1;
- del_timer_sync(&buf->timer);
+ irq_work_sync(&buf->wakeup_work);
buf->chan->cb->remove_buf_file(buf->dentry);
kref_put(&buf->kref, relay_remove_buf);
}
-static void setup_callbacks(struct rchan *chan,
- struct rchan_callbacks *cb)
+int relay_prepare_cpu(unsigned int cpu)
{
- if (!cb) {
- chan->cb = &default_channel_callbacks;
- return;
- }
-
- if (!cb->subbuf_start)
- cb->subbuf_start = subbuf_start_default_callback;
- if (!cb->buf_mapped)
- cb->buf_mapped = buf_mapped_default_callback;
- if (!cb->buf_unmapped)
- cb->buf_unmapped = buf_unmapped_default_callback;
- if (!cb->create_buf_file)
- cb->create_buf_file = create_buf_file_default_callback;
- if (!cb->remove_buf_file)
- cb->remove_buf_file = remove_buf_file_default_callback;
- chan->cb = cb;
-}
-
-/**
- * relay_hotcpu_callback - CPU hotplug callback
- * @nb: notifier block
- * @action: hotplug action to take
- * @hcpu: CPU number
- *
- * Returns the success/failure of the operation. (%NOTIFY_OK, %NOTIFY_BAD)
- */
-static int relay_hotcpu_callback(struct notifier_block *nb,
- unsigned long action,
- void *hcpu)
-{
- unsigned int hotcpu = (unsigned long)hcpu;
struct rchan *chan;
+ struct rchan_buf *buf;
- switch(action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
- mutex_lock(&relay_channels_mutex);
- list_for_each_entry(chan, &relay_channels, list) {
- if (chan->buf[hotcpu])
- continue;
- chan->buf[hotcpu] = relay_open_buf(chan, hotcpu);
- if(!chan->buf[hotcpu]) {
- printk(KERN_ERR
- "relay_hotcpu_callback: cpu %d buffer "
- "creation failed\n", hotcpu);
- mutex_unlock(&relay_channels_mutex);
- return notifier_from_errno(-ENOMEM);
- }
+ mutex_lock(&relay_channels_mutex);
+ list_for_each_entry(chan, &relay_channels, list) {
+ if (*per_cpu_ptr(chan->buf, cpu))
+ continue;
+ buf = relay_open_buf(chan, cpu);
+ if (!buf) {
+ pr_err("relay: cpu %d buffer creation failed\n", cpu);
+ mutex_unlock(&relay_channels_mutex);
+ return -ENOMEM;
}
- mutex_unlock(&relay_channels_mutex);
- break;
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- /* No need to flush the cpu : will be flushed upon
- * final relay_flush() call. */
- break;
+ *per_cpu_ptr(chan->buf, cpu) = buf;
}
- return NOTIFY_OK;
+ mutex_unlock(&relay_channels_mutex);
+ return 0;
}
/**
* relay_open - create a new relay channel
- * @base_filename: base name of files to create, %NULL for buffering only
+ * @base_filename: base name of files to create
* @parent: dentry of parent directory, %NULL for root directory or buffer
* @subbuf_size: size of sub-buffers
* @n_subbufs: number of sub-buffers
@@ -570,21 +477,30 @@ struct rchan *relay_open(const char *base_filename,
struct dentry *parent,
size_t subbuf_size,
size_t n_subbufs,
- struct rchan_callbacks *cb,
+ const struct rchan_callbacks *cb,
void *private_data)
{
unsigned int i;
struct rchan *chan;
+ struct rchan_buf *buf;
if (!(subbuf_size && n_subbufs))
return NULL;
if (subbuf_size > UINT_MAX / n_subbufs)
return NULL;
+ if (!cb || !cb->create_buf_file || !cb->remove_buf_file)
+ return NULL;
chan = kzalloc(sizeof(struct rchan), GFP_KERNEL);
if (!chan)
return NULL;
+ chan->buf = alloc_percpu(struct rchan_buf *);
+ if (!chan->buf) {
+ kfree(chan);
+ return NULL;
+ }
+
chan->version = RELAYFS_CHANNEL_VERSION;
chan->n_subbufs = n_subbufs;
chan->subbuf_size = subbuf_size;
@@ -593,16 +509,17 @@ struct rchan *relay_open(const char *base_filename,
chan->private_data = private_data;
if (base_filename) {
chan->has_base_filename = 1;
- strlcpy(chan->base_filename, base_filename, NAME_MAX);
+ strscpy(chan->base_filename, base_filename, NAME_MAX);
}
- setup_callbacks(chan, cb);
+ chan->cb = cb;
kref_init(&chan->kref);
mutex_lock(&relay_channels_mutex);
for_each_online_cpu(i) {
- chan->buf[i] = relay_open_buf(chan, i);
- if (!chan->buf[i])
+ buf = relay_open_buf(chan, i);
+ if (!buf)
goto free_bufs;
+ *per_cpu_ptr(chan->buf, i) = buf;
}
list_add(&chan->list, &relay_channels);
mutex_unlock(&relay_channels_mutex);
@@ -611,8 +528,8 @@ struct rchan *relay_open(const char *base_filename,
free_bufs:
for_each_possible_cpu(i) {
- if (chan->buf[i])
- relay_close_buf(chan->buf[i]);
+ if ((buf = *per_cpu_ptr(chan->buf, i)))
+ relay_close_buf(buf);
}
kref_put(&chan->kref, relay_destroy_channel);
@@ -626,89 +543,6 @@ struct rchan_percpu_buf_dispatcher {
struct dentry *dentry;
};
-/* Called in atomic context. */
-static void __relay_set_buf_dentry(void *info)
-{
- struct rchan_percpu_buf_dispatcher *p = info;
-
- relay_set_buf_dentry(p->buf, p->dentry);
-}
-
-/**
- * relay_late_setup_files - triggers file creation
- * @chan: channel to operate on
- * @base_filename: base name of files to create
- * @parent: dentry of parent directory, %NULL for root directory
- *
- * Returns 0 if successful, non-zero otherwise.
- *
- * Use to setup files for a previously buffer-only channel.
- * Useful to do early tracing in kernel, before VFS is up, for example.
- */
-int relay_late_setup_files(struct rchan *chan,
- const char *base_filename,
- struct dentry *parent)
-{
- int err = 0;
- unsigned int i, curr_cpu;
- unsigned long flags;
- struct dentry *dentry;
- struct rchan_percpu_buf_dispatcher disp;
-
- if (!chan || !base_filename)
- return -EINVAL;
-
- strlcpy(chan->base_filename, base_filename, NAME_MAX);
-
- mutex_lock(&relay_channels_mutex);
- /* Is chan already set up? */
- if (unlikely(chan->has_base_filename)) {
- mutex_unlock(&relay_channels_mutex);
- return -EEXIST;
- }
- chan->has_base_filename = 1;
- chan->parent = parent;
- curr_cpu = get_cpu();
- /*
- * The CPU hotplug notifier ran before us and created buffers with
- * no files associated. So it's safe to call relay_setup_buf_file()
- * on all currently online CPUs.
- */
- for_each_online_cpu(i) {
- if (unlikely(!chan->buf[i])) {
- WARN_ONCE(1, KERN_ERR "CPU has no buffer!\n");
- err = -EINVAL;
- break;
- }
-
- dentry = relay_create_buf_file(chan, chan->buf[i], i);
- if (unlikely(!dentry)) {
- err = -EINVAL;
- break;
- }
-
- if (curr_cpu == i) {
- local_irq_save(flags);
- relay_set_buf_dentry(chan->buf[i], dentry);
- local_irq_restore(flags);
- } else {
- disp.buf = chan->buf[i];
- disp.dentry = dentry;
- smp_mb();
- /* relay_channels_mutex must be held, so wait. */
- err = smp_call_function_single(i,
- __relay_set_buf_dentry,
- &disp, 1);
- }
- if (unlikely(err))
- break;
- }
- put_cpu();
- mutex_unlock(&relay_channels_mutex);
-
- return err;
-}
-
/**
* relay_switch_subbuf - switch to a new sub-buffer
* @buf: channel buffer
@@ -728,9 +562,11 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
goto toobig;
if (buf->offset != buf->chan->subbuf_size + 1) {
- buf->prev_padding = buf->chan->subbuf_size - buf->offset;
+ size_t prev_padding;
+
+ prev_padding = buf->chan->subbuf_size - buf->offset;
old_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
- buf->padding[old_subbuf] = buf->prev_padding;
+ buf->padding[old_subbuf] = prev_padding;
buf->subbufs_produced++;
if (buf->dentry)
d_inode(buf->dentry)->i_size +=
@@ -740,21 +576,22 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
buf->early_bytes += buf->chan->subbuf_size -
buf->padding[old_subbuf];
smp_mb();
- if (waitqueue_active(&buf->read_wait))
+ if (waitqueue_active(&buf->read_wait)) {
/*
* Calling wake_up_interruptible() from here
* will deadlock if we happen to be logging
* from the scheduler (trying to re-grab
* rq->lock), so defer it.
*/
- mod_timer(&buf->timer, jiffies + 1);
+ irq_work_queue(&buf->wakeup_work);
+ }
}
old = buf->data;
new_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
new = buf->start + new_subbuf * buf->chan->subbuf_size;
buf->offset = 0;
- if (!buf->chan->cb->subbuf_start(buf, new, old, buf->prev_padding)) {
+ if (!relay_subbuf_start(buf, new, old)) {
buf->offset = buf->chan->subbuf_size + 1;
return 0;
}
@@ -767,7 +604,7 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
return length;
toobig:
- buf->chan->last_toobig = length;
+ buf->stats.big_count++;
return 0;
}
EXPORT_SYMBOL_GPL(relay_switch_subbuf);
@@ -791,14 +628,13 @@ void relay_subbufs_consumed(struct rchan *chan,
{
struct rchan_buf *buf;
- if (!chan)
+ if (!chan || cpu >= NR_CPUS)
return;
- if (cpu >= NR_CPUS || !chan->buf[cpu] ||
- subbufs_consumed > chan->n_subbufs)
+ buf = *per_cpu_ptr(chan->buf, cpu);
+ if (!buf || subbufs_consumed > chan->n_subbufs)
return;
- buf = chan->buf[cpu];
if (subbufs_consumed > buf->subbufs_produced - buf->subbufs_consumed)
buf->subbufs_consumed = buf->subbufs_produced;
else
@@ -814,23 +650,19 @@ EXPORT_SYMBOL_GPL(relay_subbufs_consumed);
*/
void relay_close(struct rchan *chan)
{
+ struct rchan_buf *buf;
unsigned int i;
if (!chan)
return;
mutex_lock(&relay_channels_mutex);
- if (chan->is_global && chan->buf[0])
- relay_close_buf(chan->buf[0]);
+ if (chan->is_global && (buf = *per_cpu_ptr(chan->buf, 0)))
+ relay_close_buf(buf);
else
for_each_possible_cpu(i)
- if (chan->buf[i])
- relay_close_buf(chan->buf[i]);
-
- if (chan->last_toobig)
- printk(KERN_WARNING "relay: one or more items not logged "
- "[item size (%Zd) > sub-buffer size (%Zd)]\n",
- chan->last_toobig, chan->subbuf_size);
+ if ((buf = *per_cpu_ptr(chan->buf, i)))
+ relay_close_buf(buf);
list_del(&chan->list);
kref_put(&chan->kref, relay_destroy_channel);
@@ -846,25 +678,62 @@ EXPORT_SYMBOL_GPL(relay_close);
*/
void relay_flush(struct rchan *chan)
{
+ struct rchan_buf *buf;
unsigned int i;
if (!chan)
return;
- if (chan->is_global && chan->buf[0]) {
- relay_switch_subbuf(chan->buf[0], 0);
+ if (chan->is_global && (buf = *per_cpu_ptr(chan->buf, 0))) {
+ relay_switch_subbuf(buf, 0);
return;
}
mutex_lock(&relay_channels_mutex);
for_each_possible_cpu(i)
- if (chan->buf[i])
- relay_switch_subbuf(chan->buf[i], 0);
+ if ((buf = *per_cpu_ptr(chan->buf, i)))
+ relay_switch_subbuf(buf, 0);
mutex_unlock(&relay_channels_mutex);
}
EXPORT_SYMBOL_GPL(relay_flush);
/**
+ * relay_stats - get channel buffer statistics
+ * @chan: the channel
+ * @flags: select particular information to get
+ *
+ * Returns the count of certain field that caller specifies.
+ */
+size_t relay_stats(struct rchan *chan, int flags)
+{
+ unsigned int i, count = 0;
+ struct rchan_buf *rbuf;
+
+ if (!chan || flags > RELAY_STATS_LAST)
+ return 0;
+
+ if (chan->is_global) {
+ rbuf = *per_cpu_ptr(chan->buf, 0);
+ if (flags & RELAY_STATS_BUF_FULL)
+ count = rbuf->stats.full_count;
+ else if (flags & RELAY_STATS_WRT_BIG)
+ count = rbuf->stats.big_count;
+ } else {
+ for_each_online_cpu(i) {
+ rbuf = *per_cpu_ptr(chan->buf, i);
+ if (rbuf) {
+ if (flags & RELAY_STATS_BUF_FULL)
+ count += rbuf->stats.full_count;
+ else if (flags & RELAY_STATS_WRT_BIG)
+ count += rbuf->stats.big_count;
+ }
+ }
+ }
+
+ return count;
+}
+
+/**
* relay_file_open - open file op for relay files
* @inode: the inode
* @filp: the file
@@ -881,16 +750,16 @@ static int relay_file_open(struct inode *inode, struct file *filp)
}
/**
- * relay_file_mmap - mmap file op for relay files
- * @filp: the file
- * @vma: the vma describing what to map
+ * relay_file_mmap_prepare - mmap file op for relay files
+ * @desc: describing what to map
*
- * Calls upon relay_mmap_buf() to map the file into user space.
+ * Calls upon relay_mmap_prepare_buf() to map the file into user space.
*/
-static int relay_file_mmap(struct file *filp, struct vm_area_struct *vma)
+static int relay_file_mmap_prepare(struct vm_area_desc *desc)
{
- struct rchan_buf *buf = filp->private_data;
- return relay_mmap_buf(buf, vma);
+ struct rchan_buf *buf = desc->file->private_data;
+
+ return relay_mmap_prepare_buf(buf, desc);
}
/**
@@ -900,18 +769,18 @@ static int relay_file_mmap(struct file *filp, struct vm_area_struct *vma)
*
* Poll implemention.
*/
-static unsigned int relay_file_poll(struct file *filp, poll_table *wait)
+static __poll_t relay_file_poll(struct file *filp, poll_table *wait)
{
- unsigned int mask = 0;
+ __poll_t mask = 0;
struct rchan_buf *buf = filp->private_data;
if (buf->finalized)
- return POLLERR;
+ return EPOLLERR;
if (filp->f_mode & FMODE_READ) {
poll_wait(filp, &buf->read_wait, wait);
if (!relay_buf_empty(buf))
- mask |= POLLIN | POLLRDNORM;
+ mask |= EPOLLIN | EPOLLRDNORM;
}
return mask;
@@ -970,14 +839,14 @@ static void relay_file_read_consume(struct rchan_buf *buf,
/*
* relay_file_read_avail - boolean, are there unconsumed bytes available?
*/
-static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos)
+static int relay_file_read_avail(struct rchan_buf *buf)
{
size_t subbuf_size = buf->chan->subbuf_size;
size_t n_subbufs = buf->chan->n_subbufs;
size_t produced = buf->subbufs_produced;
- size_t consumed = buf->subbufs_consumed;
+ size_t consumed;
- relay_file_read_consume(buf, read_pos, 0);
+ relay_file_read_consume(buf, 0, 0);
consumed = buf->subbufs_consumed;
@@ -1038,23 +907,21 @@ static size_t relay_file_read_subbuf_avail(size_t read_pos,
/**
* relay_file_read_start_pos - find the first available byte to read
- * @read_pos: file read position
* @buf: relay channel buffer
*
- * If the @read_pos is in the middle of padding, return the
+ * If the read_pos is in the middle of padding, return the
* position of the first actually available byte, otherwise
* return the original value.
*/
-static size_t relay_file_read_start_pos(size_t read_pos,
- struct rchan_buf *buf)
+static size_t relay_file_read_start_pos(struct rchan_buf *buf)
{
size_t read_subbuf, padding, padding_start, padding_end;
size_t subbuf_size = buf->chan->subbuf_size;
size_t n_subbufs = buf->chan->n_subbufs;
size_t consumed = buf->subbufs_consumed % n_subbufs;
+ size_t read_pos = (consumed * subbuf_size + buf->bytes_consumed)
+ % (n_subbufs * subbuf_size);
- if (!read_pos)
- read_pos = consumed * subbuf_size + buf->bytes_consumed;
read_subbuf = read_pos / subbuf_size;
padding = buf->padding[read_subbuf];
padding_start = (read_subbuf + 1) * subbuf_size - padding;
@@ -1093,268 +960,55 @@ static size_t relay_file_read_end_pos(struct rchan_buf *buf,
return end_pos;
}
-/*
- * subbuf_read_actor - read up to one subbuf's worth of data
- */
-static int subbuf_read_actor(size_t read_start,
- struct rchan_buf *buf,
- size_t avail,
- read_descriptor_t *desc)
-{
- void *from;
- int ret = 0;
-
- from = buf->start + read_start;
- ret = avail;
- if (copy_to_user(desc->arg.buf, from, avail)) {
- desc->error = -EFAULT;
- ret = 0;
- }
- desc->arg.data += ret;
- desc->written += ret;
- desc->count -= ret;
-
- return ret;
-}
-
-typedef int (*subbuf_actor_t) (size_t read_start,
- struct rchan_buf *buf,
- size_t avail,
- read_descriptor_t *desc);
-
-/*
- * relay_file_read_subbufs - read count bytes, bridging subbuf boundaries
- */
-static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos,
- subbuf_actor_t subbuf_actor,
- read_descriptor_t *desc)
+static ssize_t relay_file_read(struct file *filp,
+ char __user *buffer,
+ size_t count,
+ loff_t *ppos)
{
struct rchan_buf *buf = filp->private_data;
size_t read_start, avail;
+ size_t written = 0;
int ret;
- if (!desc->count)
+ if (!count)
return 0;
- mutex_lock(&file_inode(filp)->i_mutex);
+ inode_lock(file_inode(filp));
do {
- if (!relay_file_read_avail(buf, *ppos))
+ void *from;
+
+ if (!relay_file_read_avail(buf))
break;
- read_start = relay_file_read_start_pos(*ppos, buf);
+ read_start = relay_file_read_start_pos(buf);
avail = relay_file_read_subbuf_avail(read_start, buf);
if (!avail)
break;
- avail = min(desc->count, avail);
- ret = subbuf_actor(read_start, buf, avail, desc);
- if (desc->error < 0)
+ avail = min(count, avail);
+ from = buf->start + read_start;
+ ret = avail;
+ if (copy_to_user(buffer, from, avail))
break;
- if (ret) {
- relay_file_read_consume(buf, read_start, ret);
- *ppos = relay_file_read_end_pos(buf, read_start, ret);
- }
- } while (desc->count && ret);
- mutex_unlock(&file_inode(filp)->i_mutex);
+ buffer += ret;
+ written += ret;
+ count -= ret;
- return desc->written;
-}
+ relay_file_read_consume(buf, read_start, ret);
+ *ppos = relay_file_read_end_pos(buf, read_start, ret);
+ } while (count);
+ inode_unlock(file_inode(filp));
-static ssize_t relay_file_read(struct file *filp,
- char __user *buffer,
- size_t count,
- loff_t *ppos)
-{
- read_descriptor_t desc;
- desc.written = 0;
- desc.count = count;
- desc.arg.buf = buffer;
- desc.error = 0;
- return relay_file_read_subbufs(filp, ppos, subbuf_read_actor, &desc);
-}
-
-static void relay_consume_bytes(struct rchan_buf *rbuf, int bytes_consumed)
-{
- rbuf->bytes_consumed += bytes_consumed;
-
- if (rbuf->bytes_consumed >= rbuf->chan->subbuf_size) {
- relay_subbufs_consumed(rbuf->chan, rbuf->cpu, 1);
- rbuf->bytes_consumed %= rbuf->chan->subbuf_size;
- }
+ return written;
}
-static void relay_pipe_buf_release(struct pipe_inode_info *pipe,
- struct pipe_buffer *buf)
-{
- struct rchan_buf *rbuf;
-
- rbuf = (struct rchan_buf *)page_private(buf->page);
- relay_consume_bytes(rbuf, buf->private);
-}
-
-static const struct pipe_buf_operations relay_pipe_buf_ops = {
- .can_merge = 0,
- .confirm = generic_pipe_buf_confirm,
- .release = relay_pipe_buf_release,
- .steal = generic_pipe_buf_steal,
- .get = generic_pipe_buf_get,
-};
-
-static void relay_page_release(struct splice_pipe_desc *spd, unsigned int i)
-{
-}
-
-/*
- * subbuf_splice_actor - splice up to one subbuf's worth of data
- */
-static ssize_t subbuf_splice_actor(struct file *in,
- loff_t *ppos,
- struct pipe_inode_info *pipe,
- size_t len,
- unsigned int flags,
- int *nonpad_ret)
-{
- unsigned int pidx, poff, total_len, subbuf_pages, nr_pages;
- struct rchan_buf *rbuf = in->private_data;
- unsigned int subbuf_size = rbuf->chan->subbuf_size;
- uint64_t pos = (uint64_t) *ppos;
- uint32_t alloc_size = (uint32_t) rbuf->chan->alloc_size;
- size_t read_start = (size_t) do_div(pos, alloc_size);
- size_t read_subbuf = read_start / subbuf_size;
- size_t padding = rbuf->padding[read_subbuf];
- size_t nonpad_end = read_subbuf * subbuf_size + subbuf_size - padding;
- struct page *pages[PIPE_DEF_BUFFERS];
- struct partial_page partial[PIPE_DEF_BUFFERS];
- struct splice_pipe_desc spd = {
- .pages = pages,
- .nr_pages = 0,
- .nr_pages_max = PIPE_DEF_BUFFERS,
- .partial = partial,
- .flags = flags,
- .ops = &relay_pipe_buf_ops,
- .spd_release = relay_page_release,
- };
- ssize_t ret;
-
- if (rbuf->subbufs_produced == rbuf->subbufs_consumed)
- return 0;
- if (splice_grow_spd(pipe, &spd))
- return -ENOMEM;
-
- /*
- * Adjust read len, if longer than what is available
- */
- if (len > (subbuf_size - read_start % subbuf_size))
- len = subbuf_size - read_start % subbuf_size;
-
- subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT;
- pidx = (read_start / PAGE_SIZE) % subbuf_pages;
- poff = read_start & ~PAGE_MASK;
- nr_pages = min_t(unsigned int, subbuf_pages, spd.nr_pages_max);
-
- for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) {
- unsigned int this_len, this_end, private;
- unsigned int cur_pos = read_start + total_len;
-
- if (!len)
- break;
-
- this_len = min_t(unsigned long, len, PAGE_SIZE - poff);
- private = this_len;
-
- spd.pages[spd.nr_pages] = rbuf->page_array[pidx];
- spd.partial[spd.nr_pages].offset = poff;
-
- this_end = cur_pos + this_len;
- if (this_end >= nonpad_end) {
- this_len = nonpad_end - cur_pos;
- private = this_len + padding;
- }
- spd.partial[spd.nr_pages].len = this_len;
- spd.partial[spd.nr_pages].private = private;
-
- len -= this_len;
- total_len += this_len;
- poff = 0;
- pidx = (pidx + 1) % subbuf_pages;
-
- if (this_end >= nonpad_end) {
- spd.nr_pages++;
- break;
- }
- }
-
- ret = 0;
- if (!spd.nr_pages)
- goto out;
-
- ret = *nonpad_ret = splice_to_pipe(pipe, &spd);
- if (ret < 0 || ret < total_len)
- goto out;
-
- if (read_start + ret == nonpad_end)
- ret += padding;
-
-out:
- splice_shrink_spd(&spd);
- return ret;
-}
-
-static ssize_t relay_file_splice_read(struct file *in,
- loff_t *ppos,
- struct pipe_inode_info *pipe,
- size_t len,
- unsigned int flags)
-{
- ssize_t spliced;
- int ret;
- int nonpad_ret = 0;
-
- ret = 0;
- spliced = 0;
-
- while (len && !spliced) {
- ret = subbuf_splice_actor(in, ppos, pipe, len, flags, &nonpad_ret);
- if (ret < 0)
- break;
- else if (!ret) {
- if (flags & SPLICE_F_NONBLOCK)
- ret = -EAGAIN;
- break;
- }
-
- *ppos += ret;
- if (ret > len)
- len = 0;
- else
- len -= ret;
- spliced += nonpad_ret;
- nonpad_ret = 0;
- }
-
- if (spliced)
- return spliced;
-
- return ret;
-}
const struct file_operations relay_file_operations = {
.open = relay_file_open,
.poll = relay_file_poll,
- .mmap = relay_file_mmap,
+ .mmap_prepare = relay_file_mmap_prepare,
.read = relay_file_read,
- .llseek = no_llseek,
.release = relay_file_release,
- .splice_read = relay_file_splice_read,
};
EXPORT_SYMBOL_GPL(relay_file_operations);
-
-static __init int relay_init(void)
-{
-
- hotcpu_notifier(relay_hotcpu_callback, 0);
- return 0;
-}
-
-early_initcall(relay_init);
diff --git a/kernel/resource.c b/kernel/resource.c
index 90552aab5f2d..e4e9bac12e6e 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/kernel/resource.c
*
@@ -17,12 +18,17 @@
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/proc_fs.h>
+#include <linux/pseudo_fs.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/device.h>
#include <linux/pfn.h>
#include <linux/mm.h>
+#include <linux/mount.h>
#include <linux/resource_ext.h>
+#include <uapi/linux/magic.h>
+#include <linux/string.h>
+#include <linux/vmalloc.h>
#include <asm/io.h>
@@ -42,43 +48,37 @@ struct resource iomem_resource = {
};
EXPORT_SYMBOL(iomem_resource);
-/* constraints to be met while allocating resources */
-struct resource_constraint {
- resource_size_t min, max, align;
- resource_size_t (*alignf)(void *, const struct resource *,
- resource_size_t, resource_size_t);
- void *alignf_data;
-};
-
static DEFINE_RWLOCK(resource_lock);
/*
- * For memory hotplug, there is no way to free resource entries allocated
- * by boot mem after the system is up. So for reusing the resource entry
- * we need to remember the resource.
+ * Return the next node of @p in pre-order tree traversal. If
+ * @skip_children is true, skip the descendant nodes of @p in
+ * traversal. If @p is a descendant of @subtree_root, only traverse
+ * the subtree under @subtree_root.
*/
-static struct resource *bootmem_resource_free;
-static DEFINE_SPINLOCK(bootmem_resource_lock);
-
-static struct resource *next_resource(struct resource *p, bool sibling_only)
+static struct resource *next_resource(struct resource *p, bool skip_children,
+ struct resource *subtree_root)
{
- /* Caller wants to traverse through siblings only */
- if (sibling_only)
- return p->sibling;
-
- if (p->child)
+ if (!skip_children && p->child)
return p->child;
- while (!p->sibling && p->parent)
+ while (!p->sibling && p->parent) {
p = p->parent;
+ if (p == subtree_root)
+ return NULL;
+ }
return p->sibling;
}
-static void *r_next(struct seq_file *m, void *v, loff_t *pos)
-{
- struct resource *p = v;
- (*pos)++;
- return (void *)next_resource(p, false);
-}
+/*
+ * Traverse the resource subtree under @_root in pre-order, excluding
+ * @_root itself.
+ *
+ * NOTE: '__p' is introduced to avoid shadowing '_p' outside of loop.
+ * And it is referenced to avoid unused variable warning.
+ */
+#define for_each_resource(_root, _p, _skip_children) \
+ for (typeof(_root) __root = (_root), __p = _p = __root->child; \
+ __p && _p; _p = next_resource(_p, _skip_children, __root))
#ifdef CONFIG_PROC_FS
@@ -87,14 +87,28 @@ enum { MAX_IORES_LEVEL = 5 };
static void *r_start(struct seq_file *m, loff_t *pos)
__acquires(resource_lock)
{
- struct resource *p = m->private;
- loff_t l = 0;
+ struct resource *root = pde_data(file_inode(m->file));
+ struct resource *p;
+ loff_t l = *pos;
+
read_lock(&resource_lock);
- for (p = p->child; p && l < *pos; p = r_next(m, p, &l))
- ;
+ for_each_resource(root, p, false) {
+ if (l-- == 0)
+ break;
+ }
+
return p;
}
+static void *r_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct resource *p = v;
+
+ (*pos)++;
+
+ return (void *)next_resource(p, false, NULL);
+}
+
static void r_stop(struct seq_file *m, void *v)
__releases(resource_lock)
{
@@ -103,18 +117,27 @@ static void r_stop(struct seq_file *m, void *v)
static int r_show(struct seq_file *m, void *v)
{
- struct resource *root = m->private;
+ struct resource *root = pde_data(file_inode(m->file));
struct resource *r = v, *p;
+ unsigned long long start, end;
int width = root->end < 0x10000 ? 4 : 8;
int depth;
for (depth = 0, p = r; depth < MAX_IORES_LEVEL; depth++, p = p->parent)
if (p->parent == root)
break;
+
+ if (file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN)) {
+ start = r->start;
+ end = r->end;
+ } else {
+ start = end = 0;
+ }
+
seq_printf(m, "%*s%0*llx-%0*llx : %s\n",
depth * 2, "",
- width, (unsigned long long) r->start,
- width, (unsigned long long) r->end,
+ width, start,
+ width, end,
r->name ? r->name : "<BAD>");
return 0;
}
@@ -126,44 +149,11 @@ static const struct seq_operations resource_op = {
.show = r_show,
};
-static int ioports_open(struct inode *inode, struct file *file)
-{
- int res = seq_open(file, &resource_op);
- if (!res) {
- struct seq_file *m = file->private_data;
- m->private = &ioport_resource;
- }
- return res;
-}
-
-static int iomem_open(struct inode *inode, struct file *file)
-{
- int res = seq_open(file, &resource_op);
- if (!res) {
- struct seq_file *m = file->private_data;
- m->private = &iomem_resource;
- }
- return res;
-}
-
-static const struct file_operations proc_ioports_operations = {
- .open = ioports_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
-static const struct file_operations proc_iomem_operations = {
- .open = iomem_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
static int __init ioresources_init(void)
{
- proc_create("ioports", 0, NULL, &proc_ioports_operations);
- proc_create("iomem", 0, NULL, &proc_iomem_operations);
+ proc_create_seq_data("ioports", 0, NULL, &resource_op,
+ &ioport_resource);
+ proc_create_seq_data("iomem", 0, NULL, &resource_op, &iomem_resource);
return 0;
}
__initcall(ioresources_init);
@@ -172,36 +162,19 @@ __initcall(ioresources_init);
static void free_resource(struct resource *res)
{
- if (!res)
- return;
-
- if (!PageSlab(virt_to_head_page(res))) {
- spin_lock(&bootmem_resource_lock);
- res->sibling = bootmem_resource_free;
- bootmem_resource_free = res;
- spin_unlock(&bootmem_resource_lock);
- } else {
+ /**
+ * If the resource was allocated using memblock early during boot
+ * we'll leak it here: we can only return full pages back to the
+ * buddy and trying to be smart and reusing them eventually in
+ * alloc_resource() overcomplicates resource handling.
+ */
+ if (res && PageSlab(virt_to_head_page(res)))
kfree(res);
- }
}
static struct resource *alloc_resource(gfp_t flags)
{
- struct resource *res = NULL;
-
- spin_lock(&bootmem_resource_lock);
- if (bootmem_resource_free) {
- res = bootmem_resource_free;
- bootmem_resource_free = res->sibling;
- }
- spin_unlock(&bootmem_resource_lock);
-
- if (res)
- memset(res, 0, sizeof(struct resource));
- else
- res = kzalloc(sizeof(struct resource), flags);
-
- return res;
+ return kzalloc(sizeof(struct resource), flags);
}
/* Return the conflict entry if you can't request it */
@@ -233,9 +206,9 @@ static struct resource * __request_resource(struct resource *root, struct resour
}
}
-static int __release_resource(struct resource *old)
+static int __release_resource(struct resource *old, bool release_child)
{
- struct resource *tmp, **p;
+ struct resource *tmp, **p, *chd;
p = &old->parent->child;
for (;;) {
@@ -243,7 +216,17 @@ static int __release_resource(struct resource *old)
if (!tmp)
break;
if (tmp == old) {
- *p = tmp->sibling;
+ if (release_child || !(tmp->child)) {
+ *p = tmp->sibling;
+ } else {
+ for (chd = tmp->child;; chd = chd->sibling) {
+ chd->parent = tmp->parent;
+ if (!(chd->sibling))
+ break;
+ }
+ *p = tmp->child;
+ chd->sibling = tmp->sibling;
+ }
old->parent = NULL;
return 0;
}
@@ -325,166 +308,251 @@ int release_resource(struct resource *old)
int retval;
write_lock(&resource_lock);
- retval = __release_resource(old);
+ retval = __release_resource(old, true);
write_unlock(&resource_lock);
return retval;
}
EXPORT_SYMBOL(release_resource);
-/*
- * Finds the lowest iomem reosurce exists with-in [res->start.res->end)
- * the caller must specify res->start, res->end, res->flags and "name".
- * If found, returns 0, res is overwritten, if not found, returns -1.
- * This walks through whole tree and not just first level children
- * until and unless first_level_children_only is true.
+static bool is_type_match(struct resource *p, unsigned long flags, unsigned long desc)
+{
+ return (p->flags & flags) == flags && (desc == IORES_DESC_NONE || desc == p->desc);
+}
+
+/**
+ * find_next_iomem_res - Finds the lowest iomem resource that covers part of
+ * [@start..@end].
+ *
+ * If a resource is found, returns 0 and @*res is overwritten with the part
+ * of the resource that's within [@start..@end]; if none is found, returns
+ * -ENODEV. Returns -EINVAL for invalid parameters.
+ *
+ * @start: start address of the resource searched for
+ * @end: end address of same resource
+ * @flags: flags which the resource must have
+ * @desc: descriptor the resource must have
+ * @res: return ptr, if resource found
+ *
+ * The caller must specify @start, @end, @flags, and @desc
+ * (which may be IORES_DESC_NONE).
*/
-static int find_next_iomem_res(struct resource *res, char *name,
- bool first_level_children_only)
+static int find_next_iomem_res(resource_size_t start, resource_size_t end,
+ unsigned long flags, unsigned long desc,
+ struct resource *res)
{
- resource_size_t start, end;
+ /* Skip children until we find a top level range that matches */
+ bool skip_children = true;
struct resource *p;
- bool sibling_only = false;
-
- BUG_ON(!res);
- start = res->start;
- end = res->end;
- BUG_ON(start >= end);
+ if (!res)
+ return -EINVAL;
- if (first_level_children_only)
- sibling_only = true;
+ if (start >= end)
+ return -EINVAL;
read_lock(&resource_lock);
- for (p = iomem_resource.child; p; p = next_resource(p, sibling_only)) {
- if (p->flags != res->flags)
- continue;
- if (name && strcmp(p->name, name))
- continue;
+ for_each_resource(&iomem_resource, p, skip_children) {
+ /* If we passed the resource we are looking for, stop */
if (p->start > end) {
p = NULL;
break;
}
- if ((p->end >= start) && (p->start < end))
+
+ /* Skip until we find a range that matches what we look for */
+ if (p->end < start)
+ continue;
+
+ /*
+ * We found a top level range that matches what we are looking
+ * for. Time to start checking children too.
+ */
+ skip_children = false;
+
+ /* Found a match, break */
+ if (is_type_match(p, flags, desc))
break;
}
+ if (p) {
+ /* copy data */
+ *res = (struct resource) {
+ .start = max(start, p->start),
+ .end = min(end, p->end),
+ .flags = p->flags,
+ .desc = p->desc,
+ .parent = p->parent,
+ };
+ }
+
read_unlock(&resource_lock);
- if (!p)
- return -1;
- /* copy data */
- if (res->start < p->start)
- res->start = p->start;
- if (res->end > p->end)
- res->end = p->end;
- return 0;
+ return p ? 0 : -ENODEV;
}
-/*
- * Walks through iomem resources and calls func() with matching resource
- * ranges. This walks through whole tree and not just first level children.
- * All the memory ranges which overlap start,end and also match flags and
- * name are valid candidates.
- *
- * @name: name of resource
- * @flags: resource flags
- * @start: start addr
- * @end: end addr
- */
-int walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end,
- void *arg, int (*func)(u64, u64, void *))
+static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end,
+ unsigned long flags, unsigned long desc,
+ void *arg,
+ int (*func)(struct resource *, void *))
{
struct resource res;
- u64 orig_end;
- int ret = -1;
+ int ret = -EINVAL;
- res.start = start;
- res.end = end;
- res.flags = flags;
- orig_end = res.end;
- while ((res.start < res.end) &&
- (!find_next_iomem_res(&res, name, false))) {
- ret = (*func)(res.start, res.end, arg);
+ while (start < end &&
+ !find_next_iomem_res(start, end, flags, desc, &res)) {
+ ret = (*func)(&res, arg);
if (ret)
break;
- res.start = res.end + 1;
- res.end = orig_end;
+
+ start = res.end + 1;
}
+
return ret;
}
+/**
+ * walk_iomem_res_desc - Walks through iomem resources and calls func()
+ * with matching resource ranges.
+ * *
+ * @desc: I/O resource descriptor. Use IORES_DESC_NONE to skip @desc check.
+ * @flags: I/O resource flags
+ * @start: start addr
+ * @end: end addr
+ * @arg: function argument for the callback @func
+ * @func: callback function that is called for each qualifying resource area
+ *
+ * All the memory ranges which overlap start,end and also match flags and
+ * desc are valid candidates.
+ *
+ * NOTE: For a new descriptor search, define a new IORES_DESC in
+ * <linux/ioport.h> and set it in 'desc' of a target resource entry.
+ */
+int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start,
+ u64 end, void *arg, int (*func)(struct resource *, void *))
+{
+ return __walk_iomem_res_desc(start, end, flags, desc, arg, func);
+}
+EXPORT_SYMBOL_GPL(walk_iomem_res_desc);
+
/*
- * This function calls callback against all memory range of "System RAM"
- * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY.
- * Now, this function is only for "System RAM". This function deals with
- * full ranges and not pfn. If resources are not pfn aligned, dealing
- * with pfn can truncate ranges.
+ * This function calls the @func callback against all memory ranges of type
+ * System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY.
+ * Now, this function is only for System RAM, it deals with full ranges and
+ * not PFNs. If resources are not PFN-aligned, dealing with PFNs can truncate
+ * ranges.
*/
int walk_system_ram_res(u64 start, u64 end, void *arg,
- int (*func)(u64, u64, void *))
+ int (*func)(struct resource *, void *))
{
- struct resource res;
- u64 orig_end;
+ unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+
+ return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, arg,
+ func);
+}
+
+/*
+ * This function, being a variant of walk_system_ram_res(), calls the @func
+ * callback against all memory ranges of type System RAM which are marked as
+ * IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY in reversed order, i.e., from
+ * higher to lower.
+ */
+int walk_system_ram_res_rev(u64 start, u64 end, void *arg,
+ int (*func)(struct resource *, void *))
+{
+ struct resource res, *rams;
+ int rams_size = 16, i;
+ unsigned long flags;
int ret = -1;
- res.start = start;
- res.end = end;
- res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
- orig_end = res.end;
- while ((res.start < res.end) &&
- (!find_next_iomem_res(&res, "System RAM", true))) {
- ret = (*func)(res.start, res.end, arg);
+ /* create a list */
+ rams = kvcalloc(rams_size, sizeof(struct resource), GFP_KERNEL);
+ if (!rams)
+ return ret;
+
+ flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+ i = 0;
+ while ((start < end) &&
+ (!find_next_iomem_res(start, end, flags, IORES_DESC_NONE, &res))) {
+ if (i >= rams_size) {
+ /* re-alloc */
+ struct resource *rams_new;
+
+ rams_new = kvrealloc(rams, (rams_size + 16) * sizeof(struct resource),
+ GFP_KERNEL);
+ if (!rams_new)
+ goto out;
+
+ rams = rams_new;
+ rams_size += 16;
+ }
+
+ rams[i++] = res;
+ start = res.end + 1;
+ }
+
+ /* go reverse */
+ for (i--; i >= 0; i--) {
+ ret = (*func)(&rams[i], arg);
if (ret)
break;
- res.start = res.end + 1;
- res.end = orig_end;
}
+
+out:
+ kvfree(rams);
return ret;
}
-#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY)
+/*
+ * This function calls the @func callback against all memory ranges, which
+ * are ranges marked as IORESOURCE_MEM and IORESOUCE_BUSY.
+ */
+int walk_mem_res(u64 start, u64 end, void *arg,
+ int (*func)(struct resource *, void *))
+{
+ unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+
+ return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, arg,
+ func);
+}
/*
- * This function calls callback against all memory range of "System RAM"
- * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY.
- * Now, this function is only for "System RAM".
+ * This function calls the @func callback against all memory ranges of type
+ * System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY.
+ * It is to be used only for System RAM.
*/
int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
- void *arg, int (*func)(unsigned long, unsigned long, void *))
+ void *arg, int (*func)(unsigned long, unsigned long, void *))
{
+ resource_size_t start, end;
+ unsigned long flags;
struct resource res;
unsigned long pfn, end_pfn;
- u64 orig_end;
- int ret = -1;
+ int ret = -EINVAL;
- res.start = (u64) start_pfn << PAGE_SHIFT;
- res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1;
- res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
- orig_end = res.end;
- while ((res.start < res.end) &&
- (find_next_iomem_res(&res, "System RAM", true) >= 0)) {
- pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT;
- end_pfn = (res.end + 1) >> PAGE_SHIFT;
+ start = (u64) start_pfn << PAGE_SHIFT;
+ end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1;
+ flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+ while (start < end &&
+ !find_next_iomem_res(start, end, flags, IORES_DESC_NONE, &res)) {
+ pfn = PFN_UP(res.start);
+ end_pfn = PFN_DOWN(res.end + 1);
if (end_pfn > pfn)
ret = (*func)(pfn, end_pfn - pfn, arg);
if (ret)
break;
- res.start = res.end + 1;
- res.end = orig_end;
+ start = res.end + 1;
}
return ret;
}
-#endif
-
static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg)
{
return 1;
}
+
/*
* This generic page_is_ram() returns true if specified address is
- * registered as "System RAM" in iomem_resource list.
+ * registered as System RAM in iomem_resource list.
*/
int __weak page_is_ram(unsigned long pfn)
{
@@ -492,52 +560,104 @@ int __weak page_is_ram(unsigned long pfn)
}
EXPORT_SYMBOL_GPL(page_is_ram);
-/*
- * Search for a resouce entry that fully contains the specified region.
- * If found, return 1 if it is RAM, 0 if not.
- * If not found, or region is not fully contained, return -1
- *
- * Used by the ioremap functions to ensure the user is not remapping RAM and is
- * a vast speed up over walking through the resource table page by page.
- */
-int region_is_ram(resource_size_t start, unsigned long size)
+static int __region_intersects(struct resource *parent, resource_size_t start,
+ size_t size, unsigned long flags,
+ unsigned long desc)
{
- struct resource *p;
- resource_size_t end = start + size - 1;
- int flags = IORESOURCE_MEM | IORESOURCE_BUSY;
- const char *name = "System RAM";
- int ret = -1;
+ int type = 0; int other = 0;
+ struct resource *p, *dp;
+ struct resource res, o;
+ bool covered;
- read_lock(&resource_lock);
- for (p = iomem_resource.child; p ; p = p->sibling) {
- if (end < p->start)
- continue;
+ res = DEFINE_RES(start, size, 0);
- if (p->start <= start && end <= p->end) {
- /* resource fully contains region */
- if ((p->flags != flags) || strcmp(p->name, name))
- ret = 0;
- else
- ret = 1;
- break;
+ for (p = parent->child; p ; p = p->sibling) {
+ if (!resource_intersection(p, &res, &o))
+ continue;
+ if (is_type_match(p, flags, desc)) {
+ type++;
+ continue;
}
- if (p->end < start)
- break; /* not found */
+ /*
+ * Continue to search in descendant resources as if the
+ * matched descendant resources cover some ranges of 'p'.
+ *
+ * |------------- "CXL Window 0" ------------|
+ * |-- "System RAM" --|
+ *
+ * will behave similar as the following fake resource
+ * tree when searching "System RAM".
+ *
+ * |-- "System RAM" --||-- "CXL Window 0a" --|
+ */
+ covered = false;
+ for_each_resource(p, dp, false) {
+ if (!resource_overlaps(dp, &res))
+ continue;
+ if (is_type_match(dp, flags, desc)) {
+ type++;
+ /*
+ * Range from 'o.start' to 'dp->start'
+ * isn't covered by matched resource.
+ */
+ if (dp->start > o.start)
+ break;
+ if (dp->end >= o.end) {
+ covered = true;
+ break;
+ }
+ /* Remove covered range */
+ o.start = max(o.start, dp->end + 1);
+ }
+ }
+ if (!covered)
+ other++;
}
- read_unlock(&resource_lock);
- return ret;
+
+ if (type == 0)
+ return REGION_DISJOINT;
+
+ if (other == 0)
+ return REGION_INTERSECTS;
+
+ return REGION_MIXED;
}
-void __weak arch_remove_reservations(struct resource *avail)
+/**
+ * region_intersects() - determine intersection of region with known resources
+ * @start: region start address
+ * @size: size of region
+ * @flags: flags of resource (in iomem_resource)
+ * @desc: descriptor of resource (in iomem_resource) or IORES_DESC_NONE
+ *
+ * Check if the specified region partially overlaps or fully eclipses a
+ * resource identified by @flags and @desc (optional with IORES_DESC_NONE).
+ * Return REGION_DISJOINT if the region does not overlap @flags/@desc,
+ * return REGION_MIXED if the region overlaps @flags/@desc and another
+ * resource, and return REGION_INTERSECTS if the region overlaps @flags/@desc
+ * and no other defined resource. Note that REGION_INTERSECTS is also
+ * returned in the case when the specified region overlaps RAM and undefined
+ * memory holes.
+ *
+ * region_intersect() is used by memory remapping functions to ensure
+ * the user is not remapping RAM and is a vast speed up over walking
+ * through the resource table page by page.
+ */
+int region_intersects(resource_size_t start, size_t size, unsigned long flags,
+ unsigned long desc)
{
+ int ret;
+
+ read_lock(&resource_lock);
+ ret = __region_intersects(&iomem_resource, start, size, flags, desc);
+ read_unlock(&resource_lock);
+
+ return ret;
}
+EXPORT_SYMBOL_GPL(region_intersects);
-static resource_size_t simple_align_resource(void *data,
- const struct resource *avail,
- resource_size_t size,
- resource_size_t align)
+void __weak arch_remove_reservations(struct resource *avail)
{
- return avail->start;
}
static void resource_clip(struct resource *res, resource_size_t min,
@@ -550,16 +670,16 @@ static void resource_clip(struct resource *res, resource_size_t min,
}
/*
- * Find empty slot in the resource tree with the given range and
+ * Find empty space in the resource tree with the given range and
* alignment constraints
*/
-static int __find_resource(struct resource *root, struct resource *old,
- struct resource *new,
- resource_size_t size,
- struct resource_constraint *constraint)
+static int __find_resource_space(struct resource *root, struct resource *old,
+ struct resource *new, resource_size_t size,
+ struct resource_constraint *constraint)
{
struct resource *this = root->child;
struct resource tmp = *new, avail, alloc;
+ resource_alignf alignf = constraint->alignf;
tmp.start = root->start;
/*
@@ -588,10 +708,15 @@ static int __find_resource(struct resource *root, struct resource *old,
avail.flags = new->flags & ~IORESOURCE_UNSET;
if (avail.start >= tmp.start) {
alloc.flags = avail.flags;
- alloc.start = constraint->alignf(constraint->alignf_data, &avail,
- size, constraint->align);
+ if (alignf) {
+ alloc.start = alignf(constraint->alignf_data,
+ &avail, size, constraint->align);
+ } else {
+ alloc.start = avail.start;
+ }
alloc.end = alloc.start + size - 1;
- if (resource_contains(&avail, &alloc)) {
+ if (alloc.start <= alloc.end &&
+ resource_contains(&avail, &alloc)) {
new->start = alloc.start;
new->end = alloc.end;
return 0;
@@ -608,15 +733,27 @@ next: if (!this || this->end == root->end)
return -EBUSY;
}
-/*
- * Find empty slot in the resource tree given range and alignment.
+/**
+ * find_resource_space - Find empty space in the resource tree
+ * @root: Root resource descriptor
+ * @new: Resource descriptor awaiting an empty resource space
+ * @size: The minimum size of the empty space
+ * @constraint: The range and alignment constraints to be met
+ *
+ * Finds an empty space under @root in the resource tree satisfying range and
+ * alignment @constraints.
+ *
+ * Return:
+ * * %0 - if successful, @new members start, end, and flags are altered.
+ * * %-EBUSY - if no empty space was found.
*/
-static int find_resource(struct resource *root, struct resource *new,
+int find_resource_space(struct resource *root, struct resource *new,
resource_size_t size,
- struct resource_constraint *constraint)
+ struct resource_constraint *constraint)
{
- return __find_resource(root, NULL, new, size, constraint);
+ return __find_resource_space(root, NULL, new, size, constraint);
}
+EXPORT_SYMBOL_GPL(find_resource_space);
/**
* reallocate_resource - allocate a slot in the resource tree given range & alignment.
@@ -626,11 +763,11 @@ static int find_resource(struct resource *root, struct resource *new,
* @root: root resource descriptor
* @old: resource descriptor desired by caller
* @newsize: new size of the resource descriptor
- * @constraint: the size and alignment constraints to be met.
+ * @constraint: the memory range and alignment constraints to be met.
*/
static int reallocate_resource(struct resource *root, struct resource *old,
- resource_size_t newsize,
- struct resource_constraint *constraint)
+ resource_size_t newsize,
+ struct resource_constraint *constraint)
{
int err=0;
struct resource new = *old;
@@ -638,7 +775,7 @@ static int reallocate_resource(struct resource *root, struct resource *old,
write_lock(&resource_lock);
- if ((err = __find_resource(root, old, &new, newsize, constraint)))
+ if ((err = __find_resource_space(root, old, &new, newsize, constraint)))
goto out;
if (resource_contains(&new, old)) {
@@ -656,7 +793,7 @@ static int reallocate_resource(struct resource *root, struct resource *old,
old->start = new.start;
old->end = new.end;
} else {
- __release_resource(old);
+ __release_resource(old, true);
*old = new;
conflict = __request_resource(root, old);
BUG_ON(conflict);
@@ -682,18 +819,12 @@ out:
int allocate_resource(struct resource *root, struct resource *new,
resource_size_t size, resource_size_t min,
resource_size_t max, resource_size_t align,
- resource_size_t (*alignf)(void *,
- const struct resource *,
- resource_size_t,
- resource_size_t),
+ resource_alignf alignf,
void *alignf_data)
{
int err;
struct resource_constraint constraint;
- if (!alignf)
- alignf = simple_align_resource;
-
constraint.min = min;
constraint.max = max;
constraint.align = align;
@@ -707,7 +838,7 @@ int allocate_resource(struct resource *root, struct resource *new,
}
write_lock(&resource_lock);
- err = find_resource(root, new, size, &constraint);
+ err = find_resource_space(root, new, size, &constraint);
if (err >= 0 && __request_resource(root, new))
err = -EBUSY;
write_unlock(&resource_lock);
@@ -802,6 +933,9 @@ static struct resource * __insert_resource(struct resource *parent, struct resou
* entirely fit within the range of the new resource, then the new
* resource is inserted and the conflicting resources become children of
* the new resource.
+ *
+ * This function is intended for producers of resources, such as FW modules
+ * and bus drivers.
*/
struct resource *insert_resource_conflict(struct resource *parent, struct resource *new)
{
@@ -819,6 +953,9 @@ struct resource *insert_resource_conflict(struct resource *parent, struct resour
* @new: new resource to insert
*
* Returns 0 on success, -EBUSY if the resource can't be inserted.
+ *
+ * This function is intended for producers of resources, such as FW modules
+ * and bus drivers.
*/
int insert_resource(struct resource *parent, struct resource *new)
{
@@ -827,6 +964,7 @@ int insert_resource(struct resource *parent, struct resource *new)
conflict = insert_resource_conflict(parent, new);
return conflict ? -EBUSY : 0;
}
+EXPORT_SYMBOL_GPL(insert_resource);
/**
* insert_resource_expand_to_fit - Insert a resource into the resource tree
@@ -857,10 +995,43 @@ void insert_resource_expand_to_fit(struct resource *root, struct resource *new)
if (conflict->end > new->end)
new->end = conflict->end;
- printk("Expanded resource %s due to conflict with %s\n", new->name, conflict->name);
+ pr_info("Expanded resource %s due to conflict with %s\n", new->name, conflict->name);
}
write_unlock(&resource_lock);
}
+/*
+ * Not for general consumption, only early boot memory map parsing, PCI
+ * resource discovery, and late discovery of CXL resources are expected
+ * to use this interface. The former are built-in and only the latter,
+ * CXL, is a module.
+ */
+EXPORT_SYMBOL_NS_GPL(insert_resource_expand_to_fit, "CXL");
+
+/**
+ * remove_resource - Remove a resource in the resource tree
+ * @old: resource to remove
+ *
+ * Returns 0 on success, -EINVAL if the resource is not valid.
+ *
+ * This function removes a resource previously inserted by insert_resource()
+ * or insert_resource_conflict(), and moves the children (if any) up to
+ * where they were before. insert_resource() and insert_resource_conflict()
+ * insert a new resource, and move any conflicting resources down to the
+ * children of the new resource.
+ *
+ * insert_resource(), insert_resource_conflict() and remove_resource() are
+ * intended for producers of resources, such as FW modules and bus drivers.
+ */
+int remove_resource(struct resource *old)
+{
+ int retval;
+
+ write_lock(&resource_lock);
+ retval = __release_resource(old, false);
+ write_unlock(&resource_lock);
+ return retval;
+}
+EXPORT_SYMBOL_GPL(remove_resource);
static int __adjust_resource(struct resource *res, resource_size_t start,
resource_size_t size)
@@ -910,7 +1081,7 @@ skip:
* Existing children of the resource are assumed to be immutable.
*/
int adjust_resource(struct resource *res, resource_size_t start,
- resource_size_t size)
+ resource_size_t size)
{
int result;
@@ -921,14 +1092,15 @@ int adjust_resource(struct resource *res, resource_size_t start,
}
EXPORT_SYMBOL(adjust_resource);
-static void __init __reserve_region_with_split(struct resource *root,
- resource_size_t start, resource_size_t end,
- const char *name)
+static void __init
+__reserve_region_with_split(struct resource *root, resource_size_t start,
+ resource_size_t end, const char *name)
{
struct resource *parent = root;
struct resource *conflict;
struct resource *res = alloc_resource(GFP_ATOMIC);
struct resource *next_res = NULL;
+ int type = resource_type(root);
if (!res)
return;
@@ -936,7 +1108,8 @@ static void __init __reserve_region_with_split(struct resource *root,
res->name = name;
res->start = start;
res->end = end;
- res->flags = IORESOURCE_BUSY;
+ res->flags = type | IORESOURCE_BUSY;
+ res->desc = IORES_DESC_NONE;
while (1) {
@@ -970,7 +1143,8 @@ static void __init __reserve_region_with_split(struct resource *root,
next_res->name = name;
next_res->start = conflict->end + 1;
next_res->end = end;
- next_res->flags = IORESOURCE_BUSY;
+ next_res->flags = type | IORESOURCE_BUSY;
+ next_res->desc = IORES_DESC_NONE;
}
} else {
res->start = conflict->end + 1;
@@ -979,9 +1153,9 @@ static void __init __reserve_region_with_split(struct resource *root,
}
-void __init reserve_region_with_split(struct resource *root,
- resource_size_t start, resource_size_t end,
- const char *name)
+void __init
+reserve_region_with_split(struct resource *root, resource_size_t start,
+ resource_size_t end, const char *name)
{
int abort = 0;
@@ -1039,42 +1213,90 @@ resource_size_t resource_alignment(struct resource *res)
static DECLARE_WAIT_QUEUE_HEAD(muxed_resource_wait);
-/**
- * __request_region - create a new busy resource region
- * @parent: parent resource descriptor
- * @start: resource start address
- * @n: resource region size
- * @name: reserving caller's ID string
- * @flags: IO resource flags
- */
-struct resource * __request_region(struct resource *parent,
+static struct inode *iomem_inode;
+
+#ifdef CONFIG_IO_STRICT_DEVMEM
+static void revoke_iomem(struct resource *res)
+{
+ /* pairs with smp_store_release() in iomem_init_inode() */
+ struct inode *inode = smp_load_acquire(&iomem_inode);
+
+ /*
+ * Check that the initialization has completed. Losing the race
+ * is ok because it means drivers are claiming resources before
+ * the fs_initcall level of init and prevent iomem_get_mapping users
+ * from establishing mappings.
+ */
+ if (!inode)
+ return;
+
+ /*
+ * The expectation is that the driver has successfully marked
+ * the resource busy by this point, so devmem_is_allowed()
+ * should start returning false, however for performance this
+ * does not iterate the entire resource range.
+ */
+ if (devmem_is_allowed(PHYS_PFN(res->start)) &&
+ devmem_is_allowed(PHYS_PFN(res->end))) {
+ /*
+ * *cringe* iomem=relaxed says "go ahead, what's the
+ * worst that can happen?"
+ */
+ return;
+ }
+
+ unmap_mapping_range(inode->i_mapping, res->start, resource_size(res), 1);
+}
+#else
+static void revoke_iomem(struct resource *res) {}
+#endif
+
+struct address_space *iomem_get_mapping(void)
+{
+ /*
+ * This function is only called from file open paths, hence guaranteed
+ * that fs_initcalls have completed and no need to check for NULL. But
+ * since revoke_iomem can be called before the initcall we still need
+ * the barrier to appease checkers.
+ */
+ return smp_load_acquire(&iomem_inode)->i_mapping;
+}
+
+static int __request_region_locked(struct resource *res, struct resource *parent,
resource_size_t start, resource_size_t n,
const char *name, int flags)
{
DECLARE_WAITQUEUE(wait, current);
- struct resource *res = alloc_resource(GFP_KERNEL);
-
- if (!res)
- return NULL;
res->name = name;
res->start = start;
res->end = start + n - 1;
- res->flags = resource_type(parent);
- res->flags |= IORESOURCE_BUSY | flags;
-
- write_lock(&resource_lock);
for (;;) {
struct resource *conflict;
+ res->flags = resource_type(parent) | resource_ext_type(parent);
+ res->flags |= IORESOURCE_BUSY | flags;
+ res->desc = parent->desc;
+
conflict = __request_resource(parent, res);
if (!conflict)
break;
+ /*
+ * mm/hmm.c reserves physical addresses which then
+ * become unavailable to other users. Conflicts are
+ * not expected. Warn to aid debugging if encountered.
+ */
+ if (parent == &iomem_resource &&
+ conflict->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) {
+ pr_warn("Unaddressable device %s %pR conflicts with %pR\n",
+ conflict->name, conflict, res);
+ }
if (conflict != parent) {
- parent = conflict;
- if (!(conflict->flags & IORESOURCE_BUSY))
+ if (!(conflict->flags & IORESOURCE_BUSY)) {
+ parent = conflict;
continue;
+ }
}
if (conflict->flags & flags & IORESOURCE_MUXED) {
add_wait_queue(&muxed_resource_wait, &wait);
@@ -1086,11 +1308,42 @@ struct resource * __request_region(struct resource *parent,
continue;
}
/* Uhhuh, that didn't work out.. */
- free_resource(res);
- res = NULL;
- break;
+ return -EBUSY;
}
+
+ return 0;
+}
+
+/**
+ * __request_region - create a new busy resource region
+ * @parent: parent resource descriptor
+ * @start: resource start address
+ * @n: resource region size
+ * @name: reserving caller's ID string
+ * @flags: IO resource flags
+ */
+struct resource *__request_region(struct resource *parent,
+ resource_size_t start, resource_size_t n,
+ const char *name, int flags)
+{
+ struct resource *res = alloc_resource(GFP_KERNEL);
+ int ret;
+
+ if (!res)
+ return NULL;
+
+ write_lock(&resource_lock);
+ ret = __request_region_locked(res, parent, start, n, name, flags);
write_unlock(&resource_lock);
+
+ if (ret) {
+ free_resource(res);
+ return NULL;
+ }
+
+ if (parent == &iomem_resource)
+ revoke_iomem(res);
+
return res;
}
EXPORT_SYMBOL(__request_region);
@@ -1104,7 +1357,7 @@ EXPORT_SYMBOL(__request_region);
* The described resource region must match a currently busy region.
*/
void __release_region(struct resource *parent, resource_size_t start,
- resource_size_t n)
+ resource_size_t n)
{
struct resource **p;
resource_size_t end;
@@ -1138,16 +1391,54 @@ void __release_region(struct resource *parent, resource_size_t start,
write_unlock(&resource_lock);
- printk(KERN_WARNING "Trying to free nonexistent resource "
- "<%016llx-%016llx>\n", (unsigned long long)start,
- (unsigned long long)end);
+ pr_warn("Trying to free nonexistent resource <%pa-%pa>\n", &start, &end);
}
EXPORT_SYMBOL(__release_region);
#ifdef CONFIG_MEMORY_HOTREMOVE
+static void append_child_to_parent(struct resource *new_parent, struct resource *new_child)
+{
+ struct resource *child;
+
+ child = new_parent->child;
+ if (child) {
+ while (child->sibling)
+ child = child->sibling;
+ child->sibling = new_child;
+ } else {
+ new_parent->child = new_child;
+ }
+ new_child->parent = new_parent;
+ new_child->sibling = NULL;
+}
+
+/*
+ * Reparent all child resources that no longer belong to "low" after a split to
+ * "high". Note that "high" does not have any children, because "low" is the
+ * original resource and "high" is a new resource. Treat "low" as the original
+ * resource being split and defer its range adjustment to __adjust_resource().
+ */
+static void reparent_children_after_split(struct resource *low,
+ struct resource *high,
+ resource_size_t split_addr)
+{
+ struct resource *child, *next, **p;
+
+ p = &low->child;
+ while ((child = *p)) {
+ next = child->sibling;
+ if (child->start > split_addr) {
+ /* unlink child */
+ *p = next;
+ append_child_to_parent(high, child);
+ } else {
+ p = &child->sibling;
+ }
+ }
+}
+
/**
* release_mem_region_adjustable - release a previously reserved memory region
- * @parent: parent resource descriptor
* @start: resource start address
* @size: resource region size
*
@@ -1155,31 +1446,36 @@ EXPORT_SYMBOL(__release_region);
* is released from a currently busy memory resource. The requested region
* must either match exactly or fit into a single busy resource entry. In
* the latter case, the remaining resource is adjusted accordingly.
- * Existing children of the busy memory resource must be immutable in the
- * request.
*
* Note:
* - Additional release conditions, such as overlapping region, can be
* supported after they are confirmed as valid cases.
- * - When a busy memory resource gets split into two entries, the code
- * assumes that all children remain in the lower address entry for
- * simplicity. Enhance this logic when necessary.
+ * - When a busy memory resource gets split into two entries, its children are
+ * reassigned to the correct parent based on their range. If a child memory
+ * resource overlaps with more than one parent, enhance the logic as needed.
*/
-int release_mem_region_adjustable(struct resource *parent,
- resource_size_t start, resource_size_t size)
+void release_mem_region_adjustable(resource_size_t start, resource_size_t size)
{
+ struct resource *parent = &iomem_resource;
+ struct resource *new_res = NULL;
+ bool alloc_nofail = false;
struct resource **p;
struct resource *res;
- struct resource *new_res;
resource_size_t end;
- int ret = -EINVAL;
end = start + size - 1;
- if ((start < parent->start) || (end > parent->end))
- return ret;
+ if (WARN_ON_ONCE((start < parent->start) || (end > parent->end)))
+ return;
- /* The alloc_resource() result gets checked later */
- new_res = alloc_resource(GFP_KERNEL);
+ /*
+ * We free up quite a lot of memory on memory hotunplug (esp., memap),
+ * just before releasing the region. This is highly unlikely to
+ * fail - let's play save and make it never fail as the caller cannot
+ * perform any error handling (e.g., trying to re-add memory will fail
+ * similarly).
+ */
+retry:
+ new_res = alloc_resource(GFP_KERNEL | (alloc_nofail ? __GFP_NOFAIL : 0));
p = &parent->child;
write_lock(&resource_lock);
@@ -1207,32 +1503,36 @@ int release_mem_region_adjustable(struct resource *parent,
/* free the whole entry */
*p = res->sibling;
free_resource(res);
- ret = 0;
} else if (res->start == start && res->end != end) {
/* adjust the start */
- ret = __adjust_resource(res, end + 1,
- res->end - end);
+ WARN_ON_ONCE(__adjust_resource(res, end + 1,
+ res->end - end));
} else if (res->start != start && res->end == end) {
/* adjust the end */
- ret = __adjust_resource(res, res->start,
- start - res->start);
+ WARN_ON_ONCE(__adjust_resource(res, res->start,
+ start - res->start));
} else {
- /* split into two entries */
+ /* split into two entries - we need a new resource */
if (!new_res) {
- ret = -ENOMEM;
- break;
+ new_res = alloc_resource(GFP_ATOMIC);
+ if (!new_res) {
+ alloc_nofail = true;
+ write_unlock(&resource_lock);
+ goto retry;
+ }
}
new_res->name = res->name;
new_res->start = end + 1;
new_res->end = res->end;
new_res->flags = res->flags;
+ new_res->desc = res->desc;
new_res->parent = res->parent;
new_res->sibling = res->sibling;
new_res->child = NULL;
+ reparent_children_after_split(res, new_res, end);
- ret = __adjust_resource(res, res->start,
- start - res->start);
- if (ret)
+ if (WARN_ON_ONCE(__adjust_resource(res, res->start,
+ start - res->start)))
break;
res->sibling = new_res;
new_res = NULL;
@@ -1243,10 +1543,69 @@ int release_mem_region_adjustable(struct resource *parent,
write_unlock(&resource_lock);
free_resource(new_res);
- return ret;
}
#endif /* CONFIG_MEMORY_HOTREMOVE */
+#ifdef CONFIG_MEMORY_HOTPLUG
+static bool system_ram_resources_mergeable(struct resource *r1,
+ struct resource *r2)
+{
+ /* We assume either r1 or r2 is IORESOURCE_SYSRAM_MERGEABLE. */
+ return r1->flags == r2->flags && r1->end + 1 == r2->start &&
+ r1->name == r2->name && r1->desc == r2->desc &&
+ !r1->child && !r2->child;
+}
+
+/**
+ * merge_system_ram_resource - mark the System RAM resource mergeable and try to
+ * merge it with adjacent, mergeable resources
+ * @res: resource descriptor
+ *
+ * This interface is intended for memory hotplug, whereby lots of contiguous
+ * system ram resources are added (e.g., via add_memory*()) by a driver, and
+ * the actual resource boundaries are not of interest (e.g., it might be
+ * relevant for DIMMs). Only resources that are marked mergeable, that have the
+ * same parent, and that don't have any children are considered. All mergeable
+ * resources must be immutable during the request.
+ *
+ * Note:
+ * - The caller has to make sure that no pointers to resources that are
+ * marked mergeable are used anymore after this call - the resource might
+ * be freed and the pointer might be stale!
+ * - release_mem_region_adjustable() will split on demand on memory hotunplug
+ */
+void merge_system_ram_resource(struct resource *res)
+{
+ const unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+ struct resource *cur;
+
+ if (WARN_ON_ONCE((res->flags & flags) != flags))
+ return;
+
+ write_lock(&resource_lock);
+ res->flags |= IORESOURCE_SYSRAM_MERGEABLE;
+
+ /* Try to merge with next item in the list. */
+ cur = res->sibling;
+ if (cur && system_ram_resources_mergeable(res, cur)) {
+ res->end = cur->end;
+ res->sibling = cur->sibling;
+ free_resource(cur);
+ }
+
+ /* Try to merge with previous item in the list. */
+ cur = res->parent->child;
+ while (cur && cur->sibling != res)
+ cur = cur->sibling;
+ if (cur && system_ram_resources_mergeable(cur, res)) {
+ cur->end = res->end;
+ cur->sibling = res->sibling;
+ free_resource(res);
+ }
+ write_unlock(&resource_lock);
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
/*
* Managed region resource
*/
@@ -1341,9 +1700,9 @@ static int devm_region_match(struct device *dev, void *res, void *match_data)
this->start == match->start && this->n == match->n;
}
-struct resource * __devm_request_region(struct device *dev,
- struct resource *parent, resource_size_t start,
- resource_size_t n, const char *name)
+struct resource *
+__devm_request_region(struct device *dev, struct resource *parent,
+ resource_size_t start, resource_size_t n, const char *name)
{
struct region_devres *dr = NULL;
struct resource *res;
@@ -1372,14 +1731,13 @@ void __devm_release_region(struct device *dev, struct resource *parent,
{
struct region_devres match_data = { parent, start, n };
- __release_region(parent, start, n);
- WARN_ON(devres_destroy(dev, devm_region_release, devm_region_match,
+ WARN_ON(devres_release(dev, devm_region_release, devm_region_match,
&match_data));
}
EXPORT_SYMBOL(__devm_release_region);
/*
- * Called from init/main.c to reserve IO ports.
+ * Reserve I/O ports or memory based on "reserve=" kernel parameter.
*/
#define MAXRESERVE 4
static int __init reserve_setup(char *str)
@@ -1390,25 +1748,33 @@ static int __init reserve_setup(char *str)
for (;;) {
unsigned int io_start, io_num;
int x = reserved;
+ struct resource *parent;
- if (get_option (&str, &io_start) != 2)
+ if (get_option(&str, &io_start) != 2)
break;
- if (get_option (&str, &io_num) == 0)
+ if (get_option(&str, &io_num) == 0)
break;
if (x < MAXRESERVE) {
struct resource *res = reserve + x;
- res->name = "reserved";
- res->start = io_start;
- res->end = io_start + io_num - 1;
- res->flags = IORESOURCE_BUSY;
- res->child = NULL;
- if (request_resource(res->start >= 0x10000 ? &iomem_resource : &ioport_resource, res) == 0)
+
+ /*
+ * If the region starts below 0x10000, we assume it's
+ * I/O port space; otherwise assume it's memory.
+ */
+ if (io_start < 0x10000) {
+ *res = DEFINE_RES_IO_NAMED(io_start, io_num, "reserved");
+ parent = &ioport_resource;
+ } else {
+ *res = DEFINE_RES_MEM_NAMED(io_start, io_num, "reserved");
+ parent = &iomem_resource;
+ }
+ res->flags |= IORESOURCE_BUSY;
+ if (request_resource(parent, res) == 0)
reserved = x+1;
}
}
return 1;
}
-
__setup("reserve=", reserve_setup);
/*
@@ -1417,22 +1783,22 @@ __setup("reserve=", reserve_setup);
*/
int iomem_map_sanity_check(resource_size_t addr, unsigned long size)
{
- struct resource *p = &iomem_resource;
+ resource_size_t end = addr + size - 1;
+ struct resource *p;
int err = 0;
- loff_t l;
read_lock(&resource_lock);
- for (p = p->child; p ; p = r_next(NULL, p, &l)) {
+ for_each_resource(&iomem_resource, p, false) {
/*
* We can probably skip the resources without
* IORESOURCE_IO attribute?
*/
- if (p->start >= addr + size)
+ if (p->start > end)
continue;
if (p->end < addr)
continue;
if (PFN_DOWN(p->start) <= PFN_DOWN(addr) &&
- PFN_DOWN(p->end) >= PFN_DOWN(addr + size - 1))
+ PFN_DOWN(p->end) >= PFN_DOWN(end))
continue;
/*
* if a resource is "BUSY", it's not a hardware resource
@@ -1443,10 +1809,8 @@ int iomem_map_sanity_check(resource_size_t addr, unsigned long size)
if (p->flags & IORESOURCE_BUSY)
continue;
- printk(KERN_WARNING "resource sanity check: requesting [mem %#010llx-%#010llx], which spans more than %s %pR\n",
- (unsigned long long)addr,
- (unsigned long long)(addr + size - 1),
- p->name, p);
+ pr_warn("resource sanity check: requesting [mem %pa-%pa], which spans more than %s %pR\n",
+ &addr, &end, p->name, p);
err = -1;
break;
}
@@ -1462,34 +1826,50 @@ static int strict_iomem_checks;
#endif
/*
- * check if an address is reserved in the iomem resource tree
- * returns 1 if reserved, 0 if not reserved.
+ * Check if an address is exclusive to the kernel and must not be mapped to
+ * user space, for example, via /dev/mem.
+ *
+ * Returns true if exclusive to the kernel, otherwise returns false.
*/
-int iomem_is_exclusive(u64 addr)
+bool resource_is_exclusive(struct resource *root, u64 addr, resource_size_t size)
{
- struct resource *p = &iomem_resource;
- int err = 0;
- loff_t l;
- int size = PAGE_SIZE;
-
- if (!strict_iomem_checks)
- return 0;
-
- addr = addr & PAGE_MASK;
+ const unsigned int exclusive_system_ram = IORESOURCE_SYSTEM_RAM |
+ IORESOURCE_EXCLUSIVE;
+ bool skip_children = false, err = false;
+ struct resource *p;
read_lock(&resource_lock);
- for (p = p->child; p ; p = r_next(NULL, p, &l)) {
+ for_each_resource(root, p, skip_children) {
+ if (p->start >= addr + size)
+ break;
+ if (p->end < addr) {
+ skip_children = true;
+ continue;
+ }
+ skip_children = false;
+
/*
- * We can probably skip the resources without
- * IORESOURCE_IO attribute?
+ * IORESOURCE_SYSTEM_RAM resources are exclusive if
+ * IORESOURCE_EXCLUSIVE is set, even if they
+ * are not busy and even if "iomem=relaxed" is set. The
+ * responsible driver dynamically adds/removes system RAM within
+ * such an area and uncontrolled access is dangerous.
*/
- if (p->start >= addr + size)
+ if ((p->flags & exclusive_system_ram) == exclusive_system_ram) {
+ err = true;
break;
- if (p->end < addr)
+ }
+
+ /*
+ * A resource is exclusive if IORESOURCE_EXCLUSIVE is set
+ * or CONFIG_IO_STRICT_DEVMEM is enabled and the
+ * resource is busy.
+ */
+ if (!strict_iomem_checks || !(p->flags & IORESOURCE_BUSY))
continue;
- if (p->flags & IORESOURCE_BUSY &&
- p->flags & IORESOURCE_EXCLUSIVE) {
- err = 1;
+ if (IS_ENABLED(CONFIG_IO_STRICT_DEVMEM)
+ || p->flags & IORESOURCE_EXCLUSIVE) {
+ err = true;
break;
}
}
@@ -1498,6 +1878,12 @@ int iomem_is_exclusive(u64 addr)
return err;
}
+bool iomem_is_exclusive(u64 addr)
+{
+ return resource_is_exclusive(&iomem_resource, addr & PAGE_MASK,
+ PAGE_SIZE);
+}
+
struct resource_entry *resource_list_create_entry(struct resource *res,
size_t extra_size)
{
@@ -1522,6 +1908,198 @@ void resource_list_free(struct list_head *head)
}
EXPORT_SYMBOL(resource_list_free);
+#ifdef CONFIG_GET_FREE_REGION
+#define GFR_DESCENDING (1UL << 0)
+#define GFR_REQUEST_REGION (1UL << 1)
+#ifdef PA_SECTION_SHIFT
+#define GFR_DEFAULT_ALIGN (1UL << PA_SECTION_SHIFT)
+#else
+#define GFR_DEFAULT_ALIGN PAGE_SIZE
+#endif
+
+static resource_size_t gfr_start(struct resource *base, resource_size_t size,
+ resource_size_t align, unsigned long flags)
+{
+ if (flags & GFR_DESCENDING) {
+ resource_size_t end;
+
+ end = min_t(resource_size_t, base->end, DIRECT_MAP_PHYSMEM_END);
+ return end - size + 1;
+ }
+
+ return ALIGN(max(base->start, align), align);
+}
+
+static bool gfr_continue(struct resource *base, resource_size_t addr,
+ resource_size_t size, unsigned long flags)
+{
+ if (flags & GFR_DESCENDING)
+ return addr > size && addr >= base->start;
+ /*
+ * In the ascend case be careful that the last increment by
+ * @size did not wrap 0.
+ */
+ return addr > addr - size &&
+ addr <= min_t(resource_size_t, base->end, DIRECT_MAP_PHYSMEM_END);
+}
+
+static resource_size_t gfr_next(resource_size_t addr, resource_size_t size,
+ unsigned long flags)
+{
+ if (flags & GFR_DESCENDING)
+ return addr - size;
+ return addr + size;
+}
+
+static void remove_free_mem_region(void *_res)
+{
+ struct resource *res = _res;
+
+ if (res->parent)
+ remove_resource(res);
+ free_resource(res);
+}
+
+static struct resource *
+get_free_mem_region(struct device *dev, struct resource *base,
+ resource_size_t size, const unsigned long align,
+ const char *name, const unsigned long desc,
+ const unsigned long flags)
+{
+ resource_size_t addr;
+ struct resource *res;
+ struct region_devres *dr = NULL;
+
+ size = ALIGN(size, align);
+
+ res = alloc_resource(GFP_KERNEL);
+ if (!res)
+ return ERR_PTR(-ENOMEM);
+
+ if (dev && (flags & GFR_REQUEST_REGION)) {
+ dr = devres_alloc(devm_region_release,
+ sizeof(struct region_devres), GFP_KERNEL);
+ if (!dr) {
+ free_resource(res);
+ return ERR_PTR(-ENOMEM);
+ }
+ } else if (dev) {
+ if (devm_add_action_or_reset(dev, remove_free_mem_region, res))
+ return ERR_PTR(-ENOMEM);
+ }
+
+ write_lock(&resource_lock);
+ for (addr = gfr_start(base, size, align, flags);
+ gfr_continue(base, addr, align, flags);
+ addr = gfr_next(addr, align, flags)) {
+ if (__region_intersects(base, addr, size, 0, IORES_DESC_NONE) !=
+ REGION_DISJOINT)
+ continue;
+
+ if (flags & GFR_REQUEST_REGION) {
+ if (__request_region_locked(res, &iomem_resource, addr,
+ size, name, 0))
+ break;
+
+ if (dev) {
+ dr->parent = &iomem_resource;
+ dr->start = addr;
+ dr->n = size;
+ devres_add(dev, dr);
+ }
+
+ res->desc = desc;
+ write_unlock(&resource_lock);
+
+
+ /*
+ * A driver is claiming this region so revoke any
+ * mappings.
+ */
+ revoke_iomem(res);
+ } else {
+ *res = DEFINE_RES_NAMED_DESC(addr, size, name, IORESOURCE_MEM, desc);
+
+ /*
+ * Only succeed if the resource hosts an exclusive
+ * range after the insert
+ */
+ if (__insert_resource(base, res) || res->child)
+ break;
+
+ write_unlock(&resource_lock);
+ }
+
+ return res;
+ }
+ write_unlock(&resource_lock);
+
+ if (flags & GFR_REQUEST_REGION) {
+ free_resource(res);
+ devres_free(dr);
+ } else if (dev)
+ devm_release_action(dev, remove_free_mem_region, res);
+
+ return ERR_PTR(-ERANGE);
+}
+
+/**
+ * devm_request_free_mem_region - find free region for device private memory
+ *
+ * @dev: device struct to bind the resource to
+ * @size: size in bytes of the device memory to add
+ * @base: resource tree to look in
+ *
+ * This function tries to find an empty range of physical address big enough to
+ * contain the new resource, so that it can later be hotplugged as ZONE_DEVICE
+ * memory, which in turn allocates struct pages.
+ */
+struct resource *devm_request_free_mem_region(struct device *dev,
+ struct resource *base, unsigned long size)
+{
+ unsigned long flags = GFR_DESCENDING | GFR_REQUEST_REGION;
+
+ return get_free_mem_region(dev, base, size, GFR_DEFAULT_ALIGN,
+ dev_name(dev),
+ IORES_DESC_DEVICE_PRIVATE_MEMORY, flags);
+}
+EXPORT_SYMBOL_GPL(devm_request_free_mem_region);
+
+struct resource *request_free_mem_region(struct resource *base,
+ unsigned long size, const char *name)
+{
+ unsigned long flags = GFR_DESCENDING | GFR_REQUEST_REGION;
+
+ return get_free_mem_region(NULL, base, size, GFR_DEFAULT_ALIGN, name,
+ IORES_DESC_DEVICE_PRIVATE_MEMORY, flags);
+}
+EXPORT_SYMBOL_GPL(request_free_mem_region);
+
+/**
+ * alloc_free_mem_region - find a free region relative to @base
+ * @base: resource that will parent the new resource
+ * @size: size in bytes of memory to allocate from @base
+ * @align: alignment requirements for the allocation
+ * @name: resource name
+ *
+ * Buses like CXL, that can dynamically instantiate new memory regions,
+ * need a method to allocate physical address space for those regions.
+ * Allocate and insert a new resource to cover a free, unclaimed by a
+ * descendant of @base, range in the span of @base.
+ */
+struct resource *alloc_free_mem_region(struct resource *base,
+ unsigned long size, unsigned long align,
+ const char *name)
+{
+ /* Default of ascending direction and insert resource */
+ unsigned long flags = 0;
+
+ return get_free_mem_region(NULL, base, size, align, name,
+ IORES_DESC_NONE, flags);
+}
+EXPORT_SYMBOL_GPL(alloc_free_mem_region);
+#endif /* CONFIG_GET_FREE_REGION */
+
static int __init strict_iomem(char *str)
{
if (strstr(str, "relaxed"))
@@ -1531,4 +2109,48 @@ static int __init strict_iomem(char *str)
return 1;
}
+static int iomem_fs_init_fs_context(struct fs_context *fc)
+{
+ return init_pseudo(fc, DEVMEM_MAGIC) ? 0 : -ENOMEM;
+}
+
+static struct file_system_type iomem_fs_type = {
+ .name = "iomem",
+ .owner = THIS_MODULE,
+ .init_fs_context = iomem_fs_init_fs_context,
+ .kill_sb = kill_anon_super,
+};
+
+static int __init iomem_init_inode(void)
+{
+ static struct vfsmount *iomem_vfs_mount;
+ static int iomem_fs_cnt;
+ struct inode *inode;
+ int rc;
+
+ rc = simple_pin_fs(&iomem_fs_type, &iomem_vfs_mount, &iomem_fs_cnt);
+ if (rc < 0) {
+ pr_err("Cannot mount iomem pseudo filesystem: %d\n", rc);
+ return rc;
+ }
+
+ inode = alloc_anon_inode(iomem_vfs_mount->mnt_sb);
+ if (IS_ERR(inode)) {
+ rc = PTR_ERR(inode);
+ pr_err("Cannot allocate inode for iomem: %d\n", rc);
+ simple_release_fs(&iomem_vfs_mount, &iomem_fs_cnt);
+ return rc;
+ }
+
+ /*
+ * Publish iomem revocation inode initialized.
+ * Pairs with smp_load_acquire() in revoke_iomem().
+ */
+ smp_store_release(&iomem_inode, inode);
+
+ return 0;
+}
+
+fs_initcall(iomem_init_inode);
+
__setup("iomem=", strict_iomem);
diff --git a/kernel/resource_kunit.c b/kernel/resource_kunit.c
new file mode 100644
index 000000000000..b8ef75b99eb2
--- /dev/null
+++ b/kernel/resource_kunit.c
@@ -0,0 +1,306 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Test cases for API provided by resource.c and ioport.h
+ */
+
+#include <kunit/test.h>
+#include <linux/ioport.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/sizes.h>
+#include <linux/mm.h>
+
+#define R0_START 0x0000
+#define R0_END 0xffff
+#define R1_START 0x1234
+#define R1_END 0x2345
+#define R2_START 0x4567
+#define R2_END 0x5678
+#define R3_START 0x6789
+#define R3_END 0x789a
+#define R4_START 0x2000
+#define R4_END 0x7000
+
+static struct resource r0 = { .start = R0_START, .end = R0_END };
+static struct resource r1 = { .start = R1_START, .end = R1_END };
+static struct resource r2 = { .start = R2_START, .end = R2_END };
+static struct resource r3 = { .start = R3_START, .end = R3_END };
+static struct resource r4 = { .start = R4_START, .end = R4_END };
+
+struct result {
+ struct resource *r1;
+ struct resource *r2;
+ struct resource r;
+ bool ret;
+};
+
+static struct result results_for_union[] = {
+ {
+ .r1 = &r1, .r2 = &r0, .r.start = R0_START, .r.end = R0_END, .ret = true,
+ }, {
+ .r1 = &r2, .r2 = &r0, .r.start = R0_START, .r.end = R0_END, .ret = true,
+ }, {
+ .r1 = &r3, .r2 = &r0, .r.start = R0_START, .r.end = R0_END, .ret = true,
+ }, {
+ .r1 = &r4, .r2 = &r0, .r.start = R0_START, .r.end = R0_END, .ret = true,
+ }, {
+ .r1 = &r2, .r2 = &r1, .ret = false,
+ }, {
+ .r1 = &r3, .r2 = &r1, .ret = false,
+ }, {
+ .r1 = &r4, .r2 = &r1, .r.start = R1_START, .r.end = R4_END, .ret = true,
+ }, {
+ .r1 = &r2, .r2 = &r3, .ret = false,
+ }, {
+ .r1 = &r2, .r2 = &r4, .r.start = R4_START, .r.end = R4_END, .ret = true,
+ }, {
+ .r1 = &r3, .r2 = &r4, .r.start = R4_START, .r.end = R3_END, .ret = true,
+ },
+};
+
+static struct result results_for_intersection[] = {
+ {
+ .r1 = &r1, .r2 = &r0, .r.start = R1_START, .r.end = R1_END, .ret = true,
+ }, {
+ .r1 = &r2, .r2 = &r0, .r.start = R2_START, .r.end = R2_END, .ret = true,
+ }, {
+ .r1 = &r3, .r2 = &r0, .r.start = R3_START, .r.end = R3_END, .ret = true,
+ }, {
+ .r1 = &r4, .r2 = &r0, .r.start = R4_START, .r.end = R4_END, .ret = true,
+ }, {
+ .r1 = &r2, .r2 = &r1, .ret = false,
+ }, {
+ .r1 = &r3, .r2 = &r1, .ret = false,
+ }, {
+ .r1 = &r4, .r2 = &r1, .r.start = R4_START, .r.end = R1_END, .ret = true,
+ }, {
+ .r1 = &r2, .r2 = &r3, .ret = false,
+ }, {
+ .r1 = &r2, .r2 = &r4, .r.start = R2_START, .r.end = R2_END, .ret = true,
+ }, {
+ .r1 = &r3, .r2 = &r4, .r.start = R3_START, .r.end = R4_END, .ret = true,
+ },
+};
+
+static void resource_do_test(struct kunit *test, bool ret, struct resource *r,
+ bool exp_ret, struct resource *exp_r,
+ struct resource *r1, struct resource *r2)
+{
+ KUNIT_EXPECT_EQ_MSG(test, ret, exp_ret, "Resources %pR %pR", r1, r2);
+ KUNIT_EXPECT_EQ_MSG(test, r->start, exp_r->start, "Start elements are not equal");
+ KUNIT_EXPECT_EQ_MSG(test, r->end, exp_r->end, "End elements are not equal");
+}
+
+static void resource_do_union_test(struct kunit *test, struct result *r)
+{
+ struct resource result;
+ bool ret;
+
+ memset(&result, 0, sizeof(result));
+ ret = resource_union(r->r1, r->r2, &result);
+ resource_do_test(test, ret, &result, r->ret, &r->r, r->r1, r->r2);
+
+ memset(&result, 0, sizeof(result));
+ ret = resource_union(r->r2, r->r1, &result);
+ resource_do_test(test, ret, &result, r->ret, &r->r, r->r2, r->r1);
+}
+
+static void resource_test_union(struct kunit *test)
+{
+ struct result *r = results_for_union;
+ unsigned int i = 0;
+
+ do {
+ resource_do_union_test(test, &r[i]);
+ } while (++i < ARRAY_SIZE(results_for_union));
+}
+
+static void resource_do_intersection_test(struct kunit *test, struct result *r)
+{
+ struct resource result;
+ bool ret;
+
+ memset(&result, 0, sizeof(result));
+ ret = resource_intersection(r->r1, r->r2, &result);
+ resource_do_test(test, ret, &result, r->ret, &r->r, r->r1, r->r2);
+
+ memset(&result, 0, sizeof(result));
+ ret = resource_intersection(r->r2, r->r1, &result);
+ resource_do_test(test, ret, &result, r->ret, &r->r, r->r2, r->r1);
+}
+
+static void resource_test_intersection(struct kunit *test)
+{
+ struct result *r = results_for_intersection;
+ unsigned int i = 0;
+
+ do {
+ resource_do_intersection_test(test, &r[i]);
+ } while (++i < ARRAY_SIZE(results_for_intersection));
+}
+
+/*
+ * The test resource tree for region_intersects() test:
+ *
+ * BASE-BASE+1M-1 : Test System RAM 0
+ * # hole 0 (BASE+1M-BASE+2M)
+ * BASE+2M-BASE+3M-1 : Test CXL Window 0
+ * BASE+3M-BASE+4M-1 : Test System RAM 1
+ * BASE+4M-BASE+7M-1 : Test CXL Window 1
+ * BASE+4M-BASE+5M-1 : Test System RAM 2
+ * BASE+4M+128K-BASE+4M+256K-1: Test Code
+ * BASE+5M-BASE+6M-1 : Test System RAM 3
+ */
+#define RES_TEST_RAM0_OFFSET 0
+#define RES_TEST_RAM0_SIZE SZ_1M
+#define RES_TEST_HOLE0_OFFSET (RES_TEST_RAM0_OFFSET + RES_TEST_RAM0_SIZE)
+#define RES_TEST_HOLE0_SIZE SZ_1M
+#define RES_TEST_WIN0_OFFSET (RES_TEST_HOLE0_OFFSET + RES_TEST_HOLE0_SIZE)
+#define RES_TEST_WIN0_SIZE SZ_1M
+#define RES_TEST_RAM1_OFFSET (RES_TEST_WIN0_OFFSET + RES_TEST_WIN0_SIZE)
+#define RES_TEST_RAM1_SIZE SZ_1M
+#define RES_TEST_WIN1_OFFSET (RES_TEST_RAM1_OFFSET + RES_TEST_RAM1_SIZE)
+#define RES_TEST_WIN1_SIZE (SZ_1M * 3)
+#define RES_TEST_RAM2_OFFSET RES_TEST_WIN1_OFFSET
+#define RES_TEST_RAM2_SIZE SZ_1M
+#define RES_TEST_CODE_OFFSET (RES_TEST_RAM2_OFFSET + SZ_128K)
+#define RES_TEST_CODE_SIZE SZ_128K
+#define RES_TEST_RAM3_OFFSET (RES_TEST_RAM2_OFFSET + RES_TEST_RAM2_SIZE)
+#define RES_TEST_RAM3_SIZE SZ_1M
+#define RES_TEST_TOTAL_SIZE ((RES_TEST_WIN1_OFFSET + RES_TEST_WIN1_SIZE))
+
+KUNIT_DEFINE_ACTION_WRAPPER(kfree_wrapper, kfree, const void *);
+
+static void remove_free_resource(void *ctx)
+{
+ struct resource *res = (struct resource *)ctx;
+
+ remove_resource(res);
+ kfree(res);
+}
+
+static void resource_test_add_action_or_abort(
+ struct kunit *test, void (*action)(void *), void *ctx)
+{
+ KUNIT_ASSERT_EQ_MSG(test, 0,
+ kunit_add_action_or_reset(test, action, ctx),
+ "Fail to add action");
+}
+
+static void resource_test_request_region(struct kunit *test, struct resource *parent,
+ resource_size_t start, resource_size_t size,
+ const char *name, unsigned long flags)
+{
+ struct resource *res;
+
+ res = __request_region(parent, start, size, name, flags);
+ KUNIT_ASSERT_NOT_NULL(test, res);
+ resource_test_add_action_or_abort(test, remove_free_resource, res);
+}
+
+static void resource_test_insert_resource(struct kunit *test, struct resource *parent,
+ resource_size_t start, resource_size_t size,
+ const char *name, unsigned long flags)
+{
+ struct resource *res;
+
+ res = kzalloc(sizeof(*res), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, res);
+
+ res->name = name;
+ res->start = start;
+ res->end = start + size - 1;
+ res->flags = flags;
+ if (insert_resource(parent, res)) {
+ resource_test_add_action_or_abort(test, kfree_wrapper, res);
+ KUNIT_FAIL_AND_ABORT(test, "Fail to insert resource %pR\n", res);
+ }
+
+ resource_test_add_action_or_abort(test, remove_free_resource, res);
+}
+
+static void resource_test_region_intersects(struct kunit *test)
+{
+ unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+ struct resource *parent;
+ resource_size_t start;
+
+ /* Find an iomem_resource hole to hold test resources */
+ parent = alloc_free_mem_region(&iomem_resource, RES_TEST_TOTAL_SIZE, SZ_1M,
+ "test resources");
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, parent);
+ start = parent->start;
+ resource_test_add_action_or_abort(test, remove_free_resource, parent);
+
+ resource_test_request_region(test, parent, start + RES_TEST_RAM0_OFFSET,
+ RES_TEST_RAM0_SIZE, "Test System RAM 0", flags);
+ resource_test_insert_resource(test, parent, start + RES_TEST_WIN0_OFFSET,
+ RES_TEST_WIN0_SIZE, "Test CXL Window 0",
+ IORESOURCE_MEM);
+ resource_test_request_region(test, parent, start + RES_TEST_RAM1_OFFSET,
+ RES_TEST_RAM1_SIZE, "Test System RAM 1", flags);
+ resource_test_insert_resource(test, parent, start + RES_TEST_WIN1_OFFSET,
+ RES_TEST_WIN1_SIZE, "Test CXL Window 1",
+ IORESOURCE_MEM);
+ resource_test_request_region(test, parent, start + RES_TEST_RAM2_OFFSET,
+ RES_TEST_RAM2_SIZE, "Test System RAM 2", flags);
+ resource_test_insert_resource(test, parent, start + RES_TEST_CODE_OFFSET,
+ RES_TEST_CODE_SIZE, "Test Code", flags);
+ resource_test_request_region(test, parent, start + RES_TEST_RAM3_OFFSET,
+ RES_TEST_RAM3_SIZE, "Test System RAM 3", flags);
+ kunit_release_action(test, remove_free_resource, parent);
+
+ KUNIT_EXPECT_EQ(test, REGION_INTERSECTS,
+ region_intersects(start + RES_TEST_RAM0_OFFSET, PAGE_SIZE,
+ IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE));
+ KUNIT_EXPECT_EQ(test, REGION_INTERSECTS,
+ region_intersects(start + RES_TEST_RAM0_OFFSET +
+ RES_TEST_RAM0_SIZE - PAGE_SIZE, 2 * PAGE_SIZE,
+ IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE));
+ KUNIT_EXPECT_EQ(test, REGION_DISJOINT,
+ region_intersects(start + RES_TEST_HOLE0_OFFSET, PAGE_SIZE,
+ IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE));
+ KUNIT_EXPECT_EQ(test, REGION_DISJOINT,
+ region_intersects(start + RES_TEST_HOLE0_OFFSET +
+ RES_TEST_HOLE0_SIZE - PAGE_SIZE, 2 * PAGE_SIZE,
+ IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE));
+ KUNIT_EXPECT_EQ(test, REGION_MIXED,
+ region_intersects(start + RES_TEST_WIN0_OFFSET +
+ RES_TEST_WIN0_SIZE - PAGE_SIZE, 2 * PAGE_SIZE,
+ IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE));
+ KUNIT_EXPECT_EQ(test, REGION_INTERSECTS,
+ region_intersects(start + RES_TEST_RAM1_OFFSET +
+ RES_TEST_RAM1_SIZE - PAGE_SIZE, 2 * PAGE_SIZE,
+ IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE));
+ KUNIT_EXPECT_EQ(test, REGION_INTERSECTS,
+ region_intersects(start + RES_TEST_RAM2_OFFSET +
+ RES_TEST_RAM2_SIZE - PAGE_SIZE, 2 * PAGE_SIZE,
+ IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE));
+ KUNIT_EXPECT_EQ(test, REGION_INTERSECTS,
+ region_intersects(start + RES_TEST_CODE_OFFSET, PAGE_SIZE,
+ IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE));
+ KUNIT_EXPECT_EQ(test, REGION_INTERSECTS,
+ region_intersects(start + RES_TEST_RAM2_OFFSET,
+ RES_TEST_RAM2_SIZE + PAGE_SIZE,
+ IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE));
+ KUNIT_EXPECT_EQ(test, REGION_MIXED,
+ region_intersects(start + RES_TEST_RAM3_OFFSET,
+ RES_TEST_RAM3_SIZE + PAGE_SIZE,
+ IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE));
+}
+
+static struct kunit_case resource_test_cases[] = {
+ KUNIT_CASE(resource_test_union),
+ KUNIT_CASE(resource_test_intersection),
+ KUNIT_CASE(resource_test_region_intersects),
+ {}
+};
+
+static struct kunit_suite resource_test_suite = {
+ .name = "resource",
+ .test_cases = resource_test_cases,
+};
+kunit_test_suite(resource_test_suite);
+
+MODULE_DESCRIPTION("I/O Port & Memory Resource manager unit tests");
+MODULE_LICENSE("GPL");
diff --git a/kernel/rseq.c b/kernel/rseq.c
new file mode 100644
index 000000000000..395d8b002350
--- /dev/null
+++ b/kernel/rseq.c
@@ -0,0 +1,478 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Restartable sequences system call
+ *
+ * Copyright (C) 2015, Google, Inc.,
+ * Paul Turner <pjt@google.com> and Andrew Hunter <ahh@google.com>
+ * Copyright (C) 2015-2018, EfficiOS Inc.,
+ * Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ */
+
+/*
+ * Restartable sequences are a lightweight interface that allows
+ * user-level code to be executed atomically relative to scheduler
+ * preemption and signal delivery. Typically used for implementing
+ * per-cpu operations.
+ *
+ * It allows user-space to perform update operations on per-cpu data
+ * without requiring heavy-weight atomic operations.
+ *
+ * Detailed algorithm of rseq user-space assembly sequences:
+ *
+ * init(rseq_cs)
+ * cpu = TLS->rseq::cpu_id_start
+ * [1] TLS->rseq::rseq_cs = rseq_cs
+ * [start_ip] ----------------------------
+ * [2] if (cpu != TLS->rseq::cpu_id)
+ * goto abort_ip;
+ * [3] <last_instruction_in_cs>
+ * [post_commit_ip] ----------------------------
+ *
+ * The address of jump target abort_ip must be outside the critical
+ * region, i.e.:
+ *
+ * [abort_ip] < [start_ip] || [abort_ip] >= [post_commit_ip]
+ *
+ * Steps [2]-[3] (inclusive) need to be a sequence of instructions in
+ * userspace that can handle being interrupted between any of those
+ * instructions, and then resumed to the abort_ip.
+ *
+ * 1. Userspace stores the address of the struct rseq_cs assembly
+ * block descriptor into the rseq_cs field of the registered
+ * struct rseq TLS area. This update is performed through a single
+ * store within the inline assembly instruction sequence.
+ * [start_ip]
+ *
+ * 2. Userspace tests to check whether the current cpu_id field match
+ * the cpu number loaded before start_ip, branching to abort_ip
+ * in case of a mismatch.
+ *
+ * If the sequence is preempted or interrupted by a signal
+ * at or after start_ip and before post_commit_ip, then the kernel
+ * clears TLS->__rseq_abi::rseq_cs, and sets the user-space return
+ * ip to abort_ip before returning to user-space, so the preempted
+ * execution resumes at abort_ip.
+ *
+ * 3. Userspace critical section final instruction before
+ * post_commit_ip is the commit. The critical section is
+ * self-terminating.
+ * [post_commit_ip]
+ *
+ * 4. <success>
+ *
+ * On failure at [2], or if interrupted by preempt or signal delivery
+ * between [1] and [3]:
+ *
+ * [abort_ip]
+ * F1. <failure>
+ */
+
+/* Required to select the proper per_cpu ops for rseq_stats_inc() */
+#define RSEQ_BUILD_SLOW_PATH
+
+#include <linux/debugfs.h>
+#include <linux/ratelimit.h>
+#include <linux/rseq_entry.h>
+#include <linux/sched.h>
+#include <linux/syscalls.h>
+#include <linux/uaccess.h>
+#include <linux/types.h>
+#include <asm/ptrace.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/rseq.h>
+
+DEFINE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled);
+
+static inline void rseq_control_debug(bool on)
+{
+ if (on)
+ static_branch_enable(&rseq_debug_enabled);
+ else
+ static_branch_disable(&rseq_debug_enabled);
+}
+
+static int __init rseq_setup_debug(char *str)
+{
+ bool on;
+
+ if (kstrtobool(str, &on))
+ return -EINVAL;
+ rseq_control_debug(on);
+ return 1;
+}
+__setup("rseq_debug=", rseq_setup_debug);
+
+#ifdef CONFIG_TRACEPOINTS
+/*
+ * Out of line, so the actual update functions can be in a header to be
+ * inlined into the exit to user code.
+ */
+void __rseq_trace_update(struct task_struct *t)
+{
+ trace_rseq_update(t);
+}
+
+void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
+ unsigned long offset, unsigned long abort_ip)
+{
+ trace_rseq_ip_fixup(ip, start_ip, offset, abort_ip);
+}
+#endif /* CONFIG_TRACEPOINTS */
+
+#ifdef CONFIG_DEBUG_FS
+#ifdef CONFIG_RSEQ_STATS
+DEFINE_PER_CPU(struct rseq_stats, rseq_stats);
+
+static int rseq_stats_show(struct seq_file *m, void *p)
+{
+ struct rseq_stats stats = { };
+ unsigned int cpu;
+
+ for_each_possible_cpu(cpu) {
+ stats.exit += data_race(per_cpu(rseq_stats.exit, cpu));
+ stats.signal += data_race(per_cpu(rseq_stats.signal, cpu));
+ stats.slowpath += data_race(per_cpu(rseq_stats.slowpath, cpu));
+ stats.fastpath += data_race(per_cpu(rseq_stats.fastpath, cpu));
+ stats.ids += data_race(per_cpu(rseq_stats.ids, cpu));
+ stats.cs += data_race(per_cpu(rseq_stats.cs, cpu));
+ stats.clear += data_race(per_cpu(rseq_stats.clear, cpu));
+ stats.fixup += data_race(per_cpu(rseq_stats.fixup, cpu));
+ }
+
+ seq_printf(m, "exit: %16lu\n", stats.exit);
+ seq_printf(m, "signal: %16lu\n", stats.signal);
+ seq_printf(m, "slowp: %16lu\n", stats.slowpath);
+ seq_printf(m, "fastp: %16lu\n", stats.fastpath);
+ seq_printf(m, "ids: %16lu\n", stats.ids);
+ seq_printf(m, "cs: %16lu\n", stats.cs);
+ seq_printf(m, "clear: %16lu\n", stats.clear);
+ seq_printf(m, "fixup: %16lu\n", stats.fixup);
+ return 0;
+}
+
+static int rseq_stats_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, rseq_stats_show, inode->i_private);
+}
+
+static const struct file_operations stat_ops = {
+ .open = rseq_stats_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init rseq_stats_init(struct dentry *root_dir)
+{
+ debugfs_create_file("stats", 0444, root_dir, NULL, &stat_ops);
+ return 0;
+}
+#else
+static inline void rseq_stats_init(struct dentry *root_dir) { }
+#endif /* CONFIG_RSEQ_STATS */
+
+static int rseq_debug_show(struct seq_file *m, void *p)
+{
+ bool on = static_branch_unlikely(&rseq_debug_enabled);
+
+ seq_printf(m, "%d\n", on);
+ return 0;
+}
+
+static ssize_t rseq_debug_write(struct file *file, const char __user *ubuf,
+ size_t count, loff_t *ppos)
+{
+ bool on;
+
+ if (kstrtobool_from_user(ubuf, count, &on))
+ return -EINVAL;
+
+ rseq_control_debug(on);
+ return count;
+}
+
+static int rseq_debug_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, rseq_debug_show, inode->i_private);
+}
+
+static const struct file_operations debug_ops = {
+ .open = rseq_debug_open,
+ .read = seq_read,
+ .write = rseq_debug_write,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init rseq_debugfs_init(void)
+{
+ struct dentry *root_dir = debugfs_create_dir("rseq", NULL);
+
+ debugfs_create_file("debug", 0644, root_dir, NULL, &debug_ops);
+ rseq_stats_init(root_dir);
+ return 0;
+}
+__initcall(rseq_debugfs_init);
+#endif /* CONFIG_DEBUG_FS */
+
+static bool rseq_set_ids(struct task_struct *t, struct rseq_ids *ids, u32 node_id)
+{
+ return rseq_set_ids_get_csaddr(t, ids, node_id, NULL);
+}
+
+static bool rseq_handle_cs(struct task_struct *t, struct pt_regs *regs)
+{
+ struct rseq __user *urseq = t->rseq.usrptr;
+ u64 csaddr;
+
+ scoped_user_read_access(urseq, efault)
+ unsafe_get_user(csaddr, &urseq->rseq_cs, efault);
+ if (likely(!csaddr))
+ return true;
+ return rseq_update_user_cs(t, regs, csaddr);
+efault:
+ return false;
+}
+
+static void rseq_slowpath_update_usr(struct pt_regs *regs)
+{
+ /*
+ * Preserve rseq state and user_irq state. The generic entry code
+ * clears user_irq on the way out, the non-generic entry
+ * architectures are not having user_irq.
+ */
+ const struct rseq_event evt_mask = { .has_rseq = true, .user_irq = true, };
+ struct task_struct *t = current;
+ struct rseq_ids ids;
+ u32 node_id;
+ bool event;
+
+ if (unlikely(t->flags & PF_EXITING))
+ return;
+
+ rseq_stat_inc(rseq_stats.slowpath);
+
+ /*
+ * Read and clear the event pending bit first. If the task
+ * was not preempted or migrated or a signal is on the way,
+ * there is no point in doing any of the heavy lifting here
+ * on production kernels. In that case TIF_NOTIFY_RESUME
+ * was raised by some other functionality.
+ *
+ * This is correct because the read/clear operation is
+ * guarded against scheduler preemption, which makes it CPU
+ * local atomic. If the task is preempted right after
+ * re-enabling preemption then TIF_NOTIFY_RESUME is set
+ * again and this function is invoked another time _before_
+ * the task is able to return to user mode.
+ *
+ * On a debug kernel, invoke the fixup code unconditionally
+ * with the result handed in to allow the detection of
+ * inconsistencies.
+ */
+ scoped_guard(irq) {
+ event = t->rseq.event.sched_switch;
+ t->rseq.event.all &= evt_mask.all;
+ ids.cpu_id = task_cpu(t);
+ ids.mm_cid = task_mm_cid(t);
+ }
+
+ if (!event)
+ return;
+
+ node_id = cpu_to_node(ids.cpu_id);
+
+ if (unlikely(!rseq_update_usr(t, regs, &ids, node_id))) {
+ /*
+ * Clear the errors just in case this might survive magically, but
+ * leave the rest intact.
+ */
+ t->rseq.event.error = 0;
+ force_sig(SIGSEGV);
+ }
+}
+
+void __rseq_handle_slowpath(struct pt_regs *regs)
+{
+ /*
+ * If invoked from hypervisors before entering the guest via
+ * resume_user_mode_work(), then @regs is a NULL pointer.
+ *
+ * resume_user_mode_work() clears TIF_NOTIFY_RESUME and re-raises
+ * it before returning from the ioctl() to user space when
+ * rseq_event.sched_switch is set.
+ *
+ * So it's safe to ignore here instead of pointlessly updating it
+ * in the vcpu_run() loop.
+ */
+ if (!regs)
+ return;
+
+ rseq_slowpath_update_usr(regs);
+}
+
+void __rseq_signal_deliver(int sig, struct pt_regs *regs)
+{
+ rseq_stat_inc(rseq_stats.signal);
+ /*
+ * Don't update IDs, they are handled on exit to user if
+ * necessary. The important thing is to abort a critical section of
+ * the interrupted context as after this point the instruction
+ * pointer in @regs points to the signal handler.
+ */
+ if (unlikely(!rseq_handle_cs(current, regs))) {
+ /*
+ * Clear the errors just in case this might survive
+ * magically, but leave the rest intact.
+ */
+ current->rseq.event.error = 0;
+ force_sigsegv(sig);
+ }
+}
+
+/*
+ * Terminate the process if a syscall is issued within a restartable
+ * sequence.
+ */
+void __rseq_debug_syscall_return(struct pt_regs *regs)
+{
+ struct task_struct *t = current;
+ u64 csaddr;
+
+ if (!t->rseq.event.has_rseq)
+ return;
+ if (get_user(csaddr, &t->rseq.usrptr->rseq_cs))
+ goto fail;
+ if (likely(!csaddr))
+ return;
+ if (unlikely(csaddr >= TASK_SIZE))
+ goto fail;
+ if (rseq_debug_update_user_cs(t, regs, csaddr))
+ return;
+fail:
+ force_sig(SIGSEGV);
+}
+
+#ifdef CONFIG_DEBUG_RSEQ
+/* Kept around to keep GENERIC_ENTRY=n architectures supported. */
+void rseq_syscall(struct pt_regs *regs)
+{
+ __rseq_debug_syscall_return(regs);
+}
+#endif
+
+static bool rseq_reset_ids(void)
+{
+ struct rseq_ids ids = {
+ .cpu_id = RSEQ_CPU_ID_UNINITIALIZED,
+ .mm_cid = 0,
+ };
+
+ /*
+ * If this fails, terminate it because this leaves the kernel in
+ * stupid state as exit to user space will try to fixup the ids
+ * again.
+ */
+ if (rseq_set_ids(current, &ids, 0))
+ return true;
+
+ force_sig(SIGSEGV);
+ return false;
+}
+
+/* The original rseq structure size (including padding) is 32 bytes. */
+#define ORIG_RSEQ_SIZE 32
+
+/*
+ * sys_rseq - setup restartable sequences for caller thread.
+ */
+SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig)
+{
+ if (flags & RSEQ_FLAG_UNREGISTER) {
+ if (flags & ~RSEQ_FLAG_UNREGISTER)
+ return -EINVAL;
+ /* Unregister rseq for current thread. */
+ if (current->rseq.usrptr != rseq || !current->rseq.usrptr)
+ return -EINVAL;
+ if (rseq_len != current->rseq.len)
+ return -EINVAL;
+ if (current->rseq.sig != sig)
+ return -EPERM;
+ if (!rseq_reset_ids())
+ return -EFAULT;
+ rseq_reset(current);
+ return 0;
+ }
+
+ if (unlikely(flags))
+ return -EINVAL;
+
+ if (current->rseq.usrptr) {
+ /*
+ * If rseq is already registered, check whether
+ * the provided address differs from the prior
+ * one.
+ */
+ if (current->rseq.usrptr != rseq || rseq_len != current->rseq.len)
+ return -EINVAL;
+ if (current->rseq.sig != sig)
+ return -EPERM;
+ /* Already registered. */
+ return -EBUSY;
+ }
+
+ /*
+ * If there was no rseq previously registered, ensure the provided rseq
+ * is properly aligned, as communcated to user-space through the ELF
+ * auxiliary vector AT_RSEQ_ALIGN. If rseq_len is the original rseq
+ * size, the required alignment is the original struct rseq alignment.
+ *
+ * In order to be valid, rseq_len is either the original rseq size, or
+ * large enough to contain all supported fields, as communicated to
+ * user-space through the ELF auxiliary vector AT_RSEQ_FEATURE_SIZE.
+ */
+ if (rseq_len < ORIG_RSEQ_SIZE ||
+ (rseq_len == ORIG_RSEQ_SIZE && !IS_ALIGNED((unsigned long)rseq, ORIG_RSEQ_SIZE)) ||
+ (rseq_len != ORIG_RSEQ_SIZE && (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) ||
+ rseq_len < offsetof(struct rseq, end))))
+ return -EINVAL;
+ if (!access_ok(rseq, rseq_len))
+ return -EFAULT;
+
+ scoped_user_write_access(rseq, efault) {
+ /*
+ * If the rseq_cs pointer is non-NULL on registration, clear it to
+ * avoid a potential segfault on return to user-space. The proper thing
+ * to do would have been to fail the registration but this would break
+ * older libcs that reuse the rseq area for new threads without
+ * clearing the fields. Don't bother reading it, just reset it.
+ */
+ unsafe_put_user(0UL, &rseq->rseq_cs, efault);
+ /* Initialize IDs in user space */
+ unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id_start, efault);
+ unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id, efault);
+ unsafe_put_user(0U, &rseq->node_id, efault);
+ unsafe_put_user(0U, &rseq->mm_cid, efault);
+ }
+
+ /*
+ * Activate the registration by setting the rseq area address, length
+ * and signature in the task struct.
+ */
+ current->rseq.usrptr = rseq;
+ current->rseq.len = rseq_len;
+ current->rseq.sig = sig;
+
+ /*
+ * If rseq was previously inactive, and has just been
+ * registered, ensure the cpu_id_start and cpu_id fields
+ * are updated before returning to user-space.
+ */
+ current->rseq.event.has_rseq = true;
+ rseq_force_update();
+ return 0;
+
+efault:
+ return -EFAULT;
+}
diff --git a/kernel/scftorture.c b/kernel/scftorture.c
new file mode 100644
index 000000000000..d86d2d9c4624
--- /dev/null
+++ b/kernel/scftorture.c
@@ -0,0 +1,701 @@
+// SPDX-License-Identifier: GPL-2.0+
+//
+// Torture test for smp_call_function() and friends.
+//
+// Copyright (C) Facebook, 2020.
+//
+// Author: Paul E. McKenney <paulmck@kernel.org>
+
+#define pr_fmt(fmt) fmt
+
+#include <linux/atomic.h>
+#include <linux/bitops.h>
+#include <linux/completion.h>
+#include <linux/cpu.h>
+#include <linux/delay.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/kthread.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/notifier.h>
+#include <linux/percpu.h>
+#include <linux/rcupdate.h>
+#include <linux/rcupdate_trace.h>
+#include <linux/reboot.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/stat.h>
+#include <linux/srcu.h>
+#include <linux/slab.h>
+#include <linux/torture.h>
+#include <linux/types.h>
+
+#define SCFTORT_STRING "scftorture"
+#define SCFTORT_FLAG SCFTORT_STRING ": "
+
+#define VERBOSE_SCFTORTOUT(s, x...) \
+ do { if (verbose) pr_alert(SCFTORT_FLAG s "\n", ## x); } while (0)
+
+#define SCFTORTOUT_ERRSTRING(s, x...) pr_alert(SCFTORT_FLAG "!!! " s "\n", ## x)
+
+MODULE_DESCRIPTION("Torture tests on the smp_call_function() family of primitives");
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Paul E. McKenney <paulmck@kernel.org>");
+
+// Wait until there are multiple CPUs before starting test.
+torture_param(int, holdoff, IS_BUILTIN(CONFIG_SCF_TORTURE_TEST) ? 10 : 0,
+ "Holdoff time before test start (s)");
+torture_param(int, longwait, 0, "Include ridiculously long waits? (seconds)");
+torture_param(int, nthreads, -1, "# threads, defaults to -1 for all CPUs.");
+torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)");
+torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (s), 0=disable");
+torture_param(int, shutdown_secs, 0, "Shutdown time (ms), <= zero to disable.");
+torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s.");
+torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable");
+torture_param(bool, use_cpus_read_lock, 0, "Use cpus_read_lock() to exclude CPU hotplug.");
+torture_param(int, verbose, 0, "Enable verbose debugging printk()s");
+torture_param(int, weight_resched, -1, "Testing weight for resched_cpu() operations.");
+torture_param(int, weight_single, -1, "Testing weight for single-CPU no-wait operations.");
+torture_param(int, weight_single_rpc, -1, "Testing weight for single-CPU RPC operations.");
+torture_param(int, weight_single_wait, -1, "Testing weight for single-CPU operations.");
+torture_param(int, weight_many, -1, "Testing weight for multi-CPU no-wait operations.");
+torture_param(int, weight_many_wait, -1, "Testing weight for multi-CPU operations.");
+torture_param(int, weight_all, -1, "Testing weight for all-CPU no-wait operations.");
+torture_param(int, weight_all_wait, -1, "Testing weight for all-CPU operations.");
+
+static char *torture_type = "";
+
+#ifdef MODULE
+# define SCFTORT_SHUTDOWN 0
+#else
+# define SCFTORT_SHUTDOWN 1
+#endif
+
+torture_param(bool, shutdown, SCFTORT_SHUTDOWN, "Shutdown at end of torture test.");
+
+struct scf_statistics {
+ struct task_struct *task;
+ int cpu;
+ long long n_resched;
+ long long n_single;
+ long long n_single_ofl;
+ long long n_single_rpc;
+ long long n_single_rpc_ofl;
+ long long n_single_wait;
+ long long n_single_wait_ofl;
+ long long n_many;
+ long long n_many_wait;
+ long long n_all;
+ long long n_all_wait;
+};
+
+static struct scf_statistics *scf_stats_p;
+static struct task_struct *scf_torture_stats_task;
+static DEFINE_PER_CPU(long long, scf_invoked_count);
+static DEFINE_PER_CPU(struct llist_head, scf_free_pool);
+
+// Data for random primitive selection
+#define SCF_PRIM_RESCHED 0
+#define SCF_PRIM_SINGLE 1
+#define SCF_PRIM_SINGLE_RPC 2
+#define SCF_PRIM_MANY 3
+#define SCF_PRIM_ALL 4
+#define SCF_NPRIMS 8 // Need wait and no-wait versions of each,
+ // except for SCF_PRIM_RESCHED and
+ // SCF_PRIM_SINGLE_RPC.
+
+static char *scf_prim_name[] = {
+ "resched_cpu",
+ "smp_call_function_single",
+ "smp_call_function_single_rpc",
+ "smp_call_function_many",
+ "smp_call_function",
+};
+
+struct scf_selector {
+ unsigned long scfs_weight;
+ int scfs_prim;
+ bool scfs_wait;
+};
+static struct scf_selector scf_sel_array[SCF_NPRIMS];
+static int scf_sel_array_len;
+static unsigned long scf_sel_totweight;
+
+// Communicate between caller and handler.
+struct scf_check {
+ bool scfc_in;
+ bool scfc_out;
+ int scfc_cpu; // -1 for not _single().
+ bool scfc_wait;
+ bool scfc_rpc;
+ struct completion scfc_completion;
+ struct llist_node scf_node;
+};
+
+// Use to wait for all threads to start.
+static atomic_t n_started;
+static atomic_t n_errs;
+static atomic_t n_mb_in_errs;
+static atomic_t n_mb_out_errs;
+static atomic_t n_alloc_errs;
+static bool scfdone;
+static char *bangstr = "";
+
+static DEFINE_TORTURE_RANDOM_PERCPU(scf_torture_rand);
+
+extern void resched_cpu(int cpu); // An alternative IPI vector.
+
+static void scf_add_to_free_list(struct scf_check *scfcp)
+{
+ struct llist_head *pool;
+ unsigned int cpu;
+
+ if (!scfcp)
+ return;
+ cpu = raw_smp_processor_id() % nthreads;
+ pool = &per_cpu(scf_free_pool, cpu);
+ llist_add(&scfcp->scf_node, pool);
+}
+
+static void scf_cleanup_free_list(unsigned int cpu)
+{
+ struct llist_head *pool;
+ struct llist_node *node;
+ struct scf_check *scfcp;
+
+ pool = &per_cpu(scf_free_pool, cpu);
+ node = llist_del_all(pool);
+ while (node) {
+ scfcp = llist_entry(node, struct scf_check, scf_node);
+ node = node->next;
+ kfree(scfcp);
+ }
+}
+
+// Print torture statistics. Caller must ensure serialization.
+static void scf_torture_stats_print(void)
+{
+ int cpu;
+ int i;
+ long long invoked_count = 0;
+ bool isdone = READ_ONCE(scfdone);
+ struct scf_statistics scfs = {};
+
+ for_each_possible_cpu(cpu)
+ invoked_count += data_race(per_cpu(scf_invoked_count, cpu));
+ for (i = 0; i < nthreads; i++) {
+ scfs.n_resched += scf_stats_p[i].n_resched;
+ scfs.n_single += scf_stats_p[i].n_single;
+ scfs.n_single_ofl += scf_stats_p[i].n_single_ofl;
+ scfs.n_single_rpc += scf_stats_p[i].n_single_rpc;
+ scfs.n_single_wait += scf_stats_p[i].n_single_wait;
+ scfs.n_single_wait_ofl += scf_stats_p[i].n_single_wait_ofl;
+ scfs.n_many += scf_stats_p[i].n_many;
+ scfs.n_many_wait += scf_stats_p[i].n_many_wait;
+ scfs.n_all += scf_stats_p[i].n_all;
+ scfs.n_all_wait += scf_stats_p[i].n_all_wait;
+ }
+ if (atomic_read(&n_errs) || atomic_read(&n_mb_in_errs) ||
+ atomic_read(&n_mb_out_errs) ||
+ (!IS_ENABLED(CONFIG_KASAN) && atomic_read(&n_alloc_errs)))
+ bangstr = "!!! ";
+ pr_alert("%s %sscf_invoked_count %s: %lld resched: %lld single: %lld/%lld single_ofl: %lld/%lld single_rpc: %lld single_rpc_ofl: %lld many: %lld/%lld all: %lld/%lld ",
+ SCFTORT_FLAG, bangstr, isdone ? "VER" : "ver", invoked_count, scfs.n_resched,
+ scfs.n_single, scfs.n_single_wait, scfs.n_single_ofl, scfs.n_single_wait_ofl,
+ scfs.n_single_rpc, scfs.n_single_rpc_ofl,
+ scfs.n_many, scfs.n_many_wait, scfs.n_all, scfs.n_all_wait);
+ torture_onoff_stats();
+ pr_cont("ste: %d stnmie: %d stnmoe: %d staf: %d\n", atomic_read(&n_errs),
+ atomic_read(&n_mb_in_errs), atomic_read(&n_mb_out_errs),
+ atomic_read(&n_alloc_errs));
+}
+
+// Periodically prints torture statistics, if periodic statistics printing
+// was specified via the stat_interval module parameter.
+static int
+scf_torture_stats(void *arg)
+{
+ VERBOSE_TOROUT_STRING("scf_torture_stats task started");
+ do {
+ schedule_timeout_interruptible(stat_interval * HZ);
+ scf_torture_stats_print();
+ torture_shutdown_absorb("scf_torture_stats");
+ } while (!torture_must_stop());
+ torture_kthread_stopping("scf_torture_stats");
+ return 0;
+}
+
+// Add a primitive to the scf_sel_array[].
+static void scf_sel_add(unsigned long weight, int prim, bool wait)
+{
+ struct scf_selector *scfsp = &scf_sel_array[scf_sel_array_len];
+
+ // If no weight, if array would overflow, if computing three-place
+ // percentages would overflow, or if the scf_prim_name[] array would
+ // overflow, don't bother. In the last three two cases, complain.
+ if (!weight ||
+ WARN_ON_ONCE(scf_sel_array_len >= ARRAY_SIZE(scf_sel_array)) ||
+ WARN_ON_ONCE(0 - 100000 * weight <= 100000 * scf_sel_totweight) ||
+ WARN_ON_ONCE(prim >= ARRAY_SIZE(scf_prim_name)))
+ return;
+ scf_sel_totweight += weight;
+ scfsp->scfs_weight = scf_sel_totweight;
+ scfsp->scfs_prim = prim;
+ scfsp->scfs_wait = wait;
+ scf_sel_array_len++;
+}
+
+// Dump out weighting percentages for scf_prim_name[] array.
+static void scf_sel_dump(void)
+{
+ int i;
+ unsigned long oldw = 0;
+ struct scf_selector *scfsp;
+ unsigned long w;
+
+ for (i = 0; i < scf_sel_array_len; i++) {
+ scfsp = &scf_sel_array[i];
+ w = (scfsp->scfs_weight - oldw) * 100000 / scf_sel_totweight;
+ pr_info("%s: %3lu.%03lu %s(%s)\n", __func__, w / 1000, w % 1000,
+ scf_prim_name[scfsp->scfs_prim],
+ scfsp->scfs_wait ? "wait" : "nowait");
+ oldw = scfsp->scfs_weight;
+ }
+}
+
+// Randomly pick a primitive and wait/nowait, based on weightings.
+static struct scf_selector *scf_sel_rand(struct torture_random_state *trsp)
+{
+ int i;
+ unsigned long w = torture_random(trsp) % (scf_sel_totweight + 1);
+
+ for (i = 0; i < scf_sel_array_len; i++)
+ if (scf_sel_array[i].scfs_weight >= w)
+ return &scf_sel_array[i];
+ WARN_ON_ONCE(1);
+ return &scf_sel_array[0];
+}
+
+// Update statistics and occasionally burn up mass quantities of CPU time,
+// if told to do so via scftorture.longwait. Otherwise, occasionally burn
+// a little bit.
+static void scf_handler(void *scfc_in)
+{
+ int i;
+ int j;
+ unsigned long r = torture_random(this_cpu_ptr(&scf_torture_rand));
+ struct scf_check *scfcp = scfc_in;
+
+ if (likely(scfcp)) {
+ WRITE_ONCE(scfcp->scfc_out, false); // For multiple receivers.
+ if (WARN_ON_ONCE(unlikely(!READ_ONCE(scfcp->scfc_in))))
+ atomic_inc(&n_mb_in_errs);
+ }
+ this_cpu_inc(scf_invoked_count);
+ if (longwait <= 0) {
+ if (!(r & 0xffc0)) {
+ udelay(r & 0x3f);
+ goto out;
+ }
+ }
+ if (r & 0xfff)
+ goto out;
+ r = (r >> 12);
+ if (longwait <= 0) {
+ udelay((r & 0xff) + 1);
+ goto out;
+ }
+ r = r % longwait + 1;
+ for (i = 0; i < r; i++) {
+ for (j = 0; j < 1000; j++) {
+ udelay(1000);
+ cpu_relax();
+ }
+ }
+out:
+ if (unlikely(!scfcp))
+ return;
+ if (scfcp->scfc_wait) {
+ WRITE_ONCE(scfcp->scfc_out, true);
+ if (scfcp->scfc_rpc)
+ complete(&scfcp->scfc_completion);
+ } else {
+ scf_add_to_free_list(scfcp);
+ }
+}
+
+// As above, but check for correct CPU.
+static void scf_handler_1(void *scfc_in)
+{
+ struct scf_check *scfcp = scfc_in;
+
+ if (likely(scfcp) && WARN_ONCE(smp_processor_id() != scfcp->scfc_cpu, "%s: Wanted CPU %d got CPU %d\n", __func__, scfcp->scfc_cpu, smp_processor_id())) {
+ atomic_inc(&n_errs);
+ }
+ scf_handler(scfcp);
+}
+
+// Randomly do an smp_call_function*() invocation.
+static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_random_state *trsp)
+{
+ bool allocfail = false;
+ uintptr_t cpu;
+ int ret = 0;
+ struct scf_check *scfcp = NULL;
+ struct scf_selector *scfsp = scf_sel_rand(trsp);
+
+ if (scfsp->scfs_prim == SCF_PRIM_SINGLE || scfsp->scfs_wait) {
+ scfcp = kmalloc(sizeof(*scfcp), GFP_ATOMIC);
+ if (!scfcp) {
+ WARN_ON_ONCE(!IS_ENABLED(CONFIG_KASAN));
+ atomic_inc(&n_alloc_errs);
+ allocfail = true;
+ } else {
+ scfcp->scfc_cpu = -1;
+ scfcp->scfc_wait = scfsp->scfs_wait;
+ scfcp->scfc_out = false;
+ scfcp->scfc_rpc = false;
+ }
+ }
+ if (use_cpus_read_lock)
+ cpus_read_lock();
+ else
+ preempt_disable();
+ switch (scfsp->scfs_prim) {
+ case SCF_PRIM_RESCHED:
+ if (IS_BUILTIN(CONFIG_SCF_TORTURE_TEST)) {
+ cpu = torture_random(trsp) % nr_cpu_ids;
+ scfp->n_resched++;
+ resched_cpu(cpu);
+ this_cpu_inc(scf_invoked_count);
+ }
+ break;
+ case SCF_PRIM_SINGLE:
+ cpu = torture_random(trsp) % nr_cpu_ids;
+ if (scfsp->scfs_wait)
+ scfp->n_single_wait++;
+ else
+ scfp->n_single++;
+ if (scfcp) {
+ scfcp->scfc_cpu = cpu;
+ barrier(); // Prevent race-reduction compiler optimizations.
+ scfcp->scfc_in = true;
+ }
+ ret = smp_call_function_single(cpu, scf_handler_1, (void *)scfcp, scfsp->scfs_wait);
+ if (ret) {
+ if (scfsp->scfs_wait)
+ scfp->n_single_wait_ofl++;
+ else
+ scfp->n_single_ofl++;
+ scf_add_to_free_list(scfcp);
+ scfcp = NULL;
+ }
+ break;
+ case SCF_PRIM_SINGLE_RPC:
+ if (!scfcp)
+ break;
+ cpu = torture_random(trsp) % nr_cpu_ids;
+ scfp->n_single_rpc++;
+ scfcp->scfc_cpu = cpu;
+ scfcp->scfc_wait = true;
+ init_completion(&scfcp->scfc_completion);
+ scfcp->scfc_rpc = true;
+ barrier(); // Prevent race-reduction compiler optimizations.
+ scfcp->scfc_in = true;
+ ret = smp_call_function_single(cpu, scf_handler_1, (void *)scfcp, 0);
+ if (!ret) {
+ if (use_cpus_read_lock)
+ cpus_read_unlock();
+ else
+ preempt_enable();
+ wait_for_completion(&scfcp->scfc_completion);
+ if (use_cpus_read_lock)
+ cpus_read_lock();
+ else
+ preempt_disable();
+ } else {
+ scfp->n_single_rpc_ofl++;
+ scf_add_to_free_list(scfcp);
+ scfcp = NULL;
+ }
+ break;
+ case SCF_PRIM_MANY:
+ if (scfsp->scfs_wait)
+ scfp->n_many_wait++;
+ else
+ scfp->n_many++;
+ if (scfcp) {
+ barrier(); // Prevent race-reduction compiler optimizations.
+ scfcp->scfc_in = true;
+ }
+ smp_call_function_many(cpu_online_mask, scf_handler, scfcp, scfsp->scfs_wait);
+ break;
+ case SCF_PRIM_ALL:
+ if (scfsp->scfs_wait)
+ scfp->n_all_wait++;
+ else
+ scfp->n_all++;
+ if (scfcp) {
+ barrier(); // Prevent race-reduction compiler optimizations.
+ scfcp->scfc_in = true;
+ }
+ smp_call_function(scf_handler, scfcp, scfsp->scfs_wait);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ if (scfcp)
+ scfcp->scfc_out = true;
+ }
+ if (scfcp && scfsp->scfs_wait) {
+ if (WARN_ON_ONCE((num_online_cpus() > 1 || scfsp->scfs_prim == SCF_PRIM_SINGLE) &&
+ !scfcp->scfc_out)) {
+ pr_warn("%s: Memory-ordering failure, scfs_prim: %d.\n", __func__, scfsp->scfs_prim);
+ atomic_inc(&n_mb_out_errs); // Leak rather than trash!
+ } else {
+ scf_add_to_free_list(scfcp);
+ }
+ barrier(); // Prevent race-reduction compiler optimizations.
+ }
+ if (use_cpus_read_lock)
+ cpus_read_unlock();
+ else
+ preempt_enable();
+ if (allocfail)
+ schedule_timeout_idle((1 + longwait) * HZ); // Let no-wait handlers complete.
+ else if (!(torture_random(trsp) & 0xfff))
+ schedule_timeout_uninterruptible(1);
+}
+
+// SCF test kthread. Repeatedly does calls to members of the
+// smp_call_function() family of functions.
+static int scftorture_invoker(void *arg)
+{
+ int cpu;
+ int curcpu;
+ DEFINE_TORTURE_RANDOM(rand);
+ struct scf_statistics *scfp = (struct scf_statistics *)arg;
+ bool was_offline = false;
+
+ VERBOSE_SCFTORTOUT("scftorture_invoker %d: task started", scfp->cpu);
+ cpu = scfp->cpu % nr_cpu_ids;
+ WARN_ON_ONCE(set_cpus_allowed_ptr(current, cpumask_of(cpu)));
+ set_user_nice(current, MAX_NICE);
+ if (holdoff)
+ schedule_timeout_interruptible(holdoff * HZ);
+
+ VERBOSE_SCFTORTOUT("scftorture_invoker %d: Waiting for all SCF torturers from cpu %d", scfp->cpu, raw_smp_processor_id());
+
+ // Make sure that the CPU is affinitized appropriately during testing.
+ curcpu = raw_smp_processor_id();
+ WARN_ONCE(curcpu != cpu,
+ "%s: Wanted CPU %d, running on %d, nr_cpu_ids = %d\n",
+ __func__, scfp->cpu, curcpu, nr_cpu_ids);
+
+ if (!atomic_dec_return(&n_started))
+ while (atomic_read_acquire(&n_started)) {
+ if (torture_must_stop()) {
+ VERBOSE_SCFTORTOUT("scftorture_invoker %d ended before starting", scfp->cpu);
+ goto end;
+ }
+ schedule_timeout_uninterruptible(1);
+ }
+
+ VERBOSE_SCFTORTOUT("scftorture_invoker %d started", scfp->cpu);
+
+ do {
+ scf_cleanup_free_list(cpu);
+
+ scftorture_invoke_one(scfp, &rand);
+ while (cpu_is_offline(cpu) && !torture_must_stop()) {
+ schedule_timeout_interruptible(HZ / 5);
+ was_offline = true;
+ }
+ if (was_offline) {
+ set_cpus_allowed_ptr(current, cpumask_of(cpu));
+ was_offline = false;
+ }
+ cond_resched();
+ stutter_wait("scftorture_invoker");
+ } while (!torture_must_stop());
+
+ VERBOSE_SCFTORTOUT("scftorture_invoker %d ended", scfp->cpu);
+end:
+ torture_kthread_stopping("scftorture_invoker");
+ return 0;
+}
+
+static void
+scftorture_print_module_parms(const char *tag)
+{
+ pr_alert(SCFTORT_FLAG
+ "--- %s: verbose=%d holdoff=%d longwait=%d nthreads=%d onoff_holdoff=%d onoff_interval=%d shutdown_secs=%d stat_interval=%d stutter=%d use_cpus_read_lock=%d, weight_resched=%d, weight_single=%d, weight_single_rpc=%d, weight_single_wait=%d, weight_many=%d, weight_many_wait=%d, weight_all=%d, weight_all_wait=%d\n", tag,
+ verbose, holdoff, longwait, nthreads, onoff_holdoff, onoff_interval, shutdown, stat_interval, stutter, use_cpus_read_lock, weight_resched, weight_single, weight_single_rpc, weight_single_wait, weight_many, weight_many_wait, weight_all, weight_all_wait);
+}
+
+static void scf_cleanup_handler(void *unused)
+{
+}
+
+static void scf_torture_cleanup(void)
+{
+ int i;
+
+ if (torture_cleanup_begin())
+ return;
+
+ WRITE_ONCE(scfdone, true);
+ if (nthreads && scf_stats_p)
+ for (i = 0; i < nthreads; i++)
+ torture_stop_kthread("scftorture_invoker", scf_stats_p[i].task);
+ else
+ goto end;
+ smp_call_function(scf_cleanup_handler, NULL, 1);
+ torture_stop_kthread(scf_torture_stats, scf_torture_stats_task);
+ scf_torture_stats_print(); // -After- the stats thread is stopped!
+ kfree(scf_stats_p); // -After- the last stats print has completed!
+ scf_stats_p = NULL;
+
+ for (i = 0; i < nr_cpu_ids; i++)
+ scf_cleanup_free_list(i);
+
+ if (atomic_read(&n_errs) || atomic_read(&n_mb_in_errs) || atomic_read(&n_mb_out_errs))
+ scftorture_print_module_parms("End of test: FAILURE");
+ else if (torture_onoff_failures())
+ scftorture_print_module_parms("End of test: LOCK_HOTPLUG");
+ else
+ scftorture_print_module_parms("End of test: SUCCESS");
+
+end:
+ torture_cleanup_end();
+}
+
+static int __init scf_torture_init(void)
+{
+ long i;
+ int firsterr = 0;
+ unsigned long weight_resched1 = weight_resched;
+ unsigned long weight_single1 = weight_single;
+ unsigned long weight_single_rpc1 = weight_single_rpc;
+ unsigned long weight_single_wait1 = weight_single_wait;
+ unsigned long weight_many1 = weight_many;
+ unsigned long weight_many_wait1 = weight_many_wait;
+ unsigned long weight_all1 = weight_all;
+ unsigned long weight_all_wait1 = weight_all_wait;
+
+ if (!torture_init_begin(SCFTORT_STRING, verbose))
+ return -EBUSY;
+
+ scftorture_print_module_parms("Start of test");
+
+ if (weight_resched <= 0 &&
+ weight_single <= 0 && weight_single_rpc <= 0 && weight_single_wait <= 0 &&
+ weight_many <= 0 && weight_many_wait <= 0 &&
+ weight_all <= 0 && weight_all_wait <= 0) {
+ weight_resched1 = weight_resched == 0 ? 0 : 2 * nr_cpu_ids;
+ weight_single1 = weight_single == 0 ? 0 : 2 * nr_cpu_ids;
+ weight_single_rpc1 = weight_single_rpc == 0 ? 0 : 2 * nr_cpu_ids;
+ weight_single_wait1 = weight_single_wait == 0 ? 0 : 2 * nr_cpu_ids;
+ weight_many1 = weight_many == 0 ? 0 : 2;
+ weight_many_wait1 = weight_many_wait == 0 ? 0 : 2;
+ weight_all1 = weight_all == 0 ? 0 : 1;
+ weight_all_wait1 = weight_all_wait == 0 ? 0 : 1;
+ } else {
+ if (weight_resched == -1)
+ weight_resched1 = 0;
+ if (weight_single == -1)
+ weight_single1 = 0;
+ if (weight_single_rpc == -1)
+ weight_single_rpc1 = 0;
+ if (weight_single_wait == -1)
+ weight_single_wait1 = 0;
+ if (weight_many == -1)
+ weight_many1 = 0;
+ if (weight_many_wait == -1)
+ weight_many_wait1 = 0;
+ if (weight_all == -1)
+ weight_all1 = 0;
+ if (weight_all_wait == -1)
+ weight_all_wait1 = 0;
+ }
+ if (weight_resched1 == 0 && weight_single1 == 0 && weight_single_rpc1 == 0 &&
+ weight_single_wait1 == 0 && weight_many1 == 0 && weight_many_wait1 == 0 &&
+ weight_all1 == 0 && weight_all_wait1 == 0) {
+ SCFTORTOUT_ERRSTRING("all zero weights makes no sense");
+ firsterr = -EINVAL;
+ goto unwind;
+ }
+ if (IS_BUILTIN(CONFIG_SCF_TORTURE_TEST))
+ scf_sel_add(weight_resched1, SCF_PRIM_RESCHED, false);
+ else if (weight_resched1)
+ SCFTORTOUT_ERRSTRING("built as module, weight_resched ignored");
+ scf_sel_add(weight_single1, SCF_PRIM_SINGLE, false);
+ scf_sel_add(weight_single_rpc1, SCF_PRIM_SINGLE_RPC, true);
+ scf_sel_add(weight_single_wait1, SCF_PRIM_SINGLE, true);
+ scf_sel_add(weight_many1, SCF_PRIM_MANY, false);
+ scf_sel_add(weight_many_wait1, SCF_PRIM_MANY, true);
+ scf_sel_add(weight_all1, SCF_PRIM_ALL, false);
+ scf_sel_add(weight_all_wait1, SCF_PRIM_ALL, true);
+ scf_sel_dump();
+
+ if (onoff_interval > 0) {
+ firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval, NULL);
+ if (torture_init_error(firsterr))
+ goto unwind;
+ }
+ if (shutdown_secs > 0) {
+ firsterr = torture_shutdown_init(shutdown_secs, scf_torture_cleanup);
+ if (torture_init_error(firsterr))
+ goto unwind;
+ }
+ if (stutter > 0) {
+ firsterr = torture_stutter_init(stutter, stutter);
+ if (torture_init_error(firsterr))
+ goto unwind;
+ }
+
+ // Worker tasks invoking smp_call_function().
+ if (nthreads < 0)
+ nthreads = num_online_cpus();
+ scf_stats_p = kcalloc(nthreads, sizeof(scf_stats_p[0]), GFP_KERNEL);
+ if (!scf_stats_p) {
+ SCFTORTOUT_ERRSTRING("out of memory");
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
+
+ VERBOSE_SCFTORTOUT("Starting %d smp_call_function() threads", nthreads);
+
+ atomic_set(&n_started, nthreads);
+ for (i = 0; i < nthreads; i++) {
+ scf_stats_p[i].cpu = i;
+ firsterr = torture_create_kthread(scftorture_invoker, (void *)&scf_stats_p[i],
+ scf_stats_p[i].task);
+ if (torture_init_error(firsterr))
+ goto unwind;
+ }
+ if (stat_interval > 0) {
+ firsterr = torture_create_kthread(scf_torture_stats, NULL, scf_torture_stats_task);
+ if (torture_init_error(firsterr))
+ goto unwind;
+ }
+
+ torture_init_end();
+ return 0;
+
+unwind:
+ torture_init_end();
+ scf_torture_cleanup();
+ if (shutdown_secs) {
+ WARN_ON(!IS_MODULE(CONFIG_SCF_TORTURE_TEST));
+ kernel_power_off();
+ }
+ return firsterr;
+}
+
+module_init(scf_torture_init);
+module_exit(scf_torture_cleanup);
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 46be87024875..8ae86371ddcd 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -1,6 +1,17 @@
-ifdef CONFIG_FUNCTION_TRACER
-CFLAGS_REMOVE_clock.o = $(CC_FLAGS_FTRACE)
-endif
+# SPDX-License-Identifier: GPL-2.0
+
+# The compilers are complaining about unused variables inside an if(0) scope
+# block. This is daft, shut them up.
+ccflags-y += $(call cc-disable-warning, unused-but-set-variable)
+
+# These files are disabled because they produce non-interesting flaky coverage
+# that is not a function of syscall inputs. E.g. involuntary context switches.
+KCOV_INSTRUMENT := n
+
+# Disable KCSAN to avoid excessive noise and performance degradation. To avoid
+# false positives ensure barriers implied by sched functions are instrumented.
+KCSAN_SANITIZE := n
+KCSAN_INSTRUMENT_BARRIERS := y
ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
@@ -11,11 +22,18 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
endif
-obj-y += core.o proc.o clock.o cputime.o
-obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
-obj-y += wait.o completion.o idle.o
-obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
-obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
-obj-$(CONFIG_SCHEDSTATS) += stats.o
-obj-$(CONFIG_SCHED_DEBUG) += debug.o
-obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
+# Branch profiling isn't noinstr-safe
+ifdef CONFIG_TRACE_BRANCH_PROFILING
+CFLAGS_build_policy.o += -DDISABLE_BRANCH_PROFILING
+CFLAGS_build_utility.o += -DDISABLE_BRANCH_PROFILING
+endif
+#
+# Build efficiency:
+#
+# These compilation units have roughly the same size and complexity - so their
+# build parallelizes well and finishes roughly at once:
+#
+obj-y += core.o
+obj-y += fair.o
+obj-y += build_policy.o
+obj-y += build_utility.o
diff --git a/kernel/sched/auto_group.c b/kernel/sched/autogroup.c
index eae160dd669d..954137775f38 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/autogroup.c
@@ -1,24 +1,44 @@
-#ifdef CONFIG_SCHED_AUTOGROUP
+// SPDX-License-Identifier: GPL-2.0
-#include "sched.h"
+/*
+ * Auto-group scheduling implementation:
+ */
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/kallsyms.h>
-#include <linux/utsname.h>
-#include <linux/security.h>
-#include <linux/export.h>
+#include "autogroup.h"
+#include "sched.h"
unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
static struct autogroup autogroup_default;
static atomic_t autogroup_seq_nr;
+#ifdef CONFIG_SYSCTL
+static const struct ctl_table sched_autogroup_sysctls[] = {
+ {
+ .procname = "sched_autogroup_enabled",
+ .data = &sysctl_sched_autogroup_enabled,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+};
+
+static void __init sched_autogroup_sysctl_init(void)
+{
+ register_sysctl_init("kernel", sched_autogroup_sysctls);
+}
+#else /* !CONFIG_SYSCTL: */
+#define sched_autogroup_sysctl_init() do { } while (0)
+#endif /* !CONFIG_SYSCTL */
+
void __init autogroup_init(struct task_struct *init_task)
{
autogroup_default.tg = &root_task_group;
kref_init(&autogroup_default.kref);
init_rwsem(&autogroup_default.lock);
init_task->signal->autogroup = &autogroup_default;
+ sched_autogroup_sysctl_init();
}
void autogroup_free(struct task_group *tg)
@@ -35,7 +55,7 @@ static inline void autogroup_destroy(struct kref *kref)
ag->tg->rt_se = NULL;
ag->tg->rt_rq = NULL;
#endif
- sched_offline_group(ag->tg);
+ sched_release_group(ag->tg);
sched_destroy_group(ag->tg);
}
@@ -73,7 +93,6 @@ static inline struct autogroup *autogroup_create(void)
goto out_fail;
tg = sched_create_group(&root_task_group);
-
if (IS_ERR(tg))
goto out_free;
@@ -92,7 +111,7 @@ static inline struct autogroup *autogroup_create(void)
free_rt_sched_group(tg);
tg->rt_se = root_task_group.rt_se;
tg->rt_rq = root_task_group.rt_rq;
-#endif
+#endif /* CONFIG_RT_GROUP_SCHED */
tg->autogroup = ag;
sched_online_group(tg, &root_task_group);
@@ -103,7 +122,7 @@ out_free:
out_fail:
if (printk_ratelimit()) {
printk(KERN_WARNING "autogroup_create: %s failure.\n",
- ag ? "sched_create_group()" : "kmalloc()");
+ ag ? "sched_create_group()" : "kzalloc()");
}
return autogroup_kref_get(&autogroup_default);
@@ -113,10 +132,13 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg)
{
if (tg != &root_task_group)
return false;
-
/*
- * We can only assume the task group can't go away on us if
- * autogroup_move_group() can see us on ->thread_group list.
+ * If we race with autogroup_move_group() the caller can use the old
+ * value of signal->autogroup but in this case sched_move_task() will
+ * be called again before autogroup_kref_put().
+ *
+ * However, there is no way sched_autogroup_exit_task() could tell us
+ * to avoid autogroup->tg, so we abuse PF_EXITING flag for this case.
*/
if (p->flags & PF_EXITING)
return false;
@@ -124,6 +146,16 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg)
return true;
}
+void sched_autogroup_exit_task(struct task_struct *p)
+{
+ /*
+ * We are going to call exit_notify() and autogroup_move_group() can't
+ * see this thread after that: we can no longer use signal->autogroup.
+ * See the PF_EXITING check in task_wants_autogroup().
+ */
+ sched_move_task(p, true);
+}
+
static void
autogroup_move_group(struct task_struct *p, struct autogroup *ag)
{
@@ -131,7 +163,8 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
struct task_struct *t;
unsigned long flags;
- BUG_ON(!lock_task_sighand(p, &flags));
+ if (WARN_ON_ONCE(!lock_task_sighand(p, &flags)))
+ return;
prev = p->signal->autogroup;
if (prev == ag) {
@@ -140,29 +173,37 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
}
p->signal->autogroup = autogroup_kref_get(ag);
-
- if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled))
- goto out;
-
+ /*
+ * We can't avoid sched_move_task() after we changed signal->autogroup,
+ * this process can already run with task_group() == prev->tg or we can
+ * race with cgroup code which can read autogroup = prev under rq->lock.
+ * In the latter case for_each_thread() can not miss a migrating thread,
+ * cpu_cgroup_attach() must not be possible after cgroup_task_exit()
+ * and it can't be removed from thread list, we hold ->siglock.
+ *
+ * If an exiting thread was already removed from thread list we rely on
+ * sched_autogroup_exit_task().
+ */
for_each_thread(p, t)
- sched_move_task(t);
-out:
+ sched_move_task(t, true);
+
unlock_task_sighand(p, &flags);
autogroup_kref_put(prev);
}
-/* Allocates GFP_KERNEL, cannot be called under any spinlock */
+/* Allocates GFP_KERNEL, cannot be called under any spinlock: */
void sched_autogroup_create_attach(struct task_struct *p)
{
struct autogroup *ag = autogroup_create();
autogroup_move_group(p, ag);
- /* drop extra reference added by autogroup_create() */
+
+ /* Drop extra reference added by autogroup_create(): */
autogroup_kref_put(ag);
}
EXPORT_SYMBOL(sched_autogroup_create_attach);
-/* Cannot be called under siglock. Currently has no users */
+/* Cannot be called under siglock. Currently has no users: */
void sched_autogroup_detach(struct task_struct *p)
{
autogroup_move_group(p, &autogroup_default);
@@ -185,7 +226,6 @@ static int __init setup_autogroup(char *str)
return 1;
}
-
__setup("noautogroup", setup_autogroup);
#ifdef CONFIG_PROC_FS
@@ -194,7 +234,8 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)
{
static unsigned long next = INITIAL_JIFFIES;
struct autogroup *ag;
- int err;
+ unsigned long shares;
+ int err, idx;
if (nice < MIN_NICE || nice > MAX_NICE)
return -EINVAL;
@@ -206,15 +247,18 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)
if (nice < 0 && !can_nice(current, nice))
return -EPERM;
- /* this is a heavy operation taking global locks.. */
+ /* This is a heavy operation, taking global locks.. */
if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next))
return -EAGAIN;
next = HZ / 10 + jiffies;
ag = autogroup_task_get(p);
+ idx = array_index_nospec(nice + 20, 40);
+ shares = scale_load(sched_prio_to_weight[idx]);
+
down_write(&ag->lock);
- err = sched_group_set_shares(ag->tg, prio_to_weight[nice + 20]);
+ err = sched_group_set_shares(ag->tg, shares);
if (!err)
ag->nice = nice;
up_write(&ag->lock);
@@ -240,7 +284,6 @@ out:
}
#endif /* CONFIG_PROC_FS */
-#ifdef CONFIG_SCHED_DEBUG
int autogroup_path(struct task_group *tg, char *buf, int buflen)
{
if (!task_group_is_autogroup(tg))
@@ -248,6 +291,3 @@ int autogroup_path(struct task_group *tg, char *buf, int buflen)
return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
}
-#endif /* CONFIG_SCHED_DEBUG */
-
-#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched/auto_group.h b/kernel/sched/autogroup.h
index 8bd047142816..06c82b2bdfb5 100644
--- a/kernel/sched/auto_group.h
+++ b/kernel/sched/autogroup.h
@@ -1,13 +1,16 @@
-#ifdef CONFIG_SCHED_AUTOGROUP
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _KERNEL_SCHED_AUTOGROUP_H
+#define _KERNEL_SCHED_AUTOGROUP_H
+
+#include "sched.h"
-#include <linux/kref.h>
-#include <linux/rwsem.h>
+#ifdef CONFIG_SCHED_AUTOGROUP
struct autogroup {
/*
- * reference doesn't mean how many thread attach to this
- * autogroup now. It just stands for the number of task
- * could use this autogroup.
+ * Reference doesn't mean how many threads attach to this
+ * autogroup now. It just stands for the number of tasks
+ * which could use this autogroup.
*/
struct kref kref;
struct task_group *tg;
@@ -29,7 +32,8 @@ extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg);
static inline struct task_group *
autogroup_task_group(struct task_struct *p, struct task_group *tg)
{
- int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
+ extern unsigned int sysctl_sched_autogroup_enabled;
+ int enabled = READ_ONCE(sysctl_sched_autogroup_enabled);
if (enabled && task_wants_autogroup(p, tg))
return p->signal->autogroup->tg;
@@ -39,7 +43,7 @@ autogroup_task_group(struct task_struct *p, struct task_group *tg)
extern int autogroup_path(struct task_group *tg, char *buf, int buflen);
-#else /* !CONFIG_SCHED_AUTOGROUP */
+#else /* !CONFIG_SCHED_AUTOGROUP: */
static inline void autogroup_init(struct task_struct *init_task) { }
static inline void autogroup_free(struct task_group *tg) { }
@@ -54,11 +58,11 @@ autogroup_task_group(struct task_struct *p, struct task_group *tg)
return tg;
}
-#ifdef CONFIG_SCHED_DEBUG
static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
{
return 0;
}
-#endif
-#endif /* CONFIG_SCHED_AUTOGROUP */
+#endif /* !CONFIG_SCHED_AUTOGROUP */
+
+#endif /* _KERNEL_SCHED_AUTOGROUP_H */
diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c
new file mode 100644
index 000000000000..755883faf751
--- /dev/null
+++ b/kernel/sched/build_policy.c
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * These are the scheduling policy related scheduler files, built
+ * in a single compilation unit for build efficiency reasons.
+ *
+ * ( Incidentally, the size of the compilation unit is roughly
+ * comparable to core.c and fair.c, the other two big
+ * compilation units. This helps balance build time, while
+ * coalescing source files to amortize header inclusion
+ * cost. )
+ *
+ * core.c and fair.c are built separately.
+ */
+
+/* Headers: */
+#include <linux/sched/clock.h>
+#include <linux/sched/cputime.h>
+#include <linux/sched/hotplug.h>
+#include <linux/sched/isolation.h>
+#include <linux/sched/posix-timers.h>
+#include <linux/sched/rt.h>
+
+#include <linux/cpuidle.h>
+#include <linux/jiffies.h>
+#include <linux/kobject.h>
+#include <linux/livepatch.h>
+#include <linux/pm.h>
+#include <linux/psi.h>
+#include <linux/rhashtable.h>
+#include <linux/seq_buf.h>
+#include <linux/seqlock_api.h>
+#include <linux/slab.h>
+#include <linux/suspend.h>
+#include <linux/tsacct_kern.h>
+#include <linux/vtime.h>
+#include <linux/sysrq.h>
+#include <linux/percpu-rwsem.h>
+
+#include <uapi/linux/sched/types.h>
+
+#include "sched.h"
+#include "smp.h"
+
+#include "autogroup.h"
+#include "stats.h"
+#include "pelt.h"
+
+/* Source code modules: */
+
+#include "idle.c"
+
+#include "rt.c"
+#include "cpudeadline.c"
+
+#include "pelt.c"
+
+#include "cputime.c"
+#include "deadline.c"
+
+#ifdef CONFIG_SCHED_CLASS_EXT
+# include "ext_internal.h"
+# include "ext.c"
+# include "ext_idle.c"
+#endif
+
+#include "syscalls.c"
diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c
new file mode 100644
index 000000000000..e2cf3b08d4e9
--- /dev/null
+++ b/kernel/sched/build_utility.c
@@ -0,0 +1,106 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * These are various utility functions of the scheduler,
+ * built in a single compilation unit for build efficiency reasons.
+ *
+ * ( Incidentally, the size of the compilation unit is roughly
+ * comparable to core.c, fair.c, smp.c and policy.c, the other
+ * big compilation units. This helps balance build time, while
+ * coalescing source files to amortize header inclusion
+ * cost. )
+ */
+#include <linux/sched/clock.h>
+#include <linux/sched/cputime.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/isolation.h>
+#include <linux/sched/loadavg.h>
+#include <linux/sched/nohz.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/rseq_api.h>
+#include <linux/sched/task_stack.h>
+
+#include <linux/cpufreq.h>
+#include <linux/cpumask_api.h>
+#include <linux/cpuset.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
+#include <linux/energy_model.h>
+#include <linux/hashtable_api.h>
+#include <linux/irq.h>
+#include <linux/kobject_api.h>
+#include <linux/membarrier.h>
+#include <linux/mempolicy.h>
+#include <linux/nmi.h>
+#include <linux/nospec.h>
+#include <linux/proc_fs.h>
+#include <linux/psi.h>
+#include <linux/ptrace_api.h>
+#include <linux/sched_clock.h>
+#include <linux/security.h>
+#include <linux/spinlock_api.h>
+#include <linux/swait_api.h>
+#include <linux/timex.h>
+#include <linux/utsname.h>
+#include <linux/wait_api.h>
+#include <linux/workqueue_api.h>
+
+#include <uapi/linux/prctl.h>
+#include <uapi/linux/sched/types.h>
+
+#include <asm/switch_to.h>
+
+#include "sched.h"
+#include "sched-pelt.h"
+#include "stats.h"
+#include "autogroup.h"
+
+#include "clock.c"
+
+#ifdef CONFIG_CGROUP_CPUACCT
+# include "cpuacct.c"
+#endif
+
+#ifdef CONFIG_CPU_FREQ
+# include "cpufreq.c"
+#endif
+
+#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
+# include "cpufreq_schedutil.c"
+#endif
+
+#include "debug.c"
+
+#ifdef CONFIG_SCHEDSTATS
+# include "stats.c"
+#endif
+
+#include "loadavg.c"
+#include "completion.c"
+#include "swait.c"
+#include "wait_bit.c"
+#include "wait.c"
+
+#include "cpupri.c"
+#include "stop_task.c"
+
+#include "topology.c"
+
+#ifdef CONFIG_SCHED_CORE
+# include "core_sched.c"
+#endif
+
+#ifdef CONFIG_PSI
+# include "psi.c"
+#endif
+
+#ifdef CONFIG_MEMBARRIER
+# include "membarrier.c"
+#endif
+
+#ifdef CONFIG_CPU_ISOLATION
+# include "isolation.c"
+#endif
+
+#ifdef CONFIG_SCHED_AUTOGROUP
+# include "autogroup.c"
+#endif
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index c0a205101c23..f5e6dd6a6b3a 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -1,7 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
- * sched_clock for unstable cpu clocks
+ * sched_clock() for unstable CPU clocks
*
- * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra
*
* Updates and enhancements:
* Copyright (C) 2008 Red Hat, Inc. Steven Rostedt <srostedt@redhat.com>
@@ -11,7 +12,7 @@
* Guillaume Chazarain <guichaz@gmail.com>
*
*
- * What:
+ * What this file implements:
*
* cpu_clock(i) provides a fast (execution time) high resolution
* clock with bounded drift between CPUs. The value of cpu_clock(i)
@@ -26,11 +27,11 @@
* at 0 on boot (but people really shouldn't rely on that).
*
* cpu_clock(i) -- can be used from any context, including NMI.
- * local_clock() -- is cpu_clock() on the current cpu.
+ * local_clock() -- is cpu_clock() on the current CPU.
*
* sched_clock_cpu(i)
*
- * How:
+ * How it is implemented:
*
* The implementation either uses sched_clock() when
* !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the
@@ -40,7 +41,7 @@
* Otherwise it tries to create a semi stable clock from a mixture of other
* clocks, including:
*
- * - GTOD (clock monotomic)
+ * - GTOD (clock monotonic)
* - sched_clock()
* - explicit idle events
*
@@ -52,111 +53,180 @@
* that is otherwise invisible (TSC gets stopped).
*
*/
-#include <linux/spinlock.h>
-#include <linux/hardirq.h>
-#include <linux/export.h>
-#include <linux/percpu.h>
-#include <linux/ktime.h>
-#include <linux/sched.h>
-#include <linux/static_key.h>
-#include <linux/workqueue.h>
-#include <linux/compiler.h>
+
+#include <linux/sched/clock.h>
+#include "sched.h"
/*
* Scheduler clock - returns current time in nanosec units.
* This is default implementation.
* Architectures and sub-architectures can override this.
*/
-unsigned long long __weak sched_clock(void)
+notrace unsigned long long __weak sched_clock(void)
{
return (unsigned long long)(jiffies - INITIAL_JIFFIES)
* (NSEC_PER_SEC / HZ);
}
EXPORT_SYMBOL_GPL(sched_clock);
-__read_mostly int sched_clock_running;
+static DEFINE_STATIC_KEY_FALSE(sched_clock_running);
#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
-static struct static_key __sched_clock_stable = STATIC_KEY_INIT;
-static int __sched_clock_stable_early;
+/*
+ * We must start with !__sched_clock_stable because the unstable -> stable
+ * transition is accurate, while the stable -> unstable transition is not.
+ *
+ * Similarly we start with __sched_clock_stable_early, thereby assuming we
+ * will become stable, such that there's only a single 1 -> 0 transition.
+ */
+static DEFINE_STATIC_KEY_FALSE(__sched_clock_stable);
+static int __sched_clock_stable_early = 1;
+
+/*
+ * We want: ktime_get_ns() + __gtod_offset == sched_clock() + __sched_clock_offset
+ */
+__read_mostly u64 __sched_clock_offset;
+static __read_mostly u64 __gtod_offset;
+
+struct sched_clock_data {
+ u64 tick_raw;
+ u64 tick_gtod;
+ u64 clock;
+};
-int sched_clock_stable(void)
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
+
+static __always_inline struct sched_clock_data *this_scd(void)
{
- return static_key_false(&__sched_clock_stable);
+ return this_cpu_ptr(&sched_clock_data);
}
-static void __set_sched_clock_stable(void)
+notrace static inline struct sched_clock_data *cpu_sdc(int cpu)
{
- if (!sched_clock_stable())
- static_key_slow_inc(&__sched_clock_stable);
+ return &per_cpu(sched_clock_data, cpu);
}
-void set_sched_clock_stable(void)
+notrace int sched_clock_stable(void)
{
- __sched_clock_stable_early = 1;
+ return static_branch_likely(&__sched_clock_stable);
+}
- smp_mb(); /* matches sched_clock_init() */
+notrace static void __scd_stamp(struct sched_clock_data *scd)
+{
+ scd->tick_gtod = ktime_get_ns();
+ scd->tick_raw = sched_clock();
+}
- if (!sched_clock_running)
- return;
+notrace static void __set_sched_clock_stable(void)
+{
+ struct sched_clock_data *scd;
- __set_sched_clock_stable();
+ /*
+ * Since we're still unstable and the tick is already running, we have
+ * to disable IRQs in order to get a consistent scd->tick* reading.
+ */
+ local_irq_disable();
+ scd = this_scd();
+ /*
+ * Attempt to make the (initial) unstable->stable transition continuous.
+ */
+ __sched_clock_offset = (scd->tick_gtod + __gtod_offset) - (scd->tick_raw);
+ local_irq_enable();
+
+ printk(KERN_INFO "sched_clock: Marking stable (%lld, %lld)->(%lld, %lld)\n",
+ scd->tick_gtod, __gtod_offset,
+ scd->tick_raw, __sched_clock_offset);
+
+ static_branch_enable(&__sched_clock_stable);
+ tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE);
}
-static void __clear_sched_clock_stable(struct work_struct *work)
+/*
+ * If we ever get here, we're screwed, because we found out -- typically after
+ * the fact -- that TSC wasn't good. This means all our clocksources (including
+ * ktime) could have reported wrong values.
+ *
+ * What we do here is an attempt to fix up and continue sort of where we left
+ * off in a coherent manner.
+ *
+ * The only way to fully avoid random clock jumps is to boot with:
+ * "tsc=unstable".
+ */
+notrace static void __sched_clock_work(struct work_struct *work)
{
- /* XXX worry about clock continuity */
- if (sched_clock_stable())
- static_key_slow_dec(&__sched_clock_stable);
-}
+ struct sched_clock_data *scd;
+ int cpu;
-static DECLARE_WORK(sched_clock_work, __clear_sched_clock_stable);
+ /* take a current timestamp and set 'now' */
+ preempt_disable();
+ scd = this_scd();
+ __scd_stamp(scd);
+ scd->clock = scd->tick_gtod + __gtod_offset;
+ preempt_enable();
-void clear_sched_clock_stable(void)
-{
- __sched_clock_stable_early = 0;
+ /* clone to all CPUs */
+ for_each_possible_cpu(cpu)
+ per_cpu(sched_clock_data, cpu) = *scd;
+
+ printk(KERN_WARNING "TSC found unstable after boot, most likely due to broken BIOS. Use 'tsc=unstable'.\n");
+ printk(KERN_INFO "sched_clock: Marking unstable (%lld, %lld)<-(%lld, %lld)\n",
+ scd->tick_gtod, __gtod_offset,
+ scd->tick_raw, __sched_clock_offset);
+
+ static_branch_disable(&__sched_clock_stable);
+}
- smp_mb(); /* matches sched_clock_init() */
+static DECLARE_WORK(sched_clock_work, __sched_clock_work);
- if (!sched_clock_running)
+notrace static void __clear_sched_clock_stable(void)
+{
+ if (!sched_clock_stable())
return;
+ tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE);
schedule_work(&sched_clock_work);
}
-struct sched_clock_data {
- u64 tick_raw;
- u64 tick_gtod;
- u64 clock;
-};
+notrace void clear_sched_clock_stable(void)
+{
+ __sched_clock_stable_early = 0;
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
+ smp_mb(); /* matches sched_clock_init_late() */
-static inline struct sched_clock_data *this_scd(void)
-{
- return this_cpu_ptr(&sched_clock_data);
+ if (static_key_count(&sched_clock_running.key) == 2)
+ __clear_sched_clock_stable();
}
-static inline struct sched_clock_data *cpu_sdc(int cpu)
+notrace static void __sched_clock_gtod_offset(void)
{
- return &per_cpu(sched_clock_data, cpu);
+ struct sched_clock_data *scd = this_scd();
+
+ __scd_stamp(scd);
+ __gtod_offset = (scd->tick_raw + __sched_clock_offset) - scd->tick_gtod;
}
-void sched_clock_init(void)
+void __init sched_clock_init(void)
{
- u64 ktime_now = ktime_to_ns(ktime_get());
- int cpu;
-
- for_each_possible_cpu(cpu) {
- struct sched_clock_data *scd = cpu_sdc(cpu);
-
- scd->tick_raw = 0;
- scd->tick_gtod = ktime_now;
- scd->clock = ktime_now;
- }
-
- sched_clock_running = 1;
+ /*
+ * Set __gtod_offset such that once we mark sched_clock_running,
+ * sched_clock_tick() continues where sched_clock() left off.
+ *
+ * Even if TSC is buggered, we're still UP at this point so it
+ * can't really be out of sync.
+ */
+ local_irq_disable();
+ __sched_clock_gtod_offset();
+ local_irq_enable();
+ static_branch_inc(&sched_clock_running);
+}
+/*
+ * We run this as late_initcall() such that it runs after all built-in drivers,
+ * notably: acpi_processor and intel_idle, which can mark the TSC as unstable.
+ */
+static int __init sched_clock_init_late(void)
+{
+ static_branch_inc(&sched_clock_running);
/*
* Ensure that it is impossible to not do a static_key update.
*
@@ -168,20 +238,21 @@ void sched_clock_init(void)
if (__sched_clock_stable_early)
__set_sched_clock_stable();
- else
- __clear_sched_clock_stable(NULL);
+
+ return 0;
}
+late_initcall(sched_clock_init_late);
/*
* min, max except they take wrapping into account
*/
-static inline u64 wrap_min(u64 x, u64 y)
+static __always_inline u64 wrap_min(u64 x, u64 y)
{
return (s64)(x - y) < 0 ? x : y;
}
-static inline u64 wrap_max(u64 x, u64 y)
+static __always_inline u64 wrap_max(u64 x, u64 y)
{
return (s64)(x - y) > 0 ? x : y;
}
@@ -192,13 +263,13 @@ static inline u64 wrap_max(u64 x, u64 y)
* - filter out backward motion
* - use the GTOD tick value to create a window to filter crazy TSC values
*/
-static u64 sched_clock_local(struct sched_clock_data *scd)
+static __always_inline u64 sched_clock_local(struct sched_clock_data *scd)
{
- u64 now, clock, old_clock, min_clock, max_clock;
+ u64 now, clock, old_clock, min_clock, max_clock, gtod;
s64 delta;
again:
- now = sched_clock();
+ now = sched_clock_noinstr();
delta = now - scd->tick_raw;
if (unlikely(delta < 0))
delta = 0;
@@ -211,20 +282,46 @@ again:
* scd->tick_gtod + TICK_NSEC);
*/
- clock = scd->tick_gtod + delta;
- min_clock = wrap_max(scd->tick_gtod, old_clock);
- max_clock = wrap_max(old_clock, scd->tick_gtod + TICK_NSEC);
+ gtod = scd->tick_gtod + __gtod_offset;
+ clock = gtod + delta;
+ min_clock = wrap_max(gtod, old_clock);
+ max_clock = wrap_max(old_clock, gtod + TICK_NSEC);
clock = wrap_max(clock, min_clock);
clock = wrap_min(clock, max_clock);
- if (cmpxchg64(&scd->clock, old_clock, clock) != old_clock)
+ if (!raw_try_cmpxchg64(&scd->clock, &old_clock, clock))
goto again;
return clock;
}
-static u64 sched_clock_remote(struct sched_clock_data *scd)
+noinstr u64 local_clock_noinstr(void)
+{
+ u64 clock;
+
+ if (static_branch_likely(&__sched_clock_stable))
+ return sched_clock_noinstr() + __sched_clock_offset;
+
+ if (!static_branch_likely(&sched_clock_running))
+ return sched_clock_noinstr();
+
+ clock = sched_clock_local(this_scd());
+
+ return clock;
+}
+
+u64 local_clock(void)
+{
+ u64 now;
+ preempt_disable_notrace();
+ now = local_clock_noinstr();
+ preempt_enable_notrace();
+ return now;
+}
+EXPORT_SYMBOL_GPL(local_clock);
+
+static notrace u64 sched_clock_remote(struct sched_clock_data *scd)
{
struct sched_clock_data *my_scd = this_scd();
u64 this_clock, remote_clock;
@@ -239,21 +336,21 @@ again:
* cmpxchg64 below only protects one readout.
*
* We must reread via sched_clock_local() in the retry case on
- * 32bit as an NMI could use sched_clock_local() via the
+ * 32-bit kernels as an NMI could use sched_clock_local() via the
* tracer and hit between the readout of
- * the low32bit and the high 32bit portion.
+ * the low 32-bit and the high 32-bit portion.
*/
this_clock = sched_clock_local(my_scd);
/*
- * We must enforce atomic readout on 32bit, otherwise the
- * update on the remote cpu can hit inbetween the readout of
- * the low32bit and the high 32bit portion.
+ * We must enforce atomic readout on 32-bit, otherwise the
+ * update on the remote CPU can hit in between the readout of
+ * the low 32-bit and the high 32-bit portion.
*/
remote_clock = cmpxchg64(&scd->clock, 0, 0);
#else
/*
- * On 64bit the read of [my]scd->clock is atomic versus the
- * update, so we can avoid the above 32bit dance.
+ * On 64-bit kernels the read of [my]scd->clock is atomic versus the
+ * update, so we can avoid the above 32-bit dance.
*/
sched_clock_local(my_scd);
again:
@@ -280,7 +377,7 @@ again:
val = remote_clock;
}
- if (cmpxchg64(ptr, old_val, val) != old_val)
+ if (!try_cmpxchg64(ptr, &old_val, val))
goto again;
return val;
@@ -291,16 +388,16 @@ again:
*
* See cpu_clock().
*/
-u64 sched_clock_cpu(int cpu)
+notrace u64 sched_clock_cpu(int cpu)
{
struct sched_clock_data *scd;
u64 clock;
if (sched_clock_stable())
- return sched_clock();
+ return sched_clock() + __sched_clock_offset;
- if (unlikely(!sched_clock_running))
- return 0ull;
+ if (!static_branch_likely(&sched_clock_running))
+ return sched_clock();
preempt_disable_notrace();
scd = cpu_sdc(cpu);
@@ -313,113 +410,89 @@ u64 sched_clock_cpu(int cpu)
return clock;
}
+EXPORT_SYMBOL_GPL(sched_clock_cpu);
-void sched_clock_tick(void)
+notrace void sched_clock_tick(void)
{
struct sched_clock_data *scd;
- u64 now, now_gtod;
if (sched_clock_stable())
return;
- if (unlikely(!sched_clock_running))
+ if (!static_branch_likely(&sched_clock_running))
return;
- WARN_ON_ONCE(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
scd = this_scd();
- now_gtod = ktime_to_ns(ktime_get());
- now = sched_clock();
-
- scd->tick_raw = now;
- scd->tick_gtod = now_gtod;
+ __scd_stamp(scd);
sched_clock_local(scd);
}
-/*
- * We are going deep-idle (irqs are disabled):
- */
-void sched_clock_idle_sleep_event(void)
+notrace void sched_clock_tick_stable(void)
{
- sched_clock_cpu(smp_processor_id());
-}
-EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
-
-/*
- * We just idled delta nanoseconds (called with irqs disabled):
- */
-void sched_clock_idle_wakeup_event(u64 delta_ns)
-{
- if (timekeeping_suspended)
+ if (!sched_clock_stable())
return;
- sched_clock_tick();
- touch_softlockup_watchdog();
+ /*
+ * Called under watchdog_lock.
+ *
+ * The watchdog just found this TSC to (still) be stable, so now is a
+ * good moment to update our __gtod_offset. Because once we find the
+ * TSC to be unstable, any computation will be computing crap.
+ */
+ local_irq_disable();
+ __sched_clock_gtod_offset();
+ local_irq_enable();
}
-EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
/*
- * As outlined at the top, provides a fast, high resolution, nanosecond
- * time source that is monotonic per cpu argument and has bounded drift
- * between cpus.
- *
- * ######################### BIG FAT WARNING ##########################
- * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
- * # go backwards !! #
- * ####################################################################
+ * We are going deep-idle (IRQs are disabled):
*/
-u64 cpu_clock(int cpu)
+notrace void sched_clock_idle_sleep_event(void)
{
- if (!sched_clock_stable())
- return sched_clock_cpu(cpu);
-
- return sched_clock();
+ sched_clock_cpu(smp_processor_id());
}
+EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
/*
- * Similar to cpu_clock() for the current cpu. Time will only be observed
- * to be monotonic if care is taken to only compare timestampt taken on the
- * same CPU.
- *
- * See cpu_clock().
+ * We just idled; resync with ktime.
*/
-u64 local_clock(void)
+notrace void sched_clock_idle_wakeup_event(void)
{
- if (!sched_clock_stable())
- return sched_clock_cpu(raw_smp_processor_id());
+ unsigned long flags;
- return sched_clock();
-}
+ if (sched_clock_stable())
+ return;
-#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
+ if (unlikely(timekeeping_suspended))
+ return;
-void sched_clock_init(void)
-{
- sched_clock_running = 1;
+ local_irq_save(flags);
+ sched_clock_tick();
+ local_irq_restore(flags);
}
+EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
-u64 sched_clock_cpu(int cpu)
-{
- if (unlikely(!sched_clock_running))
- return 0;
+#else /* !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK: */
- return sched_clock();
-}
-
-u64 cpu_clock(int cpu)
+void __init sched_clock_init(void)
{
- return sched_clock();
+ static_branch_inc(&sched_clock_running);
+ local_irq_disable();
+ generic_sched_clock_init();
+ local_irq_enable();
}
-u64 local_clock(void)
+notrace u64 sched_clock_cpu(int cpu)
{
+ if (!static_branch_likely(&sched_clock_running))
+ return 0;
+
return sched_clock();
}
-#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
-
-EXPORT_SYMBOL_GPL(cpu_clock);
-EXPORT_SYMBOL_GPL(local_clock);
+#endif /* !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
/*
* Running clock - returns the time that has elapsed while a guest has been
@@ -429,7 +502,7 @@ EXPORT_SYMBOL_GPL(local_clock);
* On bare metal this function should return the same as local_clock.
* Architectures and sub-architectures can override this.
*/
-u64 __weak running_clock(void)
+notrace u64 __weak running_clock(void)
{
return local_clock();
}
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 8d0f35debf35..19ee702273c0 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -1,3 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0
+
/*
* Generic wait-for-completion handler;
*
@@ -11,8 +13,27 @@
* Waiting for completion is a typically sync point, but not an exclusion point.
*/
-#include <linux/sched.h>
+#include <linux/linkage.h>
+#include <linux/sched/debug.h>
#include <linux/completion.h>
+#include "sched.h"
+
+static void complete_with_flags(struct completion *x, int wake_flags)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&x->wait.lock, flags);
+
+ if (x->done != UINT_MAX)
+ x->done++;
+ swake_up_locked(&x->wait, wake_flags);
+ raw_spin_unlock_irqrestore(&x->wait.lock, flags);
+}
+
+void complete_on_current_cpu(struct completion *x)
+{
+ return complete_with_flags(x, WF_CURRENT_CPU);
+}
/**
* complete: - signals a single thread waiting on this completion
@@ -23,17 +44,12 @@
*
* See also complete_all(), wait_for_completion() and related routines.
*
- * It may be assumed that this function implies a write memory barrier before
- * changing the task state if and only if any tasks are woken up.
+ * If this function wakes up a task, it executes a full memory barrier before
+ * accessing the task state.
*/
void complete(struct completion *x)
{
- unsigned long flags;
-
- spin_lock_irqsave(&x->wait.lock, flags);
- x->done++;
- __wake_up_locked(&x->wait, TASK_NORMAL, 1);
- spin_unlock_irqrestore(&x->wait.lock, flags);
+ complete_with_flags(x, 0);
}
EXPORT_SYMBOL(complete);
@@ -43,17 +59,26 @@ EXPORT_SYMBOL(complete);
*
* This will wake up all threads waiting on this particular completion event.
*
- * It may be assumed that this function implies a write memory barrier before
- * changing the task state if and only if any tasks are woken up.
+ * If this function wakes up a task, it executes a full memory barrier before
+ * accessing the task state.
+ *
+ * Since complete_all() sets the completion of @x permanently to done
+ * to allow multiple waiters to finish, a call to reinit_completion()
+ * must be used on @x if @x is to be used again. The code must make
+ * sure that all waiters have woken and finished before reinitializing
+ * @x. Also note that the function completion_done() can not be used
+ * to know if there are still waiters after complete_all() has been called.
*/
void complete_all(struct completion *x)
{
unsigned long flags;
- spin_lock_irqsave(&x->wait.lock, flags);
- x->done += UINT_MAX/2;
- __wake_up_locked(&x->wait, TASK_NORMAL, 0);
- spin_unlock_irqrestore(&x->wait.lock, flags);
+ lockdep_assert_RT_in_threaded_ctx();
+
+ raw_spin_lock_irqsave(&x->wait.lock, flags);
+ x->done = UINT_MAX;
+ swake_up_all_locked(&x->wait);
+ raw_spin_unlock_irqrestore(&x->wait.lock, flags);
}
EXPORT_SYMBOL(complete_all);
@@ -62,24 +87,25 @@ do_wait_for_common(struct completion *x,
long (*action)(long), long timeout, int state)
{
if (!x->done) {
- DECLARE_WAITQUEUE(wait, current);
+ DECLARE_SWAITQUEUE(wait);
- __add_wait_queue_tail_exclusive(&x->wait, &wait);
do {
if (signal_pending_state(state, current)) {
timeout = -ERESTARTSYS;
break;
}
+ __prepare_to_swait(&x->wait, &wait);
__set_current_state(state);
- spin_unlock_irq(&x->wait.lock);
+ raw_spin_unlock_irq(&x->wait.lock);
timeout = action(timeout);
- spin_lock_irq(&x->wait.lock);
+ raw_spin_lock_irq(&x->wait.lock);
} while (!x->done && timeout);
- __remove_wait_queue(&x->wait, &wait);
+ __finish_swait(&x->wait, &wait);
if (!x->done)
return timeout;
}
- x->done--;
+ if (x->done != UINT_MAX)
+ x->done--;
return timeout ?: 1;
}
@@ -89,9 +115,14 @@ __wait_for_common(struct completion *x,
{
might_sleep();
- spin_lock_irq(&x->wait.lock);
+ complete_acquire(x);
+
+ raw_spin_lock_irq(&x->wait.lock);
timeout = do_wait_for_common(x, action, timeout, state);
- spin_unlock_irq(&x->wait.lock);
+ raw_spin_unlock_irq(&x->wait.lock);
+
+ complete_release(x);
+
return timeout;
}
@@ -188,6 +219,7 @@ EXPORT_SYMBOL(wait_for_completion_io_timeout);
int __sched wait_for_completion_interruptible(struct completion *x)
{
long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
+
if (t == -ERESTARTSYS)
return t;
return 0;
@@ -225,12 +257,23 @@ EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
int __sched wait_for_completion_killable(struct completion *x)
{
long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
+
if (t == -ERESTARTSYS)
return t;
return 0;
}
EXPORT_SYMBOL(wait_for_completion_killable);
+int __sched wait_for_completion_state(struct completion *x, unsigned int state)
+{
+ long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, state);
+
+ if (t == -ERESTARTSYS)
+ return t;
+ return 0;
+}
+EXPORT_SYMBOL(wait_for_completion_state);
+
/**
* wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
* @x: holds the state of this particular completion
@@ -266,7 +309,7 @@ EXPORT_SYMBOL(wait_for_completion_killable_timeout);
bool try_wait_for_completion(struct completion *x)
{
unsigned long flags;
- int ret = 1;
+ bool ret = true;
/*
* Since x->done will need to be locked only
@@ -275,14 +318,14 @@ bool try_wait_for_completion(struct completion *x)
* return early in the blocking case.
*/
if (!READ_ONCE(x->done))
- return 0;
+ return false;
- spin_lock_irqsave(&x->wait.lock, flags);
+ raw_spin_lock_irqsave(&x->wait.lock, flags);
if (!x->done)
- ret = 0;
- else
+ ret = false;
+ else if (x->done != UINT_MAX)
x->done--;
- spin_unlock_irqrestore(&x->wait.lock, flags);
+ raw_spin_unlock_irqrestore(&x->wait.lock, flags);
return ret;
}
EXPORT_SYMBOL(try_wait_for_completion);
@@ -294,9 +337,12 @@ EXPORT_SYMBOL(try_wait_for_completion);
* Return: 0 if there are waiters (wait_for_completion() in progress)
* 1 if there are no waiters.
*
+ * Note, this will always return true if complete_all() was called on @X.
*/
bool completion_done(struct completion *x)
{
+ unsigned long flags;
+
if (!READ_ONCE(x->done))
return false;
@@ -304,14 +350,9 @@ bool completion_done(struct completion *x)
* If ->done, we need to wait for complete() to release ->wait.lock
* otherwise we can end up freeing the completion before complete()
* is done referencing it.
- *
- * The RMB pairs with complete()'s RELEASE of ->wait.lock and orders
- * the loads of ->done and ->wait.lock such that we cannot observe
- * the lock before complete() acquires it while observing the ->done
- * after it's acquired the lock.
*/
- smp_rmb();
- spin_unlock_wait(&x->wait.lock);
+ raw_spin_lock_irqsave(&x->wait.lock, flags);
+ raw_spin_unlock_irqrestore(&x->wait.lock, flags);
return true;
}
EXPORT_SYMBOL(completion_done);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 123673291ffb..41ba0be16911 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1,327 +1,857 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kernel/sched/core.c
*
- * Kernel scheduler and related syscalls
+ * Core kernel CPU scheduler code
*
* Copyright (C) 1991-2002 Linus Torvalds
- *
- * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
- * make semaphores SMP safe
- * 1998-11-19 Implemented schedule_timeout() and related stuff
- * by Andrea Arcangeli
- * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
- * hybrid priority-list and round-robin design with
- * an array-switch method of distributing timeslices
- * and per-CPU runqueues. Cleanups and useful suggestions
- * by Davide Libenzi, preemptible kernel bits by Robert Love.
- * 2003-09-03 Interactivity tuning by Con Kolivas.
- * 2004-04-02 Scheduler domains code by Nick Piggin
- * 2007-04-15 Work begun on replacing all interactivity tuning with a
- * fair scheduling design by Con Kolivas.
- * 2007-05-05 Load balancing (smp-nice) and other improvements
- * by Peter Williams
- * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith
- * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
- * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins,
- * Thomas Gleixner, Mike Kravetz
+ * Copyright (C) 1998-2024 Ingo Molnar, Red Hat
*/
-
-#include <linux/mm.h>
-#include <linux/module.h>
-#include <linux/nmi.h>
-#include <linux/init.h>
-#include <linux/uaccess.h>
+#define INSTANTIATE_EXPORTED_MIGRATE_DISABLE
+#include <linux/sched.h>
#include <linux/highmem.h>
-#include <asm/mmu_context.h>
-#include <linux/interrupt.h>
-#include <linux/capability.h>
-#include <linux/completion.h>
-#include <linux/kernel_stat.h>
+#include <linux/hrtimer_api.h>
+#include <linux/ktime_api.h>
+#include <linux/sched/signal.h>
+#include <linux/syscalls_api.h>
#include <linux/debug_locks.h>
-#include <linux/perf_event.h>
-#include <linux/security.h>
-#include <linux/notifier.h>
-#include <linux/profile.h>
-#include <linux/freezer.h>
-#include <linux/vmalloc.h>
+#include <linux/prefetch.h>
+#include <linux/capability.h>
+#include <linux/pgtable_api.h>
+#include <linux/wait_bit.h>
+#include <linux/jiffies.h>
+#include <linux/spinlock_api.h>
+#include <linux/cpumask_api.h>
+#include <linux/lockdep_api.h>
+#include <linux/hardirq.h>
+#include <linux/softirq.h>
+#include <linux/refcount_api.h>
+#include <linux/topology.h>
+#include <linux/sched/clock.h>
+#include <linux/sched/cond_resched.h>
+#include <linux/sched/cputime.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/hotplug.h>
+#include <linux/sched/init.h>
+#include <linux/sched/isolation.h>
+#include <linux/sched/loadavg.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/nohz.h>
+#include <linux/sched/rseq_api.h>
+#include <linux/sched/rt.h>
+
#include <linux/blkdev.h>
-#include <linux/delay.h>
-#include <linux/pid_namespace.h>
-#include <linux/smp.h>
-#include <linux/threads.h>
-#include <linux/timer.h>
-#include <linux/rcupdate.h>
-#include <linux/cpu.h>
+#include <linux/context_tracking.h>
#include <linux/cpuset.h>
-#include <linux/percpu.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/sysctl.h>
-#include <linux/syscalls.h>
-#include <linux/times.h>
-#include <linux/tsacct_kern.h>
-#include <linux/kprobes.h>
#include <linux/delayacct.h>
-#include <linux/unistd.h>
-#include <linux/pagemap.h>
-#include <linux/hrtimer.h>
-#include <linux/tick.h>
-#include <linux/debugfs.h>
-#include <linux/ctype.h>
-#include <linux/ftrace.h>
-#include <linux/slab.h>
#include <linux/init_task.h>
-#include <linux/binfmts.h>
-#include <linux/context_tracking.h>
-#include <linux/compiler.h>
+#include <linux/interrupt.h>
+#include <linux/ioprio.h>
+#include <linux/kallsyms.h>
+#include <linux/kcov.h>
+#include <linux/kprobes.h>
+#include <linux/llist_api.h>
+#include <linux/mmu_context.h>
+#include <linux/mmzone.h>
+#include <linux/mutex_api.h>
+#include <linux/nmi.h>
+#include <linux/nospec.h>
+#include <linux/perf_event_api.h>
+#include <linux/profile.h>
+#include <linux/psi.h>
+#include <linux/rcuwait_api.h>
+#include <linux/rseq.h>
+#include <linux/sched/wake_q.h>
+#include <linux/scs.h>
+#include <linux/slab.h>
+#include <linux/syscalls.h>
+#include <linux/vtime.h>
+#include <linux/wait_api.h>
+#include <linux/workqueue_api.h>
+#include <linux/livepatch_sched.h>
+
+#ifdef CONFIG_PREEMPT_DYNAMIC
+# ifdef CONFIG_GENERIC_IRQ_ENTRY
+# include <linux/irq-entry-common.h>
+# endif
+#endif
+
+#include <uapi/linux/sched/types.h>
+#include <asm/irq_regs.h>
#include <asm/switch_to.h>
#include <asm/tlb.h>
-#include <asm/irq_regs.h>
-#include <asm/mutex.h>
-#ifdef CONFIG_PARAVIRT
-#include <asm/paravirt.h>
-#endif
+
+#define CREATE_TRACE_POINTS
+#include <linux/sched/rseq_api.h>
+#include <trace/events/sched.h>
+#include <trace/events/ipi.h>
+#undef CREATE_TRACE_POINTS
#include "sched.h"
+#include "stats.h"
+
+#include "autogroup.h"
+#include "pelt.h"
+#include "smp.h"
+
#include "../workqueue_internal.h"
+#include "../../io_uring/io-wq.h"
#include "../smpboot.h"
+#include "../locking/mutex.h"
-#define CREATE_TRACE_POINTS
-#include <trace/events/sched.h>
+EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpu);
+EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpumask);
-void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
-{
- unsigned long delta;
- ktime_t soft, hard, now;
+/*
+ * Export tracepoints that act as a bare tracehook (ie: have no trace event
+ * associated with them) to allow external modules to probe them.
+ */
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_cfs_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_hw_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp);
- for (;;) {
- if (hrtimer_active(period_timer))
- break;
+DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+DEFINE_PER_CPU(struct rnd_state, sched_rnd_state);
- now = hrtimer_cb_get_time(period_timer);
- hrtimer_forward(period_timer, now, period);
+#ifdef CONFIG_SCHED_PROXY_EXEC
+DEFINE_STATIC_KEY_TRUE(__sched_proxy_exec);
+static int __init setup_proxy_exec(char *str)
+{
+ bool proxy_enable = true;
- soft = hrtimer_get_softexpires(period_timer);
- hard = hrtimer_get_expires(period_timer);
- delta = ktime_to_ns(ktime_sub(hard, soft));
- __hrtimer_start_range_ns(period_timer, soft, delta,
- HRTIMER_MODE_ABS_PINNED, 0);
+ if (*str && kstrtobool(str + 1, &proxy_enable)) {
+ pr_warn("Unable to parse sched_proxy_exec=\n");
+ return 0;
+ }
+
+ if (proxy_enable) {
+ pr_info("sched_proxy_exec enabled via boot arg\n");
+ static_branch_enable(&__sched_proxy_exec);
+ } else {
+ pr_info("sched_proxy_exec disabled via boot arg\n");
+ static_branch_disable(&__sched_proxy_exec);
}
+ return 1;
+}
+#else
+static int __init setup_proxy_exec(char *str)
+{
+ pr_warn("CONFIG_SCHED_PROXY_EXEC=n, so it cannot be enabled or disabled at boot time\n");
+ return 0;
}
+#endif
+__setup("sched_proxy_exec", setup_proxy_exec);
-DEFINE_MUTEX(sched_domains_mutex);
-DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+/*
+ * Debugging: various feature bits
+ *
+ * If SCHED_DEBUG is disabled, each compilation unit has its own copy of
+ * sysctl_sched_features, defined in sched.h, to allow constants propagation
+ * at compile time and compiler optimization based on features default.
+ */
+#define SCHED_FEAT(name, enabled) \
+ (1UL << __SCHED_FEAT_##name) * enabled |
+__read_mostly unsigned int sysctl_sched_features =
+#include "features.h"
+ 0;
+#undef SCHED_FEAT
-static void update_rq_clock_task(struct rq *rq, s64 delta);
+/*
+ * Print a warning if need_resched is set for the given duration (if
+ * LATENCY_WARN is enabled).
+ *
+ * If sysctl_resched_latency_warn_once is set, only one warning will be shown
+ * per boot.
+ */
+__read_mostly int sysctl_resched_latency_warn_ms = 100;
+__read_mostly int sysctl_resched_latency_warn_once = 1;
-void update_rq_clock(struct rq *rq)
+/*
+ * Number of tasks to iterate in a single balance run.
+ * Limited because this is done with IRQs disabled.
+ */
+__read_mostly unsigned int sysctl_sched_nr_migrate = SCHED_NR_MIGRATE_BREAK;
+
+__read_mostly int scheduler_running;
+
+#ifdef CONFIG_SCHED_CORE
+
+DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);
+
+/* kernel prio, less is more */
+static inline int __task_prio(const struct task_struct *p)
{
- s64 delta;
+ if (p->sched_class == &stop_sched_class) /* trumps deadline */
+ return -2;
- lockdep_assert_held(&rq->lock);
+ if (p->dl_server)
+ return -1; /* deadline */
- if (rq->clock_skip_update & RQCF_ACT_SKIP)
- return;
+ if (rt_or_dl_prio(p->prio))
+ return p->prio; /* [-1, 99] */
- delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
- if (delta < 0)
- return;
- rq->clock += delta;
- update_rq_clock_task(rq, delta);
+ if (p->sched_class == &idle_sched_class)
+ return MAX_RT_PRIO + NICE_WIDTH; /* 140 */
+
+ if (task_on_scx(p))
+ return MAX_RT_PRIO + MAX_NICE + 1; /* 120, squash ext */
+
+ return MAX_RT_PRIO + MAX_NICE; /* 119, squash fair */
}
/*
- * Debugging: various feature bits
+ * l(a,b)
+ * le(a,b) := !l(b,a)
+ * g(a,b) := l(b,a)
+ * ge(a,b) := !l(a,b)
*/
-#define SCHED_FEAT(name, enabled) \
- (1UL << __SCHED_FEAT_##name) * enabled |
+/* real prio, less is less */
+static inline bool prio_less(const struct task_struct *a,
+ const struct task_struct *b, bool in_fi)
+{
-const_debug unsigned int sysctl_sched_features =
-#include "features.h"
- 0;
+ int pa = __task_prio(a), pb = __task_prio(b);
-#undef SCHED_FEAT
+ if (-pa < -pb)
+ return true;
-#ifdef CONFIG_SCHED_DEBUG
-#define SCHED_FEAT(name, enabled) \
- #name ,
+ if (-pb < -pa)
+ return false;
-static const char * const sched_feat_names[] = {
-#include "features.h"
-};
+ if (pa == -1) { /* dl_prio() doesn't work because of stop_class above */
+ const struct sched_dl_entity *a_dl, *b_dl;
-#undef SCHED_FEAT
+ a_dl = &a->dl;
+ /*
+ * Since,'a' and 'b' can be CFS tasks served by DL server,
+ * __task_prio() can return -1 (for DL) even for those. In that
+ * case, get to the dl_server's DL entity.
+ */
+ if (a->dl_server)
+ a_dl = a->dl_server;
-static int sched_feat_show(struct seq_file *m, void *v)
+ b_dl = &b->dl;
+ if (b->dl_server)
+ b_dl = b->dl_server;
+
+ return !dl_time_before(a_dl->deadline, b_dl->deadline);
+ }
+
+ if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */
+ return cfs_prio_less(a, b, in_fi);
+
+#ifdef CONFIG_SCHED_CLASS_EXT
+ if (pa == MAX_RT_PRIO + MAX_NICE + 1) /* ext */
+ return scx_prio_less(a, b, in_fi);
+#endif
+
+ return false;
+}
+
+static inline bool __sched_core_less(const struct task_struct *a,
+ const struct task_struct *b)
{
- int i;
+ if (a->core_cookie < b->core_cookie)
+ return true;
+
+ if (a->core_cookie > b->core_cookie)
+ return false;
+
+ /* flip prio, so high prio is leftmost */
+ if (prio_less(b, a, !!task_rq(a)->core->core_forceidle_count))
+ return true;
+
+ return false;
+}
+
+#define __node_2_sc(node) rb_entry((node), struct task_struct, core_node)
+
+static inline bool rb_sched_core_less(struct rb_node *a, const struct rb_node *b)
+{
+ return __sched_core_less(__node_2_sc(a), __node_2_sc(b));
+}
+
+static inline int rb_sched_core_cmp(const void *key, const struct rb_node *node)
+{
+ const struct task_struct *p = __node_2_sc(node);
+ unsigned long cookie = (unsigned long)key;
+
+ if (cookie < p->core_cookie)
+ return -1;
+
+ if (cookie > p->core_cookie)
+ return 1;
- for (i = 0; i < __SCHED_FEAT_NR; i++) {
- if (!(sysctl_sched_features & (1UL << i)))
- seq_puts(m, "NO_");
- seq_printf(m, "%s ", sched_feat_names[i]);
+ return 0;
+}
+
+void sched_core_enqueue(struct rq *rq, struct task_struct *p)
+{
+ if (p->se.sched_delayed)
+ return;
+
+ rq->core->core_task_seq++;
+
+ if (!p->core_cookie)
+ return;
+
+ rb_add(&p->core_node, &rq->core_tree, rb_sched_core_less);
+}
+
+void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags)
+{
+ if (p->se.sched_delayed)
+ return;
+
+ rq->core->core_task_seq++;
+
+ if (sched_core_enqueued(p)) {
+ rb_erase(&p->core_node, &rq->core_tree);
+ RB_CLEAR_NODE(&p->core_node);
}
- seq_puts(m, "\n");
+
+ /*
+ * Migrating the last task off the cpu, with the cpu in forced idle
+ * state. Reschedule to create an accounting edge for forced idle,
+ * and re-examine whether the core is still in forced idle state.
+ */
+ if (!(flags & DEQUEUE_SAVE) && rq->nr_running == 1 &&
+ rq->core->core_forceidle_count && rq->curr == rq->idle)
+ resched_curr(rq);
+}
+
+static int sched_task_is_throttled(struct task_struct *p, int cpu)
+{
+ if (p->sched_class->task_is_throttled)
+ return p->sched_class->task_is_throttled(p, cpu);
return 0;
}
-#ifdef HAVE_JUMP_LABEL
+static struct task_struct *sched_core_next(struct task_struct *p, unsigned long cookie)
+{
+ struct rb_node *node = &p->core_node;
+ int cpu = task_cpu(p);
-#define jump_label_key__true STATIC_KEY_INIT_TRUE
-#define jump_label_key__false STATIC_KEY_INIT_FALSE
+ do {
+ node = rb_next(node);
+ if (!node)
+ return NULL;
-#define SCHED_FEAT(name, enabled) \
- jump_label_key__##enabled ,
+ p = __node_2_sc(node);
+ if (p->core_cookie != cookie)
+ return NULL;
-struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
-#include "features.h"
-};
+ } while (sched_task_is_throttled(p, cpu));
-#undef SCHED_FEAT
+ return p;
+}
+
+/*
+ * Find left-most (aka, highest priority) and unthrottled task matching @cookie.
+ * If no suitable task is found, NULL will be returned.
+ */
+static struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie)
+{
+ struct task_struct *p;
+ struct rb_node *node;
+
+ node = rb_find_first((void *)cookie, &rq->core_tree, rb_sched_core_cmp);
+ if (!node)
+ return NULL;
+
+ p = __node_2_sc(node);
+ if (!sched_task_is_throttled(p, rq->cpu))
+ return p;
+
+ return sched_core_next(p, cookie);
+}
-static void sched_feat_disable(int i)
+/*
+ * Magic required such that:
+ *
+ * raw_spin_rq_lock(rq);
+ * ...
+ * raw_spin_rq_unlock(rq);
+ *
+ * ends up locking and unlocking the _same_ lock, and all CPUs
+ * always agree on what rq has what lock.
+ *
+ * XXX entirely possible to selectively enable cores, don't bother for now.
+ */
+
+static DEFINE_MUTEX(sched_core_mutex);
+static atomic_t sched_core_count;
+static struct cpumask sched_core_mask;
+
+static void sched_core_lock(int cpu, unsigned long *flags)
{
- if (static_key_enabled(&sched_feat_keys[i]))
- static_key_slow_dec(&sched_feat_keys[i]);
+ const struct cpumask *smt_mask = cpu_smt_mask(cpu);
+ int t, i = 0;
+
+ local_irq_save(*flags);
+ for_each_cpu(t, smt_mask)
+ raw_spin_lock_nested(&cpu_rq(t)->__lock, i++);
}
-static void sched_feat_enable(int i)
+static void sched_core_unlock(int cpu, unsigned long *flags)
{
- if (!static_key_enabled(&sched_feat_keys[i]))
- static_key_slow_inc(&sched_feat_keys[i]);
+ const struct cpumask *smt_mask = cpu_smt_mask(cpu);
+ int t;
+
+ for_each_cpu(t, smt_mask)
+ raw_spin_unlock(&cpu_rq(t)->__lock);
+ local_irq_restore(*flags);
}
-#else
-static void sched_feat_disable(int i) { };
-static void sched_feat_enable(int i) { };
-#endif /* HAVE_JUMP_LABEL */
-static int sched_feat_set(char *cmp)
+static void __sched_core_flip(bool enabled)
{
- int i;
- int neg = 0;
+ unsigned long flags;
+ int cpu, t;
- if (strncmp(cmp, "NO_", 3) == 0) {
- neg = 1;
- cmp += 3;
- }
+ cpus_read_lock();
- for (i = 0; i < __SCHED_FEAT_NR; i++) {
- if (strcmp(cmp, sched_feat_names[i]) == 0) {
- if (neg) {
- sysctl_sched_features &= ~(1UL << i);
- sched_feat_disable(i);
- } else {
- sysctl_sched_features |= (1UL << i);
- sched_feat_enable(i);
- }
- break;
- }
+ /*
+ * Toggle the online cores, one by one.
+ */
+ cpumask_copy(&sched_core_mask, cpu_online_mask);
+ for_each_cpu(cpu, &sched_core_mask) {
+ const struct cpumask *smt_mask = cpu_smt_mask(cpu);
+
+ sched_core_lock(cpu, &flags);
+
+ for_each_cpu(t, smt_mask)
+ cpu_rq(t)->core_enabled = enabled;
+
+ cpu_rq(cpu)->core->core_forceidle_start = 0;
+
+ sched_core_unlock(cpu, &flags);
+
+ cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask);
}
- return i;
+ /*
+ * Toggle the offline CPUs.
+ */
+ for_each_cpu_andnot(cpu, cpu_possible_mask, cpu_online_mask)
+ cpu_rq(cpu)->core_enabled = enabled;
+
+ cpus_read_unlock();
}
-static ssize_t
-sched_feat_write(struct file *filp, const char __user *ubuf,
- size_t cnt, loff_t *ppos)
+static void sched_core_assert_empty(void)
{
- char buf[64];
- char *cmp;
- int i;
- struct inode *inode;
+ int cpu;
- if (cnt > 63)
- cnt = 63;
+ for_each_possible_cpu(cpu)
+ WARN_ON_ONCE(!RB_EMPTY_ROOT(&cpu_rq(cpu)->core_tree));
+}
- if (copy_from_user(&buf, ubuf, cnt))
- return -EFAULT;
+static void __sched_core_enable(void)
+{
+ static_branch_enable(&__sched_core_enabled);
+ /*
+ * Ensure all previous instances of raw_spin_rq_*lock() have finished
+ * and future ones will observe !sched_core_disabled().
+ */
+ synchronize_rcu();
+ __sched_core_flip(true);
+ sched_core_assert_empty();
+}
- buf[cnt] = 0;
- cmp = strstrip(buf);
+static void __sched_core_disable(void)
+{
+ sched_core_assert_empty();
+ __sched_core_flip(false);
+ static_branch_disable(&__sched_core_enabled);
+}
- /* Ensure the static_key remains in a consistent state */
- inode = file_inode(filp);
- mutex_lock(&inode->i_mutex);
- i = sched_feat_set(cmp);
- mutex_unlock(&inode->i_mutex);
- if (i == __SCHED_FEAT_NR)
- return -EINVAL;
+void sched_core_get(void)
+{
+ if (atomic_inc_not_zero(&sched_core_count))
+ return;
- *ppos += cnt;
+ mutex_lock(&sched_core_mutex);
+ if (!atomic_read(&sched_core_count))
+ __sched_core_enable();
- return cnt;
+ smp_mb__before_atomic();
+ atomic_inc(&sched_core_count);
+ mutex_unlock(&sched_core_mutex);
}
-static int sched_feat_open(struct inode *inode, struct file *filp)
+static void __sched_core_put(struct work_struct *work)
{
- return single_open(filp, sched_feat_show, NULL);
+ if (atomic_dec_and_mutex_lock(&sched_core_count, &sched_core_mutex)) {
+ __sched_core_disable();
+ mutex_unlock(&sched_core_mutex);
+ }
}
-static const struct file_operations sched_feat_fops = {
- .open = sched_feat_open,
- .write = sched_feat_write,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
-static __init int sched_init_debug(void)
+void sched_core_put(void)
{
- debugfs_create_file("sched_features", 0644, NULL, NULL,
- &sched_feat_fops);
+ static DECLARE_WORK(_work, __sched_core_put);
- return 0;
+ /*
+ * "There can be only one"
+ *
+ * Either this is the last one, or we don't actually need to do any
+ * 'work'. If it is the last *again*, we rely on
+ * WORK_STRUCT_PENDING_BIT.
+ */
+ if (!atomic_add_unless(&sched_core_count, -1, 1))
+ schedule_work(&_work);
}
-late_initcall(sched_init_debug);
-#endif /* CONFIG_SCHED_DEBUG */
-/*
- * Number of tasks to iterate in a single balance run.
- * Limited because this is done with IRQs disabled.
- */
-const_debug unsigned int sysctl_sched_nr_migrate = 32;
+#else /* !CONFIG_SCHED_CORE: */
+
+static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
+static inline void
+sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }
+
+#endif /* !CONFIG_SCHED_CORE */
+
+/* need a wrapper since we may need to trace from modules */
+EXPORT_TRACEPOINT_SYMBOL(sched_set_state_tp);
+
+/* Call via the helper macro trace_set_current_state. */
+void __trace_set_current_state(int state_value)
+{
+ trace_sched_set_state_tp(current, state_value);
+}
+EXPORT_SYMBOL(__trace_set_current_state);
/*
- * period over which we average the RT time consumption, measured
- * in ms.
+ * Serialization rules:
+ *
+ * Lock order:
+ *
+ * p->pi_lock
+ * rq->lock
+ * hrtimer_cpu_base->lock (hrtimer_start() for bandwidth controls)
+ *
+ * rq1->lock
+ * rq2->lock where: rq1 < rq2
+ *
+ * Regular state:
+ *
+ * Normal scheduling state is serialized by rq->lock. __schedule() takes the
+ * local CPU's rq->lock, it optionally removes the task from the runqueue and
+ * always looks at the local rq data structures to find the most eligible task
+ * to run next.
+ *
+ * Task enqueue is also under rq->lock, possibly taken from another CPU.
+ * Wakeups from another LLC domain might use an IPI to transfer the enqueue to
+ * the local CPU to avoid bouncing the runqueue state around [ see
+ * ttwu_queue_wakelist() ]
+ *
+ * Task wakeup, specifically wakeups that involve migration, are horribly
+ * complicated to avoid having to take two rq->locks.
+ *
+ * Special state:
+ *
+ * System-calls and anything external will use task_rq_lock() which acquires
+ * both p->pi_lock and rq->lock. As a consequence the state they change is
+ * stable while holding either lock:
+ *
+ * - sched_setaffinity()/
+ * set_cpus_allowed_ptr(): p->cpus_ptr, p->nr_cpus_allowed
+ * - set_user_nice(): p->se.load, p->*prio
+ * - __sched_setscheduler(): p->sched_class, p->policy, p->*prio,
+ * p->se.load, p->rt_priority,
+ * p->dl.dl_{runtime, deadline, period, flags, bw, density}
+ * - sched_setnuma(): p->numa_preferred_nid
+ * - sched_move_task(): p->sched_task_group
+ * - uclamp_update_active() p->uclamp*
+ *
+ * p->state <- TASK_*:
+ *
+ * is changed locklessly using set_current_state(), __set_current_state() or
+ * set_special_state(), see their respective comments, or by
+ * try_to_wake_up(). This latter uses p->pi_lock to serialize against
+ * concurrent self.
+ *
+ * p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }:
+ *
+ * is set by activate_task() and cleared by deactivate_task()/block_task(),
+ * under rq->lock. Non-zero indicates the task is runnable, the special
+ * ON_RQ_MIGRATING state is used for migration without holding both
+ * rq->locks. It indicates task_cpu() is not stable, see task_rq_lock().
+ *
+ * Additionally it is possible to be ->on_rq but still be considered not
+ * runnable when p->se.sched_delayed is true. These tasks are on the runqueue
+ * but will be dequeued as soon as they get picked again. See the
+ * task_is_runnable() helper.
+ *
+ * p->on_cpu <- { 0, 1 }:
+ *
+ * is set by prepare_task() and cleared by finish_task() such that it will be
+ * set before p is scheduled-in and cleared after p is scheduled-out, both
+ * under rq->lock. Non-zero indicates the task is running on its CPU.
+ *
+ * [ The astute reader will observe that it is possible for two tasks on one
+ * CPU to have ->on_cpu = 1 at the same time. ]
+ *
+ * task_cpu(p): is changed by set_task_cpu(), the rules are:
+ *
+ * - Don't call set_task_cpu() on a blocked task:
+ *
+ * We don't care what CPU we're not running on, this simplifies hotplug,
+ * the CPU assignment of blocked tasks isn't required to be valid.
+ *
+ * - for try_to_wake_up(), called under p->pi_lock:
+ *
+ * This allows try_to_wake_up() to only take one rq->lock, see its comment.
+ *
+ * - for migration called under rq->lock:
+ * [ see task_on_rq_migrating() in task_rq_lock() ]
+ *
+ * o move_queued_task()
+ * o detach_task()
+ *
+ * - for migration called under double_rq_lock():
+ *
+ * o __migrate_swap_task()
+ * o push_rt_task() / pull_rt_task()
+ * o push_dl_task() / pull_dl_task()
+ * o dl_task_offline_migration()
*
- * default: 1s
*/
-const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
+
+void raw_spin_rq_lock_nested(struct rq *rq, int subclass)
+{
+ raw_spinlock_t *lock;
+
+ /* Matches synchronize_rcu() in __sched_core_enable() */
+ preempt_disable();
+ if (sched_core_disabled()) {
+ raw_spin_lock_nested(&rq->__lock, subclass);
+ /* preempt_count *MUST* be > 1 */
+ preempt_enable_no_resched();
+ return;
+ }
+
+ for (;;) {
+ lock = __rq_lockp(rq);
+ raw_spin_lock_nested(lock, subclass);
+ if (likely(lock == __rq_lockp(rq))) {
+ /* preempt_count *MUST* be > 1 */
+ preempt_enable_no_resched();
+ return;
+ }
+ raw_spin_unlock(lock);
+ }
+}
+
+bool raw_spin_rq_trylock(struct rq *rq)
+{
+ raw_spinlock_t *lock;
+ bool ret;
+
+ /* Matches synchronize_rcu() in __sched_core_enable() */
+ preempt_disable();
+ if (sched_core_disabled()) {
+ ret = raw_spin_trylock(&rq->__lock);
+ preempt_enable();
+ return ret;
+ }
+
+ for (;;) {
+ lock = __rq_lockp(rq);
+ ret = raw_spin_trylock(lock);
+ if (!ret || (likely(lock == __rq_lockp(rq)))) {
+ preempt_enable();
+ return ret;
+ }
+ raw_spin_unlock(lock);
+ }
+}
+
+void raw_spin_rq_unlock(struct rq *rq)
+{
+ raw_spin_unlock(rq_lockp(rq));
+}
/*
- * period over which we measure -rt task cpu usage in us.
- * default: 1s
+ * double_rq_lock - safely lock two runqueues
*/
-unsigned int sysctl_sched_rt_period = 1000000;
+void double_rq_lock(struct rq *rq1, struct rq *rq2)
+{
+ lockdep_assert_irqs_disabled();
-__read_mostly int scheduler_running;
+ if (rq_order_less(rq2, rq1))
+ swap(rq1, rq2);
+
+ raw_spin_rq_lock(rq1);
+ if (__rq_lockp(rq1) != __rq_lockp(rq2))
+ raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
+
+ double_rq_clock_clear_update(rq1, rq2);
+}
/*
- * part of the period that we allow rt tasks to run in us.
- * default: 0.95s
+ * __task_rq_lock - lock the rq @p resides on.
*/
-int sysctl_sched_rt_runtime = 950000;
+struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
+ __acquires(rq->lock)
+{
+ struct rq *rq;
-/* cpus with isolated domains */
-cpumask_var_t cpu_isolated_map;
+ lockdep_assert_held(&p->pi_lock);
+
+ for (;;) {
+ rq = task_rq(p);
+ raw_spin_rq_lock(rq);
+ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
+ rq_pin_lock(rq, rf);
+ return rq;
+ }
+ raw_spin_rq_unlock(rq);
+
+ while (unlikely(task_on_rq_migrating(p)))
+ cpu_relax();
+ }
+}
/*
- * this_rq_lock - lock this runqueue and disable interrupts.
+ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
*/
-static struct rq *this_rq_lock(void)
+struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
+ __acquires(p->pi_lock)
__acquires(rq->lock)
{
struct rq *rq;
- local_irq_disable();
- rq = this_rq();
- raw_spin_lock(&rq->lock);
+ for (;;) {
+ raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
+ rq = task_rq(p);
+ raw_spin_rq_lock(rq);
+ /*
+ * move_queued_task() task_rq_lock()
+ *
+ * ACQUIRE (rq->lock)
+ * [S] ->on_rq = MIGRATING [L] rq = task_rq()
+ * WMB (__set_task_cpu()) ACQUIRE (rq->lock);
+ * [S] ->cpu = new_cpu [L] task_rq()
+ * [L] ->on_rq
+ * RELEASE (rq->lock)
+ *
+ * If we observe the old CPU in task_rq_lock(), the acquire of
+ * the old rq->lock will fully serialize against the stores.
+ *
+ * If we observe the new CPU in task_rq_lock(), the address
+ * dependency headed by '[L] rq = task_rq()' and the acquire
+ * will pair with the WMB to ensure we then also see migrating.
+ */
+ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
+ rq_pin_lock(rq, rf);
+ return rq;
+ }
+ raw_spin_rq_unlock(rq);
+ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
- return rq;
+ while (unlikely(task_on_rq_migrating(p)))
+ cpu_relax();
+ }
+}
+
+/*
+ * RQ-clock updating methods:
+ */
+
+static void update_rq_clock_task(struct rq *rq, s64 delta)
+{
+/*
+ * In theory, the compile should just see 0 here, and optimize out the call
+ * to sched_rt_avg_update. But I don't trust it...
+ */
+ s64 __maybe_unused steal = 0, irq_delta = 0;
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ if (irqtime_enabled()) {
+ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
+
+ /*
+ * Since irq_time is only updated on {soft,}irq_exit, we might run into
+ * this case when a previous update_rq_clock() happened inside a
+ * {soft,}IRQ region.
+ *
+ * When this happens, we stop ->clock_task and only update the
+ * prev_irq_time stamp to account for the part that fit, so that a next
+ * update will consume the rest. This ensures ->clock_task is
+ * monotonic.
+ *
+ * It does however cause some slight miss-attribution of {soft,}IRQ
+ * time, a more accurate solution would be to update the irq_time using
+ * the current rq->clock timestamp, except that would require using
+ * atomic ops.
+ */
+ if (irq_delta > delta)
+ irq_delta = delta;
+
+ rq->prev_irq_time += irq_delta;
+ delta -= irq_delta;
+ delayacct_irq(rq->curr, irq_delta);
+ }
+#endif
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+ if (static_key_false((&paravirt_steal_rq_enabled))) {
+ u64 prev_steal;
+
+ steal = prev_steal = paravirt_steal_clock(cpu_of(rq));
+ steal -= rq->prev_steal_time_rq;
+
+ if (unlikely(steal > delta))
+ steal = delta;
+
+ rq->prev_steal_time_rq = prev_steal;
+ delta -= steal;
+ }
+#endif
+
+ rq->clock_task += delta;
+
+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
+ if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
+ update_irq_load_avg(rq, irq_delta + steal);
+#endif
+ update_rq_clock_pelt(rq, delta);
+}
+
+void update_rq_clock(struct rq *rq)
+{
+ s64 delta;
+ u64 clock;
+
+ lockdep_assert_rq_held(rq);
+
+ if (rq->clock_update_flags & RQCF_ACT_SKIP)
+ return;
+
+ if (sched_feat(WARN_DOUBLE_CLOCK))
+ WARN_ON_ONCE(rq->clock_update_flags & RQCF_UPDATED);
+ rq->clock_update_flags |= RQCF_UPDATED;
+
+ clock = sched_clock_cpu(cpu_of(rq));
+ scx_rq_clock_update(rq, clock);
+
+ delta = clock - rq->clock;
+ if (delta < 0)
+ return;
+ rq->clock += delta;
+
+ update_rq_clock_task(rq, delta);
}
#ifdef CONFIG_SCHED_HRTICK
@@ -342,25 +872,24 @@ static void hrtick_clear(struct rq *rq)
static enum hrtimer_restart hrtick(struct hrtimer *timer)
{
struct rq *rq = container_of(timer, struct rq, hrtick_timer);
+ struct rq_flags rf;
WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
- raw_spin_lock(&rq->lock);
+ rq_lock(rq, &rf);
update_rq_clock(rq);
- rq->curr->sched_class->task_tick(rq, rq->curr, 1);
- raw_spin_unlock(&rq->lock);
+ rq->donor->sched_class->task_tick(rq, rq->donor, 1);
+ rq_unlock(rq, &rf);
return HRTIMER_NORESTART;
}
-#ifdef CONFIG_SMP
-
-static int __hrtick_restart(struct rq *rq)
+static void __hrtick_restart(struct rq *rq)
{
struct hrtimer *timer = &rq->hrtick_timer;
- ktime_t time = hrtimer_get_softexpires(timer);
+ ktime_t time = rq->hrtick_time;
- return __hrtimer_start_range_ns(timer, time, 0, HRTIMER_MODE_ABS_PINNED, 0);
+ hrtimer_start(timer, time, HRTIMER_MODE_ABS_PINNED_HARD);
}
/*
@@ -369,22 +898,21 @@ static int __hrtick_restart(struct rq *rq)
static void __hrtick_start(void *arg)
{
struct rq *rq = arg;
+ struct rq_flags rf;
- raw_spin_lock(&rq->lock);
+ rq_lock(rq, &rf);
__hrtick_restart(rq);
- rq->hrtick_csd_pending = 0;
- raw_spin_unlock(&rq->lock);
+ rq_unlock(rq, &rf);
}
/*
* Called to set the hrtick timer state.
*
- * called with rq->lock held and irqs disabled
+ * called with rq->lock held and IRQs disabled
*/
void hrtick_start(struct rq *rq, u64 delay)
{
struct hrtimer *timer = &rq->hrtick_timer;
- ktime_t time;
s64 delta;
/*
@@ -392,114 +920,52 @@ void hrtick_start(struct rq *rq, u64 delay)
* doesn't make sense and can cause timer DoS.
*/
delta = max_t(s64, delay, 10000LL);
- time = ktime_add_ns(timer->base->get_time(), delta);
+ rq->hrtick_time = ktime_add_ns(hrtimer_cb_get_time(timer), delta);
- hrtimer_set_expires(timer, time);
-
- if (rq == this_rq()) {
+ if (rq == this_rq())
__hrtick_restart(rq);
- } else if (!rq->hrtick_csd_pending) {
+ else
smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
- rq->hrtick_csd_pending = 1;
- }
-}
-
-static int
-hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
-{
- int cpu = (int)(long)hcpu;
-
- switch (action) {
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- case CPU_DOWN_PREPARE:
- case CPU_DOWN_PREPARE_FROZEN:
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- hrtick_clear(cpu_rq(cpu));
- return NOTIFY_OK;
- }
-
- return NOTIFY_DONE;
-}
-
-static __init void init_hrtick(void)
-{
- hotcpu_notifier(hotplug_hrtick, 0);
}
-#else
-/*
- * Called to set the hrtick timer state.
- *
- * called with rq->lock held and irqs disabled
- */
-void hrtick_start(struct rq *rq, u64 delay)
-{
- /*
- * Don't schedule slices shorter than 10000ns, that just
- * doesn't make sense. Rely on vruntime for fairness.
- */
- delay = max_t(u64, delay, 10000LL);
- __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
- HRTIMER_MODE_REL_PINNED, 0);
-}
-
-static inline void init_hrtick(void)
-{
-}
-#endif /* CONFIG_SMP */
-static void init_rq_hrtick(struct rq *rq)
+static void hrtick_rq_init(struct rq *rq)
{
-#ifdef CONFIG_SMP
- rq->hrtick_csd_pending = 0;
-
- rq->hrtick_csd.flags = 0;
- rq->hrtick_csd.func = __hrtick_start;
- rq->hrtick_csd.info = rq;
-#endif
-
- hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- rq->hrtick_timer.function = hrtick;
+ INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq);
+ hrtimer_setup(&rq->hrtick_timer, hrtick, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
}
-#else /* CONFIG_SCHED_HRTICK */
+#else /* !CONFIG_SCHED_HRTICK: */
static inline void hrtick_clear(struct rq *rq)
{
}
-static inline void init_rq_hrtick(struct rq *rq)
+static inline void hrtick_rq_init(struct rq *rq)
{
}
-
-static inline void init_hrtick(void)
-{
-}
-#endif /* CONFIG_SCHED_HRTICK */
+#endif /* !CONFIG_SCHED_HRTICK */
/*
- * cmpxchg based fetch_or, macro so it works for different integer types
- */
-#define fetch_or(ptr, val) \
-({ typeof(*(ptr)) __old, __val = *(ptr); \
- for (;;) { \
- __old = cmpxchg((ptr), __val, __val | (val)); \
- if (__old == __val) \
- break; \
- __val = __old; \
- } \
- __old; \
+ * try_cmpxchg based fetch_or() macro so it works for different integer types:
+ */
+#define fetch_or(ptr, mask) \
+ ({ \
+ typeof(ptr) _ptr = (ptr); \
+ typeof(mask) _mask = (mask); \
+ typeof(*_ptr) _val = *_ptr; \
+ \
+ do { \
+ } while (!try_cmpxchg(_ptr, &_val, _val | _mask)); \
+ _val; \
})
-#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
+#ifdef TIF_POLLING_NRFLAG
/*
* Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
* this avoids any races wrt polling state changes and thereby avoids
* spurious IPIs.
*/
-static bool set_nr_and_not_polling(struct task_struct *p)
+static inline bool set_nr_and_not_polling(struct thread_info *ti, int tif)
{
- struct thread_info *ti = task_thread_info(p);
- return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
+ return !(fetch_or(&ti->flags, 1 << tif) & _TIF_POLLING_NRFLAG);
}
/*
@@ -511,35 +977,117 @@ static bool set_nr_and_not_polling(struct task_struct *p)
static bool set_nr_if_polling(struct task_struct *p)
{
struct thread_info *ti = task_thread_info(p);
- typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags);
+ typeof(ti->flags) val = READ_ONCE(ti->flags);
- for (;;) {
+ do {
if (!(val & _TIF_POLLING_NRFLAG))
return false;
if (val & _TIF_NEED_RESCHED)
return true;
- old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
- if (old == val)
- break;
- val = old;
- }
+ } while (!try_cmpxchg(&ti->flags, &val, val | _TIF_NEED_RESCHED));
+
return true;
}
#else
-static bool set_nr_and_not_polling(struct task_struct *p)
+static inline bool set_nr_and_not_polling(struct thread_info *ti, int tif)
{
- set_tsk_need_resched(p);
+ set_ti_thread_flag(ti, tif);
return true;
}
-#ifdef CONFIG_SMP
-static bool set_nr_if_polling(struct task_struct *p)
+static inline bool set_nr_if_polling(struct task_struct *p)
{
return false;
}
#endif
-#endif
+
+static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task)
+{
+ struct wake_q_node *node = &task->wake_q;
+
+ /*
+ * Atomically grab the task, if ->wake_q is !nil already it means
+ * it's already queued (either by us or someone else) and will get the
+ * wakeup due to that.
+ *
+ * In order to ensure that a pending wakeup will observe our pending
+ * state, even in the failed case, an explicit smp_mb() must be used.
+ */
+ smp_mb__before_atomic();
+ if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL)))
+ return false;
+
+ /*
+ * The head is context local, there can be no concurrency.
+ */
+ *head->lastp = node;
+ head->lastp = &node->next;
+ return true;
+}
+
+/**
+ * wake_q_add() - queue a wakeup for 'later' waking.
+ * @head: the wake_q_head to add @task to
+ * @task: the task to queue for 'later' wakeup
+ *
+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the
+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
+ * instantly.
+ *
+ * This function must be used as-if it were wake_up_process(); IOW the task
+ * must be ready to be woken at this location.
+ */
+void wake_q_add(struct wake_q_head *head, struct task_struct *task)
+{
+ if (__wake_q_add(head, task))
+ get_task_struct(task);
+}
+
+/**
+ * wake_q_add_safe() - safely queue a wakeup for 'later' waking.
+ * @head: the wake_q_head to add @task to
+ * @task: the task to queue for 'later' wakeup
+ *
+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the
+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
+ * instantly.
+ *
+ * This function must be used as-if it were wake_up_process(); IOW the task
+ * must be ready to be woken at this location.
+ *
+ * This function is essentially a task-safe equivalent to wake_q_add(). Callers
+ * that already hold reference to @task can call the 'safe' version and trust
+ * wake_q to do the right thing depending whether or not the @task is already
+ * queued for wakeup.
+ */
+void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task)
+{
+ if (!__wake_q_add(head, task))
+ put_task_struct(task);
+}
+
+void wake_up_q(struct wake_q_head *head)
+{
+ struct wake_q_node *node = head->first;
+
+ while (node != WAKE_Q_TAIL) {
+ struct task_struct *task;
+
+ task = container_of(node, struct task_struct, wake_q);
+ node = node->next;
+ /* pairs with cmpxchg_relaxed() in __wake_q_add() */
+ WRITE_ONCE(task->wake_q.next, NULL);
+ /* Task can safely be re-inserted now. */
+
+ /*
+ * wake_up_process() executes a full barrier, which pairs with
+ * the queueing in wake_q_add() so as not to miss wakeups.
+ */
+ wake_up_process(task);
+ put_task_struct(task);
+ }
+}
/*
* resched_curr - mark rq's current task 'to be rescheduled now'.
@@ -548,28 +1096,76 @@ static bool set_nr_if_polling(struct task_struct *p)
* might also involve a cross-CPU call to trigger the scheduler on
* the target CPU.
*/
-void resched_curr(struct rq *rq)
+static void __resched_curr(struct rq *rq, int tif)
{
struct task_struct *curr = rq->curr;
+ struct thread_info *cti = task_thread_info(curr);
int cpu;
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
+
+ /*
+ * Always immediately preempt the idle task; no point in delaying doing
+ * actual work.
+ */
+ if (is_idle_task(curr) && tif == TIF_NEED_RESCHED_LAZY)
+ tif = TIF_NEED_RESCHED;
- if (test_tsk_need_resched(curr))
+ if (cti->flags & ((1 << tif) | _TIF_NEED_RESCHED))
return;
cpu = cpu_of(rq);
+ trace_sched_set_need_resched_tp(curr, cpu, tif);
if (cpu == smp_processor_id()) {
- set_tsk_need_resched(curr);
- set_preempt_need_resched();
+ set_ti_thread_flag(cti, tif);
+ if (tif == TIF_NEED_RESCHED)
+ set_preempt_need_resched();
return;
}
- if (set_nr_and_not_polling(curr))
- smp_send_reschedule(cpu);
- else
+ if (set_nr_and_not_polling(cti, tif)) {
+ if (tif == TIF_NEED_RESCHED)
+ smp_send_reschedule(cpu);
+ } else {
trace_sched_wake_idle_without_ipi(cpu);
+ }
+}
+
+void __trace_set_need_resched(struct task_struct *curr, int tif)
+{
+ trace_sched_set_need_resched_tp(curr, smp_processor_id(), tif);
+}
+
+void resched_curr(struct rq *rq)
+{
+ __resched_curr(rq, TIF_NEED_RESCHED);
+}
+
+#ifdef CONFIG_PREEMPT_DYNAMIC
+static DEFINE_STATIC_KEY_FALSE(sk_dynamic_preempt_lazy);
+static __always_inline bool dynamic_preempt_lazy(void)
+{
+ return static_branch_unlikely(&sk_dynamic_preempt_lazy);
+}
+#else
+static __always_inline bool dynamic_preempt_lazy(void)
+{
+ return IS_ENABLED(CONFIG_PREEMPT_LAZY);
+}
+#endif
+
+static __always_inline int get_lazy_tif_bit(void)
+{
+ if (dynamic_preempt_lazy())
+ return TIF_NEED_RESCHED_LAZY;
+
+ return TIF_NEED_RESCHED;
+}
+
+void resched_curr_lazy(struct rq *rq)
+{
+ __resched_curr(rq, get_lazy_tif_bit());
}
void resched_cpu(int cpu)
@@ -577,44 +1173,53 @@ void resched_cpu(int cpu)
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
- if (!raw_spin_trylock_irqsave(&rq->lock, flags))
- return;
- resched_curr(rq);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ raw_spin_rq_lock_irqsave(rq, flags);
+ if (cpu_online(cpu) || cpu == smp_processor_id())
+ resched_curr(rq);
+ raw_spin_rq_unlock_irqrestore(rq, flags);
}
-#ifdef CONFIG_SMP
#ifdef CONFIG_NO_HZ_COMMON
/*
- * In the semi idle case, use the nearest busy cpu for migrating timers
- * from an idle cpu. This is good for power-savings.
+ * In the semi idle case, use the nearest busy CPU for migrating timers
+ * from an idle CPU. This is good for power-savings.
*
* We don't do similar optimization for completely idle system, as
- * selecting an idle cpu will add more delays to the timers than intended
- * (as that cpu's timer base may not be uptodate wrt jiffies etc).
+ * selecting an idle CPU will add more delays to the timers than intended
+ * (as that CPU's timer base may not be up to date wrt jiffies etc).
*/
-int get_nohz_timer_target(int pinned)
+int get_nohz_timer_target(void)
{
- int cpu = smp_processor_id();
- int i;
+ int i, cpu = smp_processor_id(), default_cpu = -1;
struct sched_domain *sd;
+ const struct cpumask *hk_mask;
- if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu))
- return cpu;
+ if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE)) {
+ if (!idle_cpu(cpu))
+ return cpu;
+ default_cpu = cpu;
+ }
+
+ hk_mask = housekeeping_cpumask(HK_TYPE_KERNEL_NOISE);
+
+ guard(rcu)();
- rcu_read_lock();
for_each_domain(cpu, sd) {
- for_each_cpu(i, sched_domain_span(sd)) {
- if (!idle_cpu(i)) {
- cpu = i;
- goto unlock;
- }
+ for_each_cpu_and(i, sched_domain_span(sd), hk_mask) {
+ if (cpu == i)
+ continue;
+
+ if (!idle_cpu(i))
+ return i;
}
}
-unlock:
- rcu_read_unlock();
- return cpu;
+
+ if (default_cpu == -1)
+ default_cpu = housekeeping_any_cpu(HK_TYPE_KERNEL_NOISE);
+
+ return default_cpu;
}
+
/*
* When add_timer_on() enqueues a timer into the timer wheel of an
* idle CPU then this timer might expire before the next timer event
@@ -632,7 +1237,29 @@ static void wake_up_idle_cpu(int cpu)
if (cpu == smp_processor_id())
return;
- if (set_nr_and_not_polling(rq->idle))
+ /*
+ * Set TIF_NEED_RESCHED and send an IPI if in the non-polling
+ * part of the idle loop. This forces an exit from the idle loop
+ * and a round trip to schedule(). Now this could be optimized
+ * because a simple new idle loop iteration is enough to
+ * re-evaluate the next tick. Provided some re-ordering of tick
+ * nohz functions that would need to follow TIF_NR_POLLING
+ * clearing:
+ *
+ * - On most architectures, a simple fetch_or on ti::flags with a
+ * "0" value would be enough to know if an IPI needs to be sent.
+ *
+ * - x86 needs to perform a last need_resched() check between
+ * monitor and mwait which doesn't take timers into account.
+ * There a dedicated TIF_TIMER flag would be required to
+ * fetch_or here and be checked along with TIF_NEED_RESCHED
+ * before mwait().
+ *
+ * However, remote timer enqueue is not such a frequent event
+ * and testing of the above solutions didn't appear to report
+ * much benefits.
+ */
+ if (set_nr_and_not_polling(task_thread_info(rq->idle), TIF_NEED_RESCHED))
smp_send_reschedule(cpu);
else
trace_sched_wake_idle_without_ipi(cpu);
@@ -646,6 +1273,8 @@ static bool wake_up_full_nohz_cpu(int cpu)
* If needed we can still optimize that later with an
* empty IRQ.
*/
+ if (cpu_is_offline(cpu))
+ return true; /* Don't try to wake offline CPUs. */
if (tick_nohz_full_cpu(cpu)) {
if (cpu != smp_processor_id() ||
tick_nohz_tick_stopped())
@@ -656,91 +1285,108 @@ static bool wake_up_full_nohz_cpu(int cpu)
return false;
}
+/*
+ * Wake up the specified CPU. If the CPU is going offline, it is the
+ * caller's responsibility to deal with the lost wakeup, for example,
+ * by hooking into the CPU_DEAD notifier like timers and hrtimers do.
+ */
void wake_up_nohz_cpu(int cpu)
{
if (!wake_up_full_nohz_cpu(cpu))
wake_up_idle_cpu(cpu);
}
-static inline bool got_nohz_idle_kick(void)
+static void nohz_csd_func(void *info)
{
- int cpu = smp_processor_id();
-
- if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)))
- return false;
-
- if (idle_cpu(cpu) && !need_resched())
- return true;
+ struct rq *rq = info;
+ int cpu = cpu_of(rq);
+ unsigned int flags;
/*
- * We can't run Idle Load Balance on this CPU for this time so we
- * cancel it and clear NOHZ_BALANCE_KICK
+ * Release the rq::nohz_csd.
*/
- clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
- return false;
-}
+ flags = atomic_fetch_andnot(NOHZ_KICK_MASK | NOHZ_NEWILB_KICK, nohz_flags(cpu));
+ WARN_ON(!(flags & NOHZ_KICK_MASK));
-#else /* CONFIG_NO_HZ_COMMON */
-
-static inline bool got_nohz_idle_kick(void)
-{
- return false;
+ rq->idle_balance = idle_cpu(cpu);
+ if (rq->idle_balance) {
+ rq->nohz_idle_balance = flags;
+ __raise_softirq_irqoff(SCHED_SOFTIRQ);
+ }
}
#endif /* CONFIG_NO_HZ_COMMON */
#ifdef CONFIG_NO_HZ_FULL
-bool sched_can_stop_tick(void)
+static inline bool __need_bw_check(struct rq *rq, struct task_struct *p)
{
+ if (rq->nr_running != 1)
+ return false;
+
+ if (p->sched_class != &fair_sched_class)
+ return false;
+
+ if (!task_on_rq_queued(p))
+ return false;
+
+ return true;
+}
+
+bool sched_can_stop_tick(struct rq *rq)
+{
+ int fifo_nr_running;
+
+ /* Deadline tasks, even if single, need the tick */
+ if (rq->dl.dl_nr_running)
+ return false;
+
+ /*
+ * If there are more than one RR tasks, we need the tick to affect the
+ * actual RR behaviour.
+ */
+ if (rq->rt.rr_nr_running) {
+ if (rq->rt.rr_nr_running == 1)
+ return true;
+ else
+ return false;
+ }
+
/*
- * FIFO realtime policy runs the highest priority task. Other runnable
- * tasks are of a lower priority. The scheduler tick does nothing.
+ * If there's no RR tasks, but FIFO tasks, we can skip the tick, no
+ * forced preemption between FIFO tasks.
*/
- if (current->policy == SCHED_FIFO)
+ fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
+ if (fifo_nr_running)
return true;
/*
- * Round-robin realtime tasks time slice with other tasks at the same
- * realtime priority. Is this task the only one at this priority?
+ * If there are no DL,RR/FIFO tasks, there must only be CFS or SCX tasks
+ * left. For CFS, if there's more than one we need the tick for
+ * involuntary preemption. For SCX, ask.
*/
- if (current->policy == SCHED_RR) {
- struct sched_rt_entity *rt_se = &current->rt;
+ if (scx_enabled() && !scx_can_stop_tick(rq))
+ return false;
- return rt_se->run_list.prev == rt_se->run_list.next;
- }
+ if (rq->cfs.h_nr_queued > 1)
+ return false;
/*
- * More than one running task need preemption.
- * nr_running update is assumed to be visible
- * after IPI is sent from wakers.
+ * If there is one task and it has CFS runtime bandwidth constraints
+ * and it's on the cpu now we don't want to stop the tick.
+ * This check prevents clearing the bit if a newly enqueued task here is
+ * dequeued by migrating while the constrained task continues to run.
+ * E.g. going from 2->1 without going through pick_next_task().
*/
- if (this_rq()->nr_running > 1)
- return false;
+ if (__need_bw_check(rq, rq->curr)) {
+ if (cfs_task_bw_constrained(rq->curr))
+ return false;
+ }
return true;
}
#endif /* CONFIG_NO_HZ_FULL */
-void sched_avg_update(struct rq *rq)
-{
- s64 period = sched_avg_period();
-
- while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {
- /*
- * Inline assembly required to prevent the compiler
- * optimising this loop into a divmod call.
- * See __iter_div_u64_rem() for another example of this.
- */
- asm("" : "+rm" (rq->age_stamp));
- rq->age_stamp += period;
- rq->rt_avg /= 2;
- }
-}
-
-#endif /* CONFIG_SMP */
-
-#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
- (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
+#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_FAIR_GROUP_SCHED)
/*
* Iterate task_group tree rooted at *from, calling @down when first entering a
* node and @up when leaving it for the final time.
@@ -784,395 +1430,795 @@ int tg_nop(struct task_group *tg, void *data)
}
#endif
-static void set_load_weight(struct task_struct *p)
+void set_load_weight(struct task_struct *p, bool update_load)
{
int prio = p->static_prio - MAX_RT_PRIO;
- struct load_weight *load = &p->se.load;
+ struct load_weight lw;
+
+ if (task_has_idle_policy(p)) {
+ lw.weight = scale_load(WEIGHT_IDLEPRIO);
+ lw.inv_weight = WMULT_IDLEPRIO;
+ } else {
+ lw.weight = scale_load(sched_prio_to_weight[prio]);
+ lw.inv_weight = sched_prio_to_wmult[prio];
+ }
/*
- * SCHED_IDLE tasks get minimal weight:
+ * SCHED_OTHER tasks have to update their load when changing their
+ * weight
*/
- if (p->policy == SCHED_IDLE) {
- load->weight = scale_load(WEIGHT_IDLEPRIO);
- load->inv_weight = WMULT_IDLEPRIO;
+ if (update_load && p->sched_class->reweight_task)
+ p->sched_class->reweight_task(task_rq(p), p, &lw);
+ else
+ p->se.load = lw;
+}
+
+#ifdef CONFIG_UCLAMP_TASK
+/*
+ * Serializes updates of utilization clamp values
+ *
+ * The (slow-path) user-space triggers utilization clamp value updates which
+ * can require updates on (fast-path) scheduler's data structures used to
+ * support enqueue/dequeue operations.
+ * While the per-CPU rq lock protects fast-path update operations, user-space
+ * requests are serialized using a mutex to reduce the risk of conflicting
+ * updates or API abuses.
+ */
+static __maybe_unused DEFINE_MUTEX(uclamp_mutex);
+
+/* Max allowed minimum utilization */
+static unsigned int __maybe_unused sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
+
+/* Max allowed maximum utilization */
+static unsigned int __maybe_unused sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
+
+/*
+ * By default RT tasks run at the maximum performance point/capacity of the
+ * system. Uclamp enforces this by always setting UCLAMP_MIN of RT tasks to
+ * SCHED_CAPACITY_SCALE.
+ *
+ * This knob allows admins to change the default behavior when uclamp is being
+ * used. In battery powered devices, particularly, running at the maximum
+ * capacity and frequency will increase energy consumption and shorten the
+ * battery life.
+ *
+ * This knob only affects RT tasks that their uclamp_se->user_defined == false.
+ *
+ * This knob will not override the system default sched_util_clamp_min defined
+ * above.
+ */
+unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE;
+
+/* All clamps are required to be less or equal than these values */
+static struct uclamp_se uclamp_default[UCLAMP_CNT];
+
+/*
+ * This static key is used to reduce the uclamp overhead in the fast path. It
+ * primarily disables the call to uclamp_rq_{inc, dec}() in
+ * enqueue/dequeue_task().
+ *
+ * This allows users to continue to enable uclamp in their kernel config with
+ * minimum uclamp overhead in the fast path.
+ *
+ * As soon as userspace modifies any of the uclamp knobs, the static key is
+ * enabled, since we have an actual users that make use of uclamp
+ * functionality.
+ *
+ * The knobs that would enable this static key are:
+ *
+ * * A task modifying its uclamp value with sched_setattr().
+ * * An admin modifying the sysctl_sched_uclamp_{min, max} via procfs.
+ * * An admin modifying the cgroup cpu.uclamp.{min, max}
+ */
+DEFINE_STATIC_KEY_FALSE(sched_uclamp_used);
+
+static inline unsigned int
+uclamp_idle_value(struct rq *rq, enum uclamp_id clamp_id,
+ unsigned int clamp_value)
+{
+ /*
+ * Avoid blocked utilization pushing up the frequency when we go
+ * idle (which drops the max-clamp) by retaining the last known
+ * max-clamp.
+ */
+ if (clamp_id == UCLAMP_MAX) {
+ rq->uclamp_flags |= UCLAMP_FLAG_IDLE;
+ return clamp_value;
+ }
+
+ return uclamp_none(UCLAMP_MIN);
+}
+
+static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
+ unsigned int clamp_value)
+{
+ /* Reset max-clamp retention only on idle exit */
+ if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE))
return;
+
+ uclamp_rq_set(rq, clamp_id, clamp_value);
+}
+
+static inline
+unsigned int uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
+ unsigned int clamp_value)
+{
+ struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
+ int bucket_id = UCLAMP_BUCKETS - 1;
+
+ /*
+ * Since both min and max clamps are max aggregated, find the
+ * top most bucket with tasks in.
+ */
+ for ( ; bucket_id >= 0; bucket_id--) {
+ if (!bucket[bucket_id].tasks)
+ continue;
+ return bucket[bucket_id].value;
}
- load->weight = scale_load(prio_to_weight[prio]);
- load->inv_weight = prio_to_wmult[prio];
+ /* No tasks -- default clamp values */
+ return uclamp_idle_value(rq, clamp_id, clamp_value);
}
-static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
+static void __uclamp_update_util_min_rt_default(struct task_struct *p)
{
- update_rq_clock(rq);
- sched_info_queued(rq, p);
- p->sched_class->enqueue_task(rq, p, flags);
+ unsigned int default_util_min;
+ struct uclamp_se *uc_se;
+
+ lockdep_assert_held(&p->pi_lock);
+
+ uc_se = &p->uclamp_req[UCLAMP_MIN];
+
+ /* Only sync if user didn't override the default */
+ if (uc_se->user_defined)
+ return;
+
+ default_util_min = sysctl_sched_uclamp_util_min_rt_default;
+ uclamp_se_set(uc_se, default_util_min, false);
}
-static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
+static void uclamp_update_util_min_rt_default(struct task_struct *p)
{
- update_rq_clock(rq);
- sched_info_dequeued(rq, p);
- p->sched_class->dequeue_task(rq, p, flags);
+ if (!rt_task(p))
+ return;
+
+ /* Protect updates to p->uclamp_* */
+ guard(task_rq_lock)(p);
+ __uclamp_update_util_min_rt_default(p);
}
-void activate_task(struct rq *rq, struct task_struct *p, int flags)
+static inline struct uclamp_se
+uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
{
- if (task_contributes_to_load(p))
- rq->nr_uninterruptible--;
+ /* Copy by value as we could modify it */
+ struct uclamp_se uc_req = p->uclamp_req[clamp_id];
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+ unsigned int tg_min, tg_max, value;
- enqueue_task(rq, p, flags);
+ /*
+ * Tasks in autogroups or root task group will be
+ * restricted by system defaults.
+ */
+ if (task_group_is_autogroup(task_group(p)))
+ return uc_req;
+ if (task_group(p) == &root_task_group)
+ return uc_req;
+
+ tg_min = task_group(p)->uclamp[UCLAMP_MIN].value;
+ tg_max = task_group(p)->uclamp[UCLAMP_MAX].value;
+ value = uc_req.value;
+ value = clamp(value, tg_min, tg_max);
+ uclamp_se_set(&uc_req, value, false);
+#endif
+
+ return uc_req;
}
-void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
+/*
+ * The effective clamp bucket index of a task depends on, by increasing
+ * priority:
+ * - the task specific clamp value, when explicitly requested from userspace
+ * - the task group effective clamp value, for tasks not either in the root
+ * group or in an autogroup
+ * - the system default clamp value, defined by the sysadmin
+ */
+static inline struct uclamp_se
+uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id)
{
- if (task_contributes_to_load(p))
- rq->nr_uninterruptible++;
+ struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id);
+ struct uclamp_se uc_max = uclamp_default[clamp_id];
- dequeue_task(rq, p, flags);
+ /* System default restrictions always apply */
+ if (unlikely(uc_req.value > uc_max.value))
+ return uc_max;
+
+ return uc_req;
}
-static void update_rq_clock_task(struct rq *rq, s64 delta)
+unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
{
+ struct uclamp_se uc_eff;
+
+ /* Task currently refcounted: use back-annotated (effective) value */
+ if (p->uclamp[clamp_id].active)
+ return (unsigned long)p->uclamp[clamp_id].value;
+
+ uc_eff = uclamp_eff_get(p, clamp_id);
+
+ return (unsigned long)uc_eff.value;
+}
+
/*
- * In theory, the compile should just see 0 here, and optimize out the call
- * to sched_rt_avg_update. But I don't trust it...
+ * When a task is enqueued on a rq, the clamp bucket currently defined by the
+ * task's uclamp::bucket_id is refcounted on that rq. This also immediately
+ * updates the rq's clamp value if required.
+ *
+ * Tasks can have a task-specific value requested from user-space, track
+ * within each bucket the maximum value for tasks refcounted in it.
+ * This "local max aggregation" allows to track the exact "requested" value
+ * for each bucket when all its RUNNABLE tasks require the same clamp.
*/
-#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
- s64 steal = 0, irq_delta = 0;
-#endif
-#ifdef CONFIG_IRQ_TIME_ACCOUNTING
- irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
+static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
+ enum uclamp_id clamp_id)
+{
+ struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
+ struct uclamp_se *uc_se = &p->uclamp[clamp_id];
+ struct uclamp_bucket *bucket;
+
+ lockdep_assert_rq_held(rq);
+
+ /* Update task effective clamp */
+ p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id);
+
+ bucket = &uc_rq->bucket[uc_se->bucket_id];
+ bucket->tasks++;
+ uc_se->active = true;
+
+ uclamp_idle_reset(rq, clamp_id, uc_se->value);
+
+ /*
+ * Local max aggregation: rq buckets always track the max
+ * "requested" clamp value of its RUNNABLE tasks.
+ */
+ if (bucket->tasks == 1 || uc_se->value > bucket->value)
+ bucket->value = uc_se->value;
+
+ if (uc_se->value > uclamp_rq_get(rq, clamp_id))
+ uclamp_rq_set(rq, clamp_id, uc_se->value);
+}
+
+/*
+ * When a task is dequeued from a rq, the clamp bucket refcounted by the task
+ * is released. If this is the last task reference counting the rq's max
+ * active clamp value, then the rq's clamp value is updated.
+ *
+ * Both refcounted tasks and rq's cached clamp values are expected to be
+ * always valid. If it's detected they are not, as defensive programming,
+ * enforce the expected state and warn.
+ */
+static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
+ enum uclamp_id clamp_id)
+{
+ struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
+ struct uclamp_se *uc_se = &p->uclamp[clamp_id];
+ struct uclamp_bucket *bucket;
+ unsigned int bkt_clamp;
+ unsigned int rq_clamp;
+
+ lockdep_assert_rq_held(rq);
/*
- * Since irq_time is only updated on {soft,}irq_exit, we might run into
- * this case when a previous update_rq_clock() happened inside a
- * {soft,}irq region.
+ * If sched_uclamp_used was enabled after task @p was enqueued,
+ * we could end up with unbalanced call to uclamp_rq_dec_id().
+ *
+ * In this case the uc_se->active flag should be false since no uclamp
+ * accounting was performed at enqueue time and we can just return
+ * here.
*
- * When this happens, we stop ->clock_task and only update the
- * prev_irq_time stamp to account for the part that fit, so that a next
- * update will consume the rest. This ensures ->clock_task is
- * monotonic.
+ * Need to be careful of the following enqueue/dequeue ordering
+ * problem too
*
- * It does however cause some slight miss-attribution of {soft,}irq
- * time, a more accurate solution would be to update the irq_time using
- * the current rq->clock timestamp, except that would require using
- * atomic ops.
+ * enqueue(taskA)
+ * // sched_uclamp_used gets enabled
+ * enqueue(taskB)
+ * dequeue(taskA)
+ * // Must not decrement bucket->tasks here
+ * dequeue(taskB)
+ *
+ * where we could end up with stale data in uc_se and
+ * bucket[uc_se->bucket_id].
+ *
+ * The following check here eliminates the possibility of such race.
*/
- if (irq_delta > delta)
- irq_delta = delta;
+ if (unlikely(!uc_se->active))
+ return;
- rq->prev_irq_time += irq_delta;
- delta -= irq_delta;
-#endif
-#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
- if (static_key_false((&paravirt_steal_rq_enabled))) {
- steal = paravirt_steal_clock(cpu_of(rq));
- steal -= rq->prev_steal_time_rq;
+ bucket = &uc_rq->bucket[uc_se->bucket_id];
- if (unlikely(steal > delta))
- steal = delta;
+ WARN_ON_ONCE(!bucket->tasks);
+ if (likely(bucket->tasks))
+ bucket->tasks--;
- rq->prev_steal_time_rq += steal;
- delta -= steal;
- }
-#endif
+ uc_se->active = false;
- rq->clock_task += delta;
+ /*
+ * Keep "local max aggregation" simple and accept to (possibly)
+ * overboost some RUNNABLE tasks in the same bucket.
+ * The rq clamp bucket value is reset to its base value whenever
+ * there are no more RUNNABLE tasks refcounting it.
+ */
+ if (likely(bucket->tasks))
+ return;
-#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
- if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
- sched_rt_avg_update(rq, irq_delta + steal);
-#endif
+ rq_clamp = uclamp_rq_get(rq, clamp_id);
+ /*
+ * Defensive programming: this should never happen. If it happens,
+ * e.g. due to future modification, warn and fix up the expected value.
+ */
+ WARN_ON_ONCE(bucket->value > rq_clamp);
+ if (bucket->value >= rq_clamp) {
+ bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value);
+ uclamp_rq_set(rq, clamp_id, bkt_clamp);
+ }
}
-void sched_set_stop_task(int cpu, struct task_struct *stop)
+static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p, int flags)
{
- struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
- struct task_struct *old_stop = cpu_rq(cpu)->stop;
+ enum uclamp_id clamp_id;
- if (stop) {
- /*
- * Make it appear like a SCHED_FIFO task, its something
- * userspace knows about and won't get confused about.
- *
- * Also, it will make PI more or less work without too
- * much confusion -- but then, stop work should not
- * rely on PI working anyway.
- */
- sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
+ /*
+ * Avoid any overhead until uclamp is actually used by the userspace.
+ *
+ * The condition is constructed such that a NOP is generated when
+ * sched_uclamp_used is disabled.
+ */
+ if (!uclamp_is_used())
+ return;
- stop->sched_class = &stop_sched_class;
- }
+ if (unlikely(!p->sched_class->uclamp_enabled))
+ return;
- cpu_rq(cpu)->stop = stop;
+ /* Only inc the delayed task which being woken up. */
+ if (p->se.sched_delayed && !(flags & ENQUEUE_DELAYED))
+ return;
- if (old_stop) {
- /*
- * Reset it back to a normal scheduling class so that
- * it can die in pieces.
- */
- old_stop->sched_class = &rt_sched_class;
- }
+ for_each_clamp_id(clamp_id)
+ uclamp_rq_inc_id(rq, p, clamp_id);
+
+ /* Reset clamp idle holding when there is one RUNNABLE task */
+ if (rq->uclamp_flags & UCLAMP_FLAG_IDLE)
+ rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
}
-/*
- * __normal_prio - return the priority that is based on the static prio
- */
-static inline int __normal_prio(struct task_struct *p)
+static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
{
- return p->static_prio;
+ enum uclamp_id clamp_id;
+
+ /*
+ * Avoid any overhead until uclamp is actually used by the userspace.
+ *
+ * The condition is constructed such that a NOP is generated when
+ * sched_uclamp_used is disabled.
+ */
+ if (!uclamp_is_used())
+ return;
+
+ if (unlikely(!p->sched_class->uclamp_enabled))
+ return;
+
+ if (p->se.sched_delayed)
+ return;
+
+ for_each_clamp_id(clamp_id)
+ uclamp_rq_dec_id(rq, p, clamp_id);
}
-/*
- * Calculate the expected normal priority: i.e. priority
- * without taking RT-inheritance into account. Might be
- * boosted by interactivity modifiers. Changes upon fork,
- * setprio syscalls, and whenever the interactivity
- * estimator recalculates.
- */
-static inline int normal_prio(struct task_struct *p)
+static inline void uclamp_rq_reinc_id(struct rq *rq, struct task_struct *p,
+ enum uclamp_id clamp_id)
{
- int prio;
+ if (!p->uclamp[clamp_id].active)
+ return;
- if (task_has_dl_policy(p))
- prio = MAX_DL_PRIO-1;
- else if (task_has_rt_policy(p))
- prio = MAX_RT_PRIO-1 - p->rt_priority;
- else
- prio = __normal_prio(p);
- return prio;
+ uclamp_rq_dec_id(rq, p, clamp_id);
+ uclamp_rq_inc_id(rq, p, clamp_id);
+
+ /*
+ * Make sure to clear the idle flag if we've transiently reached 0
+ * active tasks on rq.
+ */
+ if (clamp_id == UCLAMP_MAX && (rq->uclamp_flags & UCLAMP_FLAG_IDLE))
+ rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
}
-/*
- * Calculate the current priority, i.e. the priority
- * taken into account by the scheduler. This value might
- * be boosted by RT tasks, or might be boosted by
- * interactivity modifiers. Will be RT if the task got
- * RT-boosted. If not then it returns p->normal_prio.
- */
-static int effective_prio(struct task_struct *p)
+static inline void
+uclamp_update_active(struct task_struct *p)
{
- p->normal_prio = normal_prio(p);
+ enum uclamp_id clamp_id;
+ struct rq_flags rf;
+ struct rq *rq;
+
+ /*
+ * Lock the task and the rq where the task is (or was) queued.
+ *
+ * We might lock the (previous) rq of a !RUNNABLE task, but that's the
+ * price to pay to safely serialize util_{min,max} updates with
+ * enqueues, dequeues and migration operations.
+ * This is the same locking schema used by __set_cpus_allowed_ptr().
+ */
+ rq = task_rq_lock(p, &rf);
+
/*
- * If we are RT tasks or we were boosted to RT priority,
- * keep the priority unchanged. Otherwise, update priority
- * to the normal priority:
+ * Setting the clamp bucket is serialized by task_rq_lock().
+ * If the task is not yet RUNNABLE and its task_struct is not
+ * affecting a valid clamp bucket, the next time it's enqueued,
+ * it will already see the updated clamp bucket value.
*/
- if (!rt_prio(p->prio))
- return p->normal_prio;
- return p->prio;
+ for_each_clamp_id(clamp_id)
+ uclamp_rq_reinc_id(rq, p, clamp_id);
+
+ task_rq_unlock(rq, p, &rf);
}
-/**
- * task_curr - is this task currently executing on a CPU?
- * @p: the task in question.
- *
- * Return: 1 if the task is currently executing. 0 otherwise.
- */
-inline int task_curr(const struct task_struct *p)
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+static inline void
+uclamp_update_active_tasks(struct cgroup_subsys_state *css)
{
- return cpu_curr(task_cpu(p)) == p;
+ struct css_task_iter it;
+ struct task_struct *p;
+
+ css_task_iter_start(css, 0, &it);
+ while ((p = css_task_iter_next(&it)))
+ uclamp_update_active(p);
+ css_task_iter_end(&it);
}
-/*
- * Can drop rq->lock because from sched_class::switched_from() methods drop it.
- */
-static inline void check_class_changed(struct rq *rq, struct task_struct *p,
- const struct sched_class *prev_class,
- int oldprio)
+static void cpu_util_update_eff(struct cgroup_subsys_state *css);
+#endif
+
+#ifdef CONFIG_SYSCTL
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+static void uclamp_update_root_tg(void)
{
- if (prev_class != p->sched_class) {
- if (prev_class->switched_from)
- prev_class->switched_from(rq, p);
- /* Possble rq->lock 'hole'. */
- p->sched_class->switched_to(rq, p);
- } else if (oldprio != p->prio || dl_task(p))
- p->sched_class->prio_changed(rq, p, oldprio);
+ struct task_group *tg = &root_task_group;
+
+ uclamp_se_set(&tg->uclamp_req[UCLAMP_MIN],
+ sysctl_sched_uclamp_util_min, false);
+ uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX],
+ sysctl_sched_uclamp_util_max, false);
+
+ guard(rcu)();
+ cpu_util_update_eff(&root_task_group.css);
}
+#else
+static void uclamp_update_root_tg(void) { }
+#endif
-void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
+static void uclamp_sync_util_min_rt_default(void)
{
- const struct sched_class *class;
-
- if (p->sched_class == rq->curr->sched_class) {
- rq->curr->sched_class->check_preempt_curr(rq, p, flags);
- } else {
- for_each_class(class) {
- if (class == rq->curr->sched_class)
- break;
- if (class == p->sched_class) {
- resched_curr(rq);
- break;
- }
- }
- }
+ struct task_struct *g, *p;
/*
- * A queue event has occurred, and we're going to schedule. In
- * this case, we can save a useless back to back clock update.
+ * copy_process() sysctl_uclamp
+ * uclamp_min_rt = X;
+ * write_lock(&tasklist_lock) read_lock(&tasklist_lock)
+ * // link thread smp_mb__after_spinlock()
+ * write_unlock(&tasklist_lock) read_unlock(&tasklist_lock);
+ * sched_post_fork() for_each_process_thread()
+ * __uclamp_sync_rt() __uclamp_sync_rt()
+ *
+ * Ensures that either sched_post_fork() will observe the new
+ * uclamp_min_rt or for_each_process_thread() will observe the new
+ * task.
*/
- if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
- rq_clock_skip_update(rq, true);
+ read_lock(&tasklist_lock);
+ smp_mb__after_spinlock();
+ read_unlock(&tasklist_lock);
+
+ guard(rcu)();
+ for_each_process_thread(g, p)
+ uclamp_update_util_min_rt_default(p);
}
-#ifdef CONFIG_SMP
-void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
+static int sysctl_sched_uclamp_handler(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
-#ifdef CONFIG_SCHED_DEBUG
+ bool update_root_tg = false;
+ int old_min, old_max, old_min_rt;
+ int result;
+
+ guard(mutex)(&uclamp_mutex);
+
+ old_min = sysctl_sched_uclamp_util_min;
+ old_max = sysctl_sched_uclamp_util_max;
+ old_min_rt = sysctl_sched_uclamp_util_min_rt_default;
+
+ result = proc_dointvec(table, write, buffer, lenp, ppos);
+ if (result)
+ goto undo;
+ if (!write)
+ return 0;
+
+ if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max ||
+ sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE ||
+ sysctl_sched_uclamp_util_min_rt_default > SCHED_CAPACITY_SCALE) {
+
+ result = -EINVAL;
+ goto undo;
+ }
+
+ if (old_min != sysctl_sched_uclamp_util_min) {
+ uclamp_se_set(&uclamp_default[UCLAMP_MIN],
+ sysctl_sched_uclamp_util_min, false);
+ update_root_tg = true;
+ }
+ if (old_max != sysctl_sched_uclamp_util_max) {
+ uclamp_se_set(&uclamp_default[UCLAMP_MAX],
+ sysctl_sched_uclamp_util_max, false);
+ update_root_tg = true;
+ }
+
+ if (update_root_tg) {
+ sched_uclamp_enable();
+ uclamp_update_root_tg();
+ }
+
+ if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) {
+ sched_uclamp_enable();
+ uclamp_sync_util_min_rt_default();
+ }
+
/*
- * We should never call set_task_cpu() on a blocked task,
- * ttwu() will sort out the placement.
+ * We update all RUNNABLE tasks only when task groups are in use.
+ * Otherwise, keep it simple and do just a lazy update at each next
+ * task enqueue time.
*/
- WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
- !p->on_rq);
+ return 0;
+
+undo:
+ sysctl_sched_uclamp_util_min = old_min;
+ sysctl_sched_uclamp_util_max = old_max;
+ sysctl_sched_uclamp_util_min_rt_default = old_min_rt;
+ return result;
+}
+#endif /* CONFIG_SYSCTL */
+
+static void uclamp_fork(struct task_struct *p)
+{
+ enum uclamp_id clamp_id;
-#ifdef CONFIG_LOCKDEP
/*
- * The caller should hold either p->pi_lock or rq->lock, when changing
- * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
- *
- * sched_move_task() holds both and thus holding either pins the cgroup,
- * see task_group().
- *
- * Furthermore, all task_rq users should acquire both locks, see
- * task_rq_lock().
+ * We don't need to hold task_rq_lock() when updating p->uclamp_* here
+ * as the task is still at its early fork stages.
*/
- WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
- lockdep_is_held(&task_rq(p)->lock)));
-#endif
-#endif
+ for_each_clamp_id(clamp_id)
+ p->uclamp[clamp_id].active = false;
- trace_sched_migrate_task(p, new_cpu);
+ if (likely(!p->sched_reset_on_fork))
+ return;
- if (task_cpu(p) != new_cpu) {
- if (p->sched_class->migrate_task_rq)
- p->sched_class->migrate_task_rq(p, new_cpu);
- p->se.nr_migrations++;
- perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
+ for_each_clamp_id(clamp_id) {
+ uclamp_se_set(&p->uclamp_req[clamp_id],
+ uclamp_none(clamp_id), false);
}
+}
- __set_task_cpu(p, new_cpu);
+static void uclamp_post_fork(struct task_struct *p)
+{
+ uclamp_update_util_min_rt_default(p);
}
-static void __migrate_swap_task(struct task_struct *p, int cpu)
+static void __init init_uclamp_rq(struct rq *rq)
{
- if (task_on_rq_queued(p)) {
- struct rq *src_rq, *dst_rq;
+ enum uclamp_id clamp_id;
+ struct uclamp_rq *uc_rq = rq->uclamp;
- src_rq = task_rq(p);
- dst_rq = cpu_rq(cpu);
+ for_each_clamp_id(clamp_id) {
+ uc_rq[clamp_id] = (struct uclamp_rq) {
+ .value = uclamp_none(clamp_id)
+ };
+ }
- deactivate_task(src_rq, p, 0);
- set_task_cpu(p, cpu);
- activate_task(dst_rq, p, 0);
- check_preempt_curr(dst_rq, p, 0);
- } else {
- /*
- * Task isn't running anymore; make it appear like we migrated
- * it before it went to sleep. This means on wakeup we make the
- * previous cpu our targer instead of where it really is.
- */
- p->wake_cpu = cpu;
+ rq->uclamp_flags = UCLAMP_FLAG_IDLE;
+}
+
+static void __init init_uclamp(void)
+{
+ struct uclamp_se uc_max = {};
+ enum uclamp_id clamp_id;
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ init_uclamp_rq(cpu_rq(cpu));
+
+ for_each_clamp_id(clamp_id) {
+ uclamp_se_set(&init_task.uclamp_req[clamp_id],
+ uclamp_none(clamp_id), false);
+ }
+
+ /* System defaults allow max clamp values for both indexes */
+ uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false);
+ for_each_clamp_id(clamp_id) {
+ uclamp_default[clamp_id] = uc_max;
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+ root_task_group.uclamp_req[clamp_id] = uc_max;
+ root_task_group.uclamp[clamp_id] = uc_max;
+#endif
}
}
-struct migration_swap_arg {
- struct task_struct *src_task, *dst_task;
- int src_cpu, dst_cpu;
-};
+#else /* !CONFIG_UCLAMP_TASK: */
+static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p, int flags) { }
+static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { }
+static inline void uclamp_fork(struct task_struct *p) { }
+static inline void uclamp_post_fork(struct task_struct *p) { }
+static inline void init_uclamp(void) { }
+#endif /* !CONFIG_UCLAMP_TASK */
-static int migrate_swap_stop(void *data)
+bool sched_task_on_rq(struct task_struct *p)
{
- struct migration_swap_arg *arg = data;
- struct rq *src_rq, *dst_rq;
- int ret = -EAGAIN;
+ return task_on_rq_queued(p);
+}
- src_rq = cpu_rq(arg->src_cpu);
- dst_rq = cpu_rq(arg->dst_cpu);
+unsigned long get_wchan(struct task_struct *p)
+{
+ unsigned long ip = 0;
+ unsigned int state;
- double_raw_lock(&arg->src_task->pi_lock,
- &arg->dst_task->pi_lock);
- double_rq_lock(src_rq, dst_rq);
- if (task_cpu(arg->dst_task) != arg->dst_cpu)
- goto unlock;
+ if (!p || p == current)
+ return 0;
- if (task_cpu(arg->src_task) != arg->src_cpu)
- goto unlock;
+ /* Only get wchan if task is blocked and we can keep it that way. */
+ raw_spin_lock_irq(&p->pi_lock);
+ state = READ_ONCE(p->__state);
+ smp_rmb(); /* see try_to_wake_up() */
+ if (state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq)
+ ip = __get_wchan(p);
+ raw_spin_unlock_irq(&p->pi_lock);
- if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task)))
- goto unlock;
+ return ip;
+}
- if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task)))
- goto unlock;
+void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
+{
+ if (!(flags & ENQUEUE_NOCLOCK))
+ update_rq_clock(rq);
- __migrate_swap_task(arg->src_task, arg->dst_cpu);
- __migrate_swap_task(arg->dst_task, arg->src_cpu);
+ /*
+ * Can be before ->enqueue_task() because uclamp considers the
+ * ENQUEUE_DELAYED task before its ->sched_delayed gets cleared
+ * in ->enqueue_task().
+ */
+ uclamp_rq_inc(rq, p, flags);
- ret = 0;
+ rq->queue_mask |= p->sched_class->queue_mask;
+ p->sched_class->enqueue_task(rq, p, flags);
-unlock:
- double_rq_unlock(src_rq, dst_rq);
- raw_spin_unlock(&arg->dst_task->pi_lock);
- raw_spin_unlock(&arg->src_task->pi_lock);
+ psi_enqueue(p, flags);
- return ret;
+ if (!(flags & ENQUEUE_RESTORE))
+ sched_info_enqueue(rq, p);
+
+ if (sched_core_enabled(rq))
+ sched_core_enqueue(rq, p);
}
/*
- * Cross migrate two tasks
+ * Must only return false when DEQUEUE_SLEEP.
*/
-int migrate_swap(struct task_struct *cur, struct task_struct *p)
+inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
- struct migration_swap_arg arg;
- int ret = -EINVAL;
+ if (sched_core_enabled(rq))
+ sched_core_dequeue(rq, p, flags);
- arg = (struct migration_swap_arg){
- .src_task = cur,
- .src_cpu = task_cpu(cur),
- .dst_task = p,
- .dst_cpu = task_cpu(p),
- };
+ if (!(flags & DEQUEUE_NOCLOCK))
+ update_rq_clock(rq);
- if (arg.src_cpu == arg.dst_cpu)
- goto out;
+ if (!(flags & DEQUEUE_SAVE))
+ sched_info_dequeue(rq, p);
+
+ psi_dequeue(p, flags);
/*
- * These three tests are all lockless; this is OK since all of them
- * will be re-checked with proper locks held further down the line.
+ * Must be before ->dequeue_task() because ->dequeue_task() can 'fail'
+ * and mark the task ->sched_delayed.
*/
- if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
- goto out;
+ uclamp_rq_dec(rq, p);
+ rq->queue_mask |= p->sched_class->queue_mask;
+ return p->sched_class->dequeue_task(rq, p, flags);
+}
- if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task)))
- goto out;
+void activate_task(struct rq *rq, struct task_struct *p, int flags)
+{
+ if (task_on_rq_migrating(p))
+ flags |= ENQUEUE_MIGRATED;
- if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task)))
- goto out;
+ enqueue_task(rq, p, flags);
- trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
- ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
+ WRITE_ONCE(p->on_rq, TASK_ON_RQ_QUEUED);
+ ASSERT_EXCLUSIVE_WRITER(p->on_rq);
+}
-out:
- return ret;
+void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
+{
+ WARN_ON_ONCE(flags & DEQUEUE_SLEEP);
+
+ WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING);
+ ASSERT_EXCLUSIVE_WRITER(p->on_rq);
+
+ /*
+ * Code explicitly relies on TASK_ON_RQ_MIGRATING begin set *before*
+ * dequeue_task() and cleared *after* enqueue_task().
+ */
+
+ dequeue_task(rq, p, flags);
}
-struct migration_arg {
- struct task_struct *task;
- int dest_cpu;
-};
+static void block_task(struct rq *rq, struct task_struct *p, int flags)
+{
+ if (dequeue_task(rq, p, DEQUEUE_SLEEP | flags))
+ __block_task(rq, p);
+}
+
+/**
+ * task_curr - is this task currently executing on a CPU?
+ * @p: the task in question.
+ *
+ * Return: 1 if the task is currently executing. 0 otherwise.
+ */
+inline int task_curr(const struct task_struct *p)
+{
+ return cpu_curr(task_cpu(p)) == p;
+}
+
+void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags)
+{
+ struct task_struct *donor = rq->donor;
-static int migration_cpu_stop(void *data);
+ if (p->sched_class == donor->sched_class)
+ donor->sched_class->wakeup_preempt(rq, p, flags);
+ else if (sched_class_above(p->sched_class, donor->sched_class))
+ resched_curr(rq);
+
+ /*
+ * A queue event has occurred, and we're going to schedule. In
+ * this case, we can save a useless back to back clock update.
+ */
+ if (task_on_rq_queued(donor) && test_tsk_need_resched(rq->curr))
+ rq_clock_skip_update(rq);
+}
+
+static __always_inline
+int __task_state_match(struct task_struct *p, unsigned int state)
+{
+ if (READ_ONCE(p->__state) & state)
+ return 1;
+
+ if (READ_ONCE(p->saved_state) & state)
+ return -1;
+
+ return 0;
+}
+
+static __always_inline
+int task_state_match(struct task_struct *p, unsigned int state)
+{
+ /*
+ * Serialize against current_save_and_set_rtlock_wait_state(),
+ * current_restore_rtlock_saved_state(), and __refrigerator().
+ */
+ guard(raw_spinlock_irq)(&p->pi_lock);
+ return __task_state_match(p, state);
+}
/*
* wait_task_inactive - wait for a thread to unschedule.
*
- * If @match_state is nonzero, it's the @p->state value just checked and
- * not expected to change. If it changes, i.e. @p might have woken up,
- * then return zero. When we succeed in waiting for @p to be off its CPU,
- * we return a positive number (its total switch count). If a second call
- * a short while later returns the same number, the caller can be sure that
- * @p has remained unscheduled the whole time.
+ * Wait for the thread to block in any of the states set in @match_state.
+ * If it changes, i.e. @p might have woken up, then return zero. When we
+ * succeed in waiting for @p to be off its CPU, we return a positive number
+ * (its total switch count). If a second call a short while later returns the
+ * same number, the caller can be sure that @p has remained unscheduled the
+ * whole time.
*
* The caller must ensure that the task *will* unschedule sometime soon,
* else this function might spin for a *long* time. This function can't
@@ -1180,10 +2226,10 @@ static int migration_cpu_stop(void *data);
* smp_call_function() if an IPI is sent by the same process we are
* waiting to become inactive.
*/
-unsigned long wait_task_inactive(struct task_struct *p, long match_state)
+unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state)
{
- unsigned long flags;
- int running, queued;
+ int running, queued, match;
+ struct rq_flags rf;
unsigned long ncsw;
struct rq *rq;
@@ -1203,12 +2249,12 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
*
* NOTE! Since we don't hold any locks, it's not
* even sure that "rq" stays as the right runqueue!
- * But we don't care, since "task_running()" will
+ * But we don't care, since "task_on_cpu()" will
* return false if the runqueue has changed and p
* is actually now running somewhere else!
*/
- while (task_running(rq, p)) {
- if (match_state && unlikely(p->state != match_state))
+ while (task_on_cpu(rq, p)) {
+ if (!task_state_match(p, match_state))
return 0;
cpu_relax();
}
@@ -1218,14 +2264,27 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
* lock now, to be *sure*. If we're wrong, we'll
* just go back and repeat.
*/
- rq = task_rq_lock(p, &flags);
+ rq = task_rq_lock(p, &rf);
+ /*
+ * If task is sched_delayed, force dequeue it, to avoid always
+ * hitting the tick timeout in the queued case
+ */
+ if (p->se.sched_delayed)
+ dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
trace_sched_wait_task(p);
- running = task_running(rq, p);
+ running = task_on_cpu(rq, p);
queued = task_on_rq_queued(p);
ncsw = 0;
- if (!match_state || p->state == match_state)
+ if ((match = __task_state_match(p, match_state))) {
+ /*
+ * When matching on p->saved_state, consider this task
+ * still queued so it will wait.
+ */
+ if (match < 0)
+ queued = 1;
ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
- task_rq_unlock(rq, p, &flags);
+ }
+ task_rq_unlock(rq, p, &rf);
/*
* If it changed from the expected state, bail out now.
@@ -1254,10 +2313,10 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
* yield - it could be a while.
*/
if (unlikely(queued)) {
- ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
+ ktime_t to = NSEC_PER_SEC / HZ;
set_current_state(TASK_UNINTERRUPTIBLE);
- schedule_hrtimeout(&to, HRTIMER_MODE_REL);
+ schedule_hrtimeout(&to, HRTIMER_MODE_REL_HARD);
continue;
}
@@ -1272,6 +2331,1050 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
return ncsw;
}
+static void
+do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx);
+
+static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
+{
+ struct affinity_context ac = {
+ .new_mask = cpumask_of(rq->cpu),
+ .flags = SCA_MIGRATE_DISABLE,
+ };
+
+ if (likely(!p->migration_disabled))
+ return;
+
+ if (p->cpus_ptr != &p->cpus_mask)
+ return;
+
+ scoped_guard (task_rq_lock, p)
+ do_set_cpus_allowed(p, &ac);
+}
+
+void ___migrate_enable(void)
+{
+ struct task_struct *p = current;
+ struct affinity_context ac = {
+ .new_mask = &p->cpus_mask,
+ .flags = SCA_MIGRATE_ENABLE,
+ };
+
+ __set_cpus_allowed_ptr(p, &ac);
+}
+EXPORT_SYMBOL_GPL(___migrate_enable);
+
+void migrate_disable(void)
+{
+ __migrate_disable();
+}
+EXPORT_SYMBOL_GPL(migrate_disable);
+
+void migrate_enable(void)
+{
+ __migrate_enable();
+}
+EXPORT_SYMBOL_GPL(migrate_enable);
+
+static inline bool rq_has_pinned_tasks(struct rq *rq)
+{
+ return rq->nr_pinned;
+}
+
+/*
+ * Per-CPU kthreads are allowed to run on !active && online CPUs, see
+ * __set_cpus_allowed_ptr() and select_fallback_rq().
+ */
+static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
+{
+ /* When not in the task's cpumask, no point in looking further. */
+ if (!task_allowed_on_cpu(p, cpu))
+ return false;
+
+ /* migrate_disabled() must be allowed to finish. */
+ if (is_migration_disabled(p))
+ return cpu_online(cpu);
+
+ /* Non kernel threads are not allowed during either online or offline. */
+ if (!(p->flags & PF_KTHREAD))
+ return cpu_active(cpu);
+
+ /* KTHREAD_IS_PER_CPU is always allowed. */
+ if (kthread_is_per_cpu(p))
+ return cpu_online(cpu);
+
+ /* Regular kernel threads don't get to stay during offline. */
+ if (cpu_dying(cpu))
+ return false;
+
+ /* But are allowed during online. */
+ return cpu_online(cpu);
+}
+
+/*
+ * This is how migration works:
+ *
+ * 1) we invoke migration_cpu_stop() on the target CPU using
+ * stop_one_cpu().
+ * 2) stopper starts to run (implicitly forcing the migrated thread
+ * off the CPU)
+ * 3) it checks whether the migrated task is still in the wrong runqueue.
+ * 4) if it's in the wrong runqueue then the migration thread removes
+ * it and puts it into the right queue.
+ * 5) stopper completes and stop_one_cpu() returns and the migration
+ * is done.
+ */
+
+/*
+ * move_queued_task - move a queued task to new rq.
+ *
+ * Returns (locked) new rq. Old rq's lock is released.
+ */
+static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
+ struct task_struct *p, int new_cpu)
+{
+ lockdep_assert_rq_held(rq);
+
+ deactivate_task(rq, p, DEQUEUE_NOCLOCK);
+ set_task_cpu(p, new_cpu);
+ rq_unlock(rq, rf);
+
+ rq = cpu_rq(new_cpu);
+
+ rq_lock(rq, rf);
+ WARN_ON_ONCE(task_cpu(p) != new_cpu);
+ activate_task(rq, p, 0);
+ wakeup_preempt(rq, p, 0);
+
+ return rq;
+}
+
+struct migration_arg {
+ struct task_struct *task;
+ int dest_cpu;
+ struct set_affinity_pending *pending;
+};
+
+/*
+ * @refs: number of wait_for_completion()
+ * @stop_pending: is @stop_work in use
+ */
+struct set_affinity_pending {
+ refcount_t refs;
+ unsigned int stop_pending;
+ struct completion done;
+ struct cpu_stop_work stop_work;
+ struct migration_arg arg;
+};
+
+/*
+ * Move (not current) task off this CPU, onto the destination CPU. We're doing
+ * this because either it can't run here any more (set_cpus_allowed()
+ * away from this CPU, or CPU going down), or because we're
+ * attempting to rebalance this task on exec (sched_exec).
+ *
+ * So we race with normal scheduler movements, but that's OK, as long
+ * as the task is no longer on this CPU.
+ */
+static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
+ struct task_struct *p, int dest_cpu)
+{
+ /* Affinity changed (again). */
+ if (!is_cpu_allowed(p, dest_cpu))
+ return rq;
+
+ rq = move_queued_task(rq, rf, p, dest_cpu);
+
+ return rq;
+}
+
+/*
+ * migration_cpu_stop - this will be executed by a high-prio stopper thread
+ * and performs thread migration by bumping thread off CPU then
+ * 'pushing' onto another runqueue.
+ */
+static int migration_cpu_stop(void *data)
+{
+ struct migration_arg *arg = data;
+ struct set_affinity_pending *pending = arg->pending;
+ struct task_struct *p = arg->task;
+ struct rq *rq = this_rq();
+ bool complete = false;
+ struct rq_flags rf;
+
+ /*
+ * The original target CPU might have gone down and we might
+ * be on another CPU but it doesn't matter.
+ */
+ local_irq_save(rf.flags);
+ /*
+ * We need to explicitly wake pending tasks before running
+ * __migrate_task() such that we will not miss enforcing cpus_ptr
+ * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
+ */
+ flush_smp_call_function_queue();
+
+ raw_spin_lock(&p->pi_lock);
+ rq_lock(rq, &rf);
+
+ /*
+ * If we were passed a pending, then ->stop_pending was set, thus
+ * p->migration_pending must have remained stable.
+ */
+ WARN_ON_ONCE(pending && pending != p->migration_pending);
+
+ /*
+ * If task_rq(p) != rq, it cannot be migrated here, because we're
+ * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
+ * we're holding p->pi_lock.
+ */
+ if (task_rq(p) == rq) {
+ if (is_migration_disabled(p))
+ goto out;
+
+ if (pending) {
+ p->migration_pending = NULL;
+ complete = true;
+
+ if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask))
+ goto out;
+ }
+
+ if (task_on_rq_queued(p)) {
+ update_rq_clock(rq);
+ rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
+ } else {
+ p->wake_cpu = arg->dest_cpu;
+ }
+
+ /*
+ * XXX __migrate_task() can fail, at which point we might end
+ * up running on a dodgy CPU, AFAICT this can only happen
+ * during CPU hotplug, at which point we'll get pushed out
+ * anyway, so it's probably not a big deal.
+ */
+
+ } else if (pending) {
+ /*
+ * This happens when we get migrated between migrate_enable()'s
+ * preempt_enable() and scheduling the stopper task. At that
+ * point we're a regular task again and not current anymore.
+ *
+ * A !PREEMPT kernel has a giant hole here, which makes it far
+ * more likely.
+ */
+
+ /*
+ * The task moved before the stopper got to run. We're holding
+ * ->pi_lock, so the allowed mask is stable - if it got
+ * somewhere allowed, we're done.
+ */
+ if (cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) {
+ p->migration_pending = NULL;
+ complete = true;
+ goto out;
+ }
+
+ /*
+ * When migrate_enable() hits a rq mis-match we can't reliably
+ * determine is_migration_disabled() and so have to chase after
+ * it.
+ */
+ WARN_ON_ONCE(!pending->stop_pending);
+ preempt_disable();
+ rq_unlock(rq, &rf);
+ raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
+ stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
+ &pending->arg, &pending->stop_work);
+ preempt_enable();
+ return 0;
+ }
+out:
+ if (pending)
+ pending->stop_pending = false;
+ rq_unlock(rq, &rf);
+ raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
+
+ if (complete)
+ complete_all(&pending->done);
+
+ return 0;
+}
+
+int push_cpu_stop(void *arg)
+{
+ struct rq *lowest_rq = NULL, *rq = this_rq();
+ struct task_struct *p = arg;
+
+ raw_spin_lock_irq(&p->pi_lock);
+ raw_spin_rq_lock(rq);
+
+ if (task_rq(p) != rq)
+ goto out_unlock;
+
+ if (is_migration_disabled(p)) {
+ p->migration_flags |= MDF_PUSH;
+ goto out_unlock;
+ }
+
+ p->migration_flags &= ~MDF_PUSH;
+
+ if (p->sched_class->find_lock_rq)
+ lowest_rq = p->sched_class->find_lock_rq(p, rq);
+
+ if (!lowest_rq)
+ goto out_unlock;
+
+ // XXX validate p is still the highest prio task
+ if (task_rq(p) == rq) {
+ move_queued_task_locked(rq, lowest_rq, p);
+ resched_curr(lowest_rq);
+ }
+
+ double_unlock_balance(rq, lowest_rq);
+
+out_unlock:
+ rq->push_busy = false;
+ raw_spin_rq_unlock(rq);
+ raw_spin_unlock_irq(&p->pi_lock);
+
+ put_task_struct(p);
+ return 0;
+}
+
+static inline void mm_update_cpus_allowed(struct mm_struct *mm, const cpumask_t *affmask);
+
+/*
+ * sched_class::set_cpus_allowed must do the below, but is not required to
+ * actually call this function.
+ */
+void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx)
+{
+ if (ctx->flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) {
+ p->cpus_ptr = ctx->new_mask;
+ return;
+ }
+
+ cpumask_copy(&p->cpus_mask, ctx->new_mask);
+ p->nr_cpus_allowed = cpumask_weight(ctx->new_mask);
+ mm_update_cpus_allowed(p->mm, ctx->new_mask);
+
+ /*
+ * Swap in a new user_cpus_ptr if SCA_USER flag set
+ */
+ if (ctx->flags & SCA_USER)
+ swap(p->user_cpus_ptr, ctx->user_mask);
+}
+
+static void
+do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx)
+{
+ scoped_guard (sched_change, p, DEQUEUE_SAVE)
+ p->sched_class->set_cpus_allowed(p, ctx);
+}
+
+/*
+ * Used for kthread_bind() and select_fallback_rq(), in both cases the user
+ * affinity (if any) should be destroyed too.
+ */
+void set_cpus_allowed_force(struct task_struct *p, const struct cpumask *new_mask)
+{
+ struct affinity_context ac = {
+ .new_mask = new_mask,
+ .user_mask = NULL,
+ .flags = SCA_USER, /* clear the user requested mask */
+ };
+ union cpumask_rcuhead {
+ cpumask_t cpumask;
+ struct rcu_head rcu;
+ };
+
+ scoped_guard (__task_rq_lock, p)
+ do_set_cpus_allowed(p, &ac);
+
+ /*
+ * Because this is called with p->pi_lock held, it is not possible
+ * to use kfree() here (when PREEMPT_RT=y), therefore punt to using
+ * kfree_rcu().
+ */
+ kfree_rcu((union cpumask_rcuhead *)ac.user_mask, rcu);
+}
+
+int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src,
+ int node)
+{
+ cpumask_t *user_mask;
+ unsigned long flags;
+
+ /*
+ * Always clear dst->user_cpus_ptr first as their user_cpus_ptr's
+ * may differ by now due to racing.
+ */
+ dst->user_cpus_ptr = NULL;
+
+ /*
+ * This check is racy and losing the race is a valid situation.
+ * It is not worth the extra overhead of taking the pi_lock on
+ * every fork/clone.
+ */
+ if (data_race(!src->user_cpus_ptr))
+ return 0;
+
+ user_mask = alloc_user_cpus_ptr(node);
+ if (!user_mask)
+ return -ENOMEM;
+
+ /*
+ * Use pi_lock to protect content of user_cpus_ptr
+ *
+ * Though unlikely, user_cpus_ptr can be reset to NULL by a concurrent
+ * set_cpus_allowed_force().
+ */
+ raw_spin_lock_irqsave(&src->pi_lock, flags);
+ if (src->user_cpus_ptr) {
+ swap(dst->user_cpus_ptr, user_mask);
+ cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr);
+ }
+ raw_spin_unlock_irqrestore(&src->pi_lock, flags);
+
+ if (unlikely(user_mask))
+ kfree(user_mask);
+
+ return 0;
+}
+
+static inline struct cpumask *clear_user_cpus_ptr(struct task_struct *p)
+{
+ struct cpumask *user_mask = NULL;
+
+ swap(p->user_cpus_ptr, user_mask);
+
+ return user_mask;
+}
+
+void release_user_cpus_ptr(struct task_struct *p)
+{
+ kfree(clear_user_cpus_ptr(p));
+}
+
+/*
+ * This function is wildly self concurrent; here be dragons.
+ *
+ *
+ * When given a valid mask, __set_cpus_allowed_ptr() must block until the
+ * designated task is enqueued on an allowed CPU. If that task is currently
+ * running, we have to kick it out using the CPU stopper.
+ *
+ * Migrate-Disable comes along and tramples all over our nice sandcastle.
+ * Consider:
+ *
+ * Initial conditions: P0->cpus_mask = [0, 1]
+ *
+ * P0@CPU0 P1
+ *
+ * migrate_disable();
+ * <preempted>
+ * set_cpus_allowed_ptr(P0, [1]);
+ *
+ * P1 *cannot* return from this set_cpus_allowed_ptr() call until P0 executes
+ * its outermost migrate_enable() (i.e. it exits its Migrate-Disable region).
+ * This means we need the following scheme:
+ *
+ * P0@CPU0 P1
+ *
+ * migrate_disable();
+ * <preempted>
+ * set_cpus_allowed_ptr(P0, [1]);
+ * <blocks>
+ * <resumes>
+ * migrate_enable();
+ * __set_cpus_allowed_ptr();
+ * <wakes local stopper>
+ * `--> <woken on migration completion>
+ *
+ * Now the fun stuff: there may be several P1-like tasks, i.e. multiple
+ * concurrent set_cpus_allowed_ptr(P0, [*]) calls. CPU affinity changes of any
+ * task p are serialized by p->pi_lock, which we can leverage: the one that
+ * should come into effect at the end of the Migrate-Disable region is the last
+ * one. This means we only need to track a single cpumask (i.e. p->cpus_mask),
+ * but we still need to properly signal those waiting tasks at the appropriate
+ * moment.
+ *
+ * This is implemented using struct set_affinity_pending. The first
+ * __set_cpus_allowed_ptr() caller within a given Migrate-Disable region will
+ * setup an instance of that struct and install it on the targeted task_struct.
+ * Any and all further callers will reuse that instance. Those then wait for
+ * a completion signaled at the tail of the CPU stopper callback (1), triggered
+ * on the end of the Migrate-Disable region (i.e. outermost migrate_enable()).
+ *
+ *
+ * (1) In the cases covered above. There is one more where the completion is
+ * signaled within affine_move_task() itself: when a subsequent affinity request
+ * occurs after the stopper bailed out due to the targeted task still being
+ * Migrate-Disable. Consider:
+ *
+ * Initial conditions: P0->cpus_mask = [0, 1]
+ *
+ * CPU0 P1 P2
+ * <P0>
+ * migrate_disable();
+ * <preempted>
+ * set_cpus_allowed_ptr(P0, [1]);
+ * <blocks>
+ * <migration/0>
+ * migration_cpu_stop()
+ * is_migration_disabled()
+ * <bails>
+ * set_cpus_allowed_ptr(P0, [0, 1]);
+ * <signal completion>
+ * <awakes>
+ *
+ * Note that the above is safe vs a concurrent migrate_enable(), as any
+ * pending affinity completion is preceded by an uninstallation of
+ * p->migration_pending done with p->pi_lock held.
+ */
+static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf,
+ int dest_cpu, unsigned int flags)
+ __releases(rq->lock)
+ __releases(p->pi_lock)
+{
+ struct set_affinity_pending my_pending = { }, *pending = NULL;
+ bool stop_pending, complete = false;
+
+ /*
+ * Can the task run on the task's current CPU? If so, we're done
+ *
+ * We are also done if the task is the current donor, boosting a lock-
+ * holding proxy, (and potentially has been migrated outside its
+ * current or previous affinity mask)
+ */
+ if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask) ||
+ (task_current_donor(rq, p) && !task_current(rq, p))) {
+ struct task_struct *push_task = NULL;
+
+ if ((flags & SCA_MIGRATE_ENABLE) &&
+ (p->migration_flags & MDF_PUSH) && !rq->push_busy) {
+ rq->push_busy = true;
+ push_task = get_task_struct(p);
+ }
+
+ /*
+ * If there are pending waiters, but no pending stop_work,
+ * then complete now.
+ */
+ pending = p->migration_pending;
+ if (pending && !pending->stop_pending) {
+ p->migration_pending = NULL;
+ complete = true;
+ }
+
+ preempt_disable();
+ task_rq_unlock(rq, p, rf);
+ if (push_task) {
+ stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
+ p, &rq->push_work);
+ }
+ preempt_enable();
+
+ if (complete)
+ complete_all(&pending->done);
+
+ return 0;
+ }
+
+ if (!(flags & SCA_MIGRATE_ENABLE)) {
+ /* serialized by p->pi_lock */
+ if (!p->migration_pending) {
+ /* Install the request */
+ refcount_set(&my_pending.refs, 1);
+ init_completion(&my_pending.done);
+ my_pending.arg = (struct migration_arg) {
+ .task = p,
+ .dest_cpu = dest_cpu,
+ .pending = &my_pending,
+ };
+
+ p->migration_pending = &my_pending;
+ } else {
+ pending = p->migration_pending;
+ refcount_inc(&pending->refs);
+ /*
+ * Affinity has changed, but we've already installed a
+ * pending. migration_cpu_stop() *must* see this, else
+ * we risk a completion of the pending despite having a
+ * task on a disallowed CPU.
+ *
+ * Serialized by p->pi_lock, so this is safe.
+ */
+ pending->arg.dest_cpu = dest_cpu;
+ }
+ }
+ pending = p->migration_pending;
+ /*
+ * - !MIGRATE_ENABLE:
+ * we'll have installed a pending if there wasn't one already.
+ *
+ * - MIGRATE_ENABLE:
+ * we're here because the current CPU isn't matching anymore,
+ * the only way that can happen is because of a concurrent
+ * set_cpus_allowed_ptr() call, which should then still be
+ * pending completion.
+ *
+ * Either way, we really should have a @pending here.
+ */
+ if (WARN_ON_ONCE(!pending)) {
+ task_rq_unlock(rq, p, rf);
+ return -EINVAL;
+ }
+
+ if (task_on_cpu(rq, p) || READ_ONCE(p->__state) == TASK_WAKING) {
+ /*
+ * MIGRATE_ENABLE gets here because 'p == current', but for
+ * anything else we cannot do is_migration_disabled(), punt
+ * and have the stopper function handle it all race-free.
+ */
+ stop_pending = pending->stop_pending;
+ if (!stop_pending)
+ pending->stop_pending = true;
+
+ if (flags & SCA_MIGRATE_ENABLE)
+ p->migration_flags &= ~MDF_PUSH;
+
+ preempt_disable();
+ task_rq_unlock(rq, p, rf);
+ if (!stop_pending) {
+ stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
+ &pending->arg, &pending->stop_work);
+ }
+ preempt_enable();
+
+ if (flags & SCA_MIGRATE_ENABLE)
+ return 0;
+ } else {
+
+ if (!is_migration_disabled(p)) {
+ if (task_on_rq_queued(p))
+ rq = move_queued_task(rq, rf, p, dest_cpu);
+
+ if (!pending->stop_pending) {
+ p->migration_pending = NULL;
+ complete = true;
+ }
+ }
+ task_rq_unlock(rq, p, rf);
+
+ if (complete)
+ complete_all(&pending->done);
+ }
+
+ wait_for_completion(&pending->done);
+
+ if (refcount_dec_and_test(&pending->refs))
+ wake_up_var(&pending->refs); /* No UaF, just an address */
+
+ /*
+ * Block the original owner of &pending until all subsequent callers
+ * have seen the completion and decremented the refcount
+ */
+ wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs));
+
+ /* ARGH */
+ WARN_ON_ONCE(my_pending.stop_pending);
+
+ return 0;
+}
+
+/*
+ * Called with both p->pi_lock and rq->lock held; drops both before returning.
+ */
+static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
+ struct affinity_context *ctx,
+ struct rq *rq,
+ struct rq_flags *rf)
+ __releases(rq->lock)
+ __releases(p->pi_lock)
+{
+ const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p);
+ const struct cpumask *cpu_valid_mask = cpu_active_mask;
+ bool kthread = p->flags & PF_KTHREAD;
+ unsigned int dest_cpu;
+ int ret = 0;
+
+ if (kthread || is_migration_disabled(p)) {
+ /*
+ * Kernel threads are allowed on online && !active CPUs,
+ * however, during cpu-hot-unplug, even these might get pushed
+ * away if not KTHREAD_IS_PER_CPU.
+ *
+ * Specifically, migration_disabled() tasks must not fail the
+ * cpumask_any_and_distribute() pick below, esp. so on
+ * SCA_MIGRATE_ENABLE, otherwise we'll not call
+ * set_cpus_allowed_common() and actually reset p->cpus_ptr.
+ */
+ cpu_valid_mask = cpu_online_mask;
+ }
+
+ if (!kthread && !cpumask_subset(ctx->new_mask, cpu_allowed_mask)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /*
+ * Must re-check here, to close a race against __kthread_bind(),
+ * sched_setaffinity() is not guaranteed to observe the flag.
+ */
+ if ((ctx->flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (!(ctx->flags & SCA_MIGRATE_ENABLE)) {
+ if (cpumask_equal(&p->cpus_mask, ctx->new_mask)) {
+ if (ctx->flags & SCA_USER)
+ swap(p->user_cpus_ptr, ctx->user_mask);
+ goto out;
+ }
+
+ if (WARN_ON_ONCE(p == current &&
+ is_migration_disabled(p) &&
+ !cpumask_test_cpu(task_cpu(p), ctx->new_mask))) {
+ ret = -EBUSY;
+ goto out;
+ }
+ }
+
+ /*
+ * Picking a ~random cpu helps in cases where we are changing affinity
+ * for groups of tasks (ie. cpuset), so that load balancing is not
+ * immediately required to distribute the tasks within their new mask.
+ */
+ dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, ctx->new_mask);
+ if (dest_cpu >= nr_cpu_ids) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ do_set_cpus_allowed(p, ctx);
+
+ return affine_move_task(rq, p, rf, dest_cpu, ctx->flags);
+
+out:
+ task_rq_unlock(rq, p, rf);
+
+ return ret;
+}
+
+/*
+ * Change a given task's CPU affinity. Migrate the thread to a
+ * proper CPU and schedule it away if the CPU it's executing on
+ * is removed from the allowed bitmask.
+ *
+ * NOTE: the caller must have a valid reference to the task, the
+ * task must not exit() & deallocate itself prematurely. The
+ * call is not atomic; no spinlocks may be held.
+ */
+int __set_cpus_allowed_ptr(struct task_struct *p, struct affinity_context *ctx)
+{
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = task_rq_lock(p, &rf);
+ /*
+ * Masking should be skipped if SCA_USER or any of the SCA_MIGRATE_*
+ * flags are set.
+ */
+ if (p->user_cpus_ptr &&
+ !(ctx->flags & (SCA_USER | SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) &&
+ cpumask_and(rq->scratch_mask, ctx->new_mask, p->user_cpus_ptr))
+ ctx->new_mask = rq->scratch_mask;
+
+ return __set_cpus_allowed_ptr_locked(p, ctx, rq, &rf);
+}
+
+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
+{
+ struct affinity_context ac = {
+ .new_mask = new_mask,
+ .flags = 0,
+ };
+
+ return __set_cpus_allowed_ptr(p, &ac);
+}
+EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
+
+/*
+ * Change a given task's CPU affinity to the intersection of its current
+ * affinity mask and @subset_mask, writing the resulting mask to @new_mask.
+ * If user_cpus_ptr is defined, use it as the basis for restricting CPU
+ * affinity or use cpu_online_mask instead.
+ *
+ * If the resulting mask is empty, leave the affinity unchanged and return
+ * -EINVAL.
+ */
+static int restrict_cpus_allowed_ptr(struct task_struct *p,
+ struct cpumask *new_mask,
+ const struct cpumask *subset_mask)
+{
+ struct affinity_context ac = {
+ .new_mask = new_mask,
+ .flags = 0,
+ };
+ struct rq_flags rf;
+ struct rq *rq;
+ int err;
+
+ rq = task_rq_lock(p, &rf);
+
+ /*
+ * Forcefully restricting the affinity of a deadline task is
+ * likely to cause problems, so fail and noisily override the
+ * mask entirely.
+ */
+ if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
+ err = -EPERM;
+ goto err_unlock;
+ }
+
+ if (!cpumask_and(new_mask, task_user_cpus(p), subset_mask)) {
+ err = -EINVAL;
+ goto err_unlock;
+ }
+
+ return __set_cpus_allowed_ptr_locked(p, &ac, rq, &rf);
+
+err_unlock:
+ task_rq_unlock(rq, p, &rf);
+ return err;
+}
+
+/*
+ * Restrict the CPU affinity of task @p so that it is a subset of
+ * task_cpu_possible_mask() and point @p->user_cpus_ptr to a copy of the
+ * old affinity mask. If the resulting mask is empty, we warn and walk
+ * up the cpuset hierarchy until we find a suitable mask.
+ */
+void force_compatible_cpus_allowed_ptr(struct task_struct *p)
+{
+ cpumask_var_t new_mask;
+ const struct cpumask *override_mask = task_cpu_possible_mask(p);
+
+ alloc_cpumask_var(&new_mask, GFP_KERNEL);
+
+ /*
+ * __migrate_task() can fail silently in the face of concurrent
+ * offlining of the chosen destination CPU, so take the hotplug
+ * lock to ensure that the migration succeeds.
+ */
+ cpus_read_lock();
+ if (!cpumask_available(new_mask))
+ goto out_set_mask;
+
+ if (!restrict_cpus_allowed_ptr(p, new_mask, override_mask))
+ goto out_free_mask;
+
+ /*
+ * We failed to find a valid subset of the affinity mask for the
+ * task, so override it based on its cpuset hierarchy.
+ */
+ cpuset_cpus_allowed(p, new_mask);
+ override_mask = new_mask;
+
+out_set_mask:
+ if (printk_ratelimit()) {
+ printk_deferred("Overriding affinity for process %d (%s) to CPUs %*pbl\n",
+ task_pid_nr(p), p->comm,
+ cpumask_pr_args(override_mask));
+ }
+
+ WARN_ON(set_cpus_allowed_ptr(p, override_mask));
+out_free_mask:
+ cpus_read_unlock();
+ free_cpumask_var(new_mask);
+}
+
+/*
+ * Restore the affinity of a task @p which was previously restricted by a
+ * call to force_compatible_cpus_allowed_ptr().
+ *
+ * It is the caller's responsibility to serialise this with any calls to
+ * force_compatible_cpus_allowed_ptr(@p).
+ */
+void relax_compatible_cpus_allowed_ptr(struct task_struct *p)
+{
+ struct affinity_context ac = {
+ .new_mask = task_user_cpus(p),
+ .flags = 0,
+ };
+ int ret;
+
+ /*
+ * Try to restore the old affinity mask with __sched_setaffinity().
+ * Cpuset masking will be done there too.
+ */
+ ret = __sched_setaffinity(p, &ac);
+ WARN_ON_ONCE(ret);
+}
+
+#ifdef CONFIG_SMP
+
+void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
+{
+ unsigned int state = READ_ONCE(p->__state);
+
+ /*
+ * We should never call set_task_cpu() on a blocked task,
+ * ttwu() will sort out the placement.
+ */
+ WARN_ON_ONCE(state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq);
+
+ /*
+ * Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING,
+ * because schedstat_wait_{start,end} rebase migrating task's wait_start
+ * time relying on p->on_rq.
+ */
+ WARN_ON_ONCE(state == TASK_RUNNING &&
+ p->sched_class == &fair_sched_class &&
+ (p->on_rq && !task_on_rq_migrating(p)));
+
+#ifdef CONFIG_LOCKDEP
+ /*
+ * The caller should hold either p->pi_lock or rq->lock, when changing
+ * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
+ *
+ * sched_move_task() holds both and thus holding either pins the cgroup,
+ * see task_group().
+ *
+ * Furthermore, all task_rq users should acquire both locks, see
+ * task_rq_lock().
+ */
+ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
+ lockdep_is_held(__rq_lockp(task_rq(p)))));
+#endif
+ /*
+ * Clearly, migrating tasks to offline CPUs is a fairly daft thing.
+ */
+ WARN_ON_ONCE(!cpu_online(new_cpu));
+
+ WARN_ON_ONCE(is_migration_disabled(p));
+
+ trace_sched_migrate_task(p, new_cpu);
+
+ if (task_cpu(p) != new_cpu) {
+ if (p->sched_class->migrate_task_rq)
+ p->sched_class->migrate_task_rq(p, new_cpu);
+ p->se.nr_migrations++;
+ perf_event_task_migrate(p);
+ }
+
+ __set_task_cpu(p, new_cpu);
+}
+#endif /* CONFIG_SMP */
+
+#ifdef CONFIG_NUMA_BALANCING
+static void __migrate_swap_task(struct task_struct *p, int cpu)
+{
+ if (task_on_rq_queued(p)) {
+ struct rq *src_rq, *dst_rq;
+ struct rq_flags srf, drf;
+
+ src_rq = task_rq(p);
+ dst_rq = cpu_rq(cpu);
+
+ rq_pin_lock(src_rq, &srf);
+ rq_pin_lock(dst_rq, &drf);
+
+ move_queued_task_locked(src_rq, dst_rq, p);
+ wakeup_preempt(dst_rq, p, 0);
+
+ rq_unpin_lock(dst_rq, &drf);
+ rq_unpin_lock(src_rq, &srf);
+
+ } else {
+ /*
+ * Task isn't running anymore; make it appear like we migrated
+ * it before it went to sleep. This means on wakeup we make the
+ * previous CPU our target instead of where it really is.
+ */
+ p->wake_cpu = cpu;
+ }
+}
+
+struct migration_swap_arg {
+ struct task_struct *src_task, *dst_task;
+ int src_cpu, dst_cpu;
+};
+
+static int migrate_swap_stop(void *data)
+{
+ struct migration_swap_arg *arg = data;
+ struct rq *src_rq, *dst_rq;
+
+ if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
+ return -EAGAIN;
+
+ src_rq = cpu_rq(arg->src_cpu);
+ dst_rq = cpu_rq(arg->dst_cpu);
+
+ guard(double_raw_spinlock)(&arg->src_task->pi_lock, &arg->dst_task->pi_lock);
+ guard(double_rq_lock)(src_rq, dst_rq);
+
+ if (task_cpu(arg->dst_task) != arg->dst_cpu)
+ return -EAGAIN;
+
+ if (task_cpu(arg->src_task) != arg->src_cpu)
+ return -EAGAIN;
+
+ if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr))
+ return -EAGAIN;
+
+ if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr))
+ return -EAGAIN;
+
+ __migrate_swap_task(arg->src_task, arg->dst_cpu);
+ __migrate_swap_task(arg->dst_task, arg->src_cpu);
+
+ return 0;
+}
+
+/*
+ * Cross migrate two tasks
+ */
+int migrate_swap(struct task_struct *cur, struct task_struct *p,
+ int target_cpu, int curr_cpu)
+{
+ struct migration_swap_arg arg;
+ int ret = -EINVAL;
+
+ arg = (struct migration_swap_arg){
+ .src_task = cur,
+ .src_cpu = curr_cpu,
+ .dst_task = p,
+ .dst_cpu = target_cpu,
+ };
+
+ if (arg.src_cpu == arg.dst_cpu)
+ goto out;
+
+ /*
+ * These three tests are all lockless; this is OK since all of them
+ * will be re-checked with proper locks held further down the line.
+ */
+ if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
+ goto out;
+
+ if (!cpumask_test_cpu(arg.dst_cpu, arg.src_task->cpus_ptr))
+ goto out;
+
+ if (!cpumask_test_cpu(arg.src_cpu, arg.dst_task->cpus_ptr))
+ goto out;
+
+ trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
+ ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
+
+out:
+ return ret;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
/***
* kick_process - kick a running thread to enter/exit the kernel
* @p: the to-be-kicked thread
@@ -1287,20 +3390,35 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
*/
void kick_process(struct task_struct *p)
{
- int cpu;
+ guard(preempt)();
+ int cpu = task_cpu(p);
- preempt_disable();
- cpu = task_cpu(p);
if ((cpu != smp_processor_id()) && task_curr(p))
smp_send_reschedule(cpu);
- preempt_enable();
}
EXPORT_SYMBOL_GPL(kick_process);
-#endif /* CONFIG_SMP */
-#ifdef CONFIG_SMP
/*
- * ->cpus_allowed is protected by both rq->lock and p->pi_lock
+ * ->cpus_ptr is protected by both rq->lock and p->pi_lock
+ *
+ * A few notes on cpu_active vs cpu_online:
+ *
+ * - cpu_active must be a subset of cpu_online
+ *
+ * - on CPU-up we allow per-CPU kthreads on the online && !active CPU,
+ * see __set_cpus_allowed_ptr(). At this point the newly online
+ * CPU isn't yet part of the sched domains, and balancing will not
+ * see it.
+ *
+ * - on CPU-down we clear cpu_active() to mask the sched domains and
+ * avoid the load balancer to place new tasks on the to be removed
+ * CPU. Existing tasks will remain running there and will be taken
+ * off.
+ *
+ * This means that fallback selection must not select !active CPUs.
+ * And can assume that any active CPU must be online. Conversely
+ * select_task_rq() below may allow selection of !active CPUs in order
+ * to satisfy the above rules.
*/
static int select_fallback_rq(int cpu, struct task_struct *p)
{
@@ -1310,46 +3428,41 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
int dest_cpu;
/*
- * If the node that the cpu is on has been offlined, cpu_to_node()
- * will return -1. There is no cpu on the node, and we should
- * select the cpu on the other node.
+ * If the node that the CPU is on has been offlined, cpu_to_node()
+ * will return -1. There is no CPU on the node, and we should
+ * select the CPU on the other node.
*/
if (nid != -1) {
nodemask = cpumask_of_node(nid);
/* Look for allowed, online CPU in same node. */
for_each_cpu(dest_cpu, nodemask) {
- if (!cpu_online(dest_cpu))
- continue;
- if (!cpu_active(dest_cpu))
- continue;
- if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
+ if (is_cpu_allowed(p, dest_cpu))
return dest_cpu;
}
}
for (;;) {
/* Any allowed, online CPU? */
- for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
- if (!cpu_online(dest_cpu))
- continue;
- if (!cpu_active(dest_cpu))
+ for_each_cpu(dest_cpu, p->cpus_ptr) {
+ if (!is_cpu_allowed(p, dest_cpu))
continue;
+
goto out;
}
+ /* No more Mr. Nice Guy. */
switch (state) {
case cpuset:
- /* No more Mr. Nice Guy. */
- cpuset_cpus_allowed_fallback(p);
- state = possible;
- break;
-
+ if (cpuset_cpus_allowed_fallback(p)) {
+ state = possible;
+ break;
+ }
+ fallthrough;
case possible:
- do_set_cpus_allowed(p, cpu_possible_mask);
+ set_cpus_allowed_force(p, task_cpu_fallback_mask(p));
state = fail;
break;
-
case fail:
BUG();
break;
@@ -1373,101 +3486,162 @@ out:
}
/*
- * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
+ * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable.
*/
static inline
-int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
+int select_task_rq(struct task_struct *p, int cpu, int *wake_flags)
{
- if (p->nr_cpus_allowed > 1)
- cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
+ lockdep_assert_held(&p->pi_lock);
+
+ if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p)) {
+ cpu = p->sched_class->select_task_rq(p, cpu, *wake_flags);
+ *wake_flags |= WF_RQ_SELECTED;
+ } else {
+ cpu = cpumask_any(p->cpus_ptr);
+ }
/*
* In order not to call set_task_cpu() on a blocking task we need
- * to rely on ttwu() to place the task on a valid ->cpus_allowed
- * cpu.
+ * to rely on ttwu() to place the task on a valid ->cpus_ptr
+ * CPU.
*
* Since this is common to all placement strategies, this lives here.
*
* [ this allows ->select_task() to simply return task_cpu(p) and
* not worry about this generic constraint ]
*/
- if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
- !cpu_online(cpu)))
+ if (unlikely(!is_cpu_allowed(p, cpu)))
cpu = select_fallback_rq(task_cpu(p), p);
return cpu;
}
-static void update_avg(u64 *avg, u64 sample)
+void sched_set_stop_task(int cpu, struct task_struct *stop)
{
- s64 diff = sample - *avg;
- *avg += diff >> 3;
+ static struct lock_class_key stop_pi_lock;
+ struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
+ struct task_struct *old_stop = cpu_rq(cpu)->stop;
+
+ if (stop) {
+ /*
+ * Make it appear like a SCHED_FIFO task, its something
+ * userspace knows about and won't get confused about.
+ *
+ * Also, it will make PI more or less work without too
+ * much confusion -- but then, stop work should not
+ * rely on PI working anyway.
+ */
+ sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
+
+ stop->sched_class = &stop_sched_class;
+
+ /*
+ * The PI code calls rt_mutex_setprio() with ->pi_lock held to
+ * adjust the effective priority of a task. As a result,
+ * rt_mutex_setprio() can trigger (RT) balancing operations,
+ * which can then trigger wakeups of the stop thread to push
+ * around the current task.
+ *
+ * The stop task itself will never be part of the PI-chain, it
+ * never blocks, therefore that ->pi_lock recursion is safe.
+ * Tell lockdep about this by placing the stop->pi_lock in its
+ * own class.
+ */
+ lockdep_set_class(&stop->pi_lock, &stop_pi_lock);
+ }
+
+ cpu_rq(cpu)->stop = stop;
+
+ if (old_stop) {
+ /*
+ * Reset it back to a normal scheduling class so that
+ * it can die in pieces.
+ */
+ old_stop->sched_class = &rt_sched_class;
+ }
}
-#endif
static void
ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
{
-#ifdef CONFIG_SCHEDSTATS
- struct rq *rq = this_rq();
+ struct rq *rq;
-#ifdef CONFIG_SMP
- int this_cpu = smp_processor_id();
+ if (!schedstat_enabled())
+ return;
+
+ rq = this_rq();
- if (cpu == this_cpu) {
- schedstat_inc(rq, ttwu_local);
- schedstat_inc(p, se.statistics.nr_wakeups_local);
+ if (cpu == rq->cpu) {
+ __schedstat_inc(rq->ttwu_local);
+ __schedstat_inc(p->stats.nr_wakeups_local);
} else {
struct sched_domain *sd;
- schedstat_inc(p, se.statistics.nr_wakeups_remote);
- rcu_read_lock();
- for_each_domain(this_cpu, sd) {
+ __schedstat_inc(p->stats.nr_wakeups_remote);
+
+ guard(rcu)();
+ for_each_domain(rq->cpu, sd) {
if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
- schedstat_inc(sd, ttwu_wake_remote);
+ __schedstat_inc(sd->ttwu_wake_remote);
break;
}
}
- rcu_read_unlock();
}
if (wake_flags & WF_MIGRATED)
- schedstat_inc(p, se.statistics.nr_wakeups_migrate);
-
-#endif /* CONFIG_SMP */
+ __schedstat_inc(p->stats.nr_wakeups_migrate);
- schedstat_inc(rq, ttwu_count);
- schedstat_inc(p, se.statistics.nr_wakeups);
+ __schedstat_inc(rq->ttwu_count);
+ __schedstat_inc(p->stats.nr_wakeups);
if (wake_flags & WF_SYNC)
- schedstat_inc(p, se.statistics.nr_wakeups_sync);
-
-#endif /* CONFIG_SCHEDSTATS */
+ __schedstat_inc(p->stats.nr_wakeups_sync);
}
-static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
+/*
+ * Mark the task runnable.
+ */
+static inline void ttwu_do_wakeup(struct task_struct *p)
{
- activate_task(rq, p, en_flags);
- p->on_rq = TASK_ON_RQ_QUEUED;
-
- /* if a worker is waking up, notify workqueue */
- if (p->flags & PF_WQ_WORKER)
- wq_worker_waking_up(p, cpu_of(rq));
+ WRITE_ONCE(p->__state, TASK_RUNNING);
+ trace_sched_wakeup(p);
}
-/*
- * Mark the task runnable and perform wakeup-preemption.
- */
static void
-ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
+ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
+ struct rq_flags *rf)
{
- check_preempt_curr(rq, p, wake_flags);
- trace_sched_wakeup(p, true);
+ int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
- p->state = TASK_RUNNING;
-#ifdef CONFIG_SMP
- if (p->sched_class->task_woken)
+ lockdep_assert_rq_held(rq);
+
+ if (p->sched_contributes_to_load)
+ rq->nr_uninterruptible--;
+
+ if (wake_flags & WF_RQ_SELECTED)
+ en_flags |= ENQUEUE_RQ_SELECTED;
+ if (wake_flags & WF_MIGRATED)
+ en_flags |= ENQUEUE_MIGRATED;
+ else
+ if (p->in_iowait) {
+ delayacct_blkio_end(p);
+ atomic_dec(&task_rq(p)->nr_iowait);
+ }
+
+ activate_task(rq, p, en_flags);
+ wakeup_preempt(rq, p, wake_flags);
+
+ ttwu_do_wakeup(p);
+
+ if (p->sched_class->task_woken) {
+ /*
+ * Our task @p is fully woken up and running; so it's safe to
+ * drop the rq->lock, hereafter rq is only used for statistics.
+ */
+ rq_unpin_lock(rq, rf);
p->sched_class->task_woken(rq, p);
+ rq_repin_lock(rq, rf);
+ }
if (rq->idle_stamp) {
u64 delta = rq_clock(rq) - rq->idle_stamp;
@@ -1480,269 +3654,683 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
rq->idle_stamp = 0;
}
-#endif
-}
-
-static void
-ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
-{
-#ifdef CONFIG_SMP
- if (p->sched_contributes_to_load)
- rq->nr_uninterruptible--;
-#endif
-
- ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
- ttwu_do_wakeup(rq, p, wake_flags);
}
/*
- * Called in case the task @p isn't fully descheduled from its runqueue,
- * in this case we must do a remote wakeup. Its a 'light' wakeup though,
- * since all we need to do is flip p->state to TASK_RUNNING, since
- * the task is still ->on_rq.
+ * Consider @p being inside a wait loop:
+ *
+ * for (;;) {
+ * set_current_state(TASK_UNINTERRUPTIBLE);
+ *
+ * if (CONDITION)
+ * break;
+ *
+ * schedule();
+ * }
+ * __set_current_state(TASK_RUNNING);
+ *
+ * between set_current_state() and schedule(). In this case @p is still
+ * runnable, so all that needs doing is change p->state back to TASK_RUNNING in
+ * an atomic manner.
+ *
+ * By taking task_rq(p)->lock we serialize against schedule(), if @p->on_rq
+ * then schedule() must still happen and p->state can be changed to
+ * TASK_RUNNING. Otherwise we lost the race, schedule() has happened, and we
+ * need to do a full wakeup with enqueue.
+ *
+ * Returns: %true when the wakeup is done,
+ * %false otherwise.
*/
-static int ttwu_remote(struct task_struct *p, int wake_flags)
+static int ttwu_runnable(struct task_struct *p, int wake_flags)
{
+ struct rq_flags rf;
struct rq *rq;
int ret = 0;
- rq = __task_rq_lock(p);
+ rq = __task_rq_lock(p, &rf);
if (task_on_rq_queued(p)) {
- /* check_preempt_curr() may use rq clock */
update_rq_clock(rq);
- ttwu_do_wakeup(rq, p, wake_flags);
+ if (p->se.sched_delayed)
+ enqueue_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_DELAYED);
+ if (!task_on_cpu(rq, p)) {
+ /*
+ * When on_rq && !on_cpu the task is preempted, see if
+ * it should preempt the task that is current now.
+ */
+ wakeup_preempt(rq, p, wake_flags);
+ }
+ ttwu_do_wakeup(p);
ret = 1;
}
- __task_rq_unlock(rq);
+ __task_rq_unlock(rq, p, &rf);
return ret;
}
-#ifdef CONFIG_SMP
-void sched_ttwu_pending(void)
+void sched_ttwu_pending(void *arg)
{
+ struct llist_node *llist = arg;
struct rq *rq = this_rq();
- struct llist_node *llist = llist_del_all(&rq->wake_list);
- struct task_struct *p;
- unsigned long flags;
+ struct task_struct *p, *t;
+ struct rq_flags rf;
if (!llist)
return;
- raw_spin_lock_irqsave(&rq->lock, flags);
-
- while (llist) {
- p = llist_entry(llist, struct task_struct, wake_entry);
- llist = llist_next(llist);
- ttwu_do_activate(rq, p, 0);
- }
+ rq_lock_irqsave(rq, &rf);
+ update_rq_clock(rq);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
-}
+ llist_for_each_entry_safe(p, t, llist, wake_entry.llist) {
+ if (WARN_ON_ONCE(p->on_cpu))
+ smp_cond_load_acquire(&p->on_cpu, !VAL);
-void scheduler_ipi(void)
-{
- /*
- * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
- * TIF_NEED_RESCHED remotely (for the first time) will also send
- * this IPI.
- */
- preempt_fold_need_resched();
+ if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq)))
+ set_task_cpu(p, cpu_of(rq));
- if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
- return;
+ ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
+ }
/*
- * Not all reschedule IPI handlers call irq_enter/irq_exit, since
- * traditionally all their work was done from the interrupt return
- * path. Now that we actually do some work, we need to make sure
- * we do call them.
- *
- * Some archs already do call them, luckily irq_enter/exit nest
- * properly.
+ * Must be after enqueueing at least once task such that
+ * idle_cpu() does not observe a false-negative -- if it does,
+ * it is possible for select_idle_siblings() to stack a number
+ * of tasks on this CPU during that window.
*
- * Arguably we should visit all archs and update all handlers,
- * however a fair share of IPIs are still resched only so this would
- * somewhat pessimize the simple resched case.
+ * It is OK to clear ttwu_pending when another task pending.
+ * We will receive IPI after local IRQ enabled and then enqueue it.
+ * Since now nr_running > 0, idle_cpu() will always get correct result.
*/
- irq_enter();
- sched_ttwu_pending();
+ WRITE_ONCE(rq->ttwu_pending, 0);
+ rq_unlock_irqrestore(rq, &rf);
+}
- /*
- * Check if someone kicked us for doing the nohz idle load balance.
- */
- if (unlikely(got_nohz_idle_kick())) {
- this_rq()->idle_balance = 1;
- raise_softirq_irqoff(SCHED_SOFTIRQ);
+/*
+ * Prepare the scene for sending an IPI for a remote smp_call
+ *
+ * Returns true if the caller can proceed with sending the IPI.
+ * Returns false otherwise.
+ */
+bool call_function_single_prep_ipi(int cpu)
+{
+ if (set_nr_if_polling(cpu_rq(cpu)->idle)) {
+ trace_sched_wake_idle_without_ipi(cpu);
+ return false;
}
- irq_exit();
+
+ return true;
}
-static void ttwu_queue_remote(struct task_struct *p, int cpu)
+/*
+ * Queue a task on the target CPUs wake_list and wake the CPU via IPI if
+ * necessary. The wakee CPU on receipt of the IPI will queue the task
+ * via sched_ttwu_wakeup() for activation so the wakee incurs the cost
+ * of the wakeup instead of the waker.
+ */
+static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
{
struct rq *rq = cpu_rq(cpu);
- if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
- if (!set_nr_if_polling(rq->idle))
- smp_send_reschedule(cpu);
- else
- trace_sched_wake_idle_without_ipi(cpu);
- }
+ p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
+
+ WRITE_ONCE(rq->ttwu_pending, 1);
+#ifdef CONFIG_SMP
+ __smp_call_single_queue(cpu, &p->wake_entry.llist);
+#endif
}
void wake_up_if_idle(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- unsigned long flags;
-
- rcu_read_lock();
-
- if (!is_idle_task(rcu_dereference(rq->curr)))
- goto out;
- if (set_nr_if_polling(rq->idle)) {
- trace_sched_wake_idle_without_ipi(cpu);
- } else {
- raw_spin_lock_irqsave(&rq->lock, flags);
+ guard(rcu)();
+ if (is_idle_task(rcu_dereference(rq->curr))) {
+ guard(rq_lock_irqsave)(rq);
if (is_idle_task(rq->curr))
- smp_send_reschedule(cpu);
- /* Else cpu is not in idle, do nothing here */
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ resched_curr(rq);
}
+}
-out:
- rcu_read_unlock();
+bool cpus_equal_capacity(int this_cpu, int that_cpu)
+{
+ if (!sched_asym_cpucap_active())
+ return true;
+
+ if (this_cpu == that_cpu)
+ return true;
+
+ return arch_scale_cpu_capacity(this_cpu) == arch_scale_cpu_capacity(that_cpu);
}
bool cpus_share_cache(int this_cpu, int that_cpu)
{
+ if (this_cpu == that_cpu)
+ return true;
+
return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
}
-#endif /* CONFIG_SMP */
-static void ttwu_queue(struct task_struct *p, int cpu)
+/*
+ * Whether CPUs are share cache resources, which means LLC on non-cluster
+ * machines and LLC tag or L2 on machines with clusters.
+ */
+bool cpus_share_resources(int this_cpu, int that_cpu)
+{
+ if (this_cpu == that_cpu)
+ return true;
+
+ return per_cpu(sd_share_id, this_cpu) == per_cpu(sd_share_id, that_cpu);
+}
+
+static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
+{
+ /* See SCX_OPS_ALLOW_QUEUED_WAKEUP. */
+ if (!scx_allow_ttwu_queue(p))
+ return false;
+
+#ifdef CONFIG_SMP
+ if (p->sched_class == &stop_sched_class)
+ return false;
+#endif
+
+ /*
+ * Do not complicate things with the async wake_list while the CPU is
+ * in hotplug state.
+ */
+ if (!cpu_active(cpu))
+ return false;
+
+ /* Ensure the task will still be allowed to run on the CPU. */
+ if (!cpumask_test_cpu(cpu, p->cpus_ptr))
+ return false;
+
+ /*
+ * If the CPU does not share cache, then queue the task on the
+ * remote rqs wakelist to avoid accessing remote data.
+ */
+ if (!cpus_share_cache(smp_processor_id(), cpu))
+ return true;
+
+ if (cpu == smp_processor_id())
+ return false;
+
+ /*
+ * If the wakee cpu is idle, or the task is descheduling and the
+ * only running task on the CPU, then use the wakelist to offload
+ * the task activation to the idle (or soon-to-be-idle) CPU as
+ * the current CPU is likely busy. nr_running is checked to
+ * avoid unnecessary task stacking.
+ *
+ * Note that we can only get here with (wakee) p->on_rq=0,
+ * p->on_cpu can be whatever, we've done the dequeue, so
+ * the wakee has been accounted out of ->nr_running.
+ */
+ if (!cpu_rq(cpu)->nr_running)
+ return true;
+
+ return false;
+}
+
+static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
+{
+ if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(p, cpu)) {
+ sched_clock_cpu(cpu); /* Sync clocks across CPUs */
+ __ttwu_queue_wakelist(p, cpu, wake_flags);
+ return true;
+ }
+
+ return false;
+}
+
+static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
{
struct rq *rq = cpu_rq(cpu);
+ struct rq_flags rf;
-#if defined(CONFIG_SMP)
- if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
- sched_clock_cpu(cpu); /* sync clocks x-cpu */
- ttwu_queue_remote(p, cpu);
+ if (ttwu_queue_wakelist(p, cpu, wake_flags))
return;
+
+ rq_lock(rq, &rf);
+ update_rq_clock(rq);
+ ttwu_do_activate(rq, p, wake_flags, &rf);
+ rq_unlock(rq, &rf);
+}
+
+/*
+ * Invoked from try_to_wake_up() to check whether the task can be woken up.
+ *
+ * The caller holds p::pi_lock if p != current or has preemption
+ * disabled when p == current.
+ *
+ * The rules of saved_state:
+ *
+ * The related locking code always holds p::pi_lock when updating
+ * p::saved_state, which means the code is fully serialized in both cases.
+ *
+ * For PREEMPT_RT, the lock wait and lock wakeups happen via TASK_RTLOCK_WAIT.
+ * No other bits set. This allows to distinguish all wakeup scenarios.
+ *
+ * For FREEZER, the wakeup happens via TASK_FROZEN. No other bits set. This
+ * allows us to prevent early wakeup of tasks before they can be run on
+ * asymmetric ISA architectures (eg ARMv9).
+ */
+static __always_inline
+bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
+{
+ int match;
+
+ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) {
+ WARN_ON_ONCE((state & TASK_RTLOCK_WAIT) &&
+ state != TASK_RTLOCK_WAIT);
}
-#endif
- raw_spin_lock(&rq->lock);
- ttwu_do_activate(rq, p, 0);
- raw_spin_unlock(&rq->lock);
+ *success = !!(match = __task_state_match(p, state));
+
+ /*
+ * Saved state preserves the task state across blocking on
+ * an RT lock or TASK_FREEZABLE tasks. If the state matches,
+ * set p::saved_state to TASK_RUNNING, but do not wake the task
+ * because it waits for a lock wakeup or __thaw_task(). Also
+ * indicate success because from the regular waker's point of
+ * view this has succeeded.
+ *
+ * After acquiring the lock the task will restore p::__state
+ * from p::saved_state which ensures that the regular
+ * wakeup is not lost. The restore will also set
+ * p::saved_state to TASK_RUNNING so any further tests will
+ * not result in false positives vs. @success
+ */
+ if (match < 0)
+ p->saved_state = TASK_RUNNING;
+
+ return match > 0;
}
+/*
+ * Notes on Program-Order guarantees on SMP systems.
+ *
+ * MIGRATION
+ *
+ * The basic program-order guarantee on SMP systems is that when a task [t]
+ * migrates, all its activity on its old CPU [c0] happens-before any subsequent
+ * execution on its new CPU [c1].
+ *
+ * For migration (of runnable tasks) this is provided by the following means:
+ *
+ * A) UNLOCK of the rq(c0)->lock scheduling out task t
+ * B) migration for t is required to synchronize *both* rq(c0)->lock and
+ * rq(c1)->lock (if not at the same time, then in that order).
+ * C) LOCK of the rq(c1)->lock scheduling in task
+ *
+ * Release/acquire chaining guarantees that B happens after A and C after B.
+ * Note: the CPU doing B need not be c0 or c1
+ *
+ * Example:
+ *
+ * CPU0 CPU1 CPU2
+ *
+ * LOCK rq(0)->lock
+ * sched-out X
+ * sched-in Y
+ * UNLOCK rq(0)->lock
+ *
+ * LOCK rq(0)->lock // orders against CPU0
+ * dequeue X
+ * UNLOCK rq(0)->lock
+ *
+ * LOCK rq(1)->lock
+ * enqueue X
+ * UNLOCK rq(1)->lock
+ *
+ * LOCK rq(1)->lock // orders against CPU2
+ * sched-out Z
+ * sched-in X
+ * UNLOCK rq(1)->lock
+ *
+ *
+ * BLOCKING -- aka. SLEEP + WAKEUP
+ *
+ * For blocking we (obviously) need to provide the same guarantee as for
+ * migration. However the means are completely different as there is no lock
+ * chain to provide order. Instead we do:
+ *
+ * 1) smp_store_release(X->on_cpu, 0) -- finish_task()
+ * 2) smp_cond_load_acquire(!X->on_cpu) -- try_to_wake_up()
+ *
+ * Example:
+ *
+ * CPU0 (schedule) CPU1 (try_to_wake_up) CPU2 (schedule)
+ *
+ * LOCK rq(0)->lock LOCK X->pi_lock
+ * dequeue X
+ * sched-out X
+ * smp_store_release(X->on_cpu, 0);
+ *
+ * smp_cond_load_acquire(&X->on_cpu, !VAL);
+ * X->state = WAKING
+ * set_task_cpu(X,2)
+ *
+ * LOCK rq(2)->lock
+ * enqueue X
+ * X->state = RUNNING
+ * UNLOCK rq(2)->lock
+ *
+ * LOCK rq(2)->lock // orders against CPU1
+ * sched-out Z
+ * sched-in X
+ * UNLOCK rq(2)->lock
+ *
+ * UNLOCK X->pi_lock
+ * UNLOCK rq(0)->lock
+ *
+ *
+ * However, for wakeups there is a second guarantee we must provide, namely we
+ * must ensure that CONDITION=1 done by the caller can not be reordered with
+ * accesses to the task state; see try_to_wake_up() and set_current_state().
+ */
+
/**
* try_to_wake_up - wake up a thread
* @p: the thread to be awakened
* @state: the mask of task states that can be woken
* @wake_flags: wake modifier flags (WF_*)
*
- * Put it on the run-queue if it's not already there. The "current"
- * thread is always on the run-queue (except when the actual
- * re-schedule is in progress), and as such you're allowed to do
- * the simpler "current->state = TASK_RUNNING" to mark yourself
- * runnable without the overhead of this.
+ * Conceptually does:
+ *
+ * If (@state & @p->state) @p->state = TASK_RUNNING.
+ *
+ * If the task was not queued/runnable, also place it back on a runqueue.
+ *
+ * This function is atomic against schedule() which would dequeue the task.
*
- * Return: %true if @p was woken up, %false if it was already running.
- * or @state didn't match @p's state.
+ * It issues a full memory barrier before accessing @p->state, see the comment
+ * with set_current_state().
+ *
+ * Uses p->pi_lock to serialize against concurrent wake-ups.
+ *
+ * Relies on p->pi_lock stabilizing:
+ * - p->sched_class
+ * - p->cpus_ptr
+ * - p->sched_task_group
+ * in order to do migration, see its use of select_task_rq()/set_task_cpu().
+ *
+ * Tries really hard to only take one task_rq(p)->lock for performance.
+ * Takes rq->lock in:
+ * - ttwu_runnable() -- old rq, unavoidable, see comment there;
+ * - ttwu_queue() -- new rq, for enqueue of the task;
+ * - psi_ttwu_dequeue() -- much sadness :-( accounting will kill us.
+ *
+ * As a consequence we race really badly with just about everything. See the
+ * many memory barriers and their comments for details.
+ *
+ * Return: %true if @p->state changes (an actual wakeup was done),
+ * %false otherwise.
*/
-static int
-try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
+int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
{
- unsigned long flags;
+ guard(preempt)();
int cpu, success = 0;
+ wake_flags |= WF_TTWU;
+
+ if (p == current) {
+ /*
+ * We're waking current, this means 'p->on_rq' and 'task_cpu(p)
+ * == smp_processor_id()'. Together this means we can special
+ * case the whole 'p->on_rq && ttwu_runnable()' case below
+ * without taking any locks.
+ *
+ * Specifically, given current runs ttwu() we must be before
+ * schedule()'s block_task(), as such this must not observe
+ * sched_delayed.
+ *
+ * In particular:
+ * - we rely on Program-Order guarantees for all the ordering,
+ * - we're serialized against set_special_state() by virtue of
+ * it disabling IRQs (this allows not taking ->pi_lock).
+ */
+ WARN_ON_ONCE(p->se.sched_delayed);
+ if (!ttwu_state_match(p, state, &success))
+ goto out;
+
+ trace_sched_waking(p);
+ ttwu_do_wakeup(p);
+ goto out;
+ }
+
/*
* If we are going to wake up a thread waiting for CONDITION we
* need to ensure that CONDITION=1 done by the caller can not be
- * reordered with p->state check below. This pairs with mb() in
- * set_current_state() the waiting thread does.
+ * reordered with p->state check below. This pairs with smp_store_mb()
+ * in set_current_state() that the waiting thread does.
*/
- smp_mb__before_spinlock();
- raw_spin_lock_irqsave(&p->pi_lock, flags);
- if (!(p->state & state))
- goto out;
+ scoped_guard (raw_spinlock_irqsave, &p->pi_lock) {
+ smp_mb__after_spinlock();
+ if (!ttwu_state_match(p, state, &success))
+ break;
- success = 1; /* we're going to change ->state */
- cpu = task_cpu(p);
+ trace_sched_waking(p);
- if (p->on_rq && ttwu_remote(p, wake_flags))
- goto stat;
+ /*
+ * Ensure we load p->on_rq _after_ p->state, otherwise it would
+ * be possible to, falsely, observe p->on_rq == 0 and get stuck
+ * in smp_cond_load_acquire() below.
+ *
+ * sched_ttwu_pending() try_to_wake_up()
+ * STORE p->on_rq = 1 LOAD p->state
+ * UNLOCK rq->lock
+ *
+ * __schedule() (switch to task 'p')
+ * LOCK rq->lock smp_rmb();
+ * smp_mb__after_spinlock();
+ * UNLOCK rq->lock
+ *
+ * [task p]
+ * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq
+ *
+ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
+ * __schedule(). See the comment for smp_mb__after_spinlock().
+ *
+ * A similar smp_rmb() lives in __task_needs_rq_lock().
+ */
+ smp_rmb();
+ if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
+ break;
-#ifdef CONFIG_SMP
- /*
- * If the owning (remote) cpu is still in the middle of schedule() with
- * this task as prev, wait until its done referencing the task.
- */
- while (p->on_cpu)
- cpu_relax();
- /*
- * Pairs with the smp_wmb() in finish_lock_switch().
- */
- smp_rmb();
+ /*
+ * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
+ * possible to, falsely, observe p->on_cpu == 0.
+ *
+ * One must be running (->on_cpu == 1) in order to remove oneself
+ * from the runqueue.
+ *
+ * __schedule() (switch to task 'p') try_to_wake_up()
+ * STORE p->on_cpu = 1 LOAD p->on_rq
+ * UNLOCK rq->lock
+ *
+ * __schedule() (put 'p' to sleep)
+ * LOCK rq->lock smp_rmb();
+ * smp_mb__after_spinlock();
+ * STORE p->on_rq = 0 LOAD p->on_cpu
+ *
+ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
+ * __schedule(). See the comment for smp_mb__after_spinlock().
+ *
+ * Form a control-dep-acquire with p->on_rq == 0 above, to ensure
+ * schedule()'s block_task() has 'happened' and p will no longer
+ * care about it's own p->state. See the comment in __schedule().
+ */
+ smp_acquire__after_ctrl_dep();
- p->sched_contributes_to_load = !!task_contributes_to_load(p);
- p->state = TASK_WAKING;
+ /*
+ * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq
+ * == 0), which means we need to do an enqueue, change p->state to
+ * TASK_WAKING such that we can unlock p->pi_lock before doing the
+ * enqueue, such as ttwu_queue_wakelist().
+ */
+ WRITE_ONCE(p->__state, TASK_WAKING);
- if (p->sched_class->task_waking)
- p->sched_class->task_waking(p);
+ /*
+ * If the owning (remote) CPU is still in the middle of schedule() with
+ * this task as prev, considering queueing p on the remote CPUs wake_list
+ * which potentially sends an IPI instead of spinning on p->on_cpu to
+ * let the waker make forward progress. This is safe because IRQs are
+ * disabled and the IPI will deliver after on_cpu is cleared.
+ *
+ * Ensure we load task_cpu(p) after p->on_cpu:
+ *
+ * set_task_cpu(p, cpu);
+ * STORE p->cpu = @cpu
+ * __schedule() (switch to task 'p')
+ * LOCK rq->lock
+ * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu)
+ * STORE p->on_cpu = 1 LOAD p->cpu
+ *
+ * to ensure we observe the correct CPU on which the task is currently
+ * scheduling.
+ */
+ if (smp_load_acquire(&p->on_cpu) &&
+ ttwu_queue_wakelist(p, task_cpu(p), wake_flags))
+ break;
- cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
- if (task_cpu(p) != cpu) {
- wake_flags |= WF_MIGRATED;
- set_task_cpu(p, cpu);
- }
-#endif /* CONFIG_SMP */
+ /*
+ * If the owning (remote) CPU is still in the middle of schedule() with
+ * this task as prev, wait until it's done referencing the task.
+ *
+ * Pairs with the smp_store_release() in finish_task().
+ *
+ * This ensures that tasks getting woken will be fully ordered against
+ * their previous state and preserve Program Order.
+ */
+ smp_cond_load_acquire(&p->on_cpu, !VAL);
+
+ cpu = select_task_rq(p, p->wake_cpu, &wake_flags);
+ if (task_cpu(p) != cpu) {
+ if (p->in_iowait) {
+ delayacct_blkio_end(p);
+ atomic_dec(&task_rq(p)->nr_iowait);
+ }
+
+ wake_flags |= WF_MIGRATED;
+ psi_ttwu_dequeue(p);
+ set_task_cpu(p, cpu);
+ }
- ttwu_queue(p, cpu);
-stat:
- ttwu_stat(p, cpu, wake_flags);
+ ttwu_queue(p, cpu, wake_flags);
+ }
out:
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ if (success)
+ ttwu_stat(p, task_cpu(p), wake_flags);
return success;
}
+static bool __task_needs_rq_lock(struct task_struct *p)
+{
+ unsigned int state = READ_ONCE(p->__state);
+
+ /*
+ * Since pi->lock blocks try_to_wake_up(), we don't need rq->lock when
+ * the task is blocked. Make sure to check @state since ttwu() can drop
+ * locks at the end, see ttwu_queue_wakelist().
+ */
+ if (state == TASK_RUNNING || state == TASK_WAKING)
+ return true;
+
+ /*
+ * Ensure we load p->on_rq after p->__state, otherwise it would be
+ * possible to, falsely, observe p->on_rq == 0.
+ *
+ * See try_to_wake_up() for a longer comment.
+ */
+ smp_rmb();
+ if (p->on_rq)
+ return true;
+
+ /*
+ * Ensure the task has finished __schedule() and will not be referenced
+ * anymore. Again, see try_to_wake_up() for a longer comment.
+ */
+ smp_rmb();
+ smp_cond_load_acquire(&p->on_cpu, !VAL);
+
+ return false;
+}
+
/**
- * try_to_wake_up_local - try to wake up a local task with rq lock held
- * @p: the thread to be awakened
+ * task_call_func - Invoke a function on task in fixed state
+ * @p: Process for which the function is to be invoked, can be @current.
+ * @func: Function to invoke.
+ * @arg: Argument to function.
*
- * Put @p on the run-queue if it's not already there. The caller must
- * ensure that this_rq() is locked, @p is bound to this_rq() and not
- * the current task.
+ * Fix the task in it's current state by avoiding wakeups and or rq operations
+ * and call @func(@arg) on it. This function can use task_is_runnable() and
+ * task_curr() to work out what the state is, if required. Given that @func
+ * can be invoked with a runqueue lock held, it had better be quite
+ * lightweight.
+ *
+ * Returns:
+ * Whatever @func returns
*/
-static void try_to_wake_up_local(struct task_struct *p)
+int task_call_func(struct task_struct *p, task_call_f func, void *arg)
{
- struct rq *rq = task_rq(p);
+ struct rq *rq = NULL;
+ struct rq_flags rf;
+ int ret;
- if (WARN_ON_ONCE(rq != this_rq()) ||
- WARN_ON_ONCE(p == current))
- return;
+ raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
- lockdep_assert_held(&rq->lock);
+ if (__task_needs_rq_lock(p))
+ rq = __task_rq_lock(p, &rf);
- if (!raw_spin_trylock(&p->pi_lock)) {
- raw_spin_unlock(&rq->lock);
- raw_spin_lock(&p->pi_lock);
- raw_spin_lock(&rq->lock);
- }
+ /*
+ * At this point the task is pinned; either:
+ * - blocked and we're holding off wakeups (pi->lock)
+ * - woken, and we're holding off enqueue (rq->lock)
+ * - queued, and we're holding off schedule (rq->lock)
+ * - running, and we're holding off de-schedule (rq->lock)
+ *
+ * The called function (@func) can use: task_curr(), p->on_rq and
+ * p->__state to differentiate between these states.
+ */
+ ret = func(p, arg);
- if (!(p->state & TASK_NORMAL))
- goto out;
+ if (rq)
+ __task_rq_unlock(rq, p, &rf);
- if (!task_on_rq_queued(p))
- ttwu_activate(rq, p, ENQUEUE_WAKEUP);
+ raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
+ return ret;
+}
- ttwu_do_wakeup(rq, p, 0);
- ttwu_stat(p, smp_processor_id(), 0);
-out:
- raw_spin_unlock(&p->pi_lock);
+/**
+ * cpu_curr_snapshot - Return a snapshot of the currently running task
+ * @cpu: The CPU on which to snapshot the task.
+ *
+ * Returns the task_struct pointer of the task "currently" running on
+ * the specified CPU.
+ *
+ * If the specified CPU was offline, the return value is whatever it
+ * is, perhaps a pointer to the task_struct structure of that CPU's idle
+ * task, but there is no guarantee. Callers wishing a useful return
+ * value must take some action to ensure that the specified CPU remains
+ * online throughout.
+ *
+ * This function executes full memory barriers before and after fetching
+ * the pointer, which permits the caller to confine this function's fetch
+ * with respect to the caller's accesses to other shared variables.
+ */
+struct task_struct *cpu_curr_snapshot(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ struct task_struct *t;
+ struct rq_flags rf;
+
+ rq_lock_irqsave(rq, &rf);
+ smp_mb__after_spinlock(); /* Pairing determined by caller's synchronization design. */
+ t = rcu_dereference(cpu_curr(cpu));
+ rq_unlock_irqrestore(rq, &rf);
+ smp_mb(); /* Pairing determined by caller's synchronization design. */
+
+ return t;
}
/**
@@ -1754,12 +4342,10 @@ out:
*
* Return: 1 if the process was woken up, 0 if it was already running.
*
- * It may be assumed that this function implies a write memory barrier before
- * changing the task state if and only if any tasks are woken up.
+ * This function executes a full memory barrier before accessing the task state.
*/
int wake_up_process(struct task_struct *p)
{
- WARN_ON(task_is_stopped_or_traced(p));
return try_to_wake_up(p, TASK_NORMAL, 0);
}
EXPORT_SYMBOL(wake_up_process);
@@ -1770,30 +4356,13 @@ int wake_up_state(struct task_struct *p, unsigned int state)
}
/*
- * This function clears the sched_dl_entity static params.
- */
-void __dl_clear_params(struct task_struct *p)
-{
- struct sched_dl_entity *dl_se = &p->dl;
-
- dl_se->dl_runtime = 0;
- dl_se->dl_deadline = 0;
- dl_se->dl_period = 0;
- dl_se->flags = 0;
- dl_se->dl_bw = 0;
-
- dl_se->dl_throttled = 0;
- dl_se->dl_new = 1;
- dl_se->dl_yielded = 0;
-}
-
-/*
* Perform scheduler related setup for a newly forked process p.
* p is forked by current.
*
- * __sched_fork() is basic setup used by init_idle() too:
+ * __sched_fork() is basic setup which is also used by sched_init() to
+ * initialize the boot CPU's idle task.
*/
-static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
+static void __sched_fork(u64 clone_flags, struct task_struct *p)
{
p->on_rq = 0;
@@ -1803,73 +4372,158 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->se.prev_sum_exec_runtime = 0;
p->se.nr_migrations = 0;
p->se.vruntime = 0;
-#ifdef CONFIG_SMP
- p->se.avg.decay_count = 0;
-#endif
+ p->se.vlag = 0;
INIT_LIST_HEAD(&p->se.group_node);
+ /* A delayed task cannot be in clone(). */
+ WARN_ON_ONCE(p->se.sched_delayed);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ p->se.cfs_rq = NULL;
+#ifdef CONFIG_CFS_BANDWIDTH
+ init_cfs_throttle_work(p);
+#endif
+#endif
+
#ifdef CONFIG_SCHEDSTATS
- memset(&p->se.statistics, 0, sizeof(p->se.statistics));
+ /* Even if schedstat is disabled, there should not be garbage */
+ memset(&p->stats, 0, sizeof(p->stats));
#endif
- RB_CLEAR_NODE(&p->dl.rb_node);
- init_dl_task_timer(&p->dl);
- __dl_clear_params(p);
+ init_dl_entity(&p->dl);
INIT_LIST_HEAD(&p->rt.run_list);
+ p->rt.timeout = 0;
+ p->rt.time_slice = sched_rr_timeslice;
+ p->rt.on_rq = 0;
+ p->rt.on_list = 0;
+
+#ifdef CONFIG_SCHED_CLASS_EXT
+ init_scx_entity(&p->scx);
+#endif
#ifdef CONFIG_PREEMPT_NOTIFIERS
INIT_HLIST_HEAD(&p->preempt_notifiers);
#endif
+#ifdef CONFIG_COMPACTION
+ p->capture_control = NULL;
+#endif
+ init_numa_balancing(clone_flags, p);
+ p->wake_entry.u_flags = CSD_TYPE_TTWU;
+ p->migration_pending = NULL;
+}
+
+DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
+
#ifdef CONFIG_NUMA_BALANCING
- if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
- p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
- p->mm->numa_scan_seq = 0;
- }
- if (clone_flags & CLONE_VM)
- p->numa_preferred_nid = current->numa_preferred_nid;
+int sysctl_numa_balancing_mode;
+
+static void __set_numabalancing_state(bool enabled)
+{
+ if (enabled)
+ static_branch_enable(&sched_numa_balancing);
+ else
+ static_branch_disable(&sched_numa_balancing);
+}
+
+void set_numabalancing_state(bool enabled)
+{
+ if (enabled)
+ sysctl_numa_balancing_mode = NUMA_BALANCING_NORMAL;
else
- p->numa_preferred_nid = -1;
+ sysctl_numa_balancing_mode = NUMA_BALANCING_DISABLED;
+ __set_numabalancing_state(enabled);
+}
- p->node_stamp = 0ULL;
- p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
- p->numa_scan_period = sysctl_numa_balancing_scan_delay;
- p->numa_work.next = &p->numa_work;
- p->numa_faults = NULL;
- p->last_task_numa_placement = 0;
- p->last_sum_exec_runtime = 0;
+#ifdef CONFIG_PROC_SYSCTL
+static void reset_memory_tiering(void)
+{
+ struct pglist_data *pgdat;
- p->numa_group = NULL;
-#endif /* CONFIG_NUMA_BALANCING */
+ for_each_online_pgdat(pgdat) {
+ pgdat->nbp_threshold = 0;
+ pgdat->nbp_th_nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
+ pgdat->nbp_th_start = jiffies_to_msecs(jiffies);
+ }
}
-#ifdef CONFIG_NUMA_BALANCING
-#ifdef CONFIG_SCHED_DEBUG
-void set_numabalancing_state(bool enabled)
+static int sysctl_numa_balancing(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table t;
+ int err;
+ int state = sysctl_numa_balancing_mode;
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ t = *table;
+ t.data = &state;
+ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
+ if (err < 0)
+ return err;
+ if (write) {
+ if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
+ (state & NUMA_BALANCING_MEMORY_TIERING))
+ reset_memory_tiering();
+ sysctl_numa_balancing_mode = state;
+ __set_numabalancing_state(state);
+ }
+ return err;
+}
+#endif /* CONFIG_PROC_SYSCTL */
+#endif /* CONFIG_NUMA_BALANCING */
+
+#ifdef CONFIG_SCHEDSTATS
+
+DEFINE_STATIC_KEY_FALSE(sched_schedstats);
+
+static void set_schedstats(bool enabled)
{
if (enabled)
- sched_feat_set("NUMA");
+ static_branch_enable(&sched_schedstats);
else
- sched_feat_set("NO_NUMA");
+ static_branch_disable(&sched_schedstats);
}
-#else
-__read_mostly bool numabalancing_enabled;
-void set_numabalancing_state(bool enabled)
+void force_schedstat_enabled(void)
{
- numabalancing_enabled = enabled;
+ if (!schedstat_enabled()) {
+ pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n");
+ static_branch_enable(&sched_schedstats);
+ }
}
-#endif /* CONFIG_SCHED_DEBUG */
+
+static int __init setup_schedstats(char *str)
+{
+ int ret = 0;
+ if (!str)
+ goto out;
+
+ if (!strcmp(str, "enable")) {
+ set_schedstats(true);
+ ret = 1;
+ } else if (!strcmp(str, "disable")) {
+ set_schedstats(false);
+ ret = 1;
+ }
+out:
+ if (!ret)
+ pr_warn("Unable to parse schedstats=\n");
+
+ return ret;
+}
+__setup("schedstats=", setup_schedstats);
#ifdef CONFIG_PROC_SYSCTL
-int sysctl_numa_balancing(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+static int sysctl_schedstats(const struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
int err;
- int state = numabalancing_enabled;
+ int state = static_branch_likely(&sched_schedstats);
if (write && !capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -1880,33 +4534,88 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
if (err < 0)
return err;
if (write)
- set_numabalancing_state(state);
+ set_schedstats(state);
return err;
}
-#endif
-#endif
+#endif /* CONFIG_PROC_SYSCTL */
+#endif /* CONFIG_SCHEDSTATS */
+
+#ifdef CONFIG_SYSCTL
+static const struct ctl_table sched_core_sysctls[] = {
+#ifdef CONFIG_SCHEDSTATS
+ {
+ .procname = "sched_schedstats",
+ .data = NULL,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sysctl_schedstats,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+#endif /* CONFIG_SCHEDSTATS */
+#ifdef CONFIG_UCLAMP_TASK
+ {
+ .procname = "sched_util_clamp_min",
+ .data = &sysctl_sched_uclamp_util_min,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sysctl_sched_uclamp_handler,
+ },
+ {
+ .procname = "sched_util_clamp_max",
+ .data = &sysctl_sched_uclamp_util_max,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sysctl_sched_uclamp_handler,
+ },
+ {
+ .procname = "sched_util_clamp_min_rt_default",
+ .data = &sysctl_sched_uclamp_util_min_rt_default,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sysctl_sched_uclamp_handler,
+ },
+#endif /* CONFIG_UCLAMP_TASK */
+#ifdef CONFIG_NUMA_BALANCING
+ {
+ .procname = "numa_balancing",
+ .data = NULL, /* filled in by handler */
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sysctl_numa_balancing,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_FOUR,
+ },
+#endif /* CONFIG_NUMA_BALANCING */
+};
+static int __init sched_core_sysctl_init(void)
+{
+ register_sysctl_init("kernel", sched_core_sysctls);
+ return 0;
+}
+late_initcall(sched_core_sysctl_init);
+#endif /* CONFIG_SYSCTL */
/*
* fork()/clone()-time setup:
*/
-int sched_fork(unsigned long clone_flags, struct task_struct *p)
+int sched_fork(u64 clone_flags, struct task_struct *p)
{
- unsigned long flags;
- int cpu = get_cpu();
-
__sched_fork(clone_flags, p);
/*
- * We mark the process as running here. This guarantees that
+ * We mark the process as NEW here. This guarantees that
* nobody will actually run it, and a signal or other external
* event cannot wake it up and insert it on the runqueue either.
*/
- p->state = TASK_RUNNING;
+ p->__state = TASK_NEW;
/*
* Make sure we do not leak PI boosting priority to the child.
*/
p->prio = current->normal_prio;
+ uclamp_fork(p);
+
/*
* Revert to default priority/policy on fork if requested.
*/
@@ -1918,8 +4627,10 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
} else if (PRIO_TO_NICE(p->static_prio) < 0)
p->static_prio = NICE_TO_PRIO(0);
- p->prio = p->normal_prio = __normal_prio(p);
- set_load_weight(p);
+ p->prio = p->normal_prio = p->static_prio;
+ set_load_weight(p, false);
+ p->se.custom_slice = 0;
+ p->se.slice = sysctl_sched_base_slice;
/*
* We don't need the reset flag anymore after the fork. It has
@@ -1928,145 +4639,93 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
p->sched_reset_on_fork = 0;
}
- if (dl_prio(p->prio)) {
- put_cpu();
+ if (dl_prio(p->prio))
return -EAGAIN;
- } else if (rt_prio(p->prio)) {
+
+ scx_pre_fork(p);
+
+ if (rt_prio(p->prio)) {
p->sched_class = &rt_sched_class;
+#ifdef CONFIG_SCHED_CLASS_EXT
+ } else if (task_should_scx(p->policy)) {
+ p->sched_class = &ext_sched_class;
+#endif
} else {
p->sched_class = &fair_sched_class;
}
- if (p->sched_class->task_fork)
- p->sched_class->task_fork(p);
+ init_entity_runnable_average(&p->se);
- /*
- * The child is not yet in the pid-hash so no cgroup attach races,
- * and the cgroup is pinned to this child due to cgroup_fork()
- * is ran before sched_fork().
- *
- * Silence PROVE_RCU.
- */
- raw_spin_lock_irqsave(&p->pi_lock, flags);
- set_task_cpu(p, cpu);
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
-#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
+#ifdef CONFIG_SCHED_INFO
if (likely(sched_info_on()))
memset(&p->sched_info, 0, sizeof(p->sched_info));
#endif
-#if defined(CONFIG_SMP)
p->on_cpu = 0;
-#endif
init_task_preempt_count(p);
-#ifdef CONFIG_SMP
plist_node_init(&p->pushable_tasks, MAX_PRIO);
RB_CLEAR_NODE(&p->pushable_dl_tasks);
-#endif
- put_cpu();
return 0;
}
-unsigned long to_ratio(u64 period, u64 runtime)
+int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs)
{
- if (runtime == RUNTIME_INF)
- return 1ULL << 20;
+ unsigned long flags;
/*
- * Doing this here saves a lot of checks in all
- * the calling paths, and returning zero seems
- * safe for them anyway.
+ * Because we're not yet on the pid-hash, p->pi_lock isn't strictly
+ * required yet, but lockdep gets upset if rules are violated.
*/
- if (period == 0)
- return 0;
-
- return div64_u64(runtime << 20, period);
-}
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+#ifdef CONFIG_CGROUP_SCHED
+ if (1) {
+ struct task_group *tg;
+ tg = container_of(kargs->cset->subsys[cpu_cgrp_id],
+ struct task_group, css);
+ tg = autogroup_task_group(p, tg);
+ p->sched_task_group = tg;
+ }
+#endif
+ /*
+ * We're setting the CPU for the first time, we don't migrate,
+ * so use __set_task_cpu().
+ */
+ __set_task_cpu(p, smp_processor_id());
+ if (p->sched_class->task_fork)
+ p->sched_class->task_fork(p);
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
-#ifdef CONFIG_SMP
-inline struct dl_bw *dl_bw_of(int i)
-{
- rcu_lockdep_assert(rcu_read_lock_sched_held(),
- "sched RCU must be held");
- return &cpu_rq(i)->rd->dl_bw;
+ return scx_fork(p);
}
-static inline int dl_bw_cpus(int i)
-{
- struct root_domain *rd = cpu_rq(i)->rd;
- int cpus = 0;
-
- rcu_lockdep_assert(rcu_read_lock_sched_held(),
- "sched RCU must be held");
- for_each_cpu_and(i, rd->span, cpu_active_mask)
- cpus++;
-
- return cpus;
-}
-#else
-inline struct dl_bw *dl_bw_of(int i)
+void sched_cancel_fork(struct task_struct *p)
{
- return &cpu_rq(i)->dl.dl_bw;
+ scx_cancel_fork(p);
}
-static inline int dl_bw_cpus(int i)
+void sched_post_fork(struct task_struct *p)
{
- return 1;
+ uclamp_post_fork(p);
+ scx_post_fork(p);
}
-#endif
-/*
- * We must be sure that accepting a new task (or allowing changing the
- * parameters of an existing one) is consistent with the bandwidth
- * constraints. If yes, this function also accordingly updates the currently
- * allocated bandwidth to reflect the new situation.
- *
- * This function is called while holding p's rq->lock.
- *
- * XXX we should delay bw change until the task's 0-lag point, see
- * __setparam_dl().
- */
-static int dl_overflow(struct task_struct *p, int policy,
- const struct sched_attr *attr)
+unsigned long to_ratio(u64 period, u64 runtime)
{
-
- struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
- u64 period = attr->sched_period ?: attr->sched_deadline;
- u64 runtime = attr->sched_runtime;
- u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
- int cpus, err = -1;
-
- if (new_bw == p->dl.dl_bw)
- return 0;
+ if (runtime == RUNTIME_INF)
+ return BW_UNIT;
/*
- * Either if a task, enters, leave, or stays -deadline but changes
- * its parameters, we may need to update accordingly the total
- * allocated bandwidth of the container.
+ * Doing this here saves a lot of checks in all
+ * the calling paths, and returning zero seems
+ * safe for them anyway.
*/
- raw_spin_lock(&dl_b->lock);
- cpus = dl_bw_cpus(task_cpu(p));
- if (dl_policy(policy) && !task_has_dl_policy(p) &&
- !__dl_overflow(dl_b, cpus, 0, new_bw)) {
- __dl_add(dl_b, new_bw);
- err = 0;
- } else if (dl_policy(policy) && task_has_dl_policy(p) &&
- !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
- __dl_clear(dl_b, p->dl.dl_bw);
- __dl_add(dl_b, new_bw);
- err = 0;
- } else if (!dl_policy(policy) && task_has_dl_policy(p)) {
- __dl_clear(dl_b, p->dl.dl_bw);
- err = 0;
- }
- raw_spin_unlock(&dl_b->lock);
+ if (period == 0)
+ return 0;
- return err;
+ return div64_u64(runtime << BW_SHIFT, period);
}
-extern void init_dl_bw(struct dl_bw *dl_b);
-
/*
* wake_up_new_task - wake up a newly created task for the first time.
*
@@ -2076,41 +4735,66 @@ extern void init_dl_bw(struct dl_bw *dl_b);
*/
void wake_up_new_task(struct task_struct *p)
{
- unsigned long flags;
+ struct rq_flags rf;
struct rq *rq;
+ int wake_flags = WF_FORK;
- raw_spin_lock_irqsave(&p->pi_lock, flags);
-#ifdef CONFIG_SMP
+ raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
+ WRITE_ONCE(p->__state, TASK_RUNNING);
/*
* Fork balancing, do it here and not earlier because:
- * - cpus_allowed can change in the fork path
- * - any previously selected cpu might disappear through hotplug
+ * - cpus_ptr can change in the fork path
+ * - any previously selected CPU might disappear through hotplug
+ *
+ * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
+ * as we're not fully set-up yet.
*/
- set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
-#endif
+ p->recent_used_cpu = task_cpu(p);
+ __set_task_cpu(p, select_task_rq(p, task_cpu(p), &wake_flags));
+ rq = __task_rq_lock(p, &rf);
+ update_rq_clock(rq);
+ post_init_entity_util_avg(p);
- /* Initialize new task's runnable average */
- init_task_runnable_average(p);
- rq = __task_rq_lock(p);
- activate_task(rq, p, 0);
- p->on_rq = TASK_ON_RQ_QUEUED;
- trace_sched_wakeup_new(p, true);
- check_preempt_curr(rq, p, WF_FORK);
-#ifdef CONFIG_SMP
- if (p->sched_class->task_woken)
+ activate_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_INITIAL);
+ trace_sched_wakeup_new(p);
+ wakeup_preempt(rq, p, wake_flags);
+ if (p->sched_class->task_woken) {
+ /*
+ * Nothing relies on rq->lock after this, so it's fine to
+ * drop it.
+ */
+ rq_unpin_lock(rq, &rf);
p->sched_class->task_woken(rq, p);
-#endif
- task_rq_unlock(rq, p, &flags);
+ rq_repin_lock(rq, &rf);
+ }
+ task_rq_unlock(rq, p, &rf);
}
#ifdef CONFIG_PREEMPT_NOTIFIERS
+static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key);
+
+void preempt_notifier_inc(void)
+{
+ static_branch_inc(&preempt_notifier_key);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_inc);
+
+void preempt_notifier_dec(void)
+{
+ static_branch_dec(&preempt_notifier_key);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_dec);
+
/**
* preempt_notifier_register - tell me when current is being preempted & rescheduled
* @notifier: notifier struct to register
*/
void preempt_notifier_register(struct preempt_notifier *notifier)
{
+ if (!static_branch_unlikely(&preempt_notifier_key))
+ WARN(1, "registering preempt_notifier while notifiers disabled\n");
+
hlist_add_head(&notifier->link, &current->preempt_notifiers);
}
EXPORT_SYMBOL_GPL(preempt_notifier_register);
@@ -2119,7 +4803,7 @@ EXPORT_SYMBOL_GPL(preempt_notifier_register);
* preempt_notifier_unregister - no longer interested in preemption notifications
* @notifier: notifier struct to unregister
*
- * This is safe to call from within a preemption notifier.
+ * This is *not* safe to call from within a preemption notifier.
*/
void preempt_notifier_unregister(struct preempt_notifier *notifier)
{
@@ -2127,7 +4811,7 @@ void preempt_notifier_unregister(struct preempt_notifier *notifier)
}
EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
-static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
struct preempt_notifier *notifier;
@@ -2135,9 +4819,15 @@ static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
notifier->ops->sched_in(notifier, raw_smp_processor_id());
}
+static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+{
+ if (static_branch_unlikely(&preempt_notifier_key))
+ __fire_sched_in_preempt_notifiers(curr);
+}
+
static void
-fire_sched_out_preempt_notifiers(struct task_struct *curr,
- struct task_struct *next)
+__fire_sched_out_preempt_notifiers(struct task_struct *curr,
+ struct task_struct *next)
{
struct preempt_notifier *notifier;
@@ -2145,19 +4835,193 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
notifier->ops->sched_out(notifier, next);
}
-#else /* !CONFIG_PREEMPT_NOTIFIERS */
+static __always_inline void
+fire_sched_out_preempt_notifiers(struct task_struct *curr,
+ struct task_struct *next)
+{
+ if (static_branch_unlikely(&preempt_notifier_key))
+ __fire_sched_out_preempt_notifiers(curr, next);
+}
+
+#else /* !CONFIG_PREEMPT_NOTIFIERS: */
-static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
}
-static void
+static inline void
fire_sched_out_preempt_notifiers(struct task_struct *curr,
struct task_struct *next)
{
}
-#endif /* CONFIG_PREEMPT_NOTIFIERS */
+#endif /* !CONFIG_PREEMPT_NOTIFIERS */
+
+static inline void prepare_task(struct task_struct *next)
+{
+ /*
+ * Claim the task as running, we do this before switching to it
+ * such that any running task will have this set.
+ *
+ * See the smp_load_acquire(&p->on_cpu) case in ttwu() and
+ * its ordering comment.
+ */
+ WRITE_ONCE(next->on_cpu, 1);
+}
+
+static inline void finish_task(struct task_struct *prev)
+{
+ /*
+ * This must be the very last reference to @prev from this CPU. After
+ * p->on_cpu is cleared, the task can be moved to a different CPU. We
+ * must ensure this doesn't happen until the switch is completely
+ * finished.
+ *
+ * In particular, the load of prev->state in finish_task_switch() must
+ * happen before this.
+ *
+ * Pairs with the smp_cond_load_acquire() in try_to_wake_up().
+ */
+ smp_store_release(&prev->on_cpu, 0);
+}
+
+static void do_balance_callbacks(struct rq *rq, struct balance_callback *head)
+{
+ void (*func)(struct rq *rq);
+ struct balance_callback *next;
+
+ lockdep_assert_rq_held(rq);
+
+ while (head) {
+ func = (void (*)(struct rq *))head->func;
+ next = head->next;
+ head->next = NULL;
+ head = next;
+
+ func(rq);
+ }
+}
+
+static void balance_push(struct rq *rq);
+
+/*
+ * balance_push_callback is a right abuse of the callback interface and plays
+ * by significantly different rules.
+ *
+ * Where the normal balance_callback's purpose is to be ran in the same context
+ * that queued it (only later, when it's safe to drop rq->lock again),
+ * balance_push_callback is specifically targeted at __schedule().
+ *
+ * This abuse is tolerated because it places all the unlikely/odd cases behind
+ * a single test, namely: rq->balance_callback == NULL.
+ */
+struct balance_callback balance_push_callback = {
+ .next = NULL,
+ .func = balance_push,
+};
+
+static inline struct balance_callback *
+__splice_balance_callbacks(struct rq *rq, bool split)
+{
+ struct balance_callback *head = rq->balance_callback;
+
+ if (likely(!head))
+ return NULL;
+
+ lockdep_assert_rq_held(rq);
+ /*
+ * Must not take balance_push_callback off the list when
+ * splice_balance_callbacks() and balance_callbacks() are not
+ * in the same rq->lock section.
+ *
+ * In that case it would be possible for __schedule() to interleave
+ * and observe the list empty.
+ */
+ if (split && head == &balance_push_callback)
+ head = NULL;
+ else
+ rq->balance_callback = NULL;
+
+ return head;
+}
+
+struct balance_callback *splice_balance_callbacks(struct rq *rq)
+{
+ return __splice_balance_callbacks(rq, true);
+}
+
+static void __balance_callbacks(struct rq *rq)
+{
+ do_balance_callbacks(rq, __splice_balance_callbacks(rq, false));
+}
+
+void balance_callbacks(struct rq *rq, struct balance_callback *head)
+{
+ unsigned long flags;
+
+ if (unlikely(head)) {
+ raw_spin_rq_lock_irqsave(rq, flags);
+ do_balance_callbacks(rq, head);
+ raw_spin_rq_unlock_irqrestore(rq, flags);
+ }
+}
+
+static inline void
+prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf)
+{
+ /*
+ * Since the runqueue lock will be released by the next
+ * task (which is an invalid locking op but in the case
+ * of the scheduler it's an obvious special-case), so we
+ * do an early lockdep release here:
+ */
+ rq_unpin_lock(rq, rf);
+ spin_release(&__rq_lockp(rq)->dep_map, _THIS_IP_);
+#ifdef CONFIG_DEBUG_SPINLOCK
+ /* this is a valid case when another task releases the spinlock */
+ rq_lockp(rq)->owner = next;
+#endif
+}
+
+static inline void finish_lock_switch(struct rq *rq)
+{
+ /*
+ * If we are tracking spinlock dependencies then we have to
+ * fix up the runqueue lock - which gets 'carried over' from
+ * prev into current:
+ */
+ spin_acquire(&__rq_lockp(rq)->dep_map, 0, 0, _THIS_IP_);
+ __balance_callbacks(rq);
+ raw_spin_rq_unlock_irq(rq);
+}
+
+/*
+ * NOP if the arch has not defined these:
+ */
+
+#ifndef prepare_arch_switch
+# define prepare_arch_switch(next) do { } while (0)
+#endif
+
+#ifndef finish_arch_post_lock_switch
+# define finish_arch_post_lock_switch() do { } while (0)
+#endif
+
+static inline void kmap_local_sched_out(void)
+{
+#ifdef CONFIG_KMAP_LOCAL
+ if (unlikely(current->kmap_ctrl.idx))
+ __kmap_local_sched_out();
+#endif
+}
+
+static inline void kmap_local_sched_in(void)
+{
+#ifdef CONFIG_KMAP_LOCAL
+ if (unlikely(current->kmap_ctrl.idx))
+ __kmap_local_sched_in();
+#endif
+}
/**
* prepare_task_switch - prepare to switch tasks
@@ -2176,11 +5040,12 @@ static inline void
prepare_task_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next)
{
- trace_sched_switch(prev, next);
+ kcov_prepare_switch(prev);
sched_info_switch(rq, prev, next);
perf_event_task_sched_out(prev, next);
fire_sched_out_preempt_notifiers(prev, next);
- prepare_lock_switch(rq, next);
+ kmap_local_sched_out();
+ prepare_task(next);
prepare_arch_switch(next);
}
@@ -2200,7 +5065,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
*
* The context switch have flipped the stack from under us and restored the
* local variables which were saved when this task called schedule() in the
- * past. prev == current is still correct but we need to recalculate this_rq
+ * past. 'prev == current' is still correct but we need to recalculate this_rq
* because prev may have moved to another CPU.
*/
static struct rq *finish_task_switch(struct task_struct *prev)
@@ -2208,7 +5073,23 @@ static struct rq *finish_task_switch(struct task_struct *prev)
{
struct rq *rq = this_rq();
struct mm_struct *mm = rq->prev_mm;
- long prev_state;
+ unsigned int prev_state;
+
+ /*
+ * The previous task will have left us with a preempt_count of 2
+ * because it left us after:
+ *
+ * schedule()
+ * preempt_disable(); // 1
+ * __schedule()
+ * raw_spin_lock_irq(&rq->lock) // 2
+ *
+ * Also, see FORK_PREEMPT_COUNT.
+ */
+ if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
+ "corrupted preempt_count: %s/%d/0x%x\n",
+ current->comm, current->pid, preempt_count()))
+ preempt_count_set(FORK_PREEMPT_COUNT);
rq->prev_mm = NULL;
@@ -2217,63 +5098,68 @@ static struct rq *finish_task_switch(struct task_struct *prev)
* If a task dies, then it sets TASK_DEAD in tsk->state and calls
* schedule one last time. The schedule call will never return, and
* the scheduled task must drop that reference.
- * The test for TASK_DEAD must occur while the runqueue locks are
- * still held, otherwise prev could be scheduled on another cpu, die
- * there before we look at prev->state, and then the reference would
- * be dropped twice.
- * Manfred Spraul <manfred@colorfullife.com>
+ *
+ * We must observe prev->state before clearing prev->on_cpu (in
+ * finish_task), otherwise a concurrent wakeup can get prev
+ * running on another CPU and we could rave with its RUNNING -> DEAD
+ * transition, resulting in a double drop.
*/
- prev_state = prev->state;
+ prev_state = READ_ONCE(prev->__state);
vtime_task_switch(prev);
- finish_arch_switch(prev);
perf_event_task_sched_in(prev, current);
- finish_lock_switch(rq, prev);
+ finish_task(prev);
+ tick_nohz_task_switch();
+ finish_lock_switch(rq);
finish_arch_post_lock_switch();
+ kcov_finish_switch(current);
+ /*
+ * kmap_local_sched_out() is invoked with rq::lock held and
+ * interrupts disabled. There is no requirement for that, but the
+ * sched out code does not have an interrupt enabled section.
+ * Restoring the maps on sched in does not require interrupts being
+ * disabled either.
+ */
+ kmap_local_sched_in();
fire_sched_in_preempt_notifiers(current);
- if (mm)
- mmdrop(mm);
+ /*
+ * When switching through a kernel thread, the loop in
+ * membarrier_{private,global}_expedited() may have observed that
+ * kernel thread and not issued an IPI. It is therefore possible to
+ * schedule between user->kernel->user threads without passing though
+ * switch_mm(). Membarrier requires a barrier after storing to
+ * rq->curr, before returning to userspace, so provide them here:
+ *
+ * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly
+ * provided by mmdrop_lazy_tlb(),
+ * - a sync_core for SYNC_CORE.
+ */
+ if (mm) {
+ membarrier_mm_sync_core_before_usermode(mm);
+ mmdrop_lazy_tlb_sched(mm);
+ }
+
if (unlikely(prev_state == TASK_DEAD)) {
if (prev->sched_class->task_dead)
prev->sched_class->task_dead(prev);
/*
- * Remove function-return probe instances associated with this
- * task and put them back on the free list.
+ * sched_ext_dead() must come before cgroup_task_dead() to
+ * prevent cgroups from being removed while its member tasks are
+ * visible to SCX schedulers.
*/
- kprobe_flush_task(prev);
- put_task_struct(prev);
- }
+ sched_ext_dead(prev);
+ cgroup_task_dead(prev);
- tick_nohz_task_switch(current);
- return rq;
-}
-
-#ifdef CONFIG_SMP
-
-/* rq->lock is NOT held, but preemption is disabled */
-static inline void post_schedule(struct rq *rq)
-{
- if (rq->post_schedule) {
- unsigned long flags;
-
- raw_spin_lock_irqsave(&rq->lock, flags);
- if (rq->curr->sched_class->post_schedule)
- rq->curr->sched_class->post_schedule(rq);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ /* Task is done with its stack. */
+ put_task_stack(prev);
- rq->post_schedule = 0;
+ put_task_struct_rcu_user(prev);
}
-}
-
-#else
-static inline void post_schedule(struct rq *rq)
-{
+ return rq;
}
-#endif
-
/**
* schedule_tail - first thing a freshly forked thread must call.
* @prev: the thread we just switched away from.
@@ -2281,31 +5167,39 @@ static inline void post_schedule(struct rq *rq)
asmlinkage __visible void schedule_tail(struct task_struct *prev)
__releases(rq->lock)
{
- struct rq *rq;
+ /*
+ * New tasks start with FORK_PREEMPT_COUNT, see there and
+ * finish_task_switch() for details.
+ *
+ * finish_task_switch() will drop rq->lock() and lower preempt_count
+ * and the preempt_enable() will end up enabling preemption (on
+ * PREEMPT_COUNT kernels).
+ */
- /* finish_task_switch() drops rq->lock and enables preemtion */
- preempt_disable();
- rq = finish_task_switch(prev);
- post_schedule(rq);
+ finish_task_switch(prev);
+ /*
+ * This is a special case: the newly created task has just
+ * switched the context for the first time. It is returning from
+ * schedule for the first time in this path.
+ */
+ trace_sched_exit_tp(true);
preempt_enable();
if (current->set_child_tid)
put_user(task_pid_vnr(current), current->set_child_tid);
+
+ calculate_sigpending();
}
/*
* context_switch - switch to the new MM and the new thread's register state.
*/
-static inline struct rq *
+static __always_inline struct rq *
context_switch(struct rq *rq, struct task_struct *prev,
- struct task_struct *next)
+ struct task_struct *next, struct rq_flags *rf)
{
- struct mm_struct *mm, *oldmm;
-
prepare_task_switch(rq, prev, next);
- mm = next->mm;
- oldmm = prev->active_mm;
/*
* For paravirt, this is coupled with an exit in switch_to to
* combine the page table reload and the switch backend into
@@ -2313,26 +5207,51 @@ context_switch(struct rq *rq, struct task_struct *prev,
*/
arch_start_context_switch(prev);
- if (!mm) {
- next->active_mm = oldmm;
- atomic_inc(&oldmm->mm_count);
- enter_lazy_tlb(oldmm, next);
- } else
- switch_mm(oldmm, mm, next);
+ /*
+ * kernel -> kernel lazy + transfer active
+ * user -> kernel lazy + mmgrab_lazy_tlb() active
+ *
+ * kernel -> user switch + mmdrop_lazy_tlb() active
+ * user -> user switch
+ */
+ if (!next->mm) { // to kernel
+ enter_lazy_tlb(prev->active_mm, next);
+
+ next->active_mm = prev->active_mm;
+ if (prev->mm) // from user
+ mmgrab_lazy_tlb(prev->active_mm);
+ else
+ prev->active_mm = NULL;
+ } else { // to user
+ membarrier_switch_mm(rq, prev->active_mm, next->mm);
+ /*
+ * sys_membarrier() requires an smp_mb() between setting
+ * rq->curr / membarrier_switch_mm() and returning to userspace.
+ *
+ * The below provides this either through switch_mm(), or in
+ * case 'prev->active_mm == next->mm' through
+ * finish_task_switch()'s mmdrop().
+ */
+ switch_mm_irqs_off(prev->active_mm, next->mm, next);
+ lru_gen_use_mm(next->mm);
- if (!prev->mm) {
- prev->active_mm = NULL;
- rq->prev_mm = oldmm;
+ if (!prev->mm) { // from kernel
+ /* will mmdrop_lazy_tlb() in finish_task_switch(). */
+ rq->prev_mm = prev->active_mm;
+ prev->active_mm = NULL;
+ }
}
+
+ mm_cid_switch_to(prev, next);
+
/*
- * Since the runqueue lock will be released by the next
- * task (which is an invalid locking op but in the case
- * of the scheduler it's an obvious special-case), so we
- * do an early lockdep release here:
+ * Tell rseq that the task was scheduled in. Must be after
+ * switch_mm_cid() to get the TIF flag set.
*/
- spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
+ rseq_sched_switch_event(next);
+
+ prepare_lock_switch(rq, next, rf);
- context_tracking_task_switch(prev, next);
/* Here we just switch the register state and the stack. */
switch_to(prev, next, prev);
barrier();
@@ -2346,9 +5265,9 @@ context_switch(struct rq *rq, struct task_struct *prev,
* externally visible scheduler statistics: current number of runnable
* threads, total number of context switches performed since bootup.
*/
-unsigned long nr_running(void)
+unsigned int nr_running(void)
{
- unsigned long i, sum = 0;
+ unsigned int i, sum = 0;
for_each_online_cpu(i)
sum += cpu_rq(i)->nr_running;
@@ -2357,17 +5276,29 @@ unsigned long nr_running(void)
}
/*
- * Check if only the current task is running on the cpu.
+ * Check if only the current task is running on the CPU.
+ *
+ * Caution: this function does not check that the caller has disabled
+ * preemption, thus the result might have a time-of-check-to-time-of-use
+ * race. The caller is responsible to use it correctly, for example:
+ *
+ * - from a non-preemptible section (of course)
+ *
+ * - from a thread that is bound to a single CPU
+ *
+ * - in a loop with very short iterations (e.g. a polling loop)
*/
bool single_task_running(void)
{
- if (cpu_rq(smp_processor_id())->nr_running == 1)
- return true;
- else
- return false;
+ return raw_rq()->nr_running == 1;
}
EXPORT_SYMBOL(single_task_running);
+unsigned long long nr_context_switches_cpu(int cpu)
+{
+ return cpu_rq(cpu)->nr_switches;
+}
+
unsigned long long nr_context_switches(void)
{
int i;
@@ -2379,30 +5310,57 @@ unsigned long long nr_context_switches(void)
return sum;
}
-unsigned long nr_iowait(void)
-{
- unsigned long i, sum = 0;
-
- for_each_possible_cpu(i)
- sum += atomic_read(&cpu_rq(i)->nr_iowait);
-
- return sum;
-}
+/*
+ * Consumers of these two interfaces, like for example the cpuidle menu
+ * governor, are using nonsensical data. Preferring shallow idle state selection
+ * for a CPU that has IO-wait which might not even end up running the task when
+ * it does become runnable.
+ */
-unsigned long nr_iowait_cpu(int cpu)
+unsigned int nr_iowait_cpu(int cpu)
{
- struct rq *this = cpu_rq(cpu);
- return atomic_read(&this->nr_iowait);
+ return atomic_read(&cpu_rq(cpu)->nr_iowait);
}
-void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
+/*
+ * IO-wait accounting, and how it's mostly bollocks (on SMP).
+ *
+ * The idea behind IO-wait account is to account the idle time that we could
+ * have spend running if it were not for IO. That is, if we were to improve the
+ * storage performance, we'd have a proportional reduction in IO-wait time.
+ *
+ * This all works nicely on UP, where, when a task blocks on IO, we account
+ * idle time as IO-wait, because if the storage were faster, it could've been
+ * running and we'd not be idle.
+ *
+ * This has been extended to SMP, by doing the same for each CPU. This however
+ * is broken.
+ *
+ * Imagine for instance the case where two tasks block on one CPU, only the one
+ * CPU will have IO-wait accounted, while the other has regular idle. Even
+ * though, if the storage were faster, both could've ran at the same time,
+ * utilising both CPUs.
+ *
+ * This means, that when looking globally, the current IO-wait accounting on
+ * SMP is a lower bound, by reason of under accounting.
+ *
+ * Worse, since the numbers are provided per CPU, they are sometimes
+ * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly
+ * associated with any one particular CPU, it can wake to another CPU than it
+ * blocked on. This means the per CPU IO-wait number is meaningless.
+ *
+ * Task CPU affinities can make all that even more 'interesting'.
+ */
+
+unsigned int nr_iowait(void)
{
- struct rq *this = this_rq();
- *nr_waiters = atomic_read(&this->nr_iowait);
- *load = this->cpu_load[0];
-}
+ unsigned int i, sum = 0;
-#ifdef CONFIG_SMP
+ for_each_possible_cpu(i)
+ sum += nr_iowait_cpu(i);
+
+ return sum;
+}
/*
* sched_exec - execve() is a valuable balancing opportunity, because at
@@ -2411,27 +5369,22 @@ void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
void sched_exec(void)
{
struct task_struct *p = current;
- unsigned long flags;
+ struct migration_arg arg;
int dest_cpu;
- raw_spin_lock_irqsave(&p->pi_lock, flags);
- dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
- if (dest_cpu == smp_processor_id())
- goto unlock;
+ scoped_guard (raw_spinlock_irqsave, &p->pi_lock) {
+ dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), WF_EXEC);
+ if (dest_cpu == smp_processor_id())
+ return;
- if (likely(cpu_active(dest_cpu))) {
- struct migration_arg arg = { p, dest_cpu };
+ if (unlikely(!cpu_active(dest_cpu)))
+ return;
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
- stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
- return;
+ arg = (struct migration_arg){ p, dest_cpu };
}
-unlock:
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
}
-#endif
-
DEFINE_PER_CPU(struct kernel_stat, kstat);
DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
@@ -2439,24 +5392,41 @@ EXPORT_PER_CPU_SYMBOL(kstat);
EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
/*
+ * The function fair_sched_class.update_curr accesses the struct curr
+ * and its field curr->exec_start; when called from task_sched_runtime(),
+ * we observe a high rate of cache misses in practice.
+ * Prefetching this data results in improved performance.
+ */
+static inline void prefetch_curr_exec_start(struct task_struct *p)
+{
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ struct sched_entity *curr = p->se.cfs_rq->curr;
+#else
+ struct sched_entity *curr = task_rq(p)->cfs.curr;
+#endif
+ prefetch(curr);
+ prefetch(&curr->exec_start);
+}
+
+/*
* Return accounted runtime for the task.
* In case the task is currently running, return the runtime plus current's
* pending runtime that have not been accounted yet.
*/
unsigned long long task_sched_runtime(struct task_struct *p)
{
- unsigned long flags;
+ struct rq_flags rf;
struct rq *rq;
u64 ns;
-#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
+#ifdef CONFIG_64BIT
/*
- * 64-bit doesn't need locks to atomically read a 64bit value.
+ * 64-bit doesn't need locks to atomically read a 64-bit value.
* So we have a optimization chance when the task's delta_exec is 0.
- * Reading ->on_cpu is racy, but this is ok.
+ * Reading ->on_cpu is racy, but this is OK.
*
- * If we race with it leaving cpu, we'll take a lock. So we're correct.
- * If we race with it entering cpu, unaccounted time is 0. This is
+ * If we race with it leaving CPU, we'll take a lock. So we're correct.
+ * If we race with it entering CPU, unaccounted time is 0. This is
* indistinguishable from the read occurring a few cycles earlier.
* If we see ->on_cpu without ->on_rq, the task is leaving, and has
* been accounted, so we're correct here as well.
@@ -2465,89 +5435,280 @@ unsigned long long task_sched_runtime(struct task_struct *p)
return p->se.sum_exec_runtime;
#endif
- rq = task_rq_lock(p, &flags);
+ rq = task_rq_lock(p, &rf);
/*
* Must be ->curr _and_ ->on_rq. If dequeued, we would
* project cycles that may never be accounted to this
* thread, breaking clock_gettime().
*/
- if (task_current(rq, p) && task_on_rq_queued(p)) {
+ if (task_current_donor(rq, p) && task_on_rq_queued(p)) {
+ prefetch_curr_exec_start(p);
update_rq_clock(rq);
p->sched_class->update_curr(rq);
}
ns = p->se.sum_exec_runtime;
- task_rq_unlock(rq, p, &flags);
+ task_rq_unlock(rq, p, &rf);
return ns;
}
+static u64 cpu_resched_latency(struct rq *rq)
+{
+ int latency_warn_ms = READ_ONCE(sysctl_resched_latency_warn_ms);
+ u64 resched_latency, now = rq_clock(rq);
+ static bool warned_once;
+
+ if (sysctl_resched_latency_warn_once && warned_once)
+ return 0;
+
+ if (!need_resched() || !latency_warn_ms)
+ return 0;
+
+ if (system_state == SYSTEM_BOOTING)
+ return 0;
+
+ if (!rq->last_seen_need_resched_ns) {
+ rq->last_seen_need_resched_ns = now;
+ rq->ticks_without_resched = 0;
+ return 0;
+ }
+
+ rq->ticks_without_resched++;
+ resched_latency = now - rq->last_seen_need_resched_ns;
+ if (resched_latency <= latency_warn_ms * NSEC_PER_MSEC)
+ return 0;
+
+ warned_once = true;
+
+ return resched_latency;
+}
+
+static int __init setup_resched_latency_warn_ms(char *str)
+{
+ long val;
+
+ if ((kstrtol(str, 0, &val))) {
+ pr_warn("Unable to set resched_latency_warn_ms\n");
+ return 1;
+ }
+
+ sysctl_resched_latency_warn_ms = val;
+ return 1;
+}
+__setup("resched_latency_warn_ms=", setup_resched_latency_warn_ms);
+
/*
* This function gets called by the timer code, with HZ frequency.
* We call it with interrupts disabled.
*/
-void scheduler_tick(void)
+void sched_tick(void)
{
int cpu = smp_processor_id();
struct rq *rq = cpu_rq(cpu);
- struct task_struct *curr = rq->curr;
+ /* accounting goes to the donor task */
+ struct task_struct *donor;
+ struct rq_flags rf;
+ unsigned long hw_pressure;
+ u64 resched_latency;
+
+ if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE))
+ arch_scale_freq_tick();
sched_clock_tick();
- raw_spin_lock(&rq->lock);
+ rq_lock(rq, &rf);
+ donor = rq->donor;
+
+ psi_account_irqtime(rq, donor, NULL);
+
update_rq_clock(rq);
- curr->sched_class->task_tick(rq, curr, 0);
- update_cpu_load_active(rq);
- raw_spin_unlock(&rq->lock);
+ hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
+ update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure);
+
+ if (dynamic_preempt_lazy() && tif_test_bit(TIF_NEED_RESCHED_LAZY))
+ resched_curr(rq);
+
+ donor->sched_class->task_tick(rq, donor, 0);
+ if (sched_feat(LATENCY_WARN))
+ resched_latency = cpu_resched_latency(rq);
+ calc_global_load_tick(rq);
+ sched_core_tick(rq);
+ scx_tick(rq);
+
+ rq_unlock(rq, &rf);
+
+ if (sched_feat(LATENCY_WARN) && resched_latency)
+ resched_latency_warn(cpu, resched_latency);
perf_event_task_tick();
-#ifdef CONFIG_SMP
- rq->idle_balance = idle_cpu(cpu);
- trigger_load_balance(rq);
-#endif
- rq_last_tick_reset(rq);
+ if (donor->flags & PF_WQ_WORKER)
+ wq_worker_tick(donor);
+
+ if (!scx_switched_all()) {
+ rq->idle_balance = idle_cpu(cpu);
+ sched_balance_trigger(rq);
+ }
}
#ifdef CONFIG_NO_HZ_FULL
-/**
- * scheduler_tick_max_deferment
+
+struct tick_work {
+ int cpu;
+ atomic_t state;
+ struct delayed_work work;
+};
+/* Values for ->state, see diagram below. */
+#define TICK_SCHED_REMOTE_OFFLINE 0
+#define TICK_SCHED_REMOTE_OFFLINING 1
+#define TICK_SCHED_REMOTE_RUNNING 2
+
+/*
+ * State diagram for ->state:
*
- * Keep at least one tick per second when a single
- * active task is running because the scheduler doesn't
- * yet completely support full dynticks environment.
*
- * This makes sure that uptime, CFS vruntime, load
- * balancing, etc... continue to move forward, even
- * with a very low granularity.
+ * TICK_SCHED_REMOTE_OFFLINE
+ * | ^
+ * | |
+ * | | sched_tick_remote()
+ * | |
+ * | |
+ * +--TICK_SCHED_REMOTE_OFFLINING
+ * | ^
+ * | |
+ * sched_tick_start() | | sched_tick_stop()
+ * | |
+ * V |
+ * TICK_SCHED_REMOTE_RUNNING
*
- * Return: Maximum deferment in nanoseconds.
+ *
+ * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote()
+ * and sched_tick_start() are happy to leave the state in RUNNING.
*/
-u64 scheduler_tick_max_deferment(void)
+
+static struct tick_work __percpu *tick_work_cpu;
+
+static void sched_tick_remote(struct work_struct *work)
{
- struct rq *rq = this_rq();
- unsigned long next, now = ACCESS_ONCE(jiffies);
+ struct delayed_work *dwork = to_delayed_work(work);
+ struct tick_work *twork = container_of(dwork, struct tick_work, work);
+ int cpu = twork->cpu;
+ struct rq *rq = cpu_rq(cpu);
+ int os;
- next = rq->last_sched_tick + HZ;
+ /*
+ * Handle the tick only if it appears the remote CPU is running in full
+ * dynticks mode. The check is racy by nature, but missing a tick or
+ * having one too much is no big deal because the scheduler tick updates
+ * statistics and checks timeslices in a time-independent way, regardless
+ * of when exactly it is running.
+ */
+ if (tick_nohz_tick_stopped_cpu(cpu)) {
+ guard(rq_lock_irq)(rq);
+ struct task_struct *curr = rq->curr;
- if (time_before_eq(next, now))
- return 0;
+ if (cpu_online(cpu)) {
+ /*
+ * Since this is a remote tick for full dynticks mode,
+ * we are always sure that there is no proxy (only a
+ * single task is running).
+ */
+ WARN_ON_ONCE(rq->curr != rq->donor);
+ update_rq_clock(rq);
- return jiffies_to_nsecs(next - now);
+ if (!is_idle_task(curr)) {
+ /*
+ * Make sure the next tick runs within a
+ * reasonable amount of time.
+ */
+ u64 delta = rq_clock_task(rq) - curr->se.exec_start;
+ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 30);
+ }
+ curr->sched_class->task_tick(rq, curr, 0);
+
+ calc_load_nohz_remote(rq);
+ }
+ }
+
+ /*
+ * Run the remote tick once per second (1Hz). This arbitrary
+ * frequency is large enough to avoid overload but short enough
+ * to keep scheduler internal stats reasonably up to date. But
+ * first update state to reflect hotplug activity if required.
+ */
+ os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING);
+ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE);
+ if (os == TICK_SCHED_REMOTE_RUNNING)
+ queue_delayed_work(system_unbound_wq, dwork, HZ);
}
-#endif
-notrace unsigned long get_parent_ip(unsigned long addr)
+static void sched_tick_start(int cpu)
{
- if (in_lock_functions(addr)) {
- addr = CALLER_ADDR2;
- if (in_lock_functions(addr))
- addr = CALLER_ADDR3;
+ int os;
+ struct tick_work *twork;
+
+ if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE))
+ return;
+
+ WARN_ON_ONCE(!tick_work_cpu);
+
+ twork = per_cpu_ptr(tick_work_cpu, cpu);
+ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING);
+ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING);
+ if (os == TICK_SCHED_REMOTE_OFFLINE) {
+ twork->cpu = cpu;
+ INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
+ queue_delayed_work(system_unbound_wq, &twork->work, HZ);
}
- return addr;
}
-#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
- defined(CONFIG_PREEMPT_TRACER))
+#ifdef CONFIG_HOTPLUG_CPU
+static void sched_tick_stop(int cpu)
+{
+ struct tick_work *twork;
+ int os;
+
+ if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE))
+ return;
+
+ WARN_ON_ONCE(!tick_work_cpu);
+
+ twork = per_cpu_ptr(tick_work_cpu, cpu);
+ /* There cannot be competing actions, but don't rely on stop-machine. */
+ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING);
+ WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING);
+ /* Don't cancel, as this would mess up the state machine. */
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+int __init sched_tick_offload_init(void)
+{
+ tick_work_cpu = alloc_percpu(struct tick_work);
+ BUG_ON(!tick_work_cpu);
+ return 0;
+}
+
+#else /* !CONFIG_NO_HZ_FULL: */
+static inline void sched_tick_start(int cpu) { }
+static inline void sched_tick_stop(int cpu) { }
+#endif /* !CONFIG_NO_HZ_FULL */
+
+#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \
+ defined(CONFIG_TRACE_PREEMPT_TOGGLE))
+/*
+ * If the value passed in is equal to the current preempt count
+ * then we just disabled preemption. Start timing the latency.
+ */
+static inline void preempt_latency_start(int val)
+{
+ if (preempt_count() == val) {
+ unsigned long ip = get_lock_parent_ip();
+#ifdef CONFIG_DEBUG_PREEMPT
+ current->preempt_disable_ip = ip;
+#endif
+ trace_preempt_off(CALLER_ADDR0, ip);
+ }
+}
void preempt_count_add(int val)
{
@@ -2566,17 +5727,21 @@ void preempt_count_add(int val)
DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
PREEMPT_MASK - 10);
#endif
- if (preempt_count() == val) {
- unsigned long ip = get_parent_ip(CALLER_ADDR1);
-#ifdef CONFIG_DEBUG_PREEMPT
- current->preempt_disable_ip = ip;
-#endif
- trace_preempt_off(CALLER_ADDR0, ip);
- }
+ preempt_latency_start(val);
}
EXPORT_SYMBOL(preempt_count_add);
NOKPROBE_SYMBOL(preempt_count_add);
+/*
+ * If the value passed in equals to the current preempt count
+ * then we just enabled preemption. Stop timing the latency.
+ */
+static inline void preempt_latency_stop(int val)
+{
+ if (preempt_count() == val)
+ trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
+}
+
void preempt_count_sub(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
@@ -2593,20 +5758,34 @@ void preempt_count_sub(int val)
return;
#endif
- if (preempt_count() == val)
- trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
+ preempt_latency_stop(val);
__preempt_count_sub(val);
}
EXPORT_SYMBOL(preempt_count_sub);
NOKPROBE_SYMBOL(preempt_count_sub);
+#else
+static inline void preempt_latency_start(int val) { }
+static inline void preempt_latency_stop(int val) { }
#endif
+static inline unsigned long get_preempt_disable_ip(struct task_struct *p)
+{
+#ifdef CONFIG_DEBUG_PREEMPT
+ return p->preempt_disable_ip;
+#else
+ return 0;
+#endif
+}
+
/*
* Print scheduling while atomic bug:
*/
static noinline void __schedule_bug(struct task_struct *prev)
{
+ /* Save this before calling printk(), since that will clobber it */
+ unsigned long preempt_disable_ip = get_preempt_disable_ip(current);
+
if (oops_in_progress)
return;
@@ -2617,13 +5796,12 @@ static noinline void __schedule_bug(struct task_struct *prev)
print_modules();
if (irqs_disabled())
print_irqtrace_events(prev);
-#ifdef CONFIG_DEBUG_PREEMPT
- if (in_atomic_preempt_off()) {
+ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) {
pr_err("Preemption disabled at:");
- print_ip_sym(current->preempt_disable_ip);
- pr_cont("\n");
+ print_ip_sym(KERN_ERR, preempt_disable_ip);
}
-#endif
+ check_panic_on_warn("scheduling while atomic");
+
dump_stack();
add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}
@@ -2631,62 +5809,871 @@ static noinline void __schedule_bug(struct task_struct *prev)
/*
* Various schedule()-time debugging checks and statistics:
*/
-static inline void schedule_debug(struct task_struct *prev)
+static inline void schedule_debug(struct task_struct *prev, bool preempt)
{
#ifdef CONFIG_SCHED_STACK_END_CHECK
- BUG_ON(unlikely(task_stack_end_corrupted(prev)));
+ if (task_stack_end_corrupted(prev))
+ panic("corrupted stack end detected inside scheduler\n");
+
+ if (task_scs_end_corrupted(prev))
+ panic("corrupted shadow stack detected inside scheduler\n");
#endif
- /*
- * Test if we are atomic. Since do_exit() needs to call into
- * schedule() atomically, we ignore that path. Otherwise whine
- * if we are scheduling when we should not.
- */
- if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD))
+
+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
+ if (!preempt && READ_ONCE(prev->__state) && prev->non_block_count) {
+ printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n",
+ prev->comm, prev->pid, prev->non_block_count);
+ dump_stack();
+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
+ }
+#endif
+
+ if (unlikely(in_atomic_preempt_off())) {
__schedule_bug(prev);
+ preempt_count_set(PREEMPT_DISABLED);
+ }
rcu_sleep_check();
+ WARN_ON_ONCE(ct_state() == CT_STATE_USER);
profile_hit(SCHED_PROFILING, __builtin_return_address(0));
- schedstat_inc(this_rq(), sched_count);
+ schedstat_inc(this_rq()->sched_count);
+}
+
+static void prev_balance(struct rq *rq, struct task_struct *prev,
+ struct rq_flags *rf)
+{
+ const struct sched_class *start_class = prev->sched_class;
+ const struct sched_class *class;
+
+ /*
+ * We must do the balancing pass before put_prev_task(), such
+ * that when we release the rq->lock the task is in the same
+ * state as before we took rq->lock.
+ *
+ * We can terminate the balance pass as soon as we know there is
+ * a runnable task of @class priority or higher.
+ */
+ for_active_class_range(class, start_class, &idle_sched_class) {
+ if (class->balance && class->balance(rq, prev, rf))
+ break;
+ }
}
/*
* Pick up the highest-prio task:
*/
static inline struct task_struct *
-pick_next_task(struct rq *rq, struct task_struct *prev)
+__pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
- const struct sched_class *class = &fair_sched_class;
+ const struct sched_class *class;
struct task_struct *p;
+ rq->dl_server = NULL;
+
+ if (scx_enabled())
+ goto restart;
+
/*
- * Optimization: we know that if all tasks are in
- * the fair class we can call that function directly:
+ * Optimization: we know that if all tasks are in the fair class we can
+ * call that function directly, but only if the @prev task wasn't of a
+ * higher scheduling class, because otherwise those lose the
+ * opportunity to pull in more work from other CPUs.
*/
- if (likely(prev->sched_class == class &&
- rq->nr_running == rq->cfs.h_nr_running)) {
- p = fair_sched_class.pick_next_task(rq, prev);
+ if (likely(!sched_class_above(prev->sched_class, &fair_sched_class) &&
+ rq->nr_running == rq->cfs.h_nr_queued)) {
+
+ p = pick_next_task_fair(rq, prev, rf);
if (unlikely(p == RETRY_TASK))
- goto again;
+ goto restart;
- /* assumes fair_sched_class->next == idle_sched_class */
- if (unlikely(!p))
- p = idle_sched_class.pick_next_task(rq, prev);
+ /* Assume the next prioritized class is idle_sched_class */
+ if (!p) {
+ p = pick_task_idle(rq, rf);
+ put_prev_set_next_task(rq, prev, p);
+ }
return p;
}
-again:
- for_each_class(class) {
- p = class->pick_next_task(rq, prev);
- if (p) {
+restart:
+ prev_balance(rq, prev, rf);
+
+ for_each_active_class(class) {
+ if (class->pick_next_task) {
+ p = class->pick_next_task(rq, prev, rf);
if (unlikely(p == RETRY_TASK))
- goto again;
+ goto restart;
+ if (p)
+ return p;
+ } else {
+ p = class->pick_task(rq, rf);
+ if (unlikely(p == RETRY_TASK))
+ goto restart;
+ if (p) {
+ put_prev_set_next_task(rq, prev, p);
+ return p;
+ }
+ }
+ }
+
+ BUG(); /* The idle class should always have a runnable task. */
+}
+
+#ifdef CONFIG_SCHED_CORE
+static inline bool is_task_rq_idle(struct task_struct *t)
+{
+ return (task_rq(t)->idle == t);
+}
+
+static inline bool cookie_equals(struct task_struct *a, unsigned long cookie)
+{
+ return is_task_rq_idle(a) || (a->core_cookie == cookie);
+}
+
+static inline bool cookie_match(struct task_struct *a, struct task_struct *b)
+{
+ if (is_task_rq_idle(a) || is_task_rq_idle(b))
+ return true;
+
+ return a->core_cookie == b->core_cookie;
+}
+
+/*
+ * Careful; this can return RETRY_TASK, it does not include the retry-loop
+ * itself due to the whole SMT pick retry thing below.
+ */
+static inline struct task_struct *pick_task(struct rq *rq, struct rq_flags *rf)
+{
+ const struct sched_class *class;
+ struct task_struct *p;
+
+ rq->dl_server = NULL;
+
+ for_each_active_class(class) {
+ p = class->pick_task(rq, rf);
+ if (p)
return p;
+ }
+
+ BUG(); /* The idle class should always have a runnable task. */
+}
+
+extern void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi);
+
+static void queue_core_balance(struct rq *rq);
+
+static struct task_struct *
+pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+{
+ struct task_struct *next, *p, *max;
+ const struct cpumask *smt_mask;
+ bool fi_before = false;
+ bool core_clock_updated = (rq == rq->core);
+ unsigned long cookie;
+ int i, cpu, occ = 0;
+ struct rq *rq_i;
+ bool need_sync;
+
+ if (!sched_core_enabled(rq))
+ return __pick_next_task(rq, prev, rf);
+
+ cpu = cpu_of(rq);
+
+ /* Stopper task is switching into idle, no need core-wide selection. */
+ if (cpu_is_offline(cpu)) {
+ /*
+ * Reset core_pick so that we don't enter the fastpath when
+ * coming online. core_pick would already be migrated to
+ * another cpu during offline.
+ */
+ rq->core_pick = NULL;
+ rq->core_dl_server = NULL;
+ return __pick_next_task(rq, prev, rf);
+ }
+
+ /*
+ * If there were no {en,de}queues since we picked (IOW, the task
+ * pointers are all still valid), and we haven't scheduled the last
+ * pick yet, do so now.
+ *
+ * rq->core_pick can be NULL if no selection was made for a CPU because
+ * it was either offline or went offline during a sibling's core-wide
+ * selection. In this case, do a core-wide selection.
+ */
+ if (rq->core->core_pick_seq == rq->core->core_task_seq &&
+ rq->core->core_pick_seq != rq->core_sched_seq &&
+ rq->core_pick) {
+ WRITE_ONCE(rq->core_sched_seq, rq->core->core_pick_seq);
+
+ next = rq->core_pick;
+ rq->dl_server = rq->core_dl_server;
+ rq->core_pick = NULL;
+ rq->core_dl_server = NULL;
+ goto out_set_next;
+ }
+
+ prev_balance(rq, prev, rf);
+
+ smt_mask = cpu_smt_mask(cpu);
+ need_sync = !!rq->core->core_cookie;
+
+ /* reset state */
+ rq->core->core_cookie = 0UL;
+ if (rq->core->core_forceidle_count) {
+ if (!core_clock_updated) {
+ update_rq_clock(rq->core);
+ core_clock_updated = true;
+ }
+ sched_core_account_forceidle(rq);
+ /* reset after accounting force idle */
+ rq->core->core_forceidle_start = 0;
+ rq->core->core_forceidle_count = 0;
+ rq->core->core_forceidle_occupation = 0;
+ need_sync = true;
+ fi_before = true;
+ }
+
+ /*
+ * core->core_task_seq, core->core_pick_seq, rq->core_sched_seq
+ *
+ * @task_seq guards the task state ({en,de}queues)
+ * @pick_seq is the @task_seq we did a selection on
+ * @sched_seq is the @pick_seq we scheduled
+ *
+ * However, preemptions can cause multiple picks on the same task set.
+ * 'Fix' this by also increasing @task_seq for every pick.
+ */
+ rq->core->core_task_seq++;
+
+ /*
+ * Optimize for common case where this CPU has no cookies
+ * and there are no cookied tasks running on siblings.
+ */
+ if (!need_sync) {
+restart_single:
+ next = pick_task(rq, rf);
+ if (unlikely(next == RETRY_TASK))
+ goto restart_single;
+ if (!next->core_cookie) {
+ rq->core_pick = NULL;
+ rq->core_dl_server = NULL;
+ /*
+ * For robustness, update the min_vruntime_fi for
+ * unconstrained picks as well.
+ */
+ WARN_ON_ONCE(fi_before);
+ task_vruntime_update(rq, next, false);
+ goto out_set_next;
+ }
+ }
+
+ /*
+ * For each thread: do the regular task pick and find the max prio task
+ * amongst them.
+ *
+ * Tie-break prio towards the current CPU
+ */
+restart_multi:
+ max = NULL;
+ for_each_cpu_wrap(i, smt_mask, cpu) {
+ rq_i = cpu_rq(i);
+
+ /*
+ * Current cpu always has its clock updated on entrance to
+ * pick_next_task(). If the current cpu is not the core,
+ * the core may also have been updated above.
+ */
+ if (i != cpu && (rq_i != rq->core || !core_clock_updated))
+ update_rq_clock(rq_i);
+
+ p = pick_task(rq_i, rf);
+ if (unlikely(p == RETRY_TASK))
+ goto restart_multi;
+
+ rq_i->core_pick = p;
+ rq_i->core_dl_server = rq_i->dl_server;
+
+ if (!max || prio_less(max, p, fi_before))
+ max = p;
+ }
+
+ cookie = rq->core->core_cookie = max->core_cookie;
+
+ /*
+ * For each thread: try and find a runnable task that matches @max or
+ * force idle.
+ */
+ for_each_cpu(i, smt_mask) {
+ rq_i = cpu_rq(i);
+ p = rq_i->core_pick;
+
+ if (!cookie_equals(p, cookie)) {
+ p = NULL;
+ if (cookie)
+ p = sched_core_find(rq_i, cookie);
+ if (!p)
+ p = idle_sched_class.pick_task(rq_i, rf);
+ }
+
+ rq_i->core_pick = p;
+ rq_i->core_dl_server = NULL;
+
+ if (p == rq_i->idle) {
+ if (rq_i->nr_running) {
+ rq->core->core_forceidle_count++;
+ if (!fi_before)
+ rq->core->core_forceidle_seq++;
+ }
+ } else {
+ occ++;
+ }
+ }
+
+ if (schedstat_enabled() && rq->core->core_forceidle_count) {
+ rq->core->core_forceidle_start = rq_clock(rq->core);
+ rq->core->core_forceidle_occupation = occ;
+ }
+
+ rq->core->core_pick_seq = rq->core->core_task_seq;
+ next = rq->core_pick;
+ rq->core_sched_seq = rq->core->core_pick_seq;
+
+ /* Something should have been selected for current CPU */
+ WARN_ON_ONCE(!next);
+
+ /*
+ * Reschedule siblings
+ *
+ * NOTE: L1TF -- at this point we're no longer running the old task and
+ * sending an IPI (below) ensures the sibling will no longer be running
+ * their task. This ensures there is no inter-sibling overlap between
+ * non-matching user state.
+ */
+ for_each_cpu(i, smt_mask) {
+ rq_i = cpu_rq(i);
+
+ /*
+ * An online sibling might have gone offline before a task
+ * could be picked for it, or it might be offline but later
+ * happen to come online, but its too late and nothing was
+ * picked for it. That's Ok - it will pick tasks for itself,
+ * so ignore it.
+ */
+ if (!rq_i->core_pick)
+ continue;
+
+ /*
+ * Update for new !FI->FI transitions, or if continuing to be in !FI:
+ * fi_before fi update?
+ * 0 0 1
+ * 0 1 1
+ * 1 0 1
+ * 1 1 0
+ */
+ if (!(fi_before && rq->core->core_forceidle_count))
+ task_vruntime_update(rq_i, rq_i->core_pick, !!rq->core->core_forceidle_count);
+
+ rq_i->core_pick->core_occupation = occ;
+
+ if (i == cpu) {
+ rq_i->core_pick = NULL;
+ rq_i->core_dl_server = NULL;
+ continue;
+ }
+
+ /* Did we break L1TF mitigation requirements? */
+ WARN_ON_ONCE(!cookie_match(next, rq_i->core_pick));
+
+ if (rq_i->curr == rq_i->core_pick) {
+ rq_i->core_pick = NULL;
+ rq_i->core_dl_server = NULL;
+ continue;
+ }
+
+ resched_curr(rq_i);
+ }
+
+out_set_next:
+ put_prev_set_next_task(rq, prev, next);
+ if (rq->core->core_forceidle_count && next == rq->idle)
+ queue_core_balance(rq);
+
+ return next;
+}
+
+static bool try_steal_cookie(int this, int that)
+{
+ struct rq *dst = cpu_rq(this), *src = cpu_rq(that);
+ struct task_struct *p;
+ unsigned long cookie;
+ bool success = false;
+
+ guard(irq)();
+ guard(double_rq_lock)(dst, src);
+
+ cookie = dst->core->core_cookie;
+ if (!cookie)
+ return false;
+
+ if (dst->curr != dst->idle)
+ return false;
+
+ p = sched_core_find(src, cookie);
+ if (!p)
+ return false;
+
+ do {
+ if (p == src->core_pick || p == src->curr)
+ goto next;
+
+ if (!is_cpu_allowed(p, this))
+ goto next;
+
+ if (p->core_occupation > dst->idle->core_occupation)
+ goto next;
+ /*
+ * sched_core_find() and sched_core_next() will ensure
+ * that task @p is not throttled now, we also need to
+ * check whether the runqueue of the destination CPU is
+ * being throttled.
+ */
+ if (sched_task_is_throttled(p, this))
+ goto next;
+
+ move_queued_task_locked(src, dst, p);
+ resched_curr(dst);
+
+ success = true;
+ break;
+
+next:
+ p = sched_core_next(p, cookie);
+ } while (p);
+
+ return success;
+}
+
+static bool steal_cookie_task(int cpu, struct sched_domain *sd)
+{
+ int i;
+
+ for_each_cpu_wrap(i, sched_domain_span(sd), cpu + 1) {
+ if (i == cpu)
+ continue;
+
+ if (need_resched())
+ break;
+
+ if (try_steal_cookie(cpu, i))
+ return true;
+ }
+
+ return false;
+}
+
+static void sched_core_balance(struct rq *rq)
+{
+ struct sched_domain *sd;
+ int cpu = cpu_of(rq);
+
+ guard(preempt)();
+ guard(rcu)();
+
+ raw_spin_rq_unlock_irq(rq);
+ for_each_domain(cpu, sd) {
+ if (need_resched())
+ break;
+
+ if (steal_cookie_task(cpu, sd))
+ break;
+ }
+ raw_spin_rq_lock_irq(rq);
+}
+
+static DEFINE_PER_CPU(struct balance_callback, core_balance_head);
+
+static void queue_core_balance(struct rq *rq)
+{
+ if (!sched_core_enabled(rq))
+ return;
+
+ if (!rq->core->core_cookie)
+ return;
+
+ if (!rq->nr_running) /* not forced idle */
+ return;
+
+ queue_balance_callback(rq, &per_cpu(core_balance_head, rq->cpu), sched_core_balance);
+}
+
+DEFINE_LOCK_GUARD_1(core_lock, int,
+ sched_core_lock(*_T->lock, &_T->flags),
+ sched_core_unlock(*_T->lock, &_T->flags),
+ unsigned long flags)
+
+static void sched_core_cpu_starting(unsigned int cpu)
+{
+ const struct cpumask *smt_mask = cpu_smt_mask(cpu);
+ struct rq *rq = cpu_rq(cpu), *core_rq = NULL;
+ int t;
+
+ guard(core_lock)(&cpu);
+
+ WARN_ON_ONCE(rq->core != rq);
+
+ /* if we're the first, we'll be our own leader */
+ if (cpumask_weight(smt_mask) == 1)
+ return;
+
+ /* find the leader */
+ for_each_cpu(t, smt_mask) {
+ if (t == cpu)
+ continue;
+ rq = cpu_rq(t);
+ if (rq->core == rq) {
+ core_rq = rq;
+ break;
}
}
- BUG(); /* the idle class will always have a runnable task */
+ if (WARN_ON_ONCE(!core_rq)) /* whoopsie */
+ return;
+
+ /* install and validate core_rq */
+ for_each_cpu(t, smt_mask) {
+ rq = cpu_rq(t);
+
+ if (t == cpu)
+ rq->core = core_rq;
+
+ WARN_ON_ONCE(rq->core != core_rq);
+ }
+}
+
+static void sched_core_cpu_deactivate(unsigned int cpu)
+{
+ const struct cpumask *smt_mask = cpu_smt_mask(cpu);
+ struct rq *rq = cpu_rq(cpu), *core_rq = NULL;
+ int t;
+
+ guard(core_lock)(&cpu);
+
+ /* if we're the last man standing, nothing to do */
+ if (cpumask_weight(smt_mask) == 1) {
+ WARN_ON_ONCE(rq->core != rq);
+ return;
+ }
+
+ /* if we're not the leader, nothing to do */
+ if (rq->core != rq)
+ return;
+
+ /* find a new leader */
+ for_each_cpu(t, smt_mask) {
+ if (t == cpu)
+ continue;
+ core_rq = cpu_rq(t);
+ break;
+ }
+
+ if (WARN_ON_ONCE(!core_rq)) /* impossible */
+ return;
+
+ /* copy the shared state to the new leader */
+ core_rq->core_task_seq = rq->core_task_seq;
+ core_rq->core_pick_seq = rq->core_pick_seq;
+ core_rq->core_cookie = rq->core_cookie;
+ core_rq->core_forceidle_count = rq->core_forceidle_count;
+ core_rq->core_forceidle_seq = rq->core_forceidle_seq;
+ core_rq->core_forceidle_occupation = rq->core_forceidle_occupation;
+
+ /*
+ * Accounting edge for forced idle is handled in pick_next_task().
+ * Don't need another one here, since the hotplug thread shouldn't
+ * have a cookie.
+ */
+ core_rq->core_forceidle_start = 0;
+
+ /* install new leader */
+ for_each_cpu(t, smt_mask) {
+ rq = cpu_rq(t);
+ rq->core = core_rq;
+ }
+}
+
+static inline void sched_core_cpu_dying(unsigned int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ if (rq->core != rq)
+ rq->core = rq;
+}
+
+#else /* !CONFIG_SCHED_CORE: */
+
+static inline void sched_core_cpu_starting(unsigned int cpu) {}
+static inline void sched_core_cpu_deactivate(unsigned int cpu) {}
+static inline void sched_core_cpu_dying(unsigned int cpu) {}
+
+static struct task_struct *
+pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+{
+ return __pick_next_task(rq, prev, rf);
+}
+
+#endif /* !CONFIG_SCHED_CORE */
+
+/*
+ * Constants for the sched_mode argument of __schedule().
+ *
+ * The mode argument allows RT enabled kernels to differentiate a
+ * preemption from blocking on an 'sleeping' spin/rwlock.
+ */
+#define SM_IDLE (-1)
+#define SM_NONE 0
+#define SM_PREEMPT 1
+#define SM_RTLOCK_WAIT 2
+
+/*
+ * Helper function for __schedule()
+ *
+ * Tries to deactivate the task, unless the should_block arg
+ * is false or if a signal is pending. In the case a signal
+ * is pending, marks the task's __state as RUNNING (and clear
+ * blocked_on).
+ */
+static bool try_to_block_task(struct rq *rq, struct task_struct *p,
+ unsigned long *task_state_p, bool should_block)
+{
+ unsigned long task_state = *task_state_p;
+ int flags = DEQUEUE_NOCLOCK;
+
+ if (signal_pending_state(task_state, p)) {
+ WRITE_ONCE(p->__state, TASK_RUNNING);
+ *task_state_p = TASK_RUNNING;
+ return false;
+ }
+
+ /*
+ * We check should_block after signal_pending because we
+ * will want to wake the task in that case. But if
+ * should_block is false, its likely due to the task being
+ * blocked on a mutex, and we want to keep it on the runqueue
+ * to be selectable for proxy-execution.
+ */
+ if (!should_block)
+ return false;
+
+ p->sched_contributes_to_load =
+ (task_state & TASK_UNINTERRUPTIBLE) &&
+ !(task_state & TASK_NOLOAD) &&
+ !(task_state & TASK_FROZEN);
+
+ if (unlikely(is_special_task_state(task_state)))
+ flags |= DEQUEUE_SPECIAL;
+
+ /*
+ * __schedule() ttwu()
+ * prev_state = prev->state; if (p->on_rq && ...)
+ * if (prev_state) goto out;
+ * p->on_rq = 0; smp_acquire__after_ctrl_dep();
+ * p->state = TASK_WAKING
+ *
+ * Where __schedule() and ttwu() have matching control dependencies.
+ *
+ * After this, schedule() must not care about p->state any more.
+ */
+ block_task(rq, p, flags);
+ return true;
+}
+
+#ifdef CONFIG_SCHED_PROXY_EXEC
+static inline struct task_struct *proxy_resched_idle(struct rq *rq)
+{
+ put_prev_set_next_task(rq, rq->donor, rq->idle);
+ rq_set_donor(rq, rq->idle);
+ set_tsk_need_resched(rq->idle);
+ return rq->idle;
+}
+
+static bool __proxy_deactivate(struct rq *rq, struct task_struct *donor)
+{
+ unsigned long state = READ_ONCE(donor->__state);
+
+ /* Don't deactivate if the state has been changed to TASK_RUNNING */
+ if (state == TASK_RUNNING)
+ return false;
+ /*
+ * Because we got donor from pick_next_task(), it is *crucial*
+ * that we call proxy_resched_idle() before we deactivate it.
+ * As once we deactivate donor, donor->on_rq is set to zero,
+ * which allows ttwu() to immediately try to wake the task on
+ * another rq. So we cannot use *any* references to donor
+ * after that point. So things like cfs_rq->curr or rq->donor
+ * need to be changed from next *before* we deactivate.
+ */
+ proxy_resched_idle(rq);
+ return try_to_block_task(rq, donor, &state, true);
+}
+
+static struct task_struct *proxy_deactivate(struct rq *rq, struct task_struct *donor)
+{
+ if (!__proxy_deactivate(rq, donor)) {
+ /*
+ * XXX: For now, if deactivation failed, set donor
+ * as unblocked, as we aren't doing proxy-migrations
+ * yet (more logic will be needed then).
+ */
+ donor->blocked_on = NULL;
+ }
+ return NULL;
+}
+
+/*
+ * Find runnable lock owner to proxy for mutex blocked donor
+ *
+ * Follow the blocked-on relation:
+ * task->blocked_on -> mutex->owner -> task...
+ *
+ * Lock order:
+ *
+ * p->pi_lock
+ * rq->lock
+ * mutex->wait_lock
+ *
+ * Returns the task that is going to be used as execution context (the one
+ * that is actually going to be run on cpu_of(rq)).
+ */
+static struct task_struct *
+find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
+{
+ struct task_struct *owner = NULL;
+ int this_cpu = cpu_of(rq);
+ struct task_struct *p;
+ struct mutex *mutex;
+
+ /* Follow blocked_on chain. */
+ for (p = donor; task_is_blocked(p); p = owner) {
+ mutex = p->blocked_on;
+ /* Something changed in the chain, so pick again */
+ if (!mutex)
+ return NULL;
+ /*
+ * By taking mutex->wait_lock we hold off concurrent mutex_unlock()
+ * and ensure @owner sticks around.
+ */
+ guard(raw_spinlock)(&mutex->wait_lock);
+
+ /* Check again that p is blocked with wait_lock held */
+ if (mutex != __get_task_blocked_on(p)) {
+ /*
+ * Something changed in the blocked_on chain and
+ * we don't know if only at this level. So, let's
+ * just bail out completely and let __schedule()
+ * figure things out (pick_again loop).
+ */
+ return NULL;
+ }
+
+ owner = __mutex_owner(mutex);
+ if (!owner) {
+ __clear_task_blocked_on(p, mutex);
+ return p;
+ }
+
+ if (!READ_ONCE(owner->on_rq) || owner->se.sched_delayed) {
+ /* XXX Don't handle blocked owners/delayed dequeue yet */
+ return proxy_deactivate(rq, donor);
+ }
+
+ if (task_cpu(owner) != this_cpu) {
+ /* XXX Don't handle migrations yet */
+ return proxy_deactivate(rq, donor);
+ }
+
+ if (task_on_rq_migrating(owner)) {
+ /*
+ * One of the chain of mutex owners is currently migrating to this
+ * CPU, but has not yet been enqueued because we are holding the
+ * rq lock. As a simple solution, just schedule rq->idle to give
+ * the migration a chance to complete. Much like the migrate_task
+ * case we should end up back in find_proxy_task(), this time
+ * hopefully with all relevant tasks already enqueued.
+ */
+ return proxy_resched_idle(rq);
+ }
+
+ /*
+ * Its possible to race where after we check owner->on_rq
+ * but before we check (owner_cpu != this_cpu) that the
+ * task on another cpu was migrated back to this cpu. In
+ * that case it could slip by our checks. So double check
+ * we are still on this cpu and not migrating. If we get
+ * inconsistent results, try again.
+ */
+ if (!task_on_rq_queued(owner) || task_cpu(owner) != this_cpu)
+ return NULL;
+
+ if (owner == p) {
+ /*
+ * It's possible we interleave with mutex_unlock like:
+ *
+ * lock(&rq->lock);
+ * find_proxy_task()
+ * mutex_unlock()
+ * lock(&wait_lock);
+ * donor(owner) = current->blocked_donor;
+ * unlock(&wait_lock);
+ *
+ * wake_up_q();
+ * ...
+ * ttwu_runnable()
+ * __task_rq_lock()
+ * lock(&wait_lock);
+ * owner == p
+ *
+ * Which leaves us to finish the ttwu_runnable() and make it go.
+ *
+ * So schedule rq->idle so that ttwu_runnable() can get the rq
+ * lock and mark owner as running.
+ */
+ return proxy_resched_idle(rq);
+ }
+ /*
+ * OK, now we're absolutely sure @owner is on this
+ * rq, therefore holding @rq->lock is sufficient to
+ * guarantee its existence, as per ttwu_remote().
+ */
+ }
+
+ WARN_ON_ONCE(owner && !owner->on_rq);
+ return owner;
+}
+#else /* SCHED_PROXY_EXEC */
+static struct task_struct *
+find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
+{
+ WARN_ONCE(1, "This should never be called in the !SCHED_PROXY_EXEC case\n");
+ return donor;
+}
+#endif /* SCHED_PROXY_EXEC */
+
+static inline void proxy_tag_curr(struct rq *rq, struct task_struct *owner)
+{
+ if (!sched_proxy_exec())
+ return;
+ /*
+ * pick_next_task() calls set_next_task() on the chosen task
+ * at some point, which ensures it is not push/pullable.
+ * However, the chosen/donor task *and* the mutex owner form an
+ * atomic pair wrt push/pull.
+ *
+ * Make sure owner we run is not pushable. Unfortunately we can
+ * only deal with that by means of a dequeue/enqueue cycle. :-/
+ */
+ dequeue_task(rq, owner, DEQUEUE_NOCLOCK | DEQUEUE_SAVE);
+ enqueue_task(rq, owner, ENQUEUE_NOCLOCK | ENQUEUE_RESTORE);
}
/*
@@ -2700,7 +6687,7 @@ again:
* paths. For example, see arch/x86/entry_64.S.
*
* To drive preemption between tasks, the scheduler sets the flag in timer
- * interrupt handler scheduler_tick().
+ * interrupt handler sched_tick().
*
* 3. Wakeups don't really cause entry into schedule(). They add a
* task to the run-queue and that's it.
@@ -2709,7 +6696,7 @@ again:
* task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
* called on the nearest possible occasion:
*
- * - If the kernel is preemptible (CONFIG_PREEMPT=y):
+ * - If the kernel is preemptible (CONFIG_PREEMPTION=y):
*
* - in syscall or exception context, at the next outmost
* preempt_enable(). (this might be as soon as the wake_up()'s
@@ -2718,7 +6705,7 @@ again:
* - in IRQ context, return from interrupt-handler to
* preemptible context
*
- * - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
+ * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set)
* then at the next:
*
* - cond_resched() call
@@ -2726,109 +6713,281 @@ again:
* - return from syscall or exception to user-space
* - return from interrupt-handler to user-space
*
- * WARNING: all callers must re-check need_resched() afterward and reschedule
- * accordingly in case an event triggered the need for rescheduling (such as
- * an interrupt waking up a task) while preemption was disabled in __schedule().
+ * WARNING: must be called with preemption disabled!
*/
-static void __sched __schedule(void)
+static void __sched notrace __schedule(int sched_mode)
{
struct task_struct *prev, *next;
+ /*
+ * On PREEMPT_RT kernel, SM_RTLOCK_WAIT is noted
+ * as a preemption by schedule_debug() and RCU.
+ */
+ bool preempt = sched_mode > SM_NONE;
+ bool is_switch = false;
unsigned long *switch_count;
+ unsigned long prev_state;
+ struct rq_flags rf;
struct rq *rq;
int cpu;
- preempt_disable();
+ /* Trace preemptions consistently with task switches */
+ trace_sched_entry_tp(sched_mode == SM_PREEMPT);
+
cpu = smp_processor_id();
rq = cpu_rq(cpu);
- rcu_note_context_switch();
prev = rq->curr;
- schedule_debug(prev);
+ schedule_debug(prev, preempt);
- if (sched_feat(HRTICK))
+ if (sched_feat(HRTICK) || sched_feat(HRTICK_DL))
hrtick_clear(rq);
+ klp_sched_try_switch(prev);
+
+ local_irq_disable();
+ rcu_note_context_switch(preempt);
+ migrate_disable_switch(rq, prev);
+
/*
* Make sure that signal_pending_state()->signal_pending() below
* can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
- * done by the caller to avoid the race with signal_wake_up().
+ * done by the caller to avoid the race with signal_wake_up():
+ *
+ * __set_current_state(@state) signal_wake_up()
+ * schedule() set_tsk_thread_flag(p, TIF_SIGPENDING)
+ * wake_up_state(p, state)
+ * LOCK rq->lock LOCK p->pi_state
+ * smp_mb__after_spinlock() smp_mb__after_spinlock()
+ * if (signal_pending_state()) if (p->state & @state)
+ *
+ * Also, the membarrier system call requires a full memory barrier
+ * after coming from user-space, before storing to rq->curr; this
+ * barrier matches a full barrier in the proximity of the membarrier
+ * system call exit.
*/
- smp_mb__before_spinlock();
- raw_spin_lock_irq(&rq->lock);
+ rq_lock(rq, &rf);
+ smp_mb__after_spinlock();
- rq->clock_skip_update <<= 1; /* promote REQ to ACT */
+ /* Promote REQ to ACT */
+ rq->clock_update_flags <<= 1;
+ update_rq_clock(rq);
+ rq->clock_update_flags = RQCF_UPDATED;
switch_count = &prev->nivcsw;
- if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
- if (unlikely(signal_pending_state(prev->state, prev))) {
- prev->state = TASK_RUNNING;
- } else {
- deactivate_task(rq, prev, DEQUEUE_SLEEP);
- prev->on_rq = 0;
- /*
- * If a worker went to sleep, notify and ask workqueue
- * whether it wants to wake up a task to maintain
- * concurrency.
- */
- if (prev->flags & PF_WQ_WORKER) {
- struct task_struct *to_wakeup;
+ /* Task state changes only considers SM_PREEMPT as preemption */
+ preempt = sched_mode == SM_PREEMPT;
- to_wakeup = wq_worker_sleeping(prev, cpu);
- if (to_wakeup)
- try_to_wake_up_local(to_wakeup);
- }
+ /*
+ * We must load prev->state once (task_struct::state is volatile), such
+ * that we form a control dependency vs deactivate_task() below.
+ */
+ prev_state = READ_ONCE(prev->__state);
+ if (sched_mode == SM_IDLE) {
+ /* SCX must consult the BPF scheduler to tell if rq is empty */
+ if (!rq->nr_running && !scx_enabled()) {
+ next = prev;
+ goto picked;
}
+ } else if (!preempt && prev_state) {
+ /*
+ * We pass task_is_blocked() as the should_block arg
+ * in order to keep mutex-blocked tasks on the runqueue
+ * for slection with proxy-exec (without proxy-exec
+ * task_is_blocked() will always be false).
+ */
+ try_to_block_task(rq, prev, &prev_state,
+ !task_is_blocked(prev));
switch_count = &prev->nvcsw;
}
- if (task_on_rq_queued(prev))
- update_rq_clock(rq);
-
- next = pick_next_task(rq, prev);
+pick_again:
+ next = pick_next_task(rq, rq->donor, &rf);
+ rq_set_donor(rq, next);
+ if (unlikely(task_is_blocked(next))) {
+ next = find_proxy_task(rq, next, &rf);
+ if (!next)
+ goto pick_again;
+ if (next == rq->idle)
+ goto keep_resched;
+ }
+picked:
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
- rq->clock_skip_update = 0;
+keep_resched:
+ rq->last_seen_need_resched_ns = 0;
- if (likely(prev != next)) {
+ is_switch = prev != next;
+ if (likely(is_switch)) {
rq->nr_switches++;
- rq->curr = next;
+ /*
+ * RCU users of rcu_dereference(rq->curr) may not see
+ * changes to task_struct made by pick_next_task().
+ */
+ RCU_INIT_POINTER(rq->curr, next);
+
+ if (!task_current_donor(rq, next))
+ proxy_tag_curr(rq, next);
+
+ /*
+ * The membarrier system call requires each architecture
+ * to have a full memory barrier after updating
+ * rq->curr, before returning to user-space.
+ *
+ * Here are the schemes providing that barrier on the
+ * various architectures:
+ * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC,
+ * RISC-V. switch_mm() relies on membarrier_arch_switch_mm()
+ * on PowerPC and on RISC-V.
+ * - finish_lock_switch() for weakly-ordered
+ * architectures where spin_unlock is a full barrier,
+ * - switch_to() for arm64 (weakly-ordered, spin_unlock
+ * is a RELEASE barrier),
+ *
+ * The barrier matches a full barrier in the proximity of
+ * the membarrier system call entry.
+ *
+ * On RISC-V, this barrier pairing is also needed for the
+ * SYNC_CORE command when switching between processes, cf.
+ * the inline comments in membarrier_arch_switch_mm().
+ */
++*switch_count;
- rq = context_switch(rq, prev, next); /* unlocks the rq */
- cpu = cpu_of(rq);
- } else
- raw_spin_unlock_irq(&rq->lock);
+ psi_account_irqtime(rq, prev, next);
+ psi_sched_switch(prev, next, !task_on_rq_queued(prev) ||
+ prev->se.sched_delayed);
- post_schedule(rq);
+ trace_sched_switch(preempt, prev, next, prev_state);
- sched_preempt_enable_no_resched();
+ /* Also unlocks the rq: */
+ rq = context_switch(rq, prev, next, &rf);
+ } else {
+ /* In case next was already curr but just got blocked_donor */
+ if (!task_current_donor(rq, next))
+ proxy_tag_curr(rq, next);
+
+ rq_unpin_lock(rq, &rf);
+ __balance_callbacks(rq);
+ raw_spin_rq_unlock_irq(rq);
+ }
+ trace_sched_exit_tp(is_switch);
+}
+
+void __noreturn do_task_dead(void)
+{
+ /* Causes final put_task_struct in finish_task_switch(): */
+ set_special_state(TASK_DEAD);
+
+ /* Tell freezer to ignore us: */
+ current->flags |= PF_NOFREEZE;
+
+ __schedule(SM_NONE);
+ BUG();
+
+ /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */
+ for (;;)
+ cpu_relax();
}
static inline void sched_submit_work(struct task_struct *tsk)
{
- if (!tsk->state || tsk_is_pi_blocked(tsk))
- return;
+ static DEFINE_WAIT_OVERRIDE_MAP(sched_map, LD_WAIT_CONFIG);
+ unsigned int task_flags;
+
+ /*
+ * Establish LD_WAIT_CONFIG context to ensure none of the code called
+ * will use a blocking primitive -- which would lead to recursion.
+ */
+ lock_map_acquire_try(&sched_map);
+
+ task_flags = tsk->flags;
+ /*
+ * If a worker goes to sleep, notify and ask workqueue whether it
+ * wants to wake up a task to maintain concurrency.
+ */
+ if (task_flags & PF_WQ_WORKER)
+ wq_worker_sleeping(tsk);
+ else if (task_flags & PF_IO_WORKER)
+ io_wq_worker_sleeping(tsk);
+
+ /*
+ * spinlock and rwlock must not flush block requests. This will
+ * deadlock if the callback attempts to acquire a lock which is
+ * already acquired.
+ */
+ WARN_ON_ONCE(current->__state & TASK_RTLOCK_WAIT);
+
/*
* If we are going to sleep and we have plugged IO queued,
* make sure to submit it to avoid deadlocks.
*/
- if (blk_needs_flush_plug(tsk))
- blk_schedule_flush_plug(tsk);
+ blk_flush_plug(tsk->plug, true);
+
+ lock_map_release(&sched_map);
+}
+
+static void sched_update_worker(struct task_struct *tsk)
+{
+ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER | PF_BLOCK_TS)) {
+ if (tsk->flags & PF_BLOCK_TS)
+ blk_plug_invalidate_ts(tsk);
+ if (tsk->flags & PF_WQ_WORKER)
+ wq_worker_running(tsk);
+ else if (tsk->flags & PF_IO_WORKER)
+ io_wq_worker_running(tsk);
+ }
+}
+
+static __always_inline void __schedule_loop(int sched_mode)
+{
+ do {
+ preempt_disable();
+ __schedule(sched_mode);
+ sched_preempt_enable_no_resched();
+ } while (need_resched());
}
asmlinkage __visible void __sched schedule(void)
{
struct task_struct *tsk = current;
- sched_submit_work(tsk);
+#ifdef CONFIG_RT_MUTEXES
+ lockdep_assert(!tsk->sched_rt_mutex);
+#endif
+
+ if (!task_is_running(tsk))
+ sched_submit_work(tsk);
+ __schedule_loop(SM_NONE);
+ sched_update_worker(tsk);
+}
+EXPORT_SYMBOL(schedule);
+
+/*
+ * synchronize_rcu_tasks() makes sure that no task is stuck in preempted
+ * state (have scheduled out non-voluntarily) by making sure that all
+ * tasks have either left the run queue or have gone into user space.
+ * As idle tasks do not do either, they must not ever be preempted
+ * (schedule out non-voluntarily).
+ *
+ * schedule_idle() is similar to schedule_preempt_disable() except that it
+ * never enables preemption because it does not call sched_submit_work().
+ */
+void __sched schedule_idle(void)
+{
+ /*
+ * As this skips calling sched_submit_work(), which the idle task does
+ * regardless because that function is a NOP when the task is in a
+ * TASK_RUNNING state, make sure this isn't used someplace that the
+ * current task can be in any other state. Note, idle is always in the
+ * TASK_RUNNING state.
+ */
+ WARN_ON_ONCE(current->__state);
do {
- __schedule();
+ __schedule(SM_IDLE);
} while (need_resched());
}
-EXPORT_SYMBOL(schedule);
-#ifdef CONFIG_CONTEXT_TRACKING
+#if defined(CONFIG_CONTEXT_TRACKING_USER) && !defined(CONFIG_HAVE_CONTEXT_TRACKING_USER_OFFSTACK)
asmlinkage __visible void __sched schedule_user(void)
{
/*
@@ -2838,7 +6997,7 @@ asmlinkage __visible void __sched schedule_user(void)
* we find a better solution.
*
* NB: There are buggy callers of this function. Ideally we
- * should warn if prev_state != CONTEXT_USER, but that will trigger
+ * should warn if prev_state != CT_STATE_USER, but that will trigger
* too frequently to make sense yet.
*/
enum ctx_state prev_state = exception_enter();
@@ -2859,26 +7018,47 @@ void __sched schedule_preempt_disabled(void)
preempt_disable();
}
+#ifdef CONFIG_PREEMPT_RT
+void __sched notrace schedule_rtlock(void)
+{
+ __schedule_loop(SM_RTLOCK_WAIT);
+}
+NOKPROBE_SYMBOL(schedule_rtlock);
+#endif
+
static void __sched notrace preempt_schedule_common(void)
{
do {
- __preempt_count_add(PREEMPT_ACTIVE);
- __schedule();
- __preempt_count_sub(PREEMPT_ACTIVE);
+ /*
+ * Because the function tracer can trace preempt_count_sub()
+ * and it also uses preempt_enable/disable_notrace(), if
+ * NEED_RESCHED is set, the preempt_enable_notrace() called
+ * by the function tracer will call this function again and
+ * cause infinite recursion.
+ *
+ * Preemption must be disabled here before the function
+ * tracer can trace. Break up preempt_disable() into two
+ * calls. One to disable preemption without fear of being
+ * traced. The other to still record the preemption latency,
+ * which can also be traced by the function tracer.
+ */
+ preempt_disable_notrace();
+ preempt_latency_start(1);
+ __schedule(SM_PREEMPT);
+ preempt_latency_stop(1);
+ preempt_enable_no_resched_notrace();
/*
* Check again in case we missed a preemption opportunity
* between schedule and now.
*/
- barrier();
} while (need_resched());
}
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPTION
/*
- * this is the entry point to schedule() from in-kernel preemption
- * off of preempt_enable. Kernel preemptions off return from interrupt
- * occur there and call schedule directly.
+ * This is the entry point to schedule() from in-kernel preemption
+ * off of preempt_enable.
*/
asmlinkage __visible void __sched notrace preempt_schedule(void)
{
@@ -2888,15 +7068,34 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
*/
if (likely(!preemptible()))
return;
-
preempt_schedule_common();
}
NOKPROBE_SYMBOL(preempt_schedule);
EXPORT_SYMBOL(preempt_schedule);
-#ifdef CONFIG_CONTEXT_TRACKING
+#ifdef CONFIG_PREEMPT_DYNAMIC
+# ifdef CONFIG_HAVE_PREEMPT_DYNAMIC_CALL
+# ifndef preempt_schedule_dynamic_enabled
+# define preempt_schedule_dynamic_enabled preempt_schedule
+# define preempt_schedule_dynamic_disabled NULL
+# endif
+DEFINE_STATIC_CALL(preempt_schedule, preempt_schedule_dynamic_enabled);
+EXPORT_STATIC_CALL_TRAMP(preempt_schedule);
+# elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule);
+void __sched notrace dynamic_preempt_schedule(void)
+{
+ if (!static_branch_unlikely(&sk_dynamic_preempt_schedule))
+ return;
+ preempt_schedule();
+}
+NOKPROBE_SYMBOL(dynamic_preempt_schedule);
+EXPORT_SYMBOL(dynamic_preempt_schedule);
+# endif
+#endif /* CONFIG_PREEMPT_DYNAMIC */
+
/**
- * preempt_schedule_context - preempt_schedule called by tracing
+ * preempt_schedule_notrace - preempt_schedule called by tracing
*
* The tracing infrastructure uses preempt_enable_notrace to prevent
* recursion and tracing preempt enabling caused by the tracing
@@ -2909,7 +7108,7 @@ EXPORT_SYMBOL(preempt_schedule);
* instead of preempt_schedule() to exit user context if needed before
* calling the scheduler.
*/
-asmlinkage __visible void __sched notrace preempt_schedule_context(void)
+asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
{
enum ctx_state prev_ctx;
@@ -2917,30 +7116,64 @@ asmlinkage __visible void __sched notrace preempt_schedule_context(void)
return;
do {
- __preempt_count_add(PREEMPT_ACTIVE);
+ /*
+ * Because the function tracer can trace preempt_count_sub()
+ * and it also uses preempt_enable/disable_notrace(), if
+ * NEED_RESCHED is set, the preempt_enable_notrace() called
+ * by the function tracer will call this function again and
+ * cause infinite recursion.
+ *
+ * Preemption must be disabled here before the function
+ * tracer can trace. Break up preempt_disable() into two
+ * calls. One to disable preemption without fear of being
+ * traced. The other to still record the preemption latency,
+ * which can also be traced by the function tracer.
+ */
+ preempt_disable_notrace();
+ preempt_latency_start(1);
/*
* Needs preempt disabled in case user_exit() is traced
* and the tracer calls preempt_enable_notrace() causing
* an infinite recursion.
*/
prev_ctx = exception_enter();
- __schedule();
+ __schedule(SM_PREEMPT);
exception_exit(prev_ctx);
- __preempt_count_sub(PREEMPT_ACTIVE);
- barrier();
+ preempt_latency_stop(1);
+ preempt_enable_no_resched_notrace();
} while (need_resched());
}
-EXPORT_SYMBOL_GPL(preempt_schedule_context);
-#endif /* CONFIG_CONTEXT_TRACKING */
+EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
+
+#ifdef CONFIG_PREEMPT_DYNAMIC
+# if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+# ifndef preempt_schedule_notrace_dynamic_enabled
+# define preempt_schedule_notrace_dynamic_enabled preempt_schedule_notrace
+# define preempt_schedule_notrace_dynamic_disabled NULL
+# endif
+DEFINE_STATIC_CALL(preempt_schedule_notrace, preempt_schedule_notrace_dynamic_enabled);
+EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace);
+# elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule_notrace);
+void __sched notrace dynamic_preempt_schedule_notrace(void)
+{
+ if (!static_branch_unlikely(&sk_dynamic_preempt_schedule_notrace))
+ return;
+ preempt_schedule_notrace();
+}
+NOKPROBE_SYMBOL(dynamic_preempt_schedule_notrace);
+EXPORT_SYMBOL(dynamic_preempt_schedule_notrace);
+# endif
+#endif
-#endif /* CONFIG_PREEMPT */
+#endif /* CONFIG_PREEMPTION */
/*
- * this is the entry point to schedule() from kernel preemption
- * off of irq context.
- * Note, that this is called and return with irqs disabled. This will
- * protect us against recursive calling from irq.
+ * This is the entry point to schedule() from kernel preemption
+ * off of IRQ context.
+ * Note, that this is called and return with IRQs disabled. This will
+ * protect us against recursive calling from IRQ contexts.
*/
asmlinkage __visible void __sched preempt_schedule_irq(void)
{
@@ -2952,35 +7185,72 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
prev_state = exception_enter();
do {
- __preempt_count_add(PREEMPT_ACTIVE);
+ preempt_disable();
local_irq_enable();
- __schedule();
+ __schedule(SM_PREEMPT);
local_irq_disable();
- __preempt_count_sub(PREEMPT_ACTIVE);
-
- /*
- * Check again in case we missed a preemption opportunity
- * between schedule and now.
- */
- barrier();
+ sched_preempt_enable_no_resched();
} while (need_resched());
exception_exit(prev_state);
}
-int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
+int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
void *key)
{
+ WARN_ON_ONCE(wake_flags & ~(WF_SYNC|WF_CURRENT_CPU));
return try_to_wake_up(curr->private, mode, wake_flags);
}
EXPORT_SYMBOL(default_wake_function);
+const struct sched_class *__setscheduler_class(int policy, int prio)
+{
+ if (dl_prio(prio))
+ return &dl_sched_class;
+
+ if (rt_prio(prio))
+ return &rt_sched_class;
+
+#ifdef CONFIG_SCHED_CLASS_EXT
+ if (task_should_scx(policy))
+ return &ext_sched_class;
+#endif
+
+ return &fair_sched_class;
+}
+
#ifdef CONFIG_RT_MUTEXES
/*
+ * Would be more useful with typeof()/auto_type but they don't mix with
+ * bit-fields. Since it's a local thing, use int. Keep the generic sounding
+ * name such that if someone were to implement this function we get to compare
+ * notes.
+ */
+#define fetch_and_set(x, v) ({ int _x = (x); (x) = (v); _x; })
+
+void rt_mutex_pre_schedule(void)
+{
+ lockdep_assert(!fetch_and_set(current->sched_rt_mutex, 1));
+ sched_submit_work(current);
+}
+
+void rt_mutex_schedule(void)
+{
+ lockdep_assert(current->sched_rt_mutex);
+ __schedule_loop(SM_NONE);
+}
+
+void rt_mutex_post_schedule(void)
+{
+ sched_update_worker(current);
+ lockdep_assert(fetch_and_set(current->sched_rt_mutex, 0));
+}
+
+/*
* rt_mutex_setprio - set the current priority of a task
- * @p: task
- * @prio: prio value (kernel-internal form)
+ * @p: task to boost
+ * @pi_task: donor task
*
* This function changes the 'effective' priority of a task. It does
* not touch ->normal_prio like __setscheduler().
@@ -2988,18 +7258,45 @@ EXPORT_SYMBOL(default_wake_function);
* Used by the rt_mutex code to implement priority inheritance
* logic. Call site only calls if the priority of the task changed.
*/
-void rt_mutex_setprio(struct task_struct *p, int prio)
+void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
{
- int oldprio, queued, running, enqueue_flag = 0;
+ int prio, oldprio, queue_flag =
+ DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
+ const struct sched_class *prev_class, *next_class;
+ struct rq_flags rf;
struct rq *rq;
- const struct sched_class *prev_class;
- BUG_ON(prio > MAX_PRIO);
+ /* XXX used to be waiter->prio, not waiter->task->prio */
+ prio = __rt_effective_prio(pi_task, p->normal_prio);
+
+ /*
+ * If nothing changed; bail early.
+ */
+ if (p->pi_top_task == pi_task && prio == p->prio && !dl_prio(prio))
+ return;
+
+ rq = __task_rq_lock(p, &rf);
+ update_rq_clock(rq);
+ /*
+ * Set under pi_lock && rq->lock, such that the value can be used under
+ * either lock.
+ *
+ * Note that there is loads of tricky to make this pointer cache work
+ * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to
+ * ensure a task is de-boosted (pi_task is set to NULL) before the
+ * task is allowed to run again (and can exit). This ensures the pointer
+ * points to a blocked task -- which guarantees the task is present.
+ */
+ p->pi_top_task = pi_task;
- rq = __task_rq_lock(p);
+ /*
+ * For FIFO/RR we only need to set prio, if that matches we're done.
+ */
+ if (prio == p->prio && !dl_prio(prio))
+ goto out_unlock;
/*
- * Idle task boosting is a nono in general. There is one
+ * Idle task boosting is a no-no in general. There is one
* exception, when PREEMPT_RT and NOHZ is active:
*
* The idle task calls get_next_timer_interrupt() and holds
@@ -3016,1367 +7313,456 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
goto out_unlock;
}
- trace_sched_pi_setprio(p, prio);
+ trace_sched_pi_setprio(p, pi_task);
oldprio = p->prio;
- prev_class = p->sched_class;
- queued = task_on_rq_queued(p);
- running = task_current(rq, p);
- if (queued)
- dequeue_task(rq, p, 0);
- if (running)
- put_prev_task(rq, p);
-
- /*
- * Boosting condition are:
- * 1. -rt task is running and holds mutex A
- * --> -dl task blocks on mutex A
- *
- * 2. -dl task is running and holds mutex A
- * --> -dl task blocks on mutex A and could preempt the
- * running task
- */
- if (dl_prio(prio)) {
- struct task_struct *pi_task = rt_mutex_get_top_task(p);
- if (!dl_prio(p->normal_prio) ||
- (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
- p->dl.dl_boosted = 1;
- p->dl.dl_throttled = 0;
- enqueue_flag = ENQUEUE_REPLENISH;
- } else
- p->dl.dl_boosted = 0;
- p->sched_class = &dl_sched_class;
- } else if (rt_prio(prio)) {
- if (dl_prio(oldprio))
- p->dl.dl_boosted = 0;
- if (oldprio < prio)
- enqueue_flag = ENQUEUE_HEAD;
- p->sched_class = &rt_sched_class;
- } else {
- if (dl_prio(oldprio))
- p->dl.dl_boosted = 0;
- if (rt_prio(oldprio))
- p->rt.timeout = 0;
- p->sched_class = &fair_sched_class;
- }
-
- p->prio = prio;
-
- if (running)
- p->sched_class->set_curr_task(rq);
- if (queued)
- enqueue_task(rq, p, enqueue_flag);
- check_class_changed(rq, p, prev_class, oldprio);
-out_unlock:
- __task_rq_unlock(rq);
-}
-#endif
+ if (oldprio == prio)
+ queue_flag &= ~DEQUEUE_MOVE;
-void set_user_nice(struct task_struct *p, long nice)
-{
- int old_prio, delta, queued;
- unsigned long flags;
- struct rq *rq;
-
- if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
- return;
- /*
- * We have to be careful, if called from sys_setpriority(),
- * the task might be in the middle of scheduling on another CPU.
- */
- rq = task_rq_lock(p, &flags);
- /*
- * The RT priorities are set via sched_setscheduler(), but we still
- * allow the 'normal' nice value to be set - but as expected
- * it wont have any effect on scheduling until the task is
- * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR:
- */
- if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
- p->static_prio = NICE_TO_PRIO(nice);
- goto out_unlock;
- }
- queued = task_on_rq_queued(p);
- if (queued)
- dequeue_task(rq, p, 0);
+ prev_class = p->sched_class;
+ next_class = __setscheduler_class(p->policy, prio);
- p->static_prio = NICE_TO_PRIO(nice);
- set_load_weight(p);
- old_prio = p->prio;
- p->prio = effective_prio(p);
- delta = p->prio - old_prio;
+ if (prev_class != next_class)
+ queue_flag |= DEQUEUE_CLASS;
- if (queued) {
- enqueue_task(rq, p, 0);
+ scoped_guard (sched_change, p, queue_flag) {
/*
- * If the task increased its priority or is running and
- * lowered its priority, then reschedule its CPU:
+ * Boosting condition are:
+ * 1. -rt task is running and holds mutex A
+ * --> -dl task blocks on mutex A
+ *
+ * 2. -dl task is running and holds mutex A
+ * --> -dl task blocks on mutex A and could preempt the
+ * running task
*/
- if (delta < 0 || (delta > 0 && task_running(rq, p)))
- resched_curr(rq);
+ if (dl_prio(prio)) {
+ if (!dl_prio(p->normal_prio) ||
+ (pi_task && dl_prio(pi_task->prio) &&
+ dl_entity_preempt(&pi_task->dl, &p->dl))) {
+ p->dl.pi_se = pi_task->dl.pi_se;
+ scope->flags |= ENQUEUE_REPLENISH;
+ } else {
+ p->dl.pi_se = &p->dl;
+ }
+ } else if (rt_prio(prio)) {
+ if (dl_prio(oldprio))
+ p->dl.pi_se = &p->dl;
+ if (oldprio < prio)
+ scope->flags |= ENQUEUE_HEAD;
+ } else {
+ if (dl_prio(oldprio))
+ p->dl.pi_se = &p->dl;
+ if (rt_prio(oldprio))
+ p->rt.timeout = 0;
+ }
+
+ p->sched_class = next_class;
+ p->prio = prio;
}
out_unlock:
- task_rq_unlock(rq, p, &flags);
-}
-EXPORT_SYMBOL(set_user_nice);
-
-/*
- * can_nice - check if a task can reduce its nice value
- * @p: task
- * @nice: nice value
- */
-int can_nice(const struct task_struct *p, const int nice)
-{
- /* convert nice value [19,-20] to rlimit style value [1,40] */
- int nice_rlim = nice_to_rlimit(nice);
+ /* Caller holds task_struct::pi_lock, IRQs are still disabled */
- return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
- capable(CAP_SYS_NICE));
+ rq_unpin_lock(rq, &rf);
+ __balance_callbacks(rq);
+ rq_repin_lock(rq, &rf);
+ __task_rq_unlock(rq, p, &rf);
}
+#endif /* CONFIG_RT_MUTEXES */
-#ifdef __ARCH_WANT_SYS_NICE
-
-/*
- * sys_nice - change the priority of the current process.
- * @increment: priority increment
- *
- * sys_setpriority is a more generic, but much slower function that
- * does similar things.
- */
-SYSCALL_DEFINE1(nice, int, increment)
+#if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC)
+int __sched __cond_resched(void)
{
- long nice, retval;
-
+ if (should_resched(0) && !irqs_disabled()) {
+ preempt_schedule_common();
+ return 1;
+ }
/*
- * Setpriority might change our priority at the same moment.
- * We don't have to worry. Conceptually one call occurs first
- * and we have a single winner.
+ * In PREEMPT_RCU kernels, ->rcu_read_lock_nesting tells the tick
+ * whether the current CPU is in an RCU read-side critical section,
+ * so the tick can report quiescent states even for CPUs looping
+ * in kernel context. In contrast, in non-preemptible kernels,
+ * RCU readers leave no in-memory hints, which means that CPU-bound
+ * processes executing in kernel context might never report an
+ * RCU quiescent state. Therefore, the following code causes
+ * cond_resched() to report a quiescent state, but only when RCU
+ * is in urgent need of one.
+ * A third case, preemptible, but non-PREEMPT_RCU provides for
+ * urgently needed quiescent states via rcu_flavor_sched_clock_irq().
*/
- increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
- nice = task_nice(current) + increment;
-
- nice = clamp_val(nice, MIN_NICE, MAX_NICE);
- if (increment < 0 && !can_nice(current, nice))
- return -EPERM;
-
- retval = security_task_setnice(current, nice);
- if (retval)
- return retval;
-
- set_user_nice(current, nice);
+#ifndef CONFIG_PREEMPT_RCU
+ rcu_all_qs();
+#endif
return 0;
}
-
+EXPORT_SYMBOL(__cond_resched);
#endif
-/**
- * task_prio - return the priority value of a given task.
- * @p: the task in question.
- *
- * Return: The priority value as seen by users in /proc.
- * RT tasks are offset by -200. Normal tasks are centered
- * around 0, value goes from -16 to +15.
- */
-int task_prio(const struct task_struct *p)
-{
- return p->prio - MAX_RT_PRIO;
-}
-
-/**
- * idle_cpu - is a given cpu idle currently?
- * @cpu: the processor in question.
- *
- * Return: 1 if the CPU is currently idle. 0 otherwise.
- */
-int idle_cpu(int cpu)
-{
- struct rq *rq = cpu_rq(cpu);
-
- if (rq->curr != rq->idle)
- return 0;
-
- if (rq->nr_running)
+#ifdef CONFIG_PREEMPT_DYNAMIC
+# ifdef CONFIG_HAVE_PREEMPT_DYNAMIC_CALL
+# define cond_resched_dynamic_enabled __cond_resched
+# define cond_resched_dynamic_disabled ((void *)&__static_call_return0)
+DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched);
+EXPORT_STATIC_CALL_TRAMP(cond_resched);
+
+# define might_resched_dynamic_enabled __cond_resched
+# define might_resched_dynamic_disabled ((void *)&__static_call_return0)
+DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched);
+EXPORT_STATIC_CALL_TRAMP(might_resched);
+# elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+static DEFINE_STATIC_KEY_FALSE(sk_dynamic_cond_resched);
+int __sched dynamic_cond_resched(void)
+{
+ if (!static_branch_unlikely(&sk_dynamic_cond_resched))
return 0;
-
-#ifdef CONFIG_SMP
- if (!llist_empty(&rq->wake_list))
- return 0;
-#endif
-
- return 1;
-}
-
-/**
- * idle_task - return the idle task for a given cpu.
- * @cpu: the processor in question.
- *
- * Return: The idle task for the cpu @cpu.
- */
-struct task_struct *idle_task(int cpu)
-{
- return cpu_rq(cpu)->idle;
+ return __cond_resched();
}
+EXPORT_SYMBOL(dynamic_cond_resched);
-/**
- * find_process_by_pid - find a process with a matching PID value.
- * @pid: the pid in question.
- *
- * The task of @pid, if found. %NULL otherwise.
- */
-static struct task_struct *find_process_by_pid(pid_t pid)
+static DEFINE_STATIC_KEY_FALSE(sk_dynamic_might_resched);
+int __sched dynamic_might_resched(void)
{
- return pid ? find_task_by_vpid(pid) : current;
+ if (!static_branch_unlikely(&sk_dynamic_might_resched))
+ return 0;
+ return __cond_resched();
}
+EXPORT_SYMBOL(dynamic_might_resched);
+# endif
+#endif /* CONFIG_PREEMPT_DYNAMIC */
/*
- * This function initializes the sched_dl_entity of a newly becoming
- * SCHED_DEADLINE task.
+ * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
+ * call schedule, and on return reacquire the lock.
*
- * Only the static values are considered here, the actual runtime and the
- * absolute deadline will be properly calculated when the task is enqueued
- * for the first time with its new policy.
- */
-static void
-__setparam_dl(struct task_struct *p, const struct sched_attr *attr)
-{
- struct sched_dl_entity *dl_se = &p->dl;
-
- dl_se->dl_runtime = attr->sched_runtime;
- dl_se->dl_deadline = attr->sched_deadline;
- dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
- dl_se->flags = attr->sched_flags;
- dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
-
- /*
- * Changing the parameters of a task is 'tricky' and we're not doing
- * the correct thing -- also see task_dead_dl() and switched_from_dl().
- *
- * What we SHOULD do is delay the bandwidth release until the 0-lag
- * point. This would include retaining the task_struct until that time
- * and change dl_overflow() to not immediately decrement the current
- * amount.
- *
- * Instead we retain the current runtime/deadline and let the new
- * parameters take effect after the current reservation period lapses.
- * This is safe (albeit pessimistic) because the 0-lag point is always
- * before the current scheduling deadline.
- *
- * We can still have temporary overloads because we do not delay the
- * change in bandwidth until that time; so admission control is
- * not on the safe side. It does however guarantee tasks will never
- * consume more than promised.
- */
-}
-
-/*
- * sched_setparam() passes in -1 for its policy, to let the functions
- * it calls know not to change it.
- */
-#define SETPARAM_POLICY -1
-
-static void __setscheduler_params(struct task_struct *p,
- const struct sched_attr *attr)
-{
- int policy = attr->sched_policy;
-
- if (policy == SETPARAM_POLICY)
- policy = p->policy;
-
- p->policy = policy;
-
- if (dl_policy(policy))
- __setparam_dl(p, attr);
- else if (fair_policy(policy))
- p->static_prio = NICE_TO_PRIO(attr->sched_nice);
-
- /*
- * __sched_setscheduler() ensures attr->sched_priority == 0 when
- * !rt_policy. Always setting this ensures that things like
- * getparam()/getattr() don't report silly values for !rt tasks.
- */
- p->rt_priority = attr->sched_priority;
- p->normal_prio = normal_prio(p);
- set_load_weight(p);
-}
-
-/* Actually do priority change: must hold pi & rq lock. */
-static void __setscheduler(struct rq *rq, struct task_struct *p,
- const struct sched_attr *attr, bool keep_boost)
-{
- __setscheduler_params(p, attr);
-
- /*
- * Keep a potential priority boosting if called from
- * sched_setscheduler().
- */
- if (keep_boost)
- p->prio = rt_mutex_get_effective_prio(p, normal_prio(p));
- else
- p->prio = normal_prio(p);
-
- if (dl_prio(p->prio))
- p->sched_class = &dl_sched_class;
- else if (rt_prio(p->prio))
- p->sched_class = &rt_sched_class;
- else
- p->sched_class = &fair_sched_class;
-}
-
-static void
-__getparam_dl(struct task_struct *p, struct sched_attr *attr)
-{
- struct sched_dl_entity *dl_se = &p->dl;
-
- attr->sched_priority = p->rt_priority;
- attr->sched_runtime = dl_se->dl_runtime;
- attr->sched_deadline = dl_se->dl_deadline;
- attr->sched_period = dl_se->dl_period;
- attr->sched_flags = dl_se->flags;
-}
-
-/*
- * This function validates the new parameters of a -deadline task.
- * We ask for the deadline not being zero, and greater or equal
- * than the runtime, as well as the period of being zero or
- * greater than deadline. Furthermore, we have to be sure that
- * user parameters are above the internal resolution of 1us (we
- * check sched_runtime only since it is always the smaller one) and
- * below 2^63 ns (we have to check both sched_deadline and
- * sched_period, as the latter can be zero).
+ * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level
+ * operations here to prevent schedule() from being called twice (once via
+ * spin_unlock(), once by hand).
*/
-static bool
-__checkparam_dl(const struct sched_attr *attr)
+int __cond_resched_lock(spinlock_t *lock)
{
- /* deadline != 0 */
- if (attr->sched_deadline == 0)
- return false;
-
- /*
- * Since we truncate DL_SCALE bits, make sure we're at least
- * that big.
- */
- if (attr->sched_runtime < (1ULL << DL_SCALE))
- return false;
-
- /*
- * Since we use the MSB for wrap-around and sign issues, make
- * sure it's not set (mind that period can be equal to zero).
- */
- if (attr->sched_deadline & (1ULL << 63) ||
- attr->sched_period & (1ULL << 63))
- return false;
-
- /* runtime <= deadline <= period (if period != 0) */
- if ((attr->sched_period != 0 &&
- attr->sched_period < attr->sched_deadline) ||
- attr->sched_deadline < attr->sched_runtime)
- return false;
-
- return true;
-}
+ int resched = should_resched(PREEMPT_LOCK_OFFSET);
+ int ret = 0;
-/*
- * check the target process has a UID that matches the current process's
- */
-static bool check_same_owner(struct task_struct *p)
-{
- const struct cred *cred = current_cred(), *pcred;
- bool match;
+ lockdep_assert_held(lock);
- rcu_read_lock();
- pcred = __task_cred(p);
- match = (uid_eq(cred->euid, pcred->euid) ||
- uid_eq(cred->euid, pcred->uid));
- rcu_read_unlock();
- return match;
+ if (spin_needbreak(lock) || resched) {
+ spin_unlock(lock);
+ if (!_cond_resched())
+ cpu_relax();
+ ret = 1;
+ spin_lock(lock);
+ }
+ return ret;
}
+EXPORT_SYMBOL(__cond_resched_lock);
-static bool dl_param_changed(struct task_struct *p,
- const struct sched_attr *attr)
+int __cond_resched_rwlock_read(rwlock_t *lock)
{
- struct sched_dl_entity *dl_se = &p->dl;
+ int resched = should_resched(PREEMPT_LOCK_OFFSET);
+ int ret = 0;
- if (dl_se->dl_runtime != attr->sched_runtime ||
- dl_se->dl_deadline != attr->sched_deadline ||
- dl_se->dl_period != attr->sched_period ||
- dl_se->flags != attr->sched_flags)
- return true;
+ lockdep_assert_held_read(lock);
- return false;
+ if (rwlock_needbreak(lock) || resched) {
+ read_unlock(lock);
+ if (!_cond_resched())
+ cpu_relax();
+ ret = 1;
+ read_lock(lock);
+ }
+ return ret;
}
+EXPORT_SYMBOL(__cond_resched_rwlock_read);
-static int __sched_setscheduler(struct task_struct *p,
- const struct sched_attr *attr,
- bool user)
+int __cond_resched_rwlock_write(rwlock_t *lock)
{
- int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
- MAX_RT_PRIO - 1 - attr->sched_priority;
- int retval, oldprio, oldpolicy = -1, queued, running;
- int new_effective_prio, policy = attr->sched_policy;
- unsigned long flags;
- const struct sched_class *prev_class;
- struct rq *rq;
- int reset_on_fork;
-
- /* may grab non-irq protected spin_locks */
- BUG_ON(in_interrupt());
-recheck:
- /* double check policy once rq lock held */
- if (policy < 0) {
- reset_on_fork = p->sched_reset_on_fork;
- policy = oldpolicy = p->policy;
- } else {
- reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
-
- if (policy != SCHED_DEADLINE &&
- policy != SCHED_FIFO && policy != SCHED_RR &&
- policy != SCHED_NORMAL && policy != SCHED_BATCH &&
- policy != SCHED_IDLE)
- return -EINVAL;
- }
-
- if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK))
- return -EINVAL;
-
- /*
- * Valid priorities for SCHED_FIFO and SCHED_RR are
- * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
- * SCHED_BATCH and SCHED_IDLE is 0.
- */
- if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) ||
- (!p->mm && attr->sched_priority > MAX_RT_PRIO-1))
- return -EINVAL;
- if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
- (rt_policy(policy) != (attr->sched_priority != 0)))
- return -EINVAL;
-
- /*
- * Allow unprivileged RT tasks to decrease priority:
- */
- if (user && !capable(CAP_SYS_NICE)) {
- if (fair_policy(policy)) {
- if (attr->sched_nice < task_nice(p) &&
- !can_nice(p, attr->sched_nice))
- return -EPERM;
- }
-
- if (rt_policy(policy)) {
- unsigned long rlim_rtprio =
- task_rlimit(p, RLIMIT_RTPRIO);
-
- /* can't set/change the rt policy */
- if (policy != p->policy && !rlim_rtprio)
- return -EPERM;
-
- /* can't increase priority */
- if (attr->sched_priority > p->rt_priority &&
- attr->sched_priority > rlim_rtprio)
- return -EPERM;
- }
-
- /*
- * Can't set/change SCHED_DEADLINE policy at all for now
- * (safest behavior); in the future we would like to allow
- * unprivileged DL tasks to increase their relative deadline
- * or reduce their runtime (both ways reducing utilization)
- */
- if (dl_policy(policy))
- return -EPERM;
-
- /*
- * Treat SCHED_IDLE as nice 20. Only allow a switch to
- * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
- */
- if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
- if (!can_nice(p, task_nice(p)))
- return -EPERM;
- }
-
- /* can't change other user's priorities */
- if (!check_same_owner(p))
- return -EPERM;
-
- /* Normal users shall not reset the sched_reset_on_fork flag */
- if (p->sched_reset_on_fork && !reset_on_fork)
- return -EPERM;
- }
-
- if (user) {
- retval = security_task_setscheduler(p);
- if (retval)
- return retval;
- }
-
- /*
- * make sure no PI-waiters arrive (or leave) while we are
- * changing the priority of the task:
- *
- * To be able to change p->policy safely, the appropriate
- * runqueue lock must be held.
- */
- rq = task_rq_lock(p, &flags);
-
- /*
- * Changing the policy of the stop threads its a very bad idea
- */
- if (p == rq->stop) {
- task_rq_unlock(rq, p, &flags);
- return -EINVAL;
- }
-
- /*
- * If not changing anything there's no need to proceed further,
- * but store a possible modification of reset_on_fork.
- */
- if (unlikely(policy == p->policy)) {
- if (fair_policy(policy) && attr->sched_nice != task_nice(p))
- goto change;
- if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
- goto change;
- if (dl_policy(policy) && dl_param_changed(p, attr))
- goto change;
-
- p->sched_reset_on_fork = reset_on_fork;
- task_rq_unlock(rq, p, &flags);
- return 0;
- }
-change:
-
- if (user) {
-#ifdef CONFIG_RT_GROUP_SCHED
- /*
- * Do not allow realtime tasks into groups that have no runtime
- * assigned.
- */
- if (rt_bandwidth_enabled() && rt_policy(policy) &&
- task_group(p)->rt_bandwidth.rt_runtime == 0 &&
- !task_group_is_autogroup(task_group(p))) {
- task_rq_unlock(rq, p, &flags);
- return -EPERM;
- }
-#endif
-#ifdef CONFIG_SMP
- if (dl_bandwidth_enabled() && dl_policy(policy)) {
- cpumask_t *span = rq->rd->span;
-
- /*
- * Don't allow tasks with an affinity mask smaller than
- * the entire root_domain to become SCHED_DEADLINE. We
- * will also fail if there's no bandwidth available.
- */
- if (!cpumask_subset(span, &p->cpus_allowed) ||
- rq->rd->dl_bw.bw == 0) {
- task_rq_unlock(rq, p, &flags);
- return -EPERM;
- }
- }
-#endif
- }
-
- /* recheck policy now with rq lock held */
- if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
- policy = oldpolicy = -1;
- task_rq_unlock(rq, p, &flags);
- goto recheck;
- }
-
- /*
- * If setscheduling to SCHED_DEADLINE (or changing the parameters
- * of a SCHED_DEADLINE task) we need to check if enough bandwidth
- * is available.
- */
- if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) {
- task_rq_unlock(rq, p, &flags);
- return -EBUSY;
- }
-
- p->sched_reset_on_fork = reset_on_fork;
- oldprio = p->prio;
-
- /*
- * Take priority boosted tasks into account. If the new
- * effective priority is unchanged, we just store the new
- * normal parameters and do not touch the scheduler class and
- * the runqueue. This will be done when the task deboost
- * itself.
- */
- new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
- if (new_effective_prio == oldprio) {
- __setscheduler_params(p, attr);
- task_rq_unlock(rq, p, &flags);
- return 0;
- }
+ int resched = should_resched(PREEMPT_LOCK_OFFSET);
+ int ret = 0;
- queued = task_on_rq_queued(p);
- running = task_current(rq, p);
- if (queued)
- dequeue_task(rq, p, 0);
- if (running)
- put_prev_task(rq, p);
+ lockdep_assert_held_write(lock);
- prev_class = p->sched_class;
- __setscheduler(rq, p, attr, true);
-
- if (running)
- p->sched_class->set_curr_task(rq);
- if (queued) {
- /*
- * We enqueue to tail when the priority of a task is
- * increased (user space view).
- */
- enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0);
+ if (rwlock_needbreak(lock) || resched) {
+ write_unlock(lock);
+ if (!_cond_resched())
+ cpu_relax();
+ ret = 1;
+ write_lock(lock);
}
-
- check_class_changed(rq, p, prev_class, oldprio);
- task_rq_unlock(rq, p, &flags);
-
- rt_mutex_adjust_pi(p);
-
- return 0;
+ return ret;
}
+EXPORT_SYMBOL(__cond_resched_rwlock_write);
-static int _sched_setscheduler(struct task_struct *p, int policy,
- const struct sched_param *param, bool check)
-{
- struct sched_attr attr = {
- .sched_policy = policy,
- .sched_priority = param->sched_priority,
- .sched_nice = PRIO_TO_NICE(p->static_prio),
- };
+#ifdef CONFIG_PREEMPT_DYNAMIC
- /* Fixup the legacy SCHED_RESET_ON_FORK hack. */
- if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
- attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
- policy &= ~SCHED_RESET_ON_FORK;
- attr.sched_policy = policy;
- }
+# ifdef CONFIG_GENERIC_IRQ_ENTRY
+# include <linux/irq-entry-common.h>
+# endif
- return __sched_setscheduler(p, &attr, check);
-}
-/**
- * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
- * @p: the task in question.
- * @policy: new policy.
- * @param: structure containing the new RT priority.
+/*
+ * SC:cond_resched
+ * SC:might_resched
+ * SC:preempt_schedule
+ * SC:preempt_schedule_notrace
+ * SC:irqentry_exit_cond_resched
*
- * Return: 0 on success. An error code otherwise.
*
- * NOTE that the task may be already dead.
- */
-int sched_setscheduler(struct task_struct *p, int policy,
- const struct sched_param *param)
-{
- return _sched_setscheduler(p, policy, param, true);
-}
-EXPORT_SYMBOL_GPL(sched_setscheduler);
-
-int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
-{
- return __sched_setscheduler(p, attr, true);
-}
-EXPORT_SYMBOL_GPL(sched_setattr);
-
-/**
- * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
- * @p: the task in question.
- * @policy: new policy.
- * @param: structure containing the new RT priority.
+ * NONE:
+ * cond_resched <- __cond_resched
+ * might_resched <- RET0
+ * preempt_schedule <- NOP
+ * preempt_schedule_notrace <- NOP
+ * irqentry_exit_cond_resched <- NOP
+ * dynamic_preempt_lazy <- false
*
- * Just like sched_setscheduler, only don't bother checking if the
- * current context has permission. For example, this is needed in
- * stop_machine(): we create temporary high priority worker threads,
- * but our caller might not have that capability.
+ * VOLUNTARY:
+ * cond_resched <- __cond_resched
+ * might_resched <- __cond_resched
+ * preempt_schedule <- NOP
+ * preempt_schedule_notrace <- NOP
+ * irqentry_exit_cond_resched <- NOP
+ * dynamic_preempt_lazy <- false
*
- * Return: 0 on success. An error code otherwise.
- */
-int sched_setscheduler_nocheck(struct task_struct *p, int policy,
- const struct sched_param *param)
-{
- return _sched_setscheduler(p, policy, param, false);
-}
-
-static int
-do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
-{
- struct sched_param lparam;
- struct task_struct *p;
- int retval;
-
- if (!param || pid < 0)
- return -EINVAL;
- if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
- return -EFAULT;
-
- rcu_read_lock();
- retval = -ESRCH;
- p = find_process_by_pid(pid);
- if (p != NULL)
- retval = sched_setscheduler(p, policy, &lparam);
- rcu_read_unlock();
-
- return retval;
-}
-
-/*
- * Mimics kernel/events/core.c perf_copy_attr().
- */
-static int sched_copy_attr(struct sched_attr __user *uattr,
- struct sched_attr *attr)
-{
- u32 size;
- int ret;
-
- if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
- return -EFAULT;
-
- /*
- * zero the full structure, so that a short copy will be nice.
- */
- memset(attr, 0, sizeof(*attr));
-
- ret = get_user(size, &uattr->size);
- if (ret)
- return ret;
-
- if (size > PAGE_SIZE) /* silly large */
- goto err_size;
-
- if (!size) /* abi compat */
- size = SCHED_ATTR_SIZE_VER0;
-
- if (size < SCHED_ATTR_SIZE_VER0)
- goto err_size;
-
- /*
- * If we're handed a bigger struct than we know of,
- * ensure all the unknown bits are 0 - i.e. new
- * user-space does not rely on any kernel feature
- * extensions we dont know about yet.
- */
- if (size > sizeof(*attr)) {
- unsigned char __user *addr;
- unsigned char __user *end;
- unsigned char val;
-
- addr = (void __user *)uattr + sizeof(*attr);
- end = (void __user *)uattr + size;
-
- for (; addr < end; addr++) {
- ret = get_user(val, addr);
- if (ret)
- return ret;
- if (val)
- goto err_size;
- }
- size = sizeof(*attr);
- }
-
- ret = copy_from_user(attr, uattr, size);
- if (ret)
- return -EFAULT;
-
- /*
- * XXX: do we want to be lenient like existing syscalls; or do we want
- * to be strict and return an error on out-of-bounds values?
- */
- attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
-
- return 0;
-
-err_size:
- put_user(sizeof(*attr), &uattr->size);
- return -E2BIG;
-}
-
-/**
- * sys_sched_setscheduler - set/change the scheduler policy and RT priority
- * @pid: the pid in question.
- * @policy: new policy.
- * @param: structure containing the new RT priority.
+ * FULL:
+ * cond_resched <- RET0
+ * might_resched <- RET0
+ * preempt_schedule <- preempt_schedule
+ * preempt_schedule_notrace <- preempt_schedule_notrace
+ * irqentry_exit_cond_resched <- irqentry_exit_cond_resched
+ * dynamic_preempt_lazy <- false
*
- * Return: 0 on success. An error code otherwise.
- */
-SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
- struct sched_param __user *, param)
-{
- /* negative values for policy are not valid */
- if (policy < 0)
- return -EINVAL;
-
- return do_sched_setscheduler(pid, policy, param);
-}
+ * LAZY:
+ * cond_resched <- RET0
+ * might_resched <- RET0
+ * preempt_schedule <- preempt_schedule
+ * preempt_schedule_notrace <- preempt_schedule_notrace
+ * irqentry_exit_cond_resched <- irqentry_exit_cond_resched
+ * dynamic_preempt_lazy <- true
+ */
+
+enum {
+ preempt_dynamic_undefined = -1,
+ preempt_dynamic_none,
+ preempt_dynamic_voluntary,
+ preempt_dynamic_full,
+ preempt_dynamic_lazy,
+};
-/**
- * sys_sched_setparam - set/change the RT priority of a thread
- * @pid: the pid in question.
- * @param: structure containing the new RT priority.
- *
- * Return: 0 on success. An error code otherwise.
- */
-SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
-{
- return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
-}
+int preempt_dynamic_mode = preempt_dynamic_undefined;
-/**
- * sys_sched_setattr - same as above, but with extended sched_attr
- * @pid: the pid in question.
- * @uattr: structure containing the extended parameters.
- * @flags: for future extension.
- */
-SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
- unsigned int, flags)
+int sched_dynamic_mode(const char *str)
{
- struct sched_attr attr;
- struct task_struct *p;
- int retval;
-
- if (!uattr || pid < 0 || flags)
- return -EINVAL;
-
- retval = sched_copy_attr(uattr, &attr);
- if (retval)
- return retval;
-
- if ((int)attr.sched_policy < 0)
- return -EINVAL;
-
- rcu_read_lock();
- retval = -ESRCH;
- p = find_process_by_pid(pid);
- if (p != NULL)
- retval = sched_setattr(p, &attr);
- rcu_read_unlock();
+# ifndef CONFIG_PREEMPT_RT
+ if (!strcmp(str, "none"))
+ return preempt_dynamic_none;
- return retval;
-}
+ if (!strcmp(str, "voluntary"))
+ return preempt_dynamic_voluntary;
+# endif
-/**
- * sys_sched_getscheduler - get the policy (scheduling class) of a thread
- * @pid: the pid in question.
- *
- * Return: On success, the policy of the thread. Otherwise, a negative error
- * code.
- */
-SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
-{
- struct task_struct *p;
- int retval;
+ if (!strcmp(str, "full"))
+ return preempt_dynamic_full;
- if (pid < 0)
- return -EINVAL;
+# ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY
+ if (!strcmp(str, "lazy"))
+ return preempt_dynamic_lazy;
+# endif
- retval = -ESRCH;
- rcu_read_lock();
- p = find_process_by_pid(pid);
- if (p) {
- retval = security_task_getscheduler(p);
- if (!retval)
- retval = p->policy
- | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
- }
- rcu_read_unlock();
- return retval;
+ return -EINVAL;
}
-/**
- * sys_sched_getparam - get the RT priority of a thread
- * @pid: the pid in question.
- * @param: structure containing the RT priority.
- *
- * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
- * code.
- */
-SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
-{
- struct sched_param lp = { .sched_priority = 0 };
- struct task_struct *p;
- int retval;
+# define preempt_dynamic_key_enable(f) static_key_enable(&sk_dynamic_##f.key)
+# define preempt_dynamic_key_disable(f) static_key_disable(&sk_dynamic_##f.key)
- if (!param || pid < 0)
- return -EINVAL;
+# if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+# define preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled)
+# define preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled)
+# elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+# define preempt_dynamic_enable(f) preempt_dynamic_key_enable(f)
+# define preempt_dynamic_disable(f) preempt_dynamic_key_disable(f)
+# else
+# error "Unsupported PREEMPT_DYNAMIC mechanism"
+# endif
- rcu_read_lock();
- p = find_process_by_pid(pid);
- retval = -ESRCH;
- if (!p)
- goto out_unlock;
+static DEFINE_MUTEX(sched_dynamic_mutex);
- retval = security_task_getscheduler(p);
- if (retval)
- goto out_unlock;
-
- if (task_has_rt_policy(p))
- lp.sched_priority = p->rt_priority;
- rcu_read_unlock();
-
- /*
- * This one might sleep, we cannot do it with a spinlock held ...
- */
- retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
-
- return retval;
-
-out_unlock:
- rcu_read_unlock();
- return retval;
-}
-
-static int sched_read_attr(struct sched_attr __user *uattr,
- struct sched_attr *attr,
- unsigned int usize)
+static void __sched_dynamic_update(int mode)
{
- int ret;
-
- if (!access_ok(VERIFY_WRITE, uattr, usize))
- return -EFAULT;
-
/*
- * If we're handed a smaller struct than we know of,
- * ensure all the unknown bits are 0 - i.e. old
- * user-space does not get uncomplete information.
+ * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in
+ * the ZERO state, which is invalid.
*/
- if (usize < sizeof(*attr)) {
- unsigned char *addr;
- unsigned char *end;
+ preempt_dynamic_enable(cond_resched);
+ preempt_dynamic_enable(might_resched);
+ preempt_dynamic_enable(preempt_schedule);
+ preempt_dynamic_enable(preempt_schedule_notrace);
+ preempt_dynamic_enable(irqentry_exit_cond_resched);
+ preempt_dynamic_key_disable(preempt_lazy);
+
+ switch (mode) {
+ case preempt_dynamic_none:
+ preempt_dynamic_enable(cond_resched);
+ preempt_dynamic_disable(might_resched);
+ preempt_dynamic_disable(preempt_schedule);
+ preempt_dynamic_disable(preempt_schedule_notrace);
+ preempt_dynamic_disable(irqentry_exit_cond_resched);
+ preempt_dynamic_key_disable(preempt_lazy);
+ if (mode != preempt_dynamic_mode)
+ pr_info("Dynamic Preempt: none\n");
+ break;
- addr = (void *)attr + usize;
- end = (void *)attr + sizeof(*attr);
+ case preempt_dynamic_voluntary:
+ preempt_dynamic_enable(cond_resched);
+ preempt_dynamic_enable(might_resched);
+ preempt_dynamic_disable(preempt_schedule);
+ preempt_dynamic_disable(preempt_schedule_notrace);
+ preempt_dynamic_disable(irqentry_exit_cond_resched);
+ preempt_dynamic_key_disable(preempt_lazy);
+ if (mode != preempt_dynamic_mode)
+ pr_info("Dynamic Preempt: voluntary\n");
+ break;
- for (; addr < end; addr++) {
- if (*addr)
- return -EFBIG;
- }
+ case preempt_dynamic_full:
+ preempt_dynamic_disable(cond_resched);
+ preempt_dynamic_disable(might_resched);
+ preempt_dynamic_enable(preempt_schedule);
+ preempt_dynamic_enable(preempt_schedule_notrace);
+ preempt_dynamic_enable(irqentry_exit_cond_resched);
+ preempt_dynamic_key_disable(preempt_lazy);
+ if (mode != preempt_dynamic_mode)
+ pr_info("Dynamic Preempt: full\n");
+ break;
- attr->size = usize;
+ case preempt_dynamic_lazy:
+ preempt_dynamic_disable(cond_resched);
+ preempt_dynamic_disable(might_resched);
+ preempt_dynamic_enable(preempt_schedule);
+ preempt_dynamic_enable(preempt_schedule_notrace);
+ preempt_dynamic_enable(irqentry_exit_cond_resched);
+ preempt_dynamic_key_enable(preempt_lazy);
+ if (mode != preempt_dynamic_mode)
+ pr_info("Dynamic Preempt: lazy\n");
+ break;
}
- ret = copy_to_user(uattr, attr, attr->size);
- if (ret)
- return -EFAULT;
-
- return 0;
+ preempt_dynamic_mode = mode;
}
-/**
- * sys_sched_getattr - similar to sched_getparam, but with sched_attr
- * @pid: the pid in question.
- * @uattr: structure containing the extended parameters.
- * @size: sizeof(attr) for fwd/bwd comp.
- * @flags: for future extension.
- */
-SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
- unsigned int, size, unsigned int, flags)
+void sched_dynamic_update(int mode)
{
- struct sched_attr attr = {
- .size = sizeof(struct sched_attr),
- };
- struct task_struct *p;
- int retval;
-
- if (!uattr || pid < 0 || size > PAGE_SIZE ||
- size < SCHED_ATTR_SIZE_VER0 || flags)
- return -EINVAL;
-
- rcu_read_lock();
- p = find_process_by_pid(pid);
- retval = -ESRCH;
- if (!p)
- goto out_unlock;
-
- retval = security_task_getscheduler(p);
- if (retval)
- goto out_unlock;
-
- attr.sched_policy = p->policy;
- if (p->sched_reset_on_fork)
- attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
- if (task_has_dl_policy(p))
- __getparam_dl(p, &attr);
- else if (task_has_rt_policy(p))
- attr.sched_priority = p->rt_priority;
- else
- attr.sched_nice = task_nice(p);
-
- rcu_read_unlock();
-
- retval = sched_read_attr(uattr, &attr, size);
- return retval;
-
-out_unlock:
- rcu_read_unlock();
- return retval;
+ mutex_lock(&sched_dynamic_mutex);
+ __sched_dynamic_update(mode);
+ mutex_unlock(&sched_dynamic_mutex);
}
-long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
+static int __init setup_preempt_mode(char *str)
{
- cpumask_var_t cpus_allowed, new_mask;
- struct task_struct *p;
- int retval;
-
- rcu_read_lock();
-
- p = find_process_by_pid(pid);
- if (!p) {
- rcu_read_unlock();
- return -ESRCH;
- }
-
- /* Prevent p going away */
- get_task_struct(p);
- rcu_read_unlock();
-
- if (p->flags & PF_NO_SETAFFINITY) {
- retval = -EINVAL;
- goto out_put_task;
- }
- if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
- retval = -ENOMEM;
- goto out_put_task;
- }
- if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
- retval = -ENOMEM;
- goto out_free_cpus_allowed;
- }
- retval = -EPERM;
- if (!check_same_owner(p)) {
- rcu_read_lock();
- if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
- rcu_read_unlock();
- goto out_free_new_mask;
- }
- rcu_read_unlock();
- }
-
- retval = security_task_setscheduler(p);
- if (retval)
- goto out_free_new_mask;
-
-
- cpuset_cpus_allowed(p, cpus_allowed);
- cpumask_and(new_mask, in_mask, cpus_allowed);
-
- /*
- * Since bandwidth control happens on root_domain basis,
- * if admission test is enabled, we only admit -deadline
- * tasks allowed to run on all the CPUs in the task's
- * root_domain.
- */
-#ifdef CONFIG_SMP
- if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
- rcu_read_lock();
- if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) {
- retval = -EBUSY;
- rcu_read_unlock();
- goto out_free_new_mask;
- }
- rcu_read_unlock();
- }
-#endif
-again:
- retval = set_cpus_allowed_ptr(p, new_mask);
-
- if (!retval) {
- cpuset_cpus_allowed(p, cpus_allowed);
- if (!cpumask_subset(new_mask, cpus_allowed)) {
- /*
- * We must have raced with a concurrent cpuset
- * update. Just reset the cpus_allowed to the
- * cpuset's cpus_allowed
- */
- cpumask_copy(new_mask, cpus_allowed);
- goto again;
- }
+ int mode = sched_dynamic_mode(str);
+ if (mode < 0) {
+ pr_warn("Dynamic Preempt: unsupported mode: %s\n", str);
+ return 0;
}
-out_free_new_mask:
- free_cpumask_var(new_mask);
-out_free_cpus_allowed:
- free_cpumask_var(cpus_allowed);
-out_put_task:
- put_task_struct(p);
- return retval;
-}
-
-static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
- struct cpumask *new_mask)
-{
- if (len < cpumask_size())
- cpumask_clear(new_mask);
- else if (len > cpumask_size())
- len = cpumask_size();
- return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
+ sched_dynamic_update(mode);
+ return 1;
}
+__setup("preempt=", setup_preempt_mode);
-/**
- * sys_sched_setaffinity - set the cpu affinity of a process
- * @pid: pid of the process
- * @len: length in bytes of the bitmask pointed to by user_mask_ptr
- * @user_mask_ptr: user-space pointer to the new cpu mask
- *
- * Return: 0 on success. An error code otherwise.
- */
-SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
- unsigned long __user *, user_mask_ptr)
+static void __init preempt_dynamic_init(void)
{
- cpumask_var_t new_mask;
- int retval;
-
- if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
- return -ENOMEM;
-
- retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
- if (retval == 0)
- retval = sched_setaffinity(pid, new_mask);
- free_cpumask_var(new_mask);
- return retval;
+ if (preempt_dynamic_mode == preempt_dynamic_undefined) {
+ if (IS_ENABLED(CONFIG_PREEMPT_NONE)) {
+ sched_dynamic_update(preempt_dynamic_none);
+ } else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) {
+ sched_dynamic_update(preempt_dynamic_voluntary);
+ } else if (IS_ENABLED(CONFIG_PREEMPT_LAZY)) {
+ sched_dynamic_update(preempt_dynamic_lazy);
+ } else {
+ /* Default static call setting, nothing to do */
+ WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT));
+ preempt_dynamic_mode = preempt_dynamic_full;
+ pr_info("Dynamic Preempt: full\n");
+ }
+ }
}
-long sched_getaffinity(pid_t pid, struct cpumask *mask)
-{
- struct task_struct *p;
- unsigned long flags;
- int retval;
+# define PREEMPT_MODEL_ACCESSOR(mode) \
+ bool preempt_model_##mode(void) \
+ { \
+ WARN_ON_ONCE(preempt_dynamic_mode == preempt_dynamic_undefined); \
+ return preempt_dynamic_mode == preempt_dynamic_##mode; \
+ } \
+ EXPORT_SYMBOL_GPL(preempt_model_##mode)
- rcu_read_lock();
+PREEMPT_MODEL_ACCESSOR(none);
+PREEMPT_MODEL_ACCESSOR(voluntary);
+PREEMPT_MODEL_ACCESSOR(full);
+PREEMPT_MODEL_ACCESSOR(lazy);
- retval = -ESRCH;
- p = find_process_by_pid(pid);
- if (!p)
- goto out_unlock;
+#else /* !CONFIG_PREEMPT_DYNAMIC: */
- retval = security_task_getscheduler(p);
- if (retval)
- goto out_unlock;
+#define preempt_dynamic_mode -1
- raw_spin_lock_irqsave(&p->pi_lock, flags);
- cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+static inline void preempt_dynamic_init(void) { }
-out_unlock:
- rcu_read_unlock();
+#endif /* CONFIG_PREEMPT_DYNAMIC */
- return retval;
-}
+const char *preempt_modes[] = {
+ "none", "voluntary", "full", "lazy", NULL,
+};
-/**
- * sys_sched_getaffinity - get the cpu affinity of a process
- * @pid: pid of the process
- * @len: length in bytes of the bitmask pointed to by user_mask_ptr
- * @user_mask_ptr: user-space pointer to hold the current cpu mask
- *
- * Return: 0 on success. An error code otherwise.
- */
-SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
- unsigned long __user *, user_mask_ptr)
+const char *preempt_model_str(void)
{
- int ret;
- cpumask_var_t mask;
-
- if ((len * BITS_PER_BYTE) < nr_cpu_ids)
- return -EINVAL;
- if (len & (sizeof(unsigned long)-1))
- return -EINVAL;
-
- if (!alloc_cpumask_var(&mask, GFP_KERNEL))
- return -ENOMEM;
+ bool brace = IS_ENABLED(CONFIG_PREEMPT_RT) &&
+ (IS_ENABLED(CONFIG_PREEMPT_DYNAMIC) ||
+ IS_ENABLED(CONFIG_PREEMPT_LAZY));
+ static char buf[128];
- ret = sched_getaffinity(pid, mask);
- if (ret == 0) {
- size_t retlen = min_t(size_t, len, cpumask_size());
-
- if (copy_to_user(user_mask_ptr, mask, retlen))
- ret = -EFAULT;
- else
- ret = retlen;
- }
- free_cpumask_var(mask);
-
- return ret;
-}
-
-/**
- * sys_sched_yield - yield the current processor to other threads.
- *
- * This function yields the current CPU to other tasks. If there are no
- * other threads running on this CPU then this function will return.
- *
- * Return: 0.
- */
-SYSCALL_DEFINE0(sched_yield)
-{
- struct rq *rq = this_rq_lock();
+ if (IS_ENABLED(CONFIG_PREEMPT_BUILD)) {
+ struct seq_buf s;
- schedstat_inc(rq, yld_count);
- current->sched_class->yield_task(rq);
+ seq_buf_init(&s, buf, sizeof(buf));
+ seq_buf_puts(&s, "PREEMPT");
- /*
- * Since we are going to call schedule() anyway, there's
- * no need to preempt or enable interrupts:
- */
- __release(rq->lock);
- spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
- do_raw_spin_unlock(&rq->lock);
- sched_preempt_enable_no_resched();
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ seq_buf_printf(&s, "%sRT%s",
+ brace ? "_{" : "_",
+ brace ? "," : "");
- schedule();
+ if (IS_ENABLED(CONFIG_PREEMPT_DYNAMIC)) {
+ seq_buf_printf(&s, "(%s)%s",
+ preempt_dynamic_mode >= 0 ?
+ preempt_modes[preempt_dynamic_mode] : "undef",
+ brace ? "}" : "");
+ return seq_buf_str(&s);
+ }
- return 0;
-}
+ if (IS_ENABLED(CONFIG_PREEMPT_LAZY)) {
+ seq_buf_printf(&s, "LAZY%s",
+ brace ? "}" : "");
+ return seq_buf_str(&s);
+ }
-int __sched _cond_resched(void)
-{
- if (should_resched()) {
- preempt_schedule_common();
- return 1;
+ return seq_buf_str(&s);
}
- return 0;
-}
-EXPORT_SYMBOL(_cond_resched);
-
-/*
- * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
- * call schedule, and on return reacquire the lock.
- *
- * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
- * operations here to prevent schedule() from being called twice (once via
- * spin_unlock(), once by hand).
- */
-int __cond_resched_lock(spinlock_t *lock)
-{
- int resched = should_resched();
- int ret = 0;
- lockdep_assert_held(lock);
+ if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY_BUILD))
+ return "VOLUNTARY";
- if (spin_needbreak(lock) || resched) {
- spin_unlock(lock);
- if (resched)
- preempt_schedule_common();
- else
- cpu_relax();
- ret = 1;
- spin_lock(lock);
- }
- return ret;
+ return "NONE";
}
-EXPORT_SYMBOL(__cond_resched_lock);
-int __sched __cond_resched_softirq(void)
+int io_schedule_prepare(void)
{
- BUG_ON(!in_softirq());
-
- if (should_resched()) {
- local_bh_enable();
- preempt_schedule_common();
- local_bh_disable();
- return 1;
- }
- return 0;
-}
-EXPORT_SYMBOL(__cond_resched_softirq);
+ int old_iowait = current->in_iowait;
-/**
- * yield - yield the current processor to other threads.
- *
- * Do not ever use this function, there's a 99% chance you're doing it wrong.
- *
- * The scheduler is at all times free to pick the calling task as the most
- * eligible task to run, if removing the yield() call from your code breaks
- * it, its already broken.
- *
- * Typical broken usage is:
- *
- * while (!event)
- * yield();
- *
- * where one assumes that yield() will let 'the other' process run that will
- * make event true. If the current task is a SCHED_FIFO task that will never
- * happen. Never use yield() as a progress guarantee!!
- *
- * If you want to use yield() to wait for something, use wait_event().
- * If you want to use yield() to be 'nice' for others, use cond_resched().
- * If you still want to use yield(), do not!
- */
-void __sched yield(void)
-{
- set_current_state(TASK_RUNNING);
- sys_sched_yield();
+ current->in_iowait = 1;
+ blk_flush_plug(current->plug, true);
+ return old_iowait;
}
-EXPORT_SYMBOL(yield);
-/**
- * yield_to - yield the current processor to another thread in
- * your thread group, or accelerate that thread toward the
- * processor it's on.
- * @p: target task
- * @preempt: whether task preemption is allowed or not
- *
- * It's the caller's job to ensure that the target task struct
- * can't go away on us before we can do any checks.
- *
- * Return:
- * true (>0) if we indeed boosted the target task.
- * false (0) if we failed to boost the target.
- * -ESRCH if there's no task to yield to.
- */
-int __sched yield_to(struct task_struct *p, bool preempt)
+void io_schedule_finish(int token)
{
- struct task_struct *curr = current;
- struct rq *rq, *p_rq;
- unsigned long flags;
- int yielded = 0;
-
- local_irq_save(flags);
- rq = this_rq();
-
-again:
- p_rq = task_rq(p);
- /*
- * If we're the only runnable task on the rq and target rq also
- * has only one task, there's absolutely no point in yielding.
- */
- if (rq->nr_running == 1 && p_rq->nr_running == 1) {
- yielded = -ESRCH;
- goto out_irq;
- }
-
- double_rq_lock(rq, p_rq);
- if (task_rq(p) != p_rq) {
- double_rq_unlock(rq, p_rq);
- goto again;
- }
-
- if (!curr->sched_class->yield_to_task)
- goto out_unlock;
-
- if (curr->sched_class != p->sched_class)
- goto out_unlock;
-
- if (task_running(p_rq, p) || p->state)
- goto out_unlock;
-
- yielded = curr->sched_class->yield_to_task(rq, p, preempt);
- if (yielded) {
- schedstat_inc(rq, yld_count);
- /*
- * Make p's CPU reschedule; pick_next_entity takes care of
- * fairness.
- */
- if (preempt && rq != p_rq)
- resched_curr(p_rq);
- }
-
-out_unlock:
- double_rq_unlock(rq, p_rq);
-out_irq:
- local_irq_restore(flags);
-
- if (yielded > 0)
- schedule();
-
- return yielded;
+ current->in_iowait = token;
}
-EXPORT_SYMBOL_GPL(yield_to);
/*
* This task is about to go to sleep on IO. Increment rq->nr_iowait so
@@ -4384,194 +7770,103 @@ EXPORT_SYMBOL_GPL(yield_to);
*/
long __sched io_schedule_timeout(long timeout)
{
- int old_iowait = current->in_iowait;
- struct rq *rq;
+ int token;
long ret;
- current->in_iowait = 1;
- blk_schedule_flush_plug(current);
-
- delayacct_blkio_start();
- rq = raw_rq();
- atomic_inc(&rq->nr_iowait);
+ token = io_schedule_prepare();
ret = schedule_timeout(timeout);
- current->in_iowait = old_iowait;
- atomic_dec(&rq->nr_iowait);
- delayacct_blkio_end();
+ io_schedule_finish(token);
return ret;
}
EXPORT_SYMBOL(io_schedule_timeout);
-/**
- * sys_sched_get_priority_max - return maximum RT priority.
- * @policy: scheduling class.
- *
- * Return: On success, this syscall returns the maximum
- * rt_priority that can be used by a given scheduling class.
- * On failure, a negative error code is returned.
- */
-SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
-{
- int ret = -EINVAL;
-
- switch (policy) {
- case SCHED_FIFO:
- case SCHED_RR:
- ret = MAX_USER_RT_PRIO-1;
- break;
- case SCHED_DEADLINE:
- case SCHED_NORMAL:
- case SCHED_BATCH:
- case SCHED_IDLE:
- ret = 0;
- break;
- }
- return ret;
-}
-
-/**
- * sys_sched_get_priority_min - return minimum RT priority.
- * @policy: scheduling class.
- *
- * Return: On success, this syscall returns the minimum
- * rt_priority that can be used by a given scheduling class.
- * On failure, a negative error code is returned.
- */
-SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
+void __sched io_schedule(void)
{
- int ret = -EINVAL;
+ int token;
- switch (policy) {
- case SCHED_FIFO:
- case SCHED_RR:
- ret = 1;
- break;
- case SCHED_DEADLINE:
- case SCHED_NORMAL:
- case SCHED_BATCH:
- case SCHED_IDLE:
- ret = 0;
- }
- return ret;
-}
-
-/**
- * sys_sched_rr_get_interval - return the default timeslice of a process.
- * @pid: pid of the process.
- * @interval: userspace pointer to the timeslice value.
- *
- * this syscall writes the default timeslice value of a given process
- * into the user-space timespec buffer. A value of '0' means infinity.
- *
- * Return: On success, 0 and the timeslice is in @interval. Otherwise,
- * an error code.
- */
-SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
- struct timespec __user *, interval)
-{
- struct task_struct *p;
- unsigned int time_slice;
- unsigned long flags;
- struct rq *rq;
- int retval;
- struct timespec t;
-
- if (pid < 0)
- return -EINVAL;
-
- retval = -ESRCH;
- rcu_read_lock();
- p = find_process_by_pid(pid);
- if (!p)
- goto out_unlock;
-
- retval = security_task_getscheduler(p);
- if (retval)
- goto out_unlock;
-
- rq = task_rq_lock(p, &flags);
- time_slice = 0;
- if (p->sched_class->get_rr_interval)
- time_slice = p->sched_class->get_rr_interval(rq, p);
- task_rq_unlock(rq, p, &flags);
-
- rcu_read_unlock();
- jiffies_to_timespec(time_slice, &t);
- retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
- return retval;
-
-out_unlock:
- rcu_read_unlock();
- return retval;
+ token = io_schedule_prepare();
+ schedule();
+ io_schedule_finish(token);
}
-
-static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
+EXPORT_SYMBOL(io_schedule);
void sched_show_task(struct task_struct *p)
{
- unsigned long free = 0;
+ unsigned long free;
int ppid;
- unsigned long state = p->state;
- if (state)
- state = __ffs(state) + 1;
- printk(KERN_INFO "%-15.15s %c", p->comm,
- state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
-#if BITS_PER_LONG == 32
- if (state == TASK_RUNNING)
- printk(KERN_CONT " running ");
- else
- printk(KERN_CONT " %08lx ", thread_saved_pc(p));
-#else
- if (state == TASK_RUNNING)
- printk(KERN_CONT " running task ");
- else
- printk(KERN_CONT " %016lx ", thread_saved_pc(p));
-#endif
-#ifdef CONFIG_DEBUG_STACK_USAGE
+ if (!try_get_task_stack(p))
+ return;
+
+ pr_info("task:%-15.15s state:%c", p->comm, task_state_to_char(p));
+
+ if (task_is_running(p))
+ pr_cont(" running task ");
free = stack_not_used(p);
-#endif
ppid = 0;
rcu_read_lock();
if (pid_alive(p))
ppid = task_pid_nr(rcu_dereference(p->real_parent));
rcu_read_unlock();
- printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
- task_pid_nr(p), ppid,
- (unsigned long)task_thread_info(p)->flags);
+ pr_cont(" stack:%-5lu pid:%-5d tgid:%-5d ppid:%-6d task_flags:0x%04x flags:0x%08lx\n",
+ free, task_pid_nr(p), task_tgid_nr(p),
+ ppid, p->flags, read_task_thread_flags(p));
print_worker_info(KERN_INFO, p);
- show_stack(p, NULL);
+ print_stop_info(KERN_INFO, p);
+ print_scx_info(KERN_INFO, p);
+ show_stack(p, NULL, KERN_INFO);
+ put_task_stack(p);
}
+EXPORT_SYMBOL_GPL(sched_show_task);
-void show_state_filter(unsigned long state_filter)
+static inline bool
+state_filter_match(unsigned long state_filter, struct task_struct *p)
+{
+ unsigned int state = READ_ONCE(p->__state);
+
+ /* no filter, everything matches */
+ if (!state_filter)
+ return true;
+
+ /* filter, but doesn't match */
+ if (!(state & state_filter))
+ return false;
+
+ /*
+ * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows
+ * TASK_KILLABLE).
+ */
+ if (state_filter == TASK_UNINTERRUPTIBLE && (state & TASK_NOLOAD))
+ return false;
+
+ return true;
+}
+
+
+void show_state_filter(unsigned int state_filter)
{
struct task_struct *g, *p;
-#if BITS_PER_LONG == 32
- printk(KERN_INFO
- " task PC stack pid father\n");
-#else
- printk(KERN_INFO
- " task PC stack pid father\n");
-#endif
rcu_read_lock();
for_each_process_thread(g, p) {
/*
* reset the NMI-timeout, listing all files on a slow
* console might take a lot of time:
+ * Also, reset softlockup watchdogs on all CPUs, because
+ * another CPU might be blocked waiting for us to process
+ * an IPI.
*/
touch_nmi_watchdog();
- if (!state_filter || (p->state & state_filter))
+ touch_all_softlockup_watchdogs();
+ if (state_filter_match(state_filter, p))
sched_show_task(p);
}
- touch_all_softlockup_watchdogs();
+ if (!state_filter)
+ sysrq_sched_debug_show();
-#ifdef CONFIG_SCHED_DEBUG
- sysrq_sched_debug_show();
-#endif
rcu_read_unlock();
/*
* Only show locks if all tasks are dumped:
@@ -4580,34 +7875,43 @@ void show_state_filter(unsigned long state_filter)
debug_show_all_locks();
}
-void init_idle_bootup_task(struct task_struct *idle)
-{
- idle->sched_class = &idle_sched_class;
-}
-
/**
* init_idle - set up an idle thread for a given CPU
* @idle: task in question
- * @cpu: cpu the idle task belongs to
+ * @cpu: CPU the idle task belongs to
*
* NOTE: this function does not set the idle thread's NEED_RESCHED
* flag, to make booting more robust.
*/
-void init_idle(struct task_struct *idle, int cpu)
+void __init init_idle(struct task_struct *idle, int cpu)
{
+ struct affinity_context ac = (struct affinity_context) {
+ .new_mask = cpumask_of(cpu),
+ .flags = 0,
+ };
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
- raw_spin_lock_irqsave(&rq->lock, flags);
+ raw_spin_lock_irqsave(&idle->pi_lock, flags);
+ raw_spin_rq_lock(rq);
- __sched_fork(0, idle);
- idle->state = TASK_RUNNING;
+ idle->__state = TASK_RUNNING;
idle->se.exec_start = sched_clock();
+ /*
+ * PF_KTHREAD should already be set at this point; regardless, make it
+ * look like a proper per-CPU kthread.
+ */
+ idle->flags |= PF_KTHREAD | PF_NO_SETAFFINITY;
+ kthread_set_per_cpu(idle, cpu);
- do_set_cpus_allowed(idle, cpumask_of(cpu));
+ /*
+ * No validation and serialization required at boot time and for
+ * setting up the idle tasks of not yet online CPUs.
+ */
+ set_cpus_allowed_common(idle, &ac);
/*
* We're having a chicken and egg problem, even though we are
- * holding rq->lock, the cpu isn't yet set to this cpu so the
+ * holding rq->lock, the CPU isn't yet set to this CPU so the
* lockdep check in task_group() will fail.
*
* Similar case to sched_fork(). / Alternatively we could
@@ -4619,12 +7923,13 @@ void init_idle(struct task_struct *idle, int cpu)
__set_task_cpu(idle, cpu);
rcu_read_unlock();
- rq->curr = rq->idle = idle;
+ rq->idle = idle;
+ rq_set_donor(rq, idle);
+ rcu_assign_pointer(rq->curr, idle);
idle->on_rq = TASK_ON_RQ_QUEUED;
-#if defined(CONFIG_SMP)
idle->on_cpu = 1;
-#endif
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ raw_spin_rq_unlock(rq);
+ raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
/* Set the preempt count _outside_ the spinlocks! */
init_idle_preempt_count(idle, cpu);
@@ -4635,233 +7940,42 @@ void init_idle(struct task_struct *idle, int cpu)
idle->sched_class = &idle_sched_class;
ftrace_graph_init_idle_task(idle, cpu);
vtime_init_idle(idle, cpu);
-#if defined(CONFIG_SMP)
sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
-#endif
}
int cpuset_cpumask_can_shrink(const struct cpumask *cur,
const struct cpumask *trial)
{
- int ret = 1, trial_cpus;
- struct dl_bw *cur_dl_b;
- unsigned long flags;
+ int ret = 1;
- if (!cpumask_weight(cur))
+ if (cpumask_empty(cur))
return ret;
- rcu_read_lock_sched();
- cur_dl_b = dl_bw_of(cpumask_any(cur));
- trial_cpus = cpumask_weight(trial);
-
- raw_spin_lock_irqsave(&cur_dl_b->lock, flags);
- if (cur_dl_b->bw != -1 &&
- cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw)
- ret = 0;
- raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags);
- rcu_read_unlock_sched();
+ ret = dl_cpuset_cpumask_can_shrink(cur, trial);
return ret;
}
-int task_can_attach(struct task_struct *p,
- const struct cpumask *cs_cpus_allowed)
+int task_can_attach(struct task_struct *p)
{
int ret = 0;
/*
* Kthreads which disallow setaffinity shouldn't be moved
- * to a new cpuset; we don't want to change their cpu
+ * to a new cpuset; we don't want to change their CPU
* affinity and isolating such threads by their set of
* allowed nodes is unnecessary. Thus, cpusets are not
* applicable for such threads. This prevents checking for
* success of set_cpus_allowed_ptr() on all attached tasks
- * before cpus_allowed may be changed.
+ * before cpus_mask may be changed.
*/
- if (p->flags & PF_NO_SETAFFINITY) {
- ret = -EINVAL;
- goto out;
- }
-
-#ifdef CONFIG_SMP
- if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
- cs_cpus_allowed)) {
- unsigned int dest_cpu = cpumask_any_and(cpu_active_mask,
- cs_cpus_allowed);
- struct dl_bw *dl_b;
- bool overflow;
- int cpus;
- unsigned long flags;
-
- rcu_read_lock_sched();
- dl_b = dl_bw_of(dest_cpu);
- raw_spin_lock_irqsave(&dl_b->lock, flags);
- cpus = dl_bw_cpus(dest_cpu);
- overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw);
- if (overflow)
- ret = -EBUSY;
- else {
- /*
- * We reserve space for this task in the destination
- * root_domain, as we can't fail after this point.
- * We will free resources in the source root_domain
- * later on (see set_cpus_allowed_dl()).
- */
- __dl_add(dl_b, p->dl.dl_bw);
- }
- raw_spin_unlock_irqrestore(&dl_b->lock, flags);
- rcu_read_unlock_sched();
-
- }
-#endif
-out:
- return ret;
-}
-
-#ifdef CONFIG_SMP
-/*
- * move_queued_task - move a queued task to new rq.
- *
- * Returns (locked) new rq. Old rq's lock is released.
- */
-static struct rq *move_queued_task(struct task_struct *p, int new_cpu)
-{
- struct rq *rq = task_rq(p);
-
- lockdep_assert_held(&rq->lock);
-
- dequeue_task(rq, p, 0);
- p->on_rq = TASK_ON_RQ_MIGRATING;
- set_task_cpu(p, new_cpu);
- raw_spin_unlock(&rq->lock);
-
- rq = cpu_rq(new_cpu);
-
- raw_spin_lock(&rq->lock);
- BUG_ON(task_cpu(p) != new_cpu);
- p->on_rq = TASK_ON_RQ_QUEUED;
- enqueue_task(rq, p, 0);
- check_preempt_curr(rq, p, 0);
-
- return rq;
-}
-
-void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
-{
- if (p->sched_class->set_cpus_allowed)
- p->sched_class->set_cpus_allowed(p, new_mask);
-
- cpumask_copy(&p->cpus_allowed, new_mask);
- p->nr_cpus_allowed = cpumask_weight(new_mask);
-}
-
-/*
- * This is how migration works:
- *
- * 1) we invoke migration_cpu_stop() on the target CPU using
- * stop_one_cpu().
- * 2) stopper starts to run (implicitly forcing the migrated thread
- * off the CPU)
- * 3) it checks whether the migrated task is still in the wrong runqueue.
- * 4) if it's in the wrong runqueue then the migration thread removes
- * it and puts it into the right queue.
- * 5) stopper completes and stop_one_cpu() returns and the migration
- * is done.
- */
-
-/*
- * Change a given task's CPU affinity. Migrate the thread to a
- * proper CPU and schedule it away if the CPU it's executing on
- * is removed from the allowed bitmask.
- *
- * NOTE: the caller must have a valid reference to the task, the
- * task must not exit() & deallocate itself prematurely. The
- * call is not atomic; no spinlocks may be held.
- */
-int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
-{
- unsigned long flags;
- struct rq *rq;
- unsigned int dest_cpu;
- int ret = 0;
-
- rq = task_rq_lock(p, &flags);
-
- if (cpumask_equal(&p->cpus_allowed, new_mask))
- goto out;
-
- if (!cpumask_intersects(new_mask, cpu_active_mask)) {
+ if (p->flags & PF_NO_SETAFFINITY)
ret = -EINVAL;
- goto out;
- }
-
- do_set_cpus_allowed(p, new_mask);
-
- /* Can the task run on the task's current CPU? If so, we're done */
- if (cpumask_test_cpu(task_cpu(p), new_mask))
- goto out;
-
- dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
- if (task_running(rq, p) || p->state == TASK_WAKING) {
- struct migration_arg arg = { p, dest_cpu };
- /* Need help from migration thread: drop lock and wait. */
- task_rq_unlock(rq, p, &flags);
- stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
- tlb_migrate_finish(p->mm);
- return 0;
- } else if (task_on_rq_queued(p))
- rq = move_queued_task(p, dest_cpu);
-out:
- task_rq_unlock(rq, p, &flags);
return ret;
}
-EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
-
-/*
- * Move (not current) task off this cpu, onto dest cpu. We're doing
- * this because either it can't run here any more (set_cpus_allowed()
- * away from this CPU, or CPU going down), or because we're
- * attempting to rebalance this task on exec (sched_exec).
- *
- * So we race with normal scheduler movements, but that's OK, as long
- * as the task is no longer on this CPU.
- *
- * Returns non-zero if task was successfully migrated.
- */
-static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
-{
- struct rq *rq;
- int ret = 0;
-
- if (unlikely(!cpu_active(dest_cpu)))
- return ret;
- rq = cpu_rq(src_cpu);
-
- raw_spin_lock(&p->pi_lock);
- raw_spin_lock(&rq->lock);
- /* Already moved. */
- if (task_cpu(p) != src_cpu)
- goto done;
-
- /* Affinity changed (again). */
- if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
- goto fail;
-
- /*
- * If we're not on a rq, the next wake-up will ensure we're
- * placed properly.
- */
- if (task_on_rq_queued(p))
- rq = move_queued_task(p, dest_cpu);
-done:
- ret = 1;
-fail:
- raw_spin_unlock(&rq->lock);
- raw_spin_unlock(&p->pi_lock);
- return ret;
-}
+bool sched_smp_initialized __read_mostly;
#ifdef CONFIG_NUMA_BALANCING
/* Migrate current task p to target_cpu */
@@ -4873,7 +7987,7 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
if (curr_cpu == target_cpu)
return 0;
- if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p)))
+ if (!cpumask_test_cpu(target_cpu, p->cpus_ptr))
return -EINVAL;
/* TODO: This is not properly updating schedstats */
@@ -4888,340 +8002,179 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
*/
void sched_setnuma(struct task_struct *p, int nid)
{
- struct rq *rq;
- unsigned long flags;
- bool queued, running;
-
- rq = task_rq_lock(p, &flags);
- queued = task_on_rq_queued(p);
- running = task_current(rq, p);
-
- if (queued)
- dequeue_task(rq, p, 0);
- if (running)
- put_prev_task(rq, p);
-
- p->numa_preferred_nid = nid;
-
- if (running)
- p->sched_class->set_curr_task(rq);
- if (queued)
- enqueue_task(rq, p, 0);
- task_rq_unlock(rq, p, &flags);
-}
-#endif
-
-/*
- * migration_cpu_stop - this will be executed by a highprio stopper thread
- * and performs thread migration by bumping thread off CPU then
- * 'pushing' onto another runqueue.
- */
-static int migration_cpu_stop(void *data)
-{
- struct migration_arg *arg = data;
-
- /*
- * The original target cpu might have gone down and we might
- * be on another cpu but it doesn't matter.
- */
- local_irq_disable();
- /*
- * We need to explicitly wake pending tasks before running
- * __migrate_task() such that we will not miss enforcing cpus_allowed
- * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
- */
- sched_ttwu_pending();
- __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
- local_irq_enable();
- return 0;
+ guard(task_rq_lock)(p);
+ scoped_guard (sched_change, p, DEQUEUE_SAVE)
+ p->numa_preferred_nid = nid;
}
+#endif /* CONFIG_NUMA_BALANCING */
#ifdef CONFIG_HOTPLUG_CPU
-
/*
- * Ensures that the idle task is using init_mm right before its cpu goes
- * offline.
+ * Invoked on the outgoing CPU in context of the CPU hotplug thread
+ * after ensuring that there are no user space tasks left on the CPU.
+ *
+ * If there is a lazy mm in use on the hotplug thread, drop it and
+ * switch to init_mm.
+ *
+ * The reference count on init_mm is dropped in finish_cpu().
*/
-void idle_task_exit(void)
+static void sched_force_init_mm(void)
{
struct mm_struct *mm = current->active_mm;
- BUG_ON(cpu_online(smp_processor_id()));
-
if (mm != &init_mm) {
- switch_mm(mm, &init_mm, current);
+ mmgrab_lazy_tlb(&init_mm);
+ local_irq_disable();
+ current->active_mm = &init_mm;
+ switch_mm_irqs_off(mm, &init_mm, current);
+ local_irq_enable();
finish_arch_post_lock_switch();
+ mmdrop_lazy_tlb(mm);
}
- mmdrop(mm);
-}
-/*
- * Since this CPU is going 'away' for a while, fold any nr_active delta
- * we might have. Assumes we're called after migrate_tasks() so that the
- * nr_active count is stable.
- *
- * Also see the comment "Global load-average calculations".
- */
-static void calc_load_migrate(struct rq *rq)
-{
- long delta = calc_load_fold_active(rq);
- if (delta)
- atomic_long_add(delta, &calc_load_tasks);
+ /* finish_cpu(), as ran on the BP, will clean up the active_mm state */
}
-static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
+static int __balance_push_cpu_stop(void *arg)
{
-}
+ struct task_struct *p = arg;
+ struct rq *rq = this_rq();
+ struct rq_flags rf;
+ int cpu;
-static const struct sched_class fake_sched_class = {
- .put_prev_task = put_prev_task_fake,
-};
+ scoped_guard (raw_spinlock_irq, &p->pi_lock) {
+ cpu = select_fallback_rq(rq->cpu, p);
-static struct task_struct fake_task = {
- /*
- * Avoid pull_{rt,dl}_task()
- */
- .prio = MAX_PRIO + 1,
- .sched_class = &fake_sched_class,
-};
+ rq_lock(rq, &rf);
+ update_rq_clock(rq);
+ if (task_rq(p) == rq && task_on_rq_queued(p))
+ rq = __migrate_task(rq, &rf, p, cpu);
+ rq_unlock(rq, &rf);
+ }
+
+ put_task_struct(p);
+
+ return 0;
+}
+
+static DEFINE_PER_CPU(struct cpu_stop_work, push_work);
/*
- * Migrate all tasks from the rq, sleeping tasks will be migrated by
- * try_to_wake_up()->select_task_rq().
+ * Ensure we only run per-cpu kthreads once the CPU goes !active.
*
- * Called with rq->lock held even though we'er in stop_machine() and
- * there's no concurrency possible, we hold the required locks anyway
- * because of lock validation efforts.
+ * This is enabled below SCHED_AP_ACTIVE; when !cpu_active(), but only
+ * effective when the hotplug motion is down.
*/
-static void migrate_tasks(unsigned int dead_cpu)
+static void balance_push(struct rq *rq)
{
- struct rq *rq = cpu_rq(dead_cpu);
- struct task_struct *next, *stop = rq->stop;
- int dest_cpu;
+ struct task_struct *push_task = rq->curr;
+
+ lockdep_assert_rq_held(rq);
/*
- * Fudge the rq selection such that the below task selection loop
- * doesn't get stuck on the currently eligible stop task.
- *
- * We're currently inside stop_machine() and the rq is either stuck
- * in the stop_machine_cpu_stop() loop, or we're executing this code,
- * either way we should never end up calling schedule() until we're
- * done here.
+ * Ensure the thing is persistent until balance_push_set(.on = false);
*/
- rq->stop = NULL;
+ rq->balance_callback = &balance_push_callback;
/*
- * put_prev_task() and pick_next_task() sched
- * class method both need to have an up-to-date
- * value of rq->clock[_task]
+ * Only active while going offline and when invoked on the outgoing
+ * CPU.
*/
- update_rq_clock(rq);
+ if (!cpu_dying(rq->cpu) || rq != this_rq())
+ return;
+
+ /*
+ * Both the cpu-hotplug and stop task are in this case and are
+ * required to complete the hotplug process.
+ */
+ if (kthread_is_per_cpu(push_task) ||
+ is_migration_disabled(push_task)) {
- for ( ; ; ) {
/*
- * There's this thread running, bail when that's the only
- * remaining thread.
+ * If this is the idle task on the outgoing CPU try to wake
+ * up the hotplug control thread which might wait for the
+ * last task to vanish. The rcuwait_active() check is
+ * accurate here because the waiter is pinned on this CPU
+ * and can't obviously be running in parallel.
+ *
+ * On RT kernels this also has to check whether there are
+ * pinned and scheduled out tasks on the runqueue. They
+ * need to leave the migrate disabled section first.
*/
- if (rq->nr_running == 1)
- break;
-
- next = pick_next_task(rq, &fake_task);
- BUG_ON(!next);
- next->sched_class->put_prev_task(rq, next);
-
- /* Find suitable destination for @next, with force if needed. */
- dest_cpu = select_fallback_rq(dead_cpu, next);
- raw_spin_unlock(&rq->lock);
-
- __migrate_task(next, dead_cpu, dest_cpu);
-
- raw_spin_lock(&rq->lock);
+ if (!rq->nr_running && !rq_has_pinned_tasks(rq) &&
+ rcuwait_active(&rq->hotplug_wait)) {
+ raw_spin_rq_unlock(rq);
+ rcuwait_wake_up(&rq->hotplug_wait);
+ raw_spin_rq_lock(rq);
+ }
+ return;
}
- rq->stop = stop;
-}
-
-#endif /* CONFIG_HOTPLUG_CPU */
-
-#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
-
-static struct ctl_table sd_ctl_dir[] = {
- {
- .procname = "sched_domain",
- .mode = 0555,
- },
- {}
-};
-
-static struct ctl_table sd_ctl_root[] = {
- {
- .procname = "kernel",
- .mode = 0555,
- .child = sd_ctl_dir,
- },
- {}
-};
-
-static struct ctl_table *sd_alloc_ctl_entry(int n)
-{
- struct ctl_table *entry =
- kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
-
- return entry;
-}
-
-static void sd_free_ctl_entry(struct ctl_table **tablep)
-{
- struct ctl_table *entry;
-
+ get_task_struct(push_task);
/*
- * In the intermediate directories, both the child directory and
- * procname are dynamically allocated and could fail but the mode
- * will always be set. In the lowest directory the names are
- * static strings and all have proc handlers.
+ * Temporarily drop rq->lock such that we can wake-up the stop task.
+ * Both preemption and IRQs are still disabled.
*/
- for (entry = *tablep; entry->mode; entry++) {
- if (entry->child)
- sd_free_ctl_entry(&entry->child);
- if (entry->proc_handler == NULL)
- kfree(entry->procname);
- }
-
- kfree(*tablep);
- *tablep = NULL;
+ preempt_disable();
+ raw_spin_rq_unlock(rq);
+ stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task,
+ this_cpu_ptr(&push_work));
+ preempt_enable();
+ /*
+ * At this point need_resched() is true and we'll take the loop in
+ * schedule(). The next pick is obviously going to be the stop task
+ * which kthread_is_per_cpu() and will push this task away.
+ */
+ raw_spin_rq_lock(rq);
}
-static int min_load_idx = 0;
-static int max_load_idx = CPU_LOAD_IDX_MAX-1;
-
-static void
-set_table_entry(struct ctl_table *entry,
- const char *procname, void *data, int maxlen,
- umode_t mode, proc_handler *proc_handler,
- bool load_idx)
+static void balance_push_set(int cpu, bool on)
{
- entry->procname = procname;
- entry->data = data;
- entry->maxlen = maxlen;
- entry->mode = mode;
- entry->proc_handler = proc_handler;
+ struct rq *rq = cpu_rq(cpu);
+ struct rq_flags rf;
- if (load_idx) {
- entry->extra1 = &min_load_idx;
- entry->extra2 = &max_load_idx;
+ rq_lock_irqsave(rq, &rf);
+ if (on) {
+ WARN_ON_ONCE(rq->balance_callback);
+ rq->balance_callback = &balance_push_callback;
+ } else if (rq->balance_callback == &balance_push_callback) {
+ rq->balance_callback = NULL;
}
+ rq_unlock_irqrestore(rq, &rf);
}
-static struct ctl_table *
-sd_alloc_ctl_domain_table(struct sched_domain *sd)
+/*
+ * Invoked from a CPUs hotplug control thread after the CPU has been marked
+ * inactive. All tasks which are not per CPU kernel threads are either
+ * pushed off this CPU now via balance_push() or placed on a different CPU
+ * during wakeup. Wait until the CPU is quiescent.
+ */
+static void balance_hotplug_wait(void)
{
- struct ctl_table *table = sd_alloc_ctl_entry(14);
-
- if (table == NULL)
- return NULL;
-
- set_table_entry(&table[0], "min_interval", &sd->min_interval,
- sizeof(long), 0644, proc_doulongvec_minmax, false);
- set_table_entry(&table[1], "max_interval", &sd->max_interval,
- sizeof(long), 0644, proc_doulongvec_minmax, false);
- set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
- sizeof(int), 0644, proc_dointvec_minmax, true);
- set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
- sizeof(int), 0644, proc_dointvec_minmax, true);
- set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
- sizeof(int), 0644, proc_dointvec_minmax, true);
- set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
- sizeof(int), 0644, proc_dointvec_minmax, true);
- set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
- sizeof(int), 0644, proc_dointvec_minmax, true);
- set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
- sizeof(int), 0644, proc_dointvec_minmax, false);
- set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
- sizeof(int), 0644, proc_dointvec_minmax, false);
- set_table_entry(&table[9], "cache_nice_tries",
- &sd->cache_nice_tries,
- sizeof(int), 0644, proc_dointvec_minmax, false);
- set_table_entry(&table[10], "flags", &sd->flags,
- sizeof(int), 0644, proc_dointvec_minmax, false);
- set_table_entry(&table[11], "max_newidle_lb_cost",
- &sd->max_newidle_lb_cost,
- sizeof(long), 0644, proc_doulongvec_minmax, false);
- set_table_entry(&table[12], "name", sd->name,
- CORENAME_MAX_SIZE, 0444, proc_dostring, false);
- /* &table[13] is terminator */
-
- return table;
-}
-
-static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
-{
- struct ctl_table *entry, *table;
- struct sched_domain *sd;
- int domain_num = 0, i;
- char buf[32];
-
- for_each_domain(cpu, sd)
- domain_num++;
- entry = table = sd_alloc_ctl_entry(domain_num + 1);
- if (table == NULL)
- return NULL;
+ struct rq *rq = this_rq();
- i = 0;
- for_each_domain(cpu, sd) {
- snprintf(buf, 32, "domain%d", i);
- entry->procname = kstrdup(buf, GFP_KERNEL);
- entry->mode = 0555;
- entry->child = sd_alloc_ctl_domain_table(sd);
- entry++;
- i++;
- }
- return table;
+ rcuwait_wait_event(&rq->hotplug_wait,
+ rq->nr_running == 1 && !rq_has_pinned_tasks(rq),
+ TASK_UNINTERRUPTIBLE);
}
-static struct ctl_table_header *sd_sysctl_header;
-static void register_sched_domain_sysctl(void)
-{
- int i, cpu_num = num_possible_cpus();
- struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
- char buf[32];
-
- WARN_ON(sd_ctl_dir[0].child);
- sd_ctl_dir[0].child = entry;
-
- if (entry == NULL)
- return;
-
- for_each_possible_cpu(i) {
- snprintf(buf, 32, "cpu%d", i);
- entry->procname = kstrdup(buf, GFP_KERNEL);
- entry->mode = 0555;
- entry->child = sd_alloc_ctl_cpu_table(i);
- entry++;
- }
+#else /* !CONFIG_HOTPLUG_CPU: */
- WARN_ON(sd_sysctl_header);
- sd_sysctl_header = register_sysctl_table(sd_ctl_root);
-}
-
-/* may be called multiple times per register */
-static void unregister_sched_domain_sysctl(void)
+static inline void balance_push(struct rq *rq)
{
- if (sd_sysctl_header)
- unregister_sysctl_table(sd_sysctl_header);
- sd_sysctl_header = NULL;
- if (sd_ctl_dir[0].child)
- sd_free_ctl_entry(&sd_ctl_dir[0].child);
}
-#else
-static void register_sched_domain_sysctl(void)
+
+static inline void balance_push_set(int cpu, bool on)
{
}
-static void unregister_sched_domain_sysctl(void)
+
+static inline void balance_hotplug_wait(void)
{
}
-#endif
-static void set_rq_online(struct rq *rq)
+#endif /* !CONFIG_HOTPLUG_CPU */
+
+void set_rq_online(struct rq *rq)
{
if (!rq->online) {
const struct sched_class *class;
@@ -5236,11 +8189,12 @@ static void set_rq_online(struct rq *rq)
}
}
-static void set_rq_offline(struct rq *rq)
+void set_rq_offline(struct rq *rq)
{
if (rq->online) {
const struct sched_class *class;
+ update_rq_clock(rq);
for_each_class(class) {
if (class->rq_offline)
class->rq_offline(rq);
@@ -5251,1824 +8205,321 @@ static void set_rq_offline(struct rq *rq)
}
}
-/*
- * migration_call - callback that gets triggered when a CPU is added.
- * Here we can start up the necessary migration thread for the new CPU.
- */
-static int
-migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
-{
- int cpu = (long)hcpu;
- unsigned long flags;
- struct rq *rq = cpu_rq(cpu);
-
- switch (action & ~CPU_TASKS_FROZEN) {
-
- case CPU_UP_PREPARE:
- rq->calc_load_update = calc_load_update;
- break;
-
- case CPU_ONLINE:
- /* Update our root-domain */
- raw_spin_lock_irqsave(&rq->lock, flags);
- if (rq->rd) {
- BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
-
- set_rq_online(rq);
- }
- raw_spin_unlock_irqrestore(&rq->lock, flags);
- break;
-
-#ifdef CONFIG_HOTPLUG_CPU
- case CPU_DYING:
- sched_ttwu_pending();
- /* Update our root-domain */
- raw_spin_lock_irqsave(&rq->lock, flags);
- if (rq->rd) {
- BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
- set_rq_offline(rq);
- }
- migrate_tasks(cpu);
- BUG_ON(rq->nr_running != 1); /* the migration thread */
- raw_spin_unlock_irqrestore(&rq->lock, flags);
- break;
-
- case CPU_DEAD:
- calc_load_migrate(rq);
- break;
-#endif
- }
-
- update_max_interval();
-
- return NOTIFY_OK;
-}
-
-/*
- * Register at high priority so that task migration (migrate_all_tasks)
- * happens before everything else. This has to be lower priority than
- * the notifier in the perf_event subsystem, though.
- */
-static struct notifier_block migration_notifier = {
- .notifier_call = migration_call,
- .priority = CPU_PRI_MIGRATION,
-};
-
-static void __cpuinit set_cpu_rq_start_time(void)
-{
- int cpu = smp_processor_id();
- struct rq *rq = cpu_rq(cpu);
- rq->age_stamp = sched_clock_cpu(cpu);
-}
-
-static int sched_cpu_active(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
-{
- switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_STARTING:
- set_cpu_rq_start_time();
- return NOTIFY_OK;
- case CPU_DOWN_FAILED:
- set_cpu_active((long)hcpu, true);
- return NOTIFY_OK;
- default:
- return NOTIFY_DONE;
- }
-}
-
-static int sched_cpu_inactive(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
-{
- switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_DOWN_PREPARE:
- set_cpu_active((long)hcpu, false);
- return NOTIFY_OK;
- default:
- return NOTIFY_DONE;
- }
-}
-
-static int __init migration_init(void)
-{
- void *cpu = (void *)(long)smp_processor_id();
- int err;
-
- /* Initialize migration for the boot CPU */
- err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
- BUG_ON(err == NOTIFY_BAD);
- migration_call(&migration_notifier, CPU_ONLINE, cpu);
- register_cpu_notifier(&migration_notifier);
-
- /* Register cpu active notifiers */
- cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
- cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
-
- return 0;
-}
-early_initcall(migration_init);
-#endif
-
-#ifdef CONFIG_SMP
-
-static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
-
-#ifdef CONFIG_SCHED_DEBUG
-
-static __read_mostly int sched_debug_enabled;
-
-static int __init sched_debug_setup(char *str)
-{
- sched_debug_enabled = 1;
-
- return 0;
-}
-early_param("sched_debug", sched_debug_setup);
-
-static inline bool sched_debug(void)
-{
- return sched_debug_enabled;
-}
-
-static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
- struct cpumask *groupmask)
-{
- struct sched_group *group = sd->groups;
-
- cpumask_clear(groupmask);
-
- printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
-
- if (!(sd->flags & SD_LOAD_BALANCE)) {
- printk("does not load-balance\n");
- if (sd->parent)
- printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
- " has parent");
- return -1;
- }
-
- printk(KERN_CONT "span %*pbl level %s\n",
- cpumask_pr_args(sched_domain_span(sd)), sd->name);
-
- if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
- printk(KERN_ERR "ERROR: domain->span does not contain "
- "CPU%d\n", cpu);
- }
- if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
- printk(KERN_ERR "ERROR: domain->groups does not contain"
- " CPU%d\n", cpu);
- }
-
- printk(KERN_DEBUG "%*s groups:", level + 1, "");
- do {
- if (!group) {
- printk("\n");
- printk(KERN_ERR "ERROR: group is NULL\n");
- break;
- }
-
- if (!cpumask_weight(sched_group_cpus(group))) {
- printk(KERN_CONT "\n");
- printk(KERN_ERR "ERROR: empty group\n");
- break;
- }
-
- if (!(sd->flags & SD_OVERLAP) &&
- cpumask_intersects(groupmask, sched_group_cpus(group))) {
- printk(KERN_CONT "\n");
- printk(KERN_ERR "ERROR: repeated CPUs\n");
- break;
- }
-
- cpumask_or(groupmask, groupmask, sched_group_cpus(group));
-
- printk(KERN_CONT " %*pbl",
- cpumask_pr_args(sched_group_cpus(group)));
- if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
- printk(KERN_CONT " (cpu_capacity = %d)",
- group->sgc->capacity);
- }
-
- group = group->next;
- } while (group != sd->groups);
- printk(KERN_CONT "\n");
-
- if (!cpumask_equal(sched_domain_span(sd), groupmask))
- printk(KERN_ERR "ERROR: groups don't span domain->span\n");
-
- if (sd->parent &&
- !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
- printk(KERN_ERR "ERROR: parent span is not a superset "
- "of domain->span\n");
- return 0;
-}
-
-static void sched_domain_debug(struct sched_domain *sd, int cpu)
-{
- int level = 0;
-
- if (!sched_debug_enabled)
- return;
-
- if (!sd) {
- printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
- return;
- }
-
- printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
-
- for (;;) {
- if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
- break;
- level++;
- sd = sd->parent;
- if (!sd)
- break;
- }
-}
-#else /* !CONFIG_SCHED_DEBUG */
-# define sched_domain_debug(sd, cpu) do { } while (0)
-static inline bool sched_debug(void)
-{
- return false;
-}
-#endif /* CONFIG_SCHED_DEBUG */
-
-static int sd_degenerate(struct sched_domain *sd)
+static inline void sched_set_rq_online(struct rq *rq, int cpu)
{
- if (cpumask_weight(sched_domain_span(sd)) == 1)
- return 1;
-
- /* Following flags need at least 2 groups */
- if (sd->flags & (SD_LOAD_BALANCE |
- SD_BALANCE_NEWIDLE |
- SD_BALANCE_FORK |
- SD_BALANCE_EXEC |
- SD_SHARE_CPUCAPACITY |
- SD_SHARE_PKG_RESOURCES |
- SD_SHARE_POWERDOMAIN)) {
- if (sd->groups != sd->groups->next)
- return 0;
- }
-
- /* Following flags don't use groups */
- if (sd->flags & (SD_WAKE_AFFINE))
- return 0;
-
- return 1;
-}
-
-static int
-sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
-{
- unsigned long cflags = sd->flags, pflags = parent->flags;
-
- if (sd_degenerate(parent))
- return 1;
-
- if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
- return 0;
-
- /* Flags needing groups don't count if only 1 group in parent */
- if (parent->groups == parent->groups->next) {
- pflags &= ~(SD_LOAD_BALANCE |
- SD_BALANCE_NEWIDLE |
- SD_BALANCE_FORK |
- SD_BALANCE_EXEC |
- SD_SHARE_CPUCAPACITY |
- SD_SHARE_PKG_RESOURCES |
- SD_PREFER_SIBLING |
- SD_SHARE_POWERDOMAIN);
- if (nr_node_ids == 1)
- pflags &= ~SD_SERIALIZE;
- }
- if (~cflags & pflags)
- return 0;
-
- return 1;
-}
-
-static void free_rootdomain(struct rcu_head *rcu)
-{
- struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
-
- cpupri_cleanup(&rd->cpupri);
- cpudl_cleanup(&rd->cpudl);
- free_cpumask_var(rd->dlo_mask);
- free_cpumask_var(rd->rto_mask);
- free_cpumask_var(rd->online);
- free_cpumask_var(rd->span);
- kfree(rd);
-}
-
-static void rq_attach_root(struct rq *rq, struct root_domain *rd)
-{
- struct root_domain *old_rd = NULL;
- unsigned long flags;
-
- raw_spin_lock_irqsave(&rq->lock, flags);
+ struct rq_flags rf;
+ rq_lock_irqsave(rq, &rf);
if (rq->rd) {
- old_rd = rq->rd;
-
- if (cpumask_test_cpu(rq->cpu, old_rd->online))
- set_rq_offline(rq);
-
- cpumask_clear_cpu(rq->cpu, old_rd->span);
-
- /*
- * If we dont want to free the old_rd yet then
- * set old_rd to NULL to skip the freeing later
- * in this function:
- */
- if (!atomic_dec_and_test(&old_rd->refcount))
- old_rd = NULL;
- }
-
- atomic_inc(&rd->refcount);
- rq->rd = rd;
-
- cpumask_set_cpu(rq->cpu, rd->span);
- if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
+ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
set_rq_online(rq);
-
- raw_spin_unlock_irqrestore(&rq->lock, flags);
-
- if (old_rd)
- call_rcu_sched(&old_rd->rcu, free_rootdomain);
-}
-
-static int init_rootdomain(struct root_domain *rd)
-{
- memset(rd, 0, sizeof(*rd));
-
- if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
- goto out;
- if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
- goto free_span;
- if (!alloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
- goto free_online;
- if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
- goto free_dlo_mask;
-
- init_dl_bw(&rd->dl_bw);
- if (cpudl_init(&rd->cpudl) != 0)
- goto free_dlo_mask;
-
- if (cpupri_init(&rd->cpupri) != 0)
- goto free_rto_mask;
- return 0;
-
-free_rto_mask:
- free_cpumask_var(rd->rto_mask);
-free_dlo_mask:
- free_cpumask_var(rd->dlo_mask);
-free_online:
- free_cpumask_var(rd->online);
-free_span:
- free_cpumask_var(rd->span);
-out:
- return -ENOMEM;
-}
-
-/*
- * By default the system creates a single root-domain with all cpus as
- * members (mimicking the global state we have today).
- */
-struct root_domain def_root_domain;
-
-static void init_defrootdomain(void)
-{
- init_rootdomain(&def_root_domain);
-
- atomic_set(&def_root_domain.refcount, 1);
-}
-
-static struct root_domain *alloc_rootdomain(void)
-{
- struct root_domain *rd;
-
- rd = kmalloc(sizeof(*rd), GFP_KERNEL);
- if (!rd)
- return NULL;
-
- if (init_rootdomain(rd) != 0) {
- kfree(rd);
- return NULL;
- }
-
- return rd;
-}
-
-static void free_sched_groups(struct sched_group *sg, int free_sgc)
-{
- struct sched_group *tmp, *first;
-
- if (!sg)
- return;
-
- first = sg;
- do {
- tmp = sg->next;
-
- if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
- kfree(sg->sgc);
-
- kfree(sg);
- sg = tmp;
- } while (sg != first);
-}
-
-static void free_sched_domain(struct rcu_head *rcu)
-{
- struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
-
- /*
- * If its an overlapping domain it has private groups, iterate and
- * nuke them all.
- */
- if (sd->flags & SD_OVERLAP) {
- free_sched_groups(sd->groups, 1);
- } else if (atomic_dec_and_test(&sd->groups->ref)) {
- kfree(sd->groups->sgc);
- kfree(sd->groups);
}
- kfree(sd);
+ rq_unlock_irqrestore(rq, &rf);
}
-static void destroy_sched_domain(struct sched_domain *sd, int cpu)
+static inline void sched_set_rq_offline(struct rq *rq, int cpu)
{
- call_rcu(&sd->rcu, free_sched_domain);
-}
+ struct rq_flags rf;
-static void destroy_sched_domains(struct sched_domain *sd, int cpu)
-{
- for (; sd; sd = sd->parent)
- destroy_sched_domain(sd, cpu);
-}
-
-/*
- * Keep a special pointer to the highest sched_domain that has
- * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
- * allows us to avoid some pointer chasing select_idle_sibling().
- *
- * Also keep a unique ID per domain (we use the first cpu number in
- * the cpumask of the domain), this allows us to quickly tell if
- * two cpus are in the same cache domain, see cpus_share_cache().
- */
-DEFINE_PER_CPU(struct sched_domain *, sd_llc);
-DEFINE_PER_CPU(int, sd_llc_size);
-DEFINE_PER_CPU(int, sd_llc_id);
-DEFINE_PER_CPU(struct sched_domain *, sd_numa);
-DEFINE_PER_CPU(struct sched_domain *, sd_busy);
-DEFINE_PER_CPU(struct sched_domain *, sd_asym);
-
-static void update_top_cache_domain(int cpu)
-{
- struct sched_domain *sd;
- struct sched_domain *busy_sd = NULL;
- int id = cpu;
- int size = 1;
-
- sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
- if (sd) {
- id = cpumask_first(sched_domain_span(sd));
- size = cpumask_weight(sched_domain_span(sd));
- busy_sd = sd->parent; /* sd_busy */
+ rq_lock_irqsave(rq, &rf);
+ if (rq->rd) {
+ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
+ set_rq_offline(rq);
}
- rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd);
-
- rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
- per_cpu(sd_llc_size, cpu) = size;
- per_cpu(sd_llc_id, cpu) = id;
-
- sd = lowest_flag_domain(cpu, SD_NUMA);
- rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
-
- sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
- rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
+ rq_unlock_irqrestore(rq, &rf);
}
/*
- * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
- * hold the hotplug lock.
+ * used to mark begin/end of suspend/resume:
*/
-static void
-cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
-{
- struct rq *rq = cpu_rq(cpu);
- struct sched_domain *tmp;
-
- /* Remove the sched domains which do not contribute to scheduling. */
- for (tmp = sd; tmp; ) {
- struct sched_domain *parent = tmp->parent;
- if (!parent)
- break;
-
- if (sd_parent_degenerate(tmp, parent)) {
- tmp->parent = parent->parent;
- if (parent->parent)
- parent->parent->child = tmp;
- /*
- * Transfer SD_PREFER_SIBLING down in case of a
- * degenerate parent; the spans match for this
- * so the property transfers.
- */
- if (parent->flags & SD_PREFER_SIBLING)
- tmp->flags |= SD_PREFER_SIBLING;
- destroy_sched_domain(parent, cpu);
- } else
- tmp = tmp->parent;
- }
-
- if (sd && sd_degenerate(sd)) {
- tmp = sd;
- sd = sd->parent;
- destroy_sched_domain(tmp, cpu);
- if (sd)
- sd->child = NULL;
- }
-
- sched_domain_debug(sd, cpu);
-
- rq_attach_root(rq, rd);
- tmp = rq->sd;
- rcu_assign_pointer(rq->sd, sd);
- destroy_sched_domains(tmp, cpu);
-
- update_top_cache_domain(cpu);
-}
-
-/* Setup the mask of cpus configured for isolated domains */
-static int __init isolated_cpu_setup(char *str)
-{
- alloc_bootmem_cpumask_var(&cpu_isolated_map);
- cpulist_parse(str, cpu_isolated_map);
- return 1;
-}
-
-__setup("isolcpus=", isolated_cpu_setup);
-
-struct s_data {
- struct sched_domain ** __percpu sd;
- struct root_domain *rd;
-};
-
-enum s_alloc {
- sa_rootdomain,
- sa_sd,
- sa_sd_storage,
- sa_none,
-};
+static int num_cpus_frozen;
/*
- * Build an iteration mask that can exclude certain CPUs from the upwards
- * domain traversal.
- *
- * Asymmetric node setups can result in situations where the domain tree is of
- * unequal depth, make sure to skip domains that already cover the entire
- * range.
- *
- * In that case build_sched_domains() will have terminated the iteration early
- * and our sibling sd spans will be empty. Domains should always include the
- * cpu they're built on, so check that.
+ * Update cpusets according to cpu_active mask. If cpusets are
+ * disabled, cpuset_update_active_cpus() becomes a simple wrapper
+ * around partition_sched_domains().
*
+ * If we come here as part of a suspend/resume, don't touch cpusets because we
+ * want to restore it back to its original state upon resume anyway.
*/
-static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
-{
- const struct cpumask *span = sched_domain_span(sd);
- struct sd_data *sdd = sd->private;
- struct sched_domain *sibling;
- int i;
-
- for_each_cpu(i, span) {
- sibling = *per_cpu_ptr(sdd->sd, i);
- if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
- continue;
-
- cpumask_set_cpu(i, sched_group_mask(sg));
- }
-}
-
-/*
- * Return the canonical balance cpu for this group, this is the first cpu
- * of this group that's also in the iteration mask.
- */
-int group_balance_cpu(struct sched_group *sg)
-{
- return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
-}
-
-static int
-build_overlap_sched_groups(struct sched_domain *sd, int cpu)
+static void cpuset_cpu_active(void)
{
- struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
- const struct cpumask *span = sched_domain_span(sd);
- struct cpumask *covered = sched_domains_tmpmask;
- struct sd_data *sdd = sd->private;
- struct sched_domain *sibling;
- int i;
-
- cpumask_clear(covered);
-
- for_each_cpu(i, span) {
- struct cpumask *sg_span;
-
- if (cpumask_test_cpu(i, covered))
- continue;
-
- sibling = *per_cpu_ptr(sdd->sd, i);
-
- /* See the comment near build_group_mask(). */
- if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
- continue;
-
- sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
- GFP_KERNEL, cpu_to_node(cpu));
-
- if (!sg)
- goto fail;
-
- sg_span = sched_group_cpus(sg);
- if (sibling->child)
- cpumask_copy(sg_span, sched_domain_span(sibling->child));
- else
- cpumask_set_cpu(i, sg_span);
-
- cpumask_or(covered, covered, sg_span);
-
- sg->sgc = *per_cpu_ptr(sdd->sgc, i);
- if (atomic_inc_return(&sg->sgc->ref) == 1)
- build_group_mask(sd, sg);
-
+ if (cpuhp_tasks_frozen) {
/*
- * Initialize sgc->capacity such that even if we mess up the
- * domains and no possible iteration will get us here, we won't
- * die on a /0 trap.
+ * num_cpus_frozen tracks how many CPUs are involved in suspend
+ * resume sequence. As long as this is not the last online
+ * operation in the resume sequence, just build a single sched
+ * domain, ignoring cpusets.
*/
- sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
-
+ cpuset_reset_sched_domains();
+ if (--num_cpus_frozen)
+ return;
/*
- * Make sure the first group of this domain contains the
- * canonical balance cpu. Otherwise the sched_domain iteration
- * breaks. See update_sg_lb_stats().
+ * This is the last CPU online operation. So fall through and
+ * restore the original sched domains by considering the
+ * cpuset configurations.
*/
- if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
- group_balance_cpu(sg) == cpu)
- groups = sg;
-
- if (!first)
- first = sg;
- if (last)
- last->next = sg;
- last = sg;
- last->next = first;
+ cpuset_force_rebuild();
}
- sd->groups = groups;
-
- return 0;
-
-fail:
- free_sched_groups(first, 0);
-
- return -ENOMEM;
-}
-
-static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
-{
- struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
- struct sched_domain *child = sd->child;
-
- if (child)
- cpu = cpumask_first(sched_domain_span(child));
-
- if (sg) {
- *sg = *per_cpu_ptr(sdd->sg, cpu);
- (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu);
- atomic_set(&(*sg)->sgc->ref, 1); /* for claim_allocations */
- }
-
- return cpu;
-}
-
-/*
- * build_sched_groups will build a circular linked list of the groups
- * covered by the given span, and will set each group's ->cpumask correctly,
- * and ->cpu_capacity to 0.
- *
- * Assumes the sched_domain tree is fully constructed
- */
-static int
-build_sched_groups(struct sched_domain *sd, int cpu)
-{
- struct sched_group *first = NULL, *last = NULL;
- struct sd_data *sdd = sd->private;
- const struct cpumask *span = sched_domain_span(sd);
- struct cpumask *covered;
- int i;
-
- get_group(cpu, sdd, &sd->groups);
- atomic_inc(&sd->groups->ref);
-
- if (cpu != cpumask_first(span))
- return 0;
-
- lockdep_assert_held(&sched_domains_mutex);
- covered = sched_domains_tmpmask;
-
- cpumask_clear(covered);
-
- for_each_cpu(i, span) {
- struct sched_group *sg;
- int group, j;
-
- if (cpumask_test_cpu(i, covered))
- continue;
-
- group = get_group(i, sdd, &sg);
- cpumask_setall(sched_group_mask(sg));
-
- for_each_cpu(j, span) {
- if (get_group(j, sdd, NULL) != group)
- continue;
-
- cpumask_set_cpu(j, covered);
- cpumask_set_cpu(j, sched_group_cpus(sg));
- }
-
- if (!first)
- first = sg;
- if (last)
- last->next = sg;
- last = sg;
- }
- last->next = first;
-
- return 0;
-}
-
-/*
- * Initialize sched groups cpu_capacity.
- *
- * cpu_capacity indicates the capacity of sched group, which is used while
- * distributing the load between different sched groups in a sched domain.
- * Typically cpu_capacity for all the groups in a sched domain will be same
- * unless there are asymmetries in the topology. If there are asymmetries,
- * group having more cpu_capacity will pickup more load compared to the
- * group having less cpu_capacity.
- */
-static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
-{
- struct sched_group *sg = sd->groups;
-
- WARN_ON(!sg);
-
- do {
- sg->group_weight = cpumask_weight(sched_group_cpus(sg));
- sg = sg->next;
- } while (sg != sd->groups);
-
- if (cpu != group_balance_cpu(sg))
- return;
-
- update_group_capacity(sd, cpu);
- atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight);
+ cpuset_update_active_cpus();
}
-/*
- * Initializers for schedule domains
- * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
- */
-
-static int default_relax_domain_level = -1;
-int sched_domain_level_max;
-
-static int __init setup_relax_domain_level(char *str)
-{
- if (kstrtoint(str, 0, &default_relax_domain_level))
- pr_warn("Unable to set relax_domain_level\n");
-
- return 1;
-}
-__setup("relax_domain_level=", setup_relax_domain_level);
-
-static void set_domain_attribute(struct sched_domain *sd,
- struct sched_domain_attr *attr)
+static void cpuset_cpu_inactive(unsigned int cpu)
{
- int request;
-
- if (!attr || attr->relax_domain_level < 0) {
- if (default_relax_domain_level < 0)
- return;
- else
- request = default_relax_domain_level;
- } else
- request = attr->relax_domain_level;
- if (request < sd->level) {
- /* turn off idle balance on this domain */
- sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
+ if (!cpuhp_tasks_frozen) {
+ cpuset_update_active_cpus();
} else {
- /* turn on idle balance on this domain */
- sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
- }
-}
-
-static void __sdt_free(const struct cpumask *cpu_map);
-static int __sdt_alloc(const struct cpumask *cpu_map);
-
-static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
- const struct cpumask *cpu_map)
-{
- switch (what) {
- case sa_rootdomain:
- if (!atomic_read(&d->rd->refcount))
- free_rootdomain(&d->rd->rcu); /* fall through */
- case sa_sd:
- free_percpu(d->sd); /* fall through */
- case sa_sd_storage:
- __sdt_free(cpu_map); /* fall through */
- case sa_none:
- break;
+ num_cpus_frozen++;
+ cpuset_reset_sched_domains();
}
}
-static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
- const struct cpumask *cpu_map)
+static inline void sched_smt_present_inc(int cpu)
{
- memset(d, 0, sizeof(*d));
-
- if (__sdt_alloc(cpu_map))
- return sa_sd_storage;
- d->sd = alloc_percpu(struct sched_domain *);
- if (!d->sd)
- return sa_sd_storage;
- d->rd = alloc_rootdomain();
- if (!d->rd)
- return sa_sd;
- return sa_rootdomain;
+#ifdef CONFIG_SCHED_SMT
+ if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
+ static_branch_inc_cpuslocked(&sched_smt_present);
+#endif
}
-/*
- * NULL the sd_data elements we've used to build the sched_domain and
- * sched_group structure so that the subsequent __free_domain_allocs()
- * will not free the data we're using.
- */
-static void claim_allocations(int cpu, struct sched_domain *sd)
+static inline void sched_smt_present_dec(int cpu)
{
- struct sd_data *sdd = sd->private;
-
- WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
- *per_cpu_ptr(sdd->sd, cpu) = NULL;
-
- if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
- *per_cpu_ptr(sdd->sg, cpu) = NULL;
-
- if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
- *per_cpu_ptr(sdd->sgc, cpu) = NULL;
-}
-
-#ifdef CONFIG_NUMA
-static int sched_domains_numa_levels;
-enum numa_topology_type sched_numa_topology_type;
-static int *sched_domains_numa_distance;
-int sched_max_numa_distance;
-static struct cpumask ***sched_domains_numa_masks;
-static int sched_domains_curr_level;
+#ifdef CONFIG_SCHED_SMT
+ if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
+ static_branch_dec_cpuslocked(&sched_smt_present);
#endif
+}
-/*
- * SD_flags allowed in topology descriptions.
- *
- * SD_SHARE_CPUCAPACITY - describes SMT topologies
- * SD_SHARE_PKG_RESOURCES - describes shared caches
- * SD_NUMA - describes NUMA topologies
- * SD_SHARE_POWERDOMAIN - describes shared power domain
- *
- * Odd one out:
- * SD_ASYM_PACKING - describes SMT quirks
- */
-#define TOPOLOGY_SD_FLAGS \
- (SD_SHARE_CPUCAPACITY | \
- SD_SHARE_PKG_RESOURCES | \
- SD_NUMA | \
- SD_ASYM_PACKING | \
- SD_SHARE_POWERDOMAIN)
-
-static struct sched_domain *
-sd_init(struct sched_domain_topology_level *tl, int cpu)
+int sched_cpu_activate(unsigned int cpu)
{
- struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
- int sd_weight, sd_flags = 0;
+ struct rq *rq = cpu_rq(cpu);
-#ifdef CONFIG_NUMA
/*
- * Ugly hack to pass state to sd_numa_mask()...
+ * Clear the balance_push callback and prepare to schedule
+ * regular tasks.
*/
- sched_domains_curr_level = tl->numa_level;
-#endif
-
- sd_weight = cpumask_weight(tl->mask(cpu));
-
- if (tl->sd_flags)
- sd_flags = (*tl->sd_flags)();
- if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
- "wrong sd_flags in topology description\n"))
- sd_flags &= ~TOPOLOGY_SD_FLAGS;
-
- *sd = (struct sched_domain){
- .min_interval = sd_weight,
- .max_interval = 2*sd_weight,
- .busy_factor = 32,
- .imbalance_pct = 125,
-
- .cache_nice_tries = 0,
- .busy_idx = 0,
- .idle_idx = 0,
- .newidle_idx = 0,
- .wake_idx = 0,
- .forkexec_idx = 0,
-
- .flags = 1*SD_LOAD_BALANCE
- | 1*SD_BALANCE_NEWIDLE
- | 1*SD_BALANCE_EXEC
- | 1*SD_BALANCE_FORK
- | 0*SD_BALANCE_WAKE
- | 1*SD_WAKE_AFFINE
- | 0*SD_SHARE_CPUCAPACITY
- | 0*SD_SHARE_PKG_RESOURCES
- | 0*SD_SERIALIZE
- | 0*SD_PREFER_SIBLING
- | 0*SD_NUMA
- | sd_flags
- ,
-
- .last_balance = jiffies,
- .balance_interval = sd_weight,
- .smt_gain = 0,
- .max_newidle_lb_cost = 0,
- .next_decay_max_lb_cost = jiffies,
-#ifdef CONFIG_SCHED_DEBUG
- .name = tl->name,
-#endif
- };
+ balance_push_set(cpu, false);
/*
- * Convert topological properties into behaviour.
+ * When going up, increment the number of cores with SMT present.
*/
+ sched_smt_present_inc(cpu);
+ set_cpu_active(cpu, true);
- if (sd->flags & SD_SHARE_CPUCAPACITY) {
- sd->flags |= SD_PREFER_SIBLING;
- sd->imbalance_pct = 110;
- sd->smt_gain = 1178; /* ~15% */
-
- } else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
- sd->imbalance_pct = 117;
- sd->cache_nice_tries = 1;
- sd->busy_idx = 2;
-
-#ifdef CONFIG_NUMA
- } else if (sd->flags & SD_NUMA) {
- sd->cache_nice_tries = 2;
- sd->busy_idx = 3;
- sd->idle_idx = 2;
-
- sd->flags |= SD_SERIALIZE;
- if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
- sd->flags &= ~(SD_BALANCE_EXEC |
- SD_BALANCE_FORK |
- SD_WAKE_AFFINE);
- }
-
-#endif
- } else {
- sd->flags |= SD_PREFER_SIBLING;
- sd->cache_nice_tries = 1;
- sd->busy_idx = 2;
- sd->idle_idx = 1;
- }
-
- sd->private = &tl->data;
-
- return sd;
-}
-
-/*
- * Topology list, bottom-up.
- */
-static struct sched_domain_topology_level default_topology[] = {
-#ifdef CONFIG_SCHED_SMT
- { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
-#endif
-#ifdef CONFIG_SCHED_MC
- { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
-#endif
- { cpu_cpu_mask, SD_INIT_NAME(DIE) },
- { NULL, },
-};
-
-struct sched_domain_topology_level *sched_domain_topology = default_topology;
-
-#define for_each_sd_topology(tl) \
- for (tl = sched_domain_topology; tl->mask; tl++)
-
-void set_sched_topology(struct sched_domain_topology_level *tl)
-{
- sched_domain_topology = tl;
-}
-
-#ifdef CONFIG_NUMA
-
-static const struct cpumask *sd_numa_mask(int cpu)
-{
- return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
-}
-
-static void sched_numa_warn(const char *str)
-{
- static int done = false;
- int i,j;
-
- if (done)
- return;
-
- done = true;
-
- printk(KERN_WARNING "ERROR: %s\n\n", str);
-
- for (i = 0; i < nr_node_ids; i++) {
- printk(KERN_WARNING " ");
- for (j = 0; j < nr_node_ids; j++)
- printk(KERN_CONT "%02d ", node_distance(i,j));
- printk(KERN_CONT "\n");
+ if (sched_smp_initialized) {
+ sched_update_numa(cpu, true);
+ sched_domains_numa_masks_set(cpu);
+ cpuset_cpu_active();
}
- printk(KERN_WARNING "\n");
-}
-
-bool find_numa_distance(int distance)
-{
- int i;
- if (distance == node_distance(0, 0))
- return true;
+ scx_rq_activate(rq);
- for (i = 0; i < sched_domains_numa_levels; i++) {
- if (sched_domains_numa_distance[i] == distance)
- return true;
- }
+ /*
+ * Put the rq online, if not already. This happens:
+ *
+ * 1) In the early boot process, because we build the real domains
+ * after all CPUs have been brought up.
+ *
+ * 2) At runtime, if cpuset_cpu_active() fails to rebuild the
+ * domains.
+ */
+ sched_set_rq_online(rq, cpu);
- return false;
+ return 0;
}
-/*
- * A system can have three types of NUMA topology:
- * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system
- * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes
- * NUMA_BACKPLANE: nodes can reach other nodes through a backplane
- *
- * The difference between a glueless mesh topology and a backplane
- * topology lies in whether communication between not directly
- * connected nodes goes through intermediary nodes (where programs
- * could run), or through backplane controllers. This affects
- * placement of programs.
- *
- * The type of topology can be discerned with the following tests:
- * - If the maximum distance between any nodes is 1 hop, the system
- * is directly connected.
- * - If for two nodes A and B, located N > 1 hops away from each other,
- * there is an intermediary node C, which is < N hops away from both
- * nodes A and B, the system is a glueless mesh.
- */
-static void init_numa_topology_type(void)
+int sched_cpu_deactivate(unsigned int cpu)
{
- int a, b, c, n;
-
- n = sched_max_numa_distance;
-
- if (n <= 1)
- sched_numa_topology_type = NUMA_DIRECT;
-
- for_each_online_node(a) {
- for_each_online_node(b) {
- /* Find two nodes furthest removed from each other. */
- if (node_distance(a, b) < n)
- continue;
-
- /* Is there an intermediary node between a and b? */
- for_each_online_node(c) {
- if (node_distance(a, c) < n &&
- node_distance(b, c) < n) {
- sched_numa_topology_type =
- NUMA_GLUELESS_MESH;
- return;
- }
- }
-
- sched_numa_topology_type = NUMA_BACKPLANE;
- return;
- }
- }
-}
+ struct rq *rq = cpu_rq(cpu);
+ int ret;
-static void sched_init_numa(void)
-{
- int next_distance, curr_distance = node_distance(0, 0);
- struct sched_domain_topology_level *tl;
- int level = 0;
- int i, j, k;
+ ret = dl_bw_deactivate(cpu);
- sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
- if (!sched_domains_numa_distance)
- return;
+ if (ret)
+ return ret;
/*
- * O(nr_nodes^2) deduplicating selection sort -- in order to find the
- * unique distances in the node_distance() table.
- *
- * Assumes node_distance(0,j) includes all distances in
- * node_distance(i,j) in order to avoid cubic time.
+ * Remove CPU from nohz.idle_cpus_mask to prevent participating in
+ * load balancing when not active
*/
- next_distance = curr_distance;
- for (i = 0; i < nr_node_ids; i++) {
- for (j = 0; j < nr_node_ids; j++) {
- for (k = 0; k < nr_node_ids; k++) {
- int distance = node_distance(i, k);
-
- if (distance > curr_distance &&
- (distance < next_distance ||
- next_distance == curr_distance))
- next_distance = distance;
-
- /*
- * While not a strong assumption it would be nice to know
- * about cases where if node A is connected to B, B is not
- * equally connected to A.
- */
- if (sched_debug() && node_distance(k, i) != distance)
- sched_numa_warn("Node-distance not symmetric");
+ nohz_balance_exit_idle(rq);
- if (sched_debug() && i && !find_numa_distance(distance))
- sched_numa_warn("Node-0 not representative");
- }
- if (next_distance != curr_distance) {
- sched_domains_numa_distance[level++] = next_distance;
- sched_domains_numa_levels = level;
- curr_distance = next_distance;
- } else break;
- }
-
- /*
- * In case of sched_debug() we verify the above assumption.
- */
- if (!sched_debug())
- break;
- }
-
- if (!level)
- return;
+ set_cpu_active(cpu, false);
/*
- * 'level' contains the number of unique distances, excluding the
- * identity distance node_distance(i,i).
- *
- * The sched_domains_numa_distance[] array includes the actual distance
- * numbers.
+ * From this point forward, this CPU will refuse to run any task that
+ * is not: migrate_disable() or KTHREAD_IS_PER_CPU, and will actively
+ * push those tasks away until this gets cleared, see
+ * sched_cpu_dying().
*/
+ balance_push_set(cpu, true);
/*
- * Here, we should temporarily reset sched_domains_numa_levels to 0.
- * If it fails to allocate memory for array sched_domains_numa_masks[][],
- * the array will contain less then 'level' members. This could be
- * dangerous when we use it to iterate array sched_domains_numa_masks[][]
- * in other functions.
+ * We've cleared cpu_active_mask / set balance_push, wait for all
+ * preempt-disabled and RCU users of this state to go away such that
+ * all new such users will observe it.
*
- * We reset it to 'level' at the end of this function.
- */
- sched_domains_numa_levels = 0;
-
- sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
- if (!sched_domains_numa_masks)
- return;
-
- /*
- * Now for each level, construct a mask per node which contains all
- * cpus of nodes that are that many hops away from us.
+ * Specifically, we rely on ttwu to no longer target this CPU, see
+ * ttwu_queue_cond() and is_cpu_allowed().
+ *
+ * Do sync before park smpboot threads to take care the RCU boost case.
*/
- for (i = 0; i < level; i++) {
- sched_domains_numa_masks[i] =
- kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
- if (!sched_domains_numa_masks[i])
- return;
-
- for (j = 0; j < nr_node_ids; j++) {
- struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
- if (!mask)
- return;
+ synchronize_rcu();
- sched_domains_numa_masks[i][j] = mask;
+ sched_set_rq_offline(rq, cpu);
- for (k = 0; k < nr_node_ids; k++) {
- if (node_distance(j, k) > sched_domains_numa_distance[i])
- continue;
-
- cpumask_or(mask, mask, cpumask_of_node(k));
- }
- }
- }
-
- /* Compute default topology size */
- for (i = 0; sched_domain_topology[i].mask; i++);
-
- tl = kzalloc((i + level + 1) *
- sizeof(struct sched_domain_topology_level), GFP_KERNEL);
- if (!tl)
- return;
+ scx_rq_deactivate(rq);
/*
- * Copy the default topology bits..
+ * When going down, decrement the number of cores with SMT present.
*/
- for (i = 0; sched_domain_topology[i].mask; i++)
- tl[i] = sched_domain_topology[i];
-
- /*
- * .. and append 'j' levels of NUMA goodness.
- */
- for (j = 0; j < level; i++, j++) {
- tl[i] = (struct sched_domain_topology_level){
- .mask = sd_numa_mask,
- .sd_flags = cpu_numa_flags,
- .flags = SDTL_OVERLAP,
- .numa_level = j,
- SD_INIT_NAME(NUMA)
- };
- }
-
- sched_domain_topology = tl;
-
- sched_domains_numa_levels = level;
- sched_max_numa_distance = sched_domains_numa_distance[level - 1];
-
- init_numa_topology_type();
-}
-
-static void sched_domains_numa_masks_set(int cpu)
-{
- int i, j;
- int node = cpu_to_node(cpu);
-
- for (i = 0; i < sched_domains_numa_levels; i++) {
- for (j = 0; j < nr_node_ids; j++) {
- if (node_distance(j, node) <= sched_domains_numa_distance[i])
- cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
- }
- }
-}
+ sched_smt_present_dec(cpu);
-static void sched_domains_numa_masks_clear(int cpu)
-{
- int i, j;
- for (i = 0; i < sched_domains_numa_levels; i++) {
- for (j = 0; j < nr_node_ids; j++)
- cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
- }
-}
-
-/*
- * Update sched_domains_numa_masks[level][node] array when new cpus
- * are onlined.
- */
-static int sched_domains_numa_masks_update(struct notifier_block *nfb,
- unsigned long action,
- void *hcpu)
-{
- int cpu = (long)hcpu;
-
- switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_ONLINE:
- sched_domains_numa_masks_set(cpu);
- break;
-
- case CPU_DEAD:
- sched_domains_numa_masks_clear(cpu);
- break;
-
- default:
- return NOTIFY_DONE;
- }
-
- return NOTIFY_OK;
-}
-#else
-static inline void sched_init_numa(void)
-{
-}
-
-static int sched_domains_numa_masks_update(struct notifier_block *nfb,
- unsigned long action,
- void *hcpu)
-{
- return 0;
-}
-#endif /* CONFIG_NUMA */
-
-static int __sdt_alloc(const struct cpumask *cpu_map)
-{
- struct sched_domain_topology_level *tl;
- int j;
-
- for_each_sd_topology(tl) {
- struct sd_data *sdd = &tl->data;
-
- sdd->sd = alloc_percpu(struct sched_domain *);
- if (!sdd->sd)
- return -ENOMEM;
-
- sdd->sg = alloc_percpu(struct sched_group *);
- if (!sdd->sg)
- return -ENOMEM;
-
- sdd->sgc = alloc_percpu(struct sched_group_capacity *);
- if (!sdd->sgc)
- return -ENOMEM;
-
- for_each_cpu(j, cpu_map) {
- struct sched_domain *sd;
- struct sched_group *sg;
- struct sched_group_capacity *sgc;
-
- sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
- GFP_KERNEL, cpu_to_node(j));
- if (!sd)
- return -ENOMEM;
-
- *per_cpu_ptr(sdd->sd, j) = sd;
-
- sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
- GFP_KERNEL, cpu_to_node(j));
- if (!sg)
- return -ENOMEM;
-
- sg->next = sg;
-
- *per_cpu_ptr(sdd->sg, j) = sg;
-
- sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(),
- GFP_KERNEL, cpu_to_node(j));
- if (!sgc)
- return -ENOMEM;
-
- *per_cpu_ptr(sdd->sgc, j) = sgc;
- }
- }
-
- return 0;
-}
-
-static void __sdt_free(const struct cpumask *cpu_map)
-{
- struct sched_domain_topology_level *tl;
- int j;
-
- for_each_sd_topology(tl) {
- struct sd_data *sdd = &tl->data;
-
- for_each_cpu(j, cpu_map) {
- struct sched_domain *sd;
-
- if (sdd->sd) {
- sd = *per_cpu_ptr(sdd->sd, j);
- if (sd && (sd->flags & SD_OVERLAP))
- free_sched_groups(sd->groups, 0);
- kfree(*per_cpu_ptr(sdd->sd, j));
- }
-
- if (sdd->sg)
- kfree(*per_cpu_ptr(sdd->sg, j));
- if (sdd->sgc)
- kfree(*per_cpu_ptr(sdd->sgc, j));
- }
- free_percpu(sdd->sd);
- sdd->sd = NULL;
- free_percpu(sdd->sg);
- sdd->sg = NULL;
- free_percpu(sdd->sgc);
- sdd->sgc = NULL;
- }
-}
-
-struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
- const struct cpumask *cpu_map, struct sched_domain_attr *attr,
- struct sched_domain *child, int cpu)
-{
- struct sched_domain *sd = sd_init(tl, cpu);
- if (!sd)
- return child;
-
- cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
- if (child) {
- sd->level = child->level + 1;
- sched_domain_level_max = max(sched_domain_level_max, sd->level);
- child->parent = sd;
- sd->child = child;
-
- if (!cpumask_subset(sched_domain_span(child),
- sched_domain_span(sd))) {
- pr_err("BUG: arch topology borken\n");
-#ifdef CONFIG_SCHED_DEBUG
- pr_err(" the %s domain not a subset of the %s domain\n",
- child->name, sd->name);
+#ifdef CONFIG_SCHED_SMT
+ sched_core_cpu_deactivate(cpu);
#endif
- /* Fixup, ensure @sd has at least @child cpus. */
- cpumask_or(sched_domain_span(sd),
- sched_domain_span(sd),
- sched_domain_span(child));
- }
- }
- set_domain_attribute(sd, attr);
+ if (!sched_smp_initialized)
+ return 0;
- return sd;
+ sched_update_numa(cpu, false);
+ cpuset_cpu_inactive(cpu);
+ sched_domains_numa_masks_clear(cpu);
+ return 0;
}
-/*
- * Build sched domains for a given set of cpus and attach the sched domains
- * to the individual cpus
- */
-static int build_sched_domains(const struct cpumask *cpu_map,
- struct sched_domain_attr *attr)
+static void sched_rq_cpu_starting(unsigned int cpu)
{
- enum s_alloc alloc_state;
- struct sched_domain *sd;
- struct s_data d;
- int i, ret = -ENOMEM;
-
- alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
- if (alloc_state != sa_rootdomain)
- goto error;
-
- /* Set up domains for cpus specified by the cpu_map. */
- for_each_cpu(i, cpu_map) {
- struct sched_domain_topology_level *tl;
-
- sd = NULL;
- for_each_sd_topology(tl) {
- sd = build_sched_domain(tl, cpu_map, attr, sd, i);
- if (tl == sched_domain_topology)
- *per_cpu_ptr(d.sd, i) = sd;
- if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
- sd->flags |= SD_OVERLAP;
- if (cpumask_equal(cpu_map, sched_domain_span(sd)))
- break;
- }
- }
-
- /* Build the groups for the domains */
- for_each_cpu(i, cpu_map) {
- for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
- sd->span_weight = cpumask_weight(sched_domain_span(sd));
- if (sd->flags & SD_OVERLAP) {
- if (build_overlap_sched_groups(sd, i))
- goto error;
- } else {
- if (build_sched_groups(sd, i))
- goto error;
- }
- }
- }
-
- /* Calculate CPU capacity for physical packages and nodes */
- for (i = nr_cpumask_bits-1; i >= 0; i--) {
- if (!cpumask_test_cpu(i, cpu_map))
- continue;
-
- for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
- claim_allocations(i, sd);
- init_sched_groups_capacity(i, sd);
- }
- }
-
- /* Attach the domains */
- rcu_read_lock();
- for_each_cpu(i, cpu_map) {
- sd = *per_cpu_ptr(d.sd, i);
- cpu_attach_domain(sd, d.rd, i);
- }
- rcu_read_unlock();
+ struct rq *rq = cpu_rq(cpu);
- ret = 0;
-error:
- __free_domain_allocs(&d, alloc_state, cpu_map);
- return ret;
+ rq->calc_load_update = calc_load_update;
+ update_max_interval();
}
-static cpumask_var_t *doms_cur; /* current sched domains */
-static int ndoms_cur; /* number of sched domains in 'doms_cur' */
-static struct sched_domain_attr *dattr_cur;
- /* attribues of custom domains in 'doms_cur' */
-
-/*
- * Special case: If a kmalloc of a doms_cur partition (array of
- * cpumask) fails, then fallback to a single sched domain,
- * as determined by the single cpumask fallback_doms.
- */
-static cpumask_var_t fallback_doms;
-
-/*
- * arch_update_cpu_topology lets virtualized architectures update the
- * cpu core maps. It is supposed to return 1 if the topology changed
- * or 0 if it stayed the same.
- */
-int __weak arch_update_cpu_topology(void)
+int sched_cpu_starting(unsigned int cpu)
{
+ sched_core_cpu_starting(cpu);
+ sched_rq_cpu_starting(cpu);
+ sched_tick_start(cpu);
return 0;
}
-cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
-{
- int i;
- cpumask_var_t *doms;
-
- doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
- if (!doms)
- return NULL;
- for (i = 0; i < ndoms; i++) {
- if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
- free_sched_domains(doms, i);
- return NULL;
- }
- }
- return doms;
-}
-
-void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
-{
- unsigned int i;
- for (i = 0; i < ndoms; i++)
- free_cpumask_var(doms[i]);
- kfree(doms);
-}
+#ifdef CONFIG_HOTPLUG_CPU
/*
- * Set up scheduler domains and groups. Callers must hold the hotplug lock.
- * For now this just excludes isolated cpus, but could be used to
- * exclude other special cases in the future.
+ * Invoked immediately before the stopper thread is invoked to bring the
+ * CPU down completely. At this point all per CPU kthreads except the
+ * hotplug thread (current) and the stopper thread (inactive) have been
+ * either parked or have been unbound from the outgoing CPU. Ensure that
+ * any of those which might be on the way out are gone.
+ *
+ * If after this point a bound task is being woken on this CPU then the
+ * responsible hotplug callback has failed to do it's job.
+ * sched_cpu_dying() will catch it with the appropriate fireworks.
*/
-static int init_sched_domains(const struct cpumask *cpu_map)
+int sched_cpu_wait_empty(unsigned int cpu)
{
- int err;
-
- arch_update_cpu_topology();
- ndoms_cur = 1;
- doms_cur = alloc_sched_domains(ndoms_cur);
- if (!doms_cur)
- doms_cur = &fallback_doms;
- cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
- err = build_sched_domains(doms_cur[0], NULL);
- register_sched_domain_sysctl();
-
- return err;
+ balance_hotplug_wait();
+ sched_force_init_mm();
+ return 0;
}
/*
- * Detach sched domains from a group of cpus specified in cpu_map
- * These cpus will now be attached to the NULL domain
+ * Since this CPU is going 'away' for a while, fold any nr_active delta we
+ * might have. Called from the CPU stopper task after ensuring that the
+ * stopper is the last running task on the CPU, so nr_active count is
+ * stable. We need to take the tear-down thread which is calling this into
+ * account, so we hand in adjust = 1 to the load calculation.
+ *
+ * Also see the comment "Global load-average calculations".
*/
-static void detach_destroy_domains(const struct cpumask *cpu_map)
-{
- int i;
-
- rcu_read_lock();
- for_each_cpu(i, cpu_map)
- cpu_attach_domain(NULL, &def_root_domain, i);
- rcu_read_unlock();
-}
-
-/* handle null as "default" */
-static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
- struct sched_domain_attr *new, int idx_new)
+static void calc_load_migrate(struct rq *rq)
{
- struct sched_domain_attr tmp;
+ long delta = calc_load_fold_active(rq, 1);
- /* fast path */
- if (!new && !cur)
- return 1;
-
- tmp = SD_ATTR_INIT;
- return !memcmp(cur ? (cur + idx_cur) : &tmp,
- new ? (new + idx_new) : &tmp,
- sizeof(struct sched_domain_attr));
+ if (delta)
+ atomic_long_add(delta, &calc_load_tasks);
}
-/*
- * Partition sched domains as specified by the 'ndoms_new'
- * cpumasks in the array doms_new[] of cpumasks. This compares
- * doms_new[] to the current sched domain partitioning, doms_cur[].
- * It destroys each deleted domain and builds each new domain.
- *
- * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
- * The masks don't intersect (don't overlap.) We should setup one
- * sched domain for each mask. CPUs not in any of the cpumasks will
- * not be load balanced. If the same cpumask appears both in the
- * current 'doms_cur' domains and in the new 'doms_new', we can leave
- * it as it is.
- *
- * The passed in 'doms_new' should be allocated using
- * alloc_sched_domains. This routine takes ownership of it and will
- * free_sched_domains it when done with it. If the caller failed the
- * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
- * and partition_sched_domains() will fallback to the single partition
- * 'fallback_doms', it also forces the domains to be rebuilt.
- *
- * If doms_new == NULL it will be replaced with cpu_online_mask.
- * ndoms_new == 0 is a special case for destroying existing domains,
- * and it will not create the default domain.
- *
- * Call with hotplug lock held
- */
-void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
- struct sched_domain_attr *dattr_new)
+static void dump_rq_tasks(struct rq *rq, const char *loglvl)
{
- int i, j, n;
- int new_topology;
-
- mutex_lock(&sched_domains_mutex);
+ struct task_struct *g, *p;
+ int cpu = cpu_of(rq);
- /* always unregister in case we don't destroy any domains */
- unregister_sched_domain_sysctl();
+ lockdep_assert_rq_held(rq);
- /* Let architecture update cpu core mappings. */
- new_topology = arch_update_cpu_topology();
+ printk("%sCPU%d enqueued tasks (%u total):\n", loglvl, cpu, rq->nr_running);
+ for_each_process_thread(g, p) {
+ if (task_cpu(p) != cpu)
+ continue;
- n = doms_new ? ndoms_new : 0;
+ if (!task_on_rq_queued(p))
+ continue;
- /* Destroy deleted domains */
- for (i = 0; i < ndoms_cur; i++) {
- for (j = 0; j < n && !new_topology; j++) {
- if (cpumask_equal(doms_cur[i], doms_new[j])
- && dattrs_equal(dattr_cur, i, dattr_new, j))
- goto match1;
- }
- /* no match - a current sched domain not in new doms_new[] */
- detach_destroy_domains(doms_cur[i]);
-match1:
- ;
- }
-
- n = ndoms_cur;
- if (doms_new == NULL) {
- n = 0;
- doms_new = &fallback_doms;
- cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
- WARN_ON_ONCE(dattr_new);
- }
-
- /* Build new domains */
- for (i = 0; i < ndoms_new; i++) {
- for (j = 0; j < n && !new_topology; j++) {
- if (cpumask_equal(doms_new[i], doms_cur[j])
- && dattrs_equal(dattr_new, i, dattr_cur, j))
- goto match2;
- }
- /* no match - add a new doms_new */
- build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
-match2:
- ;
+ printk("%s\tpid: %d, name: %s\n", loglvl, p->pid, p->comm);
}
-
- /* Remember the new sched domains */
- if (doms_cur != &fallback_doms)
- free_sched_domains(doms_cur, ndoms_cur);
- kfree(dattr_cur); /* kfree(NULL) is safe */
- doms_cur = doms_new;
- dattr_cur = dattr_new;
- ndoms_cur = ndoms_new;
-
- register_sched_domain_sysctl();
-
- mutex_unlock(&sched_domains_mutex);
}
-static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */
-
-/*
- * Update cpusets according to cpu_active mask. If cpusets are
- * disabled, cpuset_update_active_cpus() becomes a simple wrapper
- * around partition_sched_domains().
- *
- * If we come here as part of a suspend/resume, don't touch cpusets because we
- * want to restore it back to its original state upon resume anyway.
- */
-static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
- void *hcpu)
+int sched_cpu_dying(unsigned int cpu)
{
- switch (action) {
- case CPU_ONLINE_FROZEN:
- case CPU_DOWN_FAILED_FROZEN:
-
- /*
- * num_cpus_frozen tracks how many CPUs are involved in suspend
- * resume sequence. As long as this is not the last online
- * operation in the resume sequence, just build a single sched
- * domain, ignoring cpusets.
- */
- num_cpus_frozen--;
- if (likely(num_cpus_frozen)) {
- partition_sched_domains(1, NULL, NULL);
- break;
- }
+ struct rq *rq = cpu_rq(cpu);
+ struct rq_flags rf;
- /*
- * This is the last CPU online operation. So fall through and
- * restore the original sched domains by considering the
- * cpuset configurations.
- */
+ /* Handle pending wakeups and then migrate everything off */
+ sched_tick_stop(cpu);
- case CPU_ONLINE:
- cpuset_update_active_cpus(true);
- break;
- default:
- return NOTIFY_DONE;
+ rq_lock_irqsave(rq, &rf);
+ update_rq_clock(rq);
+ if (rq->nr_running != 1 || rq_has_pinned_tasks(rq)) {
+ WARN(true, "Dying CPU not properly vacated!");
+ dump_rq_tasks(rq, KERN_WARNING);
}
- return NOTIFY_OK;
-}
+ dl_server_stop(&rq->fair_server);
+ rq_unlock_irqrestore(rq, &rf);
-static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
- void *hcpu)
-{
- unsigned long flags;
- long cpu = (long)hcpu;
- struct dl_bw *dl_b;
- bool overflow;
- int cpus;
-
- switch (action) {
- case CPU_DOWN_PREPARE:
- rcu_read_lock_sched();
- dl_b = dl_bw_of(cpu);
-
- raw_spin_lock_irqsave(&dl_b->lock, flags);
- cpus = dl_bw_cpus(cpu);
- overflow = __dl_overflow(dl_b, cpus, 0, 0);
- raw_spin_unlock_irqrestore(&dl_b->lock, flags);
-
- rcu_read_unlock_sched();
-
- if (overflow)
- return notifier_from_errno(-EBUSY);
- cpuset_update_active_cpus(false);
- break;
- case CPU_DOWN_PREPARE_FROZEN:
- num_cpus_frozen++;
- partition_sched_domains(1, NULL, NULL);
- break;
- default:
- return NOTIFY_DONE;
- }
- return NOTIFY_OK;
+ calc_load_migrate(rq);
+ update_max_interval();
+ hrtick_clear(rq);
+ sched_core_cpu_dying(cpu);
+ return 0;
}
+#endif /* CONFIG_HOTPLUG_CPU */
void __init sched_init_smp(void)
{
- cpumask_var_t non_isolated_cpus;
-
- alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
- alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
+ sched_init_numa(NUMA_NO_NODE);
- sched_init_numa();
+ prandom_init_once(&sched_rnd_state);
/*
* There's no userspace yet to cause hotplug operations; hence all the
- * cpu masks are stable and all blatant races in the below code cannot
+ * CPU masks are stable and all blatant races in the below code cannot
* happen.
*/
- mutex_lock(&sched_domains_mutex);
- init_sched_domains(cpu_active_mask);
- cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
- if (cpumask_empty(non_isolated_cpus))
- cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
- mutex_unlock(&sched_domains_mutex);
-
- hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
- hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
- hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
-
- init_hrtick();
+ sched_domains_mutex_lock();
+ sched_init_domains(cpu_active_mask);
+ sched_domains_mutex_unlock();
/* Move init over to a non-isolated CPU */
- if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
+ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_DOMAIN)) < 0)
BUG();
+ current->flags &= ~PF_NO_SETAFFINITY;
sched_init_granularity();
- free_cpumask_var(non_isolated_cpus);
init_sched_rt_class();
init_sched_dl_class();
+
+ sched_init_dl_servers();
+
+ sched_smp_initialized = true;
}
-#else
-void __init sched_init_smp(void)
+
+static int __init migration_init(void)
{
- sched_init_granularity();
+ sched_cpu_starting(smp_processor_id());
+ return 0;
}
-#endif /* CONFIG_SMP */
-
-const_debug unsigned int sysctl_timer_migration = 1;
+early_initcall(migration_init);
int in_sched_functions(unsigned long addr)
{
@@ -7084,23 +8535,36 @@ int in_sched_functions(unsigned long addr)
*/
struct task_group root_task_group;
LIST_HEAD(task_groups);
-#endif
-DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
+/* Cacheline aligned slab cache for task_group */
+static struct kmem_cache *task_group_cache __ro_after_init;
+#endif
void __init sched_init(void)
{
- int i, j;
- unsigned long alloc_size = 0, ptr;
+ unsigned long ptr = 0;
+ int i;
+
+ /* Make sure the linker didn't screw up */
+ BUG_ON(!sched_class_above(&stop_sched_class, &dl_sched_class));
+ BUG_ON(!sched_class_above(&dl_sched_class, &rt_sched_class));
+ BUG_ON(!sched_class_above(&rt_sched_class, &fair_sched_class));
+ BUG_ON(!sched_class_above(&fair_sched_class, &idle_sched_class));
+#ifdef CONFIG_SCHED_CLASS_EXT
+ BUG_ON(!sched_class_above(&fair_sched_class, &ext_sched_class));
+ BUG_ON(!sched_class_above(&ext_sched_class, &idle_sched_class));
+#endif
+
+ wait_bit_init();
#ifdef CONFIG_FAIR_GROUP_SCHED
- alloc_size += 2 * nr_cpu_ids * sizeof(void **);
+ ptr += 2 * nr_cpu_ids * sizeof(void **);
#endif
#ifdef CONFIG_RT_GROUP_SCHED
- alloc_size += 2 * nr_cpu_ids * sizeof(void **);
+ ptr += 2 * nr_cpu_ids * sizeof(void **);
#endif
- if (alloc_size) {
- ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
+ if (ptr) {
+ ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT);
#ifdef CONFIG_FAIR_GROUP_SCHED
root_task_group.se = (struct sched_entity **)ptr;
@@ -7109,7 +8573,12 @@ void __init sched_init(void)
root_task_group.cfs_rq = (struct cfs_rq **)ptr;
ptr += nr_cpu_ids * sizeof(void **);
+ root_task_group.shares = ROOT_TASK_GROUP_LOAD;
+ init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL);
#endif /* CONFIG_FAIR_GROUP_SCHED */
+#ifdef CONFIG_EXT_GROUP_SCHED
+ scx_tg_init(&root_task_group);
+#endif /* CONFIG_EXT_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
root_task_group.rt_se = (struct sched_rt_entity **)ptr;
ptr += nr_cpu_ids * sizeof(void **);
@@ -7119,21 +8588,8 @@ void __init sched_init(void)
#endif /* CONFIG_RT_GROUP_SCHED */
}
-#ifdef CONFIG_CPUMASK_OFFSTACK
- for_each_possible_cpu(i) {
- per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
- cpumask_size(), GFP_KERNEL, cpu_to_node(i));
- }
-#endif /* CONFIG_CPUMASK_OFFSTACK */
-
- init_rt_bandwidth(&def_rt_bandwidth,
- global_rt_period(), global_rt_runtime());
- init_dl_bandwidth(&def_dl_bandwidth,
- global_rt_period(), global_rt_runtime());
-#ifdef CONFIG_SMP
init_defrootdomain();
-#endif
#ifdef CONFIG_RT_GROUP_SCHED
init_rt_bandwidth(&root_task_group.rt_bandwidth,
@@ -7141,18 +8597,19 @@ void __init sched_init(void)
#endif /* CONFIG_RT_GROUP_SCHED */
#ifdef CONFIG_CGROUP_SCHED
+ task_group_cache = KMEM_CACHE(task_group, 0);
+
list_add(&root_task_group.list, &task_groups);
INIT_LIST_HEAD(&root_task_group.children);
INIT_LIST_HEAD(&root_task_group.siblings);
autogroup_init(&init_task);
-
#endif /* CONFIG_CGROUP_SCHED */
for_each_possible_cpu(i) {
struct rq *rq;
rq = cpu_rq(i);
- raw_spin_lock_init(&rq->lock);
+ raw_spin_lock_init(&rq->__lock);
rq->nr_running = 0;
rq->calc_load_active = 0;
rq->calc_load_update = jiffies + LOAD_FREQ;
@@ -7160,46 +8617,43 @@ void __init sched_init(void)
init_rt_rq(&rq->rt);
init_dl_rq(&rq->dl);
#ifdef CONFIG_FAIR_GROUP_SCHED
- root_task_group.shares = ROOT_TASK_GROUP_LOAD;
INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
+ rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
/*
- * How much cpu bandwidth does root_task_group get?
+ * How much CPU bandwidth does root_task_group get?
*
- * In case of task-groups formed thr' the cgroup filesystem, it
- * gets 100% of the cpu resources in the system. This overall
- * system cpu resource is divided among the tasks of
+ * In case of task-groups formed through the cgroup filesystem, it
+ * gets 100% of the CPU resources in the system. This overall
+ * system CPU resource is divided among the tasks of
* root_task_group and its child task-groups in a fair manner,
* based on each entity's (task or task-group's) weight
* (se->load.weight).
*
* In other words, if root_task_group has 10 tasks of weight
* 1024) and two child groups A0 and A1 (of weight 1024 each),
- * then A0's share of the cpu resource is:
+ * then A0's share of the CPU resource is:
*
* A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
*
* We achieve this by letting root_task_group's tasks sit
* directly in rq->cfs (i.e root_task_group->se[] = NULL).
*/
- init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
#endif /* CONFIG_FAIR_GROUP_SCHED */
- rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
#ifdef CONFIG_RT_GROUP_SCHED
+ /*
+ * This is required for init cpu because rt.c:__enable_runtime()
+ * starts working after scheduler_running, which is not the case
+ * yet.
+ */
+ rq->rt.rt_runtime = global_rt_runtime();
init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
#endif
-
- for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
- rq->cpu_load[j] = 0;
-
- rq->last_load_update_tick = jiffies;
-
-#ifdef CONFIG_SMP
rq->sd = NULL;
rq->rd = NULL;
- rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
- rq->post_schedule = 0;
+ rq->cpu_capacity = SCHED_CAPACITY_SCALE;
+ rq->balance_callback = &balance_push_callback;
rq->active_balance = 0;
rq->next_balance = jiffies;
rq->push_cpu = 0;
@@ -7213,32 +8667,49 @@ void __init sched_init(void)
rq_attach_root(rq, &def_root_domain);
#ifdef CONFIG_NO_HZ_COMMON
- rq->nohz_flags = 0;
-#endif
-#ifdef CONFIG_NO_HZ_FULL
- rq->last_sched_tick = 0;
+ rq->last_blocked_load_update_tick = jiffies;
+ atomic_set(&rq->nohz_flags, 0);
+
+ INIT_CSD(&rq->nohz_csd, nohz_csd_func, rq);
#endif
+#ifdef CONFIG_HOTPLUG_CPU
+ rcuwait_init(&rq->hotplug_wait);
#endif
- init_rq_hrtick(rq);
+ hrtick_rq_init(rq);
atomic_set(&rq->nr_iowait, 0);
+ fair_server_init(rq);
+
+#ifdef CONFIG_SCHED_CORE
+ rq->core = rq;
+ rq->core_pick = NULL;
+ rq->core_dl_server = NULL;
+ rq->core_enabled = 0;
+ rq->core_tree = RB_ROOT;
+ rq->core_forceidle_count = 0;
+ rq->core_forceidle_occupation = 0;
+ rq->core_forceidle_start = 0;
+
+ rq->core_cookie = 0UL;
+#endif
+ zalloc_cpumask_var_node(&rq->scratch_mask, GFP_KERNEL, cpu_to_node(i));
}
- set_load_weight(&init_task);
-
-#ifdef CONFIG_PREEMPT_NOTIFIERS
- INIT_HLIST_HEAD(&init_task.preempt_notifiers);
-#endif
+ set_load_weight(&init_task, false);
+ init_task.se.slice = sysctl_sched_base_slice,
/*
* The boot idle thread does lazy MMU switching as well:
*/
- atomic_inc(&init_mm.mm_count);
+ mmgrab_lazy_tlb(&init_mm);
enter_lazy_tlb(&init_mm, current);
/*
- * During early bootup we pretend to be a normal task:
+ * The idle task doesn't need the kthread struct to function, but it
+ * is dressed up as a per-CPU kthread and thus needs to play the part
+ * if we want to avoid special-casing it in code that deals with per-CPU
+ * kthreads.
*/
- current->sched_class = &fair_sched_class;
+ WARN_ON(!set_kthread_struct(current));
/*
* Make us the idle thread. Technically, schedule() should not be
@@ -7246,115 +8717,187 @@ void __init sched_init(void)
* but because we are the idle thread, we just pick up running again
* when this runqueue becomes "idle".
*/
+ __sched_fork(0, current);
init_idle(current, smp_processor_id());
calc_load_update = jiffies + LOAD_FREQ;
-#ifdef CONFIG_SMP
- zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
- /* May be allocated at isolcpus cmdline parse time */
- if (cpu_isolated_map == NULL)
- zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
idle_thread_set_boot_cpu();
- set_cpu_rq_start_time();
-#endif
+
+ balance_push_set(smp_processor_id(), false);
init_sched_fair_class();
+ init_sched_ext_class();
+
+ psi_init();
+
+ init_uclamp();
+
+ preempt_dynamic_init();
scheduler_running = 1;
}
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
-static inline int preempt_count_equals(int preempt_offset)
-{
- int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
-
- return (nested == preempt_offset);
-}
-void __might_sleep(const char *file, int line, int preempt_offset)
+void __might_sleep(const char *file, int line)
{
+ unsigned int state = get_current_state();
/*
* Blocking primitives will set (and therefore destroy) current->state,
* since we will exit with TASK_RUNNING make sure we enter with it,
* otherwise we will destroy state.
*/
- WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change,
+ WARN_ONCE(state != TASK_RUNNING && current->task_state_change,
"do not call blocking ops when !TASK_RUNNING; "
- "state=%lx set at [<%p>] %pS\n",
- current->state,
+ "state=%x set at [<%p>] %pS\n", state,
(void *)current->task_state_change,
(void *)current->task_state_change);
- ___might_sleep(file, line, preempt_offset);
+ __might_resched(file, line, 0);
}
EXPORT_SYMBOL(__might_sleep);
-void ___might_sleep(const char *file, int line, int preempt_offset)
+static void print_preempt_disable_ip(int preempt_offset, unsigned long ip)
{
- static unsigned long prev_jiffy; /* ratelimiting */
+ if (!IS_ENABLED(CONFIG_DEBUG_PREEMPT))
+ return;
- rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
- if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
- !is_idle_task(current)) ||
- system_state != SYSTEM_RUNNING || oops_in_progress)
+ if (preempt_count() == preempt_offset)
return;
+
+ pr_err("Preemption disabled at:");
+ print_ip_sym(KERN_ERR, ip);
+}
+
+static inline bool resched_offsets_ok(unsigned int offsets)
+{
+ unsigned int nested = preempt_count();
+
+ nested += rcu_preempt_depth() << MIGHT_RESCHED_RCU_SHIFT;
+
+ return nested == offsets;
+}
+
+void __might_resched(const char *file, int line, unsigned int offsets)
+{
+ /* Ratelimiting timestamp: */
+ static unsigned long prev_jiffy;
+
+ unsigned long preempt_disable_ip;
+
+ /* WARN_ON_ONCE() by default, no rate limit required: */
+ rcu_sleep_check();
+
+ if ((resched_offsets_ok(offsets) && !irqs_disabled() &&
+ !is_idle_task(current) && !current->non_block_count) ||
+ system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
+ oops_in_progress)
+ return;
+
if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
return;
prev_jiffy = jiffies;
- printk(KERN_ERR
- "BUG: sleeping function called from invalid context at %s:%d\n",
- file, line);
- printk(KERN_ERR
- "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
- in_atomic(), irqs_disabled(),
- current->pid, current->comm);
+ /* Save this before calling printk(), since that will clobber it: */
+ preempt_disable_ip = get_preempt_disable_ip(current);
+
+ pr_err("BUG: sleeping function called from invalid context at %s:%d\n",
+ file, line);
+ pr_err("in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n",
+ in_atomic(), irqs_disabled(), current->non_block_count,
+ current->pid, current->comm);
+ pr_err("preempt_count: %x, expected: %x\n", preempt_count(),
+ offsets & MIGHT_RESCHED_PREEMPT_MASK);
+
+ if (IS_ENABLED(CONFIG_PREEMPT_RCU)) {
+ pr_err("RCU nest depth: %d, expected: %u\n",
+ rcu_preempt_depth(), offsets >> MIGHT_RESCHED_RCU_SHIFT);
+ }
if (task_stack_end_corrupted(current))
- printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
+ pr_emerg("Thread overran stack, or stack corrupted\n");
debug_show_held_locks(current);
if (irqs_disabled())
print_irqtrace_events(current);
-#ifdef CONFIG_DEBUG_PREEMPT
- if (!preempt_count_equals(preempt_offset)) {
- pr_err("Preemption disabled at:");
- print_ip_sym(current->preempt_disable_ip);
- pr_cont("\n");
- }
-#endif
+
+ print_preempt_disable_ip(offsets & MIGHT_RESCHED_PREEMPT_MASK,
+ preempt_disable_ip);
+
dump_stack();
+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}
-EXPORT_SYMBOL(___might_sleep);
-#endif
+EXPORT_SYMBOL(__might_resched);
-#ifdef CONFIG_MAGIC_SYSRQ
-static void normalize_task(struct rq *rq, struct task_struct *p)
+void __cant_sleep(const char *file, int line, int preempt_offset)
{
- const struct sched_class *prev_class = p->sched_class;
- struct sched_attr attr = {
- .sched_policy = SCHED_NORMAL,
- };
- int old_prio = p->prio;
- int queued;
-
- queued = task_on_rq_queued(p);
- if (queued)
- dequeue_task(rq, p, 0);
- __setscheduler(rq, p, &attr, false);
- if (queued) {
- enqueue_task(rq, p, 0);
- resched_curr(rq);
- }
+ static unsigned long prev_jiffy;
+
+ if (irqs_disabled())
+ return;
+
+ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
+ return;
+
+ if (preempt_count() > preempt_offset)
+ return;
+
+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
+ return;
+ prev_jiffy = jiffies;
+
+ printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line);
+ printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
+ in_atomic(), irqs_disabled(),
+ current->pid, current->comm);
+
+ debug_show_held_locks(current);
+ dump_stack();
+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
+}
+EXPORT_SYMBOL_GPL(__cant_sleep);
+
+# ifdef CONFIG_SMP
+void __cant_migrate(const char *file, int line)
+{
+ static unsigned long prev_jiffy;
+
+ if (irqs_disabled())
+ return;
+
+ if (is_migration_disabled(current))
+ return;
+
+ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
+ return;
+
+ if (preempt_count() > 0)
+ return;
+
+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
+ return;
+ prev_jiffy = jiffies;
+
+ pr_err("BUG: assuming non migratable context at %s:%d\n", file, line);
+ pr_err("in_atomic(): %d, irqs_disabled(): %d, migration_disabled() %u pid: %d, name: %s\n",
+ in_atomic(), irqs_disabled(), is_migration_disabled(current),
+ current->pid, current->comm);
- check_class_changed(rq, p, prev_class, old_prio);
+ debug_show_held_locks(current);
+ dump_stack();
+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}
+EXPORT_SYMBOL_GPL(__cant_migrate);
+# endif /* CONFIG_SMP */
+#endif /* CONFIG_DEBUG_ATOMIC_SLEEP */
+#ifdef CONFIG_MAGIC_SYSRQ
void normalize_rt_tasks(void)
{
struct task_struct *g, *p;
- unsigned long flags;
- struct rq *rq;
+ struct sched_attr attr = {
+ .sched_policy = SCHED_NORMAL,
+ };
read_lock(&tasklist_lock);
for_each_process_thread(g, p) {
@@ -7364,14 +8907,12 @@ void normalize_rt_tasks(void)
if (p->flags & PF_KTHREAD)
continue;
- p->se.exec_start = 0;
-#ifdef CONFIG_SCHEDSTATS
- p->se.statistics.wait_start = 0;
- p->se.statistics.sleep_start = 0;
- p->se.statistics.block_start = 0;
-#endif
+ p->se.exec_start = 0;
+ schedstat_set(p->stats.wait_start, 0);
+ schedstat_set(p->stats.sleep_start, 0);
+ schedstat_set(p->stats.block_start, 0);
- if (!dl_task(p) && !rt_task(p)) {
+ if (!rt_or_dl_task(p)) {
/*
* Renice negative nice level userspace
* tasks back to 0:
@@ -7381,18 +8922,16 @@ void normalize_rt_tasks(void)
continue;
}
- rq = task_rq_lock(p, &flags);
- normalize_task(rq, p);
- task_rq_unlock(rq, p, &flags);
+ __sched_setscheduler(p, &attr, false, false);
}
read_unlock(&tasklist_lock);
}
#endif /* CONFIG_MAGIC_SYSRQ */
-#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
+#ifdef CONFIG_KGDB_KDB
/*
- * These functions are only useful for the IA64 MCA handling, or kdb.
+ * These functions are only useful for KDB.
*
* They can only be called when the whole system has been
* stopped - every CPU needs to be quiescent, and no scheduling
@@ -7402,7 +8941,7 @@ void normalize_rt_tasks(void)
*/
/**
- * curr_task - return the current task for a given cpu.
+ * curr_task - return the current task for a given CPU.
* @cpu: the processor in question.
*
* ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
@@ -7414,41 +8953,48 @@ struct task_struct *curr_task(int cpu)
return cpu_curr(cpu);
}
-#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
-
-#ifdef CONFIG_IA64
-/**
- * set_curr_task - set the current task for a given cpu.
- * @cpu: the processor in question.
- * @p: the task pointer to set.
- *
- * Description: This function must only be used when non-maskable interrupts
- * are serviced on a separate stack. It allows the architecture to switch the
- * notion of the current task on a cpu in a non-blocking manner. This function
- * must be called with all CPU's synchronized, and interrupts disabled, the
- * and caller must save the original value of the current task (see
- * curr_task() above) and restore that value before reenabling interrupts and
- * re-starting the system.
- *
- * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
- */
-void set_curr_task(int cpu, struct task_struct *p)
-{
- cpu_curr(cpu) = p;
-}
-
-#endif
+#endif /* CONFIG_KGDB_KDB */
#ifdef CONFIG_CGROUP_SCHED
/* task_group_lock serializes the addition/removal of task groups */
static DEFINE_SPINLOCK(task_group_lock);
-static void free_sched_group(struct task_group *tg)
+static inline void alloc_uclamp_sched_group(struct task_group *tg,
+ struct task_group *parent)
+{
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+ enum uclamp_id clamp_id;
+
+ for_each_clamp_id(clamp_id) {
+ uclamp_se_set(&tg->uclamp_req[clamp_id],
+ uclamp_none(clamp_id), false);
+ tg->uclamp[clamp_id] = parent->uclamp[clamp_id];
+ }
+#endif
+}
+
+static void sched_free_group(struct task_group *tg)
{
free_fair_sched_group(tg);
free_rt_sched_group(tg);
autogroup_free(tg);
- kfree(tg);
+ kmem_cache_free(task_group_cache, tg);
+}
+
+static void sched_free_group_rcu(struct rcu_head *rcu)
+{
+ sched_free_group(container_of(rcu, struct task_group, rcu));
+}
+
+static void sched_unregister_group(struct task_group *tg)
+{
+ unregister_fair_sched_group(tg);
+ unregister_rt_sched_group(tg);
+ /*
+ * We have to wait for yet another RCU grace period to expire, as
+ * print_cfs_stats() might run concurrently.
+ */
+ call_rcu(&tg->rcu, sched_free_group_rcu);
}
/* allocate runqueue etc for a new task group */
@@ -7456,7 +9002,7 @@ struct task_group *sched_create_group(struct task_group *parent)
{
struct task_group *tg;
- tg = kzalloc(sizeof(*tg), GFP_KERNEL);
+ tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO);
if (!tg)
return ERR_PTR(-ENOMEM);
@@ -7466,10 +9012,13 @@ struct task_group *sched_create_group(struct task_group *parent)
if (!alloc_rt_sched_group(tg, parent))
goto err;
+ scx_tg_init(tg);
+ alloc_uclamp_sched_group(tg, parent);
+
return tg;
err:
- free_sched_group(tg);
+ sched_free_group(tg);
return ERR_PTR(-ENOMEM);
}
@@ -7478,66 +9027,58 @@ void sched_online_group(struct task_group *tg, struct task_group *parent)
unsigned long flags;
spin_lock_irqsave(&task_group_lock, flags);
- list_add_rcu(&tg->list, &task_groups);
+ list_add_tail_rcu(&tg->list, &task_groups);
- WARN_ON(!parent); /* root should already exist */
+ /* Root should already exist: */
+ WARN_ON(!parent);
tg->parent = parent;
INIT_LIST_HEAD(&tg->children);
list_add_rcu(&tg->siblings, &parent->children);
spin_unlock_irqrestore(&task_group_lock, flags);
+
+ online_fair_sched_group(tg);
}
-/* rcu callback to free various structures associated with a task group */
-static void free_sched_group_rcu(struct rcu_head *rhp)
+/* RCU callback to free various structures associated with a task group */
+static void sched_unregister_group_rcu(struct rcu_head *rhp)
{
- /* now it should be safe to free those cfs_rqs */
- free_sched_group(container_of(rhp, struct task_group, rcu));
+ /* Now it should be safe to free those cfs_rqs: */
+ sched_unregister_group(container_of(rhp, struct task_group, rcu));
}
-/* Destroy runqueue etc associated with a task group */
void sched_destroy_group(struct task_group *tg)
{
- /* wait for possible concurrent references to cfs_rqs complete */
- call_rcu(&tg->rcu, free_sched_group_rcu);
+ /* Wait for possible concurrent references to cfs_rqs complete: */
+ call_rcu(&tg->rcu, sched_unregister_group_rcu);
}
-void sched_offline_group(struct task_group *tg)
+void sched_release_group(struct task_group *tg)
{
unsigned long flags;
- int i;
-
- /* end participation in shares distribution */
- for_each_possible_cpu(i)
- unregister_fair_sched_group(tg, i);
+ /*
+ * Unlink first, to avoid walk_tg_tree_from() from finding us (via
+ * sched_cfs_period_timer()).
+ *
+ * For this to be effective, we have to wait for all pending users of
+ * this task group to leave their RCU critical section to ensure no new
+ * user will see our dying task group any more. Specifically ensure
+ * that tg_unthrottle_up() won't add decayed cfs_rq's to it.
+ *
+ * We therefore defer calling unregister_fair_sched_group() to
+ * sched_unregister_group() which is guarantied to get called only after the
+ * current RCU grace period has expired.
+ */
spin_lock_irqsave(&task_group_lock, flags);
list_del_rcu(&tg->list);
list_del_rcu(&tg->siblings);
spin_unlock_irqrestore(&task_group_lock, flags);
}
-/* change task's runqueue when it moves between groups.
- * The caller of this function should have put the task in its new group
- * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
- * reflect its new group.
- */
-void sched_move_task(struct task_struct *tsk)
+static void sched_change_group(struct task_struct *tsk)
{
struct task_group *tg;
- int queued, running;
- unsigned long flags;
- struct rq *rq;
-
- rq = task_rq_lock(tsk, &flags);
-
- running = task_current(rq, tsk);
- queued = task_on_rq_queued(tsk);
-
- if (queued)
- dequeue_task(rq, tsk, 0);
- if (unlikely(running))
- put_prev_task(rq, tsk);
/*
* All callers are synchronized by task_rq_lock(); we do not use RCU
@@ -7550,546 +9091,375 @@ void sched_move_task(struct task_struct *tsk)
tsk->sched_task_group = tg;
#ifdef CONFIG_FAIR_GROUP_SCHED
- if (tsk->sched_class->task_move_group)
- tsk->sched_class->task_move_group(tsk, queued);
+ if (tsk->sched_class->task_change_group)
+ tsk->sched_class->task_change_group(tsk);
else
#endif
set_task_rq(tsk, task_cpu(tsk));
-
- if (unlikely(running))
- tsk->sched_class->set_curr_task(rq);
- if (queued)
- enqueue_task(rq, tsk, 0);
-
- task_rq_unlock(rq, tsk, &flags);
}
-#endif /* CONFIG_CGROUP_SCHED */
-#ifdef CONFIG_RT_GROUP_SCHED
/*
- * Ensure that the real time constraints are schedulable.
+ * Change task's runqueue when it moves between groups.
+ *
+ * The caller of this function should have put the task in its new group by
+ * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect
+ * its new group.
*/
-static DEFINE_MUTEX(rt_constraints_mutex);
-
-/* Must be called with tasklist_lock held */
-static inline int tg_has_rt_tasks(struct task_group *tg)
+void sched_move_task(struct task_struct *tsk, bool for_autogroup)
{
- struct task_struct *g, *p;
+ unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
+ bool resched = false;
+ struct rq *rq;
- /*
- * Autogroups do not have RT tasks; see autogroup_create().
- */
- if (task_group_is_autogroup(tg))
- return 0;
+ CLASS(task_rq_lock, rq_guard)(tsk);
+ rq = rq_guard.rq;
- for_each_process_thread(g, p) {
- if (rt_task(p) && task_group(p) == tg)
- return 1;
+ scoped_guard (sched_change, tsk, queue_flags) {
+ sched_change_group(tsk);
+ if (!for_autogroup)
+ scx_cgroup_move_task(tsk);
+ if (scope->running)
+ resched = true;
}
- return 0;
+ if (resched)
+ resched_curr(rq);
}
-struct rt_schedulable_data {
- struct task_group *tg;
- u64 rt_period;
- u64 rt_runtime;
-};
-
-static int tg_rt_schedulable(struct task_group *tg, void *data)
+static struct cgroup_subsys_state *
+cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
- struct rt_schedulable_data *d = data;
- struct task_group *child;
- unsigned long total, sum = 0;
- u64 period, runtime;
-
- period = ktime_to_ns(tg->rt_bandwidth.rt_period);
- runtime = tg->rt_bandwidth.rt_runtime;
-
- if (tg == d->tg) {
- period = d->rt_period;
- runtime = d->rt_runtime;
- }
-
- /*
- * Cannot have more runtime than the period.
- */
- if (runtime > period && runtime != RUNTIME_INF)
- return -EINVAL;
-
- /*
- * Ensure we don't starve existing RT tasks.
- */
- if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
- return -EBUSY;
-
- total = to_ratio(period, runtime);
-
- /*
- * Nobody can have more than the global setting allows.
- */
- if (total > to_ratio(global_rt_period(), global_rt_runtime()))
- return -EINVAL;
-
- /*
- * The sum of our children's runtime should not exceed our own.
- */
- list_for_each_entry_rcu(child, &tg->children, siblings) {
- period = ktime_to_ns(child->rt_bandwidth.rt_period);
- runtime = child->rt_bandwidth.rt_runtime;
-
- if (child == d->tg) {
- period = d->rt_period;
- runtime = d->rt_runtime;
- }
+ struct task_group *parent = css_tg(parent_css);
+ struct task_group *tg;
- sum += to_ratio(period, runtime);
+ if (!parent) {
+ /* This is early initialization for the top cgroup */
+ return &root_task_group.css;
}
- if (sum > total)
- return -EINVAL;
+ tg = sched_create_group(parent);
+ if (IS_ERR(tg))
+ return ERR_PTR(-ENOMEM);
- return 0;
+ return &tg->css;
}
-static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
+/* Expose task group only after completing cgroup initialization */
+static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
{
+ struct task_group *tg = css_tg(css);
+ struct task_group *parent = css_tg(css->parent);
int ret;
- struct rt_schedulable_data data = {
- .tg = tg,
- .rt_period = period,
- .rt_runtime = runtime,
- };
-
- rcu_read_lock();
- ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
- rcu_read_unlock();
-
- return ret;
-}
-
-static int tg_set_rt_bandwidth(struct task_group *tg,
- u64 rt_period, u64 rt_runtime)
-{
- int i, err = 0;
-
- /*
- * Disallowing the root group RT runtime is BAD, it would disallow the
- * kernel creating (and or operating) RT threads.
- */
- if (tg == &root_task_group && rt_runtime == 0)
- return -EINVAL;
-
- /* No period doesn't make any sense. */
- if (rt_period == 0)
- return -EINVAL;
-
- mutex_lock(&rt_constraints_mutex);
- read_lock(&tasklist_lock);
- err = __rt_schedulable(tg, rt_period, rt_runtime);
- if (err)
- goto unlock;
-
- raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
- tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
- tg->rt_bandwidth.rt_runtime = rt_runtime;
+ ret = scx_tg_online(tg);
+ if (ret)
+ return ret;
- for_each_possible_cpu(i) {
- struct rt_rq *rt_rq = tg->rt_rq[i];
+ if (parent)
+ sched_online_group(tg, parent);
- raw_spin_lock(&rt_rq->rt_runtime_lock);
- rt_rq->rt_runtime = rt_runtime;
- raw_spin_unlock(&rt_rq->rt_runtime_lock);
- }
- raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
-unlock:
- read_unlock(&tasklist_lock);
- mutex_unlock(&rt_constraints_mutex);
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+ /* Propagate the effective uclamp value for the new group */
+ guard(mutex)(&uclamp_mutex);
+ guard(rcu)();
+ cpu_util_update_eff(css);
+#endif
- return err;
+ return 0;
}
-static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
+static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
{
- u64 rt_runtime, rt_period;
-
- rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
- rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
- if (rt_runtime_us < 0)
- rt_runtime = RUNTIME_INF;
+ struct task_group *tg = css_tg(css);
- return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
+ scx_tg_offline(tg);
}
-static long sched_group_rt_runtime(struct task_group *tg)
+static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
{
- u64 rt_runtime_us;
-
- if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
- return -1;
+ struct task_group *tg = css_tg(css);
- rt_runtime_us = tg->rt_bandwidth.rt_runtime;
- do_div(rt_runtime_us, NSEC_PER_USEC);
- return rt_runtime_us;
+ sched_release_group(tg);
}
-static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
+static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
{
- u64 rt_runtime, rt_period;
-
- rt_period = (u64)rt_period_us * NSEC_PER_USEC;
- rt_runtime = tg->rt_bandwidth.rt_runtime;
+ struct task_group *tg = css_tg(css);
- return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
+ /*
+ * Relies on the RCU grace period between css_released() and this.
+ */
+ sched_unregister_group(tg);
}
-static long sched_group_rt_period(struct task_group *tg)
+static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
{
- u64 rt_period_us;
-
- rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
- do_div(rt_period_us, NSEC_PER_USEC);
- return rt_period_us;
-}
-#endif /* CONFIG_RT_GROUP_SCHED */
-
#ifdef CONFIG_RT_GROUP_SCHED
-static int sched_rt_global_constraints(void)
-{
- int ret = 0;
+ struct task_struct *task;
+ struct cgroup_subsys_state *css;
- mutex_lock(&rt_constraints_mutex);
- read_lock(&tasklist_lock);
- ret = __rt_schedulable(NULL, 0, 0);
- read_unlock(&tasklist_lock);
- mutex_unlock(&rt_constraints_mutex);
+ if (!rt_group_sched_enabled())
+ goto scx_check;
- return ret;
+ cgroup_taskset_for_each(task, css, tset) {
+ if (!sched_rt_can_attach(css_tg(css), task))
+ return -EINVAL;
+ }
+scx_check:
+#endif /* CONFIG_RT_GROUP_SCHED */
+ return scx_cgroup_can_attach(tset);
}
-static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
+static void cpu_cgroup_attach(struct cgroup_taskset *tset)
{
- /* Don't accept realtime tasks when there is no way for them to run */
- if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
- return 0;
+ struct task_struct *task;
+ struct cgroup_subsys_state *css;
- return 1;
+ cgroup_taskset_for_each(task, css, tset)
+ sched_move_task(task, false);
}
-#else /* !CONFIG_RT_GROUP_SCHED */
-static int sched_rt_global_constraints(void)
+static void cpu_cgroup_cancel_attach(struct cgroup_taskset *tset)
{
- unsigned long flags;
- int i, ret = 0;
-
- raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
- for_each_possible_cpu(i) {
- struct rt_rq *rt_rq = &cpu_rq(i)->rt;
-
- raw_spin_lock(&rt_rq->rt_runtime_lock);
- rt_rq->rt_runtime = global_rt_runtime();
- raw_spin_unlock(&rt_rq->rt_runtime_lock);
- }
- raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
-
- return ret;
+ scx_cgroup_cancel_attach(tset);
}
-#endif /* CONFIG_RT_GROUP_SCHED */
-static int sched_dl_global_validate(void)
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+static void cpu_util_update_eff(struct cgroup_subsys_state *css)
{
- u64 runtime = global_rt_runtime();
- u64 period = global_rt_period();
- u64 new_bw = to_ratio(period, runtime);
- struct dl_bw *dl_b;
- int cpu, ret = 0;
- unsigned long flags;
+ struct cgroup_subsys_state *top_css = css;
+ struct uclamp_se *uc_parent = NULL;
+ struct uclamp_se *uc_se = NULL;
+ unsigned int eff[UCLAMP_CNT];
+ enum uclamp_id clamp_id;
+ unsigned int clamps;
- /*
- * Here we want to check the bandwidth not being set to some
- * value smaller than the currently allocated bandwidth in
- * any of the root_domains.
- *
- * FIXME: Cycling on all the CPUs is overdoing, but simpler than
- * cycling on root_domains... Discussion on different/better
- * solutions is welcome!
- */
- for_each_possible_cpu(cpu) {
- rcu_read_lock_sched();
- dl_b = dl_bw_of(cpu);
+ lockdep_assert_held(&uclamp_mutex);
+ WARN_ON_ONCE(!rcu_read_lock_held());
- raw_spin_lock_irqsave(&dl_b->lock, flags);
- if (new_bw < dl_b->total_bw)
- ret = -EBUSY;
- raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+ css_for_each_descendant_pre(css, top_css) {
+ uc_parent = css_tg(css)->parent
+ ? css_tg(css)->parent->uclamp : NULL;
- rcu_read_unlock_sched();
+ for_each_clamp_id(clamp_id) {
+ /* Assume effective clamps matches requested clamps */
+ eff[clamp_id] = css_tg(css)->uclamp_req[clamp_id].value;
+ /* Cap effective clamps with parent's effective clamps */
+ if (uc_parent &&
+ eff[clamp_id] > uc_parent[clamp_id].value) {
+ eff[clamp_id] = uc_parent[clamp_id].value;
+ }
+ }
+ /* Ensure protection is always capped by limit */
+ eff[UCLAMP_MIN] = min(eff[UCLAMP_MIN], eff[UCLAMP_MAX]);
+
+ /* Propagate most restrictive effective clamps */
+ clamps = 0x0;
+ uc_se = css_tg(css)->uclamp;
+ for_each_clamp_id(clamp_id) {
+ if (eff[clamp_id] == uc_se[clamp_id].value)
+ continue;
+ uc_se[clamp_id].value = eff[clamp_id];
+ uc_se[clamp_id].bucket_id = uclamp_bucket_id(eff[clamp_id]);
+ clamps |= (0x1 << clamp_id);
+ }
+ if (!clamps) {
+ css = css_rightmost_descendant(css);
+ continue;
+ }
- if (ret)
- break;
+ /* Immediately update descendants RUNNABLE tasks */
+ uclamp_update_active_tasks(css);
}
-
- return ret;
}
-static void sched_dl_do_global(void)
-{
- u64 new_bw = -1;
- struct dl_bw *dl_b;
- int cpu;
- unsigned long flags;
-
- def_dl_bandwidth.dl_period = global_rt_period();
- def_dl_bandwidth.dl_runtime = global_rt_runtime();
-
- if (global_rt_runtime() != RUNTIME_INF)
- new_bw = to_ratio(global_rt_period(), global_rt_runtime());
-
- /*
- * FIXME: As above...
- */
- for_each_possible_cpu(cpu) {
- rcu_read_lock_sched();
- dl_b = dl_bw_of(cpu);
-
- raw_spin_lock_irqsave(&dl_b->lock, flags);
- dl_b->bw = new_bw;
- raw_spin_unlock_irqrestore(&dl_b->lock, flags);
-
- rcu_read_unlock_sched();
- }
-}
+/*
+ * Integer 10^N with a given N exponent by casting to integer the literal "1eN"
+ * C expression. Since there is no way to convert a macro argument (N) into a
+ * character constant, use two levels of macros.
+ */
+#define _POW10(exp) ((unsigned int)1e##exp)
+#define POW10(exp) _POW10(exp)
+
+struct uclamp_request {
+#define UCLAMP_PERCENT_SHIFT 2
+#define UCLAMP_PERCENT_SCALE (100 * POW10(UCLAMP_PERCENT_SHIFT))
+ s64 percent;
+ u64 util;
+ int ret;
+};
-static int sched_rt_global_validate(void)
+static inline struct uclamp_request
+capacity_from_percent(char *buf)
{
- if (sysctl_sched_rt_period <= 0)
- return -EINVAL;
+ struct uclamp_request req = {
+ .percent = UCLAMP_PERCENT_SCALE,
+ .util = SCHED_CAPACITY_SCALE,
+ .ret = 0,
+ };
- if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
- (sysctl_sched_rt_runtime > sysctl_sched_rt_period))
- return -EINVAL;
+ buf = strim(buf);
+ if (strcmp(buf, "max")) {
+ req.ret = cgroup_parse_float(buf, UCLAMP_PERCENT_SHIFT,
+ &req.percent);
+ if (req.ret)
+ return req;
+ if ((u64)req.percent > UCLAMP_PERCENT_SCALE) {
+ req.ret = -ERANGE;
+ return req;
+ }
- return 0;
-}
+ req.util = req.percent << SCHED_CAPACITY_SHIFT;
+ req.util = DIV_ROUND_CLOSEST_ULL(req.util, UCLAMP_PERCENT_SCALE);
+ }
-static void sched_rt_do_global(void)
-{
- def_rt_bandwidth.rt_runtime = global_rt_runtime();
- def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
+ return req;
}
-int sched_rt_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off,
+ enum uclamp_id clamp_id)
{
- int old_period, old_runtime;
- static DEFINE_MUTEX(mutex);
- int ret;
+ struct uclamp_request req;
+ struct task_group *tg;
- mutex_lock(&mutex);
- old_period = sysctl_sched_rt_period;
- old_runtime = sysctl_sched_rt_runtime;
+ req = capacity_from_percent(buf);
+ if (req.ret)
+ return req.ret;
- ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ sched_uclamp_enable();
- if (!ret && write) {
- ret = sched_rt_global_validate();
- if (ret)
- goto undo;
+ guard(mutex)(&uclamp_mutex);
+ guard(rcu)();
- ret = sched_dl_global_validate();
- if (ret)
- goto undo;
+ tg = css_tg(of_css(of));
+ if (tg->uclamp_req[clamp_id].value != req.util)
+ uclamp_se_set(&tg->uclamp_req[clamp_id], req.util, false);
- ret = sched_rt_global_constraints();
- if (ret)
- goto undo;
+ /*
+ * Because of not recoverable conversion rounding we keep track of the
+ * exact requested value
+ */
+ tg->uclamp_pct[clamp_id] = req.percent;
- sched_rt_do_global();
- sched_dl_do_global();
- }
- if (0) {
-undo:
- sysctl_sched_rt_period = old_period;
- sysctl_sched_rt_runtime = old_runtime;
- }
- mutex_unlock(&mutex);
+ /* Update effective clamps to track the most restrictive value */
+ cpu_util_update_eff(of_css(of));
- return ret;
+ return nbytes;
}
-int sched_rr_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+static ssize_t cpu_uclamp_min_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
{
- int ret;
- static DEFINE_MUTEX(mutex);
-
- mutex_lock(&mutex);
- ret = proc_dointvec(table, write, buffer, lenp, ppos);
- /* make sure that internally we keep jiffies */
- /* also, writing zero resets timeslice to default */
- if (!ret && write) {
- sched_rr_timeslice = sched_rr_timeslice <= 0 ?
- RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice);
- }
- mutex_unlock(&mutex);
- return ret;
+ return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MIN);
}
-#ifdef CONFIG_CGROUP_SCHED
-
-static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
+static ssize_t cpu_uclamp_max_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
{
- return css ? container_of(css, struct task_group, css) : NULL;
+ return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MAX);
}
-static struct cgroup_subsys_state *
-cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
+static inline void cpu_uclamp_print(struct seq_file *sf,
+ enum uclamp_id clamp_id)
{
- struct task_group *parent = css_tg(parent_css);
struct task_group *tg;
+ u64 util_clamp;
+ u64 percent;
+ u32 rem;
- if (!parent) {
- /* This is early initialization for the top cgroup */
- return &root_task_group.css;
+ scoped_guard (rcu) {
+ tg = css_tg(seq_css(sf));
+ util_clamp = tg->uclamp_req[clamp_id].value;
}
- tg = sched_create_group(parent);
- if (IS_ERR(tg))
- return ERR_PTR(-ENOMEM);
+ if (util_clamp == SCHED_CAPACITY_SCALE) {
+ seq_puts(sf, "max\n");
+ return;
+ }
- return &tg->css;
+ percent = tg->uclamp_pct[clamp_id];
+ percent = div_u64_rem(percent, POW10(UCLAMP_PERCENT_SHIFT), &rem);
+ seq_printf(sf, "%llu.%0*u\n", percent, UCLAMP_PERCENT_SHIFT, rem);
}
-static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
+static int cpu_uclamp_min_show(struct seq_file *sf, void *v)
{
- struct task_group *tg = css_tg(css);
- struct task_group *parent = css_tg(css->parent);
-
- if (parent)
- sched_online_group(tg, parent);
+ cpu_uclamp_print(sf, UCLAMP_MIN);
return 0;
}
-static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
+static int cpu_uclamp_max_show(struct seq_file *sf, void *v)
{
- struct task_group *tg = css_tg(css);
-
- sched_destroy_group(tg);
-}
-
-static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
-{
- struct task_group *tg = css_tg(css);
-
- sched_offline_group(tg);
-}
-
-static void cpu_cgroup_fork(struct task_struct *task)
-{
- sched_move_task(task);
+ cpu_uclamp_print(sf, UCLAMP_MAX);
+ return 0;
}
+#endif /* CONFIG_UCLAMP_TASK_GROUP */
-static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
- struct cgroup_taskset *tset)
+#ifdef CONFIG_GROUP_SCHED_WEIGHT
+static unsigned long tg_weight(struct task_group *tg)
{
- struct task_struct *task;
-
- cgroup_taskset_for_each(task, tset) {
-#ifdef CONFIG_RT_GROUP_SCHED
- if (!sched_rt_can_attach(css_tg(css), task))
- return -EINVAL;
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ return scale_load_down(tg->shares);
#else
- /* We don't support RT-tasks being in separate groups */
- if (task->sched_class != &fair_sched_class)
- return -EINVAL;
+ return sched_weight_from_cgroup(tg->scx.weight);
#endif
- }
- return 0;
}
-static void cpu_cgroup_attach(struct cgroup_subsys_state *css,
- struct cgroup_taskset *tset)
-{
- struct task_struct *task;
-
- cgroup_taskset_for_each(task, tset)
- sched_move_task(task);
-}
-
-static void cpu_cgroup_exit(struct cgroup_subsys_state *css,
- struct cgroup_subsys_state *old_css,
- struct task_struct *task)
-{
- /*
- * cgroup_exit() is called in the copy_process() failure path.
- * Ignore this case since the task hasn't ran yet, this avoids
- * trying to poke a half freed task state from generic code.
- */
- if (!(task->flags & PF_EXITING))
- return;
-
- sched_move_task(task);
-}
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
struct cftype *cftype, u64 shareval)
{
- return sched_group_set_shares(css_tg(css), scale_load(shareval));
+ int ret;
+
+ if (shareval > scale_load_down(ULONG_MAX))
+ shareval = MAX_SHARES;
+ ret = sched_group_set_shares(css_tg(css), scale_load(shareval));
+ if (!ret)
+ scx_group_set_weight(css_tg(css),
+ sched_weight_to_cgroup(shareval));
+ return ret;
}
static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
struct cftype *cft)
{
- struct task_group *tg = css_tg(css);
-
- return (u64) scale_load_down(tg->shares);
+ return tg_weight(css_tg(css));
}
+#endif /* CONFIG_GROUP_SCHED_WEIGHT */
#ifdef CONFIG_CFS_BANDWIDTH
static DEFINE_MUTEX(cfs_constraints_mutex);
-const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
-const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
-
static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
-static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
+static int tg_set_cfs_bandwidth(struct task_group *tg,
+ u64 period_us, u64 quota_us, u64 burst_us)
{
int i, ret = 0, runtime_enabled, runtime_was_enabled;
struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+ u64 period, quota, burst;
- if (tg == &root_task_group)
- return -EINVAL;
+ period = (u64)period_us * NSEC_PER_USEC;
- /*
- * Ensure we have at some amount of bandwidth every period. This is
- * to prevent reaching a state of large arrears when throttled via
- * entity_tick() resulting in prolonged exit starvation.
- */
- if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
- return -EINVAL;
+ if (quota_us == RUNTIME_INF)
+ quota = RUNTIME_INF;
+ else
+ quota = (u64)quota_us * NSEC_PER_USEC;
- /*
- * Likewise, bound things on the otherside by preventing insane quota
- * periods. This also allows us to normalize in computing quota
- * feasibility.
- */
- if (period > max_cfs_quota_period)
- return -EINVAL;
+ burst = (u64)burst_us * NSEC_PER_USEC;
/*
* Prevent race between setting of cfs_rq->runtime_enabled and
* unthrottle_offline_cfs_rqs().
*/
- get_online_cpus();
- mutex_lock(&cfs_constraints_mutex);
+ guard(cpus_read_lock)();
+ guard(mutex)(&cfs_constraints_mutex);
+
ret = __cfs_schedulable(tg, period, quota);
if (ret)
- goto out_unlock;
+ return ret;
runtime_enabled = quota != RUNTIME_INF;
runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
@@ -8099,58 +9469,56 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
*/
if (runtime_enabled && !runtime_was_enabled)
cfs_bandwidth_usage_inc();
- raw_spin_lock_irq(&cfs_b->lock);
- cfs_b->period = ns_to_ktime(period);
- cfs_b->quota = quota;
- __refill_cfs_bandwidth_runtime(cfs_b);
- /* restart the period timer (if active) to handle new period expiry */
- if (runtime_enabled && cfs_b->timer_active) {
- /* force a reprogram */
- __start_cfs_bandwidth(cfs_b, true);
+ scoped_guard (raw_spinlock_irq, &cfs_b->lock) {
+ cfs_b->period = ns_to_ktime(period);
+ cfs_b->quota = quota;
+ cfs_b->burst = burst;
+
+ __refill_cfs_bandwidth_runtime(cfs_b);
+
+ /*
+ * Restart the period timer (if active) to handle new
+ * period expiry:
+ */
+ if (runtime_enabled)
+ start_cfs_bandwidth(cfs_b);
}
- raw_spin_unlock_irq(&cfs_b->lock);
for_each_online_cpu(i) {
struct cfs_rq *cfs_rq = tg->cfs_rq[i];
struct rq *rq = cfs_rq->rq;
- raw_spin_lock_irq(&rq->lock);
+ guard(rq_lock_irq)(rq);
cfs_rq->runtime_enabled = runtime_enabled;
- cfs_rq->runtime_remaining = 0;
+ cfs_rq->runtime_remaining = 1;
if (cfs_rq->throttled)
unthrottle_cfs_rq(cfs_rq);
- raw_spin_unlock_irq(&rq->lock);
}
+
if (runtime_was_enabled && !runtime_enabled)
cfs_bandwidth_usage_dec();
-out_unlock:
- mutex_unlock(&cfs_constraints_mutex);
- put_online_cpus();
- return ret;
+ return 0;
}
-int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
+static u64 tg_get_cfs_period(struct task_group *tg)
{
- u64 quota, period;
+ u64 cfs_period_us;
- period = ktime_to_ns(tg->cfs_bandwidth.period);
- if (cfs_quota_us < 0)
- quota = RUNTIME_INF;
- else
- quota = (u64)cfs_quota_us * NSEC_PER_USEC;
+ cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
+ do_div(cfs_period_us, NSEC_PER_USEC);
- return tg_set_cfs_bandwidth(tg, period, quota);
+ return cfs_period_us;
}
-long tg_get_cfs_quota(struct task_group *tg)
+static u64 tg_get_cfs_quota(struct task_group *tg)
{
u64 quota_us;
if (tg->cfs_bandwidth.quota == RUNTIME_INF)
- return -1;
+ return RUNTIME_INF;
quota_us = tg->cfs_bandwidth.quota;
do_div(quota_us, NSEC_PER_USEC);
@@ -8158,48 +9526,14 @@ long tg_get_cfs_quota(struct task_group *tg)
return quota_us;
}
-int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
-{
- u64 quota, period;
-
- period = (u64)cfs_period_us * NSEC_PER_USEC;
- quota = tg->cfs_bandwidth.quota;
-
- return tg_set_cfs_bandwidth(tg, period, quota);
-}
-
-long tg_get_cfs_period(struct task_group *tg)
-{
- u64 cfs_period_us;
-
- cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
- do_div(cfs_period_us, NSEC_PER_USEC);
-
- return cfs_period_us;
-}
-
-static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
- struct cftype *cft)
+static u64 tg_get_cfs_burst(struct task_group *tg)
{
- return tg_get_cfs_quota(css_tg(css));
-}
+ u64 burst_us;
-static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
- struct cftype *cftype, s64 cfs_quota_us)
-{
- return tg_set_cfs_quota(css_tg(css), cfs_quota_us);
-}
+ burst_us = tg->cfs_bandwidth.burst;
+ do_div(burst_us, NSEC_PER_USEC);
-static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
- struct cftype *cft)
-{
- return tg_get_cfs_period(css_tg(css));
-}
-
-static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
- struct cftype *cftype, u64 cfs_period_us)
-{
- return tg_set_cfs_period(css_tg(css), cfs_period_us);
+ return burst_us;
}
struct cfs_schedulable_data {
@@ -8246,13 +9580,23 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
parent_quota = parent_b->hierarchical_quota;
/*
- * ensure max(child_quota) <= parent_quota, inherit when no
- * limit is set
+ * Ensure max(child_quota) <= parent_quota. On cgroup2,
+ * always take the non-RUNTIME_INF min. On cgroup1, only
+ * inherit when no limit is set. In both cases this is used
+ * by the scheduler to determine if a given CFS task has a
+ * bandwidth constraint at some higher level.
*/
- if (quota == RUNTIME_INF)
- quota = parent_quota;
- else if (parent_quota != RUNTIME_INF && quota > parent_quota)
- return -EINVAL;
+ if (cgroup_subsys_on_dfl(cpu_cgrp_subsys)) {
+ if (quota == RUNTIME_INF)
+ quota = parent_quota;
+ else if (parent_quota != RUNTIME_INF)
+ quota = min(quota, parent_quota);
+ } else {
+ if (quota == RUNTIME_INF)
+ quota = parent_quota;
+ else if (parent_quota != RUNTIME_INF && quota > parent_quota)
+ return -EINVAL;
+ }
}
cfs_b->hierarchical_quota = quota;
@@ -8261,7 +9605,6 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
{
- int ret;
struct cfs_schedulable_data data = {
.tg = tg,
.period = period,
@@ -8273,14 +9616,11 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
do_div(data.quota, NSEC_PER_USEC);
}
- rcu_read_lock();
- ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
- rcu_read_unlock();
-
- return ret;
+ guard(rcu)();
+ return walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
}
-static int cpu_stats_show(struct seq_file *sf, void *v)
+static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
{
struct task_group *tg = css_tg(seq_css(sf));
struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
@@ -8289,10 +9629,183 @@ static int cpu_stats_show(struct seq_file *sf, void *v)
seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);
+ if (schedstat_enabled() && tg != &root_task_group) {
+ struct sched_statistics *stats;
+ u64 ws = 0;
+ int i;
+
+ for_each_possible_cpu(i) {
+ stats = __schedstats_from_se(tg->se[i]);
+ ws += schedstat_val(stats->wait_sum);
+ }
+
+ seq_printf(sf, "wait_sum %llu\n", ws);
+ }
+
+ seq_printf(sf, "nr_bursts %d\n", cfs_b->nr_burst);
+ seq_printf(sf, "burst_time %llu\n", cfs_b->burst_time);
+
+ return 0;
+}
+
+static u64 throttled_time_self(struct task_group *tg)
+{
+ int i;
+ u64 total = 0;
+
+ for_each_possible_cpu(i) {
+ total += READ_ONCE(tg->cfs_rq[i]->throttled_clock_self_time);
+ }
+
+ return total;
+}
+
+static int cpu_cfs_local_stat_show(struct seq_file *sf, void *v)
+{
+ struct task_group *tg = css_tg(seq_css(sf));
+
+ seq_printf(sf, "throttled_time %llu\n", throttled_time_self(tg));
+
return 0;
}
#endif /* CONFIG_CFS_BANDWIDTH */
-#endif /* CONFIG_FAIR_GROUP_SCHED */
+
+#ifdef CONFIG_GROUP_SCHED_BANDWIDTH
+const u64 max_bw_quota_period_us = 1 * USEC_PER_SEC; /* 1s */
+static const u64 min_bw_quota_period_us = 1 * USEC_PER_MSEC; /* 1ms */
+/* More than 203 days if BW_SHIFT equals 20. */
+static const u64 max_bw_runtime_us = MAX_BW;
+
+static void tg_bandwidth(struct task_group *tg,
+ u64 *period_us_p, u64 *quota_us_p, u64 *burst_us_p)
+{
+#ifdef CONFIG_CFS_BANDWIDTH
+ if (period_us_p)
+ *period_us_p = tg_get_cfs_period(tg);
+ if (quota_us_p)
+ *quota_us_p = tg_get_cfs_quota(tg);
+ if (burst_us_p)
+ *burst_us_p = tg_get_cfs_burst(tg);
+#else /* !CONFIG_CFS_BANDWIDTH */
+ if (period_us_p)
+ *period_us_p = tg->scx.bw_period_us;
+ if (quota_us_p)
+ *quota_us_p = tg->scx.bw_quota_us;
+ if (burst_us_p)
+ *burst_us_p = tg->scx.bw_burst_us;
+#endif /* CONFIG_CFS_BANDWIDTH */
+}
+
+static u64 cpu_period_read_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ u64 period_us;
+
+ tg_bandwidth(css_tg(css), &period_us, NULL, NULL);
+ return period_us;
+}
+
+static int tg_set_bandwidth(struct task_group *tg,
+ u64 period_us, u64 quota_us, u64 burst_us)
+{
+ const u64 max_usec = U64_MAX / NSEC_PER_USEC;
+ int ret = 0;
+
+ if (tg == &root_task_group)
+ return -EINVAL;
+
+ /* Values should survive translation to nsec */
+ if (period_us > max_usec ||
+ (quota_us != RUNTIME_INF && quota_us > max_usec) ||
+ burst_us > max_usec)
+ return -EINVAL;
+
+ /*
+ * Ensure we have some amount of bandwidth every period. This is to
+ * prevent reaching a state of large arrears when throttled via
+ * entity_tick() resulting in prolonged exit starvation.
+ */
+ if (quota_us < min_bw_quota_period_us ||
+ period_us < min_bw_quota_period_us)
+ return -EINVAL;
+
+ /*
+ * Likewise, bound things on the other side by preventing insane quota
+ * periods. This also allows us to normalize in computing quota
+ * feasibility.
+ */
+ if (period_us > max_bw_quota_period_us)
+ return -EINVAL;
+
+ /*
+ * Bound quota to defend quota against overflow during bandwidth shift.
+ */
+ if (quota_us != RUNTIME_INF && quota_us > max_bw_runtime_us)
+ return -EINVAL;
+
+ if (quota_us != RUNTIME_INF && (burst_us > quota_us ||
+ burst_us + quota_us > max_bw_runtime_us))
+ return -EINVAL;
+
+#ifdef CONFIG_CFS_BANDWIDTH
+ ret = tg_set_cfs_bandwidth(tg, period_us, quota_us, burst_us);
+#endif /* CONFIG_CFS_BANDWIDTH */
+ if (!ret)
+ scx_group_set_bandwidth(tg, period_us, quota_us, burst_us);
+ return ret;
+}
+
+static s64 cpu_quota_read_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ u64 quota_us;
+
+ tg_bandwidth(css_tg(css), NULL, &quota_us, NULL);
+ return quota_us; /* (s64)RUNTIME_INF becomes -1 */
+}
+
+static u64 cpu_burst_read_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ u64 burst_us;
+
+ tg_bandwidth(css_tg(css), NULL, NULL, &burst_us);
+ return burst_us;
+}
+
+static int cpu_period_write_u64(struct cgroup_subsys_state *css,
+ struct cftype *cftype, u64 period_us)
+{
+ struct task_group *tg = css_tg(css);
+ u64 quota_us, burst_us;
+
+ tg_bandwidth(tg, NULL, &quota_us, &burst_us);
+ return tg_set_bandwidth(tg, period_us, quota_us, burst_us);
+}
+
+static int cpu_quota_write_s64(struct cgroup_subsys_state *css,
+ struct cftype *cftype, s64 quota_us)
+{
+ struct task_group *tg = css_tg(css);
+ u64 period_us, burst_us;
+
+ if (quota_us < 0)
+ quota_us = RUNTIME_INF;
+
+ tg_bandwidth(tg, &period_us, NULL, &burst_us);
+ return tg_set_bandwidth(tg, period_us, quota_us, burst_us);
+}
+
+static int cpu_burst_write_u64(struct cgroup_subsys_state *css,
+ struct cftype *cftype, u64 burst_us)
+{
+ struct task_group *tg = css_tg(css);
+ u64 period_us, quota_us;
+
+ tg_bandwidth(tg, &period_us, &quota_us, NULL);
+ return tg_set_bandwidth(tg, period_us, quota_us, burst_us);
+}
+#endif /* CONFIG_GROUP_SCHED_BANDWIDTH */
#ifdef CONFIG_RT_GROUP_SCHED
static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
@@ -8320,31 +9833,84 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
}
#endif /* CONFIG_RT_GROUP_SCHED */
-static struct cftype cpu_files[] = {
-#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_GROUP_SCHED_WEIGHT
+static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return css_tg(css)->idle;
+}
+
+static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft, s64 idle)
+{
+ int ret;
+
+ ret = sched_group_set_idle(css_tg(css), idle);
+ if (!ret)
+ scx_group_set_idle(css_tg(css), idle);
+ return ret;
+}
+#endif /* CONFIG_GROUP_SCHED_WEIGHT */
+
+static struct cftype cpu_legacy_files[] = {
+#ifdef CONFIG_GROUP_SCHED_WEIGHT
{
.name = "shares",
.read_u64 = cpu_shares_read_u64,
.write_u64 = cpu_shares_write_u64,
},
+ {
+ .name = "idle",
+ .read_s64 = cpu_idle_read_s64,
+ .write_s64 = cpu_idle_write_s64,
+ },
#endif
-#ifdef CONFIG_CFS_BANDWIDTH
+#ifdef CONFIG_GROUP_SCHED_BANDWIDTH
+ {
+ .name = "cfs_period_us",
+ .read_u64 = cpu_period_read_u64,
+ .write_u64 = cpu_period_write_u64,
+ },
{
.name = "cfs_quota_us",
- .read_s64 = cpu_cfs_quota_read_s64,
- .write_s64 = cpu_cfs_quota_write_s64,
+ .read_s64 = cpu_quota_read_s64,
+ .write_s64 = cpu_quota_write_s64,
},
{
- .name = "cfs_period_us",
- .read_u64 = cpu_cfs_period_read_u64,
- .write_u64 = cpu_cfs_period_write_u64,
+ .name = "cfs_burst_us",
+ .read_u64 = cpu_burst_read_u64,
+ .write_u64 = cpu_burst_write_u64,
},
+#endif
+#ifdef CONFIG_CFS_BANDWIDTH
{
.name = "stat",
- .seq_show = cpu_stats_show,
+ .seq_show = cpu_cfs_stat_show,
+ },
+ {
+ .name = "stat.local",
+ .seq_show = cpu_cfs_local_stat_show,
+ },
+#endif
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+ {
+ .name = "uclamp.min",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cpu_uclamp_min_show,
+ .write = cpu_uclamp_min_write,
+ },
+ {
+ .name = "uclamp.max",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cpu_uclamp_max_show,
+ .write = cpu_uclamp_max_write,
},
#endif
+ { } /* Terminate */
+};
+
#ifdef CONFIG_RT_GROUP_SCHED
+static struct cftype rt_group_files[] = {
{
.name = "rt_runtime_us",
.read_s64 = cpu_rt_runtime_read,
@@ -8355,27 +9921,909 @@ static struct cftype cpu_files[] = {
.read_u64 = cpu_rt_period_read_uint,
.write_u64 = cpu_rt_period_write_uint,
},
+ { } /* Terminate */
+};
+
+# ifdef CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED
+DEFINE_STATIC_KEY_FALSE(rt_group_sched);
+# else
+DEFINE_STATIC_KEY_TRUE(rt_group_sched);
+# endif
+
+static int __init setup_rt_group_sched(char *str)
+{
+ long val;
+
+ if (kstrtol(str, 0, &val) || val < 0 || val > 1) {
+ pr_warn("Unable to set rt_group_sched\n");
+ return 1;
+ }
+ if (val)
+ static_branch_enable(&rt_group_sched);
+ else
+ static_branch_disable(&rt_group_sched);
+
+ return 1;
+}
+__setup("rt_group_sched=", setup_rt_group_sched);
+
+static int __init cpu_rt_group_init(void)
+{
+ if (!rt_group_sched_enabled())
+ return 0;
+
+ WARN_ON(cgroup_add_legacy_cftypes(&cpu_cgrp_subsys, rt_group_files));
+ return 0;
+}
+subsys_initcall(cpu_rt_group_init);
+#endif /* CONFIG_RT_GROUP_SCHED */
+
+static int cpu_extra_stat_show(struct seq_file *sf,
+ struct cgroup_subsys_state *css)
+{
+#ifdef CONFIG_CFS_BANDWIDTH
+ {
+ struct task_group *tg = css_tg(css);
+ struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+ u64 throttled_usec, burst_usec;
+
+ throttled_usec = cfs_b->throttled_time;
+ do_div(throttled_usec, NSEC_PER_USEC);
+ burst_usec = cfs_b->burst_time;
+ do_div(burst_usec, NSEC_PER_USEC);
+
+ seq_printf(sf, "nr_periods %d\n"
+ "nr_throttled %d\n"
+ "throttled_usec %llu\n"
+ "nr_bursts %d\n"
+ "burst_usec %llu\n",
+ cfs_b->nr_periods, cfs_b->nr_throttled,
+ throttled_usec, cfs_b->nr_burst, burst_usec);
+ }
+#endif /* CONFIG_CFS_BANDWIDTH */
+ return 0;
+}
+
+static int cpu_local_stat_show(struct seq_file *sf,
+ struct cgroup_subsys_state *css)
+{
+#ifdef CONFIG_CFS_BANDWIDTH
+ {
+ struct task_group *tg = css_tg(css);
+ u64 throttled_self_usec;
+
+ throttled_self_usec = throttled_time_self(tg);
+ do_div(throttled_self_usec, NSEC_PER_USEC);
+
+ seq_printf(sf, "throttled_usec %llu\n",
+ throttled_self_usec);
+ }
+#endif
+ return 0;
+}
+
+#ifdef CONFIG_GROUP_SCHED_WEIGHT
+
+static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return sched_weight_to_cgroup(tg_weight(css_tg(css)));
+}
+
+static int cpu_weight_write_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft, u64 cgrp_weight)
+{
+ unsigned long weight;
+ int ret;
+
+ if (cgrp_weight < CGROUP_WEIGHT_MIN || cgrp_weight > CGROUP_WEIGHT_MAX)
+ return -ERANGE;
+
+ weight = sched_weight_from_cgroup(cgrp_weight);
+
+ ret = sched_group_set_shares(css_tg(css), scale_load(weight));
+ if (!ret)
+ scx_group_set_weight(css_tg(css), cgrp_weight);
+ return ret;
+}
+
+static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ unsigned long weight = tg_weight(css_tg(css));
+ int last_delta = INT_MAX;
+ int prio, delta;
+
+ /* find the closest nice value to the current weight */
+ for (prio = 0; prio < ARRAY_SIZE(sched_prio_to_weight); prio++) {
+ delta = abs(sched_prio_to_weight[prio] - weight);
+ if (delta >= last_delta)
+ break;
+ last_delta = delta;
+ }
+
+ return PRIO_TO_NICE(prio - 1 + MAX_RT_PRIO);
+}
+
+static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft, s64 nice)
+{
+ unsigned long weight;
+ int idx, ret;
+
+ if (nice < MIN_NICE || nice > MAX_NICE)
+ return -ERANGE;
+
+ idx = NICE_TO_PRIO(nice) - MAX_RT_PRIO;
+ idx = array_index_nospec(idx, 40);
+ weight = sched_prio_to_weight[idx];
+
+ ret = sched_group_set_shares(css_tg(css), scale_load(weight));
+ if (!ret)
+ scx_group_set_weight(css_tg(css),
+ sched_weight_to_cgroup(weight));
+ return ret;
+}
+#endif /* CONFIG_GROUP_SCHED_WEIGHT */
+
+static void __maybe_unused cpu_period_quota_print(struct seq_file *sf,
+ long period, long quota)
+{
+ if (quota < 0)
+ seq_puts(sf, "max");
+ else
+ seq_printf(sf, "%ld", quota);
+
+ seq_printf(sf, " %ld\n", period);
+}
+
+/* caller should put the current value in *@periodp before calling */
+static int __maybe_unused cpu_period_quota_parse(char *buf, u64 *period_us_p,
+ u64 *quota_us_p)
+{
+ char tok[21]; /* U64_MAX */
+
+ if (sscanf(buf, "%20s %llu", tok, period_us_p) < 1)
+ return -EINVAL;
+
+ if (sscanf(tok, "%llu", quota_us_p) < 1) {
+ if (!strcmp(tok, "max"))
+ *quota_us_p = RUNTIME_INF;
+ else
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+#ifdef CONFIG_GROUP_SCHED_BANDWIDTH
+static int cpu_max_show(struct seq_file *sf, void *v)
+{
+ struct task_group *tg = css_tg(seq_css(sf));
+ u64 period_us, quota_us;
+
+ tg_bandwidth(tg, &period_us, &quota_us, NULL);
+ cpu_period_quota_print(sf, period_us, quota_us);
+ return 0;
+}
+
+static ssize_t cpu_max_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct task_group *tg = css_tg(of_css(of));
+ u64 period_us, quota_us, burst_us;
+ int ret;
+
+ tg_bandwidth(tg, &period_us, NULL, &burst_us);
+ ret = cpu_period_quota_parse(buf, &period_us, &quota_us);
+ if (!ret)
+ ret = tg_set_bandwidth(tg, period_us, quota_us, burst_us);
+ return ret ?: nbytes;
+}
+#endif /* CONFIG_CFS_BANDWIDTH */
+
+static struct cftype cpu_files[] = {
+#ifdef CONFIG_GROUP_SCHED_WEIGHT
+ {
+ .name = "weight",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_u64 = cpu_weight_read_u64,
+ .write_u64 = cpu_weight_write_u64,
+ },
+ {
+ .name = "weight.nice",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_s64 = cpu_weight_nice_read_s64,
+ .write_s64 = cpu_weight_nice_write_s64,
+ },
+ {
+ .name = "idle",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_s64 = cpu_idle_read_s64,
+ .write_s64 = cpu_idle_write_s64,
+ },
#endif
+#ifdef CONFIG_GROUP_SCHED_BANDWIDTH
+ {
+ .name = "max",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cpu_max_show,
+ .write = cpu_max_write,
+ },
+ {
+ .name = "max.burst",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_u64 = cpu_burst_read_u64,
+ .write_u64 = cpu_burst_write_u64,
+ },
+#endif /* CONFIG_CFS_BANDWIDTH */
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+ {
+ .name = "uclamp.min",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cpu_uclamp_min_show,
+ .write = cpu_uclamp_min_write,
+ },
+ {
+ .name = "uclamp.max",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cpu_uclamp_max_show,
+ .write = cpu_uclamp_max_write,
+ },
+#endif /* CONFIG_UCLAMP_TASK_GROUP */
{ } /* terminate */
};
struct cgroup_subsys cpu_cgrp_subsys = {
.css_alloc = cpu_cgroup_css_alloc,
- .css_free = cpu_cgroup_css_free,
.css_online = cpu_cgroup_css_online,
.css_offline = cpu_cgroup_css_offline,
- .fork = cpu_cgroup_fork,
+ .css_released = cpu_cgroup_css_released,
+ .css_free = cpu_cgroup_css_free,
+ .css_extra_stat_show = cpu_extra_stat_show,
+ .css_local_stat_show = cpu_local_stat_show,
.can_attach = cpu_cgroup_can_attach,
.attach = cpu_cgroup_attach,
- .exit = cpu_cgroup_exit,
- .legacy_cftypes = cpu_files,
- .early_init = 1,
+ .cancel_attach = cpu_cgroup_cancel_attach,
+ .legacy_cftypes = cpu_legacy_files,
+ .dfl_cftypes = cpu_files,
+ .early_init = true,
+ .threaded = true,
};
-#endif /* CONFIG_CGROUP_SCHED */
+#endif /* CONFIG_CGROUP_SCHED */
void dump_cpu_task(int cpu)
{
+ if (in_hardirq() && cpu == smp_processor_id()) {
+ struct pt_regs *regs;
+
+ regs = get_irq_regs();
+ if (regs) {
+ show_regs(regs);
+ return;
+ }
+ }
+
+ if (trigger_single_cpu_backtrace(cpu))
+ return;
+
pr_info("Task dump for CPU %d:\n", cpu);
sched_show_task(cpu_curr(cpu));
}
+
+/*
+ * Nice levels are multiplicative, with a gentle 10% change for every
+ * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
+ * nice 1, it will get ~10% less CPU time than another CPU-bound task
+ * that remained on nice 0.
+ *
+ * The "10% effect" is relative and cumulative: from _any_ nice level,
+ * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
+ * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
+ * If a task goes up by ~10% and another task goes down by ~10% then
+ * the relative distance between them is ~25%.)
+ */
+const int sched_prio_to_weight[40] = {
+ /* -20 */ 88761, 71755, 56483, 46273, 36291,
+ /* -15 */ 29154, 23254, 18705, 14949, 11916,
+ /* -10 */ 9548, 7620, 6100, 4904, 3906,
+ /* -5 */ 3121, 2501, 1991, 1586, 1277,
+ /* 0 */ 1024, 820, 655, 526, 423,
+ /* 5 */ 335, 272, 215, 172, 137,
+ /* 10 */ 110, 87, 70, 56, 45,
+ /* 15 */ 36, 29, 23, 18, 15,
+};
+
+/*
+ * Inverse (2^32/x) values of the sched_prio_to_weight[] array, pre-calculated.
+ *
+ * In cases where the weight does not change often, we can use the
+ * pre-calculated inverse to speed up arithmetics by turning divisions
+ * into multiplications:
+ */
+const u32 sched_prio_to_wmult[40] = {
+ /* -20 */ 48388, 59856, 76040, 92818, 118348,
+ /* -15 */ 147320, 184698, 229616, 287308, 360437,
+ /* -10 */ 449829, 563644, 704093, 875809, 1099582,
+ /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
+ /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
+ /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
+ /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
+ /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
+};
+
+void call_trace_sched_update_nr_running(struct rq *rq, int count)
+{
+ trace_sched_update_nr_running_tp(rq, count);
+}
+
+#ifdef CONFIG_SCHED_MM_CID
+/*
+ * Concurrency IDentifier management
+ *
+ * Serialization rules:
+ *
+ * mm::mm_cid::mutex: Serializes fork() and exit() and therefore
+ * protects mm::mm_cid::users.
+ *
+ * mm::mm_cid::lock: Serializes mm_update_max_cids() and
+ * mm_update_cpus_allowed(). Nests in mm_cid::mutex
+ * and runqueue lock.
+ *
+ * The mm_cidmask bitmap is not protected by any of the mm::mm_cid locks
+ * and can only be modified with atomic operations.
+ *
+ * The mm::mm_cid:pcpu per CPU storage is protected by the CPUs runqueue
+ * lock.
+ *
+ * CID ownership:
+ *
+ * A CID is either owned by a task (stored in task_struct::mm_cid.cid) or
+ * by a CPU (stored in mm::mm_cid.pcpu::cid). CIDs owned by CPUs have the
+ * MM_CID_ONCPU bit set. During transition from CPU to task ownership mode,
+ * MM_CID_TRANSIT is set on the per task CIDs. When this bit is set the
+ * task needs to drop the CID into the pool when scheduling out. Both bits
+ * (ONCPU and TRANSIT) are filtered out by task_cid() when the CID is
+ * actually handed over to user space in the RSEQ memory.
+ *
+ * Mode switching:
+ *
+ * Switching to per CPU mode happens when the user count becomes greater
+ * than the maximum number of CIDs, which is calculated by:
+ *
+ * opt_cids = min(mm_cid::nr_cpus_allowed, mm_cid::users);
+ * max_cids = min(1.25 * opt_cids, num_possible_cpus());
+ *
+ * The +25% allowance is useful for tight CPU masks in scenarios where only
+ * a few threads are created and destroyed to avoid frequent mode
+ * switches. Though this allowance shrinks, the closer opt_cids becomes to
+ * num_possible_cpus(), which is the (unfortunate) hard ABI limit.
+ *
+ * At the point of switching to per CPU mode the new user is not yet
+ * visible in the system, so the task which initiated the fork() runs the
+ * fixup function: mm_cid_fixup_tasks_to_cpu() walks the thread list and
+ * either transfers each tasks owned CID to the CPU the task runs on or
+ * drops it into the CID pool if a task is not on a CPU at that point in
+ * time. Tasks which schedule in before the task walk reaches them do the
+ * handover in mm_cid_schedin(). When mm_cid_fixup_tasks_to_cpus() completes
+ * it's guaranteed that no task related to that MM owns a CID anymore.
+ *
+ * Switching back to task mode happens when the user count goes below the
+ * threshold which was recorded on the per CPU mode switch:
+ *
+ * pcpu_thrs = min(opt_cids - (opt_cids / 4), num_possible_cpus() / 2);
+ *
+ * This threshold is updated when a affinity change increases the number of
+ * allowed CPUs for the MM, which might cause a switch back to per task
+ * mode.
+ *
+ * If the switch back was initiated by a exiting task, then that task runs
+ * the fixup function. If it was initiated by a affinity change, then it's
+ * run either in the deferred update function in context of a workqueue or
+ * by a task which forks a new one or by a task which exits. Whatever
+ * happens first. mm_cid_fixup_cpus_to_task() walks through the possible
+ * CPUs and either transfers the CPU owned CIDs to a related task which
+ * runs on the CPU or drops it into the pool. Tasks which schedule in on a
+ * CPU which the walk did not cover yet do the handover themself.
+ *
+ * This transition from CPU to per task ownership happens in two phases:
+ *
+ * 1) mm:mm_cid.transit contains MM_CID_TRANSIT This is OR'ed on the task
+ * CID and denotes that the CID is only temporarily owned by the
+ * task. When it schedules out the task drops the CID back into the
+ * pool if this bit is set.
+ *
+ * 2) The initiating context walks the per CPU space and after completion
+ * clears mm:mm_cid.transit. So after that point the CIDs are strictly
+ * task owned again.
+ *
+ * This two phase transition is required to prevent CID space exhaustion
+ * during the transition as a direct transfer of ownership would fail if
+ * two tasks are scheduled in on the same CPU before the fixup freed per
+ * CPU CIDs.
+ *
+ * When mm_cid_fixup_cpus_to_tasks() completes it's guaranteed that no CID
+ * related to that MM is owned by a CPU anymore.
+ */
+
+/*
+ * Update the CID range properties when the constraints change. Invoked via
+ * fork(), exit() and affinity changes
+ */
+static void __mm_update_max_cids(struct mm_mm_cid *mc)
+{
+ unsigned int opt_cids, max_cids;
+
+ /* Calculate the new optimal constraint */
+ opt_cids = min(mc->nr_cpus_allowed, mc->users);
+
+ /* Adjust the maximum CIDs to +25% limited by the number of possible CPUs */
+ max_cids = min(opt_cids + (opt_cids / 4), num_possible_cpus());
+ WRITE_ONCE(mc->max_cids, max_cids);
+}
+
+static inline unsigned int mm_cid_calc_pcpu_thrs(struct mm_mm_cid *mc)
+{
+ unsigned int opt_cids;
+
+ opt_cids = min(mc->nr_cpus_allowed, mc->users);
+ /* Has to be at least 1 because 0 indicates PCPU mode off */
+ return max(min(opt_cids - opt_cids / 4, num_possible_cpus() / 2), 1);
+}
+
+static bool mm_update_max_cids(struct mm_struct *mm)
+{
+ struct mm_mm_cid *mc = &mm->mm_cid;
+
+ lockdep_assert_held(&mm->mm_cid.lock);
+
+ /* Clear deferred mode switch flag. A change is handled by the caller */
+ mc->update_deferred = false;
+ __mm_update_max_cids(mc);
+
+ /* Check whether owner mode must be changed */
+ if (!mc->percpu) {
+ /* Enable per CPU mode when the number of users is above max_cids */
+ if (mc->users > mc->max_cids)
+ mc->pcpu_thrs = mm_cid_calc_pcpu_thrs(mc);
+ } else {
+ /* Switch back to per task if user count under threshold */
+ if (mc->users < mc->pcpu_thrs)
+ mc->pcpu_thrs = 0;
+ }
+
+ /* Mode change required? */
+ if (!!mc->percpu == !!mc->pcpu_thrs)
+ return false;
+ /* When switching back to per TASK mode, set the transition flag */
+ if (!mc->pcpu_thrs)
+ WRITE_ONCE(mc->transit, MM_CID_TRANSIT);
+ WRITE_ONCE(mc->percpu, !!mc->pcpu_thrs);
+ return true;
+}
+
+static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpumask *affmsk)
+{
+ struct cpumask *mm_allowed;
+ struct mm_mm_cid *mc;
+ unsigned int weight;
+
+ if (!mm || !READ_ONCE(mm->mm_cid.users))
+ return;
+ /*
+ * mm::mm_cid::mm_cpus_allowed is the superset of each threads
+ * allowed CPUs mask which means it can only grow.
+ */
+ mc = &mm->mm_cid;
+ guard(raw_spinlock)(&mc->lock);
+ mm_allowed = mm_cpus_allowed(mm);
+ weight = cpumask_weighted_or(mm_allowed, mm_allowed, affmsk);
+ if (weight == mc->nr_cpus_allowed)
+ return;
+
+ WRITE_ONCE(mc->nr_cpus_allowed, weight);
+ __mm_update_max_cids(mc);
+ if (!mc->percpu)
+ return;
+
+ /* Adjust the threshold to the wider set */
+ mc->pcpu_thrs = mm_cid_calc_pcpu_thrs(mc);
+ /* Switch back to per task mode? */
+ if (mc->users >= mc->pcpu_thrs)
+ return;
+
+ /* Don't queue twice */
+ if (mc->update_deferred)
+ return;
+
+ /* Queue the irq work, which schedules the real work */
+ mc->update_deferred = true;
+ irq_work_queue(&mc->irq_work);
+}
+
+static inline void mm_cid_transit_to_task(struct task_struct *t, struct mm_cid_pcpu *pcp)
+{
+ if (cid_on_cpu(t->mm_cid.cid)) {
+ unsigned int cid = cpu_cid_to_cid(t->mm_cid.cid);
+
+ t->mm_cid.cid = cid_to_transit_cid(cid);
+ pcp->cid = t->mm_cid.cid;
+ }
+}
+
+static void mm_cid_fixup_cpus_to_tasks(struct mm_struct *mm)
+{
+ unsigned int cpu;
+
+ /* Walk the CPUs and fixup all stale CIDs */
+ for_each_possible_cpu(cpu) {
+ struct mm_cid_pcpu *pcp = per_cpu_ptr(mm->mm_cid.pcpu, cpu);
+ struct rq *rq = cpu_rq(cpu);
+
+ /* Remote access to mm::mm_cid::pcpu requires rq_lock */
+ guard(rq_lock_irq)(rq);
+ /* Is the CID still owned by the CPU? */
+ if (cid_on_cpu(pcp->cid)) {
+ /*
+ * If rq->curr has @mm, transfer it with the
+ * transition bit set. Otherwise drop it.
+ */
+ if (rq->curr->mm == mm && rq->curr->mm_cid.active)
+ mm_cid_transit_to_task(rq->curr, pcp);
+ else
+ mm_drop_cid_on_cpu(mm, pcp);
+
+ } else if (rq->curr->mm == mm && rq->curr->mm_cid.active) {
+ unsigned int cid = rq->curr->mm_cid.cid;
+
+ /* Ensure it has the transition bit set */
+ if (!cid_in_transit(cid)) {
+ cid = cid_to_transit_cid(cid);
+ rq->curr->mm_cid.cid = cid;
+ pcp->cid = cid;
+ }
+ }
+ }
+ /* Clear the transition bit */
+ WRITE_ONCE(mm->mm_cid.transit, 0);
+}
+
+static inline void mm_cid_transfer_to_cpu(struct task_struct *t, struct mm_cid_pcpu *pcp)
+{
+ if (cid_on_task(t->mm_cid.cid)) {
+ t->mm_cid.cid = cid_to_cpu_cid(t->mm_cid.cid);
+ pcp->cid = t->mm_cid.cid;
+ }
+}
+
+static bool mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm)
+{
+ /* Remote access to mm::mm_cid::pcpu requires rq_lock */
+ guard(task_rq_lock)(t);
+ /* If the task is not active it is not in the users count */
+ if (!t->mm_cid.active)
+ return false;
+ if (cid_on_task(t->mm_cid.cid)) {
+ /* If running on the CPU, transfer the CID, otherwise drop it */
+ if (task_rq(t)->curr == t)
+ mm_cid_transfer_to_cpu(t, per_cpu_ptr(mm->mm_cid.pcpu, task_cpu(t)));
+ else
+ mm_unset_cid_on_task(t);
+ }
+ return true;
+}
+
+static void mm_cid_fixup_tasks_to_cpus(void)
+{
+ struct mm_struct *mm = current->mm;
+ struct task_struct *p, *t;
+ unsigned int users;
+
+ /*
+ * This can obviously race with a concurrent affinity change, which
+ * increases the number of allowed CPUs for this mm, but that does
+ * not affect the mode and only changes the CID constraints. A
+ * possible switch back to per task mode happens either in the
+ * deferred handler function or in the next fork()/exit().
+ *
+ * The caller has already transferred. The newly incoming task is
+ * already accounted for, but not yet visible.
+ */
+ users = mm->mm_cid.users - 2;
+ if (!users)
+ return;
+
+ guard(rcu)();
+ for_other_threads(current, t) {
+ if (mm_cid_fixup_task_to_cpu(t, mm))
+ users--;
+ }
+
+ if (!users)
+ return;
+
+ /* Happens only for VM_CLONE processes. */
+ for_each_process_thread(p, t) {
+ if (t == current || t->mm != mm)
+ continue;
+ if (mm_cid_fixup_task_to_cpu(t, mm)) {
+ if (--users == 0)
+ return;
+ }
+ }
+}
+
+static bool sched_mm_cid_add_user(struct task_struct *t, struct mm_struct *mm)
+{
+ t->mm_cid.active = 1;
+ mm->mm_cid.users++;
+ return mm_update_max_cids(mm);
+}
+
+void sched_mm_cid_fork(struct task_struct *t)
+{
+ struct mm_struct *mm = t->mm;
+ bool percpu;
+
+ WARN_ON_ONCE(!mm || t->mm_cid.cid != MM_CID_UNSET);
+
+ guard(mutex)(&mm->mm_cid.mutex);
+ scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
+ struct mm_cid_pcpu *pcp = this_cpu_ptr(mm->mm_cid.pcpu);
+
+ /* First user ? */
+ if (!mm->mm_cid.users) {
+ sched_mm_cid_add_user(t, mm);
+ t->mm_cid.cid = mm_get_cid(mm);
+ /* Required for execve() */
+ pcp->cid = t->mm_cid.cid;
+ return;
+ }
+
+ if (!sched_mm_cid_add_user(t, mm)) {
+ if (!mm->mm_cid.percpu)
+ t->mm_cid.cid = mm_get_cid(mm);
+ return;
+ }
+
+ /* Handle the mode change and transfer current's CID */
+ percpu = !!mm->mm_cid.percpu;
+ if (!percpu)
+ mm_cid_transit_to_task(current, pcp);
+ else
+ mm_cid_transfer_to_cpu(current, pcp);
+ }
+
+ if (percpu) {
+ mm_cid_fixup_tasks_to_cpus();
+ } else {
+ mm_cid_fixup_cpus_to_tasks(mm);
+ t->mm_cid.cid = mm_get_cid(mm);
+ }
+}
+
+static bool sched_mm_cid_remove_user(struct task_struct *t)
+{
+ t->mm_cid.active = 0;
+ scoped_guard(preempt) {
+ /* Clear the transition bit */
+ t->mm_cid.cid = cid_from_transit_cid(t->mm_cid.cid);
+ mm_unset_cid_on_task(t);
+ }
+ t->mm->mm_cid.users--;
+ return mm_update_max_cids(t->mm);
+}
+
+static bool __sched_mm_cid_exit(struct task_struct *t)
+{
+ struct mm_struct *mm = t->mm;
+
+ if (!sched_mm_cid_remove_user(t))
+ return false;
+ /*
+ * Contrary to fork() this only deals with a switch back to per
+ * task mode either because the above decreased users or an
+ * affinity change increased the number of allowed CPUs and the
+ * deferred fixup did not run yet.
+ */
+ if (WARN_ON_ONCE(mm->mm_cid.percpu))
+ return false;
+ /*
+ * A failed fork(2) cleanup never gets here, so @current must have
+ * the same MM as @t. That's true for exit() and the failed
+ * pthread_create() cleanup case.
+ */
+ if (WARN_ON_ONCE(current->mm != mm))
+ return false;
+ return true;
+}
+
+/*
+ * When a task exits, the MM CID held by the task is not longer required as
+ * the task cannot return to user space.
+ */
+void sched_mm_cid_exit(struct task_struct *t)
+{
+ struct mm_struct *mm = t->mm;
+
+ if (!mm || !t->mm_cid.active)
+ return;
+ /*
+ * Ensure that only one instance is doing MM CID operations within
+ * a MM. The common case is uncontended. The rare fixup case adds
+ * some overhead.
+ */
+ scoped_guard(mutex, &mm->mm_cid.mutex) {
+ /* mm_cid::mutex is sufficient to protect mm_cid::users */
+ if (likely(mm->mm_cid.users > 1)) {
+ scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
+ if (!__sched_mm_cid_exit(t))
+ return;
+ /* Mode change required. Transfer currents CID */
+ mm_cid_transit_to_task(current, this_cpu_ptr(mm->mm_cid.pcpu));
+ }
+ mm_cid_fixup_cpus_to_tasks(mm);
+ return;
+ }
+ /* Last user */
+ scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
+ /* Required across execve() */
+ if (t == current)
+ mm_cid_transit_to_task(t, this_cpu_ptr(mm->mm_cid.pcpu));
+ /* Ignore mode change. There is nothing to do. */
+ sched_mm_cid_remove_user(t);
+ }
+ }
+
+ /*
+ * As this is the last user (execve(), process exit or failed
+ * fork(2)) there is no concurrency anymore.
+ *
+ * Synchronize eventually pending work to ensure that there are no
+ * dangling references left. @t->mm_cid.users is zero so nothing
+ * can queue this work anymore.
+ */
+ irq_work_sync(&mm->mm_cid.irq_work);
+ cancel_work_sync(&mm->mm_cid.work);
+}
+
+/* Deactivate MM CID allocation across execve() */
+void sched_mm_cid_before_execve(struct task_struct *t)
+{
+ sched_mm_cid_exit(t);
+}
+
+/* Reactivate MM CID after successful execve() */
+void sched_mm_cid_after_execve(struct task_struct *t)
+{
+ sched_mm_cid_fork(t);
+}
+
+static void mm_cid_work_fn(struct work_struct *work)
+{
+ struct mm_struct *mm = container_of(work, struct mm_struct, mm_cid.work);
+
+ guard(mutex)(&mm->mm_cid.mutex);
+ /* Did the last user task exit already? */
+ if (!mm->mm_cid.users)
+ return;
+
+ scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
+ /* Have fork() or exit() handled it already? */
+ if (!mm->mm_cid.update_deferred)
+ return;
+ /* This clears mm_cid::update_deferred */
+ if (!mm_update_max_cids(mm))
+ return;
+ /* Affinity changes can only switch back to task mode */
+ if (WARN_ON_ONCE(mm->mm_cid.percpu))
+ return;
+ }
+ mm_cid_fixup_cpus_to_tasks(mm);
+}
+
+static void mm_cid_irq_work(struct irq_work *work)
+{
+ struct mm_struct *mm = container_of(work, struct mm_struct, mm_cid.irq_work);
+
+ /*
+ * Needs to be unconditional because mm_cid::lock cannot be held
+ * when scheduling work as mm_update_cpus_allowed() nests inside
+ * rq::lock and schedule_work() might end up in wakeup...
+ */
+ schedule_work(&mm->mm_cid.work);
+}
+
+void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
+{
+ mm->mm_cid.max_cids = 0;
+ mm->mm_cid.percpu = 0;
+ mm->mm_cid.transit = 0;
+ mm->mm_cid.nr_cpus_allowed = p->nr_cpus_allowed;
+ mm->mm_cid.users = 0;
+ mm->mm_cid.pcpu_thrs = 0;
+ mm->mm_cid.update_deferred = 0;
+ raw_spin_lock_init(&mm->mm_cid.lock);
+ mutex_init(&mm->mm_cid.mutex);
+ mm->mm_cid.irq_work = IRQ_WORK_INIT_HARD(mm_cid_irq_work);
+ INIT_WORK(&mm->mm_cid.work, mm_cid_work_fn);
+ cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask);
+ bitmap_zero(mm_cidmask(mm), num_possible_cpus());
+}
+#else /* CONFIG_SCHED_MM_CID */
+static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpumask *affmsk) { }
+#endif /* !CONFIG_SCHED_MM_CID */
+
+static DEFINE_PER_CPU(struct sched_change_ctx, sched_change_ctx);
+
+struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int flags)
+{
+ struct sched_change_ctx *ctx = this_cpu_ptr(&sched_change_ctx);
+ struct rq *rq = task_rq(p);
+
+ /*
+ * Must exclusively use matched flags since this is both dequeue and
+ * enqueue.
+ */
+ WARN_ON_ONCE(flags & 0xFFFF0000);
+
+ lockdep_assert_rq_held(rq);
+
+ if (!(flags & DEQUEUE_NOCLOCK)) {
+ update_rq_clock(rq);
+ flags |= DEQUEUE_NOCLOCK;
+ }
+
+ if (flags & DEQUEUE_CLASS) {
+ if (p->sched_class->switching_from)
+ p->sched_class->switching_from(rq, p);
+ }
+
+ *ctx = (struct sched_change_ctx){
+ .p = p,
+ .flags = flags,
+ .queued = task_on_rq_queued(p),
+ .running = task_current_donor(rq, p),
+ };
+
+ if (!(flags & DEQUEUE_CLASS)) {
+ if (p->sched_class->get_prio)
+ ctx->prio = p->sched_class->get_prio(rq, p);
+ else
+ ctx->prio = p->prio;
+ }
+
+ if (ctx->queued)
+ dequeue_task(rq, p, flags);
+ if (ctx->running)
+ put_prev_task(rq, p);
+
+ if ((flags & DEQUEUE_CLASS) && p->sched_class->switched_from)
+ p->sched_class->switched_from(rq, p);
+
+ return ctx;
+}
+
+void sched_change_end(struct sched_change_ctx *ctx)
+{
+ struct task_struct *p = ctx->p;
+ struct rq *rq = task_rq(p);
+
+ lockdep_assert_rq_held(rq);
+
+ if ((ctx->flags & ENQUEUE_CLASS) && p->sched_class->switching_to)
+ p->sched_class->switching_to(rq, p);
+
+ if (ctx->queued)
+ enqueue_task(rq, p, ctx->flags);
+ if (ctx->running)
+ set_next_task(rq, p);
+
+ if (ctx->flags & ENQUEUE_CLASS) {
+ if (p->sched_class->switched_to)
+ p->sched_class->switched_to(rq, p);
+ } else {
+ p->sched_class->prio_changed(rq, p, ctx->prio);
+ }
+}
diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c
new file mode 100644
index 000000000000..9ede71ecba7f
--- /dev/null
+++ b/kernel/sched/core_sched.c
@@ -0,0 +1,302 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/*
+ * A simple wrapper around refcount. An allocated sched_core_cookie's
+ * address is used to compute the cookie of the task.
+ */
+#include "sched.h"
+
+struct sched_core_cookie {
+ refcount_t refcnt;
+};
+
+static unsigned long sched_core_alloc_cookie(void)
+{
+ struct sched_core_cookie *ck = kmalloc(sizeof(*ck), GFP_KERNEL);
+ if (!ck)
+ return 0;
+
+ refcount_set(&ck->refcnt, 1);
+ sched_core_get();
+
+ return (unsigned long)ck;
+}
+
+static void sched_core_put_cookie(unsigned long cookie)
+{
+ struct sched_core_cookie *ptr = (void *)cookie;
+
+ if (ptr && refcount_dec_and_test(&ptr->refcnt)) {
+ kfree(ptr);
+ sched_core_put();
+ }
+}
+
+static unsigned long sched_core_get_cookie(unsigned long cookie)
+{
+ struct sched_core_cookie *ptr = (void *)cookie;
+
+ if (ptr)
+ refcount_inc(&ptr->refcnt);
+
+ return cookie;
+}
+
+/*
+ * sched_core_update_cookie - replace the cookie on a task
+ * @p: the task to update
+ * @cookie: the new cookie
+ *
+ * Effectively exchange the task cookie; caller is responsible for lifetimes on
+ * both ends.
+ *
+ * Returns: the old cookie
+ */
+static unsigned long sched_core_update_cookie(struct task_struct *p,
+ unsigned long cookie)
+{
+ unsigned long old_cookie;
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = task_rq_lock(p, &rf);
+
+ /*
+ * Since creating a cookie implies sched_core_get(), and we cannot set
+ * a cookie until after we've created it, similarly, we cannot destroy
+ * a cookie until after we've removed it, we must have core scheduling
+ * enabled here.
+ */
+ WARN_ON_ONCE((p->core_cookie || cookie) && !sched_core_enabled(rq));
+
+ if (sched_core_enqueued(p))
+ sched_core_dequeue(rq, p, DEQUEUE_SAVE);
+
+ old_cookie = p->core_cookie;
+ p->core_cookie = cookie;
+
+ /*
+ * Consider the cases: !prev_cookie and !cookie.
+ */
+ if (cookie && task_on_rq_queued(p))
+ sched_core_enqueue(rq, p);
+
+ /*
+ * If task is currently running, it may not be compatible anymore after
+ * the cookie change, so enter the scheduler on its CPU to schedule it
+ * away.
+ *
+ * Note that it is possible that as a result of this cookie change, the
+ * core has now entered/left forced idle state. Defer accounting to the
+ * next scheduling edge, rather than always forcing a reschedule here.
+ */
+ if (task_on_cpu(rq, p))
+ resched_curr(rq);
+
+ task_rq_unlock(rq, p, &rf);
+
+ return old_cookie;
+}
+
+static unsigned long sched_core_clone_cookie(struct task_struct *p)
+{
+ unsigned long cookie, flags;
+
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+ cookie = sched_core_get_cookie(p->core_cookie);
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
+ return cookie;
+}
+
+void sched_core_fork(struct task_struct *p)
+{
+ RB_CLEAR_NODE(&p->core_node);
+ p->core_cookie = sched_core_clone_cookie(current);
+}
+
+void sched_core_free(struct task_struct *p)
+{
+ sched_core_put_cookie(p->core_cookie);
+}
+
+static void __sched_core_set(struct task_struct *p, unsigned long cookie)
+{
+ cookie = sched_core_get_cookie(cookie);
+ cookie = sched_core_update_cookie(p, cookie);
+ sched_core_put_cookie(cookie);
+}
+
+/* Called from prctl interface: PR_SCHED_CORE */
+int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type,
+ unsigned long uaddr)
+{
+ unsigned long cookie = 0, id = 0;
+ struct task_struct *task, *p;
+ struct pid *grp;
+ int err = 0;
+
+ if (!static_branch_likely(&sched_smt_present))
+ return -ENODEV;
+
+ BUILD_BUG_ON(PR_SCHED_CORE_SCOPE_THREAD != PIDTYPE_PID);
+ BUILD_BUG_ON(PR_SCHED_CORE_SCOPE_THREAD_GROUP != PIDTYPE_TGID);
+ BUILD_BUG_ON(PR_SCHED_CORE_SCOPE_PROCESS_GROUP != PIDTYPE_PGID);
+
+ if (type > PIDTYPE_PGID || cmd >= PR_SCHED_CORE_MAX || pid < 0 ||
+ (cmd != PR_SCHED_CORE_GET && uaddr))
+ return -EINVAL;
+
+ rcu_read_lock();
+ if (pid == 0) {
+ task = current;
+ } else {
+ task = find_task_by_vpid(pid);
+ if (!task) {
+ rcu_read_unlock();
+ return -ESRCH;
+ }
+ }
+ get_task_struct(task);
+ rcu_read_unlock();
+
+ /*
+ * Check if this process has the right to modify the specified
+ * process. Use the regular "ptrace_may_access()" checks.
+ */
+ if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
+ err = -EPERM;
+ goto out;
+ }
+
+ switch (cmd) {
+ case PR_SCHED_CORE_GET:
+ if (type != PIDTYPE_PID || uaddr & 7) {
+ err = -EINVAL;
+ goto out;
+ }
+ cookie = sched_core_clone_cookie(task);
+ if (cookie) {
+ /* XXX improve ? */
+ ptr_to_hashval((void *)cookie, &id);
+ }
+ err = put_user(id, (u64 __user *)uaddr);
+ goto out;
+
+ case PR_SCHED_CORE_CREATE:
+ cookie = sched_core_alloc_cookie();
+ if (!cookie) {
+ err = -ENOMEM;
+ goto out;
+ }
+ break;
+
+ case PR_SCHED_CORE_SHARE_TO:
+ cookie = sched_core_clone_cookie(current);
+ break;
+
+ case PR_SCHED_CORE_SHARE_FROM:
+ if (type != PIDTYPE_PID) {
+ err = -EINVAL;
+ goto out;
+ }
+ cookie = sched_core_clone_cookie(task);
+ __sched_core_set(current, cookie);
+ goto out;
+
+ default:
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (type == PIDTYPE_PID) {
+ __sched_core_set(task, cookie);
+ goto out;
+ }
+
+ read_lock(&tasklist_lock);
+ grp = task_pid_type(task, type);
+
+ do_each_pid_thread(grp, type, p) {
+ if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) {
+ err = -EPERM;
+ goto out_tasklist;
+ }
+ } while_each_pid_thread(grp, type, p);
+
+ do_each_pid_thread(grp, type, p) {
+ __sched_core_set(p, cookie);
+ } while_each_pid_thread(grp, type, p);
+out_tasklist:
+ read_unlock(&tasklist_lock);
+
+out:
+ sched_core_put_cookie(cookie);
+ put_task_struct(task);
+ return err;
+}
+
+#ifdef CONFIG_SCHEDSTATS
+
+/* REQUIRES: rq->core's clock recently updated. */
+void __sched_core_account_forceidle(struct rq *rq)
+{
+ const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq));
+ u64 delta, now = rq_clock(rq->core);
+ struct rq *rq_i;
+ struct task_struct *p;
+ int i;
+
+ lockdep_assert_rq_held(rq);
+
+ WARN_ON_ONCE(!rq->core->core_forceidle_count);
+
+ if (rq->core->core_forceidle_start == 0)
+ return;
+
+ delta = now - rq->core->core_forceidle_start;
+ if (unlikely((s64)delta <= 0))
+ return;
+
+ rq->core->core_forceidle_start = now;
+
+ if (WARN_ON_ONCE(!rq->core->core_forceidle_occupation)) {
+ /* can't be forced idle without a running task */
+ } else if (rq->core->core_forceidle_count > 1 ||
+ rq->core->core_forceidle_occupation > 1) {
+ /*
+ * For larger SMT configurations, we need to scale the charged
+ * forced idle amount since there can be more than one forced
+ * idle sibling and more than one running cookied task.
+ */
+ delta *= rq->core->core_forceidle_count;
+ delta = div_u64(delta, rq->core->core_forceidle_occupation);
+ }
+
+ for_each_cpu(i, smt_mask) {
+ rq_i = cpu_rq(i);
+ p = rq_i->core_pick ?: rq_i->curr;
+
+ if (p == rq_i->idle)
+ continue;
+
+ /*
+ * Note: this will account forceidle to the current CPU, even
+ * if it comes from our SMT sibling.
+ */
+ __account_forceidle_time(p, delta);
+ }
+}
+
+void __sched_core_tick(struct rq *rq)
+{
+ if (!rq->core->core_forceidle_count)
+ return;
+
+ if (rq != rq->core)
+ update_rq_clock(rq->core);
+
+ __sched_core_account_forceidle(rq);
+}
+
+#endif /* CONFIG_SCHEDSTATS */
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index dd7cbb55bbf2..23a56ba12d81 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -1,14 +1,4 @@
-#include <linux/cgroup.h>
-#include <linux/slab.h>
-#include <linux/percpu.h>
-#include <linux/spinlock.h>
-#include <linux/cpumask.h>
-#include <linux/seq_file.h>
-#include <linux/rcupdate.h>
-#include <linux/kernel_stat.h>
-#include <linux/err.h>
-
-#include "sched.h"
+// SPDX-License-Identifier: GPL-2.0
/*
* CPU accounting code for task groups.
@@ -16,8 +6,10 @@
* Based on the work by Paul Menage (menage@google.com) and Balbir Singh
* (balbir@in.ibm.com).
*/
+#include <linux/sched/cputime.h>
+#include "sched.h"
-/* Time spent by the tasks of the cpu accounting group executing in ... */
+/* Time spent by the tasks of the CPU accounting group executing in ... */
enum cpuacct_stat_index {
CPUACCT_STAT_USER, /* ... user mode */
CPUACCT_STAT_SYSTEM, /* ... kernel mode */
@@ -25,12 +17,17 @@ enum cpuacct_stat_index {
CPUACCT_STAT_NSTATS,
};
-/* track cpu usage of a group of tasks and its child groups */
+static const char * const cpuacct_stat_desc[] = {
+ [CPUACCT_STAT_USER] = "user",
+ [CPUACCT_STAT_SYSTEM] = "system",
+};
+
+/* track CPU usage of a group of tasks and its child groups */
struct cpuacct {
- struct cgroup_subsys_state css;
- /* cpuusage holds pointer to a u64-type object on every cpu */
- u64 __percpu *cpuusage;
- struct kernel_cpustat __percpu *cpustat;
+ struct cgroup_subsys_state css;
+ /* cpuusage holds pointer to a u64-type object on every CPU */
+ u64 __percpu *cpuusage;
+ struct kernel_cpustat __percpu *cpustat;
};
static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css)
@@ -38,7 +35,7 @@ static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css)
return css ? container_of(css, struct cpuacct, css) : NULL;
}
-/* return cpu accounting group to which this task belongs */
+/* Return CPU accounting group to which this task belongs */
static inline struct cpuacct *task_ca(struct task_struct *tsk)
{
return css_ca(task_css(tsk, cpuacct_cgrp_id));
@@ -55,7 +52,7 @@ static struct cpuacct root_cpuacct = {
.cpuusage = &root_cpuacct_cpuusage,
};
-/* create a new cpu accounting group */
+/* Create a new CPU accounting group */
static struct cgroup_subsys_state *
cpuacct_css_alloc(struct cgroup_subsys_state *parent_css)
{
@@ -86,7 +83,7 @@ out:
return ERR_PTR(-ENOMEM);
}
-/* destroy an existing cpu accounting group */
+/* Destroy an existing CPU accounting group */
static void cpuacct_css_free(struct cgroup_subsys_state *css)
{
struct cpuacct *ca = css_ca(css);
@@ -96,116 +93,200 @@ static void cpuacct_css_free(struct cgroup_subsys_state *css)
kfree(ca);
}
-static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
+static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
+ enum cpuacct_stat_index index)
{
u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+ u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat;
u64 data;
+ /*
+ * We allow index == CPUACCT_STAT_NSTATS here to read
+ * the sum of usages.
+ */
+ if (WARN_ON_ONCE(index > CPUACCT_STAT_NSTATS))
+ return 0;
+
#ifndef CONFIG_64BIT
/*
* Take rq->lock to make 64-bit read safe on 32-bit platforms.
*/
- raw_spin_lock_irq(&cpu_rq(cpu)->lock);
- data = *cpuusage;
- raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
-#else
- data = *cpuusage;
+ raw_spin_rq_lock_irq(cpu_rq(cpu));
+#endif
+
+ switch (index) {
+ case CPUACCT_STAT_USER:
+ data = cpustat[CPUTIME_USER] + cpustat[CPUTIME_NICE];
+ break;
+ case CPUACCT_STAT_SYSTEM:
+ data = cpustat[CPUTIME_SYSTEM] + cpustat[CPUTIME_IRQ] +
+ cpustat[CPUTIME_SOFTIRQ];
+ break;
+ case CPUACCT_STAT_NSTATS:
+ data = *cpuusage;
+ break;
+ }
+
+#ifndef CONFIG_64BIT
+ raw_spin_rq_unlock_irq(cpu_rq(cpu));
#endif
return data;
}
-static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
+static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu)
{
u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+ u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat;
+
+ /* Don't allow to reset global kernel_cpustat */
+ if (ca == &root_cpuacct)
+ return;
#ifndef CONFIG_64BIT
/*
* Take rq->lock to make 64-bit write safe on 32-bit platforms.
*/
- raw_spin_lock_irq(&cpu_rq(cpu)->lock);
- *cpuusage = val;
- raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
-#else
- *cpuusage = val;
+ raw_spin_rq_lock_irq(cpu_rq(cpu));
+#endif
+ *cpuusage = 0;
+ cpustat[CPUTIME_USER] = cpustat[CPUTIME_NICE] = 0;
+ cpustat[CPUTIME_SYSTEM] = cpustat[CPUTIME_IRQ] = 0;
+ cpustat[CPUTIME_SOFTIRQ] = 0;
+
+#ifndef CONFIG_64BIT
+ raw_spin_rq_unlock_irq(cpu_rq(cpu));
#endif
}
-/* return total cpu usage (in nanoseconds) of a group */
-static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft)
+/* Return total CPU usage (in nanoseconds) of a group */
+static u64 __cpuusage_read(struct cgroup_subsys_state *css,
+ enum cpuacct_stat_index index)
{
struct cpuacct *ca = css_ca(css);
u64 totalcpuusage = 0;
int i;
- for_each_present_cpu(i)
- totalcpuusage += cpuacct_cpuusage_read(ca, i);
+ for_each_possible_cpu(i)
+ totalcpuusage += cpuacct_cpuusage_read(ca, i, index);
return totalcpuusage;
}
+static u64 cpuusage_user_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return __cpuusage_read(css, CPUACCT_STAT_USER);
+}
+
+static u64 cpuusage_sys_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return __cpuusage_read(css, CPUACCT_STAT_SYSTEM);
+}
+
+static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+ return __cpuusage_read(css, CPUACCT_STAT_NSTATS);
+}
+
static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
- u64 reset)
+ u64 val)
{
struct cpuacct *ca = css_ca(css);
- int err = 0;
- int i;
+ int cpu;
- if (reset) {
- err = -EINVAL;
- goto out;
- }
+ /*
+ * Only allow '0' here to do a reset.
+ */
+ if (val)
+ return -EINVAL;
- for_each_present_cpu(i)
- cpuacct_cpuusage_write(ca, i, 0);
+ for_each_possible_cpu(cpu)
+ cpuacct_cpuusage_write(ca, cpu);
-out:
- return err;
+ return 0;
}
-static int cpuacct_percpu_seq_show(struct seq_file *m, void *V)
+static int __cpuacct_percpu_seq_show(struct seq_file *m,
+ enum cpuacct_stat_index index)
{
struct cpuacct *ca = css_ca(seq_css(m));
u64 percpu;
int i;
- for_each_present_cpu(i) {
- percpu = cpuacct_cpuusage_read(ca, i);
+ for_each_possible_cpu(i) {
+ percpu = cpuacct_cpuusage_read(ca, i, index);
seq_printf(m, "%llu ", (unsigned long long) percpu);
}
seq_printf(m, "\n");
return 0;
}
-static const char * const cpuacct_stat_desc[] = {
- [CPUACCT_STAT_USER] = "user",
- [CPUACCT_STAT_SYSTEM] = "system",
-};
+static int cpuacct_percpu_user_seq_show(struct seq_file *m, void *V)
+{
+ return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_USER);
+}
+
+static int cpuacct_percpu_sys_seq_show(struct seq_file *m, void *V)
+{
+ return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_SYSTEM);
+}
+
+static int cpuacct_percpu_seq_show(struct seq_file *m, void *V)
+{
+ return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_NSTATS);
+}
+
+static int cpuacct_all_seq_show(struct seq_file *m, void *V)
+{
+ struct cpuacct *ca = css_ca(seq_css(m));
+ int index;
+ int cpu;
+
+ seq_puts(m, "cpu");
+ for (index = 0; index < CPUACCT_STAT_NSTATS; index++)
+ seq_printf(m, " %s", cpuacct_stat_desc[index]);
+ seq_puts(m, "\n");
+
+ for_each_possible_cpu(cpu) {
+ seq_printf(m, "%d", cpu);
+ for (index = 0; index < CPUACCT_STAT_NSTATS; index++)
+ seq_printf(m, " %llu",
+ cpuacct_cpuusage_read(ca, cpu, index));
+ seq_puts(m, "\n");
+ }
+ return 0;
+}
static int cpuacct_stats_show(struct seq_file *sf, void *v)
{
struct cpuacct *ca = css_ca(seq_css(sf));
+ struct task_cputime cputime;
+ u64 val[CPUACCT_STAT_NSTATS];
int cpu;
- s64 val = 0;
+ int stat;
- for_each_online_cpu(cpu) {
- struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
- val += kcpustat->cpustat[CPUTIME_USER];
- val += kcpustat->cpustat[CPUTIME_NICE];
- }
- val = cputime64_to_clock_t(val);
- seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val);
-
- val = 0;
- for_each_online_cpu(cpu) {
- struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
- val += kcpustat->cpustat[CPUTIME_SYSTEM];
- val += kcpustat->cpustat[CPUTIME_IRQ];
- val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
+ memset(&cputime, 0, sizeof(cputime));
+ for_each_possible_cpu(cpu) {
+ u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat;
+
+ cputime.utime += cpustat[CPUTIME_USER];
+ cputime.utime += cpustat[CPUTIME_NICE];
+ cputime.stime += cpustat[CPUTIME_SYSTEM];
+ cputime.stime += cpustat[CPUTIME_IRQ];
+ cputime.stime += cpustat[CPUTIME_SOFTIRQ];
+
+ cputime.sum_exec_runtime += *per_cpu_ptr(ca->cpuusage, cpu);
}
- val = cputime64_to_clock_t(val);
- seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
+ cputime_adjust(&cputime, &seq_css(sf)->cgroup->prev_cputime,
+ &val[CPUACCT_STAT_USER], &val[CPUACCT_STAT_SYSTEM]);
+
+ for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) {
+ seq_printf(sf, "%s %llu\n", cpuacct_stat_desc[stat],
+ nsec_to_clock_t(val[stat]));
+ }
return 0;
}
@@ -217,10 +298,30 @@ static struct cftype files[] = {
.write_u64 = cpuusage_write,
},
{
+ .name = "usage_user",
+ .read_u64 = cpuusage_user_read,
+ },
+ {
+ .name = "usage_sys",
+ .read_u64 = cpuusage_sys_read,
+ },
+ {
.name = "usage_percpu",
.seq_show = cpuacct_percpu_seq_show,
},
{
+ .name = "usage_percpu_user",
+ .seq_show = cpuacct_percpu_user_seq_show,
+ },
+ {
+ .name = "usage_percpu_sys",
+ .seq_show = cpuacct_percpu_sys_seq_show,
+ },
+ {
+ .name = "usage_all",
+ .seq_show = cpuacct_all_seq_show,
+ },
+ {
.name = "stat",
.seq_show = cpuacct_stats_show,
},
@@ -234,25 +335,13 @@ static struct cftype files[] = {
*/
void cpuacct_charge(struct task_struct *tsk, u64 cputime)
{
+ unsigned int cpu = task_cpu(tsk);
struct cpuacct *ca;
- int cpu;
-
- cpu = task_cpu(tsk);
- rcu_read_lock();
+ lockdep_assert_rq_held(cpu_rq(cpu));
- ca = task_ca(tsk);
-
- while (true) {
- u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
- *cpuusage += cputime;
-
- ca = parent_ca(ca);
- if (!ca)
- break;
- }
-
- rcu_read_unlock();
+ for (ca = task_ca(tsk); ca; ca = parent_ca(ca))
+ *per_cpu_ptr(ca->cpuusage, cpu) += cputime;
}
/*
@@ -260,24 +349,17 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime)
*
* Note: it's the caller that updates the account of the root cgroup.
*/
-void cpuacct_account_field(struct task_struct *p, int index, u64 val)
+void cpuacct_account_field(struct task_struct *tsk, int index, u64 val)
{
- struct kernel_cpustat *kcpustat;
struct cpuacct *ca;
- rcu_read_lock();
- ca = task_ca(p);
- while (ca != &root_cpuacct) {
- kcpustat = this_cpu_ptr(ca->cpustat);
- kcpustat->cpustat[index] += val;
- ca = parent_ca(ca);
- }
- rcu_read_unlock();
+ for (ca = task_ca(tsk); ca != &root_cpuacct; ca = parent_ca(ca))
+ __this_cpu_add(ca->cpustat->cpustat[index], val);
}
struct cgroup_subsys cpuacct_cgrp_subsys = {
.css_alloc = cpuacct_css_alloc,
.css_free = cpuacct_css_free,
.legacy_cftypes = files,
- .early_init = 1,
+ .early_init = true,
};
diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h
deleted file mode 100644
index ed605624a5e7..000000000000
--- a/kernel/sched/cpuacct.h
+++ /dev/null
@@ -1,17 +0,0 @@
-#ifdef CONFIG_CGROUP_CPUACCT
-
-extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
-extern void cpuacct_account_field(struct task_struct *p, int index, u64 val);
-
-#else
-
-static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime)
-{
-}
-
-static inline void
-cpuacct_account_field(struct task_struct *p, int index, u64 val)
-{
-}
-
-#endif
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index c6acb07466bb..37b572cc8aca 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -1,20 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
- * kernel/sched/cpudl.c
+ * kernel/sched/cpudeadline.c
*
* Global CPU deadline management
*
* Author: Juri Lelli <j.lelli@sssup.it>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
*/
-
-#include <linux/gfp.h>
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include "cpudeadline.h"
+#include "sched.h"
static inline int parent(int i)
{
@@ -31,61 +23,82 @@ static inline int right_child(int i)
return (i << 1) + 2;
}
-static inline int dl_time_before(u64 a, u64 b)
-{
- return (s64)(a - b) < 0;
-}
-
-static void cpudl_exchange(struct cpudl *cp, int a, int b)
+static void cpudl_heapify_down(struct cpudl *cp, int idx)
{
- int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu;
+ int l, r, largest;
- swap(cp->elements[a].cpu, cp->elements[b].cpu);
- swap(cp->elements[a].dl , cp->elements[b].dl );
+ int orig_cpu = cp->elements[idx].cpu;
+ u64 orig_dl = cp->elements[idx].dl;
- swap(cp->elements[cpu_a].idx, cp->elements[cpu_b].idx);
-}
-
-static void cpudl_heapify(struct cpudl *cp, int idx)
-{
- int l, r, largest;
+ if (left_child(idx) >= cp->size)
+ return;
/* adapted from lib/prio_heap.c */
- while(1) {
+ while (1) {
+ u64 largest_dl;
+
l = left_child(idx);
r = right_child(idx);
largest = idx;
+ largest_dl = orig_dl;
- if ((l < cp->size) && dl_time_before(cp->elements[idx].dl,
- cp->elements[l].dl))
+ if ((l < cp->size) && dl_time_before(orig_dl,
+ cp->elements[l].dl)) {
largest = l;
- if ((r < cp->size) && dl_time_before(cp->elements[largest].dl,
- cp->elements[r].dl))
+ largest_dl = cp->elements[l].dl;
+ }
+ if ((r < cp->size) && dl_time_before(largest_dl,
+ cp->elements[r].dl))
largest = r;
+
if (largest == idx)
break;
- /* Push idx down the heap one level and bump one up */
- cpudl_exchange(cp, largest, idx);
+ /* pull largest child onto idx */
+ cp->elements[idx].cpu = cp->elements[largest].cpu;
+ cp->elements[idx].dl = cp->elements[largest].dl;
+ cp->elements[cp->elements[idx].cpu].idx = idx;
idx = largest;
}
+ /* actual push down of saved original values orig_* */
+ cp->elements[idx].cpu = orig_cpu;
+ cp->elements[idx].dl = orig_dl;
+ cp->elements[cp->elements[idx].cpu].idx = idx;
}
-static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl)
+static void cpudl_heapify_up(struct cpudl *cp, int idx)
{
- WARN_ON(idx == IDX_INVALID || !cpu_present(idx));
+ int p;
- if (dl_time_before(new_dl, cp->elements[idx].dl)) {
- cp->elements[idx].dl = new_dl;
- cpudl_heapify(cp, idx);
- } else {
- cp->elements[idx].dl = new_dl;
- while (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl,
- cp->elements[idx].dl)) {
- cpudl_exchange(cp, idx, parent(idx));
- idx = parent(idx);
- }
- }
+ int orig_cpu = cp->elements[idx].cpu;
+ u64 orig_dl = cp->elements[idx].dl;
+
+ if (idx == 0)
+ return;
+
+ do {
+ p = parent(idx);
+ if (dl_time_before(orig_dl, cp->elements[p].dl))
+ break;
+ /* pull parent onto idx */
+ cp->elements[idx].cpu = cp->elements[p].cpu;
+ cp->elements[idx].dl = cp->elements[p].dl;
+ cp->elements[cp->elements[idx].cpu].idx = idx;
+ idx = p;
+ } while (idx != 0);
+ /* actual push up of saved original values orig_* */
+ cp->elements[idx].cpu = orig_cpu;
+ cp->elements[idx].dl = orig_dl;
+ cp->elements[cp->elements[idx].cpu].idx = idx;
+}
+
+static void cpudl_heapify(struct cpudl *cp, int idx)
+{
+ if (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl,
+ cp->elements[idx].dl))
+ cpudl_heapify_up(cp, idx);
+ else
+ cpudl_heapify_down(cp, idx);
}
static inline int cpudl_maximum(struct cpudl *cp)
@@ -99,42 +112,67 @@ static inline int cpudl_maximum(struct cpudl *cp)
* @p: the task
* @later_mask: a mask to fill in with the selected CPUs (or NULL)
*
- * Returns: int - best CPU (heap maximum if suitable)
+ * Returns: int - CPUs were found
*/
int cpudl_find(struct cpudl *cp, struct task_struct *p,
struct cpumask *later_mask)
{
- int best_cpu = -1;
const struct sched_dl_entity *dl_se = &p->dl;
if (later_mask &&
- cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) {
- best_cpu = cpumask_any(later_mask);
- goto out;
- } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) &&
- dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
- best_cpu = cpudl_maximum(cp);
- if (later_mask)
- cpumask_set_cpu(best_cpu, later_mask);
- }
+ cpumask_and(later_mask, cp->free_cpus, &p->cpus_mask)) {
+ unsigned long cap, max_cap = 0;
+ int cpu, max_cpu = -1;
+
+ if (!sched_asym_cpucap_active())
+ return 1;
+
+ /* Ensure the capacity of the CPUs fits the task. */
+ for_each_cpu(cpu, later_mask) {
+ if (!dl_task_fits_capacity(p, cpu)) {
+ cpumask_clear_cpu(cpu, later_mask);
+
+ cap = arch_scale_cpu_capacity(cpu);
+
+ if (cap > max_cap ||
+ (cpu == task_cpu(p) && cap == max_cap)) {
+ max_cap = cap;
+ max_cpu = cpu;
+ }
+ }
+ }
+
+ if (cpumask_empty(later_mask))
+ cpumask_set_cpu(max_cpu, later_mask);
+
+ return 1;
+ } else {
+ int best_cpu = cpudl_maximum(cp);
-out:
- WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
+ WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
- return best_cpu;
+ if (cpumask_test_cpu(best_cpu, &p->cpus_mask) &&
+ dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
+ if (later_mask)
+ cpumask_set_cpu(best_cpu, later_mask);
+
+ return 1;
+ }
+ }
+ return 0;
}
/*
- * cpudl_set - update the cpudl max-heap
+ * cpudl_clear - remove a CPU from the cpudl max-heap
* @cp: the cpudl max-heap context
- * @cpu: the target cpu
- * @dl: the new earliest deadline for this cpu
+ * @cpu: the target CPU
+ * @online: the online state of the deadline runqueue
*
* Notes: assumes cpu_rq(cpu)->lock is locked
*
* Returns: (void)
*/
-void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
+void cpudl_clear(struct cpudl *cp, int cpu, bool online)
{
int old_idx, new_cpu;
unsigned long flags;
@@ -142,68 +180,65 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
WARN_ON(!cpu_present(cpu));
raw_spin_lock_irqsave(&cp->lock, flags);
+
old_idx = cp->elements[cpu].idx;
- if (!is_valid) {
- /* remove item */
- if (old_idx == IDX_INVALID) {
- /*
- * Nothing to remove if old_idx was invalid.
- * This could happen if a rq_offline_dl is
- * called for a CPU without -dl tasks running.
- */
- goto out;
- }
+ if (old_idx == IDX_INVALID) {
+ /*
+ * Nothing to remove if old_idx was invalid.
+ * This could happen if rq_online_dl or rq_offline_dl is
+ * called for a CPU without -dl tasks running.
+ */
+ } else {
new_cpu = cp->elements[cp->size - 1].cpu;
cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl;
cp->elements[old_idx].cpu = new_cpu;
cp->size--;
cp->elements[new_cpu].idx = old_idx;
cp->elements[cpu].idx = IDX_INVALID;
- while (old_idx > 0 && dl_time_before(
- cp->elements[parent(old_idx)].dl,
- cp->elements[old_idx].dl)) {
- cpudl_exchange(cp, old_idx, parent(old_idx));
- old_idx = parent(old_idx);
- }
- cpumask_set_cpu(cpu, cp->free_cpus);
- cpudl_heapify(cp, old_idx);
-
- goto out;
+ cpudl_heapify(cp, old_idx);
}
+ if (likely(online))
+ __cpumask_set_cpu(cpu, cp->free_cpus);
+ else
+ __cpumask_clear_cpu(cpu, cp->free_cpus);
- if (old_idx == IDX_INVALID) {
- cp->size++;
- cp->elements[cp->size - 1].dl = 0;
- cp->elements[cp->size - 1].cpu = cpu;
- cp->elements[cpu].idx = cp->size - 1;
- cpudl_change_key(cp, cp->size - 1, dl);
- cpumask_clear_cpu(cpu, cp->free_cpus);
- } else {
- cpudl_change_key(cp, old_idx, dl);
- }
-
-out:
raw_spin_unlock_irqrestore(&cp->lock, flags);
}
/*
- * cpudl_set_freecpu - Set the cpudl.free_cpus
+ * cpudl_set - update the cpudl max-heap
* @cp: the cpudl max-heap context
- * @cpu: rd attached cpu
+ * @cpu: the target CPU
+ * @dl: the new earliest deadline for this CPU
+ *
+ * Notes: assumes cpu_rq(cpu)->lock is locked
+ *
+ * Returns: (void)
*/
-void cpudl_set_freecpu(struct cpudl *cp, int cpu)
+void cpudl_set(struct cpudl *cp, int cpu, u64 dl)
{
- cpumask_set_cpu(cpu, cp->free_cpus);
-}
+ int old_idx;
+ unsigned long flags;
-/*
- * cpudl_clear_freecpu - Clear the cpudl.free_cpus
- * @cp: the cpudl max-heap context
- * @cpu: rd attached cpu
- */
-void cpudl_clear_freecpu(struct cpudl *cp, int cpu)
-{
- cpumask_clear_cpu(cpu, cp->free_cpus);
+ WARN_ON(!cpu_present(cpu));
+
+ raw_spin_lock_irqsave(&cp->lock, flags);
+
+ old_idx = cp->elements[cpu].idx;
+ if (old_idx == IDX_INVALID) {
+ int new_idx = cp->size++;
+
+ cp->elements[new_idx].dl = dl;
+ cp->elements[new_idx].cpu = cpu;
+ cp->elements[cpu].idx = new_idx;
+ cpudl_heapify_up(cp, new_idx);
+ __cpumask_clear_cpu(cpu, cp->free_cpus);
+ } else {
+ cp->elements[old_idx].dl = dl;
+ cpudl_heapify(cp, old_idx);
+ }
+
+ raw_spin_unlock_irqrestore(&cp->lock, flags);
}
/*
@@ -214,7 +249,6 @@ int cpudl_init(struct cpudl *cp)
{
int i;
- memset(cp, 0, sizeof(*cp));
raw_spin_lock_init(&cp->lock);
cp->size = 0;
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index 1a0a6ef2fbe1..d7699468eedd 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -1,32 +1,24 @@
-#ifndef _LINUX_CPUDL_H
-#define _LINUX_CPUDL_H
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/types.h>
+#include <linux/spinlock.h>
-#include <linux/sched.h>
-
-#define IDX_INVALID -1
+#define IDX_INVALID -1
struct cpudl_item {
- u64 dl;
- int cpu;
- int idx;
+ u64 dl;
+ int cpu;
+ int idx;
};
struct cpudl {
- raw_spinlock_t lock;
- int size;
- cpumask_var_t free_cpus;
- struct cpudl_item *elements;
+ raw_spinlock_t lock;
+ int size;
+ cpumask_var_t free_cpus;
+ struct cpudl_item *elements;
};
-
-#ifdef CONFIG_SMP
-int cpudl_find(struct cpudl *cp, struct task_struct *p,
- struct cpumask *later_mask);
-void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid);
-int cpudl_init(struct cpudl *cp);
-void cpudl_set_freecpu(struct cpudl *cp, int cpu);
-void cpudl_clear_freecpu(struct cpudl *cp, int cpu);
+int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask);
+void cpudl_set(struct cpudl *cp, int cpu, u64 dl);
+void cpudl_clear(struct cpudl *cp, int cpu, bool online);
+int cpudl_init(struct cpudl *cp);
void cpudl_cleanup(struct cpudl *cp);
-#endif /* CONFIG_SMP */
-
-#endif /* _LINUX_CPUDL_H */
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c
new file mode 100644
index 000000000000..742fb9e62e1a
--- /dev/null
+++ b/kernel/sched/cpufreq.c
@@ -0,0 +1,75 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Scheduler code and data structures related to cpufreq.
+ *
+ * Copyright (C) 2016, Intel Corporation
+ * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+ */
+#include "sched.h"
+
+DEFINE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data);
+
+/**
+ * cpufreq_add_update_util_hook - Populate the CPU's update_util_data pointer.
+ * @cpu: The CPU to set the pointer for.
+ * @data: New pointer value.
+ * @func: Callback function to set for the CPU.
+ *
+ * Set and publish the update_util_data pointer for the given CPU.
+ *
+ * The update_util_data pointer of @cpu is set to @data and the callback
+ * function pointer in the target struct update_util_data is set to @func.
+ * That function will be called by cpufreq_update_util() from RCU-sched
+ * read-side critical sections, so it must not sleep. @data will always be
+ * passed to it as the first argument which allows the function to get to the
+ * target update_util_data structure and its container.
+ *
+ * The update_util_data pointer of @cpu must be NULL when this function is
+ * called or it will WARN() and return with no effect.
+ */
+void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data,
+ void (*func)(struct update_util_data *data, u64 time,
+ unsigned int flags))
+{
+ if (WARN_ON(!data || !func))
+ return;
+
+ if (WARN_ON(per_cpu(cpufreq_update_util_data, cpu)))
+ return;
+
+ data->func = func;
+ rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), data);
+}
+EXPORT_SYMBOL_GPL(cpufreq_add_update_util_hook);
+
+/**
+ * cpufreq_remove_update_util_hook - Clear the CPU's update_util_data pointer.
+ * @cpu: The CPU to clear the pointer for.
+ *
+ * Clear the update_util_data pointer for the given CPU.
+ *
+ * Callers must use RCU callbacks to free any memory that might be
+ * accessed via the old update_util_data pointer or invoke synchronize_rcu()
+ * right after this function to avoid use-after-free.
+ */
+void cpufreq_remove_update_util_hook(int cpu)
+{
+ rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), NULL);
+}
+EXPORT_SYMBOL_GPL(cpufreq_remove_update_util_hook);
+
+/**
+ * cpufreq_this_cpu_can_update - Check if cpufreq policy can be updated.
+ * @policy: cpufreq policy to check.
+ *
+ * Return 'true' if:
+ * - the local and remote CPUs share @policy,
+ * - dvfs_possible_from_any_cpu is set in @policy and the local CPU is not going
+ * offline (in which case it is not expected to run cpufreq updates any more).
+ */
+bool cpufreq_this_cpu_can_update(struct cpufreq_policy *policy)
+{
+ return cpumask_test_cpu(smp_processor_id(), policy->cpus) ||
+ (policy->dvfs_possible_from_any_cpu &&
+ rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)));
+}
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
new file mode 100644
index 000000000000..0ab5f9d4bc59
--- /dev/null
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -0,0 +1,937 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * CPUFreq governor based on scheduler-provided CPU utilization data.
+ *
+ * Copyright (C) 2016, Intel Corporation
+ * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+ */
+#include <uapi/linux/sched/types.h>
+#include "sched.h"
+
+#define IOWAIT_BOOST_MIN (SCHED_CAPACITY_SCALE / 8)
+
+struct sugov_tunables {
+ struct gov_attr_set attr_set;
+ unsigned int rate_limit_us;
+};
+
+struct sugov_policy {
+ struct cpufreq_policy *policy;
+
+ struct sugov_tunables *tunables;
+ struct list_head tunables_hook;
+
+ raw_spinlock_t update_lock;
+ u64 last_freq_update_time;
+ s64 freq_update_delay_ns;
+ unsigned int next_freq;
+ unsigned int cached_raw_freq;
+
+ /* The next fields are only needed if fast switch cannot be used: */
+ struct irq_work irq_work;
+ struct kthread_work work;
+ struct mutex work_lock;
+ struct kthread_worker worker;
+ struct task_struct *thread;
+ bool work_in_progress;
+
+ bool limits_changed;
+ bool need_freq_update;
+};
+
+struct sugov_cpu {
+ struct update_util_data update_util;
+ struct sugov_policy *sg_policy;
+ unsigned int cpu;
+
+ bool iowait_boost_pending;
+ unsigned int iowait_boost;
+ u64 last_update;
+
+ unsigned long util;
+ unsigned long bw_min;
+
+ /* The field below is for single-CPU policies only: */
+#ifdef CONFIG_NO_HZ_COMMON
+ unsigned long saved_idle_calls;
+#endif
+};
+
+static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu);
+
+/************************ Governor internals ***********************/
+
+static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
+{
+ s64 delta_ns;
+
+ /*
+ * Since cpufreq_update_util() is called with rq->lock held for
+ * the @target_cpu, our per-CPU data is fully serialized.
+ *
+ * However, drivers cannot in general deal with cross-CPU
+ * requests, so while get_next_freq() will work, our
+ * sugov_update_commit() call may not for the fast switching platforms.
+ *
+ * Hence stop here for remote requests if they aren't supported
+ * by the hardware, as calculating the frequency is pointless if
+ * we cannot in fact act on it.
+ *
+ * This is needed on the slow switching platforms too to prevent CPUs
+ * going offline from leaving stale IRQ work items behind.
+ */
+ if (!cpufreq_this_cpu_can_update(sg_policy->policy))
+ return false;
+
+ if (unlikely(READ_ONCE(sg_policy->limits_changed))) {
+ WRITE_ONCE(sg_policy->limits_changed, false);
+ sg_policy->need_freq_update = true;
+
+ /*
+ * The above limits_changed update must occur before the reads
+ * of policy limits in cpufreq_driver_resolve_freq() or a policy
+ * limits update might be missed, so use a memory barrier to
+ * ensure it.
+ *
+ * This pairs with the write memory barrier in sugov_limits().
+ */
+ smp_mb();
+
+ return true;
+ } else if (sg_policy->need_freq_update) {
+ /* ignore_dl_rate_limit() wants a new frequency to be found. */
+ return true;
+ }
+
+ delta_ns = time - sg_policy->last_freq_update_time;
+
+ return delta_ns >= sg_policy->freq_update_delay_ns;
+}
+
+static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time,
+ unsigned int next_freq)
+{
+ if (sg_policy->need_freq_update) {
+ sg_policy->need_freq_update = false;
+ /*
+ * The policy limits have changed, but if the return value of
+ * cpufreq_driver_resolve_freq() after applying the new limits
+ * is still equal to the previously selected frequency, the
+ * driver callback need not be invoked unless the driver
+ * specifically wants that to happen on every update of the
+ * policy limits.
+ */
+ if (sg_policy->next_freq == next_freq &&
+ !cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS))
+ return false;
+ } else if (sg_policy->next_freq == next_freq) {
+ return false;
+ }
+
+ sg_policy->next_freq = next_freq;
+ sg_policy->last_freq_update_time = time;
+
+ return true;
+}
+
+static void sugov_deferred_update(struct sugov_policy *sg_policy)
+{
+ if (!sg_policy->work_in_progress) {
+ sg_policy->work_in_progress = true;
+ irq_work_queue(&sg_policy->irq_work);
+ }
+}
+
+/**
+ * get_capacity_ref_freq - get the reference frequency that has been used to
+ * correlate frequency and compute capacity for a given cpufreq policy. We use
+ * the CPU managing it for the arch_scale_freq_ref() call in the function.
+ * @policy: the cpufreq policy of the CPU in question.
+ *
+ * Return: the reference CPU frequency to compute a capacity.
+ */
+static __always_inline
+unsigned long get_capacity_ref_freq(struct cpufreq_policy *policy)
+{
+ unsigned int freq = arch_scale_freq_ref(policy->cpu);
+
+ if (freq)
+ return freq;
+
+ if (arch_scale_freq_invariant())
+ return policy->cpuinfo.max_freq;
+
+ /*
+ * Apply a 25% margin so that we select a higher frequency than
+ * the current one before the CPU is fully busy:
+ */
+ return policy->cur + (policy->cur >> 2);
+}
+
+/**
+ * get_next_freq - Compute a new frequency for a given cpufreq policy.
+ * @sg_policy: schedutil policy object to compute the new frequency for.
+ * @util: Current CPU utilization.
+ * @max: CPU capacity.
+ *
+ * If the utilization is frequency-invariant, choose the new frequency to be
+ * proportional to it, that is
+ *
+ * next_freq = C * max_freq * util / max
+ *
+ * Otherwise, approximate the would-be frequency-invariant utilization by
+ * util_raw * (curr_freq / max_freq) which leads to
+ *
+ * next_freq = C * curr_freq * util_raw / max
+ *
+ * Take C = 1.25 for the frequency tipping point at (util / max) = 0.8.
+ *
+ * The lowest driver-supported frequency which is equal or greater than the raw
+ * next_freq (as calculated above) is returned, subject to policy min/max and
+ * cpufreq driver limitations.
+ */
+static unsigned int get_next_freq(struct sugov_policy *sg_policy,
+ unsigned long util, unsigned long max)
+{
+ struct cpufreq_policy *policy = sg_policy->policy;
+ unsigned int freq;
+
+ freq = get_capacity_ref_freq(policy);
+ freq = map_util_freq(util, freq, max);
+
+ if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update)
+ return sg_policy->next_freq;
+
+ sg_policy->cached_raw_freq = freq;
+ return cpufreq_driver_resolve_freq(policy, freq);
+}
+
+unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual,
+ unsigned long min,
+ unsigned long max)
+{
+ /* Add dvfs headroom to actual utilization */
+ actual = map_util_perf(actual);
+ /* Actually we don't need to target the max performance */
+ if (actual < max)
+ max = actual;
+
+ /*
+ * Ensure at least minimum performance while providing more compute
+ * capacity when possible.
+ */
+ return max(min, max);
+}
+
+static void sugov_get_util(struct sugov_cpu *sg_cpu, unsigned long boost)
+{
+ unsigned long min, max, util = scx_cpuperf_target(sg_cpu->cpu);
+
+ if (!scx_switched_all())
+ util += cpu_util_cfs_boost(sg_cpu->cpu);
+ util = effective_cpu_util(sg_cpu->cpu, util, &min, &max);
+ util = max(util, boost);
+ sg_cpu->bw_min = min;
+ sg_cpu->util = sugov_effective_cpu_perf(sg_cpu->cpu, util, min, max);
+}
+
+/**
+ * sugov_iowait_reset() - Reset the IO boost status of a CPU.
+ * @sg_cpu: the sugov data for the CPU to boost
+ * @time: the update time from the caller
+ * @set_iowait_boost: true if an IO boost has been requested
+ *
+ * The IO wait boost of a task is disabled after a tick since the last update
+ * of a CPU. If a new IO wait boost is requested after more then a tick, then
+ * we enable the boost starting from IOWAIT_BOOST_MIN, which improves energy
+ * efficiency by ignoring sporadic wakeups from IO.
+ */
+static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time,
+ bool set_iowait_boost)
+{
+ s64 delta_ns = time - sg_cpu->last_update;
+
+ /* Reset boost only if a tick has elapsed since last request */
+ if (delta_ns <= TICK_NSEC)
+ return false;
+
+ sg_cpu->iowait_boost = set_iowait_boost ? IOWAIT_BOOST_MIN : 0;
+ sg_cpu->iowait_boost_pending = set_iowait_boost;
+
+ return true;
+}
+
+/**
+ * sugov_iowait_boost() - Updates the IO boost status of a CPU.
+ * @sg_cpu: the sugov data for the CPU to boost
+ * @time: the update time from the caller
+ * @flags: SCHED_CPUFREQ_IOWAIT if the task is waking up after an IO wait
+ *
+ * Each time a task wakes up after an IO operation, the CPU utilization can be
+ * boosted to a certain utilization which doubles at each "frequent and
+ * successive" wakeup from IO, ranging from IOWAIT_BOOST_MIN to the utilization
+ * of the maximum OPP.
+ *
+ * To keep doubling, an IO boost has to be requested at least once per tick,
+ * otherwise we restart from the utilization of the minimum OPP.
+ */
+static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
+ unsigned int flags)
+{
+ bool set_iowait_boost = flags & SCHED_CPUFREQ_IOWAIT;
+
+ /* Reset boost if the CPU appears to have been idle enough */
+ if (sg_cpu->iowait_boost &&
+ sugov_iowait_reset(sg_cpu, time, set_iowait_boost))
+ return;
+
+ /* Boost only tasks waking up after IO */
+ if (!set_iowait_boost)
+ return;
+
+ /* Ensure boost doubles only one time at each request */
+ if (sg_cpu->iowait_boost_pending)
+ return;
+ sg_cpu->iowait_boost_pending = true;
+
+ /* Double the boost at each request */
+ if (sg_cpu->iowait_boost) {
+ sg_cpu->iowait_boost =
+ min_t(unsigned int, sg_cpu->iowait_boost << 1, SCHED_CAPACITY_SCALE);
+ return;
+ }
+
+ /* First wakeup after IO: start with minimum boost */
+ sg_cpu->iowait_boost = IOWAIT_BOOST_MIN;
+}
+
+/**
+ * sugov_iowait_apply() - Apply the IO boost to a CPU.
+ * @sg_cpu: the sugov data for the cpu to boost
+ * @time: the update time from the caller
+ * @max_cap: the max CPU capacity
+ *
+ * A CPU running a task which woken up after an IO operation can have its
+ * utilization boosted to speed up the completion of those IO operations.
+ * The IO boost value is increased each time a task wakes up from IO, in
+ * sugov_iowait_apply(), and it's instead decreased by this function,
+ * each time an increase has not been requested (!iowait_boost_pending).
+ *
+ * A CPU which also appears to have been idle for at least one tick has also
+ * its IO boost utilization reset.
+ *
+ * This mechanism is designed to boost high frequently IO waiting tasks, while
+ * being more conservative on tasks which does sporadic IO operations.
+ */
+static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
+ unsigned long max_cap)
+{
+ /* No boost currently required */
+ if (!sg_cpu->iowait_boost)
+ return 0;
+
+ /* Reset boost if the CPU appears to have been idle enough */
+ if (sugov_iowait_reset(sg_cpu, time, false))
+ return 0;
+
+ if (!sg_cpu->iowait_boost_pending) {
+ /*
+ * No boost pending; reduce the boost value.
+ */
+ sg_cpu->iowait_boost >>= 1;
+ if (sg_cpu->iowait_boost < IOWAIT_BOOST_MIN) {
+ sg_cpu->iowait_boost = 0;
+ return 0;
+ }
+ }
+
+ sg_cpu->iowait_boost_pending = false;
+
+ /*
+ * sg_cpu->util is already in capacity scale; convert iowait_boost
+ * into the same scale so we can compare.
+ */
+ return (sg_cpu->iowait_boost * max_cap) >> SCHED_CAPACITY_SHIFT;
+}
+
+#ifdef CONFIG_NO_HZ_COMMON
+static bool sugov_hold_freq(struct sugov_cpu *sg_cpu)
+{
+ unsigned long idle_calls;
+ bool ret;
+
+ /*
+ * The heuristics in this function is for the fair class. For SCX, the
+ * performance target comes directly from the BPF scheduler. Let's just
+ * follow it.
+ */
+ if (scx_switched_all())
+ return false;
+
+ /* if capped by uclamp_max, always update to be in compliance */
+ if (uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)))
+ return false;
+
+ /*
+ * Maintain the frequency if the CPU has not been idle recently, as
+ * reduction is likely to be premature.
+ */
+ idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu);
+ ret = idle_calls == sg_cpu->saved_idle_calls;
+
+ sg_cpu->saved_idle_calls = idle_calls;
+ return ret;
+}
+#else /* !CONFIG_NO_HZ_COMMON: */
+static inline bool sugov_hold_freq(struct sugov_cpu *sg_cpu) { return false; }
+#endif /* !CONFIG_NO_HZ_COMMON */
+
+/*
+ * Make sugov_should_update_freq() ignore the rate limit when DL
+ * has increased the utilization.
+ */
+static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu)
+{
+ if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_min)
+ sg_cpu->sg_policy->need_freq_update = true;
+}
+
+static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu,
+ u64 time, unsigned long max_cap,
+ unsigned int flags)
+{
+ unsigned long boost;
+
+ sugov_iowait_boost(sg_cpu, time, flags);
+ sg_cpu->last_update = time;
+
+ ignore_dl_rate_limit(sg_cpu);
+
+ if (!sugov_should_update_freq(sg_cpu->sg_policy, time))
+ return false;
+
+ boost = sugov_iowait_apply(sg_cpu, time, max_cap);
+ sugov_get_util(sg_cpu, boost);
+
+ return true;
+}
+
+static void sugov_update_single_freq(struct update_util_data *hook, u64 time,
+ unsigned int flags)
+{
+ struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
+ struct sugov_policy *sg_policy = sg_cpu->sg_policy;
+ unsigned int cached_freq = sg_policy->cached_raw_freq;
+ unsigned long max_cap;
+ unsigned int next_f;
+
+ max_cap = arch_scale_cpu_capacity(sg_cpu->cpu);
+
+ if (!sugov_update_single_common(sg_cpu, time, max_cap, flags))
+ return;
+
+ next_f = get_next_freq(sg_policy, sg_cpu->util, max_cap);
+
+ if (sugov_hold_freq(sg_cpu) && next_f < sg_policy->next_freq &&
+ !sg_policy->need_freq_update) {
+ next_f = sg_policy->next_freq;
+
+ /* Restore cached freq as next_freq has changed */
+ sg_policy->cached_raw_freq = cached_freq;
+ }
+
+ if (!sugov_update_next_freq(sg_policy, time, next_f))
+ return;
+
+ /*
+ * This code runs under rq->lock for the target CPU, so it won't run
+ * concurrently on two different CPUs for the same target and it is not
+ * necessary to acquire the lock in the fast switch case.
+ */
+ if (sg_policy->policy->fast_switch_enabled) {
+ cpufreq_driver_fast_switch(sg_policy->policy, next_f);
+ } else {
+ raw_spin_lock(&sg_policy->update_lock);
+ sugov_deferred_update(sg_policy);
+ raw_spin_unlock(&sg_policy->update_lock);
+ }
+}
+
+static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
+ unsigned int flags)
+{
+ struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
+ unsigned long prev_util = sg_cpu->util;
+ unsigned long max_cap;
+
+ /*
+ * Fall back to the "frequency" path if frequency invariance is not
+ * supported, because the direct mapping between the utilization and
+ * the performance levels depends on the frequency invariance.
+ */
+ if (!arch_scale_freq_invariant()) {
+ sugov_update_single_freq(hook, time, flags);
+ return;
+ }
+
+ max_cap = arch_scale_cpu_capacity(sg_cpu->cpu);
+
+ if (!sugov_update_single_common(sg_cpu, time, max_cap, flags))
+ return;
+
+ if (sugov_hold_freq(sg_cpu) && sg_cpu->util < prev_util)
+ sg_cpu->util = prev_util;
+
+ cpufreq_driver_adjust_perf(sg_cpu->cpu, sg_cpu->bw_min,
+ sg_cpu->util, max_cap);
+
+ sg_cpu->sg_policy->last_freq_update_time = time;
+}
+
+static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
+{
+ struct sugov_policy *sg_policy = sg_cpu->sg_policy;
+ struct cpufreq_policy *policy = sg_policy->policy;
+ unsigned long util = 0, max_cap;
+ unsigned int j;
+
+ max_cap = arch_scale_cpu_capacity(sg_cpu->cpu);
+
+ for_each_cpu(j, policy->cpus) {
+ struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
+ unsigned long boost;
+
+ boost = sugov_iowait_apply(j_sg_cpu, time, max_cap);
+ sugov_get_util(j_sg_cpu, boost);
+
+ util = max(j_sg_cpu->util, util);
+ }
+
+ return get_next_freq(sg_policy, util, max_cap);
+}
+
+static void
+sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags)
+{
+ struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
+ struct sugov_policy *sg_policy = sg_cpu->sg_policy;
+ unsigned int next_f;
+
+ raw_spin_lock(&sg_policy->update_lock);
+
+ sugov_iowait_boost(sg_cpu, time, flags);
+ sg_cpu->last_update = time;
+
+ ignore_dl_rate_limit(sg_cpu);
+
+ if (sugov_should_update_freq(sg_policy, time)) {
+ next_f = sugov_next_freq_shared(sg_cpu, time);
+
+ if (!sugov_update_next_freq(sg_policy, time, next_f))
+ goto unlock;
+
+ if (sg_policy->policy->fast_switch_enabled)
+ cpufreq_driver_fast_switch(sg_policy->policy, next_f);
+ else
+ sugov_deferred_update(sg_policy);
+ }
+unlock:
+ raw_spin_unlock(&sg_policy->update_lock);
+}
+
+static void sugov_work(struct kthread_work *work)
+{
+ struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work);
+ unsigned int freq;
+ unsigned long flags;
+
+ /*
+ * Hold sg_policy->update_lock shortly to handle the case where:
+ * in case sg_policy->next_freq is read here, and then updated by
+ * sugov_deferred_update() just before work_in_progress is set to false
+ * here, we may miss queueing the new update.
+ *
+ * Note: If a work was queued after the update_lock is released,
+ * sugov_work() will just be called again by kthread_work code; and the
+ * request will be proceed before the sugov thread sleeps.
+ */
+ raw_spin_lock_irqsave(&sg_policy->update_lock, flags);
+ freq = sg_policy->next_freq;
+ sg_policy->work_in_progress = false;
+ raw_spin_unlock_irqrestore(&sg_policy->update_lock, flags);
+
+ mutex_lock(&sg_policy->work_lock);
+ __cpufreq_driver_target(sg_policy->policy, freq, CPUFREQ_RELATION_L);
+ mutex_unlock(&sg_policy->work_lock);
+}
+
+static void sugov_irq_work(struct irq_work *irq_work)
+{
+ struct sugov_policy *sg_policy;
+
+ sg_policy = container_of(irq_work, struct sugov_policy, irq_work);
+
+ kthread_queue_work(&sg_policy->worker, &sg_policy->work);
+}
+
+/************************** sysfs interface ************************/
+
+static struct sugov_tunables *global_tunables;
+static DEFINE_MUTEX(global_tunables_lock);
+
+static inline struct sugov_tunables *to_sugov_tunables(struct gov_attr_set *attr_set)
+{
+ return container_of(attr_set, struct sugov_tunables, attr_set);
+}
+
+static ssize_t rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
+{
+ struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
+
+ return sprintf(buf, "%u\n", tunables->rate_limit_us);
+}
+
+static ssize_t
+rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count)
+{
+ struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
+ struct sugov_policy *sg_policy;
+ unsigned int rate_limit_us;
+
+ if (kstrtouint(buf, 10, &rate_limit_us))
+ return -EINVAL;
+
+ tunables->rate_limit_us = rate_limit_us;
+
+ list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook)
+ sg_policy->freq_update_delay_ns = rate_limit_us * NSEC_PER_USEC;
+
+ return count;
+}
+
+static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us);
+
+static struct attribute *sugov_attrs[] = {
+ &rate_limit_us.attr,
+ NULL
+};
+ATTRIBUTE_GROUPS(sugov);
+
+static void sugov_tunables_free(struct kobject *kobj)
+{
+ struct gov_attr_set *attr_set = to_gov_attr_set(kobj);
+
+ kfree(to_sugov_tunables(attr_set));
+}
+
+static const struct kobj_type sugov_tunables_ktype = {
+ .default_groups = sugov_groups,
+ .sysfs_ops = &governor_sysfs_ops,
+ .release = &sugov_tunables_free,
+};
+
+/********************** cpufreq governor interface *********************/
+
+static struct cpufreq_governor schedutil_gov;
+
+static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy)
+{
+ struct sugov_policy *sg_policy;
+
+ sg_policy = kzalloc(sizeof(*sg_policy), GFP_KERNEL);
+ if (!sg_policy)
+ return NULL;
+
+ sg_policy->policy = policy;
+ raw_spin_lock_init(&sg_policy->update_lock);
+ return sg_policy;
+}
+
+static void sugov_policy_free(struct sugov_policy *sg_policy)
+{
+ kfree(sg_policy);
+}
+
+static int sugov_kthread_create(struct sugov_policy *sg_policy)
+{
+ struct task_struct *thread;
+ struct sched_attr attr = {
+ .size = sizeof(struct sched_attr),
+ .sched_policy = SCHED_DEADLINE,
+ .sched_flags = SCHED_FLAG_SUGOV,
+ .sched_nice = 0,
+ .sched_priority = 0,
+ /*
+ * Fake (unused) bandwidth; workaround to "fix"
+ * priority inheritance.
+ */
+ .sched_runtime = NSEC_PER_MSEC,
+ .sched_deadline = 10 * NSEC_PER_MSEC,
+ .sched_period = 10 * NSEC_PER_MSEC,
+ };
+ struct cpufreq_policy *policy = sg_policy->policy;
+ int ret;
+
+ /* kthread only required for slow path */
+ if (policy->fast_switch_enabled)
+ return 0;
+
+ kthread_init_work(&sg_policy->work, sugov_work);
+ kthread_init_worker(&sg_policy->worker);
+ thread = kthread_create(kthread_worker_fn, &sg_policy->worker,
+ "sugov:%d",
+ cpumask_first(policy->related_cpus));
+ if (IS_ERR(thread)) {
+ pr_err("failed to create sugov thread: %ld\n", PTR_ERR(thread));
+ return PTR_ERR(thread);
+ }
+
+ ret = sched_setattr_nocheck(thread, &attr);
+ if (ret) {
+ kthread_stop(thread);
+ pr_warn("%s: failed to set SCHED_DEADLINE\n", __func__);
+ return ret;
+ }
+
+ sg_policy->thread = thread;
+ if (policy->dvfs_possible_from_any_cpu)
+ set_cpus_allowed_ptr(thread, policy->related_cpus);
+ else
+ kthread_bind_mask(thread, policy->related_cpus);
+
+ init_irq_work(&sg_policy->irq_work, sugov_irq_work);
+ mutex_init(&sg_policy->work_lock);
+
+ wake_up_process(thread);
+
+ return 0;
+}
+
+static void sugov_kthread_stop(struct sugov_policy *sg_policy)
+{
+ /* kthread only required for slow path */
+ if (sg_policy->policy->fast_switch_enabled)
+ return;
+
+ kthread_flush_worker(&sg_policy->worker);
+ kthread_stop(sg_policy->thread);
+ mutex_destroy(&sg_policy->work_lock);
+}
+
+static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy)
+{
+ struct sugov_tunables *tunables;
+
+ tunables = kzalloc(sizeof(*tunables), GFP_KERNEL);
+ if (tunables) {
+ gov_attr_set_init(&tunables->attr_set, &sg_policy->tunables_hook);
+ if (!have_governor_per_policy())
+ global_tunables = tunables;
+ }
+ return tunables;
+}
+
+static void sugov_clear_global_tunables(void)
+{
+ if (!have_governor_per_policy())
+ global_tunables = NULL;
+}
+
+static int sugov_init(struct cpufreq_policy *policy)
+{
+ struct sugov_policy *sg_policy;
+ struct sugov_tunables *tunables;
+ int ret = 0;
+
+ /* State should be equivalent to EXIT */
+ if (policy->governor_data)
+ return -EBUSY;
+
+ cpufreq_enable_fast_switch(policy);
+
+ sg_policy = sugov_policy_alloc(policy);
+ if (!sg_policy) {
+ ret = -ENOMEM;
+ goto disable_fast_switch;
+ }
+
+ ret = sugov_kthread_create(sg_policy);
+ if (ret)
+ goto free_sg_policy;
+
+ mutex_lock(&global_tunables_lock);
+
+ if (global_tunables) {
+ if (WARN_ON(have_governor_per_policy())) {
+ ret = -EINVAL;
+ goto stop_kthread;
+ }
+ policy->governor_data = sg_policy;
+ sg_policy->tunables = global_tunables;
+
+ gov_attr_set_get(&global_tunables->attr_set, &sg_policy->tunables_hook);
+ goto out;
+ }
+
+ tunables = sugov_tunables_alloc(sg_policy);
+ if (!tunables) {
+ ret = -ENOMEM;
+ goto stop_kthread;
+ }
+
+ tunables->rate_limit_us = cpufreq_policy_transition_delay_us(policy);
+
+ policy->governor_data = sg_policy;
+ sg_policy->tunables = tunables;
+
+ ret = kobject_init_and_add(&tunables->attr_set.kobj, &sugov_tunables_ktype,
+ get_governor_parent_kobj(policy), "%s",
+ schedutil_gov.name);
+ if (ret)
+ goto fail;
+
+out:
+ /*
+ * Schedutil is the preferred governor for EAS, so rebuild sched domains
+ * on governor changes to make sure the scheduler knows about them.
+ */
+ em_rebuild_sched_domains();
+ mutex_unlock(&global_tunables_lock);
+ return 0;
+
+fail:
+ kobject_put(&tunables->attr_set.kobj);
+ policy->governor_data = NULL;
+ sugov_clear_global_tunables();
+
+stop_kthread:
+ sugov_kthread_stop(sg_policy);
+ mutex_unlock(&global_tunables_lock);
+
+free_sg_policy:
+ sugov_policy_free(sg_policy);
+
+disable_fast_switch:
+ cpufreq_disable_fast_switch(policy);
+
+ pr_err("initialization failed (error %d)\n", ret);
+ return ret;
+}
+
+static void sugov_exit(struct cpufreq_policy *policy)
+{
+ struct sugov_policy *sg_policy = policy->governor_data;
+ struct sugov_tunables *tunables = sg_policy->tunables;
+ unsigned int count;
+
+ mutex_lock(&global_tunables_lock);
+
+ count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook);
+ policy->governor_data = NULL;
+ if (!count)
+ sugov_clear_global_tunables();
+
+ mutex_unlock(&global_tunables_lock);
+
+ sugov_kthread_stop(sg_policy);
+ sugov_policy_free(sg_policy);
+ cpufreq_disable_fast_switch(policy);
+
+ em_rebuild_sched_domains();
+}
+
+static int sugov_start(struct cpufreq_policy *policy)
+{
+ struct sugov_policy *sg_policy = policy->governor_data;
+ void (*uu)(struct update_util_data *data, u64 time, unsigned int flags);
+ unsigned int cpu;
+
+ sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC;
+ sg_policy->last_freq_update_time = 0;
+ sg_policy->next_freq = 0;
+ sg_policy->work_in_progress = false;
+ sg_policy->limits_changed = false;
+ sg_policy->cached_raw_freq = 0;
+
+ sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS);
+
+ if (policy_is_shared(policy))
+ uu = sugov_update_shared;
+ else if (policy->fast_switch_enabled && cpufreq_driver_has_adjust_perf())
+ uu = sugov_update_single_perf;
+ else
+ uu = sugov_update_single_freq;
+
+ for_each_cpu(cpu, policy->cpus) {
+ struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
+
+ memset(sg_cpu, 0, sizeof(*sg_cpu));
+ sg_cpu->cpu = cpu;
+ sg_cpu->sg_policy = sg_policy;
+ cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, uu);
+ }
+ return 0;
+}
+
+static void sugov_stop(struct cpufreq_policy *policy)
+{
+ struct sugov_policy *sg_policy = policy->governor_data;
+ unsigned int cpu;
+
+ for_each_cpu(cpu, policy->cpus)
+ cpufreq_remove_update_util_hook(cpu);
+
+ synchronize_rcu();
+
+ if (!policy->fast_switch_enabled) {
+ irq_work_sync(&sg_policy->irq_work);
+ kthread_cancel_work_sync(&sg_policy->work);
+ }
+}
+
+static void sugov_limits(struct cpufreq_policy *policy)
+{
+ struct sugov_policy *sg_policy = policy->governor_data;
+
+ if (!policy->fast_switch_enabled) {
+ mutex_lock(&sg_policy->work_lock);
+ cpufreq_policy_apply_limits(policy);
+ mutex_unlock(&sg_policy->work_lock);
+ }
+
+ /*
+ * The limits_changed update below must take place before the updates
+ * of policy limits in cpufreq_set_policy() or a policy limits update
+ * might be missed, so use a memory barrier to ensure it.
+ *
+ * This pairs with the memory barrier in sugov_should_update_freq().
+ */
+ smp_wmb();
+
+ WRITE_ONCE(sg_policy->limits_changed, true);
+}
+
+static struct cpufreq_governor schedutil_gov = {
+ .name = "schedutil",
+ .owner = THIS_MODULE,
+ .flags = CPUFREQ_GOV_DYNAMIC_SWITCHING,
+ .init = sugov_init,
+ .exit = sugov_exit,
+ .start = sugov_start,
+ .stop = sugov_stop,
+ .limits = sugov_limits,
+};
+
+#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
+struct cpufreq_governor *cpufreq_default_governor(void)
+{
+ return &schedutil_gov;
+}
+#endif
+
+bool sugov_is_governor(struct cpufreq_policy *policy)
+{
+ return policy->governor == &schedutil_gov;
+}
+
+cpufreq_governor_init(schedutil_gov);
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 981fcd7dc394..76a9ac5eb794 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kernel/sched/cpupri.c
*
@@ -10,51 +11,127 @@
* This code tracks the priority of each CPU so that global migration
* decisions are easy to calculate. Each CPU can be in a state as follows:
*
- * (INVALID), IDLE, NORMAL, RT1, ... RT99
+ * (INVALID), NORMAL, RT1, ... RT99, HIGHER
*
* going from the lowest priority to the highest. CPUs in the INVALID state
* are not eligible for routing. The system maintains this state with
- * a 2 dimensional bitmap (the first for priority class, the second for cpus
+ * a 2 dimensional bitmap (the first for priority class, the second for CPUs
* in that class). Therefore a typical application without affinity
* restrictions can find a suitable CPU with O(1) complexity (e.g. two bit
* searches). For tasks with affinity restrictions, the algorithm has a
- * worst case complexity of O(min(102, nr_domcpus)), though the scenario that
+ * worst case complexity of O(min(101, nr_domcpus)), though the scenario that
* yields the worst case search is fairly contrived.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
*/
+#include "sched.h"
-#include <linux/gfp.h>
-#include <linux/sched.h>
-#include <linux/sched/rt.h>
-#include <linux/slab.h>
-#include "cpupri.h"
-
-/* Convert between a 140 based task->prio, and our 102 based cpupri */
+/*
+ * p->rt_priority p->prio newpri cpupri
+ *
+ * -1 -1 (CPUPRI_INVALID)
+ *
+ * 99 0 (CPUPRI_NORMAL)
+ *
+ * 1 98 98 1
+ * ...
+ * 49 50 50 49
+ * 50 49 49 50
+ * ...
+ * 99 0 0 99
+ *
+ * 100 100 (CPUPRI_HIGHER)
+ */
static int convert_prio(int prio)
{
int cpupri;
- if (prio == CPUPRI_INVALID)
- cpupri = CPUPRI_INVALID;
- else if (prio == MAX_PRIO)
- cpupri = CPUPRI_IDLE;
- else if (prio >= MAX_RT_PRIO)
- cpupri = CPUPRI_NORMAL;
- else
- cpupri = MAX_RT_PRIO - prio + 1;
+ switch (prio) {
+ case CPUPRI_INVALID:
+ cpupri = CPUPRI_INVALID; /* -1 */
+ break;
+
+ case 0 ... 98:
+ cpupri = MAX_RT_PRIO-1 - prio; /* 1 ... 99 */
+ break;
+
+ case MAX_RT_PRIO-1:
+ cpupri = CPUPRI_NORMAL; /* 0 */
+ break;
+
+ case MAX_RT_PRIO:
+ cpupri = CPUPRI_HIGHER; /* 100 */
+ break;
+ }
return cpupri;
}
+static inline int __cpupri_find(struct cpupri *cp, struct task_struct *p,
+ struct cpumask *lowest_mask, int idx)
+{
+ struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
+ int skip = 0;
+
+ if (!atomic_read(&(vec)->count))
+ skip = 1;
+ /*
+ * When looking at the vector, we need to read the counter,
+ * do a memory barrier, then read the mask.
+ *
+ * Note: This is still all racy, but we can deal with it.
+ * Ideally, we only want to look at masks that are set.
+ *
+ * If a mask is not set, then the only thing wrong is that we
+ * did a little more work than necessary.
+ *
+ * If we read a zero count but the mask is set, because of the
+ * memory barriers, that can only happen when the highest prio
+ * task for a run queue has left the run queue, in which case,
+ * it will be followed by a pull. If the task we are processing
+ * fails to find a proper place to go, that pull request will
+ * pull this task if the run queue is running at a lower
+ * priority.
+ */
+ smp_rmb();
+
+ /* Need to do the rmb for every iteration */
+ if (skip)
+ return 0;
+
+ if (cpumask_any_and(&p->cpus_mask, vec->mask) >= nr_cpu_ids)
+ return 0;
+
+ if (lowest_mask) {
+ cpumask_and(lowest_mask, &p->cpus_mask, vec->mask);
+ cpumask_and(lowest_mask, lowest_mask, cpu_active_mask);
+
+ /*
+ * We have to ensure that we have at least one bit
+ * still set in the array, since the map could have
+ * been concurrently emptied between the first and
+ * second reads of vec->mask. If we hit this
+ * condition, simply act as though we never hit this
+ * priority level and continue on.
+ */
+ if (cpumask_empty(lowest_mask))
+ return 0;
+ }
+
+ return 1;
+}
+
+int cpupri_find(struct cpupri *cp, struct task_struct *p,
+ struct cpumask *lowest_mask)
+{
+ return cpupri_find_fitness(cp, p, lowest_mask, NULL);
+}
+
/**
- * cpupri_find - find the best (lowest-pri) CPU in the system
+ * cpupri_find_fitness - find the best (lowest-pri) CPU in the system
* @cp: The cpupri context
* @p: The task
* @lowest_mask: A mask to fill in with selected CPUs (or NULL)
+ * @fitness_fn: A pointer to a function to do custom checks whether the CPU
+ * fits a specific criteria so that we only return those CPUs.
*
* Note: This function returns the recommended CPUs as calculated during the
* current invocation. By the time the call returns, the CPUs may have in
@@ -65,73 +142,67 @@ static int convert_prio(int prio)
*
* Return: (int)bool - CPUs were found
*/
-int cpupri_find(struct cpupri *cp, struct task_struct *p,
- struct cpumask *lowest_mask)
+int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p,
+ struct cpumask *lowest_mask,
+ bool (*fitness_fn)(struct task_struct *p, int cpu))
{
- int idx = 0;
int task_pri = convert_prio(p->prio);
+ int idx, cpu;
- BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES);
+ WARN_ON_ONCE(task_pri >= CPUPRI_NR_PRIORITIES);
for (idx = 0; idx < task_pri; idx++) {
- struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
- int skip = 0;
-
- if (!atomic_read(&(vec)->count))
- skip = 1;
- /*
- * When looking at the vector, we need to read the counter,
- * do a memory barrier, then read the mask.
- *
- * Note: This is still all racey, but we can deal with it.
- * Ideally, we only want to look at masks that are set.
- *
- * If a mask is not set, then the only thing wrong is that we
- * did a little more work than necessary.
- *
- * If we read a zero count but the mask is set, because of the
- * memory barriers, that can only happen when the highest prio
- * task for a run queue has left the run queue, in which case,
- * it will be followed by a pull. If the task we are processing
- * fails to find a proper place to go, that pull request will
- * pull this task if the run queue is running at a lower
- * priority.
- */
- smp_rmb();
- /* Need to do the rmb for every iteration */
- if (skip)
+ if (!__cpupri_find(cp, p, lowest_mask, idx))
continue;
- if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
- continue;
+ if (!lowest_mask || !fitness_fn)
+ return 1;
- if (lowest_mask) {
- cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
-
- /*
- * We have to ensure that we have at least one bit
- * still set in the array, since the map could have
- * been concurrently emptied between the first and
- * second reads of vec->mask. If we hit this
- * condition, simply act as though we never hit this
- * priority level and continue on.
- */
- if (cpumask_any(lowest_mask) >= nr_cpu_ids)
- continue;
+ /* Ensure the capacity of the CPUs fit the task */
+ for_each_cpu(cpu, lowest_mask) {
+ if (!fitness_fn(p, cpu))
+ cpumask_clear_cpu(cpu, lowest_mask);
}
+ /*
+ * If no CPU at the current priority can fit the task
+ * continue looking
+ */
+ if (cpumask_empty(lowest_mask))
+ continue;
+
return 1;
}
+ /*
+ * If we failed to find a fitting lowest_mask, kick off a new search
+ * but without taking into account any fitness criteria this time.
+ *
+ * This rule favours honouring priority over fitting the task in the
+ * correct CPU (Capacity Awareness being the only user now).
+ * The idea is that if a higher priority task can run, then it should
+ * run even if this ends up being on unfitting CPU.
+ *
+ * The cost of this trade-off is not entirely clear and will probably
+ * be good for some workloads and bad for others.
+ *
+ * The main idea here is that if some CPUs were over-committed, we try
+ * to spread which is what the scheduler traditionally did. Sys admins
+ * must do proper RT planning to avoid overloading the system if they
+ * really care.
+ */
+ if (fitness_fn)
+ return cpupri_find(cp, p, lowest_mask);
+
return 0;
}
/**
- * cpupri_set - update the cpu priority setting
+ * cpupri_set - update the CPU priority setting
* @cp: The cpupri context
- * @cpu: The target cpu
- * @newpri: The priority (INVALID-RT99) to assign to this CPU
+ * @cpu: The target CPU
+ * @newpri: The priority (INVALID,NORMAL,RT1-RT99,HIGHER) to assign to this CPU
*
* Note: Assumes cpu_rq(cpu)->lock is locked
*
@@ -151,7 +222,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
return;
/*
- * If the cpu was currently mapped to a different value, we
+ * If the CPU was currently mapped to a different value, we
* need to map it to the new value then remove the old value.
* Note, we must add the new value first, otherwise we risk the
* cpu being missed by the priority loop in cpupri_find.
@@ -209,8 +280,6 @@ int cpupri_init(struct cpupri *cp)
{
int i;
- memset(cp, 0, sizeof(*cp));
-
for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
struct cpupri_vec *vec = &cp->pri_to_cpu[i];
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
index 63cbb9ca0496..6f562088c056 100644
--- a/kernel/sched/cpupri.h
+++ b/kernel/sched/cpupri.h
@@ -1,31 +1,30 @@
-#ifndef _LINUX_CPUPRI_H
-#define _LINUX_CPUPRI_H
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/atomic.h>
+#include <linux/cpumask.h>
+#include <linux/sched/rt.h>
-#include <linux/sched.h>
+#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO+1)
-#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2)
-
-#define CPUPRI_INVALID -1
-#define CPUPRI_IDLE 0
-#define CPUPRI_NORMAL 1
-/* values 2-101 are RT priorities 0-99 */
+#define CPUPRI_INVALID -1
+#define CPUPRI_NORMAL 0
+/* values 1-99 are for RT1-RT99 priorities */
+#define CPUPRI_HIGHER 100
struct cpupri_vec {
- atomic_t count;
- cpumask_var_t mask;
+ atomic_t count;
+ cpumask_var_t mask;
};
struct cpupri {
- struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
- int *cpu_to_pri;
+ struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
+ int *cpu_to_pri;
};
-#ifdef CONFIG_SMP
-int cpupri_find(struct cpupri *cp,
- struct task_struct *p, struct cpumask *lowest_mask);
+int cpupri_find(struct cpupri *cp, struct task_struct *p,
+ struct cpumask *lowest_mask);
+int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p,
+ struct cpumask *lowest_mask,
+ bool (*fitness_fn)(struct task_struct *p, int cpu));
void cpupri_set(struct cpupri *cp, int cpu, int pri);
-int cpupri_init(struct cpupri *cp);
+int cpupri_init(struct cpupri *cp);
void cpupri_cleanup(struct cpupri *cp);
-#endif
-
-#endif /* _LINUX_CPUPRI_H */
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 8394b1ee600c..4f97896887ec 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -1,11 +1,14 @@
-#include <linux/export.h>
-#include <linux/sched.h>
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Simple CPU accounting cgroup controller
+ */
+#include <linux/sched/cputime.h>
#include <linux/tsacct_kern.h>
-#include <linux/kernel_stat.h>
-#include <linux/static_key.h>
-#include <linux/context_tracking.h>
#include "sched.h"
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+ #include <asm/cputime.h>
+#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -14,17 +17,15 @@
* They are only modified in vtime_account, on corresponding CPU
* with interrupts disabled. So, writes are safe.
* They are read and saved off onto struct rq in update_rq_clock().
- * This may result in other CPU reading this CPU's irq time and can
+ * This may result in other CPU reading this CPU's IRQ time and can
* race with irq/vtime_account on this CPU. We would either get old
- * or new value with a side effect of accounting a slice of irq time to wrong
- * task when irq is in progress while we read rq->clock. That is a worthy
- * compromise in place of having locks on each irq in account_system_time.
+ * or new value with a side effect of accounting a slice of IRQ time to wrong
+ * task when IRQ is in progress while we read rq->clock. That is a worthy
+ * compromise in place of having locks on each IRQ in account_system_time.
*/
-DEFINE_PER_CPU(u64, cpu_hardirq_time);
-DEFINE_PER_CPU(u64, cpu_softirq_time);
+DEFINE_PER_CPU(struct irqtime, cpu_irqtime);
-static DEFINE_PER_CPU(u64, irq_start_time);
-static int sched_clock_irqtime;
+int sched_clock_irqtime;
void enable_sched_clock_irqtime(void)
{
@@ -36,79 +37,66 @@ void disable_sched_clock_irqtime(void)
sched_clock_irqtime = 0;
}
-#ifndef CONFIG_64BIT
-DEFINE_PER_CPU(seqcount_t, irq_time_seq);
-#endif /* CONFIG_64BIT */
+static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
+ enum cpu_usage_stat idx)
+{
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
+
+ u64_stats_update_begin(&irqtime->sync);
+ cpustat[idx] += delta;
+ irqtime->total += delta;
+ irqtime->tick_delta += delta;
+ u64_stats_update_end(&irqtime->sync);
+}
/*
- * Called before incrementing preempt_count on {soft,}irq_enter
+ * Called after incrementing preempt_count on {soft,}irq_enter
* and before decrementing preempt_count on {soft,}irq_exit.
*/
-void irqtime_account_irq(struct task_struct *curr)
+void irqtime_account_irq(struct task_struct *curr, unsigned int offset)
{
- unsigned long flags;
+ struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
+ unsigned int pc;
s64 delta;
int cpu;
- if (!sched_clock_irqtime)
+ if (!irqtime_enabled())
return;
- local_irq_save(flags);
-
cpu = smp_processor_id();
- delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
- __this_cpu_add(irq_start_time, delta);
+ delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
+ irqtime->irq_start_time += delta;
+ pc = irq_count() - offset;
- irq_time_write_begin();
/*
* We do not account for softirq time from ksoftirqd here.
* We want to continue accounting softirq time to ksoftirqd thread
* in that case, so as not to confuse scheduler with a special task
* that do not consume any time, but still wants to run.
*/
- if (hardirq_count())
- __this_cpu_add(cpu_hardirq_time, delta);
- else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
- __this_cpu_add(cpu_softirq_time, delta);
-
- irq_time_write_end();
- local_irq_restore(flags);
+ if (pc & HARDIRQ_MASK)
+ irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
+ else if ((pc & SOFTIRQ_OFFSET) && curr != this_cpu_ksoftirqd())
+ irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
}
-EXPORT_SYMBOL_GPL(irqtime_account_irq);
-static int irqtime_account_hi_update(void)
+static u64 irqtime_tick_accounted(u64 maxtime)
{
- u64 *cpustat = kcpustat_this_cpu->cpustat;
- unsigned long flags;
- u64 latest_ns;
- int ret = 0;
-
- local_irq_save(flags);
- latest_ns = this_cpu_read(cpu_hardirq_time);
- if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
- ret = 1;
- local_irq_restore(flags);
- return ret;
-}
+ struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
+ u64 delta;
-static int irqtime_account_si_update(void)
-{
- u64 *cpustat = kcpustat_this_cpu->cpustat;
- unsigned long flags;
- u64 latest_ns;
- int ret = 0;
+ delta = min(irqtime->tick_delta, maxtime);
+ irqtime->tick_delta -= delta;
- local_irq_save(flags);
- latest_ns = this_cpu_read(cpu_softirq_time);
- if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
- ret = 1;
- local_irq_restore(flags);
- return ret;
+ return delta;
}
-#else /* CONFIG_IRQ_TIME_ACCOUNTING */
+#else /* !CONFIG_IRQ_TIME_ACCOUNTING: */
-#define sched_clock_irqtime (0)
+static u64 irqtime_tick_accounted(u64 dummy)
+{
+ return 0;
+}
#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
@@ -123,98 +111,87 @@ static inline void task_group_account_field(struct task_struct *p, int index,
*/
__this_cpu_add(kernel_cpustat.cpustat[index], tmp);
- cpuacct_account_field(p, index, tmp);
+ cgroup_account_cputime_field(p, index, tmp);
}
/*
- * Account user cpu time to a process.
- * @p: the process that the cpu time gets accounted to
- * @cputime: the cpu time spent in user space since the last update
- * @cputime_scaled: cputime scaled by cpu frequency
+ * Account user CPU time to a process.
+ * @p: the process that the CPU time gets accounted to
+ * @cputime: the CPU time spent in user space since the last update
*/
-void account_user_time(struct task_struct *p, cputime_t cputime,
- cputime_t cputime_scaled)
+void account_user_time(struct task_struct *p, u64 cputime)
{
int index;
/* Add user time to process. */
p->utime += cputime;
- p->utimescaled += cputime_scaled;
account_group_user_time(p, cputime);
index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
/* Add user time to cpustat. */
- task_group_account_field(p, index, (__force u64) cputime);
+ task_group_account_field(p, index, cputime);
/* Account for user time used */
acct_account_cputime(p);
}
/*
- * Account guest cpu time to a process.
- * @p: the process that the cpu time gets accounted to
- * @cputime: the cpu time spent in virtual machine since the last update
- * @cputime_scaled: cputime scaled by cpu frequency
+ * Account guest CPU time to a process.
+ * @p: the process that the CPU time gets accounted to
+ * @cputime: the CPU time spent in virtual machine since the last update
*/
-static void account_guest_time(struct task_struct *p, cputime_t cputime,
- cputime_t cputime_scaled)
+void account_guest_time(struct task_struct *p, u64 cputime)
{
u64 *cpustat = kcpustat_this_cpu->cpustat;
/* Add guest time to process. */
p->utime += cputime;
- p->utimescaled += cputime_scaled;
account_group_user_time(p, cputime);
p->gtime += cputime;
/* Add guest time to cpustat. */
if (task_nice(p) > 0) {
- cpustat[CPUTIME_NICE] += (__force u64) cputime;
- cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
+ task_group_account_field(p, CPUTIME_NICE, cputime);
+ cpustat[CPUTIME_GUEST_NICE] += cputime;
} else {
- cpustat[CPUTIME_USER] += (__force u64) cputime;
- cpustat[CPUTIME_GUEST] += (__force u64) cputime;
+ task_group_account_field(p, CPUTIME_USER, cputime);
+ cpustat[CPUTIME_GUEST] += cputime;
}
}
/*
- * Account system cpu time to a process and desired cpustat field
- * @p: the process that the cpu time gets accounted to
- * @cputime: the cpu time spent in kernel space since the last update
- * @cputime_scaled: cputime scaled by cpu frequency
- * @target_cputime64: pointer to cpustat field that has to be updated
+ * Account system CPU time to a process and desired cpustat field
+ * @p: the process that the CPU time gets accounted to
+ * @cputime: the CPU time spent in kernel space since the last update
+ * @index: pointer to cpustat field that has to be updated
*/
-static inline
-void __account_system_time(struct task_struct *p, cputime_t cputime,
- cputime_t cputime_scaled, int index)
+void account_system_index_time(struct task_struct *p,
+ u64 cputime, enum cpu_usage_stat index)
{
/* Add system time to process. */
p->stime += cputime;
- p->stimescaled += cputime_scaled;
account_group_system_time(p, cputime);
/* Add system time to cpustat. */
- task_group_account_field(p, index, (__force u64) cputime);
+ task_group_account_field(p, index, cputime);
/* Account for system time used */
acct_account_cputime(p);
}
/*
- * Account system cpu time to a process.
- * @p: the process that the cpu time gets accounted to
+ * Account system CPU time to a process.
+ * @p: the process that the CPU time gets accounted to
* @hardirq_offset: the offset to subtract from hardirq_count()
- * @cputime: the cpu time spent in kernel space since the last update
- * @cputime_scaled: cputime scaled by cpu frequency
+ * @cputime: the CPU time spent in kernel space since the last update
*/
-void account_system_time(struct task_struct *p, int hardirq_offset,
- cputime_t cputime, cputime_t cputime_scaled)
+void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime)
{
int index;
if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
- account_guest_time(p, cputime, cputime_scaled);
+ account_guest_time(p, cputime);
return;
}
@@ -225,99 +202,150 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
else
index = CPUTIME_SYSTEM;
- __account_system_time(p, cputime, cputime_scaled, index);
+ account_system_index_time(p, cputime, index);
}
/*
* Account for involuntary wait time.
- * @cputime: the cpu time spent in involuntary wait
+ * @cputime: the CPU time spent in involuntary wait
*/
-void account_steal_time(cputime_t cputime)
+void account_steal_time(u64 cputime)
{
u64 *cpustat = kcpustat_this_cpu->cpustat;
- cpustat[CPUTIME_STEAL] += (__force u64) cputime;
+ cpustat[CPUTIME_STEAL] += cputime;
}
/*
* Account for idle time.
- * @cputime: the cpu time spent in idle wait
+ * @cputime: the CPU time spent in idle wait
*/
-void account_idle_time(cputime_t cputime)
+void account_idle_time(u64 cputime)
{
u64 *cpustat = kcpustat_this_cpu->cpustat;
struct rq *rq = this_rq();
if (atomic_read(&rq->nr_iowait) > 0)
- cpustat[CPUTIME_IOWAIT] += (__force u64) cputime;
+ cpustat[CPUTIME_IOWAIT] += cputime;
else
- cpustat[CPUTIME_IDLE] += (__force u64) cputime;
+ cpustat[CPUTIME_IDLE] += cputime;
}
-static __always_inline bool steal_account_process_tick(void)
+
+#ifdef CONFIG_SCHED_CORE
+/*
+ * Account for forceidle time due to core scheduling.
+ *
+ * REQUIRES: schedstat is enabled.
+ */
+void __account_forceidle_time(struct task_struct *p, u64 delta)
+{
+ __schedstat_add(p->stats.core_forceidle_sum, delta);
+
+ task_group_account_field(p, CPUTIME_FORCEIDLE, delta);
+}
+#endif /* CONFIG_SCHED_CORE */
+
+/*
+ * When a guest is interrupted for a longer amount of time, missed clock
+ * ticks are not redelivered later. Due to that, this function may on
+ * occasion account more time than the calling functions think elapsed.
+ */
+static __always_inline u64 steal_account_process_time(u64 maxtime)
{
#ifdef CONFIG_PARAVIRT
if (static_key_false(&paravirt_steal_enabled)) {
u64 steal;
- cputime_t steal_ct;
steal = paravirt_steal_clock(smp_processor_id());
steal -= this_rq()->prev_steal_time;
+ steal = min(steal, maxtime);
+ account_steal_time(steal);
+ this_rq()->prev_steal_time += steal;
- /*
- * cputime_t may be less precise than nsecs (eg: if it's
- * based on jiffies). Lets cast the result to cputime
- * granularity and account the rest on the next rounds.
- */
- steal_ct = nsecs_to_cputime(steal);
- this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct);
-
- account_steal_time(steal_ct);
- return steal_ct;
+ return steal;
}
-#endif
- return false;
+#endif /* CONFIG_PARAVIRT */
+ return 0;
}
/*
+ * Account how much elapsed time was spent in steal, IRQ, or softirq time.
+ */
+static inline u64 account_other_time(u64 max)
+{
+ u64 accounted;
+
+ lockdep_assert_irqs_disabled();
+
+ accounted = steal_account_process_time(max);
+
+ if (accounted < max)
+ accounted += irqtime_tick_accounted(max - accounted);
+
+ return accounted;
+}
+
+#ifdef CONFIG_64BIT
+static inline u64 read_sum_exec_runtime(struct task_struct *t)
+{
+ return t->se.sum_exec_runtime;
+}
+#else /* !CONFIG_64BIT: */
+static u64 read_sum_exec_runtime(struct task_struct *t)
+{
+ u64 ns;
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = task_rq_lock(t, &rf);
+ ns = t->se.sum_exec_runtime;
+ task_rq_unlock(rq, t, &rf);
+
+ return ns;
+}
+#endif /* !CONFIG_64BIT */
+
+/*
* Accumulate raw cputime values of dead tasks (sig->[us]time) and live
* tasks (sum on group iteration) belonging to @tsk's group.
*/
void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
{
struct signal_struct *sig = tsk->signal;
- cputime_t utime, stime;
struct task_struct *t;
- unsigned int seq, nextseq;
- unsigned long flags;
+ u64 utime, stime;
- rcu_read_lock();
- /* Attempt a lockless read on the first round. */
- nextseq = 0;
- do {
- seq = nextseq;
- flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
+ /*
+ * Update current task runtime to account pending time since last
+ * scheduler action or thread_group_cputime() call. This thread group
+ * might have other running tasks on different CPUs, but updating
+ * their runtime can affect syscall performance, so we skip account
+ * those pending times and rely only on values updated on tick or
+ * other scheduler action.
+ */
+ if (same_thread_group(current, tsk))
+ (void) task_sched_runtime(current);
+
+ guard(rcu)();
+ scoped_seqlock_read (&sig->stats_lock, ss_lock_irqsave) {
times->utime = sig->utime;
times->stime = sig->stime;
times->sum_exec_runtime = sig->sum_sched_runtime;
- for_each_thread(tsk, t) {
+ __for_each_thread(sig, t) {
task_cputime(t, &utime, &stime);
times->utime += utime;
times->stime += stime;
- times->sum_exec_runtime += task_sched_runtime(t);
+ times->sum_exec_runtime += read_sum_exec_runtime(t);
}
- /* If lockless access failed, take the lock. */
- nextseq = 1;
- } while (need_seqretry(&sig->stats_lock, seq));
- done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
- rcu_read_unlock();
+ }
}
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
/*
* Account a tick to a process and cpustat
- * @p: the process that the cpu time gets accounted to
+ * @p: the process that the CPU time gets accounted to
* @user_tick: is the tick from userspace
* @rq: the pointer to rq
*
@@ -333,119 +361,91 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
* Check for hardirq is done both for system and user time as there is
* no timer going off while we are on hardirq and hence we may never get an
* opportunity to update it solely in system time.
- * p->stime and friends are only updated on system time and not on irq
+ * p->stime and friends are only updated on system time and not on IRQ
* softirq as those do not count in task exec_runtime any more.
*/
static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
- struct rq *rq, int ticks)
+ int ticks)
{
- cputime_t scaled = cputime_to_scaled(cputime_one_jiffy);
- u64 cputime = (__force u64) cputime_one_jiffy;
- u64 *cpustat = kcpustat_this_cpu->cpustat;
+ u64 other, cputime = TICK_NSEC * ticks;
- if (steal_account_process_tick())
+ /*
+ * When returning from idle, many ticks can get accounted at
+ * once, including some ticks of steal, IRQ, and softirq time.
+ * Subtract those ticks from the amount of time accounted to
+ * idle, or potentially user or system time. Due to rounding,
+ * other time can exceed ticks occasionally.
+ */
+ other = account_other_time(ULONG_MAX);
+ if (other >= cputime)
return;
- cputime *= ticks;
- scaled *= ticks;
+ cputime -= other;
- if (irqtime_account_hi_update()) {
- cpustat[CPUTIME_IRQ] += cputime;
- } else if (irqtime_account_si_update()) {
- cpustat[CPUTIME_SOFTIRQ] += cputime;
- } else if (this_cpu_ksoftirqd() == p) {
+ if (this_cpu_ksoftirqd() == p) {
/*
* ksoftirqd time do not get accounted in cpu_softirq_time.
* So, we have to handle it separately here.
* Also, p->stime needs to be updated for ksoftirqd.
*/
- __account_system_time(p, cputime, scaled, CPUTIME_SOFTIRQ);
+ account_system_index_time(p, cputime, CPUTIME_SOFTIRQ);
} else if (user_tick) {
- account_user_time(p, cputime, scaled);
- } else if (p == rq->idle) {
+ account_user_time(p, cputime);
+ } else if (p == this_rq()->idle) {
account_idle_time(cputime);
} else if (p->flags & PF_VCPU) { /* System time or guest time */
- account_guest_time(p, cputime, scaled);
+ account_guest_time(p, cputime);
} else {
- __account_system_time(p, cputime, scaled, CPUTIME_SYSTEM);
+ account_system_index_time(p, cputime, CPUTIME_SYSTEM);
}
}
static void irqtime_account_idle_ticks(int ticks)
{
- struct rq *rq = this_rq();
-
- irqtime_account_process_tick(current, 0, rq, ticks);
+ irqtime_account_process_tick(current, 0, ticks);
}
-#else /* CONFIG_IRQ_TIME_ACCOUNTING */
-static inline void irqtime_account_idle_ticks(int ticks) {}
+#else /* !CONFIG_IRQ_TIME_ACCOUNTING: */
+static inline void irqtime_account_idle_ticks(int ticks) { }
static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
- struct rq *rq, int nr_ticks) {}
-#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+ int nr_ticks) { }
+#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
/*
* Use precise platform statistics if available:
*/
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
-
-#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
-void vtime_common_task_switch(struct task_struct *prev)
-{
- if (is_idle_task(prev))
- vtime_account_idle(prev);
- else
- vtime_account_system(prev);
-
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
- vtime_account_user(prev);
-#endif
- arch_vtime_task_switch(prev);
-}
-#endif
-/*
- * Archs that account the whole time spent in the idle task
- * (outside irq) as idle time can rely on this and just implement
- * vtime_account_system() and vtime_account_idle(). Archs that
- * have other meaning of the idle time (s390 only includes the
- * time spent by the CPU when it's in low power mode) must override
- * vtime_account().
- */
-#ifndef __ARCH_HAS_VTIME_ACCOUNT
-void vtime_common_account_irq_enter(struct task_struct *tsk)
+void vtime_account_irq(struct task_struct *tsk, unsigned int offset)
{
- if (!in_interrupt()) {
- /*
- * If we interrupted user, context_tracking_in_user()
- * is 1 because the context tracking don't hook
- * on irq entry/exit. This way we know if
- * we need to flush user time on kernel entry.
- */
- if (context_tracking_in_user()) {
- vtime_account_user(tsk);
- return;
- }
-
- if (is_idle_task(tsk)) {
- vtime_account_idle(tsk);
- return;
- }
+ unsigned int pc = irq_count() - offset;
+
+ if (pc & HARDIRQ_OFFSET) {
+ vtime_account_hardirq(tsk);
+ } else if (pc & SOFTIRQ_OFFSET) {
+ vtime_account_softirq(tsk);
+ } else if (!IS_ENABLED(CONFIG_HAVE_VIRT_CPU_ACCOUNTING_IDLE) &&
+ is_idle_task(tsk)) {
+ vtime_account_idle(tsk);
+ } else {
+ vtime_account_kernel(tsk);
}
- vtime_account_system(tsk);
}
-EXPORT_SYMBOL_GPL(vtime_common_account_irq_enter);
-#endif /* __ARCH_HAS_VTIME_ACCOUNT */
-#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
+void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
+ u64 *ut, u64 *st)
+{
+ *ut = curr->utime;
+ *st = curr->stime;
+}
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
+void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
{
*ut = p->utime;
*st = p->stime;
}
+EXPORT_SYMBOL_GPL(task_cputime_adjusted);
-void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
+void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
{
struct task_cputime cputime;
@@ -454,45 +454,40 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime
*ut = cputime.utime;
*st = cputime.stime;
}
-#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
+
+#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE: */
+
/*
- * Account a single tick of cpu time.
- * @p: the process that the cpu time gets accounted to
+ * Account a single tick of CPU time.
+ * @p: the process that the CPU time gets accounted to
* @user_tick: indicates if the tick is a user or a system tick
*/
void account_process_tick(struct task_struct *p, int user_tick)
{
- cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
- struct rq *rq = this_rq();
+ u64 cputime, steal;
- if (vtime_accounting_enabled())
+ if (vtime_accounting_enabled_this_cpu())
return;
- if (sched_clock_irqtime) {
- irqtime_account_process_tick(p, user_tick, rq, 1);
+ if (irqtime_enabled()) {
+ irqtime_account_process_tick(p, user_tick, 1);
return;
}
- if (steal_account_process_tick())
+ cputime = TICK_NSEC;
+ steal = steal_account_process_time(ULONG_MAX);
+
+ if (steal >= cputime)
return;
+ cputime -= steal;
+
if (user_tick)
- account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
- else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
- account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
- one_jiffy_scaled);
+ account_user_time(p, cputime);
+ else if ((p != this_rq()->idle) || (irq_count() != HARDIRQ_OFFSET))
+ account_system_time(p, HARDIRQ_OFFSET, cputime);
else
- account_idle_time(cputime_one_jiffy);
-}
-
-/*
- * Account multiple ticks of steal time.
- * @p: the process from which the cpu time has been stolen
- * @ticks: number of stolen ticks
- */
-void account_steal_ticks(unsigned long ticks)
-{
- account_steal_time(jiffies_to_cputime(ticks));
+ account_idle_time(cputime);
}
/*
@@ -501,102 +496,60 @@ void account_steal_ticks(unsigned long ticks)
*/
void account_idle_ticks(unsigned long ticks)
{
+ u64 cputime, steal;
- if (sched_clock_irqtime) {
+ if (irqtime_enabled()) {
irqtime_account_idle_ticks(ticks);
return;
}
- account_idle_time(jiffies_to_cputime(ticks));
-}
-
-/*
- * Perform (stime * rtime) / total, but avoid multiplication overflow by
- * loosing precision when the numbers are big.
- */
-static cputime_t scale_stime(u64 stime, u64 rtime, u64 total)
-{
- u64 scaled;
-
- for (;;) {
- /* Make sure "rtime" is the bigger of stime/rtime */
- if (stime > rtime)
- swap(rtime, stime);
-
- /* Make sure 'total' fits in 32 bits */
- if (total >> 32)
- goto drop_precision;
+ cputime = ticks * TICK_NSEC;
+ steal = steal_account_process_time(ULONG_MAX);
- /* Does rtime (and thus stime) fit in 32 bits? */
- if (!(rtime >> 32))
- break;
-
- /* Can we just balance rtime/stime rather than dropping bits? */
- if (stime >> 31)
- goto drop_precision;
-
- /* We can grow stime and shrink rtime and try to make them both fit */
- stime <<= 1;
- rtime >>= 1;
- continue;
-
-drop_precision:
- /* We drop from rtime, it has more bits than stime */
- rtime >>= 1;
- total >>= 1;
- }
+ if (steal >= cputime)
+ return;
- /*
- * Make sure gcc understands that this is a 32x32->64 multiply,
- * followed by a 64/32->64 divide.
- */
- scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
- return (__force cputime_t) scaled;
+ cputime -= steal;
+ account_idle_time(cputime);
}
/*
- * Atomically advance counter to the new value. Interrupts, vcpu
- * scheduling, and scaling inaccuracies can cause cputime_advance
- * to be occasionally called with a new value smaller than counter.
- * Let's enforce atomicity.
+ * Adjust tick based cputime random precision against scheduler runtime
+ * accounting.
*
- * Normally a caller will only go through this loop once, or not
- * at all in case a previous caller updated counter the same jiffy.
+ * Tick based cputime accounting depend on random scheduling timeslices of a
+ * task to be interrupted or not by the timer. Depending on these
+ * circumstances, the number of these interrupts may be over or
+ * under-optimistic, matching the real user and system cputime with a variable
+ * precision.
+ *
+ * Fix this by scaling these tick based values against the total runtime
+ * accounted by the CFS scheduler.
+ *
+ * This code provides the following guarantees:
+ *
+ * stime + utime == rtime
+ * stime_i+1 >= stime_i, utime_i+1 >= utime_i
+ *
+ * Assuming that rtime_i+1 >= rtime_i.
*/
-static void cputime_advance(cputime_t *counter, cputime_t new)
+void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
+ u64 *ut, u64 *st)
{
- cputime_t old;
-
- while (new > (old = ACCESS_ONCE(*counter)))
- cmpxchg_cputime(counter, old, new);
-}
+ u64 rtime, stime, utime;
+ unsigned long flags;
-/*
- * Adjust tick based cputime random precision against scheduler
- * runtime accounting.
- */
-static void cputime_adjust(struct task_cputime *curr,
- struct cputime *prev,
- cputime_t *ut, cputime_t *st)
-{
- cputime_t rtime, stime, utime;
+ /* Serialize concurrent callers such that we can honour our guarantees */
+ raw_spin_lock_irqsave(&prev->lock, flags);
+ rtime = curr->sum_exec_runtime;
/*
- * Tick based cputime accounting depend on random scheduling
- * timeslices of a task to be interrupted or not by the timer.
- * Depending on these circumstances, the number of these interrupts
- * may be over or under-optimistic, matching the real user and system
- * cputime with a variable precision.
+ * This is possible under two circumstances:
+ * - rtime isn't monotonic after all (a bug);
+ * - we got reordered by the lock.
*
- * Fix this by scaling these tick based values against the total
- * runtime accounted by the CFS scheduler.
- */
- rtime = nsecs_to_cputime(curr->sum_exec_runtime);
-
- /*
- * Update userspace visible utime/stime values only if actual execution
- * time is bigger than already exported. Note that can happen, that we
- * provided bigger values due to scaling inaccuracy on big numbers.
+ * In both cases this acts as a filter such that the rest of the code
+ * can assume it is monotonic regardless of anything else.
*/
if (prev->stime + prev->utime >= rtime)
goto out;
@@ -604,37 +557,73 @@ static void cputime_adjust(struct task_cputime *curr,
stime = curr->stime;
utime = curr->utime;
- if (utime == 0) {
- stime = rtime;
- } else if (stime == 0) {
+ /*
+ * If either stime or utime are 0, assume all runtime is userspace.
+ * Once a task gets some ticks, the monotonicity code at 'update:'
+ * will ensure things converge to the observed ratio.
+ */
+ if (stime == 0) {
utime = rtime;
- } else {
- cputime_t total = stime + utime;
+ goto update;
+ }
- stime = scale_stime((__force u64)stime,
- (__force u64)rtime, (__force u64)total);
- utime = rtime - stime;
+ if (utime == 0) {
+ stime = rtime;
+ goto update;
}
- cputime_advance(&prev->stime, stime);
- cputime_advance(&prev->utime, utime);
+ stime = mul_u64_u64_div_u64(stime, rtime, stime + utime);
+ /*
+ * Because mul_u64_u64_div_u64() can approximate on some
+ * achitectures; enforce the constraint that: a*b/(b+c) <= a.
+ */
+ if (unlikely(stime > rtime))
+ stime = rtime;
+
+update:
+ /*
+ * Make sure stime doesn't go backwards; this preserves monotonicity
+ * for utime because rtime is monotonic.
+ *
+ * utime_i+1 = rtime_i+1 - stime_i
+ * = rtime_i+1 - (rtime_i - utime_i)
+ * = (rtime_i+1 - rtime_i) + utime_i
+ * >= utime_i
+ */
+ if (stime < prev->stime)
+ stime = prev->stime;
+ utime = rtime - stime;
+
+ /*
+ * Make sure utime doesn't go backwards; this still preserves
+ * monotonicity for stime, analogous argument to above.
+ */
+ if (utime < prev->utime) {
+ utime = prev->utime;
+ stime = rtime - utime;
+ }
+ prev->stime = stime;
+ prev->utime = utime;
out:
*ut = prev->utime;
*st = prev->stime;
+ raw_spin_unlock_irqrestore(&prev->lock, flags);
}
-void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
+void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
{
struct task_cputime cputime = {
.sum_exec_runtime = p->se.sum_exec_runtime,
};
- task_cputime(p, &cputime.utime, &cputime.stime);
+ if (task_cputime(p, &cputime.utime, &cputime.stime))
+ cputime.sum_exec_runtime = task_sched_runtime(p);
cputime_adjust(&cputime, &p->prev_cputime, ut, st);
}
+EXPORT_SYMBOL_GPL(task_cputime_adjusted);
-void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
+void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
{
struct task_cputime cputime;
@@ -644,137 +633,195 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime
#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
-static unsigned long long vtime_delta(struct task_struct *tsk)
+static u64 vtime_delta(struct vtime *vtime)
{
unsigned long long clock;
- clock = local_clock();
- if (clock < tsk->vtime_snap)
+ clock = sched_clock();
+ if (clock < vtime->starttime)
return 0;
- return clock - tsk->vtime_snap;
+ return clock - vtime->starttime;
}
-static cputime_t get_vtime_delta(struct task_struct *tsk)
+static u64 get_vtime_delta(struct vtime *vtime)
{
- unsigned long long delta = vtime_delta(tsk);
+ u64 delta = vtime_delta(vtime);
+ u64 other;
- WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_SLEEPING);
- tsk->vtime_snap += delta;
+ /*
+ * Unlike tick based timing, vtime based timing never has lost
+ * ticks, and no need for steal time accounting to make up for
+ * lost ticks. Vtime accounts a rounded version of actual
+ * elapsed time. Limit account_other_time to prevent rounding
+ * errors from causing elapsed vtime to go negative.
+ */
+ other = account_other_time(delta);
+ WARN_ON_ONCE(vtime->state == VTIME_INACTIVE);
+ vtime->starttime += delta;
- /* CHECKME: always safe to convert nsecs to cputime? */
- return nsecs_to_cputime(delta);
+ return delta - other;
}
-static void __vtime_account_system(struct task_struct *tsk)
+static void vtime_account_system(struct task_struct *tsk,
+ struct vtime *vtime)
{
- cputime_t delta_cpu = get_vtime_delta(tsk);
-
- account_system_time(tsk, irq_count(), delta_cpu, cputime_to_scaled(delta_cpu));
+ vtime->stime += get_vtime_delta(vtime);
+ if (vtime->stime >= TICK_NSEC) {
+ account_system_time(tsk, irq_count(), vtime->stime);
+ vtime->stime = 0;
+ }
}
-void vtime_account_system(struct task_struct *tsk)
+static void vtime_account_guest(struct task_struct *tsk,
+ struct vtime *vtime)
{
- write_seqlock(&tsk->vtime_seqlock);
- __vtime_account_system(tsk);
- write_sequnlock(&tsk->vtime_seqlock);
+ vtime->gtime += get_vtime_delta(vtime);
+ if (vtime->gtime >= TICK_NSEC) {
+ account_guest_time(tsk, vtime->gtime);
+ vtime->gtime = 0;
+ }
}
-void vtime_gen_account_irq_exit(struct task_struct *tsk)
+static void __vtime_account_kernel(struct task_struct *tsk,
+ struct vtime *vtime)
{
- write_seqlock(&tsk->vtime_seqlock);
- __vtime_account_system(tsk);
- if (context_tracking_in_user())
- tsk->vtime_snap_whence = VTIME_USER;
- write_sequnlock(&tsk->vtime_seqlock);
+ /* We might have scheduled out from guest path */
+ if (vtime->state == VTIME_GUEST)
+ vtime_account_guest(tsk, vtime);
+ else
+ vtime_account_system(tsk, vtime);
}
-void vtime_account_user(struct task_struct *tsk)
+void vtime_account_kernel(struct task_struct *tsk)
{
- cputime_t delta_cpu;
+ struct vtime *vtime = &tsk->vtime;
- write_seqlock(&tsk->vtime_seqlock);
- delta_cpu = get_vtime_delta(tsk);
- tsk->vtime_snap_whence = VTIME_SYS;
- account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
- write_sequnlock(&tsk->vtime_seqlock);
+ if (!vtime_delta(vtime))
+ return;
+
+ write_seqcount_begin(&vtime->seqcount);
+ __vtime_account_kernel(tsk, vtime);
+ write_seqcount_end(&vtime->seqcount);
}
void vtime_user_enter(struct task_struct *tsk)
{
- write_seqlock(&tsk->vtime_seqlock);
- __vtime_account_system(tsk);
- tsk->vtime_snap_whence = VTIME_USER;
- write_sequnlock(&tsk->vtime_seqlock);
+ struct vtime *vtime = &tsk->vtime;
+
+ write_seqcount_begin(&vtime->seqcount);
+ vtime_account_system(tsk, vtime);
+ vtime->state = VTIME_USER;
+ write_seqcount_end(&vtime->seqcount);
+}
+
+void vtime_user_exit(struct task_struct *tsk)
+{
+ struct vtime *vtime = &tsk->vtime;
+
+ write_seqcount_begin(&vtime->seqcount);
+ vtime->utime += get_vtime_delta(vtime);
+ if (vtime->utime >= TICK_NSEC) {
+ account_user_time(tsk, vtime->utime);
+ vtime->utime = 0;
+ }
+ vtime->state = VTIME_SYS;
+ write_seqcount_end(&vtime->seqcount);
}
void vtime_guest_enter(struct task_struct *tsk)
{
+ struct vtime *vtime = &tsk->vtime;
/*
* The flags must be updated under the lock with
- * the vtime_snap flush and update.
+ * the vtime_starttime flush and update.
* That enforces a right ordering and update sequence
* synchronization against the reader (task_gtime())
* that can thus safely catch up with a tickless delta.
*/
- write_seqlock(&tsk->vtime_seqlock);
- __vtime_account_system(tsk);
- current->flags |= PF_VCPU;
- write_sequnlock(&tsk->vtime_seqlock);
+ write_seqcount_begin(&vtime->seqcount);
+ vtime_account_system(tsk, vtime);
+ tsk->flags |= PF_VCPU;
+ vtime->state = VTIME_GUEST;
+ write_seqcount_end(&vtime->seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_enter);
void vtime_guest_exit(struct task_struct *tsk)
{
- write_seqlock(&tsk->vtime_seqlock);
- __vtime_account_system(tsk);
- current->flags &= ~PF_VCPU;
- write_sequnlock(&tsk->vtime_seqlock);
+ struct vtime *vtime = &tsk->vtime;
+
+ write_seqcount_begin(&vtime->seqcount);
+ vtime_account_guest(tsk, vtime);
+ tsk->flags &= ~PF_VCPU;
+ vtime->state = VTIME_SYS;
+ write_seqcount_end(&vtime->seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_exit);
void vtime_account_idle(struct task_struct *tsk)
{
- cputime_t delta_cpu = get_vtime_delta(tsk);
-
- account_idle_time(delta_cpu);
+ account_idle_time(get_vtime_delta(&tsk->vtime));
}
-void arch_vtime_task_switch(struct task_struct *prev)
+void vtime_task_switch_generic(struct task_struct *prev)
{
- write_seqlock(&prev->vtime_seqlock);
- prev->vtime_snap_whence = VTIME_SLEEPING;
- write_sequnlock(&prev->vtime_seqlock);
+ struct vtime *vtime = &prev->vtime;
- write_seqlock(&current->vtime_seqlock);
- current->vtime_snap_whence = VTIME_SYS;
- current->vtime_snap = sched_clock_cpu(smp_processor_id());
- write_sequnlock(&current->vtime_seqlock);
+ write_seqcount_begin(&vtime->seqcount);
+ if (vtime->state == VTIME_IDLE)
+ vtime_account_idle(prev);
+ else
+ __vtime_account_kernel(prev, vtime);
+ vtime->state = VTIME_INACTIVE;
+ vtime->cpu = -1;
+ write_seqcount_end(&vtime->seqcount);
+
+ vtime = &current->vtime;
+
+ write_seqcount_begin(&vtime->seqcount);
+ if (is_idle_task(current))
+ vtime->state = VTIME_IDLE;
+ else if (current->flags & PF_VCPU)
+ vtime->state = VTIME_GUEST;
+ else
+ vtime->state = VTIME_SYS;
+ vtime->starttime = sched_clock();
+ vtime->cpu = smp_processor_id();
+ write_seqcount_end(&vtime->seqcount);
}
void vtime_init_idle(struct task_struct *t, int cpu)
{
+ struct vtime *vtime = &t->vtime;
unsigned long flags;
- write_seqlock_irqsave(&t->vtime_seqlock, flags);
- t->vtime_snap_whence = VTIME_SYS;
- t->vtime_snap = sched_clock_cpu(cpu);
- write_sequnlock_irqrestore(&t->vtime_seqlock, flags);
+ local_irq_save(flags);
+ write_seqcount_begin(&vtime->seqcount);
+ vtime->state = VTIME_IDLE;
+ vtime->starttime = sched_clock();
+ vtime->cpu = cpu;
+ write_seqcount_end(&vtime->seqcount);
+ local_irq_restore(flags);
}
-cputime_t task_gtime(struct task_struct *t)
+u64 task_gtime(struct task_struct *t)
{
+ struct vtime *vtime = &t->vtime;
unsigned int seq;
- cputime_t gtime;
+ u64 gtime;
+
+ if (!vtime_accounting_enabled())
+ return t->gtime;
do {
- seq = read_seqbegin(&t->vtime_seqlock);
+ seq = read_seqcount_begin(&vtime->seqcount);
gtime = t->gtime;
- if (t->flags & PF_VCPU)
- gtime += vtime_delta(t);
+ if (vtime->state == VTIME_GUEST)
+ gtime += vtime->gtime + vtime_delta(vtime);
- } while (read_seqretry(&t->vtime_seqlock, seq));
+ } while (read_seqcount_retry(&vtime->seqcount, seq));
return gtime;
}
@@ -784,69 +831,256 @@ cputime_t task_gtime(struct task_struct *t)
* add up the pending nohz execution time since the last
* cputime snapshot.
*/
-static void
-fetch_task_cputime(struct task_struct *t,
- cputime_t *u_dst, cputime_t *s_dst,
- cputime_t *u_src, cputime_t *s_src,
- cputime_t *udelta, cputime_t *sdelta)
+bool task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
{
+ struct vtime *vtime = &t->vtime;
unsigned int seq;
- unsigned long long delta;
+ u64 delta;
+ int ret;
- do {
- *udelta = 0;
- *sdelta = 0;
+ if (!vtime_accounting_enabled()) {
+ *utime = t->utime;
+ *stime = t->stime;
+ return false;
+ }
- seq = read_seqbegin(&t->vtime_seqlock);
+ do {
+ ret = false;
+ seq = read_seqcount_begin(&vtime->seqcount);
- if (u_dst)
- *u_dst = *u_src;
- if (s_dst)
- *s_dst = *s_src;
+ *utime = t->utime;
+ *stime = t->stime;
- /* Task is sleeping, nothing to add */
- if (t->vtime_snap_whence == VTIME_SLEEPING ||
- is_idle_task(t))
+ /* Task is sleeping or idle, nothing to add */
+ if (vtime->state < VTIME_SYS)
continue;
- delta = vtime_delta(t);
+ ret = true;
+ delta = vtime_delta(vtime);
/*
- * Task runs either in user or kernel space, add pending nohz time to
- * the right place.
+ * Task runs either in user (including guest) or kernel space,
+ * add pending nohz time to the right place.
*/
- if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU) {
- *udelta = delta;
- } else {
- if (t->vtime_snap_whence == VTIME_SYS)
- *sdelta = delta;
+ if (vtime->state == VTIME_SYS)
+ *stime += vtime->stime + delta;
+ else
+ *utime += vtime->utime + delta;
+ } while (read_seqcount_retry(&vtime->seqcount, seq));
+
+ return ret;
+}
+
+static int vtime_state_fetch(struct vtime *vtime, int cpu)
+{
+ int state = READ_ONCE(vtime->state);
+
+ /*
+ * We raced against a context switch, fetch the
+ * kcpustat task again.
+ */
+ if (vtime->cpu != cpu && vtime->cpu != -1)
+ return -EAGAIN;
+
+ /*
+ * Two possible things here:
+ * 1) We are seeing the scheduling out task (prev) or any past one.
+ * 2) We are seeing the scheduling in task (next) but it hasn't
+ * passed though vtime_task_switch() yet so the pending
+ * cputime of the prev task may not be flushed yet.
+ *
+ * Case 1) is ok but 2) is not. So wait for a safe VTIME state.
+ */
+ if (state == VTIME_INACTIVE)
+ return -EAGAIN;
+
+ return state;
+}
+
+static u64 kcpustat_user_vtime(struct vtime *vtime)
+{
+ if (vtime->state == VTIME_USER)
+ return vtime->utime + vtime_delta(vtime);
+ else if (vtime->state == VTIME_GUEST)
+ return vtime->gtime + vtime_delta(vtime);
+ return 0;
+}
+
+static int kcpustat_field_vtime(u64 *cpustat,
+ struct task_struct *tsk,
+ enum cpu_usage_stat usage,
+ int cpu, u64 *val)
+{
+ struct vtime *vtime = &tsk->vtime;
+ unsigned int seq;
+
+ do {
+ int state;
+
+ seq = read_seqcount_begin(&vtime->seqcount);
+
+ state = vtime_state_fetch(vtime, cpu);
+ if (state < 0)
+ return state;
+
+ *val = cpustat[usage];
+
+ /*
+ * Nice VS unnice cputime accounting may be inaccurate if
+ * the nice value has changed since the last vtime update.
+ * But proper fix would involve interrupting target on nice
+ * updates which is a no go on nohz_full (although the scheduler
+ * may still interrupt the target if rescheduling is needed...)
+ */
+ switch (usage) {
+ case CPUTIME_SYSTEM:
+ if (state == VTIME_SYS)
+ *val += vtime->stime + vtime_delta(vtime);
+ break;
+ case CPUTIME_USER:
+ if (task_nice(tsk) <= 0)
+ *val += kcpustat_user_vtime(vtime);
+ break;
+ case CPUTIME_NICE:
+ if (task_nice(tsk) > 0)
+ *val += kcpustat_user_vtime(vtime);
+ break;
+ case CPUTIME_GUEST:
+ if (state == VTIME_GUEST && task_nice(tsk) <= 0)
+ *val += vtime->gtime + vtime_delta(vtime);
+ break;
+ case CPUTIME_GUEST_NICE:
+ if (state == VTIME_GUEST && task_nice(tsk) > 0)
+ *val += vtime->gtime + vtime_delta(vtime);
+ break;
+ default:
+ break;
}
- } while (read_seqretry(&t->vtime_seqlock, seq));
+ } while (read_seqcount_retry(&vtime->seqcount, seq));
+
+ return 0;
}
+u64 kcpustat_field(struct kernel_cpustat *kcpustat,
+ enum cpu_usage_stat usage, int cpu)
+{
+ u64 *cpustat = kcpustat->cpustat;
+ u64 val = cpustat[usage];
+ struct rq *rq;
+ int err;
+
+ if (!vtime_accounting_enabled_cpu(cpu))
+ return val;
+
+ rq = cpu_rq(cpu);
+
+ for (;;) {
+ struct task_struct *curr;
+
+ rcu_read_lock();
+ curr = rcu_dereference(rq->curr);
+ if (WARN_ON_ONCE(!curr)) {
+ rcu_read_unlock();
+ return cpustat[usage];
+ }
+
+ err = kcpustat_field_vtime(cpustat, curr, usage, cpu, &val);
+ rcu_read_unlock();
+
+ if (!err)
+ return val;
-void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime)
+ cpu_relax();
+ }
+}
+EXPORT_SYMBOL_GPL(kcpustat_field);
+
+static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst,
+ const struct kernel_cpustat *src,
+ struct task_struct *tsk, int cpu)
{
- cputime_t udelta, sdelta;
+ struct vtime *vtime = &tsk->vtime;
+ unsigned int seq;
+
+ do {
+ u64 *cpustat;
+ u64 delta;
+ int state;
+
+ seq = read_seqcount_begin(&vtime->seqcount);
+
+ state = vtime_state_fetch(vtime, cpu);
+ if (state < 0)
+ return state;
- fetch_task_cputime(t, utime, stime, &t->utime,
- &t->stime, &udelta, &sdelta);
- if (utime)
- *utime += udelta;
- if (stime)
- *stime += sdelta;
+ *dst = *src;
+ cpustat = dst->cpustat;
+
+ /* Task is sleeping, dead or idle, nothing to add */
+ if (state < VTIME_SYS)
+ continue;
+
+ delta = vtime_delta(vtime);
+
+ /*
+ * Task runs either in user (including guest) or kernel space,
+ * add pending nohz time to the right place.
+ */
+ if (state == VTIME_SYS) {
+ cpustat[CPUTIME_SYSTEM] += vtime->stime + delta;
+ } else if (state == VTIME_USER) {
+ if (task_nice(tsk) > 0)
+ cpustat[CPUTIME_NICE] += vtime->utime + delta;
+ else
+ cpustat[CPUTIME_USER] += vtime->utime + delta;
+ } else {
+ WARN_ON_ONCE(state != VTIME_GUEST);
+ if (task_nice(tsk) > 0) {
+ cpustat[CPUTIME_GUEST_NICE] += vtime->gtime + delta;
+ cpustat[CPUTIME_NICE] += vtime->gtime + delta;
+ } else {
+ cpustat[CPUTIME_GUEST] += vtime->gtime + delta;
+ cpustat[CPUTIME_USER] += vtime->gtime + delta;
+ }
+ }
+ } while (read_seqcount_retry(&vtime->seqcount, seq));
+
+ return 0;
}
-void task_cputime_scaled(struct task_struct *t,
- cputime_t *utimescaled, cputime_t *stimescaled)
+void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu)
{
- cputime_t udelta, sdelta;
+ const struct kernel_cpustat *src = &kcpustat_cpu(cpu);
+ struct rq *rq;
+ int err;
+
+ if (!vtime_accounting_enabled_cpu(cpu)) {
+ *dst = *src;
+ return;
+ }
- fetch_task_cputime(t, utimescaled, stimescaled,
- &t->utimescaled, &t->stimescaled, &udelta, &sdelta);
- if (utimescaled)
- *utimescaled += cputime_to_scaled(udelta);
- if (stimescaled)
- *stimescaled += cputime_to_scaled(sdelta);
+ rq = cpu_rq(cpu);
+
+ for (;;) {
+ struct task_struct *curr;
+
+ rcu_read_lock();
+ curr = rcu_dereference(rq->curr);
+ if (WARN_ON_ONCE(!curr)) {
+ rcu_read_unlock();
+ *dst = *src;
+ return;
+ }
+
+ err = kcpustat_cpu_fetch_vtime(dst, src, curr, cpu);
+ rcu_read_unlock();
+
+ if (!err)
+ return;
+
+ cpu_relax();
+ }
}
+EXPORT_SYMBOL_GPL(kcpustat_cpu_fetch);
+
#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 5e95145088fd..319439fe1870 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Deadline Scheduling Class (SCHED_DEADLINE)
*
@@ -14,14 +15,56 @@
* Michael Trimarchi <michael@amarulasolutions.com>,
* Fabio Checconi <fchecconi@gmail.com>
*/
+
+#include <linux/cpuset.h>
+#include <linux/sched/clock.h>
+#include <uapi/linux/sched/types.h>
#include "sched.h"
+#include "pelt.h"
-#include <linux/slab.h>
+/*
+ * Default limits for DL period; on the top end we guard against small util
+ * tasks still getting ridiculously long effective runtimes, on the bottom end we
+ * guard against timer DoS.
+ */
+static unsigned int sysctl_sched_dl_period_max = 1 << 22; /* ~4 seconds */
+static unsigned int sysctl_sched_dl_period_min = 100; /* 100 us */
+#ifdef CONFIG_SYSCTL
+static const struct ctl_table sched_dl_sysctls[] = {
+ {
+ .procname = "sched_deadline_period_max_us",
+ .data = &sysctl_sched_dl_period_max,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_douintvec_minmax,
+ .extra1 = (void *)&sysctl_sched_dl_period_min,
+ },
+ {
+ .procname = "sched_deadline_period_min_us",
+ .data = &sysctl_sched_dl_period_min,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_douintvec_minmax,
+ .extra2 = (void *)&sysctl_sched_dl_period_max,
+ },
+};
-struct dl_bandwidth def_dl_bandwidth;
+static int __init sched_dl_sysctl_init(void)
+{
+ register_sysctl_init("kernel", sched_dl_sysctls);
+ return 0;
+}
+late_initcall(sched_dl_sysctl_init);
+#endif /* CONFIG_SYSCTL */
+
+static bool dl_server(struct sched_dl_entity *dl_se)
+{
+ return dl_se->dl_server;
+}
static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se)
{
+ BUG_ON(dl_server(dl_se));
return container_of(dl_se, struct task_struct, dl);
}
@@ -30,12 +73,19 @@ static inline struct rq *rq_of_dl_rq(struct dl_rq *dl_rq)
return container_of(dl_rq, struct rq, dl);
}
-static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se)
+static inline struct rq *rq_of_dl_se(struct sched_dl_entity *dl_se)
{
- struct task_struct *p = dl_task_of(dl_se);
- struct rq *rq = task_rq(p);
+ struct rq *rq = dl_se->rq;
- return &rq->dl;
+ if (!dl_server(dl_se))
+ rq = task_rq(dl_task_of(dl_se));
+
+ return rq;
+}
+
+static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se)
+{
+ return &rq_of_dl_se(dl_se)->dl;
}
static inline int on_dl_rq(struct sched_dl_entity *dl_se)
@@ -43,49 +93,440 @@ static inline int on_dl_rq(struct sched_dl_entity *dl_se)
return !RB_EMPTY_NODE(&dl_se->rb_node);
}
-static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)
+#ifdef CONFIG_RT_MUTEXES
+static inline struct sched_dl_entity *pi_of(struct sched_dl_entity *dl_se)
{
- struct sched_dl_entity *dl_se = &p->dl;
+ return dl_se->pi_se;
+}
+
+static inline bool is_dl_boosted(struct sched_dl_entity *dl_se)
+{
+ return pi_of(dl_se) != dl_se;
+}
+#else /* !CONFIG_RT_MUTEXES: */
+static inline struct sched_dl_entity *pi_of(struct sched_dl_entity *dl_se)
+{
+ return dl_se;
+}
+
+static inline bool is_dl_boosted(struct sched_dl_entity *dl_se)
+{
+ return false;
+}
+#endif /* !CONFIG_RT_MUTEXES */
+
+static inline struct dl_bw *dl_bw_of(int i)
+{
+ RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
+ "sched RCU must be held");
+ return &cpu_rq(i)->rd->dl_bw;
+}
+
+static inline int dl_bw_cpus(int i)
+{
+ struct root_domain *rd = cpu_rq(i)->rd;
+
+ RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
+ "sched RCU must be held");
+
+ return cpumask_weight_and(rd->span, cpu_active_mask);
+}
+
+static inline unsigned long __dl_bw_capacity(const struct cpumask *mask)
+{
+ unsigned long cap = 0;
+ int i;
+
+ for_each_cpu_and(i, mask, cpu_active_mask)
+ cap += arch_scale_cpu_capacity(i);
+
+ return cap;
+}
+
+/*
+ * XXX Fix: If 'rq->rd == def_root_domain' perform AC against capacity
+ * of the CPU the task is running on rather rd's \Sum CPU capacity.
+ */
+static inline unsigned long dl_bw_capacity(int i)
+{
+ if (!sched_asym_cpucap_active() &&
+ arch_scale_cpu_capacity(i) == SCHED_CAPACITY_SCALE) {
+ return dl_bw_cpus(i) << SCHED_CAPACITY_SHIFT;
+ } else {
+ RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
+ "sched RCU must be held");
+
+ return __dl_bw_capacity(cpu_rq(i)->rd->span);
+ }
+}
+
+bool dl_bw_visited(int cpu, u64 cookie)
+{
+ struct root_domain *rd = cpu_rq(cpu)->rd;
+
+ if (rd->visit_cookie == cookie)
+ return true;
+
+ rd->visit_cookie = cookie;
+ return false;
+}
+
+static inline
+void __dl_update(struct dl_bw *dl_b, s64 bw)
+{
+ struct root_domain *rd = container_of(dl_b, struct root_domain, dl_bw);
+ int i;
+
+ RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
+ "sched RCU must be held");
+ for_each_cpu_and(i, rd->span, cpu_active_mask) {
+ struct rq *rq = cpu_rq(i);
+
+ rq->dl.extra_bw += bw;
+ }
+}
+
+static inline
+void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
+{
+ dl_b->total_bw -= tsk_bw;
+ __dl_update(dl_b, (s32)tsk_bw / cpus);
+}
+
+static inline
+void __dl_add(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
+{
+ dl_b->total_bw += tsk_bw;
+ __dl_update(dl_b, -((s32)tsk_bw / cpus));
+}
+
+static inline bool
+__dl_overflow(struct dl_bw *dl_b, unsigned long cap, u64 old_bw, u64 new_bw)
+{
+ return dl_b->bw != -1 &&
+ cap_scale(dl_b->bw, cap) < dl_b->total_bw - old_bw + new_bw;
+}
+
+static inline
+void __add_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
+{
+ u64 old = dl_rq->running_bw;
+
+ lockdep_assert_rq_held(rq_of_dl_rq(dl_rq));
+ dl_rq->running_bw += dl_bw;
+ WARN_ON_ONCE(dl_rq->running_bw < old); /* overflow */
+ WARN_ON_ONCE(dl_rq->running_bw > dl_rq->this_bw);
+ /* kick cpufreq (see the comment in kernel/sched/sched.h). */
+ cpufreq_update_util(rq_of_dl_rq(dl_rq), 0);
+}
+
+static inline
+void __sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
+{
+ u64 old = dl_rq->running_bw;
+
+ lockdep_assert_rq_held(rq_of_dl_rq(dl_rq));
+ dl_rq->running_bw -= dl_bw;
+ WARN_ON_ONCE(dl_rq->running_bw > old); /* underflow */
+ if (dl_rq->running_bw > old)
+ dl_rq->running_bw = 0;
+ /* kick cpufreq (see the comment in kernel/sched/sched.h). */
+ cpufreq_update_util(rq_of_dl_rq(dl_rq), 0);
+}
+
+static inline
+void __add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
+{
+ u64 old = dl_rq->this_bw;
+
+ lockdep_assert_rq_held(rq_of_dl_rq(dl_rq));
+ dl_rq->this_bw += dl_bw;
+ WARN_ON_ONCE(dl_rq->this_bw < old); /* overflow */
+}
+
+static inline
+void __sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
+{
+ u64 old = dl_rq->this_bw;
+
+ lockdep_assert_rq_held(rq_of_dl_rq(dl_rq));
+ dl_rq->this_bw -= dl_bw;
+ WARN_ON_ONCE(dl_rq->this_bw > old); /* underflow */
+ if (dl_rq->this_bw > old)
+ dl_rq->this_bw = 0;
+ WARN_ON_ONCE(dl_rq->running_bw > dl_rq->this_bw);
+}
+
+static inline
+void add_rq_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+ if (!dl_entity_is_special(dl_se))
+ __add_rq_bw(dl_se->dl_bw, dl_rq);
+}
+
+static inline
+void sub_rq_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+ if (!dl_entity_is_special(dl_se))
+ __sub_rq_bw(dl_se->dl_bw, dl_rq);
+}
+
+static inline
+void add_running_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+ if (!dl_entity_is_special(dl_se))
+ __add_running_bw(dl_se->dl_bw, dl_rq);
+}
+
+static inline
+void sub_running_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+ if (!dl_entity_is_special(dl_se))
+ __sub_running_bw(dl_se->dl_bw, dl_rq);
+}
+
+static void dl_rq_change_utilization(struct rq *rq, struct sched_dl_entity *dl_se, u64 new_bw)
+{
+ if (dl_se->dl_non_contending) {
+ sub_running_bw(dl_se, &rq->dl);
+ dl_se->dl_non_contending = 0;
+
+ /*
+ * If the timer handler is currently running and the
+ * timer cannot be canceled, inactive_task_timer()
+ * will see that dl_not_contending is not set, and
+ * will not touch the rq's active utilization,
+ * so we are still safe.
+ */
+ if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1) {
+ if (!dl_server(dl_se))
+ put_task_struct(dl_task_of(dl_se));
+ }
+ }
+ __sub_rq_bw(dl_se->dl_bw, &rq->dl);
+ __add_rq_bw(new_bw, &rq->dl);
+}
+
+static __always_inline
+void cancel_dl_timer(struct sched_dl_entity *dl_se, struct hrtimer *timer)
+{
+ /*
+ * If the timer callback was running (hrtimer_try_to_cancel == -1),
+ * it will eventually call put_task_struct().
+ */
+ if (hrtimer_try_to_cancel(timer) == 1 && !dl_server(dl_se))
+ put_task_struct(dl_task_of(dl_se));
+}
+
+static __always_inline
+void cancel_replenish_timer(struct sched_dl_entity *dl_se)
+{
+ cancel_dl_timer(dl_se, &dl_se->dl_timer);
+}
+
+static __always_inline
+void cancel_inactive_timer(struct sched_dl_entity *dl_se)
+{
+ cancel_dl_timer(dl_se, &dl_se->inactive_timer);
+}
+
+static void dl_change_utilization(struct task_struct *p, u64 new_bw)
+{
+ WARN_ON_ONCE(p->dl.flags & SCHED_FLAG_SUGOV);
+
+ if (task_on_rq_queued(p))
+ return;
+
+ dl_rq_change_utilization(task_rq(p), &p->dl, new_bw);
+}
+
+static void __dl_clear_params(struct sched_dl_entity *dl_se);
+
+/*
+ * The utilization of a task cannot be immediately removed from
+ * the rq active utilization (running_bw) when the task blocks.
+ * Instead, we have to wait for the so called "0-lag time".
+ *
+ * If a task blocks before the "0-lag time", a timer (the inactive
+ * timer) is armed, and running_bw is decreased when the timer
+ * fires.
+ *
+ * If the task wakes up again before the inactive timer fires,
+ * the timer is canceled, whereas if the task wakes up after the
+ * inactive timer fired (and running_bw has been decreased) the
+ * task's utilization has to be added to running_bw again.
+ * A flag in the deadline scheduling entity (dl_non_contending)
+ * is used to avoid race conditions between the inactive timer handler
+ * and task wakeups.
+ *
+ * The following diagram shows how running_bw is updated. A task is
+ * "ACTIVE" when its utilization contributes to running_bw; an
+ * "ACTIVE contending" task is in the TASK_RUNNING state, while an
+ * "ACTIVE non contending" task is a blocked task for which the "0-lag time"
+ * has not passed yet. An "INACTIVE" task is a task for which the "0-lag"
+ * time already passed, which does not contribute to running_bw anymore.
+ * +------------------+
+ * wakeup | ACTIVE |
+ * +------------------>+ contending |
+ * | add_running_bw | |
+ * | +----+------+------+
+ * | | ^
+ * | dequeue | |
+ * +--------+-------+ | |
+ * | | t >= 0-lag | | wakeup
+ * | INACTIVE |<---------------+ |
+ * | | sub_running_bw | |
+ * +--------+-------+ | |
+ * ^ | |
+ * | t < 0-lag | |
+ * | | |
+ * | V |
+ * | +----+------+------+
+ * | sub_running_bw | ACTIVE |
+ * +-------------------+ |
+ * inactive timer | non contending |
+ * fired +------------------+
+ *
+ * The task_non_contending() function is invoked when a task
+ * blocks, and checks if the 0-lag time already passed or
+ * not (in the first case, it directly updates running_bw;
+ * in the second case, it arms the inactive timer).
+ *
+ * The task_contending() function is invoked when a task wakes
+ * up, and checks if the task is still in the "ACTIVE non contending"
+ * state or not (in the second case, it updates running_bw).
+ */
+static void task_non_contending(struct sched_dl_entity *dl_se, bool dl_task)
+{
+ struct hrtimer *timer = &dl_se->inactive_timer;
+ struct rq *rq = rq_of_dl_se(dl_se);
+ struct dl_rq *dl_rq = &rq->dl;
+ s64 zerolag_time;
+
+ /*
+ * If this is a non-deadline task that has been boosted,
+ * do nothing
+ */
+ if (dl_se->dl_runtime == 0)
+ return;
+
+ if (dl_entity_is_special(dl_se))
+ return;
+
+ WARN_ON(dl_se->dl_non_contending);
+
+ zerolag_time = dl_se->deadline -
+ div64_long((dl_se->runtime * dl_se->dl_period),
+ dl_se->dl_runtime);
+
+ /*
+ * Using relative times instead of the absolute "0-lag time"
+ * allows to simplify the code
+ */
+ zerolag_time -= rq_clock(rq);
+
+ /*
+ * If the "0-lag time" already passed, decrease the active
+ * utilization now, instead of starting a timer
+ */
+ if ((zerolag_time < 0) || hrtimer_active(&dl_se->inactive_timer)) {
+ if (dl_server(dl_se)) {
+ sub_running_bw(dl_se, dl_rq);
+ } else {
+ struct task_struct *p = dl_task_of(dl_se);
+
+ if (dl_task)
+ sub_running_bw(dl_se, dl_rq);
+
+ if (!dl_task || READ_ONCE(p->__state) == TASK_DEAD) {
+ struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
+
+ if (READ_ONCE(p->__state) == TASK_DEAD)
+ sub_rq_bw(dl_se, &rq->dl);
+ raw_spin_lock(&dl_b->lock);
+ __dl_sub(dl_b, dl_se->dl_bw, dl_bw_cpus(task_cpu(p)));
+ raw_spin_unlock(&dl_b->lock);
+ __dl_clear_params(dl_se);
+ }
+ }
+
+ return;
+ }
- return dl_rq->rb_leftmost == &dl_se->rb_node;
+ dl_se->dl_non_contending = 1;
+ if (!dl_server(dl_se))
+ get_task_struct(dl_task_of(dl_se));
+
+ hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL_HARD);
}
-void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime)
+static void task_contending(struct sched_dl_entity *dl_se, int flags)
{
- raw_spin_lock_init(&dl_b->dl_runtime_lock);
- dl_b->dl_period = period;
- dl_b->dl_runtime = runtime;
+ struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+
+ /*
+ * If this is a non-deadline task that has been boosted,
+ * do nothing
+ */
+ if (dl_se->dl_runtime == 0)
+ return;
+
+ if (flags & ENQUEUE_MIGRATED)
+ add_rq_bw(dl_se, dl_rq);
+
+ if (dl_se->dl_non_contending) {
+ dl_se->dl_non_contending = 0;
+ /*
+ * If the timer handler is currently running and the
+ * timer cannot be canceled, inactive_task_timer()
+ * will see that dl_not_contending is not set, and
+ * will not touch the rq's active utilization,
+ * so we are still safe.
+ */
+ cancel_inactive_timer(dl_se);
+ } else {
+ /*
+ * Since "dl_non_contending" is not set, the
+ * task's utilization has already been removed from
+ * active utilization (either when the task blocked,
+ * when the "inactive timer" fired).
+ * So, add it back.
+ */
+ add_running_bw(dl_se, dl_rq);
+ }
}
+static inline int is_leftmost(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+ return rb_first_cached(&dl_rq->root) == &dl_se->rb_node;
+}
+
+static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq);
+
void init_dl_bw(struct dl_bw *dl_b)
{
raw_spin_lock_init(&dl_b->lock);
- raw_spin_lock(&def_dl_bandwidth.dl_runtime_lock);
if (global_rt_runtime() == RUNTIME_INF)
dl_b->bw = -1;
else
dl_b->bw = to_ratio(global_rt_period(), global_rt_runtime());
- raw_spin_unlock(&def_dl_bandwidth.dl_runtime_lock);
dl_b->total_bw = 0;
}
void init_dl_rq(struct dl_rq *dl_rq)
{
- dl_rq->rb_root = RB_ROOT;
+ dl_rq->root = RB_ROOT_CACHED;
-#ifdef CONFIG_SMP
/* zero means no -deadline tasks */
dl_rq->earliest_dl.curr = dl_rq->earliest_dl.next = 0;
- dl_rq->dl_nr_migratory = 0;
dl_rq->overloaded = 0;
- dl_rq->pushable_dl_tasks_root = RB_ROOT;
-#else
- init_dl_bw(&dl_rq->dl_bw);
-#endif
-}
+ dl_rq->pushable_dl_tasks_root = RB_ROOT_CACHED;
-#ifdef CONFIG_SMP
+ dl_rq->running_bw = 0;
+ dl_rq->this_bw = 0;
+ init_dl_rq_bw_ratio(dl_rq);
+}
static inline int dl_overloaded(struct rq *rq)
{
@@ -117,37 +558,17 @@ static inline void dl_clear_overload(struct rq *rq)
cpumask_clear_cpu(rq->cpu, rq->rd->dlo_mask);
}
-static void update_dl_migration(struct dl_rq *dl_rq)
-{
- if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_running > 1) {
- if (!dl_rq->overloaded) {
- dl_set_overload(rq_of_dl_rq(dl_rq));
- dl_rq->overloaded = 1;
- }
- } else if (dl_rq->overloaded) {
- dl_clear_overload(rq_of_dl_rq(dl_rq));
- dl_rq->overloaded = 0;
- }
-}
+#define __node_2_pdl(node) \
+ rb_entry((node), struct task_struct, pushable_dl_tasks)
-static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+static inline bool __pushable_less(struct rb_node *a, const struct rb_node *b)
{
- struct task_struct *p = dl_task_of(dl_se);
-
- if (p->nr_cpus_allowed > 1)
- dl_rq->dl_nr_migratory++;
-
- update_dl_migration(dl_rq);
+ return dl_entity_preempt(&__node_2_pdl(a)->dl, &__node_2_pdl(b)->dl);
}
-static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+static inline int has_pushable_dl_tasks(struct rq *rq)
{
- struct task_struct *p = dl_task_of(dl_se);
-
- if (p->nr_cpus_allowed > 1)
- dl_rq->dl_nr_migratory--;
-
- update_dl_migration(dl_rq);
+ return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root.rb_root);
}
/*
@@ -156,92 +577,91 @@ static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
*/
static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p)
{
- struct dl_rq *dl_rq = &rq->dl;
- struct rb_node **link = &dl_rq->pushable_dl_tasks_root.rb_node;
- struct rb_node *parent = NULL;
- struct task_struct *entry;
- int leftmost = 1;
-
- BUG_ON(!RB_EMPTY_NODE(&p->pushable_dl_tasks));
-
- while (*link) {
- parent = *link;
- entry = rb_entry(parent, struct task_struct,
- pushable_dl_tasks);
- if (dl_entity_preempt(&p->dl, &entry->dl))
- link = &parent->rb_left;
- else {
- link = &parent->rb_right;
- leftmost = 0;
- }
- }
+ struct rb_node *leftmost;
+
+ WARN_ON_ONCE(!RB_EMPTY_NODE(&p->pushable_dl_tasks));
+ leftmost = rb_add_cached(&p->pushable_dl_tasks,
+ &rq->dl.pushable_dl_tasks_root,
+ __pushable_less);
if (leftmost)
- dl_rq->pushable_dl_tasks_leftmost = &p->pushable_dl_tasks;
+ rq->dl.earliest_dl.next = p->dl.deadline;
- rb_link_node(&p->pushable_dl_tasks, parent, link);
- rb_insert_color(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root);
+ if (!rq->dl.overloaded) {
+ dl_set_overload(rq);
+ rq->dl.overloaded = 1;
+ }
}
static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)
{
struct dl_rq *dl_rq = &rq->dl;
+ struct rb_root_cached *root = &dl_rq->pushable_dl_tasks_root;
+ struct rb_node *leftmost;
if (RB_EMPTY_NODE(&p->pushable_dl_tasks))
return;
- if (dl_rq->pushable_dl_tasks_leftmost == &p->pushable_dl_tasks) {
- struct rb_node *next_node;
-
- next_node = rb_next(&p->pushable_dl_tasks);
- dl_rq->pushable_dl_tasks_leftmost = next_node;
- }
+ leftmost = rb_erase_cached(&p->pushable_dl_tasks, root);
+ if (leftmost)
+ dl_rq->earliest_dl.next = __node_2_pdl(leftmost)->dl.deadline;
- rb_erase(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root);
RB_CLEAR_NODE(&p->pushable_dl_tasks);
-}
-static inline int has_pushable_dl_tasks(struct rq *rq)
-{
- return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root);
+ if (!has_pushable_dl_tasks(rq) && rq->dl.overloaded) {
+ dl_clear_overload(rq);
+ rq->dl.overloaded = 0;
+ }
}
static int push_dl_task(struct rq *rq);
static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
{
- return dl_task(prev);
+ return rq->online && dl_task(prev);
}
-static inline void set_post_schedule(struct rq *rq)
+static DEFINE_PER_CPU(struct balance_callback, dl_push_head);
+static DEFINE_PER_CPU(struct balance_callback, dl_pull_head);
+
+static void push_dl_tasks(struct rq *);
+static void pull_dl_task(struct rq *);
+
+static inline void deadline_queue_push_tasks(struct rq *rq)
{
- rq->post_schedule = has_pushable_dl_tasks(rq);
+ if (!has_pushable_dl_tasks(rq))
+ return;
+
+ queue_balance_callback(rq, &per_cpu(dl_push_head, rq->cpu), push_dl_tasks);
+}
+
+static inline void deadline_queue_pull_task(struct rq *rq)
+{
+ queue_balance_callback(rq, &per_cpu(dl_pull_head, rq->cpu), pull_dl_task);
}
static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq);
-static void dl_task_offline_migration(struct rq *rq, struct task_struct *p)
+static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p)
{
struct rq *later_rq = NULL;
- bool fallback = false;
+ struct dl_bw *dl_b;
later_rq = find_lock_later_rq(p, rq);
-
if (!later_rq) {
int cpu;
/*
* If we cannot preempt any rq, fall back to pick any
- * online cpu.
+ * online CPU:
*/
- fallback = true;
- cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p));
+ cpu = cpumask_any_and(cpu_active_mask, p->cpus_ptr);
if (cpu >= nr_cpu_ids) {
/*
- * Fail to find any suitable cpu.
+ * Failed to find any suitable CPU.
* The task will never come back!
*/
- BUG_ON(dl_bandwidth_enabled());
+ WARN_ON_ONCE(dl_bandwidth_enabled());
/*
* If admission control is disabled we
@@ -254,57 +674,66 @@ static void dl_task_offline_migration(struct rq *rq, struct task_struct *p)
double_lock_balance(rq, later_rq);
}
- deactivate_task(rq, p, 0);
- set_task_cpu(p, later_rq->cpu);
- activate_task(later_rq, p, ENQUEUE_REPLENISH);
-
- if (!fallback)
- resched_curr(later_rq);
-
- double_unlock_balance(rq, later_rq);
-}
+ if (p->dl.dl_non_contending || p->dl.dl_throttled) {
+ /*
+ * Inactive timer is armed (or callback is running, but
+ * waiting for us to release rq locks). In any case, when it
+ * will fire (or continue), it will see running_bw of this
+ * task migrated to later_rq (and correctly handle it).
+ */
+ sub_running_bw(&p->dl, &rq->dl);
+ sub_rq_bw(&p->dl, &rq->dl);
-#else
+ add_rq_bw(&p->dl, &later_rq->dl);
+ add_running_bw(&p->dl, &later_rq->dl);
+ } else {
+ sub_rq_bw(&p->dl, &rq->dl);
+ add_rq_bw(&p->dl, &later_rq->dl);
+ }
-static inline
-void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p)
-{
-}
+ /*
+ * And we finally need to fix up root_domain(s) bandwidth accounting,
+ * since p is still hanging out in the old (now moved to default) root
+ * domain.
+ */
+ dl_b = &rq->rd->dl_bw;
+ raw_spin_lock(&dl_b->lock);
+ __dl_sub(dl_b, p->dl.dl_bw, cpumask_weight(rq->rd->span));
+ raw_spin_unlock(&dl_b->lock);
-static inline
-void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)
-{
-}
+ dl_b = &later_rq->rd->dl_bw;
+ raw_spin_lock(&dl_b->lock);
+ __dl_add(dl_b, p->dl.dl_bw, cpumask_weight(later_rq->rd->span));
+ raw_spin_unlock(&dl_b->lock);
-static inline
-void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
-{
-}
+ set_task_cpu(p, later_rq->cpu);
+ double_unlock_balance(later_rq, rq);
-static inline
-void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
-{
+ return later_rq;
}
-static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
-{
- return false;
-}
+static void
+enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags);
+static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
+static void dequeue_dl_entity(struct sched_dl_entity *dl_se, int flags);
+static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, int flags);
-static inline int pull_dl_task(struct rq *rq)
+static inline void replenish_dl_new_period(struct sched_dl_entity *dl_se,
+ struct rq *rq)
{
- return 0;
-}
+ /* for non-boosted task, pi_of(dl_se) == dl_se */
+ dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
+ dl_se->runtime = pi_of(dl_se)->dl_runtime;
-static inline void set_post_schedule(struct rq *rq)
-{
+ /*
+ * If it is a deferred reservation, and the server
+ * is not handling an starvation case, defer it.
+ */
+ if (dl_se->dl_defer && !dl_se->dl_defer_running) {
+ dl_se->dl_throttled = 1;
+ dl_se->dl_defer_armed = 1;
+ }
}
-#endif /* CONFIG_SMP */
-
-static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
-static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags);
-static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
- int flags);
/*
* We are being explicitly informed that a new instance is starting,
@@ -318,24 +747,35 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
* one, and to (try to!) reconcile itself with its own scheduling
* parameters.
*/
-static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
- struct sched_dl_entity *pi_se)
+static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
{
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
struct rq *rq = rq_of_dl_rq(dl_rq);
- WARN_ON(!dl_se->dl_new || dl_se->dl_throttled);
+ update_rq_clock(rq);
+
+ WARN_ON(is_dl_boosted(dl_se));
+ WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline));
+
+ /*
+ * We are racing with the deadline timer. So, do nothing because
+ * the deadline timer handler will take care of properly recharging
+ * the runtime and postponing the deadline
+ */
+ if (dl_se->dl_throttled)
+ return;
/*
* We use the regular wall clock time to set deadlines in the
* future; in fact, we must consider execution overheads (time
* spent on hardirq context, etc.).
*/
- dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
- dl_se->runtime = pi_se->dl_runtime;
- dl_se->dl_new = 0;
+ replenish_dl_new_period(dl_se, rq);
}
+static int start_dl_timer(struct sched_dl_entity *dl_se);
+static bool dl_entity_overflow(struct sched_dl_entity *dl_se, u64 t);
+
/*
* Pure Earliest Deadline First (EDF) scheduling does not deal with the
* possibility of a entity lasting more than what it declared, and thus
@@ -354,23 +794,32 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
* could happen are, typically, a entity voluntarily trying to overcome its
* runtime, or it just underestimated it during sched_setattr().
*/
-static void replenish_dl_entity(struct sched_dl_entity *dl_se,
- struct sched_dl_entity *pi_se)
+static void replenish_dl_entity(struct sched_dl_entity *dl_se)
{
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
struct rq *rq = rq_of_dl_rq(dl_rq);
- BUG_ON(pi_se->dl_runtime <= 0);
+ WARN_ON_ONCE(pi_of(dl_se)->dl_runtime <= 0);
/*
* This could be the case for a !-dl task that is boosted.
* Just go with full inherited parameters.
+ *
+ * Or, it could be the case of a deferred reservation that
+ * was not able to consume its runtime in background and
+ * reached this point with current u > U.
+ *
+ * In both cases, set a new period.
*/
- if (dl_se->dl_deadline == 0) {
- dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
- dl_se->runtime = pi_se->dl_runtime;
+ if (dl_se->dl_deadline == 0 ||
+ (dl_se->dl_defer_armed && dl_entity_overflow(dl_se, rq_clock(rq)))) {
+ dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
+ dl_se->runtime = pi_of(dl_se)->dl_runtime;
}
+ if (dl_se->dl_yielded && dl_se->runtime > 0)
+ dl_se->runtime = 0;
+
/*
* We keep moving the deadline away until we get some
* available runtime for the entity. This ensures correct
@@ -378,8 +827,8 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
* arbitrary large.
*/
while (dl_se->runtime <= 0) {
- dl_se->deadline += pi_se->dl_period;
- dl_se->runtime += pi_se->dl_runtime;
+ dl_se->deadline += pi_of(dl_se)->dl_period;
+ dl_se->runtime += pi_of(dl_se)->dl_runtime;
}
/*
@@ -392,15 +841,52 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
* entity.
*/
if (dl_time_before(dl_se->deadline, rq_clock(rq))) {
- printk_deferred_once("sched: DL replenish lagged to much\n");
- dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
- dl_se->runtime = pi_se->dl_runtime;
+ printk_deferred_once("sched: DL replenish lagged too much\n");
+ replenish_dl_new_period(dl_se, rq);
}
if (dl_se->dl_yielded)
dl_se->dl_yielded = 0;
if (dl_se->dl_throttled)
dl_se->dl_throttled = 0;
+
+ /*
+ * If this is the replenishment of a deferred reservation,
+ * clear the flag and return.
+ */
+ if (dl_se->dl_defer_armed) {
+ dl_se->dl_defer_armed = 0;
+ return;
+ }
+
+ /*
+ * A this point, if the deferred server is not armed, and the deadline
+ * is in the future, if it is not running already, throttle the server
+ * and arm the defer timer.
+ */
+ if (dl_se->dl_defer && !dl_se->dl_defer_running &&
+ dl_time_before(rq_clock(dl_se->rq), dl_se->deadline - dl_se->runtime)) {
+ if (!is_dl_boosted(dl_se)) {
+
+ /*
+ * Set dl_se->dl_defer_armed and dl_throttled variables to
+ * inform the start_dl_timer() that this is a deferred
+ * activation.
+ */
+ dl_se->dl_defer_armed = 1;
+ dl_se->dl_throttled = 1;
+ if (!start_dl_timer(dl_se)) {
+ /*
+ * If for whatever reason (delays), a previous timer was
+ * queued but not serviced, cancel it and clean the
+ * deferrable server variables intended for start_dl_timer().
+ */
+ hrtimer_try_to_cancel(&dl_se->dl_timer);
+ dl_se->dl_defer_armed = 0;
+ dl_se->dl_throttled = 0;
+ }
+ }
+ }
}
/*
@@ -415,20 +901,19 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
* refill the runtime and set the deadline a period in the future,
* because keeping the current (absolute) deadline of the task would
* result in breaking guarantees promised to other tasks (refer to
- * Documentation/scheduler/sched-deadline.txt for more informations).
+ * Documentation/scheduler/sched-deadline.rst for more information).
*
* This function returns true if:
*
- * runtime / (deadline - t) > dl_runtime / dl_period ,
+ * runtime / (deadline - t) > dl_runtime / dl_deadline ,
*
* IOW we can't recycle current parameters.
*
- * Notice that the bandwidth check is done against the period. For
+ * Notice that the bandwidth check is done against the deadline. For
* task with deadline equal to period this is the same of using
- * dl_deadline instead of dl_period in the equation above.
+ * dl_period instead of dl_deadline in the equation above.
*/
-static bool dl_entity_overflow(struct sched_dl_entity *dl_se,
- struct sched_dl_entity *pi_se, u64 t)
+static bool dl_entity_overflow(struct sched_dl_entity *dl_se, u64 t)
{
u64 left, right;
@@ -450,72 +935,165 @@ static bool dl_entity_overflow(struct sched_dl_entity *dl_se,
* of anything below microseconds resolution is actually fiction
* (but still we want to give the user that illusion >;).
*/
- left = (pi_se->dl_period >> DL_SCALE) * (dl_se->runtime >> DL_SCALE);
+ left = (pi_of(dl_se)->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE);
right = ((dl_se->deadline - t) >> DL_SCALE) *
- (pi_se->dl_runtime >> DL_SCALE);
+ (pi_of(dl_se)->dl_runtime >> DL_SCALE);
return dl_time_before(right, left);
}
/*
- * When a -deadline entity is queued back on the runqueue, its runtime and
- * deadline might need updating.
+ * Revised wakeup rule [1]: For self-suspending tasks, rather then
+ * re-initializing task's runtime and deadline, the revised wakeup
+ * rule adjusts the task's runtime to avoid the task to overrun its
+ * density.
+ *
+ * Reasoning: a task may overrun the density if:
+ * runtime / (deadline - t) > dl_runtime / dl_deadline
+ *
+ * Therefore, runtime can be adjusted to:
+ * runtime = (dl_runtime / dl_deadline) * (deadline - t)
*
- * The policy here is that we update the deadline of the entity only if:
- * - the current deadline is in the past,
- * - using the remaining runtime with the current deadline would make
- * the entity exceed its bandwidth.
+ * In such way that runtime will be equal to the maximum density
+ * the task can use without breaking any rule.
+ *
+ * [1] Luca Abeni, Giuseppe Lipari, and Juri Lelli. 2015. Constant
+ * bandwidth server revisited. SIGBED Rev. 11, 4 (January 2015), 19-24.
*/
-static void update_dl_entity(struct sched_dl_entity *dl_se,
- struct sched_dl_entity *pi_se)
+static void
+update_dl_revised_wakeup(struct sched_dl_entity *dl_se, struct rq *rq)
{
- struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
- struct rq *rq = rq_of_dl_rq(dl_rq);
+ u64 laxity = dl_se->deadline - rq_clock(rq);
/*
- * The arrival of a new instance needs special treatment, i.e.,
- * the actual scheduling parameters have to be "renewed".
+ * If the task has deadline < period, and the deadline is in the past,
+ * it should already be throttled before this check.
+ *
+ * See update_dl_entity() comments for further details.
*/
- if (dl_se->dl_new) {
- setup_new_dl_entity(dl_se, pi_se);
- return;
- }
+ WARN_ON(dl_time_before(dl_se->deadline, rq_clock(rq)));
+
+ dl_se->runtime = (dl_se->dl_density * laxity) >> BW_SHIFT;
+}
+
+/*
+ * Regarding the deadline, a task with implicit deadline has a relative
+ * deadline == relative period. A task with constrained deadline has a
+ * relative deadline <= relative period.
+ *
+ * We support constrained deadline tasks. However, there are some restrictions
+ * applied only for tasks which do not have an implicit deadline. See
+ * update_dl_entity() to know more about such restrictions.
+ *
+ * The dl_is_implicit() returns true if the task has an implicit deadline.
+ */
+static inline bool dl_is_implicit(struct sched_dl_entity *dl_se)
+{
+ return dl_se->dl_deadline == dl_se->dl_period;
+}
+
+/*
+ * When a deadline entity is placed in the runqueue, its runtime and deadline
+ * might need to be updated. This is done by a CBS wake up rule. There are two
+ * different rules: 1) the original CBS; and 2) the Revisited CBS.
+ *
+ * When the task is starting a new period, the Original CBS is used. In this
+ * case, the runtime is replenished and a new absolute deadline is set.
+ *
+ * When a task is queued before the begin of the next period, using the
+ * remaining runtime and deadline could make the entity to overflow, see
+ * dl_entity_overflow() to find more about runtime overflow. When such case
+ * is detected, the runtime and deadline need to be updated.
+ *
+ * If the task has an implicit deadline, i.e., deadline == period, the Original
+ * CBS is applied. The runtime is replenished and a new absolute deadline is
+ * set, as in the previous cases.
+ *
+ * However, the Original CBS does not work properly for tasks with
+ * deadline < period, which are said to have a constrained deadline. By
+ * applying the Original CBS, a constrained deadline task would be able to run
+ * runtime/deadline in a period. With deadline < period, the task would
+ * overrun the runtime/period allowed bandwidth, breaking the admission test.
+ *
+ * In order to prevent this misbehave, the Revisited CBS is used for
+ * constrained deadline tasks when a runtime overflow is detected. In the
+ * Revisited CBS, rather than replenishing & setting a new absolute deadline,
+ * the remaining runtime of the task is reduced to avoid runtime overflow.
+ * Please refer to the comments update_dl_revised_wakeup() function to find
+ * more about the Revised CBS rule.
+ */
+static void update_dl_entity(struct sched_dl_entity *dl_se)
+{
+ struct rq *rq = rq_of_dl_se(dl_se);
if (dl_time_before(dl_se->deadline, rq_clock(rq)) ||
- dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) {
- dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
- dl_se->runtime = pi_se->dl_runtime;
+ dl_entity_overflow(dl_se, rq_clock(rq))) {
+
+ if (unlikely(!dl_is_implicit(dl_se) &&
+ !dl_time_before(dl_se->deadline, rq_clock(rq)) &&
+ !is_dl_boosted(dl_se))) {
+ update_dl_revised_wakeup(dl_se, rq);
+ return;
+ }
+
+ replenish_dl_new_period(dl_se, rq);
+ } else if (dl_server(dl_se) && dl_se->dl_defer) {
+ /*
+ * The server can still use its previous deadline, so check if
+ * it left the dl_defer_running state.
+ */
+ if (!dl_se->dl_defer_running) {
+ dl_se->dl_defer_armed = 1;
+ dl_se->dl_throttled = 1;
+ }
}
}
+static inline u64 dl_next_period(struct sched_dl_entity *dl_se)
+{
+ return dl_se->deadline - dl_se->dl_deadline + dl_se->dl_period;
+}
+
/*
* If the entity depleted all its runtime, and if we want it to sleep
* while waiting for some new execution time to become available, we
- * set the bandwidth enforcement timer to the replenishment instant
+ * set the bandwidth replenishment timer to the replenishment instant
* and try to activate it.
*
* Notice that it is important for the caller to know if the timer
* actually started or not (i.e., the replenishment instant is in
* the future or in the past).
*/
-static int start_dl_timer(struct sched_dl_entity *dl_se, bool boosted)
+static int start_dl_timer(struct sched_dl_entity *dl_se)
{
+ struct hrtimer *timer = &dl_se->dl_timer;
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
struct rq *rq = rq_of_dl_rq(dl_rq);
ktime_t now, act;
- ktime_t soft, hard;
- unsigned long range;
s64 delta;
- if (boosted)
- return 0;
+ lockdep_assert_rq_held(rq);
+
/*
* We want the timer to fire at the deadline, but considering
* that it is actually coming from rq->clock and not from
* hrtimer's time base reading.
+ *
+ * The deferred reservation will have its timer set to
+ * (deadline - runtime). At that point, the CBS rule will decide
+ * if the current deadline can be used, or if a replenishment is
+ * required to avoid add too much pressure on the system
+ * (current u > U).
*/
- act = ns_to_ktime(dl_se->deadline);
- now = hrtimer_cb_get_time(&dl_se->dl_timer);
+ if (dl_se->dl_defer_armed) {
+ WARN_ON_ONCE(!dl_se->dl_throttled);
+ act = ns_to_ktime(dl_se->deadline - dl_se->runtime);
+ } else {
+ /* act = deadline - rel-deadline + period */
+ act = ns_to_ktime(dl_next_period(dl_se));
+ }
+
+ now = hrtimer_cb_get_time(timer);
delta = ktime_to_ns(now) - rq_clock(rq);
act = ktime_add_ns(act, delta);
@@ -527,15 +1105,99 @@ static int start_dl_timer(struct sched_dl_entity *dl_se, bool boosted)
if (ktime_us_delta(act, now) < 0)
return 0;
- hrtimer_set_expires(&dl_se->dl_timer, act);
+ /*
+ * !enqueued will guarantee another callback; even if one is already in
+ * progress. This ensures a balanced {get,put}_task_struct().
+ *
+ * The race against __run_timer() clearing the enqueued state is
+ * harmless because we're holding task_rq()->lock, therefore the timer
+ * expiring after we've done the check will wait on its task_rq_lock()
+ * and observe our state.
+ */
+ if (!hrtimer_is_queued(timer)) {
+ if (!dl_server(dl_se))
+ get_task_struct(dl_task_of(dl_se));
+ hrtimer_start(timer, act, HRTIMER_MODE_ABS_HARD);
+ }
- soft = hrtimer_get_softexpires(&dl_se->dl_timer);
- hard = hrtimer_get_expires(&dl_se->dl_timer);
- range = ktime_to_ns(ktime_sub(hard, soft));
- __hrtimer_start_range_ns(&dl_se->dl_timer, soft,
- range, HRTIMER_MODE_ABS, 0);
+ return 1;
+}
- return hrtimer_active(&dl_se->dl_timer);
+static void __push_dl_task(struct rq *rq, struct rq_flags *rf)
+{
+ /*
+ * Queueing this task back might have overloaded rq, check if we need
+ * to kick someone away.
+ */
+ if (has_pushable_dl_tasks(rq)) {
+ /*
+ * Nothing relies on rq->lock after this, so its safe to drop
+ * rq->lock.
+ */
+ rq_unpin_lock(rq, rf);
+ push_dl_task(rq);
+ rq_repin_lock(rq, rf);
+ }
+}
+
+/* a defer timer will not be reset if the runtime consumed was < dl_server_min_res */
+static const u64 dl_server_min_res = 1 * NSEC_PER_MSEC;
+
+static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_dl_entity *dl_se)
+{
+ struct rq *rq = rq_of_dl_se(dl_se);
+ u64 fw;
+
+ scoped_guard (rq_lock, rq) {
+ struct rq_flags *rf = &scope.rf;
+
+ if (!dl_se->dl_throttled || !dl_se->dl_runtime)
+ return HRTIMER_NORESTART;
+
+ sched_clock_tick();
+ update_rq_clock(rq);
+
+ /*
+ * Make sure current has propagated its pending runtime into
+ * any relevant server through calling dl_server_update() and
+ * friends.
+ */
+ rq->donor->sched_class->update_curr(rq);
+
+ if (dl_se->dl_defer_idle) {
+ dl_server_stop(dl_se);
+ return HRTIMER_NORESTART;
+ }
+
+ if (dl_se->dl_defer_armed) {
+ /*
+ * First check if the server could consume runtime in background.
+ * If so, it is possible to push the defer timer for this amount
+ * of time. The dl_server_min_res serves as a limit to avoid
+ * forwarding the timer for a too small amount of time.
+ */
+ if (dl_time_before(rq_clock(dl_se->rq),
+ (dl_se->deadline - dl_se->runtime - dl_server_min_res))) {
+
+ /* reset the defer timer */
+ fw = dl_se->deadline - rq_clock(dl_se->rq) - dl_se->runtime;
+
+ hrtimer_forward_now(timer, ns_to_ktime(fw));
+ return HRTIMER_RESTART;
+ }
+
+ dl_se->dl_defer_running = 1;
+ }
+
+ enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH);
+
+ if (!dl_task(dl_se->rq->curr) || dl_entity_preempt(dl_se, &dl_se->rq->curr->dl))
+ resched_curr(rq);
+
+ __push_dl_task(rq, rf);
+ }
+
+ return HRTIMER_NORESTART;
}
/*
@@ -556,42 +1218,39 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
struct sched_dl_entity *dl_se = container_of(timer,
struct sched_dl_entity,
dl_timer);
- struct task_struct *p = dl_task_of(dl_se);
- unsigned long flags;
+ struct task_struct *p;
+ struct rq_flags rf;
struct rq *rq;
- rq = task_rq_lock(p, &flags);
+ if (dl_server(dl_se))
+ return dl_server_timer(timer, dl_se);
+
+ p = dl_task_of(dl_se);
+ rq = task_rq_lock(p, &rf);
/*
- * We need to take care of several possible races here:
- *
- * - the task might have changed its scheduling policy
- * to something different than SCHED_DEADLINE
- * - the task might have changed its reservation parameters
- * (through sched_setattr())
- * - the task might have been boosted by someone else and
- * might be in the boosting/deboosting path
- *
- * In all this cases we bail out, as the task is already
- * in the runqueue or is going to be enqueued back anyway.
+ * The task might have changed its scheduling policy to something
+ * different than SCHED_DEADLINE (through switched_from_dl()).
*/
- if (!dl_task(p) || dl_se->dl_new ||
- dl_se->dl_boosted || !dl_se->dl_throttled)
+ if (!dl_task(p))
goto unlock;
- sched_clock_tick();
- update_rq_clock(rq);
+ /*
+ * The task might have been boosted by someone else and might be in the
+ * boosting/deboosting path, its not throttled.
+ */
+ if (is_dl_boosted(dl_se))
+ goto unlock;
-#ifdef CONFIG_SMP
/*
- * If we find that the rq the task was on is no longer
- * available, we need to select a new rq.
+ * Spurious timer due to start_dl_timer() race; or we already received
+ * a replenishment from rt_mutex_setprio().
*/
- if (unlikely(!rq->online)) {
- dl_task_offline_migration(rq, p);
+ if (!dl_se->dl_throttled)
goto unlock;
- }
-#endif
+
+ sched_clock_tick();
+ update_rq_clock(rq);
/*
* If the throttle happened during sched-out; like:
@@ -608,93 +1267,278 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
* but do not enqueue -- wait for our wakeup to do that.
*/
if (!task_on_rq_queued(p)) {
- replenish_dl_entity(dl_se, dl_se);
+ replenish_dl_entity(dl_se);
goto unlock;
}
+ if (unlikely(!rq->online)) {
+ /*
+ * If the runqueue is no longer available, migrate the
+ * task elsewhere. This necessarily changes rq.
+ */
+ lockdep_unpin_lock(__rq_lockp(rq), rf.cookie);
+ rq = dl_task_offline_migration(rq, p);
+ rf.cookie = lockdep_pin_lock(__rq_lockp(rq));
+ update_rq_clock(rq);
+
+ /*
+ * Now that the task has been migrated to the new RQ and we
+ * have that locked, proceed as normal and enqueue the task
+ * there.
+ */
+ }
+
enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
- if (dl_task(rq->curr))
- check_preempt_curr_dl(rq, p, 0);
+ if (dl_task(rq->donor))
+ wakeup_preempt_dl(rq, p, 0);
else
resched_curr(rq);
-#ifdef CONFIG_SMP
+
+ __push_dl_task(rq, &rf);
+
+unlock:
+ task_rq_unlock(rq, p, &rf);
+
/*
- * Queueing this task back might have overloaded rq,
- * check if we need to kick someone away.
+ * This can free the task_struct, including this hrtimer, do not touch
+ * anything related to that after this.
*/
- if (has_pushable_dl_tasks(rq))
- push_dl_task(rq);
-#endif
-unlock:
- task_rq_unlock(rq, p, &flags);
+ put_task_struct(p);
return HRTIMER_NORESTART;
}
-void init_dl_task_timer(struct sched_dl_entity *dl_se)
+static void init_dl_task_timer(struct sched_dl_entity *dl_se)
{
struct hrtimer *timer = &dl_se->dl_timer;
- hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- timer->function = dl_task_timer;
+ hrtimer_setup(timer, dl_task_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
+}
+
+/*
+ * During the activation, CBS checks if it can reuse the current task's
+ * runtime and period. If the deadline of the task is in the past, CBS
+ * cannot use the runtime, and so it replenishes the task. This rule
+ * works fine for implicit deadline tasks (deadline == period), and the
+ * CBS was designed for implicit deadline tasks. However, a task with
+ * constrained deadline (deadline < period) might be awakened after the
+ * deadline, but before the next period. In this case, replenishing the
+ * task would allow it to run for runtime / deadline. As in this case
+ * deadline < period, CBS enables a task to run for more than the
+ * runtime / period. In a very loaded system, this can cause a domino
+ * effect, making other tasks miss their deadlines.
+ *
+ * To avoid this problem, in the activation of a constrained deadline
+ * task after the deadline but before the next period, throttle the
+ * task and set the replenishing timer to the begin of the next period,
+ * unless it is boosted.
+ */
+static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se)
+{
+ struct rq *rq = rq_of_dl_se(dl_se);
+
+ if (dl_time_before(dl_se->deadline, rq_clock(rq)) &&
+ dl_time_before(rq_clock(rq), dl_next_period(dl_se))) {
+ if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(dl_se)))
+ return;
+ dl_se->dl_throttled = 1;
+ if (dl_se->runtime > 0)
+ dl_se->runtime = 0;
+ }
}
static
-int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se)
+int dl_runtime_exceeded(struct sched_dl_entity *dl_se)
{
return (dl_se->runtime <= 0);
}
-extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
-
/*
- * Update the current task's runtime statistics (provided it is still
- * a -deadline task and has not been removed from the dl_rq).
+ * This function implements the GRUB accounting rule. According to the
+ * GRUB reclaiming algorithm, the runtime is not decreased as "dq = -dt",
+ * but as "dq = -(max{u, (Umax - Uinact - Uextra)} / Umax) dt",
+ * where u is the utilization of the task, Umax is the maximum reclaimable
+ * utilization, Uinact is the (per-runqueue) inactive utilization, computed
+ * as the difference between the "total runqueue utilization" and the
+ * "runqueue active utilization", and Uextra is the (per runqueue) extra
+ * reclaimable utilization.
+ * Since rq->dl.running_bw and rq->dl.this_bw contain utilizations multiplied
+ * by 2^BW_SHIFT, the result has to be shifted right by BW_SHIFT.
+ * Since rq->dl.bw_ratio contains 1 / Umax multiplied by 2^RATIO_SHIFT, dl_bw
+ * is multiplied by rq->dl.bw_ratio and shifted right by RATIO_SHIFT.
+ * Since delta is a 64 bit variable, to have an overflow its value should be
+ * larger than 2^(64 - 20 - 8), which is more than 64 seconds. So, overflow is
+ * not an issue here.
*/
-static void update_curr_dl(struct rq *rq)
+static u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se)
{
- struct task_struct *curr = rq->curr;
- struct sched_dl_entity *dl_se = &curr->dl;
- u64 delta_exec;
+ u64 u_act;
+ u64 u_inact = rq->dl.this_bw - rq->dl.running_bw; /* Utot - Uact */
- if (!dl_task(curr) || !on_dl_rq(dl_se))
- return;
+ /*
+ * Instead of computing max{u, (u_max - u_inact - u_extra)}, we
+ * compare u_inact + u_extra with u_max - u, because u_inact + u_extra
+ * can be larger than u_max. So, u_max - u_inact - u_extra would be
+ * negative leading to wrong results.
+ */
+ if (u_inact + rq->dl.extra_bw > rq->dl.max_bw - dl_se->dl_bw)
+ u_act = dl_se->dl_bw;
+ else
+ u_act = rq->dl.max_bw - u_inact - rq->dl.extra_bw;
+
+ u_act = (u_act * rq->dl.bw_ratio) >> RATIO_SHIFT;
+ return (delta * u_act) >> BW_SHIFT;
+}
+
+s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec)
+{
+ s64 scaled_delta_exec;
/*
- * Consumed budget is computed considering the time as
- * observed by schedulable tasks (excluding time spent
- * in hardirq context, etc.). Deadlines are instead
- * computed using hard walltime. This seems to be the more
- * natural solution, but the full ramifications of this
- * approach need further study.
+ * For tasks that participate in GRUB, we implement GRUB-PA: the
+ * spare reclaimed bandwidth is used to clock down frequency.
+ *
+ * For the others, we still need to scale reservation parameters
+ * according to current frequency and CPU maximum capacity.
*/
- delta_exec = rq_clock_task(rq) - curr->se.exec_start;
- if (unlikely((s64)delta_exec <= 0))
+ if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM)) {
+ scaled_delta_exec = grub_reclaim(delta_exec, rq, dl_se);
+ } else {
+ int cpu = cpu_of(rq);
+ unsigned long scale_freq = arch_scale_freq_capacity(cpu);
+ unsigned long scale_cpu = arch_scale_cpu_capacity(cpu);
+
+ scaled_delta_exec = cap_scale(delta_exec, scale_freq);
+ scaled_delta_exec = cap_scale(scaled_delta_exec, scale_cpu);
+ }
+
+ return scaled_delta_exec;
+}
+
+static inline void
+update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, int flags);
+
+static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec)
+{
+ bool idle = rq->curr == rq->idle;
+ s64 scaled_delta_exec;
+
+ if (unlikely(delta_exec <= 0)) {
+ if (unlikely(dl_se->dl_yielded))
+ goto throttle;
return;
+ }
+
+ if (dl_server(dl_se) && dl_se->dl_throttled && !dl_se->dl_defer)
+ return;
+
+ if (dl_entity_is_special(dl_se))
+ return;
+
+ scaled_delta_exec = delta_exec;
+ if (!dl_server(dl_se))
+ scaled_delta_exec = dl_scaled_delta_exec(rq, dl_se, delta_exec);
+
+ dl_se->runtime -= scaled_delta_exec;
+
+ if (dl_se->dl_defer_idle && !idle)
+ dl_se->dl_defer_idle = 0;
+
+ /*
+ * The fair server can consume its runtime while throttled (not queued/
+ * running as regular CFS).
+ *
+ * If the server consumes its entire runtime in this state. The server
+ * is not required for the current period. Thus, reset the server by
+ * starting a new period, pushing the activation.
+ */
+ if (dl_se->dl_defer && dl_se->dl_throttled && dl_runtime_exceeded(dl_se)) {
+ /*
+ * Non-servers would never get time accounted while throttled.
+ */
+ WARN_ON_ONCE(!dl_server(dl_se));
+
+ /*
+ * While the server is marked idle, do not push out the
+ * activation further, instead wait for the period timer
+ * to lapse and stop the server.
+ */
+ if (dl_se->dl_defer_idle && idle) {
+ /*
+ * The timer is at the zero-laxity point, this means
+ * dl_server_stop() / dl_server_start() can happen
+ * while now < deadline. This means update_dl_entity()
+ * will not replenish. Additionally start_dl_timer()
+ * will be set for 'deadline - runtime'. Negative
+ * runtime will not do.
+ */
+ dl_se->runtime = 0;
+ return;
+ }
+
+ /*
+ * If the server was previously activated - the starving condition
+ * took place, it this point it went away because the fair scheduler
+ * was able to get runtime in background. So return to the initial
+ * state.
+ */
+ dl_se->dl_defer_running = 0;
- schedstat_set(curr->se.statistics.exec_max,
- max(curr->se.statistics.exec_max, delta_exec));
+ hrtimer_try_to_cancel(&dl_se->dl_timer);
- curr->se.sum_exec_runtime += delta_exec;
- account_group_exec_runtime(curr, delta_exec);
+ replenish_dl_new_period(dl_se, dl_se->rq);
- curr->se.exec_start = rq_clock_task(rq);
- cpuacct_charge(curr, delta_exec);
+ if (idle)
+ dl_se->dl_defer_idle = 1;
- sched_rt_avg_update(rq, delta_exec);
+ /*
+ * Not being able to start the timer seems problematic. If it could not
+ * be started for whatever reason, we need to "unthrottle" the DL server
+ * and queue right away. Otherwise nothing might queue it. That's similar
+ * to what enqueue_dl_entity() does on start_dl_timer==0. For now, just warn.
+ */
+ WARN_ON_ONCE(!start_dl_timer(dl_se));
+
+ return;
+ }
- dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec;
- if (dl_runtime_exceeded(rq, dl_se)) {
+throttle:
+ if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) {
dl_se->dl_throttled = 1;
- __dequeue_task_dl(rq, curr, 0);
- if (unlikely(!start_dl_timer(dl_se, curr->dl.dl_boosted)))
- enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
- if (!is_leftmost(curr, &rq->dl))
+ /* If requested, inform the user about runtime overruns. */
+ if (dl_runtime_exceeded(dl_se) &&
+ (dl_se->flags & SCHED_FLAG_DL_OVERRUN))
+ dl_se->dl_overrun = 1;
+
+ dequeue_dl_entity(dl_se, 0);
+ if (!dl_server(dl_se)) {
+ update_stats_dequeue_dl(&rq->dl, dl_se, 0);
+ dequeue_pushable_dl_task(rq, dl_task_of(dl_se));
+ }
+
+ if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(dl_se))) {
+ if (dl_server(dl_se)) {
+ replenish_dl_new_period(dl_se, rq);
+ start_dl_timer(dl_se);
+ } else {
+ enqueue_task_dl(rq, dl_task_of(dl_se), ENQUEUE_REPLENISH);
+ }
+ }
+
+ if (!is_leftmost(dl_se, &rq->dl))
resched_curr(rq);
}
/*
+ * The fair server (sole dl_server) does not account for real-time
+ * workload because it is running fair work.
+ */
+ if (dl_se == &rq->fair_server)
+ return;
+
+#ifdef CONFIG_RT_GROUP_SCHED
+ /*
* Because -- for now -- we share the rt bandwidth, we need to
* account our runtime there too, otherwise actual rt tasks
* would be able to exceed the shared quota.
@@ -718,46 +1562,452 @@ static void update_curr_dl(struct rq *rq)
rt_rq->rt_time += delta_exec;
raw_spin_unlock(&rt_rq->rt_runtime_lock);
}
+#endif /* CONFIG_RT_GROUP_SCHED */
+}
+
+/*
+ * In the non-defer mode, the idle time is not accounted, as the
+ * server provides a guarantee.
+ *
+ * If the dl_server is in defer mode, the idle time is also considered
+ * as time available for the fair server, avoiding a penalty for the
+ * rt scheduler that did not consumed that time.
+ */
+void dl_server_update_idle(struct sched_dl_entity *dl_se, s64 delta_exec)
+{
+ if (dl_se->dl_server_active && dl_se->dl_runtime && dl_se->dl_defer)
+ update_curr_dl_se(dl_se->rq, dl_se, delta_exec);
+}
+
+void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec)
+{
+ /* 0 runtime = fair server disabled */
+ if (dl_se->dl_server_active && dl_se->dl_runtime)
+ update_curr_dl_se(dl_se->rq, dl_se, delta_exec);
+}
+
+/*
+ * dl_server && dl_defer:
+ *
+ * 6
+ * +--------------------+
+ * v |
+ * +-------------+ 4 +-----------+ 5 +------------------+
+ * +-> | A:init | <--- | D:running | -----> | E:replenish-wait |
+ * | +-------------+ +-----------+ +------------------+
+ * | | | 1 ^ ^ |
+ * | | 1 +----------+ | 3 |
+ * | v | |
+ * | +--------------------------------+ 2 |
+ * | | | ----+ |
+ * | 8 | B:zero_laxity-wait | | |
+ * | | | <---+ |
+ * | +--------------------------------+ |
+ * | | ^ ^ 2 |
+ * | | 7 | 2 +--------------------+
+ * | v |
+ * | +-------------+ |
+ * +-- | C:idle-wait | -+
+ * +-------------+
+ * ^ 7 |
+ * +---------+
+ *
+ *
+ * [A] - init
+ * dl_server_active = 0
+ * dl_throttled = 0
+ * dl_defer_armed = 0
+ * dl_defer_running = 0/1
+ * dl_defer_idle = 0
+ *
+ * [B] - zero_laxity-wait
+ * dl_server_active = 1
+ * dl_throttled = 1
+ * dl_defer_armed = 1
+ * dl_defer_running = 0
+ * dl_defer_idle = 0
+ *
+ * [C] - idle-wait
+ * dl_server_active = 1
+ * dl_throttled = 1
+ * dl_defer_armed = 1
+ * dl_defer_running = 0
+ * dl_defer_idle = 1
+ *
+ * [D] - running
+ * dl_server_active = 1
+ * dl_throttled = 0
+ * dl_defer_armed = 0
+ * dl_defer_running = 1
+ * dl_defer_idle = 0
+ *
+ * [E] - replenish-wait
+ * dl_server_active = 1
+ * dl_throttled = 1
+ * dl_defer_armed = 0
+ * dl_defer_running = 1
+ * dl_defer_idle = 0
+ *
+ *
+ * [1] A->B, A->D
+ * dl_server_start()
+ * dl_server_active = 1;
+ * enqueue_dl_entity()
+ * update_dl_entity(WAKEUP)
+ * if (!dl_defer_running)
+ * dl_defer_armed = 1;
+ * dl_throttled = 1;
+ * if (dl_throttled && start_dl_timer())
+ * return; // [B]
+ * __enqueue_dl_entity();
+ * // [D]
+ *
+ * // deplete server runtime from client-class
+ * [2] B->B, C->B, E->B
+ * dl_server_update()
+ * update_curr_dl_se() // idle = false
+ * if (dl_defer_idle)
+ * dl_defer_idle = 0;
+ * if (dl_defer && dl_throttled && dl_runtime_exceeded())
+ * dl_defer_running = 0;
+ * hrtimer_try_to_cancel(); // stop timer
+ * replenish_dl_new_period()
+ * // fwd period
+ * dl_throttled = 1;
+ * dl_defer_armed = 1;
+ * start_dl_timer(); // restart timer
+ * // [B]
+ *
+ * // timer actually fires means we have runtime
+ * [3] B->D
+ * dl_server_timer()
+ * if (dl_defer_armed)
+ * dl_defer_running = 1;
+ * enqueue_dl_entity(REPLENISH)
+ * replenish_dl_entity()
+ * // fwd period
+ * if (dl_throttled)
+ * dl_throttled = 0;
+ * if (dl_defer_armed)
+ * dl_defer_armed = 0;
+ * __enqueue_dl_entity();
+ * // [D]
+ *
+ * // schedule server
+ * [4] D->A
+ * pick_task_dl()
+ * p = server_pick_task();
+ * if (!p)
+ * dl_server_stop()
+ * dequeue_dl_entity();
+ * hrtimer_try_to_cancel();
+ * dl_defer_armed = 0;
+ * dl_throttled = 0;
+ * dl_server_active = 0;
+ * // [A]
+ * return p;
+ *
+ * // server running
+ * [5] D->E
+ * update_curr_dl_se()
+ * if (dl_runtime_exceeded())
+ * dl_throttled = 1;
+ * dequeue_dl_entity();
+ * start_dl_timer();
+ * // [E]
+ *
+ * // server replenished
+ * [6] E->D
+ * dl_server_timer()
+ * enqueue_dl_entity(REPLENISH)
+ * replenish_dl_entity()
+ * fwd-period
+ * if (dl_throttled)
+ * dl_throttled = 0;
+ * __enqueue_dl_entity();
+ * // [D]
+ *
+ * // deplete server runtime from idle
+ * [7] B->C, C->C
+ * dl_server_update_idle()
+ * update_curr_dl_se() // idle = true
+ * if (dl_defer && dl_throttled && dl_runtime_exceeded())
+ * if (dl_defer_idle)
+ * return;
+ * dl_defer_running = 0;
+ * hrtimer_try_to_cancel();
+ * replenish_dl_new_period()
+ * // fwd period
+ * dl_throttled = 1;
+ * dl_defer_armed = 1;
+ * dl_defer_idle = 1;
+ * start_dl_timer(); // restart timer
+ * // [C]
+ *
+ * // stop idle server
+ * [8] C->A
+ * dl_server_timer()
+ * if (dl_defer_idle)
+ * dl_server_stop();
+ * // [A]
+ *
+ *
+ * digraph dl_server {
+ * "A:init" -> "B:zero_laxity-wait" [label="1:dl_server_start"]
+ * "A:init" -> "D:running" [label="1:dl_server_start"]
+ * "B:zero_laxity-wait" -> "B:zero_laxity-wait" [label="2:dl_server_update"]
+ * "B:zero_laxity-wait" -> "C:idle-wait" [label="7:dl_server_update_idle"]
+ * "B:zero_laxity-wait" -> "D:running" [label="3:dl_server_timer"]
+ * "C:idle-wait" -> "A:init" [label="8:dl_server_timer"]
+ * "C:idle-wait" -> "B:zero_laxity-wait" [label="2:dl_server_update"]
+ * "C:idle-wait" -> "C:idle-wait" [label="7:dl_server_update_idle"]
+ * "D:running" -> "A:init" [label="4:pick_task_dl"]
+ * "D:running" -> "E:replenish-wait" [label="5:update_curr_dl_se"]
+ * "E:replenish-wait" -> "B:zero_laxity-wait" [label="2:dl_server_update"]
+ * "E:replenish-wait" -> "D:running" [label="6:dl_server_timer"]
+ * }
+ *
+ *
+ * Notes:
+ *
+ * - When there are fair tasks running the most likely loop is [2]->[2].
+ * the dl_server never actually runs, the timer never fires.
+ *
+ * - When there is actual fair starvation; the timer fires and starts the
+ * dl_server. This will then throttle and replenish like a normal DL
+ * task. Notably it will not 'defer' again.
+ *
+ * - When idle it will push the actication forward once, and then wait
+ * for the timer to hit or a non-idle update to restart things.
+ */
+void dl_server_start(struct sched_dl_entity *dl_se)
+{
+ struct rq *rq = dl_se->rq;
+
+ if (!dl_server(dl_se) || dl_se->dl_server_active)
+ return;
+
+ /*
+ * Update the current task to 'now'.
+ */
+ rq->donor->sched_class->update_curr(rq);
+
+ if (WARN_ON_ONCE(!cpu_online(cpu_of(rq))))
+ return;
+
+ dl_se->dl_server_active = 1;
+ enqueue_dl_entity(dl_se, ENQUEUE_WAKEUP);
+ if (!dl_task(dl_se->rq->curr) || dl_entity_preempt(dl_se, &rq->curr->dl))
+ resched_curr(dl_se->rq);
+}
+
+void dl_server_stop(struct sched_dl_entity *dl_se)
+{
+ if (!dl_server(dl_se) || !dl_server_active(dl_se))
+ return;
+
+ dequeue_dl_entity(dl_se, DEQUEUE_SLEEP);
+ hrtimer_try_to_cancel(&dl_se->dl_timer);
+ dl_se->dl_defer_armed = 0;
+ dl_se->dl_throttled = 0;
+ dl_se->dl_defer_idle = 0;
+ dl_se->dl_server_active = 0;
+}
+
+void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
+ dl_server_pick_f pick_task)
+{
+ dl_se->rq = rq;
+ dl_se->server_pick_task = pick_task;
+}
+
+void sched_init_dl_servers(void)
+{
+ int cpu;
+ struct rq *rq;
+ struct sched_dl_entity *dl_se;
+
+ for_each_online_cpu(cpu) {
+ u64 runtime = 50 * NSEC_PER_MSEC;
+ u64 period = 1000 * NSEC_PER_MSEC;
+
+ rq = cpu_rq(cpu);
+
+ guard(rq_lock_irq)(rq);
+
+ dl_se = &rq->fair_server;
+
+ WARN_ON(dl_server(dl_se));
+
+ dl_server_apply_params(dl_se, runtime, period, 1);
+
+ dl_se->dl_server = 1;
+ dl_se->dl_defer = 1;
+ setup_new_dl_entity(dl_se);
+ }
}
-#ifdef CONFIG_SMP
+void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq)
+{
+ u64 new_bw = dl_se->dl_bw;
+ int cpu = cpu_of(rq);
+ struct dl_bw *dl_b;
-static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu);
+ dl_b = dl_bw_of(cpu_of(rq));
+ guard(raw_spinlock)(&dl_b->lock);
-static inline u64 next_deadline(struct rq *rq)
+ if (!dl_bw_cpus(cpu))
+ return;
+
+ __dl_add(dl_b, new_bw, dl_bw_cpus(cpu));
+}
+
+int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 period, bool init)
{
- struct task_struct *next = pick_next_earliest_dl_task(rq, rq->cpu);
+ u64 old_bw = init ? 0 : to_ratio(dl_se->dl_period, dl_se->dl_runtime);
+ u64 new_bw = to_ratio(period, runtime);
+ struct rq *rq = dl_se->rq;
+ int cpu = cpu_of(rq);
+ struct dl_bw *dl_b;
+ unsigned long cap;
+ int retval = 0;
+ int cpus;
- if (next && dl_prio(next->prio))
- return next->dl.deadline;
- else
- return 0;
+ dl_b = dl_bw_of(cpu);
+ guard(raw_spinlock)(&dl_b->lock);
+
+ cpus = dl_bw_cpus(cpu);
+ cap = dl_bw_capacity(cpu);
+
+ if (__dl_overflow(dl_b, cap, old_bw, new_bw))
+ return -EBUSY;
+
+ if (init) {
+ __add_rq_bw(new_bw, &rq->dl);
+ __dl_add(dl_b, new_bw, cpus);
+ } else {
+ __dl_sub(dl_b, dl_se->dl_bw, cpus);
+ __dl_add(dl_b, new_bw, cpus);
+
+ dl_rq_change_utilization(rq, dl_se, new_bw);
+ }
+
+ dl_se->dl_runtime = runtime;
+ dl_se->dl_deadline = period;
+ dl_se->dl_period = period;
+
+ dl_se->runtime = 0;
+ dl_se->deadline = 0;
+
+ dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
+ dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime);
+
+ return retval;
}
+/*
+ * Update the current task's runtime statistics (provided it is still
+ * a -deadline task and has not been removed from the dl_rq).
+ */
+static void update_curr_dl(struct rq *rq)
+{
+ struct task_struct *donor = rq->donor;
+ struct sched_dl_entity *dl_se = &donor->dl;
+ s64 delta_exec;
+
+ if (!dl_task(donor) || !on_dl_rq(dl_se))
+ return;
+
+ /*
+ * Consumed budget is computed considering the time as
+ * observed by schedulable tasks (excluding time spent
+ * in hardirq context, etc.). Deadlines are instead
+ * computed using hard walltime. This seems to be the more
+ * natural solution, but the full ramifications of this
+ * approach need further study.
+ */
+ delta_exec = update_curr_common(rq);
+ update_curr_dl_se(rq, dl_se, delta_exec);
+}
+
+static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
+{
+ struct sched_dl_entity *dl_se = container_of(timer,
+ struct sched_dl_entity,
+ inactive_timer);
+ struct task_struct *p = NULL;
+ struct rq_flags rf;
+ struct rq *rq;
+
+ if (!dl_server(dl_se)) {
+ p = dl_task_of(dl_se);
+ rq = task_rq_lock(p, &rf);
+ } else {
+ rq = dl_se->rq;
+ rq_lock(rq, &rf);
+ }
+
+ sched_clock_tick();
+ update_rq_clock(rq);
+
+ if (dl_server(dl_se))
+ goto no_task;
+
+ if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) {
+ struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
+
+ if (READ_ONCE(p->__state) == TASK_DEAD && dl_se->dl_non_contending) {
+ sub_running_bw(&p->dl, dl_rq_of_se(&p->dl));
+ sub_rq_bw(&p->dl, dl_rq_of_se(&p->dl));
+ dl_se->dl_non_contending = 0;
+ }
+
+ raw_spin_lock(&dl_b->lock);
+ __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
+ raw_spin_unlock(&dl_b->lock);
+ __dl_clear_params(dl_se);
+
+ goto unlock;
+ }
+
+no_task:
+ if (dl_se->dl_non_contending == 0)
+ goto unlock;
+
+ sub_running_bw(dl_se, &rq->dl);
+ dl_se->dl_non_contending = 0;
+unlock:
+
+ if (!dl_server(dl_se)) {
+ task_rq_unlock(rq, p, &rf);
+ put_task_struct(p);
+ } else {
+ rq_unlock(rq, &rf);
+ }
+
+ return HRTIMER_NORESTART;
+}
+
+static void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se)
+{
+ struct hrtimer *timer = &dl_se->inactive_timer;
+
+ hrtimer_setup(timer, inactive_task_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
+}
+
+#define __node_2_dle(node) \
+ rb_entry((node), struct sched_dl_entity, rb_node)
+
static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
{
struct rq *rq = rq_of_dl_rq(dl_rq);
if (dl_rq->earliest_dl.curr == 0 ||
dl_time_before(deadline, dl_rq->earliest_dl.curr)) {
- /*
- * If the dl_rq had no -deadline tasks, or if the new task
- * has shorter deadline than the current one on dl_rq, we
- * know that the previous earliest becomes our next earliest,
- * as the new task becomes the earliest itself.
- */
- dl_rq->earliest_dl.next = dl_rq->earliest_dl.curr;
+ if (dl_rq->earliest_dl.curr == 0)
+ cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_HIGHER);
dl_rq->earliest_dl.curr = deadline;
- cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1);
- } else if (dl_rq->earliest_dl.next == 0 ||
- dl_time_before(deadline, dl_rq->earliest_dl.next)) {
- /*
- * On the other hand, if the new -deadline task has a
- * a later deadline than the earliest one on dl_rq, but
- * it is earlier than the next (if any), we must
- * recompute the next-earliest.
- */
- dl_rq->earliest_dl.next = next_deadline(rq);
+ cpudl_set(&rq->rd->cpudl, rq->cpu, deadline);
}
}
@@ -772,79 +2022,124 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
if (!dl_rq->dl_nr_running) {
dl_rq->earliest_dl.curr = 0;
dl_rq->earliest_dl.next = 0;
- cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0);
+ cpudl_clear(&rq->rd->cpudl, rq->cpu, rq->online);
+ cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
} else {
- struct rb_node *leftmost = dl_rq->rb_leftmost;
- struct sched_dl_entity *entry;
+ struct rb_node *leftmost = rb_first_cached(&dl_rq->root);
+ struct sched_dl_entity *entry = __node_2_dle(leftmost);
- entry = rb_entry(leftmost, struct sched_dl_entity, rb_node);
dl_rq->earliest_dl.curr = entry->deadline;
- dl_rq->earliest_dl.next = next_deadline(rq);
- cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline, 1);
+ cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline);
}
}
-#else
-
-static inline void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {}
-static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {}
-
-#endif /* CONFIG_SMP */
-
static inline
void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
{
- int prio = dl_task_of(dl_se)->prio;
u64 deadline = dl_se->deadline;
- WARN_ON(!dl_prio(prio));
dl_rq->dl_nr_running++;
- add_nr_running(rq_of_dl_rq(dl_rq), 1);
+
+ if (!dl_server(dl_se))
+ add_nr_running(rq_of_dl_rq(dl_rq), 1);
inc_dl_deadline(dl_rq, deadline);
- inc_dl_migration(dl_se, dl_rq);
}
static inline
void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
{
- int prio = dl_task_of(dl_se)->prio;
-
- WARN_ON(!dl_prio(prio));
WARN_ON(!dl_rq->dl_nr_running);
dl_rq->dl_nr_running--;
- sub_nr_running(rq_of_dl_rq(dl_rq), 1);
+
+ if (!dl_server(dl_se))
+ sub_nr_running(rq_of_dl_rq(dl_rq), 1);
dec_dl_deadline(dl_rq, dl_se->deadline);
- dec_dl_migration(dl_se, dl_rq);
+}
+
+static inline bool __dl_less(struct rb_node *a, const struct rb_node *b)
+{
+ return dl_time_before(__node_2_dle(a)->deadline, __node_2_dle(b)->deadline);
+}
+
+static __always_inline struct sched_statistics *
+__schedstats_from_dl_se(struct sched_dl_entity *dl_se)
+{
+ if (!schedstat_enabled())
+ return NULL;
+
+ if (dl_server(dl_se))
+ return NULL;
+
+ return &dl_task_of(dl_se)->stats;
+}
+
+static inline void
+update_stats_wait_start_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se)
+{
+ struct sched_statistics *stats = __schedstats_from_dl_se(dl_se);
+ if (stats)
+ __update_stats_wait_start(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats);
+}
+
+static inline void
+update_stats_wait_end_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se)
+{
+ struct sched_statistics *stats = __schedstats_from_dl_se(dl_se);
+ if (stats)
+ __update_stats_wait_end(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats);
+}
+
+static inline void
+update_stats_enqueue_sleeper_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se)
+{
+ struct sched_statistics *stats = __schedstats_from_dl_se(dl_se);
+ if (stats)
+ __update_stats_enqueue_sleeper(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats);
+}
+
+static inline void
+update_stats_enqueue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se,
+ int flags)
+{
+ if (!schedstat_enabled())
+ return;
+
+ if (flags & ENQUEUE_WAKEUP)
+ update_stats_enqueue_sleeper_dl(dl_rq, dl_se);
+}
+
+static inline void
+update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se,
+ int flags)
+{
+ struct task_struct *p = dl_task_of(dl_se);
+
+ if (!schedstat_enabled())
+ return;
+
+ if ((flags & DEQUEUE_SLEEP)) {
+ unsigned int state;
+
+ state = READ_ONCE(p->__state);
+ if (state & TASK_INTERRUPTIBLE)
+ __schedstat_set(p->stats.sleep_start,
+ rq_clock(rq_of_dl_rq(dl_rq)));
+
+ if (state & TASK_UNINTERRUPTIBLE)
+ __schedstat_set(p->stats.block_start,
+ rq_clock(rq_of_dl_rq(dl_rq)));
+ }
}
static void __enqueue_dl_entity(struct sched_dl_entity *dl_se)
{
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
- struct rb_node **link = &dl_rq->rb_root.rb_node;
- struct rb_node *parent = NULL;
- struct sched_dl_entity *entry;
- int leftmost = 1;
-
- BUG_ON(!RB_EMPTY_NODE(&dl_se->rb_node));
-
- while (*link) {
- parent = *link;
- entry = rb_entry(parent, struct sched_dl_entity, rb_node);
- if (dl_time_before(dl_se->deadline, entry->deadline))
- link = &parent->rb_left;
- else {
- link = &parent->rb_right;
- leftmost = 0;
- }
- }
- if (leftmost)
- dl_rq->rb_leftmost = &dl_se->rb_node;
+ WARN_ON_ONCE(!RB_EMPTY_NODE(&dl_se->rb_node));
- rb_link_node(&dl_se->rb_node, parent, link);
- rb_insert_color(&dl_se->rb_node, &dl_rq->rb_root);
+ rb_add_cached(&dl_se->rb_node, &dl_rq->root, __dl_less);
inc_dl_tasks(dl_se, dl_rq);
}
@@ -856,93 +2151,189 @@ static void __dequeue_dl_entity(struct sched_dl_entity *dl_se)
if (RB_EMPTY_NODE(&dl_se->rb_node))
return;
- if (dl_rq->rb_leftmost == &dl_se->rb_node) {
- struct rb_node *next_node;
-
- next_node = rb_next(&dl_se->rb_node);
- dl_rq->rb_leftmost = next_node;
- }
+ rb_erase_cached(&dl_se->rb_node, &dl_rq->root);
- rb_erase(&dl_se->rb_node, &dl_rq->rb_root);
RB_CLEAR_NODE(&dl_se->rb_node);
dec_dl_tasks(dl_se, dl_rq);
}
static void
-enqueue_dl_entity(struct sched_dl_entity *dl_se,
- struct sched_dl_entity *pi_se, int flags)
+enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags)
{
- BUG_ON(on_dl_rq(dl_se));
+ WARN_ON_ONCE(on_dl_rq(dl_se));
+
+ update_stats_enqueue_dl(dl_rq_of_se(dl_se), dl_se, flags);
+
+ /*
+ * Check if a constrained deadline task was activated
+ * after the deadline but before the next period.
+ * If that is the case, the task will be throttled and
+ * the replenishment timer will be set to the next period.
+ */
+ if (!dl_se->dl_throttled && !dl_is_implicit(dl_se))
+ dl_check_constrained_dl(dl_se);
+
+ if (flags & (ENQUEUE_RESTORE|ENQUEUE_MIGRATING)) {
+ struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+
+ add_rq_bw(dl_se, dl_rq);
+ add_running_bw(dl_se, dl_rq);
+ }
+
+ /*
+ * If p is throttled, we do not enqueue it. In fact, if it exhausted
+ * its budget it needs a replenishment and, since it now is on
+ * its rq, the bandwidth timer callback (which clearly has not
+ * run yet) will take care of this.
+ * However, the active utilization does not depend on the fact
+ * that the task is on the runqueue or not (but depends on the
+ * task's state - in GRUB parlance, "inactive" vs "active contending").
+ * In other words, even if a task is throttled its utilization must
+ * be counted in the active utilization; hence, we need to call
+ * add_running_bw().
+ */
+ if (!dl_se->dl_defer && dl_se->dl_throttled && !(flags & ENQUEUE_REPLENISH)) {
+ if (flags & ENQUEUE_WAKEUP)
+ task_contending(dl_se, flags);
+
+ return;
+ }
/*
* If this is a wakeup or a new instance, the scheduling
* parameters of the task might need updating. Otherwise,
* we want a replenishment of its runtime.
*/
- if (dl_se->dl_new || flags & ENQUEUE_WAKEUP)
- update_dl_entity(dl_se, pi_se);
- else if (flags & ENQUEUE_REPLENISH)
- replenish_dl_entity(dl_se, pi_se);
+ if (flags & ENQUEUE_WAKEUP) {
+ task_contending(dl_se, flags);
+ update_dl_entity(dl_se);
+ } else if (flags & ENQUEUE_REPLENISH) {
+ replenish_dl_entity(dl_se);
+ } else if ((flags & ENQUEUE_RESTORE) &&
+ !is_dl_boosted(dl_se) &&
+ dl_time_before(dl_se->deadline, rq_clock(rq_of_dl_se(dl_se)))) {
+ setup_new_dl_entity(dl_se);
+ }
+
+ /*
+ * If the reservation is still throttled, e.g., it got replenished but is a
+ * deferred task and still got to wait, don't enqueue.
+ */
+ if (dl_se->dl_throttled && start_dl_timer(dl_se))
+ return;
+
+ /*
+ * We're about to enqueue, make sure we're not ->dl_throttled!
+ * In case the timer was not started, say because the defer time
+ * has passed, mark as not throttled and mark unarmed.
+ * Also cancel earlier timers, since letting those run is pointless.
+ */
+ if (dl_se->dl_throttled) {
+ hrtimer_try_to_cancel(&dl_se->dl_timer);
+ dl_se->dl_defer_armed = 0;
+ dl_se->dl_throttled = 0;
+ }
__enqueue_dl_entity(dl_se);
}
-static void dequeue_dl_entity(struct sched_dl_entity *dl_se)
+static void dequeue_dl_entity(struct sched_dl_entity *dl_se, int flags)
{
__dequeue_dl_entity(dl_se);
-}
-static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
-{
- struct task_struct *pi_task = rt_mutex_get_top_task(p);
- struct sched_dl_entity *pi_se = &p->dl;
+ if (flags & (DEQUEUE_SAVE|DEQUEUE_MIGRATING)) {
+ struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+
+ sub_running_bw(dl_se, dl_rq);
+ sub_rq_bw(dl_se, dl_rq);
+ }
/*
- * Use the scheduling parameters of the top pi-waiter
- * task if we have one and its (relative) deadline is
- * smaller than our one... OTW we keep our runtime and
- * deadline.
+ * This check allows to start the inactive timer (or to immediately
+ * decrease the active utilization, if needed) in two cases:
+ * when the task blocks and when it is terminating
+ * (p->state == TASK_DEAD). We can handle the two cases in the same
+ * way, because from GRUB's point of view the same thing is happening
+ * (the task moves from "active contending" to "active non contending"
+ * or "inactive")
*/
- if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) {
- pi_se = &pi_task->dl;
+ if (flags & DEQUEUE_SLEEP)
+ task_non_contending(dl_se, true);
+}
+
+static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
+{
+ if (is_dl_boosted(&p->dl)) {
+ /*
+ * Because of delays in the detection of the overrun of a
+ * thread's runtime, it might be the case that a thread
+ * goes to sleep in a rt mutex with negative runtime. As
+ * a consequence, the thread will be throttled.
+ *
+ * While waiting for the mutex, this thread can also be
+ * boosted via PI, resulting in a thread that is throttled
+ * and boosted at the same time.
+ *
+ * In this case, the boost overrides the throttle.
+ */
+ if (p->dl.dl_throttled) {
+ /*
+ * The replenish timer needs to be canceled. No
+ * problem if it fires concurrently: boosted threads
+ * are ignored in dl_task_timer().
+ */
+ cancel_replenish_timer(&p->dl);
+ p->dl.dl_throttled = 0;
+ }
} else if (!dl_prio(p->normal_prio)) {
/*
- * Special case in which we have a !SCHED_DEADLINE task
- * that is going to be deboosted, but exceedes its
- * runtime while doing so. No point in replenishing
- * it, as it's going to return back to its original
- * scheduling class after this.
+ * Special case in which we have a !SCHED_DEADLINE task that is going
+ * to be deboosted, but exceeds its runtime while doing so. No point in
+ * replenishing it, as it's going to return back to its original
+ * scheduling class after this. If it has been throttled, we need to
+ * clear the flag, otherwise the task may wake up as throttled after
+ * being boosted again with no means to replenish the runtime and clear
+ * the throttle.
*/
- BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH);
+ p->dl.dl_throttled = 0;
+ if (!(flags & ENQUEUE_REPLENISH))
+ printk_deferred_once("sched: DL de-boosted task PID %d: REPLENISH flag missing\n",
+ task_pid_nr(p));
+
return;
}
- /*
- * If p is throttled, we do nothing. In fact, if it exhausted
- * its budget it needs a replenishment and, since it now is on
- * its rq, the bandwidth timer callback (which clearly has not
- * run yet) will take care of this.
- */
- if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH))
+ check_schedstat_required();
+ update_stats_wait_start_dl(dl_rq_of_se(&p->dl), &p->dl);
+
+ if (p->on_rq == TASK_ON_RQ_MIGRATING)
+ flags |= ENQUEUE_MIGRATING;
+
+ enqueue_dl_entity(&p->dl, flags);
+
+ if (dl_server(&p->dl))
return;
- enqueue_dl_entity(&p->dl, pi_se, flags);
+ if (task_is_blocked(p))
+ return;
- if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
+ if (!task_current(rq, p) && !p->dl.dl_throttled && p->nr_cpus_allowed > 1)
enqueue_pushable_dl_task(rq, p);
}
-static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
-{
- dequeue_dl_entity(&p->dl);
- dequeue_pushable_dl_task(rq, p);
-}
-
-static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
+static bool dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
{
update_curr_dl(rq);
- __dequeue_task_dl(rq, p, flags);
+
+ if (p->on_rq == TASK_ON_RQ_MIGRATING)
+ flags |= DEQUEUE_MIGRATING;
+
+ dequeue_dl_entity(&p->dl, flags);
+ if (!p->dl.dl_throttled && !dl_server(&p->dl))
+ dequeue_pushable_dl_task(rq, p);
+
+ return true;
}
/*
@@ -957,18 +2348,14 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
*/
static void yield_task_dl(struct rq *rq)
{
- struct task_struct *p = rq->curr;
-
/*
* We make the task go to sleep until its current deadline by
* forcing its runtime to zero. This way, update_curr_dl() stops
* it and the bandwidth timer will wake it up and will give it
* new scheduling parameters (thanks to dl_yielded=1).
*/
- if (p->dl.runtime > 0) {
- rq->curr->dl.dl_yielded = 1;
- p->dl.runtime = 0;
- }
+ rq->donor->dl.dl_yielded = 1;
+
update_rq_clock(rq);
update_curr_dl(rq);
/*
@@ -976,26 +2363,34 @@ static void yield_task_dl(struct rq *rq)
* so we don't do microscopic update in schedule()
* and double the fastpath cost.
*/
- rq_clock_skip_update(rq, true);
+ rq_clock_skip_update(rq);
}
-#ifdef CONFIG_SMP
+static inline bool dl_task_is_earliest_deadline(struct task_struct *p,
+ struct rq *rq)
+{
+ return (!rq->dl.dl_nr_running ||
+ dl_time_before(p->dl.deadline,
+ rq->dl.earliest_dl.curr));
+}
static int find_later_rq(struct task_struct *task);
static int
-select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
+select_task_rq_dl(struct task_struct *p, int cpu, int flags)
{
- struct task_struct *curr;
+ struct task_struct *curr, *donor;
+ bool select_rq;
struct rq *rq;
- if (sd_flag != SD_BALANCE_WAKE)
- goto out;
+ if (!(flags & WF_TTWU))
+ return cpu;
rq = cpu_rq(cpu);
rcu_read_lock();
- curr = ACCESS_ONCE(rq->curr); /* unlocked access */
+ curr = READ_ONCE(rq->curr); /* unlocked access */
+ donor = READ_ONCE(rq->donor);
/*
* If we are dealing with a -deadline task, we must
@@ -1006,21 +2401,62 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
* other hand, if it has a shorter deadline, we
* try to make it stay here, it might be important.
*/
- if (unlikely(dl_task(curr)) &&
- (curr->nr_cpus_allowed < 2 ||
- !dl_entity_preempt(&p->dl, &curr->dl)) &&
- (p->nr_cpus_allowed > 1)) {
+ select_rq = unlikely(dl_task(donor)) &&
+ (curr->nr_cpus_allowed < 2 ||
+ !dl_entity_preempt(&p->dl, &donor->dl)) &&
+ p->nr_cpus_allowed > 1;
+
+ /*
+ * Take the capacity of the CPU into account to
+ * ensure it fits the requirement of the task.
+ */
+ if (sched_asym_cpucap_active())
+ select_rq |= !dl_task_fits_capacity(p, cpu);
+
+ if (select_rq) {
int target = find_later_rq(p);
- if (target != -1)
+ if (target != -1 &&
+ dl_task_is_earliest_deadline(p, cpu_rq(target)))
cpu = target;
}
rcu_read_unlock();
-out:
return cpu;
}
+static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused)
+{
+ struct rq_flags rf;
+ struct rq *rq;
+
+ if (READ_ONCE(p->__state) != TASK_WAKING)
+ return;
+
+ rq = task_rq(p);
+ /*
+ * Since p->state == TASK_WAKING, set_task_cpu() has been called
+ * from try_to_wake_up(). Hence, p->pi_lock is locked, but
+ * rq->lock is not... So, lock it
+ */
+ rq_lock(rq, &rf);
+ if (p->dl.dl_non_contending) {
+ update_rq_clock(rq);
+ sub_running_bw(&p->dl, &rq->dl);
+ p->dl.dl_non_contending = 0;
+ /*
+ * If the timer handler is currently running and the
+ * timer cannot be canceled, inactive_task_timer()
+ * will see that dl_not_contending is not set, and
+ * will not touch the rq's active utilization,
+ * so we are still safe.
+ */
+ cancel_inactive_timer(&p->dl);
+ }
+ sub_rq_bw(&p->dl, &rq->dl);
+ rq_unlock(rq, &rf);
+}
+
static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
{
/*
@@ -1028,7 +2464,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
* let's hope p can move out.
*/
if (rq->curr->nr_cpus_allowed == 1 ||
- cpudl_find(&rq->rd->cpudl, rq->curr, NULL) == -1)
+ !cpudl_find(&rq->rd->cpudl, rq->donor, NULL))
return;
/*
@@ -1036,129 +2472,171 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
* see if it is pushed or pulled somewhere else.
*/
if (p->nr_cpus_allowed != 1 &&
- cpudl_find(&rq->rd->cpudl, p, NULL) != -1)
+ cpudl_find(&rq->rd->cpudl, p, NULL))
return;
resched_curr(rq);
}
-static int pull_dl_task(struct rq *this_rq);
+static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
+{
+ if (!on_dl_rq(&p->dl) && need_pull_dl_task(rq, p)) {
+ /*
+ * This is OK, because current is on_cpu, which avoids it being
+ * picked for load-balance and preemption/IRQs are still
+ * disabled avoiding further scheduler activity on it and we've
+ * not yet started the picking loop.
+ */
+ rq_unpin_lock(rq, rf);
+ pull_dl_task(rq);
+ rq_repin_lock(rq, rf);
+ }
-#endif /* CONFIG_SMP */
+ return sched_stop_runnable(rq) || sched_dl_runnable(rq);
+}
/*
* Only called when both the current and waking task are -deadline
* tasks.
*/
-static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
+static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p,
int flags)
{
- if (dl_entity_preempt(&p->dl, &rq->curr->dl)) {
+ if (dl_entity_preempt(&p->dl, &rq->donor->dl)) {
resched_curr(rq);
return;
}
-#ifdef CONFIG_SMP
/*
* In the unlikely case current and p have the same deadline
* let us try to decide what's the best thing to do...
*/
- if ((p->dl.deadline == rq->curr->dl.deadline) &&
+ if ((p->dl.deadline == rq->donor->dl.deadline) &&
!test_tsk_need_resched(rq->curr))
check_preempt_equal_dl(rq, p);
-#endif /* CONFIG_SMP */
}
#ifdef CONFIG_SCHED_HRTICK
-static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
+static void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se)
{
- hrtick_start(rq, p->dl.runtime);
+ hrtick_start(rq, dl_se->runtime);
}
-#else /* !CONFIG_SCHED_HRTICK */
-static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
+#else /* !CONFIG_SCHED_HRTICK: */
+static void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se)
{
}
-#endif
+#endif /* !CONFIG_SCHED_HRTICK */
-static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
- struct dl_rq *dl_rq)
+static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first)
{
- struct rb_node *left = dl_rq->rb_leftmost;
+ struct sched_dl_entity *dl_se = &p->dl;
+ struct dl_rq *dl_rq = &rq->dl;
- if (!left)
- return NULL;
+ p->se.exec_start = rq_clock_task(rq);
+ if (on_dl_rq(&p->dl))
+ update_stats_wait_end_dl(dl_rq, dl_se);
- return rb_entry(left, struct sched_dl_entity, rb_node);
-}
+ /* You can't push away the running task */
+ dequeue_pushable_dl_task(rq, p);
-struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
-{
- struct sched_dl_entity *dl_se;
- struct task_struct *p;
- struct dl_rq *dl_rq;
+ if (!first)
+ return;
- dl_rq = &rq->dl;
+ if (rq->donor->sched_class != &dl_sched_class)
+ update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
- if (need_pull_dl_task(rq, prev)) {
- pull_dl_task(rq);
- /*
- * pull_rt_task() can drop (and re-acquire) rq->lock; this
- * means a stop task can slip in, in which case we need to
- * re-start task selection.
- */
- if (rq->stop && task_on_rq_queued(rq->stop))
- return RETRY_TASK;
- }
+ deadline_queue_push_tasks(rq);
- /*
- * When prev is DL, we may throttle it in put_prev_task().
- * So, we update time before we check for dl_nr_running.
- */
- if (prev->sched_class == &dl_sched_class)
- update_curr_dl(rq);
+ if (hrtick_enabled_dl(rq))
+ start_hrtick_dl(rq, &p->dl);
+}
- if (unlikely(!dl_rq->dl_nr_running))
- return NULL;
+static struct sched_dl_entity *pick_next_dl_entity(struct dl_rq *dl_rq)
+{
+ struct rb_node *left = rb_first_cached(&dl_rq->root);
- put_prev_task(rq, prev);
+ if (!left)
+ return NULL;
- dl_se = pick_next_dl_entity(rq, dl_rq);
- BUG_ON(!dl_se);
+ return __node_2_dle(left);
+}
- p = dl_task_of(dl_se);
- p->se.exec_start = rq_clock_task(rq);
+/*
+ * __pick_next_task_dl - Helper to pick the next -deadline task to run.
+ * @rq: The runqueue to pick the next task from.
+ */
+static struct task_struct *__pick_task_dl(struct rq *rq, struct rq_flags *rf)
+{
+ struct sched_dl_entity *dl_se;
+ struct dl_rq *dl_rq = &rq->dl;
+ struct task_struct *p;
- /* Running task will never be pushed. */
- dequeue_pushable_dl_task(rq, p);
+again:
+ if (!sched_dl_runnable(rq))
+ return NULL;
- if (hrtick_enabled(rq))
- start_hrtick_dl(rq, p);
+ dl_se = pick_next_dl_entity(dl_rq);
+ WARN_ON_ONCE(!dl_se);
- set_post_schedule(rq);
+ if (dl_server(dl_se)) {
+ p = dl_se->server_pick_task(dl_se, rf);
+ if (!p) {
+ dl_server_stop(dl_se);
+ goto again;
+ }
+ rq->dl_server = dl_se;
+ } else {
+ p = dl_task_of(dl_se);
+ }
return p;
}
-static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
+static struct task_struct *pick_task_dl(struct rq *rq, struct rq_flags *rf)
{
+ return __pick_task_dl(rq, rf);
+}
+
+static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct task_struct *next)
+{
+ struct sched_dl_entity *dl_se = &p->dl;
+ struct dl_rq *dl_rq = &rq->dl;
+
+ if (on_dl_rq(&p->dl))
+ update_stats_wait_start_dl(dl_rq, dl_se);
+
update_curr_dl(rq);
+ update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1);
+
+ if (task_is_blocked(p))
+ return;
+
if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
enqueue_pushable_dl_task(rq, p);
}
+/*
+ * scheduler tick hitting a task of our scheduling class.
+ *
+ * NOTE: This function can be called remotely by the tick offload that
+ * goes along full dynticks. Therefore no local assumption can be made
+ * and everything must be accessed through the @rq and @curr passed in
+ * parameters.
+ */
static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
{
update_curr_dl(rq);
+ update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1);
/*
* Even when we have runtime, update_curr_dl() might have resulted in us
* not being the leftmost task anymore. In that case NEED_RESCHED will
* be set and schedule() will start a new hrtick for the next task.
*/
- if (hrtick_enabled(rq) && queued && p->dl.runtime > 0 &&
- is_leftmost(p, &rq->dl))
- start_hrtick_dl(rq, p);
+ if (hrtick_enabled_dl(rq) && queued && p->dl.runtime > 0 &&
+ is_leftmost(&p->dl, &rq->dl))
+ start_hrtick_dl(rq, &p->dl);
}
static void task_fork_dl(struct task_struct *p)
@@ -1169,67 +2647,35 @@ static void task_fork_dl(struct task_struct *p)
*/
}
-static void task_dead_dl(struct task_struct *p)
-{
- struct hrtimer *timer = &p->dl.dl_timer;
- struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
-
- /*
- * Since we are TASK_DEAD we won't slip out of the domain!
- */
- raw_spin_lock_irq(&dl_b->lock);
- /* XXX we should retain the bw until 0-lag */
- dl_b->total_bw -= p->dl.dl_bw;
- raw_spin_unlock_irq(&dl_b->lock);
-
- hrtimer_cancel(timer);
-}
-
-static void set_curr_task_dl(struct rq *rq)
-{
- struct task_struct *p = rq->curr;
-
- p->se.exec_start = rq_clock_task(rq);
-
- /* You can't push away the running task */
- dequeue_pushable_dl_task(rq, p);
-}
-
-#ifdef CONFIG_SMP
-
/* Only try algorithms three times */
#define DL_MAX_TRIES 3
-static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
-{
- if (!task_running(rq, p) &&
- cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
- return 1;
- return 0;
-}
-
-/* Returns the second earliest -deadline task, NULL otherwise */
-static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu)
+/*
+ * Return the earliest pushable rq's task, which is suitable to be executed
+ * on the CPU, NULL otherwise:
+ */
+static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu)
{
- struct rb_node *next_node = rq->dl.rb_leftmost;
- struct sched_dl_entity *dl_se;
struct task_struct *p = NULL;
+ struct rb_node *next_node;
-next_node:
- next_node = rb_next(next_node);
- if (next_node) {
- dl_se = rb_entry(next_node, struct sched_dl_entity, rb_node);
- p = dl_task_of(dl_se);
+ if (!has_pushable_dl_tasks(rq))
+ return NULL;
+
+ next_node = rb_first_cached(&rq->dl.pushable_dl_tasks_root);
+ while (next_node) {
+ p = __node_2_pdl(next_node);
- if (pick_dl_task(rq, p, cpu))
+ if (task_is_pushable(rq, p, cpu))
return p;
- goto next_node;
+ next_node = rb_next(next_node);
}
return NULL;
}
+/* Access rule: must be called on local CPU with preemption disabled */
static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl);
static int find_later_rq(struct task_struct *task)
@@ -1237,7 +2683,7 @@ static int find_later_rq(struct task_struct *task)
struct sched_domain *sd;
struct cpumask *later_mask = this_cpu_cpumask_var_ptr(local_cpu_mask_dl);
int this_cpu = smp_processor_id();
- int best_cpu, cpu = task_cpu(task);
+ int cpu = task_cpu(task);
/* Make sure the mask is initialized first */
if (unlikely(!later_mask))
@@ -1248,24 +2694,21 @@ static int find_later_rq(struct task_struct *task)
/*
* We have to consider system topology and task affinity
- * first, then we can look for a suitable cpu.
+ * first, then we can look for a suitable CPU.
*/
- best_cpu = cpudl_find(&task_rq(task)->rd->cpudl,
- task, later_mask);
- if (best_cpu == -1)
+ if (!cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask))
return -1;
/*
- * If we are here, some target has been found,
- * the most suitable of which is cached in best_cpu.
- * This is, among the runqueues where the current tasks
- * have later deadlines than the task's one, the rq
- * with the latest possible one.
+ * If we are here, some targets have been found, including
+ * the most suitable which is, among the runqueues where the
+ * current tasks have later deadlines than the task's one, the
+ * rq with the latest possible one.
*
* Now we check how well this matches with task's
* affinity and system topology.
*
- * The last cpu where the task run is our first
+ * The last CPU where the task run is our first
* guess, since it is most likely cache-hot there.
*/
if (cpumask_test_cpu(cpu, later_mask))
@@ -1280,6 +2723,7 @@ static int find_later_rq(struct task_struct *task)
rcu_read_lock();
for_each_domain(cpu, sd) {
if (sd->flags & SD_WAKE_AFFINE) {
+ int best_cpu;
/*
* If possible, preempting this_cpu is
@@ -1291,12 +2735,15 @@ static int find_later_rq(struct task_struct *task)
return this_cpu;
}
+ best_cpu = cpumask_any_and_distribute(later_mask,
+ sched_domain_span(sd));
/*
- * Last chance: if best_cpu is valid and is
- * in the mask, that becomes our choice.
+ * Last chance: if a CPU being in both later_mask
+ * and current sd span is valid, that becomes our
+ * choice. Of course, the latest possible CPU is
+ * already under consideration through later_mask.
*/
- if (best_cpu < nr_cpu_ids &&
- cpumask_test_cpu(best_cpu, sched_domain_span(sd))) {
+ if (best_cpu < nr_cpu_ids) {
rcu_read_unlock();
return best_cpu;
}
@@ -1311,13 +2758,32 @@ static int find_later_rq(struct task_struct *task)
if (this_cpu != -1)
return this_cpu;
- cpu = cpumask_any(later_mask);
+ cpu = cpumask_any_distribute(later_mask);
if (cpu < nr_cpu_ids)
return cpu;
return -1;
}
+static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
+{
+ struct task_struct *p;
+
+ if (!has_pushable_dl_tasks(rq))
+ return NULL;
+
+ p = __node_2_pdl(rb_first_cached(&rq->dl.pushable_dl_tasks_root));
+
+ WARN_ON_ONCE(rq->cpu != task_cpu(p));
+ WARN_ON_ONCE(task_current(rq, p));
+ WARN_ON_ONCE(p->nr_cpus_allowed <= 1);
+
+ WARN_ON_ONCE(!task_on_rq_queued(p));
+ WARN_ON_ONCE(!dl_task(p));
+
+ return p;
+}
+
/* Locks the rq it finds */
static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
{
@@ -1333,13 +2799,49 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
later_rq = cpu_rq(cpu);
+ if (!dl_task_is_earliest_deadline(task, later_rq)) {
+ /*
+ * Target rq has tasks of equal or earlier deadline,
+ * retrying does not release any lock and is unlikely
+ * to yield a different result.
+ */
+ later_rq = NULL;
+ break;
+ }
+
/* Retry if something changed. */
if (double_lock_balance(rq, later_rq)) {
- if (unlikely(task_rq(task) != rq ||
- !cpumask_test_cpu(later_rq->cpu,
- &task->cpus_allowed) ||
- task_running(rq, task) ||
- !task_on_rq_queued(task))) {
+ /*
+ * double_lock_balance had to release rq->lock, in the
+ * meantime, task may no longer be fit to be migrated.
+ * Check the following to ensure that the task is
+ * still suitable for migration:
+ * 1. It is possible the task was scheduled,
+ * migrate_disabled was set and then got preempted,
+ * so we must check the task migration disable
+ * flag.
+ * 2. The CPU picked is in the task's affinity.
+ * 3. For throttled task (dl_task_offline_migration),
+ * check the following:
+ * - the task is not on the rq anymore (it was
+ * migrated)
+ * - the task is not on CPU anymore
+ * - the task is still a dl task
+ * - the task is not queued on the rq anymore
+ * 4. For the non-throttled task (push_dl_task), the
+ * check to ensure that this task is still at the
+ * head of the pushable tasks list is enough.
+ */
+ if (unlikely(is_migration_disabled(task) ||
+ !cpumask_test_cpu(later_rq->cpu, &task->cpus_mask) ||
+ (task->dl.dl_throttled &&
+ (task_rq(task) != rq ||
+ task_on_cpu(rq, task) ||
+ !dl_task(task) ||
+ !task_on_rq_queued(task))) ||
+ (!task->dl.dl_throttled &&
+ task != pick_next_pushable_dl_task(rq)))) {
+
double_unlock_balance(rq, later_rq);
later_rq = NULL;
break;
@@ -1351,9 +2853,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
* its earliest one has a later deadline than our
* task, the rq is a good one.
*/
- if (!later_rq->dl.dl_nr_running ||
- dl_time_before(task->dl.deadline,
- later_rq->dl.earliest_dl.curr))
+ if (dl_task_is_earliest_deadline(task, later_rq))
break;
/* Otherwise we try again. */
@@ -1364,26 +2864,6 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
return later_rq;
}
-static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
-{
- struct task_struct *p;
-
- if (!has_pushable_dl_tasks(rq))
- return NULL;
-
- p = rb_entry(rq->dl.pushable_dl_tasks_leftmost,
- struct task_struct, pushable_dl_tasks);
-
- BUG_ON(rq->cpu != task_cpu(p));
- BUG_ON(task_current(rq, p));
- BUG_ON(p->nr_cpus_allowed <= 1);
-
- BUG_ON(!task_on_rq_queued(p));
- BUG_ON(!dl_task(p));
-
- return p;
-}
-
/*
* See if the non running -deadline tasks on this rq
* can be sent to some other CPU where they can preempt
@@ -1395,31 +2875,29 @@ static int push_dl_task(struct rq *rq)
struct rq *later_rq;
int ret = 0;
- if (!rq->dl.overloaded)
- return 0;
-
next_task = pick_next_pushable_dl_task(rq);
if (!next_task)
return 0;
retry:
- if (unlikely(next_task == rq->curr)) {
- WARN_ON(1);
- return 0;
- }
-
/*
* If next_task preempts rq->curr, and rq->curr
* can move away, it makes sense to just reschedule
* without going further in pushing next_task.
*/
- if (dl_task(rq->curr) &&
- dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) &&
+ if (dl_task(rq->donor) &&
+ dl_time_before(next_task->dl.deadline, rq->donor->dl.deadline) &&
rq->curr->nr_cpus_allowed > 1) {
resched_curr(rq);
return 0;
}
+ if (is_migration_disabled(next_task))
+ return 0;
+
+ if (WARN_ON(next_task == rq->curr))
+ return 0;
+
/* We might release rq lock */
get_task_struct(next_task);
@@ -1434,10 +2912,10 @@ retry:
* then possible that next_task has migrated.
*/
task = pick_next_pushable_dl_task(rq);
- if (task_cpu(next_task) == rq->cpu && task == next_task) {
+ if (task == next_task) {
/*
* The task is still there. We don't try
- * again, some other cpu will pull it when ready.
+ * again, some other CPU will pull it when ready.
*/
goto out;
}
@@ -1451,9 +2929,7 @@ retry:
goto retry;
}
- deactivate_task(rq, next_task, 0);
- set_task_cpu(next_task, later_rq->cpu);
- activate_task(later_rq, next_task, 0);
+ move_queued_task_locked(rq, later_rq, next_task);
ret = 1;
resched_curr(later_rq);
@@ -1468,20 +2944,21 @@ out:
static void push_dl_tasks(struct rq *rq)
{
- /* Terminates as it moves a -deadline task */
+ /* push_dl_task() will return true if it moved a -deadline task */
while (push_dl_task(rq))
;
}
-static int pull_dl_task(struct rq *this_rq)
+static void pull_dl_task(struct rq *this_rq)
{
- int this_cpu = this_rq->cpu, ret = 0, cpu;
- struct task_struct *p;
+ int this_cpu = this_rq->cpu, cpu;
+ struct task_struct *p, *push_task;
+ bool resched = false;
struct rq *src_rq;
u64 dmin = LONG_MAX;
if (likely(!dl_overloaded(this_rq)))
- return 0;
+ return;
/*
* Match the barrier from dl_set_overloaded; this guarantees that if we
@@ -1496,7 +2973,7 @@ static int pull_dl_task(struct rq *this_rq)
src_rq = cpu_rq(cpu);
/*
- * It looks racy, abd it is! However, as in sched_rt.c,
+ * It looks racy, and it is! However, as in sched_rt.c,
* we are fine with this.
*/
if (this_rq->dl.dl_nr_running &&
@@ -1505,6 +2982,7 @@ static int pull_dl_task(struct rq *this_rq)
continue;
/* Might drop this_rq->lock */
+ push_task = NULL;
double_lock_balance(this_rq, src_rq);
/*
@@ -1514,7 +2992,7 @@ static int pull_dl_task(struct rq *this_rq)
if (src_rq->dl.dl_nr_running <= 1)
goto skip;
- p = pick_next_earliest_dl_task(src_rq, this_cpu);
+ p = pick_earliest_pushable_dl_task(src_rq, this_cpu);
/*
* We found a task to be pulled if:
@@ -1522,9 +3000,7 @@ static int pull_dl_task(struct rq *this_rq)
* - it will preempt the last one we pulled (if any).
*/
if (p && dl_time_before(p->dl.deadline, dmin) &&
- (!this_rq->dl.dl_nr_running ||
- dl_time_before(p->dl.deadline,
- this_rq->dl.earliest_dl.curr))) {
+ dl_task_is_earliest_deadline(p, this_rq)) {
WARN_ON(p == src_rq->curr);
WARN_ON(!task_on_rq_queued(p));
@@ -1533,28 +3009,34 @@ static int pull_dl_task(struct rq *this_rq)
* deadline than the current task of its runqueue.
*/
if (dl_time_before(p->dl.deadline,
- src_rq->curr->dl.deadline))
+ src_rq->donor->dl.deadline))
goto skip;
- ret = 1;
-
- deactivate_task(src_rq, p, 0);
- set_task_cpu(p, this_cpu);
- activate_task(this_rq, p, 0);
- dmin = p->dl.deadline;
+ if (is_migration_disabled(p)) {
+ push_task = get_push_task(src_rq);
+ } else {
+ move_queued_task_locked(src_rq, this_rq, p);
+ dmin = p->dl.deadline;
+ resched = true;
+ }
/* Is there any other task even earlier? */
}
skip:
double_unlock_balance(this_rq, src_rq);
+
+ if (push_task) {
+ preempt_disable();
+ raw_spin_rq_unlock(this_rq);
+ stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
+ push_task, &src_rq->push_work);
+ preempt_enable();
+ raw_spin_rq_lock(this_rq);
+ }
}
- return ret;
-}
-
-static void post_schedule_dl(struct rq *rq)
-{
- push_dl_tasks(rq);
+ if (resched)
+ resched_curr(this_rq);
}
/*
@@ -1563,25 +3045,23 @@ static void post_schedule_dl(struct rq *rq)
*/
static void task_woken_dl(struct rq *rq, struct task_struct *p)
{
- if (!task_running(rq, p) &&
+ if (!task_on_cpu(rq, p) &&
!test_tsk_need_resched(rq->curr) &&
- has_pushable_dl_tasks(rq) &&
p->nr_cpus_allowed > 1 &&
- dl_task(rq->curr) &&
+ dl_task(rq->donor) &&
(rq->curr->nr_cpus_allowed < 2 ||
- !dl_entity_preempt(&p->dl, &rq->curr->dl))) {
+ !dl_entity_preempt(&p->dl, &rq->donor->dl))) {
push_dl_tasks(rq);
}
}
static void set_cpus_allowed_dl(struct task_struct *p,
- const struct cpumask *new_mask)
+ struct affinity_context *ctx)
{
- struct rq *rq;
struct root_domain *src_rd;
- int weight;
+ struct rq *rq;
- BUG_ON(!dl_task(p));
+ WARN_ON_ONCE(!dl_task(p));
rq = task_rq(p);
src_rd = rq->rd;
@@ -1591,7 +3071,7 @@ static void set_cpus_allowed_dl(struct task_struct *p,
* update. We already made space for us in the destination
* domain (see cpuset_can_attach()).
*/
- if (!cpumask_intersects(src_rd->span, new_mask)) {
+ if (!cpumask_intersects(src_rd->span, ctx->new_mask)) {
struct dl_bw *src_dl_b;
src_dl_b = dl_bw_of(cpu_of(rq));
@@ -1601,41 +3081,11 @@ static void set_cpus_allowed_dl(struct task_struct *p,
* until we complete the update.
*/
raw_spin_lock(&src_dl_b->lock);
- __dl_clear(src_dl_b, p->dl.dl_bw);
+ __dl_sub(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
raw_spin_unlock(&src_dl_b->lock);
}
- /*
- * Update only if the task is actually running (i.e.,
- * it is on the rq AND it is not throttled).
- */
- if (!on_dl_rq(&p->dl))
- return;
-
- weight = cpumask_weight(new_mask);
-
- /*
- * Only update if the process changes its state from whether it
- * can migrate or not.
- */
- if ((p->nr_cpus_allowed > 1) == (weight > 1))
- return;
-
- /*
- * The process used to be able to migrate OR it can now migrate
- */
- if (weight <= 1) {
- if (!task_current(rq, p))
- dequeue_pushable_dl_task(rq, p);
- BUG_ON(!rq->dl.dl_nr_migratory);
- rq->dl.dl_nr_migratory--;
- } else {
- if (!task_current(rq, p))
- enqueue_pushable_dl_task(rq, p);
- rq->dl.dl_nr_migratory++;
- }
-
- update_dl_migration(&rq->dl);
+ set_cpus_allowed_common(p, ctx);
}
/* Assumes rq->lock is held */
@@ -1644,9 +3094,10 @@ static void rq_online_dl(struct rq *rq)
if (rq->dl.overloaded)
dl_set_overload(rq);
- cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu);
if (rq->dl.dl_nr_running > 0)
- cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1);
+ cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr);
+ else
+ cpudl_clear(&rq->rd->cpudl, rq->cpu, true);
}
/* Assumes rq->lock is held */
@@ -1655,11 +3106,10 @@ static void rq_offline_dl(struct rq *rq)
if (rq->dl.overloaded)
dl_clear_overload(rq);
- cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0);
- cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu);
+ cpudl_clear(&rq->rd->cpudl, rq->cpu, false);
}
-void init_sched_dl_class(void)
+void __init init_sched_dl_class(void)
{
unsigned int i;
@@ -1668,50 +3118,150 @@ void init_sched_dl_class(void)
GFP_KERNEL, cpu_to_node(i));
}
-#endif /* CONFIG_SMP */
-
/*
- * Ensure p's dl_timer is cancelled. May drop rq->lock for a while.
+ * This function always returns a non-empty bitmap in @cpus. This is because
+ * if a root domain has reserved bandwidth for DL tasks, the DL bandwidth
+ * check will prevent CPU hotplug from deactivating all CPUs in that domain.
*/
-static void cancel_dl_timer(struct rq *rq, struct task_struct *p)
+static void dl_get_task_effective_cpus(struct task_struct *p, struct cpumask *cpus)
{
- struct hrtimer *dl_timer = &p->dl.dl_timer;
-
- /* Nobody will change task's class if pi_lock is held */
- lockdep_assert_held(&p->pi_lock);
-
- if (hrtimer_active(dl_timer)) {
- int ret = hrtimer_try_to_cancel(dl_timer);
+ const struct cpumask *hk_msk;
- if (unlikely(ret == -1)) {
+ hk_msk = housekeeping_cpumask(HK_TYPE_DOMAIN);
+ if (housekeeping_enabled(HK_TYPE_DOMAIN)) {
+ if (!cpumask_intersects(p->cpus_ptr, hk_msk)) {
/*
- * Note, p may migrate OR new deadline tasks
- * may appear in rq when we are unlocking it.
- * A caller of us must be fine with that.
+ * CPUs isolated by isolcpu="domain" always belong to
+ * def_root_domain.
*/
- raw_spin_unlock(&rq->lock);
- hrtimer_cancel(dl_timer);
- raw_spin_lock(&rq->lock);
+ cpumask_andnot(cpus, cpu_active_mask, hk_msk);
+ return;
}
}
+
+ /*
+ * If a root domain holds a DL task, it must have active CPUs. So
+ * active CPUs can always be found by walking up the task's cpuset
+ * hierarchy up to the partition root.
+ */
+ cpuset_cpus_allowed_locked(p, cpus);
+}
+
+/* The caller should hold cpuset_mutex */
+void dl_add_task_root_domain(struct task_struct *p)
+{
+ struct rq_flags rf;
+ struct rq *rq;
+ struct dl_bw *dl_b;
+ unsigned int cpu;
+ struct cpumask *msk = this_cpu_cpumask_var_ptr(local_cpu_mask_dl);
+
+ raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
+ if (!dl_task(p) || dl_entity_is_special(&p->dl)) {
+ raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
+ return;
+ }
+
+ /*
+ * Get an active rq, whose rq->rd traces the correct root
+ * domain.
+ * Ideally this would be under cpuset reader lock until rq->rd is
+ * fetched. However, sleepable locks cannot nest inside pi_lock, so we
+ * rely on the caller of dl_add_task_root_domain() holds 'cpuset_mutex'
+ * to guarantee the CPU stays in the cpuset.
+ */
+ dl_get_task_effective_cpus(p, msk);
+ cpu = cpumask_first_and(cpu_active_mask, msk);
+ BUG_ON(cpu >= nr_cpu_ids);
+ rq = cpu_rq(cpu);
+ dl_b = &rq->rd->dl_bw;
+ /* End of fetching rd */
+
+ raw_spin_lock(&dl_b->lock);
+ __dl_add(dl_b, p->dl.dl_bw, cpumask_weight(rq->rd->span));
+ raw_spin_unlock(&dl_b->lock);
+ raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
+}
+
+void dl_clear_root_domain(struct root_domain *rd)
+{
+ int i;
+
+ guard(raw_spinlock_irqsave)(&rd->dl_bw.lock);
+
+ /*
+ * Reset total_bw to zero and extra_bw to max_bw so that next
+ * loop will add dl-servers contributions back properly,
+ */
+ rd->dl_bw.total_bw = 0;
+ for_each_cpu(i, rd->span)
+ cpu_rq(i)->dl.extra_bw = cpu_rq(i)->dl.max_bw;
+
+ /*
+ * dl_servers are not tasks. Since dl_add_task_root_domain ignores
+ * them, we need to account for them here explicitly.
+ */
+ for_each_cpu(i, rd->span) {
+ struct sched_dl_entity *dl_se = &cpu_rq(i)->fair_server;
+
+ if (dl_server(dl_se) && cpu_active(i))
+ __dl_add(&rd->dl_bw, dl_se->dl_bw, dl_bw_cpus(i));
+ }
+}
+
+void dl_clear_root_domain_cpu(int cpu)
+{
+ dl_clear_root_domain(cpu_rq(cpu)->rd);
}
static void switched_from_dl(struct rq *rq, struct task_struct *p)
{
- /* XXX we should retain the bw until 0-lag */
- cancel_dl_timer(rq, p);
- __dl_clear_params(p);
+ /*
+ * task_non_contending() can start the "inactive timer" (if the 0-lag
+ * time is in the future). If the task switches back to dl before
+ * the "inactive timer" fires, it can continue to consume its current
+ * runtime using its current deadline. If it stays outside of
+ * SCHED_DEADLINE until the 0-lag time passes, inactive_task_timer()
+ * will reset the task parameters.
+ */
+ if (task_on_rq_queued(p) && p->dl.dl_runtime)
+ task_non_contending(&p->dl, false);
+
+ /*
+ * In case a task is setscheduled out from SCHED_DEADLINE we need to
+ * keep track of that on its cpuset (for correct bandwidth tracking).
+ */
+ dec_dl_tasks_cs(p);
+
+ if (!task_on_rq_queued(p)) {
+ /*
+ * Inactive timer is armed. However, p is leaving DEADLINE and
+ * might migrate away from this rq while continuing to run on
+ * some other class. We need to remove its contribution from
+ * this rq running_bw now, or sub_rq_bw (below) will complain.
+ */
+ if (p->dl.dl_non_contending)
+ sub_running_bw(&p->dl, &rq->dl);
+ sub_rq_bw(&p->dl, &rq->dl);
+ }
+
+ /*
+ * We cannot use inactive_task_timer() to invoke sub_running_bw()
+ * at the 0-lag time, because the task could have been migrated
+ * while SCHED_OTHER in the meanwhile.
+ */
+ if (p->dl.dl_non_contending)
+ p->dl.dl_non_contending = 0;
/*
* Since this might be the only -deadline task on the rq,
* this is the right place to try to pull some other one
- * from an overloaded cpu, if any.
+ * from an overloaded CPU, if any.
*/
if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
return;
- if (pull_dl_task(rq))
- resched_curr(rq);
+ deadline_queue_pull_task(rq);
}
/*
@@ -1720,99 +3270,518 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
*/
static void switched_to_dl(struct rq *rq, struct task_struct *p)
{
- int check_resched = 1;
-
- if (task_on_rq_queued(p) && rq->curr != p) {
-#ifdef CONFIG_SMP
- if (p->nr_cpus_allowed > 1 && rq->dl.overloaded &&
- push_dl_task(rq) && rq != task_rq(p))
- /* Only reschedule if pushing failed */
- check_resched = 0;
-#endif /* CONFIG_SMP */
- if (check_resched) {
- if (dl_task(rq->curr))
- check_preempt_curr_dl(rq, p, 0);
- else
- resched_curr(rq);
- }
+ cancel_inactive_timer(&p->dl);
+
+ /*
+ * In case a task is setscheduled to SCHED_DEADLINE we need to keep
+ * track of that on its cpuset (for correct bandwidth tracking).
+ */
+ inc_dl_tasks_cs(p);
+
+ /* If p is not queued we will update its parameters at next wakeup. */
+ if (!task_on_rq_queued(p)) {
+ add_rq_bw(&p->dl, &rq->dl);
+
+ return;
}
+
+ if (rq->donor != p) {
+ if (p->nr_cpus_allowed > 1 && rq->dl.overloaded)
+ deadline_queue_push_tasks(rq);
+ if (dl_task(rq->donor))
+ wakeup_preempt_dl(rq, p, 0);
+ else
+ resched_curr(rq);
+ } else {
+ update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
+ }
+}
+
+static u64 get_prio_dl(struct rq *rq, struct task_struct *p)
+{
+ return p->dl.deadline;
}
/*
* If the scheduling parameters of a -deadline task changed,
* a push or pull operation might be needed.
*/
-static void prio_changed_dl(struct rq *rq, struct task_struct *p,
- int oldprio)
+static void prio_changed_dl(struct rq *rq, struct task_struct *p, u64 old_deadline)
{
- if (task_on_rq_queued(p) || rq->curr == p) {
-#ifdef CONFIG_SMP
- /*
- * This might be too much, but unfortunately
- * we don't have the old deadline value, and
- * we can't argue if the task is increasing
- * or lowering its prio, so...
- */
- if (!rq->dl.overloaded)
- pull_dl_task(rq);
+ if (!task_on_rq_queued(p))
+ return;
+ if (p->dl.deadline == old_deadline)
+ return;
+
+ if (dl_time_before(old_deadline, p->dl.deadline))
+ deadline_queue_pull_task(rq);
+
+ if (task_current_donor(rq, p)) {
/*
* If we now have a earlier deadline task than p,
* then reschedule, provided p is still on this
* runqueue.
*/
- if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) &&
- rq->curr == p)
+ if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline))
resched_curr(rq);
-#else
+ } else {
/*
- * Again, we don't know if p has a earlier
- * or later deadline, so let's blindly set a
- * (maybe not needed) rescheduling point.
+ * Current may not be deadline in case p was throttled but we
+ * have just replenished it (e.g. rt_mutex_setprio()).
+ *
+ * Otherwise, if p was given an earlier deadline, reschedule.
*/
- resched_curr(rq);
-#endif /* CONFIG_SMP */
- } else
- switched_to_dl(rq, p);
+ if (!dl_task(rq->curr) ||
+ dl_time_before(p->dl.deadline, rq->curr->dl.deadline))
+ resched_curr(rq);
+ }
}
-const struct sched_class dl_sched_class = {
- .next = &rt_sched_class,
+#ifdef CONFIG_SCHED_CORE
+static int task_is_throttled_dl(struct task_struct *p, int cpu)
+{
+ return p->dl.dl_throttled;
+}
+#endif
+
+DEFINE_SCHED_CLASS(dl) = {
+
+ .queue_mask = 8,
+
.enqueue_task = enqueue_task_dl,
.dequeue_task = dequeue_task_dl,
.yield_task = yield_task_dl,
- .check_preempt_curr = check_preempt_curr_dl,
+ .wakeup_preempt = wakeup_preempt_dl,
- .pick_next_task = pick_next_task_dl,
+ .pick_task = pick_task_dl,
.put_prev_task = put_prev_task_dl,
+ .set_next_task = set_next_task_dl,
-#ifdef CONFIG_SMP
+ .balance = balance_dl,
.select_task_rq = select_task_rq_dl,
+ .migrate_task_rq = migrate_task_rq_dl,
.set_cpus_allowed = set_cpus_allowed_dl,
.rq_online = rq_online_dl,
.rq_offline = rq_offline_dl,
- .post_schedule = post_schedule_dl,
.task_woken = task_woken_dl,
-#endif
+ .find_lock_rq = find_lock_later_rq,
- .set_curr_task = set_curr_task_dl,
.task_tick = task_tick_dl,
.task_fork = task_fork_dl,
- .task_dead = task_dead_dl,
+ .get_prio = get_prio_dl,
.prio_changed = prio_changed_dl,
.switched_from = switched_from_dl,
.switched_to = switched_to_dl,
.update_curr = update_curr_dl,
+#ifdef CONFIG_SCHED_CORE
+ .task_is_throttled = task_is_throttled_dl,
+#endif
};
-#ifdef CONFIG_SCHED_DEBUG
-extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
+/*
+ * Used for dl_bw check and update, used under sched_rt_handler()::mutex and
+ * sched_domains_mutex.
+ */
+u64 dl_cookie;
+
+int sched_dl_global_validate(void)
+{
+ u64 runtime = global_rt_runtime();
+ u64 period = global_rt_period();
+ u64 new_bw = to_ratio(period, runtime);
+ u64 cookie = ++dl_cookie;
+ struct dl_bw *dl_b;
+ int cpu, cpus, ret = 0;
+ unsigned long flags;
+
+ /*
+ * Here we want to check the bandwidth not being set to some
+ * value smaller than the currently allocated bandwidth in
+ * any of the root_domains.
+ */
+ for_each_online_cpu(cpu) {
+ rcu_read_lock_sched();
+
+ if (dl_bw_visited(cpu, cookie))
+ goto next;
+
+ dl_b = dl_bw_of(cpu);
+ cpus = dl_bw_cpus(cpu);
+
+ raw_spin_lock_irqsave(&dl_b->lock, flags);
+ if (new_bw * cpus < dl_b->total_bw)
+ ret = -EBUSY;
+ raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+
+next:
+ rcu_read_unlock_sched();
+
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq)
+{
+ if (global_rt_runtime() == RUNTIME_INF) {
+ dl_rq->bw_ratio = 1 << RATIO_SHIFT;
+ dl_rq->max_bw = dl_rq->extra_bw = 1 << BW_SHIFT;
+ } else {
+ dl_rq->bw_ratio = to_ratio(global_rt_runtime(),
+ global_rt_period()) >> (BW_SHIFT - RATIO_SHIFT);
+ dl_rq->max_bw = dl_rq->extra_bw =
+ to_ratio(global_rt_period(), global_rt_runtime());
+ }
+}
+
+void sched_dl_do_global(void)
+{
+ u64 new_bw = -1;
+ u64 cookie = ++dl_cookie;
+ struct dl_bw *dl_b;
+ int cpu;
+ unsigned long flags;
+
+ if (global_rt_runtime() != RUNTIME_INF)
+ new_bw = to_ratio(global_rt_period(), global_rt_runtime());
+
+ for_each_possible_cpu(cpu)
+ init_dl_rq_bw_ratio(&cpu_rq(cpu)->dl);
+
+ for_each_possible_cpu(cpu) {
+ rcu_read_lock_sched();
+
+ if (dl_bw_visited(cpu, cookie)) {
+ rcu_read_unlock_sched();
+ continue;
+ }
+
+ dl_b = dl_bw_of(cpu);
+
+ raw_spin_lock_irqsave(&dl_b->lock, flags);
+ dl_b->bw = new_bw;
+ raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+
+ rcu_read_unlock_sched();
+ }
+}
+
+/*
+ * We must be sure that accepting a new task (or allowing changing the
+ * parameters of an existing one) is consistent with the bandwidth
+ * constraints. If yes, this function also accordingly updates the currently
+ * allocated bandwidth to reflect the new situation.
+ *
+ * This function is called while holding p's rq->lock.
+ */
+int sched_dl_overflow(struct task_struct *p, int policy,
+ const struct sched_attr *attr)
+{
+ u64 period = attr->sched_period ?: attr->sched_deadline;
+ u64 runtime = attr->sched_runtime;
+ u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
+ int cpus, err = -1, cpu = task_cpu(p);
+ struct dl_bw *dl_b = dl_bw_of(cpu);
+ unsigned long cap;
+
+ if (attr->sched_flags & SCHED_FLAG_SUGOV)
+ return 0;
+
+ /* !deadline task may carry old deadline bandwidth */
+ if (new_bw == p->dl.dl_bw && task_has_dl_policy(p))
+ return 0;
+
+ /*
+ * Either if a task, enters, leave, or stays -deadline but changes
+ * its parameters, we may need to update accordingly the total
+ * allocated bandwidth of the container.
+ */
+ raw_spin_lock(&dl_b->lock);
+ cpus = dl_bw_cpus(cpu);
+ cap = dl_bw_capacity(cpu);
+
+ if (dl_policy(policy) && !task_has_dl_policy(p) &&
+ !__dl_overflow(dl_b, cap, 0, new_bw)) {
+ if (hrtimer_active(&p->dl.inactive_timer))
+ __dl_sub(dl_b, p->dl.dl_bw, cpus);
+ __dl_add(dl_b, new_bw, cpus);
+ err = 0;
+ } else if (dl_policy(policy) && task_has_dl_policy(p) &&
+ !__dl_overflow(dl_b, cap, p->dl.dl_bw, new_bw)) {
+ /*
+ * XXX this is slightly incorrect: when the task
+ * utilization decreases, we should delay the total
+ * utilization change until the task's 0-lag point.
+ * But this would require to set the task's "inactive
+ * timer" when the task is not inactive.
+ */
+ __dl_sub(dl_b, p->dl.dl_bw, cpus);
+ __dl_add(dl_b, new_bw, cpus);
+ dl_change_utilization(p, new_bw);
+ err = 0;
+ } else if (!dl_policy(policy) && task_has_dl_policy(p)) {
+ /*
+ * Do not decrease the total deadline utilization here,
+ * switched_from_dl() will take care to do it at the correct
+ * (0-lag) time.
+ */
+ err = 0;
+ }
+ raw_spin_unlock(&dl_b->lock);
+
+ return err;
+}
+
+/*
+ * This function initializes the sched_dl_entity of a newly becoming
+ * SCHED_DEADLINE task.
+ *
+ * Only the static values are considered here, the actual runtime and the
+ * absolute deadline will be properly calculated when the task is enqueued
+ * for the first time with its new policy.
+ */
+void __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
+{
+ struct sched_dl_entity *dl_se = &p->dl;
+
+ dl_se->dl_runtime = attr->sched_runtime;
+ dl_se->dl_deadline = attr->sched_deadline;
+ dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
+ dl_se->flags = attr->sched_flags & SCHED_DL_FLAGS;
+ dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
+ dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime);
+}
+
+void __getparam_dl(struct task_struct *p, struct sched_attr *attr)
+{
+ struct sched_dl_entity *dl_se = &p->dl;
+
+ attr->sched_priority = p->rt_priority;
+ attr->sched_runtime = dl_se->dl_runtime;
+ attr->sched_deadline = dl_se->dl_deadline;
+ attr->sched_period = dl_se->dl_period;
+ attr->sched_flags &= ~SCHED_DL_FLAGS;
+ attr->sched_flags |= dl_se->flags;
+}
+
+/*
+ * This function validates the new parameters of a -deadline task.
+ * We ask for the deadline not being zero, and greater or equal
+ * than the runtime, as well as the period of being zero or
+ * greater than deadline. Furthermore, we have to be sure that
+ * user parameters are above the internal resolution of 1us (we
+ * check sched_runtime only since it is always the smaller one) and
+ * below 2^63 ns (we have to check both sched_deadline and
+ * sched_period, as the latter can be zero).
+ */
+bool __checkparam_dl(const struct sched_attr *attr)
+{
+ u64 period, max, min;
+
+ /* special dl tasks don't actually use any parameter */
+ if (attr->sched_flags & SCHED_FLAG_SUGOV)
+ return true;
+
+ /* deadline != 0 */
+ if (attr->sched_deadline == 0)
+ return false;
+
+ /*
+ * Since we truncate DL_SCALE bits, make sure we're at least
+ * that big.
+ */
+ if (attr->sched_runtime < (1ULL << DL_SCALE))
+ return false;
+
+ /*
+ * Since we use the MSB for wrap-around and sign issues, make
+ * sure it's not set (mind that period can be equal to zero).
+ */
+ if (attr->sched_deadline & (1ULL << 63) ||
+ attr->sched_period & (1ULL << 63))
+ return false;
+
+ period = attr->sched_period;
+ if (!period)
+ period = attr->sched_deadline;
+
+ /* runtime <= deadline <= period (if period != 0) */
+ if (period < attr->sched_deadline ||
+ attr->sched_deadline < attr->sched_runtime)
+ return false;
+
+ max = (u64)READ_ONCE(sysctl_sched_dl_period_max) * NSEC_PER_USEC;
+ min = (u64)READ_ONCE(sysctl_sched_dl_period_min) * NSEC_PER_USEC;
+
+ if (period < min || period > max)
+ return false;
+
+ return true;
+}
+
+/*
+ * This function clears the sched_dl_entity static params.
+ */
+static void __dl_clear_params(struct sched_dl_entity *dl_se)
+{
+ dl_se->dl_runtime = 0;
+ dl_se->dl_deadline = 0;
+ dl_se->dl_period = 0;
+ dl_se->flags = 0;
+ dl_se->dl_bw = 0;
+ dl_se->dl_density = 0;
+
+ dl_se->dl_throttled = 0;
+ dl_se->dl_yielded = 0;
+ dl_se->dl_non_contending = 0;
+ dl_se->dl_overrun = 0;
+ dl_se->dl_server = 0;
+
+#ifdef CONFIG_RT_MUTEXES
+ dl_se->pi_se = dl_se;
+#endif
+}
+
+void init_dl_entity(struct sched_dl_entity *dl_se)
+{
+ RB_CLEAR_NODE(&dl_se->rb_node);
+ init_dl_task_timer(dl_se);
+ init_dl_inactive_task_timer(dl_se);
+ __dl_clear_params(dl_se);
+}
+
+bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
+{
+ struct sched_dl_entity *dl_se = &p->dl;
+
+ if (dl_se->dl_runtime != attr->sched_runtime ||
+ dl_se->dl_deadline != attr->sched_deadline ||
+ dl_se->dl_period != attr->sched_period ||
+ dl_se->flags != (attr->sched_flags & SCHED_DL_FLAGS))
+ return true;
+
+ return false;
+}
+
+int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
+ const struct cpumask *trial)
+{
+ unsigned long flags, cap;
+ struct dl_bw *cur_dl_b;
+ int ret = 1;
+
+ rcu_read_lock_sched();
+ cur_dl_b = dl_bw_of(cpumask_any(cur));
+ cap = __dl_bw_capacity(trial);
+ raw_spin_lock_irqsave(&cur_dl_b->lock, flags);
+ if (__dl_overflow(cur_dl_b, cap, 0, 0))
+ ret = 0;
+ raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags);
+ rcu_read_unlock_sched();
+
+ return ret;
+}
+
+enum dl_bw_request {
+ dl_bw_req_deactivate = 0,
+ dl_bw_req_alloc,
+ dl_bw_req_free
+};
+
+static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw)
+{
+ unsigned long flags, cap;
+ struct dl_bw *dl_b;
+ bool overflow = 0;
+ u64 fair_server_bw = 0;
+
+ rcu_read_lock_sched();
+ dl_b = dl_bw_of(cpu);
+ raw_spin_lock_irqsave(&dl_b->lock, flags);
+
+ cap = dl_bw_capacity(cpu);
+ switch (req) {
+ case dl_bw_req_free:
+ __dl_sub(dl_b, dl_bw, dl_bw_cpus(cpu));
+ break;
+ case dl_bw_req_alloc:
+ overflow = __dl_overflow(dl_b, cap, 0, dl_bw);
+
+ if (!overflow) {
+ /*
+ * We reserve space in the destination
+ * root_domain, as we can't fail after this point.
+ * We will free resources in the source root_domain
+ * later on (see set_cpus_allowed_dl()).
+ */
+ __dl_add(dl_b, dl_bw, dl_bw_cpus(cpu));
+ }
+ break;
+ case dl_bw_req_deactivate:
+ /*
+ * cpu is not off yet, but we need to do the math by
+ * considering it off already (i.e., what would happen if we
+ * turn cpu off?).
+ */
+ cap -= arch_scale_cpu_capacity(cpu);
+
+ /*
+ * cpu is going offline and NORMAL tasks will be moved away
+ * from it. We can thus discount dl_server bandwidth
+ * contribution as it won't need to be servicing tasks after
+ * the cpu is off.
+ */
+ if (cpu_rq(cpu)->fair_server.dl_server)
+ fair_server_bw = cpu_rq(cpu)->fair_server.dl_bw;
+
+ /*
+ * Not much to check if no DEADLINE bandwidth is present.
+ * dl_servers we can discount, as tasks will be moved out the
+ * offlined CPUs anyway.
+ */
+ if (dl_b->total_bw - fair_server_bw > 0) {
+ /*
+ * Leaving at least one CPU for DEADLINE tasks seems a
+ * wise thing to do. As said above, cpu is not offline
+ * yet, so account for that.
+ */
+ if (dl_bw_cpus(cpu) - 1)
+ overflow = __dl_overflow(dl_b, cap, fair_server_bw, 0);
+ else
+ overflow = 1;
+ }
+
+ break;
+ }
+
+ raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+ rcu_read_unlock_sched();
+
+ return overflow ? -EBUSY : 0;
+}
+
+int dl_bw_deactivate(int cpu)
+{
+ return dl_bw_manage(dl_bw_req_deactivate, cpu, 0);
+}
+
+int dl_bw_alloc(int cpu, u64 dl_bw)
+{
+ return dl_bw_manage(dl_bw_req_alloc, cpu, dl_bw);
+}
+
+void dl_bw_free(int cpu, u64 dl_bw)
+{
+ dl_bw_manage(dl_bw_req_free, cpu, dl_bw);
+}
void print_dl_stats(struct seq_file *m, int cpu)
{
print_dl_rq(m, cpu, &cpu_rq(cpu)->dl);
}
-#endif /* CONFIG_SCHED_DEBUG */
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index a245c1fc6f0a..41caa22e0680 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -1,28 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kernel/sched/debug.c
*
- * Print the CFS rbtree
+ * Print the CFS rbtree and other debugging details
*
* Copyright(C) 2007, Red Hat, Inc., Ingo Molnar
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
-
-#include <linux/proc_fs.h>
-#include <linux/sched.h>
-#include <linux/seq_file.h>
-#include <linux/kallsyms.h>
-#include <linux/utsname.h>
-#include <linux/mempolicy.h>
-
+#include <linux/debugfs.h>
+#include <linux/nmi.h>
#include "sched.h"
-static DEFINE_SPINLOCK(sched_debug_lock);
-
/*
- * This allows printing both to /proc/sched_debug and
+ * This allows printing both to /sys/kernel/debug/sched/debug and
* to the console
*/
#define SEQ_printf(m, x...) \
@@ -30,7 +19,7 @@ static DEFINE_SPINLOCK(sched_debug_lock);
if (m) \
seq_printf(m, x); \
else \
- printk(x); \
+ pr_cont(x); \
} while (0)
/*
@@ -58,92 +47,711 @@ static unsigned long nsec_low(unsigned long long nsec)
#define SPLIT_NS(x) nsec_high(x), nsec_low(x)
+#define SCHED_FEAT(name, enabled) \
+ #name ,
+
+static const char * const sched_feat_names[] = {
+#include "features.h"
+};
+
+#undef SCHED_FEAT
+
+static int sched_feat_show(struct seq_file *m, void *v)
+{
+ int i;
+
+ for (i = 0; i < __SCHED_FEAT_NR; i++) {
+ if (!(sysctl_sched_features & (1UL << i)))
+ seq_puts(m, "NO_");
+ seq_printf(m, "%s ", sched_feat_names[i]);
+ }
+ seq_puts(m, "\n");
+
+ return 0;
+}
+
+#ifdef CONFIG_JUMP_LABEL
+
+#define jump_label_key__true STATIC_KEY_INIT_TRUE
+#define jump_label_key__false STATIC_KEY_INIT_FALSE
+
+#define SCHED_FEAT(name, enabled) \
+ jump_label_key__##enabled ,
+
+struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
+#include "features.h"
+};
+
+#undef SCHED_FEAT
+
+static void sched_feat_disable(int i)
+{
+ static_key_disable_cpuslocked(&sched_feat_keys[i]);
+}
+
+static void sched_feat_enable(int i)
+{
+ static_key_enable_cpuslocked(&sched_feat_keys[i]);
+}
+#else /* !CONFIG_JUMP_LABEL: */
+static void sched_feat_disable(int i) { };
+static void sched_feat_enable(int i) { };
+#endif /* !CONFIG_JUMP_LABEL */
+
+static int sched_feat_set(char *cmp)
+{
+ int i;
+ int neg = 0;
+
+ if (strncmp(cmp, "NO_", 3) == 0) {
+ neg = 1;
+ cmp += 3;
+ }
+
+ i = match_string(sched_feat_names, __SCHED_FEAT_NR, cmp);
+ if (i < 0)
+ return i;
+
+ if (neg) {
+ sysctl_sched_features &= ~(1UL << i);
+ sched_feat_disable(i);
+ } else {
+ sysctl_sched_features |= (1UL << i);
+ sched_feat_enable(i);
+ }
+
+ return 0;
+}
+
+static ssize_t
+sched_feat_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char buf[64];
+ char *cmp;
+ int ret;
+ struct inode *inode;
+
+ if (cnt > 63)
+ cnt = 63;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt] = 0;
+ cmp = strstrip(buf);
+
+ /* Ensure the static_key remains in a consistent state */
+ inode = file_inode(filp);
+ cpus_read_lock();
+ inode_lock(inode);
+ ret = sched_feat_set(cmp);
+ inode_unlock(inode);
+ cpus_read_unlock();
+ if (ret < 0)
+ return ret;
+
+ *ppos += cnt;
+
+ return cnt;
+}
+
+static int sched_feat_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, sched_feat_show, NULL);
+}
+
+static const struct file_operations sched_feat_fops = {
+ .open = sched_feat_open,
+ .write = sched_feat_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char buf[16];
+ unsigned int scaling;
+
+ if (cnt > 15)
+ cnt = 15;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+ buf[cnt] = '\0';
+
+ if (kstrtouint(buf, 10, &scaling))
+ return -EINVAL;
+
+ if (scaling >= SCHED_TUNABLESCALING_END)
+ return -EINVAL;
+
+ sysctl_sched_tunable_scaling = scaling;
+ if (sched_update_scaling())
+ return -EINVAL;
+
+ *ppos += cnt;
+ return cnt;
+}
+
+static int sched_scaling_show(struct seq_file *m, void *v)
+{
+ seq_printf(m, "%d\n", sysctl_sched_tunable_scaling);
+ return 0;
+}
+
+static int sched_scaling_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, sched_scaling_show, NULL);
+}
+
+static const struct file_operations sched_scaling_fops = {
+ .open = sched_scaling_open,
+ .write = sched_scaling_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+#ifdef CONFIG_PREEMPT_DYNAMIC
+
+static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char buf[16];
+ int mode;
+
+ if (cnt > 15)
+ cnt = 15;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt] = 0;
+ mode = sched_dynamic_mode(strstrip(buf));
+ if (mode < 0)
+ return mode;
+
+ sched_dynamic_update(mode);
+
+ *ppos += cnt;
+
+ return cnt;
+}
+
+static int sched_dynamic_show(struct seq_file *m, void *v)
+{
+ int i = IS_ENABLED(CONFIG_PREEMPT_RT) * 2;
+ int j;
+
+ /* Count entries in NULL terminated preempt_modes */
+ for (j = 0; preempt_modes[j]; j++)
+ ;
+ j -= !IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY);
+
+ for (; i < j; i++) {
+ if (preempt_dynamic_mode == i)
+ seq_puts(m, "(");
+ seq_puts(m, preempt_modes[i]);
+ if (preempt_dynamic_mode == i)
+ seq_puts(m, ")");
+
+ seq_puts(m, " ");
+ }
+
+ seq_puts(m, "\n");
+ return 0;
+}
+
+static int sched_dynamic_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, sched_dynamic_show, NULL);
+}
+
+static const struct file_operations sched_dynamic_fops = {
+ .open = sched_dynamic_open,
+ .write = sched_dynamic_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+#endif /* CONFIG_PREEMPT_DYNAMIC */
+
+__read_mostly bool sched_debug_verbose;
+
+static struct dentry *sd_dentry;
+
+
+static ssize_t sched_verbose_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ ssize_t result;
+ bool orig;
+
+ cpus_read_lock();
+ sched_domains_mutex_lock();
+
+ orig = sched_debug_verbose;
+ result = debugfs_write_file_bool(filp, ubuf, cnt, ppos);
+
+ if (sched_debug_verbose && !orig)
+ update_sched_domain_debugfs();
+ else if (!sched_debug_verbose && orig) {
+ debugfs_remove(sd_dentry);
+ sd_dentry = NULL;
+ }
+
+ sched_domains_mutex_unlock();
+ cpus_read_unlock();
+
+ return result;
+}
+
+static const struct file_operations sched_verbose_fops = {
+ .read = debugfs_read_file_bool,
+ .write = sched_verbose_write,
+ .open = simple_open,
+ .llseek = default_llseek,
+};
+
+static const struct seq_operations sched_debug_sops;
+
+static int sched_debug_open(struct inode *inode, struct file *filp)
+{
+ return seq_open(filp, &sched_debug_sops);
+}
+
+static const struct file_operations sched_debug_fops = {
+ .open = sched_debug_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+enum dl_param {
+ DL_RUNTIME = 0,
+ DL_PERIOD,
+};
+
+static unsigned long fair_server_period_max = (1UL << 22) * NSEC_PER_USEC; /* ~4 seconds */
+static unsigned long fair_server_period_min = (100) * NSEC_PER_USEC; /* 100 us */
+
+static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos, enum dl_param param)
+{
+ long cpu = (long) ((struct seq_file *) filp->private_data)->private;
+ struct rq *rq = cpu_rq(cpu);
+ u64 runtime, period;
+ size_t err;
+ int retval;
+ u64 value;
+
+ err = kstrtoull_from_user(ubuf, cnt, 10, &value);
+ if (err)
+ return err;
+
+ scoped_guard (rq_lock_irqsave, rq) {
+ runtime = rq->fair_server.dl_runtime;
+ period = rq->fair_server.dl_period;
+
+ switch (param) {
+ case DL_RUNTIME:
+ if (runtime == value)
+ break;
+ runtime = value;
+ break;
+ case DL_PERIOD:
+ if (value == period)
+ break;
+ period = value;
+ break;
+ }
+
+ if (runtime > period ||
+ period > fair_server_period_max ||
+ period < fair_server_period_min) {
+ return -EINVAL;
+ }
+
+ update_rq_clock(rq);
+ dl_server_stop(&rq->fair_server);
+
+ retval = dl_server_apply_params(&rq->fair_server, runtime, period, 0);
+ if (retval)
+ cnt = retval;
+
+ if (!runtime)
+ printk_deferred("Fair server disabled in CPU %d, system may crash due to starvation.\n",
+ cpu_of(rq));
+
+ if (rq->cfs.h_nr_queued)
+ dl_server_start(&rq->fair_server);
+ }
+
+ *ppos += cnt;
+ return cnt;
+}
+
+static size_t sched_fair_server_show(struct seq_file *m, void *v, enum dl_param param)
+{
+ unsigned long cpu = (unsigned long) m->private;
+ struct rq *rq = cpu_rq(cpu);
+ u64 value;
+
+ switch (param) {
+ case DL_RUNTIME:
+ value = rq->fair_server.dl_runtime;
+ break;
+ case DL_PERIOD:
+ value = rq->fair_server.dl_period;
+ break;
+ }
+
+ seq_printf(m, "%llu\n", value);
+ return 0;
+
+}
+
+static ssize_t
+sched_fair_server_runtime_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ return sched_fair_server_write(filp, ubuf, cnt, ppos, DL_RUNTIME);
+}
+
+static int sched_fair_server_runtime_show(struct seq_file *m, void *v)
+{
+ return sched_fair_server_show(m, v, DL_RUNTIME);
+}
+
+static int sched_fair_server_runtime_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, sched_fair_server_runtime_show, inode->i_private);
+}
+
+static const struct file_operations fair_server_runtime_fops = {
+ .open = sched_fair_server_runtime_open,
+ .write = sched_fair_server_runtime_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static ssize_t
+sched_fair_server_period_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ return sched_fair_server_write(filp, ubuf, cnt, ppos, DL_PERIOD);
+}
+
+static int sched_fair_server_period_show(struct seq_file *m, void *v)
+{
+ return sched_fair_server_show(m, v, DL_PERIOD);
+}
+
+static int sched_fair_server_period_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, sched_fair_server_period_show, inode->i_private);
+}
+
+static const struct file_operations fair_server_period_fops = {
+ .open = sched_fair_server_period_open,
+ .write = sched_fair_server_period_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static struct dentry *debugfs_sched;
+
+static void debugfs_fair_server_init(void)
+{
+ struct dentry *d_fair;
+ unsigned long cpu;
+
+ d_fair = debugfs_create_dir("fair_server", debugfs_sched);
+ if (!d_fair)
+ return;
+
+ for_each_possible_cpu(cpu) {
+ struct dentry *d_cpu;
+ char buf[32];
+
+ snprintf(buf, sizeof(buf), "cpu%lu", cpu);
+ d_cpu = debugfs_create_dir(buf, d_fair);
+
+ debugfs_create_file("runtime", 0644, d_cpu, (void *) cpu, &fair_server_runtime_fops);
+ debugfs_create_file("period", 0644, d_cpu, (void *) cpu, &fair_server_period_fops);
+ }
+}
+
+static __init int sched_init_debug(void)
+{
+ struct dentry __maybe_unused *numa;
+
+ debugfs_sched = debugfs_create_dir("sched", NULL);
+
+ debugfs_create_file("features", 0644, debugfs_sched, NULL, &sched_feat_fops);
+ debugfs_create_file_unsafe("verbose", 0644, debugfs_sched, &sched_debug_verbose, &sched_verbose_fops);
+#ifdef CONFIG_PREEMPT_DYNAMIC
+ debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops);
+#endif
+
+ debugfs_create_u32("base_slice_ns", 0644, debugfs_sched, &sysctl_sched_base_slice);
+
+ debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms);
+ debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once);
+
+ debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, &sched_scaling_fops);
+ debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost);
+ debugfs_create_u32("nr_migrate", 0644, debugfs_sched, &sysctl_sched_nr_migrate);
+
+ sched_domains_mutex_lock();
+ update_sched_domain_debugfs();
+ sched_domains_mutex_unlock();
+
+#ifdef CONFIG_NUMA_BALANCING
+ numa = debugfs_create_dir("numa_balancing", debugfs_sched);
+
+ debugfs_create_u32("scan_delay_ms", 0644, numa, &sysctl_numa_balancing_scan_delay);
+ debugfs_create_u32("scan_period_min_ms", 0644, numa, &sysctl_numa_balancing_scan_period_min);
+ debugfs_create_u32("scan_period_max_ms", 0644, numa, &sysctl_numa_balancing_scan_period_max);
+ debugfs_create_u32("scan_size_mb", 0644, numa, &sysctl_numa_balancing_scan_size);
+ debugfs_create_u32("hot_threshold_ms", 0644, numa, &sysctl_numa_balancing_hot_threshold);
+#endif /* CONFIG_NUMA_BALANCING */
+
+ debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops);
+
+ debugfs_fair_server_init();
+
+ return 0;
+}
+late_initcall(sched_init_debug);
+
+static cpumask_var_t sd_sysctl_cpus;
+
+static int sd_flags_show(struct seq_file *m, void *v)
+{
+ unsigned long flags = *(unsigned int *)m->private;
+ int idx;
+
+ for_each_set_bit(idx, &flags, __SD_FLAG_CNT) {
+ seq_puts(m, sd_flag_debug[idx].name);
+ seq_puts(m, " ");
+ }
+ seq_puts(m, "\n");
+
+ return 0;
+}
+
+static int sd_flags_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, sd_flags_show, inode->i_private);
+}
+
+static const struct file_operations sd_flags_fops = {
+ .open = sd_flags_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static void register_sd(struct sched_domain *sd, struct dentry *parent)
+{
+#define SDM(type, mode, member) \
+ debugfs_create_##type(#member, mode, parent, &sd->member)
+
+ SDM(ulong, 0644, min_interval);
+ SDM(ulong, 0644, max_interval);
+ SDM(u64, 0644, max_newidle_lb_cost);
+ SDM(u32, 0644, busy_factor);
+ SDM(u32, 0644, imbalance_pct);
+ SDM(u32, 0644, cache_nice_tries);
+ SDM(str, 0444, name);
+
+#undef SDM
+
+ debugfs_create_file("flags", 0444, parent, &sd->flags, &sd_flags_fops);
+ debugfs_create_file("groups_flags", 0444, parent, &sd->groups->flags, &sd_flags_fops);
+ debugfs_create_u32("level", 0444, parent, (u32 *)&sd->level);
+
+ if (sd->flags & SD_ASYM_PACKING)
+ debugfs_create_u32("group_asym_prefer_cpu", 0444, parent,
+ (u32 *)&sd->groups->asym_prefer_cpu);
+}
+
+void update_sched_domain_debugfs(void)
+{
+ int cpu, i;
+
+ /*
+ * This can unfortunately be invoked before sched_debug_init() creates
+ * the debug directory. Don't touch sd_sysctl_cpus until then.
+ */
+ if (!debugfs_sched)
+ return;
+
+ if (!sched_debug_verbose)
+ return;
+
+ if (!cpumask_available(sd_sysctl_cpus)) {
+ if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL))
+ return;
+ cpumask_copy(sd_sysctl_cpus, cpu_possible_mask);
+ }
+
+ if (!sd_dentry) {
+ sd_dentry = debugfs_create_dir("domains", debugfs_sched);
+
+ /* rebuild sd_sysctl_cpus if empty since it gets cleared below */
+ if (cpumask_empty(sd_sysctl_cpus))
+ cpumask_copy(sd_sysctl_cpus, cpu_online_mask);
+ }
+
+ for_each_cpu(cpu, sd_sysctl_cpus) {
+ struct sched_domain *sd;
+ struct dentry *d_cpu;
+ char buf[32];
+
+ snprintf(buf, sizeof(buf), "cpu%d", cpu);
+ debugfs_lookup_and_remove(buf, sd_dentry);
+ d_cpu = debugfs_create_dir(buf, sd_dentry);
+
+ i = 0;
+ for_each_domain(cpu, sd) {
+ struct dentry *d_sd;
+
+ snprintf(buf, sizeof(buf), "domain%d", i);
+ d_sd = debugfs_create_dir(buf, d_cpu);
+
+ register_sd(sd, d_sd);
+ i++;
+ }
+
+ __cpumask_clear_cpu(cpu, sd_sysctl_cpus);
+ }
+}
+
+void dirty_sched_domain_sysctl(int cpu)
+{
+ if (cpumask_available(sd_sysctl_cpus))
+ __cpumask_set_cpu(cpu, sd_sysctl_cpus);
+}
+
#ifdef CONFIG_FAIR_GROUP_SCHED
static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
{
struct sched_entity *se = tg->se[cpu];
-#define P(F) \
- SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
-#define PN(F) \
- SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
+#define P(F) SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
+#define P_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld\n", \
+ #F, (long long)schedstat_val(stats->F))
+#define PN(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
+#define PN_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", \
+ #F, SPLIT_NS((long long)schedstat_val(stats->F)))
- if (!se) {
- struct sched_avg *avg = &cpu_rq(cpu)->avg;
- P(avg->runnable_avg_sum);
- P(avg->avg_period);
+ if (!se)
return;
- }
-
PN(se->exec_start);
PN(se->vruntime);
PN(se->sum_exec_runtime);
-#ifdef CONFIG_SCHEDSTATS
- PN(se->statistics.wait_start);
- PN(se->statistics.sleep_start);
- PN(se->statistics.block_start);
- PN(se->statistics.sleep_max);
- PN(se->statistics.block_max);
- PN(se->statistics.exec_max);
- PN(se->statistics.slice_max);
- PN(se->statistics.wait_max);
- PN(se->statistics.wait_sum);
- P(se->statistics.wait_count);
-#endif
+
+ if (schedstat_enabled()) {
+ struct sched_statistics *stats;
+ stats = __schedstats_from_se(se);
+
+ PN_SCHEDSTAT(wait_start);
+ PN_SCHEDSTAT(sleep_start);
+ PN_SCHEDSTAT(block_start);
+ PN_SCHEDSTAT(sleep_max);
+ PN_SCHEDSTAT(block_max);
+ PN_SCHEDSTAT(exec_max);
+ PN_SCHEDSTAT(slice_max);
+ PN_SCHEDSTAT(wait_max);
+ PN_SCHEDSTAT(wait_sum);
+ P_SCHEDSTAT(wait_count);
+ }
+
P(se->load.weight);
-#ifdef CONFIG_SMP
- P(se->avg.runnable_avg_sum);
- P(se->avg.running_avg_sum);
- P(se->avg.avg_period);
- P(se->avg.load_avg_contrib);
- P(se->avg.utilization_avg_contrib);
- P(se->avg.decay_count);
-#endif
+ P(se->avg.load_avg);
+ P(se->avg.util_avg);
+ P(se->avg.runnable_avg);
+
+#undef PN_SCHEDSTAT
#undef PN
+#undef P_SCHEDSTAT
#undef P
}
-#endif
+#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_CGROUP_SCHED
+static DEFINE_SPINLOCK(sched_debug_lock);
static char group_path[PATH_MAX];
-static char *task_group_path(struct task_group *tg)
+static void task_group_path(struct task_group *tg, char *path, int plen)
{
- if (autogroup_path(tg, group_path, PATH_MAX))
- return group_path;
+ if (autogroup_path(tg, path, plen))
+ return;
+
+ cgroup_path(tg->css.cgroup, path, plen);
+}
- return cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
+/*
+ * Only 1 SEQ_printf_task_group_path() caller can use the full length
+ * group_path[] for cgroup path. Other simultaneous callers will have
+ * to use a shorter stack buffer. A "..." suffix is appended at the end
+ * of the stack buffer so that it will show up in case the output length
+ * matches the given buffer size to indicate possible path name truncation.
+ */
+#define SEQ_printf_task_group_path(m, tg, fmt...) \
+{ \
+ if (spin_trylock(&sched_debug_lock)) { \
+ task_group_path(tg, group_path, sizeof(group_path)); \
+ SEQ_printf(m, fmt, group_path); \
+ spin_unlock(&sched_debug_lock); \
+ } else { \
+ char buf[128]; \
+ char *bufend = buf + sizeof(buf) - 3; \
+ task_group_path(tg, buf, bufend - buf); \
+ strcpy(bufend - 1, "..."); \
+ SEQ_printf(m, fmt, buf); \
+ } \
}
#endif
static void
print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
{
- if (rq->curr == p)
- SEQ_printf(m, "R");
+ if (task_current(rq, p))
+ SEQ_printf(m, ">R");
else
- SEQ_printf(m, " ");
+ SEQ_printf(m, " %c", task_state_to_char(p));
- SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ",
+ SEQ_printf(m, " %15s %5d %9Ld.%06ld %c %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld %5d ",
p->comm, task_pid_nr(p),
SPLIT_NS(p->se.vruntime),
+ entity_eligible(cfs_rq_of(&p->se), &p->se) ? 'E' : 'N',
+ SPLIT_NS(p->se.deadline),
+ p->se.custom_slice ? 'S' : ' ',
+ SPLIT_NS(p->se.slice),
+ SPLIT_NS(p->se.sum_exec_runtime),
(long long)(p->nvcsw + p->nivcsw),
p->prio);
-#ifdef CONFIG_SCHEDSTATS
- SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
- SPLIT_NS(p->se.vruntime),
- SPLIT_NS(p->se.sum_exec_runtime),
- SPLIT_NS(p->se.statistics.sum_sleep_runtime));
-#else
- SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
- 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
-#endif
+
+ SEQ_printf(m, "%9lld.%06ld %9lld.%06ld %9lld.%06ld",
+ SPLIT_NS(schedstat_val_or_zero(p->stats.wait_sum)),
+ SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)),
+ SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime)));
+
#ifdef CONFIG_NUMA_BALANCING
- SEQ_printf(m, " %d", task_node(p));
+ SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p));
#endif
#ifdef CONFIG_CGROUP_SCHED
- SEQ_printf(m, " %s", task_group_path(task_group(p)));
+ SEQ_printf_task_group_path(m, task_group(p), " %s")
#endif
SEQ_printf(m, "\n");
@@ -153,12 +761,28 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
{
struct task_struct *g, *p;
- SEQ_printf(m,
- "\nrunnable tasks:\n"
- " task PID tree-key switches prio"
- " exec-runtime sum-exec sum-sleep\n"
- "------------------------------------------------------"
- "----------------------------------------------------\n");
+ SEQ_printf(m, "\n");
+ SEQ_printf(m, "runnable tasks:\n");
+ SEQ_printf(m, " S task PID vruntime eligible "
+ "deadline slice sum-exec switches "
+ "prio wait-time sum-sleep sum-block"
+#ifdef CONFIG_NUMA_BALANCING
+ " node group-id"
+#endif
+#ifdef CONFIG_CGROUP_SCHED
+ " group-path"
+#endif
+ "\n");
+ SEQ_printf(m, "-------------------------------------------------------"
+ "------------------------------------------------------"
+ "------------------------------------------------------"
+#ifdef CONFIG_NUMA_BALANCING
+ "--------------"
+#endif
+#ifdef CONFIG_CGROUP_SCHED
+ "--------------"
+#endif
+ "\n");
rcu_read_lock();
for_each_process_thread(g, p) {
@@ -172,66 +796,70 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
{
- s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
- spread, rq0_min_vruntime, spread0;
+ s64 left_vruntime = -1, zero_vruntime, right_vruntime = -1, left_deadline = -1, spread;
+ struct sched_entity *last, *first, *root;
struct rq *rq = cpu_rq(cpu);
- struct sched_entity *last;
unsigned long flags;
#ifdef CONFIG_FAIR_GROUP_SCHED
- SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg));
+ SEQ_printf(m, "\n");
+ SEQ_printf_task_group_path(m, cfs_rq->tg, "cfs_rq[%d]:%s\n", cpu);
#else
- SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
+ SEQ_printf(m, "\n");
+ SEQ_printf(m, "cfs_rq[%d]:\n", cpu);
#endif
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
- SPLIT_NS(cfs_rq->exec_clock));
- raw_spin_lock_irqsave(&rq->lock, flags);
- if (cfs_rq->rb_leftmost)
- MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime;
+ raw_spin_rq_lock_irqsave(rq, flags);
+ root = __pick_root_entity(cfs_rq);
+ if (root)
+ left_vruntime = root->min_vruntime;
+ first = __pick_first_entity(cfs_rq);
+ if (first)
+ left_deadline = first->deadline;
last = __pick_last_entity(cfs_rq);
if (last)
- max_vruntime = last->vruntime;
- min_vruntime = cfs_rq->min_vruntime;
- rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime;
- raw_spin_unlock_irqrestore(&rq->lock, flags);
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime",
- SPLIT_NS(MIN_vruntime));
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime",
- SPLIT_NS(min_vruntime));
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime",
- SPLIT_NS(max_vruntime));
- spread = max_vruntime - MIN_vruntime;
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread",
- SPLIT_NS(spread));
- spread0 = min_vruntime - rq0_min_vruntime;
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0",
- SPLIT_NS(spread0));
- SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
- cfs_rq->nr_spread_over);
- SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
+ right_vruntime = last->vruntime;
+ zero_vruntime = cfs_rq->zero_vruntime;
+ raw_spin_rq_unlock_irqrestore(rq, flags);
+
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_deadline",
+ SPLIT_NS(left_deadline));
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_vruntime",
+ SPLIT_NS(left_vruntime));
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "zero_vruntime",
+ SPLIT_NS(zero_vruntime));
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "avg_vruntime",
+ SPLIT_NS(avg_vruntime(cfs_rq)));
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "right_vruntime",
+ SPLIT_NS(right_vruntime));
+ spread = right_vruntime - left_vruntime;
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread));
+ SEQ_printf(m, " .%-30s: %d\n", "nr_queued", cfs_rq->nr_queued);
+ SEQ_printf(m, " .%-30s: %d\n", "h_nr_runnable", cfs_rq->h_nr_runnable);
+ SEQ_printf(m, " .%-30s: %d\n", "h_nr_queued", cfs_rq->h_nr_queued);
+ SEQ_printf(m, " .%-30s: %d\n", "h_nr_idle", cfs_rq->h_nr_idle);
SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
-#ifdef CONFIG_SMP
- SEQ_printf(m, " .%-30s: %ld\n", "runnable_load_avg",
- cfs_rq->runnable_load_avg);
- SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg",
- cfs_rq->blocked_load_avg);
- SEQ_printf(m, " .%-30s: %ld\n", "utilization_load_avg",
- cfs_rq->utilization_load_avg);
+ SEQ_printf(m, " .%-30s: %lu\n", "load_avg",
+ cfs_rq->avg.load_avg);
+ SEQ_printf(m, " .%-30s: %lu\n", "runnable_avg",
+ cfs_rq->avg.runnable_avg);
+ SEQ_printf(m, " .%-30s: %lu\n", "util_avg",
+ cfs_rq->avg.util_avg);
+ SEQ_printf(m, " .%-30s: %u\n", "util_est",
+ cfs_rq->avg.util_est);
+ SEQ_printf(m, " .%-30s: %ld\n", "removed.load_avg",
+ cfs_rq->removed.load_avg);
+ SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg",
+ cfs_rq->removed.util_avg);
+ SEQ_printf(m, " .%-30s: %ld\n", "removed.runnable_avg",
+ cfs_rq->removed.runnable_avg);
#ifdef CONFIG_FAIR_GROUP_SCHED
- SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib",
- cfs_rq->tg_load_contrib);
- SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib",
- cfs_rq->tg_runnable_contrib);
+ SEQ_printf(m, " .%-30s: %lu\n", "tg_load_avg_contrib",
+ cfs_rq->tg_load_avg_contrib);
SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg",
atomic_long_read(&cfs_rq->tg->load_avg));
- SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg",
- atomic_read(&cfs_rq->tg->runnable_avg));
-#endif
-#endif
+#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_CFS_BANDWIDTH
- SEQ_printf(m, " .%-30s: %d\n", "tg->cfs_bandwidth.timer_active",
- cfs_rq->tg->cfs_bandwidth.timer_active);
SEQ_printf(m, " .%-30s: %d\n", "throttled",
cfs_rq->throttled);
SEQ_printf(m, " .%-30s: %d\n", "throttle_count",
@@ -246,37 +874,54 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
{
#ifdef CONFIG_RT_GROUP_SCHED
- SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg));
+ SEQ_printf(m, "\n");
+ SEQ_printf_task_group_path(m, rt_rq->tg, "rt_rq[%d]:%s\n", cpu);
#else
- SEQ_printf(m, "\nrt_rq[%d]:\n", cpu);
+ SEQ_printf(m, "\n");
+ SEQ_printf(m, "rt_rq[%d]:\n", cpu);
#endif
#define P(x) \
SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x))
+#define PU(x) \
+ SEQ_printf(m, " .%-30s: %lu\n", #x, (unsigned long)(rt_rq->x))
#define PN(x) \
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x))
- P(rt_nr_running);
+ PU(rt_nr_running);
+
+#ifdef CONFIG_RT_GROUP_SCHED
P(rt_throttled);
PN(rt_time);
PN(rt_runtime);
+#endif
#undef PN
+#undef PU
#undef P
}
void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq)
{
- SEQ_printf(m, "\ndl_rq[%d]:\n", cpu);
- SEQ_printf(m, " .%-30s: %ld\n", "dl_nr_running", dl_rq->dl_nr_running);
-}
+ struct dl_bw *dl_bw;
-extern __read_mostly int sched_clock_running;
+ SEQ_printf(m, "\n");
+ SEQ_printf(m, "dl_rq[%d]:\n", cpu);
+
+#define PU(x) \
+ SEQ_printf(m, " .%-30s: %lu\n", #x, (unsigned long)(dl_rq->x))
+
+ PU(dl_nr_running);
+ dl_bw = &cpu_rq(cpu)->rd->dl_bw;
+ SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->bw", dl_bw->bw);
+ SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->total_bw", dl_bw->total_bw);
+
+#undef PU
+}
static void print_cpu(struct seq_file *m, int cpu)
{
struct rq *rq = cpu_rq(cpu);
- unsigned long flags;
#ifdef CONFIG_X86
{
@@ -285,14 +930,14 @@ static void print_cpu(struct seq_file *m, int cpu)
SEQ_printf(m, "cpu#%d, %u.%03u MHz\n",
cpu, freq / 1000, (freq % 1000));
}
-#else
+#else /* !CONFIG_X86: */
SEQ_printf(m, "cpu#%d\n", cpu);
-#endif
+#endif /* !CONFIG_X86 */
#define P(x) \
do { \
if (sizeof(rq->x) == 4) \
- SEQ_printf(m, " .%-30s: %ld\n", #x, (long)(rq->x)); \
+ SEQ_printf(m, " .%-30s: %d\n", #x, (int)(rq->x)); \
else \
SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x));\
} while (0)
@@ -301,55 +946,41 @@ do { \
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x))
P(nr_running);
- SEQ_printf(m, " .%-30s: %lu\n", "load",
- rq->load.weight);
P(nr_switches);
- P(nr_load_updates);
P(nr_uninterruptible);
PN(next_balance);
SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr)));
PN(clock);
PN(clock_task);
- P(cpu_load[0]);
- P(cpu_load[1]);
- P(cpu_load[2]);
- P(cpu_load[3]);
- P(cpu_load[4]);
#undef P
#undef PN
-#ifdef CONFIG_SCHEDSTATS
-#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
#define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n);
-
- P(yld_count);
-
- P(sched_count);
- P(sched_goidle);
-#ifdef CONFIG_SMP
P64(avg_idle);
P64(max_idle_balance_cost);
-#endif
-
- P(ttwu_count);
- P(ttwu_local);
+#undef P64
+#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, schedstat_val(rq->n));
+ if (schedstat_enabled()) {
+ P(yld_count);
+ P(sched_count);
+ P(sched_goidle);
+ P(ttwu_count);
+ P(ttwu_local);
+ }
#undef P
-#undef P64
-#endif
- spin_lock_irqsave(&sched_debug_lock, flags);
+
print_cfs_stats(m, cpu);
print_rt_stats(m, cpu);
print_dl_stats(m, cpu);
print_rq(m, rq, cpu);
- spin_unlock_irqrestore(&sched_debug_lock, flags);
SEQ_printf(m, "\n");
}
static const char *sched_tunable_scaling_names[] = {
"none",
- "logaritmic",
+ "logarithmic",
"linear"
};
@@ -390,10 +1021,7 @@ static void sched_debug_header(struct seq_file *m)
SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x))
#define PN(x) \
SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
- PN(sysctl_sched_latency);
- PN(sysctl_sched_min_granularity);
- PN(sysctl_sched_wakeup_granularity);
- P(sysctl_sched_child_runs_first);
+ PN(sysctl_sched_base_slice);
P(sysctl_sched_features);
#undef PN
#undef P
@@ -422,17 +1050,24 @@ void sysrq_sched_debug_show(void)
int cpu;
sched_debug_header(NULL);
- for_each_online_cpu(cpu)
+ for_each_online_cpu(cpu) {
+ /*
+ * Need to reset softlockup watchdogs on all CPUs, because
+ * another CPU might be blocked waiting for us to process
+ * an IPI or stop_machine.
+ */
+ touch_nmi_watchdog();
+ touch_all_softlockup_watchdogs();
print_cpu(NULL, cpu);
-
+ }
}
/*
- * This itererator needs some explanation.
+ * This iterator needs some explanation.
* It returns 1 for the header position.
- * This means 2 is cpu 0.
- * In a hotplugged system some cpus, including cpu 0, may be missing so we have
- * to use cpumask_* to iterate over the cpus.
+ * This means 2 is CPU 0.
+ * In a hotplugged system some CPUs, including CPU 0, may be missing so we have
+ * to use cpumask_* to iterate over the CPUs.
*/
static void *sched_debug_start(struct seq_file *file, loff_t *offset)
{
@@ -452,6 +1087,7 @@ static void *sched_debug_start(struct seq_file *file, loff_t *offset)
if (n < nr_cpu_ids)
return (void *)(unsigned long)(n + 2);
+
return NULL;
}
@@ -466,114 +1102,60 @@ static void sched_debug_stop(struct seq_file *file, void *data)
}
static const struct seq_operations sched_debug_sops = {
- .start = sched_debug_start,
- .next = sched_debug_next,
- .stop = sched_debug_stop,
- .show = sched_debug_show,
+ .start = sched_debug_start,
+ .next = sched_debug_next,
+ .stop = sched_debug_stop,
+ .show = sched_debug_show,
};
-static int sched_debug_release(struct inode *inode, struct file *file)
-{
- seq_release(inode, file);
-
- return 0;
-}
-
-static int sched_debug_open(struct inode *inode, struct file *filp)
-{
- int ret = 0;
+#define __PS(S, F) SEQ_printf(m, "%-45s:%21Ld\n", S, (long long)(F))
+#define __P(F) __PS(#F, F)
+#define P(F) __PS(#F, p->F)
+#define PM(F, M) __PS(#F, p->F & (M))
+#define __PSN(S, F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", S, SPLIT_NS((long long)(F)))
+#define __PN(F) __PSN(#F, F)
+#define PN(F) __PSN(#F, p->F)
- ret = seq_open(filp, &sched_debug_sops);
- return ret;
-}
-
-static const struct file_operations sched_debug_fops = {
- .open = sched_debug_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = sched_debug_release,
-};
-
-static int __init init_sched_debug_procfs(void)
+#ifdef CONFIG_NUMA_BALANCING
+void print_numa_stats(struct seq_file *m, int node, unsigned long tsf,
+ unsigned long tpf, unsigned long gsf, unsigned long gpf)
{
- struct proc_dir_entry *pe;
-
- pe = proc_create("sched_debug", 0444, NULL, &sched_debug_fops);
- if (!pe)
- return -ENOMEM;
- return 0;
+ SEQ_printf(m, "numa_faults node=%d ", node);
+ SEQ_printf(m, "task_private=%lu task_shared=%lu ", tpf, tsf);
+ SEQ_printf(m, "group_private=%lu group_shared=%lu\n", gpf, gsf);
}
-
-__initcall(init_sched_debug_procfs);
-
-#define __P(F) \
- SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
-#define P(F) \
- SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
-#define __PN(F) \
- SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
-#define PN(F) \
- SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
+#endif
static void sched_show_numa(struct task_struct *p, struct seq_file *m)
{
#ifdef CONFIG_NUMA_BALANCING
- struct mempolicy *pol;
- int node, i;
-
if (p->mm)
P(mm->numa_scan_seq);
- task_lock(p);
- pol = p->mempolicy;
- if (pol && !(pol->flags & MPOL_F_MORON))
- pol = NULL;
- mpol_get(pol);
- task_unlock(p);
-
- SEQ_printf(m, "numa_migrations, %ld\n", xchg(&p->numa_pages_migrated, 0));
-
- for_each_online_node(node) {
- for (i = 0; i < 2; i++) {
- unsigned long nr_faults = -1;
- int cpu_current, home_node;
-
- if (p->numa_faults)
- nr_faults = p->numa_faults[2*node + i];
-
- cpu_current = !i ? (task_node(p) == node) :
- (pol && node_isset(node, pol->v.nodes));
-
- home_node = (p->numa_preferred_nid == node);
-
- SEQ_printf(m, "numa_faults_memory, %d, %d, %d, %d, %ld\n",
- i, node, cpu_current, home_node, nr_faults);
- }
- }
-
- mpol_put(pol);
-#endif
+ P(numa_pages_migrated);
+ P(numa_preferred_nid);
+ P(total_numa_faults);
+ SEQ_printf(m, "current_node=%d, numa_group_id=%d\n",
+ task_node(p), task_numa_group_id(p));
+ show_numa_stats(p, m);
+#endif /* CONFIG_NUMA_BALANCING */
}
-void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
+void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
+ struct seq_file *m)
{
unsigned long nr_switches;
- SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr(p),
+ SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns),
get_nr_threads(p));
SEQ_printf(m,
"---------------------------------------------------------"
"----------\n");
-#define __P(F) \
- SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
-#define P(F) \
- SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
-#define __PN(F) \
- SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
-#define PN(F) \
- SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
+
+#define P_SCHEDSTAT(F) __PS(#F, schedstat_val(p->stats.F))
+#define PN_SCHEDSTAT(F) __PSN(#F, schedstat_val(p->stats.F))
PN(se.exec_start);
PN(se.vruntime);
@@ -581,38 +1163,40 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
nr_switches = p->nvcsw + p->nivcsw;
-#ifdef CONFIG_SCHEDSTATS
- PN(se.statistics.wait_start);
- PN(se.statistics.sleep_start);
- PN(se.statistics.block_start);
- PN(se.statistics.sleep_max);
- PN(se.statistics.block_max);
- PN(se.statistics.exec_max);
- PN(se.statistics.slice_max);
- PN(se.statistics.wait_max);
- PN(se.statistics.wait_sum);
- P(se.statistics.wait_count);
- PN(se.statistics.iowait_sum);
- P(se.statistics.iowait_count);
P(se.nr_migrations);
- P(se.statistics.nr_migrations_cold);
- P(se.statistics.nr_failed_migrations_affine);
- P(se.statistics.nr_failed_migrations_running);
- P(se.statistics.nr_failed_migrations_hot);
- P(se.statistics.nr_forced_migrations);
- P(se.statistics.nr_wakeups);
- P(se.statistics.nr_wakeups_sync);
- P(se.statistics.nr_wakeups_migrate);
- P(se.statistics.nr_wakeups_local);
- P(se.statistics.nr_wakeups_remote);
- P(se.statistics.nr_wakeups_affine);
- P(se.statistics.nr_wakeups_affine_attempts);
- P(se.statistics.nr_wakeups_passive);
- P(se.statistics.nr_wakeups_idle);
- {
+ if (schedstat_enabled()) {
u64 avg_atom, avg_per_cpu;
+ PN_SCHEDSTAT(sum_sleep_runtime);
+ PN_SCHEDSTAT(sum_block_runtime);
+ PN_SCHEDSTAT(wait_start);
+ PN_SCHEDSTAT(sleep_start);
+ PN_SCHEDSTAT(block_start);
+ PN_SCHEDSTAT(sleep_max);
+ PN_SCHEDSTAT(block_max);
+ PN_SCHEDSTAT(exec_max);
+ PN_SCHEDSTAT(slice_max);
+ PN_SCHEDSTAT(wait_max);
+ PN_SCHEDSTAT(wait_sum);
+ P_SCHEDSTAT(wait_count);
+ PN_SCHEDSTAT(iowait_sum);
+ P_SCHEDSTAT(iowait_count);
+ P_SCHEDSTAT(nr_migrations_cold);
+ P_SCHEDSTAT(nr_failed_migrations_affine);
+ P_SCHEDSTAT(nr_failed_migrations_running);
+ P_SCHEDSTAT(nr_failed_migrations_hot);
+ P_SCHEDSTAT(nr_forced_migrations);
+ P_SCHEDSTAT(nr_wakeups);
+ P_SCHEDSTAT(nr_wakeups_sync);
+ P_SCHEDSTAT(nr_wakeups_migrate);
+ P_SCHEDSTAT(nr_wakeups_local);
+ P_SCHEDSTAT(nr_wakeups_remote);
+ P_SCHEDSTAT(nr_wakeups_affine);
+ P_SCHEDSTAT(nr_wakeups_affine_attempts);
+ P_SCHEDSTAT(nr_wakeups_passive);
+ P_SCHEDSTAT(nr_wakeups_idle);
+
avg_atom = p->se.sum_exec_runtime;
if (nr_switches)
avg_atom = div64_ul(avg_atom, nr_switches);
@@ -629,29 +1213,44 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
__PN(avg_atom);
__PN(avg_per_cpu);
- }
+
+#ifdef CONFIG_SCHED_CORE
+ PN_SCHEDSTAT(core_forceidle_sum);
#endif
+ }
+
__P(nr_switches);
- SEQ_printf(m, "%-45s:%21Ld\n",
- "nr_voluntary_switches", (long long)p->nvcsw);
- SEQ_printf(m, "%-45s:%21Ld\n",
- "nr_involuntary_switches", (long long)p->nivcsw);
+ __PS("nr_voluntary_switches", p->nvcsw);
+ __PS("nr_involuntary_switches", p->nivcsw);
P(se.load.weight);
-#ifdef CONFIG_SMP
- P(se.avg.runnable_avg_sum);
- P(se.avg.running_avg_sum);
- P(se.avg.avg_period);
- P(se.avg.load_avg_contrib);
- P(se.avg.utilization_avg_contrib);
- P(se.avg.decay_count);
-#endif
+ P(se.avg.load_sum);
+ P(se.avg.runnable_sum);
+ P(se.avg.util_sum);
+ P(se.avg.load_avg);
+ P(se.avg.runnable_avg);
+ P(se.avg.util_avg);
+ P(se.avg.last_update_time);
+ PM(se.avg.util_est, ~UTIL_AVG_UNCHANGED);
+#ifdef CONFIG_UCLAMP_TASK
+ __PS("uclamp.min", p->uclamp_req[UCLAMP_MIN].value);
+ __PS("uclamp.max", p->uclamp_req[UCLAMP_MAX].value);
+ __PS("effective uclamp.min", uclamp_eff_value(p, UCLAMP_MIN));
+ __PS("effective uclamp.max", uclamp_eff_value(p, UCLAMP_MAX));
+#endif /* CONFIG_UCLAMP_TASK */
P(policy);
P(prio);
-#undef PN
-#undef __PN
-#undef P
-#undef __P
+ if (task_has_dl_policy(p)) {
+ P(dl.runtime);
+ P(dl.deadline);
+ } else if (fair_policy(p->policy)) {
+ P(se.slice);
+ }
+#ifdef CONFIG_SCHED_CLASS_EXT
+ __PS("ext.enabled", task_on_scx(p));
+#endif
+#undef PN_SCHEDSTAT
+#undef P_SCHEDSTAT
{
unsigned int this_cpu = raw_smp_processor_id();
@@ -659,8 +1258,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
t0 = cpu_clock(this_cpu);
t1 = cpu_clock(this_cpu);
- SEQ_printf(m, "%-45s:%21Ld\n",
- "clock-delta", (long long)(t1-t0));
+ __PS("clock-delta", t1-t0);
}
sched_show_numa(p, m);
@@ -669,6 +1267,18 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
void proc_sched_set_task(struct task_struct *p)
{
#ifdef CONFIG_SCHEDSTATS
- memset(&p->se.statistics, 0, sizeof(p->se.statistics));
+ memset(&p->stats, 0, sizeof(p->stats));
#endif
}
+
+void resched_latency_warn(int cpu, u64 latency)
+{
+ static DEFINE_RATELIMIT_STATE(latency_check_ratelimit, 60 * 60 * HZ, 1);
+
+ if (likely(!__ratelimit(&latency_check_ratelimit)))
+ return;
+
+ pr_err("sched: CPU %d need_resched set for > %llu ns (%d ticks) without schedule\n",
+ cpu, latency, cpu_rq(cpu)->ticks_without_resched);
+ dump_stack();
+}
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
new file mode 100644
index 000000000000..05f5a49e9649
--- /dev/null
+++ b/kernel/sched/ext.c
@@ -0,0 +1,7310 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
+ *
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
+ * Copyright (c) 2022 David Vernet <dvernet@meta.com>
+ */
+#include <linux/btf_ids.h>
+#include "ext_idle.h"
+
+/*
+ * NOTE: sched_ext is in the process of growing multiple scheduler support and
+ * scx_root usage is in a transitional state. Naked dereferences are safe if the
+ * caller is one of the tasks attached to SCX and explicit RCU dereference is
+ * necessary otherwise. Naked scx_root dereferences trigger sparse warnings but
+ * are used as temporary markers to indicate that the dereferences need to be
+ * updated to point to the associated scheduler instances rather than scx_root.
+ */
+static struct scx_sched __rcu *scx_root;
+
+/*
+ * During exit, a task may schedule after losing its PIDs. When disabling the
+ * BPF scheduler, we need to be able to iterate tasks in every state to
+ * guarantee system safety. Maintain a dedicated task list which contains every
+ * task between its fork and eventual free.
+ */
+static DEFINE_RAW_SPINLOCK(scx_tasks_lock);
+static LIST_HEAD(scx_tasks);
+
+/* ops enable/disable */
+static DEFINE_MUTEX(scx_enable_mutex);
+DEFINE_STATIC_KEY_FALSE(__scx_enabled);
+DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem);
+static atomic_t scx_enable_state_var = ATOMIC_INIT(SCX_DISABLED);
+static int scx_bypass_depth;
+static cpumask_var_t scx_bypass_lb_donee_cpumask;
+static cpumask_var_t scx_bypass_lb_resched_cpumask;
+static bool scx_aborting;
+static bool scx_init_task_enabled;
+static bool scx_switching_all;
+DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
+
+static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0);
+static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0);
+
+/*
+ * A monotically increasing sequence number that is incremented every time a
+ * scheduler is enabled. This can be used by to check if any custom sched_ext
+ * scheduler has ever been used in the system.
+ */
+static atomic_long_t scx_enable_seq = ATOMIC_LONG_INIT(0);
+
+/*
+ * The maximum amount of time in jiffies that a task may be runnable without
+ * being scheduled on a CPU. If this timeout is exceeded, it will trigger
+ * scx_error().
+ */
+static unsigned long scx_watchdog_timeout;
+
+/*
+ * The last time the delayed work was run. This delayed work relies on
+ * ksoftirqd being able to run to service timer interrupts, so it's possible
+ * that this work itself could get wedged. To account for this, we check that
+ * it's not stalled in the timer tick, and trigger an error if it is.
+ */
+static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES;
+
+static struct delayed_work scx_watchdog_work;
+
+/*
+ * For %SCX_KICK_WAIT: Each CPU has a pointer to an array of kick_sync sequence
+ * numbers. The arrays are allocated with kvzalloc() as size can exceed percpu
+ * allocator limits on large machines. O(nr_cpu_ids^2) allocation, allocated
+ * lazily when enabling and freed when disabling to avoid waste when sched_ext
+ * isn't active.
+ */
+struct scx_kick_syncs {
+ struct rcu_head rcu;
+ unsigned long syncs[];
+};
+
+static DEFINE_PER_CPU(struct scx_kick_syncs __rcu *, scx_kick_syncs);
+
+/*
+ * Direct dispatch marker.
+ *
+ * Non-NULL values are used for direct dispatch from enqueue path. A valid
+ * pointer points to the task currently being enqueued. An ERR_PTR value is used
+ * to indicate that direct dispatch has already happened.
+ */
+static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task);
+
+static const struct rhashtable_params dsq_hash_params = {
+ .key_len = sizeof_field(struct scx_dispatch_q, id),
+ .key_offset = offsetof(struct scx_dispatch_q, id),
+ .head_offset = offsetof(struct scx_dispatch_q, hash_node),
+};
+
+static LLIST_HEAD(dsqs_to_free);
+
+/* dispatch buf */
+struct scx_dsp_buf_ent {
+ struct task_struct *task;
+ unsigned long qseq;
+ u64 dsq_id;
+ u64 enq_flags;
+};
+
+static u32 scx_dsp_max_batch;
+
+struct scx_dsp_ctx {
+ struct rq *rq;
+ u32 cursor;
+ u32 nr_tasks;
+ struct scx_dsp_buf_ent buf[];
+};
+
+static struct scx_dsp_ctx __percpu *scx_dsp_ctx;
+
+/* string formatting from BPF */
+struct scx_bstr_buf {
+ u64 data[MAX_BPRINTF_VARARGS];
+ char line[SCX_EXIT_MSG_LEN];
+};
+
+static DEFINE_RAW_SPINLOCK(scx_exit_bstr_buf_lock);
+static struct scx_bstr_buf scx_exit_bstr_buf;
+
+/* ops debug dump */
+struct scx_dump_data {
+ s32 cpu;
+ bool first;
+ s32 cursor;
+ struct seq_buf *s;
+ const char *prefix;
+ struct scx_bstr_buf buf;
+};
+
+static struct scx_dump_data scx_dump_data = {
+ .cpu = -1,
+};
+
+/* /sys/kernel/sched_ext interface */
+static struct kset *scx_kset;
+
+/*
+ * Parameters that can be adjusted through /sys/module/sched_ext/parameters.
+ * There usually is no reason to modify these as normal scheduler operation
+ * shouldn't be affected by them. The knobs are primarily for debugging.
+ */
+static u64 scx_slice_dfl = SCX_SLICE_DFL;
+static unsigned int scx_slice_bypass_us = SCX_SLICE_BYPASS / NSEC_PER_USEC;
+static unsigned int scx_bypass_lb_intv_us = SCX_BYPASS_LB_DFL_INTV_US;
+
+static int set_slice_us(const char *val, const struct kernel_param *kp)
+{
+ return param_set_uint_minmax(val, kp, 100, 100 * USEC_PER_MSEC);
+}
+
+static const struct kernel_param_ops slice_us_param_ops = {
+ .set = set_slice_us,
+ .get = param_get_uint,
+};
+
+static int set_bypass_lb_intv_us(const char *val, const struct kernel_param *kp)
+{
+ return param_set_uint_minmax(val, kp, 0, 10 * USEC_PER_SEC);
+}
+
+static const struct kernel_param_ops bypass_lb_intv_us_param_ops = {
+ .set = set_bypass_lb_intv_us,
+ .get = param_get_uint,
+};
+
+#undef MODULE_PARAM_PREFIX
+#define MODULE_PARAM_PREFIX "sched_ext."
+
+module_param_cb(slice_bypass_us, &slice_us_param_ops, &scx_slice_bypass_us, 0600);
+MODULE_PARM_DESC(slice_bypass_us, "bypass slice in microseconds, applied on [un]load (100us to 100ms)");
+module_param_cb(bypass_lb_intv_us, &bypass_lb_intv_us_param_ops, &scx_bypass_lb_intv_us, 0600);
+MODULE_PARM_DESC(bypass_lb_intv_us, "bypass load balance interval in microseconds (0 (disable) to 10s)");
+
+#undef MODULE_PARAM_PREFIX
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/sched_ext.h>
+
+static void process_ddsp_deferred_locals(struct rq *rq);
+static u32 reenq_local(struct rq *rq);
+static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags);
+static bool scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind,
+ s64 exit_code, const char *fmt, va_list args);
+
+static __printf(4, 5) bool scx_exit(struct scx_sched *sch,
+ enum scx_exit_kind kind, s64 exit_code,
+ const char *fmt, ...)
+{
+ va_list args;
+ bool ret;
+
+ va_start(args, fmt);
+ ret = scx_vexit(sch, kind, exit_code, fmt, args);
+ va_end(args);
+
+ return ret;
+}
+
+#define scx_error(sch, fmt, args...) scx_exit((sch), SCX_EXIT_ERROR, 0, fmt, ##args)
+#define scx_verror(sch, fmt, args) scx_vexit((sch), SCX_EXIT_ERROR, 0, fmt, args)
+
+#define SCX_HAS_OP(sch, op) test_bit(SCX_OP_IDX(op), (sch)->has_op)
+
+static long jiffies_delta_msecs(unsigned long at, unsigned long now)
+{
+ if (time_after(at, now))
+ return jiffies_to_msecs(at - now);
+ else
+ return -(long)jiffies_to_msecs(now - at);
+}
+
+/* if the highest set bit is N, return a mask with bits [N+1, 31] set */
+static u32 higher_bits(u32 flags)
+{
+ return ~((1 << fls(flags)) - 1);
+}
+
+/* return the mask with only the highest bit set */
+static u32 highest_bit(u32 flags)
+{
+ int bit = fls(flags);
+ return ((u64)1 << bit) >> 1;
+}
+
+static bool u32_before(u32 a, u32 b)
+{
+ return (s32)(a - b) < 0;
+}
+
+static struct scx_dispatch_q *find_global_dsq(struct scx_sched *sch,
+ struct task_struct *p)
+{
+ return sch->global_dsqs[cpu_to_node(task_cpu(p))];
+}
+
+static struct scx_dispatch_q *find_user_dsq(struct scx_sched *sch, u64 dsq_id)
+{
+ return rhashtable_lookup(&sch->dsq_hash, &dsq_id, dsq_hash_params);
+}
+
+static const struct sched_class *scx_setscheduler_class(struct task_struct *p)
+{
+ if (p->sched_class == &stop_sched_class)
+ return &stop_sched_class;
+
+ return __setscheduler_class(p->policy, p->prio);
+}
+
+/*
+ * scx_kf_mask enforcement. Some kfuncs can only be called from specific SCX
+ * ops. When invoking SCX ops, SCX_CALL_OP[_RET]() should be used to indicate
+ * the allowed kfuncs and those kfuncs should use scx_kf_allowed() to check
+ * whether it's running from an allowed context.
+ *
+ * @mask is constant, always inline to cull the mask calculations.
+ */
+static __always_inline void scx_kf_allow(u32 mask)
+{
+ /* nesting is allowed only in increasing scx_kf_mask order */
+ WARN_ONCE((mask | higher_bits(mask)) & current->scx.kf_mask,
+ "invalid nesting current->scx.kf_mask=0x%x mask=0x%x\n",
+ current->scx.kf_mask, mask);
+ current->scx.kf_mask |= mask;
+ barrier();
+}
+
+static void scx_kf_disallow(u32 mask)
+{
+ barrier();
+ current->scx.kf_mask &= ~mask;
+}
+
+/*
+ * Track the rq currently locked.
+ *
+ * This allows kfuncs to safely operate on rq from any scx ops callback,
+ * knowing which rq is already locked.
+ */
+DEFINE_PER_CPU(struct rq *, scx_locked_rq_state);
+
+static inline void update_locked_rq(struct rq *rq)
+{
+ /*
+ * Check whether @rq is actually locked. This can help expose bugs
+ * or incorrect assumptions about the context in which a kfunc or
+ * callback is executed.
+ */
+ if (rq)
+ lockdep_assert_rq_held(rq);
+ __this_cpu_write(scx_locked_rq_state, rq);
+}
+
+#define SCX_CALL_OP(sch, mask, op, rq, args...) \
+do { \
+ if (rq) \
+ update_locked_rq(rq); \
+ if (mask) { \
+ scx_kf_allow(mask); \
+ (sch)->ops.op(args); \
+ scx_kf_disallow(mask); \
+ } else { \
+ (sch)->ops.op(args); \
+ } \
+ if (rq) \
+ update_locked_rq(NULL); \
+} while (0)
+
+#define SCX_CALL_OP_RET(sch, mask, op, rq, args...) \
+({ \
+ __typeof__((sch)->ops.op(args)) __ret; \
+ \
+ if (rq) \
+ update_locked_rq(rq); \
+ if (mask) { \
+ scx_kf_allow(mask); \
+ __ret = (sch)->ops.op(args); \
+ scx_kf_disallow(mask); \
+ } else { \
+ __ret = (sch)->ops.op(args); \
+ } \
+ if (rq) \
+ update_locked_rq(NULL); \
+ __ret; \
+})
+
+/*
+ * Some kfuncs are allowed only on the tasks that are subjects of the
+ * in-progress scx_ops operation for, e.g., locking guarantees. To enforce such
+ * restrictions, the following SCX_CALL_OP_*() variants should be used when
+ * invoking scx_ops operations that take task arguments. These can only be used
+ * for non-nesting operations due to the way the tasks are tracked.
+ *
+ * kfuncs which can only operate on such tasks can in turn use
+ * scx_kf_allowed_on_arg_tasks() to test whether the invocation is allowed on
+ * the specific task.
+ */
+#define SCX_CALL_OP_TASK(sch, mask, op, rq, task, args...) \
+do { \
+ BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \
+ current->scx.kf_tasks[0] = task; \
+ SCX_CALL_OP((sch), mask, op, rq, task, ##args); \
+ current->scx.kf_tasks[0] = NULL; \
+} while (0)
+
+#define SCX_CALL_OP_TASK_RET(sch, mask, op, rq, task, args...) \
+({ \
+ __typeof__((sch)->ops.op(task, ##args)) __ret; \
+ BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \
+ current->scx.kf_tasks[0] = task; \
+ __ret = SCX_CALL_OP_RET((sch), mask, op, rq, task, ##args); \
+ current->scx.kf_tasks[0] = NULL; \
+ __ret; \
+})
+
+#define SCX_CALL_OP_2TASKS_RET(sch, mask, op, rq, task0, task1, args...) \
+({ \
+ __typeof__((sch)->ops.op(task0, task1, ##args)) __ret; \
+ BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \
+ current->scx.kf_tasks[0] = task0; \
+ current->scx.kf_tasks[1] = task1; \
+ __ret = SCX_CALL_OP_RET((sch), mask, op, rq, task0, task1, ##args); \
+ current->scx.kf_tasks[0] = NULL; \
+ current->scx.kf_tasks[1] = NULL; \
+ __ret; \
+})
+
+/* @mask is constant, always inline to cull unnecessary branches */
+static __always_inline bool scx_kf_allowed(struct scx_sched *sch, u32 mask)
+{
+ if (unlikely(!(current->scx.kf_mask & mask))) {
+ scx_error(sch, "kfunc with mask 0x%x called from an operation only allowing 0x%x",
+ mask, current->scx.kf_mask);
+ return false;
+ }
+
+ /*
+ * Enforce nesting boundaries. e.g. A kfunc which can be called from
+ * DISPATCH must not be called if we're running DEQUEUE which is nested
+ * inside ops.dispatch(). We don't need to check boundaries for any
+ * blocking kfuncs as the verifier ensures they're only called from
+ * sleepable progs.
+ */
+ if (unlikely(highest_bit(mask) == SCX_KF_CPU_RELEASE &&
+ (current->scx.kf_mask & higher_bits(SCX_KF_CPU_RELEASE)))) {
+ scx_error(sch, "cpu_release kfunc called from a nested operation");
+ return false;
+ }
+
+ if (unlikely(highest_bit(mask) == SCX_KF_DISPATCH &&
+ (current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) {
+ scx_error(sch, "dispatch kfunc called from a nested operation");
+ return false;
+ }
+
+ return true;
+}
+
+/* see SCX_CALL_OP_TASK() */
+static __always_inline bool scx_kf_allowed_on_arg_tasks(struct scx_sched *sch,
+ u32 mask,
+ struct task_struct *p)
+{
+ if (!scx_kf_allowed(sch, mask))
+ return false;
+
+ if (unlikely((p != current->scx.kf_tasks[0] &&
+ p != current->scx.kf_tasks[1]))) {
+ scx_error(sch, "called on a task not being operated on");
+ return false;
+ }
+
+ return true;
+}
+
+/**
+ * nldsq_next_task - Iterate to the next task in a non-local DSQ
+ * @dsq: user dsq being iterated
+ * @cur: current position, %NULL to start iteration
+ * @rev: walk backwards
+ *
+ * Returns %NULL when iteration is finished.
+ */
+static struct task_struct *nldsq_next_task(struct scx_dispatch_q *dsq,
+ struct task_struct *cur, bool rev)
+{
+ struct list_head *list_node;
+ struct scx_dsq_list_node *dsq_lnode;
+
+ lockdep_assert_held(&dsq->lock);
+
+ if (cur)
+ list_node = &cur->scx.dsq_list.node;
+ else
+ list_node = &dsq->list;
+
+ /* find the next task, need to skip BPF iteration cursors */
+ do {
+ if (rev)
+ list_node = list_node->prev;
+ else
+ list_node = list_node->next;
+
+ if (list_node == &dsq->list)
+ return NULL;
+
+ dsq_lnode = container_of(list_node, struct scx_dsq_list_node,
+ node);
+ } while (dsq_lnode->flags & SCX_DSQ_LNODE_ITER_CURSOR);
+
+ return container_of(dsq_lnode, struct task_struct, scx.dsq_list);
+}
+
+#define nldsq_for_each_task(p, dsq) \
+ for ((p) = nldsq_next_task((dsq), NULL, false); (p); \
+ (p) = nldsq_next_task((dsq), (p), false))
+
+
+/*
+ * BPF DSQ iterator. Tasks in a non-local DSQ can be iterated in [reverse]
+ * dispatch order. BPF-visible iterator is opaque and larger to allow future
+ * changes without breaking backward compatibility. Can be used with
+ * bpf_for_each(). See bpf_iter_scx_dsq_*().
+ */
+enum scx_dsq_iter_flags {
+ /* iterate in the reverse dispatch order */
+ SCX_DSQ_ITER_REV = 1U << 16,
+
+ __SCX_DSQ_ITER_HAS_SLICE = 1U << 30,
+ __SCX_DSQ_ITER_HAS_VTIME = 1U << 31,
+
+ __SCX_DSQ_ITER_USER_FLAGS = SCX_DSQ_ITER_REV,
+ __SCX_DSQ_ITER_ALL_FLAGS = __SCX_DSQ_ITER_USER_FLAGS |
+ __SCX_DSQ_ITER_HAS_SLICE |
+ __SCX_DSQ_ITER_HAS_VTIME,
+};
+
+struct bpf_iter_scx_dsq_kern {
+ struct scx_dsq_list_node cursor;
+ struct scx_dispatch_q *dsq;
+ u64 slice;
+ u64 vtime;
+} __attribute__((aligned(8)));
+
+struct bpf_iter_scx_dsq {
+ u64 __opaque[6];
+} __attribute__((aligned(8)));
+
+
+/*
+ * SCX task iterator.
+ */
+struct scx_task_iter {
+ struct sched_ext_entity cursor;
+ struct task_struct *locked_task;
+ struct rq *rq;
+ struct rq_flags rf;
+ u32 cnt;
+ bool list_locked;
+};
+
+/**
+ * scx_task_iter_start - Lock scx_tasks_lock and start a task iteration
+ * @iter: iterator to init
+ *
+ * Initialize @iter and return with scx_tasks_lock held. Once initialized, @iter
+ * must eventually be stopped with scx_task_iter_stop().
+ *
+ * scx_tasks_lock and the rq lock may be released using scx_task_iter_unlock()
+ * between this and the first next() call or between any two next() calls. If
+ * the locks are released between two next() calls, the caller is responsible
+ * for ensuring that the task being iterated remains accessible either through
+ * RCU read lock or obtaining a reference count.
+ *
+ * All tasks which existed when the iteration started are guaranteed to be
+ * visited as long as they are not dead.
+ */
+static void scx_task_iter_start(struct scx_task_iter *iter)
+{
+ memset(iter, 0, sizeof(*iter));
+
+ raw_spin_lock_irq(&scx_tasks_lock);
+
+ iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR };
+ list_add(&iter->cursor.tasks_node, &scx_tasks);
+ iter->list_locked = true;
+}
+
+static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter)
+{
+ if (iter->locked_task) {
+ task_rq_unlock(iter->rq, iter->locked_task, &iter->rf);
+ iter->locked_task = NULL;
+ }
+}
+
+/**
+ * scx_task_iter_unlock - Unlock rq and scx_tasks_lock held by a task iterator
+ * @iter: iterator to unlock
+ *
+ * If @iter is in the middle of a locked iteration, it may be locking the rq of
+ * the task currently being visited in addition to scx_tasks_lock. Unlock both.
+ * This function can be safely called anytime during an iteration. The next
+ * iterator operation will automatically restore the necessary locking.
+ */
+static void scx_task_iter_unlock(struct scx_task_iter *iter)
+{
+ __scx_task_iter_rq_unlock(iter);
+ if (iter->list_locked) {
+ iter->list_locked = false;
+ raw_spin_unlock_irq(&scx_tasks_lock);
+ }
+}
+
+static void __scx_task_iter_maybe_relock(struct scx_task_iter *iter)
+{
+ if (!iter->list_locked) {
+ raw_spin_lock_irq(&scx_tasks_lock);
+ iter->list_locked = true;
+ }
+}
+
+/**
+ * scx_task_iter_stop - Stop a task iteration and unlock scx_tasks_lock
+ * @iter: iterator to exit
+ *
+ * Exit a previously initialized @iter. Must be called with scx_tasks_lock held
+ * which is released on return. If the iterator holds a task's rq lock, that rq
+ * lock is also released. See scx_task_iter_start() for details.
+ */
+static void scx_task_iter_stop(struct scx_task_iter *iter)
+{
+ __scx_task_iter_maybe_relock(iter);
+ list_del_init(&iter->cursor.tasks_node);
+ scx_task_iter_unlock(iter);
+}
+
+/**
+ * scx_task_iter_next - Next task
+ * @iter: iterator to walk
+ *
+ * Visit the next task. See scx_task_iter_start() for details. Locks are dropped
+ * and re-acquired every %SCX_TASK_ITER_BATCH iterations to avoid causing stalls
+ * by holding scx_tasks_lock for too long.
+ */
+static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter)
+{
+ struct list_head *cursor = &iter->cursor.tasks_node;
+ struct sched_ext_entity *pos;
+
+ if (!(++iter->cnt % SCX_TASK_ITER_BATCH)) {
+ scx_task_iter_unlock(iter);
+ cond_resched();
+ }
+
+ __scx_task_iter_maybe_relock(iter);
+
+ list_for_each_entry(pos, cursor, tasks_node) {
+ if (&pos->tasks_node == &scx_tasks)
+ return NULL;
+ if (!(pos->flags & SCX_TASK_CURSOR)) {
+ list_move(cursor, &pos->tasks_node);
+ return container_of(pos, struct task_struct, scx);
+ }
+ }
+
+ /* can't happen, should always terminate at scx_tasks above */
+ BUG();
+}
+
+/**
+ * scx_task_iter_next_locked - Next non-idle task with its rq locked
+ * @iter: iterator to walk
+ *
+ * Visit the non-idle task with its rq lock held. Allows callers to specify
+ * whether they would like to filter out dead tasks. See scx_task_iter_start()
+ * for details.
+ */
+static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
+{
+ struct task_struct *p;
+
+ __scx_task_iter_rq_unlock(iter);
+
+ while ((p = scx_task_iter_next(iter))) {
+ /*
+ * scx_task_iter is used to prepare and move tasks into SCX
+ * while loading the BPF scheduler and vice-versa while
+ * unloading. The init_tasks ("swappers") should be excluded
+ * from the iteration because:
+ *
+ * - It's unsafe to use __setschduler_prio() on an init_task to
+ * determine the sched_class to use as it won't preserve its
+ * idle_sched_class.
+ *
+ * - ops.init/exit_task() can easily be confused if called with
+ * init_tasks as they, e.g., share PID 0.
+ *
+ * As init_tasks are never scheduled through SCX, they can be
+ * skipped safely. Note that is_idle_task() which tests %PF_IDLE
+ * doesn't work here:
+ *
+ * - %PF_IDLE may not be set for an init_task whose CPU hasn't
+ * yet been onlined.
+ *
+ * - %PF_IDLE can be set on tasks that are not init_tasks. See
+ * play_idle_precise() used by CONFIG_IDLE_INJECT.
+ *
+ * Test for idle_sched_class as only init_tasks are on it.
+ */
+ if (p->sched_class != &idle_sched_class)
+ break;
+ }
+ if (!p)
+ return NULL;
+
+ iter->rq = task_rq_lock(p, &iter->rf);
+ iter->locked_task = p;
+
+ return p;
+}
+
+/**
+ * scx_add_event - Increase an event counter for 'name' by 'cnt'
+ * @sch: scx_sched to account events for
+ * @name: an event name defined in struct scx_event_stats
+ * @cnt: the number of the event occurred
+ *
+ * This can be used when preemption is not disabled.
+ */
+#define scx_add_event(sch, name, cnt) do { \
+ this_cpu_add((sch)->pcpu->event_stats.name, (cnt)); \
+ trace_sched_ext_event(#name, (cnt)); \
+} while(0)
+
+/**
+ * __scx_add_event - Increase an event counter for 'name' by 'cnt'
+ * @sch: scx_sched to account events for
+ * @name: an event name defined in struct scx_event_stats
+ * @cnt: the number of the event occurred
+ *
+ * This should be used only when preemption is disabled.
+ */
+#define __scx_add_event(sch, name, cnt) do { \
+ __this_cpu_add((sch)->pcpu->event_stats.name, (cnt)); \
+ trace_sched_ext_event(#name, cnt); \
+} while(0)
+
+/**
+ * scx_agg_event - Aggregate an event counter 'kind' from 'src_e' to 'dst_e'
+ * @dst_e: destination event stats
+ * @src_e: source event stats
+ * @kind: a kind of event to be aggregated
+ */
+#define scx_agg_event(dst_e, src_e, kind) do { \
+ (dst_e)->kind += READ_ONCE((src_e)->kind); \
+} while(0)
+
+/**
+ * scx_dump_event - Dump an event 'kind' in 'events' to 's'
+ * @s: output seq_buf
+ * @events: event stats
+ * @kind: a kind of event to dump
+ */
+#define scx_dump_event(s, events, kind) do { \
+ dump_line(&(s), "%40s: %16lld", #kind, (events)->kind); \
+} while (0)
+
+
+static void scx_read_events(struct scx_sched *sch,
+ struct scx_event_stats *events);
+
+static enum scx_enable_state scx_enable_state(void)
+{
+ return atomic_read(&scx_enable_state_var);
+}
+
+static enum scx_enable_state scx_set_enable_state(enum scx_enable_state to)
+{
+ return atomic_xchg(&scx_enable_state_var, to);
+}
+
+static bool scx_tryset_enable_state(enum scx_enable_state to,
+ enum scx_enable_state from)
+{
+ int from_v = from;
+
+ return atomic_try_cmpxchg(&scx_enable_state_var, &from_v, to);
+}
+
+/**
+ * wait_ops_state - Busy-wait the specified ops state to end
+ * @p: target task
+ * @opss: state to wait the end of
+ *
+ * Busy-wait for @p to transition out of @opss. This can only be used when the
+ * state part of @opss is %SCX_QUEUEING or %SCX_DISPATCHING. This function also
+ * has load_acquire semantics to ensure that the caller can see the updates made
+ * in the enqueueing and dispatching paths.
+ */
+static void wait_ops_state(struct task_struct *p, unsigned long opss)
+{
+ do {
+ cpu_relax();
+ } while (atomic_long_read_acquire(&p->scx.ops_state) == opss);
+}
+
+static inline bool __cpu_valid(s32 cpu)
+{
+ return likely(cpu >= 0 && cpu < nr_cpu_ids && cpu_possible(cpu));
+}
+
+/**
+ * ops_cpu_valid - Verify a cpu number, to be used on ops input args
+ * @sch: scx_sched to abort on error
+ * @cpu: cpu number which came from a BPF ops
+ * @where: extra information reported on error
+ *
+ * @cpu is a cpu number which came from the BPF scheduler and can be any value.
+ * Verify that it is in range and one of the possible cpus. If invalid, trigger
+ * an ops error.
+ */
+static bool ops_cpu_valid(struct scx_sched *sch, s32 cpu, const char *where)
+{
+ if (__cpu_valid(cpu)) {
+ return true;
+ } else {
+ scx_error(sch, "invalid CPU %d%s%s", cpu, where ? " " : "", where ?: "");
+ return false;
+ }
+}
+
+/**
+ * ops_sanitize_err - Sanitize a -errno value
+ * @sch: scx_sched to error out on error
+ * @ops_name: operation to blame on failure
+ * @err: -errno value to sanitize
+ *
+ * Verify @err is a valid -errno. If not, trigger scx_error() and return
+ * -%EPROTO. This is necessary because returning a rogue -errno up the chain can
+ * cause misbehaviors. For an example, a large negative return from
+ * ops.init_task() triggers an oops when passed up the call chain because the
+ * value fails IS_ERR() test after being encoded with ERR_PTR() and then is
+ * handled as a pointer.
+ */
+static int ops_sanitize_err(struct scx_sched *sch, const char *ops_name, s32 err)
+{
+ if (err < 0 && err >= -MAX_ERRNO)
+ return err;
+
+ scx_error(sch, "ops.%s() returned an invalid errno %d", ops_name, err);
+ return -EPROTO;
+}
+
+static void run_deferred(struct rq *rq)
+{
+ process_ddsp_deferred_locals(rq);
+
+ if (local_read(&rq->scx.reenq_local_deferred)) {
+ local_set(&rq->scx.reenq_local_deferred, 0);
+ reenq_local(rq);
+ }
+}
+
+static void deferred_bal_cb_workfn(struct rq *rq)
+{
+ run_deferred(rq);
+}
+
+static void deferred_irq_workfn(struct irq_work *irq_work)
+{
+ struct rq *rq = container_of(irq_work, struct rq, scx.deferred_irq_work);
+
+ raw_spin_rq_lock(rq);
+ run_deferred(rq);
+ raw_spin_rq_unlock(rq);
+}
+
+/**
+ * schedule_deferred - Schedule execution of deferred actions on an rq
+ * @rq: target rq
+ *
+ * Schedule execution of deferred actions on @rq. Deferred actions are executed
+ * with @rq locked but unpinned, and thus can unlock @rq to e.g. migrate tasks
+ * to other rqs.
+ */
+static void schedule_deferred(struct rq *rq)
+{
+ /*
+ * Queue an irq work. They are executed on IRQ re-enable which may take
+ * a bit longer than the scheduler hook in schedule_deferred_locked().
+ */
+ irq_work_queue(&rq->scx.deferred_irq_work);
+}
+
+/**
+ * schedule_deferred_locked - Schedule execution of deferred actions on an rq
+ * @rq: target rq
+ *
+ * Schedule execution of deferred actions on @rq. Equivalent to
+ * schedule_deferred() but requires @rq to be locked and can be more efficient.
+ */
+static void schedule_deferred_locked(struct rq *rq)
+{
+ lockdep_assert_rq_held(rq);
+
+ /*
+ * If in the middle of waking up a task, task_woken_scx() will be called
+ * afterwards which will then run the deferred actions, no need to
+ * schedule anything.
+ */
+ if (rq->scx.flags & SCX_RQ_IN_WAKEUP)
+ return;
+
+ /* Don't do anything if there already is a deferred operation. */
+ if (rq->scx.flags & SCX_RQ_BAL_CB_PENDING)
+ return;
+
+ /*
+ * If in balance, the balance callbacks will be called before rq lock is
+ * released. Schedule one.
+ *
+ *
+ * We can't directly insert the callback into the
+ * rq's list: The call can drop its lock and make the pending balance
+ * callback visible to unrelated code paths that call rq_pin_lock().
+ *
+ * Just let balance_one() know that it must do it itself.
+ */
+ if (rq->scx.flags & SCX_RQ_IN_BALANCE) {
+ rq->scx.flags |= SCX_RQ_BAL_CB_PENDING;
+ return;
+ }
+
+ /*
+ * No scheduler hooks available. Use the generic irq_work path. The
+ * above WAKEUP and BALANCE paths should cover most of the cases and the
+ * time to IRQ re-enable shouldn't be long.
+ */
+ schedule_deferred(rq);
+}
+
+/**
+ * touch_core_sched - Update timestamp used for core-sched task ordering
+ * @rq: rq to read clock from, must be locked
+ * @p: task to update the timestamp for
+ *
+ * Update @p->scx.core_sched_at timestamp. This is used by scx_prio_less() to
+ * implement global or local-DSQ FIFO ordering for core-sched. Should be called
+ * when a task becomes runnable and its turn on the CPU ends (e.g. slice
+ * exhaustion).
+ */
+static void touch_core_sched(struct rq *rq, struct task_struct *p)
+{
+ lockdep_assert_rq_held(rq);
+
+#ifdef CONFIG_SCHED_CORE
+ /*
+ * It's okay to update the timestamp spuriously. Use
+ * sched_core_disabled() which is cheaper than enabled().
+ *
+ * As this is used to determine ordering between tasks of sibling CPUs,
+ * it may be better to use per-core dispatch sequence instead.
+ */
+ if (!sched_core_disabled())
+ p->scx.core_sched_at = sched_clock_cpu(cpu_of(rq));
+#endif
+}
+
+/**
+ * touch_core_sched_dispatch - Update core-sched timestamp on dispatch
+ * @rq: rq to read clock from, must be locked
+ * @p: task being dispatched
+ *
+ * If the BPF scheduler implements custom core-sched ordering via
+ * ops.core_sched_before(), @p->scx.core_sched_at is used to implement FIFO
+ * ordering within each local DSQ. This function is called from dispatch paths
+ * and updates @p->scx.core_sched_at if custom core-sched ordering is in effect.
+ */
+static void touch_core_sched_dispatch(struct rq *rq, struct task_struct *p)
+{
+ lockdep_assert_rq_held(rq);
+
+#ifdef CONFIG_SCHED_CORE
+ if (unlikely(SCX_HAS_OP(scx_root, core_sched_before)))
+ touch_core_sched(rq, p);
+#endif
+}
+
+static void update_curr_scx(struct rq *rq)
+{
+ struct task_struct *curr = rq->curr;
+ s64 delta_exec;
+
+ delta_exec = update_curr_common(rq);
+ if (unlikely(delta_exec <= 0))
+ return;
+
+ if (curr->scx.slice != SCX_SLICE_INF) {
+ curr->scx.slice -= min_t(u64, curr->scx.slice, delta_exec);
+ if (!curr->scx.slice)
+ touch_core_sched(rq, curr);
+ }
+}
+
+static bool scx_dsq_priq_less(struct rb_node *node_a,
+ const struct rb_node *node_b)
+{
+ const struct task_struct *a =
+ container_of(node_a, struct task_struct, scx.dsq_priq);
+ const struct task_struct *b =
+ container_of(node_b, struct task_struct, scx.dsq_priq);
+
+ return time_before64(a->scx.dsq_vtime, b->scx.dsq_vtime);
+}
+
+static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta)
+{
+ /* scx_bpf_dsq_nr_queued() reads ->nr without locking, use WRITE_ONCE() */
+ WRITE_ONCE(dsq->nr, dsq->nr + delta);
+}
+
+static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p)
+{
+ p->scx.slice = READ_ONCE(scx_slice_dfl);
+ __scx_add_event(sch, SCX_EV_REFILL_SLICE_DFL, 1);
+}
+
+static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
+ struct task_struct *p, u64 enq_flags)
+{
+ bool is_local = dsq->id == SCX_DSQ_LOCAL;
+
+ WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_list.node));
+ WARN_ON_ONCE((p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) ||
+ !RB_EMPTY_NODE(&p->scx.dsq_priq));
+
+ if (!is_local) {
+ raw_spin_lock_nested(&dsq->lock,
+ (enq_flags & SCX_ENQ_NESTED) ? SINGLE_DEPTH_NESTING : 0);
+
+ if (unlikely(dsq->id == SCX_DSQ_INVALID)) {
+ scx_error(sch, "attempting to dispatch to a destroyed dsq");
+ /* fall back to the global dsq */
+ raw_spin_unlock(&dsq->lock);
+ dsq = find_global_dsq(sch, p);
+ raw_spin_lock(&dsq->lock);
+ }
+ }
+
+ if (unlikely((dsq->id & SCX_DSQ_FLAG_BUILTIN) &&
+ (enq_flags & SCX_ENQ_DSQ_PRIQ))) {
+ /*
+ * SCX_DSQ_LOCAL and SCX_DSQ_GLOBAL DSQs always consume from
+ * their FIFO queues. To avoid confusion and accidentally
+ * starving vtime-dispatched tasks by FIFO-dispatched tasks, we
+ * disallow any internal DSQ from doing vtime ordering of
+ * tasks.
+ */
+ scx_error(sch, "cannot use vtime ordering for built-in DSQs");
+ enq_flags &= ~SCX_ENQ_DSQ_PRIQ;
+ }
+
+ if (enq_flags & SCX_ENQ_DSQ_PRIQ) {
+ struct rb_node *rbp;
+
+ /*
+ * A PRIQ DSQ shouldn't be using FIFO enqueueing. As tasks are
+ * linked to both the rbtree and list on PRIQs, this can only be
+ * tested easily when adding the first task.
+ */
+ if (unlikely(RB_EMPTY_ROOT(&dsq->priq) &&
+ nldsq_next_task(dsq, NULL, false)))
+ scx_error(sch, "DSQ ID 0x%016llx already had FIFO-enqueued tasks",
+ dsq->id);
+
+ p->scx.dsq_flags |= SCX_TASK_DSQ_ON_PRIQ;
+ rb_add(&p->scx.dsq_priq, &dsq->priq, scx_dsq_priq_less);
+
+ /*
+ * Find the previous task and insert after it on the list so
+ * that @dsq->list is vtime ordered.
+ */
+ rbp = rb_prev(&p->scx.dsq_priq);
+ if (rbp) {
+ struct task_struct *prev =
+ container_of(rbp, struct task_struct,
+ scx.dsq_priq);
+ list_add(&p->scx.dsq_list.node, &prev->scx.dsq_list.node);
+ /* first task unchanged - no update needed */
+ } else {
+ list_add(&p->scx.dsq_list.node, &dsq->list);
+ /* not builtin and new task is at head - use fastpath */
+ rcu_assign_pointer(dsq->first_task, p);
+ }
+ } else {
+ /* a FIFO DSQ shouldn't be using PRIQ enqueuing */
+ if (unlikely(!RB_EMPTY_ROOT(&dsq->priq)))
+ scx_error(sch, "DSQ ID 0x%016llx already had PRIQ-enqueued tasks",
+ dsq->id);
+
+ if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) {
+ list_add(&p->scx.dsq_list.node, &dsq->list);
+ /* new task inserted at head - use fastpath */
+ if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN))
+ rcu_assign_pointer(dsq->first_task, p);
+ } else {
+ bool was_empty;
+
+ was_empty = list_empty(&dsq->list);
+ list_add_tail(&p->scx.dsq_list.node, &dsq->list);
+ if (was_empty && !(dsq->id & SCX_DSQ_FLAG_BUILTIN))
+ rcu_assign_pointer(dsq->first_task, p);
+ }
+ }
+
+ /* seq records the order tasks are queued, used by BPF DSQ iterator */
+ dsq->seq++;
+ p->scx.dsq_seq = dsq->seq;
+
+ dsq_mod_nr(dsq, 1);
+ p->scx.dsq = dsq;
+
+ /*
+ * scx.ddsp_dsq_id and scx.ddsp_enq_flags are only relevant on the
+ * direct dispatch path, but we clear them here because the direct
+ * dispatch verdict may be overridden on the enqueue path during e.g.
+ * bypass.
+ */
+ p->scx.ddsp_dsq_id = SCX_DSQ_INVALID;
+ p->scx.ddsp_enq_flags = 0;
+
+ /*
+ * We're transitioning out of QUEUEING or DISPATCHING. store_release to
+ * match waiters' load_acquire.
+ */
+ if (enq_flags & SCX_ENQ_CLEAR_OPSS)
+ atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
+
+ if (is_local) {
+ struct rq *rq = container_of(dsq, struct rq, scx.local_dsq);
+ bool preempt = false;
+
+ if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr &&
+ rq->curr->sched_class == &ext_sched_class) {
+ rq->curr->scx.slice = 0;
+ preempt = true;
+ }
+
+ if (preempt || sched_class_above(&ext_sched_class,
+ rq->curr->sched_class))
+ resched_curr(rq);
+ } else {
+ raw_spin_unlock(&dsq->lock);
+ }
+}
+
+static void task_unlink_from_dsq(struct task_struct *p,
+ struct scx_dispatch_q *dsq)
+{
+ WARN_ON_ONCE(list_empty(&p->scx.dsq_list.node));
+
+ if (p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) {
+ rb_erase(&p->scx.dsq_priq, &dsq->priq);
+ RB_CLEAR_NODE(&p->scx.dsq_priq);
+ p->scx.dsq_flags &= ~SCX_TASK_DSQ_ON_PRIQ;
+ }
+
+ list_del_init(&p->scx.dsq_list.node);
+ dsq_mod_nr(dsq, -1);
+
+ if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN) && dsq->first_task == p) {
+ struct task_struct *first_task;
+
+ first_task = nldsq_next_task(dsq, NULL, false);
+ rcu_assign_pointer(dsq->first_task, first_task);
+ }
+}
+
+static void dispatch_dequeue(struct rq *rq, struct task_struct *p)
+{
+ struct scx_dispatch_q *dsq = p->scx.dsq;
+ bool is_local = dsq == &rq->scx.local_dsq;
+
+ lockdep_assert_rq_held(rq);
+
+ if (!dsq) {
+ /*
+ * If !dsq && on-list, @p is on @rq's ddsp_deferred_locals.
+ * Unlinking is all that's needed to cancel.
+ */
+ if (unlikely(!list_empty(&p->scx.dsq_list.node)))
+ list_del_init(&p->scx.dsq_list.node);
+
+ /*
+ * When dispatching directly from the BPF scheduler to a local
+ * DSQ, the task isn't associated with any DSQ but
+ * @p->scx.holding_cpu may be set under the protection of
+ * %SCX_OPSS_DISPATCHING.
+ */
+ if (p->scx.holding_cpu >= 0)
+ p->scx.holding_cpu = -1;
+
+ return;
+ }
+
+ if (!is_local)
+ raw_spin_lock(&dsq->lock);
+
+ /*
+ * Now that we hold @dsq->lock, @p->holding_cpu and @p->scx.dsq_* can't
+ * change underneath us.
+ */
+ if (p->scx.holding_cpu < 0) {
+ /* @p must still be on @dsq, dequeue */
+ task_unlink_from_dsq(p, dsq);
+ } else {
+ /*
+ * We're racing against dispatch_to_local_dsq() which already
+ * removed @p from @dsq and set @p->scx.holding_cpu. Clear the
+ * holding_cpu which tells dispatch_to_local_dsq() that it lost
+ * the race.
+ */
+ WARN_ON_ONCE(!list_empty(&p->scx.dsq_list.node));
+ p->scx.holding_cpu = -1;
+ }
+ p->scx.dsq = NULL;
+
+ if (!is_local)
+ raw_spin_unlock(&dsq->lock);
+}
+
+/*
+ * Abbreviated version of dispatch_dequeue() that can be used when both @p's rq
+ * and dsq are locked.
+ */
+static void dispatch_dequeue_locked(struct task_struct *p,
+ struct scx_dispatch_q *dsq)
+{
+ lockdep_assert_rq_held(task_rq(p));
+ lockdep_assert_held(&dsq->lock);
+
+ task_unlink_from_dsq(p, dsq);
+ p->scx.dsq = NULL;
+}
+
+static struct scx_dispatch_q *find_dsq_for_dispatch(struct scx_sched *sch,
+ struct rq *rq, u64 dsq_id,
+ struct task_struct *p)
+{
+ struct scx_dispatch_q *dsq;
+
+ if (dsq_id == SCX_DSQ_LOCAL)
+ return &rq->scx.local_dsq;
+
+ if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) {
+ s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK;
+
+ if (!ops_cpu_valid(sch, cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict"))
+ return find_global_dsq(sch, p);
+
+ return &cpu_rq(cpu)->scx.local_dsq;
+ }
+
+ if (dsq_id == SCX_DSQ_GLOBAL)
+ dsq = find_global_dsq(sch, p);
+ else
+ dsq = find_user_dsq(sch, dsq_id);
+
+ if (unlikely(!dsq)) {
+ scx_error(sch, "non-existent DSQ 0x%llx for %s[%d]",
+ dsq_id, p->comm, p->pid);
+ return find_global_dsq(sch, p);
+ }
+
+ return dsq;
+}
+
+static void mark_direct_dispatch(struct scx_sched *sch,
+ struct task_struct *ddsp_task,
+ struct task_struct *p, u64 dsq_id,
+ u64 enq_flags)
+{
+ /*
+ * Mark that dispatch already happened from ops.select_cpu() or
+ * ops.enqueue() by spoiling direct_dispatch_task with a non-NULL value
+ * which can never match a valid task pointer.
+ */
+ __this_cpu_write(direct_dispatch_task, ERR_PTR(-ESRCH));
+
+ /* @p must match the task on the enqueue path */
+ if (unlikely(p != ddsp_task)) {
+ if (IS_ERR(ddsp_task))
+ scx_error(sch, "%s[%d] already direct-dispatched",
+ p->comm, p->pid);
+ else
+ scx_error(sch, "scheduling for %s[%d] but trying to direct-dispatch %s[%d]",
+ ddsp_task->comm, ddsp_task->pid,
+ p->comm, p->pid);
+ return;
+ }
+
+ WARN_ON_ONCE(p->scx.ddsp_dsq_id != SCX_DSQ_INVALID);
+ WARN_ON_ONCE(p->scx.ddsp_enq_flags);
+
+ p->scx.ddsp_dsq_id = dsq_id;
+ p->scx.ddsp_enq_flags = enq_flags;
+}
+
+static void direct_dispatch(struct scx_sched *sch, struct task_struct *p,
+ u64 enq_flags)
+{
+ struct rq *rq = task_rq(p);
+ struct scx_dispatch_q *dsq =
+ find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, p);
+
+ touch_core_sched_dispatch(rq, p);
+
+ p->scx.ddsp_enq_flags |= enq_flags;
+
+ /*
+ * We are in the enqueue path with @rq locked and pinned, and thus can't
+ * double lock a remote rq and enqueue to its local DSQ. For
+ * DSQ_LOCAL_ON verdicts targeting the local DSQ of a remote CPU, defer
+ * the enqueue so that it's executed when @rq can be unlocked.
+ */
+ if (dsq->id == SCX_DSQ_LOCAL && dsq != &rq->scx.local_dsq) {
+ unsigned long opss;
+
+ opss = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_STATE_MASK;
+
+ switch (opss & SCX_OPSS_STATE_MASK) {
+ case SCX_OPSS_NONE:
+ break;
+ case SCX_OPSS_QUEUEING:
+ /*
+ * As @p was never passed to the BPF side, _release is
+ * not strictly necessary. Still do it for consistency.
+ */
+ atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
+ break;
+ default:
+ WARN_ONCE(true, "sched_ext: %s[%d] has invalid ops state 0x%lx in direct_dispatch()",
+ p->comm, p->pid, opss);
+ atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
+ break;
+ }
+
+ WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_list.node));
+ list_add_tail(&p->scx.dsq_list.node,
+ &rq->scx.ddsp_deferred_locals);
+ schedule_deferred_locked(rq);
+ return;
+ }
+
+ dispatch_enqueue(sch, dsq, p,
+ p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS);
+}
+
+static bool scx_rq_online(struct rq *rq)
+{
+ /*
+ * Test both cpu_active() and %SCX_RQ_ONLINE. %SCX_RQ_ONLINE indicates
+ * the online state as seen from the BPF scheduler. cpu_active() test
+ * guarantees that, if this function returns %true, %SCX_RQ_ONLINE will
+ * stay set until the current scheduling operation is complete even if
+ * we aren't locking @rq.
+ */
+ return likely((rq->scx.flags & SCX_RQ_ONLINE) && cpu_active(cpu_of(rq)));
+}
+
+static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
+ int sticky_cpu)
+{
+ struct scx_sched *sch = scx_root;
+ struct task_struct **ddsp_taskp;
+ struct scx_dispatch_q *dsq;
+ unsigned long qseq;
+
+ WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED));
+
+ /* rq migration */
+ if (sticky_cpu == cpu_of(rq))
+ goto local_norefill;
+
+ /*
+ * If !scx_rq_online(), we already told the BPF scheduler that the CPU
+ * is offline and are just running the hotplug path. Don't bother the
+ * BPF scheduler.
+ */
+ if (!scx_rq_online(rq))
+ goto local;
+
+ if (scx_rq_bypassing(rq)) {
+ __scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1);
+ goto bypass;
+ }
+
+ if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
+ goto direct;
+
+ /* see %SCX_OPS_ENQ_EXITING */
+ if (!(sch->ops.flags & SCX_OPS_ENQ_EXITING) &&
+ unlikely(p->flags & PF_EXITING)) {
+ __scx_add_event(sch, SCX_EV_ENQ_SKIP_EXITING, 1);
+ goto local;
+ }
+
+ /* see %SCX_OPS_ENQ_MIGRATION_DISABLED */
+ if (!(sch->ops.flags & SCX_OPS_ENQ_MIGRATION_DISABLED) &&
+ is_migration_disabled(p)) {
+ __scx_add_event(sch, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED, 1);
+ goto local;
+ }
+
+ if (unlikely(!SCX_HAS_OP(sch, enqueue)))
+ goto global;
+
+ /* DSQ bypass didn't trigger, enqueue on the BPF scheduler */
+ qseq = rq->scx.ops_qseq++ << SCX_OPSS_QSEQ_SHIFT;
+
+ WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE);
+ atomic_long_set(&p->scx.ops_state, SCX_OPSS_QUEUEING | qseq);
+
+ ddsp_taskp = this_cpu_ptr(&direct_dispatch_task);
+ WARN_ON_ONCE(*ddsp_taskp);
+ *ddsp_taskp = p;
+
+ SCX_CALL_OP_TASK(sch, SCX_KF_ENQUEUE, enqueue, rq, p, enq_flags);
+
+ *ddsp_taskp = NULL;
+ if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
+ goto direct;
+
+ /*
+ * If not directly dispatched, QUEUEING isn't clear yet and dispatch or
+ * dequeue may be waiting. The store_release matches their load_acquire.
+ */
+ atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_QUEUED | qseq);
+ return;
+
+direct:
+ direct_dispatch(sch, p, enq_flags);
+ return;
+local_norefill:
+ dispatch_enqueue(sch, &rq->scx.local_dsq, p, enq_flags);
+ return;
+local:
+ dsq = &rq->scx.local_dsq;
+ goto enqueue;
+global:
+ dsq = find_global_dsq(sch, p);
+ goto enqueue;
+bypass:
+ dsq = &task_rq(p)->scx.bypass_dsq;
+ goto enqueue;
+
+enqueue:
+ /*
+ * For task-ordering, slice refill must be treated as implying the end
+ * of the current slice. Otherwise, the longer @p stays on the CPU, the
+ * higher priority it becomes from scx_prio_less()'s POV.
+ */
+ touch_core_sched(rq, p);
+ refill_task_slice_dfl(sch, p);
+ dispatch_enqueue(sch, dsq, p, enq_flags);
+}
+
+static bool task_runnable(const struct task_struct *p)
+{
+ return !list_empty(&p->scx.runnable_node);
+}
+
+static void set_task_runnable(struct rq *rq, struct task_struct *p)
+{
+ lockdep_assert_rq_held(rq);
+
+ if (p->scx.flags & SCX_TASK_RESET_RUNNABLE_AT) {
+ p->scx.runnable_at = jiffies;
+ p->scx.flags &= ~SCX_TASK_RESET_RUNNABLE_AT;
+ }
+
+ /*
+ * list_add_tail() must be used. scx_bypass() depends on tasks being
+ * appended to the runnable_list.
+ */
+ list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list);
+}
+
+static void clr_task_runnable(struct task_struct *p, bool reset_runnable_at)
+{
+ list_del_init(&p->scx.runnable_node);
+ if (reset_runnable_at)
+ p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT;
+}
+
+static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags)
+{
+ struct scx_sched *sch = scx_root;
+ int sticky_cpu = p->scx.sticky_cpu;
+
+ if (enq_flags & ENQUEUE_WAKEUP)
+ rq->scx.flags |= SCX_RQ_IN_WAKEUP;
+
+ enq_flags |= rq->scx.extra_enq_flags;
+
+ if (sticky_cpu >= 0)
+ p->scx.sticky_cpu = -1;
+
+ /*
+ * Restoring a running task will be immediately followed by
+ * set_next_task_scx() which expects the task to not be on the BPF
+ * scheduler as tasks can only start running through local DSQs. Force
+ * direct-dispatch into the local DSQ by setting the sticky_cpu.
+ */
+ if (unlikely(enq_flags & ENQUEUE_RESTORE) && task_current(rq, p))
+ sticky_cpu = cpu_of(rq);
+
+ if (p->scx.flags & SCX_TASK_QUEUED) {
+ WARN_ON_ONCE(!task_runnable(p));
+ goto out;
+ }
+
+ set_task_runnable(rq, p);
+ p->scx.flags |= SCX_TASK_QUEUED;
+ rq->scx.nr_running++;
+ add_nr_running(rq, 1);
+
+ if (SCX_HAS_OP(sch, runnable) && !task_on_rq_migrating(p))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, runnable, rq, p, enq_flags);
+
+ if (enq_flags & SCX_ENQ_WAKEUP)
+ touch_core_sched(rq, p);
+
+ do_enqueue_task(rq, p, enq_flags, sticky_cpu);
+out:
+ rq->scx.flags &= ~SCX_RQ_IN_WAKEUP;
+
+ if ((enq_flags & SCX_ENQ_CPU_SELECTED) &&
+ unlikely(cpu_of(rq) != p->scx.selected_cpu))
+ __scx_add_event(sch, SCX_EV_SELECT_CPU_FALLBACK, 1);
+}
+
+static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
+{
+ struct scx_sched *sch = scx_root;
+ unsigned long opss;
+
+ /* dequeue is always temporary, don't reset runnable_at */
+ clr_task_runnable(p, false);
+
+ /* acquire ensures that we see the preceding updates on QUEUED */
+ opss = atomic_long_read_acquire(&p->scx.ops_state);
+
+ switch (opss & SCX_OPSS_STATE_MASK) {
+ case SCX_OPSS_NONE:
+ break;
+ case SCX_OPSS_QUEUEING:
+ /*
+ * QUEUEING is started and finished while holding @p's rq lock.
+ * As we're holding the rq lock now, we shouldn't see QUEUEING.
+ */
+ BUG();
+ case SCX_OPSS_QUEUED:
+ if (SCX_HAS_OP(sch, dequeue))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, dequeue, rq,
+ p, deq_flags);
+
+ if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
+ SCX_OPSS_NONE))
+ break;
+ fallthrough;
+ case SCX_OPSS_DISPATCHING:
+ /*
+ * If @p is being dispatched from the BPF scheduler to a DSQ,
+ * wait for the transfer to complete so that @p doesn't get
+ * added to its DSQ after dequeueing is complete.
+ *
+ * As we're waiting on DISPATCHING with the rq locked, the
+ * dispatching side shouldn't try to lock the rq while
+ * DISPATCHING is set. See dispatch_to_local_dsq().
+ *
+ * DISPATCHING shouldn't have qseq set and control can reach
+ * here with NONE @opss from the above QUEUED case block.
+ * Explicitly wait on %SCX_OPSS_DISPATCHING instead of @opss.
+ */
+ wait_ops_state(p, SCX_OPSS_DISPATCHING);
+ BUG_ON(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE);
+ break;
+ }
+}
+
+static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags)
+{
+ struct scx_sched *sch = scx_root;
+
+ if (!(p->scx.flags & SCX_TASK_QUEUED)) {
+ WARN_ON_ONCE(task_runnable(p));
+ return true;
+ }
+
+ ops_dequeue(rq, p, deq_flags);
+
+ /*
+ * A currently running task which is going off @rq first gets dequeued
+ * and then stops running. As we want running <-> stopping transitions
+ * to be contained within runnable <-> quiescent transitions, trigger
+ * ->stopping() early here instead of in put_prev_task_scx().
+ *
+ * @p may go through multiple stopping <-> running transitions between
+ * here and put_prev_task_scx() if task attribute changes occur while
+ * balance_scx() leaves @rq unlocked. However, they don't contain any
+ * information meaningful to the BPF scheduler and can be suppressed by
+ * skipping the callbacks if the task is !QUEUED.
+ */
+ if (SCX_HAS_OP(sch, stopping) && task_current(rq, p)) {
+ update_curr_scx(rq);
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, stopping, rq, p, false);
+ }
+
+ if (SCX_HAS_OP(sch, quiescent) && !task_on_rq_migrating(p))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, quiescent, rq, p, deq_flags);
+
+ if (deq_flags & SCX_DEQ_SLEEP)
+ p->scx.flags |= SCX_TASK_DEQD_FOR_SLEEP;
+ else
+ p->scx.flags &= ~SCX_TASK_DEQD_FOR_SLEEP;
+
+ p->scx.flags &= ~SCX_TASK_QUEUED;
+ rq->scx.nr_running--;
+ sub_nr_running(rq, 1);
+
+ dispatch_dequeue(rq, p);
+ return true;
+}
+
+static void yield_task_scx(struct rq *rq)
+{
+ struct scx_sched *sch = scx_root;
+ struct task_struct *p = rq->donor;
+
+ if (SCX_HAS_OP(sch, yield))
+ SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, yield, rq, p, NULL);
+ else
+ p->scx.slice = 0;
+}
+
+static bool yield_to_task_scx(struct rq *rq, struct task_struct *to)
+{
+ struct scx_sched *sch = scx_root;
+ struct task_struct *from = rq->donor;
+
+ if (SCX_HAS_OP(sch, yield))
+ return SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, yield, rq,
+ from, to);
+ else
+ return false;
+}
+
+static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
+ struct scx_dispatch_q *src_dsq,
+ struct rq *dst_rq)
+{
+ struct scx_dispatch_q *dst_dsq = &dst_rq->scx.local_dsq;
+
+ /* @dsq is locked and @p is on @dst_rq */
+ lockdep_assert_held(&src_dsq->lock);
+ lockdep_assert_rq_held(dst_rq);
+
+ WARN_ON_ONCE(p->scx.holding_cpu >= 0);
+
+ if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT))
+ list_add(&p->scx.dsq_list.node, &dst_dsq->list);
+ else
+ list_add_tail(&p->scx.dsq_list.node, &dst_dsq->list);
+
+ dsq_mod_nr(dst_dsq, 1);
+ p->scx.dsq = dst_dsq;
+}
+
+/**
+ * move_remote_task_to_local_dsq - Move a task from a foreign rq to a local DSQ
+ * @p: task to move
+ * @enq_flags: %SCX_ENQ_*
+ * @src_rq: rq to move the task from, locked on entry, released on return
+ * @dst_rq: rq to move the task into, locked on return
+ *
+ * Move @p which is currently on @src_rq to @dst_rq's local DSQ.
+ */
+static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
+ struct rq *src_rq, struct rq *dst_rq)
+{
+ lockdep_assert_rq_held(src_rq);
+
+ /* the following marks @p MIGRATING which excludes dequeue */
+ deactivate_task(src_rq, p, 0);
+ set_task_cpu(p, cpu_of(dst_rq));
+ p->scx.sticky_cpu = cpu_of(dst_rq);
+
+ raw_spin_rq_unlock(src_rq);
+ raw_spin_rq_lock(dst_rq);
+
+ /*
+ * We want to pass scx-specific enq_flags but activate_task() will
+ * truncate the upper 32 bit. As we own @rq, we can pass them through
+ * @rq->scx.extra_enq_flags instead.
+ */
+ WARN_ON_ONCE(!cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr));
+ WARN_ON_ONCE(dst_rq->scx.extra_enq_flags);
+ dst_rq->scx.extra_enq_flags = enq_flags;
+ activate_task(dst_rq, p, 0);
+ dst_rq->scx.extra_enq_flags = 0;
+}
+
+/*
+ * Similar to kernel/sched/core.c::is_cpu_allowed(). However, there are two
+ * differences:
+ *
+ * - is_cpu_allowed() asks "Can this task run on this CPU?" while
+ * task_can_run_on_remote_rq() asks "Can the BPF scheduler migrate the task to
+ * this CPU?".
+ *
+ * While migration is disabled, is_cpu_allowed() has to say "yes" as the task
+ * must be allowed to finish on the CPU that it's currently on regardless of
+ * the CPU state. However, task_can_run_on_remote_rq() must say "no" as the
+ * BPF scheduler shouldn't attempt to migrate a task which has migration
+ * disabled.
+ *
+ * - The BPF scheduler is bypassed while the rq is offline and we can always say
+ * no to the BPF scheduler initiated migrations while offline.
+ *
+ * The caller must ensure that @p and @rq are on different CPUs.
+ */
+static bool task_can_run_on_remote_rq(struct scx_sched *sch,
+ struct task_struct *p, struct rq *rq,
+ bool enforce)
+{
+ int cpu = cpu_of(rq);
+
+ WARN_ON_ONCE(task_cpu(p) == cpu);
+
+ /*
+ * If @p has migration disabled, @p->cpus_ptr is updated to contain only
+ * the pinned CPU in migrate_disable_switch() while @p is being switched
+ * out. However, put_prev_task_scx() is called before @p->cpus_ptr is
+ * updated and thus another CPU may see @p on a DSQ inbetween leading to
+ * @p passing the below task_allowed_on_cpu() check while migration is
+ * disabled.
+ *
+ * Test the migration disabled state first as the race window is narrow
+ * and the BPF scheduler failing to check migration disabled state can
+ * easily be masked if task_allowed_on_cpu() is done first.
+ */
+ if (unlikely(is_migration_disabled(p))) {
+ if (enforce)
+ scx_error(sch, "SCX_DSQ_LOCAL[_ON] cannot move migration disabled %s[%d] from CPU %d to %d",
+ p->comm, p->pid, task_cpu(p), cpu);
+ return false;
+ }
+
+ /*
+ * We don't require the BPF scheduler to avoid dispatching to offline
+ * CPUs mostly for convenience but also because CPUs can go offline
+ * between scx_bpf_dsq_insert() calls and here. Trigger error iff the
+ * picked CPU is outside the allowed mask.
+ */
+ if (!task_allowed_on_cpu(p, cpu)) {
+ if (enforce)
+ scx_error(sch, "SCX_DSQ_LOCAL[_ON] target CPU %d not allowed for %s[%d]",
+ cpu, p->comm, p->pid);
+ return false;
+ }
+
+ if (!scx_rq_online(rq)) {
+ if (enforce)
+ __scx_add_event(sch, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1);
+ return false;
+ }
+
+ return true;
+}
+
+/**
+ * unlink_dsq_and_lock_src_rq() - Unlink task from its DSQ and lock its task_rq
+ * @p: target task
+ * @dsq: locked DSQ @p is currently on
+ * @src_rq: rq @p is currently on, stable with @dsq locked
+ *
+ * Called with @dsq locked but no rq's locked. We want to move @p to a different
+ * DSQ, including any local DSQ, but are not locking @src_rq. Locking @src_rq is
+ * required when transferring into a local DSQ. Even when transferring into a
+ * non-local DSQ, it's better to use the same mechanism to protect against
+ * dequeues and maintain the invariant that @p->scx.dsq can only change while
+ * @src_rq is locked, which e.g. scx_dump_task() depends on.
+ *
+ * We want to grab @src_rq but that can deadlock if we try while locking @dsq,
+ * so we want to unlink @p from @dsq, drop its lock and then lock @src_rq. As
+ * this may race with dequeue, which can't drop the rq lock or fail, do a little
+ * dancing from our side.
+ *
+ * @p->scx.holding_cpu is set to this CPU before @dsq is unlocked. If @p gets
+ * dequeued after we unlock @dsq but before locking @src_rq, the holding_cpu
+ * would be cleared to -1. While other cpus may have updated it to different
+ * values afterwards, as this operation can't be preempted or recurse, the
+ * holding_cpu can never become this CPU again before we're done. Thus, we can
+ * tell whether we lost to dequeue by testing whether the holding_cpu still
+ * points to this CPU. See dispatch_dequeue() for the counterpart.
+ *
+ * On return, @dsq is unlocked and @src_rq is locked. Returns %true if @p is
+ * still valid. %false if lost to dequeue.
+ */
+static bool unlink_dsq_and_lock_src_rq(struct task_struct *p,
+ struct scx_dispatch_q *dsq,
+ struct rq *src_rq)
+{
+ s32 cpu = raw_smp_processor_id();
+
+ lockdep_assert_held(&dsq->lock);
+
+ WARN_ON_ONCE(p->scx.holding_cpu >= 0);
+ task_unlink_from_dsq(p, dsq);
+ p->scx.holding_cpu = cpu;
+
+ raw_spin_unlock(&dsq->lock);
+ raw_spin_rq_lock(src_rq);
+
+ /* task_rq couldn't have changed if we're still the holding cpu */
+ return likely(p->scx.holding_cpu == cpu) &&
+ !WARN_ON_ONCE(src_rq != task_rq(p));
+}
+
+static bool consume_remote_task(struct rq *this_rq, struct task_struct *p,
+ struct scx_dispatch_q *dsq, struct rq *src_rq)
+{
+ raw_spin_rq_unlock(this_rq);
+
+ if (unlink_dsq_and_lock_src_rq(p, dsq, src_rq)) {
+ move_remote_task_to_local_dsq(p, 0, src_rq, this_rq);
+ return true;
+ } else {
+ raw_spin_rq_unlock(src_rq);
+ raw_spin_rq_lock(this_rq);
+ return false;
+ }
+}
+
+/**
+ * move_task_between_dsqs() - Move a task from one DSQ to another
+ * @sch: scx_sched being operated on
+ * @p: target task
+ * @enq_flags: %SCX_ENQ_*
+ * @src_dsq: DSQ @p is currently on, must not be a local DSQ
+ * @dst_dsq: DSQ @p is being moved to, can be any DSQ
+ *
+ * Must be called with @p's task_rq and @src_dsq locked. If @dst_dsq is a local
+ * DSQ and @p is on a different CPU, @p will be migrated and thus its task_rq
+ * will change. As @p's task_rq is locked, this function doesn't need to use the
+ * holding_cpu mechanism.
+ *
+ * On return, @src_dsq is unlocked and only @p's new task_rq, which is the
+ * return value, is locked.
+ */
+static struct rq *move_task_between_dsqs(struct scx_sched *sch,
+ struct task_struct *p, u64 enq_flags,
+ struct scx_dispatch_q *src_dsq,
+ struct scx_dispatch_q *dst_dsq)
+{
+ struct rq *src_rq = task_rq(p), *dst_rq;
+
+ BUG_ON(src_dsq->id == SCX_DSQ_LOCAL);
+ lockdep_assert_held(&src_dsq->lock);
+ lockdep_assert_rq_held(src_rq);
+
+ if (dst_dsq->id == SCX_DSQ_LOCAL) {
+ dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
+ if (src_rq != dst_rq &&
+ unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) {
+ dst_dsq = find_global_dsq(sch, p);
+ dst_rq = src_rq;
+ }
+ } else {
+ /* no need to migrate if destination is a non-local DSQ */
+ dst_rq = src_rq;
+ }
+
+ /*
+ * Move @p into $dst_dsq. If $dst_dsq is the local DSQ of a different
+ * CPU, @p will be migrated.
+ */
+ if (dst_dsq->id == SCX_DSQ_LOCAL) {
+ /* @p is going from a non-local DSQ to a local DSQ */
+ if (src_rq == dst_rq) {
+ task_unlink_from_dsq(p, src_dsq);
+ move_local_task_to_local_dsq(p, enq_flags,
+ src_dsq, dst_rq);
+ raw_spin_unlock(&src_dsq->lock);
+ } else {
+ raw_spin_unlock(&src_dsq->lock);
+ move_remote_task_to_local_dsq(p, enq_flags,
+ src_rq, dst_rq);
+ }
+ } else {
+ /*
+ * @p is going from a non-local DSQ to a non-local DSQ. As
+ * $src_dsq is already locked, do an abbreviated dequeue.
+ */
+ dispatch_dequeue_locked(p, src_dsq);
+ raw_spin_unlock(&src_dsq->lock);
+
+ dispatch_enqueue(sch, dst_dsq, p, enq_flags);
+ }
+
+ return dst_rq;
+}
+
+static bool consume_dispatch_q(struct scx_sched *sch, struct rq *rq,
+ struct scx_dispatch_q *dsq)
+{
+ struct task_struct *p;
+retry:
+ /*
+ * The caller can't expect to successfully consume a task if the task's
+ * addition to @dsq isn't guaranteed to be visible somehow. Test
+ * @dsq->list without locking and skip if it seems empty.
+ */
+ if (list_empty(&dsq->list))
+ return false;
+
+ raw_spin_lock(&dsq->lock);
+
+ nldsq_for_each_task(p, dsq) {
+ struct rq *task_rq = task_rq(p);
+
+ /*
+ * This loop can lead to multiple lockup scenarios, e.g. the BPF
+ * scheduler can put an enormous number of affinitized tasks into
+ * a contended DSQ, or the outer retry loop can repeatedly race
+ * against scx_bypass() dequeueing tasks from @dsq trying to put
+ * the system into the bypass mode. This can easily live-lock the
+ * machine. If aborting, exit from all non-bypass DSQs.
+ */
+ if (unlikely(READ_ONCE(scx_aborting)) && dsq->id != SCX_DSQ_BYPASS)
+ break;
+
+ if (rq == task_rq) {
+ task_unlink_from_dsq(p, dsq);
+ move_local_task_to_local_dsq(p, 0, dsq, rq);
+ raw_spin_unlock(&dsq->lock);
+ return true;
+ }
+
+ if (task_can_run_on_remote_rq(sch, p, rq, false)) {
+ if (likely(consume_remote_task(rq, p, dsq, task_rq)))
+ return true;
+ goto retry;
+ }
+ }
+
+ raw_spin_unlock(&dsq->lock);
+ return false;
+}
+
+static bool consume_global_dsq(struct scx_sched *sch, struct rq *rq)
+{
+ int node = cpu_to_node(cpu_of(rq));
+
+ return consume_dispatch_q(sch, rq, sch->global_dsqs[node]);
+}
+
+/**
+ * dispatch_to_local_dsq - Dispatch a task to a local dsq
+ * @sch: scx_sched being operated on
+ * @rq: current rq which is locked
+ * @dst_dsq: destination DSQ
+ * @p: task to dispatch
+ * @enq_flags: %SCX_ENQ_*
+ *
+ * We're holding @rq lock and want to dispatch @p to @dst_dsq which is a local
+ * DSQ. This function performs all the synchronization dancing needed because
+ * local DSQs are protected with rq locks.
+ *
+ * The caller must have exclusive ownership of @p (e.g. through
+ * %SCX_OPSS_DISPATCHING).
+ */
+static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq,
+ struct scx_dispatch_q *dst_dsq,
+ struct task_struct *p, u64 enq_flags)
+{
+ struct rq *src_rq = task_rq(p);
+ struct rq *dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
+ struct rq *locked_rq = rq;
+
+ /*
+ * We're synchronized against dequeue through DISPATCHING. As @p can't
+ * be dequeued, its task_rq and cpus_allowed are stable too.
+ *
+ * If dispatching to @rq that @p is already on, no lock dancing needed.
+ */
+ if (rq == src_rq && rq == dst_rq) {
+ dispatch_enqueue(sch, dst_dsq, p,
+ enq_flags | SCX_ENQ_CLEAR_OPSS);
+ return;
+ }
+
+ if (src_rq != dst_rq &&
+ unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) {
+ dispatch_enqueue(sch, find_global_dsq(sch, p), p,
+ enq_flags | SCX_ENQ_CLEAR_OPSS);
+ return;
+ }
+
+ /*
+ * @p is on a possibly remote @src_rq which we need to lock to move the
+ * task. If dequeue is in progress, it'd be locking @src_rq and waiting
+ * on DISPATCHING, so we can't grab @src_rq lock while holding
+ * DISPATCHING.
+ *
+ * As DISPATCHING guarantees that @p is wholly ours, we can pretend that
+ * we're moving from a DSQ and use the same mechanism - mark the task
+ * under transfer with holding_cpu, release DISPATCHING and then follow
+ * the same protocol. See unlink_dsq_and_lock_src_rq().
+ */
+ p->scx.holding_cpu = raw_smp_processor_id();
+
+ /* store_release ensures that dequeue sees the above */
+ atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
+
+ /* switch to @src_rq lock */
+ if (locked_rq != src_rq) {
+ raw_spin_rq_unlock(locked_rq);
+ locked_rq = src_rq;
+ raw_spin_rq_lock(src_rq);
+ }
+
+ /* task_rq couldn't have changed if we're still the holding cpu */
+ if (likely(p->scx.holding_cpu == raw_smp_processor_id()) &&
+ !WARN_ON_ONCE(src_rq != task_rq(p))) {
+ /*
+ * If @p is staying on the same rq, there's no need to go
+ * through the full deactivate/activate cycle. Optimize by
+ * abbreviating move_remote_task_to_local_dsq().
+ */
+ if (src_rq == dst_rq) {
+ p->scx.holding_cpu = -1;
+ dispatch_enqueue(sch, &dst_rq->scx.local_dsq, p,
+ enq_flags);
+ } else {
+ move_remote_task_to_local_dsq(p, enq_flags,
+ src_rq, dst_rq);
+ /* task has been moved to dst_rq, which is now locked */
+ locked_rq = dst_rq;
+ }
+
+ /* if the destination CPU is idle, wake it up */
+ if (sched_class_above(p->sched_class, dst_rq->curr->sched_class))
+ resched_curr(dst_rq);
+ }
+
+ /* switch back to @rq lock */
+ if (locked_rq != rq) {
+ raw_spin_rq_unlock(locked_rq);
+ raw_spin_rq_lock(rq);
+ }
+}
+
+/**
+ * finish_dispatch - Asynchronously finish dispatching a task
+ * @rq: current rq which is locked
+ * @p: task to finish dispatching
+ * @qseq_at_dispatch: qseq when @p started getting dispatched
+ * @dsq_id: destination DSQ ID
+ * @enq_flags: %SCX_ENQ_*
+ *
+ * Dispatching to local DSQs may need to wait for queueing to complete or
+ * require rq lock dancing. As we don't wanna do either while inside
+ * ops.dispatch() to avoid locking order inversion, we split dispatching into
+ * two parts. scx_bpf_dsq_insert() which is called by ops.dispatch() records the
+ * task and its qseq. Once ops.dispatch() returns, this function is called to
+ * finish up.
+ *
+ * There is no guarantee that @p is still valid for dispatching or even that it
+ * was valid in the first place. Make sure that the task is still owned by the
+ * BPF scheduler and claim the ownership before dispatching.
+ */
+static void finish_dispatch(struct scx_sched *sch, struct rq *rq,
+ struct task_struct *p,
+ unsigned long qseq_at_dispatch,
+ u64 dsq_id, u64 enq_flags)
+{
+ struct scx_dispatch_q *dsq;
+ unsigned long opss;
+
+ touch_core_sched_dispatch(rq, p);
+retry:
+ /*
+ * No need for _acquire here. @p is accessed only after a successful
+ * try_cmpxchg to DISPATCHING.
+ */
+ opss = atomic_long_read(&p->scx.ops_state);
+
+ switch (opss & SCX_OPSS_STATE_MASK) {
+ case SCX_OPSS_DISPATCHING:
+ case SCX_OPSS_NONE:
+ /* someone else already got to it */
+ return;
+ case SCX_OPSS_QUEUED:
+ /*
+ * If qseq doesn't match, @p has gone through at least one
+ * dispatch/dequeue and re-enqueue cycle between
+ * scx_bpf_dsq_insert() and here and we have no claim on it.
+ */
+ if ((opss & SCX_OPSS_QSEQ_MASK) != qseq_at_dispatch)
+ return;
+
+ /*
+ * While we know @p is accessible, we don't yet have a claim on
+ * it - the BPF scheduler is allowed to dispatch tasks
+ * spuriously and there can be a racing dequeue attempt. Let's
+ * claim @p by atomically transitioning it from QUEUED to
+ * DISPATCHING.
+ */
+ if (likely(atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
+ SCX_OPSS_DISPATCHING)))
+ break;
+ goto retry;
+ case SCX_OPSS_QUEUEING:
+ /*
+ * do_enqueue_task() is in the process of transferring the task
+ * to the BPF scheduler while holding @p's rq lock. As we aren't
+ * holding any kernel or BPF resource that the enqueue path may
+ * depend upon, it's safe to wait.
+ */
+ wait_ops_state(p, opss);
+ goto retry;
+ }
+
+ BUG_ON(!(p->scx.flags & SCX_TASK_QUEUED));
+
+ dsq = find_dsq_for_dispatch(sch, this_rq(), dsq_id, p);
+
+ if (dsq->id == SCX_DSQ_LOCAL)
+ dispatch_to_local_dsq(sch, rq, dsq, p, enq_flags);
+ else
+ dispatch_enqueue(sch, dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS);
+}
+
+static void flush_dispatch_buf(struct scx_sched *sch, struct rq *rq)
+{
+ struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
+ u32 u;
+
+ for (u = 0; u < dspc->cursor; u++) {
+ struct scx_dsp_buf_ent *ent = &dspc->buf[u];
+
+ finish_dispatch(sch, rq, ent->task, ent->qseq, ent->dsq_id,
+ ent->enq_flags);
+ }
+
+ dspc->nr_tasks += dspc->cursor;
+ dspc->cursor = 0;
+}
+
+static inline void maybe_queue_balance_callback(struct rq *rq)
+{
+ lockdep_assert_rq_held(rq);
+
+ if (!(rq->scx.flags & SCX_RQ_BAL_CB_PENDING))
+ return;
+
+ queue_balance_callback(rq, &rq->scx.deferred_bal_cb,
+ deferred_bal_cb_workfn);
+
+ rq->scx.flags &= ~SCX_RQ_BAL_CB_PENDING;
+}
+
+static int balance_one(struct rq *rq, struct task_struct *prev)
+{
+ struct scx_sched *sch = scx_root;
+ struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
+ bool prev_on_scx = prev->sched_class == &ext_sched_class;
+ bool prev_on_rq = prev->scx.flags & SCX_TASK_QUEUED;
+ int nr_loops = SCX_DSP_MAX_LOOPS;
+
+ lockdep_assert_rq_held(rq);
+ rq->scx.flags |= SCX_RQ_IN_BALANCE;
+ rq->scx.flags &= ~SCX_RQ_BAL_KEEP;
+
+ if ((sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT) &&
+ unlikely(rq->scx.cpu_released)) {
+ /*
+ * If the previous sched_class for the current CPU was not SCX,
+ * notify the BPF scheduler that it again has control of the
+ * core. This callback complements ->cpu_release(), which is
+ * emitted in switch_class().
+ */
+ if (SCX_HAS_OP(sch, cpu_acquire))
+ SCX_CALL_OP(sch, SCX_KF_REST, cpu_acquire, rq,
+ cpu_of(rq), NULL);
+ rq->scx.cpu_released = false;
+ }
+
+ if (prev_on_scx) {
+ update_curr_scx(rq);
+
+ /*
+ * If @prev is runnable & has slice left, it has priority and
+ * fetching more just increases latency for the fetched tasks.
+ * Tell pick_task_scx() to keep running @prev. If the BPF
+ * scheduler wants to handle this explicitly, it should
+ * implement ->cpu_release().
+ *
+ * See scx_disable_workfn() for the explanation on the bypassing
+ * test.
+ */
+ if (prev_on_rq && prev->scx.slice && !scx_rq_bypassing(rq)) {
+ rq->scx.flags |= SCX_RQ_BAL_KEEP;
+ goto has_tasks;
+ }
+ }
+
+ /* if there already are tasks to run, nothing to do */
+ if (rq->scx.local_dsq.nr)
+ goto has_tasks;
+
+ if (consume_global_dsq(sch, rq))
+ goto has_tasks;
+
+ if (scx_rq_bypassing(rq)) {
+ if (consume_dispatch_q(sch, rq, &rq->scx.bypass_dsq))
+ goto has_tasks;
+ else
+ goto no_tasks;
+ }
+
+ if (unlikely(!SCX_HAS_OP(sch, dispatch)) || !scx_rq_online(rq))
+ goto no_tasks;
+
+ dspc->rq = rq;
+
+ /*
+ * The dispatch loop. Because flush_dispatch_buf() may drop the rq lock,
+ * the local DSQ might still end up empty after a successful
+ * ops.dispatch(). If the local DSQ is empty even after ops.dispatch()
+ * produced some tasks, retry. The BPF scheduler may depend on this
+ * looping behavior to simplify its implementation.
+ */
+ do {
+ dspc->nr_tasks = 0;
+
+ SCX_CALL_OP(sch, SCX_KF_DISPATCH, dispatch, rq,
+ cpu_of(rq), prev_on_scx ? prev : NULL);
+
+ flush_dispatch_buf(sch, rq);
+
+ if (prev_on_rq && prev->scx.slice) {
+ rq->scx.flags |= SCX_RQ_BAL_KEEP;
+ goto has_tasks;
+ }
+ if (rq->scx.local_dsq.nr)
+ goto has_tasks;
+ if (consume_global_dsq(sch, rq))
+ goto has_tasks;
+
+ /*
+ * ops.dispatch() can trap us in this loop by repeatedly
+ * dispatching ineligible tasks. Break out once in a while to
+ * allow the watchdog to run. As IRQ can't be enabled in
+ * balance(), we want to complete this scheduling cycle and then
+ * start a new one. IOW, we want to call resched_curr() on the
+ * next, most likely idle, task, not the current one. Use
+ * scx_kick_cpu() for deferred kicking.
+ */
+ if (unlikely(!--nr_loops)) {
+ scx_kick_cpu(sch, cpu_of(rq), 0);
+ break;
+ }
+ } while (dspc->nr_tasks);
+
+no_tasks:
+ /*
+ * Didn't find another task to run. Keep running @prev unless
+ * %SCX_OPS_ENQ_LAST is in effect.
+ */
+ if (prev_on_rq &&
+ (!(sch->ops.flags & SCX_OPS_ENQ_LAST) || scx_rq_bypassing(rq))) {
+ rq->scx.flags |= SCX_RQ_BAL_KEEP;
+ __scx_add_event(sch, SCX_EV_DISPATCH_KEEP_LAST, 1);
+ goto has_tasks;
+ }
+ rq->scx.flags &= ~SCX_RQ_IN_BALANCE;
+ return false;
+
+has_tasks:
+ rq->scx.flags &= ~SCX_RQ_IN_BALANCE;
+ return true;
+}
+
+static void process_ddsp_deferred_locals(struct rq *rq)
+{
+ struct task_struct *p;
+
+ lockdep_assert_rq_held(rq);
+
+ /*
+ * Now that @rq can be unlocked, execute the deferred enqueueing of
+ * tasks directly dispatched to the local DSQs of other CPUs. See
+ * direct_dispatch(). Keep popping from the head instead of using
+ * list_for_each_entry_safe() as dispatch_local_dsq() may unlock @rq
+ * temporarily.
+ */
+ while ((p = list_first_entry_or_null(&rq->scx.ddsp_deferred_locals,
+ struct task_struct, scx.dsq_list.node))) {
+ struct scx_sched *sch = scx_root;
+ struct scx_dispatch_q *dsq;
+
+ list_del_init(&p->scx.dsq_list.node);
+
+ dsq = find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, p);
+ if (!WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL))
+ dispatch_to_local_dsq(sch, rq, dsq, p,
+ p->scx.ddsp_enq_flags);
+ }
+}
+
+static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first)
+{
+ struct scx_sched *sch = scx_root;
+
+ if (p->scx.flags & SCX_TASK_QUEUED) {
+ /*
+ * Core-sched might decide to execute @p before it is
+ * dispatched. Call ops_dequeue() to notify the BPF scheduler.
+ */
+ ops_dequeue(rq, p, SCX_DEQ_CORE_SCHED_EXEC);
+ dispatch_dequeue(rq, p);
+ }
+
+ p->se.exec_start = rq_clock_task(rq);
+
+ /* see dequeue_task_scx() on why we skip when !QUEUED */
+ if (SCX_HAS_OP(sch, running) && (p->scx.flags & SCX_TASK_QUEUED))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, running, rq, p);
+
+ clr_task_runnable(p, true);
+
+ /*
+ * @p is getting newly scheduled or got kicked after someone updated its
+ * slice. Refresh whether tick can be stopped. See scx_can_stop_tick().
+ */
+ if ((p->scx.slice == SCX_SLICE_INF) !=
+ (bool)(rq->scx.flags & SCX_RQ_CAN_STOP_TICK)) {
+ if (p->scx.slice == SCX_SLICE_INF)
+ rq->scx.flags |= SCX_RQ_CAN_STOP_TICK;
+ else
+ rq->scx.flags &= ~SCX_RQ_CAN_STOP_TICK;
+
+ sched_update_tick_dependency(rq);
+
+ /*
+ * For now, let's refresh the load_avgs just when transitioning
+ * in and out of nohz. In the future, we might want to add a
+ * mechanism which calls the following periodically on
+ * tick-stopped CPUs.
+ */
+ update_other_load_avgs(rq);
+ }
+}
+
+static enum scx_cpu_preempt_reason
+preempt_reason_from_class(const struct sched_class *class)
+{
+ if (class == &stop_sched_class)
+ return SCX_CPU_PREEMPT_STOP;
+ if (class == &dl_sched_class)
+ return SCX_CPU_PREEMPT_DL;
+ if (class == &rt_sched_class)
+ return SCX_CPU_PREEMPT_RT;
+ return SCX_CPU_PREEMPT_UNKNOWN;
+}
+
+static void switch_class(struct rq *rq, struct task_struct *next)
+{
+ struct scx_sched *sch = scx_root;
+ const struct sched_class *next_class = next->sched_class;
+
+ if (!(sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT))
+ return;
+
+ /*
+ * The callback is conceptually meant to convey that the CPU is no
+ * longer under the control of SCX. Therefore, don't invoke the callback
+ * if the next class is below SCX (in which case the BPF scheduler has
+ * actively decided not to schedule any tasks on the CPU).
+ */
+ if (sched_class_above(&ext_sched_class, next_class))
+ return;
+
+ /*
+ * At this point we know that SCX was preempted by a higher priority
+ * sched_class, so invoke the ->cpu_release() callback if we have not
+ * done so already. We only send the callback once between SCX being
+ * preempted, and it regaining control of the CPU.
+ *
+ * ->cpu_release() complements ->cpu_acquire(), which is emitted the
+ * next time that balance_scx() is invoked.
+ */
+ if (!rq->scx.cpu_released) {
+ if (SCX_HAS_OP(sch, cpu_release)) {
+ struct scx_cpu_release_args args = {
+ .reason = preempt_reason_from_class(next_class),
+ .task = next,
+ };
+
+ SCX_CALL_OP(sch, SCX_KF_CPU_RELEASE, cpu_release, rq,
+ cpu_of(rq), &args);
+ }
+ rq->scx.cpu_released = true;
+ }
+}
+
+static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
+ struct task_struct *next)
+{
+ struct scx_sched *sch = scx_root;
+
+ /* see kick_cpus_irq_workfn() */
+ smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
+
+ update_curr_scx(rq);
+
+ /* see dequeue_task_scx() on why we skip when !QUEUED */
+ if (SCX_HAS_OP(sch, stopping) && (p->scx.flags & SCX_TASK_QUEUED))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, stopping, rq, p, true);
+
+ if (p->scx.flags & SCX_TASK_QUEUED) {
+ set_task_runnable(rq, p);
+
+ /*
+ * If @p has slice left and is being put, @p is getting
+ * preempted by a higher priority scheduler class or core-sched
+ * forcing a different task. Leave it at the head of the local
+ * DSQ.
+ */
+ if (p->scx.slice && !scx_rq_bypassing(rq)) {
+ dispatch_enqueue(sch, &rq->scx.local_dsq, p,
+ SCX_ENQ_HEAD);
+ goto switch_class;
+ }
+
+ /*
+ * If @p is runnable but we're about to enter a lower
+ * sched_class, %SCX_OPS_ENQ_LAST must be set. Tell
+ * ops.enqueue() that @p is the only one available for this cpu,
+ * which should trigger an explicit follow-up scheduling event.
+ */
+ if (sched_class_above(&ext_sched_class, next->sched_class)) {
+ WARN_ON_ONCE(!(sch->ops.flags & SCX_OPS_ENQ_LAST));
+ do_enqueue_task(rq, p, SCX_ENQ_LAST, -1);
+ } else {
+ do_enqueue_task(rq, p, 0, -1);
+ }
+ }
+
+switch_class:
+ if (next && next->sched_class != &ext_sched_class)
+ switch_class(rq, next);
+}
+
+static struct task_struct *first_local_task(struct rq *rq)
+{
+ return list_first_entry_or_null(&rq->scx.local_dsq.list,
+ struct task_struct, scx.dsq_list.node);
+}
+
+static struct task_struct *
+do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
+{
+ struct task_struct *prev = rq->curr;
+ bool keep_prev, kick_idle = false;
+ struct task_struct *p;
+
+ /* see kick_cpus_irq_workfn() */
+ smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
+
+ rq_modified_clear(rq);
+
+ rq_unpin_lock(rq, rf);
+ balance_one(rq, prev);
+ rq_repin_lock(rq, rf);
+ maybe_queue_balance_callback(rq);
+
+ /*
+ * If any higher-priority sched class enqueued a runnable task on
+ * this rq during balance_one(), abort and return RETRY_TASK, so
+ * that the scheduler loop can restart.
+ *
+ * If @force_scx is true, always try to pick a SCHED_EXT task,
+ * regardless of any higher-priority sched classes activity.
+ */
+ if (!force_scx && rq_modified_above(rq, &ext_sched_class))
+ return RETRY_TASK;
+
+ keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP;
+ if (unlikely(keep_prev &&
+ prev->sched_class != &ext_sched_class)) {
+ WARN_ON_ONCE(scx_enable_state() == SCX_ENABLED);
+ keep_prev = false;
+ }
+
+ /*
+ * If balance_scx() is telling us to keep running @prev, replenish slice
+ * if necessary and keep running @prev. Otherwise, pop the first one
+ * from the local DSQ.
+ */
+ if (keep_prev) {
+ p = prev;
+ if (!p->scx.slice)
+ refill_task_slice_dfl(rcu_dereference_sched(scx_root), p);
+ } else {
+ p = first_local_task(rq);
+ if (!p) {
+ if (kick_idle)
+ scx_kick_cpu(rcu_dereference_sched(scx_root),
+ cpu_of(rq), SCX_KICK_IDLE);
+ return NULL;
+ }
+
+ if (unlikely(!p->scx.slice)) {
+ struct scx_sched *sch = rcu_dereference_sched(scx_root);
+
+ if (!scx_rq_bypassing(rq) && !sch->warned_zero_slice) {
+ printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in %s()\n",
+ p->comm, p->pid, __func__);
+ sch->warned_zero_slice = true;
+ }
+ refill_task_slice_dfl(sch, p);
+ }
+ }
+
+ return p;
+}
+
+static struct task_struct *pick_task_scx(struct rq *rq, struct rq_flags *rf)
+{
+ return do_pick_task_scx(rq, rf, false);
+}
+
+#ifdef CONFIG_SCHED_CORE
+/**
+ * scx_prio_less - Task ordering for core-sched
+ * @a: task A
+ * @b: task B
+ * @in_fi: in forced idle state
+ *
+ * Core-sched is implemented as an additional scheduling layer on top of the
+ * usual sched_class'es and needs to find out the expected task ordering. For
+ * SCX, core-sched calls this function to interrogate the task ordering.
+ *
+ * Unless overridden by ops.core_sched_before(), @p->scx.core_sched_at is used
+ * to implement the default task ordering. The older the timestamp, the higher
+ * priority the task - the global FIFO ordering matching the default scheduling
+ * behavior.
+ *
+ * When ops.core_sched_before() is enabled, @p->scx.core_sched_at is used to
+ * implement FIFO ordering within each local DSQ. See pick_task_scx().
+ */
+bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
+ bool in_fi)
+{
+ struct scx_sched *sch = scx_root;
+
+ /*
+ * The const qualifiers are dropped from task_struct pointers when
+ * calling ops.core_sched_before(). Accesses are controlled by the
+ * verifier.
+ */
+ if (SCX_HAS_OP(sch, core_sched_before) &&
+ !scx_rq_bypassing(task_rq(a)))
+ return SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, core_sched_before,
+ NULL,
+ (struct task_struct *)a,
+ (struct task_struct *)b);
+ else
+ return time_after64(a->scx.core_sched_at, b->scx.core_sched_at);
+}
+#endif /* CONFIG_SCHED_CORE */
+
+static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags)
+{
+ struct scx_sched *sch = scx_root;
+ bool rq_bypass;
+
+ /*
+ * sched_exec() calls with %WF_EXEC when @p is about to exec(2) as it
+ * can be a good migration opportunity with low cache and memory
+ * footprint. Returning a CPU different than @prev_cpu triggers
+ * immediate rq migration. However, for SCX, as the current rq
+ * association doesn't dictate where the task is going to run, this
+ * doesn't fit well. If necessary, we can later add a dedicated method
+ * which can decide to preempt self to force it through the regular
+ * scheduling path.
+ */
+ if (unlikely(wake_flags & WF_EXEC))
+ return prev_cpu;
+
+ rq_bypass = scx_rq_bypassing(task_rq(p));
+ if (likely(SCX_HAS_OP(sch, select_cpu)) && !rq_bypass) {
+ s32 cpu;
+ struct task_struct **ddsp_taskp;
+
+ ddsp_taskp = this_cpu_ptr(&direct_dispatch_task);
+ WARN_ON_ONCE(*ddsp_taskp);
+ *ddsp_taskp = p;
+
+ cpu = SCX_CALL_OP_TASK_RET(sch,
+ SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU,
+ select_cpu, NULL, p, prev_cpu,
+ wake_flags);
+ p->scx.selected_cpu = cpu;
+ *ddsp_taskp = NULL;
+ if (ops_cpu_valid(sch, cpu, "from ops.select_cpu()"))
+ return cpu;
+ else
+ return prev_cpu;
+ } else {
+ s32 cpu;
+
+ cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, NULL, 0);
+ if (cpu >= 0) {
+ refill_task_slice_dfl(sch, p);
+ p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL;
+ } else {
+ cpu = prev_cpu;
+ }
+ p->scx.selected_cpu = cpu;
+
+ if (rq_bypass)
+ __scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1);
+ return cpu;
+ }
+}
+
+static void task_woken_scx(struct rq *rq, struct task_struct *p)
+{
+ run_deferred(rq);
+}
+
+static void set_cpus_allowed_scx(struct task_struct *p,
+ struct affinity_context *ac)
+{
+ struct scx_sched *sch = scx_root;
+
+ set_cpus_allowed_common(p, ac);
+
+ /*
+ * The effective cpumask is stored in @p->cpus_ptr which may temporarily
+ * differ from the configured one in @p->cpus_mask. Always tell the bpf
+ * scheduler the effective one.
+ *
+ * Fine-grained memory write control is enforced by BPF making the const
+ * designation pointless. Cast it away when calling the operation.
+ */
+ if (SCX_HAS_OP(sch, set_cpumask))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_cpumask, NULL,
+ p, (struct cpumask *)p->cpus_ptr);
+}
+
+static void handle_hotplug(struct rq *rq, bool online)
+{
+ struct scx_sched *sch = scx_root;
+ int cpu = cpu_of(rq);
+
+ atomic_long_inc(&scx_hotplug_seq);
+
+ /*
+ * scx_root updates are protected by cpus_read_lock() and will stay
+ * stable here. Note that we can't depend on scx_enabled() test as the
+ * hotplug ops need to be enabled before __scx_enabled is set.
+ */
+ if (unlikely(!sch))
+ return;
+
+ if (scx_enabled())
+ scx_idle_update_selcpu_topology(&sch->ops);
+
+ if (online && SCX_HAS_OP(sch, cpu_online))
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cpu_online, NULL, cpu);
+ else if (!online && SCX_HAS_OP(sch, cpu_offline))
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cpu_offline, NULL, cpu);
+ else
+ scx_exit(sch, SCX_EXIT_UNREG_KERN,
+ SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG,
+ "cpu %d going %s, exiting scheduler", cpu,
+ online ? "online" : "offline");
+}
+
+void scx_rq_activate(struct rq *rq)
+{
+ handle_hotplug(rq, true);
+}
+
+void scx_rq_deactivate(struct rq *rq)
+{
+ handle_hotplug(rq, false);
+}
+
+static void rq_online_scx(struct rq *rq)
+{
+ rq->scx.flags |= SCX_RQ_ONLINE;
+}
+
+static void rq_offline_scx(struct rq *rq)
+{
+ rq->scx.flags &= ~SCX_RQ_ONLINE;
+}
+
+
+static bool check_rq_for_timeouts(struct rq *rq)
+{
+ struct scx_sched *sch;
+ struct task_struct *p;
+ struct rq_flags rf;
+ bool timed_out = false;
+
+ rq_lock_irqsave(rq, &rf);
+ sch = rcu_dereference_bh(scx_root);
+ if (unlikely(!sch))
+ goto out_unlock;
+
+ list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) {
+ unsigned long last_runnable = p->scx.runnable_at;
+
+ if (unlikely(time_after(jiffies,
+ last_runnable + scx_watchdog_timeout))) {
+ u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable);
+
+ scx_exit(sch, SCX_EXIT_ERROR_STALL, 0,
+ "%s[%d] failed to run for %u.%03us",
+ p->comm, p->pid, dur_ms / 1000, dur_ms % 1000);
+ timed_out = true;
+ break;
+ }
+ }
+out_unlock:
+ rq_unlock_irqrestore(rq, &rf);
+ return timed_out;
+}
+
+static void scx_watchdog_workfn(struct work_struct *work)
+{
+ int cpu;
+
+ WRITE_ONCE(scx_watchdog_timestamp, jiffies);
+
+ for_each_online_cpu(cpu) {
+ if (unlikely(check_rq_for_timeouts(cpu_rq(cpu))))
+ break;
+
+ cond_resched();
+ }
+ queue_delayed_work(system_unbound_wq, to_delayed_work(work),
+ scx_watchdog_timeout / 2);
+}
+
+void scx_tick(struct rq *rq)
+{
+ struct scx_sched *sch;
+ unsigned long last_check;
+
+ if (!scx_enabled())
+ return;
+
+ sch = rcu_dereference_bh(scx_root);
+ if (unlikely(!sch))
+ return;
+
+ last_check = READ_ONCE(scx_watchdog_timestamp);
+ if (unlikely(time_after(jiffies,
+ last_check + READ_ONCE(scx_watchdog_timeout)))) {
+ u32 dur_ms = jiffies_to_msecs(jiffies - last_check);
+
+ scx_exit(sch, SCX_EXIT_ERROR_STALL, 0,
+ "watchdog failed to check in for %u.%03us",
+ dur_ms / 1000, dur_ms % 1000);
+ }
+
+ update_other_load_avgs(rq);
+}
+
+static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued)
+{
+ struct scx_sched *sch = scx_root;
+
+ update_curr_scx(rq);
+
+ /*
+ * While disabling, always resched and refresh core-sched timestamp as
+ * we can't trust the slice management or ops.core_sched_before().
+ */
+ if (scx_rq_bypassing(rq)) {
+ curr->scx.slice = 0;
+ touch_core_sched(rq, curr);
+ } else if (SCX_HAS_OP(sch, tick)) {
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, tick, rq, curr);
+ }
+
+ if (!curr->scx.slice)
+ resched_curr(rq);
+}
+
+#ifdef CONFIG_EXT_GROUP_SCHED
+static struct cgroup *tg_cgrp(struct task_group *tg)
+{
+ /*
+ * If CGROUP_SCHED is disabled, @tg is NULL. If @tg is an autogroup,
+ * @tg->css.cgroup is NULL. In both cases, @tg can be treated as the
+ * root cgroup.
+ */
+ if (tg && tg->css.cgroup)
+ return tg->css.cgroup;
+ else
+ return &cgrp_dfl_root.cgrp;
+}
+
+#define SCX_INIT_TASK_ARGS_CGROUP(tg) .cgroup = tg_cgrp(tg),
+
+#else /* CONFIG_EXT_GROUP_SCHED */
+
+#define SCX_INIT_TASK_ARGS_CGROUP(tg)
+
+#endif /* CONFIG_EXT_GROUP_SCHED */
+
+static enum scx_task_state scx_get_task_state(const struct task_struct *p)
+{
+ return (p->scx.flags & SCX_TASK_STATE_MASK) >> SCX_TASK_STATE_SHIFT;
+}
+
+static void scx_set_task_state(struct task_struct *p, enum scx_task_state state)
+{
+ enum scx_task_state prev_state = scx_get_task_state(p);
+ bool warn = false;
+
+ BUILD_BUG_ON(SCX_TASK_NR_STATES > (1 << SCX_TASK_STATE_BITS));
+
+ switch (state) {
+ case SCX_TASK_NONE:
+ break;
+ case SCX_TASK_INIT:
+ warn = prev_state != SCX_TASK_NONE;
+ break;
+ case SCX_TASK_READY:
+ warn = prev_state == SCX_TASK_NONE;
+ break;
+ case SCX_TASK_ENABLED:
+ warn = prev_state != SCX_TASK_READY;
+ break;
+ default:
+ warn = true;
+ return;
+ }
+
+ WARN_ONCE(warn, "sched_ext: Invalid task state transition %d -> %d for %s[%d]",
+ prev_state, state, p->comm, p->pid);
+
+ p->scx.flags &= ~SCX_TASK_STATE_MASK;
+ p->scx.flags |= state << SCX_TASK_STATE_SHIFT;
+}
+
+static int scx_init_task(struct task_struct *p, struct task_group *tg, bool fork)
+{
+ struct scx_sched *sch = scx_root;
+ int ret;
+
+ p->scx.disallow = false;
+
+ if (SCX_HAS_OP(sch, init_task)) {
+ struct scx_init_task_args args = {
+ SCX_INIT_TASK_ARGS_CGROUP(tg)
+ .fork = fork,
+ };
+
+ ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, init_task, NULL,
+ p, &args);
+ if (unlikely(ret)) {
+ ret = ops_sanitize_err(sch, "init_task", ret);
+ return ret;
+ }
+ }
+
+ scx_set_task_state(p, SCX_TASK_INIT);
+
+ if (p->scx.disallow) {
+ if (!fork) {
+ struct rq *rq;
+ struct rq_flags rf;
+
+ rq = task_rq_lock(p, &rf);
+
+ /*
+ * We're in the load path and @p->policy will be applied
+ * right after. Reverting @p->policy here and rejecting
+ * %SCHED_EXT transitions from scx_check_setscheduler()
+ * guarantees that if ops.init_task() sets @p->disallow,
+ * @p can never be in SCX.
+ */
+ if (p->policy == SCHED_EXT) {
+ p->policy = SCHED_NORMAL;
+ atomic_long_inc(&scx_nr_rejected);
+ }
+
+ task_rq_unlock(rq, p, &rf);
+ } else if (p->policy == SCHED_EXT) {
+ scx_error(sch, "ops.init_task() set task->scx.disallow for %s[%d] during fork",
+ p->comm, p->pid);
+ }
+ }
+
+ p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT;
+ return 0;
+}
+
+static void scx_enable_task(struct task_struct *p)
+{
+ struct scx_sched *sch = scx_root;
+ struct rq *rq = task_rq(p);
+ u32 weight;
+
+ lockdep_assert_rq_held(rq);
+
+ /*
+ * Set the weight before calling ops.enable() so that the scheduler
+ * doesn't see a stale value if they inspect the task struct.
+ */
+ if (task_has_idle_policy(p))
+ weight = WEIGHT_IDLEPRIO;
+ else
+ weight = sched_prio_to_weight[p->static_prio - MAX_RT_PRIO];
+
+ p->scx.weight = sched_weight_to_cgroup(weight);
+
+ if (SCX_HAS_OP(sch, enable))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, enable, rq, p);
+ scx_set_task_state(p, SCX_TASK_ENABLED);
+
+ if (SCX_HAS_OP(sch, set_weight))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_weight, rq,
+ p, p->scx.weight);
+}
+
+static void scx_disable_task(struct task_struct *p)
+{
+ struct scx_sched *sch = scx_root;
+ struct rq *rq = task_rq(p);
+
+ lockdep_assert_rq_held(rq);
+ WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED);
+
+ if (SCX_HAS_OP(sch, disable))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, disable, rq, p);
+ scx_set_task_state(p, SCX_TASK_READY);
+}
+
+static void scx_exit_task(struct task_struct *p)
+{
+ struct scx_sched *sch = scx_root;
+ struct scx_exit_task_args args = {
+ .cancelled = false,
+ };
+
+ lockdep_assert_rq_held(task_rq(p));
+
+ switch (scx_get_task_state(p)) {
+ case SCX_TASK_NONE:
+ return;
+ case SCX_TASK_INIT:
+ args.cancelled = true;
+ break;
+ case SCX_TASK_READY:
+ break;
+ case SCX_TASK_ENABLED:
+ scx_disable_task(p);
+ break;
+ default:
+ WARN_ON_ONCE(true);
+ return;
+ }
+
+ if (SCX_HAS_OP(sch, exit_task))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, exit_task, task_rq(p),
+ p, &args);
+ scx_set_task_state(p, SCX_TASK_NONE);
+}
+
+void init_scx_entity(struct sched_ext_entity *scx)
+{
+ memset(scx, 0, sizeof(*scx));
+ INIT_LIST_HEAD(&scx->dsq_list.node);
+ RB_CLEAR_NODE(&scx->dsq_priq);
+ scx->sticky_cpu = -1;
+ scx->holding_cpu = -1;
+ INIT_LIST_HEAD(&scx->runnable_node);
+ scx->runnable_at = jiffies;
+ scx->ddsp_dsq_id = SCX_DSQ_INVALID;
+ scx->slice = READ_ONCE(scx_slice_dfl);
+}
+
+void scx_pre_fork(struct task_struct *p)
+{
+ /*
+ * BPF scheduler enable/disable paths want to be able to iterate and
+ * update all tasks which can become complex when racing forks. As
+ * enable/disable are very cold paths, let's use a percpu_rwsem to
+ * exclude forks.
+ */
+ percpu_down_read(&scx_fork_rwsem);
+}
+
+int scx_fork(struct task_struct *p)
+{
+ percpu_rwsem_assert_held(&scx_fork_rwsem);
+
+ if (scx_init_task_enabled)
+ return scx_init_task(p, task_group(p), true);
+ else
+ return 0;
+}
+
+void scx_post_fork(struct task_struct *p)
+{
+ if (scx_init_task_enabled) {
+ scx_set_task_state(p, SCX_TASK_READY);
+
+ /*
+ * Enable the task immediately if it's running on sched_ext.
+ * Otherwise, it'll be enabled in switching_to_scx() if and
+ * when it's ever configured to run with a SCHED_EXT policy.
+ */
+ if (p->sched_class == &ext_sched_class) {
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = task_rq_lock(p, &rf);
+ scx_enable_task(p);
+ task_rq_unlock(rq, p, &rf);
+ }
+ }
+
+ raw_spin_lock_irq(&scx_tasks_lock);
+ list_add_tail(&p->scx.tasks_node, &scx_tasks);
+ raw_spin_unlock_irq(&scx_tasks_lock);
+
+ percpu_up_read(&scx_fork_rwsem);
+}
+
+void scx_cancel_fork(struct task_struct *p)
+{
+ if (scx_enabled()) {
+ struct rq *rq;
+ struct rq_flags rf;
+
+ rq = task_rq_lock(p, &rf);
+ WARN_ON_ONCE(scx_get_task_state(p) >= SCX_TASK_READY);
+ scx_exit_task(p);
+ task_rq_unlock(rq, p, &rf);
+ }
+
+ percpu_up_read(&scx_fork_rwsem);
+}
+
+void sched_ext_dead(struct task_struct *p)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&scx_tasks_lock, flags);
+ list_del_init(&p->scx.tasks_node);
+ raw_spin_unlock_irqrestore(&scx_tasks_lock, flags);
+
+ /*
+ * @p is off scx_tasks and wholly ours. scx_enable()'s READY -> ENABLED
+ * transitions can't race us. Disable ops for @p.
+ */
+ if (scx_get_task_state(p) != SCX_TASK_NONE) {
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = task_rq_lock(p, &rf);
+ scx_exit_task(p);
+ task_rq_unlock(rq, p, &rf);
+ }
+}
+
+static void reweight_task_scx(struct rq *rq, struct task_struct *p,
+ const struct load_weight *lw)
+{
+ struct scx_sched *sch = scx_root;
+
+ lockdep_assert_rq_held(task_rq(p));
+
+ p->scx.weight = sched_weight_to_cgroup(scale_load_down(lw->weight));
+ if (SCX_HAS_OP(sch, set_weight))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_weight, rq,
+ p, p->scx.weight);
+}
+
+static void prio_changed_scx(struct rq *rq, struct task_struct *p, u64 oldprio)
+{
+}
+
+static void switching_to_scx(struct rq *rq, struct task_struct *p)
+{
+ struct scx_sched *sch = scx_root;
+
+ scx_enable_task(p);
+
+ /*
+ * set_cpus_allowed_scx() is not called while @p is associated with a
+ * different scheduler class. Keep the BPF scheduler up-to-date.
+ */
+ if (SCX_HAS_OP(sch, set_cpumask))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_cpumask, rq,
+ p, (struct cpumask *)p->cpus_ptr);
+}
+
+static void switched_from_scx(struct rq *rq, struct task_struct *p)
+{
+ scx_disable_task(p);
+}
+
+static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p,int wake_flags) {}
+static void switched_to_scx(struct rq *rq, struct task_struct *p) {}
+
+int scx_check_setscheduler(struct task_struct *p, int policy)
+{
+ lockdep_assert_rq_held(task_rq(p));
+
+ /* if disallow, reject transitioning into SCX */
+ if (scx_enabled() && READ_ONCE(p->scx.disallow) &&
+ p->policy != policy && policy == SCHED_EXT)
+ return -EACCES;
+
+ return 0;
+}
+
+#ifdef CONFIG_NO_HZ_FULL
+bool scx_can_stop_tick(struct rq *rq)
+{
+ struct task_struct *p = rq->curr;
+
+ if (scx_rq_bypassing(rq))
+ return false;
+
+ if (p->sched_class != &ext_sched_class)
+ return true;
+
+ /*
+ * @rq can dispatch from different DSQs, so we can't tell whether it
+ * needs the tick or not by looking at nr_running. Allow stopping ticks
+ * iff the BPF scheduler indicated so. See set_next_task_scx().
+ */
+ return rq->scx.flags & SCX_RQ_CAN_STOP_TICK;
+}
+#endif
+
+#ifdef CONFIG_EXT_GROUP_SCHED
+
+DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_ops_rwsem);
+static bool scx_cgroup_enabled;
+
+void scx_tg_init(struct task_group *tg)
+{
+ tg->scx.weight = CGROUP_WEIGHT_DFL;
+ tg->scx.bw_period_us = default_bw_period_us();
+ tg->scx.bw_quota_us = RUNTIME_INF;
+ tg->scx.idle = false;
+}
+
+int scx_tg_online(struct task_group *tg)
+{
+ struct scx_sched *sch = scx_root;
+ int ret = 0;
+
+ WARN_ON_ONCE(tg->scx.flags & (SCX_TG_ONLINE | SCX_TG_INITED));
+
+ if (scx_cgroup_enabled) {
+ if (SCX_HAS_OP(sch, cgroup_init)) {
+ struct scx_cgroup_init_args args =
+ { .weight = tg->scx.weight,
+ .bw_period_us = tg->scx.bw_period_us,
+ .bw_quota_us = tg->scx.bw_quota_us,
+ .bw_burst_us = tg->scx.bw_burst_us };
+
+ ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init,
+ NULL, tg->css.cgroup, &args);
+ if (ret)
+ ret = ops_sanitize_err(sch, "cgroup_init", ret);
+ }
+ if (ret == 0)
+ tg->scx.flags |= SCX_TG_ONLINE | SCX_TG_INITED;
+ } else {
+ tg->scx.flags |= SCX_TG_ONLINE;
+ }
+
+ return ret;
+}
+
+void scx_tg_offline(struct task_group *tg)
+{
+ struct scx_sched *sch = scx_root;
+
+ WARN_ON_ONCE(!(tg->scx.flags & SCX_TG_ONLINE));
+
+ if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_exit) &&
+ (tg->scx.flags & SCX_TG_INITED))
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL,
+ tg->css.cgroup);
+ tg->scx.flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED);
+}
+
+int scx_cgroup_can_attach(struct cgroup_taskset *tset)
+{
+ struct scx_sched *sch = scx_root;
+ struct cgroup_subsys_state *css;
+ struct task_struct *p;
+ int ret;
+
+ if (!scx_cgroup_enabled)
+ return 0;
+
+ cgroup_taskset_for_each(p, css, tset) {
+ struct cgroup *from = tg_cgrp(task_group(p));
+ struct cgroup *to = tg_cgrp(css_tg(css));
+
+ WARN_ON_ONCE(p->scx.cgrp_moving_from);
+
+ /*
+ * sched_move_task() omits identity migrations. Let's match the
+ * behavior so that ops.cgroup_prep_move() and ops.cgroup_move()
+ * always match one-to-one.
+ */
+ if (from == to)
+ continue;
+
+ if (SCX_HAS_OP(sch, cgroup_prep_move)) {
+ ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED,
+ cgroup_prep_move, NULL,
+ p, from, css->cgroup);
+ if (ret)
+ goto err;
+ }
+
+ p->scx.cgrp_moving_from = from;
+ }
+
+ return 0;
+
+err:
+ cgroup_taskset_for_each(p, css, tset) {
+ if (SCX_HAS_OP(sch, cgroup_cancel_move) &&
+ p->scx.cgrp_moving_from)
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_cancel_move, NULL,
+ p, p->scx.cgrp_moving_from, css->cgroup);
+ p->scx.cgrp_moving_from = NULL;
+ }
+
+ return ops_sanitize_err(sch, "cgroup_prep_move", ret);
+}
+
+void scx_cgroup_move_task(struct task_struct *p)
+{
+ struct scx_sched *sch = scx_root;
+
+ if (!scx_cgroup_enabled)
+ return;
+
+ /*
+ * @p must have ops.cgroup_prep_move() called on it and thus
+ * cgrp_moving_from set.
+ */
+ if (SCX_HAS_OP(sch, cgroup_move) &&
+ !WARN_ON_ONCE(!p->scx.cgrp_moving_from))
+ SCX_CALL_OP_TASK(sch, SCX_KF_UNLOCKED, cgroup_move, NULL,
+ p, p->scx.cgrp_moving_from,
+ tg_cgrp(task_group(p)));
+ p->scx.cgrp_moving_from = NULL;
+}
+
+void scx_cgroup_cancel_attach(struct cgroup_taskset *tset)
+{
+ struct scx_sched *sch = scx_root;
+ struct cgroup_subsys_state *css;
+ struct task_struct *p;
+
+ if (!scx_cgroup_enabled)
+ return;
+
+ cgroup_taskset_for_each(p, css, tset) {
+ if (SCX_HAS_OP(sch, cgroup_cancel_move) &&
+ p->scx.cgrp_moving_from)
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_cancel_move, NULL,
+ p, p->scx.cgrp_moving_from, css->cgroup);
+ p->scx.cgrp_moving_from = NULL;
+ }
+}
+
+void scx_group_set_weight(struct task_group *tg, unsigned long weight)
+{
+ struct scx_sched *sch = scx_root;
+
+ percpu_down_read(&scx_cgroup_ops_rwsem);
+
+ if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_weight) &&
+ tg->scx.weight != weight)
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_weight, NULL,
+ tg_cgrp(tg), weight);
+
+ tg->scx.weight = weight;
+
+ percpu_up_read(&scx_cgroup_ops_rwsem);
+}
+
+void scx_group_set_idle(struct task_group *tg, bool idle)
+{
+ struct scx_sched *sch = scx_root;
+
+ percpu_down_read(&scx_cgroup_ops_rwsem);
+
+ if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_idle))
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_idle, NULL,
+ tg_cgrp(tg), idle);
+
+ /* Update the task group's idle state */
+ tg->scx.idle = idle;
+
+ percpu_up_read(&scx_cgroup_ops_rwsem);
+}
+
+void scx_group_set_bandwidth(struct task_group *tg,
+ u64 period_us, u64 quota_us, u64 burst_us)
+{
+ struct scx_sched *sch = scx_root;
+
+ percpu_down_read(&scx_cgroup_ops_rwsem);
+
+ if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_bandwidth) &&
+ (tg->scx.bw_period_us != period_us ||
+ tg->scx.bw_quota_us != quota_us ||
+ tg->scx.bw_burst_us != burst_us))
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_bandwidth, NULL,
+ tg_cgrp(tg), period_us, quota_us, burst_us);
+
+ tg->scx.bw_period_us = period_us;
+ tg->scx.bw_quota_us = quota_us;
+ tg->scx.bw_burst_us = burst_us;
+
+ percpu_up_read(&scx_cgroup_ops_rwsem);
+}
+
+static void scx_cgroup_lock(void)
+{
+ percpu_down_write(&scx_cgroup_ops_rwsem);
+ cgroup_lock();
+}
+
+static void scx_cgroup_unlock(void)
+{
+ cgroup_unlock();
+ percpu_up_write(&scx_cgroup_ops_rwsem);
+}
+
+#else /* CONFIG_EXT_GROUP_SCHED */
+
+static void scx_cgroup_lock(void) {}
+static void scx_cgroup_unlock(void) {}
+
+#endif /* CONFIG_EXT_GROUP_SCHED */
+
+/*
+ * Omitted operations:
+ *
+ * - wakeup_preempt: NOOP as it isn't useful in the wakeup path because the task
+ * isn't tied to the CPU at that point. Preemption is implemented by resetting
+ * the victim task's slice to 0 and triggering reschedule on the target CPU.
+ *
+ * - migrate_task_rq: Unnecessary as task to cpu mapping is transient.
+ *
+ * - task_fork/dead: We need fork/dead notifications for all tasks regardless of
+ * their current sched_class. Call them directly from sched core instead.
+ */
+DEFINE_SCHED_CLASS(ext) = {
+ .queue_mask = 1,
+
+ .enqueue_task = enqueue_task_scx,
+ .dequeue_task = dequeue_task_scx,
+ .yield_task = yield_task_scx,
+ .yield_to_task = yield_to_task_scx,
+
+ .wakeup_preempt = wakeup_preempt_scx,
+
+ .pick_task = pick_task_scx,
+
+ .put_prev_task = put_prev_task_scx,
+ .set_next_task = set_next_task_scx,
+
+ .select_task_rq = select_task_rq_scx,
+ .task_woken = task_woken_scx,
+ .set_cpus_allowed = set_cpus_allowed_scx,
+
+ .rq_online = rq_online_scx,
+ .rq_offline = rq_offline_scx,
+
+ .task_tick = task_tick_scx,
+
+ .switching_to = switching_to_scx,
+ .switched_from = switched_from_scx,
+ .switched_to = switched_to_scx,
+ .reweight_task = reweight_task_scx,
+ .prio_changed = prio_changed_scx,
+
+ .update_curr = update_curr_scx,
+
+#ifdef CONFIG_UCLAMP_TASK
+ .uclamp_enabled = 1,
+#endif
+};
+
+static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id)
+{
+ memset(dsq, 0, sizeof(*dsq));
+
+ raw_spin_lock_init(&dsq->lock);
+ INIT_LIST_HEAD(&dsq->list);
+ dsq->id = dsq_id;
+}
+
+static void free_dsq_irq_workfn(struct irq_work *irq_work)
+{
+ struct llist_node *to_free = llist_del_all(&dsqs_to_free);
+ struct scx_dispatch_q *dsq, *tmp_dsq;
+
+ llist_for_each_entry_safe(dsq, tmp_dsq, to_free, free_node)
+ kfree_rcu(dsq, rcu);
+}
+
+static DEFINE_IRQ_WORK(free_dsq_irq_work, free_dsq_irq_workfn);
+
+static void destroy_dsq(struct scx_sched *sch, u64 dsq_id)
+{
+ struct scx_dispatch_q *dsq;
+ unsigned long flags;
+
+ rcu_read_lock();
+
+ dsq = find_user_dsq(sch, dsq_id);
+ if (!dsq)
+ goto out_unlock_rcu;
+
+ raw_spin_lock_irqsave(&dsq->lock, flags);
+
+ if (dsq->nr) {
+ scx_error(sch, "attempting to destroy in-use dsq 0x%016llx (nr=%u)",
+ dsq->id, dsq->nr);
+ goto out_unlock_dsq;
+ }
+
+ if (rhashtable_remove_fast(&sch->dsq_hash, &dsq->hash_node,
+ dsq_hash_params))
+ goto out_unlock_dsq;
+
+ /*
+ * Mark dead by invalidating ->id to prevent dispatch_enqueue() from
+ * queueing more tasks. As this function can be called from anywhere,
+ * freeing is bounced through an irq work to avoid nesting RCU
+ * operations inside scheduler locks.
+ */
+ dsq->id = SCX_DSQ_INVALID;
+ llist_add(&dsq->free_node, &dsqs_to_free);
+ irq_work_queue(&free_dsq_irq_work);
+
+out_unlock_dsq:
+ raw_spin_unlock_irqrestore(&dsq->lock, flags);
+out_unlock_rcu:
+ rcu_read_unlock();
+}
+
+#ifdef CONFIG_EXT_GROUP_SCHED
+static void scx_cgroup_exit(struct scx_sched *sch)
+{
+ struct cgroup_subsys_state *css;
+
+ scx_cgroup_enabled = false;
+
+ /*
+ * scx_tg_on/offline() are excluded through cgroup_lock(). If we walk
+ * cgroups and exit all the inited ones, all online cgroups are exited.
+ */
+ css_for_each_descendant_post(css, &root_task_group.css) {
+ struct task_group *tg = css_tg(css);
+
+ if (!(tg->scx.flags & SCX_TG_INITED))
+ continue;
+ tg->scx.flags &= ~SCX_TG_INITED;
+
+ if (!sch->ops.cgroup_exit)
+ continue;
+
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL,
+ css->cgroup);
+ }
+}
+
+static int scx_cgroup_init(struct scx_sched *sch)
+{
+ struct cgroup_subsys_state *css;
+ int ret;
+
+ /*
+ * scx_tg_on/offline() are excluded through cgroup_lock(). If we walk
+ * cgroups and init, all online cgroups are initialized.
+ */
+ css_for_each_descendant_pre(css, &root_task_group.css) {
+ struct task_group *tg = css_tg(css);
+ struct scx_cgroup_init_args args = {
+ .weight = tg->scx.weight,
+ .bw_period_us = tg->scx.bw_period_us,
+ .bw_quota_us = tg->scx.bw_quota_us,
+ .bw_burst_us = tg->scx.bw_burst_us,
+ };
+
+ if ((tg->scx.flags &
+ (SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE)
+ continue;
+
+ if (!sch->ops.cgroup_init) {
+ tg->scx.flags |= SCX_TG_INITED;
+ continue;
+ }
+
+ ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init, NULL,
+ css->cgroup, &args);
+ if (ret) {
+ css_put(css);
+ scx_error(sch, "ops.cgroup_init() failed (%d)", ret);
+ return ret;
+ }
+ tg->scx.flags |= SCX_TG_INITED;
+ }
+
+ WARN_ON_ONCE(scx_cgroup_enabled);
+ scx_cgroup_enabled = true;
+
+ return 0;
+}
+
+#else
+static void scx_cgroup_exit(struct scx_sched *sch) {}
+static int scx_cgroup_init(struct scx_sched *sch) { return 0; }
+#endif
+
+
+/********************************************************************************
+ * Sysfs interface and ops enable/disable.
+ */
+
+#define SCX_ATTR(_name) \
+ static struct kobj_attribute scx_attr_##_name = { \
+ .attr = { .name = __stringify(_name), .mode = 0444 }, \
+ .show = scx_attr_##_name##_show, \
+ }
+
+static ssize_t scx_attr_state_show(struct kobject *kobj,
+ struct kobj_attribute *ka, char *buf)
+{
+ return sysfs_emit(buf, "%s\n", scx_enable_state_str[scx_enable_state()]);
+}
+SCX_ATTR(state);
+
+static ssize_t scx_attr_switch_all_show(struct kobject *kobj,
+ struct kobj_attribute *ka, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", READ_ONCE(scx_switching_all));
+}
+SCX_ATTR(switch_all);
+
+static ssize_t scx_attr_nr_rejected_show(struct kobject *kobj,
+ struct kobj_attribute *ka, char *buf)
+{
+ return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_nr_rejected));
+}
+SCX_ATTR(nr_rejected);
+
+static ssize_t scx_attr_hotplug_seq_show(struct kobject *kobj,
+ struct kobj_attribute *ka, char *buf)
+{
+ return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_hotplug_seq));
+}
+SCX_ATTR(hotplug_seq);
+
+static ssize_t scx_attr_enable_seq_show(struct kobject *kobj,
+ struct kobj_attribute *ka, char *buf)
+{
+ return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_enable_seq));
+}
+SCX_ATTR(enable_seq);
+
+static struct attribute *scx_global_attrs[] = {
+ &scx_attr_state.attr,
+ &scx_attr_switch_all.attr,
+ &scx_attr_nr_rejected.attr,
+ &scx_attr_hotplug_seq.attr,
+ &scx_attr_enable_seq.attr,
+ NULL,
+};
+
+static const struct attribute_group scx_global_attr_group = {
+ .attrs = scx_global_attrs,
+};
+
+static void free_exit_info(struct scx_exit_info *ei);
+
+static void scx_sched_free_rcu_work(struct work_struct *work)
+{
+ struct rcu_work *rcu_work = to_rcu_work(work);
+ struct scx_sched *sch = container_of(rcu_work, struct scx_sched, rcu_work);
+ struct rhashtable_iter rht_iter;
+ struct scx_dispatch_q *dsq;
+ int node;
+
+ irq_work_sync(&sch->error_irq_work);
+ kthread_stop(sch->helper->task);
+
+ free_percpu(sch->pcpu);
+
+ for_each_node_state(node, N_POSSIBLE)
+ kfree(sch->global_dsqs[node]);
+ kfree(sch->global_dsqs);
+
+ rhashtable_walk_enter(&sch->dsq_hash, &rht_iter);
+ do {
+ rhashtable_walk_start(&rht_iter);
+
+ while ((dsq = rhashtable_walk_next(&rht_iter)) && !IS_ERR(dsq))
+ destroy_dsq(sch, dsq->id);
+
+ rhashtable_walk_stop(&rht_iter);
+ } while (dsq == ERR_PTR(-EAGAIN));
+ rhashtable_walk_exit(&rht_iter);
+
+ rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL);
+ free_exit_info(sch->exit_info);
+ kfree(sch);
+}
+
+static void scx_kobj_release(struct kobject *kobj)
+{
+ struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj);
+
+ INIT_RCU_WORK(&sch->rcu_work, scx_sched_free_rcu_work);
+ queue_rcu_work(system_unbound_wq, &sch->rcu_work);
+}
+
+static ssize_t scx_attr_ops_show(struct kobject *kobj,
+ struct kobj_attribute *ka, char *buf)
+{
+ return sysfs_emit(buf, "%s\n", scx_root->ops.name);
+}
+SCX_ATTR(ops);
+
+#define scx_attr_event_show(buf, at, events, kind) ({ \
+ sysfs_emit_at(buf, at, "%s %llu\n", #kind, (events)->kind); \
+})
+
+static ssize_t scx_attr_events_show(struct kobject *kobj,
+ struct kobj_attribute *ka, char *buf)
+{
+ struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj);
+ struct scx_event_stats events;
+ int at = 0;
+
+ scx_read_events(sch, &events);
+ at += scx_attr_event_show(buf, at, &events, SCX_EV_SELECT_CPU_FALLBACK);
+ at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
+ at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_KEEP_LAST);
+ at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_EXITING);
+ at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED);
+ at += scx_attr_event_show(buf, at, &events, SCX_EV_REFILL_SLICE_DFL);
+ at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DURATION);
+ at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DISPATCH);
+ at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_ACTIVATE);
+ return at;
+}
+SCX_ATTR(events);
+
+static struct attribute *scx_sched_attrs[] = {
+ &scx_attr_ops.attr,
+ &scx_attr_events.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(scx_sched);
+
+static const struct kobj_type scx_ktype = {
+ .release = scx_kobj_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = scx_sched_groups,
+};
+
+static int scx_uevent(const struct kobject *kobj, struct kobj_uevent_env *env)
+{
+ return add_uevent_var(env, "SCXOPS=%s", scx_root->ops.name);
+}
+
+static const struct kset_uevent_ops scx_uevent_ops = {
+ .uevent = scx_uevent,
+};
+
+/*
+ * Used by sched_fork() and __setscheduler_prio() to pick the matching
+ * sched_class. dl/rt are already handled.
+ */
+bool task_should_scx(int policy)
+{
+ if (!scx_enabled() || unlikely(scx_enable_state() == SCX_DISABLING))
+ return false;
+ if (READ_ONCE(scx_switching_all))
+ return true;
+ return policy == SCHED_EXT;
+}
+
+bool scx_allow_ttwu_queue(const struct task_struct *p)
+{
+ struct scx_sched *sch;
+
+ if (!scx_enabled())
+ return true;
+
+ sch = rcu_dereference_sched(scx_root);
+ if (unlikely(!sch))
+ return true;
+
+ if (sch->ops.flags & SCX_OPS_ALLOW_QUEUED_WAKEUP)
+ return true;
+
+ if (unlikely(p->sched_class != &ext_sched_class))
+ return true;
+
+ return false;
+}
+
+/**
+ * handle_lockup - sched_ext common lockup handler
+ * @fmt: format string
+ *
+ * Called on system stall or lockup condition and initiates abort of sched_ext
+ * if enabled, which may resolve the reported lockup.
+ *
+ * Returns %true if sched_ext is enabled and abort was initiated, which may
+ * resolve the lockup. %false if sched_ext is not enabled or abort was already
+ * initiated by someone else.
+ */
+static __printf(1, 2) bool handle_lockup(const char *fmt, ...)
+{
+ struct scx_sched *sch;
+ va_list args;
+ bool ret;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return false;
+
+ switch (scx_enable_state()) {
+ case SCX_ENABLING:
+ case SCX_ENABLED:
+ va_start(args, fmt);
+ ret = scx_verror(sch, fmt, args);
+ va_end(args);
+ return ret;
+ default:
+ return false;
+ }
+}
+
+/**
+ * scx_rcu_cpu_stall - sched_ext RCU CPU stall handler
+ *
+ * While there are various reasons why RCU CPU stalls can occur on a system
+ * that may not be caused by the current BPF scheduler, try kicking out the
+ * current scheduler in an attempt to recover the system to a good state before
+ * issuing panics.
+ *
+ * Returns %true if sched_ext is enabled and abort was initiated, which may
+ * resolve the reported RCU stall. %false if sched_ext is not enabled or someone
+ * else already initiated abort.
+ */
+bool scx_rcu_cpu_stall(void)
+{
+ return handle_lockup("RCU CPU stall detected!");
+}
+
+/**
+ * scx_softlockup - sched_ext softlockup handler
+ * @dur_s: number of seconds of CPU stuck due to soft lockup
+ *
+ * On some multi-socket setups (e.g. 2x Intel 8480c), the BPF scheduler can
+ * live-lock the system by making many CPUs target the same DSQ to the point
+ * where soft-lockup detection triggers. This function is called from
+ * soft-lockup watchdog when the triggering point is close and tries to unjam
+ * the system and aborting the BPF scheduler.
+ */
+void scx_softlockup(u32 dur_s)
+{
+ if (!handle_lockup("soft lockup - CPU %d stuck for %us", smp_processor_id(), dur_s))
+ return;
+
+ printk_deferred(KERN_ERR "sched_ext: Soft lockup - CPU %d stuck for %us, disabling BPF scheduler\n",
+ smp_processor_id(), dur_s);
+}
+
+/**
+ * scx_hardlockup - sched_ext hardlockup handler
+ *
+ * A poorly behaving BPF scheduler can trigger hard lockup by e.g. putting
+ * numerous affinitized tasks in a single queue and directing all CPUs at it.
+ * Try kicking out the current scheduler in an attempt to recover the system to
+ * a good state before taking more drastic actions.
+ *
+ * Returns %true if sched_ext is enabled and abort was initiated, which may
+ * resolve the reported hardlockdup. %false if sched_ext is not enabled or
+ * someone else already initiated abort.
+ */
+bool scx_hardlockup(int cpu)
+{
+ if (!handle_lockup("hard lockup - CPU %d", cpu))
+ return false;
+
+ printk_deferred(KERN_ERR "sched_ext: Hard lockup - CPU %d, disabling BPF scheduler\n",
+ cpu);
+ return true;
+}
+
+static u32 bypass_lb_cpu(struct scx_sched *sch, struct rq *rq,
+ struct cpumask *donee_mask, struct cpumask *resched_mask,
+ u32 nr_donor_target, u32 nr_donee_target)
+{
+ struct scx_dispatch_q *donor_dsq = &rq->scx.bypass_dsq;
+ struct task_struct *p, *n;
+ struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, 0, 0);
+ s32 delta = READ_ONCE(donor_dsq->nr) - nr_donor_target;
+ u32 nr_balanced = 0, min_delta_us;
+
+ /*
+ * All we want to guarantee is reasonable forward progress. No reason to
+ * fine tune. Assuming every task on @donor_dsq runs their full slice,
+ * consider offloading iff the total queued duration is over the
+ * threshold.
+ */
+ min_delta_us = scx_bypass_lb_intv_us / SCX_BYPASS_LB_MIN_DELTA_DIV;
+ if (delta < DIV_ROUND_UP(min_delta_us, scx_slice_bypass_us))
+ return 0;
+
+ raw_spin_rq_lock_irq(rq);
+ raw_spin_lock(&donor_dsq->lock);
+ list_add(&cursor.node, &donor_dsq->list);
+resume:
+ n = container_of(&cursor, struct task_struct, scx.dsq_list);
+ n = nldsq_next_task(donor_dsq, n, false);
+
+ while ((p = n)) {
+ struct rq *donee_rq;
+ struct scx_dispatch_q *donee_dsq;
+ int donee;
+
+ n = nldsq_next_task(donor_dsq, n, false);
+
+ if (donor_dsq->nr <= nr_donor_target)
+ break;
+
+ if (cpumask_empty(donee_mask))
+ break;
+
+ donee = cpumask_any_and_distribute(donee_mask, p->cpus_ptr);
+ if (donee >= nr_cpu_ids)
+ continue;
+
+ donee_rq = cpu_rq(donee);
+ donee_dsq = &donee_rq->scx.bypass_dsq;
+
+ /*
+ * $p's rq is not locked but $p's DSQ lock protects its
+ * scheduling properties making this test safe.
+ */
+ if (!task_can_run_on_remote_rq(sch, p, donee_rq, false))
+ continue;
+
+ /*
+ * Moving $p from one non-local DSQ to another. The source rq
+ * and DSQ are already locked. Do an abbreviated dequeue and
+ * then perform enqueue without unlocking $donor_dsq.
+ *
+ * We don't want to drop and reacquire the lock on each
+ * iteration as @donor_dsq can be very long and potentially
+ * highly contended. Donee DSQs are less likely to be contended.
+ * The nested locking is safe as only this LB moves tasks
+ * between bypass DSQs.
+ */
+ dispatch_dequeue_locked(p, donor_dsq);
+ dispatch_enqueue(sch, donee_dsq, p, SCX_ENQ_NESTED);
+
+ /*
+ * $donee might have been idle and need to be woken up. No need
+ * to be clever. Kick every CPU that receives tasks.
+ */
+ cpumask_set_cpu(donee, resched_mask);
+
+ if (READ_ONCE(donee_dsq->nr) >= nr_donee_target)
+ cpumask_clear_cpu(donee, donee_mask);
+
+ nr_balanced++;
+ if (!(nr_balanced % SCX_BYPASS_LB_BATCH) && n) {
+ list_move_tail(&cursor.node, &n->scx.dsq_list.node);
+ raw_spin_unlock(&donor_dsq->lock);
+ raw_spin_rq_unlock_irq(rq);
+ cpu_relax();
+ raw_spin_rq_lock_irq(rq);
+ raw_spin_lock(&donor_dsq->lock);
+ goto resume;
+ }
+ }
+
+ list_del_init(&cursor.node);
+ raw_spin_unlock(&donor_dsq->lock);
+ raw_spin_rq_unlock_irq(rq);
+
+ return nr_balanced;
+}
+
+static void bypass_lb_node(struct scx_sched *sch, int node)
+{
+ const struct cpumask *node_mask = cpumask_of_node(node);
+ struct cpumask *donee_mask = scx_bypass_lb_donee_cpumask;
+ struct cpumask *resched_mask = scx_bypass_lb_resched_cpumask;
+ u32 nr_tasks = 0, nr_cpus = 0, nr_balanced = 0;
+ u32 nr_target, nr_donor_target;
+ u32 before_min = U32_MAX, before_max = 0;
+ u32 after_min = U32_MAX, after_max = 0;
+ int cpu;
+
+ /* count the target tasks and CPUs */
+ for_each_cpu_and(cpu, cpu_online_mask, node_mask) {
+ u32 nr = READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr);
+
+ nr_tasks += nr;
+ nr_cpus++;
+
+ before_min = min(nr, before_min);
+ before_max = max(nr, before_max);
+ }
+
+ if (!nr_cpus)
+ return;
+
+ /*
+ * We don't want CPUs to have more than $nr_donor_target tasks and
+ * balancing to fill donee CPUs upto $nr_target. Once targets are
+ * calculated, find the donee CPUs.
+ */
+ nr_target = DIV_ROUND_UP(nr_tasks, nr_cpus);
+ nr_donor_target = DIV_ROUND_UP(nr_target * SCX_BYPASS_LB_DONOR_PCT, 100);
+
+ cpumask_clear(donee_mask);
+ for_each_cpu_and(cpu, cpu_online_mask, node_mask) {
+ if (READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr) < nr_target)
+ cpumask_set_cpu(cpu, donee_mask);
+ }
+
+ /* iterate !donee CPUs and see if they should be offloaded */
+ cpumask_clear(resched_mask);
+ for_each_cpu_and(cpu, cpu_online_mask, node_mask) {
+ struct rq *rq = cpu_rq(cpu);
+ struct scx_dispatch_q *donor_dsq = &rq->scx.bypass_dsq;
+
+ if (cpumask_empty(donee_mask))
+ break;
+ if (cpumask_test_cpu(cpu, donee_mask))
+ continue;
+ if (READ_ONCE(donor_dsq->nr) <= nr_donor_target)
+ continue;
+
+ nr_balanced += bypass_lb_cpu(sch, rq, donee_mask, resched_mask,
+ nr_donor_target, nr_target);
+ }
+
+ for_each_cpu(cpu, resched_mask) {
+ struct rq *rq = cpu_rq(cpu);
+
+ raw_spin_rq_lock_irq(rq);
+ resched_curr(rq);
+ raw_spin_rq_unlock_irq(rq);
+ }
+
+ for_each_cpu_and(cpu, cpu_online_mask, node_mask) {
+ u32 nr = READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr);
+
+ after_min = min(nr, after_min);
+ after_max = max(nr, after_max);
+
+ }
+
+ trace_sched_ext_bypass_lb(node, nr_cpus, nr_tasks, nr_balanced,
+ before_min, before_max, after_min, after_max);
+}
+
+/*
+ * In bypass mode, all tasks are put on the per-CPU bypass DSQs. If the machine
+ * is over-saturated and the BPF scheduler skewed tasks into few CPUs, some
+ * bypass DSQs can be overloaded. If there are enough tasks to saturate other
+ * lightly loaded CPUs, such imbalance can lead to very high execution latency
+ * on the overloaded CPUs and thus to hung tasks and RCU stalls. To avoid such
+ * outcomes, a simple load balancing mechanism is implemented by the following
+ * timer which runs periodically while bypass mode is in effect.
+ */
+static void scx_bypass_lb_timerfn(struct timer_list *timer)
+{
+ struct scx_sched *sch;
+ int node;
+ u32 intv_us;
+
+ sch = rcu_dereference_all(scx_root);
+ if (unlikely(!sch) || !READ_ONCE(scx_bypass_depth))
+ return;
+
+ for_each_node_with_cpus(node)
+ bypass_lb_node(sch, node);
+
+ intv_us = READ_ONCE(scx_bypass_lb_intv_us);
+ if (intv_us)
+ mod_timer(timer, jiffies + usecs_to_jiffies(intv_us));
+}
+
+static DEFINE_TIMER(scx_bypass_lb_timer, scx_bypass_lb_timerfn);
+
+/**
+ * scx_bypass - [Un]bypass scx_ops and guarantee forward progress
+ * @bypass: true for bypass, false for unbypass
+ *
+ * Bypassing guarantees that all runnable tasks make forward progress without
+ * trusting the BPF scheduler. We can't grab any mutexes or rwsems as they might
+ * be held by tasks that the BPF scheduler is forgetting to run, which
+ * unfortunately also excludes toggling the static branches.
+ *
+ * Let's work around by overriding a couple ops and modifying behaviors based on
+ * the DISABLING state and then cycling the queued tasks through dequeue/enqueue
+ * to force global FIFO scheduling.
+ *
+ * - ops.select_cpu() is ignored and the default select_cpu() is used.
+ *
+ * - ops.enqueue() is ignored and tasks are queued in simple global FIFO order.
+ * %SCX_OPS_ENQ_LAST is also ignored.
+ *
+ * - ops.dispatch() is ignored.
+ *
+ * - balance_scx() does not set %SCX_RQ_BAL_KEEP on non-zero slice as slice
+ * can't be trusted. Whenever a tick triggers, the running task is rotated to
+ * the tail of the queue with core_sched_at touched.
+ *
+ * - pick_next_task() suppresses zero slice warning.
+ *
+ * - scx_kick_cpu() is disabled to avoid irq_work malfunction during PM
+ * operations.
+ *
+ * - scx_prio_less() reverts to the default core_sched_at order.
+ */
+static void scx_bypass(bool bypass)
+{
+ static DEFINE_RAW_SPINLOCK(bypass_lock);
+ static unsigned long bypass_timestamp;
+ struct scx_sched *sch;
+ unsigned long flags;
+ int cpu;
+
+ raw_spin_lock_irqsave(&bypass_lock, flags);
+ sch = rcu_dereference_bh(scx_root);
+
+ if (bypass) {
+ u32 intv_us;
+
+ WRITE_ONCE(scx_bypass_depth, scx_bypass_depth + 1);
+ WARN_ON_ONCE(scx_bypass_depth <= 0);
+ if (scx_bypass_depth != 1)
+ goto unlock;
+ WRITE_ONCE(scx_slice_dfl, scx_slice_bypass_us * NSEC_PER_USEC);
+ bypass_timestamp = ktime_get_ns();
+ if (sch)
+ scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1);
+
+ intv_us = READ_ONCE(scx_bypass_lb_intv_us);
+ if (intv_us && !timer_pending(&scx_bypass_lb_timer)) {
+ scx_bypass_lb_timer.expires =
+ jiffies + usecs_to_jiffies(intv_us);
+ add_timer_global(&scx_bypass_lb_timer);
+ }
+ } else {
+ WRITE_ONCE(scx_bypass_depth, scx_bypass_depth - 1);
+ WARN_ON_ONCE(scx_bypass_depth < 0);
+ if (scx_bypass_depth != 0)
+ goto unlock;
+ WRITE_ONCE(scx_slice_dfl, SCX_SLICE_DFL);
+ if (sch)
+ scx_add_event(sch, SCX_EV_BYPASS_DURATION,
+ ktime_get_ns() - bypass_timestamp);
+ }
+
+ /*
+ * No task property is changing. We just need to make sure all currently
+ * queued tasks are re-queued according to the new scx_rq_bypassing()
+ * state. As an optimization, walk each rq's runnable_list instead of
+ * the scx_tasks list.
+ *
+ * This function can't trust the scheduler and thus can't use
+ * cpus_read_lock(). Walk all possible CPUs instead of online.
+ */
+ for_each_possible_cpu(cpu) {
+ struct rq *rq = cpu_rq(cpu);
+ struct task_struct *p, *n;
+
+ raw_spin_rq_lock(rq);
+
+ if (bypass) {
+ WARN_ON_ONCE(rq->scx.flags & SCX_RQ_BYPASSING);
+ rq->scx.flags |= SCX_RQ_BYPASSING;
+ } else {
+ WARN_ON_ONCE(!(rq->scx.flags & SCX_RQ_BYPASSING));
+ rq->scx.flags &= ~SCX_RQ_BYPASSING;
+ }
+
+ /*
+ * We need to guarantee that no tasks are on the BPF scheduler
+ * while bypassing. Either we see enabled or the enable path
+ * sees scx_rq_bypassing() before moving tasks to SCX.
+ */
+ if (!scx_enabled()) {
+ raw_spin_rq_unlock(rq);
+ continue;
+ }
+
+ /*
+ * The use of list_for_each_entry_safe_reverse() is required
+ * because each task is going to be removed from and added back
+ * to the runnable_list during iteration. Because they're added
+ * to the tail of the list, safe reverse iteration can still
+ * visit all nodes.
+ */
+ list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list,
+ scx.runnable_node) {
+ /* cycling deq/enq is enough, see the function comment */
+ scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
+ /* nothing */ ;
+ }
+ }
+
+ /* resched to restore ticks and idle state */
+ if (cpu_online(cpu) || cpu == smp_processor_id())
+ resched_curr(rq);
+
+ raw_spin_rq_unlock(rq);
+ }
+
+unlock:
+ raw_spin_unlock_irqrestore(&bypass_lock, flags);
+}
+
+static void free_exit_info(struct scx_exit_info *ei)
+{
+ kvfree(ei->dump);
+ kfree(ei->msg);
+ kfree(ei->bt);
+ kfree(ei);
+}
+
+static struct scx_exit_info *alloc_exit_info(size_t exit_dump_len)
+{
+ struct scx_exit_info *ei;
+
+ ei = kzalloc(sizeof(*ei), GFP_KERNEL);
+ if (!ei)
+ return NULL;
+
+ ei->bt = kcalloc(SCX_EXIT_BT_LEN, sizeof(ei->bt[0]), GFP_KERNEL);
+ ei->msg = kzalloc(SCX_EXIT_MSG_LEN, GFP_KERNEL);
+ ei->dump = kvzalloc(exit_dump_len, GFP_KERNEL);
+
+ if (!ei->bt || !ei->msg || !ei->dump) {
+ free_exit_info(ei);
+ return NULL;
+ }
+
+ return ei;
+}
+
+static const char *scx_exit_reason(enum scx_exit_kind kind)
+{
+ switch (kind) {
+ case SCX_EXIT_UNREG:
+ return "unregistered from user space";
+ case SCX_EXIT_UNREG_BPF:
+ return "unregistered from BPF";
+ case SCX_EXIT_UNREG_KERN:
+ return "unregistered from the main kernel";
+ case SCX_EXIT_SYSRQ:
+ return "disabled by sysrq-S";
+ case SCX_EXIT_ERROR:
+ return "runtime error";
+ case SCX_EXIT_ERROR_BPF:
+ return "scx_bpf_error";
+ case SCX_EXIT_ERROR_STALL:
+ return "runnable task stall";
+ default:
+ return "<UNKNOWN>";
+ }
+}
+
+static void free_kick_syncs(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct scx_kick_syncs **ksyncs = per_cpu_ptr(&scx_kick_syncs, cpu);
+ struct scx_kick_syncs *to_free;
+
+ to_free = rcu_replace_pointer(*ksyncs, NULL, true);
+ if (to_free)
+ kvfree_rcu(to_free, rcu);
+ }
+}
+
+static void scx_disable_workfn(struct kthread_work *work)
+{
+ struct scx_sched *sch = container_of(work, struct scx_sched, disable_work);
+ struct scx_exit_info *ei = sch->exit_info;
+ struct scx_task_iter sti;
+ struct task_struct *p;
+ int kind, cpu;
+
+ kind = atomic_read(&sch->exit_kind);
+ while (true) {
+ if (kind == SCX_EXIT_DONE) /* already disabled? */
+ return;
+ WARN_ON_ONCE(kind == SCX_EXIT_NONE);
+ if (atomic_try_cmpxchg(&sch->exit_kind, &kind, SCX_EXIT_DONE))
+ break;
+ }
+ ei->kind = kind;
+ ei->reason = scx_exit_reason(ei->kind);
+
+ /* guarantee forward progress by bypassing scx_ops */
+ scx_bypass(true);
+ WRITE_ONCE(scx_aborting, false);
+
+ switch (scx_set_enable_state(SCX_DISABLING)) {
+ case SCX_DISABLING:
+ WARN_ONCE(true, "sched_ext: duplicate disabling instance?");
+ break;
+ case SCX_DISABLED:
+ pr_warn("sched_ext: ops error detected without ops (%s)\n",
+ sch->exit_info->msg);
+ WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING);
+ goto done;
+ default:
+ break;
+ }
+
+ /*
+ * Here, every runnable task is guaranteed to make forward progress and
+ * we can safely use blocking synchronization constructs. Actually
+ * disable ops.
+ */
+ mutex_lock(&scx_enable_mutex);
+
+ static_branch_disable(&__scx_switched_all);
+ WRITE_ONCE(scx_switching_all, false);
+
+ /*
+ * Shut down cgroup support before tasks so that the cgroup attach path
+ * doesn't race against scx_exit_task().
+ */
+ scx_cgroup_lock();
+ scx_cgroup_exit(sch);
+ scx_cgroup_unlock();
+
+ /*
+ * The BPF scheduler is going away. All tasks including %TASK_DEAD ones
+ * must be switched out and exited synchronously.
+ */
+ percpu_down_write(&scx_fork_rwsem);
+
+ scx_init_task_enabled = false;
+
+ scx_task_iter_start(&sti);
+ while ((p = scx_task_iter_next_locked(&sti))) {
+ unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
+ const struct sched_class *old_class = p->sched_class;
+ const struct sched_class *new_class = scx_setscheduler_class(p);
+
+ update_rq_clock(task_rq(p));
+
+ if (old_class != new_class)
+ queue_flags |= DEQUEUE_CLASS;
+
+ scoped_guard (sched_change, p, queue_flags) {
+ p->sched_class = new_class;
+ }
+
+ scx_exit_task(p);
+ }
+ scx_task_iter_stop(&sti);
+ percpu_up_write(&scx_fork_rwsem);
+
+ /*
+ * Invalidate all the rq clocks to prevent getting outdated
+ * rq clocks from a previous scx scheduler.
+ */
+ for_each_possible_cpu(cpu) {
+ struct rq *rq = cpu_rq(cpu);
+ scx_rq_clock_invalidate(rq);
+ }
+
+ /* no task is on scx, turn off all the switches and flush in-progress calls */
+ static_branch_disable(&__scx_enabled);
+ bitmap_zero(sch->has_op, SCX_OPI_END);
+ scx_idle_disable();
+ synchronize_rcu();
+
+ if (ei->kind >= SCX_EXIT_ERROR) {
+ pr_err("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
+ sch->ops.name, ei->reason);
+
+ if (ei->msg[0] != '\0')
+ pr_err("sched_ext: %s: %s\n", sch->ops.name, ei->msg);
+#ifdef CONFIG_STACKTRACE
+ stack_trace_print(ei->bt, ei->bt_len, 2);
+#endif
+ } else {
+ pr_info("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
+ sch->ops.name, ei->reason);
+ }
+
+ if (sch->ops.exit)
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, exit, NULL, ei);
+
+ cancel_delayed_work_sync(&scx_watchdog_work);
+
+ /*
+ * scx_root clearing must be inside cpus_read_lock(). See
+ * handle_hotplug().
+ */
+ cpus_read_lock();
+ RCU_INIT_POINTER(scx_root, NULL);
+ cpus_read_unlock();
+
+ /*
+ * Delete the kobject from the hierarchy synchronously. Otherwise, sysfs
+ * could observe an object of the same name still in the hierarchy when
+ * the next scheduler is loaded.
+ */
+ kobject_del(&sch->kobj);
+
+ free_percpu(scx_dsp_ctx);
+ scx_dsp_ctx = NULL;
+ scx_dsp_max_batch = 0;
+ free_kick_syncs();
+
+ mutex_unlock(&scx_enable_mutex);
+
+ WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING);
+done:
+ scx_bypass(false);
+}
+
+static bool scx_claim_exit(struct scx_sched *sch, enum scx_exit_kind kind)
+{
+ int none = SCX_EXIT_NONE;
+
+ if (!atomic_try_cmpxchg(&sch->exit_kind, &none, kind))
+ return false;
+
+ /*
+ * Some CPUs may be trapped in the dispatch paths. Set the aborting
+ * flag to break potential live-lock scenarios, ensuring we can
+ * successfully reach scx_bypass().
+ */
+ WRITE_ONCE(scx_aborting, true);
+ return true;
+}
+
+static void scx_disable(enum scx_exit_kind kind)
+{
+ struct scx_sched *sch;
+
+ if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE))
+ kind = SCX_EXIT_ERROR;
+
+ rcu_read_lock();
+ sch = rcu_dereference(scx_root);
+ if (sch) {
+ scx_claim_exit(sch, kind);
+ kthread_queue_work(sch->helper, &sch->disable_work);
+ }
+ rcu_read_unlock();
+}
+
+static void dump_newline(struct seq_buf *s)
+{
+ trace_sched_ext_dump("");
+
+ /* @s may be zero sized and seq_buf triggers WARN if so */
+ if (s->size)
+ seq_buf_putc(s, '\n');
+}
+
+static __printf(2, 3) void dump_line(struct seq_buf *s, const char *fmt, ...)
+{
+ va_list args;
+
+#ifdef CONFIG_TRACEPOINTS
+ if (trace_sched_ext_dump_enabled()) {
+ /* protected by scx_dump_state()::dump_lock */
+ static char line_buf[SCX_EXIT_MSG_LEN];
+
+ va_start(args, fmt);
+ vscnprintf(line_buf, sizeof(line_buf), fmt, args);
+ va_end(args);
+
+ trace_sched_ext_dump(line_buf);
+ }
+#endif
+ /* @s may be zero sized and seq_buf triggers WARN if so */
+ if (s->size) {
+ va_start(args, fmt);
+ seq_buf_vprintf(s, fmt, args);
+ va_end(args);
+
+ seq_buf_putc(s, '\n');
+ }
+}
+
+static void dump_stack_trace(struct seq_buf *s, const char *prefix,
+ const unsigned long *bt, unsigned int len)
+{
+ unsigned int i;
+
+ for (i = 0; i < len; i++)
+ dump_line(s, "%s%pS", prefix, (void *)bt[i]);
+}
+
+static void ops_dump_init(struct seq_buf *s, const char *prefix)
+{
+ struct scx_dump_data *dd = &scx_dump_data;
+
+ lockdep_assert_irqs_disabled();
+
+ dd->cpu = smp_processor_id(); /* allow scx_bpf_dump() */
+ dd->first = true;
+ dd->cursor = 0;
+ dd->s = s;
+ dd->prefix = prefix;
+}
+
+static void ops_dump_flush(void)
+{
+ struct scx_dump_data *dd = &scx_dump_data;
+ char *line = dd->buf.line;
+
+ if (!dd->cursor)
+ return;
+
+ /*
+ * There's something to flush and this is the first line. Insert a blank
+ * line to distinguish ops dump.
+ */
+ if (dd->first) {
+ dump_newline(dd->s);
+ dd->first = false;
+ }
+
+ /*
+ * There may be multiple lines in $line. Scan and emit each line
+ * separately.
+ */
+ while (true) {
+ char *end = line;
+ char c;
+
+ while (*end != '\n' && *end != '\0')
+ end++;
+
+ /*
+ * If $line overflowed, it may not have newline at the end.
+ * Always emit with a newline.
+ */
+ c = *end;
+ *end = '\0';
+ dump_line(dd->s, "%s%s", dd->prefix, line);
+ if (c == '\0')
+ break;
+
+ /* move to the next line */
+ end++;
+ if (*end == '\0')
+ break;
+ line = end;
+ }
+
+ dd->cursor = 0;
+}
+
+static void ops_dump_exit(void)
+{
+ ops_dump_flush();
+ scx_dump_data.cpu = -1;
+}
+
+static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx,
+ struct task_struct *p, char marker)
+{
+ static unsigned long bt[SCX_EXIT_BT_LEN];
+ struct scx_sched *sch = scx_root;
+ char dsq_id_buf[19] = "(n/a)";
+ unsigned long ops_state = atomic_long_read(&p->scx.ops_state);
+ unsigned int bt_len = 0;
+
+ if (p->scx.dsq)
+ scnprintf(dsq_id_buf, sizeof(dsq_id_buf), "0x%llx",
+ (unsigned long long)p->scx.dsq->id);
+
+ dump_newline(s);
+ dump_line(s, " %c%c %s[%d] %+ldms",
+ marker, task_state_to_char(p), p->comm, p->pid,
+ jiffies_delta_msecs(p->scx.runnable_at, dctx->at_jiffies));
+ dump_line(s, " scx_state/flags=%u/0x%x dsq_flags=0x%x ops_state/qseq=%lu/%lu",
+ scx_get_task_state(p), p->scx.flags & ~SCX_TASK_STATE_MASK,
+ p->scx.dsq_flags, ops_state & SCX_OPSS_STATE_MASK,
+ ops_state >> SCX_OPSS_QSEQ_SHIFT);
+ dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s",
+ p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf);
+ dump_line(s, " dsq_vtime=%llu slice=%llu weight=%u",
+ p->scx.dsq_vtime, p->scx.slice, p->scx.weight);
+ dump_line(s, " cpus=%*pb no_mig=%u", cpumask_pr_args(p->cpus_ptr),
+ p->migration_disabled);
+
+ if (SCX_HAS_OP(sch, dump_task)) {
+ ops_dump_init(s, " ");
+ SCX_CALL_OP(sch, SCX_KF_REST, dump_task, NULL, dctx, p);
+ ops_dump_exit();
+ }
+
+#ifdef CONFIG_STACKTRACE
+ bt_len = stack_trace_save_tsk(p, bt, SCX_EXIT_BT_LEN, 1);
+#endif
+ if (bt_len) {
+ dump_newline(s);
+ dump_stack_trace(s, " ", bt, bt_len);
+ }
+}
+
+static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
+{
+ static DEFINE_SPINLOCK(dump_lock);
+ static const char trunc_marker[] = "\n\n~~~~ TRUNCATED ~~~~\n";
+ struct scx_sched *sch = scx_root;
+ struct scx_dump_ctx dctx = {
+ .kind = ei->kind,
+ .exit_code = ei->exit_code,
+ .reason = ei->reason,
+ .at_ns = ktime_get_ns(),
+ .at_jiffies = jiffies,
+ };
+ struct seq_buf s;
+ struct scx_event_stats events;
+ unsigned long flags;
+ char *buf;
+ int cpu;
+
+ spin_lock_irqsave(&dump_lock, flags);
+
+ seq_buf_init(&s, ei->dump, dump_len);
+
+ if (ei->kind == SCX_EXIT_NONE) {
+ dump_line(&s, "Debug dump triggered by %s", ei->reason);
+ } else {
+ dump_line(&s, "%s[%d] triggered exit kind %d:",
+ current->comm, current->pid, ei->kind);
+ dump_line(&s, " %s (%s)", ei->reason, ei->msg);
+ dump_newline(&s);
+ dump_line(&s, "Backtrace:");
+ dump_stack_trace(&s, " ", ei->bt, ei->bt_len);
+ }
+
+ if (SCX_HAS_OP(sch, dump)) {
+ ops_dump_init(&s, "");
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, dump, NULL, &dctx);
+ ops_dump_exit();
+ }
+
+ dump_newline(&s);
+ dump_line(&s, "CPU states");
+ dump_line(&s, "----------");
+
+ for_each_possible_cpu(cpu) {
+ struct rq *rq = cpu_rq(cpu);
+ struct rq_flags rf;
+ struct task_struct *p;
+ struct seq_buf ns;
+ size_t avail, used;
+ bool idle;
+
+ rq_lock_irqsave(rq, &rf);
+
+ idle = list_empty(&rq->scx.runnable_list) &&
+ rq->curr->sched_class == &idle_sched_class;
+
+ if (idle && !SCX_HAS_OP(sch, dump_cpu))
+ goto next;
+
+ /*
+ * We don't yet know whether ops.dump_cpu() will produce output
+ * and we may want to skip the default CPU dump if it doesn't.
+ * Use a nested seq_buf to generate the standard dump so that we
+ * can decide whether to commit later.
+ */
+ avail = seq_buf_get_buf(&s, &buf);
+ seq_buf_init(&ns, buf, avail);
+
+ dump_newline(&ns);
+ dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x cpu_rel=%d ops_qseq=%lu ksync=%lu",
+ cpu, rq->scx.nr_running, rq->scx.flags,
+ rq->scx.cpu_released, rq->scx.ops_qseq,
+ rq->scx.kick_sync);
+ dump_line(&ns, " curr=%s[%d] class=%ps",
+ rq->curr->comm, rq->curr->pid,
+ rq->curr->sched_class);
+ if (!cpumask_empty(rq->scx.cpus_to_kick))
+ dump_line(&ns, " cpus_to_kick : %*pb",
+ cpumask_pr_args(rq->scx.cpus_to_kick));
+ if (!cpumask_empty(rq->scx.cpus_to_kick_if_idle))
+ dump_line(&ns, " idle_to_kick : %*pb",
+ cpumask_pr_args(rq->scx.cpus_to_kick_if_idle));
+ if (!cpumask_empty(rq->scx.cpus_to_preempt))
+ dump_line(&ns, " cpus_to_preempt: %*pb",
+ cpumask_pr_args(rq->scx.cpus_to_preempt));
+ if (!cpumask_empty(rq->scx.cpus_to_wait))
+ dump_line(&ns, " cpus_to_wait : %*pb",
+ cpumask_pr_args(rq->scx.cpus_to_wait));
+
+ used = seq_buf_used(&ns);
+ if (SCX_HAS_OP(sch, dump_cpu)) {
+ ops_dump_init(&ns, " ");
+ SCX_CALL_OP(sch, SCX_KF_REST, dump_cpu, NULL,
+ &dctx, cpu, idle);
+ ops_dump_exit();
+ }
+
+ /*
+ * If idle && nothing generated by ops.dump_cpu(), there's
+ * nothing interesting. Skip.
+ */
+ if (idle && used == seq_buf_used(&ns))
+ goto next;
+
+ /*
+ * $s may already have overflowed when $ns was created. If so,
+ * calling commit on it will trigger BUG.
+ */
+ if (avail) {
+ seq_buf_commit(&s, seq_buf_used(&ns));
+ if (seq_buf_has_overflowed(&ns))
+ seq_buf_set_overflow(&s);
+ }
+
+ if (rq->curr->sched_class == &ext_sched_class)
+ scx_dump_task(&s, &dctx, rq->curr, '*');
+
+ list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node)
+ scx_dump_task(&s, &dctx, p, ' ');
+ next:
+ rq_unlock_irqrestore(rq, &rf);
+ }
+
+ dump_newline(&s);
+ dump_line(&s, "Event counters");
+ dump_line(&s, "--------------");
+
+ scx_read_events(sch, &events);
+ scx_dump_event(s, &events, SCX_EV_SELECT_CPU_FALLBACK);
+ scx_dump_event(s, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
+ scx_dump_event(s, &events, SCX_EV_DISPATCH_KEEP_LAST);
+ scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_EXITING);
+ scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED);
+ scx_dump_event(s, &events, SCX_EV_REFILL_SLICE_DFL);
+ scx_dump_event(s, &events, SCX_EV_BYPASS_DURATION);
+ scx_dump_event(s, &events, SCX_EV_BYPASS_DISPATCH);
+ scx_dump_event(s, &events, SCX_EV_BYPASS_ACTIVATE);
+
+ if (seq_buf_has_overflowed(&s) && dump_len >= sizeof(trunc_marker))
+ memcpy(ei->dump + dump_len - sizeof(trunc_marker),
+ trunc_marker, sizeof(trunc_marker));
+
+ spin_unlock_irqrestore(&dump_lock, flags);
+}
+
+static void scx_error_irq_workfn(struct irq_work *irq_work)
+{
+ struct scx_sched *sch = container_of(irq_work, struct scx_sched, error_irq_work);
+ struct scx_exit_info *ei = sch->exit_info;
+
+ if (ei->kind >= SCX_EXIT_ERROR)
+ scx_dump_state(ei, sch->ops.exit_dump_len);
+
+ kthread_queue_work(sch->helper, &sch->disable_work);
+}
+
+static bool scx_vexit(struct scx_sched *sch,
+ enum scx_exit_kind kind, s64 exit_code,
+ const char *fmt, va_list args)
+{
+ struct scx_exit_info *ei = sch->exit_info;
+
+ if (!scx_claim_exit(sch, kind))
+ return false;
+
+ ei->exit_code = exit_code;
+#ifdef CONFIG_STACKTRACE
+ if (kind >= SCX_EXIT_ERROR)
+ ei->bt_len = stack_trace_save(ei->bt, SCX_EXIT_BT_LEN, 1);
+#endif
+ vscnprintf(ei->msg, SCX_EXIT_MSG_LEN, fmt, args);
+
+ /*
+ * Set ei->kind and ->reason for scx_dump_state(). They'll be set again
+ * in scx_disable_workfn().
+ */
+ ei->kind = kind;
+ ei->reason = scx_exit_reason(ei->kind);
+
+ irq_work_queue(&sch->error_irq_work);
+ return true;
+}
+
+static int alloc_kick_syncs(void)
+{
+ int cpu;
+
+ /*
+ * Allocate per-CPU arrays sized by nr_cpu_ids. Use kvzalloc as size
+ * can exceed percpu allocator limits on large machines.
+ */
+ for_each_possible_cpu(cpu) {
+ struct scx_kick_syncs **ksyncs = per_cpu_ptr(&scx_kick_syncs, cpu);
+ struct scx_kick_syncs *new_ksyncs;
+
+ WARN_ON_ONCE(rcu_access_pointer(*ksyncs));
+
+ new_ksyncs = kvzalloc_node(struct_size(new_ksyncs, syncs, nr_cpu_ids),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (!new_ksyncs) {
+ free_kick_syncs();
+ return -ENOMEM;
+ }
+
+ rcu_assign_pointer(*ksyncs, new_ksyncs);
+ }
+
+ return 0;
+}
+
+static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
+{
+ struct scx_sched *sch;
+ int node, ret;
+
+ sch = kzalloc(sizeof(*sch), GFP_KERNEL);
+ if (!sch)
+ return ERR_PTR(-ENOMEM);
+
+ sch->exit_info = alloc_exit_info(ops->exit_dump_len);
+ if (!sch->exit_info) {
+ ret = -ENOMEM;
+ goto err_free_sch;
+ }
+
+ ret = rhashtable_init(&sch->dsq_hash, &dsq_hash_params);
+ if (ret < 0)
+ goto err_free_ei;
+
+ sch->global_dsqs = kcalloc(nr_node_ids, sizeof(sch->global_dsqs[0]),
+ GFP_KERNEL);
+ if (!sch->global_dsqs) {
+ ret = -ENOMEM;
+ goto err_free_hash;
+ }
+
+ for_each_node_state(node, N_POSSIBLE) {
+ struct scx_dispatch_q *dsq;
+
+ dsq = kzalloc_node(sizeof(*dsq), GFP_KERNEL, node);
+ if (!dsq) {
+ ret = -ENOMEM;
+ goto err_free_gdsqs;
+ }
+
+ init_dsq(dsq, SCX_DSQ_GLOBAL);
+ sch->global_dsqs[node] = dsq;
+ }
+
+ sch->pcpu = alloc_percpu(struct scx_sched_pcpu);
+ if (!sch->pcpu)
+ goto err_free_gdsqs;
+
+ sch->helper = kthread_run_worker(0, "sched_ext_helper");
+ if (IS_ERR(sch->helper)) {
+ ret = PTR_ERR(sch->helper);
+ goto err_free_pcpu;
+ }
+
+ sched_set_fifo(sch->helper->task);
+
+ atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
+ init_irq_work(&sch->error_irq_work, scx_error_irq_workfn);
+ kthread_init_work(&sch->disable_work, scx_disable_workfn);
+ sch->ops = *ops;
+ ops->priv = sch;
+
+ sch->kobj.kset = scx_kset;
+ ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");
+ if (ret < 0)
+ goto err_stop_helper;
+
+ return sch;
+
+err_stop_helper:
+ kthread_stop(sch->helper->task);
+err_free_pcpu:
+ free_percpu(sch->pcpu);
+err_free_gdsqs:
+ for_each_node_state(node, N_POSSIBLE)
+ kfree(sch->global_dsqs[node]);
+ kfree(sch->global_dsqs);
+err_free_hash:
+ rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL);
+err_free_ei:
+ free_exit_info(sch->exit_info);
+err_free_sch:
+ kfree(sch);
+ return ERR_PTR(ret);
+}
+
+static int check_hotplug_seq(struct scx_sched *sch,
+ const struct sched_ext_ops *ops)
+{
+ unsigned long long global_hotplug_seq;
+
+ /*
+ * If a hotplug event has occurred between when a scheduler was
+ * initialized, and when we were able to attach, exit and notify user
+ * space about it.
+ */
+ if (ops->hotplug_seq) {
+ global_hotplug_seq = atomic_long_read(&scx_hotplug_seq);
+ if (ops->hotplug_seq != global_hotplug_seq) {
+ scx_exit(sch, SCX_EXIT_UNREG_KERN,
+ SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG,
+ "expected hotplug seq %llu did not match actual %llu",
+ ops->hotplug_seq, global_hotplug_seq);
+ return -EBUSY;
+ }
+ }
+
+ return 0;
+}
+
+static int validate_ops(struct scx_sched *sch, const struct sched_ext_ops *ops)
+{
+ /*
+ * It doesn't make sense to specify the SCX_OPS_ENQ_LAST flag if the
+ * ops.enqueue() callback isn't implemented.
+ */
+ if ((ops->flags & SCX_OPS_ENQ_LAST) && !ops->enqueue) {
+ scx_error(sch, "SCX_OPS_ENQ_LAST requires ops.enqueue() to be implemented");
+ return -EINVAL;
+ }
+
+ /*
+ * SCX_OPS_BUILTIN_IDLE_PER_NODE requires built-in CPU idle
+ * selection policy to be enabled.
+ */
+ if ((ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE) &&
+ (ops->update_idle && !(ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE))) {
+ scx_error(sch, "SCX_OPS_BUILTIN_IDLE_PER_NODE requires CPU idle selection enabled");
+ return -EINVAL;
+ }
+
+ if (ops->flags & SCX_OPS_HAS_CGROUP_WEIGHT)
+ pr_warn("SCX_OPS_HAS_CGROUP_WEIGHT is deprecated and a noop\n");
+
+ if (ops->cpu_acquire || ops->cpu_release)
+ pr_warn("ops->cpu_acquire/release() are deprecated, use sched_switch TP instead\n");
+
+ return 0;
+}
+
+static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
+{
+ struct scx_sched *sch;
+ struct scx_task_iter sti;
+ struct task_struct *p;
+ unsigned long timeout;
+ int i, cpu, ret;
+
+ if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN),
+ cpu_possible_mask)) {
+ pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n");
+ return -EINVAL;
+ }
+
+ mutex_lock(&scx_enable_mutex);
+
+ if (scx_enable_state() != SCX_DISABLED) {
+ ret = -EBUSY;
+ goto err_unlock;
+ }
+
+ ret = alloc_kick_syncs();
+ if (ret)
+ goto err_unlock;
+
+ sch = scx_alloc_and_add_sched(ops);
+ if (IS_ERR(sch)) {
+ ret = PTR_ERR(sch);
+ goto err_free_ksyncs;
+ }
+
+ /*
+ * Transition to ENABLING and clear exit info to arm the disable path.
+ * Failure triggers full disabling from here on.
+ */
+ WARN_ON_ONCE(scx_set_enable_state(SCX_ENABLING) != SCX_DISABLED);
+ WARN_ON_ONCE(scx_root);
+ if (WARN_ON_ONCE(READ_ONCE(scx_aborting)))
+ WRITE_ONCE(scx_aborting, false);
+
+ atomic_long_set(&scx_nr_rejected, 0);
+
+ for_each_possible_cpu(cpu)
+ cpu_rq(cpu)->scx.cpuperf_target = SCX_CPUPERF_ONE;
+
+ /*
+ * Keep CPUs stable during enable so that the BPF scheduler can track
+ * online CPUs by watching ->on/offline_cpu() after ->init().
+ */
+ cpus_read_lock();
+
+ /*
+ * Make the scheduler instance visible. Must be inside cpus_read_lock().
+ * See handle_hotplug().
+ */
+ rcu_assign_pointer(scx_root, sch);
+
+ scx_idle_enable(ops);
+
+ if (sch->ops.init) {
+ ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, init, NULL);
+ if (ret) {
+ ret = ops_sanitize_err(sch, "init", ret);
+ cpus_read_unlock();
+ scx_error(sch, "ops.init() failed (%d)", ret);
+ goto err_disable;
+ }
+ sch->exit_info->flags |= SCX_EFLAG_INITIALIZED;
+ }
+
+ for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++)
+ if (((void (**)(void))ops)[i])
+ set_bit(i, sch->has_op);
+
+ ret = check_hotplug_seq(sch, ops);
+ if (ret) {
+ cpus_read_unlock();
+ goto err_disable;
+ }
+ scx_idle_update_selcpu_topology(ops);
+
+ cpus_read_unlock();
+
+ ret = validate_ops(sch, ops);
+ if (ret)
+ goto err_disable;
+
+ WARN_ON_ONCE(scx_dsp_ctx);
+ scx_dsp_max_batch = ops->dispatch_max_batch ?: SCX_DSP_DFL_MAX_BATCH;
+ scx_dsp_ctx = __alloc_percpu(struct_size_t(struct scx_dsp_ctx, buf,
+ scx_dsp_max_batch),
+ __alignof__(struct scx_dsp_ctx));
+ if (!scx_dsp_ctx) {
+ ret = -ENOMEM;
+ goto err_disable;
+ }
+
+ if (ops->timeout_ms)
+ timeout = msecs_to_jiffies(ops->timeout_ms);
+ else
+ timeout = SCX_WATCHDOG_MAX_TIMEOUT;
+
+ WRITE_ONCE(scx_watchdog_timeout, timeout);
+ WRITE_ONCE(scx_watchdog_timestamp, jiffies);
+ queue_delayed_work(system_unbound_wq, &scx_watchdog_work,
+ scx_watchdog_timeout / 2);
+
+ /*
+ * Once __scx_enabled is set, %current can be switched to SCX anytime.
+ * This can lead to stalls as some BPF schedulers (e.g. userspace
+ * scheduling) may not function correctly before all tasks are switched.
+ * Init in bypass mode to guarantee forward progress.
+ */
+ scx_bypass(true);
+
+ for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++)
+ if (((void (**)(void))ops)[i])
+ set_bit(i, sch->has_op);
+
+ if (sch->ops.cpu_acquire || sch->ops.cpu_release)
+ sch->ops.flags |= SCX_OPS_HAS_CPU_PREEMPT;
+
+ /*
+ * Lock out forks, cgroup on/offlining and moves before opening the
+ * floodgate so that they don't wander into the operations prematurely.
+ */
+ percpu_down_write(&scx_fork_rwsem);
+
+ WARN_ON_ONCE(scx_init_task_enabled);
+ scx_init_task_enabled = true;
+
+ /*
+ * Enable ops for every task. Fork is excluded by scx_fork_rwsem
+ * preventing new tasks from being added. No need to exclude tasks
+ * leaving as sched_ext_free() can handle both prepped and enabled
+ * tasks. Prep all tasks first and then enable them with preemption
+ * disabled.
+ *
+ * All cgroups should be initialized before scx_init_task() so that the
+ * BPF scheduler can reliably track each task's cgroup membership from
+ * scx_init_task(). Lock out cgroup on/offlining and task migrations
+ * while tasks are being initialized so that scx_cgroup_can_attach()
+ * never sees uninitialized tasks.
+ */
+ scx_cgroup_lock();
+ ret = scx_cgroup_init(sch);
+ if (ret)
+ goto err_disable_unlock_all;
+
+ scx_task_iter_start(&sti);
+ while ((p = scx_task_iter_next_locked(&sti))) {
+ /*
+ * @p may already be dead, have lost all its usages counts and
+ * be waiting for RCU grace period before being freed. @p can't
+ * be initialized for SCX in such cases and should be ignored.
+ */
+ if (!tryget_task_struct(p))
+ continue;
+
+ scx_task_iter_unlock(&sti);
+
+ ret = scx_init_task(p, task_group(p), false);
+ if (ret) {
+ put_task_struct(p);
+ scx_task_iter_stop(&sti);
+ scx_error(sch, "ops.init_task() failed (%d) for %s[%d]",
+ ret, p->comm, p->pid);
+ goto err_disable_unlock_all;
+ }
+
+ scx_set_task_state(p, SCX_TASK_READY);
+
+ put_task_struct(p);
+ }
+ scx_task_iter_stop(&sti);
+ scx_cgroup_unlock();
+ percpu_up_write(&scx_fork_rwsem);
+
+ /*
+ * All tasks are READY. It's safe to turn on scx_enabled() and switch
+ * all eligible tasks.
+ */
+ WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL));
+ static_branch_enable(&__scx_enabled);
+
+ /*
+ * We're fully committed and can't fail. The task READY -> ENABLED
+ * transitions here are synchronized against sched_ext_free() through
+ * scx_tasks_lock.
+ */
+ percpu_down_write(&scx_fork_rwsem);
+ scx_task_iter_start(&sti);
+ while ((p = scx_task_iter_next_locked(&sti))) {
+ unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
+ const struct sched_class *old_class = p->sched_class;
+ const struct sched_class *new_class = scx_setscheduler_class(p);
+
+ if (scx_get_task_state(p) != SCX_TASK_READY)
+ continue;
+
+ if (old_class != new_class)
+ queue_flags |= DEQUEUE_CLASS;
+
+ scoped_guard (sched_change, p, queue_flags) {
+ p->scx.slice = READ_ONCE(scx_slice_dfl);
+ p->sched_class = new_class;
+ }
+ }
+ scx_task_iter_stop(&sti);
+ percpu_up_write(&scx_fork_rwsem);
+
+ scx_bypass(false);
+
+ if (!scx_tryset_enable_state(SCX_ENABLED, SCX_ENABLING)) {
+ WARN_ON_ONCE(atomic_read(&sch->exit_kind) == SCX_EXIT_NONE);
+ goto err_disable;
+ }
+
+ if (!(ops->flags & SCX_OPS_SWITCH_PARTIAL))
+ static_branch_enable(&__scx_switched_all);
+
+ pr_info("sched_ext: BPF scheduler \"%s\" enabled%s\n",
+ sch->ops.name, scx_switched_all() ? "" : " (partial)");
+ kobject_uevent(&sch->kobj, KOBJ_ADD);
+ mutex_unlock(&scx_enable_mutex);
+
+ atomic_long_inc(&scx_enable_seq);
+
+ return 0;
+
+err_free_ksyncs:
+ free_kick_syncs();
+err_unlock:
+ mutex_unlock(&scx_enable_mutex);
+ return ret;
+
+err_disable_unlock_all:
+ scx_cgroup_unlock();
+ percpu_up_write(&scx_fork_rwsem);
+ /* we'll soon enter disable path, keep bypass on */
+err_disable:
+ mutex_unlock(&scx_enable_mutex);
+ /*
+ * Returning an error code here would not pass all the error information
+ * to userspace. Record errno using scx_error() for cases scx_error()
+ * wasn't already invoked and exit indicating success so that the error
+ * is notified through ops.exit() with all the details.
+ *
+ * Flush scx_disable_work to ensure that error is reported before init
+ * completion. sch's base reference will be put by bpf_scx_unreg().
+ */
+ scx_error(sch, "scx_enable() failed (%d)", ret);
+ kthread_flush_work(&sch->disable_work);
+ return 0;
+}
+
+
+/********************************************************************************
+ * bpf_struct_ops plumbing.
+ */
+#include <linux/bpf_verifier.h>
+#include <linux/bpf.h>
+#include <linux/btf.h>
+
+static const struct btf_type *task_struct_type;
+
+static bool bpf_scx_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ if (type != BPF_READ)
+ return false;
+ if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS)
+ return false;
+ if (off % size != 0)
+ return false;
+
+ return btf_ctx_access(off, size, type, prog, info);
+}
+
+static int bpf_scx_btf_struct_access(struct bpf_verifier_log *log,
+ const struct bpf_reg_state *reg, int off,
+ int size)
+{
+ const struct btf_type *t;
+
+ t = btf_type_by_id(reg->btf, reg->btf_id);
+ if (t == task_struct_type) {
+ if (off >= offsetof(struct task_struct, scx.slice) &&
+ off + size <= offsetofend(struct task_struct, scx.slice))
+ return SCALAR_VALUE;
+ if (off >= offsetof(struct task_struct, scx.dsq_vtime) &&
+ off + size <= offsetofend(struct task_struct, scx.dsq_vtime))
+ return SCALAR_VALUE;
+ if (off >= offsetof(struct task_struct, scx.disallow) &&
+ off + size <= offsetofend(struct task_struct, scx.disallow))
+ return SCALAR_VALUE;
+ }
+
+ return -EACCES;
+}
+
+static const struct bpf_verifier_ops bpf_scx_verifier_ops = {
+ .get_func_proto = bpf_base_func_proto,
+ .is_valid_access = bpf_scx_is_valid_access,
+ .btf_struct_access = bpf_scx_btf_struct_access,
+};
+
+static int bpf_scx_init_member(const struct btf_type *t,
+ const struct btf_member *member,
+ void *kdata, const void *udata)
+{
+ const struct sched_ext_ops *uops = udata;
+ struct sched_ext_ops *ops = kdata;
+ u32 moff = __btf_member_bit_offset(t, member) / 8;
+ int ret;
+
+ switch (moff) {
+ case offsetof(struct sched_ext_ops, dispatch_max_batch):
+ if (*(u32 *)(udata + moff) > INT_MAX)
+ return -E2BIG;
+ ops->dispatch_max_batch = *(u32 *)(udata + moff);
+ return 1;
+ case offsetof(struct sched_ext_ops, flags):
+ if (*(u64 *)(udata + moff) & ~SCX_OPS_ALL_FLAGS)
+ return -EINVAL;
+ ops->flags = *(u64 *)(udata + moff);
+ return 1;
+ case offsetof(struct sched_ext_ops, name):
+ ret = bpf_obj_name_cpy(ops->name, uops->name,
+ sizeof(ops->name));
+ if (ret < 0)
+ return ret;
+ if (ret == 0)
+ return -EINVAL;
+ return 1;
+ case offsetof(struct sched_ext_ops, timeout_ms):
+ if (msecs_to_jiffies(*(u32 *)(udata + moff)) >
+ SCX_WATCHDOG_MAX_TIMEOUT)
+ return -E2BIG;
+ ops->timeout_ms = *(u32 *)(udata + moff);
+ return 1;
+ case offsetof(struct sched_ext_ops, exit_dump_len):
+ ops->exit_dump_len =
+ *(u32 *)(udata + moff) ?: SCX_EXIT_DUMP_DFL_LEN;
+ return 1;
+ case offsetof(struct sched_ext_ops, hotplug_seq):
+ ops->hotplug_seq = *(u64 *)(udata + moff);
+ return 1;
+ }
+
+ return 0;
+}
+
+static int bpf_scx_check_member(const struct btf_type *t,
+ const struct btf_member *member,
+ const struct bpf_prog *prog)
+{
+ u32 moff = __btf_member_bit_offset(t, member) / 8;
+
+ switch (moff) {
+ case offsetof(struct sched_ext_ops, init_task):
+#ifdef CONFIG_EXT_GROUP_SCHED
+ case offsetof(struct sched_ext_ops, cgroup_init):
+ case offsetof(struct sched_ext_ops, cgroup_exit):
+ case offsetof(struct sched_ext_ops, cgroup_prep_move):
+#endif
+ case offsetof(struct sched_ext_ops, cpu_online):
+ case offsetof(struct sched_ext_ops, cpu_offline):
+ case offsetof(struct sched_ext_ops, init):
+ case offsetof(struct sched_ext_ops, exit):
+ break;
+ default:
+ if (prog->sleepable)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int bpf_scx_reg(void *kdata, struct bpf_link *link)
+{
+ return scx_enable(kdata, link);
+}
+
+static void bpf_scx_unreg(void *kdata, struct bpf_link *link)
+{
+ struct sched_ext_ops *ops = kdata;
+ struct scx_sched *sch = ops->priv;
+
+ scx_disable(SCX_EXIT_UNREG);
+ kthread_flush_work(&sch->disable_work);
+ kobject_put(&sch->kobj);
+}
+
+static int bpf_scx_init(struct btf *btf)
+{
+ task_struct_type = btf_type_by_id(btf, btf_tracing_ids[BTF_TRACING_TYPE_TASK]);
+
+ return 0;
+}
+
+static int bpf_scx_update(void *kdata, void *old_kdata, struct bpf_link *link)
+{
+ /*
+ * sched_ext does not support updating the actively-loaded BPF
+ * scheduler, as registering a BPF scheduler can always fail if the
+ * scheduler returns an error code for e.g. ops.init(), ops.init_task(),
+ * etc. Similarly, we can always race with unregistration happening
+ * elsewhere, such as with sysrq.
+ */
+ return -EOPNOTSUPP;
+}
+
+static int bpf_scx_validate(void *kdata)
+{
+ return 0;
+}
+
+static s32 sched_ext_ops__select_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags) { return -EINVAL; }
+static void sched_ext_ops__enqueue(struct task_struct *p, u64 enq_flags) {}
+static void sched_ext_ops__dequeue(struct task_struct *p, u64 enq_flags) {}
+static void sched_ext_ops__dispatch(s32 prev_cpu, struct task_struct *prev__nullable) {}
+static void sched_ext_ops__tick(struct task_struct *p) {}
+static void sched_ext_ops__runnable(struct task_struct *p, u64 enq_flags) {}
+static void sched_ext_ops__running(struct task_struct *p) {}
+static void sched_ext_ops__stopping(struct task_struct *p, bool runnable) {}
+static void sched_ext_ops__quiescent(struct task_struct *p, u64 deq_flags) {}
+static bool sched_ext_ops__yield(struct task_struct *from, struct task_struct *to__nullable) { return false; }
+static bool sched_ext_ops__core_sched_before(struct task_struct *a, struct task_struct *b) { return false; }
+static void sched_ext_ops__set_weight(struct task_struct *p, u32 weight) {}
+static void sched_ext_ops__set_cpumask(struct task_struct *p, const struct cpumask *mask) {}
+static void sched_ext_ops__update_idle(s32 cpu, bool idle) {}
+static void sched_ext_ops__cpu_acquire(s32 cpu, struct scx_cpu_acquire_args *args) {}
+static void sched_ext_ops__cpu_release(s32 cpu, struct scx_cpu_release_args *args) {}
+static s32 sched_ext_ops__init_task(struct task_struct *p, struct scx_init_task_args *args) { return -EINVAL; }
+static void sched_ext_ops__exit_task(struct task_struct *p, struct scx_exit_task_args *args) {}
+static void sched_ext_ops__enable(struct task_struct *p) {}
+static void sched_ext_ops__disable(struct task_struct *p) {}
+#ifdef CONFIG_EXT_GROUP_SCHED
+static s32 sched_ext_ops__cgroup_init(struct cgroup *cgrp, struct scx_cgroup_init_args *args) { return -EINVAL; }
+static void sched_ext_ops__cgroup_exit(struct cgroup *cgrp) {}
+static s32 sched_ext_ops__cgroup_prep_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) { return -EINVAL; }
+static void sched_ext_ops__cgroup_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {}
+static void sched_ext_ops__cgroup_cancel_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {}
+static void sched_ext_ops__cgroup_set_weight(struct cgroup *cgrp, u32 weight) {}
+static void sched_ext_ops__cgroup_set_bandwidth(struct cgroup *cgrp, u64 period_us, u64 quota_us, u64 burst_us) {}
+static void sched_ext_ops__cgroup_set_idle(struct cgroup *cgrp, bool idle) {}
+#endif
+static void sched_ext_ops__cpu_online(s32 cpu) {}
+static void sched_ext_ops__cpu_offline(s32 cpu) {}
+static s32 sched_ext_ops__init(void) { return -EINVAL; }
+static void sched_ext_ops__exit(struct scx_exit_info *info) {}
+static void sched_ext_ops__dump(struct scx_dump_ctx *ctx) {}
+static void sched_ext_ops__dump_cpu(struct scx_dump_ctx *ctx, s32 cpu, bool idle) {}
+static void sched_ext_ops__dump_task(struct scx_dump_ctx *ctx, struct task_struct *p) {}
+
+static struct sched_ext_ops __bpf_ops_sched_ext_ops = {
+ .select_cpu = sched_ext_ops__select_cpu,
+ .enqueue = sched_ext_ops__enqueue,
+ .dequeue = sched_ext_ops__dequeue,
+ .dispatch = sched_ext_ops__dispatch,
+ .tick = sched_ext_ops__tick,
+ .runnable = sched_ext_ops__runnable,
+ .running = sched_ext_ops__running,
+ .stopping = sched_ext_ops__stopping,
+ .quiescent = sched_ext_ops__quiescent,
+ .yield = sched_ext_ops__yield,
+ .core_sched_before = sched_ext_ops__core_sched_before,
+ .set_weight = sched_ext_ops__set_weight,
+ .set_cpumask = sched_ext_ops__set_cpumask,
+ .update_idle = sched_ext_ops__update_idle,
+ .cpu_acquire = sched_ext_ops__cpu_acquire,
+ .cpu_release = sched_ext_ops__cpu_release,
+ .init_task = sched_ext_ops__init_task,
+ .exit_task = sched_ext_ops__exit_task,
+ .enable = sched_ext_ops__enable,
+ .disable = sched_ext_ops__disable,
+#ifdef CONFIG_EXT_GROUP_SCHED
+ .cgroup_init = sched_ext_ops__cgroup_init,
+ .cgroup_exit = sched_ext_ops__cgroup_exit,
+ .cgroup_prep_move = sched_ext_ops__cgroup_prep_move,
+ .cgroup_move = sched_ext_ops__cgroup_move,
+ .cgroup_cancel_move = sched_ext_ops__cgroup_cancel_move,
+ .cgroup_set_weight = sched_ext_ops__cgroup_set_weight,
+ .cgroup_set_bandwidth = sched_ext_ops__cgroup_set_bandwidth,
+ .cgroup_set_idle = sched_ext_ops__cgroup_set_idle,
+#endif
+ .cpu_online = sched_ext_ops__cpu_online,
+ .cpu_offline = sched_ext_ops__cpu_offline,
+ .init = sched_ext_ops__init,
+ .exit = sched_ext_ops__exit,
+ .dump = sched_ext_ops__dump,
+ .dump_cpu = sched_ext_ops__dump_cpu,
+ .dump_task = sched_ext_ops__dump_task,
+};
+
+static struct bpf_struct_ops bpf_sched_ext_ops = {
+ .verifier_ops = &bpf_scx_verifier_ops,
+ .reg = bpf_scx_reg,
+ .unreg = bpf_scx_unreg,
+ .check_member = bpf_scx_check_member,
+ .init_member = bpf_scx_init_member,
+ .init = bpf_scx_init,
+ .update = bpf_scx_update,
+ .validate = bpf_scx_validate,
+ .name = "sched_ext_ops",
+ .owner = THIS_MODULE,
+ .cfi_stubs = &__bpf_ops_sched_ext_ops
+};
+
+
+/********************************************************************************
+ * System integration and init.
+ */
+
+static void sysrq_handle_sched_ext_reset(u8 key)
+{
+ scx_disable(SCX_EXIT_SYSRQ);
+}
+
+static const struct sysrq_key_op sysrq_sched_ext_reset_op = {
+ .handler = sysrq_handle_sched_ext_reset,
+ .help_msg = "reset-sched-ext(S)",
+ .action_msg = "Disable sched_ext and revert all tasks to CFS",
+ .enable_mask = SYSRQ_ENABLE_RTNICE,
+};
+
+static void sysrq_handle_sched_ext_dump(u8 key)
+{
+ struct scx_exit_info ei = { .kind = SCX_EXIT_NONE, .reason = "SysRq-D" };
+
+ if (scx_enabled())
+ scx_dump_state(&ei, 0);
+}
+
+static const struct sysrq_key_op sysrq_sched_ext_dump_op = {
+ .handler = sysrq_handle_sched_ext_dump,
+ .help_msg = "dump-sched-ext(D)",
+ .action_msg = "Trigger sched_ext debug dump",
+ .enable_mask = SYSRQ_ENABLE_RTNICE,
+};
+
+static bool can_skip_idle_kick(struct rq *rq)
+{
+ lockdep_assert_rq_held(rq);
+
+ /*
+ * We can skip idle kicking if @rq is going to go through at least one
+ * full SCX scheduling cycle before going idle. Just checking whether
+ * curr is not idle is insufficient because we could be racing
+ * balance_one() trying to pull the next task from a remote rq, which
+ * may fail, and @rq may become idle afterwards.
+ *
+ * The race window is small and we don't and can't guarantee that @rq is
+ * only kicked while idle anyway. Skip only when sure.
+ */
+ return !is_idle_task(rq->curr) && !(rq->scx.flags & SCX_RQ_IN_BALANCE);
+}
+
+static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *ksyncs)
+{
+ struct rq *rq = cpu_rq(cpu);
+ struct scx_rq *this_scx = &this_rq->scx;
+ const struct sched_class *cur_class;
+ bool should_wait = false;
+ unsigned long flags;
+
+ raw_spin_rq_lock_irqsave(rq, flags);
+ cur_class = rq->curr->sched_class;
+
+ /*
+ * During CPU hotplug, a CPU may depend on kicking itself to make
+ * forward progress. Allow kicking self regardless of online state. If
+ * @cpu is running a higher class task, we have no control over @cpu.
+ * Skip kicking.
+ */
+ if ((cpu_online(cpu) || cpu == cpu_of(this_rq)) &&
+ !sched_class_above(cur_class, &ext_sched_class)) {
+ if (cpumask_test_cpu(cpu, this_scx->cpus_to_preempt)) {
+ if (cur_class == &ext_sched_class)
+ rq->curr->scx.slice = 0;
+ cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt);
+ }
+
+ if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) {
+ if (cur_class == &ext_sched_class) {
+ ksyncs[cpu] = rq->scx.kick_sync;
+ should_wait = true;
+ } else {
+ cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
+ }
+ }
+
+ resched_curr(rq);
+ } else {
+ cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt);
+ cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
+ }
+
+ raw_spin_rq_unlock_irqrestore(rq, flags);
+
+ return should_wait;
+}
+
+static void kick_one_cpu_if_idle(s32 cpu, struct rq *this_rq)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
+
+ raw_spin_rq_lock_irqsave(rq, flags);
+
+ if (!can_skip_idle_kick(rq) &&
+ (cpu_online(cpu) || cpu == cpu_of(this_rq)))
+ resched_curr(rq);
+
+ raw_spin_rq_unlock_irqrestore(rq, flags);
+}
+
+static void kick_cpus_irq_workfn(struct irq_work *irq_work)
+{
+ struct rq *this_rq = this_rq();
+ struct scx_rq *this_scx = &this_rq->scx;
+ struct scx_kick_syncs __rcu *ksyncs_pcpu = __this_cpu_read(scx_kick_syncs);
+ bool should_wait = false;
+ unsigned long *ksyncs;
+ s32 cpu;
+
+ if (unlikely(!ksyncs_pcpu)) {
+ pr_warn_once("kick_cpus_irq_workfn() called with NULL scx_kick_syncs");
+ return;
+ }
+
+ ksyncs = rcu_dereference_bh(ksyncs_pcpu)->syncs;
+
+ for_each_cpu(cpu, this_scx->cpus_to_kick) {
+ should_wait |= kick_one_cpu(cpu, this_rq, ksyncs);
+ cpumask_clear_cpu(cpu, this_scx->cpus_to_kick);
+ cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle);
+ }
+
+ for_each_cpu(cpu, this_scx->cpus_to_kick_if_idle) {
+ kick_one_cpu_if_idle(cpu, this_rq);
+ cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle);
+ }
+
+ if (!should_wait)
+ return;
+
+ for_each_cpu(cpu, this_scx->cpus_to_wait) {
+ unsigned long *wait_kick_sync = &cpu_rq(cpu)->scx.kick_sync;
+
+ /*
+ * Busy-wait until the task running at the time of kicking is no
+ * longer running. This can be used to implement e.g. core
+ * scheduling.
+ *
+ * smp_cond_load_acquire() pairs with store_releases in
+ * pick_task_scx() and put_prev_task_scx(). The former breaks
+ * the wait if SCX's scheduling path is entered even if the same
+ * task is picked subsequently. The latter is necessary to break
+ * the wait when $cpu is taken by a higher sched class.
+ */
+ if (cpu != cpu_of(this_rq))
+ smp_cond_load_acquire(wait_kick_sync, VAL != ksyncs[cpu]);
+
+ cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
+ }
+}
+
+/**
+ * print_scx_info - print out sched_ext scheduler state
+ * @log_lvl: the log level to use when printing
+ * @p: target task
+ *
+ * If a sched_ext scheduler is enabled, print the name and state of the
+ * scheduler. If @p is on sched_ext, print further information about the task.
+ *
+ * This function can be safely called on any task as long as the task_struct
+ * itself is accessible. While safe, this function isn't synchronized and may
+ * print out mixups or garbages of limited length.
+ */
+void print_scx_info(const char *log_lvl, struct task_struct *p)
+{
+ struct scx_sched *sch = scx_root;
+ enum scx_enable_state state = scx_enable_state();
+ const char *all = READ_ONCE(scx_switching_all) ? "+all" : "";
+ char runnable_at_buf[22] = "?";
+ struct sched_class *class;
+ unsigned long runnable_at;
+
+ if (state == SCX_DISABLED)
+ return;
+
+ /*
+ * Carefully check if the task was running on sched_ext, and then
+ * carefully copy the time it's been runnable, and its state.
+ */
+ if (copy_from_kernel_nofault(&class, &p->sched_class, sizeof(class)) ||
+ class != &ext_sched_class) {
+ printk("%sSched_ext: %s (%s%s)", log_lvl, sch->ops.name,
+ scx_enable_state_str[state], all);
+ return;
+ }
+
+ if (!copy_from_kernel_nofault(&runnable_at, &p->scx.runnable_at,
+ sizeof(runnable_at)))
+ scnprintf(runnable_at_buf, sizeof(runnable_at_buf), "%+ldms",
+ jiffies_delta_msecs(runnable_at, jiffies));
+
+ /* print everything onto one line to conserve console space */
+ printk("%sSched_ext: %s (%s%s), task: runnable_at=%s",
+ log_lvl, sch->ops.name, scx_enable_state_str[state], all,
+ runnable_at_buf);
+}
+
+static int scx_pm_handler(struct notifier_block *nb, unsigned long event, void *ptr)
+{
+ /*
+ * SCX schedulers often have userspace components which are sometimes
+ * involved in critial scheduling paths. PM operations involve freezing
+ * userspace which can lead to scheduling misbehaviors including stalls.
+ * Let's bypass while PM operations are in progress.
+ */
+ switch (event) {
+ case PM_HIBERNATION_PREPARE:
+ case PM_SUSPEND_PREPARE:
+ case PM_RESTORE_PREPARE:
+ scx_bypass(true);
+ break;
+ case PM_POST_HIBERNATION:
+ case PM_POST_SUSPEND:
+ case PM_POST_RESTORE:
+ scx_bypass(false);
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block scx_pm_notifier = {
+ .notifier_call = scx_pm_handler,
+};
+
+void __init init_sched_ext_class(void)
+{
+ s32 cpu, v;
+
+ /*
+ * The following is to prevent the compiler from optimizing out the enum
+ * definitions so that BPF scheduler implementations can use them
+ * through the generated vmlinux.h.
+ */
+ WRITE_ONCE(v, SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP | SCX_KICK_PREEMPT |
+ SCX_TG_ONLINE);
+
+ scx_idle_init_masks();
+
+ for_each_possible_cpu(cpu) {
+ struct rq *rq = cpu_rq(cpu);
+ int n = cpu_to_node(cpu);
+
+ init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL);
+ init_dsq(&rq->scx.bypass_dsq, SCX_DSQ_BYPASS);
+ INIT_LIST_HEAD(&rq->scx.runnable_list);
+ INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals);
+
+ BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick, GFP_KERNEL, n));
+ BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n));
+ BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n));
+ BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n));
+ rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn);
+ rq->scx.kick_cpus_irq_work = IRQ_WORK_INIT_HARD(kick_cpus_irq_workfn);
+
+ if (cpu_online(cpu))
+ cpu_rq(cpu)->scx.flags |= SCX_RQ_ONLINE;
+ }
+
+ register_sysrq_key('S', &sysrq_sched_ext_reset_op);
+ register_sysrq_key('D', &sysrq_sched_ext_dump_op);
+ INIT_DELAYED_WORK(&scx_watchdog_work, scx_watchdog_workfn);
+}
+
+
+/********************************************************************************
+ * Helpers that can be called from the BPF scheduler.
+ */
+static bool scx_dsq_insert_preamble(struct scx_sched *sch, struct task_struct *p,
+ u64 enq_flags)
+{
+ if (!scx_kf_allowed(sch, SCX_KF_ENQUEUE | SCX_KF_DISPATCH))
+ return false;
+
+ lockdep_assert_irqs_disabled();
+
+ if (unlikely(!p)) {
+ scx_error(sch, "called with NULL task");
+ return false;
+ }
+
+ if (unlikely(enq_flags & __SCX_ENQ_INTERNAL_MASK)) {
+ scx_error(sch, "invalid enq_flags 0x%llx", enq_flags);
+ return false;
+ }
+
+ return true;
+}
+
+static void scx_dsq_insert_commit(struct scx_sched *sch, struct task_struct *p,
+ u64 dsq_id, u64 enq_flags)
+{
+ struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
+ struct task_struct *ddsp_task;
+
+ ddsp_task = __this_cpu_read(direct_dispatch_task);
+ if (ddsp_task) {
+ mark_direct_dispatch(sch, ddsp_task, p, dsq_id, enq_flags);
+ return;
+ }
+
+ if (unlikely(dspc->cursor >= scx_dsp_max_batch)) {
+ scx_error(sch, "dispatch buffer overflow");
+ return;
+ }
+
+ dspc->buf[dspc->cursor++] = (struct scx_dsp_buf_ent){
+ .task = p,
+ .qseq = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_QSEQ_MASK,
+ .dsq_id = dsq_id,
+ .enq_flags = enq_flags,
+ };
+}
+
+__bpf_kfunc_start_defs();
+
+/**
+ * scx_bpf_dsq_insert - Insert a task into the FIFO queue of a DSQ
+ * @p: task_struct to insert
+ * @dsq_id: DSQ to insert into
+ * @slice: duration @p can run for in nsecs, 0 to keep the current value
+ * @enq_flags: SCX_ENQ_*
+ *
+ * Insert @p into the FIFO queue of the DSQ identified by @dsq_id. It is safe to
+ * call this function spuriously. Can be called from ops.enqueue(),
+ * ops.select_cpu(), and ops.dispatch().
+ *
+ * When called from ops.select_cpu() or ops.enqueue(), it's for direct dispatch
+ * and @p must match the task being enqueued.
+ *
+ * When called from ops.select_cpu(), @enq_flags and @dsp_id are stored, and @p
+ * will be directly inserted into the corresponding dispatch queue after
+ * ops.select_cpu() returns. If @p is inserted into SCX_DSQ_LOCAL, it will be
+ * inserted into the local DSQ of the CPU returned by ops.select_cpu().
+ * @enq_flags are OR'd with the enqueue flags on the enqueue path before the
+ * task is inserted.
+ *
+ * When called from ops.dispatch(), there are no restrictions on @p or @dsq_id
+ * and this function can be called upto ops.dispatch_max_batch times to insert
+ * multiple tasks. scx_bpf_dispatch_nr_slots() returns the number of the
+ * remaining slots. scx_bpf_dsq_move_to_local() flushes the batch and resets the
+ * counter.
+ *
+ * This function doesn't have any locking restrictions and may be called under
+ * BPF locks (in the future when BPF introduces more flexible locking).
+ *
+ * @p is allowed to run for @slice. The scheduling path is triggered on slice
+ * exhaustion. If zero, the current residual slice is maintained. If
+ * %SCX_SLICE_INF, @p never expires and the BPF scheduler must kick the CPU with
+ * scx_bpf_kick_cpu() to trigger scheduling.
+ *
+ * Returns %true on successful insertion, %false on failure. On the root
+ * scheduler, %false return triggers scheduler abort and the caller doesn't need
+ * to check the return value.
+ */
+__bpf_kfunc bool scx_bpf_dsq_insert___v2(struct task_struct *p, u64 dsq_id,
+ u64 slice, u64 enq_flags)
+{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return false;
+
+ if (!scx_dsq_insert_preamble(sch, p, enq_flags))
+ return false;
+
+ if (slice)
+ p->scx.slice = slice;
+ else
+ p->scx.slice = p->scx.slice ?: 1;
+
+ scx_dsq_insert_commit(sch, p, dsq_id, enq_flags);
+
+ return true;
+}
+
+/*
+ * COMPAT: Will be removed in v6.23 along with the ___v2 suffix.
+ */
+__bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id,
+ u64 slice, u64 enq_flags)
+{
+ scx_bpf_dsq_insert___v2(p, dsq_id, slice, enq_flags);
+}
+
+static bool scx_dsq_insert_vtime(struct scx_sched *sch, struct task_struct *p,
+ u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags)
+{
+ if (!scx_dsq_insert_preamble(sch, p, enq_flags))
+ return false;
+
+ if (slice)
+ p->scx.slice = slice;
+ else
+ p->scx.slice = p->scx.slice ?: 1;
+
+ p->scx.dsq_vtime = vtime;
+
+ scx_dsq_insert_commit(sch, p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
+
+ return true;
+}
+
+struct scx_bpf_dsq_insert_vtime_args {
+ /* @p can't be packed together as KF_RCU is not transitive */
+ u64 dsq_id;
+ u64 slice;
+ u64 vtime;
+ u64 enq_flags;
+};
+
+/**
+ * __scx_bpf_dsq_insert_vtime - Arg-wrapped vtime DSQ insertion
+ * @p: task_struct to insert
+ * @args: struct containing the rest of the arguments
+ * @args->dsq_id: DSQ to insert into
+ * @args->slice: duration @p can run for in nsecs, 0 to keep the current value
+ * @args->vtime: @p's ordering inside the vtime-sorted queue of the target DSQ
+ * @args->enq_flags: SCX_ENQ_*
+ *
+ * Wrapper kfunc that takes arguments via struct to work around BPF's 5 argument
+ * limit. BPF programs should use scx_bpf_dsq_insert_vtime() which is provided
+ * as an inline wrapper in common.bpf.h.
+ *
+ * Insert @p into the vtime priority queue of the DSQ identified by
+ * @args->dsq_id. Tasks queued into the priority queue are ordered by
+ * @args->vtime. All other aspects are identical to scx_bpf_dsq_insert().
+ *
+ * @args->vtime ordering is according to time_before64() which considers
+ * wrapping. A numerically larger vtime may indicate an earlier position in the
+ * ordering and vice-versa.
+ *
+ * A DSQ can only be used as a FIFO or priority queue at any given time and this
+ * function must not be called on a DSQ which already has one or more FIFO tasks
+ * queued and vice-versa. Also, the built-in DSQs (SCX_DSQ_LOCAL and
+ * SCX_DSQ_GLOBAL) cannot be used as priority queues.
+ *
+ * Returns %true on successful insertion, %false on failure. On the root
+ * scheduler, %false return triggers scheduler abort and the caller doesn't need
+ * to check the return value.
+ */
+__bpf_kfunc bool
+__scx_bpf_dsq_insert_vtime(struct task_struct *p,
+ struct scx_bpf_dsq_insert_vtime_args *args)
+{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return false;
+
+ return scx_dsq_insert_vtime(sch, p, args->dsq_id, args->slice,
+ args->vtime, args->enq_flags);
+}
+
+/*
+ * COMPAT: Will be removed in v6.23.
+ */
+__bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id,
+ u64 slice, u64 vtime, u64 enq_flags)
+{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return;
+
+ scx_dsq_insert_vtime(sch, p, dsq_id, slice, vtime, enq_flags);
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(scx_kfunc_ids_enqueue_dispatch)
+BTF_ID_FLAGS(func, scx_bpf_dsq_insert, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_dsq_insert___v2, KF_RCU)
+BTF_ID_FLAGS(func, __scx_bpf_dsq_insert_vtime, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_dsq_insert_vtime, KF_RCU)
+BTF_KFUNCS_END(scx_kfunc_ids_enqueue_dispatch)
+
+static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = {
+ .owner = THIS_MODULE,
+ .set = &scx_kfunc_ids_enqueue_dispatch,
+};
+
+static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
+ struct task_struct *p, u64 dsq_id, u64 enq_flags)
+{
+ struct scx_sched *sch = scx_root;
+ struct scx_dispatch_q *src_dsq = kit->dsq, *dst_dsq;
+ struct rq *this_rq, *src_rq, *locked_rq;
+ bool dispatched = false;
+ bool in_balance;
+ unsigned long flags;
+
+ if (!scx_kf_allowed_if_unlocked() &&
+ !scx_kf_allowed(sch, SCX_KF_DISPATCH))
+ return false;
+
+ /*
+ * If the BPF scheduler keeps calling this function repeatedly, it can
+ * cause similar live-lock conditions as consume_dispatch_q().
+ */
+ if (unlikely(READ_ONCE(scx_aborting)))
+ return false;
+
+ /*
+ * Can be called from either ops.dispatch() locking this_rq() or any
+ * context where no rq lock is held. If latter, lock @p's task_rq which
+ * we'll likely need anyway.
+ */
+ src_rq = task_rq(p);
+
+ local_irq_save(flags);
+ this_rq = this_rq();
+ in_balance = this_rq->scx.flags & SCX_RQ_IN_BALANCE;
+
+ if (in_balance) {
+ if (this_rq != src_rq) {
+ raw_spin_rq_unlock(this_rq);
+ raw_spin_rq_lock(src_rq);
+ }
+ } else {
+ raw_spin_rq_lock(src_rq);
+ }
+
+ locked_rq = src_rq;
+ raw_spin_lock(&src_dsq->lock);
+
+ /*
+ * Did someone else get to it? @p could have already left $src_dsq, got
+ * re-enqueud, or be in the process of being consumed by someone else.
+ */
+ if (unlikely(p->scx.dsq != src_dsq ||
+ u32_before(kit->cursor.priv, p->scx.dsq_seq) ||
+ p->scx.holding_cpu >= 0) ||
+ WARN_ON_ONCE(src_rq != task_rq(p))) {
+ raw_spin_unlock(&src_dsq->lock);
+ goto out;
+ }
+
+ /* @p is still on $src_dsq and stable, determine the destination */
+ dst_dsq = find_dsq_for_dispatch(sch, this_rq, dsq_id, p);
+
+ /*
+ * Apply vtime and slice updates before moving so that the new time is
+ * visible before inserting into $dst_dsq. @p is still on $src_dsq but
+ * this is safe as we're locking it.
+ */
+ if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_VTIME)
+ p->scx.dsq_vtime = kit->vtime;
+ if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_SLICE)
+ p->scx.slice = kit->slice;
+
+ /* execute move */
+ locked_rq = move_task_between_dsqs(sch, p, enq_flags, src_dsq, dst_dsq);
+ dispatched = true;
+out:
+ if (in_balance) {
+ if (this_rq != locked_rq) {
+ raw_spin_rq_unlock(locked_rq);
+ raw_spin_rq_lock(this_rq);
+ }
+ } else {
+ raw_spin_rq_unlock_irqrestore(locked_rq, flags);
+ }
+
+ kit->cursor.flags &= ~(__SCX_DSQ_ITER_HAS_SLICE |
+ __SCX_DSQ_ITER_HAS_VTIME);
+ return dispatched;
+}
+
+__bpf_kfunc_start_defs();
+
+/**
+ * scx_bpf_dispatch_nr_slots - Return the number of remaining dispatch slots
+ *
+ * Can only be called from ops.dispatch().
+ */
+__bpf_kfunc u32 scx_bpf_dispatch_nr_slots(void)
+{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return 0;
+
+ if (!scx_kf_allowed(sch, SCX_KF_DISPATCH))
+ return 0;
+
+ return scx_dsp_max_batch - __this_cpu_read(scx_dsp_ctx->cursor);
+}
+
+/**
+ * scx_bpf_dispatch_cancel - Cancel the latest dispatch
+ *
+ * Cancel the latest dispatch. Can be called multiple times to cancel further
+ * dispatches. Can only be called from ops.dispatch().
+ */
+__bpf_kfunc void scx_bpf_dispatch_cancel(void)
+{
+ struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return;
+
+ if (!scx_kf_allowed(sch, SCX_KF_DISPATCH))
+ return;
+
+ if (dspc->cursor > 0)
+ dspc->cursor--;
+ else
+ scx_error(sch, "dispatch buffer underflow");
+}
+
+/**
+ * scx_bpf_dsq_move_to_local - move a task from a DSQ to the current CPU's local DSQ
+ * @dsq_id: DSQ to move task from
+ *
+ * Move a task from the non-local DSQ identified by @dsq_id to the current CPU's
+ * local DSQ for execution. Can only be called from ops.dispatch().
+ *
+ * This function flushes the in-flight dispatches from scx_bpf_dsq_insert()
+ * before trying to move from the specified DSQ. It may also grab rq locks and
+ * thus can't be called under any BPF locks.
+ *
+ * Returns %true if a task has been moved, %false if there isn't any task to
+ * move.
+ */
+__bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id)
+{
+ struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
+ struct scx_dispatch_q *dsq;
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return false;
+
+ if (!scx_kf_allowed(sch, SCX_KF_DISPATCH))
+ return false;
+
+ flush_dispatch_buf(sch, dspc->rq);
+
+ dsq = find_user_dsq(sch, dsq_id);
+ if (unlikely(!dsq)) {
+ scx_error(sch, "invalid DSQ ID 0x%016llx", dsq_id);
+ return false;
+ }
+
+ if (consume_dispatch_q(sch, dspc->rq, dsq)) {
+ /*
+ * A successfully consumed task can be dequeued before it starts
+ * running while the CPU is trying to migrate other dispatched
+ * tasks. Bump nr_tasks to tell balance_scx() to retry on empty
+ * local DSQ.
+ */
+ dspc->nr_tasks++;
+ return true;
+ } else {
+ return false;
+ }
+}
+
+/**
+ * scx_bpf_dsq_move_set_slice - Override slice when moving between DSQs
+ * @it__iter: DSQ iterator in progress
+ * @slice: duration the moved task can run for in nsecs
+ *
+ * Override the slice of the next task that will be moved from @it__iter using
+ * scx_bpf_dsq_move[_vtime](). If this function is not called, the previous
+ * slice duration is kept.
+ */
+__bpf_kfunc void scx_bpf_dsq_move_set_slice(struct bpf_iter_scx_dsq *it__iter,
+ u64 slice)
+{
+ struct bpf_iter_scx_dsq_kern *kit = (void *)it__iter;
+
+ kit->slice = slice;
+ kit->cursor.flags |= __SCX_DSQ_ITER_HAS_SLICE;
+}
+
+/**
+ * scx_bpf_dsq_move_set_vtime - Override vtime when moving between DSQs
+ * @it__iter: DSQ iterator in progress
+ * @vtime: task's ordering inside the vtime-sorted queue of the target DSQ
+ *
+ * Override the vtime of the next task that will be moved from @it__iter using
+ * scx_bpf_dsq_move_vtime(). If this function is not called, the previous slice
+ * vtime is kept. If scx_bpf_dsq_move() is used to dispatch the next task, the
+ * override is ignored and cleared.
+ */
+__bpf_kfunc void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter,
+ u64 vtime)
+{
+ struct bpf_iter_scx_dsq_kern *kit = (void *)it__iter;
+
+ kit->vtime = vtime;
+ kit->cursor.flags |= __SCX_DSQ_ITER_HAS_VTIME;
+}
+
+/**
+ * scx_bpf_dsq_move - Move a task from DSQ iteration to a DSQ
+ * @it__iter: DSQ iterator in progress
+ * @p: task to transfer
+ * @dsq_id: DSQ to move @p to
+ * @enq_flags: SCX_ENQ_*
+ *
+ * Transfer @p which is on the DSQ currently iterated by @it__iter to the DSQ
+ * specified by @dsq_id. All DSQs - local DSQs, global DSQ and user DSQs - can
+ * be the destination.
+ *
+ * For the transfer to be successful, @p must still be on the DSQ and have been
+ * queued before the DSQ iteration started. This function doesn't care whether
+ * @p was obtained from the DSQ iteration. @p just has to be on the DSQ and have
+ * been queued before the iteration started.
+ *
+ * @p's slice is kept by default. Use scx_bpf_dsq_move_set_slice() to update.
+ *
+ * Can be called from ops.dispatch() or any BPF context which doesn't hold a rq
+ * lock (e.g. BPF timers or SYSCALL programs).
+ *
+ * Returns %true if @p has been consumed, %false if @p had already been
+ * consumed, dequeued, or, for sub-scheds, @dsq_id points to a disallowed local
+ * DSQ.
+ */
+__bpf_kfunc bool scx_bpf_dsq_move(struct bpf_iter_scx_dsq *it__iter,
+ struct task_struct *p, u64 dsq_id,
+ u64 enq_flags)
+{
+ return scx_dsq_move((struct bpf_iter_scx_dsq_kern *)it__iter,
+ p, dsq_id, enq_flags);
+}
+
+/**
+ * scx_bpf_dsq_move_vtime - Move a task from DSQ iteration to a PRIQ DSQ
+ * @it__iter: DSQ iterator in progress
+ * @p: task to transfer
+ * @dsq_id: DSQ to move @p to
+ * @enq_flags: SCX_ENQ_*
+ *
+ * Transfer @p which is on the DSQ currently iterated by @it__iter to the
+ * priority queue of the DSQ specified by @dsq_id. The destination must be a
+ * user DSQ as only user DSQs support priority queue.
+ *
+ * @p's slice and vtime are kept by default. Use scx_bpf_dsq_move_set_slice()
+ * and scx_bpf_dsq_move_set_vtime() to update.
+ *
+ * All other aspects are identical to scx_bpf_dsq_move(). See
+ * scx_bpf_dsq_insert_vtime() for more information on @vtime.
+ */
+__bpf_kfunc bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter,
+ struct task_struct *p, u64 dsq_id,
+ u64 enq_flags)
+{
+ return scx_dsq_move((struct bpf_iter_scx_dsq_kern *)it__iter,
+ p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(scx_kfunc_ids_dispatch)
+BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots)
+BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_to_local)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU)
+BTF_KFUNCS_END(scx_kfunc_ids_dispatch)
+
+static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = {
+ .owner = THIS_MODULE,
+ .set = &scx_kfunc_ids_dispatch,
+};
+
+static u32 reenq_local(struct rq *rq)
+{
+ LIST_HEAD(tasks);
+ u32 nr_enqueued = 0;
+ struct task_struct *p, *n;
+
+ lockdep_assert_rq_held(rq);
+
+ /*
+ * The BPF scheduler may choose to dispatch tasks back to
+ * @rq->scx.local_dsq. Move all candidate tasks off to a private list
+ * first to avoid processing the same tasks repeatedly.
+ */
+ list_for_each_entry_safe(p, n, &rq->scx.local_dsq.list,
+ scx.dsq_list.node) {
+ /*
+ * If @p is being migrated, @p's current CPU may not agree with
+ * its allowed CPUs and the migration_cpu_stop is about to
+ * deactivate and re-activate @p anyway. Skip re-enqueueing.
+ *
+ * While racing sched property changes may also dequeue and
+ * re-enqueue a migrating task while its current CPU and allowed
+ * CPUs disagree, they use %ENQUEUE_RESTORE which is bypassed to
+ * the current local DSQ for running tasks and thus are not
+ * visible to the BPF scheduler.
+ */
+ if (p->migration_pending)
+ continue;
+
+ dispatch_dequeue(rq, p);
+ list_add_tail(&p->scx.dsq_list.node, &tasks);
+ }
+
+ list_for_each_entry_safe(p, n, &tasks, scx.dsq_list.node) {
+ list_del_init(&p->scx.dsq_list.node);
+ do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1);
+ nr_enqueued++;
+ }
+
+ return nr_enqueued;
+}
+
+__bpf_kfunc_start_defs();
+
+/**
+ * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ
+ *
+ * Iterate over all of the tasks currently enqueued on the local DSQ of the
+ * caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of
+ * processed tasks. Can only be called from ops.cpu_release().
+ *
+ * COMPAT: Will be removed in v6.23 along with the ___v2 suffix on the void
+ * returning variant that can be called from anywhere.
+ */
+__bpf_kfunc u32 scx_bpf_reenqueue_local(void)
+{
+ struct scx_sched *sch;
+ struct rq *rq;
+
+ guard(rcu)();
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return 0;
+
+ if (!scx_kf_allowed(sch, SCX_KF_CPU_RELEASE))
+ return 0;
+
+ rq = cpu_rq(smp_processor_id());
+ lockdep_assert_rq_held(rq);
+
+ return reenq_local(rq);
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(scx_kfunc_ids_cpu_release)
+BTF_ID_FLAGS(func, scx_bpf_reenqueue_local)
+BTF_KFUNCS_END(scx_kfunc_ids_cpu_release)
+
+static const struct btf_kfunc_id_set scx_kfunc_set_cpu_release = {
+ .owner = THIS_MODULE,
+ .set = &scx_kfunc_ids_cpu_release,
+};
+
+__bpf_kfunc_start_defs();
+
+/**
+ * scx_bpf_create_dsq - Create a custom DSQ
+ * @dsq_id: DSQ to create
+ * @node: NUMA node to allocate from
+ *
+ * Create a custom DSQ identified by @dsq_id. Can be called from any sleepable
+ * scx callback, and any BPF_PROG_TYPE_SYSCALL prog.
+ */
+__bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node)
+{
+ struct scx_dispatch_q *dsq;
+ struct scx_sched *sch;
+ s32 ret;
+
+ if (unlikely(node >= (int)nr_node_ids ||
+ (node < 0 && node != NUMA_NO_NODE)))
+ return -EINVAL;
+
+ if (unlikely(dsq_id & SCX_DSQ_FLAG_BUILTIN))
+ return -EINVAL;
+
+ dsq = kmalloc_node(sizeof(*dsq), GFP_KERNEL, node);
+ if (!dsq)
+ return -ENOMEM;
+
+ init_dsq(dsq, dsq_id);
+
+ rcu_read_lock();
+
+ sch = rcu_dereference(scx_root);
+ if (sch)
+ ret = rhashtable_lookup_insert_fast(&sch->dsq_hash, &dsq->hash_node,
+ dsq_hash_params);
+ else
+ ret = -ENODEV;
+
+ rcu_read_unlock();
+ if (ret)
+ kfree(dsq);
+ return ret;
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(scx_kfunc_ids_unlocked)
+BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU)
+BTF_KFUNCS_END(scx_kfunc_ids_unlocked)
+
+static const struct btf_kfunc_id_set scx_kfunc_set_unlocked = {
+ .owner = THIS_MODULE,
+ .set = &scx_kfunc_ids_unlocked,
+};
+
+__bpf_kfunc_start_defs();
+
+/**
+ * scx_bpf_task_set_slice - Set task's time slice
+ * @p: task of interest
+ * @slice: time slice to set in nsecs
+ *
+ * Set @p's time slice to @slice. Returns %true on success, %false if the
+ * calling scheduler doesn't have authority over @p.
+ */
+__bpf_kfunc bool scx_bpf_task_set_slice(struct task_struct *p, u64 slice)
+{
+ p->scx.slice = slice;
+ return true;
+}
+
+/**
+ * scx_bpf_task_set_dsq_vtime - Set task's virtual time for DSQ ordering
+ * @p: task of interest
+ * @vtime: virtual time to set
+ *
+ * Set @p's virtual time to @vtime. Returns %true on success, %false if the
+ * calling scheduler doesn't have authority over @p.
+ */
+__bpf_kfunc bool scx_bpf_task_set_dsq_vtime(struct task_struct *p, u64 vtime)
+{
+ p->scx.dsq_vtime = vtime;
+ return true;
+}
+
+static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags)
+{
+ struct rq *this_rq;
+ unsigned long irq_flags;
+
+ if (!ops_cpu_valid(sch, cpu, NULL))
+ return;
+
+ local_irq_save(irq_flags);
+
+ this_rq = this_rq();
+
+ /*
+ * While bypassing for PM ops, IRQ handling may not be online which can
+ * lead to irq_work_queue() malfunction such as infinite busy wait for
+ * IRQ status update. Suppress kicking.
+ */
+ if (scx_rq_bypassing(this_rq))
+ goto out;
+
+ /*
+ * Actual kicking is bounced to kick_cpus_irq_workfn() to avoid nesting
+ * rq locks. We can probably be smarter and avoid bouncing if called
+ * from ops which don't hold a rq lock.
+ */
+ if (flags & SCX_KICK_IDLE) {
+ struct rq *target_rq = cpu_rq(cpu);
+
+ if (unlikely(flags & (SCX_KICK_PREEMPT | SCX_KICK_WAIT)))
+ scx_error(sch, "PREEMPT/WAIT cannot be used with SCX_KICK_IDLE");
+
+ if (raw_spin_rq_trylock(target_rq)) {
+ if (can_skip_idle_kick(target_rq)) {
+ raw_spin_rq_unlock(target_rq);
+ goto out;
+ }
+ raw_spin_rq_unlock(target_rq);
+ }
+ cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick_if_idle);
+ } else {
+ cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick);
+
+ if (flags & SCX_KICK_PREEMPT)
+ cpumask_set_cpu(cpu, this_rq->scx.cpus_to_preempt);
+ if (flags & SCX_KICK_WAIT)
+ cpumask_set_cpu(cpu, this_rq->scx.cpus_to_wait);
+ }
+
+ irq_work_queue(&this_rq->scx.kick_cpus_irq_work);
+out:
+ local_irq_restore(irq_flags);
+}
+
+/**
+ * scx_bpf_kick_cpu - Trigger reschedule on a CPU
+ * @cpu: cpu to kick
+ * @flags: %SCX_KICK_* flags
+ *
+ * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or
+ * trigger rescheduling on a busy CPU. This can be called from any online
+ * scx_ops operation and the actual kicking is performed asynchronously through
+ * an irq work.
+ */
+__bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags)
+{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+ sch = rcu_dereference(scx_root);
+ if (likely(sch))
+ scx_kick_cpu(sch, cpu, flags);
+}
+
+/**
+ * scx_bpf_dsq_nr_queued - Return the number of queued tasks
+ * @dsq_id: id of the DSQ
+ *
+ * Return the number of tasks in the DSQ matching @dsq_id. If not found,
+ * -%ENOENT is returned.
+ */
+__bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id)
+{
+ struct scx_sched *sch;
+ struct scx_dispatch_q *dsq;
+ s32 ret;
+
+ preempt_disable();
+
+ sch = rcu_dereference_sched(scx_root);
+ if (unlikely(!sch)) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ if (dsq_id == SCX_DSQ_LOCAL) {
+ ret = READ_ONCE(this_rq()->scx.local_dsq.nr);
+ goto out;
+ } else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) {
+ s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK;
+
+ if (ops_cpu_valid(sch, cpu, NULL)) {
+ ret = READ_ONCE(cpu_rq(cpu)->scx.local_dsq.nr);
+ goto out;
+ }
+ } else {
+ dsq = find_user_dsq(sch, dsq_id);
+ if (dsq) {
+ ret = READ_ONCE(dsq->nr);
+ goto out;
+ }
+ }
+ ret = -ENOENT;
+out:
+ preempt_enable();
+ return ret;
+}
+
+/**
+ * scx_bpf_destroy_dsq - Destroy a custom DSQ
+ * @dsq_id: DSQ to destroy
+ *
+ * Destroy the custom DSQ identified by @dsq_id. Only DSQs created with
+ * scx_bpf_create_dsq() can be destroyed. The caller must ensure that the DSQ is
+ * empty and no further tasks are dispatched to it. Ignored if called on a DSQ
+ * which doesn't exist. Can be called from any online scx_ops operations.
+ */
+__bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id)
+{
+ struct scx_sched *sch;
+
+ rcu_read_lock();
+ sch = rcu_dereference(scx_root);
+ if (sch)
+ destroy_dsq(sch, dsq_id);
+ rcu_read_unlock();
+}
+
+/**
+ * bpf_iter_scx_dsq_new - Create a DSQ iterator
+ * @it: iterator to initialize
+ * @dsq_id: DSQ to iterate
+ * @flags: %SCX_DSQ_ITER_*
+ *
+ * Initialize BPF iterator @it which can be used with bpf_for_each() to walk
+ * tasks in the DSQ specified by @dsq_id. Iteration using @it only includes
+ * tasks which are already queued when this function is invoked.
+ */
+__bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id,
+ u64 flags)
+{
+ struct bpf_iter_scx_dsq_kern *kit = (void *)it;
+ struct scx_sched *sch;
+
+ BUILD_BUG_ON(sizeof(struct bpf_iter_scx_dsq_kern) >
+ sizeof(struct bpf_iter_scx_dsq));
+ BUILD_BUG_ON(__alignof__(struct bpf_iter_scx_dsq_kern) !=
+ __alignof__(struct bpf_iter_scx_dsq));
+ BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS &
+ ((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1));
+
+ /*
+ * next() and destroy() will be called regardless of the return value.
+ * Always clear $kit->dsq.
+ */
+ kit->dsq = NULL;
+
+ sch = rcu_dereference_check(scx_root, rcu_read_lock_bh_held());
+ if (unlikely(!sch))
+ return -ENODEV;
+
+ if (flags & ~__SCX_DSQ_ITER_USER_FLAGS)
+ return -EINVAL;
+
+ kit->dsq = find_user_dsq(sch, dsq_id);
+ if (!kit->dsq)
+ return -ENOENT;
+
+ kit->cursor = INIT_DSQ_LIST_CURSOR(kit->cursor, flags,
+ READ_ONCE(kit->dsq->seq));
+
+ return 0;
+}
+
+/**
+ * bpf_iter_scx_dsq_next - Progress a DSQ iterator
+ * @it: iterator to progress
+ *
+ * Return the next task. See bpf_iter_scx_dsq_new().
+ */
+__bpf_kfunc struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it)
+{
+ struct bpf_iter_scx_dsq_kern *kit = (void *)it;
+ bool rev = kit->cursor.flags & SCX_DSQ_ITER_REV;
+ struct task_struct *p;
+ unsigned long flags;
+
+ if (!kit->dsq)
+ return NULL;
+
+ raw_spin_lock_irqsave(&kit->dsq->lock, flags);
+
+ if (list_empty(&kit->cursor.node))
+ p = NULL;
+ else
+ p = container_of(&kit->cursor, struct task_struct, scx.dsq_list);
+
+ /*
+ * Only tasks which were queued before the iteration started are
+ * visible. This bounds BPF iterations and guarantees that vtime never
+ * jumps in the other direction while iterating.
+ */
+ do {
+ p = nldsq_next_task(kit->dsq, p, rev);
+ } while (p && unlikely(u32_before(kit->cursor.priv, p->scx.dsq_seq)));
+
+ if (p) {
+ if (rev)
+ list_move_tail(&kit->cursor.node, &p->scx.dsq_list.node);
+ else
+ list_move(&kit->cursor.node, &p->scx.dsq_list.node);
+ } else {
+ list_del_init(&kit->cursor.node);
+ }
+
+ raw_spin_unlock_irqrestore(&kit->dsq->lock, flags);
+
+ return p;
+}
+
+/**
+ * bpf_iter_scx_dsq_destroy - Destroy a DSQ iterator
+ * @it: iterator to destroy
+ *
+ * Undo scx_iter_scx_dsq_new().
+ */
+__bpf_kfunc void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it)
+{
+ struct bpf_iter_scx_dsq_kern *kit = (void *)it;
+
+ if (!kit->dsq)
+ return;
+
+ if (!list_empty(&kit->cursor.node)) {
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&kit->dsq->lock, flags);
+ list_del_init(&kit->cursor.node);
+ raw_spin_unlock_irqrestore(&kit->dsq->lock, flags);
+ }
+ kit->dsq = NULL;
+}
+
+/**
+ * scx_bpf_dsq_peek - Lockless peek at the first element.
+ * @dsq_id: DSQ to examine.
+ *
+ * Read the first element in the DSQ. This is semantically equivalent to using
+ * the DSQ iterator, but is lockfree. Of course, like any lockless operation,
+ * this provides only a point-in-time snapshot, and the contents may change
+ * by the time any subsequent locking operation reads the queue.
+ *
+ * Returns the pointer, or NULL indicates an empty queue OR internal error.
+ */
+__bpf_kfunc struct task_struct *scx_bpf_dsq_peek(u64 dsq_id)
+{
+ struct scx_sched *sch;
+ struct scx_dispatch_q *dsq;
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return NULL;
+
+ if (unlikely(dsq_id & SCX_DSQ_FLAG_BUILTIN)) {
+ scx_error(sch, "peek disallowed on builtin DSQ 0x%llx", dsq_id);
+ return NULL;
+ }
+
+ dsq = find_user_dsq(sch, dsq_id);
+ if (unlikely(!dsq)) {
+ scx_error(sch, "peek on non-existent DSQ 0x%llx", dsq_id);
+ return NULL;
+ }
+
+ return rcu_dereference(dsq->first_task);
+}
+
+__bpf_kfunc_end_defs();
+
+static s32 __bstr_format(struct scx_sched *sch, u64 *data_buf, char *line_buf,
+ size_t line_size, char *fmt, unsigned long long *data,
+ u32 data__sz)
+{
+ struct bpf_bprintf_data bprintf_data = { .get_bin_args = true };
+ s32 ret;
+
+ if (data__sz % 8 || data__sz > MAX_BPRINTF_VARARGS * 8 ||
+ (data__sz && !data)) {
+ scx_error(sch, "invalid data=%p and data__sz=%u", (void *)data, data__sz);
+ return -EINVAL;
+ }
+
+ ret = copy_from_kernel_nofault(data_buf, data, data__sz);
+ if (ret < 0) {
+ scx_error(sch, "failed to read data fields (%d)", ret);
+ return ret;
+ }
+
+ ret = bpf_bprintf_prepare(fmt, UINT_MAX, data_buf, data__sz / 8,
+ &bprintf_data);
+ if (ret < 0) {
+ scx_error(sch, "format preparation failed (%d)", ret);
+ return ret;
+ }
+
+ ret = bstr_printf(line_buf, line_size, fmt,
+ bprintf_data.bin_args);
+ bpf_bprintf_cleanup(&bprintf_data);
+ if (ret < 0) {
+ scx_error(sch, "(\"%s\", %p, %u) failed to format", fmt, data, data__sz);
+ return ret;
+ }
+
+ return ret;
+}
+
+static s32 bstr_format(struct scx_sched *sch, struct scx_bstr_buf *buf,
+ char *fmt, unsigned long long *data, u32 data__sz)
+{
+ return __bstr_format(sch, buf->data, buf->line, sizeof(buf->line),
+ fmt, data, data__sz);
+}
+
+__bpf_kfunc_start_defs();
+
+/**
+ * scx_bpf_exit_bstr - Gracefully exit the BPF scheduler.
+ * @exit_code: Exit value to pass to user space via struct scx_exit_info.
+ * @fmt: error message format string
+ * @data: format string parameters packaged using ___bpf_fill() macro
+ * @data__sz: @data len, must end in '__sz' for the verifier
+ *
+ * Indicate that the BPF scheduler wants to exit gracefully, and initiate ops
+ * disabling.
+ */
+__bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt,
+ unsigned long long *data, u32 data__sz)
+{
+ struct scx_sched *sch;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags);
+ sch = rcu_dereference_bh(scx_root);
+ if (likely(sch) &&
+ bstr_format(sch, &scx_exit_bstr_buf, fmt, data, data__sz) >= 0)
+ scx_exit(sch, SCX_EXIT_UNREG_BPF, exit_code, "%s", scx_exit_bstr_buf.line);
+ raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags);
+}
+
+/**
+ * scx_bpf_error_bstr - Indicate fatal error
+ * @fmt: error message format string
+ * @data: format string parameters packaged using ___bpf_fill() macro
+ * @data__sz: @data len, must end in '__sz' for the verifier
+ *
+ * Indicate that the BPF scheduler encountered a fatal error and initiate ops
+ * disabling.
+ */
+__bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data,
+ u32 data__sz)
+{
+ struct scx_sched *sch;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags);
+ sch = rcu_dereference_bh(scx_root);
+ if (likely(sch) &&
+ bstr_format(sch, &scx_exit_bstr_buf, fmt, data, data__sz) >= 0)
+ scx_exit(sch, SCX_EXIT_ERROR_BPF, 0, "%s", scx_exit_bstr_buf.line);
+ raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags);
+}
+
+/**
+ * scx_bpf_dump_bstr - Generate extra debug dump specific to the BPF scheduler
+ * @fmt: format string
+ * @data: format string parameters packaged using ___bpf_fill() macro
+ * @data__sz: @data len, must end in '__sz' for the verifier
+ *
+ * To be called through scx_bpf_dump() helper from ops.dump(), dump_cpu() and
+ * dump_task() to generate extra debug dump specific to the BPF scheduler.
+ *
+ * The extra dump may be multiple lines. A single line may be split over
+ * multiple calls. The last line is automatically terminated.
+ */
+__bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data,
+ u32 data__sz)
+{
+ struct scx_sched *sch;
+ struct scx_dump_data *dd = &scx_dump_data;
+ struct scx_bstr_buf *buf = &dd->buf;
+ s32 ret;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return;
+
+ if (raw_smp_processor_id() != dd->cpu) {
+ scx_error(sch, "scx_bpf_dump() must only be called from ops.dump() and friends");
+ return;
+ }
+
+ /* append the formatted string to the line buf */
+ ret = __bstr_format(sch, buf->data, buf->line + dd->cursor,
+ sizeof(buf->line) - dd->cursor, fmt, data, data__sz);
+ if (ret < 0) {
+ dump_line(dd->s, "%s[!] (\"%s\", %p, %u) failed to format (%d)",
+ dd->prefix, fmt, data, data__sz, ret);
+ return;
+ }
+
+ dd->cursor += ret;
+ dd->cursor = min_t(s32, dd->cursor, sizeof(buf->line));
+
+ if (!dd->cursor)
+ return;
+
+ /*
+ * If the line buf overflowed or ends in a newline, flush it into the
+ * dump. This is to allow the caller to generate a single line over
+ * multiple calls. As ops_dump_flush() can also handle multiple lines in
+ * the line buf, the only case which can lead to an unexpected
+ * truncation is when the caller keeps generating newlines in the middle
+ * instead of the end consecutively. Don't do that.
+ */
+ if (dd->cursor >= sizeof(buf->line) || buf->line[dd->cursor - 1] == '\n')
+ ops_dump_flush();
+}
+
+/**
+ * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ
+ *
+ * Iterate over all of the tasks currently enqueued on the local DSQ of the
+ * caller's CPU, and re-enqueue them in the BPF scheduler. Can be called from
+ * anywhere.
+ */
+__bpf_kfunc void scx_bpf_reenqueue_local___v2(void)
+{
+ struct rq *rq;
+
+ guard(preempt)();
+
+ rq = this_rq();
+ local_set(&rq->scx.reenq_local_deferred, 1);
+ schedule_deferred(rq);
+}
+
+/**
+ * scx_bpf_cpuperf_cap - Query the maximum relative capacity of a CPU
+ * @cpu: CPU of interest
+ *
+ * Return the maximum relative capacity of @cpu in relation to the most
+ * performant CPU in the system. The return value is in the range [1,
+ * %SCX_CPUPERF_ONE]. See scx_bpf_cpuperf_cur().
+ */
+__bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu)
+{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (likely(sch) && ops_cpu_valid(sch, cpu, NULL))
+ return arch_scale_cpu_capacity(cpu);
+ else
+ return SCX_CPUPERF_ONE;
+}
+
+/**
+ * scx_bpf_cpuperf_cur - Query the current relative performance of a CPU
+ * @cpu: CPU of interest
+ *
+ * Return the current relative performance of @cpu in relation to its maximum.
+ * The return value is in the range [1, %SCX_CPUPERF_ONE].
+ *
+ * The current performance level of a CPU in relation to the maximum performance
+ * available in the system can be calculated as follows:
+ *
+ * scx_bpf_cpuperf_cap() * scx_bpf_cpuperf_cur() / %SCX_CPUPERF_ONE
+ *
+ * The result is in the range [1, %SCX_CPUPERF_ONE].
+ */
+__bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu)
+{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (likely(sch) && ops_cpu_valid(sch, cpu, NULL))
+ return arch_scale_freq_capacity(cpu);
+ else
+ return SCX_CPUPERF_ONE;
+}
+
+/**
+ * scx_bpf_cpuperf_set - Set the relative performance target of a CPU
+ * @cpu: CPU of interest
+ * @perf: target performance level [0, %SCX_CPUPERF_ONE]
+ *
+ * Set the target performance level of @cpu to @perf. @perf is in linear
+ * relative scale between 0 and %SCX_CPUPERF_ONE. This determines how the
+ * schedutil cpufreq governor chooses the target frequency.
+ *
+ * The actual performance level chosen, CPU grouping, and the overhead and
+ * latency of the operations are dependent on the hardware and cpufreq driver in
+ * use. Consult hardware and cpufreq documentation for more information. The
+ * current performance level can be monitored using scx_bpf_cpuperf_cur().
+ */
+__bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf)
+{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return;
+
+ if (unlikely(perf > SCX_CPUPERF_ONE)) {
+ scx_error(sch, "Invalid cpuperf target %u for CPU %d", perf, cpu);
+ return;
+ }
+
+ if (ops_cpu_valid(sch, cpu, NULL)) {
+ struct rq *rq = cpu_rq(cpu), *locked_rq = scx_locked_rq();
+ struct rq_flags rf;
+
+ /*
+ * When called with an rq lock held, restrict the operation
+ * to the corresponding CPU to prevent ABBA deadlocks.
+ */
+ if (locked_rq && rq != locked_rq) {
+ scx_error(sch, "Invalid target CPU %d", cpu);
+ return;
+ }
+
+ /*
+ * If no rq lock is held, allow to operate on any CPU by
+ * acquiring the corresponding rq lock.
+ */
+ if (!locked_rq) {
+ rq_lock_irqsave(rq, &rf);
+ update_rq_clock(rq);
+ }
+
+ rq->scx.cpuperf_target = perf;
+ cpufreq_update_util(rq, 0);
+
+ if (!locked_rq)
+ rq_unlock_irqrestore(rq, &rf);
+ }
+}
+
+/**
+ * scx_bpf_nr_node_ids - Return the number of possible node IDs
+ *
+ * All valid node IDs in the system are smaller than the returned value.
+ */
+__bpf_kfunc u32 scx_bpf_nr_node_ids(void)
+{
+ return nr_node_ids;
+}
+
+/**
+ * scx_bpf_nr_cpu_ids - Return the number of possible CPU IDs
+ *
+ * All valid CPU IDs in the system are smaller than the returned value.
+ */
+__bpf_kfunc u32 scx_bpf_nr_cpu_ids(void)
+{
+ return nr_cpu_ids;
+}
+
+/**
+ * scx_bpf_get_possible_cpumask - Get a referenced kptr to cpu_possible_mask
+ */
+__bpf_kfunc const struct cpumask *scx_bpf_get_possible_cpumask(void)
+{
+ return cpu_possible_mask;
+}
+
+/**
+ * scx_bpf_get_online_cpumask - Get a referenced kptr to cpu_online_mask
+ */
+__bpf_kfunc const struct cpumask *scx_bpf_get_online_cpumask(void)
+{
+ return cpu_online_mask;
+}
+
+/**
+ * scx_bpf_put_cpumask - Release a possible/online cpumask
+ * @cpumask: cpumask to release
+ */
+__bpf_kfunc void scx_bpf_put_cpumask(const struct cpumask *cpumask)
+{
+ /*
+ * Empty function body because we aren't actually acquiring or releasing
+ * a reference to a global cpumask, which is read-only in the caller and
+ * is never released. The acquire / release semantics here are just used
+ * to make the cpumask is a trusted pointer in the caller.
+ */
+}
+
+/**
+ * scx_bpf_task_running - Is task currently running?
+ * @p: task of interest
+ */
+__bpf_kfunc bool scx_bpf_task_running(const struct task_struct *p)
+{
+ return task_rq(p)->curr == p;
+}
+
+/**
+ * scx_bpf_task_cpu - CPU a task is currently associated with
+ * @p: task of interest
+ */
+__bpf_kfunc s32 scx_bpf_task_cpu(const struct task_struct *p)
+{
+ return task_cpu(p);
+}
+
+/**
+ * scx_bpf_cpu_rq - Fetch the rq of a CPU
+ * @cpu: CPU of the rq
+ */
+__bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu)
+{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return NULL;
+
+ if (!ops_cpu_valid(sch, cpu, NULL))
+ return NULL;
+
+ if (!sch->warned_deprecated_rq) {
+ printk_deferred(KERN_WARNING "sched_ext: %s() is deprecated; "
+ "use scx_bpf_locked_rq() when holding rq lock "
+ "or scx_bpf_cpu_curr() to read remote curr safely.\n", __func__);
+ sch->warned_deprecated_rq = true;
+ }
+
+ return cpu_rq(cpu);
+}
+
+/**
+ * scx_bpf_locked_rq - Return the rq currently locked by SCX
+ *
+ * Returns the rq if a rq lock is currently held by SCX.
+ * Otherwise emits an error and returns NULL.
+ */
+__bpf_kfunc struct rq *scx_bpf_locked_rq(void)
+{
+ struct scx_sched *sch;
+ struct rq *rq;
+
+ guard(preempt)();
+
+ sch = rcu_dereference_sched(scx_root);
+ if (unlikely(!sch))
+ return NULL;
+
+ rq = scx_locked_rq();
+ if (!rq) {
+ scx_error(sch, "accessing rq without holding rq lock");
+ return NULL;
+ }
+
+ return rq;
+}
+
+/**
+ * scx_bpf_cpu_curr - Return remote CPU's curr task
+ * @cpu: CPU of interest
+ *
+ * Callers must hold RCU read lock (KF_RCU).
+ */
+__bpf_kfunc struct task_struct *scx_bpf_cpu_curr(s32 cpu)
+{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return NULL;
+
+ if (!ops_cpu_valid(sch, cpu, NULL))
+ return NULL;
+
+ return rcu_dereference(cpu_rq(cpu)->curr);
+}
+
+/**
+ * scx_bpf_task_cgroup - Return the sched cgroup of a task
+ * @p: task of interest
+ *
+ * @p->sched_task_group->css.cgroup represents the cgroup @p is associated with
+ * from the scheduler's POV. SCX operations should use this function to
+ * determine @p's current cgroup as, unlike following @p->cgroups,
+ * @p->sched_task_group is protected by @p's rq lock and thus atomic w.r.t. all
+ * rq-locked operations. Can be called on the parameter tasks of rq-locked
+ * operations. The restriction guarantees that @p's rq is locked by the caller.
+ */
+#ifdef CONFIG_CGROUP_SCHED
+__bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p)
+{
+ struct task_group *tg = p->sched_task_group;
+ struct cgroup *cgrp = &cgrp_dfl_root.cgrp;
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ goto out;
+
+ if (!scx_kf_allowed_on_arg_tasks(sch, __SCX_KF_RQ_LOCKED, p))
+ goto out;
+
+ cgrp = tg_cgrp(tg);
+
+out:
+ cgroup_get(cgrp);
+ return cgrp;
+}
+#endif
+
+/**
+ * scx_bpf_now - Returns a high-performance monotonically non-decreasing
+ * clock for the current CPU. The clock returned is in nanoseconds.
+ *
+ * It provides the following properties:
+ *
+ * 1) High performance: Many BPF schedulers call bpf_ktime_get_ns() frequently
+ * to account for execution time and track tasks' runtime properties.
+ * Unfortunately, in some hardware platforms, bpf_ktime_get_ns() -- which
+ * eventually reads a hardware timestamp counter -- is neither performant nor
+ * scalable. scx_bpf_now() aims to provide a high-performance clock by
+ * using the rq clock in the scheduler core whenever possible.
+ *
+ * 2) High enough resolution for the BPF scheduler use cases: In most BPF
+ * scheduler use cases, the required clock resolution is lower than the most
+ * accurate hardware clock (e.g., rdtsc in x86). scx_bpf_now() basically
+ * uses the rq clock in the scheduler core whenever it is valid. It considers
+ * that the rq clock is valid from the time the rq clock is updated
+ * (update_rq_clock) until the rq is unlocked (rq_unpin_lock).
+ *
+ * 3) Monotonically non-decreasing clock for the same CPU: scx_bpf_now()
+ * guarantees the clock never goes backward when comparing them in the same
+ * CPU. On the other hand, when comparing clocks in different CPUs, there
+ * is no such guarantee -- the clock can go backward. It provides a
+ * monotonically *non-decreasing* clock so that it would provide the same
+ * clock values in two different scx_bpf_now() calls in the same CPU
+ * during the same period of when the rq clock is valid.
+ */
+__bpf_kfunc u64 scx_bpf_now(void)
+{
+ struct rq *rq;
+ u64 clock;
+
+ preempt_disable();
+
+ rq = this_rq();
+ if (smp_load_acquire(&rq->scx.flags) & SCX_RQ_CLK_VALID) {
+ /*
+ * If the rq clock is valid, use the cached rq clock.
+ *
+ * Note that scx_bpf_now() is re-entrant between a process
+ * context and an interrupt context (e.g., timer interrupt).
+ * However, we don't need to consider the race between them
+ * because such race is not observable from a caller.
+ */
+ clock = READ_ONCE(rq->scx.clock);
+ } else {
+ /*
+ * Otherwise, return a fresh rq clock.
+ *
+ * The rq clock is updated outside of the rq lock.
+ * In this case, keep the updated rq clock invalid so the next
+ * kfunc call outside the rq lock gets a fresh rq clock.
+ */
+ clock = sched_clock_cpu(cpu_of(rq));
+ }
+
+ preempt_enable();
+
+ return clock;
+}
+
+static void scx_read_events(struct scx_sched *sch, struct scx_event_stats *events)
+{
+ struct scx_event_stats *e_cpu;
+ int cpu;
+
+ /* Aggregate per-CPU event counters into @events. */
+ memset(events, 0, sizeof(*events));
+ for_each_possible_cpu(cpu) {
+ e_cpu = &per_cpu_ptr(sch->pcpu, cpu)->event_stats;
+ scx_agg_event(events, e_cpu, SCX_EV_SELECT_CPU_FALLBACK);
+ scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
+ scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_KEEP_LAST);
+ scx_agg_event(events, e_cpu, SCX_EV_ENQ_SKIP_EXITING);
+ scx_agg_event(events, e_cpu, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED);
+ scx_agg_event(events, e_cpu, SCX_EV_REFILL_SLICE_DFL);
+ scx_agg_event(events, e_cpu, SCX_EV_BYPASS_DURATION);
+ scx_agg_event(events, e_cpu, SCX_EV_BYPASS_DISPATCH);
+ scx_agg_event(events, e_cpu, SCX_EV_BYPASS_ACTIVATE);
+ }
+}
+
+/*
+ * scx_bpf_events - Get a system-wide event counter to
+ * @events: output buffer from a BPF program
+ * @events__sz: @events len, must end in '__sz'' for the verifier
+ */
+__bpf_kfunc void scx_bpf_events(struct scx_event_stats *events,
+ size_t events__sz)
+{
+ struct scx_sched *sch;
+ struct scx_event_stats e_sys;
+
+ rcu_read_lock();
+ sch = rcu_dereference(scx_root);
+ if (sch)
+ scx_read_events(sch, &e_sys);
+ else
+ memset(&e_sys, 0, sizeof(e_sys));
+ rcu_read_unlock();
+
+ /*
+ * We cannot entirely trust a BPF-provided size since a BPF program
+ * might be compiled against a different vmlinux.h, of which
+ * scx_event_stats would be larger (a newer vmlinux.h) or smaller
+ * (an older vmlinux.h). Hence, we use the smaller size to avoid
+ * memory corruption.
+ */
+ events__sz = min(events__sz, sizeof(*events));
+ memcpy(events, &e_sys, events__sz);
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(scx_kfunc_ids_any)
+BTF_ID_FLAGS(func, scx_bpf_task_set_slice, KF_RCU);
+BTF_ID_FLAGS(func, scx_bpf_task_set_dsq_vtime, KF_RCU);
+BTF_ID_FLAGS(func, scx_bpf_kick_cpu)
+BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued)
+BTF_ID_FLAGS(func, scx_bpf_destroy_dsq)
+BTF_ID_FLAGS(func, scx_bpf_dsq_peek, KF_RCU_PROTECTED | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_scx_dsq_new, KF_ITER_NEW | KF_RCU_PROTECTED)
+BTF_ID_FLAGS(func, bpf_iter_scx_dsq_next, KF_ITER_NEXT | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_scx_dsq_destroy, KF_ITER_DESTROY)
+BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_reenqueue_local___v2)
+BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap)
+BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur)
+BTF_ID_FLAGS(func, scx_bpf_cpuperf_set)
+BTF_ID_FLAGS(func, scx_bpf_nr_node_ids)
+BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids)
+BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE)
+BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE)
+BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE)
+BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_cpu_rq)
+BTF_ID_FLAGS(func, scx_bpf_locked_rq, KF_RET_NULL)
+BTF_ID_FLAGS(func, scx_bpf_cpu_curr, KF_RET_NULL | KF_RCU_PROTECTED)
+#ifdef CONFIG_CGROUP_SCHED
+BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE)
+#endif
+BTF_ID_FLAGS(func, scx_bpf_now)
+BTF_ID_FLAGS(func, scx_bpf_events, KF_TRUSTED_ARGS)
+BTF_KFUNCS_END(scx_kfunc_ids_any)
+
+static const struct btf_kfunc_id_set scx_kfunc_set_any = {
+ .owner = THIS_MODULE,
+ .set = &scx_kfunc_ids_any,
+};
+
+static int __init scx_init(void)
+{
+ int ret;
+
+ /*
+ * kfunc registration can't be done from init_sched_ext_class() as
+ * register_btf_kfunc_id_set() needs most of the system to be up.
+ *
+ * Some kfuncs are context-sensitive and can only be called from
+ * specific SCX ops. They are grouped into BTF sets accordingly.
+ * Unfortunately, BPF currently doesn't have a way of enforcing such
+ * restrictions. Eventually, the verifier should be able to enforce
+ * them. For now, register them the same and make each kfunc explicitly
+ * check using scx_kf_allowed().
+ */
+ if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
+ &scx_kfunc_set_enqueue_dispatch)) ||
+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
+ &scx_kfunc_set_dispatch)) ||
+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
+ &scx_kfunc_set_cpu_release)) ||
+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
+ &scx_kfunc_set_unlocked)) ||
+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL,
+ &scx_kfunc_set_unlocked)) ||
+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
+ &scx_kfunc_set_any)) ||
+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
+ &scx_kfunc_set_any)) ||
+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL,
+ &scx_kfunc_set_any))) {
+ pr_err("sched_ext: Failed to register kfunc sets (%d)\n", ret);
+ return ret;
+ }
+
+ ret = scx_idle_init();
+ if (ret) {
+ pr_err("sched_ext: Failed to initialize idle tracking (%d)\n", ret);
+ return ret;
+ }
+
+ ret = register_bpf_struct_ops(&bpf_sched_ext_ops, sched_ext_ops);
+ if (ret) {
+ pr_err("sched_ext: Failed to register struct_ops (%d)\n", ret);
+ return ret;
+ }
+
+ ret = register_pm_notifier(&scx_pm_notifier);
+ if (ret) {
+ pr_err("sched_ext: Failed to register PM notifier (%d)\n", ret);
+ return ret;
+ }
+
+ scx_kset = kset_create_and_add("sched_ext", &scx_uevent_ops, kernel_kobj);
+ if (!scx_kset) {
+ pr_err("sched_ext: Failed to create /sys/kernel/sched_ext\n");
+ return -ENOMEM;
+ }
+
+ ret = sysfs_create_group(&scx_kset->kobj, &scx_global_attr_group);
+ if (ret < 0) {
+ pr_err("sched_ext: Failed to add global attributes\n");
+ return ret;
+ }
+
+ if (!alloc_cpumask_var(&scx_bypass_lb_donee_cpumask, GFP_KERNEL) ||
+ !alloc_cpumask_var(&scx_bypass_lb_resched_cpumask, GFP_KERNEL)) {
+ pr_err("sched_ext: Failed to allocate cpumasks\n");
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+__initcall(scx_init);
diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h
new file mode 100644
index 000000000000..43429b33e52c
--- /dev/null
+++ b/kernel/sched/ext.h
@@ -0,0 +1,95 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
+ *
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
+ * Copyright (c) 2022 David Vernet <dvernet@meta.com>
+ */
+#ifdef CONFIG_SCHED_CLASS_EXT
+
+void scx_tick(struct rq *rq);
+void init_scx_entity(struct sched_ext_entity *scx);
+void scx_pre_fork(struct task_struct *p);
+int scx_fork(struct task_struct *p);
+void scx_post_fork(struct task_struct *p);
+void scx_cancel_fork(struct task_struct *p);
+bool scx_can_stop_tick(struct rq *rq);
+void scx_rq_activate(struct rq *rq);
+void scx_rq_deactivate(struct rq *rq);
+int scx_check_setscheduler(struct task_struct *p, int policy);
+bool task_should_scx(int policy);
+bool scx_allow_ttwu_queue(const struct task_struct *p);
+void init_sched_ext_class(void);
+
+static inline u32 scx_cpuperf_target(s32 cpu)
+{
+ if (scx_enabled())
+ return cpu_rq(cpu)->scx.cpuperf_target;
+ else
+ return 0;
+}
+
+static inline bool task_on_scx(const struct task_struct *p)
+{
+ return scx_enabled() && p->sched_class == &ext_sched_class;
+}
+
+#ifdef CONFIG_SCHED_CORE
+bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
+ bool in_fi);
+#endif
+
+#else /* CONFIG_SCHED_CLASS_EXT */
+
+static inline void scx_tick(struct rq *rq) {}
+static inline void scx_pre_fork(struct task_struct *p) {}
+static inline int scx_fork(struct task_struct *p) { return 0; }
+static inline void scx_post_fork(struct task_struct *p) {}
+static inline void scx_cancel_fork(struct task_struct *p) {}
+static inline u32 scx_cpuperf_target(s32 cpu) { return 0; }
+static inline bool scx_can_stop_tick(struct rq *rq) { return true; }
+static inline void scx_rq_activate(struct rq *rq) {}
+static inline void scx_rq_deactivate(struct rq *rq) {}
+static inline int scx_check_setscheduler(struct task_struct *p, int policy) { return 0; }
+static inline bool task_on_scx(const struct task_struct *p) { return false; }
+static inline bool scx_allow_ttwu_queue(const struct task_struct *p) { return true; }
+static inline void init_sched_ext_class(void) {}
+
+#endif /* CONFIG_SCHED_CLASS_EXT */
+
+#ifdef CONFIG_SCHED_CLASS_EXT
+void __scx_update_idle(struct rq *rq, bool idle, bool do_notify);
+
+static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify)
+{
+ if (scx_enabled())
+ __scx_update_idle(rq, idle, do_notify);
+}
+#else
+static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify) {}
+#endif
+
+#ifdef CONFIG_CGROUP_SCHED
+#ifdef CONFIG_EXT_GROUP_SCHED
+void scx_tg_init(struct task_group *tg);
+int scx_tg_online(struct task_group *tg);
+void scx_tg_offline(struct task_group *tg);
+int scx_cgroup_can_attach(struct cgroup_taskset *tset);
+void scx_cgroup_move_task(struct task_struct *p);
+void scx_cgroup_cancel_attach(struct cgroup_taskset *tset);
+void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight);
+void scx_group_set_idle(struct task_group *tg, bool idle);
+void scx_group_set_bandwidth(struct task_group *tg, u64 period_us, u64 quota_us, u64 burst_us);
+#else /* CONFIG_EXT_GROUP_SCHED */
+static inline void scx_tg_init(struct task_group *tg) {}
+static inline int scx_tg_online(struct task_group *tg) { return 0; }
+static inline void scx_tg_offline(struct task_group *tg) {}
+static inline int scx_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; }
+static inline void scx_cgroup_move_task(struct task_struct *p) {}
+static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {}
+static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {}
+static inline void scx_group_set_idle(struct task_group *tg, bool idle) {}
+static inline void scx_group_set_bandwidth(struct task_group *tg, u64 period_us, u64 quota_us, u64 burst_us) {}
+#endif /* CONFIG_EXT_GROUP_SCHED */
+#endif /* CONFIG_CGROUP_SCHED */
diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
new file mode 100644
index 000000000000..3d9d404d5cd2
--- /dev/null
+++ b/kernel/sched/ext_idle.c
@@ -0,0 +1,1435 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
+ *
+ * Built-in idle CPU tracking policy.
+ *
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
+ * Copyright (c) 2022 David Vernet <dvernet@meta.com>
+ * Copyright (c) 2024 Andrea Righi <arighi@nvidia.com>
+ */
+#include "ext_idle.h"
+
+/* Enable/disable built-in idle CPU selection policy */
+static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled);
+
+/* Enable/disable per-node idle cpumasks */
+static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_per_node);
+
+/* Enable/disable LLC aware optimizations */
+static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc);
+
+/* Enable/disable NUMA aware optimizations */
+static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_numa);
+
+/*
+ * cpumasks to track idle CPUs within each NUMA node.
+ *
+ * If SCX_OPS_BUILTIN_IDLE_PER_NODE is not enabled, a single global cpumask
+ * from is used to track all the idle CPUs in the system.
+ */
+struct scx_idle_cpus {
+ cpumask_var_t cpu;
+ cpumask_var_t smt;
+};
+
+/*
+ * Global host-wide idle cpumasks (used when SCX_OPS_BUILTIN_IDLE_PER_NODE
+ * is not enabled).
+ */
+static struct scx_idle_cpus scx_idle_global_masks;
+
+/*
+ * Per-node idle cpumasks.
+ */
+static struct scx_idle_cpus **scx_idle_node_masks;
+
+/*
+ * Local per-CPU cpumasks (used to generate temporary idle cpumasks).
+ */
+static DEFINE_PER_CPU(cpumask_var_t, local_idle_cpumask);
+static DEFINE_PER_CPU(cpumask_var_t, local_llc_idle_cpumask);
+static DEFINE_PER_CPU(cpumask_var_t, local_numa_idle_cpumask);
+
+/*
+ * Return the idle masks associated to a target @node.
+ *
+ * NUMA_NO_NODE identifies the global idle cpumask.
+ */
+static struct scx_idle_cpus *idle_cpumask(int node)
+{
+ return node == NUMA_NO_NODE ? &scx_idle_global_masks : scx_idle_node_masks[node];
+}
+
+/*
+ * Returns the NUMA node ID associated with a @cpu, or NUMA_NO_NODE if
+ * per-node idle cpumasks are disabled.
+ */
+static int scx_cpu_node_if_enabled(int cpu)
+{
+ if (!static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node))
+ return NUMA_NO_NODE;
+
+ return cpu_to_node(cpu);
+}
+
+static bool scx_idle_test_and_clear_cpu(int cpu)
+{
+ int node = scx_cpu_node_if_enabled(cpu);
+ struct cpumask *idle_cpus = idle_cpumask(node)->cpu;
+
+#ifdef CONFIG_SCHED_SMT
+ /*
+ * SMT mask should be cleared whether we can claim @cpu or not. The SMT
+ * cluster is not wholly idle either way. This also prevents
+ * scx_pick_idle_cpu() from getting caught in an infinite loop.
+ */
+ if (sched_smt_active()) {
+ const struct cpumask *smt = cpu_smt_mask(cpu);
+ struct cpumask *idle_smts = idle_cpumask(node)->smt;
+
+ /*
+ * If offline, @cpu is not its own sibling and
+ * scx_pick_idle_cpu() can get caught in an infinite loop as
+ * @cpu is never cleared from the idle SMT mask. Ensure that
+ * @cpu is eventually cleared.
+ *
+ * NOTE: Use cpumask_intersects() and cpumask_test_cpu() to
+ * reduce memory writes, which may help alleviate cache
+ * coherence pressure.
+ */
+ if (cpumask_intersects(smt, idle_smts))
+ cpumask_andnot(idle_smts, idle_smts, smt);
+ else if (cpumask_test_cpu(cpu, idle_smts))
+ __cpumask_clear_cpu(cpu, idle_smts);
+ }
+#endif
+
+ return cpumask_test_and_clear_cpu(cpu, idle_cpus);
+}
+
+/*
+ * Pick an idle CPU in a specific NUMA node.
+ */
+static s32 pick_idle_cpu_in_node(const struct cpumask *cpus_allowed, int node, u64 flags)
+{
+ int cpu;
+
+retry:
+ if (sched_smt_active()) {
+ cpu = cpumask_any_and_distribute(idle_cpumask(node)->smt, cpus_allowed);
+ if (cpu < nr_cpu_ids)
+ goto found;
+
+ if (flags & SCX_PICK_IDLE_CORE)
+ return -EBUSY;
+ }
+
+ cpu = cpumask_any_and_distribute(idle_cpumask(node)->cpu, cpus_allowed);
+ if (cpu >= nr_cpu_ids)
+ return -EBUSY;
+
+found:
+ if (scx_idle_test_and_clear_cpu(cpu))
+ return cpu;
+ else
+ goto retry;
+}
+
+#ifdef CONFIG_NUMA
+/*
+ * Tracks nodes that have not yet been visited when searching for an idle
+ * CPU across all available nodes.
+ */
+static DEFINE_PER_CPU(nodemask_t, per_cpu_unvisited);
+
+/*
+ * Search for an idle CPU across all nodes, excluding @node.
+ */
+static s32 pick_idle_cpu_from_online_nodes(const struct cpumask *cpus_allowed, int node, u64 flags)
+{
+ nodemask_t *unvisited;
+ s32 cpu = -EBUSY;
+
+ preempt_disable();
+ unvisited = this_cpu_ptr(&per_cpu_unvisited);
+
+ /*
+ * Restrict the search to the online nodes (excluding the current
+ * node that has been visited already).
+ */
+ nodes_copy(*unvisited, node_states[N_ONLINE]);
+ node_clear(node, *unvisited);
+
+ /*
+ * Traverse all nodes in order of increasing distance, starting
+ * from @node.
+ *
+ * This loop is O(N^2), with N being the amount of NUMA nodes,
+ * which might be quite expensive in large NUMA systems. However,
+ * this complexity comes into play only when a scheduler enables
+ * SCX_OPS_BUILTIN_IDLE_PER_NODE and it's requesting an idle CPU
+ * without specifying a target NUMA node, so it shouldn't be a
+ * bottleneck is most cases.
+ *
+ * As a future optimization we may want to cache the list of nodes
+ * in a per-node array, instead of actually traversing them every
+ * time.
+ */
+ for_each_node_numadist(node, *unvisited) {
+ cpu = pick_idle_cpu_in_node(cpus_allowed, node, flags);
+ if (cpu >= 0)
+ break;
+ }
+ preempt_enable();
+
+ return cpu;
+}
+#else
+static inline s32
+pick_idle_cpu_from_online_nodes(const struct cpumask *cpus_allowed, int node, u64 flags)
+{
+ return -EBUSY;
+}
+#endif
+
+/*
+ * Find an idle CPU in the system, starting from @node.
+ */
+static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags)
+{
+ s32 cpu;
+
+ /*
+ * Always search in the starting node first (this is an
+ * optimization that can save some cycles even when the search is
+ * not limited to a single node).
+ */
+ cpu = pick_idle_cpu_in_node(cpus_allowed, node, flags);
+ if (cpu >= 0)
+ return cpu;
+
+ /*
+ * Stop the search if we are using only a single global cpumask
+ * (NUMA_NO_NODE) or if the search is restricted to the first node
+ * only.
+ */
+ if (node == NUMA_NO_NODE || flags & SCX_PICK_IDLE_IN_NODE)
+ return -EBUSY;
+
+ /*
+ * Extend the search to the other online nodes.
+ */
+ return pick_idle_cpu_from_online_nodes(cpus_allowed, node, flags);
+}
+
+/*
+ * Return the amount of CPUs in the same LLC domain of @cpu (or zero if the LLC
+ * domain is not defined).
+ */
+static unsigned int llc_weight(s32 cpu)
+{
+ struct sched_domain *sd;
+
+ sd = rcu_dereference(per_cpu(sd_llc, cpu));
+ if (!sd)
+ return 0;
+
+ return sd->span_weight;
+}
+
+/*
+ * Return the cpumask representing the LLC domain of @cpu (or NULL if the LLC
+ * domain is not defined).
+ */
+static struct cpumask *llc_span(s32 cpu)
+{
+ struct sched_domain *sd;
+
+ sd = rcu_dereference(per_cpu(sd_llc, cpu));
+ if (!sd)
+ return NULL;
+
+ return sched_domain_span(sd);
+}
+
+/*
+ * Return the amount of CPUs in the same NUMA domain of @cpu (or zero if the
+ * NUMA domain is not defined).
+ */
+static unsigned int numa_weight(s32 cpu)
+{
+ struct sched_domain *sd;
+ struct sched_group *sg;
+
+ sd = rcu_dereference(per_cpu(sd_numa, cpu));
+ if (!sd)
+ return 0;
+ sg = sd->groups;
+ if (!sg)
+ return 0;
+
+ return sg->group_weight;
+}
+
+/*
+ * Return the cpumask representing the NUMA domain of @cpu (or NULL if the NUMA
+ * domain is not defined).
+ */
+static struct cpumask *numa_span(s32 cpu)
+{
+ struct sched_domain *sd;
+ struct sched_group *sg;
+
+ sd = rcu_dereference(per_cpu(sd_numa, cpu));
+ if (!sd)
+ return NULL;
+ sg = sd->groups;
+ if (!sg)
+ return NULL;
+
+ return sched_group_span(sg);
+}
+
+/*
+ * Return true if the LLC domains do not perfectly overlap with the NUMA
+ * domains, false otherwise.
+ */
+static bool llc_numa_mismatch(void)
+{
+ int cpu;
+
+ /*
+ * We need to scan all online CPUs to verify whether their scheduling
+ * domains overlap.
+ *
+ * While it is rare to encounter architectures with asymmetric NUMA
+ * topologies, CPU hotplugging or virtualized environments can result
+ * in asymmetric configurations.
+ *
+ * For example:
+ *
+ * NUMA 0:
+ * - LLC 0: cpu0..cpu7
+ * - LLC 1: cpu8..cpu15 [offline]
+ *
+ * NUMA 1:
+ * - LLC 0: cpu16..cpu23
+ * - LLC 1: cpu24..cpu31
+ *
+ * In this case, if we only check the first online CPU (cpu0), we might
+ * incorrectly assume that the LLC and NUMA domains are fully
+ * overlapping, which is incorrect (as NUMA 1 has two distinct LLC
+ * domains).
+ */
+ for_each_online_cpu(cpu)
+ if (llc_weight(cpu) != numa_weight(cpu))
+ return true;
+
+ return false;
+}
+
+/*
+ * Initialize topology-aware scheduling.
+ *
+ * Detect if the system has multiple LLC or multiple NUMA domains and enable
+ * cache-aware / NUMA-aware scheduling optimizations in the default CPU idle
+ * selection policy.
+ *
+ * Assumption: the kernel's internal topology representation assumes that each
+ * CPU belongs to a single LLC domain, and that each LLC domain is entirely
+ * contained within a single NUMA node.
+ */
+void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops)
+{
+ bool enable_llc = false, enable_numa = false;
+ unsigned int nr_cpus;
+ s32 cpu = cpumask_first(cpu_online_mask);
+
+ /*
+ * Enable LLC domain optimization only when there are multiple LLC
+ * domains among the online CPUs. If all online CPUs are part of a
+ * single LLC domain, the idle CPU selection logic can choose any
+ * online CPU without bias.
+ *
+ * Note that it is sufficient to check the LLC domain of the first
+ * online CPU to determine whether a single LLC domain includes all
+ * CPUs.
+ */
+ rcu_read_lock();
+ nr_cpus = llc_weight(cpu);
+ if (nr_cpus > 0) {
+ if (nr_cpus < num_online_cpus())
+ enable_llc = true;
+ pr_debug("sched_ext: LLC=%*pb weight=%u\n",
+ cpumask_pr_args(llc_span(cpu)), llc_weight(cpu));
+ }
+
+ /*
+ * Enable NUMA optimization only when there are multiple NUMA domains
+ * among the online CPUs and the NUMA domains don't perfectly overlaps
+ * with the LLC domains.
+ *
+ * If all CPUs belong to the same NUMA node and the same LLC domain,
+ * enabling both NUMA and LLC optimizations is unnecessary, as checking
+ * for an idle CPU in the same domain twice is redundant.
+ *
+ * If SCX_OPS_BUILTIN_IDLE_PER_NODE is enabled ignore the NUMA
+ * optimization, as we would naturally select idle CPUs within
+ * specific NUMA nodes querying the corresponding per-node cpumask.
+ */
+ if (!(ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE)) {
+ nr_cpus = numa_weight(cpu);
+ if (nr_cpus > 0) {
+ if (nr_cpus < num_online_cpus() && llc_numa_mismatch())
+ enable_numa = true;
+ pr_debug("sched_ext: NUMA=%*pb weight=%u\n",
+ cpumask_pr_args(numa_span(cpu)), nr_cpus);
+ }
+ }
+ rcu_read_unlock();
+
+ pr_debug("sched_ext: LLC idle selection %s\n",
+ str_enabled_disabled(enable_llc));
+ pr_debug("sched_ext: NUMA idle selection %s\n",
+ str_enabled_disabled(enable_numa));
+
+ if (enable_llc)
+ static_branch_enable_cpuslocked(&scx_selcpu_topo_llc);
+ else
+ static_branch_disable_cpuslocked(&scx_selcpu_topo_llc);
+ if (enable_numa)
+ static_branch_enable_cpuslocked(&scx_selcpu_topo_numa);
+ else
+ static_branch_disable_cpuslocked(&scx_selcpu_topo_numa);
+}
+
+/*
+ * Return true if @p can run on all possible CPUs, false otherwise.
+ */
+static inline bool task_affinity_all(const struct task_struct *p)
+{
+ return p->nr_cpus_allowed >= num_possible_cpus();
+}
+
+/*
+ * Built-in CPU idle selection policy:
+ *
+ * 1. Prioritize full-idle cores:
+ * - always prioritize CPUs from fully idle cores (both logical CPUs are
+ * idle) to avoid interference caused by SMT.
+ *
+ * 2. Reuse the same CPU:
+ * - prefer the last used CPU to take advantage of cached data (L1, L2) and
+ * branch prediction optimizations.
+ *
+ * 3. Pick a CPU within the same LLC (Last-Level Cache):
+ * - if the above conditions aren't met, pick a CPU that shares the same
+ * LLC, if the LLC domain is a subset of @cpus_allowed, to maintain
+ * cache locality.
+ *
+ * 4. Pick a CPU within the same NUMA node, if enabled:
+ * - choose a CPU from the same NUMA node, if the node cpumask is a
+ * subset of @cpus_allowed, to reduce memory access latency.
+ *
+ * 5. Pick any idle CPU within the @cpus_allowed domain.
+ *
+ * Step 3 and 4 are performed only if the system has, respectively,
+ * multiple LLCs / multiple NUMA nodes (see scx_selcpu_topo_llc and
+ * scx_selcpu_topo_numa) and they don't contain the same subset of CPUs.
+ *
+ * If %SCX_OPS_BUILTIN_IDLE_PER_NODE is enabled, the search will always
+ * begin in @prev_cpu's node and proceed to other nodes in order of
+ * increasing distance.
+ *
+ * Return the picked CPU if idle, or a negative value otherwise.
+ *
+ * NOTE: tasks that can only run on 1 CPU are excluded by this logic, because
+ * we never call ops.select_cpu() for them, see select_task_rq().
+ */
+s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
+ const struct cpumask *cpus_allowed, u64 flags)
+{
+ const struct cpumask *llc_cpus = NULL, *numa_cpus = NULL;
+ const struct cpumask *allowed = cpus_allowed ?: p->cpus_ptr;
+ int node = scx_cpu_node_if_enabled(prev_cpu);
+ bool is_prev_allowed;
+ s32 cpu;
+
+ preempt_disable();
+
+ /*
+ * Check whether @prev_cpu is still within the allowed set. If not,
+ * we can still try selecting a nearby CPU.
+ */
+ is_prev_allowed = cpumask_test_cpu(prev_cpu, allowed);
+
+ /*
+ * Determine the subset of CPUs usable by @p within @cpus_allowed.
+ */
+ if (allowed != p->cpus_ptr) {
+ struct cpumask *local_cpus = this_cpu_cpumask_var_ptr(local_idle_cpumask);
+
+ if (task_affinity_all(p)) {
+ allowed = cpus_allowed;
+ } else if (cpumask_and(local_cpus, cpus_allowed, p->cpus_ptr)) {
+ allowed = local_cpus;
+ } else {
+ cpu = -EBUSY;
+ goto out_enable;
+ }
+ }
+
+ /*
+ * This is necessary to protect llc_cpus.
+ */
+ rcu_read_lock();
+
+ /*
+ * Determine the subset of CPUs that the task can use in its
+ * current LLC and node.
+ *
+ * If the task can run on all CPUs, use the node and LLC cpumasks
+ * directly.
+ */
+ if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa)) {
+ struct cpumask *local_cpus = this_cpu_cpumask_var_ptr(local_numa_idle_cpumask);
+ const struct cpumask *cpus = numa_span(prev_cpu);
+
+ if (allowed == p->cpus_ptr && task_affinity_all(p))
+ numa_cpus = cpus;
+ else if (cpus && cpumask_and(local_cpus, allowed, cpus))
+ numa_cpus = local_cpus;
+ }
+
+ if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc)) {
+ struct cpumask *local_cpus = this_cpu_cpumask_var_ptr(local_llc_idle_cpumask);
+ const struct cpumask *cpus = llc_span(prev_cpu);
+
+ if (allowed == p->cpus_ptr && task_affinity_all(p))
+ llc_cpus = cpus;
+ else if (cpus && cpumask_and(local_cpus, allowed, cpus))
+ llc_cpus = local_cpus;
+ }
+
+ /*
+ * If WAKE_SYNC, try to migrate the wakee to the waker's CPU.
+ */
+ if (wake_flags & SCX_WAKE_SYNC) {
+ int waker_node;
+
+ /*
+ * If the waker's CPU is cache affine and prev_cpu is idle,
+ * then avoid a migration.
+ */
+ cpu = smp_processor_id();
+ if (is_prev_allowed && cpus_share_cache(cpu, prev_cpu) &&
+ scx_idle_test_and_clear_cpu(prev_cpu)) {
+ cpu = prev_cpu;
+ goto out_unlock;
+ }
+
+ /*
+ * If the waker's local DSQ is empty, and the system is under
+ * utilized, try to wake up @p to the local DSQ of the waker.
+ *
+ * Checking only for an empty local DSQ is insufficient as it
+ * could give the wakee an unfair advantage when the system is
+ * oversaturated.
+ *
+ * Checking only for the presence of idle CPUs is also
+ * insufficient as the local DSQ of the waker could have tasks
+ * piled up on it even if there is an idle core elsewhere on
+ * the system.
+ */
+ waker_node = cpu_to_node(cpu);
+ if (!(current->flags & PF_EXITING) &&
+ cpu_rq(cpu)->scx.local_dsq.nr == 0 &&
+ (!(flags & SCX_PICK_IDLE_IN_NODE) || (waker_node == node)) &&
+ !cpumask_empty(idle_cpumask(waker_node)->cpu)) {
+ if (cpumask_test_cpu(cpu, allowed))
+ goto out_unlock;
+ }
+ }
+
+ /*
+ * If CPU has SMT, any wholly idle CPU is likely a better pick than
+ * partially idle @prev_cpu.
+ */
+ if (sched_smt_active()) {
+ /*
+ * Keep using @prev_cpu if it's part of a fully idle core.
+ */
+ if (is_prev_allowed &&
+ cpumask_test_cpu(prev_cpu, idle_cpumask(node)->smt) &&
+ scx_idle_test_and_clear_cpu(prev_cpu)) {
+ cpu = prev_cpu;
+ goto out_unlock;
+ }
+
+ /*
+ * Search for any fully idle core in the same LLC domain.
+ */
+ if (llc_cpus) {
+ cpu = pick_idle_cpu_in_node(llc_cpus, node, SCX_PICK_IDLE_CORE);
+ if (cpu >= 0)
+ goto out_unlock;
+ }
+
+ /*
+ * Search for any fully idle core in the same NUMA node.
+ */
+ if (numa_cpus) {
+ cpu = pick_idle_cpu_in_node(numa_cpus, node, SCX_PICK_IDLE_CORE);
+ if (cpu >= 0)
+ goto out_unlock;
+ }
+
+ /*
+ * Search for any full-idle core usable by the task.
+ *
+ * If the node-aware idle CPU selection policy is enabled
+ * (%SCX_OPS_BUILTIN_IDLE_PER_NODE), the search will always
+ * begin in prev_cpu's node and proceed to other nodes in
+ * order of increasing distance.
+ */
+ cpu = scx_pick_idle_cpu(allowed, node, flags | SCX_PICK_IDLE_CORE);
+ if (cpu >= 0)
+ goto out_unlock;
+
+ /*
+ * Give up if we're strictly looking for a full-idle SMT
+ * core.
+ */
+ if (flags & SCX_PICK_IDLE_CORE) {
+ cpu = -EBUSY;
+ goto out_unlock;
+ }
+ }
+
+ /*
+ * Use @prev_cpu if it's idle.
+ */
+ if (is_prev_allowed && scx_idle_test_and_clear_cpu(prev_cpu)) {
+ cpu = prev_cpu;
+ goto out_unlock;
+ }
+
+ /*
+ * Search for any idle CPU in the same LLC domain.
+ */
+ if (llc_cpus) {
+ cpu = pick_idle_cpu_in_node(llc_cpus, node, 0);
+ if (cpu >= 0)
+ goto out_unlock;
+ }
+
+ /*
+ * Search for any idle CPU in the same NUMA node.
+ */
+ if (numa_cpus) {
+ cpu = pick_idle_cpu_in_node(numa_cpus, node, 0);
+ if (cpu >= 0)
+ goto out_unlock;
+ }
+
+ /*
+ * Search for any idle CPU usable by the task.
+ *
+ * If the node-aware idle CPU selection policy is enabled
+ * (%SCX_OPS_BUILTIN_IDLE_PER_NODE), the search will always begin
+ * in prev_cpu's node and proceed to other nodes in order of
+ * increasing distance.
+ */
+ cpu = scx_pick_idle_cpu(allowed, node, flags);
+
+out_unlock:
+ rcu_read_unlock();
+out_enable:
+ preempt_enable();
+
+ return cpu;
+}
+
+/*
+ * Initialize global and per-node idle cpumasks.
+ */
+void scx_idle_init_masks(void)
+{
+ int i;
+
+ /* Allocate global idle cpumasks */
+ BUG_ON(!alloc_cpumask_var(&scx_idle_global_masks.cpu, GFP_KERNEL));
+ BUG_ON(!alloc_cpumask_var(&scx_idle_global_masks.smt, GFP_KERNEL));
+
+ /* Allocate per-node idle cpumasks */
+ scx_idle_node_masks = kcalloc(num_possible_nodes(),
+ sizeof(*scx_idle_node_masks), GFP_KERNEL);
+ BUG_ON(!scx_idle_node_masks);
+
+ for_each_node(i) {
+ scx_idle_node_masks[i] = kzalloc_node(sizeof(**scx_idle_node_masks),
+ GFP_KERNEL, i);
+ BUG_ON(!scx_idle_node_masks[i]);
+
+ BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[i]->cpu, GFP_KERNEL, i));
+ BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[i]->smt, GFP_KERNEL, i));
+ }
+
+ /* Allocate local per-cpu idle cpumasks */
+ for_each_possible_cpu(i) {
+ BUG_ON(!alloc_cpumask_var_node(&per_cpu(local_idle_cpumask, i),
+ GFP_KERNEL, cpu_to_node(i)));
+ BUG_ON(!alloc_cpumask_var_node(&per_cpu(local_llc_idle_cpumask, i),
+ GFP_KERNEL, cpu_to_node(i)));
+ BUG_ON(!alloc_cpumask_var_node(&per_cpu(local_numa_idle_cpumask, i),
+ GFP_KERNEL, cpu_to_node(i)));
+ }
+}
+
+static void update_builtin_idle(int cpu, bool idle)
+{
+ int node = scx_cpu_node_if_enabled(cpu);
+ struct cpumask *idle_cpus = idle_cpumask(node)->cpu;
+
+ assign_cpu(cpu, idle_cpus, idle);
+
+#ifdef CONFIG_SCHED_SMT
+ if (sched_smt_active()) {
+ const struct cpumask *smt = cpu_smt_mask(cpu);
+ struct cpumask *idle_smts = idle_cpumask(node)->smt;
+
+ if (idle) {
+ /*
+ * idle_smt handling is racy but that's fine as it's
+ * only for optimization and self-correcting.
+ */
+ if (!cpumask_subset(smt, idle_cpus))
+ return;
+ cpumask_or(idle_smts, idle_smts, smt);
+ } else {
+ cpumask_andnot(idle_smts, idle_smts, smt);
+ }
+ }
+#endif
+}
+
+/*
+ * Update the idle state of a CPU to @idle.
+ *
+ * If @do_notify is true, ops.update_idle() is invoked to notify the scx
+ * scheduler of an actual idle state transition (idle to busy or vice
+ * versa). If @do_notify is false, only the idle state in the idle masks is
+ * refreshed without invoking ops.update_idle().
+ *
+ * This distinction is necessary, because an idle CPU can be "reserved" and
+ * awakened via scx_bpf_pick_idle_cpu() + scx_bpf_kick_cpu(), marking it as
+ * busy even if no tasks are dispatched. In this case, the CPU may return
+ * to idle without a true state transition. Refreshing the idle masks
+ * without invoking ops.update_idle() ensures accurate idle state tracking
+ * while avoiding unnecessary updates and maintaining balanced state
+ * transitions.
+ */
+void __scx_update_idle(struct rq *rq, bool idle, bool do_notify)
+{
+ struct scx_sched *sch = scx_root;
+ int cpu = cpu_of(rq);
+
+ lockdep_assert_rq_held(rq);
+
+ /*
+ * Update the idle masks:
+ * - for real idle transitions (do_notify == true)
+ * - for idle-to-idle transitions (indicated by the previous task
+ * being the idle thread, managed by pick_task_idle())
+ *
+ * Skip updating idle masks if the previous task is not the idle
+ * thread, since set_next_task_idle() has already handled it when
+ * transitioning from a task to the idle thread (calling this
+ * function with do_notify == true).
+ *
+ * In this way we can avoid updating the idle masks twice,
+ * unnecessarily.
+ */
+ if (static_branch_likely(&scx_builtin_idle_enabled))
+ if (do_notify || is_idle_task(rq->curr))
+ update_builtin_idle(cpu, idle);
+
+ /*
+ * Trigger ops.update_idle() only when transitioning from a task to
+ * the idle thread and vice versa.
+ *
+ * Idle transitions are indicated by do_notify being set to true,
+ * managed by put_prev_task_idle()/set_next_task_idle().
+ *
+ * This must come after builtin idle update so that BPF schedulers can
+ * create interlocking between ops.update_idle() and ops.enqueue() -
+ * either enqueue() sees the idle bit or update_idle() sees the task
+ * that enqueue() queued.
+ */
+ if (SCX_HAS_OP(sch, update_idle) && do_notify && !scx_rq_bypassing(rq))
+ SCX_CALL_OP(sch, SCX_KF_REST, update_idle, rq, cpu_of(rq), idle);
+}
+
+static void reset_idle_masks(struct sched_ext_ops *ops)
+{
+ int node;
+
+ /*
+ * Consider all online cpus idle. Should converge to the actual state
+ * quickly.
+ */
+ if (!(ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE)) {
+ cpumask_copy(idle_cpumask(NUMA_NO_NODE)->cpu, cpu_online_mask);
+ cpumask_copy(idle_cpumask(NUMA_NO_NODE)->smt, cpu_online_mask);
+ return;
+ }
+
+ for_each_node(node) {
+ const struct cpumask *node_mask = cpumask_of_node(node);
+
+ cpumask_and(idle_cpumask(node)->cpu, cpu_online_mask, node_mask);
+ cpumask_and(idle_cpumask(node)->smt, cpu_online_mask, node_mask);
+ }
+}
+
+void scx_idle_enable(struct sched_ext_ops *ops)
+{
+ if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE))
+ static_branch_enable_cpuslocked(&scx_builtin_idle_enabled);
+ else
+ static_branch_disable_cpuslocked(&scx_builtin_idle_enabled);
+
+ if (ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE)
+ static_branch_enable_cpuslocked(&scx_builtin_idle_per_node);
+ else
+ static_branch_disable_cpuslocked(&scx_builtin_idle_per_node);
+
+ reset_idle_masks(ops);
+}
+
+void scx_idle_disable(void)
+{
+ static_branch_disable(&scx_builtin_idle_enabled);
+ static_branch_disable(&scx_builtin_idle_per_node);
+}
+
+/********************************************************************************
+ * Helpers that can be called from the BPF scheduler.
+ */
+
+static int validate_node(struct scx_sched *sch, int node)
+{
+ if (!static_branch_likely(&scx_builtin_idle_per_node)) {
+ scx_error(sch, "per-node idle tracking is disabled");
+ return -EOPNOTSUPP;
+ }
+
+ /* Return no entry for NUMA_NO_NODE (not a critical scx error) */
+ if (node == NUMA_NO_NODE)
+ return -ENOENT;
+
+ /* Make sure node is in a valid range */
+ if (node < 0 || node >= nr_node_ids) {
+ scx_error(sch, "invalid node %d", node);
+ return -EINVAL;
+ }
+
+ /* Make sure the node is part of the set of possible nodes */
+ if (!node_possible(node)) {
+ scx_error(sch, "unavailable node %d", node);
+ return -EINVAL;
+ }
+
+ return node;
+}
+
+__bpf_kfunc_start_defs();
+
+static bool check_builtin_idle_enabled(struct scx_sched *sch)
+{
+ if (static_branch_likely(&scx_builtin_idle_enabled))
+ return true;
+
+ scx_error(sch, "built-in idle tracking is disabled");
+ return false;
+}
+
+/*
+ * Determine whether @p is a migration-disabled task in the context of BPF
+ * code.
+ *
+ * We can't simply check whether @p->migration_disabled is set in a
+ * sched_ext callback, because migration is always disabled for the current
+ * task while running BPF code.
+ *
+ * The prolog (__bpf_prog_enter) and epilog (__bpf_prog_exit) respectively
+ * disable and re-enable migration. For this reason, the current task
+ * inside a sched_ext callback is always a migration-disabled task.
+ *
+ * Therefore, when @p->migration_disabled == 1, check whether @p is the
+ * current task or not: if it is, then migration was not disabled before
+ * entering the callback, otherwise migration was disabled.
+ *
+ * Returns true if @p is migration-disabled, false otherwise.
+ */
+static bool is_bpf_migration_disabled(const struct task_struct *p)
+{
+ if (p->migration_disabled == 1)
+ return p != current;
+ else
+ return p->migration_disabled;
+}
+
+static s32 select_cpu_from_kfunc(struct scx_sched *sch, struct task_struct *p,
+ s32 prev_cpu, u64 wake_flags,
+ const struct cpumask *allowed, u64 flags)
+{
+ struct rq *rq;
+ struct rq_flags rf;
+ s32 cpu;
+
+ if (!ops_cpu_valid(sch, prev_cpu, NULL))
+ return -EINVAL;
+
+ if (!check_builtin_idle_enabled(sch))
+ return -EBUSY;
+
+ /*
+ * If called from an unlocked context, acquire the task's rq lock,
+ * so that we can safely access p->cpus_ptr and p->nr_cpus_allowed.
+ *
+ * Otherwise, allow to use this kfunc only from ops.select_cpu()
+ * and ops.select_enqueue().
+ */
+ if (scx_kf_allowed_if_unlocked()) {
+ rq = task_rq_lock(p, &rf);
+ } else {
+ if (!scx_kf_allowed(sch, SCX_KF_SELECT_CPU | SCX_KF_ENQUEUE))
+ return -EPERM;
+ rq = scx_locked_rq();
+ }
+
+ /*
+ * Validate locking correctness to access p->cpus_ptr and
+ * p->nr_cpus_allowed: if we're holding an rq lock, we're safe;
+ * otherwise, assert that p->pi_lock is held.
+ */
+ if (!rq)
+ lockdep_assert_held(&p->pi_lock);
+
+ /*
+ * This may also be called from ops.enqueue(), so we need to handle
+ * per-CPU tasks as well. For these tasks, we can skip all idle CPU
+ * selection optimizations and simply check whether the previously
+ * used CPU is idle and within the allowed cpumask.
+ */
+ if (p->nr_cpus_allowed == 1 || is_bpf_migration_disabled(p)) {
+ if (cpumask_test_cpu(prev_cpu, allowed ?: p->cpus_ptr) &&
+ scx_idle_test_and_clear_cpu(prev_cpu))
+ cpu = prev_cpu;
+ else
+ cpu = -EBUSY;
+ } else {
+ cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags,
+ allowed ?: p->cpus_ptr, flags);
+ }
+
+ if (scx_kf_allowed_if_unlocked())
+ task_rq_unlock(rq, p, &rf);
+
+ return cpu;
+}
+
+/**
+ * scx_bpf_cpu_node - Return the NUMA node the given @cpu belongs to, or
+ * trigger an error if @cpu is invalid
+ * @cpu: target CPU
+ */
+__bpf_kfunc int scx_bpf_cpu_node(s32 cpu)
+{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch) || !ops_cpu_valid(sch, cpu, NULL))
+ return NUMA_NO_NODE;
+ return cpu_to_node(cpu);
+}
+
+/**
+ * scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu()
+ * @p: task_struct to select a CPU for
+ * @prev_cpu: CPU @p was on previously
+ * @wake_flags: %SCX_WAKE_* flags
+ * @is_idle: out parameter indicating whether the returned CPU is idle
+ *
+ * Can be called from ops.select_cpu(), ops.enqueue(), or from an unlocked
+ * context such as a BPF test_run() call, as long as built-in CPU selection
+ * is enabled: ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE
+ * is set.
+ *
+ * Returns the picked CPU with *@is_idle indicating whether the picked CPU is
+ * currently idle and thus a good candidate for direct dispatching.
+ */
+__bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
+ u64 wake_flags, bool *is_idle)
+{
+ struct scx_sched *sch;
+ s32 cpu;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return -ENODEV;
+
+ cpu = select_cpu_from_kfunc(sch, p, prev_cpu, wake_flags, NULL, 0);
+ if (cpu >= 0) {
+ *is_idle = true;
+ return cpu;
+ }
+ *is_idle = false;
+ return prev_cpu;
+}
+
+struct scx_bpf_select_cpu_and_args {
+ /* @p and @cpus_allowed can't be packed together as KF_RCU is not transitive */
+ s32 prev_cpu;
+ u64 wake_flags;
+ u64 flags;
+};
+
+/**
+ * __scx_bpf_select_cpu_and - Arg-wrapped CPU selection with cpumask
+ * @p: task_struct to select a CPU for
+ * @cpus_allowed: cpumask of allowed CPUs
+ * @args: struct containing the rest of the arguments
+ * @args->prev_cpu: CPU @p was on previously
+ * @args->wake_flags: %SCX_WAKE_* flags
+ * @args->flags: %SCX_PICK_IDLE* flags
+ *
+ * Wrapper kfunc that takes arguments via struct to work around BPF's 5 argument
+ * limit. BPF programs should use scx_bpf_select_cpu_and() which is provided
+ * as an inline wrapper in common.bpf.h.
+ *
+ * Can be called from ops.select_cpu(), ops.enqueue(), or from an unlocked
+ * context such as a BPF test_run() call, as long as built-in CPU selection
+ * is enabled: ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE
+ * is set.
+ *
+ * @p, @args->prev_cpu and @args->wake_flags match ops.select_cpu().
+ *
+ * Returns the selected idle CPU, which will be automatically awakened upon
+ * returning from ops.select_cpu() and can be used for direct dispatch, or
+ * a negative value if no idle CPU is available.
+ */
+__bpf_kfunc s32
+__scx_bpf_select_cpu_and(struct task_struct *p, const struct cpumask *cpus_allowed,
+ struct scx_bpf_select_cpu_and_args *args)
+{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return -ENODEV;
+
+ return select_cpu_from_kfunc(sch, p, args->prev_cpu, args->wake_flags,
+ cpus_allowed, args->flags);
+}
+
+/*
+ * COMPAT: Will be removed in v6.22.
+ */
+__bpf_kfunc s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
+ const struct cpumask *cpus_allowed, u64 flags)
+{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return -ENODEV;
+
+ return select_cpu_from_kfunc(sch, p, prev_cpu, wake_flags,
+ cpus_allowed, flags);
+}
+
+/**
+ * scx_bpf_get_idle_cpumask_node - Get a referenced kptr to the
+ * idle-tracking per-CPU cpumask of a target NUMA node.
+ * @node: target NUMA node
+ *
+ * Returns an empty cpumask if idle tracking is not enabled, if @node is
+ * not valid, or running on a UP kernel. In this case the actual error will
+ * be reported to the BPF scheduler via scx_error().
+ */
+__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node)
+{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return cpu_none_mask;
+
+ node = validate_node(sch, node);
+ if (node < 0)
+ return cpu_none_mask;
+
+ return idle_cpumask(node)->cpu;
+}
+
+/**
+ * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking
+ * per-CPU cpumask.
+ *
+ * Returns an empty mask if idle tracking is not enabled, or running on a
+ * UP kernel.
+ */
+__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void)
+{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return cpu_none_mask;
+
+ if (static_branch_unlikely(&scx_builtin_idle_per_node)) {
+ scx_error(sch, "SCX_OPS_BUILTIN_IDLE_PER_NODE enabled");
+ return cpu_none_mask;
+ }
+
+ if (!check_builtin_idle_enabled(sch))
+ return cpu_none_mask;
+
+ return idle_cpumask(NUMA_NO_NODE)->cpu;
+}
+
+/**
+ * scx_bpf_get_idle_smtmask_node - Get a referenced kptr to the
+ * idle-tracking, per-physical-core cpumask of a target NUMA node. Can be
+ * used to determine if an entire physical core is free.
+ * @node: target NUMA node
+ *
+ * Returns an empty cpumask if idle tracking is not enabled, if @node is
+ * not valid, or running on a UP kernel. In this case the actual error will
+ * be reported to the BPF scheduler via scx_error().
+ */
+__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node)
+{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return cpu_none_mask;
+
+ node = validate_node(sch, node);
+ if (node < 0)
+ return cpu_none_mask;
+
+ if (sched_smt_active())
+ return idle_cpumask(node)->smt;
+ else
+ return idle_cpumask(node)->cpu;
+}
+
+/**
+ * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking,
+ * per-physical-core cpumask. Can be used to determine if an entire physical
+ * core is free.
+ *
+ * Returns an empty mask if idle tracking is not enabled, or running on a
+ * UP kernel.
+ */
+__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void)
+{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return cpu_none_mask;
+
+ if (static_branch_unlikely(&scx_builtin_idle_per_node)) {
+ scx_error(sch, "SCX_OPS_BUILTIN_IDLE_PER_NODE enabled");
+ return cpu_none_mask;
+ }
+
+ if (!check_builtin_idle_enabled(sch))
+ return cpu_none_mask;
+
+ if (sched_smt_active())
+ return idle_cpumask(NUMA_NO_NODE)->smt;
+ else
+ return idle_cpumask(NUMA_NO_NODE)->cpu;
+}
+
+/**
+ * scx_bpf_put_idle_cpumask - Release a previously acquired referenced kptr to
+ * either the percpu, or SMT idle-tracking cpumask.
+ * @idle_mask: &cpumask to use
+ */
+__bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask)
+{
+ /*
+ * Empty function body because we aren't actually acquiring or releasing
+ * a reference to a global idle cpumask, which is read-only in the
+ * caller and is never released. The acquire / release semantics here
+ * are just used to make the cpumask a trusted pointer in the caller.
+ */
+}
+
+/**
+ * scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state
+ * @cpu: cpu to test and clear idle for
+ *
+ * Returns %true if @cpu was idle and its idle state was successfully cleared.
+ * %false otherwise.
+ *
+ * Unavailable if ops.update_idle() is implemented and
+ * %SCX_OPS_KEEP_BUILTIN_IDLE is not set.
+ */
+__bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu)
+{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return false;
+
+ if (!check_builtin_idle_enabled(sch))
+ return false;
+
+ if (!ops_cpu_valid(sch, cpu, NULL))
+ return false;
+
+ return scx_idle_test_and_clear_cpu(cpu);
+}
+
+/**
+ * scx_bpf_pick_idle_cpu_node - Pick and claim an idle cpu from @node
+ * @cpus_allowed: Allowed cpumask
+ * @node: target NUMA node
+ * @flags: %SCX_PICK_IDLE_* flags
+ *
+ * Pick and claim an idle cpu in @cpus_allowed from the NUMA node @node.
+ *
+ * Returns the picked idle cpu number on success, or -%EBUSY if no matching
+ * cpu was found.
+ *
+ * The search starts from @node and proceeds to other online NUMA nodes in
+ * order of increasing distance (unless SCX_PICK_IDLE_IN_NODE is specified,
+ * in which case the search is limited to the target @node).
+ *
+ * Always returns an error if ops.update_idle() is implemented and
+ * %SCX_OPS_KEEP_BUILTIN_IDLE is not set, or if
+ * %SCX_OPS_BUILTIN_IDLE_PER_NODE is not set.
+ */
+__bpf_kfunc s32 scx_bpf_pick_idle_cpu_node(const struct cpumask *cpus_allowed,
+ int node, u64 flags)
+{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return -ENODEV;
+
+ node = validate_node(sch, node);
+ if (node < 0)
+ return node;
+
+ return scx_pick_idle_cpu(cpus_allowed, node, flags);
+}
+
+/**
+ * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu
+ * @cpus_allowed: Allowed cpumask
+ * @flags: %SCX_PICK_IDLE_CPU_* flags
+ *
+ * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu
+ * number on success. -%EBUSY if no matching cpu was found.
+ *
+ * Idle CPU tracking may race against CPU scheduling state transitions. For
+ * example, this function may return -%EBUSY as CPUs are transitioning into the
+ * idle state. If the caller then assumes that there will be dispatch events on
+ * the CPUs as they were all busy, the scheduler may end up stalling with CPUs
+ * idling while there are pending tasks. Use scx_bpf_pick_any_cpu() and
+ * scx_bpf_kick_cpu() to guarantee that there will be at least one dispatch
+ * event in the near future.
+ *
+ * Unavailable if ops.update_idle() is implemented and
+ * %SCX_OPS_KEEP_BUILTIN_IDLE is not set.
+ *
+ * Always returns an error if %SCX_OPS_BUILTIN_IDLE_PER_NODE is set, use
+ * scx_bpf_pick_idle_cpu_node() instead.
+ */
+__bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed,
+ u64 flags)
+{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return -ENODEV;
+
+ if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) {
+ scx_error(sch, "per-node idle tracking is enabled");
+ return -EBUSY;
+ }
+
+ if (!check_builtin_idle_enabled(sch))
+ return -EBUSY;
+
+ return scx_pick_idle_cpu(cpus_allowed, NUMA_NO_NODE, flags);
+}
+
+/**
+ * scx_bpf_pick_any_cpu_node - Pick and claim an idle cpu if available
+ * or pick any CPU from @node
+ * @cpus_allowed: Allowed cpumask
+ * @node: target NUMA node
+ * @flags: %SCX_PICK_IDLE_CPU_* flags
+ *
+ * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any
+ * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu
+ * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is
+ * empty.
+ *
+ * The search starts from @node and proceeds to other online NUMA nodes in
+ * order of increasing distance (unless %SCX_PICK_IDLE_IN_NODE is specified,
+ * in which case the search is limited to the target @node, regardless of
+ * the CPU idle state).
+ *
+ * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not
+ * set, this function can't tell which CPUs are idle and will always pick any
+ * CPU.
+ */
+__bpf_kfunc s32 scx_bpf_pick_any_cpu_node(const struct cpumask *cpus_allowed,
+ int node, u64 flags)
+{
+ struct scx_sched *sch;
+ s32 cpu;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return -ENODEV;
+
+ node = validate_node(sch, node);
+ if (node < 0)
+ return node;
+
+ cpu = scx_pick_idle_cpu(cpus_allowed, node, flags);
+ if (cpu >= 0)
+ return cpu;
+
+ if (flags & SCX_PICK_IDLE_IN_NODE)
+ cpu = cpumask_any_and_distribute(cpumask_of_node(node), cpus_allowed);
+ else
+ cpu = cpumask_any_distribute(cpus_allowed);
+ if (cpu < nr_cpu_ids)
+ return cpu;
+ else
+ return -EBUSY;
+}
+
+/**
+ * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU
+ * @cpus_allowed: Allowed cpumask
+ * @flags: %SCX_PICK_IDLE_CPU_* flags
+ *
+ * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any
+ * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu
+ * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is
+ * empty.
+ *
+ * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not
+ * set, this function can't tell which CPUs are idle and will always pick any
+ * CPU.
+ *
+ * Always returns an error if %SCX_OPS_BUILTIN_IDLE_PER_NODE is set, use
+ * scx_bpf_pick_any_cpu_node() instead.
+ */
+__bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed,
+ u64 flags)
+{
+ struct scx_sched *sch;
+ s32 cpu;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return -ENODEV;
+
+ if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) {
+ scx_error(sch, "per-node idle tracking is enabled");
+ return -EBUSY;
+ }
+
+ if (static_branch_likely(&scx_builtin_idle_enabled)) {
+ cpu = scx_pick_idle_cpu(cpus_allowed, NUMA_NO_NODE, flags);
+ if (cpu >= 0)
+ return cpu;
+ }
+
+ cpu = cpumask_any_distribute(cpus_allowed);
+ if (cpu < nr_cpu_ids)
+ return cpu;
+ else
+ return -EBUSY;
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(scx_kfunc_ids_idle)
+BTF_ID_FLAGS(func, scx_bpf_cpu_node)
+BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask_node, KF_ACQUIRE)
+BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_ACQUIRE)
+BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask_node, KF_ACQUIRE)
+BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_ACQUIRE)
+BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE)
+BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle)
+BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu_node, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu_node, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU)
+BTF_ID_FLAGS(func, __scx_bpf_select_cpu_and, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_select_cpu_and, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU)
+BTF_KFUNCS_END(scx_kfunc_ids_idle)
+
+static const struct btf_kfunc_id_set scx_kfunc_set_idle = {
+ .owner = THIS_MODULE,
+ .set = &scx_kfunc_ids_idle,
+};
+
+int scx_idle_init(void)
+{
+ int ret;
+
+ ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_idle) ||
+ register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &scx_kfunc_set_idle) ||
+ register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &scx_kfunc_set_idle);
+
+ return ret;
+}
diff --git a/kernel/sched/ext_idle.h b/kernel/sched/ext_idle.h
new file mode 100644
index 000000000000..fa583f141f35
--- /dev/null
+++ b/kernel/sched/ext_idle.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
+ *
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
+ * Copyright (c) 2022 David Vernet <dvernet@meta.com>
+ * Copyright (c) 2024 Andrea Righi <arighi@nvidia.com>
+ */
+#ifndef _KERNEL_SCHED_EXT_IDLE_H
+#define _KERNEL_SCHED_EXT_IDLE_H
+
+struct sched_ext_ops;
+
+void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops);
+void scx_idle_init_masks(void);
+
+s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
+ const struct cpumask *cpus_allowed, u64 flags);
+void scx_idle_enable(struct sched_ext_ops *ops);
+void scx_idle_disable(void);
+int scx_idle_init(void);
+
+#endif /* _KERNEL_SCHED_EXT_IDLE_H */
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
new file mode 100644
index 000000000000..386c677e4c9a
--- /dev/null
+++ b/kernel/sched/ext_internal.h
@@ -0,0 +1,1101 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
+ *
+ * Copyright (c) 2025 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2025 Tejun Heo <tj@kernel.org>
+ */
+#define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void)))
+
+enum scx_consts {
+ SCX_DSP_DFL_MAX_BATCH = 32,
+ SCX_DSP_MAX_LOOPS = 32,
+ SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ,
+
+ SCX_EXIT_BT_LEN = 64,
+ SCX_EXIT_MSG_LEN = 1024,
+ SCX_EXIT_DUMP_DFL_LEN = 32768,
+
+ SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE,
+
+ /*
+ * Iterating all tasks may take a while. Periodically drop
+ * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls.
+ */
+ SCX_TASK_ITER_BATCH = 32,
+
+ SCX_BYPASS_LB_DFL_INTV_US = 500 * USEC_PER_MSEC,
+ SCX_BYPASS_LB_DONOR_PCT = 125,
+ SCX_BYPASS_LB_MIN_DELTA_DIV = 4,
+ SCX_BYPASS_LB_BATCH = 256,
+};
+
+enum scx_exit_kind {
+ SCX_EXIT_NONE,
+ SCX_EXIT_DONE,
+
+ SCX_EXIT_UNREG = 64, /* user-space initiated unregistration */
+ SCX_EXIT_UNREG_BPF, /* BPF-initiated unregistration */
+ SCX_EXIT_UNREG_KERN, /* kernel-initiated unregistration */
+ SCX_EXIT_SYSRQ, /* requested by 'S' sysrq */
+
+ SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */
+ SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */
+ SCX_EXIT_ERROR_STALL, /* watchdog detected stalled runnable tasks */
+};
+
+/*
+ * An exit code can be specified when exiting with scx_bpf_exit() or scx_exit(),
+ * corresponding to exit_kind UNREG_BPF and UNREG_KERN respectively. The codes
+ * are 64bit of the format:
+ *
+ * Bits: [63 .. 48 47 .. 32 31 .. 0]
+ * [ SYS ACT ] [ SYS RSN ] [ USR ]
+ *
+ * SYS ACT: System-defined exit actions
+ * SYS RSN: System-defined exit reasons
+ * USR : User-defined exit codes and reasons
+ *
+ * Using the above, users may communicate intention and context by ORing system
+ * actions and/or system reasons with a user-defined exit code.
+ */
+enum scx_exit_code {
+ /* Reasons */
+ SCX_ECODE_RSN_HOTPLUG = 1LLU << 32,
+
+ /* Actions */
+ SCX_ECODE_ACT_RESTART = 1LLU << 48,
+};
+
+enum scx_exit_flags {
+ /*
+ * ops.exit() may be called even if the loading failed before ops.init()
+ * finishes successfully. This is because ops.exit() allows rich exit
+ * info communication. The following flag indicates whether ops.init()
+ * finished successfully.
+ */
+ SCX_EFLAG_INITIALIZED,
+};
+
+/*
+ * scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is
+ * being disabled.
+ */
+struct scx_exit_info {
+ /* %SCX_EXIT_* - broad category of the exit reason */
+ enum scx_exit_kind kind;
+
+ /* exit code if gracefully exiting */
+ s64 exit_code;
+
+ /* %SCX_EFLAG_* */
+ u64 flags;
+
+ /* textual representation of the above */
+ const char *reason;
+
+ /* backtrace if exiting due to an error */
+ unsigned long *bt;
+ u32 bt_len;
+
+ /* informational message */
+ char *msg;
+
+ /* debug dump */
+ char *dump;
+};
+
+/* sched_ext_ops.flags */
+enum scx_ops_flags {
+ /*
+ * Keep built-in idle tracking even if ops.update_idle() is implemented.
+ */
+ SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0,
+
+ /*
+ * By default, if there are no other task to run on the CPU, ext core
+ * keeps running the current task even after its slice expires. If this
+ * flag is specified, such tasks are passed to ops.enqueue() with
+ * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info.
+ */
+ SCX_OPS_ENQ_LAST = 1LLU << 1,
+
+ /*
+ * An exiting task may schedule after PF_EXITING is set. In such cases,
+ * bpf_task_from_pid() may not be able to find the task and if the BPF
+ * scheduler depends on pid lookup for dispatching, the task will be
+ * lost leading to various issues including RCU grace period stalls.
+ *
+ * To mask this problem, by default, unhashed tasks are automatically
+ * dispatched to the local DSQ on enqueue. If the BPF scheduler doesn't
+ * depend on pid lookups and wants to handle these tasks directly, the
+ * following flag can be used.
+ */
+ SCX_OPS_ENQ_EXITING = 1LLU << 2,
+
+ /*
+ * If set, only tasks with policy set to SCHED_EXT are attached to
+ * sched_ext. If clear, SCHED_NORMAL tasks are also included.
+ */
+ SCX_OPS_SWITCH_PARTIAL = 1LLU << 3,
+
+ /*
+ * A migration disabled task can only execute on its current CPU. By
+ * default, such tasks are automatically put on the CPU's local DSQ with
+ * the default slice on enqueue. If this ops flag is set, they also go
+ * through ops.enqueue().
+ *
+ * A migration disabled task never invokes ops.select_cpu() as it can
+ * only select the current CPU. Also, p->cpus_ptr will only contain its
+ * current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr
+ * and thus may disagree with cpumask_weight(p->cpus_ptr).
+ */
+ SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4,
+
+ /*
+ * Queued wakeup (ttwu_queue) is a wakeup optimization that invokes
+ * ops.enqueue() on the ops.select_cpu() selected or the wakee's
+ * previous CPU via IPI (inter-processor interrupt) to reduce cacheline
+ * transfers. When this optimization is enabled, ops.select_cpu() is
+ * skipped in some cases (when racing against the wakee switching out).
+ * As the BPF scheduler may depend on ops.select_cpu() being invoked
+ * during wakeups, queued wakeup is disabled by default.
+ *
+ * If this ops flag is set, queued wakeup optimization is enabled and
+ * the BPF scheduler must be able to handle ops.enqueue() invoked on the
+ * wakee's CPU without preceding ops.select_cpu() even for tasks which
+ * may be executed on multiple CPUs.
+ */
+ SCX_OPS_ALLOW_QUEUED_WAKEUP = 1LLU << 5,
+
+ /*
+ * If set, enable per-node idle cpumasks. If clear, use a single global
+ * flat idle cpumask.
+ */
+ SCX_OPS_BUILTIN_IDLE_PER_NODE = 1LLU << 6,
+
+ /*
+ * CPU cgroup support flags
+ */
+ SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* DEPRECATED, will be removed on 6.18 */
+
+ SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE |
+ SCX_OPS_ENQ_LAST |
+ SCX_OPS_ENQ_EXITING |
+ SCX_OPS_ENQ_MIGRATION_DISABLED |
+ SCX_OPS_ALLOW_QUEUED_WAKEUP |
+ SCX_OPS_SWITCH_PARTIAL |
+ SCX_OPS_BUILTIN_IDLE_PER_NODE |
+ SCX_OPS_HAS_CGROUP_WEIGHT,
+
+ /* high 8 bits are internal, don't include in SCX_OPS_ALL_FLAGS */
+ __SCX_OPS_INTERNAL_MASK = 0xffLLU << 56,
+
+ SCX_OPS_HAS_CPU_PREEMPT = 1LLU << 56,
+};
+
+/* argument container for ops.init_task() */
+struct scx_init_task_args {
+ /*
+ * Set if ops.init_task() is being invoked on the fork path, as opposed
+ * to the scheduler transition path.
+ */
+ bool fork;
+#ifdef CONFIG_EXT_GROUP_SCHED
+ /* the cgroup the task is joining */
+ struct cgroup *cgroup;
+#endif
+};
+
+/* argument container for ops.exit_task() */
+struct scx_exit_task_args {
+ /* Whether the task exited before running on sched_ext. */
+ bool cancelled;
+};
+
+/* argument container for ops->cgroup_init() */
+struct scx_cgroup_init_args {
+ /* the weight of the cgroup [1..10000] */
+ u32 weight;
+
+ /* bandwidth control parameters from cpu.max and cpu.max.burst */
+ u64 bw_period_us;
+ u64 bw_quota_us;
+ u64 bw_burst_us;
+};
+
+enum scx_cpu_preempt_reason {
+ /* next task is being scheduled by &sched_class_rt */
+ SCX_CPU_PREEMPT_RT,
+ /* next task is being scheduled by &sched_class_dl */
+ SCX_CPU_PREEMPT_DL,
+ /* next task is being scheduled by &sched_class_stop */
+ SCX_CPU_PREEMPT_STOP,
+ /* unknown reason for SCX being preempted */
+ SCX_CPU_PREEMPT_UNKNOWN,
+};
+
+/*
+ * Argument container for ops->cpu_acquire(). Currently empty, but may be
+ * expanded in the future.
+ */
+struct scx_cpu_acquire_args {};
+
+/* argument container for ops->cpu_release() */
+struct scx_cpu_release_args {
+ /* the reason the CPU was preempted */
+ enum scx_cpu_preempt_reason reason;
+
+ /* the task that's going to be scheduled on the CPU */
+ struct task_struct *task;
+};
+
+/*
+ * Informational context provided to dump operations.
+ */
+struct scx_dump_ctx {
+ enum scx_exit_kind kind;
+ s64 exit_code;
+ const char *reason;
+ u64 at_ns;
+ u64 at_jiffies;
+};
+
+/**
+ * struct sched_ext_ops - Operation table for BPF scheduler implementation
+ *
+ * A BPF scheduler can implement an arbitrary scheduling policy by
+ * implementing and loading operations in this table. Note that a userland
+ * scheduling policy can also be implemented using the BPF scheduler
+ * as a shim layer.
+ */
+struct sched_ext_ops {
+ /**
+ * @select_cpu: Pick the target CPU for a task which is being woken up
+ * @p: task being woken up
+ * @prev_cpu: the cpu @p was on before sleeping
+ * @wake_flags: SCX_WAKE_*
+ *
+ * Decision made here isn't final. @p may be moved to any CPU while it
+ * is getting dispatched for execution later. However, as @p is not on
+ * the rq at this point, getting the eventual execution CPU right here
+ * saves a small bit of overhead down the line.
+ *
+ * If an idle CPU is returned, the CPU is kicked and will try to
+ * dispatch. While an explicit custom mechanism can be added,
+ * select_cpu() serves as the default way to wake up idle CPUs.
+ *
+ * @p may be inserted into a DSQ directly by calling
+ * scx_bpf_dsq_insert(). If so, the ops.enqueue() will be skipped.
+ * Directly inserting into %SCX_DSQ_LOCAL will put @p in the local DSQ
+ * of the CPU returned by this operation.
+ *
+ * Note that select_cpu() is never called for tasks that can only run
+ * on a single CPU or tasks with migration disabled, as they don't have
+ * the option to select a different CPU. See select_task_rq() for
+ * details.
+ */
+ s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags);
+
+ /**
+ * @enqueue: Enqueue a task on the BPF scheduler
+ * @p: task being enqueued
+ * @enq_flags: %SCX_ENQ_*
+ *
+ * @p is ready to run. Insert directly into a DSQ by calling
+ * scx_bpf_dsq_insert() or enqueue on the BPF scheduler. If not directly
+ * inserted, the bpf scheduler owns @p and if it fails to dispatch @p,
+ * the task will stall.
+ *
+ * If @p was inserted into a DSQ from ops.select_cpu(), this callback is
+ * skipped.
+ */
+ void (*enqueue)(struct task_struct *p, u64 enq_flags);
+
+ /**
+ * @dequeue: Remove a task from the BPF scheduler
+ * @p: task being dequeued
+ * @deq_flags: %SCX_DEQ_*
+ *
+ * Remove @p from the BPF scheduler. This is usually called to isolate
+ * the task while updating its scheduling properties (e.g. priority).
+ *
+ * The ext core keeps track of whether the BPF side owns a given task or
+ * not and can gracefully ignore spurious dispatches from BPF side,
+ * which makes it safe to not implement this method. However, depending
+ * on the scheduling logic, this can lead to confusing behaviors - e.g.
+ * scheduling position not being updated across a priority change.
+ */
+ void (*dequeue)(struct task_struct *p, u64 deq_flags);
+
+ /**
+ * @dispatch: Dispatch tasks from the BPF scheduler and/or user DSQs
+ * @cpu: CPU to dispatch tasks for
+ * @prev: previous task being switched out
+ *
+ * Called when a CPU's local dsq is empty. The operation should dispatch
+ * one or more tasks from the BPF scheduler into the DSQs using
+ * scx_bpf_dsq_insert() and/or move from user DSQs into the local DSQ
+ * using scx_bpf_dsq_move_to_local().
+ *
+ * The maximum number of times scx_bpf_dsq_insert() can be called
+ * without an intervening scx_bpf_dsq_move_to_local() is specified by
+ * ops.dispatch_max_batch. See the comments on top of the two functions
+ * for more details.
+ *
+ * When not %NULL, @prev is an SCX task with its slice depleted. If
+ * @prev is still runnable as indicated by set %SCX_TASK_QUEUED in
+ * @prev->scx.flags, it is not enqueued yet and will be enqueued after
+ * ops.dispatch() returns. To keep executing @prev, return without
+ * dispatching or moving any tasks. Also see %SCX_OPS_ENQ_LAST.
+ */
+ void (*dispatch)(s32 cpu, struct task_struct *prev);
+
+ /**
+ * @tick: Periodic tick
+ * @p: task running currently
+ *
+ * This operation is called every 1/HZ seconds on CPUs which are
+ * executing an SCX task. Setting @p->scx.slice to 0 will trigger an
+ * immediate dispatch cycle on the CPU.
+ */
+ void (*tick)(struct task_struct *p);
+
+ /**
+ * @runnable: A task is becoming runnable on its associated CPU
+ * @p: task becoming runnable
+ * @enq_flags: %SCX_ENQ_*
+ *
+ * This and the following three functions can be used to track a task's
+ * execution state transitions. A task becomes ->runnable() on a CPU,
+ * and then goes through one or more ->running() and ->stopping() pairs
+ * as it runs on the CPU, and eventually becomes ->quiescent() when it's
+ * done running on the CPU.
+ *
+ * @p is becoming runnable on the CPU because it's
+ *
+ * - waking up (%SCX_ENQ_WAKEUP)
+ * - being moved from another CPU
+ * - being restored after temporarily taken off the queue for an
+ * attribute change.
+ *
+ * This and ->enqueue() are related but not coupled. This operation
+ * notifies @p's state transition and may not be followed by ->enqueue()
+ * e.g. when @p is being dispatched to a remote CPU, or when @p is
+ * being enqueued on a CPU experiencing a hotplug event. Likewise, a
+ * task may be ->enqueue()'d without being preceded by this operation
+ * e.g. after exhausting its slice.
+ */
+ void (*runnable)(struct task_struct *p, u64 enq_flags);
+
+ /**
+ * @running: A task is starting to run on its associated CPU
+ * @p: task starting to run
+ *
+ * Note that this callback may be called from a CPU other than the
+ * one the task is going to run on. This can happen when a task
+ * property is changed (i.e., affinity), since scx_next_task_scx(),
+ * which triggers this callback, may run on a CPU different from
+ * the task's assigned CPU.
+ *
+ * Therefore, always use scx_bpf_task_cpu(@p) to determine the
+ * target CPU the task is going to use.
+ *
+ * See ->runnable() for explanation on the task state notifiers.
+ */
+ void (*running)(struct task_struct *p);
+
+ /**
+ * @stopping: A task is stopping execution
+ * @p: task stopping to run
+ * @runnable: is task @p still runnable?
+ *
+ * Note that this callback may be called from a CPU other than the
+ * one the task was running on. This can happen when a task
+ * property is changed (i.e., affinity), since dequeue_task_scx(),
+ * which triggers this callback, may run on a CPU different from
+ * the task's assigned CPU.
+ *
+ * Therefore, always use scx_bpf_task_cpu(@p) to retrieve the CPU
+ * the task was running on.
+ *
+ * See ->runnable() for explanation on the task state notifiers. If
+ * !@runnable, ->quiescent() will be invoked after this operation
+ * returns.
+ */
+ void (*stopping)(struct task_struct *p, bool runnable);
+
+ /**
+ * @quiescent: A task is becoming not runnable on its associated CPU
+ * @p: task becoming not runnable
+ * @deq_flags: %SCX_DEQ_*
+ *
+ * See ->runnable() for explanation on the task state notifiers.
+ *
+ * @p is becoming quiescent on the CPU because it's
+ *
+ * - sleeping (%SCX_DEQ_SLEEP)
+ * - being moved to another CPU
+ * - being temporarily taken off the queue for an attribute change
+ * (%SCX_DEQ_SAVE)
+ *
+ * This and ->dequeue() are related but not coupled. This operation
+ * notifies @p's state transition and may not be preceded by ->dequeue()
+ * e.g. when @p is being dispatched to a remote CPU.
+ */
+ void (*quiescent)(struct task_struct *p, u64 deq_flags);
+
+ /**
+ * @yield: Yield CPU
+ * @from: yielding task
+ * @to: optional yield target task
+ *
+ * If @to is NULL, @from is yielding the CPU to other runnable tasks.
+ * The BPF scheduler should ensure that other available tasks are
+ * dispatched before the yielding task. Return value is ignored in this
+ * case.
+ *
+ * If @to is not-NULL, @from wants to yield the CPU to @to. If the bpf
+ * scheduler can implement the request, return %true; otherwise, %false.
+ */
+ bool (*yield)(struct task_struct *from, struct task_struct *to);
+
+ /**
+ * @core_sched_before: Task ordering for core-sched
+ * @a: task A
+ * @b: task B
+ *
+ * Used by core-sched to determine the ordering between two tasks. See
+ * Documentation/admin-guide/hw-vuln/core-scheduling.rst for details on
+ * core-sched.
+ *
+ * Both @a and @b are runnable and may or may not currently be queued on
+ * the BPF scheduler. Should return %true if @a should run before @b.
+ * %false if there's no required ordering or @b should run before @a.
+ *
+ * If not specified, the default is ordering them according to when they
+ * became runnable.
+ */
+ bool (*core_sched_before)(struct task_struct *a, struct task_struct *b);
+
+ /**
+ * @set_weight: Set task weight
+ * @p: task to set weight for
+ * @weight: new weight [1..10000]
+ *
+ * Update @p's weight to @weight.
+ */
+ void (*set_weight)(struct task_struct *p, u32 weight);
+
+ /**
+ * @set_cpumask: Set CPU affinity
+ * @p: task to set CPU affinity for
+ * @cpumask: cpumask of cpus that @p can run on
+ *
+ * Update @p's CPU affinity to @cpumask.
+ */
+ void (*set_cpumask)(struct task_struct *p,
+ const struct cpumask *cpumask);
+
+ /**
+ * @update_idle: Update the idle state of a CPU
+ * @cpu: CPU to update the idle state for
+ * @idle: whether entering or exiting the idle state
+ *
+ * This operation is called when @rq's CPU goes or leaves the idle
+ * state. By default, implementing this operation disables the built-in
+ * idle CPU tracking and the following helpers become unavailable:
+ *
+ * - scx_bpf_select_cpu_dfl()
+ * - scx_bpf_select_cpu_and()
+ * - scx_bpf_test_and_clear_cpu_idle()
+ * - scx_bpf_pick_idle_cpu()
+ *
+ * The user also must implement ops.select_cpu() as the default
+ * implementation relies on scx_bpf_select_cpu_dfl().
+ *
+ * Specify the %SCX_OPS_KEEP_BUILTIN_IDLE flag to keep the built-in idle
+ * tracking.
+ */
+ void (*update_idle)(s32 cpu, bool idle);
+
+ /**
+ * @cpu_acquire: A CPU is becoming available to the BPF scheduler
+ * @cpu: The CPU being acquired by the BPF scheduler.
+ * @args: Acquire arguments, see the struct definition.
+ *
+ * A CPU that was previously released from the BPF scheduler is now once
+ * again under its control.
+ */
+ void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args);
+
+ /**
+ * @cpu_release: A CPU is taken away from the BPF scheduler
+ * @cpu: The CPU being released by the BPF scheduler.
+ * @args: Release arguments, see the struct definition.
+ *
+ * The specified CPU is no longer under the control of the BPF
+ * scheduler. This could be because it was preempted by a higher
+ * priority sched_class, though there may be other reasons as well. The
+ * caller should consult @args->reason to determine the cause.
+ */
+ void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args);
+
+ /**
+ * @init_task: Initialize a task to run in a BPF scheduler
+ * @p: task to initialize for BPF scheduling
+ * @args: init arguments, see the struct definition
+ *
+ * Either we're loading a BPF scheduler or a new task is being forked.
+ * Initialize @p for BPF scheduling. This operation may block and can
+ * be used for allocations, and is called exactly once for a task.
+ *
+ * Return 0 for success, -errno for failure. An error return while
+ * loading will abort loading of the BPF scheduler. During a fork, it
+ * will abort that specific fork.
+ */
+ s32 (*init_task)(struct task_struct *p, struct scx_init_task_args *args);
+
+ /**
+ * @exit_task: Exit a previously-running task from the system
+ * @p: task to exit
+ * @args: exit arguments, see the struct definition
+ *
+ * @p is exiting or the BPF scheduler is being unloaded. Perform any
+ * necessary cleanup for @p.
+ */
+ void (*exit_task)(struct task_struct *p, struct scx_exit_task_args *args);
+
+ /**
+ * @enable: Enable BPF scheduling for a task
+ * @p: task to enable BPF scheduling for
+ *
+ * Enable @p for BPF scheduling. enable() is called on @p any time it
+ * enters SCX, and is always paired with a matching disable().
+ */
+ void (*enable)(struct task_struct *p);
+
+ /**
+ * @disable: Disable BPF scheduling for a task
+ * @p: task to disable BPF scheduling for
+ *
+ * @p is exiting, leaving SCX or the BPF scheduler is being unloaded.
+ * Disable BPF scheduling for @p. A disable() call is always matched
+ * with a prior enable() call.
+ */
+ void (*disable)(struct task_struct *p);
+
+ /**
+ * @dump: Dump BPF scheduler state on error
+ * @ctx: debug dump context
+ *
+ * Use scx_bpf_dump() to generate BPF scheduler specific debug dump.
+ */
+ void (*dump)(struct scx_dump_ctx *ctx);
+
+ /**
+ * @dump_cpu: Dump BPF scheduler state for a CPU on error
+ * @ctx: debug dump context
+ * @cpu: CPU to generate debug dump for
+ * @idle: @cpu is currently idle without any runnable tasks
+ *
+ * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for
+ * @cpu. If @idle is %true and this operation doesn't produce any
+ * output, @cpu is skipped for dump.
+ */
+ void (*dump_cpu)(struct scx_dump_ctx *ctx, s32 cpu, bool idle);
+
+ /**
+ * @dump_task: Dump BPF scheduler state for a runnable task on error
+ * @ctx: debug dump context
+ * @p: runnable task to generate debug dump for
+ *
+ * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for
+ * @p.
+ */
+ void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p);
+
+#ifdef CONFIG_EXT_GROUP_SCHED
+ /**
+ * @cgroup_init: Initialize a cgroup
+ * @cgrp: cgroup being initialized
+ * @args: init arguments, see the struct definition
+ *
+ * Either the BPF scheduler is being loaded or @cgrp created, initialize
+ * @cgrp for sched_ext. This operation may block.
+ *
+ * Return 0 for success, -errno for failure. An error return while
+ * loading will abort loading of the BPF scheduler. During cgroup
+ * creation, it will abort the specific cgroup creation.
+ */
+ s32 (*cgroup_init)(struct cgroup *cgrp,
+ struct scx_cgroup_init_args *args);
+
+ /**
+ * @cgroup_exit: Exit a cgroup
+ * @cgrp: cgroup being exited
+ *
+ * Either the BPF scheduler is being unloaded or @cgrp destroyed, exit
+ * @cgrp for sched_ext. This operation my block.
+ */
+ void (*cgroup_exit)(struct cgroup *cgrp);
+
+ /**
+ * @cgroup_prep_move: Prepare a task to be moved to a different cgroup
+ * @p: task being moved
+ * @from: cgroup @p is being moved from
+ * @to: cgroup @p is being moved to
+ *
+ * Prepare @p for move from cgroup @from to @to. This operation may
+ * block and can be used for allocations.
+ *
+ * Return 0 for success, -errno for failure. An error return aborts the
+ * migration.
+ */
+ s32 (*cgroup_prep_move)(struct task_struct *p,
+ struct cgroup *from, struct cgroup *to);
+
+ /**
+ * @cgroup_move: Commit cgroup move
+ * @p: task being moved
+ * @from: cgroup @p is being moved from
+ * @to: cgroup @p is being moved to
+ *
+ * Commit the move. @p is dequeued during this operation.
+ */
+ void (*cgroup_move)(struct task_struct *p,
+ struct cgroup *from, struct cgroup *to);
+
+ /**
+ * @cgroup_cancel_move: Cancel cgroup move
+ * @p: task whose cgroup move is being canceled
+ * @from: cgroup @p was being moved from
+ * @to: cgroup @p was being moved to
+ *
+ * @p was cgroup_prep_move()'d but failed before reaching cgroup_move().
+ * Undo the preparation.
+ */
+ void (*cgroup_cancel_move)(struct task_struct *p,
+ struct cgroup *from, struct cgroup *to);
+
+ /**
+ * @cgroup_set_weight: A cgroup's weight is being changed
+ * @cgrp: cgroup whose weight is being updated
+ * @weight: new weight [1..10000]
+ *
+ * Update @cgrp's weight to @weight.
+ */
+ void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight);
+
+ /**
+ * @cgroup_set_bandwidth: A cgroup's bandwidth is being changed
+ * @cgrp: cgroup whose bandwidth is being updated
+ * @period_us: bandwidth control period
+ * @quota_us: bandwidth control quota
+ * @burst_us: bandwidth control burst
+ *
+ * Update @cgrp's bandwidth control parameters. This is from the cpu.max
+ * cgroup interface.
+ *
+ * @quota_us / @period_us determines the CPU bandwidth @cgrp is entitled
+ * to. For example, if @period_us is 1_000_000 and @quota_us is
+ * 2_500_000. @cgrp is entitled to 2.5 CPUs. @burst_us can be
+ * interpreted in the same fashion and specifies how much @cgrp can
+ * burst temporarily. The specific control mechanism and thus the
+ * interpretation of @period_us and burstiness is up to the BPF
+ * scheduler.
+ */
+ void (*cgroup_set_bandwidth)(struct cgroup *cgrp,
+ u64 period_us, u64 quota_us, u64 burst_us);
+
+ /**
+ * @cgroup_set_idle: A cgroup's idle state is being changed
+ * @cgrp: cgroup whose idle state is being updated
+ * @idle: whether the cgroup is entering or exiting idle state
+ *
+ * Update @cgrp's idle state to @idle. This callback is invoked when
+ * a cgroup transitions between idle and non-idle states, allowing the
+ * BPF scheduler to adjust its behavior accordingly.
+ */
+ void (*cgroup_set_idle)(struct cgroup *cgrp, bool idle);
+
+#endif /* CONFIG_EXT_GROUP_SCHED */
+
+ /*
+ * All online ops must come before ops.cpu_online().
+ */
+
+ /**
+ * @cpu_online: A CPU became online
+ * @cpu: CPU which just came up
+ *
+ * @cpu just came online. @cpu will not call ops.enqueue() or
+ * ops.dispatch(), nor run tasks associated with other CPUs beforehand.
+ */
+ void (*cpu_online)(s32 cpu);
+
+ /**
+ * @cpu_offline: A CPU is going offline
+ * @cpu: CPU which is going offline
+ *
+ * @cpu is going offline. @cpu will not call ops.enqueue() or
+ * ops.dispatch(), nor run tasks associated with other CPUs afterwards.
+ */
+ void (*cpu_offline)(s32 cpu);
+
+ /*
+ * All CPU hotplug ops must come before ops.init().
+ */
+
+ /**
+ * @init: Initialize the BPF scheduler
+ */
+ s32 (*init)(void);
+
+ /**
+ * @exit: Clean up after the BPF scheduler
+ * @info: Exit info
+ *
+ * ops.exit() is also called on ops.init() failure, which is a bit
+ * unusual. This is to allow rich reporting through @info on how
+ * ops.init() failed.
+ */
+ void (*exit)(struct scx_exit_info *info);
+
+ /**
+ * @dispatch_max_batch: Max nr of tasks that dispatch() can dispatch
+ */
+ u32 dispatch_max_batch;
+
+ /**
+ * @flags: %SCX_OPS_* flags
+ */
+ u64 flags;
+
+ /**
+ * @timeout_ms: The maximum amount of time, in milliseconds, that a
+ * runnable task should be able to wait before being scheduled. The
+ * maximum timeout may not exceed the default timeout of 30 seconds.
+ *
+ * Defaults to the maximum allowed timeout value of 30 seconds.
+ */
+ u32 timeout_ms;
+
+ /**
+ * @exit_dump_len: scx_exit_info.dump buffer length. If 0, the default
+ * value of 32768 is used.
+ */
+ u32 exit_dump_len;
+
+ /**
+ * @hotplug_seq: A sequence number that may be set by the scheduler to
+ * detect when a hotplug event has occurred during the loading process.
+ * If 0, no detection occurs. Otherwise, the scheduler will fail to
+ * load if the sequence number does not match @scx_hotplug_seq on the
+ * enable path.
+ */
+ u64 hotplug_seq;
+
+ /**
+ * @name: BPF scheduler's name
+ *
+ * Must be a non-zero valid BPF object name including only isalnum(),
+ * '_' and '.' chars. Shows up in kernel.sched_ext_ops sysctl while the
+ * BPF scheduler is enabled.
+ */
+ char name[SCX_OPS_NAME_LEN];
+
+ /* internal use only, must be NULL */
+ void *priv;
+};
+
+enum scx_opi {
+ SCX_OPI_BEGIN = 0,
+ SCX_OPI_NORMAL_BEGIN = 0,
+ SCX_OPI_NORMAL_END = SCX_OP_IDX(cpu_online),
+ SCX_OPI_CPU_HOTPLUG_BEGIN = SCX_OP_IDX(cpu_online),
+ SCX_OPI_CPU_HOTPLUG_END = SCX_OP_IDX(init),
+ SCX_OPI_END = SCX_OP_IDX(init),
+};
+
+/*
+ * Collection of event counters. Event types are placed in descending order.
+ */
+struct scx_event_stats {
+ /*
+ * If ops.select_cpu() returns a CPU which can't be used by the task,
+ * the core scheduler code silently picks a fallback CPU.
+ */
+ s64 SCX_EV_SELECT_CPU_FALLBACK;
+
+ /*
+ * When dispatching to a local DSQ, the CPU may have gone offline in
+ * the meantime. In this case, the task is bounced to the global DSQ.
+ */
+ s64 SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE;
+
+ /*
+ * If SCX_OPS_ENQ_LAST is not set, the number of times that a task
+ * continued to run because there were no other tasks on the CPU.
+ */
+ s64 SCX_EV_DISPATCH_KEEP_LAST;
+
+ /*
+ * If SCX_OPS_ENQ_EXITING is not set, the number of times that a task
+ * is dispatched to a local DSQ when exiting.
+ */
+ s64 SCX_EV_ENQ_SKIP_EXITING;
+
+ /*
+ * If SCX_OPS_ENQ_MIGRATION_DISABLED is not set, the number of times a
+ * migration disabled task skips ops.enqueue() and is dispatched to its
+ * local DSQ.
+ */
+ s64 SCX_EV_ENQ_SKIP_MIGRATION_DISABLED;
+
+ /*
+ * Total number of times a task's time slice was refilled with the
+ * default value (SCX_SLICE_DFL).
+ */
+ s64 SCX_EV_REFILL_SLICE_DFL;
+
+ /*
+ * The total duration of bypass modes in nanoseconds.
+ */
+ s64 SCX_EV_BYPASS_DURATION;
+
+ /*
+ * The number of tasks dispatched in the bypassing mode.
+ */
+ s64 SCX_EV_BYPASS_DISPATCH;
+
+ /*
+ * The number of times the bypassing mode has been activated.
+ */
+ s64 SCX_EV_BYPASS_ACTIVATE;
+};
+
+struct scx_sched_pcpu {
+ /*
+ * The event counters are in a per-CPU variable to minimize the
+ * accounting overhead. A system-wide view on the event counter is
+ * constructed when requested by scx_bpf_events().
+ */
+ struct scx_event_stats event_stats;
+};
+
+struct scx_sched {
+ struct sched_ext_ops ops;
+ DECLARE_BITMAP(has_op, SCX_OPI_END);
+
+ /*
+ * Dispatch queues.
+ *
+ * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability.
+ * This is to avoid live-locking in bypass mode where all tasks are
+ * dispatched to %SCX_DSQ_GLOBAL and all CPUs consume from it. If
+ * per-node split isn't sufficient, it can be further split.
+ */
+ struct rhashtable dsq_hash;
+ struct scx_dispatch_q **global_dsqs;
+ struct scx_sched_pcpu __percpu *pcpu;
+
+ /*
+ * Updates to the following warned bitfields can race causing RMW issues
+ * but it doesn't really matter.
+ */
+ bool warned_zero_slice:1;
+ bool warned_deprecated_rq:1;
+
+ atomic_t exit_kind;
+ struct scx_exit_info *exit_info;
+
+ struct kobject kobj;
+
+ struct kthread_worker *helper;
+ struct irq_work error_irq_work;
+ struct kthread_work disable_work;
+ struct rcu_work rcu_work;
+};
+
+enum scx_wake_flags {
+ /* expose select WF_* flags as enums */
+ SCX_WAKE_FORK = WF_FORK,
+ SCX_WAKE_TTWU = WF_TTWU,
+ SCX_WAKE_SYNC = WF_SYNC,
+};
+
+enum scx_enq_flags {
+ /* expose select ENQUEUE_* flags as enums */
+ SCX_ENQ_WAKEUP = ENQUEUE_WAKEUP,
+ SCX_ENQ_HEAD = ENQUEUE_HEAD,
+ SCX_ENQ_CPU_SELECTED = ENQUEUE_RQ_SELECTED,
+
+ /* high 32bits are SCX specific */
+
+ /*
+ * Set the following to trigger preemption when calling
+ * scx_bpf_dsq_insert() with a local dsq as the target. The slice of the
+ * current task is cleared to zero and the CPU is kicked into the
+ * scheduling path. Implies %SCX_ENQ_HEAD.
+ */
+ SCX_ENQ_PREEMPT = 1LLU << 32,
+
+ /*
+ * The task being enqueued was previously enqueued on the current CPU's
+ * %SCX_DSQ_LOCAL, but was removed from it in a call to the
+ * scx_bpf_reenqueue_local() kfunc. If scx_bpf_reenqueue_local() was
+ * invoked in a ->cpu_release() callback, and the task is again
+ * dispatched back to %SCX_LOCAL_DSQ by this current ->enqueue(), the
+ * task will not be scheduled on the CPU until at least the next invocation
+ * of the ->cpu_acquire() callback.
+ */
+ SCX_ENQ_REENQ = 1LLU << 40,
+
+ /*
+ * The task being enqueued is the only task available for the cpu. By
+ * default, ext core keeps executing such tasks but when
+ * %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with the
+ * %SCX_ENQ_LAST flag set.
+ *
+ * The BPF scheduler is responsible for triggering a follow-up
+ * scheduling event. Otherwise, Execution may stall.
+ */
+ SCX_ENQ_LAST = 1LLU << 41,
+
+ /* high 8 bits are internal */
+ __SCX_ENQ_INTERNAL_MASK = 0xffLLU << 56,
+
+ SCX_ENQ_CLEAR_OPSS = 1LLU << 56,
+ SCX_ENQ_DSQ_PRIQ = 1LLU << 57,
+ SCX_ENQ_NESTED = 1LLU << 58,
+};
+
+enum scx_deq_flags {
+ /* expose select DEQUEUE_* flags as enums */
+ SCX_DEQ_SLEEP = DEQUEUE_SLEEP,
+
+ /* high 32bits are SCX specific */
+
+ /*
+ * The generic core-sched layer decided to execute the task even though
+ * it hasn't been dispatched yet. Dequeue from the BPF side.
+ */
+ SCX_DEQ_CORE_SCHED_EXEC = 1LLU << 32,
+};
+
+enum scx_pick_idle_cpu_flags {
+ SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */
+ SCX_PICK_IDLE_IN_NODE = 1LLU << 1, /* pick a CPU in the same target NUMA node */
+};
+
+enum scx_kick_flags {
+ /*
+ * Kick the target CPU if idle. Guarantees that the target CPU goes
+ * through at least one full scheduling cycle before going idle. If the
+ * target CPU can be determined to be currently not idle and going to go
+ * through a scheduling cycle before going idle, noop.
+ */
+ SCX_KICK_IDLE = 1LLU << 0,
+
+ /*
+ * Preempt the current task and execute the dispatch path. If the
+ * current task of the target CPU is an SCX task, its ->scx.slice is
+ * cleared to zero before the scheduling path is invoked so that the
+ * task expires and the dispatch path is invoked.
+ */
+ SCX_KICK_PREEMPT = 1LLU << 1,
+
+ /*
+ * The scx_bpf_kick_cpu() call will return after the current SCX task of
+ * the target CPU switches out. This can be used to implement e.g. core
+ * scheduling. This has no effect if the current task on the target CPU
+ * is not on SCX.
+ */
+ SCX_KICK_WAIT = 1LLU << 2,
+};
+
+enum scx_tg_flags {
+ SCX_TG_ONLINE = 1U << 0,
+ SCX_TG_INITED = 1U << 1,
+};
+
+enum scx_enable_state {
+ SCX_ENABLING,
+ SCX_ENABLED,
+ SCX_DISABLING,
+ SCX_DISABLED,
+};
+
+static const char *scx_enable_state_str[] = {
+ [SCX_ENABLING] = "enabling",
+ [SCX_ENABLED] = "enabled",
+ [SCX_DISABLING] = "disabling",
+ [SCX_DISABLED] = "disabled",
+};
+
+/*
+ * sched_ext_entity->ops_state
+ *
+ * Used to track the task ownership between the SCX core and the BPF scheduler.
+ * State transitions look as follows:
+ *
+ * NONE -> QUEUEING -> QUEUED -> DISPATCHING
+ * ^ | |
+ * | v v
+ * \-------------------------------/
+ *
+ * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call
+ * sites for explanations on the conditions being waited upon and why they are
+ * safe. Transitions out of them into NONE or QUEUED must store_release and the
+ * waiters should load_acquire.
+ *
+ * Tracking scx_ops_state enables sched_ext core to reliably determine whether
+ * any given task can be dispatched by the BPF scheduler at all times and thus
+ * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler
+ * to try to dispatch any task anytime regardless of its state as the SCX core
+ * can safely reject invalid dispatches.
+ */
+enum scx_ops_state {
+ SCX_OPSS_NONE, /* owned by the SCX core */
+ SCX_OPSS_QUEUEING, /* in transit to the BPF scheduler */
+ SCX_OPSS_QUEUED, /* owned by the BPF scheduler */
+ SCX_OPSS_DISPATCHING, /* in transit back to the SCX core */
+
+ /*
+ * QSEQ brands each QUEUED instance so that, when dispatch races
+ * dequeue/requeue, the dispatcher can tell whether it still has a claim
+ * on the task being dispatched.
+ *
+ * As some 32bit archs can't do 64bit store_release/load_acquire,
+ * p->scx.ops_state is atomic_long_t which leaves 30 bits for QSEQ on
+ * 32bit machines. The dispatch race window QSEQ protects is very narrow
+ * and runs with IRQ disabled. 30 bits should be sufficient.
+ */
+ SCX_OPSS_QSEQ_SHIFT = 2,
+};
+
+/* Use macros to ensure that the type is unsigned long for the masks */
+#define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1)
+#define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK)
+
+DECLARE_PER_CPU(struct rq *, scx_locked_rq_state);
+
+/*
+ * Return the rq currently locked from an scx callback, or NULL if no rq is
+ * locked.
+ */
+static inline struct rq *scx_locked_rq(void)
+{
+ return __this_cpu_read(scx_locked_rq_state);
+}
+
+static inline bool scx_kf_allowed_if_unlocked(void)
+{
+ return !current->scx.kf_mask;
+}
+
+static inline bool scx_rq_bypassing(struct rq *rq)
+{
+ return unlikely(rq->scx.flags & SCX_RQ_BYPASSING);
+}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c2980e8733bc..da46c3164537 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
*
@@ -17,88 +18,98 @@
* Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
*
* Adaptive scheduling granularity, math enhancements by Peter Zijlstra
- * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
*/
+#include <linux/energy_model.h>
+#include <linux/mmap_lock.h>
+#include <linux/hugetlb_inline.h>
+#include <linux/jiffies.h>
+#include <linux/mm_api.h>
+#include <linux/highmem.h>
+#include <linux/spinlock_api.h>
+#include <linux/cpumask_api.h>
+#include <linux/lockdep_api.h>
+#include <linux/softirq.h>
+#include <linux/refcount_api.h>
+#include <linux/topology.h>
+#include <linux/sched/clock.h>
+#include <linux/sched/cond_resched.h>
+#include <linux/sched/cputime.h>
+#include <linux/sched/isolation.h>
+#include <linux/sched/nohz.h>
+#include <linux/sched/prio.h>
-#include <linux/latencytop.h>
-#include <linux/sched.h>
-#include <linux/cpumask.h>
#include <linux/cpuidle.h>
-#include <linux/slab.h>
-#include <linux/profile.h>
#include <linux/interrupt.h>
+#include <linux/memory-tiers.h>
#include <linux/mempolicy.h>
-#include <linux/migrate.h>
+#include <linux/mutex_api.h>
+#include <linux/profile.h>
+#include <linux/psi.h>
+#include <linux/ratelimit.h>
#include <linux/task_work.h>
+#include <linux/rbtree_augmented.h>
-#include <trace/events/sched.h>
+#include <asm/switch_to.h>
-#include "sched.h"
+#include <uapi/linux/sched/types.h>
-/*
- * Targeted preemption latency for CPU-bound tasks:
- * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
- *
- * NOTE: this latency value is not the same as the concept of
- * 'timeslice length' - timeslices in CFS are of variable length
- * and have no persistent notion like in traditional, time-slice
- * based scheduling concepts.
- *
- * (to see the precise effective timeslice length of your workload,
- * run vmstat and monitor the context-switches (cs) field)
- */
-unsigned int sysctl_sched_latency = 6000000ULL;
-unsigned int normalized_sysctl_sched_latency = 6000000ULL;
+#include "sched.h"
+#include "stats.h"
+#include "autogroup.h"
/*
* The initial- and re-scaling of tunables is configurable
- * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
*
* Options are:
- * SCHED_TUNABLESCALING_NONE - unscaled, always *1
- * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
- * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
+ *
+ * SCHED_TUNABLESCALING_NONE - unscaled, always *1
+ * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus)
+ * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
+ *
+ * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
*/
-enum sched_tunable_scaling sysctl_sched_tunable_scaling
- = SCHED_TUNABLESCALING_LOG;
+unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
/*
* Minimal preemption granularity for CPU-bound tasks:
- * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ *
+ * (default: 0.70 msec * (1 + ilog(ncpus)), units: nanoseconds)
*/
-unsigned int sysctl_sched_min_granularity = 750000ULL;
-unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
+unsigned int sysctl_sched_base_slice = 700000ULL;
+static unsigned int normalized_sysctl_sched_base_slice = 700000ULL;
-/*
- * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
- */
-static unsigned int sched_nr_latency = 8;
+__read_mostly unsigned int sysctl_sched_migration_cost = 500000UL;
+
+static int __init setup_sched_thermal_decay_shift(char *str)
+{
+ pr_warn("Ignoring the deprecated sched_thermal_decay_shift= option\n");
+ return 1;
+}
+__setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift);
/*
- * After fork, child runs first. If set to 0 (default) then
- * parent will (try to) run first.
+ * For asym packing, by default the lower numbered CPU has higher priority.
*/
-unsigned int sysctl_sched_child_runs_first __read_mostly;
+int __weak arch_asym_cpu_priority(int cpu)
+{
+ return -cpu;
+}
/*
- * SCHED_OTHER wake-up granularity.
- * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ * The margin used when comparing utilization with CPU capacity.
*
- * This option delays the preemption effects of decoupled workloads
- * and reduces their over-scheduling. Synchronous workloads will still
- * have immediate wakeup/sleep latencies.
+ * (default: ~20%)
*/
-unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
-unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
-
-const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
+#define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024)
/*
- * The exponential sliding window over which load is averaged for shares
- * distribution.
- * (default: 10msec)
+ * The margin used when comparing CPU capacities.
+ * is 'cap1' noticeably greater than 'cap2'
+ *
+ * (default: ~5%)
*/
-unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
+#define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078)
#ifdef CONFIG_CFS_BANDWIDTH
/*
@@ -109,10 +120,47 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
* to consumption or the quota being specified to be smaller than the slice)
* we will always only issue the remaining available time.
*
- * default: 5 msec, units: microseconds
- */
-unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
+ * (default: 5 msec, units: microseconds)
+ */
+static unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
+#endif
+
+#ifdef CONFIG_NUMA_BALANCING
+/* Restrict the NUMA promotion throughput (MB/s) for each target node. */
+static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536;
+#endif
+
+#ifdef CONFIG_SYSCTL
+static const struct ctl_table sched_fair_sysctls[] = {
+#ifdef CONFIG_CFS_BANDWIDTH
+ {
+ .procname = "sched_cfs_bandwidth_slice_us",
+ .data = &sysctl_sched_cfs_bandwidth_slice,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE,
+ },
#endif
+#ifdef CONFIG_NUMA_BALANCING
+ {
+ .procname = "numa_balancing_promote_rate_limit_MBps",
+ .data = &sysctl_numa_balancing_promote_rate_limit,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ },
+#endif /* CONFIG_NUMA_BALANCING */
+};
+
+static int __init sched_fair_sysctl_init(void)
+{
+ register_sysctl_init("kernel", sched_fair_sysctls);
+ return 0;
+}
+late_initcall(sched_fair_sysctl_init);
+#endif /* CONFIG_SYSCTL */
static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
@@ -141,9 +189,9 @@ static inline void update_load_set(struct load_weight *lw, unsigned long w)
*
* This idea comes from the SD scheduler of Con Kolivas:
*/
-static int get_update_sysctl_factor(void)
+static unsigned int get_update_sysctl_factor(void)
{
- unsigned int cpus = min_t(int, num_online_cpus(), 8);
+ unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
unsigned int factor;
switch (sysctl_sched_tunable_scaling) {
@@ -168,13 +216,11 @@ static void update_sysctl(void)
#define SET_SYSCTL(name) \
(sysctl_##name = (factor) * normalized_sysctl_##name)
- SET_SYSCTL(sched_min_granularity);
- SET_SYSCTL(sched_latency);
- SET_SYSCTL(sched_wakeup_granularity);
+ SET_SYSCTL(sched_base_slice);
#undef SET_SYSCTL
}
-void sched_init_granularity(void)
+void __init sched_init_granularity(void)
{
update_sysctl();
}
@@ -204,7 +250,7 @@ static void __update_inv_weight(struct load_weight *lw)
* OR
* (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
*
- * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case
+ * Either weight := NICE_0_LOAD and lw \e sched_prio_to_wmult[], in which case
* we're guaranteed shift stays positive because inv_weight is guaranteed to
* fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
*
@@ -214,28 +260,40 @@ static void __update_inv_weight(struct load_weight *lw)
static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
{
u64 fact = scale_load_down(weight);
+ u32 fact_hi = (u32)(fact >> 32);
int shift = WMULT_SHIFT;
+ int fs;
__update_inv_weight(lw);
- if (unlikely(fact >> 32)) {
- while (fact >> 32) {
- fact >>= 1;
- shift--;
- }
+ if (unlikely(fact_hi)) {
+ fs = fls(fact_hi);
+ shift -= fs;
+ fact >>= fs;
}
- /* hint to use a 32x32->64 mul */
- fact = (u64)(u32)fact * lw->inv_weight;
+ fact = mul_u32_u32(fact, lw->inv_weight);
- while (fact >> 32) {
- fact >>= 1;
- shift--;
+ fact_hi = (u32)(fact >> 32);
+ if (fact_hi) {
+ fs = fls(fact_hi);
+ shift -= fs;
+ fact >>= fs;
}
return mul_u64_u32_shr(delta_exec, fact, shift);
}
+/*
+ * delta /= w
+ */
+static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
+{
+ if (unlikely(se->load.weight != NICE_0_LOAD))
+ delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
+
+ return delta;
+}
const struct sched_class fair_sched_class;
@@ -245,82 +303,107 @@ const struct sched_class fair_sched_class;
#ifdef CONFIG_FAIR_GROUP_SCHED
-/* cpu runqueue to which this cfs_rq is attached */
-static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
-{
- return cfs_rq->rq;
-}
-
-/* An entity is a task if it doesn't "own" a runqueue */
-#define entity_is_task(se) (!se->my_q)
-
-static inline struct task_struct *task_of(struct sched_entity *se)
-{
-#ifdef CONFIG_SCHED_DEBUG
- WARN_ON_ONCE(!entity_is_task(se));
-#endif
- return container_of(se, struct task_struct, se);
-}
-
/* Walk up scheduling entities hierarchy */
#define for_each_sched_entity(se) \
for (; se; se = se->parent)
-static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
+static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
- return p->se.cfs_rq;
-}
+ struct rq *rq = rq_of(cfs_rq);
+ int cpu = cpu_of(rq);
-/* runqueue on which this entity is (to be) queued */
-static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
-{
- return se->cfs_rq;
-}
+ if (cfs_rq->on_list)
+ return rq->tmp_alone_branch == &rq->leaf_cfs_rq_list;
-/* runqueue "owned" by this group */
-static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
-{
- return grp->my_q;
-}
-
-static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
- int force_update);
+ cfs_rq->on_list = 1;
-static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
-{
- if (!cfs_rq->on_list) {
+ /*
+ * Ensure we either appear before our parent (if already
+ * enqueued) or force our parent to appear after us when it is
+ * enqueued. The fact that we always enqueue bottom-up
+ * reduces this to two cases and a special case for the root
+ * cfs_rq. Furthermore, it also means that we will always reset
+ * tmp_alone_branch either when the branch is connected
+ * to a tree or when we reach the top of the tree
+ */
+ if (cfs_rq->tg->parent &&
+ cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
/*
- * Ensure we either appear before our parent (if already
- * enqueued) or force our parent to appear after us when it is
- * enqueued. The fact that we always enqueue bottom-up
- * reduces this to two cases.
+ * If parent is already on the list, we add the child
+ * just before. Thanks to circular linked property of
+ * the list, this means to put the child at the tail
+ * of the list that starts by parent.
*/
- if (cfs_rq->tg->parent &&
- cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
- list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
- &rq_of(cfs_rq)->leaf_cfs_rq_list);
- } else {
- list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
- &rq_of(cfs_rq)->leaf_cfs_rq_list);
- }
+ list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
+ &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
+ /*
+ * The branch is now connected to its tree so we can
+ * reset tmp_alone_branch to the beginning of the
+ * list.
+ */
+ rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
+ return true;
+ }
- cfs_rq->on_list = 1;
- /* We should have no load, but we need to update last_decay. */
- update_cfs_rq_blocked_load(cfs_rq, 0);
+ if (!cfs_rq->tg->parent) {
+ /*
+ * cfs rq without parent should be put
+ * at the tail of the list.
+ */
+ list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
+ &rq->leaf_cfs_rq_list);
+ /*
+ * We have reach the top of a tree so we can reset
+ * tmp_alone_branch to the beginning of the list.
+ */
+ rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
+ return true;
}
+
+ /*
+ * The parent has not already been added so we want to
+ * make sure that it will be put after us.
+ * tmp_alone_branch points to the begin of the branch
+ * where we will add parent.
+ */
+ list_add_rcu(&cfs_rq->leaf_cfs_rq_list, rq->tmp_alone_branch);
+ /*
+ * update tmp_alone_branch to points to the new begin
+ * of the branch
+ */
+ rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
+ return false;
}
static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
if (cfs_rq->on_list) {
+ struct rq *rq = rq_of(cfs_rq);
+
+ /*
+ * With cfs_rq being unthrottled/throttled during an enqueue,
+ * it can happen the tmp_alone_branch points to the leaf that
+ * we finally want to delete. In this case, tmp_alone_branch moves
+ * to the prev element but it will point to rq->leaf_cfs_rq_list
+ * at the end of the enqueue.
+ */
+ if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
+ rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev;
+
list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
cfs_rq->on_list = 0;
}
}
-/* Iterate thr' all leaf cfs_rq's on a runqueue */
-#define for_each_leaf_cfs_rq(rq, cfs_rq) \
- list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
+static inline void assert_list_leaf_cfs_rq(struct rq *rq)
+{
+ WARN_ON_ONCE(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
+}
+
+/* Iterate through all leaf cfs_rq's on a runqueue */
+#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
+ list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \
+ leaf_cfs_rq_list)
/* Do the two (enqueued) entities belong to the same group ? */
static inline struct cfs_rq *
@@ -332,7 +415,7 @@ is_same_group(struct sched_entity *se, struct sched_entity *pse)
return NULL;
}
-static inline struct sched_entity *parent_entity(struct sched_entity *se)
+static inline struct sched_entity *parent_entity(const struct sched_entity *se)
{
return se->parent;
}
@@ -369,64 +452,70 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
}
}
-#else /* !CONFIG_FAIR_GROUP_SCHED */
+static int tg_is_idle(struct task_group *tg)
+{
+ return tg->idle > 0;
+}
-static inline struct task_struct *task_of(struct sched_entity *se)
+static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
{
- return container_of(se, struct task_struct, se);
+ return cfs_rq->idle > 0;
}
-static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
+static int se_is_idle(struct sched_entity *se)
{
- return container_of(cfs_rq, struct rq, cfs);
+ if (entity_is_task(se))
+ return task_has_idle_policy(task_of(se));
+ return cfs_rq_is_idle(group_cfs_rq(se));
}
-#define entity_is_task(se) 1
+#else /* !CONFIG_FAIR_GROUP_SCHED: */
#define for_each_sched_entity(se) \
for (; se; se = NULL)
-static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
+static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
- return &task_rq(p)->cfs;
+ return true;
}
-static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
+static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
- struct task_struct *p = task_of(se);
- struct rq *rq = task_rq(p);
+}
- return &rq->cfs;
+static inline void assert_list_leaf_cfs_rq(struct rq *rq)
+{
}
-/* runqueue "owned" by this group */
-static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
+#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
+ for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
+
+static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
return NULL;
}
-static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+static inline void
+find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
}
-static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+static inline int tg_is_idle(struct task_group *tg)
{
+ return 0;
}
-#define for_each_leaf_cfs_rq(rq, cfs_rq) \
- for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
-
-static inline struct sched_entity *parent_entity(struct sched_entity *se)
+static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
{
- return NULL;
+ return 0;
}
-static inline void
-find_matching_se(struct sched_entity **se, struct sched_entity **pse)
+static int se_is_idle(struct sched_entity *se)
{
+ return task_has_idle_policy(task_of(se));
}
-#endif /* CONFIG_FAIR_GROUP_SCHED */
+#endif /* !CONFIG_FAIR_GROUP_SCHED */
static __always_inline
void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
@@ -435,7 +524,7 @@ void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
* Scheduling class tree data structure manipulation methods:
*/
-static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
+static inline __maybe_unused u64 max_vruntime(u64 max_vruntime, u64 vruntime)
{
s64 delta = (s64)(vruntime - max_vruntime);
if (delta > 0)
@@ -444,7 +533,7 @@ static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
return max_vruntime;
}
-static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
+static inline __maybe_unused u64 min_vruntime(u64 min_vruntime, u64 vruntime)
{
s64 delta = (s64)(vruntime - min_vruntime);
if (delta < 0)
@@ -453,330 +542,814 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
return min_vruntime;
}
-static inline int entity_before(struct sched_entity *a,
- struct sched_entity *b)
+static inline bool entity_before(const struct sched_entity *a,
+ const struct sched_entity *b)
{
- return (s64)(a->vruntime - b->vruntime) < 0;
+ /*
+ * Tiebreak on vruntime seems unnecessary since it can
+ * hardly happen.
+ */
+ return (s64)(a->deadline - b->deadline) < 0;
}
-static void update_min_vruntime(struct cfs_rq *cfs_rq)
+static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- u64 vruntime = cfs_rq->min_vruntime;
+ return (s64)(se->vruntime - cfs_rq->zero_vruntime);
+}
- if (cfs_rq->curr)
- vruntime = cfs_rq->curr->vruntime;
+#define __node_2_se(node) \
+ rb_entry((node), struct sched_entity, run_node)
- if (cfs_rq->rb_leftmost) {
- struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
- struct sched_entity,
- run_node);
+/*
+ * Compute virtual time from the per-task service numbers:
+ *
+ * Fair schedulers conserve lag:
+ *
+ * \Sum lag_i = 0
+ *
+ * Where lag_i is given by:
+ *
+ * lag_i = S - s_i = w_i * (V - v_i)
+ *
+ * Where S is the ideal service time and V is it's virtual time counterpart.
+ * Therefore:
+ *
+ * \Sum lag_i = 0
+ * \Sum w_i * (V - v_i) = 0
+ * \Sum w_i * V - w_i * v_i = 0
+ *
+ * From which we can solve an expression for V in v_i (which we have in
+ * se->vruntime):
+ *
+ * \Sum v_i * w_i \Sum v_i * w_i
+ * V = -------------- = --------------
+ * \Sum w_i W
+ *
+ * Specifically, this is the weighted average of all entity virtual runtimes.
+ *
+ * [[ NOTE: this is only equal to the ideal scheduler under the condition
+ * that join/leave operations happen at lag_i = 0, otherwise the
+ * virtual time has non-contiguous motion equivalent to:
+ *
+ * V +-= lag_i / W
+ *
+ * Also see the comment in place_entity() that deals with this. ]]
+ *
+ * However, since v_i is u64, and the multiplication could easily overflow
+ * transform it into a relative form that uses smaller quantities:
+ *
+ * Substitute: v_i == (v_i - v0) + v0
+ *
+ * \Sum ((v_i - v0) + v0) * w_i \Sum (v_i - v0) * w_i
+ * V = ---------------------------- = --------------------- + v0
+ * W W
+ *
+ * Which we track using:
+ *
+ * v0 := cfs_rq->zero_vruntime
+ * \Sum (v_i - v0) * w_i := cfs_rq->avg_vruntime
+ * \Sum w_i := cfs_rq->avg_load
+ *
+ * Since zero_vruntime closely tracks the per-task service, these
+ * deltas: (v_i - v), will be in the order of the maximal (virtual) lag
+ * induced in the system due to quantisation.
+ *
+ * Also, we use scale_load_down() to reduce the size.
+ *
+ * As measured, the max (key * weight) value was ~44 bits for a kernel build.
+ */
+static void
+avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ unsigned long weight = scale_load_down(se->load.weight);
+ s64 key = entity_key(cfs_rq, se);
- if (!cfs_rq->curr)
- vruntime = se->vruntime;
- else
- vruntime = min_vruntime(vruntime, se->vruntime);
+ cfs_rq->avg_vruntime += key * weight;
+ cfs_rq->avg_load += weight;
+}
+
+static void
+avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ unsigned long weight = scale_load_down(se->load.weight);
+ s64 key = entity_key(cfs_rq, se);
+
+ cfs_rq->avg_vruntime -= key * weight;
+ cfs_rq->avg_load -= weight;
+}
+
+static inline
+void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
+{
+ /*
+ * v' = v + d ==> avg_vruntime' = avg_runtime - d*avg_load
+ */
+ cfs_rq->avg_vruntime -= cfs_rq->avg_load * delta;
+}
+
+/*
+ * Specifically: avg_runtime() + 0 must result in entity_eligible() := true
+ * For this to be so, the result of this function must have a left bias.
+ */
+u64 avg_vruntime(struct cfs_rq *cfs_rq)
+{
+ struct sched_entity *curr = cfs_rq->curr;
+ s64 avg = cfs_rq->avg_vruntime;
+ long load = cfs_rq->avg_load;
+
+ if (curr && curr->on_rq) {
+ unsigned long weight = scale_load_down(curr->load.weight);
+
+ avg += entity_key(cfs_rq, curr) * weight;
+ load += weight;
}
- /* ensure we never gain time by being placed backwards. */
- cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
-#ifndef CONFIG_64BIT
- smp_wmb();
- cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
-#endif
+ if (load) {
+ /* sign flips effective floor / ceiling */
+ if (avg < 0)
+ avg -= (load - 1);
+ avg = div_s64(avg, load);
+ }
+
+ return cfs_rq->zero_vruntime + avg;
}
/*
- * Enqueue an entity into the rb-tree:
+ * lag_i = S - s_i = w_i * (V - v_i)
+ *
+ * However, since V is approximated by the weighted average of all entities it
+ * is possible -- by addition/removal/reweight to the tree -- to move V around
+ * and end up with a larger lag than we started with.
+ *
+ * Limit this to either double the slice length with a minimum of TICK_NSEC
+ * since that is the timing granularity.
+ *
+ * EEVDF gives the following limit for a steady state system:
+ *
+ * -r_max < lag < max(r_max, q)
+ *
+ * XXX could add max_slice to the augmented data to track this.
*/
-static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
- struct rb_node *parent = NULL;
- struct sched_entity *entry;
- int leftmost = 1;
+ s64 vlag, limit;
- /*
- * Find the right place in the rbtree:
- */
- while (*link) {
- parent = *link;
- entry = rb_entry(parent, struct sched_entity, run_node);
- /*
- * We dont care about collisions. Nodes with
- * the same key stay together.
- */
- if (entity_before(se, entry)) {
- link = &parent->rb_left;
- } else {
- link = &parent->rb_right;
- leftmost = 0;
- }
+ WARN_ON_ONCE(!se->on_rq);
+
+ vlag = avg_vruntime(cfs_rq) - se->vruntime;
+ limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
+
+ se->vlag = clamp(vlag, -limit, limit);
+}
+
+/*
+ * Entity is eligible once it received less service than it ought to have,
+ * eg. lag >= 0.
+ *
+ * lag_i = S - s_i = w_i*(V - v_i)
+ *
+ * lag_i >= 0 -> V >= v_i
+ *
+ * \Sum (v_i - v)*w_i
+ * V = ------------------ + v
+ * \Sum w_i
+ *
+ * lag_i >= 0 -> \Sum (v_i - v)*w_i >= (v_i - v)*(\Sum w_i)
+ *
+ * Note: using 'avg_vruntime() > se->vruntime' is inaccurate due
+ * to the loss in precision caused by the division.
+ */
+static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
+{
+ struct sched_entity *curr = cfs_rq->curr;
+ s64 avg = cfs_rq->avg_vruntime;
+ long load = cfs_rq->avg_load;
+
+ if (curr && curr->on_rq) {
+ unsigned long weight = scale_load_down(curr->load.weight);
+
+ avg += entity_key(cfs_rq, curr) * weight;
+ load += weight;
}
- /*
- * Maintain a cache of leftmost tree entries (it is frequently
- * used):
- */
- if (leftmost)
- cfs_rq->rb_leftmost = &se->run_node;
+ return avg >= (s64)(vruntime - cfs_rq->zero_vruntime) * load;
+}
- rb_link_node(&se->run_node, parent, link);
- rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
+int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ return vruntime_eligible(cfs_rq, se->vruntime);
}
-static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static void update_zero_vruntime(struct cfs_rq *cfs_rq)
{
- if (cfs_rq->rb_leftmost == &se->run_node) {
- struct rb_node *next_node;
+ u64 vruntime = avg_vruntime(cfs_rq);
+ s64 delta = (s64)(vruntime - cfs_rq->zero_vruntime);
+
+ avg_vruntime_update(cfs_rq, delta);
+
+ cfs_rq->zero_vruntime = vruntime;
+}
+
+static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq)
+{
+ struct sched_entity *root = __pick_root_entity(cfs_rq);
+ struct sched_entity *curr = cfs_rq->curr;
+ u64 min_slice = ~0ULL;
+
+ if (curr && curr->on_rq)
+ min_slice = curr->slice;
+
+ if (root)
+ min_slice = min(min_slice, root->min_slice);
+
+ return min_slice;
+}
- next_node = rb_next(&se->run_node);
- cfs_rq->rb_leftmost = next_node;
+static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
+{
+ return entity_before(__node_2_se(a), __node_2_se(b));
+}
+
+#define vruntime_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; })
+
+static inline void __min_vruntime_update(struct sched_entity *se, struct rb_node *node)
+{
+ if (node) {
+ struct sched_entity *rse = __node_2_se(node);
+ if (vruntime_gt(min_vruntime, se, rse))
+ se->min_vruntime = rse->min_vruntime;
+ }
+}
+
+static inline void __min_slice_update(struct sched_entity *se, struct rb_node *node)
+{
+ if (node) {
+ struct sched_entity *rse = __node_2_se(node);
+ if (rse->min_slice < se->min_slice)
+ se->min_slice = rse->min_slice;
}
+}
+
+/*
+ * se->min_vruntime = min(se->vruntime, {left,right}->min_vruntime)
+ */
+static inline bool min_vruntime_update(struct sched_entity *se, bool exit)
+{
+ u64 old_min_vruntime = se->min_vruntime;
+ u64 old_min_slice = se->min_slice;
+ struct rb_node *node = &se->run_node;
+
+ se->min_vruntime = se->vruntime;
+ __min_vruntime_update(se, node->rb_right);
+ __min_vruntime_update(se, node->rb_left);
+
+ se->min_slice = se->slice;
+ __min_slice_update(se, node->rb_right);
+ __min_slice_update(se, node->rb_left);
+
+ return se->min_vruntime == old_min_vruntime &&
+ se->min_slice == old_min_slice;
+}
- rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
+RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity,
+ run_node, min_vruntime, min_vruntime_update);
+
+/*
+ * Enqueue an entity into the rb-tree:
+ */
+static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ avg_vruntime_add(cfs_rq, se);
+ update_zero_vruntime(cfs_rq);
+ se->min_vruntime = se->vruntime;
+ se->min_slice = se->slice;
+ rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
+ __entity_less, &min_vruntime_cb);
+}
+
+static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
+ &min_vruntime_cb);
+ avg_vruntime_sub(cfs_rq, se);
+ update_zero_vruntime(cfs_rq);
+}
+
+struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq)
+{
+ struct rb_node *root = cfs_rq->tasks_timeline.rb_root.rb_node;
+
+ if (!root)
+ return NULL;
+
+ return __node_2_se(root);
}
struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
{
- struct rb_node *left = cfs_rq->rb_leftmost;
+ struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);
if (!left)
return NULL;
- return rb_entry(left, struct sched_entity, run_node);
+ return __node_2_se(left);
}
-static struct sched_entity *__pick_next_entity(struct sched_entity *se)
+/*
+ * Set the vruntime up to which an entity can run before looking
+ * for another entity to pick.
+ * In case of run to parity, we use the shortest slice of the enqueued
+ * entities to set the protected period.
+ * When run to parity is disabled, we give a minimum quantum to the running
+ * entity to ensure progress.
+ */
+static inline void set_protect_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- struct rb_node *next = rb_next(&se->run_node);
+ u64 slice = normalized_sysctl_sched_base_slice;
+ u64 vprot = se->deadline;
- if (!next)
- return NULL;
+ if (sched_feat(RUN_TO_PARITY))
+ slice = cfs_rq_min_slice(cfs_rq);
+
+ slice = min(slice, se->slice);
+ if (slice != se->slice)
+ vprot = min_vruntime(vprot, se->vruntime + calc_delta_fair(slice, se));
+
+ se->vprot = vprot;
+}
+
+static inline void update_protect_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ u64 slice = cfs_rq_min_slice(cfs_rq);
+
+ se->vprot = min_vruntime(se->vprot, se->vruntime + calc_delta_fair(slice, se));
+}
+
+static inline bool protect_slice(struct sched_entity *se)
+{
+ return ((s64)(se->vprot - se->vruntime) > 0);
+}
+
+static inline void cancel_protect_slice(struct sched_entity *se)
+{
+ if (protect_slice(se))
+ se->vprot = se->vruntime;
+}
+
+/*
+ * Earliest Eligible Virtual Deadline First
+ *
+ * In order to provide latency guarantees for different request sizes
+ * EEVDF selects the best runnable task from two criteria:
+ *
+ * 1) the task must be eligible (must be owed service)
+ *
+ * 2) from those tasks that meet 1), we select the one
+ * with the earliest virtual deadline.
+ *
+ * We can do this in O(log n) time due to an augmented RB-tree. The
+ * tree keeps the entries sorted on deadline, but also functions as a
+ * heap based on the vruntime by keeping:
+ *
+ * se->min_vruntime = min(se->vruntime, se->{left,right}->min_vruntime)
+ *
+ * Which allows tree pruning through eligibility.
+ */
+static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq, bool protect)
+{
+ struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
+ struct sched_entity *se = __pick_first_entity(cfs_rq);
+ struct sched_entity *curr = cfs_rq->curr;
+ struct sched_entity *best = NULL;
- return rb_entry(next, struct sched_entity, run_node);
+ /*
+ * We can safely skip eligibility check if there is only one entity
+ * in this cfs_rq, saving some cycles.
+ */
+ if (cfs_rq->nr_queued == 1)
+ return curr && curr->on_rq ? curr : se;
+
+ /*
+ * Picking the ->next buddy will affect latency but not fairness.
+ */
+ if (sched_feat(PICK_BUDDY) &&
+ cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) {
+ /* ->next will never be delayed */
+ WARN_ON_ONCE(cfs_rq->next->sched_delayed);
+ return cfs_rq->next;
+ }
+
+ if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
+ curr = NULL;
+
+ if (curr && protect && protect_slice(curr))
+ return curr;
+
+ /* Pick the leftmost entity if it's eligible */
+ if (se && entity_eligible(cfs_rq, se)) {
+ best = se;
+ goto found;
+ }
+
+ /* Heap search for the EEVD entity */
+ while (node) {
+ struct rb_node *left = node->rb_left;
+
+ /*
+ * Eligible entities in left subtree are always better
+ * choices, since they have earlier deadlines.
+ */
+ if (left && vruntime_eligible(cfs_rq,
+ __node_2_se(left)->min_vruntime)) {
+ node = left;
+ continue;
+ }
+
+ se = __node_2_se(node);
+
+ /*
+ * The left subtree either is empty or has no eligible
+ * entity, so check the current node since it is the one
+ * with earliest deadline that might be eligible.
+ */
+ if (entity_eligible(cfs_rq, se)) {
+ best = se;
+ break;
+ }
+
+ node = node->rb_right;
+ }
+found:
+ if (!best || (curr && entity_before(curr, best)))
+ best = curr;
+
+ return best;
+}
+
+static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
+{
+ return __pick_eevdf(cfs_rq, true);
}
-#ifdef CONFIG_SCHED_DEBUG
struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
- struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
+ struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);
if (!last)
return NULL;
- return rb_entry(last, struct sched_entity, run_node);
+ return __node_2_se(last);
}
/**************************************************************
* Scheduling class statistics methods:
*/
-
-int sched_proc_update_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+int sched_update_scaling(void)
{
- int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
- int factor = get_update_sysctl_factor();
-
- if (ret || !write)
- return ret;
-
- sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
- sysctl_sched_min_granularity);
+ unsigned int factor = get_update_sysctl_factor();
#define WRT_SYSCTL(name) \
(normalized_sysctl_##name = sysctl_##name / (factor))
- WRT_SYSCTL(sched_min_granularity);
- WRT_SYSCTL(sched_latency);
- WRT_SYSCTL(sched_wakeup_granularity);
+ WRT_SYSCTL(sched_base_slice);
#undef WRT_SYSCTL
return 0;
}
-#endif
+
+static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se);
/*
- * delta /= w
+ * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i
+ * this is probably good enough.
*/
-static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
+static bool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- if (unlikely(se->load.weight != NICE_0_LOAD))
- delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
+ if ((s64)(se->vruntime - se->deadline) < 0)
+ return false;
- return delta;
+ /*
+ * For EEVDF the virtual time slope is determined by w_i (iow.
+ * nice) while the request time r_i is determined by
+ * sysctl_sched_base_slice.
+ */
+ if (!se->custom_slice)
+ se->slice = sysctl_sched_base_slice;
+
+ /*
+ * EEVDF: vd_i = ve_i + r_i / w_i
+ */
+ se->deadline = se->vruntime + calc_delta_fair(se->slice, se);
+
+ /*
+ * The task has consumed its request, reschedule.
+ */
+ return true;
}
-/*
- * The idea is to set a period in which each task runs once.
- *
- * When there are too many tasks (sched_nr_latency) we have to stretch
- * this period because otherwise the slices get too small.
- *
- * p = (nr <= nl) ? l : l*nr/nl
- */
-static u64 __sched_period(unsigned long nr_running)
+#include "pelt.h"
+
+static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
+static unsigned long task_h_load(struct task_struct *p);
+static unsigned long capacity_of(int cpu);
+
+/* Give new sched_entity start runnable values to heavy its load in infant time */
+void init_entity_runnable_average(struct sched_entity *se)
{
- u64 period = sysctl_sched_latency;
- unsigned long nr_latency = sched_nr_latency;
+ struct sched_avg *sa = &se->avg;
- if (unlikely(nr_running > nr_latency)) {
- period = sysctl_sched_min_granularity;
- period *= nr_running;
- }
+ memset(sa, 0, sizeof(*sa));
- return period;
+ /*
+ * Tasks are initialized with full load to be seen as heavy tasks until
+ * they get a chance to stabilize to their real load level.
+ * Group entities are initialized with zero load to reflect the fact that
+ * nothing has been attached to the task group yet.
+ */
+ if (entity_is_task(se))
+ sa->load_avg = scale_load_down(se->load.weight);
+
+ /* when this task is enqueued, it will contribute to its cfs_rq's load_avg */
}
/*
- * We calculate the wall-time slice from the period by taking a part
- * proportional to the weight.
+ * With new tasks being created, their initial util_avgs are extrapolated
+ * based on the cfs_rq's current util_avg:
+ *
+ * util_avg = cfs_rq->avg.util_avg / (cfs_rq->avg.load_avg + 1)
+ * * se_weight(se)
+ *
+ * However, in many cases, the above util_avg does not give a desired
+ * value. Moreover, the sum of the util_avgs may be divergent, such
+ * as when the series is a harmonic series.
+ *
+ * To solve this problem, we also cap the util_avg of successive tasks to
+ * only 1/2 of the left utilization budget:
+ *
+ * util_avg_cap = (cpu_scale - cfs_rq->avg.util_avg) / 2^n
+ *
+ * where n denotes the nth task and cpu_scale the CPU capacity.
*
- * s = p*P[w/rw]
+ * For example, for a CPU with 1024 of capacity, a simplest series from
+ * the beginning would be like:
+ *
+ * task util_avg: 512, 256, 128, 64, 32, 16, 8, ...
+ * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
+ *
+ * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
+ * if util_avg > util_avg_cap.
*/
-static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
+void post_init_entity_util_avg(struct task_struct *p)
{
- u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
-
- for_each_sched_entity(se) {
- struct load_weight *load;
- struct load_weight lw;
+ struct sched_entity *se = &p->se;
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ struct sched_avg *sa = &se->avg;
+ long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)));
+ long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;
- cfs_rq = cfs_rq_of(se);
- load = &cfs_rq->load;
+ if (p->sched_class != &fair_sched_class) {
+ /*
+ * For !fair tasks do:
+ *
+ update_cfs_rq_load_avg(now, cfs_rq);
+ attach_entity_load_avg(cfs_rq, se);
+ switched_from_fair(rq, p);
+ *
+ * such that the next switched_to_fair() has the
+ * expected state.
+ */
+ se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq);
+ return;
+ }
- if (unlikely(!se->on_rq)) {
- lw = cfs_rq->load;
+ if (cap > 0) {
+ if (cfs_rq->avg.util_avg != 0) {
+ sa->util_avg = cfs_rq->avg.util_avg * se_weight(se);
+ sa->util_avg /= (cfs_rq->avg.load_avg + 1);
- update_load_add(&lw, se->load.weight);
- load = &lw;
+ if (sa->util_avg > cap)
+ sa->util_avg = cap;
+ } else {
+ sa->util_avg = cap;
}
- slice = __calc_delta(slice, se->load.weight, load);
}
- return slice;
+
+ sa->runnable_avg = sa->util_avg;
}
-/*
- * We calculate the vruntime slice of a to-be-inserted task.
- *
- * vs = s/w
- */
-static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static s64 update_se(struct rq *rq, struct sched_entity *se)
{
- return calc_delta_fair(sched_slice(cfs_rq, se), se);
-}
+ u64 now = rq_clock_task(rq);
+ s64 delta_exec;
-#ifdef CONFIG_SMP
-static int select_idle_sibling(struct task_struct *p, int cpu);
-static unsigned long task_h_load(struct task_struct *p);
+ delta_exec = now - se->exec_start;
+ if (unlikely(delta_exec <= 0))
+ return delta_exec;
-static inline void __update_task_entity_contrib(struct sched_entity *se);
-static inline void __update_task_entity_utilization(struct sched_entity *se);
+ se->exec_start = now;
+ if (entity_is_task(se)) {
+ struct task_struct *donor = task_of(se);
+ struct task_struct *running = rq->curr;
+ /*
+ * If se is a task, we account the time against the running
+ * task, as w/ proxy-exec they may not be the same.
+ */
+ running->se.exec_start = now;
+ running->se.sum_exec_runtime += delta_exec;
-/* Give new task start runnable values to heavy its load in infant time */
-void init_task_runnable_average(struct task_struct *p)
-{
- u32 slice;
+ trace_sched_stat_runtime(running, delta_exec);
+ account_group_exec_runtime(running, delta_exec);
+
+ /* cgroup time is always accounted against the donor */
+ cgroup_account_cputime(donor, delta_exec);
+ } else {
+ /* If not task, account the time against donor se */
+ se->sum_exec_runtime += delta_exec;
+ }
+
+ if (schedstat_enabled()) {
+ struct sched_statistics *stats;
- slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;
- p->se.avg.runnable_avg_sum = p->se.avg.running_avg_sum = slice;
- p->se.avg.avg_period = slice;
- __update_task_entity_contrib(&p->se);
- __update_task_entity_utilization(&p->se);
+ stats = __schedstats_from_se(se);
+ __schedstat_set(stats->exec_max,
+ max(delta_exec, stats->exec_max));
+ }
+
+ return delta_exec;
}
-#else
-void init_task_runnable_average(struct task_struct *p)
+
+static void set_next_buddy(struct sched_entity *se);
+
+/*
+ * Used by other classes to account runtime.
+ */
+s64 update_curr_common(struct rq *rq)
{
+ return update_se(rq, &rq->donor->se);
}
-#endif
/*
* Update the current task's runtime statistics.
*/
static void update_curr(struct cfs_rq *cfs_rq)
{
+ /*
+ * Note: cfs_rq->curr corresponds to the task picked to
+ * run (ie: rq->donor.se) which due to proxy-exec may
+ * not necessarily be the actual task running
+ * (rq->curr.se). This is easy to confuse!
+ */
struct sched_entity *curr = cfs_rq->curr;
- u64 now = rq_clock_task(rq_of(cfs_rq));
- u64 delta_exec;
+ struct rq *rq = rq_of(cfs_rq);
+ s64 delta_exec;
+ bool resched;
if (unlikely(!curr))
return;
- delta_exec = now - curr->exec_start;
- if (unlikely((s64)delta_exec <= 0))
+ delta_exec = update_se(rq, curr);
+ if (unlikely(delta_exec <= 0))
return;
- curr->exec_start = now;
-
- schedstat_set(curr->statistics.exec_max,
- max(delta_exec, curr->statistics.exec_max));
-
- curr->sum_exec_runtime += delta_exec;
- schedstat_add(cfs_rq, exec_clock, delta_exec);
-
curr->vruntime += calc_delta_fair(delta_exec, curr);
- update_min_vruntime(cfs_rq);
+ resched = update_deadline(cfs_rq, curr);
if (entity_is_task(curr)) {
- struct task_struct *curtask = task_of(curr);
-
- trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
- cpuacct_charge(curtask, delta_exec);
- account_group_exec_runtime(curtask, delta_exec);
+ /*
+ * If the fair_server is active, we need to account for the
+ * fair_server time whether or not the task is running on
+ * behalf of fair_server or not:
+ * - If the task is running on behalf of fair_server, we need
+ * to limit its time based on the assigned runtime.
+ * - Fair task that runs outside of fair_server should account
+ * against fair_server such that it can account for this time
+ * and possibly avoid running this period.
+ */
+ dl_server_update(&rq->fair_server, delta_exec);
}
account_cfs_rq_runtime(cfs_rq, delta_exec);
+
+ if (cfs_rq->nr_queued == 1)
+ return;
+
+ if (resched || !protect_slice(curr)) {
+ resched_curr_lazy(rq);
+ clear_buddies(cfs_rq, curr);
+ }
}
static void update_curr_fair(struct rq *rq)
{
- update_curr(cfs_rq_of(&rq->curr->se));
+ update_curr(cfs_rq_of(&rq->donor->se));
+}
+
+static inline void
+update_stats_wait_start_fair(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ struct sched_statistics *stats;
+ struct task_struct *p = NULL;
+
+ if (!schedstat_enabled())
+ return;
+
+ stats = __schedstats_from_se(se);
+
+ if (entity_is_task(se))
+ p = task_of(se);
+
+ __update_stats_wait_start(rq_of(cfs_rq), p, stats);
+}
+
+static inline void
+update_stats_wait_end_fair(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ struct sched_statistics *stats;
+ struct task_struct *p = NULL;
+
+ if (!schedstat_enabled())
+ return;
+
+ stats = __schedstats_from_se(se);
+
+ /*
+ * When the sched_schedstat changes from 0 to 1, some sched se
+ * maybe already in the runqueue, the se->statistics.wait_start
+ * will be 0.So it will let the delta wrong. We need to avoid this
+ * scenario.
+ */
+ if (unlikely(!schedstat_val(stats->wait_start)))
+ return;
+
+ if (entity_is_task(se))
+ p = task_of(se);
+
+ __update_stats_wait_end(rq_of(cfs_rq), p, stats);
}
static inline void
-update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
+update_stats_enqueue_sleeper_fair(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
+ struct sched_statistics *stats;
+ struct task_struct *tsk = NULL;
+
+ if (!schedstat_enabled())
+ return;
+
+ stats = __schedstats_from_se(se);
+
+ if (entity_is_task(se))
+ tsk = task_of(se);
+
+ __update_stats_enqueue_sleeper(rq_of(cfs_rq), tsk, stats);
}
/*
* Task is being enqueued - update stats:
*/
-static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static inline void
+update_stats_enqueue_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
+ if (!schedstat_enabled())
+ return;
+
/*
* Are we enqueueing a waiting task? (for current tasks
* a dequeue/enqueue event is a NOP)
*/
if (se != cfs_rq->curr)
- update_stats_wait_start(cfs_rq, se);
-}
+ update_stats_wait_start_fair(cfs_rq, se);
-static void
-update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
- schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
- rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
- schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
- schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
- rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
-#ifdef CONFIG_SCHEDSTATS
- if (entity_is_task(se)) {
- trace_sched_stat_wait(task_of(se),
- rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
- }
-#endif
- schedstat_set(se->statistics.wait_start, 0);
+ if (flags & ENQUEUE_WAKEUP)
+ update_stats_enqueue_sleeper_fair(cfs_rq, se);
}
static inline void
-update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+update_stats_dequeue_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
+
+ if (!schedstat_enabled())
+ return;
+
/*
* Mark the end of the wait period if dequeueing a
* waiting task:
*/
if (se != cfs_rq->curr)
- update_stats_wait_end(cfs_rq, se);
+ update_stats_wait_end_fair(cfs_rq, se);
+
+ if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
+ struct task_struct *tsk = task_of(se);
+ unsigned int state;
+
+ /* XXX racy against TTWU */
+ state = READ_ONCE(tsk->__state);
+ if (state & TASK_INTERRUPTIBLE)
+ __schedstat_set(tsk->stats.sleep_start,
+ rq_clock(rq_of(cfs_rq)));
+ if (state & TASK_UNINTERRUPTIBLE)
+ __schedstat_set(tsk->stats.block_start,
+ rq_clock(rq_of(cfs_rq)));
+ }
}
/*
@@ -795,6 +1368,50 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
* Scheduling class queueing methods:
*/
+static inline bool is_core_idle(int cpu)
+{
+#ifdef CONFIG_SCHED_SMT
+ int sibling;
+
+ for_each_cpu(sibling, cpu_smt_mask(cpu)) {
+ if (cpu == sibling)
+ continue;
+
+ if (!idle_cpu(sibling))
+ return false;
+ }
+#endif
+
+ return true;
+}
+
+#ifdef CONFIG_NUMA
+#define NUMA_IMBALANCE_MIN 2
+
+static inline long
+adjust_numa_imbalance(int imbalance, int dst_running, int imb_numa_nr)
+{
+ /*
+ * Allow a NUMA imbalance if busy CPUs is less than the maximum
+ * threshold. Above this threshold, individual tasks may be contending
+ * for both memory bandwidth and any shared HT resources. This is an
+ * approximation as the number of running tasks may not be related to
+ * the number of busy CPUs due to sched_setaffinity.
+ */
+ if (dst_running > imb_numa_nr)
+ return imbalance;
+
+ /*
+ * Allow a small imbalance based on a simple pair of communicating
+ * tasks that remain local when the destination is lightly loaded.
+ */
+ if (imbalance <= NUMA_IMBALANCE_MIN)
+ return 0;
+
+ return imbalance;
+}
+#endif /* CONFIG_NUMA */
+
#ifdef CONFIG_NUMA_BALANCING
/*
* Approximate time to scan a full NUMA task in ms. The task scan period is
@@ -810,6 +1427,48 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
unsigned int sysctl_numa_balancing_scan_delay = 1000;
+/* The page with hint page fault latency < threshold in ms is considered hot */
+unsigned int sysctl_numa_balancing_hot_threshold = MSEC_PER_SEC;
+
+struct numa_group {
+ refcount_t refcount;
+
+ spinlock_t lock; /* nr_tasks, tasks */
+ int nr_tasks;
+ pid_t gid;
+ int active_nodes;
+
+ struct rcu_head rcu;
+ unsigned long total_faults;
+ unsigned long max_faults_cpu;
+ /*
+ * faults[] array is split into two regions: faults_mem and faults_cpu.
+ *
+ * Faults_cpu is used to decide whether memory should move
+ * towards the CPU. As a consequence, these stats are weighted
+ * more by CPU use than by memory faults.
+ */
+ unsigned long faults[];
+};
+
+/*
+ * For functions that can be called in multiple contexts that permit reading
+ * ->numa_group (see struct task_struct for locking rules).
+ */
+static struct numa_group *deref_task_numa_group(struct task_struct *p)
+{
+ return rcu_dereference_check(p->numa_group, p == current ||
+ (lockdep_is_held(__rq_lockp(task_rq(p))) && !READ_ONCE(p->on_cpu)));
+}
+
+static struct numa_group *deref_curr_numa_group(struct task_struct *p)
+{
+ return rcu_dereference_protected(p->numa_group, p == current);
+}
+
+static inline unsigned long group_faults_priv(struct numa_group *ng);
+static inline unsigned long group_faults_shared(struct numa_group *ng);
+
static unsigned int task_nr_scan_windows(struct task_struct *p)
{
unsigned long rss = 0;
@@ -820,7 +1479,7 @@ static unsigned int task_nr_scan_windows(struct task_struct *p)
* by the PTE scanner and NUMA hinting faults should be trapped based
* on resident pages
*/
- nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
+ nr_scan_pages = MB_TO_PAGES(sysctl_numa_balancing_scan_size);
rss = get_mm_rss(p->mm);
if (!rss)
rss = nr_scan_pages;
@@ -829,12 +1488,12 @@ static unsigned int task_nr_scan_windows(struct task_struct *p)
return rss / nr_scan_pages;
}
-/* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
+/* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
#define MAX_SCAN_WINDOW 2560
static unsigned int task_scan_min(struct task_struct *p)
{
- unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size);
+ unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
unsigned int scan, floor;
unsigned int windows = 1;
@@ -846,47 +1505,66 @@ static unsigned int task_scan_min(struct task_struct *p)
return max_t(unsigned int, floor, scan);
}
+static unsigned int task_scan_start(struct task_struct *p)
+{
+ unsigned long smin = task_scan_min(p);
+ unsigned long period = smin;
+ struct numa_group *ng;
+
+ /* Scale the maximum scan period with the amount of shared memory. */
+ rcu_read_lock();
+ ng = rcu_dereference(p->numa_group);
+ if (ng) {
+ unsigned long shared = group_faults_shared(ng);
+ unsigned long private = group_faults_priv(ng);
+
+ period *= refcount_read(&ng->refcount);
+ period *= shared + 1;
+ period /= private + shared + 1;
+ }
+ rcu_read_unlock();
+
+ return max(smin, period);
+}
+
static unsigned int task_scan_max(struct task_struct *p)
{
- unsigned int smin = task_scan_min(p);
- unsigned int smax;
+ unsigned long smin = task_scan_min(p);
+ unsigned long smax;
+ struct numa_group *ng;
/* Watch for min being lower than max due to floor calculations */
smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
+
+ /* Scale the maximum scan period with the amount of shared memory. */
+ ng = deref_curr_numa_group(p);
+ if (ng) {
+ unsigned long shared = group_faults_shared(ng);
+ unsigned long private = group_faults_priv(ng);
+ unsigned long period = smax;
+
+ period *= refcount_read(&ng->refcount);
+ period *= shared + 1;
+ period /= private + shared + 1;
+
+ smax = max(smax, period);
+ }
+
return max(smin, smax);
}
static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
{
- rq->nr_numa_running += (p->numa_preferred_nid != -1);
+ rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE);
rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
}
static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
{
- rq->nr_numa_running -= (p->numa_preferred_nid != -1);
+ rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE);
rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
}
-struct numa_group {
- atomic_t refcount;
-
- spinlock_t lock; /* nr_tasks, tasks */
- int nr_tasks;
- pid_t gid;
-
- struct rcu_head rcu;
- nodemask_t active_nodes;
- unsigned long total_faults;
- /*
- * Faults_cpu is used to decide whether memory should move
- * towards the CPU. As a consequence, these stats are weighted
- * more by CPU use than by memory faults.
- */
- unsigned long *faults_cpu;
- unsigned long faults[0];
-};
-
/* Shared or private faults. */
#define NR_NUMA_HINT_FAULT_TYPES 2
@@ -898,11 +1576,20 @@ struct numa_group {
pid_t task_numa_group_id(struct task_struct *p)
{
- return p->numa_group ? p->numa_group->gid : 0;
+ struct numa_group *ng;
+ pid_t gid = 0;
+
+ rcu_read_lock();
+ ng = rcu_dereference(p->numa_group);
+ if (ng)
+ gid = ng->gid;
+ rcu_read_unlock();
+
+ return gid;
}
/*
- * The averaged statistics, shared & private, memory & cpu,
+ * The averaged statistics, shared & private, memory & CPU,
* occupy the first half of the array. The second half of the
* array is for current counters, which are averaged into the
* first set by task_numa_placement.
@@ -923,25 +1610,63 @@ static inline unsigned long task_faults(struct task_struct *p, int nid)
static inline unsigned long group_faults(struct task_struct *p, int nid)
{
- if (!p->numa_group)
+ struct numa_group *ng = deref_task_numa_group(p);
+
+ if (!ng)
return 0;
- return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
- p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];
+ return ng->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
+ ng->faults[task_faults_idx(NUMA_MEM, nid, 1)];
}
static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
{
- return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
- group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
+ return group->faults[task_faults_idx(NUMA_CPU, nid, 0)] +
+ group->faults[task_faults_idx(NUMA_CPU, nid, 1)];
+}
+
+static inline unsigned long group_faults_priv(struct numa_group *ng)
+{
+ unsigned long faults = 0;
+ int node;
+
+ for_each_online_node(node) {
+ faults += ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
+ }
+
+ return faults;
+}
+
+static inline unsigned long group_faults_shared(struct numa_group *ng)
+{
+ unsigned long faults = 0;
+ int node;
+
+ for_each_online_node(node) {
+ faults += ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
+ }
+
+ return faults;
+}
+
+/*
+ * A node triggering more than 1/3 as many NUMA faults as the maximum is
+ * considered part of a numa group's pseudo-interleaving set. Migrations
+ * between these nodes are slowed down, to allow things to settle down.
+ */
+#define ACTIVE_NODE_FRACTION 3
+
+static bool numa_is_active_node(int nid, struct numa_group *ng)
+{
+ return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
}
/* Handle placement on systems where not all nodes are directly connected. */
static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
- int maxdist, bool task)
+ int lim_dist, bool task)
{
unsigned long score = 0;
- int node;
+ int node, max_dist;
/*
* All nodes are directly connected, and the same distance
@@ -950,9 +1675,11 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
if (sched_numa_topology_type == NUMA_DIRECT)
return 0;
+ /* sched_max_numa_distance may be changed in parallel. */
+ max_dist = READ_ONCE(sched_max_numa_distance);
/*
* This code is called for each node, introducing N^2 complexity,
- * which should be ok given the number of nodes rarely exceeds 8.
+ * which should be OK given the number of nodes rarely exceeds 8.
*/
for_each_online_node(node) {
unsigned long faults;
@@ -962,7 +1689,7 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
* The furthest away nodes in the system are not interesting
* for placement; nid was already counted.
*/
- if (dist == sched_max_numa_distance || node == nid)
+ if (dist >= max_dist || node == nid)
continue;
/*
@@ -972,8 +1699,7 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
* "hoplimit", only nodes closer by than "hoplimit" are part
* of each group. Skip other nodes.
*/
- if (sched_numa_topology_type == NUMA_BACKPLANE &&
- dist > maxdist)
+ if (sched_numa_topology_type == NUMA_BACKPLANE && dist >= lim_dist)
continue;
/* Add up the faults from nearby nodes. */
@@ -991,8 +1717,8 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
* This seems to result in good task placement.
*/
if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
- faults *= (sched_max_numa_distance - dist);
- faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
+ faults *= (max_dist - dist);
+ faults /= (max_dist - LOCAL_DISTANCE);
}
score += faults;
@@ -1029,12 +1755,13 @@ static inline unsigned long task_weight(struct task_struct *p, int nid,
static inline unsigned long group_weight(struct task_struct *p, int nid,
int dist)
{
+ struct numa_group *ng = deref_task_numa_group(p);
unsigned long faults, total_faults;
- if (!p->numa_group)
+ if (!ng)
return 0;
- total_faults = p->numa_group->total_faults;
+ total_faults = ng->total_faults;
if (!total_faults)
return 0;
@@ -1045,14 +1772,179 @@ static inline unsigned long group_weight(struct task_struct *p, int nid,
return 1000 * faults / total_faults;
}
-bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
+/*
+ * If memory tiering mode is enabled, cpupid of slow memory page is
+ * used to record scan time instead of CPU and PID. When tiering mode
+ * is disabled at run time, the scan time (in cpupid) will be
+ * interpreted as CPU and PID. So CPU needs to be checked to avoid to
+ * access out of array bound.
+ */
+static inline bool cpupid_valid(int cpupid)
+{
+ return cpupid_to_cpu(cpupid) < nr_cpu_ids;
+}
+
+/*
+ * For memory tiering mode, if there are enough free pages (more than
+ * enough watermark defined here) in fast memory node, to take full
+ * advantage of fast memory capacity, all recently accessed slow
+ * memory pages will be migrated to fast memory node without
+ * considering hot threshold.
+ */
+static bool pgdat_free_space_enough(struct pglist_data *pgdat)
+{
+ int z;
+ unsigned long enough_wmark;
+
+ enough_wmark = max(1UL * 1024 * 1024 * 1024 >> PAGE_SHIFT,
+ pgdat->node_present_pages >> 4);
+ for (z = pgdat->nr_zones - 1; z >= 0; z--) {
+ struct zone *zone = pgdat->node_zones + z;
+
+ if (!populated_zone(zone))
+ continue;
+
+ if (zone_watermark_ok(zone, 0,
+ promo_wmark_pages(zone) + enough_wmark,
+ ZONE_MOVABLE, 0))
+ return true;
+ }
+ return false;
+}
+
+/*
+ * For memory tiering mode, when page tables are scanned, the scan
+ * time will be recorded in struct page in addition to make page
+ * PROT_NONE for slow memory page. So when the page is accessed, in
+ * hint page fault handler, the hint page fault latency is calculated
+ * via,
+ *
+ * hint page fault latency = hint page fault time - scan time
+ *
+ * The smaller the hint page fault latency, the higher the possibility
+ * for the page to be hot.
+ */
+static int numa_hint_fault_latency(struct folio *folio)
+{
+ int last_time, time;
+
+ time = jiffies_to_msecs(jiffies);
+ last_time = folio_xchg_access_time(folio, time);
+
+ return (time - last_time) & PAGE_ACCESS_TIME_MASK;
+}
+
+/*
+ * For memory tiering mode, too high promotion/demotion throughput may
+ * hurt application latency. So we provide a mechanism to rate limit
+ * the number of pages that are tried to be promoted.
+ */
+static bool numa_promotion_rate_limit(struct pglist_data *pgdat,
+ unsigned long rate_limit, int nr)
+{
+ unsigned long nr_cand;
+ unsigned int now, start;
+
+ now = jiffies_to_msecs(jiffies);
+ mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE, nr);
+ nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
+ start = pgdat->nbp_rl_start;
+ if (now - start > MSEC_PER_SEC &&
+ cmpxchg(&pgdat->nbp_rl_start, start, now) == start)
+ pgdat->nbp_rl_nr_cand = nr_cand;
+ if (nr_cand - pgdat->nbp_rl_nr_cand >= rate_limit)
+ return true;
+ return false;
+}
+
+#define NUMA_MIGRATION_ADJUST_STEPS 16
+
+static void numa_promotion_adjust_threshold(struct pglist_data *pgdat,
+ unsigned long rate_limit,
+ unsigned int ref_th)
+{
+ unsigned int now, start, th_period, unit_th, th;
+ unsigned long nr_cand, ref_cand, diff_cand;
+
+ now = jiffies_to_msecs(jiffies);
+ th_period = sysctl_numa_balancing_scan_period_max;
+ start = pgdat->nbp_th_start;
+ if (now - start > th_period &&
+ cmpxchg(&pgdat->nbp_th_start, start, now) == start) {
+ ref_cand = rate_limit *
+ sysctl_numa_balancing_scan_period_max / MSEC_PER_SEC;
+ nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
+ diff_cand = nr_cand - pgdat->nbp_th_nr_cand;
+ unit_th = ref_th * 2 / NUMA_MIGRATION_ADJUST_STEPS;
+ th = pgdat->nbp_threshold ? : ref_th;
+ if (diff_cand > ref_cand * 11 / 10)
+ th = max(th - unit_th, unit_th);
+ else if (diff_cand < ref_cand * 9 / 10)
+ th = min(th + unit_th, ref_th * 2);
+ pgdat->nbp_th_nr_cand = nr_cand;
+ pgdat->nbp_threshold = th;
+ }
+}
+
+bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio,
int src_nid, int dst_cpu)
{
- struct numa_group *ng = p->numa_group;
+ struct numa_group *ng = deref_curr_numa_group(p);
int dst_nid = cpu_to_node(dst_cpu);
int last_cpupid, this_cpupid;
+ /*
+ * Cannot migrate to memoryless nodes.
+ */
+ if (!node_state(dst_nid, N_MEMORY))
+ return false;
+
+ /*
+ * The pages in slow memory node should be migrated according
+ * to hot/cold instead of private/shared.
+ */
+ if (folio_use_access_time(folio)) {
+ struct pglist_data *pgdat;
+ unsigned long rate_limit;
+ unsigned int latency, th, def_th;
+ long nr = folio_nr_pages(folio);
+
+ pgdat = NODE_DATA(dst_nid);
+ if (pgdat_free_space_enough(pgdat)) {
+ /* workload changed, reset hot threshold */
+ pgdat->nbp_threshold = 0;
+ mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE_NRL, nr);
+ return true;
+ }
+
+ def_th = sysctl_numa_balancing_hot_threshold;
+ rate_limit = MB_TO_PAGES(sysctl_numa_balancing_promote_rate_limit);
+ numa_promotion_adjust_threshold(pgdat, rate_limit, def_th);
+
+ th = pgdat->nbp_threshold ? : def_th;
+ latency = numa_hint_fault_latency(folio);
+ if (latency >= th)
+ return false;
+
+ return !numa_promotion_rate_limit(pgdat, rate_limit, nr);
+ }
+
this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
+ last_cpupid = folio_xchg_last_cpupid(folio, this_cpupid);
+
+ if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
+ !node_is_toptier(src_nid) && !cpupid_valid(last_cpupid))
+ return false;
+
+ /*
+ * Allow first faults or private faults to migrate immediately early in
+ * the lifetime of a task. The magic number 4 is based on waiting for
+ * two full passes of the "multi-stage node selection" test that is
+ * executed below.
+ */
+ if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) &&
+ (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))
+ return true;
/*
* Multi-stage node selection is used in conjunction with a periodic
@@ -1071,7 +1963,6 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
* This quadric squishes small probabilities, making it less likely we
* act on an unlikely task<->page relation.
*/
- last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
if (!cpupid_pid_unset(last_cpupid) &&
cpupid_to_nid(last_cpupid) != dst_nid)
return false;
@@ -1085,106 +1976,201 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
return true;
/*
- * Do not migrate if the destination is not a node that
- * is actively used by this numa group.
- */
- if (!node_isset(dst_nid, ng->active_nodes))
- return false;
-
- /*
- * Source is a node that is not actively used by this
- * numa group, while the destination is. Migrate.
+ * Destination node is much more heavily used than the source
+ * node? Allow migration.
*/
- if (!node_isset(src_nid, ng->active_nodes))
+ if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
+ ACTIVE_NODE_FRACTION)
return true;
/*
- * Both source and destination are nodes in active
- * use by this numa group. Maximize memory bandwidth
- * by migrating from more heavily used groups, to less
- * heavily used ones, spreading the load around.
- * Use a 1/4 hysteresis to avoid spurious page movement.
+ * Distribute memory according to CPU & memory use on each node,
+ * with 3/4 hysteresis to avoid unnecessary memory migrations:
+ *
+ * faults_cpu(dst) 3 faults_cpu(src)
+ * --------------- * - > ---------------
+ * faults_mem(dst) 4 faults_mem(src)
*/
- return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);
+ return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
+ group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
}
-static unsigned long weighted_cpuload(const int cpu);
-static unsigned long source_load(int cpu, int type);
-static unsigned long target_load(int cpu, int type);
-static unsigned long capacity_of(int cpu);
-static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
+/*
+ * 'numa_type' describes the node at the moment of load balancing.
+ */
+enum numa_type {
+ /* The node has spare capacity that can be used to run more tasks. */
+ node_has_spare = 0,
+ /*
+ * The node is fully used and the tasks don't compete for more CPU
+ * cycles. Nevertheless, some tasks might wait before running.
+ */
+ node_fully_busy,
+ /*
+ * The node is overloaded and can't provide expected CPU cycles to all
+ * tasks.
+ */
+ node_overloaded
+};
/* Cached statistics for all CPUs within a node */
struct numa_stats {
- unsigned long nr_running;
unsigned long load;
-
+ unsigned long runnable;
+ unsigned long util;
/* Total compute capacity of CPUs on a node */
unsigned long compute_capacity;
+ unsigned int nr_running;
+ unsigned int weight;
+ enum numa_type node_type;
+ int idle_cpu;
+};
+
+struct task_numa_env {
+ struct task_struct *p;
+
+ int src_cpu, src_nid;
+ int dst_cpu, dst_nid;
+ int imb_numa_nr;
+
+ struct numa_stats src_stats, dst_stats;
- /* Approximate capacity in terms of runnable tasks on a node */
- unsigned long task_capacity;
- int has_free_capacity;
+ int imbalance_pct;
+ int dist;
+
+ struct task_struct *best_task;
+ long best_imp;
+ int best_cpu;
};
+static unsigned long cpu_load(struct rq *rq);
+static unsigned long cpu_runnable(struct rq *rq);
+
+static inline enum
+numa_type numa_classify(unsigned int imbalance_pct,
+ struct numa_stats *ns)
+{
+ if ((ns->nr_running > ns->weight) &&
+ (((ns->compute_capacity * 100) < (ns->util * imbalance_pct)) ||
+ ((ns->compute_capacity * imbalance_pct) < (ns->runnable * 100))))
+ return node_overloaded;
+
+ if ((ns->nr_running < ns->weight) ||
+ (((ns->compute_capacity * 100) > (ns->util * imbalance_pct)) &&
+ ((ns->compute_capacity * imbalance_pct) > (ns->runnable * 100))))
+ return node_has_spare;
+
+ return node_fully_busy;
+}
+
+#ifdef CONFIG_SCHED_SMT
+/* Forward declarations of select_idle_sibling helpers */
+static inline bool test_idle_cores(int cpu);
+static inline int numa_idle_core(int idle_core, int cpu)
+{
+ if (!static_branch_likely(&sched_smt_present) ||
+ idle_core >= 0 || !test_idle_cores(cpu))
+ return idle_core;
+
+ /*
+ * Prefer cores instead of packing HT siblings
+ * and triggering future load balancing.
+ */
+ if (is_core_idle(cpu))
+ idle_core = cpu;
+
+ return idle_core;
+}
+#else /* !CONFIG_SCHED_SMT: */
+static inline int numa_idle_core(int idle_core, int cpu)
+{
+ return idle_core;
+}
+#endif /* !CONFIG_SCHED_SMT */
+
/*
- * XXX borrowed from update_sg_lb_stats
+ * Gather all necessary information to make NUMA balancing placement
+ * decisions that are compatible with standard load balancer. This
+ * borrows code and logic from update_sg_lb_stats but sharing a
+ * common implementation is impractical.
*/
-static void update_numa_stats(struct numa_stats *ns, int nid)
+static void update_numa_stats(struct task_numa_env *env,
+ struct numa_stats *ns, int nid,
+ bool find_idle)
{
- int smt, cpu, cpus = 0;
- unsigned long capacity;
+ int cpu, idle_core = -1;
memset(ns, 0, sizeof(*ns));
+ ns->idle_cpu = -1;
+
+ rcu_read_lock();
for_each_cpu(cpu, cpumask_of_node(nid)) {
struct rq *rq = cpu_rq(cpu);
- ns->nr_running += rq->nr_running;
- ns->load += weighted_cpuload(cpu);
+ ns->load += cpu_load(rq);
+ ns->runnable += cpu_runnable(rq);
+ ns->util += cpu_util_cfs(cpu);
+ ns->nr_running += rq->cfs.h_nr_runnable;
ns->compute_capacity += capacity_of(cpu);
- cpus++;
+ if (find_idle && idle_core < 0 && !rq->nr_running && idle_cpu(cpu)) {
+ if (READ_ONCE(rq->numa_migrate_on) ||
+ !cpumask_test_cpu(cpu, env->p->cpus_ptr))
+ continue;
+
+ if (ns->idle_cpu == -1)
+ ns->idle_cpu = cpu;
+
+ idle_core = numa_idle_core(idle_core, cpu);
+ }
}
+ rcu_read_unlock();
- /*
- * If we raced with hotplug and there are no CPUs left in our mask
- * the @ns structure is NULL'ed and task_numa_compare() will
- * not find this node attractive.
- *
- * We'll either bail at !has_free_capacity, or we'll detect a huge
- * imbalance and bail there.
- */
- if (!cpus)
- return;
+ ns->weight = cpumask_weight(cpumask_of_node(nid));
- /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */
- smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
- capacity = cpus / smt; /* cores */
+ ns->node_type = numa_classify(env->imbalance_pct, ns);
- ns->task_capacity = min_t(unsigned, capacity,
- DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
- ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
+ if (idle_core >= 0)
+ ns->idle_cpu = idle_core;
}
-struct task_numa_env {
- struct task_struct *p;
+static void task_numa_assign(struct task_numa_env *env,
+ struct task_struct *p, long imp)
+{
+ struct rq *rq = cpu_rq(env->dst_cpu);
- int src_cpu, src_nid;
- int dst_cpu, dst_nid;
+ /* Check if run-queue part of active NUMA balance. */
+ if (env->best_cpu != env->dst_cpu && xchg(&rq->numa_migrate_on, 1)) {
+ int cpu;
+ int start = env->dst_cpu;
- struct numa_stats src_stats, dst_stats;
+ /* Find alternative idle CPU. */
+ for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start + 1) {
+ if (cpu == env->best_cpu || !idle_cpu(cpu) ||
+ !cpumask_test_cpu(cpu, env->p->cpus_ptr)) {
+ continue;
+ }
- int imbalance_pct;
- int dist;
+ env->dst_cpu = cpu;
+ rq = cpu_rq(env->dst_cpu);
+ if (!xchg(&rq->numa_migrate_on, 1))
+ goto assign;
+ }
- struct task_struct *best_task;
- long best_imp;
- int best_cpu;
-};
+ /* Failed to find an alternative idle CPU */
+ return;
+ }
+
+assign:
+ /*
+ * Clear previous best_cpu/rq numa-migrate flag, since task now
+ * found a better CPU to move/swap.
+ */
+ if (env->best_cpu != -1 && env->best_cpu != env->dst_cpu) {
+ rq = cpu_rq(env->best_cpu);
+ WRITE_ONCE(rq->numa_migrate_on, 0);
+ }
-static void task_numa_assign(struct task_numa_env *env,
- struct task_struct *p, long imp)
-{
if (env->best_task)
put_task_struct(env->best_task);
if (p)
@@ -1198,11 +2184,9 @@ static void task_numa_assign(struct task_numa_env *env,
static bool load_too_imbalanced(long src_load, long dst_load,
struct task_numa_env *env)
{
+ long imb, old_imb;
+ long orig_src_load, orig_dst_load;
long src_capacity, dst_capacity;
- long orig_src_load;
- long load_a, load_b;
- long moved_load;
- long imb;
/*
* The load is corrected for the CPU capacity available on each node.
@@ -1214,198 +2198,274 @@ static bool load_too_imbalanced(long src_load, long dst_load,
src_capacity = env->src_stats.compute_capacity;
dst_capacity = env->dst_stats.compute_capacity;
- /* We care about the slope of the imbalance, not the direction. */
- load_a = dst_load;
- load_b = src_load;
- if (load_a < load_b)
- swap(load_a, load_b);
+ imb = abs(dst_load * src_capacity - src_load * dst_capacity);
- /* Is the difference below the threshold? */
- imb = load_a * src_capacity * 100 -
- load_b * dst_capacity * env->imbalance_pct;
- if (imb <= 0)
- return false;
-
- /*
- * The imbalance is above the allowed threshold.
- * Allow a move that brings us closer to a balanced situation,
- * without moving things past the point of balance.
- */
orig_src_load = env->src_stats.load;
+ orig_dst_load = env->dst_stats.load;
- /*
- * In a task swap, there will be one load moving from src to dst,
- * and another moving back. This is the net sum of both moves.
- * A simple task move will always have a positive value.
- * Allow the move if it brings the system closer to a balanced
- * situation, without crossing over the balance point.
- */
- moved_load = orig_src_load - src_load;
+ old_imb = abs(orig_dst_load * src_capacity - orig_src_load * dst_capacity);
- if (moved_load > 0)
- /* Moving src -> dst. Did we overshoot balance? */
- return src_load * dst_capacity < dst_load * src_capacity;
- else
- /* Moving dst -> src. Did we overshoot balance? */
- return dst_load * src_capacity < src_load * dst_capacity;
+ /* Would this change make things worse? */
+ return (imb > old_imb);
}
/*
+ * Maximum NUMA importance can be 1998 (2*999);
+ * SMALLIMP @ 30 would be close to 1998/64.
+ * Used to deter task migration.
+ */
+#define SMALLIMP 30
+
+/*
* This checks if the overall compute and NUMA accesses of the system would
* be improved if the source tasks was migrated to the target dst_cpu taking
* into account that it might be best if task running on the dst_cpu should
* be exchanged with the source task
*/
-static void task_numa_compare(struct task_numa_env *env,
- long taskimp, long groupimp)
+static bool task_numa_compare(struct task_numa_env *env,
+ long taskimp, long groupimp, bool maymove)
{
- struct rq *src_rq = cpu_rq(env->src_cpu);
+ struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p);
struct rq *dst_rq = cpu_rq(env->dst_cpu);
+ long imp = p_ng ? groupimp : taskimp;
struct task_struct *cur;
long src_load, dst_load;
- long load;
- long imp = env->p->numa_group ? groupimp : taskimp;
- long moveimp = imp;
int dist = env->dist;
+ long moveimp = imp;
+ long load;
+ bool stopsearch = false;
- rcu_read_lock();
+ if (READ_ONCE(dst_rq->numa_migrate_on))
+ return false;
- raw_spin_lock_irq(&dst_rq->lock);
- cur = dst_rq->curr;
- /*
- * No need to move the exiting task, and this ensures that ->curr
- * wasn't reaped and thus get_task_struct() in task_numa_assign()
- * is safe under RCU read lock.
- * Note that rcu_read_lock() itself can't protect from the final
- * put_task_struct() after the last schedule().
- */
- if ((cur->flags & PF_EXITING) || is_idle_task(cur))
+ rcu_read_lock();
+ cur = rcu_dereference(dst_rq->curr);
+ if (cur && ((cur->flags & (PF_EXITING | PF_KTHREAD)) ||
+ !cur->mm))
cur = NULL;
- raw_spin_unlock_irq(&dst_rq->lock);
/*
* Because we have preemption enabled we can get migrated around and
* end try selecting ourselves (current == env->p) as a swap candidate.
*/
- if (cur == env->p)
+ if (cur == env->p) {
+ stopsearch = true;
+ goto unlock;
+ }
+
+ if (!cur) {
+ if (maymove && moveimp >= env->best_imp)
+ goto assign;
+ else
+ goto unlock;
+ }
+
+ /* Skip this swap candidate if cannot move to the source cpu. */
+ if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
+ goto unlock;
+
+ /*
+ * Skip this swap candidate if it is not moving to its preferred
+ * node and the best task is.
+ */
+ if (env->best_task &&
+ env->best_task->numa_preferred_nid == env->src_nid &&
+ cur->numa_preferred_nid != env->src_nid) {
goto unlock;
+ }
/*
* "imp" is the fault differential for the source task between the
* source and destination node. Calculate the total differential for
* the source task and potential destination task. The more negative
- * the value is, the more rmeote accesses that would be expected to
+ * the value is, the more remote accesses that would be expected to
* be incurred if the tasks were swapped.
+ *
+ * If dst and source tasks are in the same NUMA group, or not
+ * in any group then look only at task weights.
*/
- if (cur) {
- /* Skip this swap candidate if cannot move to the source cpu */
- if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
+ cur_ng = rcu_dereference(cur->numa_group);
+ if (cur_ng == p_ng) {
+ /*
+ * Do not swap within a group or between tasks that have
+ * no group if there is spare capacity. Swapping does
+ * not address the load imbalance and helps one task at
+ * the cost of punishing another.
+ */
+ if (env->dst_stats.node_type == node_has_spare)
goto unlock;
+ imp = taskimp + task_weight(cur, env->src_nid, dist) -
+ task_weight(cur, env->dst_nid, dist);
/*
- * If dst and source tasks are in the same NUMA group, or not
- * in any group then look only at task weights.
+ * Add some hysteresis to prevent swapping the
+ * tasks within a group over tiny differences.
*/
- if (cur->numa_group == env->p->numa_group) {
- imp = taskimp + task_weight(cur, env->src_nid, dist) -
- task_weight(cur, env->dst_nid, dist);
- /*
- * Add some hysteresis to prevent swapping the
- * tasks within a group over tiny differences.
- */
- if (cur->numa_group)
- imp -= imp/16;
- } else {
- /*
- * Compare the group weights. If a task is all by
- * itself (not part of a group), use the task weight
- * instead.
- */
- if (cur->numa_group)
- imp += group_weight(cur, env->src_nid, dist) -
- group_weight(cur, env->dst_nid, dist);
- else
- imp += task_weight(cur, env->src_nid, dist) -
- task_weight(cur, env->dst_nid, dist);
- }
+ if (cur_ng)
+ imp -= imp / 16;
+ } else {
+ /*
+ * Compare the group weights. If a task is all by itself
+ * (not part of a group), use the task weight instead.
+ */
+ if (cur_ng && p_ng)
+ imp += group_weight(cur, env->src_nid, dist) -
+ group_weight(cur, env->dst_nid, dist);
+ else
+ imp += task_weight(cur, env->src_nid, dist) -
+ task_weight(cur, env->dst_nid, dist);
}
- if (imp <= env->best_imp && moveimp <= env->best_imp)
- goto unlock;
+ /* Discourage picking a task already on its preferred node */
+ if (cur->numa_preferred_nid == env->dst_nid)
+ imp -= imp / 16;
- if (!cur) {
- /* Is there capacity at our destination? */
- if (env->src_stats.nr_running <= env->src_stats.task_capacity &&
- !env->dst_stats.has_free_capacity)
- goto unlock;
+ /*
+ * Encourage picking a task that moves to its preferred node.
+ * This potentially makes imp larger than it's maximum of
+ * 1998 (see SMALLIMP and task_weight for why) but in this
+ * case, it does not matter.
+ */
+ if (cur->numa_preferred_nid == env->src_nid)
+ imp += imp / 8;
- goto balance;
+ if (maymove && moveimp > imp && moveimp > env->best_imp) {
+ imp = moveimp;
+ cur = NULL;
+ goto assign;
}
- /* Balance doesn't matter much if we're running a task per cpu */
- if (imp > env->best_imp && src_rq->nr_running == 1 &&
- dst_rq->nr_running == 1)
+ /*
+ * Prefer swapping with a task moving to its preferred node over a
+ * task that is not.
+ */
+ if (env->best_task && cur->numa_preferred_nid == env->src_nid &&
+ env->best_task->numa_preferred_nid != env->src_nid) {
goto assign;
+ }
+
+ /*
+ * If the NUMA importance is less than SMALLIMP,
+ * task migration might only result in ping pong
+ * of tasks and also hurt performance due to cache
+ * misses.
+ */
+ if (imp < SMALLIMP || imp <= env->best_imp + SMALLIMP / 2)
+ goto unlock;
/*
* In the overloaded case, try and keep the load balanced.
*/
-balance:
- load = task_h_load(env->p);
+ load = task_h_load(env->p) - task_h_load(cur);
+ if (!load)
+ goto assign;
+
dst_load = env->dst_stats.load + load;
src_load = env->src_stats.load - load;
- if (moveimp > imp && moveimp > env->best_imp) {
+ if (load_too_imbalanced(src_load, dst_load, env))
+ goto unlock;
+
+assign:
+ /* Evaluate an idle CPU for a task numa move. */
+ if (!cur) {
+ int cpu = env->dst_stats.idle_cpu;
+
+ /* Nothing cached so current CPU went idle since the search. */
+ if (cpu < 0)
+ cpu = env->dst_cpu;
+
/*
- * If the improvement from just moving env->p direction is
- * better than swapping tasks around, check if a move is
- * possible. Store a slightly smaller score than moveimp,
- * so an actually idle CPU will win.
+ * If the CPU is no longer truly idle and the previous best CPU
+ * is, keep using it.
*/
- if (!load_too_imbalanced(src_load, dst_load, env)) {
- imp = moveimp - 1;
- cur = NULL;
- goto assign;
+ if (!idle_cpu(cpu) && env->best_cpu >= 0 &&
+ idle_cpu(env->best_cpu)) {
+ cpu = env->best_cpu;
}
- }
-
- if (imp <= env->best_imp)
- goto unlock;
- if (cur) {
- load = task_h_load(cur);
- dst_load -= load;
- src_load += load;
+ env->dst_cpu = cpu;
}
- if (load_too_imbalanced(src_load, dst_load, env))
- goto unlock;
+ task_numa_assign(env, cur, imp);
/*
- * One idle CPU per node is evaluated for a task numa move.
- * Call select_idle_sibling to maybe find a better one.
+ * If a move to idle is allowed because there is capacity or load
+ * balance improves then stop the search. While a better swap
+ * candidate may exist, a search is not free.
*/
- if (!cur)
- env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
+ if (maymove && !cur && env->best_cpu >= 0 && idle_cpu(env->best_cpu))
+ stopsearch = true;
-assign:
- task_numa_assign(env, cur, imp);
+ /*
+ * If a swap candidate must be identified and the current best task
+ * moves its preferred node then stop the search.
+ */
+ if (!maymove && env->best_task &&
+ env->best_task->numa_preferred_nid == env->src_nid) {
+ stopsearch = true;
+ }
unlock:
rcu_read_unlock();
+
+ return stopsearch;
}
static void task_numa_find_cpu(struct task_numa_env *env,
long taskimp, long groupimp)
{
+ bool maymove = false;
int cpu;
+ /*
+ * If dst node has spare capacity, then check if there is an
+ * imbalance that would be overruled by the load balancer.
+ */
+ if (env->dst_stats.node_type == node_has_spare) {
+ unsigned int imbalance;
+ int src_running, dst_running;
+
+ /*
+ * Would movement cause an imbalance? Note that if src has
+ * more running tasks that the imbalance is ignored as the
+ * move improves the imbalance from the perspective of the
+ * CPU load balancer.
+ * */
+ src_running = env->src_stats.nr_running - 1;
+ dst_running = env->dst_stats.nr_running + 1;
+ imbalance = max(0, dst_running - src_running);
+ imbalance = adjust_numa_imbalance(imbalance, dst_running,
+ env->imb_numa_nr);
+
+ /* Use idle CPU if there is no imbalance */
+ if (!imbalance) {
+ maymove = true;
+ if (env->dst_stats.idle_cpu >= 0) {
+ env->dst_cpu = env->dst_stats.idle_cpu;
+ task_numa_assign(env, NULL, 0);
+ return;
+ }
+ }
+ } else {
+ long src_load, dst_load, load;
+ /*
+ * If the improvement from just moving env->p direction is better
+ * than swapping tasks around, check if a move is possible.
+ */
+ load = task_h_load(env->p);
+ dst_load = env->dst_stats.load + load;
+ src_load = env->src_stats.load - load;
+ maymove = !load_too_imbalanced(src_load, dst_load, env);
+ }
+
for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
/* Skip this CPU if the source task cannot migrate */
- if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))
+ if (!cpumask_test_cpu(cpu, env->p->cpus_ptr))
continue;
env->dst_cpu = cpu;
- task_numa_compare(env, taskimp, groupimp);
+ if (task_numa_compare(env, taskimp, groupimp, maymove))
+ break;
}
}
@@ -1421,12 +2481,14 @@ static int task_numa_migrate(struct task_struct *p)
.best_task = NULL,
.best_imp = 0,
- .best_cpu = -1
+ .best_cpu = -1,
};
- struct sched_domain *sd;
unsigned long taskweight, groupweight;
- int nid, ret, dist;
+ struct sched_domain *sd;
long taskimp, groupimp;
+ struct numa_group *ng;
+ struct rq *best_rq;
+ int nid, ret, dist;
/*
* Pick the lowest SD_NUMA domain, as that would have the smallest
@@ -1438,8 +2500,10 @@ static int task_numa_migrate(struct task_struct *p)
*/
rcu_read_lock();
sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
- if (sd)
+ if (sd) {
env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
+ env.imb_numa_nr = sd->imb_numa_nr;
+ }
rcu_read_unlock();
/*
@@ -1449,7 +2513,7 @@ static int task_numa_migrate(struct task_struct *p)
* elsewhere, so there is no point in (re)trying.
*/
if (unlikely(!sd)) {
- p->numa_preferred_nid = task_node(p);
+ sched_setnuma(p, task_node(p));
return -EINVAL;
}
@@ -1457,10 +2521,10 @@ static int task_numa_migrate(struct task_struct *p)
dist = env.dist = node_distance(env.src_nid, env.dst_nid);
taskweight = task_weight(p, env.src_nid, dist);
groupweight = group_weight(p, env.src_nid, dist);
- update_numa_stats(&env.src_stats, env.src_nid);
+ update_numa_stats(&env, &env.src_stats, env.src_nid, false);
taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
- update_numa_stats(&env.dst_stats, env.dst_nid);
+ update_numa_stats(&env, &env.dst_stats, env.dst_nid, true);
/* Try to find a spot on the preferred nid. */
task_numa_find_cpu(&env, taskimp, groupimp);
@@ -1472,9 +2536,9 @@ static int task_numa_migrate(struct task_struct *p)
* multiple NUMA nodes; in order to better consolidate the group,
* we need to check other locations.
*/
- if (env.best_cpu == -1 || (p->numa_group &&
- nodes_weight(p->numa_group->active_nodes) > 1)) {
- for_each_online_node(nid) {
+ ng = deref_curr_numa_group(p);
+ if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) {
+ for_each_node_state(nid, N_CPU) {
if (nid == env.src_nid || nid == p->numa_preferred_nid)
continue;
@@ -1493,7 +2557,7 @@ static int task_numa_migrate(struct task_struct *p)
env.dist = dist;
env.dst_nid = nid;
- update_numa_stats(&env.dst_stats, env.dst_nid);
+ update_numa_stats(&env, &env.dst_stats, env.dst_nid, true);
task_numa_find_cpu(&env, taskimp, groupimp);
}
}
@@ -1506,36 +2570,36 @@ static int task_numa_migrate(struct task_struct *p)
* A task that migrated to a second choice node will be better off
* trying for a better one later. Do not set the preferred node here.
*/
- if (p->numa_group) {
+ if (ng) {
if (env.best_cpu == -1)
nid = env.src_nid;
else
- nid = env.dst_nid;
+ nid = cpu_to_node(env.best_cpu);
- if (node_isset(nid, p->numa_group->active_nodes))
- sched_setnuma(p, env.dst_nid);
+ if (nid != p->numa_preferred_nid)
+ sched_setnuma(p, nid);
}
/* No better CPU than the current one was found. */
- if (env.best_cpu == -1)
+ if (env.best_cpu == -1) {
+ trace_sched_stick_numa(p, env.src_cpu, NULL, -1);
return -EAGAIN;
+ }
- /*
- * Reset the scan period if the task is being rescheduled on an
- * alternative node to recheck if the tasks is now properly placed.
- */
- p->numa_scan_period = task_scan_min(p);
-
+ best_rq = cpu_rq(env.best_cpu);
if (env.best_task == NULL) {
ret = migrate_task_to(p, env.best_cpu);
+ WRITE_ONCE(best_rq->numa_migrate_on, 0);
if (ret != 0)
- trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
+ trace_sched_stick_numa(p, env.src_cpu, NULL, env.best_cpu);
return ret;
}
- ret = migrate_swap(p, env.best_task);
+ ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu);
+ WRITE_ONCE(best_rq->numa_migrate_on, 0);
+
if (ret != 0)
- trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
+ trace_sched_stick_numa(p, env.src_cpu, env.best_task, env.best_cpu);
put_task_struct(env.best_task);
return ret;
}
@@ -1546,7 +2610,7 @@ static void numa_migrate_preferred(struct task_struct *p)
unsigned long interval = HZ;
/* This task has no NUMA fault statistics yet */
- if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
+ if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults))
return;
/* Periodically retry migrating the task to the preferred node */
@@ -1562,35 +2626,30 @@ static void numa_migrate_preferred(struct task_struct *p)
}
/*
- * Find the nodes on which the workload is actively running. We do this by
+ * Find out how many nodes the workload is actively running on. Do this by
* tracking the nodes from which NUMA hinting faults are triggered. This can
* be different from the set of nodes where the workload's memory is currently
* located.
- *
- * The bitmask is used to make smarter decisions on when to do NUMA page
- * migrations, To prevent flip-flopping, and excessive page migrations, nodes
- * are added when they cause over 6/16 of the maximum number of faults, but
- * only removed when they drop below 3/16.
*/
-static void update_numa_active_node_mask(struct numa_group *numa_group)
+static void numa_group_count_active_nodes(struct numa_group *numa_group)
{
unsigned long faults, max_faults = 0;
- int nid;
+ int nid, active_nodes = 0;
- for_each_online_node(nid) {
+ for_each_node_state(nid, N_CPU) {
faults = group_faults_cpu(numa_group, nid);
if (faults > max_faults)
max_faults = faults;
}
- for_each_online_node(nid) {
+ for_each_node_state(nid, N_CPU) {
faults = group_faults_cpu(numa_group, nid);
- if (!node_isset(nid, numa_group->active_nodes)) {
- if (faults > max_faults * 6 / 16)
- node_set(nid, numa_group->active_nodes);
- } else if (faults < max_faults * 3 / 16)
- node_clear(nid, numa_group->active_nodes);
+ if (faults * ACTIVE_NODE_FRACTION > max_faults)
+ active_nodes++;
}
+
+ numa_group->max_faults_cpu = max_faults;
+ numa_group->active_nodes = active_nodes;
}
/*
@@ -1613,7 +2672,7 @@ static void update_task_scan_period(struct task_struct *p,
unsigned long shared, unsigned long private)
{
unsigned int period_slot;
- int ratio;
+ int lr_ratio, ps_ratio;
int diff;
unsigned long remote = p->numa_faults_locality[0];
@@ -1621,7 +2680,7 @@ static void update_task_scan_period(struct task_struct *p,
/*
* If there were no record hinting faults then either the task is
- * completely idle or all activity is areas that are not of interest
+ * completely idle or all activity is in areas that are not of interest
* to automatic numa balancing. Related to that, if there were failed
* migration then it implies we are migrating too quickly or the local
* node is overloaded. In either case, scan slower
@@ -1643,25 +2702,36 @@ static void update_task_scan_period(struct task_struct *p,
* >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
*/
period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
- ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
- if (ratio >= NUMA_PERIOD_THRESHOLD) {
- int slot = ratio - NUMA_PERIOD_THRESHOLD;
+ lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
+ ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared);
+
+ if (ps_ratio >= NUMA_PERIOD_THRESHOLD) {
+ /*
+ * Most memory accesses are local. There is no need to
+ * do fast NUMA scanning, since memory is already local.
+ */
+ int slot = ps_ratio - NUMA_PERIOD_THRESHOLD;
+ if (!slot)
+ slot = 1;
+ diff = slot * period_slot;
+ } else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) {
+ /*
+ * Most memory accesses are shared with other tasks.
+ * There is no point in continuing fast NUMA scanning,
+ * since other tasks may just move the memory elsewhere.
+ */
+ int slot = lr_ratio - NUMA_PERIOD_THRESHOLD;
if (!slot)
slot = 1;
diff = slot * period_slot;
} else {
- diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
-
/*
- * Scale scan rate increases based on sharing. There is an
- * inverse relationship between the degree of sharing and
- * the adjustment made to the scanning period. Broadly
- * speaking the intent is that there is little point
- * scanning faster if shared accesses dominate as it may
- * simply bounce migrations uselessly
+ * Private memory faults exceed (SLOTS-THRESHOLD)/SLOTS,
+ * yet they are not on the local NUMA node. Speed up
+ * NUMA scanning to get the memory moved over.
*/
- ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1));
- diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
+ int ratio = max(lr_ratio, ps_ratio);
+ diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
}
p->numa_scan_period = clamp(p->numa_scan_period + diff,
@@ -1686,9 +2756,13 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
if (p->last_task_numa_placement) {
delta = runtime - p->last_sum_exec_runtime;
*period = now - p->last_task_numa_placement;
+
+ /* Avoid time going backwards, prevent potential divide error: */
+ if (unlikely((s64)*period < 0))
+ *period = 0;
} else {
- delta = p->se.avg.runnable_avg_sum;
- *period = p->se.avg.avg_period;
+ delta = p->se.avg.load_sum;
+ *period = LOAD_AVG_MAX;
}
p->last_sum_exec_runtime = runtime;
@@ -1722,7 +2796,7 @@ static int preferred_group_nid(struct task_struct *p, int nid)
dist = sched_max_numa_distance;
- for_each_online_node(node) {
+ for_each_node_state(node, N_CPU) {
score = group_weight(p, node, dist);
if (score > max_score) {
max_score = score;
@@ -1741,7 +2815,7 @@ static int preferred_group_nid(struct task_struct *p, int nid)
* inside the highest scoring group of nodes. The nodemask tricks
* keep the complexity of the search down.
*/
- nodes = node_online_map;
+ nodes = node_states[N_CPU];
for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
unsigned long max_faults = 0;
nodemask_t max_group = NODE_MASK_NONE;
@@ -1787,14 +2861,20 @@ static int preferred_group_nid(struct task_struct *p, int nid)
static void task_numa_placement(struct task_struct *p)
{
- int seq, nid, max_nid = -1, max_group_nid = -1;
- unsigned long max_faults = 0, max_group_faults = 0;
+ int seq, nid, max_nid = NUMA_NO_NODE;
+ unsigned long max_faults = 0;
unsigned long fault_types[2] = { 0, 0 };
unsigned long total_faults;
u64 runtime, period;
spinlock_t *group_lock = NULL;
+ struct numa_group *ng;
- seq = ACCESS_ONCE(p->mm->numa_scan_seq);
+ /*
+ * The p->mm->numa_scan_seq field gets updated without
+ * exclusive access. Use READ_ONCE() here to ensure
+ * that the field is read in a single access:
+ */
+ seq = READ_ONCE(p->mm->numa_scan_seq);
if (p->numa_scan_seq == seq)
return;
p->numa_scan_seq = seq;
@@ -1805,8 +2885,9 @@ static void task_numa_placement(struct task_struct *p)
runtime = numa_get_avg_runtime(p, &period);
/* If the task is part of a group prevent parallel updates to group stats */
- if (p->numa_group) {
- group_lock = &p->numa_group->lock;
+ ng = deref_curr_numa_group(p);
+ if (ng) {
+ group_lock = &ng->lock;
spin_lock_irq(group_lock);
}
@@ -1847,7 +2928,7 @@ static void task_numa_placement(struct task_struct *p)
p->numa_faults[cpu_idx] += f_diff;
faults += p->numa_faults[mem_idx];
p->total_numa_faults += diff;
- if (p->numa_group) {
+ if (ng) {
/*
* safe because we can only change our own group
*
@@ -1855,50 +2936,50 @@ static void task_numa_placement(struct task_struct *p)
* nid and priv in a specific region because it
* is at the beginning of the numa_faults array.
*/
- p->numa_group->faults[mem_idx] += diff;
- p->numa_group->faults_cpu[mem_idx] += f_diff;
- p->numa_group->total_faults += diff;
- group_faults += p->numa_group->faults[mem_idx];
+ ng->faults[mem_idx] += diff;
+ ng->faults[cpu_idx] += f_diff;
+ ng->total_faults += diff;
+ group_faults += ng->faults[mem_idx];
}
}
- if (faults > max_faults) {
- max_faults = faults;
+ if (!ng) {
+ if (faults > max_faults) {
+ max_faults = faults;
+ max_nid = nid;
+ }
+ } else if (group_faults > max_faults) {
+ max_faults = group_faults;
max_nid = nid;
}
-
- if (group_faults > max_group_faults) {
- max_group_faults = group_faults;
- max_group_nid = nid;
- }
}
- update_task_scan_period(p, fault_types[0], fault_types[1]);
+ /* Cannot migrate task to CPU-less node */
+ max_nid = numa_nearest_node(max_nid, N_CPU);
- if (p->numa_group) {
- update_numa_active_node_mask(p->numa_group);
+ if (ng) {
+ numa_group_count_active_nodes(ng);
spin_unlock_irq(group_lock);
- max_nid = preferred_group_nid(p, max_group_nid);
+ max_nid = preferred_group_nid(p, max_nid);
}
if (max_faults) {
/* Set the new preferred node */
if (max_nid != p->numa_preferred_nid)
sched_setnuma(p, max_nid);
-
- if (task_node(p) != p->numa_preferred_nid)
- numa_migrate_preferred(p);
}
+
+ update_task_scan_period(p, fault_types[0], fault_types[1]);
}
static inline int get_numa_group(struct numa_group *grp)
{
- return atomic_inc_not_zero(&grp->refcount);
+ return refcount_inc_not_zero(&grp->refcount);
}
static inline void put_numa_group(struct numa_group *grp)
{
- if (atomic_dec_and_test(&grp->refcount))
+ if (refcount_dec_and_test(&grp->refcount))
kfree_rcu(grp, rcu);
}
@@ -1911,22 +2992,20 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
int cpu = cpupid_to_cpu(cpupid);
int i;
- if (unlikely(!p->numa_group)) {
+ if (unlikely(!deref_curr_numa_group(p))) {
unsigned int size = sizeof(struct numa_group) +
- 4*nr_node_ids*sizeof(unsigned long);
+ NR_NUMA_HINT_FAULT_STATS *
+ nr_node_ids * sizeof(unsigned long);
grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
if (!grp)
return;
- atomic_set(&grp->refcount, 1);
+ refcount_set(&grp->refcount, 1);
+ grp->active_nodes = 1;
+ grp->max_faults_cpu = 0;
spin_lock_init(&grp->lock);
grp->gid = p->pid;
- /* Second half of the array tracks nids where faults happen */
- grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
- nr_node_ids;
-
- node_set(task_node(current), grp->active_nodes);
for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
grp->faults[i] = p->numa_faults[i];
@@ -1938,7 +3017,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
}
rcu_read_lock();
- tsk = ACCESS_ONCE(cpu_rq(cpu)->curr);
+ tsk = READ_ONCE(cpu_rq(cpu)->curr);
if (!cpupid_match_pid(tsk, cpupid))
goto no_join;
@@ -1947,7 +3026,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
if (!grp)
goto no_join;
- my_grp = p->numa_group;
+ my_grp = deref_curr_numa_group(p);
if (grp == my_grp)
goto no_join;
@@ -1983,7 +3062,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
if (!join)
return;
- BUG_ON(irqs_disabled());
+ WARN_ON_ONCE(irqs_disabled());
double_lock_irq(&my_grp->lock, &grp->lock);
for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
@@ -2009,13 +3088,24 @@ no_join:
return;
}
-void task_numa_free(struct task_struct *p)
+/*
+ * Get rid of NUMA statistics associated with a task (either current or dead).
+ * If @final is set, the task is dead and has reached refcount zero, so we can
+ * safely free all relevant data structures. Otherwise, there might be
+ * concurrent reads from places like load balancing and procfs, and we should
+ * reset the data back to default state without freeing ->numa_faults.
+ */
+void task_numa_free(struct task_struct *p, bool final)
{
- struct numa_group *grp = p->numa_group;
- void *numa_faults = p->numa_faults;
+ /* safe: p either is current or is being freed by current */
+ struct numa_group *grp = rcu_dereference_raw(p->numa_group);
+ unsigned long *numa_faults = p->numa_faults;
unsigned long flags;
int i;
+ if (!numa_faults)
+ return;
+
if (grp) {
spin_lock_irqsave(&grp->lock, flags);
for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
@@ -2028,8 +3118,14 @@ void task_numa_free(struct task_struct *p)
put_numa_group(grp);
}
- p->numa_faults = NULL;
- kfree(numa_faults);
+ if (final) {
+ p->numa_faults = NULL;
+ kfree(numa_faults);
+ } else {
+ p->total_numa_faults = 0;
+ for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
+ numa_faults[i] = 0;
+ }
}
/*
@@ -2041,15 +3137,25 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
bool migrated = flags & TNF_MIGRATED;
int cpu_node = task_node(current);
int local = !!(flags & TNF_FAULT_LOCAL);
+ struct numa_group *ng;
int priv;
- if (!numabalancing_enabled)
+ if (!static_branch_likely(&sched_numa_balancing))
return;
/* for example, ksmd faulting in a user's mm */
if (!p->mm)
return;
+ /*
+ * NUMA faults statistics are unnecessary for the slow memory
+ * node for memory tiering mode.
+ */
+ if (!node_is_toptier(mem_node) &&
+ (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING ||
+ !cpupid_valid(last_cpupid)))
+ return;
+
/* Allocate buffer to track faults on a per-node basis */
if (unlikely(!p->numa_faults)) {
int size = sizeof(*p->numa_faults) *
@@ -2081,19 +3187,20 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
* actively using should be counted as local. This allows the
* scan rate to slow down when a workload has settled down.
*/
- if (!priv && !local && p->numa_group &&
- node_isset(cpu_node, p->numa_group->active_nodes) &&
- node_isset(mem_node, p->numa_group->active_nodes))
+ ng = deref_curr_numa_group(p);
+ if (!priv && !local && ng && ng->active_nodes > 1 &&
+ numa_is_active_node(cpu_node, ng) &&
+ numa_is_active_node(mem_node, ng))
local = 1;
- task_numa_placement(p);
-
/*
- * Retry task to preferred node migration periodically, in case it
- * case it previously failed, or the scheduler moved us.
+ * Retry to migrate task to preferred node periodically, in case it
+ * previously failed, or the scheduler moved us.
*/
- if (time_after(jiffies, p->numa_migrate_retry))
+ if (time_after(jiffies, p->numa_migrate_retry)) {
+ task_numa_placement(p);
numa_migrate_preferred(p);
+ }
if (migrated)
p->numa_pages_migrated += pages;
@@ -2107,27 +3214,78 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
static void reset_ptenuma_scan(struct task_struct *p)
{
- ACCESS_ONCE(p->mm->numa_scan_seq)++;
+ /*
+ * We only did a read acquisition of the mmap sem, so
+ * p->mm->numa_scan_seq is written to without exclusive access
+ * and the update is not guaranteed to be atomic. That's not
+ * much of an issue though, since this is just used for
+ * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
+ * expensive, to avoid any form of compiler optimizations:
+ */
+ WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
p->mm->numa_scan_offset = 0;
}
+static bool vma_is_accessed(struct mm_struct *mm, struct vm_area_struct *vma)
+{
+ unsigned long pids;
+ /*
+ * Allow unconditional access first two times, so that all the (pages)
+ * of VMAs get prot_none fault introduced irrespective of accesses.
+ * This is also done to avoid any side effect of task scanning
+ * amplifying the unfairness of disjoint set of VMAs' access.
+ */
+ if ((READ_ONCE(current->mm->numa_scan_seq) - vma->numab_state->start_scan_seq) < 2)
+ return true;
+
+ pids = vma->numab_state->pids_active[0] | vma->numab_state->pids_active[1];
+ if (test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids))
+ return true;
+
+ /*
+ * Complete a scan that has already started regardless of PID access, or
+ * some VMAs may never be scanned in multi-threaded applications:
+ */
+ if (mm->numa_scan_offset > vma->vm_start) {
+ trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_IGNORE_PID);
+ return true;
+ }
+
+ /*
+ * This vma has not been accessed for a while, and if the number
+ * the threads in the same process is low, which means no other
+ * threads can help scan this vma, force a vma scan.
+ */
+ if (READ_ONCE(mm->numa_scan_seq) >
+ (vma->numab_state->prev_scan_seq + get_nr_threads(current)))
+ return true;
+
+ return false;
+}
+
+#define VMA_PID_RESET_PERIOD (4 * sysctl_numa_balancing_scan_delay)
+
/*
* The expensive part of numa migration is done from task_work context.
* Triggered from task_tick_numa().
*/
-void task_numa_work(struct callback_head *work)
+static void task_numa_work(struct callback_head *work)
{
unsigned long migrate, next_scan, now = jiffies;
struct task_struct *p = current;
struct mm_struct *mm = p->mm;
+ u64 runtime = p->se.sum_exec_runtime;
struct vm_area_struct *vma;
unsigned long start, end;
unsigned long nr_pte_updates = 0;
- long pages;
+ long pages, virtpages;
+ struct vma_iterator vmi;
+ bool vma_pids_skipped;
+ bool vma_pids_forced = false;
WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
- work->next = work; /* protect against double add */
+ work->next = work;
/*
* Who cares about NUMA placement when they're dying.
*
@@ -2139,6 +3297,15 @@ void task_numa_work(struct callback_head *work)
if (p->flags & PF_EXITING)
return;
+ /*
+ * Memory is pinned to only one NUMA node via cpuset.mems, naturally
+ * no page can be migrated.
+ */
+ if (cpusets_enabled() && nodes_weight(cpuset_current_mems_allowed) == 1) {
+ trace_sched_skip_cpuset_numa(current, &cpuset_current_mems_allowed);
+ return;
+ }
+
if (!mm->numa_next_scan) {
mm->numa_next_scan = now +
msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
@@ -2153,11 +3320,11 @@ void task_numa_work(struct callback_head *work)
if (p->numa_scan_period == 0) {
p->numa_scan_period_max = task_scan_max(p);
- p->numa_scan_period = task_scan_min(p);
+ p->numa_scan_period = task_scan_start(p);
}
next_scan = now + msecs_to_jiffies(p->numa_scan_period);
- if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
+ if (!try_cmpxchg(&mm->numa_next_scan, &migrate, next_scan))
return;
/*
@@ -2166,62 +3333,172 @@ void task_numa_work(struct callback_head *work)
*/
p->node_stamp += 2 * TICK_NSEC;
- start = mm->numa_scan_offset;
pages = sysctl_numa_balancing_scan_size;
pages <<= 20 - PAGE_SHIFT; /* MB in pages */
+ virtpages = pages * 8; /* Scan up to this much virtual space */
if (!pages)
return;
- down_read(&mm->mmap_sem);
- vma = find_vma(mm, start);
+
+ if (!mmap_read_trylock(mm))
+ return;
+
+ /*
+ * VMAs are skipped if the current PID has not trapped a fault within
+ * the VMA recently. Allow scanning to be forced if there is no
+ * suitable VMA remaining.
+ */
+ vma_pids_skipped = false;
+
+retry_pids:
+ start = mm->numa_scan_offset;
+ vma_iter_init(&vmi, mm, start);
+ vma = vma_next(&vmi);
if (!vma) {
reset_ptenuma_scan(p);
start = 0;
- vma = mm->mmap;
+ vma_iter_set(&vmi, start);
+ vma = vma_next(&vmi);
}
- for (; vma; vma = vma->vm_next) {
+
+ for (; vma; vma = vma_next(&vmi)) {
if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
+ trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_UNSUITABLE);
continue;
}
/*
* Shared library pages mapped by multiple processes are not
* migrated as it is expected they are cache replicated. Avoid
- * hinting faults in read-only file-backed mappings or the vdso
+ * hinting faults in read-only file-backed mappings or the vDSO
* as migrating the pages will be of marginal benefit.
*/
if (!vma->vm_mm ||
- (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
+ (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) {
+ trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SHARED_RO);
continue;
+ }
/*
* Skip inaccessible VMAs to avoid any confusion between
- * PROT_NONE and NUMA hinting ptes
+ * PROT_NONE and NUMA hinting PTEs
*/
- if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
+ if (!vma_is_accessible(vma)) {
+ trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_INACCESSIBLE);
continue;
+ }
+
+ /* Initialise new per-VMA NUMAB state. */
+ if (!vma->numab_state) {
+ struct vma_numab_state *ptr;
+
+ ptr = kzalloc(sizeof(*ptr), GFP_KERNEL);
+ if (!ptr)
+ continue;
+
+ if (cmpxchg(&vma->numab_state, NULL, ptr)) {
+ kfree(ptr);
+ continue;
+ }
+
+ vma->numab_state->start_scan_seq = mm->numa_scan_seq;
+
+ vma->numab_state->next_scan = now +
+ msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
+
+ /* Reset happens after 4 times scan delay of scan start */
+ vma->numab_state->pids_active_reset = vma->numab_state->next_scan +
+ msecs_to_jiffies(VMA_PID_RESET_PERIOD);
+
+ /*
+ * Ensure prev_scan_seq does not match numa_scan_seq,
+ * to prevent VMAs being skipped prematurely on the
+ * first scan:
+ */
+ vma->numab_state->prev_scan_seq = mm->numa_scan_seq - 1;
+ }
+
+ /*
+ * Scanning the VMAs of short lived tasks add more overhead. So
+ * delay the scan for new VMAs.
+ */
+ if (mm->numa_scan_seq && time_before(jiffies,
+ vma->numab_state->next_scan)) {
+ trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SCAN_DELAY);
+ continue;
+ }
+
+ /* RESET access PIDs regularly for old VMAs. */
+ if (mm->numa_scan_seq &&
+ time_after(jiffies, vma->numab_state->pids_active_reset)) {
+ vma->numab_state->pids_active_reset = vma->numab_state->pids_active_reset +
+ msecs_to_jiffies(VMA_PID_RESET_PERIOD);
+ vma->numab_state->pids_active[0] = READ_ONCE(vma->numab_state->pids_active[1]);
+ vma->numab_state->pids_active[1] = 0;
+ }
+
+ /* Do not rescan VMAs twice within the same sequence. */
+ if (vma->numab_state->prev_scan_seq == mm->numa_scan_seq) {
+ mm->numa_scan_offset = vma->vm_end;
+ trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SEQ_COMPLETED);
+ continue;
+ }
+
+ /*
+ * Do not scan the VMA if task has not accessed it, unless no other
+ * VMA candidate exists.
+ */
+ if (!vma_pids_forced && !vma_is_accessed(mm, vma)) {
+ vma_pids_skipped = true;
+ trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_PID_INACTIVE);
+ continue;
+ }
do {
start = max(start, vma->vm_start);
end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
end = min(end, vma->vm_end);
- nr_pte_updates += change_prot_numa(vma, start, end);
+ nr_pte_updates = change_prot_numa(vma, start, end);
/*
- * Scan sysctl_numa_balancing_scan_size but ensure that
- * at least one PTE is updated so that unused virtual
- * address space is quickly skipped.
+ * Try to scan sysctl_numa_balancing_size worth of
+ * hpages that have at least one present PTE that
+ * is not already PTE-numa. If the VMA contains
+ * areas that are unused or already full of prot_numa
+ * PTEs, scan up to virtpages, to skip through those
+ * areas faster.
*/
if (nr_pte_updates)
pages -= (end - start) >> PAGE_SHIFT;
+ virtpages -= (end - start) >> PAGE_SHIFT;
start = end;
- if (pages <= 0)
+ if (pages <= 0 || virtpages <= 0)
goto out;
cond_resched();
} while (end != vma->vm_end);
+
+ /* VMA scan is complete, do not scan until next sequence. */
+ vma->numab_state->prev_scan_seq = mm->numa_scan_seq;
+
+ /*
+ * Only force scan within one VMA at a time, to limit the
+ * cost of scanning a potentially uninteresting VMA.
+ */
+ if (vma_pids_forced)
+ break;
+ }
+
+ /*
+ * If no VMAs are remaining and VMAs were skipped due to the PID
+ * not accessing the VMA previously, then force a scan to ensure
+ * forward progress:
+ */
+ if (!vma && !vma_pids_forced && vma_pids_skipped) {
+ vma_pids_forced = true;
+ goto retry_pids;
}
out:
@@ -2235,13 +3512,71 @@ out:
mm->numa_scan_offset = start;
else
reset_ptenuma_scan(p);
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
+
+ /*
+ * Make sure tasks use at least 32x as much time to run other code
+ * than they used here, to limit NUMA PTE scanning overhead to 3% max.
+ * Usually update_task_scan_period slows down scanning enough; on an
+ * overloaded system we need to limit overhead on a per task basis.
+ */
+ if (unlikely(p->se.sum_exec_runtime != runtime)) {
+ u64 diff = p->se.sum_exec_runtime - runtime;
+ p->node_stamp += 32 * diff;
+ }
+}
+
+void init_numa_balancing(u64 clone_flags, struct task_struct *p)
+{
+ int mm_users = 0;
+ struct mm_struct *mm = p->mm;
+
+ if (mm) {
+ mm_users = atomic_read(&mm->mm_users);
+ if (mm_users == 1) {
+ mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
+ mm->numa_scan_seq = 0;
+ }
+ }
+ p->node_stamp = 0;
+ p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
+ p->numa_scan_period = sysctl_numa_balancing_scan_delay;
+ p->numa_migrate_retry = 0;
+ /* Protect against double add, see task_tick_numa and task_numa_work */
+ p->numa_work.next = &p->numa_work;
+ p->numa_faults = NULL;
+ p->numa_pages_migrated = 0;
+ p->total_numa_faults = 0;
+ RCU_INIT_POINTER(p->numa_group, NULL);
+ p->last_task_numa_placement = 0;
+ p->last_sum_exec_runtime = 0;
+
+ init_task_work(&p->numa_work, task_numa_work);
+
+ /* New address space, reset the preferred nid */
+ if (!(clone_flags & CLONE_VM)) {
+ p->numa_preferred_nid = NUMA_NO_NODE;
+ return;
+ }
+
+ /*
+ * New thread, keep existing numa_preferred_nid which should be copied
+ * already by arch_dup_task_struct but stagger when scans start.
+ */
+ if (mm) {
+ unsigned int delay;
+
+ delay = min_t(unsigned int, task_scan_max(current),
+ current->numa_scan_period * mm_users * NSEC_PER_MSEC);
+ delay += 2 * TICK_NSEC;
+ p->node_stamp = delay;
+ }
}
/*
* Drive the periodic memory faults..
*/
-void task_tick_numa(struct rq *rq, struct task_struct *curr)
+static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
struct callback_head *work = &curr->numa_work;
u64 period, now;
@@ -2249,7 +3584,7 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
/*
* We don't care about NUMA placement if we don't have memory.
*/
- if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
+ if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work)
return;
/*
@@ -2261,18 +3596,52 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
now = curr->se.sum_exec_runtime;
period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
- if (now - curr->node_stamp > period) {
+ if (now > curr->node_stamp + period) {
if (!curr->node_stamp)
- curr->numa_scan_period = task_scan_min(curr);
+ curr->numa_scan_period = task_scan_start(curr);
curr->node_stamp += period;
- if (!time_before(jiffies, curr->mm->numa_next_scan)) {
- init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
- task_work_add(curr, work, true);
- }
+ if (!time_before(jiffies, curr->mm->numa_next_scan))
+ task_work_add(curr, work, TWA_RESUME);
}
}
-#else
+
+static void update_scan_period(struct task_struct *p, int new_cpu)
+{
+ int src_nid = cpu_to_node(task_cpu(p));
+ int dst_nid = cpu_to_node(new_cpu);
+
+ if (!static_branch_likely(&sched_numa_balancing))
+ return;
+
+ if (!p->mm || !p->numa_faults || (p->flags & PF_EXITING))
+ return;
+
+ if (src_nid == dst_nid)
+ return;
+
+ /*
+ * Allow resets if faults have been trapped before one scan
+ * has completed. This is most likely due to a new task that
+ * is pulled cross-node due to wakeups or load balancing.
+ */
+ if (p->numa_scan_seq) {
+ /*
+ * Avoid scan adjustments if moving to the preferred
+ * node or if the task was not previously running on
+ * the preferred node.
+ */
+ if (dst_nid == p->numa_preferred_nid ||
+ (p->numa_preferred_nid != NUMA_NO_NODE &&
+ src_nid != p->numa_preferred_nid))
+ return;
+ }
+
+ p->numa_scan_period = task_scan_start(p);
+}
+
+#else /* !CONFIG_NUMA_BALANCING: */
+
static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
}
@@ -2284,826 +3653,1629 @@ static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
{
}
-#endif /* CONFIG_NUMA_BALANCING */
+
+static inline void update_scan_period(struct task_struct *p, int new_cpu)
+{
+}
+
+#endif /* !CONFIG_NUMA_BALANCING */
static void
account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
update_load_add(&cfs_rq->load, se->load.weight);
- if (!parent_entity(se))
- update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
-#ifdef CONFIG_SMP
if (entity_is_task(se)) {
struct rq *rq = rq_of(cfs_rq);
account_numa_enqueue(rq, task_of(se));
list_add(&se->group_node, &rq->cfs_tasks);
}
-#endif
- cfs_rq->nr_running++;
+ cfs_rq->nr_queued++;
}
static void
account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
update_load_sub(&cfs_rq->load, se->load.weight);
- if (!parent_entity(se))
- update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
if (entity_is_task(se)) {
account_numa_dequeue(rq_of(cfs_rq), task_of(se));
list_del_init(&se->group_node);
}
- cfs_rq->nr_running--;
+ cfs_rq->nr_queued--;
}
-#ifdef CONFIG_FAIR_GROUP_SCHED
-# ifdef CONFIG_SMP
-static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
-{
- long tg_weight;
+/*
+ * Signed add and clamp on underflow.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing the negative
+ * values.
+ */
+#define add_positive(_ptr, _val) do { \
+ typeof(_ptr) ptr = (_ptr); \
+ typeof(_val) val = (_val); \
+ typeof(*ptr) res, var = READ_ONCE(*ptr); \
+ \
+ res = var + val; \
+ \
+ if (val < 0 && res > var) \
+ res = 0; \
+ \
+ WRITE_ONCE(*ptr, res); \
+} while (0)
- /*
- * Use this CPU's actual weight instead of the last load_contribution
- * to gain a more accurate current total weight. See
- * update_cfs_rq_load_contribution().
- */
- tg_weight = atomic_long_read(&tg->load_avg);
- tg_weight -= cfs_rq->tg_load_contrib;
- tg_weight += cfs_rq->load.weight;
+/*
+ * Unsigned subtract and clamp on underflow.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing the negative
+ * values.
+ */
+#define sub_positive(_ptr, _val) do { \
+ typeof(_ptr) ptr = (_ptr); \
+ typeof(*ptr) val = (_val); \
+ typeof(*ptr) res, var = READ_ONCE(*ptr); \
+ res = var - val; \
+ if (res > var) \
+ res = 0; \
+ WRITE_ONCE(*ptr, res); \
+} while (0)
- return tg_weight;
-}
+/*
+ * Remove and clamp on negative, from a local variable.
+ *
+ * A variant of sub_positive(), which does not use explicit load-store
+ * and is thus optimized for local variable updates.
+ */
+#define lsub_positive(_ptr, _val) do { \
+ typeof(_ptr) ptr = (_ptr); \
+ *ptr -= min_t(typeof(*ptr), *ptr, _val); \
+} while (0)
-static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
+static inline void
+enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- long tg_weight, load, shares;
-
- tg_weight = calc_tg_weight(tg, cfs_rq);
- load = cfs_rq->load.weight;
-
- shares = (tg->shares * load);
- if (tg_weight)
- shares /= tg_weight;
-
- if (shares < MIN_SHARES)
- shares = MIN_SHARES;
- if (shares > tg->shares)
- shares = tg->shares;
-
- return shares;
+ cfs_rq->avg.load_avg += se->avg.load_avg;
+ cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
}
-# else /* CONFIG_SMP */
-static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
+
+static inline void
+dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- return tg->shares;
+ sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
+ sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
+ /* See update_cfs_rq_load_avg() */
+ cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
+ cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
}
-# endif /* CONFIG_SMP */
+
+static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags);
+
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
unsigned long weight)
{
+ bool curr = cfs_rq->curr == se;
+
if (se->on_rq) {
/* commit outstanding execution time */
- if (cfs_rq->curr == se)
- update_curr(cfs_rq);
- account_entity_dequeue(cfs_rq, se);
+ update_curr(cfs_rq);
+ update_entity_lag(cfs_rq, se);
+ se->deadline -= se->vruntime;
+ se->rel_deadline = 1;
+ cfs_rq->nr_queued--;
+ if (!curr)
+ __dequeue_entity(cfs_rq, se);
+ update_load_sub(&cfs_rq->load, se->load.weight);
}
+ dequeue_load_avg(cfs_rq, se);
+
+ /*
+ * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
+ * we need to scale se->vlag when w_i changes.
+ */
+ se->vlag = div_s64(se->vlag * se->load.weight, weight);
+ if (se->rel_deadline)
+ se->deadline = div_s64(se->deadline * se->load.weight, weight);
update_load_set(&se->load, weight);
- if (se->on_rq)
- account_entity_enqueue(cfs_rq, se);
+ do {
+ u32 divider = get_pelt_divider(&se->avg);
+
+ se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
+ } while (0);
+
+ enqueue_load_avg(cfs_rq, se);
+ if (se->on_rq) {
+ place_entity(cfs_rq, se, 0);
+ update_load_add(&cfs_rq->load, se->load.weight);
+ if (!curr)
+ __enqueue_entity(cfs_rq, se);
+ cfs_rq->nr_queued++;
+ }
+}
+
+static void reweight_task_fair(struct rq *rq, struct task_struct *p,
+ const struct load_weight *lw)
+{
+ struct sched_entity *se = &p->se;
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ struct load_weight *load = &se->load;
+
+ reweight_entity(cfs_rq, se, lw->weight);
+ load->inv_weight = lw->inv_weight;
}
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
-static void update_cfs_shares(struct cfs_rq *cfs_rq)
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/*
+ * All this does is approximate the hierarchical proportion which includes that
+ * global sum we all love to hate.
+ *
+ * That is, the weight of a group entity, is the proportional share of the
+ * group weight based on the group runqueue weights. That is:
+ *
+ * tg->weight * grq->load.weight
+ * ge->load.weight = ----------------------------- (1)
+ * \Sum grq->load.weight
+ *
+ * Now, because computing that sum is prohibitively expensive to compute (been
+ * there, done that) we approximate it with this average stuff. The average
+ * moves slower and therefore the approximation is cheaper and more stable.
+ *
+ * So instead of the above, we substitute:
+ *
+ * grq->load.weight -> grq->avg.load_avg (2)
+ *
+ * which yields the following:
+ *
+ * tg->weight * grq->avg.load_avg
+ * ge->load.weight = ------------------------------ (3)
+ * tg->load_avg
+ *
+ * Where: tg->load_avg ~= \Sum grq->avg.load_avg
+ *
+ * That is shares_avg, and it is right (given the approximation (2)).
+ *
+ * The problem with it is that because the average is slow -- it was designed
+ * to be exactly that of course -- this leads to transients in boundary
+ * conditions. In specific, the case where the group was idle and we start the
+ * one task. It takes time for our CPU's grq->avg.load_avg to build up,
+ * yielding bad latency etc..
+ *
+ * Now, in that special case (1) reduces to:
+ *
+ * tg->weight * grq->load.weight
+ * ge->load.weight = ----------------------------- = tg->weight (4)
+ * grp->load.weight
+ *
+ * That is, the sum collapses because all other CPUs are idle; the UP scenario.
+ *
+ * So what we do is modify our approximation (3) to approach (4) in the (near)
+ * UP case, like:
+ *
+ * ge->load.weight =
+ *
+ * tg->weight * grq->load.weight
+ * --------------------------------------------------- (5)
+ * tg->load_avg - grq->avg.load_avg + grq->load.weight
+ *
+ * But because grq->load.weight can drop to 0, resulting in a divide by zero,
+ * we need to use grq->avg.load_avg as its lower bound, which then gives:
+ *
+ *
+ * tg->weight * grq->load.weight
+ * ge->load.weight = ----------------------------- (6)
+ * tg_load_avg'
+ *
+ * Where:
+ *
+ * tg_load_avg' = tg->load_avg - grq->avg.load_avg +
+ * max(grq->load.weight, grq->avg.load_avg)
+ *
+ * And that is shares_weight and is icky. In the (near) UP case it approaches
+ * (4) while in the normal case it approaches (3). It consistently
+ * overestimates the ge->load.weight and therefore:
+ *
+ * \Sum ge->load.weight >= tg->weight
+ *
+ * hence icky!
+ */
+static long calc_group_shares(struct cfs_rq *cfs_rq)
{
- struct task_group *tg;
- struct sched_entity *se;
+ long tg_weight, tg_shares, load, shares;
+ struct task_group *tg = cfs_rq->tg;
+
+ tg_shares = READ_ONCE(tg->shares);
+
+ load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg);
+
+ tg_weight = atomic_long_read(&tg->load_avg);
+
+ /* Ensure tg_weight >= load */
+ tg_weight -= cfs_rq->tg_load_avg_contrib;
+ tg_weight += load;
+
+ shares = (tg_shares * load);
+ if (tg_weight)
+ shares /= tg_weight;
+
+ /*
+ * MIN_SHARES has to be unscaled here to support per-CPU partitioning
+ * of a group with small tg->shares value. It is a floor value which is
+ * assigned as a minimum load.weight to the sched_entity representing
+ * the group on a CPU.
+ *
+ * E.g. on 64-bit for a group with tg->shares of scale_load(15)=15*1024
+ * on an 8-core system with 8 tasks each runnable on one CPU shares has
+ * to be 15*1024*1/8=1920 instead of scale_load(MIN_SHARES)=2*1024. In
+ * case no task is runnable on a CPU MIN_SHARES=2 should be returned
+ * instead of 0.
+ */
+ return clamp_t(long, shares, MIN_SHARES, tg_shares);
+}
+
+/*
+ * Recomputes the group entity based on the current state of its group
+ * runqueue.
+ */
+static void update_cfs_group(struct sched_entity *se)
+{
+ struct cfs_rq *gcfs_rq = group_cfs_rq(se);
long shares;
- tg = cfs_rq->tg;
- se = tg->se[cpu_of(rq_of(cfs_rq))];
- if (!se || throttled_hierarchy(cfs_rq))
- return;
-#ifndef CONFIG_SMP
- if (likely(se->load.weight == tg->shares))
+ /*
+ * When a group becomes empty, preserve its weight. This matters for
+ * DELAY_DEQUEUE.
+ */
+ if (!gcfs_rq || !gcfs_rq->load.weight)
return;
-#endif
- shares = calc_cfs_shares(cfs_rq, tg);
- reweight_entity(cfs_rq_of(se), se, shares);
+ shares = calc_group_shares(gcfs_rq);
+ if (unlikely(se->load.weight != shares))
+ reweight_entity(cfs_rq_of(se), se, shares);
}
-#else /* CONFIG_FAIR_GROUP_SCHED */
-static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
+
+#else /* !CONFIG_FAIR_GROUP_SCHED: */
+static inline void update_cfs_group(struct sched_entity *se)
{
}
-#endif /* CONFIG_FAIR_GROUP_SCHED */
+#endif /* !CONFIG_FAIR_GROUP_SCHED */
-#ifdef CONFIG_SMP
-/*
- * We choose a half-life close to 1 scheduling period.
- * Note: The tables below are dependent on this value.
- */
-#define LOAD_AVG_PERIOD 32
-#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
-#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */
+static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
+{
+ struct rq *rq = rq_of(cfs_rq);
-/* Precomputed fixed inverse multiplies for multiplication by y^n */
-static const u32 runnable_avg_yN_inv[] = {
- 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
- 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
- 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
- 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
- 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
- 0x85aac367, 0x82cd8698,
-};
+ if (&rq->cfs == cfs_rq) {
+ /*
+ * There are a few boundary cases this might miss but it should
+ * get called often enough that that should (hopefully) not be
+ * a real problem.
+ *
+ * It will not get called when we go idle, because the idle
+ * thread is a different class (!fair), nor will the utilization
+ * number include things like RT tasks.
+ *
+ * As is, the util number is not freq-invariant (we'd have to
+ * implement arch_scale_freq_capacity() for that).
+ *
+ * See cpu_util_cfs().
+ */
+ cpufreq_update_util(rq, flags);
+ }
+}
+
+static inline bool load_avg_is_decayed(struct sched_avg *sa)
+{
+ if (sa->load_sum)
+ return false;
+
+ if (sa->util_sum)
+ return false;
+
+ if (sa->runnable_sum)
+ return false;
+
+ /*
+ * _avg must be null when _sum are null because _avg = _sum / divider
+ * Make sure that rounding and/or propagation of PELT values never
+ * break this.
+ */
+ WARN_ON_ONCE(sa->load_avg ||
+ sa->util_avg ||
+ sa->runnable_avg);
+
+ return true;
+}
+static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
+{
+ return u64_u32_load_copy(cfs_rq->avg.last_update_time,
+ cfs_rq->last_update_time_copy);
+}
+#ifdef CONFIG_FAIR_GROUP_SCHED
/*
- * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
- * over-estimates when re-combining.
+ * Because list_add_leaf_cfs_rq always places a child cfs_rq on the list
+ * immediately before a parent cfs_rq, and cfs_rqs are removed from the list
+ * bottom-up, we only have to test whether the cfs_rq before us on the list
+ * is our child.
+ * If cfs_rq is not on the list, test whether a child needs its to be added to
+ * connect a branch to the tree * (see list_add_leaf_cfs_rq() for details).
*/
-static const u32 runnable_avg_yN_sum[] = {
- 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
- 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
- 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
-};
+static inline bool child_cfs_rq_on_list(struct cfs_rq *cfs_rq)
+{
+ struct cfs_rq *prev_cfs_rq;
+ struct list_head *prev;
+ struct rq *rq = rq_of(cfs_rq);
-/*
- * Approximate:
- * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
+ if (cfs_rq->on_list) {
+ prev = cfs_rq->leaf_cfs_rq_list.prev;
+ } else {
+ prev = rq->tmp_alone_branch;
+ }
+
+ if (prev == &rq->leaf_cfs_rq_list)
+ return false;
+
+ prev_cfs_rq = container_of(prev, struct cfs_rq, leaf_cfs_rq_list);
+
+ return (prev_cfs_rq->tg->parent == cfs_rq->tg);
+}
+
+static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
+{
+ if (cfs_rq->load.weight)
+ return false;
+
+ if (!load_avg_is_decayed(&cfs_rq->avg))
+ return false;
+
+ if (child_cfs_rq_on_list(cfs_rq))
+ return false;
+
+ if (cfs_rq->tg_load_avg_contrib)
+ return false;
+
+ return true;
+}
+
+/**
+ * update_tg_load_avg - update the tg's load avg
+ * @cfs_rq: the cfs_rq whose avg changed
+ *
+ * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
+ * However, because tg->load_avg is a global value there are performance
+ * considerations.
+ *
+ * In order to avoid having to look at the other cfs_rq's, we use a
+ * differential update where we store the last value we propagated. This in
+ * turn allows skipping updates if the differential is 'small'.
+ *
+ * Updating tg's load_avg is necessary before update_cfs_share().
*/
-static __always_inline u64 decay_load(u64 val, u64 n)
+static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
{
- unsigned int local_n;
+ long delta;
+ u64 now;
- if (!n)
- return val;
- else if (unlikely(n > LOAD_AVG_PERIOD * 63))
- return 0;
+ /*
+ * No need to update load_avg for root_task_group as it is not used.
+ */
+ if (cfs_rq->tg == &root_task_group)
+ return;
- /* after bounds checking we can collapse to 32-bit */
- local_n = n;
+ /* rq has been offline and doesn't contribute to the share anymore: */
+ if (!cpu_active(cpu_of(rq_of(cfs_rq))))
+ return;
/*
- * As y^PERIOD = 1/2, we can combine
- * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
- * With a look-up table which covers y^n (n<PERIOD)
- *
- * To achieve constant time decay_load.
+ * For migration heavy workloads, access to tg->load_avg can be
+ * unbound. Limit the update rate to at most once per ms.
*/
- if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
- val >>= local_n / LOAD_AVG_PERIOD;
- local_n %= LOAD_AVG_PERIOD;
+ now = sched_clock_cpu(cpu_of(rq_of(cfs_rq)));
+ if (now - cfs_rq->last_update_tg_load_avg < NSEC_PER_MSEC)
+ return;
+
+ delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
+ if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
+ atomic_long_add(delta, &cfs_rq->tg->load_avg);
+ cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
+ cfs_rq->last_update_tg_load_avg = now;
}
+}
+
+static inline void clear_tg_load_avg(struct cfs_rq *cfs_rq)
+{
+ long delta;
+ u64 now;
- val *= runnable_avg_yN_inv[local_n];
- /* We don't use SRR here since we always want to round down. */
- return val >> 32;
+ /*
+ * No need to update load_avg for root_task_group, as it is not used.
+ */
+ if (cfs_rq->tg == &root_task_group)
+ return;
+
+ now = sched_clock_cpu(cpu_of(rq_of(cfs_rq)));
+ delta = 0 - cfs_rq->tg_load_avg_contrib;
+ atomic_long_add(delta, &cfs_rq->tg->load_avg);
+ cfs_rq->tg_load_avg_contrib = 0;
+ cfs_rq->last_update_tg_load_avg = now;
+}
+
+/* CPU offline callback: */
+static void __maybe_unused clear_tg_offline_cfs_rqs(struct rq *rq)
+{
+ struct task_group *tg;
+
+ lockdep_assert_rq_held(rq);
+
+ /*
+ * The rq clock has already been updated in
+ * set_rq_offline(), so we should skip updating
+ * the rq clock again in unthrottle_cfs_rq().
+ */
+ rq_clock_start_loop_update(rq);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(tg, &task_groups, list) {
+ struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+
+ clear_tg_load_avg(cfs_rq);
+ }
+ rcu_read_unlock();
+
+ rq_clock_stop_loop_update(rq);
}
/*
- * For updates fully spanning n periods, the contribution to runnable
- * average will be: \Sum 1024*y^n
- *
- * We can compute this reasonably efficiently by combining:
- * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
+ * Called within set_task_rq() right before setting a task's CPU. The
+ * caller only guarantees p->pi_lock is held; no other assumptions,
+ * including the state of rq->lock, should be made.
*/
-static u32 __compute_runnable_contrib(u64 n)
+void set_task_rq_fair(struct sched_entity *se,
+ struct cfs_rq *prev, struct cfs_rq *next)
{
- u32 contrib = 0;
+ u64 p_last_update_time;
+ u64 n_last_update_time;
- if (likely(n <= LOAD_AVG_PERIOD))
- return runnable_avg_yN_sum[n];
- else if (unlikely(n >= LOAD_AVG_MAX_N))
- return LOAD_AVG_MAX;
+ if (!sched_feat(ATTACH_AGE_LOAD))
+ return;
- /* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
- do {
- contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
- contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
+ /*
+ * We are supposed to update the task to "current" time, then its up to
+ * date and ready to go to new CPU/cfs_rq. But we have difficulty in
+ * getting what current time is, so simply throw away the out-of-date
+ * time. This will result in the wakee task is less decayed, but giving
+ * the wakee more load sounds not bad.
+ */
+ if (!(se->avg.last_update_time && prev))
+ return;
- n -= LOAD_AVG_PERIOD;
- } while (n > LOAD_AVG_PERIOD);
+ p_last_update_time = cfs_rq_last_update_time(prev);
+ n_last_update_time = cfs_rq_last_update_time(next);
- contrib = decay_load(contrib, n);
- return contrib + runnable_avg_yN_sum[n];
+ __update_load_avg_blocked_se(p_last_update_time, se);
+ se->avg.last_update_time = n_last_update_time;
}
/*
- * We can represent the historical contribution to runnable average as the
- * coefficients of a geometric series. To do this we sub-divide our runnable
- * history into segments of approximately 1ms (1024us); label the segment that
- * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
+ * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
+ * propagate its contribution. The key to this propagation is the invariant
+ * that for each group:
+ *
+ * ge->avg == grq->avg (1)
+ *
+ * _IFF_ we look at the pure running and runnable sums. Because they
+ * represent the very same entity, just at different points in the hierarchy.
+ *
+ * Per the above update_tg_cfs_util() and update_tg_cfs_runnable() are trivial
+ * and simply copies the running/runnable sum over (but still wrong, because
+ * the group entity and group rq do not have their PELT windows aligned).
*
- * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
- * p0 p1 p2
- * (now) (~1ms ago) (~2ms ago)
+ * However, update_tg_cfs_load() is more complex. So we have:
*
- * Let u_i denote the fraction of p_i that the entity was runnable.
+ * ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg (2)
*
- * We then designate the fractions u_i as our co-efficients, yielding the
- * following representation of historical load:
- * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
+ * And since, like util, the runnable part should be directly transferable,
+ * the following would _appear_ to be the straight forward approach:
*
- * We choose y based on the with of a reasonably scheduling period, fixing:
- * y^32 = 0.5
+ * grq->avg.load_avg = grq->load.weight * grq->avg.runnable_avg (3)
*
- * This means that the contribution to load ~32ms ago (u_32) will be weighted
- * approximately half as much as the contribution to load within the last ms
- * (u_0).
+ * And per (1) we have:
+ *
+ * ge->avg.runnable_avg == grq->avg.runnable_avg
+ *
+ * Which gives:
+ *
+ * ge->load.weight * grq->avg.load_avg
+ * ge->avg.load_avg = ----------------------------------- (4)
+ * grq->load.weight
+ *
+ * Except that is wrong!
+ *
+ * Because while for entities historical weight is not important and we
+ * really only care about our future and therefore can consider a pure
+ * runnable sum, runqueues can NOT do this.
+ *
+ * We specifically want runqueues to have a load_avg that includes
+ * historical weights. Those represent the blocked load, the load we expect
+ * to (shortly) return to us. This only works by keeping the weights as
+ * integral part of the sum. We therefore cannot decompose as per (3).
+ *
+ * Another reason this doesn't work is that runnable isn't a 0-sum entity.
+ * Imagine a rq with 2 tasks that each are runnable 2/3 of the time. Then the
+ * rq itself is runnable anywhere between 2/3 and 1 depending on how the
+ * runnable section of these tasks overlap (or not). If they were to perfectly
+ * align the rq as a whole would be runnable 2/3 of the time. If however we
+ * always have at least 1 runnable task, the rq as a whole is always runnable.
+ *
+ * So we'll have to approximate.. :/
+ *
+ * Given the constraint:
+ *
+ * ge->avg.running_sum <= ge->avg.runnable_sum <= LOAD_AVG_MAX
+ *
+ * We can construct a rule that adds runnable to a rq by assuming minimal
+ * overlap.
+ *
+ * On removal, we'll assume each task is equally runnable; which yields:
+ *
+ * grq->avg.runnable_sum = grq->avg.load_sum / grq->load.weight
+ *
+ * XXX: only do this for the part of runnable > running ?
*
- * When a period "rolls over" and we have new u_0`, multiplying the previous
- * sum again by y is sufficient to update:
- * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
- * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
*/
-static __always_inline int __update_entity_runnable_avg(u64 now, int cpu,
- struct sched_avg *sa,
- int runnable,
- int running)
+static inline void
+update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
- u64 delta, periods;
- u32 runnable_contrib;
- int delta_w, decayed = 0;
- unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu);
+ long delta_sum, delta_avg = gcfs_rq->avg.util_avg - se->avg.util_avg;
+ u32 new_sum, divider;
+
+ /* Nothing to update */
+ if (!delta_avg)
+ return;
- delta = now - sa->last_runnable_update;
/*
- * This should only happen when time goes backwards, which it
- * unfortunately does during sched clock init when we swap over to TSC.
+ * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
+ * See ___update_load_avg() for details.
*/
- if ((s64)delta < 0) {
- sa->last_runnable_update = now;
- return 0;
- }
+ divider = get_pelt_divider(&cfs_rq->avg);
+
+
+ /* Set new sched_entity's utilization */
+ se->avg.util_avg = gcfs_rq->avg.util_avg;
+ new_sum = se->avg.util_avg * divider;
+ delta_sum = (long)new_sum - (long)se->avg.util_sum;
+ se->avg.util_sum = new_sum;
+
+ /* Update parent cfs_rq utilization */
+ add_positive(&cfs_rq->avg.util_avg, delta_avg);
+ add_positive(&cfs_rq->avg.util_sum, delta_sum);
+
+ /* See update_cfs_rq_load_avg() */
+ cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
+ cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);
+}
+
+static inline void
+update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
+{
+ long delta_sum, delta_avg = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
+ u32 new_sum, divider;
+
+ /* Nothing to update */
+ if (!delta_avg)
+ return;
/*
- * Use 1024ns as the unit of measurement since it's a reasonable
- * approximation of 1us and fast to compute.
+ * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
+ * See ___update_load_avg() for details.
*/
- delta >>= 10;
- if (!delta)
- return 0;
- sa->last_runnable_update = now;
+ divider = get_pelt_divider(&cfs_rq->avg);
- /* delta_w is the amount already accumulated against our next period */
- delta_w = sa->avg_period % 1024;
- if (delta + delta_w >= 1024) {
- /* period roll-over */
- decayed = 1;
+ /* Set new sched_entity's runnable */
+ se->avg.runnable_avg = gcfs_rq->avg.runnable_avg;
+ new_sum = se->avg.runnable_avg * divider;
+ delta_sum = (long)new_sum - (long)se->avg.runnable_sum;
+ se->avg.runnable_sum = new_sum;
+
+ /* Update parent cfs_rq runnable */
+ add_positive(&cfs_rq->avg.runnable_avg, delta_avg);
+ add_positive(&cfs_rq->avg.runnable_sum, delta_sum);
+ /* See update_cfs_rq_load_avg() */
+ cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum,
+ cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER);
+}
+
+static inline void
+update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
+{
+ long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
+ unsigned long load_avg;
+ u64 load_sum = 0;
+ s64 delta_sum;
+ u32 divider;
+ if (!runnable_sum)
+ return;
+
+ gcfs_rq->prop_runnable_sum = 0;
+
+ /*
+ * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
+ * See ___update_load_avg() for details.
+ */
+ divider = get_pelt_divider(&cfs_rq->avg);
+
+ if (runnable_sum >= 0) {
/*
- * Now that we know we're crossing a period boundary, figure
- * out how much from delta we need to complete the current
- * period and accrue it.
+ * Add runnable; clip at LOAD_AVG_MAX. Reflects that until
+ * the CPU is saturated running == runnable.
*/
- delta_w = 1024 - delta_w;
- if (runnable)
- sa->runnable_avg_sum += delta_w;
- if (running)
- sa->running_avg_sum += delta_w * scale_freq
- >> SCHED_CAPACITY_SHIFT;
- sa->avg_period += delta_w;
-
- delta -= delta_w;
-
- /* Figure out how many additional periods this update spans */
- periods = delta / 1024;
- delta %= 1024;
-
- sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
- periods + 1);
- sa->running_avg_sum = decay_load(sa->running_avg_sum,
- periods + 1);
- sa->avg_period = decay_load(sa->avg_period,
- periods + 1);
-
- /* Efficiently calculate \sum (1..n_period) 1024*y^i */
- runnable_contrib = __compute_runnable_contrib(periods);
- if (runnable)
- sa->runnable_avg_sum += runnable_contrib;
- if (running)
- sa->running_avg_sum += runnable_contrib * scale_freq
- >> SCHED_CAPACITY_SHIFT;
- sa->avg_period += runnable_contrib;
- }
-
- /* Remainder of delta accrued against u_0` */
- if (runnable)
- sa->runnable_avg_sum += delta;
- if (running)
- sa->running_avg_sum += delta * scale_freq
- >> SCHED_CAPACITY_SHIFT;
- sa->avg_period += delta;
+ runnable_sum += se->avg.load_sum;
+ runnable_sum = min_t(long, runnable_sum, divider);
+ } else {
+ /*
+ * Estimate the new unweighted runnable_sum of the gcfs_rq by
+ * assuming all tasks are equally runnable.
+ */
+ if (scale_load_down(gcfs_rq->load.weight)) {
+ load_sum = div_u64(gcfs_rq->avg.load_sum,
+ scale_load_down(gcfs_rq->load.weight));
+ }
- return decayed;
+ /* But make sure to not inflate se's runnable */
+ runnable_sum = min(se->avg.load_sum, load_sum);
+ }
+
+ /*
+ * runnable_sum can't be lower than running_sum
+ * Rescale running sum to be in the same range as runnable sum
+ * running_sum is in [0 : LOAD_AVG_MAX << SCHED_CAPACITY_SHIFT]
+ * runnable_sum is in [0 : LOAD_AVG_MAX]
+ */
+ running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT;
+ runnable_sum = max(runnable_sum, running_sum);
+
+ load_sum = se_weight(se) * runnable_sum;
+ load_avg = div_u64(load_sum, divider);
+
+ delta_avg = load_avg - se->avg.load_avg;
+ if (!delta_avg)
+ return;
+
+ delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
+
+ se->avg.load_sum = runnable_sum;
+ se->avg.load_avg = load_avg;
+ add_positive(&cfs_rq->avg.load_avg, delta_avg);
+ add_positive(&cfs_rq->avg.load_sum, delta_sum);
+ /* See update_cfs_rq_load_avg() */
+ cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
+ cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
}
-/* Synchronize an entity's decay with its parenting cfs_rq.*/
-static inline u64 __synchronize_entity_decay(struct sched_entity *se)
+static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
{
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- u64 decays = atomic64_read(&cfs_rq->decay_counter);
+ cfs_rq->propagate = 1;
+ cfs_rq->prop_runnable_sum += runnable_sum;
+}
+
+/* Update task and its cfs_rq load average */
+static inline int propagate_entity_load_avg(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq, *gcfs_rq;
- decays -= se->avg.decay_count;
- se->avg.decay_count = 0;
- if (!decays)
+ if (entity_is_task(se))
return 0;
- se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
- se->avg.utilization_avg_contrib =
- decay_load(se->avg.utilization_avg_contrib, decays);
+ gcfs_rq = group_cfs_rq(se);
+ if (!gcfs_rq->propagate)
+ return 0;
- return decays;
-}
+ gcfs_rq->propagate = 0;
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
- int force_update)
-{
- struct task_group *tg = cfs_rq->tg;
- long tg_contrib;
+ cfs_rq = cfs_rq_of(se);
- tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
- tg_contrib -= cfs_rq->tg_load_contrib;
+ add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum);
- if (!tg_contrib)
- return;
+ update_tg_cfs_util(cfs_rq, se, gcfs_rq);
+ update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
+ update_tg_cfs_load(cfs_rq, se, gcfs_rq);
- if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
- atomic_long_add(tg_contrib, &tg->load_avg);
- cfs_rq->tg_load_contrib += tg_contrib;
- }
+ trace_pelt_cfs_tp(cfs_rq);
+ trace_pelt_se_tp(se);
+
+ return 1;
}
/*
- * Aggregate cfs_rq runnable averages into an equivalent task_group
- * representation for computing load contributions.
+ * Check if we need to update the load and the utilization of a blocked
+ * group_entity:
*/
-static inline void __update_tg_runnable_avg(struct sched_avg *sa,
- struct cfs_rq *cfs_rq)
+static inline bool skip_blocked_update(struct sched_entity *se)
{
- struct task_group *tg = cfs_rq->tg;
- long contrib;
+ struct cfs_rq *gcfs_rq = group_cfs_rq(se);
+
+ /*
+ * If sched_entity still have not zero load or utilization, we have to
+ * decay it:
+ */
+ if (se->avg.load_avg || se->avg.util_avg)
+ return false;
- /* The fraction of a cpu used by this cfs_rq */
- contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT,
- sa->avg_period + 1);
- contrib -= cfs_rq->tg_runnable_contrib;
+ /*
+ * If there is a pending propagation, we have to update the load and
+ * the utilization of the sched_entity:
+ */
+ if (gcfs_rq->propagate)
+ return false;
- if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
- atomic_add(contrib, &tg->runnable_avg);
- cfs_rq->tg_runnable_contrib += contrib;
- }
+ /*
+ * Otherwise, the load and the utilization of the sched_entity is
+ * already zero and there is no pending propagation, so it will be a
+ * waste of time to try to decay it:
+ */
+ return true;
}
-static inline void __update_group_entity_contrib(struct sched_entity *se)
+#else /* !CONFIG_FAIR_GROUP_SCHED: */
+
+static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {}
+
+static inline void clear_tg_offline_cfs_rqs(struct rq *rq) {}
+
+static inline int propagate_entity_load_avg(struct sched_entity *se)
{
- struct cfs_rq *cfs_rq = group_cfs_rq(se);
- struct task_group *tg = cfs_rq->tg;
- int runnable_avg;
+ return 0;
+}
- u64 contrib;
+static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {}
- contrib = cfs_rq->tg_load_contrib * tg->shares;
- se->avg.load_avg_contrib = div_u64(contrib,
- atomic_long_read(&tg->load_avg) + 1);
+#endif /* !CONFIG_FAIR_GROUP_SCHED */
+
+#ifdef CONFIG_NO_HZ_COMMON
+static inline void migrate_se_pelt_lag(struct sched_entity *se)
+{
+ u64 throttled = 0, now, lut;
+ struct cfs_rq *cfs_rq;
+ struct rq *rq;
+ bool is_idle;
+
+ if (load_avg_is_decayed(&se->avg))
+ return;
+
+ cfs_rq = cfs_rq_of(se);
+ rq = rq_of(cfs_rq);
+
+ rcu_read_lock();
+ is_idle = is_idle_task(rcu_dereference(rq->curr));
+ rcu_read_unlock();
+
+ /*
+ * The lag estimation comes with a cost we don't want to pay all the
+ * time. Hence, limiting to the case where the source CPU is idle and
+ * we know we are at the greatest risk to have an outdated clock.
+ */
+ if (!is_idle)
+ return;
/*
- * For group entities we need to compute a correction term in the case
- * that they are consuming <1 cpu so that we would contribute the same
- * load as a task of equal weight.
+ * Estimated "now" is: last_update_time + cfs_idle_lag + rq_idle_lag, where:
+ *
+ * last_update_time (the cfs_rq's last_update_time)
+ * = cfs_rq_clock_pelt()@cfs_rq_idle
+ * = rq_clock_pelt()@cfs_rq_idle
+ * - cfs->throttled_clock_pelt_time@cfs_rq_idle
*
- * Explicitly co-ordinating this measurement would be expensive, but
- * fortunately the sum of each cpus contribution forms a usable
- * lower-bound on the true value.
+ * cfs_idle_lag (delta between rq's update and cfs_rq's update)
+ * = rq_clock_pelt()@rq_idle - rq_clock_pelt()@cfs_rq_idle
*
- * Consider the aggregate of 2 contributions. Either they are disjoint
- * (and the sum represents true value) or they are disjoint and we are
- * understating by the aggregate of their overlap.
+ * rq_idle_lag (delta between now and rq's update)
+ * = sched_clock_cpu() - rq_clock()@rq_idle
*
- * Extending this to N cpus, for a given overlap, the maximum amount we
- * understand is then n_i(n_i+1)/2 * w_i where n_i is the number of
- * cpus that overlap for this interval and w_i is the interval width.
+ * We can then write:
*
- * On a small machine; the first term is well-bounded which bounds the
- * total error since w_i is a subset of the period. Whereas on a
- * larger machine, while this first term can be larger, if w_i is the
- * of consequential size guaranteed to see n_i*w_i quickly converge to
- * our upper bound of 1-cpu.
+ * now = rq_clock_pelt()@rq_idle - cfs->throttled_clock_pelt_time +
+ * sched_clock_cpu() - rq_clock()@rq_idle
+ * Where:
+ * rq_clock_pelt()@rq_idle is rq->clock_pelt_idle
+ * rq_clock()@rq_idle is rq->clock_idle
+ * cfs->throttled_clock_pelt_time@cfs_rq_idle
+ * is cfs_rq->throttled_pelt_idle
*/
- runnable_avg = atomic_read(&tg->runnable_avg);
- if (runnable_avg < NICE_0_LOAD) {
- se->avg.load_avg_contrib *= runnable_avg;
- se->avg.load_avg_contrib >>= NICE_0_SHIFT;
- }
-}
-static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
-{
- __update_entity_runnable_avg(rq_clock_task(rq), cpu_of(rq), &rq->avg,
- runnable, runnable);
- __update_tg_runnable_avg(&rq->avg, &rq->cfs);
-}
-#else /* CONFIG_FAIR_GROUP_SCHED */
-static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
- int force_update) {}
-static inline void __update_tg_runnable_avg(struct sched_avg *sa,
- struct cfs_rq *cfs_rq) {}
-static inline void __update_group_entity_contrib(struct sched_entity *se) {}
-static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
-#endif /* CONFIG_FAIR_GROUP_SCHED */
+#ifdef CONFIG_CFS_BANDWIDTH
+ throttled = u64_u32_load(cfs_rq->throttled_pelt_idle);
+ /* The clock has been stopped for throttling */
+ if (throttled == U64_MAX)
+ return;
+#endif
+ now = u64_u32_load(rq->clock_pelt_idle);
+ /*
+ * Paired with _update_idle_rq_clock_pelt(). It ensures at the worst case
+ * is observed the old clock_pelt_idle value and the new clock_idle,
+ * which lead to an underestimation. The opposite would lead to an
+ * overestimation.
+ */
+ smp_rmb();
+ lut = cfs_rq_last_update_time(cfs_rq);
-static inline void __update_task_entity_contrib(struct sched_entity *se)
-{
- u32 contrib;
+ now -= throttled;
+ if (now < lut)
+ /*
+ * cfs_rq->avg.last_update_time is more recent than our
+ * estimation, let's use it.
+ */
+ now = lut;
+ else
+ now += sched_clock_cpu(cpu_of(rq)) - u64_u32_load(rq->clock_idle);
- /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
- contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
- contrib /= (se->avg.avg_period + 1);
- se->avg.load_avg_contrib = scale_load(contrib);
+ __update_load_avg_blocked_se(now, se);
}
+#else /* !CONFIG_NO_HZ_COMMON: */
+static void migrate_se_pelt_lag(struct sched_entity *se) {}
+#endif /* !CONFIG_NO_HZ_COMMON */
-/* Compute the current contribution to load_avg by se, return any delta */
-static long __update_entity_load_avg_contrib(struct sched_entity *se)
-{
- long old_contrib = se->avg.load_avg_contrib;
+/**
+ * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
+ * @now: current time, as per cfs_rq_clock_pelt()
+ * @cfs_rq: cfs_rq to update
+ *
+ * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
+ * avg. The immediate corollary is that all (fair) tasks must be attached.
+ *
+ * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
+ *
+ * Return: true if the load decayed or we removed load.
+ *
+ * Since both these conditions indicate a changed cfs_rq->avg.load we should
+ * call update_tg_load_avg() when this function returns true.
+ */
+static inline int
+update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
+{
+ unsigned long removed_load = 0, removed_util = 0, removed_runnable = 0;
+ struct sched_avg *sa = &cfs_rq->avg;
+ int decayed = 0;
+
+ if (cfs_rq->removed.nr) {
+ unsigned long r;
+ u32 divider = get_pelt_divider(&cfs_rq->avg);
+
+ raw_spin_lock(&cfs_rq->removed.lock);
+ swap(cfs_rq->removed.util_avg, removed_util);
+ swap(cfs_rq->removed.load_avg, removed_load);
+ swap(cfs_rq->removed.runnable_avg, removed_runnable);
+ cfs_rq->removed.nr = 0;
+ raw_spin_unlock(&cfs_rq->removed.lock);
+
+ r = removed_load;
+ sub_positive(&sa->load_avg, r);
+ sub_positive(&sa->load_sum, r * divider);
+ /* See sa->util_sum below */
+ sa->load_sum = max_t(u32, sa->load_sum, sa->load_avg * PELT_MIN_DIVIDER);
+
+ r = removed_util;
+ sub_positive(&sa->util_avg, r);
+ sub_positive(&sa->util_sum, r * divider);
+ /*
+ * Because of rounding, se->util_sum might ends up being +1 more than
+ * cfs->util_sum. Although this is not a problem by itself, detaching
+ * a lot of tasks with the rounding problem between 2 updates of
+ * util_avg (~1ms) can make cfs->util_sum becoming null whereas
+ * cfs_util_avg is not.
+ * Check that util_sum is still above its lower bound for the new
+ * util_avg. Given that period_contrib might have moved since the last
+ * sync, we are only sure that util_sum must be above or equal to
+ * util_avg * minimum possible divider
+ */
+ sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDER);
- if (entity_is_task(se)) {
- __update_task_entity_contrib(se);
- } else {
- __update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
- __update_group_entity_contrib(se);
+ r = removed_runnable;
+ sub_positive(&sa->runnable_avg, r);
+ sub_positive(&sa->runnable_sum, r * divider);
+ /* See sa->util_sum above */
+ sa->runnable_sum = max_t(u32, sa->runnable_sum,
+ sa->runnable_avg * PELT_MIN_DIVIDER);
+
+ /*
+ * removed_runnable is the unweighted version of removed_load so we
+ * can use it to estimate removed_load_sum.
+ */
+ add_tg_cfs_propagate(cfs_rq,
+ -(long)(removed_runnable * divider) >> SCHED_CAPACITY_SHIFT);
+
+ decayed = 1;
}
- return se->avg.load_avg_contrib - old_contrib;
+ decayed |= __update_load_avg_cfs_rq(now, cfs_rq);
+ u64_u32_store_copy(sa->last_update_time,
+ cfs_rq->last_update_time_copy,
+ sa->last_update_time);
+ return decayed;
}
-
-static inline void __update_task_entity_utilization(struct sched_entity *se)
+/**
+ * attach_entity_load_avg - attach this entity to its cfs_rq load avg
+ * @cfs_rq: cfs_rq to attach to
+ * @se: sched_entity to attach
+ *
+ * Must call update_cfs_rq_load_avg() before this, since we rely on
+ * cfs_rq->avg.last_update_time being current.
+ */
+static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- u32 contrib;
+ /*
+ * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
+ * See ___update_load_avg() for details.
+ */
+ u32 divider = get_pelt_divider(&cfs_rq->avg);
- /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
- contrib = se->avg.running_avg_sum * scale_load_down(SCHED_LOAD_SCALE);
- contrib /= (se->avg.avg_period + 1);
- se->avg.utilization_avg_contrib = scale_load(contrib);
-}
+ /*
+ * When we attach the @se to the @cfs_rq, we must align the decay
+ * window because without that, really weird and wonderful things can
+ * happen.
+ *
+ * XXX illustrate
+ */
+ se->avg.last_update_time = cfs_rq->avg.last_update_time;
+ se->avg.period_contrib = cfs_rq->avg.period_contrib;
-static long __update_entity_utilization_avg_contrib(struct sched_entity *se)
-{
- long old_contrib = se->avg.utilization_avg_contrib;
+ /*
+ * Hell(o) Nasty stuff.. we need to recompute _sum based on the new
+ * period_contrib. This isn't strictly correct, but since we're
+ * entirely outside of the PELT hierarchy, nobody cares if we truncate
+ * _sum a little.
+ */
+ se->avg.util_sum = se->avg.util_avg * divider;
- if (entity_is_task(se))
- __update_task_entity_utilization(se);
+ se->avg.runnable_sum = se->avg.runnable_avg * divider;
+
+ se->avg.load_sum = se->avg.load_avg * divider;
+ if (se_weight(se) < se->avg.load_sum)
+ se->avg.load_sum = div_u64(se->avg.load_sum, se_weight(se));
else
- se->avg.utilization_avg_contrib =
- group_cfs_rq(se)->utilization_load_avg;
+ se->avg.load_sum = 1;
+
+ enqueue_load_avg(cfs_rq, se);
+ cfs_rq->avg.util_avg += se->avg.util_avg;
+ cfs_rq->avg.util_sum += se->avg.util_sum;
+ cfs_rq->avg.runnable_avg += se->avg.runnable_avg;
+ cfs_rq->avg.runnable_sum += se->avg.runnable_sum;
- return se->avg.utilization_avg_contrib - old_contrib;
+ add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
+
+ cfs_rq_util_change(cfs_rq, 0);
+
+ trace_pelt_cfs_tp(cfs_rq);
}
-static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
- long load_contrib)
+/**
+ * detach_entity_load_avg - detach this entity from its cfs_rq load avg
+ * @cfs_rq: cfs_rq to detach from
+ * @se: sched_entity to detach
+ *
+ * Must call update_cfs_rq_load_avg() before this, since we rely on
+ * cfs_rq->avg.last_update_time being current.
+ */
+static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- if (likely(load_contrib < cfs_rq->blocked_load_avg))
- cfs_rq->blocked_load_avg -= load_contrib;
- else
- cfs_rq->blocked_load_avg = 0;
+ dequeue_load_avg(cfs_rq, se);
+ sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
+ sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
+ /* See update_cfs_rq_load_avg() */
+ cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
+ cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);
+
+ sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg);
+ sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum);
+ /* See update_cfs_rq_load_avg() */
+ cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum,
+ cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER);
+
+ add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
+
+ cfs_rq_util_change(cfs_rq, 0);
+
+ trace_pelt_cfs_tp(cfs_rq);
}
-static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
+/*
+ * Optional action to be done while updating the load average
+ */
+#define UPDATE_TG 0x1
+#define SKIP_AGE_LOAD 0x2
+#define DO_ATTACH 0x4
+#define DO_DETACH 0x8
-/* Update a sched_entity's runnable average */
-static inline void update_entity_load_avg(struct sched_entity *se,
- int update_cfs_rq)
+/* Update task and its cfs_rq load average */
+static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- long contrib_delta, utilization_delta;
- int cpu = cpu_of(rq_of(cfs_rq));
- u64 now;
+ u64 now = cfs_rq_clock_pelt(cfs_rq);
+ int decayed;
/*
- * For a group entity we need to use their owned cfs_rq_clock_task() in
- * case they are the parent of a throttled hierarchy.
+ * Track task load average for carrying it to new CPU after migrated, and
+ * track group sched_entity load average for task_h_load calculation in migration
*/
- if (entity_is_task(se))
- now = cfs_rq_clock_task(cfs_rq);
- else
- now = cfs_rq_clock_task(group_cfs_rq(se));
+ if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
+ __update_load_avg_se(now, cfs_rq, se);
- if (!__update_entity_runnable_avg(now, cpu, &se->avg, se->on_rq,
- cfs_rq->curr == se))
- return;
+ decayed = update_cfs_rq_load_avg(now, cfs_rq);
+ decayed |= propagate_entity_load_avg(se);
- contrib_delta = __update_entity_load_avg_contrib(se);
- utilization_delta = __update_entity_utilization_avg_contrib(se);
+ if (!se->avg.last_update_time && (flags & DO_ATTACH)) {
- if (!update_cfs_rq)
- return;
+ /*
+ * DO_ATTACH means we're here from enqueue_entity().
+ * !last_update_time means we've passed through
+ * migrate_task_rq_fair() indicating we migrated.
+ *
+ * IOW we're enqueueing a task on a new CPU.
+ */
+ attach_entity_load_avg(cfs_rq, se);
+ update_tg_load_avg(cfs_rq);
- if (se->on_rq) {
- cfs_rq->runnable_load_avg += contrib_delta;
- cfs_rq->utilization_load_avg += utilization_delta;
- } else {
- subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
+ } else if (flags & DO_DETACH) {
+ /*
+ * DO_DETACH means we're here from dequeue_entity()
+ * and we are migrating task out of the CPU.
+ */
+ detach_entity_load_avg(cfs_rq, se);
+ update_tg_load_avg(cfs_rq);
+ } else if (decayed) {
+ cfs_rq_util_change(cfs_rq, 0);
+
+ if (flags & UPDATE_TG)
+ update_tg_load_avg(cfs_rq);
}
}
/*
- * Decay the load contributed by all blocked children and account this so that
- * their contribution may appropriately discounted when they wake up.
+ * Synchronize entity load avg of dequeued entity without locking
+ * the previous rq.
*/
-static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
+static void sync_entity_load_avg(struct sched_entity *se)
{
- u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
- u64 decays;
-
- decays = now - cfs_rq->last_decay;
- if (!decays && !force_update)
- return;
-
- if (atomic_long_read(&cfs_rq->removed_load)) {
- unsigned long removed_load;
- removed_load = atomic_long_xchg(&cfs_rq->removed_load, 0);
- subtract_blocked_load_contrib(cfs_rq, removed_load);
- }
-
- if (decays) {
- cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
- decays);
- atomic64_add(decays, &cfs_rq->decay_counter);
- cfs_rq->last_decay = now;
- }
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ u64 last_update_time;
- __update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
+ last_update_time = cfs_rq_last_update_time(cfs_rq);
+ __update_load_avg_blocked_se(last_update_time, se);
}
-/* Add the load generated by se into cfs_rq's child load-average */
-static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
- struct sched_entity *se,
- int wakeup)
+/*
+ * Task first catches up with cfs_rq, and then subtract
+ * itself from the cfs_rq (task must be off the queue now).
+ */
+static void remove_entity_load_avg(struct sched_entity *se)
{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ unsigned long flags;
+
/*
- * We track migrations using entity decay_count <= 0, on a wake-up
- * migration we use a negative decay count to track the remote decays
- * accumulated while sleeping.
- *
- * Newly forked tasks are enqueued with se->avg.decay_count == 0, they
- * are seen by enqueue_entity_load_avg() as a migration with an already
- * constructed load_avg_contrib.
+ * tasks cannot exit without having gone through wake_up_new_task() ->
+ * enqueue_task_fair() which will have added things to the cfs_rq,
+ * so we can remove unconditionally.
*/
- if (unlikely(se->avg.decay_count <= 0)) {
- se->avg.last_runnable_update = rq_clock_task(rq_of(cfs_rq));
- if (se->avg.decay_count) {
- /*
- * In a wake-up migration we have to approximate the
- * time sleeping. This is because we can't synchronize
- * clock_task between the two cpus, and it is not
- * guaranteed to be read-safe. Instead, we can
- * approximate this using our carried decays, which are
- * explicitly atomically readable.
- */
- se->avg.last_runnable_update -= (-se->avg.decay_count)
- << 20;
- update_entity_load_avg(se, 0);
- /* Indicate that we're now synchronized and on-rq */
- se->avg.decay_count = 0;
- }
- wakeup = 0;
- } else {
- __synchronize_entity_decay(se);
- }
- /* migrated tasks did not contribute to our blocked load */
- if (wakeup) {
- subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
- update_entity_load_avg(se, 0);
- }
+ sync_entity_load_avg(se);
- cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
- cfs_rq->utilization_load_avg += se->avg.utilization_avg_contrib;
- /* we force update consideration on load-balancer moves */
- update_cfs_rq_blocked_load(cfs_rq, !wakeup);
+ raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags);
+ ++cfs_rq->removed.nr;
+ cfs_rq->removed.util_avg += se->avg.util_avg;
+ cfs_rq->removed.load_avg += se->avg.load_avg;
+ cfs_rq->removed.runnable_avg += se->avg.runnable_avg;
+ raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
}
-/*
- * Remove se's load from this cfs_rq child load-average, if the entity is
- * transitioning to a blocked state we track its projected decay using
- * blocked_load_avg.
- */
-static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
- struct sched_entity *se,
- int sleep)
+static inline unsigned long cfs_rq_runnable_avg(struct cfs_rq *cfs_rq)
{
- update_entity_load_avg(se, 1);
- /* we force update consideration on load-balancer moves */
- update_cfs_rq_blocked_load(cfs_rq, !sleep);
+ return cfs_rq->avg.runnable_avg;
+}
- cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
- cfs_rq->utilization_load_avg -= se->avg.utilization_avg_contrib;
- if (sleep) {
- cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
- se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
- } /* migrations, e.g. sleep=0 leave decay_count == 0 */
+static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
+{
+ return cfs_rq->avg.load_avg;
}
-/*
- * Update the rq's load with the elapsed running time before entering
- * idle. if the last scheduled task is not a CFS task, idle_enter will
- * be the only way to update the runnable statistic.
- */
-void idle_enter_fair(struct rq *this_rq)
+static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf);
+
+static inline unsigned long task_util(struct task_struct *p)
{
- update_rq_runnable_avg(this_rq, 1);
+ return READ_ONCE(p->se.avg.util_avg);
}
-/*
- * Update the rq's load with the elapsed idle time before a task is
- * scheduled. if the newly scheduled task is not a CFS task, idle_exit will
- * be the only way to update the runnable statistic.
- */
-void idle_exit_fair(struct rq *this_rq)
+static inline unsigned long task_runnable(struct task_struct *p)
+{
+ return READ_ONCE(p->se.avg.runnable_avg);
+}
+
+static inline unsigned long _task_util_est(struct task_struct *p)
+{
+ return READ_ONCE(p->se.avg.util_est) & ~UTIL_AVG_UNCHANGED;
+}
+
+static inline unsigned long task_util_est(struct task_struct *p)
{
- update_rq_runnable_avg(this_rq, 0);
+ return max(task_util(p), _task_util_est(p));
}
-static int idle_balance(struct rq *this_rq);
+static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
+ struct task_struct *p)
+{
+ unsigned int enqueued;
-#else /* CONFIG_SMP */
+ if (!sched_feat(UTIL_EST))
+ return;
-static inline void update_entity_load_avg(struct sched_entity *se,
- int update_cfs_rq) {}
-static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
-static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
- struct sched_entity *se,
- int wakeup) {}
-static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
- struct sched_entity *se,
- int sleep) {}
-static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
- int force_update) {}
+ /* Update root cfs_rq's estimated utilization */
+ enqueued = cfs_rq->avg.util_est;
+ enqueued += _task_util_est(p);
+ WRITE_ONCE(cfs_rq->avg.util_est, enqueued);
-static inline int idle_balance(struct rq *rq)
+ trace_sched_util_est_cfs_tp(cfs_rq);
+}
+
+static inline void util_est_dequeue(struct cfs_rq *cfs_rq,
+ struct task_struct *p)
{
- return 0;
+ unsigned int enqueued;
+
+ if (!sched_feat(UTIL_EST))
+ return;
+
+ /* Update root cfs_rq's estimated utilization */
+ enqueued = cfs_rq->avg.util_est;
+ enqueued -= min_t(unsigned int, enqueued, _task_util_est(p));
+ WRITE_ONCE(cfs_rq->avg.util_est, enqueued);
+
+ trace_sched_util_est_cfs_tp(cfs_rq);
}
-#endif /* CONFIG_SMP */
+#define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100)
-static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static inline void util_est_update(struct cfs_rq *cfs_rq,
+ struct task_struct *p,
+ bool task_sleep)
{
-#ifdef CONFIG_SCHEDSTATS
- struct task_struct *tsk = NULL;
+ unsigned int ewma, dequeued, last_ewma_diff;
- if (entity_is_task(se))
- tsk = task_of(se);
+ if (!sched_feat(UTIL_EST))
+ return;
- if (se->statistics.sleep_start) {
- u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;
+ /*
+ * Skip update of task's estimated utilization when the task has not
+ * yet completed an activation, e.g. being migrated.
+ */
+ if (!task_sleep)
+ return;
- if ((s64)delta < 0)
- delta = 0;
+ /* Get current estimate of utilization */
+ ewma = READ_ONCE(p->se.avg.util_est);
- if (unlikely(delta > se->statistics.sleep_max))
- se->statistics.sleep_max = delta;
+ /*
+ * If the PELT values haven't changed since enqueue time,
+ * skip the util_est update.
+ */
+ if (ewma & UTIL_AVG_UNCHANGED)
+ return;
- se->statistics.sleep_start = 0;
- se->statistics.sum_sleep_runtime += delta;
+ /* Get utilization at dequeue */
+ dequeued = task_util(p);
- if (tsk) {
- account_scheduler_latency(tsk, delta >> 10, 1);
- trace_sched_stat_sleep(tsk, delta);
- }
+ /*
+ * Reset EWMA on utilization increases, the moving average is used only
+ * to smooth utilization decreases.
+ */
+ if (ewma <= dequeued) {
+ ewma = dequeued;
+ goto done;
}
- if (se->statistics.block_start) {
- u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;
- if ((s64)delta < 0)
- delta = 0;
+ /*
+ * Skip update of task's estimated utilization when its members are
+ * already ~1% close to its last activation value.
+ */
+ last_ewma_diff = ewma - dequeued;
+ if (last_ewma_diff < UTIL_EST_MARGIN)
+ goto done;
- if (unlikely(delta > se->statistics.block_max))
- se->statistics.block_max = delta;
+ /*
+ * To avoid underestimate of task utilization, skip updates of EWMA if
+ * we cannot grant that thread got all CPU time it wanted.
+ */
+ if ((dequeued + UTIL_EST_MARGIN) < task_runnable(p))
+ goto done;
- se->statistics.block_start = 0;
- se->statistics.sum_sleep_runtime += delta;
- if (tsk) {
- if (tsk->in_iowait) {
- se->statistics.iowait_sum += delta;
- se->statistics.iowait_count++;
- trace_sched_stat_iowait(tsk, delta);
- }
+ /*
+ * Update Task's estimated utilization
+ *
+ * When *p completes an activation we can consolidate another sample
+ * of the task size. This is done by using this value to update the
+ * Exponential Weighted Moving Average (EWMA):
+ *
+ * ewma(t) = w * task_util(p) + (1-w) * ewma(t-1)
+ * = w * task_util(p) + ewma(t-1) - w * ewma(t-1)
+ * = w * (task_util(p) - ewma(t-1)) + ewma(t-1)
+ * = w * ( -last_ewma_diff ) + ewma(t-1)
+ * = w * (-last_ewma_diff + ewma(t-1) / w)
+ *
+ * Where 'w' is the weight of new samples, which is configured to be
+ * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
+ */
+ ewma <<= UTIL_EST_WEIGHT_SHIFT;
+ ewma -= last_ewma_diff;
+ ewma >>= UTIL_EST_WEIGHT_SHIFT;
+done:
+ ewma |= UTIL_AVG_UNCHANGED;
+ WRITE_ONCE(p->se.avg.util_est, ewma);
- trace_sched_stat_blocked(tsk, delta);
+ trace_sched_util_est_se_tp(&p->se);
+}
- /*
- * Blocking time is in units of nanosecs, so shift by
- * 20 to get a milliseconds-range estimation of the
- * amount of time that the task spent sleeping:
- */
- if (unlikely(prof_on == SLEEP_PROFILING)) {
- profile_hits(SLEEP_PROFILING,
- (void *)get_wchan(tsk),
- delta >> 20);
- }
- account_scheduler_latency(tsk, delta >> 10, 0);
- }
- }
-#endif
+static inline unsigned long get_actual_cpu_capacity(int cpu)
+{
+ unsigned long capacity = arch_scale_cpu_capacity(cpu);
+
+ capacity -= max(hw_load_avg(cpu_rq(cpu)), cpufreq_get_pressure(cpu));
+
+ return capacity;
}
-static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static inline int util_fits_cpu(unsigned long util,
+ unsigned long uclamp_min,
+ unsigned long uclamp_max,
+ int cpu)
{
-#ifdef CONFIG_SCHED_DEBUG
- s64 d = se->vruntime - cfs_rq->min_vruntime;
+ unsigned long capacity = capacity_of(cpu);
+ unsigned long capacity_orig;
+ bool fits, uclamp_max_fits;
- if (d < 0)
- d = -d;
+ /*
+ * Check if the real util fits without any uclamp boost/cap applied.
+ */
+ fits = fits_capacity(util, capacity);
- if (d > 3*sysctl_sched_latency)
- schedstat_inc(cfs_rq, nr_spread_over);
-#endif
+ if (!uclamp_is_used())
+ return fits;
+
+ /*
+ * We must use arch_scale_cpu_capacity() for comparing against uclamp_min and
+ * uclamp_max. We only care about capacity pressure (by using
+ * capacity_of()) for comparing against the real util.
+ *
+ * If a task is boosted to 1024 for example, we don't want a tiny
+ * pressure to skew the check whether it fits a CPU or not.
+ *
+ * Similarly if a task is capped to arch_scale_cpu_capacity(little_cpu), it
+ * should fit a little cpu even if there's some pressure.
+ *
+ * Only exception is for HW or cpufreq pressure since it has a direct impact
+ * on available OPP of the system.
+ *
+ * We honour it for uclamp_min only as a drop in performance level
+ * could result in not getting the requested minimum performance level.
+ *
+ * For uclamp_max, we can tolerate a drop in performance level as the
+ * goal is to cap the task. So it's okay if it's getting less.
+ */
+ capacity_orig = arch_scale_cpu_capacity(cpu);
+
+ /*
+ * We want to force a task to fit a cpu as implied by uclamp_max.
+ * But we do have some corner cases to cater for..
+ *
+ *
+ * C=z
+ * | ___
+ * | C=y | |
+ * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max
+ * | C=x | | | |
+ * | ___ | | | |
+ * | | | | | | | (util somewhere in this region)
+ * | | | | | | |
+ * | | | | | | |
+ * +----------------------------------------
+ * CPU0 CPU1 CPU2
+ *
+ * In the above example if a task is capped to a specific performance
+ * point, y, then when:
+ *
+ * * util = 80% of x then it does not fit on CPU0 and should migrate
+ * to CPU1
+ * * util = 80% of y then it is forced to fit on CPU1 to honour
+ * uclamp_max request.
+ *
+ * which is what we're enforcing here. A task always fits if
+ * uclamp_max <= capacity_orig. But when uclamp_max > capacity_orig,
+ * the normal upmigration rules should withhold still.
+ *
+ * Only exception is when we are on max capacity, then we need to be
+ * careful not to block overutilized state. This is so because:
+ *
+ * 1. There's no concept of capping at max_capacity! We can't go
+ * beyond this performance level anyway.
+ * 2. The system is being saturated when we're operating near
+ * max capacity, it doesn't make sense to block overutilized.
+ */
+ uclamp_max_fits = (capacity_orig == SCHED_CAPACITY_SCALE) && (uclamp_max == SCHED_CAPACITY_SCALE);
+ uclamp_max_fits = !uclamp_max_fits && (uclamp_max <= capacity_orig);
+ fits = fits || uclamp_max_fits;
+
+ /*
+ *
+ * C=z
+ * | ___ (region a, capped, util >= uclamp_max)
+ * | C=y | |
+ * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max
+ * | C=x | | | |
+ * | ___ | | | | (region b, uclamp_min <= util <= uclamp_max)
+ * |_ _ _|_ _|_ _ _ _| _ | _ _ _| _ | _ _ _ _ _ uclamp_min
+ * | | | | | | |
+ * | | | | | | | (region c, boosted, util < uclamp_min)
+ * +----------------------------------------
+ * CPU0 CPU1 CPU2
+ *
+ * a) If util > uclamp_max, then we're capped, we don't care about
+ * actual fitness value here. We only care if uclamp_max fits
+ * capacity without taking margin/pressure into account.
+ * See comment above.
+ *
+ * b) If uclamp_min <= util <= uclamp_max, then the normal
+ * fits_capacity() rules apply. Except we need to ensure that we
+ * enforce we remain within uclamp_max, see comment above.
+ *
+ * c) If util < uclamp_min, then we are boosted. Same as (b) but we
+ * need to take into account the boosted value fits the CPU without
+ * taking margin/pressure into account.
+ *
+ * Cases (a) and (b) are handled in the 'fits' variable already. We
+ * just need to consider an extra check for case (c) after ensuring we
+ * handle the case uclamp_min > uclamp_max.
+ */
+ uclamp_min = min(uclamp_min, uclamp_max);
+ if (fits && (util < uclamp_min) &&
+ (uclamp_min > get_actual_cpu_capacity(cpu)))
+ return -1;
+
+ return fits;
+}
+
+static inline int task_fits_cpu(struct task_struct *p, int cpu)
+{
+ unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN);
+ unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX);
+ unsigned long util = task_util_est(p);
+ /*
+ * Return true only if the cpu fully fits the task requirements, which
+ * include the utilization but also the performance hints.
+ */
+ return (util_fits_cpu(util, uclamp_min, uclamp_max, cpu) > 0);
+}
+
+static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
+{
+ int cpu = cpu_of(rq);
+
+ if (!sched_asym_cpucap_active())
+ return;
+
+ /*
+ * Affinity allows us to go somewhere higher? Or are we on biggest
+ * available CPU already? Or do we fit into this CPU ?
+ */
+ if (!p || (p->nr_cpus_allowed == 1) ||
+ (arch_scale_cpu_capacity(cpu) == p->max_allowed_capacity) ||
+ task_fits_cpu(p, cpu)) {
+
+ rq->misfit_task_load = 0;
+ return;
+ }
+
+ /*
+ * Make sure that misfit_task_load will not be null even if
+ * task_h_load() returns 0.
+ */
+ rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1);
+}
+
+void __setparam_fair(struct task_struct *p, const struct sched_attr *attr)
+{
+ struct sched_entity *se = &p->se;
+
+ p->static_prio = NICE_TO_PRIO(attr->sched_nice);
+ if (attr->sched_runtime) {
+ se->custom_slice = 1;
+ se->slice = clamp_t(u64, attr->sched_runtime,
+ NSEC_PER_MSEC/10, /* HZ=1000 * 10 */
+ NSEC_PER_MSEC*100); /* HZ=100 / 10 */
+ } else {
+ se->custom_slice = 0;
+ se->slice = sysctl_sched_base_slice;
+ }
}
static void
-place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
+place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
- u64 vruntime = cfs_rq->min_vruntime;
+ u64 vslice, vruntime = avg_vruntime(cfs_rq);
+ s64 lag = 0;
+
+ if (!se->custom_slice)
+ se->slice = sysctl_sched_base_slice;
+ vslice = calc_delta_fair(se->slice, se);
/*
- * The 'current' period is already promised to the current tasks,
- * however the extra weight of the new task will slow them down a
- * little, place the new task so that it fits in the slot that
- * stays open at the end.
+ * Due to how V is constructed as the weighted average of entities,
+ * adding tasks with positive lag, or removing tasks with negative lag
+ * will move 'time' backwards, this can screw around with the lag of
+ * other tasks.
+ *
+ * EEVDF: placement strategy #1 / #2
*/
- if (initial && sched_feat(START_DEBIT))
- vruntime += sched_vslice(cfs_rq, se);
+ if (sched_feat(PLACE_LAG) && cfs_rq->nr_queued && se->vlag) {
+ struct sched_entity *curr = cfs_rq->curr;
+ unsigned long load;
- /* sleeps up to a single latency don't count. */
- if (!initial) {
- unsigned long thresh = sysctl_sched_latency;
+ lag = se->vlag;
/*
- * Halve their sleep time's effect, to allow
- * for a gentler effect of sleepers:
+ * If we want to place a task and preserve lag, we have to
+ * consider the effect of the new entity on the weighted
+ * average and compensate for this, otherwise lag can quickly
+ * evaporate.
+ *
+ * Lag is defined as:
+ *
+ * lag_i = S - s_i = w_i * (V - v_i)
+ *
+ * To avoid the 'w_i' term all over the place, we only track
+ * the virtual lag:
+ *
+ * vl_i = V - v_i <=> v_i = V - vl_i
+ *
+ * And we take V to be the weighted average of all v:
+ *
+ * V = (\Sum w_j*v_j) / W
+ *
+ * Where W is: \Sum w_j
+ *
+ * Then, the weighted average after adding an entity with lag
+ * vl_i is given by:
+ *
+ * V' = (\Sum w_j*v_j + w_i*v_i) / (W + w_i)
+ * = (W*V + w_i*(V - vl_i)) / (W + w_i)
+ * = (W*V + w_i*V - w_i*vl_i) / (W + w_i)
+ * = (V*(W + w_i) - w_i*vl_i) / (W + w_i)
+ * = V - w_i*vl_i / (W + w_i)
+ *
+ * And the actual lag after adding an entity with vl_i is:
+ *
+ * vl'_i = V' - v_i
+ * = V - w_i*vl_i / (W + w_i) - (V - vl_i)
+ * = vl_i - w_i*vl_i / (W + w_i)
+ *
+ * Which is strictly less than vl_i. So in order to preserve lag
+ * we should inflate the lag before placement such that the
+ * effective lag after placement comes out right.
+ *
+ * As such, invert the above relation for vl'_i to get the vl_i
+ * we need to use such that the lag after placement is the lag
+ * we computed before dequeue.
+ *
+ * vl'_i = vl_i - w_i*vl_i / (W + w_i)
+ * = ((W + w_i)*vl_i - w_i*vl_i) / (W + w_i)
+ *
+ * (W + w_i)*vl'_i = (W + w_i)*vl_i - w_i*vl_i
+ * = W*vl_i
+ *
+ * vl_i = (W + w_i)*vl'_i / W
*/
- if (sched_feat(GENTLE_FAIR_SLEEPERS))
- thresh >>= 1;
+ load = cfs_rq->avg_load;
+ if (curr && curr->on_rq)
+ load += scale_load_down(curr->load.weight);
- vruntime -= thresh;
+ lag *= load + scale_load_down(se->load.weight);
+ if (WARN_ON_ONCE(!load))
+ load = 1;
+ lag = div_s64(lag, load);
}
- /* ensure we never gain time by being placed backwards. */
- se->vruntime = max_vruntime(se->vruntime, vruntime);
+ se->vruntime = vruntime - lag;
+
+ if (se->rel_deadline) {
+ se->deadline += se->vruntime;
+ se->rel_deadline = 0;
+ return;
+ }
+
+ /*
+ * When joining the competition; the existing tasks will be,
+ * on average, halfway through their slice, as such start tasks
+ * off with half a slice to ease into the competition.
+ */
+ if (sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL))
+ vslice /= 2;
+
+ /*
+ * EEVDF: vd_i = ve_i + r_i/w_i
+ */
+ se->deadline = se->vruntime + vslice;
}
static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);
+
+static void
+requeue_delayed_entity(struct sched_entity *se);
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
+ bool curr = cfs_rq->curr == se;
+
/*
- * Update the normalized vruntime before updating min_vruntime
- * through calling update_curr().
+ * If we're the current task, we must renormalise before calling
+ * update_curr().
*/
- if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
- se->vruntime += cfs_rq->min_vruntime;
+ if (curr)
+ place_entity(cfs_rq, se, flags);
+
+ update_curr(cfs_rq);
/*
- * Update run-time statistics of the 'current'.
+ * When enqueuing a sched_entity, we must:
+ * - Update loads to have both entity and cfs_rq synced with now.
+ * - For group_entity, update its runnable_weight to reflect the new
+ * h_nr_runnable of its group cfs_rq.
+ * - For group_entity, update its weight to reflect the new share of
+ * its group cfs_rq
+ * - Add its new weight to cfs_rq->load.weight
*/
- update_curr(cfs_rq);
- enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
+ update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
+ se_update_runnable(se);
+ /*
+ * XXX update_load_avg() above will have attached us to the pelt sum;
+ * but update_cfs_group() here will re-adjust the weight and have to
+ * undo/redo all that. Seems wasteful.
+ */
+ update_cfs_group(se);
+
+ /*
+ * XXX now that the entity has been re-weighted, and it's lag adjusted,
+ * we can place the entity.
+ */
+ if (!curr)
+ place_entity(cfs_rq, se, flags);
+
account_entity_enqueue(cfs_rq, se);
- update_cfs_shares(cfs_rq);
- if (flags & ENQUEUE_WAKEUP) {
- place_entity(cfs_rq, se, 0);
- enqueue_sleeper(cfs_rq, se);
- }
+ /* Entity has migrated, no longer consider this task hot */
+ if (flags & ENQUEUE_MIGRATED)
+ se->exec_start = 0;
- update_stats_enqueue(cfs_rq, se);
- check_spread(cfs_rq, se);
- if (se != cfs_rq->curr)
+ check_schedstat_required();
+ update_stats_enqueue_fair(cfs_rq, se, flags);
+ if (!curr)
__enqueue_entity(cfs_rq, se);
se->on_rq = 1;
- if (cfs_rq->nr_running == 1) {
- list_add_leaf_cfs_rq(cfs_rq);
+ if (cfs_rq->nr_queued == 1) {
check_enqueue_throttle(cfs_rq);
- }
-}
-
-static void __clear_buddies_last(struct sched_entity *se)
-{
- for_each_sched_entity(se) {
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- if (cfs_rq->last != se)
- break;
+ list_add_leaf_cfs_rq(cfs_rq);
+#ifdef CONFIG_CFS_BANDWIDTH
+ if (cfs_rq->pelt_clock_throttled) {
+ struct rq *rq = rq_of(cfs_rq);
- cfs_rq->last = NULL;
+ cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
+ cfs_rq->throttled_clock_pelt;
+ cfs_rq->pelt_clock_throttled = 0;
+ }
+#endif
}
}
@@ -3118,119 +5290,147 @@ static void __clear_buddies_next(struct sched_entity *se)
}
}
-static void __clear_buddies_skip(struct sched_entity *se)
+static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
+ if (cfs_rq->next == se)
+ __clear_buddies_next(se);
+}
+
+static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
+
+static void set_delayed(struct sched_entity *se)
+{
+ se->sched_delayed = 1;
+
+ /*
+ * Delayed se of cfs_rq have no tasks queued on them.
+ * Do not adjust h_nr_runnable since dequeue_entities()
+ * will account it for blocked tasks.
+ */
+ if (!entity_is_task(se))
+ return;
+
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- if (cfs_rq->skip != se)
- break;
- cfs_rq->skip = NULL;
+ cfs_rq->h_nr_runnable--;
}
}
-static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static void clear_delayed(struct sched_entity *se)
{
- if (cfs_rq->last == se)
- __clear_buddies_last(se);
+ se->sched_delayed = 0;
- if (cfs_rq->next == se)
- __clear_buddies_next(se);
+ /*
+ * Delayed se of cfs_rq have no tasks queued on them.
+ * Do not adjust h_nr_runnable since a dequeue has
+ * already accounted for it or an enqueue of a task
+ * below it will account for it in enqueue_task_fair().
+ */
+ if (!entity_is_task(se))
+ return;
+
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
- if (cfs_rq->skip == se)
- __clear_buddies_skip(se);
+ cfs_rq->h_nr_runnable++;
+ }
}
-static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
+static inline void finish_delayed_dequeue_entity(struct sched_entity *se)
+{
+ clear_delayed(se);
+ if (sched_feat(DELAY_ZERO) && se->vlag > 0)
+ se->vlag = 0;
+}
-static void
+static bool
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
- /*
- * Update run-time statistics of the 'current'.
- */
+ bool sleep = flags & DEQUEUE_SLEEP;
+ int action = UPDATE_TG;
+
update_curr(cfs_rq);
- dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
+ clear_buddies(cfs_rq, se);
- update_stats_dequeue(cfs_rq, se);
- if (flags & DEQUEUE_SLEEP) {
-#ifdef CONFIG_SCHEDSTATS
- if (entity_is_task(se)) {
- struct task_struct *tsk = task_of(se);
-
- if (tsk->state & TASK_INTERRUPTIBLE)
- se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
- if (tsk->state & TASK_UNINTERRUPTIBLE)
- se->statistics.block_start = rq_clock(rq_of(cfs_rq));
+ if (flags & DEQUEUE_DELAYED) {
+ WARN_ON_ONCE(!se->sched_delayed);
+ } else {
+ bool delay = sleep;
+ /*
+ * DELAY_DEQUEUE relies on spurious wakeups, special task
+ * states must not suffer spurious wakeups, excempt them.
+ */
+ if (flags & (DEQUEUE_SPECIAL | DEQUEUE_THROTTLE))
+ delay = false;
+
+ WARN_ON_ONCE(delay && se->sched_delayed);
+
+ if (sched_feat(DELAY_DEQUEUE) && delay &&
+ !entity_eligible(cfs_rq, se)) {
+ update_load_avg(cfs_rq, se, 0);
+ set_delayed(se);
+ return false;
}
-#endif
}
- clear_buddies(cfs_rq, se);
+ if (entity_is_task(se) && task_on_rq_migrating(task_of(se)))
+ action |= DO_DETACH;
+
+ /*
+ * When dequeuing a sched_entity, we must:
+ * - Update loads to have both entity and cfs_rq synced with now.
+ * - For group_entity, update its runnable_weight to reflect the new
+ * h_nr_runnable of its group cfs_rq.
+ * - Subtract its previous weight from cfs_rq->load.weight.
+ * - For group entity, update its weight to reflect the new share
+ * of its group cfs_rq.
+ */
+ update_load_avg(cfs_rq, se, action);
+ se_update_runnable(se);
+
+ update_stats_dequeue_fair(cfs_rq, se, flags);
+
+ update_entity_lag(cfs_rq, se);
+ if (sched_feat(PLACE_REL_DEADLINE) && !sleep) {
+ se->deadline -= se->vruntime;
+ se->rel_deadline = 1;
+ }
if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
se->on_rq = 0;
account_entity_dequeue(cfs_rq, se);
- /*
- * Normalize the entity after updating the min_vruntime because the
- * update can refer to the ->curr item and we need to reflect this
- * movement in our normalized position.
- */
- if (!(flags & DEQUEUE_SLEEP))
- se->vruntime -= cfs_rq->min_vruntime;
-
/* return excess runtime on last dequeue */
return_cfs_rq_runtime(cfs_rq);
- update_min_vruntime(cfs_rq);
- update_cfs_shares(cfs_rq);
-}
-
-/*
- * Preempt the current task with a newly woken task if needed:
- */
-static void
-check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
-{
- unsigned long ideal_runtime, delta_exec;
- struct sched_entity *se;
- s64 delta;
-
- ideal_runtime = sched_slice(cfs_rq, curr);
- delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
- if (delta_exec > ideal_runtime) {
- resched_curr(rq_of(cfs_rq));
- /*
- * The current task ran long enough, ensure it doesn't get
- * re-elected due to buddy favours.
- */
- clear_buddies(cfs_rq, curr);
- return;
- }
+ update_cfs_group(se);
- /*
- * Ensure that a task that missed wakeup preemption by a
- * narrow margin doesn't have to wait for a full slice.
- * This also mitigates buddy induced latencies under load.
- */
- if (delta_exec < sysctl_sched_min_granularity)
- return;
+ if (flags & DEQUEUE_DELAYED)
+ finish_delayed_dequeue_entity(se);
- se = __pick_first_entity(cfs_rq);
- delta = curr->vruntime - se->vruntime;
+ if (cfs_rq->nr_queued == 0) {
+ update_idle_cfs_rq_clock_pelt(cfs_rq);
+#ifdef CONFIG_CFS_BANDWIDTH
+ if (throttled_hierarchy(cfs_rq)) {
+ struct rq *rq = rq_of(cfs_rq);
- if (delta < 0)
- return;
+ list_del_leaf_cfs_rq(cfs_rq);
+ cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
+ cfs_rq->pelt_clock_throttled = 1;
+ }
+#endif
+ }
- if (delta > ideal_runtime)
- resched_curr(rq_of(cfs_rq));
+ return true;
}
static void
set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
+ clear_buddies(cfs_rq, se);
+
/* 'current' is not kept within the tree. */
if (se->on_rq) {
/*
@@ -3238,29 +5438,36 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
* a CPU. So account for the time it spent waiting on the
* runqueue.
*/
- update_stats_wait_end(cfs_rq, se);
+ update_stats_wait_end_fair(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
- update_entity_load_avg(se, 1);
+ update_load_avg(cfs_rq, se, UPDATE_TG);
+
+ set_protect_slice(cfs_rq, se);
}
update_stats_curr_start(cfs_rq, se);
+ WARN_ON_ONCE(cfs_rq->curr);
cfs_rq->curr = se;
-#ifdef CONFIG_SCHEDSTATS
+
/*
* Track our maximum slice length, if the CPU's load is at
- * least twice that of our own weight (i.e. dont track it
+ * least twice that of our own weight (i.e. don't track it
* when there are only lesser-weight tasks around):
*/
- if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
- se->statistics.slice_max = max(se->statistics.slice_max,
- se->sum_exec_runtime - se->prev_sum_exec_runtime);
+ if (schedstat_enabled() &&
+ rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) {
+ struct sched_statistics *stats;
+
+ stats = __schedstats_from_se(se);
+ __schedstat_set(stats->slice_max,
+ max((u64)stats->slice_max,
+ se->sum_exec_runtime - se->prev_sum_exec_runtime));
}
-#endif
+
se->prev_sum_exec_runtime = se->sum_exec_runtime;
}
-static int
-wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
+static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags);
/*
* Pick the next process, keeping these things in mind, in this order:
@@ -3270,53 +5477,18 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
* 4) do not run the "skip" process, if something else is available
*/
static struct sched_entity *
-pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
+pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq)
{
- struct sched_entity *left = __pick_first_entity(cfs_rq);
struct sched_entity *se;
- /*
- * If curr is set we have to see if its left of the leftmost entity
- * still in the tree, provided there was anything in the tree at all.
- */
- if (!left || (curr && entity_before(curr, left)))
- left = curr;
-
- se = left; /* ideally we run the leftmost entity */
-
- /*
- * Avoid running the skip buddy, if running something else can
- * be done without getting too unfair.
- */
- if (cfs_rq->skip == se) {
- struct sched_entity *second;
-
- if (se == curr) {
- second = __pick_first_entity(cfs_rq);
- } else {
- second = __pick_next_entity(se);
- if (!second || (curr && entity_before(curr, second)))
- second = curr;
- }
-
- if (second && wakeup_preempt_entity(second, left) < 1)
- se = second;
+ se = pick_eevdf(cfs_rq);
+ if (se->sched_delayed) {
+ dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+ /*
+ * Must not reference @se again, see __block_task().
+ */
+ return NULL;
}
-
- /*
- * Prefer last buddy, try to return the CPU to a preempted task.
- */
- if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
- se = cfs_rq->last;
-
- /*
- * Someone really wants this to run. If it's not unfair, run it.
- */
- if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
- se = cfs_rq->next;
-
- clear_buddies(cfs_rq, se);
-
return se;
}
@@ -3334,14 +5506,14 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
/* throttle cfs_rqs exceeding runtime */
check_cfs_rq_runtime(cfs_rq);
- check_spread(cfs_rq, prev);
if (prev->on_rq) {
- update_stats_wait_start(cfs_rq, prev);
+ update_stats_wait_start_fair(cfs_rq, prev);
/* Put 'current' back into the tree. */
__enqueue_entity(cfs_rq, prev);
/* in !on_rq case, update occurred at dequeue */
- update_entity_load_avg(prev, 1);
+ update_load_avg(cfs_rq, prev, 0);
}
+ WARN_ON_ONCE(cfs_rq->curr != prev);
cfs_rq->curr = NULL;
}
@@ -3356,9 +5528,8 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
/*
* Ensure that runnable average is periodically updated.
*/
- update_entity_load_avg(curr, 1);
- update_cfs_rq_blocked_load(cfs_rq, 1);
- update_cfs_shares(cfs_rq);
+ update_load_avg(cfs_rq, curr, UPDATE_TG);
+ update_cfs_group(curr);
#ifdef CONFIG_SCHED_HRTICK
/*
@@ -3366,19 +5537,10 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
* validating it and just reschedule.
*/
if (queued) {
- resched_curr(rq_of(cfs_rq));
+ resched_curr_lazy(rq_of(cfs_rq));
return;
}
- /*
- * don't let the period tick interfere with the hrtick preemption
- */
- if (!sched_feat(DOUBLE_TICK) &&
- hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
- return;
#endif
-
- if (cfs_rq->nr_running > 1)
- check_preempt_tick(cfs_rq, curr);
}
@@ -3388,7 +5550,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
#ifdef CONFIG_CFS_BANDWIDTH
-#ifdef HAVE_JUMP_LABEL
+#ifdef CONFIG_JUMP_LABEL
static struct static_key __cfs_bandwidth_used;
static inline bool cfs_bandwidth_used(void)
@@ -3398,14 +5560,14 @@ static inline bool cfs_bandwidth_used(void)
void cfs_bandwidth_usage_inc(void)
{
- static_key_slow_inc(&__cfs_bandwidth_used);
+ static_key_slow_inc_cpuslocked(&__cfs_bandwidth_used);
}
void cfs_bandwidth_usage_dec(void)
{
- static_key_slow_dec(&__cfs_bandwidth_used);
+ static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used);
}
-#else /* HAVE_JUMP_LABEL */
+#else /* !CONFIG_JUMP_LABEL: */
static bool cfs_bandwidth_used(void)
{
return true;
@@ -3413,16 +5575,7 @@ static bool cfs_bandwidth_used(void)
void cfs_bandwidth_usage_inc(void) {}
void cfs_bandwidth_usage_dec(void) {}
-#endif /* HAVE_JUMP_LABEL */
-
-/*
- * default period for cfs group bandwidth.
- * default: 0.1s, units: nanoseconds
- */
-static inline u64 default_cfs_period(void)
-{
- return 100000000ULL;
-}
+#endif /* !CONFIG_JUMP_LABEL */
static inline u64 sched_cfs_bandwidth_slice(void)
{
@@ -3430,22 +5583,28 @@ static inline u64 sched_cfs_bandwidth_slice(void)
}
/*
- * Replenish runtime according to assigned quota and update expiration time.
- * We use sched_clock_cpu directly instead of rq->clock to avoid adding
- * additional synchronization around rq->lock.
+ * Replenish runtime according to assigned quota. We use sched_clock_cpu
+ * directly instead of rq->clock to avoid adding additional synchronization
+ * around rq->lock.
*
* requires cfs_b->lock
*/
void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
{
- u64 now;
+ s64 runtime;
- if (cfs_b->quota == RUNTIME_INF)
+ if (unlikely(cfs_b->quota == RUNTIME_INF))
return;
- now = sched_clock_cpu(smp_processor_id());
- cfs_b->runtime = cfs_b->quota;
- cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
+ cfs_b->runtime += cfs_b->quota;
+ runtime = cfs_b->runtime_snap - cfs_b->runtime;
+ if (runtime > 0) {
+ cfs_b->burst_time += runtime;
+ cfs_b->nr_burst++;
+ }
+
+ cfs_b->runtime = min(cfs_b->runtime, cfs_b->quota + cfs_b->burst);
+ cfs_b->runtime_snap = cfs_b->runtime;
}
static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
@@ -3453,39 +5612,21 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
return &tg->cfs_bandwidth;
}
-/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
-static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
-{
- if (unlikely(cfs_rq->throttle_count))
- return cfs_rq->throttled_clock_task;
-
- return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
-}
-
/* returns 0 on failure to allocate runtime */
-static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b,
+ struct cfs_rq *cfs_rq, u64 target_runtime)
{
- struct task_group *tg = cfs_rq->tg;
- struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
- u64 amount = 0, min_amount, expires;
+ u64 min_amount, amount = 0;
+
+ lockdep_assert_held(&cfs_b->lock);
/* note: this is a positive sum as runtime_remaining <= 0 */
- min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
+ min_amount = target_runtime - cfs_rq->runtime_remaining;
- raw_spin_lock(&cfs_b->lock);
if (cfs_b->quota == RUNTIME_INF)
amount = min_amount;
else {
- /*
- * If the bandwidth pool has become inactive, then at least one
- * period must have elapsed since the last consumption.
- * Refresh the global state and ensure bandwidth timer becomes
- * active.
- */
- if (!cfs_b->timer_active) {
- __refill_cfs_bandwidth_runtime(cfs_b);
- __start_cfs_bandwidth(cfs_b, false);
- }
+ start_cfs_bandwidth(cfs_b);
if (cfs_b->runtime > 0) {
amount = min(cfs_b->runtime, min_amount);
@@ -3493,65 +5634,35 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
cfs_b->idle = 0;
}
}
- expires = cfs_b->runtime_expires;
- raw_spin_unlock(&cfs_b->lock);
cfs_rq->runtime_remaining += amount;
- /*
- * we may have advanced our local expiration to account for allowed
- * spread between our sched_clock and the one on which runtime was
- * issued.
- */
- if ((s64)(expires - cfs_rq->runtime_expires) > 0)
- cfs_rq->runtime_expires = expires;
return cfs_rq->runtime_remaining > 0;
}
-/*
- * Note: This depends on the synchronization provided by sched_clock and the
- * fact that rq->clock snapshots this value.
- */
-static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+/* returns 0 on failure to allocate runtime */
+static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+ int ret;
- /* if the deadline is ahead of our clock, nothing to do */
- if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
- return;
-
- if (cfs_rq->runtime_remaining < 0)
- return;
-
- /*
- * If the local deadline has passed we have to consider the
- * possibility that our sched_clock is 'fast' and the global deadline
- * has not truly expired.
- *
- * Fortunately we can check determine whether this the case by checking
- * whether the global deadline has advanced. It is valid to compare
- * cfs_b->runtime_expires without any locks since we only care about
- * exact equality, so a partial write will still work.
- */
+ raw_spin_lock(&cfs_b->lock);
+ ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice());
+ raw_spin_unlock(&cfs_b->lock);
- if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
- /* extend local deadline, drift is bounded above by 2 ticks */
- cfs_rq->runtime_expires += TICK_NSEC;
- } else {
- /* global deadline is ahead, expiration has passed */
- cfs_rq->runtime_remaining = 0;
- }
+ return ret;
}
static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
{
/* dock delta_exec before expiring quota (as it could span periods) */
cfs_rq->runtime_remaining -= delta_exec;
- expire_cfs_rq_runtime(cfs_rq);
if (likely(cfs_rq->runtime_remaining > 0))
return;
+ if (cfs_rq->throttled)
+ return;
/*
* if we're unable to extend our runtime we resched so that the active
* hierarchy can be throttled
@@ -3574,189 +5685,500 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
return cfs_bandwidth_used() && cfs_rq->throttled;
}
+static inline bool cfs_rq_pelt_clock_throttled(struct cfs_rq *cfs_rq)
+{
+ return cfs_bandwidth_used() && cfs_rq->pelt_clock_throttled;
+}
+
/* check whether cfs_rq, or any parent, is throttled */
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
{
return cfs_bandwidth_used() && cfs_rq->throttle_count;
}
+static inline int lb_throttled_hierarchy(struct task_struct *p, int dst_cpu)
+{
+ return throttled_hierarchy(task_group(p)->cfs_rq[dst_cpu]);
+}
+
+static inline bool task_is_throttled(struct task_struct *p)
+{
+ return cfs_bandwidth_used() && p->throttled;
+}
+
+static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags);
+static void throttle_cfs_rq_work(struct callback_head *work)
+{
+ struct task_struct *p = container_of(work, struct task_struct, sched_throttle_work);
+ struct sched_entity *se;
+ struct cfs_rq *cfs_rq;
+ struct rq *rq;
+
+ WARN_ON_ONCE(p != current);
+ p->sched_throttle_work.next = &p->sched_throttle_work;
+
+ /*
+ * If task is exiting, then there won't be a return to userspace, so we
+ * don't have to bother with any of this.
+ */
+ if ((p->flags & PF_EXITING))
+ return;
+
+ scoped_guard(task_rq_lock, p) {
+ se = &p->se;
+ cfs_rq = cfs_rq_of(se);
+
+ /* Raced, forget */
+ if (p->sched_class != &fair_sched_class)
+ return;
+
+ /*
+ * If not in limbo, then either replenish has happened or this
+ * task got migrated out of the throttled cfs_rq, move along.
+ */
+ if (!cfs_rq->throttle_count)
+ return;
+ rq = scope.rq;
+ update_rq_clock(rq);
+ WARN_ON_ONCE(p->throttled || !list_empty(&p->throttle_node));
+ dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_THROTTLE);
+ list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
+ /*
+ * Must not set throttled before dequeue or dequeue will
+ * mistakenly regard this task as an already throttled one.
+ */
+ p->throttled = true;
+ resched_curr(rq);
+ }
+}
+
+void init_cfs_throttle_work(struct task_struct *p)
+{
+ init_task_work(&p->sched_throttle_work, throttle_cfs_rq_work);
+ /* Protect against double add, see throttle_cfs_rq() and throttle_cfs_rq_work() */
+ p->sched_throttle_work.next = &p->sched_throttle_work;
+ INIT_LIST_HEAD(&p->throttle_node);
+}
+
/*
- * Ensure that neither of the group entities corresponding to src_cpu or
- * dest_cpu are members of a throttled hierarchy when performing group
- * load-balance operations.
+ * Task is throttled and someone wants to dequeue it again:
+ * it could be sched/core when core needs to do things like
+ * task affinity change, task group change, task sched class
+ * change etc. and in these cases, DEQUEUE_SLEEP is not set;
+ * or the task is blocked after throttled due to freezer etc.
+ * and in these cases, DEQUEUE_SLEEP is set.
*/
-static inline int throttled_lb_pair(struct task_group *tg,
- int src_cpu, int dest_cpu)
+static void detach_task_cfs_rq(struct task_struct *p);
+static void dequeue_throttled_task(struct task_struct *p, int flags)
{
- struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
+ WARN_ON_ONCE(p->se.on_rq);
+ list_del_init(&p->throttle_node);
- src_cfs_rq = tg->cfs_rq[src_cpu];
- dest_cfs_rq = tg->cfs_rq[dest_cpu];
+ /* task blocked after throttled */
+ if (flags & DEQUEUE_SLEEP) {
+ p->throttled = false;
+ return;
+ }
- return throttled_hierarchy(src_cfs_rq) ||
- throttled_hierarchy(dest_cfs_rq);
+ /*
+ * task is migrating off its old cfs_rq, detach
+ * the task's load from its old cfs_rq.
+ */
+ if (task_on_rq_migrating(p))
+ detach_task_cfs_rq(p);
}
-/* updated child weight may affect parent so we have to do this bottom up */
+static bool enqueue_throttled_task(struct task_struct *p)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
+
+ /* @p should have gone through dequeue_throttled_task() first */
+ WARN_ON_ONCE(!list_empty(&p->throttle_node));
+
+ /*
+ * If the throttled task @p is enqueued to a throttled cfs_rq,
+ * take the fast path by directly putting the task on the
+ * target cfs_rq's limbo list.
+ *
+ * Do not do that when @p is current because the following race can
+ * cause @p's group_node to be incorectly re-insterted in its rq's
+ * cfs_tasks list, despite being throttled:
+ *
+ * cpuX cpuY
+ * p ret2user
+ * throttle_cfs_rq_work() sched_move_task(p)
+ * LOCK task_rq_lock
+ * dequeue_task_fair(p)
+ * UNLOCK task_rq_lock
+ * LOCK task_rq_lock
+ * task_current_donor(p) == true
+ * task_on_rq_queued(p) == true
+ * dequeue_task(p)
+ * put_prev_task(p)
+ * sched_change_group()
+ * enqueue_task(p) -> p's new cfs_rq
+ * is throttled, go
+ * fast path and skip
+ * actual enqueue
+ * set_next_task(p)
+ * list_move(&se->group_node, &rq->cfs_tasks); // bug
+ * schedule()
+ *
+ * In the above race case, @p current cfs_rq is in the same rq as
+ * its previous cfs_rq because sched_move_task() only moves a task
+ * to a different group from the same rq, so we can use its current
+ * cfs_rq to derive rq and test if the task is current.
+ */
+ if (throttled_hierarchy(cfs_rq) &&
+ !task_current_donor(rq_of(cfs_rq), p)) {
+ list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
+ return true;
+ }
+
+ /* we can't take the fast path, do an actual enqueue*/
+ p->throttled = false;
+ return false;
+}
+
+static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags);
static int tg_unthrottle_up(struct task_group *tg, void *data)
{
struct rq *rq = data;
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+ struct task_struct *p, *tmp;
+
+ if (--cfs_rq->throttle_count)
+ return 0;
- cfs_rq->throttle_count--;
-#ifdef CONFIG_SMP
- if (!cfs_rq->throttle_count) {
- /* adjust cfs_rq_clock_task() */
- cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
- cfs_rq->throttled_clock_task;
+ if (cfs_rq->pelt_clock_throttled) {
+ cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
+ cfs_rq->throttled_clock_pelt;
+ cfs_rq->pelt_clock_throttled = 0;
}
-#endif
+
+ if (cfs_rq->throttled_clock_self) {
+ u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
+
+ cfs_rq->throttled_clock_self = 0;
+
+ if (WARN_ON_ONCE((s64)delta < 0))
+ delta = 0;
+
+ cfs_rq->throttled_clock_self_time += delta;
+ }
+
+ /* Re-enqueue the tasks that have been throttled at this level. */
+ list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list, throttle_node) {
+ list_del_init(&p->throttle_node);
+ p->throttled = false;
+ enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP);
+ }
+
+ /* Add cfs_rq with load or one or more already running entities to the list */
+ if (!cfs_rq_is_decayed(cfs_rq))
+ list_add_leaf_cfs_rq(cfs_rq);
return 0;
}
+static inline bool task_has_throttle_work(struct task_struct *p)
+{
+ return p->sched_throttle_work.next != &p->sched_throttle_work;
+}
+
+static inline void task_throttle_setup_work(struct task_struct *p)
+{
+ if (task_has_throttle_work(p))
+ return;
+
+ /*
+ * Kthreads and exiting tasks don't return to userspace, so adding the
+ * work is pointless
+ */
+ if ((p->flags & (PF_EXITING | PF_KTHREAD)))
+ return;
+
+ task_work_add(p, &p->sched_throttle_work, TWA_RESUME);
+}
+
+static void record_throttle_clock(struct cfs_rq *cfs_rq)
+{
+ struct rq *rq = rq_of(cfs_rq);
+
+ if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
+ cfs_rq->throttled_clock = rq_clock(rq);
+
+ if (!cfs_rq->throttled_clock_self)
+ cfs_rq->throttled_clock_self = rq_clock(rq);
+}
+
static int tg_throttle_down(struct task_group *tg, void *data)
{
struct rq *rq = data;
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
- /* group is entering throttled state, stop time */
- if (!cfs_rq->throttle_count)
- cfs_rq->throttled_clock_task = rq_clock_task(rq);
- cfs_rq->throttle_count++;
+ if (cfs_rq->throttle_count++)
+ return 0;
+
+ /*
+ * For cfs_rqs that still have entities enqueued, PELT clock
+ * stop happens at dequeue time when all entities are dequeued.
+ */
+ if (!cfs_rq->nr_queued) {
+ list_del_leaf_cfs_rq(cfs_rq);
+ cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
+ cfs_rq->pelt_clock_throttled = 1;
+ }
+ WARN_ON_ONCE(cfs_rq->throttled_clock_self);
+ WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
return 0;
}
-static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
+static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
- struct sched_entity *se;
- long task_delta, dequeue = 1;
+ int dequeue = 1;
- se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
+ raw_spin_lock(&cfs_b->lock);
+ /* This will start the period timer if necessary */
+ if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) {
+ /*
+ * We have raced with bandwidth becoming available, and if we
+ * actually throttled the timer might not unthrottle us for an
+ * entire period. We additionally needed to make sure that any
+ * subsequent check_cfs_rq_runtime calls agree not to throttle
+ * us, as we may commit to do cfs put_prev+pick_next, so we ask
+ * for 1ns of runtime rather than just check cfs_b.
+ */
+ dequeue = 0;
+ } else {
+ list_add_tail_rcu(&cfs_rq->throttled_list,
+ &cfs_b->throttled_cfs_rq);
+ }
+ raw_spin_unlock(&cfs_b->lock);
+
+ if (!dequeue)
+ return false; /* Throttle no longer required. */
/* freeze hierarchy runnable averages while throttled */
rcu_read_lock();
walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
rcu_read_unlock();
- task_delta = cfs_rq->h_nr_running;
- for_each_sched_entity(se) {
- struct cfs_rq *qcfs_rq = cfs_rq_of(se);
- /* throttled entity or throttle-on-deactivate */
- if (!se->on_rq)
- break;
-
- if (dequeue)
- dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
- qcfs_rq->h_nr_running -= task_delta;
-
- if (qcfs_rq->load.weight)
- dequeue = 0;
- }
-
- if (!se)
- sub_nr_running(rq, task_delta);
-
- cfs_rq->throttled = 1;
- cfs_rq->throttled_clock = rq_clock(rq);
- raw_spin_lock(&cfs_b->lock);
/*
- * Add to the _head_ of the list, so that an already-started
- * distribute_cfs_runtime will not see us
+ * Note: distribution will already see us throttled via the
+ * throttled-list. rq->lock protects completion.
*/
- list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
- if (!cfs_b->timer_active)
- __start_cfs_bandwidth(cfs_b, false);
- raw_spin_unlock(&cfs_b->lock);
+ cfs_rq->throttled = 1;
+ WARN_ON_ONCE(cfs_rq->throttled_clock);
+ return true;
}
void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
- struct sched_entity *se;
- int enqueue = 1;
- long task_delta;
+ struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
- se = cfs_rq->tg->se[cpu_of(rq)];
+ /*
+ * It's possible we are called with runtime_remaining < 0 due to things
+ * like async unthrottled us with a positive runtime_remaining but other
+ * still running entities consumed those runtime before we reached here.
+ *
+ * We can't unthrottle this cfs_rq without any runtime remaining because
+ * any enqueue in tg_unthrottle_up() will immediately trigger a throttle,
+ * which is not supposed to happen on unthrottle path.
+ */
+ if (cfs_rq->runtime_enabled && cfs_rq->runtime_remaining <= 0)
+ return;
cfs_rq->throttled = 0;
update_rq_clock(rq);
raw_spin_lock(&cfs_b->lock);
- cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
+ if (cfs_rq->throttled_clock) {
+ cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
+ cfs_rq->throttled_clock = 0;
+ }
list_del_rcu(&cfs_rq->throttled_list);
raw_spin_unlock(&cfs_b->lock);
/* update hierarchical throttle state */
walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
- if (!cfs_rq->load.weight)
- return;
+ if (!cfs_rq->load.weight) {
+ if (!cfs_rq->on_list)
+ return;
+ /*
+ * Nothing to run but something to decay (on_list)?
+ * Complete the branch.
+ */
+ for_each_sched_entity(se) {
+ if (list_add_leaf_cfs_rq(cfs_rq_of(se)))
+ break;
+ }
+ }
- task_delta = cfs_rq->h_nr_running;
- for_each_sched_entity(se) {
- if (se->on_rq)
- enqueue = 0;
+ assert_list_leaf_cfs_rq(rq);
- cfs_rq = cfs_rq_of(se);
- if (enqueue)
- enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
- cfs_rq->h_nr_running += task_delta;
+ /* Determine whether we need to wake up potentially idle CPU: */
+ if (rq->curr == rq->idle && rq->cfs.nr_queued)
+ resched_curr(rq);
+}
- if (cfs_rq_throttled(cfs_rq))
- break;
+static void __cfsb_csd_unthrottle(void *arg)
+{
+ struct cfs_rq *cursor, *tmp;
+ struct rq *rq = arg;
+ struct rq_flags rf;
+
+ rq_lock(rq, &rf);
+
+ /*
+ * Iterating over the list can trigger several call to
+ * update_rq_clock() in unthrottle_cfs_rq().
+ * Do it once and skip the potential next ones.
+ */
+ update_rq_clock(rq);
+ rq_clock_start_loop_update(rq);
+
+ /*
+ * Since we hold rq lock we're safe from concurrent manipulation of
+ * the CSD list. However, this RCU critical section annotates the
+ * fact that we pair with sched_free_group_rcu(), so that we cannot
+ * race with group being freed in the window between removing it
+ * from the list and advancing to the next entry in the list.
+ */
+ rcu_read_lock();
+
+ list_for_each_entry_safe(cursor, tmp, &rq->cfsb_csd_list,
+ throttled_csd_list) {
+ list_del_init(&cursor->throttled_csd_list);
+
+ if (cfs_rq_throttled(cursor))
+ unthrottle_cfs_rq(cursor);
}
- if (!se)
- add_nr_running(rq, task_delta);
+ rcu_read_unlock();
- /* determine whether we need to wake up potentially idle cpu */
- if (rq->curr == rq->idle && rq->cfs.nr_running)
- resched_curr(rq);
+ rq_clock_stop_loop_update(rq);
+ rq_unlock(rq, &rf);
}
-static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
- u64 remaining, u64 expires)
+static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
{
- struct cfs_rq *cfs_rq;
- u64 runtime;
- u64 starting_runtime = remaining;
+ struct rq *rq = rq_of(cfs_rq);
+ bool first;
+
+ if (rq == this_rq()) {
+ unthrottle_cfs_rq(cfs_rq);
+ return;
+ }
+
+ /* Already enqueued */
+ if (WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_csd_list)))
+ return;
+
+ first = list_empty(&rq->cfsb_csd_list);
+ list_add_tail(&cfs_rq->throttled_csd_list, &rq->cfsb_csd_list);
+ if (first)
+ smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd);
+}
+
+static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
+{
+ lockdep_assert_rq_held(rq_of(cfs_rq));
+
+ if (WARN_ON_ONCE(!cfs_rq_throttled(cfs_rq) ||
+ cfs_rq->runtime_remaining <= 0))
+ return;
+
+ __unthrottle_cfs_rq_async(cfs_rq);
+}
+
+static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
+{
+ int this_cpu = smp_processor_id();
+ u64 runtime, remaining = 1;
+ bool throttled = false;
+ struct cfs_rq *cfs_rq, *tmp;
+ struct rq_flags rf;
+ struct rq *rq;
+ LIST_HEAD(local_unthrottle);
rcu_read_lock();
list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
throttled_list) {
- struct rq *rq = rq_of(cfs_rq);
+ rq = rq_of(cfs_rq);
+
+ if (!remaining) {
+ throttled = true;
+ break;
+ }
- raw_spin_lock(&rq->lock);
+ rq_lock_irqsave(rq, &rf);
if (!cfs_rq_throttled(cfs_rq))
goto next;
+ /* Already queued for async unthrottle */
+ if (!list_empty(&cfs_rq->throttled_csd_list))
+ goto next;
+
+ /* By the above checks, this should never be true */
+ WARN_ON_ONCE(cfs_rq->runtime_remaining > 0);
+
+ raw_spin_lock(&cfs_b->lock);
runtime = -cfs_rq->runtime_remaining + 1;
- if (runtime > remaining)
- runtime = remaining;
- remaining -= runtime;
+ if (runtime > cfs_b->runtime)
+ runtime = cfs_b->runtime;
+ cfs_b->runtime -= runtime;
+ remaining = cfs_b->runtime;
+ raw_spin_unlock(&cfs_b->lock);
cfs_rq->runtime_remaining += runtime;
- cfs_rq->runtime_expires = expires;
/* we check whether we're throttled above */
- if (cfs_rq->runtime_remaining > 0)
- unthrottle_cfs_rq(cfs_rq);
+ if (cfs_rq->runtime_remaining > 0) {
+ if (cpu_of(rq) != this_cpu) {
+ unthrottle_cfs_rq_async(cfs_rq);
+ } else {
+ /*
+ * We currently only expect to be unthrottling
+ * a single cfs_rq locally.
+ */
+ WARN_ON_ONCE(!list_empty(&local_unthrottle));
+ list_add_tail(&cfs_rq->throttled_csd_list,
+ &local_unthrottle);
+ }
+ } else {
+ throttled = true;
+ }
next:
- raw_spin_unlock(&rq->lock);
+ rq_unlock_irqrestore(rq, &rf);
+ }
- if (!remaining)
- break;
+ list_for_each_entry_safe(cfs_rq, tmp, &local_unthrottle,
+ throttled_csd_list) {
+ struct rq *rq = rq_of(cfs_rq);
+
+ rq_lock_irqsave(rq, &rf);
+
+ list_del_init(&cfs_rq->throttled_csd_list);
+
+ if (cfs_rq_throttled(cfs_rq))
+ unthrottle_cfs_rq(cfs_rq);
+
+ rq_unlock_irqrestore(rq, &rf);
}
+ WARN_ON_ONCE(!list_empty(&local_unthrottle));
+
rcu_read_unlock();
- return starting_runtime - remaining;
+ return throttled;
}
/*
@@ -3765,9 +6187,8 @@ next:
* period the timer is deactivated until scheduling resumes; cfs_b->idle is
* used to track this state.
*/
-static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
+static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
{
- u64 runtime, runtime_expires;
int throttled;
/* no need to continue the timer with no bandwidth constraint */
@@ -3777,6 +6198,9 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
throttled = !list_empty(&cfs_b->throttled_cfs_rq);
cfs_b->nr_periods += overrun;
+ /* Refill extra burst quota even if cfs_b->idle */
+ __refill_cfs_bandwidth_runtime(cfs_b);
+
/*
* idle depends on !throttled (for the case of a large deficit), and if
* we're going inactive then everything else can be deferred
@@ -3784,15 +6208,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
if (cfs_b->idle && !throttled)
goto out_deactivate;
- /*
- * if we have relooped after returning idle once, we need to update our
- * status as actually running, so that other cpus doing
- * __start_cfs_bandwidth will stop trying to cancel us.
- */
- cfs_b->timer_active = 1;
-
- __refill_cfs_bandwidth_runtime(cfs_b);
-
if (!throttled) {
/* mark as potentially idle for the upcoming period */
cfs_b->idle = 1;
@@ -3802,26 +6217,14 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
/* account preceding periods in which throttling occurred */
cfs_b->nr_throttled += overrun;
- runtime_expires = cfs_b->runtime_expires;
-
/*
- * This check is repeated as we are holding onto the new bandwidth while
- * we unthrottle. This can potentially race with an unthrottled group
- * trying to acquire new bandwidth from the global pool. This can result
- * in us over-using our runtime if it is all used during this loop, but
- * only by limited amounts in that extreme case.
+ * This check is repeated as we release cfs_b->lock while we unthrottle.
*/
while (throttled && cfs_b->runtime > 0) {
- runtime = cfs_b->runtime;
- raw_spin_unlock(&cfs_b->lock);
+ raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
/* we can't nest cfs_b->lock while distributing bandwidth */
- runtime = distribute_cfs_runtime(cfs_b, runtime,
- runtime_expires);
- raw_spin_lock(&cfs_b->lock);
-
- throttled = !list_empty(&cfs_b->throttled_cfs_rq);
-
- cfs_b->runtime -= min(runtime, cfs_b->runtime);
+ throttled = distribute_cfs_runtime(cfs_b);
+ raw_spin_lock_irqsave(&cfs_b->lock, flags);
}
/*
@@ -3835,7 +6238,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
return 0;
out_deactivate:
- cfs_b->timer_active = 0;
return 1;
}
@@ -3850,13 +6252,13 @@ static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
* Are we near the end of the current quota period?
*
* Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
- * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of
+ * hrtimer base being cleared by hrtimer_start. In the case of
* migrate_hrtimers, base is never cleared, so we are fine.
*/
static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
{
struct hrtimer *refresh_timer = &cfs_b->period_timer;
- u64 remaining;
+ s64 remaining;
/* if the call-back is running a quota refresh is already occurring */
if (hrtimer_callback_running(refresh_timer))
@@ -3864,7 +6266,7 @@ static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
/* is a quota refresh about to occur? */
remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
- if (remaining < min_expire)
+ if (remaining < (s64)min_expire)
return 1;
return 0;
@@ -3878,8 +6280,14 @@ static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
if (runtime_refresh_within(cfs_b, min_left))
return;
- start_bandwidth_timer(&cfs_b->slack_timer,
- ns_to_ktime(cfs_bandwidth_slack_period));
+ /* don't push forwards an existing deferred unthrottle */
+ if (cfs_b->slack_started)
+ return;
+ cfs_b->slack_started = true;
+
+ hrtimer_start(&cfs_b->slack_timer,
+ ns_to_ktime(cfs_bandwidth_slack_period),
+ HRTIMER_MODE_REL);
}
/* we know any runtime found here is valid as update_curr() precedes return */
@@ -3892,8 +6300,7 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
return;
raw_spin_lock(&cfs_b->lock);
- if (cfs_b->quota != RUNTIME_INF &&
- cfs_rq->runtime_expires == cfs_b->runtime_expires) {
+ if (cfs_b->quota != RUNTIME_INF) {
cfs_b->runtime += slack_runtime;
/* we are under rq->lock, defer unthrottling using a timer */
@@ -3912,7 +6319,7 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
if (!cfs_bandwidth_used())
return;
- if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
+ if (!cfs_rq->runtime_enabled || cfs_rq->nr_queued)
return;
__return_cfs_rq_runtime(cfs_rq);
@@ -3925,36 +6332,32 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
{
u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
- u64 expires;
+ unsigned long flags;
/* confirm we're still not at a refresh boundary */
- raw_spin_lock(&cfs_b->lock);
+ raw_spin_lock_irqsave(&cfs_b->lock, flags);
+ cfs_b->slack_started = false;
+
if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
- raw_spin_unlock(&cfs_b->lock);
+ raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
return;
}
if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
runtime = cfs_b->runtime;
- expires = cfs_b->runtime_expires;
- raw_spin_unlock(&cfs_b->lock);
+ raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
if (!runtime)
return;
- runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
-
- raw_spin_lock(&cfs_b->lock);
- if (expires == cfs_b->runtime_expires)
- cfs_b->runtime -= min(runtime, cfs_b->runtime);
- raw_spin_unlock(&cfs_b->lock);
+ distribute_cfs_runtime(cfs_b);
}
/*
* When a group wakes up we want to make sure that its quota is not already
* expired/exceeded, otherwise it may be allowed to steal additional ticks of
- * runtime as update_curr() throttling can not not trigger until it's on-rq.
+ * runtime as update_curr() throttling can not trigger until it's on-rq.
*/
static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
{
@@ -3975,6 +6378,33 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
throttle_cfs_rq(cfs_rq);
}
+static void sync_throttle(struct task_group *tg, int cpu)
+{
+ struct cfs_rq *pcfs_rq, *cfs_rq;
+
+ if (!cfs_bandwidth_used())
+ return;
+
+ if (!tg->parent)
+ return;
+
+ cfs_rq = tg->cfs_rq[cpu];
+ pcfs_rq = tg->parent->cfs_rq[cpu];
+
+ cfs_rq->throttle_count = pcfs_rq->throttle_count;
+ cfs_rq->throttled_clock_pelt = rq_clock_pelt(cpu_rq(cpu));
+
+ /*
+ * It is not enough to sync the "pelt_clock_throttled" indicator
+ * with the parent cfs_rq when the hierarchy is not queued.
+ * Always join a throttled hierarchy with PELT clock throttled
+ * and leaf it to the first enqueue, or distribution to
+ * unthrottle the PELT clock.
+ */
+ if (cfs_rq->throttle_count)
+ cfs_rq->pelt_clock_throttled = 1;
+}
+
/* conditionally throttle active cfs_rq's from put_prev_entity() */
static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
@@ -3991,14 +6421,14 @@ static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
if (cfs_rq_throttled(cfs_rq))
return true;
- throttle_cfs_rq(cfs_rq);
- return true;
+ return throttle_cfs_rq(cfs_rq);
}
static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
{
struct cfs_bandwidth *cfs_b =
container_of(timer, struct cfs_bandwidth, slack_timer);
+
do_sched_cfs_slack_timer(cfs_b);
return HRTIMER_NORESTART;
@@ -4008,146 +6438,279 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
{
struct cfs_bandwidth *cfs_b =
container_of(timer, struct cfs_bandwidth, period_timer);
- ktime_t now;
+ unsigned long flags;
int overrun;
int idle = 0;
+ int count = 0;
- raw_spin_lock(&cfs_b->lock);
+ raw_spin_lock_irqsave(&cfs_b->lock, flags);
for (;;) {
- now = hrtimer_cb_get_time(timer);
- overrun = hrtimer_forward(timer, now, cfs_b->period);
-
+ overrun = hrtimer_forward_now(timer, cfs_b->period);
if (!overrun)
break;
- idle = do_sched_cfs_period_timer(cfs_b, overrun);
+ idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
+
+ if (++count > 3) {
+ u64 new, old = ktime_to_ns(cfs_b->period);
+
+ /*
+ * Grow period by a factor of 2 to avoid losing precision.
+ * Precision loss in the quota/period ratio can cause __cfs_schedulable
+ * to fail.
+ */
+ new = old * 2;
+ if (new < max_bw_quota_period_us * NSEC_PER_USEC) {
+ cfs_b->period = ns_to_ktime(new);
+ cfs_b->quota *= 2;
+ cfs_b->burst *= 2;
+
+ pr_warn_ratelimited(
+ "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n",
+ smp_processor_id(),
+ div_u64(new, NSEC_PER_USEC),
+ div_u64(cfs_b->quota, NSEC_PER_USEC));
+ } else {
+ pr_warn_ratelimited(
+ "cfs_period_timer[cpu%d]: period too short, but cannot scale up without losing precision (cfs_period_us = %lld, cfs_quota_us = %lld)\n",
+ smp_processor_id(),
+ div_u64(old, NSEC_PER_USEC),
+ div_u64(cfs_b->quota, NSEC_PER_USEC));
+ }
+
+ /* reset count so we don't come right back in here */
+ count = 0;
+ }
}
- raw_spin_unlock(&cfs_b->lock);
+ if (idle)
+ cfs_b->period_active = 0;
+ raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}
-void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent)
{
raw_spin_lock_init(&cfs_b->lock);
cfs_b->runtime = 0;
cfs_b->quota = RUNTIME_INF;
- cfs_b->period = ns_to_ktime(default_cfs_period());
+ cfs_b->period = us_to_ktime(default_bw_period_us());
+ cfs_b->burst = 0;
+ cfs_b->hierarchical_quota = parent ? parent->hierarchical_quota : RUNTIME_INF;
INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
- hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- cfs_b->period_timer.function = sched_cfs_period_timer;
- hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- cfs_b->slack_timer.function = sched_cfs_slack_timer;
+ hrtimer_setup(&cfs_b->period_timer, sched_cfs_period_timer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS_PINNED);
+
+ /* Add a random offset so that timers interleave */
+ hrtimer_set_expires(&cfs_b->period_timer,
+ get_random_u32_below(cfs_b->period));
+ hrtimer_setup(&cfs_b->slack_timer, sched_cfs_slack_timer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL);
+ cfs_b->slack_started = false;
}
static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
cfs_rq->runtime_enabled = 0;
INIT_LIST_HEAD(&cfs_rq->throttled_list);
+ INIT_LIST_HEAD(&cfs_rq->throttled_csd_list);
+ INIT_LIST_HEAD(&cfs_rq->throttled_limbo_list);
}
-/* requires cfs_b->lock, may release to reprogram timer */
-void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force)
+void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
{
- /*
- * The timer may be active because we're trying to set a new bandwidth
- * period or because we're racing with the tear-down path
- * (timer_active==0 becomes visible before the hrtimer call-back
- * terminates). In either case we ensure that it's re-programmed
- */
- while (unlikely(hrtimer_active(&cfs_b->period_timer)) &&
- hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) {
- /* bounce the lock to allow do_sched_cfs_period_timer to run */
- raw_spin_unlock(&cfs_b->lock);
- cpu_relax();
- raw_spin_lock(&cfs_b->lock);
- /* if someone else restarted the timer then we're done */
- if (!force && cfs_b->timer_active)
- return;
- }
+ lockdep_assert_held(&cfs_b->lock);
+
+ if (cfs_b->period_active)
+ return;
- cfs_b->timer_active = 1;
- start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
+ cfs_b->period_active = 1;
+ hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
+ hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
}
static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
{
+ int __maybe_unused i;
+
/* init_cfs_bandwidth() was not called */
if (!cfs_b->throttled_cfs_rq.next)
return;
hrtimer_cancel(&cfs_b->period_timer);
hrtimer_cancel(&cfs_b->slack_timer);
+
+ /*
+ * It is possible that we still have some cfs_rq's pending on a CSD
+ * list, though this race is very rare. In order for this to occur, we
+ * must have raced with the last task leaving the group while there
+ * exist throttled cfs_rq(s), and the period_timer must have queued the
+ * CSD item but the remote cpu has not yet processed it. To handle this,
+ * we can simply flush all pending CSD work inline here. We're
+ * guaranteed at this point that no additional cfs_rq of this group can
+ * join a CSD list.
+ */
+ for_each_possible_cpu(i) {
+ struct rq *rq = cpu_rq(i);
+ unsigned long flags;
+
+ if (list_empty(&rq->cfsb_csd_list))
+ continue;
+
+ local_irq_save(flags);
+ __cfsb_csd_unthrottle(rq);
+ local_irq_restore(flags);
+ }
}
+/*
+ * Both these CPU hotplug callbacks race against unregister_fair_sched_group()
+ *
+ * The race is harmless, since modifying bandwidth settings of unhooked group
+ * bits doesn't do much.
+ */
+
+/* cpu online callback */
static void __maybe_unused update_runtime_enabled(struct rq *rq)
{
- struct cfs_rq *cfs_rq;
+ struct task_group *tg;
- for_each_leaf_cfs_rq(rq, cfs_rq) {
- struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
+ lockdep_assert_rq_held(rq);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(tg, &task_groups, list) {
+ struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+ struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
raw_spin_lock(&cfs_b->lock);
cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
raw_spin_unlock(&cfs_b->lock);
}
+ rcu_read_unlock();
}
+/* cpu offline callback */
static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
{
- struct cfs_rq *cfs_rq;
+ struct task_group *tg;
+
+ lockdep_assert_rq_held(rq);
+
+ // Do not unthrottle for an active CPU
+ if (cpumask_test_cpu(cpu_of(rq), cpu_active_mask))
+ return;
+
+ /*
+ * The rq clock has already been updated in the
+ * set_rq_offline(), so we should skip updating
+ * the rq clock again in unthrottle_cfs_rq().
+ */
+ rq_clock_start_loop_update(rq);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(tg, &task_groups, list) {
+ struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
- for_each_leaf_cfs_rq(rq, cfs_rq) {
if (!cfs_rq->runtime_enabled)
continue;
/*
- * clock_task is not advancing so we just need to make sure
- * there's some valid quota amount
- */
- cfs_rq->runtime_remaining = 1;
- /*
- * Offline rq is schedulable till cpu is completely disabled
+ * Offline rq is schedulable till CPU is completely disabled
* in take_cpu_down(), so we prevent new cfs throttling here.
*/
cfs_rq->runtime_enabled = 0;
- if (cfs_rq_throttled(cfs_rq))
- unthrottle_cfs_rq(cfs_rq);
+ if (!cfs_rq_throttled(cfs_rq))
+ continue;
+
+ /*
+ * clock_task is not advancing so we just need to make sure
+ * there's some valid quota amount
+ */
+ cfs_rq->runtime_remaining = 1;
+ unthrottle_cfs_rq(cfs_rq);
}
+ rcu_read_unlock();
+
+ rq_clock_stop_loop_update(rq);
}
-#else /* CONFIG_CFS_BANDWIDTH */
-static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
+bool cfs_task_bw_constrained(struct task_struct *p)
{
- return rq_clock_task(rq_of(cfs_rq));
+ struct cfs_rq *cfs_rq = task_cfs_rq(p);
+
+ if (!cfs_bandwidth_used())
+ return false;
+
+ if (cfs_rq->runtime_enabled ||
+ tg_cfs_bandwidth(cfs_rq->tg)->hierarchical_quota != RUNTIME_INF)
+ return true;
+
+ return false;
}
+#ifdef CONFIG_NO_HZ_FULL
+/* called from pick_next_task_fair() */
+static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p)
+{
+ int cpu = cpu_of(rq);
+
+ if (!cfs_bandwidth_used())
+ return;
+
+ if (!tick_nohz_full_cpu(cpu))
+ return;
+
+ if (rq->nr_running != 1)
+ return;
+
+ /*
+ * We know there is only one task runnable and we've just picked it. The
+ * normal enqueue path will have cleared TICK_DEP_BIT_SCHED if we will
+ * be otherwise able to stop the tick. Just need to check if we are using
+ * bandwidth control.
+ */
+ if (cfs_task_bw_constrained(p))
+ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
+}
+#endif /* CONFIG_NO_HZ_FULL */
+
+#else /* !CONFIG_CFS_BANDWIDTH: */
+
static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
+static inline void sync_throttle(struct task_group *tg, int cpu) {}
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
+static void task_throttle_setup_work(struct task_struct *p) {}
+static bool task_is_throttled(struct task_struct *p) { return false; }
+static void dequeue_throttled_task(struct task_struct *p, int flags) {}
+static bool enqueue_throttled_task(struct task_struct *p) { return false; }
+static void record_throttle_clock(struct cfs_rq *cfs_rq) {}
static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
{
return 0;
}
+static inline bool cfs_rq_pelt_clock_throttled(struct cfs_rq *cfs_rq)
+{
+ return false;
+}
+
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
{
return 0;
}
-static inline int throttled_lb_pair(struct task_group *tg,
- int src_cpu, int dest_cpu)
+static inline int lb_throttled_hierarchy(struct task_struct *p, int dst_cpu)
{
return 0;
}
-void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
-
#ifdef CONFIG_FAIR_GROUP_SCHED
+void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent) {}
static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
#endif
@@ -4158,8 +6721,17 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
static inline void update_runtime_enabled(struct rq *rq) {}
static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
+#ifdef CONFIG_CGROUP_SCHED
+bool cfs_task_bw_constrained(struct task_struct *p)
+{
+ return false;
+}
+#endif
+#endif /* !CONFIG_CFS_BANDWIDTH */
-#endif /* CONFIG_CFS_BANDWIDTH */
+#if !defined(CONFIG_CFS_BANDWIDTH) || !defined(CONFIG_NO_HZ_FULL)
+static inline void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) {}
+#endif
/**************************************************
* CFS operations on tasks:
@@ -4169,17 +6741,16 @@ static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
struct sched_entity *se = &p->se;
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- WARN_ON(task_rq(p) != rq);
+ WARN_ON_ONCE(task_rq(p) != rq);
- if (cfs_rq->nr_running > 1) {
- u64 slice = sched_slice(cfs_rq, se);
+ if (rq->cfs.h_nr_queued > 1) {
u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
+ u64 slice = se->slice;
s64 delta = slice - ran;
if (delta < 0) {
- if (rq->curr == p)
+ if (task_current_donor(rq, p))
resched_curr(rq);
return;
}
@@ -4194,15 +6765,14 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
*/
static void hrtick_update(struct rq *rq)
{
- struct task_struct *curr = rq->curr;
+ struct task_struct *donor = rq->donor;
- if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
+ if (!hrtick_enabled_fair(rq) || donor->sched_class != &fair_sched_class)
return;
- if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
- hrtick_start_fair(rq, curr);
+ hrtick_start_fair(rq, donor);
}
-#else /* !CONFIG_SCHED_HRTICK */
+#else /* !CONFIG_SCHED_HRTICK: */
static inline void
hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
@@ -4211,7 +6781,92 @@ hrtick_start_fair(struct rq *rq, struct task_struct *p)
static inline void hrtick_update(struct rq *rq)
{
}
-#endif
+#endif /* !CONFIG_SCHED_HRTICK */
+
+static inline bool cpu_overutilized(int cpu)
+{
+ unsigned long rq_util_min, rq_util_max;
+
+ if (!sched_energy_enabled())
+ return false;
+
+ rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
+ rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
+
+ /* Return true only if the utilization doesn't fit CPU's capacity */
+ return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu);
+}
+
+/*
+ * overutilized value make sense only if EAS is enabled
+ */
+static inline bool is_rd_overutilized(struct root_domain *rd)
+{
+ return !sched_energy_enabled() || READ_ONCE(rd->overutilized);
+}
+
+static inline void set_rd_overutilized(struct root_domain *rd, bool flag)
+{
+ if (!sched_energy_enabled())
+ return;
+
+ WRITE_ONCE(rd->overutilized, flag);
+ trace_sched_overutilized_tp(rd, flag);
+}
+
+static inline void check_update_overutilized_status(struct rq *rq)
+{
+ /*
+ * overutilized field is used for load balancing decisions only
+ * if energy aware scheduler is being used
+ */
+
+ if (!is_rd_overutilized(rq->rd) && cpu_overutilized(rq->cpu))
+ set_rd_overutilized(rq->rd, 1);
+}
+
+/* Runqueue only has SCHED_IDLE tasks enqueued */
+static int sched_idle_rq(struct rq *rq)
+{
+ return unlikely(rq->nr_running == rq->cfs.h_nr_idle &&
+ rq->nr_running);
+}
+
+static int sched_idle_cpu(int cpu)
+{
+ return sched_idle_rq(cpu_rq(cpu));
+}
+
+static void
+requeue_delayed_entity(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ /*
+ * se->sched_delayed should imply: se->on_rq == 1.
+ * Because a delayed entity is one that is still on
+ * the runqueue competing until elegibility.
+ */
+ WARN_ON_ONCE(!se->sched_delayed);
+ WARN_ON_ONCE(!se->on_rq);
+
+ if (sched_feat(DELAY_ZERO)) {
+ update_entity_lag(cfs_rq, se);
+ if (se->vlag > 0) {
+ cfs_rq->nr_queued--;
+ if (se != cfs_rq->curr)
+ __dequeue_entity(cfs_rq, se);
+ se->vlag = 0;
+ place_entity(cfs_rq, se, 0);
+ if (se != cfs_rq->curr)
+ __enqueue_entity(cfs_rq, se);
+ cfs_rq->nr_queued++;
+ }
+ }
+
+ update_load_avg(cfs_rq, se, 0);
+ clear_delayed(se);
+}
/*
* The enqueue_task method is called before nr_running is
@@ -4223,173 +6878,353 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
+ int h_nr_idle = task_has_idle_policy(p);
+ int h_nr_runnable = 1;
+ int task_new = !(flags & ENQUEUE_WAKEUP);
+ int rq_h_nr_queued = rq->cfs.h_nr_queued;
+ u64 slice = 0;
+
+ if (task_is_throttled(p) && enqueue_throttled_task(p))
+ return;
+
+ /*
+ * The code below (indirectly) updates schedutil which looks at
+ * the cfs_rq utilization to select a frequency.
+ * Let's add the task's estimated utilization to the cfs_rq's
+ * estimated utilization, before we update schedutil.
+ */
+ if (!p->se.sched_delayed || (flags & ENQUEUE_DELAYED))
+ util_est_enqueue(&rq->cfs, p);
+
+ if (flags & ENQUEUE_DELAYED) {
+ requeue_delayed_entity(se);
+ return;
+ }
+
+ /*
+ * If in_iowait is set, the code below may not trigger any cpufreq
+ * utilization updates, so do it here explicitly with the IOWAIT flag
+ * passed.
+ */
+ if (p->in_iowait)
+ cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
+
+ if (task_new && se->sched_delayed)
+ h_nr_runnable = 0;
for_each_sched_entity(se) {
- if (se->on_rq)
+ if (se->on_rq) {
+ if (se->sched_delayed)
+ requeue_delayed_entity(se);
break;
+ }
cfs_rq = cfs_rq_of(se);
- enqueue_entity(cfs_rq, se, flags);
/*
- * end evaluation on encountering a throttled cfs_rq
- *
- * note: in the case of encountering a throttled cfs_rq we will
- * post the final h_nr_running increment below.
- */
- if (cfs_rq_throttled(cfs_rq))
- break;
- cfs_rq->h_nr_running++;
+ * Basically set the slice of group entries to the min_slice of
+ * their respective cfs_rq. This ensures the group can service
+ * its entities in the desired time-frame.
+ */
+ if (slice) {
+ se->slice = slice;
+ se->custom_slice = 1;
+ }
+ enqueue_entity(cfs_rq, se, flags);
+ slice = cfs_rq_min_slice(cfs_rq);
+
+ cfs_rq->h_nr_runnable += h_nr_runnable;
+ cfs_rq->h_nr_queued++;
+ cfs_rq->h_nr_idle += h_nr_idle;
+
+ if (cfs_rq_is_idle(cfs_rq))
+ h_nr_idle = 1;
flags = ENQUEUE_WAKEUP;
}
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
- cfs_rq->h_nr_running++;
- if (cfs_rq_throttled(cfs_rq))
- break;
+ update_load_avg(cfs_rq, se, UPDATE_TG);
+ se_update_runnable(se);
+ update_cfs_group(se);
- update_cfs_shares(cfs_rq);
- update_entity_load_avg(se, 1);
- }
+ se->slice = slice;
+ if (se != cfs_rq->curr)
+ min_vruntime_cb_propagate(&se->run_node, NULL);
+ slice = cfs_rq_min_slice(cfs_rq);
- if (!se) {
- update_rq_runnable_avg(rq, rq->nr_running);
- add_nr_running(rq, 1);
+ cfs_rq->h_nr_runnable += h_nr_runnable;
+ cfs_rq->h_nr_queued++;
+ cfs_rq->h_nr_idle += h_nr_idle;
+
+ if (cfs_rq_is_idle(cfs_rq))
+ h_nr_idle = 1;
}
+
+ if (!rq_h_nr_queued && rq->cfs.h_nr_queued)
+ dl_server_start(&rq->fair_server);
+
+ /* At this point se is NULL and we are at root level*/
+ add_nr_running(rq, 1);
+
+ /*
+ * Since new tasks are assigned an initial util_avg equal to
+ * half of the spare capacity of their CPU, tiny tasks have the
+ * ability to cross the overutilized threshold, which will
+ * result in the load balancer ruining all the task placement
+ * done by EAS. As a way to mitigate that effect, do not account
+ * for the first enqueue operation of new tasks during the
+ * overutilized flag detection.
+ *
+ * A better way of solving this problem would be to wait for
+ * the PELT signals of tasks to converge before taking them
+ * into account, but that is not straightforward to implement,
+ * and the following generally works well enough in practice.
+ */
+ if (!task_new)
+ check_update_overutilized_status(rq);
+
+ assert_list_leaf_cfs_rq(rq);
+
hrtick_update(rq);
}
-static void set_next_buddy(struct sched_entity *se);
-
/*
- * The dequeue_task method is called before nr_running is
- * decreased. We remove the task from the rbtree and
- * update the fair scheduling stats:
+ * Basically dequeue_task_fair(), except it can deal with dequeue_entity()
+ * failing half-way through and resume the dequeue later.
+ *
+ * Returns:
+ * -1 - dequeue delayed
+ * 0 - dequeue throttled
+ * 1 - dequeue complete
*/
-static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
+static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
{
+ bool was_sched_idle = sched_idle_rq(rq);
+ bool task_sleep = flags & DEQUEUE_SLEEP;
+ bool task_delayed = flags & DEQUEUE_DELAYED;
+ bool task_throttled = flags & DEQUEUE_THROTTLE;
+ struct task_struct *p = NULL;
+ int h_nr_idle = 0;
+ int h_nr_queued = 0;
+ int h_nr_runnable = 0;
struct cfs_rq *cfs_rq;
- struct sched_entity *se = &p->se;
- int task_sleep = flags & DEQUEUE_SLEEP;
+ u64 slice = 0;
+
+ if (entity_is_task(se)) {
+ p = task_of(se);
+ h_nr_queued = 1;
+ h_nr_idle = task_has_idle_policy(p);
+ if (task_sleep || task_delayed || !se->sched_delayed)
+ h_nr_runnable = 1;
+ }
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
- dequeue_entity(cfs_rq, se, flags);
- /*
- * end evaluation on encountering a throttled cfs_rq
- *
- * note: in the case of encountering a throttled cfs_rq we will
- * post the final h_nr_running decrement below.
- */
- if (cfs_rq_throttled(cfs_rq))
+ if (!dequeue_entity(cfs_rq, se, flags)) {
+ if (p && &p->se == se)
+ return -1;
+
+ slice = cfs_rq_min_slice(cfs_rq);
break;
- cfs_rq->h_nr_running--;
+ }
+
+ cfs_rq->h_nr_runnable -= h_nr_runnable;
+ cfs_rq->h_nr_queued -= h_nr_queued;
+ cfs_rq->h_nr_idle -= h_nr_idle;
+
+ if (cfs_rq_is_idle(cfs_rq))
+ h_nr_idle = h_nr_queued;
+
+ if (throttled_hierarchy(cfs_rq) && task_throttled)
+ record_throttle_clock(cfs_rq);
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight) {
+ slice = cfs_rq_min_slice(cfs_rq);
+
+ /* Avoid re-evaluating load for this entity: */
+ se = parent_entity(se);
/*
* Bias pick_next to pick a task from this cfs_rq, as
* p is sleeping when it is within its sched_slice.
*/
- if (task_sleep && parent_entity(se))
- set_next_buddy(parent_entity(se));
-
- /* avoid re-evaluating load for this entity */
- se = parent_entity(se);
+ if (task_sleep && se)
+ set_next_buddy(se);
break;
}
flags |= DEQUEUE_SLEEP;
+ flags &= ~(DEQUEUE_DELAYED | DEQUEUE_SPECIAL);
}
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
- cfs_rq->h_nr_running--;
- if (cfs_rq_throttled(cfs_rq))
- break;
+ update_load_avg(cfs_rq, se, UPDATE_TG);
+ se_update_runnable(se);
+ update_cfs_group(se);
- update_cfs_shares(cfs_rq);
- update_entity_load_avg(se, 1);
+ se->slice = slice;
+ if (se != cfs_rq->curr)
+ min_vruntime_cb_propagate(&se->run_node, NULL);
+ slice = cfs_rq_min_slice(cfs_rq);
+
+ cfs_rq->h_nr_runnable -= h_nr_runnable;
+ cfs_rq->h_nr_queued -= h_nr_queued;
+ cfs_rq->h_nr_idle -= h_nr_idle;
+
+ if (cfs_rq_is_idle(cfs_rq))
+ h_nr_idle = h_nr_queued;
+
+ if (throttled_hierarchy(cfs_rq) && task_throttled)
+ record_throttle_clock(cfs_rq);
}
- if (!se) {
- sub_nr_running(rq, 1);
- update_rq_runnable_avg(rq, 1);
+ sub_nr_running(rq, h_nr_queued);
+
+ /* balance early to pull high priority tasks */
+ if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
+ rq->next_balance = jiffies;
+
+ if (p && task_delayed) {
+ WARN_ON_ONCE(!task_sleep);
+ WARN_ON_ONCE(p->on_rq != 1);
+
+ /* Fix-up what dequeue_task_fair() skipped */
+ hrtick_update(rq);
+
+ /*
+ * Fix-up what block_task() skipped.
+ *
+ * Must be last, @p might not be valid after this.
+ */
+ __block_task(rq, p);
}
- hrtick_update(rq);
-}
-#ifdef CONFIG_SMP
-/* Used instead of source_load when we know the type == 0 */
-static unsigned long weighted_cpuload(const int cpu)
-{
- return cpu_rq(cpu)->cfs.runnable_load_avg;
+ return 1;
}
/*
- * Return a low guess at the load of a migration-source cpu weighted
- * according to the scheduling class and "nice" value.
- *
- * We want to under-estimate the load of migration sources, to
- * balance conservatively.
+ * The dequeue_task method is called before nr_running is
+ * decreased. We remove the task from the rbtree and
+ * update the fair scheduling stats:
*/
-static unsigned long source_load(int cpu, int type)
+static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
- struct rq *rq = cpu_rq(cpu);
- unsigned long total = weighted_cpuload(cpu);
+ if (task_is_throttled(p)) {
+ dequeue_throttled_task(p, flags);
+ return true;
+ }
- if (type == 0 || !sched_feat(LB_BIAS))
- return total;
+ if (!p->se.sched_delayed)
+ util_est_dequeue(&rq->cfs, p);
- return min(rq->cpu_load[type-1], total);
+ util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP);
+ if (dequeue_entities(rq, &p->se, flags) < 0)
+ return false;
+
+ /*
+ * Must not reference @p after dequeue_entities(DEQUEUE_DELAYED).
+ */
+
+ hrtick_update(rq);
+ return true;
}
-/*
- * Return a high guess at the load of a migration-target cpu weighted
- * according to the scheduling class and "nice" value.
- */
-static unsigned long target_load(int cpu, int type)
+static inline unsigned int cfs_h_nr_delayed(struct rq *rq)
{
- struct rq *rq = cpu_rq(cpu);
- unsigned long total = weighted_cpuload(cpu);
+ return (rq->cfs.h_nr_queued - rq->cfs.h_nr_runnable);
+}
+
+/* Working cpumask for: sched_balance_rq(), sched_balance_newidle(). */
+static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
+static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask);
+static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask);
+
+#ifdef CONFIG_NO_HZ_COMMON
+
+static struct {
+ cpumask_var_t idle_cpus_mask;
+ atomic_t nr_cpus;
+ int has_blocked; /* Idle CPUS has blocked load */
+ int needs_update; /* Newly idle CPUs need their next_balance collated */
+ unsigned long next_balance; /* in jiffy units */
+ unsigned long next_blocked; /* Next update of blocked load in jiffies */
+} nohz ____cacheline_aligned;
- if (type == 0 || !sched_feat(LB_BIAS))
- return total;
+#endif /* CONFIG_NO_HZ_COMMON */
- return max(rq->cpu_load[type-1], total);
+static unsigned long cpu_load(struct rq *rq)
+{
+ return cfs_rq_load_avg(&rq->cfs);
}
-static unsigned long capacity_of(int cpu)
+/*
+ * cpu_load_without - compute CPU load without any contributions from *p
+ * @cpu: the CPU which load is requested
+ * @p: the task which load should be discounted
+ *
+ * The load of a CPU is defined by the load of tasks currently enqueued on that
+ * CPU as well as tasks which are currently sleeping after an execution on that
+ * CPU.
+ *
+ * This method returns the load of the specified CPU by discounting the load of
+ * the specified task, whenever the task is currently contributing to the CPU
+ * load.
+ */
+static unsigned long cpu_load_without(struct rq *rq, struct task_struct *p)
{
- return cpu_rq(cpu)->cpu_capacity;
+ struct cfs_rq *cfs_rq;
+ unsigned int load;
+
+ /* Task has no contribution or is new */
+ if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
+ return cpu_load(rq);
+
+ cfs_rq = &rq->cfs;
+ load = READ_ONCE(cfs_rq->avg.load_avg);
+
+ /* Discount task's util from CPU's util */
+ lsub_positive(&load, task_h_load(p));
+
+ return load;
}
-static unsigned long capacity_orig_of(int cpu)
+static unsigned long cpu_runnable(struct rq *rq)
{
- return cpu_rq(cpu)->cpu_capacity_orig;
+ return cfs_rq_runnable_avg(&rq->cfs);
}
-static unsigned long cpu_avg_load_per_task(int cpu)
+static unsigned long cpu_runnable_without(struct rq *rq, struct task_struct *p)
{
- struct rq *rq = cpu_rq(cpu);
- unsigned long nr_running = ACCESS_ONCE(rq->cfs.h_nr_running);
- unsigned long load_avg = rq->cfs.runnable_load_avg;
+ struct cfs_rq *cfs_rq;
+ unsigned int runnable;
- if (nr_running)
- return load_avg / nr_running;
+ /* Task has no contribution or is new */
+ if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
+ return cpu_runnable(rq);
- return 0;
+ cfs_rq = &rq->cfs;
+ runnable = READ_ONCE(cfs_rq->avg.runnable_avg);
+
+ /* Discount task's runnable from CPU's runnable */
+ lsub_positive(&runnable, p->se.avg.runnable_avg);
+
+ return runnable;
+}
+
+static unsigned long capacity_of(int cpu)
+{
+ return cpu_rq(cpu)->cpu_capacity;
}
static void record_wakee(struct task_struct *p)
{
/*
- * Rough decay (wiping) for cost saving, don't worry
- * about the boundary, really active task won't care
- * about the loss.
+ * Only decay a single time; tasks that have less then 1 wakeup per
+ * jiffy will not have built up many flips.
*/
if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
current->wakee_flips >>= 1;
@@ -4402,301 +7237,150 @@ static void record_wakee(struct task_struct *p)
}
}
-static void task_waking_fair(struct task_struct *p)
-{
- struct sched_entity *se = &p->se;
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- u64 min_vruntime;
-
-#ifndef CONFIG_64BIT
- u64 min_vruntime_copy;
-
- do {
- min_vruntime_copy = cfs_rq->min_vruntime_copy;
- smp_rmb();
- min_vruntime = cfs_rq->min_vruntime;
- } while (min_vruntime != min_vruntime_copy);
-#else
- min_vruntime = cfs_rq->min_vruntime;
-#endif
-
- se->vruntime -= min_vruntime;
- record_wakee(p);
-}
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
/*
- * effective_load() calculates the load change as seen from the root_task_group
+ * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
*
- * Adding load to a group doesn't make a group heavier, but can cause movement
- * of group shares between cpus. Assuming the shares were perfectly aligned one
- * can calculate the shift in shares.
+ * A waker of many should wake a different task than the one last awakened
+ * at a frequency roughly N times higher than one of its wakees.
*
- * Calculate the effective load difference if @wl is added (subtracted) to @tg
- * on this @cpu and results in a total addition (subtraction) of @wg to the
- * total group weight.
+ * In order to determine whether we should let the load spread vs consolidating
+ * to shared cache, we look for a minimum 'flip' frequency of llc_size in one
+ * partner, and a factor of lls_size higher frequency in the other.
*
- * Given a runqueue weight distribution (rw_i) we can compute a shares
- * distribution (s_i) using:
+ * With both conditions met, we can be relatively sure that the relationship is
+ * non-monogamous, with partner count exceeding socket size.
*
- * s_i = rw_i / \Sum rw_j (1)
- *
- * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
- * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
- * shares distribution (s_i):
- *
- * rw_i = { 2, 4, 1, 0 }
- * s_i = { 2/7, 4/7, 1/7, 0 }
- *
- * As per wake_affine() we're interested in the load of two CPUs (the CPU the
- * task used to run on and the CPU the waker is running on), we need to
- * compute the effect of waking a task on either CPU and, in case of a sync
- * wakeup, compute the effect of the current task going to sleep.
- *
- * So for a change of @wl to the local @cpu with an overall group weight change
- * of @wl we can compute the new shares distribution (s'_i) using:
- *
- * s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2)
- *
- * Suppose we're interested in CPUs 0 and 1, and want to compute the load
- * differences in waking a task to CPU 0. The additional task changes the
- * weight and shares distributions like:
- *
- * rw'_i = { 3, 4, 1, 0 }
- * s'_i = { 3/8, 4/8, 1/8, 0 }
- *
- * We can then compute the difference in effective weight by using:
- *
- * dw_i = S * (s'_i - s_i) (3)
- *
- * Where 'S' is the group weight as seen by its parent.
- *
- * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
- * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
- * 4/7) times the weight of the group.
+ * Waker/wakee being client/server, worker/dispatcher, interrupt source or
+ * whatever is irrelevant, spread criteria is apparent partner count exceeds
+ * socket size.
*/
-static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
+static int wake_wide(struct task_struct *p)
{
- struct sched_entity *se = tg->se[cpu];
-
- if (!tg->parent) /* the trivial, non-cgroup case */
- return wl;
-
- for_each_sched_entity(se) {
- long w, W;
-
- tg = se->my_q->tg;
-
- /*
- * W = @wg + \Sum rw_j
- */
- W = wg + calc_tg_weight(tg, se->my_q);
+ unsigned int master = current->wakee_flips;
+ unsigned int slave = p->wakee_flips;
+ int factor = __this_cpu_read(sd_llc_size);
- /*
- * w = rw_i + @wl
- */
- w = se->my_q->load.weight + wl;
-
- /*
- * wl = S * s'_i; see (2)
- */
- if (W > 0 && w < W)
- wl = (w * (long)tg->shares) / W;
- else
- wl = tg->shares;
-
- /*
- * Per the above, wl is the new se->load.weight value; since
- * those are clipped to [MIN_SHARES, ...) do so now. See
- * calc_cfs_shares().
- */
- if (wl < MIN_SHARES)
- wl = MIN_SHARES;
-
- /*
- * wl = dw_i = S * (s'_i - s_i); see (3)
- */
- wl -= se->load.weight;
-
- /*
- * Recursively apply this logic to all parent groups to compute
- * the final effective load change on the root group. Since
- * only the @tg group gets extra weight, all parent groups can
- * only redistribute existing shares. @wl is the shift in shares
- * resulting from this level per the above.
- */
- wg = 0;
- }
-
- return wl;
-}
-#else
-
-static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
-{
- return wl;
+ if (master < slave)
+ swap(master, slave);
+ if (slave < factor || master < slave * factor)
+ return 0;
+ return 1;
}
-#endif
-
-static int wake_wide(struct task_struct *p)
+/*
+ * The purpose of wake_affine() is to quickly determine on which CPU we can run
+ * soonest. For the purpose of speed we only consider the waking and previous
+ * CPU.
+ *
+ * wake_affine_idle() - only considers 'now', it check if the waking CPU is
+ * cache-affine and is (or will be) idle.
+ *
+ * wake_affine_weight() - considers the weight to reflect the average
+ * scheduling latency of the CPUs. This seems to work
+ * for the overloaded case.
+ */
+static int
+wake_affine_idle(int this_cpu, int prev_cpu, int sync)
{
- int factor = this_cpu_read(sd_llc_size);
-
/*
- * Yeah, it's the switching-frequency, could means many wakee or
- * rapidly switch, use factor here will just help to automatically
- * adjust the loose-degree, so bigger node will lead to more pull.
+ * If this_cpu is idle, it implies the wakeup is from interrupt
+ * context. Only allow the move if cache is shared. Otherwise an
+ * interrupt intensive workload could force all tasks onto one
+ * node depending on the IO topology or IRQ affinity settings.
+ *
+ * If the prev_cpu is idle and cache affine then avoid a migration.
+ * There is no guarantee that the cache hot data from an interrupt
+ * is more important than cache hot data on the prev_cpu and from
+ * a cpufreq perspective, it's better to have higher utilisation
+ * on one CPU.
*/
- if (p->wakee_flips > factor) {
- /*
- * wakee is somewhat hot, it needs certain amount of cpu
- * resource, so if waker is far more hot, prefer to leave
- * it alone.
- */
- if (current->wakee_flips > (factor * p->wakee_flips))
- return 1;
+ if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
+ return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
+
+ if (sync) {
+ struct rq *rq = cpu_rq(this_cpu);
+
+ if ((rq->nr_running - cfs_h_nr_delayed(rq)) == 1)
+ return this_cpu;
}
- return 0;
+ if (available_idle_cpu(prev_cpu))
+ return prev_cpu;
+
+ return nr_cpumask_bits;
}
-static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
+static int
+wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
+ int this_cpu, int prev_cpu, int sync)
{
- s64 this_load, load;
s64 this_eff_load, prev_eff_load;
- int idx, this_cpu, prev_cpu;
- struct task_group *tg;
- unsigned long weight;
- int balanced;
-
- /*
- * If we wake multiple tasks be careful to not bounce
- * ourselves around too much.
- */
- if (wake_wide(p))
- return 0;
+ unsigned long task_load;
- idx = sd->wake_idx;
- this_cpu = smp_processor_id();
- prev_cpu = task_cpu(p);
- load = source_load(prev_cpu, idx);
- this_load = target_load(this_cpu, idx);
+ this_eff_load = cpu_load(cpu_rq(this_cpu));
- /*
- * If sync wakeup then subtract the (maximum possible)
- * effect of the currently running task from the load
- * of the current CPU:
- */
if (sync) {
- tg = task_group(current);
- weight = current->se.load.weight;
+ unsigned long current_load = task_h_load(current);
- this_load += effective_load(tg, this_cpu, -weight, -weight);
- load += effective_load(tg, prev_cpu, 0, -weight);
+ if (current_load > this_eff_load)
+ return this_cpu;
+
+ this_eff_load -= current_load;
}
- tg = task_group(p);
- weight = p->se.load.weight;
+ task_load = task_h_load(p);
- /*
- * In low-load situations, where prev_cpu is idle and this_cpu is idle
- * due to the sync cause above having dropped this_load to 0, we'll
- * always have an imbalance, but there's really nothing you can do
- * about that, so that's good too.
- *
- * Otherwise check if either cpus are near enough in load to allow this
- * task to be woken on this_cpu.
- */
- this_eff_load = 100;
+ this_eff_load += task_load;
+ if (sched_feat(WA_BIAS))
+ this_eff_load *= 100;
this_eff_load *= capacity_of(prev_cpu);
- prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
+ prev_eff_load = cpu_load(cpu_rq(prev_cpu));
+ prev_eff_load -= task_load;
+ if (sched_feat(WA_BIAS))
+ prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
prev_eff_load *= capacity_of(this_cpu);
- if (this_load > 0) {
- this_eff_load *= this_load +
- effective_load(tg, this_cpu, weight, weight);
-
- prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
- }
-
- balanced = this_eff_load <= prev_eff_load;
-
- schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
-
- if (!balanced)
- return 0;
-
- schedstat_inc(sd, ttwu_move_affine);
- schedstat_inc(p, se.statistics.nr_wakeups_affine);
+ /*
+ * If sync, adjust the weight of prev_eff_load such that if
+ * prev_eff == this_eff that select_idle_sibling() will consider
+ * stacking the wakee on top of the waker if no other CPU is
+ * idle.
+ */
+ if (sync)
+ prev_eff_load += 1;
- return 1;
+ return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits;
}
-/*
- * find_idlest_group finds and returns the least busy CPU group within the
- * domain.
- */
-static struct sched_group *
-find_idlest_group(struct sched_domain *sd, struct task_struct *p,
- int this_cpu, int sd_flag)
+static int wake_affine(struct sched_domain *sd, struct task_struct *p,
+ int this_cpu, int prev_cpu, int sync)
{
- struct sched_group *idlest = NULL, *group = sd->groups;
- unsigned long min_load = ULONG_MAX, this_load = 0;
- int load_idx = sd->forkexec_idx;
- int imbalance = 100 + (sd->imbalance_pct-100)/2;
-
- if (sd_flag & SD_BALANCE_WAKE)
- load_idx = sd->wake_idx;
+ int target = nr_cpumask_bits;
- do {
- unsigned long load, avg_load;
- int local_group;
- int i;
+ if (sched_feat(WA_IDLE))
+ target = wake_affine_idle(this_cpu, prev_cpu, sync);
- /* Skip over this group if it has no CPUs allowed */
- if (!cpumask_intersects(sched_group_cpus(group),
- tsk_cpus_allowed(p)))
- continue;
-
- local_group = cpumask_test_cpu(this_cpu,
- sched_group_cpus(group));
+ if (sched_feat(WA_WEIGHT) && target == nr_cpumask_bits)
+ target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
- /* Tally up the load of all CPUs in the group */
- avg_load = 0;
+ schedstat_inc(p->stats.nr_wakeups_affine_attempts);
+ if (target != this_cpu)
+ return prev_cpu;
- for_each_cpu(i, sched_group_cpus(group)) {
- /* Bias balancing toward cpus of our domain */
- if (local_group)
- load = source_load(i, load_idx);
- else
- load = target_load(i, load_idx);
-
- avg_load += load;
- }
-
- /* Adjust by relative CPU capacity of the group */
- avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity;
-
- if (local_group) {
- this_load = avg_load;
- } else if (avg_load < min_load) {
- min_load = avg_load;
- idlest = group;
- }
- } while (group = group->next, group != sd->groups);
-
- if (!idlest || 100*this_load < imbalance*min_load)
- return NULL;
- return idlest;
+ schedstat_inc(sd->ttwu_move_affine);
+ schedstat_inc(p->stats.nr_wakeups_affine);
+ return target;
}
+static struct sched_group *
+sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int this_cpu);
+
/*
- * find_idlest_cpu - find the idlest cpu among the cpus in group.
+ * sched_balance_find_dst_group_cpu - find the idlest CPU among the CPUs in the group.
*/
static int
-find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
+sched_balance_find_dst_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
{
unsigned long load, min_load = ULONG_MAX;
unsigned int min_exit_latency = UINT_MAX;
@@ -4705,10 +7389,21 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
int shallowest_idle_cpu = -1;
int i;
+ /* Check if we have any choice: */
+ if (group->group_weight == 1)
+ return cpumask_first(sched_group_span(group));
+
/* Traverse only the allowed CPUs */
- for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
- if (idle_cpu(i)) {
- struct rq *rq = cpu_rq(i);
+ for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
+ struct rq *rq = cpu_rq(i);
+
+ if (!sched_core_cookie_match(rq, p))
+ continue;
+
+ if (sched_idle_cpu(i))
+ return i;
+
+ if (available_idle_cpu(i)) {
struct cpuidle_state *idle = idle_get_state(rq);
if (idle && idle->exit_latency < min_exit_latency) {
/*
@@ -4730,8 +7425,8 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
shallowest_idle_cpu = i;
}
} else if (shallowest_idle_cpu == -1) {
- load = weighted_cpuload(i);
- if (load < min_load || (load == min_load && i == this_cpu)) {
+ load = cpu_load(cpu_rq(i));
+ if (load < min_load) {
min_load = load;
least_loaded_cpu = i;
}
@@ -4741,300 +7436,1334 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
}
+static inline int sched_balance_find_dst_cpu(struct sched_domain *sd, struct task_struct *p,
+ int cpu, int prev_cpu, int sd_flag)
+{
+ int new_cpu = cpu;
+
+ if (!cpumask_intersects(sched_domain_span(sd), p->cpus_ptr))
+ return prev_cpu;
+
+ /*
+ * We need task's util for cpu_util_without, sync it up to
+ * prev_cpu's last_update_time.
+ */
+ if (!(sd_flag & SD_BALANCE_FORK))
+ sync_entity_load_avg(&p->se);
+
+ while (sd) {
+ struct sched_group *group;
+ struct sched_domain *tmp;
+ int weight;
+
+ if (!(sd->flags & sd_flag)) {
+ sd = sd->child;
+ continue;
+ }
+
+ group = sched_balance_find_dst_group(sd, p, cpu);
+ if (!group) {
+ sd = sd->child;
+ continue;
+ }
+
+ new_cpu = sched_balance_find_dst_group_cpu(group, p, cpu);
+ if (new_cpu == cpu) {
+ /* Now try balancing at a lower domain level of 'cpu': */
+ sd = sd->child;
+ continue;
+ }
+
+ /* Now try balancing at a lower domain level of 'new_cpu': */
+ cpu = new_cpu;
+ weight = sd->span_weight;
+ sd = NULL;
+ for_each_domain(cpu, tmp) {
+ if (weight <= tmp->span_weight)
+ break;
+ if (tmp->flags & sd_flag)
+ sd = tmp;
+ }
+ }
+
+ return new_cpu;
+}
+
+static inline int __select_idle_cpu(int cpu, struct task_struct *p)
+{
+ if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) &&
+ sched_cpu_cookie_match(cpu_rq(cpu), p))
+ return cpu;
+
+ return -1;
+}
+
+#ifdef CONFIG_SCHED_SMT
+DEFINE_STATIC_KEY_FALSE(sched_smt_present);
+EXPORT_SYMBOL_GPL(sched_smt_present);
+
+static inline void set_idle_cores(int cpu, int val)
+{
+ struct sched_domain_shared *sds;
+
+ sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+ if (sds)
+ WRITE_ONCE(sds->has_idle_cores, val);
+}
+
+static inline bool test_idle_cores(int cpu)
+{
+ struct sched_domain_shared *sds;
+
+ sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+ if (sds)
+ return READ_ONCE(sds->has_idle_cores);
+
+ return false;
+}
+
+/*
+ * Scans the local SMT mask to see if the entire core is idle, and records this
+ * information in sd_llc_shared->has_idle_cores.
+ *
+ * Since SMT siblings share all cache levels, inspecting this limited remote
+ * state should be fairly cheap.
+ */
+void __update_idle_core(struct rq *rq)
+{
+ int core = cpu_of(rq);
+ int cpu;
+
+ rcu_read_lock();
+ if (test_idle_cores(core))
+ goto unlock;
+
+ for_each_cpu(cpu, cpu_smt_mask(core)) {
+ if (cpu == core)
+ continue;
+
+ if (!available_idle_cpu(cpu))
+ goto unlock;
+ }
+
+ set_idle_cores(core, 1);
+unlock:
+ rcu_read_unlock();
+}
+
+/*
+ * Scan the entire LLC domain for idle cores; this dynamically switches off if
+ * there are no idle cores left in the system; tracked through
+ * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
+ */
+static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
+{
+ bool idle = true;
+ int cpu;
+
+ for_each_cpu(cpu, cpu_smt_mask(core)) {
+ if (!available_idle_cpu(cpu)) {
+ idle = false;
+ if (*idle_cpu == -1) {
+ if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, cpus)) {
+ *idle_cpu = cpu;
+ break;
+ }
+ continue;
+ }
+ break;
+ }
+ if (*idle_cpu == -1 && cpumask_test_cpu(cpu, cpus))
+ *idle_cpu = cpu;
+ }
+
+ if (idle)
+ return core;
+
+ cpumask_andnot(cpus, cpus, cpu_smt_mask(core));
+ return -1;
+}
+
+/*
+ * Scan the local SMT mask for idle CPUs.
+ */
+static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
+{
+ int cpu;
+
+ for_each_cpu_and(cpu, cpu_smt_mask(target), p->cpus_ptr) {
+ if (cpu == target)
+ continue;
+ /*
+ * Check if the CPU is in the LLC scheduling domain of @target.
+ * Due to isolcpus, there is no guarantee that all the siblings are in the domain.
+ */
+ if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
+ continue;
+ if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
+ return cpu;
+ }
+
+ return -1;
+}
+
+#else /* !CONFIG_SCHED_SMT: */
+
+static inline void set_idle_cores(int cpu, int val)
+{
+}
+
+static inline bool test_idle_cores(int cpu)
+{
+ return false;
+}
+
+static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
+{
+ return __select_idle_cpu(core, p);
+}
+
+static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
+{
+ return -1;
+}
+
+#endif /* !CONFIG_SCHED_SMT */
+
+/*
+ * Scan the LLC domain for idle CPUs; this is dynamically regulated by
+ * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
+ * average idle time for this rq (as found in rq->avg_idle).
+ */
+static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target)
+{
+ struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
+ int i, cpu, idle_cpu = -1, nr = INT_MAX;
+ struct sched_domain_shared *sd_share;
+
+ cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
+
+ if (sched_feat(SIS_UTIL)) {
+ sd_share = rcu_dereference(per_cpu(sd_llc_shared, target));
+ if (sd_share) {
+ /* because !--nr is the condition to stop scan */
+ nr = READ_ONCE(sd_share->nr_idle_scan) + 1;
+ /* overloaded LLC is unlikely to have idle cpu/core */
+ if (nr == 1)
+ return -1;
+ }
+ }
+
+ if (static_branch_unlikely(&sched_cluster_active)) {
+ struct sched_group *sg = sd->groups;
+
+ if (sg->flags & SD_CLUSTER) {
+ for_each_cpu_wrap(cpu, sched_group_span(sg), target + 1) {
+ if (!cpumask_test_cpu(cpu, cpus))
+ continue;
+
+ if (has_idle_core) {
+ i = select_idle_core(p, cpu, cpus, &idle_cpu);
+ if ((unsigned int)i < nr_cpumask_bits)
+ return i;
+ } else {
+ if (--nr <= 0)
+ return -1;
+ idle_cpu = __select_idle_cpu(cpu, p);
+ if ((unsigned int)idle_cpu < nr_cpumask_bits)
+ return idle_cpu;
+ }
+ }
+ cpumask_andnot(cpus, cpus, sched_group_span(sg));
+ }
+ }
+
+ for_each_cpu_wrap(cpu, cpus, target + 1) {
+ if (has_idle_core) {
+ i = select_idle_core(p, cpu, cpus, &idle_cpu);
+ if ((unsigned int)i < nr_cpumask_bits)
+ return i;
+
+ } else {
+ if (--nr <= 0)
+ return -1;
+ idle_cpu = __select_idle_cpu(cpu, p);
+ if ((unsigned int)idle_cpu < nr_cpumask_bits)
+ break;
+ }
+ }
+
+ if (has_idle_core)
+ set_idle_cores(target, false);
+
+ return idle_cpu;
+}
+
+/*
+ * Scan the asym_capacity domain for idle CPUs; pick the first idle one on which
+ * the task fits. If no CPU is big enough, but there are idle ones, try to
+ * maximize capacity.
+ */
+static int
+select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
+{
+ unsigned long task_util, util_min, util_max, best_cap = 0;
+ int fits, best_fits = 0;
+ int cpu, best_cpu = -1;
+ struct cpumask *cpus;
+
+ cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
+ cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
+
+ task_util = task_util_est(p);
+ util_min = uclamp_eff_value(p, UCLAMP_MIN);
+ util_max = uclamp_eff_value(p, UCLAMP_MAX);
+
+ for_each_cpu_wrap(cpu, cpus, target) {
+ unsigned long cpu_cap = capacity_of(cpu);
+
+ if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
+ continue;
+
+ fits = util_fits_cpu(task_util, util_min, util_max, cpu);
+
+ /* This CPU fits with all requirements */
+ if (fits > 0)
+ return cpu;
+ /*
+ * Only the min performance hint (i.e. uclamp_min) doesn't fit.
+ * Look for the CPU with best capacity.
+ */
+ else if (fits < 0)
+ cpu_cap = get_actual_cpu_capacity(cpu);
+
+ /*
+ * First, select CPU which fits better (-1 being better than 0).
+ * Then, select the one with best capacity at same level.
+ */
+ if ((fits < best_fits) ||
+ ((fits == best_fits) && (cpu_cap > best_cap))) {
+ best_cap = cpu_cap;
+ best_cpu = cpu;
+ best_fits = fits;
+ }
+ }
+
+ return best_cpu;
+}
+
+static inline bool asym_fits_cpu(unsigned long util,
+ unsigned long util_min,
+ unsigned long util_max,
+ int cpu)
+{
+ if (sched_asym_cpucap_active())
+ /*
+ * Return true only if the cpu fully fits the task requirements
+ * which include the utilization and the performance hints.
+ */
+ return (util_fits_cpu(util, util_min, util_max, cpu) > 0);
+
+ return true;
+}
+
/*
- * Try and locate an idle CPU in the sched_domain.
+ * Try and locate an idle core/thread in the LLC cache domain.
*/
-static int select_idle_sibling(struct task_struct *p, int target)
+static int select_idle_sibling(struct task_struct *p, int prev, int target)
{
+ bool has_idle_core = false;
struct sched_domain *sd;
- struct sched_group *sg;
- int i = task_cpu(p);
+ unsigned long task_util, util_min, util_max;
+ int i, recent_used_cpu, prev_aff = -1;
- if (idle_cpu(target))
+ /*
+ * On asymmetric system, update task utilization because we will check
+ * that the task fits with CPU's capacity.
+ */
+ if (sched_asym_cpucap_active()) {
+ sync_entity_load_avg(&p->se);
+ task_util = task_util_est(p);
+ util_min = uclamp_eff_value(p, UCLAMP_MIN);
+ util_max = uclamp_eff_value(p, UCLAMP_MAX);
+ }
+
+ /*
+ * per-cpu select_rq_mask usage
+ */
+ lockdep_assert_irqs_disabled();
+
+ if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
+ asym_fits_cpu(task_util, util_min, util_max, target))
return target;
/*
- * If the prevous cpu is cache affine and idle, don't be stupid.
+ * If the previous CPU is cache affine and idle, don't be stupid:
*/
- if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
- return i;
+ if (prev != target && cpus_share_cache(prev, target) &&
+ (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
+ asym_fits_cpu(task_util, util_min, util_max, prev)) {
+
+ if (!static_branch_unlikely(&sched_cluster_active) ||
+ cpus_share_resources(prev, target))
+ return prev;
+
+ prev_aff = prev;
+ }
+
+ /*
+ * Allow a per-cpu kthread to stack with the wakee if the
+ * kworker thread and the tasks previous CPUs are the same.
+ * The assumption is that the wakee queued work for the
+ * per-cpu kthread that is now complete and the wakeup is
+ * essentially a sync wakeup. An obvious example of this
+ * pattern is IO completions.
+ */
+ if (is_per_cpu_kthread(current) &&
+ in_task() &&
+ prev == smp_processor_id() &&
+ this_rq()->nr_running <= 1 &&
+ asym_fits_cpu(task_util, util_min, util_max, prev)) {
+ return prev;
+ }
+
+ /* Check a recently used CPU as a potential idle candidate: */
+ recent_used_cpu = p->recent_used_cpu;
+ p->recent_used_cpu = prev;
+ if (recent_used_cpu != prev &&
+ recent_used_cpu != target &&
+ cpus_share_cache(recent_used_cpu, target) &&
+ (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
+ cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) &&
+ asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) {
+
+ if (!static_branch_unlikely(&sched_cluster_active) ||
+ cpus_share_resources(recent_used_cpu, target))
+ return recent_used_cpu;
+
+ } else {
+ recent_used_cpu = -1;
+ }
/*
- * Otherwise, iterate the domains and find an elegible idle cpu.
+ * For asymmetric CPU capacity systems, our domain of interest is
+ * sd_asym_cpucapacity rather than sd_llc.
*/
+ if (sched_asym_cpucap_active()) {
+ sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target));
+ /*
+ * On an asymmetric CPU capacity system where an exclusive
+ * cpuset defines a symmetric island (i.e. one unique
+ * capacity_orig value through the cpuset), the key will be set
+ * but the CPUs within that cpuset will not have a domain with
+ * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric
+ * capacity path.
+ */
+ if (sd) {
+ i = select_idle_capacity(p, sd, target);
+ return ((unsigned)i < nr_cpumask_bits) ? i : target;
+ }
+ }
+
sd = rcu_dereference(per_cpu(sd_llc, target));
- for_each_lower_domain(sd) {
- sg = sd->groups;
- do {
- if (!cpumask_intersects(sched_group_cpus(sg),
- tsk_cpus_allowed(p)))
- goto next;
+ if (!sd)
+ return target;
- for_each_cpu(i, sched_group_cpus(sg)) {
- if (i == target || !idle_cpu(i))
- goto next;
- }
+ if (sched_smt_active()) {
+ has_idle_core = test_idle_cores(target);
- target = cpumask_first_and(sched_group_cpus(sg),
- tsk_cpus_allowed(p));
- goto done;
-next:
- sg = sg->next;
- } while (sg != sd->groups);
+ if (!has_idle_core && cpus_share_cache(prev, target)) {
+ i = select_idle_smt(p, sd, prev);
+ if ((unsigned int)i < nr_cpumask_bits)
+ return i;
+ }
}
-done:
+
+ i = select_idle_cpu(p, sd, has_idle_core, target);
+ if ((unsigned)i < nr_cpumask_bits)
+ return i;
+
+ /*
+ * For cluster machines which have lower sharing cache like L2 or
+ * LLC Tag, we tend to find an idle CPU in the target's cluster
+ * first. But prev_cpu or recent_used_cpu may also be a good candidate,
+ * use them if possible when no idle CPU found in select_idle_cpu().
+ */
+ if ((unsigned int)prev_aff < nr_cpumask_bits)
+ return prev_aff;
+ if ((unsigned int)recent_used_cpu < nr_cpumask_bits)
+ return recent_used_cpu;
+
return target;
}
+
+/**
+ * cpu_util() - Estimates the amount of CPU capacity used by CFS tasks.
+ * @cpu: the CPU to get the utilization for
+ * @p: task for which the CPU utilization should be predicted or NULL
+ * @dst_cpu: CPU @p migrates to, -1 if @p moves from @cpu or @p == NULL
+ * @boost: 1 to enable boosting, otherwise 0
+ *
+ * The unit of the return value must be the same as the one of CPU capacity
+ * so that CPU utilization can be compared with CPU capacity.
+ *
+ * CPU utilization is the sum of running time of runnable tasks plus the
+ * recent utilization of currently non-runnable tasks on that CPU.
+ * It represents the amount of CPU capacity currently used by CFS tasks in
+ * the range [0..max CPU capacity] with max CPU capacity being the CPU
+ * capacity at f_max.
+ *
+ * The estimated CPU utilization is defined as the maximum between CPU
+ * utilization and sum of the estimated utilization of the currently
+ * runnable tasks on that CPU. It preserves a utilization "snapshot" of
+ * previously-executed tasks, which helps better deduce how busy a CPU will
+ * be when a long-sleeping task wakes up. The contribution to CPU utilization
+ * of such a task would be significantly decayed at this point of time.
+ *
+ * Boosted CPU utilization is defined as max(CPU runnable, CPU utilization).
+ * CPU contention for CFS tasks can be detected by CPU runnable > CPU
+ * utilization. Boosting is implemented in cpu_util() so that internal
+ * users (e.g. EAS) can use it next to external users (e.g. schedutil),
+ * latter via cpu_util_cfs_boost().
+ *
+ * CPU utilization can be higher than the current CPU capacity
+ * (f_curr/f_max * max CPU capacity) or even the max CPU capacity because
+ * of rounding errors as well as task migrations or wakeups of new tasks.
+ * CPU utilization has to be capped to fit into the [0..max CPU capacity]
+ * range. Otherwise a group of CPUs (CPU0 util = 121% + CPU1 util = 80%)
+ * could be seen as over-utilized even though CPU1 has 20% of spare CPU
+ * capacity. CPU utilization is allowed to overshoot current CPU capacity
+ * though since this is useful for predicting the CPU capacity required
+ * after task migrations (scheduler-driven DVFS).
+ *
+ * Return: (Boosted) (estimated) utilization for the specified CPU.
+ */
+static unsigned long
+cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost)
+{
+ struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
+ unsigned long util = READ_ONCE(cfs_rq->avg.util_avg);
+ unsigned long runnable;
+
+ if (boost) {
+ runnable = READ_ONCE(cfs_rq->avg.runnable_avg);
+ util = max(util, runnable);
+ }
+
+ /*
+ * If @dst_cpu is -1 or @p migrates from @cpu to @dst_cpu remove its
+ * contribution. If @p migrates from another CPU to @cpu add its
+ * contribution. In all the other cases @cpu is not impacted by the
+ * migration so its util_avg is already correct.
+ */
+ if (p && task_cpu(p) == cpu && dst_cpu != cpu)
+ lsub_positive(&util, task_util(p));
+ else if (p && task_cpu(p) != cpu && dst_cpu == cpu)
+ util += task_util(p);
+
+ if (sched_feat(UTIL_EST)) {
+ unsigned long util_est;
+
+ util_est = READ_ONCE(cfs_rq->avg.util_est);
+
+ /*
+ * During wake-up @p isn't enqueued yet and doesn't contribute
+ * to any cpu_rq(cpu)->cfs.avg.util_est.
+ * If @dst_cpu == @cpu add it to "simulate" cpu_util after @p
+ * has been enqueued.
+ *
+ * During exec (@dst_cpu = -1) @p is enqueued and does
+ * contribute to cpu_rq(cpu)->cfs.util_est.
+ * Remove it to "simulate" cpu_util without @p's contribution.
+ *
+ * Despite the task_on_rq_queued(@p) check there is still a
+ * small window for a possible race when an exec
+ * select_task_rq_fair() races with LB's detach_task().
+ *
+ * detach_task()
+ * deactivate_task()
+ * p->on_rq = TASK_ON_RQ_MIGRATING;
+ * -------------------------------- A
+ * dequeue_task() \
+ * dequeue_task_fair() + Race Time
+ * util_est_dequeue() /
+ * -------------------------------- B
+ *
+ * The additional check "current == p" is required to further
+ * reduce the race window.
+ */
+ if (dst_cpu == cpu)
+ util_est += _task_util_est(p);
+ else if (p && unlikely(task_on_rq_queued(p) || current == p))
+ lsub_positive(&util_est, _task_util_est(p));
+
+ util = max(util, util_est);
+ }
+
+ return min(util, arch_scale_cpu_capacity(cpu));
+}
+
+unsigned long cpu_util_cfs(int cpu)
+{
+ return cpu_util(cpu, NULL, -1, 0);
+}
+
+unsigned long cpu_util_cfs_boost(int cpu)
+{
+ return cpu_util(cpu, NULL, -1, 1);
+}
+
+/*
+ * cpu_util_without: compute cpu utilization without any contributions from *p
+ * @cpu: the CPU which utilization is requested
+ * @p: the task which utilization should be discounted
+ *
+ * The utilization of a CPU is defined by the utilization of tasks currently
+ * enqueued on that CPU as well as tasks which are currently sleeping after an
+ * execution on that CPU.
+ *
+ * This method returns the utilization of the specified CPU by discounting the
+ * utilization of the specified task, whenever the task is currently
+ * contributing to the CPU utilization.
+ */
+static unsigned long cpu_util_without(int cpu, struct task_struct *p)
+{
+ /* Task has no contribution or is new */
+ if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
+ p = NULL;
+
+ return cpu_util(cpu, p, -1, 0);
+}
+
+/*
+ * This function computes an effective utilization for the given CPU, to be
+ * used for frequency selection given the linear relation: f = u * f_max.
+ *
+ * The scheduler tracks the following metrics:
+ *
+ * cpu_util_{cfs,rt,dl,irq}()
+ * cpu_bw_dl()
+ *
+ * Where the cfs,rt and dl util numbers are tracked with the same metric and
+ * synchronized windows and are thus directly comparable.
+ *
+ * The cfs,rt,dl utilization are the running times measured with rq->clock_task
+ * which excludes things like IRQ and steal-time. These latter are then accrued
+ * in the IRQ utilization.
+ *
+ * The DL bandwidth number OTOH is not a measured metric but a value computed
+ * based on the task model parameters and gives the minimal utilization
+ * required to meet deadlines.
+ */
+unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
+ unsigned long *min,
+ unsigned long *max)
+{
+ unsigned long util, irq, scale;
+ struct rq *rq = cpu_rq(cpu);
+
+ scale = arch_scale_cpu_capacity(cpu);
+
+ /*
+ * Early check to see if IRQ/steal time saturates the CPU, can be
+ * because of inaccuracies in how we track these -- see
+ * update_irq_load_avg().
+ */
+ irq = cpu_util_irq(rq);
+ if (unlikely(irq >= scale)) {
+ if (min)
+ *min = scale;
+ if (max)
+ *max = scale;
+ return scale;
+ }
+
+ if (min) {
+ /*
+ * The minimum utilization returns the highest level between:
+ * - the computed DL bandwidth needed with the IRQ pressure which
+ * steals time to the deadline task.
+ * - The minimum performance requirement for CFS and/or RT.
+ */
+ *min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN));
+
+ /*
+ * When an RT task is runnable and uclamp is not used, we must
+ * ensure that the task will run at maximum compute capacity.
+ */
+ if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt))
+ *min = max(*min, scale);
+ }
+
+ /*
+ * Because the time spend on RT/DL tasks is visible as 'lost' time to
+ * CFS tasks and we use the same metric to track the effective
+ * utilization (PELT windows are synchronized) we can directly add them
+ * to obtain the CPU's actual utilization.
+ */
+ util = util_cfs + cpu_util_rt(rq);
+ util += cpu_util_dl(rq);
+
+ /*
+ * The maximum hint is a soft bandwidth requirement, which can be lower
+ * than the actual utilization because of uclamp_max requirements.
+ */
+ if (max)
+ *max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX));
+
+ if (util >= scale)
+ return scale;
+
+ /*
+ * There is still idle time; further improve the number by using the
+ * IRQ metric. Because IRQ/steal time is hidden from the task clock we
+ * need to scale the task numbers:
+ *
+ * max - irq
+ * U' = irq + --------- * U
+ * max
+ */
+ util = scale_irq_capacity(util, irq, scale);
+ util += irq;
+
+ return min(scale, util);
+}
+
+unsigned long sched_cpu_util(int cpu)
+{
+ return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL);
+}
+
+/*
+ * energy_env - Utilization landscape for energy estimation.
+ * @task_busy_time: Utilization contribution by the task for which we test the
+ * placement. Given by eenv_task_busy_time().
+ * @pd_busy_time: Utilization of the whole perf domain without the task
+ * contribution. Given by eenv_pd_busy_time().
+ * @cpu_cap: Maximum CPU capacity for the perf domain.
+ * @pd_cap: Entire perf domain capacity. (pd->nr_cpus * cpu_cap).
+ */
+struct energy_env {
+ unsigned long task_busy_time;
+ unsigned long pd_busy_time;
+ unsigned long cpu_cap;
+ unsigned long pd_cap;
+};
+
/*
- * get_cpu_usage returns the amount of capacity of a CPU that is used by CFS
- * tasks. The unit of the return value must be the one of capacity so we can
- * compare the usage with the capacity of the CPU that is available for CFS
- * task (ie cpu_capacity).
- * cfs.utilization_load_avg is the sum of running time of runnable tasks on a
- * CPU. It represents the amount of utilization of a CPU in the range
- * [0..SCHED_LOAD_SCALE]. The usage of a CPU can't be higher than the full
- * capacity of the CPU because it's about the running time on this CPU.
- * Nevertheless, cfs.utilization_load_avg can be higher than SCHED_LOAD_SCALE
- * because of unfortunate rounding in avg_period and running_load_avg or just
- * after migrating tasks until the average stabilizes with the new running
- * time. So we need to check that the usage stays into the range
- * [0..cpu_capacity_orig] and cap if necessary.
- * Without capping the usage, a group could be seen as overloaded (CPU0 usage
- * at 121% + CPU1 usage at 80%) whereas CPU1 has 20% of available capacity
+ * Compute the task busy time for compute_energy(). This time cannot be
+ * injected directly into effective_cpu_util() because of the IRQ scaling.
+ * The latter only makes sense with the most recent CPUs where the task has
+ * run.
*/
-static int get_cpu_usage(int cpu)
+static inline void eenv_task_busy_time(struct energy_env *eenv,
+ struct task_struct *p, int prev_cpu)
{
- unsigned long usage = cpu_rq(cpu)->cfs.utilization_load_avg;
- unsigned long capacity = capacity_orig_of(cpu);
+ unsigned long busy_time, max_cap = arch_scale_cpu_capacity(prev_cpu);
+ unsigned long irq = cpu_util_irq(cpu_rq(prev_cpu));
- if (usage >= SCHED_LOAD_SCALE)
- return capacity;
+ if (unlikely(irq >= max_cap))
+ busy_time = max_cap;
+ else
+ busy_time = scale_irq_capacity(task_util_est(p), irq, max_cap);
- return (usage * capacity) >> SCHED_LOAD_SHIFT;
+ eenv->task_busy_time = busy_time;
}
/*
- * select_task_rq_fair: Select target runqueue for the waking task in domains
- * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
- * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
+ * Compute the perf_domain (PD) busy time for compute_energy(). Based on the
+ * utilization for each @pd_cpus, it however doesn't take into account
+ * clamping since the ratio (utilization / cpu_capacity) is already enough to
+ * scale the EM reported power consumption at the (eventually clamped)
+ * cpu_capacity.
+ *
+ * The contribution of the task @p for which we want to estimate the
+ * energy cost is removed (by cpu_util()) and must be calculated
+ * separately (see eenv_task_busy_time). This ensures:
*
- * Balances load by selecting the idlest cpu in the idlest group, or under
- * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set.
+ * - A stable PD utilization, no matter which CPU of that PD we want to place
+ * the task on.
*
- * Returns the target cpu number.
+ * - A fair comparison between CPUs as the task contribution (task_util())
+ * will always be the same no matter which CPU utilization we rely on
+ * (util_avg or util_est).
*
- * preempt must be disabled.
+ * Set @eenv busy time for the PD that spans @pd_cpus. This busy time can't
+ * exceed @eenv->pd_cap.
*/
-static int
-select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
+static inline void eenv_pd_busy_time(struct energy_env *eenv,
+ struct cpumask *pd_cpus,
+ struct task_struct *p)
{
- struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
- int cpu = smp_processor_id();
- int new_cpu = cpu;
- int want_affine = 0;
- int sync = wake_flags & WF_SYNC;
+ unsigned long busy_time = 0;
+ int cpu;
- if (sd_flag & SD_BALANCE_WAKE)
- want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
+ for_each_cpu(cpu, pd_cpus) {
+ unsigned long util = cpu_util(cpu, p, -1, 0);
- rcu_read_lock();
- for_each_domain(cpu, tmp) {
- if (!(tmp->flags & SD_LOAD_BALANCE))
- continue;
+ busy_time += effective_cpu_util(cpu, util, NULL, NULL);
+ }
+
+ eenv->pd_busy_time = min(eenv->pd_cap, busy_time);
+}
+
+/*
+ * Compute the maximum utilization for compute_energy() when the task @p
+ * is placed on the cpu @dst_cpu.
+ *
+ * Returns the maximum utilization among @eenv->cpus. This utilization can't
+ * exceed @eenv->cpu_cap.
+ */
+static inline unsigned long
+eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus,
+ struct task_struct *p, int dst_cpu)
+{
+ unsigned long max_util = 0;
+ int cpu;
+
+ for_each_cpu(cpu, pd_cpus) {
+ struct task_struct *tsk = (cpu == dst_cpu) ? p : NULL;
+ unsigned long util = cpu_util(cpu, p, dst_cpu, 1);
+ unsigned long eff_util, min, max;
/*
- * If both cpu and prev_cpu are part of this domain,
- * cpu is a valid SD_WAKE_AFFINE target.
+ * Performance domain frequency: utilization clamping
+ * must be considered since it affects the selection
+ * of the performance domain frequency.
+ * NOTE: in case RT tasks are running, by default the min
+ * utilization can be max OPP.
*/
- if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
- cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
- affine_sd = tmp;
- break;
+ eff_util = effective_cpu_util(cpu, util, &min, &max);
+
+ /* Task's uclamp can modify min and max value */
+ if (tsk && uclamp_is_used()) {
+ min = max(min, uclamp_eff_value(p, UCLAMP_MIN));
+
+ /*
+ * If there is no active max uclamp constraint,
+ * directly use task's one, otherwise keep max.
+ */
+ if (uclamp_rq_is_idle(cpu_rq(cpu)))
+ max = uclamp_eff_value(p, UCLAMP_MAX);
+ else
+ max = max(max, uclamp_eff_value(p, UCLAMP_MAX));
}
- if (tmp->flags & sd_flag)
- sd = tmp;
+ eff_util = sugov_effective_cpu_perf(cpu, eff_util, min, max);
+ max_util = max(max_util, eff_util);
}
- if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync))
- prev_cpu = cpu;
+ return min(max_util, eenv->cpu_cap);
+}
+
+/*
+ * compute_energy(): Use the Energy Model to estimate the energy that @pd would
+ * consume for a given utilization landscape @eenv. When @dst_cpu < 0, the task
+ * contribution is ignored.
+ */
+static inline unsigned long
+compute_energy(struct energy_env *eenv, struct perf_domain *pd,
+ struct cpumask *pd_cpus, struct task_struct *p, int dst_cpu)
+{
+ unsigned long max_util = eenv_pd_max_util(eenv, pd_cpus, p, dst_cpu);
+ unsigned long busy_time = eenv->pd_busy_time;
+ unsigned long energy;
+
+ if (dst_cpu >= 0)
+ busy_time = min(eenv->pd_cap, busy_time + eenv->task_busy_time);
+
+ energy = em_cpu_energy(pd->em_pd, max_util, busy_time, eenv->cpu_cap);
+
+ trace_sched_compute_energy_tp(p, dst_cpu, energy, max_util, busy_time);
- if (sd_flag & SD_BALANCE_WAKE) {
- new_cpu = select_idle_sibling(p, prev_cpu);
+ return energy;
+}
+
+/*
+ * find_energy_efficient_cpu(): Find most energy-efficient target CPU for the
+ * waking task. find_energy_efficient_cpu() looks for the CPU with maximum
+ * spare capacity in each performance domain and uses it as a potential
+ * candidate to execute the task. Then, it uses the Energy Model to figure
+ * out which of the CPU candidates is the most energy-efficient.
+ *
+ * The rationale for this heuristic is as follows. In a performance domain,
+ * all the most energy efficient CPU candidates (according to the Energy
+ * Model) are those for which we'll request a low frequency. When there are
+ * several CPUs for which the frequency request will be the same, we don't
+ * have enough data to break the tie between them, because the Energy Model
+ * only includes active power costs. With this model, if we assume that
+ * frequency requests follow utilization (e.g. using schedutil), the CPU with
+ * the maximum spare capacity in a performance domain is guaranteed to be among
+ * the best candidates of the performance domain.
+ *
+ * In practice, it could be preferable from an energy standpoint to pack
+ * small tasks on a CPU in order to let other CPUs go in deeper idle states,
+ * but that could also hurt our chances to go cluster idle, and we have no
+ * ways to tell with the current Energy Model if this is actually a good
+ * idea or not. So, find_energy_efficient_cpu() basically favors
+ * cluster-packing, and spreading inside a cluster. That should at least be
+ * a good thing for latency, and this is consistent with the idea that most
+ * of the energy savings of EAS come from the asymmetry of the system, and
+ * not so much from breaking the tie between identical CPUs. That's also the
+ * reason why EAS is enabled in the topology code only for systems where
+ * SD_ASYM_CPUCAPACITY is set.
+ *
+ * NOTE: Forkees are not accepted in the energy-aware wake-up path because
+ * they don't have any useful utilization data yet and it's not possible to
+ * forecast their impact on energy consumption. Consequently, they will be
+ * placed by sched_balance_find_dst_cpu() on the least loaded CPU, which might turn out
+ * to be energy-inefficient in some use-cases. The alternative would be to
+ * bias new tasks towards specific types of CPUs first, or to try to infer
+ * their util_avg from the parent task, but those heuristics could hurt
+ * other use-cases too. So, until someone finds a better way to solve this,
+ * let's keep things simple by re-using the existing slow path.
+ */
+static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
+{
+ struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
+ unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
+ unsigned long p_util_min = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MIN) : 0;
+ unsigned long p_util_max = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MAX) : 1024;
+ struct root_domain *rd = this_rq()->rd;
+ int cpu, best_energy_cpu, target = -1;
+ int prev_fits = -1, best_fits = -1;
+ unsigned long best_actual_cap = 0;
+ unsigned long prev_actual_cap = 0;
+ struct sched_domain *sd;
+ struct perf_domain *pd;
+ struct energy_env eenv;
+
+ rcu_read_lock();
+ pd = rcu_dereference(rd->pd);
+ if (!pd)
goto unlock;
- }
- while (sd) {
- struct sched_group *group;
- int weight;
+ /*
+ * Energy-aware wake-up happens on the lowest sched_domain starting
+ * from sd_asym_cpucapacity spanning over this_cpu and prev_cpu.
+ */
+ sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity));
+ while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
+ sd = sd->parent;
+ if (!sd)
+ goto unlock;
- if (!(sd->flags & sd_flag)) {
- sd = sd->child;
- continue;
- }
+ target = prev_cpu;
- group = find_idlest_group(sd, p, cpu, sd_flag);
- if (!group) {
- sd = sd->child;
+ sync_entity_load_avg(&p->se);
+ if (!task_util_est(p) && p_util_min == 0)
+ goto unlock;
+
+ eenv_task_busy_time(&eenv, p, prev_cpu);
+
+ for (; pd; pd = pd->next) {
+ unsigned long util_min = p_util_min, util_max = p_util_max;
+ unsigned long cpu_cap, cpu_actual_cap, util;
+ long prev_spare_cap = -1, max_spare_cap = -1;
+ unsigned long rq_util_min, rq_util_max;
+ unsigned long cur_delta, base_energy;
+ int max_spare_cap_cpu = -1;
+ int fits, max_fits = -1;
+
+ cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask);
+
+ if (cpumask_empty(cpus))
continue;
+
+ /* Account external pressure for the energy estimation */
+ cpu = cpumask_first(cpus);
+ cpu_actual_cap = get_actual_cpu_capacity(cpu);
+
+ eenv.cpu_cap = cpu_actual_cap;
+ eenv.pd_cap = 0;
+
+ for_each_cpu(cpu, cpus) {
+ struct rq *rq = cpu_rq(cpu);
+
+ eenv.pd_cap += cpu_actual_cap;
+
+ if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
+ continue;
+
+ if (!cpumask_test_cpu(cpu, p->cpus_ptr))
+ continue;
+
+ util = cpu_util(cpu, p, cpu, 0);
+ cpu_cap = capacity_of(cpu);
+
+ /*
+ * Skip CPUs that cannot satisfy the capacity request.
+ * IOW, placing the task there would make the CPU
+ * overutilized. Take uclamp into account to see how
+ * much capacity we can get out of the CPU; this is
+ * aligned with sched_cpu_util().
+ */
+ if (uclamp_is_used() && !uclamp_rq_is_idle(rq)) {
+ /*
+ * Open code uclamp_rq_util_with() except for
+ * the clamp() part. I.e.: apply max aggregation
+ * only. util_fits_cpu() logic requires to
+ * operate on non clamped util but must use the
+ * max-aggregated uclamp_{min, max}.
+ */
+ rq_util_min = uclamp_rq_get(rq, UCLAMP_MIN);
+ rq_util_max = uclamp_rq_get(rq, UCLAMP_MAX);
+
+ util_min = max(rq_util_min, p_util_min);
+ util_max = max(rq_util_max, p_util_max);
+ }
+
+ fits = util_fits_cpu(util, util_min, util_max, cpu);
+ if (!fits)
+ continue;
+
+ lsub_positive(&cpu_cap, util);
+
+ if (cpu == prev_cpu) {
+ /* Always use prev_cpu as a candidate. */
+ prev_spare_cap = cpu_cap;
+ prev_fits = fits;
+ } else if ((fits > max_fits) ||
+ ((fits == max_fits) && ((long)cpu_cap > max_spare_cap))) {
+ /*
+ * Find the CPU with the maximum spare capacity
+ * among the remaining CPUs in the performance
+ * domain.
+ */
+ max_spare_cap = cpu_cap;
+ max_spare_cap_cpu = cpu;
+ max_fits = fits;
+ }
}
- new_cpu = find_idlest_cpu(group, p, cpu);
- if (new_cpu == -1 || new_cpu == cpu) {
- /* Now try balancing at a lower domain level of cpu */
- sd = sd->child;
+ if (max_spare_cap_cpu < 0 && prev_spare_cap < 0)
continue;
+
+ eenv_pd_busy_time(&eenv, cpus, p);
+ /* Compute the 'base' energy of the pd, without @p */
+ base_energy = compute_energy(&eenv, pd, cpus, p, -1);
+
+ /* Evaluate the energy impact of using prev_cpu. */
+ if (prev_spare_cap > -1) {
+ prev_delta = compute_energy(&eenv, pd, cpus, p,
+ prev_cpu);
+ /* CPU utilization has changed */
+ if (prev_delta < base_energy)
+ goto unlock;
+ prev_delta -= base_energy;
+ prev_actual_cap = cpu_actual_cap;
+ best_delta = min(best_delta, prev_delta);
}
- /* Now try balancing at a lower domain level of new_cpu */
- cpu = new_cpu;
- weight = sd->span_weight;
- sd = NULL;
- for_each_domain(cpu, tmp) {
- if (weight <= tmp->span_weight)
- break;
- if (tmp->flags & sd_flag)
- sd = tmp;
+ /* Evaluate the energy impact of using max_spare_cap_cpu. */
+ if (max_spare_cap_cpu >= 0 && max_spare_cap > prev_spare_cap) {
+ /* Current best energy cpu fits better */
+ if (max_fits < best_fits)
+ continue;
+
+ /*
+ * Both don't fit performance hint (i.e. uclamp_min)
+ * but best energy cpu has better capacity.
+ */
+ if ((max_fits < 0) &&
+ (cpu_actual_cap <= best_actual_cap))
+ continue;
+
+ cur_delta = compute_energy(&eenv, pd, cpus, p,
+ max_spare_cap_cpu);
+ /* CPU utilization has changed */
+ if (cur_delta < base_energy)
+ goto unlock;
+ cur_delta -= base_energy;
+
+ /*
+ * Both fit for the task but best energy cpu has lower
+ * energy impact.
+ */
+ if ((max_fits > 0) && (best_fits > 0) &&
+ (cur_delta >= best_delta))
+ continue;
+
+ best_delta = cur_delta;
+ best_energy_cpu = max_spare_cap_cpu;
+ best_fits = max_fits;
+ best_actual_cap = cpu_actual_cap;
}
- /* while loop will break here if sd == NULL */
}
+ rcu_read_unlock();
+
+ if ((best_fits > prev_fits) ||
+ ((best_fits > 0) && (best_delta < prev_delta)) ||
+ ((best_fits < 0) && (best_actual_cap > prev_actual_cap)))
+ target = best_energy_cpu;
+
+ return target;
+
unlock:
rcu_read_unlock();
+ return target;
+}
+
+/*
+ * select_task_rq_fair: Select target runqueue for the waking task in domains
+ * that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE,
+ * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
+ *
+ * Balances load by selecting the idlest CPU in the idlest group, or under
+ * certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set.
+ *
+ * Returns the target CPU number.
+ */
+static int
+select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
+{
+ int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
+ struct sched_domain *tmp, *sd = NULL;
+ int cpu = smp_processor_id();
+ int new_cpu = prev_cpu;
+ int want_affine = 0;
+ /* SD_flags and WF_flags share the first nibble */
+ int sd_flag = wake_flags & 0xF;
+
+ /*
+ * required for stable ->cpus_allowed
+ */
+ lockdep_assert_held(&p->pi_lock);
+ if (wake_flags & WF_TTWU) {
+ record_wakee(p);
+
+ if ((wake_flags & WF_CURRENT_CPU) &&
+ cpumask_test_cpu(cpu, p->cpus_ptr))
+ return cpu;
+
+ if (!is_rd_overutilized(this_rq()->rd)) {
+ new_cpu = find_energy_efficient_cpu(p, prev_cpu);
+ if (new_cpu >= 0)
+ return new_cpu;
+ new_cpu = prev_cpu;
+ }
+
+ want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr);
+ }
+
+ rcu_read_lock();
+ for_each_domain(cpu, tmp) {
+ /*
+ * If both 'cpu' and 'prev_cpu' are part of this domain,
+ * cpu is a valid SD_WAKE_AFFINE target.
+ */
+ if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
+ cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
+ if (cpu != prev_cpu)
+ new_cpu = wake_affine(tmp, p, cpu, prev_cpu, sync);
+
+ sd = NULL; /* Prefer wake_affine over balance flags */
+ break;
+ }
+
+ /*
+ * Usually only true for WF_EXEC and WF_FORK, as sched_domains
+ * usually do not have SD_BALANCE_WAKE set. That means wakeup
+ * will usually go to the fast path.
+ */
+ if (tmp->flags & sd_flag)
+ sd = tmp;
+ else if (!want_affine)
+ break;
+ }
+
+ if (unlikely(sd)) {
+ /* Slow path */
+ new_cpu = sched_balance_find_dst_cpu(sd, p, cpu, prev_cpu, sd_flag);
+ } else if (wake_flags & WF_TTWU) { /* XXX always ? */
+ /* Fast path */
+ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
+ }
+ rcu_read_unlock();
+
return new_cpu;
}
/*
- * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
+ * Called immediately before a task is migrated to a new CPU; task_cpu(p) and
* cfs_rq_of(p) references at time of call are still valid and identify the
- * previous cpu. However, the caller only guarantees p->pi_lock is held; no
- * other assumptions, including the state of rq->lock, should be made.
+ * previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
*/
-static void
-migrate_task_rq_fair(struct task_struct *p, int next_cpu)
+static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
{
struct sched_entity *se = &p->se;
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- /*
- * Load tracking: accumulate removed load so that it can be processed
- * when we next update owning cfs_rq under rq->lock. Tasks contribute
- * to blocked load iff they have a positive decay-count. It can never
- * be negative here since on-rq tasks have decay-count == 0.
- */
- if (se->avg.decay_count) {
- se->avg.decay_count = -__synchronize_entity_decay(se);
- atomic_long_add(se->avg.load_avg_contrib,
- &cfs_rq->removed_load);
+ if (!task_on_rq_migrating(p)) {
+ remove_entity_load_avg(se);
+
+ /*
+ * Here, the task's PELT values have been updated according to
+ * the current rq's clock. But if that clock hasn't been
+ * updated in a while, a substantial idle time will be missed,
+ * leading to an inflation after wake-up on the new rq.
+ *
+ * Estimate the missing time from the cfs_rq last_update_time
+ * and update sched_avg to improve the PELT continuity after
+ * migration.
+ */
+ migrate_se_pelt_lag(se);
}
- /* We have migrated, no longer consider this task hot */
- se->exec_start = 0;
+ /* Tell new CPU we are migrated */
+ se->avg.last_update_time = 0;
+
+ update_scan_period(p, new_cpu);
}
-#endif /* CONFIG_SMP */
-static unsigned long
-wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
+static void task_dead_fair(struct task_struct *p)
{
- unsigned long gran = sysctl_sched_wakeup_granularity;
+ struct sched_entity *se = &p->se;
- /*
- * Since its curr running now, convert the gran from real-time
- * to virtual-time in his units.
- *
- * By using 'se' instead of 'curr' we penalize light tasks, so
- * they get preempted easier. That is, if 'se' < 'curr' then
- * the resulting gran will be larger, therefore penalizing the
- * lighter, if otoh 'se' > 'curr' then the resulting gran will
- * be smaller, again penalizing the lighter task.
- *
- * This is especially important for buddies when the leftmost
- * task is higher priority than the buddy.
- */
- return calc_delta_fair(gran, se);
+ if (se->sched_delayed) {
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = task_rq_lock(p, &rf);
+ if (se->sched_delayed) {
+ update_rq_clock(rq);
+ dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+ }
+ task_rq_unlock(rq, p, &rf);
+ }
+
+ remove_entity_load_avg(se);
}
/*
- * Should 'se' preempt 'curr'.
- *
- * |s1
- * |s2
- * |s3
- * g
- * |<--->|c
- *
- * w(c, s1) = -1
- * w(c, s2) = 0
- * w(c, s3) = 1
- *
+ * Set the max capacity the task is allowed to run at for misfit detection.
*/
-static int
-wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
+static void set_task_max_allowed_capacity(struct task_struct *p)
{
- s64 gran, vdiff = curr->vruntime - se->vruntime;
+ struct asym_cap_data *entry;
- if (vdiff <= 0)
- return -1;
+ if (!sched_asym_cpucap_active())
+ return;
- gran = wakeup_gran(curr, se);
- if (vdiff > gran)
- return 1;
+ rcu_read_lock();
+ list_for_each_entry_rcu(entry, &asym_cap_list, link) {
+ cpumask_t *cpumask;
- return 0;
+ cpumask = cpu_capacity_span(entry);
+ if (!cpumask_intersects(p->cpus_ptr, cpumask))
+ continue;
+
+ p->max_allowed_capacity = entry->capacity;
+ break;
+ }
+ rcu_read_unlock();
}
-static void set_last_buddy(struct sched_entity *se)
+static void set_cpus_allowed_fair(struct task_struct *p, struct affinity_context *ctx)
{
- if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
- return;
-
- for_each_sched_entity(se)
- cfs_rq_of(se)->last = se;
+ set_cpus_allowed_common(p, ctx);
+ set_task_max_allowed_capacity(p);
}
static void set_next_buddy(struct sched_entity *se)
{
- if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
- return;
-
- for_each_sched_entity(se)
+ for_each_sched_entity(se) {
+ if (WARN_ON_ONCE(!se->on_rq))
+ return;
+ if (se_is_idle(se))
+ return;
cfs_rq_of(se)->next = se;
+ }
}
-static void set_skip_buddy(struct sched_entity *se)
+enum preempt_wakeup_action {
+ PREEMPT_WAKEUP_NONE, /* No preemption. */
+ PREEMPT_WAKEUP_SHORT, /* Ignore slice protection. */
+ PREEMPT_WAKEUP_PICK, /* Let __pick_eevdf() decide. */
+ PREEMPT_WAKEUP_RESCHED, /* Force reschedule. */
+};
+
+static inline bool
+set_preempt_buddy(struct cfs_rq *cfs_rq, int wake_flags,
+ struct sched_entity *pse, struct sched_entity *se)
{
- for_each_sched_entity(se)
- cfs_rq_of(se)->skip = se;
+ /*
+ * Keep existing buddy if the deadline is sooner than pse.
+ * The older buddy may be cache cold and completely unrelated
+ * to the current wakeup but that is unpredictable where as
+ * obeying the deadline is more in line with EEVDF objectives.
+ */
+ if (cfs_rq->next && entity_before(cfs_rq->next, pse))
+ return false;
+
+ set_next_buddy(pse);
+ return true;
+}
+
+/*
+ * WF_SYNC|WF_TTWU indicates the waker expects to sleep but it is not
+ * strictly enforced because the hint is either misunderstood or
+ * multiple tasks must be woken up.
+ */
+static inline enum preempt_wakeup_action
+preempt_sync(struct rq *rq, int wake_flags,
+ struct sched_entity *pse, struct sched_entity *se)
+{
+ u64 threshold, delta;
+
+ /*
+ * WF_SYNC without WF_TTWU is not expected so warn if it happens even
+ * though it is likely harmless.
+ */
+ WARN_ON_ONCE(!(wake_flags & WF_TTWU));
+
+ threshold = sysctl_sched_migration_cost;
+ delta = rq_clock_task(rq) - se->exec_start;
+ if ((s64)delta < 0)
+ delta = 0;
+
+ /*
+ * WF_RQ_SELECTED implies the tasks are stacking on a CPU when they
+ * could run on other CPUs. Reduce the threshold before preemption is
+ * allowed to an arbitrary lower value as it is more likely (but not
+ * guaranteed) the waker requires the wakee to finish.
+ */
+ if (wake_flags & WF_RQ_SELECTED)
+ threshold >>= 2;
+
+ /*
+ * As WF_SYNC is not strictly obeyed, allow some runtime for batch
+ * wakeups to be issued.
+ */
+ if (entity_before(pse, se) && delta >= threshold)
+ return PREEMPT_WAKEUP_RESCHED;
+
+ return PREEMPT_WAKEUP_NONE;
}
/*
* Preempt the current task with a newly woken task if needed:
*/
-static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
+static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags)
{
- struct task_struct *curr = rq->curr;
- struct sched_entity *se = &curr->se, *pse = &p->se;
- struct cfs_rq *cfs_rq = task_cfs_rq(curr);
- int scale = cfs_rq->nr_running >= sched_nr_latency;
- int next_buddy_marked = 0;
+ enum preempt_wakeup_action preempt_action = PREEMPT_WAKEUP_PICK;
+ struct task_struct *donor = rq->donor;
+ struct sched_entity *se = &donor->se, *pse = &p->se;
+ struct cfs_rq *cfs_rq = task_cfs_rq(donor);
+ int cse_is_idle, pse_is_idle;
if (unlikely(se == pse))
return;
/*
* This is possible from callers such as attach_tasks(), in which we
- * unconditionally check_prempt_curr() after an enqueue (which may have
+ * unconditionally wakeup_preempt() after an enqueue (which may have
* lead to a throttle). This both saves work and prevents false
* next-buddy nomination below.
*/
- if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
+ if (task_is_throttled(p))
return;
- if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
- set_next_buddy(pse);
- next_buddy_marked = 1;
- }
-
/*
* We can come here with TIF_NEED_RESCHED already set from new task
* wake up path.
@@ -5045,113 +8774,182 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
* prevents us from potentially nominating it as a false LAST_BUDDY
* below.
*/
- if (test_tsk_need_resched(curr))
+ if (test_tsk_need_resched(rq->curr))
+ return;
+
+ if (!sched_feat(WAKEUP_PREEMPTION))
return;
- /* Idle tasks are by definition preempted by non-idle tasks. */
- if (unlikely(curr->policy == SCHED_IDLE) &&
- likely(p->policy != SCHED_IDLE))
+ find_matching_se(&se, &pse);
+ WARN_ON_ONCE(!pse);
+
+ cse_is_idle = se_is_idle(se);
+ pse_is_idle = se_is_idle(pse);
+
+ /*
+ * Preempt an idle entity in favor of a non-idle entity (and don't preempt
+ * in the inverse case).
+ */
+ if (cse_is_idle && !pse_is_idle) {
+ /*
+ * When non-idle entity preempt an idle entity,
+ * don't give idle entity slice protection.
+ */
+ preempt_action = PREEMPT_WAKEUP_SHORT;
goto preempt;
+ }
+
+ if (cse_is_idle != pse_is_idle)
+ return;
/*
- * Batch and idle tasks do not preempt non-idle tasks (their preemption
- * is driven by the tick):
+ * BATCH and IDLE tasks do not preempt others.
*/
- if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
+ if (unlikely(!normal_policy(p->policy)))
return;
- find_matching_se(&se, &pse);
- update_curr(cfs_rq_of(se));
- BUG_ON(!pse);
- if (wakeup_preempt_entity(se, pse) == 1) {
+ cfs_rq = cfs_rq_of(se);
+ update_curr(cfs_rq);
+ /*
+ * If @p has a shorter slice than current and @p is eligible, override
+ * current's slice protection in order to allow preemption.
+ */
+ if (sched_feat(PREEMPT_SHORT) && (pse->slice < se->slice)) {
+ preempt_action = PREEMPT_WAKEUP_SHORT;
+ goto pick;
+ }
+
+ /*
+ * Ignore wakee preemption on WF_FORK as it is less likely that
+ * there is shared data as exec often follow fork. Do not
+ * preempt for tasks that are sched_delayed as it would violate
+ * EEVDF to forcibly queue an ineligible task.
+ */
+ if ((wake_flags & WF_FORK) || pse->sched_delayed)
+ return;
+
+ /*
+ * If @p potentially is completing work required by current then
+ * consider preemption.
+ *
+ * Reschedule if waker is no longer eligible. */
+ if (in_task() && !entity_eligible(cfs_rq, se)) {
+ preempt_action = PREEMPT_WAKEUP_RESCHED;
+ goto preempt;
+ }
+
+ /* Prefer picking wakee soon if appropriate. */
+ if (sched_feat(NEXT_BUDDY) &&
+ set_preempt_buddy(cfs_rq, wake_flags, pse, se)) {
+
/*
- * Bias pick_next to pick the sched entity that is
- * triggering this preemption.
+ * Decide whether to obey WF_SYNC hint for a new buddy. Old
+ * buddies are ignored as they may not be relevant to the
+ * waker and less likely to be cache hot.
*/
- if (!next_buddy_marked)
- set_next_buddy(pse);
+ if (wake_flags & WF_SYNC)
+ preempt_action = preempt_sync(rq, wake_flags, pse, se);
+ }
+
+ switch (preempt_action) {
+ case PREEMPT_WAKEUP_NONE:
+ return;
+ case PREEMPT_WAKEUP_RESCHED:
goto preempt;
+ case PREEMPT_WAKEUP_SHORT:
+ fallthrough;
+ case PREEMPT_WAKEUP_PICK:
+ break;
}
+pick:
+ /*
+ * If @p has become the most eligible task, force preemption.
+ */
+ if (__pick_eevdf(cfs_rq, preempt_action != PREEMPT_WAKEUP_SHORT) == pse)
+ goto preempt;
+
+ if (sched_feat(RUN_TO_PARITY))
+ update_protect_slice(cfs_rq, se);
+
return;
preempt:
- resched_curr(rq);
- /*
- * Only set the backward buddy when the current task is still
- * on the rq. This can happen when a wakeup gets interleaved
- * with schedule on the ->pre_schedule() or idle_balance()
- * point, either of which can * drop the rq lock.
- *
- * Also, during early boot the idle thread is in the fair class,
- * for obvious reasons its a bad idea to schedule back to it.
- */
- if (unlikely(!se->on_rq || curr == rq->idle))
- return;
+ if (preempt_action == PREEMPT_WAKEUP_SHORT)
+ cancel_protect_slice(se);
- if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
- set_last_buddy(se);
+ resched_curr_lazy(rq);
}
-static struct task_struct *
-pick_next_task_fair(struct rq *rq, struct task_struct *prev)
+static struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *rf)
{
- struct cfs_rq *cfs_rq = &rq->cfs;
struct sched_entity *se;
+ struct cfs_rq *cfs_rq;
struct task_struct *p;
- int new_tasks;
+ bool throttled;
again:
-#ifdef CONFIG_FAIR_GROUP_SCHED
- if (!cfs_rq->nr_running)
- goto idle;
-
- if (prev->sched_class != &fair_sched_class)
- goto simple;
+ cfs_rq = &rq->cfs;
+ if (!cfs_rq->nr_queued)
+ return NULL;
- /*
- * Because of the set_next_buddy() in dequeue_task_fair() it is rather
- * likely that a next task is from the same cgroup as the current.
- *
- * Therefore attempt to avoid putting and setting the entire cgroup
- * hierarchy, only change the part that actually changes.
- */
+ throttled = false;
do {
- struct sched_entity *curr = cfs_rq->curr;
-
- /*
- * Since we got here without doing put_prev_entity() we also
- * have to consider cfs_rq->curr. If it is still a runnable
- * entity, update_curr() will update its vruntime, otherwise
- * forget we've ever seen it.
- */
- if (curr && curr->on_rq)
+ /* Might not have done put_prev_entity() */
+ if (cfs_rq->curr && cfs_rq->curr->on_rq)
update_curr(cfs_rq);
- else
- curr = NULL;
- /*
- * This call to check_cfs_rq_runtime() will do the throttle and
- * dequeue its entity in the parent(s). Therefore the 'simple'
- * nr_running test will indeed be correct.
- */
- if (unlikely(check_cfs_rq_runtime(cfs_rq)))
- goto simple;
+ throttled |= check_cfs_rq_runtime(cfs_rq);
- se = pick_next_entity(cfs_rq, curr);
+ se = pick_next_entity(rq, cfs_rq);
+ if (!se)
+ goto again;
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);
p = task_of(se);
+ if (unlikely(throttled))
+ task_throttle_setup_work(p);
+ return p;
+}
+
+static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first);
+static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first);
+
+struct task_struct *
+pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+{
+ struct sched_entity *se;
+ struct task_struct *p;
+ int new_tasks;
+
+again:
+ p = pick_task_fair(rq, rf);
+ if (!p)
+ goto idle;
+ se = &p->se;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ if (prev->sched_class != &fair_sched_class)
+ goto simple;
+
+ __put_prev_set_next_dl_server(rq, prev, p);
/*
+ * Because of the set_next_buddy() in dequeue_task_fair() it is rather
+ * likely that a next task is from the same cgroup as the current.
+ *
+ * Therefore attempt to avoid putting and setting the entire cgroup
+ * hierarchy, only change the part that actually changes.
+ *
* Since we haven't yet done put_prev_entity and if the selected task
* is a different task than we started out with, try and touch the
* least amount of cfs_rqs.
*/
if (prev != p) {
struct sched_entity *pse = &prev->se;
+ struct cfs_rq *cfs_rq;
while (!(cfs_rq = is_same_group(se, pse))) {
int se_depth = se->depth;
@@ -5169,54 +8967,62 @@ again:
put_prev_entity(cfs_rq, pse);
set_next_entity(cfs_rq, se);
- }
- if (hrtick_enabled(rq))
- hrtick_start_fair(rq, p);
+ __set_next_task_fair(rq, p, true);
+ }
return p;
+
simple:
- cfs_rq = &rq->cfs;
-#endif
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+ put_prev_set_next_task(rq, prev, p);
+ return p;
- if (!cfs_rq->nr_running)
- goto idle;
+idle:
+ if (rf) {
+ new_tasks = sched_balance_newidle(rq, rf);
- put_prev_task(rq, prev);
+ /*
+ * Because sched_balance_newidle() releases (and re-acquires)
+ * rq->lock, it is possible for any higher priority task to
+ * appear. In that case we must re-start the pick_next_entity()
+ * loop.
+ */
+ if (new_tasks < 0)
+ return RETRY_TASK;
- do {
- se = pick_next_entity(cfs_rq, NULL);
- set_next_entity(cfs_rq, se);
- cfs_rq = group_cfs_rq(se);
- } while (cfs_rq);
+ if (new_tasks > 0)
+ goto again;
+ }
- p = task_of(se);
+ /*
+ * rq is about to be idle, check if we need to update the
+ * lost_idle_time of clock_pelt
+ */
+ update_idle_rq_clock_pelt(rq);
- if (hrtick_enabled(rq))
- hrtick_start_fair(rq, p);
+ return NULL;
+}
- return p;
+static struct task_struct *
+fair_server_pick_task(struct sched_dl_entity *dl_se, struct rq_flags *rf)
+{
+ return pick_task_fair(dl_se->rq, rf);
+}
-idle:
- new_tasks = idle_balance(rq);
- /*
- * Because idle_balance() releases (and re-acquires) rq->lock, it is
- * possible for any higher priority task to appear. In that case we
- * must re-start the pick_next_entity() loop.
- */
- if (new_tasks < 0)
- return RETRY_TASK;
+void fair_server_init(struct rq *rq)
+{
+ struct sched_dl_entity *dl_se = &rq->fair_server;
- if (new_tasks > 0)
- goto again;
+ init_dl_entity(dl_se);
- return NULL;
+ dl_server_init(dl_se, rq, fair_server_pick_task);
}
/*
* Account for a descheduled task:
*/
-static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
+static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct task_struct *next)
{
struct sched_entity *se = &prev->se;
struct cfs_rq *cfs_rq;
@@ -5229,12 +9035,10 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
/*
* sched_yield() is very simple
- *
- * The magic of dealing with the ->skip buddy is in pick_next_entity.
*/
static void yield_task_fair(struct rq *rq)
{
- struct task_struct *curr = rq->curr;
+ struct task_struct *curr = rq->donor;
struct cfs_rq *cfs_rq = task_cfs_rq(curr);
struct sched_entity *se = &curr->se;
@@ -5246,32 +9050,41 @@ static void yield_task_fair(struct rq *rq)
clear_buddies(cfs_rq, se);
- if (curr->policy != SCHED_BATCH) {
- update_rq_clock(rq);
- /*
- * Update run-time statistics of the 'current'.
- */
- update_curr(cfs_rq);
- /*
- * Tell update_rq_clock() that we've just updated,
- * so we don't do microscopic update in schedule()
- * and double the fastpath cost.
- */
- rq_clock_skip_update(rq, true);
- }
+ update_rq_clock(rq);
+ /*
+ * Update run-time statistics of the 'current'.
+ */
+ update_curr(cfs_rq);
+ /*
+ * Tell update_rq_clock() that we've just updated,
+ * so we don't do microscopic update in schedule()
+ * and double the fastpath cost.
+ */
+ rq_clock_skip_update(rq);
- set_skip_buddy(se);
+ /*
+ * Forfeit the remaining vruntime, only if the entity is eligible. This
+ * condition is necessary because in core scheduling we prefer to run
+ * ineligible tasks rather than force idling. If this happens we may
+ * end up in a loop where the core scheduler picks the yielding task,
+ * which yields immediately again; without the condition the vruntime
+ * ends up quickly running away.
+ */
+ if (entity_eligible(cfs_rq, se)) {
+ se->vruntime = se->deadline;
+ se->deadline += calc_delta_fair(se->slice, se);
+ }
}
-static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
+static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
{
struct sched_entity *se = &p->se;
- /* throttled hierarchies are not runnable */
- if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
+ /* !se->on_rq also covers throttled task */
+ if (!se->on_rq)
return false;
- /* Tell the scheduler that we'd really like pse to run next. */
+ /* Tell the scheduler that we'd really like se to run next. */
set_next_buddy(se);
yield_task_fair(rq);
@@ -5279,32 +9092,31 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
return true;
}
-#ifdef CONFIG_SMP
/**************************************************
* Fair scheduling class load-balancing methods.
*
* BASICS
*
* The purpose of load-balancing is to achieve the same basic fairness the
- * per-cpu scheduler provides, namely provide a proportional amount of compute
+ * per-CPU scheduler provides, namely provide a proportional amount of compute
* time to each task. This is expressed in the following equation:
*
* W_i,n/P_i == W_j,n/P_j for all i,j (1)
*
- * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
+ * Where W_i,n is the n-th weight average for CPU i. The instantaneous weight
* W_i,0 is defined as:
*
* W_i,0 = \Sum_j w_i,j (2)
*
- * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
- * is derived from the nice value as per prio_to_weight[].
+ * Where w_i,j is the weight of the j-th runnable task on CPU i. This weight
+ * is derived from the nice value as per sched_prio_to_weight[].
*
* The weight average is an exponential decay average of the instantaneous
* weight:
*
* W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
*
- * C_i is the compute capacity of cpu i, typically it is the
+ * C_i is the compute capacity of CPU i, typically it is the
* fraction of 'recent' time available for SCHED_OTHER task execution. But it
* can also include other factors [XXX].
*
@@ -5325,11 +9137,11 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
* SCHED DOMAINS
*
* In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
- * for all i,j solution, we create a tree of cpus that follows the hardware
+ * for all i,j solution, we create a tree of CPUs that follows the hardware
* topology where each level pairs two lower groups (or better). This results
- * in O(log n) layers. Furthermore we reduce the number of cpus going up the
+ * in O(log n) layers. Furthermore we reduce the number of CPUs going up the
* tree to only the first of the previous level and we decrease the frequency
- * of load-balance at each level inv. proportional to the number of cpus in
+ * of load-balance at each level inversely proportional to the number of CPUs in
* the groups.
*
* This yields:
@@ -5338,7 +9150,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
* \Sum { --- * --- * 2^i } = O(n) (5)
* i = 0 2^i 2^i
* `- size of each group
- * | | `- number of cpus doing load-balance
+ * | | `- number of CPUs doing load-balance
* | `- freq
* `- sum over all levels
*
@@ -5346,11 +9158,11 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
* this makes (5) the runtime complexity of the balancer.
*
* An important property here is that each CPU is still (indirectly) connected
- * to every other cpu in at most O(log n) steps:
+ * to every other CPU in at most O(log n) steps:
*
* The adjacency matrix of the resulting graph is given by:
*
- * log_2 n
+ * log_2 n
* A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6)
* k = 0
*
@@ -5358,7 +9170,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
*
* A^(log_2 n)_i,j != 0 for all i,j (7)
*
- * Showing there's indeed a path between every cpu in at most O(log n) steps.
+ * Showing there's indeed a path between every CPU in at most O(log n) steps.
* The task movement gives a factor of O(m), giving a convergence complexity
* of:
*
@@ -5368,7 +9180,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
* WORK CONSERVING
*
* In order to avoid CPUs going idle while there's still work to do, new idle
- * balancing is more aggressive and has the newly idle cpu iterate up the domain
+ * balancing is more aggressive and has the newly idle CPU iterate up the domain
* tree itself instead of relying on other CPUs to bring it work.
*
* This adds some complexity to both (5) and (8) but it reduces the total idle
@@ -5389,23 +9201,74 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
*
* s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
*
- * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
+ * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on CPU i.
*
* The big problem is S_k, its a global sum needed to compute a local (W_i)
* property.
*
* [XXX write more on how we solve this.. _after_ merging pjt's patches that
* rewrite all of this once again.]
- */
+ */
static unsigned long __read_mostly max_load_balance_interval = HZ/10;
enum fbq_type { regular, remote, all };
+/*
+ * 'group_type' describes the group of CPUs at the moment of load balancing.
+ *
+ * The enum is ordered by pulling priority, with the group with lowest priority
+ * first so the group_type can simply be compared when selecting the busiest
+ * group. See update_sd_pick_busiest().
+ */
+enum group_type {
+ /* The group has spare capacity that can be used to run more tasks. */
+ group_has_spare = 0,
+ /*
+ * The group is fully used and the tasks don't compete for more CPU
+ * cycles. Nevertheless, some tasks might wait before running.
+ */
+ group_fully_busy,
+ /*
+ * One task doesn't fit with CPU's capacity and must be migrated to a
+ * more powerful CPU.
+ */
+ group_misfit_task,
+ /*
+ * Balance SMT group that's fully busy. Can benefit from migration
+ * a task on SMT with busy sibling to another CPU on idle core.
+ */
+ group_smt_balance,
+ /*
+ * SD_ASYM_PACKING only: One local CPU with higher capacity is available,
+ * and the task should be migrated to it instead of running on the
+ * current CPU.
+ */
+ group_asym_packing,
+ /*
+ * The tasks' affinity constraints previously prevented the scheduler
+ * from balancing the load across the system.
+ */
+ group_imbalanced,
+ /*
+ * The CPU is overloaded and can't provide expected CPU cycles to all
+ * tasks.
+ */
+ group_overloaded
+};
+
+enum migration_type {
+ migrate_load = 0,
+ migrate_util,
+ migrate_task,
+ migrate_misfit
+};
+
#define LBF_ALL_PINNED 0x01
#define LBF_NEED_BREAK 0x02
#define LBF_DST_PINNED 0x04
#define LBF_SOME_PINNED 0x08
+#define LBF_ACTIVE_LB 0x10
struct lb_env {
struct sched_domain *sd;
@@ -5430,6 +9293,7 @@ struct lb_env {
unsigned int loop_max;
enum fbq_type fbq_type;
+ enum migration_type migration_type;
struct list_head tasks;
};
@@ -5440,24 +9304,35 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
{
s64 delta;
- lockdep_assert_held(&env->src_rq->lock);
+ lockdep_assert_rq_held(env->src_rq);
if (p->sched_class != &fair_sched_class)
return 0;
- if (unlikely(p->policy == SCHED_IDLE))
+ if (unlikely(task_has_idle_policy(p)))
+ return 0;
+
+ /* SMT siblings share cache */
+ if (env->sd->flags & SD_SHARE_CPUCAPACITY)
return 0;
/*
* Buddy candidates are cache hot:
*/
if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
- (&p->se == cfs_rq_of(&p->se)->next ||
- &p->se == cfs_rq_of(&p->se)->last))
+ (&p->se == cfs_rq_of(&p->se)->next))
return 1;
if (sysctl_sched_migration_cost == -1)
return 1;
+
+ /*
+ * Don't migrate task if the task's cookie does not match
+ * with the destination CPU's core cookie.
+ */
+ if (!sched_core_cookie_match(cpu_rq(env->dst_cpu), p))
+ return 1;
+
if (sysctl_sched_migration_cost == 0)
return 0;
@@ -5467,92 +9342,88 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
}
#ifdef CONFIG_NUMA_BALANCING
-/* Returns true if the destination node has incurred more faults */
-static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
+/*
+ * Returns a positive value, if task migration degrades locality.
+ * Returns 0, if task migration is not affected by locality.
+ * Returns a negative value, if task migration improves locality i.e migration preferred.
+ */
+static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
{
struct numa_group *numa_group = rcu_dereference(p->numa_group);
- int src_nid, dst_nid;
+ unsigned long src_weight, dst_weight;
+ int src_nid, dst_nid, dist;
- if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
- !(env->sd->flags & SD_NUMA)) {
- return false;
- }
+ if (!static_branch_likely(&sched_numa_balancing))
+ return 0;
+
+ if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
+ return 0;
src_nid = cpu_to_node(env->src_cpu);
dst_nid = cpu_to_node(env->dst_cpu);
if (src_nid == dst_nid)
- return false;
-
- if (numa_group) {
- /* Task is already in the group's interleave set. */
- if (node_isset(src_nid, numa_group->active_nodes))
- return false;
-
- /* Task is moving into the group's interleave set. */
- if (node_isset(dst_nid, numa_group->active_nodes))
- return true;
+ return 0;
- return group_faults(p, dst_nid) > group_faults(p, src_nid);
+ /* Migrating away from the preferred node is always bad. */
+ if (src_nid == p->numa_preferred_nid) {
+ if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
+ return 1;
+ else
+ return 0;
}
/* Encourage migration to the preferred node. */
if (dst_nid == p->numa_preferred_nid)
- return true;
-
- return task_faults(p, dst_nid) > task_faults(p, src_nid);
-}
-
-
-static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
-{
- struct numa_group *numa_group = rcu_dereference(p->numa_group);
- int src_nid, dst_nid;
-
- if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
- return false;
-
- if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
- return false;
-
- src_nid = cpu_to_node(env->src_cpu);
- dst_nid = cpu_to_node(env->dst_cpu);
+ return -1;
- if (src_nid == dst_nid)
- return false;
+ /* Leaving a core idle is often worse than degrading locality. */
+ if (env->idle == CPU_IDLE)
+ return 0;
+ dist = node_distance(src_nid, dst_nid);
if (numa_group) {
- /* Task is moving within/into the group's interleave set. */
- if (node_isset(dst_nid, numa_group->active_nodes))
- return false;
-
- /* Task is moving out of the group's interleave set. */
- if (node_isset(src_nid, numa_group->active_nodes))
- return true;
-
- return group_faults(p, dst_nid) < group_faults(p, src_nid);
+ src_weight = group_weight(p, src_nid, dist);
+ dst_weight = group_weight(p, dst_nid, dist);
+ } else {
+ src_weight = task_weight(p, src_nid, dist);
+ dst_weight = task_weight(p, dst_nid, dist);
}
- /* Migrating away from the preferred node is always bad. */
- if (src_nid == p->numa_preferred_nid)
- return true;
-
- return task_faults(p, dst_nid) < task_faults(p, src_nid);
+ return src_weight - dst_weight;
}
-#else
-static inline bool migrate_improves_locality(struct task_struct *p,
+#else /* !CONFIG_NUMA_BALANCING: */
+static inline long migrate_degrades_locality(struct task_struct *p,
struct lb_env *env)
{
- return false;
+ return 0;
}
+#endif /* !CONFIG_NUMA_BALANCING */
-static inline bool migrate_degrades_locality(struct task_struct *p,
- struct lb_env *env)
+/*
+ * Check whether the task is ineligible on the destination cpu
+ *
+ * When the PLACE_LAG scheduling feature is enabled and
+ * dst_cfs_rq->nr_queued is greater than 1, if the task
+ * is ineligible, it will also be ineligible when
+ * it is migrated to the destination cpu.
+ */
+static inline int task_is_ineligible_on_dst_cpu(struct task_struct *p, int dest_cpu)
{
- return false;
-}
+ struct cfs_rq *dst_cfs_rq;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ dst_cfs_rq = task_group(p)->cfs_rq[dest_cpu];
+#else
+ dst_cfs_rq = &cpu_rq(dest_cpu)->cfs;
#endif
+ if (sched_feat(PLACE_LAG) && dst_cfs_rq->nr_queued &&
+ !entity_eligible(task_cfs_rq(p), &p->se))
+ return 1;
+
+ return 0;
+}
/*
* can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
@@ -5560,78 +9431,108 @@ static inline bool migrate_degrades_locality(struct task_struct *p,
static
int can_migrate_task(struct task_struct *p, struct lb_env *env)
{
- int tsk_cache_hot = 0;
+ long degrades, hot;
- lockdep_assert_held(&env->src_rq->lock);
+ lockdep_assert_rq_held(env->src_rq);
+ if (p->sched_task_hot)
+ p->sched_task_hot = 0;
/*
* We do not migrate tasks that are:
- * 1) throttled_lb_pair, or
- * 2) cannot be migrated to this CPU due to cpus_allowed, or
- * 3) running (obviously), or
- * 4) are cache-hot on their current CPU.
+ * 1) delayed dequeued unless we migrate load, or
+ * 2) target cfs_rq is in throttled hierarchy, or
+ * 3) cannot be migrated to this CPU due to cpus_ptr, or
+ * 4) running (obviously), or
+ * 5) are cache-hot on their current CPU, or
+ * 6) are blocked on mutexes (if SCHED_PROXY_EXEC is enabled)
+ */
+ if ((p->se.sched_delayed) && (env->migration_type != migrate_load))
+ return 0;
+
+ if (lb_throttled_hierarchy(p, env->dst_cpu))
+ return 0;
+
+ /*
+ * We want to prioritize the migration of eligible tasks.
+ * For ineligible tasks we soft-limit them and only allow
+ * them to migrate when nr_balance_failed is non-zero to
+ * avoid load-balancing trying very hard to balance the load.
*/
- if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
+ if (!env->sd->nr_balance_failed &&
+ task_is_ineligible_on_dst_cpu(p, env->dst_cpu))
+ return 0;
+
+ /* Disregard percpu kthreads; they are where they need to be. */
+ if (kthread_is_per_cpu(p))
+ return 0;
+
+ if (task_is_blocked(p))
return 0;
- if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
+ if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
int cpu;
- schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
+ schedstat_inc(p->stats.nr_failed_migrations_affine);
env->flags |= LBF_SOME_PINNED;
/*
- * Remember if this task can be migrated to any other cpu in
+ * Remember if this task can be migrated to any other CPU in
* our sched_group. We may want to revisit it if we couldn't
* meet load balance goals by pulling other tasks on src_cpu.
*
- * Also avoid computing new_dst_cpu if we have already computed
- * one in current iteration.
+ * Avoid computing new_dst_cpu
+ * - for NEWLY_IDLE
+ * - if we have already computed one in current iteration
+ * - if it's an active balance
*/
- if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
+ if (env->idle == CPU_NEWLY_IDLE ||
+ env->flags & (LBF_DST_PINNED | LBF_ACTIVE_LB))
return 0;
- /* Prevent to re-select dst_cpu via env's cpus */
- for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
- if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
- env->flags |= LBF_DST_PINNED;
- env->new_dst_cpu = cpu;
- break;
- }
+ /* Prevent to re-select dst_cpu via env's CPUs: */
+ cpu = cpumask_first_and_and(env->dst_grpmask, env->cpus, p->cpus_ptr);
+
+ if (cpu < nr_cpu_ids) {
+ env->flags |= LBF_DST_PINNED;
+ env->new_dst_cpu = cpu;
}
return 0;
}
- /* Record that we found atleast one task that could run on dst_cpu */
+ /* Record that we found at least one task that could run on dst_cpu */
env->flags &= ~LBF_ALL_PINNED;
- if (task_running(env->src_rq, p)) {
- schedstat_inc(p, se.statistics.nr_failed_migrations_running);
+ if (task_on_cpu(env->src_rq, p) ||
+ task_current_donor(env->src_rq, p)) {
+ schedstat_inc(p->stats.nr_failed_migrations_running);
return 0;
}
/*
* Aggressive migration if:
- * 1) destination numa is preferred
- * 2) task is cache cold, or
- * 3) too many balance attempts have failed.
- */
- tsk_cache_hot = task_hot(p, env);
- if (!tsk_cache_hot)
- tsk_cache_hot = migrate_degrades_locality(p, env);
-
- if (migrate_improves_locality(p, env) || !tsk_cache_hot ||
- env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
- if (tsk_cache_hot) {
- schedstat_inc(env->sd, lb_hot_gained[env->idle]);
- schedstat_inc(p, se.statistics.nr_forced_migrations);
- }
+ * 1) active balance
+ * 2) destination numa is preferred
+ * 3) task is cache cold, or
+ * 4) too many balance attempts have failed.
+ */
+ if (env->flags & LBF_ACTIVE_LB)
+ return 1;
+
+ degrades = migrate_degrades_locality(p, env);
+ if (!degrades)
+ hot = task_hot(p, env);
+ else
+ hot = degrades > 0;
+
+ if (!hot || env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
+ if (hot)
+ p->sched_task_hot = 1;
return 1;
}
- schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
+ schedstat_inc(p->stats.nr_failed_migrations_hot);
return 0;
}
@@ -5640,10 +9541,18 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
*/
static void detach_task(struct task_struct *p, struct lb_env *env)
{
- lockdep_assert_held(&env->src_rq->lock);
+ lockdep_assert_rq_held(env->src_rq);
- deactivate_task(env->src_rq, p, 0);
- p->on_rq = TASK_ON_RQ_MIGRATING;
+ if (p->sched_task_hot) {
+ p->sched_task_hot = 0;
+ schedstat_inc(env->sd->lb_hot_gained[env->idle]);
+ schedstat_inc(p->stats.nr_forced_migrations);
+ }
+
+ WARN_ON(task_current(env->src_rq, p));
+ WARN_ON(task_current_donor(env->src_rq, p));
+
+ deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
set_task_cpu(p, env->dst_cpu);
}
@@ -5655,11 +9564,12 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
*/
static struct task_struct *detach_one_task(struct lb_env *env)
{
- struct task_struct *p, *n;
+ struct task_struct *p;
- lockdep_assert_held(&env->src_rq->lock);
+ lockdep_assert_rq_held(env->src_rq);
- list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
+ list_for_each_entry_reverse(p,
+ &env->src_rq->cfs_tasks, se.group_node) {
if (!can_migrate_task(p, env))
continue;
@@ -5671,16 +9581,14 @@ static struct task_struct *detach_one_task(struct lb_env *env)
* so we can safely collect stats here rather than
* inside detach_tasks().
*/
- schedstat_inc(env->sd, lb_gained[env->idle]);
+ schedstat_inc(env->sd->lb_gained[env->idle]);
return p;
}
return NULL;
}
-static const unsigned int sched_nr_migrate_break = 32;
-
/*
- * detach_tasks() -- tries to detach up to imbalance weighted load from
+ * detach_tasks() -- tries to detach up to imbalance load/util/tasks from
* busiest_rq, as part of a balancing operation within domain "sd".
*
* Returns number of detached tasks if successful and 0 otherwise.
@@ -5688,17 +9596,31 @@ static const unsigned int sched_nr_migrate_break = 32;
static int detach_tasks(struct lb_env *env)
{
struct list_head *tasks = &env->src_rq->cfs_tasks;
+ unsigned long util, load;
struct task_struct *p;
- unsigned long load;
int detached = 0;
- lockdep_assert_held(&env->src_rq->lock);
+ lockdep_assert_rq_held(env->src_rq);
+
+ /*
+ * Source run queue has been emptied by another CPU, clear
+ * LBF_ALL_PINNED flag as we will not test any task.
+ */
+ if (env->src_rq->nr_running <= 1) {
+ env->flags &= ~LBF_ALL_PINNED;
+ return 0;
+ }
if (env->imbalance <= 0)
return 0;
while (!list_empty(tasks)) {
- p = list_first_entry(tasks, struct task_struct, se.group_node);
+ /*
+ * We don't want to steal all, otherwise we may be treated likewise,
+ * which could at worst lead to a livelock crash.
+ */
+ if (env->idle && env->src_rq->nr_running <= 1)
+ break;
env->loop++;
/* We've more or less seen every task there is, call it quits */
@@ -5707,29 +9629,71 @@ static int detach_tasks(struct lb_env *env)
/* take a breather every nr_migrate tasks */
if (env->loop > env->loop_break) {
- env->loop_break += sched_nr_migrate_break;
+ env->loop_break += SCHED_NR_MIGRATE_BREAK;
env->flags |= LBF_NEED_BREAK;
break;
}
+ p = list_last_entry(tasks, struct task_struct, se.group_node);
+
if (!can_migrate_task(p, env))
goto next;
- load = task_h_load(p);
+ switch (env->migration_type) {
+ case migrate_load:
+ /*
+ * Depending of the number of CPUs and tasks and the
+ * cgroup hierarchy, task_h_load() can return a null
+ * value. Make sure that env->imbalance decreases
+ * otherwise detach_tasks() will stop only after
+ * detaching up to loop_max tasks.
+ */
+ load = max_t(unsigned long, task_h_load(p), 1);
- if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
- goto next;
+ if (sched_feat(LB_MIN) &&
+ load < 16 && !env->sd->nr_balance_failed)
+ goto next;
- if ((load / 2) > env->imbalance)
- goto next;
+ /*
+ * Make sure that we don't migrate too much load.
+ * Nevertheless, let relax the constraint if
+ * scheduler fails to find a good waiting task to
+ * migrate.
+ */
+ if (shr_bound(load, env->sd->nr_balance_failed) > env->imbalance)
+ goto next;
+
+ env->imbalance -= load;
+ break;
+
+ case migrate_util:
+ util = task_util_est(p);
+
+ if (shr_bound(util, env->sd->nr_balance_failed) > env->imbalance)
+ goto next;
+
+ env->imbalance -= util;
+ break;
+
+ case migrate_task:
+ env->imbalance--;
+ break;
+
+ case migrate_misfit:
+ /* This is not a misfit task */
+ if (task_fits_cpu(p, env->src_cpu))
+ goto next;
+
+ env->imbalance = 0;
+ break;
+ }
detach_task(p, env);
list_add(&p->se.group_node, &env->tasks);
detached++;
- env->imbalance -= load;
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPTION
/*
* NEWIDLE balancing is a source of latency, so preemptible
* kernels will stop after the first task is detached to minimize
@@ -5741,14 +9705,17 @@ static int detach_tasks(struct lb_env *env)
/*
* We only want to steal up to the prescribed amount of
- * weighted load.
+ * load/util/tasks.
*/
if (env->imbalance <= 0)
break;
continue;
next:
- list_move_tail(&p->se.group_node, tasks);
+ if (p->sched_task_hot)
+ schedstat_inc(p->stats.nr_failed_migrations_hot);
+
+ list_move(&p->se.group_node, tasks);
}
/*
@@ -5756,7 +9723,7 @@ next:
* so we can safely collect detach_one_task() stats here rather
* than inside detach_one_task().
*/
- schedstat_add(env->sd, lb_gained[env->idle], detached);
+ schedstat_add(env->sd->lb_gained[env->idle], detached);
return detached;
}
@@ -5766,12 +9733,11 @@ next:
*/
static void attach_task(struct rq *rq, struct task_struct *p)
{
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
- BUG_ON(task_rq(p) != rq);
- p->on_rq = TASK_ON_RQ_QUEUED;
- activate_task(rq, p, 0);
- check_preempt_curr(rq, p, 0);
+ WARN_ON_ONCE(task_rq(p) != rq);
+ activate_task(rq, p, ENQUEUE_NOCLOCK);
+ wakeup_preempt(rq, p, 0);
}
/*
@@ -5780,9 +9746,12 @@ static void attach_task(struct rq *rq, struct task_struct *p)
*/
static void attach_one_task(struct rq *rq, struct task_struct *p)
{
- raw_spin_lock(&rq->lock);
+ struct rq_flags rf;
+
+ rq_lock(rq, &rf);
+ update_rq_clock(rq);
attach_task(rq, p);
- raw_spin_unlock(&rq->lock);
+ rq_unlock(rq, &rf);
}
/*
@@ -5793,8 +9762,10 @@ static void attach_tasks(struct lb_env *env)
{
struct list_head *tasks = &env->tasks;
struct task_struct *p;
+ struct rq_flags rf;
- raw_spin_lock(&env->dst_rq->lock);
+ rq_lock(env->dst_rq, &rf);
+ update_rq_clock(env->dst_rq);
while (!list_empty(tasks)) {
p = list_first_entry(tasks, struct task_struct, se.group_node);
@@ -5803,65 +9774,114 @@ static void attach_tasks(struct lb_env *env)
attach_task(env->dst_rq, p);
}
- raw_spin_unlock(&env->dst_rq->lock);
+ rq_unlock(env->dst_rq, &rf);
}
-#ifdef CONFIG_FAIR_GROUP_SCHED
-/*
- * update tg->load_weight by folding this cpu's load_avg
- */
-static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
+#ifdef CONFIG_NO_HZ_COMMON
+static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
{
- struct sched_entity *se = tg->se[cpu];
- struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
+ if (cfs_rq->avg.load_avg)
+ return true;
- /* throttled entities do not contribute to load */
- if (throttled_hierarchy(cfs_rq))
- return;
+ if (cfs_rq->avg.util_avg)
+ return true;
- update_cfs_rq_blocked_load(cfs_rq, 1);
+ return false;
+}
- if (se) {
- update_entity_load_avg(se, 1);
- /*
- * We pivot on our runnable average having decayed to zero for
- * list removal. This generally implies that all our children
- * have also been removed (modulo rounding error or bandwidth
- * control); however, such cases are rare and we can fix these
- * at enqueue.
- *
- * TODO: fix up out-of-order children on enqueue.
- */
- if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
- list_del_leaf_cfs_rq(cfs_rq);
- } else {
- struct rq *rq = rq_of(cfs_rq);
- update_rq_runnable_avg(rq, rq->nr_running);
- }
+static inline bool others_have_blocked(struct rq *rq)
+{
+ if (cpu_util_rt(rq))
+ return true;
+
+ if (cpu_util_dl(rq))
+ return true;
+
+ if (hw_load_avg(rq))
+ return true;
+
+ if (cpu_util_irq(rq))
+ return true;
+
+ return false;
}
-static void update_blocked_averages(int cpu)
+static inline void update_blocked_load_tick(struct rq *rq)
{
- struct rq *rq = cpu_rq(cpu);
- struct cfs_rq *cfs_rq;
- unsigned long flags;
+ WRITE_ONCE(rq->last_blocked_load_update_tick, jiffies);
+}
+
+static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
+{
+ if (!has_blocked)
+ rq->has_blocked_load = 0;
+}
+#else /* !CONFIG_NO_HZ_COMMON: */
+static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; }
+static inline bool others_have_blocked(struct rq *rq) { return false; }
+static inline void update_blocked_load_tick(struct rq *rq) {}
+static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
+#endif /* !CONFIG_NO_HZ_COMMON */
+
+static bool __update_blocked_others(struct rq *rq, bool *done)
+{
+ bool updated;
+
+ /*
+ * update_load_avg() can call cpufreq_update_util(). Make sure that RT,
+ * DL and IRQ signals have been updated before updating CFS.
+ */
+ updated = update_other_load_avgs(rq);
+
+ if (others_have_blocked(rq))
+ *done = false;
+
+ return updated;
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+static bool __update_blocked_fair(struct rq *rq, bool *done)
+{
+ struct cfs_rq *cfs_rq, *pos;
+ bool decayed = false;
+ int cpu = cpu_of(rq);
- raw_spin_lock_irqsave(&rq->lock, flags);
- update_rq_clock(rq);
/*
* Iterates the task_group tree in a bottom up fashion, see
* list_add_leaf_cfs_rq() for details.
*/
- for_each_leaf_cfs_rq(rq, cfs_rq) {
+ for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
+ struct sched_entity *se;
+
+ if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) {
+ update_tg_load_avg(cfs_rq);
+
+ if (cfs_rq->nr_queued == 0)
+ update_idle_cfs_rq_clock_pelt(cfs_rq);
+
+ if (cfs_rq == &rq->cfs)
+ decayed = true;
+ }
+
+ /* Propagate pending load changes to the parent, if any: */
+ se = cfs_rq->tg->se[cpu];
+ if (se && !skip_blocked_update(se))
+ update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
+
/*
- * Note: We may want to consider periodically releasing
- * rq->lock about these updates so that creating many task
- * groups does not result in continually extending hold time.
+ * There can be a lot of idle CPU cgroups. Don't let fully
+ * decayed cfs_rqs linger on the list.
*/
- __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
+ if (cfs_rq_is_decayed(cfs_rq))
+ list_del_leaf_cfs_rq(cfs_rq);
+
+ /* Don't need periodic decay once load/util_avg are null */
+ if (cfs_rq_has_blocked(cfs_rq))
+ *done = false;
}
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ return decayed;
}
/*
@@ -5879,23 +9899,23 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
if (cfs_rq->last_h_load_update == now)
return;
- cfs_rq->h_load_next = NULL;
+ WRITE_ONCE(cfs_rq->h_load_next, NULL);
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
- cfs_rq->h_load_next = se;
+ WRITE_ONCE(cfs_rq->h_load_next, se);
if (cfs_rq->last_h_load_update == now)
break;
}
if (!se) {
- cfs_rq->h_load = cfs_rq->runnable_load_avg;
+ cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
cfs_rq->last_h_load_update = now;
}
- while ((se = cfs_rq->h_load_next) != NULL) {
+ while ((se = READ_ONCE(cfs_rq->h_load_next)) != NULL) {
load = cfs_rq->h_load;
- load = div64_ul(load * se->avg.load_avg_contrib,
- cfs_rq->runnable_load_avg + 1);
+ load = div64_ul(load * se->avg.load_avg,
+ cfs_rq_load_avg(cfs_rq) + 1);
cfs_rq = group_cfs_rq(se);
cfs_rq->h_load = load;
cfs_rq->last_h_load_update = now;
@@ -5907,43 +9927,66 @@ static unsigned long task_h_load(struct task_struct *p)
struct cfs_rq *cfs_rq = task_cfs_rq(p);
update_cfs_rq_h_load(cfs_rq);
- return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load,
- cfs_rq->runnable_load_avg + 1);
+ return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
+ cfs_rq_load_avg(cfs_rq) + 1);
}
-#else
-static inline void update_blocked_averages(int cpu)
+#else /* !CONFIG_FAIR_GROUP_SCHED: */
+static bool __update_blocked_fair(struct rq *rq, bool *done)
{
+ struct cfs_rq *cfs_rq = &rq->cfs;
+ bool decayed;
+
+ decayed = update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
+ if (cfs_rq_has_blocked(cfs_rq))
+ *done = false;
+
+ return decayed;
}
static unsigned long task_h_load(struct task_struct *p)
{
- return p->se.avg.load_avg_contrib;
+ return p->se.avg.load_avg;
}
-#endif
+#endif /* !CONFIG_FAIR_GROUP_SCHED */
-/********** Helpers for find_busiest_group ************************/
+static void sched_balance_update_blocked_averages(int cpu)
+{
+ bool decayed = false, done = true;
+ struct rq *rq = cpu_rq(cpu);
+ struct rq_flags rf;
-enum group_type {
- group_other = 0,
- group_imbalanced,
- group_overloaded,
-};
+ rq_lock_irqsave(rq, &rf);
+ update_blocked_load_tick(rq);
+ update_rq_clock(rq);
+
+ decayed |= __update_blocked_others(rq, &done);
+ decayed |= __update_blocked_fair(rq, &done);
+
+ update_blocked_load_status(rq, !done);
+ if (decayed)
+ cpufreq_update_util(rq, 0);
+ rq_unlock_irqrestore(rq, &rf);
+}
+
+/********** Helpers for sched_balance_find_src_group ************************/
/*
- * sg_lb_stats - stats of a sched_group required for load_balancing
+ * sg_lb_stats - stats of a sched_group required for load-balancing:
*/
struct sg_lb_stats {
- unsigned long avg_load; /*Avg load across the CPUs of the group */
- unsigned long group_load; /* Total load over the CPUs of the group */
- unsigned long sum_weighted_load; /* Weighted load of group's tasks */
- unsigned long load_per_task;
- unsigned long group_capacity;
- unsigned long group_usage; /* Total usage of the group */
- unsigned int sum_nr_running; /* Nr tasks running in the group */
- unsigned int idle_cpus;
+ unsigned long avg_load; /* Avg load over the CPUs of the group */
+ unsigned long group_load; /* Total load over the CPUs of the group */
+ unsigned long group_capacity; /* Capacity over the CPUs of the group */
+ unsigned long group_util; /* Total utilization over the CPUs of the group */
+ unsigned long group_runnable; /* Total runnable time over the CPUs of the group */
+ unsigned int sum_nr_running; /* Nr of all tasks running in the group */
+ unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */
+ unsigned int idle_cpus; /* Nr of idle CPUs in the group */
unsigned int group_weight;
enum group_type group_type;
- int group_no_capacity;
+ unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
+ unsigned int group_smt_balance; /* Task on busy SMT be moved */
+ unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
unsigned int nr_preferred_running;
@@ -5951,18 +9994,18 @@ struct sg_lb_stats {
};
/*
- * sd_lb_stats - Structure to store the statistics of a sched_domain
- * during load balancing.
+ * sd_lb_stats - stats of a sched_domain required for load-balancing:
*/
struct sd_lb_stats {
- struct sched_group *busiest; /* Busiest group in this sd */
- struct sched_group *local; /* Local group in this sd */
- unsigned long total_load; /* Total load of all groups in sd */
- unsigned long total_capacity; /* Total capacity of all groups in sd */
- unsigned long avg_load; /* Average load across all groups in sd */
-
- struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
- struct sg_lb_stats local_stat; /* Statistics of the local group */
+ struct sched_group *busiest; /* Busiest group in this sd */
+ struct sched_group *local; /* Local group in this sd */
+ unsigned long total_load; /* Total load of all groups in sd */
+ unsigned long total_capacity; /* Total capacity of all groups in sd */
+ unsigned long avg_load; /* Average load across all groups in sd */
+ unsigned int prefer_sibling; /* Tasks should go to sibling first */
+
+ struct sg_lb_stats busiest_stat; /* Statistics of the busiest group */
+ struct sg_lb_stats local_stat; /* Statistics of the local group */
};
static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
@@ -5970,8 +10013,9 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
/*
* Skimp on the clearing to avoid duplicate work. We can avoid clearing
* local_stat because update_sg_lb_stats() does a full clear/assignment.
- * We must however clear busiest_stat::avg_load because
- * update_sd_pick_busiest() reads this before assignment.
+ * We must however set busiest_stat::group_type and
+ * busiest_stat::idle_cpus to the worst busiest group because
+ * update_sd_pick_busiest() reads these before assignment.
*/
*sds = (struct sd_lb_stats){
.busiest = NULL,
@@ -5979,110 +10023,60 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
.total_load = 0UL,
.total_capacity = 0UL,
.busiest_stat = {
- .avg_load = 0UL,
- .sum_nr_running = 0,
- .group_type = group_other,
+ .idle_cpus = UINT_MAX,
+ .group_type = group_has_spare,
},
};
}
-/**
- * get_sd_load_idx - Obtain the load index for a given sched domain.
- * @sd: The sched_domain whose load_idx is to be obtained.
- * @idle: The idle status of the CPU for whose sd load_idx is obtained.
- *
- * Return: The load index.
- */
-static inline int get_sd_load_idx(struct sched_domain *sd,
- enum cpu_idle_type idle)
-{
- int load_idx;
-
- switch (idle) {
- case CPU_NOT_IDLE:
- load_idx = sd->busy_idx;
- break;
-
- case CPU_NEWLY_IDLE:
- load_idx = sd->newidle_idx;
- break;
- default:
- load_idx = sd->idle_idx;
- break;
- }
-
- return load_idx;
-}
-
-static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu)
-{
- if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))
- return sd->smt_gain / sd->span_weight;
-
- return SCHED_CAPACITY_SCALE;
-}
-
-unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
-{
- return default_scale_cpu_capacity(sd, cpu);
-}
-
static unsigned long scale_rt_capacity(int cpu)
{
+ unsigned long max = get_actual_cpu_capacity(cpu);
struct rq *rq = cpu_rq(cpu);
- u64 total, used, age_stamp, avg;
- s64 delta;
+ unsigned long used, free;
+ unsigned long irq;
- /*
- * Since we're reading these variables without serialization make sure
- * we read them once before doing sanity checks on them.
- */
- age_stamp = ACCESS_ONCE(rq->age_stamp);
- avg = ACCESS_ONCE(rq->rt_avg);
- delta = __rq_clock_broken(rq) - age_stamp;
+ irq = cpu_util_irq(rq);
- if (unlikely(delta < 0))
- delta = 0;
+ if (unlikely(irq >= max))
+ return 1;
- total = sched_avg_period() + delta;
+ /*
+ * avg_rt.util_avg and avg_dl.util_avg track binary signals
+ * (running and not running) with weights 0 and 1024 respectively.
+ */
+ used = cpu_util_rt(rq);
+ used += cpu_util_dl(rq);
- used = div_u64(avg, total);
+ if (unlikely(used >= max))
+ return 1;
- if (likely(used < SCHED_CAPACITY_SCALE))
- return SCHED_CAPACITY_SCALE - used;
+ free = max - used;
- return 1;
+ return scale_irq_capacity(free, irq, max);
}
static void update_cpu_capacity(struct sched_domain *sd, int cpu)
{
- unsigned long capacity = SCHED_CAPACITY_SCALE;
+ unsigned long capacity = scale_rt_capacity(cpu);
struct sched_group *sdg = sd->groups;
- if (sched_feat(ARCH_CAPACITY))
- capacity *= arch_scale_cpu_capacity(sd, cpu);
- else
- capacity *= default_scale_cpu_capacity(sd, cpu);
-
- capacity >>= SCHED_CAPACITY_SHIFT;
-
- cpu_rq(cpu)->cpu_capacity_orig = capacity;
-
- capacity *= scale_rt_capacity(cpu);
- capacity >>= SCHED_CAPACITY_SHIFT;
-
if (!capacity)
capacity = 1;
cpu_rq(cpu)->cpu_capacity = capacity;
+ trace_sched_cpu_capacity_tp(cpu_rq(cpu));
+
sdg->sgc->capacity = capacity;
+ sdg->sgc->min_capacity = capacity;
+ sdg->sgc->max_capacity = capacity;
}
void update_group_capacity(struct sched_domain *sd, int cpu)
{
struct sched_domain *child = sd->child;
struct sched_group *group, *sdg = sd->groups;
- unsigned long capacity;
+ unsigned long capacity, min_capacity, max_capacity;
unsigned long interval;
interval = msecs_to_jiffies(sd->balance_interval);
@@ -6095,50 +10089,42 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
}
capacity = 0;
+ min_capacity = ULONG_MAX;
+ max_capacity = 0;
- if (child->flags & SD_OVERLAP) {
+ if (child->flags & SD_NUMA) {
/*
- * SD_OVERLAP domains cannot assume that child groups
+ * SD_NUMA domains cannot assume that child groups
* span the current group.
*/
- for_each_cpu(cpu, sched_group_cpus(sdg)) {
- struct sched_group_capacity *sgc;
- struct rq *rq = cpu_rq(cpu);
-
- /*
- * build_sched_domains() -> init_sched_groups_capacity()
- * gets here before we've attached the domains to the
- * runqueues.
- *
- * Use capacity_of(), which is set irrespective of domains
- * in update_cpu_capacity().
- *
- * This avoids capacity from being 0 and
- * causing divide-by-zero issues on boot.
- */
- if (unlikely(!rq->sd)) {
- capacity += capacity_of(cpu);
- continue;
- }
+ for_each_cpu(cpu, sched_group_span(sdg)) {
+ unsigned long cpu_cap = capacity_of(cpu);
- sgc = rq->sd->groups->sgc;
- capacity += sgc->capacity;
+ capacity += cpu_cap;
+ min_capacity = min(cpu_cap, min_capacity);
+ max_capacity = max(cpu_cap, max_capacity);
}
} else {
/*
- * !SD_OVERLAP domains can assume that child groups
+ * !SD_NUMA domains can assume that child groups
* span the current group.
- */
+ */
group = child->groups;
do {
- capacity += group->sgc->capacity;
+ struct sched_group_capacity *sgc = group->sgc;
+
+ capacity += sgc->capacity;
+ min_capacity = min(sgc->min_capacity, min_capacity);
+ max_capacity = max(sgc->max_capacity, max_capacity);
group = group->next;
} while (group != child->groups);
}
sdg->sgc->capacity = capacity;
+ sdg->sgc->min_capacity = min_capacity;
+ sdg->sgc->max_capacity = max_capacity;
}
/*
@@ -6150,23 +10136,29 @@ static inline int
check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
{
return ((rq->cpu_capacity * sd->imbalance_pct) <
- (rq->cpu_capacity_orig * 100));
+ (arch_scale_cpu_capacity(cpu_of(rq)) * 100));
+}
+
+/* Check if the rq has a misfit task */
+static inline bool check_misfit_status(struct rq *rq)
+{
+ return rq->misfit_task_load;
}
/*
* Group imbalance indicates (and tries to solve) the problem where balancing
- * groups is inadequate due to tsk_cpus_allowed() constraints.
+ * groups is inadequate due to ->cpus_ptr constraints.
*
- * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
- * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
+ * Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a
+ * cpumask covering 1 CPU of the first group and 3 CPUs of the second group.
* Something like:
*
- * { 0 1 2 3 } { 4 5 6 7 }
- * * * * *
+ * { 0 1 2 3 } { 4 5 6 7 }
+ * * * * *
*
* If we were to balance group-wise we'd place two tasks in the first group and
* two tasks in the second group. Clearly this is undesired as it will overload
- * cpu 3 and leave one of the cpus in the second group unused.
+ * cpu 3 and leave one of the CPUs in the second group unused.
*
* The current solution to this issue is detecting the skew in the first group
* by noticing the lower domain failed to reach balance and had difficulty
@@ -6174,7 +10166,7 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
*
* When this is so detected; this group becomes a candidate for busiest; see
* update_sd_pick_busiest(). And calculate_imbalance() and
- * find_busiest_group() avoid some of the usual balance conditions to allow it
+ * sched_balance_find_src_group() avoid some of the usual balance conditions to allow it
* to create an effective group imbalance.
*
* This is a somewhat tricky proposition since the next run might not find the
@@ -6190,9 +10182,9 @@ static inline int sg_imbalanced(struct sched_group *group)
/*
* group_has_capacity returns true if the group has spare capacity that could
* be used by some tasks.
- * We consider that a group has spare capacity if the * number of task is
- * smaller than the number of CPUs or if the usage is lower than the available
- * capacity for CFS tasks.
+ * We consider that a group has spare capacity if the number of task is
+ * smaller than the number of CPUs or if the utilization is lower than the
+ * available capacity for CFS tasks.
* For the latter, we use a threshold to stabilize the state, to take into
* account the variance of the tasks' load and to return true if the available
* capacity in meaningful for the load balancer.
@@ -6200,13 +10192,17 @@ static inline int sg_imbalanced(struct sched_group *group)
* any benefit for the load balance.
*/
static inline bool
-group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
+group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
{
if (sgs->sum_nr_running < sgs->group_weight)
return true;
+ if ((sgs->group_capacity * imbalance_pct) <
+ (sgs->group_runnable * 100))
+ return false;
+
if ((sgs->group_capacity * 100) >
- (sgs->group_usage * env->sd->imbalance_pct))
+ (sgs->group_util * imbalance_pct))
return true;
return false;
@@ -6221,86 +10217,277 @@ group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
* false.
*/
static inline bool
-group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
+group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
{
if (sgs->sum_nr_running <= sgs->group_weight)
return false;
if ((sgs->group_capacity * 100) <
- (sgs->group_usage * env->sd->imbalance_pct))
+ (sgs->group_util * imbalance_pct))
+ return true;
+
+ if ((sgs->group_capacity * imbalance_pct) <
+ (sgs->group_runnable * 100))
return true;
return false;
}
-static enum group_type group_classify(struct lb_env *env,
- struct sched_group *group,
- struct sg_lb_stats *sgs)
+static inline enum
+group_type group_classify(unsigned int imbalance_pct,
+ struct sched_group *group,
+ struct sg_lb_stats *sgs)
{
- if (sgs->group_no_capacity)
+ if (group_is_overloaded(imbalance_pct, sgs))
return group_overloaded;
if (sg_imbalanced(group))
return group_imbalanced;
- return group_other;
+ if (sgs->group_asym_packing)
+ return group_asym_packing;
+
+ if (sgs->group_smt_balance)
+ return group_smt_balance;
+
+ if (sgs->group_misfit_task_load)
+ return group_misfit_task;
+
+ if (!group_has_capacity(imbalance_pct, sgs))
+ return group_fully_busy;
+
+ return group_has_spare;
+}
+
+/**
+ * sched_use_asym_prio - Check whether asym_packing priority must be used
+ * @sd: The scheduling domain of the load balancing
+ * @cpu: A CPU
+ *
+ * Always use CPU priority when balancing load between SMT siblings. When
+ * balancing load between cores, it is not sufficient that @cpu is idle. Only
+ * use CPU priority if the whole core is idle.
+ *
+ * Returns: True if the priority of @cpu must be followed. False otherwise.
+ */
+static bool sched_use_asym_prio(struct sched_domain *sd, int cpu)
+{
+ if (!(sd->flags & SD_ASYM_PACKING))
+ return false;
+
+ if (!sched_smt_active())
+ return true;
+
+ return sd->flags & SD_SHARE_CPUCAPACITY || is_core_idle(cpu);
+}
+
+static inline bool sched_asym(struct sched_domain *sd, int dst_cpu, int src_cpu)
+{
+ /*
+ * First check if @dst_cpu can do asym_packing load balance. Only do it
+ * if it has higher priority than @src_cpu.
+ */
+ return sched_use_asym_prio(sd, dst_cpu) &&
+ sched_asym_prefer(dst_cpu, src_cpu);
+}
+
+/**
+ * sched_group_asym - Check if the destination CPU can do asym_packing balance
+ * @env: The load balancing environment
+ * @sgs: Load-balancing statistics of the candidate busiest group
+ * @group: The candidate busiest group
+ *
+ * @env::dst_cpu can do asym_packing if it has higher priority than the
+ * preferred CPU of @group.
+ *
+ * Return: true if @env::dst_cpu can do with asym_packing load balance. False
+ * otherwise.
+ */
+static inline bool
+sched_group_asym(struct lb_env *env, struct sg_lb_stats *sgs, struct sched_group *group)
+{
+ /*
+ * CPU priorities do not make sense for SMT cores with more than one
+ * busy sibling.
+ */
+ if ((group->flags & SD_SHARE_CPUCAPACITY) &&
+ (sgs->group_weight - sgs->idle_cpus != 1))
+ return false;
+
+ return sched_asym(env->sd, env->dst_cpu, READ_ONCE(group->asym_prefer_cpu));
+}
+
+/* One group has more than one SMT CPU while the other group does not */
+static inline bool smt_vs_nonsmt_groups(struct sched_group *sg1,
+ struct sched_group *sg2)
+{
+ if (!sg1 || !sg2)
+ return false;
+
+ return (sg1->flags & SD_SHARE_CPUCAPACITY) !=
+ (sg2->flags & SD_SHARE_CPUCAPACITY);
+}
+
+static inline bool smt_balance(struct lb_env *env, struct sg_lb_stats *sgs,
+ struct sched_group *group)
+{
+ if (!env->idle)
+ return false;
+
+ /*
+ * For SMT source group, it is better to move a task
+ * to a CPU that doesn't have multiple tasks sharing its CPU capacity.
+ * Note that if a group has a single SMT, SD_SHARE_CPUCAPACITY
+ * will not be on.
+ */
+ if (group->flags & SD_SHARE_CPUCAPACITY &&
+ sgs->sum_h_nr_running > 1)
+ return true;
+
+ return false;
+}
+
+static inline long sibling_imbalance(struct lb_env *env,
+ struct sd_lb_stats *sds,
+ struct sg_lb_stats *busiest,
+ struct sg_lb_stats *local)
+{
+ int ncores_busiest, ncores_local;
+ long imbalance;
+
+ if (!env->idle || !busiest->sum_nr_running)
+ return 0;
+
+ ncores_busiest = sds->busiest->cores;
+ ncores_local = sds->local->cores;
+
+ if (ncores_busiest == ncores_local) {
+ imbalance = busiest->sum_nr_running;
+ lsub_positive(&imbalance, local->sum_nr_running);
+ return imbalance;
+ }
+
+ /* Balance such that nr_running/ncores ratio are same on both groups */
+ imbalance = ncores_local * busiest->sum_nr_running;
+ lsub_positive(&imbalance, ncores_busiest * local->sum_nr_running);
+ /* Normalize imbalance and do rounding on normalization */
+ imbalance = 2 * imbalance + ncores_local + ncores_busiest;
+ imbalance /= ncores_local + ncores_busiest;
+
+ /* Take advantage of resource in an empty sched group */
+ if (imbalance <= 1 && local->sum_nr_running == 0 &&
+ busiest->sum_nr_running > 1)
+ imbalance = 2;
+
+ return imbalance;
+}
+
+static inline bool
+sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
+{
+ /*
+ * When there is more than 1 task, the group_overloaded case already
+ * takes care of cpu with reduced capacity
+ */
+ if (rq->cfs.h_nr_runnable != 1)
+ return false;
+
+ return check_cpu_capacity(rq, sd);
}
/**
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
* @env: The load balancing environment.
+ * @sds: Load-balancing data with statistics of the local group.
* @group: sched_group whose statistics are to be updated.
- * @load_idx: Load index of sched_domain of this_cpu for load calc.
- * @local_group: Does group contain this_cpu.
* @sgs: variable to hold the statistics for this group.
- * @overload: Indicate more than one runnable task for any CPU.
+ * @sg_overloaded: sched_group is overloaded
+ * @sg_overutilized: sched_group is overutilized
*/
static inline void update_sg_lb_stats(struct lb_env *env,
- struct sched_group *group, int load_idx,
- int local_group, struct sg_lb_stats *sgs,
- bool *overload)
+ struct sd_lb_stats *sds,
+ struct sched_group *group,
+ struct sg_lb_stats *sgs,
+ bool *sg_overloaded,
+ bool *sg_overutilized)
{
- unsigned long load;
- int i;
+ int i, nr_running, local_group, sd_flags = env->sd->flags;
+ bool balancing_at_rd = !env->sd->parent;
memset(sgs, 0, sizeof(*sgs));
- for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
- struct rq *rq = cpu_rq(i);
+ local_group = group == sds->local;
- /* Bias balancing toward cpus of our domain */
- if (local_group)
- load = target_load(i, load_idx);
- else
- load = source_load(i, load_idx);
+ for_each_cpu_and(i, sched_group_span(group), env->cpus) {
+ struct rq *rq = cpu_rq(i);
+ unsigned long load = cpu_load(rq);
sgs->group_load += load;
- sgs->group_usage += get_cpu_usage(i);
- sgs->sum_nr_running += rq->cfs.h_nr_running;
+ sgs->group_util += cpu_util_cfs(i);
+ sgs->group_runnable += cpu_runnable(rq);
+ sgs->sum_h_nr_running += rq->cfs.h_nr_runnable;
+
+ nr_running = rq->nr_running;
+ sgs->sum_nr_running += nr_running;
- if (rq->nr_running > 1)
- *overload = true;
+ if (cpu_overutilized(i))
+ *sg_overutilized = 1;
+
+ /*
+ * No need to call idle_cpu() if nr_running is not 0
+ */
+ if (!nr_running && idle_cpu(i)) {
+ sgs->idle_cpus++;
+ /* Idle cpu can't have misfit task */
+ continue;
+ }
+
+ /* Overload indicator is only updated at root domain */
+ if (balancing_at_rd && nr_running > 1)
+ *sg_overloaded = 1;
#ifdef CONFIG_NUMA_BALANCING
- sgs->nr_numa_running += rq->nr_numa_running;
- sgs->nr_preferred_running += rq->nr_preferred_running;
+ /* Only fbq_classify_group() uses this to classify NUMA groups */
+ if (sd_flags & SD_NUMA) {
+ sgs->nr_numa_running += rq->nr_numa_running;
+ sgs->nr_preferred_running += rq->nr_preferred_running;
+ }
#endif
- sgs->sum_weighted_load += weighted_cpuload(i);
- if (idle_cpu(i))
- sgs->idle_cpus++;
+ if (local_group)
+ continue;
+
+ if (sd_flags & SD_ASYM_CPUCAPACITY) {
+ /* Check for a misfit task on the cpu */
+ if (sgs->group_misfit_task_load < rq->misfit_task_load) {
+ sgs->group_misfit_task_load = rq->misfit_task_load;
+ *sg_overloaded = 1;
+ }
+ } else if (env->idle && sched_reduced_capacity(rq, env->sd)) {
+ /* Check for a task running on a CPU with reduced capacity */
+ if (sgs->group_misfit_task_load < load)
+ sgs->group_misfit_task_load = load;
+ }
}
- /* Adjust by relative CPU capacity of the group */
sgs->group_capacity = group->sgc->capacity;
- sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
-
- if (sgs->sum_nr_running)
- sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
sgs->group_weight = group->group_weight;
- sgs->group_no_capacity = group_is_overloaded(env, sgs);
- sgs->group_type = group_classify(env, group, sgs);
+ /* Check if dst CPU is idle and preferred to this group */
+ if (!local_group && env->idle && sgs->sum_h_nr_running &&
+ sched_group_asym(env, sgs, group))
+ sgs->group_asym_packing = 1;
+
+ /* Check for loaded SMT group to be balanced to dst CPU */
+ if (!local_group && smt_balance(env, sgs, group))
+ sgs->group_smt_balance = 1;
+
+ sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
+
+ /* Computing avg_load makes sense only when group is overloaded */
+ if (sgs->group_type == group_overloaded)
+ sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
+ sgs->group_capacity;
}
/**
@@ -6323,41 +10510,144 @@ static bool update_sd_pick_busiest(struct lb_env *env,
{
struct sg_lb_stats *busiest = &sds->busiest_stat;
+ /* Make sure that there is at least one task to pull */
+ if (!sgs->sum_h_nr_running)
+ return false;
+
+ /*
+ * Don't try to pull misfit tasks we can't help.
+ * We can use max_capacity here as reduction in capacity on some
+ * CPUs in the group should either be possible to resolve
+ * internally or be covered by avg_load imbalance (eventually).
+ */
+ if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
+ (sgs->group_type == group_misfit_task) &&
+ (!capacity_greater(capacity_of(env->dst_cpu), sg->sgc->max_capacity) ||
+ sds->local_stat.group_type != group_has_spare))
+ return false;
+
if (sgs->group_type > busiest->group_type)
return true;
if (sgs->group_type < busiest->group_type)
return false;
- if (sgs->avg_load <= busiest->avg_load)
+ /*
+ * The candidate and the current busiest group are the same type of
+ * group. Let check which one is the busiest according to the type.
+ */
+
+ switch (sgs->group_type) {
+ case group_overloaded:
+ /* Select the overloaded group with highest avg_load. */
+ return sgs->avg_load > busiest->avg_load;
+
+ case group_imbalanced:
+ /*
+ * Select the 1st imbalanced group as we don't have any way to
+ * choose one more than another.
+ */
return false;
- /* This is the busiest node in its class. */
- if (!(env->sd->flags & SD_ASYM_PACKING))
- return true;
+ case group_asym_packing:
+ /* Prefer to move from lowest priority CPU's work */
+ return sched_asym_prefer(READ_ONCE(sds->busiest->asym_prefer_cpu),
+ READ_ONCE(sg->asym_prefer_cpu));
- /*
- * ASYM_PACKING needs to move all the work to the lowest
- * numbered CPUs in the group, therefore mark all groups
- * higher than ourself as busy.
- */
- if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) {
- if (!sds->busiest)
- return true;
+ case group_misfit_task:
+ /*
+ * If we have more than one misfit sg go with the biggest
+ * misfit.
+ */
+ return sgs->group_misfit_task_load > busiest->group_misfit_task_load;
- if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
- return true;
+ case group_smt_balance:
+ /*
+ * Check if we have spare CPUs on either SMT group to
+ * choose has spare or fully busy handling.
+ */
+ if (sgs->idle_cpus != 0 || busiest->idle_cpus != 0)
+ goto has_spare;
+
+ fallthrough;
+
+ case group_fully_busy:
+ /*
+ * Select the fully busy group with highest avg_load. In
+ * theory, there is no need to pull task from such kind of
+ * group because tasks have all compute capacity that they need
+ * but we can still improve the overall throughput by reducing
+ * contention when accessing shared HW resources.
+ *
+ * XXX for now avg_load is not computed and always 0 so we
+ * select the 1st one, except if @sg is composed of SMT
+ * siblings.
+ */
+
+ if (sgs->avg_load < busiest->avg_load)
+ return false;
+
+ if (sgs->avg_load == busiest->avg_load) {
+ /*
+ * SMT sched groups need more help than non-SMT groups.
+ * If @sg happens to also be SMT, either choice is good.
+ */
+ if (sds->busiest->flags & SD_SHARE_CPUCAPACITY)
+ return false;
+ }
+
+ break;
+
+ case group_has_spare:
+ /*
+ * Do not pick sg with SMT CPUs over sg with pure CPUs,
+ * as we do not want to pull task off SMT core with one task
+ * and make the core idle.
+ */
+ if (smt_vs_nonsmt_groups(sds->busiest, sg)) {
+ if (sg->flags & SD_SHARE_CPUCAPACITY && sgs->sum_h_nr_running <= 1)
+ return false;
+ else
+ return true;
+ }
+has_spare:
+
+ /*
+ * Select not overloaded group with lowest number of idle CPUs
+ * and highest number of running tasks. We could also compare
+ * the spare capacity which is more stable but it can end up
+ * that the group has less spare capacity but finally more idle
+ * CPUs which means less opportunity to pull tasks.
+ */
+ if (sgs->idle_cpus > busiest->idle_cpus)
+ return false;
+ else if ((sgs->idle_cpus == busiest->idle_cpus) &&
+ (sgs->sum_nr_running <= busiest->sum_nr_running))
+ return false;
+
+ break;
}
- return false;
+ /*
+ * Candidate sg has no more than one task per CPU and has higher
+ * per-CPU capacity. Migrating tasks to less capable CPUs may harm
+ * throughput. Maximize throughput, power/energy consequences are not
+ * considered.
+ */
+ if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
+ (sgs->group_type <= group_fully_busy) &&
+ (capacity_greater(sg->sgc->min_capacity, capacity_of(env->dst_cpu))))
+ return false;
+
+ return true;
}
#ifdef CONFIG_NUMA_BALANCING
static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
{
- if (sgs->sum_nr_running > sgs->nr_numa_running)
+ if (sgs->sum_h_nr_running > sgs->nr_numa_running)
return regular;
- if (sgs->sum_nr_running > sgs->nr_preferred_running)
+ if (sgs->sum_h_nr_running > sgs->nr_preferred_running)
return remote;
return all;
}
@@ -6370,7 +10660,7 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
return remote;
return all;
}
-#else
+#else /* !CONFIG_NUMA_BALANCING: */
static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
{
return all;
@@ -6380,199 +10670,477 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
{
return regular;
}
-#endif /* CONFIG_NUMA_BALANCING */
+#endif /* !CONFIG_NUMA_BALANCING */
+
+
+struct sg_lb_stats;
+
+/*
+ * task_running_on_cpu - return 1 if @p is running on @cpu.
+ */
+
+static unsigned int task_running_on_cpu(int cpu, struct task_struct *p)
+{
+ /* Task has no contribution or is new */
+ if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
+ return 0;
+
+ if (task_on_rq_queued(p))
+ return 1;
+
+ return 0;
+}
/**
- * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
- * @env: The load balancing environment.
- * @sds: variable to hold the statistics for this sched_domain.
+ * idle_cpu_without - would a given CPU be idle without p ?
+ * @cpu: the processor on which idleness is tested.
+ * @p: task which should be ignored.
+ *
+ * Return: 1 if the CPU would be idle. 0 otherwise.
*/
-static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
+static int idle_cpu_without(int cpu, struct task_struct *p)
{
- struct sched_domain *child = env->sd->child;
- struct sched_group *sg = env->sd->groups;
- struct sg_lb_stats tmp_sgs;
- int load_idx, prefer_sibling = 0;
- bool overload = false;
+ struct rq *rq = cpu_rq(cpu);
- if (child && child->flags & SD_PREFER_SIBLING)
- prefer_sibling = 1;
+ if (rq->curr != rq->idle && rq->curr != p)
+ return 0;
- load_idx = get_sd_load_idx(env->sd, env->idle);
+ /*
+ * rq->nr_running can't be used but an updated version without the
+ * impact of p on cpu must be used instead. The updated nr_running
+ * be computed and tested before calling idle_cpu_without().
+ */
+
+ if (rq->ttwu_pending)
+ return 0;
+
+ return 1;
+}
+
+/*
+ * update_sg_wakeup_stats - Update sched_group's statistics for wakeup.
+ * @sd: The sched_domain level to look for idlest group.
+ * @group: sched_group whose statistics are to be updated.
+ * @sgs: variable to hold the statistics for this group.
+ * @p: The task for which we look for the idlest group/CPU.
+ */
+static inline void update_sg_wakeup_stats(struct sched_domain *sd,
+ struct sched_group *group,
+ struct sg_lb_stats *sgs,
+ struct task_struct *p)
+{
+ int i, nr_running;
+
+ memset(sgs, 0, sizeof(*sgs));
+
+ /* Assume that task can't fit any CPU of the group */
+ if (sd->flags & SD_ASYM_CPUCAPACITY)
+ sgs->group_misfit_task_load = 1;
+
+ for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
+ struct rq *rq = cpu_rq(i);
+ unsigned int local;
+
+ sgs->group_load += cpu_load_without(rq, p);
+ sgs->group_util += cpu_util_without(i, p);
+ sgs->group_runnable += cpu_runnable_without(rq, p);
+ local = task_running_on_cpu(i, p);
+ sgs->sum_h_nr_running += rq->cfs.h_nr_runnable - local;
+
+ nr_running = rq->nr_running - local;
+ sgs->sum_nr_running += nr_running;
+
+ /*
+ * No need to call idle_cpu_without() if nr_running is not 0
+ */
+ if (!nr_running && idle_cpu_without(i, p))
+ sgs->idle_cpus++;
+
+ /* Check if task fits in the CPU */
+ if (sd->flags & SD_ASYM_CPUCAPACITY &&
+ sgs->group_misfit_task_load &&
+ task_fits_cpu(p, i))
+ sgs->group_misfit_task_load = 0;
+
+ }
+
+ sgs->group_capacity = group->sgc->capacity;
+
+ sgs->group_weight = group->group_weight;
+
+ sgs->group_type = group_classify(sd->imbalance_pct, group, sgs);
+
+ /*
+ * Computing avg_load makes sense only when group is fully busy or
+ * overloaded
+ */
+ if (sgs->group_type == group_fully_busy ||
+ sgs->group_type == group_overloaded)
+ sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
+ sgs->group_capacity;
+}
+
+static bool update_pick_idlest(struct sched_group *idlest,
+ struct sg_lb_stats *idlest_sgs,
+ struct sched_group *group,
+ struct sg_lb_stats *sgs)
+{
+ if (sgs->group_type < idlest_sgs->group_type)
+ return true;
+
+ if (sgs->group_type > idlest_sgs->group_type)
+ return false;
+
+ /*
+ * The candidate and the current idlest group are the same type of
+ * group. Let check which one is the idlest according to the type.
+ */
+
+ switch (sgs->group_type) {
+ case group_overloaded:
+ case group_fully_busy:
+ /* Select the group with lowest avg_load. */
+ if (idlest_sgs->avg_load <= sgs->avg_load)
+ return false;
+ break;
+
+ case group_imbalanced:
+ case group_asym_packing:
+ case group_smt_balance:
+ /* Those types are not used in the slow wakeup path */
+ return false;
+
+ case group_misfit_task:
+ /* Select group with the highest max capacity */
+ if (idlest->sgc->max_capacity >= group->sgc->max_capacity)
+ return false;
+ break;
+
+ case group_has_spare:
+ /* Select group with most idle CPUs */
+ if (idlest_sgs->idle_cpus > sgs->idle_cpus)
+ return false;
+
+ /* Select group with lowest group_util */
+ if (idlest_sgs->idle_cpus == sgs->idle_cpus &&
+ idlest_sgs->group_util <= sgs->group_util)
+ return false;
+
+ break;
+ }
+
+ return true;
+}
+
+/*
+ * sched_balance_find_dst_group() finds and returns the least busy CPU group within the
+ * domain.
+ *
+ * Assumes p is allowed on at least one CPU in sd.
+ */
+static struct sched_group *
+sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
+{
+ struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups;
+ struct sg_lb_stats local_sgs, tmp_sgs;
+ struct sg_lb_stats *sgs;
+ unsigned long imbalance;
+ struct sg_lb_stats idlest_sgs = {
+ .avg_load = UINT_MAX,
+ .group_type = group_overloaded,
+ };
do {
- struct sg_lb_stats *sgs = &tmp_sgs;
int local_group;
- local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
+ /* Skip over this group if it has no CPUs allowed */
+ if (!cpumask_intersects(sched_group_span(group),
+ p->cpus_ptr))
+ continue;
+
+ /* Skip over this group if no cookie matched */
+ if (!sched_group_cookie_match(cpu_rq(this_cpu), p, group))
+ continue;
+
+ local_group = cpumask_test_cpu(this_cpu,
+ sched_group_span(group));
+
if (local_group) {
- sds->local = sg;
- sgs = &sds->local_stat;
+ sgs = &local_sgs;
+ local = group;
+ } else {
+ sgs = &tmp_sgs;
+ }
- if (env->idle != CPU_NEWLY_IDLE ||
- time_after_eq(jiffies, sg->sgc->next_update))
- update_group_capacity(env->sd, env->dst_cpu);
+ update_sg_wakeup_stats(sd, group, sgs, p);
+
+ if (!local_group && update_pick_idlest(idlest, &idlest_sgs, group, sgs)) {
+ idlest = group;
+ idlest_sgs = *sgs;
}
- update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
- &overload);
+ } while (group = group->next, group != sd->groups);
- if (local_group)
- goto next_group;
+
+ /* There is no idlest group to push tasks to */
+ if (!idlest)
+ return NULL;
+
+ /* The local group has been skipped because of CPU affinity */
+ if (!local)
+ return idlest;
+
+ /*
+ * If the local group is idler than the selected idlest group
+ * don't try and push the task.
+ */
+ if (local_sgs.group_type < idlest_sgs.group_type)
+ return NULL;
+
+ /*
+ * If the local group is busier than the selected idlest group
+ * try and push the task.
+ */
+ if (local_sgs.group_type > idlest_sgs.group_type)
+ return idlest;
+
+ switch (local_sgs.group_type) {
+ case group_overloaded:
+ case group_fully_busy:
+
+ /* Calculate allowed imbalance based on load */
+ imbalance = scale_load_down(NICE_0_LOAD) *
+ (sd->imbalance_pct-100) / 100;
/*
- * In case the child domain prefers tasks go to siblings
- * first, lower the sg capacity so that we'll try
- * and move all the excess tasks away. We lower the capacity
- * of a group only if the local group has the capacity to fit
- * these excess tasks. The extra check prevents the case where
- * you always pull from the heaviest group when it is already
- * under-utilized (possible with a large weight task outweighs
- * the tasks on the system).
+ * When comparing groups across NUMA domains, it's possible for
+ * the local domain to be very lightly loaded relative to the
+ * remote domains but "imbalance" skews the comparison making
+ * remote CPUs look much more favourable. When considering
+ * cross-domain, add imbalance to the load on the remote node
+ * and consider staying local.
*/
- if (prefer_sibling && sds->local &&
- group_has_capacity(env, &sds->local_stat) &&
- (sgs->sum_nr_running > 1)) {
- sgs->group_no_capacity = 1;
- sgs->group_type = group_overloaded;
- }
- if (update_sd_pick_busiest(env, sds, sg, sgs)) {
- sds->busiest = sg;
- sds->busiest_stat = *sgs;
- }
+ if ((sd->flags & SD_NUMA) &&
+ ((idlest_sgs.avg_load + imbalance) >= local_sgs.avg_load))
+ return NULL;
-next_group:
- /* Now, start updating sd_lb_stats */
- sds->total_load += sgs->group_load;
- sds->total_capacity += sgs->group_capacity;
+ /*
+ * If the local group is less loaded than the selected
+ * idlest group don't try and push any tasks.
+ */
+ if (idlest_sgs.avg_load >= (local_sgs.avg_load + imbalance))
+ return NULL;
- sg = sg->next;
- } while (sg != env->sd->groups);
+ if (100 * local_sgs.avg_load <= sd->imbalance_pct * idlest_sgs.avg_load)
+ return NULL;
+ break;
- if (env->sd->flags & SD_NUMA)
- env->fbq_type = fbq_classify_group(&sds->busiest_stat);
+ case group_imbalanced:
+ case group_asym_packing:
+ case group_smt_balance:
+ /* Those type are not used in the slow wakeup path */
+ return NULL;
- if (!env->sd->parent) {
- /* update overload indicator if we are at root domain */
- if (env->dst_rq->rd->overload != overload)
- env->dst_rq->rd->overload = overload;
+ case group_misfit_task:
+ /* Select group with the highest max capacity */
+ if (local->sgc->max_capacity >= idlest->sgc->max_capacity)
+ return NULL;
+ break;
+
+ case group_has_spare:
+#ifdef CONFIG_NUMA
+ if (sd->flags & SD_NUMA) {
+ int imb_numa_nr = sd->imb_numa_nr;
+#ifdef CONFIG_NUMA_BALANCING
+ int idlest_cpu;
+ /*
+ * If there is spare capacity at NUMA, try to select
+ * the preferred node
+ */
+ if (cpu_to_node(this_cpu) == p->numa_preferred_nid)
+ return NULL;
+
+ idlest_cpu = cpumask_first(sched_group_span(idlest));
+ if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid)
+ return idlest;
+#endif /* CONFIG_NUMA_BALANCING */
+ /*
+ * Otherwise, keep the task close to the wakeup source
+ * and improve locality if the number of running tasks
+ * would remain below threshold where an imbalance is
+ * allowed while accounting for the possibility the
+ * task is pinned to a subset of CPUs. If there is a
+ * real need of migration, periodic load balance will
+ * take care of it.
+ */
+ if (p->nr_cpus_allowed != NR_CPUS) {
+ struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
+
+ cpumask_and(cpus, sched_group_span(local), p->cpus_ptr);
+ imb_numa_nr = min(cpumask_weight(cpus), sd->imb_numa_nr);
+ }
+
+ imbalance = abs(local_sgs.idle_cpus - idlest_sgs.idle_cpus);
+ if (!adjust_numa_imbalance(imbalance,
+ local_sgs.sum_nr_running + 1,
+ imb_numa_nr)) {
+ return NULL;
+ }
+ }
+#endif /* CONFIG_NUMA */
+
+ /*
+ * Select group with highest number of idle CPUs. We could also
+ * compare the utilization which is more stable but it can end
+ * up that the group has less spare capacity but finally more
+ * idle CPUs which means more opportunity to run task.
+ */
+ if (local_sgs.idle_cpus >= idlest_sgs.idle_cpus)
+ return NULL;
+ break;
}
+ return idlest;
}
-/**
- * check_asym_packing - Check to see if the group is packed into the
- * sched doman.
- *
- * This is primarily intended to used at the sibling level. Some
- * cores like POWER7 prefer to use lower numbered SMT threads. In the
- * case of POWER7, it can move to lower SMT modes only when higher
- * threads are idle. When in lower SMT modes, the threads will
- * perform better since they share less core resources. Hence when we
- * have idle threads, we want them to be the higher ones.
- *
- * This packing function is run on idle threads. It checks to see if
- * the busiest CPU in this domain (core in the P7 case) has a higher
- * CPU number than the packing function is being run on. Here we are
- * assuming lower CPU number will be equivalent to lower a SMT thread
- * number.
- *
- * Return: 1 when packing is required and a task should be moved to
- * this CPU. The amount of the imbalance is returned in *imbalance.
- *
- * @env: The load balancing environment.
- * @sds: Statistics of the sched_domain which is to be packed
- */
-static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
+static void update_idle_cpu_scan(struct lb_env *env,
+ unsigned long sum_util)
{
- int busiest_cpu;
+ struct sched_domain_shared *sd_share;
+ int llc_weight, pct;
+ u64 x, y, tmp;
+ /*
+ * Update the number of CPUs to scan in LLC domain, which could
+ * be used as a hint in select_idle_cpu(). The update of sd_share
+ * could be expensive because it is within a shared cache line.
+ * So the write of this hint only occurs during periodic load
+ * balancing, rather than CPU_NEWLY_IDLE, because the latter
+ * can fire way more frequently than the former.
+ */
+ if (!sched_feat(SIS_UTIL) || env->idle == CPU_NEWLY_IDLE)
+ return;
- if (!(env->sd->flags & SD_ASYM_PACKING))
- return 0;
+ llc_weight = per_cpu(sd_llc_size, env->dst_cpu);
+ if (env->sd->span_weight != llc_weight)
+ return;
- if (!sds->busiest)
- return 0;
+ sd_share = rcu_dereference(per_cpu(sd_llc_shared, env->dst_cpu));
+ if (!sd_share)
+ return;
- busiest_cpu = group_first_cpu(sds->busiest);
- if (env->dst_cpu > busiest_cpu)
- return 0;
+ /*
+ * The number of CPUs to search drops as sum_util increases, when
+ * sum_util hits 85% or above, the scan stops.
+ * The reason to choose 85% as the threshold is because this is the
+ * imbalance_pct(117) when a LLC sched group is overloaded.
+ *
+ * let y = SCHED_CAPACITY_SCALE - p * x^2 [1]
+ * and y'= y / SCHED_CAPACITY_SCALE
+ *
+ * x is the ratio of sum_util compared to the CPU capacity:
+ * x = sum_util / (llc_weight * SCHED_CAPACITY_SCALE)
+ * y' is the ratio of CPUs to be scanned in the LLC domain,
+ * and the number of CPUs to scan is calculated by:
+ *
+ * nr_scan = llc_weight * y' [2]
+ *
+ * When x hits the threshold of overloaded, AKA, when
+ * x = 100 / pct, y drops to 0. According to [1],
+ * p should be SCHED_CAPACITY_SCALE * pct^2 / 10000
+ *
+ * Scale x by SCHED_CAPACITY_SCALE:
+ * x' = sum_util / llc_weight; [3]
+ *
+ * and finally [1] becomes:
+ * y = SCHED_CAPACITY_SCALE -
+ * x'^2 * pct^2 / (10000 * SCHED_CAPACITY_SCALE) [4]
+ *
+ */
+ /* equation [3] */
+ x = sum_util;
+ do_div(x, llc_weight);
- env->imbalance = DIV_ROUND_CLOSEST(
- sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,
- SCHED_CAPACITY_SCALE);
+ /* equation [4] */
+ pct = env->sd->imbalance_pct;
+ tmp = x * x * pct * pct;
+ do_div(tmp, 10000 * SCHED_CAPACITY_SCALE);
+ tmp = min_t(long, tmp, SCHED_CAPACITY_SCALE);
+ y = SCHED_CAPACITY_SCALE - tmp;
- return 1;
+ /* equation [2] */
+ y *= llc_weight;
+ do_div(y, SCHED_CAPACITY_SCALE);
+ if ((int)y != sd_share->nr_idle_scan)
+ WRITE_ONCE(sd_share->nr_idle_scan, (int)y);
}
/**
- * fix_small_imbalance - Calculate the minor imbalance that exists
- * amongst the groups of a sched_domain, during
- * load balancing.
+ * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
* @env: The load balancing environment.
- * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
+ * @sds: variable to hold the statistics for this sched_domain.
*/
-static inline
-void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
+
+static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
{
- unsigned long tmp, capa_now = 0, capa_move = 0;
- unsigned int imbn = 2;
- unsigned long scaled_busy_load_per_task;
- struct sg_lb_stats *local, *busiest;
+ struct sched_group *sg = env->sd->groups;
+ struct sg_lb_stats *local = &sds->local_stat;
+ struct sg_lb_stats tmp_sgs;
+ unsigned long sum_util = 0;
+ bool sg_overloaded = 0, sg_overutilized = 0;
- local = &sds->local_stat;
- busiest = &sds->busiest_stat;
+ do {
+ struct sg_lb_stats *sgs = &tmp_sgs;
+ int local_group;
+
+ local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(sg));
+ if (local_group) {
+ sds->local = sg;
+ sgs = local;
+
+ if (env->idle != CPU_NEWLY_IDLE ||
+ time_after_eq(jiffies, sg->sgc->next_update))
+ update_group_capacity(env->sd, env->dst_cpu);
+ }
- if (!local->sum_nr_running)
- local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
- else if (busiest->load_per_task > local->load_per_task)
- imbn = 1;
+ update_sg_lb_stats(env, sds, sg, sgs, &sg_overloaded, &sg_overutilized);
- scaled_busy_load_per_task =
- (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
- busiest->group_capacity;
+ if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) {
+ sds->busiest = sg;
+ sds->busiest_stat = *sgs;
+ }
- if (busiest->avg_load + scaled_busy_load_per_task >=
- local->avg_load + (scaled_busy_load_per_task * imbn)) {
- env->imbalance = busiest->load_per_task;
- return;
- }
+ /* Now, start updating sd_lb_stats */
+ sds->total_load += sgs->group_load;
+ sds->total_capacity += sgs->group_capacity;
+
+ sum_util += sgs->group_util;
+ sg = sg->next;
+ } while (sg != env->sd->groups);
/*
- * OK, we don't have enough imbalance to justify moving tasks,
- * however we may be able to increase total CPU capacity used by
- * moving them.
+ * Indicate that the child domain of the busiest group prefers tasks
+ * go to a child's sibling domains first. NB the flags of a sched group
+ * are those of the child domain.
*/
+ if (sds->busiest)
+ sds->prefer_sibling = !!(sds->busiest->flags & SD_PREFER_SIBLING);
- capa_now += busiest->group_capacity *
- min(busiest->load_per_task, busiest->avg_load);
- capa_now += local->group_capacity *
- min(local->load_per_task, local->avg_load);
- capa_now /= SCHED_CAPACITY_SCALE;
- /* Amount of load we'd subtract */
- if (busiest->avg_load > scaled_busy_load_per_task) {
- capa_move += busiest->group_capacity *
- min(busiest->load_per_task,
- busiest->avg_load - scaled_busy_load_per_task);
- }
+ if (env->sd->flags & SD_NUMA)
+ env->fbq_type = fbq_classify_group(&sds->busiest_stat);
- /* Amount of load we'd add */
- if (busiest->avg_load * busiest->group_capacity <
- busiest->load_per_task * SCHED_CAPACITY_SCALE) {
- tmp = (busiest->avg_load * busiest->group_capacity) /
- local->group_capacity;
- } else {
- tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
- local->group_capacity;
+ if (!env->sd->parent) {
+ /* update overload indicator if we are at root domain */
+ set_rd_overloaded(env->dst_rq->rd, sg_overloaded);
+
+ /* Update over-utilization (tipping point, U >= 0) indicator */
+ set_rd_overutilized(env->dst_rq->rd, sg_overutilized);
+ } else if (sg_overutilized) {
+ set_rd_overutilized(env->dst_rq->rd, sg_overutilized);
}
- capa_move += local->group_capacity *
- min(local->load_per_task, local->avg_load + tmp);
- capa_move /= SCHED_CAPACITY_SCALE;
- /* Move if we gain throughput */
- if (capa_move > capa_now)
- env->imbalance = busiest->load_per_task;
+ update_idle_cpu_scan(env, sum_util);
}
/**
@@ -6583,91 +11151,207 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
*/
static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
{
- unsigned long max_pull, load_above_capacity = ~0UL;
struct sg_lb_stats *local, *busiest;
local = &sds->local_stat;
busiest = &sds->busiest_stat;
+ if (busiest->group_type == group_misfit_task) {
+ if (env->sd->flags & SD_ASYM_CPUCAPACITY) {
+ /* Set imbalance to allow misfit tasks to be balanced. */
+ env->migration_type = migrate_misfit;
+ env->imbalance = 1;
+ } else {
+ /*
+ * Set load imbalance to allow moving task from cpu
+ * with reduced capacity.
+ */
+ env->migration_type = migrate_load;
+ env->imbalance = busiest->group_misfit_task_load;
+ }
+ return;
+ }
+
+ if (busiest->group_type == group_asym_packing) {
+ /*
+ * In case of asym capacity, we will try to migrate all load to
+ * the preferred CPU.
+ */
+ env->migration_type = migrate_task;
+ env->imbalance = busiest->sum_h_nr_running;
+ return;
+ }
+
+ if (busiest->group_type == group_smt_balance) {
+ /* Reduce number of tasks sharing CPU capacity */
+ env->migration_type = migrate_task;
+ env->imbalance = 1;
+ return;
+ }
+
if (busiest->group_type == group_imbalanced) {
/*
* In the group_imb case we cannot rely on group-wide averages
- * to ensure cpu-load equilibrium, look at wider averages. XXX
+ * to ensure CPU-load equilibrium, try to move any task to fix
+ * the imbalance. The next load balance will take care of
+ * balancing back the system.
*/
- busiest->load_per_task =
- min(busiest->load_per_task, sds->avg_load);
+ env->migration_type = migrate_task;
+ env->imbalance = 1;
+ return;
}
/*
- * In the presence of smp nice balancing, certain scenarios can have
- * max load less than avg load(as we skip the groups at or below
- * its cpu_capacity, while calculating max_load..)
+ * Try to use spare capacity of local group without overloading it or
+ * emptying busiest.
*/
- if (busiest->avg_load <= sds->avg_load ||
- local->avg_load >= sds->avg_load) {
- env->imbalance = 0;
- return fix_small_imbalance(env, sds);
+ if (local->group_type == group_has_spare) {
+ if ((busiest->group_type > group_fully_busy) &&
+ !(env->sd->flags & SD_SHARE_LLC)) {
+ /*
+ * If busiest is overloaded, try to fill spare
+ * capacity. This might end up creating spare capacity
+ * in busiest or busiest still being overloaded but
+ * there is no simple way to directly compute the
+ * amount of load to migrate in order to balance the
+ * system.
+ */
+ env->migration_type = migrate_util;
+ env->imbalance = max(local->group_capacity, local->group_util) -
+ local->group_util;
+
+ /*
+ * In some cases, the group's utilization is max or even
+ * higher than capacity because of migrations but the
+ * local CPU is (newly) idle. There is at least one
+ * waiting task in this overloaded busiest group. Let's
+ * try to pull it.
+ */
+ if (env->idle && env->imbalance == 0) {
+ env->migration_type = migrate_task;
+ env->imbalance = 1;
+ }
+
+ return;
+ }
+
+ if (busiest->group_weight == 1 || sds->prefer_sibling) {
+ /*
+ * When prefer sibling, evenly spread running tasks on
+ * groups.
+ */
+ env->migration_type = migrate_task;
+ env->imbalance = sibling_imbalance(env, sds, busiest, local);
+ } else {
+
+ /*
+ * If there is no overload, we just want to even the number of
+ * idle CPUs.
+ */
+ env->migration_type = migrate_task;
+ env->imbalance = max_t(long, 0,
+ (local->idle_cpus - busiest->idle_cpus));
+ }
+
+#ifdef CONFIG_NUMA
+ /* Consider allowing a small imbalance between NUMA groups */
+ if (env->sd->flags & SD_NUMA) {
+ env->imbalance = adjust_numa_imbalance(env->imbalance,
+ local->sum_nr_running + 1,
+ env->sd->imb_numa_nr);
+ }
+#endif
+
+ /* Number of tasks to move to restore balance */
+ env->imbalance >>= 1;
+
+ return;
}
/*
- * If there aren't any idle cpus, avoid creating some.
+ * Local is fully busy but has to take more load to relieve the
+ * busiest group
*/
- if (busiest->group_type == group_overloaded &&
- local->group_type == group_overloaded) {
- load_above_capacity = busiest->sum_nr_running *
- SCHED_LOAD_SCALE;
- if (load_above_capacity > busiest->group_capacity)
- load_above_capacity -= busiest->group_capacity;
- else
- load_above_capacity = ~0UL;
+ if (local->group_type < group_overloaded) {
+ /*
+ * Local will become overloaded so the avg_load metrics are
+ * finally needed.
+ */
+
+ local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) /
+ local->group_capacity;
+
+ /*
+ * If the local group is more loaded than the selected
+ * busiest group don't try to pull any tasks.
+ */
+ if (local->avg_load >= busiest->avg_load) {
+ env->imbalance = 0;
+ return;
+ }
+
+ sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
+ sds->total_capacity;
+
+ /*
+ * If the local group is more loaded than the average system
+ * load, don't try to pull any tasks.
+ */
+ if (local->avg_load >= sds->avg_load) {
+ env->imbalance = 0;
+ return;
+ }
+
}
/*
- * We're trying to get all the cpus to the average_load, so we don't
- * want to push ourselves above the average load, nor do we wish to
- * reduce the max loaded cpu below the average load. At the same time,
- * we also don't want to reduce the group load below the group capacity
- * (so that we can implement power-savings policies etc). Thus we look
- * for the minimum possible imbalance.
+ * Both group are or will become overloaded and we're trying to get all
+ * the CPUs to the average_load, so we don't want to push ourselves
+ * above the average load, nor do we wish to reduce the max loaded CPU
+ * below the average load. At the same time, we also don't want to
+ * reduce the group load below the group capacity. Thus we look for
+ * the minimum possible imbalance.
*/
- max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
-
- /* How much load to actually move to equalise the imbalance */
+ env->migration_type = migrate_load;
env->imbalance = min(
- max_pull * busiest->group_capacity,
+ (busiest->avg_load - sds->avg_load) * busiest->group_capacity,
(sds->avg_load - local->avg_load) * local->group_capacity
) / SCHED_CAPACITY_SCALE;
-
- /*
- * if *imbalance is less than the average load per runnable task
- * there is no guarantee that any tasks will be moved so we'll have
- * a think about bumping its value to force at least one task to be
- * moved
- */
- if (env->imbalance < busiest->load_per_task)
- return fix_small_imbalance(env, sds);
}
-/******* find_busiest_group() helpers end here *********************/
+/******* sched_balance_find_src_group() helpers end here *********************/
-/**
- * find_busiest_group - Returns the busiest group within the sched_domain
- * if there is an imbalance. If there isn't an imbalance, and
- * the user has opted for power-savings, it returns a group whose
- * CPUs can be put to idle by rebalancing those tasks elsewhere, if
- * such a group exists.
+/*
+ * Decision matrix according to the local and busiest group type:
*
- * Also calculates the amount of weighted load which should be moved
- * to restore balance.
+ * busiest \ local has_spare fully_busy misfit asym imbalanced overloaded
+ * has_spare nr_idle balanced N/A N/A balanced balanced
+ * fully_busy nr_idle nr_idle N/A N/A balanced balanced
+ * misfit_task force N/A N/A N/A N/A N/A
+ * asym_packing force force N/A N/A force force
+ * imbalanced force force N/A N/A force force
+ * overloaded force force N/A N/A force avg_load
*
+ * N/A : Not Applicable because already filtered while updating
+ * statistics.
+ * balanced : The system is balanced for these 2 groups.
+ * force : Calculate the imbalance as load migration is probably needed.
+ * avg_load : Only if imbalance is significant enough.
+ * nr_idle : dst_cpu is not busy and the number of idle CPUs is quite
+ * different in groups.
+ */
+
+/**
+ * sched_balance_find_src_group - Returns the busiest group within the sched_domain
+ * if there is an imbalance.
* @env: The load balancing environment.
*
+ * Also calculates the amount of runnable load which should be moved
+ * to restore balance.
+ *
* Return: - The busiest group if imbalance exists.
- * - If no imbalance and user has opted for power-savings balance,
- * return the least loaded group whose CPUs can be
- * put to idle by rebalancing its tasks onto our group.
*/
-static struct sched_group *find_busiest_group(struct lb_env *env)
+static struct sched_group *sched_balance_find_src_group(struct lb_env *env)
{
struct sg_lb_stats *local, *busiest;
struct sd_lb_stats sds;
@@ -6675,77 +11359,127 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
init_sd_lb_stats(&sds);
/*
- * Compute the various statistics relavent for load balancing at
+ * Compute the various statistics relevant for load balancing at
* this level.
*/
update_sd_lb_stats(env, &sds);
- local = &sds.local_stat;
+
+ /* There is no busy sibling group to pull tasks from */
+ if (!sds.busiest)
+ goto out_balanced;
+
busiest = &sds.busiest_stat;
- /* ASYM feature bypasses nice load balance check */
- if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
- check_asym_packing(env, &sds))
- return sds.busiest;
+ /* Misfit tasks should be dealt with regardless of the avg load */
+ if (busiest->group_type == group_misfit_task)
+ goto force_balance;
- /* There is no busy sibling group to pull tasks from */
- if (!sds.busiest || busiest->sum_nr_running == 0)
+ if (!is_rd_overutilized(env->dst_rq->rd) &&
+ rcu_dereference(env->dst_rq->rd->pd))
goto out_balanced;
- sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
- / sds.total_capacity;
+ /* ASYM feature bypasses nice load balance check */
+ if (busiest->group_type == group_asym_packing)
+ goto force_balance;
/*
* If the busiest group is imbalanced the below checks don't
* work because they assume all things are equal, which typically
- * isn't true due to cpus_allowed constraints and the like.
+ * isn't true due to cpus_ptr constraints and the like.
*/
if (busiest->group_type == group_imbalanced)
goto force_balance;
- /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
- if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) &&
- busiest->group_no_capacity)
- goto force_balance;
-
+ local = &sds.local_stat;
/*
* If the local group is busier than the selected busiest group
* don't try and pull any tasks.
*/
- if (local->avg_load >= busiest->avg_load)
+ if (local->group_type > busiest->group_type)
goto out_balanced;
/*
- * Don't pull any tasks if this group is already above the domain
- * average load.
+ * When groups are overloaded, use the avg_load to ensure fairness
+ * between tasks.
*/
- if (local->avg_load >= sds.avg_load)
- goto out_balanced;
+ if (local->group_type == group_overloaded) {
+ /*
+ * If the local group is more loaded than the selected
+ * busiest group don't try to pull any tasks.
+ */
+ if (local->avg_load >= busiest->avg_load)
+ goto out_balanced;
+
+ /* XXX broken for overlapping NUMA groups */
+ sds.avg_load = (sds.total_load * SCHED_CAPACITY_SCALE) /
+ sds.total_capacity;
- if (env->idle == CPU_IDLE) {
/*
- * This cpu is idle. If the busiest group is not overloaded
- * and there is no imbalance between this and busiest group
- * wrt idle cpus, it is balanced. The imbalance becomes
- * significant if the diff is greater than 1 otherwise we
- * might end up to just move the imbalance on another group
+ * Don't pull any tasks if this group is already above the
+ * domain average load.
*/
- if ((busiest->group_type != group_overloaded) &&
- (local->idle_cpus <= (busiest->idle_cpus + 1)))
+ if (local->avg_load >= sds.avg_load)
goto out_balanced;
- } else {
+
/*
- * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
- * imbalance_pct to be conservative.
+ * If the busiest group is more loaded, use imbalance_pct to be
+ * conservative.
*/
if (100 * busiest->avg_load <=
env->sd->imbalance_pct * local->avg_load)
goto out_balanced;
}
+ /*
+ * Try to move all excess tasks to a sibling domain of the busiest
+ * group's child domain.
+ */
+ if (sds.prefer_sibling && local->group_type == group_has_spare &&
+ sibling_imbalance(env, &sds, busiest, local) > 1)
+ goto force_balance;
+
+ if (busiest->group_type != group_overloaded) {
+ if (!env->idle) {
+ /*
+ * If the busiest group is not overloaded (and as a
+ * result the local one too) but this CPU is already
+ * busy, let another idle CPU try to pull task.
+ */
+ goto out_balanced;
+ }
+
+ if (busiest->group_type == group_smt_balance &&
+ smt_vs_nonsmt_groups(sds.local, sds.busiest)) {
+ /* Let non SMT CPU pull from SMT CPU sharing with sibling */
+ goto force_balance;
+ }
+
+ if (busiest->group_weight > 1 &&
+ local->idle_cpus <= (busiest->idle_cpus + 1)) {
+ /*
+ * If the busiest group is not overloaded
+ * and there is no imbalance between this and busiest
+ * group wrt idle CPUs, it is balanced. The imbalance
+ * becomes significant if the diff is greater than 1
+ * otherwise we might end up to just move the imbalance
+ * on another group. Of course this applies only if
+ * there is more than 1 CPU per group.
+ */
+ goto out_balanced;
+ }
+
+ if (busiest->sum_h_nr_running == 1) {
+ /*
+ * busiest doesn't have any tasks waiting to run
+ */
+ goto out_balanced;
+ }
+ }
+
force_balance:
/* Looks like there is an imbalance. Compute it */
calculate_imbalance(env, &sds);
- return sds.busiest;
+ return env->imbalance ? sds.busiest : NULL;
out_balanced:
env->imbalance = 0;
@@ -6753,17 +11487,19 @@ out_balanced:
}
/*
- * find_busiest_queue - find the busiest runqueue among the cpus in group.
+ * sched_balance_find_src_rq - find the busiest runqueue among the CPUs in the group.
*/
-static struct rq *find_busiest_queue(struct lb_env *env,
+static struct rq *sched_balance_find_src_rq(struct lb_env *env,
struct sched_group *group)
{
struct rq *busiest = NULL, *rq;
- unsigned long busiest_load = 0, busiest_capacity = 1;
+ unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1;
+ unsigned int busiest_nr = 0;
int i;
- for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
- unsigned long capacity, wl;
+ for_each_cpu_and(i, sched_group_span(group), env->cpus) {
+ unsigned long capacity, load, util;
+ unsigned int nr_running;
enum fbq_type rt;
rq = cpu_rq(i);
@@ -6791,34 +11527,101 @@ static struct rq *find_busiest_queue(struct lb_env *env,
if (rt > env->fbq_type)
continue;
- capacity = capacity_of(i);
+ nr_running = rq->cfs.h_nr_runnable;
+ if (!nr_running)
+ continue;
- wl = weighted_cpuload(i);
+ capacity = capacity_of(i);
/*
- * When comparing with imbalance, use weighted_cpuload()
- * which is not scaled with the cpu capacity.
+ * For ASYM_CPUCAPACITY domains, don't pick a CPU that could
+ * eventually lead to active_balancing high->low capacity.
+ * Higher per-CPU capacity is considered better than balancing
+ * average load.
*/
-
- if (rq->nr_running == 1 && wl > env->imbalance &&
- !check_cpu_capacity(rq, env->sd))
+ if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
+ !capacity_greater(capacity_of(env->dst_cpu), capacity) &&
+ nr_running == 1)
continue;
/*
- * For the load comparisons with the other cpu's, consider
- * the weighted_cpuload() scaled with the cpu capacity, so
- * that the load can be moved away from the cpu that is
- * potentially running at a lower capacity.
+ * Make sure we only pull tasks from a CPU of lower priority
+ * when balancing between SMT siblings.
*
- * Thus we're looking for max(wl_i / capacity_i), crosswise
- * multiplication to rid ourselves of the division works out
- * to: wl_i * capacity_j > wl_j * capacity_i; where j is
- * our previous maximum.
+ * If balancing between cores, let lower priority CPUs help
+ * SMT cores with more than one busy sibling.
*/
- if (wl * busiest_capacity > busiest_load * capacity) {
- busiest_load = wl;
- busiest_capacity = capacity;
- busiest = rq;
+ if (sched_asym(env->sd, i, env->dst_cpu) && nr_running == 1)
+ continue;
+
+ switch (env->migration_type) {
+ case migrate_load:
+ /*
+ * When comparing with load imbalance, use cpu_load()
+ * which is not scaled with the CPU capacity.
+ */
+ load = cpu_load(rq);
+
+ if (nr_running == 1 && load > env->imbalance &&
+ !check_cpu_capacity(rq, env->sd))
+ break;
+
+ /*
+ * For the load comparisons with the other CPUs,
+ * consider the cpu_load() scaled with the CPU
+ * capacity, so that the load can be moved away
+ * from the CPU that is potentially running at a
+ * lower capacity.
+ *
+ * Thus we're looking for max(load_i / capacity_i),
+ * crosswise multiplication to rid ourselves of the
+ * division works out to:
+ * load_i * capacity_j > load_j * capacity_i;
+ * where j is our previous maximum.
+ */
+ if (load * busiest_capacity > busiest_load * capacity) {
+ busiest_load = load;
+ busiest_capacity = capacity;
+ busiest = rq;
+ }
+ break;
+
+ case migrate_util:
+ util = cpu_util_cfs_boost(i);
+
+ /*
+ * Don't try to pull utilization from a CPU with one
+ * running task. Whatever its utilization, we will fail
+ * detach the task.
+ */
+ if (nr_running <= 1)
+ continue;
+
+ if (busiest_util < util) {
+ busiest_util = util;
+ busiest = rq;
+ }
+ break;
+
+ case migrate_task:
+ if (busiest_nr < nr_running) {
+ busiest_nr = nr_running;
+ busiest = rq;
+ }
+ break;
+
+ case migrate_misfit:
+ /*
+ * For ASYM_CPUCAPACITY domains with misfit tasks we
+ * simply seek the "biggest" misfit task.
+ */
+ if (rq->misfit_task_load > busiest_load) {
+ busiest_load = rq->misfit_task_load;
+ busiest = rq;
+ }
+
+ break;
+
}
}
@@ -6831,23 +11634,50 @@ static struct rq *find_busiest_queue(struct lb_env *env,
*/
#define MAX_PINNED_INTERVAL 512
-/* Working cpumask for load_balance and load_balance_newidle. */
-DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
+static inline bool
+asym_active_balance(struct lb_env *env)
+{
+ /*
+ * ASYM_PACKING needs to force migrate tasks from busy but lower
+ * priority CPUs in order to pack all tasks in the highest priority
+ * CPUs. When done between cores, do it only if the whole core if the
+ * whole core is idle.
+ *
+ * If @env::src_cpu is an SMT core with busy siblings, let
+ * the lower priority @env::dst_cpu help it. Do not follow
+ * CPU priority.
+ */
+ return env->idle && sched_use_asym_prio(env->sd, env->dst_cpu) &&
+ (sched_asym_prefer(env->dst_cpu, env->src_cpu) ||
+ !sched_use_asym_prio(env->sd, env->src_cpu));
+}
+
+static inline bool
+imbalanced_active_balance(struct lb_env *env)
+{
+ struct sched_domain *sd = env->sd;
+
+ /*
+ * The imbalanced case includes the case of pinned tasks preventing a fair
+ * distribution of the load on the system but also the even distribution of the
+ * threads on a system with spare capacity
+ */
+ if ((env->migration_type == migrate_task) &&
+ (sd->nr_balance_failed > sd->cache_nice_tries+2))
+ return 1;
+
+ return 0;
+}
static int need_active_balance(struct lb_env *env)
{
struct sched_domain *sd = env->sd;
- if (env->idle == CPU_NEWLY_IDLE) {
+ if (asym_active_balance(env))
+ return 1;
- /*
- * ASYM_PACKING needs to force migrate tasks from busy but
- * higher numbered CPUs in order to pack all tasks in the
- * lowest numbered CPUs.
- */
- if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
- return 1;
- }
+ if (imbalanced_active_balance(env))
+ return 1;
/*
* The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
@@ -6855,57 +11685,129 @@ static int need_active_balance(struct lb_env *env)
* because of other sched_class or IRQs if more capacity stays
* available on dst_cpu.
*/
- if ((env->idle != CPU_NOT_IDLE) &&
- (env->src_rq->cfs.h_nr_running == 1)) {
+ if (env->idle &&
+ (env->src_rq->cfs.h_nr_runnable == 1)) {
if ((check_cpu_capacity(env->src_rq, sd)) &&
(capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
return 1;
}
- return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
+ if (env->migration_type == migrate_misfit)
+ return 1;
+
+ return 0;
}
static int active_load_balance_cpu_stop(void *data);
static int should_we_balance(struct lb_env *env)
{
+ struct cpumask *swb_cpus = this_cpu_cpumask_var_ptr(should_we_balance_tmpmask);
struct sched_group *sg = env->sd->groups;
- struct cpumask *sg_cpus, *sg_mask;
- int cpu, balance_cpu = -1;
+ int cpu, idle_smt = -1;
+
+ /*
+ * Ensure the balancing environment is consistent; can happen
+ * when the softirq triggers 'during' hotplug.
+ */
+ if (!cpumask_test_cpu(env->dst_cpu, env->cpus))
+ return 0;
/*
- * In the newly idle case, we will allow all the cpu's
+ * In the newly idle case, we will allow all the CPUs
* to do the newly idle load balance.
+ *
+ * However, we bail out if we already have tasks or a wakeup pending,
+ * to optimize wakeup latency.
*/
- if (env->idle == CPU_NEWLY_IDLE)
+ if (env->idle == CPU_NEWLY_IDLE) {
+ if (env->dst_rq->nr_running > 0 || env->dst_rq->ttwu_pending)
+ return 0;
return 1;
+ }
- sg_cpus = sched_group_cpus(sg);
- sg_mask = sched_group_mask(sg);
- /* Try to find first idle cpu */
- for_each_cpu_and(cpu, sg_cpus, env->cpus) {
- if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu))
+ cpumask_copy(swb_cpus, group_balance_mask(sg));
+ /* Try to find first idle CPU */
+ for_each_cpu_and(cpu, swb_cpus, env->cpus) {
+ if (!idle_cpu(cpu))
continue;
- balance_cpu = cpu;
- break;
+ /*
+ * Don't balance to idle SMT in busy core right away when
+ * balancing cores, but remember the first idle SMT CPU for
+ * later consideration. Find CPU on an idle core first.
+ */
+ if (!(env->sd->flags & SD_SHARE_CPUCAPACITY) && !is_core_idle(cpu)) {
+ if (idle_smt == -1)
+ idle_smt = cpu;
+ /*
+ * If the core is not idle, and first SMT sibling which is
+ * idle has been found, then its not needed to check other
+ * SMT siblings for idleness:
+ */
+#ifdef CONFIG_SCHED_SMT
+ cpumask_andnot(swb_cpus, swb_cpus, cpu_smt_mask(cpu));
+#endif
+ continue;
+ }
+
+ /*
+ * Are we the first idle core in a non-SMT domain or higher,
+ * or the first idle CPU in a SMT domain?
+ */
+ return cpu == env->dst_cpu;
}
- if (balance_cpu == -1)
- balance_cpu = group_balance_cpu(sg);
+ /* Are we the first idle CPU with busy siblings? */
+ if (idle_smt != -1)
+ return idle_smt == env->dst_cpu;
- /*
- * First idle cpu or the first cpu(busiest) in this sched group
- * is eligible for doing load balancing at this and above domains.
- */
- return balance_cpu == env->dst_cpu;
+ /* Are we the first CPU of this group ? */
+ return group_balance_cpu(sg) == env->dst_cpu;
+}
+
+static void update_lb_imbalance_stat(struct lb_env *env, struct sched_domain *sd,
+ enum cpu_idle_type idle)
+{
+ if (!schedstat_enabled())
+ return;
+
+ switch (env->migration_type) {
+ case migrate_load:
+ __schedstat_add(sd->lb_imbalance_load[idle], env->imbalance);
+ break;
+ case migrate_util:
+ __schedstat_add(sd->lb_imbalance_util[idle], env->imbalance);
+ break;
+ case migrate_task:
+ __schedstat_add(sd->lb_imbalance_task[idle], env->imbalance);
+ break;
+ case migrate_misfit:
+ __schedstat_add(sd->lb_imbalance_misfit[idle], env->imbalance);
+ break;
+ }
}
/*
+ * This flag serializes load-balancing passes over large domains
+ * (above the NODE topology level) - only one load-balancing instance
+ * may run at a time, to reduce overhead on very large systems with
+ * lots of CPUs and large NUMA distances.
+ *
+ * - Note that load-balancing passes triggered while another one
+ * is executing are skipped and not re-tried.
+ *
+ * - Also note that this does not serialize rebalance_domains()
+ * execution, as non-SD_SERIALIZE domains will still be
+ * load-balanced in parallel.
+ */
+static atomic_t sched_balance_running = ATOMIC_INIT(0);
+
+/*
* Check this_cpu to ensure it is balanced within domain. Attempt to move
* tasks if there is an imbalance.
*/
-static int load_balance(int this_cpu, struct rq *this_rq,
+static int sched_balance_rq(int this_cpu, struct rq *this_rq,
struct sched_domain *sd, enum cpu_idle_type idle,
int *continue_balancing)
{
@@ -6913,31 +11815,24 @@ static int load_balance(int this_cpu, struct rq *this_rq,
struct sched_domain *sd_parent = sd->parent;
struct sched_group *group;
struct rq *busiest;
- unsigned long flags;
+ struct rq_flags rf;
struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
-
struct lb_env env = {
.sd = sd,
.dst_cpu = this_cpu,
.dst_rq = this_rq,
- .dst_grpmask = sched_group_cpus(sd->groups),
+ .dst_grpmask = group_balance_mask(sd->groups),
.idle = idle,
- .loop_break = sched_nr_migrate_break,
+ .loop_break = SCHED_NR_MIGRATE_BREAK,
.cpus = cpus,
.fbq_type = all,
.tasks = LIST_HEAD_INIT(env.tasks),
};
+ bool need_unlock = false;
- /*
- * For NEWLY_IDLE load_balancing, we don't need to consider
- * other cpus in our group
- */
- if (idle == CPU_NEWLY_IDLE)
- env.dst_grpmask = NULL;
-
- cpumask_copy(cpus, cpu_active_mask);
+ cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
- schedstat_inc(sd, lb_count[idle]);
+ schedstat_inc(sd->lb_count[idle]);
redo:
if (!should_we_balance(&env)) {
@@ -6945,38 +11840,48 @@ redo:
goto out_balanced;
}
- group = find_busiest_group(&env);
+ if (!need_unlock && (sd->flags & SD_SERIALIZE)) {
+ int zero = 0;
+ if (!atomic_try_cmpxchg_acquire(&sched_balance_running, &zero, 1))
+ goto out_balanced;
+
+ need_unlock = true;
+ }
+
+ group = sched_balance_find_src_group(&env);
if (!group) {
- schedstat_inc(sd, lb_nobusyg[idle]);
+ schedstat_inc(sd->lb_nobusyg[idle]);
goto out_balanced;
}
- busiest = find_busiest_queue(&env, group);
+ busiest = sched_balance_find_src_rq(&env, group);
if (!busiest) {
- schedstat_inc(sd, lb_nobusyq[idle]);
+ schedstat_inc(sd->lb_nobusyq[idle]);
goto out_balanced;
}
- BUG_ON(busiest == env.dst_rq);
+ WARN_ON_ONCE(busiest == env.dst_rq);
- schedstat_add(sd, lb_imbalance[idle], env.imbalance);
+ update_lb_imbalance_stat(&env, sd, idle);
env.src_cpu = busiest->cpu;
env.src_rq = busiest;
ld_moved = 0;
+ /* Clear this flag as soon as we find a pullable task */
+ env.flags |= LBF_ALL_PINNED;
if (busiest->nr_running > 1) {
/*
- * Attempt to move tasks. If find_busiest_group has found
+ * Attempt to move tasks. If sched_balance_find_src_group has found
* an imbalance but busiest->nr_running <= 1, the group is
* still unbalanced. ld_moved simply stays zero, so it is
* correctly treated as an imbalance.
*/
- env.flags |= LBF_ALL_PINNED;
env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
more_balance:
- raw_spin_lock_irqsave(&busiest->lock, flags);
+ rq_lock_irqsave(busiest, &rf);
+ update_rq_clock(busiest);
/*
* cur_ld_moved - load moved in current iteration
@@ -6992,14 +11897,14 @@ more_balance:
* See task_rq_lock() family for the details.
*/
- raw_spin_unlock(&busiest->lock);
+ rq_unlock(busiest, &rf);
if (cur_ld_moved) {
attach_tasks(&env);
ld_moved += cur_ld_moved;
}
- local_irq_restore(flags);
+ local_irq_restore(rf.flags);
if (env.flags & LBF_NEED_BREAK) {
env.flags &= ~LBF_NEED_BREAK;
@@ -7010,7 +11915,7 @@ more_balance:
* Revisit (affine) tasks on src_cpu that couldn't be moved to
* us and move them to an alternate dst_cpu in our sched_group
* where they can run. The upper limit on how many times we
- * iterate on same src_cpu is dependent on number of cpus in our
+ * iterate on same src_cpu is dependent on number of CPUs in our
* sched_group.
*
* This changes load balance semantics a bit on who can move
@@ -7020,21 +11925,21 @@ more_balance:
* load to given_cpu. In rare situations, this may cause
* conflicts (balance_cpu and given_cpu/ilb_cpu deciding
* _independently_ and at _same_ time to move some load to
- * given_cpu) causing exceess load to be moved to given_cpu.
+ * given_cpu) causing excess load to be moved to given_cpu.
* This however should not happen so much in practice and
* moreover subsequent load balance cycles should correct the
* excess load moved.
*/
if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
- /* Prevent to re-select dst_cpu via env's cpus */
- cpumask_clear_cpu(env.dst_cpu, env.cpus);
+ /* Prevent to re-select dst_cpu via env's CPUs */
+ __cpumask_clear_cpu(env.dst_cpu, env.cpus);
env.dst_rq = cpu_rq(env.new_dst_cpu);
env.dst_cpu = env.new_dst_cpu;
env.flags &= ~LBF_DST_PINNED;
env.loop = 0;
- env.loop_break = sched_nr_migrate_break;
+ env.loop_break = SCHED_NR_MIGRATE_BREAK;
/*
* Go back to "more_balance" rather than "redo" since we
@@ -7055,10 +11960,18 @@ more_balance:
/* All tasks on this runqueue were pinned by CPU affinity */
if (unlikely(env.flags & LBF_ALL_PINNED)) {
- cpumask_clear_cpu(cpu_of(busiest), cpus);
- if (!cpumask_empty(cpus)) {
+ __cpumask_clear_cpu(cpu_of(busiest), cpus);
+ /*
+ * Attempting to continue load balancing at the current
+ * sched_domain level only makes sense if there are
+ * active CPUs remaining as possible busiest CPUs to
+ * pull load from which are not contained within the
+ * destination group that is receiving any migrated
+ * load.
+ */
+ if (!cpumask_subset(cpus, env.dst_grpmask)) {
env.loop = 0;
- env.loop_break = sched_nr_migrate_break;
+ env.loop_break = SCHED_NR_MIGRATE_BREAK;
goto redo;
}
goto out_all_pinned;
@@ -7066,31 +11979,38 @@ more_balance:
}
if (!ld_moved) {
- schedstat_inc(sd, lb_failed[idle]);
+ schedstat_inc(sd->lb_failed[idle]);
/*
* Increment the failure counter only on periodic balance.
* We do not want newidle balance, which can be very
* frequent, pollute the failure counter causing
* excessive cache_hot migrations and active balances.
+ *
+ * Similarly for migration_misfit which is not related to
+ * load/util migration, don't pollute nr_balance_failed.
*/
- if (idle != CPU_NEWLY_IDLE)
+ if (idle != CPU_NEWLY_IDLE &&
+ env.migration_type != migrate_misfit)
sd->nr_balance_failed++;
if (need_active_balance(&env)) {
- raw_spin_lock_irqsave(&busiest->lock, flags);
+ unsigned long flags;
- /* don't kick the active_load_balance_cpu_stop,
- * if the curr task on busiest cpu can't be
- * moved to this_cpu
+ raw_spin_rq_lock_irqsave(busiest, flags);
+
+ /*
+ * Don't kick the active_load_balance_cpu_stop,
+ * if the curr task on busiest CPU can't be
+ * moved to this_cpu:
*/
- if (!cpumask_test_cpu(this_cpu,
- tsk_cpus_allowed(busiest->curr))) {
- raw_spin_unlock_irqrestore(&busiest->lock,
- flags);
- env.flags |= LBF_ALL_PINNED;
+ if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) {
+ raw_spin_rq_unlock_irqrestore(busiest, flags);
goto out_one_pinned;
}
+ /* Record that we found at least one task that could run on this_cpu */
+ env.flags &= ~LBF_ALL_PINNED;
+
/*
* ->active_balance synchronizes accesses to
* ->active_balance_work. Once set, it's cleared
@@ -7101,35 +12021,23 @@ more_balance:
busiest->push_cpu = this_cpu;
active_balance = 1;
}
- raw_spin_unlock_irqrestore(&busiest->lock, flags);
+ preempt_disable();
+ raw_spin_rq_unlock_irqrestore(busiest, flags);
if (active_balance) {
stop_one_cpu_nowait(cpu_of(busiest),
active_load_balance_cpu_stop, busiest,
&busiest->active_balance_work);
}
-
- /*
- * We've kicked active balancing, reset the failure
- * counter.
- */
- sd->nr_balance_failed = sd->cache_nice_tries+1;
+ preempt_enable();
}
- } else
+ } else {
sd->nr_balance_failed = 0;
+ }
- if (likely(!active_balance)) {
+ if (likely(!active_balance) || need_active_balance(&env)) {
/* We were unbalanced, so reset the balancing interval */
sd->balance_interval = sd->min_interval;
- } else {
- /*
- * If we've begun active balancing, start to back off. This
- * case may not be covered by the all_pinned logic if there
- * is only 1 task on the busy runqueue (because we don't call
- * detach_tasks).
- */
- if (sd->balance_interval < sd->max_interval)
- sd->balance_interval *= 2;
}
goto out;
@@ -7137,9 +12045,10 @@ more_balance:
out_balanced:
/*
* We reach balance although we may have faced some affinity
- * constraints. Clear the imbalance flag if it was set.
+ * constraints. Clear the imbalance flag only if other tasks got
+ * a chance to move and fix the imbalance.
*/
- if (sd_parent) {
+ if (sd_parent && !(env.flags & LBF_ALL_PINNED)) {
int *group_imbalance = &sd_parent->groups->sgc->imbalance;
if (*group_imbalance)
@@ -7152,19 +12061,36 @@ out_all_pinned:
* we can't migrate them. Let the imbalance flag set so parent level
* can try to migrate them.
*/
- schedstat_inc(sd, lb_balanced[idle]);
+ schedstat_inc(sd->lb_balanced[idle]);
sd->nr_balance_failed = 0;
out_one_pinned:
+ ld_moved = 0;
+
+ /*
+ * sched_balance_newidle() disregards balance intervals, so we could
+ * repeatedly reach this code, which would lead to balance_interval
+ * skyrocketing in a short amount of time. Skip the balance_interval
+ * increase logic to avoid that.
+ *
+ * Similarly misfit migration which is not necessarily an indication of
+ * the system being busy and requires lb to backoff to let it settle
+ * down.
+ */
+ if (env.idle == CPU_NEWLY_IDLE ||
+ env.migration_type == migrate_misfit)
+ goto out;
+
/* tune up the balancing interval */
- if (((env.flags & LBF_ALL_PINNED) &&
- sd->balance_interval < MAX_PINNED_INTERVAL) ||
- (sd->balance_interval < sd->max_interval))
+ if ((env.flags & LBF_ALL_PINNED &&
+ sd->balance_interval < MAX_PINNED_INTERVAL) ||
+ sd->balance_interval < sd->max_interval)
sd->balance_interval *= 2;
-
- ld_moved = 0;
out:
+ if (need_unlock)
+ atomic_set_release(&sched_balance_running, 0);
+
return ld_moved;
}
@@ -7178,17 +12104,27 @@ get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
/* scale ms to jiffies */
interval = msecs_to_jiffies(interval);
+
+ /*
+ * Reduce likelihood of busy balancing at higher domains racing with
+ * balancing at lower domains by preventing their balancing periods
+ * from being multiples of each other.
+ */
+ if (cpu_busy)
+ interval -= 1;
+
interval = clamp(interval, 1UL, max_load_balance_interval);
return interval;
}
static inline void
-update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance)
+update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
{
unsigned long interval, next;
- interval = get_sd_balance_interval(sd, cpu_busy);
+ /* used by idle balance, so cpu_busy = 0 */
+ interval = get_sd_balance_interval(sd, 0);
next = sd->last_balance + interval;
if (time_after(*next_balance, next))
@@ -7196,112 +12132,7 @@ update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_b
}
/*
- * idle_balance is called by schedule() if this_cpu is about to become
- * idle. Attempts to pull tasks from other CPUs.
- */
-static int idle_balance(struct rq *this_rq)
-{
- unsigned long next_balance = jiffies + HZ;
- int this_cpu = this_rq->cpu;
- struct sched_domain *sd;
- int pulled_task = 0;
- u64 curr_cost = 0;
-
- idle_enter_fair(this_rq);
-
- /*
- * We must set idle_stamp _before_ calling idle_balance(), such that we
- * measure the duration of idle_balance() as idle time.
- */
- this_rq->idle_stamp = rq_clock(this_rq);
-
- if (this_rq->avg_idle < sysctl_sched_migration_cost ||
- !this_rq->rd->overload) {
- rcu_read_lock();
- sd = rcu_dereference_check_sched_domain(this_rq->sd);
- if (sd)
- update_next_balance(sd, 0, &next_balance);
- rcu_read_unlock();
-
- goto out;
- }
-
- /*
- * Drop the rq->lock, but keep IRQ/preempt disabled.
- */
- raw_spin_unlock(&this_rq->lock);
-
- update_blocked_averages(this_cpu);
- rcu_read_lock();
- for_each_domain(this_cpu, sd) {
- int continue_balancing = 1;
- u64 t0, domain_cost;
-
- if (!(sd->flags & SD_LOAD_BALANCE))
- continue;
-
- if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
- update_next_balance(sd, 0, &next_balance);
- break;
- }
-
- if (sd->flags & SD_BALANCE_NEWIDLE) {
- t0 = sched_clock_cpu(this_cpu);
-
- pulled_task = load_balance(this_cpu, this_rq,
- sd, CPU_NEWLY_IDLE,
- &continue_balancing);
-
- domain_cost = sched_clock_cpu(this_cpu) - t0;
- if (domain_cost > sd->max_newidle_lb_cost)
- sd->max_newidle_lb_cost = domain_cost;
-
- curr_cost += domain_cost;
- }
-
- update_next_balance(sd, 0, &next_balance);
-
- /*
- * Stop searching for tasks to pull if there are
- * now runnable tasks on this rq.
- */
- if (pulled_task || this_rq->nr_running > 0)
- break;
- }
- rcu_read_unlock();
-
- raw_spin_lock(&this_rq->lock);
-
- if (curr_cost > this_rq->max_idle_balance_cost)
- this_rq->max_idle_balance_cost = curr_cost;
-
- /*
- * While browsing the domains, we released the rq lock, a task could
- * have been enqueued in the meantime. Since we're not going idle,
- * pretend we pulled a task.
- */
- if (this_rq->cfs.h_nr_running && !pulled_task)
- pulled_task = 1;
-
-out:
- /* Move the next balance forward */
- if (time_after(this_rq->next_balance, next_balance))
- this_rq->next_balance = next_balance;
-
- /* Is there a task of a high priority class? */
- if (this_rq->nr_running != this_rq->cfs.h_nr_running)
- pulled_task = -1;
-
- if (pulled_task) {
- idle_exit_fair(this_rq);
- this_rq->idle_stamp = 0;
- }
-
- return pulled_task;
-}
-
-/*
- * active_load_balance_cpu_stop is run by cpu stopper. It pushes
+ * active_load_balance_cpu_stop is run by the CPU stopper. It pushes
* running tasks off the busiest CPU onto idle CPUs. It requires at
* least 1 task to be running on each physical CPU where possible, and
* avoids physical / logical imbalances.
@@ -7314,10 +12145,18 @@ static int active_load_balance_cpu_stop(void *data)
struct rq *target_rq = cpu_rq(target_cpu);
struct sched_domain *sd;
struct task_struct *p = NULL;
+ struct rq_flags rf;
- raw_spin_lock_irq(&busiest_rq->lock);
+ rq_lock_irq(busiest_rq, &rf);
+ /*
+ * Between queueing the stop-work and running it is a hole in which
+ * CPUs can become inactive. We should not move tasks from or to
+ * inactive CPUs.
+ */
+ if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu))
+ goto out_unlock;
- /* make sure the requested cpu hasn't gone down in the meantime */
+ /* Make sure the requested CPU hasn't gone down in the meantime: */
if (unlikely(busiest_cpu != smp_processor_id() ||
!busiest_rq->active_balance))
goto out_unlock;
@@ -7329,16 +12168,15 @@ static int active_load_balance_cpu_stop(void *data)
/*
* This condition is "impossible", if it occurs
* we need to fix it. Originally reported by
- * Bjorn Helgaas on a 128-cpu setup.
+ * Bjorn Helgaas on a 128-CPU setup.
*/
- BUG_ON(busiest_rq == target_rq);
+ WARN_ON_ONCE(busiest_rq == target_rq);
/* Search for an sd spanning us and the target CPU. */
rcu_read_lock();
for_each_domain(target_cpu, sd) {
- if ((sd->flags & SD_LOAD_BALANCE) &&
- cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
- break;
+ if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
+ break;
}
if (likely(sd)) {
@@ -7349,20 +12187,25 @@ static int active_load_balance_cpu_stop(void *data)
.src_cpu = busiest_rq->cpu,
.src_rq = busiest_rq,
.idle = CPU_IDLE,
+ .flags = LBF_ACTIVE_LB,
};
- schedstat_inc(sd, alb_count);
+ schedstat_inc(sd->alb_count);
+ update_rq_clock(busiest_rq);
p = detach_one_task(&env);
- if (p)
- schedstat_inc(sd, alb_pushed);
- else
- schedstat_inc(sd, alb_failed);
+ if (p) {
+ schedstat_inc(sd->alb_pushed);
+ /* Active balancing done, reset the failure counter. */
+ sd->nr_balance_failed = 0;
+ } else {
+ schedstat_inc(sd->alb_failed);
+ }
}
rcu_read_unlock();
out_unlock:
busiest_rq->active_balance = 0;
- raw_spin_unlock(&busiest_rq->lock);
+ rq_unlock(busiest_rq, &rf);
if (p)
attach_one_task(target_rq, p);
@@ -7372,6 +12215,136 @@ out_unlock:
return 0;
}
+/*
+ * Scale the max sched_balance_rq interval with the number of CPUs in the system.
+ * This trades load-balance latency on larger machines for less cross talk.
+ */
+void update_max_interval(void)
+{
+ max_load_balance_interval = HZ*num_online_cpus()/10;
+}
+
+static inline void update_newidle_stats(struct sched_domain *sd, unsigned int success)
+{
+ sd->newidle_call++;
+ sd->newidle_success += success;
+
+ if (sd->newidle_call >= 1024) {
+ sd->newidle_ratio = sd->newidle_success;
+ sd->newidle_call /= 2;
+ sd->newidle_success /= 2;
+ }
+}
+
+static inline bool
+update_newidle_cost(struct sched_domain *sd, u64 cost, unsigned int success)
+{
+ unsigned long next_decay = sd->last_decay_max_lb_cost + HZ;
+ unsigned long now = jiffies;
+
+ if (cost)
+ update_newidle_stats(sd, success);
+
+ if (cost > sd->max_newidle_lb_cost) {
+ /*
+ * Track max cost of a domain to make sure to not delay the
+ * next wakeup on the CPU.
+ */
+ sd->max_newidle_lb_cost = cost;
+ sd->last_decay_max_lb_cost = now;
+
+ } else if (time_after(now, next_decay)) {
+ /*
+ * Decay the newidle max times by ~1% per second to ensure that
+ * it is not outdated and the current max cost is actually
+ * shorter.
+ */
+ sd->max_newidle_lb_cost = (sd->max_newidle_lb_cost * 253) / 256;
+ sd->last_decay_max_lb_cost = now;
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * It checks each scheduling domain to see if it is due to be balanced,
+ * and initiates a balancing operation if so.
+ *
+ * Balancing parameters are set up in init_sched_domains.
+ */
+static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
+{
+ int continue_balancing = 1;
+ int cpu = rq->cpu;
+ int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
+ unsigned long interval;
+ struct sched_domain *sd;
+ /* Earliest time when we have to do rebalance again */
+ unsigned long next_balance = jiffies + 60*HZ;
+ int update_next_balance = 0;
+ int need_decay = 0;
+ u64 max_cost = 0;
+
+ rcu_read_lock();
+ for_each_domain(cpu, sd) {
+ /*
+ * Decay the newidle max times here because this is a regular
+ * visit to all the domains.
+ */
+ need_decay = update_newidle_cost(sd, 0, 0);
+ max_cost += sd->max_newidle_lb_cost;
+
+ /*
+ * Stop the load balance at this level. There is another
+ * CPU in our sched group which is doing load balancing more
+ * actively.
+ */
+ if (!continue_balancing) {
+ if (need_decay)
+ continue;
+ break;
+ }
+
+ interval = get_sd_balance_interval(sd, busy);
+ if (time_after_eq(jiffies, sd->last_balance + interval)) {
+ if (sched_balance_rq(cpu, rq, sd, idle, &continue_balancing)) {
+ /*
+ * The LBF_DST_PINNED logic could have changed
+ * env->dst_cpu, so we can't know our idle
+ * state even if we migrated tasks. Update it.
+ */
+ idle = idle_cpu(cpu);
+ busy = !idle && !sched_idle_cpu(cpu);
+ }
+ sd->last_balance = jiffies;
+ interval = get_sd_balance_interval(sd, busy);
+ }
+ if (time_after(next_balance, sd->last_balance + interval)) {
+ next_balance = sd->last_balance + interval;
+ update_next_balance = 1;
+ }
+ }
+ if (need_decay) {
+ /*
+ * Ensure the rq-wide value also decays but keep it at a
+ * reasonable floor to avoid funnies with rq->avg_idle.
+ */
+ rq->max_idle_balance_cost =
+ max((u64)sysctl_sched_migration_cost, max_cost);
+ }
+ rcu_read_unlock();
+
+ /*
+ * next_balance will be updated only when there is a need.
+ * When the cpu is attached to null domain for ex, it will not be
+ * updated.
+ */
+ if (likely(update_next_balance))
+ rq->next_balance = next_balance;
+
+}
+
static inline int on_null_domain(struct rq *rq)
{
return unlikely(!rcu_dereference_sched(rq->sd));
@@ -7379,413 +12352,700 @@ static inline int on_null_domain(struct rq *rq)
#ifdef CONFIG_NO_HZ_COMMON
/*
- * idle load balancing details
- * - When one of the busy CPUs notice that there may be an idle rebalancing
+ * NOHZ idle load balancing (ILB) details:
+ *
+ * - When one of the busy CPUs notices that there may be an idle rebalancing
* needed, they will kick the idle load balancer, which then does idle
* load balancing for all the idle CPUs.
*/
-static struct {
- cpumask_var_t idle_cpus_mask;
- atomic_t nr_cpus;
- unsigned long next_balance; /* in jiffy units */
-} nohz ____cacheline_aligned;
-
static inline int find_new_ilb(void)
{
- int ilb = cpumask_first(nohz.idle_cpus_mask);
+ const struct cpumask *hk_mask;
+ int ilb_cpu;
+
+ hk_mask = housekeeping_cpumask(HK_TYPE_KERNEL_NOISE);
- if (ilb < nr_cpu_ids && idle_cpu(ilb))
- return ilb;
+ for_each_cpu_and(ilb_cpu, nohz.idle_cpus_mask, hk_mask) {
- return nr_cpu_ids;
+ if (ilb_cpu == smp_processor_id())
+ continue;
+
+ if (idle_cpu(ilb_cpu))
+ return ilb_cpu;
+ }
+
+ return -1;
}
/*
- * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
- * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
- * CPU (if there is one).
+ * Kick a CPU to do the NOHZ balancing, if it is time for it, via a cross-CPU
+ * SMP function call (IPI).
+ *
+ * We pick the first idle CPU in the HK_TYPE_KERNEL_NOISE housekeeping set
+ * (if there is one).
*/
-static void nohz_balancer_kick(void)
+static void kick_ilb(unsigned int flags)
{
int ilb_cpu;
- nohz.next_balance++;
+ /*
+ * Increase nohz.next_balance only when if full ilb is triggered but
+ * not if we only update stats.
+ */
+ if (flags & NOHZ_BALANCE_KICK)
+ nohz.next_balance = jiffies+1;
ilb_cpu = find_new_ilb();
+ if (ilb_cpu < 0)
+ return;
- if (ilb_cpu >= nr_cpu_ids)
+ /*
+ * Don't bother if no new NOHZ balance work items for ilb_cpu,
+ * i.e. all bits in flags are already set in ilb_cpu.
+ */
+ if ((atomic_read(nohz_flags(ilb_cpu)) & flags) == flags)
return;
- if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
+ /*
+ * Access to rq::nohz_csd is serialized by NOHZ_KICK_MASK; he who sets
+ * the first flag owns it; cleared by nohz_csd_func().
+ */
+ flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu));
+ if (flags & NOHZ_KICK_MASK)
return;
+
/*
- * Use smp_send_reschedule() instead of resched_cpu().
- * This way we generate a sched IPI on the target cpu which
- * is idle. And the softirq performing nohz idle load balance
+ * This way we generate an IPI on the target CPU which
+ * is idle, and the softirq performing NOHZ idle load balancing
* will be run before returning from the IPI.
*/
- smp_send_reschedule(ilb_cpu);
- return;
+ smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd);
}
-static inline void nohz_balance_exit_idle(int cpu)
+/*
+ * Current decision point for kicking the idle load balancer in the presence
+ * of idle CPUs in the system.
+ */
+static void nohz_balancer_kick(struct rq *rq)
{
- if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
+ unsigned long now = jiffies;
+ struct sched_domain_shared *sds;
+ struct sched_domain *sd;
+ int nr_busy, i, cpu = rq->cpu;
+ unsigned int flags = 0;
+
+ if (unlikely(rq->idle_balance))
+ return;
+
+ /*
+ * We may be recently in ticked or tickless idle mode. At the first
+ * busy tick after returning from idle, we will update the busy stats.
+ */
+ nohz_balance_exit_idle(rq);
+
+ /*
+ * None are in tickless mode and hence no need for NOHZ idle load
+ * balancing:
+ */
+ if (likely(!atomic_read(&nohz.nr_cpus)))
+ return;
+
+ if (READ_ONCE(nohz.has_blocked) &&
+ time_after(now, READ_ONCE(nohz.next_blocked)))
+ flags = NOHZ_STATS_KICK;
+
+ if (time_before(now, nohz.next_balance))
+ goto out;
+
+ if (rq->nr_running >= 2) {
+ flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
+ goto out;
+ }
+
+ rcu_read_lock();
+
+ sd = rcu_dereference(rq->sd);
+ if (sd) {
/*
- * Completely isolated CPUs don't ever set, so we must test.
+ * If there's a runnable CFS task and the current CPU has reduced
+ * capacity, kick the ILB to see if there's a better CPU to run on:
*/
- if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
- cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
- atomic_dec(&nohz.nr_cpus);
+ if (rq->cfs.h_nr_runnable >= 1 && check_cpu_capacity(rq, sd)) {
+ flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
+ goto unlock;
}
- clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
}
+
+ sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
+ if (sd) {
+ /*
+ * When ASYM_PACKING; see if there's a more preferred CPU
+ * currently idle; in which case, kick the ILB to move tasks
+ * around.
+ *
+ * When balancing between cores, all the SMT siblings of the
+ * preferred CPU must be idle.
+ */
+ for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
+ if (sched_asym(sd, i, cpu)) {
+ flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
+ goto unlock;
+ }
+ }
+ }
+
+ sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu));
+ if (sd) {
+ /*
+ * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU
+ * to run the misfit task on.
+ */
+ if (check_misfit_status(rq)) {
+ flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
+ goto unlock;
+ }
+
+ /*
+ * For asymmetric systems, we do not want to nicely balance
+ * cache use, instead we want to embrace asymmetry and only
+ * ensure tasks have enough CPU capacity.
+ *
+ * Skip the LLC logic because it's not relevant in that case.
+ */
+ goto unlock;
+ }
+
+ sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+ if (sds) {
+ /*
+ * If there is an imbalance between LLC domains (IOW we could
+ * increase the overall cache utilization), we need a less-loaded LLC
+ * domain to pull some load from. Likewise, we may need to spread
+ * load within the current LLC domain (e.g. packed SMT cores but
+ * other CPUs are idle). We can't really know from here how busy
+ * the others are - so just get a NOHZ balance going if it looks
+ * like this LLC domain has tasks we could move.
+ */
+ nr_busy = atomic_read(&sds->nr_busy_cpus);
+ if (nr_busy > 1) {
+ flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
+ goto unlock;
+ }
+ }
+unlock:
+ rcu_read_unlock();
+out:
+ if (READ_ONCE(nohz.needs_update))
+ flags |= NOHZ_NEXT_KICK;
+
+ if (flags)
+ kick_ilb(flags);
}
-static inline void set_cpu_sd_state_busy(void)
+static void set_cpu_sd_state_busy(int cpu)
{
struct sched_domain *sd;
- int cpu = smp_processor_id();
rcu_read_lock();
- sd = rcu_dereference(per_cpu(sd_busy, cpu));
+ sd = rcu_dereference(per_cpu(sd_llc, cpu));
if (!sd || !sd->nohz_idle)
goto unlock;
sd->nohz_idle = 0;
- atomic_inc(&sd->groups->sgc->nr_busy_cpus);
+ atomic_inc(&sd->shared->nr_busy_cpus);
unlock:
rcu_read_unlock();
}
-void set_cpu_sd_state_idle(void)
+void nohz_balance_exit_idle(struct rq *rq)
+{
+ WARN_ON_ONCE(rq != this_rq());
+
+ if (likely(!rq->nohz_tick_stopped))
+ return;
+
+ rq->nohz_tick_stopped = 0;
+ cpumask_clear_cpu(rq->cpu, nohz.idle_cpus_mask);
+ atomic_dec(&nohz.nr_cpus);
+
+ set_cpu_sd_state_busy(rq->cpu);
+}
+
+static void set_cpu_sd_state_idle(int cpu)
{
struct sched_domain *sd;
- int cpu = smp_processor_id();
rcu_read_lock();
- sd = rcu_dereference(per_cpu(sd_busy, cpu));
+ sd = rcu_dereference(per_cpu(sd_llc, cpu));
if (!sd || sd->nohz_idle)
goto unlock;
sd->nohz_idle = 1;
- atomic_dec(&sd->groups->sgc->nr_busy_cpus);
+ atomic_dec(&sd->shared->nr_busy_cpus);
unlock:
rcu_read_unlock();
}
/*
- * This routine will record that the cpu is going idle with tick stopped.
+ * This routine will record that the CPU is going idle with tick stopped.
* This info will be used in performing idle load balancing in the future.
*/
void nohz_balance_enter_idle(int cpu)
{
- /*
- * If this cpu is going down, then nothing needs to be done.
- */
+ struct rq *rq = cpu_rq(cpu);
+
+ WARN_ON_ONCE(cpu != smp_processor_id());
+
+ /* If this CPU is going down, then nothing needs to be done: */
if (!cpu_active(cpu))
return;
- if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
- return;
+ /*
+ * Can be set safely without rq->lock held
+ * If a clear happens, it will have evaluated last additions because
+ * rq->lock is held during the check and the clear
+ */
+ rq->has_blocked_load = 1;
/*
- * If we're a completely isolated CPU, we don't play.
+ * The tick is still stopped but load could have been added in the
+ * meantime. We set the nohz.has_blocked flag to trig a check of the
+ * *_avg. The CPU is already part of nohz.idle_cpus_mask so the clear
+ * of nohz.has_blocked can only happen after checking the new load
*/
- if (on_null_domain(cpu_rq(cpu)))
+ if (rq->nohz_tick_stopped)
+ goto out;
+
+ /* If we're a completely isolated CPU, we don't play: */
+ if (on_null_domain(rq))
return;
+ rq->nohz_tick_stopped = 1;
+
cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
atomic_inc(&nohz.nr_cpus);
- set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
+
+ /*
+ * Ensures that if nohz_idle_balance() fails to observe our
+ * @idle_cpus_mask store, it must observe the @has_blocked
+ * and @needs_update stores.
+ */
+ smp_mb__after_atomic();
+
+ set_cpu_sd_state_idle(cpu);
+
+ WRITE_ONCE(nohz.needs_update, 1);
+out:
+ /*
+ * Each time a cpu enter idle, we assume that it has blocked load and
+ * enable the periodic update of the load of idle CPUs
+ */
+ WRITE_ONCE(nohz.has_blocked, 1);
}
-static int sched_ilb_notifier(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
+static bool update_nohz_stats(struct rq *rq)
{
- switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_DYING:
- nohz_balance_exit_idle(smp_processor_id());
- return NOTIFY_OK;
- default:
- return NOTIFY_DONE;
- }
-}
-#endif
+ unsigned int cpu = rq->cpu;
-static DEFINE_SPINLOCK(balancing);
+ if (!rq->has_blocked_load)
+ return false;
-/*
- * Scale the max load_balance interval with the number of CPUs in the system.
- * This trades load-balance latency on larger machines for less cross talk.
- */
-void update_max_interval(void)
-{
- max_load_balance_interval = HZ*num_online_cpus()/10;
+ if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
+ return false;
+
+ if (!time_after(jiffies, READ_ONCE(rq->last_blocked_load_update_tick)))
+ return true;
+
+ sched_balance_update_blocked_averages(cpu);
+
+ return rq->has_blocked_load;
}
/*
- * It checks each scheduling domain to see if it is due to be balanced,
- * and initiates a balancing operation if so.
- *
- * Balancing parameters are set up in init_sched_domains.
+ * Internal function that runs load balance for all idle CPUs. The load balance
+ * can be a simple update of blocked load or a complete load balance with
+ * tasks movement depending of flags.
*/
-static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
+static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags)
{
- int continue_balancing = 1;
- int cpu = rq->cpu;
- unsigned long interval;
- struct sched_domain *sd;
/* Earliest time when we have to do rebalance again */
- unsigned long next_balance = jiffies + 60*HZ;
+ unsigned long now = jiffies;
+ unsigned long next_balance = now + 60*HZ;
+ bool has_blocked_load = false;
int update_next_balance = 0;
- int need_serialize, need_decay = 0;
- u64 max_cost = 0;
+ int this_cpu = this_rq->cpu;
+ int balance_cpu;
+ struct rq *rq;
- update_blocked_averages(cpu);
+ WARN_ON_ONCE((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
+
+ /*
+ * We assume there will be no idle load after this update and clear
+ * the has_blocked flag. If a cpu enters idle in the mean time, it will
+ * set the has_blocked flag and trigger another update of idle load.
+ * Because a cpu that becomes idle, is added to idle_cpus_mask before
+ * setting the flag, we are sure to not clear the state and not
+ * check the load of an idle cpu.
+ *
+ * Same applies to idle_cpus_mask vs needs_update.
+ */
+ if (flags & NOHZ_STATS_KICK)
+ WRITE_ONCE(nohz.has_blocked, 0);
+ if (flags & NOHZ_NEXT_KICK)
+ WRITE_ONCE(nohz.needs_update, 0);
+
+ /*
+ * Ensures that if we miss the CPU, we must see the has_blocked
+ * store from nohz_balance_enter_idle().
+ */
+ smp_mb();
+
+ /*
+ * Start with the next CPU after this_cpu so we will end with this_cpu and let a
+ * chance for other idle cpu to pull load.
+ */
+ for_each_cpu_wrap(balance_cpu, nohz.idle_cpus_mask, this_cpu+1) {
+ if (!idle_cpu(balance_cpu))
+ continue;
- rcu_read_lock();
- for_each_domain(cpu, sd) {
/*
- * Decay the newidle max times here because this is a regular
- * visit to all the domains. Decay ~1% per second.
+ * If this CPU gets work to do, stop the load balancing
+ * work being done for other CPUs. Next load
+ * balancing owner will pick it up.
*/
- if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
- sd->max_newidle_lb_cost =
- (sd->max_newidle_lb_cost * 253) / 256;
- sd->next_decay_max_lb_cost = jiffies + HZ;
- need_decay = 1;
+ if (!idle_cpu(this_cpu) && need_resched()) {
+ if (flags & NOHZ_STATS_KICK)
+ has_blocked_load = true;
+ if (flags & NOHZ_NEXT_KICK)
+ WRITE_ONCE(nohz.needs_update, 1);
+ goto abort;
}
- max_cost += sd->max_newidle_lb_cost;
- if (!(sd->flags & SD_LOAD_BALANCE))
- continue;
+ rq = cpu_rq(balance_cpu);
+
+ if (flags & NOHZ_STATS_KICK)
+ has_blocked_load |= update_nohz_stats(rq);
/*
- * Stop the load balance at this level. There is another
- * CPU in our sched group which is doing load balancing more
- * actively.
+ * If time for next balance is due,
+ * do the balance.
*/
- if (!continue_balancing) {
- if (need_decay)
- continue;
- break;
- }
+ if (time_after_eq(jiffies, rq->next_balance)) {
+ struct rq_flags rf;
- interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
+ rq_lock_irqsave(rq, &rf);
+ update_rq_clock(rq);
+ rq_unlock_irqrestore(rq, &rf);
- need_serialize = sd->flags & SD_SERIALIZE;
- if (need_serialize) {
- if (!spin_trylock(&balancing))
- goto out;
+ if (flags & NOHZ_BALANCE_KICK)
+ sched_balance_domains(rq, CPU_IDLE);
}
- if (time_after_eq(jiffies, sd->last_balance + interval)) {
- if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
- /*
- * The LBF_DST_PINNED logic could have changed
- * env->dst_cpu, so we can't know our idle
- * state even if we migrated tasks. Update it.
- */
- idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
- }
- sd->last_balance = jiffies;
- interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
- }
- if (need_serialize)
- spin_unlock(&balancing);
-out:
- if (time_after(next_balance, sd->last_balance + interval)) {
- next_balance = sd->last_balance + interval;
+ if (time_after(next_balance, rq->next_balance)) {
+ next_balance = rq->next_balance;
update_next_balance = 1;
}
}
- if (need_decay) {
- /*
- * Ensure the rq-wide value also decays but keep it at a
- * reasonable floor to avoid funnies with rq->avg_idle.
- */
- rq->max_idle_balance_cost =
- max((u64)sysctl_sched_migration_cost, max_cost);
- }
- rcu_read_unlock();
/*
* next_balance will be updated only when there is a need.
- * When the cpu is attached to null domain for ex, it will not be
+ * When the CPU is attached to null domain for ex, it will not be
* updated.
*/
if (likely(update_next_balance))
- rq->next_balance = next_balance;
+ nohz.next_balance = next_balance;
+
+ if (flags & NOHZ_STATS_KICK)
+ WRITE_ONCE(nohz.next_blocked,
+ now + msecs_to_jiffies(LOAD_AVG_PERIOD));
+
+abort:
+ /* There is still blocked load, enable periodic update */
+ if (has_blocked_load)
+ WRITE_ONCE(nohz.has_blocked, 1);
}
-#ifdef CONFIG_NO_HZ_COMMON
/*
* In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
- * rebalancing for all the cpus for whom scheduler ticks are stopped.
+ * rebalancing for all the CPUs for whom scheduler ticks are stopped.
*/
-static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
+static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
{
- int this_cpu = this_rq->cpu;
- struct rq *rq;
- int balance_cpu;
+ unsigned int flags = this_rq->nohz_idle_balance;
- if (idle != CPU_IDLE ||
- !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
- goto end;
+ if (!flags)
+ return false;
- for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
- if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
- continue;
+ this_rq->nohz_idle_balance = 0;
- /*
- * If this cpu gets work to do, stop the load balancing
- * work being done for other cpus. Next load
- * balancing owner will pick it up.
- */
- if (need_resched())
- break;
+ if (idle != CPU_IDLE)
+ return false;
- rq = cpu_rq(balance_cpu);
+ _nohz_idle_balance(this_rq, flags);
- /*
- * If time for next balance is due,
- * do the balance.
- */
- if (time_after_eq(jiffies, rq->next_balance)) {
- raw_spin_lock_irq(&rq->lock);
- update_rq_clock(rq);
- update_idle_cpu_load(rq);
- raw_spin_unlock_irq(&rq->lock);
- rebalance_domains(rq, CPU_IDLE);
- }
+ return true;
+}
- if (time_after(this_rq->next_balance, rq->next_balance))
- this_rq->next_balance = rq->next_balance;
- }
- nohz.next_balance = this_rq->next_balance;
-end:
- clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
+/*
+ * Check if we need to directly run the ILB for updating blocked load before
+ * entering idle state. Here we run ILB directly without issuing IPIs.
+ *
+ * Note that when this function is called, the tick may not yet be stopped on
+ * this CPU yet. nohz.idle_cpus_mask is updated only when tick is stopped and
+ * cleared on the next busy tick. In other words, nohz.idle_cpus_mask updates
+ * don't align with CPUs enter/exit idle to avoid bottlenecks due to high idle
+ * entry/exit rate (usec). So it is possible that _nohz_idle_balance() is
+ * called from this function on (this) CPU that's not yet in the mask. That's
+ * OK because the goal of nohz_run_idle_balance() is to run ILB only for
+ * updating the blocked load of already idle CPUs without waking up one of
+ * those idle CPUs and outside the preempt disable / IRQ off phase of the local
+ * cpu about to enter idle, because it can take a long time.
+ */
+void nohz_run_idle_balance(int cpu)
+{
+ unsigned int flags;
+
+ flags = atomic_fetch_andnot(NOHZ_NEWILB_KICK, nohz_flags(cpu));
+
+ /*
+ * Update the blocked load only if no SCHED_SOFTIRQ is about to happen
+ * (i.e. NOHZ_STATS_KICK set) and will do the same.
+ */
+ if ((flags == NOHZ_NEWILB_KICK) && !need_resched())
+ _nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK);
+}
+
+static void nohz_newidle_balance(struct rq *this_rq)
+{
+ int this_cpu = this_rq->cpu;
+
+ /* Will wake up very soon. No time for doing anything else*/
+ if (this_rq->avg_idle < sysctl_sched_migration_cost)
+ return;
+
+ /* Don't need to update blocked load of idle CPUs*/
+ if (!READ_ONCE(nohz.has_blocked) ||
+ time_before(jiffies, READ_ONCE(nohz.next_blocked)))
+ return;
+
+ /*
+ * Set the need to trigger ILB in order to update blocked load
+ * before entering idle state.
+ */
+ atomic_or(NOHZ_NEWILB_KICK, nohz_flags(this_cpu));
}
+#else /* !CONFIG_NO_HZ_COMMON: */
+static inline void nohz_balancer_kick(struct rq *rq) { }
+
+static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
+{
+ return false;
+}
+
+static inline void nohz_newidle_balance(struct rq *this_rq) { }
+#endif /* !CONFIG_NO_HZ_COMMON */
+
/*
- * Current heuristic for kicking the idle load balancer in the presence
- * of an idle cpu in the system.
- * - This rq has more than one task.
- * - This rq has at least one CFS task and the capacity of the CPU is
- * significantly reduced because of RT tasks or IRQs.
- * - At parent of LLC scheduler domain level, this cpu's scheduler group has
- * multiple busy cpu.
- * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
- * domain span are idle.
+ * sched_balance_newidle is called by schedule() if this_cpu is about to become
+ * idle. Attempts to pull tasks from other CPUs.
+ *
+ * Returns:
+ * < 0 - we released the lock and there are !fair tasks present
+ * 0 - failed, no new tasks
+ * > 0 - success, new (fair) tasks present
*/
-static inline bool nohz_kick_needed(struct rq *rq)
+static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
{
- unsigned long now = jiffies;
+ unsigned long next_balance = jiffies + HZ;
+ int this_cpu = this_rq->cpu;
+ int continue_balancing = 1;
+ u64 t0, t1, curr_cost = 0;
struct sched_domain *sd;
- struct sched_group_capacity *sgc;
- int nr_busy, cpu = rq->cpu;
- bool kick = false;
+ int pulled_task = 0;
- if (unlikely(rq->idle_balance))
- return false;
+ update_misfit_status(NULL, this_rq);
- /*
- * We may be recently in ticked or tickless idle mode. At the first
- * busy tick after returning from idle, we will update the busy stats.
- */
- set_cpu_sd_state_busy();
- nohz_balance_exit_idle(cpu);
+ /*
+ * There is a task waiting to run. No need to search for one.
+ * Return 0; the task will be enqueued when switching to idle.
+ */
+ if (this_rq->ttwu_pending)
+ return 0;
/*
- * None are in tickless mode and hence no need for NOHZ idle load
- * balancing.
+ * We must set idle_stamp _before_ calling sched_balance_rq()
+ * for CPU_NEWLY_IDLE, such that we measure the this duration
+ * as idle time.
*/
- if (likely(!atomic_read(&nohz.nr_cpus)))
- return false;
+ this_rq->idle_stamp = rq_clock(this_rq);
- if (time_before(now, nohz.next_balance))
- return false;
+ /*
+ * Do not pull tasks towards !active CPUs...
+ */
+ if (!cpu_active(this_cpu))
+ return 0;
- if (rq->nr_running >= 2)
- return true;
+ /*
+ * This is OK, because current is on_cpu, which avoids it being picked
+ * for load-balance and preemption/IRQs are still disabled avoiding
+ * further scheduler activity on it and we're being very careful to
+ * re-start the picking loop.
+ */
+ rq_unpin_lock(this_rq, rf);
rcu_read_lock();
- sd = rcu_dereference(per_cpu(sd_busy, cpu));
- if (sd) {
- sgc = sd->groups->sgc;
- nr_busy = atomic_read(&sgc->nr_busy_cpus);
+ sd = rcu_dereference_check_sched_domain(this_rq->sd);
+ if (!sd) {
+ rcu_read_unlock();
+ goto out;
+ }
- if (nr_busy > 1) {
- kick = true;
- goto unlock;
- }
+ if (!get_rd_overloaded(this_rq->rd) ||
+ this_rq->avg_idle < sd->max_newidle_lb_cost) {
+ update_next_balance(sd, &next_balance);
+ rcu_read_unlock();
+ goto out;
}
+ rcu_read_unlock();
- sd = rcu_dereference(rq->sd);
- if (sd) {
- if ((rq->cfs.h_nr_running >= 1) &&
- check_cpu_capacity(rq, sd)) {
- kick = true;
- goto unlock;
+ rq_modified_clear(this_rq);
+ raw_spin_rq_unlock(this_rq);
+
+ t0 = sched_clock_cpu(this_cpu);
+ sched_balance_update_blocked_averages(this_cpu);
+
+ rcu_read_lock();
+ for_each_domain(this_cpu, sd) {
+ u64 domain_cost;
+
+ update_next_balance(sd, &next_balance);
+
+ if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)
+ break;
+
+ if (sd->flags & SD_BALANCE_NEWIDLE) {
+ unsigned int weight = 1;
+
+ if (sched_feat(NI_RANDOM)) {
+ /*
+ * Throw a 1k sided dice; and only run
+ * newidle_balance according to the success
+ * rate.
+ */
+ u32 d1k = sched_rng() % 1024;
+ weight = 1 + sd->newidle_ratio;
+ if (d1k > weight) {
+ update_newidle_stats(sd, 0);
+ continue;
+ }
+ weight = (1024 + weight/2) / weight;
+ }
+
+ pulled_task = sched_balance_rq(this_cpu, this_rq,
+ sd, CPU_NEWLY_IDLE,
+ &continue_balancing);
+
+ t1 = sched_clock_cpu(this_cpu);
+ domain_cost = t1 - t0;
+ curr_cost += domain_cost;
+ t0 = t1;
+
+ /*
+ * Track max cost of a domain to make sure to not delay the
+ * next wakeup on the CPU.
+ */
+ update_newidle_cost(sd, domain_cost, weight * !!pulled_task);
}
- }
- sd = rcu_dereference(per_cpu(sd_asym, cpu));
- if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
- sched_domain_span(sd)) < cpu)) {
- kick = true;
- goto unlock;
+ /*
+ * Stop searching for tasks to pull if there are
+ * now runnable tasks on this rq.
+ */
+ if (pulled_task || !continue_balancing)
+ break;
}
-
-unlock:
rcu_read_unlock();
- return kick;
+
+ raw_spin_rq_lock(this_rq);
+
+ if (curr_cost > this_rq->max_idle_balance_cost)
+ this_rq->max_idle_balance_cost = curr_cost;
+
+ /*
+ * While browsing the domains, we released the rq lock, a task could
+ * have been enqueued in the meantime. Since we're not going idle,
+ * pretend we pulled a task.
+ */
+ if (this_rq->cfs.h_nr_queued && !pulled_task)
+ pulled_task = 1;
+
+ /* If a higher prio class was modified, restart the pick */
+ if (rq_modified_above(this_rq, &fair_sched_class))
+ pulled_task = -1;
+
+out:
+ /* Move the next balance forward */
+ if (time_after(this_rq->next_balance, next_balance))
+ this_rq->next_balance = next_balance;
+
+ if (pulled_task)
+ this_rq->idle_stamp = 0;
+ else
+ nohz_newidle_balance(this_rq);
+
+ rq_repin_lock(this_rq, rf);
+
+ return pulled_task;
}
-#else
-static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
-#endif
/*
- * run_rebalance_domains is triggered when needed from the scheduler tick.
- * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
+ * This softirq handler is triggered via SCHED_SOFTIRQ from two places:
+ *
+ * - directly from the local sched_tick() for periodic load balancing
+ *
+ * - indirectly from a remote sched_tick() for NOHZ idle balancing
+ * through the SMP cross-call nohz_csd_func()
*/
-static void run_rebalance_domains(struct softirq_action *h)
+static __latent_entropy void sched_balance_softirq(void)
{
struct rq *this_rq = this_rq();
- enum cpu_idle_type idle = this_rq->idle_balance ?
- CPU_IDLE : CPU_NOT_IDLE;
-
+ enum cpu_idle_type idle = this_rq->idle_balance;
/*
- * If this cpu has a pending nohz_balance_kick, then do the
- * balancing on behalf of the other idle cpus whose ticks are
- * stopped. Do nohz_idle_balance *before* rebalance_domains to
- * give the idle cpus a chance to load balance. Else we may
+ * If this CPU has a pending NOHZ_BALANCE_KICK, then do the
+ * balancing on behalf of the other idle CPUs whose ticks are
+ * stopped. Do nohz_idle_balance *before* sched_balance_domains to
+ * give the idle CPUs a chance to load balance. Else we may
* load balance only within the local sched_domain hierarchy
* and abort nohz_idle_balance altogether if we pull some load.
*/
- nohz_idle_balance(this_rq, idle);
- rebalance_domains(this_rq, idle);
+ if (nohz_idle_balance(this_rq, idle))
+ return;
+
+ /* normal load balance */
+ sched_balance_update_blocked_averages(this_rq->cpu);
+ sched_balance_domains(this_rq, idle);
}
/*
* Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
*/
-void trigger_load_balance(struct rq *rq)
+void sched_balance_trigger(struct rq *rq)
{
- /* Don't need to rebalance while attached to NULL domain */
- if (unlikely(on_null_domain(rq)))
+ /*
+ * Don't need to rebalance while attached to NULL domain or
+ * runqueue CPU is not active
+ */
+ if (unlikely(on_null_domain(rq) || !cpu_active(cpu_of(rq))))
return;
if (time_after_eq(jiffies, rq->next_balance))
raise_softirq(SCHED_SOFTIRQ);
-#ifdef CONFIG_NO_HZ_COMMON
- if (nohz_kick_needed(rq))
- nohz_balancer_kick();
-#endif
+
+ nohz_balancer_kick(rq);
}
static void rq_online_fair(struct rq *rq)
@@ -7801,12 +13061,308 @@ static void rq_offline_fair(struct rq *rq)
/* Ensure any throttled groups are reachable by pick_next_task */
unthrottle_offline_cfs_rqs(rq);
+
+ /* Ensure that we remove rq contribution to group share: */
+ clear_tg_offline_cfs_rqs(rq);
}
-#endif /* CONFIG_SMP */
+#ifdef CONFIG_SCHED_CORE
+static inline bool
+__entity_slice_used(struct sched_entity *se, int min_nr_tasks)
+{
+ u64 rtime = se->sum_exec_runtime - se->prev_sum_exec_runtime;
+ u64 slice = se->slice;
+
+ return (rtime * min_nr_tasks > slice);
+}
+
+#define MIN_NR_TASKS_DURING_FORCEIDLE 2
+static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
+{
+ if (!sched_core_enabled(rq))
+ return;
+
+ /*
+ * If runqueue has only one task which used up its slice and
+ * if the sibling is forced idle, then trigger schedule to
+ * give forced idle task a chance.
+ *
+ * sched_slice() considers only this active rq and it gets the
+ * whole slice. But during force idle, we have siblings acting
+ * like a single runqueue and hence we need to consider runnable
+ * tasks on this CPU and the forced idle CPU. Ideally, we should
+ * go through the forced idle rq, but that would be a perf hit.
+ * We can assume that the forced idle CPU has at least
+ * MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check
+ * if we need to give up the CPU.
+ */
+ if (rq->core->core_forceidle_count && rq->cfs.nr_queued == 1 &&
+ __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
+ resched_curr(rq);
+}
+
+/*
+ * Consider any infeasible weight scenario. Take for instance two tasks,
+ * each bound to their respective sibling, one with weight 1 and one with
+ * weight 2. Then the lower weight task will run ahead of the higher weight
+ * task without bound.
+ *
+ * This utterly destroys the concept of a shared time base.
+ *
+ * Remember; all this is about a proportionally fair scheduling, where each
+ * tasks receives:
+ *
+ * w_i
+ * dt_i = ---------- dt (1)
+ * \Sum_j w_j
+ *
+ * which we do by tracking a virtual time, s_i:
+ *
+ * 1
+ * s_i = --- d[t]_i (2)
+ * w_i
+ *
+ * Where d[t] is a delta of discrete time, while dt is an infinitesimal.
+ * The immediate corollary is that the ideal schedule S, where (2) to use
+ * an infinitesimal delta, is:
+ *
+ * 1
+ * S = ---------- dt (3)
+ * \Sum_i w_i
+ *
+ * From which we can define the lag, or deviation from the ideal, as:
+ *
+ * lag(i) = S - s_i (4)
+ *
+ * And since the one and only purpose is to approximate S, we get that:
+ *
+ * \Sum_i w_i lag(i) := 0 (5)
+ *
+ * If this were not so, we no longer converge to S, and we can no longer
+ * claim our scheduler has any of the properties we derive from S. This is
+ * exactly what you did above, you broke it!
+ *
+ *
+ * Let's continue for a while though; to see if there is anything useful to
+ * be learned. We can combine (1)-(3) or (4)-(5) and express S in s_i:
+ *
+ * \Sum_i w_i s_i
+ * S = -------------- (6)
+ * \Sum_i w_i
+ *
+ * Which gives us a way to compute S, given our s_i. Now, if you've read
+ * our code, you know that we do not in fact do this, the reason for this
+ * is two-fold. Firstly, computing S in that way requires a 64bit division
+ * for every time we'd use it (see 12), and secondly, this only describes
+ * the steady-state, it doesn't handle dynamics.
+ *
+ * Anyway, in (6): s_i -> x + (s_i - x), to get:
+ *
+ * \Sum_i w_i (s_i - x)
+ * S - x = -------------------- (7)
+ * \Sum_i w_i
+ *
+ * Which shows that S and s_i transform alike (which makes perfect sense
+ * given that S is basically the (weighted) average of s_i).
+ *
+ * So the thing to remember is that the above is strictly UP. It is
+ * possible to generalize to multiple runqueues -- however it gets really
+ * yuck when you have to add affinity support, as illustrated by our very
+ * first counter-example.
+ *
+ * Luckily I think we can avoid needing a full multi-queue variant for
+ * core-scheduling (or load-balancing). The crucial observation is that we
+ * only actually need this comparison in the presence of forced-idle; only
+ * then do we need to tell if the stalled rq has higher priority over the
+ * other.
+ *
+ * [XXX assumes SMT2; better consider the more general case, I suspect
+ * it'll work out because our comparison is always between 2 rqs and the
+ * answer is only interesting if one of them is forced-idle]
+ *
+ * And (under assumption of SMT2) when there is forced-idle, there is only
+ * a single queue, so everything works like normal.
+ *
+ * Let, for our runqueue 'k':
+ *
+ * T_k = \Sum_i w_i s_i
+ * W_k = \Sum_i w_i ; for all i of k (8)
+ *
+ * Then we can write (6) like:
+ *
+ * T_k
+ * S_k = --- (9)
+ * W_k
+ *
+ * From which immediately follows that:
+ *
+ * T_k + T_l
+ * S_k+l = --------- (10)
+ * W_k + W_l
+ *
+ * On which we can define a combined lag:
+ *
+ * lag_k+l(i) := S_k+l - s_i (11)
+ *
+ * And that gives us the tools to compare tasks across a combined runqueue.
+ *
+ *
+ * Combined this gives the following:
+ *
+ * a) when a runqueue enters force-idle, sync it against it's sibling rq(s)
+ * using (7); this only requires storing single 'time'-stamps.
+ *
+ * b) when comparing tasks between 2 runqueues of which one is forced-idle,
+ * compare the combined lag, per (11).
+ *
+ * Now, of course cgroups (I so hate them) make this more interesting in
+ * that a) seems to suggest we need to iterate all cgroup on a CPU at such
+ * boundaries, but I think we can avoid that. The force-idle is for the
+ * whole CPU, all it's rqs. So we can mark it in the root and lazily
+ * propagate downward on demand.
+ */
+
+/*
+ * So this sync is basically a relative reset of S to 0.
+ *
+ * So with 2 queues, when one goes idle, we drop them both to 0 and one
+ * then increases due to not being idle, and the idle one builds up lag to
+ * get re-elected. So far so simple, right?
+ *
+ * When there's 3, we can have the situation where 2 run and one is idle,
+ * we sync to 0 and let the idle one build up lag to get re-election. Now
+ * suppose another one also drops idle. At this point dropping all to 0
+ * again would destroy the built-up lag from the queue that was already
+ * idle, not good.
+ *
+ * So instead of syncing everything, we can:
+ *
+ * less := !((s64)(s_a - s_b) <= 0)
+ *
+ * (v_a - S_a) - (v_b - S_b) == v_a - v_b - S_a + S_b
+ * == v_a - (v_b - S_a + S_b)
+ *
+ * IOW, we can recast the (lag) comparison to a one-sided difference.
+ * So if then, instead of syncing the whole queue, sync the idle queue
+ * against the active queue with S_a + S_b at the point where we sync.
+ *
+ * (XXX consider the implication of living in a cyclic group: N / 2^n N)
+ *
+ * This gives us means of syncing single queues against the active queue,
+ * and for already idle queues to preserve their build-up lag.
+ *
+ * Of course, then we get the situation where there's 2 active and one
+ * going idle, who do we pick to sync against? Theory would have us sync
+ * against the combined S, but as we've already demonstrated, there is no
+ * such thing in infeasible weight scenarios.
+ *
+ * One thing I've considered; and this is where that core_active rudiment
+ * came from, is having active queues sync up between themselves after
+ * every tick. This limits the observed divergence due to the work
+ * conservancy.
+ *
+ * On top of that, we can improve upon things by employing (10) here.
+ */
/*
- * scheduler tick hitting a task of our scheduling class:
+ * se_fi_update - Update the cfs_rq->zero_vruntime_fi in a CFS hierarchy if needed.
+ */
+static void se_fi_update(const struct sched_entity *se, unsigned int fi_seq,
+ bool forceidle)
+{
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ if (forceidle) {
+ if (cfs_rq->forceidle_seq == fi_seq)
+ break;
+ cfs_rq->forceidle_seq = fi_seq;
+ }
+
+ cfs_rq->zero_vruntime_fi = cfs_rq->zero_vruntime;
+ }
+}
+
+void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi)
+{
+ struct sched_entity *se = &p->se;
+
+ if (p->sched_class != &fair_sched_class)
+ return;
+
+ se_fi_update(se, rq->core->core_forceidle_seq, in_fi);
+}
+
+bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b,
+ bool in_fi)
+{
+ struct rq *rq = task_rq(a);
+ const struct sched_entity *sea = &a->se;
+ const struct sched_entity *seb = &b->se;
+ struct cfs_rq *cfs_rqa;
+ struct cfs_rq *cfs_rqb;
+ s64 delta;
+
+ WARN_ON_ONCE(task_rq(b)->core != rq->core);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ /*
+ * Find an se in the hierarchy for tasks a and b, such that the se's
+ * are immediate siblings.
+ */
+ while (sea->cfs_rq->tg != seb->cfs_rq->tg) {
+ int sea_depth = sea->depth;
+ int seb_depth = seb->depth;
+
+ if (sea_depth >= seb_depth)
+ sea = parent_entity(sea);
+ if (sea_depth <= seb_depth)
+ seb = parent_entity(seb);
+ }
+
+ se_fi_update(sea, rq->core->core_forceidle_seq, in_fi);
+ se_fi_update(seb, rq->core->core_forceidle_seq, in_fi);
+
+ cfs_rqa = sea->cfs_rq;
+ cfs_rqb = seb->cfs_rq;
+#else /* !CONFIG_FAIR_GROUP_SCHED: */
+ cfs_rqa = &task_rq(a)->cfs;
+ cfs_rqb = &task_rq(b)->cfs;
+#endif /* !CONFIG_FAIR_GROUP_SCHED */
+
+ /*
+ * Find delta after normalizing se's vruntime with its cfs_rq's
+ * zero_vruntime_fi, which would have been updated in prior calls
+ * to se_fi_update().
+ */
+ delta = (s64)(sea->vruntime - seb->vruntime) +
+ (s64)(cfs_rqb->zero_vruntime_fi - cfs_rqa->zero_vruntime_fi);
+
+ return delta > 0;
+}
+
+static int task_is_throttled_fair(struct task_struct *p, int cpu)
+{
+ struct cfs_rq *cfs_rq;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ cfs_rq = task_group(p)->cfs_rq[cpu];
+#else
+ cfs_rq = &cpu_rq(cpu)->cfs;
+#endif
+ return throttled_hierarchy(cfs_rq);
+}
+#else /* !CONFIG_SCHED_CORE: */
+static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {}
+#endif /* !CONFIG_SCHED_CORE */
+
+/*
+ * scheduler tick hitting a task of our scheduling class.
+ *
+ * NOTE: This function can be called remotely by the tick offload that
+ * goes along full dynticks. Therefore no local assumption can be made
+ * and everything must be accessed through the @rq and @curr passed in
+ * parameters.
*/
static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
{
@@ -7818,10 +13374,13 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
entity_tick(cfs_rq, se, queued);
}
- if (numabalancing_enabled)
+ if (static_branch_unlikely(&sched_numa_balancing))
task_tick_numa(rq, curr);
- update_rq_runnable_avg(rq, 1);
+ update_misfit_status(curr, rq);
+ check_update_overutilized_status(task_rq(curr));
+
+ task_tick_core(rq, curr);
}
/*
@@ -7831,47 +13390,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
*/
static void task_fork_fair(struct task_struct *p)
{
- struct cfs_rq *cfs_rq;
- struct sched_entity *se = &p->se, *curr;
- int this_cpu = smp_processor_id();
- struct rq *rq = this_rq();
- unsigned long flags;
-
- raw_spin_lock_irqsave(&rq->lock, flags);
-
- update_rq_clock(rq);
-
- cfs_rq = task_cfs_rq(current);
- curr = cfs_rq->curr;
-
- /*
- * Not only the cpu but also the task_group of the parent might have
- * been changed after parent->se.parent,cfs_rq were copied to
- * child->se.parent,cfs_rq. So call __set_task_cpu() to make those
- * of child point to valid ones.
- */
- rcu_read_lock();
- __set_task_cpu(p, this_cpu);
- rcu_read_unlock();
-
- update_curr(cfs_rq);
-
- if (curr)
- se->vruntime = curr->vruntime;
- place_entity(cfs_rq, se, 1);
-
- if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
- /*
- * Upon rescheduling, sched_class::put_prev_task() will place
- * 'current' within the tree based on its new key value.
- */
- swap(curr->vruntime, se->vruntime);
- resched_curr(rq);
- }
-
- se->vruntime -= cfs_rq->min_vruntime;
-
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ set_task_max_allowed_capacity(p);
}
/*
@@ -7879,94 +13398,175 @@ static void task_fork_fair(struct task_struct *p)
* the current task.
*/
static void
-prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
+prio_changed_fair(struct rq *rq, struct task_struct *p, u64 oldprio)
{
if (!task_on_rq_queued(p))
return;
+ if (p->prio == oldprio)
+ return;
+
+ if (rq->cfs.nr_queued == 1)
+ return;
+
/*
* Reschedule if we are currently running on this runqueue and
* our priority decreased, or if we are not currently running on
* this runqueue and our priority is higher than the current's
*/
- if (rq->curr == p) {
+ if (task_current_donor(rq, p)) {
if (p->prio > oldprio)
resched_curr(rq);
- } else
- check_preempt_curr(rq, p, 0);
+ } else {
+ wakeup_preempt(rq, p, 0);
+ }
}
-static void switched_from_fair(struct rq *rq, struct task_struct *p)
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/*
+ * Propagate the changes of the sched_entity across the tg tree to make it
+ * visible to the root
+ */
+static void propagate_entity_cfs_rq(struct sched_entity *se)
{
- struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
/*
- * Ensure the task's vruntime is normalized, so that when it's
- * switched back to the fair class the enqueue_entity(.flags=0) will
- * do the right thing.
- *
- * If it's queued, then the dequeue_entity(.flags=0) will already
- * have normalized the vruntime, if it's !queued, then only when
- * the task is sleeping will it still have non-normalized vruntime.
+ * If a task gets attached to this cfs_rq and before being queued,
+ * it gets migrated to another CPU due to reasons like affinity
+ * change, make sure this cfs_rq stays on leaf cfs_rq list to have
+ * that removed load decayed or it can cause faireness problem.
*/
- if (!task_on_rq_queued(p) && p->state != TASK_RUNNING) {
- /*
- * Fix up our vruntime so that the current sleep doesn't
- * cause 'unlimited' sleep bonus.
- */
- place_entity(cfs_rq, se, 0);
- se->vruntime -= cfs_rq->min_vruntime;
+ if (!cfs_rq_pelt_clock_throttled(cfs_rq))
+ list_add_leaf_cfs_rq(cfs_rq);
+
+ /* Start to propagate at parent */
+ se = se->parent;
+
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+
+ update_load_avg(cfs_rq, se, UPDATE_TG);
+
+ if (!cfs_rq_pelt_clock_throttled(cfs_rq))
+ list_add_leaf_cfs_rq(cfs_rq);
}
-#ifdef CONFIG_SMP
+ assert_list_leaf_cfs_rq(rq_of(cfs_rq));
+}
+#else /* !CONFIG_FAIR_GROUP_SCHED: */
+static void propagate_entity_cfs_rq(struct sched_entity *se) { }
+#endif /* !CONFIG_FAIR_GROUP_SCHED */
+
+static void detach_entity_cfs_rq(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
/*
- * Remove our load from contribution when we leave sched_fair
- * and ensure we don't carry in an old decay_count if we
- * switch back.
- */
- if (se->avg.decay_count) {
- __synchronize_entity_decay(se);
- subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
- }
-#endif
+ * In case the task sched_avg hasn't been attached:
+ * - A forked task which hasn't been woken up by wake_up_new_task().
+ * - A task which has been woken up by try_to_wake_up() but is
+ * waiting for actually being woken up by sched_ttwu_pending().
+ */
+ if (!se->avg.last_update_time)
+ return;
+
+ /* Catch up with the cfs_rq and remove our load when we leave */
+ update_load_avg(cfs_rq, se, 0);
+ detach_entity_load_avg(cfs_rq, se);
+ update_tg_load_avg(cfs_rq);
+ propagate_entity_cfs_rq(se);
+}
+
+static void attach_entity_cfs_rq(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ /* Synchronize entity with its cfs_rq */
+ update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
+ attach_entity_load_avg(cfs_rq, se);
+ update_tg_load_avg(cfs_rq);
+ propagate_entity_cfs_rq(se);
+}
+
+static void detach_task_cfs_rq(struct task_struct *p)
+{
+ struct sched_entity *se = &p->se;
+
+ detach_entity_cfs_rq(se);
+}
+
+static void attach_task_cfs_rq(struct task_struct *p)
+{
+ struct sched_entity *se = &p->se;
+
+ attach_entity_cfs_rq(se);
+}
+
+static void switching_from_fair(struct rq *rq, struct task_struct *p)
+{
+ if (p->se.sched_delayed)
+ dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
+}
+
+static void switched_from_fair(struct rq *rq, struct task_struct *p)
+{
+ detach_task_cfs_rq(p);
}
-/*
- * We switched to the sched_fair class.
- */
static void switched_to_fair(struct rq *rq, struct task_struct *p)
{
-#ifdef CONFIG_FAIR_GROUP_SCHED
+ WARN_ON_ONCE(p->se.sched_delayed);
+
+ attach_task_cfs_rq(p);
+
+ set_task_max_allowed_capacity(p);
+
+ if (task_on_rq_queued(p)) {
+ /*
+ * We were most likely switched from sched_rt, so
+ * kick off the schedule if running, otherwise just see
+ * if we can still preempt the current task.
+ */
+ if (task_current_donor(rq, p))
+ resched_curr(rq);
+ else
+ wakeup_preempt(rq, p, 0);
+ }
+}
+
+static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
+{
struct sched_entity *se = &p->se;
- /*
- * Since the real-depth could have been changed (only FAIR
- * class maintain depth value), reset depth properly.
- */
- se->depth = se->parent ? se->parent->depth + 1 : 0;
-#endif
- if (!task_on_rq_queued(p))
+
+ if (task_on_rq_queued(p)) {
+ /*
+ * Move the next running task to the front of the list, so our
+ * cfs_tasks list becomes MRU one.
+ */
+ list_move(&se->group_node, &rq->cfs_tasks);
+ }
+ if (!first)
return;
- /*
- * We were most likely switched from sched_rt, so
- * kick off the schedule if running, otherwise just see
- * if we can still preempt the current task.
- */
- if (rq->curr == p)
- resched_curr(rq);
- else
- check_preempt_curr(rq, p, 0);
+ WARN_ON_ONCE(se->sched_delayed);
+
+ if (hrtick_enabled_fair(rq))
+ hrtick_start_fair(rq, p);
+
+ update_misfit_status(p, rq);
+ sched_fair_update_stop_tick(rq, p);
}
-/* Account for a task changing its policy or group.
+/*
+ * Account for a task changing its policy or group.
*
* This routine is mostly called to set cfs_rq->curr field when a task
* migrates between groups/classes.
*/
-static void set_curr_task_fair(struct rq *rq)
+static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
{
- struct sched_entity *se = &rq->curr->se;
+ struct sched_entity *se = &p->se;
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
@@ -7975,80 +13575,39 @@ static void set_curr_task_fair(struct rq *rq)
/* ensure bandwidth has been allocated on our new cfs_rq */
account_cfs_rq_runtime(cfs_rq, 0);
}
+
+ __set_next_task_fair(rq, p, first);
}
void init_cfs_rq(struct cfs_rq *cfs_rq)
{
- cfs_rq->tasks_timeline = RB_ROOT;
- cfs_rq->min_vruntime = (u64)(-(1LL << 20));
-#ifndef CONFIG_64BIT
- cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
-#endif
-#ifdef CONFIG_SMP
- atomic64_set(&cfs_rq->decay_counter, 1);
- atomic_long_set(&cfs_rq->removed_load, 0);
-#endif
+ cfs_rq->tasks_timeline = RB_ROOT_CACHED;
+ cfs_rq->zero_vruntime = (u64)(-(1LL << 20));
+ raw_spin_lock_init(&cfs_rq->removed.lock);
}
#ifdef CONFIG_FAIR_GROUP_SCHED
-static void task_move_group_fair(struct task_struct *p, int queued)
+static void task_change_group_fair(struct task_struct *p)
{
- struct sched_entity *se = &p->se;
- struct cfs_rq *cfs_rq;
-
/*
- * If the task was not on the rq at the time of this cgroup movement
- * it must have been asleep, sleeping tasks keep their ->vruntime
- * absolute on their old rq until wakeup (needed for the fair sleeper
- * bonus in place_entity()).
- *
- * If it was on the rq, we've just 'preempted' it, which does convert
- * ->vruntime to a relative base.
- *
- * Make sure both cases convert their relative position when migrating
- * to another cgroup's rq. This does somewhat interfere with the
- * fair sleeper stuff for the first placement, but who cares.
+ * We couldn't detach or attach a forked task which
+ * hasn't been woken up by wake_up_new_task().
*/
- /*
- * When !queued, vruntime of the task has usually NOT been normalized.
- * But there are some cases where it has already been normalized:
- *
- * - Moving a forked child which is waiting for being woken up by
- * wake_up_new_task().
- * - Moving a task which has been woken up by try_to_wake_up() and
- * waiting for actually being woken up by sched_ttwu_pending().
- *
- * To prevent boost or penalty in the new cfs_rq caused by delta
- * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
- */
- if (!queued && (!se->sum_exec_runtime || p->state == TASK_WAKING))
- queued = 1;
+ if (READ_ONCE(p->__state) == TASK_NEW)
+ return;
- if (!queued)
- se->vruntime -= cfs_rq_of(se)->min_vruntime;
+ detach_task_cfs_rq(p);
+
+ /* Tell se's cfs_rq has been changed -- migrated */
+ p->se.avg.last_update_time = 0;
set_task_rq(p, task_cpu(p));
- se->depth = se->parent ? se->parent->depth + 1 : 0;
- if (!queued) {
- cfs_rq = cfs_rq_of(se);
- se->vruntime += cfs_rq->min_vruntime;
-#ifdef CONFIG_SMP
- /*
- * migrate_task_rq_fair() will have removed our previous
- * contribution, but we must synchronize for ongoing future
- * decay.
- */
- se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
- cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
-#endif
- }
+ attach_task_cfs_rq(p);
}
void free_fair_sched_group(struct task_group *tg)
{
int i;
- destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
-
for_each_possible_cpu(i) {
if (tg->cfs_rq)
kfree(tg->cfs_rq[i]);
@@ -8062,20 +13621,20 @@ void free_fair_sched_group(struct task_group *tg)
int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
{
- struct cfs_rq *cfs_rq;
struct sched_entity *se;
+ struct cfs_rq *cfs_rq;
int i;
- tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
+ tg->cfs_rq = kcalloc(nr_cpu_ids, sizeof(cfs_rq), GFP_KERNEL);
if (!tg->cfs_rq)
goto err;
- tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
+ tg->se = kcalloc(nr_cpu_ids, sizeof(se), GFP_KERNEL);
if (!tg->se)
goto err;
tg->shares = NICE_0_LOAD;
- init_cfs_bandwidth(tg_cfs_bandwidth(tg));
+ init_cfs_bandwidth(tg_cfs_bandwidth(tg), tg_cfs_bandwidth(parent));
for_each_possible_cpu(i) {
cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
@@ -8083,13 +13642,14 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
if (!cfs_rq)
goto err;
- se = kzalloc_node(sizeof(struct sched_entity),
+ se = kzalloc_node(sizeof(struct sched_entity_stats),
GFP_KERNEL, cpu_to_node(i));
if (!se)
goto err_free_rq;
init_cfs_rq(cfs_rq);
init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
+ init_entity_runnable_average(se);
}
return 1;
@@ -8100,21 +13660,56 @@ err:
return 0;
}
-void unregister_fair_sched_group(struct task_group *tg, int cpu)
+void online_fair_sched_group(struct task_group *tg)
{
- struct rq *rq = cpu_rq(cpu);
- unsigned long flags;
+ struct sched_entity *se;
+ struct rq_flags rf;
+ struct rq *rq;
+ int i;
- /*
- * Only empty task groups can be destroyed; so we can speculatively
- * check on_list without danger of it being re-added.
- */
- if (!tg->cfs_rq[cpu]->on_list)
- return;
+ for_each_possible_cpu(i) {
+ rq = cpu_rq(i);
+ se = tg->se[i];
+ rq_lock_irq(rq, &rf);
+ update_rq_clock(rq);
+ attach_entity_cfs_rq(se);
+ sync_throttle(tg, i);
+ rq_unlock_irq(rq, &rf);
+ }
+}
+
+void unregister_fair_sched_group(struct task_group *tg)
+{
+ int cpu;
+
+ destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
+
+ for_each_possible_cpu(cpu) {
+ struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
+ struct sched_entity *se = tg->se[cpu];
+ struct rq *rq = cpu_rq(cpu);
+
+ if (se) {
+ if (se->sched_delayed) {
+ guard(rq_lock_irqsave)(rq);
+ if (se->sched_delayed) {
+ update_rq_clock(rq);
+ dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+ }
+ list_del_leaf_cfs_rq(cfs_rq);
+ }
+ remove_entity_load_avg(se);
+ }
- raw_spin_lock_irqsave(&rq->lock, flags);
- list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ /*
+ * Only empty task groups can be destroyed; so we can speculatively
+ * check on_list without danger of it being re-added.
+ */
+ if (cfs_rq->on_list) {
+ guard(rq_lock_irqsave)(rq);
+ list_del_leaf_cfs_rq(cfs_rq);
+ }
+ }
}
void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
@@ -8150,10 +13745,11 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
static DEFINE_MUTEX(shares_mutex);
-int sched_group_set_shares(struct task_group *tg, unsigned long shares)
+static int __sched_group_set_shares(struct task_group *tg, unsigned long shares)
{
int i;
- unsigned long flags;
+
+ lockdep_assert_held(&shares_mutex);
/*
* We can't change the weight of the root cgroup.
@@ -8163,40 +13759,106 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
- mutex_lock(&shares_mutex);
if (tg->shares == shares)
- goto done;
+ return 0;
tg->shares = shares;
for_each_possible_cpu(i) {
struct rq *rq = cpu_rq(i);
- struct sched_entity *se;
+ struct sched_entity *se = tg->se[i];
+ struct rq_flags rf;
- se = tg->se[i];
/* Propagate contribution to hierarchy */
- raw_spin_lock_irqsave(&rq->lock, flags);
-
- /* Possible calls to update_curr() need rq clock */
+ rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
- for_each_sched_entity(se)
- update_cfs_shares(group_cfs_rq(se));
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ for_each_sched_entity(se) {
+ update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
+ update_cfs_group(se);
+ }
+ rq_unlock_irqrestore(rq, &rf);
}
-done:
- mutex_unlock(&shares_mutex);
return 0;
}
-#else /* CONFIG_FAIR_GROUP_SCHED */
-
-void free_fair_sched_group(struct task_group *tg) { }
-int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
+int sched_group_set_shares(struct task_group *tg, unsigned long shares)
{
- return 1;
+ int ret;
+
+ mutex_lock(&shares_mutex);
+ if (tg_is_idle(tg))
+ ret = -EINVAL;
+ else
+ ret = __sched_group_set_shares(tg, shares);
+ mutex_unlock(&shares_mutex);
+
+ return ret;
}
-void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
+int sched_group_set_idle(struct task_group *tg, long idle)
+{
+ int i;
+
+ if (tg == &root_task_group)
+ return -EINVAL;
+
+ if (idle < 0 || idle > 1)
+ return -EINVAL;
+
+ mutex_lock(&shares_mutex);
+
+ if (tg->idle == idle) {
+ mutex_unlock(&shares_mutex);
+ return 0;
+ }
+
+ tg->idle = idle;
+
+ for_each_possible_cpu(i) {
+ struct rq *rq = cpu_rq(i);
+ struct sched_entity *se = tg->se[i];
+ struct cfs_rq *grp_cfs_rq = tg->cfs_rq[i];
+ bool was_idle = cfs_rq_is_idle(grp_cfs_rq);
+ long idle_task_delta;
+ struct rq_flags rf;
+
+ rq_lock_irqsave(rq, &rf);
+
+ grp_cfs_rq->idle = idle;
+ if (WARN_ON_ONCE(was_idle == cfs_rq_is_idle(grp_cfs_rq)))
+ goto next_cpu;
+
+ idle_task_delta = grp_cfs_rq->h_nr_queued -
+ grp_cfs_rq->h_nr_idle;
+ if (!cfs_rq_is_idle(grp_cfs_rq))
+ idle_task_delta *= -1;
+
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ if (!se->on_rq)
+ break;
+
+ cfs_rq->h_nr_idle += idle_task_delta;
+
+ /* Already accounted at parent level and above. */
+ if (cfs_rq_is_idle(cfs_rq))
+ break;
+ }
+
+next_cpu:
+ rq_unlock_irqrestore(rq, &rf);
+ }
+
+ /* Idle groups have minimum weight. */
+ if (tg_is_idle(tg))
+ __sched_group_set_shares(tg, scale_load(WEIGHT_IDLEPRIO));
+ else
+ __sched_group_set_shares(tg, NICE_0_LOAD);
+
+ mutex_unlock(&shares_mutex);
+ return 0;
+}
#endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -8211,7 +13873,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
* idle runqueue:
*/
if (rq->cfs.load.weight)
- rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
+ rr_interval = NS_TO_JIFFIES(se->slice);
return rr_interval;
}
@@ -8219,33 +13881,37 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
/*
* All the scheduling class methods:
*/
-const struct sched_class fair_sched_class = {
- .next = &idle_sched_class,
+DEFINE_SCHED_CLASS(fair) = {
+
+ .queue_mask = 2,
+
.enqueue_task = enqueue_task_fair,
.dequeue_task = dequeue_task_fair,
.yield_task = yield_task_fair,
.yield_to_task = yield_to_task_fair,
- .check_preempt_curr = check_preempt_wakeup,
+ .wakeup_preempt = check_preempt_wakeup_fair,
+ .pick_task = pick_task_fair,
.pick_next_task = pick_next_task_fair,
.put_prev_task = put_prev_task_fair,
+ .set_next_task = set_next_task_fair,
-#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_fair,
.migrate_task_rq = migrate_task_rq_fair,
.rq_online = rq_online_fair,
.rq_offline = rq_offline_fair,
- .task_waking = task_waking_fair,
-#endif
+ .task_dead = task_dead_fair,
+ .set_cpus_allowed = set_cpus_allowed_fair,
- .set_curr_task = set_curr_task_fair,
.task_tick = task_tick_fair,
.task_fork = task_fork_fair,
+ .reweight_task = reweight_task_fair,
.prio_changed = prio_changed_fair,
+ .switching_from = switching_from_fair,
.switched_from = switched_from_fair,
.switched_to = switched_to_fair,
@@ -8254,32 +13920,73 @@ const struct sched_class fair_sched_class = {
.update_curr = update_curr_fair,
#ifdef CONFIG_FAIR_GROUP_SCHED
- .task_move_group = task_move_group_fair,
+ .task_change_group = task_change_group_fair,
+#endif
+
+#ifdef CONFIG_SCHED_CORE
+ .task_is_throttled = task_is_throttled_fair,
+#endif
+
+#ifdef CONFIG_UCLAMP_TASK
+ .uclamp_enabled = 1,
#endif
};
-#ifdef CONFIG_SCHED_DEBUG
void print_cfs_stats(struct seq_file *m, int cpu)
{
- struct cfs_rq *cfs_rq;
+ struct cfs_rq *cfs_rq, *pos;
rcu_read_lock();
- for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
+ for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
print_cfs_rq(m, cpu, cfs_rq);
rcu_read_unlock();
}
-#endif
+
+#ifdef CONFIG_NUMA_BALANCING
+void show_numa_stats(struct task_struct *p, struct seq_file *m)
+{
+ int node;
+ unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
+ struct numa_group *ng;
+
+ rcu_read_lock();
+ ng = rcu_dereference(p->numa_group);
+ for_each_online_node(node) {
+ if (p->numa_faults) {
+ tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
+ tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
+ }
+ if (ng) {
+ gsf = ng->faults[task_faults_idx(NUMA_MEM, node, 0)],
+ gpf = ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
+ }
+ print_numa_stats(m, node, tsf, tpf, gsf, gpf);
+ }
+ rcu_read_unlock();
+}
+#endif /* CONFIG_NUMA_BALANCING */
__init void init_sched_fair_class(void)
{
-#ifdef CONFIG_SMP
- open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
+ int i;
+
+ for_each_possible_cpu(i) {
+ zalloc_cpumask_var_node(&per_cpu(load_balance_mask, i), GFP_KERNEL, cpu_to_node(i));
+ zalloc_cpumask_var_node(&per_cpu(select_rq_mask, i), GFP_KERNEL, cpu_to_node(i));
+ zalloc_cpumask_var_node(&per_cpu(should_we_balance_tmpmask, i),
+ GFP_KERNEL, cpu_to_node(i));
+
+#ifdef CONFIG_CFS_BANDWIDTH
+ INIT_CSD(&cpu_rq(i)->cfsb_csd, __cfsb_csd_unthrottle, cpu_rq(i));
+ INIT_LIST_HEAD(&cpu_rq(i)->cfsb_csd_list);
+#endif
+ }
+
+ open_softirq(SCHED_SOFTIRQ, sched_balance_softirq);
#ifdef CONFIG_NO_HZ_COMMON
nohz.next_balance = jiffies;
+ nohz.next_blocked = jiffies;
zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
- cpu_notifier(sched_ilb_notifier, 0);
#endif
-#endif /* SMP */
-
}
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 91e33cd485f6..980d92bab8ab 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -1,60 +1,98 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
/*
- * Only give sleepers 50% of their service deficit. This allows
- * them to run sooner, but does not allow tons of sleepers to
- * rip the spread apart.
+ * Using the avg_vruntime, do the right thing and preserve lag across
+ * sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled.
*/
-SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true)
-
+SCHED_FEAT(PLACE_LAG, true)
+/*
+ * Give new tasks half a slice to ease into the competition.
+ */
+SCHED_FEAT(PLACE_DEADLINE_INITIAL, true)
+/*
+ * Preserve relative virtual deadline on 'migration'.
+ */
+SCHED_FEAT(PLACE_REL_DEADLINE, true)
/*
- * Place new tasks ahead so that they do not starve already running
- * tasks
+ * Inhibit (wakeup) preemption until the current task has either matched the
+ * 0-lag point or until is has exhausted it's slice.
*/
-SCHED_FEAT(START_DEBIT, true)
+SCHED_FEAT(RUN_TO_PARITY, true)
+/*
+ * Allow wakeup of tasks with a shorter slice to cancel RUN_TO_PARITY for
+ * current.
+ */
+SCHED_FEAT(PREEMPT_SHORT, true)
/*
* Prefer to schedule the task we woke last (assuming it failed
* wakeup-preemption), since its likely going to consume data we
* touched, increases cache locality.
*/
-SCHED_FEAT(NEXT_BUDDY, false)
+SCHED_FEAT(NEXT_BUDDY, true)
/*
- * Prefer to schedule the task that ran last (when we did
- * wake-preempt) as that likely will touch the same data, increases
- * cache locality.
+ * Allow completely ignoring cfs_rq->next; which can be set from various
+ * places:
+ * - NEXT_BUDDY (wakeup preemption)
+ * - yield_to_task()
+ * - cgroup dequeue / pick
*/
-SCHED_FEAT(LAST_BUDDY, true)
+SCHED_FEAT(PICK_BUDDY, true)
/*
- * Consider buddies to be cache hot, decreases the likelyness of a
+ * Consider buddies to be cache hot, decreases the likeliness of a
* cache buddy being migrated away, increases cache locality.
*/
SCHED_FEAT(CACHE_HOT_BUDDY, true)
/*
- * Allow wakeup-time preemption of the current task:
+ * Delay dequeueing tasks until they get selected or woken.
+ *
+ * By delaying the dequeue for non-eligible tasks, they remain in the
+ * competition and can burn off their negative lag. When they get selected
+ * they'll have positive lag by definition.
+ *
+ * DELAY_ZERO clips the lag on dequeue (or wakeup) to 0.
*/
-SCHED_FEAT(WAKEUP_PREEMPTION, true)
+SCHED_FEAT(DELAY_DEQUEUE, true)
+SCHED_FEAT(DELAY_ZERO, true)
/*
- * Use arch dependent cpu capacity functions
+ * Allow wakeup-time preemption of the current task:
*/
-SCHED_FEAT(ARCH_CAPACITY, true)
+SCHED_FEAT(WAKEUP_PREEMPTION, true)
SCHED_FEAT(HRTICK, false)
-SCHED_FEAT(DOUBLE_TICK, false)
-SCHED_FEAT(LB_BIAS, true)
+SCHED_FEAT(HRTICK_DL, false)
/*
* Decrement CPU capacity based on time not spent running tasks
*/
SCHED_FEAT(NONTASK_CAPACITY, true)
+#ifdef CONFIG_PREEMPT_RT
+SCHED_FEAT(TTWU_QUEUE, false)
+#else
+
/*
* Queue remote wakeups on the target CPU and process them
* using the scheduler IPI. Reduces rq->lock contention/bounces.
*/
SCHED_FEAT(TTWU_QUEUE, true)
+#endif
+
+/*
+ * When doing wakeups, attempt to limit superfluous scans of the LLC domain.
+ */
+SCHED_FEAT(SIS_UTIL, true)
+
+/*
+ * Issue a WARN when we do multiple update_rq_clock() calls
+ * in a single rq->lock section. Default disabled because the
+ * annotations are not complete.
+ */
+SCHED_FEAT(WARN_DOUBLE_CLOCK, false)
#ifdef HAVE_RT_PUSH_IPI
/*
@@ -69,30 +107,22 @@ SCHED_FEAT(TTWU_QUEUE, true)
SCHED_FEAT(RT_PUSH_IPI, true)
#endif
-SCHED_FEAT(FORCE_SD_OVERLAP, false)
-SCHED_FEAT(RT_RUNTIME_SHARE, true)
+SCHED_FEAT(RT_RUNTIME_SHARE, false)
SCHED_FEAT(LB_MIN, false)
+SCHED_FEAT(ATTACH_AGE_LOAD, true)
-/*
- * Apply the automatic NUMA scheduling policy. Enabled automatically
- * at runtime if running on a NUMA machine. Can be controlled via
- * numa_balancing=
- */
-#ifdef CONFIG_NUMA_BALANCING
-SCHED_FEAT(NUMA, false)
+SCHED_FEAT(WA_IDLE, true)
+SCHED_FEAT(WA_WEIGHT, true)
+SCHED_FEAT(WA_BIAS, true)
/*
- * NUMA_FAVOUR_HIGHER will favor moving tasks towards nodes where a
- * higher number of hinting faults are recorded during active load
- * balancing.
+ * UtilEstimation. Use estimated CPU utilization.
*/
-SCHED_FEAT(NUMA_FAVOUR_HIGHER, true)
+SCHED_FEAT(UTIL_EST, true)
+
+SCHED_FEAT(LATENCY_WARN, false)
/*
- * NUMA_RESIST_LOWER will resist moving tasks towards nodes where a
- * lower number of hinting faults have been recorded. As this has
- * the potential to prevent a task ever migrating to a new node
- * due to CPU overload it is disabled by default.
+ * Do newidle balancing proportional to its success rate using randomization.
*/
-SCHED_FEAT(NUMA_RESIST_LOWER, false)
-#endif
+SCHED_FEAT(NI_RANDOM, true)
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index fefcb1fa5160..c174afe1dd17 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -1,19 +1,28 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
- * Generic entry point for the idle threads
+ * Generic entry points for the idle threads and
+ * implementation of the idle task scheduling class.
+ *
+ * (NOTE: these are not related to SCHED_IDLE batch scheduled
+ * tasks which are handled in sched/fair.c )
*/
-#include <linux/sched.h>
-#include <linux/cpu.h>
#include <linux/cpuidle.h>
-#include <linux/tick.h>
-#include <linux/mm.h>
-#include <linux/stackprotector.h>
#include <linux/suspend.h>
+#include <linux/livepatch.h>
+#include "sched.h"
+#include "smp.h"
-#include <asm/tlb.h>
-
-#include <trace/events/power.h>
+/* Linker adds these: start and end of __cpuidle functions */
+extern char __cpuidle_text_start[], __cpuidle_text_end[];
-#include "sched.h"
+/**
+ * sched_idle_set_state - Record idle state for the current CPU.
+ * @idle_state: State to record.
+ */
+void sched_idle_set_state(struct cpuidle_state *idle_state)
+{
+ idle_set_state(this_rq(), idle_state);
+}
static int __read_mostly cpu_idle_force_poll;
@@ -31,6 +40,7 @@ void cpu_idle_poll_ctrl(bool enable)
static int __init cpu_idle_poll_setup(char *__unused)
{
cpu_idle_force_poll = 1;
+
return 1;
}
__setup("nohlt", cpu_idle_poll_setup);
@@ -38,21 +48,31 @@ __setup("nohlt", cpu_idle_poll_setup);
static int __init cpu_idle_nopoll_setup(char *__unused)
{
cpu_idle_force_poll = 0;
+
return 1;
}
__setup("hlt", cpu_idle_nopoll_setup);
-#endif
+#endif /* CONFIG_GENERIC_IDLE_POLL_SETUP */
-static inline int cpu_idle_poll(void)
+static noinline int __cpuidle cpu_idle_poll(void)
{
- rcu_idle_enter();
- trace_cpu_idle_rcuidle(0, smp_processor_id());
- local_irq_enable();
+ instrumentation_begin();
+ trace_cpu_idle(0, smp_processor_id());
+ stop_critical_timings();
+ ct_cpuidle_enter();
+
+ raw_local_irq_enable();
while (!tif_need_resched() &&
- (cpu_idle_force_poll || tick_check_broadcast_expired()))
+ (cpu_idle_force_poll || tick_check_broadcast_expired()))
cpu_relax();
- trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
- rcu_idle_exit();
+ raw_local_irq_disable();
+
+ ct_cpuidle_exit();
+ start_critical_timings();
+ trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
+ local_irq_enable();
+ instrumentation_end();
+
return 1;
}
@@ -60,11 +80,85 @@ static inline int cpu_idle_poll(void)
void __weak arch_cpu_idle_prepare(void) { }
void __weak arch_cpu_idle_enter(void) { }
void __weak arch_cpu_idle_exit(void) { }
-void __weak arch_cpu_idle_dead(void) { }
+void __weak __noreturn arch_cpu_idle_dead(void) { while (1); }
void __weak arch_cpu_idle(void)
{
cpu_idle_force_poll = 1;
+}
+
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST_IDLE
+DEFINE_STATIC_KEY_FALSE(arch_needs_tick_broadcast);
+
+static inline void cond_tick_broadcast_enter(void)
+{
+ if (static_branch_unlikely(&arch_needs_tick_broadcast))
+ tick_broadcast_enter();
+}
+
+static inline void cond_tick_broadcast_exit(void)
+{
+ if (static_branch_unlikely(&arch_needs_tick_broadcast))
+ tick_broadcast_exit();
+}
+#else /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST_IDLE: */
+static inline void cond_tick_broadcast_enter(void) { }
+static inline void cond_tick_broadcast_exit(void) { }
+#endif /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST_IDLE */
+
+/**
+ * default_idle_call - Default CPU idle routine.
+ *
+ * To use when the cpuidle framework cannot be used.
+ */
+void __cpuidle default_idle_call(void)
+{
+ instrumentation_begin();
+ if (!current_clr_polling_and_test()) {
+ cond_tick_broadcast_enter();
+ trace_cpu_idle(1, smp_processor_id());
+ stop_critical_timings();
+
+ ct_cpuidle_enter();
+ arch_cpu_idle();
+ ct_cpuidle_exit();
+
+ start_critical_timings();
+ trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
+ cond_tick_broadcast_exit();
+ }
local_irq_enable();
+ instrumentation_end();
+}
+
+static int call_cpuidle_s2idle(struct cpuidle_driver *drv,
+ struct cpuidle_device *dev,
+ u64 max_latency_ns)
+{
+ if (current_clr_polling_and_test())
+ return -EBUSY;
+
+ return cpuidle_enter_s2idle(drv, dev, max_latency_ns);
+}
+
+static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev,
+ int next_state)
+{
+ /*
+ * The idle task must be scheduled, it is pointless to go to idle, just
+ * update no idle residency and return.
+ */
+ if (current_clr_polling_and_test()) {
+ dev->last_residency_ns = 0;
+ local_irq_enable();
+ return -EBUSY;
+ }
+
+ /*
+ * Enter the idle state previously returned by the governor decision.
+ * This function will block until an interrupt occurs and will take
+ * care of re-enabling the local interrupts
+ */
+ return cpuidle_enter(drv, dev, next_state);
}
/**
@@ -72,227 +166,397 @@ void __weak arch_cpu_idle(void)
*
* NOTE: no locks or semaphores should be used here
*
- * On archs that support TIF_POLLING_NRFLAG, is called with polling
+ * On architectures that support TIF_POLLING_NRFLAG, is called with polling
* set, and it returns with polling set. If it ever stops polling, it
* must clear the polling bit.
*/
static void cpuidle_idle_call(void)
{
- struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
+ struct cpuidle_device *dev = cpuidle_get_device();
struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
int next_state, entered_state;
- bool reflect;
/*
* Check if the idle task must be rescheduled. If it is the
- * case, exit the function after re-enabling the local irq.
+ * case, exit the function after re-enabling the local IRQ.
*/
if (need_resched()) {
local_irq_enable();
return;
}
- /*
- * During the idle period, stop measuring the disabled irqs
- * critical sections latencies
- */
- stop_critical_timings();
+ if (cpuidle_not_available(drv, dev)) {
+ tick_nohz_idle_stop_tick();
- /*
- * Tell the RCU framework we are entering an idle section,
- * so no more rcu read side critical sections and one more
- * step to the grace period
- */
- rcu_idle_enter();
-
- if (cpuidle_not_available(drv, dev))
- goto use_default;
+ default_idle_call();
+ goto exit_idle;
+ }
/*
- * Suspend-to-idle ("freeze") is a system state in which all user space
+ * Suspend-to-idle ("s2idle") is a system state in which all user space
* has been frozen, all I/O devices have been suspended and the only
- * activity happens here and in iterrupts (if any). In that case bypass
- * the cpuidle governor and go stratight for the deepest idle state
+ * activity happens here and in interrupts (if any). In that case bypass
+ * the cpuidle governor and go straight for the deepest idle state
* available. Possibly also suspend the local tick and the entire
* timekeeping to prevent timer interrupts from kicking us out of idle
* until a proper wakeup interrupt happens.
*/
- if (idle_should_freeze()) {
- entered_state = cpuidle_enter_freeze(drv, dev);
- if (entered_state >= 0) {
- local_irq_enable();
- goto exit_idle;
+
+ if (idle_should_enter_s2idle() || dev->forced_idle_latency_limit_ns) {
+ u64 max_latency_ns;
+
+ if (idle_should_enter_s2idle()) {
+ max_latency_ns = cpu_wakeup_latency_qos_limit() *
+ NSEC_PER_USEC;
+
+ entered_state = call_cpuidle_s2idle(drv, dev,
+ max_latency_ns);
+ if (entered_state > 0)
+ goto exit_idle;
+ } else {
+ max_latency_ns = dev->forced_idle_latency_limit_ns;
}
- reflect = false;
- next_state = cpuidle_find_deepest_state(drv, dev);
+ tick_nohz_idle_stop_tick();
+
+ next_state = cpuidle_find_deepest_state(drv, dev, max_latency_ns);
+ call_cpuidle(drv, dev, next_state);
} else {
- reflect = true;
+ bool stop_tick = true;
+
/*
* Ask the cpuidle framework to choose a convenient idle state.
*/
- next_state = cpuidle_select(drv, dev);
+ next_state = cpuidle_select(drv, dev, &stop_tick);
+
+ if (stop_tick || tick_nohz_tick_stopped())
+ tick_nohz_idle_stop_tick();
+ else
+ tick_nohz_idle_retain_tick();
+
+ entered_state = call_cpuidle(drv, dev, next_state);
+ /*
+ * Give the governor an opportunity to reflect on the outcome
+ */
+ cpuidle_reflect(dev, entered_state);
}
- /* Fall back to the default arch idle method on errors. */
- if (next_state < 0)
- goto use_default;
+
+exit_idle:
+ __current_set_polling();
/*
- * The idle task must be scheduled, it is pointless to
- * go to idle, just update no idle residency and get
- * out of this function
+ * It is up to the idle functions to re-enable local interrupts
*/
- if (current_clr_polling_and_test()) {
- dev->last_residency = 0;
- entered_state = next_state;
+ if (WARN_ON_ONCE(irqs_disabled()))
local_irq_enable();
- goto exit_idle;
- }
+}
- /* Take note of the planned idle state. */
- idle_set_state(this_rq(), &drv->states[next_state]);
+/*
+ * Generic idle loop implementation
+ *
+ * Called with polling cleared.
+ */
+static void do_idle(void)
+{
+ int cpu = smp_processor_id();
/*
- * Enter the idle state previously returned by the governor decision.
- * This function will block until an interrupt occurs and will take
- * care of re-enabling the local interrupts
+ * Check if we need to update blocked load
*/
- entered_state = cpuidle_enter(drv, dev, next_state);
+ nohz_run_idle_balance(cpu);
- /* The cpu is no longer idle or about to enter idle. */
- idle_set_state(this_rq(), NULL);
+ /*
+ * If the arch has a polling bit, we maintain an invariant:
+ *
+ * Our polling bit is clear if we're not scheduled (i.e. if rq->curr !=
+ * rq->idle). This means that, if rq->idle has the polling bit set,
+ * then setting need_resched is guaranteed to cause the CPU to
+ * reschedule.
+ */
+
+ __current_set_polling();
+ tick_nohz_idle_enter();
+
+ while (!need_resched()) {
+
+ /*
+ * Interrupts shouldn't be re-enabled from that point on until
+ * the CPU sleeping instruction is reached. Otherwise an interrupt
+ * may fire and queue a timer that would be ignored until the CPU
+ * wakes from the sleeping instruction. And testing need_resched()
+ * doesn't tell about pending needed timer reprogram.
+ *
+ * Several cases to consider:
+ *
+ * - SLEEP-UNTIL-PENDING-INTERRUPT based instructions such as
+ * "wfi" or "mwait" are fine because they can be entered with
+ * interrupt disabled.
+ *
+ * - sti;mwait() couple is fine because the interrupts are
+ * re-enabled only upon the execution of mwait, leaving no gap
+ * in-between.
+ *
+ * - ROLLBACK based idle handlers with the sleeping instruction
+ * called with interrupts enabled are NOT fine. In this scheme
+ * when the interrupt detects it has interrupted an idle handler,
+ * it rolls back to its beginning which performs the
+ * need_resched() check before re-executing the sleeping
+ * instruction. This can leak a pending needed timer reprogram.
+ * If such a scheme is really mandatory due to the lack of an
+ * appropriate CPU sleeping instruction, then a FAST-FORWARD
+ * must instead be applied: when the interrupt detects it has
+ * interrupted an idle handler, it must resume to the end of
+ * this idle handler so that the generic idle loop is iterated
+ * again to reprogram the tick.
+ */
+ local_irq_disable();
- if (entered_state == -EBUSY)
- goto use_default;
+ if (cpu_is_offline(cpu)) {
+ cpuhp_report_idle_dead();
+ arch_cpu_idle_dead();
+ }
+
+ arch_cpu_idle_enter();
+ rcu_nocb_flush_deferred_wakeup();
+
+ /*
+ * In poll mode we re-enable interrupts and spin. Also if we
+ * detected in the wakeup from idle path that the tick
+ * broadcast device expired for us, we don't want to go deep
+ * idle as we know that the IPI is going to arrive right away.
+ */
+ if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
+ tick_nohz_idle_restart_tick();
+ cpu_idle_poll();
+ } else {
+ cpuidle_idle_call();
+ }
+ arch_cpu_idle_exit();
+ }
/*
- * Give the governor an opportunity to reflect on the outcome
+ * Since we fell out of the loop above, we know TIF_NEED_RESCHED must
+ * be set, propagate it into PREEMPT_NEED_RESCHED.
+ *
+ * This is required because for polling idle loops we will not have had
+ * an IPI to fold the state for us.
*/
- if (reflect)
- cpuidle_reflect(dev, entered_state);
+ preempt_set_need_resched();
+ tick_nohz_idle_exit();
+ __current_clr_polling();
-exit_idle:
- __current_set_polling();
+ /*
+ * We promise to call sched_ttwu_pending() and reschedule if
+ * need_resched() is set while polling is set. That means that clearing
+ * polling needs to be visible before doing these things.
+ */
+ smp_mb__after_atomic();
/*
- * It is up to the idle functions to reenable local interrupts
+ * RCU relies on this call to be done outside of an RCU read-side
+ * critical section.
*/
- if (WARN_ON_ONCE(irqs_disabled()))
- local_irq_enable();
+ flush_smp_call_function_queue();
+ schedule_idle();
- rcu_idle_exit();
- start_critical_timings();
- return;
+ if (unlikely(klp_patch_pending(current)))
+ klp_update_patch_state(current);
+}
+
+bool cpu_in_idle(unsigned long pc)
+{
+ return pc >= (unsigned long)__cpuidle_text_start &&
+ pc < (unsigned long)__cpuidle_text_end;
+}
+
+struct idle_timer {
+ struct hrtimer timer;
+ int done;
+};
+
+static enum hrtimer_restart idle_inject_timer_fn(struct hrtimer *timer)
+{
+ struct idle_timer *it = container_of(timer, struct idle_timer, timer);
+
+ WRITE_ONCE(it->done, 1);
+ set_tsk_need_resched(current);
+
+ return HRTIMER_NORESTART;
+}
+
+void play_idle_precise(u64 duration_ns, u64 latency_ns)
+{
+ struct idle_timer it;
-use_default:
/*
- * We can't use the cpuidle framework, let's use the default
- * idle routine.
+ * Only FIFO tasks can disable the tick since they don't need the forced
+ * preemption.
*/
- if (current_clr_polling_and_test())
- local_irq_enable();
- else
- arch_cpu_idle();
+ WARN_ON_ONCE(current->policy != SCHED_FIFO);
+ WARN_ON_ONCE(current->nr_cpus_allowed != 1);
+ WARN_ON_ONCE(!(current->flags & PF_KTHREAD));
+ WARN_ON_ONCE(!(current->flags & PF_NO_SETAFFINITY));
+ WARN_ON_ONCE(!duration_ns);
+ WARN_ON_ONCE(current->mm);
+
+ rcu_sleep_check();
+ preempt_disable();
+ current->flags |= PF_IDLE;
+ cpuidle_use_deepest_state(latency_ns);
+
+ it.done = 0;
+ hrtimer_setup_on_stack(&it.timer, idle_inject_timer_fn, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL_HARD);
+ hrtimer_start(&it.timer, ns_to_ktime(duration_ns),
+ HRTIMER_MODE_REL_PINNED_HARD);
+
+ while (!READ_ONCE(it.done))
+ do_idle();
+
+ cpuidle_use_deepest_state(0);
+ current->flags &= ~PF_IDLE;
+
+ preempt_fold_need_resched();
+ preempt_enable();
+}
+EXPORT_SYMBOL_GPL(play_idle_precise);
- goto exit_idle;
+void cpu_startup_entry(enum cpuhp_state state)
+{
+ current->flags |= PF_IDLE;
+ arch_cpu_idle_prepare();
+ cpuhp_online_idle(state);
+ while (1)
+ do_idle();
}
-DEFINE_PER_CPU(bool, cpu_dead_idle);
+/*
+ * idle-task scheduling class.
+ */
+
+static int
+select_task_rq_idle(struct task_struct *p, int cpu, int flags)
+{
+ return task_cpu(p); /* IDLE tasks as never migrated */
+}
+
+static int
+balance_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+{
+ return WARN_ON_ONCE(1);
+}
/*
- * Generic idle loop implementation
- *
- * Called with polling cleared.
+ * Idle tasks are unconditionally rescheduled:
*/
-static void cpu_idle_loop(void)
+static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags)
{
- while (1) {
- /*
- * If the arch has a polling bit, we maintain an invariant:
- *
- * Our polling bit is clear if we're not scheduled (i.e. if
- * rq->curr != rq->idle). This means that, if rq->idle has
- * the polling bit set, then setting need_resched is
- * guaranteed to cause the cpu to reschedule.
- */
+ resched_curr(rq);
+}
- __current_set_polling();
- tick_nohz_idle_enter();
-
- while (!need_resched()) {
- check_pgt_cache();
- rmb();
-
- if (cpu_is_offline(smp_processor_id())) {
- rcu_cpu_notify(NULL, CPU_DYING_IDLE,
- (void *)(long)smp_processor_id());
- smp_mb(); /* all activity before dead. */
- this_cpu_write(cpu_dead_idle, true);
- arch_cpu_idle_dead();
- }
-
- local_irq_disable();
- arch_cpu_idle_enter();
-
- /*
- * In poll mode we reenable interrupts and spin.
- *
- * Also if we detected in the wakeup from idle
- * path that the tick broadcast device expired
- * for us, we don't want to go deep idle as we
- * know that the IPI is going to arrive right
- * away
- */
- if (cpu_idle_force_poll || tick_check_broadcast_expired())
- cpu_idle_poll();
- else
- cpuidle_idle_call();
-
- arch_cpu_idle_exit();
- }
+static void update_curr_idle(struct rq *rq);
- /*
- * Since we fell out of the loop above, we know
- * TIF_NEED_RESCHED must be set, propagate it into
- * PREEMPT_NEED_RESCHED.
- *
- * This is required because for polling idle loops we will
- * not have had an IPI to fold the state for us.
- */
- preempt_set_need_resched();
- tick_nohz_idle_exit();
- __current_clr_polling();
+static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct task_struct *next)
+{
+ update_curr_idle(rq);
+ scx_update_idle(rq, false, true);
+}
- /*
- * We promise to call sched_ttwu_pending and reschedule
- * if need_resched is set while polling is set. That
- * means that clearing polling needs to be visible
- * before doing these things.
- */
- smp_mb__after_atomic();
+static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first)
+{
+ update_idle_core(rq);
+ scx_update_idle(rq, true, true);
+ schedstat_inc(rq->sched_goidle);
+ next->se.exec_start = rq_clock_task(rq);
+}
- sched_ttwu_pending();
- schedule_preempt_disabled();
- }
+struct task_struct *pick_task_idle(struct rq *rq, struct rq_flags *rf)
+{
+ scx_update_idle(rq, true, false);
+ return rq->idle;
}
-void cpu_startup_entry(enum cpuhp_state state)
+/*
+ * It is not legal to sleep in the idle task - print a warning
+ * message if some code attempts to do it:
+ */
+static bool
+dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
{
- /*
- * This #ifdef needs to die, but it's too late in the cycle to
- * make this generic (arm and sh have never invoked the canary
- * init for the non boot cpus!). Will be fixed in 3.11
- */
-#ifdef CONFIG_X86
- /*
- * If we're the non-boot CPU, nothing set the stack canary up
- * for us. The boot CPU already has it initialized but no harm
- * in doing it again. This is a good place for updating it, as
- * we wont ever return from this function (so the invalid
- * canaries already on the stack wont ever trigger).
- */
- boot_init_stack_canary();
-#endif
- arch_cpu_idle_prepare();
- cpu_idle_loop();
+ raw_spin_rq_unlock_irq(rq);
+ printk(KERN_ERR "bad: scheduling from the idle thread!\n");
+ dump_stack();
+ raw_spin_rq_lock_irq(rq);
+ return true;
+}
+
+/*
+ * scheduler tick hitting a task of our scheduling class.
+ *
+ * NOTE: This function can be called remotely by the tick offload that
+ * goes along full dynticks. Therefore no local assumption can be made
+ * and everything must be accessed through the @rq and @curr passed in
+ * parameters.
+ */
+static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
+{
+ update_curr_idle(rq);
+}
+
+static void switching_to_idle(struct rq *rq, struct task_struct *p)
+{
+ BUG();
+}
+
+static void
+prio_changed_idle(struct rq *rq, struct task_struct *p, u64 oldprio)
+{
+ if (p->prio == oldprio)
+ return;
+
+ BUG();
}
+
+static void update_curr_idle(struct rq *rq)
+{
+ struct sched_entity *se = &rq->idle->se;
+ u64 now = rq_clock_task(rq);
+ s64 delta_exec;
+
+ delta_exec = now - se->exec_start;
+ if (unlikely(delta_exec <= 0))
+ return;
+
+ se->exec_start = now;
+
+ dl_server_update_idle(&rq->fair_server, delta_exec);
+}
+
+/*
+ * Simple, special scheduling class for the per-CPU idle tasks:
+ */
+DEFINE_SCHED_CLASS(idle) = {
+
+ .queue_mask = 0,
+
+ /* no enqueue/yield_task for idle tasks */
+
+ /* dequeue is not valid, we print a debug message there: */
+ .dequeue_task = dequeue_task_idle,
+
+ .wakeup_preempt = wakeup_preempt_idle,
+
+ .pick_task = pick_task_idle,
+ .put_prev_task = put_prev_task_idle,
+ .set_next_task = set_next_task_idle,
+
+ .balance = balance_idle,
+ .select_task_rq = select_task_rq_idle,
+ .set_cpus_allowed = set_cpus_allowed_common,
+
+ .task_tick = task_tick_idle,
+
+ .prio_changed = prio_changed_idle,
+ .switching_to = switching_to_idle,
+ .update_curr = update_curr_idle,
+};
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
deleted file mode 100644
index c65dac8c97cd..000000000000
--- a/kernel/sched/idle_task.c
+++ /dev/null
@@ -1,109 +0,0 @@
-#include "sched.h"
-
-/*
- * idle-task scheduling class.
- *
- * (NOTE: these are not related to SCHED_IDLE tasks which are
- * handled in sched/fair.c)
- */
-
-#ifdef CONFIG_SMP
-static int
-select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
-{
- return task_cpu(p); /* IDLE tasks as never migrated */
-}
-#endif /* CONFIG_SMP */
-
-/*
- * Idle tasks are unconditionally rescheduled:
- */
-static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
-{
- resched_curr(rq);
-}
-
-static struct task_struct *
-pick_next_task_idle(struct rq *rq, struct task_struct *prev)
-{
- put_prev_task(rq, prev);
-
- schedstat_inc(rq, sched_goidle);
- return rq->idle;
-}
-
-/*
- * It is not legal to sleep in the idle task - print a warning
- * message if some code attempts to do it:
- */
-static void
-dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
-{
- raw_spin_unlock_irq(&rq->lock);
- printk(KERN_ERR "bad: scheduling from the idle thread!\n");
- dump_stack();
- raw_spin_lock_irq(&rq->lock);
-}
-
-static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
-{
- idle_exit_fair(rq);
- rq_last_tick_reset(rq);
-}
-
-static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
-{
-}
-
-static void set_curr_task_idle(struct rq *rq)
-{
-}
-
-static void switched_to_idle(struct rq *rq, struct task_struct *p)
-{
- BUG();
-}
-
-static void
-prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
-{
- BUG();
-}
-
-static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
-{
- return 0;
-}
-
-static void update_curr_idle(struct rq *rq)
-{
-}
-
-/*
- * Simple, special scheduling class for the per-CPU idle tasks:
- */
-const struct sched_class idle_sched_class = {
- /* .next is NULL */
- /* no enqueue/yield_task for idle tasks */
-
- /* dequeue is not valid, we print a debug message there: */
- .dequeue_task = dequeue_task_idle,
-
- .check_preempt_curr = check_preempt_curr_idle,
-
- .pick_next_task = pick_next_task_idle,
- .put_prev_task = put_prev_task_idle,
-
-#ifdef CONFIG_SMP
- .select_task_rq = select_task_rq_idle,
-#endif
-
- .set_curr_task = set_curr_task_idle,
- .task_tick = task_tick_idle,
-
- .get_rr_interval = get_rr_interval_idle,
-
- .prio_changed = prio_changed_idle,
- .switched_to = switched_to_idle,
- .update_curr = update_curr_idle,
-};
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
new file mode 100644
index 000000000000..3ad0d6df6a0a
--- /dev/null
+++ b/kernel/sched/isolation.c
@@ -0,0 +1,276 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Housekeeping management. Manage the targets for routine code that can run on
+ * any CPU: unbound workqueues, timers, kthreads and any offloadable work.
+ *
+ * Copyright (C) 2017 Red Hat, Inc., Frederic Weisbecker
+ * Copyright (C) 2017-2018 SUSE, Frederic Weisbecker
+ *
+ */
+#include <linux/sched/isolation.h>
+#include "sched.h"
+
+enum hk_flags {
+ HK_FLAG_DOMAIN = BIT(HK_TYPE_DOMAIN),
+ HK_FLAG_MANAGED_IRQ = BIT(HK_TYPE_MANAGED_IRQ),
+ HK_FLAG_KERNEL_NOISE = BIT(HK_TYPE_KERNEL_NOISE),
+};
+
+DEFINE_STATIC_KEY_FALSE(housekeeping_overridden);
+EXPORT_SYMBOL_GPL(housekeeping_overridden);
+
+struct housekeeping {
+ cpumask_var_t cpumasks[HK_TYPE_MAX];
+ unsigned long flags;
+};
+
+static struct housekeeping housekeeping;
+
+bool housekeeping_enabled(enum hk_type type)
+{
+ return !!(housekeeping.flags & BIT(type));
+}
+EXPORT_SYMBOL_GPL(housekeeping_enabled);
+
+int housekeeping_any_cpu(enum hk_type type)
+{
+ int cpu;
+
+ if (static_branch_unlikely(&housekeeping_overridden)) {
+ if (housekeeping.flags & BIT(type)) {
+ cpu = sched_numa_find_closest(housekeeping.cpumasks[type], smp_processor_id());
+ if (cpu < nr_cpu_ids)
+ return cpu;
+
+ cpu = cpumask_any_and_distribute(housekeeping.cpumasks[type], cpu_online_mask);
+ if (likely(cpu < nr_cpu_ids))
+ return cpu;
+ /*
+ * Unless we have another problem this can only happen
+ * at boot time before start_secondary() brings the 1st
+ * housekeeping CPU up.
+ */
+ WARN_ON_ONCE(system_state == SYSTEM_RUNNING ||
+ type != HK_TYPE_TIMER);
+ }
+ }
+ return smp_processor_id();
+}
+EXPORT_SYMBOL_GPL(housekeeping_any_cpu);
+
+const struct cpumask *housekeeping_cpumask(enum hk_type type)
+{
+ if (static_branch_unlikely(&housekeeping_overridden))
+ if (housekeeping.flags & BIT(type))
+ return housekeeping.cpumasks[type];
+ return cpu_possible_mask;
+}
+EXPORT_SYMBOL_GPL(housekeeping_cpumask);
+
+void housekeeping_affine(struct task_struct *t, enum hk_type type)
+{
+ if (static_branch_unlikely(&housekeeping_overridden))
+ if (housekeeping.flags & BIT(type))
+ set_cpus_allowed_ptr(t, housekeeping.cpumasks[type]);
+}
+EXPORT_SYMBOL_GPL(housekeeping_affine);
+
+bool housekeeping_test_cpu(int cpu, enum hk_type type)
+{
+ if (static_branch_unlikely(&housekeeping_overridden))
+ if (housekeeping.flags & BIT(type))
+ return cpumask_test_cpu(cpu, housekeeping.cpumasks[type]);
+ return true;
+}
+EXPORT_SYMBOL_GPL(housekeeping_test_cpu);
+
+void __init housekeeping_init(void)
+{
+ enum hk_type type;
+
+ if (!housekeeping.flags)
+ return;
+
+ static_branch_enable(&housekeeping_overridden);
+
+ if (housekeeping.flags & HK_FLAG_KERNEL_NOISE)
+ sched_tick_offload_init();
+
+ for_each_set_bit(type, &housekeeping.flags, HK_TYPE_MAX) {
+ /* We need at least one CPU to handle housekeeping work */
+ WARN_ON_ONCE(cpumask_empty(housekeeping.cpumasks[type]));
+ }
+}
+
+static void __init housekeeping_setup_type(enum hk_type type,
+ cpumask_var_t housekeeping_staging)
+{
+
+ alloc_bootmem_cpumask_var(&housekeeping.cpumasks[type]);
+ cpumask_copy(housekeeping.cpumasks[type],
+ housekeeping_staging);
+}
+
+static int __init housekeeping_setup(char *str, unsigned long flags)
+{
+ cpumask_var_t non_housekeeping_mask, housekeeping_staging;
+ unsigned int first_cpu;
+ int err = 0;
+
+ if ((flags & HK_FLAG_KERNEL_NOISE) && !(housekeeping.flags & HK_FLAG_KERNEL_NOISE)) {
+ if (!IS_ENABLED(CONFIG_NO_HZ_FULL)) {
+ pr_warn("Housekeeping: nohz unsupported."
+ " Build with CONFIG_NO_HZ_FULL\n");
+ return 0;
+ }
+ }
+
+ alloc_bootmem_cpumask_var(&non_housekeeping_mask);
+ if (cpulist_parse(str, non_housekeeping_mask) < 0) {
+ pr_warn("Housekeeping: nohz_full= or isolcpus= incorrect CPU range\n");
+ goto free_non_housekeeping_mask;
+ }
+
+ alloc_bootmem_cpumask_var(&housekeeping_staging);
+ cpumask_andnot(housekeeping_staging,
+ cpu_possible_mask, non_housekeeping_mask);
+
+ first_cpu = cpumask_first_and(cpu_present_mask, housekeeping_staging);
+ if (first_cpu >= nr_cpu_ids || first_cpu >= setup_max_cpus) {
+ __cpumask_set_cpu(smp_processor_id(), housekeeping_staging);
+ __cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask);
+ if (!housekeeping.flags) {
+ pr_warn("Housekeeping: must include one present CPU, "
+ "using boot CPU:%d\n", smp_processor_id());
+ }
+ }
+
+ if (cpumask_empty(non_housekeeping_mask))
+ goto free_housekeeping_staging;
+
+ if (!housekeeping.flags) {
+ /* First setup call ("nohz_full=" or "isolcpus=") */
+ enum hk_type type;
+
+ for_each_set_bit(type, &flags, HK_TYPE_MAX)
+ housekeeping_setup_type(type, housekeeping_staging);
+ } else {
+ /* Second setup call ("nohz_full=" after "isolcpus=" or the reverse) */
+ enum hk_type type;
+ unsigned long iter_flags = flags & housekeeping.flags;
+
+ for_each_set_bit(type, &iter_flags, HK_TYPE_MAX) {
+ if (!cpumask_equal(housekeeping_staging,
+ housekeeping.cpumasks[type])) {
+ pr_warn("Housekeeping: nohz_full= must match isolcpus=\n");
+ goto free_housekeeping_staging;
+ }
+ }
+
+ /*
+ * Check the combination of nohz_full and isolcpus=domain,
+ * necessary to avoid problems with the timer migration
+ * hierarchy. managed_irq is ignored by this check since it
+ * isn't considered in the timer migration logic.
+ */
+ iter_flags = housekeeping.flags & (HK_FLAG_KERNEL_NOISE | HK_FLAG_DOMAIN);
+ type = find_first_bit(&iter_flags, HK_TYPE_MAX);
+ /*
+ * Pass the check if none of these flags were previously set or
+ * are not in the current selection.
+ */
+ iter_flags = flags & (HK_FLAG_KERNEL_NOISE | HK_FLAG_DOMAIN);
+ first_cpu = (type == HK_TYPE_MAX || !iter_flags) ? 0 :
+ cpumask_first_and_and(cpu_present_mask,
+ housekeeping_staging, housekeeping.cpumasks[type]);
+ if (first_cpu >= min(nr_cpu_ids, setup_max_cpus)) {
+ pr_warn("Housekeeping: must include one present CPU "
+ "neither in nohz_full= nor in isolcpus=domain, "
+ "ignoring setting %s\n", str);
+ goto free_housekeeping_staging;
+ }
+
+ iter_flags = flags & ~housekeeping.flags;
+
+ for_each_set_bit(type, &iter_flags, HK_TYPE_MAX)
+ housekeeping_setup_type(type, housekeeping_staging);
+ }
+
+ if ((flags & HK_FLAG_KERNEL_NOISE) && !(housekeeping.flags & HK_FLAG_KERNEL_NOISE))
+ tick_nohz_full_setup(non_housekeeping_mask);
+
+ housekeeping.flags |= flags;
+ err = 1;
+
+free_housekeeping_staging:
+ free_bootmem_cpumask_var(housekeeping_staging);
+free_non_housekeeping_mask:
+ free_bootmem_cpumask_var(non_housekeeping_mask);
+
+ return err;
+}
+
+static int __init housekeeping_nohz_full_setup(char *str)
+{
+ unsigned long flags;
+
+ flags = HK_FLAG_KERNEL_NOISE;
+
+ return housekeeping_setup(str, flags);
+}
+__setup("nohz_full=", housekeeping_nohz_full_setup);
+
+static int __init housekeeping_isolcpus_setup(char *str)
+{
+ unsigned long flags = 0;
+ bool illegal = false;
+ char *par;
+ int len;
+
+ while (isalpha(*str)) {
+ /*
+ * isolcpus=nohz is equivalent to nohz_full.
+ */
+ if (!strncmp(str, "nohz,", 5)) {
+ str += 5;
+ flags |= HK_FLAG_KERNEL_NOISE;
+ continue;
+ }
+
+ if (!strncmp(str, "domain,", 7)) {
+ str += 7;
+ flags |= HK_FLAG_DOMAIN;
+ continue;
+ }
+
+ if (!strncmp(str, "managed_irq,", 12)) {
+ str += 12;
+ flags |= HK_FLAG_MANAGED_IRQ;
+ continue;
+ }
+
+ /*
+ * Skip unknown sub-parameter and validate that it is not
+ * containing an invalid character.
+ */
+ for (par = str, len = 0; *str && *str != ','; str++, len++) {
+ if (!isalpha(*str) && *str != '_')
+ illegal = true;
+ }
+
+ if (illegal) {
+ pr_warn("isolcpus: Invalid flag %.*s\n", len, par);
+ return 0;
+ }
+
+ pr_info("isolcpus: Skipped unknown flag %.*s\n", len, par);
+ str++;
+ }
+
+ /* Default behaviour for isolcpus without flags */
+ if (!flags)
+ flags |= HK_FLAG_DOMAIN;
+
+ return housekeeping_setup(str, flags);
+}
+__setup("isolcpus=", housekeeping_isolcpus_setup);
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
new file mode 100644
index 000000000000..b601e0243d0e
--- /dev/null
+++ b/kernel/sched/loadavg.c
@@ -0,0 +1,399 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * kernel/sched/loadavg.c
+ *
+ * This file contains the magic bits required to compute the global loadavg
+ * figure. Its a silly number but people think its important. We go through
+ * great pains to make it work on big machines and tickless kernels.
+ */
+#include <linux/sched/nohz.h>
+#include "sched.h"
+
+/*
+ * Global load-average calculations
+ *
+ * We take a distributed and async approach to calculating the global load-avg
+ * in order to minimize overhead.
+ *
+ * The global load average is an exponentially decaying average of nr_running +
+ * nr_uninterruptible.
+ *
+ * Once every LOAD_FREQ:
+ *
+ * nr_active = 0;
+ * for_each_possible_cpu(cpu)
+ * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible;
+ *
+ * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
+ *
+ * Due to a number of reasons the above turns in the mess below:
+ *
+ * - for_each_possible_cpu() is prohibitively expensive on machines with
+ * serious number of CPUs, therefore we need to take a distributed approach
+ * to calculating nr_active.
+ *
+ * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
+ * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
+ *
+ * So assuming nr_active := 0 when we start out -- true per definition, we
+ * can simply take per-CPU deltas and fold those into a global accumulate
+ * to obtain the same result. See calc_load_fold_active().
+ *
+ * Furthermore, in order to avoid synchronizing all per-CPU delta folding
+ * across the machine, we assume 10 ticks is sufficient time for every
+ * CPU to have completed this task.
+ *
+ * This places an upper-bound on the IRQ-off latency of the machine. Then
+ * again, being late doesn't loose the delta, just wrecks the sample.
+ *
+ * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-CPU because
+ * this would add another cross-CPU cache-line miss and atomic operation
+ * to the wakeup path. Instead we increment on whatever CPU the task ran
+ * when it went into uninterruptible state and decrement on whatever CPU
+ * did the wakeup. This means that only the sum of nr_uninterruptible over
+ * all CPUs yields the correct result.
+ *
+ * This covers the NO_HZ=n code, for extra head-aches, see the comment below.
+ */
+
+/* Variables and functions for calc_load */
+atomic_long_t calc_load_tasks;
+unsigned long calc_load_update;
+unsigned long avenrun[3];
+EXPORT_SYMBOL(avenrun); /* should be removed */
+
+/**
+ * get_avenrun - get the load average array
+ * @loads: pointer to destination load array
+ * @offset: offset to add
+ * @shift: shift count to shift the result left
+ *
+ * These values are estimates at best, so no need for locking.
+ */
+void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
+{
+ loads[0] = (avenrun[0] + offset) << shift;
+ loads[1] = (avenrun[1] + offset) << shift;
+ loads[2] = (avenrun[2] + offset) << shift;
+}
+
+long calc_load_fold_active(struct rq *this_rq, long adjust)
+{
+ long nr_active, delta = 0;
+
+ nr_active = this_rq->nr_running - adjust;
+ nr_active += (long)this_rq->nr_uninterruptible;
+
+ if (nr_active != this_rq->calc_load_active) {
+ delta = nr_active - this_rq->calc_load_active;
+ this_rq->calc_load_active = nr_active;
+ }
+
+ return delta;
+}
+
+/**
+ * fixed_power_int - compute: x^n, in O(log n) time
+ *
+ * @x: base of the power
+ * @frac_bits: fractional bits of @x
+ * @n: power to raise @x to.
+ *
+ * By exploiting the relation between the definition of the natural power
+ * function: x^n := x*x*...*x (x multiplied by itself for n times), and
+ * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
+ * (where: n_i \elem {0, 1}, the binary vector representing n),
+ * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
+ * of course trivially computable in O(log_2 n), the length of our binary
+ * vector.
+ */
+static unsigned long
+fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
+{
+ unsigned long result = 1UL << frac_bits;
+
+ if (n) {
+ for (;;) {
+ if (n & 1) {
+ result *= x;
+ result += 1UL << (frac_bits - 1);
+ result >>= frac_bits;
+ }
+ n >>= 1;
+ if (!n)
+ break;
+ x *= x;
+ x += 1UL << (frac_bits - 1);
+ x >>= frac_bits;
+ }
+ }
+
+ return result;
+}
+
+/*
+ * a1 = a0 * e + a * (1 - e)
+ *
+ * a2 = a1 * e + a * (1 - e)
+ * = (a0 * e + a * (1 - e)) * e + a * (1 - e)
+ * = a0 * e^2 + a * (1 - e) * (1 + e)
+ *
+ * a3 = a2 * e + a * (1 - e)
+ * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
+ * = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
+ *
+ * ...
+ *
+ * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
+ * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
+ * = a0 * e^n + a * (1 - e^n)
+ *
+ * [1] application of the geometric series:
+ *
+ * n 1 - x^(n+1)
+ * S_n := \Sum x^i = -------------
+ * i=0 1 - x
+ */
+unsigned long
+calc_load_n(unsigned long load, unsigned long exp,
+ unsigned long active, unsigned int n)
+{
+ return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
+}
+
+#ifdef CONFIG_NO_HZ_COMMON
+/*
+ * Handle NO_HZ for the global load-average.
+ *
+ * Since the above described distributed algorithm to compute the global
+ * load-average relies on per-CPU sampling from the tick, it is affected by
+ * NO_HZ.
+ *
+ * The basic idea is to fold the nr_active delta into a global NO_HZ-delta upon
+ * entering NO_HZ state such that we can include this as an 'extra' CPU delta
+ * when we read the global state.
+ *
+ * Obviously reality has to ruin such a delightfully simple scheme:
+ *
+ * - When we go NO_HZ idle during the window, we can negate our sample
+ * contribution, causing under-accounting.
+ *
+ * We avoid this by keeping two NO_HZ-delta counters and flipping them
+ * when the window starts, thus separating old and new NO_HZ load.
+ *
+ * The only trick is the slight shift in index flip for read vs write.
+ *
+ * 0s 5s 10s 15s
+ * +10 +10 +10 +10
+ * |-|-----------|-|-----------|-|-----------|-|
+ * r:0 0 1 1 0 0 1 1 0
+ * w:0 1 1 0 0 1 1 0 0
+ *
+ * This ensures we'll fold the old NO_HZ contribution in this window while
+ * accumulating the new one.
+ *
+ * - When we wake up from NO_HZ during the window, we push up our
+ * contribution, since we effectively move our sample point to a known
+ * busy state.
+ *
+ * This is solved by pushing the window forward, and thus skipping the
+ * sample, for this CPU (effectively using the NO_HZ-delta for this CPU which
+ * was in effect at the time the window opened). This also solves the issue
+ * of having to deal with a CPU having been in NO_HZ for multiple LOAD_FREQ
+ * intervals.
+ *
+ * When making the ILB scale, we should try to pull this in as well.
+ */
+static atomic_long_t calc_load_nohz[2];
+static int calc_load_idx;
+
+static inline int calc_load_write_idx(void)
+{
+ int idx = calc_load_idx;
+
+ /*
+ * See calc_global_nohz(), if we observe the new index, we also
+ * need to observe the new update time.
+ */
+ smp_rmb();
+
+ /*
+ * If the folding window started, make sure we start writing in the
+ * next NO_HZ-delta.
+ */
+ if (!time_before(jiffies, READ_ONCE(calc_load_update)))
+ idx++;
+
+ return idx & 1;
+}
+
+static inline int calc_load_read_idx(void)
+{
+ return calc_load_idx & 1;
+}
+
+static void calc_load_nohz_fold(struct rq *rq)
+{
+ long delta;
+
+ delta = calc_load_fold_active(rq, 0);
+ if (delta) {
+ int idx = calc_load_write_idx();
+
+ atomic_long_add(delta, &calc_load_nohz[idx]);
+ }
+}
+
+void calc_load_nohz_start(void)
+{
+ /*
+ * We're going into NO_HZ mode, if there's any pending delta, fold it
+ * into the pending NO_HZ delta.
+ */
+ calc_load_nohz_fold(this_rq());
+}
+
+/*
+ * Keep track of the load for NOHZ_FULL, must be called between
+ * calc_load_nohz_{start,stop}().
+ */
+void calc_load_nohz_remote(struct rq *rq)
+{
+ calc_load_nohz_fold(rq);
+}
+
+void calc_load_nohz_stop(void)
+{
+ struct rq *this_rq = this_rq();
+
+ /*
+ * If we're still before the pending sample window, we're done.
+ */
+ this_rq->calc_load_update = READ_ONCE(calc_load_update);
+ if (time_before(jiffies, this_rq->calc_load_update))
+ return;
+
+ /*
+ * We woke inside or after the sample window, this means we're already
+ * accounted through the nohz accounting, so skip the entire deal and
+ * sync up for the next window.
+ */
+ if (time_before(jiffies, this_rq->calc_load_update + 10))
+ this_rq->calc_load_update += LOAD_FREQ;
+}
+
+static long calc_load_nohz_read(void)
+{
+ int idx = calc_load_read_idx();
+ long delta = 0;
+
+ if (atomic_long_read(&calc_load_nohz[idx]))
+ delta = atomic_long_xchg(&calc_load_nohz[idx], 0);
+
+ return delta;
+}
+
+/*
+ * NO_HZ can leave us missing all per-CPU ticks calling
+ * calc_load_fold_active(), but since a NO_HZ CPU folds its delta into
+ * calc_load_nohz per calc_load_nohz_start(), all we need to do is fold
+ * in the pending NO_HZ delta if our NO_HZ period crossed a load cycle boundary.
+ *
+ * Once we've updated the global active value, we need to apply the exponential
+ * weights adjusted to the number of cycles missed.
+ */
+static void calc_global_nohz(void)
+{
+ unsigned long sample_window;
+ long delta, active, n;
+
+ sample_window = READ_ONCE(calc_load_update);
+ if (!time_before(jiffies, sample_window + 10)) {
+ /*
+ * Catch-up, fold however many we are behind still
+ */
+ delta = jiffies - sample_window - 10;
+ n = 1 + (delta / LOAD_FREQ);
+
+ active = atomic_long_read(&calc_load_tasks);
+ active = active > 0 ? active * FIXED_1 : 0;
+
+ avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
+ avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
+ avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
+
+ WRITE_ONCE(calc_load_update, sample_window + n * LOAD_FREQ);
+ }
+
+ /*
+ * Flip the NO_HZ index...
+ *
+ * Make sure we first write the new time then flip the index, so that
+ * calc_load_write_idx() will see the new time when it reads the new
+ * index, this avoids a double flip messing things up.
+ */
+ smp_wmb();
+ calc_load_idx++;
+}
+#else /* !CONFIG_NO_HZ_COMMON: */
+
+static inline long calc_load_nohz_read(void) { return 0; }
+static inline void calc_global_nohz(void) { }
+
+#endif /* !CONFIG_NO_HZ_COMMON */
+
+/*
+ * calc_load - update the avenrun load estimates 10 ticks after the
+ * CPUs have updated calc_load_tasks.
+ *
+ * Called from the global timer code.
+ */
+void calc_global_load(void)
+{
+ unsigned long sample_window;
+ long active, delta;
+
+ sample_window = READ_ONCE(calc_load_update);
+ if (time_before(jiffies, sample_window + 10))
+ return;
+
+ /*
+ * Fold the 'old' NO_HZ-delta to include all NO_HZ CPUs.
+ */
+ delta = calc_load_nohz_read();
+ if (delta)
+ atomic_long_add(delta, &calc_load_tasks);
+
+ active = atomic_long_read(&calc_load_tasks);
+ active = active > 0 ? active * FIXED_1 : 0;
+
+ avenrun[0] = calc_load(avenrun[0], EXP_1, active);
+ avenrun[1] = calc_load(avenrun[1], EXP_5, active);
+ avenrun[2] = calc_load(avenrun[2], EXP_15, active);
+
+ WRITE_ONCE(calc_load_update, sample_window + LOAD_FREQ);
+
+ /*
+ * In case we went to NO_HZ for multiple LOAD_FREQ intervals
+ * catch up in bulk.
+ */
+ calc_global_nohz();
+}
+
+/*
+ * Called from sched_tick() to periodically update this CPU's
+ * active count.
+ */
+void calc_global_load_tick(struct rq *this_rq)
+{
+ long delta;
+
+ if (time_before(jiffies, this_rq->calc_load_update))
+ return;
+
+ delta = calc_load_fold_active(this_rq, 0);
+ if (delta)
+ atomic_long_add(delta, &calc_load_tasks);
+
+ this_rq->calc_load_update += LOAD_FREQ;
+}
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
new file mode 100644
index 000000000000..623445603725
--- /dev/null
+++ b/kernel/sched/membarrier.c
@@ -0,0 +1,679 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ *
+ * membarrier system call
+ */
+#include <uapi/linux/membarrier.h>
+#include "sched.h"
+
+/*
+ * For documentation purposes, here are some membarrier ordering
+ * scenarios to keep in mind:
+ *
+ * A) Userspace thread execution after IPI vs membarrier's memory
+ * barrier before sending the IPI
+ *
+ * Userspace variables:
+ *
+ * int x = 0, y = 0;
+ *
+ * The memory barrier at the start of membarrier() on CPU0 is necessary in
+ * order to enforce the guarantee that any writes occurring on CPU0 before
+ * the membarrier() is executed will be visible to any code executing on
+ * CPU1 after the IPI-induced memory barrier:
+ *
+ * CPU0 CPU1
+ *
+ * x = 1
+ * membarrier():
+ * a: smp_mb()
+ * b: send IPI IPI-induced mb
+ * c: smp_mb()
+ * r2 = y
+ * y = 1
+ * barrier()
+ * r1 = x
+ *
+ * BUG_ON(r1 == 0 && r2 == 0)
+ *
+ * The write to y and load from x by CPU1 are unordered by the hardware,
+ * so it's possible to have "r1 = x" reordered before "y = 1" at any
+ * point after (b). If the memory barrier at (a) is omitted, then "x = 1"
+ * can be reordered after (a) (although not after (c)), so we get r1 == 0
+ * and r2 == 0. This violates the guarantee that membarrier() is
+ * supposed by provide.
+ *
+ * The timing of the memory barrier at (a) has to ensure that it executes
+ * before the IPI-induced memory barrier on CPU1.
+ *
+ * B) Userspace thread execution before IPI vs membarrier's memory
+ * barrier after completing the IPI
+ *
+ * Userspace variables:
+ *
+ * int x = 0, y = 0;
+ *
+ * The memory barrier at the end of membarrier() on CPU0 is necessary in
+ * order to enforce the guarantee that any writes occurring on CPU1 before
+ * the membarrier() is executed will be visible to any code executing on
+ * CPU0 after the membarrier():
+ *
+ * CPU0 CPU1
+ *
+ * x = 1
+ * barrier()
+ * y = 1
+ * r2 = y
+ * membarrier():
+ * a: smp_mb()
+ * b: send IPI IPI-induced mb
+ * c: smp_mb()
+ * r1 = x
+ * BUG_ON(r1 == 0 && r2 == 1)
+ *
+ * The writes to x and y are unordered by the hardware, so it's possible to
+ * have "r2 = 1" even though the write to x doesn't execute until (b). If
+ * the memory barrier at (c) is omitted then "r1 = x" can be reordered
+ * before (b) (although not before (a)), so we get "r1 = 0". This violates
+ * the guarantee that membarrier() is supposed to provide.
+ *
+ * The timing of the memory barrier at (c) has to ensure that it executes
+ * after the IPI-induced memory barrier on CPU1.
+ *
+ * C) Scheduling userspace thread -> kthread -> userspace thread vs membarrier
+ *
+ * CPU0 CPU1
+ *
+ * membarrier():
+ * a: smp_mb()
+ * d: switch to kthread (includes mb)
+ * b: read rq->curr->mm == NULL
+ * e: switch to user (includes mb)
+ * c: smp_mb()
+ *
+ * Using the scenario from (A), we can show that (a) needs to be paired
+ * with (e). Using the scenario from (B), we can show that (c) needs to
+ * be paired with (d).
+ *
+ * D) exit_mm vs membarrier
+ *
+ * Two thread groups are created, A and B. Thread group B is created by
+ * issuing clone from group A with flag CLONE_VM set, but not CLONE_THREAD.
+ * Let's assume we have a single thread within each thread group (Thread A
+ * and Thread B). Thread A runs on CPU0, Thread B runs on CPU1.
+ *
+ * CPU0 CPU1
+ *
+ * membarrier():
+ * a: smp_mb()
+ * exit_mm():
+ * d: smp_mb()
+ * e: current->mm = NULL
+ * b: read rq->curr->mm == NULL
+ * c: smp_mb()
+ *
+ * Using scenario (B), we can show that (c) needs to be paired with (d).
+ *
+ * E) kthread_{use,unuse}_mm vs membarrier
+ *
+ * CPU0 CPU1
+ *
+ * membarrier():
+ * a: smp_mb()
+ * kthread_unuse_mm()
+ * d: smp_mb()
+ * e: current->mm = NULL
+ * b: read rq->curr->mm == NULL
+ * kthread_use_mm()
+ * f: current->mm = mm
+ * g: smp_mb()
+ * c: smp_mb()
+ *
+ * Using the scenario from (A), we can show that (a) needs to be paired
+ * with (g). Using the scenario from (B), we can show that (c) needs to
+ * be paired with (d).
+ */
+
+/*
+ * Bitmask made from a "or" of all commands within enum membarrier_cmd,
+ * except MEMBARRIER_CMD_QUERY.
+ */
+#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
+#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \
+ (MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE \
+ | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)
+#else
+#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK 0
+#endif
+
+#ifdef CONFIG_RSEQ
+#define MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK \
+ (MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ \
+ | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ)
+#else
+#define MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK 0
+#endif
+
+#define MEMBARRIER_CMD_BITMASK \
+ (MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \
+ | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \
+ | MEMBARRIER_CMD_PRIVATE_EXPEDITED \
+ | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \
+ | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \
+ | MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK \
+ | MEMBARRIER_CMD_GET_REGISTRATIONS)
+
+static DEFINE_MUTEX(membarrier_ipi_mutex);
+#define SERIALIZE_IPI() guard(mutex)(&membarrier_ipi_mutex)
+
+static void ipi_mb(void *info)
+{
+ smp_mb(); /* IPIs should be serializing but paranoid. */
+}
+
+static void ipi_sync_core(void *info)
+{
+ /*
+ * The smp_mb() in membarrier after all the IPIs is supposed to
+ * ensure that memory on remote CPUs that occur before the IPI
+ * become visible to membarrier()'s caller -- see scenario B in
+ * the big comment at the top of this file.
+ *
+ * A sync_core() would provide this guarantee, but
+ * sync_core_before_usermode() might end up being deferred until
+ * after membarrier()'s smp_mb().
+ */
+ smp_mb(); /* IPIs should be serializing but paranoid. */
+
+ sync_core_before_usermode();
+}
+
+static void ipi_rseq(void *info)
+{
+ /*
+ * Ensure that all stores done by the calling thread are visible
+ * to the current task before the current task resumes. We could
+ * probably optimize this away on most architectures, but by the
+ * time we've already sent an IPI, the cost of the extra smp_mb()
+ * is negligible.
+ */
+ smp_mb();
+ rseq_sched_switch_event(current);
+}
+
+static void ipi_sync_rq_state(void *info)
+{
+ struct mm_struct *mm = (struct mm_struct *) info;
+
+ if (current->mm != mm)
+ return;
+ this_cpu_write(runqueues.membarrier_state,
+ atomic_read(&mm->membarrier_state));
+ /*
+ * Issue a memory barrier after setting
+ * MEMBARRIER_STATE_GLOBAL_EXPEDITED in the current runqueue to
+ * guarantee that no memory access following registration is reordered
+ * before registration.
+ */
+ smp_mb();
+}
+
+void membarrier_exec_mmap(struct mm_struct *mm)
+{
+ /*
+ * Issue a memory barrier before clearing membarrier_state to
+ * guarantee that no memory access prior to exec is reordered after
+ * clearing this state.
+ */
+ smp_mb();
+ atomic_set(&mm->membarrier_state, 0);
+ /*
+ * Keep the runqueue membarrier_state in sync with this mm
+ * membarrier_state.
+ */
+ this_cpu_write(runqueues.membarrier_state, 0);
+}
+
+void membarrier_update_current_mm(struct mm_struct *next_mm)
+{
+ struct rq *rq = this_rq();
+ int membarrier_state = 0;
+
+ if (next_mm)
+ membarrier_state = atomic_read(&next_mm->membarrier_state);
+ if (READ_ONCE(rq->membarrier_state) == membarrier_state)
+ return;
+ WRITE_ONCE(rq->membarrier_state, membarrier_state);
+}
+
+static int membarrier_global_expedited(void)
+{
+ int cpu;
+ cpumask_var_t tmpmask;
+
+ if (num_online_cpus() == 1)
+ return 0;
+
+ /*
+ * Matches memory barriers after rq->curr modification in
+ * scheduler.
+ */
+ smp_mb(); /* system call entry is not a mb. */
+
+ if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+ return -ENOMEM;
+
+ SERIALIZE_IPI();
+ cpus_read_lock();
+ rcu_read_lock();
+ for_each_online_cpu(cpu) {
+ struct task_struct *p;
+
+ /*
+ * Skipping the current CPU is OK even through we can be
+ * migrated at any point. The current CPU, at the point
+ * where we read raw_smp_processor_id(), is ensured to
+ * be in program order with respect to the caller
+ * thread. Therefore, we can skip this CPU from the
+ * iteration.
+ */
+ if (cpu == raw_smp_processor_id())
+ continue;
+
+ if (!(READ_ONCE(cpu_rq(cpu)->membarrier_state) &
+ MEMBARRIER_STATE_GLOBAL_EXPEDITED))
+ continue;
+
+ /*
+ * Skip the CPU if it runs a kernel thread which is not using
+ * a task mm.
+ */
+ p = rcu_dereference(cpu_rq(cpu)->curr);
+ if (!p->mm)
+ continue;
+
+ __cpumask_set_cpu(cpu, tmpmask);
+ }
+ rcu_read_unlock();
+
+ preempt_disable();
+ smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
+ preempt_enable();
+
+ free_cpumask_var(tmpmask);
+ cpus_read_unlock();
+
+ /*
+ * Memory barrier on the caller thread _after_ we finished
+ * waiting for the last IPI. Matches memory barriers before
+ * rq->curr modification in scheduler.
+ */
+ smp_mb(); /* exit from system call is not a mb */
+ return 0;
+}
+
+static int membarrier_private_expedited(int flags, int cpu_id)
+{
+ cpumask_var_t tmpmask;
+ struct mm_struct *mm = current->mm;
+ smp_call_func_t ipi_func = ipi_mb;
+
+ if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
+ if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
+ return -EINVAL;
+ if (!(atomic_read(&mm->membarrier_state) &
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
+ return -EPERM;
+ ipi_func = ipi_sync_core;
+ prepare_sync_core_cmd(mm);
+ } else if (flags == MEMBARRIER_FLAG_RSEQ) {
+ if (!IS_ENABLED(CONFIG_RSEQ))
+ return -EINVAL;
+ if (!(atomic_read(&mm->membarrier_state) &
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY))
+ return -EPERM;
+ ipi_func = ipi_rseq;
+ } else {
+ WARN_ON_ONCE(flags);
+ if (!(atomic_read(&mm->membarrier_state) &
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
+ return -EPERM;
+ }
+
+ if (flags != MEMBARRIER_FLAG_SYNC_CORE &&
+ (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1))
+ return 0;
+
+ /*
+ * Matches memory barriers after rq->curr modification in
+ * scheduler.
+ *
+ * On RISC-V, this barrier pairing is also needed for the
+ * SYNC_CORE command when switching between processes, cf.
+ * the inline comments in membarrier_arch_switch_mm().
+ */
+ smp_mb(); /* system call entry is not a mb. */
+
+ if (cpu_id < 0 && !zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+ return -ENOMEM;
+
+ SERIALIZE_IPI();
+ cpus_read_lock();
+
+ if (cpu_id >= 0) {
+ struct task_struct *p;
+
+ if (cpu_id >= nr_cpu_ids || !cpu_online(cpu_id))
+ goto out;
+ rcu_read_lock();
+ p = rcu_dereference(cpu_rq(cpu_id)->curr);
+ if (!p || p->mm != mm) {
+ rcu_read_unlock();
+ goto out;
+ }
+ rcu_read_unlock();
+ } else {
+ int cpu;
+
+ rcu_read_lock();
+ for_each_online_cpu(cpu) {
+ struct task_struct *p;
+
+ p = rcu_dereference(cpu_rq(cpu)->curr);
+ if (p && p->mm == mm)
+ __cpumask_set_cpu(cpu, tmpmask);
+ }
+ rcu_read_unlock();
+ }
+
+ if (cpu_id >= 0) {
+ /*
+ * smp_call_function_single() will call ipi_func() if cpu_id
+ * is the calling CPU.
+ */
+ smp_call_function_single(cpu_id, ipi_func, NULL, 1);
+ } else {
+ /*
+ * For regular membarrier, we can save a few cycles by
+ * skipping the current cpu -- we're about to do smp_mb()
+ * below, and if we migrate to a different cpu, this cpu
+ * and the new cpu will execute a full barrier in the
+ * scheduler.
+ *
+ * For SYNC_CORE, we do need a barrier on the current cpu --
+ * otherwise, if we are migrated and replaced by a different
+ * task in the same mm just before, during, or after
+ * membarrier, we will end up with some thread in the mm
+ * running without a core sync.
+ *
+ * For RSEQ, don't invoke rseq_sched_switch_event() on the
+ * caller. User code is not supposed to issue syscalls at
+ * all from inside an rseq critical section.
+ */
+ if (flags != MEMBARRIER_FLAG_SYNC_CORE) {
+ preempt_disable();
+ smp_call_function_many(tmpmask, ipi_func, NULL, true);
+ preempt_enable();
+ } else {
+ on_each_cpu_mask(tmpmask, ipi_func, NULL, true);
+ }
+ }
+
+out:
+ if (cpu_id < 0)
+ free_cpumask_var(tmpmask);
+ cpus_read_unlock();
+
+ /*
+ * Memory barrier on the caller thread _after_ we finished
+ * waiting for the last IPI. Matches memory barriers before
+ * rq->curr modification in scheduler.
+ */
+ smp_mb(); /* exit from system call is not a mb */
+
+ return 0;
+}
+
+static int sync_runqueues_membarrier_state(struct mm_struct *mm)
+{
+ int membarrier_state = atomic_read(&mm->membarrier_state);
+ cpumask_var_t tmpmask;
+ int cpu;
+
+ if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) {
+ this_cpu_write(runqueues.membarrier_state, membarrier_state);
+
+ /*
+ * For single mm user, we can simply issue a memory barrier
+ * after setting MEMBARRIER_STATE_GLOBAL_EXPEDITED in the
+ * mm and in the current runqueue to guarantee that no memory
+ * access following registration is reordered before
+ * registration.
+ */
+ smp_mb();
+ return 0;
+ }
+
+ if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+ return -ENOMEM;
+
+ /*
+ * For mm with multiple users, we need to ensure all future
+ * scheduler executions will observe @mm's new membarrier
+ * state.
+ */
+ synchronize_rcu();
+
+ /*
+ * For each cpu runqueue, if the task's mm match @mm, ensure that all
+ * @mm's membarrier state set bits are also set in the runqueue's
+ * membarrier state. This ensures that a runqueue scheduling
+ * between threads which are users of @mm has its membarrier state
+ * updated.
+ */
+ SERIALIZE_IPI();
+ cpus_read_lock();
+ rcu_read_lock();
+ for_each_online_cpu(cpu) {
+ struct rq *rq = cpu_rq(cpu);
+ struct task_struct *p;
+
+ p = rcu_dereference(rq->curr);
+ if (p && p->mm == mm)
+ __cpumask_set_cpu(cpu, tmpmask);
+ }
+ rcu_read_unlock();
+
+ on_each_cpu_mask(tmpmask, ipi_sync_rq_state, mm, true);
+
+ free_cpumask_var(tmpmask);
+ cpus_read_unlock();
+
+ return 0;
+}
+
+static int membarrier_register_global_expedited(void)
+{
+ struct task_struct *p = current;
+ struct mm_struct *mm = p->mm;
+ int ret;
+
+ if (atomic_read(&mm->membarrier_state) &
+ MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY)
+ return 0;
+ atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state);
+ ret = sync_runqueues_membarrier_state(mm);
+ if (ret)
+ return ret;
+ atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
+ &mm->membarrier_state);
+
+ return 0;
+}
+
+static int membarrier_register_private_expedited(int flags)
+{
+ struct task_struct *p = current;
+ struct mm_struct *mm = p->mm;
+ int ready_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
+ set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED,
+ ret;
+
+ if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
+ if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
+ return -EINVAL;
+ ready_state =
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
+ } else if (flags == MEMBARRIER_FLAG_RSEQ) {
+ if (!IS_ENABLED(CONFIG_RSEQ))
+ return -EINVAL;
+ ready_state =
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY;
+ } else {
+ WARN_ON_ONCE(flags);
+ }
+
+ /*
+ * We need to consider threads belonging to different thread
+ * groups, which use the same mm. (CLONE_VM but not
+ * CLONE_THREAD).
+ */
+ if ((atomic_read(&mm->membarrier_state) & ready_state) == ready_state)
+ return 0;
+ if (flags & MEMBARRIER_FLAG_SYNC_CORE)
+ set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE;
+ if (flags & MEMBARRIER_FLAG_RSEQ)
+ set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ;
+ atomic_or(set_state, &mm->membarrier_state);
+ ret = sync_runqueues_membarrier_state(mm);
+ if (ret)
+ return ret;
+ atomic_or(ready_state, &mm->membarrier_state);
+
+ return 0;
+}
+
+static int membarrier_get_registrations(void)
+{
+ struct task_struct *p = current;
+ struct mm_struct *mm = p->mm;
+ int registrations_mask = 0, membarrier_state, i;
+ static const int states[] = {
+ MEMBARRIER_STATE_GLOBAL_EXPEDITED |
+ MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED |
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE |
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY,
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ |
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY
+ };
+ static const int registration_cmds[] = {
+ MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED,
+ MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED,
+ MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE,
+ MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ
+ };
+ BUILD_BUG_ON(ARRAY_SIZE(states) != ARRAY_SIZE(registration_cmds));
+
+ membarrier_state = atomic_read(&mm->membarrier_state);
+ for (i = 0; i < ARRAY_SIZE(states); ++i) {
+ if (membarrier_state & states[i]) {
+ registrations_mask |= registration_cmds[i];
+ membarrier_state &= ~states[i];
+ }
+ }
+ WARN_ON_ONCE(membarrier_state != 0);
+ return registrations_mask;
+}
+
+/**
+ * sys_membarrier - issue memory barriers on a set of threads
+ * @cmd: Takes command values defined in enum membarrier_cmd.
+ * @flags: Currently needs to be 0 for all commands other than
+ * MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: in the latter
+ * case it can be MEMBARRIER_CMD_FLAG_CPU, indicating that @cpu_id
+ * contains the CPU on which to interrupt (= restart)
+ * the RSEQ critical section.
+ * @cpu_id: if @flags == MEMBARRIER_CMD_FLAG_CPU, indicates the cpu on which
+ * RSEQ CS should be interrupted (@cmd must be
+ * MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ).
+ *
+ * If this system call is not implemented, -ENOSYS is returned. If the
+ * command specified does not exist, not available on the running
+ * kernel, or if the command argument is invalid, this system call
+ * returns -EINVAL. For a given command, with flags argument set to 0,
+ * if this system call returns -ENOSYS or -EINVAL, it is guaranteed to
+ * always return the same value until reboot. In addition, it can return
+ * -ENOMEM if there is not enough memory available to perform the system
+ * call.
+ *
+ * All memory accesses performed in program order from each targeted thread
+ * is guaranteed to be ordered with respect to sys_membarrier(). If we use
+ * the semantic "barrier()" to represent a compiler barrier forcing memory
+ * accesses to be performed in program order across the barrier, and
+ * smp_mb() to represent explicit memory barriers forcing full memory
+ * ordering across the barrier, we have the following ordering table for
+ * each pair of barrier(), sys_membarrier() and smp_mb():
+ *
+ * The pair ordering is detailed as (O: ordered, X: not ordered):
+ *
+ * barrier() smp_mb() sys_membarrier()
+ * barrier() X X O
+ * smp_mb() X O O
+ * sys_membarrier() O O O
+ */
+SYSCALL_DEFINE3(membarrier, int, cmd, unsigned int, flags, int, cpu_id)
+{
+ switch (cmd) {
+ case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
+ if (unlikely(flags && flags != MEMBARRIER_CMD_FLAG_CPU))
+ return -EINVAL;
+ break;
+ default:
+ if (unlikely(flags))
+ return -EINVAL;
+ }
+
+ if (!(flags & MEMBARRIER_CMD_FLAG_CPU))
+ cpu_id = -1;
+
+ switch (cmd) {
+ case MEMBARRIER_CMD_QUERY:
+ {
+ int cmd_mask = MEMBARRIER_CMD_BITMASK;
+
+ if (tick_nohz_full_enabled())
+ cmd_mask &= ~MEMBARRIER_CMD_GLOBAL;
+ return cmd_mask;
+ }
+ case MEMBARRIER_CMD_GLOBAL:
+ /* MEMBARRIER_CMD_GLOBAL is not compatible with nohz_full. */
+ if (tick_nohz_full_enabled())
+ return -EINVAL;
+ if (num_online_cpus() > 1)
+ synchronize_rcu();
+ return 0;
+ case MEMBARRIER_CMD_GLOBAL_EXPEDITED:
+ return membarrier_global_expedited();
+ case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
+ return membarrier_register_global_expedited();
+ case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
+ return membarrier_private_expedited(0, cpu_id);
+ case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
+ return membarrier_register_private_expedited(0);
+ case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
+ return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE, cpu_id);
+ case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
+ return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
+ case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
+ return membarrier_private_expedited(MEMBARRIER_FLAG_RSEQ, cpu_id);
+ case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ:
+ return membarrier_register_private_expedited(MEMBARRIER_FLAG_RSEQ);
+ case MEMBARRIER_CMD_GET_REGISTRATIONS:
+ return membarrier_get_registrations();
+ default:
+ return -EINVAL;
+ }
+}
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
new file mode 100644
index 000000000000..fa83bbaf4f3e
--- /dev/null
+++ b/kernel/sched/pelt.c
@@ -0,0 +1,490 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Per Entity Load Tracking (PELT)
+ *
+ * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *
+ * Interactivity improvements by Mike Galbraith
+ * (C) 2007 Mike Galbraith <efault@gmx.de>
+ *
+ * Various enhancements by Dmitry Adamushko.
+ * (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
+ *
+ * Group scheduling enhancements by Srivatsa Vaddagiri
+ * Copyright IBM Corporation, 2007
+ * Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
+ *
+ * Scaled math optimizations by Thomas Gleixner
+ * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
+ *
+ * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
+ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
+ *
+ * Move PELT related code from fair.c into this pelt.c file
+ * Author: Vincent Guittot <vincent.guittot@linaro.org>
+ */
+#include "pelt.h"
+
+/*
+ * Approximate:
+ * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
+ */
+static u64 decay_load(u64 val, u64 n)
+{
+ unsigned int local_n;
+
+ if (unlikely(n > LOAD_AVG_PERIOD * 63))
+ return 0;
+
+ /* after bounds checking we can collapse to 32-bit */
+ local_n = n;
+
+ /*
+ * As y^PERIOD = 1/2, we can combine
+ * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
+ * With a look-up table which covers y^n (n<PERIOD)
+ *
+ * To achieve constant time decay_load.
+ */
+ if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
+ val >>= local_n / LOAD_AVG_PERIOD;
+ local_n %= LOAD_AVG_PERIOD;
+ }
+
+ val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
+ return val;
+}
+
+static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
+{
+ u32 c1, c2, c3 = d3; /* y^0 == 1 */
+
+ /*
+ * c1 = d1 y^p
+ */
+ c1 = decay_load((u64)d1, periods);
+
+ /*
+ * p-1
+ * c2 = 1024 \Sum y^n
+ * n=1
+ *
+ * inf inf
+ * = 1024 ( \Sum y^n - \Sum y^n - y^0 )
+ * n=0 n=p
+ */
+ c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024;
+
+ return c1 + c2 + c3;
+}
+
+/*
+ * Accumulate the three separate parts of the sum; d1 the remainder
+ * of the last (incomplete) period, d2 the span of full periods and d3
+ * the remainder of the (incomplete) current period.
+ *
+ * d1 d2 d3
+ * ^ ^ ^
+ * | | |
+ * |<->|<----------------->|<--->|
+ * ... |---x---|------| ... |------|-----x (now)
+ *
+ * p-1
+ * u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0
+ * n=1
+ *
+ * = u y^p + (Step 1)
+ *
+ * p-1
+ * d1 y^p + 1024 \Sum y^n + d3 y^0 (Step 2)
+ * n=1
+ */
+static __always_inline u32
+accumulate_sum(u64 delta, struct sched_avg *sa,
+ unsigned long load, unsigned long runnable, int running)
+{
+ u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
+ u64 periods;
+
+ delta += sa->period_contrib;
+ periods = delta / 1024; /* A period is 1024us (~1ms) */
+
+ /*
+ * Step 1: decay old *_sum if we crossed period boundaries.
+ */
+ if (periods) {
+ sa->load_sum = decay_load(sa->load_sum, periods);
+ sa->runnable_sum =
+ decay_load(sa->runnable_sum, periods);
+ sa->util_sum = decay_load((u64)(sa->util_sum), periods);
+
+ /*
+ * Step 2
+ */
+ delta %= 1024;
+ if (load) {
+ /*
+ * This relies on the:
+ *
+ * if (!load)
+ * runnable = running = 0;
+ *
+ * clause from ___update_load_sum(); this results in
+ * the below usage of @contrib to disappear entirely,
+ * so no point in calculating it.
+ */
+ contrib = __accumulate_pelt_segments(periods,
+ 1024 - sa->period_contrib, delta);
+ }
+ }
+ sa->period_contrib = delta;
+
+ if (load)
+ sa->load_sum += load * contrib;
+ if (runnable)
+ sa->runnable_sum += runnable * contrib << SCHED_CAPACITY_SHIFT;
+ if (running)
+ sa->util_sum += contrib << SCHED_CAPACITY_SHIFT;
+
+ return periods;
+}
+
+/*
+ * We can represent the historical contribution to runnable average as the
+ * coefficients of a geometric series. To do this we sub-divide our runnable
+ * history into segments of approximately 1ms (1024us); label the segment that
+ * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
+ *
+ * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
+ * p0 p1 p2
+ * (now) (~1ms ago) (~2ms ago)
+ *
+ * Let u_i denote the fraction of p_i that the entity was runnable.
+ *
+ * We then designate the fractions u_i as our co-efficients, yielding the
+ * following representation of historical load:
+ * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
+ *
+ * We choose y based on the with of a reasonably scheduling period, fixing:
+ * y^32 = 0.5
+ *
+ * This means that the contribution to load ~32ms ago (u_32) will be weighted
+ * approximately half as much as the contribution to load within the last ms
+ * (u_0).
+ *
+ * When a period "rolls over" and we have new u_0`, multiplying the previous
+ * sum again by y is sufficient to update:
+ * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
+ * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
+ */
+static __always_inline int
+___update_load_sum(u64 now, struct sched_avg *sa,
+ unsigned long load, unsigned long runnable, int running)
+{
+ u64 delta;
+
+ delta = now - sa->last_update_time;
+ /*
+ * This should only happen when time goes backwards, which it
+ * unfortunately does during sched clock init when we swap over to TSC.
+ */
+ if ((s64)delta < 0) {
+ sa->last_update_time = now;
+ return 0;
+ }
+
+ /*
+ * Use 1024ns as the unit of measurement since it's a reasonable
+ * approximation of 1us and fast to compute.
+ */
+ delta >>= 10;
+ if (!delta)
+ return 0;
+
+ sa->last_update_time += delta << 10;
+
+ /*
+ * running is a subset of runnable (weight) so running can't be set if
+ * runnable is clear. But there are some corner cases where the current
+ * se has been already dequeued but cfs_rq->curr still points to it.
+ * This means that weight will be 0 but not running for a sched_entity
+ * but also for a cfs_rq if the latter becomes idle. As an example,
+ * this happens during sched_balance_newidle() which calls
+ * sched_balance_update_blocked_averages().
+ *
+ * Also see the comment in accumulate_sum().
+ */
+ if (!load)
+ runnable = running = 0;
+
+ /*
+ * Now we know we crossed measurement unit boundaries. The *_avg
+ * accrues by two steps:
+ *
+ * Step 1: accumulate *_sum since last_update_time. If we haven't
+ * crossed period boundaries, finish.
+ */
+ if (!accumulate_sum(delta, sa, load, runnable, running))
+ return 0;
+
+ return 1;
+}
+
+/*
+ * When syncing *_avg with *_sum, we must take into account the current
+ * position in the PELT segment otherwise the remaining part of the segment
+ * will be considered as idle time whereas it's not yet elapsed and this will
+ * generate unwanted oscillation in the range [1002..1024[.
+ *
+ * The max value of *_sum varies with the position in the time segment and is
+ * equals to :
+ *
+ * LOAD_AVG_MAX*y + sa->period_contrib
+ *
+ * which can be simplified into:
+ *
+ * LOAD_AVG_MAX - 1024 + sa->period_contrib
+ *
+ * because LOAD_AVG_MAX*y == LOAD_AVG_MAX-1024
+ *
+ * The same care must be taken when a sched entity is added, updated or
+ * removed from a cfs_rq and we need to update sched_avg. Scheduler entities
+ * and the cfs rq, to which they are attached, have the same position in the
+ * time segment because they use the same clock. This means that we can use
+ * the period_contrib of cfs_rq when updating the sched_avg of a sched_entity
+ * if it's more convenient.
+ */
+static __always_inline void
+___update_load_avg(struct sched_avg *sa, unsigned long load)
+{
+ u32 divider = get_pelt_divider(sa);
+
+ /*
+ * Step 2: update *_avg.
+ */
+ sa->load_avg = div_u64(load * sa->load_sum, divider);
+ sa->runnable_avg = div_u64(sa->runnable_sum, divider);
+ WRITE_ONCE(sa->util_avg, sa->util_sum / divider);
+}
+
+/*
+ * sched_entity:
+ *
+ * task:
+ * se_weight() = se->load.weight
+ * se_runnable() = !!on_rq
+ *
+ * group: [ see update_cfs_group() ]
+ * se_weight() = tg->weight * grq->load_avg / tg->load_avg
+ * se_runnable() = grq->h_nr_runnable
+ *
+ * runnable_sum = se_runnable() * runnable = grq->runnable_sum
+ * runnable_avg = runnable_sum
+ *
+ * load_sum := runnable
+ * load_avg = se_weight(se) * load_sum
+ *
+ * cfq_rq:
+ *
+ * runnable_sum = \Sum se->avg.runnable_sum
+ * runnable_avg = \Sum se->avg.runnable_avg
+ *
+ * load_sum = \Sum se_weight(se) * se->avg.load_sum
+ * load_avg = \Sum se->avg.load_avg
+ */
+
+int __update_load_avg_blocked_se(u64 now, struct sched_entity *se)
+{
+ if (___update_load_sum(now, &se->avg, 0, 0, 0)) {
+ ___update_load_avg(&se->avg, se_weight(se));
+ trace_pelt_se_tp(se);
+ return 1;
+ }
+
+ return 0;
+}
+
+int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ if (___update_load_sum(now, &se->avg, !!se->on_rq, se_runnable(se),
+ cfs_rq->curr == se)) {
+
+ ___update_load_avg(&se->avg, se_weight(se));
+ cfs_se_util_change(&se->avg);
+ trace_pelt_se_tp(se);
+ return 1;
+ }
+
+ return 0;
+}
+
+int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)
+{
+ if (___update_load_sum(now, &cfs_rq->avg,
+ scale_load_down(cfs_rq->load.weight),
+ cfs_rq->h_nr_runnable,
+ cfs_rq->curr != NULL)) {
+
+ ___update_load_avg(&cfs_rq->avg, 1);
+ trace_pelt_cfs_tp(cfs_rq);
+ return 1;
+ }
+
+ return 0;
+}
+
+/*
+ * rt_rq:
+ *
+ * util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
+ * util_sum = cpu_scale * load_sum
+ * runnable_sum = util_sum
+ *
+ * load_avg and runnable_avg are not supported and meaningless.
+ *
+ */
+
+int update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
+{
+ if (___update_load_sum(now, &rq->avg_rt,
+ running,
+ running,
+ running)) {
+
+ ___update_load_avg(&rq->avg_rt, 1);
+ trace_pelt_rt_tp(rq);
+ return 1;
+ }
+
+ return 0;
+}
+
+/*
+ * dl_rq:
+ *
+ * util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
+ * util_sum = cpu_scale * load_sum
+ * runnable_sum = util_sum
+ *
+ * load_avg and runnable_avg are not supported and meaningless.
+ *
+ */
+
+int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
+{
+ if (___update_load_sum(now, &rq->avg_dl,
+ running,
+ running,
+ running)) {
+
+ ___update_load_avg(&rq->avg_dl, 1);
+ trace_pelt_dl_tp(rq);
+ return 1;
+ }
+
+ return 0;
+}
+
+#ifdef CONFIG_SCHED_HW_PRESSURE
+/*
+ * hardware:
+ *
+ * load_sum = \Sum se->avg.load_sum but se->avg.load_sum is not tracked
+ *
+ * util_avg and runnable_load_avg are not supported and meaningless.
+ *
+ * Unlike rt/dl utilization tracking that track time spent by a cpu
+ * running a rt/dl task through util_avg, the average HW pressure is
+ * tracked through load_avg. This is because HW pressure signal is
+ * time weighted "delta" capacity unlike util_avg which is binary.
+ * "delta capacity" = actual capacity -
+ * capped capacity a cpu due to a HW event.
+ */
+
+int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity)
+{
+ if (___update_load_sum(now, &rq->avg_hw,
+ capacity,
+ capacity,
+ capacity)) {
+ ___update_load_avg(&rq->avg_hw, 1);
+ trace_pelt_hw_tp(rq);
+ return 1;
+ }
+
+ return 0;
+}
+#endif /* CONFIG_SCHED_HW_PRESSURE */
+
+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
+/*
+ * IRQ:
+ *
+ * util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
+ * util_sum = cpu_scale * load_sum
+ * runnable_sum = util_sum
+ *
+ * load_avg and runnable_avg are not supported and meaningless.
+ *
+ */
+
+int update_irq_load_avg(struct rq *rq, u64 running)
+{
+ int ret = 0;
+
+ /*
+ * We can't use clock_pelt because IRQ time is not accounted in
+ * clock_task. Instead we directly scale the running time to
+ * reflect the real amount of computation
+ */
+ running = cap_scale(running, arch_scale_freq_capacity(cpu_of(rq)));
+ running = cap_scale(running, arch_scale_cpu_capacity(cpu_of(rq)));
+
+ /*
+ * We know the time that has been used by interrupt since last update
+ * but we don't when. Let be pessimistic and assume that interrupt has
+ * happened just before the update. This is not so far from reality
+ * because interrupt will most probably wake up task and trig an update
+ * of rq clock during which the metric is updated.
+ * We start to decay with normal context time and then we add the
+ * interrupt context time.
+ * We can safely remove running from rq->clock because
+ * rq->clock += delta with delta >= running
+ */
+ ret = ___update_load_sum(rq->clock - running, &rq->avg_irq,
+ 0,
+ 0,
+ 0);
+ ret += ___update_load_sum(rq->clock, &rq->avg_irq,
+ 1,
+ 1,
+ 1);
+
+ if (ret) {
+ ___update_load_avg(&rq->avg_irq, 1);
+ trace_pelt_irq_tp(rq);
+ }
+
+ return ret;
+}
+#endif /* CONFIG_HAVE_SCHED_AVG_IRQ */
+
+/*
+ * Load avg and utiliztion metrics need to be updated periodically and before
+ * consumption. This function updates the metrics for all subsystems except for
+ * the fair class. @rq must be locked and have its clock updated.
+ */
+bool update_other_load_avgs(struct rq *rq)
+{
+ u64 now = rq_clock_pelt(rq);
+ const struct sched_class *curr_class = rq->donor->sched_class;
+ unsigned long hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
+
+ lockdep_assert_rq_held(rq);
+
+ /* hw_pressure doesn't care about invariance */
+ return update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
+ update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
+ update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure) |
+ update_irq_load_avg(rq, 0);
+}
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
new file mode 100644
index 000000000000..f921302dc40f
--- /dev/null
+++ b/kernel/sched/pelt.h
@@ -0,0 +1,189 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef _KERNEL_SCHED_PELT_H
+#define _KERNEL_SCHED_PELT_H
+#include "sched.h"
+
+#include "sched-pelt.h"
+
+int __update_load_avg_blocked_se(u64 now, struct sched_entity *se);
+int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se);
+int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq);
+int update_rt_rq_load_avg(u64 now, struct rq *rq, int running);
+int update_dl_rq_load_avg(u64 now, struct rq *rq, int running);
+bool update_other_load_avgs(struct rq *rq);
+
+#ifdef CONFIG_SCHED_HW_PRESSURE
+int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity);
+
+static inline u64 hw_load_avg(struct rq *rq)
+{
+ return READ_ONCE(rq->avg_hw.load_avg);
+}
+#else /* !CONFIG_SCHED_HW_PRESSURE: */
+static inline int
+update_hw_load_avg(u64 now, struct rq *rq, u64 capacity)
+{
+ return 0;
+}
+
+static inline u64 hw_load_avg(struct rq *rq)
+{
+ return 0;
+}
+#endif /* !CONFIG_SCHED_HW_PRESSURE */
+
+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
+int update_irq_load_avg(struct rq *rq, u64 running);
+#else
+static inline int
+update_irq_load_avg(struct rq *rq, u64 running)
+{
+ return 0;
+}
+#endif
+
+#define PELT_MIN_DIVIDER (LOAD_AVG_MAX - 1024)
+
+static inline u32 get_pelt_divider(struct sched_avg *avg)
+{
+ return PELT_MIN_DIVIDER + avg->period_contrib;
+}
+
+static inline void cfs_se_util_change(struct sched_avg *avg)
+{
+ unsigned int enqueued;
+
+ if (!sched_feat(UTIL_EST))
+ return;
+
+ /* Avoid store if the flag has been already reset */
+ enqueued = avg->util_est;
+ if (!(enqueued & UTIL_AVG_UNCHANGED))
+ return;
+
+ /* Reset flag to report util_avg has been updated */
+ enqueued &= ~UTIL_AVG_UNCHANGED;
+ WRITE_ONCE(avg->util_est, enqueued);
+}
+
+static inline u64 rq_clock_pelt(struct rq *rq)
+{
+ lockdep_assert_rq_held(rq);
+ assert_clock_updated(rq);
+
+ return rq->clock_pelt - rq->lost_idle_time;
+}
+
+/* The rq is idle, we can sync to clock_task */
+static inline void _update_idle_rq_clock_pelt(struct rq *rq)
+{
+ rq->clock_pelt = rq_clock_task(rq);
+
+ u64_u32_store(rq->clock_idle, rq_clock(rq));
+ /* Paired with smp_rmb in migrate_se_pelt_lag() */
+ smp_wmb();
+ u64_u32_store(rq->clock_pelt_idle, rq_clock_pelt(rq));
+}
+
+/*
+ * The clock_pelt scales the time to reflect the effective amount of
+ * computation done during the running delta time but then sync back to
+ * clock_task when rq is idle.
+ *
+ *
+ * absolute time | 1| 2| 3| 4| 5| 6| 7| 8| 9|10|11|12|13|14|15|16
+ * @ max capacity ------******---------------******---------------
+ * @ half capacity ------************---------************---------
+ * clock pelt | 1| 2| 3| 4| 7| 8| 9| 10| 11|14|15|16
+ *
+ */
+static inline void update_rq_clock_pelt(struct rq *rq, s64 delta)
+{
+ if (unlikely(is_idle_task(rq->curr))) {
+ _update_idle_rq_clock_pelt(rq);
+ return;
+ }
+
+ /*
+ * When a rq runs at a lower compute capacity, it will need
+ * more time to do the same amount of work than at max
+ * capacity. In order to be invariant, we scale the delta to
+ * reflect how much work has been really done.
+ * Running longer results in stealing idle time that will
+ * disturb the load signal compared to max capacity. This
+ * stolen idle time will be automatically reflected when the
+ * rq will be idle and the clock will be synced with
+ * rq_clock_task.
+ */
+
+ /*
+ * Scale the elapsed time to reflect the real amount of
+ * computation
+ */
+ delta = cap_scale(delta, arch_scale_cpu_capacity(cpu_of(rq)));
+ delta = cap_scale(delta, arch_scale_freq_capacity(cpu_of(rq)));
+
+ rq->clock_pelt += delta;
+}
+
+/*
+ * When rq becomes idle, we have to check if it has lost idle time
+ * because it was fully busy. A rq is fully used when the /Sum util_sum
+ * is greater or equal to:
+ * (LOAD_AVG_MAX - 1024 + rq->cfs.avg.period_contrib) << SCHED_CAPACITY_SHIFT;
+ * For optimization and computing rounding purpose, we don't take into account
+ * the position in the current window (period_contrib) and we use the higher
+ * bound of util_sum to decide.
+ */
+static inline void update_idle_rq_clock_pelt(struct rq *rq)
+{
+ u32 divider = ((LOAD_AVG_MAX - 1024) << SCHED_CAPACITY_SHIFT) - LOAD_AVG_MAX;
+ u32 util_sum = rq->cfs.avg.util_sum;
+ util_sum += rq->avg_rt.util_sum;
+ util_sum += rq->avg_dl.util_sum;
+
+ /*
+ * Reflecting stolen time makes sense only if the idle
+ * phase would be present at max capacity. As soon as the
+ * utilization of a rq has reached the maximum value, it is
+ * considered as an always running rq without idle time to
+ * steal. This potential idle time is considered as lost in
+ * this case. We keep track of this lost idle time compare to
+ * rq's clock_task.
+ */
+ if (util_sum >= divider)
+ rq->lost_idle_time += rq_clock_task(rq) - rq->clock_pelt;
+
+ _update_idle_rq_clock_pelt(rq);
+}
+
+#ifdef CONFIG_CFS_BANDWIDTH
+static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
+{
+ u64 throttled;
+
+ if (unlikely(cfs_rq->pelt_clock_throttled))
+ throttled = U64_MAX;
+ else
+ throttled = cfs_rq->throttled_clock_pelt_time;
+
+ u64_u32_store(cfs_rq->throttled_pelt_idle, throttled);
+}
+
+/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
+static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
+{
+ if (unlikely(cfs_rq->pelt_clock_throttled))
+ return cfs_rq->throttled_clock_pelt - cfs_rq->throttled_clock_pelt_time;
+
+ return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_pelt_time;
+}
+#else /* !CONFIG_CFS_BANDWIDTH: */
+static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { }
+static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
+{
+ return rq_clock_pelt(rq_of(cfs_rq));
+}
+#endif /* !CONFIG_CFS_BANDWIDTH */
+
+#endif /* _KERNEL_SCHED_PELT_H */
diff --git a/kernel/sched/proc.c b/kernel/sched/proc.c
deleted file mode 100644
index 8ecd552fe4f2..000000000000
--- a/kernel/sched/proc.c
+++ /dev/null
@@ -1,584 +0,0 @@
-/*
- * kernel/sched/proc.c
- *
- * Kernel load calculations, forked from sched/core.c
- */
-
-#include <linux/export.h>
-
-#include "sched.h"
-
-/*
- * Global load-average calculations
- *
- * We take a distributed and async approach to calculating the global load-avg
- * in order to minimize overhead.
- *
- * The global load average is an exponentially decaying average of nr_running +
- * nr_uninterruptible.
- *
- * Once every LOAD_FREQ:
- *
- * nr_active = 0;
- * for_each_possible_cpu(cpu)
- * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible;
- *
- * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
- *
- * Due to a number of reasons the above turns in the mess below:
- *
- * - for_each_possible_cpu() is prohibitively expensive on machines with
- * serious number of cpus, therefore we need to take a distributed approach
- * to calculating nr_active.
- *
- * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
- * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
- *
- * So assuming nr_active := 0 when we start out -- true per definition, we
- * can simply take per-cpu deltas and fold those into a global accumulate
- * to obtain the same result. See calc_load_fold_active().
- *
- * Furthermore, in order to avoid synchronizing all per-cpu delta folding
- * across the machine, we assume 10 ticks is sufficient time for every
- * cpu to have completed this task.
- *
- * This places an upper-bound on the IRQ-off latency of the machine. Then
- * again, being late doesn't loose the delta, just wrecks the sample.
- *
- * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
- * this would add another cross-cpu cacheline miss and atomic operation
- * to the wakeup path. Instead we increment on whatever cpu the task ran
- * when it went into uninterruptible state and decrement on whatever cpu
- * did the wakeup. This means that only the sum of nr_uninterruptible over
- * all cpus yields the correct result.
- *
- * This covers the NO_HZ=n code, for extra head-aches, see the comment below.
- */
-
-/* Variables and functions for calc_load */
-atomic_long_t calc_load_tasks;
-unsigned long calc_load_update;
-unsigned long avenrun[3];
-EXPORT_SYMBOL(avenrun); /* should be removed */
-
-/**
- * get_avenrun - get the load average array
- * @loads: pointer to dest load array
- * @offset: offset to add
- * @shift: shift count to shift the result left
- *
- * These values are estimates at best, so no need for locking.
- */
-void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
-{
- loads[0] = (avenrun[0] + offset) << shift;
- loads[1] = (avenrun[1] + offset) << shift;
- loads[2] = (avenrun[2] + offset) << shift;
-}
-
-long calc_load_fold_active(struct rq *this_rq)
-{
- long nr_active, delta = 0;
-
- nr_active = this_rq->nr_running;
- nr_active += (long) this_rq->nr_uninterruptible;
-
- if (nr_active != this_rq->calc_load_active) {
- delta = nr_active - this_rq->calc_load_active;
- this_rq->calc_load_active = nr_active;
- }
-
- return delta;
-}
-
-/*
- * a1 = a0 * e + a * (1 - e)
- */
-static unsigned long
-calc_load(unsigned long load, unsigned long exp, unsigned long active)
-{
- load *= exp;
- load += active * (FIXED_1 - exp);
- load += 1UL << (FSHIFT - 1);
- return load >> FSHIFT;
-}
-
-#ifdef CONFIG_NO_HZ_COMMON
-/*
- * Handle NO_HZ for the global load-average.
- *
- * Since the above described distributed algorithm to compute the global
- * load-average relies on per-cpu sampling from the tick, it is affected by
- * NO_HZ.
- *
- * The basic idea is to fold the nr_active delta into a global idle-delta upon
- * entering NO_HZ state such that we can include this as an 'extra' cpu delta
- * when we read the global state.
- *
- * Obviously reality has to ruin such a delightfully simple scheme:
- *
- * - When we go NO_HZ idle during the window, we can negate our sample
- * contribution, causing under-accounting.
- *
- * We avoid this by keeping two idle-delta counters and flipping them
- * when the window starts, thus separating old and new NO_HZ load.
- *
- * The only trick is the slight shift in index flip for read vs write.
- *
- * 0s 5s 10s 15s
- * +10 +10 +10 +10
- * |-|-----------|-|-----------|-|-----------|-|
- * r:0 0 1 1 0 0 1 1 0
- * w:0 1 1 0 0 1 1 0 0
- *
- * This ensures we'll fold the old idle contribution in this window while
- * accumlating the new one.
- *
- * - When we wake up from NO_HZ idle during the window, we push up our
- * contribution, since we effectively move our sample point to a known
- * busy state.
- *
- * This is solved by pushing the window forward, and thus skipping the
- * sample, for this cpu (effectively using the idle-delta for this cpu which
- * was in effect at the time the window opened). This also solves the issue
- * of having to deal with a cpu having been in NOHZ idle for multiple
- * LOAD_FREQ intervals.
- *
- * When making the ILB scale, we should try to pull this in as well.
- */
-static atomic_long_t calc_load_idle[2];
-static int calc_load_idx;
-
-static inline int calc_load_write_idx(void)
-{
- int idx = calc_load_idx;
-
- /*
- * See calc_global_nohz(), if we observe the new index, we also
- * need to observe the new update time.
- */
- smp_rmb();
-
- /*
- * If the folding window started, make sure we start writing in the
- * next idle-delta.
- */
- if (!time_before(jiffies, calc_load_update))
- idx++;
-
- return idx & 1;
-}
-
-static inline int calc_load_read_idx(void)
-{
- return calc_load_idx & 1;
-}
-
-void calc_load_enter_idle(void)
-{
- struct rq *this_rq = this_rq();
- long delta;
-
- /*
- * We're going into NOHZ mode, if there's any pending delta, fold it
- * into the pending idle delta.
- */
- delta = calc_load_fold_active(this_rq);
- if (delta) {
- int idx = calc_load_write_idx();
- atomic_long_add(delta, &calc_load_idle[idx]);
- }
-}
-
-void calc_load_exit_idle(void)
-{
- struct rq *this_rq = this_rq();
-
- /*
- * If we're still before the sample window, we're done.
- */
- if (time_before(jiffies, this_rq->calc_load_update))
- return;
-
- /*
- * We woke inside or after the sample window, this means we're already
- * accounted through the nohz accounting, so skip the entire deal and
- * sync up for the next window.
- */
- this_rq->calc_load_update = calc_load_update;
- if (time_before(jiffies, this_rq->calc_load_update + 10))
- this_rq->calc_load_update += LOAD_FREQ;
-}
-
-static long calc_load_fold_idle(void)
-{
- int idx = calc_load_read_idx();
- long delta = 0;
-
- if (atomic_long_read(&calc_load_idle[idx]))
- delta = atomic_long_xchg(&calc_load_idle[idx], 0);
-
- return delta;
-}
-
-/**
- * fixed_power_int - compute: x^n, in O(log n) time
- *
- * @x: base of the power
- * @frac_bits: fractional bits of @x
- * @n: power to raise @x to.
- *
- * By exploiting the relation between the definition of the natural power
- * function: x^n := x*x*...*x (x multiplied by itself for n times), and
- * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
- * (where: n_i \elem {0, 1}, the binary vector representing n),
- * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
- * of course trivially computable in O(log_2 n), the length of our binary
- * vector.
- */
-static unsigned long
-fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
-{
- unsigned long result = 1UL << frac_bits;
-
- if (n) for (;;) {
- if (n & 1) {
- result *= x;
- result += 1UL << (frac_bits - 1);
- result >>= frac_bits;
- }
- n >>= 1;
- if (!n)
- break;
- x *= x;
- x += 1UL << (frac_bits - 1);
- x >>= frac_bits;
- }
-
- return result;
-}
-
-/*
- * a1 = a0 * e + a * (1 - e)
- *
- * a2 = a1 * e + a * (1 - e)
- * = (a0 * e + a * (1 - e)) * e + a * (1 - e)
- * = a0 * e^2 + a * (1 - e) * (1 + e)
- *
- * a3 = a2 * e + a * (1 - e)
- * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
- * = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
- *
- * ...
- *
- * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
- * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
- * = a0 * e^n + a * (1 - e^n)
- *
- * [1] application of the geometric series:
- *
- * n 1 - x^(n+1)
- * S_n := \Sum x^i = -------------
- * i=0 1 - x
- */
-static unsigned long
-calc_load_n(unsigned long load, unsigned long exp,
- unsigned long active, unsigned int n)
-{
-
- return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
-}
-
-/*
- * NO_HZ can leave us missing all per-cpu ticks calling
- * calc_load_account_active(), but since an idle CPU folds its delta into
- * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
- * in the pending idle delta if our idle period crossed a load cycle boundary.
- *
- * Once we've updated the global active value, we need to apply the exponential
- * weights adjusted to the number of cycles missed.
- */
-static void calc_global_nohz(void)
-{
- long delta, active, n;
-
- if (!time_before(jiffies, calc_load_update + 10)) {
- /*
- * Catch-up, fold however many we are behind still
- */
- delta = jiffies - calc_load_update - 10;
- n = 1 + (delta / LOAD_FREQ);
-
- active = atomic_long_read(&calc_load_tasks);
- active = active > 0 ? active * FIXED_1 : 0;
-
- avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
- avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
- avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
-
- calc_load_update += n * LOAD_FREQ;
- }
-
- /*
- * Flip the idle index...
- *
- * Make sure we first write the new time then flip the index, so that
- * calc_load_write_idx() will see the new time when it reads the new
- * index, this avoids a double flip messing things up.
- */
- smp_wmb();
- calc_load_idx++;
-}
-#else /* !CONFIG_NO_HZ_COMMON */
-
-static inline long calc_load_fold_idle(void) { return 0; }
-static inline void calc_global_nohz(void) { }
-
-#endif /* CONFIG_NO_HZ_COMMON */
-
-/*
- * calc_load - update the avenrun load estimates 10 ticks after the
- * CPUs have updated calc_load_tasks.
- */
-void calc_global_load(unsigned long ticks)
-{
- long active, delta;
-
- if (time_before(jiffies, calc_load_update + 10))
- return;
-
- /*
- * Fold the 'old' idle-delta to include all NO_HZ cpus.
- */
- delta = calc_load_fold_idle();
- if (delta)
- atomic_long_add(delta, &calc_load_tasks);
-
- active = atomic_long_read(&calc_load_tasks);
- active = active > 0 ? active * FIXED_1 : 0;
-
- avenrun[0] = calc_load(avenrun[0], EXP_1, active);
- avenrun[1] = calc_load(avenrun[1], EXP_5, active);
- avenrun[2] = calc_load(avenrun[2], EXP_15, active);
-
- calc_load_update += LOAD_FREQ;
-
- /*
- * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
- */
- calc_global_nohz();
-}
-
-/*
- * Called from update_cpu_load() to periodically update this CPU's
- * active count.
- */
-static void calc_load_account_active(struct rq *this_rq)
-{
- long delta;
-
- if (time_before(jiffies, this_rq->calc_load_update))
- return;
-
- delta = calc_load_fold_active(this_rq);
- if (delta)
- atomic_long_add(delta, &calc_load_tasks);
-
- this_rq->calc_load_update += LOAD_FREQ;
-}
-
-/*
- * End of global load-average stuff
- */
-
-/*
- * The exact cpuload at various idx values, calculated at every tick would be
- * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
- *
- * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
- * on nth tick when cpu may be busy, then we have:
- * load = ((2^idx - 1) / 2^idx)^(n-1) * load
- * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
- *
- * decay_load_missed() below does efficient calculation of
- * load = ((2^idx - 1) / 2^idx)^(n-1) * load
- * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
- *
- * The calculation is approximated on a 128 point scale.
- * degrade_zero_ticks is the number of ticks after which load at any
- * particular idx is approximated to be zero.
- * degrade_factor is a precomputed table, a row for each load idx.
- * Each column corresponds to degradation factor for a power of two ticks,
- * based on 128 point scale.
- * Example:
- * row 2, col 3 (=12) says that the degradation at load idx 2 after
- * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
- *
- * With this power of 2 load factors, we can degrade the load n times
- * by looking at 1 bits in n and doing as many mult/shift instead of
- * n mult/shifts needed by the exact degradation.
- */
-#define DEGRADE_SHIFT 7
-static const unsigned char
- degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
-static const unsigned char
- degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
- {0, 0, 0, 0, 0, 0, 0, 0},
- {64, 32, 8, 0, 0, 0, 0, 0},
- {96, 72, 40, 12, 1, 0, 0},
- {112, 98, 75, 43, 15, 1, 0},
- {120, 112, 98, 76, 45, 16, 2} };
-
-/*
- * Update cpu_load for any missed ticks, due to tickless idle. The backlog
- * would be when CPU is idle and so we just decay the old load without
- * adding any new load.
- */
-static unsigned long
-decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
-{
- int j = 0;
-
- if (!missed_updates)
- return load;
-
- if (missed_updates >= degrade_zero_ticks[idx])
- return 0;
-
- if (idx == 1)
- return load >> missed_updates;
-
- while (missed_updates) {
- if (missed_updates % 2)
- load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
-
- missed_updates >>= 1;
- j++;
- }
- return load;
-}
-
-/*
- * Update rq->cpu_load[] statistics. This function is usually called every
- * scheduler tick (TICK_NSEC). With tickless idle this will not be called
- * every tick. We fix it up based on jiffies.
- */
-static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
- unsigned long pending_updates)
-{
- int i, scale;
-
- this_rq->nr_load_updates++;
-
- /* Update our load: */
- this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
- for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
- unsigned long old_load, new_load;
-
- /* scale is effectively 1 << i now, and >> i divides by scale */
-
- old_load = this_rq->cpu_load[i];
- old_load = decay_load_missed(old_load, pending_updates - 1, i);
- new_load = this_load;
- /*
- * Round up the averaging division if load is increasing. This
- * prevents us from getting stuck on 9 if the load is 10, for
- * example.
- */
- if (new_load > old_load)
- new_load += scale - 1;
-
- this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
- }
-
- sched_avg_update(this_rq);
-}
-
-#ifdef CONFIG_SMP
-static inline unsigned long get_rq_runnable_load(struct rq *rq)
-{
- return rq->cfs.runnable_load_avg;
-}
-#else
-static inline unsigned long get_rq_runnable_load(struct rq *rq)
-{
- return rq->load.weight;
-}
-#endif
-
-#ifdef CONFIG_NO_HZ_COMMON
-/*
- * There is no sane way to deal with nohz on smp when using jiffies because the
- * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
- * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
- *
- * Therefore we cannot use the delta approach from the regular tick since that
- * would seriously skew the load calculation. However we'll make do for those
- * updates happening while idle (nohz_idle_balance) or coming out of idle
- * (tick_nohz_idle_exit).
- *
- * This means we might still be one tick off for nohz periods.
- */
-
-/*
- * Called from nohz_idle_balance() to update the load ratings before doing the
- * idle balance.
- */
-void update_idle_cpu_load(struct rq *this_rq)
-{
- unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
- unsigned long load = get_rq_runnable_load(this_rq);
- unsigned long pending_updates;
-
- /*
- * bail if there's load or we're actually up-to-date.
- */
- if (load || curr_jiffies == this_rq->last_load_update_tick)
- return;
-
- pending_updates = curr_jiffies - this_rq->last_load_update_tick;
- this_rq->last_load_update_tick = curr_jiffies;
-
- __update_cpu_load(this_rq, load, pending_updates);
-}
-
-/*
- * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
- */
-void update_cpu_load_nohz(void)
-{
- struct rq *this_rq = this_rq();
- unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
- unsigned long pending_updates;
-
- if (curr_jiffies == this_rq->last_load_update_tick)
- return;
-
- raw_spin_lock(&this_rq->lock);
- pending_updates = curr_jiffies - this_rq->last_load_update_tick;
- if (pending_updates) {
- this_rq->last_load_update_tick = curr_jiffies;
- /*
- * We were idle, this means load 0, the current load might be
- * !0 due to remote wakeups and the sort.
- */
- __update_cpu_load(this_rq, 0, pending_updates);
- }
- raw_spin_unlock(&this_rq->lock);
-}
-#endif /* CONFIG_NO_HZ */
-
-/*
- * Called from scheduler_tick()
- */
-void update_cpu_load_active(struct rq *this_rq)
-{
- unsigned long load = get_rq_runnable_load(this_rq);
- /*
- * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
- */
- this_rq->last_load_update_tick = jiffies;
- __update_cpu_load(this_rq, load, 1);
-
- calc_load_account_active(this_rq);
-}
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
new file mode 100644
index 000000000000..59fdb7ebbf22
--- /dev/null
+++ b/kernel/sched/psi.c
@@ -0,0 +1,1682 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Pressure stall information for CPU, memory and IO
+ *
+ * Copyright (c) 2018 Facebook, Inc.
+ * Author: Johannes Weiner <hannes@cmpxchg.org>
+ *
+ * Polling support by Suren Baghdasaryan <surenb@google.com>
+ * Copyright (c) 2018 Google, Inc.
+ *
+ * When CPU, memory and IO are contended, tasks experience delays that
+ * reduce throughput and introduce latencies into the workload. Memory
+ * and IO contention, in addition, can cause a full loss of forward
+ * progress in which the CPU goes idle.
+ *
+ * This code aggregates individual task delays into resource pressure
+ * metrics that indicate problems with both workload health and
+ * resource utilization.
+ *
+ * Model
+ *
+ * The time in which a task can execute on a CPU is our baseline for
+ * productivity. Pressure expresses the amount of time in which this
+ * potential cannot be realized due to resource contention.
+ *
+ * This concept of productivity has two components: the workload and
+ * the CPU. To measure the impact of pressure on both, we define two
+ * contention states for a resource: SOME and FULL.
+ *
+ * In the SOME state of a given resource, one or more tasks are
+ * delayed on that resource. This affects the workload's ability to
+ * perform work, but the CPU may still be executing other tasks.
+ *
+ * In the FULL state of a given resource, all non-idle tasks are
+ * delayed on that resource such that nobody is advancing and the CPU
+ * goes idle. This leaves both workload and CPU unproductive.
+ *
+ * SOME = nr_delayed_tasks != 0
+ * FULL = nr_delayed_tasks != 0 && nr_productive_tasks == 0
+ *
+ * What it means for a task to be productive is defined differently
+ * for each resource. For IO, productive means a running task. For
+ * memory, productive means a running task that isn't a reclaimer. For
+ * CPU, productive means an on-CPU task.
+ *
+ * Naturally, the FULL state doesn't exist for the CPU resource at the
+ * system level, but exist at the cgroup level. At the cgroup level,
+ * FULL means all non-idle tasks in the cgroup are delayed on the CPU
+ * resource which is being used by others outside of the cgroup or
+ * throttled by the cgroup cpu.max configuration.
+ *
+ * The percentage of wall clock time spent in those compound stall
+ * states gives pressure numbers between 0 and 100 for each resource,
+ * where the SOME percentage indicates workload slowdowns and the FULL
+ * percentage indicates reduced CPU utilization:
+ *
+ * %SOME = time(SOME) / period
+ * %FULL = time(FULL) / period
+ *
+ * Multiple CPUs
+ *
+ * The more tasks and available CPUs there are, the more work can be
+ * performed concurrently. This means that the potential that can go
+ * unrealized due to resource contention *also* scales with non-idle
+ * tasks and CPUs.
+ *
+ * Consider a scenario where 257 number crunching tasks are trying to
+ * run concurrently on 256 CPUs. If we simply aggregated the task
+ * states, we would have to conclude a CPU SOME pressure number of
+ * 100%, since *somebody* is waiting on a runqueue at all
+ * times. However, that is clearly not the amount of contention the
+ * workload is experiencing: only one out of 256 possible execution
+ * threads will be contended at any given time, or about 0.4%.
+ *
+ * Conversely, consider a scenario of 4 tasks and 4 CPUs where at any
+ * given time *one* of the tasks is delayed due to a lack of memory.
+ * Again, looking purely at the task state would yield a memory FULL
+ * pressure number of 0%, since *somebody* is always making forward
+ * progress. But again this wouldn't capture the amount of execution
+ * potential lost, which is 1 out of 4 CPUs, or 25%.
+ *
+ * To calculate wasted potential (pressure) with multiple processors,
+ * we have to base our calculation on the number of non-idle tasks in
+ * conjunction with the number of available CPUs, which is the number
+ * of potential execution threads. SOME becomes then the proportion of
+ * delayed tasks to possible threads, and FULL is the share of possible
+ * threads that are unproductive due to delays:
+ *
+ * threads = min(nr_nonidle_tasks, nr_cpus)
+ * SOME = min(nr_delayed_tasks / threads, 1)
+ * FULL = (threads - min(nr_productive_tasks, threads)) / threads
+ *
+ * For the 257 number crunchers on 256 CPUs, this yields:
+ *
+ * threads = min(257, 256)
+ * SOME = min(1 / 256, 1) = 0.4%
+ * FULL = (256 - min(256, 256)) / 256 = 0%
+ *
+ * For the 1 out of 4 memory-delayed tasks, this yields:
+ *
+ * threads = min(4, 4)
+ * SOME = min(1 / 4, 1) = 25%
+ * FULL = (4 - min(3, 4)) / 4 = 25%
+ *
+ * [ Substitute nr_cpus with 1, and you can see that it's a natural
+ * extension of the single-CPU model. ]
+ *
+ * Implementation
+ *
+ * To assess the precise time spent in each such state, we would have
+ * to freeze the system on task changes and start/stop the state
+ * clocks accordingly. Obviously that doesn't scale in practice.
+ *
+ * Because the scheduler aims to distribute the compute load evenly
+ * among the available CPUs, we can track task state locally to each
+ * CPU and, at much lower frequency, extrapolate the global state for
+ * the cumulative stall times and the running averages.
+ *
+ * For each runqueue, we track:
+ *
+ * tSOME[cpu] = time(nr_delayed_tasks[cpu] != 0)
+ * tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_productive_tasks[cpu])
+ * tNONIDLE[cpu] = time(nr_nonidle_tasks[cpu] != 0)
+ *
+ * and then periodically aggregate:
+ *
+ * tNONIDLE = sum(tNONIDLE[i])
+ *
+ * tSOME = sum(tSOME[i] * tNONIDLE[i]) / tNONIDLE
+ * tFULL = sum(tFULL[i] * tNONIDLE[i]) / tNONIDLE
+ *
+ * %SOME = tSOME / period
+ * %FULL = tFULL / period
+ *
+ * This gives us an approximation of pressure that is practical
+ * cost-wise, yet way more sensitive and accurate than periodic
+ * sampling of the aggregate task states would be.
+ */
+#include <linux/sched/clock.h>
+#include <linux/workqueue.h>
+#include <linux/psi.h>
+#include "sched.h"
+
+static int psi_bug __read_mostly;
+
+DEFINE_STATIC_KEY_FALSE(psi_disabled);
+static DEFINE_STATIC_KEY_TRUE(psi_cgroups_enabled);
+
+#ifdef CONFIG_PSI_DEFAULT_DISABLED
+static bool psi_enable;
+#else
+static bool psi_enable = true;
+#endif
+static int __init setup_psi(char *str)
+{
+ return kstrtobool(str, &psi_enable) == 0;
+}
+__setup("psi=", setup_psi);
+
+/* Running averages - we need to be higher-res than loadavg */
+#define PSI_FREQ (2*HZ+1) /* 2 sec intervals */
+#define EXP_10s 1677 /* 1/exp(2s/10s) as fixed-point */
+#define EXP_60s 1981 /* 1/exp(2s/60s) */
+#define EXP_300s 2034 /* 1/exp(2s/300s) */
+
+/* PSI trigger definitions */
+#define WINDOW_MAX_US 10000000 /* Max window size is 10s */
+#define UPDATES_PER_WINDOW 10 /* 10 updates per window */
+
+/* Sampling frequency in nanoseconds */
+static u64 psi_period __read_mostly;
+
+/* System-level pressure and stall tracking */
+static DEFINE_PER_CPU(struct psi_group_cpu, system_group_pcpu);
+struct psi_group psi_system = {
+ .pcpu = &system_group_pcpu,
+};
+
+static DEFINE_PER_CPU(seqcount_t, psi_seq) = SEQCNT_ZERO(psi_seq);
+
+static inline void psi_write_begin(int cpu)
+{
+ write_seqcount_begin(per_cpu_ptr(&psi_seq, cpu));
+}
+
+static inline void psi_write_end(int cpu)
+{
+ write_seqcount_end(per_cpu_ptr(&psi_seq, cpu));
+}
+
+static inline u32 psi_read_begin(int cpu)
+{
+ return read_seqcount_begin(per_cpu_ptr(&psi_seq, cpu));
+}
+
+static inline bool psi_read_retry(int cpu, u32 seq)
+{
+ return read_seqcount_retry(per_cpu_ptr(&psi_seq, cpu), seq);
+}
+
+static void psi_avgs_work(struct work_struct *work);
+
+static void poll_timer_fn(struct timer_list *t);
+
+static void group_init(struct psi_group *group)
+{
+ group->enabled = true;
+ group->avg_last_update = sched_clock();
+ group->avg_next_update = group->avg_last_update + psi_period;
+ mutex_init(&group->avgs_lock);
+
+ /* Init avg trigger-related members */
+ INIT_LIST_HEAD(&group->avg_triggers);
+ memset(group->avg_nr_triggers, 0, sizeof(group->avg_nr_triggers));
+ INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
+
+ /* Init rtpoll trigger-related members */
+ atomic_set(&group->rtpoll_scheduled, 0);
+ mutex_init(&group->rtpoll_trigger_lock);
+ INIT_LIST_HEAD(&group->rtpoll_triggers);
+ group->rtpoll_min_period = U32_MAX;
+ group->rtpoll_next_update = ULLONG_MAX;
+ init_waitqueue_head(&group->rtpoll_wait);
+ timer_setup(&group->rtpoll_timer, poll_timer_fn, 0);
+ rcu_assign_pointer(group->rtpoll_task, NULL);
+}
+
+void __init psi_init(void)
+{
+ if (!psi_enable) {
+ static_branch_enable(&psi_disabled);
+ static_branch_disable(&psi_cgroups_enabled);
+ return;
+ }
+
+ if (!cgroup_psi_enabled())
+ static_branch_disable(&psi_cgroups_enabled);
+
+ psi_period = jiffies_to_nsecs(PSI_FREQ);
+ group_init(&psi_system);
+}
+
+static u32 test_states(unsigned int *tasks, u32 state_mask)
+{
+ const bool oncpu = state_mask & PSI_ONCPU;
+
+ if (tasks[NR_IOWAIT]) {
+ state_mask |= BIT(PSI_IO_SOME);
+ if (!tasks[NR_RUNNING])
+ state_mask |= BIT(PSI_IO_FULL);
+ }
+
+ if (tasks[NR_MEMSTALL]) {
+ state_mask |= BIT(PSI_MEM_SOME);
+ if (tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING])
+ state_mask |= BIT(PSI_MEM_FULL);
+ }
+
+ if (tasks[NR_RUNNING] > oncpu)
+ state_mask |= BIT(PSI_CPU_SOME);
+
+ if (tasks[NR_RUNNING] && !oncpu)
+ state_mask |= BIT(PSI_CPU_FULL);
+
+ if (tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] || tasks[NR_RUNNING])
+ state_mask |= BIT(PSI_NONIDLE);
+
+ return state_mask;
+}
+
+static void get_recent_times(struct psi_group *group, int cpu,
+ enum psi_aggregators aggregator, u32 *times,
+ u32 *pchanged_states)
+{
+ struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
+ int current_cpu = raw_smp_processor_id();
+ unsigned int tasks[NR_PSI_TASK_COUNTS];
+ u64 now, state_start;
+ enum psi_states s;
+ unsigned int seq;
+ u32 state_mask;
+
+ *pchanged_states = 0;
+
+ /* Snapshot a coherent view of the CPU state */
+ do {
+ seq = psi_read_begin(cpu);
+ now = cpu_clock(cpu);
+ memcpy(times, groupc->times, sizeof(groupc->times));
+ state_mask = groupc->state_mask;
+ state_start = groupc->state_start;
+ if (cpu == current_cpu)
+ memcpy(tasks, groupc->tasks, sizeof(groupc->tasks));
+ } while (psi_read_retry(cpu, seq));
+
+ /* Calculate state time deltas against the previous snapshot */
+ for (s = 0; s < NR_PSI_STATES; s++) {
+ u32 delta;
+ /*
+ * In addition to already concluded states, we also
+ * incorporate currently active states on the CPU,
+ * since states may last for many sampling periods.
+ *
+ * This way we keep our delta sampling buckets small
+ * (u32) and our reported pressure close to what's
+ * actually happening.
+ */
+ if (state_mask & (1 << s))
+ times[s] += now - state_start;
+
+ delta = times[s] - groupc->times_prev[aggregator][s];
+ groupc->times_prev[aggregator][s] = times[s];
+
+ times[s] = delta;
+ if (delta)
+ *pchanged_states |= (1 << s);
+ }
+
+ /*
+ * When collect_percpu_times() from the avgs_work, we don't want to
+ * re-arm avgs_work when all CPUs are IDLE. But the current CPU running
+ * this avgs_work is never IDLE, cause avgs_work can't be shut off.
+ * So for the current CPU, we need to re-arm avgs_work only when
+ * (NR_RUNNING > 1 || NR_IOWAIT > 0 || NR_MEMSTALL > 0), for other CPUs
+ * we can just check PSI_NONIDLE delta.
+ */
+ if (current_work() == &group->avgs_work.work) {
+ bool reschedule;
+
+ if (cpu == current_cpu)
+ reschedule = tasks[NR_RUNNING] +
+ tasks[NR_IOWAIT] +
+ tasks[NR_MEMSTALL] > 1;
+ else
+ reschedule = *pchanged_states & (1 << PSI_NONIDLE);
+
+ if (reschedule)
+ *pchanged_states |= PSI_STATE_RESCHEDULE;
+ }
+}
+
+static void calc_avgs(unsigned long avg[3], int missed_periods,
+ u64 time, u64 period)
+{
+ unsigned long pct;
+
+ /* Fill in zeroes for periods of no activity */
+ if (missed_periods) {
+ avg[0] = calc_load_n(avg[0], EXP_10s, 0, missed_periods);
+ avg[1] = calc_load_n(avg[1], EXP_60s, 0, missed_periods);
+ avg[2] = calc_load_n(avg[2], EXP_300s, 0, missed_periods);
+ }
+
+ /* Sample the most recent active period */
+ pct = div_u64(time * 100, period);
+ pct *= FIXED_1;
+ avg[0] = calc_load(avg[0], EXP_10s, pct);
+ avg[1] = calc_load(avg[1], EXP_60s, pct);
+ avg[2] = calc_load(avg[2], EXP_300s, pct);
+}
+
+static void collect_percpu_times(struct psi_group *group,
+ enum psi_aggregators aggregator,
+ u32 *pchanged_states)
+{
+ u64 deltas[NR_PSI_STATES - 1] = { 0, };
+ unsigned long nonidle_total = 0;
+ u32 changed_states = 0;
+ int cpu;
+ int s;
+
+ /*
+ * Collect the per-cpu time buckets and average them into a
+ * single time sample that is normalized to wall clock time.
+ *
+ * For averaging, each CPU is weighted by its non-idle time in
+ * the sampling period. This eliminates artifacts from uneven
+ * loading, or even entirely idle CPUs.
+ */
+ for_each_possible_cpu(cpu) {
+ u32 times[NR_PSI_STATES];
+ u32 nonidle;
+ u32 cpu_changed_states;
+
+ get_recent_times(group, cpu, aggregator, times,
+ &cpu_changed_states);
+ changed_states |= cpu_changed_states;
+
+ nonidle = nsecs_to_jiffies(times[PSI_NONIDLE]);
+ nonidle_total += nonidle;
+
+ for (s = 0; s < PSI_NONIDLE; s++)
+ deltas[s] += (u64)times[s] * nonidle;
+ }
+
+ /*
+ * Integrate the sample into the running statistics that are
+ * reported to userspace: the cumulative stall times and the
+ * decaying averages.
+ *
+ * Pressure percentages are sampled at PSI_FREQ. We might be
+ * called more often when the user polls more frequently than
+ * that; we might be called less often when there is no task
+ * activity, thus no data, and clock ticks are sporadic. The
+ * below handles both.
+ */
+
+ /* total= */
+ for (s = 0; s < NR_PSI_STATES - 1; s++)
+ group->total[aggregator][s] +=
+ div_u64(deltas[s], max(nonidle_total, 1UL));
+
+ if (pchanged_states)
+ *pchanged_states = changed_states;
+}
+
+/* Trigger tracking window manipulations */
+static void window_reset(struct psi_window *win, u64 now, u64 value,
+ u64 prev_growth)
+{
+ win->start_time = now;
+ win->start_value = value;
+ win->prev_growth = prev_growth;
+}
+
+/*
+ * PSI growth tracking window update and growth calculation routine.
+ *
+ * This approximates a sliding tracking window by interpolating
+ * partially elapsed windows using historical growth data from the
+ * previous intervals. This minimizes memory requirements (by not storing
+ * all the intermediate values in the previous window) and simplifies
+ * the calculations. It works well because PSI signal changes only in
+ * positive direction and over relatively small window sizes the growth
+ * is close to linear.
+ */
+static u64 window_update(struct psi_window *win, u64 now, u64 value)
+{
+ u64 elapsed;
+ u64 growth;
+
+ elapsed = now - win->start_time;
+ growth = value - win->start_value;
+ /*
+ * After each tracking window passes win->start_value and
+ * win->start_time get reset and win->prev_growth stores
+ * the average per-window growth of the previous window.
+ * win->prev_growth is then used to interpolate additional
+ * growth from the previous window assuming it was linear.
+ */
+ if (elapsed > win->size)
+ window_reset(win, now, value, growth);
+ else {
+ u32 remaining;
+
+ remaining = win->size - elapsed;
+ growth += div64_u64(win->prev_growth * remaining, win->size);
+ }
+
+ return growth;
+}
+
+static void update_triggers(struct psi_group *group, u64 now,
+ enum psi_aggregators aggregator)
+{
+ struct psi_trigger *t;
+ u64 *total = group->total[aggregator];
+ struct list_head *triggers;
+ u64 *aggregator_total;
+
+ if (aggregator == PSI_AVGS) {
+ triggers = &group->avg_triggers;
+ aggregator_total = group->avg_total;
+ } else {
+ triggers = &group->rtpoll_triggers;
+ aggregator_total = group->rtpoll_total;
+ }
+
+ /*
+ * On subsequent updates, calculate growth deltas and let
+ * watchers know when their specified thresholds are exceeded.
+ */
+ list_for_each_entry(t, triggers, node) {
+ u64 growth;
+ bool new_stall;
+
+ new_stall = aggregator_total[t->state] != total[t->state];
+
+ /* Check for stall activity or a previous threshold breach */
+ if (!new_stall && !t->pending_event)
+ continue;
+ /*
+ * Check for new stall activity, as well as deferred
+ * events that occurred in the last window after the
+ * trigger had already fired (we want to ratelimit
+ * events without dropping any).
+ */
+ if (new_stall) {
+ /* Calculate growth since last update */
+ growth = window_update(&t->win, now, total[t->state]);
+ if (!t->pending_event) {
+ if (growth < t->threshold)
+ continue;
+
+ t->pending_event = true;
+ }
+ }
+ /* Limit event signaling to once per window */
+ if (now < t->last_event_time + t->win.size)
+ continue;
+
+ /* Generate an event */
+ if (cmpxchg(&t->event, 0, 1) == 0) {
+ if (t->of)
+ kernfs_notify(t->of->kn);
+ else
+ wake_up_interruptible(&t->event_wait);
+ }
+ t->last_event_time = now;
+ /* Reset threshold breach flag once event got generated */
+ t->pending_event = false;
+ }
+}
+
+static u64 update_averages(struct psi_group *group, u64 now)
+{
+ unsigned long missed_periods = 0;
+ u64 expires, period;
+ u64 avg_next_update;
+ int s;
+
+ /* avgX= */
+ expires = group->avg_next_update;
+ if (now - expires >= psi_period)
+ missed_periods = div_u64(now - expires, psi_period);
+
+ /*
+ * The periodic clock tick can get delayed for various
+ * reasons, especially on loaded systems. To avoid clock
+ * drift, we schedule the clock in fixed psi_period intervals.
+ * But the deltas we sample out of the per-cpu buckets above
+ * are based on the actual time elapsing between clock ticks.
+ */
+ avg_next_update = expires + ((1 + missed_periods) * psi_period);
+ period = now - (group->avg_last_update + (missed_periods * psi_period));
+ group->avg_last_update = now;
+
+ for (s = 0; s < NR_PSI_STATES - 1; s++) {
+ u32 sample;
+
+ sample = group->total[PSI_AVGS][s] - group->avg_total[s];
+ /*
+ * Due to the lockless sampling of the time buckets,
+ * recorded time deltas can slip into the next period,
+ * which under full pressure can result in samples in
+ * excess of the period length.
+ *
+ * We don't want to report non-sensical pressures in
+ * excess of 100%, nor do we want to drop such events
+ * on the floor. Instead we punt any overage into the
+ * future until pressure subsides. By doing this we
+ * don't underreport the occurring pressure curve, we
+ * just report it delayed by one period length.
+ *
+ * The error isn't cumulative. As soon as another
+ * delta slips from a period P to P+1, by definition
+ * it frees up its time T in P.
+ */
+ if (sample > period)
+ sample = period;
+ group->avg_total[s] += sample;
+ calc_avgs(group->avg[s], missed_periods, sample, period);
+ }
+
+ return avg_next_update;
+}
+
+static void psi_avgs_work(struct work_struct *work)
+{
+ struct delayed_work *dwork;
+ struct psi_group *group;
+ u32 changed_states;
+ u64 now;
+
+ dwork = to_delayed_work(work);
+ group = container_of(dwork, struct psi_group, avgs_work);
+
+ mutex_lock(&group->avgs_lock);
+
+ now = sched_clock();
+
+ collect_percpu_times(group, PSI_AVGS, &changed_states);
+ /*
+ * If there is task activity, periodically fold the per-cpu
+ * times and feed samples into the running averages. If things
+ * are idle and there is no data to process, stop the clock.
+ * Once restarted, we'll catch up the running averages in one
+ * go - see calc_avgs() and missed_periods.
+ */
+ if (now >= group->avg_next_update) {
+ update_triggers(group, now, PSI_AVGS);
+ group->avg_next_update = update_averages(group, now);
+ }
+
+ if (changed_states & PSI_STATE_RESCHEDULE) {
+ schedule_delayed_work(dwork, nsecs_to_jiffies(
+ group->avg_next_update - now) + 1);
+ }
+
+ mutex_unlock(&group->avgs_lock);
+}
+
+static void init_rtpoll_triggers(struct psi_group *group, u64 now)
+{
+ struct psi_trigger *t;
+
+ list_for_each_entry(t, &group->rtpoll_triggers, node)
+ window_reset(&t->win, now,
+ group->total[PSI_POLL][t->state], 0);
+ memcpy(group->rtpoll_total, group->total[PSI_POLL],
+ sizeof(group->rtpoll_total));
+ group->rtpoll_next_update = now + group->rtpoll_min_period;
+}
+
+/* Schedule rtpolling if it's not already scheduled or forced. */
+static void psi_schedule_rtpoll_work(struct psi_group *group, unsigned long delay,
+ bool force)
+{
+ struct task_struct *task;
+
+ /*
+ * atomic_xchg should be called even when !force to provide a
+ * full memory barrier (see the comment inside psi_rtpoll_work).
+ */
+ if (atomic_xchg(&group->rtpoll_scheduled, 1) && !force)
+ return;
+
+ rcu_read_lock();
+
+ task = rcu_dereference(group->rtpoll_task);
+ /*
+ * kworker might be NULL in case psi_trigger_destroy races with
+ * psi_task_change (hotpath) which can't use locks
+ */
+ if (likely(task))
+ mod_timer(&group->rtpoll_timer, jiffies + delay);
+ else
+ atomic_set(&group->rtpoll_scheduled, 0);
+
+ rcu_read_unlock();
+}
+
+static void psi_rtpoll_work(struct psi_group *group)
+{
+ bool force_reschedule = false;
+ u32 changed_states;
+ u64 now;
+
+ mutex_lock(&group->rtpoll_trigger_lock);
+
+ now = sched_clock();
+
+ if (now > group->rtpoll_until) {
+ /*
+ * We are either about to start or might stop rtpolling if no
+ * state change was recorded. Resetting rtpoll_scheduled leaves
+ * a small window for psi_group_change to sneak in and schedule
+ * an immediate rtpoll_work before we get to rescheduling. One
+ * potential extra wakeup at the end of the rtpolling window
+ * should be negligible and rtpoll_next_update still keeps
+ * updates correctly on schedule.
+ */
+ atomic_set(&group->rtpoll_scheduled, 0);
+ /*
+ * A task change can race with the rtpoll worker that is supposed to
+ * report on it. To avoid missing events, ensure ordering between
+ * rtpoll_scheduled and the task state accesses, such that if the
+ * rtpoll worker misses the state update, the task change is
+ * guaranteed to reschedule the rtpoll worker:
+ *
+ * rtpoll worker:
+ * atomic_set(rtpoll_scheduled, 0)
+ * smp_mb()
+ * LOAD states
+ *
+ * task change:
+ * STORE states
+ * if atomic_xchg(rtpoll_scheduled, 1) == 0:
+ * schedule rtpoll worker
+ *
+ * The atomic_xchg() implies a full barrier.
+ */
+ smp_mb();
+ } else {
+ /* The rtpolling window is not over, keep rescheduling */
+ force_reschedule = true;
+ }
+
+
+ collect_percpu_times(group, PSI_POLL, &changed_states);
+
+ if (changed_states & group->rtpoll_states) {
+ /* Initialize trigger windows when entering rtpolling mode */
+ if (now > group->rtpoll_until)
+ init_rtpoll_triggers(group, now);
+
+ /*
+ * Keep the monitor active for at least the duration of the
+ * minimum tracking window as long as monitor states are
+ * changing.
+ */
+ group->rtpoll_until = now +
+ group->rtpoll_min_period * UPDATES_PER_WINDOW;
+ }
+
+ if (now > group->rtpoll_until) {
+ group->rtpoll_next_update = ULLONG_MAX;
+ goto out;
+ }
+
+ if (now >= group->rtpoll_next_update) {
+ if (changed_states & group->rtpoll_states) {
+ update_triggers(group, now, PSI_POLL);
+ memcpy(group->rtpoll_total, group->total[PSI_POLL],
+ sizeof(group->rtpoll_total));
+ }
+ group->rtpoll_next_update = now + group->rtpoll_min_period;
+ }
+
+ psi_schedule_rtpoll_work(group,
+ nsecs_to_jiffies(group->rtpoll_next_update - now) + 1,
+ force_reschedule);
+
+out:
+ mutex_unlock(&group->rtpoll_trigger_lock);
+}
+
+static int psi_rtpoll_worker(void *data)
+{
+ struct psi_group *group = (struct psi_group *)data;
+
+ sched_set_fifo_low(current);
+
+ while (true) {
+ wait_event_interruptible(group->rtpoll_wait,
+ atomic_cmpxchg(&group->rtpoll_wakeup, 1, 0) ||
+ kthread_should_stop());
+ if (kthread_should_stop())
+ break;
+
+ psi_rtpoll_work(group);
+ }
+ return 0;
+}
+
+static void poll_timer_fn(struct timer_list *t)
+{
+ struct psi_group *group = timer_container_of(group, t, rtpoll_timer);
+
+ atomic_set(&group->rtpoll_wakeup, 1);
+ wake_up_interruptible(&group->rtpoll_wait);
+}
+
+static void record_times(struct psi_group_cpu *groupc, u64 now)
+{
+ u32 delta;
+
+ delta = now - groupc->state_start;
+ groupc->state_start = now;
+
+ if (groupc->state_mask & (1 << PSI_IO_SOME)) {
+ groupc->times[PSI_IO_SOME] += delta;
+ if (groupc->state_mask & (1 << PSI_IO_FULL))
+ groupc->times[PSI_IO_FULL] += delta;
+ }
+
+ if (groupc->state_mask & (1 << PSI_MEM_SOME)) {
+ groupc->times[PSI_MEM_SOME] += delta;
+ if (groupc->state_mask & (1 << PSI_MEM_FULL))
+ groupc->times[PSI_MEM_FULL] += delta;
+ }
+
+ if (groupc->state_mask & (1 << PSI_CPU_SOME)) {
+ groupc->times[PSI_CPU_SOME] += delta;
+ if (groupc->state_mask & (1 << PSI_CPU_FULL))
+ groupc->times[PSI_CPU_FULL] += delta;
+ }
+
+ if (groupc->state_mask & (1 << PSI_NONIDLE))
+ groupc->times[PSI_NONIDLE] += delta;
+}
+
+#define for_each_group(iter, group) \
+ for (typeof(group) iter = group; iter; iter = iter->parent)
+
+static void psi_group_change(struct psi_group *group, int cpu,
+ unsigned int clear, unsigned int set,
+ u64 now, bool wake_clock)
+{
+ struct psi_group_cpu *groupc;
+ unsigned int t, m;
+ u32 state_mask;
+
+ lockdep_assert_rq_held(cpu_rq(cpu));
+ groupc = per_cpu_ptr(group->pcpu, cpu);
+
+ /*
+ * Start with TSK_ONCPU, which doesn't have a corresponding
+ * task count - it's just a boolean flag directly encoded in
+ * the state mask. Clear, set, or carry the current state if
+ * no changes are requested.
+ */
+ if (unlikely(clear & TSK_ONCPU)) {
+ state_mask = 0;
+ clear &= ~TSK_ONCPU;
+ } else if (unlikely(set & TSK_ONCPU)) {
+ state_mask = PSI_ONCPU;
+ set &= ~TSK_ONCPU;
+ } else {
+ state_mask = groupc->state_mask & PSI_ONCPU;
+ }
+
+ /*
+ * The rest of the state mask is calculated based on the task
+ * counts. Update those first, then construct the mask.
+ */
+ for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
+ if (!(m & (1 << t)))
+ continue;
+ if (groupc->tasks[t]) {
+ groupc->tasks[t]--;
+ } else if (!psi_bug) {
+ printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
+ cpu, t, groupc->tasks[0],
+ groupc->tasks[1], groupc->tasks[2],
+ groupc->tasks[3], clear, set);
+ psi_bug = 1;
+ }
+ }
+
+ for (t = 0; set; set &= ~(1 << t), t++)
+ if (set & (1 << t))
+ groupc->tasks[t]++;
+
+ if (!group->enabled) {
+ /*
+ * On the first group change after disabling PSI, conclude
+ * the current state and flush its time. This is unlikely
+ * to matter to the user, but aggregation (get_recent_times)
+ * may have already incorporated the live state into times_prev;
+ * avoid a delta sample underflow when PSI is later re-enabled.
+ */
+ if (unlikely(groupc->state_mask & (1 << PSI_NONIDLE)))
+ record_times(groupc, now);
+
+ groupc->state_mask = state_mask;
+
+ return;
+ }
+
+ state_mask = test_states(groupc->tasks, state_mask);
+
+ /*
+ * Since we care about lost potential, a memstall is FULL
+ * when there are no other working tasks, but also when
+ * the CPU is actively reclaiming and nothing productive
+ * could run even if it were runnable. So when the current
+ * task in a cgroup is in_memstall, the corresponding groupc
+ * on that cpu is in PSI_MEM_FULL state.
+ */
+ if (unlikely((state_mask & PSI_ONCPU) && cpu_curr(cpu)->in_memstall))
+ state_mask |= (1 << PSI_MEM_FULL);
+
+ record_times(groupc, now);
+
+ groupc->state_mask = state_mask;
+
+ if (state_mask & group->rtpoll_states)
+ psi_schedule_rtpoll_work(group, 1, false);
+
+ if (wake_clock && !delayed_work_pending(&group->avgs_work))
+ schedule_delayed_work(&group->avgs_work, PSI_FREQ);
+}
+
+static inline struct psi_group *task_psi_group(struct task_struct *task)
+{
+#ifdef CONFIG_CGROUPS
+ if (static_branch_likely(&psi_cgroups_enabled))
+ return cgroup_psi(task_dfl_cgroup(task));
+#endif
+ return &psi_system;
+}
+
+static void psi_flags_change(struct task_struct *task, int clear, int set)
+{
+ if (((task->psi_flags & set) ||
+ (task->psi_flags & clear) != clear) &&
+ !psi_bug) {
+ printk_deferred(KERN_ERR "psi: inconsistent task state! task=%d:%s cpu=%d psi_flags=%x clear=%x set=%x\n",
+ task->pid, task->comm, task_cpu(task),
+ task->psi_flags, clear, set);
+ psi_bug = 1;
+ }
+
+ task->psi_flags &= ~clear;
+ task->psi_flags |= set;
+}
+
+void psi_task_change(struct task_struct *task, int clear, int set)
+{
+ int cpu = task_cpu(task);
+ u64 now;
+
+ if (!task->pid)
+ return;
+
+ psi_flags_change(task, clear, set);
+
+ psi_write_begin(cpu);
+ now = cpu_clock(cpu);
+ for_each_group(group, task_psi_group(task))
+ psi_group_change(group, cpu, clear, set, now, true);
+ psi_write_end(cpu);
+}
+
+void psi_task_switch(struct task_struct *prev, struct task_struct *next,
+ bool sleep)
+{
+ struct psi_group *common = NULL;
+ int cpu = task_cpu(prev);
+ u64 now;
+
+ psi_write_begin(cpu);
+ now = cpu_clock(cpu);
+
+ if (next->pid) {
+ psi_flags_change(next, 0, TSK_ONCPU);
+ /*
+ * Set TSK_ONCPU on @next's cgroups. If @next shares any
+ * ancestors with @prev, those will already have @prev's
+ * TSK_ONCPU bit set, and we can stop the iteration there.
+ */
+ for_each_group(group, task_psi_group(next)) {
+ struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
+
+ if (groupc->state_mask & PSI_ONCPU) {
+ common = group;
+ break;
+ }
+ psi_group_change(group, cpu, 0, TSK_ONCPU, now, true);
+ }
+ }
+
+ if (prev->pid) {
+ int clear = TSK_ONCPU, set = 0;
+ bool wake_clock = true;
+
+ /*
+ * When we're going to sleep, psi_dequeue() lets us
+ * handle TSK_RUNNING, TSK_MEMSTALL_RUNNING and
+ * TSK_IOWAIT here, where we can combine it with
+ * TSK_ONCPU and save walking common ancestors twice.
+ */
+ if (sleep) {
+ clear |= TSK_RUNNING;
+ if (prev->in_memstall)
+ clear |= TSK_MEMSTALL_RUNNING;
+ if (prev->in_iowait)
+ set |= TSK_IOWAIT;
+
+ /*
+ * Periodic aggregation shuts off if there is a period of no
+ * task changes, so we wake it back up if necessary. However,
+ * don't do this if the task change is the aggregation worker
+ * itself going to sleep, or we'll ping-pong forever.
+ */
+ if (unlikely((prev->flags & PF_WQ_WORKER) &&
+ wq_worker_last_func(prev) == psi_avgs_work))
+ wake_clock = false;
+ }
+
+ psi_flags_change(prev, clear, set);
+
+ for_each_group(group, task_psi_group(prev)) {
+ if (group == common)
+ break;
+ psi_group_change(group, cpu, clear, set, now, wake_clock);
+ }
+
+ /*
+ * TSK_ONCPU is handled up to the common ancestor. If there are
+ * any other differences between the two tasks (e.g. prev goes
+ * to sleep, or only one task is memstall), finish propagating
+ * those differences all the way up to the root.
+ */
+ if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) {
+ clear &= ~TSK_ONCPU;
+ for_each_group(group, common)
+ psi_group_change(group, cpu, clear, set, now, wake_clock);
+ }
+ }
+ psi_write_end(cpu);
+}
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev)
+{
+ int cpu = task_cpu(curr);
+ struct psi_group_cpu *groupc;
+ s64 delta;
+ u64 irq;
+ u64 now;
+
+ if (static_branch_likely(&psi_disabled) || !irqtime_enabled())
+ return;
+
+ if (!curr->pid)
+ return;
+
+ lockdep_assert_rq_held(rq);
+ if (prev && task_psi_group(prev) == task_psi_group(curr))
+ return;
+
+ irq = irq_time_read(cpu);
+ delta = (s64)(irq - rq->psi_irq_time);
+ if (delta < 0)
+ return;
+ rq->psi_irq_time = irq;
+
+ psi_write_begin(cpu);
+ now = cpu_clock(cpu);
+
+ for_each_group(group, task_psi_group(curr)) {
+ if (!group->enabled)
+ continue;
+
+ groupc = per_cpu_ptr(group->pcpu, cpu);
+
+ record_times(groupc, now);
+ groupc->times[PSI_IRQ_FULL] += delta;
+
+ if (group->rtpoll_states & (1 << PSI_IRQ_FULL))
+ psi_schedule_rtpoll_work(group, 1, false);
+ }
+ psi_write_end(cpu);
+}
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+/**
+ * psi_memstall_enter - mark the beginning of a memory stall section
+ * @flags: flags to handle nested sections
+ *
+ * Marks the calling task as being stalled due to a lack of memory,
+ * such as waiting for a refault or performing reclaim.
+ */
+void psi_memstall_enter(unsigned long *flags)
+{
+ struct rq_flags rf;
+ struct rq *rq;
+
+ if (static_branch_likely(&psi_disabled))
+ return;
+
+ *flags = current->in_memstall;
+ if (*flags)
+ return;
+ /*
+ * in_memstall setting & accounting needs to be atomic wrt
+ * changes to the task's scheduling state, otherwise we can
+ * race with CPU migration.
+ */
+ rq = this_rq_lock_irq(&rf);
+
+ current->in_memstall = 1;
+ psi_task_change(current, 0, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING);
+
+ rq_unlock_irq(rq, &rf);
+}
+EXPORT_SYMBOL_GPL(psi_memstall_enter);
+
+/**
+ * psi_memstall_leave - mark the end of an memory stall section
+ * @flags: flags to handle nested memdelay sections
+ *
+ * Marks the calling task as no longer stalled due to lack of memory.
+ */
+void psi_memstall_leave(unsigned long *flags)
+{
+ struct rq_flags rf;
+ struct rq *rq;
+
+ if (static_branch_likely(&psi_disabled))
+ return;
+
+ if (*flags)
+ return;
+ /*
+ * in_memstall clearing & accounting needs to be atomic wrt
+ * changes to the task's scheduling state, otherwise we could
+ * race with CPU migration.
+ */
+ rq = this_rq_lock_irq(&rf);
+
+ current->in_memstall = 0;
+ psi_task_change(current, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING, 0);
+
+ rq_unlock_irq(rq, &rf);
+}
+EXPORT_SYMBOL_GPL(psi_memstall_leave);
+
+#ifdef CONFIG_CGROUPS
+int psi_cgroup_alloc(struct cgroup *cgroup)
+{
+ if (!static_branch_likely(&psi_cgroups_enabled))
+ return 0;
+
+ cgroup->psi = kzalloc(sizeof(struct psi_group), GFP_KERNEL);
+ if (!cgroup->psi)
+ return -ENOMEM;
+
+ cgroup->psi->pcpu = alloc_percpu(struct psi_group_cpu);
+ if (!cgroup->psi->pcpu) {
+ kfree(cgroup->psi);
+ return -ENOMEM;
+ }
+ group_init(cgroup->psi);
+ cgroup->psi->parent = cgroup_psi(cgroup_parent(cgroup));
+ return 0;
+}
+
+void psi_cgroup_free(struct cgroup *cgroup)
+{
+ if (!static_branch_likely(&psi_cgroups_enabled))
+ return;
+
+ cancel_delayed_work_sync(&cgroup->psi->avgs_work);
+ free_percpu(cgroup->psi->pcpu);
+ /* All triggers must be removed by now */
+ WARN_ONCE(cgroup->psi->rtpoll_states, "psi: trigger leak\n");
+ kfree(cgroup->psi);
+}
+
+/**
+ * cgroup_move_task - move task to a different cgroup
+ * @task: the task
+ * @to: the target css_set
+ *
+ * Move task to a new cgroup and safely migrate its associated stall
+ * state between the different groups.
+ *
+ * This function acquires the task's rq lock to lock out concurrent
+ * changes to the task's scheduling state and - in case the task is
+ * running - concurrent changes to its stall state.
+ */
+void cgroup_move_task(struct task_struct *task, struct css_set *to)
+{
+ unsigned int task_flags;
+ struct rq_flags rf;
+ struct rq *rq;
+
+ if (!static_branch_likely(&psi_cgroups_enabled)) {
+ /*
+ * Lame to do this here, but the scheduler cannot be locked
+ * from the outside, so we move cgroups from inside sched/.
+ */
+ rcu_assign_pointer(task->cgroups, to);
+ return;
+ }
+
+ rq = task_rq_lock(task, &rf);
+
+ /*
+ * We may race with schedule() dropping the rq lock between
+ * deactivating prev and switching to next. Because the psi
+ * updates from the deactivation are deferred to the switch
+ * callback to save cgroup tree updates, the task's scheduling
+ * state here is not coherent with its psi state:
+ *
+ * schedule() cgroup_move_task()
+ * rq_lock()
+ * deactivate_task()
+ * p->on_rq = 0
+ * psi_dequeue() // defers TSK_RUNNING & TSK_IOWAIT updates
+ * pick_next_task()
+ * rq_unlock()
+ * rq_lock()
+ * psi_task_change() // old cgroup
+ * task->cgroups = to
+ * psi_task_change() // new cgroup
+ * rq_unlock()
+ * rq_lock()
+ * psi_sched_switch() // does deferred updates in new cgroup
+ *
+ * Don't rely on the scheduling state. Use psi_flags instead.
+ */
+ task_flags = task->psi_flags;
+
+ if (task_flags)
+ psi_task_change(task, task_flags, 0);
+
+ /* See comment above */
+ rcu_assign_pointer(task->cgroups, to);
+
+ if (task_flags)
+ psi_task_change(task, 0, task_flags);
+
+ task_rq_unlock(rq, task, &rf);
+}
+
+void psi_cgroup_restart(struct psi_group *group)
+{
+ int cpu;
+
+ /*
+ * After we disable psi_group->enabled, we don't actually
+ * stop percpu tasks accounting in each psi_group_cpu,
+ * instead only stop test_states() loop, record_times()
+ * and averaging worker, see psi_group_change() for details.
+ *
+ * When disable cgroup PSI, this function has nothing to sync
+ * since cgroup pressure files are hidden and percpu psi_group_cpu
+ * would see !psi_group->enabled and only do task accounting.
+ *
+ * When re-enable cgroup PSI, this function use psi_group_change()
+ * to get correct state mask from test_states() loop on tasks[],
+ * and restart groupc->state_start from now, use .clear = .set = 0
+ * here since no task status really changed.
+ */
+ if (!group->enabled)
+ return;
+
+ for_each_possible_cpu(cpu) {
+ u64 now;
+
+ guard(rq_lock_irq)(cpu_rq(cpu));
+
+ psi_write_begin(cpu);
+ now = cpu_clock(cpu);
+ psi_group_change(group, cpu, 0, 0, now, true);
+ psi_write_end(cpu);
+ }
+}
+#endif /* CONFIG_CGROUPS */
+
+int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
+{
+ bool only_full = false;
+ int full;
+ u64 now;
+
+ if (static_branch_likely(&psi_disabled))
+ return -EOPNOTSUPP;
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ if (!irqtime_enabled() && res == PSI_IRQ)
+ return -EOPNOTSUPP;
+#endif
+
+ /* Update averages before reporting them */
+ mutex_lock(&group->avgs_lock);
+ now = sched_clock();
+ collect_percpu_times(group, PSI_AVGS, NULL);
+ if (now >= group->avg_next_update)
+ group->avg_next_update = update_averages(group, now);
+ mutex_unlock(&group->avgs_lock);
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ only_full = res == PSI_IRQ;
+#endif
+
+ for (full = 0; full < 2 - only_full; full++) {
+ unsigned long avg[3] = { 0, };
+ u64 total = 0;
+ int w;
+
+ /* CPU FULL is undefined at the system level */
+ if (!(group == &psi_system && res == PSI_CPU && full)) {
+ for (w = 0; w < 3; w++)
+ avg[w] = group->avg[res * 2 + full][w];
+ total = div_u64(group->total[PSI_AVGS][res * 2 + full],
+ NSEC_PER_USEC);
+ }
+
+ seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n",
+ full || only_full ? "full" : "some",
+ LOAD_INT(avg[0]), LOAD_FRAC(avg[0]),
+ LOAD_INT(avg[1]), LOAD_FRAC(avg[1]),
+ LOAD_INT(avg[2]), LOAD_FRAC(avg[2]),
+ total);
+ }
+
+ return 0;
+}
+
+struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf,
+ enum psi_res res, struct file *file,
+ struct kernfs_open_file *of)
+{
+ struct psi_trigger *t;
+ enum psi_states state;
+ u32 threshold_us;
+ bool privileged;
+ u32 window_us;
+
+ if (static_branch_likely(&psi_disabled))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ /*
+ * Checking the privilege here on file->f_cred implies that a privileged user
+ * could open the file and delegate the write to an unprivileged one.
+ */
+ privileged = cap_raised(file->f_cred->cap_effective, CAP_SYS_RESOURCE);
+
+ if (sscanf(buf, "some %u %u", &threshold_us, &window_us) == 2)
+ state = PSI_IO_SOME + res * 2;
+ else if (sscanf(buf, "full %u %u", &threshold_us, &window_us) == 2)
+ state = PSI_IO_FULL + res * 2;
+ else
+ return ERR_PTR(-EINVAL);
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ if (res == PSI_IRQ && --state != PSI_IRQ_FULL)
+ return ERR_PTR(-EINVAL);
+#endif
+
+ if (state >= PSI_NONIDLE)
+ return ERR_PTR(-EINVAL);
+
+ if (window_us == 0 || window_us > WINDOW_MAX_US)
+ return ERR_PTR(-EINVAL);
+
+ /*
+ * Unprivileged users can only use 2s windows so that averages aggregation
+ * work is used, and no RT threads need to be spawned.
+ */
+ if (!privileged && window_us % 2000000)
+ return ERR_PTR(-EINVAL);
+
+ /* Check threshold */
+ if (threshold_us == 0 || threshold_us > window_us)
+ return ERR_PTR(-EINVAL);
+
+ t = kmalloc(sizeof(*t), GFP_KERNEL);
+ if (!t)
+ return ERR_PTR(-ENOMEM);
+
+ t->group = group;
+ t->state = state;
+ t->threshold = threshold_us * NSEC_PER_USEC;
+ t->win.size = window_us * NSEC_PER_USEC;
+ window_reset(&t->win, sched_clock(),
+ group->total[PSI_POLL][t->state], 0);
+
+ t->event = 0;
+ t->last_event_time = 0;
+ t->of = of;
+ if (!of)
+ init_waitqueue_head(&t->event_wait);
+ t->pending_event = false;
+ t->aggregator = privileged ? PSI_POLL : PSI_AVGS;
+
+ if (privileged) {
+ mutex_lock(&group->rtpoll_trigger_lock);
+
+ if (!rcu_access_pointer(group->rtpoll_task)) {
+ struct task_struct *task;
+
+ task = kthread_create(psi_rtpoll_worker, group, "psimon");
+ if (IS_ERR(task)) {
+ kfree(t);
+ mutex_unlock(&group->rtpoll_trigger_lock);
+ return ERR_CAST(task);
+ }
+ atomic_set(&group->rtpoll_wakeup, 0);
+ wake_up_process(task);
+ rcu_assign_pointer(group->rtpoll_task, task);
+ }
+
+ list_add(&t->node, &group->rtpoll_triggers);
+ group->rtpoll_min_period = min(group->rtpoll_min_period,
+ div_u64(t->win.size, UPDATES_PER_WINDOW));
+ group->rtpoll_nr_triggers[t->state]++;
+ group->rtpoll_states |= (1 << t->state);
+
+ mutex_unlock(&group->rtpoll_trigger_lock);
+ } else {
+ mutex_lock(&group->avgs_lock);
+
+ list_add(&t->node, &group->avg_triggers);
+ group->avg_nr_triggers[t->state]++;
+
+ mutex_unlock(&group->avgs_lock);
+ }
+ return t;
+}
+
+void psi_trigger_destroy(struct psi_trigger *t)
+{
+ struct psi_group *group;
+ struct task_struct *task_to_destroy = NULL;
+
+ /*
+ * We do not check psi_disabled since it might have been disabled after
+ * the trigger got created.
+ */
+ if (!t)
+ return;
+
+ group = t->group;
+ /*
+ * Wakeup waiters to stop polling and clear the queue to prevent it from
+ * being accessed later. Can happen if cgroup is deleted from under a
+ * polling process.
+ */
+ if (t->of)
+ kernfs_notify(t->of->kn);
+ else
+ wake_up_interruptible(&t->event_wait);
+
+ if (t->aggregator == PSI_AVGS) {
+ mutex_lock(&group->avgs_lock);
+ if (!list_empty(&t->node)) {
+ list_del(&t->node);
+ group->avg_nr_triggers[t->state]--;
+ }
+ mutex_unlock(&group->avgs_lock);
+ } else {
+ mutex_lock(&group->rtpoll_trigger_lock);
+ if (!list_empty(&t->node)) {
+ struct psi_trigger *tmp;
+ u64 period = ULLONG_MAX;
+
+ list_del(&t->node);
+ group->rtpoll_nr_triggers[t->state]--;
+ if (!group->rtpoll_nr_triggers[t->state])
+ group->rtpoll_states &= ~(1 << t->state);
+ /*
+ * Reset min update period for the remaining triggers
+ * iff the destroying trigger had the min window size.
+ */
+ if (group->rtpoll_min_period == div_u64(t->win.size, UPDATES_PER_WINDOW)) {
+ list_for_each_entry(tmp, &group->rtpoll_triggers, node)
+ period = min(period, div_u64(tmp->win.size,
+ UPDATES_PER_WINDOW));
+ group->rtpoll_min_period = period;
+ }
+ /* Destroy rtpoll_task when the last trigger is destroyed */
+ if (group->rtpoll_states == 0) {
+ group->rtpoll_until = 0;
+ task_to_destroy = rcu_dereference_protected(
+ group->rtpoll_task,
+ lockdep_is_held(&group->rtpoll_trigger_lock));
+ rcu_assign_pointer(group->rtpoll_task, NULL);
+ timer_delete(&group->rtpoll_timer);
+ }
+ }
+ mutex_unlock(&group->rtpoll_trigger_lock);
+ }
+
+ /*
+ * Wait for psi_schedule_rtpoll_work RCU to complete its read-side
+ * critical section before destroying the trigger and optionally the
+ * rtpoll_task.
+ */
+ synchronize_rcu();
+ /*
+ * Stop kthread 'psimon' after releasing rtpoll_trigger_lock to prevent
+ * a deadlock while waiting for psi_rtpoll_work to acquire
+ * rtpoll_trigger_lock
+ */
+ if (task_to_destroy) {
+ /*
+ * After the RCU grace period has expired, the worker
+ * can no longer be found through group->rtpoll_task.
+ */
+ kthread_stop(task_to_destroy);
+ atomic_set(&group->rtpoll_scheduled, 0);
+ }
+ kfree(t);
+}
+
+__poll_t psi_trigger_poll(void **trigger_ptr,
+ struct file *file, poll_table *wait)
+{
+ __poll_t ret = DEFAULT_POLLMASK;
+ struct psi_trigger *t;
+
+ if (static_branch_likely(&psi_disabled))
+ return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
+
+ t = smp_load_acquire(trigger_ptr);
+ if (!t)
+ return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
+
+ if (t->of)
+ kernfs_generic_poll(t->of, wait);
+ else
+ poll_wait(file, &t->event_wait, wait);
+
+ if (cmpxchg(&t->event, 1, 0) == 1)
+ ret |= EPOLLPRI;
+
+ return ret;
+}
+
+#ifdef CONFIG_PROC_FS
+static int psi_io_show(struct seq_file *m, void *v)
+{
+ return psi_show(m, &psi_system, PSI_IO);
+}
+
+static int psi_memory_show(struct seq_file *m, void *v)
+{
+ return psi_show(m, &psi_system, PSI_MEM);
+}
+
+static int psi_cpu_show(struct seq_file *m, void *v)
+{
+ return psi_show(m, &psi_system, PSI_CPU);
+}
+
+static int psi_io_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, psi_io_show, NULL);
+}
+
+static int psi_memory_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, psi_memory_show, NULL);
+}
+
+static int psi_cpu_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, psi_cpu_show, NULL);
+}
+
+static ssize_t psi_write(struct file *file, const char __user *user_buf,
+ size_t nbytes, enum psi_res res)
+{
+ char buf[32];
+ size_t buf_size;
+ struct seq_file *seq;
+ struct psi_trigger *new;
+
+ if (static_branch_likely(&psi_disabled))
+ return -EOPNOTSUPP;
+
+ if (!nbytes)
+ return -EINVAL;
+
+ buf_size = min(nbytes, sizeof(buf));
+ if (copy_from_user(buf, user_buf, buf_size))
+ return -EFAULT;
+
+ buf[buf_size - 1] = '\0';
+
+ seq = file->private_data;
+
+ /* Take seq->lock to protect seq->private from concurrent writes */
+ mutex_lock(&seq->lock);
+
+ /* Allow only one trigger per file descriptor */
+ if (seq->private) {
+ mutex_unlock(&seq->lock);
+ return -EBUSY;
+ }
+
+ new = psi_trigger_create(&psi_system, buf, res, file, NULL);
+ if (IS_ERR(new)) {
+ mutex_unlock(&seq->lock);
+ return PTR_ERR(new);
+ }
+
+ smp_store_release(&seq->private, new);
+ mutex_unlock(&seq->lock);
+
+ return nbytes;
+}
+
+static ssize_t psi_io_write(struct file *file, const char __user *user_buf,
+ size_t nbytes, loff_t *ppos)
+{
+ return psi_write(file, user_buf, nbytes, PSI_IO);
+}
+
+static ssize_t psi_memory_write(struct file *file, const char __user *user_buf,
+ size_t nbytes, loff_t *ppos)
+{
+ return psi_write(file, user_buf, nbytes, PSI_MEM);
+}
+
+static ssize_t psi_cpu_write(struct file *file, const char __user *user_buf,
+ size_t nbytes, loff_t *ppos)
+{
+ return psi_write(file, user_buf, nbytes, PSI_CPU);
+}
+
+static __poll_t psi_fop_poll(struct file *file, poll_table *wait)
+{
+ struct seq_file *seq = file->private_data;
+
+ return psi_trigger_poll(&seq->private, file, wait);
+}
+
+static int psi_fop_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq = file->private_data;
+
+ psi_trigger_destroy(seq->private);
+ return single_release(inode, file);
+}
+
+static const struct proc_ops psi_io_proc_ops = {
+ .proc_open = psi_io_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_write = psi_io_write,
+ .proc_poll = psi_fop_poll,
+ .proc_release = psi_fop_release,
+};
+
+static const struct proc_ops psi_memory_proc_ops = {
+ .proc_open = psi_memory_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_write = psi_memory_write,
+ .proc_poll = psi_fop_poll,
+ .proc_release = psi_fop_release,
+};
+
+static const struct proc_ops psi_cpu_proc_ops = {
+ .proc_open = psi_cpu_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_write = psi_cpu_write,
+ .proc_poll = psi_fop_poll,
+ .proc_release = psi_fop_release,
+};
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+static int psi_irq_show(struct seq_file *m, void *v)
+{
+ return psi_show(m, &psi_system, PSI_IRQ);
+}
+
+static int psi_irq_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, psi_irq_show, NULL);
+}
+
+static ssize_t psi_irq_write(struct file *file, const char __user *user_buf,
+ size_t nbytes, loff_t *ppos)
+{
+ return psi_write(file, user_buf, nbytes, PSI_IRQ);
+}
+
+static const struct proc_ops psi_irq_proc_ops = {
+ .proc_open = psi_irq_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_write = psi_irq_write,
+ .proc_poll = psi_fop_poll,
+ .proc_release = psi_fop_release,
+};
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+static int __init psi_proc_init(void)
+{
+ if (psi_enable) {
+ proc_mkdir("pressure", NULL);
+ proc_create("pressure/io", 0666, NULL, &psi_io_proc_ops);
+ proc_create("pressure/memory", 0666, NULL, &psi_memory_proc_ops);
+ proc_create("pressure/cpu", 0666, NULL, &psi_cpu_proc_ops);
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ proc_create("pressure/irq", 0666, NULL, &psi_irq_proc_ops);
+#endif
+ }
+ return 0;
+}
+module_init(psi_proc_init);
+
+#endif /* CONFIG_PROC_FS */
diff --git a/kernel/sched/rq-offsets.c b/kernel/sched/rq-offsets.c
new file mode 100644
index 000000000000..a23747bbe25b
--- /dev/null
+++ b/kernel/sched/rq-offsets.c
@@ -0,0 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0
+#define COMPILE_OFFSETS
+#include <linux/kbuild.h>
+#include <linux/types.h>
+#include "sched.h"
+
+int main(void)
+{
+ DEFINE(RQ_nr_pinned, offsetof(struct rq, nr_pinned));
+
+ return 0;
+}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 575da76a3874..f1867fe8e5c5 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1,36 +1,123 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
* policies)
*/
#include "sched.h"
-
-#include <linux/slab.h>
-#include <linux/irq_work.h>
+#include "pelt.h"
int sched_rr_timeslice = RR_TIMESLICE;
+/* More than 4 hours if BW_SHIFT equals 20. */
+static const u64 max_rt_runtime = MAX_BW;
-static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
+/*
+ * period over which we measure -rt task CPU usage in us.
+ * default: 1s
+ */
+int sysctl_sched_rt_period = 1000000;
+
+/*
+ * part of the period that we allow rt tasks to run in us.
+ * default: 0.95s
+ */
+int sysctl_sched_rt_runtime = 950000;
+
+#ifdef CONFIG_SYSCTL
+static int sysctl_sched_rr_timeslice = (MSEC_PER_SEC * RR_TIMESLICE) / HZ;
+static int sched_rt_handler(const struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos);
+static int sched_rr_handler(const struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos);
+static const struct ctl_table sched_rt_sysctls[] = {
+ {
+ .procname = "sched_rt_period_us",
+ .data = &sysctl_sched_rt_period,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = sched_rt_handler,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = SYSCTL_INT_MAX,
+ },
+ {
+ .procname = "sched_rt_runtime_us",
+ .data = &sysctl_sched_rt_runtime,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = sched_rt_handler,
+ .extra1 = SYSCTL_NEG_ONE,
+ .extra2 = (void *)&sysctl_sched_rt_period,
+ },
+ {
+ .procname = "sched_rr_timeslice_ms",
+ .data = &sysctl_sched_rr_timeslice,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = sched_rr_handler,
+ },
+};
+
+static int __init sched_rt_sysctl_init(void)
+{
+ register_sysctl_init("kernel", sched_rt_sysctls);
+ return 0;
+}
+late_initcall(sched_rt_sysctl_init);
+#endif /* CONFIG_SYSCTL */
+
+void init_rt_rq(struct rt_rq *rt_rq)
+{
+ struct rt_prio_array *array;
+ int i;
-struct rt_bandwidth def_rt_bandwidth;
+ array = &rt_rq->active;
+ for (i = 0; i < MAX_RT_PRIO; i++) {
+ INIT_LIST_HEAD(array->queue + i);
+ __clear_bit(i, array->bitmap);
+ }
+ /* delimiter for bitsearch: */
+ __set_bit(MAX_RT_PRIO, array->bitmap);
+
+ rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
+ rt_rq->highest_prio.next = MAX_RT_PRIO-1;
+ rt_rq->overloaded = 0;
+ plist_head_init(&rt_rq->pushable_tasks);
+ /* We start is dequeued state, because no RT tasks are queued */
+ rt_rq->rt_queued = 0;
+
+#ifdef CONFIG_RT_GROUP_SCHED
+ rt_rq->rt_time = 0;
+ rt_rq->rt_throttled = 0;
+ rt_rq->rt_runtime = 0;
+ raw_spin_lock_init(&rt_rq->rt_runtime_lock);
+ rt_rq->tg = &root_task_group;
+#endif
+}
+
+#ifdef CONFIG_RT_GROUP_SCHED
+
+static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
{
struct rt_bandwidth *rt_b =
container_of(timer, struct rt_bandwidth, rt_period_timer);
- ktime_t now;
- int overrun;
int idle = 0;
+ int overrun;
+ raw_spin_lock(&rt_b->rt_runtime_lock);
for (;;) {
- now = hrtimer_cb_get_time(timer);
- overrun = hrtimer_forward(timer, now, rt_b->rt_period);
-
+ overrun = hrtimer_forward_now(timer, rt_b->rt_period);
if (!overrun)
break;
+ raw_spin_unlock(&rt_b->rt_runtime_lock);
idle = do_sched_rt_period_timer(rt_b, overrun);
+ raw_spin_lock(&rt_b->rt_runtime_lock);
}
+ if (idle)
+ rt_b->rt_period_active = 0;
+ raw_spin_unlock(&rt_b->rt_runtime_lock);
return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}
@@ -42,65 +129,38 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
raw_spin_lock_init(&rt_b->rt_runtime_lock);
- hrtimer_init(&rt_b->rt_period_timer,
- CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- rt_b->rt_period_timer.function = sched_rt_period_timer;
+ hrtimer_setup(&rt_b->rt_period_timer, sched_rt_period_timer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL_HARD);
}
-static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
+static inline void do_start_rt_bandwidth(struct rt_bandwidth *rt_b)
{
- if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
- return;
-
- if (hrtimer_active(&rt_b->rt_period_timer))
- return;
-
raw_spin_lock(&rt_b->rt_runtime_lock);
- start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
+ if (!rt_b->rt_period_active) {
+ rt_b->rt_period_active = 1;
+ /*
+ * SCHED_DEADLINE updates the bandwidth, as a run away
+ * RT task with a DL task could hog a CPU. But DL does
+ * not reset the period. If a deadline task was running
+ * without an RT task running, it can cause RT tasks to
+ * throttle when they start up. Kick the timer right away
+ * to update the period.
+ */
+ hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0));
+ hrtimer_start_expires(&rt_b->rt_period_timer,
+ HRTIMER_MODE_ABS_PINNED_HARD);
+ }
raw_spin_unlock(&rt_b->rt_runtime_lock);
}
-#ifdef CONFIG_SMP
-static void push_irq_work_func(struct irq_work *work);
-#endif
-
-void init_rt_rq(struct rt_rq *rt_rq)
+static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
{
- struct rt_prio_array *array;
- int i;
-
- array = &rt_rq->active;
- for (i = 0; i < MAX_RT_PRIO; i++) {
- INIT_LIST_HEAD(array->queue + i);
- __clear_bit(i, array->bitmap);
- }
- /* delimiter for bitsearch: */
- __set_bit(MAX_RT_PRIO, array->bitmap);
-
-#if defined CONFIG_SMP
- rt_rq->highest_prio.curr = MAX_RT_PRIO;
- rt_rq->highest_prio.next = MAX_RT_PRIO;
- rt_rq->rt_nr_migratory = 0;
- rt_rq->overloaded = 0;
- plist_head_init(&rt_rq->pushable_tasks);
-
-#ifdef HAVE_RT_PUSH_IPI
- rt_rq->push_flags = 0;
- rt_rq->push_cpu = nr_cpu_ids;
- raw_spin_lock_init(&rt_rq->push_lock);
- init_irq_work(&rt_rq->push_work, push_irq_work_func);
-#endif
-#endif /* CONFIG_SMP */
- /* We start is dequeued state, because no RT tasks are queued */
- rt_rq->rt_queued = 0;
+ if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
+ return;
- rt_rq->rt_time = 0;
- rt_rq->rt_throttled = 0;
- rt_rq->rt_runtime = 0;
- raw_spin_lock_init(&rt_rq->rt_runtime_lock);
+ do_start_rt_bandwidth(rt_b);
}
-#ifdef CONFIG_RT_GROUP_SCHED
static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
{
hrtimer_cancel(&rt_b->rt_period_timer);
@@ -110,19 +170,21 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
{
-#ifdef CONFIG_SCHED_DEBUG
WARN_ON_ONCE(!rt_entity_is_task(rt_se));
-#endif
+
return container_of(rt_se, struct task_struct, rt);
}
static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
{
+ /* Cannot fold with non-CONFIG_RT_GROUP_SCHED version, layout */
+ WARN_ON(!rt_group_sched_enabled() && rt_rq->tg != &root_task_group);
return rt_rq->rq;
}
static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
{
+ WARN_ON(!rt_group_sched_enabled() && rt_se->rt_rq->tg != &root_task_group);
return rt_se->rt_rq;
}
@@ -130,15 +192,25 @@ static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
{
struct rt_rq *rt_rq = rt_se->rt_rq;
+ WARN_ON(!rt_group_sched_enabled() && rt_rq->tg != &root_task_group);
return rt_rq->rq;
}
-void free_rt_sched_group(struct task_group *tg)
+void unregister_rt_sched_group(struct task_group *tg)
{
- int i;
+ if (!rt_group_sched_enabled())
+ return;
if (tg->rt_se)
destroy_rt_bandwidth(&tg->rt_bandwidth);
+}
+
+void free_rt_sched_group(struct task_group *tg)
+{
+ int i;
+
+ if (!rt_group_sched_enabled())
+ return;
for_each_possible_cpu(i) {
if (tg->rt_rq)
@@ -157,7 +229,7 @@ void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
{
struct rq *rq = cpu_rq(cpu);
- rt_rq->highest_prio.curr = MAX_RT_PRIO;
+ rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
rt_rq->rt_nr_boosted = 0;
rt_rq->rq = rq;
rt_rq->tg = tg;
@@ -184,15 +256,17 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
struct sched_rt_entity *rt_se;
int i;
- tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
+ if (!rt_group_sched_enabled())
+ return 1;
+
+ tg->rt_rq = kcalloc(nr_cpu_ids, sizeof(rt_rq), GFP_KERNEL);
if (!tg->rt_rq)
goto err;
- tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
+ tg->rt_se = kcalloc(nr_cpu_ids, sizeof(rt_se), GFP_KERNEL);
if (!tg->rt_se)
goto err;
- init_rt_bandwidth(&tg->rt_bandwidth,
- ktime_to_ns(def_rt_bandwidth.rt_period), 0);
+ init_rt_bandwidth(&tg->rt_bandwidth, ktime_to_ns(global_rt_period()), 0);
for_each_possible_cpu(i) {
rt_rq = kzalloc_node(sizeof(struct rt_rq),
@@ -218,7 +292,7 @@ err:
return 0;
}
-#else /* CONFIG_RT_GROUP_SCHED */
+#else /* !CONFIG_RT_GROUP_SCHED: */
#define rt_entity_is_task(rt_se) (1)
@@ -246,22 +320,20 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
return &rq->rt;
}
+void unregister_rt_sched_group(struct task_group *tg) { }
+
void free_rt_sched_group(struct task_group *tg) { }
int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
{
return 1;
}
-#endif /* CONFIG_RT_GROUP_SCHED */
-
-#ifdef CONFIG_SMP
-
-static int pull_rt_task(struct rq *this_rq);
+#endif /* !CONFIG_RT_GROUP_SCHED */
static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
{
/* Try to pull RT tasks here if we lower this rq's prio */
- return rq->rt.highest_prio.curr > prev->prio;
+ return rq->online && rq->rt.highest_prio.curr > prev->prio;
}
static inline int rt_overloaded(struct rq *rq)
@@ -298,65 +370,28 @@ static inline void rt_clear_overload(struct rq *rq)
cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
}
-static void update_rt_migration(struct rt_rq *rt_rq)
+static inline int has_pushable_tasks(struct rq *rq)
{
- if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
- if (!rt_rq->overloaded) {
- rt_set_overload(rq_of_rt_rq(rt_rq));
- rt_rq->overloaded = 1;
- }
- } else if (rt_rq->overloaded) {
- rt_clear_overload(rq_of_rt_rq(rt_rq));
- rt_rq->overloaded = 0;
- }
+ return !plist_head_empty(&rq->rt.pushable_tasks);
}
-static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
- struct task_struct *p;
-
- if (!rt_entity_is_task(rt_se))
- return;
-
- p = rt_task_of(rt_se);
- rt_rq = &rq_of_rt_rq(rt_rq)->rt;
-
- rt_rq->rt_nr_total++;
- if (p->nr_cpus_allowed > 1)
- rt_rq->rt_nr_migratory++;
+static DEFINE_PER_CPU(struct balance_callback, rt_push_head);
+static DEFINE_PER_CPU(struct balance_callback, rt_pull_head);
- update_rt_migration(rt_rq);
-}
+static void push_rt_tasks(struct rq *);
+static void pull_rt_task(struct rq *);
-static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+static inline void rt_queue_push_tasks(struct rq *rq)
{
- struct task_struct *p;
-
- if (!rt_entity_is_task(rt_se))
+ if (!has_pushable_tasks(rq))
return;
- p = rt_task_of(rt_se);
- rt_rq = &rq_of_rt_rq(rt_rq)->rt;
-
- rt_rq->rt_nr_total--;
- if (p->nr_cpus_allowed > 1)
- rt_rq->rt_nr_migratory--;
-
- update_rt_migration(rt_rq);
+ queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks);
}
-static inline int has_pushable_tasks(struct rq *rq)
+static inline void rt_queue_pull_task(struct rq *rq)
{
- return !plist_head_empty(&rq->rt.pushable_tasks);
-}
-
-static inline void set_post_schedule(struct rq *rq)
-{
- /*
- * We detect this state here so that we can avoid taking the RQ
- * lock again later if there is no need to push
- */
- rq->post_schedule = has_pushable_tasks(rq);
+ queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
}
static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
@@ -368,6 +403,11 @@ static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
/* Update the highest prio pushable task */
if (p->prio < rq->rt.highest_prio.next)
rq->rt.highest_prio.next = p->prio;
+
+ if (!rq->rt.overloaded) {
+ rt_set_overload(rq);
+ rq->rt.overloaded = 1;
+ }
}
static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
@@ -379,60 +419,67 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
p = plist_first_entry(&rq->rt.pushable_tasks,
struct task_struct, pushable_tasks);
rq->rt.highest_prio.next = p->prio;
- } else
- rq->rt.highest_prio.next = MAX_RT_PRIO;
-}
-
-#else
-
-static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
-{
-}
+ } else {
+ rq->rt.highest_prio.next = MAX_RT_PRIO-1;
-static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
-{
+ if (rq->rt.overloaded) {
+ rt_clear_overload(rq);
+ rq->rt.overloaded = 0;
+ }
+ }
}
-static inline
-void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
-}
+static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
+static void dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count);
-static inline
-void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+static inline int on_rt_rq(struct sched_rt_entity *rt_se)
{
+ return rt_se->on_rq;
}
-static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
+#ifdef CONFIG_UCLAMP_TASK
+/*
+ * Verify the fitness of task @p to run on @cpu taking into account the uclamp
+ * settings.
+ *
+ * This check is only important for heterogeneous systems where uclamp_min value
+ * is higher than the capacity of a @cpu. For non-heterogeneous system this
+ * function will always return true.
+ *
+ * The function will return true if the capacity of the @cpu is >= the
+ * uclamp_min and false otherwise.
+ *
+ * Note that uclamp_min will be clamped to uclamp_max if uclamp_min
+ * > uclamp_max.
+ */
+static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
{
- return false;
-}
+ unsigned int min_cap;
+ unsigned int max_cap;
+ unsigned int cpu_cap;
-static inline int pull_rt_task(struct rq *this_rq)
-{
- return 0;
-}
+ /* Only heterogeneous systems can benefit from this check */
+ if (!sched_asym_cpucap_active())
+ return true;
-static inline void set_post_schedule(struct rq *rq)
-{
-}
-#endif /* CONFIG_SMP */
+ min_cap = uclamp_eff_value(p, UCLAMP_MIN);
+ max_cap = uclamp_eff_value(p, UCLAMP_MAX);
-static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
-static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
+ cpu_cap = arch_scale_cpu_capacity(cpu);
-static inline int on_rt_rq(struct sched_rt_entity *rt_se)
+ return cpu_cap >= min(min_cap, max_cap);
+}
+#else /* !CONFIG_UCLAMP_TASK: */
+static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
{
- return !list_empty(&rt_se->run_list);
+ return true;
}
+#endif /* !CONFIG_UCLAMP_TASK */
#ifdef CONFIG_RT_GROUP_SCHED
static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
{
- if (!rt_rq->tg)
- return RUNTIME_INF;
-
return rt_rq->rt_runtime;
}
@@ -445,6 +492,11 @@ typedef struct task_group *rt_rq_iter_t;
static inline struct task_group *next_task_group(struct task_group *tg)
{
+ if (!rt_group_sched_enabled()) {
+ WARN_ON(tg != &root_task_group);
+ return NULL;
+ }
+
do {
tg = list_entry_rcu(tg->list.next,
typeof(struct task_group), list);
@@ -457,9 +509,9 @@ static inline struct task_group *next_task_group(struct task_group *tg)
}
#define for_each_rt_rq(rt_rq, iter, rq) \
- for (iter = container_of(&task_groups, typeof(*iter), list); \
- (iter = next_task_group(iter)) && \
- (rt_rq = iter->rt_rq[cpu_of(rq)]);)
+ for (iter = &root_task_group; \
+ iter && (rt_rq = iter->rt_rq[cpu_of(rq)]); \
+ iter = next_task_group(iter))
#define for_each_sched_rt_entity(rt_se) \
for (; rt_se; rt_se = rt_se->parent)
@@ -469,12 +521,12 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
return rt_se->my_q;
}
-static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head);
-static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
+static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
+static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
- struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
+ struct task_struct *donor = rq_of_rt_rq(rt_rq)->donor;
struct rq *rq = rq_of_rt_rq(rt_rq);
struct sched_rt_entity *rt_se;
@@ -486,9 +538,9 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
if (!rt_se)
enqueue_top_rt_rq(rt_rq);
else if (!on_rt_rq(rt_se))
- enqueue_rt_entity(rt_se, false);
+ enqueue_rt_entity(rt_se, 0);
- if (rt_rq->highest_prio.curr < curr->prio)
+ if (rt_rq->highest_prio.curr < donor->prio)
resched_curr(rq);
}
}
@@ -500,10 +552,13 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
rt_se = rt_rq->tg->rt_se[cpu];
- if (!rt_se)
- dequeue_top_rt_rq(rt_rq);
+ if (!rt_se) {
+ dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running);
+ /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
+ cpufreq_update_util(rq_of_rt_rq(rt_rq), 0);
+ }
else if (on_rt_rq(rt_se))
- dequeue_rt_entity(rt_se);
+ dequeue_rt_entity(rt_se, 0);
}
static inline int rt_rq_throttled(struct rt_rq *rt_rq)
@@ -523,17 +578,10 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se)
return p->prio != p->normal_prio;
}
-#ifdef CONFIG_SMP
static inline const struct cpumask *sched_rt_period_mask(void)
{
return this_rq()->rd->span;
}
-#else
-static inline const struct cpumask *sched_rt_period_mask(void)
-{
- return cpu_online_mask;
-}
-#endif
static inline
struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
@@ -546,70 +594,6 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
return &rt_rq->tg->rt_bandwidth;
}
-#else /* !CONFIG_RT_GROUP_SCHED */
-
-static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
-{
- return rt_rq->rt_runtime;
-}
-
-static inline u64 sched_rt_period(struct rt_rq *rt_rq)
-{
- return ktime_to_ns(def_rt_bandwidth.rt_period);
-}
-
-typedef struct rt_rq *rt_rq_iter_t;
-
-#define for_each_rt_rq(rt_rq, iter, rq) \
- for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
-
-#define for_each_sched_rt_entity(rt_se) \
- for (; rt_se; rt_se = NULL)
-
-static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
-{
- return NULL;
-}
-
-static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
-{
- struct rq *rq = rq_of_rt_rq(rt_rq);
-
- if (!rt_rq->rt_nr_running)
- return;
-
- enqueue_top_rt_rq(rt_rq);
- resched_curr(rq);
-}
-
-static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
-{
- dequeue_top_rt_rq(rt_rq);
-}
-
-static inline int rt_rq_throttled(struct rt_rq *rt_rq)
-{
- return rt_rq->rt_throttled;
-}
-
-static inline const struct cpumask *sched_rt_period_mask(void)
-{
- return cpu_online_mask;
-}
-
-static inline
-struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
-{
- return &cpu_rq(cpu)->rt;
-}
-
-static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
-{
- return &def_rt_bandwidth;
-}
-
-#endif /* CONFIG_RT_GROUP_SCHED */
-
bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
{
struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
@@ -618,15 +602,14 @@ bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
rt_rq->rt_time < rt_b->rt_runtime);
}
-#ifdef CONFIG_SMP
/*
* We ran out of runtime, see if we can borrow some from our neighbours.
*/
-static int do_balance_runtime(struct rt_rq *rt_rq)
+static void do_balance_runtime(struct rt_rq *rt_rq)
{
struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
- int i, weight, more = 0;
+ int i, weight;
u64 rt_period;
weight = cpumask_weight(rd->span);
@@ -644,7 +627,7 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
/*
* Either all rqs have inf runtime and there's nothing to steal
* or __disable_runtime() below sets a specific rq to inf to
- * indicate its been disabled and disalow stealing.
+ * indicate its been disabled and disallow stealing.
*/
if (iter->rt_runtime == RUNTIME_INF)
goto next;
@@ -660,7 +643,6 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
diff = rt_period - rt_rq->rt_runtime;
iter->rt_runtime -= diff;
rt_rq->rt_runtime += diff;
- more = 1;
if (rt_rq->rt_runtime == rt_period) {
raw_spin_unlock(&iter->rt_runtime_lock);
break;
@@ -670,8 +652,6 @@ next:
raw_spin_unlock(&iter->rt_runtime_lock);
}
raw_spin_unlock(&rt_b->rt_runtime_lock);
-
- return more;
}
/*
@@ -743,7 +723,7 @@ static void __disable_runtime(struct rq *rq)
* We cannot be left wanting - that would mean some runtime
* leaked out of the system.
*/
- BUG_ON(want);
+ WARN_ON_ONCE(want);
balanced:
/*
* Disable all the borrow logic by pretending we have inf
@@ -783,27 +763,17 @@ static void __enable_runtime(struct rq *rq)
}
}
-static int balance_runtime(struct rt_rq *rt_rq)
+static void balance_runtime(struct rt_rq *rt_rq)
{
- int more = 0;
-
if (!sched_feat(RT_RUNTIME_SHARE))
- return more;
+ return;
if (rt_rq->rt_time > rt_rq->rt_runtime) {
raw_spin_unlock(&rt_rq->rt_runtime_lock);
- more = do_balance_runtime(rt_rq);
+ do_balance_runtime(rt_rq);
raw_spin_lock(&rt_rq->rt_runtime_lock);
}
-
- return more;
}
-#else /* !CONFIG_SMP */
-static inline int balance_runtime(struct rt_rq *rt_rq)
-{
- return 0;
-}
-#endif /* CONFIG_SMP */
static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
{
@@ -811,7 +781,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
const struct cpumask *span;
span = sched_rt_period_mask();
-#ifdef CONFIG_RT_GROUP_SCHED
+
/*
* FIXME: isolated CPUs should really leave the root task group,
* whether they are isolcpus or were isolated via cpusets, lest
@@ -823,13 +793,29 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
*/
if (rt_b == &root_task_group.rt_bandwidth)
span = cpu_online_mask;
-#endif
+
for_each_cpu(i, span) {
int enqueue = 0;
struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
struct rq *rq = rq_of_rt_rq(rt_rq);
+ struct rq_flags rf;
+ int skip;
+
+ /*
+ * When span == cpu_online_mask, taking each rq->lock
+ * can be time-consuming. Try to avoid it when possible.
+ */
+ raw_spin_lock(&rt_rq->rt_runtime_lock);
+ if (!sched_feat(RT_RUNTIME_SHARE) && rt_rq->rt_runtime != RUNTIME_INF)
+ rt_rq->rt_runtime = rt_b->rt_runtime;
+ skip = !rt_rq->rt_time && !rt_rq->rt_nr_running;
+ raw_spin_unlock(&rt_rq->rt_runtime_lock);
+ if (skip)
+ continue;
+
+ rq_lock(rq, &rf);
+ update_rq_clock(rq);
- raw_spin_lock(&rq->lock);
if (rt_rq->rt_time) {
u64 runtime;
@@ -844,13 +830,13 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
/*
* When we're idle and a woken (rt) task is
- * throttled check_preempt_curr() will set
+ * throttled wakeup_preempt() will set
* skip_update and the time between the wakeup
* and this unthrottle will get accounted as
* 'runtime'.
*/
if (rt_rq->rt_nr_running && rq->curr == rq->idle)
- rq_clock_skip_update(rq, false);
+ rq_clock_cancel_skipupdate(rq);
}
if (rt_rq->rt_time || rt_rq->rt_nr_running)
idle = 0;
@@ -865,7 +851,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
if (enqueue)
sched_rt_rq_enqueue(rt_rq);
- raw_spin_unlock(&rq->lock);
+ rq_unlock(rq, &rf);
}
if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
@@ -874,18 +860,6 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
return idle;
}
-static inline int rt_se_prio(struct sched_rt_entity *rt_se)
-{
-#ifdef CONFIG_RT_GROUP_SCHED
- struct rt_rq *rt_rq = group_rt_rq(rt_se);
-
- if (rt_rq)
- return rt_rq->highest_prio.curr;
-#endif
-
- return rt_task_of(rt_se)->prio;
-}
-
static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
{
u64 runtime = sched_rt_runtime(rt_rq);
@@ -929,52 +903,112 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
return 0;
}
+#else /* !CONFIG_RT_GROUP_SCHED: */
+
+typedef struct rt_rq *rt_rq_iter_t;
+
+#define for_each_rt_rq(rt_rq, iter, rq) \
+ for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
+
+#define for_each_sched_rt_entity(rt_se) \
+ for (; rt_se; rt_se = NULL)
+
+static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
+{
+ return NULL;
+}
+
+static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
+{
+ struct rq *rq = rq_of_rt_rq(rt_rq);
+
+ if (!rt_rq->rt_nr_running)
+ return;
+
+ enqueue_top_rt_rq(rt_rq);
+ resched_curr(rq);
+}
+
+static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
+{
+ dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running);
+}
+
+static inline int rt_rq_throttled(struct rt_rq *rt_rq)
+{
+ return false;
+}
+
+static inline const struct cpumask *sched_rt_period_mask(void)
+{
+ return cpu_online_mask;
+}
+
+static inline
+struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
+{
+ return &cpu_rq(cpu)->rt;
+}
+
+static void __enable_runtime(struct rq *rq) { }
+static void __disable_runtime(struct rq *rq) { }
+
+#endif /* !CONFIG_RT_GROUP_SCHED */
+
+static inline int rt_se_prio(struct sched_rt_entity *rt_se)
+{
+#ifdef CONFIG_RT_GROUP_SCHED
+ struct rt_rq *rt_rq = group_rt_rq(rt_se);
+
+ if (rt_rq)
+ return rt_rq->highest_prio.curr;
+#endif
+
+ return rt_task_of(rt_se)->prio;
+}
+
/*
* Update the current task's runtime statistics. Skip current tasks that
* are not in our scheduling class.
*/
static void update_curr_rt(struct rq *rq)
{
- struct task_struct *curr = rq->curr;
- struct sched_rt_entity *rt_se = &curr->rt;
- u64 delta_exec;
+ struct task_struct *donor = rq->donor;
+ s64 delta_exec;
- if (curr->sched_class != &rt_sched_class)
+ if (donor->sched_class != &rt_sched_class)
return;
- delta_exec = rq_clock_task(rq) - curr->se.exec_start;
- if (unlikely((s64)delta_exec <= 0))
+ delta_exec = update_curr_common(rq);
+ if (unlikely(delta_exec <= 0))
return;
- schedstat_set(curr->se.statistics.exec_max,
- max(curr->se.statistics.exec_max, delta_exec));
-
- curr->se.sum_exec_runtime += delta_exec;
- account_group_exec_runtime(curr, delta_exec);
-
- curr->se.exec_start = rq_clock_task(rq);
- cpuacct_charge(curr, delta_exec);
-
- sched_rt_avg_update(rq, delta_exec);
+#ifdef CONFIG_RT_GROUP_SCHED
+ struct sched_rt_entity *rt_se = &donor->rt;
if (!rt_bandwidth_enabled())
return;
for_each_sched_rt_entity(rt_se) {
struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+ int exceeded;
if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
raw_spin_lock(&rt_rq->rt_runtime_lock);
rt_rq->rt_time += delta_exec;
- if (sched_rt_runtime_exceeded(rt_rq))
+ exceeded = sched_rt_runtime_exceeded(rt_rq);
+ if (exceeded)
resched_curr(rq);
raw_spin_unlock(&rt_rq->rt_runtime_lock);
+ if (exceeded)
+ do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq));
}
}
+#endif /* CONFIG_RT_GROUP_SCHED */
}
static void
-dequeue_top_rt_rq(struct rt_rq *rt_rq)
+dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count)
{
struct rq *rq = rq_of_rt_rq(rt_rq);
@@ -985,8 +1019,9 @@ dequeue_top_rt_rq(struct rt_rq *rt_rq)
BUG_ON(!rq->nr_running);
- sub_nr_running(rq, rt_rq->rt_nr_running);
+ sub_nr_running(rq, count);
rt_rq->rt_queued = 0;
+
}
static void
@@ -998,27 +1033,30 @@ enqueue_top_rt_rq(struct rt_rq *rt_rq)
if (rt_rq->rt_queued)
return;
- if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running)
+
+ if (rt_rq_throttled(rt_rq))
return;
- add_nr_running(rq, rt_rq->rt_nr_running);
- rt_rq->rt_queued = 1;
-}
+ if (rt_rq->rt_nr_running) {
+ add_nr_running(rq, rt_rq->rt_nr_running);
+ rt_rq->rt_queued = 1;
+ }
-#if defined CONFIG_SMP
+ /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
+ cpufreq_update_util(rq, 0);
+}
static void
inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{
struct rq *rq = rq_of_rt_rq(rt_rq);
-#ifdef CONFIG_RT_GROUP_SCHED
/*
* Change rq's cpupri only if rt_rq is the top queue.
*/
- if (&rq->rt != rt_rq)
+ if (IS_ENABLED(CONFIG_RT_GROUP_SCHED) && &rq->rt != rt_rq)
return;
-#endif
+
if (rq->online && prio < prev_prio)
cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
}
@@ -1028,27 +1066,16 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{
struct rq *rq = rq_of_rt_rq(rt_rq);
-#ifdef CONFIG_RT_GROUP_SCHED
/*
* Change rq's cpupri only if rt_rq is the top queue.
*/
- if (&rq->rt != rt_rq)
+ if (IS_ENABLED(CONFIG_RT_GROUP_SCHED) && &rq->rt != rt_rq)
return;
-#endif
+
if (rq->online && rt_rq->highest_prio.curr != prev_prio)
cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
}
-#else /* CONFIG_SMP */
-
-static inline
-void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
-static inline
-void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
-
-#endif /* CONFIG_SMP */
-
-#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
static void
inc_rt_prio(struct rt_rq *rt_rq, int prio)
{
@@ -1071,7 +1098,7 @@ dec_rt_prio(struct rt_rq *rt_rq, int prio)
/*
* This may have been our highest task, and therefore
- * we may have some recomputation to do
+ * we may have some re-computation to do
*/
if (prio == prev_prio) {
struct rt_prio_array *array = &rt_rq->active;
@@ -1080,19 +1107,13 @@ dec_rt_prio(struct rt_rq *rt_rq, int prio)
sched_find_first_bit(array->bitmap);
}
- } else
- rt_rq->highest_prio.curr = MAX_RT_PRIO;
+ } else {
+ rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
+ }
dec_rt_prio_smp(rt_rq, prio, prev_prio);
}
-#else
-
-static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
-static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
-
-#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
-
#ifdef CONFIG_RT_GROUP_SCHED
static void
@@ -1101,8 +1122,7 @@ inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
if (rt_se_boosted(rt_se))
rt_rq->rt_nr_boosted++;
- if (rt_rq->tg)
- start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
+ start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
}
static void
@@ -1114,18 +1134,17 @@ dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
}
-#else /* CONFIG_RT_GROUP_SCHED */
+#else /* !CONFIG_RT_GROUP_SCHED: */
static void
inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
- start_rt_bandwidth(&def_rt_bandwidth);
}
static inline
void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
-#endif /* CONFIG_RT_GROUP_SCHED */
+#endif /* !CONFIG_RT_GROUP_SCHED */
static inline
unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
@@ -1139,15 +1158,29 @@ unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
}
static inline
+unsigned int rt_se_rr_nr_running(struct sched_rt_entity *rt_se)
+{
+ struct rt_rq *group_rq = group_rt_rq(rt_se);
+ struct task_struct *tsk;
+
+ if (group_rq)
+ return group_rq->rr_nr_running;
+
+ tsk = rt_task_of(rt_se);
+
+ return (tsk->policy == SCHED_RR) ? 1 : 0;
+}
+
+static inline
void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
int prio = rt_se_prio(rt_se);
WARN_ON(!rt_prio(prio));
rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
+ rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);
inc_rt_prio(rt_rq, prio);
- inc_rt_migration(rt_se, rt_rq);
inc_rt_group(rt_se, rt_rq);
}
@@ -1157,13 +1190,140 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
WARN_ON(!rt_prio(rt_se_prio(rt_se)));
WARN_ON(!rt_rq->rt_nr_running);
rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
+ rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);
dec_rt_prio(rt_rq, rt_se_prio(rt_se));
- dec_rt_migration(rt_se, rt_rq);
dec_rt_group(rt_se, rt_rq);
}
-static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
+/*
+ * Change rt_se->run_list location unless SAVE && !MOVE
+ *
+ * assumes ENQUEUE/DEQUEUE flags match
+ */
+static inline bool move_entity(unsigned int flags)
+{
+ if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
+ return false;
+
+ return true;
+}
+
+static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array)
+{
+ list_del_init(&rt_se->run_list);
+
+ if (list_empty(array->queue + rt_se_prio(rt_se)))
+ __clear_bit(rt_se_prio(rt_se), array->bitmap);
+
+ rt_se->on_list = 0;
+}
+
+static inline struct sched_statistics *
+__schedstats_from_rt_se(struct sched_rt_entity *rt_se)
+{
+ /* schedstats is not supported for rt group. */
+ if (!rt_entity_is_task(rt_se))
+ return NULL;
+
+ return &rt_task_of(rt_se)->stats;
+}
+
+static inline void
+update_stats_wait_start_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
+{
+ struct sched_statistics *stats;
+ struct task_struct *p = NULL;
+
+ if (!schedstat_enabled())
+ return;
+
+ if (rt_entity_is_task(rt_se))
+ p = rt_task_of(rt_se);
+
+ stats = __schedstats_from_rt_se(rt_se);
+ if (!stats)
+ return;
+
+ __update_stats_wait_start(rq_of_rt_rq(rt_rq), p, stats);
+}
+
+static inline void
+update_stats_enqueue_sleeper_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
+{
+ struct sched_statistics *stats;
+ struct task_struct *p = NULL;
+
+ if (!schedstat_enabled())
+ return;
+
+ if (rt_entity_is_task(rt_se))
+ p = rt_task_of(rt_se);
+
+ stats = __schedstats_from_rt_se(rt_se);
+ if (!stats)
+ return;
+
+ __update_stats_enqueue_sleeper(rq_of_rt_rq(rt_rq), p, stats);
+}
+
+static inline void
+update_stats_enqueue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
+ int flags)
+{
+ if (!schedstat_enabled())
+ return;
+
+ if (flags & ENQUEUE_WAKEUP)
+ update_stats_enqueue_sleeper_rt(rt_rq, rt_se);
+}
+
+static inline void
+update_stats_wait_end_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
+{
+ struct sched_statistics *stats;
+ struct task_struct *p = NULL;
+
+ if (!schedstat_enabled())
+ return;
+
+ if (rt_entity_is_task(rt_se))
+ p = rt_task_of(rt_se);
+
+ stats = __schedstats_from_rt_se(rt_se);
+ if (!stats)
+ return;
+
+ __update_stats_wait_end(rq_of_rt_rq(rt_rq), p, stats);
+}
+
+static inline void
+update_stats_dequeue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
+ int flags)
+{
+ struct task_struct *p = NULL;
+
+ if (!schedstat_enabled())
+ return;
+
+ if (rt_entity_is_task(rt_se))
+ p = rt_task_of(rt_se);
+
+ if ((flags & DEQUEUE_SLEEP) && p) {
+ unsigned int state;
+
+ state = READ_ONCE(p->__state);
+ if (state & TASK_INTERRUPTIBLE)
+ __schedstat_set(p->stats.sleep_start,
+ rq_clock(rq_of_rt_rq(rt_rq)));
+
+ if (state & TASK_UNINTERRUPTIBLE)
+ __schedstat_set(p->stats.block_start,
+ rq_clock(rq_of_rt_rq(rt_rq)));
+ }
+}
+
+static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
struct rt_prio_array *array = &rt_rq->active;
@@ -1176,26 +1336,37 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
* get throttled and the current group doesn't have any other
* active members.
*/
- if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
+ if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) {
+ if (rt_se->on_list)
+ __delist_rt_entity(rt_se, array);
return;
+ }
- if (head)
- list_add(&rt_se->run_list, queue);
- else
- list_add_tail(&rt_se->run_list, queue);
- __set_bit(rt_se_prio(rt_se), array->bitmap);
+ if (move_entity(flags)) {
+ WARN_ON_ONCE(rt_se->on_list);
+ if (flags & ENQUEUE_HEAD)
+ list_add(&rt_se->run_list, queue);
+ else
+ list_add_tail(&rt_se->run_list, queue);
+
+ __set_bit(rt_se_prio(rt_se), array->bitmap);
+ rt_se->on_list = 1;
+ }
+ rt_se->on_rq = 1;
inc_rt_tasks(rt_se, rt_rq);
}
-static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
+static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
struct rt_prio_array *array = &rt_rq->active;
- list_del_init(&rt_se->run_list);
- if (list_empty(array->queue + rt_se_prio(rt_se)))
- __clear_bit(rt_se_prio(rt_se), array->bitmap);
+ if (move_entity(flags)) {
+ WARN_ON_ONCE(!rt_se->on_list);
+ __delist_rt_entity(rt_se, array);
+ }
+ rt_se->on_rq = 0;
dec_rt_tasks(rt_se, rt_rq);
}
@@ -1204,44 +1375,51 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
* Because the prio of an upper entry depends on the lower
* entries, we must remove entries top - down.
*/
-static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
+static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
{
struct sched_rt_entity *back = NULL;
+ unsigned int rt_nr_running;
for_each_sched_rt_entity(rt_se) {
rt_se->back = back;
back = rt_se;
}
- dequeue_top_rt_rq(rt_rq_of_se(back));
+ rt_nr_running = rt_rq_of_se(back)->rt_nr_running;
for (rt_se = back; rt_se; rt_se = rt_se->back) {
if (on_rt_rq(rt_se))
- __dequeue_rt_entity(rt_se);
+ __dequeue_rt_entity(rt_se, flags);
}
+
+ dequeue_top_rt_rq(rt_rq_of_se(back), rt_nr_running);
}
-static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
+static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
struct rq *rq = rq_of_rt_se(rt_se);
- dequeue_rt_stack(rt_se);
+ update_stats_enqueue_rt(rt_rq_of_se(rt_se), rt_se, flags);
+
+ dequeue_rt_stack(rt_se, flags);
for_each_sched_rt_entity(rt_se)
- __enqueue_rt_entity(rt_se, head);
+ __enqueue_rt_entity(rt_se, flags);
enqueue_top_rt_rq(&rq->rt);
}
-static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
+static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
struct rq *rq = rq_of_rt_se(rt_se);
- dequeue_rt_stack(rt_se);
+ update_stats_dequeue_rt(rt_rq_of_se(rt_se), rt_se, flags);
+
+ dequeue_rt_stack(rt_se, flags);
for_each_sched_rt_entity(rt_se) {
struct rt_rq *rt_rq = group_rt_rq(rt_se);
if (rt_rq && rt_rq->rt_nr_running)
- __enqueue_rt_entity(rt_se, false);
+ __enqueue_rt_entity(rt_se, flags);
}
enqueue_top_rt_rq(&rq->rt);
}
@@ -1257,20 +1435,28 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
if (flags & ENQUEUE_WAKEUP)
rt_se->timeout = 0;
- enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
+ check_schedstat_required();
+ update_stats_wait_start_rt(rt_rq_of_se(rt_se), rt_se);
+
+ enqueue_rt_entity(rt_se, flags);
+
+ if (task_is_blocked(p))
+ return;
if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
}
-static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
+static bool dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
{
struct sched_rt_entity *rt_se = &p->rt;
update_curr_rt(rq);
- dequeue_rt_entity(rt_se);
+ dequeue_rt_entity(rt_se, flags);
dequeue_pushable_task(rq, p);
+
+ return true;
}
/*
@@ -1304,26 +1490,27 @@ static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
static void yield_task_rt(struct rq *rq)
{
- requeue_task_rt(rq, rq->curr, 0);
+ requeue_task_rt(rq, rq->donor, 0);
}
-#ifdef CONFIG_SMP
static int find_lowest_rq(struct task_struct *task);
static int
-select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
+select_task_rq_rt(struct task_struct *p, int cpu, int flags)
{
- struct task_struct *curr;
+ struct task_struct *curr, *donor;
struct rq *rq;
+ bool test;
/* For anything but wake ups, just return the task_cpu */
- if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
+ if (!(flags & (WF_TTWU | WF_FORK)))
goto out;
rq = cpu_rq(cpu);
rcu_read_lock();
- curr = ACCESS_ONCE(rq->curr); /* unlocked access */
+ curr = READ_ONCE(rq->curr); /* unlocked access */
+ donor = READ_ONCE(rq->donor);
/*
* If the current task on @p's runqueue is an RT task, then
@@ -1341,18 +1528,31 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
*
* For equal prio tasks, we just let the scheduler sort it out.
*
- * Otherwise, just let it ride on the affined RQ and the
+ * Otherwise, just let it ride on the affine RQ and the
* post-schedule router will push the preempted task away
*
* This test is optimistic, if we get it wrong the load-balancer
* will have to sort it out.
+ *
+ * We take into account the capacity of the CPU to ensure it fits the
+ * requirement of the task - which is only important on heterogeneous
+ * systems like big.LITTLE.
*/
- if (curr && unlikely(rt_task(curr)) &&
- (curr->nr_cpus_allowed < 2 ||
- curr->prio <= p->prio)) {
+ test = curr &&
+ unlikely(rt_task(donor)) &&
+ (curr->nr_cpus_allowed < 2 || donor->prio <= p->prio);
+
+ if (test || !rt_task_fits_capacity(p, cpu)) {
int target = find_lowest_rq(p);
/*
+ * Bail out if we were forcing a migration to find a better
+ * fitting CPU but our search failed.
+ */
+ if (!test && target != -1 && !rt_task_fits_capacity(p, target))
+ goto out_unlock;
+
+ /*
* Don't bother moving it if the destination CPU is
* not running a lower priority task.
*/
@@ -1360,6 +1560,8 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
p->prio < cpu_rq(target)->rt.highest_prio.curr)
cpu = target;
}
+
+out_unlock:
rcu_read_unlock();
out:
@@ -1368,44 +1570,56 @@ out:
static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
{
- /*
- * Current can't be migrated, useless to reschedule,
- * let's hope p can move out.
- */
if (rq->curr->nr_cpus_allowed == 1 ||
- !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
+ !cpupri_find(&rq->rd->cpupri, rq->donor, NULL))
return;
/*
* p is migratable, so let's not schedule it and
* see if it is pushed or pulled somewhere else.
*/
- if (p->nr_cpus_allowed != 1
- && cpupri_find(&rq->rd->cpupri, p, NULL))
+ if (p->nr_cpus_allowed != 1 &&
+ cpupri_find(&rq->rd->cpupri, p, NULL))
return;
/*
- * There appears to be other cpus that can accept
- * current and none to run 'p', so lets reschedule
- * to try and push current away:
+ * There appear to be other CPUs that can accept
+ * the current task but none can run 'p', so lets reschedule
+ * to try and push the current task away:
*/
requeue_task_rt(rq, p, 1);
resched_curr(rq);
}
-#endif /* CONFIG_SMP */
+static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
+{
+ if (!on_rt_rq(&p->rt) && need_pull_rt_task(rq, p)) {
+ /*
+ * This is OK, because current is on_cpu, which avoids it being
+ * picked for load-balance and preemption/IRQs are still
+ * disabled avoiding further scheduler activity on it and we've
+ * not yet started the picking loop.
+ */
+ rq_unpin_lock(rq, rf);
+ pull_rt_task(rq);
+ rq_repin_lock(rq, rf);
+ }
+
+ return sched_stop_runnable(rq) || sched_dl_runnable(rq) || sched_rt_runnable(rq);
+}
/*
* Preempt the current task with a newly woken task if needed:
*/
-static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
+static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags)
{
- if (p->prio < rq->curr->prio) {
+ struct task_struct *donor = rq->donor;
+
+ if (p->prio < donor->prio) {
resched_curr(rq);
return;
}
-#ifdef CONFIG_SMP
/*
* If:
*
@@ -1418,13 +1632,37 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag
* to move current somewhere else, making room for our non-migratable
* task.
*/
- if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr))
+ if (p->prio == donor->prio && !test_tsk_need_resched(rq->curr))
check_preempt_equal_prio(rq, p);
-#endif
}
-static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
- struct rt_rq *rt_rq)
+static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool first)
+{
+ struct sched_rt_entity *rt_se = &p->rt;
+ struct rt_rq *rt_rq = &rq->rt;
+
+ p->se.exec_start = rq_clock_task(rq);
+ if (on_rt_rq(&p->rt))
+ update_stats_wait_end_rt(rt_rq, rt_se);
+
+ /* The running task is never eligible for pushing */
+ dequeue_pushable_task(rq, p);
+
+ if (!first)
+ return;
+
+ /*
+ * If prev task was rt, put_prev_task() has already updated the
+ * utilization. We only care of the case where we start to schedule a
+ * rt task
+ */
+ if (rq->donor->sched_class != &rt_sched_class)
+ update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
+
+ rt_queue_push_tasks(rq);
+}
+
+static struct sched_rt_entity *pick_next_rt_entity(struct rt_rq *rt_rq)
{
struct rt_prio_array *array = &rt_rq->active;
struct sched_rt_entity *next = NULL;
@@ -1435,6 +1673,8 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
BUG_ON(idx >= MAX_RT_PRIO);
queue = array->queue + idx;
+ if (WARN_ON_ONCE(list_empty(queue)))
+ return NULL;
next = list_entry(queue->next, struct sched_rt_entity, run_list);
return next;
@@ -1443,65 +1683,44 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
static struct task_struct *_pick_next_task_rt(struct rq *rq)
{
struct sched_rt_entity *rt_se;
- struct task_struct *p;
struct rt_rq *rt_rq = &rq->rt;
do {
- rt_se = pick_next_rt_entity(rq, rt_rq);
- BUG_ON(!rt_se);
+ rt_se = pick_next_rt_entity(rt_rq);
+ if (unlikely(!rt_se))
+ return NULL;
rt_rq = group_rt_rq(rt_se);
} while (rt_rq);
- p = rt_task_of(rt_se);
- p->se.exec_start = rq_clock_task(rq);
-
- return p;
+ return rt_task_of(rt_se);
}
-static struct task_struct *
-pick_next_task_rt(struct rq *rq, struct task_struct *prev)
+static struct task_struct *pick_task_rt(struct rq *rq, struct rq_flags *rf)
{
struct task_struct *p;
- struct rt_rq *rt_rq = &rq->rt;
-
- if (need_pull_rt_task(rq, prev)) {
- pull_rt_task(rq);
- /*
- * pull_rt_task() can drop (and re-acquire) rq->lock; this
- * means a dl or stop task can slip in, in which case we need
- * to re-start task selection.
- */
- if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) ||
- rq->dl.dl_nr_running))
- return RETRY_TASK;
- }
- /*
- * We may dequeue prev's rt_rq in put_prev_task().
- * So, we update time before rt_nr_running check.
- */
- if (prev->sched_class == &rt_sched_class)
- update_curr_rt(rq);
-
- if (!rt_rq->rt_queued)
+ if (!sched_rt_runnable(rq))
return NULL;
- put_prev_task(rq, prev);
-
p = _pick_next_task_rt(rq);
- /* The running task is never eligible for pushing */
- dequeue_pushable_task(rq, p);
-
- set_post_schedule(rq);
-
return p;
}
-static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
+static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct task_struct *next)
{
+ struct sched_rt_entity *rt_se = &p->rt;
+ struct rt_rq *rt_rq = &rq->rt;
+
+ if (on_rt_rq(&p->rt))
+ update_stats_wait_start_rt(rt_rq, rt_se);
+
update_curr_rt(rq);
+ update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);
+
+ if (task_is_blocked(p))
+ return;
/*
* The previous task needs to be made eligible for pushing
* if it is still active
@@ -1510,22 +1729,12 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
enqueue_pushable_task(rq, p);
}
-#ifdef CONFIG_SMP
-
/* Only try algorithms three times */
#define RT_MAX_TRIES 3
-static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
-{
- if (!task_running(rq, p) &&
- cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
- return 1;
- return 0;
-}
-
/*
* Return the highest pushable rq's task, which is suitable to be executed
- * on the cpu, NULL otherwise
+ * on the CPU, NULL otherwise
*/
static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
{
@@ -1536,7 +1745,7 @@ static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
return NULL;
plist_for_each_entry(p, head, pushable_tasks) {
- if (pick_rt_task(rq, p, cpu))
+ if (task_is_pushable(rq, p, cpu))
return p;
}
@@ -1551,6 +1760,7 @@ static int find_lowest_rq(struct task_struct *task)
struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
int this_cpu = smp_processor_id();
int cpu = task_cpu(task);
+ int ret;
/* Make sure the mask is initialized first */
if (unlikely(!lowest_mask))
@@ -1559,15 +1769,30 @@ static int find_lowest_rq(struct task_struct *task)
if (task->nr_cpus_allowed == 1)
return -1; /* No other targets possible */
- if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
+ /*
+ * If we're on asym system ensure we consider the different capacities
+ * of the CPUs when searching for the lowest_mask.
+ */
+ if (sched_asym_cpucap_active()) {
+
+ ret = cpupri_find_fitness(&task_rq(task)->rd->cpupri,
+ task, lowest_mask,
+ rt_task_fits_capacity);
+ } else {
+
+ ret = cpupri_find(&task_rq(task)->rd->cpupri,
+ task, lowest_mask);
+ }
+
+ if (!ret)
return -1; /* No targets found */
/*
- * At this point we have built a mask of cpus representing the
+ * At this point we have built a mask of CPUs representing the
* lowest priority tasks in the system. Now we want to elect
* the best one based on our affinity and topology.
*
- * We prioritize the last cpu that the task executed on since
+ * We prioritize the last CPU that the task executed on since
* it is most likely cache-hot in that location.
*/
if (cpumask_test_cpu(cpu, lowest_mask))
@@ -1575,7 +1800,7 @@ static int find_lowest_rq(struct task_struct *task)
/*
* Otherwise, we consult the sched_domains span maps to figure
- * out which cpu is logically closest to our hot cache data.
+ * out which CPU is logically closest to our hot cache data.
*/
if (!cpumask_test_cpu(this_cpu, lowest_mask))
this_cpu = -1; /* Skip this_cpu opt if not among lowest */
@@ -1595,8 +1820,8 @@ static int find_lowest_rq(struct task_struct *task)
return this_cpu;
}
- best_cpu = cpumask_first_and(lowest_mask,
- sched_domain_span(sd));
+ best_cpu = cpumask_any_and_distribute(lowest_mask,
+ sched_domain_span(sd));
if (best_cpu < nr_cpu_ids) {
rcu_read_unlock();
return best_cpu;
@@ -1613,12 +1838,34 @@ static int find_lowest_rq(struct task_struct *task)
if (this_cpu != -1)
return this_cpu;
- cpu = cpumask_any(lowest_mask);
+ cpu = cpumask_any_distribute(lowest_mask);
if (cpu < nr_cpu_ids)
return cpu;
+
return -1;
}
+static struct task_struct *pick_next_pushable_task(struct rq *rq)
+{
+ struct task_struct *p;
+
+ if (!has_pushable_tasks(rq))
+ return NULL;
+
+ p = plist_first_entry(&rq->rt.pushable_tasks,
+ struct task_struct, pushable_tasks);
+
+ BUG_ON(rq->cpu != task_cpu(p));
+ BUG_ON(task_current(rq, p));
+ BUG_ON(task_current_donor(rq, p));
+ BUG_ON(p->nr_cpus_allowed <= 1);
+
+ BUG_ON(!task_on_rq_queued(p));
+ BUG_ON(!rt_task(p));
+
+ return p;
+}
+
/* Will lock the rq it finds */
static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
{
@@ -1649,14 +1896,16 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
/*
* We had to unlock the run queue. In
* the mean time, task could have
- * migrated already or had its affinity changed.
- * Also make sure that it wasn't scheduled on its rq.
+ * migrated already or had its affinity changed,
+ * therefore check if the task is still at the
+ * head of the pushable tasks list.
+ * It is possible the task was scheduled, set
+ * "migrate_disabled" and then got preempted, so we must
+ * check the task migration disable flag here too.
*/
- if (unlikely(task_rq(task) != rq ||
- !cpumask_test_cpu(lowest_rq->cpu,
- tsk_cpus_allowed(task)) ||
- task_running(rq, task) ||
- !task_on_rq_queued(task))) {
+ if (unlikely(is_migration_disabled(task) ||
+ !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) ||
+ task != pick_next_pushable_task(rq))) {
double_unlock_balance(rq, lowest_rq);
lowest_rq = NULL;
@@ -1676,32 +1925,12 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
return lowest_rq;
}
-static struct task_struct *pick_next_pushable_task(struct rq *rq)
-{
- struct task_struct *p;
-
- if (!has_pushable_tasks(rq))
- return NULL;
-
- p = plist_first_entry(&rq->rt.pushable_tasks,
- struct task_struct, pushable_tasks);
-
- BUG_ON(rq->cpu != task_cpu(p));
- BUG_ON(task_current(rq, p));
- BUG_ON(p->nr_cpus_allowed <= 1);
-
- BUG_ON(!task_on_rq_queued(p));
- BUG_ON(!rt_task(p));
-
- return p;
-}
-
/*
* If the current CPU has more than one RT task, see if the non
* running task can migrate over to a CPU that is running a task
* of lesser priority.
*/
-static int push_rt_task(struct rq *rq)
+static int push_rt_task(struct rq *rq, bool pull)
{
struct task_struct *next_task;
struct rq *lowest_rq;
@@ -1715,21 +1944,61 @@ static int push_rt_task(struct rq *rq)
return 0;
retry:
- if (unlikely(next_task == rq->curr)) {
- WARN_ON(1);
- return 0;
- }
-
/*
* It's possible that the next_task slipped in of
* higher priority than current. If that's the case
* just reschedule current.
*/
- if (unlikely(next_task->prio < rq->curr->prio)) {
+ if (unlikely(next_task->prio < rq->donor->prio)) {
resched_curr(rq);
return 0;
}
+ if (is_migration_disabled(next_task)) {
+ struct task_struct *push_task = NULL;
+ int cpu;
+
+ if (!pull || rq->push_busy)
+ return 0;
+
+ /*
+ * Invoking find_lowest_rq() on anything but an RT task doesn't
+ * make sense. Per the above priority check, curr has to
+ * be of higher priority than next_task, so no need to
+ * reschedule when bailing out.
+ *
+ * Note that the stoppers are masqueraded as SCHED_FIFO
+ * (cf. sched_set_stop_task()), so we can't rely on rt_task().
+ */
+ if (rq->donor->sched_class != &rt_sched_class)
+ return 0;
+
+ cpu = find_lowest_rq(rq->curr);
+ if (cpu == -1 || cpu == rq->cpu)
+ return 0;
+
+ /*
+ * Given we found a CPU with lower priority than @next_task,
+ * therefore it should be running. However we cannot migrate it
+ * to this other CPU, instead attempt to push the current
+ * running task on this CPU away.
+ */
+ push_task = get_push_task(rq);
+ if (push_task) {
+ preempt_disable();
+ raw_spin_rq_unlock(rq);
+ stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
+ push_task, &rq->push_work);
+ preempt_enable();
+ raw_spin_rq_lock(rq);
+ }
+
+ return 0;
+ }
+
+ if (WARN_ON(next_task == rq->curr))
+ return 0;
+
/* We might release rq lock */
get_task_struct(next_task);
@@ -1746,12 +2015,12 @@ retry:
* pushing.
*/
task = pick_next_pushable_task(rq);
- if (task_cpu(next_task) == rq->cpu && task == next_task) {
+ if (task == next_task) {
/*
* The task hasn't migrated, and is still the next
* eligible task, but we failed to find a run-queue
* to push it to. Do not retry in this case, since
- * other cpus will pull from us when ready.
+ * other CPUs will pull from us when ready.
*/
goto out;
}
@@ -1768,15 +2037,11 @@ retry:
goto retry;
}
- deactivate_task(rq, next_task, 0);
- set_task_cpu(next_task, lowest_rq->cpu);
- activate_task(lowest_rq, next_task, 0);
- ret = 1;
-
+ move_queued_task_locked(rq, lowest_rq, next_task);
resched_curr(lowest_rq);
+ ret = 1;
double_unlock_balance(rq, lowest_rq);
-
out:
put_task_struct(next_task);
@@ -1786,176 +2051,191 @@ out:
static void push_rt_tasks(struct rq *rq)
{
/* push_rt_task will return true if it moved an RT */
- while (push_rt_task(rq))
+ while (push_rt_task(rq, false))
;
}
#ifdef HAVE_RT_PUSH_IPI
+
/*
- * The search for the next cpu always starts at rq->cpu and ends
- * when we reach rq->cpu again. It will never return rq->cpu.
- * This returns the next cpu to check, or nr_cpu_ids if the loop
- * is complete.
+ * When a high priority task schedules out from a CPU and a lower priority
+ * task is scheduled in, a check is made to see if there's any RT tasks
+ * on other CPUs that are waiting to run because a higher priority RT task
+ * is currently running on its CPU. In this case, the CPU with multiple RT
+ * tasks queued on it (overloaded) needs to be notified that a CPU has opened
+ * up that may be able to run one of its non-running queued RT tasks.
+ *
+ * All CPUs with overloaded RT tasks need to be notified as there is currently
+ * no way to know which of these CPUs have the highest priority task waiting
+ * to run. Instead of trying to take a spinlock on each of these CPUs,
+ * which has shown to cause large latency when done on machines with many
+ * CPUs, sending an IPI to the CPUs to have them push off the overloaded
+ * RT tasks waiting to run.
+ *
+ * Just sending an IPI to each of the CPUs is also an issue, as on large
+ * count CPU machines, this can cause an IPI storm on a CPU, especially
+ * if its the only CPU with multiple RT tasks queued, and a large number
+ * of CPUs scheduling a lower priority task at the same time.
+ *
+ * Each root domain has its own IRQ work function that can iterate over
+ * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT
+ * task must be checked if there's one or many CPUs that are lowering
+ * their priority, there's a single IRQ work iterator that will try to
+ * push off RT tasks that are waiting to run.
+ *
+ * When a CPU schedules a lower priority task, it will kick off the
+ * IRQ work iterator that will jump to each CPU with overloaded RT tasks.
+ * As it only takes the first CPU that schedules a lower priority task
+ * to start the process, the rto_start variable is incremented and if
+ * the atomic result is one, then that CPU will try to take the rto_lock.
+ * This prevents high contention on the lock as the process handles all
+ * CPUs scheduling lower priority tasks.
+ *
+ * All CPUs that are scheduling a lower priority task will increment the
+ * rt_loop_next variable. This will make sure that the IRQ work iterator
+ * checks all RT overloaded CPUs whenever a CPU schedules a new lower
+ * priority task, even if the iterator is in the middle of a scan. Incrementing
+ * the rt_loop_next will cause the iterator to perform another scan.
*
- * rq->rt.push_cpu holds the last cpu returned by this function,
- * or if this is the first instance, it must hold rq->cpu.
*/
-static int rto_next_cpu(struct rq *rq)
+static int rto_next_cpu(struct root_domain *rd)
{
- int prev_cpu = rq->rt.push_cpu;
+ int next;
int cpu;
- cpu = cpumask_next(prev_cpu, rq->rd->rto_mask);
-
/*
- * If the previous cpu is less than the rq's CPU, then it already
- * passed the end of the mask, and has started from the beginning.
- * We end if the next CPU is greater or equal to rq's CPU.
+ * When starting the IPI RT pushing, the rto_cpu is set to -1,
+ * rt_next_cpu() will simply return the first CPU found in
+ * the rto_mask.
+ *
+ * If rto_next_cpu() is called with rto_cpu is a valid CPU, it
+ * will return the next CPU found in the rto_mask.
+ *
+ * If there are no more CPUs left in the rto_mask, then a check is made
+ * against rto_loop and rto_loop_next. rto_loop is only updated with
+ * the rto_lock held, but any CPU may increment the rto_loop_next
+ * without any locking.
*/
- if (prev_cpu < rq->cpu) {
- if (cpu >= rq->cpu)
- return nr_cpu_ids;
+ for (;;) {
- } else if (cpu >= nr_cpu_ids) {
- /*
- * We passed the end of the mask, start at the beginning.
- * If the result is greater or equal to the rq's CPU, then
- * the loop is finished.
- */
- cpu = cpumask_first(rq->rd->rto_mask);
- if (cpu >= rq->cpu)
- return nr_cpu_ids;
- }
- rq->rt.push_cpu = cpu;
+ /* When rto_cpu is -1 this acts like cpumask_first() */
+ cpu = cpumask_next(rd->rto_cpu, rd->rto_mask);
- /* Return cpu to let the caller know if the loop is finished or not */
- return cpu;
-}
+ rd->rto_cpu = cpu;
-static int find_next_push_cpu(struct rq *rq)
-{
- struct rq *next_rq;
- int cpu;
+ if (cpu < nr_cpu_ids)
+ return cpu;
- while (1) {
- cpu = rto_next_cpu(rq);
- if (cpu >= nr_cpu_ids)
- break;
- next_rq = cpu_rq(cpu);
+ rd->rto_cpu = -1;
- /* Make sure the next rq can push to this rq */
- if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr)
+ /*
+ * ACQUIRE ensures we see the @rto_mask changes
+ * made prior to the @next value observed.
+ *
+ * Matches WMB in rt_set_overload().
+ */
+ next = atomic_read_acquire(&rd->rto_loop_next);
+
+ if (rd->rto_loop == next)
break;
+
+ rd->rto_loop = next;
}
- return cpu;
+ return -1;
}
-#define RT_PUSH_IPI_EXECUTING 1
-#define RT_PUSH_IPI_RESTART 2
+static inline bool rto_start_trylock(atomic_t *v)
+{
+ return !atomic_cmpxchg_acquire(v, 0, 1);
+}
-static void tell_cpu_to_push(struct rq *rq)
+static inline void rto_start_unlock(atomic_t *v)
{
- int cpu;
+ atomic_set_release(v, 0);
+}
- if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
- raw_spin_lock(&rq->rt.push_lock);
- /* Make sure it's still executing */
- if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
- /*
- * Tell the IPI to restart the loop as things have
- * changed since it started.
- */
- rq->rt.push_flags |= RT_PUSH_IPI_RESTART;
- raw_spin_unlock(&rq->rt.push_lock);
- return;
- }
- raw_spin_unlock(&rq->rt.push_lock);
- }
+static void tell_cpu_to_push(struct rq *rq)
+{
+ int cpu = -1;
- /* When here, there's no IPI going around */
+ /* Keep the loop going if the IPI is currently active */
+ atomic_inc(&rq->rd->rto_loop_next);
- rq->rt.push_cpu = rq->cpu;
- cpu = find_next_push_cpu(rq);
- if (cpu >= nr_cpu_ids)
+ /* Only one CPU can initiate a loop at a time */
+ if (!rto_start_trylock(&rq->rd->rto_loop_start))
return;
- rq->rt.push_flags = RT_PUSH_IPI_EXECUTING;
+ raw_spin_lock(&rq->rd->rto_lock);
- irq_work_queue_on(&rq->rt.push_work, cpu);
+ /*
+ * The rto_cpu is updated under the lock, if it has a valid CPU
+ * then the IPI is still running and will continue due to the
+ * update to loop_next, and nothing needs to be done here.
+ * Otherwise it is finishing up and an IPI needs to be sent.
+ */
+ if (rq->rd->rto_cpu < 0)
+ cpu = rto_next_cpu(rq->rd);
+
+ raw_spin_unlock(&rq->rd->rto_lock);
+
+ rto_start_unlock(&rq->rd->rto_loop_start);
+
+ if (cpu >= 0) {
+ /* Make sure the rd does not get freed while pushing */
+ sched_get_rd(rq->rd);
+ irq_work_queue_on(&rq->rd->rto_push_work, cpu);
+ }
}
/* Called from hardirq context */
-static void try_to_push_tasks(void *arg)
+void rto_push_irq_work_func(struct irq_work *work)
{
- struct rt_rq *rt_rq = arg;
- struct rq *rq, *src_rq;
- int this_cpu;
+ struct root_domain *rd =
+ container_of(work, struct root_domain, rto_push_work);
+ struct rq *rq;
int cpu;
- this_cpu = rt_rq->push_cpu;
-
- /* Paranoid check */
- BUG_ON(this_cpu != smp_processor_id());
+ rq = this_rq();
- rq = cpu_rq(this_cpu);
- src_rq = rq_of_rt_rq(rt_rq);
-
-again:
- if (has_pushable_tasks(rq)) {
- raw_spin_lock(&rq->lock);
- push_rt_task(rq);
- raw_spin_unlock(&rq->lock);
- }
-
- /* Pass the IPI to the next rt overloaded queue */
- raw_spin_lock(&rt_rq->push_lock);
/*
- * If the source queue changed since the IPI went out,
- * we need to restart the search from that CPU again.
+ * We do not need to grab the lock to check for has_pushable_tasks.
+ * When it gets updated, a check is made if a push is possible.
*/
- if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) {
- rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART;
- rt_rq->push_cpu = src_rq->cpu;
+ if (has_pushable_tasks(rq)) {
+ raw_spin_rq_lock(rq);
+ while (push_rt_task(rq, true))
+ ;
+ raw_spin_rq_unlock(rq);
}
- cpu = find_next_push_cpu(src_rq);
+ raw_spin_lock(&rd->rto_lock);
- if (cpu >= nr_cpu_ids)
- rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING;
- raw_spin_unlock(&rt_rq->push_lock);
+ /* Pass the IPI to the next rt overloaded queue */
+ cpu = rto_next_cpu(rd);
- if (cpu >= nr_cpu_ids)
- return;
+ raw_spin_unlock(&rd->rto_lock);
- /*
- * It is possible that a restart caused this CPU to be
- * chosen again. Don't bother with an IPI, just see if we
- * have more to push.
- */
- if (unlikely(cpu == rq->cpu))
- goto again;
+ if (cpu < 0) {
+ sched_put_rd(rd);
+ return;
+ }
/* Try the next RT overloaded CPU */
- irq_work_queue_on(&rt_rq->push_work, cpu);
-}
-
-static void push_irq_work_func(struct irq_work *work)
-{
- struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work);
-
- try_to_push_tasks(rt_rq);
+ irq_work_queue_on(&rd->rto_push_work, cpu);
}
#endif /* HAVE_RT_PUSH_IPI */
-static int pull_rt_task(struct rq *this_rq)
+static void pull_rt_task(struct rq *this_rq)
{
- int this_cpu = this_rq->cpu, ret = 0, cpu;
- struct task_struct *p;
+ int this_cpu = this_rq->cpu, cpu;
+ bool resched = false;
+ struct task_struct *p, *push_task;
struct rq *src_rq;
+ int rt_overload_count = rt_overloaded(this_rq);
- if (likely(!rt_overloaded(this_rq)))
- return 0;
+ if (likely(!rt_overload_count))
+ return;
/*
* Match the barrier from rt_set_overloaded; this guarantees that if we
@@ -1963,10 +2243,15 @@ static int pull_rt_task(struct rq *this_rq)
*/
smp_rmb();
+ /* If we are the only overloaded CPU do nothing */
+ if (rt_overload_count == 1 &&
+ cpumask_test_cpu(this_rq->cpu, this_rq->rd->rto_mask))
+ return;
+
#ifdef HAVE_RT_PUSH_IPI
if (sched_feat(RT_PUSH_IPI)) {
tell_cpu_to_push(this_rq);
- return 0;
+ return;
}
#endif
@@ -1992,6 +2277,7 @@ static int pull_rt_task(struct rq *this_rq)
* double_lock_balance, and another CPU could
* alter this_rq
*/
+ push_task = NULL;
double_lock_balance(this_rq, src_rq);
/*
@@ -2010,20 +2296,21 @@ static int pull_rt_task(struct rq *this_rq)
/*
* There's a chance that p is higher in priority
- * than what's currently running on its cpu.
- * This is just that p is wakeing up and hasn't
+ * than what's currently running on its CPU.
+ * This is just that p is waking up and hasn't
* had a chance to schedule. We only pull
* p if it is lower in priority than the
* current task on the run queue
*/
- if (p->prio < src_rq->curr->prio)
+ if (p->prio < src_rq->donor->prio)
goto skip;
- ret = 1;
-
- deactivate_task(src_rq, p, 0);
- set_task_cpu(p, this_cpu);
- activate_task(this_rq, p, 0);
+ if (is_migration_disabled(p)) {
+ push_task = get_push_task(src_rq);
+ } else {
+ move_queued_task_locked(src_rq, this_rq, p);
+ resched = true;
+ }
/*
* We continue with the search, just in
* case there's an even higher prio task
@@ -2033,14 +2320,19 @@ static int pull_rt_task(struct rq *this_rq)
}
skip:
double_unlock_balance(this_rq, src_rq);
- }
- return ret;
-}
+ if (push_task) {
+ preempt_disable();
+ raw_spin_rq_unlock(this_rq);
+ stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
+ push_task, &src_rq->push_work);
+ preempt_enable();
+ raw_spin_rq_lock(this_rq);
+ }
+ }
-static void post_schedule_rt(struct rq *rq)
-{
- push_rt_tasks(rq);
+ if (resched)
+ resched_curr(this_rq);
}
/*
@@ -2049,53 +2341,15 @@ static void post_schedule_rt(struct rq *rq)
*/
static void task_woken_rt(struct rq *rq, struct task_struct *p)
{
- if (!task_running(rq, p) &&
- !test_tsk_need_resched(rq->curr) &&
- has_pushable_tasks(rq) &&
- p->nr_cpus_allowed > 1 &&
- (dl_task(rq->curr) || rt_task(rq->curr)) &&
- (rq->curr->nr_cpus_allowed < 2 ||
- rq->curr->prio <= p->prio))
- push_rt_tasks(rq);
-}
-
-static void set_cpus_allowed_rt(struct task_struct *p,
- const struct cpumask *new_mask)
-{
- struct rq *rq;
- int weight;
-
- BUG_ON(!rt_task(p));
-
- if (!task_on_rq_queued(p))
- return;
-
- weight = cpumask_weight(new_mask);
-
- /*
- * Only update if the process changes its state from whether it
- * can migrate or not.
- */
- if ((p->nr_cpus_allowed > 1) == (weight > 1))
- return;
-
- rq = task_rq(p);
-
- /*
- * The process used to be able to migrate OR it can now migrate
- */
- if (weight <= 1) {
- if (!task_current(rq, p))
- dequeue_pushable_task(rq, p);
- BUG_ON(!rq->rt.rt_nr_migratory);
- rq->rt.rt_nr_migratory--;
- } else {
- if (!task_current(rq, p))
- enqueue_pushable_task(rq, p);
- rq->rt.rt_nr_migratory++;
- }
+ bool need_to_push = !task_on_cpu(rq, p) &&
+ !test_tsk_need_resched(rq->curr) &&
+ p->nr_cpus_allowed > 1 &&
+ (dl_task(rq->donor) || rt_task(rq->donor)) &&
+ (rq->curr->nr_cpus_allowed < 2 ||
+ rq->donor->prio <= p->prio);
- update_rt_migration(&rq->rt);
+ if (need_to_push)
+ push_rt_tasks(rq);
}
/* Assumes rq->lock is held */
@@ -2136,8 +2390,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
return;
- if (pull_rt_task(rq))
- resched_curr(rq);
+ rt_queue_pull_task(rq);
}
void __init init_sched_rt_class(void)
@@ -2149,7 +2402,6 @@ void __init init_sched_rt_class(void)
GFP_KERNEL, cpu_to_node(i));
}
}
-#endif /* CONFIG_SMP */
/*
* When switching a task to RT, we may overload the runqueue
@@ -2158,23 +2410,24 @@ void __init init_sched_rt_class(void)
*/
static void switched_to_rt(struct rq *rq, struct task_struct *p)
{
- int check_resched = 1;
+ /*
+ * If we are running, update the avg_rt tracking, as the running time
+ * will now on be accounted into the latter.
+ */
+ if (task_current(rq, p)) {
+ update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
+ return;
+ }
/*
- * If we are already running, then there's nothing
- * that needs to be done. But if we are not running
- * we may need to preempt the current running task.
- * If that current running task is also an RT task
+ * If we are not running we may need to preempt the current
+ * running task. If that current running task is also an RT task
* then see if we can move to another run queue.
*/
- if (task_on_rq_queued(p) && rq->curr != p) {
-#ifdef CONFIG_SMP
- if (p->nr_cpus_allowed > 1 && rq->rt.overloaded &&
- /* Don't resched if we changed runqueues */
- push_rt_task(rq) && rq != task_rq(p))
- check_resched = 0;
-#endif /* CONFIG_SMP */
- if (check_resched && p->prio < rq->curr->prio)
+ if (task_on_rq_queued(p)) {
+ if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
+ rt_queue_push_tasks(rq);
+ if (p->prio < rq->donor->prio && cpu_online(cpu_of(rq)))
resched_curr(rq);
}
}
@@ -2184,43 +2437,40 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
* us to initiate a push or pull.
*/
static void
-prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
+prio_changed_rt(struct rq *rq, struct task_struct *p, u64 oldprio)
{
if (!task_on_rq_queued(p))
return;
- if (rq->curr == p) {
-#ifdef CONFIG_SMP
+ if (p->prio == oldprio)
+ return;
+
+ if (task_current_donor(rq, p)) {
/*
* If our priority decreases while running, we
* may need to pull tasks to this runqueue.
*/
if (oldprio < p->prio)
- pull_rt_task(rq);
+ rt_queue_pull_task(rq);
+
/*
* If there's a higher priority task waiting to run
- * then reschedule. Note, the above pull_rt_task
- * can release the rq lock and p could migrate.
- * Only reschedule if p is still on the same runqueue.
+ * then reschedule.
*/
- if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
- resched_curr(rq);
-#else
- /* For UP simply resched on drop of prio */
- if (oldprio < p->prio)
+ if (p->prio > rq->rt.highest_prio.curr)
resched_curr(rq);
-#endif /* CONFIG_SMP */
} else {
/*
* This task is not running, but if it is
* greater than the current running task
* then reschedule.
*/
- if (p->prio < rq->curr->prio)
+ if (p->prio < rq->donor->prio)
resched_curr(rq);
}
}
+#ifdef CONFIG_POSIX_TIMERS
static void watchdog(struct rq *rq, struct task_struct *p)
{
unsigned long soft, hard;
@@ -2238,21 +2488,35 @@ static void watchdog(struct rq *rq, struct task_struct *p)
}
next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
- if (p->rt.timeout > next)
- p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
+ if (p->rt.timeout > next) {
+ posix_cputimers_rt_watchdog(&p->posix_cputimers,
+ p->se.sum_exec_runtime);
+ }
}
}
+#else /* !CONFIG_POSIX_TIMERS: */
+static inline void watchdog(struct rq *rq, struct task_struct *p) { }
+#endif /* !CONFIG_POSIX_TIMERS */
+/*
+ * scheduler tick hitting a task of our scheduling class.
+ *
+ * NOTE: This function can be called remotely by the tick offload that
+ * goes along full dynticks. Therefore no local assumption can be made
+ * and everything must be accessed through the @rq and @curr passed in
+ * parameters.
+ */
static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
{
struct sched_rt_entity *rt_se = &p->rt;
update_curr_rt(rq);
+ update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);
watchdog(rq, p);
/*
- * RR tasks need a special form of timeslice management.
+ * RR tasks need a special form of time-slice management.
* FIFO tasks have no timeslices.
*/
if (p->policy != SCHED_RR)
@@ -2276,16 +2540,6 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
}
}
-static void set_curr_task_rt(struct rq *rq)
-{
- struct task_struct *p = rq->curr;
-
- p->se.exec_start = rq_clock_task(rq);
-
- /* The running task is never eligible for pushing */
- dequeue_pushable_task(rq, p);
-}
-
static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
{
/*
@@ -2297,41 +2551,385 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
return 0;
}
-const struct sched_class rt_sched_class = {
- .next = &fair_sched_class,
+#ifdef CONFIG_SCHED_CORE
+static int task_is_throttled_rt(struct task_struct *p, int cpu)
+{
+ struct rt_rq *rt_rq;
+
+#ifdef CONFIG_RT_GROUP_SCHED // XXX maybe add task_rt_rq(), see also sched_rt_period_rt_rq
+ rt_rq = task_group(p)->rt_rq[cpu];
+ WARN_ON(!rt_group_sched_enabled() && rt_rq->tg != &root_task_group);
+#else
+ rt_rq = &cpu_rq(cpu)->rt;
+#endif
+
+ return rt_rq_throttled(rt_rq);
+}
+#endif /* CONFIG_SCHED_CORE */
+
+DEFINE_SCHED_CLASS(rt) = {
+
+ .queue_mask = 4,
+
.enqueue_task = enqueue_task_rt,
.dequeue_task = dequeue_task_rt,
.yield_task = yield_task_rt,
- .check_preempt_curr = check_preempt_curr_rt,
+ .wakeup_preempt = wakeup_preempt_rt,
- .pick_next_task = pick_next_task_rt,
+ .pick_task = pick_task_rt,
.put_prev_task = put_prev_task_rt,
+ .set_next_task = set_next_task_rt,
-#ifdef CONFIG_SMP
+ .balance = balance_rt,
.select_task_rq = select_task_rq_rt,
-
- .set_cpus_allowed = set_cpus_allowed_rt,
+ .set_cpus_allowed = set_cpus_allowed_common,
.rq_online = rq_online_rt,
.rq_offline = rq_offline_rt,
- .post_schedule = post_schedule_rt,
.task_woken = task_woken_rt,
.switched_from = switched_from_rt,
-#endif
+ .find_lock_rq = find_lock_lowest_rq,
- .set_curr_task = set_curr_task_rt,
.task_tick = task_tick_rt,
.get_rr_interval = get_rr_interval_rt,
- .prio_changed = prio_changed_rt,
.switched_to = switched_to_rt,
+ .prio_changed = prio_changed_rt,
.update_curr = update_curr_rt,
+
+#ifdef CONFIG_SCHED_CORE
+ .task_is_throttled = task_is_throttled_rt,
+#endif
+
+#ifdef CONFIG_UCLAMP_TASK
+ .uclamp_enabled = 1,
+#endif
+};
+
+#ifdef CONFIG_RT_GROUP_SCHED
+/*
+ * Ensure that the real time constraints are schedulable.
+ */
+static DEFINE_MUTEX(rt_constraints_mutex);
+
+static inline int tg_has_rt_tasks(struct task_group *tg)
+{
+ struct task_struct *task;
+ struct css_task_iter it;
+ int ret = 0;
+
+ /*
+ * Autogroups do not have RT tasks; see autogroup_create().
+ */
+ if (task_group_is_autogroup(tg))
+ return 0;
+
+ css_task_iter_start(&tg->css, 0, &it);
+ while (!ret && (task = css_task_iter_next(&it)))
+ ret |= rt_task(task);
+ css_task_iter_end(&it);
+
+ return ret;
+}
+
+struct rt_schedulable_data {
+ struct task_group *tg;
+ u64 rt_period;
+ u64 rt_runtime;
};
-#ifdef CONFIG_SCHED_DEBUG
-extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
+static int tg_rt_schedulable(struct task_group *tg, void *data)
+{
+ struct rt_schedulable_data *d = data;
+ struct task_group *child;
+ unsigned long total, sum = 0;
+ u64 period, runtime;
+
+ period = ktime_to_ns(tg->rt_bandwidth.rt_period);
+ runtime = tg->rt_bandwidth.rt_runtime;
+
+ if (tg == d->tg) {
+ period = d->rt_period;
+ runtime = d->rt_runtime;
+ }
+
+ /*
+ * Cannot have more runtime than the period.
+ */
+ if (runtime > period && runtime != RUNTIME_INF)
+ return -EINVAL;
+
+ /*
+ * Ensure we don't starve existing RT tasks if runtime turns zero.
+ */
+ if (rt_bandwidth_enabled() && !runtime &&
+ tg->rt_bandwidth.rt_runtime && tg_has_rt_tasks(tg))
+ return -EBUSY;
+
+ if (WARN_ON(!rt_group_sched_enabled() && tg != &root_task_group))
+ return -EBUSY;
+
+ total = to_ratio(period, runtime);
+
+ /*
+ * Nobody can have more than the global setting allows.
+ */
+ if (total > to_ratio(global_rt_period(), global_rt_runtime()))
+ return -EINVAL;
+
+ /*
+ * The sum of our children's runtime should not exceed our own.
+ */
+ list_for_each_entry_rcu(child, &tg->children, siblings) {
+ period = ktime_to_ns(child->rt_bandwidth.rt_period);
+ runtime = child->rt_bandwidth.rt_runtime;
+
+ if (child == d->tg) {
+ period = d->rt_period;
+ runtime = d->rt_runtime;
+ }
+
+ sum += to_ratio(period, runtime);
+ }
+
+ if (sum > total)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
+{
+ int ret;
+
+ struct rt_schedulable_data data = {
+ .tg = tg,
+ .rt_period = period,
+ .rt_runtime = runtime,
+ };
+
+ rcu_read_lock();
+ ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static int tg_set_rt_bandwidth(struct task_group *tg,
+ u64 rt_period, u64 rt_runtime)
+{
+ int i, err = 0;
+
+ /*
+ * Disallowing the root group RT runtime is BAD, it would disallow the
+ * kernel creating (and or operating) RT threads.
+ */
+ if (tg == &root_task_group && rt_runtime == 0)
+ return -EINVAL;
+
+ /* No period doesn't make any sense. */
+ if (rt_period == 0)
+ return -EINVAL;
+
+ /*
+ * Bound quota to defend quota against overflow during bandwidth shift.
+ */
+ if (rt_runtime != RUNTIME_INF && rt_runtime > max_rt_runtime)
+ return -EINVAL;
+
+ mutex_lock(&rt_constraints_mutex);
+ err = __rt_schedulable(tg, rt_period, rt_runtime);
+ if (err)
+ goto unlock;
+
+ raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
+ tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
+ tg->rt_bandwidth.rt_runtime = rt_runtime;
+
+ for_each_possible_cpu(i) {
+ struct rt_rq *rt_rq = tg->rt_rq[i];
+
+ raw_spin_lock(&rt_rq->rt_runtime_lock);
+ rt_rq->rt_runtime = rt_runtime;
+ raw_spin_unlock(&rt_rq->rt_runtime_lock);
+ }
+ raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
+unlock:
+ mutex_unlock(&rt_constraints_mutex);
+
+ return err;
+}
+
+int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
+{
+ u64 rt_runtime, rt_period;
+
+ rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
+ rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
+ if (rt_runtime_us < 0)
+ rt_runtime = RUNTIME_INF;
+ else if ((u64)rt_runtime_us > U64_MAX / NSEC_PER_USEC)
+ return -EINVAL;
+
+ return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
+}
+
+long sched_group_rt_runtime(struct task_group *tg)
+{
+ u64 rt_runtime_us;
+
+ if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
+ return -1;
+
+ rt_runtime_us = tg->rt_bandwidth.rt_runtime;
+ do_div(rt_runtime_us, NSEC_PER_USEC);
+ return rt_runtime_us;
+}
+
+int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
+{
+ u64 rt_runtime, rt_period;
+
+ if (rt_period_us > U64_MAX / NSEC_PER_USEC)
+ return -EINVAL;
+
+ rt_period = rt_period_us * NSEC_PER_USEC;
+ rt_runtime = tg->rt_bandwidth.rt_runtime;
+
+ return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
+}
+
+long sched_group_rt_period(struct task_group *tg)
+{
+ u64 rt_period_us;
+
+ rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
+ do_div(rt_period_us, NSEC_PER_USEC);
+ return rt_period_us;
+}
+
+#ifdef CONFIG_SYSCTL
+static int sched_rt_global_constraints(void)
+{
+ int ret = 0;
+
+ mutex_lock(&rt_constraints_mutex);
+ ret = __rt_schedulable(NULL, 0, 0);
+ mutex_unlock(&rt_constraints_mutex);
+
+ return ret;
+}
+#endif /* CONFIG_SYSCTL */
+
+int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
+{
+ /* Don't accept real-time tasks when there is no way for them to run */
+ if (rt_group_sched_enabled() && rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
+ return 0;
+
+ return 1;
+}
+
+#else /* !CONFIG_RT_GROUP_SCHED: */
+
+#ifdef CONFIG_SYSCTL
+static int sched_rt_global_constraints(void)
+{
+ return 0;
+}
+#endif /* CONFIG_SYSCTL */
+#endif /* !CONFIG_RT_GROUP_SCHED */
+
+#ifdef CONFIG_SYSCTL
+static int sched_rt_global_validate(void)
+{
+ if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
+ ((sysctl_sched_rt_runtime > sysctl_sched_rt_period) ||
+ ((u64)sysctl_sched_rt_runtime *
+ NSEC_PER_USEC > max_rt_runtime)))
+ return -EINVAL;
+
+ return 0;
+}
+
+static void sched_rt_do_global(void)
+{
+}
+
+static int sched_rt_handler(const struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int old_period, old_runtime;
+ static DEFINE_MUTEX(mutex);
+ int ret;
+
+ mutex_lock(&mutex);
+ sched_domains_mutex_lock();
+ old_period = sysctl_sched_rt_period;
+ old_runtime = sysctl_sched_rt_runtime;
+
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+ if (!ret && write) {
+ ret = sched_rt_global_validate();
+ if (ret)
+ goto undo;
+
+ ret = sched_dl_global_validate();
+ if (ret)
+ goto undo;
+
+ ret = sched_rt_global_constraints();
+ if (ret)
+ goto undo;
+
+ sched_rt_do_global();
+ sched_dl_do_global();
+ }
+ if (0) {
+undo:
+ sysctl_sched_rt_period = old_period;
+ sysctl_sched_rt_runtime = old_runtime;
+ }
+ sched_domains_mutex_unlock();
+ mutex_unlock(&mutex);
+
+ /*
+ * After changing maximum available bandwidth for DEADLINE, we need to
+ * recompute per root domain and per cpus variables accordingly.
+ */
+ rebuild_sched_domains();
+
+ return ret;
+}
+
+static int sched_rr_handler(const struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int ret;
+ static DEFINE_MUTEX(mutex);
+
+ mutex_lock(&mutex);
+ ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ /*
+ * Make sure that internally we keep jiffies.
+ * Also, writing zero resets the time-slice to default:
+ */
+ if (!ret && write) {
+ sched_rr_timeslice =
+ sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE :
+ msecs_to_jiffies(sysctl_sched_rr_timeslice);
+
+ if (sysctl_sched_rr_timeslice <= 0)
+ sysctl_sched_rr_timeslice = jiffies_to_msecs(RR_TIMESLICE);
+ }
+ mutex_unlock(&mutex);
+
+ return ret;
+}
+#endif /* CONFIG_SYSCTL */
void print_rt_stats(struct seq_file *m, int cpu)
{
@@ -2343,4 +2941,3 @@ void print_rt_stats(struct seq_file *m, int cpu)
print_rt_rq(m, cpu, rt_rq);
rcu_read_unlock();
}
-#endif /* CONFIG_SCHED_DEBUG */
diff --git a/kernel/sched/sched-pelt.h b/kernel/sched/sched-pelt.h
new file mode 100644
index 000000000000..6803cfec7a1e
--- /dev/null
+++ b/kernel/sched/sched-pelt.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Generated by Documentation/scheduler/sched-pelt; do not modify. */
+#include <linux/types.h>
+
+static const u32 runnable_avg_yN_inv[] __maybe_unused = {
+ 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
+ 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
+ 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
+ 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
+ 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
+ 0x85aac367, 0x82cd8698,
+};
+
+#define LOAD_AVG_PERIOD 32
+#define LOAD_AVG_MAX 47742
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e0e129993958..d30cca6870f5 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1,22 +1,97 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Scheduler internal types and methods:
+ */
+#ifndef _KERNEL_SCHED_SCHED_H
+#define _KERNEL_SCHED_SCHED_H
+#include <linux/prandom.h>
+#include <linux/sched/affinity.h>
+#include <linux/sched/autogroup.h>
+#include <linux/sched/cpufreq.h>
+#include <linux/sched/deadline.h>
#include <linux/sched.h>
+#include <linux/sched/loadavg.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/rseq_api.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/smt.h>
+#include <linux/sched/stat.h>
#include <linux/sched/sysctl.h>
-#include <linux/sched/rt.h>
-#include <linux/sched/deadline.h>
-#include <linux/mutex.h>
-#include <linux/spinlock.h>
-#include <linux/stop_machine.h>
+#include <linux/sched/task_flags.h>
+#include <linux/sched/task.h>
+#include <linux/sched/topology.h>
+#include <linux/atomic.h>
+#include <linux/bitmap.h>
+#include <linux/bug.h>
+#include <linux/capability.h>
+#include <linux/cgroup_api.h>
+#include <linux/cgroup.h>
+#include <linux/context_tracking.h>
+#include <linux/cpufreq.h>
+#include <linux/cpumask_api.h>
+#include <linux/ctype.h>
+#include <linux/file.h>
+#include <linux/fs_api.h>
+#include <linux/hrtimer_api.h>
+#include <linux/interrupt.h>
#include <linux/irq_work.h>
+#include <linux/jiffies.h>
+#include <linux/kref_api.h>
+#include <linux/kthread.h>
+#include <linux/ktime_api.h>
+#include <linux/lockdep_api.h>
+#include <linux/lockdep.h>
+#include <linux/minmax.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/mutex_api.h>
+#include <linux/plist.h>
+#include <linux/poll.h>
+#include <linux/proc_fs.h>
+#include <linux/profile.h>
+#include <linux/psi.h>
+#include <linux/rcupdate.h>
+#include <linux/seq_file.h>
+#include <linux/seqlock.h>
+#include <linux/softirq.h>
+#include <linux/spinlock_api.h>
+#include <linux/static_key.h>
+#include <linux/stop_machine.h>
+#include <linux/syscalls_api.h>
+#include <linux/syscalls.h>
#include <linux/tick.h>
-#include <linux/slab.h>
+#include <linux/topology.h>
+#include <linux/types.h>
+#include <linux/u64_stats_sync_api.h>
+#include <linux/uaccess.h>
+#include <linux/wait_api.h>
+#include <linux/wait_bit.h>
+#include <linux/workqueue_api.h>
+#include <linux/delayacct.h>
+#include <linux/mmu_context.h>
-#include "cpupri.h"
-#include "cpudeadline.h"
-#include "cpuacct.h"
+#include <trace/events/power.h>
+#include <trace/events/sched.h>
+
+#include "../workqueue_internal.h"
struct rq;
+struct cfs_rq;
+struct rt_rq;
+struct sched_group;
struct cpuidle_state;
+#ifdef CONFIG_PARAVIRT
+# include <asm/paravirt.h>
+# include <asm/paravirt_api_clock.h>
+#endif
+
+#include <asm/barrier.h>
+
+#include "cpupri.h"
+#include "cpudeadline.h"
+
/* task_struct::on_rq states: */
#define TASK_ON_RQ_QUEUED 1
#define TASK_ON_RQ_MIGRATING 2
@@ -26,61 +101,105 @@ extern __read_mostly int scheduler_running;
extern unsigned long calc_load_update;
extern atomic_long_t calc_load_tasks;
-extern long calc_load_fold_active(struct rq *this_rq);
-extern void update_cpu_load_active(struct rq *this_rq);
+extern void calc_global_load_tick(struct rq *this_rq);
+extern long calc_load_fold_active(struct rq *this_rq, long adjust);
+
+extern void call_trace_sched_update_nr_running(struct rq *rq, int count);
+
+extern int sysctl_sched_rt_period;
+extern int sysctl_sched_rt_runtime;
+extern int sched_rr_timeslice;
+
+/*
+ * Asymmetric CPU capacity bits
+ */
+struct asym_cap_data {
+ struct list_head link;
+ struct rcu_head rcu;
+ unsigned long capacity;
+ unsigned long cpus[];
+};
+
+extern struct list_head asym_cap_list;
+
+#define cpu_capacity_span(asym_data) to_cpumask((asym_data)->cpus)
/*
* Helpers for converting nanosecond timing to jiffy resolution
*/
-#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
+#define NS_TO_JIFFIES(time) ((unsigned long)(time) / (NSEC_PER_SEC/HZ))
/*
* Increase resolution of nice-level calculations for 64-bit architectures.
* The extra resolution improves shares distribution and load balancing of
- * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup
+ * low-weight task groups (eg. nice +19 on an autogroup), deeper task-group
* hierarchies, especially on larger systems. This is not a user-visible change
* and does not change the user-interface for setting shares/weights.
*
* We increase resolution only if we have enough bits to allow this increased
- * resolution (i.e. BITS_PER_LONG > 32). The costs for increasing resolution
- * when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the
- * increased costs.
+ * resolution (i.e. 64-bit). The costs for increasing resolution when 32-bit
+ * are pretty high and the returns do not justify the increased costs.
+ *
+ * Really only required when CONFIG_FAIR_GROUP_SCHED=y is also set, but to
+ * increase coverage and consistency always enable it on 64-bit platforms.
*/
-#if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage under light load */
-# define SCHED_LOAD_RESOLUTION 10
-# define scale_load(w) ((w) << SCHED_LOAD_RESOLUTION)
-# define scale_load_down(w) ((w) >> SCHED_LOAD_RESOLUTION)
+#ifdef CONFIG_64BIT
+# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT)
+# define scale_load(w) ((w) << SCHED_FIXEDPOINT_SHIFT)
+# define scale_load_down(w) \
+({ \
+ unsigned long __w = (w); \
+ \
+ if (__w) \
+ __w = max(2UL, __w >> SCHED_FIXEDPOINT_SHIFT); \
+ __w; \
+})
#else
-# define SCHED_LOAD_RESOLUTION 0
+# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT)
# define scale_load(w) (w)
# define scale_load_down(w) (w)
#endif
-#define SCHED_LOAD_SHIFT (10 + SCHED_LOAD_RESOLUTION)
-#define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT)
-
-#define NICE_0_LOAD SCHED_LOAD_SCALE
-#define NICE_0_SHIFT SCHED_LOAD_SHIFT
+/*
+ * Task weight (visible to users) and its load (invisible to users) have
+ * independent resolution, but they should be well calibrated. We use
+ * scale_load() and scale_load_down(w) to convert between them. The
+ * following must be true:
+ *
+ * scale_load(sched_prio_to_weight[NICE_TO_PRIO(0)-MAX_RT_PRIO]) == NICE_0_LOAD
+ *
+ */
+#define NICE_0_LOAD (1L << NICE_0_LOAD_SHIFT)
/*
* Single value that decides SCHED_DEADLINE internal math precision.
* 10 -> just above 1us
* 9 -> just above 0.5us
*/
-#define DL_SCALE (10)
+#define DL_SCALE 10
/*
- * These are the 'tuning knobs' of the scheduler:
+ * Single value that denotes runtime == period, ie unlimited time.
*/
+#define RUNTIME_INF ((u64)~0ULL)
-/*
- * single value that denotes runtime == period, ie unlimited time.
- */
-#define RUNTIME_INF ((u64)~0ULL)
+static inline int idle_policy(int policy)
+{
+ return policy == SCHED_IDLE;
+}
+
+static inline int normal_policy(int policy)
+{
+#ifdef CONFIG_SCHED_CLASS_EXT
+ if (policy == SCHED_EXT)
+ return true;
+#endif
+ return policy == SCHED_NORMAL;
+}
static inline int fair_policy(int policy)
{
- return policy == SCHED_NORMAL || policy == SCHED_BATCH;
+ return normal_policy(policy) || policy == SCHED_BATCH;
}
static inline int rt_policy(int policy)
@@ -93,6 +212,17 @@ static inline int dl_policy(int policy)
return policy == SCHED_DEADLINE;
}
+static inline bool valid_policy(int policy)
+{
+ return idle_policy(policy) || fair_policy(policy) ||
+ rt_policy(policy) || dl_policy(policy);
+}
+
+static inline int task_has_idle_policy(struct task_struct *p)
+{
+ return idle_policy(p->policy);
+}
+
static inline int task_has_rt_policy(struct task_struct *p)
{
return rt_policy(p->policy);
@@ -103,18 +233,73 @@ static inline int task_has_dl_policy(struct task_struct *p)
return dl_policy(p->policy);
}
-static inline bool dl_time_before(u64 a, u64 b)
+#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
+
+static inline void update_avg(u64 *avg, u64 sample)
+{
+ s64 diff = sample - *avg;
+
+ *avg += diff / 8;
+}
+
+/*
+ * Shifting a value by an exponent greater *or equal* to the size of said value
+ * is UB; cap at size-1.
+ */
+#define shr_bound(val, shift) \
+ (val >> min_t(typeof(shift), shift, BITS_PER_TYPE(typeof(val)) - 1))
+
+/*
+ * cgroup weight knobs should use the common MIN, DFL and MAX values which are
+ * 1, 100 and 10000 respectively. While it loses a bit of range on both ends, it
+ * maps pretty well onto the shares value used by scheduler and the round-trip
+ * conversions preserve the original value over the entire range.
+ */
+static inline unsigned long sched_weight_from_cgroup(unsigned long cgrp_weight)
+{
+ return DIV_ROUND_CLOSEST_ULL(cgrp_weight * 1024, CGROUP_WEIGHT_DFL);
+}
+
+static inline unsigned long sched_weight_to_cgroup(unsigned long weight)
{
- return (s64)(a - b) < 0;
+ return clamp_t(unsigned long,
+ DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024),
+ CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
+}
+
+/*
+ * !! For sched_setattr_nocheck() (kernel) only !!
+ *
+ * This is actually gross. :(
+ *
+ * It is used to make schedutil kworker(s) higher priority than SCHED_DEADLINE
+ * tasks, but still be able to sleep. We need this on platforms that cannot
+ * atomically change clock frequency. Remove once fast switching will be
+ * available on such platforms.
+ *
+ * SUGOV stands for SchedUtil GOVernor.
+ */
+#define SCHED_FLAG_SUGOV 0x10000000
+
+#define SCHED_DL_FLAGS (SCHED_FLAG_RECLAIM | SCHED_FLAG_DL_OVERRUN | SCHED_FLAG_SUGOV)
+
+static inline bool dl_entity_is_special(const struct sched_dl_entity *dl_se)
+{
+#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
+ return unlikely(dl_se->flags & SCHED_FLAG_SUGOV);
+#else
+ return false;
+#endif
}
/*
* Tells if entity @a should preempt entity @b.
*/
-static inline bool
-dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b)
+static inline bool dl_entity_preempt(const struct sched_dl_entity *a,
+ const struct sched_dl_entity *b)
{
- return dl_time_before(a->deadline, b->deadline);
+ return dl_entity_is_special(a) ||
+ dl_time_before(a->deadline, b->deadline);
}
/*
@@ -131,15 +316,20 @@ struct rt_bandwidth {
ktime_t rt_period;
u64 rt_runtime;
struct hrtimer rt_period_timer;
+ unsigned int rt_period_active;
};
-void __dl_clear_params(struct task_struct *p);
+static inline int dl_bandwidth_enabled(void)
+{
+ return sysctl_sched_rt_runtime >= 0;
+}
/*
- * To keep the bandwidth of -deadline tasks and groups under control
+ * To keep the bandwidth of -deadline tasks under control
* we need some place where:
- * - store the maximum -deadline bandwidth of the system (the group);
- * - cache the fraction of that bandwidth that is currently allocated.
+ * - store the maximum -deadline bandwidth of each cpu;
+ * - cache the fraction of bandwidth that is currently allocated in
+ * each root domain;
*
* This is all done in the data structure below. It is similar to the
* one used for RT-throttling (rt_bandwidth), with the main difference
@@ -147,123 +337,192 @@ void __dl_clear_params(struct task_struct *p);
* do not decrease any runtime while the group "executes", neither we
* need a timer to replenish it.
*
- * With respect to SMP, the bandwidth is given on a per-CPU basis,
+ * With respect to SMP, bandwidth is given on a per root domain basis,
* meaning that:
- * - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU;
- * - dl_total_bw array contains, in the i-eth element, the currently
- * allocated bandwidth on the i-eth CPU.
- * Moreover, groups consume bandwidth on each CPU, while tasks only
- * consume bandwidth on the CPU they're running on.
- * Finally, dl_total_bw_cpu is used to cache the index of dl_total_bw
- * that will be shown the next time the proc or cgroup controls will
- * be red. It on its turn can be changed by writing on its own
- * control.
+ * - bw (< 100%) is the deadline bandwidth of each CPU;
+ * - total_bw is the currently allocated bandwidth in each root domain;
*/
-struct dl_bandwidth {
- raw_spinlock_t dl_runtime_lock;
- u64 dl_runtime;
- u64 dl_period;
-};
-
-static inline int dl_bandwidth_enabled(void)
-{
- return sysctl_sched_rt_runtime >= 0;
-}
-
-extern struct dl_bw *dl_bw_of(int i);
-
struct dl_bw {
- raw_spinlock_t lock;
- u64 bw, total_bw;
+ raw_spinlock_t lock;
+ u64 bw;
+ u64 total_bw;
};
-static inline
-void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
-{
- dl_b->total_bw -= tsk_bw;
-}
+extern void init_dl_bw(struct dl_bw *dl_b);
+extern int sched_dl_global_validate(void);
+extern void sched_dl_do_global(void);
+extern int sched_dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr);
+extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr);
+extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr);
+extern bool __checkparam_dl(const struct sched_attr *attr);
+extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr);
+extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
+extern int dl_bw_deactivate(int cpu);
+extern s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec);
+/*
+ * SCHED_DEADLINE supports servers (nested scheduling) with the following
+ * interface:
+ *
+ * dl_se::rq -- runqueue we belong to.
+ *
+ * dl_se::server_pick() -- nested pick_next_task(); we yield the period if this
+ * returns NULL.
+ *
+ * dl_server_update() -- called from update_curr_common(), propagates runtime
+ * to the server.
+ *
+ * dl_server_start() -- start the server when it has tasks; it will stop
+ * automatically when there are no more tasks, per
+ * dl_se::server_pick() returning NULL.
+ *
+ * dl_server_stop() -- (force) stop the server; use when updating
+ * parameters.
+ *
+ * dl_server_init() -- initializes the server.
+ *
+ * When started the dl_server will (per dl_defer) schedule a timer for its
+ * zero-laxity point -- that is, unlike regular EDF tasks which run ASAP, a
+ * server will run at the very end of its period.
+ *
+ * This is done such that any runtime from the target class can be accounted
+ * against the server -- through dl_server_update() above -- such that when it
+ * becomes time to run, it might already be out of runtime and get deferred
+ * until the next period. In this case dl_server_timer() will alternate
+ * between defer and replenish but never actually enqueue the server.
+ *
+ * Only when the target class does not manage to exhaust the server's runtime
+ * (there's actualy starvation in the given period), will the dl_server get on
+ * the runqueue. Once queued it will pick tasks from the target class and run
+ * them until either its runtime is exhaused, at which point its back to
+ * dl_server_timer, or until there are no more tasks to run, at which point
+ * the dl_server stops itself.
+ *
+ * By stopping at this point the dl_server retains bandwidth, which, if a new
+ * task wakes up imminently (starting the server again), can be used --
+ * subject to CBS wakeup rules -- without having to wait for the next period.
+ *
+ * Additionally, because of the dl_defer behaviour the start/stop behaviour is
+ * naturally thottled to once per period, avoiding high context switch
+ * workloads from spamming the hrtimer program/cancel paths.
+ */
+extern void dl_server_update_idle(struct sched_dl_entity *dl_se, s64 delta_exec);
+extern void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec);
+extern void dl_server_start(struct sched_dl_entity *dl_se);
+extern void dl_server_stop(struct sched_dl_entity *dl_se);
+extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
+ dl_server_pick_f pick_task);
+extern void sched_init_dl_servers(void);
-static inline
-void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
-{
- dl_b->total_bw += tsk_bw;
-}
+extern void fair_server_init(struct rq *rq);
+extern void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq);
+extern int dl_server_apply_params(struct sched_dl_entity *dl_se,
+ u64 runtime, u64 period, bool init);
-static inline
-bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
+static inline bool dl_server_active(struct sched_dl_entity *dl_se)
{
- return dl_b->bw != -1 &&
- dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
+ return dl_se->dl_server_active;
}
-extern struct mutex sched_domains_mutex;
-
#ifdef CONFIG_CGROUP_SCHED
-#include <linux/cgroup.h>
+extern struct list_head task_groups;
-struct cfs_rq;
-struct rt_rq;
+#ifdef CONFIG_GROUP_SCHED_BANDWIDTH
+extern const u64 max_bw_quota_period_us;
-extern struct list_head task_groups;
+/*
+ * default period for group bandwidth.
+ * default: 0.1s, units: microseconds
+ */
+static inline u64 default_bw_period_us(void)
+{
+ return 100000ULL;
+}
+#endif /* CONFIG_GROUP_SCHED_BANDWIDTH */
struct cfs_bandwidth {
#ifdef CONFIG_CFS_BANDWIDTH
- raw_spinlock_t lock;
- ktime_t period;
- u64 quota, runtime;
- s64 hierarchical_quota;
- u64 runtime_expires;
-
- int idle, timer_active;
- struct hrtimer period_timer, slack_timer;
- struct list_head throttled_cfs_rq;
-
- /* statistics */
- int nr_periods, nr_throttled;
- u64 throttled_time;
-#endif
+ raw_spinlock_t lock;
+ ktime_t period;
+ u64 quota;
+ u64 runtime;
+ u64 burst;
+ u64 runtime_snap;
+ s64 hierarchical_quota;
+
+ u8 idle;
+ u8 period_active;
+ u8 slack_started;
+ struct hrtimer period_timer;
+ struct hrtimer slack_timer;
+ struct list_head throttled_cfs_rq;
+
+ /* Statistics: */
+ int nr_periods;
+ int nr_throttled;
+ int nr_burst;
+ u64 throttled_time;
+ u64 burst_time;
+#endif /* CONFIG_CFS_BANDWIDTH */
};
-/* task group related information */
+/* Task group related information */
struct task_group {
struct cgroup_subsys_state css;
-#ifdef CONFIG_FAIR_GROUP_SCHED
- /* schedulable entities of this group on each cpu */
- struct sched_entity **se;
- /* runqueue "owned" by this group on each cpu */
- struct cfs_rq **cfs_rq;
- unsigned long shares;
-
-#ifdef CONFIG_SMP
- atomic_long_t load_avg;
- atomic_t runnable_avg;
-#endif
+#ifdef CONFIG_GROUP_SCHED_WEIGHT
+ /* A positive value indicates that this is a SCHED_IDLE group. */
+ int idle;
#endif
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ /* schedulable entities of this group on each CPU */
+ struct sched_entity **se;
+ /* runqueue "owned" by this group on each CPU */
+ struct cfs_rq **cfs_rq;
+ unsigned long shares;
+ /*
+ * load_avg can be heavily contended at clock tick time, so put
+ * it in its own cache-line separated from the fields above which
+ * will also be accessed at each tick.
+ */
+ atomic_long_t load_avg ____cacheline_aligned;
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
#ifdef CONFIG_RT_GROUP_SCHED
- struct sched_rt_entity **rt_se;
- struct rt_rq **rt_rq;
+ struct sched_rt_entity **rt_se;
+ struct rt_rq **rt_rq;
- struct rt_bandwidth rt_bandwidth;
+ struct rt_bandwidth rt_bandwidth;
#endif
- struct rcu_head rcu;
- struct list_head list;
+ struct scx_task_group scx;
- struct task_group *parent;
- struct list_head siblings;
- struct list_head children;
+ struct rcu_head rcu;
+ struct list_head list;
+
+ struct task_group *parent;
+ struct list_head siblings;
+ struct list_head children;
#ifdef CONFIG_SCHED_AUTOGROUP
- struct autogroup *autogroup;
+ struct autogroup *autogroup;
+#endif
+
+ struct cfs_bandwidth cfs_bandwidth;
+
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+ /* The two decimal precision [%] value requested from user-space */
+ unsigned int uclamp_pct[UCLAMP_CNT];
+ /* Clamp values requested for a task group */
+ struct uclamp_se uclamp_req[UCLAMP_CNT];
+ /* Effective clamp values used for a task group */
+ struct uclamp_se uclamp[UCLAMP_CNT];
#endif
- struct cfs_bandwidth cfs_bandwidth;
};
-#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_GROUP_SCHED_WEIGHT
#define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
/*
@@ -274,8 +533,8 @@ struct task_group {
* (The default weight is 1024 - so there's no practical
* limitation from this.)
*/
-#define MIN_SHARES (1UL << 1)
-#define MAX_SHARES (1UL << 18)
+#define MIN_SHARES (1UL << 1)
+#define MAX_SHARES (1UL << 18)
#endif
typedef int (*tg_visitor)(struct task_group *, void *);
@@ -294,91 +553,169 @@ static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
return walk_tg_tree_from(&root_task_group, down, up, data);
}
+static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
+{
+ return css ? container_of(css, struct task_group, css) : NULL;
+}
+
extern int tg_nop(struct task_group *tg, void *data);
+#ifdef CONFIG_FAIR_GROUP_SCHED
extern void free_fair_sched_group(struct task_group *tg);
extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
-extern void unregister_fair_sched_group(struct task_group *tg, int cpu);
+extern void online_fair_sched_group(struct task_group *tg);
+extern void unregister_fair_sched_group(struct task_group *tg);
+#else /* !CONFIG_FAIR_GROUP_SCHED: */
+static inline void free_fair_sched_group(struct task_group *tg) { }
+static inline int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
+{
+ return 1;
+}
+static inline void online_fair_sched_group(struct task_group *tg) { }
+static inline void unregister_fair_sched_group(struct task_group *tg) { }
+#endif /* !CONFIG_FAIR_GROUP_SCHED */
+
extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
struct sched_entity *se, int cpu,
struct sched_entity *parent);
-extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
-extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
+extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent);
extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
-extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force);
+extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
+extern bool cfs_task_bw_constrained(struct task_struct *p);
-extern void free_rt_sched_group(struct task_group *tg);
-extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent);
extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
struct sched_rt_entity *rt_se, int cpu,
struct sched_rt_entity *parent);
+extern int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us);
+extern int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us);
+extern long sched_group_rt_runtime(struct task_group *tg);
+extern long sched_group_rt_period(struct task_group *tg);
+extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk);
extern struct task_group *sched_create_group(struct task_group *parent);
extern void sched_online_group(struct task_group *tg,
struct task_group *parent);
extern void sched_destroy_group(struct task_group *tg);
-extern void sched_offline_group(struct task_group *tg);
+extern void sched_release_group(struct task_group *tg);
-extern void sched_move_task(struct task_struct *tsk);
+extern void sched_move_task(struct task_struct *tsk, bool for_autogroup);
#ifdef CONFIG_FAIR_GROUP_SCHED
extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
-#endif
-#else /* CONFIG_CGROUP_SCHED */
+extern int sched_group_set_idle(struct task_group *tg, long idle);
+
+extern void set_task_rq_fair(struct sched_entity *se,
+ struct cfs_rq *prev, struct cfs_rq *next);
+#else /* !CONFIG_FAIR_GROUP_SCHED: */
+static inline int sched_group_set_shares(struct task_group *tg, unsigned long shares) { return 0; }
+static inline int sched_group_set_idle(struct task_group *tg, long idle) { return 0; }
+#endif /* !CONFIG_FAIR_GROUP_SCHED */
+
+#else /* !CONFIG_CGROUP_SCHED: */
struct cfs_bandwidth { };
-#endif /* CONFIG_CGROUP_SCHED */
+static inline bool cfs_task_bw_constrained(struct task_struct *p) { return false; }
+
+#endif /* !CONFIG_CGROUP_SCHED */
+
+extern void unregister_rt_sched_group(struct task_group *tg);
+extern void free_rt_sched_group(struct task_group *tg);
+extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent);
+
+/*
+ * u64_u32_load/u64_u32_store
+ *
+ * Use a copy of a u64 value to protect against data race. This is only
+ * applicable for 32-bits architectures.
+ */
+#ifdef CONFIG_64BIT
+# define u64_u32_load_copy(var, copy) var
+# define u64_u32_store_copy(var, copy, val) (var = val)
+#else
+# define u64_u32_load_copy(var, copy) \
+({ \
+ u64 __val, __val_copy; \
+ do { \
+ __val_copy = copy; \
+ /* \
+ * paired with u64_u32_store_copy(), ordering access \
+ * to var and copy. \
+ */ \
+ smp_rmb(); \
+ __val = var; \
+ } while (__val != __val_copy); \
+ __val; \
+})
+# define u64_u32_store_copy(var, copy, val) \
+do { \
+ typeof(val) __val = (val); \
+ var = __val; \
+ /* \
+ * paired with u64_u32_load_copy(), ordering access to var and \
+ * copy. \
+ */ \
+ smp_wmb(); \
+ copy = __val; \
+} while (0)
+#endif
+# define u64_u32_load(var) u64_u32_load_copy(var, var##_copy)
+# define u64_u32_store(var, val) u64_u32_store_copy(var, var##_copy, val)
+
+struct balance_callback {
+ struct balance_callback *next;
+ void (*func)(struct rq *rq);
+};
/* CFS-related fields in a runqueue */
struct cfs_rq {
- struct load_weight load;
- unsigned int nr_running, h_nr_running;
-
- u64 exec_clock;
- u64 min_vruntime;
-#ifndef CONFIG_64BIT
- u64 min_vruntime_copy;
+ struct load_weight load;
+ unsigned int nr_queued;
+ unsigned int h_nr_queued; /* SCHED_{NORMAL,BATCH,IDLE} */
+ unsigned int h_nr_runnable; /* SCHED_{NORMAL,BATCH,IDLE} */
+ unsigned int h_nr_idle; /* SCHED_IDLE */
+
+ s64 avg_vruntime;
+ u64 avg_load;
+
+ u64 zero_vruntime;
+#ifdef CONFIG_SCHED_CORE
+ unsigned int forceidle_seq;
+ u64 zero_vruntime_fi;
#endif
- struct rb_root tasks_timeline;
- struct rb_node *rb_leftmost;
+ struct rb_root_cached tasks_timeline;
/*
* 'curr' points to currently running entity on this cfs_rq.
* It is set to NULL otherwise (i.e when none are currently running).
*/
- struct sched_entity *curr, *next, *last, *skip;
+ struct sched_entity *curr;
+ struct sched_entity *next;
-#ifdef CONFIG_SCHED_DEBUG
- unsigned int nr_spread_over;
-#endif
-
-#ifdef CONFIG_SMP
/*
- * CFS Load tracking
- * Under CFS, load is tracked on a per-entity basis and aggregated up.
- * This allows for the description of both thread and group usage (in
- * the FAIR_GROUP_SCHED case).
- * runnable_load_avg is the sum of the load_avg_contrib of the
- * sched_entities on the rq.
- * blocked_load_avg is similar to runnable_load_avg except that its
- * the blocked sched_entities on the rq.
- * utilization_load_avg is the sum of the average running time of the
- * sched_entities on the rq.
+ * CFS load tracking
*/
- unsigned long runnable_load_avg, blocked_load_avg, utilization_load_avg;
- atomic64_t decay_counter;
- u64 last_decay;
- atomic_long_t removed_load;
+ struct sched_avg avg;
+#ifndef CONFIG_64BIT
+ u64 last_update_time_copy;
+#endif
+ struct {
+ raw_spinlock_t lock ____cacheline_aligned;
+ int nr;
+ unsigned long load_avg;
+ unsigned long util_avg;
+ unsigned long runnable_avg;
+ } removed;
#ifdef CONFIG_FAIR_GROUP_SCHED
- /* Required to track per-cpu representation of a task_group */
- u32 tg_runnable_contrib;
- unsigned long tg_load_contrib;
+ u64 last_update_tg_load_avg;
+ unsigned long tg_load_avg_contrib;
+ long propagate;
+ long prop_runnable_sum;
/*
* h_load = weight * f(tg)
@@ -386,165 +723,391 @@ struct cfs_rq {
* Where f(tg) is the recursive weight fraction assigned to
* this group.
*/
- unsigned long h_load;
- u64 last_h_load_update;
- struct sched_entity *h_load_next;
+ unsigned long h_load;
+ u64 last_h_load_update;
+ struct sched_entity *h_load_next;
#endif /* CONFIG_FAIR_GROUP_SCHED */
-#endif /* CONFIG_SMP */
#ifdef CONFIG_FAIR_GROUP_SCHED
- struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
+ struct rq *rq; /* CPU runqueue to which this cfs_rq is attached */
/*
* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
* a hierarchy). Non-leaf lrqs hold other higher schedulable entities
* (like users, containers etc.)
*
- * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
- * list is used during load balance.
+ * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a CPU.
+ * This list is used during load balance.
*/
- int on_list;
- struct list_head leaf_cfs_rq_list;
- struct task_group *tg; /* group that "owns" this runqueue */
+ int on_list;
+ struct list_head leaf_cfs_rq_list;
+ struct task_group *tg; /* group that "owns" this runqueue */
+
+ /* Locally cached copy of our task_group's idle value */
+ int idle;
#ifdef CONFIG_CFS_BANDWIDTH
- int runtime_enabled;
- u64 runtime_expires;
- s64 runtime_remaining;
-
- u64 throttled_clock, throttled_clock_task;
- u64 throttled_clock_task_time;
- int throttled, throttle_count;
- struct list_head throttled_list;
+ int runtime_enabled;
+ s64 runtime_remaining;
+
+ u64 throttled_pelt_idle;
+#ifndef CONFIG_64BIT
+ u64 throttled_pelt_idle_copy;
+#endif
+ u64 throttled_clock;
+ u64 throttled_clock_pelt;
+ u64 throttled_clock_pelt_time;
+ u64 throttled_clock_self;
+ u64 throttled_clock_self_time;
+ bool throttled:1;
+ bool pelt_clock_throttled:1;
+ int throttle_count;
+ struct list_head throttled_list;
+ struct list_head throttled_csd_list;
+ struct list_head throttled_limbo_list;
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
};
+#ifdef CONFIG_SCHED_CLASS_EXT
+/* scx_rq->flags, protected by the rq lock */
+enum scx_rq_flags {
+ /*
+ * A hotplugged CPU starts scheduling before rq_online_scx(). Track
+ * ops.cpu_on/offline() state so that ops.enqueue/dispatch() are called
+ * only while the BPF scheduler considers the CPU to be online.
+ */
+ SCX_RQ_ONLINE = 1 << 0,
+ SCX_RQ_CAN_STOP_TICK = 1 << 1,
+ SCX_RQ_BAL_KEEP = 1 << 3, /* balance decided to keep current */
+ SCX_RQ_BYPASSING = 1 << 4,
+ SCX_RQ_CLK_VALID = 1 << 5, /* RQ clock is fresh and valid */
+ SCX_RQ_BAL_CB_PENDING = 1 << 6, /* must queue a cb after dispatching */
+
+ SCX_RQ_IN_WAKEUP = 1 << 16,
+ SCX_RQ_IN_BALANCE = 1 << 17,
+};
+
+struct scx_rq {
+ struct scx_dispatch_q local_dsq;
+ struct list_head runnable_list; /* runnable tasks on this rq */
+ struct list_head ddsp_deferred_locals; /* deferred ddsps from enq */
+ unsigned long ops_qseq;
+ u64 extra_enq_flags; /* see move_task_to_local_dsq() */
+ u32 nr_running;
+ u32 cpuperf_target; /* [0, SCHED_CAPACITY_SCALE] */
+ bool cpu_released;
+ u32 flags;
+ u64 clock; /* current per-rq clock -- see scx_bpf_now() */
+ cpumask_var_t cpus_to_kick;
+ cpumask_var_t cpus_to_kick_if_idle;
+ cpumask_var_t cpus_to_preempt;
+ cpumask_var_t cpus_to_wait;
+ unsigned long kick_sync;
+ local_t reenq_local_deferred;
+ struct balance_callback deferred_bal_cb;
+ struct irq_work deferred_irq_work;
+ struct irq_work kick_cpus_irq_work;
+ struct scx_dispatch_q bypass_dsq;
+};
+#endif /* CONFIG_SCHED_CLASS_EXT */
+
static inline int rt_bandwidth_enabled(void)
{
return sysctl_sched_rt_runtime >= 0;
}
/* RT IPI pull logic requires IRQ_WORK */
-#ifdef CONFIG_IRQ_WORK
+#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_SMP)
# define HAVE_RT_PUSH_IPI
#endif
/* Real-Time classes' related field in a runqueue: */
struct rt_rq {
- struct rt_prio_array active;
- unsigned int rt_nr_running;
-#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
+ struct rt_prio_array active;
+ unsigned int rt_nr_running;
+ unsigned int rr_nr_running;
struct {
- int curr; /* highest queued rt task prio */
-#ifdef CONFIG_SMP
- int next; /* next highest */
-#endif
+ int curr; /* highest queued rt task prio */
+ int next; /* next highest */
} highest_prio;
-#endif
-#ifdef CONFIG_SMP
- unsigned long rt_nr_migratory;
- unsigned long rt_nr_total;
- int overloaded;
- struct plist_head pushable_tasks;
-#ifdef HAVE_RT_PUSH_IPI
- int push_flags;
- int push_cpu;
- struct irq_work push_work;
- raw_spinlock_t push_lock;
-#endif
-#endif /* CONFIG_SMP */
- int rt_queued;
+ bool overloaded;
+ struct plist_head pushable_tasks;
- int rt_throttled;
- u64 rt_time;
- u64 rt_runtime;
- /* Nests inside the rq lock: */
- raw_spinlock_t rt_runtime_lock;
+ int rt_queued;
#ifdef CONFIG_RT_GROUP_SCHED
- unsigned long rt_nr_boosted;
+ int rt_throttled;
+ u64 rt_time; /* consumed RT time, goes up in update_curr_rt */
+ u64 rt_runtime; /* allotted RT time, "slice" from rt_bandwidth, RT sharing/balancing */
+ /* Nests inside the rq lock: */
+ raw_spinlock_t rt_runtime_lock;
- struct rq *rq;
- struct task_group *tg;
+ unsigned int rt_nr_boosted;
+
+ struct rq *rq; /* this is always top-level rq, cache? */
+#endif
+#ifdef CONFIG_CGROUP_SCHED
+ struct task_group *tg; /* this tg has "this" rt_rq on given CPU for runnable entities */
#endif
};
+static inline bool rt_rq_is_runnable(struct rt_rq *rt_rq)
+{
+ return rt_rq->rt_queued && rt_rq->rt_nr_running;
+}
+
/* Deadline class' related fields in a runqueue */
struct dl_rq {
/* runqueue is an rbtree, ordered by deadline */
- struct rb_root rb_root;
- struct rb_node *rb_leftmost;
+ struct rb_root_cached root;
- unsigned long dl_nr_running;
+ unsigned int dl_nr_running;
-#ifdef CONFIG_SMP
/*
* Deadline values of the currently executing and the
* earliest ready task on this rq. Caching these facilitates
- * the decision wether or not a ready but not running task
+ * the decision whether or not a ready but not running task
* should migrate somewhere else.
*/
struct {
- u64 curr;
- u64 next;
+ u64 curr;
+ u64 next;
} earliest_dl;
- unsigned long dl_nr_migratory;
- int overloaded;
+ bool overloaded;
/*
* Tasks on this rq that can be pushed away. They are kept in
* an rb-tree, ordered by tasks' deadlines, with caching
* of the leftmost (earliest deadline) element.
*/
- struct rb_root pushable_dl_tasks_root;
- struct rb_node *pushable_dl_tasks_leftmost;
-#else
- struct dl_bw dl_bw;
-#endif
+ struct rb_root_cached pushable_dl_tasks_root;
+
+ /*
+ * "Active utilization" for this runqueue: increased when a
+ * task wakes up (becomes TASK_RUNNING) and decreased when a
+ * task blocks
+ */
+ u64 running_bw;
+
+ /*
+ * Utilization of the tasks "assigned" to this runqueue (including
+ * the tasks that are in runqueue and the tasks that executed on this
+ * CPU and blocked). Increased when a task moves to this runqueue, and
+ * decreased when the task moves away (migrates, changes scheduling
+ * policy, or terminates).
+ * This is needed to compute the "inactive utilization" for the
+ * runqueue (inactive utilization = this_bw - running_bw).
+ */
+ u64 this_bw;
+ u64 extra_bw;
+
+ /*
+ * Maximum available bandwidth for reclaiming by SCHED_FLAG_RECLAIM
+ * tasks of this rq. Used in calculation of reclaimable bandwidth(GRUB).
+ */
+ u64 max_bw;
+
+ /*
+ * Inverse of the fraction of CPU utilization that can be reclaimed
+ * by the GRUB algorithm.
+ */
+ u64 bw_ratio;
};
-#ifdef CONFIG_SMP
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+/* An entity is a task if it doesn't "own" a runqueue */
+#define entity_is_task(se) (!se->my_q)
+
+static inline void se_update_runnable(struct sched_entity *se)
+{
+ if (!entity_is_task(se))
+ se->runnable_weight = se->my_q->h_nr_runnable;
+}
+
+static inline long se_runnable(struct sched_entity *se)
+{
+ if (se->sched_delayed)
+ return false;
+
+ if (entity_is_task(se))
+ return !!se->on_rq;
+ else
+ return se->runnable_weight;
+}
+
+#else /* !CONFIG_FAIR_GROUP_SCHED: */
+
+#define entity_is_task(se) 1
+
+static inline void se_update_runnable(struct sched_entity *se) { }
+
+static inline long se_runnable(struct sched_entity *se)
+{
+ if (se->sched_delayed)
+ return false;
+
+ return !!se->on_rq;
+}
+
+#endif /* !CONFIG_FAIR_GROUP_SCHED */
+
+/*
+ * XXX we want to get rid of these helpers and use the full load resolution.
+ */
+static inline long se_weight(struct sched_entity *se)
+{
+ return scale_load_down(se->load.weight);
+}
+
+
+static inline bool sched_asym_prefer(int a, int b)
+{
+ return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b);
+}
+
+struct perf_domain {
+ struct em_perf_domain *em_pd;
+ struct perf_domain *next;
+ struct rcu_head rcu;
+};
/*
* We add the notion of a root-domain which will be used to define per-domain
* variables. Each exclusive cpuset essentially defines an island domain by
- * fully partitioning the member cpus from any other cpuset. Whenever a new
+ * fully partitioning the member CPUs from any other cpuset. Whenever a new
* exclusive cpuset is created, we also create and attach a new root-domain
* object.
*
*/
struct root_domain {
- atomic_t refcount;
- atomic_t rto_count;
- struct rcu_head rcu;
- cpumask_var_t span;
- cpumask_var_t online;
+ atomic_t refcount;
+ atomic_t rto_count;
+ struct rcu_head rcu;
+ cpumask_var_t span;
+ cpumask_var_t online;
- /* Indicate more than one runnable task for any CPU */
- bool overload;
+ /*
+ * Indicate pullable load on at least one CPU, e.g:
+ * - More than one runnable task
+ * - Running task is misfit
+ */
+ bool overloaded;
+
+ /* Indicate one or more CPUs over-utilized (tipping point) */
+ bool overutilized;
/*
* The bit corresponding to a CPU gets set here if such CPU has more
* than one runnable -deadline task (as it is below for RT tasks).
*/
- cpumask_var_t dlo_mask;
- atomic_t dlo_count;
- struct dl_bw dl_bw;
- struct cpudl cpudl;
+ cpumask_var_t dlo_mask;
+ atomic_t dlo_count;
+ struct dl_bw dl_bw;
+ struct cpudl cpudl;
+
+ /*
+ * Indicate whether a root_domain's dl_bw has been checked or
+ * updated. It's monotonously increasing value.
+ *
+ * Also, some corner cases, like 'wrap around' is dangerous, but given
+ * that u64 is 'big enough'. So that shouldn't be a concern.
+ */
+ u64 visit_cookie;
+#ifdef HAVE_RT_PUSH_IPI
+ /*
+ * For IPI pull requests, loop across the rto_mask.
+ */
+ struct irq_work rto_push_work;
+ raw_spinlock_t rto_lock;
+ /* These are only updated and read within rto_lock */
+ int rto_loop;
+ int rto_cpu;
+ /* These atomics are updated outside of a lock */
+ atomic_t rto_loop_next;
+ atomic_t rto_loop_start;
+#endif /* HAVE_RT_PUSH_IPI */
/*
* The "RT overload" flag: it gets set if a CPU has more than
* one runnable RT task.
*/
- cpumask_var_t rto_mask;
- struct cpupri cpupri;
+ cpumask_var_t rto_mask;
+ struct cpupri cpupri;
+
+ /*
+ * NULL-terminated list of performance domains intersecting with the
+ * CPUs of the rd. Protected by RCU.
+ */
+ struct perf_domain __rcu *pd;
};
-extern struct root_domain def_root_domain;
+extern void init_defrootdomain(void);
+extern int sched_init_domains(const struct cpumask *cpu_map);
+extern void rq_attach_root(struct rq *rq, struct root_domain *rd);
+extern void sched_get_rd(struct root_domain *rd);
+extern void sched_put_rd(struct root_domain *rd);
-#endif /* CONFIG_SMP */
+static inline int get_rd_overloaded(struct root_domain *rd)
+{
+ return READ_ONCE(rd->overloaded);
+}
+
+static inline void set_rd_overloaded(struct root_domain *rd, int status)
+{
+ if (get_rd_overloaded(rd) != status)
+ WRITE_ONCE(rd->overloaded, status);
+}
+
+#ifdef HAVE_RT_PUSH_IPI
+extern void rto_push_irq_work_func(struct irq_work *work);
+#endif
+
+#ifdef CONFIG_UCLAMP_TASK
+/*
+ * struct uclamp_bucket - Utilization clamp bucket
+ * @value: utilization clamp value for tasks on this clamp bucket
+ * @tasks: number of RUNNABLE tasks on this clamp bucket
+ *
+ * Keep track of how many tasks are RUNNABLE for a given utilization
+ * clamp value.
+ */
+struct uclamp_bucket {
+ unsigned long value : bits_per(SCHED_CAPACITY_SCALE);
+ unsigned long tasks : BITS_PER_LONG - bits_per(SCHED_CAPACITY_SCALE);
+};
+
+/*
+ * struct uclamp_rq - rq's utilization clamp
+ * @value: currently active clamp values for a rq
+ * @bucket: utilization clamp buckets affecting a rq
+ *
+ * Keep track of RUNNABLE tasks on a rq to aggregate their clamp values.
+ * A clamp value is affecting a rq when there is at least one task RUNNABLE
+ * (or actually running) with that value.
+ *
+ * There are up to UCLAMP_CNT possible different clamp values, currently there
+ * are only two: minimum utilization and maximum utilization.
+ *
+ * All utilization clamping values are MAX aggregated, since:
+ * - for util_min: we want to run the CPU at least at the max of the minimum
+ * utilization required by its currently RUNNABLE tasks.
+ * - for util_max: we want to allow the CPU to run up to the max of the
+ * maximum utilization allowed by its currently RUNNABLE tasks.
+ *
+ * Since on each system we expect only a limited number of different
+ * utilization clamp values (UCLAMP_BUCKETS), use a simple array to track
+ * the metrics required to compute all the per-rq utilization clamp values.
+ */
+struct uclamp_rq {
+ unsigned int value;
+ struct uclamp_bucket bucket[UCLAMP_BUCKETS];
+};
+
+DECLARE_STATIC_KEY_FALSE(sched_uclamp_used);
+#endif /* CONFIG_UCLAMP_TASK */
/*
* This is the main, per-CPU runqueue data structure.
@@ -555,41 +1118,47 @@ extern struct root_domain def_root_domain;
*/
struct rq {
/* runqueue lock: */
- raw_spinlock_t lock;
+ raw_spinlock_t __lock;
- /*
- * nr_running and cpu_load should be in the same cacheline because
- * remote CPUs use both these fields when doing load calculation.
- */
- unsigned int nr_running;
+ /* Per class runqueue modification mask; bits in class order. */
+ unsigned int queue_mask;
+ unsigned int nr_running;
#ifdef CONFIG_NUMA_BALANCING
- unsigned int nr_numa_running;
- unsigned int nr_preferred_running;
+ unsigned int nr_numa_running;
+ unsigned int nr_preferred_running;
+ unsigned int numa_migrate_on;
#endif
- #define CPU_LOAD_IDX_MAX 5
- unsigned long cpu_load[CPU_LOAD_IDX_MAX];
- unsigned long last_load_update_tick;
#ifdef CONFIG_NO_HZ_COMMON
- u64 nohz_stamp;
- unsigned long nohz_flags;
+ unsigned long last_blocked_load_update_tick;
+ unsigned int has_blocked_load;
+ call_single_data_t nohz_csd;
+ unsigned int nohz_tick_stopped;
+ atomic_t nohz_flags;
+#endif /* CONFIG_NO_HZ_COMMON */
+
+ unsigned int ttwu_pending;
+ u64 nr_switches;
+
+#ifdef CONFIG_UCLAMP_TASK
+ /* Utilization clamp values based on CPU's RUNNABLE tasks */
+ struct uclamp_rq uclamp[UCLAMP_CNT] ____cacheline_aligned;
+ unsigned int uclamp_flags;
+#define UCLAMP_FLAG_IDLE 0x01
#endif
-#ifdef CONFIG_NO_HZ_FULL
- unsigned long last_sched_tick;
+
+ struct cfs_rq cfs;
+ struct rt_rq rt;
+ struct dl_rq dl;
+#ifdef CONFIG_SCHED_CLASS_EXT
+ struct scx_rq scx;
#endif
- /* capture load from *all* tasks on this cpu: */
- struct load_weight load;
- unsigned long nr_load_updates;
- u64 nr_switches;
- struct cfs_rq cfs;
- struct rt_rq rt;
- struct dl_rq dl;
+ struct sched_dl_entity fair_server;
#ifdef CONFIG_FAIR_GROUP_SCHED
- /* list of leaf cfs_rq on this cpu: */
- struct list_head leaf_cfs_rq_list;
-
- struct sched_avg avg;
+ /* list of leaf cfs_rq on this CPU: */
+ struct list_head leaf_cfs_rq_list;
+ struct list_head *tmp_alone_branch;
#endif /* CONFIG_FAIR_GROUP_SCHED */
/*
@@ -598,106 +1167,196 @@ struct rq {
* one CPU and if it got migrated afterwards it may decrease
* it on another CPU. Always updated under the runqueue lock:
*/
- unsigned long nr_uninterruptible;
+ unsigned long nr_uninterruptible;
- struct task_struct *curr, *idle, *stop;
- unsigned long next_balance;
- struct mm_struct *prev_mm;
+#ifdef CONFIG_SCHED_PROXY_EXEC
+ struct task_struct __rcu *donor; /* Scheduling context */
+ struct task_struct __rcu *curr; /* Execution context */
+#else
+ union {
+ struct task_struct __rcu *donor; /* Scheduler context */
+ struct task_struct __rcu *curr; /* Execution context */
+ };
+#endif
+ struct sched_dl_entity *dl_server;
+ struct task_struct *idle;
+ struct task_struct *stop;
+ unsigned long next_balance;
+ struct mm_struct *prev_mm;
+
+ unsigned int clock_update_flags;
+ u64 clock;
+ /* Ensure that all clocks are in the same cache line */
+ u64 clock_task ____cacheline_aligned;
+ u64 clock_pelt;
+ unsigned long lost_idle_time;
+ u64 clock_pelt_idle;
+ u64 clock_idle;
+#ifndef CONFIG_64BIT
+ u64 clock_pelt_idle_copy;
+ u64 clock_idle_copy;
+#endif
- unsigned int clock_skip_update;
- u64 clock;
- u64 clock_task;
+ atomic_t nr_iowait;
- atomic_t nr_iowait;
+ u64 last_seen_need_resched_ns;
+ int ticks_without_resched;
-#ifdef CONFIG_SMP
- struct root_domain *rd;
- struct sched_domain *sd;
+#ifdef CONFIG_MEMBARRIER
+ int membarrier_state;
+#endif
+
+ struct root_domain *rd;
+ struct sched_domain __rcu *sd;
+
+ unsigned long cpu_capacity;
+
+ struct balance_callback *balance_callback;
- unsigned long cpu_capacity;
- unsigned long cpu_capacity_orig;
+ unsigned char nohz_idle_balance;
+ unsigned char idle_balance;
+
+ unsigned long misfit_task_load;
- unsigned char idle_balance;
/* For active balancing */
- int post_schedule;
- int active_balance;
- int push_cpu;
- struct cpu_stop_work active_balance_work;
- /* cpu of this runqueue: */
- int cpu;
- int online;
+ int active_balance;
+ int push_cpu;
+ struct cpu_stop_work active_balance_work;
+
+ /* CPU of this runqueue: */
+ int cpu;
+ int online;
struct list_head cfs_tasks;
- u64 rt_avg;
- u64 age_stamp;
- u64 idle_stamp;
- u64 avg_idle;
+ struct sched_avg avg_rt;
+ struct sched_avg avg_dl;
+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
+ struct sched_avg avg_irq;
+#endif
+#ifdef CONFIG_SCHED_HW_PRESSURE
+ struct sched_avg avg_hw;
+#endif
+ u64 idle_stamp;
+ u64 avg_idle;
/* This is used to determine avg_idle's max value */
- u64 max_idle_balance_cost;
+ u64 max_idle_balance_cost;
+
+#ifdef CONFIG_HOTPLUG_CPU
+ struct rcuwait hotplug_wait;
#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
- u64 prev_irq_time;
+ u64 prev_irq_time;
+ u64 psi_irq_time;
#endif
#ifdef CONFIG_PARAVIRT
- u64 prev_steal_time;
+ u64 prev_steal_time;
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
- u64 prev_steal_time_rq;
+ u64 prev_steal_time_rq;
#endif
/* calc_load related fields */
- unsigned long calc_load_update;
- long calc_load_active;
+ unsigned long calc_load_update;
+ long calc_load_active;
#ifdef CONFIG_SCHED_HRTICK
-#ifdef CONFIG_SMP
- int hrtick_csd_pending;
- struct call_single_data hrtick_csd;
-#endif
- struct hrtimer hrtick_timer;
+ call_single_data_t hrtick_csd;
+ struct hrtimer hrtick_timer;
+ ktime_t hrtick_time;
#endif
#ifdef CONFIG_SCHEDSTATS
/* latency stats */
- struct sched_info rq_sched_info;
- unsigned long long rq_cpu_time;
- /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
+ struct sched_info rq_sched_info;
+ unsigned long long rq_cpu_time;
/* sys_sched_yield() stats */
- unsigned int yld_count;
+ unsigned int yld_count;
/* schedule() stats */
- unsigned int sched_count;
- unsigned int sched_goidle;
+ unsigned int sched_count;
+ unsigned int sched_goidle;
/* try_to_wake_up() stats */
- unsigned int ttwu_count;
- unsigned int ttwu_local;
+ unsigned int ttwu_count;
+ unsigned int ttwu_local;
#endif
-#ifdef CONFIG_SMP
- struct llist_head wake_list;
+#ifdef CONFIG_CPU_IDLE
+ /* Must be inspected within a RCU lock section */
+ struct cpuidle_state *idle_state;
#endif
-#ifdef CONFIG_CPU_IDLE
- /* Must be inspected within a rcu lock section */
- struct cpuidle_state *idle_state;
+ unsigned int nr_pinned;
+ unsigned int push_busy;
+ struct cpu_stop_work push_work;
+
+#ifdef CONFIG_SCHED_CORE
+ /* per rq */
+ struct rq *core;
+ struct task_struct *core_pick;
+ struct sched_dl_entity *core_dl_server;
+ unsigned int core_enabled;
+ unsigned int core_sched_seq;
+ struct rb_root core_tree;
+
+ /* shared state -- careful with sched_core_cpu_deactivate() */
+ unsigned int core_task_seq;
+ unsigned int core_pick_seq;
+ unsigned long core_cookie;
+ unsigned int core_forceidle_count;
+ unsigned int core_forceidle_seq;
+ unsigned int core_forceidle_occupation;
+ u64 core_forceidle_start;
+#endif /* CONFIG_SCHED_CORE */
+
+ /* Scratch cpumask to be temporarily used under rq_lock */
+ cpumask_var_t scratch_mask;
+
+#ifdef CONFIG_CFS_BANDWIDTH
+ call_single_data_t cfsb_csd;
+ struct list_head cfsb_csd_list;
#endif
};
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+/* CPU runqueue to which this cfs_rq is attached */
+static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
+{
+ return cfs_rq->rq;
+}
+
+#else /* !CONFIG_FAIR_GROUP_SCHED: */
+
+static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
+{
+ return container_of(cfs_rq, struct rq, cfs);
+}
+#endif /* !CONFIG_FAIR_GROUP_SCHED */
+
static inline int cpu_of(struct rq *rq)
{
-#ifdef CONFIG_SMP
return rq->cpu;
-#else
- return 0;
-#endif
+}
+
+#define MDF_PUSH 0x01
+
+static inline bool is_migration_disabled(struct task_struct *p)
+{
+ return p->migration_disabled;
}
DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+DECLARE_PER_CPU(struct rnd_state, sched_rnd_state);
+
+static inline u32 sched_rng(void)
+{
+ return prandom_u32_state(this_cpu_ptr(&sched_rnd_state));
+}
#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
#define this_rq() this_cpu_ptr(&runqueues)
@@ -705,47 +1364,609 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
#define raw_rq() raw_cpu_ptr(&runqueues)
-static inline u64 __rq_clock_broken(struct rq *rq)
+#ifdef CONFIG_SCHED_PROXY_EXEC
+static inline void rq_set_donor(struct rq *rq, struct task_struct *t)
+{
+ rcu_assign_pointer(rq->donor, t);
+}
+#else
+static inline void rq_set_donor(struct rq *rq, struct task_struct *t)
+{
+ /* Do nothing */
+}
+#endif
+
+#ifdef CONFIG_SCHED_CORE
+static inline struct cpumask *sched_group_span(struct sched_group *sg);
+
+DECLARE_STATIC_KEY_FALSE(__sched_core_enabled);
+
+static inline bool sched_core_enabled(struct rq *rq)
+{
+ return static_branch_unlikely(&__sched_core_enabled) && rq->core_enabled;
+}
+
+static inline bool sched_core_disabled(void)
+{
+ return !static_branch_unlikely(&__sched_core_enabled);
+}
+
+/*
+ * Be careful with this function; not for general use. The return value isn't
+ * stable unless you actually hold a relevant rq->__lock.
+ */
+static inline raw_spinlock_t *rq_lockp(struct rq *rq)
+{
+ if (sched_core_enabled(rq))
+ return &rq->core->__lock;
+
+ return &rq->__lock;
+}
+
+static inline raw_spinlock_t *__rq_lockp(struct rq *rq)
+{
+ if (rq->core_enabled)
+ return &rq->core->__lock;
+
+ return &rq->__lock;
+}
+
+extern bool
+cfs_prio_less(const struct task_struct *a, const struct task_struct *b, bool fi);
+
+extern void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi);
+
+/*
+ * Helpers to check if the CPU's core cookie matches with the task's cookie
+ * when core scheduling is enabled.
+ * A special case is that the task's cookie always matches with CPU's core
+ * cookie if the CPU is in an idle core.
+ */
+static inline bool sched_cpu_cookie_match(struct rq *rq, struct task_struct *p)
+{
+ /* Ignore cookie match if core scheduler is not enabled on the CPU. */
+ if (!sched_core_enabled(rq))
+ return true;
+
+ return rq->core->core_cookie == p->core_cookie;
+}
+
+static inline bool sched_core_cookie_match(struct rq *rq, struct task_struct *p)
+{
+ bool idle_core = true;
+ int cpu;
+
+ /* Ignore cookie match if core scheduler is not enabled on the CPU. */
+ if (!sched_core_enabled(rq))
+ return true;
+
+ if (rq->core->core_cookie == p->core_cookie)
+ return true;
+
+ for_each_cpu(cpu, cpu_smt_mask(cpu_of(rq))) {
+ if (!available_idle_cpu(cpu)) {
+ idle_core = false;
+ break;
+ }
+ }
+
+ /*
+ * A CPU in an idle core is always the best choice for tasks with
+ * cookies.
+ */
+ return idle_core;
+}
+
+static inline bool sched_group_cookie_match(struct rq *rq,
+ struct task_struct *p,
+ struct sched_group *group)
+{
+ int cpu;
+
+ /* Ignore cookie match if core scheduler is not enabled on the CPU. */
+ if (!sched_core_enabled(rq))
+ return true;
+
+ for_each_cpu_and(cpu, sched_group_span(group), p->cpus_ptr) {
+ if (sched_core_cookie_match(cpu_rq(cpu), p))
+ return true;
+ }
+ return false;
+}
+
+static inline bool sched_core_enqueued(struct task_struct *p)
+{
+ return !RB_EMPTY_NODE(&p->core_node);
+}
+
+extern void sched_core_enqueue(struct rq *rq, struct task_struct *p);
+extern void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags);
+
+extern void sched_core_get(void);
+extern void sched_core_put(void);
+
+#else /* !CONFIG_SCHED_CORE: */
+
+static inline bool sched_core_enabled(struct rq *rq)
+{
+ return false;
+}
+
+static inline bool sched_core_disabled(void)
+{
+ return true;
+}
+
+static inline raw_spinlock_t *rq_lockp(struct rq *rq)
+{
+ return &rq->__lock;
+}
+
+static inline raw_spinlock_t *__rq_lockp(struct rq *rq)
+{
+ return &rq->__lock;
+}
+
+static inline bool sched_cpu_cookie_match(struct rq *rq, struct task_struct *p)
+{
+ return true;
+}
+
+static inline bool sched_core_cookie_match(struct rq *rq, struct task_struct *p)
+{
+ return true;
+}
+
+static inline bool sched_group_cookie_match(struct rq *rq,
+ struct task_struct *p,
+ struct sched_group *group)
+{
+ return true;
+}
+
+#endif /* !CONFIG_SCHED_CORE */
+
+#ifdef CONFIG_RT_GROUP_SCHED
+# ifdef CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED
+DECLARE_STATIC_KEY_FALSE(rt_group_sched);
+static inline bool rt_group_sched_enabled(void)
+{
+ return static_branch_unlikely(&rt_group_sched);
+}
+# else /* !CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED: */
+DECLARE_STATIC_KEY_TRUE(rt_group_sched);
+static inline bool rt_group_sched_enabled(void)
+{
+ return static_branch_likely(&rt_group_sched);
+}
+# endif /* !CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED */
+#else /* !CONFIG_RT_GROUP_SCHED: */
+# define rt_group_sched_enabled() false
+#endif /* !CONFIG_RT_GROUP_SCHED */
+
+static inline void lockdep_assert_rq_held(struct rq *rq)
+{
+ lockdep_assert_held(__rq_lockp(rq));
+}
+
+extern void raw_spin_rq_lock_nested(struct rq *rq, int subclass);
+extern bool raw_spin_rq_trylock(struct rq *rq);
+extern void raw_spin_rq_unlock(struct rq *rq);
+
+static inline void raw_spin_rq_lock(struct rq *rq)
+{
+ raw_spin_rq_lock_nested(rq, 0);
+}
+
+static inline void raw_spin_rq_lock_irq(struct rq *rq)
{
- return ACCESS_ONCE(rq->clock);
+ local_irq_disable();
+ raw_spin_rq_lock(rq);
+}
+
+static inline void raw_spin_rq_unlock_irq(struct rq *rq)
+{
+ raw_spin_rq_unlock(rq);
+ local_irq_enable();
+}
+
+static inline unsigned long _raw_spin_rq_lock_irqsave(struct rq *rq)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ raw_spin_rq_lock(rq);
+
+ return flags;
+}
+
+static inline void raw_spin_rq_unlock_irqrestore(struct rq *rq, unsigned long flags)
+{
+ raw_spin_rq_unlock(rq);
+ local_irq_restore(flags);
+}
+
+#define raw_spin_rq_lock_irqsave(rq, flags) \
+do { \
+ flags = _raw_spin_rq_lock_irqsave(rq); \
+} while (0)
+
+#ifdef CONFIG_SCHED_SMT
+extern void __update_idle_core(struct rq *rq);
+
+static inline void update_idle_core(struct rq *rq)
+{
+ if (static_branch_unlikely(&sched_smt_present))
+ __update_idle_core(rq);
+}
+
+#else /* !CONFIG_SCHED_SMT: */
+static inline void update_idle_core(struct rq *rq) { }
+#endif /* !CONFIG_SCHED_SMT */
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+static inline struct task_struct *task_of(struct sched_entity *se)
+{
+ WARN_ON_ONCE(!entity_is_task(se));
+ return container_of(se, struct task_struct, se);
+}
+
+static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
+{
+ return p->se.cfs_rq;
+}
+
+/* runqueue on which this entity is (to be) queued */
+static inline struct cfs_rq *cfs_rq_of(const struct sched_entity *se)
+{
+ return se->cfs_rq;
+}
+
+/* runqueue "owned" by this group */
+static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
+{
+ return grp->my_q;
+}
+
+#else /* !CONFIG_FAIR_GROUP_SCHED: */
+
+#define task_of(_se) container_of(_se, struct task_struct, se)
+
+static inline struct cfs_rq *task_cfs_rq(const struct task_struct *p)
+{
+ return &task_rq(p)->cfs;
+}
+
+static inline struct cfs_rq *cfs_rq_of(const struct sched_entity *se)
+{
+ const struct task_struct *p = task_of(se);
+ struct rq *rq = task_rq(p);
+
+ return &rq->cfs;
+}
+
+/* runqueue "owned" by this group */
+static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
+{
+ return NULL;
+}
+
+#endif /* !CONFIG_FAIR_GROUP_SCHED */
+
+extern void update_rq_clock(struct rq *rq);
+
+/*
+ * rq::clock_update_flags bits
+ *
+ * %RQCF_REQ_SKIP - will request skipping of clock update on the next
+ * call to __schedule(). This is an optimisation to avoid
+ * neighbouring rq clock updates.
+ *
+ * %RQCF_ACT_SKIP - is set from inside of __schedule() when skipping is
+ * in effect and calls to update_rq_clock() are being ignored.
+ *
+ * %RQCF_UPDATED - is a debug flag that indicates whether a call has been
+ * made to update_rq_clock() since the last time rq::lock was pinned.
+ *
+ * If inside of __schedule(), clock_update_flags will have been
+ * shifted left (a left shift is a cheap operation for the fast path
+ * to promote %RQCF_REQ_SKIP to %RQCF_ACT_SKIP), so you must use,
+ *
+ * if (rq-clock_update_flags >= RQCF_UPDATED)
+ *
+ * to check if %RQCF_UPDATED is set. It'll never be shifted more than
+ * one position though, because the next rq_unpin_lock() will shift it
+ * back.
+ */
+#define RQCF_REQ_SKIP 0x01
+#define RQCF_ACT_SKIP 0x02
+#define RQCF_UPDATED 0x04
+
+static inline void assert_clock_updated(struct rq *rq)
+{
+ /*
+ * The only reason for not seeing a clock update since the
+ * last rq_pin_lock() is if we're currently skipping updates.
+ */
+ WARN_ON_ONCE(rq->clock_update_flags < RQCF_ACT_SKIP);
}
static inline u64 rq_clock(struct rq *rq)
{
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
+ assert_clock_updated(rq);
+
return rq->clock;
}
static inline u64 rq_clock_task(struct rq *rq)
{
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
+ assert_clock_updated(rq);
+
return rq->clock_task;
}
-#define RQCF_REQ_SKIP 0x01
-#define RQCF_ACT_SKIP 0x02
+static inline void rq_clock_skip_update(struct rq *rq)
+{
+ lockdep_assert_rq_held(rq);
+ rq->clock_update_flags |= RQCF_REQ_SKIP;
+}
-static inline void rq_clock_skip_update(struct rq *rq, bool skip)
+/*
+ * See rt task throttling, which is the only time a skip
+ * request is canceled.
+ */
+static inline void rq_clock_cancel_skipupdate(struct rq *rq)
{
- lockdep_assert_held(&rq->lock);
- if (skip)
- rq->clock_skip_update |= RQCF_REQ_SKIP;
- else
- rq->clock_skip_update &= ~RQCF_REQ_SKIP;
+ lockdep_assert_rq_held(rq);
+ rq->clock_update_flags &= ~RQCF_REQ_SKIP;
+}
+
+/*
+ * During cpu offlining and rq wide unthrottling, we can trigger
+ * an update_rq_clock() for several cfs and rt runqueues (Typically
+ * when using list_for_each_entry_*)
+ * rq_clock_start_loop_update() can be called after updating the clock
+ * once and before iterating over the list to prevent multiple update.
+ * After the iterative traversal, we need to call rq_clock_stop_loop_update()
+ * to clear RQCF_ACT_SKIP of rq->clock_update_flags.
+ */
+static inline void rq_clock_start_loop_update(struct rq *rq)
+{
+ lockdep_assert_rq_held(rq);
+ WARN_ON_ONCE(rq->clock_update_flags & RQCF_ACT_SKIP);
+ rq->clock_update_flags |= RQCF_ACT_SKIP;
+}
+
+static inline void rq_clock_stop_loop_update(struct rq *rq)
+{
+ lockdep_assert_rq_held(rq);
+ rq->clock_update_flags &= ~RQCF_ACT_SKIP;
+}
+
+struct rq_flags {
+ unsigned long flags;
+ struct pin_cookie cookie;
+ /*
+ * A copy of (rq::clock_update_flags & RQCF_UPDATED) for the
+ * current pin context is stashed here in case it needs to be
+ * restored in rq_repin_lock().
+ */
+ unsigned int clock_update_flags;
+};
+
+extern struct balance_callback balance_push_callback;
+
+#ifdef CONFIG_SCHED_CLASS_EXT
+extern const struct sched_class ext_sched_class;
+
+DECLARE_STATIC_KEY_FALSE(__scx_enabled); /* SCX BPF scheduler loaded */
+DECLARE_STATIC_KEY_FALSE(__scx_switched_all); /* all fair class tasks on SCX */
+
+#define scx_enabled() static_branch_unlikely(&__scx_enabled)
+#define scx_switched_all() static_branch_unlikely(&__scx_switched_all)
+
+static inline void scx_rq_clock_update(struct rq *rq, u64 clock)
+{
+ if (!scx_enabled())
+ return;
+ WRITE_ONCE(rq->scx.clock, clock);
+ smp_store_release(&rq->scx.flags, rq->scx.flags | SCX_RQ_CLK_VALID);
+}
+
+static inline void scx_rq_clock_invalidate(struct rq *rq)
+{
+ if (!scx_enabled())
+ return;
+ WRITE_ONCE(rq->scx.flags, rq->scx.flags & ~SCX_RQ_CLK_VALID);
+}
+
+#else /* !CONFIG_SCHED_CLASS_EXT: */
+#define scx_enabled() false
+#define scx_switched_all() false
+
+static inline void scx_rq_clock_update(struct rq *rq, u64 clock) {}
+static inline void scx_rq_clock_invalidate(struct rq *rq) {}
+#endif /* !CONFIG_SCHED_CLASS_EXT */
+
+/*
+ * Lockdep annotation that avoids accidental unlocks; it's like a
+ * sticky/continuous lockdep_assert_held().
+ *
+ * This avoids code that has access to 'struct rq *rq' (basically everything in
+ * the scheduler) from accidentally unlocking the rq if they do not also have a
+ * copy of the (on-stack) 'struct rq_flags rf'.
+ *
+ * Also see Documentation/locking/lockdep-design.rst.
+ */
+static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
+{
+ rf->cookie = lockdep_pin_lock(__rq_lockp(rq));
+
+ rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
+ rf->clock_update_flags = 0;
+ WARN_ON_ONCE(rq->balance_callback && rq->balance_callback != &balance_push_callback);
+}
+
+static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf)
+{
+ if (rq->clock_update_flags > RQCF_ACT_SKIP)
+ rf->clock_update_flags = RQCF_UPDATED;
+
+ scx_rq_clock_invalidate(rq);
+ lockdep_unpin_lock(__rq_lockp(rq), rf->cookie);
+}
+
+static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf)
+{
+ lockdep_repin_lock(__rq_lockp(rq), rf->cookie);
+
+ /*
+ * Restore the value we stashed in @rf for this pin context.
+ */
+ rq->clock_update_flags |= rf->clock_update_flags;
+}
+
+extern
+struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
+ __acquires(rq->lock);
+
+extern
+struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
+ __acquires(p->pi_lock)
+ __acquires(rq->lock);
+
+static inline void
+__task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
+ __releases(rq->lock)
+{
+ rq_unpin_lock(rq, rf);
+ raw_spin_rq_unlock(rq);
+}
+
+static inline void
+task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
+ __releases(rq->lock)
+ __releases(p->pi_lock)
+{
+ __task_rq_unlock(rq, p, rf);
+ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
+}
+
+DEFINE_LOCK_GUARD_1(task_rq_lock, struct task_struct,
+ _T->rq = task_rq_lock(_T->lock, &_T->rf),
+ task_rq_unlock(_T->rq, _T->lock, &_T->rf),
+ struct rq *rq; struct rq_flags rf)
+
+DEFINE_LOCK_GUARD_1(__task_rq_lock, struct task_struct,
+ _T->rq = __task_rq_lock(_T->lock, &_T->rf),
+ __task_rq_unlock(_T->rq, _T->lock, &_T->rf),
+ struct rq *rq; struct rq_flags rf)
+
+static inline void rq_lock_irqsave(struct rq *rq, struct rq_flags *rf)
+ __acquires(rq->lock)
+{
+ raw_spin_rq_lock_irqsave(rq, rf->flags);
+ rq_pin_lock(rq, rf);
+}
+
+static inline void rq_lock_irq(struct rq *rq, struct rq_flags *rf)
+ __acquires(rq->lock)
+{
+ raw_spin_rq_lock_irq(rq);
+ rq_pin_lock(rq, rf);
+}
+
+static inline void rq_lock(struct rq *rq, struct rq_flags *rf)
+ __acquires(rq->lock)
+{
+ raw_spin_rq_lock(rq);
+ rq_pin_lock(rq, rf);
+}
+
+static inline void rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf)
+ __releases(rq->lock)
+{
+ rq_unpin_lock(rq, rf);
+ raw_spin_rq_unlock_irqrestore(rq, rf->flags);
+}
+
+static inline void rq_unlock_irq(struct rq *rq, struct rq_flags *rf)
+ __releases(rq->lock)
+{
+ rq_unpin_lock(rq, rf);
+ raw_spin_rq_unlock_irq(rq);
+}
+
+static inline void rq_unlock(struct rq *rq, struct rq_flags *rf)
+ __releases(rq->lock)
+{
+ rq_unpin_lock(rq, rf);
+ raw_spin_rq_unlock(rq);
+}
+
+DEFINE_LOCK_GUARD_1(rq_lock, struct rq,
+ rq_lock(_T->lock, &_T->rf),
+ rq_unlock(_T->lock, &_T->rf),
+ struct rq_flags rf)
+
+DEFINE_LOCK_GUARD_1(rq_lock_irq, struct rq,
+ rq_lock_irq(_T->lock, &_T->rf),
+ rq_unlock_irq(_T->lock, &_T->rf),
+ struct rq_flags rf)
+
+DEFINE_LOCK_GUARD_1(rq_lock_irqsave, struct rq,
+ rq_lock_irqsave(_T->lock, &_T->rf),
+ rq_unlock_irqrestore(_T->lock, &_T->rf),
+ struct rq_flags rf)
+
+static inline struct rq *this_rq_lock_irq(struct rq_flags *rf)
+ __acquires(rq->lock)
+{
+ struct rq *rq;
+
+ local_irq_disable();
+ rq = this_rq();
+ rq_lock(rq, rf);
+
+ return rq;
}
#ifdef CONFIG_NUMA
+
enum numa_topology_type {
NUMA_DIRECT,
NUMA_GLUELESS_MESH,
NUMA_BACKPLANE,
};
+
extern enum numa_topology_type sched_numa_topology_type;
extern int sched_max_numa_distance;
extern bool find_numa_distance(int distance);
-#endif
+extern void sched_init_numa(int offline_node);
+extern void sched_update_numa(int cpu, bool online);
+extern void sched_domains_numa_masks_set(unsigned int cpu);
+extern void sched_domains_numa_masks_clear(unsigned int cpu);
+extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu);
+
+#else /* !CONFIG_NUMA: */
+
+static inline void sched_init_numa(int offline_node) { }
+static inline void sched_update_numa(int cpu, bool online) { }
+static inline void sched_domains_numa_masks_set(unsigned int cpu) { }
+static inline void sched_domains_numa_masks_clear(unsigned int cpu) { }
+
+static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
+{
+ return nr_cpu_ids;
+}
+
+#endif /* !CONFIG_NUMA */
#ifdef CONFIG_NUMA_BALANCING
+
/* The regions in numa_faults array from task_struct */
enum numa_faults_stats {
NUMA_MEM = 0,
@@ -753,22 +1974,48 @@ enum numa_faults_stats {
NUMA_MEMBUF,
NUMA_CPUBUF
};
+
extern void sched_setnuma(struct task_struct *p, int node);
extern int migrate_task_to(struct task_struct *p, int cpu);
-extern int migrate_swap(struct task_struct *, struct task_struct *);
-#endif /* CONFIG_NUMA_BALANCING */
+extern int migrate_swap(struct task_struct *p, struct task_struct *t,
+ int cpu, int scpu);
+extern void init_numa_balancing(u64 clone_flags, struct task_struct *p);
-#ifdef CONFIG_SMP
+#else /* !CONFIG_NUMA_BALANCING: */
+
+static inline void
+init_numa_balancing(u64 clone_flags, struct task_struct *p)
+{
+}
+
+#endif /* !CONFIG_NUMA_BALANCING */
+
+static inline void
+queue_balance_callback(struct rq *rq,
+ struct balance_callback *head,
+ void (*func)(struct rq *rq))
+{
+ lockdep_assert_rq_held(rq);
+
+ /*
+ * Don't (re)queue an already queued item; nor queue anything when
+ * balance_push() is active, see the comment with
+ * balance_push_callback.
+ */
+ if (unlikely(head->next || rq->balance_callback == &balance_push_callback))
+ return;
-extern void sched_ttwu_pending(void);
+ head->func = func;
+ head->next = rq->balance_callback;
+ rq->balance_callback = head;
+}
#define rcu_dereference_check_sched_domain(p) \
- rcu_dereference_check((p), \
- lockdep_is_held(&sched_domains_mutex))
+ rcu_dereference_check((p), lockdep_is_held(&sched_domains_mutex))
/*
* The domain tree (rq->sd) is protected by RCU's quiescent state transition.
- * See detach_destroy_domains: synchronize_sched for details.
+ * See destroy_sched_domains: call_rcu for details.
*
* The domain tree of any CPU may only be accessed from within
* preempt-disabled sections.
@@ -777,25 +2024,39 @@ extern void sched_ttwu_pending(void);
for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \
__sd; __sd = __sd->parent)
-#define for_each_lower_domain(sd) for (; sd; sd = sd->child)
+/* A mask of all the SD flags that have the SDF_SHARED_CHILD metaflag */
+#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_SHARED_CHILD)) |
+static const unsigned int SD_SHARED_CHILD_MASK =
+#include <linux/sched/sd_flags.h>
+0;
+#undef SD_FLAG
/**
* highest_flag_domain - Return highest sched_domain containing flag.
- * @cpu: The cpu whose highest level of sched domain is to
+ * @cpu: The CPU whose highest level of sched domain is to
* be returned.
* @flag: The flag to check for the highest sched_domain
- * for the given cpu.
+ * for the given CPU.
*
- * Returns the highest sched_domain of a cpu which contains the given flag.
+ * Returns the highest sched_domain of a CPU which contains @flag. If @flag has
+ * the SDF_SHARED_CHILD metaflag, all the children domains also have @flag.
*/
static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
{
struct sched_domain *sd, *hsd = NULL;
for_each_domain(cpu, sd) {
- if (!(sd->flags & flag))
+ if (sd->flags & flag) {
+ hsd = sd;
+ continue;
+ }
+
+ /*
+ * Stop the search if @flag is known to be shared at lower
+ * levels. It will not be found further up.
+ */
+ if (flag & SD_SHARED_CHILD_MASK)
break;
- hsd = sd;
}
return hsd;
@@ -813,36 +2074,49 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
return sd;
}
-DECLARE_PER_CPU(struct sched_domain *, sd_llc);
+DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc);
DECLARE_PER_CPU(int, sd_llc_size);
DECLARE_PER_CPU(int, sd_llc_id);
-DECLARE_PER_CPU(struct sched_domain *, sd_numa);
-DECLARE_PER_CPU(struct sched_domain *, sd_busy);
-DECLARE_PER_CPU(struct sched_domain *, sd_asym);
+DECLARE_PER_CPU(int, sd_share_id);
+DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
+DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa);
+DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
+DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
+
+extern struct static_key_false sched_asym_cpucapacity;
+extern struct static_key_false sched_cluster_active;
+
+static __always_inline bool sched_asym_cpucap_active(void)
+{
+ return static_branch_unlikely(&sched_asym_cpucapacity);
+}
struct sched_group_capacity {
- atomic_t ref;
+ atomic_t ref;
/*
- * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity
+ * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity
* for a single CPU.
*/
- unsigned int capacity;
- unsigned long next_update;
- int imbalance; /* XXX unrelated to capacity but shared group state */
- /*
- * Number of busy cpus in this group.
- */
- atomic_t nr_busy_cpus;
+ unsigned long capacity;
+ unsigned long min_capacity; /* Min per-CPU capacity in group */
+ unsigned long max_capacity; /* Max per-CPU capacity in group */
+ unsigned long next_update;
+ int imbalance; /* XXX unrelated to capacity but shared group state */
- unsigned long cpumask[0]; /* iteration mask */
+ int id;
+
+ unsigned long cpumask[]; /* Balance mask */
};
struct sched_group {
- struct sched_group *next; /* Must be a circular list */
- atomic_t ref;
+ struct sched_group *next; /* Must be a circular list */
+ atomic_t ref;
- unsigned int group_weight;
+ unsigned int group_weight;
+ unsigned int cores;
struct sched_group_capacity *sgc;
+ int asym_prefer_cpu; /* CPU of highest priority in group */
+ int flags;
/*
* The CPUs this group covers.
@@ -851,42 +2125,35 @@ struct sched_group {
* by attaching extra space to the end of the structure,
* depending on how many CPUs the kernel has booted up with)
*/
- unsigned long cpumask[0];
+ unsigned long cpumask[];
};
-static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
+static inline struct cpumask *sched_group_span(struct sched_group *sg)
{
return to_cpumask(sg->cpumask);
}
/*
- * cpumask masking which cpus in the group are allowed to iterate up the domain
- * tree.
+ * See build_balance_mask().
*/
-static inline struct cpumask *sched_group_mask(struct sched_group *sg)
+static inline struct cpumask *group_balance_mask(struct sched_group *sg)
{
return to_cpumask(sg->sgc->cpumask);
}
-/**
- * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
- * @group: The group whose first cpu is to be returned.
- */
-static inline unsigned int group_first_cpu(struct sched_group *group)
-{
- return cpumask_first(sched_group_cpus(group));
-}
-
extern int group_balance_cpu(struct sched_group *sg);
-#else
-
-static inline void sched_ttwu_pending(void) { }
+extern void update_sched_domain_debugfs(void);
+extern void dirty_sched_domain_sysctl(int cpu);
-#endif /* CONFIG_SMP */
+extern int sched_update_scaling(void);
-#include "stats.h"
-#include "auto_group.h"
+static inline const struct cpumask *task_user_cpus(struct task_struct *p)
+{
+ if (!p->user_cpus_ptr)
+ return cpu_possible_mask; /* &init_task.cpus_mask */
+ return p->user_cpus_ptr;
+}
#ifdef CONFIG_CGROUP_SCHED
@@ -916,25 +2183,35 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
#endif
#ifdef CONFIG_FAIR_GROUP_SCHED
+ set_task_rq_fair(&p->se, p->se.cfs_rq, tg->cfs_rq[cpu]);
p->se.cfs_rq = tg->cfs_rq[cpu];
p->se.parent = tg->se[cpu];
+ p->se.depth = tg->se[cpu] ? tg->se[cpu]->depth + 1 : 0;
#endif
#ifdef CONFIG_RT_GROUP_SCHED
+ /*
+ * p->rt.rt_rq is NULL initially and it is easier to assign
+ * root_task_group's rt_rq than switching in rt_rq_of_se()
+ * Clobbers tg(!)
+ */
+ if (!rt_group_sched_enabled())
+ tg = &root_task_group;
p->rt.rt_rq = tg->rt_rq[cpu];
p->rt.parent = tg->rt_se[cpu];
-#endif
+#endif /* CONFIG_RT_GROUP_SCHED */
}
-#else /* CONFIG_CGROUP_SCHED */
+#else /* !CONFIG_CGROUP_SCHED: */
static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
+
static inline struct task_group *task_group(struct task_struct *p)
{
return NULL;
}
-#endif /* CONFIG_CGROUP_SCHED */
+#endif /* !CONFIG_CGROUP_SCHED */
static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
{
@@ -942,26 +2219,19 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
#ifdef CONFIG_SMP
/*
* After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
- * successfuly executed on another CPU. We must ensure that updates of
+ * successfully executed on another CPU. We must ensure that updates of
* per-task data have been completed by this moment.
*/
smp_wmb();
- task_thread_info(p)->cpu = cpu;
+ WRITE_ONCE(task_thread_info(p)->cpu, cpu);
p->wake_cpu = cpu;
-#endif
+ rseq_sched_set_ids_changed(p);
+#endif /* CONFIG_SMP */
}
/*
- * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
+ * Tunables:
*/
-#ifdef CONFIG_SCHED_DEBUG
-# include <linux/static_key.h>
-# define const_debug __read_mostly
-#else
-# define const_debug const
-#endif
-
-extern const_debug unsigned int sysctl_sched_features;
#define SCHED_FEAT(name, enabled) \
__SCHED_FEAT_##name ,
@@ -973,7 +2243,14 @@ enum {
#undef SCHED_FEAT
-#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
+/*
+ * To support run-time toggling of sched features, all the translation units
+ * (but core.c) reference the sysctl_sched_features defined in core.c.
+ */
+extern __read_mostly unsigned int sysctl_sched_features;
+
+#ifdef CONFIG_JUMP_LABEL
+
#define SCHED_FEAT(name, enabled) \
static __always_inline bool static_branch_##name(struct static_key *key) \
{ \
@@ -981,26 +2258,19 @@ static __always_inline bool static_branch_##name(struct static_key *key) \
}
#include "features.h"
-
#undef SCHED_FEAT
extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x]))
-#else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */
+
+#else /* !CONFIG_JUMP_LABEL: */
+
#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
-#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
-#ifdef CONFIG_NUMA_BALANCING
-#define sched_feat_numa(x) sched_feat(x)
-#ifdef CONFIG_SCHED_DEBUG
-#define numabalancing_enabled sched_feat_numa(NUMA)
-#else
-extern bool numabalancing_enabled;
-#endif /* CONFIG_SCHED_DEBUG */
-#else
-#define sched_feat_numa(x) (0)
-#define numabalancing_enabled (0)
-#endif /* CONFIG_NUMA_BALANCING */
+#endif /* !CONFIG_JUMP_LABEL */
+
+extern struct static_key_false sched_numa_balancing;
+extern struct static_key_false sched_schedstats;
static inline u64 global_rt_period(void)
{
@@ -1015,83 +2285,61 @@ static inline u64 global_rt_runtime(void)
return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
}
+/*
+ * Is p the current execution context?
+ */
static inline int task_current(struct rq *rq, struct task_struct *p)
{
return rq->curr == p;
}
-static inline int task_running(struct rq *rq, struct task_struct *p)
+/*
+ * Is p the current scheduling context?
+ *
+ * Note that it might be the current execution context at the same time if
+ * rq->curr == rq->donor == p.
+ */
+static inline int task_current_donor(struct rq *rq, struct task_struct *p)
{
-#ifdef CONFIG_SMP
- return p->on_cpu;
-#else
- return task_current(rq, p);
-#endif
+ return rq->donor == p;
}
-static inline int task_on_rq_queued(struct task_struct *p)
+static inline bool task_is_blocked(struct task_struct *p)
{
- return p->on_rq == TASK_ON_RQ_QUEUED;
+ if (!sched_proxy_exec())
+ return false;
+
+ return !!p->blocked_on;
}
-static inline int task_on_rq_migrating(struct task_struct *p)
+static inline int task_on_cpu(struct rq *rq, struct task_struct *p)
{
- return p->on_rq == TASK_ON_RQ_MIGRATING;
+ return p->on_cpu;
}
-#ifndef prepare_arch_switch
-# define prepare_arch_switch(next) do { } while (0)
-#endif
-#ifndef finish_arch_switch
-# define finish_arch_switch(prev) do { } while (0)
-#endif
-#ifndef finish_arch_post_lock_switch
-# define finish_arch_post_lock_switch() do { } while (0)
-#endif
-
-static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
+static inline int task_on_rq_queued(struct task_struct *p)
{
-#ifdef CONFIG_SMP
- /*
- * We can optimise this out completely for !SMP, because the
- * SMP rebalancing from interrupt is the only thing that cares
- * here.
- */
- next->on_cpu = 1;
-#endif
+ return READ_ONCE(p->on_rq) == TASK_ON_RQ_QUEUED;
}
-static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
+static inline int task_on_rq_migrating(struct task_struct *p)
{
-#ifdef CONFIG_SMP
- /*
- * After ->on_cpu is cleared, the task can be moved to a different CPU.
- * We must ensure this doesn't happen until the switch is completely
- * finished.
- */
- smp_wmb();
- prev->on_cpu = 0;
-#endif
-#ifdef CONFIG_DEBUG_SPINLOCK
- /* this is a valid case when another task releases the spinlock */
- rq->lock.owner = current;
-#endif
- /*
- * If we are tracking spinlock dependencies then we have to
- * fix up the runqueue lock - which gets 'carried over' from
- * prev into current:
- */
- spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
-
- raw_spin_unlock_irq(&rq->lock);
+ return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING;
}
-/*
- * wake flags
- */
-#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */
-#define WF_FORK 0x02 /* child wakeup after fork */
-#define WF_MIGRATED 0x4 /* internal use, task got migrated */
+/* Wake flags. The first three directly map to some SD flag value */
+#define WF_EXEC 0x02 /* Wakeup after exec; maps to SD_BALANCE_EXEC */
+#define WF_FORK 0x04 /* Wakeup after fork; maps to SD_BALANCE_FORK */
+#define WF_TTWU 0x08 /* Wakeup; maps to SD_BALANCE_WAKE */
+
+#define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */
+#define WF_MIGRATED 0x20 /* Internal use, task got migrated */
+#define WF_CURRENT_CPU 0x40 /* Prefer to move the wakee to the current CPU. */
+#define WF_RQ_SELECTED 0x80 /* ->select_task_rq() was called */
+
+static_assert(WF_EXEC == SD_BALANCE_EXEC);
+static_assert(WF_FORK == SD_BALANCE_FORK);
+static_assert(WF_TTWU == SD_BALANCE_WAKE);
/*
* To aid in avoiding the subversion of "niceness" due to uneven distribution
@@ -1102,133 +2350,319 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
* slice expiry etc.
*/
-#define WEIGHT_IDLEPRIO 3
-#define WMULT_IDLEPRIO 1431655765
+#define WEIGHT_IDLEPRIO 3
+#define WMULT_IDLEPRIO 1431655765
-/*
- * Nice levels are multiplicative, with a gentle 10% change for every
- * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
- * nice 1, it will get ~10% less CPU time than another CPU-bound task
- * that remained on nice 0.
- *
- * The "10% effect" is relative and cumulative: from _any_ nice level,
- * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
- * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
- * If a task goes up by ~10% and another task goes down by ~10% then
- * the relative distance between them is ~25%.)
- */
-static const int prio_to_weight[40] = {
- /* -20 */ 88761, 71755, 56483, 46273, 36291,
- /* -15 */ 29154, 23254, 18705, 14949, 11916,
- /* -10 */ 9548, 7620, 6100, 4904, 3906,
- /* -5 */ 3121, 2501, 1991, 1586, 1277,
- /* 0 */ 1024, 820, 655, 526, 423,
- /* 5 */ 335, 272, 215, 172, 137,
- /* 10 */ 110, 87, 70, 56, 45,
- /* 15 */ 36, 29, 23, 18, 15,
-};
+extern const int sched_prio_to_weight[40];
+extern const u32 sched_prio_to_wmult[40];
/*
- * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
+ * {de,en}queue flags:
+ *
+ * SLEEP/WAKEUP - task is no-longer/just-became runnable
+ *
+ * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks
+ * are in a known state which allows modification. Such pairs
+ * should preserve as much state as possible.
+ *
+ * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location
+ * in the runqueue.
+ *
+ * NOCLOCK - skip the update_rq_clock() (avoids double updates)
+ *
+ * MIGRATION - p->on_rq == TASK_ON_RQ_MIGRATING (used for DEADLINE)
+ *
+ * DELAYED - de/re-queue a sched_delayed task
*
- * In cases where the weight does not change often, we can use the
- * precalculated inverse to speed up arithmetics by turning divisions
- * into multiplications:
+ * CLASS - going to update p->sched_class; makes sched_change call the
+ * various switch methods.
+ *
+ * ENQUEUE_HEAD - place at front of runqueue (tail if not specified)
+ * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline)
+ * ENQUEUE_MIGRATED - the task was migrated during wakeup
+ * ENQUEUE_RQ_SELECTED - ->select_task_rq() was called
+ *
+ * XXX SAVE/RESTORE in combination with CLASS doesn't really make sense, but
+ * SCHED_DEADLINE seems to rely on this for now.
*/
-static const u32 prio_to_wmult[40] = {
- /* -20 */ 48388, 59856, 76040, 92818, 118348,
- /* -15 */ 147320, 184698, 229616, 287308, 360437,
- /* -10 */ 449829, 563644, 704093, 875809, 1099582,
- /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
- /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
- /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
- /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
- /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
-};
-#define ENQUEUE_WAKEUP 1
-#define ENQUEUE_HEAD 2
-#ifdef CONFIG_SMP
-#define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */
-#else
-#define ENQUEUE_WAKING 0
-#endif
-#define ENQUEUE_REPLENISH 8
+#define DEQUEUE_SLEEP 0x0001 /* Matches ENQUEUE_WAKEUP */
+#define DEQUEUE_SAVE 0x0002 /* Matches ENQUEUE_RESTORE */
+#define DEQUEUE_MOVE 0x0004 /* Matches ENQUEUE_MOVE */
+#define DEQUEUE_NOCLOCK 0x0008 /* Matches ENQUEUE_NOCLOCK */
+
+#define DEQUEUE_MIGRATING 0x0010 /* Matches ENQUEUE_MIGRATING */
+#define DEQUEUE_DELAYED 0x0020 /* Matches ENQUEUE_DELAYED */
+#define DEQUEUE_CLASS 0x0040 /* Matches ENQUEUE_CLASS */
+
+#define DEQUEUE_SPECIAL 0x00010000
+#define DEQUEUE_THROTTLE 0x00020000
+
+#define ENQUEUE_WAKEUP 0x0001
+#define ENQUEUE_RESTORE 0x0002
+#define ENQUEUE_MOVE 0x0004
+#define ENQUEUE_NOCLOCK 0x0008
+
+#define ENQUEUE_MIGRATING 0x0010
+#define ENQUEUE_DELAYED 0x0020
+#define ENQUEUE_CLASS 0x0040
-#define DEQUEUE_SLEEP 1
+#define ENQUEUE_HEAD 0x00010000
+#define ENQUEUE_REPLENISH 0x00020000
+#define ENQUEUE_MIGRATED 0x00040000
+#define ENQUEUE_INITIAL 0x00080000
+#define ENQUEUE_RQ_SELECTED 0x00100000
#define RETRY_TASK ((void *)-1UL)
+struct affinity_context {
+ const struct cpumask *new_mask;
+ struct cpumask *user_mask;
+ unsigned int flags;
+};
+
+extern s64 update_curr_common(struct rq *rq);
+
struct sched_class {
- const struct sched_class *next;
+#ifdef CONFIG_UCLAMP_TASK
+ int uclamp_enabled;
+#endif
+ /*
+ * idle: 0
+ * ext: 1
+ * fair: 2
+ * rt: 4
+ * dl: 8
+ * stop: 16
+ */
+ unsigned int queue_mask;
+
+ /*
+ * move_queued_task/activate_task/enqueue_task: rq->lock
+ * ttwu_do_activate/activate_task/enqueue_task: rq->lock
+ * wake_up_new_task/activate_task/enqueue_task: task_rq_lock
+ * ttwu_runnable/enqueue_task: task_rq_lock
+ * proxy_task_current: rq->lock
+ * sched_change_end
+ */
void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
- void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
- void (*yield_task) (struct rq *rq);
- bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt);
+ /*
+ * move_queued_task/deactivate_task/dequeue_task: rq->lock
+ * __schedule/block_task/dequeue_task: rq->lock
+ * proxy_task_current: rq->lock
+ * wait_task_inactive: task_rq_lock
+ * sched_change_begin
+ */
+ bool (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
+
+ /*
+ * do_sched_yield: rq->lock
+ */
+ void (*yield_task) (struct rq *rq);
+ /*
+ * yield_to: rq->lock (double)
+ */
+ bool (*yield_to_task)(struct rq *rq, struct task_struct *p);
- void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
+ /*
+ * move_queued_task: rq->lock
+ * __migrate_swap_task: rq->lock
+ * ttwu_do_activate: rq->lock
+ * ttwu_runnable: task_rq_lock
+ * wake_up_new_task: task_rq_lock
+ */
+ void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags);
+
+ /*
+ * schedule/pick_next_task/prev_balance: rq->lock
+ */
+ int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
/*
- * It is the responsibility of the pick_next_task() method that will
- * return the next task to call put_prev_task() on the @prev task or
- * something equivalent.
+ * schedule/pick_next_task: rq->lock
+ */
+ struct task_struct *(*pick_task)(struct rq *rq, struct rq_flags *rf);
+ /*
+ * Optional! When implemented pick_next_task() should be equivalent to:
*
- * May return RETRY_TASK when it finds a higher prio class has runnable
- * tasks.
+ * next = pick_task();
+ * if (next) {
+ * put_prev_task(prev);
+ * set_next_task_first(next);
+ * }
*/
- struct task_struct * (*pick_next_task) (struct rq *rq,
- struct task_struct *prev);
- void (*put_prev_task) (struct rq *rq, struct task_struct *p);
+ struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev,
+ struct rq_flags *rf);
-#ifdef CONFIG_SMP
- int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
- void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
+ /*
+ * sched_change:
+ * __schedule: rq->lock
+ */
+ void (*put_prev_task)(struct rq *rq, struct task_struct *p, struct task_struct *next);
+ void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first);
+
+ /*
+ * select_task_rq: p->pi_lock
+ * sched_exec: p->pi_lock
+ */
+ int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags);
+
+ /*
+ * set_task_cpu: p->pi_lock || rq->lock (ttwu like)
+ */
+ void (*migrate_task_rq)(struct task_struct *p, int new_cpu);
- void (*post_schedule) (struct rq *this_rq);
- void (*task_waking) (struct task_struct *task);
- void (*task_woken) (struct rq *this_rq, struct task_struct *task);
+ /*
+ * ttwu_do_activate: rq->lock
+ * wake_up_new_task: task_rq_lock
+ */
+ void (*task_woken)(struct rq *this_rq, struct task_struct *task);
- void (*set_cpus_allowed)(struct task_struct *p,
- const struct cpumask *newmask);
+ /*
+ * do_set_cpus_allowed: task_rq_lock + sched_change
+ */
+ void (*set_cpus_allowed)(struct task_struct *p, struct affinity_context *ctx);
+ /*
+ * sched_set_rq_{on,off}line: rq->lock
+ */
void (*rq_online)(struct rq *rq);
void (*rq_offline)(struct rq *rq);
-#endif
- void (*set_curr_task) (struct rq *rq);
- void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
- void (*task_fork) (struct task_struct *p);
- void (*task_dead) (struct task_struct *p);
+ /*
+ * push_cpu_stop: p->pi_lock && rq->lock
+ */
+ struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq);
/*
- * The switched_from() call is allowed to drop rq->lock, therefore we
- * cannot assume the switched_from/switched_to pair is serliazed by
- * rq->lock. They are however serialized by p->pi_lock.
+ * hrtick: rq->lock
+ * sched_tick: rq->lock
+ * sched_tick_remote: rq->lock
*/
+ void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);
+ /*
+ * sched_cgroup_fork: p->pi_lock
+ */
+ void (*task_fork)(struct task_struct *p);
+ /*
+ * finish_task_switch: no locks
+ */
+ void (*task_dead)(struct task_struct *p);
+
+ /*
+ * sched_change
+ */
+ void (*switching_from)(struct rq *this_rq, struct task_struct *task);
void (*switched_from) (struct rq *this_rq, struct task_struct *task);
- void (*switched_to) (struct rq *this_rq, struct task_struct *task);
+ void (*switching_to) (struct rq *this_rq, struct task_struct *task);
+ void (*switched_to) (struct rq *this_rq, struct task_struct *task);
+ u64 (*get_prio) (struct rq *this_rq, struct task_struct *task);
void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
- int oldprio);
+ u64 oldprio);
- unsigned int (*get_rr_interval) (struct rq *rq,
- struct task_struct *task);
+ /*
+ * set_load_weight: task_rq_lock + sched_change
+ * __setscheduler_parms: task_rq_lock + sched_change
+ */
+ void (*reweight_task)(struct rq *this_rq, struct task_struct *task,
+ const struct load_weight *lw);
- void (*update_curr) (struct rq *rq);
+ /*
+ * sched_rr_get_interval: task_rq_lock
+ */
+ unsigned int (*get_rr_interval)(struct rq *rq,
+ struct task_struct *task);
+
+ /*
+ * task_sched_runtime: task_rq_lock
+ */
+ void (*update_curr)(struct rq *rq);
#ifdef CONFIG_FAIR_GROUP_SCHED
- void (*task_move_group) (struct task_struct *p, int on_rq);
+ /*
+ * sched_change_group: task_rq_lock + sched_change
+ */
+ void (*task_change_group)(struct task_struct *p);
+#endif
+
+#ifdef CONFIG_SCHED_CORE
+ /*
+ * pick_next_task: rq->lock
+ * try_steal_cookie: rq->lock (double)
+ */
+ int (*task_is_throttled)(struct task_struct *p, int cpu);
#endif
};
+/*
+ * Does not nest; only used around sched_class::pick_task() rq-lock-breaks.
+ */
+static inline void rq_modified_clear(struct rq *rq)
+{
+ rq->queue_mask = 0;
+}
+
+static inline bool rq_modified_above(struct rq *rq, const struct sched_class * class)
+{
+ unsigned int mask = class->queue_mask;
+ return rq->queue_mask & ~((mask << 1) - 1);
+}
+
static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
{
- prev->sched_class->put_prev_task(rq, prev);
+ WARN_ON_ONCE(rq->donor != prev);
+ prev->sched_class->put_prev_task(rq, prev, NULL);
}
-#define sched_class_highest (&stop_sched_class)
-#define for_each_class(class) \
- for (class = sched_class_highest; class; class = class->next)
+static inline void set_next_task(struct rq *rq, struct task_struct *next)
+{
+ next->sched_class->set_next_task(rq, next, false);
+}
+
+static inline void
+__put_prev_set_next_dl_server(struct rq *rq,
+ struct task_struct *prev,
+ struct task_struct *next)
+{
+ prev->dl_server = NULL;
+ next->dl_server = rq->dl_server;
+ rq->dl_server = NULL;
+}
+
+static inline void put_prev_set_next_task(struct rq *rq,
+ struct task_struct *prev,
+ struct task_struct *next)
+{
+ WARN_ON_ONCE(rq->donor != prev);
+
+ __put_prev_set_next_dl_server(rq, prev, next);
+
+ if (next == prev)
+ return;
+
+ prev->sched_class->put_prev_task(rq, prev, next);
+ next->sched_class->set_next_task(rq, next, true);
+}
+
+/*
+ * Helper to define a sched_class instance; each one is placed in a separate
+ * section which is ordered by the linker script:
+ *
+ * include/asm-generic/vmlinux.lds.h
+ *
+ * *CAREFUL* they are laid out in *REVERSE* order!!!
+ *
+ * Also enforce alignment on the instance, not the type, to guarantee layout.
+ */
+#define DEFINE_SCHED_CLASS(name) \
+const struct sched_class name##_sched_class \
+ __aligned(__alignof__(struct sched_class)) \
+ __section("__" #name "_sched_class")
+
+/* Defined in include/asm-generic/vmlinux.lds.h */
+extern struct sched_class __sched_class_highest[];
+extern struct sched_class __sched_class_lowest[];
extern const struct sched_class stop_sched_class;
extern const struct sched_class dl_sched_class;
@@ -1236,24 +2670,118 @@ extern const struct sched_class rt_sched_class;
extern const struct sched_class fair_sched_class;
extern const struct sched_class idle_sched_class;
+/*
+ * Iterate only active classes. SCX can take over all fair tasks or be
+ * completely disabled. If the former, skip fair. If the latter, skip SCX.
+ */
+static inline const struct sched_class *next_active_class(const struct sched_class *class)
+{
+ class++;
+#ifdef CONFIG_SCHED_CLASS_EXT
+ if (scx_switched_all() && class == &fair_sched_class)
+ class++;
+ if (!scx_enabled() && class == &ext_sched_class)
+ class++;
+#endif
+ return class;
+}
+
+#define for_class_range(class, _from, _to) \
+ for (class = (_from); class < (_to); class++)
-#ifdef CONFIG_SMP
+#define for_each_class(class) \
+ for_class_range(class, __sched_class_highest, __sched_class_lowest)
+
+#define for_active_class_range(class, _from, _to) \
+ for (class = (_from); class != (_to); class = next_active_class(class))
+
+#define for_each_active_class(class) \
+ for_active_class_range(class, __sched_class_highest, __sched_class_lowest)
+
+#define sched_class_above(_a, _b) ((_a) < (_b))
+
+static inline bool sched_stop_runnable(struct rq *rq)
+{
+ return rq->stop && task_on_rq_queued(rq->stop);
+}
+
+static inline bool sched_dl_runnable(struct rq *rq)
+{
+ return rq->dl.dl_nr_running > 0;
+}
+
+static inline bool sched_rt_runnable(struct rq *rq)
+{
+ return rq->rt.rt_queued > 0;
+}
+
+static inline bool sched_fair_runnable(struct rq *rq)
+{
+ return rq->cfs.nr_queued > 0;
+}
+
+extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev,
+ struct rq_flags *rf);
+extern struct task_struct *pick_task_idle(struct rq *rq, struct rq_flags *rf);
+
+#define SCA_CHECK 0x01
+#define SCA_MIGRATE_DISABLE 0x02
+#define SCA_MIGRATE_ENABLE 0x04
+#define SCA_USER 0x08
extern void update_group_capacity(struct sched_domain *sd, int cpu);
-extern void trigger_load_balance(struct rq *rq);
+extern void sched_balance_trigger(struct rq *rq);
-extern void idle_enter_fair(struct rq *this_rq);
-extern void idle_exit_fair(struct rq *this_rq);
+extern int __set_cpus_allowed_ptr(struct task_struct *p, struct affinity_context *ctx);
+extern void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx);
-#else
+static inline bool task_allowed_on_cpu(struct task_struct *p, int cpu)
+{
+ /* When not in the task's cpumask, no point in looking further. */
+ if (!cpumask_test_cpu(cpu, p->cpus_ptr))
+ return false;
-static inline void idle_enter_fair(struct rq *rq) { }
-static inline void idle_exit_fair(struct rq *rq) { }
+ /* Can @cpu run a user thread? */
+ if (!(p->flags & PF_KTHREAD) && !task_cpu_possible(cpu, p))
+ return false;
-#endif
+ return true;
+}
+
+static inline cpumask_t *alloc_user_cpus_ptr(int node)
+{
+ /*
+ * See set_cpus_allowed_force() above for the rcu_head usage.
+ */
+ int size = max_t(int, cpumask_size(), sizeof(struct rcu_head));
+
+ return kmalloc_node(size, GFP_KERNEL, node);
+}
+
+static inline struct task_struct *get_push_task(struct rq *rq)
+{
+ struct task_struct *p = rq->donor;
+
+ lockdep_assert_rq_held(rq);
+
+ if (rq->push_busy)
+ return NULL;
+
+ if (p->nr_cpus_allowed == 1)
+ return NULL;
+
+ if (p->migration_disabled)
+ return NULL;
+
+ rq->push_busy = true;
+ return get_task_struct(p);
+}
+
+extern int push_cpu_stop(void *arg);
#ifdef CONFIG_CPU_IDLE
+
static inline void idle_set_state(struct rq *rq,
struct cpuidle_state *idle_state)
{
@@ -1262,10 +2790,13 @@ static inline void idle_set_state(struct rq *rq,
static inline struct cpuidle_state *idle_get_state(struct rq *rq)
{
- WARN_ON(!rcu_read_lock_held());
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
return rq->idle_state;
}
-#else
+
+#else /* !CONFIG_CPU_IDLE: */
+
static inline void idle_set_state(struct rq *rq,
struct cpuidle_state *idle_state)
{
@@ -1275,7 +2806,11 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq)
{
return NULL;
}
-#endif
+
+#endif /* !CONFIG_CPU_IDLE */
+
+extern void schedule_idle(void);
+asmlinkage void schedule_user(void);
extern void sysrq_sched_debug_show(void);
extern void sched_init_granularity(void);
@@ -1284,79 +2819,150 @@ extern void update_max_interval(void);
extern void init_sched_dl_class(void);
extern void init_sched_rt_class(void);
extern void init_sched_fair_class(void);
-extern void init_sched_dl_class(void);
extern void resched_curr(struct rq *rq);
+extern void resched_curr_lazy(struct rq *rq);
extern void resched_cpu(int cpu);
-extern struct rt_bandwidth def_rt_bandwidth;
extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
+extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
+
+extern void init_dl_entity(struct sched_dl_entity *dl_se);
-extern struct dl_bandwidth def_dl_bandwidth;
-extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
-extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
+extern void init_cfs_throttle_work(struct task_struct *p);
-unsigned long to_ratio(u64 period, u64 runtime);
+#define BW_SHIFT 20
+#define BW_UNIT (1 << BW_SHIFT)
+#define RATIO_SHIFT 8
+#define MAX_BW_BITS (64 - BW_SHIFT)
+#define MAX_BW ((1ULL << MAX_BW_BITS) - 1)
-extern void update_idle_cpu_load(struct rq *this_rq);
+extern unsigned long to_ratio(u64 period, u64 runtime);
-extern void init_task_runnable_average(struct task_struct *p);
+extern void init_entity_runnable_average(struct sched_entity *se);
+extern void post_init_entity_util_avg(struct task_struct *p);
+
+#ifdef CONFIG_NO_HZ_FULL
+extern bool sched_can_stop_tick(struct rq *rq);
+extern int __init sched_tick_offload_init(void);
+
+/*
+ * Tick may be needed by tasks in the runqueue depending on their policy and
+ * requirements. If tick is needed, lets send the target an IPI to kick it out of
+ * nohz mode if necessary.
+ */
+static inline void sched_update_tick_dependency(struct rq *rq)
+{
+ int cpu = cpu_of(rq);
+
+ if (!tick_nohz_full_cpu(cpu))
+ return;
+
+ if (sched_can_stop_tick(rq))
+ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED);
+ else
+ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
+}
+#else /* !CONFIG_NO_HZ_FULL: */
+static inline int sched_tick_offload_init(void) { return 0; }
+static inline void sched_update_tick_dependency(struct rq *rq) { }
+#endif /* !CONFIG_NO_HZ_FULL */
static inline void add_nr_running(struct rq *rq, unsigned count)
{
unsigned prev_nr = rq->nr_running;
rq->nr_running = prev_nr + count;
+ if (trace_sched_update_nr_running_tp_enabled()) {
+ call_trace_sched_update_nr_running(rq, count);
+ }
- if (prev_nr < 2 && rq->nr_running >= 2) {
-#ifdef CONFIG_SMP
- if (!rq->rd->overload)
- rq->rd->overload = true;
-#endif
+ if (prev_nr < 2 && rq->nr_running >= 2)
+ set_rd_overloaded(rq->rd, 1);
-#ifdef CONFIG_NO_HZ_FULL
- if (tick_nohz_full_cpu(rq->cpu)) {
- /*
- * Tick is needed if more than one task runs on a CPU.
- * Send the target an IPI to kick it out of nohz mode.
- *
- * We assume that IPI implies full memory barrier and the
- * new value of rq->nr_running is visible on reception
- * from the target.
- */
- tick_nohz_full_kick_cpu(rq->cpu);
- }
-#endif
- }
+ sched_update_tick_dependency(rq);
}
static inline void sub_nr_running(struct rq *rq, unsigned count)
{
rq->nr_running -= count;
+ if (trace_sched_update_nr_running_tp_enabled()) {
+ call_trace_sched_update_nr_running(rq, -count);
+ }
+
+ /* Check if we still need preemption */
+ sched_update_tick_dependency(rq);
}
-static inline void rq_last_tick_reset(struct rq *rq)
+static inline void __block_task(struct rq *rq, struct task_struct *p)
{
-#ifdef CONFIG_NO_HZ_FULL
- rq->last_sched_tick = jiffies;
-#endif
-}
+ if (p->sched_contributes_to_load)
+ rq->nr_uninterruptible++;
-extern void update_rq_clock(struct rq *rq);
+ if (p->in_iowait) {
+ atomic_inc(&rq->nr_iowait);
+ delayacct_blkio_start();
+ }
+
+ ASSERT_EXCLUSIVE_WRITER(p->on_rq);
+
+ /*
+ * The moment this write goes through, ttwu() can swoop in and migrate
+ * this task, rendering our rq->__lock ineffective.
+ *
+ * __schedule() try_to_wake_up()
+ * LOCK rq->__lock LOCK p->pi_lock
+ * pick_next_task()
+ * pick_next_task_fair()
+ * pick_next_entity()
+ * dequeue_entities()
+ * __block_task()
+ * RELEASE p->on_rq = 0 if (p->on_rq && ...)
+ * break;
+ *
+ * ACQUIRE (after ctrl-dep)
+ *
+ * cpu = select_task_rq();
+ * set_task_cpu(p, cpu);
+ * ttwu_queue()
+ * ttwu_do_activate()
+ * LOCK rq->__lock
+ * activate_task()
+ * STORE p->on_rq = 1
+ * UNLOCK rq->__lock
+ *
+ * Callers must ensure to not reference @p after this -- we no longer
+ * own it.
+ */
+ smp_store_release(&p->on_rq, 0);
+}
extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
-extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
+extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags);
+
+#ifdef CONFIG_PREEMPT_RT
+# define SCHED_NR_MIGRATE_BREAK 8
+#else
+# define SCHED_NR_MIGRATE_BREAK 32
+#endif
+
+extern __read_mostly unsigned int sysctl_sched_nr_migrate;
+extern __read_mostly unsigned int sysctl_sched_migration_cost;
-extern const_debug unsigned int sysctl_sched_time_avg;
-extern const_debug unsigned int sysctl_sched_nr_migrate;
-extern const_debug unsigned int sysctl_sched_migration_cost;
+extern unsigned int sysctl_sched_base_slice;
-static inline u64 sched_avg_period(void)
-{
- return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
-}
+extern int sysctl_resched_latency_warn_ms;
+extern int sysctl_resched_latency_warn_once;
+
+extern unsigned int sysctl_sched_tunable_scaling;
+
+extern unsigned int sysctl_numa_balancing_scan_delay;
+extern unsigned int sysctl_numa_balancing_scan_period_min;
+extern unsigned int sysctl_numa_balancing_scan_period_max;
+extern unsigned int sysctl_numa_balancing_scan_size;
+extern unsigned int sysctl_numa_balancing_hot_threshold;
#ifdef CONFIG_SCHED_HRTICK
@@ -1367,127 +2973,115 @@ static inline u64 sched_avg_period(void)
*/
static inline int hrtick_enabled(struct rq *rq)
{
- if (!sched_feat(HRTICK))
- return 0;
if (!cpu_active(cpu_of(rq)))
return 0;
return hrtimer_is_hres_active(&rq->hrtick_timer);
}
-void hrtick_start(struct rq *rq, u64 delay);
-
-#else
+static inline int hrtick_enabled_fair(struct rq *rq)
+{
+ if (!sched_feat(HRTICK))
+ return 0;
+ return hrtick_enabled(rq);
+}
-static inline int hrtick_enabled(struct rq *rq)
+static inline int hrtick_enabled_dl(struct rq *rq)
{
- return 0;
+ if (!sched_feat(HRTICK_DL))
+ return 0;
+ return hrtick_enabled(rq);
}
-#endif /* CONFIG_SCHED_HRTICK */
+extern void hrtick_start(struct rq *rq, u64 delay);
-#ifdef CONFIG_SMP
-extern void sched_avg_update(struct rq *rq);
+#else /* !CONFIG_SCHED_HRTICK: */
-#ifndef arch_scale_freq_capacity
-static __always_inline
-unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
+static inline int hrtick_enabled_fair(struct rq *rq)
{
- return SCHED_CAPACITY_SCALE;
+ return 0;
}
-#endif
-static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
+static inline int hrtick_enabled_dl(struct rq *rq)
{
- rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq));
- sched_avg_update(rq);
+ return 0;
}
-#else
-static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
-static inline void sched_avg_update(struct rq *rq) { }
-#endif
-extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period);
-
-/*
- * __task_rq_lock - lock the rq @p resides on.
- */
-static inline struct rq *__task_rq_lock(struct task_struct *p)
- __acquires(rq->lock)
+static inline int hrtick_enabled(struct rq *rq)
{
- struct rq *rq;
+ return 0;
+}
- lockdep_assert_held(&p->pi_lock);
+#endif /* !CONFIG_SCHED_HRTICK */
- for (;;) {
- rq = task_rq(p);
- raw_spin_lock(&rq->lock);
- if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
- return rq;
- raw_spin_unlock(&rq->lock);
+#ifndef arch_scale_freq_tick
+static __always_inline void arch_scale_freq_tick(void) { }
+#endif
- while (unlikely(task_on_rq_migrating(p)))
- cpu_relax();
- }
+#ifndef arch_scale_freq_capacity
+/**
+ * arch_scale_freq_capacity - get the frequency scale factor of a given CPU.
+ * @cpu: the CPU in question.
+ *
+ * Return: the frequency scale factor normalized against SCHED_CAPACITY_SCALE, i.e.
+ *
+ * f_curr
+ * ------ * SCHED_CAPACITY_SCALE
+ * f_max
+ */
+static __always_inline
+unsigned long arch_scale_freq_capacity(int cpu)
+{
+ return SCHED_CAPACITY_SCALE;
}
+#endif
/*
- * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
+ * In double_lock_balance()/double_rq_lock(), we use raw_spin_rq_lock() to
+ * acquire rq lock instead of rq_lock(). So at the end of these two functions
+ * we need to call double_rq_clock_clear_update() to clear RQCF_UPDATED of
+ * rq->clock_update_flags to avoid the WARN_DOUBLE_CLOCK warning.
*/
-static inline struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
- __acquires(p->pi_lock)
- __acquires(rq->lock)
+static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2)
{
- struct rq *rq;
-
- for (;;) {
- raw_spin_lock_irqsave(&p->pi_lock, *flags);
- rq = task_rq(p);
- raw_spin_lock(&rq->lock);
- /*
- * move_queued_task() task_rq_lock()
- *
- * ACQUIRE (rq->lock)
- * [S] ->on_rq = MIGRATING [L] rq = task_rq()
- * WMB (__set_task_cpu()) ACQUIRE (rq->lock);
- * [S] ->cpu = new_cpu [L] task_rq()
- * [L] ->on_rq
- * RELEASE (rq->lock)
- *
- * If we observe the old cpu in task_rq_lock, the acquire of
- * the old rq->lock will fully serialize against the stores.
- *
- * If we observe the new cpu in task_rq_lock, the acquire will
- * pair with the WMB to ensure we must then also see migrating.
- */
- if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
- return rq;
- raw_spin_unlock(&rq->lock);
- raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
-
- while (unlikely(task_on_rq_migrating(p)))
- cpu_relax();
- }
+ rq1->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
+ rq2->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
}
-static inline void __task_rq_unlock(struct rq *rq)
- __releases(rq->lock)
-{
- raw_spin_unlock(&rq->lock);
-}
+#define DEFINE_LOCK_GUARD_2(name, type, _lock, _unlock, ...) \
+__DEFINE_UNLOCK_GUARD(name, type, _unlock, type *lock2; __VA_ARGS__) \
+static inline class_##name##_t class_##name##_constructor(type *lock, type *lock2) \
+{ class_##name##_t _t = { .lock = lock, .lock2 = lock2 }, *_T = &_t; \
+ _lock; return _t; }
-static inline void
-task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
- __releases(rq->lock)
- __releases(p->pi_lock)
+static inline bool rq_order_less(struct rq *rq1, struct rq *rq2)
{
- raw_spin_unlock(&rq->lock);
- raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
+#ifdef CONFIG_SCHED_CORE
+ /*
+ * In order to not have {0,2},{1,3} turn into into an AB-BA,
+ * order by core-id first and cpu-id second.
+ *
+ * Notably:
+ *
+ * double_rq_lock(0,3); will take core-0, core-1 lock
+ * double_rq_lock(1,2); will take core-1, core-0 lock
+ *
+ * when only cpu-id is considered.
+ */
+ if (rq1->core->cpu < rq2->core->cpu)
+ return true;
+ if (rq1->core->cpu > rq2->core->cpu)
+ return false;
+
+ /*
+ * __sched_core_flip() relies on SMT having cpu-id lock order.
+ */
+#endif /* CONFIG_SCHED_CORE */
+ return rq1->cpu < rq2->cpu;
}
-#ifdef CONFIG_SMP
-#ifdef CONFIG_PREEMPT
+extern void double_rq_lock(struct rq *rq1, struct rq *rq2);
-static inline void double_rq_lock(struct rq *rq1, struct rq *rq2);
+#ifdef CONFIG_PREEMPTION
/*
* fair double_lock_balance: Safely acquires both rq->locks in a fair
@@ -1502,18 +3096,18 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
__acquires(busiest->lock)
__acquires(this_rq->lock)
{
- raw_spin_unlock(&this_rq->lock);
+ raw_spin_rq_unlock(this_rq);
double_rq_lock(this_rq, busiest);
return 1;
}
-#else
+#else /* !CONFIG_PREEMPTION: */
/*
* Unfair double_lock_balance: Optimizes throughput at the expense of
* latency by eliminating extra atomic operations when the locks are
- * already in proper order on entry. This favors lower cpu-ids and will
- * grant the double lock to lower cpus over higher ids under contention,
+ * already in proper order on entry. This favors lower CPU-ids and will
+ * grant the double lock to lower CPUs over higher ids under contention,
* regardless of entry order into the function.
*/
static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
@@ -1521,34 +3115,32 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
__acquires(busiest->lock)
__acquires(this_rq->lock)
{
- int ret = 0;
-
- if (unlikely(!raw_spin_trylock(&busiest->lock))) {
- if (busiest < this_rq) {
- raw_spin_unlock(&this_rq->lock);
- raw_spin_lock(&busiest->lock);
- raw_spin_lock_nested(&this_rq->lock,
- SINGLE_DEPTH_NESTING);
- ret = 1;
- } else
- raw_spin_lock_nested(&busiest->lock,
- SINGLE_DEPTH_NESTING);
+ if (__rq_lockp(this_rq) == __rq_lockp(busiest) ||
+ likely(raw_spin_rq_trylock(busiest))) {
+ double_rq_clock_clear_update(this_rq, busiest);
+ return 0;
+ }
+
+ if (rq_order_less(this_rq, busiest)) {
+ raw_spin_rq_lock_nested(busiest, SINGLE_DEPTH_NESTING);
+ double_rq_clock_clear_update(this_rq, busiest);
+ return 0;
}
- return ret;
+
+ raw_spin_rq_unlock(this_rq);
+ double_rq_lock(this_rq, busiest);
+
+ return 1;
}
-#endif /* CONFIG_PREEMPT */
+#endif /* !CONFIG_PREEMPTION */
/*
* double_lock_balance - lock the busiest runqueue, this_rq is locked already.
*/
static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest)
{
- if (unlikely(!irqs_disabled())) {
- /* printk() doesn't work good under rq->lock */
- raw_spin_unlock(&this_rq->lock);
- BUG_ON(1);
- }
+ lockdep_assert_irqs_disabled();
return _double_lock_balance(this_rq, busiest);
}
@@ -1556,8 +3148,9 @@ static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest)
static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
__releases(busiest->lock)
{
- raw_spin_unlock(&busiest->lock);
- lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
+ if (__rq_lockp(this_rq) != __rq_lockp(busiest))
+ raw_spin_rq_unlock(busiest);
+ lock_set_subclass(&__rq_lockp(this_rq)->dep_map, 0, _RET_IP_);
}
static inline void double_lock(spinlock_t *l1, spinlock_t *l2)
@@ -1587,31 +3180,16 @@ static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2)
raw_spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
}
-/*
- * double_rq_lock - safely lock two runqueues
- *
- * Note this does not disable interrupts like task_rq_lock,
- * you need to do so manually before calling.
- */
-static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
- __acquires(rq1->lock)
- __acquires(rq2->lock)
-{
- BUG_ON(!irqs_disabled());
- if (rq1 == rq2) {
- raw_spin_lock(&rq1->lock);
- __acquire(rq2->lock); /* Fake it out ;) */
- } else {
- if (rq1 < rq2) {
- raw_spin_lock(&rq1->lock);
- raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
- } else {
- raw_spin_lock(&rq2->lock);
- raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
- }
- }
+static inline void double_raw_unlock(raw_spinlock_t *l1, raw_spinlock_t *l2)
+{
+ raw_spin_unlock(l1);
+ raw_spin_unlock(l2);
}
+DEFINE_LOCK_GUARD_2(double_raw_spinlock, raw_spinlock_t,
+ double_raw_lock(_T->lock, _T->lock2),
+ double_raw_unlock(_T->lock, _T->lock2))
+
/*
* double_rq_unlock - safely unlock two runqueues
*
@@ -1622,53 +3200,43 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
__releases(rq1->lock)
__releases(rq2->lock)
{
- raw_spin_unlock(&rq1->lock);
- if (rq1 != rq2)
- raw_spin_unlock(&rq2->lock);
+ if (__rq_lockp(rq1) != __rq_lockp(rq2))
+ raw_spin_rq_unlock(rq2);
else
__release(rq2->lock);
+ raw_spin_rq_unlock(rq1);
}
-#else /* CONFIG_SMP */
+extern void set_rq_online (struct rq *rq);
+extern void set_rq_offline(struct rq *rq);
-/*
- * double_rq_lock - safely lock two runqueues
- *
- * Note this does not disable interrupts like task_rq_lock,
- * you need to do so manually before calling.
- */
-static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
- __acquires(rq1->lock)
- __acquires(rq2->lock)
-{
- BUG_ON(!irqs_disabled());
- BUG_ON(rq1 != rq2);
- raw_spin_lock(&rq1->lock);
- __acquire(rq2->lock); /* Fake it out ;) */
-}
-
-/*
- * double_rq_unlock - safely unlock two runqueues
- *
- * Note this does not restore interrupts like task_rq_unlock,
- * you need to do so manually after calling.
- */
-static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
- __releases(rq1->lock)
- __releases(rq2->lock)
-{
- BUG_ON(rq1 != rq2);
- raw_spin_unlock(&rq1->lock);
- __release(rq2->lock);
-}
+extern bool sched_smp_initialized;
-#endif
+DEFINE_LOCK_GUARD_2(double_rq_lock, struct rq,
+ double_rq_lock(_T->lock, _T->lock2),
+ double_rq_unlock(_T->lock, _T->lock2))
+extern struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq);
extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
+
+extern bool sched_debug_verbose;
+
extern void print_cfs_stats(struct seq_file *m, int cpu);
extern void print_rt_stats(struct seq_file *m, int cpu);
extern void print_dl_stats(struct seq_file *m, int cpu);
+extern void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
+extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
+extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
+
+extern void resched_latency_warn(int cpu, u64 latency);
+
+#ifdef CONFIG_NUMA_BALANCING
+extern void show_numa_stats(struct task_struct *p, struct seq_file *m);
+extern void
+print_numa_stats(struct seq_file *m, int node, unsigned long tsf,
+ unsigned long tpf, unsigned long gsf, unsigned long gpf);
+#endif /* CONFIG_NUMA_BALANCING */
extern void init_cfs_rq(struct cfs_rq *cfs_rq);
extern void init_rt_rq(struct rt_rq *rt_rq);
@@ -1678,59 +3246,743 @@ extern void cfs_bandwidth_usage_inc(void);
extern void cfs_bandwidth_usage_dec(void);
#ifdef CONFIG_NO_HZ_COMMON
-enum rq_nohz_flag_bits {
- NOHZ_TICK_STOPPED,
- NOHZ_BALANCE_KICK,
-};
-#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
+#define NOHZ_BALANCE_KICK_BIT 0
+#define NOHZ_STATS_KICK_BIT 1
+#define NOHZ_NEWILB_KICK_BIT 2
+#define NOHZ_NEXT_KICK_BIT 3
+
+/* Run sched_balance_domains() */
+#define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT)
+/* Update blocked load */
+#define NOHZ_STATS_KICK BIT(NOHZ_STATS_KICK_BIT)
+/* Update blocked load when entering idle */
+#define NOHZ_NEWILB_KICK BIT(NOHZ_NEWILB_KICK_BIT)
+/* Update nohz.next_balance */
+#define NOHZ_NEXT_KICK BIT(NOHZ_NEXT_KICK_BIT)
+
+#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK | NOHZ_NEXT_KICK)
+
+#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
+
+extern void nohz_balance_exit_idle(struct rq *rq);
+#else /* !CONFIG_NO_HZ_COMMON: */
+static inline void nohz_balance_exit_idle(struct rq *rq) { }
+#endif /* !CONFIG_NO_HZ_COMMON */
+
+#ifdef CONFIG_NO_HZ_COMMON
+extern void nohz_run_idle_balance(int cpu);
+#else
+static inline void nohz_run_idle_balance(int cpu) { }
#endif
-#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+#include "stats.h"
-DECLARE_PER_CPU(u64, cpu_hardirq_time);
-DECLARE_PER_CPU(u64, cpu_softirq_time);
+#if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS)
-#ifndef CONFIG_64BIT
-DECLARE_PER_CPU(seqcount_t, irq_time_seq);
+extern void __sched_core_account_forceidle(struct rq *rq);
-static inline void irq_time_write_begin(void)
+static inline void sched_core_account_forceidle(struct rq *rq)
{
- __this_cpu_inc(irq_time_seq.sequence);
- smp_wmb();
+ if (schedstat_enabled())
+ __sched_core_account_forceidle(rq);
}
-static inline void irq_time_write_end(void)
+extern void __sched_core_tick(struct rq *rq);
+
+static inline void sched_core_tick(struct rq *rq)
{
- smp_wmb();
- __this_cpu_inc(irq_time_seq.sequence);
+ if (sched_core_enabled(rq) && schedstat_enabled())
+ __sched_core_tick(rq);
+}
+
+#else /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS): */
+
+static inline void sched_core_account_forceidle(struct rq *rq) { }
+
+static inline void sched_core_tick(struct rq *rq) { }
+
+#endif /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS) */
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+
+struct irqtime {
+ u64 total;
+ u64 tick_delta;
+ u64 irq_start_time;
+ struct u64_stats_sync sync;
+};
+
+DECLARE_PER_CPU(struct irqtime, cpu_irqtime);
+extern int sched_clock_irqtime;
+
+static inline int irqtime_enabled(void)
+{
+ return sched_clock_irqtime;
}
+/*
+ * Returns the irqtime minus the softirq time computed by ksoftirqd.
+ * Otherwise ksoftirqd's sum_exec_runtime is subtracted its own runtime
+ * and never move forward.
+ */
static inline u64 irq_time_read(int cpu)
{
- u64 irq_time;
- unsigned seq;
+ struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
+ unsigned int seq;
+ u64 total;
do {
- seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
- irq_time = per_cpu(cpu_softirq_time, cpu) +
- per_cpu(cpu_hardirq_time, cpu);
- } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
+ seq = __u64_stats_fetch_begin(&irqtime->sync);
+ total = irqtime->total;
+ } while (__u64_stats_fetch_retry(&irqtime->sync, seq));
- return irq_time;
+ return total;
}
-#else /* CONFIG_64BIT */
-static inline void irq_time_write_begin(void)
+
+#else /* !CONFIG_IRQ_TIME_ACCOUNTING: */
+
+static inline int irqtime_enabled(void)
{
+ return 0;
}
-static inline void irq_time_write_end(void)
+#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
+
+#ifdef CONFIG_CPU_FREQ
+
+DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data);
+
+/**
+ * cpufreq_update_util - Take a note about CPU utilization changes.
+ * @rq: Runqueue to carry out the update for.
+ * @flags: Update reason flags.
+ *
+ * This function is called by the scheduler on the CPU whose utilization is
+ * being updated.
+ *
+ * It can only be called from RCU-sched read-side critical sections.
+ *
+ * The way cpufreq is currently arranged requires it to evaluate the CPU
+ * performance state (frequency/voltage) on a regular basis to prevent it from
+ * being stuck in a completely inadequate performance level for too long.
+ * That is not guaranteed to happen if the updates are only triggered from CFS
+ * and DL, though, because they may not be coming in if only RT tasks are
+ * active all the time (or there are RT tasks only).
+ *
+ * As a workaround for that issue, this function is called periodically by the
+ * RT sched class to trigger extra cpufreq updates to prevent it from stalling,
+ * but that really is a band-aid. Going forward it should be replaced with
+ * solutions targeted more specifically at RT tasks.
+ */
+static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
{
+ struct update_util_data *data;
+
+ data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data,
+ cpu_of(rq)));
+ if (data)
+ data->func(data, rq_clock(rq), flags);
}
+#else /* !CONFIG_CPU_FREQ: */
+static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) { }
+#endif /* !CONFIG_CPU_FREQ */
-static inline u64 irq_time_read(int cpu)
+#ifdef arch_scale_freq_capacity
+# ifndef arch_scale_freq_invariant
+# define arch_scale_freq_invariant() true
+# endif
+#else
+# define arch_scale_freq_invariant() false
+#endif
+
+unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
+ unsigned long *min,
+ unsigned long *max);
+
+unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual,
+ unsigned long min,
+ unsigned long max);
+
+
+/*
+ * Verify the fitness of task @p to run on @cpu taking into account the
+ * CPU original capacity and the runtime/deadline ratio of the task.
+ *
+ * The function will return true if the original capacity of @cpu is
+ * greater than or equal to task's deadline density right shifted by
+ * (BW_SHIFT - SCHED_CAPACITY_SHIFT) and false otherwise.
+ */
+static inline bool dl_task_fits_capacity(struct task_struct *p, int cpu)
+{
+ unsigned long cap = arch_scale_cpu_capacity(cpu);
+
+ return cap >= p->dl.dl_density >> (BW_SHIFT - SCHED_CAPACITY_SHIFT);
+}
+
+static inline unsigned long cpu_bw_dl(struct rq *rq)
+{
+ return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT;
+}
+
+static inline unsigned long cpu_util_dl(struct rq *rq)
+{
+ return READ_ONCE(rq->avg_dl.util_avg);
+}
+
+
+extern unsigned long cpu_util_cfs(int cpu);
+extern unsigned long cpu_util_cfs_boost(int cpu);
+
+static inline unsigned long cpu_util_rt(struct rq *rq)
+{
+ return READ_ONCE(rq->avg_rt.util_avg);
+}
+
+#ifdef CONFIG_UCLAMP_TASK
+
+unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
+
+/*
+ * When uclamp is compiled in, the aggregation at rq level is 'turned off'
+ * by default in the fast path and only gets turned on once userspace performs
+ * an operation that requires it.
+ *
+ * Returns true if userspace opted-in to use uclamp and aggregation at rq level
+ * hence is active.
+ */
+static inline bool uclamp_is_used(void)
+{
+ return static_branch_likely(&sched_uclamp_used);
+}
+
+/*
+ * Enabling static branches would get the cpus_read_lock(),
+ * check whether uclamp_is_used before enable it to avoid always
+ * calling cpus_read_lock(). Because we never disable this
+ * static key once enable it.
+ */
+static inline void sched_uclamp_enable(void)
+{
+ if (!uclamp_is_used())
+ static_branch_enable(&sched_uclamp_used);
+}
+
+static inline unsigned long uclamp_rq_get(struct rq *rq,
+ enum uclamp_id clamp_id)
+{
+ return READ_ONCE(rq->uclamp[clamp_id].value);
+}
+
+static inline void uclamp_rq_set(struct rq *rq, enum uclamp_id clamp_id,
+ unsigned int value)
+{
+ WRITE_ONCE(rq->uclamp[clamp_id].value, value);
+}
+
+static inline bool uclamp_rq_is_idle(struct rq *rq)
+{
+ return rq->uclamp_flags & UCLAMP_FLAG_IDLE;
+}
+
+/* Is the rq being capped/throttled by uclamp_max? */
+static inline bool uclamp_rq_is_capped(struct rq *rq)
+{
+ unsigned long rq_util;
+ unsigned long max_util;
+
+ if (!uclamp_is_used())
+ return false;
+
+ rq_util = cpu_util_cfs(cpu_of(rq)) + cpu_util_rt(rq);
+ max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value);
+
+ return max_util != SCHED_CAPACITY_SCALE && rq_util >= max_util;
+}
+
+#define for_each_clamp_id(clamp_id) \
+ for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++)
+
+extern unsigned int sysctl_sched_uclamp_util_min_rt_default;
+
+
+static inline unsigned int uclamp_none(enum uclamp_id clamp_id)
+{
+ if (clamp_id == UCLAMP_MIN)
+ return 0;
+ return SCHED_CAPACITY_SCALE;
+}
+
+/* Integer rounded range for each bucket */
+#define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)
+
+static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
+{
+ return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCKETS - 1);
+}
+
+static inline void
+uclamp_se_set(struct uclamp_se *uc_se, unsigned int value, bool user_defined)
+{
+ uc_se->value = value;
+ uc_se->bucket_id = uclamp_bucket_id(value);
+ uc_se->user_defined = user_defined;
+}
+
+#else /* !CONFIG_UCLAMP_TASK: */
+
+static inline unsigned long
+uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
+{
+ if (clamp_id == UCLAMP_MIN)
+ return 0;
+
+ return SCHED_CAPACITY_SCALE;
+}
+
+static inline bool uclamp_rq_is_capped(struct rq *rq) { return false; }
+
+static inline bool uclamp_is_used(void)
+{
+ return false;
+}
+
+static inline void sched_uclamp_enable(void) {}
+
+static inline unsigned long
+uclamp_rq_get(struct rq *rq, enum uclamp_id clamp_id)
+{
+ if (clamp_id == UCLAMP_MIN)
+ return 0;
+
+ return SCHED_CAPACITY_SCALE;
+}
+
+static inline void
+uclamp_rq_set(struct rq *rq, enum uclamp_id clamp_id, unsigned int value)
+{
+}
+
+static inline bool uclamp_rq_is_idle(struct rq *rq)
+{
+ return false;
+}
+
+#endif /* !CONFIG_UCLAMP_TASK */
+
+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
+
+static inline unsigned long cpu_util_irq(struct rq *rq)
+{
+ return READ_ONCE(rq->avg_irq.util_avg);
+}
+
+static inline
+unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max)
+{
+ util *= (max - irq);
+ util /= max;
+
+ return util;
+
+}
+
+#else /* !CONFIG_HAVE_SCHED_AVG_IRQ: */
+
+static inline unsigned long cpu_util_irq(struct rq *rq)
+{
+ return 0;
+}
+
+static inline
+unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max)
+{
+ return util;
+}
+
+#endif /* !CONFIG_HAVE_SCHED_AVG_IRQ */
+
+extern void __setparam_fair(struct task_struct *p, const struct sched_attr *attr);
+
+#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
+
+#define perf_domain_span(pd) (to_cpumask(((pd)->em_pd->cpus)))
+
+DECLARE_STATIC_KEY_FALSE(sched_energy_present);
+
+static inline bool sched_energy_enabled(void)
+{
+ return static_branch_unlikely(&sched_energy_present);
+}
+
+#else /* !(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL): */
+
+#define perf_domain_span(pd) NULL
+
+static inline bool sched_energy_enabled(void) { return false; }
+
+#endif /* !(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */
+
+#ifdef CONFIG_MEMBARRIER
+
+/*
+ * The scheduler provides memory barriers required by membarrier between:
+ * - prior user-space memory accesses and store to rq->membarrier_state,
+ * - store to rq->membarrier_state and following user-space memory accesses.
+ * In the same way it provides those guarantees around store to rq->curr.
+ */
+static inline void membarrier_switch_mm(struct rq *rq,
+ struct mm_struct *prev_mm,
+ struct mm_struct *next_mm)
+{
+ int membarrier_state;
+
+ if (prev_mm == next_mm)
+ return;
+
+ membarrier_state = atomic_read(&next_mm->membarrier_state);
+ if (READ_ONCE(rq->membarrier_state) == membarrier_state)
+ return;
+
+ WRITE_ONCE(rq->membarrier_state, membarrier_state);
+}
+
+#else /* !CONFIG_MEMBARRIER: */
+
+static inline void membarrier_switch_mm(struct rq *rq,
+ struct mm_struct *prev_mm,
+ struct mm_struct *next_mm)
+{
+}
+
+#endif /* !CONFIG_MEMBARRIER */
+
+static inline bool is_per_cpu_kthread(struct task_struct *p)
+{
+ if (!(p->flags & PF_KTHREAD))
+ return false;
+
+ if (p->nr_cpus_allowed != 1)
+ return false;
+
+ return true;
+}
+
+extern void swake_up_all_locked(struct swait_queue_head *q);
+extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
+
+extern int try_to_wake_up(struct task_struct *tsk, unsigned int state, int wake_flags);
+
+#ifdef CONFIG_PREEMPT_DYNAMIC
+extern int preempt_dynamic_mode;
+extern int sched_dynamic_mode(const char *str);
+extern void sched_dynamic_update(int mode);
+#endif
+extern const char *preempt_modes[];
+
+#ifdef CONFIG_SCHED_MM_CID
+
+static __always_inline bool cid_on_cpu(unsigned int cid)
+{
+ return cid & MM_CID_ONCPU;
+}
+
+static __always_inline bool cid_in_transit(unsigned int cid)
+{
+ return cid & MM_CID_TRANSIT;
+}
+
+static __always_inline unsigned int cpu_cid_to_cid(unsigned int cid)
+{
+ return cid & ~MM_CID_ONCPU;
+}
+
+static __always_inline unsigned int cid_to_cpu_cid(unsigned int cid)
+{
+ return cid | MM_CID_ONCPU;
+}
+
+static __always_inline unsigned int cid_to_transit_cid(unsigned int cid)
+{
+ return cid | MM_CID_TRANSIT;
+}
+
+static __always_inline unsigned int cid_from_transit_cid(unsigned int cid)
+{
+ return cid & ~MM_CID_TRANSIT;
+}
+
+static __always_inline bool cid_on_task(unsigned int cid)
+{
+ /* True if none of the MM_CID_ONCPU, MM_CID_TRANSIT, MM_CID_UNSET bits is set */
+ return cid < MM_CID_TRANSIT;
+}
+
+static __always_inline void mm_drop_cid(struct mm_struct *mm, unsigned int cid)
+{
+ clear_bit(cid, mm_cidmask(mm));
+}
+
+static __always_inline void mm_unset_cid_on_task(struct task_struct *t)
+{
+ unsigned int cid = t->mm_cid.cid;
+
+ t->mm_cid.cid = MM_CID_UNSET;
+ if (cid_on_task(cid))
+ mm_drop_cid(t->mm, cid);
+}
+
+static __always_inline void mm_drop_cid_on_cpu(struct mm_struct *mm, struct mm_cid_pcpu *pcp)
+{
+ /* Clear the ONCPU bit, but do not set UNSET in the per CPU storage */
+ pcp->cid = cpu_cid_to_cid(pcp->cid);
+ mm_drop_cid(mm, pcp->cid);
+}
+
+static inline unsigned int __mm_get_cid(struct mm_struct *mm, unsigned int max_cids)
+{
+ unsigned int cid = find_first_zero_bit(mm_cidmask(mm), max_cids);
+
+ if (cid >= max_cids)
+ return MM_CID_UNSET;
+ if (test_and_set_bit(cid, mm_cidmask(mm)))
+ return MM_CID_UNSET;
+ return cid;
+}
+
+static inline unsigned int mm_get_cid(struct mm_struct *mm)
+{
+ unsigned int cid = __mm_get_cid(mm, READ_ONCE(mm->mm_cid.max_cids));
+
+ while (cid == MM_CID_UNSET) {
+ cpu_relax();
+ cid = __mm_get_cid(mm, num_possible_cpus());
+ }
+ return cid;
+}
+
+static inline unsigned int mm_cid_converge(struct mm_struct *mm, unsigned int orig_cid,
+ unsigned int max_cids)
+{
+ unsigned int new_cid, cid = cpu_cid_to_cid(orig_cid);
+
+ /* Is it in the optimal CID space? */
+ if (likely(cid < max_cids))
+ return orig_cid;
+
+ /* Try to find one in the optimal space. Otherwise keep the provided. */
+ new_cid = __mm_get_cid(mm, max_cids);
+ if (new_cid != MM_CID_UNSET) {
+ mm_drop_cid(mm, cid);
+ /* Preserve the ONCPU mode of the original CID */
+ return new_cid | (orig_cid & MM_CID_ONCPU);
+ }
+ return orig_cid;
+}
+
+static __always_inline void mm_cid_update_task_cid(struct task_struct *t, unsigned int cid)
+{
+ if (t->mm_cid.cid != cid) {
+ t->mm_cid.cid = cid;
+ rseq_sched_set_ids_changed(t);
+ }
+}
+
+static __always_inline void mm_cid_update_pcpu_cid(struct mm_struct *mm, unsigned int cid)
+{
+ __this_cpu_write(mm->mm_cid.pcpu->cid, cid);
+}
+
+static __always_inline void mm_cid_from_cpu(struct task_struct *t, unsigned int cpu_cid)
+{
+ unsigned int max_cids, tcid = t->mm_cid.cid;
+ struct mm_struct *mm = t->mm;
+
+ max_cids = READ_ONCE(mm->mm_cid.max_cids);
+ /* Optimize for the common case where both have the ONCPU bit set */
+ if (likely(cid_on_cpu(cpu_cid & tcid))) {
+ if (likely(cpu_cid_to_cid(cpu_cid) < max_cids)) {
+ mm_cid_update_task_cid(t, cpu_cid);
+ return;
+ }
+ /* Try to converge into the optimal CID space */
+ cpu_cid = mm_cid_converge(mm, cpu_cid, max_cids);
+ } else {
+ /* Hand over or drop the task owned CID */
+ if (cid_on_task(tcid)) {
+ if (cid_on_cpu(cpu_cid))
+ mm_unset_cid_on_task(t);
+ else
+ cpu_cid = cid_to_cpu_cid(tcid);
+ }
+ /* Still nothing, allocate a new one */
+ if (!cid_on_cpu(cpu_cid))
+ cpu_cid = cid_to_cpu_cid(mm_get_cid(mm));
+ }
+ mm_cid_update_pcpu_cid(mm, cpu_cid);
+ mm_cid_update_task_cid(t, cpu_cid);
+}
+
+static __always_inline void mm_cid_from_task(struct task_struct *t, unsigned int cpu_cid)
+{
+ unsigned int max_cids, tcid = t->mm_cid.cid;
+ struct mm_struct *mm = t->mm;
+
+ max_cids = READ_ONCE(mm->mm_cid.max_cids);
+ /* Optimize for the common case, where both have the ONCPU bit clear */
+ if (likely(cid_on_task(tcid | cpu_cid))) {
+ if (likely(tcid < max_cids)) {
+ mm_cid_update_pcpu_cid(mm, tcid);
+ return;
+ }
+ /* Try to converge into the optimal CID space */
+ tcid = mm_cid_converge(mm, tcid, max_cids);
+ } else {
+ /* Hand over or drop the CPU owned CID */
+ if (cid_on_cpu(cpu_cid)) {
+ if (cid_on_task(tcid))
+ mm_drop_cid_on_cpu(mm, this_cpu_ptr(mm->mm_cid.pcpu));
+ else
+ tcid = cpu_cid_to_cid(cpu_cid);
+ }
+ /* Still nothing, allocate a new one */
+ if (!cid_on_task(tcid))
+ tcid = mm_get_cid(mm);
+ /* Set the transition mode flag if required */
+ tcid |= READ_ONCE(mm->mm_cid.transit);
+ }
+ mm_cid_update_pcpu_cid(mm, tcid);
+ mm_cid_update_task_cid(t, tcid);
+}
+
+static __always_inline void mm_cid_schedin(struct task_struct *next)
+{
+ struct mm_struct *mm = next->mm;
+ unsigned int cpu_cid;
+
+ if (!next->mm_cid.active)
+ return;
+
+ cpu_cid = __this_cpu_read(mm->mm_cid.pcpu->cid);
+ if (likely(!READ_ONCE(mm->mm_cid.percpu)))
+ mm_cid_from_task(next, cpu_cid);
+ else
+ mm_cid_from_cpu(next, cpu_cid);
+}
+
+static __always_inline void mm_cid_schedout(struct task_struct *prev)
+{
+ /* During mode transitions CIDs are temporary and need to be dropped */
+ if (likely(!cid_in_transit(prev->mm_cid.cid)))
+ return;
+
+ mm_drop_cid(prev->mm, cid_from_transit_cid(prev->mm_cid.cid));
+ prev->mm_cid.cid = MM_CID_UNSET;
+}
+
+static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct *next)
{
- return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
+ mm_cid_schedout(prev);
+ mm_cid_schedin(next);
}
-#endif /* CONFIG_64BIT */
-#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+#else /* !CONFIG_SCHED_MM_CID: */
+static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct *next) { }
+#endif /* !CONFIG_SCHED_MM_CID */
+
+extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
+extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se);
+static inline
+void move_queued_task_locked(struct rq *src_rq, struct rq *dst_rq, struct task_struct *task)
+{
+ lockdep_assert_rq_held(src_rq);
+ lockdep_assert_rq_held(dst_rq);
+
+ deactivate_task(src_rq, task, 0);
+ set_task_cpu(task, dst_rq->cpu);
+ activate_task(dst_rq, task, 0);
+}
+
+static inline
+bool task_is_pushable(struct rq *rq, struct task_struct *p, int cpu)
+{
+ if (!task_on_cpu(rq, p) &&
+ cpumask_test_cpu(cpu, &p->cpus_mask))
+ return true;
+
+ return false;
+}
+
+#ifdef CONFIG_RT_MUTEXES
+
+static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
+{
+ if (pi_task)
+ prio = min(prio, pi_task->prio);
+
+ return prio;
+}
+
+static inline int rt_effective_prio(struct task_struct *p, int prio)
+{
+ struct task_struct *pi_task = rt_mutex_get_top_task(p);
+
+ return __rt_effective_prio(pi_task, prio);
+}
+
+#else /* !CONFIG_RT_MUTEXES: */
+
+static inline int rt_effective_prio(struct task_struct *p, int prio)
+{
+ return prio;
+}
+
+#endif /* !CONFIG_RT_MUTEXES */
+
+extern int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user, bool pi);
+extern int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx);
+extern const struct sched_class *__setscheduler_class(int policy, int prio);
+extern void set_load_weight(struct task_struct *p, bool update_load);
+extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags);
+extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags);
+
+extern struct balance_callback *splice_balance_callbacks(struct rq *rq);
+extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
+
+/*
+ * The 'sched_change' pattern is the safe, easy and slow way of changing a
+ * task's scheduling properties. It dequeues a task, such that the scheduler
+ * is fully unaware of it; at which point its properties can be modified;
+ * after which it is enqueued again.
+ *
+ * Typically this must be called while holding task_rq_lock, since most/all
+ * properties are serialized under those locks. There is currently one
+ * exception to this rule in sched/ext which only holds rq->lock.
+ */
+
+/*
+ * This structure is a temporary, used to preserve/convey the queueing state
+ * of the task between sched_change_begin() and sched_change_end(). Ensuring
+ * the task's queueing state is idempotent across the operation.
+ */
+struct sched_change_ctx {
+ u64 prio;
+ struct task_struct *p;
+ int flags;
+ bool queued;
+ bool running;
+};
+
+struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int flags);
+void sched_change_end(struct sched_change_ctx *ctx);
+
+DEFINE_CLASS(sched_change, struct sched_change_ctx *,
+ sched_change_end(_T),
+ sched_change_begin(p, flags),
+ struct task_struct *p, unsigned int flags)
+
+DEFINE_CLASS_IS_UNCONDITIONAL(sched_change)
+
+#include "ext.h"
+
+#endif /* _KERNEL_SCHED_SCHED_H */
diff --git a/kernel/sched/smp.h b/kernel/sched/smp.h
new file mode 100644
index 000000000000..7f151d96dba9
--- /dev/null
+++ b/kernel/sched/smp.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _KERNEL_SCHED_SMP_H
+#define _KERNEL_SCHED_SMP_H
+
+/*
+ * Scheduler internal SMP callback types and methods between the scheduler
+ * and other internal parts of the core kernel:
+ */
+#include <linux/types.h>
+
+extern void sched_ttwu_pending(void *arg);
+
+extern bool call_function_single_prep_ipi(int cpu);
+
+#ifdef CONFIG_SMP
+extern void flush_smp_call_function_queue(void);
+#else
+static inline void flush_smp_call_function_queue(void) { }
+#endif
+
+#endif /* _KERNEL_SCHED_SMP_H */
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 87e2c9f0c33e..d1c9429a4ac5 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -1,16 +1,110 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * /proc/schedstat implementation
+ */
+#include "sched.h"
-#include <linux/slab.h>
-#include <linux/fs.h>
-#include <linux/seq_file.h>
-#include <linux/proc_fs.h>
+void __update_stats_wait_start(struct rq *rq, struct task_struct *p,
+ struct sched_statistics *stats)
+{
+ u64 wait_start, prev_wait_start;
-#include "sched.h"
+ wait_start = rq_clock(rq);
+ prev_wait_start = schedstat_val(stats->wait_start);
+
+ if (p && likely(wait_start > prev_wait_start))
+ wait_start -= prev_wait_start;
+
+ __schedstat_set(stats->wait_start, wait_start);
+}
+
+void __update_stats_wait_end(struct rq *rq, struct task_struct *p,
+ struct sched_statistics *stats)
+{
+ u64 delta = rq_clock(rq) - schedstat_val(stats->wait_start);
+
+ if (p) {
+ if (task_on_rq_migrating(p)) {
+ /*
+ * Preserve migrating task's wait time so wait_start
+ * time stamp can be adjusted to accumulate wait time
+ * prior to migration.
+ */
+ __schedstat_set(stats->wait_start, delta);
+
+ return;
+ }
+
+ trace_sched_stat_wait(p, delta);
+ }
+
+ __schedstat_set(stats->wait_max,
+ max(schedstat_val(stats->wait_max), delta));
+ __schedstat_inc(stats->wait_count);
+ __schedstat_add(stats->wait_sum, delta);
+ __schedstat_set(stats->wait_start, 0);
+}
+
+void __update_stats_enqueue_sleeper(struct rq *rq, struct task_struct *p,
+ struct sched_statistics *stats)
+{
+ u64 sleep_start, block_start;
+
+ sleep_start = schedstat_val(stats->sleep_start);
+ block_start = schedstat_val(stats->block_start);
+
+ if (sleep_start) {
+ u64 delta = rq_clock(rq) - sleep_start;
+
+ if ((s64)delta < 0)
+ delta = 0;
+
+ if (unlikely(delta > schedstat_val(stats->sleep_max)))
+ __schedstat_set(stats->sleep_max, delta);
+
+ __schedstat_set(stats->sleep_start, 0);
+ __schedstat_add(stats->sum_sleep_runtime, delta);
+
+ if (p) {
+ account_scheduler_latency(p, delta >> 10, 1);
+ trace_sched_stat_sleep(p, delta);
+ }
+ }
+
+ if (block_start) {
+ u64 delta = rq_clock(rq) - block_start;
+
+ if ((s64)delta < 0)
+ delta = 0;
+
+ if (unlikely(delta > schedstat_val(stats->block_max)))
+ __schedstat_set(stats->block_max, delta);
+
+ __schedstat_set(stats->block_start, 0);
+ __schedstat_add(stats->sum_sleep_runtime, delta);
+ __schedstat_add(stats->sum_block_runtime, delta);
+
+ if (p) {
+ if (p->in_iowait) {
+ __schedstat_add(stats->iowait_sum, delta);
+ __schedstat_inc(stats->iowait_count);
+ trace_sched_stat_iowait(p, delta);
+ }
+
+ trace_sched_stat_blocked(p, delta);
+
+ account_scheduler_latency(p, delta >> 10, 0);
+ }
+ }
+}
/*
- * bump this up when changing the output format or the meaning of an existing
+ * Current schedstat API version.
+ *
+ * Bump this up when changing the output format or the meaning of an existing
* format, so that tools can adapt (or abort)
*/
-#define SCHEDSTAT_VERSION 15
+#define SCHEDSTAT_VERSION 17
static int show_schedstat(struct seq_file *seq, void *v)
{
@@ -21,10 +115,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
seq_printf(seq, "timestamp %lu\n", jiffies);
} else {
struct rq *rq;
-#ifdef CONFIG_SMP
struct sched_domain *sd;
int dcount = 0;
-#endif
cpu = (unsigned long)(v - 2);
rq = cpu_rq(cpu);
@@ -39,21 +131,22 @@ static int show_schedstat(struct seq_file *seq, void *v)
seq_printf(seq, "\n");
-#ifdef CONFIG_SMP
/* domain-specific stats */
rcu_read_lock();
for_each_domain(cpu, sd) {
enum cpu_idle_type itype;
- seq_printf(seq, "domain%d %*pb", dcount++,
+ seq_printf(seq, "domain%d %s %*pb", dcount++, sd->name,
cpumask_pr_args(sched_domain_span(sd)));
- for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
- itype++) {
- seq_printf(seq, " %u %u %u %u %u %u %u %u",
+ for (itype = 0; itype < CPU_MAX_IDLE_TYPES; itype++) {
+ seq_printf(seq, " %u %u %u %u %u %u %u %u %u %u %u",
sd->lb_count[itype],
sd->lb_balanced[itype],
sd->lb_failed[itype],
- sd->lb_imbalance[itype],
+ sd->lb_imbalance_load[itype],
+ sd->lb_imbalance_util[itype],
+ sd->lb_imbalance_task[itype],
+ sd->lb_imbalance_misfit[itype],
sd->lb_gained[itype],
sd->lb_hot_gained[itype],
sd->lb_nobusyq[itype],
@@ -68,17 +161,16 @@ static int show_schedstat(struct seq_file *seq, void *v)
sd->ttwu_move_balance);
}
rcu_read_unlock();
-#endif
}
return 0;
}
/*
- * This itererator needs some explanation.
+ * This iterator needs some explanation.
* It returns 1 for the header position.
* This means 2 is cpu 0.
- * In a hotplugged system some cpus, including cpu 0, may be missing so we have
- * to use cpumask_* to iterate over the cpus.
+ * In a hotplugged system some CPUs, including cpu 0, may be missing so we have
+ * to use cpumask_* to iterate over the CPUs.
*/
static void *schedstat_start(struct seq_file *file, loff_t *offset)
{
@@ -98,12 +190,14 @@ static void *schedstat_start(struct seq_file *file, loff_t *offset)
if (n < nr_cpu_ids)
return (void *)(unsigned long)(n + 2);
+
return NULL;
}
static void *schedstat_next(struct seq_file *file, void *data, loff_t *offset)
{
(*offset)++;
+
return schedstat_start(file, offset);
}
@@ -118,21 +212,9 @@ static const struct seq_operations schedstat_sops = {
.show = show_schedstat,
};
-static int schedstat_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &schedstat_sops);
-}
-
-static const struct file_operations proc_schedstat_operations = {
- .open = schedstat_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
static int __init proc_schedstat_init(void)
{
- proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
+ proc_create_seq("schedstat", 0, NULL, &schedstat_sops);
return 0;
}
subsys_initcall(proc_schedstat_init);
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 4ab704339656..c903f1a42891 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -1,6 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _KERNEL_STATS_H
+#define _KERNEL_STATS_H
#ifdef CONFIG_SCHEDSTATS
+extern struct static_key_false sched_schedstats;
+
/*
* Expects runqueue lock to be held for atomicity of update
*/
@@ -24,69 +29,259 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
}
static inline void
-rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
+rq_sched_info_dequeue(struct rq *rq, unsigned long long delta)
{
if (rq)
rq->rq_sched_info.run_delay += delta;
}
-# define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
-# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
-# define schedstat_set(var, val) do { var = (val); } while (0)
-#else /* !CONFIG_SCHEDSTATS */
-static inline void
-rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
-{}
-static inline void
-rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
-{}
+#define schedstat_enabled() static_branch_unlikely(&sched_schedstats)
+#define __schedstat_inc(var) do { var++; } while (0)
+#define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0)
+#define __schedstat_add(var, amt) do { var += (amt); } while (0)
+#define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0)
+#define __schedstat_set(var, val) do { var = (val); } while (0)
+#define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0)
+#define schedstat_val(var) (var)
+#define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0)
+
+void __update_stats_wait_start(struct rq *rq, struct task_struct *p,
+ struct sched_statistics *stats);
+
+void __update_stats_wait_end(struct rq *rq, struct task_struct *p,
+ struct sched_statistics *stats);
+void __update_stats_enqueue_sleeper(struct rq *rq, struct task_struct *p,
+ struct sched_statistics *stats);
+
static inline void
-rq_sched_info_depart(struct rq *rq, unsigned long long delta)
-{}
-# define schedstat_inc(rq, field) do { } while (0)
-# define schedstat_add(rq, field, amt) do { } while (0)
-# define schedstat_set(var, val) do { } while (0)
+check_schedstat_required(void)
+{
+ if (schedstat_enabled())
+ return;
+
+ /* Force schedstat enabled if a dependent tracepoint is active */
+ if (trace_sched_stat_wait_enabled() ||
+ trace_sched_stat_sleep_enabled() ||
+ trace_sched_stat_iowait_enabled() ||
+ trace_sched_stat_blocked_enabled() ||
+ trace_sched_stat_runtime_enabled())
+ printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, stat_blocked and stat_runtime require the kernel parameter schedstats=enable or kernel.sched_schedstats=1\n");
+}
+
+#else /* !CONFIG_SCHEDSTATS: */
+
+static inline void rq_sched_info_arrive (struct rq *rq, unsigned long long delta) { }
+static inline void rq_sched_info_dequeue(struct rq *rq, unsigned long long delta) { }
+static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delta) { }
+# define schedstat_enabled() 0
+# define __schedstat_inc(var) do { } while (0)
+# define schedstat_inc(var) do { } while (0)
+# define __schedstat_add(var, amt) do { } while (0)
+# define schedstat_add(var, amt) do { } while (0)
+# define __schedstat_set(var, val) do { } while (0)
+# define schedstat_set(var, val) do { } while (0)
+# define schedstat_val(var) 0
+# define schedstat_val_or_zero(var) 0
+
+# define __update_stats_wait_start(rq, p, stats) do { } while (0)
+# define __update_stats_wait_end(rq, p, stats) do { } while (0)
+# define __update_stats_enqueue_sleeper(rq, p, stats) do { } while (0)
+# define check_schedstat_required() do { } while (0)
+
+#endif /* CONFIG_SCHEDSTATS */
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+struct sched_entity_stats {
+ struct sched_entity se;
+ struct sched_statistics stats;
+} __no_randomize_layout;
#endif
-#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
-static inline void sched_info_reset_dequeued(struct task_struct *t)
+static inline struct sched_statistics *
+__schedstats_from_se(struct sched_entity *se)
{
- t->sched_info.last_queued = 0;
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ if (!entity_is_task(se))
+ return &container_of(se, struct sched_entity_stats, se)->stats;
+#endif
+ return &task_of(se)->stats;
+}
+
+#ifdef CONFIG_PSI
+void psi_task_change(struct task_struct *task, int clear, int set);
+void psi_task_switch(struct task_struct *prev, struct task_struct *next,
+ bool sleep);
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev);
+#else /* !CONFIG_IRQ_TIME_ACCOUNTING: */
+static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
+ struct task_struct *prev) {}
+#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
+/*
+ * PSI tracks state that persists across sleeps, such as iowaits and
+ * memory stalls. As a result, it has to distinguish between sleeps,
+ * where a task's runnable state changes, and migrations, where a task
+ * and its runnable state are being moved between CPUs and runqueues.
+ *
+ * A notable case is a task whose dequeue is delayed. PSI considers
+ * those sleeping, but because they are still on the runqueue they can
+ * go through migration requeues. In this case, *sleeping* states need
+ * to be transferred.
+ */
+static inline void psi_enqueue(struct task_struct *p, int flags)
+{
+ int clear = 0, set = 0;
+
+ if (static_branch_likely(&psi_disabled))
+ return;
+
+ /* Same runqueue, nothing changed for psi */
+ if (flags & ENQUEUE_RESTORE)
+ return;
+
+ /* psi_sched_switch() will handle the flags */
+ if (task_on_cpu(task_rq(p), p))
+ return;
+
+ if (p->se.sched_delayed) {
+ /* CPU migration of "sleeping" task */
+ WARN_ON_ONCE(!(flags & ENQUEUE_MIGRATED));
+ if (p->in_memstall)
+ set |= TSK_MEMSTALL;
+ if (p->in_iowait)
+ set |= TSK_IOWAIT;
+ } else if (flags & ENQUEUE_MIGRATED) {
+ /* CPU migration of runnable task */
+ set = TSK_RUNNING;
+ if (p->in_memstall)
+ set |= TSK_MEMSTALL | TSK_MEMSTALL_RUNNING;
+ } else {
+ /* Wakeup of new or sleeping task */
+ if (p->in_iowait)
+ clear |= TSK_IOWAIT;
+ set = TSK_RUNNING;
+ if (p->in_memstall)
+ set |= TSK_MEMSTALL_RUNNING;
+ }
+
+ psi_task_change(p, clear, set);
+}
+
+static inline void psi_dequeue(struct task_struct *p, int flags)
+{
+ if (static_branch_likely(&psi_disabled))
+ return;
+
+ /* Same runqueue, nothing changed for psi */
+ if (flags & DEQUEUE_SAVE)
+ return;
+
+ /*
+ * A voluntary sleep is a dequeue followed by a task switch. To
+ * avoid walking all ancestors twice, psi_task_switch() handles
+ * TSK_RUNNING and TSK_IOWAIT for us when it moves TSK_ONCPU.
+ * Do nothing here.
+ *
+ * In the SCHED_PROXY_EXECUTION case we may do sleeping
+ * dequeues that are not followed by a task switch, so check
+ * TSK_ONCPU is set to ensure the task switch is imminent.
+ * Otherwise clear the flags as usual.
+ */
+ if ((flags & DEQUEUE_SLEEP) && (p->psi_flags & TSK_ONCPU))
+ return;
+
+ /*
+ * When migrating a task to another CPU, clear all psi
+ * state. The enqueue callback above will work it out.
+ */
+ psi_task_change(p, p->psi_flags, 0);
+}
+
+static inline void psi_ttwu_dequeue(struct task_struct *p)
+{
+ if (static_branch_likely(&psi_disabled))
+ return;
+ /*
+ * Is the task being migrated during a wakeup? Make sure to
+ * deregister its sleep-persistent psi states from the old
+ * queue, and let psi_enqueue() know it has to requeue.
+ */
+ if (unlikely(p->psi_flags)) {
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = __task_rq_lock(p, &rf);
+ psi_task_change(p, p->psi_flags, 0);
+ __task_rq_unlock(rq, p, &rf);
+ }
+}
+
+static inline void psi_sched_switch(struct task_struct *prev,
+ struct task_struct *next,
+ bool sleep)
+{
+ if (static_branch_likely(&psi_disabled))
+ return;
+
+ psi_task_switch(prev, next, sleep);
}
+#else /* !CONFIG_PSI: */
+static inline void psi_enqueue(struct task_struct *p, bool migrate) {}
+static inline void psi_dequeue(struct task_struct *p, bool migrate) {}
+static inline void psi_ttwu_dequeue(struct task_struct *p) {}
+static inline void psi_sched_switch(struct task_struct *prev,
+ struct task_struct *next,
+ bool sleep) {}
+static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
+ struct task_struct *prev) {}
+#endif /* !CONFIG_PSI */
+
+#ifdef CONFIG_SCHED_INFO
/*
* We are interested in knowing how long it was from the *first* time a
- * task was queued to the time that it finally hit a cpu, we call this routine
- * from dequeue_task() to account for possible rq->clock skew across cpus. The
- * delta taken on each cpu would annul the skew.
+ * task was queued to the time that it finally hit a CPU, we call this routine
+ * from dequeue_task() to account for possible rq->clock skew across CPUs. The
+ * delta taken on each CPU would annul the skew.
*/
-static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
+static inline void sched_info_dequeue(struct rq *rq, struct task_struct *t)
{
- unsigned long long now = rq_clock(rq), delta = 0;
+ unsigned long long delta = 0;
- if (unlikely(sched_info_on()))
- if (t->sched_info.last_queued)
- delta = now - t->sched_info.last_queued;
- sched_info_reset_dequeued(t);
- t->sched_info.run_delay += delta;
+ if (!t->sched_info.last_queued)
+ return;
- rq_sched_info_dequeued(rq, delta);
+ delta = rq_clock(rq) - t->sched_info.last_queued;
+ t->sched_info.last_queued = 0;
+ t->sched_info.run_delay += delta;
+ if (delta > t->sched_info.max_run_delay)
+ t->sched_info.max_run_delay = delta;
+ if (delta && (!t->sched_info.min_run_delay || delta < t->sched_info.min_run_delay))
+ t->sched_info.min_run_delay = delta;
+ rq_sched_info_dequeue(rq, delta);
}
/*
- * Called when a task finally hits the cpu. We can now calculate how
+ * Called when a task finally hits the CPU. We can now calculate how
* long it was waiting to run. We also note when it began so that we
- * can keep stats on how long its timeslice is.
+ * can keep stats on how long its time-slice is.
*/
static void sched_info_arrive(struct rq *rq, struct task_struct *t)
{
- unsigned long long now = rq_clock(rq), delta = 0;
+ unsigned long long now, delta = 0;
+
+ if (!t->sched_info.last_queued)
+ return;
- if (t->sched_info.last_queued)
- delta = now - t->sched_info.last_queued;
- sched_info_reset_dequeued(t);
+ now = rq_clock(rq);
+ delta = now - t->sched_info.last_queued;
+ t->sched_info.last_queued = 0;
t->sched_info.run_delay += delta;
t->sched_info.last_arrival = now;
t->sched_info.pcount++;
+ if (delta > t->sched_info.max_run_delay)
+ t->sched_info.max_run_delay = delta;
+ if (delta && (!t->sched_info.min_run_delay || delta < t->sched_info.min_run_delay))
+ t->sched_info.min_run_delay = delta;
rq_sched_info_arrive(rq, delta);
}
@@ -94,13 +289,12 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t)
/*
* This function is only called from enqueue_task(), but also only updates
* the timestamp if it is already not set. It's assumed that
- * sched_info_dequeued() will clear that stamp when appropriate.
+ * sched_info_dequeue() will clear that stamp when appropriate.
*/
-static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
+static inline void sched_info_enqueue(struct rq *rq, struct task_struct *t)
{
- if (unlikely(sched_info_on()))
- if (!t->sched_info.last_queued)
- t->sched_info.last_queued = rq_clock(rq);
+ if (!t->sched_info.last_queued)
+ t->sched_info.last_queued = rq_clock(rq);
}
/*
@@ -108,18 +302,17 @@ static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
* due, typically, to expiring its time slice (this may also be called when
* switching to the idle task). Now we can calculate how long we ran.
* Also, if the process is still in the TASK_RUNNING state, call
- * sched_info_queued() to mark that it has now again started waiting on
+ * sched_info_enqueue() to mark that it has now again started waiting on
* the runqueue.
*/
static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
{
- unsigned long long delta = rq_clock(rq) -
- t->sched_info.last_arrival;
+ unsigned long long delta = rq_clock(rq) - t->sched_info.last_arrival;
rq_sched_info_depart(rq, delta);
- if (t->state == TASK_RUNNING)
- sched_info_queued(rq, t);
+ if (task_is_running(t))
+ sched_info_enqueue(rq, t);
}
/*
@@ -128,11 +321,10 @@ static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
* the idle task.) We are only called when prev != next.
*/
static inline void
-__sched_info_switch(struct rq *rq,
- struct task_struct *prev, struct task_struct *next)
+sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)
{
/*
- * prev now departs the cpu. It's not interesting to record
+ * prev now departs the CPU. It's not interesting to record
* stats about how efficient we were at scheduling the idle
* process, however.
*/
@@ -142,126 +334,11 @@ __sched_info_switch(struct rq *rq,
if (next != rq->idle)
sched_info_arrive(rq, next);
}
-static inline void
-sched_info_switch(struct rq *rq,
- struct task_struct *prev, struct task_struct *next)
-{
- if (unlikely(sched_info_on()))
- __sched_info_switch(rq, prev, next);
-}
-#else
-#define sched_info_queued(rq, t) do { } while (0)
-#define sched_info_reset_dequeued(t) do { } while (0)
-#define sched_info_dequeued(rq, t) do { } while (0)
-#define sched_info_depart(rq, t) do { } while (0)
-#define sched_info_arrive(rq, next) do { } while (0)
-#define sched_info_switch(rq, t, next) do { } while (0)
-#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
-/*
- * The following are functions that support scheduler-internal time accounting.
- * These functions are generally called at the timer tick. None of this depends
- * on CONFIG_SCHEDSTATS.
- */
-
-/**
- * cputimer_running - return true if cputimer is running
- *
- * @tsk: Pointer to target task.
- */
-static inline bool cputimer_running(struct task_struct *tsk)
-
-{
- struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
-
- if (!cputimer->running)
- return false;
-
- /*
- * After we flush the task's sum_exec_runtime to sig->sum_sched_runtime
- * in __exit_signal(), we won't account to the signal struct further
- * cputime consumed by that task, even though the task can still be
- * ticking after __exit_signal().
- *
- * In order to keep a consistent behaviour between thread group cputime
- * and thread group cputimer accounting, lets also ignore the cputime
- * elapsing after __exit_signal() in any thread group timer running.
- *
- * This makes sure that POSIX CPU clocks and timers are synchronized, so
- * that a POSIX CPU timer won't expire while the corresponding POSIX CPU
- * clock delta is behind the expiring timer value.
- */
- if (unlikely(!tsk->sighand))
- return false;
-
- return true;
-}
+#else /* !CONFIG_SCHED_INFO: */
+# define sched_info_enqueue(rq, t) do { } while (0)
+# define sched_info_dequeue(rq, t) do { } while (0)
+# define sched_info_switch(rq, t, next) do { } while (0)
+#endif /* !CONFIG_SCHED_INFO */
-/**
- * account_group_user_time - Maintain utime for a thread group.
- *
- * @tsk: Pointer to task structure.
- * @cputime: Time value by which to increment the utime field of the
- * thread_group_cputime structure.
- *
- * If thread group time is being maintained, get the structure for the
- * running CPU and update the utime field there.
- */
-static inline void account_group_user_time(struct task_struct *tsk,
- cputime_t cputime)
-{
- struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
-
- if (!cputimer_running(tsk))
- return;
-
- raw_spin_lock(&cputimer->lock);
- cputimer->cputime.utime += cputime;
- raw_spin_unlock(&cputimer->lock);
-}
-
-/**
- * account_group_system_time - Maintain stime for a thread group.
- *
- * @tsk: Pointer to task structure.
- * @cputime: Time value by which to increment the stime field of the
- * thread_group_cputime structure.
- *
- * If thread group time is being maintained, get the structure for the
- * running CPU and update the stime field there.
- */
-static inline void account_group_system_time(struct task_struct *tsk,
- cputime_t cputime)
-{
- struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
-
- if (!cputimer_running(tsk))
- return;
-
- raw_spin_lock(&cputimer->lock);
- cputimer->cputime.stime += cputime;
- raw_spin_unlock(&cputimer->lock);
-}
-
-/**
- * account_group_exec_runtime - Maintain exec runtime for a thread group.
- *
- * @tsk: Pointer to task structure.
- * @ns: Time value by which to increment the sum_exec_runtime field
- * of the thread_group_cputime structure.
- *
- * If thread group time is being maintained, get the structure for the
- * running CPU and update the sum_exec_runtime field there.
- */
-static inline void account_group_exec_runtime(struct task_struct *tsk,
- unsigned long long ns)
-{
- struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
-
- if (!cputimer_running(tsk))
- return;
-
- raw_spin_lock(&cputimer->lock);
- cputimer->cputime.sum_exec_runtime += ns;
- raw_spin_unlock(&cputimer->lock);
-}
+#endif /* _KERNEL_STATS_H */
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 79ffec45a6ac..4f9192be4b5b 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -1,5 +1,4 @@
-#include "sched.h"
-
+// SPDX-License-Identifier: GPL-2.0
/*
* stop-task scheduling class.
*
@@ -8,34 +7,37 @@
*
* See kernel/stop_machine.c
*/
+#include "sched.h"
-#ifdef CONFIG_SMP
static int
-select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags)
+select_task_rq_stop(struct task_struct *p, int cpu, int flags)
{
return task_cpu(p); /* stop tasks as never migrate */
}
-#endif /* CONFIG_SMP */
+
+static int
+balance_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+{
+ return sched_stop_runnable(rq);
+}
static void
-check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
+wakeup_preempt_stop(struct rq *rq, struct task_struct *p, int flags)
{
/* we're never preempted */
}
-static struct task_struct *
-pick_next_task_stop(struct rq *rq, struct task_struct *prev)
+static void set_next_task_stop(struct rq *rq, struct task_struct *stop, bool first)
{
- struct task_struct *stop = rq->stop;
+ stop->se.exec_start = rq_clock_task(rq);
+}
- if (!stop || !task_on_rq_queued(stop))
+static struct task_struct *pick_task_stop(struct rq *rq, struct rq_flags *rf)
+{
+ if (!sched_stop_runnable(rq))
return NULL;
- put_prev_task(rq, prev);
-
- stop->se.exec_start = rq_clock_task(rq);
-
- return stop;
+ return rq->stop;
}
static void
@@ -44,10 +46,11 @@ enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
add_nr_running(rq, 1);
}
-static void
+static bool
dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
{
sub_nr_running(rq, 1);
+ return true;
}
static void yield_task_stop(struct rq *rq)
@@ -55,51 +58,35 @@ static void yield_task_stop(struct rq *rq)
BUG(); /* the stop task should never yield, its pointless. */
}
-static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
+static void put_prev_task_stop(struct rq *rq, struct task_struct *prev, struct task_struct *next)
{
- struct task_struct *curr = rq->curr;
- u64 delta_exec;
-
- delta_exec = rq_clock_task(rq) - curr->se.exec_start;
- if (unlikely((s64)delta_exec < 0))
- delta_exec = 0;
-
- schedstat_set(curr->se.statistics.exec_max,
- max(curr->se.statistics.exec_max, delta_exec));
-
- curr->se.sum_exec_runtime += delta_exec;
- account_group_exec_runtime(curr, delta_exec);
-
- curr->se.exec_start = rq_clock_task(rq);
- cpuacct_charge(curr, delta_exec);
+ update_curr_common(rq);
}
+/*
+ * scheduler tick hitting a task of our scheduling class.
+ *
+ * NOTE: This function can be called remotely by the tick offload that
+ * goes along full dynticks. Therefore no local assumption can be made
+ * and everything must be accessed through the @rq and @curr passed in
+ * parameters.
+ */
static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
{
}
-static void set_curr_task_stop(struct rq *rq)
-{
- struct task_struct *stop = rq->stop;
-
- stop->se.exec_start = rq_clock_task(rq);
-}
-
-static void switched_to_stop(struct rq *rq, struct task_struct *p)
+static void switching_to_stop(struct rq *rq, struct task_struct *p)
{
BUG(); /* its impossible to change to this class */
}
static void
-prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio)
+prio_changed_stop(struct rq *rq, struct task_struct *p, u64 oldprio)
{
- BUG(); /* how!?, what priority? */
-}
+ if (p->prio == oldprio)
+ return;
-static unsigned int
-get_rr_interval_stop(struct rq *rq, struct task_struct *task)
-{
- return 0;
+ BUG(); /* how!?, what priority? */
}
static void update_curr_stop(struct rq *rq)
@@ -109,28 +96,27 @@ static void update_curr_stop(struct rq *rq)
/*
* Simple, special scheduling class for the per-CPU stop tasks:
*/
-const struct sched_class stop_sched_class = {
- .next = &dl_sched_class,
+DEFINE_SCHED_CLASS(stop) = {
+
+ .queue_mask = 16,
.enqueue_task = enqueue_task_stop,
.dequeue_task = dequeue_task_stop,
.yield_task = yield_task_stop,
- .check_preempt_curr = check_preempt_curr_stop,
+ .wakeup_preempt = wakeup_preempt_stop,
- .pick_next_task = pick_next_task_stop,
+ .pick_task = pick_task_stop,
.put_prev_task = put_prev_task_stop,
+ .set_next_task = set_next_task_stop,
-#ifdef CONFIG_SMP
+ .balance = balance_stop,
.select_task_rq = select_task_rq_stop,
-#endif
+ .set_cpus_allowed = set_cpus_allowed_common,
- .set_curr_task = set_curr_task_stop,
.task_tick = task_tick_stop,
- .get_rr_interval = get_rr_interval_stop,
-
.prio_changed = prio_changed_stop,
- .switched_to = switched_to_stop,
+ .switching_to = switching_to_stop,
.update_curr = update_curr_stop,
};
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
new file mode 100644
index 000000000000..0fef6496c4c8
--- /dev/null
+++ b/kernel/sched/swait.c
@@ -0,0 +1,145 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * <linux/swait.h> (simple wait queues ) implementation:
+ */
+#include "sched.h"
+
+void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
+ struct lock_class_key *key)
+{
+ raw_spin_lock_init(&q->lock);
+ lockdep_set_class_and_name(&q->lock, key, name);
+ INIT_LIST_HEAD(&q->task_list);
+}
+EXPORT_SYMBOL(__init_swait_queue_head);
+
+/*
+ * The thing about the wake_up_state() return value; I think we can ignore it.
+ *
+ * If for some reason it would return 0, that means the previously waiting
+ * task is already running, so it will observe condition true (or has already).
+ */
+void swake_up_locked(struct swait_queue_head *q, int wake_flags)
+{
+ struct swait_queue *curr;
+
+ if (list_empty(&q->task_list))
+ return;
+
+ curr = list_first_entry(&q->task_list, typeof(*curr), task_list);
+ try_to_wake_up(curr->task, TASK_NORMAL, wake_flags);
+ list_del_init(&curr->task_list);
+}
+EXPORT_SYMBOL(swake_up_locked);
+
+/*
+ * Wake up all waiters. This is an interface which is solely exposed for
+ * completions and not for general usage.
+ *
+ * It is intentionally different from swake_up_all() to allow usage from
+ * hard interrupt context and interrupt disabled regions.
+ */
+void swake_up_all_locked(struct swait_queue_head *q)
+{
+ while (!list_empty(&q->task_list))
+ swake_up_locked(q, 0);
+}
+
+void swake_up_one(struct swait_queue_head *q)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&q->lock, flags);
+ swake_up_locked(q, 0);
+ raw_spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(swake_up_one);
+
+/*
+ * Does not allow usage from IRQ disabled, since we must be able to
+ * release IRQs to guarantee bounded hold time.
+ */
+void swake_up_all(struct swait_queue_head *q)
+{
+ struct swait_queue *curr;
+ LIST_HEAD(tmp);
+
+ raw_spin_lock_irq(&q->lock);
+ list_splice_init(&q->task_list, &tmp);
+ while (!list_empty(&tmp)) {
+ curr = list_first_entry(&tmp, typeof(*curr), task_list);
+
+ wake_up_state(curr->task, TASK_NORMAL);
+ list_del_init(&curr->task_list);
+
+ if (list_empty(&tmp))
+ break;
+
+ raw_spin_unlock_irq(&q->lock);
+ raw_spin_lock_irq(&q->lock);
+ }
+ raw_spin_unlock_irq(&q->lock);
+}
+EXPORT_SYMBOL(swake_up_all);
+
+void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait)
+{
+ wait->task = current;
+ if (list_empty(&wait->task_list))
+ list_add_tail(&wait->task_list, &q->task_list);
+}
+
+void prepare_to_swait_exclusive(struct swait_queue_head *q, struct swait_queue *wait, int state)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&q->lock, flags);
+ __prepare_to_swait(q, wait);
+ set_current_state(state);
+ raw_spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(prepare_to_swait_exclusive);
+
+long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state)
+{
+ unsigned long flags;
+ long ret = 0;
+
+ raw_spin_lock_irqsave(&q->lock, flags);
+ if (signal_pending_state(state, current)) {
+ /*
+ * See prepare_to_wait_event(). TL;DR, subsequent swake_up_one()
+ * must not see us.
+ */
+ list_del_init(&wait->task_list);
+ ret = -ERESTARTSYS;
+ } else {
+ __prepare_to_swait(q, wait);
+ set_current_state(state);
+ }
+ raw_spin_unlock_irqrestore(&q->lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(prepare_to_swait_event);
+
+void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait)
+{
+ __set_current_state(TASK_RUNNING);
+ if (!list_empty(&wait->task_list))
+ list_del_init(&wait->task_list);
+}
+
+void finish_swait(struct swait_queue_head *q, struct swait_queue *wait)
+{
+ unsigned long flags;
+
+ __set_current_state(TASK_RUNNING);
+
+ if (!list_empty_careful(&wait->task_list)) {
+ raw_spin_lock_irqsave(&q->lock, flags);
+ list_del_init(&wait->task_list);
+ raw_spin_unlock_irqrestore(&q->lock, flags);
+ }
+}
+EXPORT_SYMBOL(finish_swait);
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
new file mode 100644
index 000000000000..0496dc29ed0f
--- /dev/null
+++ b/kernel/sched/syscalls.c
@@ -0,0 +1,1570 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * kernel/sched/syscalls.c
+ *
+ * Core kernel scheduler syscalls related code
+ *
+ * Copyright (C) 1991-2002 Linus Torvalds
+ * Copyright (C) 1998-2024 Ingo Molnar, Red Hat
+ */
+#include <linux/sched.h>
+#include <linux/cpuset.h>
+#include <linux/sched/debug.h>
+
+#include <uapi/linux/sched/types.h>
+
+#include "sched.h"
+#include "autogroup.h"
+
+static inline int __normal_prio(int policy, int rt_prio, int nice)
+{
+ int prio;
+
+ if (dl_policy(policy))
+ prio = MAX_DL_PRIO - 1;
+ else if (rt_policy(policy))
+ prio = MAX_RT_PRIO - 1 - rt_prio;
+ else
+ prio = NICE_TO_PRIO(nice);
+
+ return prio;
+}
+
+/*
+ * Calculate the expected normal priority: i.e. priority
+ * without taking RT-inheritance into account. Might be
+ * boosted by interactivity modifiers. Changes upon fork,
+ * setprio syscalls, and whenever the interactivity
+ * estimator recalculates.
+ */
+static inline int normal_prio(struct task_struct *p)
+{
+ return __normal_prio(p->policy, p->rt_priority, PRIO_TO_NICE(p->static_prio));
+}
+
+/*
+ * Calculate the current priority, i.e. the priority
+ * taken into account by the scheduler. This value might
+ * be boosted by RT tasks, or might be boosted by
+ * interactivity modifiers. Will be RT if the task got
+ * RT-boosted. If not then it returns p->normal_prio.
+ */
+static int effective_prio(struct task_struct *p)
+{
+ p->normal_prio = normal_prio(p);
+ /*
+ * If we are RT tasks or we were boosted to RT priority,
+ * keep the priority unchanged. Otherwise, update priority
+ * to the normal priority:
+ */
+ if (!rt_or_dl_prio(p->prio))
+ return p->normal_prio;
+ return p->prio;
+}
+
+void set_user_nice(struct task_struct *p, long nice)
+{
+ int old_prio;
+
+ if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
+ return;
+ /*
+ * We have to be careful, if called from sys_setpriority(),
+ * the task might be in the middle of scheduling on another CPU.
+ */
+ guard(task_rq_lock)(p);
+
+ /*
+ * The RT priorities are set via sched_setscheduler(), but we still
+ * allow the 'normal' nice value to be set - but as expected
+ * it won't have any effect on scheduling until the task is
+ * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR:
+ */
+ if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
+ p->static_prio = NICE_TO_PRIO(nice);
+ return;
+ }
+
+ scoped_guard (sched_change, p, DEQUEUE_SAVE) {
+ p->static_prio = NICE_TO_PRIO(nice);
+ set_load_weight(p, true);
+ old_prio = p->prio;
+ p->prio = effective_prio(p);
+ }
+}
+EXPORT_SYMBOL(set_user_nice);
+
+/*
+ * is_nice_reduction - check if nice value is an actual reduction
+ *
+ * Similar to can_nice() but does not perform a capability check.
+ *
+ * @p: task
+ * @nice: nice value
+ */
+static bool is_nice_reduction(const struct task_struct *p, const int nice)
+{
+ /* Convert nice value [19,-20] to rlimit style value [1,40]: */
+ int nice_rlim = nice_to_rlimit(nice);
+
+ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE));
+}
+
+/*
+ * can_nice - check if a task can reduce its nice value
+ * @p: task
+ * @nice: nice value
+ */
+int can_nice(const struct task_struct *p, const int nice)
+{
+ return is_nice_reduction(p, nice) || capable(CAP_SYS_NICE);
+}
+
+#ifdef __ARCH_WANT_SYS_NICE
+
+/*
+ * sys_nice - change the priority of the current process.
+ * @increment: priority increment
+ *
+ * sys_setpriority is a more generic, but much slower function that
+ * does similar things.
+ */
+SYSCALL_DEFINE1(nice, int, increment)
+{
+ long nice, retval;
+
+ /*
+ * Setpriority might change our priority at the same moment.
+ * We don't have to worry. Conceptually one call occurs first
+ * and we have a single winner.
+ */
+ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
+ nice = task_nice(current) + increment;
+
+ nice = clamp_val(nice, MIN_NICE, MAX_NICE);
+ if (increment < 0 && !can_nice(current, nice))
+ return -EPERM;
+
+ retval = security_task_setnice(current, nice);
+ if (retval)
+ return retval;
+
+ set_user_nice(current, nice);
+ return 0;
+}
+
+#endif /* __ARCH_WANT_SYS_NICE */
+
+/**
+ * task_prio - return the priority value of a given task.
+ * @p: the task in question.
+ *
+ * Return: The priority value as seen by users in /proc.
+ *
+ * sched policy return value kernel prio user prio/nice
+ *
+ * normal, batch, idle [0 ... 39] [100 ... 139] 0/[-20 ... 19]
+ * fifo, rr [-2 ... -100] [98 ... 0] [1 ... 99]
+ * deadline -101 -1 0
+ */
+int task_prio(const struct task_struct *p)
+{
+ return p->prio - MAX_RT_PRIO;
+}
+
+/**
+ * idle_cpu - is a given CPU idle currently?
+ * @cpu: the processor in question.
+ *
+ * Return: 1 if the CPU is currently idle. 0 otherwise.
+ */
+int idle_cpu(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ if (rq->curr != rq->idle)
+ return 0;
+
+ if (rq->nr_running)
+ return 0;
+
+ if (rq->ttwu_pending)
+ return 0;
+
+ return 1;
+}
+
+/**
+ * available_idle_cpu - is a given CPU idle for enqueuing work.
+ * @cpu: the CPU in question.
+ *
+ * Return: 1 if the CPU is currently idle. 0 otherwise.
+ */
+int available_idle_cpu(int cpu)
+{
+ if (!idle_cpu(cpu))
+ return 0;
+
+ if (vcpu_is_preempted(cpu))
+ return 0;
+
+ return 1;
+}
+
+/**
+ * idle_task - return the idle task for a given CPU.
+ * @cpu: the processor in question.
+ *
+ * Return: The idle task for the CPU @cpu.
+ */
+struct task_struct *idle_task(int cpu)
+{
+ return cpu_rq(cpu)->idle;
+}
+
+#ifdef CONFIG_SCHED_CORE
+int sched_core_idle_cpu(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ if (sched_core_enabled(rq) && rq->curr == rq->idle)
+ return 1;
+
+ return idle_cpu(cpu);
+}
+#endif /* CONFIG_SCHED_CORE */
+
+/**
+ * find_process_by_pid - find a process with a matching PID value.
+ * @pid: the pid in question.
+ *
+ * The task of @pid, if found. %NULL otherwise.
+ */
+static struct task_struct *find_process_by_pid(pid_t pid)
+{
+ return pid ? find_task_by_vpid(pid) : current;
+}
+
+static struct task_struct *find_get_task(pid_t pid)
+{
+ struct task_struct *p;
+ guard(rcu)();
+
+ p = find_process_by_pid(pid);
+ if (likely(p))
+ get_task_struct(p);
+
+ return p;
+}
+
+DEFINE_CLASS(find_get_task, struct task_struct *, if (_T) put_task_struct(_T),
+ find_get_task(pid), pid_t pid)
+
+/*
+ * sched_setparam() passes in -1 for its policy, to let the functions
+ * it calls know not to change it.
+ */
+#define SETPARAM_POLICY -1
+
+static void __setscheduler_params(struct task_struct *p,
+ const struct sched_attr *attr)
+{
+ int policy = attr->sched_policy;
+
+ if (policy == SETPARAM_POLICY)
+ policy = p->policy;
+
+ p->policy = policy;
+
+ if (dl_policy(policy))
+ __setparam_dl(p, attr);
+ else if (fair_policy(policy))
+ __setparam_fair(p, attr);
+
+ /* rt-policy tasks do not have a timerslack */
+ if (rt_or_dl_task_policy(p)) {
+ p->timer_slack_ns = 0;
+ } else if (p->timer_slack_ns == 0) {
+ /* when switching back to non-rt policy, restore timerslack */
+ p->timer_slack_ns = p->default_timer_slack_ns;
+ }
+
+ /*
+ * __sched_setscheduler() ensures attr->sched_priority == 0 when
+ * !rt_policy. Always setting this ensures that things like
+ * getparam()/getattr() don't report silly values for !rt tasks.
+ */
+ p->rt_priority = attr->sched_priority;
+ p->normal_prio = normal_prio(p);
+ set_load_weight(p, true);
+}
+
+/*
+ * Check the target process has a UID that matches the current process's:
+ */
+static bool check_same_owner(struct task_struct *p)
+{
+ const struct cred *cred = current_cred(), *pcred;
+ guard(rcu)();
+
+ pcred = __task_cred(p);
+ return (uid_eq(cred->euid, pcred->euid) ||
+ uid_eq(cred->euid, pcred->uid));
+}
+
+#ifdef CONFIG_UCLAMP_TASK
+
+static int uclamp_validate(struct task_struct *p,
+ const struct sched_attr *attr)
+{
+ int util_min = p->uclamp_req[UCLAMP_MIN].value;
+ int util_max = p->uclamp_req[UCLAMP_MAX].value;
+
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
+ util_min = attr->sched_util_min;
+
+ if (util_min + 1 > SCHED_CAPACITY_SCALE + 1)
+ return -EINVAL;
+ }
+
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
+ util_max = attr->sched_util_max;
+
+ if (util_max + 1 > SCHED_CAPACITY_SCALE + 1)
+ return -EINVAL;
+ }
+
+ if (util_min != -1 && util_max != -1 && util_min > util_max)
+ return -EINVAL;
+
+ /*
+ * We have valid uclamp attributes; make sure uclamp is enabled.
+ *
+ * We need to do that here, because enabling static branches is a
+ * blocking operation which obviously cannot be done while holding
+ * scheduler locks.
+ */
+ sched_uclamp_enable();
+
+ return 0;
+}
+
+static bool uclamp_reset(const struct sched_attr *attr,
+ enum uclamp_id clamp_id,
+ struct uclamp_se *uc_se)
+{
+ /* Reset on sched class change for a non user-defined clamp value. */
+ if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)) &&
+ !uc_se->user_defined)
+ return true;
+
+ /* Reset on sched_util_{min,max} == -1. */
+ if (clamp_id == UCLAMP_MIN &&
+ attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
+ attr->sched_util_min == -1) {
+ return true;
+ }
+
+ if (clamp_id == UCLAMP_MAX &&
+ attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
+ attr->sched_util_max == -1) {
+ return true;
+ }
+
+ return false;
+}
+
+static void __setscheduler_uclamp(struct task_struct *p,
+ const struct sched_attr *attr)
+{
+ enum uclamp_id clamp_id;
+
+ for_each_clamp_id(clamp_id) {
+ struct uclamp_se *uc_se = &p->uclamp_req[clamp_id];
+ unsigned int value;
+
+ if (!uclamp_reset(attr, clamp_id, uc_se))
+ continue;
+
+ /*
+ * RT by default have a 100% boost value that could be modified
+ * at runtime.
+ */
+ if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
+ value = sysctl_sched_uclamp_util_min_rt_default;
+ else
+ value = uclamp_none(clamp_id);
+
+ uclamp_se_set(uc_se, value, false);
+
+ }
+
+ if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
+ return;
+
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
+ attr->sched_util_min != -1) {
+ uclamp_se_set(&p->uclamp_req[UCLAMP_MIN],
+ attr->sched_util_min, true);
+ }
+
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
+ attr->sched_util_max != -1) {
+ uclamp_se_set(&p->uclamp_req[UCLAMP_MAX],
+ attr->sched_util_max, true);
+ }
+}
+
+#else /* !CONFIG_UCLAMP_TASK: */
+
+static inline int uclamp_validate(struct task_struct *p,
+ const struct sched_attr *attr)
+{
+ return -EOPNOTSUPP;
+}
+static void __setscheduler_uclamp(struct task_struct *p,
+ const struct sched_attr *attr) { }
+#endif /* !CONFIG_UCLAMP_TASK */
+
+/*
+ * Allow unprivileged RT tasks to decrease priority.
+ * Only issue a capable test if needed and only once to avoid an audit
+ * event on permitted non-privileged operations:
+ */
+static int user_check_sched_setscheduler(struct task_struct *p,
+ const struct sched_attr *attr,
+ int policy, int reset_on_fork)
+{
+ if (fair_policy(policy)) {
+ if (attr->sched_nice < task_nice(p) &&
+ !is_nice_reduction(p, attr->sched_nice))
+ goto req_priv;
+ }
+
+ if (rt_policy(policy)) {
+ unsigned long rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
+
+ /* Can't set/change the rt policy: */
+ if (policy != p->policy && !rlim_rtprio)
+ goto req_priv;
+
+ /* Can't increase priority: */
+ if (attr->sched_priority > p->rt_priority &&
+ attr->sched_priority > rlim_rtprio)
+ goto req_priv;
+ }
+
+ /*
+ * Can't set/change SCHED_DEADLINE policy at all for now
+ * (safest behavior); in the future we would like to allow
+ * unprivileged DL tasks to increase their relative deadline
+ * or reduce their runtime (both ways reducing utilization)
+ */
+ if (dl_policy(policy))
+ goto req_priv;
+
+ /*
+ * Treat SCHED_IDLE as nice 20. Only allow a switch to
+ * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
+ */
+ if (task_has_idle_policy(p) && !idle_policy(policy)) {
+ if (!is_nice_reduction(p, task_nice(p)))
+ goto req_priv;
+ }
+
+ /* Can't change other user's priorities: */
+ if (!check_same_owner(p))
+ goto req_priv;
+
+ /* Normal users shall not reset the sched_reset_on_fork flag: */
+ if (p->sched_reset_on_fork && !reset_on_fork)
+ goto req_priv;
+
+ return 0;
+
+req_priv:
+ if (!capable(CAP_SYS_NICE))
+ return -EPERM;
+
+ return 0;
+}
+
+int __sched_setscheduler(struct task_struct *p,
+ const struct sched_attr *attr,
+ bool user, bool pi)
+{
+ int oldpolicy = -1, policy = attr->sched_policy;
+ int retval, oldprio, newprio;
+ const struct sched_class *prev_class, *next_class;
+ struct balance_callback *head;
+ struct rq_flags rf;
+ int reset_on_fork;
+ int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
+ struct rq *rq;
+ bool cpuset_locked = false;
+
+ /* The pi code expects interrupts enabled */
+ BUG_ON(pi && in_interrupt());
+recheck:
+ /* Double check policy once rq lock held: */
+ if (policy < 0) {
+ reset_on_fork = p->sched_reset_on_fork;
+ policy = oldpolicy = p->policy;
+ } else {
+ reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
+
+ if (!valid_policy(policy))
+ return -EINVAL;
+ }
+
+ if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV))
+ return -EINVAL;
+
+ /*
+ * Valid priorities for SCHED_FIFO and SCHED_RR are
+ * 1..MAX_RT_PRIO-1, valid priority for SCHED_NORMAL,
+ * SCHED_BATCH and SCHED_IDLE is 0.
+ */
+ if (attr->sched_priority > MAX_RT_PRIO-1)
+ return -EINVAL;
+ if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
+ (rt_policy(policy) != (attr->sched_priority != 0)))
+ return -EINVAL;
+
+ if (user) {
+ retval = user_check_sched_setscheduler(p, attr, policy, reset_on_fork);
+ if (retval)
+ return retval;
+
+ if (attr->sched_flags & SCHED_FLAG_SUGOV)
+ return -EINVAL;
+
+ retval = security_task_setscheduler(p);
+ if (retval)
+ return retval;
+ }
+
+ /* Update task specific "requested" clamps */
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) {
+ retval = uclamp_validate(p, attr);
+ if (retval)
+ return retval;
+ }
+
+ /*
+ * SCHED_DEADLINE bandwidth accounting relies on stable cpusets
+ * information.
+ */
+ if (dl_policy(policy) || dl_policy(p->policy)) {
+ cpuset_locked = true;
+ cpuset_lock();
+ }
+
+ /*
+ * Make sure no PI-waiters arrive (or leave) while we are
+ * changing the priority of the task:
+ *
+ * To be able to change p->policy safely, the appropriate
+ * runqueue lock must be held.
+ */
+ rq = task_rq_lock(p, &rf);
+ update_rq_clock(rq);
+
+ /*
+ * Changing the policy of the stop threads its a very bad idea:
+ */
+ if (p == rq->stop) {
+ retval = -EINVAL;
+ goto unlock;
+ }
+
+ retval = scx_check_setscheduler(p, policy);
+ if (retval)
+ goto unlock;
+
+ /*
+ * If not changing anything there's no need to proceed further,
+ * but store a possible modification of reset_on_fork.
+ */
+ if (unlikely(policy == p->policy)) {
+ if (fair_policy(policy) &&
+ (attr->sched_nice != task_nice(p) ||
+ (attr->sched_runtime != p->se.slice)))
+ goto change;
+ if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
+ goto change;
+ if (dl_policy(policy) && dl_param_changed(p, attr))
+ goto change;
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
+ goto change;
+
+ p->sched_reset_on_fork = reset_on_fork;
+ retval = 0;
+ goto unlock;
+ }
+change:
+
+ if (user) {
+#ifdef CONFIG_RT_GROUP_SCHED
+ /*
+ * Do not allow real-time tasks into groups that have no runtime
+ * assigned.
+ */
+ if (rt_group_sched_enabled() &&
+ rt_bandwidth_enabled() && rt_policy(policy) &&
+ task_group(p)->rt_bandwidth.rt_runtime == 0 &&
+ !task_group_is_autogroup(task_group(p))) {
+ retval = -EPERM;
+ goto unlock;
+ }
+#endif /* CONFIG_RT_GROUP_SCHED */
+ if (dl_bandwidth_enabled() && dl_policy(policy) &&
+ !(attr->sched_flags & SCHED_FLAG_SUGOV)) {
+ cpumask_t *span = rq->rd->span;
+
+ /*
+ * Don't allow tasks with an affinity mask smaller than
+ * the entire root_domain to become SCHED_DEADLINE. We
+ * will also fail if there's no bandwidth available.
+ */
+ if (!cpumask_subset(span, p->cpus_ptr) ||
+ rq->rd->dl_bw.bw == 0) {
+ retval = -EPERM;
+ goto unlock;
+ }
+ }
+ }
+
+ /* Re-check policy now with rq lock held: */
+ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
+ policy = oldpolicy = -1;
+ task_rq_unlock(rq, p, &rf);
+ if (cpuset_locked)
+ cpuset_unlock();
+ goto recheck;
+ }
+
+ /*
+ * If setscheduling to SCHED_DEADLINE (or changing the parameters
+ * of a SCHED_DEADLINE task) we need to check if enough bandwidth
+ * is available.
+ */
+ if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
+ retval = -EBUSY;
+ goto unlock;
+ }
+
+ p->sched_reset_on_fork = reset_on_fork;
+ oldprio = p->prio;
+
+ newprio = __normal_prio(policy, attr->sched_priority, attr->sched_nice);
+ if (pi) {
+ /*
+ * Take priority boosted tasks into account. If the new
+ * effective priority is unchanged, we just store the new
+ * normal parameters and do not touch the scheduler class and
+ * the runqueue. This will be done when the task deboost
+ * itself.
+ */
+ newprio = rt_effective_prio(p, newprio);
+ if (newprio == oldprio)
+ queue_flags &= ~DEQUEUE_MOVE;
+ }
+
+ prev_class = p->sched_class;
+ next_class = __setscheduler_class(policy, newprio);
+
+ if (prev_class != next_class)
+ queue_flags |= DEQUEUE_CLASS;
+
+ scoped_guard (sched_change, p, queue_flags) {
+
+ if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
+ __setscheduler_params(p, attr);
+ p->sched_class = next_class;
+ p->prio = newprio;
+ }
+ __setscheduler_uclamp(p, attr);
+
+ if (scope->queued) {
+ /*
+ * We enqueue to tail when the priority of a task is
+ * increased (user space view).
+ */
+ if (oldprio < p->prio)
+ scope->flags |= ENQUEUE_HEAD;
+ }
+ }
+
+ /* Avoid rq from going away on us: */
+ preempt_disable();
+ head = splice_balance_callbacks(rq);
+ task_rq_unlock(rq, p, &rf);
+
+ if (pi) {
+ if (cpuset_locked)
+ cpuset_unlock();
+ rt_mutex_adjust_pi(p);
+ }
+
+ /* Run balance callbacks after we've adjusted the PI chain: */
+ balance_callbacks(rq, head);
+ preempt_enable();
+
+ return 0;
+
+unlock:
+ task_rq_unlock(rq, p, &rf);
+ if (cpuset_locked)
+ cpuset_unlock();
+ return retval;
+}
+
+static int _sched_setscheduler(struct task_struct *p, int policy,
+ const struct sched_param *param, bool check)
+{
+ struct sched_attr attr = {
+ .sched_policy = policy,
+ .sched_priority = param->sched_priority,
+ .sched_nice = PRIO_TO_NICE(p->static_prio),
+ };
+
+ if (p->se.custom_slice)
+ attr.sched_runtime = p->se.slice;
+
+ /* Fixup the legacy SCHED_RESET_ON_FORK hack. */
+ if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
+ attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
+ policy &= ~SCHED_RESET_ON_FORK;
+ attr.sched_policy = policy;
+ }
+
+ return __sched_setscheduler(p, &attr, check, true);
+}
+/**
+ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
+ * @p: the task in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * Use sched_set_fifo(), read its comment.
+ *
+ * Return: 0 on success. An error code otherwise.
+ *
+ * NOTE that the task may be already dead.
+ */
+int sched_setscheduler(struct task_struct *p, int policy,
+ const struct sched_param *param)
+{
+ return _sched_setscheduler(p, policy, param, true);
+}
+
+int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
+{
+ return __sched_setscheduler(p, attr, true, true);
+}
+
+int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
+{
+ return __sched_setscheduler(p, attr, false, true);
+}
+EXPORT_SYMBOL_GPL(sched_setattr_nocheck);
+
+/**
+ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernel-space.
+ * @p: the task in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * Just like sched_setscheduler, only don't bother checking if the
+ * current context has permission. For example, this is needed in
+ * stop_machine(): we create temporary high priority worker threads,
+ * but our caller might not have that capability.
+ *
+ * Return: 0 on success. An error code otherwise.
+ */
+int sched_setscheduler_nocheck(struct task_struct *p, int policy,
+ const struct sched_param *param)
+{
+ return _sched_setscheduler(p, policy, param, false);
+}
+
+/*
+ * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally
+ * incapable of resource management, which is the one thing an OS really should
+ * be doing.
+ *
+ * This is of course the reason it is limited to privileged users only.
+ *
+ * Worse still; it is fundamentally impossible to compose static priority
+ * workloads. You cannot take two correctly working static prio workloads
+ * and smash them together and still expect them to work.
+ *
+ * For this reason 'all' FIFO tasks the kernel creates are basically at:
+ *
+ * MAX_RT_PRIO / 2
+ *
+ * The administrator _MUST_ configure the system, the kernel simply doesn't
+ * know enough information to make a sensible choice.
+ */
+void sched_set_fifo(struct task_struct *p)
+{
+ struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 };
+ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
+}
+EXPORT_SYMBOL_GPL(sched_set_fifo);
+
+/*
+ * For when you don't much care about FIFO, but want to be above SCHED_NORMAL.
+ */
+void sched_set_fifo_low(struct task_struct *p)
+{
+ struct sched_param sp = { .sched_priority = 1 };
+ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
+}
+EXPORT_SYMBOL_GPL(sched_set_fifo_low);
+
+/*
+ * Used when the primary interrupt handler is forced into a thread, in addition
+ * to the (always threaded) secondary handler. The secondary handler gets a
+ * slightly lower priority so that the primary handler can preempt it, thereby
+ * emulating the behavior of a non-PREEMPT_RT system where the primary handler
+ * runs in hard interrupt context.
+ */
+void sched_set_fifo_secondary(struct task_struct *p)
+{
+ struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 - 1 };
+ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
+}
+
+void sched_set_normal(struct task_struct *p, int nice)
+{
+ struct sched_attr attr = {
+ .sched_policy = SCHED_NORMAL,
+ .sched_nice = nice,
+ };
+ WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0);
+}
+EXPORT_SYMBOL_GPL(sched_set_normal);
+
+static int
+do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
+{
+ struct sched_param lparam;
+
+ if (unlikely(!param || pid < 0))
+ return -EINVAL;
+ if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
+ return -EFAULT;
+
+ CLASS(find_get_task, p)(pid);
+ if (!p)
+ return -ESRCH;
+
+ return sched_setscheduler(p, policy, &lparam);
+}
+
+/*
+ * Mimics kernel/events/core.c perf_copy_attr().
+ */
+static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr)
+{
+ u32 size;
+ int ret;
+
+ /* Zero the full structure, so that a short copy will be nice: */
+ memset(attr, 0, sizeof(*attr));
+
+ ret = get_user(size, &uattr->size);
+ if (ret)
+ return ret;
+
+ /* ABI compatibility quirk: */
+ if (!size)
+ size = SCHED_ATTR_SIZE_VER0;
+ if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE)
+ goto err_size;
+
+ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
+ if (ret) {
+ if (ret == -E2BIG)
+ goto err_size;
+ return ret;
+ }
+
+ if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) &&
+ size < SCHED_ATTR_SIZE_VER1)
+ return -EINVAL;
+
+ /*
+ * XXX: Do we want to be lenient like existing syscalls; or do we want
+ * to be strict and return an error on out-of-bounds values?
+ */
+ attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
+
+ return 0;
+
+err_size:
+ put_user(sizeof(*attr), &uattr->size);
+ return -E2BIG;
+}
+
+static void get_params(struct task_struct *p, struct sched_attr *attr)
+{
+ if (task_has_dl_policy(p)) {
+ __getparam_dl(p, attr);
+ } else if (task_has_rt_policy(p)) {
+ attr->sched_priority = p->rt_priority;
+ } else {
+ attr->sched_nice = task_nice(p);
+ attr->sched_runtime = p->se.slice;
+ }
+}
+
+/**
+ * sys_sched_setscheduler - set/change the scheduler policy and RT priority
+ * @pid: the pid in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * Return: 0 on success. An error code otherwise.
+ */
+SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param)
+{
+ if (policy < 0)
+ return -EINVAL;
+
+ return do_sched_setscheduler(pid, policy, param);
+}
+
+/**
+ * sys_sched_setparam - set/change the RT priority of a thread
+ * @pid: the pid in question.
+ * @param: structure containing the new RT priority.
+ *
+ * Return: 0 on success. An error code otherwise.
+ */
+SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
+{
+ return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
+}
+
+/**
+ * sys_sched_setattr - same as above, but with extended sched_attr
+ * @pid: the pid in question.
+ * @uattr: structure containing the extended parameters.
+ * @flags: for future extension.
+ */
+SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
+ unsigned int, flags)
+{
+ struct sched_attr attr;
+ int retval;
+
+ if (unlikely(!uattr || pid < 0 || flags))
+ return -EINVAL;
+
+ retval = sched_copy_attr(uattr, &attr);
+ if (retval)
+ return retval;
+
+ if ((int)attr.sched_policy < 0)
+ return -EINVAL;
+ if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY)
+ attr.sched_policy = SETPARAM_POLICY;
+
+ CLASS(find_get_task, p)(pid);
+ if (!p)
+ return -ESRCH;
+
+ if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
+ get_params(p, &attr);
+
+ return sched_setattr(p, &attr);
+}
+
+/**
+ * sys_sched_getscheduler - get the policy (scheduling class) of a thread
+ * @pid: the pid in question.
+ *
+ * Return: On success, the policy of the thread. Otherwise, a negative error
+ * code.
+ */
+SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
+{
+ struct task_struct *p;
+ int retval;
+
+ if (pid < 0)
+ return -EINVAL;
+
+ guard(rcu)();
+ p = find_process_by_pid(pid);
+ if (!p)
+ return -ESRCH;
+
+ retval = security_task_getscheduler(p);
+ if (!retval) {
+ retval = p->policy;
+ if (p->sched_reset_on_fork)
+ retval |= SCHED_RESET_ON_FORK;
+ }
+ return retval;
+}
+
+/**
+ * sys_sched_getparam - get the RT priority of a thread
+ * @pid: the pid in question.
+ * @param: structure containing the RT priority.
+ *
+ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
+ * code.
+ */
+SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
+{
+ struct sched_param lp = { .sched_priority = 0 };
+ struct task_struct *p;
+ int retval;
+
+ if (unlikely(!param || pid < 0))
+ return -EINVAL;
+
+ scoped_guard (rcu) {
+ p = find_process_by_pid(pid);
+ if (!p)
+ return -ESRCH;
+
+ retval = security_task_getscheduler(p);
+ if (retval)
+ return retval;
+
+ if (task_has_rt_policy(p))
+ lp.sched_priority = p->rt_priority;
+ }
+
+ /*
+ * This one might sleep, we cannot do it with a spinlock held ...
+ */
+ return copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
+}
+
+/**
+ * sys_sched_getattr - similar to sched_getparam, but with sched_attr
+ * @pid: the pid in question.
+ * @uattr: structure containing the extended parameters.
+ * @usize: sizeof(attr) for fwd/bwd comp.
+ * @flags: for future extension.
+ */
+SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
+ unsigned int, usize, unsigned int, flags)
+{
+ struct sched_attr kattr = { };
+ struct task_struct *p;
+ int retval;
+
+ if (unlikely(!uattr || pid < 0 || usize > PAGE_SIZE ||
+ usize < SCHED_ATTR_SIZE_VER0 || flags))
+ return -EINVAL;
+
+ scoped_guard (rcu) {
+ p = find_process_by_pid(pid);
+ if (!p)
+ return -ESRCH;
+
+ retval = security_task_getscheduler(p);
+ if (retval)
+ return retval;
+
+ kattr.sched_policy = p->policy;
+ if (p->sched_reset_on_fork)
+ kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
+ get_params(p, &kattr);
+ kattr.sched_flags &= SCHED_FLAG_ALL;
+
+#ifdef CONFIG_UCLAMP_TASK
+ /*
+ * This could race with another potential updater, but this is fine
+ * because it'll correctly read the old or the new value. We don't need
+ * to guarantee who wins the race as long as it doesn't return garbage.
+ */
+ kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
+ kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
+#endif
+ }
+
+ kattr.size = min(usize, sizeof(kattr));
+ return copy_struct_to_user(uattr, usize, &kattr, sizeof(kattr), NULL);
+}
+
+int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
+{
+ /*
+ * If the task isn't a deadline task or admission control is
+ * disabled then we don't care about affinity changes.
+ */
+ if (!task_has_dl_policy(p) || !dl_bandwidth_enabled())
+ return 0;
+
+ /*
+ * The special/sugov task isn't part of regular bandwidth/admission
+ * control so let userspace change affinities.
+ */
+ if (dl_entity_is_special(&p->dl))
+ return 0;
+
+ /*
+ * Since bandwidth control happens on root_domain basis,
+ * if admission test is enabled, we only admit -deadline
+ * tasks allowed to run on all the CPUs in the task's
+ * root_domain.
+ */
+ guard(rcu)();
+ if (!cpumask_subset(task_rq(p)->rd->span, mask))
+ return -EBUSY;
+
+ return 0;
+}
+
+int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx)
+{
+ int retval;
+ cpumask_var_t cpus_allowed, new_mask;
+
+ if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL))
+ return -ENOMEM;
+
+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
+ retval = -ENOMEM;
+ goto out_free_cpus_allowed;
+ }
+
+ cpuset_cpus_allowed(p, cpus_allowed);
+ cpumask_and(new_mask, ctx->new_mask, cpus_allowed);
+
+ ctx->new_mask = new_mask;
+ ctx->flags |= SCA_CHECK;
+
+ retval = dl_task_check_affinity(p, new_mask);
+ if (retval)
+ goto out_free_new_mask;
+
+ retval = __set_cpus_allowed_ptr(p, ctx);
+ if (retval)
+ goto out_free_new_mask;
+
+ cpuset_cpus_allowed(p, cpus_allowed);
+ if (!cpumask_subset(new_mask, cpus_allowed)) {
+ /*
+ * We must have raced with a concurrent cpuset update.
+ * Just reset the cpumask to the cpuset's cpus_allowed.
+ */
+ cpumask_copy(new_mask, cpus_allowed);
+
+ /*
+ * If SCA_USER is set, a 2nd call to __set_cpus_allowed_ptr()
+ * will restore the previous user_cpus_ptr value.
+ *
+ * In the unlikely event a previous user_cpus_ptr exists,
+ * we need to further restrict the mask to what is allowed
+ * by that old user_cpus_ptr.
+ */
+ if (unlikely((ctx->flags & SCA_USER) && ctx->user_mask)) {
+ bool empty = !cpumask_and(new_mask, new_mask,
+ ctx->user_mask);
+
+ if (empty)
+ cpumask_copy(new_mask, cpus_allowed);
+ }
+ __set_cpus_allowed_ptr(p, ctx);
+ retval = -EINVAL;
+ }
+
+out_free_new_mask:
+ free_cpumask_var(new_mask);
+out_free_cpus_allowed:
+ free_cpumask_var(cpus_allowed);
+ return retval;
+}
+
+long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
+{
+ struct affinity_context ac;
+ struct cpumask *user_mask;
+ int retval;
+
+ CLASS(find_get_task, p)(pid);
+ if (!p)
+ return -ESRCH;
+
+ if (p->flags & PF_NO_SETAFFINITY)
+ return -EINVAL;
+
+ if (!check_same_owner(p)) {
+ guard(rcu)();
+ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE))
+ return -EPERM;
+ }
+
+ retval = security_task_setscheduler(p);
+ if (retval)
+ return retval;
+
+ /*
+ * With non-SMP configs, user_cpus_ptr/user_mask isn't used and
+ * alloc_user_cpus_ptr() returns NULL.
+ */
+ user_mask = alloc_user_cpus_ptr(NUMA_NO_NODE);
+ if (user_mask) {
+ cpumask_copy(user_mask, in_mask);
+ } else {
+ return -ENOMEM;
+ }
+
+ ac = (struct affinity_context){
+ .new_mask = in_mask,
+ .user_mask = user_mask,
+ .flags = SCA_USER,
+ };
+
+ retval = __sched_setaffinity(p, &ac);
+ kfree(ac.user_mask);
+
+ return retval;
+}
+
+static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
+ struct cpumask *new_mask)
+{
+ if (len < cpumask_size())
+ cpumask_clear(new_mask);
+ else if (len > cpumask_size())
+ len = cpumask_size();
+
+ return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
+}
+
+/**
+ * sys_sched_setaffinity - set the CPU affinity of a process
+ * @pid: pid of the process
+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr
+ * @user_mask_ptr: user-space pointer to the new CPU mask
+ *
+ * Return: 0 on success. An error code otherwise.
+ */
+SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
+ unsigned long __user *, user_mask_ptr)
+{
+ cpumask_var_t new_mask;
+ int retval;
+
+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
+ if (retval == 0)
+ retval = sched_setaffinity(pid, new_mask);
+ free_cpumask_var(new_mask);
+ return retval;
+}
+
+long sched_getaffinity(pid_t pid, struct cpumask *mask)
+{
+ struct task_struct *p;
+ int retval;
+
+ guard(rcu)();
+ p = find_process_by_pid(pid);
+ if (!p)
+ return -ESRCH;
+
+ retval = security_task_getscheduler(p);
+ if (retval)
+ return retval;
+
+ guard(raw_spinlock_irqsave)(&p->pi_lock);
+ cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
+
+ return 0;
+}
+
+/**
+ * sys_sched_getaffinity - get the CPU affinity of a process
+ * @pid: pid of the process
+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr
+ * @user_mask_ptr: user-space pointer to hold the current CPU mask
+ *
+ * Return: size of CPU mask copied to user_mask_ptr on success. An
+ * error code otherwise.
+ */
+SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
+ unsigned long __user *, user_mask_ptr)
+{
+ int ret;
+ cpumask_var_t mask;
+
+ if ((len * BITS_PER_BYTE) < nr_cpu_ids)
+ return -EINVAL;
+ if (len & (sizeof(unsigned long)-1))
+ return -EINVAL;
+
+ if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ ret = sched_getaffinity(pid, mask);
+ if (ret == 0) {
+ unsigned int retlen = min(len, cpumask_size());
+
+ if (copy_to_user(user_mask_ptr, cpumask_bits(mask), retlen))
+ ret = -EFAULT;
+ else
+ ret = retlen;
+ }
+ free_cpumask_var(mask);
+
+ return ret;
+}
+
+static void do_sched_yield(void)
+{
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = this_rq_lock_irq(&rf);
+
+ schedstat_inc(rq->yld_count);
+ rq->donor->sched_class->yield_task(rq);
+
+ preempt_disable();
+ rq_unlock_irq(rq, &rf);
+ sched_preempt_enable_no_resched();
+
+ schedule();
+}
+
+/**
+ * sys_sched_yield - yield the current processor to other threads.
+ *
+ * This function yields the current CPU to other tasks. If there are no
+ * other threads running on this CPU then this function will return.
+ *
+ * Return: 0.
+ */
+SYSCALL_DEFINE0(sched_yield)
+{
+ do_sched_yield();
+ return 0;
+}
+
+/**
+ * yield - yield the current processor to other threads.
+ *
+ * Do not ever use this function, there's a 99% chance you're doing it wrong.
+ *
+ * The scheduler is at all times free to pick the calling task as the most
+ * eligible task to run, if removing the yield() call from your code breaks
+ * it, it's already broken.
+ *
+ * Typical broken usage is:
+ *
+ * while (!event)
+ * yield();
+ *
+ * where one assumes that yield() will let 'the other' process run that will
+ * make event true. If the current task is a SCHED_FIFO task that will never
+ * happen. Never use yield() as a progress guarantee!!
+ *
+ * If you want to use yield() to wait for something, use wait_event().
+ * If you want to use yield() to be 'nice' for others, use cond_resched().
+ * If you still want to use yield(), do not!
+ */
+void __sched yield(void)
+{
+ set_current_state(TASK_RUNNING);
+ do_sched_yield();
+}
+EXPORT_SYMBOL(yield);
+
+/**
+ * yield_to - yield the current processor to another thread in
+ * your thread group, or accelerate that thread toward the
+ * processor it's on.
+ * @p: target task
+ * @preempt: whether task preemption is allowed or not
+ *
+ * It's the caller's job to ensure that the target task struct
+ * can't go away on us before we can do any checks.
+ *
+ * Return:
+ * true (>0) if we indeed boosted the target task.
+ * false (0) if we failed to boost the target.
+ * -ESRCH if there's no task to yield to.
+ */
+int __sched yield_to(struct task_struct *p, bool preempt)
+{
+ struct task_struct *curr;
+ struct rq *rq, *p_rq;
+ int yielded = 0;
+
+ scoped_guard (raw_spinlock_irqsave, &p->pi_lock) {
+ rq = this_rq();
+ curr = rq->donor;
+
+again:
+ p_rq = task_rq(p);
+ /*
+ * If we're the only runnable task on the rq and target rq also
+ * has only one task, there's absolutely no point in yielding.
+ */
+ if (rq->nr_running == 1 && p_rq->nr_running == 1)
+ return -ESRCH;
+
+ guard(double_rq_lock)(rq, p_rq);
+ if (task_rq(p) != p_rq)
+ goto again;
+
+ if (!curr->sched_class->yield_to_task)
+ return 0;
+
+ if (curr->sched_class != p->sched_class)
+ return 0;
+
+ if (task_on_cpu(p_rq, p) || !task_is_running(p))
+ return 0;
+
+ yielded = curr->sched_class->yield_to_task(rq, p);
+ if (yielded) {
+ schedstat_inc(rq->yld_count);
+ /*
+ * Make p's CPU reschedule; pick_next_entity
+ * takes care of fairness.
+ */
+ if (preempt && rq != p_rq)
+ resched_curr(p_rq);
+ }
+ }
+
+ if (yielded)
+ schedule();
+
+ return yielded;
+}
+EXPORT_SYMBOL_GPL(yield_to);
+
+/**
+ * sys_sched_get_priority_max - return maximum RT priority.
+ * @policy: scheduling class.
+ *
+ * Return: On success, this syscall returns the maximum
+ * rt_priority that can be used by a given scheduling class.
+ * On failure, a negative error code is returned.
+ */
+SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
+{
+ int ret = -EINVAL;
+
+ switch (policy) {
+ case SCHED_FIFO:
+ case SCHED_RR:
+ ret = MAX_RT_PRIO-1;
+ break;
+ case SCHED_DEADLINE:
+ case SCHED_NORMAL:
+ case SCHED_BATCH:
+ case SCHED_IDLE:
+ case SCHED_EXT:
+ ret = 0;
+ break;
+ }
+ return ret;
+}
+
+/**
+ * sys_sched_get_priority_min - return minimum RT priority.
+ * @policy: scheduling class.
+ *
+ * Return: On success, this syscall returns the minimum
+ * rt_priority that can be used by a given scheduling class.
+ * On failure, a negative error code is returned.
+ */
+SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
+{
+ int ret = -EINVAL;
+
+ switch (policy) {
+ case SCHED_FIFO:
+ case SCHED_RR:
+ ret = 1;
+ break;
+ case SCHED_DEADLINE:
+ case SCHED_NORMAL:
+ case SCHED_BATCH:
+ case SCHED_IDLE:
+ case SCHED_EXT:
+ ret = 0;
+ }
+ return ret;
+}
+
+static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
+{
+ unsigned int time_slice = 0;
+ int retval;
+
+ if (pid < 0)
+ return -EINVAL;
+
+ scoped_guard (rcu) {
+ struct task_struct *p = find_process_by_pid(pid);
+ if (!p)
+ return -ESRCH;
+
+ retval = security_task_getscheduler(p);
+ if (retval)
+ return retval;
+
+ scoped_guard (task_rq_lock, p) {
+ struct rq *rq = scope.rq;
+ if (p->sched_class->get_rr_interval)
+ time_slice = p->sched_class->get_rr_interval(rq, p);
+ }
+ }
+
+ jiffies_to_timespec64(time_slice, t);
+ return 0;
+}
+
+/**
+ * sys_sched_rr_get_interval - return the default time-slice of a process.
+ * @pid: pid of the process.
+ * @interval: userspace pointer to the time-slice value.
+ *
+ * this syscall writes the default time-slice value of a given process
+ * into the user-space timespec buffer. A value of '0' means infinity.
+ *
+ * Return: On success, 0 and the time-slice is in @interval. Otherwise,
+ * an error code.
+ */
+SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
+ struct __kernel_timespec __user *, interval)
+{
+ struct timespec64 t;
+ int retval = sched_rr_get_interval(pid, &t);
+
+ if (retval == 0)
+ retval = put_timespec64(&t, interval);
+
+ return retval;
+}
+
+#ifdef CONFIG_COMPAT_32BIT_TIME
+SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid,
+ struct old_timespec32 __user *, interval)
+{
+ struct timespec64 t;
+ int retval = sched_rr_get_interval(pid, &t);
+
+ if (retval == 0)
+ retval = put_old_timespec32(&t, interval);
+ return retval;
+}
+#endif
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
new file mode 100644
index 000000000000..cf643a5ddedd
--- /dev/null
+++ b/kernel/sched/topology.c
@@ -0,0 +1,2942 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Scheduler topology setup/handling methods
+ */
+
+#include <linux/sched/isolation.h>
+#include <linux/bsearch.h>
+#include "sched.h"
+
+DEFINE_MUTEX(sched_domains_mutex);
+void sched_domains_mutex_lock(void)
+{
+ mutex_lock(&sched_domains_mutex);
+}
+void sched_domains_mutex_unlock(void)
+{
+ mutex_unlock(&sched_domains_mutex);
+}
+
+/* Protected by sched_domains_mutex: */
+static cpumask_var_t sched_domains_tmpmask;
+static cpumask_var_t sched_domains_tmpmask2;
+
+static int __init sched_debug_setup(char *str)
+{
+ sched_debug_verbose = true;
+
+ return 0;
+}
+early_param("sched_verbose", sched_debug_setup);
+
+static inline bool sched_debug(void)
+{
+ return sched_debug_verbose;
+}
+
+#define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name },
+const struct sd_flag_debug sd_flag_debug[] = {
+#include <linux/sched/sd_flags.h>
+};
+#undef SD_FLAG
+
+static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
+ struct cpumask *groupmask)
+{
+ struct sched_group *group = sd->groups;
+ unsigned long flags = sd->flags;
+ unsigned int idx;
+
+ cpumask_clear(groupmask);
+
+ printk(KERN_DEBUG "%*s domain-%d: ", level, "", level);
+ printk(KERN_CONT "span=%*pbl level=%s\n",
+ cpumask_pr_args(sched_domain_span(sd)), sd->name);
+
+ if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
+ printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu);
+ }
+ if (group && !cpumask_test_cpu(cpu, sched_group_span(group))) {
+ printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
+ }
+
+ for_each_set_bit(idx, &flags, __SD_FLAG_CNT) {
+ unsigned int flag = BIT(idx);
+ unsigned int meta_flags = sd_flag_debug[idx].meta_flags;
+
+ if ((meta_flags & SDF_SHARED_CHILD) && sd->child &&
+ !(sd->child->flags & flag))
+ printk(KERN_ERR "ERROR: flag %s set here but not in child\n",
+ sd_flag_debug[idx].name);
+
+ if ((meta_flags & SDF_SHARED_PARENT) && sd->parent &&
+ !(sd->parent->flags & flag))
+ printk(KERN_ERR "ERROR: flag %s set here but not in parent\n",
+ sd_flag_debug[idx].name);
+ }
+
+ printk(KERN_DEBUG "%*s groups:", level + 1, "");
+ do {
+ if (!group) {
+ printk("\n");
+ printk(KERN_ERR "ERROR: group is NULL\n");
+ break;
+ }
+
+ if (cpumask_empty(sched_group_span(group))) {
+ printk(KERN_CONT "\n");
+ printk(KERN_ERR "ERROR: empty group\n");
+ break;
+ }
+
+ if (!(sd->flags & SD_NUMA) &&
+ cpumask_intersects(groupmask, sched_group_span(group))) {
+ printk(KERN_CONT "\n");
+ printk(KERN_ERR "ERROR: repeated CPUs\n");
+ break;
+ }
+
+ cpumask_or(groupmask, groupmask, sched_group_span(group));
+
+ printk(KERN_CONT " %d:{ span=%*pbl",
+ group->sgc->id,
+ cpumask_pr_args(sched_group_span(group)));
+
+ if ((sd->flags & SD_NUMA) &&
+ !cpumask_equal(group_balance_mask(group), sched_group_span(group))) {
+ printk(KERN_CONT " mask=%*pbl",
+ cpumask_pr_args(group_balance_mask(group)));
+ }
+
+ if (group->sgc->capacity != SCHED_CAPACITY_SCALE)
+ printk(KERN_CONT " cap=%lu", group->sgc->capacity);
+
+ if (group == sd->groups && sd->child &&
+ !cpumask_equal(sched_domain_span(sd->child),
+ sched_group_span(group))) {
+ printk(KERN_ERR "ERROR: domain->groups does not match domain->child\n");
+ }
+
+ printk(KERN_CONT " }");
+
+ group = group->next;
+
+ if (group != sd->groups)
+ printk(KERN_CONT ",");
+
+ } while (group != sd->groups);
+ printk(KERN_CONT "\n");
+
+ if (!cpumask_equal(sched_domain_span(sd), groupmask))
+ printk(KERN_ERR "ERROR: groups don't span domain->span\n");
+
+ if (sd->parent &&
+ !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
+ printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n");
+ return 0;
+}
+
+static void sched_domain_debug(struct sched_domain *sd, int cpu)
+{
+ int level = 0;
+
+ if (!sched_debug_verbose)
+ return;
+
+ if (!sd) {
+ printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
+ return;
+ }
+
+ printk(KERN_DEBUG "CPU%d attaching sched-domain(s):\n", cpu);
+
+ for (;;) {
+ if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
+ break;
+ level++;
+ sd = sd->parent;
+ if (!sd)
+ break;
+ }
+}
+
+/* Generate a mask of SD flags with the SDF_NEEDS_GROUPS metaflag */
+#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_NEEDS_GROUPS)) |
+static const unsigned int SD_DEGENERATE_GROUPS_MASK =
+#include <linux/sched/sd_flags.h>
+0;
+#undef SD_FLAG
+
+static int sd_degenerate(struct sched_domain *sd)
+{
+ if (cpumask_weight(sched_domain_span(sd)) == 1)
+ return 1;
+
+ /* Following flags need at least 2 groups */
+ if ((sd->flags & SD_DEGENERATE_GROUPS_MASK) &&
+ (sd->groups != sd->groups->next))
+ return 0;
+
+ /* Following flags don't use groups */
+ if (sd->flags & (SD_WAKE_AFFINE))
+ return 0;
+
+ return 1;
+}
+
+static int
+sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
+{
+ unsigned long cflags = sd->flags, pflags = parent->flags;
+
+ if (sd_degenerate(parent))
+ return 1;
+
+ if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
+ return 0;
+
+ /* Flags needing groups don't count if only 1 group in parent */
+ if (parent->groups == parent->groups->next)
+ pflags &= ~SD_DEGENERATE_GROUPS_MASK;
+
+ if (~cflags & pflags)
+ return 0;
+
+ return 1;
+}
+
+#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
+DEFINE_STATIC_KEY_FALSE(sched_energy_present);
+static unsigned int sysctl_sched_energy_aware = 1;
+static DEFINE_MUTEX(sched_energy_mutex);
+static bool sched_energy_update;
+
+static bool sched_is_eas_possible(const struct cpumask *cpu_mask)
+{
+ bool any_asym_capacity = false;
+ int i;
+
+ /* EAS is enabled for asymmetric CPU capacity topologies. */
+ for_each_cpu(i, cpu_mask) {
+ if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, i))) {
+ any_asym_capacity = true;
+ break;
+ }
+ }
+ if (!any_asym_capacity) {
+ if (sched_debug()) {
+ pr_info("rd %*pbl: Checking EAS, CPUs do not have asymmetric capacities\n",
+ cpumask_pr_args(cpu_mask));
+ }
+ return false;
+ }
+
+ /* EAS definitely does *not* handle SMT */
+ if (sched_smt_active()) {
+ if (sched_debug()) {
+ pr_info("rd %*pbl: Checking EAS, SMT is not supported\n",
+ cpumask_pr_args(cpu_mask));
+ }
+ return false;
+ }
+
+ if (!arch_scale_freq_invariant()) {
+ if (sched_debug()) {
+ pr_info("rd %*pbl: Checking EAS: frequency-invariant load tracking not yet supported",
+ cpumask_pr_args(cpu_mask));
+ }
+ return false;
+ }
+
+ if (!cpufreq_ready_for_eas(cpu_mask)) {
+ if (sched_debug()) {
+ pr_info("rd %*pbl: Checking EAS: cpufreq is not ready\n",
+ cpumask_pr_args(cpu_mask));
+ }
+ return false;
+ }
+
+ return true;
+}
+
+void rebuild_sched_domains_energy(void)
+{
+ mutex_lock(&sched_energy_mutex);
+ sched_energy_update = true;
+ rebuild_sched_domains();
+ sched_energy_update = false;
+ mutex_unlock(&sched_energy_mutex);
+}
+
+#ifdef CONFIG_PROC_SYSCTL
+static int sched_energy_aware_handler(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret, state;
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!sched_is_eas_possible(cpu_active_mask)) {
+ if (write) {
+ return -EOPNOTSUPP;
+ } else {
+ *lenp = 0;
+ return 0;
+ }
+ }
+
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ if (!ret && write) {
+ state = static_branch_unlikely(&sched_energy_present);
+ if (state != sysctl_sched_energy_aware)
+ rebuild_sched_domains_energy();
+ }
+
+ return ret;
+}
+
+static const struct ctl_table sched_energy_aware_sysctls[] = {
+ {
+ .procname = "sched_energy_aware",
+ .data = &sysctl_sched_energy_aware,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sched_energy_aware_handler,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+};
+
+static int __init sched_energy_aware_sysctl_init(void)
+{
+ register_sysctl_init("kernel", sched_energy_aware_sysctls);
+ return 0;
+}
+
+late_initcall(sched_energy_aware_sysctl_init);
+#endif /* CONFIG_PROC_SYSCTL */
+
+static void free_pd(struct perf_domain *pd)
+{
+ struct perf_domain *tmp;
+
+ while (pd) {
+ tmp = pd->next;
+ kfree(pd);
+ pd = tmp;
+ }
+}
+
+static struct perf_domain *find_pd(struct perf_domain *pd, int cpu)
+{
+ while (pd) {
+ if (cpumask_test_cpu(cpu, perf_domain_span(pd)))
+ return pd;
+ pd = pd->next;
+ }
+
+ return NULL;
+}
+
+static struct perf_domain *pd_init(int cpu)
+{
+ struct em_perf_domain *obj = em_cpu_get(cpu);
+ struct perf_domain *pd;
+
+ if (!obj) {
+ if (sched_debug())
+ pr_info("%s: no EM found for CPU%d\n", __func__, cpu);
+ return NULL;
+ }
+
+ pd = kzalloc(sizeof(*pd), GFP_KERNEL);
+ if (!pd)
+ return NULL;
+ pd->em_pd = obj;
+
+ return pd;
+}
+
+static void perf_domain_debug(const struct cpumask *cpu_map,
+ struct perf_domain *pd)
+{
+ if (!sched_debug() || !pd)
+ return;
+
+ printk(KERN_DEBUG "root_domain %*pbl:", cpumask_pr_args(cpu_map));
+
+ while (pd) {
+ printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_pstate=%d }",
+ cpumask_first(perf_domain_span(pd)),
+ cpumask_pr_args(perf_domain_span(pd)),
+ em_pd_nr_perf_states(pd->em_pd));
+ pd = pd->next;
+ }
+
+ printk(KERN_CONT "\n");
+}
+
+static void destroy_perf_domain_rcu(struct rcu_head *rp)
+{
+ struct perf_domain *pd;
+
+ pd = container_of(rp, struct perf_domain, rcu);
+ free_pd(pd);
+}
+
+static void sched_energy_set(bool has_eas)
+{
+ if (!has_eas && static_branch_unlikely(&sched_energy_present)) {
+ if (sched_debug())
+ pr_info("%s: stopping EAS\n", __func__);
+ static_branch_disable_cpuslocked(&sched_energy_present);
+ } else if (has_eas && !static_branch_unlikely(&sched_energy_present)) {
+ if (sched_debug())
+ pr_info("%s: starting EAS\n", __func__);
+ static_branch_enable_cpuslocked(&sched_energy_present);
+ }
+}
+
+/*
+ * EAS can be used on a root domain if it meets all the following conditions:
+ * 1. an Energy Model (EM) is available;
+ * 2. the SD_ASYM_CPUCAPACITY flag is set in the sched_domain hierarchy.
+ * 3. no SMT is detected.
+ * 4. schedutil is driving the frequency of all CPUs of the rd;
+ * 5. frequency invariance support is present;
+ */
+static bool build_perf_domains(const struct cpumask *cpu_map)
+{
+ int i;
+ struct perf_domain *pd = NULL, *tmp;
+ int cpu = cpumask_first(cpu_map);
+ struct root_domain *rd = cpu_rq(cpu)->rd;
+
+ if (!sysctl_sched_energy_aware)
+ goto free;
+
+ if (!sched_is_eas_possible(cpu_map))
+ goto free;
+
+ for_each_cpu(i, cpu_map) {
+ /* Skip already covered CPUs. */
+ if (find_pd(pd, i))
+ continue;
+
+ /* Create the new pd and add it to the local list. */
+ tmp = pd_init(i);
+ if (!tmp)
+ goto free;
+ tmp->next = pd;
+ pd = tmp;
+ }
+
+ perf_domain_debug(cpu_map, pd);
+
+ /* Attach the new list of performance domains to the root domain. */
+ tmp = rd->pd;
+ rcu_assign_pointer(rd->pd, pd);
+ if (tmp)
+ call_rcu(&tmp->rcu, destroy_perf_domain_rcu);
+
+ return !!pd;
+
+free:
+ free_pd(pd);
+ tmp = rd->pd;
+ rcu_assign_pointer(rd->pd, NULL);
+ if (tmp)
+ call_rcu(&tmp->rcu, destroy_perf_domain_rcu);
+
+ return false;
+}
+#else /* !(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL): */
+static void free_pd(struct perf_domain *pd) { }
+#endif /* !(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */
+
+static void free_rootdomain(struct rcu_head *rcu)
+{
+ struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
+
+ cpupri_cleanup(&rd->cpupri);
+ cpudl_cleanup(&rd->cpudl);
+ free_cpumask_var(rd->dlo_mask);
+ free_cpumask_var(rd->rto_mask);
+ free_cpumask_var(rd->online);
+ free_cpumask_var(rd->span);
+ free_pd(rd->pd);
+ kfree(rd);
+}
+
+void rq_attach_root(struct rq *rq, struct root_domain *rd)
+{
+ struct root_domain *old_rd = NULL;
+ struct rq_flags rf;
+
+ rq_lock_irqsave(rq, &rf);
+
+ if (rq->rd) {
+ old_rd = rq->rd;
+
+ if (cpumask_test_cpu(rq->cpu, old_rd->online))
+ set_rq_offline(rq);
+
+ cpumask_clear_cpu(rq->cpu, old_rd->span);
+
+ /*
+ * If we don't want to free the old_rd yet then
+ * set old_rd to NULL to skip the freeing later
+ * in this function:
+ */
+ if (!atomic_dec_and_test(&old_rd->refcount))
+ old_rd = NULL;
+ }
+
+ atomic_inc(&rd->refcount);
+ rq->rd = rd;
+
+ cpumask_set_cpu(rq->cpu, rd->span);
+ if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
+ set_rq_online(rq);
+
+ /*
+ * Because the rq is not a task, dl_add_task_root_domain() did not
+ * move the fair server bw to the rd if it already started.
+ * Add it now.
+ */
+ if (rq->fair_server.dl_server)
+ __dl_server_attach_root(&rq->fair_server, rq);
+
+ rq_unlock_irqrestore(rq, &rf);
+
+ if (old_rd)
+ call_rcu(&old_rd->rcu, free_rootdomain);
+}
+
+void sched_get_rd(struct root_domain *rd)
+{
+ atomic_inc(&rd->refcount);
+}
+
+void sched_put_rd(struct root_domain *rd)
+{
+ if (!atomic_dec_and_test(&rd->refcount))
+ return;
+
+ call_rcu(&rd->rcu, free_rootdomain);
+}
+
+static int init_rootdomain(struct root_domain *rd)
+{
+ if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL))
+ goto out;
+ if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL))
+ goto free_span;
+ if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
+ goto free_online;
+ if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+ goto free_dlo_mask;
+
+#ifdef HAVE_RT_PUSH_IPI
+ rd->rto_cpu = -1;
+ raw_spin_lock_init(&rd->rto_lock);
+ rd->rto_push_work = IRQ_WORK_INIT_HARD(rto_push_irq_work_func);
+#endif
+
+ rd->visit_cookie = 0;
+ init_dl_bw(&rd->dl_bw);
+ if (cpudl_init(&rd->cpudl) != 0)
+ goto free_rto_mask;
+
+ if (cpupri_init(&rd->cpupri) != 0)
+ goto free_cpudl;
+ return 0;
+
+free_cpudl:
+ cpudl_cleanup(&rd->cpudl);
+free_rto_mask:
+ free_cpumask_var(rd->rto_mask);
+free_dlo_mask:
+ free_cpumask_var(rd->dlo_mask);
+free_online:
+ free_cpumask_var(rd->online);
+free_span:
+ free_cpumask_var(rd->span);
+out:
+ return -ENOMEM;
+}
+
+/*
+ * By default the system creates a single root-domain with all CPUs as
+ * members (mimicking the global state we have today).
+ */
+struct root_domain def_root_domain;
+
+void __init init_defrootdomain(void)
+{
+ init_rootdomain(&def_root_domain);
+
+ atomic_set(&def_root_domain.refcount, 1);
+}
+
+static struct root_domain *alloc_rootdomain(void)
+{
+ struct root_domain *rd;
+
+ rd = kzalloc(sizeof(*rd), GFP_KERNEL);
+ if (!rd)
+ return NULL;
+
+ if (init_rootdomain(rd) != 0) {
+ kfree(rd);
+ return NULL;
+ }
+
+ return rd;
+}
+
+static void free_sched_groups(struct sched_group *sg, int free_sgc)
+{
+ struct sched_group *tmp, *first;
+
+ if (!sg)
+ return;
+
+ first = sg;
+ do {
+ tmp = sg->next;
+
+ if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
+ kfree(sg->sgc);
+
+ if (atomic_dec_and_test(&sg->ref))
+ kfree(sg);
+ sg = tmp;
+ } while (sg != first);
+}
+
+static void destroy_sched_domain(struct sched_domain *sd)
+{
+ /*
+ * A normal sched domain may have multiple group references, an
+ * overlapping domain, having private groups, only one. Iterate,
+ * dropping group/capacity references, freeing where none remain.
+ */
+ free_sched_groups(sd->groups, 1);
+
+ if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
+ kfree(sd->shared);
+ kfree(sd);
+}
+
+static void destroy_sched_domains_rcu(struct rcu_head *rcu)
+{
+ struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
+
+ while (sd) {
+ struct sched_domain *parent = sd->parent;
+ destroy_sched_domain(sd);
+ sd = parent;
+ }
+}
+
+static void destroy_sched_domains(struct sched_domain *sd)
+{
+ if (sd)
+ call_rcu(&sd->rcu, destroy_sched_domains_rcu);
+}
+
+/*
+ * Keep a special pointer to the highest sched_domain that has SD_SHARE_LLC set
+ * (Last Level Cache Domain) for this allows us to avoid some pointer chasing
+ * select_idle_sibling().
+ *
+ * Also keep a unique ID per domain (we use the first CPU number in the cpumask
+ * of the domain), this allows us to quickly tell if two CPUs are in the same
+ * cache domain, see cpus_share_cache().
+ */
+DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
+DEFINE_PER_CPU(int, sd_llc_size);
+DEFINE_PER_CPU(int, sd_llc_id);
+DEFINE_PER_CPU(int, sd_share_id);
+DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
+DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
+DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
+DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
+
+DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
+DEFINE_STATIC_KEY_FALSE(sched_cluster_active);
+
+static void update_top_cache_domain(int cpu)
+{
+ struct sched_domain_shared *sds = NULL;
+ struct sched_domain *sd;
+ int id = cpu;
+ int size = 1;
+
+ sd = highest_flag_domain(cpu, SD_SHARE_LLC);
+ if (sd) {
+ id = cpumask_first(sched_domain_span(sd));
+ size = cpumask_weight(sched_domain_span(sd));
+ sds = sd->shared;
+ }
+
+ rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
+ per_cpu(sd_llc_size, cpu) = size;
+ per_cpu(sd_llc_id, cpu) = id;
+ rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
+
+ sd = lowest_flag_domain(cpu, SD_CLUSTER);
+ if (sd)
+ id = cpumask_first(sched_domain_span(sd));
+
+ /*
+ * This assignment should be placed after the sd_llc_id as
+ * we want this id equals to cluster id on cluster machines
+ * but equals to LLC id on non-Cluster machines.
+ */
+ per_cpu(sd_share_id, cpu) = id;
+
+ sd = lowest_flag_domain(cpu, SD_NUMA);
+ rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
+
+ sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
+ rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd);
+
+ sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY_FULL);
+ rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd);
+}
+
+/*
+ * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
+ * hold the hotplug lock.
+ */
+static void
+cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ struct sched_domain *tmp;
+
+ /* Remove the sched domains which do not contribute to scheduling. */
+ for (tmp = sd; tmp; ) {
+ struct sched_domain *parent = tmp->parent;
+ if (!parent)
+ break;
+
+ if (sd_parent_degenerate(tmp, parent)) {
+ tmp->parent = parent->parent;
+
+ if (parent->parent) {
+ parent->parent->child = tmp;
+ parent->parent->groups->flags = tmp->flags;
+ }
+
+ /*
+ * Transfer SD_PREFER_SIBLING down in case of a
+ * degenerate parent; the spans match for this
+ * so the property transfers.
+ */
+ if (parent->flags & SD_PREFER_SIBLING)
+ tmp->flags |= SD_PREFER_SIBLING;
+ destroy_sched_domain(parent);
+ } else
+ tmp = tmp->parent;
+ }
+
+ if (sd && sd_degenerate(sd)) {
+ tmp = sd;
+ sd = sd->parent;
+ destroy_sched_domain(tmp);
+ if (sd) {
+ struct sched_group *sg = sd->groups;
+
+ /*
+ * sched groups hold the flags of the child sched
+ * domain for convenience. Clear such flags since
+ * the child is being destroyed.
+ */
+ do {
+ sg->flags = 0;
+ } while (sg != sd->groups);
+
+ sd->child = NULL;
+ }
+ }
+
+ sched_domain_debug(sd, cpu);
+
+ rq_attach_root(rq, rd);
+ tmp = rq->sd;
+ rcu_assign_pointer(rq->sd, sd);
+ dirty_sched_domain_sysctl(cpu);
+ destroy_sched_domains(tmp);
+
+ update_top_cache_domain(cpu);
+}
+
+struct s_data {
+ struct sched_domain * __percpu *sd;
+ struct root_domain *rd;
+};
+
+enum s_alloc {
+ sa_rootdomain,
+ sa_sd,
+ sa_sd_storage,
+ sa_none,
+};
+
+/*
+ * Return the canonical balance CPU for this group, this is the first CPU
+ * of this group that's also in the balance mask.
+ *
+ * The balance mask are all those CPUs that could actually end up at this
+ * group. See build_balance_mask().
+ *
+ * Also see should_we_balance().
+ */
+int group_balance_cpu(struct sched_group *sg)
+{
+ return cpumask_first(group_balance_mask(sg));
+}
+
+
+/*
+ * NUMA topology (first read the regular topology blurb below)
+ *
+ * Given a node-distance table, for example:
+ *
+ * node 0 1 2 3
+ * 0: 10 20 30 20
+ * 1: 20 10 20 30
+ * 2: 30 20 10 20
+ * 3: 20 30 20 10
+ *
+ * which represents a 4 node ring topology like:
+ *
+ * 0 ----- 1
+ * | |
+ * | |
+ * | |
+ * 3 ----- 2
+ *
+ * We want to construct domains and groups to represent this. The way we go
+ * about doing this is to build the domains on 'hops'. For each NUMA level we
+ * construct the mask of all nodes reachable in @level hops.
+ *
+ * For the above NUMA topology that gives 3 levels:
+ *
+ * NUMA-2 0-3 0-3 0-3 0-3
+ * groups: {0-1,3},{1-3} {0-2},{0,2-3} {1-3},{0-1,3} {0,2-3},{0-2}
+ *
+ * NUMA-1 0-1,3 0-2 1-3 0,2-3
+ * groups: {0},{1},{3} {0},{1},{2} {1},{2},{3} {0},{2},{3}
+ *
+ * NUMA-0 0 1 2 3
+ *
+ *
+ * As can be seen; things don't nicely line up as with the regular topology.
+ * When we iterate a domain in child domain chunks some nodes can be
+ * represented multiple times -- hence the "overlap" naming for this part of
+ * the topology.
+ *
+ * In order to minimize this overlap, we only build enough groups to cover the
+ * domain. For instance Node-0 NUMA-2 would only get groups: 0-1,3 and 1-3.
+ *
+ * Because:
+ *
+ * - the first group of each domain is its child domain; this
+ * gets us the first 0-1,3
+ * - the only uncovered node is 2, who's child domain is 1-3.
+ *
+ * However, because of the overlap, computing a unique CPU for each group is
+ * more complicated. Consider for instance the groups of NODE-1 NUMA-2, both
+ * groups include the CPUs of Node-0, while those CPUs would not in fact ever
+ * end up at those groups (they would end up in group: 0-1,3).
+ *
+ * To correct this we have to introduce the group balance mask. This mask
+ * will contain those CPUs in the group that can reach this group given the
+ * (child) domain tree.
+ *
+ * With this we can once again compute balance_cpu and sched_group_capacity
+ * relations.
+ *
+ * XXX include words on how balance_cpu is unique and therefore can be
+ * used for sched_group_capacity links.
+ *
+ *
+ * Another 'interesting' topology is:
+ *
+ * node 0 1 2 3
+ * 0: 10 20 20 30
+ * 1: 20 10 20 20
+ * 2: 20 20 10 20
+ * 3: 30 20 20 10
+ *
+ * Which looks a little like:
+ *
+ * 0 ----- 1
+ * | / |
+ * | / |
+ * | / |
+ * 2 ----- 3
+ *
+ * This topology is asymmetric, nodes 1,2 are fully connected, but nodes 0,3
+ * are not.
+ *
+ * This leads to a few particularly weird cases where the sched_domain's are
+ * not of the same number for each CPU. Consider:
+ *
+ * NUMA-2 0-3 0-3
+ * groups: {0-2},{1-3} {1-3},{0-2}
+ *
+ * NUMA-1 0-2 0-3 0-3 1-3
+ *
+ * NUMA-0 0 1 2 3
+ *
+ */
+
+
+/*
+ * Build the balance mask; it contains only those CPUs that can arrive at this
+ * group and should be considered to continue balancing.
+ *
+ * We do this during the group creation pass, therefore the group information
+ * isn't complete yet, however since each group represents a (child) domain we
+ * can fully construct this using the sched_domain bits (which are already
+ * complete).
+ */
+static void
+build_balance_mask(struct sched_domain *sd, struct sched_group *sg, struct cpumask *mask)
+{
+ const struct cpumask *sg_span = sched_group_span(sg);
+ struct sd_data *sdd = sd->private;
+ struct sched_domain *sibling;
+ int i;
+
+ cpumask_clear(mask);
+
+ for_each_cpu(i, sg_span) {
+ sibling = *per_cpu_ptr(sdd->sd, i);
+
+ /*
+ * Can happen in the asymmetric case, where these siblings are
+ * unused. The mask will not be empty because those CPUs that
+ * do have the top domain _should_ span the domain.
+ */
+ if (!sibling->child)
+ continue;
+
+ /* If we would not end up here, we can't continue from here */
+ if (!cpumask_equal(sg_span, sched_domain_span(sibling->child)))
+ continue;
+
+ cpumask_set_cpu(i, mask);
+ }
+
+ /* We must not have empty masks here */
+ WARN_ON_ONCE(cpumask_empty(mask));
+}
+
+/*
+ * XXX: This creates per-node group entries; since the load-balancer will
+ * immediately access remote memory to construct this group's load-balance
+ * statistics having the groups node local is of dubious benefit.
+ */
+static struct sched_group *
+build_group_from_child_sched_domain(struct sched_domain *sd, int cpu)
+{
+ struct sched_group *sg;
+ struct cpumask *sg_span;
+
+ sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
+ GFP_KERNEL, cpu_to_node(cpu));
+
+ if (!sg)
+ return NULL;
+
+ sg_span = sched_group_span(sg);
+ if (sd->child) {
+ cpumask_copy(sg_span, sched_domain_span(sd->child));
+ sg->flags = sd->child->flags;
+ } else {
+ cpumask_copy(sg_span, sched_domain_span(sd));
+ }
+
+ atomic_inc(&sg->ref);
+ return sg;
+}
+
+static void init_overlap_sched_group(struct sched_domain *sd,
+ struct sched_group *sg)
+{
+ struct cpumask *mask = sched_domains_tmpmask2;
+ struct sd_data *sdd = sd->private;
+ struct cpumask *sg_span;
+ int cpu;
+
+ build_balance_mask(sd, sg, mask);
+ cpu = cpumask_first(mask);
+
+ sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
+ if (atomic_inc_return(&sg->sgc->ref) == 1)
+ cpumask_copy(group_balance_mask(sg), mask);
+ else
+ WARN_ON_ONCE(!cpumask_equal(group_balance_mask(sg), mask));
+
+ /*
+ * Initialize sgc->capacity such that even if we mess up the
+ * domains and no possible iteration will get us here, we won't
+ * die on a /0 trap.
+ */
+ sg_span = sched_group_span(sg);
+ sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
+ sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
+ sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
+}
+
+static struct sched_domain *
+find_descended_sibling(struct sched_domain *sd, struct sched_domain *sibling)
+{
+ /*
+ * The proper descendant would be the one whose child won't span out
+ * of sd
+ */
+ while (sibling->child &&
+ !cpumask_subset(sched_domain_span(sibling->child),
+ sched_domain_span(sd)))
+ sibling = sibling->child;
+
+ /*
+ * As we are referencing sgc across different topology level, we need
+ * to go down to skip those sched_domains which don't contribute to
+ * scheduling because they will be degenerated in cpu_attach_domain
+ */
+ while (sibling->child &&
+ cpumask_equal(sched_domain_span(sibling->child),
+ sched_domain_span(sibling)))
+ sibling = sibling->child;
+
+ return sibling;
+}
+
+static int
+build_overlap_sched_groups(struct sched_domain *sd, int cpu)
+{
+ struct sched_group *first = NULL, *last = NULL, *sg;
+ const struct cpumask *span = sched_domain_span(sd);
+ struct cpumask *covered = sched_domains_tmpmask;
+ struct sd_data *sdd = sd->private;
+ struct sched_domain *sibling;
+ int i;
+
+ cpumask_clear(covered);
+
+ for_each_cpu_wrap(i, span, cpu) {
+ struct cpumask *sg_span;
+
+ if (cpumask_test_cpu(i, covered))
+ continue;
+
+ sibling = *per_cpu_ptr(sdd->sd, i);
+
+ /*
+ * Asymmetric node setups can result in situations where the
+ * domain tree is of unequal depth, make sure to skip domains
+ * that already cover the entire range.
+ *
+ * In that case build_sched_domains() will have terminated the
+ * iteration early and our sibling sd spans will be empty.
+ * Domains should always include the CPU they're built on, so
+ * check that.
+ */
+ if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
+ continue;
+
+ /*
+ * Usually we build sched_group by sibling's child sched_domain
+ * But for machines whose NUMA diameter are 3 or above, we move
+ * to build sched_group by sibling's proper descendant's child
+ * domain because sibling's child sched_domain will span out of
+ * the sched_domain being built as below.
+ *
+ * Smallest diameter=3 topology is:
+ *
+ * node 0 1 2 3
+ * 0: 10 20 30 40
+ * 1: 20 10 20 30
+ * 2: 30 20 10 20
+ * 3: 40 30 20 10
+ *
+ * 0 --- 1 --- 2 --- 3
+ *
+ * NUMA-3 0-3 N/A N/A 0-3
+ * groups: {0-2},{1-3} {1-3},{0-2}
+ *
+ * NUMA-2 0-2 0-3 0-3 1-3
+ * groups: {0-1},{1-3} {0-2},{2-3} {1-3},{0-1} {2-3},{0-2}
+ *
+ * NUMA-1 0-1 0-2 1-3 2-3
+ * groups: {0},{1} {1},{2},{0} {2},{3},{1} {3},{2}
+ *
+ * NUMA-0 0 1 2 3
+ *
+ * The NUMA-2 groups for nodes 0 and 3 are obviously buggered, as the
+ * group span isn't a subset of the domain span.
+ */
+ if (sibling->child &&
+ !cpumask_subset(sched_domain_span(sibling->child), span))
+ sibling = find_descended_sibling(sd, sibling);
+
+ sg = build_group_from_child_sched_domain(sibling, cpu);
+ if (!sg)
+ goto fail;
+
+ sg_span = sched_group_span(sg);
+ cpumask_or(covered, covered, sg_span);
+
+ init_overlap_sched_group(sibling, sg);
+
+ if (!first)
+ first = sg;
+ if (last)
+ last->next = sg;
+ last = sg;
+ last->next = first;
+ }
+ sd->groups = first;
+
+ return 0;
+
+fail:
+ free_sched_groups(first, 0);
+
+ return -ENOMEM;
+}
+
+
+/*
+ * Package topology (also see the load-balance blurb in fair.c)
+ *
+ * The scheduler builds a tree structure to represent a number of important
+ * topology features. By default (default_topology[]) these include:
+ *
+ * - Simultaneous multithreading (SMT)
+ * - Multi-Core Cache (MC)
+ * - Package (PKG)
+ *
+ * Where the last one more or less denotes everything up to a NUMA node.
+ *
+ * The tree consists of 3 primary data structures:
+ *
+ * sched_domain -> sched_group -> sched_group_capacity
+ * ^ ^ ^ ^
+ * `-' `-'
+ *
+ * The sched_domains are per-CPU and have a two way link (parent & child) and
+ * denote the ever growing mask of CPUs belonging to that level of topology.
+ *
+ * Each sched_domain has a circular (double) linked list of sched_group's, each
+ * denoting the domains of the level below (or individual CPUs in case of the
+ * first domain level). The sched_group linked by a sched_domain includes the
+ * CPU of that sched_domain [*].
+ *
+ * Take for instance a 2 threaded, 2 core, 2 cache cluster part:
+ *
+ * CPU 0 1 2 3 4 5 6 7
+ *
+ * PKG [ ]
+ * MC [ ] [ ]
+ * SMT [ ] [ ] [ ] [ ]
+ *
+ * - or -
+ *
+ * PKG 0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7
+ * MC 0-3 0-3 0-3 0-3 4-7 4-7 4-7 4-7
+ * SMT 0-1 0-1 2-3 2-3 4-5 4-5 6-7 6-7
+ *
+ * CPU 0 1 2 3 4 5 6 7
+ *
+ * One way to think about it is: sched_domain moves you up and down among these
+ * topology levels, while sched_group moves you sideways through it, at child
+ * domain granularity.
+ *
+ * sched_group_capacity ensures each unique sched_group has shared storage.
+ *
+ * There are two related construction problems, both require a CPU that
+ * uniquely identify each group (for a given domain):
+ *
+ * - The first is the balance_cpu (see should_we_balance() and the
+ * load-balance blurb in fair.c); for each group we only want 1 CPU to
+ * continue balancing at a higher domain.
+ *
+ * - The second is the sched_group_capacity; we want all identical groups
+ * to share a single sched_group_capacity.
+ *
+ * Since these topologies are exclusive by construction. That is, its
+ * impossible for an SMT thread to belong to multiple cores, and cores to
+ * be part of multiple caches. There is a very clear and unique location
+ * for each CPU in the hierarchy.
+ *
+ * Therefore computing a unique CPU for each group is trivial (the iteration
+ * mask is redundant and set all 1s; all CPUs in a group will end up at _that_
+ * group), we can simply pick the first CPU in each group.
+ *
+ *
+ * [*] in other words, the first group of each domain is its child domain.
+ */
+
+static struct sched_group *get_group(int cpu, struct sd_data *sdd)
+{
+ struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
+ struct sched_domain *child = sd->child;
+ struct sched_group *sg;
+ bool already_visited;
+
+ if (child)
+ cpu = cpumask_first(sched_domain_span(child));
+
+ sg = *per_cpu_ptr(sdd->sg, cpu);
+ sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
+
+ /* Increase refcounts for claim_allocations: */
+ already_visited = atomic_inc_return(&sg->ref) > 1;
+ /* sgc visits should follow a similar trend as sg */
+ WARN_ON(already_visited != (atomic_inc_return(&sg->sgc->ref) > 1));
+
+ /* If we have already visited that group, it's already initialized. */
+ if (already_visited)
+ return sg;
+
+ if (child) {
+ cpumask_copy(sched_group_span(sg), sched_domain_span(child));
+ cpumask_copy(group_balance_mask(sg), sched_group_span(sg));
+ sg->flags = child->flags;
+ } else {
+ cpumask_set_cpu(cpu, sched_group_span(sg));
+ cpumask_set_cpu(cpu, group_balance_mask(sg));
+ }
+
+ sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg));
+ sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
+ sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
+
+ return sg;
+}
+
+/*
+ * build_sched_groups will build a circular linked list of the groups
+ * covered by the given span, will set each group's ->cpumask correctly,
+ * and will initialize their ->sgc.
+ *
+ * Assumes the sched_domain tree is fully constructed
+ */
+static int
+build_sched_groups(struct sched_domain *sd, int cpu)
+{
+ struct sched_group *first = NULL, *last = NULL;
+ struct sd_data *sdd = sd->private;
+ const struct cpumask *span = sched_domain_span(sd);
+ struct cpumask *covered;
+ int i;
+
+ lockdep_assert_held(&sched_domains_mutex);
+ covered = sched_domains_tmpmask;
+
+ cpumask_clear(covered);
+
+ for_each_cpu_wrap(i, span, cpu) {
+ struct sched_group *sg;
+
+ if (cpumask_test_cpu(i, covered))
+ continue;
+
+ sg = get_group(i, sdd);
+
+ cpumask_or(covered, covered, sched_group_span(sg));
+
+ if (!first)
+ first = sg;
+ if (last)
+ last->next = sg;
+ last = sg;
+ }
+ last->next = first;
+ sd->groups = first;
+
+ return 0;
+}
+
+/*
+ * Initialize sched groups cpu_capacity.
+ *
+ * cpu_capacity indicates the capacity of sched group, which is used while
+ * distributing the load between different sched groups in a sched domain.
+ * Typically cpu_capacity for all the groups in a sched domain will be same
+ * unless there are asymmetries in the topology. If there are asymmetries,
+ * group having more cpu_capacity will pickup more load compared to the
+ * group having less cpu_capacity.
+ */
+static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
+{
+ struct sched_group *sg = sd->groups;
+ struct cpumask *mask = sched_domains_tmpmask2;
+
+ WARN_ON(!sg);
+
+ do {
+ int cpu, cores = 0, max_cpu = -1;
+
+ sg->group_weight = cpumask_weight(sched_group_span(sg));
+
+ cpumask_copy(mask, sched_group_span(sg));
+ for_each_cpu(cpu, mask) {
+ cores++;
+#ifdef CONFIG_SCHED_SMT
+ cpumask_andnot(mask, mask, cpu_smt_mask(cpu));
+#endif
+ }
+ sg->cores = cores;
+
+ if (!(sd->flags & SD_ASYM_PACKING))
+ goto next;
+
+ for_each_cpu(cpu, sched_group_span(sg)) {
+ if (max_cpu < 0)
+ max_cpu = cpu;
+ else if (sched_asym_prefer(cpu, max_cpu))
+ max_cpu = cpu;
+ }
+ sg->asym_prefer_cpu = max_cpu;
+
+next:
+ sg = sg->next;
+ } while (sg != sd->groups);
+
+ if (cpu != group_balance_cpu(sg))
+ return;
+
+ update_group_capacity(sd, cpu);
+}
+
+/* Update the "asym_prefer_cpu" when arch_asym_cpu_priority() changes. */
+void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio)
+{
+ int asym_prefer_cpu = cpu;
+ struct sched_domain *sd;
+
+ guard(rcu)();
+
+ for_each_domain(cpu, sd) {
+ struct sched_group *sg;
+ int group_cpu;
+
+ if (!(sd->flags & SD_ASYM_PACKING))
+ continue;
+
+ /*
+ * Groups of overlapping domain are replicated per NUMA
+ * node and will require updating "asym_prefer_cpu" on
+ * each local copy.
+ *
+ * If you are hitting this warning, consider moving
+ * "sg->asym_prefer_cpu" to "sg->sgc->asym_prefer_cpu"
+ * which is shared by all the overlapping groups.
+ */
+ WARN_ON_ONCE(sd->flags & SD_NUMA);
+
+ sg = sd->groups;
+ if (cpu != sg->asym_prefer_cpu) {
+ /*
+ * Since the parent is a superset of the current group,
+ * if the cpu is not the "asym_prefer_cpu" at the
+ * current level, it cannot be the preferred CPU at a
+ * higher levels either.
+ */
+ if (!sched_asym_prefer(cpu, sg->asym_prefer_cpu))
+ return;
+
+ WRITE_ONCE(sg->asym_prefer_cpu, cpu);
+ continue;
+ }
+
+ /* Ranking has improved; CPU is still the preferred one. */
+ if (new_prio >= old_prio)
+ continue;
+
+ for_each_cpu(group_cpu, sched_group_span(sg)) {
+ if (sched_asym_prefer(group_cpu, asym_prefer_cpu))
+ asym_prefer_cpu = group_cpu;
+ }
+
+ WRITE_ONCE(sg->asym_prefer_cpu, asym_prefer_cpu);
+ }
+}
+
+/*
+ * Set of available CPUs grouped by their corresponding capacities
+ * Each list entry contains a CPU mask reflecting CPUs that share the same
+ * capacity.
+ * The lifespan of data is unlimited.
+ */
+LIST_HEAD(asym_cap_list);
+
+/*
+ * Verify whether there is any CPU capacity asymmetry in a given sched domain.
+ * Provides sd_flags reflecting the asymmetry scope.
+ */
+static inline int
+asym_cpu_capacity_classify(const struct cpumask *sd_span,
+ const struct cpumask *cpu_map)
+{
+ struct asym_cap_data *entry;
+ int count = 0, miss = 0;
+
+ /*
+ * Count how many unique CPU capacities this domain spans across
+ * (compare sched_domain CPUs mask with ones representing available
+ * CPUs capacities). Take into account CPUs that might be offline:
+ * skip those.
+ */
+ list_for_each_entry(entry, &asym_cap_list, link) {
+ if (cpumask_intersects(sd_span, cpu_capacity_span(entry)))
+ ++count;
+ else if (cpumask_intersects(cpu_map, cpu_capacity_span(entry)))
+ ++miss;
+ }
+
+ WARN_ON_ONCE(!count && !list_empty(&asym_cap_list));
+
+ /* No asymmetry detected */
+ if (count < 2)
+ return 0;
+ /* Some of the available CPU capacity values have not been detected */
+ if (miss)
+ return SD_ASYM_CPUCAPACITY;
+
+ /* Full asymmetry */
+ return SD_ASYM_CPUCAPACITY | SD_ASYM_CPUCAPACITY_FULL;
+
+}
+
+static void free_asym_cap_entry(struct rcu_head *head)
+{
+ struct asym_cap_data *entry = container_of(head, struct asym_cap_data, rcu);
+ kfree(entry);
+}
+
+static inline void asym_cpu_capacity_update_data(int cpu)
+{
+ unsigned long capacity = arch_scale_cpu_capacity(cpu);
+ struct asym_cap_data *insert_entry = NULL;
+ struct asym_cap_data *entry;
+
+ /*
+ * Search if capacity already exits. If not, track which the entry
+ * where we should insert to keep the list ordered descending.
+ */
+ list_for_each_entry(entry, &asym_cap_list, link) {
+ if (capacity == entry->capacity)
+ goto done;
+ else if (!insert_entry && capacity > entry->capacity)
+ insert_entry = list_prev_entry(entry, link);
+ }
+
+ entry = kzalloc(sizeof(*entry) + cpumask_size(), GFP_KERNEL);
+ if (WARN_ONCE(!entry, "Failed to allocate memory for asymmetry data\n"))
+ return;
+ entry->capacity = capacity;
+
+ /* If NULL then the new capacity is the smallest, add last. */
+ if (!insert_entry)
+ list_add_tail_rcu(&entry->link, &asym_cap_list);
+ else
+ list_add_rcu(&entry->link, &insert_entry->link);
+done:
+ __cpumask_set_cpu(cpu, cpu_capacity_span(entry));
+}
+
+/*
+ * Build-up/update list of CPUs grouped by their capacities
+ * An update requires explicit request to rebuild sched domains
+ * with state indicating CPU topology changes.
+ */
+static void asym_cpu_capacity_scan(void)
+{
+ struct asym_cap_data *entry, *next;
+ int cpu;
+
+ list_for_each_entry(entry, &asym_cap_list, link)
+ cpumask_clear(cpu_capacity_span(entry));
+
+ for_each_cpu_and(cpu, cpu_possible_mask, housekeeping_cpumask(HK_TYPE_DOMAIN))
+ asym_cpu_capacity_update_data(cpu);
+
+ list_for_each_entry_safe(entry, next, &asym_cap_list, link) {
+ if (cpumask_empty(cpu_capacity_span(entry))) {
+ list_del_rcu(&entry->link);
+ call_rcu(&entry->rcu, free_asym_cap_entry);
+ }
+ }
+
+ /*
+ * Only one capacity value has been detected i.e. this system is symmetric.
+ * No need to keep this data around.
+ */
+ if (list_is_singular(&asym_cap_list)) {
+ entry = list_first_entry(&asym_cap_list, typeof(*entry), link);
+ list_del_rcu(&entry->link);
+ call_rcu(&entry->rcu, free_asym_cap_entry);
+ }
+}
+
+/*
+ * Initializers for schedule domains
+ * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
+ */
+
+static int default_relax_domain_level = -1;
+int sched_domain_level_max;
+
+static int __init setup_relax_domain_level(char *str)
+{
+ if (kstrtoint(str, 0, &default_relax_domain_level))
+ pr_warn("Unable to set relax_domain_level\n");
+
+ return 1;
+}
+__setup("relax_domain_level=", setup_relax_domain_level);
+
+static void set_domain_attribute(struct sched_domain *sd,
+ struct sched_domain_attr *attr)
+{
+ int request;
+
+ if (!attr || attr->relax_domain_level < 0) {
+ if (default_relax_domain_level < 0)
+ return;
+ request = default_relax_domain_level;
+ } else
+ request = attr->relax_domain_level;
+
+ if (sd->level >= request) {
+ /* Turn off idle balance on this domain: */
+ sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
+ }
+}
+
+static void __sdt_free(const struct cpumask *cpu_map);
+static int __sdt_alloc(const struct cpumask *cpu_map);
+
+static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
+ const struct cpumask *cpu_map)
+{
+ switch (what) {
+ case sa_rootdomain:
+ if (!atomic_read(&d->rd->refcount))
+ free_rootdomain(&d->rd->rcu);
+ fallthrough;
+ case sa_sd:
+ free_percpu(d->sd);
+ fallthrough;
+ case sa_sd_storage:
+ __sdt_free(cpu_map);
+ fallthrough;
+ case sa_none:
+ break;
+ }
+}
+
+static enum s_alloc
+__visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map)
+{
+ memset(d, 0, sizeof(*d));
+
+ if (__sdt_alloc(cpu_map))
+ return sa_sd_storage;
+ d->sd = alloc_percpu(struct sched_domain *);
+ if (!d->sd)
+ return sa_sd_storage;
+ d->rd = alloc_rootdomain();
+ if (!d->rd)
+ return sa_sd;
+
+ return sa_rootdomain;
+}
+
+/*
+ * NULL the sd_data elements we've used to build the sched_domain and
+ * sched_group structure so that the subsequent __free_domain_allocs()
+ * will not free the data we're using.
+ */
+static void claim_allocations(int cpu, struct sched_domain *sd)
+{
+ struct sd_data *sdd = sd->private;
+
+ WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
+ *per_cpu_ptr(sdd->sd, cpu) = NULL;
+
+ if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref))
+ *per_cpu_ptr(sdd->sds, cpu) = NULL;
+
+ if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
+ *per_cpu_ptr(sdd->sg, cpu) = NULL;
+
+ if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
+ *per_cpu_ptr(sdd->sgc, cpu) = NULL;
+}
+
+#ifdef CONFIG_NUMA
+enum numa_topology_type sched_numa_topology_type;
+
+/*
+ * sched_domains_numa_distance is derived from sched_numa_node_distance
+ * and provides a simplified view of NUMA distances used specifically
+ * for building NUMA scheduling domains.
+ */
+static int sched_domains_numa_levels;
+static int sched_numa_node_levels;
+
+int sched_max_numa_distance;
+static int *sched_domains_numa_distance;
+static int *sched_numa_node_distance;
+static struct cpumask ***sched_domains_numa_masks;
+#endif /* CONFIG_NUMA */
+
+/*
+ * SD_flags allowed in topology descriptions.
+ *
+ * These flags are purely descriptive of the topology and do not prescribe
+ * behaviour. Behaviour is artificial and mapped in the below sd_init()
+ * function. For details, see include/linux/sched/sd_flags.h.
+ *
+ * SD_SHARE_CPUCAPACITY
+ * SD_SHARE_LLC
+ * SD_CLUSTER
+ * SD_NUMA
+ *
+ * Odd one out, which beside describing the topology has a quirk also
+ * prescribes the desired behaviour that goes along with it:
+ *
+ * SD_ASYM_PACKING - describes SMT quirks
+ */
+#define TOPOLOGY_SD_FLAGS \
+ (SD_SHARE_CPUCAPACITY | \
+ SD_CLUSTER | \
+ SD_SHARE_LLC | \
+ SD_NUMA | \
+ SD_ASYM_PACKING)
+
+static struct sched_domain *
+sd_init(struct sched_domain_topology_level *tl,
+ const struct cpumask *cpu_map,
+ struct sched_domain *child, int cpu)
+{
+ struct sd_data *sdd = &tl->data;
+ struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
+ int sd_id, sd_weight, sd_flags = 0;
+ struct cpumask *sd_span;
+
+ sd_weight = cpumask_weight(tl->mask(tl, cpu));
+
+ if (tl->sd_flags)
+ sd_flags = (*tl->sd_flags)();
+ if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
+ "wrong sd_flags in topology description\n"))
+ sd_flags &= TOPOLOGY_SD_FLAGS;
+
+ *sd = (struct sched_domain){
+ .min_interval = sd_weight,
+ .max_interval = 2*sd_weight,
+ .busy_factor = 16,
+ .imbalance_pct = 117,
+
+ .cache_nice_tries = 0,
+
+ .flags = 1*SD_BALANCE_NEWIDLE
+ | 1*SD_BALANCE_EXEC
+ | 1*SD_BALANCE_FORK
+ | 0*SD_BALANCE_WAKE
+ | 1*SD_WAKE_AFFINE
+ | 0*SD_SHARE_CPUCAPACITY
+ | 0*SD_SHARE_LLC
+ | 0*SD_SERIALIZE
+ | 1*SD_PREFER_SIBLING
+ | 0*SD_NUMA
+ | sd_flags
+ ,
+
+ .last_balance = jiffies,
+ .balance_interval = sd_weight,
+
+ /* 50% success rate */
+ .newidle_call = 512,
+ .newidle_success = 256,
+ .newidle_ratio = 512,
+
+ .max_newidle_lb_cost = 0,
+ .last_decay_max_lb_cost = jiffies,
+ .child = child,
+ .name = tl->name,
+ };
+
+ sd_span = sched_domain_span(sd);
+ cpumask_and(sd_span, cpu_map, tl->mask(tl, cpu));
+ sd_id = cpumask_first(sd_span);
+
+ sd->flags |= asym_cpu_capacity_classify(sd_span, cpu_map);
+
+ WARN_ONCE((sd->flags & (SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY)) ==
+ (SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY),
+ "CPU capacity asymmetry not supported on SMT\n");
+
+ /*
+ * Convert topological properties into behaviour.
+ */
+ /* Don't attempt to spread across CPUs of different capacities. */
+ if ((sd->flags & SD_ASYM_CPUCAPACITY) && sd->child)
+ sd->child->flags &= ~SD_PREFER_SIBLING;
+
+ if (sd->flags & SD_SHARE_CPUCAPACITY) {
+ sd->imbalance_pct = 110;
+
+ } else if (sd->flags & SD_SHARE_LLC) {
+ sd->imbalance_pct = 117;
+ sd->cache_nice_tries = 1;
+
+#ifdef CONFIG_NUMA
+ } else if (sd->flags & SD_NUMA) {
+ sd->cache_nice_tries = 2;
+
+ sd->flags &= ~SD_PREFER_SIBLING;
+ sd->flags |= SD_SERIALIZE;
+ if (sched_domains_numa_distance[tl->numa_level] > node_reclaim_distance) {
+ sd->flags &= ~(SD_BALANCE_EXEC |
+ SD_BALANCE_FORK |
+ SD_WAKE_AFFINE);
+ }
+
+#endif /* CONFIG_NUMA */
+ } else {
+ sd->cache_nice_tries = 1;
+ }
+
+ /*
+ * For all levels sharing cache; connect a sched_domain_shared
+ * instance.
+ */
+ if (sd->flags & SD_SHARE_LLC) {
+ sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
+ atomic_inc(&sd->shared->ref);
+ atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
+ }
+
+ sd->private = sdd;
+
+ return sd;
+}
+
+#ifdef CONFIG_SCHED_SMT
+int cpu_smt_flags(void)
+{
+ return SD_SHARE_CPUCAPACITY | SD_SHARE_LLC;
+}
+
+const struct cpumask *tl_smt_mask(struct sched_domain_topology_level *tl, int cpu)
+{
+ return cpu_smt_mask(cpu);
+}
+#endif
+
+#ifdef CONFIG_SCHED_CLUSTER
+int cpu_cluster_flags(void)
+{
+ return SD_CLUSTER | SD_SHARE_LLC;
+}
+
+const struct cpumask *tl_cls_mask(struct sched_domain_topology_level *tl, int cpu)
+{
+ return cpu_clustergroup_mask(cpu);
+}
+#endif
+
+#ifdef CONFIG_SCHED_MC
+int cpu_core_flags(void)
+{
+ return SD_SHARE_LLC;
+}
+
+const struct cpumask *tl_mc_mask(struct sched_domain_topology_level *tl, int cpu)
+{
+ return cpu_coregroup_mask(cpu);
+}
+#endif
+
+const struct cpumask *tl_pkg_mask(struct sched_domain_topology_level *tl, int cpu)
+{
+ return cpu_node_mask(cpu);
+}
+
+/*
+ * Topology list, bottom-up.
+ */
+static struct sched_domain_topology_level default_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+ SDTL_INIT(tl_smt_mask, cpu_smt_flags, SMT),
+#endif
+
+#ifdef CONFIG_SCHED_CLUSTER
+ SDTL_INIT(tl_cls_mask, cpu_cluster_flags, CLS),
+#endif
+
+#ifdef CONFIG_SCHED_MC
+ SDTL_INIT(tl_mc_mask, cpu_core_flags, MC),
+#endif
+ SDTL_INIT(tl_pkg_mask, NULL, PKG),
+ { NULL, },
+};
+
+static struct sched_domain_topology_level *sched_domain_topology =
+ default_topology;
+static struct sched_domain_topology_level *sched_domain_topology_saved;
+
+#define for_each_sd_topology(tl) \
+ for (tl = sched_domain_topology; tl->mask; tl++)
+
+void __init set_sched_topology(struct sched_domain_topology_level *tl)
+{
+ if (WARN_ON_ONCE(sched_smp_initialized))
+ return;
+
+ sched_domain_topology = tl;
+ sched_domain_topology_saved = NULL;
+}
+
+#ifdef CONFIG_NUMA
+static int cpu_numa_flags(void)
+{
+ return SD_NUMA;
+}
+
+static const struct cpumask *sd_numa_mask(struct sched_domain_topology_level *tl, int cpu)
+{
+ return sched_domains_numa_masks[tl->numa_level][cpu_to_node(cpu)];
+}
+
+static void sched_numa_warn(const char *str)
+{
+ static int done = false;
+ int i,j;
+
+ if (done)
+ return;
+
+ done = true;
+
+ printk(KERN_WARNING "ERROR: %s\n\n", str);
+
+ for (i = 0; i < nr_node_ids; i++) {
+ printk(KERN_WARNING " ");
+ for (j = 0; j < nr_node_ids; j++) {
+ if (!node_state(i, N_CPU) || !node_state(j, N_CPU))
+ printk(KERN_CONT "(%02d) ", node_distance(i,j));
+ else
+ printk(KERN_CONT " %02d ", node_distance(i,j));
+ }
+ printk(KERN_CONT "\n");
+ }
+ printk(KERN_WARNING "\n");
+}
+
+bool find_numa_distance(int distance)
+{
+ bool found = false;
+ int i, *distances;
+
+ if (distance == node_distance(0, 0))
+ return true;
+
+ rcu_read_lock();
+ distances = rcu_dereference(sched_numa_node_distance);
+ if (!distances)
+ goto unlock;
+ for (i = 0; i < sched_numa_node_levels; i++) {
+ if (distances[i] == distance) {
+ found = true;
+ break;
+ }
+ }
+unlock:
+ rcu_read_unlock();
+
+ return found;
+}
+
+#define for_each_cpu_node_but(n, nbut) \
+ for_each_node_state(n, N_CPU) \
+ if (n == nbut) \
+ continue; \
+ else
+
+/*
+ * A system can have three types of NUMA topology:
+ * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system
+ * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes
+ * NUMA_BACKPLANE: nodes can reach other nodes through a backplane
+ *
+ * The difference between a glueless mesh topology and a backplane
+ * topology lies in whether communication between not directly
+ * connected nodes goes through intermediary nodes (where programs
+ * could run), or through backplane controllers. This affects
+ * placement of programs.
+ *
+ * The type of topology can be discerned with the following tests:
+ * - If the maximum distance between any nodes is 1 hop, the system
+ * is directly connected.
+ * - If for two nodes A and B, located N > 1 hops away from each other,
+ * there is an intermediary node C, which is < N hops away from both
+ * nodes A and B, the system is a glueless mesh.
+ */
+static void init_numa_topology_type(int offline_node)
+{
+ int a, b, c, n;
+
+ n = sched_max_numa_distance;
+
+ if (sched_domains_numa_levels <= 2) {
+ sched_numa_topology_type = NUMA_DIRECT;
+ return;
+ }
+
+ for_each_cpu_node_but(a, offline_node) {
+ for_each_cpu_node_but(b, offline_node) {
+ /* Find two nodes furthest removed from each other. */
+ if (node_distance(a, b) < n)
+ continue;
+
+ /* Is there an intermediary node between a and b? */
+ for_each_cpu_node_but(c, offline_node) {
+ if (node_distance(a, c) < n &&
+ node_distance(b, c) < n) {
+ sched_numa_topology_type =
+ NUMA_GLUELESS_MESH;
+ return;
+ }
+ }
+
+ sched_numa_topology_type = NUMA_BACKPLANE;
+ return;
+ }
+ }
+
+ pr_err("Failed to find a NUMA topology type, defaulting to DIRECT\n");
+ sched_numa_topology_type = NUMA_DIRECT;
+}
+
+
+#define NR_DISTANCE_VALUES (1 << DISTANCE_BITS)
+
+/*
+ * An architecture could modify its NUMA distance, to change
+ * grouping of NUMA nodes and number of NUMA levels when creating
+ * NUMA level sched domains.
+ *
+ * A NUMA level is created for each unique
+ * arch_sched_node_distance.
+ */
+static int numa_node_dist(int i, int j)
+{
+ return node_distance(i, j);
+}
+
+int arch_sched_node_distance(int from, int to)
+ __weak __alias(numa_node_dist);
+
+static bool modified_sched_node_distance(void)
+{
+ return numa_node_dist != arch_sched_node_distance;
+}
+
+static int sched_record_numa_dist(int offline_node, int (*n_dist)(int, int),
+ int **dist, int *levels)
+{
+ unsigned long *distance_map __free(bitmap) = NULL;
+ int nr_levels = 0;
+ int i, j;
+ int *distances;
+
+ /*
+ * O(nr_nodes^2) de-duplicating selection sort -- in order to find the
+ * unique distances in the node_distance() table.
+ */
+ distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL);
+ if (!distance_map)
+ return -ENOMEM;
+
+ bitmap_zero(distance_map, NR_DISTANCE_VALUES);
+ for_each_cpu_node_but(i, offline_node) {
+ for_each_cpu_node_but(j, offline_node) {
+ int distance = n_dist(i, j);
+
+ if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) {
+ sched_numa_warn("Invalid distance value range");
+ return -EINVAL;
+ }
+
+ bitmap_set(distance_map, distance, 1);
+ }
+ }
+ /*
+ * We can now figure out how many unique distance values there are and
+ * allocate memory accordingly.
+ */
+ nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES);
+
+ distances = kcalloc(nr_levels, sizeof(int), GFP_KERNEL);
+ if (!distances)
+ return -ENOMEM;
+
+ for (i = 0, j = 0; i < nr_levels; i++, j++) {
+ j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j);
+ distances[i] = j;
+ }
+ *dist = distances;
+ *levels = nr_levels;
+
+ return 0;
+}
+
+void sched_init_numa(int offline_node)
+{
+ struct sched_domain_topology_level *tl;
+ int nr_levels, nr_node_levels;
+ int i, j;
+ int *distances, *domain_distances;
+ struct cpumask ***masks;
+
+ /* Record the NUMA distances from SLIT table */
+ if (sched_record_numa_dist(offline_node, numa_node_dist, &distances,
+ &nr_node_levels))
+ return;
+
+ /* Record modified NUMA distances for building sched domains */
+ if (modified_sched_node_distance()) {
+ if (sched_record_numa_dist(offline_node, arch_sched_node_distance,
+ &domain_distances, &nr_levels)) {
+ kfree(distances);
+ return;
+ }
+ } else {
+ domain_distances = distances;
+ nr_levels = nr_node_levels;
+ }
+ rcu_assign_pointer(sched_numa_node_distance, distances);
+ WRITE_ONCE(sched_max_numa_distance, distances[nr_node_levels - 1]);
+ WRITE_ONCE(sched_numa_node_levels, nr_node_levels);
+
+ /*
+ * 'nr_levels' contains the number of unique distances
+ *
+ * The sched_domains_numa_distance[] array includes the actual distance
+ * numbers.
+ */
+
+ /*
+ * Here, we should temporarily reset sched_domains_numa_levels to 0.
+ * If it fails to allocate memory for array sched_domains_numa_masks[][],
+ * the array will contain less then 'nr_levels' members. This could be
+ * dangerous when we use it to iterate array sched_domains_numa_masks[][]
+ * in other functions.
+ *
+ * We reset it to 'nr_levels' at the end of this function.
+ */
+ rcu_assign_pointer(sched_domains_numa_distance, domain_distances);
+
+ sched_domains_numa_levels = 0;
+
+ masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL);
+ if (!masks)
+ return;
+
+ /*
+ * Now for each level, construct a mask per node which contains all
+ * CPUs of nodes that are that many hops away from us.
+ */
+ for (i = 0; i < nr_levels; i++) {
+ masks[i] = kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
+ if (!masks[i])
+ return;
+
+ for_each_cpu_node_but(j, offline_node) {
+ struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
+ int k;
+
+ if (!mask)
+ return;
+
+ masks[i][j] = mask;
+
+ for_each_cpu_node_but(k, offline_node) {
+ if (sched_debug() &&
+ (arch_sched_node_distance(j, k) !=
+ arch_sched_node_distance(k, j)))
+ sched_numa_warn("Node-distance not symmetric");
+
+ if (arch_sched_node_distance(j, k) >
+ sched_domains_numa_distance[i])
+ continue;
+
+ cpumask_or(mask, mask, cpumask_of_node(k));
+ }
+ }
+ }
+ rcu_assign_pointer(sched_domains_numa_masks, masks);
+
+ /* Compute default topology size */
+ for (i = 0; sched_domain_topology[i].mask; i++);
+
+ tl = kzalloc((i + nr_levels + 1) *
+ sizeof(struct sched_domain_topology_level), GFP_KERNEL);
+ if (!tl)
+ return;
+
+ /*
+ * Copy the default topology bits..
+ */
+ for (i = 0; sched_domain_topology[i].mask; i++)
+ tl[i] = sched_domain_topology[i];
+
+ /*
+ * Add the NUMA identity distance, aka single NODE.
+ */
+ tl[i++] = SDTL_INIT(sd_numa_mask, NULL, NODE);
+
+ /*
+ * .. and append 'j' levels of NUMA goodness.
+ */
+ for (j = 1; j < nr_levels; i++, j++) {
+ tl[i] = SDTL_INIT(sd_numa_mask, cpu_numa_flags, NUMA);
+ tl[i].numa_level = j;
+ }
+
+ sched_domain_topology_saved = sched_domain_topology;
+ sched_domain_topology = tl;
+
+ sched_domains_numa_levels = nr_levels;
+
+ init_numa_topology_type(offline_node);
+}
+
+
+static void sched_reset_numa(void)
+{
+ int nr_levels, *distances, *dom_distances = NULL;
+ struct cpumask ***masks;
+
+ nr_levels = sched_domains_numa_levels;
+ sched_numa_node_levels = 0;
+ sched_domains_numa_levels = 0;
+ sched_max_numa_distance = 0;
+ sched_numa_topology_type = NUMA_DIRECT;
+ distances = sched_numa_node_distance;
+ if (sched_numa_node_distance != sched_domains_numa_distance)
+ dom_distances = sched_domains_numa_distance;
+ rcu_assign_pointer(sched_numa_node_distance, NULL);
+ rcu_assign_pointer(sched_domains_numa_distance, NULL);
+ masks = sched_domains_numa_masks;
+ rcu_assign_pointer(sched_domains_numa_masks, NULL);
+ if (distances || masks) {
+ int i, j;
+
+ synchronize_rcu();
+ kfree(distances);
+ kfree(dom_distances);
+ for (i = 0; i < nr_levels && masks; i++) {
+ if (!masks[i])
+ continue;
+ for_each_node(j)
+ kfree(masks[i][j]);
+ kfree(masks[i]);
+ }
+ kfree(masks);
+ }
+ if (sched_domain_topology_saved) {
+ kfree(sched_domain_topology);
+ sched_domain_topology = sched_domain_topology_saved;
+ sched_domain_topology_saved = NULL;
+ }
+}
+
+/*
+ * Call with hotplug lock held
+ */
+void sched_update_numa(int cpu, bool online)
+{
+ int node;
+
+ node = cpu_to_node(cpu);
+ /*
+ * Scheduler NUMA topology is updated when the first CPU of a
+ * node is onlined or the last CPU of a node is offlined.
+ */
+ if (cpumask_weight(cpumask_of_node(node)) != 1)
+ return;
+
+ sched_reset_numa();
+ sched_init_numa(online ? NUMA_NO_NODE : node);
+}
+
+void sched_domains_numa_masks_set(unsigned int cpu)
+{
+ int node = cpu_to_node(cpu);
+ int i, j;
+
+ for (i = 0; i < sched_domains_numa_levels; i++) {
+ for (j = 0; j < nr_node_ids; j++) {
+ if (!node_state(j, N_CPU))
+ continue;
+
+ /* Set ourselves in the remote node's masks */
+ if (arch_sched_node_distance(j, node) <=
+ sched_domains_numa_distance[i])
+ cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
+ }
+ }
+}
+
+void sched_domains_numa_masks_clear(unsigned int cpu)
+{
+ int i, j;
+
+ for (i = 0; i < sched_domains_numa_levels; i++) {
+ for (j = 0; j < nr_node_ids; j++) {
+ if (sched_domains_numa_masks[i][j])
+ cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
+ }
+ }
+}
+
+/*
+ * sched_numa_find_closest() - given the NUMA topology, find the cpu
+ * closest to @cpu from @cpumask.
+ * cpumask: cpumask to find a cpu from
+ * cpu: cpu to be close to
+ *
+ * returns: cpu, or nr_cpu_ids when nothing found.
+ */
+int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
+{
+ int i, j = cpu_to_node(cpu), found = nr_cpu_ids;
+ struct cpumask ***masks;
+
+ rcu_read_lock();
+ masks = rcu_dereference(sched_domains_numa_masks);
+ if (!masks)
+ goto unlock;
+ for (i = 0; i < sched_domains_numa_levels; i++) {
+ if (!masks[i][j])
+ break;
+ cpu = cpumask_any_and_distribute(cpus, masks[i][j]);
+ if (cpu < nr_cpu_ids) {
+ found = cpu;
+ break;
+ }
+ }
+unlock:
+ rcu_read_unlock();
+
+ return found;
+}
+
+struct __cmp_key {
+ const struct cpumask *cpus;
+ struct cpumask ***masks;
+ int node;
+ int cpu;
+ int w;
+};
+
+static int hop_cmp(const void *a, const void *b)
+{
+ struct cpumask **prev_hop, **cur_hop = *(struct cpumask ***)b;
+ struct __cmp_key *k = (struct __cmp_key *)a;
+
+ if (cpumask_weight_and(k->cpus, cur_hop[k->node]) <= k->cpu)
+ return 1;
+
+ if (b == k->masks) {
+ k->w = 0;
+ return 0;
+ }
+
+ prev_hop = *((struct cpumask ***)b - 1);
+ k->w = cpumask_weight_and(k->cpus, prev_hop[k->node]);
+ if (k->w <= k->cpu)
+ return 0;
+
+ return -1;
+}
+
+/**
+ * sched_numa_find_nth_cpu() - given the NUMA topology, find the Nth closest CPU
+ * from @cpus to @cpu, taking into account distance
+ * from a given @node.
+ * @cpus: cpumask to find a cpu from
+ * @cpu: CPU to start searching
+ * @node: NUMA node to order CPUs by distance
+ *
+ * Return: cpu, or nr_cpu_ids when nothing found.
+ */
+int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
+{
+ struct __cmp_key k = { .cpus = cpus, .cpu = cpu };
+ struct cpumask ***hop_masks;
+ int hop, ret = nr_cpu_ids;
+
+ if (node == NUMA_NO_NODE)
+ return cpumask_nth_and(cpu, cpus, cpu_online_mask);
+
+ rcu_read_lock();
+
+ /* CPU-less node entries are uninitialized in sched_domains_numa_masks */
+ node = numa_nearest_node(node, N_CPU);
+ k.node = node;
+
+ k.masks = rcu_dereference(sched_domains_numa_masks);
+ if (!k.masks)
+ goto unlock;
+
+ hop_masks = bsearch(&k, k.masks, sched_domains_numa_levels, sizeof(k.masks[0]), hop_cmp);
+ if (!hop_masks)
+ goto unlock;
+ hop = hop_masks - k.masks;
+
+ ret = hop ?
+ cpumask_nth_and_andnot(cpu - k.w, cpus, k.masks[hop][node], k.masks[hop-1][node]) :
+ cpumask_nth_and(cpu, cpus, k.masks[0][node]);
+unlock:
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(sched_numa_find_nth_cpu);
+
+/**
+ * sched_numa_hop_mask() - Get the cpumask of CPUs at most @hops hops away from
+ * @node
+ * @node: The node to count hops from.
+ * @hops: Include CPUs up to that many hops away. 0 means local node.
+ *
+ * Return: On success, a pointer to a cpumask of CPUs at most @hops away from
+ * @node, an error value otherwise.
+ *
+ * Requires rcu_lock to be held. Returned cpumask is only valid within that
+ * read-side section, copy it if required beyond that.
+ *
+ * Note that not all hops are equal in distance; see sched_init_numa() for how
+ * distances and masks are handled.
+ * Also note that this is a reflection of sched_domains_numa_masks, which may change
+ * during the lifetime of the system (offline nodes are taken out of the masks).
+ */
+const struct cpumask *sched_numa_hop_mask(unsigned int node, unsigned int hops)
+{
+ struct cpumask ***masks;
+
+ if (node >= nr_node_ids || hops >= sched_domains_numa_levels)
+ return ERR_PTR(-EINVAL);
+
+ masks = rcu_dereference(sched_domains_numa_masks);
+ if (!masks)
+ return ERR_PTR(-EBUSY);
+
+ return masks[hops][node];
+}
+EXPORT_SYMBOL_GPL(sched_numa_hop_mask);
+
+#endif /* CONFIG_NUMA */
+
+static int __sdt_alloc(const struct cpumask *cpu_map)
+{
+ struct sched_domain_topology_level *tl;
+ int j;
+
+ for_each_sd_topology(tl) {
+ struct sd_data *sdd = &tl->data;
+
+ sdd->sd = alloc_percpu(struct sched_domain *);
+ if (!sdd->sd)
+ return -ENOMEM;
+
+ sdd->sds = alloc_percpu(struct sched_domain_shared *);
+ if (!sdd->sds)
+ return -ENOMEM;
+
+ sdd->sg = alloc_percpu(struct sched_group *);
+ if (!sdd->sg)
+ return -ENOMEM;
+
+ sdd->sgc = alloc_percpu(struct sched_group_capacity *);
+ if (!sdd->sgc)
+ return -ENOMEM;
+
+ for_each_cpu(j, cpu_map) {
+ struct sched_domain *sd;
+ struct sched_domain_shared *sds;
+ struct sched_group *sg;
+ struct sched_group_capacity *sgc;
+
+ sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
+ GFP_KERNEL, cpu_to_node(j));
+ if (!sd)
+ return -ENOMEM;
+
+ *per_cpu_ptr(sdd->sd, j) = sd;
+
+ sds = kzalloc_node(sizeof(struct sched_domain_shared),
+ GFP_KERNEL, cpu_to_node(j));
+ if (!sds)
+ return -ENOMEM;
+
+ *per_cpu_ptr(sdd->sds, j) = sds;
+
+ sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
+ GFP_KERNEL, cpu_to_node(j));
+ if (!sg)
+ return -ENOMEM;
+
+ sg->next = sg;
+
+ *per_cpu_ptr(sdd->sg, j) = sg;
+
+ sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(),
+ GFP_KERNEL, cpu_to_node(j));
+ if (!sgc)
+ return -ENOMEM;
+
+ sgc->id = j;
+
+ *per_cpu_ptr(sdd->sgc, j) = sgc;
+ }
+ }
+
+ return 0;
+}
+
+static void __sdt_free(const struct cpumask *cpu_map)
+{
+ struct sched_domain_topology_level *tl;
+ int j;
+
+ for_each_sd_topology(tl) {
+ struct sd_data *sdd = &tl->data;
+
+ for_each_cpu(j, cpu_map) {
+ struct sched_domain *sd;
+
+ if (sdd->sd) {
+ sd = *per_cpu_ptr(sdd->sd, j);
+ if (sd && (sd->flags & SD_NUMA))
+ free_sched_groups(sd->groups, 0);
+ kfree(*per_cpu_ptr(sdd->sd, j));
+ }
+
+ if (sdd->sds)
+ kfree(*per_cpu_ptr(sdd->sds, j));
+ if (sdd->sg)
+ kfree(*per_cpu_ptr(sdd->sg, j));
+ if (sdd->sgc)
+ kfree(*per_cpu_ptr(sdd->sgc, j));
+ }
+ free_percpu(sdd->sd);
+ sdd->sd = NULL;
+ free_percpu(sdd->sds);
+ sdd->sds = NULL;
+ free_percpu(sdd->sg);
+ sdd->sg = NULL;
+ free_percpu(sdd->sgc);
+ sdd->sgc = NULL;
+ }
+}
+
+static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
+ const struct cpumask *cpu_map, struct sched_domain_attr *attr,
+ struct sched_domain *child, int cpu)
+{
+ struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu);
+
+ if (child) {
+ sd->level = child->level + 1;
+ sched_domain_level_max = max(sched_domain_level_max, sd->level);
+ child->parent = sd;
+
+ if (!cpumask_subset(sched_domain_span(child),
+ sched_domain_span(sd))) {
+ pr_err("BUG: arch topology borken\n");
+ pr_err(" the %s domain not a subset of the %s domain\n",
+ child->name, sd->name);
+ /* Fixup, ensure @sd has at least @child CPUs. */
+ cpumask_or(sched_domain_span(sd),
+ sched_domain_span(sd),
+ sched_domain_span(child));
+ }
+
+ }
+ set_domain_attribute(sd, attr);
+
+ return sd;
+}
+
+/*
+ * Ensure topology masks are sane, i.e. there are no conflicts (overlaps) for
+ * any two given CPUs on non-NUMA topology levels.
+ */
+static bool topology_span_sane(const struct cpumask *cpu_map)
+{
+ struct sched_domain_topology_level *tl;
+ struct cpumask *covered, *id_seen;
+ int cpu;
+
+ lockdep_assert_held(&sched_domains_mutex);
+ covered = sched_domains_tmpmask;
+ id_seen = sched_domains_tmpmask2;
+
+ for_each_sd_topology(tl) {
+ int tl_common_flags = 0;
+
+ if (tl->sd_flags)
+ tl_common_flags = (*tl->sd_flags)();
+
+ /* NUMA levels are allowed to overlap */
+ if (tl_common_flags & SD_NUMA)
+ continue;
+
+ cpumask_clear(covered);
+ cpumask_clear(id_seen);
+
+ /*
+ * Non-NUMA levels cannot partially overlap - they must be either
+ * completely equal or completely disjoint. Otherwise we can end up
+ * breaking the sched_group lists - i.e. a later get_group() pass
+ * breaks the linking done for an earlier span.
+ */
+ for_each_cpu(cpu, cpu_map) {
+ const struct cpumask *tl_cpu_mask = tl->mask(tl, cpu);
+ int id;
+
+ /* lowest bit set in this mask is used as a unique id */
+ id = cpumask_first(tl_cpu_mask);
+
+ if (cpumask_test_cpu(id, id_seen)) {
+ /* First CPU has already been seen, ensure identical spans */
+ if (!cpumask_equal(tl->mask(tl, id), tl_cpu_mask))
+ return false;
+ } else {
+ /* First CPU hasn't been seen before, ensure it's a completely new span */
+ if (cpumask_intersects(tl_cpu_mask, covered))
+ return false;
+
+ cpumask_or(covered, covered, tl_cpu_mask);
+ cpumask_set_cpu(id, id_seen);
+ }
+ }
+ }
+ return true;
+}
+
+/*
+ * Build sched domains for a given set of CPUs and attach the sched domains
+ * to the individual CPUs
+ */
+static int
+build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr)
+{
+ enum s_alloc alloc_state = sa_none;
+ struct sched_domain *sd;
+ struct s_data d;
+ struct rq *rq = NULL;
+ int i, ret = -ENOMEM;
+ bool has_asym = false;
+ bool has_cluster = false;
+
+ if (WARN_ON(cpumask_empty(cpu_map)))
+ goto error;
+
+ alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
+ if (alloc_state != sa_rootdomain)
+ goto error;
+
+ /* Set up domains for CPUs specified by the cpu_map: */
+ for_each_cpu(i, cpu_map) {
+ struct sched_domain_topology_level *tl;
+
+ sd = NULL;
+ for_each_sd_topology(tl) {
+
+ sd = build_sched_domain(tl, cpu_map, attr, sd, i);
+
+ has_asym |= sd->flags & SD_ASYM_CPUCAPACITY;
+
+ if (tl == sched_domain_topology)
+ *per_cpu_ptr(d.sd, i) = sd;
+ if (cpumask_equal(cpu_map, sched_domain_span(sd)))
+ break;
+ }
+ }
+
+ if (WARN_ON(!topology_span_sane(cpu_map)))
+ goto error;
+
+ /* Build the groups for the domains */
+ for_each_cpu(i, cpu_map) {
+ for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+ sd->span_weight = cpumask_weight(sched_domain_span(sd));
+ if (sd->flags & SD_NUMA) {
+ if (build_overlap_sched_groups(sd, i))
+ goto error;
+ } else {
+ if (build_sched_groups(sd, i))
+ goto error;
+ }
+ }
+ }
+
+ /*
+ * Calculate an allowed NUMA imbalance such that LLCs do not get
+ * imbalanced.
+ */
+ for_each_cpu(i, cpu_map) {
+ unsigned int imb = 0;
+ unsigned int imb_span = 1;
+
+ for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+ struct sched_domain *child = sd->child;
+
+ if (!(sd->flags & SD_SHARE_LLC) && child &&
+ (child->flags & SD_SHARE_LLC)) {
+ struct sched_domain __rcu *top_p;
+ unsigned int nr_llcs;
+
+ /*
+ * For a single LLC per node, allow an
+ * imbalance up to 12.5% of the node. This is
+ * arbitrary cutoff based two factors -- SMT and
+ * memory channels. For SMT-2, the intent is to
+ * avoid premature sharing of HT resources but
+ * SMT-4 or SMT-8 *may* benefit from a different
+ * cutoff. For memory channels, this is a very
+ * rough estimate of how many channels may be
+ * active and is based on recent CPUs with
+ * many cores.
+ *
+ * For multiple LLCs, allow an imbalance
+ * until multiple tasks would share an LLC
+ * on one node while LLCs on another node
+ * remain idle. This assumes that there are
+ * enough logical CPUs per LLC to avoid SMT
+ * factors and that there is a correlation
+ * between LLCs and memory channels.
+ */
+ nr_llcs = sd->span_weight / child->span_weight;
+ if (nr_llcs == 1)
+ imb = sd->span_weight >> 3;
+ else
+ imb = nr_llcs;
+ imb = max(1U, imb);
+ sd->imb_numa_nr = imb;
+
+ /* Set span based on the first NUMA domain. */
+ top_p = sd->parent;
+ while (top_p && !(top_p->flags & SD_NUMA)) {
+ top_p = top_p->parent;
+ }
+ imb_span = top_p ? top_p->span_weight : sd->span_weight;
+ } else {
+ int factor = max(1U, (sd->span_weight / imb_span));
+
+ sd->imb_numa_nr = imb * factor;
+ }
+ }
+ }
+
+ /* Calculate CPU capacity for physical packages and nodes */
+ for (i = nr_cpumask_bits-1; i >= 0; i--) {
+ if (!cpumask_test_cpu(i, cpu_map))
+ continue;
+
+ for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+ claim_allocations(i, sd);
+ init_sched_groups_capacity(i, sd);
+ }
+ }
+
+ /* Attach the domains */
+ rcu_read_lock();
+ for_each_cpu(i, cpu_map) {
+ rq = cpu_rq(i);
+ sd = *per_cpu_ptr(d.sd, i);
+
+ cpu_attach_domain(sd, d.rd, i);
+
+ if (lowest_flag_domain(i, SD_CLUSTER))
+ has_cluster = true;
+ }
+ rcu_read_unlock();
+
+ if (has_asym)
+ static_branch_inc_cpuslocked(&sched_asym_cpucapacity);
+
+ if (has_cluster)
+ static_branch_inc_cpuslocked(&sched_cluster_active);
+
+ if (rq && sched_debug_verbose)
+ pr_info("root domain span: %*pbl\n", cpumask_pr_args(cpu_map));
+
+ ret = 0;
+error:
+ __free_domain_allocs(&d, alloc_state, cpu_map);
+
+ return ret;
+}
+
+/* Current sched domains: */
+static cpumask_var_t *doms_cur;
+
+/* Number of sched domains in 'doms_cur': */
+static int ndoms_cur;
+
+/* Attributes of custom domains in 'doms_cur' */
+static struct sched_domain_attr *dattr_cur;
+
+/*
+ * Special case: If a kmalloc() of a doms_cur partition (array of
+ * cpumask) fails, then fallback to a single sched domain,
+ * as determined by the single cpumask fallback_doms.
+ */
+static cpumask_var_t fallback_doms;
+
+/*
+ * arch_update_cpu_topology lets virtualized architectures update the
+ * CPU core maps. It is supposed to return 1 if the topology changed
+ * or 0 if it stayed the same.
+ */
+int __weak arch_update_cpu_topology(void)
+{
+ return 0;
+}
+
+cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
+{
+ int i;
+ cpumask_var_t *doms;
+
+ doms = kmalloc_array(ndoms, sizeof(*doms), GFP_KERNEL);
+ if (!doms)
+ return NULL;
+ for (i = 0; i < ndoms; i++) {
+ if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
+ free_sched_domains(doms, i);
+ return NULL;
+ }
+ }
+ return doms;
+}
+
+void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
+{
+ unsigned int i;
+ for (i = 0; i < ndoms; i++)
+ free_cpumask_var(doms[i]);
+ kfree(doms);
+}
+
+/*
+ * Set up scheduler domains and groups. For now this just excludes isolated
+ * CPUs, but could be used to exclude other special cases in the future.
+ */
+int __init sched_init_domains(const struct cpumask *cpu_map)
+{
+ int err;
+
+ zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL);
+ zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL);
+ zalloc_cpumask_var(&fallback_doms, GFP_KERNEL);
+
+ arch_update_cpu_topology();
+ asym_cpu_capacity_scan();
+ ndoms_cur = 1;
+ doms_cur = alloc_sched_domains(ndoms_cur);
+ if (!doms_cur)
+ doms_cur = &fallback_doms;
+ cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_TYPE_DOMAIN));
+ err = build_sched_domains(doms_cur[0], NULL);
+
+ return err;
+}
+
+/*
+ * Detach sched domains from a group of CPUs specified in cpu_map
+ * These CPUs will now be attached to the NULL domain
+ */
+static void detach_destroy_domains(const struct cpumask *cpu_map)
+{
+ unsigned int cpu = cpumask_any(cpu_map);
+ int i;
+
+ if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, cpu)))
+ static_branch_dec_cpuslocked(&sched_asym_cpucapacity);
+
+ if (static_branch_unlikely(&sched_cluster_active))
+ static_branch_dec_cpuslocked(&sched_cluster_active);
+
+ rcu_read_lock();
+ for_each_cpu(i, cpu_map)
+ cpu_attach_domain(NULL, &def_root_domain, i);
+ rcu_read_unlock();
+}
+
+/* handle null as "default" */
+static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
+ struct sched_domain_attr *new, int idx_new)
+{
+ struct sched_domain_attr tmp;
+
+ /* Fast path: */
+ if (!new && !cur)
+ return 1;
+
+ tmp = SD_ATTR_INIT;
+
+ return !memcmp(cur ? (cur + idx_cur) : &tmp,
+ new ? (new + idx_new) : &tmp,
+ sizeof(struct sched_domain_attr));
+}
+
+/*
+ * Partition sched domains as specified by the 'ndoms_new'
+ * cpumasks in the array doms_new[] of cpumasks. This compares
+ * doms_new[] to the current sched domain partitioning, doms_cur[].
+ * It destroys each deleted domain and builds each new domain.
+ *
+ * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
+ * The masks don't intersect (don't overlap.) We should setup one
+ * sched domain for each mask. CPUs not in any of the cpumasks will
+ * not be load balanced. If the same cpumask appears both in the
+ * current 'doms_cur' domains and in the new 'doms_new', we can leave
+ * it as it is.
+ *
+ * The passed in 'doms_new' should be allocated using
+ * alloc_sched_domains. This routine takes ownership of it and will
+ * free_sched_domains it when done with it. If the caller failed the
+ * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
+ * and partition_sched_domains() will fallback to the single partition
+ * 'fallback_doms', it also forces the domains to be rebuilt.
+ *
+ * If doms_new == NULL it will be replaced with cpu_online_mask.
+ * ndoms_new == 0 is a special case for destroying existing domains,
+ * and it will not create the default domain.
+ *
+ * Call with hotplug lock and sched_domains_mutex held
+ */
+static void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
+ struct sched_domain_attr *dattr_new)
+{
+ bool __maybe_unused has_eas = false;
+ int i, j, n;
+ int new_topology;
+
+ lockdep_assert_held(&sched_domains_mutex);
+
+ /* Let the architecture update CPU core mappings: */
+ new_topology = arch_update_cpu_topology();
+ /* Trigger rebuilding CPU capacity asymmetry data */
+ if (new_topology)
+ asym_cpu_capacity_scan();
+
+ if (!doms_new) {
+ WARN_ON_ONCE(dattr_new);
+ n = 0;
+ doms_new = alloc_sched_domains(1);
+ if (doms_new) {
+ n = 1;
+ cpumask_and(doms_new[0], cpu_active_mask,
+ housekeeping_cpumask(HK_TYPE_DOMAIN));
+ }
+ } else {
+ n = ndoms_new;
+ }
+
+ /* Destroy deleted domains: */
+ for (i = 0; i < ndoms_cur; i++) {
+ for (j = 0; j < n && !new_topology; j++) {
+ if (cpumask_equal(doms_cur[i], doms_new[j]) &&
+ dattrs_equal(dattr_cur, i, dattr_new, j))
+ goto match1;
+ }
+ /* No match - a current sched domain not in new doms_new[] */
+ detach_destroy_domains(doms_cur[i]);
+match1:
+ ;
+ }
+
+ n = ndoms_cur;
+ if (!doms_new) {
+ n = 0;
+ doms_new = &fallback_doms;
+ cpumask_and(doms_new[0], cpu_active_mask,
+ housekeeping_cpumask(HK_TYPE_DOMAIN));
+ }
+
+ /* Build new domains: */
+ for (i = 0; i < ndoms_new; i++) {
+ for (j = 0; j < n && !new_topology; j++) {
+ if (cpumask_equal(doms_new[i], doms_cur[j]) &&
+ dattrs_equal(dattr_new, i, dattr_cur, j))
+ goto match2;
+ }
+ /* No match - add a new doms_new */
+ build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
+match2:
+ ;
+ }
+
+#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
+ /* Build perf domains: */
+ for (i = 0; i < ndoms_new; i++) {
+ for (j = 0; j < n && !sched_energy_update; j++) {
+ if (cpumask_equal(doms_new[i], doms_cur[j]) &&
+ cpu_rq(cpumask_first(doms_cur[j]))->rd->pd) {
+ has_eas = true;
+ goto match3;
+ }
+ }
+ /* No match - add perf domains for a new rd */
+ has_eas |= build_perf_domains(doms_new[i]);
+match3:
+ ;
+ }
+ sched_energy_set(has_eas);
+#endif
+
+ /* Remember the new sched domains: */
+ if (doms_cur != &fallback_doms)
+ free_sched_domains(doms_cur, ndoms_cur);
+
+ kfree(dattr_cur);
+ doms_cur = doms_new;
+ dattr_cur = dattr_new;
+ ndoms_cur = ndoms_new;
+
+ update_sched_domain_debugfs();
+ dl_rebuild_rd_accounting();
+}
+
+/*
+ * Call with hotplug lock held
+ */
+void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
+ struct sched_domain_attr *dattr_new)
+{
+ sched_domains_mutex_lock();
+ partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
+ sched_domains_mutex_unlock();
+}
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 852143a79f36..20f27e2cf7ae 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -1,122 +1,176 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Generic waiting primitives.
*
* (C) 2004 Nadia Yvette Chambers, Oracle
*/
-#include <linux/init.h>
-#include <linux/export.h>
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/wait.h>
-#include <linux/hash.h>
-#include <linux/kthread.h>
-
-void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key)
+#include "sched.h"
+
+void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key)
{
- spin_lock_init(&q->lock);
- lockdep_set_class_and_name(&q->lock, key, name);
- INIT_LIST_HEAD(&q->task_list);
+ spin_lock_init(&wq_head->lock);
+ lockdep_set_class_and_name(&wq_head->lock, key, name);
+ INIT_LIST_HEAD(&wq_head->head);
}
EXPORT_SYMBOL(__init_waitqueue_head);
-void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
+void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
unsigned long flags;
- wait->flags &= ~WQ_FLAG_EXCLUSIVE;
- spin_lock_irqsave(&q->lock, flags);
- __add_wait_queue(q, wait);
- spin_unlock_irqrestore(&q->lock, flags);
+ wq_entry->flags &= ~WQ_FLAG_EXCLUSIVE;
+ spin_lock_irqsave(&wq_head->lock, flags);
+ __add_wait_queue(wq_head, wq_entry);
+ spin_unlock_irqrestore(&wq_head->lock, flags);
}
EXPORT_SYMBOL(add_wait_queue);
-void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait)
+void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
unsigned long flags;
- wait->flags |= WQ_FLAG_EXCLUSIVE;
- spin_lock_irqsave(&q->lock, flags);
- __add_wait_queue_tail(q, wait);
- spin_unlock_irqrestore(&q->lock, flags);
+ wq_entry->flags |= WQ_FLAG_EXCLUSIVE;
+ spin_lock_irqsave(&wq_head->lock, flags);
+ __add_wait_queue_entry_tail(wq_head, wq_entry);
+ spin_unlock_irqrestore(&wq_head->lock, flags);
}
EXPORT_SYMBOL(add_wait_queue_exclusive);
-void remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
+void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
unsigned long flags;
- spin_lock_irqsave(&q->lock, flags);
- __remove_wait_queue(q, wait);
- spin_unlock_irqrestore(&q->lock, flags);
+ wq_entry->flags |= WQ_FLAG_PRIORITY;
+ spin_lock_irqsave(&wq_head->lock, flags);
+ __add_wait_queue(wq_head, wq_entry);
+ spin_unlock_irqrestore(&wq_head->lock, flags);
}
-EXPORT_SYMBOL(remove_wait_queue);
+EXPORT_SYMBOL_GPL(add_wait_queue_priority);
+
+int add_wait_queue_priority_exclusive(struct wait_queue_head *wq_head,
+ struct wait_queue_entry *wq_entry)
+{
+ struct list_head *head = &wq_head->head;
+
+ wq_entry->flags |= WQ_FLAG_EXCLUSIVE | WQ_FLAG_PRIORITY;
+
+ guard(spinlock_irqsave)(&wq_head->lock);
+
+ if (!list_empty(head) &&
+ (list_first_entry(head, typeof(*wq_entry), entry)->flags & WQ_FLAG_PRIORITY))
+ return -EBUSY;
+
+ list_add(&wq_entry->entry, head);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(add_wait_queue_priority_exclusive);
+
+void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&wq_head->lock, flags);
+ __remove_wait_queue(wq_head, wq_entry);
+ spin_unlock_irqrestore(&wq_head->lock, flags);
+}
+EXPORT_SYMBOL(remove_wait_queue);
/*
* The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
* wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
- * number) then we wake all the non-exclusive tasks and one exclusive task.
+ * number) then we wake that number of exclusive tasks, and potentially all
+ * the non-exclusive tasks. Normally, exclusive tasks will be at the end of
+ * the list and any non-exclusive tasks will be woken first. A priority task
+ * may be at the head of the list, and can consume the event without any other
+ * tasks being woken if it's also an exclusive task.
*
* There are circumstances in which we can try to wake a task which has already
* started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
* zero in this (rare) case, and we handle it by continuing to scan the queue.
*/
-static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
+static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
int nr_exclusive, int wake_flags, void *key)
{
- wait_queue_t *curr, *next;
+ wait_queue_entry_t *curr, *next;
+
+ lockdep_assert_held(&wq_head->lock);
+
+ curr = list_first_entry(&wq_head->head, wait_queue_entry_t, entry);
+
+ if (&curr->entry == &wq_head->head)
+ return nr_exclusive;
- list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
+ list_for_each_entry_safe_from(curr, next, &wq_head->head, entry) {
unsigned flags = curr->flags;
+ int ret;
- if (curr->func(curr, mode, wake_flags, key) &&
- (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
+ ret = curr->func(curr, mode, wake_flags, key);
+ if (ret < 0)
+ break;
+ if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
break;
}
+
+ return nr_exclusive;
+}
+
+static int __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int mode,
+ int nr_exclusive, int wake_flags, void *key)
+{
+ unsigned long flags;
+ int remaining;
+
+ spin_lock_irqsave(&wq_head->lock, flags);
+ remaining = __wake_up_common(wq_head, mode, nr_exclusive, wake_flags,
+ key);
+ spin_unlock_irqrestore(&wq_head->lock, flags);
+
+ return nr_exclusive - remaining;
}
/**
* __wake_up - wake up threads blocked on a waitqueue.
- * @q: the waitqueue
+ * @wq_head: the waitqueue
* @mode: which threads
* @nr_exclusive: how many wake-one or wake-many threads to wake up
* @key: is directly passed to the wakeup function
*
- * It may be assumed that this function implies a write memory barrier before
- * changing the task state if and only if any tasks are woken up.
+ * If this function wakes up a task, it executes a full memory barrier
+ * before accessing the task state. Returns the number of exclusive
+ * tasks that were awaken.
*/
-void __wake_up(wait_queue_head_t *q, unsigned int mode,
- int nr_exclusive, void *key)
+int __wake_up(struct wait_queue_head *wq_head, unsigned int mode,
+ int nr_exclusive, void *key)
{
- unsigned long flags;
-
- spin_lock_irqsave(&q->lock, flags);
- __wake_up_common(q, mode, nr_exclusive, 0, key);
- spin_unlock_irqrestore(&q->lock, flags);
+ return __wake_up_common_lock(wq_head, mode, nr_exclusive, 0, key);
}
EXPORT_SYMBOL(__wake_up);
+void __wake_up_on_current_cpu(struct wait_queue_head *wq_head, unsigned int mode, void *key)
+{
+ __wake_up_common_lock(wq_head, mode, 1, WF_CURRENT_CPU, key);
+}
+
/*
* Same as __wake_up but called with the spinlock in wait_queue_head_t held.
*/
-void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
+void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr)
{
- __wake_up_common(q, mode, nr, 0, NULL);
+ __wake_up_common(wq_head, mode, nr, 0, NULL);
}
EXPORT_SYMBOL_GPL(__wake_up_locked);
-void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
+void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key)
{
- __wake_up_common(q, mode, 1, 0, key);
+ __wake_up_common(wq_head, mode, 1, 0, key);
}
EXPORT_SYMBOL_GPL(__wake_up_locked_key);
/**
* __wake_up_sync_key - wake up threads blocked on a waitqueue.
- * @q: the waitqueue
+ * @wq_head: the waitqueue
* @mode: which threads
- * @nr_exclusive: how many wake-one or wake-many threads to wake up
* @key: opaque value to be passed to wakeup targets
*
* The sync wakeup differs that the waker knows that it will schedule
@@ -126,36 +180,58 @@ EXPORT_SYMBOL_GPL(__wake_up_locked_key);
*
* On UP it can prevent extra preemption.
*
- * It may be assumed that this function implies a write memory barrier before
- * changing the task state if and only if any tasks are woken up.
+ * If this function wakes up a task, it executes a full memory barrier before
+ * accessing the task state.
*/
-void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
- int nr_exclusive, void *key)
+void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode,
+ void *key)
{
- unsigned long flags;
- int wake_flags = 1; /* XXX WF_SYNC */
-
- if (unlikely(!q))
+ if (unlikely(!wq_head))
return;
- if (unlikely(nr_exclusive != 1))
- wake_flags = 0;
-
- spin_lock_irqsave(&q->lock, flags);
- __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
- spin_unlock_irqrestore(&q->lock, flags);
+ __wake_up_common_lock(wq_head, mode, 1, WF_SYNC, key);
}
EXPORT_SYMBOL_GPL(__wake_up_sync_key);
+/**
+ * __wake_up_locked_sync_key - wake up a thread blocked on a locked waitqueue.
+ * @wq_head: the waitqueue
+ * @mode: which threads
+ * @key: opaque value to be passed to wakeup targets
+ *
+ * The sync wakeup differs in that the waker knows that it will schedule
+ * away soon, so while the target thread will be woken up, it will not
+ * be migrated to another CPU - ie. the two threads are 'synchronized'
+ * with each other. This can prevent needless bouncing between CPUs.
+ *
+ * On UP it can prevent extra preemption.
+ *
+ * If this function wakes up a task, it executes a full memory barrier before
+ * accessing the task state.
+ */
+void __wake_up_locked_sync_key(struct wait_queue_head *wq_head,
+ unsigned int mode, void *key)
+{
+ __wake_up_common(wq_head, mode, 1, WF_SYNC, key);
+}
+EXPORT_SYMBOL_GPL(__wake_up_locked_sync_key);
+
/*
* __wake_up_sync - see __wake_up_sync_key()
*/
-void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
+void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode)
{
- __wake_up_sync_key(q, mode, nr_exclusive, NULL);
+ __wake_up_sync_key(wq_head, mode, NULL);
}
EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
+void __wake_up_pollfree(struct wait_queue_head *wq_head)
+{
+ __wake_up(wq_head, TASK_NORMAL, 0, poll_to_key(EPOLLHUP | POLLFREE));
+ /* POLLFREE must have cleared the queue. */
+ WARN_ON_ONCE(waitqueue_active(wq_head));
+}
+
/*
* Note: we use "set_current_state()" _after_ the wait-queue add,
* because we need a memory barrier there on SMP, so that any
@@ -169,67 +245,134 @@ EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
* loads to move into the critical region).
*/
void
-prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state)
+prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state)
{
unsigned long flags;
- wait->flags &= ~WQ_FLAG_EXCLUSIVE;
- spin_lock_irqsave(&q->lock, flags);
- if (list_empty(&wait->task_list))
- __add_wait_queue(q, wait);
+ wq_entry->flags &= ~WQ_FLAG_EXCLUSIVE;
+ spin_lock_irqsave(&wq_head->lock, flags);
+ if (list_empty(&wq_entry->entry))
+ __add_wait_queue(wq_head, wq_entry);
set_current_state(state);
- spin_unlock_irqrestore(&q->lock, flags);
+ spin_unlock_irqrestore(&wq_head->lock, flags);
}
EXPORT_SYMBOL(prepare_to_wait);
-void
-prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
+/* Returns true if we are the first waiter in the queue, false otherwise. */
+bool
+prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state)
{
unsigned long flags;
+ bool was_empty = false;
- wait->flags |= WQ_FLAG_EXCLUSIVE;
- spin_lock_irqsave(&q->lock, flags);
- if (list_empty(&wait->task_list))
- __add_wait_queue_tail(q, wait);
+ wq_entry->flags |= WQ_FLAG_EXCLUSIVE;
+ spin_lock_irqsave(&wq_head->lock, flags);
+ if (list_empty(&wq_entry->entry)) {
+ was_empty = list_empty(&wq_head->head);
+ __add_wait_queue_entry_tail(wq_head, wq_entry);
+ }
set_current_state(state);
- spin_unlock_irqrestore(&q->lock, flags);
+ spin_unlock_irqrestore(&wq_head->lock, flags);
+ return was_empty;
}
EXPORT_SYMBOL(prepare_to_wait_exclusive);
-long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state)
+void init_wait_entry(struct wait_queue_entry *wq_entry, int flags)
+{
+ wq_entry->flags = flags;
+ wq_entry->private = current;
+ wq_entry->func = autoremove_wake_function;
+ INIT_LIST_HEAD(&wq_entry->entry);
+}
+EXPORT_SYMBOL(init_wait_entry);
+
+long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state)
{
unsigned long flags;
+ long ret = 0;
+
+ spin_lock_irqsave(&wq_head->lock, flags);
+ if (signal_pending_state(state, current)) {
+ /*
+ * Exclusive waiter must not fail if it was selected by wakeup,
+ * it should "consume" the condition we were waiting for.
+ *
+ * The caller will recheck the condition and return success if
+ * we were already woken up, we can not miss the event because
+ * wakeup locks/unlocks the same wq_head->lock.
+ *
+ * But we need to ensure that set-condition + wakeup after that
+ * can't see us, it should wake up another exclusive waiter if
+ * we fail.
+ */
+ list_del_init(&wq_entry->entry);
+ ret = -ERESTARTSYS;
+ } else {
+ if (list_empty(&wq_entry->entry)) {
+ if (wq_entry->flags & WQ_FLAG_EXCLUSIVE)
+ __add_wait_queue_entry_tail(wq_head, wq_entry);
+ else
+ __add_wait_queue(wq_head, wq_entry);
+ }
+ set_current_state(state);
+ }
+ spin_unlock_irqrestore(&wq_head->lock, flags);
- if (signal_pending_state(state, current))
+ return ret;
+}
+EXPORT_SYMBOL(prepare_to_wait_event);
+
+/*
+ * Note! These two wait functions are entered with the
+ * wait-queue lock held (and interrupts off in the _irq
+ * case), so there is no race with testing the wakeup
+ * condition in the caller before they add the wait
+ * entry to the wake queue.
+ */
+int do_wait_intr(wait_queue_head_t *wq, wait_queue_entry_t *wait)
+{
+ if (likely(list_empty(&wait->entry)))
+ __add_wait_queue_entry_tail(wq, wait);
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (signal_pending(current))
return -ERESTARTSYS;
- wait->private = current;
- wait->func = autoremove_wake_function;
+ spin_unlock(&wq->lock);
+ schedule();
+ spin_lock(&wq->lock);
- spin_lock_irqsave(&q->lock, flags);
- if (list_empty(&wait->task_list)) {
- if (wait->flags & WQ_FLAG_EXCLUSIVE)
- __add_wait_queue_tail(q, wait);
- else
- __add_wait_queue(q, wait);
- }
- set_current_state(state);
- spin_unlock_irqrestore(&q->lock, flags);
+ return 0;
+}
+EXPORT_SYMBOL(do_wait_intr);
+
+int do_wait_intr_irq(wait_queue_head_t *wq, wait_queue_entry_t *wait)
+{
+ if (likely(list_empty(&wait->entry)))
+ __add_wait_queue_entry_tail(wq, wait);
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (signal_pending(current))
+ return -ERESTARTSYS;
+
+ spin_unlock_irq(&wq->lock);
+ schedule();
+ spin_lock_irq(&wq->lock);
return 0;
}
-EXPORT_SYMBOL(prepare_to_wait_event);
+EXPORT_SYMBOL(do_wait_intr_irq);
/**
* finish_wait - clean up after waiting in a queue
- * @q: waitqueue waited on
- * @wait: wait descriptor
+ * @wq_head: waitqueue waited on
+ * @wq_entry: wait descriptor
*
* Sets current thread back to running state and removes
* the wait descriptor from the given waitqueue if still
* queued.
*/
-void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
+void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
unsigned long flags;
@@ -247,378 +390,76 @@ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
* have _one_ other CPU that looks at or modifies
* the list).
*/
- if (!list_empty_careful(&wait->task_list)) {
- spin_lock_irqsave(&q->lock, flags);
- list_del_init(&wait->task_list);
- spin_unlock_irqrestore(&q->lock, flags);
+ if (!list_empty_careful(&wq_entry->entry)) {
+ spin_lock_irqsave(&wq_head->lock, flags);
+ list_del_init(&wq_entry->entry);
+ spin_unlock_irqrestore(&wq_head->lock, flags);
}
}
EXPORT_SYMBOL(finish_wait);
-/**
- * abort_exclusive_wait - abort exclusive waiting in a queue
- * @q: waitqueue waited on
- * @wait: wait descriptor
- * @mode: runstate of the waiter to be woken
- * @key: key to identify a wait bit queue or %NULL
- *
- * Sets current thread back to running state and removes
- * the wait descriptor from the given waitqueue if still
- * queued.
- *
- * Wakes up the next waiter if the caller is concurrently
- * woken up through the queue.
- *
- * This prevents waiter starvation where an exclusive waiter
- * aborts and is woken up concurrently and no one wakes up
- * the next waiter.
- */
-void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
- unsigned int mode, void *key)
+int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key)
{
- unsigned long flags;
-
- __set_current_state(TASK_RUNNING);
- spin_lock_irqsave(&q->lock, flags);
- if (!list_empty(&wait->task_list))
- list_del_init(&wait->task_list);
- else if (waitqueue_active(q))
- __wake_up_locked_key(q, mode, key);
- spin_unlock_irqrestore(&q->lock, flags);
-}
-EXPORT_SYMBOL(abort_exclusive_wait);
-
-int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
-{
- int ret = default_wake_function(wait, mode, sync, key);
+ int ret = default_wake_function(wq_entry, mode, sync, key);
if (ret)
- list_del_init(&wait->task_list);
+ list_del_init_careful(&wq_entry->entry);
+
return ret;
}
EXPORT_SYMBOL(autoremove_wake_function);
-static inline bool is_kthread_should_stop(void)
-{
- return (current->flags & PF_KTHREAD) && kthread_should_stop();
-}
-
/*
* DEFINE_WAIT_FUNC(wait, woken_wake_func);
*
- * add_wait_queue(&wq, &wait);
+ * add_wait_queue(&wq_head, &wait);
* for (;;) {
* if (condition)
* break;
*
- * p->state = mode; condition = true;
- * smp_mb(); // A smp_wmb(); // C
- * if (!wait->flags & WQ_FLAG_WOKEN) wait->flags |= WQ_FLAG_WOKEN;
- * schedule() try_to_wake_up();
- * p->state = TASK_RUNNING; ~~~~~~~~~~~~~~~~~~
- * wait->flags &= ~WQ_FLAG_WOKEN; condition = true;
- * smp_mb() // B smp_wmb(); // C
- * wait->flags |= WQ_FLAG_WOKEN;
- * }
- * remove_wait_queue(&wq, &wait);
+ * // in wait_woken() // in woken_wake_function()
*
+ * p->state = mode; wq_entry->flags |= WQ_FLAG_WOKEN;
+ * smp_mb(); // A try_to_wake_up():
+ * if (!(wq_entry->flags & WQ_FLAG_WOKEN)) <full barrier>
+ * schedule() if (p->state & mode)
+ * p->state = TASK_RUNNING; p->state = TASK_RUNNING;
+ * wq_entry->flags &= ~WQ_FLAG_WOKEN; ~~~~~~~~~~~~~~~~~~
+ * smp_mb(); // B condition = true;
+ * } smp_mb(); // C
+ * remove_wait_queue(&wq_head, &wait); wq_entry->flags |= WQ_FLAG_WOKEN;
*/
-long wait_woken(wait_queue_t *wait, unsigned mode, long timeout)
+long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout)
{
- set_current_state(mode); /* A */
/*
- * The above implies an smp_mb(), which matches with the smp_wmb() from
- * woken_wake_function() such that if we observe WQ_FLAG_WOKEN we must
- * also observe all state before the wakeup.
+ * The below executes an smp_mb(), which matches with the full barrier
+ * executed by the try_to_wake_up() in woken_wake_function() such that
+ * either we see the store to wq_entry->flags in woken_wake_function()
+ * or woken_wake_function() sees our store to current->state.
*/
- if (!(wait->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop())
+ set_current_state(mode); /* A */
+ if (!(wq_entry->flags & WQ_FLAG_WOKEN) && !kthread_should_stop_or_park())
timeout = schedule_timeout(timeout);
__set_current_state(TASK_RUNNING);
/*
- * The below implies an smp_mb(), it too pairs with the smp_wmb() from
- * woken_wake_function() such that we must either observe the wait
- * condition being true _OR_ WQ_FLAG_WOKEN such that we will not miss
- * an event.
+ * The below executes an smp_mb(), which matches with the smp_mb() (C)
+ * in woken_wake_function() such that either we see the wait condition
+ * being true or the store to wq_entry->flags in woken_wake_function()
+ * follows ours in the coherence order.
*/
- set_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */
+ smp_store_mb(wq_entry->flags, wq_entry->flags & ~WQ_FLAG_WOKEN); /* B */
return timeout;
}
EXPORT_SYMBOL(wait_woken);
-int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
+int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key)
{
- /*
- * Although this function is called under waitqueue lock, LOCK
- * doesn't imply write barrier and the users expects write
- * barrier semantics on wakeup functions. The following
- * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
- * and is paired with set_mb() in wait_woken().
- */
- smp_wmb(); /* C */
- wait->flags |= WQ_FLAG_WOKEN;
+ /* Pairs with the smp_store_mb() in wait_woken(). */
+ smp_mb(); /* C */
+ wq_entry->flags |= WQ_FLAG_WOKEN;
- return default_wake_function(wait, mode, sync, key);
+ return default_wake_function(wq_entry, mode, sync, key);
}
EXPORT_SYMBOL(woken_wake_function);
-
-int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg)
-{
- struct wait_bit_key *key = arg;
- struct wait_bit_queue *wait_bit
- = container_of(wait, struct wait_bit_queue, wait);
-
- if (wait_bit->key.flags != key->flags ||
- wait_bit->key.bit_nr != key->bit_nr ||
- test_bit(key->bit_nr, key->flags))
- return 0;
- else
- return autoremove_wake_function(wait, mode, sync, key);
-}
-EXPORT_SYMBOL(wake_bit_function);
-
-/*
- * To allow interruptible waiting and asynchronous (i.e. nonblocking)
- * waiting, the actions of __wait_on_bit() and __wait_on_bit_lock() are
- * permitted return codes. Nonzero return codes halt waiting and return.
- */
-int __sched
-__wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q,
- wait_bit_action_f *action, unsigned mode)
-{
- int ret = 0;
-
- do {
- prepare_to_wait(wq, &q->wait, mode);
- if (test_bit(q->key.bit_nr, q->key.flags))
- ret = (*action)(&q->key);
- } while (test_bit(q->key.bit_nr, q->key.flags) && !ret);
- finish_wait(wq, &q->wait);
- return ret;
-}
-EXPORT_SYMBOL(__wait_on_bit);
-
-int __sched out_of_line_wait_on_bit(void *word, int bit,
- wait_bit_action_f *action, unsigned mode)
-{
- wait_queue_head_t *wq = bit_waitqueue(word, bit);
- DEFINE_WAIT_BIT(wait, word, bit);
-
- return __wait_on_bit(wq, &wait, action, mode);
-}
-EXPORT_SYMBOL(out_of_line_wait_on_bit);
-
-int __sched out_of_line_wait_on_bit_timeout(
- void *word, int bit, wait_bit_action_f *action,
- unsigned mode, unsigned long timeout)
-{
- wait_queue_head_t *wq = bit_waitqueue(word, bit);
- DEFINE_WAIT_BIT(wait, word, bit);
-
- wait.key.timeout = jiffies + timeout;
- return __wait_on_bit(wq, &wait, action, mode);
-}
-EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout);
-
-int __sched
-__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
- wait_bit_action_f *action, unsigned mode)
-{
- do {
- int ret;
-
- prepare_to_wait_exclusive(wq, &q->wait, mode);
- if (!test_bit(q->key.bit_nr, q->key.flags))
- continue;
- ret = action(&q->key);
- if (!ret)
- continue;
- abort_exclusive_wait(wq, &q->wait, mode, &q->key);
- return ret;
- } while (test_and_set_bit(q->key.bit_nr, q->key.flags));
- finish_wait(wq, &q->wait);
- return 0;
-}
-EXPORT_SYMBOL(__wait_on_bit_lock);
-
-int __sched out_of_line_wait_on_bit_lock(void *word, int bit,
- wait_bit_action_f *action, unsigned mode)
-{
- wait_queue_head_t *wq = bit_waitqueue(word, bit);
- DEFINE_WAIT_BIT(wait, word, bit);
-
- return __wait_on_bit_lock(wq, &wait, action, mode);
-}
-EXPORT_SYMBOL(out_of_line_wait_on_bit_lock);
-
-void __wake_up_bit(wait_queue_head_t *wq, void *word, int bit)
-{
- struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit);
- if (waitqueue_active(wq))
- __wake_up(wq, TASK_NORMAL, 1, &key);
-}
-EXPORT_SYMBOL(__wake_up_bit);
-
-/**
- * wake_up_bit - wake up a waiter on a bit
- * @word: the word being waited on, a kernel virtual address
- * @bit: the bit of the word being waited on
- *
- * There is a standard hashed waitqueue table for generic use. This
- * is the part of the hashtable's accessor API that wakes up waiters
- * on a bit. For instance, if one were to have waiters on a bitflag,
- * one would call wake_up_bit() after clearing the bit.
- *
- * In order for this to function properly, as it uses waitqueue_active()
- * internally, some kind of memory barrier must be done prior to calling
- * this. Typically, this will be smp_mb__after_atomic(), but in some
- * cases where bitflags are manipulated non-atomically under a lock, one
- * may need to use a less regular barrier, such fs/inode.c's smp_mb(),
- * because spin_unlock() does not guarantee a memory barrier.
- */
-void wake_up_bit(void *word, int bit)
-{
- __wake_up_bit(bit_waitqueue(word, bit), word, bit);
-}
-EXPORT_SYMBOL(wake_up_bit);
-
-wait_queue_head_t *bit_waitqueue(void *word, int bit)
-{
- const int shift = BITS_PER_LONG == 32 ? 5 : 6;
- const struct zone *zone = page_zone(virt_to_page(word));
- unsigned long val = (unsigned long)word << shift | bit;
-
- return &zone->wait_table[hash_long(val, zone->wait_table_bits)];
-}
-EXPORT_SYMBOL(bit_waitqueue);
-
-/*
- * Manipulate the atomic_t address to produce a better bit waitqueue table hash
- * index (we're keying off bit -1, but that would produce a horrible hash
- * value).
- */
-static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p)
-{
- if (BITS_PER_LONG == 64) {
- unsigned long q = (unsigned long)p;
- return bit_waitqueue((void *)(q & ~1), q & 1);
- }
- return bit_waitqueue(p, 0);
-}
-
-static int wake_atomic_t_function(wait_queue_t *wait, unsigned mode, int sync,
- void *arg)
-{
- struct wait_bit_key *key = arg;
- struct wait_bit_queue *wait_bit
- = container_of(wait, struct wait_bit_queue, wait);
- atomic_t *val = key->flags;
-
- if (wait_bit->key.flags != key->flags ||
- wait_bit->key.bit_nr != key->bit_nr ||
- atomic_read(val) != 0)
- return 0;
- return autoremove_wake_function(wait, mode, sync, key);
-}
-
-/*
- * To allow interruptible waiting and asynchronous (i.e. nonblocking) waiting,
- * the actions of __wait_on_atomic_t() are permitted return codes. Nonzero
- * return codes halt waiting and return.
- */
-static __sched
-int __wait_on_atomic_t(wait_queue_head_t *wq, struct wait_bit_queue *q,
- int (*action)(atomic_t *), unsigned mode)
-{
- atomic_t *val;
- int ret = 0;
-
- do {
- prepare_to_wait(wq, &q->wait, mode);
- val = q->key.flags;
- if (atomic_read(val) == 0)
- break;
- ret = (*action)(val);
- } while (!ret && atomic_read(val) != 0);
- finish_wait(wq, &q->wait);
- return ret;
-}
-
-#define DEFINE_WAIT_ATOMIC_T(name, p) \
- struct wait_bit_queue name = { \
- .key = __WAIT_ATOMIC_T_KEY_INITIALIZER(p), \
- .wait = { \
- .private = current, \
- .func = wake_atomic_t_function, \
- .task_list = \
- LIST_HEAD_INIT((name).wait.task_list), \
- }, \
- }
-
-__sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *),
- unsigned mode)
-{
- wait_queue_head_t *wq = atomic_t_waitqueue(p);
- DEFINE_WAIT_ATOMIC_T(wait, p);
-
- return __wait_on_atomic_t(wq, &wait, action, mode);
-}
-EXPORT_SYMBOL(out_of_line_wait_on_atomic_t);
-
-/**
- * wake_up_atomic_t - Wake up a waiter on a atomic_t
- * @p: The atomic_t being waited on, a kernel virtual address
- *
- * Wake up anyone waiting for the atomic_t to go to zero.
- *
- * Abuse the bit-waker function and its waitqueue hash table set (the atomic_t
- * check is done by the waiter's wake function, not the by the waker itself).
- */
-void wake_up_atomic_t(atomic_t *p)
-{
- __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR);
-}
-EXPORT_SYMBOL(wake_up_atomic_t);
-
-__sched int bit_wait(struct wait_bit_key *word)
-{
- if (signal_pending_state(current->state, current))
- return 1;
- schedule();
- return 0;
-}
-EXPORT_SYMBOL(bit_wait);
-
-__sched int bit_wait_io(struct wait_bit_key *word)
-{
- if (signal_pending_state(current->state, current))
- return 1;
- io_schedule();
- return 0;
-}
-EXPORT_SYMBOL(bit_wait_io);
-
-__sched int bit_wait_timeout(struct wait_bit_key *word)
-{
- unsigned long now = ACCESS_ONCE(jiffies);
- if (signal_pending_state(current->state, current))
- return 1;
- if (time_after_eq(now, word->timeout))
- return -EAGAIN;
- schedule_timeout(word->timeout - now);
- return 0;
-}
-EXPORT_SYMBOL_GPL(bit_wait_timeout);
-
-__sched int bit_wait_io_timeout(struct wait_bit_key *word)
-{
- unsigned long now = ACCESS_ONCE(jiffies);
- if (signal_pending_state(current->state, current))
- return 1;
- if (time_after_eq(now, word->timeout))
- return -EAGAIN;
- io_schedule_timeout(word->timeout - now);
- return 0;
-}
-EXPORT_SYMBOL_GPL(bit_wait_io_timeout);
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
new file mode 100644
index 000000000000..1088d3b7012c
--- /dev/null
+++ b/kernel/sched/wait_bit.c
@@ -0,0 +1,278 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/sched/debug.h>
+#include "sched.h"
+
+/*
+ * The implementation of the wait_bit*() and related waiting APIs:
+ */
+
+#define WAIT_TABLE_BITS 8
+#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS)
+
+static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned;
+
+wait_queue_head_t *bit_waitqueue(unsigned long *word, int bit)
+{
+ const int shift = BITS_PER_LONG == 32 ? 5 : 6;
+ unsigned long val = (unsigned long)word << shift | bit;
+
+ return bit_wait_table + hash_long(val, WAIT_TABLE_BITS);
+}
+EXPORT_SYMBOL(bit_waitqueue);
+
+int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *arg)
+{
+ struct wait_bit_key *key = arg;
+ struct wait_bit_queue_entry *wait_bit = container_of(wq_entry, struct wait_bit_queue_entry, wq_entry);
+
+ if (wait_bit->key.flags != key->flags ||
+ wait_bit->key.bit_nr != key->bit_nr ||
+ test_bit(key->bit_nr, key->flags))
+ return 0;
+
+ return autoremove_wake_function(wq_entry, mode, sync, key);
+}
+EXPORT_SYMBOL(wake_bit_function);
+
+/*
+ * To allow interruptible waiting and asynchronous (i.e. non-blocking)
+ * waiting, the actions of __wait_on_bit() and __wait_on_bit_lock() are
+ * permitted return codes. Nonzero return codes halt waiting and return.
+ */
+int __sched
+__wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry,
+ wait_bit_action_f *action, unsigned mode)
+{
+ int ret = 0;
+
+ do {
+ prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode);
+ if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags))
+ ret = (*action)(&wbq_entry->key, mode);
+ } while (test_bit_acquire(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret);
+
+ finish_wait(wq_head, &wbq_entry->wq_entry);
+
+ return ret;
+}
+EXPORT_SYMBOL(__wait_on_bit);
+
+int __sched out_of_line_wait_on_bit(unsigned long *word, int bit,
+ wait_bit_action_f *action, unsigned mode)
+{
+ struct wait_queue_head *wq_head = bit_waitqueue(word, bit);
+ DEFINE_WAIT_BIT(wq_entry, word, bit);
+
+ return __wait_on_bit(wq_head, &wq_entry, action, mode);
+}
+EXPORT_SYMBOL(out_of_line_wait_on_bit);
+
+int __sched out_of_line_wait_on_bit_timeout(
+ unsigned long *word, int bit, wait_bit_action_f *action,
+ unsigned mode, unsigned long timeout)
+{
+ struct wait_queue_head *wq_head = bit_waitqueue(word, bit);
+ DEFINE_WAIT_BIT(wq_entry, word, bit);
+
+ wq_entry.key.timeout = jiffies + timeout;
+
+ return __wait_on_bit(wq_head, &wq_entry, action, mode);
+}
+EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout);
+
+int __sched
+__wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry,
+ wait_bit_action_f *action, unsigned mode)
+{
+ int ret = 0;
+
+ for (;;) {
+ prepare_to_wait_exclusive(wq_head, &wbq_entry->wq_entry, mode);
+ if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) {
+ ret = action(&wbq_entry->key, mode);
+ /*
+ * See the comment in prepare_to_wait_event().
+ * finish_wait() does not necessarily takes wwq_head->lock,
+ * but test_and_set_bit() implies mb() which pairs with
+ * smp_mb__after_atomic() before wake_up_page().
+ */
+ if (ret)
+ finish_wait(wq_head, &wbq_entry->wq_entry);
+ }
+ if (!test_and_set_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) {
+ if (!ret)
+ finish_wait(wq_head, &wbq_entry->wq_entry);
+ return 0;
+ } else if (ret) {
+ return ret;
+ }
+ }
+}
+EXPORT_SYMBOL(__wait_on_bit_lock);
+
+int __sched out_of_line_wait_on_bit_lock(unsigned long *word, int bit,
+ wait_bit_action_f *action, unsigned mode)
+{
+ struct wait_queue_head *wq_head = bit_waitqueue(word, bit);
+ DEFINE_WAIT_BIT(wq_entry, word, bit);
+
+ return __wait_on_bit_lock(wq_head, &wq_entry, action, mode);
+}
+EXPORT_SYMBOL(out_of_line_wait_on_bit_lock);
+
+void __wake_up_bit(struct wait_queue_head *wq_head, unsigned long *word, int bit)
+{
+ struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit);
+
+ if (waitqueue_active(wq_head))
+ __wake_up(wq_head, TASK_NORMAL, 1, &key);
+}
+EXPORT_SYMBOL(__wake_up_bit);
+
+/**
+ * wake_up_bit - wake up waiters on a bit
+ * @word: the address containing the bit being waited on
+ * @bit: the bit at that address being waited on
+ *
+ * Wake up any process waiting in wait_on_bit() or similar for the
+ * given bit to be cleared.
+ *
+ * The wake-up is sent to tasks in a waitqueue selected by hash from a
+ * shared pool. Only those tasks on that queue which have requested
+ * wake_up on this specific address and bit will be woken, and only if the
+ * bit is clear.
+ *
+ * In order for this to function properly there must be a full memory
+ * barrier after the bit is cleared and before this function is called.
+ * If the bit was cleared atomically, such as a by clear_bit() then
+ * smb_mb__after_atomic() can be used, othwewise smb_mb() is needed.
+ * If the bit was cleared with a fully-ordered operation, no further
+ * barrier is required.
+ *
+ * Normally the bit should be cleared by an operation with RELEASE
+ * semantics so that any changes to memory made before the bit is
+ * cleared are guaranteed to be visible after the matching wait_on_bit()
+ * completes.
+ */
+void wake_up_bit(unsigned long *word, int bit)
+{
+ __wake_up_bit(bit_waitqueue(word, bit), word, bit);
+}
+EXPORT_SYMBOL(wake_up_bit);
+
+wait_queue_head_t *__var_waitqueue(void *p)
+{
+ return bit_wait_table + hash_ptr(p, WAIT_TABLE_BITS);
+}
+EXPORT_SYMBOL(__var_waitqueue);
+
+static int
+var_wake_function(struct wait_queue_entry *wq_entry, unsigned int mode,
+ int sync, void *arg)
+{
+ struct wait_bit_key *key = arg;
+ struct wait_bit_queue_entry *wbq_entry =
+ container_of(wq_entry, struct wait_bit_queue_entry, wq_entry);
+
+ if (wbq_entry->key.flags != key->flags ||
+ wbq_entry->key.bit_nr != key->bit_nr)
+ return 0;
+
+ return autoremove_wake_function(wq_entry, mode, sync, key);
+}
+
+void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, int flags)
+{
+ *wbq_entry = (struct wait_bit_queue_entry){
+ .key = {
+ .flags = (var),
+ .bit_nr = -1,
+ },
+ .wq_entry = {
+ .flags = flags,
+ .private = current,
+ .func = var_wake_function,
+ .entry = LIST_HEAD_INIT(wbq_entry->wq_entry.entry),
+ },
+ };
+}
+EXPORT_SYMBOL(init_wait_var_entry);
+
+/**
+ * wake_up_var - wake up waiters on a variable (kernel address)
+ * @var: the address of the variable being waited on
+ *
+ * Wake up any process waiting in wait_var_event() or similar for the
+ * given variable to change. wait_var_event() can be waiting for an
+ * arbitrary condition to be true and associates that condition with an
+ * address. Calling wake_up_var() suggests that the condition has been
+ * made true, but does not strictly require the condtion to use the
+ * address given.
+ *
+ * The wake-up is sent to tasks in a waitqueue selected by hash from a
+ * shared pool. Only those tasks on that queue which have requested
+ * wake_up on this specific address will be woken.
+ *
+ * In order for this to function properly there must be a full memory
+ * barrier after the variable is updated (or more accurately, after the
+ * condition waited on has been made to be true) and before this function
+ * is called. If the variable was updated atomically, such as a by
+ * atomic_dec() then smb_mb__after_atomic() can be used. If the
+ * variable was updated by a fully ordered operation such as
+ * atomic_dec_and_test() then no extra barrier is required. Otherwise
+ * smb_mb() is needed.
+ *
+ * Normally the variable should be updated (the condition should be made
+ * to be true) by an operation with RELEASE semantics such as
+ * smp_store_release() so that any changes to memory made before the
+ * variable was updated are guaranteed to be visible after the matching
+ * wait_var_event() completes.
+ */
+void wake_up_var(void *var)
+{
+ __wake_up_bit(__var_waitqueue(var), var, -1);
+}
+EXPORT_SYMBOL(wake_up_var);
+
+__sched int bit_wait(struct wait_bit_key *word, int mode)
+{
+ schedule();
+ if (signal_pending_state(mode, current))
+ return -EINTR;
+
+ return 0;
+}
+EXPORT_SYMBOL(bit_wait);
+
+__sched int bit_wait_io(struct wait_bit_key *word, int mode)
+{
+ io_schedule();
+ if (signal_pending_state(mode, current))
+ return -EINTR;
+
+ return 0;
+}
+EXPORT_SYMBOL(bit_wait_io);
+
+__sched int bit_wait_timeout(struct wait_bit_key *word, int mode)
+{
+ unsigned long now = READ_ONCE(jiffies);
+
+ if (time_after_eq(now, word->timeout))
+ return -EAGAIN;
+ schedule_timeout(word->timeout - now);
+ if (signal_pending_state(mode, current))
+ return -EINTR;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(bit_wait_timeout);
+
+void __init wait_bit_init(void)
+{
+ int i;
+
+ for (i = 0; i < WAIT_TABLE_SIZE; i++)
+ init_waitqueue_head(bit_wait_table + i);
+}
diff --git a/kernel/scs.c b/kernel/scs.c
new file mode 100644
index 000000000000..772488afd5b9
--- /dev/null
+++ b/kernel/scs.c
@@ -0,0 +1,168 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Shadow Call Stack support.
+ *
+ * Copyright (C) 2019 Google LLC
+ */
+
+#include <linux/cpuhotplug.h>
+#include <linux/kasan.h>
+#include <linux/mm.h>
+#include <linux/scs.h>
+#include <linux/vmalloc.h>
+#include <linux/vmstat.h>
+
+#ifdef CONFIG_DYNAMIC_SCS
+DEFINE_STATIC_KEY_FALSE(dynamic_scs_enabled);
+#endif
+
+static void __scs_account(void *s, int account)
+{
+ struct page *scs_page = vmalloc_to_page(s);
+
+ mod_node_page_state(page_pgdat(scs_page), NR_KERNEL_SCS_KB,
+ account * (SCS_SIZE / SZ_1K));
+}
+
+/* Matches NR_CACHED_STACKS for VMAP_STACK */
+#define NR_CACHED_SCS 2
+static DEFINE_PER_CPU(void *, scs_cache[NR_CACHED_SCS]);
+
+static void *__scs_alloc(int node)
+{
+ int i;
+ void *s;
+
+ for (i = 0; i < NR_CACHED_SCS; i++) {
+ s = this_cpu_xchg(scs_cache[i], NULL);
+ if (s) {
+ s = kasan_unpoison_vmalloc(s, SCS_SIZE,
+ KASAN_VMALLOC_PROT_NORMAL);
+ memset(s, 0, SCS_SIZE);
+ goto out;
+ }
+ }
+
+ s = __vmalloc_node_range(SCS_SIZE, 1, VMALLOC_START, VMALLOC_END,
+ GFP_SCS, PAGE_KERNEL, 0, node,
+ __builtin_return_address(0));
+
+out:
+ return kasan_reset_tag(s);
+}
+
+void *scs_alloc(int node)
+{
+ void *s;
+
+ s = __scs_alloc(node);
+ if (!s)
+ return NULL;
+
+ *__scs_magic(s) = SCS_END_MAGIC;
+
+ /*
+ * Poison the allocation to catch unintentional accesses to
+ * the shadow stack when KASAN is enabled.
+ */
+ kasan_poison_vmalloc(s, SCS_SIZE);
+ __scs_account(s, 1);
+ return s;
+}
+
+void scs_free(void *s)
+{
+ int i;
+
+ __scs_account(s, -1);
+
+ /*
+ * We cannot sleep as this can be called in interrupt context,
+ * so use this_cpu_cmpxchg to update the cache, and vfree_atomic
+ * to free the stack.
+ */
+
+ for (i = 0; i < NR_CACHED_SCS; i++)
+ if (this_cpu_cmpxchg(scs_cache[i], 0, s) == NULL)
+ return;
+
+ kasan_unpoison_vmalloc(s, SCS_SIZE, KASAN_VMALLOC_PROT_NORMAL);
+ vfree_atomic(s);
+}
+
+static int scs_cleanup(unsigned int cpu)
+{
+ int i;
+ void **cache = per_cpu_ptr(scs_cache, cpu);
+
+ for (i = 0; i < NR_CACHED_SCS; i++) {
+ vfree(cache[i]);
+ cache[i] = NULL;
+ }
+
+ return 0;
+}
+
+void __init scs_init(void)
+{
+ if (!scs_is_enabled())
+ return;
+ cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "scs:scs_cache", NULL,
+ scs_cleanup);
+}
+
+int scs_prepare(struct task_struct *tsk, int node)
+{
+ void *s;
+
+ if (!scs_is_enabled())
+ return 0;
+
+ s = scs_alloc(node);
+ if (!s)
+ return -ENOMEM;
+
+ task_scs(tsk) = task_scs_sp(tsk) = s;
+ return 0;
+}
+
+static void scs_check_usage(struct task_struct *tsk)
+{
+ static unsigned long highest;
+
+ unsigned long *p, prev, curr = highest, used = 0;
+
+ if (!IS_ENABLED(CONFIG_DEBUG_STACK_USAGE))
+ return;
+
+ for (p = task_scs(tsk); p < __scs_magic(task_scs(tsk)); ++p) {
+ if (!READ_ONCE_NOCHECK(*p))
+ break;
+ used += sizeof(*p);
+ }
+
+ while (used > curr) {
+ prev = cmpxchg_relaxed(&highest, curr, used);
+
+ if (prev == curr) {
+ pr_info("%s (%d): highest shadow stack usage: %lu bytes\n",
+ tsk->comm, task_pid_nr(tsk), used);
+ break;
+ }
+
+ curr = prev;
+ }
+}
+
+void scs_release(struct task_struct *tsk)
+{
+ void *s = task_scs(tsk);
+
+ if (!scs_is_enabled() || !s)
+ return;
+
+ WARN(task_scs_end_corrupted(tsk),
+ "corrupted shadow stack detected when freeing task\n");
+ scs_check_usage(tsk);
+ scs_free(s);
+}
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 4f44028943e6..25f62867a16d 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* linux/kernel/seccomp.c
*
@@ -12,37 +13,203 @@
* Mode 2 allows user-defined system call filters in the form
* of Berkeley Packet Filters/Linux Socket Filters.
*/
+#define pr_fmt(fmt) "seccomp: " fmt
-#include <linux/atomic.h>
+#include <linux/refcount.h>
#include <linux/audit.h>
#include <linux/compat.h>
+#include <linux/coredump.h>
+#include <linux/kmemleak.h>
+#include <linux/nospec.h>
+#include <linux/prctl.h>
#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
#include <linux/seccomp.h>
#include <linux/slab.h>
#include <linux/syscalls.h>
+#include <linux/sysctl.h>
-#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
#include <asm/syscall.h>
-#endif
+
+/* Not exposed in headers: strictly internal use only. */
+#define SECCOMP_MODE_DEAD (SECCOMP_MODE_FILTER + 1)
#ifdef CONFIG_SECCOMP_FILTER
+#include <linux/file.h>
#include <linux/filter.h>
#include <linux/pid.h>
#include <linux/ptrace.h>
-#include <linux/security.h>
-#include <linux/tracehook.h>
+#include <linux/capability.h>
#include <linux/uaccess.h>
+#include <linux/anon_inodes.h>
+#include <linux/lockdep.h>
+
+/*
+ * When SECCOMP_IOCTL_NOTIF_ID_VALID was first introduced, it had the
+ * wrong direction flag in the ioctl number. This is the broken one,
+ * which the kernel needs to keep supporting until all userspaces stop
+ * using the wrong command number.
+ */
+#define SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR SECCOMP_IOR(2, __u64)
+
+enum notify_state {
+ SECCOMP_NOTIFY_INIT,
+ SECCOMP_NOTIFY_SENT,
+ SECCOMP_NOTIFY_REPLIED,
+};
+
+struct seccomp_knotif {
+ /* The struct pid of the task whose filter triggered the notification */
+ struct task_struct *task;
+
+ /* The "cookie" for this request; this is unique for this filter. */
+ u64 id;
+
+ /*
+ * The seccomp data. This pointer is valid the entire time this
+ * notification is active, since it comes from __seccomp_filter which
+ * eclipses the entire lifecycle here.
+ */
+ const struct seccomp_data *data;
+
+ /*
+ * Notification states. When SECCOMP_RET_USER_NOTIF is returned, a
+ * struct seccomp_knotif is created and starts out in INIT. Once the
+ * handler reads the notification off of an FD, it transitions to SENT.
+ * If a signal is received the state transitions back to INIT and
+ * another message is sent. When the userspace handler replies, state
+ * transitions to REPLIED.
+ */
+ enum notify_state state;
+
+ /* The return values, only valid when in SECCOMP_NOTIFY_REPLIED */
+ int error;
+ long val;
+ u32 flags;
+
+ /*
+ * Signals when this has changed states, such as the listener
+ * dying, a new seccomp addfd message, or changing to REPLIED
+ */
+ struct completion ready;
+
+ struct list_head list;
+
+ /* outstanding addfd requests */
+ struct list_head addfd;
+};
+
+/**
+ * struct seccomp_kaddfd - container for seccomp_addfd ioctl messages
+ *
+ * @file: A reference to the file to install in the other task
+ * @fd: The fd number to install it at. If the fd number is -1, it means the
+ * installing process should allocate the fd as normal.
+ * @flags: The flags for the new file descriptor. At the moment, only O_CLOEXEC
+ * is allowed.
+ * @ioctl_flags: The flags used for the seccomp_addfd ioctl.
+ * @setfd: whether or not SECCOMP_ADDFD_FLAG_SETFD was set during notify_addfd
+ * @ret: The return value of the installing process. It is set to the fd num
+ * upon success (>= 0).
+ * @completion: Indicates that the installing process has completed fd
+ * installation, or gone away (either due to successful
+ * reply, or signal)
+ * @list: list_head for chaining seccomp_kaddfd together.
+ *
+ */
+struct seccomp_kaddfd {
+ struct file *file;
+ int fd;
+ unsigned int flags;
+ __u32 ioctl_flags;
+
+ union {
+ bool setfd;
+ /* To only be set on reply */
+ int ret;
+ };
+ struct completion completion;
+ struct list_head list;
+};
+
+/**
+ * struct notification - container for seccomp userspace notifications. Since
+ * most seccomp filters will not have notification listeners attached and this
+ * structure is fairly large, we store the notification-specific stuff in a
+ * separate structure.
+ *
+ * @requests: A semaphore that users of this notification can wait on for
+ * changes. Actual reads and writes are still controlled with
+ * filter->notify_lock.
+ * @flags: A set of SECCOMP_USER_NOTIF_FD_* flags.
+ * @next_id: The id of the next request.
+ * @notifications: A list of struct seccomp_knotif elements.
+ */
+
+struct notification {
+ atomic_t requests;
+ u32 flags;
+ u64 next_id;
+ struct list_head notifications;
+};
+
+#ifdef SECCOMP_ARCH_NATIVE
+/**
+ * struct action_cache - per-filter cache of seccomp actions per
+ * arch/syscall pair
+ *
+ * @allow_native: A bitmap where each bit represents whether the
+ * filter will always allow the syscall, for the
+ * native architecture.
+ * @allow_compat: A bitmap where each bit represents whether the
+ * filter will always allow the syscall, for the
+ * compat architecture.
+ */
+struct action_cache {
+ DECLARE_BITMAP(allow_native, SECCOMP_ARCH_NATIVE_NR);
+#ifdef SECCOMP_ARCH_COMPAT
+ DECLARE_BITMAP(allow_compat, SECCOMP_ARCH_COMPAT_NR);
+#endif
+};
+#else
+struct action_cache { };
+
+static inline bool seccomp_cache_check_allow(const struct seccomp_filter *sfilter,
+ const struct seccomp_data *sd)
+{
+ return false;
+}
+
+static inline void seccomp_cache_prepare(struct seccomp_filter *sfilter)
+{
+}
+#endif /* SECCOMP_ARCH_NATIVE */
/**
* struct seccomp_filter - container for seccomp BPF programs
*
- * @usage: reference count to manage the object lifetime.
- * get/put helpers should be used when accessing an instance
- * outside of a lifetime-guarded section. In general, this
- * is only needed for handling filters shared across tasks.
+ * @refs: Reference count to manage the object lifetime.
+ * A filter's reference count is incremented for each directly
+ * attached task, once for the dependent filter, and if
+ * requested for the user notifier. When @refs reaches zero,
+ * the filter can be freed.
+ * @users: A filter's @users count is incremented for each directly
+ * attached task (filter installation, fork(), thread_sync),
+ * and once for the dependent filter (tracked in filter->prev).
+ * When it reaches zero it indicates that no direct or indirect
+ * users of that filter exist. No new tasks can get associated with
+ * this filter after reaching 0. The @users count is always smaller
+ * or equal to @refs. Hence, reaching 0 for @users does not mean
+ * the filter can be freed.
+ * @cache: cache of arch/syscall mappings to actions
+ * @log: true if all actions except for SECCOMP_RET_ALLOW should be logged
+ * @wait_killable_recv: Put notifying process in killable state once the
+ * notification is received by the userspace listener.
* @prev: points to a previously installed, or inherited, filter
- * @len: the number of instructions in the program
- * @insnsi: the BPF program instructions to evaluate
+ * @prog: the BPF program to evaluate
+ * @notif: the struct that holds all notification related information
+ * @notify_lock: A lock for all notification-related accesses.
+ * @wqh: A wait queue for poll if a notifier is in use.
*
* seccomp_filter objects are organized in a tree linked via the @prev
* pointer. For any task, it appears to be a singly-linked list starting
@@ -52,12 +219,19 @@
* how namespaces work.
*
* seccomp_filter objects should never be modified after being attached
- * to a task_struct (other than @usage).
+ * to a task_struct (other than @refs).
*/
struct seccomp_filter {
- atomic_t usage;
+ refcount_t refs;
+ refcount_t users;
+ bool log;
+ bool wait_killable_recv;
+ struct action_cache cache;
struct seccomp_filter *prev;
struct bpf_prog *prog;
+ struct notification *notif;
+ struct mutex notify_lock;
+ wait_queue_head_t wqh;
};
/* Limit any path through the tree to 256KB worth of instructions. */
@@ -69,13 +243,17 @@ struct seccomp_filter {
*/
static void populate_seccomp_data(struct seccomp_data *sd)
{
+ /*
+ * Instead of using current_pt_reg(), we're already doing the work
+ * to safely fetch "current", so just use "task" everywhere below.
+ */
struct task_struct *task = current;
struct pt_regs *regs = task_pt_regs(task);
unsigned long args[6];
sd->nr = syscall_get_nr(task, regs);
- sd->arch = syscall_get_arch();
- syscall_get_arguments(task, regs, 0, 6, args);
+ sd->arch = syscall_get_arch(task);
+ syscall_get_arguments(task, regs, args);
sd->args[0] = args[0];
sd->args[1] = args[1];
sd->args[2] = args[2];
@@ -167,39 +345,88 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
return 0;
}
+#ifdef SECCOMP_ARCH_NATIVE
+static inline bool seccomp_cache_check_allow_bitmap(const void *bitmap,
+ size_t bitmap_size,
+ int syscall_nr)
+{
+ if (unlikely(syscall_nr < 0 || syscall_nr >= bitmap_size))
+ return false;
+ syscall_nr = array_index_nospec(syscall_nr, bitmap_size);
+
+ return test_bit(syscall_nr, bitmap);
+}
+
+/**
+ * seccomp_cache_check_allow - lookup seccomp cache
+ * @sfilter: The seccomp filter
+ * @sd: The seccomp data to lookup the cache with
+ *
+ * Returns true if the seccomp_data is cached and allowed.
+ */
+static inline bool seccomp_cache_check_allow(const struct seccomp_filter *sfilter,
+ const struct seccomp_data *sd)
+{
+ int syscall_nr = sd->nr;
+ const struct action_cache *cache = &sfilter->cache;
+
+#ifndef SECCOMP_ARCH_COMPAT
+ /* A native-only architecture doesn't need to check sd->arch. */
+ return seccomp_cache_check_allow_bitmap(cache->allow_native,
+ SECCOMP_ARCH_NATIVE_NR,
+ syscall_nr);
+#else
+ if (likely(sd->arch == SECCOMP_ARCH_NATIVE))
+ return seccomp_cache_check_allow_bitmap(cache->allow_native,
+ SECCOMP_ARCH_NATIVE_NR,
+ syscall_nr);
+ if (likely(sd->arch == SECCOMP_ARCH_COMPAT))
+ return seccomp_cache_check_allow_bitmap(cache->allow_compat,
+ SECCOMP_ARCH_COMPAT_NR,
+ syscall_nr);
+#endif /* SECCOMP_ARCH_COMPAT */
+
+ WARN_ON_ONCE(true);
+ return false;
+}
+#endif /* SECCOMP_ARCH_NATIVE */
+
+#define ACTION_ONLY(ret) ((s32)((ret) & (SECCOMP_RET_ACTION_FULL)))
/**
- * seccomp_run_filters - evaluates all seccomp filters against @syscall
- * @syscall: number of the current system call
+ * seccomp_run_filters - evaluates all seccomp filters against @sd
+ * @sd: optional seccomp data to be passed to filters
+ * @match: stores struct seccomp_filter that resulted in the return value,
+ * unless filter returned SECCOMP_RET_ALLOW, in which case it will
+ * be unchanged.
*
* Returns valid seccomp BPF response codes.
*/
-static u32 seccomp_run_filters(struct seccomp_data *sd)
+static u32 seccomp_run_filters(const struct seccomp_data *sd,
+ struct seccomp_filter **match)
{
- struct seccomp_filter *f = ACCESS_ONCE(current->seccomp.filter);
- struct seccomp_data sd_local;
u32 ret = SECCOMP_RET_ALLOW;
+ /* Make sure cross-thread synced filter points somewhere sane. */
+ struct seccomp_filter *f =
+ READ_ONCE(current->seccomp.filter);
/* Ensure unexpected behavior doesn't result in failing open. */
- if (unlikely(WARN_ON(f == NULL)))
- return SECCOMP_RET_KILL;
-
- /* Make sure cross-thread synced filter points somewhere sane. */
- smp_read_barrier_depends();
+ if (WARN_ON(f == NULL))
+ return SECCOMP_RET_KILL_PROCESS;
- if (!sd) {
- populate_seccomp_data(&sd_local);
- sd = &sd_local;
- }
+ if (seccomp_cache_check_allow(f, sd))
+ return SECCOMP_RET_ALLOW;
/*
* All filters in the list are evaluated and the lowest BPF return
* value always takes priority (ignoring the DATA).
*/
for (; f; f = f->prev) {
- u32 cur_ret = BPF_PROG_RUN(f->prog, (void *)sd);
+ u32 cur_ret = bpf_prog_run_pin_on_cpu(f->prog, sd);
- if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
+ if (ACTION_ONLY(cur_ret) < ACTION_ONLY(ret)) {
ret = cur_ret;
+ *match = f;
+ }
}
return ret;
}
@@ -215,18 +442,24 @@ static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode)
return true;
}
+void __weak arch_seccomp_spec_mitigate(struct task_struct *task) { }
+
static inline void seccomp_assign_mode(struct task_struct *task,
- unsigned long seccomp_mode)
+ unsigned long seccomp_mode,
+ unsigned long flags)
{
assert_spin_locked(&task->sighand->siglock);
task->seccomp.mode = seccomp_mode;
/*
- * Make sure TIF_SECCOMP cannot be set before the mode (and
+ * Make sure SYSCALL_WORK_SECCOMP cannot be set before the mode (and
* filter) is set.
*/
smp_mb__before_atomic();
- set_tsk_thread_flag(task, TIF_SECCOMP);
+ /* Assume default seccomp processes want spec flaw mitigation. */
+ if ((flags & SECCOMP_FILTER_FLAG_SPEC_ALLOW) == 0)
+ arch_seccomp_spec_mitigate(task);
+ set_task_syscall_work(task, SECCOMP);
}
#ifdef CONFIG_SECCOMP_FILTER
@@ -249,7 +482,7 @@ static int is_ancestor(struct seccomp_filter *parent,
* Expects sighand and cred_guard_mutex locks to be held.
*
* Returns 0 on success, -ve on error, or the pid of a thread which was
- * either not in the correct seccomp mode or it did not have an ancestral
+ * either not in the correct seccomp mode or did not have an ancestral
* seccomp filter.
*/
static inline pid_t seccomp_can_sync_threads(void)
@@ -267,6 +500,9 @@ static inline pid_t seccomp_can_sync_threads(void)
/* Skip current, since it is initiating the sync. */
if (thread == caller)
continue;
+ /* Skip exited threads. */
+ if (thread->flags & PF_EXITING)
+ continue;
if (thread->seccomp.mode == SECCOMP_MODE_DISABLED ||
(thread->seccomp.mode == SECCOMP_MODE_FILTER &&
@@ -277,7 +513,7 @@ static inline pid_t seccomp_can_sync_threads(void)
/* Return the first thread that cannot be synchronized. */
failed = task_pid_vnr(thread);
/* If the pid cannot be resolved, then return -ESRCH */
- if (unlikely(WARN_ON(failed == 0)))
+ if (WARN_ON(failed == 0))
failed = -ESRCH;
return failed;
}
@@ -285,21 +521,94 @@ static inline pid_t seccomp_can_sync_threads(void)
return 0;
}
+static inline void seccomp_filter_free(struct seccomp_filter *filter)
+{
+ if (filter) {
+ bpf_prog_destroy(filter->prog);
+ kfree(filter);
+ }
+}
+
+static void __seccomp_filter_orphan(struct seccomp_filter *orig)
+{
+ while (orig && refcount_dec_and_test(&orig->users)) {
+ if (waitqueue_active(&orig->wqh))
+ wake_up_poll(&orig->wqh, EPOLLHUP);
+ orig = orig->prev;
+ }
+}
+
+static void __put_seccomp_filter(struct seccomp_filter *orig)
+{
+ /* Clean up single-reference branches iteratively. */
+ while (orig && refcount_dec_and_test(&orig->refs)) {
+ struct seccomp_filter *freeme = orig;
+ orig = orig->prev;
+ seccomp_filter_free(freeme);
+ }
+}
+
+static void __seccomp_filter_release(struct seccomp_filter *orig)
+{
+ /* Notify about any unused filters in the task's former filter tree. */
+ __seccomp_filter_orphan(orig);
+ /* Finally drop all references to the task's former tree. */
+ __put_seccomp_filter(orig);
+}
+
+/**
+ * seccomp_filter_release - Detach the task from its filter tree,
+ * drop its reference count, and notify
+ * about unused filters
+ *
+ * @tsk: task the filter should be released from.
+ *
+ * This function should only be called when the task is exiting as
+ * it detaches it from its filter tree. PF_EXITING has to be set
+ * for the task.
+ */
+void seccomp_filter_release(struct task_struct *tsk)
+{
+ struct seccomp_filter *orig;
+
+ if (WARN_ON((tsk->flags & PF_EXITING) == 0))
+ return;
+
+ if (READ_ONCE(tsk->seccomp.filter) == NULL)
+ return;
+
+ spin_lock_irq(&tsk->sighand->siglock);
+ orig = tsk->seccomp.filter;
+ /* Detach task from its filter tree. */
+ tsk->seccomp.filter = NULL;
+ spin_unlock_irq(&tsk->sighand->siglock);
+ __seccomp_filter_release(orig);
+}
+
/**
* seccomp_sync_threads: sets all threads to use current's filter
*
+ * @flags: SECCOMP_FILTER_FLAG_* flags to set during sync.
+ *
* Expects sighand and cred_guard_mutex locks to be held, and for
* seccomp_can_sync_threads() to have returned success already
* without dropping the locks.
*
*/
-static inline void seccomp_sync_threads(void)
+static inline void seccomp_sync_threads(unsigned long flags)
{
struct task_struct *thread, *caller;
BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex));
assert_spin_locked(&current->sighand->siglock);
+ /*
+ * Don't touch any of the threads if the process is being killed.
+ * This allows for a lockless check in seccomp_filter_release.
+ */
+ if (current->signal->flags & SIGNAL_GROUP_EXIT)
+ return;
+
/* Synchronize all threads. */
caller = current;
for_each_thread(caller, thread) {
@@ -307,34 +616,47 @@ static inline void seccomp_sync_threads(void)
if (thread == caller)
continue;
+ /*
+ * Skip exited threads. seccomp_filter_release could have
+ * been already called for this task.
+ */
+ if (thread->flags & PF_EXITING)
+ continue;
+
/* Get a task reference for the new leaf node. */
get_seccomp_filter(caller);
+
/*
* Drop the task reference to the shared ancestor since
* current's path will hold a reference. (This also
* allows a put before the assignment.)
*/
- put_seccomp_filter(thread);
+ __seccomp_filter_release(thread->seccomp.filter);
+
+ /* Make our new filter tree visible. */
smp_store_release(&thread->seccomp.filter,
caller->seccomp.filter);
+ atomic_set(&thread->seccomp.filter_count,
+ atomic_read(&caller->seccomp.filter_count));
+
+ /*
+ * Don't let an unprivileged task work around
+ * the no_new_privs restriction by creating
+ * a thread that sets it up, enters seccomp,
+ * then dies.
+ */
+ if (task_no_new_privs(caller))
+ task_set_no_new_privs(thread);
+
/*
* Opt the other thread into seccomp if needed.
* As threads are considered to be trust-realm
* equivalent (see ptrace_may_access), it is safe to
* allow one thread to transition the other.
*/
- if (thread->seccomp.mode == SECCOMP_MODE_DISABLED) {
- /*
- * Don't let an unprivileged task work around
- * the no_new_privs restriction by creating
- * a thread that sets it up, enters seccomp,
- * then dies.
- */
- if (task_no_new_privs(caller))
- task_set_no_new_privs(thread);
-
- seccomp_assign_mode(thread, SECCOMP_MODE_FILTER);
- }
+ if (thread->seccomp.mode == SECCOMP_MODE_DISABLED)
+ seccomp_assign_mode(thread, SECCOMP_MODE_FILTER,
+ flags);
}
}
@@ -346,16 +668,19 @@ static inline void seccomp_sync_threads(void)
*/
static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
{
- struct seccomp_filter *filter;
- unsigned long fp_size;
- struct sock_filter *fp;
- int new_len;
- long ret;
+ struct seccomp_filter *sfilter;
+ int ret;
+ const bool save_orig =
+#if defined(CONFIG_CHECKPOINT_RESTORE) || defined(SECCOMP_ARCH_NATIVE)
+ true;
+#else
+ false;
+#endif
if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
return ERR_PTR(-EINVAL);
+
BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter));
- fp_size = fprog->len * sizeof(struct sock_filter);
/*
* Installing a seccomp filter requires that the task has
@@ -364,64 +689,27 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
* behavior of privileged children.
*/
if (!task_no_new_privs(current) &&
- security_capable_noaudit(current_cred(), current_user_ns(),
- CAP_SYS_ADMIN) != 0)
+ !ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN))
return ERR_PTR(-EACCES);
- fp = kzalloc(fp_size, GFP_KERNEL|__GFP_NOWARN);
- if (!fp)
- return ERR_PTR(-ENOMEM);
-
- /* Copy the instructions from fprog. */
- ret = -EFAULT;
- if (copy_from_user(fp, fprog->filter, fp_size))
- goto free_prog;
-
- /* Check and rewrite the fprog via the skb checker */
- ret = bpf_check_classic(fp, fprog->len);
- if (ret)
- goto free_prog;
-
- /* Check and rewrite the fprog for seccomp use */
- ret = seccomp_check_filter(fp, fprog->len);
- if (ret)
- goto free_prog;
-
- /* Convert 'sock_filter' insns to 'bpf_insn' insns */
- ret = bpf_convert_filter(fp, fprog->len, NULL, &new_len);
- if (ret)
- goto free_prog;
-
/* Allocate a new seccomp_filter */
- ret = -ENOMEM;
- filter = kzalloc(sizeof(struct seccomp_filter),
- GFP_KERNEL|__GFP_NOWARN);
- if (!filter)
- goto free_prog;
-
- filter->prog = bpf_prog_alloc(bpf_prog_size(new_len), __GFP_NOWARN);
- if (!filter->prog)
- goto free_filter;
-
- ret = bpf_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len);
- if (ret)
- goto free_filter_prog;
-
- kfree(fp);
- atomic_set(&filter->usage, 1);
- filter->prog->len = new_len;
+ sfilter = kzalloc(sizeof(*sfilter), GFP_KERNEL | __GFP_NOWARN);
+ if (!sfilter)
+ return ERR_PTR(-ENOMEM);
- bpf_prog_select_runtime(filter->prog);
+ mutex_init(&sfilter->notify_lock);
+ ret = bpf_prog_create_from_user(&sfilter->prog, fprog,
+ seccomp_check_filter, save_orig);
+ if (ret < 0) {
+ kfree(sfilter);
+ return ERR_PTR(ret);
+ }
- return filter;
+ refcount_set(&sfilter->refs, 1);
+ refcount_set(&sfilter-